From 3b0ce0e28db46d0403929aba45c682285e1ac217 Mon Sep 17 00:00:00 2001
From: Matt Macy
Date: Mon, 24 Aug 2020 22:48:19 +0000
Subject: Vendor import of openzfs master @ 184df27eef0abdc7ab2105b21257f753834b936b

Sponsored by: iX Systems, Inc.
---
 AUTHORS | 308 +
 CODE_OF_CONDUCT.md | 2 +
 COPYRIGHT | 31 +
 LICENSE | 384 +
 META | 10 +
 Makefile.am | 256 +
 NEWS | 3 +
 NOTICE | 16 +
 README.md | 35 +
 TEST | 50 +
 autogen.sh | 4 +
 cmd/Makefile.am | 10 +
 cmd/arc_summary/.gitignore | 1 +
 cmd/arc_summary/Makefile.am | 13 +
 cmd/arc_summary/arc_summary2 | 1093 +
 cmd/arc_summary/arc_summary3 | 943 +
 cmd/arcstat/.gitignore | 1 +
 cmd/arcstat/Makefile.am | 5 +
 cmd/arcstat/arcstat.in | 494 +
 cmd/dbufstat/.gitignore | 1 +
 cmd/dbufstat/Makefile.am | 5 +
 cmd/dbufstat/dbufstat.in | 669 +
 cmd/fsck_zfs/Makefile.am | 1 +
 cmd/fsck_zfs/fsck.zfs | 9 +
 cmd/mount_zfs/.gitignore | 1 +
 cmd/mount_zfs/Makefile.am | 20 +
 cmd/mount_zfs/mount_zfs.c | 408 +
 cmd/raidz_test/.gitignore | 1 +
 cmd/raidz_test/Makefile.am | 20 +
 cmd/raidz_test/raidz_bench.c | 227 +
 cmd/raidz_test/raidz_test.c | 782 +
 cmd/raidz_test/raidz_test.h | 116 +
 cmd/vdev_id/Makefile.am | 1 +
 cmd/vdev_id/vdev_id | 605 +
 cmd/zdb/.gitignore | 1 +
 cmd/zdb/Makefile.am | 16 +
 cmd/zdb/zdb.c | 8606 +++++
 cmd/zdb/zdb.h | 33 +
 cmd/zdb/zdb_il.c | 431 +
 cmd/zed/.gitignore | 1 +
 cmd/zed/Makefile.am | 49 +
 cmd/zed/agents/README.md | 112 +
 cmd/zed/agents/fmd_api.c | 760 +
 cmd/zed/agents/fmd_api.h | 246 +
 cmd/zed/agents/fmd_serd.c | 316 +
 cmd/zed/agents/fmd_serd.h | 86 +
 cmd/zed/agents/zfs_agents.c | 422 +
 cmd/zed/agents/zfs_agents.h | 46 +
 cmd/zed/agents/zfs_diagnosis.c | 981 +
 cmd/zed/agents/zfs_mod.c | 956 +
 cmd/zed/agents/zfs_retire.c | 557 +
 cmd/zed/zed.c | 306 +
 cmd/zed/zed.d/.gitignore | 1 +
 cmd/zed/zed.d/Makefile.am | 53 +
 cmd/zed/zed.d/README | 30 +
 cmd/zed/zed.d/all-debug.sh | 26 +
 cmd/zed/zed.d/all-syslog.sh | 14 +
 cmd/zed/zed.d/data-notify.sh | 43 +
 cmd/zed/zed.d/generic-notify.sh | 54 +
 cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in | 85 +
 cmd/zed/zed.d/pool_import-led.sh | 1 +
 cmd/zed/zed.d/resilver_finish-notify.sh | 1 +
 cmd/zed/zed.d/resilver_finish-start-scrub.sh | 19 +
 cmd/zed/zed.d/scrub_finish-notify.sh | 59 +
 cmd/zed/zed.d/statechange-led.sh | 177 +
 cmd/zed/zed.d/statechange-notify.sh | 74 +
 cmd/zed/zed.d/trim_finish-notify.sh | 37 +
 cmd/zed/zed.d/vdev_attach-led.sh | 1 +
 cmd/zed/zed.d/vdev_clear-led.sh | 1 +
 cmd/zed/zed.d/zed-functions.sh | 538 +
 cmd/zed/zed.d/zed.rc | 122 +
 cmd/zed/zed.h | 58 +
 cmd/zed/zed_conf.c | 735 +
 cmd/zed/zed_conf.h | 62 +
 cmd/zed/zed_disk_event.c | 416 +
 cmd/zed/zed_disk_event.h | 31 +
 cmd/zed/zed_event.c | 965 +
 cmd/zed/zed_event.h | 29 +
 cmd/zed/zed_exec.c | 232 +
 cmd/zed/zed_exec.h | 25 +
 cmd/zed/zed_file.c | 217 +
 cmd/zed/zed_file.h | 35 +
 cmd/zed/zed_log.c | 256 +
 cmd/zed/zed_log.h | 44 +
 cmd/zed/zed_strings.c | 247 +
 cmd/zed/zed_strings.h | 27 +
 cmd/zfs/.gitignore | 1 +
 cmd/zfs/Makefile.am | 23 +
 cmd/zfs/zfs_iter.c | 512 +
 cmd/zfs/zfs_iter.h | 61 +
 cmd/zfs/zfs_main.c | 8620 +++++
 cmd/zfs/zfs_project.c | 295 +
 cmd/zfs/zfs_projectutil.h | 49 +
 cmd/zfs/zfs_util.h | 42 +
 cmd/zfs_ids_to_path/.gitignore | 1 +
 cmd/zfs_ids_to_path/Makefile.am | 9 +
 cmd/zfs_ids_to_path/zfs_ids_to_path.c | 96 +
 cmd/zgenhostid/Makefile.am | 1 +
 cmd/zgenhostid/zgenhostid | 61 +
 cmd/zhack/.gitignore | 1 +
 cmd/zhack/Makefile.am | 14 +
 cmd/zhack/zhack.c | 532 +
 cmd/zinject/.gitignore | 1 +
 cmd/zinject/Makefile.am | 13 +
 cmd/zinject/translate.c | 397 +
 cmd/zinject/zinject.c | 1287 +
 cmd/zinject/zinject.h | 70 +
 cmd/zpool/.gitignore | 1 +
 cmd/zpool/Makefile.am | 136 +
 cmd/zpool/os/freebsd/zpool_vdev_os.c | 103 +
 cmd/zpool/os/linux/zpool_vdev_os.c | 410 +
 cmd/zpool/zpool.d/README | 9 +
 cmd/zpool/zpool.d/ata_err | 1 +
 cmd/zpool/zpool.d/cmd_to | 1 +
 cmd/zpool/zpool.d/defect | 1 +
 cmd/zpool/zpool.d/dm-deps | 29 +
 cmd/zpool/zpool.d/enc | 1 +
 cmd/zpool/zpool.d/encdev | 1 +
 cmd/zpool/zpool.d/fault_led | 1 +
 cmd/zpool/zpool.d/health | 1 +
 cmd/zpool/zpool.d/hours_on | 1 +
 cmd/zpool/zpool.d/iostat | 77 +
 cmd/zpool/zpool.d/iostat-10s | 1 +
 cmd/zpool/zpool.d/iostat-1s | 1 +
 cmd/zpool/zpool.d/label | 1 +
 cmd/zpool/zpool.d/locate_led | 1 +
 cmd/zpool/zpool.d/lsblk | 83 +
 cmd/zpool/zpool.d/media | 27 +
 cmd/zpool/zpool.d/model | 1 +
 cmd/zpool/zpool.d/nonmed | 1 +
 cmd/zpool/zpool.d/nvme_err | 1 +
 cmd/zpool/zpool.d/off_ucor | 1 +
 cmd/zpool/zpool.d/pend_sec | 1 +
 cmd/zpool/zpool.d/pwr_cyc | 1 +
 cmd/zpool/zpool.d/r_proc | 1 +
 cmd/zpool/zpool.d/r_ucor | 1 +
 cmd/zpool/zpool.d/realloc | 1 +
 cmd/zpool/zpool.d/rep_ucor | 1 +
 cmd/zpool/zpool.d/serial | 1 +
 cmd/zpool/zpool.d/ses | 52 +
 cmd/zpool/zpool.d/size | 1 +
 cmd/zpool/zpool.d/slot | 1 +
 cmd/zpool/zpool.d/smart | 243 +
 cmd/zpool/zpool.d/smart_test | 1 +
 cmd/zpool/zpool.d/smartx | 1 +
 cmd/zpool/zpool.d/temp | 1 +
 cmd/zpool/zpool.d/test_ended | 1 +
 cmd/zpool/zpool.d/test_progress | 1 +
 cmd/zpool/zpool.d/test_status | 1 +
 cmd/zpool/zpool.d/test_type | 1 +
 cmd/zpool/zpool.d/upath | 7 +
 cmd/zpool/zpool.d/vendor | 1 +
 cmd/zpool/zpool.d/w_proc | 1 +
 cmd/zpool/zpool.d/w_ucor | 1 +
 cmd/zpool/zpool_iter.c | 757 +
 cmd/zpool/zpool_main.c | 10326 ++++++
 cmd/zpool/zpool_util.c | 125 +
 cmd/zpool/zpool_util.h | 137 +
 cmd/zpool/zpool_vdev.c | 1581 +
 cmd/zstream/.gitignore | 1 +
 cmd/zstream/Makefile.am | 15 +
 cmd/zstream/zstream.c | 66 +
 cmd/zstream/zstream.h | 36 +
 cmd/zstream/zstream_dump.c | 799 +
 cmd/zstream/zstream_redup.c | 469 +
 cmd/zstream/zstream_token.c | 78 +
 cmd/zstreamdump/Makefile.am | 1 +
 cmd/zstreamdump/zstreamdump | 3 +
 cmd/ztest/.gitignore | 1 +
 cmd/ztest/Makefile.am | 23 +
 cmd/ztest/ztest.c | 7818 ++++
 cmd/zvol_id/.gitignore | 1 +
 cmd/zvol_id/Makefile.am | 10 +
 cmd/zvol_id/zvol_id_main.c | 110 +
 cmd/zvol_wait/Makefile.am | 1 +
 cmd/zvol_wait/zvol_wait | 116 +
 config/.gitignore | 9 +
 config/Rules.am | 62 +
 config/Substfiles.am | 34 +
 config/always-arch.m4 | 41 +
 config/always-compiler-options.m4 | 204 +
 config/always-python.m4 | 70 +
 config/always-pyzfs.m4 | 105 +
 config/always-sed.m4 | 16 +
 config/always-system.m4 | 26 +
 config/ax_code_coverage.m4 | 268 +
 config/ax_python_devel.m4 | 345 +
 config/ax_restore_flags.m4 | 31 +
 config/ax_save_flags.m4 | 31 +
 config/config.awk | 15 +
 config/config.rpath | 684 +
 config/deb.am | 74 +
 config/find_system_library.m4 | 93 +
 config/gettext.m4 | 386 +
 config/host-cpu-c-abi.m4 | 644 +
 config/iconv.m4 | 287 +
 config/intlmacosx.m4 | 72 +
 config/kernel-access-ok-type.m4 | 27 +
 config/kernel-acl.m4 | 289 +
 config/kernel-aio-fsync.m4 | 23 +
 config/kernel-automount.m4 | 25 +
 config/kernel-bdi.m4 | 79 +
 config/kernel-bio.m4 | 403 +
 config/kernel-blk-queue.m4 | 302 +
 config/kernel-blkdev.m4 | 212 +
 config/kernel-block-device-operations.m4 | 63 +
 config/kernel-clear-inode.m4 | 39 +
 config/kernel-commit-metadata.m4 | 24 +
 config/kernel-config-defined.m4 | 183 +
 config/kernel-current-time.m4 | 23 +
 config/kernel-declare-event-class.m4 | 55 +
 config/kernel-dentry-operations.m4 | 190 +
 config/kernel-dirty-inode.m4 | 29 +
 config/kernel-discard-granularity.m4 | 21 +
 config/kernel-encode-fh-inode.m4 | 27 +
 config/kernel-evict-inode.m4 | 24 +
 config/kernel-fallocate.m4 | 27 +
 config/kernel-file-dentry.m4 | 24 +
 config/kernel-file-inode.m4 | 23 +
 config/kernel-fmode-t.m4 | 20 +
 config/kernel-follow-down-one.m4 | 22 +
 config/kernel-fpu.m4 | 131 +
 config/kernel-fst-mount.m4 | 30 +
 config/kernel-fsync.m4 | 53 +
 config/kernel-generic_io_acct.m4 | 64 +
 config/kernel-generic_readlink.m4 | 25 +
 config/kernel-get-disk-and-module.m4 | 24 +
 config/kernel-get-disk-ro.m4 | 20 +
 config/kernel-get-link.m4 | 104 +
 config/kernel-global_page_state.m4 | 137 +
 config/kernel-group-info.m4 | 22 +
 config/kernel-in-compat-syscall.m4 | 24 +
 config/kernel-inode-create.m4 | 26 +
 config/kernel-inode-getattr.m4 | 53 +
 config/kernel-inode-lock.m4 | 24 +
 config/kernel-inode-lookup.m4 | 26 +
 config/kernel-inode-set-flags.m4 | 22 +
 config/kernel-inode-set-iversion.m4 | 23 +
 config/kernel-inode-times.m4 | 50 +
 config/kernel-insert-inode-locked.m4 | 21 +
 config/kernel-is_owner_or_cap.m4 | 23 +
 config/kernel-kmap-atomic-args.m4 | 22 +
 config/kernel-kmem-cache.m4 | 41 +
 config/kernel-kmem.m4 | 108 +
 config/kernel-kstrtoul.m4 | 21 +
 config/kernel-ktime.m4 | 55 +
 config/kernel-kuid-helpers.m4 | 24 +
 config/kernel-kuidgid.m4 | 21 +
 config/kernel-lseek-execute.m4 | 27 +
 config/kernel-make-request-fn.m4 | 101 +
 config/kernel-misc-minor.m4 | 26 +
 config/kernel-mkdir-umode-t.m4 | 32 +
 config/kernel-mod-param.m4 | 33 +
 config/kernel-objtool.m4 | 45 +
 config/kernel-pde-data.m4 | 20 +
 config/kernel-percpu.m4 | 34 +
 config/kernel-proc-operations.m4 | 41 +
 config/kernel-put-link.m4 | 61 +
 config/kernel-rename.m4 | 29 +
 config/kernel-rw.m4 | 69 +
 config/kernel-rwsem.m4 | 88 +
 config/kernel-sched.m4 | 82 +
 config/kernel-security-inode-init.m4 | 36 +
 config/kernel-set-nlink.m4 | 22 +
 config/kernel-setattr-prepare.m4 | 27 +
 config/kernel-sget-args.m4 | 25 +
 config/kernel-show-options.m4 | 25 +
 config/kernel-shrink.m4 | 151 +
 config/kernel-super-userns.m4 | 25 +
 config/kernel-timer.m4 | 75 +
 config/kernel-tmpfile.m4 | 25 +
 config/kernel-totalhigh_pages.m4 | 21 +
 config/kernel-totalram-pages-func.m4 | 23 +
 config/kernel-truncate-setsize.m4 | 21 +
 config/kernel-userns-capabilities.m4 | 106 +
 config/kernel-usleep_range.m4 | 23 +
 config/kernel-vfs-direct_IO.m4 | 109 +
 config/kernel-vfs-fsync.m4 | 20 +
 config/kernel-vfs-getattr.m4 | 86 +
 config/kernel-vfs-iterate.m4 | 83 +
 config/kernel-vfs-rw-iterate.m4 | 80 +
 config/kernel-wait.m4 | 99 +
 config/kernel-xattr-handler.m4 | 397 +
 config/kernel-zlib.m4 | 26 +
 config/kernel.m4 | 861 +
 config/lib-ld.m4 | 168 +
 config/lib-link.m4 | 774 +
 config/lib-prefix.m4 | 248 +
 config/mount-helper.m4 | 8 +
 config/nls.m4 | 32 +
 config/pkg.m4 | 275 +
 config/po.m4 | 450 +
 config/progtest.m4 | 91 +
 config/rpm.am | 93 +
 config/tgz.am | 30 +
 config/toolchain-simd.m4 | 424 +
 config/user-clock_gettime.m4 | 12 +
 config/user-dracut.m4 | 22 +
 config/user-gettext.m4 | 6 +
 config/user-libaio.m4 | 6 +
 config/user-libblkid.m4 | 9 +
 config/user-libcrypto.m4 | 8 +
 config/user-libexec.m4 | 9 +
 config/user-libtirpc.m4 | 30 +
 config/user-libudev.m4 | 17 +
 config/user-libuuid.m4 | 8 +
 config/user-makedev.m4 | 39 +
 config/user-pam.m4 | 38 +
 config/user-runstatedir.m4 | 6 +
 config/user-systemd.m4 | 53 +
 config/user-sysvinit.m4 | 11 +
 config/user-udev.m4 | 29 +
 config/user-zlib.m4 | 8 +
 config/user.m4 | 45 +
 config/zfs-build.m4 | 554 +
 config/zfs-meta.m4 | 207 +
 configure.ac | 409 +
 contrib/Makefile.am | 8 +
 contrib/bash_completion.d/Makefile.am | 5 +
 contrib/bash_completion.d/zfs | 481 +
 contrib/bpftrace/Makefile.am | 3 +
 contrib/bpftrace/taskqlatency.bt | 54 +
 contrib/bpftrace/zfs-trace.sh | 10 +
 contrib/dracut/02zfsexpandknowledge/.gitignore | 1 +
 contrib/dracut/02zfsexpandknowledge/Makefile.am | 7 +
 .../dracut/02zfsexpandknowledge/module-setup.sh.in | 136 +
 contrib/dracut/90zfs/.gitignore | 11 +
 contrib/dracut/90zfs/Makefile.am | 19 +
 contrib/dracut/90zfs/export-zfs.sh.in | 30 +
 contrib/dracut/90zfs/module-setup.sh.in | 119 +
 contrib/dracut/90zfs/mount-zfs.sh.in | 86 +
 contrib/dracut/90zfs/parse-zfs.sh.in | 66 +
 contrib/dracut/90zfs/zfs-env-bootfs.service.in | 14 +
 contrib/dracut/90zfs/zfs-generator.sh.in | 61 +
 contrib/dracut/90zfs/zfs-lib.sh.in | 174 +
 contrib/dracut/90zfs/zfs-load-key.sh.in | 58 +
 contrib/dracut/90zfs/zfs-needshutdown.sh.in | 10 +
 .../dracut/90zfs/zfs-rollback-bootfs.service.in | 14 +
 .../dracut/90zfs/zfs-snapshot-bootfs.service.in | 14 +
 contrib/dracut/Makefile.am | 3 +
 contrib/dracut/README.dracut.markdown | 225 +
 contrib/initramfs/Makefile.am | 9 +
 contrib/initramfs/README.initramfs.markdown | 84 +
 contrib/initramfs/conf-hooks.d/Makefile.am | 4 +
 contrib/initramfs/conf-hooks.d/zfs | 9 +
 contrib/initramfs/conf.d/Makefile.am | 4 +
 contrib/initramfs/conf.d/zfs | 8 +
 contrib/initramfs/hooks/.gitignore | 2 +
 contrib/initramfs/hooks/Makefile.am | 9 +
 contrib/initramfs/hooks/zfs.in | 109 +
 contrib/initramfs/hooks/zfsunlock.in | 18 +
 contrib/initramfs/scripts/.gitignore | 1 +
 contrib/initramfs/scripts/Makefile.am | 6 +
 contrib/initramfs/scripts/local-top/Makefile.am | 4 +
 contrib/initramfs/scripts/local-top/zfs | 60 +
 contrib/initramfs/scripts/zfs | 1011 +
 contrib/initramfs/zfsunlock | 42 +
 contrib/pam_zfs_key/Makefile.am | 19 +
 contrib/pam_zfs_key/pam_zfs_key.c | 741 +
 contrib/pam_zfs_key/zfs_key | 13 +
 contrib/pyzfs/.gitignore | 3 +
 contrib/pyzfs/LICENSE | 202 +
 contrib/pyzfs/Makefile.am | 40 +
 contrib/pyzfs/README | 28 +
 contrib/pyzfs/docs/source/conf.py | 304 +
 contrib/pyzfs/docs/source/index.rst | 44 +
 contrib/pyzfs/libzfs_core/__init__.py | 154 +
 contrib/pyzfs/libzfs_core/_constants.py | 114 +
 contrib/pyzfs/libzfs_core/_error_translation.py | 840 +
 contrib/pyzfs/libzfs_core/_libzfs_core.py | 1955 +
 contrib/pyzfs/libzfs_core/_nvlist.py | 295 +
 contrib/pyzfs/libzfs_core/bindings/__init__.py | 60 +
 contrib/pyzfs/libzfs_core/bindings/libnvpair.py | 134 +
 contrib/pyzfs/libzfs_core/bindings/libzfs_core.py | 146 +
 contrib/pyzfs/libzfs_core/ctypes.py | 71 +
 contrib/pyzfs/libzfs_core/exceptions.py | 601 +
 contrib/pyzfs/libzfs_core/test/__init__.py | 1 +
 contrib/pyzfs/libzfs_core/test/test_libzfs_core.py | 4380 +++
 contrib/pyzfs/libzfs_core/test/test_nvlist.py | 636 +
 contrib/pyzfs/setup.py.in | 61 +
 contrib/zcp/Makefile.am | 1 +
 contrib/zcp/autosnap.lua | 75 +
 copy-builtin | 79 +
 cppcheck-suppressions.txt | 8 +
 etc/Makefile.am | 5 +
 etc/default/.gitignore | 1 +
 etc/default/Makefile.am | 5 +
 etc/default/zfs.in | 103 +
 etc/init.d/.gitignore | 5 +
 etc/init.d/Makefile.am | 7 +
 etc/init.d/README.md | 72 +
 etc/init.d/zfs-import.in | 340 +
 etc/init.d/zfs-mount.in | 227 +
 etc/init.d/zfs-share.in | 85 +
 etc/init.d/zfs-zed.in | 134 +
 etc/modules-load.d/.gitignore | 1 +
 etc/modules-load.d/Makefile.am | 2 +
 etc/modules-load.d/zfs.conf | 3 +
 etc/sudoers.d/Makefile.am | 5 +
 etc/sudoers.d/zfs | 9 +
 etc/systemd/Makefile.am | 1 +
 etc/systemd/system-generators/.gitignore | 1 +
 etc/systemd/system-generators/Makefile.am | 6 +
 .../system-generators/zfs-mount-generator.in | 450 +
 etc/systemd/system/.gitignore | 3 +
 etc/systemd/system/50-zfs.preset.in | 9 +
 etc/systemd/system/Makefile.am | 21 +
 etc/systemd/system/zfs-import-cache.service.in | 20 +
 etc/systemd/system/zfs-import-scan.service.in | 19 +
 etc/systemd/system/zfs-import.target.in | 6 +
 etc/systemd/system/zfs-mount.service.in | 18 +
 etc/systemd/system/zfs-share.service.in | 18 +
 etc/systemd/system/zfs-volume-wait.service.in | 13 +
 etc/systemd/system/zfs-volumes.target.in | 7 +
 etc/systemd/system/zfs-zed.service.in | 11 +
 etc/systemd/system/zfs.target.in | 5 +
 etc/zfs/.gitignore | 1 +
 etc/zfs/Makefile.am | 15 +
 etc/zfs/vdev_id.conf.alias.example | 4 +
 etc/zfs/vdev_id.conf.multipath.example | 7 +
 etc/zfs/vdev_id.conf.sas_direct.example | 28 +
 etc/zfs/vdev_id.conf.sas_switch.example | 7 +
 etc/zfs/vdev_id.conf.scsi.example | 9 +
 etc/zfs/zfs-functions.in | 434 +
 include/.gitignore | 1 +
 include/Makefile.am | 33 +
 include/cityhash.h | 41 +
 include/libnvpair.h | 196 +
 include/libuutil.h | 384 +
 include/libuutil_common.h | 35 +
 include/libuutil_impl.h | 181 +
 include/libzfs.h | 920 +
 include/libzfs_core.h | 144 +
 include/libzfs_impl.h | 261 +
 include/libzutil.h | 166 +
 include/os/Makefile.am | 6 +
 include/os/freebsd/Makefile.am | 1 +
 include/os/freebsd/linux/Makefile.am | 5 +
 include/os/freebsd/linux/compiler.h | 101 +
 include/os/freebsd/linux/types.h | 77 +
 include/os/freebsd/spl/Makefile.am | 1 +
 include/os/freebsd/spl/acl/Makefile.am | 4 +
 include/os/freebsd/spl/acl/acl_common.h | 67 +
 include/os/freebsd/spl/rpc/Makefile.am | 4 +
 include/os/freebsd/spl/rpc/xdr.h | 71 +
 include/os/freebsd/spl/sys/Makefile.am | 71 +
 include/os/freebsd/spl/sys/acl.h | 216 +
 include/os/freebsd/spl/sys/acl_impl.h | 61 +
 include/os/freebsd/spl/sys/atomic.h | 182 +
 include/os/freebsd/spl/sys/byteorder.h | 107 +
 include/os/freebsd/spl/sys/callb.h | 213 +
 include/os/freebsd/spl/sys/ccompat.h | 153 +
 include/os/freebsd/spl/sys/ccompile.h | 271 +
 include/os/freebsd/spl/sys/cmn_err.h | 100 +
 include/os/freebsd/spl/sys/condvar.h | 205 +
 include/os/freebsd/spl/sys/console.h | 35 +
 include/os/freebsd/spl/sys/cred.h | 188 +
 include/os/freebsd/spl/sys/ctype.h | 45 +
 include/os/freebsd/spl/sys/debug.h | 168 +
 include/os/freebsd/spl/sys/dirent.h | 45 +
 include/os/freebsd/spl/sys/disp.h | 36 +
 include/os/freebsd/spl/sys/dkio.h | 494 +
 include/os/freebsd/spl/sys/extdirent.h | 73 +
 include/os/freebsd/spl/sys/file.h | 51 +
 include/os/freebsd/spl/sys/freebsd_rwlock.h | 34 +
 include/os/freebsd/spl/sys/idmap.h | 97 +
 include/os/freebsd/spl/sys/inttypes.h | 1 +
 include/os/freebsd/spl/sys/isa_defs.h | 712 +
 include/os/freebsd/spl/sys/kidmap.h | 41 +
 include/os/freebsd/spl/sys/kmem.h | 97 +
 include/os/freebsd/spl/sys/kmem_cache.h | 49 +
 include/os/freebsd/spl/sys/kstat.h | 207 +
 include/os/freebsd/spl/sys/list.h | 67 +
 include/os/freebsd/spl/sys/list_impl.h | 53 +
 include/os/freebsd/spl/sys/lock.h | 40 +
 include/os/freebsd/spl/sys/misc.h | 56 +
 include/os/freebsd/spl/sys/mod_os.h | 106 +
 include/os/freebsd/spl/sys/mode.h | 1 +
 include/os/freebsd/spl/sys/mount.h | 42 +
 include/os/freebsd/spl/sys/mutex.h | 72 +
 include/os/freebsd/spl/sys/param.h | 41 +
 include/os/freebsd/spl/sys/policy.h | 78 +
 include/os/freebsd/spl/sys/proc.h | 114 +
 include/os/freebsd/spl/sys/processor.h | 63 +
 include/os/freebsd/spl/sys/procfs_list.h | 64 +
 include/os/freebsd/spl/sys/random.h | 48 +
 include/os/freebsd/spl/sys/rwlock.h | 97 +
 include/os/freebsd/spl/sys/sdt.h | 45 +
 include/os/freebsd/spl/sys/sid.h | 86 +
 include/os/freebsd/spl/sys/sig.h | 65 +
 include/os/freebsd/spl/sys/simd.h | 43 +
 include/os/freebsd/spl/sys/simd_x86.h | 300 +
 include/os/freebsd/spl/sys/spl_condvar.h | 81 +
 include/os/freebsd/spl/sys/string.h | 39 +
 include/os/freebsd/spl/sys/strings.h | 1 +
 include/os/freebsd/spl/sys/sunddi.h | 69 +
 include/os/freebsd/spl/sys/sysmacros.h | 404 +
 include/os/freebsd/spl/sys/systeminfo.h | 34 +
 include/os/freebsd/spl/sys/systm.h | 43 +
 include/os/freebsd/spl/sys/taskq.h | 115 +
 include/os/freebsd/spl/sys/thread.h | 34 +
 include/os/freebsd/spl/sys/time.h | 96 +
 include/os/freebsd/spl/sys/timer.h | 38 +
 include/os/freebsd/spl/sys/trace.h | 1 +
 include/os/freebsd/spl/sys/trace_zfs.h | 1 +
 include/os/freebsd/spl/sys/types.h | 108 +
 include/os/freebsd/spl/sys/types32.h | 37 +
 include/os/freebsd/spl/sys/uio.h | 110 +
 include/os/freebsd/spl/sys/uuid.h | 99 +
 include/os/freebsd/spl/sys/vfs.h | 127 +
 include/os/freebsd/spl/sys/vm.h | 73 +
 include/os/freebsd/spl/sys/vmsystm.h | 34 +
 include/os/freebsd/spl/sys/vnode.h | 211 +
 include/os/freebsd/spl/sys/vnode_impl.h | 268 +
 include/os/freebsd/spl/sys/zmod.h | 68 +
 include/os/freebsd/spl/sys/zone.h | 68 +
 include/os/freebsd/zfs/Makefile.am | 1 +
 include/os/freebsd/zfs/sys/Makefile.am | 14 +
 include/os/freebsd/zfs/sys/freebsd_crypto.h | 98 +
 include/os/freebsd/zfs/sys/sha2.h | 200 +
 include/os/freebsd/zfs/sys/vdev_os.h | 33 +
 include/os/freebsd/zfs/sys/zfs_context_os.h | 91 +
 include/os/freebsd/zfs/sys/zfs_ctldir.h | 65 +
 include/os/freebsd/zfs/sys/zfs_dir.h | 74 +
 include/os/freebsd/zfs/sys/zfs_ioctl_compat.h | 160 +
 include/os/freebsd/zfs/sys/zfs_vfsops.h | 177 +
 include/os/freebsd/zfs/sys/zfs_vnops.h | 56 +
 include/os/freebsd/zfs/sys/zfs_znode_impl.h | 183 +
 include/os/freebsd/zfs/sys/zpl.h | 1 +
 include/os/linux/Makefile.am | 1 +
 include/os/linux/kernel/Makefile.am | 1 +
 include/os/linux/kernel/linux/Makefile.am | 22 +
 include/os/linux/kernel/linux/blkdev_compat.h | 506 +
 include/os/linux/kernel/linux/compiler_compat.h | 35 +
 include/os/linux/kernel/linux/dcache_compat.h | 64 +
 include/os/linux/kernel/linux/kmap_compat.h | 43 +
 include/os/linux/kernel/linux/mod_compat.h | 151 +
 include/os/linux/kernel/linux/page_compat.h | 92 +
 include/os/linux/kernel/linux/percpu_compat.h | 44 +
 include/os/linux/kernel/linux/simd.h | 45 +
 include/os/linux/kernel/linux/simd_aarch64.h | 54 +
 include/os/linux/kernel/linux/simd_powerpc.h | 109 +
 include/os/linux/kernel/linux/simd_x86.h | 646 +
 include/os/linux/kernel/linux/utsname_compat.h | 29 +
 include/os/linux/kernel/linux/vfs_compat.h | 439 +
 include/os/linux/kernel/linux/xattr_compat.h | 184 +
 include/os/linux/spl/Makefile.am | 1 +
 include/os/linux/spl/rpc/Makefile.am | 7 +
 include/os/linux/spl/rpc/xdr.h | 156 +
 include/os/linux/spl/sys/Makefile.am | 64 +
 include/os/linux/spl/sys/acl.h | 119 +
 include/os/linux/spl/sys/atomic.h | 79 +
 include/os/linux/spl/sys/byteorder.h | 87 +
 include/os/linux/spl/sys/callb.h | 54 +
 include/os/linux/spl/sys/callo.h | 52 +
 include/os/linux/spl/sys/cmn_err.h | 42 +
 include/os/linux/spl/sys/condvar.h | 113 +
 include/os/linux/spl/sys/console.h | 31 +
 include/os/linux/spl/sys/cred.h | 63 +
 include/os/linux/spl/sys/ctype.h | 30 +
 include/os/linux/spl/sys/debug.h | 169 +
 include/os/linux/spl/sys/disp.h | 34 +
 include/os/linux/spl/sys/dkio.h | 40 +
 include/os/linux/spl/sys/errno.h | 57 +
 include/os/linux/spl/sys/fcntl.h | 37 +
 include/os/linux/spl/sys/file.h | 52 +
 include/os/linux/spl/sys/inttypes.h | 28 +
 include/os/linux/spl/sys/isa_defs.h | 254 +
 include/os/linux/spl/sys/kmem.h | 211 +
 include/os/linux/spl/sys/kmem_cache.h | 216 +
 include/os/linux/spl/sys/kstat.h | 223 +
 include/os/linux/spl/sys/list.h | 210 +
 include/os/linux/spl/sys/mod_os.h | 28 +
 include/os/linux/spl/sys/mutex.h | 185 +
 include/os/linux/spl/sys/param.h | 36 +
 include/os/linux/spl/sys/proc.h | 42 +
 include/os/linux/spl/sys/processor.h | 32 +
 include/os/linux/spl/sys/procfs_list.h | 72 +
 include/os/linux/spl/sys/random.h | 40 +
 include/os/linux/spl/sys/rwlock.h | 201 +
 include/os/linux/spl/sys/shrinker.h | 110 +
 include/os/linux/spl/sys/sid.h | 61 +
 include/os/linux/spl/sys/signal.h | 55 +
 include/os/linux/spl/sys/simd.h | 31 +
 include/os/linux/spl/sys/stat.h | 30 +
 include/os/linux/spl/sys/strings.h | 31 +
 include/os/linux/spl/sys/sunddi.h | 58 +
 include/os/linux/spl/sys/sysmacros.h | 206 +
 include/os/linux/spl/sys/systeminfo.h | 36 +
 include/os/linux/spl/sys/taskq.h | 164 +
 include/os/linux/spl/sys/thread.h | 74 +
 include/os/linux/spl/sys/time.h | 118 +
 include/os/linux/spl/sys/timer.h | 86 +
 include/os/linux/spl/sys/trace.h | 175 +
 include/os/linux/spl/sys/trace_spl.h | 30 +
 include/os/linux/spl/sys/trace_taskq.h | 89 +
 include/os/linux/spl/sys/tsd.h | 46 +
 include/os/linux/spl/sys/types.h | 58 +
 include/os/linux/spl/sys/types32.h | 35 +
 include/os/linux/spl/sys/uio.h | 144 +
 include/os/linux/spl/sys/user.h | 42 +
 include/os/linux/spl/sys/vfs.h | 51 +
 include/os/linux/spl/sys/vmem.h | 102 +
 include/os/linux/spl/sys/vmsystm.h | 103 +
 include/os/linux/spl/sys/vnode.h | 107 +
 include/os/linux/spl/sys/wait.h | 55 +
 include/os/linux/spl/sys/zmod.h | 70 +
 include/os/linux/spl/sys/zone.h | 36 +
 include/os/linux/zfs/Makefile.am | 1 +
 include/os/linux/zfs/sys/Makefile.am | 30 +
 include/os/linux/zfs/sys/policy.h | 61 +
 include/os/linux/zfs/sys/sha2.h | 151 +
 include/os/linux/zfs/sys/trace_acl.h | 164 +
 include/os/linux/zfs/sys/trace_arc.h | 419 +
 include/os/linux/zfs/sys/trace_common.h | 112 +
 include/os/linux/zfs/sys/trace_dbgmsg.h | 89 +
 include/os/linux/zfs/sys/trace_dbuf.h | 169 +
 include/os/linux/zfs/sys/trace_dmu.h | 136 +
 include/os/linux/zfs/sys/trace_dnode.h | 129 +
 include/os/linux/zfs/sys/trace_multilist.h | 89 +
 include/os/linux/zfs/sys/trace_rrwlock.h | 31 +
 include/os/linux/zfs/sys/trace_txg.h | 89 +
 include/os/linux/zfs/sys/trace_vdev.h | 140 +
 include/os/linux/zfs/sys/trace_zfs.h | 53 +
 include/os/linux/zfs/sys/trace_zil.h | 229 +
 include/os/linux/zfs/sys/trace_zio.h | 97 +
 include/os/linux/zfs/sys/trace_zrlock.h | 94 +
 include/os/linux/zfs/sys/zfs_context_os.h | 30 +
 include/os/linux/zfs/sys/zfs_ctldir.h | 103 +
 include/os/linux/zfs/sys/zfs_dir.h | 76 +
 include/os/linux/zfs/sys/zfs_vfsops.h | 227 +
 include/os/linux/zfs/sys/zfs_vnops.h | 91 +
 include/os/linux/zfs/sys/zfs_znode_impl.h | 168 +
 include/os/linux/zfs/sys/zpl.h | 183 +
 include/sys/Makefile.am | 147 +
 include/sys/abd.h | 162 +
 include/sys/abd_impl.h | 150 +
 include/sys/aggsum.h | 59 +
 include/sys/arc.h | 335 +
 include/sys/arc_impl.h | 928 +
 include/sys/avl.h | 325 +
 include/sys/avl_impl.h | 164 +
 include/sys/bitops.h | 90 +
 include/sys/blkptr.h | 39 +
 include/sys/bplist.h | 59 +
 include/sys/bpobj.h | 106 +
 include/sys/bptree.h | 65 +
 include/sys/bqueue.h | 56 +
 include/sys/btree.h | 243 +
 include/sys/crypto/Makefile.am | 16 +
 include/sys/crypto/api.h | 425 +
 include/sys/crypto/common.h | 583 +
 include/sys/crypto/icp.h | 50 +
 include/sys/dataset_kstats.h | 75 +
 include/sys/dbuf.h | 472 +
 include/sys/ddt.h | 257 +
 include/sys/dmu.h | 1090 +
 include/sys/dmu_impl.h | 264 +
 include/sys/dmu_objset.h | 273 +
 include/sys/dmu_recv.h | 88 +
 include/sys/dmu_redact.h | 58 +
 include/sys/dmu_send.h | 72 +
 include/sys/dmu_traverse.h | 92 +
 include/sys/dmu_tx.h | 177 +
 include/sys/dmu_zfetch.h | 77 +
 include/sys/dnode.h | 627 +
 include/sys/dsl_bookmark.h | 157 +
 include/sys/dsl_crypt.h | 225 +
 include/sys/dsl_dataset.h | 508 +
 include/sys/dsl_deadlist.h | 128 +
 include/sys/dsl_deleg.h | 90 +
 include/sys/dsl_destroy.h | 70 +
 include/sys/dsl_dir.h | 227 +
 include/sys/dsl_pool.h | 199 +
 include/sys/dsl_prop.h | 126 +
 include/sys/dsl_scan.h | 193 +
 include/sys/dsl_synctask.h | 127 +
 include/sys/dsl_userhold.h | 57 +
 include/sys/edonr.h | 98 +
 include/sys/efi_partition.h | 381 +
 include/sys/fm/Makefile.am | 17 +
 include/sys/fm/fs/Makefile.am | 14 +
 include/sys/fm/fs/zfs.h | 121 +
 include/sys/fm/protocol.h | 369 +
 include/sys/fm/util.h | 118 +
 include/sys/frame.h | 36 +
 include/sys/fs/Makefile.am | 14 +
 include/sys/fs/zfs.h | 1589 +
 include/sys/hkdf.h | 29 +
 include/sys/lua/Makefile.am | 17 +
 include/sys/lua/lauxlib.h | 175 +
 include/sys/lua/lua.h | 445 +
 include/sys/lua/luaconf.h | 562 +
 include/sys/lua/lualib.h | 57 +
 include/sys/metaslab.h | 147 +
 include/sys/metaslab_impl.h | 562 +
 include/sys/mmp.h | 75 +
 include/sys/mntent.h | 112 +
 include/sys/mod.h | 41 +
 include/sys/multilist.h | 107 +
 include/sys/note.h | 56 +
 include/sys/nvpair.h | 358 +
 include/sys/nvpair_impl.h | 90 +
 include/sys/objlist.h | 51 +
 include/sys/pathname.h | 72 +
 include/sys/qat.h | 199 +
 include/sys/range_tree.h | 330 +
 include/sys/rrwlock.h | 117 +
 include/sys/sa.h | 174 +
 include/sys/sa_impl.h | 289 +
 include/sys/skein.h | 183 +
 include/sys/spa.h | 1224 +
 include/sys/spa_boot.h | 42 +
 include/sys/spa_checkpoint.h | 44 +
 include/sys/spa_checksum.h | 72 +
 include/sys/spa_impl.h | 458 +
 include/sys/spa_log_spacemap.h | 79 +
 include/sys/space_map.h | 246 +
 include/sys/space_reftree.h | 57 +
 include/sys/sysevent.h | 36 +
 include/sys/sysevent/Makefile.am | 15 +
 include/sys/sysevent/dev.h | 261 +
 include/sys/sysevent/eventdefs.h | 136 +
 include/sys/txg.h | 153 +
 include/sys/txg_impl.h | 125 +
 include/sys/u8_textprep.h | 113 +
 include/sys/u8_textprep_data.h | 35376 +++++++++++++++++++
 include/sys/uberblock.h | 50 +
 include/sys/uberblock_impl.h | 145 +
 include/sys/uio_impl.h | 49 +
 include/sys/unique.h | 57 +
 include/sys/uuid.h | 94 +
 include/sys/vdev.h | 201 +
 include/sys/vdev_disk.h | 46 +
 include/sys/vdev_file.h | 47 +
 include/sys/vdev_impl.h | 620 +
 include/sys/vdev_indirect_births.h | 80 +
 include/sys/vdev_indirect_mapping.h | 141 +
 include/sys/vdev_initialize.h | 47 +
 include/sys/vdev_raidz.h | 64 +
 include/sys/vdev_raidz_impl.h | 375 +
 include/sys/vdev_rebuild.h | 97 +
 include/sys/vdev_removal.h | 96 +
 include/sys/vdev_trim.h | 54 +
 include/sys/xvattr.h | 338 +
 include/sys/zap.h | 510 +
 include/sys/zap_impl.h | 240 +
 include/sys/zap_leaf.h | 255 +
 include/sys/zcp.h | 194 +
 include/sys/zcp_global.h | 35 +
 include/sys/zcp_iter.h | 41 +
 include/sys/zcp_prop.h | 34 +
 include/sys/zcp_set.h | 44 +
 include/sys/zfeature.h | 73 +
 include/sys/zfs_acl.h | 248 +
 include/sys/zfs_context.h | 763 +
 include/sys/zfs_debug.h | 111 +
 include/sys/zfs_delay.h | 41 +
 include/sys/zfs_file.h | 62 +
 include/sys/zfs_fuid.h | 132 +
 include/sys/zfs_ioctl.h | 581 +
 include/sys/zfs_ioctl_impl.h | 97 +
 include/sys/zfs_onexit.h | 63 +
 include/sys/zfs_project.h | 83 +
 include/sys/zfs_quota.h | 45 +
 include/sys/zfs_ratelimit.h | 44 +
 include/sys/zfs_refcount.h | 126 +
 include/sys/zfs_rlock.h | 83 +
 include/sys/zfs_sa.h | 153 +
 include/sys/zfs_stat.h | 56 +
 include/sys/zfs_sysfs.h | 50 +
 include/sys/zfs_znode.h | 295 +
 include/sys/zil.h | 529 +
 include/sys/zil_impl.h | 230 +
 include/sys/zio.h | 710 +
 include/sys/zio_checksum.h | 144 +
 include/sys/zio_compress.h | 198 +
 include/sys/zio_crypt.h | 160 +
 include/sys/zio_impl.h | 270 +
 include/sys/zio_priority.h | 43 +
 include/sys/zrlock.h | 63 +
 include/sys/zstd/Makefile.am | 18 +
 include/sys/zstd/zstd.h | 98 +
 include/sys/zthr.h | 42 +
 include/sys/zvol.h | 65 +
 include/sys/zvol_impl.h | 108 +
 include/thread_pool.h | 72 +
 include/zfeature_common.h | 135 +
 include/zfs_comutil.h | 52 +
 include/zfs_deleg.h | 99 +
 include/zfs_fletcher.h | 158 +
 include/zfs_namecheck.h | 71 +
 include/zfs_prop.h | 133 +
 lib/Makefile.am | 14 +
 lib/libavl/Makefile.am | 14 +
 lib/libefi/Makefile.am | 12 +
 lib/libefi/rdwr_efi.c | 1655 +
 lib/libicp/Makefile.am | 73 +
 lib/libnvpair/Makefile.am | 44 +
 lib/libnvpair/libnvpair.c | 1277 +
 lib/libnvpair/libnvpair_json.c | 406 +
 lib/libnvpair/nvpair_alloc_system.c | 65 +
 lib/libshare/Makefile.am | 25 +
 lib/libshare/libshare.c | 366 +
 lib/libshare/libshare_impl.h | 61 +
 lib/libshare/nfs.h | 27 +
 lib/libshare/os/freebsd/nfs.c | 448 +
 lib/libshare/os/freebsd/smb.c | 128 +
 lib/libshare/os/linux/nfs.c | 712 +
 lib/libshare/os/linux/smb.c | 453 +
 lib/libshare/smb.h | 49 +
 lib/libspl/Makefile.am | 56 +
 lib/libspl/asm-generic/.gitignore | 1 +
 lib/libspl/asm-generic/atomic.c | 450 +
 lib/libspl/asm-i386/atomic.S | 840 +
 lib/libspl/asm-x86_64/atomic.S | 691 +
 lib/libspl/assert.c | 46 +
 lib/libspl/include/Makefile.am | 22 +
 lib/libspl/include/assert.h | 151 +
 lib/libspl/include/atomic.h | 296 +
 lib/libspl/include/ia32/Makefile.am | 1 +
 lib/libspl/include/ia32/sys/Makefile.am | 3 +
 lib/libspl/include/ia32/sys/asm_linkage.h | 302 +
 lib/libspl/include/libdevinfo.h | 30 +
 lib/libspl/include/libgen.h | 35 +
 lib/libspl/include/libshare.h | 86 +
 lib/libspl/include/limits.h | 40 +
 lib/libspl/include/locale.h | 35 +
 lib/libspl/include/os/Makefile.am | 7 +
 lib/libspl/include/os/freebsd/Makefile.am | 1 +
 lib/libspl/include/os/freebsd/sys/Makefile.am | 11 +
 lib/libspl/include/os/freebsd/sys/byteorder.h | 192 +
 lib/libspl/include/os/freebsd/sys/file.h | 42 +
 lib/libspl/include/os/freebsd/sys/mnttab.h | 85 +
 lib/libspl/include/os/freebsd/sys/mount.h | 108 +
 lib/libspl/include/os/freebsd/sys/param.h | 66 +
 lib/libspl/include/os/freebsd/sys/stat.h | 71 +
 lib/libspl/include/os/freebsd/sys/sysmacros.h | 1 +
 lib/libspl/include/os/freebsd/sys/vfs.h | 37 +
 lib/libspl/include/os/freebsd/sys/zfs_context_os.h | 34 +
 lib/libspl/include/os/linux/Makefile.am | 1 +
 lib/libspl/include/os/linux/sys/Makefile.am | 10 +
 lib/libspl/include/os/linux/sys/byteorder.h | 223 +
 lib/libspl/include/os/linux/sys/errno.h | 46 +
 lib/libspl/include/os/linux/sys/mnttab.h | 89 +
 lib/libspl/include/os/linux/sys/mount.h | 98 +
 lib/libspl/include/os/linux/sys/param.h | 67 +
 lib/libspl/include/os/linux/sys/stat.h | 50 +
 lib/libspl/include/os/linux/sys/sysmacros.h | 103 +
 lib/libspl/include/os/linux/sys/zfs_context_os.h | 25 +
 lib/libspl/include/rpc/Makefile.am | 3 +
 lib/libspl/include/rpc/xdr.h | 68 +
 lib/libspl/include/statcommon.h | 41 +
 lib/libspl/include/stdio.h | 34 +
 lib/libspl/include/stdlib.h | 34 +
 lib/libspl/include/string.h | 40 +
 lib/libspl/include/stropts.h | 25 +
 lib/libspl/include/sys/Makefile.am | 47 +
 lib/libspl/include/sys/acl.h | 287 +
 lib/libspl/include/sys/acl_impl.h | 59 +
 lib/libspl/include/sys/callb.h | 30 +
 lib/libspl/include/sys/cmn_err.h | 30 +
 lib/libspl/include/sys/cred.h | 32 +
 lib/libspl/include/sys/debug.h | 40 +
 lib/libspl/include/sys/dkio.h | 483 +
 lib/libspl/include/sys/dklabel.h | 268 +
 lib/libspl/include/sys/dktp/Makefile.am | 4 +
 lib/libspl/include/sys/dktp/fdisk.h | 173 +
 lib/libspl/include/sys/feature_tests.h | 32 +
 lib/libspl/include/sys/int_limits.h | 30 +
 lib/libspl/include/sys/int_types.h | 32 +
 lib/libspl/include/sys/inttypes.h | 34 +
 lib/libspl/include/sys/isa_defs.h | 264 +
 lib/libspl/include/sys/kmem.h | 45 +
 lib/libspl/include/sys/kstat.h | 822 +
 lib/libspl/include/sys/list.h | 65 +
 lib/libspl/include/sys/list_impl.h | 51 +
 lib/libspl/include/sys/mhd.h | 159 +
 lib/libspl/include/sys/mkdev.h | 30 +
 lib/libspl/include/sys/policy.h | 26 +
 lib/libspl/include/sys/poll.h | 41 +
 lib/libspl/include/sys/priv.h | 30 +
 lib/libspl/include/sys/processor.h | 34 +
 lib/libspl/include/sys/sha2.h | 151 +
 lib/libspl/include/sys/simd.h | 502 +
 lib/libspl/include/sys/stack.h | 74 +
 lib/libspl/include/sys/stdtypes.h | 54 +
 lib/libspl/include/sys/strings.h | 33 +
 lib/libspl/include/sys/stropts.h | 29 +
 lib/libspl/include/sys/sunddi.h | 29 +
 lib/libspl/include/sys/systeminfo.h | 38 +
 lib/libspl/include/sys/time.h | 107 +
 lib/libspl/include/sys/trace_spl.h | 24 +
 lib/libspl/include/sys/trace_zfs.h | 24 +
 lib/libspl/include/sys/types.h | 77 +
 lib/libspl/include/sys/types32.h | 89 +
 lib/libspl/include/sys/tzfile.h | 164 +
 lib/libspl/include/sys/uio.h | 153 +
 lib/libspl/include/sys/va_list.h | 32 +
 lib/libspl/include/sys/varargs.h | 30 +
 lib/libspl/include/sys/vnode.h | 30 +
 lib/libspl/include/sys/vtoc.h | 350 +
 lib/libspl/include/sys/zone.h | 30 +
 lib/libspl/include/thread.h | 30 +
 lib/libspl/include/tzfile.h | 32 +
 lib/libspl/include/ucred.h | 32 +
 lib/libspl/include/umem.h | 208 +
 lib/libspl/include/unistd.h | 39 +
 lib/libspl/include/util/Makefile.am | 3 +
 lib/libspl/include/util/sscanf.h | 30 +
 lib/libspl/include/zone.h | 53 +
 lib/libspl/list.c | 243 +
 lib/libspl/mkdirp.c | 212 +
 lib/libspl/os/freebsd/getexecname.c | 71 +
 lib/libspl/os/freebsd/gethostid.c | 36 +
 lib/libspl/os/freebsd/getmntany.c | 67 +
 lib/libspl/os/freebsd/mnttab.c | 216 +
 lib/libspl/os/linux/getexecname.c | 59 +
 lib/libspl/os/linux/gethostid.c | 86 +
 lib/libspl/os/linux/getmntany.c | 165 +
 lib/libspl/page.c | 34 +
 lib/libspl/strlcat.c | 60 +
 lib/libspl/strlcpy.c | 56 +
 lib/libspl/timestamp.c | 63 +
 lib/libspl/zone.c | 63 +
 lib/libtpool/Makefile.am | 9 +
 lib/libtpool/thread_pool.c | 599 +
 lib/libtpool/thread_pool_impl.h | 93 +
 lib/libunicode/Makefile.am | 15 +
 lib/libuutil/Makefile.am | 34 +
 lib/libuutil/uu_alloc.c | 135 +
 lib/libuutil/uu_avl.c | 569 +
 lib/libuutil/uu_dprintf.c | 130 +
 lib/libuutil/uu_ident.c | 122 +
 lib/libuutil/uu_list.c | 718 +
 lib/libuutil/uu_misc.c | 281 +
 lib/libuutil/uu_open.c | 70 +
 lib/libuutil/uu_pname.c | 207 +
 lib/libuutil/uu_string.c | 54 +
 lib/libzfs/.gitignore | 1 +
 lib/libzfs/Makefile.am | 93 +
 lib/libzfs/THIRDPARTYLICENSE.openssl | 127 +
 lib/libzfs/THIRDPARTYLICENSE.openssl.descrip | 1 +
 lib/libzfs/libzfs.pc.in | 14 +
 lib/libzfs/libzfs_changelist.c | 781 +
 lib/libzfs/libzfs_config.c | 457 +
 lib/libzfs/libzfs_crypto.c | 1621 +
 lib/libzfs/libzfs_dataset.c | 5492 +++
 lib/libzfs/libzfs_diff.c | 797 +
 lib/libzfs/libzfs_import.c | 472 +
 lib/libzfs/libzfs_iter.c | 600 +
 lib/libzfs/libzfs_mount.c | 1614 +
 lib/libzfs/libzfs_pool.c | 4531 +++
 lib/libzfs/libzfs_sendrecv.c | 5170 +++
 lib/libzfs/libzfs_status.c | 508 +
 lib/libzfs/libzfs_util.c | 2092 ++
 lib/libzfs/os/freebsd/libzfs_compat.c | 321 +
 lib/libzfs/os/freebsd/libzfs_ioctl_compat.c | 432 +
 lib/libzfs/os/freebsd/libzfs_zmount.c | 139 +
 lib/libzfs/os/linux/libzfs_mount_os.c | 411 +
 lib/libzfs/os/linux/libzfs_pool_os.c | 345 +
 lib/libzfs/os/linux/libzfs_sendrecv_os.c | 52 +
 lib/libzfs/os/linux/libzfs_util_os.c | 215 +
 lib/libzfs_core/.gitignore | 1 +
 lib/libzfs_core/Makefile.am | 29 +
 lib/libzfs_core/libzfs_core.c | 1644 +
 lib/libzfs_core/libzfs_core.pc.in | 13 +
 lib/libzpool/Makefile.am | 230 +
 lib/libzpool/kernel.c | 1410 +
 lib/libzpool/taskq.c | 380 +
 lib/libzpool/util.c | 257 +
 lib/libzstd/Makefile.am | 21 +
 lib/libzutil/Makefile.am | 50 +
 lib/libzutil/os/freebsd/zutil_compat.c | 124 +
 lib/libzutil/os/freebsd/zutil_device_path_os.c | 132 +
 lib/libzutil/os/freebsd/zutil_import_os.c | 239 +
 lib/libzutil/os/linux/zutil_compat.c | 30 +
 lib/libzutil/os/linux/zutil_device_path_os.c | 506 +
 lib/libzutil/os/linux/zutil_import_os.c | 871 +
 lib/libzutil/zutil_device_path.c | 174 +
 lib/libzutil/zutil_import.c | 1587 +
 lib/libzutil/zutil_import.h | 76 +
 lib/libzutil/zutil_nicenum.c | 172 +
 lib/libzutil/zutil_pool.c | 145 +
 man/Makefile.am | 1 +
 man/man1/Makefile.am | 12 +
 man/man1/arcstat.1 | 502 +
 man/man1/cstyle.1 | 167 +
 man/man1/raidz_test.1 | 97 +
 man/man1/zhack.1 | 98 +
 man/man1/ztest.1 | 179 +
 man/man1/zvol_wait.1 | 21 +
 man/man5/Makefile.am | 16 +
 man/man5/spl-module-parameters.5 | 326 +
 man/man5/vdev_id.conf.5 | 222 +
 man/man5/zfs-events.5 | 965 +
 man/man5/zfs-module-parameters.5 | 4084 +++
 man/man5/zpool-features.5 | 985 +
 man/man8/.gitignore | 2 +
 man/man8/Makefile.am | 101 +
 man/man8/fsck.zfs.8 | 67 +
 man/man8/mount.zfs.8 | 144 +
 man/man8/vdev_id.8 | 77 +
 man/man8/zdb.8 | 478 +
 man/man8/zed.8.in | 268 +
 man/man8/zfs-allow.8 | 372 +
 man/man8/zfs-bookmark.8 | 69 +
 man/man8/zfs-change-key.8 | 1 +
 man/man8/zfs-clone.8 | 77 +
 man/man8/zfs-create.8 | 244 +
 man/man8/zfs-destroy.8 | 178 +
 man/man8/zfs-diff.8 | 91 +
 man/man8/zfs-get.8 | 1 +
 man/man8/zfs-groupspace.8 | 1 +
 man/man8/zfs-hold.8 | 110 +
 man/man8/zfs-inherit.8 | 1 +
 man/man8/zfs-jail.8 | 119 +
 man/man8/zfs-list.8 | 169 +
 man/man8/zfs-load-key.8 | 295 +
 man/man8/zfs-mount-generator.8.in | 248 +
 man/man8/zfs-mount.8 | 128 +
 man/man8/zfs-program.8 | 634 +
 man/man8/zfs-project.8 | 153 +
 man/man8/zfs-projectspace.8 | 1 +
 man/man8/zfs-promote.8 | 69 +
 man/man8/zfs-receive.8 | 375 +
 man/man8/zfs-recv.8 | 1 +
 man/man8/zfs-redact.8 | 1 +
 man/man8/zfs-release.8 | 1 +
 man/man8/zfs-rename.8 | 91 +
 man/man8/zfs-rollback.8 | 81 +
 man/man8/zfs-send.8 | 609 +
 man/man8/zfs-set.8 | 188 +
 man/man8/zfs-share.8 | 88 +
 man/man8/zfs-snapshot.8 | 83 +
 man/man8/zfs-unallow.8 | 1 +
 man/man8/zfs-unjail.8 | 1 +
 man/man8/zfs-unload-key.8 | 1 +
 man/man8/zfs-unmount.8 | 1 +
 man/man8/zfs-upgrade.8 | 106 +
 man/man8/zfs-userspace.8 | 186 +
 man/man8/zfs-wait.8 | 71 +
 man/man8/zfs.8 | 741 +
 man/man8/zfs_ids_to_path.8 | 50 +
 man/man8/zfsconcepts.8 | 198 +
 man/man8/zfsprops.8 | 1988 ++
 man/man8/zgenhostid.8 | 71 +
 man/man8/zinject.8 | 198 +
 man/man8/zpool-add.8 | 102 +
 man/man8/zpool-attach.8 | 103 +
 man/man8/zpool-checkpoint.8 | 82 +
 man/man8/zpool-clear.8 | 60 +
 man/man8/zpool-create.8 | 207 +
 man/man8/zpool-destroy.8 | 55 +
 man/man8/zpool-detach.8 | 61 +
 man/man8/zpool-events.8 | 71 +
 man/man8/zpool-export.8 | 83 +
 man/man8/zpool-get.8 | 102 +
 man/man8/zpool-history.8 | 64 +
 man/man8/zpool-import.8 | 388 +
 man/man8/zpool-initialize.8 | 81 +
 man/man8/zpool-iostat.8 | 247 +
 man/man8/zpool-labelclear.8 | 66 +
 man/man8/zpool-list.8 | 119 +
 man/man8/zpool-offline.8 | 89 +
 man/man8/zpool-online.8 | 1 +
 man/man8/zpool-reguid.8 | 53 +
 man/man8/zpool-remove.8 | 109 +
 man/man8/zpool-reopen.8 | 55 +
 man/man8/zpool-replace.8 | 105 +
 man/man8/zpool-resilver.8 | 59 +
 man/man8/zpool-scrub.8 | 105 +
 man/man8/zpool-set.8 | 1 +
 man/man8/zpool-split.8 | 124 +
 man/man8/zpool-status.8 | 138 +
 man/man8/zpool-sync.8 | 57 +
 man/man8/zpool-trim.8 | 94 +
 man/man8/zpool-upgrade.8 | 96 +
 man/man8/zpool-wait.8 | 114 +
 man/man8/zpool.8 | 568 +
 man/man8/zpoolconcepts.8 | 414 +
 man/man8/zpoolprops.8 | 368 +
 man/man8/zstream.8 | 110 +
 man/man8/zstreamdump.8 | 58 +
 module/.gitignore | 26 +
 module/Kbuild.in | 47 +
 module/Makefile.bsd | 359 +
 module/Makefile.in | 115 +
 module/avl/Makefile.in | 10 +
 module/avl/avl.c | 1093 +
 module/icp/Makefile.in | 96 +
 module/icp/algs/aes/aes_impl.c | 443 +
 module/icp/algs/aes/aes_impl_aesni.c | 124 +
 module/icp/algs/aes/aes_impl_generic.c | 1242 +
 module/icp/algs/aes/aes_impl_x86-64.c | 63 +
 module/icp/algs/aes/aes_modes.c | 135 +
 module/icp/algs/edonr/edonr.c | 746 +
 module/icp/algs/edonr/edonr_byteorder.h | 216 +
 module/icp/algs/modes/cbc.c | 273 +
 module/icp/algs/modes/ccm.c | 907 +
 module/icp/algs/modes/ctr.c | 228 +
 module/icp/algs/modes/ecb.c | 128 +
 module/icp/algs/modes/gcm.c | 1543 +
 module/icp/algs/modes/gcm_generic.c | 83 +
 module/icp/algs/modes/gcm_pclmulqdq.c | 64 +
 module/icp/algs/modes/modes.c | 157 +
 module/icp/algs/sha1/sha1.c | 835 +
 module/icp/algs/sha2/sha2.c | 956 +
 module/icp/algs/skein/THIRDPARTYLICENSE | 3 +
 module/icp/algs/skein/THIRDPARTYLICENSE.descrip | 1 +
 module/icp/algs/skein/skein.c | 911 +
 module/icp/algs/skein/skein_block.c | 790 +
 module/icp/algs/skein/skein_impl.h | 292 +
 module/icp/algs/skein/skein_iv.c | 185 +
 module/icp/algs/skein/skein_port.h | 116 +
 module/icp/api/kcf_cipher.c | 930 +
 module/icp/api/kcf_ctxops.c | 151 +
 module/icp/api/kcf_digest.c | 491 +
 module/icp/api/kcf_mac.c | 645 +
 module/icp/api/kcf_miscapi.c | 127 +
 .../icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman | 23 +
 .../aes/THIRDPARTYLICENSE.gladman.descrip | 1 +
 .../icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl | 127 +
 .../aes/THIRDPARTYLICENSE.openssl.descrip | 1 +
 module/icp/asm-x86_64/aes/aes_aesni.S | 748 +
 module/icp/asm-x86_64/aes/aes_amd64.S | 906 +
 module/icp/asm-x86_64/aes/aeskey.c | 580 +
 module/icp/asm-x86_64/aes/aesopt.h | 770 +
 module/icp/asm-x86_64/aes/aestab.h | 165 +
 module/icp/asm-x86_64/aes/aestab2.h | 594 +
 .../asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams | 36 +
 .../modes/THIRDPARTYLICENSE.cryptogams.descrip | 1 +
 .../icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl | 177 +
 .../modes/THIRDPARTYLICENSE.openssl.descrip | 1 +
 module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S | 1245 +
 module/icp/asm-x86_64/modes/gcm_pclmulqdq.S | 254 +
 module/icp/asm-x86_64/modes/ghash-x86_64.S | 714 +
 module/icp/asm-x86_64/sha1/sha1-x86_64.S | 1353 +
 module/icp/asm-x86_64/sha2/sha256_impl.S | 2063 ++
 module/icp/asm-x86_64/sha2/sha512_impl.S | 2088 ++
 module/icp/core/kcf_callprov.c | 1567 +
 module/icp/core/kcf_mech_tabs.c | 791 +
 module/icp/core/kcf_prov_lib.c | 227 +
 module/icp/core/kcf_prov_tabs.c | 645 +
 module/icp/core/kcf_sched.c | 1782 +
 module/icp/illumos-crypto.c | 158 +
 module/icp/include/aes/aes_impl.h | 227 +
 module/icp/include/modes/gcm_impl.h | 75 +
 module/icp/include/modes/modes.h | 411 +
 module/icp/include/sha1/sha1.h | 61 +
 module/icp/include/sha1/sha1_consts.h | 65 +
 module/icp/include/sha1/sha1_impl.h | 73 +
 module/icp/include/sha2/sha2_consts.h | 219 +
 module/icp/include/sha2/sha2_impl.h | 64 +
 module/icp/include/sys/asm_linkage.h | 46 +
 module/icp/include/sys/bitmap.h | 183 +
 module/icp/include/sys/crypto/elfsign.h | 137 +
 module/icp/include/sys/crypto/impl.h | 1363 +
 module/icp/include/sys/crypto/ioctl.h | 1480 +
 module/icp/include/sys/crypto/ioctladmin.h | 136 +
 module/icp/include/sys/crypto/ops_impl.h | 630 +
 module/icp/include/sys/crypto/sched_impl.h | 531 +
 module/icp/include/sys/crypto/spi.h | 726 +
 module/icp/include/sys/ia32/asm_linkage.h | 307 +
 module/icp/include/sys/ia32/stack.h | 160 +
 module/icp/include/sys/ia32/trap.h | 107 +
 module/icp/include/sys/modctl.h | 477 +
 module/icp/include/sys/modhash.h | 147 +
 module/icp/include/sys/modhash_impl.h | 108 +
 module/icp/include/sys/stack.h | 36 +
 module/icp/include/sys/trap.h | 36 +
 module/icp/io/aes.c | 1439 +
 module/icp/io/edonr_mod.c | 63 +
 module/icp/io/sha1_mod.c | 1230 +
 module/icp/io/sha2_mod.c | 1399 +
 module/icp/io/skein_mod.c | 729 +
 module/icp/os/modconf.c | 173 +
 module/icp/os/modhash.c | 927 +
 module/icp/spi/kcf_spi.c | 925 +
 module/lua/Makefile.in | 39 +
 module/lua/README.zfs | 80 +
 module/lua/lapi.c | 1345 +
 module/lua/lapi.h | 26 +
 module/lua/lauxlib.c | 800 +
 module/lua/lbaselib.c | 296 +
 module/lua/lcode.c | 884 +
 module/lua/lcode.h | 85 +
 module/lua/lcompat.c | 101 +
 module/lua/lcorolib.c | 159 +
 module/lua/lctype.c | 52 +
 module/lua/lctype.h | 94 +
 module/lua/ldebug.c | 609 +
 module/lua/ldebug.h | 36 +
 module/lua/ldo.c | 748 +
 module/lua/ldo.h | 47 +
 module/lua/lfunc.c | 160 +
 module/lua/lfunc.h | 35 +
 module/lua/lgc.c | 1218 +
 module/lua/lgc.h | 159 +
 module/lua/llex.c | 531 +
 module/lua/llex.h | 83 +
 module/lua/llimits.h | 323 +
 module/lua/lmem.c | 98 +
 module/lua/lmem.h | 56 +
 module/lua/lobject.c | 282 +
 module/lua/lobject.h | 605 +
 module/lua/lopcodes.c | 108 +
 module/lua/lopcodes.h | 290 +
 module/lua/lparser.c | 1643 +
 module/lua/lparser.h | 121 +
 module/lua/lstate.c | 320 +
 module/lua/lstate.h | 230 +
 module/lua/lstring.c | 186 +
 module/lua/lstring.h | 48 +
 module/lua/lstrlib.c | 1040 +
 module/lua/ltable.c | 592 +
 module/lua/ltable.h | 47 +
 module/lua/ltablib.c | 289 +
 module/lua/ltm.c | 76 +
 module/lua/ltm.h | 59 +
 module/lua/lvm.c | 932 +
 module/lua/lvm.h | 46 +
 module/lua/lzio.c | 74 +
 module/lua/lzio.h | 67 +
 module/lua/setjmp/setjmp.S | 19 +
 module/lua/setjmp/setjmp_aarch64.S | 86 +
 module/lua/setjmp/setjmp_arm.S | 84 +
 module/lua/setjmp/setjmp_i386.S | 69 +
 module/lua/setjmp/setjmp_mips.S | 105 +
 module/lua/setjmp/setjmp_ppc.S | 165 +
 module/lua/setjmp/setjmp_rv64g.S | 91 +
 module/lua/setjmp/setjmp_s390x.S | 64 +
 module/lua/setjmp/setjmp_sparc64.S | 105 +
 module/lua/setjmp/setjmp_x86_64.S | 77 +
 module/nvpair/Makefile.in | 13 +
 module/nvpair/fnvpair.c | 660 +
 module/nvpair/nvpair.c | 3729 ++
 module/nvpair/nvpair_alloc_fixed.c | 115 +
 module/nvpair/nvpair_alloc_spl.c | 96 +
 module/os/freebsd/spl/acl_common.c | 1709 +
 module/os/freebsd/spl/callb.c | 373 +
 module/os/freebsd/spl/list.c | 246 +
 module/os/freebsd/spl/sha224.h | 96 +
 module/os/freebsd/spl/sha256.h | 99 +
 module/os/freebsd/spl/sha256c.c | 378 +
 module/os/freebsd/spl/sha384.h | 96 +
 module/os/freebsd/spl/sha512.h | 101 +
 module/os/freebsd/spl/sha512c.c | 508 +
 module/os/freebsd/spl/sha512t.h | 143 +
 module/os/freebsd/spl/spl_acl.c | 223 +
 module/os/freebsd/spl/spl_atomic.c | 123 +
 module/os/freebsd/spl/spl_cmn_err.c | 77 +
 module/os/freebsd/spl/spl_dtrace.c | 38 +
 module/os/freebsd/spl/spl_kmem.c | 352 +
 module/os/freebsd/spl/spl_kstat.c | 351 +
 module/os/freebsd/spl/spl_misc.c | 113 +
 module/os/freebsd/spl/spl_policy.c | 437 +
 module/os/freebsd/spl/spl_procfs_list.c | 79 +
 module/os/freebsd/spl/spl_string.c | 107 +
 module/os/freebsd/spl/spl_sunddi.c | 75 +
 module/os/freebsd/spl/spl_sysevent.c | 262 +
 module/os/freebsd/spl/spl_taskq.c | 409 +
 module/os/freebsd/spl/spl_uio.c | 92 +
 module/os/freebsd/spl/spl_vfs.c | 285 +
 module/os/freebsd/spl/spl_vm.c | 75 +
 module/os/freebsd/spl/spl_zlib.c | 242 +
 module/os/freebsd/spl/spl_zone.c | 266 +
 module/os/freebsd/zfs/abd_os.c | 498 +
 module/os/freebsd/zfs/arc_os.c | 245 +
 module/os/freebsd/zfs/crypto_os.c | 611 +
 module/os/freebsd/zfs/dmu_os.c | 349 +
 module/os/freebsd/zfs/hkdf.c | 102 +
 module/os/freebsd/zfs/kmod_core.c | 381 +
 module/os/freebsd/zfs/spa_os.c | 281 +
 module/os/freebsd/zfs/spa_stats.c | 114 +
 module/os/freebsd/zfs/sysctl_os.c | 693 +
 module/os/freebsd/zfs/vdev_file.c | 328 +
 module/os/freebsd/zfs/vdev_geom.c | 1206 +
 module/os/freebsd/zfs/vdev_label_os.c | 74 +
 module/os/freebsd/zfs/zfs_acl.c | 2700 ++
 module/os/freebsd/zfs/zfs_ctldir.c | 1350 +
 module/os/freebsd/zfs/zfs_debug.c | 251 +
 module/os/freebsd/zfs/zfs_dir.c | 967 +
 module/os/freebsd/zfs/zfs_file_os.c | 309 +
 module/os/freebsd/zfs/zfs_ioctl_compat.c | 361 +
 module/os/freebsd/zfs/zfs_ioctl_os.c | 161 +
 module/os/freebsd/zfs/zfs_onexit_os.c | 70 +
 module/os/freebsd/zfs/zfs_vfsops.c | 2482 ++
 module/os/freebsd/zfs/zfs_vnops.c | 6629 ++++
 module/os/freebsd/zfs/zfs_znode.c | 2067 ++
 module/os/freebsd/zfs/zio_crypt.c | 1882 +
 module/os/freebsd/zfs/zvol_os.c | 1454 +
 module/os/linux/spl/Makefile.in | 17 +
 module/os/linux/spl/README.md | 16 +
 module/os/linux/spl/THIRDPARTYLICENSE.gplv2 | 339 +
 .../os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip | 1 +
 module/os/linux/spl/spl-atomic.c | 36 +
 module/os/linux/spl/spl-condvar.c | 466 +
 module/os/linux/spl/spl-cred.c | 196 +
 module/os/linux/spl/spl-err.c | 124 +
 module/os/linux/spl/spl-generic.c | 844 +
 module/os/linux/spl/spl-kmem-cache.c | 1469 +
 module/os/linux/spl/spl-kmem.c | 618 +
 module/os/linux/spl/spl-kstat.c | 778 +
 module/os/linux/spl/spl-proc.c | 791 +
 module/os/linux/spl/spl-procfs-list.c | 264 +
 module/os/linux/spl/spl-taskq.c | 1308 +
 module/os/linux/spl/spl-thread.c | 161 +
 module/os/linux/spl/spl-trace.c | 33 +
 module/os/linux/spl/spl-tsd.c | 720 +
 module/os/linux/spl/spl-vmem.c | 91 +
 module/os/linux/spl/spl-xdr.c | 513 +
 module/os/linux/spl/spl-zlib.c | 218 +
 module/os/linux/zfs/Makefile.in | 37 +
 module/os/linux/zfs/abd_os.c | 1074 +
 module/os/linux/zfs/arc_os.c | 465 +
 module/os/linux/zfs/mmp_os.c | 41 +
 module/os/linux/zfs/policy.c | 374 +
 module/os/linux/zfs/qat.c | 105 +
 module/os/linux/zfs/qat_compress.c | 569 +
 module/os/linux/zfs/qat_crypt.c | 630 +
 module/os/linux/zfs/spa_misc_os.c | 110 +
 module/os/linux/zfs/spa_stats.c | 1047 +
 module/os/linux/zfs/trace.c | 55 +
 module/os/linux/zfs/vdev_disk.c | 873 +
 module/os/linux/zfs/vdev_file.c | 348 +
 module/os/linux/zfs/zfs_acl.c | 2932 ++
 module/os/linux/zfs/zfs_ctldir.c | 1241 +
 module/os/linux/zfs/zfs_debug.c | 254 +
 module/os/linux/zfs/zfs_dir.c | 1224 +
 module/os/linux/zfs/zfs_file_os.c | 440 +
 module/os/linux/zfs/zfs_ioctl_os.c | 329 +
 module/os/linux/zfs/zfs_sysfs.c | 662 +
 module/os/linux/zfs/zfs_vfsops.c | 2165 ++
 module/os/linux/zfs/zfs_vnops.c | 5031 +++
 module/os/linux/zfs/zfs_znode.c | 2249 ++
 module/os/linux/zfs/zio_crypt.c | 2036 ++
 module/os/linux/zfs/zpl_ctldir.c | 551 +
 module/os/linux/zfs/zpl_export.c | 154 +
 module/os/linux/zfs/zpl_file.c | 1079 +
 module/os/linux/zfs/zpl_inode.c | 747 +
 module/os/linux/zfs/zpl_super.c | 326 +
 module/os/linux/zfs/zpl_xattr.c | 1480 +
 module/os/linux/zfs/zvol_os.c | 1125 +
 module/spl/Makefile.in | 13 +
 module/unicode/Makefile.in | 11 +
 module/unicode/u8_textprep.c | 2151 ++
 module/unicode/uconv.c | 863 +
 module/zcommon/Makefile.in | 29 +
 module/zcommon/cityhash.c | 67 +
 module/zcommon/zfeature_common.c | 602 +
 module/zcommon/zfs_comutil.c | 263 +
 module/zcommon/zfs_deleg.c | 249 +
 module/zcommon/zfs_fletcher.c | 940 +
 module/zcommon/zfs_fletcher_aarch64_neon.c | 215 +
 module/zcommon/zfs_fletcher_avx512.c | 225 +
 module/zcommon/zfs_fletcher_intel.c | 173 +
 module/zcommon/zfs_fletcher_sse.c | 232 +
 module/zcommon/zfs_fletcher_superscalar.c | 163 +
 module/zcommon/zfs_fletcher_superscalar4.c | 229 +
 module/zcommon/zfs_namecheck.c | 471 +
 module/zcommon/zfs_prop.c | 1052 +
 module/zcommon/zfs_uio.c | 287 +
 module/zcommon/zpool_prop.c | 275 +
 module/zcommon/zprop_common.c | 480 +
 module/zfs/Makefile.in | 153 +
 module/zfs/THIRDPARTYLICENSE.cityhash | 19 +
 module/zfs/THIRDPARTYLICENSE.cityhash.descrip | 1 +
 module/zfs/abd.c | 1213 +
 module/zfs/aggsum.c | 237 +
 module/zfs/arc.c | 10565 ++++++
 module/zfs/blkptr.c | 153 +
 module/zfs/bplist.c | 91 +
 module/zfs/bpobj.c | 943 +
 module/zfs/bptree.c | 303 +
 module/zfs/bqueue.c | 155 +
 module/zfs/btree.c | 2124 ++
 module/zfs/dataset_kstats.c | 215 +
 module/zfs/dbuf.c | 4741 +++
 module/zfs/dbuf_stats.c | 231 +
 module/zfs/ddt.c | 1187 +
 module/zfs/ddt_zap.c | 168 +
 module/zfs/dmu.c | 2482 ++
 module/zfs/dmu_diff.c | 240 +
 module/zfs/dmu_object.c | 525 +
 module/zfs/dmu_objset.c | 2992 ++
 module/zfs/dmu_recv.c | 3386 ++
 module/zfs/dmu_redact.c | 1186 +
 module/zfs/dmu_send.c | 3091 ++
 module/zfs/dmu_traverse.c | 786 +
 module/zfs/dmu_tx.c | 1404 +
 module/zfs/dmu_zfetch.c | 384 +
 module/zfs/dnode.c | 2575 ++
 module/zfs/dnode_sync.c | 846 +
 module/zfs/dsl_bookmark.c | 1742 +
 module/zfs/dsl_crypt.c | 2872 ++
 module/zfs/dsl_dataset.c | 5027 +++
 module/zfs/dsl_deadlist.c | 1012 +
 module/zfs/dsl_deleg.c | 774 +
 module/zfs/dsl_destroy.c | 1286 +
 module/zfs/dsl_dir.c | 2414 ++
 module/zfs/dsl_pool.c | 1384 +
 module/zfs/dsl_prop.c | 1287 +
 module/zfs/dsl_scan.c | 4426 +++
 module/zfs/dsl_synctask.c | 261 +
 module/zfs/dsl_userhold.c | 691 +
 module/zfs/edonr_zfs.c | 115 +
 module/zfs/fm.c | 1674 +
 module/zfs/gzip.c | 106 +
 module/zfs/hkdf.c | 171 +
 module/zfs/lz4.c | 1031 +
 module/zfs/lzjb.c | 132 +
 module/zfs/metaslab.c | 6236 ++++
 module/zfs/mmp.c | 728 +
 module/zfs/multilist.c | 431 +
 module/zfs/objlist.c | 84 +
 module/zfs/pathname.c | 96 +
 module/zfs/range_tree.c | 921 +
 module/zfs/refcount.c | 327 +
 module/zfs/rrwlock.c | 396 +
 module/zfs/sa.c | 2258 ++
 module/zfs/sha256.c | 105 +
 module/zfs/skein_zfs.c | 102 +
 module/zfs/spa.c | 9754 +++++
 module/zfs/spa_boot.c | 50 +
 module/zfs/spa_checkpoint.c | 636 +
 module/zfs/spa_config.c | 620 +
 module/zfs/spa_errlog.c | 416 +
 module/zfs/spa_history.c | 629 +
 module/zfs/spa_log_spacemap.c | 1322 +
 module/zfs/spa_misc.c | 2921 ++
 module/zfs/space_map.c | 1105 +
 module/zfs/space_reftree.c | 152 +
 module/zfs/txg.c | 1059 +
 module/zfs/uberblock.c | 74 +
 module/zfs/unique.c | 112 +
 module/zfs/vdev.c | 5090 +++
 module/zfs/vdev_cache.c | 437 +
 module/zfs/vdev_indirect.c | 1890 +
 module/zfs/vdev_indirect_births.c | 226 +
 module/zfs/vdev_indirect_mapping.c | 616 +
 module/zfs/vdev_initialize.c | 750 +
 module/zfs/vdev_label.c | 1901 +
 module/zfs/vdev_mirror.c | 863 +
 module/zfs/vdev_missing.c | 113 +
 module/zfs/vdev_queue.c | 1070 +
 module/zfs/vdev_raidz.c | 2421 ++
 module/zfs/vdev_raidz_math.c | 666 +
 module/zfs/vdev_raidz_math_aarch64_neon.c | 2279 ++
 module/zfs/vdev_raidz_math_aarch64_neon_common.h | 684 +
 module/zfs/vdev_raidz_math_aarch64_neonx2.c | 232 +
 module/zfs/vdev_raidz_math_avx2.c | 413 +
 module/zfs/vdev_raidz_math_avx512bw.c | 413 +
 module/zfs/vdev_raidz_math_avx512f.c | 494 +
 module/zfs/vdev_raidz_math_impl.h | 1477 +
 module/zfs/vdev_raidz_math_powerpc_altivec.c | 4337 +++
 .../zfs/vdev_raidz_math_powerpc_altivec_common.h | 690 +
 module/zfs/vdev_raidz_math_scalar.c | 337 +
 module/zfs/vdev_raidz_math_sse2.c | 631 +
 module/zfs/vdev_raidz_math_ssse3.c | 2477 ++
 module/zfs/vdev_rebuild.c | 1108 +
 module/zfs/vdev_removal.c | 2340 ++
 module/zfs/vdev_root.c | 158 +
 module/zfs/vdev_trim.c | 1701 +
 module/zfs/zap.c | 1384 +
 module/zfs/zap_leaf.c | 849 +
 module/zfs/zap_micro.c | 1697 +
 module/zfs/zcp.c | 1456 +
 module/zfs/zcp_get.c | 813 +
 module/zfs/zcp_global.c | 89 +
 module/zfs/zcp_iter.c | 751 +
 module/zfs/zcp_set.c | 100 +
 module/zfs/zcp_synctask.c | 544 +
 module/zfs/zfeature.c | 526 +
 module/zfs/zfs_byteswap.c | 211 +
 module/zfs/zfs_fm.c | 1083 +
 module/zfs/zfs_fuid.c | 815 +
 module/zfs/zfs_ioctl.c | 7629 ++++
 module/zfs/zfs_log.c | 774 +
 module/zfs/zfs_onexit.c | 173 +
 module/zfs/zfs_quota.c | 476 +
 module/zfs/zfs_ratelimit.c | 99 +
 module/zfs/zfs_replay.c | 992 +
 module/zfs/zfs_rlock.c | 691 +
 module/zfs/zfs_sa.c | 445 +
 module/zfs/zil.c | 3688 ++
 module/zfs/zio.c | 4969 +++
 module/zfs/zio_checksum.c | 570 +
 module/zfs/zio_compress.c | 220 +
 module/zfs/zio_inject.c | 966 +
 module/zfs/zle.c | 91 +
 module/zfs/zrlock.c | 188 +
 module/zfs/zthr.c | 536 +
 module/zfs/zvol.c | 1729 +
 module/zstd/Makefile.in | 35 +
 module/zstd/README.md | 65 +
 module/zstd/include/aarch64_compat.h | 37 +
 module/zstd/include/limits.h | 63 +
 module/zstd/include/stddef.h | 62 +
 module/zstd/include/stdint.h | 62 +
 module/zstd/include/stdio.h | 54 +
 module/zstd/include/stdlib.h | 58 +
 module/zstd/include/string.h | 62 +
 module/zstd/include/zstd_compat_wrapper.h | 460 +
 module/zstd/lib/zstd.c | 27826 +++++++++++++++
 module/zstd/lib/zstd.h | 2115 ++
 module/zstd/lib/zstd_errors.h | 94 +
 module/zstd/zfs_zstd.c | 738 +
 module/zstd/zstd-in.c | 68 +
 rpm/Makefile.am | 1 +
 rpm/generic/.gitignore | 3 +
 rpm/generic/Makefile.am | 1 +
 rpm/generic/zfs-dkms.spec.in | 102 +
 rpm/generic/zfs-kmod.spec.in | 164 +
 rpm/generic/zfs.spec.in | 523 +
 rpm/redhat/.gitignore | 3 +
 rpm/redhat/Makefile.am | 1 +
 rpm/redhat/zfs-dkms.spec.in | 1 +
 rpm/redhat/zfs-kmod.spec.in | 88 +
 rpm/redhat/zfs.spec.in | 1 +
 scripts/.gitignore | 1 +
 scripts/Makefile.am | 77 +
 scripts/commitcheck.sh | 218 +
 scripts/common.sh.in | 21 +
 scripts/cstyle.pl | 1018 +
 scripts/dkms.mkconf | 120 +
 scripts/dkms.postbuild | 24 +
 scripts/enum-extract.pl | 58 +
 scripts/kmodtool | 611 +
 scripts/make_gitrev.sh | 78 +
 scripts/man-dates.sh | 12 +
 scripts/paxcheck.sh | 44 +
 scripts/zfs-helpers.sh | 194 +
 scripts/zfs-tests.sh | 706 +
 scripts/zfs.sh | 279 +
 scripts/zfs2zol-patch.sed | 32 +
 scripts/zimport.sh | 517 +
 scripts/zloop.sh | 304 +
 scripts/zol2zfs-patch.sed | 20 +
 tests/Makefile.am | 3 +
 tests/README.md | 152 +
 tests/runfiles/Makefile.am | 8 +
 tests/runfiles/common.run | 900 +
 tests/runfiles/freebsd.run | 27 +
 tests/runfiles/linux.run | 179 +
 tests/runfiles/longevity.run | 23 +
 tests/runfiles/perf-regression.run | 33 +
 tests/runfiles/sunos.run | 53 +
 tests/test-runner/Makefile.am | 1 +
 tests/test-runner/bin/.gitignore | 2 +
 tests/test-runner/bin/Makefile.am | 8 +
 tests/test-runner/bin/test-runner.py.in | 1056 +
 tests/test-runner/bin/zts-report.py.in | 390 +
 tests/test-runner/include/Makefile.am | 5 +
 tests/test-runner/include/logapi.shlib | 530 +
 tests/test-runner/include/stf.shlib | 57 +
 tests/test-runner/man/Makefile.am | 4 +
 tests/test-runner/man/test-runner.1 | 386 +
 tests/zfs-tests/Makefile.am | 1 +
 tests/zfs-tests/callbacks/Makefile.am | 6 +
 tests/zfs-tests/callbacks/zfs_dbgmsg.ksh | 29 +
 tests/zfs-tests/callbacks/zfs_dmesg.ksh | 30 +
 tests/zfs-tests/callbacks/zfs_failsafe.ksh | 8 +
 tests/zfs-tests/callbacks/zfs_mmp.ksh | 37 +
 tests/zfs-tests/cmd/Makefile.am | 34 +
 tests/zfs-tests/cmd/btree_test/.gitignore | 1 +
 tests/zfs-tests/cmd/btree_test/Makefile.am | 32 +
 tests/zfs-tests/cmd/btree_test/btree_test.c | 554 +
 tests/zfs-tests/cmd/chg_usr_exec/.gitignore | 1 +
 tests/zfs-tests/cmd/chg_usr_exec/Makefile.am | 6 +
 tests/zfs-tests/cmd/chg_usr_exec/chg_usr_exec.c | 77 +
 tests/zfs-tests/cmd/devname2devid/.gitignore | 1 +
 tests/zfs-tests/cmd/devname2devid/Makefile.am | 10 +
 tests/zfs-tests/cmd/devname2devid/devname2devid.c | 160 +
 tests/zfs-tests/cmd/dir_rd_update/.gitignore | 1 +
 tests/zfs-tests/cmd/dir_rd_update/Makefile.am | 6 +
 tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c | 136 +
 tests/zfs-tests/cmd/file_check/.gitignore | 1 +
 tests/zfs-tests/cmd/file_check/Makefile.am | 6 +
 tests/zfs-tests/cmd/file_check/file_check.c | 86 +
 tests/zfs-tests/cmd/file_common.h | 70 +
 tests/zfs-tests/cmd/file_trunc/.gitignore | 1 +
 tests/zfs-tests/cmd/file_trunc/Makefile.am | 6 +
 tests/zfs-tests/cmd/file_trunc/file_trunc.c | 240 +
 tests/zfs-tests/cmd/file_write/.gitignore | 1 +
 tests/zfs-tests/cmd/file_write/Makefile.am | 6 +
 tests/zfs-tests/cmd/file_write/file_write.c | 258 +
 tests/zfs-tests/cmd/get_diff/.gitignore | 1 +
 tests/zfs-tests/cmd/get_diff/Makefile.am | 6 +
 tests/zfs-tests/cmd/get_diff/get_diff.c | 109 +
 tests/zfs-tests/cmd/largest_file/.gitignore | 1 +
 tests/zfs-tests/cmd/largest_file/Makefile.am | 6 +
 tests/zfs-tests/cmd/largest_file/largest_file.c | 145 +
 tests/zfs-tests/cmd/libzfs_input_check/.gitignore | 1 +
 tests/zfs-tests/cmd/libzfs_input_check/Makefile.am | 10 +
 .../cmd/libzfs_input_check/libzfs_input_check.c | 1060 +
 tests/zfs-tests/cmd/mkbusy/.gitignore | 1 +
 tests/zfs-tests/cmd/mkbusy/Makefile.am | 6 +
 tests/zfs-tests/cmd/mkbusy/mkbusy.c | 177 +
 tests/zfs-tests/cmd/mkfile/.gitignore | 1 +
 tests/zfs-tests/cmd/mkfile/Makefile.am | 8 +
 tests/zfs-tests/cmd/mkfile/mkfile.c | 282 +
 tests/zfs-tests/cmd/mkfiles/.gitignore | 1 +
 tests/zfs-tests/cmd/mkfiles/Makefile.am | 6 +
 tests/zfs-tests/cmd/mkfiles/mkfiles.c | 66 +
 tests/zfs-tests/cmd/mktree/.gitignore | 1 +
 tests/zfs-tests/cmd/mktree/Makefile.am | 6 +
 tests/zfs-tests/cmd/mktree/mktree.c | 191 +
 tests/zfs-tests/cmd/mmap_exec/.gitignore | 1 +
 tests/zfs-tests/cmd/mmap_exec/Makefile.am | 6 +
 tests/zfs-tests/cmd/mmap_exec/mmap_exec.c | 72 +
 tests/zfs-tests/cmd/mmap_libaio/.gitignore | 1 +
 tests/zfs-tests/cmd/mmap_libaio/Makefile.am | 10 +
 tests/zfs-tests/cmd/mmap_libaio/mmap_libaio.c | 88 +
 tests/zfs-tests/cmd/mmapwrite/.gitignore | 1 +
 tests/zfs-tests/cmd/mmapwrite/Makefile.am | 7 +
 tests/zfs-tests/cmd/mmapwrite/mmapwrite.c | 162 +
 tests/zfs-tests/cmd/nvlist_to_lua/.gitignore | 1 +
 tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am | 10 +
 tests/zfs-tests/cmd/nvlist_to_lua/nvlist_to_lua.c | 305 +
 tests/zfs-tests/cmd/randfree_file/.gitignore | 1 +
 tests/zfs-tests/cmd/randfree_file/Makefile.am | 6 +
 tests/zfs-tests/cmd/randfree_file/randfree_file.c | 125 +
 tests/zfs-tests/cmd/randwritecomp/.gitignore | 1 +
 tests/zfs-tests/cmd/randwritecomp/Makefile.am | 9 +
 tests/zfs-tests/cmd/randwritecomp/randwritecomp.c | 194 +
 tests/zfs-tests/cmd/readmmap/.gitignore | 1 +
 tests/zfs-tests/cmd/readmmap/Makefile.am | 6 +
 tests/zfs-tests/cmd/readmmap/readmmap.c | 138 +
 tests/zfs-tests/cmd/rename_dir/.gitignore | 1 +
 tests/zfs-tests/cmd/rename_dir/Makefile.am | 6 +
 tests/zfs-tests/cmd/rename_dir/rename_dir.c | 88 +
 tests/zfs-tests/cmd/rm_lnkcnt_zero_file/.gitignore | 1 +
 .../zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile.am | 7 +
.../cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c | 159 +
tests/zfs-tests/cmd/stride_dd/.gitignore | 1 +
tests/zfs-tests/cmd/stride_dd/Makefile.am | 7 +
tests/zfs-tests/cmd/stride_dd/stride_dd.c | 214 +
tests/zfs-tests/cmd/threadsappend/.gitignore | 1 +
tests/zfs-tests/cmd/threadsappend/Makefile.am | 7 +
tests/zfs-tests/cmd/threadsappend/threadsappend.c | 135 +
tests/zfs-tests/cmd/user_ns_exec/.gitignore | 1 +
tests/zfs-tests/cmd/user_ns_exec/Makefile.am | 6 +
tests/zfs-tests/cmd/user_ns_exec/user_ns_exec.c | 179 +
tests/zfs-tests/cmd/xattrtest/.gitignore | 1 +
tests/zfs-tests/cmd/xattrtest/Makefile.am | 6 +
tests/zfs-tests/cmd/xattrtest/xattrtest.c | 718 +
tests/zfs-tests/include/.gitignore | 1 +
tests/zfs-tests/include/Makefile.am | 14 +
tests/zfs-tests/include/blkdev.shlib | 621 +
tests/zfs-tests/include/commands.cfg | 218 +
tests/zfs-tests/include/default.cfg.in | 223 +
tests/zfs-tests/include/libtest.shlib | 4194 +++
tests/zfs-tests/include/math.shlib | 144 +
tests/zfs-tests/include/properties.shlib | 155 +
tests/zfs-tests/include/tunables.cfg | 89 +
tests/zfs-tests/include/zpool_script.shlib | 49 +
tests/zfs-tests/tests/Makefile.am | 1 +
tests/zfs-tests/tests/functional/Makefile.am | 90 +
tests/zfs-tests/tests/functional/acl/Makefile.am | 6 +
tests/zfs-tests/tests/functional/acl/acl.cfg | 64 +
.../tests/functional/acl/acl_common.kshlib | 630 +
.../tests/functional/acl/posix/Makefile.am | 7 +
.../tests/functional/acl/posix/cleanup.ksh | 33 +
.../tests/functional/acl/posix/posix_001_pos.ksh | 97 +
.../tests/functional/acl/posix/posix_002_pos.ksh | 76 +
.../tests/functional/acl/posix/posix_003_pos.ksh | 60 +
.../zfs-tests/tests/functional/acl/posix/setup.ksh | 52 +
.../tests/functional/alloc_class/Makefile.am | 21 +
.../tests/functional/alloc_class/alloc_class.cfg | 31 +
.../functional/alloc_class/alloc_class.kshlib | 67 +
.../functional/alloc_class/alloc_class_001_pos.ksh | 43 +
.../functional/alloc_class/alloc_class_002_neg.ksh | 48 +
.../functional/alloc_class/alloc_class_003_pos.ksh | 57 +
.../functional/alloc_class/alloc_class_004_pos.ksh | 67 +
.../functional/alloc_class/alloc_class_005_pos.ksh | 71 +
.../functional/alloc_class/alloc_class_006_pos.ksh | 41 +
.../functional/alloc_class/alloc_class_007_pos.ksh | 41 +
.../functional/alloc_class/alloc_class_008_pos.ksh | 56 +
.../functional/alloc_class/alloc_class_009_pos.ksh | 69 +
.../functional/alloc_class/alloc_class_010_pos.ksh | 50 +
.../functional/alloc_class/alloc_class_011_neg.ksh | 44 +
.../functional/alloc_class/alloc_class_012_pos.ksh | 122 +
.../functional/alloc_class/alloc_class_013_pos.ksh | 63 +
.../tests/functional/alloc_class/cleanup.ksh | 27 +
.../tests/functional/alloc_class/setup.ksh | 26 +
tests/zfs-tests/tests/functional/arc/Makefile.am | 8 +
.../functional/arc/arcstats_runtime_tuning.ksh | 46 +
tests/zfs-tests/tests/functional/arc/cleanup.ksh | 29 +
.../tests/functional/arc/dbufstats_001_pos.ksh | 84 +
.../tests/functional/arc/dbufstats_002_pos.ksh | 80 +
.../tests/functional/arc/dbufstats_003_pos.ksh | 43 +
tests/zfs-tests/tests/functional/arc/setup.ksh | 30 +
tests/zfs-tests/tests/functional/atime/Makefile.am | 14 +
tests/zfs-tests/tests/functional/atime/atime.cfg | 30 +
.../tests/functional/atime/atime_001_pos.ksh | 75 +
.../tests/functional/atime/atime_002_neg.ksh | 71 +
.../tests/functional/atime/atime_003_pos.ksh | 70 +
.../tests/functional/atime/atime_common.kshlib | 102 +
tests/zfs-tests/tests/functional/atime/cleanup.ksh | 30 +
.../tests/functional/atime/root_atime_off.ksh | 74 +
.../tests/functional/atime/root_atime_on.ksh | 78 +
.../tests/functional/atime/root_relatime_on.ksh | 76 +
tests/zfs-tests/tests/functional/atime/setup.ksh | 31 +
.../zfs-tests/tests/functional/bootfs/Makefile.am | 12 +
.../tests/functional/bootfs/bootfs_001_pos.ksh | 86 +
.../tests/functional/bootfs/bootfs_002_neg.ksh | 86 +
.../tests/functional/bootfs/bootfs_003_pos.ksh | 86 +
.../tests/functional/bootfs/bootfs_004_neg.ksh | 95 +
.../tests/functional/bootfs/bootfs_005_neg.ksh | 79 +
.../tests/functional/bootfs/bootfs_006_pos.ksh | 156 +
.../tests/functional/bootfs/bootfs_007_pos.ksh | 69 +
.../tests/functional/bootfs/bootfs_008_pos.ksh | 83 +
.../zfs-tests/tests/functional/bootfs/cleanup.ksh | 34 +
tests/zfs-tests/tests/functional/bootfs/setup.ksh | 34 +
tests/zfs-tests/tests/functional/btree/Makefile.am | 20 +
.../tests/functional/btree/btree_negative.ksh | 38 +
.../tests/functional/btree/btree_positive.ksh | 35 +
tests/zfs-tests/tests/functional/cache/Makefile.am | 20 +
tests/zfs-tests/tests/functional/cache/cache.cfg | 71 +
.../zfs-tests/tests/functional/cache/cache.kshlib | 154 +
.../tests/functional/cache/cache_001_pos.ksh | 65 +
.../tests/functional/cache/cache_002_pos.ksh | 65 +
.../tests/functional/cache/cache_003_pos.ksh | 69 +
.../tests/functional/cache/cache_004_neg.ksh | 64 +
.../tests/functional/cache/cache_005_neg.ksh | 65 +
.../tests/functional/cache/cache_006_pos.ksh | 85 +
.../tests/functional/cache/cache_007_neg.ksh | 63 +
.../tests/functional/cache/cache_008_neg.ksh | 67 +
.../tests/functional/cache/cache_009_pos.ksh | 69 +
.../tests/functional/cache/cache_010_pos.ksh | 104 +
.../tests/functional/cache/cache_011_pos.ksh | 68 +
.../tests/functional/cache/cache_012_pos.ksh | 110 +
tests/zfs-tests/tests/functional/cache/cleanup.ksh | 46 +
tests/zfs-tests/tests/functional/cache/setup.ksh | 41 +
.../tests/functional/cachefile/Makefile.am | 12 +
.../tests/functional/cachefile/cachefile.cfg | 37 +
.../tests/functional/cachefile/cachefile.kshlib | 46 +
.../functional/cachefile/cachefile_001_pos.ksh | 94 +
.../functional/cachefile/cachefile_002_pos.ksh | 82 +
.../functional/cachefile/cachefile_003_pos.ksh | 100 +
.../functional/cachefile/cachefile_004_pos.ksh | 124 +
.../tests/functional/cachefile/cleanup.ksh | 30 +
.../zfs-tests/tests/functional/cachefile/setup.ksh | 46 +
.../tests/functional/casenorm/Makefile.am | 25 +
.../tests/functional/casenorm/case_all_values.ksh | 42 +
.../tests/functional/casenorm/casenorm.cfg | 35 +
.../tests/functional/casenorm/casenorm.kshlib | 131 +
.../tests/functional/casenorm/cleanup.ksh | 20 +
.../casenorm/insensitive_formd_delete.ksh | 53 +
.../casenorm/insensitive_formd_lookup.ksh | 51 +
.../casenorm/insensitive_none_delete.ksh | 71 +
.../casenorm/insensitive_none_lookup.ksh | 66 +
.../functional/casenorm/mixed_create_failure.ksh | 136 +
.../functional/casenorm/mixed_formd_delete.ksh | 59 +
.../functional/casenorm/mixed_formd_lookup.ksh | 56 +
.../functional/casenorm/mixed_formd_lookup_ci.ksh | 51 +
.../functional/casenorm/mixed_none_delete.ksh | 56 +
.../functional/casenorm/mixed_none_lookup.ksh | 53 +
.../functional/casenorm/mixed_none_lookup_ci.ksh | 66 +
.../tests/functional/casenorm/norm_all_values.ksh | 61 +
.../functional/casenorm/sensitive_formd_delete.ksh | 59 +
.../functional/casenorm/sensitive_formd_lookup.ksh | 56 +
.../functional/casenorm/sensitive_none_delete.ksh | 56 +
.../functional/casenorm/sensitive_none_lookup.ksh | 53 +
.../zfs-tests/tests/functional/casenorm/setup.ksh | 27 +
.../tests/functional/channel_program/Makefile.am | 6 +
.../channel_program/channel_common.kshlib | 230 +
.../channel_program/lua_core/Makefile.am | 46 +
.../channel_program/lua_core/cleanup.ksh | 19 +
.../functional/channel_program/lua_core/setup.ksh | 21 +
.../channel_program/lua_core/tst.args_to_lua.ksh | 30 +
.../channel_program/lua_core/tst.args_to_lua.out | 1 +
.../channel_program/lua_core/tst.args_to_lua.zcp | 25 +
.../lua_core/tst.divide_by_zero.err | 4 +
.../lua_core/tst.divide_by_zero.ksh | 30 +
.../lua_core/tst.divide_by_zero.zcp | 16 +
.../channel_program/lua_core/tst.exists.ksh | 45 +
.../channel_program/lua_core/tst.exists.zcp | 26 +
.../lua_core/tst.integer_illegal.ksh | 41 +
.../lua_core/tst.integer_overflow.ksh | 32 +
.../lua_core/tst.language_functions_neg.ksh | 52 +
.../lua_core/tst.language_functions_pos.ksh | 42 +
.../channel_program/lua_core/tst.large_prog.ksh | 30 +
.../channel_program/lua_core/tst.large_prog.out | 1 +
.../channel_program/lua_core/tst.large_prog.zcp | 280 +
.../channel_program/lua_core/tst.lib_base.lua | 469 +
.../channel_program/lua_core/tst.lib_coroutine.lua | 362 +
.../channel_program/lua_core/tst.lib_strings.lua | 241 +
.../channel_program/lua_core/tst.lib_table.lua | 252 +
.../channel_program/lua_core/tst.libraries.ksh | 31 +
.../channel_program/lua_core/tst.memory_limit.ksh | 77 +
.../channel_program/lua_core/tst.nested_neg.ksh | 30 +
.../channel_program/lua_core/tst.nested_neg.zcp | 770 +
.../channel_program/lua_core/tst.nested_pos.ksh | 23 +
.../channel_program/lua_core/tst.nested_pos.zcp | 71 +
.../channel_program/lua_core/tst.nvlist_to_lua.ksh | 30 +
.../channel_program/lua_core/tst.recursive.zcp | 31 +
.../channel_program/lua_core/tst.recursive_neg.ksh | 24 +
.../channel_program/lua_core/tst.recursive_pos.ksh | 23 +
.../channel_program/lua_core/tst.return_large.ksh | 54 +
.../channel_program/lua_core/tst.return_large.zcp | 24 +
.../lua_core/tst.return_nvlist_neg.ksh | 63 +
.../lua_core/tst.return_nvlist_pos.ksh | 57 +
.../lua_core/tst.return_recursive_table.ksh | 31 +
.../lua_core/tst.return_recursive_table.zcp | 21 +
.../channel_program/lua_core/tst.stack_gsub.err | 18 +
.../channel_program/lua_core/tst.stack_gsub.ksh | 33 +
.../channel_program/lua_core/tst.stack_gsub.zcp | 20 +
.../channel_program/lua_core/tst.timeout.ksh | 55 +
.../channel_program/lua_core/tst.timeout.zcp | 22 +
.../channel_program/synctask_core/Makefile.am | 53 +
.../channel_program/synctask_core/cleanup.ksh | 22 +
.../channel_program/synctask_core/setup.ksh | 25 +
.../synctask_core/tst.bookmark.copy.ksh | 45 +
.../synctask_core/tst.bookmark.copy.zcp | 32 +
.../synctask_core/tst.bookmark.create.ksh | 43 +
.../synctask_core/tst.bookmark.create.zcp | 26 +
.../synctask_core/tst.destroy_fs.ksh | 45 +
.../synctask_core/tst.destroy_snap.ksh | 44 +
.../synctask_core/tst.get_count_and_limit.ksh | 89 +
.../synctask_core/tst.get_index_props.ksh | 41 +
.../synctask_core/tst.get_index_props.out | 4 +
.../synctask_core/tst.get_index_props.zcp | 77 +
.../synctask_core/tst.get_mountpoint.ksh | 88 +
.../channel_program/synctask_core/tst.get_neg.ksh | 43 +
.../synctask_core/tst.get_number_props.ksh | 52 +
.../synctask_core/tst.get_number_props.out | 5 +
.../synctask_core/tst.get_number_props.zcp | 101 +
.../synctask_core/tst.get_string_props.ksh | 45 +
.../synctask_core/tst.get_string_props.out | 5 +
.../synctask_core/tst.get_string_props.zcp | 73 +
.../channel_program/synctask_core/tst.get_type.ksh | 54 +
.../synctask_core/tst.get_userquota.ksh | 80 +
.../synctask_core/tst.get_written.ksh | 57 +
.../channel_program/synctask_core/tst.inherit.ksh | 39 +
.../synctask_core/tst.list_bookmarks.ksh | 120 +
.../synctask_core/tst.list_children.ksh | 125 +
.../synctask_core/tst.list_clones.ksh | 116 +
.../synctask_core/tst.list_holds.ksh | 121 +
.../synctask_core/tst.list_snapshots.ksh | 112 +
.../synctask_core/tst.list_system_props.ksh | 54 +
.../synctask_core/tst.list_user_props.ksh | 147 +
.../synctask_core/tst.parse_args_neg.ksh | 50 +
.../synctask_core/tst.promote_conflict.ksh | 55 +
.../synctask_core/tst.promote_conflict.zcp | 23 +
.../synctask_core/tst.promote_multiple.ksh | 71 +
.../synctask_core/tst.promote_simple.ksh | 47 +
.../synctask_core/tst.rollback_mult.ksh | 60 +
.../synctask_core/tst.rollback_one.ksh | 49 +
.../synctask_core/tst.set_props.ksh | 39 +
.../synctask_core/tst.set_props.zcp | 109 +
.../synctask_core/tst.snapshot_destroy.ksh | 39 +
.../synctask_core/tst.snapshot_destroy.zcp | 24 +
.../synctask_core/tst.snapshot_neg.ksh | 45 +
.../synctask_core/tst.snapshot_neg.zcp | 35 +
.../synctask_core/tst.snapshot_recursive.ksh | 61 +
.../synctask_core/tst.snapshot_recursive.zcp | 28 +
.../synctask_core/tst.snapshot_simple.ksh | 40 +
.../synctask_core/tst.snapshot_simple.zcp | 26 +
.../synctask_core/tst.terminate_by_signal.ksh | 98 +
.../zfs-tests/tests/functional/chattr/Makefile.am | 6 +
.../tests/functional/chattr/chattr_001_pos.ksh | 94 +
.../tests/functional/chattr/chattr_002_neg.ksh | 81 +
.../zfs-tests/tests/functional/chattr/cleanup.ksh | 37 +
tests/zfs-tests/tests/functional/chattr/setup.ksh | 56 +
.../zfs-tests/tests/functional/checksum/.gitignore | 4 +
.../tests/functional/checksum/Makefile.am | 32 +
.../tests/functional/checksum/cleanup.ksh | 38 +
.../tests/functional/checksum/default.cfg | 36 +
.../tests/functional/checksum/edonr_test.c | 218 +
.../tests/functional/checksum/filetest_001_pos.ksh | 117 +
.../tests/functional/checksum/run_edonr_test.ksh | 30 +
.../tests/functional/checksum/run_sha2_test.ksh | 30 +
.../tests/functional/checksum/run_skein_test.ksh | 30 +
.../zfs-tests/tests/functional/checksum/setup.ksh | 36 +
.../tests/functional/checksum/sha2_test.c | 250 +
.../tests/functional/checksum/skein_test.c | 340 +
.../tests/functional/clean_mirror/Makefile.am | 12 +
.../clean_mirror/clean_mirror_001_pos.ksh | 53 +
.../clean_mirror/clean_mirror_002_pos.ksh | 53 +
.../clean_mirror/clean_mirror_003_pos.ksh | 53 +
.../clean_mirror/clean_mirror_004_pos.ksh | 53 +
.../clean_mirror/clean_mirror_common.kshlib | 83 +
.../tests/functional/clean_mirror/cleanup.ksh | 41 +
.../tests/functional/clean_mirror/default.cfg | 37 +
.../tests/functional/clean_mirror/setup.ksh | 41 +
.../tests/functional/cli_root/Makefile.am | 66 +
.../tests/functional/cli_root/cli_common.kshlib | 90 +
.../tests/functional/cli_root/zdb/Makefile.am | 17 +
.../tests/functional/cli_root/zdb/zdb_002_pos.ksh | 51 +
.../tests/functional/cli_root/zdb/zdb_003_pos.ksh | 72 +
.../tests/functional/cli_root/zdb/zdb_004_pos.ksh | 93 +
.../tests/functional/cli_root/zdb/zdb_005_pos.ksh | 78 +
.../tests/functional/cli_root/zdb/zdb_006_pos.ksh | 64 +
.../tests/functional/cli_root/zdb/zdb_args_neg.ksh | 83 +
.../tests/functional/cli_root/zdb/zdb_args_pos.ksh | 104 +
.../cli_root/zdb/zdb_block_size_histogram.ksh | 272 +
.../tests/functional/cli_root/zdb/zdb_checksum.ksh | 64 +
.../functional/cli_root/zdb/zdb_decompress.ksh | 119 +
.../cli_root/zdb/zdb_decompress_zstd.ksh | 114 +
.../functional/cli_root/zdb/zdb_display_block.ksh | 128 +
.../cli_root/zdb/zdb_object_range_neg.ksh | 72 +
.../cli_root/zdb/zdb_object_range_pos.ksh | 171 +
.../functional/cli_root/zdb/zdb_objset_id.ksh | 96 +
.../tests/functional/cli_root/zfs/Makefile.am | 7 +
.../tests/functional/cli_root/zfs/cleanup.ksh | 30 +
.../tests/functional/cli_root/zfs/setup.ksh | 32 +
.../tests/functional/cli_root/zfs/zfs_001_neg.ksh | 82 +
.../tests/functional/cli_root/zfs/zfs_002_pos.ksh | 123 +
.../tests/functional/cli_root/zfs/zfs_003_neg.ksh | 68 +
.../functional/cli_root/zfs_bookmark/Makefile.am | 5 +
.../functional/cli_root/zfs_bookmark/cleanup.ksh | 31 +
.../functional/cli_root/zfs_bookmark/setup.ksh | 35 +
.../cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh | 238 +
.../functional/cli_root/zfs_change-key/Makefile.am | 12 +
.../functional/cli_root/zfs_change-key/cleanup.ksh | 30 +
.../functional/cli_root/zfs_change-key/setup.ksh | 32 +
.../cli_root/zfs_change-key/zfs_change-key.ksh | 62 +
.../zfs_change-key/zfs_change-key_child.ksh | 91 +
.../zfs_change-key/zfs_change-key_clones.ksh | 80 +
.../zfs_change-key/zfs_change-key_format.ksh | 72 +
.../zfs_change-key/zfs_change-key_inherit.ksh | 78 +
.../zfs_change-key/zfs_change-key_load.ksh | 58 +
.../zfs_change-key/zfs_change-key_location.ksh | 65 +
.../zfs_change-key/zfs_change-key_pbkdf2iters.ksh | 75 +
.../functional/cli_root/zfs_clone/Makefile.am | 17 +
.../functional/cli_root/zfs_clone/cleanup.ksh | 30 +
.../tests/functional/cli_root/zfs_clone/setup.ksh | 36 +
.../cli_root/zfs_clone/zfs_clone_001_neg.ksh | 126 +
.../cli_root/zfs_clone/zfs_clone_002_pos.ksh | 89 +
.../cli_root/zfs_clone/zfs_clone_003_pos.ksh | 76 +
.../cli_root/zfs_clone/zfs_clone_004_pos.ksh | 87 +
.../cli_root/zfs_clone/zfs_clone_005_pos.ksh | 78 +
.../cli_root/zfs_clone/zfs_clone_006_pos.ksh | 86 +
.../cli_root/zfs_clone/zfs_clone_007_pos.ksh | 85 +
.../cli_root/zfs_clone/zfs_clone_008_neg.ksh | 80 +
.../cli_root/zfs_clone/zfs_clone_009_neg.ksh | 80 +
.../cli_root/zfs_clone/zfs_clone_010_pos.ksh | 229 +
.../cli_root/zfs_clone/zfs_clone_deeply_nested.ksh | 69 +
.../cli_root/zfs_clone/zfs_clone_encrypted.ksh | 83 +
.../cli_root/zfs_clone/zfs_clone_rm_nested.ksh | 77 +
.../functional/cli_root/zfs_copies/Makefile.am | 14 +
.../functional/cli_root/zfs_copies/cleanup.ksh | 44 +
.../tests/functional/cli_root/zfs_copies/setup.ksh | 31 +
.../functional/cli_root/zfs_copies/zfs_copies.cfg | 37 +
.../cli_root/zfs_copies/zfs_copies.kshlib | 158 +
.../cli_root/zfs_copies/zfs_copies_001_pos.ksh | 116 +
.../cli_root/zfs_copies/zfs_copies_002_pos.ksh | 119 +
.../cli_root/zfs_copies/zfs_copies_003_pos.ksh | 68 +
.../cli_root/zfs_copies/zfs_copies_004_neg.ksh | 58 +
.../cli_root/zfs_copies/zfs_copies_005_neg.ksh | 80 +
.../cli_root/zfs_copies/zfs_copies_006_pos.ksh | 77 +
.../functional/cli_root/zfs_create/Makefile.am | 27 +
.../functional/cli_root/zfs_create/cleanup.ksh | 30 +
.../cli_root/zfs_create/properties.kshlib | 73 +
.../tests/functional/cli_root/zfs_create/setup.ksh | 32 +
.../functional/cli_root/zfs_create/zfs_create.cfg | 65 +
.../cli_root/zfs_create/zfs_create_001_pos.ksh | 71 +
.../cli_root/zfs_create/zfs_create_002_pos.ksh | 93 +
.../cli_root/zfs_create/zfs_create_003_pos.ksh | 69 +
.../cli_root/zfs_create/zfs_create_004_pos.ksh | 73 +
.../cli_root/zfs_create/zfs_create_005_pos.ksh | 85 +
.../cli_root/zfs_create/zfs_create_006_pos.ksh | 84 +
.../cli_root/zfs_create/zfs_create_007_pos.ksh | 91 +
.../cli_root/zfs_create/zfs_create_008_neg.ksh | 113 +
.../cli_root/zfs_create/zfs_create_009_neg.ksh | 130 +
.../cli_root/zfs_create/zfs_create_010_neg.ksh | 149 +
.../cli_root/zfs_create/zfs_create_011_pos.ksh | 70 +
.../cli_root/zfs_create/zfs_create_012_pos.ksh | 71 +
.../cli_root/zfs_create/zfs_create_013_pos.ksh | 82 +
.../cli_root/zfs_create/zfs_create_014_pos.ksh | 59 +
.../cli_root/zfs_create/zfs_create_common.kshlib | 54 +
.../zfs_create/zfs_create_crypt_combos.ksh | 99 +
.../cli_root/zfs_create/zfs_create_dryrun.ksh | 169 +
.../cli_root/zfs_create/zfs_create_encrypted.ksh | 136 +
.../cli_root/zfs_create/zfs_create_verbose.ksh | 165 +
.../functional/cli_root/zfs_destroy/Makefile.am | 29 +
.../functional/cli_root/zfs_destroy/cleanup.ksh | 30 +
.../functional/cli_root/zfs_destroy/setup.ksh | 32 +
.../zfs_clone_livelist_condense_and_disable.ksh | 125 +
.../zfs_clone_livelist_condense_races.ksh | 117 +
.../cli_root/zfs_destroy/zfs_destroy.cfg | 39 +
.../cli_root/zfs_destroy/zfs_destroy_001_pos.ksh | 235 +
.../cli_root/zfs_destroy/zfs_destroy_002_pos.ksh | 95 +
.../cli_root/zfs_destroy/zfs_destroy_003_pos.ksh | 156 +
.../cli_root/zfs_destroy/zfs_destroy_004_pos.ksh | 126 +
.../cli_root/zfs_destroy/zfs_destroy_005_neg.ksh | 204 +
.../cli_root/zfs_destroy/zfs_destroy_006_neg.ksh | 67 +
.../cli_root/zfs_destroy/zfs_destroy_007_neg.ksh | 76 +
.../cli_root/zfs_destroy/zfs_destroy_008_pos.ksh | 64 +
.../cli_root/zfs_destroy/zfs_destroy_009_pos.ksh | 73 +
.../cli_root/zfs_destroy/zfs_destroy_010_pos.ksh | 77 +
.../cli_root/zfs_destroy/zfs_destroy_011_pos.ksh | 63 +
.../cli_root/zfs_destroy/zfs_destroy_012_pos.ksh | 75 +
.../cli_root/zfs_destroy/zfs_destroy_013_neg.ksh | 62 +
.../cli_root/zfs_destroy/zfs_destroy_014_pos.ksh | 79 +
.../cli_root/zfs_destroy/zfs_destroy_015_pos.ksh | 161 +
.../cli_root/zfs_destroy/zfs_destroy_016_pos.ksh | 186 +
.../zfs_destroy/zfs_destroy_clone_livelist.ksh | 164 +
.../cli_root/zfs_destroy/zfs_destroy_common.kshlib | 174 +
.../zfs_destroy/zfs_destroy_dev_removal.ksh | 68 +
.../zfs_destroy_dev_removal_condense.ksh | 94 +
.../tests/functional/cli_root/zfs_diff/.gitignore | 1 +
.../tests/functional/cli_root/zfs_diff/Makefile.am | 17 +
.../tests/functional/cli_root/zfs_diff/cleanup.ksh | 19 +
.../tests/functional/cli_root/zfs_diff/setup.ksh | 21 +
.../tests/functional/cli_root/zfs_diff/socket.c | 58 +
.../cli_root/zfs_diff/zfs_diff_changes.ksh | 96 +
.../cli_root/zfs_diff/zfs_diff_cliargs.ksh | 80 +
.../cli_root/zfs_diff/zfs_diff_encrypted.ksh | 63 +
.../cli_root/zfs_diff/zfs_diff_timestamp.ksh | 104 +
.../cli_root/zfs_diff/zfs_diff_types.ksh | 134 +
.../tests/functional/cli_root/zfs_get/Makefile.am | 18 +
.../tests/functional/cli_root/zfs_get/cleanup.ksh | 30 +
.../tests/functional/cli_root/zfs_get/setup.ksh | 32 +
.../cli_root/zfs_get/zfs_get_001_pos.ksh | 163 +
.../cli_root/zfs_get/zfs_get_002_pos.ksh | 97 +
.../cli_root/zfs_get/zfs_get_003_pos.ksh | 65 +
.../cli_root/zfs_get/zfs_get_004_pos.ksh | 227 +
.../cli_root/zfs_get/zfs_get_005_neg.ksh | 156 +
.../cli_root/zfs_get/zfs_get_006_neg.ksh | 68 +
.../cli_root/zfs_get/zfs_get_007_neg.ksh | 64 +
.../cli_root/zfs_get/zfs_get_008_pos.ksh | 117 +
.../cli_root/zfs_get/zfs_get_009_pos.ksh | 112 +
.../cli_root/zfs_get/zfs_get_010_neg.ksh | 59 +
.../cli_root/zfs_get/zfs_get_common.kshlib | 106 +
.../cli_root/zfs_get/zfs_get_list_d.kshlib | 84 +
.../cli_root/zfs_ids_to_path/Makefile.am | 5 +
.../cli_root/zfs_ids_to_path/cleanup.ksh | 29 +
.../functional/cli_root/zfs_ids_to_path/setup.ksh | 31 +
.../zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh | 96 +
.../functional/cli_root/zfs_inherit/Makefile.am | 8 +
.../functional/cli_root/zfs_inherit/cleanup.ksh | 30 +
.../functional/cli_root/zfs_inherit/setup.ksh | 32 +
.../cli_root/zfs_inherit/zfs_inherit_001_neg.ksh | 82 +
.../cli_root/zfs_inherit/zfs_inherit_002_neg.ksh | 107 +
.../cli_root/zfs_inherit/zfs_inherit_003_pos.ksh | 90 +
.../zfs_inherit/zfs_inherit_mountpoint.ksh | 62 +
.../tests/functional/cli_root/zfs_jail/Makefile.am | 6 +
.../tests/functional/cli_root/zfs_jail/cleanup.ksh | 30 +
.../tests/functional/cli_root/zfs_jail/jail.conf | 9 +
.../tests/functional/cli_root/zfs_jail/setup.ksh | 32 +
.../cli_root/zfs_jail/zfs_jail_001_pos.ksh | 84 +
.../functional/cli_root/zfs_load-key/Makefile.am | 14 +
.../functional/cli_root/zfs_load-key/cleanup.ksh | 30 +
.../functional/cli_root/zfs_load-key/setup.ksh | 32 +
.../cli_root/zfs_load-key/zfs_load-key.cfg | 26 +
.../cli_root/zfs_load-key/zfs_load-key.ksh | 85 +
.../cli_root/zfs_load-key/zfs_load-key_all.ksh | 77 +
.../zfs_load-key/zfs_load-key_common.kshlib | 101 +
.../cli_root/zfs_load-key/zfs_load-key_file.ksh | 58 +
.../zfs_load-key/zfs_load-key_location.ksh | 73 +
.../cli_root/zfs_load-key/zfs_load-key_noop.ksh | 54 +
.../zfs_load-key/zfs_load-key_recursive.ksh | 66 +
.../functional/cli_root/zfs_mount/Makefile.am | 27 +
.../functional/cli_root/zfs_mount/cleanup.ksh | 30 +
.../tests/functional/cli_root/zfs_mount/setup.ksh | 32 +
.../functional/cli_root/zfs_mount/zfs_mount.cfg | 39 +
.../functional/cli_root/zfs_mount/zfs_mount.kshlib | 134 +
.../cli_root/zfs_mount/zfs_mount_001_pos.ksh | 67 +
.../cli_root/zfs_mount/zfs_mount_002_pos.ksh | 78 +
.../cli_root/zfs_mount/zfs_mount_003_pos.ksh | 90 +
.../cli_root/zfs_mount/zfs_mount_004_pos.ksh | 83 +
.../cli_root/zfs_mount/zfs_mount_005_pos.ksh | 99 +
.../cli_root/zfs_mount/zfs_mount_006_pos.ksh | 147 +
.../cli_root/zfs_mount/zfs_mount_007_pos.ksh | 163 +
.../cli_root/zfs_mount/zfs_mount_008_pos.ksh | 97 +
.../cli_root/zfs_mount/zfs_mount_009_neg.ksh | 111 +
.../cli_root/zfs_mount/zfs_mount_010_neg.ksh | 75 +
.../cli_root/zfs_mount/zfs_mount_011_neg.ksh | 81 +
.../cli_root/zfs_mount/zfs_mount_012_pos.ksh | 53 +
.../cli_root/zfs_mount/zfs_mount_all_001_pos.ksh | 202 +
.../cli_root/zfs_mount/zfs_mount_all_fail.ksh | 105 +
.../zfs_mount/zfs_mount_all_mountpoints.ksh | 162 +
.../cli_root/zfs_mount/zfs_mount_encrypted.ksh | 69 +
.../cli_root/zfs_mount/zfs_mount_remount.ksh | 171 +
.../cli_root/zfs_mount/zfs_mount_test_race.ksh | 116 +
.../cli_root/zfs_mount/zfs_multi_mount.ksh | 109 +
.../functional/cli_root/zfs_program/Makefile.am | 5 +
.../functional/cli_root/zfs_program/cleanup.ksh | 30 +
.../functional/cli_root/zfs_program/setup.ksh | 32 +
.../cli_root/zfs_program/zfs_program_json.ksh | 148 +
.../functional/cli_root/zfs_promote/Makefile.am | 16 +
.../functional/cli_root/zfs_promote/cleanup.ksh | 30 +
.../functional/cli_root/zfs_promote/setup.ksh | 32 +
.../cli_root/zfs_promote/zfs_promote.cfg | 43 +
.../cli_root/zfs_promote/zfs_promote_001_pos.ksh | 127 +
.../cli_root/zfs_promote/zfs_promote_002_pos.ksh | 102 +
.../cli_root/zfs_promote/zfs_promote_003_pos.ksh | 134 +
.../cli_root/zfs_promote/zfs_promote_004_pos.ksh | 140 +
.../cli_root/zfs_promote/zfs_promote_005_pos.ksh | 73 +
.../cli_root/zfs_promote/zfs_promote_006_neg.ksh | 90 +
.../cli_root/zfs_promote/zfs_promote_007_neg.ksh | 78 +
.../cli_root/zfs_promote/zfs_promote_008_pos.ksh | 84 +
.../zfs_promote/zfs_promote_encryptionroot.ksh | 93 +
.../functional/cli_root/zfs_property/Makefile.am | 5 +
.../functional/cli_root/zfs_property/cleanup.ksh | 34 +
.../functional/cli_root/zfs_property/setup.ksh | 36 +
.../zfs_property/zfs_written_property_001_pos.ksh | 230 +
.../functional/cli_root/zfs_receive/Makefile.am | 31 +
.../functional/cli_root/zfs_receive/cleanup.ksh | 33 +
.../zfs_receive/receive-o-x_props_override.ksh | 320 +
.../functional/cli_root/zfs_receive/setup.ksh | 36 +
.../cli_root/zfs_receive/zfs_receive_-e.ksh | 106 +
.../cli_root/zfs_receive/zfs_receive_001_pos.ksh | 176 +
.../cli_root/zfs_receive/zfs_receive_002_pos.ksh | 108 +
.../cli_root/zfs_receive/zfs_receive_003_pos.ksh | 93 +
.../cli_root/zfs_receive/zfs_receive_004_neg.ksh | 110 +
.../cli_root/zfs_receive/zfs_receive_005_neg.ksh | 99 +
.../cli_root/zfs_receive/zfs_receive_006_pos.ksh | 108 +
.../cli_root/zfs_receive/zfs_receive_007_neg.ksh | 84 +
.../cli_root/zfs_receive/zfs_receive_008_pos.ksh | 148 +
.../cli_root/zfs_receive/zfs_receive_009_neg.ksh | 118 +
.../cli_root/zfs_receive/zfs_receive_010_pos.ksh | 177 +
.../cli_root/zfs_receive/zfs_receive_011_pos.ksh | 88 +
.../cli_root/zfs_receive/zfs_receive_012_pos.ksh | 83 +
.../cli_root/zfs_receive/zfs_receive_013_pos.ksh | 75 +
.../cli_root/zfs_receive/zfs_receive_014_pos.ksh | 122 +
.../cli_root/zfs_receive/zfs_receive_015_pos.ksh | 83 +
.../cli_root/zfs_receive/zfs_receive_016_pos.ksh | 85 +
.../zfs_receive/zfs_receive_from_encrypted.ksh | 83 +
.../cli_root/zfs_receive/zfs_receive_from_zstd.ksh | 112 +
.../cli_root/zfs_receive/zfs_receive_raw.ksh | 94 +
.../cli_root/zfs_receive/zfs_receive_raw_-d.ksh | 62 +
.../zfs_receive/zfs_receive_raw_incremental.ksh | 96 +
.../zfs_receive/zfs_receive_to_encrypted.ksh | 77 +
.../cli_root/zfs_receive/zstd_test_data.txt | 1 +
.../functional/cli_root/zfs_rename/Makefile.am | 25 +
.../functional/cli_root/zfs_rename/cleanup.ksh | 43 +
.../tests/functional/cli_root/zfs_rename/setup.ksh | 49 +
.../functional/cli_root/zfs_rename/zfs_rename.cfg | 39 +
.../cli_root/zfs_rename/zfs_rename.kshlib | 129 +
.../cli_root/zfs_rename/zfs_rename_001_pos.ksh | 109 +
.../cli_root/zfs_rename/zfs_rename_002_pos.ksh | 92 +
.../cli_root/zfs_rename/zfs_rename_003_pos.ksh | 68 +
.../cli_root/zfs_rename/zfs_rename_004_neg.ksh | 112 +
.../cli_root/zfs_rename/zfs_rename_005_neg.ksh | 91 +
.../cli_root/zfs_rename/zfs_rename_006_pos.ksh | 85 +
.../cli_root/zfs_rename/zfs_rename_007_pos.ksh | 155 +
.../cli_root/zfs_rename/zfs_rename_008_pos.ksh | 88 +
.../cli_root/zfs_rename/zfs_rename_009_neg.ksh | 86 +
.../cli_root/zfs_rename/zfs_rename_010_neg.ksh | 73 +
.../cli_root/zfs_rename/zfs_rename_011_pos.ksh | 78 +
.../cli_root/zfs_rename/zfs_rename_012_neg.ksh | 67 +
.../cli_root/zfs_rename/zfs_rename_013_pos.ksh | 85 +
.../cli_root/zfs_rename/zfs_rename_014_neg.ksh | 110 +
.../zfs_rename/zfs_rename_encrypted_child.ksh | 78 +
.../cli_root/zfs_rename/zfs_rename_mountpoint.ksh | 88 +
.../zfs_rename/zfs_rename_to_encrypted.ksh | 53 +
.../cli_root/zfs_reservation/Makefile.am | 6 +
.../cli_root/zfs_reservation/cleanup.ksh | 30 +
.../functional/cli_root/zfs_reservation/setup.ksh | 32 +
.../zfs_reservation/zfs_reservation_001_pos.ksh | 64 +
.../zfs_reservation/zfs_reservation_002_pos.ksh | 88 +
.../functional/cli_root/zfs_rollback/Makefile.am | 12 +
.../functional/cli_root/zfs_rollback/cleanup.ksh | 32 +
.../functional/cli_root/zfs_rollback/setup.ksh | 31 +
.../cli_root/zfs_rollback/zfs_rollback.cfg | 45 +
.../cli_root/zfs_rollback/zfs_rollback_001_pos.ksh | 174 +
.../cli_root/zfs_rollback/zfs_rollback_002_pos.ksh | 68 +
.../cli_root/zfs_rollback/zfs_rollback_003_neg.ksh | 84 +
.../cli_root/zfs_rollback/zfs_rollback_004_neg.ksh | 86 +
.../zfs_rollback/zfs_rollback_common.kshlib | 321 +
.../tests/functional/cli_root/zfs_send/Makefile.am | 19 +
.../tests/functional/cli_root/zfs_send/cleanup.ksh | 33 +
.../tests/functional/cli_root/zfs_send/setup.ksh | 32 +
.../functional/cli_root/zfs_send/zfs_send-b.ksh | 102 +
.../functional/cli_root/zfs_send/zfs_send.cfg | 32 +
.../cli_root/zfs_send/zfs_send_001_pos.ksh | 127 +
.../cli_root/zfs_send/zfs_send_002_pos.ksh | 139 +
.../cli_root/zfs_send/zfs_send_003_pos.ksh | 69 +
.../cli_root/zfs_send/zfs_send_004_neg.ksh | 109 +
.../cli_root/zfs_send/zfs_send_005_pos.ksh | 66 +
.../cli_root/zfs_send/zfs_send_006_pos.ksh | 202 +
.../cli_root/zfs_send/zfs_send_007_pos.ksh | 100 +
.../cli_root/zfs_send/zfs_send_encrypted.ksh | 76 +
.../zfs_send/zfs_send_encrypted_unloaded.ksh | 59 +
.../functional/cli_root/zfs_send/zfs_send_raw.ksh | 79 +
.../cli_root/zfs_send/zfs_send_sparse.ksh | 83 +
.../tests/functional/cli_root/zfs_set/Makefile.am | 35 +
.../functional/cli_root/zfs_set/cache_001_pos.ksh | 63 +
.../functional/cli_root/zfs_set/cache_002_neg.ksh | 67 +
.../cli_root/zfs_set/canmount_001_pos.ksh | 123 +
.../cli_root/zfs_set/canmount_002_pos.ksh | 161 +
.../cli_root/zfs_set/canmount_003_pos.ksh | 111 +
.../cli_root/zfs_set/canmount_004_pos.ksh | 96 +
.../cli_root/zfs_set/checksum_001_pos.ksh | 68 +
.../tests/functional/cli_root/zfs_set/cleanup.ksh | 30 +
.../cli_root/zfs_set/compression_001_pos.ksh | 64 +
.../cli_root/zfs_set/mountpoint_001_pos.ksh | 100 +
.../cli_root/zfs_set/mountpoint_002_pos.ksh | 98 +
.../cli_root/zfs_set/mountpoint_003_pos.ksh | 145 +
.../functional/cli_root/zfs_set/onoffs_001_pos.ksh | 108 +
.../cli_root/zfs_set/property_alias_001_pos.ksh | 142 +
.../cli_root/zfs_set/readonly_001_pos.ksh | 160 +
.../cli_root/zfs_set/reservation_001_neg.ksh | 101 +
.../cli_root/zfs_set/ro_props_001_pos.ksh | 124 +
.../tests/functional/cli_root/zfs_set/setup.ksh | 32 +
.../cli_root/zfs_set/share_mount_001_neg.ksh | 64 +
.../cli_root/zfs_set/snapdir_001_pos.ksh | 112 +
.../cli_root/zfs_set/user_property_001_pos.ksh | 84 +
.../cli_root/zfs_set/user_property_002_pos.ksh | 120 +
.../cli_root/zfs_set/user_property_003_neg.ksh | 82 +
.../cli_root/zfs_set/user_property_004_pos.ksh | 101 +
.../cli_root/zfs_set/version_001_neg.ksh | 91 +
.../cli_root/zfs_set/zfs_set_001_neg.ksh | 87 +
.../cli_root/zfs_set/zfs_set_002_neg.ksh | 67 +
.../cli_root/zfs_set/zfs_set_003_neg.ksh | 77 +
.../cli_root/zfs_set/zfs_set_common.kshlib | 376 +
.../zfs_set/zfs_set_feature_activation.ksh | 98 +
.../cli_root/zfs_set/zfs_set_keylocation.ksh | 101 +
.../functional/cli_root/zfs_share/Makefile.am | 20 +
.../functional/cli_root/zfs_share/cleanup.ksh | 30 +
.../tests/functional/cli_root/zfs_share/setup.ksh | 36 +
.../functional/cli_root/zfs_share/zfs_share.cfg | 32 +
.../cli_root/zfs_share/zfs_share_001_pos.ksh | 160 +
.../cli_root/zfs_share/zfs_share_002_pos.ksh | 73 +
.../cli_root/zfs_share/zfs_share_003_pos.ksh | 110 +
.../cli_root/zfs_share/zfs_share_004_pos.ksh | 96 +
.../cli_root/zfs_share/zfs_share_005_pos.ksh | 93 +
.../cli_root/zfs_share/zfs_share_006_pos.ksh | 105 +
.../cli_root/zfs_share/zfs_share_007_neg.ksh | 85 +
.../cli_root/zfs_share/zfs_share_008_neg.ksh | 74 +
.../cli_root/zfs_share/zfs_share_009_neg.ksh | 73 +
.../cli_root/zfs_share/zfs_share_010_neg.ksh | 60 +
.../cli_root/zfs_share/zfs_share_011_pos.ksh | 89 +
.../cli_root/zfs_share/zfs_share_012_pos.ksh | 85 +
.../zfs_share/zfs_share_concurrent_shares.ksh | 201 +
.../functional/cli_root/zfs_snapshot/Makefile.am | 16 +
.../functional/cli_root/zfs_snapshot/cleanup.ksh | 30 +
.../functional/cli_root/zfs_snapshot/setup.ksh | 32 +
.../cli_root/zfs_snapshot/zfs_snapshot.cfg | 42 +
.../cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh | 117 +
.../cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh | 98 +
.../cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh | 66 +
.../cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh | 96 +
.../cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh | 96 +
.../cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh | 124 +
.../cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh | 137 +
.../cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh | 68 +
.../cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh | 115 +
.../functional/cli_root/zfs_sysfs/Makefile.am | 10 +
.../functional/cli_root/zfs_sysfs/cleanup.ksh | 34 +
.../tests/functional/cli_root/zfs_sysfs/setup.ksh | 36 +
.../zfs_sysfs/zfeature_set_unsupported.ksh | 54 +
.../cli_root/zfs_sysfs/zfs_get_unsupported.ksh | 53 +
.../cli_root/zfs_sysfs/zfs_set_unsupported.ksh | 54 +
.../cli_root/zfs_sysfs/zfs_sysfs_live.ksh | 59 +
.../cli_root/zfs_sysfs/zpool_get_unsupported.ksh | 55 +
.../cli_root/zfs_sysfs/zpool_set_unsupported.ksh | 54 +
.../functional/cli_root/zfs_unload-key/Makefile.am | 7 +
.../functional/cli_root/zfs_unload-key/cleanup.ksh | 30 +
.../functional/cli_root/zfs_unload-key/setup.ksh | 32 +
.../cli_root/zfs_unload-key/zfs_unload-key.ksh | 69 +
.../cli_root/zfs_unload-key/zfs_unload-key_all.ksh | 76 +
.../zfs_unload-key/zfs_unload-key_recursive.ksh | 72 +
.../functional/cli_root/zfs_unmount/Makefile.am | 20 +
.../functional/cli_root/zfs_unmount/cleanup.ksh | 30 +
.../functional/cli_root/zfs_unmount/setup.ksh | 32 +
.../cli_root/zfs_unmount/zfs_unmount.cfg | 39 +
.../cli_root/zfs_unmount/zfs_unmount.kshlib | 77 +
.../cli_root/zfs_unmount/zfs_unmount_001_pos.ksh | 117 +
.../cli_root/zfs_unmount/zfs_unmount_002_pos.ksh | 98 +
.../cli_root/zfs_unmount/zfs_unmount_003_pos.ksh | 109 +
.../cli_root/zfs_unmount/zfs_unmount_004_pos.ksh | 99 +
.../cli_root/zfs_unmount/zfs_unmount_005_pos.ksh | 117 +
.../cli_root/zfs_unmount/zfs_unmount_006_pos.ksh | 71 +
.../cli_root/zfs_unmount/zfs_unmount_007_neg.ksh | 110 +
.../cli_root/zfs_unmount/zfs_unmount_008_neg.ksh | 146 +
.../cli_root/zfs_unmount/zfs_unmount_009_pos.ksh | 142 +
.../zfs_unmount/zfs_unmount_all_001_pos.ksh | 203 +
.../cli_root/zfs_unmount/zfs_unmount_nested.ksh | 193 +
.../zfs_unmount/zfs_unmount_unload_keys.ksh | 79 +
.../functional/cli_root/zfs_unshare/Makefile.am | 11 +
.../functional/cli_root/zfs_unshare/cleanup.ksh | 30 +
.../functional/cli_root/zfs_unshare/setup.ksh | 39 +
.../cli_root/zfs_unshare/zfs_unshare_001_pos.ksh | 177 +
.../cli_root/zfs_unshare/zfs_unshare_002_pos.ksh | 185 +
.../cli_root/zfs_unshare/zfs_unshare_003_pos.ksh | 94 +
.../cli_root/zfs_unshare/zfs_unshare_004_neg.ksh | 84 +
.../cli_root/zfs_unshare/zfs_unshare_005_neg.ksh | 60 +
.../cli_root/zfs_unshare/zfs_unshare_006_pos.ksh | 92 +
.../cli_root/zfs_unshare/zfs_unshare_007_pos.ksh | 68 +
.../functional/cli_root/zfs_upgrade/Makefile.am | 14 +
.../functional/cli_root/zfs_upgrade/cleanup.ksh | 33 +
.../functional/cli_root/zfs_upgrade/setup.ksh | 42 +
.../cli_root/zfs_upgrade/zfs_upgrade.kshlib | 194 +
.../cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh | 139 +
.../cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh | 67 +
.../cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh | 104 +
.../cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh | 108 +
.../cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh | 108 +
.../cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh | 58 +
.../cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh | 59 +
.../tests/functional/cli_root/zfs_wait/Makefile.am | 8 +
.../tests/functional/cli_root/zfs_wait/cleanup.ksh | 20 +
.../tests/functional/cli_root/zfs_wait/setup.ksh | 21 +
.../functional/cli_root/zfs_wait/zfs_wait.kshlib | 80 +
.../cli_root/zfs_wait/zfs_wait_deleteq.ksh | 57 +
.../tests/functional/cli_root/zpool/Makefile.am | 8 +
.../tests/functional/cli_root/zpool/cleanup.ksh | 30 +
.../tests/functional/cli_root/zpool/setup.ksh | 32 +
.../functional/cli_root/zpool/zpool_001_neg.ksh | 70 +
.../functional/cli_root/zpool/zpool_002_pos.ksh | 112 +
.../functional/cli_root/zpool/zpool_003_pos.ksh | 90 +
.../functional/cli_root/zpool/zpool_colors.ksh | 91 +
.../functional/cli_root/zpool_add/Makefile.am | 21 +
.../functional/cli_root/zpool_add/add-o_ashift.ksh | 85 +
.../zpool_add/add_nested_replacing_spare.ksh | 110 +
.../cli_root/zpool_add/add_prop_ashift.ksh | 94 +
.../functional/cli_root/zpool_add/cleanup.ksh | 35 +
.../tests/functional/cli_root/zpool_add/setup.ksh | 37 +
.../functional/cli_root/zpool_add/zpool_add.cfg | 39 +
.../functional/cli_root/zpool_add/zpool_add.kshlib | 111 +
.../cli_root/zpool_add/zpool_add_001_pos.ksh | 116 +
.../cli_root/zpool_add/zpool_add_002_pos.ksh | 68 +
.../cli_root/zpool_add/zpool_add_003_pos.ksh | 97 +
.../cli_root/zpool_add/zpool_add_004_pos.ksh | 77 +
.../cli_root/zpool_add/zpool_add_005_pos.ksh | 91 +
.../cli_root/zpool_add/zpool_add_006_pos.ksh | 71 +
.../cli_root/zpool_add/zpool_add_007_neg.ksh | 68 +
.../cli_root/zpool_add/zpool_add_008_neg.ksh | 68 +
.../cli_root/zpool_add/zpool_add_009_neg.ksh | 65 +
.../cli_root/zpool_add/zpool_add_010_pos.ksh | 210 +
.../functional/cli_root/zpool_attach/Makefile.am | 6 +
.../cli_root/zpool_attach/attach-o_ashift.ksh | 100 +
.../functional/cli_root/zpool_attach/cleanup.ksh | 32 +
.../functional/cli_root/zpool_attach/setup.ksh | 35 +
.../cli_root/zpool_attach/zpool_attach_001_neg.ksh | 83 +
.../functional/cli_root/zpool_clear/Makefile.am | 11 +
.../functional/cli_root/zpool_clear/cleanup.ksh | 30 +
.../functional/cli_root/zpool_clear/setup.ksh | 32 +
.../cli_root/zpool_clear/zpool_clear.cfg | 33 +
.../cli_root/zpool_clear/zpool_clear_001_pos.ksh | 207 +
.../cli_root/zpool_clear/zpool_clear_002_neg.ksh | 76 +
.../cli_root/zpool_clear/zpool_clear_003_neg.ksh | 73 +
.../cli_root/zpool_clear/zpool_clear_readonly.ksh | 71 +
.../functional/cli_root/zpool_create/Makefile.am | 40 +
.../functional/cli_root/zpool_create/cleanup.ksh | 37 +
.../cli_root/zpool_create/create-o_ashift.ksh | 135 +
.../functional/cli_root/zpool_create/setup.ksh | 37 +
.../cli_root/zpool_create/zpool_create.cfg | 66 +
.../cli_root/zpool_create/zpool_create.shlib | 109 +
.../cli_root/zpool_create/zpool_create_001_pos.ksh | 77 +
.../cli_root/zpool_create/zpool_create_002_pos.ksh | 128 +
.../cli_root/zpool_create/zpool_create_003_pos.ksh | 95 +
.../cli_root/zpool_create/zpool_create_004_pos.ksh | 76 +
.../cli_root/zpool_create/zpool_create_005_pos.ksh | 124 +
.../cli_root/zpool_create/zpool_create_006_pos.ksh | 137 +
.../cli_root/zpool_create/zpool_create_007_neg.ksh | 77 +
.../cli_root/zpool_create/zpool_create_008_pos.ksh | 90 +
.../cli_root/zpool_create/zpool_create_009_neg.ksh | 87 +
.../cli_root/zpool_create/zpool_create_010_neg.ksh | 82 +
.../cli_root/zpool_create/zpool_create_011_neg.ksh | 131 +
.../cli_root/zpool_create/zpool_create_012_neg.ksh | 71 +
.../cli_root/zpool_create/zpool_create_014_neg.ksh | 86 +
.../cli_root/zpool_create/zpool_create_015_neg.ksh | 91 +
.../cli_root/zpool_create/zpool_create_016_pos.ksh | 88 +
.../cli_root/zpool_create/zpool_create_017_neg.ksh | 77 +
.../cli_root/zpool_create/zpool_create_018_pos.ksh | 98 +
.../cli_root/zpool_create/zpool_create_019_pos.ksh | 68 +
.../cli_root/zpool_create/zpool_create_020_pos.ksh | 103 +
.../cli_root/zpool_create/zpool_create_021_pos.ksh | 93 +
.../cli_root/zpool_create/zpool_create_022_pos.ksh | 95 +
.../cli_root/zpool_create/zpool_create_023_neg.ksh | 97 +
.../cli_root/zpool_create/zpool_create_024_pos.ksh | 152 +
.../zpool_create/zpool_create_crypt_combos.ksh | 90 +
.../zpool_create/zpool_create_encrypted.ksh | 100 +
.../zpool_create/zpool_create_features_001_pos.ksh | 71 +
.../zpool_create/zpool_create_features_002_pos.ksh | 70 +
.../zpool_create/zpool_create_features_003_pos.ksh | 70 +
.../zpool_create/zpool_create_features_004_neg.ksh | 62 +
.../zpool_create/zpool_create_features_005_pos.ksh | 93 +
.../zpool_create/zpool_create_tempname.ksh | 70 +
.../functional/cli_root/zpool_destroy/Makefile.am | 8 +
.../cli_root/zpool_destroy/zpool_destroy.cfg | 37 +
.../zpool_destroy/zpool_destroy_001_pos.ksh | 86 +
.../zpool_destroy/zpool_destroy_002_pos.ksh | 117 +
.../zpool_destroy/zpool_destroy_003_neg.ksh | 57 +
.../functional/cli_root/zpool_detach/Makefile.am | 5 +
.../functional/cli_root/zpool_detach/cleanup.ksh | 32 +
.../functional/cli_root/zpool_detach/setup.ksh | 35 +
.../cli_root/zpool_detach/zpool_detach_001_neg.ksh | 70 +
.../functional/cli_root/zpool_events/Makefile.am | 13 +
.../functional/cli_root/zpool_events/cleanup.ksh | 19 +
.../functional/cli_root/zpool_events/setup.ksh | 21 +
.../cli_root/zpool_events/zpool_events.cfg | 16 +
.../cli_root/zpool_events/zpool_events.kshlib | 39 +
.../cli_root/zpool_events/zpool_events_clear.ksh | 58 +
.../cli_root/zpool_events/zpool_events_cliargs.ksh | 89 +
.../cli_root/zpool_events/zpool_events_errors.ksh | 152 +
.../cli_root/zpool_events/zpool_events_follow.ksh | 64 +
.../zpool_events/zpool_events_poolname.ksh | 74 +
.../functional/cli_root/zpool_expand/Makefile.am | 12 +
.../functional/cli_root/zpool_expand/cleanup.ksh | 35 +
.../functional/cli_root/zpool_expand/setup.ksh | 46 +
.../cli_root/zpool_expand/zpool_expand.cfg | 37 +
.../cli_root/zpool_expand/zpool_expand_001_pos.ksh | 167 +
.../cli_root/zpool_expand/zpool_expand_002_pos.ksh | 168 +
.../cli_root/zpool_expand/zpool_expand_003_neg.ksh | 138 +
.../cli_root/zpool_expand/zpool_expand_004_pos.ksh | 100 +
.../cli_root/zpool_expand/zpool_expand_005_pos.ksh | 99 +
.../functional/cli_root/zpool_export/Makefile.am | 11 +
.../functional/cli_root/zpool_export/cleanup.ksh | 30 +
.../functional/cli_root/zpool_export/setup.ksh | 37 +
.../cli_root/zpool_export/zpool_export.cfg | 59 +
.../cli_root/zpool_export/zpool_export_001_pos.ksh | 71 +
.../cli_root/zpool_export/zpool_export_002_pos.ksh | 81 +
.../cli_root/zpool_export/zpool_export_003_neg.ksh | 69 +
.../cli_root/zpool_export/zpool_export_004_pos.ksh | 103 +
.../functional/cli_root/zpool_get/Makefile.am | 12 +
.../functional/cli_root/zpool_get/cleanup.ksh | 36 +
.../tests/functional/cli_root/zpool_get/setup.ksh | 37 +
.../functional/cli_root/zpool_get/zpool_get.cfg | 104 +
.../cli_root/zpool_get/zpool_get_001_pos.ksh | 61 +
.../cli_root/zpool_get/zpool_get_002_pos.ksh | 97 +
.../cli_root/zpool_get/zpool_get_003_pos.ksh | 77 +
.../cli_root/zpool_get/zpool_get_004_neg.ksh | 61 +
.../cli_root/zpool_get/zpool_get_005_pos.ksh | 78 +
.../cli_root/zpool_get/zpool_get_parsable.cfg | 33 +
.../functional/cli_root/zpool_history/Makefile.am | 6 +
.../functional/cli_root/zpool_history/cleanup.ksh | 34 +
.../functional/cli_root/zpool_history/setup.ksh | 35 +
.../zpool_history/zpool_history_001_neg.ksh | 69 +
.../zpool_history/zpool_history_002_pos.ksh | 53 +
.../functional/cli_root/zpool_import/Makefile.am | 48 +
.../cli_root/zpool_import/blockfiles/Makefile.am | 5 +
.../zpool_import/blockfiles/cryptv0.dat.bz2 | Bin 0 -> 94716 bytes
.../zpool_import/blockfiles/missing_ivset.dat.bz2 | Bin 0 -> 167066 bytes
.../zpool_import/blockfiles/unclean_export.dat.bz2 | Bin 0 -> 14161 bytes
.../functional/cli_root/zpool_import/cleanup.ksh | 50 +
.../zpool_import/import_cachefile_device_added.ksh | 76 +
.../import_cachefile_device_removed.ksh | 145 +
.../import_cachefile_device_replaced.ksh | 160 +
.../import_cachefile_mirror_attached.ksh | 72 +
.../import_cachefile_mirror_detached.ksh | 70 +
.../import_cachefile_shared_device.ksh | 113 +
.../zpool_import/import_devices_missing.ksh | 122 +
.../cli_root/zpool_import/import_paths_changed.ksh | 98 +
.../zpool_import/import_rewind_config_changed.ksh | 261 +
.../zpool_import/import_rewind_device_replaced.ksh | 181 +
.../functional/cli_root/zpool_import/setup.ksh | 56 +
.../cli_root/zpool_import/zpool_import.cfg | 64 +
.../cli_root/zpool_import/zpool_import.kshlib | 356 +
.../cli_root/zpool_import/zpool_import_001_pos.ksh | 138 +
.../cli_root/zpool_import/zpool_import_002_pos.ksh | 142 +
.../cli_root/zpool_import/zpool_import_003_pos.ksh | 67 +
.../cli_root/zpool_import/zpool_import_004_pos.ksh | 88 +
.../cli_root/zpool_import/zpool_import_005_pos.ksh | 85 +
.../cli_root/zpool_import/zpool_import_006_pos.ksh | 83 +
.../cli_root/zpool_import/zpool_import_007_pos.ksh | 90 +
.../cli_root/zpool_import/zpool_import_008_pos.ksh | 91 +
.../cli_root/zpool_import/zpool_import_009_neg.ksh | 103 +
.../cli_root/zpool_import/zpool_import_010_pos.ksh | 92 +
.../cli_root/zpool_import/zpool_import_011_neg.ksh | 81 +
.../cli_root/zpool_import/zpool_import_012_pos.ksh | 211 +
.../cli_root/zpool_import/zpool_import_013_neg.ksh | 74 +
.../cli_root/zpool_import/zpool_import_014_pos.ksh | 91 +
.../cli_root/zpool_import/zpool_import_015_pos.ksh | 54 +
.../zpool_import/zpool_import_all_001_pos.ksh | 168 +
.../zpool_import/zpool_import_encrypted.ksh | 59 +
.../zpool_import/zpool_import_encrypted_load.ksh | 59 +
.../cli_root/zpool_import/zpool_import_errata3.ksh | 103 +
.../cli_root/zpool_import/zpool_import_errata4.ksh | 143 +
.../zpool_import/zpool_import_features_001_pos.ksh | 71 +
.../zpool_import/zpool_import_features_002_neg.ksh | 87 +
.../zpool_import/zpool_import_features_003_pos.ksh | 106 +
.../zpool_import/zpool_import_missing_001_pos.ksh | 203 +
.../zpool_import/zpool_import_missing_002_pos.ksh | 197 +
.../zpool_import/zpool_import_missing_003_pos.ksh | 237 +
.../zpool_import/zpool_import_rename_001_pos.ksh | 174 +
.../cli_root/zpool_initialize/Makefile.am | 17 +
.../cli_root/zpool_initialize/cleanup.ksh | 31 +
.../zpool_initialize/zpool_initialize.kshlib | 43 +
.../zpool_initialize_attach_detach_add_remove.ksh | 68 +
.../zpool_initialize_import_export.ksh | 78 +
...ool_initialize_offline_export_import_online.ksh | 66 +
.../zpool_initialize_online_offline.ksh | 74 +
.../zpool_initialize/zpool_initialize_split.ksh | 64 +
.../zpool_initialize_start_and_cancel_neg.ksh | 60 +
.../zpool_initialize_start_and_cancel_pos.ksh | 52 +
.../zpool_initialize_suspend_resume.ksh | 63 +
.../zpool_initialize_unsupported_vdevs.ksh | 74 +
.../zpool_initialize_verify_checksums.ksh | 59 +
.../zpool_initialize_verify_initialized.ksh | 89 +
.../cli_root/zpool_labelclear/Makefile.am | 9 +
.../cli_root/zpool_labelclear/labelclear.cfg | 28 +
.../zpool_labelclear/zpool_labelclear_active.ksh | 68 +
.../zpool_labelclear/zpool_labelclear_exported.ksh | 74 +
.../zpool_labelclear/zpool_labelclear_removed.ksh | 62 +
.../zpool_labelclear/zpool_labelclear_valid.ksh | 94 +
.../functional/cli_root/zpool_offline/Makefile.am | 7 +
.../functional/cli_root/zpool_offline/cleanup.ksh | 32 +
.../functional/cli_root/zpool_offline/setup.ksh | 35 +
.../zpool_offline/zpool_offline_001_pos.ksh | 124 +
.../zpool_offline/zpool_offline_002_neg.ksh | 96 +
.../zpool_offline/zpool_offline_003_pos.ksh | 111 +
.../functional/cli_root/zpool_online/Makefile.am | 6 +
.../functional/cli_root/zpool_online/cleanup.ksh | 32 +
.../functional/cli_root/zpool_online/setup.ksh | 35 +
.../cli_root/zpool_online/zpool_online_001_pos.ksh | 124 +
.../cli_root/zpool_online/zpool_online_002_neg.ksh | 75 +
.../functional/cli_root/zpool_remove/Makefile.am | 10 +
.../functional/cli_root/zpool_remove/cleanup.ksh | 38 +
.../functional/cli_root/zpool_remove/setup.ksh | 37 +
.../cli_root/zpool_remove/zpool_remove.cfg | 31 +
.../cli_root/zpool_remove/zpool_remove_001_neg.ksh | 95 +
.../cli_root/zpool_remove/zpool_remove_002_pos.ksh | 68 +
.../cli_root/zpool_remove/zpool_remove_003_pos.ksh | 71 +
.../functional/cli_root/zpool_reopen/Makefile.am | 15 +
.../functional/cli_root/zpool_reopen/cleanup.ksh | 37 +
.../functional/cli_root/zpool_reopen/setup.ksh | 30 +
.../cli_root/zpool_reopen/zpool_reopen.cfg | 43 +
.../cli_root/zpool_reopen/zpool_reopen.shlib | 124 +
.../cli_root/zpool_reopen/zpool_reopen_001_pos.ksh | 70 +
.../cli_root/zpool_reopen/zpool_reopen_002_pos.ksh | 70 +
.../cli_root/zpool_reopen/zpool_reopen_003_pos.ksh | 100 +
.../cli_root/zpool_reopen/zpool_reopen_004_pos.ksh | 88 +
.../cli_root/zpool_reopen/zpool_reopen_005_pos.ksh | 86 +
.../cli_root/zpool_reopen/zpool_reopen_006_neg.ksh | 43 +
.../cli_root/zpool_reopen/zpool_reopen_007_pos.ksh | 67 +
.../functional/cli_root/zpool_replace/Makefile.am | 7 +
.../functional/cli_root/zpool_replace/cleanup.ksh | 32 +
.../cli_root/zpool_replace/replace-o_ashift.ksh | 101 +
.../cli_root/zpool_replace/replace_prop_ashift.ksh | 97 +
.../functional/cli_root/zpool_replace/setup.ksh | 35 +
.../zpool_replace/zpool_replace_001_neg.ksh | 83 +
.../functional/cli_root/zpool_resilver/Makefile.am | 9 +
.../functional/cli_root/zpool_resilver/cleanup.ksh | 33 +
.../functional/cli_root/zpool_resilver/setup.ksh | 39 +
.../cli_root/zpool_resilver/zpool_resilver.cfg | 30 +
.../zpool_resilver/zpool_resilver_bad_args.ksh | 71 +
.../zpool_resilver/zpool_resilver_restart.ksh | 86 +
.../functional/cli_root/zpool_scrub/Makefile.am | 16 +
.../functional/cli_root/zpool_scrub/cleanup.ksh | 34 +
.../functional/cli_root/zpool_scrub/setup.ksh | 44 +
.../cli_root/zpool_scrub/zpool_scrub.cfg | 35 +
.../cli_root/zpool_scrub/zpool_scrub_001_neg.ksh | 63 +
.../cli_root/zpool_scrub/zpool_scrub_002_pos.ksh | 78 +
.../cli_root/zpool_scrub/zpool_scrub_003_pos.ksh | 65 +
.../cli_root/zpool_scrub/zpool_scrub_004_pos.ksh | 78 +
.../cli_root/zpool_scrub/zpool_scrub_005_pos.ksh | 57 +
.../zpool_scrub/zpool_scrub_encrypted_unloaded.ksh | 67 +
.../zpool_scrub/zpool_scrub_multiple_copies.ksh | 77 +
.../zpool_scrub/zpool_scrub_offline_device.ksh | 133 +
.../zpool_scrub/zpool_scrub_print_repairing.ksh | 74 +
.../functional/cli_root/zpool_set/Makefile.am | 9 +
.../functional/cli_root/zpool_set/cleanup.ksh | 30 +
.../tests/functional/cli_root/zpool_set/setup.ksh | 32 +
.../cli_root/zpool_set/zpool_set_001_pos.ksh | 60 +
.../cli_root/zpool_set/zpool_set_002_neg.ksh | 124 +
.../cli_root/zpool_set/zpool_set_003_neg.ksh | 73 +
.../cli_root/zpool_set/zpool_set_ashift.ksh | 78 +
.../cli_root/zpool_set/zpool_set_features.ksh | 75 +
.../functional/cli_root/zpool_split/Makefile.am | 18 +
.../functional/cli_root/zpool_split/cleanup.ksh | 19 +
.../functional/cli_root/zpool_split/setup.ksh | 17 +
.../cli_root/zpool_split/zpool_split.cfg | 18 +
.../cli_root/zpool_split/zpool_split_cliargs.ksh | 80 +
.../cli_root/zpool_split/zpool_split_devices.ksh | 101 +
.../zpool_split/zpool_split_encryption.ksh | 59 +
.../cli_root/zpool_split/zpool_split_indirect.ksh | 69 +
.../cli_root/zpool_split/zpool_split_props.ksh | 95 +
.../cli_root/zpool_split/zpool_split_resilver.ksh | 105 +
.../cli_root/zpool_split/zpool_split_vdevs.ksh | 150 +
.../cli_root/zpool_split/zpool_split_wholedisk.ksh | 84 +
.../functional/cli_root/zpool_status/Makefile.am | 6 +
.../functional/cli_root/zpool_status/cleanup.ksh | 30 +
.../functional/cli_root/zpool_status/setup.ksh | 32 +
.../cli_root/zpool_status/zpool_status_001_pos.ksh | 62 +
.../cli_root/zpool_status/zpool_status_002_pos.ksh | 67 +
.../functional/cli_root/zpool_sync/Makefile.am | 6 +
.../functional/cli_root/zpool_sync/cleanup.ksh | 32 +
.../tests/functional/cli_root/zpool_sync/setup.ksh | 34 +
.../cli_root/zpool_sync/zpool_sync_001_pos.ksh | 88 +
.../cli_root/zpool_sync/zpool_sync_002_neg.ksh | 44 +
.../functional/cli_root/zpool_trim/Makefile.am | 24 +
.../functional/cli_root/zpool_trim/cleanup.ksh | 34 +
.../tests/functional/cli_root/zpool_trim/setup.ksh | 43 +
.../cli_root/zpool_trim/zpool_trim.kshlib | 43 +
.../zpool_trim_attach_detach_add_remove.ksh | 64 +
.../zpool_trim/zpool_trim_import_export.ksh | 88 +
.../cli_root/zpool_trim/zpool_trim_multiple.ksh | 65 +
.../cli_root/zpool_trim/zpool_trim_neg.ksh | 53 +
.../zpool_trim_offline_export_import_online.ksh | 62 +
.../zpool_trim/zpool_trim_online_offline.ksh | 70 +
.../cli_root/zpool_trim/zpool_trim_partial.ksh | 114 +
.../cli_root/zpool_trim/zpool_trim_rate.ksh | 90 +
.../cli_root/zpool_trim/zpool_trim_rate_neg.ksh | 53 +
.../cli_root/zpool_trim/zpool_trim_secure.ksh | 59 +
.../cli_root/zpool_trim/zpool_trim_split.ksh | 60 +
.../zpool_trim/zpool_trim_start_and_cancel_neg.ksh | 56 +
.../zpool_trim/zpool_trim_start_and_cancel_pos.ksh | 48 +
.../zpool_trim/zpool_trim_suspend_resume.ksh | 74 +
.../zpool_trim/zpool_trim_unsupported_vdevs.ksh | 70 +
.../zpool_trim/zpool_trim_verify_checksums.ksh | 69 +
.../zpool_trim/zpool_trim_verify_trimmed.ksh | 81 +
.../functional/cli_root/zpool_upgrade/Makefile.am | 19 +
.../cli_root/zpool_upgrade/blockfiles/Makefile.am | 54 +
.../blockfiles/zfs-broken-mirror1.dat.bz2 | Bin 0 -> 8871 bytes
.../blockfiles/zfs-broken-mirror2.dat.bz2 | Bin 0 -> 29281 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 | Bin 0 -> 31464 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 | Bin 0 -> 31549 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 | Bin 0 -> 29695 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 | Bin 0 -> 29786 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 | Bin 0 -> 30554 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 | Bin 0 -> 30605 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 | Bin 0 -> 33172 bytes
.../blockfiles/zfs-pool-v1mirror1.dat.bz2 | Bin 0 -> 32989 bytes
.../blockfiles/zfs-pool-v1mirror2.dat.bz2 | Bin 0 -> 32965 bytes
.../blockfiles/zfs-pool-v1mirror3.dat.bz2 | Bin 0 -> 32956 bytes
.../blockfiles/zfs-pool-v1raidz1.dat.bz2 | Bin 0 -> 28792 bytes
.../blockfiles/zfs-pool-v1raidz2.dat.bz2 | Bin 0 -> 28480 bytes
.../blockfiles/zfs-pool-v1raidz3.dat.bz2 | Bin 0 -> 28779 bytes
.../blockfiles/zfs-pool-v1stripe1.dat.bz2 | Bin 0 -> 20361 bytes
.../blockfiles/zfs-pool-v1stripe2.dat.bz2 | Bin 0 -> 20251 bytes
.../blockfiles/zfs-pool-v1stripe3.dat.bz2 | Bin 0 -> 32498 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 | Bin 0 -> 32492 bytes
.../blockfiles/zfs-pool-v2mirror1.dat.bz2 | Bin 0 -> 44185 bytes
.../blockfiles/zfs-pool-v2mirror2.dat.bz2 | Bin 0 -> 44155 bytes
.../blockfiles/zfs-pool-v2mirror3.dat.bz2 | Bin 0 -> 44170 bytes
.../blockfiles/zfs-pool-v2raidz1.dat.bz2 | Bin 0 -> 45894 bytes
.../blockfiles/zfs-pool-v2raidz2.dat.bz2 | Bin 0 -> 49452 bytes
.../blockfiles/zfs-pool-v2raidz3.dat.bz2 | Bin 0 -> 44503 bytes
.../blockfiles/zfs-pool-v2stripe1.dat.bz2 | Bin 0 -> 35305 bytes
.../blockfiles/zfs-pool-v2stripe2.dat.bz2 | Bin 0 -> 28513 bytes
.../blockfiles/zfs-pool-v2stripe3.dat.bz2 | Bin 0 -> 35344 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 | Bin 0 -> 27124 bytes
.../blockfiles/zfs-pool-v3hotspare1.dat.bz2 | Bin 0 -> 42097 bytes
.../blockfiles/zfs-pool-v3hotspare2.dat.bz2 | Bin 0 -> 35584 bytes
.../blockfiles/zfs-pool-v3hotspare3.dat.bz2 | Bin 0 -> 19501 bytes
.../blockfiles/zfs-pool-v3mirror1.dat.bz2 | Bin 0 -> 28160 bytes
.../blockfiles/zfs-pool-v3mirror2.dat.bz2 | Bin 0 -> 28149 bytes
.../blockfiles/zfs-pool-v3mirror3.dat.bz2 | Bin 0 -> 28166 bytes
.../blockfiles/zfs-pool-v3raidz1.dat.bz2 | Bin 0 -> 29077 bytes
.../blockfiles/zfs-pool-v3raidz2.dat.bz2 | Bin 0 -> 29340 bytes
.../blockfiles/zfs-pool-v3raidz21.dat.bz2 | Bin 0 -> 28067 bytes
.../blockfiles/zfs-pool-v3raidz22.dat.bz2 | Bin 0 -> 27999 bytes
.../blockfiles/zfs-pool-v3raidz23.dat.bz2 | Bin 0 -> 28046 bytes
.../blockfiles/zfs-pool-v3raidz3.dat.bz2 | Bin 0 -> 29120 bytes
.../blockfiles/zfs-pool-v3stripe1.dat.bz2 | Bin 0 -> 26174 bytes
.../blockfiles/zfs-pool-v3stripe2.dat.bz2 | Bin 0 -> 24408 bytes
.../blockfiles/zfs-pool-v3stripe3.dat.bz2 | Bin 0 -> 26213 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 | Bin 0 -> 39824 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 | Bin 0 -> 44358 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 | Bin 0 -> 42006 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 | Bin 0 -> 38100 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 | Bin 0 -> 38287 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 | Bin 0 -> 33474 bytes
.../zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 | Bin 0 -> 31807 bytes
.../blockfiles/zfs-pool-vBROKEN.dat.bz2 | Bin 0 -> 26328 bytes
.../functional/cli_root/zpool_upgrade/cleanup.ksh | 39 +
.../functional/cli_root/zpool_upgrade/setup.ksh | 38 +
.../cli_root/zpool_upgrade/zpool_upgrade.cfg | 161 +
.../cli_root/zpool_upgrade/zpool_upgrade.kshlib | 140 +
.../zpool_upgrade/zpool_upgrade_001_pos.ksh | 78 +
.../zpool_upgrade/zpool_upgrade_002_pos.ksh | 59 +
.../zpool_upgrade/zpool_upgrade_003_pos.ksh | 59 +
.../zpool_upgrade/zpool_upgrade_004_pos.ksh | 83 +
.../zpool_upgrade/zpool_upgrade_005_neg.ksh | 54 +
.../zpool_upgrade/zpool_upgrade_006_neg.ksh | 57 +
.../zpool_upgrade/zpool_upgrade_007_pos.ksh | 70 +
.../zpool_upgrade/zpool_upgrade_008_pos.ksh | 80 +
.../zpool_upgrade/zpool_upgrade_009_neg.ksh | 66 +
.../functional/cli_root/zpool_wait/Makefile.am | 22 +
.../functional/cli_root/zpool_wait/cleanup.ksh | 20 +
.../cli_root/zpool_wait/scan/Makefile.am | 11 +
.../cli_root/zpool_wait/scan/cleanup.ksh | 20 +
.../functional/cli_root/zpool_wait/scan/setup.ksh | 32 +
.../zpool_wait/scan/zpool_wait_rebuild.ksh | 64 +
.../zpool_wait/scan/zpool_wait_replace.ksh | 71 +
.../zpool_wait/scan/zpool_wait_replace_cancel.ksh | 64 +
.../zpool_wait/scan/zpool_wait_resilver.ksh | 64 +
.../zpool_wait/scan/zpool_wait_scrub_basic.ksh | 49 +
.../zpool_wait/scan/zpool_wait_scrub_cancel.ksh | 66 +
.../zpool_wait/scan/zpool_wait_scrub_flag.ksh | 52 +
.../tests/functional/cli_root/zpool_wait/setup.ksh | 23 +
.../cli_root/zpool_wait/zpool_wait.kshlib | 124 +
.../cli_root/zpool_wait/zpool_wait_discard.ksh | 87 +
.../cli_root/zpool_wait/zpool_wait_freeing.ksh | 112 +
.../zpool_wait/zpool_wait_initialize_basic.ksh | 63 +
.../zpool_wait/zpool_wait_initialize_cancel.ksh | 77 +
.../zpool_wait/zpool_wait_initialize_flag.ksh | 88 +
.../cli_root/zpool_wait/zpool_wait_multiple.ksh | 83 +
.../cli_root/zpool_wait/zpool_wait_no_activity.ksh | 52 +
.../cli_root/zpool_wait/zpool_wait_remove.ksh | 85 +
.../zpool_wait/zpool_wait_remove_cancel.ksh | 62 +
.../cli_root/zpool_wait/zpool_wait_trim_basic.ksh | 68 +
.../cli_root/zpool_wait/zpool_wait_trim_cancel.ksh | 77 +
.../cli_root/zpool_wait/zpool_wait_trim_flag.ksh | 88 +
.../cli_root/zpool_wait/zpool_wait_usage.ksh | 47 +
.../tests/functional/cli_user/Makefile.am | 6 +
.../tests/functional/cli_user/misc/Makefile.am | 52 +
.../cli_user/misc/arc_summary_001_pos.ksh | 60 +
.../cli_user/misc/arc_summary_002_neg.ksh | 38 +
.../functional/cli_user/misc/arcstat_001_pos.ksh | 41 +
.../tests/functional/cli_user/misc/cleanup.ksh | 51 +
.../tests/functional/cli_user/misc/misc.cfg | 123 +
.../tests/functional/cli_user/misc/setup.ksh | 161 +
.../tests/functional/cli_user/misc/zdb_001_neg.ksh | 78 +
.../tests/functional/cli_user/misc/zfs_001_neg.ksh | 60 +
.../functional/cli_user/misc/zfs_allow_001_neg.ksh | 67 +
.../functional/cli_user/misc/zfs_clone_001_neg.ksh | 54 +
.../cli_user/misc/zfs_create_001_neg.ksh | 61 +
.../cli_user/misc/zfs_destroy_001_neg.ksh | 67 +
.../functional/cli_user/misc/zfs_get_001_neg.ksh | 64 +
.../cli_user/misc/zfs_inherit_001_neg.ksh | 56 +
.../functional/cli_user/misc/zfs_mount_001_neg.ksh | 57 +
.../cli_user/misc/zfs_promote_001_neg.ksh | 56 +
.../cli_user/misc/zfs_receive_001_neg.ksh | 58 +
.../cli_user/misc/zfs_rename_001_neg.ksh | 56 +
.../cli_user/misc/zfs_rollback_001_neg.ksh | 60 +
.../functional/cli_user/misc/zfs_send_001_neg.ksh | 67 +
.../functional/cli_user/misc/zfs_set_001_neg.ksh | 69 +
.../functional/cli_user/misc/zfs_share_001_neg.ksh | 67 +
.../cli_user/misc/zfs_snapshot_001_neg.ksh | 56 +
.../cli_user/misc/zfs_unallow_001_neg.ksh | 64 +
.../cli_user/misc/zfs_unmount_001_neg.ksh | 65 +
.../cli_user/misc/zfs_unshare_001_neg.ksh | 68 +
.../cli_user/misc/zfs_upgrade_001_neg.ksh | 67 +
.../functional/cli_user/misc/zpool_001_neg.ksh | 64 +
.../functional/cli_user/misc/zpool_add_001_neg.ksh | 77 +
.../cli_user/misc/zpool_attach_001_neg.ksh | 67 +
.../cli_user/misc/zpool_clear_001_neg.ksh | 52 +
.../cli_user/misc/zpool_create_001_neg.ksh | 77 +
.../cli_user/misc/zpool_destroy_001_neg.ksh | 59 +
.../cli_user/misc/zpool_detach_001_neg.ksh | 58 +
.../cli_user/misc/zpool_export_001_neg.ksh | 65 +
.../functional/cli_user/misc/zpool_get_001_neg.ksh | 67 +
.../cli_user/misc/zpool_history_001_neg.ksh | 55 +
.../cli_user/misc/zpool_import_001_neg.ksh | 66 +
.../cli_user/misc/zpool_import_002_neg.ksh | 64 +
.../cli_user/misc/zpool_offline_001_neg.ksh | 66 +
.../cli_user/misc/zpool_online_001_neg.ksh | 66 +
.../cli_user/misc/zpool_remove_001_neg.ksh | 59 + .../cli_user/misc/zpool_replace_001_neg.ksh | 68 + .../cli_user/misc/zpool_scrub_001_neg.ksh | 55 + .../functional/cli_user/misc/zpool_set_001_neg.ksh | 71 + .../cli_user/misc/zpool_status_001_neg.ksh | 72 + .../cli_user/misc/zpool_upgrade_001_neg.ksh | 64 + .../cli_user/misc/zpool_wait_privilege.ksh | 35 + .../tests/functional/cli_user/zfs_list/Makefile.am | 14 + .../tests/functional/cli_user/zfs_list/cleanup.ksh | 36 + .../tests/functional/cli_user/zfs_list/setup.ksh | 70 + .../functional/cli_user/zfs_list/zfs_list.cfg | 35 + .../functional/cli_user/zfs_list/zfs_list.kshlib | 118 + .../cli_user/zfs_list/zfs_list_001_pos.ksh | 116 + .../cli_user/zfs_list/zfs_list_002_pos.ksh | 176 + .../cli_user/zfs_list/zfs_list_003_pos.ksh | 76 + .../cli_user/zfs_list/zfs_list_004_neg.ksh | 63 + .../cli_user/zfs_list/zfs_list_007_pos.ksh | 90 + .../cli_user/zfs_list/zfs_list_008_neg.ksh | 56 + .../functional/cli_user/zpool_iostat/Makefile.am | 12 + .../functional/cli_user/zpool_iostat/cleanup.ksh | 34 + .../functional/cli_user/zpool_iostat/setup.ksh | 36 + .../zpool_iostat/zpool_iostat_-c_disable.ksh | 54 + .../zpool_iostat/zpool_iostat_-c_homedir.ksh | 76 + .../zpool_iostat/zpool_iostat_-c_searchpath.ksh | 88 + .../cli_user/zpool_iostat/zpool_iostat_001_neg.ksh | 63 + .../cli_user/zpool_iostat/zpool_iostat_002_pos.ksh | 74 + .../cli_user/zpool_iostat/zpool_iostat_003_neg.ksh | 67 + .../cli_user/zpool_iostat/zpool_iostat_004_pos.ksh | 76 + .../cli_user/zpool_iostat/zpool_iostat_005_pos.ksh | 76 + .../functional/cli_user/zpool_list/Makefile.am | 6 + .../functional/cli_user/zpool_list/cleanup.ksh | 34 + .../tests/functional/cli_user/zpool_list/setup.ksh | 36 + .../cli_user/zpool_list/zpool_list_001_pos.ksh | 64 + .../cli_user/zpool_list/zpool_list_002_neg.ksh | 57 + .../functional/cli_user/zpool_status/Makefile.am | 8 + .../functional/cli_user/zpool_status/cleanup.ksh | 30 + .../functional/cli_user/zpool_status/setup.ksh | 32 + .../zpool_status/zpool_status_-c_disable.ksh | 54 + .../zpool_status/zpool_status_-c_homedir.ksh | 76 + .../zpool_status/zpool_status_-c_searchpath.ksh | 88 + .../cli_user/zpool_status/zpool_status_003_pos.ksh | 76 + .../tests/functional/compression/Makefile.am | 15 + .../tests/functional/compression/cleanup.ksh | 34 + .../tests/functional/compression/compress.cfg | 33 + .../functional/compression/compress_001_pos.ksh | 71 + .../functional/compression/compress_002_pos.ksh | 76 + .../functional/compression/compress_003_pos.ksh | 96 + .../functional/compression/compress_004_pos.ksh | 140 + .../compression/l2arc_compressed_arc.ksh | 97 + .../compression/l2arc_compressed_arc_disabled.ksh | 98 + .../functional/compression/l2arc_encrypted.ksh | 103 + .../l2arc_encrypted_no_compressed_arc.ksh | 103 + .../tests/functional/compression/setup.ksh | 36 + .../zfs-tests/tests/functional/cp_files/.gitignore | 1 + .../tests/functional/cp_files/Makefile.am | 13 + .../tests/functional/cp_files/cleanup.ksh | 34 + .../zfs-tests/tests/functional/cp_files/cp_files.c | 58 + .../tests/functional/cp_files/cp_files_001_pos.ksh | 74 + .../zfs-tests/tests/functional/cp_files/setup.ksh | 35 + tests/zfs-tests/tests/functional/ctime/.gitignore | 1 + tests/zfs-tests/tests/functional/ctime/Makefile.am | 13 + tests/zfs-tests/tests/functional/ctime/cleanup.ksh | 34 + tests/zfs-tests/tests/functional/ctime/ctime.c | 376 + .../tests/functional/ctime/ctime_001_pos.ksh | 44 + tests/zfs-tests/tests/functional/ctime/setup.ksh | 35 + 
.../zfs-tests/tests/functional/deadman/Makefile.am | 7 + .../zfs-tests/tests/functional/deadman/deadman.cfg | 32 + .../tests/functional/deadman/deadman_sync.ksh | 86 + .../tests/functional/deadman/deadman_zio.ksh | 113 + .../tests/functional/delegate/Makefile.am | 28 + .../tests/functional/delegate/cleanup.ksh | 54 + .../tests/functional/delegate/delegate.cfg | 72 + .../functional/delegate/delegate_common.kshlib | 1713 + .../zfs-tests/tests/functional/delegate/setup.ksh | 82 + .../functional/delegate/zfs_allow_001_pos.ksh | 98 + .../functional/delegate/zfs_allow_002_pos.ksh | 95 + .../functional/delegate/zfs_allow_003_pos.ksh | 93 + .../functional/delegate/zfs_allow_004_pos.ksh | 96 + .../functional/delegate/zfs_allow_005_pos.ksh | 78 + .../functional/delegate/zfs_allow_006_pos.ksh | 72 + .../functional/delegate/zfs_allow_007_pos.ksh | 104 + .../functional/delegate/zfs_allow_008_pos.ksh | 78 + .../functional/delegate/zfs_allow_009_neg.ksh | 63 + .../functional/delegate/zfs_allow_010_pos.ksh | 194 + .../functional/delegate/zfs_allow_011_neg.ksh | 68 + .../functional/delegate/zfs_allow_012_neg.ksh | 93 + .../functional/delegate/zfs_unallow_001_pos.ksh | 65 + .../functional/delegate/zfs_unallow_002_pos.ksh | 61 + .../functional/delegate/zfs_unallow_003_pos.ksh | 71 + .../functional/delegate/zfs_unallow_004_pos.ksh | 59 + .../functional/delegate/zfs_unallow_005_pos.ksh | 73 + .../functional/delegate/zfs_unallow_006_pos.ksh | 71 + .../functional/delegate/zfs_unallow_007_neg.ksh | 64 + .../functional/delegate/zfs_unallow_008_neg.ksh | 84 + .../zfs-tests/tests/functional/devices/Makefile.am | 11 + .../zfs-tests/tests/functional/devices/cleanup.ksh | 34 + .../zfs-tests/tests/functional/devices/devices.cfg | 32 + .../tests/functional/devices/devices_001_pos.ksh | 72 + .../tests/functional/devices/devices_002_neg.ksh | 70 + .../tests/functional/devices/devices_003_pos.ksh | 49 + .../tests/functional/devices/devices_common.kshlib | 119 + tests/zfs-tests/tests/functional/devices/setup.ksh | 35 + .../zfs-tests/tests/functional/events/Makefile.am | 11 + .../zfs-tests/tests/functional/events/cleanup.ksh | 31 + tests/zfs-tests/tests/functional/events/events.cfg | 38 + .../tests/functional/events/events_001_pos.ksh | 153 + .../tests/functional/events/events_002_pos.ksh | 103 + .../tests/functional/events/events_common.kshlib | 130 + tests/zfs-tests/tests/functional/events/setup.ksh | 33 + .../tests/functional/events/zed_rc_filter.ksh | 104 + tests/zfs-tests/tests/functional/exec/.gitignore | 1 + tests/zfs-tests/tests/functional/exec/Makefile.am | 9 + tests/zfs-tests/tests/functional/exec/cleanup.ksh | 34 + .../tests/functional/exec/exec_001_pos.ksh | 63 + .../tests/functional/exec/exec_002_neg.ksh | 89 + tests/zfs-tests/tests/functional/exec/setup.ksh | 35 + .../tests/functional/fallocate/Makefile.am | 6 + .../tests/functional/fallocate/cleanup.ksh | 27 + .../functional/fallocate/fallocate_prealloc.ksh | 63 + .../functional/fallocate/fallocate_punch-hole.ksh | 97 + .../zfs-tests/tests/functional/fallocate/setup.ksh | 29 + tests/zfs-tests/tests/functional/fault/Makefile.am | 19 + .../functional/fault/auto_offline_001_pos.ksh | 191 + .../tests/functional/fault/auto_online_001_pos.ksh | 155 + .../functional/fault/auto_replace_001_pos.ksh | 108 + .../tests/functional/fault/auto_spare_001_pos.ksh | 94 + .../tests/functional/fault/auto_spare_002_pos.ksh | 96 + .../tests/functional/fault/auto_spare_ashift.ksh | 101 + .../tests/functional/fault/auto_spare_multiple.ksh | 155 + 
.../tests/functional/fault/auto_spare_shared.ksh | 119 + tests/zfs-tests/tests/functional/fault/cleanup.ksh | 36 + .../tests/functional/fault/decompress_fault.ksh | 58 + .../tests/functional/fault/decrypt_fault.ksh | 58 + tests/zfs-tests/tests/functional/fault/fault.cfg | 55 + .../functional/fault/scrub_after_resilver.ksh | 65 + tests/zfs-tests/tests/functional/fault/setup.ksh | 34 + .../tests/functional/fault/zpool_status_-s.ksh | 77 + .../tests/functional/features/Makefile.am | 3 + .../functional/features/async_destroy/Makefile.am | 5 + .../async_destroy/async_destroy_001_pos.ksh | 102 + .../functional/features/async_destroy/cleanup.ksh | 34 + .../functional/features/async_destroy/setup.ksh | 36 + .../functional/features/large_dnode/Makefile.am | 13 + .../functional/features/large_dnode/cleanup.ksh | 30 + .../features/large_dnode/large_dnode_001_pos.ksh | 82 + .../features/large_dnode/large_dnode_002_pos.ksh | 83 + .../features/large_dnode/large_dnode_003_pos.ksh | 70 + .../features/large_dnode/large_dnode_004_neg.ksh | 73 + .../features/large_dnode/large_dnode_005_pos.ksh | 80 + .../features/large_dnode/large_dnode_006_pos.ksh | 72 + .../features/large_dnode/large_dnode_007_neg.ksh | 70 + .../features/large_dnode/large_dnode_008_pos.ksh | 78 + .../features/large_dnode/large_dnode_009_pos.ksh | 71 + .../functional/features/large_dnode/setup.ksh | 32 + tests/zfs-tests/tests/functional/grow/Makefile.am | 7 + tests/zfs-tests/tests/functional/grow/grow.cfg | 42 + .../tests/functional/grow/grow_pool_001_pos.ksh | 91 + .../functional/grow/grow_replicas_001_pos.ksh | 89 + .../zfs-tests/tests/functional/history/Makefile.am | 23 + .../zfs-tests/tests/functional/history/cleanup.ksh | 36 + .../zfs-tests/tests/functional/history/history.cfg | 49 + .../tests/functional/history/history_001_pos.ksh | 122 + .../tests/functional/history/history_002_pos.ksh | 198 + .../tests/functional/history/history_003_pos.ksh | 99 + .../tests/functional/history/history_004_pos.ksh | 100 + .../tests/functional/history/history_005_neg.ksh | 65 + .../tests/functional/history/history_006_neg.ksh | 88 + .../tests/functional/history/history_007_pos.ksh | 112 + .../tests/functional/history/history_008_pos.ksh | 74 + .../tests/functional/history/history_009_pos.ksh | 114 + .../tests/functional/history/history_010_pos.ksh | 88 + .../tests/functional/history/history_common.kshlib | 422 + .../functional/history/i386.migratedpool.DAT.Z | Bin 0 -> 173047 bytes .../tests/functional/history/i386.orig_history.txt | 12 + tests/zfs-tests/tests/functional/history/setup.ksh | 35 + .../functional/history/sparc.migratedpool.DAT.Z | Bin 0 -> 163879 bytes .../functional/history/sparc.orig_history.txt | 12 + .../tests/functional/history/zfs-pool-v4.dat.Z | Bin 0 -> 73415 bytes tests/zfs-tests/tests/functional/hkdf/.gitignore | 1 + tests/zfs-tests/tests/functional/hkdf/Makefile.am | 17 + tests/zfs-tests/tests/functional/hkdf/cleanup.ksh | 22 + tests/zfs-tests/tests/functional/hkdf/hkdf_test.c | 235 + .../tests/functional/hkdf/run_hkdf_test.ksh | 30 + tests/zfs-tests/tests/functional/hkdf/setup.ksh | 22 + .../tests/functional/inheritance/Makefile.am | 57 + .../tests/functional/inheritance/README.config | 67 + .../tests/functional/inheritance/README.state | 109 + .../tests/functional/inheritance/cleanup.ksh | 35 + .../tests/functional/inheritance/config001.cfg | 33 + .../tests/functional/inheritance/config002.cfg | 33 + .../tests/functional/inheritance/config003.cfg | 33 + .../tests/functional/inheritance/config004.cfg | 33 + 
.../tests/functional/inheritance/config005.cfg | 33 + .../tests/functional/inheritance/config006.cfg | 33 + .../tests/functional/inheritance/config007.cfg | 33 + .../tests/functional/inheritance/config008.cfg | 33 + .../tests/functional/inheritance/config009.cfg | 33 + .../tests/functional/inheritance/config010.cfg | 33 + .../tests/functional/inheritance/config011.cfg | 33 + .../tests/functional/inheritance/config012.cfg | 33 + .../tests/functional/inheritance/config013.cfg | 33 + .../tests/functional/inheritance/config014.cfg | 33 + .../tests/functional/inheritance/config015.cfg | 33 + .../tests/functional/inheritance/config016.cfg | 33 + .../tests/functional/inheritance/config017.cfg | 33 + .../tests/functional/inheritance/config018.cfg | 33 + .../tests/functional/inheritance/config019.cfg | 33 + .../tests/functional/inheritance/config020.cfg | 33 + .../tests/functional/inheritance/config021.cfg | 33 + .../tests/functional/inheritance/config022.cfg | 33 + .../tests/functional/inheritance/config023.cfg | 33 + .../tests/functional/inheritance/config024.cfg | 33 + .../tests/functional/inheritance/inherit.kshlib | 114 + .../functional/inheritance/inherit_001_pos.ksh | 452 + .../tests/functional/inheritance/state001.cfg | 44 + .../tests/functional/inheritance/state002.cfg | 45 + .../tests/functional/inheritance/state003.cfg | 43 + .../tests/functional/inheritance/state004.cfg | 44 + .../tests/functional/inheritance/state005.cfg | 45 + .../tests/functional/inheritance/state006.cfg | 47 + .../tests/functional/inheritance/state007.cfg | 45 + .../tests/functional/inheritance/state008.cfg | 44 + .../tests/functional/inheritance/state009.cfg | 57 + .../tests/functional/inheritance/state010.cfg | 56 + .../tests/functional/inheritance/state011.cfg | 58 + .../tests/functional/inheritance/state012.cfg | 62 + .../tests/functional/inheritance/state013.cfg | 56 + .../tests/functional/inheritance/state014.cfg | 62 + .../tests/functional/inheritance/state015.cfg | 66 + .../tests/functional/inheritance/state016.cfg | 62 + .../tests/functional/inheritance/state017.cfg | 67 + .../tests/functional/inheritance/state018.cfg | 64 + .../tests/functional/inheritance/state019.cfg | 63 + .../tests/functional/inheritance/state020.cfg | 64 + .../tests/functional/inheritance/state021.cfg | 64 + .../tests/functional/inheritance/state022.cfg | 63 + .../tests/functional/inheritance/state023.cfg | 65 + .../tests/functional/inheritance/state024.cfg | 63 + tests/zfs-tests/tests/functional/inuse/Makefile.am | 14 + tests/zfs-tests/tests/functional/inuse/inuse.cfg | 62 + .../tests/functional/inuse/inuse_001_pos.ksh | 84 + .../tests/functional/inuse/inuse_003_pos.ksh | 168 + .../tests/functional/inuse/inuse_004_pos.ksh | 103 + .../tests/functional/inuse/inuse_005_pos.ksh | 96 + .../tests/functional/inuse/inuse_006_pos.ksh | 95 + .../tests/functional/inuse/inuse_007_pos.ksh | 101 + .../tests/functional/inuse/inuse_008_pos.ksh | 97 + .../tests/functional/inuse/inuse_009_pos.ksh | 94 + tests/zfs-tests/tests/functional/inuse/setup.ksh | 36 + tests/zfs-tests/tests/functional/io/Makefile.am | 12 + tests/zfs-tests/tests/functional/io/cleanup.ksh | 31 + tests/zfs-tests/tests/functional/io/io.cfg | 25 + tests/zfs-tests/tests/functional/io/libaio.ksh | 65 + tests/zfs-tests/tests/functional/io/mmap.ksh | 69 + tests/zfs-tests/tests/functional/io/posixaio.ksh | 65 + tests/zfs-tests/tests/functional/io/psync.ksh | 65 + tests/zfs-tests/tests/functional/io/setup.ksh | 31 + tests/zfs-tests/tests/functional/io/sync.ksh | 65 + 
.../tests/functional/large_files/Makefile.am | 6 + .../tests/functional/large_files/cleanup.ksh | 34 + .../functional/large_files/large_files_001_pos.ksh | 53 + .../functional/large_files/large_files_002_pos.ksh | 59 + .../tests/functional/large_files/setup.ksh | 36 + .../tests/functional/largest_pool/Makefile.am | 6 + .../tests/functional/largest_pool/largest_pool.cfg | 43 + .../largest_pool/largest_pool_001_pos.ksh | 161 + tests/zfs-tests/tests/functional/libzfs/.gitignore | 1 + .../zfs-tests/tests/functional/libzfs/Makefile.am | 17 + .../zfs-tests/tests/functional/libzfs/cleanup.ksh | 34 + .../tests/functional/libzfs/libzfs_input.ksh | 30 + tests/zfs-tests/tests/functional/libzfs/many_fds.c | 73 + tests/zfs-tests/tests/functional/libzfs/setup.ksh | 35 + .../zfs-tests/tests/functional/limits/Makefile.am | 9 + .../zfs-tests/tests/functional/limits/cleanup.ksh | 21 + .../tests/functional/limits/filesystem_count.ksh | 123 + .../tests/functional/limits/filesystem_limit.ksh | 139 + tests/zfs-tests/tests/functional/limits/setup.ksh | 28 + .../tests/functional/limits/snapshot_count.ksh | 100 + .../tests/functional/limits/snapshot_limit.ksh | 166 + .../tests/functional/link_count/Makefile.am | 6 + .../tests/functional/link_count/cleanup.ksh | 34 + .../tests/functional/link_count/link_count_001.ksh | 95 + .../link_count/link_count_root_inode.ksh | 119 + .../tests/functional/link_count/setup.ksh | 36 + .../tests/functional/log_spacemap/Makefile.am | 2 + .../log_spacemap/log_spacemap_import_logs.ksh | 81 + .../tests/functional/migration/Makefile.am | 20 + .../tests/functional/migration/cleanup.ksh | 63 + .../tests/functional/migration/migration.cfg | 110 + .../tests/functional/migration/migration.kshlib | 153 + .../functional/migration/migration_001_pos.ksh | 66 + .../functional/migration/migration_002_pos.ksh | 66 + .../functional/migration/migration_003_pos.ksh | 66 + .../functional/migration/migration_004_pos.ksh | 73 + .../functional/migration/migration_005_pos.ksh | 73 + .../functional/migration/migration_006_pos.ksh | 73 + .../functional/migration/migration_007_pos.ksh | 66 + .../functional/migration/migration_008_pos.ksh | 66 + .../functional/migration/migration_009_pos.ksh | 66 + .../functional/migration/migration_010_pos.ksh | 66 + .../functional/migration/migration_011_pos.ksh | 66 + .../functional/migration/migration_012_pos.ksh | 66 + .../zfs-tests/tests/functional/migration/setup.ksh | 66 + tests/zfs-tests/tests/functional/mmap/Makefile.am | 10 + tests/zfs-tests/tests/functional/mmap/cleanup.ksh | 34 + tests/zfs-tests/tests/functional/mmap/mmap.cfg | 59 + .../tests/functional/mmap/mmap_libaio_001_pos.ksh | 61 + .../tests/functional/mmap/mmap_read_001_pos.ksh | 57 + .../tests/functional/mmap/mmap_write_001_pos.ksh | 66 + tests/zfs-tests/tests/functional/mmap/setup.ksh | 36 + tests/zfs-tests/tests/functional/mmp/Makefile.am | 21 + tests/zfs-tests/tests/functional/mmp/cleanup.ksh | 28 + tests/zfs-tests/tests/functional/mmp/mmp.cfg | 46 + tests/zfs-tests/tests/functional/mmp/mmp.kshlib | 333 + .../tests/functional/mmp/mmp_active_import.ksh | 125 + .../tests/functional/mmp/mmp_exported_import.ksh | 106 + .../zfs-tests/tests/functional/mmp/mmp_hostid.ksh | 94 + .../tests/functional/mmp/mmp_inactive_import.ksh | 111 + .../tests/functional/mmp/mmp_interval.ksh | 47 + .../zfs-tests/tests/functional/mmp/mmp_on_off.ksh | 79 + .../tests/functional/mmp/mmp_on_thread.ksh | 64 + .../tests/functional/mmp/mmp_on_uberblocks.ksh | 85 + .../zfs-tests/tests/functional/mmp/mmp_on_zdb.ksh | 81 + 
.../tests/functional/mmp/mmp_reset_interval.ksh | 119 + .../functional/mmp/mmp_write_distribution.ksh | 92 + .../tests/functional/mmp/mmp_write_uberblocks.ksh | 58 + .../tests/functional/mmp/multihost_history.ksh | 67 + tests/zfs-tests/tests/functional/mmp/setup.ksh | 34 + tests/zfs-tests/tests/functional/mount/Makefile.am | 7 + tests/zfs-tests/tests/functional/mount/cleanup.ksh | 38 + tests/zfs-tests/tests/functional/mount/setup.ksh | 48 + .../tests/functional/mount/umount_001.ksh | 54 + .../functional/mount/umount_unlinked_drain.ksh | 118 + .../tests/functional/mount/umountall_001.ksh | 81 + .../tests/functional/mv_files/Makefile.am | 11 + .../tests/functional/mv_files/cleanup.ksh | 48 + .../tests/functional/mv_files/mv_files.cfg | 50 + .../tests/functional/mv_files/mv_files_001_pos.ksh | 69 + .../tests/functional/mv_files/mv_files_002_pos.ksh | 71 + .../functional/mv_files/mv_files_common.kshlib | 205 + .../tests/functional/mv_files/random_creation.ksh | 48 + .../zfs-tests/tests/functional/mv_files/setup.ksh | 47 + .../tests/functional/nestedfs/Makefile.am | 5 + .../tests/functional/nestedfs/cleanup.ksh | 34 + .../tests/functional/nestedfs/nestedfs_001_pos.ksh | 59 + .../zfs-tests/tests/functional/nestedfs/setup.ksh | 36 + .../tests/functional/no_space/Makefile.am | 11 + .../tests/functional/no_space/cleanup.ksh | 39 + .../zfs-tests/tests/functional/no_space/enospc.cfg | 39 + .../tests/functional/no_space/enospc_001_pos.ksh | 77 + .../tests/functional/no_space/enospc_002_pos.ksh | 92 + .../tests/functional/no_space/enospc_003_pos.ksh | 71 + .../tests/functional/no_space/enospc_df.ksh | 73 + .../zfs-tests/tests/functional/no_space/setup.ksh | 40 + .../tests/functional/nopwrite/Makefile.am | 15 + .../tests/functional/nopwrite/cleanup.ksh | 20 + .../tests/functional/nopwrite/nopwrite.shlib | 68 + .../tests/functional/nopwrite/nopwrite_copies.ksh | 71 + .../tests/functional/nopwrite/nopwrite_mtime.ksh | 89 + .../functional/nopwrite/nopwrite_negative.ksh | 90 + .../nopwrite/nopwrite_promoted_clone.ksh | 58 + .../tests/functional/nopwrite/nopwrite_recsize.ksh | 57 + .../tests/functional/nopwrite/nopwrite_sync.ksh | 55 + .../nopwrite/nopwrite_varying_compression.ksh | 65 + .../tests/functional/nopwrite/nopwrite_volume.ksh | 59 + .../zfs-tests/tests/functional/nopwrite/setup.ksh | 23 + .../tests/functional/online_offline/Makefile.am | 10 + .../tests/functional/online_offline/cleanup.ksh | 36 + .../functional/online_offline/online_offline.cfg | 42 + .../online_offline/online_offline_001_pos.ksh | 98 + .../online_offline/online_offline_002_neg.ksh | 137 + .../online_offline/online_offline_003_neg.ksh | 81 + .../tests/functional/online_offline/setup.ksh | 47 + tests/zfs-tests/tests/functional/pam/Makefile.am | 7 + tests/zfs-tests/tests/functional/pam/cleanup.ksh | 32 + tests/zfs-tests/tests/functional/pam/pam_basic.ksh | 49 + .../tests/functional/pam/pam_nounmount.ksh | 51 + tests/zfs-tests/tests/functional/pam/setup.ksh | 41 + .../tests/functional/pam/utilities.kshlib | 40 + .../tests/functional/persist_l2arc/Makefile.am | 15 + .../tests/functional/persist_l2arc/cleanup.ksh | 31 + .../functional/persist_l2arc/persist_l2arc.cfg | 37 + .../persist_l2arc/persist_l2arc_001_pos.ksh | 106 + .../persist_l2arc/persist_l2arc_002_pos.ksh | 112 + .../persist_l2arc/persist_l2arc_003_neg.ksh | 87 + .../persist_l2arc/persist_l2arc_004_pos.ksh | 101 + .../persist_l2arc/persist_l2arc_005_pos.ksh | 108 + .../persist_l2arc/persist_l2arc_006_pos.ksh | 98 + .../persist_l2arc/persist_l2arc_007_pos.ksh | 95 + 
.../persist_l2arc/persist_l2arc_008_pos.ksh | 143 + .../tests/functional/persist_l2arc/setup.ksh | 29 + .../tests/functional/pool_checkpoint/Makefile.am | 26 + .../pool_checkpoint/checkpoint_after_rewind.ksh | 55 + .../pool_checkpoint/checkpoint_big_rewind.ksh | 57 + .../pool_checkpoint/checkpoint_capacity.ksh | 92 + .../pool_checkpoint/checkpoint_conf_change.ksh | 43 + .../pool_checkpoint/checkpoint_discard.ksh | 53 + .../pool_checkpoint/checkpoint_discard_busy.ksh | 110 + .../pool_checkpoint/checkpoint_discard_many.ksh | 52 + .../pool_checkpoint/checkpoint_indirect.ksh | 59 + .../pool_checkpoint/checkpoint_invalid.ksh | 80 + .../pool_checkpoint/checkpoint_lun_expsz.ksh | 61 + .../functional/pool_checkpoint/checkpoint_open.ksh | 48 + .../pool_checkpoint/checkpoint_removal.ksh | 72 + .../pool_checkpoint/checkpoint_rewind.ksh | 49 + .../pool_checkpoint/checkpoint_ro_rewind.ksh | 57 + .../pool_checkpoint/checkpoint_sm_scale.ksh | 92 + .../pool_checkpoint/checkpoint_twice.ksh | 40 + .../pool_checkpoint/checkpoint_vdev_add.ksh | 63 + .../functional/pool_checkpoint/checkpoint_zdb.ksh | 98 + .../pool_checkpoint/checkpoint_zhack_feat.ksh | 66 + .../tests/functional/pool_checkpoint/cleanup.ksh | 23 + .../pool_checkpoint/pool_checkpoint.kshlib | 413 + .../tests/functional/pool_checkpoint/setup.ksh | 25 + .../tests/functional/pool_names/Makefile.am | 4 + .../functional/pool_names/pool_names_001_pos.ksh | 116 + .../functional/pool_names/pool_names_002_neg.ksh | 129 + .../tests/functional/poolversion/Makefile.am | 6 + .../tests/functional/poolversion/cleanup.ksh | 42 + .../functional/poolversion/poolversion_001_pos.ksh | 58 + .../functional/poolversion/poolversion_002_pos.ksh | 71 + .../tests/functional/poolversion/setup.ksh | 45 + .../tests/functional/privilege/Makefile.am | 6 + .../tests/functional/privilege/cleanup.ksh | 55 + .../functional/privilege/privilege_001_pos.ksh | 95 + .../functional/privilege/privilege_002_pos.ksh | 105 + .../zfs-tests/tests/functional/privilege/setup.ksh | 67 + .../zfs-tests/tests/functional/procfs/Makefile.am | 8 + .../zfs-tests/tests/functional/procfs/cleanup.ksh | 29 + .../tests/functional/procfs/pool_state.ksh | 146 + .../tests/functional/procfs/procfs_list_basic.ksh | 95 + .../procfs/procfs_list_concurrent_readers.ksh | 82 + .../functional/procfs/procfs_list_stale_read.ksh | 98 + tests/zfs-tests/tests/functional/procfs/setup.ksh | 29 + .../tests/functional/projectquota/Makefile.am | 27 + .../tests/functional/projectquota/cleanup.ksh | 37 + .../functional/projectquota/projectid_001_pos.ksh | 113 + .../functional/projectquota/projectid_002_pos.ksh | 88 + .../functional/projectquota/projectid_003_pos.ksh | 81 + .../tests/functional/projectquota/projectquota.cfg | 46 + .../projectquota/projectquota_001_pos.ksh | 88 + .../projectquota/projectquota_002_pos.ksh | 86 + .../projectquota/projectquota_003_pos.ksh | 98 + .../projectquota/projectquota_004_neg.ksh | 87 + .../projectquota/projectquota_005_pos.ksh | 68 + .../projectquota/projectquota_006_pos.ksh | 75 + .../projectquota/projectquota_007_pos.ksh | 58 + .../projectquota/projectquota_008_pos.ksh | 91 + .../projectquota/projectquota_009_pos.ksh | 131 + .../projectquota/projectquota_common.kshlib | 101 + .../projectquota/projectspace_001_pos.ksh | 93 + .../projectquota/projectspace_002_pos.ksh | 85 + .../projectquota/projectspace_003_pos.ksh | 118 + .../projectquota/projectspace_004_pos.ksh | 76 + .../projectquota/projecttree_001_pos.ksh | 108 + .../projectquota/projecttree_002_pos.ksh | 120 + 
.../projectquota/projecttree_003_neg.ksh | 103 + .../tests/functional/projectquota/setup.ksh | 56 + tests/zfs-tests/tests/functional/pyzfs/.gitignore | 1 + tests/zfs-tests/tests/functional/pyzfs/Makefile.am | 7 + .../tests/functional/pyzfs/pyzfs_unittest.ksh.in | 57 + tests/zfs-tests/tests/functional/quota/Makefile.am | 14 + tests/zfs-tests/tests/functional/quota/cleanup.ksh | 34 + tests/zfs-tests/tests/functional/quota/quota.cfg | 35 + .../zfs-tests/tests/functional/quota/quota.kshlib | 97 + .../tests/functional/quota/quota_001_pos.ksh | 77 + .../tests/functional/quota/quota_002_pos.ksh | 76 + .../tests/functional/quota/quota_003_pos.ksh | 80 + .../tests/functional/quota/quota_004_pos.ksh | 77 + .../tests/functional/quota/quota_005_pos.ksh | 76 + .../tests/functional/quota/quota_006_neg.ksh | 71 + tests/zfs-tests/tests/functional/quota/setup.ksh | 36 + tests/zfs-tests/tests/functional/raidz/Makefile.am | 6 + tests/zfs-tests/tests/functional/raidz/cleanup.ksh | 30 + .../tests/functional/raidz/raidz_001_neg.ksh | 38 + .../tests/functional/raidz/raidz_002_pos.ksh | 41 + tests/zfs-tests/tests/functional/raidz/setup.ksh | 32 + .../tests/functional/redacted_send/Makefile.am | 25 + .../tests/functional/redacted_send/cleanup.ksh | 33 + .../tests/functional/redacted_send/redacted.cfg | 86 + .../tests/functional/redacted_send/redacted.kshlib | 266 + .../redacted_send/redacted_compressed.ksh | 71 + .../functional/redacted_send/redacted_contents.ksh | 162 + .../functional/redacted_send/redacted_deleted.ksh | 103 + .../redacted_send/redacted_disabled_feature.ksh | 71 + .../functional/redacted_send/redacted_embedded.ksh | 103 + .../functional/redacted_send/redacted_holes.ksh | 120 + .../redacted_send/redacted_incrementals.ksh | 152 + .../redacted_send/redacted_largeblocks.ksh | 63 + .../redacted_send/redacted_many_clones.ksh | 68 + .../redacted_send/redacted_mixed_recsize.ksh | 77 + .../functional/redacted_send/redacted_mounts.ksh | 109 + .../functional/redacted_send/redacted_negative.ksh | 92 + .../functional/redacted_send/redacted_origin.ksh | 87 + .../functional/redacted_send/redacted_props.ksh | 77 + .../functional/redacted_send/redacted_resume.ksh | 88 + .../functional/redacted_send/redacted_size.ksh | 64 + .../functional/redacted_send/redacted_volume.ksh | 105 + .../tests/functional/redacted_send/setup.ksh | 36 + .../tests/functional/redundancy/Makefile.am | 12 + .../tests/functional/redundancy/cleanup.ksh | 38 + .../tests/functional/redundancy/redundancy.cfg | 38 + .../tests/functional/redundancy/redundancy.kshlib | 335 + .../functional/redundancy/redundancy_001_pos.ksh | 77 + .../functional/redundancy/redundancy_002_pos.ksh | 84 + .../functional/redundancy/redundancy_003_pos.ksh | 94 + .../functional/redundancy/redundancy_004_neg.ksh | 62 + .../tests/functional/redundancy/setup.ksh | 36 + .../tests/functional/refquota/Makefile.am | 12 + .../tests/functional/refquota/cleanup.ksh | 35 + .../tests/functional/refquota/refquota_001_pos.ksh | 77 + .../tests/functional/refquota/refquota_002_pos.ksh | 90 + .../tests/functional/refquota/refquota_003_pos.ksh | 83 + .../tests/functional/refquota/refquota_004_pos.ksh | 76 + .../tests/functional/refquota/refquota_005_pos.ksh | 77 + .../tests/functional/refquota/refquota_006_neg.ksh | 70 + .../tests/functional/refquota/refquota_007_neg.ksh | 61 + .../tests/functional/refquota/refquota_008_neg.ksh | 71 + .../zfs-tests/tests/functional/refquota/setup.ksh | 36 + .../tests/functional/refreserv/Makefile.am | 14 + .../tests/functional/refreserv/cleanup.ksh | 35 + 
.../tests/functional/refreserv/refreserv.cfg | 31 + .../functional/refreserv/refreserv_001_pos.ksh | 75 + .../functional/refreserv/refreserv_002_pos.ksh | 114 + .../functional/refreserv/refreserv_003_pos.ksh | 77 + .../functional/refreserv/refreserv_004_pos.ksh | 90 + .../functional/refreserv/refreserv_005_pos.ksh | 73 + .../functional/refreserv/refreserv_multi_raidz.ksh | 201 + .../tests/functional/refreserv/refreserv_raidz.ksh | 135 + .../zfs-tests/tests/functional/refreserv/setup.ksh | 36 + .../zfs-tests/tests/functional/removal/Makefile.am | 37 + .../zfs-tests/tests/functional/removal/cleanup.ksh | 23 + .../tests/functional/removal/removal.kshlib | 156 + .../tests/functional/removal/removal_all_vdev.ksh | 39 + .../tests/functional/removal/removal_cancel.ksh | 94 + .../functional/removal/removal_check_space.ksh | 44 + .../functional/removal/removal_condense_export.ksh | 93 + .../removal/removal_multiple_indirection.ksh | 93 + .../tests/functional/removal/removal_nopwrite.ksh | 87 + .../functional/removal/removal_remap_deadlists.ksh | 83 + .../functional/removal/removal_reservation.ksh | 63 + .../functional/removal/removal_resume_export.ksh | 100 + .../tests/functional/removal/removal_sanity.ksh | 39 + .../tests/functional/removal/removal_with_add.ksh | 48 + .../functional/removal/removal_with_create_fs.ksh | 36 + .../functional/removal/removal_with_dedup.ksh | 49 + .../functional/removal/removal_with_errors.ksh | 114 + .../functional/removal/removal_with_export.ksh | 47 + .../functional/removal/removal_with_faulted.ksh | 104 + .../functional/removal/removal_with_ganging.ksh | 47 + .../functional/removal/removal_with_remove.ksh | 35 + .../functional/removal/removal_with_scrub.ksh | 29 + .../tests/functional/removal/removal_with_send.ksh | 37 + .../functional/removal/removal_with_send_recv.ksh | 38 + .../functional/removal/removal_with_snapshot.ksh | 36 + .../functional/removal/removal_with_write.ksh | 34 + .../tests/functional/removal/removal_with_zdb.ksh | 78 + .../tests/functional/removal/remove_expanded.ksh | 89 + .../tests/functional/removal/remove_indirect.ksh | 58 + .../tests/functional/removal/remove_mirror.ksh | 57 + .../functional/removal/remove_mirror_sanity.ksh | 58 + .../tests/functional/removal/remove_raidz.ksh | 50 + .../tests/functional/rename_dirs/Makefile.am | 5 + .../tests/functional/rename_dirs/cleanup.ksh | 34 + .../functional/rename_dirs/rename_dirs_001_pos.ksh | 71 + .../tests/functional/rename_dirs/setup.ksh | 35 + .../tests/functional/replacement/Makefile.am | 21 + .../tests/functional/replacement/attach_import.ksh | 67 + .../functional/replacement/attach_multiple.ksh | 111 + .../functional/replacement/attach_rebuild.ksh | 173 + .../functional/replacement/attach_resilver.ksh | 172 + .../tests/functional/replacement/cleanup.ksh | 36 + .../tests/functional/replacement/detach.ksh | 161 + .../replacement/rebuild_disabled_feature.ksh | 78 + .../functional/replacement/rebuild_multiple.ksh | 126 + .../tests/functional/replacement/rebuild_raidz.ksh | 70 + .../functional/replacement/replace_import.ksh | 67 + .../functional/replacement/replace_rebuild.ksh | 158 + .../functional/replacement/replace_resilver.ksh | 155 + .../tests/functional/replacement/replacement.cfg | 43 + .../replacement/resilver_restart_001.ksh | 187 + .../replacement/resilver_restart_002.ksh | 102 + .../tests/functional/replacement/scrub_cancel.ksh | 112 + .../tests/functional/replacement/setup.ksh | 47 + .../tests/functional/reservation/Makefile.am | 30 + .../tests/functional/reservation/cleanup.ksh | 
34 + .../tests/functional/reservation/reservation.cfg | 44 + .../tests/functional/reservation/reservation.shlib | 201 + .../functional/reservation/reservation_001_pos.ksh | 124 + .../functional/reservation/reservation_002_pos.ksh | 100 + .../functional/reservation/reservation_003_pos.ksh | 134 + .../functional/reservation/reservation_004_pos.ksh | 130 + .../functional/reservation/reservation_005_pos.ksh | 118 + .../functional/reservation/reservation_006_pos.ksh | 81 + .../functional/reservation/reservation_007_pos.ksh | 128 + .../functional/reservation/reservation_008_pos.ksh | 124 + .../functional/reservation/reservation_009_pos.ksh | 100 + .../functional/reservation/reservation_010_pos.ksh | 101 + .../functional/reservation/reservation_011_pos.ksh | 75 + .../functional/reservation/reservation_012_pos.ksh | 88 + .../functional/reservation/reservation_013_pos.ksh | 112 + .../functional/reservation/reservation_014_pos.ksh | 116 + .../functional/reservation/reservation_015_pos.ksh | 99 + .../functional/reservation/reservation_016_pos.ksh | 98 + .../functional/reservation/reservation_017_pos.ksh | 101 + .../functional/reservation/reservation_018_pos.ksh | 72 + .../functional/reservation/reservation_019_pos.ksh | 63 + .../functional/reservation/reservation_020_pos.ksh | 64 + .../functional/reservation/reservation_021_neg.ksh | 72 + .../functional/reservation/reservation_022_pos.ksh | 82 + .../tests/functional/reservation/setup.ksh | 35 + .../tests/functional/rootpool/Makefile.am | 7 + .../tests/functional/rootpool/cleanup.ksh | 36 + .../tests/functional/rootpool/rootpool_002_neg.ksh | 69 + .../tests/functional/rootpool/rootpool_003_neg.ksh | 61 + .../tests/functional/rootpool/rootpool_007_pos.ksh | 71 + .../zfs-tests/tests/functional/rootpool/setup.ksh | 42 + tests/zfs-tests/tests/functional/rsend/Makefile.am | 63 + tests/zfs-tests/tests/functional/rsend/cleanup.ksh | 47 + .../tests/functional/rsend/dedup.zsend.bz2 | Bin 0 -> 18561 bytes .../functional/rsend/dedup_encrypted_zvol.bz2 | Bin 0 -> 1482112 bytes .../rsend/dedup_encrypted_zvol.zsend.bz2 | Bin 0 -> 836685 bytes tests/zfs-tests/tests/functional/rsend/fs.tar.gz | Bin 0 -> 13196 bytes .../tests/functional/rsend/recv_dedup.ksh | 53 + .../functional/rsend/recv_dedup_encrypted_zvol.ksh | 60 + tests/zfs-tests/tests/functional/rsend/rsend.cfg | 39 + .../zfs-tests/tests/functional/rsend/rsend.kshlib | 850 + .../tests/functional/rsend/rsend_001_pos.ksh | 73 + .../tests/functional/rsend/rsend_002_pos.ksh | 93 + .../tests/functional/rsend/rsend_003_pos.ksh | 95 + .../tests/functional/rsend/rsend_004_pos.ksh | 120 + .../tests/functional/rsend/rsend_005_pos.ksh | 104 + .../tests/functional/rsend/rsend_006_pos.ksh | 82 + .../tests/functional/rsend/rsend_007_pos.ksh | 97 + .../tests/functional/rsend/rsend_008_pos.ksh | 131 + .../tests/functional/rsend/rsend_009_pos.ksh | 92 + .../tests/functional/rsend/rsend_010_pos.ksh | 77 + .../tests/functional/rsend/rsend_011_pos.ksh | 120 + .../tests/functional/rsend/rsend_012_pos.ksh | 186 + .../tests/functional/rsend/rsend_013_pos.ksh | 86 + .../tests/functional/rsend/rsend_014_pos.ksh | 56 + .../tests/functional/rsend/rsend_016_neg.ksh | 33 + .../tests/functional/rsend/rsend_019_pos.ksh | 54 + .../tests/functional/rsend/rsend_020_pos.ksh | 50 + .../tests/functional/rsend/rsend_021_pos.ksh | 53 + .../tests/functional/rsend/rsend_022_pos.ksh | 61 + .../tests/functional/rsend/rsend_024_pos.ksh | 53 + .../tests/functional/rsend/send-L_toggle.ksh | 65 + .../functional/rsend/send-c_embedded_blocks.ksh | 109 + 
.../tests/functional/rsend/send-c_incremental.ksh | 100 + .../tests/functional/rsend/send-c_lz4_disabled.ksh | 73 + .../functional/rsend/send-c_mixed_compression.ksh | 54 + .../tests/functional/rsend/send-c_props.ksh | 67 + .../tests/functional/rsend/send-c_recv_dedup.ksh | 55 + .../functional/rsend/send-c_recv_lz4_disabled.ksh | 68 + .../tests/functional/rsend/send-c_resume.ksh | 49 + .../rsend/send-c_stream_size_estimate.ksh | 97 + .../functional/rsend/send-c_verify_contents.ksh | 55 + .../tests/functional/rsend/send-c_verify_ratio.ksh | 67 + .../tests/functional/rsend/send-c_volume.ksh | 78 + .../tests/functional/rsend/send-c_zstreamdump.ksh | 75 + .../functional/rsend/send-cpL_varied_recsize.ksh | 203 + .../functional/rsend/send-wR_encrypted_zvol.ksh | 108 + .../functional/rsend/send_encrypted_files.ksh | 120 + .../functional/rsend/send_encrypted_hierarchy.ksh | 96 + .../functional/rsend/send_encrypted_props.ksh | 215 + .../rsend/send_encrypted_truncated_files.ksh | 126 + .../tests/functional/rsend/send_freeobjects.ksh | 81 + .../tests/functional/rsend/send_holds.ksh | 177 + .../tests/functional/rsend/send_hole_birth.ksh | 123 + .../tests/functional/rsend/send_mixed_raw.ksh | 118 + .../functional/rsend/send_partial_dataset.ksh | 110 + .../functional/rsend/send_realloc_dnode_size.ksh | 112 + .../rsend/send_realloc_encrypted_files.ksh | 124 + .../tests/functional/rsend/send_realloc_files.ksh | 111 + .../tests/functional/rsend/send_spill_block.ksh | 155 + tests/zfs-tests/tests/functional/rsend/setup.ksh | 46 + .../tests/functional/scrub_mirror/Makefile.am | 12 + .../tests/functional/scrub_mirror/cleanup.ksh | 42 + .../tests/functional/scrub_mirror/default.cfg | 37 + .../scrub_mirror/scrub_mirror_001_pos.ksh | 53 + .../scrub_mirror/scrub_mirror_002_pos.ksh | 53 + .../scrub_mirror/scrub_mirror_003_pos.ksh | 53 + .../scrub_mirror/scrub_mirror_004_pos.ksh | 53 + .../scrub_mirror/scrub_mirror_common.kshlib | 76 + .../tests/functional/scrub_mirror/setup.ksh | 42 + tests/zfs-tests/tests/functional/slog/Makefile.am | 26 + tests/zfs-tests/tests/functional/slog/cleanup.ksh | 54 + tests/zfs-tests/tests/functional/slog/setup.ksh | 41 + tests/zfs-tests/tests/functional/slog/slog.cfg | 39 + tests/zfs-tests/tests/functional/slog/slog.kshlib | 157 + .../tests/functional/slog/slog_001_pos.ksh | 69 + .../tests/functional/slog/slog_002_pos.ksh | 68 + .../tests/functional/slog/slog_003_pos.ksh | 75 + .../tests/functional/slog/slog_004_pos.ksh | 74 + .../tests/functional/slog/slog_005_pos.ksh | 66 + .../tests/functional/slog/slog_006_pos.ksh | 73 + .../tests/functional/slog/slog_007_pos.ksh | 94 + .../tests/functional/slog/slog_008_neg.ksh | 65 + .../tests/functional/slog/slog_009_neg.ksh | 70 + .../tests/functional/slog/slog_010_neg.ksh | 65 + .../tests/functional/slog/slog_011_neg.ksh | 71 + .../tests/functional/slog/slog_012_neg.ksh | 74 + .../tests/functional/slog/slog_013_pos.ksh | 94 + .../tests/functional/slog/slog_014_pos.ksh | 90 + .../tests/functional/slog/slog_015_neg.ksh | 74 + .../tests/functional/slog/slog_replay_fs_001.ksh | 223 + .../tests/functional/slog/slog_replay_fs_002.ksh | 137 + .../tests/functional/slog/slog_replay_volume.ksh | 179 + .../tests/functional/snapshot/Makefile.am | 28 + .../tests/functional/snapshot/cleanup.ksh | 38 + .../tests/functional/snapshot/clone_001_pos.ksh | 171 + .../tests/functional/snapshot/rollback_001_pos.ksh | 115 + .../tests/functional/snapshot/rollback_002_pos.ksh | 133 + .../tests/functional/snapshot/rollback_003_pos.ksh | 111 + 
.../zfs-tests/tests/functional/snapshot/setup.ksh | 40 + .../tests/functional/snapshot/snapshot.cfg | 53 + .../tests/functional/snapshot/snapshot_001_pos.ksh | 91 + .../tests/functional/snapshot/snapshot_002_pos.ksh | 129 + .../tests/functional/snapshot/snapshot_003_pos.ksh | 103 + .../tests/functional/snapshot/snapshot_004_pos.ksh | 90 + .../tests/functional/snapshot/snapshot_005_pos.ksh | 90 + .../tests/functional/snapshot/snapshot_006_pos.ksh | 127 + .../tests/functional/snapshot/snapshot_007_pos.ksh | 107 + .../tests/functional/snapshot/snapshot_008_pos.ksh | 103 + .../tests/functional/snapshot/snapshot_009_pos.ksh | 117 + .../tests/functional/snapshot/snapshot_010_pos.ksh | 100 + .../tests/functional/snapshot/snapshot_011_pos.ksh | 113 + .../tests/functional/snapshot/snapshot_012_pos.ksh | 104 + .../tests/functional/snapshot/snapshot_013_pos.ksh | 99 + .../tests/functional/snapshot/snapshot_014_pos.ksh | 78 + .../tests/functional/snapshot/snapshot_015_pos.ksh | 121 + .../tests/functional/snapshot/snapshot_016_pos.ksh | 103 + .../tests/functional/snapshot/snapshot_017_pos.ksh | 202 + .../tests/functional/snapused/Makefile.am | 12 + .../tests/functional/snapused/cleanup.ksh | 34 + .../zfs-tests/tests/functional/snapused/setup.ksh | 36 + .../tests/functional/snapused/snapused.kshlib | 185 + .../tests/functional/snapused/snapused_001_pos.ksh | 91 + .../tests/functional/snapused/snapused_002_pos.ksh | 82 + .../tests/functional/snapused/snapused_003_pos.ksh | 82 + .../tests/functional/snapused/snapused_004_pos.ksh | 95 + .../tests/functional/snapused/snapused_005_pos.ksh | 73 + .../zfs-tests/tests/functional/sparse/Makefile.am | 8 + .../zfs-tests/tests/functional/sparse/cleanup.ksh | 34 + tests/zfs-tests/tests/functional/sparse/setup.ksh | 36 + tests/zfs-tests/tests/functional/sparse/sparse.cfg | 43 + .../tests/functional/sparse/sparse_001_pos.ksh | 80 + tests/zfs-tests/tests/functional/suid/.gitignore | 1 + tests/zfs-tests/tests/functional/suid/Makefile.am | 16 + tests/zfs-tests/tests/functional/suid/cleanup.ksh | 34 + tests/zfs-tests/tests/functional/suid/setup.ksh | 35 + .../tests/functional/suid/suid_write_to_file.c | 133 + .../tests/functional/suid/suid_write_to_none.ksh | 52 + .../tests/functional/suid/suid_write_to_sgid.ksh | 52 + .../tests/functional/suid/suid_write_to_suid.ksh | 52 + .../functional/suid/suid_write_to_suid_sgid.ksh | 52 + .../tests/functional/threadsappend/.gitignore | 1 + .../tests/functional/threadsappend/Makefile.am | 8 + .../tests/functional/threadsappend/cleanup.ksh | 34 + .../tests/functional/threadsappend/setup.ksh | 36 + .../threadsappend/threadsappend_001_pos.ksh | 80 + .../zfs-tests/tests/functional/tmpfile/.gitignore | 5 + .../zfs-tests/tests/functional/tmpfile/Makefile.am | 16 + .../zfs-tests/tests/functional/tmpfile/cleanup.ksh | 34 + tests/zfs-tests/tests/functional/tmpfile/setup.ksh | 42 + .../tests/functional/tmpfile/tmpfile_001_pos.c | 109 + .../tests/functional/tmpfile/tmpfile_002_pos.c | 98 + .../tests/functional/tmpfile/tmpfile_003_pos.c | 68 + .../tests/functional/tmpfile/tmpfile_stat_mode.c | 121 + .../tests/functional/tmpfile/tmpfile_test.c | 53 + tests/zfs-tests/tests/functional/trim/Makefile.am | 12 + .../tests/functional/trim/autotrim_config.ksh | 103 + .../tests/functional/trim/autotrim_integrity.ksh | 87 + .../functional/trim/autotrim_trim_integrity.ksh | 92 + tests/zfs-tests/tests/functional/trim/cleanup.ksh | 48 + tests/zfs-tests/tests/functional/trim/setup.ksh | 43 + tests/zfs-tests/tests/functional/trim/trim.cfg | 33 + 
tests/zfs-tests/tests/functional/trim/trim.kshlib | 122 + .../tests/functional/trim/trim_config.ksh | 102 + .../tests/functional/trim/trim_integrity.ksh | 88 + .../zfs-tests/tests/functional/trim/trim_l2arc.ksh | 106 + .../zfs-tests/tests/functional/truncate/.gitignore | 1 + .../tests/functional/truncate/Makefile.am | 18 + .../tests/functional/truncate/cleanup.ksh | 30 + .../zfs-tests/tests/functional/truncate/setup.ksh | 32 + .../tests/functional/truncate/truncate.cfg | 39 + .../tests/functional/truncate/truncate_001_pos.ksh | 79 + .../tests/functional/truncate/truncate_002_pos.ksh | 67 + .../tests/functional/truncate/truncate_test.c | 103 + .../functional/truncate/truncate_timestamps.ksh | 84 + .../zfs-tests/tests/functional/upgrade/Makefile.am | 10 + .../zfs-tests/tests/functional/upgrade/cleanup.ksh | 42 + tests/zfs-tests/tests/functional/upgrade/setup.ksh | 43 + .../tests/functional/upgrade/upgrade_common.kshlib | 41 + .../upgrade/upgrade_projectquota_001_pos.ksh | 128 + .../functional/upgrade/upgrade_readonly_pool.ksh | 66 + .../functional/upgrade/upgrade_userobj_001_pos.ksh | 96 + .../tests/functional/user_namespace/Makefile.am | 9 + .../tests/functional/user_namespace/cleanup.ksh | 25 + .../tests/functional/user_namespace/setup.ksh | 32 + .../functional/user_namespace/user_namespace.cfg | 23 + .../user_namespace/user_namespace_001.ksh | 89 + .../user_namespace/user_namespace_common.kshlib | 23 + .../tests/functional/userquota/Makefile.am | 27 + .../tests/functional/userquota/cleanup.ksh | 41 + .../functional/userquota/groupspace_001_pos.ksh | 79 + .../functional/userquota/groupspace_002_pos.ksh | 79 + .../functional/userquota/groupspace_003_pos.ksh | 109 + .../zfs-tests/tests/functional/userquota/setup.ksh | 56 + .../tests/functional/userquota/userquota.cfg | 46 + .../functional/userquota/userquota_001_pos.ksh | 74 + .../functional/userquota/userquota_002_pos.ksh | 89 + .../functional/userquota/userquota_003_pos.ksh | 61 + .../functional/userquota/userquota_004_pos.ksh | 85 + .../functional/userquota/userquota_005_neg.ksh | 94 + .../functional/userquota/userquota_006_pos.ksh | 79 + .../functional/userquota/userquota_007_pos.ksh | 75 + .../functional/userquota/userquota_008_pos.ksh | 60 + .../functional/userquota/userquota_009_pos.ksh | 92 + .../functional/userquota/userquota_010_pos.ksh | 75 + .../functional/userquota/userquota_011_pos.ksh | 127 + .../functional/userquota/userquota_012_neg.ksh | 66 + .../functional/userquota/userquota_013_pos.ksh | 77 + .../functional/userquota/userquota_common.kshlib | 123 + .../functional/userquota/userspace_001_pos.ksh | 78 + .../functional/userquota/userspace_002_pos.ksh | 81 + .../functional/userquota/userspace_003_pos.ksh | 122 + .../tests/functional/vdev_zaps/Makefile.am | 14 + .../tests/functional/vdev_zaps/cleanup.ksh | 20 + .../zfs-tests/tests/functional/vdev_zaps/setup.ksh | 21 + .../tests/functional/vdev_zaps/vdev_zaps.kshlib | 114 + .../functional/vdev_zaps/vdev_zaps_001_pos.ksh | 42 + .../functional/vdev_zaps/vdev_zaps_002_pos.ksh | 44 + .../functional/vdev_zaps/vdev_zaps_003_pos.ksh | 47 + .../functional/vdev_zaps/vdev_zaps_004_pos.ksh | 94 + .../functional/vdev_zaps/vdev_zaps_005_pos.ksh | 63 + .../functional/vdev_zaps/vdev_zaps_006_pos.ksh | 46 + .../functional/vdev_zaps/vdev_zaps_007_pos.ksh | 74 + .../tests/functional/write_dirs/Makefile.am | 6 + .../tests/functional/write_dirs/cleanup.ksh | 37 + .../tests/functional/write_dirs/setup.ksh | 38 + .../functional/write_dirs/write_dirs_001_pos.ksh | 75 + 
.../functional/write_dirs/write_dirs_002_pos.ksh | 76 + tests/zfs-tests/tests/functional/xattr/Makefile.am | 21 + tests/zfs-tests/tests/functional/xattr/cleanup.ksh | 46 + tests/zfs-tests/tests/functional/xattr/setup.ksh | 55 + tests/zfs-tests/tests/functional/xattr/xattr.cfg | 30 + .../tests/functional/xattr/xattr_001_pos.ksh | 69 + .../tests/functional/xattr/xattr_002_neg.ksh | 62 + .../tests/functional/xattr/xattr_003_neg.ksh | 76 + .../tests/functional/xattr/xattr_004_pos.ksh | 143 + .../tests/functional/xattr/xattr_005_pos.ksh | 78 + .../tests/functional/xattr/xattr_006_pos.ksh | 63 + .../tests/functional/xattr/xattr_007_neg.ksh | 89 + .../tests/functional/xattr/xattr_008_pos.ksh | 80 + .../tests/functional/xattr/xattr_009_neg.ksh | 62 + .../tests/functional/xattr/xattr_010_neg.ksh | 66 + .../tests/functional/xattr/xattr_011_pos.ksh | 241 + .../tests/functional/xattr/xattr_012_pos.ksh | 122 + .../tests/functional/xattr/xattr_013_pos.ksh | 104 + .../tests/functional/xattr/xattr_common.kshlib | 137 + tests/zfs-tests/tests/functional/zvol/Makefile.am | 10 + tests/zfs-tests/tests/functional/zvol/zvol.cfg | 39 + .../tests/functional/zvol/zvol_ENOSPC/Makefile.am | 8 + .../tests/functional/zvol/zvol_ENOSPC/cleanup.ksh | 44 + .../tests/functional/zvol/zvol_ENOSPC/setup.ksh | 50 + .../functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg | 41 + .../zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh | 80 + .../tests/functional/zvol/zvol_cli/Makefile.am | 10 + .../tests/functional/zvol/zvol_cli/cleanup.ksh | 39 + .../tests/functional/zvol/zvol_cli/setup.ksh | 45 + .../tests/functional/zvol/zvol_cli/zvol_cli.cfg | 41 + .../functional/zvol/zvol_cli/zvol_cli_001_pos.ksh | 63 + .../functional/zvol/zvol_cli/zvol_cli_002_pos.ksh | 62 + .../functional/zvol/zvol_cli/zvol_cli_003_neg.ksh | 59 + .../tests/functional/zvol/zvol_common.shlib | 134 + .../tests/functional/zvol/zvol_misc/Makefile.am | 19 + .../tests/functional/zvol/zvol_misc/cleanup.ksh | 41 + .../tests/functional/zvol/zvol_misc/setup.ksh | 41 + .../zvol/zvol_misc/zvol_misc_001_neg.ksh | 72 + .../zvol/zvol_misc/zvol_misc_002_pos.ksh | 117 + .../zvol/zvol_misc/zvol_misc_003_neg.ksh | 81 + .../zvol/zvol_misc/zvol_misc_004_pos.ksh | 117 + .../zvol/zvol_misc/zvol_misc_005_neg.ksh | 83 + .../zvol/zvol_misc/zvol_misc_006_pos.ksh | 80 + .../zvol/zvol_misc/zvol_misc_common.kshlib | 143 + .../zvol/zvol_misc/zvol_misc_hierarchy.ksh | 93 + .../zvol/zvol_misc/zvol_misc_rename_inuse.ksh | 71 + .../zvol/zvol_misc/zvol_misc_snapdev.ksh | 121 + .../zvol/zvol_misc/zvol_misc_volmode.ksh | 242 + .../functional/zvol/zvol_misc/zvol_misc_zil.ksh | 74 + .../tests/functional/zvol/zvol_swap/Makefile.am | 13 + .../tests/functional/zvol/zvol_swap/cleanup.ksh | 60 + .../tests/functional/zvol/zvol_swap/setup.ksh | 49 + .../tests/functional/zvol/zvol_swap/zvol_swap.cfg | 44 + .../zvol/zvol_swap/zvol_swap_001_pos.ksh | 77 + .../zvol/zvol_swap/zvol_swap_002_pos.ksh | 76 + .../zvol/zvol_swap/zvol_swap_003_pos.ksh | 96 + .../zvol/zvol_swap/zvol_swap_004_pos.ksh | 76 + .../zvol/zvol_swap/zvol_swap_005_pos.ksh | 69 + .../zvol/zvol_swap/zvol_swap_006_pos.ksh | 108 + tests/zfs-tests/tests/perf/Makefile.am | 9 + tests/zfs-tests/tests/perf/fio/Makefile.am | 10 + tests/zfs-tests/tests/perf/fio/mkfiles.fio | 33 + tests/zfs-tests/tests/perf/fio/random_reads.fio | 32 + .../zfs-tests/tests/perf/fio/random_readwrite.fio | 39 + .../tests/perf/fio/random_readwrite_fixed.fio | 39 + tests/zfs-tests/tests/perf/fio/random_writes.fio | 37 + .../zfs-tests/tests/perf/fio/sequential_reads.fio | 32 + 
 .../tests/perf/fio/sequential_readwrite.fio | 39 +
 .../zfs-tests/tests/perf/fio/sequential_writes.fio | 37 +
 tests/zfs-tests/tests/perf/nfs-sample.cfg | 46 +
 tests/zfs-tests/tests/perf/perf.shlib | 575 +
 tests/zfs-tests/tests/perf/regression/Makefile.am | 13 +
 .../tests/perf/regression/random_reads.ksh | 114 +
 .../tests/perf/regression/random_readwrite.ksh | 114 +
 .../perf/regression/random_readwrite_fixed.ksh | 106 +
 .../tests/perf/regression/random_writes.ksh | 105 +
 .../tests/perf/regression/random_writes_zil.ksh | 100 +
 .../tests/perf/regression/sequential_reads.ksh | 116 +
 .../regression/sequential_reads_arc_cached.ksh | 106 +
 .../sequential_reads_arc_cached_clone.ksh | 132 +
 .../regression/sequential_reads_dbuf_cached.ksh | 112 +
 .../tests/perf/regression/sequential_writes.ksh | 105 +
 tests/zfs-tests/tests/perf/regression/setup.ksh | 23 +
 tests/zfs-tests/tests/perf/scripts/Makefile.am | 2 +
 tests/zfs-tests/tests/perf/scripts/prefetch_io.sh | 82 +
 tests/zfs-tests/tests/stress/Makefile.am | 1 +
 udev/Makefile.am | 1 +
 udev/rules.d/.gitignore | 4 +
 udev/rules.d/60-zvol.rules.in | 6 +
 udev/rules.d/69-vdev.rules.in | 15 +
 udev/rules.d/90-zfs.rules.in | 12 +
 udev/rules.d/Makefile.am | 8 +
 zfs.release.in | 1 +
 3671 files changed, 770607 insertions(+)

 create mode 100644 AUTHORS
 create mode 100644 CODE_OF_CONDUCT.md
 create mode 100644 COPYRIGHT
 create mode 100644 LICENSE
 create mode 100644 META
 create mode 100644 Makefile.am
 create mode 100644 NEWS
 create mode 100644 NOTICE
 create mode 100644 README.md
 create mode 100644 TEST
 create mode 100755 autogen.sh
 create mode 100644 cmd/Makefile.am
 create mode 100644 cmd/arc_summary/.gitignore
 create mode 100644 cmd/arc_summary/Makefile.am
 create mode 100755 cmd/arc_summary/arc_summary2
 create mode 100755 cmd/arc_summary/arc_summary3
 create mode 100644 cmd/arcstat/.gitignore
 create mode 100644 cmd/arcstat/Makefile.am
 create mode 100755 cmd/arcstat/arcstat.in
 create mode 100644 cmd/dbufstat/.gitignore
 create mode 100644 cmd/dbufstat/Makefile.am
 create mode 100755 cmd/dbufstat/dbufstat.in
 create mode 100644 cmd/fsck_zfs/Makefile.am
 create mode 100755 cmd/fsck_zfs/fsck.zfs
 create mode 100644 cmd/mount_zfs/.gitignore
 create mode 100644 cmd/mount_zfs/Makefile.am
 create mode 100644 cmd/mount_zfs/mount_zfs.c
 create mode 100644 cmd/raidz_test/.gitignore
 create mode 100644 cmd/raidz_test/Makefile.am
 create mode 100644 cmd/raidz_test/raidz_bench.c
 create mode 100644 cmd/raidz_test/raidz_test.c
 create mode 100644 cmd/raidz_test/raidz_test.h
 create mode 100644 cmd/vdev_id/Makefile.am
 create mode 100755 cmd/vdev_id/vdev_id
 create mode 100644 cmd/zdb/.gitignore
 create mode 100644 cmd/zdb/Makefile.am
 create mode 100644 cmd/zdb/zdb.c
 create mode 100644 cmd/zdb/zdb.h
 create mode 100644 cmd/zdb/zdb_il.c
 create mode 100644 cmd/zed/.gitignore
 create mode 100644 cmd/zed/Makefile.am
 create mode 100644 cmd/zed/agents/README.md
 create mode 100644 cmd/zed/agents/fmd_api.c
 create mode 100644 cmd/zed/agents/fmd_api.h
 create mode 100644 cmd/zed/agents/fmd_serd.c
 create mode 100644 cmd/zed/agents/fmd_serd.h
 create mode 100644 cmd/zed/agents/zfs_agents.c
 create mode 100644 cmd/zed/agents/zfs_agents.h
 create mode 100644 cmd/zed/agents/zfs_diagnosis.c
 create mode 100644 cmd/zed/agents/zfs_mod.c
 create mode 100644 cmd/zed/agents/zfs_retire.c
 create mode 100644 cmd/zed/zed.c
 create mode 100644 cmd/zed/zed.d/.gitignore
 create mode 100644 cmd/zed/zed.d/Makefile.am
 create mode 100644 cmd/zed/zed.d/README
 create mode 100755 cmd/zed/zed.d/all-debug.sh
 create mode 100755 cmd/zed/zed.d/all-syslog.sh
 create mode 100755 cmd/zed/zed.d/data-notify.sh
 create mode 100755 cmd/zed/zed.d/generic-notify.sh
 create mode 100755 cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in
 create mode 120000 cmd/zed/zed.d/pool_import-led.sh
 create mode 120000 cmd/zed/zed.d/resilver_finish-notify.sh
 create mode 100755 cmd/zed/zed.d/resilver_finish-start-scrub.sh
 create mode 100755 cmd/zed/zed.d/scrub_finish-notify.sh
 create mode 100755 cmd/zed/zed.d/statechange-led.sh
 create mode 100755 cmd/zed/zed.d/statechange-notify.sh
 create mode 100755 cmd/zed/zed.d/trim_finish-notify.sh
 create mode 120000 cmd/zed/zed.d/vdev_attach-led.sh
 create mode 120000 cmd/zed/zed.d/vdev_clear-led.sh
 create mode 100755 cmd/zed/zed.d/zed-functions.sh
 create mode 100644 cmd/zed/zed.d/zed.rc
 create mode 100644 cmd/zed/zed.h
 create mode 100644 cmd/zed/zed_conf.c
 create mode 100644 cmd/zed/zed_conf.h
 create mode 100644 cmd/zed/zed_disk_event.c
 create mode 100644 cmd/zed/zed_disk_event.h
 create mode 100644 cmd/zed/zed_event.c
 create mode 100644 cmd/zed/zed_event.h
 create mode 100644 cmd/zed/zed_exec.c
 create mode 100644 cmd/zed/zed_exec.h
 create mode 100644 cmd/zed/zed_file.c
 create mode 100644 cmd/zed/zed_file.h
 create mode 100644 cmd/zed/zed_log.c
 create mode 100644 cmd/zed/zed_log.h
 create mode 100644 cmd/zed/zed_strings.c
 create mode 100644 cmd/zed/zed_strings.h
 create mode 100644 cmd/zfs/.gitignore
 create mode 100644 cmd/zfs/Makefile.am
 create mode 100644 cmd/zfs/zfs_iter.c
 create mode 100644 cmd/zfs/zfs_iter.h
 create mode 100644 cmd/zfs/zfs_main.c
 create mode 100644 cmd/zfs/zfs_project.c
 create mode 100644 cmd/zfs/zfs_projectutil.h
 create mode 100644 cmd/zfs/zfs_util.h
 create mode 100644 cmd/zfs_ids_to_path/.gitignore
 create mode 100644 cmd/zfs_ids_to_path/Makefile.am
 create mode 100644 cmd/zfs_ids_to_path/zfs_ids_to_path.c
 create mode 100644 cmd/zgenhostid/Makefile.am
 create mode 100755 cmd/zgenhostid/zgenhostid
 create mode 100644 cmd/zhack/.gitignore
 create mode 100644 cmd/zhack/Makefile.am
 create mode 100644 cmd/zhack/zhack.c
 create mode 100644 cmd/zinject/.gitignore
 create mode 100644 cmd/zinject/Makefile.am
 create mode 100644 cmd/zinject/translate.c
 create mode 100644 cmd/zinject/zinject.c
 create mode 100644 cmd/zinject/zinject.h
 create mode 100644 cmd/zpool/.gitignore
 create mode 100644 cmd/zpool/Makefile.am
 create mode 100644 cmd/zpool/os/freebsd/zpool_vdev_os.c
 create mode 100644 cmd/zpool/os/linux/zpool_vdev_os.c
 create mode 100644 cmd/zpool/zpool.d/README
 create mode 120000 cmd/zpool/zpool.d/ata_err
 create mode 120000 cmd/zpool/zpool.d/cmd_to
 create mode 120000 cmd/zpool/zpool.d/defect
 create mode 100755 cmd/zpool/zpool.d/dm-deps
 create mode 120000 cmd/zpool/zpool.d/enc
 create mode 120000 cmd/zpool/zpool.d/encdev
 create mode 120000 cmd/zpool/zpool.d/fault_led
 create mode 120000 cmd/zpool/zpool.d/health
 create mode 120000 cmd/zpool/zpool.d/hours_on
 create mode 100755 cmd/zpool/zpool.d/iostat
 create mode 120000 cmd/zpool/zpool.d/iostat-10s
 create mode 120000 cmd/zpool/zpool.d/iostat-1s
 create mode 120000 cmd/zpool/zpool.d/label
 create mode 120000 cmd/zpool/zpool.d/locate_led
 create mode 100755 cmd/zpool/zpool.d/lsblk
 create mode 100755 cmd/zpool/zpool.d/media
 create mode 120000 cmd/zpool/zpool.d/model
 create mode 120000 cmd/zpool/zpool.d/nonmed
 create mode 120000 cmd/zpool/zpool.d/nvme_err
 create mode 120000 cmd/zpool/zpool.d/off_ucor
 create mode 120000 cmd/zpool/zpool.d/pend_sec
 create mode 120000 cmd/zpool/zpool.d/pwr_cyc
 create mode 120000 cmd/zpool/zpool.d/r_proc
 create mode 120000 cmd/zpool/zpool.d/r_ucor
 create mode 120000 cmd/zpool/zpool.d/realloc
 create mode 120000 cmd/zpool/zpool.d/rep_ucor
 create mode 120000 cmd/zpool/zpool.d/serial
 create mode 100755 cmd/zpool/zpool.d/ses
 create mode 120000 cmd/zpool/zpool.d/size
 create mode 120000 cmd/zpool/zpool.d/slot
 create mode 100755 cmd/zpool/zpool.d/smart
 create mode 120000 cmd/zpool/zpool.d/smart_test
 create mode 120000 cmd/zpool/zpool.d/smartx
 create mode 120000 cmd/zpool/zpool.d/temp
 create mode 120000 cmd/zpool/zpool.d/test_ended
 create mode 120000 cmd/zpool/zpool.d/test_progress
 create mode 120000 cmd/zpool/zpool.d/test_status
 create mode 120000 cmd/zpool/zpool.d/test_type
 create mode 100755 cmd/zpool/zpool.d/upath
 create mode 120000 cmd/zpool/zpool.d/vendor
 create mode 120000 cmd/zpool/zpool.d/w_proc
 create mode 120000 cmd/zpool/zpool.d/w_ucor
 create mode 100644 cmd/zpool/zpool_iter.c
 create mode 100644 cmd/zpool/zpool_main.c
 create mode 100644 cmd/zpool/zpool_util.c
 create mode 100644 cmd/zpool/zpool_util.h
 create mode 100644 cmd/zpool/zpool_vdev.c
 create mode 100644 cmd/zstream/.gitignore
 create mode 100644 cmd/zstream/Makefile.am
 create mode 100644 cmd/zstream/zstream.c
 create mode 100644 cmd/zstream/zstream.h
 create mode 100644 cmd/zstream/zstream_dump.c
 create mode 100644 cmd/zstream/zstream_redup.c
 create mode 100644 cmd/zstream/zstream_token.c
 create mode 100644 cmd/zstreamdump/Makefile.am
 create mode 100755 cmd/zstreamdump/zstreamdump
 create mode 100644 cmd/ztest/.gitignore
 create mode 100644 cmd/ztest/Makefile.am
 create mode 100644 cmd/ztest/ztest.c
 create mode 100644 cmd/zvol_id/.gitignore
 create mode 100644 cmd/zvol_id/Makefile.am
 create mode 100644 cmd/zvol_id/zvol_id_main.c
 create mode 100644 cmd/zvol_wait/Makefile.am
 create mode 100755 cmd/zvol_wait/zvol_wait
 create mode 100644 config/.gitignore
 create mode 100644 config/Rules.am
 create mode 100644 config/Substfiles.am
 create mode 100644 config/always-arch.m4
 create mode 100644 config/always-compiler-options.m4
 create mode 100644 config/always-python.m4
 create mode 100644 config/always-pyzfs.m4
 create mode 100644 config/always-sed.m4
 create mode 100644 config/always-system.m4
 create mode 100644 config/ax_code_coverage.m4
 create mode 100644 config/ax_python_devel.m4
 create mode 100644 config/ax_restore_flags.m4
 create mode 100644 config/ax_save_flags.m4
 create mode 100644 config/config.awk
 create mode 100755 config/config.rpath
 create mode 100644 config/deb.am
 create mode 100644 config/find_system_library.m4
 create mode 100644 config/gettext.m4
 create mode 100644 config/host-cpu-c-abi.m4
 create mode 100644 config/iconv.m4
 create mode 100644 config/intlmacosx.m4
 create mode 100644 config/kernel-access-ok-type.m4
 create mode 100644 config/kernel-acl.m4
 create mode 100644 config/kernel-aio-fsync.m4
 create mode 100644 config/kernel-automount.m4
 create mode 100644 config/kernel-bdi.m4
 create mode 100644 config/kernel-bio.m4
 create mode 100644 config/kernel-blk-queue.m4
 create mode 100644 config/kernel-blkdev.m4
 create mode 100644 config/kernel-block-device-operations.m4
 create mode 100644 config/kernel-clear-inode.m4
 create mode 100644 config/kernel-commit-metadata.m4
 create mode 100644 config/kernel-config-defined.m4
 create mode 100644 config/kernel-current-time.m4
 create mode 100644 config/kernel-declare-event-class.m4
 create mode 100644 config/kernel-dentry-operations.m4
 create mode 100644 config/kernel-dirty-inode.m4
 create mode 100644 config/kernel-discard-granularity.m4
 create mode 100644 config/kernel-encode-fh-inode.m4
 create mode 100644 config/kernel-evict-inode.m4
 create mode 100644 config/kernel-fallocate.m4
 create mode 100644 config/kernel-file-dentry.m4
 create mode 100644 config/kernel-file-inode.m4
 create mode 100644 config/kernel-fmode-t.m4
 create mode 100644 config/kernel-follow-down-one.m4
 create mode 100644 config/kernel-fpu.m4
 create mode 100644 config/kernel-fst-mount.m4
 create mode 100644 config/kernel-fsync.m4
 create mode 100644 config/kernel-generic_io_acct.m4
 create mode 100644 config/kernel-generic_readlink.m4
 create mode 100644 config/kernel-get-disk-and-module.m4
 create mode 100644 config/kernel-get-disk-ro.m4
 create mode 100644 config/kernel-get-link.m4
 create mode 100644 config/kernel-global_page_state.m4
 create mode 100644 config/kernel-group-info.m4
 create mode 100644 config/kernel-in-compat-syscall.m4
 create mode 100644 config/kernel-inode-create.m4
 create mode 100644 config/kernel-inode-getattr.m4
 create mode 100644 config/kernel-inode-lock.m4
 create mode 100644 config/kernel-inode-lookup.m4
 create mode 100644 config/kernel-inode-set-flags.m4
 create mode 100644 config/kernel-inode-set-iversion.m4
 create mode 100644 config/kernel-inode-times.m4
 create mode 100644 config/kernel-insert-inode-locked.m4
 create mode 100644 config/kernel-is_owner_or_cap.m4
 create mode 100644 config/kernel-kmap-atomic-args.m4
 create mode 100644 config/kernel-kmem-cache.m4
 create mode 100644 config/kernel-kmem.m4
 create mode 100644 config/kernel-kstrtoul.m4
 create mode 100644 config/kernel-ktime.m4
 create mode 100644 config/kernel-kuid-helpers.m4
 create mode 100644 config/kernel-kuidgid.m4
 create mode 100644 config/kernel-lseek-execute.m4
 create mode 100644 config/kernel-make-request-fn.m4
 create mode 100644 config/kernel-misc-minor.m4
 create mode 100644 config/kernel-mkdir-umode-t.m4
 create mode 100644 config/kernel-mod-param.m4
 create mode 100644 config/kernel-objtool.m4
 create mode 100644 config/kernel-pde-data.m4
 create mode 100644 config/kernel-percpu.m4
 create mode 100644 config/kernel-proc-operations.m4
 create mode 100644 config/kernel-put-link.m4
 create mode 100644 config/kernel-rename.m4
 create mode 100644 config/kernel-rw.m4
 create mode 100644 config/kernel-rwsem.m4
 create mode 100644 config/kernel-sched.m4
 create mode 100644 config/kernel-security-inode-init.m4
 create mode 100644 config/kernel-set-nlink.m4
 create mode 100644 config/kernel-setattr-prepare.m4
 create mode 100644 config/kernel-sget-args.m4
 create mode 100644 config/kernel-show-options.m4
 create mode 100644 config/kernel-shrink.m4
 create mode 100644 config/kernel-super-userns.m4
 create mode 100644 config/kernel-timer.m4
 create mode 100644 config/kernel-tmpfile.m4
 create mode 100644 config/kernel-totalhigh_pages.m4
 create mode 100644 config/kernel-totalram-pages-func.m4
 create mode 100644 config/kernel-truncate-setsize.m4
 create mode 100644 config/kernel-userns-capabilities.m4
 create mode 100644 config/kernel-usleep_range.m4
 create mode 100644 config/kernel-vfs-direct_IO.m4
 create mode 100644 config/kernel-vfs-fsync.m4
 create mode 100644 config/kernel-vfs-getattr.m4
 create mode 100644 config/kernel-vfs-iterate.m4
 create mode 100644 config/kernel-vfs-rw-iterate.m4
 create mode 100644 config/kernel-wait.m4
 create mode 100644 config/kernel-xattr-handler.m4
 create mode 100644 config/kernel-zlib.m4
 create mode 100644 config/kernel.m4
 create mode 100644 config/lib-ld.m4
 create mode 100644 config/lib-link.m4
 create mode 100644 config/lib-prefix.m4
 create mode 100644 config/mount-helper.m4
 create mode 100644 config/nls.m4
 create mode 100644 config/pkg.m4
 create
mode 100644 config/po.m4 create mode 100644 config/progtest.m4 create mode 100644 config/rpm.am create mode 100644 config/tgz.am create mode 100644 config/toolchain-simd.m4 create mode 100644 config/user-clock_gettime.m4 create mode 100644 config/user-dracut.m4 create mode 100644 config/user-gettext.m4 create mode 100644 config/user-libaio.m4 create mode 100644 config/user-libblkid.m4 create mode 100644 config/user-libcrypto.m4 create mode 100644 config/user-libexec.m4 create mode 100644 config/user-libtirpc.m4 create mode 100644 config/user-libudev.m4 create mode 100644 config/user-libuuid.m4 create mode 100644 config/user-makedev.m4 create mode 100644 config/user-pam.m4 create mode 100644 config/user-runstatedir.m4 create mode 100644 config/user-systemd.m4 create mode 100644 config/user-sysvinit.m4 create mode 100644 config/user-udev.m4 create mode 100644 config/user-zlib.m4 create mode 100644 config/user.m4 create mode 100644 config/zfs-build.m4 create mode 100644 config/zfs-meta.m4 create mode 100644 configure.ac create mode 100644 contrib/Makefile.am create mode 100644 contrib/bash_completion.d/Makefile.am create mode 100644 contrib/bash_completion.d/zfs create mode 100644 contrib/bpftrace/Makefile.am create mode 100644 contrib/bpftrace/taskqlatency.bt create mode 100755 contrib/bpftrace/zfs-trace.sh create mode 100644 contrib/dracut/02zfsexpandknowledge/.gitignore create mode 100644 contrib/dracut/02zfsexpandknowledge/Makefile.am create mode 100755 contrib/dracut/02zfsexpandknowledge/module-setup.sh.in create mode 100644 contrib/dracut/90zfs/.gitignore create mode 100644 contrib/dracut/90zfs/Makefile.am create mode 100755 contrib/dracut/90zfs/export-zfs.sh.in create mode 100755 contrib/dracut/90zfs/module-setup.sh.in create mode 100755 contrib/dracut/90zfs/mount-zfs.sh.in create mode 100755 contrib/dracut/90zfs/parse-zfs.sh.in create mode 100644 contrib/dracut/90zfs/zfs-env-bootfs.service.in create mode 100755 contrib/dracut/90zfs/zfs-generator.sh.in create mode 100755 contrib/dracut/90zfs/zfs-lib.sh.in create mode 100755 contrib/dracut/90zfs/zfs-load-key.sh.in create mode 100755 contrib/dracut/90zfs/zfs-needshutdown.sh.in create mode 100644 contrib/dracut/90zfs/zfs-rollback-bootfs.service.in create mode 100644 contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in create mode 100644 contrib/dracut/Makefile.am create mode 100644 contrib/dracut/README.dracut.markdown create mode 100644 contrib/initramfs/Makefile.am create mode 100644 contrib/initramfs/README.initramfs.markdown create mode 100644 contrib/initramfs/conf-hooks.d/Makefile.am create mode 100644 contrib/initramfs/conf-hooks.d/zfs create mode 100644 contrib/initramfs/conf.d/Makefile.am create mode 100644 contrib/initramfs/conf.d/zfs create mode 100644 contrib/initramfs/hooks/.gitignore create mode 100644 contrib/initramfs/hooks/Makefile.am create mode 100755 contrib/initramfs/hooks/zfs.in create mode 100644 contrib/initramfs/hooks/zfsunlock.in create mode 100644 contrib/initramfs/scripts/.gitignore create mode 100644 contrib/initramfs/scripts/Makefile.am create mode 100644 contrib/initramfs/scripts/local-top/Makefile.am create mode 100755 contrib/initramfs/scripts/local-top/zfs create mode 100644 contrib/initramfs/scripts/zfs create mode 100755 contrib/initramfs/zfsunlock create mode 100644 contrib/pam_zfs_key/Makefile.am create mode 100644 contrib/pam_zfs_key/pam_zfs_key.c create mode 100644 contrib/pam_zfs_key/zfs_key create mode 100644 contrib/pyzfs/.gitignore create mode 100644 contrib/pyzfs/LICENSE create mode 100644 
contrib/pyzfs/Makefile.am create mode 100644 contrib/pyzfs/README create mode 100644 contrib/pyzfs/docs/source/conf.py create mode 100644 contrib/pyzfs/docs/source/index.rst create mode 100644 contrib/pyzfs/libzfs_core/__init__.py create mode 100644 contrib/pyzfs/libzfs_core/_constants.py create mode 100644 contrib/pyzfs/libzfs_core/_error_translation.py create mode 100644 contrib/pyzfs/libzfs_core/_libzfs_core.py create mode 100644 contrib/pyzfs/libzfs_core/_nvlist.py create mode 100644 contrib/pyzfs/libzfs_core/bindings/__init__.py create mode 100644 contrib/pyzfs/libzfs_core/bindings/libnvpair.py create mode 100644 contrib/pyzfs/libzfs_core/bindings/libzfs_core.py create mode 100644 contrib/pyzfs/libzfs_core/ctypes.py create mode 100644 contrib/pyzfs/libzfs_core/exceptions.py create mode 100644 contrib/pyzfs/libzfs_core/test/__init__.py create mode 100644 contrib/pyzfs/libzfs_core/test/test_libzfs_core.py create mode 100644 contrib/pyzfs/libzfs_core/test/test_nvlist.py create mode 100644 contrib/pyzfs/setup.py.in create mode 100644 contrib/zcp/Makefile.am create mode 100644 contrib/zcp/autosnap.lua create mode 100755 copy-builtin create mode 100644 cppcheck-suppressions.txt create mode 100644 etc/Makefile.am create mode 100644 etc/default/.gitignore create mode 100644 etc/default/Makefile.am create mode 100644 etc/default/zfs.in create mode 100644 etc/init.d/.gitignore create mode 100644 etc/init.d/Makefile.am create mode 100644 etc/init.d/README.md create mode 100755 etc/init.d/zfs-import.in create mode 100755 etc/init.d/zfs-mount.in create mode 100755 etc/init.d/zfs-share.in create mode 100755 etc/init.d/zfs-zed.in create mode 100644 etc/modules-load.d/.gitignore create mode 100644 etc/modules-load.d/Makefile.am create mode 100644 etc/modules-load.d/zfs.conf create mode 100644 etc/sudoers.d/Makefile.am create mode 100644 etc/sudoers.d/zfs create mode 100644 etc/systemd/Makefile.am create mode 100644 etc/systemd/system-generators/.gitignore create mode 100644 etc/systemd/system-generators/Makefile.am create mode 100755 etc/systemd/system-generators/zfs-mount-generator.in create mode 100644 etc/systemd/system/.gitignore create mode 100644 etc/systemd/system/50-zfs.preset.in create mode 100644 etc/systemd/system/Makefile.am create mode 100644 etc/systemd/system/zfs-import-cache.service.in create mode 100644 etc/systemd/system/zfs-import-scan.service.in create mode 100644 etc/systemd/system/zfs-import.target.in create mode 100644 etc/systemd/system/zfs-mount.service.in create mode 100644 etc/systemd/system/zfs-share.service.in create mode 100644 etc/systemd/system/zfs-volume-wait.service.in create mode 100644 etc/systemd/system/zfs-volumes.target.in create mode 100644 etc/systemd/system/zfs-zed.service.in create mode 100644 etc/systemd/system/zfs.target.in create mode 100644 etc/zfs/.gitignore create mode 100644 etc/zfs/Makefile.am create mode 100644 etc/zfs/vdev_id.conf.alias.example create mode 100644 etc/zfs/vdev_id.conf.multipath.example create mode 100644 etc/zfs/vdev_id.conf.sas_direct.example create mode 100644 etc/zfs/vdev_id.conf.sas_switch.example create mode 100644 etc/zfs/vdev_id.conf.scsi.example create mode 100644 etc/zfs/zfs-functions.in create mode 100644 include/.gitignore create mode 100644 include/Makefile.am create mode 100644 include/cityhash.h create mode 100644 include/libnvpair.h create mode 100644 include/libuutil.h create mode 100644 include/libuutil_common.h create mode 100644 include/libuutil_impl.h create mode 100644 include/libzfs.h create mode 100644 
include/libzfs_core.h create mode 100644 include/libzfs_impl.h create mode 100644 include/libzutil.h create mode 100644 include/os/Makefile.am create mode 100644 include/os/freebsd/Makefile.am create mode 100644 include/os/freebsd/linux/Makefile.am create mode 100644 include/os/freebsd/linux/compiler.h create mode 100644 include/os/freebsd/linux/types.h create mode 100644 include/os/freebsd/spl/Makefile.am create mode 100644 include/os/freebsd/spl/acl/Makefile.am create mode 100644 include/os/freebsd/spl/acl/acl_common.h create mode 100644 include/os/freebsd/spl/rpc/Makefile.am create mode 100644 include/os/freebsd/spl/rpc/xdr.h create mode 100644 include/os/freebsd/spl/sys/Makefile.am create mode 100644 include/os/freebsd/spl/sys/acl.h create mode 100644 include/os/freebsd/spl/sys/acl_impl.h create mode 100644 include/os/freebsd/spl/sys/atomic.h create mode 100644 include/os/freebsd/spl/sys/byteorder.h create mode 100644 include/os/freebsd/spl/sys/callb.h create mode 100644 include/os/freebsd/spl/sys/ccompat.h create mode 100644 include/os/freebsd/spl/sys/ccompile.h create mode 100644 include/os/freebsd/spl/sys/cmn_err.h create mode 100644 include/os/freebsd/spl/sys/condvar.h create mode 100644 include/os/freebsd/spl/sys/console.h create mode 100644 include/os/freebsd/spl/sys/cred.h create mode 100644 include/os/freebsd/spl/sys/ctype.h create mode 100644 include/os/freebsd/spl/sys/debug.h create mode 100644 include/os/freebsd/spl/sys/dirent.h create mode 100644 include/os/freebsd/spl/sys/disp.h create mode 100644 include/os/freebsd/spl/sys/dkio.h create mode 100644 include/os/freebsd/spl/sys/extdirent.h create mode 100644 include/os/freebsd/spl/sys/file.h create mode 100644 include/os/freebsd/spl/sys/freebsd_rwlock.h create mode 100644 include/os/freebsd/spl/sys/idmap.h create mode 100644 include/os/freebsd/spl/sys/inttypes.h create mode 100644 include/os/freebsd/spl/sys/isa_defs.h create mode 100644 include/os/freebsd/spl/sys/kidmap.h create mode 100644 include/os/freebsd/spl/sys/kmem.h create mode 100644 include/os/freebsd/spl/sys/kmem_cache.h create mode 100644 include/os/freebsd/spl/sys/kstat.h create mode 100644 include/os/freebsd/spl/sys/list.h create mode 100644 include/os/freebsd/spl/sys/list_impl.h create mode 100644 include/os/freebsd/spl/sys/lock.h create mode 100644 include/os/freebsd/spl/sys/misc.h create mode 100644 include/os/freebsd/spl/sys/mod_os.h create mode 100644 include/os/freebsd/spl/sys/mode.h create mode 100644 include/os/freebsd/spl/sys/mount.h create mode 100644 include/os/freebsd/spl/sys/mutex.h create mode 100644 include/os/freebsd/spl/sys/param.h create mode 100644 include/os/freebsd/spl/sys/policy.h create mode 100644 include/os/freebsd/spl/sys/proc.h create mode 100644 include/os/freebsd/spl/sys/processor.h create mode 100644 include/os/freebsd/spl/sys/procfs_list.h create mode 100644 include/os/freebsd/spl/sys/random.h create mode 100644 include/os/freebsd/spl/sys/rwlock.h create mode 100644 include/os/freebsd/spl/sys/sdt.h create mode 100644 include/os/freebsd/spl/sys/sid.h create mode 100644 include/os/freebsd/spl/sys/sig.h create mode 100644 include/os/freebsd/spl/sys/simd.h create mode 100644 include/os/freebsd/spl/sys/simd_x86.h create mode 100644 include/os/freebsd/spl/sys/spl_condvar.h create mode 100644 include/os/freebsd/spl/sys/string.h create mode 100644 include/os/freebsd/spl/sys/strings.h create mode 100644 include/os/freebsd/spl/sys/sunddi.h create mode 100644 include/os/freebsd/spl/sys/sysmacros.h create mode 100644 
include/os/freebsd/spl/sys/systeminfo.h create mode 100644 include/os/freebsd/spl/sys/systm.h create mode 100644 include/os/freebsd/spl/sys/taskq.h create mode 100644 include/os/freebsd/spl/sys/thread.h create mode 100644 include/os/freebsd/spl/sys/time.h create mode 100644 include/os/freebsd/spl/sys/timer.h create mode 100644 include/os/freebsd/spl/sys/trace.h create mode 100644 include/os/freebsd/spl/sys/trace_zfs.h create mode 100644 include/os/freebsd/spl/sys/types.h create mode 100644 include/os/freebsd/spl/sys/types32.h create mode 100644 include/os/freebsd/spl/sys/uio.h create mode 100644 include/os/freebsd/spl/sys/uuid.h create mode 100644 include/os/freebsd/spl/sys/vfs.h create mode 100644 include/os/freebsd/spl/sys/vm.h create mode 100644 include/os/freebsd/spl/sys/vmsystm.h create mode 100644 include/os/freebsd/spl/sys/vnode.h create mode 100644 include/os/freebsd/spl/sys/vnode_impl.h create mode 100644 include/os/freebsd/spl/sys/zmod.h create mode 100644 include/os/freebsd/spl/sys/zone.h create mode 100644 include/os/freebsd/zfs/Makefile.am create mode 100644 include/os/freebsd/zfs/sys/Makefile.am create mode 100644 include/os/freebsd/zfs/sys/freebsd_crypto.h create mode 100644 include/os/freebsd/zfs/sys/sha2.h create mode 100644 include/os/freebsd/zfs/sys/vdev_os.h create mode 100644 include/os/freebsd/zfs/sys/zfs_context_os.h create mode 100644 include/os/freebsd/zfs/sys/zfs_ctldir.h create mode 100644 include/os/freebsd/zfs/sys/zfs_dir.h create mode 100644 include/os/freebsd/zfs/sys/zfs_ioctl_compat.h create mode 100644 include/os/freebsd/zfs/sys/zfs_vfsops.h create mode 100644 include/os/freebsd/zfs/sys/zfs_vnops.h create mode 100644 include/os/freebsd/zfs/sys/zfs_znode_impl.h create mode 100644 include/os/freebsd/zfs/sys/zpl.h create mode 100644 include/os/linux/Makefile.am create mode 100644 include/os/linux/kernel/Makefile.am create mode 100644 include/os/linux/kernel/linux/Makefile.am create mode 100644 include/os/linux/kernel/linux/blkdev_compat.h create mode 100644 include/os/linux/kernel/linux/compiler_compat.h create mode 100644 include/os/linux/kernel/linux/dcache_compat.h create mode 100644 include/os/linux/kernel/linux/kmap_compat.h create mode 100644 include/os/linux/kernel/linux/mod_compat.h create mode 100644 include/os/linux/kernel/linux/page_compat.h create mode 100644 include/os/linux/kernel/linux/percpu_compat.h create mode 100644 include/os/linux/kernel/linux/simd.h create mode 100644 include/os/linux/kernel/linux/simd_aarch64.h create mode 100644 include/os/linux/kernel/linux/simd_powerpc.h create mode 100644 include/os/linux/kernel/linux/simd_x86.h create mode 100644 include/os/linux/kernel/linux/utsname_compat.h create mode 100644 include/os/linux/kernel/linux/vfs_compat.h create mode 100644 include/os/linux/kernel/linux/xattr_compat.h create mode 100644 include/os/linux/spl/Makefile.am create mode 100644 include/os/linux/spl/rpc/Makefile.am create mode 100644 include/os/linux/spl/rpc/xdr.h create mode 100644 include/os/linux/spl/sys/Makefile.am create mode 100644 include/os/linux/spl/sys/acl.h create mode 100644 include/os/linux/spl/sys/atomic.h create mode 100644 include/os/linux/spl/sys/byteorder.h create mode 100644 include/os/linux/spl/sys/callb.h create mode 100644 include/os/linux/spl/sys/callo.h create mode 100644 include/os/linux/spl/sys/cmn_err.h create mode 100644 include/os/linux/spl/sys/condvar.h create mode 100644 include/os/linux/spl/sys/console.h create mode 100644 include/os/linux/spl/sys/cred.h create mode 100644 
include/os/linux/spl/sys/ctype.h create mode 100644 include/os/linux/spl/sys/debug.h create mode 100644 include/os/linux/spl/sys/disp.h create mode 100644 include/os/linux/spl/sys/dkio.h create mode 100644 include/os/linux/spl/sys/errno.h create mode 100644 include/os/linux/spl/sys/fcntl.h create mode 100644 include/os/linux/spl/sys/file.h create mode 100644 include/os/linux/spl/sys/inttypes.h create mode 100644 include/os/linux/spl/sys/isa_defs.h create mode 100644 include/os/linux/spl/sys/kmem.h create mode 100644 include/os/linux/spl/sys/kmem_cache.h create mode 100644 include/os/linux/spl/sys/kstat.h create mode 100644 include/os/linux/spl/sys/list.h create mode 100644 include/os/linux/spl/sys/mod_os.h create mode 100644 include/os/linux/spl/sys/mutex.h create mode 100644 include/os/linux/spl/sys/param.h create mode 100644 include/os/linux/spl/sys/proc.h create mode 100644 include/os/linux/spl/sys/processor.h create mode 100644 include/os/linux/spl/sys/procfs_list.h create mode 100644 include/os/linux/spl/sys/random.h create mode 100644 include/os/linux/spl/sys/rwlock.h create mode 100644 include/os/linux/spl/sys/shrinker.h create mode 100644 include/os/linux/spl/sys/sid.h create mode 100644 include/os/linux/spl/sys/signal.h create mode 100644 include/os/linux/spl/sys/simd.h create mode 100644 include/os/linux/spl/sys/stat.h create mode 100644 include/os/linux/spl/sys/strings.h create mode 100644 include/os/linux/spl/sys/sunddi.h create mode 100644 include/os/linux/spl/sys/sysmacros.h create mode 100644 include/os/linux/spl/sys/systeminfo.h create mode 100644 include/os/linux/spl/sys/taskq.h create mode 100644 include/os/linux/spl/sys/thread.h create mode 100644 include/os/linux/spl/sys/time.h create mode 100644 include/os/linux/spl/sys/timer.h create mode 100644 include/os/linux/spl/sys/trace.h create mode 100644 include/os/linux/spl/sys/trace_spl.h create mode 100644 include/os/linux/spl/sys/trace_taskq.h create mode 100644 include/os/linux/spl/sys/tsd.h create mode 100644 include/os/linux/spl/sys/types.h create mode 100644 include/os/linux/spl/sys/types32.h create mode 100644 include/os/linux/spl/sys/uio.h create mode 100644 include/os/linux/spl/sys/user.h create mode 100644 include/os/linux/spl/sys/vfs.h create mode 100644 include/os/linux/spl/sys/vmem.h create mode 100644 include/os/linux/spl/sys/vmsystm.h create mode 100644 include/os/linux/spl/sys/vnode.h create mode 100644 include/os/linux/spl/sys/wait.h create mode 100644 include/os/linux/spl/sys/zmod.h create mode 100644 include/os/linux/spl/sys/zone.h create mode 100644 include/os/linux/zfs/Makefile.am create mode 100644 include/os/linux/zfs/sys/Makefile.am create mode 100644 include/os/linux/zfs/sys/policy.h create mode 100644 include/os/linux/zfs/sys/sha2.h create mode 100644 include/os/linux/zfs/sys/trace_acl.h create mode 100644 include/os/linux/zfs/sys/trace_arc.h create mode 100644 include/os/linux/zfs/sys/trace_common.h create mode 100644 include/os/linux/zfs/sys/trace_dbgmsg.h create mode 100644 include/os/linux/zfs/sys/trace_dbuf.h create mode 100644 include/os/linux/zfs/sys/trace_dmu.h create mode 100644 include/os/linux/zfs/sys/trace_dnode.h create mode 100644 include/os/linux/zfs/sys/trace_multilist.h create mode 100644 include/os/linux/zfs/sys/trace_rrwlock.h create mode 100644 include/os/linux/zfs/sys/trace_txg.h create mode 100644 include/os/linux/zfs/sys/trace_vdev.h create mode 100644 include/os/linux/zfs/sys/trace_zfs.h create mode 100644 include/os/linux/zfs/sys/trace_zil.h create mode 100644 
include/os/linux/zfs/sys/trace_zio.h create mode 100644 include/os/linux/zfs/sys/trace_zrlock.h create mode 100644 include/os/linux/zfs/sys/zfs_context_os.h create mode 100644 include/os/linux/zfs/sys/zfs_ctldir.h create mode 100644 include/os/linux/zfs/sys/zfs_dir.h create mode 100644 include/os/linux/zfs/sys/zfs_vfsops.h create mode 100644 include/os/linux/zfs/sys/zfs_vnops.h create mode 100644 include/os/linux/zfs/sys/zfs_znode_impl.h create mode 100644 include/os/linux/zfs/sys/zpl.h create mode 100644 include/sys/Makefile.am create mode 100644 include/sys/abd.h create mode 100644 include/sys/abd_impl.h create mode 100644 include/sys/aggsum.h create mode 100644 include/sys/arc.h create mode 100644 include/sys/arc_impl.h create mode 100644 include/sys/avl.h create mode 100644 include/sys/avl_impl.h create mode 100644 include/sys/bitops.h create mode 100644 include/sys/blkptr.h create mode 100644 include/sys/bplist.h create mode 100644 include/sys/bpobj.h create mode 100644 include/sys/bptree.h create mode 100644 include/sys/bqueue.h create mode 100644 include/sys/btree.h create mode 100644 include/sys/crypto/Makefile.am create mode 100644 include/sys/crypto/api.h create mode 100644 include/sys/crypto/common.h create mode 100644 include/sys/crypto/icp.h create mode 100644 include/sys/dataset_kstats.h create mode 100644 include/sys/dbuf.h create mode 100644 include/sys/ddt.h create mode 100644 include/sys/dmu.h create mode 100644 include/sys/dmu_impl.h create mode 100644 include/sys/dmu_objset.h create mode 100644 include/sys/dmu_recv.h create mode 100644 include/sys/dmu_redact.h create mode 100644 include/sys/dmu_send.h create mode 100644 include/sys/dmu_traverse.h create mode 100644 include/sys/dmu_tx.h create mode 100644 include/sys/dmu_zfetch.h create mode 100644 include/sys/dnode.h create mode 100644 include/sys/dsl_bookmark.h create mode 100644 include/sys/dsl_crypt.h create mode 100644 include/sys/dsl_dataset.h create mode 100644 include/sys/dsl_deadlist.h create mode 100644 include/sys/dsl_deleg.h create mode 100644 include/sys/dsl_destroy.h create mode 100644 include/sys/dsl_dir.h create mode 100644 include/sys/dsl_pool.h create mode 100644 include/sys/dsl_prop.h create mode 100644 include/sys/dsl_scan.h create mode 100644 include/sys/dsl_synctask.h create mode 100644 include/sys/dsl_userhold.h create mode 100644 include/sys/edonr.h create mode 100644 include/sys/efi_partition.h create mode 100644 include/sys/fm/Makefile.am create mode 100644 include/sys/fm/fs/Makefile.am create mode 100644 include/sys/fm/fs/zfs.h create mode 100644 include/sys/fm/protocol.h create mode 100644 include/sys/fm/util.h create mode 100644 include/sys/frame.h create mode 100644 include/sys/fs/Makefile.am create mode 100644 include/sys/fs/zfs.h create mode 100644 include/sys/hkdf.h create mode 100644 include/sys/lua/Makefile.am create mode 100644 include/sys/lua/lauxlib.h create mode 100644 include/sys/lua/lua.h create mode 100644 include/sys/lua/luaconf.h create mode 100644 include/sys/lua/lualib.h create mode 100644 include/sys/metaslab.h create mode 100644 include/sys/metaslab_impl.h create mode 100644 include/sys/mmp.h create mode 100644 include/sys/mntent.h create mode 100644 include/sys/mod.h create mode 100644 include/sys/multilist.h create mode 100644 include/sys/note.h create mode 100644 include/sys/nvpair.h create mode 100644 include/sys/nvpair_impl.h create mode 100644 include/sys/objlist.h create mode 100644 include/sys/pathname.h create mode 100644 include/sys/qat.h create mode 100644 
include/sys/range_tree.h create mode 100644 include/sys/rrwlock.h create mode 100644 include/sys/sa.h create mode 100644 include/sys/sa_impl.h create mode 100644 include/sys/skein.h create mode 100644 include/sys/spa.h create mode 100644 include/sys/spa_boot.h create mode 100644 include/sys/spa_checkpoint.h create mode 100644 include/sys/spa_checksum.h create mode 100644 include/sys/spa_impl.h create mode 100644 include/sys/spa_log_spacemap.h create mode 100644 include/sys/space_map.h create mode 100644 include/sys/space_reftree.h create mode 100644 include/sys/sysevent.h create mode 100644 include/sys/sysevent/Makefile.am create mode 100644 include/sys/sysevent/dev.h create mode 100644 include/sys/sysevent/eventdefs.h create mode 100644 include/sys/txg.h create mode 100644 include/sys/txg_impl.h create mode 100644 include/sys/u8_textprep.h create mode 100644 include/sys/u8_textprep_data.h create mode 100644 include/sys/uberblock.h create mode 100644 include/sys/uberblock_impl.h create mode 100644 include/sys/uio_impl.h create mode 100644 include/sys/unique.h create mode 100644 include/sys/uuid.h create mode 100644 include/sys/vdev.h create mode 100644 include/sys/vdev_disk.h create mode 100644 include/sys/vdev_file.h create mode 100644 include/sys/vdev_impl.h create mode 100644 include/sys/vdev_indirect_births.h create mode 100644 include/sys/vdev_indirect_mapping.h create mode 100644 include/sys/vdev_initialize.h create mode 100644 include/sys/vdev_raidz.h create mode 100644 include/sys/vdev_raidz_impl.h create mode 100644 include/sys/vdev_rebuild.h create mode 100644 include/sys/vdev_removal.h create mode 100644 include/sys/vdev_trim.h create mode 100644 include/sys/xvattr.h create mode 100644 include/sys/zap.h create mode 100644 include/sys/zap_impl.h create mode 100644 include/sys/zap_leaf.h create mode 100644 include/sys/zcp.h create mode 100644 include/sys/zcp_global.h create mode 100644 include/sys/zcp_iter.h create mode 100644 include/sys/zcp_prop.h create mode 100644 include/sys/zcp_set.h create mode 100644 include/sys/zfeature.h create mode 100644 include/sys/zfs_acl.h create mode 100644 include/sys/zfs_context.h create mode 100644 include/sys/zfs_debug.h create mode 100644 include/sys/zfs_delay.h create mode 100644 include/sys/zfs_file.h create mode 100644 include/sys/zfs_fuid.h create mode 100644 include/sys/zfs_ioctl.h create mode 100644 include/sys/zfs_ioctl_impl.h create mode 100644 include/sys/zfs_onexit.h create mode 100644 include/sys/zfs_project.h create mode 100644 include/sys/zfs_quota.h create mode 100644 include/sys/zfs_ratelimit.h create mode 100644 include/sys/zfs_refcount.h create mode 100644 include/sys/zfs_rlock.h create mode 100644 include/sys/zfs_sa.h create mode 100644 include/sys/zfs_stat.h create mode 100644 include/sys/zfs_sysfs.h create mode 100644 include/sys/zfs_znode.h create mode 100644 include/sys/zil.h create mode 100644 include/sys/zil_impl.h create mode 100644 include/sys/zio.h create mode 100644 include/sys/zio_checksum.h create mode 100644 include/sys/zio_compress.h create mode 100644 include/sys/zio_crypt.h create mode 100644 include/sys/zio_impl.h create mode 100644 include/sys/zio_priority.h create mode 100644 include/sys/zrlock.h create mode 100644 include/sys/zstd/Makefile.am create mode 100644 include/sys/zstd/zstd.h create mode 100644 include/sys/zthr.h create mode 100644 include/sys/zvol.h create mode 100644 include/sys/zvol_impl.h create mode 100644 include/thread_pool.h create mode 100644 include/zfeature_common.h create mode 100644 
include/zfs_comutil.h create mode 100644 include/zfs_deleg.h create mode 100644 include/zfs_fletcher.h create mode 100644 include/zfs_namecheck.h create mode 100644 include/zfs_prop.h create mode 100644 lib/Makefile.am create mode 100644 lib/libavl/Makefile.am create mode 100644 lib/libefi/Makefile.am create mode 100644 lib/libefi/rdwr_efi.c create mode 100644 lib/libicp/Makefile.am create mode 100644 lib/libnvpair/Makefile.am create mode 100644 lib/libnvpair/libnvpair.c create mode 100644 lib/libnvpair/libnvpair_json.c create mode 100644 lib/libnvpair/nvpair_alloc_system.c create mode 100644 lib/libshare/Makefile.am create mode 100644 lib/libshare/libshare.c create mode 100644 lib/libshare/libshare_impl.h create mode 100644 lib/libshare/nfs.h create mode 100644 lib/libshare/os/freebsd/nfs.c create mode 100644 lib/libshare/os/freebsd/smb.c create mode 100644 lib/libshare/os/linux/nfs.c create mode 100644 lib/libshare/os/linux/smb.c create mode 100644 lib/libshare/smb.h create mode 100644 lib/libspl/Makefile.am create mode 100644 lib/libspl/asm-generic/.gitignore create mode 100644 lib/libspl/asm-generic/atomic.c create mode 100644 lib/libspl/asm-i386/atomic.S create mode 100644 lib/libspl/asm-x86_64/atomic.S create mode 100644 lib/libspl/assert.c create mode 100644 lib/libspl/include/Makefile.am create mode 100644 lib/libspl/include/assert.h create mode 100644 lib/libspl/include/atomic.h create mode 100644 lib/libspl/include/ia32/Makefile.am create mode 100644 lib/libspl/include/ia32/sys/Makefile.am create mode 100644 lib/libspl/include/ia32/sys/asm_linkage.h create mode 100644 lib/libspl/include/libdevinfo.h create mode 100644 lib/libspl/include/libgen.h create mode 100644 lib/libspl/include/libshare.h create mode 100644 lib/libspl/include/limits.h create mode 100644 lib/libspl/include/locale.h create mode 100644 lib/libspl/include/os/Makefile.am create mode 100644 lib/libspl/include/os/freebsd/Makefile.am create mode 100644 lib/libspl/include/os/freebsd/sys/Makefile.am create mode 100644 lib/libspl/include/os/freebsd/sys/byteorder.h create mode 100644 lib/libspl/include/os/freebsd/sys/file.h create mode 100644 lib/libspl/include/os/freebsd/sys/mnttab.h create mode 100644 lib/libspl/include/os/freebsd/sys/mount.h create mode 100644 lib/libspl/include/os/freebsd/sys/param.h create mode 100644 lib/libspl/include/os/freebsd/sys/stat.h create mode 100644 lib/libspl/include/os/freebsd/sys/sysmacros.h create mode 100644 lib/libspl/include/os/freebsd/sys/vfs.h create mode 100644 lib/libspl/include/os/freebsd/sys/zfs_context_os.h create mode 100644 lib/libspl/include/os/linux/Makefile.am create mode 100644 lib/libspl/include/os/linux/sys/Makefile.am create mode 100644 lib/libspl/include/os/linux/sys/byteorder.h create mode 100644 lib/libspl/include/os/linux/sys/errno.h create mode 100644 lib/libspl/include/os/linux/sys/mnttab.h create mode 100644 lib/libspl/include/os/linux/sys/mount.h create mode 100644 lib/libspl/include/os/linux/sys/param.h create mode 100644 lib/libspl/include/os/linux/sys/stat.h create mode 100644 lib/libspl/include/os/linux/sys/sysmacros.h create mode 100644 lib/libspl/include/os/linux/sys/zfs_context_os.h create mode 100644 lib/libspl/include/rpc/Makefile.am create mode 100644 lib/libspl/include/rpc/xdr.h create mode 100644 lib/libspl/include/statcommon.h create mode 100644 lib/libspl/include/stdio.h create mode 100644 lib/libspl/include/stdlib.h create mode 100644 lib/libspl/include/string.h create mode 100644 lib/libspl/include/stropts.h create mode 100644 
lib/libspl/include/sys/Makefile.am create mode 100644 lib/libspl/include/sys/acl.h create mode 100644 lib/libspl/include/sys/acl_impl.h create mode 100644 lib/libspl/include/sys/callb.h create mode 100644 lib/libspl/include/sys/cmn_err.h create mode 100644 lib/libspl/include/sys/cred.h create mode 100644 lib/libspl/include/sys/debug.h create mode 100644 lib/libspl/include/sys/dkio.h create mode 100644 lib/libspl/include/sys/dklabel.h create mode 100644 lib/libspl/include/sys/dktp/Makefile.am create mode 100644 lib/libspl/include/sys/dktp/fdisk.h create mode 100644 lib/libspl/include/sys/feature_tests.h create mode 100644 lib/libspl/include/sys/int_limits.h create mode 100644 lib/libspl/include/sys/int_types.h create mode 100644 lib/libspl/include/sys/inttypes.h create mode 100644 lib/libspl/include/sys/isa_defs.h create mode 100644 lib/libspl/include/sys/kmem.h create mode 100644 lib/libspl/include/sys/kstat.h create mode 100644 lib/libspl/include/sys/list.h create mode 100644 lib/libspl/include/sys/list_impl.h create mode 100644 lib/libspl/include/sys/mhd.h create mode 100644 lib/libspl/include/sys/mkdev.h create mode 100644 lib/libspl/include/sys/policy.h create mode 100644 lib/libspl/include/sys/poll.h create mode 100644 lib/libspl/include/sys/priv.h create mode 100644 lib/libspl/include/sys/processor.h create mode 100644 lib/libspl/include/sys/sha2.h create mode 100644 lib/libspl/include/sys/simd.h create mode 100644 lib/libspl/include/sys/stack.h create mode 100644 lib/libspl/include/sys/stdtypes.h create mode 100644 lib/libspl/include/sys/strings.h create mode 100644 lib/libspl/include/sys/stropts.h create mode 100644 lib/libspl/include/sys/sunddi.h create mode 100644 lib/libspl/include/sys/systeminfo.h create mode 100644 lib/libspl/include/sys/time.h create mode 100644 lib/libspl/include/sys/trace_spl.h create mode 100644 lib/libspl/include/sys/trace_zfs.h create mode 100644 lib/libspl/include/sys/types.h create mode 100644 lib/libspl/include/sys/types32.h create mode 100644 lib/libspl/include/sys/tzfile.h create mode 100644 lib/libspl/include/sys/uio.h create mode 100644 lib/libspl/include/sys/va_list.h create mode 100644 lib/libspl/include/sys/varargs.h create mode 100644 lib/libspl/include/sys/vnode.h create mode 100644 lib/libspl/include/sys/vtoc.h create mode 100644 lib/libspl/include/sys/zone.h create mode 100644 lib/libspl/include/thread.h create mode 100644 lib/libspl/include/tzfile.h create mode 100644 lib/libspl/include/ucred.h create mode 100644 lib/libspl/include/umem.h create mode 100644 lib/libspl/include/unistd.h create mode 100644 lib/libspl/include/util/Makefile.am create mode 100644 lib/libspl/include/util/sscanf.h create mode 100644 lib/libspl/include/zone.h create mode 100644 lib/libspl/list.c create mode 100644 lib/libspl/mkdirp.c create mode 100644 lib/libspl/os/freebsd/getexecname.c create mode 100644 lib/libspl/os/freebsd/gethostid.c create mode 100644 lib/libspl/os/freebsd/getmntany.c create mode 100644 lib/libspl/os/freebsd/mnttab.c create mode 100644 lib/libspl/os/linux/getexecname.c create mode 100644 lib/libspl/os/linux/gethostid.c create mode 100644 lib/libspl/os/linux/getmntany.c create mode 100644 lib/libspl/page.c create mode 100644 lib/libspl/strlcat.c create mode 100644 lib/libspl/strlcpy.c create mode 100644 lib/libspl/timestamp.c create mode 100644 lib/libspl/zone.c create mode 100644 lib/libtpool/Makefile.am create mode 100644 lib/libtpool/thread_pool.c create mode 100644 lib/libtpool/thread_pool_impl.h create mode 100644 
lib/libunicode/Makefile.am create mode 100644 lib/libuutil/Makefile.am create mode 100644 lib/libuutil/uu_alloc.c create mode 100644 lib/libuutil/uu_avl.c create mode 100644 lib/libuutil/uu_dprintf.c create mode 100644 lib/libuutil/uu_ident.c create mode 100644 lib/libuutil/uu_list.c create mode 100644 lib/libuutil/uu_misc.c create mode 100644 lib/libuutil/uu_open.c create mode 100644 lib/libuutil/uu_pname.c create mode 100644 lib/libuutil/uu_string.c create mode 100644 lib/libzfs/.gitignore create mode 100644 lib/libzfs/Makefile.am create mode 100644 lib/libzfs/THIRDPARTYLICENSE.openssl create mode 100644 lib/libzfs/THIRDPARTYLICENSE.openssl.descrip create mode 100644 lib/libzfs/libzfs.pc.in create mode 100644 lib/libzfs/libzfs_changelist.c create mode 100644 lib/libzfs/libzfs_config.c create mode 100644 lib/libzfs/libzfs_crypto.c create mode 100644 lib/libzfs/libzfs_dataset.c create mode 100644 lib/libzfs/libzfs_diff.c create mode 100644 lib/libzfs/libzfs_import.c create mode 100644 lib/libzfs/libzfs_iter.c create mode 100644 lib/libzfs/libzfs_mount.c create mode 100644 lib/libzfs/libzfs_pool.c create mode 100644 lib/libzfs/libzfs_sendrecv.c create mode 100644 lib/libzfs/libzfs_status.c create mode 100644 lib/libzfs/libzfs_util.c create mode 100644 lib/libzfs/os/freebsd/libzfs_compat.c create mode 100644 lib/libzfs/os/freebsd/libzfs_ioctl_compat.c create mode 100644 lib/libzfs/os/freebsd/libzfs_zmount.c create mode 100644 lib/libzfs/os/linux/libzfs_mount_os.c create mode 100644 lib/libzfs/os/linux/libzfs_pool_os.c create mode 100644 lib/libzfs/os/linux/libzfs_sendrecv_os.c create mode 100644 lib/libzfs/os/linux/libzfs_util_os.c create mode 100644 lib/libzfs_core/.gitignore create mode 100644 lib/libzfs_core/Makefile.am create mode 100644 lib/libzfs_core/libzfs_core.c create mode 100644 lib/libzfs_core/libzfs_core.pc.in create mode 100644 lib/libzpool/Makefile.am create mode 100644 lib/libzpool/kernel.c create mode 100644 lib/libzpool/taskq.c create mode 100644 lib/libzpool/util.c create mode 100644 lib/libzstd/Makefile.am create mode 100644 lib/libzutil/Makefile.am create mode 100644 lib/libzutil/os/freebsd/zutil_compat.c create mode 100644 lib/libzutil/os/freebsd/zutil_device_path_os.c create mode 100644 lib/libzutil/os/freebsd/zutil_import_os.c create mode 100644 lib/libzutil/os/linux/zutil_compat.c create mode 100644 lib/libzutil/os/linux/zutil_device_path_os.c create mode 100644 lib/libzutil/os/linux/zutil_import_os.c create mode 100644 lib/libzutil/zutil_device_path.c create mode 100644 lib/libzutil/zutil_import.c create mode 100644 lib/libzutil/zutil_import.h create mode 100644 lib/libzutil/zutil_nicenum.c create mode 100644 lib/libzutil/zutil_pool.c create mode 100644 man/Makefile.am create mode 100644 man/man1/Makefile.am create mode 100644 man/man1/arcstat.1 create mode 100644 man/man1/cstyle.1 create mode 100644 man/man1/raidz_test.1 create mode 100644 man/man1/zhack.1 create mode 100644 man/man1/ztest.1 create mode 100644 man/man1/zvol_wait.1 create mode 100644 man/man5/Makefile.am create mode 100644 man/man5/spl-module-parameters.5 create mode 100644 man/man5/vdev_id.conf.5 create mode 100644 man/man5/zfs-events.5 create mode 100644 man/man5/zfs-module-parameters.5 create mode 100644 man/man5/zpool-features.5 create mode 100644 man/man8/.gitignore create mode 100644 man/man8/Makefile.am create mode 100644 man/man8/fsck.zfs.8 create mode 100644 man/man8/mount.zfs.8 create mode 100644 man/man8/vdev_id.8 create mode 100644 man/man8/zdb.8 create mode 100644 man/man8/zed.8.in 
create mode 100644 man/man8/zfs-allow.8 create mode 100644 man/man8/zfs-bookmark.8 create mode 120000 man/man8/zfs-change-key.8 create mode 100644 man/man8/zfs-clone.8 create mode 100644 man/man8/zfs-create.8 create mode 100644 man/man8/zfs-destroy.8 create mode 100644 man/man8/zfs-diff.8 create mode 120000 man/man8/zfs-get.8 create mode 120000 man/man8/zfs-groupspace.8 create mode 100644 man/man8/zfs-hold.8 create mode 120000 man/man8/zfs-inherit.8 create mode 100644 man/man8/zfs-jail.8 create mode 100644 man/man8/zfs-list.8 create mode 100644 man/man8/zfs-load-key.8 create mode 100644 man/man8/zfs-mount-generator.8.in create mode 100644 man/man8/zfs-mount.8 create mode 100644 man/man8/zfs-program.8 create mode 100644 man/man8/zfs-project.8 create mode 120000 man/man8/zfs-projectspace.8 create mode 100644 man/man8/zfs-promote.8 create mode 100644 man/man8/zfs-receive.8 create mode 120000 man/man8/zfs-recv.8 create mode 120000 man/man8/zfs-redact.8 create mode 120000 man/man8/zfs-release.8 create mode 100644 man/man8/zfs-rename.8 create mode 100644 man/man8/zfs-rollback.8 create mode 100644 man/man8/zfs-send.8 create mode 100644 man/man8/zfs-set.8 create mode 100644 man/man8/zfs-share.8 create mode 100644 man/man8/zfs-snapshot.8 create mode 120000 man/man8/zfs-unallow.8 create mode 120000 man/man8/zfs-unjail.8 create mode 120000 man/man8/zfs-unload-key.8 create mode 120000 man/man8/zfs-unmount.8 create mode 100644 man/man8/zfs-upgrade.8 create mode 100644 man/man8/zfs-userspace.8 create mode 100644 man/man8/zfs-wait.8 create mode 100644 man/man8/zfs.8 create mode 100644 man/man8/zfs_ids_to_path.8 create mode 100644 man/man8/zfsconcepts.8 create mode 100644 man/man8/zfsprops.8 create mode 100644 man/man8/zgenhostid.8 create mode 100644 man/man8/zinject.8 create mode 100644 man/man8/zpool-add.8 create mode 100644 man/man8/zpool-attach.8 create mode 100644 man/man8/zpool-checkpoint.8 create mode 100644 man/man8/zpool-clear.8 create mode 100644 man/man8/zpool-create.8 create mode 100644 man/man8/zpool-destroy.8 create mode 100644 man/man8/zpool-detach.8 create mode 100644 man/man8/zpool-events.8 create mode 100644 man/man8/zpool-export.8 create mode 100644 man/man8/zpool-get.8 create mode 100644 man/man8/zpool-history.8 create mode 100644 man/man8/zpool-import.8 create mode 100644 man/man8/zpool-initialize.8 create mode 100644 man/man8/zpool-iostat.8 create mode 100644 man/man8/zpool-labelclear.8 create mode 100644 man/man8/zpool-list.8 create mode 100644 man/man8/zpool-offline.8 create mode 120000 man/man8/zpool-online.8 create mode 100644 man/man8/zpool-reguid.8 create mode 100644 man/man8/zpool-remove.8 create mode 100644 man/man8/zpool-reopen.8 create mode 100644 man/man8/zpool-replace.8 create mode 100644 man/man8/zpool-resilver.8 create mode 100644 man/man8/zpool-scrub.8 create mode 120000 man/man8/zpool-set.8 create mode 100644 man/man8/zpool-split.8 create mode 100644 man/man8/zpool-status.8 create mode 100644 man/man8/zpool-sync.8 create mode 100644 man/man8/zpool-trim.8 create mode 100644 man/man8/zpool-upgrade.8 create mode 100644 man/man8/zpool-wait.8 create mode 100644 man/man8/zpool.8 create mode 100644 man/man8/zpoolconcepts.8 create mode 100644 man/man8/zpoolprops.8 create mode 100644 man/man8/zstream.8 create mode 100644 man/man8/zstreamdump.8 create mode 100644 module/.gitignore create mode 100644 module/Kbuild.in create mode 100644 module/Makefile.bsd create mode 100644 module/Makefile.in create mode 100644 module/avl/Makefile.in create mode 100644 module/avl/avl.c create 
mode 100644 module/icp/Makefile.in create mode 100644 module/icp/algs/aes/aes_impl.c create mode 100644 module/icp/algs/aes/aes_impl_aesni.c create mode 100644 module/icp/algs/aes/aes_impl_generic.c create mode 100644 module/icp/algs/aes/aes_impl_x86-64.c create mode 100644 module/icp/algs/aes/aes_modes.c create mode 100644 module/icp/algs/edonr/edonr.c create mode 100644 module/icp/algs/edonr/edonr_byteorder.h create mode 100644 module/icp/algs/modes/cbc.c create mode 100644 module/icp/algs/modes/ccm.c create mode 100644 module/icp/algs/modes/ctr.c create mode 100644 module/icp/algs/modes/ecb.c create mode 100644 module/icp/algs/modes/gcm.c create mode 100644 module/icp/algs/modes/gcm_generic.c create mode 100644 module/icp/algs/modes/gcm_pclmulqdq.c create mode 100644 module/icp/algs/modes/modes.c create mode 100644 module/icp/algs/sha1/sha1.c create mode 100644 module/icp/algs/sha2/sha2.c create mode 100644 module/icp/algs/skein/THIRDPARTYLICENSE create mode 100644 module/icp/algs/skein/THIRDPARTYLICENSE.descrip create mode 100644 module/icp/algs/skein/skein.c create mode 100644 module/icp/algs/skein/skein_block.c create mode 100644 module/icp/algs/skein/skein_impl.h create mode 100644 module/icp/algs/skein/skein_iv.c create mode 100644 module/icp/algs/skein/skein_port.h create mode 100644 module/icp/api/kcf_cipher.c create mode 100644 module/icp/api/kcf_ctxops.c create mode 100644 module/icp/api/kcf_digest.c create mode 100644 module/icp/api/kcf_mac.c create mode 100644 module/icp/api/kcf_miscapi.c create mode 100644 module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman create mode 100644 module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip create mode 100644 module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl create mode 100644 module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip create mode 100644 module/icp/asm-x86_64/aes/aes_aesni.S create mode 100644 module/icp/asm-x86_64/aes/aes_amd64.S create mode 100644 module/icp/asm-x86_64/aes/aeskey.c create mode 100644 module/icp/asm-x86_64/aes/aesopt.h create mode 100644 module/icp/asm-x86_64/aes/aestab.h create mode 100644 module/icp/asm-x86_64/aes/aestab2.h create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl create mode 100644 module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip create mode 100644 module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S create mode 100644 module/icp/asm-x86_64/modes/gcm_pclmulqdq.S create mode 100644 module/icp/asm-x86_64/modes/ghash-x86_64.S create mode 100644 module/icp/asm-x86_64/sha1/sha1-x86_64.S create mode 100644 module/icp/asm-x86_64/sha2/sha256_impl.S create mode 100644 module/icp/asm-x86_64/sha2/sha512_impl.S create mode 100644 module/icp/core/kcf_callprov.c create mode 100644 module/icp/core/kcf_mech_tabs.c create mode 100644 module/icp/core/kcf_prov_lib.c create mode 100644 module/icp/core/kcf_prov_tabs.c create mode 100644 module/icp/core/kcf_sched.c create mode 100644 module/icp/illumos-crypto.c create mode 100644 module/icp/include/aes/aes_impl.h create mode 100644 module/icp/include/modes/gcm_impl.h create mode 100644 module/icp/include/modes/modes.h create mode 100644 module/icp/include/sha1/sha1.h create mode 100644 module/icp/include/sha1/sha1_consts.h create mode 100644 module/icp/include/sha1/sha1_impl.h create mode 100644 module/icp/include/sha2/sha2_consts.h create mode 100644 
module/icp/include/sha2/sha2_impl.h create mode 100644 module/icp/include/sys/asm_linkage.h create mode 100644 module/icp/include/sys/bitmap.h create mode 100644 module/icp/include/sys/crypto/elfsign.h create mode 100644 module/icp/include/sys/crypto/impl.h create mode 100644 module/icp/include/sys/crypto/ioctl.h create mode 100644 module/icp/include/sys/crypto/ioctladmin.h create mode 100644 module/icp/include/sys/crypto/ops_impl.h create mode 100644 module/icp/include/sys/crypto/sched_impl.h create mode 100644 module/icp/include/sys/crypto/spi.h create mode 100644 module/icp/include/sys/ia32/asm_linkage.h create mode 100644 module/icp/include/sys/ia32/stack.h create mode 100644 module/icp/include/sys/ia32/trap.h create mode 100644 module/icp/include/sys/modctl.h create mode 100644 module/icp/include/sys/modhash.h create mode 100644 module/icp/include/sys/modhash_impl.h create mode 100644 module/icp/include/sys/stack.h create mode 100644 module/icp/include/sys/trap.h create mode 100644 module/icp/io/aes.c create mode 100644 module/icp/io/edonr_mod.c create mode 100644 module/icp/io/sha1_mod.c create mode 100644 module/icp/io/sha2_mod.c create mode 100644 module/icp/io/skein_mod.c create mode 100644 module/icp/os/modconf.c create mode 100644 module/icp/os/modhash.c create mode 100644 module/icp/spi/kcf_spi.c create mode 100644 module/lua/Makefile.in create mode 100644 module/lua/README.zfs create mode 100644 module/lua/lapi.c create mode 100644 module/lua/lapi.h create mode 100644 module/lua/lauxlib.c create mode 100644 module/lua/lbaselib.c create mode 100644 module/lua/lcode.c create mode 100644 module/lua/lcode.h create mode 100644 module/lua/lcompat.c create mode 100644 module/lua/lcorolib.c create mode 100644 module/lua/lctype.c create mode 100644 module/lua/lctype.h create mode 100644 module/lua/ldebug.c create mode 100644 module/lua/ldebug.h create mode 100644 module/lua/ldo.c create mode 100644 module/lua/ldo.h create mode 100644 module/lua/lfunc.c create mode 100644 module/lua/lfunc.h create mode 100644 module/lua/lgc.c create mode 100644 module/lua/lgc.h create mode 100644 module/lua/llex.c create mode 100644 module/lua/llex.h create mode 100644 module/lua/llimits.h create mode 100644 module/lua/lmem.c create mode 100644 module/lua/lmem.h create mode 100644 module/lua/lobject.c create mode 100644 module/lua/lobject.h create mode 100644 module/lua/lopcodes.c create mode 100644 module/lua/lopcodes.h create mode 100644 module/lua/lparser.c create mode 100644 module/lua/lparser.h create mode 100644 module/lua/lstate.c create mode 100644 module/lua/lstate.h create mode 100644 module/lua/lstring.c create mode 100644 module/lua/lstring.h create mode 100644 module/lua/lstrlib.c create mode 100644 module/lua/ltable.c create mode 100644 module/lua/ltable.h create mode 100644 module/lua/ltablib.c create mode 100644 module/lua/ltm.c create mode 100644 module/lua/ltm.h create mode 100644 module/lua/lvm.c create mode 100644 module/lua/lvm.h create mode 100644 module/lua/lzio.c create mode 100644 module/lua/lzio.h create mode 100644 module/lua/setjmp/setjmp.S create mode 100644 module/lua/setjmp/setjmp_aarch64.S create mode 100644 module/lua/setjmp/setjmp_arm.S create mode 100644 module/lua/setjmp/setjmp_i386.S create mode 100644 module/lua/setjmp/setjmp_mips.S create mode 100644 module/lua/setjmp/setjmp_ppc.S create mode 100644 module/lua/setjmp/setjmp_rv64g.S create mode 100644 module/lua/setjmp/setjmp_s390x.S create mode 100644 module/lua/setjmp/setjmp_sparc64.S create mode 100644 
module/lua/setjmp/setjmp_x86_64.S create mode 100644 module/nvpair/Makefile.in create mode 100644 module/nvpair/fnvpair.c create mode 100644 module/nvpair/nvpair.c create mode 100644 module/nvpair/nvpair_alloc_fixed.c create mode 100644 module/nvpair/nvpair_alloc_spl.c create mode 100644 module/os/freebsd/spl/acl_common.c create mode 100644 module/os/freebsd/spl/callb.c create mode 100644 module/os/freebsd/spl/list.c create mode 100644 module/os/freebsd/spl/sha224.h create mode 100644 module/os/freebsd/spl/sha256.h create mode 100644 module/os/freebsd/spl/sha256c.c create mode 100644 module/os/freebsd/spl/sha384.h create mode 100644 module/os/freebsd/spl/sha512.h create mode 100644 module/os/freebsd/spl/sha512c.c create mode 100644 module/os/freebsd/spl/sha512t.h create mode 100644 module/os/freebsd/spl/spl_acl.c create mode 100644 module/os/freebsd/spl/spl_atomic.c create mode 100644 module/os/freebsd/spl/spl_cmn_err.c create mode 100644 module/os/freebsd/spl/spl_dtrace.c create mode 100644 module/os/freebsd/spl/spl_kmem.c create mode 100644 module/os/freebsd/spl/spl_kstat.c create mode 100644 module/os/freebsd/spl/spl_misc.c create mode 100644 module/os/freebsd/spl/spl_policy.c create mode 100644 module/os/freebsd/spl/spl_procfs_list.c create mode 100644 module/os/freebsd/spl/spl_string.c create mode 100644 module/os/freebsd/spl/spl_sunddi.c create mode 100644 module/os/freebsd/spl/spl_sysevent.c create mode 100644 module/os/freebsd/spl/spl_taskq.c create mode 100644 module/os/freebsd/spl/spl_uio.c create mode 100644 module/os/freebsd/spl/spl_vfs.c create mode 100644 module/os/freebsd/spl/spl_vm.c create mode 100644 module/os/freebsd/spl/spl_zlib.c create mode 100644 module/os/freebsd/spl/spl_zone.c create mode 100644 module/os/freebsd/zfs/abd_os.c create mode 100644 module/os/freebsd/zfs/arc_os.c create mode 100644 module/os/freebsd/zfs/crypto_os.c create mode 100644 module/os/freebsd/zfs/dmu_os.c create mode 100644 module/os/freebsd/zfs/hkdf.c create mode 100644 module/os/freebsd/zfs/kmod_core.c create mode 100644 module/os/freebsd/zfs/spa_os.c create mode 100644 module/os/freebsd/zfs/spa_stats.c create mode 100644 module/os/freebsd/zfs/sysctl_os.c create mode 100644 module/os/freebsd/zfs/vdev_file.c create mode 100644 module/os/freebsd/zfs/vdev_geom.c create mode 100644 module/os/freebsd/zfs/vdev_label_os.c create mode 100644 module/os/freebsd/zfs/zfs_acl.c create mode 100644 module/os/freebsd/zfs/zfs_ctldir.c create mode 100644 module/os/freebsd/zfs/zfs_debug.c create mode 100644 module/os/freebsd/zfs/zfs_dir.c create mode 100644 module/os/freebsd/zfs/zfs_file_os.c create mode 100644 module/os/freebsd/zfs/zfs_ioctl_compat.c create mode 100644 module/os/freebsd/zfs/zfs_ioctl_os.c create mode 100644 module/os/freebsd/zfs/zfs_onexit_os.c create mode 100644 module/os/freebsd/zfs/zfs_vfsops.c create mode 100644 module/os/freebsd/zfs/zfs_vnops.c create mode 100644 module/os/freebsd/zfs/zfs_znode.c create mode 100644 module/os/freebsd/zfs/zio_crypt.c create mode 100644 module/os/freebsd/zfs/zvol_os.c create mode 100644 module/os/linux/spl/Makefile.in create mode 100644 module/os/linux/spl/README.md create mode 100644 module/os/linux/spl/THIRDPARTYLICENSE.gplv2 create mode 100644 module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip create mode 100644 module/os/linux/spl/spl-atomic.c create mode 100644 module/os/linux/spl/spl-condvar.c create mode 100644 module/os/linux/spl/spl-cred.c create mode 100644 module/os/linux/spl/spl-err.c create mode 100644 module/os/linux/spl/spl-generic.c create 
mode 100644 module/os/linux/spl/spl-kmem-cache.c
 create mode 100644 module/os/linux/spl/spl-kmem.c
 create mode 100644 module/os/linux/spl/spl-kstat.c
 create mode 100644 module/os/linux/spl/spl-proc.c
 create mode 100644 module/os/linux/spl/spl-procfs-list.c
 create mode 100644 module/os/linux/spl/spl-taskq.c
 create mode 100644 module/os/linux/spl/spl-thread.c
 create mode 100644 module/os/linux/spl/spl-trace.c
 create mode 100644 module/os/linux/spl/spl-tsd.c
 create mode 100644 module/os/linux/spl/spl-vmem.c
 create mode 100644 module/os/linux/spl/spl-xdr.c
 create mode 100644 module/os/linux/spl/spl-zlib.c
 create mode 100644 module/os/linux/zfs/Makefile.in
 create mode 100644 module/os/linux/zfs/abd_os.c
 create mode 100644 module/os/linux/zfs/arc_os.c
 create mode 100644 module/os/linux/zfs/mmp_os.c
 create mode 100644 module/os/linux/zfs/policy.c
 create mode 100644 module/os/linux/zfs/qat.c
 create mode 100644 module/os/linux/zfs/qat_compress.c
 create mode 100644 module/os/linux/zfs/qat_crypt.c
 create mode 100644 module/os/linux/zfs/spa_misc_os.c
 create mode 100644 module/os/linux/zfs/spa_stats.c
 create mode 100644 module/os/linux/zfs/trace.c
 create mode 100644 module/os/linux/zfs/vdev_disk.c
 create mode 100644 module/os/linux/zfs/vdev_file.c
 create mode 100644 module/os/linux/zfs/zfs_acl.c
 create mode 100644 module/os/linux/zfs/zfs_ctldir.c
 create mode 100644 module/os/linux/zfs/zfs_debug.c
 create mode 100644 module/os/linux/zfs/zfs_dir.c
 create mode 100644 module/os/linux/zfs/zfs_file_os.c
 create mode 100644 module/os/linux/zfs/zfs_ioctl_os.c
 create mode 100644 module/os/linux/zfs/zfs_sysfs.c
 create mode 100644 module/os/linux/zfs/zfs_vfsops.c
 create mode 100644 module/os/linux/zfs/zfs_vnops.c
 create mode 100644 module/os/linux/zfs/zfs_znode.c
 create mode 100644 module/os/linux/zfs/zio_crypt.c
 create mode 100644 module/os/linux/zfs/zpl_ctldir.c
 create mode 100644 module/os/linux/zfs/zpl_export.c
 create mode 100644 module/os/linux/zfs/zpl_file.c
 create mode 100644 module/os/linux/zfs/zpl_inode.c
 create mode 100644 module/os/linux/zfs/zpl_super.c
 create mode 100644 module/os/linux/zfs/zpl_xattr.c
 create mode 100644 module/os/linux/zfs/zvol_os.c
 create mode 100644 module/spl/Makefile.in
 create mode 100644 module/unicode/Makefile.in
 create mode 100644 module/unicode/u8_textprep.c
 create mode 100644 module/unicode/uconv.c
 create mode 100644 module/zcommon/Makefile.in
 create mode 100644 module/zcommon/cityhash.c
 create mode 100644 module/zcommon/zfeature_common.c
 create mode 100644 module/zcommon/zfs_comutil.c
 create mode 100644 module/zcommon/zfs_deleg.c
 create mode 100644 module/zcommon/zfs_fletcher.c
 create mode 100644 module/zcommon/zfs_fletcher_aarch64_neon.c
 create mode 100644 module/zcommon/zfs_fletcher_avx512.c
 create mode 100644 module/zcommon/zfs_fletcher_intel.c
 create mode 100644 module/zcommon/zfs_fletcher_sse.c
 create mode 100644 module/zcommon/zfs_fletcher_superscalar.c
 create mode 100644 module/zcommon/zfs_fletcher_superscalar4.c
 create mode 100644 module/zcommon/zfs_namecheck.c
 create mode 100644 module/zcommon/zfs_prop.c
 create mode 100644 module/zcommon/zfs_uio.c
 create mode 100644 module/zcommon/zpool_prop.c
 create mode 100644 module/zcommon/zprop_common.c
 create mode 100644 module/zfs/Makefile.in
 create mode 100644 module/zfs/THIRDPARTYLICENSE.cityhash
 create mode 100644 module/zfs/THIRDPARTYLICENSE.cityhash.descrip
 create mode 100644 module/zfs/abd.c
 create mode 100644 module/zfs/aggsum.c
 create mode 100644 module/zfs/arc.c
 create mode 100644 module/zfs/blkptr.c
 create mode 100644 module/zfs/bplist.c
 create mode 100644 module/zfs/bpobj.c
 create mode 100644 module/zfs/bptree.c
 create mode 100644 module/zfs/bqueue.c
 create mode 100644 module/zfs/btree.c
 create mode 100644 module/zfs/dataset_kstats.c
 create mode 100644 module/zfs/dbuf.c
 create mode 100644 module/zfs/dbuf_stats.c
 create mode 100644 module/zfs/ddt.c
 create mode 100644 module/zfs/ddt_zap.c
 create mode 100644 module/zfs/dmu.c
 create mode 100644 module/zfs/dmu_diff.c
 create mode 100644 module/zfs/dmu_object.c
 create mode 100644 module/zfs/dmu_objset.c
 create mode 100644 module/zfs/dmu_recv.c
 create mode 100644 module/zfs/dmu_redact.c
 create mode 100644 module/zfs/dmu_send.c
 create mode 100644 module/zfs/dmu_traverse.c
 create mode 100644 module/zfs/dmu_tx.c
 create mode 100644 module/zfs/dmu_zfetch.c
 create mode 100644 module/zfs/dnode.c
 create mode 100644 module/zfs/dnode_sync.c
 create mode 100644 module/zfs/dsl_bookmark.c
 create mode 100644 module/zfs/dsl_crypt.c
 create mode 100644 module/zfs/dsl_dataset.c
 create mode 100644 module/zfs/dsl_deadlist.c
 create mode 100644 module/zfs/dsl_deleg.c
 create mode 100644 module/zfs/dsl_destroy.c
 create mode 100644 module/zfs/dsl_dir.c
 create mode 100644 module/zfs/dsl_pool.c
 create mode 100644 module/zfs/dsl_prop.c
 create mode 100644 module/zfs/dsl_scan.c
 create mode 100644 module/zfs/dsl_synctask.c
 create mode 100644 module/zfs/dsl_userhold.c
 create mode 100644 module/zfs/edonr_zfs.c
 create mode 100644 module/zfs/fm.c
 create mode 100644 module/zfs/gzip.c
 create mode 100644 module/zfs/hkdf.c
 create mode 100644 module/zfs/lz4.c
 create mode 100644 module/zfs/lzjb.c
 create mode 100644 module/zfs/metaslab.c
 create mode 100644 module/zfs/mmp.c
 create mode 100644 module/zfs/multilist.c
 create mode 100644 module/zfs/objlist.c
 create mode 100644 module/zfs/pathname.c
 create mode 100644 module/zfs/range_tree.c
 create mode 100644 module/zfs/refcount.c
 create mode 100644 module/zfs/rrwlock.c
 create mode 100644 module/zfs/sa.c
 create mode 100644 module/zfs/sha256.c
 create mode 100644 module/zfs/skein_zfs.c
 create mode 100644 module/zfs/spa.c
 create mode 100644 module/zfs/spa_boot.c
 create mode 100644 module/zfs/spa_checkpoint.c
 create mode 100644 module/zfs/spa_config.c
 create mode 100644 module/zfs/spa_errlog.c
 create mode 100644 module/zfs/spa_history.c
 create mode 100644 module/zfs/spa_log_spacemap.c
 create mode 100644 module/zfs/spa_misc.c
 create mode 100644 module/zfs/space_map.c
 create mode 100644 module/zfs/space_reftree.c
 create mode 100644 module/zfs/txg.c
 create mode 100644 module/zfs/uberblock.c
 create mode 100644 module/zfs/unique.c
 create mode 100644 module/zfs/vdev.c
 create mode 100644 module/zfs/vdev_cache.c
 create mode 100644 module/zfs/vdev_indirect.c
 create mode 100644 module/zfs/vdev_indirect_births.c
 create mode 100644 module/zfs/vdev_indirect_mapping.c
 create mode 100644 module/zfs/vdev_initialize.c
 create mode 100644 module/zfs/vdev_label.c
 create mode 100644 module/zfs/vdev_mirror.c
 create mode 100644 module/zfs/vdev_missing.c
 create mode 100644 module/zfs/vdev_queue.c
 create mode 100644 module/zfs/vdev_raidz.c
 create mode 100644 module/zfs/vdev_raidz_math.c
 create mode 100644 module/zfs/vdev_raidz_math_aarch64_neon.c
 create mode 100644 module/zfs/vdev_raidz_math_aarch64_neon_common.h
 create mode 100644 module/zfs/vdev_raidz_math_aarch64_neonx2.c
 create mode 100644 module/zfs/vdev_raidz_math_avx2.c
 create mode 100644 module/zfs/vdev_raidz_math_avx512bw.c
 create mode 100644 module/zfs/vdev_raidz_math_avx512f.c
 create mode 100644 module/zfs/vdev_raidz_math_impl.h
 create mode 100644 module/zfs/vdev_raidz_math_powerpc_altivec.c
 create mode 100644 module/zfs/vdev_raidz_math_powerpc_altivec_common.h
 create mode 100644 module/zfs/vdev_raidz_math_scalar.c
 create mode 100644 module/zfs/vdev_raidz_math_sse2.c
 create mode 100644 module/zfs/vdev_raidz_math_ssse3.c
 create mode 100644 module/zfs/vdev_rebuild.c
 create mode 100644 module/zfs/vdev_removal.c
 create mode 100644 module/zfs/vdev_root.c
 create mode 100644 module/zfs/vdev_trim.c
 create mode 100644 module/zfs/zap.c
 create mode 100644 module/zfs/zap_leaf.c
 create mode 100644 module/zfs/zap_micro.c
 create mode 100644 module/zfs/zcp.c
 create mode 100644 module/zfs/zcp_get.c
 create mode 100644 module/zfs/zcp_global.c
 create mode 100644 module/zfs/zcp_iter.c
 create mode 100644 module/zfs/zcp_set.c
 create mode 100644 module/zfs/zcp_synctask.c
 create mode 100644 module/zfs/zfeature.c
 create mode 100644 module/zfs/zfs_byteswap.c
 create mode 100644 module/zfs/zfs_fm.c
 create mode 100644 module/zfs/zfs_fuid.c
 create mode 100644 module/zfs/zfs_ioctl.c
 create mode 100644 module/zfs/zfs_log.c
 create mode 100644 module/zfs/zfs_onexit.c
 create mode 100644 module/zfs/zfs_quota.c
 create mode 100644 module/zfs/zfs_ratelimit.c
 create mode 100644 module/zfs/zfs_replay.c
 create mode 100644 module/zfs/zfs_rlock.c
 create mode 100644 module/zfs/zfs_sa.c
 create mode 100644 module/zfs/zil.c
 create mode 100644 module/zfs/zio.c
 create mode 100644 module/zfs/zio_checksum.c
 create mode 100644 module/zfs/zio_compress.c
 create mode 100644 module/zfs/zio_inject.c
 create mode 100644 module/zfs/zle.c
 create mode 100644 module/zfs/zrlock.c
 create mode 100644 module/zfs/zthr.c
 create mode 100644 module/zfs/zvol.c
 create mode 100644 module/zstd/Makefile.in
 create mode 100644 module/zstd/README.md
 create mode 100644 module/zstd/include/aarch64_compat.h
 create mode 100644 module/zstd/include/limits.h
 create mode 100644 module/zstd/include/stddef.h
 create mode 100644 module/zstd/include/stdint.h
 create mode 100644 module/zstd/include/stdio.h
 create mode 100644 module/zstd/include/stdlib.h
 create mode 100644 module/zstd/include/string.h
 create mode 100644 module/zstd/include/zstd_compat_wrapper.h
 create mode 100644 module/zstd/lib/zstd.c
 create mode 100644 module/zstd/lib/zstd.h
 create mode 100644 module/zstd/lib/zstd_errors.h
 create mode 100644 module/zstd/zfs_zstd.c
 create mode 100644 module/zstd/zstd-in.c
 create mode 100644 rpm/Makefile.am
 create mode 100644 rpm/generic/.gitignore
 create mode 100644 rpm/generic/Makefile.am
 create mode 100644 rpm/generic/zfs-dkms.spec.in
 create mode 100644 rpm/generic/zfs-kmod.spec.in
 create mode 100644 rpm/generic/zfs.spec.in
 create mode 100644 rpm/redhat/.gitignore
 create mode 100644 rpm/redhat/Makefile.am
 create mode 120000 rpm/redhat/zfs-dkms.spec.in
 create mode 100644 rpm/redhat/zfs-kmod.spec.in
 create mode 120000 rpm/redhat/zfs.spec.in
 create mode 100644 scripts/.gitignore
 create mode 100644 scripts/Makefile.am
 create mode 100755 scripts/commitcheck.sh
 create mode 100644 scripts/common.sh.in
 create mode 100755 scripts/cstyle.pl
 create mode 100755 scripts/dkms.mkconf
 create mode 100755 scripts/dkms.postbuild
 create mode 100755 scripts/enum-extract.pl
 create mode 100755 scripts/kmodtool
 create mode 100755 scripts/make_gitrev.sh
 create mode 100755 scripts/man-dates.sh
 create mode 100755 scripts/paxcheck.sh
 create mode 100755 scripts/zfs-helpers.sh
 create mode 100755 scripts/zfs-tests.sh
 create mode 100755 scripts/zfs.sh
 create mode 100755 scripts/zfs2zol-patch.sed
 create mode 100755 scripts/zimport.sh
 create mode 100755 scripts/zloop.sh
 create mode 100755 scripts/zol2zfs-patch.sed
 create mode 100644 tests/Makefile.am
 create mode 100644 tests/README.md
 create mode 100644 tests/runfiles/Makefile.am
 create mode 100644 tests/runfiles/common.run
 create mode 100644 tests/runfiles/freebsd.run
 create mode 100644 tests/runfiles/linux.run
 create mode 100644 tests/runfiles/longevity.run
 create mode 100644 tests/runfiles/perf-regression.run
 create mode 100644 tests/runfiles/sunos.run
 create mode 100644 tests/test-runner/Makefile.am
 create mode 100644 tests/test-runner/bin/.gitignore
 create mode 100644 tests/test-runner/bin/Makefile.am
 create mode 100755 tests/test-runner/bin/test-runner.py.in
 create mode 100755 tests/test-runner/bin/zts-report.py.in
 create mode 100644 tests/test-runner/include/Makefile.am
 create mode 100644 tests/test-runner/include/logapi.shlib
 create mode 100644 tests/test-runner/include/stf.shlib
 create mode 100644 tests/test-runner/man/Makefile.am
 create mode 100644 tests/test-runner/man/test-runner.1
 create mode 100644 tests/zfs-tests/Makefile.am
 create mode 100644 tests/zfs-tests/callbacks/Makefile.am
 create mode 100755 tests/zfs-tests/callbacks/zfs_dbgmsg.ksh
 create mode 100755 tests/zfs-tests/callbacks/zfs_dmesg.ksh
 create mode 100755 tests/zfs-tests/callbacks/zfs_failsafe.ksh
 create mode 100755 tests/zfs-tests/callbacks/zfs_mmp.ksh
 create mode 100644 tests/zfs-tests/cmd/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/btree_test/.gitignore
 create mode 100644 tests/zfs-tests/cmd/btree_test/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/btree_test/btree_test.c
 create mode 100644 tests/zfs-tests/cmd/chg_usr_exec/.gitignore
 create mode 100644 tests/zfs-tests/cmd/chg_usr_exec/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/chg_usr_exec/chg_usr_exec.c
 create mode 100644 tests/zfs-tests/cmd/devname2devid/.gitignore
 create mode 100644 tests/zfs-tests/cmd/devname2devid/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/devname2devid/devname2devid.c
 create mode 100644 tests/zfs-tests/cmd/dir_rd_update/.gitignore
 create mode 100644 tests/zfs-tests/cmd/dir_rd_update/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c
 create mode 100644 tests/zfs-tests/cmd/file_check/.gitignore
 create mode 100644 tests/zfs-tests/cmd/file_check/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/file_check/file_check.c
 create mode 100644 tests/zfs-tests/cmd/file_common.h
 create mode 100644 tests/zfs-tests/cmd/file_trunc/.gitignore
 create mode 100644 tests/zfs-tests/cmd/file_trunc/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/file_trunc/file_trunc.c
 create mode 100644 tests/zfs-tests/cmd/file_write/.gitignore
 create mode 100644 tests/zfs-tests/cmd/file_write/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/file_write/file_write.c
 create mode 100644 tests/zfs-tests/cmd/get_diff/.gitignore
 create mode 100644 tests/zfs-tests/cmd/get_diff/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/get_diff/get_diff.c
 create mode 100644 tests/zfs-tests/cmd/largest_file/.gitignore
 create mode 100644 tests/zfs-tests/cmd/largest_file/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/largest_file/largest_file.c
 create mode 100644 tests/zfs-tests/cmd/libzfs_input_check/.gitignore
 create mode 100644 tests/zfs-tests/cmd/libzfs_input_check/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c
 create mode 100644 tests/zfs-tests/cmd/mkbusy/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mkbusy/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mkbusy/mkbusy.c
 create mode 100644 tests/zfs-tests/cmd/mkfile/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mkfile/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mkfile/mkfile.c
 create mode 100644 tests/zfs-tests/cmd/mkfiles/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mkfiles/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mkfiles/mkfiles.c
 create mode 100644 tests/zfs-tests/cmd/mktree/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mktree/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mktree/mktree.c
 create mode 100644 tests/zfs-tests/cmd/mmap_exec/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mmap_exec/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mmap_exec/mmap_exec.c
 create mode 100644 tests/zfs-tests/cmd/mmap_libaio/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mmap_libaio/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mmap_libaio/mmap_libaio.c
 create mode 100644 tests/zfs-tests/cmd/mmapwrite/.gitignore
 create mode 100644 tests/zfs-tests/cmd/mmapwrite/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/mmapwrite/mmapwrite.c
 create mode 100644 tests/zfs-tests/cmd/nvlist_to_lua/.gitignore
 create mode 100644 tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/nvlist_to_lua/nvlist_to_lua.c
 create mode 100644 tests/zfs-tests/cmd/randfree_file/.gitignore
 create mode 100644 tests/zfs-tests/cmd/randfree_file/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/randfree_file/randfree_file.c
 create mode 100644 tests/zfs-tests/cmd/randwritecomp/.gitignore
 create mode 100644 tests/zfs-tests/cmd/randwritecomp/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/randwritecomp/randwritecomp.c
 create mode 100644 tests/zfs-tests/cmd/readmmap/.gitignore
 create mode 100644 tests/zfs-tests/cmd/readmmap/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/readmmap/readmmap.c
 create mode 100644 tests/zfs-tests/cmd/rename_dir/.gitignore
 create mode 100644 tests/zfs-tests/cmd/rename_dir/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/rename_dir/rename_dir.c
 create mode 100644 tests/zfs-tests/cmd/rm_lnkcnt_zero_file/.gitignore
 create mode 100644 tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c
 create mode 100644 tests/zfs-tests/cmd/stride_dd/.gitignore
 create mode 100644 tests/zfs-tests/cmd/stride_dd/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/stride_dd/stride_dd.c
 create mode 100644 tests/zfs-tests/cmd/threadsappend/.gitignore
 create mode 100644 tests/zfs-tests/cmd/threadsappend/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/threadsappend/threadsappend.c
 create mode 100644 tests/zfs-tests/cmd/user_ns_exec/.gitignore
 create mode 100644 tests/zfs-tests/cmd/user_ns_exec/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/user_ns_exec/user_ns_exec.c
 create mode 100644 tests/zfs-tests/cmd/xattrtest/.gitignore
 create mode 100644 tests/zfs-tests/cmd/xattrtest/Makefile.am
 create mode 100644 tests/zfs-tests/cmd/xattrtest/xattrtest.c
 create mode 100644 tests/zfs-tests/include/.gitignore
 create mode 100644 tests/zfs-tests/include/Makefile.am
 create mode 100644 tests/zfs-tests/include/blkdev.shlib
 create mode 100644 tests/zfs-tests/include/commands.cfg
 create mode 100644 tests/zfs-tests/include/default.cfg.in
 create mode 100644 tests/zfs-tests/include/libtest.shlib
 create mode 100644 tests/zfs-tests/include/math.shlib
 create mode 100644 tests/zfs-tests/include/properties.shlib
 create mode 100644 tests/zfs-tests/include/tunables.cfg
 create mode 100644 tests/zfs-tests/include/zpool_script.shlib
 create mode 100644 tests/zfs-tests/tests/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/acl/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/acl/acl.cfg
 create mode 100644 tests/zfs-tests/tests/functional/acl/acl_common.kshlib
 create mode 100644 tests/zfs-tests/tests/functional/acl/posix/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/acl/posix/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/acl/posix/posix_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/acl/posix/posix_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/acl/posix/posix_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/acl/posix/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/alloc_class/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/alloc_class/alloc_class.cfg
 create mode 100644 tests/zfs-tests/tests/functional/alloc_class/alloc_class.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_009_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_010_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_011_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/alloc_class_013_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/alloc_class/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/arc/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/arc/arcstats_runtime_tuning.ksh
 create mode 100755 tests/zfs-tests/tests/functional/arc/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/arc/dbufstats_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/arc/dbufstats_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/arc/dbufstats_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/arc/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/atime/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/atime/atime.cfg
 create mode 100755 tests/zfs-tests/tests/functional/atime/atime_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/atime/atime_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/atime/atime_003_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/atime/atime_common.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/atime/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/atime/root_atime_off.ksh
 create mode 100755 tests/zfs-tests/tests/functional/atime/root_atime_on.ksh
 create mode 100755 tests/zfs-tests/tests/functional/atime/root_relatime_on.ksh
 create mode 100755 tests/zfs-tests/tests/functional/atime/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/bootfs/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/bootfs_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/bootfs/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/btree/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/btree/btree_negative.ksh
 create mode 100755 tests/zfs-tests/tests/functional/btree/btree_positive.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cache/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/cache/cache.cfg
 create mode 100644 tests/zfs-tests/tests/functional/cache/cache.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_008_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_009_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_010_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_011_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cache/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cachefile/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/cachefile/cachefile.cfg
 create mode 100644 tests/zfs-tests/tests/functional/cachefile/cachefile.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cachefile/cachefile_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cachefile/cachefile_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cachefile/cachefile_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cachefile/cachefile_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cachefile/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cachefile/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/casenorm/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/case_all_values.ksh
 create mode 100644 tests/zfs-tests/tests/functional/casenorm/casenorm.cfg
 create mode 100644 tests/zfs-tests/tests/functional/casenorm/casenorm.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/insensitive_formd_delete.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/insensitive_formd_lookup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/insensitive_none_delete.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/insensitive_none_lookup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_create_failure.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_formd_delete.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_formd_lookup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_formd_lookup_ci.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_none_delete.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_none_lookup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/mixed_none_lookup_ci.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/norm_all_values.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/sensitive_formd_delete.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/sensitive_formd_lookup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/sensitive_none_delete.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/sensitive_none_lookup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/casenorm/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/channel_common.kshlib
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.args_to_lua.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.args_to_lua.out
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.args_to_lua.zcp
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.divide_by_zero.err
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.divide_by_zero.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.divide_by_zero.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.exists.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.exists.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.integer_illegal.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.integer_overflow.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.language_functions_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.language_functions_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.large_prog.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.large_prog.out
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.large_prog.zcp
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.lib_base.lua
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.lib_coroutine.lua
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.lib_strings.lua
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.lib_table.lua
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.libraries.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.memory_limit.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.nested_neg.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.nested_neg.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.nested_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.nested_pos.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.nvlist_to_lua.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.recursive.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.recursive_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.recursive_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_large.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_large.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_nvlist_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_nvlist_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_recursive_table.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.return_recursive_table.zcp
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.err
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.stack_gsub.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.timeout.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/lua_core/tst.timeout.zcp
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.copy.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.bookmark.create.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.destroy_fs.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.destroy_snap.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_count_and_limit.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.out
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_index_props.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_mountpoint.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.out
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_number_props.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.out
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_string_props.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_type.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_userquota.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.get_written.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.inherit.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_bookmarks.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_children.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_clones.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_holds.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_snapshots.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_system_props.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.list_user_props.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.parse_args_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.promote_conflict.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.promote_conflict.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.promote_multiple.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.promote_simple.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.rollback_mult.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.rollback_one.ksh
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.set_props.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_destroy.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_destroy.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_neg.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_neg.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_recursive.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_recursive.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_simple.ksh
 create mode 100644 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.snapshot_simple.zcp
 create mode 100755 tests/zfs-tests/tests/functional/channel_program/synctask_core/tst.terminate_by_signal.ksh
 create mode 100644 tests/zfs-tests/tests/functional/chattr/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/chattr/chattr_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/chattr/chattr_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/chattr/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/chattr/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/checksum/.gitignore
 create mode 100644 tests/zfs-tests/tests/functional/checksum/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/checksum/cleanup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/checksum/default.cfg
 create mode 100644 tests/zfs-tests/tests/functional/checksum/edonr_test.c
 create mode 100755 tests/zfs-tests/tests/functional/checksum/filetest_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/checksum/run_edonr_test.ksh
 create mode 100755 tests/zfs-tests/tests/functional/checksum/run_sha2_test.ksh
 create mode 100755 tests/zfs-tests/tests/functional/checksum/run_skein_test.ksh
 create mode 100755 tests/zfs-tests/tests/functional/checksum/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/checksum/sha2_test.c
 create mode 100644 tests/zfs-tests/tests/functional/checksum/skein_test.c
 create mode 100644 tests/zfs-tests/tests/functional/clean_mirror/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/clean_mirror/clean_mirror_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/clean_mirror/clean_mirror_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/clean_mirror/clean_mirror_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/clean_mirror/clean_mirror_004_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/clean_mirror/clean_mirror_common.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/clean_mirror/cleanup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/clean_mirror/default.cfg
 create mode 100755 tests/zfs-tests/tests/functional/clean_mirror/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/Makefile.am
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/cli_common.kshlib
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zdb/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_args_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_checksum.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_decompress_zstd.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_display_block.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_object_range_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zdb/zdb_objset_id.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs/zfs_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs/zfs_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs/zfs_003_neg.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_child.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_clones.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_format.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_inherit.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_load.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_location.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_change-key/zfs_change-key_pbkdf2iters.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_008_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_009_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_010_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_deeply_nested.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_encrypted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_clone/zfs_clone_rm_nested.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_copies/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.cfg
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_copies/zfs_copies_006_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/cleanup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_create/properties.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_008_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_009_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_010_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_011_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_013_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_014_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_common.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_crypt_combos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_dryrun.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_encrypted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_create/zfs_create_verbose.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_and_disable.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_clone_livelist_condense_races.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_006_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_009_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_010_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_011_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_013_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_014_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_015_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_016_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_clone_livelist.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_common.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_destroy/zfs_destroy_dev_removal_condense.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_diff/.gitignore
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_diff/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_diff/socket.c
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_changes.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_cliargs.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_encrypted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_timestamp.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_diff/zfs_diff_types.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_get/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_006_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_009_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_010_neg.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_common.kshlib
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_get/zfs_get_list_d.kshlib
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/zfs_ids_to_path_001_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_inherit/zfs_inherit_mountpoint.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_jail/cleanup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_jail/jail.conf
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_jail/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_jail/zfs_jail_001_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_all.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_common.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_file.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_location.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_noop.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_load-key/zfs_load-key_recursive.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.cfg
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_009_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_010_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_011_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_fail.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_all_mountpoints.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_encrypted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_remount.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_mount_test_race.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_mount/zfs_multi_mount.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_program/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_program/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_program/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_program/zfs_program_json.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_promote/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_006_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_promote/zfs_promote_encryptionroot.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_property/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_property/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_property/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_property/zfs_written_property_001_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/receive-o-x_props_override.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_-e.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_009_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_010_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_011_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_013_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_014_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_015_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_016_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_encrypted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_from_zstd.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_-d.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_raw_incremental.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zfs_receive_to_encrypted.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_receive/zstd_test_data.txt
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.cfg
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_008_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_009_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_010_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_011_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_012_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_013_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_014_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_encrypted_child.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_mountpoint.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rename/zfs_rename_to_encrypted.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_reservation/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_reservation/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_reservation/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_reservation/zfs_reservation_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_reservation/zfs_reservation_002_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_003_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_004_neg.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_rollback/zfs_rollback_common.kshlib
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send-b.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_007_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_encrypted_unloaded.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_raw.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_send/zfs_send_sparse.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/cache_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/cache_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/canmount_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/checksum_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/compression_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/mountpoint_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/onoffs_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/property_alias_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/readonly_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/reservation_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/ro_props_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/share_mount_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/snapdir_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_003_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/user_property_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/version_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_003_neg.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_common.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_feature_activation.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_set/zfs_set_keylocation.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_008_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_009_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_010_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_011_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_012_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot.cfg
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_002_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_003_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_005_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_008_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/zfs_snapshot_009_pos.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/zfeature_set_unsupported.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/zfs_get_unsupported.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/zfs_set_unsupported.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/zfs_sysfs_live.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/zpool_get_unsupported.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/zpool_set_unsupported.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_all.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/zfs_unload-key_recursive.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/setup.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount.cfg
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount.kshlib
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_004_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_005_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_006_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_007_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_008_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_009_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_all_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_nested.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unmount/zfs_unmount_unload_keys.ksh
 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile.am
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/cleanup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/setup.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_001_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_002_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_003_pos.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_004_neg.ksh
 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_005_neg.ksh
 create mode
100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_unshare/zfs_unshare_007_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/zfs_upgrade_007_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_wait/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_wait/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_wait/zfs_wait_deleteq.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool/zpool_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool/zpool_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool/zpool_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool/zpool_colors.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/add-o_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/add_nested_replacing_spare.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/add_prop_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_007_neg.ksh 
create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_008_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_009_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_add/zpool_add_010_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_attach/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_attach/attach-o_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_attach/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_attach/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_attach/zpool_attach_001_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_clear/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_clear/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_clear/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_clear/zpool_clear_readonly.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/create-o_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create.shlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_007_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_009_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_010_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_011_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_012_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_014_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_015_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_016_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_017_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_018_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_019_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_020_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_021_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_022_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_023_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_024_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_crypt_combos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_encrypted.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_004_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_features_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_create/zpool_create_tempname.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_destroy/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_destroy/zpool_destroy_003_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_detach/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_detach/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_detach/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_detach/zpool_detach_001_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_clear.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_cliargs.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_errors.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_follow.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_events/zpool_events_poolname.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_expand/Makefile.am create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_expand/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_expand/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_expand/zpool_expand_005_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_export/zpool_export_004_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_004_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_005_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_get/zpool_get_parsable.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_history/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_history/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_history/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_history/zpool_history_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_history/zpool_history_002_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/cryptv0.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/missing_ivset.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/unclean_export.dat.bz2 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_added.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_removed.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_device_replaced.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_mirror_attached.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_mirror_detached.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_cachefile_shared_device.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_devices_missing.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_paths_changed.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_config_changed.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/import_rewind_device_replaced.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_009_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_010_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_011_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_012_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_013_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_014_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_015_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_all_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_encrypted.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_encrypted_load.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata3.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_errata4.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_features_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_features_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_features_003_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_missing_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_import/zpool_import_rename_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_attach_detach_add_remove.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_import_export.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_offline_export_import_online.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_online_offline.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_split.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_start_and_cancel_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_suspend_resume.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_unsupported_vdevs.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_checksums.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_initialize/zpool_initialize_verify_initialized.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/labelclear.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_active.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_exported.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_removed.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/zpool_labelclear_valid.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_offline/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_offline/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_offline/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_offline/zpool_offline_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_offline/zpool_offline_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_offline/zpool_offline_003_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_online/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_online/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_online/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_online/zpool_online_001_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_online/zpool_online_002_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_remove/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_remove/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_remove/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_remove/zpool_remove_003_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen.shlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_reopen/zpool_reopen_007_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_replace/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_replace/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace-o_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_replace/replace_prop_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_replace/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_replace/zpool_replace_001_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_bad_args.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_resilver/zpool_resilver_restart.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_encrypted_unloaded.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_multiple_copies.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_offline_device.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_print_repairing.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_set/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_set/zpool_set_features.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_cliargs.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_devices.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_encryption.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_indirect.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_props.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_resilver.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_vdevs.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_split/zpool_split_wholedisk.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_sync/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_sync/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_sync/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_sync/zpool_sync_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_sync/zpool_sync_002_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile.am create mode 100755 
tests/zfs-tests/tests/functional/cli_root/zpool_trim/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_attach_detach_add_remove.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_import_export.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_offline_export_import_online.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_online_offline.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_partial.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_rate_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_secure.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_split.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_start_and_cancel_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_suspend_resume.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_unsupported_vdevs.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_checksums.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_trim/zpool_trim_verify_trimmed.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-broken-mirror2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v10.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v11.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v12.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v13.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v14.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v15.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1mirror3.dat.bz2 create mode 100644 
tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1raidz3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v1stripe3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2mirror3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2raidz3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v2stripe3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3hotspare3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3mirror3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz2.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz21.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz22.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz23.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3raidz3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe1.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe2.dat.bz2 create mode 100644 
tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v3stripe3.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v4.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v5.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v6.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v7.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v8.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v9.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-v999.dat.bz2 create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/zfs-pool-vBROKEN.dat.bz2 create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_005_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/zpool_upgrade_009_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_rebuild.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_replace_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_resilver.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_basic.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/zpool_wait_scrub_flag.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait.kshlib create 
mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_discard.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_freeing.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_basic.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_initialize_flag.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_no_activity.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_remove_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_basic.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_trim_flag.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_wait/zpool_wait_usage.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/cli_user/misc/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/arc_summary_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/arcstat_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/misc/misc.cfg create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zdb_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_allow_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_clone_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_create_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_destroy_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_get_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_inherit_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_mount_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_promote_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_receive_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_rename_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_rollback_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_send_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_set_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_share_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_snapshot_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_unallow_001_neg.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_user/misc/zfs_unmount_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_unshare_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zfs_upgrade_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_add_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_attach_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_clear_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_create_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_destroy_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_detach_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_export_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_get_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_history_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_import_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_import_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_offline_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_online_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_remove_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_replace_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_scrub_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_set_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_status_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_upgrade_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/misc/zpool_wait_privilege.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list.cfg create mode 100644 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list.kshlib create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_004_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zfs_list/zfs_list_008_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_disable.ksh create mode 100755 
tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_homedir.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_-c_searchpath.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_iostat/zpool_iostat_005_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_list/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_list/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_list/zpool_list_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_list/zpool_list_002_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_disable.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_homedir.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_-c_searchpath.ksh create mode 100755 tests/zfs-tests/tests/functional/cli_user/zpool_status/zpool_status_003_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/compression/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/compression/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/compression/compress.cfg create mode 100755 tests/zfs-tests/tests/functional/compression/compress_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/compress_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/compress_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/compress_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/l2arc_compressed_arc_disabled.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/l2arc_encrypted.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/l2arc_encrypted_no_compressed_arc.ksh create mode 100755 tests/zfs-tests/tests/functional/compression/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/cp_files/.gitignore create mode 100644 tests/zfs-tests/tests/functional/cp_files/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/cp_files/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/cp_files/cp_files.c create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_files_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/cp_files/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/ctime/.gitignore create mode 100644 tests/zfs-tests/tests/functional/ctime/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/ctime/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/ctime/ctime.c create mode 100755 
tests/zfs-tests/tests/functional/ctime/ctime_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/ctime/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/deadman/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/deadman/deadman.cfg create mode 100755 tests/zfs-tests/tests/functional/deadman/deadman_sync.ksh create mode 100755 tests/zfs-tests/tests/functional/deadman/deadman_zio.ksh create mode 100644 tests/zfs-tests/tests/functional/delegate/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/delegate/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/delegate/delegate.cfg create mode 100644 tests/zfs-tests/tests/functional/delegate/delegate_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/delegate/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_009_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_010_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_011_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_allow_012_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_007_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/delegate/zfs_unallow_008_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/devices/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/devices/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/devices/devices.cfg create mode 100755 tests/zfs-tests/tests/functional/devices/devices_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/devices/devices_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/devices/devices_003_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/devices/devices_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/devices/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/events/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/events/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/events/events.cfg create mode 100755 tests/zfs-tests/tests/functional/events/events_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/events/events_002_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/events/events_common.kshlib create mode 100755 
tests/zfs-tests/tests/functional/events/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/events/zed_rc_filter.ksh create mode 100644 tests/zfs-tests/tests/functional/exec/.gitignore create mode 100644 tests/zfs-tests/tests/functional/exec/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/exec/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/exec/exec_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/exec/exec_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/exec/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/fallocate/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/fallocate/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/fallocate/fallocate_prealloc.ksh create mode 100755 tests/zfs-tests/tests/functional/fallocate/fallocate_punch-hole.ksh create mode 100755 tests/zfs-tests/tests/functional/fallocate/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/fault/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/fault/auto_offline_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_online_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_replace_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_ashift.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/auto_spare_shared.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/decompress_fault.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/decrypt_fault.ksh create mode 100644 tests/zfs-tests/tests/functional/fault/fault.cfg create mode 100755 tests/zfs-tests/tests/functional/fault/scrub_after_resilver.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/fault/zpool_status_-s.ksh create mode 100644 tests/zfs-tests/tests/functional/features/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/features/async_destroy/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/features/async_destroy/async_destroy_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/async_destroy/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/features/async_destroy/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/features/large_dnode/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_004_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_007_neg.ksh create mode 100755 
tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/large_dnode_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/features/large_dnode/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/grow/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/grow/grow.cfg create mode 100755 tests/zfs-tests/tests/functional/grow/grow_pool_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/grow/grow_replicas_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/history/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/history/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/history/history.cfg create mode 100755 tests/zfs-tests/tests/functional/history/history_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_005_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/history/history_010_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/history/history_common.kshlib create mode 100644 tests/zfs-tests/tests/functional/history/i386.migratedpool.DAT.Z create mode 100644 tests/zfs-tests/tests/functional/history/i386.orig_history.txt create mode 100755 tests/zfs-tests/tests/functional/history/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/history/sparc.migratedpool.DAT.Z create mode 100644 tests/zfs-tests/tests/functional/history/sparc.orig_history.txt create mode 100644 tests/zfs-tests/tests/functional/history/zfs-pool-v4.dat.Z create mode 100644 tests/zfs-tests/tests/functional/hkdf/.gitignore create mode 100644 tests/zfs-tests/tests/functional/hkdf/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/hkdf/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/hkdf/hkdf_test.c create mode 100755 tests/zfs-tests/tests/functional/hkdf/run_hkdf_test.ksh create mode 100755 tests/zfs-tests/tests/functional/hkdf/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/inheritance/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/inheritance/README.config create mode 100644 tests/zfs-tests/tests/functional/inheritance/README.state create mode 100755 tests/zfs-tests/tests/functional/inheritance/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/inheritance/config001.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config002.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config003.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config004.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config005.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config006.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config007.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config008.cfg create mode 100644 
tests/zfs-tests/tests/functional/inheritance/config009.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config010.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config011.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config012.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config013.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config014.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config015.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config016.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config017.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config018.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config019.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config020.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config021.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config022.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config023.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/config024.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/inherit.kshlib create mode 100755 tests/zfs-tests/tests/functional/inheritance/inherit_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/inheritance/state001.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state002.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state003.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state004.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state005.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state006.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state007.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state008.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state009.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state010.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state011.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state012.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state013.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state014.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state015.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state016.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state017.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state018.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state019.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state020.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state021.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state022.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state023.cfg create mode 100644 tests/zfs-tests/tests/functional/inheritance/state024.cfg create mode 100644 tests/zfs-tests/tests/functional/inuse/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/inuse/inuse.cfg create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_003_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/inuse/inuse_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/inuse_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/inuse/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/io/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/io/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/io/io.cfg create mode 100755 tests/zfs-tests/tests/functional/io/libaio.ksh create mode 100755 tests/zfs-tests/tests/functional/io/mmap.ksh create mode 100755 tests/zfs-tests/tests/functional/io/posixaio.ksh create mode 100755 tests/zfs-tests/tests/functional/io/psync.ksh create mode 100755 tests/zfs-tests/tests/functional/io/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/io/sync.ksh create mode 100644 tests/zfs-tests/tests/functional/large_files/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/large_files/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/large_files/large_files_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/large_files/large_files_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/large_files/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/largest_pool/Makefile.am create mode 100644 tests/zfs-tests/tests/functional/largest_pool/largest_pool.cfg create mode 100755 tests/zfs-tests/tests/functional/largest_pool/largest_pool_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/libzfs/.gitignore create mode 100644 tests/zfs-tests/tests/functional/libzfs/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/libzfs/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/libzfs/libzfs_input.ksh create mode 100644 tests/zfs-tests/tests/functional/libzfs/many_fds.c create mode 100755 tests/zfs-tests/tests/functional/libzfs/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/limits/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/limits/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/limits/filesystem_count.ksh create mode 100755 tests/zfs-tests/tests/functional/limits/filesystem_limit.ksh create mode 100755 tests/zfs-tests/tests/functional/limits/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/limits/snapshot_count.ksh create mode 100755 tests/zfs-tests/tests/functional/limits/snapshot_limit.ksh create mode 100644 tests/zfs-tests/tests/functional/link_count/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/link_count/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/link_count/link_count_001.ksh create mode 100755 tests/zfs-tests/tests/functional/link_count/link_count_root_inode.ksh create mode 100755 tests/zfs-tests/tests/functional/link_count/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/log_spacemap/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/log_spacemap/log_spacemap_import_logs.ksh create mode 100644 tests/zfs-tests/tests/functional/migration/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/migration/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/migration/migration.cfg create mode 100644 
tests/zfs-tests/tests/functional/migration/migration.kshlib create mode 100755 tests/zfs-tests/tests/functional/migration/migration_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_010_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_011_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/migration_012_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/migration/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/mmap/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/mmap/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/mmap/mmap.cfg create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_libaio_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_read_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/mmap/mmap_write_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/mmap/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/mmp/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/mmp/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/mmp/mmp.cfg create mode 100644 tests/zfs-tests/tests/functional/mmp/mmp.kshlib create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_active_import.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_exported_import.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_hostid.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_inactive_import.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_interval.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_on_off.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_on_thread.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_on_uberblocks.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_on_zdb.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_reset_interval.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_write_distribution.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/mmp_write_uberblocks.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/multihost_history.ksh create mode 100755 tests/zfs-tests/tests/functional/mmp/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/mount/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/mount/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/mount/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/mount/umount_001.ksh create mode 100755 tests/zfs-tests/tests/functional/mount/umount_unlinked_drain.ksh create mode 100755 tests/zfs-tests/tests/functional/mount/umountall_001.ksh create mode 100644 tests/zfs-tests/tests/functional/mv_files/Makefile.am create mode 100755 
tests/zfs-tests/tests/functional/mv_files/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/mv_files/mv_files.cfg create mode 100755 tests/zfs-tests/tests/functional/mv_files/mv_files_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/mv_files/mv_files_002_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/mv_files/mv_files_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/mv_files/random_creation.ksh create mode 100755 tests/zfs-tests/tests/functional/mv_files/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/nestedfs/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/nestedfs/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/nestedfs/nestedfs_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/nestedfs/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/no_space/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/no_space/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/no_space/enospc.cfg create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/no_space/enospc_df.ksh create mode 100755 tests/zfs-tests/tests/functional/no_space/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/nopwrite/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/nopwrite/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/nopwrite/nopwrite.shlib create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_copies.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_mtime.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_negative.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_promoted_clone.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_recsize.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_sync.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_varying_compression.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/nopwrite_volume.ksh create mode 100755 tests/zfs-tests/tests/functional/nopwrite/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/online_offline/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/online_offline/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/online_offline/online_offline.cfg create mode 100755 tests/zfs-tests/tests/functional/online_offline/online_offline_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/online_offline/online_offline_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/online_offline/online_offline_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/online_offline/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/pam/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/pam/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/pam/pam_basic.ksh create mode 100755 tests/zfs-tests/tests/functional/pam/pam_nounmount.ksh create mode 100755 tests/zfs-tests/tests/functional/pam/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/pam/utilities.kshlib create mode 100644 tests/zfs-tests/tests/functional/persist_l2arc/Makefile.am create mode 100755 
tests/zfs-tests/tests/functional/persist_l2arc/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc.cfg create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/persist_l2arc_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/persist_l2arc/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/pool_checkpoint/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_after_rewind.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_big_rewind.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_capacity.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_conf_change.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_busy.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_discard_many.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_indirect.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_invalid.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_lun_expsz.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_open.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_removal.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_rewind.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_ro_rewind.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_sm_scale.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_twice.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_vdev_add.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zdb.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/checkpoint_zhack_feat.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/pool_checkpoint/pool_checkpoint.kshlib create mode 100755 tests/zfs-tests/tests/functional/pool_checkpoint/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/pool_names/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/pool_names/pool_names_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/pool_names/pool_names_002_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/poolversion/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/poolversion/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/poolversion/poolversion_001_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/poolversion/poolversion_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/poolversion/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/privilege/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/privilege/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/privilege/privilege_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/privilege/privilege_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/privilege/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/procfs/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/procfs/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/pool_state.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/procfs_list_basic.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/procfs_list_concurrent_readers.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/procfs_list_stale_read.ksh create mode 100755 tests/zfs-tests/tests/functional/procfs/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/projectquota/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/projectquota/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectid_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectid_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectid_003_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/projectquota/projectquota.cfg create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_004_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectquota_009_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/projectquota/projectquota_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectspace_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectspace_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectspace_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projectspace_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projecttree_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projecttree_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/projecttree_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/projectquota/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/pyzfs/.gitignore create mode 100644 tests/zfs-tests/tests/functional/pyzfs/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/pyzfs/pyzfs_unittest.ksh.in create mode 100644 tests/zfs-tests/tests/functional/quota/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/quota/cleanup.ksh create mode 
100644 tests/zfs-tests/tests/functional/quota/quota.cfg create mode 100644 tests/zfs-tests/tests/functional/quota/quota.kshlib create mode 100755 tests/zfs-tests/tests/functional/quota/quota_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/quota/quota_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/quota/quota_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/quota/quota_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/quota/quota_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/quota/quota_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/quota/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/raidz/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/raidz/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/raidz_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/raidz/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/redacted_send/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/redacted_send/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/redacted_send/redacted.cfg create mode 100644 tests/zfs-tests/tests/functional/redacted_send/redacted.kshlib create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_compressed.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_contents.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_deleted.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_disabled_feature.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_embedded.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_holes.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_incrementals.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_largeblocks.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_many_clones.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_mixed_recsize.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_mounts.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_negative.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_origin.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_props.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_resume.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_size.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/redacted_volume.ksh create mode 100755 tests/zfs-tests/tests/functional/redacted_send/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/redundancy/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/redundancy/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/redundancy/redundancy.cfg create mode 100644 tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/redundancy/redundancy_003_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/redundancy/redundancy_004_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/redundancy/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/refquota/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/refquota/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_006_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_007_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/refquota_008_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/refquota/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/refreserv/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/refreserv/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/refreserv/refreserv.cfg create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_multi_raidz.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/refreserv_raidz.ksh create mode 100755 tests/zfs-tests/tests/functional/refreserv/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/removal/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/removal/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/removal/removal.kshlib create mode 100755 tests/zfs-tests/tests/functional/removal/removal_all_vdev.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_check_space.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_condense_export.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_multiple_indirection.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_nopwrite.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_remap_deadlists.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_reservation.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_resume_export.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_sanity.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_add.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_create_fs.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_dedup.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_errors.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_export.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_faulted.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_ganging.ksh 
create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_remove.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_scrub.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_send.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_send_recv.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_snapshot.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_write.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/removal_with_zdb.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/remove_expanded.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/remove_indirect.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/remove_mirror.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/remove_mirror_sanity.ksh create mode 100755 tests/zfs-tests/tests/functional/removal/remove_raidz.ksh create mode 100644 tests/zfs-tests/tests/functional/rename_dirs/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/rename_dirs/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/rename_dirs/rename_dirs_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rename_dirs/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/replacement/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_import.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_rebuild.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/attach_resilver.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/detach.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/rebuild_disabled_feature.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/rebuild_multiple.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/rebuild_raidz.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/replace_import.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/replace_rebuild.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/replace_resilver.ksh create mode 100644 tests/zfs-tests/tests/functional/replacement/replacement.cfg create mode 100755 tests/zfs-tests/tests/functional/replacement/resilver_restart_001.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/resilver_restart_002.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/scrub_cancel.ksh create mode 100755 tests/zfs-tests/tests/functional/replacement/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/reservation/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/reservation/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/reservation/reservation.cfg create mode 100644 tests/zfs-tests/tests/functional/reservation/reservation.shlib create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_005_pos.ksh create mode 
100755 tests/zfs-tests/tests/functional/reservation/reservation_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_010_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_011_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_012_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_013_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_014_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_015_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_016_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_017_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_018_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_019_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_020_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_021_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/reservation_022_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/reservation/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/rootpool/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/rootpool/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/rootpool/rootpool_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/rootpool/rootpool_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/rootpool/rootpool_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rootpool/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/rsend/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/rsend/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/rsend/dedup.zsend.bz2 create mode 100644 tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.bz2 create mode 100644 tests/zfs-tests/tests/functional/rsend/dedup_encrypted_zvol.zsend.bz2 create mode 100644 tests/zfs-tests/tests/functional/rsend/fs.tar.gz create mode 100755 tests/zfs-tests/tests/functional/rsend/recv_dedup.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/recv_dedup_encrypted_zvol.ksh create mode 100644 tests/zfs-tests/tests/functional/rsend/rsend.cfg create mode 100644 tests/zfs-tests/tests/functional/rsend/rsend.kshlib create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_010_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/rsend/rsend_011_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_012_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_013_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_014_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_016_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_019_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_020_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_021_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_022_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/rsend_024_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-L_toggle.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_embedded_blocks.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_incremental.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_lz4_disabled.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_mixed_compression.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_props.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_recv_dedup.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_recv_lz4_disabled.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_resume.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_stream_size_estimate.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_verify_contents.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_verify_ratio.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_volume.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-c_zstreamdump.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-cpL_varied_recsize.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send-wR_encrypted_zvol.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_encrypted_files.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_encrypted_hierarchy.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_encrypted_props.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_encrypted_truncated_files.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_freeobjects.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_holds.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_hole_birth.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_mixed_raw.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_partial_dataset.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_realloc_dnode_size.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_realloc_encrypted_files.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_realloc_files.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/send_spill_block.ksh create mode 100755 tests/zfs-tests/tests/functional/rsend/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/scrub_mirror/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/scrub_mirror/cleanup.ksh create mode 100644 tests/zfs-tests/tests/functional/scrub_mirror/default.cfg create mode 100755 tests/zfs-tests/tests/functional/scrub_mirror/scrub_mirror_001_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/scrub_mirror/scrub_mirror_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/scrub_mirror/scrub_mirror_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/scrub_mirror/scrub_mirror_004_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/scrub_mirror/scrub_mirror_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/scrub_mirror/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/slog/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/slog/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/slog/slog.cfg create mode 100644 tests/zfs-tests/tests/functional/slog/slog.kshlib create mode 100755 tests/zfs-tests/tests/functional/slog/slog_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_008_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_009_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_010_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_011_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_012_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_013_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_014_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_015_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_replay_fs_001.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_replay_fs_002.ksh create mode 100755 tests/zfs-tests/tests/functional/slog/slog_replay_volume.ksh create mode 100644 tests/zfs-tests/tests/functional/snapshot/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/snapshot/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/clone_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/rollback_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/rollback_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/rollback_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/snapshot/snapshot.cfg create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_007_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_010_pos.ksh create 
mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_011_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_012_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_013_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_014_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_015_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_016_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapshot/snapshot_017_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/snapused/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/snapused/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/snapused/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/snapused/snapused.kshlib create mode 100755 tests/zfs-tests/tests/functional/snapused/snapused_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapused/snapused_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapused/snapused_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapused/snapused_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/snapused/snapused_005_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/sparse/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/sparse/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/sparse/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/sparse/sparse.cfg create mode 100755 tests/zfs-tests/tests/functional/sparse/sparse_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/suid/.gitignore create mode 100644 tests/zfs-tests/tests/functional/suid/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/suid/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/suid/suid_write_to_file.c create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_none.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_sgid.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_suid.ksh create mode 100755 tests/zfs-tests/tests/functional/suid/suid_write_to_suid_sgid.ksh create mode 100644 tests/zfs-tests/tests/functional/threadsappend/.gitignore create mode 100644 tests/zfs-tests/tests/functional/threadsappend/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/threadsappend/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/threadsappend/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/threadsappend/threadsappend_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/tmpfile/.gitignore create mode 100644 tests/zfs-tests/tests/functional/tmpfile/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/tmpfile/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/tmpfile/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/tmpfile/tmpfile_001_pos.c create mode 100644 tests/zfs-tests/tests/functional/tmpfile/tmpfile_002_pos.c create mode 100644 tests/zfs-tests/tests/functional/tmpfile/tmpfile_003_pos.c create mode 100644 tests/zfs-tests/tests/functional/tmpfile/tmpfile_stat_mode.c create mode 100644 tests/zfs-tests/tests/functional/tmpfile/tmpfile_test.c create mode 100644 tests/zfs-tests/tests/functional/trim/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/trim/autotrim_config.ksh create mode 100755 
tests/zfs-tests/tests/functional/trim/autotrim_integrity.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/autotrim_trim_integrity.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/trim/trim.cfg create mode 100644 tests/zfs-tests/tests/functional/trim/trim.kshlib create mode 100755 tests/zfs-tests/tests/functional/trim/trim_config.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/trim_integrity.ksh create mode 100755 tests/zfs-tests/tests/functional/trim/trim_l2arc.ksh create mode 100644 tests/zfs-tests/tests/functional/truncate/.gitignore create mode 100644 tests/zfs-tests/tests/functional/truncate/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/truncate/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/truncate/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/truncate/truncate.cfg create mode 100755 tests/zfs-tests/tests/functional/truncate/truncate_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/truncate/truncate_002_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/truncate/truncate_test.c create mode 100755 tests/zfs-tests/tests/functional/truncate/truncate_timestamps.ksh create mode 100644 tests/zfs-tests/tests/functional/upgrade/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/upgrade/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/upgrade/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/upgrade/upgrade_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/upgrade/upgrade_projectquota_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/upgrade/upgrade_readonly_pool.ksh create mode 100755 tests/zfs-tests/tests/functional/upgrade/upgrade_userobj_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/user_namespace/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/user_namespace/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/user_namespace/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/user_namespace/user_namespace.cfg create mode 100755 tests/zfs-tests/tests/functional/user_namespace/user_namespace_001.ksh create mode 100644 tests/zfs-tests/tests/functional/user_namespace/user_namespace_common.kshlib create mode 100644 tests/zfs-tests/tests/functional/userquota/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/userquota/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/groupspace_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/groupspace_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/groupspace_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/userquota/userquota.cfg create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_005_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_007_pos.ksh create mode 100755 
tests/zfs-tests/tests/functional/userquota/userquota_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_009_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_010_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_011_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_012_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userquota_013_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/userquota/userquota_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/userquota/userspace_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userspace_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/userquota/userspace_003_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/vdev_zaps/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps.kshlib create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/vdev_zaps/vdev_zaps_007_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/write_dirs/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/write_dirs/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/write_dirs/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/write_dirs/write_dirs_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/write_dirs/write_dirs_002_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/xattr/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/xattr/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/xattr/xattr.cfg create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_002_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_006_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_007_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_008_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_009_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_010_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_011_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_012_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/xattr/xattr_013_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/xattr/xattr_common.kshlib create mode 100644 tests/zfs-tests/tests/functional/zvol/Makefile.am create mode 100644 
tests/zfs-tests/tests/functional/zvol/zvol.cfg create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/zvol_ENOSPC.cfg create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_cli/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_cli/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_cli/zvol_cli.cfg create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_cli/zvol_cli_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_cli/zvol_cli_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_cli/zvol_cli_003_neg.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_common.shlib create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/setup.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_001_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_003_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_005_neg.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_006_pos.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_common.kshlib create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_hierarchy.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_rename_inuse.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_snapdev.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_volmode.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_misc/zvol_misc_zil.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile.am create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/setup.ksh create mode 100644 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap.cfg create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_001_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_002_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_003_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_004_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_005_pos.ksh create mode 100755 tests/zfs-tests/tests/functional/zvol/zvol_swap/zvol_swap_006_pos.ksh create mode 100644 tests/zfs-tests/tests/perf/Makefile.am create mode 100644 tests/zfs-tests/tests/perf/fio/Makefile.am create mode 100644 tests/zfs-tests/tests/perf/fio/mkfiles.fio create mode 100644 tests/zfs-tests/tests/perf/fio/random_reads.fio create mode 100644 
tests/zfs-tests/tests/perf/fio/random_readwrite.fio create mode 100644 tests/zfs-tests/tests/perf/fio/random_readwrite_fixed.fio create mode 100644 tests/zfs-tests/tests/perf/fio/random_writes.fio create mode 100644 tests/zfs-tests/tests/perf/fio/sequential_reads.fio create mode 100644 tests/zfs-tests/tests/perf/fio/sequential_readwrite.fio create mode 100644 tests/zfs-tests/tests/perf/fio/sequential_writes.fio create mode 100644 tests/zfs-tests/tests/perf/nfs-sample.cfg create mode 100644 tests/zfs-tests/tests/perf/perf.shlib create mode 100644 tests/zfs-tests/tests/perf/regression/Makefile.am create mode 100755 tests/zfs-tests/tests/perf/regression/random_reads.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/random_readwrite.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/random_readwrite_fixed.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/random_writes.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/random_writes_zil.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/sequential_reads.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/sequential_reads_arc_cached_clone.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/sequential_reads_dbuf_cached.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/sequential_writes.ksh create mode 100755 tests/zfs-tests/tests/perf/regression/setup.ksh create mode 100644 tests/zfs-tests/tests/perf/scripts/Makefile.am create mode 100755 tests/zfs-tests/tests/perf/scripts/prefetch_io.sh create mode 100644 tests/zfs-tests/tests/stress/Makefile.am create mode 100644 udev/Makefile.am create mode 100644 udev/rules.d/.gitignore create mode 100644 udev/rules.d/60-zvol.rules.in create mode 100644 udev/rules.d/69-vdev.rules.in create mode 100644 udev/rules.d/90-zfs.rules.in create mode 100644 udev/rules.d/Makefile.am create mode 100644 zfs.release.in diff --git a/AUTHORS b/AUTHORS new file mode 100644 index 000000000000..aab8bf29c99f --- /dev/null +++ b/AUTHORS @@ -0,0 +1,308 @@ +MAINTAINERS: + + Brian Behlendorf + Tony Hutter + +PAST MAINTAINERS: + + Ned Bass + +CONTRIBUTORS: + + Aaron Fineman + Adam Leventhal + Adam Stevko + Ahmed G + Akash Ayare + Alan Somers + Alar Aun + Albert Lee + Alec Salazar + Alejandro R. Sedeño + Alek Pinchuk + Alex Braunegg + Alex McWhirter + Alex Reece + Alex Wilson + Alex Zhuravlev + Alexander Eremin + Alexander Motin + Alexander Pyhalov + Alexander Stetsenko + Alexey Shvetsov + Alexey Smirnoff + Allan Jude + AndCycle + Andreas Buschmann + Andreas Dilger + Andrew Barnes + Andrew Hamilton + Andrew Reid + Andrew Stormont + Andrew Tselischev + Andrey Vesnovaty + Andriy Gapon + Andy Bakun + Aniruddha Shankar + Antonio Russo + Arkadiusz Bubała + Arne Jansen + Aron Xu + Bart Coddens + Basil Crow + Huang Liu + Ben Allen + Ben Rubson + Benjamin Albrecht + Bill McGonigle + Bill Pijewski + Boris Protopopov + Brad Lewis + Brian Behlendorf + Brian J.
Murrell + Caleb James DeLisle + Cao Xuewen + Carlo Landmeter + Carlos Alberto Lopez Perez + Chaoyu Zhang + Chen Can + Chen Haiquan + Chip Parker + Chris Burroughs + Chris Dunlap + Chris Dunlop + Chris Siden + Chris Wedgwood + Chris Williamson + Chris Zubrzycki + Christ Schlacta + Christer Ekholm + Christian Kohlschütter + Christian Neukirchen + Christian Schwarz + Christopher Voltz + Chunwei Chen + Clemens Fruhwirth + Coleman Kane + Colin Ian King + Craig Loomis + Craig Sanders + Cyril Plisko + DHE + Damian Wojsław + Dan Kimmel + Dan McDonald + Dan Swartzendruber + Dan Vatca + Daniel Hoffman + Daniel Verite + Daniil Lunev + Darik Horn + Dave Eddy + David Lamparter + David Qian + David Quigley + Debabrata Banerjee + Denys Rtveliashvili + Derek Dai + Dimitri John Ledkov + Dmitry Khasanov + Dominik Hassler + Dominik Honnef + Don Brady + Dr. András Korn + Eli Rosenthal + Eric Desrochers + Eric Dillmann + Eric Schrock + Etienne Dechamps + Evan Susarret + Fabian Grünbichler + Fajar A. Nugraha + Fan Yong + Feng Sun + Frederik Wessels + Frédéric Vanniere + Garrett D'Amore + Garrison Jensen + Gary Mills + Gaurav Kumar + GeLiXin + George Amanakis + George Melikov + George Wilson + Georgy Yakovlev + Giuseppe Di Natale + Gordan Bobic + Gordon Ross + Gregor Kopka + Grischa Zengel + Gunnar Beutner + Gvozden Neskovic + Hajo Möller + Hans Rosenfeld + Håkan Johansson + Igor Kozhukhov + Igor Lvovsky + Isaac Huang + JK Dingwall + Jacek Fefliński + James Cowgill + James Lee + James Pan + Jan Engelhardt + Jan Kryl + Jan Sanislo + Jason King + Jason Zaman + Javen Wu + Jeremy Gill + Jeremy Jones + Jerry Jelinek + Jinshan Xiong + Joe Stein + John Albietz + John Eismeier + John L. Hammond + John Layman + John Paul Adrian Glaubitz + John Wren Kennedy + Johnny Stenback + Jorgen Lundman + Josef 'Jeff' Sipek + Joshua M. Clulow + Justin Bedő + Justin Lecher + Justin T. Gibbs + Jörg Thalheim + KORN Andras + Kamil Domański + Karsten Kretschmer + Kash Pande + Keith M Wesolowski + Kevin Tanguy + KireinaHoro + Kjeld Schouten-Lebbing + Kohsuke Kawaguchi + Kyle Blatter + Kyle Fuller + Loli + Lars Johannsen + Li Dongyang + Li Wei + Lukas Wunner + Madhav Suresh + Manoj Joseph + Manuel Amador (Rudd-O) + Marcel Huber + Marcel Telka + Marcel Wysocki + Mark Shellenbaum + Mark Wright + Martin Matuska + Massimo Maggi + Matt Johnston + Matt Kemp + Matthew Ahrens + Matthew Thode + Matus Kral + Max Grossman + Maximilian Mehnert + Michael Gebetsroither + Michael Kjorling + Michael Martin + Michael Niewöhner + Mike Gerdts + Mike Harsch + Mike Leddy + Mike Swanson + Milan Jurik + Morgan Jones + Moritz Maxeiner + Nathaniel Clark + Nathaniel Wesley Filardo + Nav Ravindranath + Neal Gompa (ニール・ゴンパ) + Ned Bass + Neependra Khare + Neil Stockbridge + Nick Garvey + Nikolay Borisov + Olaf Faaland + Oleg Drokin + Oleg Stepura + Patrik Greco + Paul B. Henson + Paul Dagnelie + Paul Zuchowski + Pavel Boldin + Pavel Zakharov + Pawel Jakub Dawidek + Pedro Giffuni + Peng + Peter Ashford + Prakash Surya + Prasad Joshi + Ralf Ertzinger + Randall Mason + Remy Blank + Ricardo M. Correia + Rich Ercolani + Richard Elling + Richard Laager + Richard Lowe + Richard Sharpe + Richard Yao + Rohan Puri + Romain Dolbeau + Roman Strashkin + Ruben Kerkhof + Saso Kiselkov + Scot W.
Stevenson + Sean Eric Fagan + Sebastian Gottschall + Sen Haerens + Serapheim Dimitropoulos + Seth Forshee + Shampavman + Shen Yan + Simon Guest + Simon Klinkert + Sowrabha Gopal + Stanislav Seletskiy + Steffen Müthing + Stephen Blinick + Steve Dougherty + Steven Burgess + Steven Hartland + Steven Johnson + Stian Ellingsen + Suman Chakravartula + Sydney Vanda + Sören Tempel + Thijs Cramer + Tim Chase + Tim Connors + Tim Crawford + Tim Haley + Tobin Harding + Tom Caputi + Tom Matthews + Tom Prince + Tomohiro Kusumi + Tony Hutter + Toomas Soome + Trey Dockendorf + Turbo Fredriksson + Tyler J. Stachecki + Vitaut Bajaryn + Weigang Li + Will Andrews + Will Rouesnel + Wolfgang Bumiller + Xin Li + Ying Zhu + YunQiang Su + Yuri Pankov + Yuxuan Shui + Zachary Bedell diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 000000000000..d314a66b4e2d --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,2 @@ +The [OpenZFS Code of Conduct](http://www.open-zfs.org/wiki/Code_of_Conduct) +applies to spaces associated with the ZFS on Linux project, including GitHub. diff --git a/COPYRIGHT b/COPYRIGHT new file mode 100644 index 000000000000..85556b542f14 --- /dev/null +++ b/COPYRIGHT @@ -0,0 +1,31 @@ +Refer to the git commit log for authoritative copyright attribution. + +The original ZFS source code was obtained from OpenSolaris which was +released under the terms of the CDDL open source license. Additional +changes have been included from OpenZFS and the Illumos project which +are similarly licensed. These projects can be found on GitHub at: + + * https://github.com/illumos/illumos-gate + * https://github.com/openzfs/openzfs + +Unless otherwise noted, all files in this distribution are released +under the Common Development and Distribution License (CDDL). + +Exceptions are noted within the associated source file headers and +by including a THIRDPARTYLICENSE file with the license terms. A few +notable exceptions and their respective licenses include: + + * Skein Checksum Implementation: module/icp/algs/skein/THIRDPARTYLICENSE + * AES Implementation: module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman + * AES Implementation: module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl + * PBKDF2 Implementation: lib/libzfs/THIRDPARTYLICENSE.openssl + * SPL Implementation: module/os/linux/spl/THIRDPARTYLICENSE.gplv2 + * GCM Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams + * GCM Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl + * GHASH Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams + * GHASH Implementation: module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl + +This product includes software developed by the OpenSSL Project for use +in the OpenSSL Toolkit (http://www.openssl.org/). + +See the LICENSE and NOTICE for more information. diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000000..da23621dc843 --- /dev/null +++ b/LICENSE @@ -0,0 +1,384 @@ +Unless otherwise noted, all files in this distribution are released +under the Common Development and Distribution License (CDDL). +Exceptions are noted within the associated source files. + +-------------------------------------------------------------------- + + +COMMON DEVELOPMENT AND DISTRIBUTION LICENSE Version 1.0 + +1. Definitions. + + 1.1. "Contributor" means each individual or entity that creates + or contributes to the creation of Modifications. + + 1.2.
"Contributor Version" means the combination of the Original + Software, prior Modifications used by a Contributor (if any), + and the Modifications made by that particular Contributor. + + 1.3. "Covered Software" means (a) the Original Software, or (b) + Modifications, or (c) the combination of files containing + Original Software with files containing Modifications, in + each case including portions thereof. + + 1.4. "Executable" means the Covered Software in any form other + than Source Code. + + 1.5. "Initial Developer" means the individual or entity that first + makes Original Software available under this License. + + 1.6. "Larger Work" means a work which combines Covered Software or + portions thereof with code not governed by the terms of this + License. + + 1.7. "License" means this document. + + 1.8. "Licensable" means having the right to grant, to the maximum + extent possible, whether at the time of the initial grant or + subsequently acquired, any and all of the rights conveyed + herein. + + 1.9. "Modifications" means the Source Code and Executable form of + any of the following: + + A. Any file that results from an addition to, deletion from or + modification of the contents of a file containing Original + Software or previous Modifications; + + B. Any new file that contains any part of the Original + Software or previous Modifications; or + + C. Any new file that is contributed or otherwise made + available under the terms of this License. + + 1.10. "Original Software" means the Source Code and Executable + form of computer software code that is originally released + under this License. + + 1.11. "Patent Claims" means any patent claim(s), now owned or + hereafter acquired, including without limitation, method, + process, and apparatus claims, in any patent Licensable by + grantor. + + 1.12. "Source Code" means (a) the common form of computer software + code in which modifications are made and (b) associated + documentation included in or with such code. + + 1.13. "You" (or "Your") means an individual or a legal entity + exercising rights under, and complying with all of the terms + of, this License. For legal entities, "You" includes any + entity which controls, is controlled by, or is under common + control with You. For purposes of this definition, + "control" means (a) the power, direct or indirect, to cause + the direction or management of such entity, whether by + contract or otherwise, or (b) ownership of more than fifty + percent (50%) of the outstanding shares or beneficial + ownership of such entity. + +2. License Grants. + + 2.1. The Initial Developer Grant. + + Conditioned upon Your compliance with Section 3.1 below and + subject to third party intellectual property claims, the Initial + Developer hereby grants You a world-wide, royalty-free, + non-exclusive license: + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Initial Developer, to use, + reproduce, modify, display, perform, sublicense and + distribute the Original Software (or portions thereof), + with or without Modifications, and/or as part of a Larger + Work; and + + (b) under Patent Claims infringed by the making, using or + selling of Original Software, to make, have made, use, + practice, sell, and offer for sale, and/or otherwise + dispose of the Original Software (or portions thereof). 
+ + (c) The licenses granted in Sections 2.1(a) and (b) are + effective on the date Initial Developer first distributes + or otherwise makes the Original Software available to a + third party under the terms of this License. + + (d) Notwithstanding Section 2.1(b) above, no patent license is + granted: (1) for code that You delete from the Original + Software, or (2) for infringements caused by: (i) the + modification of the Original Software, or (ii) the + combination of the Original Software with other software + or devices. + + 2.2. Contributor Grant. + + Conditioned upon Your compliance with Section 3.1 below and + subject to third party intellectual property claims, each + Contributor hereby grants You a world-wide, royalty-free, + non-exclusive license: + + (a) under intellectual property rights (other than patent or + trademark) Licensable by Contributor to use, reproduce, + modify, display, perform, sublicense and distribute the + Modifications created by such Contributor (or portions + thereof), either on an unmodified basis, with other + Modifications, as Covered Software and/or as part of a + Larger Work; and + + (b) under Patent Claims infringed by the making, using, or + selling of Modifications made by that Contributor either + alone and/or in combination with its Contributor Version + (or portions of such combination), to make, use, sell, + offer for sale, have made, and/or otherwise dispose of: + (1) Modifications made by that Contributor (or portions + thereof); and (2) the combination of Modifications made by + that Contributor with its Contributor Version (or portions + of such combination). + + (c) The licenses granted in Sections 2.2(a) and 2.2(b) are + effective on the date Contributor first distributes or + otherwise makes the Modifications available to a third + party. + + (d) Notwithstanding Section 2.2(b) above, no patent license is + granted: (1) for any code that Contributor has deleted + from the Contributor Version; (2) for infringements caused + by: (i) third party modifications of Contributor Version, + or (ii) the combination of Modifications made by that + Contributor with other software (except as part of the + Contributor Version) or other devices; or (3) under Patent + Claims infringed by Covered Software in the absence of + Modifications made by that Contributor. + +3. Distribution Obligations. + + 3.1. Availability of Source Code. + + Any Covered Software that You distribute or otherwise make + available in Executable form must also be made available in Source + Code form and that Source Code form must be distributed only under + the terms of this License. You must include a copy of this + License with every copy of the Source Code form of the Covered + Software You distribute or otherwise make available. You must + inform recipients of any such Covered Software in Executable form + as to how they can obtain such Covered Software in Source Code + form in a reasonable manner on or through a medium customarily + used for software exchange. + + 3.2. Modifications. + + The Modifications that You create or to which You contribute are + governed by the terms of this License. You represent that You + believe Your Modifications are Your original creation(s) and/or + You have sufficient rights to grant the rights conveyed by this + License. + + 3.3. Required Notices. + + You must include a notice in each of Your Modifications that + identifies You as the Contributor of the Modification. 
You may + not remove or alter any copyright, patent or trademark notices + contained within the Covered Software, or any notices of licensing + or any descriptive text giving attribution to any Contributor or + the Initial Developer. + + 3.4. Application of Additional Terms. + + You may not offer or impose any terms on any Covered Software in + Source Code form that alters or restricts the applicable version + of this License or the recipients' rights hereunder. You may + choose to offer, and to charge a fee for, warranty, support, + indemnity or liability obligations to one or more recipients of + Covered Software. However, you may do so only on Your own behalf, + and not on behalf of the Initial Developer or any Contributor. + You must make it absolutely clear that any such warranty, support, + indemnity or liability obligation is offered by You alone, and You + hereby agree to indemnify the Initial Developer and every + Contributor for any liability incurred by the Initial Developer or + such Contributor as a result of warranty, support, indemnity or + liability terms You offer. + + 3.5. Distribution of Executable Versions. + + You may distribute the Executable form of the Covered Software + under the terms of this License or under the terms of a license of + Your choice, which may contain terms different from this License, + provided that You are in compliance with the terms of this License + and that the license for the Executable form does not attempt to + limit or alter the recipient's rights in the Source Code form from + the rights set forth in this License. If You distribute the + Covered Software in Executable form under a different license, You + must make it absolutely clear that any terms which differ from + this License are offered by You alone, not by the Initial + Developer or Contributor. You hereby agree to indemnify the + Initial Developer and every Contributor for any liability incurred + by the Initial Developer or such Contributor as a result of any + such terms You offer. + + 3.6. Larger Works. + + You may create a Larger Work by combining Covered Software with + other code not governed by the terms of this License and + distribute the Larger Work as a single product. In such a case, + You must make sure the requirements of this License are fulfilled + for the Covered Software. + +4. Versions of the License. + + 4.1. New Versions. + + Sun Microsystems, Inc. is the initial license steward and may + publish revised and/or new versions of this License from time to + time. Each version will be given a distinguishing version number. + Except as provided in Section 4.3, no one other than the license + steward has the right to modify this License. + + 4.2. Effect of New Versions. + + You may always continue to use, distribute or otherwise make the + Covered Software available under the terms of the version of the + License under which You originally received the Covered Software. + If the Initial Developer includes a notice in the Original + Software prohibiting it from being distributed or otherwise made + available under any subsequent version of the License, You must + distribute and make the Covered Software available under the terms + of the version of the License under which You originally received + the Covered Software. Otherwise, You may also choose to use, + distribute or otherwise make the Covered Software available under + the terms of any subsequent version of the License published by + the license steward. + + 4.3. Modified Versions. 
+ + When You are an Initial Developer and You want to create a new + license for Your Original Software, You may create and use a + modified version of this License if You: (a) rename the license + and remove any references to the name of the license steward + (except to note that the license differs from this License); and + (b) otherwise make it clear that the license contains terms which + differ from this License. + +5. DISCLAIMER OF WARRANTY. + + COVERED SOFTWARE IS PROVIDED UNDER THIS LICENSE ON AN "AS IS" + BASIS, WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED OR IMPLIED, + INCLUDING, WITHOUT LIMITATION, WARRANTIES THAT THE COVERED + SOFTWARE IS FREE OF DEFECTS, MERCHANTABLE, FIT FOR A PARTICULAR + PURPOSE OR NON-INFRINGING. THE ENTIRE RISK AS TO THE QUALITY AND + PERFORMANCE OF THE COVERED SOFTWARE IS WITH YOU. SHOULD ANY + COVERED SOFTWARE PROVE DEFECTIVE IN ANY RESPECT, YOU (NOT THE + INITIAL DEVELOPER OR ANY OTHER CONTRIBUTOR) ASSUME THE COST OF ANY + NECESSARY SERVICING, REPAIR OR CORRECTION. THIS DISCLAIMER OF + WARRANTY CONSTITUTES AN ESSENTIAL PART OF THIS LICENSE. NO USE OF + ANY COVERED SOFTWARE IS AUTHORIZED HEREUNDER EXCEPT UNDER THIS + DISCLAIMER. + +6. TERMINATION. + + 6.1. This License and the rights granted hereunder will terminate + automatically if You fail to comply with terms herein and fail to + cure such breach within 30 days of becoming aware of the breach. + Provisions which, by their nature, must remain in effect beyond + the termination of this License shall survive. + + 6.2. If You assert a patent infringement claim (excluding + declaratory judgment actions) against Initial Developer or a + Contributor (the Initial Developer or Contributor against whom You + assert such claim is referred to as "Participant") alleging that + the Participant Software (meaning the Contributor Version where + the Participant is a Contributor or the Original Software where + the Participant is the Initial Developer) directly or indirectly + infringes any patent, then any and all rights granted directly or + indirectly to You by such Participant, the Initial Developer (if + the Initial Developer is not the Participant) and all Contributors + under Sections 2.1 and/or 2.2 of this License shall, upon 60 days + notice from Participant terminate prospectively and automatically + at the expiration of such 60 day notice period, unless if within + such 60 day period You withdraw Your claim with respect to the + Participant Software against such Participant either unilaterally + or pursuant to a written agreement with Participant. + + 6.3. In the event of termination under Sections 6.1 or 6.2 above, + all end user licenses that have been validly granted by You or any + distributor hereunder prior to termination (excluding licenses + granted to You by any distributor) shall survive termination. + +7. LIMITATION OF LIABILITY. + + UNDER NO CIRCUMSTANCES AND UNDER NO LEGAL THEORY, WHETHER TORT + (INCLUDING NEGLIGENCE), CONTRACT, OR OTHERWISE, SHALL YOU, THE + INITIAL DEVELOPER, ANY OTHER CONTRIBUTOR, OR ANY DISTRIBUTOR OF + COVERED SOFTWARE, OR ANY SUPPLIER OF ANY OF SUCH PARTIES, BE + LIABLE TO ANY PERSON FOR ANY INDIRECT, SPECIAL, INCIDENTAL, OR + CONSEQUENTIAL DAMAGES OF ANY CHARACTER INCLUDING, WITHOUT + LIMITATION, DAMAGES FOR LOST PROFITS, LOSS OF GOODWILL, WORK + STOPPAGE, COMPUTER FAILURE OR MALFUNCTION, OR ANY AND ALL OTHER + COMMERCIAL DAMAGES OR LOSSES, EVEN IF SUCH PARTY SHALL HAVE BEEN + INFORMED OF THE POSSIBILITY OF SUCH DAMAGES. 
THIS LIMITATION OF + LIABILITY SHALL NOT APPLY TO LIABILITY FOR DEATH OR PERSONAL + INJURY RESULTING FROM SUCH PARTY'S NEGLIGENCE TO THE EXTENT + APPLICABLE LAW PROHIBITS SUCH LIMITATION. SOME JURISDICTIONS DO + NOT ALLOW THE EXCLUSION OR LIMITATION OF INCIDENTAL OR + CONSEQUENTIAL DAMAGES, SO THIS EXCLUSION AND LIMITATION MAY NOT + APPLY TO YOU. + +8. U.S. GOVERNMENT END USERS. + + The Covered Software is a "commercial item," as that term is + defined in 48 C.F.R. 2.101 (Oct. 1995), consisting of "commercial + computer software" (as that term is defined at 48 + C.F.R. 252.227-7014(a)(1)) and "commercial computer software + documentation" as such terms are used in 48 C.F.R. 12.212 + (Sept. 1995). Consistent with 48 C.F.R. 12.212 and 48 + C.F.R. 227.7202-1 through 227.7202-4 (June 1995), all + U.S. Government End Users acquire Covered Software with only those + rights set forth herein. This U.S. Government Rights clause is in + lieu of, and supersedes, any other FAR, DFAR, or other clause or + provision that addresses Government rights in computer software + under this License. + +9. MISCELLANEOUS. + + This License represents the complete agreement concerning subject + matter hereof. If any provision of this License is held to be + unenforceable, such provision shall be reformed only to the extent + necessary to make it enforceable. This License shall be governed + by the law of the jurisdiction specified in a notice contained + within the Original Software (except to the extent applicable law, + if any, provides otherwise), excluding such jurisdiction's + conflict-of-law provisions. Any litigation relating to this + License shall be subject to the jurisdiction of the courts located + in the jurisdiction and venue specified in a notice contained + within the Original Software, with the losing party responsible + for costs, including, without limitation, court costs and + reasonable attorneys' fees and expenses. The application of the + United Nations Convention on Contracts for the International Sale + of Goods is expressly excluded. Any law or regulation which + provides that the language of a contract shall be construed + against the drafter shall not apply to this License. You agree + that You alone are responsible for compliance with the United + States export administration regulations (and the export control + laws and regulation of any other countries) when You use, + distribute or otherwise make available any Covered Software. + +10. RESPONSIBILITY FOR CLAIMS. + + As between Initial Developer and the Contributors, each party is + responsible for claims and damages arising, directly or + indirectly, out of its utilization of rights under this License + and You agree to work with Initial Developer and Contributors to + distribute such responsibility on an equitable basis. Nothing + herein is intended or shall be deemed to constitute any admission + of liability. + +-------------------------------------------------------------------- + +NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND +DISTRIBUTION LICENSE (CDDL) + +For Covered Software in this distribution, this License shall +be governed by the laws of the State of California (excluding +conflict-of-law provisions). + +Any litigation relating to this License shall be subject to the +jurisdiction of the Federal Courts of the Northern District of +California and the state courts of the State of California, with +venue lying in Santa Clara County, California. 
diff --git a/META b/META new file mode 100644 index 000000000000..49d8bd73d516 --- /dev/null +++ b/META @@ -0,0 +1,10 @@ +Meta: 1 +Name: zfs +Branch: 1.0 +Version: 0.8.0 +Release: 1 +Release-Tags: relext +License: CDDL +Author: OpenZFS on Linux +Linux-Maximum: 5.6 +Linux-Minimum: 3.10 diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 000000000000..b409d2196f86 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,256 @@ +ACLOCAL_AMFLAGS = -I config + +SUBDIRS = include +if BUILD_LINUX +SUBDIRS += rpm +endif + +if CONFIG_USER +SUBDIRS += etc man scripts lib tests cmd contrib +if BUILD_LINUX +SUBDIRS += udev +endif +endif +if CONFIG_KERNEL +SUBDIRS += module + +extradir = $(prefix)/src/zfs-$(VERSION) +extra_HEADERS = zfs.release.in zfs_config.h.in + +if BUILD_LINUX +kerneldir = $(prefix)/src/zfs-$(VERSION)/$(LINUX_VERSION) +nodist_kernel_HEADERS = zfs.release zfs_config.h module/$(LINUX_SYMBOLS) +endif +endif + +AUTOMAKE_OPTIONS = foreign +EXTRA_DIST = autogen.sh copy-builtin +EXTRA_DIST += cppcheck-suppressions.txt +EXTRA_DIST += config/config.awk config/rpm.am config/deb.am config/tgz.am +EXTRA_DIST += META AUTHORS COPYRIGHT LICENSE NEWS NOTICE README.md +EXTRA_DIST += CODE_OF_CONDUCT.md +EXTRA_DIST += module/lua/README.zfs module/os/linux/spl/README.md + +# Include all the extra licensing information for modules +EXTRA_DIST += module/icp/algs/skein/THIRDPARTYLICENSE +EXTRA_DIST += module/icp/algs/skein/THIRDPARTYLICENSE.descrip +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl +EXTRA_DIST += module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl +EXTRA_DIST += module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip +EXTRA_DIST += module/os/linux/spl/THIRDPARTYLICENSE.gplv2 +EXTRA_DIST += module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip +EXTRA_DIST += module/zfs/THIRDPARTYLICENSE.cityhash +EXTRA_DIST += module/zfs/THIRDPARTYLICENSE.cityhash.descrip + +@CODE_COVERAGE_RULES@ + +GITREV = include/zfs_gitrev.h + +PHONY = gitrev +gitrev: + $(AM_V_GEN)$(top_srcdir)/scripts/make_gitrev.sh $(GITREV) + +all: gitrev + +# Double-colon rules are allowed; there are multiple independent definitions. +maintainer-clean-local:: + -$(RM) $(GITREV) + +distclean-local:: + -$(RM) -R autom4te*.cache build + -find . \( -name SCCS -o -name BitKeeper -o -name .svn -o -name CVS \ + -o -name .pc -o -name .hg -o -name .git \) -prune -o \ + \( -name '*.orig' -o -name '*.rej' -o -name '*~' \ + -o -name '*.bak' -o -name '#*#' -o -name '.*.orig' \ + -o -name '.*.rej' -o -size 0 -o -name '*%' -o -name '.*.cmd' \ + -o -name 'core' -o -name 'Makefile' -o -name 'Module.symvers' \ + -o -name '*.order' -o -name '*.markers' -o -name '*.gcda' \ + -o -name '*.gcno' \) \ + -type f -print | xargs $(RM) + +all-local: + -[ -x ${top_builddir}/scripts/zfs-tests.sh ] && \ + ${top_builddir}/scripts/zfs-tests.sh -c + +dist-hook: + $(AM_V_GEN)$(top_srcdir)/scripts/make_gitrev.sh -D $(distdir) $(GITREV) + $(SED) ${ac_inplace} -e 's/Release:[[:print:]]*/Release: $(RELEASE)/' \ + $(distdir)/META + +if BUILD_LINUX +# For compatibility, create a matching spl-x.y.z directory which contains +# symlinks to the updated header and object file locations.
These +# compatibility links will be removed in the next major release. +if CONFIG_KERNEL +install-data-hook: + rm -rf $(DESTDIR)$(prefix)/src/spl-$(VERSION) && \ + mkdir $(DESTDIR)$(prefix)/src/spl-$(VERSION) && \ + cd $(DESTDIR)$(prefix)/src/spl-$(VERSION) && \ + ln -s ../zfs-$(VERSION)/include/spl include && \ + ln -s ../zfs-$(VERSION)/$(LINUX_VERSION) $(LINUX_VERSION) && \ + ln -s ../zfs-$(VERSION)/zfs_config.h.in spl_config.h.in && \ + ln -s ../zfs-$(VERSION)/zfs.release.in spl.release.in && \ + cd $(DESTDIR)$(prefix)/src/zfs-$(VERSION)/$(LINUX_VERSION) && \ + ln -fs zfs_config.h spl_config.h && \ + ln -fs zfs.release spl.release +endif +endif + +PHONY += codecheck +codecheck: cstyle shellcheck checkbashisms flake8 mancheck testscheck vcscheck + +PHONY += checkstyle +checkstyle: codecheck commitcheck + +PHONY += commitcheck +commitcheck: + @if git rev-parse --git-dir > /dev/null 2>&1; then \ + ${top_srcdir}/scripts/commitcheck.sh; \ + fi + +PHONY += cstyle +cstyle: + @find ${top_srcdir} -name build -prune \ + -o -type f -name '*.[hc]' \ + ! -name 'zfs_config.*' ! -name '*.mod.c' \ + ! -name 'opt_global.h' ! -name '*_if*.h' \ + ! -path './module/zstd/lib/*' \ + -exec ${top_srcdir}/scripts/cstyle.pl -cpP {} \+ + +filter_executable = -exec test -x '{}' \; -print + +PHONY += shellcheck +shellcheck: + @if type shellcheck > /dev/null 2>&1; then \ + shellcheck --exclude=SC1090 --exclude=SC1117 --format=gcc \ + $$(find ${top_srcdir}/scripts/*.sh -type f) \ + $$(find ${top_srcdir}/cmd/zed/zed.d/*.sh -type f) \ + $$(find ${top_srcdir}/cmd/zpool/zpool.d/* \ + -type f ${filter_executable}); \ + else \ + echo "skipping shellcheck because shellcheck is not installed"; \ + fi + +PHONY += checkbashisms +checkbashisms: + @if type checkbashisms > /dev/null 2>&1; then \ + checkbashisms -n -p -x \ + $$(find ${top_srcdir} \ + -name '.git' -prune \ + -o -name 'build' -prune \ + -o -name 'tests' -prune \ + -o -name 'config' -prune \ + -o -name 'zed-functions.sh*' -prune \ + -o -name 'zfs-import*' -prune \ + -o -name 'zfs-mount*' -prune \ + -o -name 'zfs-zed*' -prune \ + -o -name 'smart' -prune \ + -o -name 'paxcheck.sh' -prune \ + -o -name 'make_gitrev.sh' -prune \ + -o -type f ! -name 'config*' \ + ! -name 'libtool' \ + -exec bash -c 'awk "NR==1 && /\#\!.*bin\/sh.*/ {print FILENAME;}" "{}"' \;); \ + else \ + echo "skipping checkbashisms because checkbashisms is not installed"; \ + fi + +PHONY += mancheck +mancheck: + @if type mandoc > /dev/null 2>&1; then \ + find ${top_srcdir}/man/man8 -type f -name 'zfs.8' \ + -o -name 'zpool.8' -o -name 'zdb.8' \ + -o -name 'zgenhostid.8' | \ + xargs mandoc -Tlint -Werror; \ + else \ + echo "skipping mancheck because mandoc is not installed"; \ + fi + +if BUILD_LINUX +stat_fmt = -c '%A %n' +else +stat_fmt = -f '%Sp %N' +endif + +PHONY += testscheck +testscheck: + @find ${top_srcdir}/tests/zfs-tests -type f \ + \( -name '*.ksh' -not ${filter_executable} \) -o \ + \( -name '*.kshlib' ${filter_executable} \) -o \ + \( -name '*.shlib' ${filter_executable} \) -o \ + \( -name '*.cfg' ${filter_executable} \) | \ + xargs -r stat ${stat_fmt} | \ + awk '{c++; print} END {if(c>0) exit 1}' + +PHONY += vcscheck +vcscheck: + @if git rev-parse --git-dir > /dev/null 2>&1; then \ + git ls-files . 
--exclude-standard --others | \ + awk '{c++; print} END {if(c>0) exit 1}' ; \ + fi + +PHONY += lint +lint: cppcheck paxcheck + +PHONY += cppcheck +cppcheck: + @if type cppcheck > /dev/null 2>&1; then \ + cppcheck --quiet --force --error-exitcode=2 --inline-suppr \ + --suppressions-list=${top_srcdir}/cppcheck-suppressions.txt \ + -UHAVE_SSE2 -UHAVE_AVX512F -UHAVE_UIO_ZEROCOPY \ + ${top_srcdir}; \ + else \ + echo "skipping cppcheck because cppcheck is not installed"; \ + fi + +PHONY += paxcheck +paxcheck: + @if type scanelf > /dev/null 2>&1; then \ + ${top_srcdir}/scripts/paxcheck.sh ${top_builddir}; \ + else \ + echo "skipping paxcheck because scanelf is not installed"; \ + fi + +PHONY += flake8 +flake8: + @if type flake8 > /dev/null 2>&1; then \ + flake8 ${top_srcdir}; \ + else \ + echo "skipping flake8 because flake8 is not installed"; \ + fi + +PHONY += ctags +ctags: + $(RM) tags + find $(top_srcdir) -name '.?*' -prune \ + -o -type f -name '*.[hcS]' -print | xargs ctags -a + +PHONY += etags +etags: + $(RM) TAGS + find $(top_srcdir) -name '.?*' -prune \ + -o -type f -name '*.[hcS]' -print | xargs etags -a + +PHONY += cscopelist +cscopelist: + find $(top_srcdir) -name '.?*' -prune \ + -o -type f -name '*.[hc]' -print >cscope.files + +PHONY += tags +tags: ctags etags + +PHONY += pkg pkg-dkms pkg-kmod pkg-utils +pkg: @DEFAULT_PACKAGE@ +pkg-dkms: @DEFAULT_PACKAGE@-dkms +pkg-kmod: @DEFAULT_PACKAGE@-kmod +pkg-utils: @DEFAULT_PACKAGE@-utils + +include config/rpm.am +include config/deb.am +include config/tgz.am + +.PHONY: $(PHONY) diff --git a/NEWS b/NEWS new file mode 100644 index 000000000000..bbdc2b69bbfe --- /dev/null +++ b/NEWS @@ -0,0 +1,3 @@ +Descriptions of all releases can be found on github: + +https://github.com/zfsonlinux/zfs/releases diff --git a/NOTICE b/NOTICE new file mode 100644 index 000000000000..321892039039 --- /dev/null +++ b/NOTICE @@ -0,0 +1,16 @@ +This work was produced under the auspices of the U.S. Department of Energy by +Lawrence Livermore National Laboratory under Contract DE-AC52-07NA27344. + +This work was prepared as an account of work sponsored by an agency of the +United States Government. Neither the United States Government nor Lawrence +Livermore National Security, LLC, nor any of their employees makes any warranty, +expressed or implied, or assumes any legal liability or responsibility for the +accuracy, completeness, or usefulness of any information, apparatus, product, or +process disclosed, or represents that its use would not infringe privately owned +rights. Reference herein to any specific commercial product, process, or service +by trade name, trademark, manufacturer, or otherwise does not necessarily +constitute or imply its endorsement, recommendation, or favoring by the United +States Government or Lawrence Livermore National Security, LLC. The views and +opinions of authors expressed herein do not necessarily state or reflect those +of the United States Government or Lawrence Livermore National Security, LLC, +and shall not be used for advertising or product endorsement purposes. diff --git a/README.md b/README.md new file mode 100644 index 000000000000..d215cd5d8ca5 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +![img](https://openzfs.github.io/openzfs-docs/_static/img/logo/480px-Open-ZFS-Secondary-Logo-Colour-halfsize.png) + +OpenZFS is an advanced file system and volume manager which was originally +developed for Solaris and is now maintained by the OpenZFS community. +This repository contains the code for running OpenZFS on Linux and FreeBSD. 
+ +[![codecov](https://codecov.io/gh/openzfs/zfs/branch/master/graph/badge.svg)](https://codecov.io/gh/openzfs/zfs) +[![coverity](https://scan.coverity.com/projects/1973/badge.svg)](https://scan.coverity.com/projects/openzfs-zfs) + +# Official Resources + + * [Documentation](https://openzfs.github.io/openzfs-docs/) - for using and developing this repo + * [ZoL Site](https://zfsonlinux.org) - Linux release info & links + * [Mailing lists](https://openzfs.github.io/openzfs-docs/Project%20and%20Community/Mailing%20Lists.html) + * [OpenZFS site](http://open-zfs.org/) - for conference videos and info on other platforms (illumos, OSX, Windows, etc) + +# Installation + +Full documentation for installing OpenZFS on your favorite Linux distribution can +be found at the [ZoL Site](https://zfsonlinux.org/). + +# Contribute & Develop + +We have a separate document with [contribution guidelines](./.github/CONTRIBUTING.md). + +We have a [Code of Conduct](./CODE_OF_CONDUCT.md). + +# Release + +OpenZFS is released under a CDDL license. +For more details see the NOTICE, LICENSE and COPYRIGHT files; `UCRL-CODE-235197` + +# Supported Kernels + * The `META` file contains the officially recognized supported Linux kernel versions. + * Supported FreeBSD versions are 12-STABLE and 13-CURRENT. diff --git a/TEST b/TEST new file mode 100644 index 000000000000..376d6eb691e2 --- /dev/null +++ b/TEST @@ -0,0 +1,50 @@ +#!/bin/sh + +### prepare +#TEST_PREPARE_WATCHDOG="yes" +#TEST_PREPARE_SHARES="yes" + +### ztest +#TEST_ZTEST_SKIP="yes" +#TEST_ZTEST_TIMEOUT=1800 +#TEST_ZTEST_DIR="/var/tmp/" +#TEST_ZTEST_OPTIONS="-V" +#TEST_ZTEST_CORE_DIR="/mnt/zloop" + +### zimport +#TEST_ZIMPORT_SKIP="yes" +#TEST_ZIMPORT_DIR="/var/tmp/zimport" +#TEST_ZIMPORT_VERSIONS="master installed" +#TEST_ZIMPORT_POOLS="zol-0.6.1 zol-0.6.2 master installed" +#TEST_ZIMPORT_OPTIONS="-c" + +### xfstests +#TEST_XFSTESTS_SKIP="yes" +#TEST_XFSTESTS_URL="https://github.com/behlendorf/xfstests/archive/" +#TEST_XFSTESTS_VER="zfs.tar.gz" +#TEST_XFSTESTS_POOL="tank" +#TEST_XFSTESTS_FS="xfstests" +#TEST_XFSTESTS_VDEV="/var/tmp/vdev" +#TEST_XFSTESTS_OPTIONS="" + +### zfs-tests.sh +#TEST_ZFSTESTS_SKIP="yes" +#TEST_ZFSTESTS_DIR="/mnt/" +#TEST_ZFSTESTS_DISKS="vdb vdc vdd" +#TEST_ZFSTESTS_DISKSIZE="8G" +#TEST_ZFSTESTS_ITERS="1" +#TEST_ZFSTESTS_OPTIONS="-vx" +#TEST_ZFSTESTS_RUNFILE="linux.run" +#TEST_ZFSTESTS_TAGS="functional" + +### zfsstress +#TEST_ZFSSTRESS_SKIP="yes" +#TEST_ZFSSTRESS_URL="https://github.com/nedbass/zfsstress/archive/" +#TEST_ZFSSTRESS_VER="master.tar.gz" +#TEST_ZFSSTRESS_RUNTIME=300 +#TEST_ZFSSTRESS_POOL="tank" +#TEST_ZFSSTRESS_FS="fish" +#TEST_ZFSSTRESS_FSOPT="-o overlay=on" +#TEST_ZFSSTRESS_VDEV="/var/tmp/vdev" +#TEST_ZFSSTRESS_DIR="/$TEST_ZFSSTRESS_POOL/$TEST_ZFSSTRESS_FS" +#TEST_ZFSSTRESS_OPTIONS="" diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 000000000000..488e913b2bf4 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +autoreconf -fiv || exit 1 +rm -Rf autom4te.cache diff --git a/cmd/Makefile.am b/cmd/Makefile.am new file mode 100644 index 000000000000..88d32b1c538c --- /dev/null +++ b/cmd/Makefile.am @@ -0,0 +1,10 @@ +SUBDIRS = zfs zpool zdb zhack zinject zstream zstreamdump ztest +SUBDIRS += fsck_zfs vdev_id raidz_test zfs_ids_to_path + +if USING_PYTHON +SUBDIRS += arcstat arc_summary dbufstat +endif + +if BUILD_LINUX +SUBDIRS += mount_zfs zed zgenhostid zvol_id zvol_wait +endif diff --git a/cmd/arc_summary/.gitignore b/cmd/arc_summary/.gitignore new file mode 100644 index 000000000000..50ba15f034e2 --- 
/dev/null +++ b/cmd/arc_summary/.gitignore @@ -0,0 +1 @@ +arc_summary diff --git a/cmd/arc_summary/Makefile.am b/cmd/arc_summary/Makefile.am new file mode 100644 index 000000000000..1a26c2c199f8 --- /dev/null +++ b/cmd/arc_summary/Makefile.am @@ -0,0 +1,13 @@ +bin_SCRIPTS = arc_summary + +CLEANFILES = arc_summary +EXTRA_DIST = arc_summary2 arc_summary3 + +if USING_PYTHON_2 +SCRIPT = arc_summary2 +else +SCRIPT = arc_summary3 +endif + +arc_summary: $(SCRIPT) + cp $< $@ diff --git a/cmd/arc_summary/arc_summary2 b/cmd/arc_summary/arc_summary2 new file mode 100755 index 000000000000..5dc40d759dce --- /dev/null +++ b/cmd/arc_summary/arc_summary2 @@ -0,0 +1,1093 @@ +#!/usr/bin/env python2 +# +# $Id: arc_summary.pl,v 388:e27800740aa2 2011-07-08 02:53:29Z jhell $ +# +# Copyright (c) 2008 Ben Rockwood , +# Copyright (c) 2010 Martin Matuska , +# Copyright (c) 2010-2011 Jason J. Hellenthal , +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# If you are having troubles when using this script from cron(8) please try +# adjusting your PATH before reporting problems. +# +# Note some of this code uses older code (eg getopt instead of argparse, +# subprocess.Popen() instead of subprocess.run()) because we need to support +# some very old versions of Python. +# + +"""Print statistics on the ZFS Adjustable Replacement Cache (ARC) + +Provides basic information on the ARC, its efficiency, the L2ARC (if present), +the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See the +in-source documentation and code at +https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. +""" + +import getopt +import os +import sys +import time +import errno + +from subprocess import Popen, PIPE +from decimal import Decimal as D + + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + base = 'kstat.zfs.misc.%s.' 
% namespace + return [(kstat.name, D(kstat.value)) for kstat in sysctl.filter(base)] + + def load_tunables(): + return dict((ctl.name, ctl.value) for ctl in sysctl.filter('vfs.zfs')) + +elif sys.platform.startswith('linux'): + + def load_kstats(namespace): + """Collect information on a specific subsystem of the ARC""" + + kstat = 'kstat.zfs.misc.%s.%%s' % namespace + path = '/proc/spl/kstat/zfs/%s' % namespace + with open(path) as f: + entries = [line.strip().split() for line in f][2:] # Skip header + return [(kstat % name, D(value)) for name, _, value in entries] + + def load_tunables(): + basepath = '/sys/module/zfs/parameters' + tunables = {} + for name in os.listdir(basepath): + if not name: + continue + path = '%s/%s' % (basepath, name) + with open(path) as f: + value = f.read() + tunables[name] = value.strip() + return tunables + + +show_tunable_descriptions = False +alternate_tunable_layout = False + + +def handle_Exception(ex_cls, ex, tb): + # sys.excepthook passes the exception class (ex_cls) and the instance + # (ex); exit quietly on broken pipes and keyboard interrupts. + if issubclass(ex_cls, IOError): + if ex.errno == errno.EPIPE: + sys.exit() + + if issubclass(ex_cls, KeyboardInterrupt): + sys.exit() + + +sys.excepthook = handle_Exception + + +def get_Kstat(): + """Collect information on the ZFS subsystem from the /proc virtual + file system. The name "kstat" is a holdover from the Solaris utility + of the same name. + """ + + Kstat = {} + Kstat.update(load_kstats('arcstats')) + Kstat.update(load_kstats('zfetchstats')) + Kstat.update(load_kstats('vdev_cache_stats')) + return Kstat + + +def fBytes(b=0): + """Return human-readable representation of a byte value in + powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal + points. Values smaller than one KiB are returned without + decimal points. + """ + + prefixes = [ + [2**80, "YiB"], # yobibytes (yotta) + [2**70, "ZiB"], # zebibytes (zetta) + [2**60, "EiB"], # exbibytes (exa) + [2**50, "PiB"], # pebibytes (peta) + [2**40, "TiB"], # tebibytes (tera) + [2**30, "GiB"], # gibibytes (giga) + [2**20, "MiB"], # mebibytes (mega) + [2**10, "KiB"]] # kibibytes (kilo) + + if b >= 2**10: + + for limit, unit in prefixes: + + if b >= limit: + value = b / limit + break + + result = "%0.2f\t%s" % (value, unit) + + else: + + result = "%d\tBytes" % b + + return result + + +def fHits(hits=0): + """Create a human-readable representation of the number of hits. + The single-letter symbols used are SI to avoid the confusion caused + by the different "short scale" and "long scale" representations in + English, which use the same words for different values. See + https://en.wikipedia.org/wiki/Names_of_large_numbers and + https://physics.nist.gov/cuu/Units/prefixes.html + """ + + numbers = [ + [10**24, 'Y'], # yotta (septillion) + [10**21, 'Z'], # zetta (sextillion) + [10**18, 'E'], # exa (quintrillion) + [10**15, 'P'], # peta (quadrillion) + [10**12, 'T'], # tera (trillion) + [10**9, 'G'], # giga (billion) + [10**6, 'M'], # mega (million) + [10**3, 'k']] # kilo (thousand) + + if hits >= 1000: + + for limit, symbol in numbers: + + if hits >= limit: + value = hits/limit + break + + result = "%0.2f%s" % (value, symbol) + + else: + + result = "%d" % hits + + return result + + +def fPerc(lVal=0, rVal=0, Decimal=2): + """Calculate percentage value and return in human-readable format""" + + if rVal > 0: + return str("%0." + str(Decimal) + "f") % (100 * (lVal / rVal)) + "%" + else: + return str("%0." + str(Decimal) + "f") % 100 + "%"
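The three helpers above do all of the report's number formatting: fBytes() uses binary (IEC) prefixes for byte counts, fHits() uses single-letter SI symbols for event counts, and fPerc() renders a ratio as a fixed-precision percentage. A minimal sketch of their expected behaviour, assuming the Decimal values that load_kstats() feeds in (with plain Python 2 ints, b / limit would floor away the fractional part):

    from decimal import Decimal as D

    print(fBytes(D(3) * 2**30))  # "3.00\tGiB"  -- >= 1 KiB: two decimals, IEC unit
    print(fBytes(512))           # "512\tBytes" -- below 1 KiB: no decimals
    print(fHits(D(1500000)))     # "1.50M"      -- SI symbol avoids short/long-scale names
    print(fPerc(D(1), D(3)))     # "33.33%"     -- default Decimal=2 digits of precision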
+ + +def get_arc_summary(Kstat): + """Collect general data on the ARC""" + + output = {} + memory_throttle_count = Kstat[ + "kstat.zfs.misc.arcstats.memory_throttle_count" + ] + + if memory_throttle_count > 0: + output['health'] = 'THROTTLED' + else: + output['health'] = 'HEALTHY' + + output['memory_throttle_count'] = fHits(memory_throttle_count) + + # ARC Misc. + deleted = Kstat["kstat.zfs.misc.arcstats.deleted"] + mutex_miss = Kstat["kstat.zfs.misc.arcstats.mutex_miss"] + evict_skip = Kstat["kstat.zfs.misc.arcstats.evict_skip"] + + # ARC Misc. + output["arc_misc"] = {} + output["arc_misc"]["deleted"] = fHits(deleted) + output["arc_misc"]['mutex_miss'] = fHits(mutex_miss) + output["arc_misc"]['evict_skips'] = fHits(evict_skip) + + # ARC Sizing + arc_size = Kstat["kstat.zfs.misc.arcstats.size"] + mru_size = Kstat["kstat.zfs.misc.arcstats.mru_size"] + mfu_size = Kstat["kstat.zfs.misc.arcstats.mfu_size"] + meta_limit = Kstat["kstat.zfs.misc.arcstats.arc_meta_limit"] + meta_size = Kstat["kstat.zfs.misc.arcstats.arc_meta_used"] + dnode_limit = Kstat["kstat.zfs.misc.arcstats.arc_dnode_limit"] + dnode_size = Kstat["kstat.zfs.misc.arcstats.dnode_size"] + target_max_size = Kstat["kstat.zfs.misc.arcstats.c_max"] + target_min_size = Kstat["kstat.zfs.misc.arcstats.c_min"] + target_size = Kstat["kstat.zfs.misc.arcstats.c"] + + target_size_ratio = (target_max_size / target_min_size) + + # ARC Sizing + output['arc_sizing'] = {} + output['arc_sizing']['arc_size'] = { + 'per': fPerc(arc_size, target_max_size), + 'num': fBytes(arc_size), + } + output['arc_sizing']['target_max_size'] = { + 'ratio': target_size_ratio, + 'num': fBytes(target_max_size), + } + output['arc_sizing']['target_min_size'] = { + 'per': fPerc(target_min_size, target_max_size), + 'num': fBytes(target_min_size), + } + output['arc_sizing']['target_size'] = { + 'per': fPerc(target_size, target_max_size), + 'num': fBytes(target_size), + } + output['arc_sizing']['meta_limit'] = { + 'per': fPerc(meta_limit, target_max_size), + 'num': fBytes(meta_limit), + } + output['arc_sizing']['meta_size'] = { + 'per': fPerc(meta_size, meta_limit), + 'num': fBytes(meta_size), + } + output['arc_sizing']['dnode_limit'] = { + 'per': fPerc(dnode_limit, meta_limit), + 'num': fBytes(dnode_limit), + } + output['arc_sizing']['dnode_size'] = { + 'per': fPerc(dnode_size, dnode_limit), + 'num': fBytes(dnode_size), + } + + # ARC Hash Breakdown + output['arc_hash_break'] = {} + output['arc_hash_break']['hash_chain_max'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_chain_max" + ] + output['arc_hash_break']['hash_chains'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_chains" + ] + output['arc_hash_break']['hash_collisions'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_collisions" + ] + output['arc_hash_break']['hash_elements'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_elements" + ] + output['arc_hash_break']['hash_elements_max'] = Kstat[ + "kstat.zfs.misc.arcstats.hash_elements_max" + ] + + output['arc_size_break'] = {} + output['arc_size_break']['recently_used_cache_size'] = { + 'per': fPerc(mru_size, mru_size + mfu_size), + 'num': fBytes(mru_size), + } + output['arc_size_break']['frequently_used_cache_size'] = { + 'per': fPerc(mfu_size, mru_size + mfu_size), + 'num': fBytes(mfu_size), + } + + # ARC Hash Breakdown + hash_chain_max = Kstat["kstat.zfs.misc.arcstats.hash_chain_max"] + hash_chains = Kstat["kstat.zfs.misc.arcstats.hash_chains"] + hash_collisions = Kstat["kstat.zfs.misc.arcstats.hash_collisions"] + hash_elements =
Kstat["kstat.zfs.misc.arcstats.hash_elements"] + hash_elements_max = Kstat["kstat.zfs.misc.arcstats.hash_elements_max"] + + output['arc_hash_break'] = {} + output['arc_hash_break']['elements_max'] = fHits(hash_elements_max) + output['arc_hash_break']['elements_current'] = { + 'per': fPerc(hash_elements, hash_elements_max), + 'num': fHits(hash_elements), + } + output['arc_hash_break']['collisions'] = fHits(hash_collisions) + output['arc_hash_break']['chain_max'] = fHits(hash_chain_max) + output['arc_hash_break']['chains'] = fHits(hash_chains) + + return output + + +def _arc_summary(Kstat): + """Print information on the ARC""" + + # ARC Sizing + arc = get_arc_summary(Kstat) + + sys.stdout.write("ARC Summary: (%s)\n" % arc['health']) + + sys.stdout.write("\tMemory Throttle Count:\t\t\t%s\n" % + arc['memory_throttle_count']) + sys.stdout.write("\n") + + # ARC Misc. + sys.stdout.write("ARC Misc:\n") + sys.stdout.write("\tDeleted:\t\t\t\t%s\n" % arc['arc_misc']['deleted']) + sys.stdout.write("\tMutex Misses:\t\t\t\t%s\n" % + arc['arc_misc']['mutex_miss']) + sys.stdout.write("\tEvict Skips:\t\t\t\t%s\n" % + arc['arc_misc']['evict_skips']) + sys.stdout.write("\n") + + # ARC Sizing + sys.stdout.write("ARC Size:\t\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['arc_size']['per'], + arc['arc_sizing']['arc_size']['num'] + ) + ) + sys.stdout.write("\tTarget Size: (Adaptive)\t\t%s\t%s\n" % ( + arc['arc_sizing']['target_size']['per'], + arc['arc_sizing']['target_size']['num'], + ) + ) + + sys.stdout.write("\tMin Size (Hard Limit):\t\t%s\t%s\n" % ( + arc['arc_sizing']['target_min_size']['per'], + arc['arc_sizing']['target_min_size']['num'], + ) + ) + + sys.stdout.write("\tMax Size (High Water):\t\t%d:1\t%s\n" % ( + arc['arc_sizing']['target_max_size']['ratio'], + arc['arc_sizing']['target_max_size']['num'], + ) + ) + + sys.stdout.write("\nARC Size Breakdown:\n") + sys.stdout.write("\tRecently Used Cache Size:\t%s\t%s\n" % ( + arc['arc_size_break']['recently_used_cache_size']['per'], + arc['arc_size_break']['recently_used_cache_size']['num'], + ) + ) + sys.stdout.write("\tFrequently Used Cache Size:\t%s\t%s\n" % ( + arc['arc_size_break']['frequently_used_cache_size']['per'], + arc['arc_size_break']['frequently_used_cache_size']['num'], + ) + ) + sys.stdout.write("\tMetadata Size (Hard Limit):\t%s\t%s\n" % ( + arc['arc_sizing']['meta_limit']['per'], + arc['arc_sizing']['meta_limit']['num'], + ) + ) + sys.stdout.write("\tMetadata Size:\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['meta_size']['per'], + arc['arc_sizing']['meta_size']['num'], + ) + ) + sys.stdout.write("\tDnode Size (Hard Limit):\t%s\t%s\n" % ( + arc['arc_sizing']['dnode_limit']['per'], + arc['arc_sizing']['dnode_limit']['num'], + ) + ) + sys.stdout.write("\tDnode Size:\t\t\t%s\t%s\n" % ( + arc['arc_sizing']['dnode_size']['per'], + arc['arc_sizing']['dnode_size']['num'], + ) + ) + + sys.stdout.write("\n") + + # ARC Hash Breakdown + sys.stdout.write("ARC Hash Breakdown:\n") + sys.stdout.write("\tElements Max:\t\t\t\t%s\n" % + arc['arc_hash_break']['elements_max']) + sys.stdout.write("\tElements Current:\t\t%s\t%s\n" % ( + arc['arc_hash_break']['elements_current']['per'], + arc['arc_hash_break']['elements_current']['num'], + ) + ) + sys.stdout.write("\tCollisions:\t\t\t\t%s\n" % + arc['arc_hash_break']['collisions']) + sys.stdout.write("\tChain Max:\t\t\t\t%s\n" % + arc['arc_hash_break']['chain_max']) + sys.stdout.write("\tChains:\t\t\t\t\t%s\n" % + arc['arc_hash_break']['chains']) + + +def get_arc_efficiency(Kstat): + """Collect information on the efficiency 
of the ARC""" + + output = {} + + arc_hits = Kstat["kstat.zfs.misc.arcstats.hits"] + arc_misses = Kstat["kstat.zfs.misc.arcstats.misses"] + demand_data_hits = Kstat["kstat.zfs.misc.arcstats.demand_data_hits"] + demand_data_misses = Kstat["kstat.zfs.misc.arcstats.demand_data_misses"] + demand_metadata_hits = Kstat[ + "kstat.zfs.misc.arcstats.demand_metadata_hits" + ] + demand_metadata_misses = Kstat[ + "kstat.zfs.misc.arcstats.demand_metadata_misses" + ] + mfu_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mfu_ghost_hits"] + mfu_hits = Kstat["kstat.zfs.misc.arcstats.mfu_hits"] + mru_ghost_hits = Kstat["kstat.zfs.misc.arcstats.mru_ghost_hits"] + mru_hits = Kstat["kstat.zfs.misc.arcstats.mru_hits"] + prefetch_data_hits = Kstat["kstat.zfs.misc.arcstats.prefetch_data_hits"] + prefetch_data_misses = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_data_misses" + ] + prefetch_metadata_hits = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_metadata_hits" + ] + prefetch_metadata_misses = Kstat[ + "kstat.zfs.misc.arcstats.prefetch_metadata_misses" + ] + + anon_hits = arc_hits - ( + mfu_hits + mru_hits + mfu_ghost_hits + mru_ghost_hits + ) + arc_accesses_total = (arc_hits + arc_misses) + demand_data_total = (demand_data_hits + demand_data_misses) + prefetch_data_total = (prefetch_data_hits + prefetch_data_misses) + real_hits = (mfu_hits + mru_hits) + + output["total_accesses"] = fHits(arc_accesses_total) + output["cache_hit_ratio"] = { + 'per': fPerc(arc_hits, arc_accesses_total), + 'num': fHits(arc_hits), + } + output["cache_miss_ratio"] = { + 'per': fPerc(arc_misses, arc_accesses_total), + 'num': fHits(arc_misses), + } + output["actual_hit_ratio"] = { + 'per': fPerc(real_hits, arc_accesses_total), + 'num': fHits(real_hits), + } + output["data_demand_efficiency"] = { + 'per': fPerc(demand_data_hits, demand_data_total), + 'num': fHits(demand_data_total), + } + + if prefetch_data_total > 0: + output["data_prefetch_efficiency"] = { + 'per': fPerc(prefetch_data_hits, prefetch_data_total), + 'num': fHits(prefetch_data_total), + } + + if anon_hits > 0: + output["cache_hits_by_cache_list"] = {} + output["cache_hits_by_cache_list"]["anonymously_used"] = { + 'per': fPerc(anon_hits, arc_hits), + 'num': fHits(anon_hits), + } + + output["most_recently_used"] = { + 'per': fPerc(mru_hits, arc_hits), + 'num': fHits(mru_hits), + } + output["most_frequently_used"] = { + 'per': fPerc(mfu_hits, arc_hits), + 'num': fHits(mfu_hits), + } + output["most_recently_used_ghost"] = { + 'per': fPerc(mru_ghost_hits, arc_hits), + 'num': fHits(mru_ghost_hits), + } + output["most_frequently_used_ghost"] = { + 'per': fPerc(mfu_ghost_hits, arc_hits), + 'num': fHits(mfu_ghost_hits), + } + + output["cache_hits_by_data_type"] = {} + output["cache_hits_by_data_type"]["demand_data"] = { + 'per': fPerc(demand_data_hits, arc_hits), + 'num': fHits(demand_data_hits), + } + output["cache_hits_by_data_type"]["prefetch_data"] = { + 'per': fPerc(prefetch_data_hits, arc_hits), + 'num': fHits(prefetch_data_hits), + } + output["cache_hits_by_data_type"]["demand_metadata"] = { + 'per': fPerc(demand_metadata_hits, arc_hits), + 'num': fHits(demand_metadata_hits), + } + output["cache_hits_by_data_type"]["prefetch_metadata"] = { + 'per': fPerc(prefetch_metadata_hits, arc_hits), + 'num': fHits(prefetch_metadata_hits), + } + + output["cache_misses_by_data_type"] = {} + output["cache_misses_by_data_type"]["demand_data"] = { + 'per': fPerc(demand_data_misses, arc_misses), + 'num': fHits(demand_data_misses), + } + output["cache_misses_by_data_type"]["prefetch_data"] = { + 
'per': fPerc(prefetch_data_misses, arc_misses), + 'num': fHits(prefetch_data_misses), + } + output["cache_misses_by_data_type"]["demand_metadata"] = { + 'per': fPerc(demand_metadata_misses, arc_misses), + 'num': fHits(demand_metadata_misses), + } + output["cache_misses_by_data_type"]["prefetch_metadata"] = { + 'per': fPerc(prefetch_metadata_misses, arc_misses), + 'num': fHits(prefetch_metadata_misses), + } + + return output + + +def _arc_efficiency(Kstat): + """Print information on the efficiency of the ARC""" + + arc = get_arc_efficiency(Kstat) + + sys.stdout.write("ARC Total accesses:\t\t\t\t\t%s\n" % + arc['total_accesses']) + sys.stdout.write("\tCache Hit Ratio:\t\t%s\t%s\n" % ( + arc['cache_hit_ratio']['per'], + arc['cache_hit_ratio']['num'], + ) + ) + sys.stdout.write("\tCache Miss Ratio:\t\t%s\t%s\n" % ( + arc['cache_miss_ratio']['per'], + arc['cache_miss_ratio']['num'], + ) + ) + + sys.stdout.write("\tActual Hit Ratio:\t\t%s\t%s\n" % ( + arc['actual_hit_ratio']['per'], + arc['actual_hit_ratio']['num'], + ) + ) + + sys.stdout.write("\n") + sys.stdout.write("\tData Demand Efficiency:\t\t%s\t%s\n" % ( + arc['data_demand_efficiency']['per'], + arc['data_demand_efficiency']['num'], + ) + ) + + if 'data_prefetch_efficiency' in arc: + sys.stdout.write("\tData Prefetch Efficiency:\t%s\t%s\n" % ( + arc['data_prefetch_efficiency']['per'], + arc['data_prefetch_efficiency']['num'], + ) + ) + sys.stdout.write("\n") + + sys.stdout.write("\tCACHE HITS BY CACHE LIST:\n") + if 'cache_hits_by_cache_list' in arc: + sys.stdout.write("\t Anonymously Used:\t\t%s\t%s\n" % ( + arc['cache_hits_by_cache_list']['anonymously_used']['per'], + arc['cache_hits_by_cache_list']['anonymously_used']['num'], + ) + ) + sys.stdout.write("\t Most Recently Used:\t\t%s\t%s\n" % ( + arc['most_recently_used']['per'], + arc['most_recently_used']['num'], + ) + ) + sys.stdout.write("\t Most Frequently Used:\t\t%s\t%s\n" % ( + arc['most_frequently_used']['per'], + arc['most_frequently_used']['num'], + ) + ) + sys.stdout.write("\t Most Recently Used Ghost:\t%s\t%s\n" % ( + arc['most_recently_used_ghost']['per'], + arc['most_recently_used_ghost']['num'], + ) + ) + sys.stdout.write("\t Most Frequently Used Ghost:\t%s\t%s\n" % ( + arc['most_frequently_used_ghost']['per'], + arc['most_frequently_used_ghost']['num'], + ) + ) + + sys.stdout.write("\n\tCACHE HITS BY DATA TYPE:\n") + sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['demand_data']['per'], + arc["cache_hits_by_data_type"]['demand_data']['num'], + ) + ) + sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['prefetch_data']['per'], + arc["cache_hits_by_data_type"]['prefetch_data']['num'], + ) + ) + sys.stdout.write("\t Demand Metadata:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['demand_metadata']['per'], + arc["cache_hits_by_data_type"]['demand_metadata']['num'], + ) + ) + sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % ( + arc["cache_hits_by_data_type"]['prefetch_metadata']['per'], + arc["cache_hits_by_data_type"]['prefetch_metadata']['num'], + ) + ) + + sys.stdout.write("\n\tCACHE MISSES BY DATA TYPE:\n") + sys.stdout.write("\t Demand Data:\t\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['demand_data']['per'], + arc["cache_misses_by_data_type"]['demand_data']['num'], + ) + ) + sys.stdout.write("\t Prefetch Data:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['prefetch_data']['per'], + arc["cache_misses_by_data_type"]['prefetch_data']['num'], + ) + ) + sys.stdout.write("\t 
Demand Metadata:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['demand_metadata']['per'], + arc["cache_misses_by_data_type"]['demand_metadata']['num'], + ) + ) + sys.stdout.write("\t Prefetch Metadata:\t\t%s\t%s\n" % ( + arc["cache_misses_by_data_type"]['prefetch_metadata']['per'], + arc["cache_misses_by_data_type"]['prefetch_metadata']['num'], + ) + ) + + +def get_l2arc_summary(Kstat): + """Collection information on the L2ARC""" + + output = {} + + l2_abort_lowmem = Kstat["kstat.zfs.misc.arcstats.l2_abort_lowmem"] + l2_cksum_bad = Kstat["kstat.zfs.misc.arcstats.l2_cksum_bad"] + l2_evict_lock_retry = Kstat["kstat.zfs.misc.arcstats.l2_evict_lock_retry"] + l2_evict_reading = Kstat["kstat.zfs.misc.arcstats.l2_evict_reading"] + l2_feeds = Kstat["kstat.zfs.misc.arcstats.l2_feeds"] + l2_free_on_write = Kstat["kstat.zfs.misc.arcstats.l2_free_on_write"] + l2_hdr_size = Kstat["kstat.zfs.misc.arcstats.l2_hdr_size"] + l2_hits = Kstat["kstat.zfs.misc.arcstats.l2_hits"] + l2_io_error = Kstat["kstat.zfs.misc.arcstats.l2_io_error"] + l2_misses = Kstat["kstat.zfs.misc.arcstats.l2_misses"] + l2_rw_clash = Kstat["kstat.zfs.misc.arcstats.l2_rw_clash"] + l2_size = Kstat["kstat.zfs.misc.arcstats.l2_size"] + l2_asize = Kstat["kstat.zfs.misc.arcstats.l2_asize"] + l2_writes_done = Kstat["kstat.zfs.misc.arcstats.l2_writes_done"] + l2_writes_error = Kstat["kstat.zfs.misc.arcstats.l2_writes_error"] + l2_writes_sent = Kstat["kstat.zfs.misc.arcstats.l2_writes_sent"] + + l2_access_total = (l2_hits + l2_misses) + output['l2_health_count'] = (l2_writes_error + l2_cksum_bad + l2_io_error) + + output['l2_access_total'] = l2_access_total + output['l2_size'] = l2_size + output['l2_asize'] = l2_asize + + if l2_size > 0 and l2_access_total > 0: + + if output['l2_health_count'] > 0: + output["health"] = "DEGRADED" + else: + output["health"] = "HEALTHY" + + output["low_memory_aborts"] = fHits(l2_abort_lowmem) + output["free_on_write"] = fHits(l2_free_on_write) + output["rw_clashes"] = fHits(l2_rw_clash) + output["bad_checksums"] = fHits(l2_cksum_bad) + output["io_errors"] = fHits(l2_io_error) + + output["l2_arc_size"] = {} + output["l2_arc_size"]["adative"] = fBytes(l2_size) + output["l2_arc_size"]["actual"] = { + 'per': fPerc(l2_asize, l2_size), + 'num': fBytes(l2_asize) + } + output["l2_arc_size"]["head_size"] = { + 'per': fPerc(l2_hdr_size, l2_size), + 'num': fBytes(l2_hdr_size), + } + + output["l2_arc_evicts"] = {} + output["l2_arc_evicts"]['lock_retries'] = fHits(l2_evict_lock_retry) + output["l2_arc_evicts"]['reading'] = fHits(l2_evict_reading) + + output['l2_arc_breakdown'] = {} + output['l2_arc_breakdown']['value'] = fHits(l2_access_total) + output['l2_arc_breakdown']['hit_ratio'] = { + 'per': fPerc(l2_hits, l2_access_total), + 'num': fHits(l2_hits), + } + output['l2_arc_breakdown']['miss_ratio'] = { + 'per': fPerc(l2_misses, l2_access_total), + 'num': fHits(l2_misses), + } + output['l2_arc_breakdown']['feeds'] = fHits(l2_feeds) + + output['l2_arc_buffer'] = {} + + output['l2_arc_writes'] = {} + output['l2_writes_done'] = l2_writes_done + output['l2_writes_sent'] = l2_writes_sent + if l2_writes_done != l2_writes_sent: + output['l2_arc_writes']['writes_sent'] = { + 'value': "FAULTED", + 'num': fHits(l2_writes_sent), + } + output['l2_arc_writes']['done_ratio'] = { + 'per': fPerc(l2_writes_done, l2_writes_sent), + 'num': fHits(l2_writes_done), + } + output['l2_arc_writes']['error_ratio'] = { + 'per': fPerc(l2_writes_error, l2_writes_sent), + 'num': fHits(l2_writes_error), + } + else: + 
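+ # All sends completed: report the sent count with a flat
+ # 100 % ratio. fPerc is given the single value 100 here, so
+ # it formats that constant rather than computing a quotient.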
output['l2_arc_writes']['writes_sent'] = { + 'per': fPerc(100), + 'num': fHits(l2_writes_sent), + } + + return output + + +def _l2arc_summary(Kstat): + """Print information on the L2ARC""" + + arc = get_l2arc_summary(Kstat) + + if arc['l2_size'] > 0 and arc['l2_access_total'] > 0: + sys.stdout.write("L2 ARC Summary: ") + if arc['l2_health_count'] > 0: + sys.stdout.write("(DEGRADED)\n") + else: + sys.stdout.write("(HEALTHY)\n") + sys.stdout.write("\tLow Memory Aborts:\t\t\t%s\n" % + arc['low_memory_aborts']) + sys.stdout.write("\tFree on Write:\t\t\t\t%s\n" % arc['free_on_write']) + sys.stdout.write("\tR/W Clashes:\t\t\t\t%s\n" % arc['rw_clashes']) + sys.stdout.write("\tBad Checksums:\t\t\t\t%s\n" % arc['bad_checksums']) + sys.stdout.write("\tIO Errors:\t\t\t\t%s\n" % arc['io_errors']) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Size: (Adaptive)\t\t\t\t%s\n" % + arc["l2_arc_size"]["adative"]) + sys.stdout.write("\tCompressed:\t\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["actual"]["per"], + arc["l2_arc_size"]["actual"]["num"], + ) + ) + sys.stdout.write("\tHeader Size:\t\t\t%s\t%s\n" % ( + arc["l2_arc_size"]["head_size"]["per"], + arc["l2_arc_size"]["head_size"]["num"], + ) + ) + sys.stdout.write("\n") + + if arc["l2_arc_evicts"]['lock_retries'] != '0' or \ + arc["l2_arc_evicts"]["reading"] != '0': + sys.stdout.write("L2 ARC Evicts:\n") + sys.stdout.write("\tLock Retries:\t\t\t\t%s\n" % + arc["l2_arc_evicts"]['lock_retries']) + sys.stdout.write("\tUpon Reading:\t\t\t\t%s\n" % + arc["l2_arc_evicts"]["reading"]) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Breakdown:\t\t\t\t%s\n" % + arc['l2_arc_breakdown']['value']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_breakdown']['hit_ratio']['per'], + arc['l2_arc_breakdown']['hit_ratio']['num'], + ) + ) + + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_breakdown']['miss_ratio']['per'], + arc['l2_arc_breakdown']['miss_ratio']['num'], + ) + ) + + sys.stdout.write("\tFeeds:\t\t\t\t\t%s\n" % + arc['l2_arc_breakdown']['feeds']) + sys.stdout.write("\n") + + sys.stdout.write("L2 ARC Writes:\n") + if arc['l2_writes_done'] != arc['l2_writes_sent']: + sys.stdout.write("\tWrites Sent: (%s)\t\t\t\t%s\n" % ( + arc['l2_arc_writes']['writes_sent']['value'], + arc['l2_arc_writes']['writes_sent']['num'], + ) + ) + sys.stdout.write("\t Done Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['done_ratio']['per'], + arc['l2_arc_writes']['done_ratio']['num'], + ) + ) + sys.stdout.write("\t Error Ratio:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['error_ratio']['per'], + arc['l2_arc_writes']['error_ratio']['num'], + ) + ) + else: + sys.stdout.write("\tWrites Sent:\t\t\t%s\t%s\n" % ( + arc['l2_arc_writes']['writes_sent']['per'], + arc['l2_arc_writes']['writes_sent']['num'], + ) + ) + + +def get_dmu_summary(Kstat): + """Collect information on the DMU""" + + output = {} + + zfetch_hits = Kstat["kstat.zfs.misc.zfetchstats.hits"] + zfetch_misses = Kstat["kstat.zfs.misc.zfetchstats.misses"] + + zfetch_access_total = (zfetch_hits + zfetch_misses) + output['zfetch_access_total'] = zfetch_access_total + + if zfetch_access_total > 0: + output['dmu'] = {} + output['dmu']['efficiency'] = {} + output['dmu']['efficiency']['value'] = fHits(zfetch_access_total) + output['dmu']['efficiency']['hit_ratio'] = { + 'per': fPerc(zfetch_hits, zfetch_access_total), + 'num': fHits(zfetch_hits), + } + output['dmu']['efficiency']['miss_ratio'] = { + 'per': fPerc(zfetch_misses, zfetch_access_total), + 'num': fHits(zfetch_misses), + } + + return 
output + + +def _dmu_summary(Kstat): + """Print information on the DMU""" + + arc = get_dmu_summary(Kstat) + + if arc['zfetch_access_total'] > 0: + sys.stdout.write("DMU Prefetch Efficiency:\t\t\t\t\t%s\n" % + arc['dmu']['efficiency']['value']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['dmu']['efficiency']['hit_ratio']['per'], + arc['dmu']['efficiency']['hit_ratio']['num'], + ) + ) + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['dmu']['efficiency']['miss_ratio']['per'], + arc['dmu']['efficiency']['miss_ratio']['num'], + ) + ) + + sys.stdout.write("\n") + + +def get_vdev_summary(Kstat): + """Collect information on the VDEVs""" + + output = {} + + vdev_cache_delegations = \ + Kstat["kstat.zfs.misc.vdev_cache_stats.delegations"] + vdev_cache_misses = Kstat["kstat.zfs.misc.vdev_cache_stats.misses"] + vdev_cache_hits = Kstat["kstat.zfs.misc.vdev_cache_stats.hits"] + vdev_cache_total = (vdev_cache_misses + vdev_cache_hits + + vdev_cache_delegations) + + output['vdev_cache_total'] = vdev_cache_total + + if vdev_cache_total > 0: + output['summary'] = fHits(vdev_cache_total) + output['hit_ratio'] = { + 'per': fPerc(vdev_cache_hits, vdev_cache_total), + 'num': fHits(vdev_cache_hits), + } + output['miss_ratio'] = { + 'per': fPerc(vdev_cache_misses, vdev_cache_total), + 'num': fHits(vdev_cache_misses), + } + output['delegations'] = { + 'per': fPerc(vdev_cache_delegations, vdev_cache_total), + 'num': fHits(vdev_cache_delegations), + } + + return output + + +def _vdev_summary(Kstat): + """Print information on the VDEVs""" + + arc = get_vdev_summary(Kstat) + + if arc['vdev_cache_total'] > 0: + sys.stdout.write("VDEV Cache Summary:\t\t\t\t%s\n" % arc['summary']) + sys.stdout.write("\tHit Ratio:\t\t\t%s\t%s\n" % ( + arc['hit_ratio']['per'], + arc['hit_ratio']['num'], + )) + sys.stdout.write("\tMiss Ratio:\t\t\t%s\t%s\n" % ( + arc['miss_ratio']['per'], + arc['miss_ratio']['num'], + )) + sys.stdout.write("\tDelegations:\t\t\t%s\t%s\n" % ( + arc['delegations']['per'], + arc['delegations']['num'], + )) + + +def _tunable_summary(Kstat): + """Print information on tunables, including descriptions if requested""" + + global show_tunable_descriptions + global alternate_tunable_layout + + tunables = load_tunables() + descriptions = {} + + if show_tunable_descriptions: + + command = ["/sbin/modinfo", "zfs", "-0"] + + try: + p = Popen(command, stdin=PIPE, stdout=PIPE, + stderr=PIPE, shell=False, close_fds=True) + p.wait() + + # By default, Python 2 returns a string as the first element of the + # tuple from p.communicate(), while Python 3 returns bytes which + # must be decoded first. 
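+ # For example (output shape illustrative): the NUL-separated
+ # bytes b'parm:zfs_arc_max:Max arc size\x00...' decode to a
+ # str that split('\0') turns into one 'parm:...' entry per
+ # tunable.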
The better way to do this would be with + # subprocess.run() or at least .check_output(), but this fails on + # CentOS 6 because of its old version of Python 2 + desc = bytes.decode(p.communicate()[0]) + description_list = desc.strip().split('\0') + + if p.returncode == 0: + for tunable in description_list: + if tunable[0:5] == 'parm:': + tunable = tunable[5:].strip() + name, description = tunable.split(':', 1) + if not description: + description = "Description unavailable" + descriptions[name] = description + else: + sys.stderr.write("%s: '%s' exited with code %i\n" % + (sys.argv[0], command[0], p.returncode)) + sys.stderr.write("Tunable descriptions will be disabled.\n") + except OSError as e: + sys.stderr.write("%s: Cannot run '%s': %s\n" % + (sys.argv[0], command[0], e.strerror)) + sys.stderr.write("Tunable descriptions will be disabled.\n") + + sys.stdout.write("ZFS Tunables:\n") + + if alternate_tunable_layout: + fmt = "\t%s=%s\n" + else: + fmt = "\t%-50s%s\n" + + for name in sorted(tunables.keys()): + if show_tunable_descriptions and name in descriptions: + sys.stdout.write("\t# %s\n" % descriptions[name]) + + sys.stdout.write(fmt % (name, tunables[name])) + + +unSub = [ + _arc_summary, + _arc_efficiency, + _l2arc_summary, + _dmu_summary, + _vdev_summary, + _tunable_summary +] + + +def zfs_header(): + """Print title string with date""" + + daydate = time.strftime('%a %b %d %H:%M:%S %Y') + + sys.stdout.write('\n'+'-'*72+'\n') + sys.stdout.write('ZFS Subsystem Report\t\t\t\t%s' % daydate) + sys.stdout.write('\n') + + +def usage(): + """Print usage information""" + + sys.stdout.write("Usage: arc_summary [-h] [-a] [-d] [-p PAGE]\n\n") + sys.stdout.write("\t -h, --help : " + "Print this help message and exit\n") + sys.stdout.write("\t -a, --alternate : " + "Show an alternate sysctl layout\n") + sys.stdout.write("\t -d, --description : " + "Show the sysctl descriptions\n") + sys.stdout.write("\t -p PAGE, --page=PAGE : " + "Select a single output page to display,\n") + sys.stdout.write("\t " + "should be an integer between 1 and " + + str(len(unSub)) + "\n\n") + sys.stdout.write("Examples:\n") + sys.stdout.write("\tarc_summary -a\n") + sys.stdout.write("\tarc_summary -p 4\n") + sys.stdout.write("\tarc_summary -ad\n") + sys.stdout.write("\tarc_summary --page=2\n") + + +def main(): + """Main function""" + + global show_tunable_descriptions + global alternate_tunable_layout + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "adp:h", ["alternate", "description", "page=", "help"] + ) + except getopt.error as e: + sys.stderr.write("Error: %s\n" % e.msg) + usage() + sys.exit(1) + + args = {} + for opt, arg in opts: + if opt in ('-a', '--alternate'): + args['a'] = True + if opt in ('-d', '--description'): + args['d'] = True + if opt in ('-p', '--page'): + args['p'] = arg + if opt in ('-h', '--help'): + usage() + sys.exit(0) + + Kstat = get_Kstat() + + alternate_tunable_layout = 'a' in args + show_tunable_descriptions = 'd' in args + + pages = [] + + if 'p' in args: + try: + pages.append(unSub[int(args['p']) - 1]) + except IndexError: + sys.stderr.write('the argument to -p must be between 1 and ' + + str(len(unSub)) + '\n') + sys.exit(1) + else: + pages = unSub + + zfs_header() + for page in pages: + page(Kstat) + sys.stdout.write("\n") + + +if __name__ == '__main__': + main() diff --git a/cmd/arc_summary/arc_summary3 b/cmd/arc_summary/arc_summary3 new file mode 100755 index 000000000000..c920b8e5395d --- /dev/null +++ b/cmd/arc_summary/arc_summary3 @@ -0,0 +1,943 @@ +#!/usr/bin/env python3 
+# +# Copyright (c) 2008 Ben Rockwood , +# Copyright (c) 2010 Martin Matuska , +# Copyright (c) 2010-2011 Jason J. Hellenthal , +# Copyright (c) 2017 Scot W. Stevenson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +"""Print statistics on the ZFS ARC Cache and other information + +Provides basic information on the ARC, its efficiency, the L2ARC (if present), +the Data Management Unit (DMU), Virtual Devices (VDEVs), and tunables. See +the in-source documentation and code at +https://github.com/zfsonlinux/zfs/blob/master/module/zfs/arc.c for details. 
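+A quick usage sketch: 'arc_summary -s arc' prints only the ARC
+section, 'arc_summary --raw' dumps every kstat with minimal
+formatting, and 'arc_summary -g' draws a graph of ARC use.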
+The original introduction to arc_summary can be found at +http://cuddletech.com/?p=454 +""" + +import argparse +import os +import subprocess +import sys +import time + +DESCRIPTION = 'Print ARC and other statistics for ZFS on Linux' +INDENT = ' '*8 +LINE_LENGTH = 72 +DATE_FORMAT = '%a %b %d %H:%M:%S %Y' +TITLE = 'ZFS Subsystem Report' + +SECTIONS = 'arc archits dmu l2arc spl tunables vdev zil'.split() +SECTION_HELP = 'print info from one section ('+' '.join(SECTIONS)+')' + +# Tunables and SPL are handled separately because they come from +# different sources +SECTION_PATHS = {'arc': 'arcstats', + 'dmu': 'dmu_tx', + 'l2arc': 'arcstats', # L2ARC stuff lives in arcstats + 'vdev': 'vdev_cache_stats', + 'xuio': 'xuio_stats', + 'zfetch': 'zfetchstats', + 'zil': 'zil'} + +parser = argparse.ArgumentParser(description=DESCRIPTION) +parser.add_argument('-a', '--alternate', action='store_true', default=False, + help='use alternate formatting for tunables and SPL', + dest='alt') +parser.add_argument('-d', '--description', action='store_true', default=False, + help='print descriptions with tunables and SPL', + dest='desc') +parser.add_argument('-g', '--graph', action='store_true', default=False, + help='print graph on ARC use and exit', dest='graph') +parser.add_argument('-p', '--page', type=int, dest='page', + help='print page by number (DEPRECATED, use "-s")') +parser.add_argument('-r', '--raw', action='store_true', default=False, + help='dump all available data with minimal formatting', + dest='raw') +parser.add_argument('-s', '--section', dest='section', help=SECTION_HELP) +ARGS = parser.parse_args() + + +if sys.platform.startswith('freebsd'): + # Requires py36-sysctl on FreeBSD + import sysctl + + VDEV_CACHE_SIZE = 'vdev.cache_size' + + def load_kstats(section): + base = 'kstat.zfs.misc.{section}.'.format(section=section) + # base is removed from the name + fmt = lambda kstat: '{name} : {value}'.format(name=kstat.name[len(base):], + value=kstat.value) + return [fmt(kstat) for kstat in sysctl.filter(base)] + + def get_params(base): + cut = 8 # = len('vfs.zfs.') + return {ctl.name[cut:]: str(ctl.value) for ctl in sysctl.filter(base)} + + def get_tunable_params(): + return get_params('vfs.zfs') + + def get_vdev_params(): + return get_params('vfs.zfs.vdev') + + def get_version_impl(request): + # FreeBSD reports versions for zpl and spa instead of zfs and spl. + name = {'zfs': 'zpl', + 'spl': 'spa'}[request] + mib = 'vfs.zfs.version.{}'.format(name) + version = sysctl.filter(mib)[0].value + return '{} version {}'.format(name, version) + + def get_descriptions(_request): + # py-sysctl doesn't give descriptions, so we have to shell out. + command = ['sysctl', '-d', 'vfs.zfs'] + + # The recommended way to do this is with subprocess.run(). 
However, + # some installed versions of Python are < 3.5, so we offer them + # the option of doing it the old way (for now) + if 'run' in dir(subprocess): + info = subprocess.run(command, stdout=subprocess.PIPE, + universal_newlines=True) + lines = info.stdout.split('\n') + else: + info = subprocess.check_output(command, universal_newlines=True) + lines = info.split('\n') + + def fmt(line): + name, desc = line.split(':', 1) + return (name.strip(), desc.strip()) + + return dict([fmt(line) for line in lines if len(line) > 0]) + + +elif sys.platform.startswith('linux'): + KSTAT_PATH = '/proc/spl/kstat/zfs' + SPL_PATH = '/sys/module/spl/parameters' + TUNABLES_PATH = '/sys/module/zfs/parameters' + + VDEV_CACHE_SIZE = 'zfs_vdev_cache_size' + + def load_kstats(section): + path = os.path.join(KSTAT_PATH, section) + with open(path) as f: + return list(f)[2:] # Get rid of header + + def get_params(basepath): + """Collect information on the Solaris Porting Layer (SPL) or the + tunables, depending on the PATH given. Does not check if PATH is + legal. + """ + result = {} + for name in os.listdir(basepath): + path = os.path.join(basepath, name) + with open(path) as f: + value = f.read() + result[name] = value.strip() + return result + + def get_spl_params(): + return get_params(SPL_PATH) + + def get_tunable_params(): + return get_params(TUNABLES_PATH) + + def get_vdev_params(): + return get_params(TUNABLES_PATH) + + def get_version_impl(request): + # The original arc_summary called /sbin/modinfo/{spl,zfs} to get + # the version information. We switch to /sys/module/{spl,zfs}/version + # to make sure we get what is really loaded in the kernel + command = ["cat", "/sys/module/{0}/version".format(request)] + req = request.upper() + + # The recommended way to do this is with subprocess.run(). However, + # some installed versions of Python are < 3.5, so we offer them + # the option of doing it the old way (for now) + if 'run' in dir(subprocess): + info = subprocess.run(command, stdout=subprocess.PIPE, + universal_newlines=True) + version = info.stdout.strip() + else: + info = subprocess.check_output(command, universal_newlines=True) + version = info.strip() + + return version + + def get_descriptions(request): + """Get the descriptions of the Solaris Porting Layer (SPL) or the + tunables, return with minimal formatting. + """ + + if request not in ('spl', 'zfs'): + print('ERROR: description of "{0}" requested)'.format(request)) + sys.exit(1) + + descs = {} + target_prefix = 'parm:' + + # We would prefer to do this with /sys/modules -- see the discussion at + # get_version() -- but there isn't a way to get the descriptions from + # there, so we fall back on modinfo + command = ["/sbin/modinfo", request, "-0"] + + # The recommended way to do this is with subprocess.run(). 
However, + # some installed versions of Python are < 3.5, so we offer them + # the option of doing it the old way (for now) + info = '' + + try: + + if 'run' in dir(subprocess): + info = subprocess.run(command, stdout=subprocess.PIPE, + universal_newlines=True) + raw_output = info.stdout.split('\0') + else: + info = subprocess.check_output(command, + universal_newlines=True) + raw_output = info.split('\0') + + except subprocess.CalledProcessError: + print("Error: Descriptions not available", + "(can't access kernel module)") + sys.exit(1) + + for line in raw_output: + + if not line.startswith(target_prefix): + continue + + line = line[len(target_prefix):].strip() + name, raw_desc = line.split(':', 1) + desc = raw_desc.rsplit('(', 1)[0] + + if desc == '': + desc = '(No description found)' + + descs[name.strip()] = desc.strip() + + return descs + + +def cleanup_line(single_line): + """Format a raw line of data from /proc and isolate the name value + part, returning a tuple with each. Currently, this gets rid of the + middle '4'. For example "arc_no_grow 4 0" returns the tuple + ("arc_no_grow", "0"). + """ + name, _, value = single_line.split() + + return name, value + + +def draw_graph(kstats_dict): + """Draw a primitive graph representing the basic information on the + ARC -- its size and the proportion used by MFU and MRU -- and quit. + We use max size of the ARC to calculate how full it is. This is a + very rough representation. + """ + + arc_stats = isolate_section('arcstats', kstats_dict) + + GRAPH_INDENT = ' '*4 + GRAPH_WIDTH = 60 + arc_size = f_bytes(arc_stats['size']) + arc_perc = f_perc(arc_stats['size'], arc_stats['c_max']) + mfu_size = f_bytes(arc_stats['mfu_size']) + mru_size = f_bytes(arc_stats['mru_size']) + meta_limit = f_bytes(arc_stats['arc_meta_limit']) + meta_size = f_bytes(arc_stats['arc_meta_used']) + dnode_limit = f_bytes(arc_stats['arc_dnode_limit']) + dnode_size = f_bytes(arc_stats['dnode_size']) + + info_form = ('ARC: {0} ({1}) MFU: {2} MRU: {3} META: {4} ({5}) ' + 'DNODE {6} ({7})') + info_line = info_form.format(arc_size, arc_perc, mfu_size, mru_size, + meta_size, meta_limit, dnode_size, + dnode_limit) + info_spc = ' '*int((GRAPH_WIDTH-len(info_line))/2) + info_line = GRAPH_INDENT+info_spc+info_line + + graph_line = GRAPH_INDENT+'+'+('-'*(GRAPH_WIDTH-2))+'+' + + mfu_perc = float(int(arc_stats['mfu_size'])/int(arc_stats['c_max'])) + mru_perc = float(int(arc_stats['mru_size'])/int(arc_stats['c_max'])) + arc_perc = float(int(arc_stats['size'])/int(arc_stats['c_max'])) + total_ticks = float(arc_perc)*GRAPH_WIDTH + mfu_ticks = mfu_perc*GRAPH_WIDTH + mru_ticks = mru_perc*GRAPH_WIDTH + other_ticks = total_ticks-(mfu_ticks+mru_ticks) + + core_form = 'F'*int(mfu_ticks)+'R'*int(mru_ticks)+'O'*int(other_ticks) + core_spc = ' '*(GRAPH_WIDTH-(2+len(core_form))) + core_line = GRAPH_INDENT+'|'+core_form+core_spc+'|' + + for line in ('', info_line, graph_line, core_line, graph_line, ''): + print(line) + + +def f_bytes(byte_string): + """Return human-readable representation of a byte value in + powers of 2 (eg "KiB" for "kibibytes", etc) to two decimal + points. Values smaller than one KiB are returned without + decimal points. Note "bytes" is a reserved keyword. 
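+ For illustration: f_bytes(1536) returns '1.5 KiB' (the format
+ string below rounds to one decimal place) and f_bytes(512)
+ returns '512 Bytes'.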
+ """ + + prefixes = ([2**80, "YiB"], # yobibytes (yotta) + [2**70, "ZiB"], # zebibytes (zetta) + [2**60, "EiB"], # exbibytes (exa) + [2**50, "PiB"], # pebibytes (peta) + [2**40, "TiB"], # tebibytes (tera) + [2**30, "GiB"], # gibibytes (giga) + [2**20, "MiB"], # mebibytes (mega) + [2**10, "KiB"]) # kibibytes (kilo) + + bites = int(byte_string) + + if bites >= 2**10: + for limit, unit in prefixes: + + if bites >= limit: + value = bites / limit + break + + result = '{0:.1f} {1}'.format(value, unit) + else: + result = '{0} Bytes'.format(bites) + + return result + + +def f_hits(hits_string): + """Create a human-readable representation of the number of hits. + The single-letter symbols used are SI to avoid the confusion caused + by the different "short scale" and "long scale" representations in + English, which use the same words for different values. See + https://en.wikipedia.org/wiki/Names_of_large_numbers and: + https://physics.nist.gov/cuu/Units/prefixes.html + """ + + numbers = ([10**24, 'Y'], # yotta (septillion) + [10**21, 'Z'], # zetta (sextillion) + [10**18, 'E'], # exa (quintrillion) + [10**15, 'P'], # peta (quadrillion) + [10**12, 'T'], # tera (trillion) + [10**9, 'G'], # giga (billion) + [10**6, 'M'], # mega (million) + [10**3, 'k']) # kilo (thousand) + + hits = int(hits_string) + + if hits >= 1000: + for limit, symbol in numbers: + + if hits >= limit: + value = hits/limit + break + + result = "%0.1f%s" % (value, symbol) + else: + result = "%d" % hits + + return result + + +def f_perc(value1, value2): + """Calculate percentage and return in human-readable form. If + rounding produces the result '0.0' though the first number is + not zero, include a 'less-than' symbol to avoid confusion. + Division by zero is handled by returning 'n/a'; no error + is called. + """ + + v1 = float(value1) + v2 = float(value2) + + try: + perc = 100 * v1/v2 + except ZeroDivisionError: + result = 'n/a' + else: + result = '{0:0.1f} %'.format(perc) + + if result == '0.0 %' and v1 > 0: + result = '< 0.1 %' + + return result + + +def format_raw_line(name, value): + """For the --raw option for the tunable and SPL outputs, decide on the + correct formatting based on the --alternate flag. + """ + + if ARGS.alt: + result = '{0}{1}={2}'.format(INDENT, name, value) + else: + spc = LINE_LENGTH-(len(INDENT)+len(value)) + result = '{0}{1:<{spc}}{2}'.format(INDENT, name, value, spc=spc) + + return result + + +def get_kstats(): + """Collect information on the ZFS subsystem. The step does not perform any + further processing, giving us the option to only work on what is actually + needed. The name "kstat" is a holdover from the Solaris utility of the same + name. + """ + + result = {} + + for section in SECTION_PATHS.values(): + if section not in result: + result[section] = load_kstats(section) + + return result + + +def get_version(request): + """Get the version number of ZFS or SPL on this machine for header. + Returns an error string, but does not raise an error, if we can't + get the ZFS/SPL version. + """ + + if request not in ('spl', 'zfs'): + error_msg = '(ERROR: "{0}" requested)'.format(request) + return error_msg + + return get_version_impl(request) + + +def print_header(): + """Print the initial heading with date and time as well as info on the + kernel and ZFS versions. This is not called for the graph. 
+ """ + + # datetime is now recommended over time but we keep the exact formatting + # from the older version of arc_summary in case there are scripts + # that expect it in this way + daydate = time.strftime(DATE_FORMAT) + spc_date = LINE_LENGTH-len(daydate) + sys_version = os.uname() + + sys_msg = sys_version.sysname+' '+sys_version.release + zfs = get_version('zfs') + spc_zfs = LINE_LENGTH-len(zfs) + + machine_msg = 'Machine: '+sys_version.nodename+' ('+sys_version.machine+')' + spl = get_version('spl') + spc_spl = LINE_LENGTH-len(spl) + + print('\n'+('-'*LINE_LENGTH)) + print('{0:<{spc}}{1}'.format(TITLE, daydate, spc=spc_date)) + print('{0:<{spc}}{1}'.format(sys_msg, zfs, spc=spc_zfs)) + print('{0:<{spc}}{1}\n'.format(machine_msg, spl, spc=spc_spl)) + + +def print_raw(kstats_dict): + """Print all available data from the system in a minimally sorted format. + This can be used as a source to be piped through 'grep'. + """ + + sections = sorted(kstats_dict.keys()) + + for section in sections: + + print('\n{0}:'.format(section.upper())) + lines = sorted(kstats_dict[section]) + + for line in lines: + name, value = cleanup_line(line) + print(format_raw_line(name, value)) + + # Tunables and SPL must be handled separately because they come from a + # different source and have descriptions the user might request + print() + section_spl() + section_tunables() + + +def isolate_section(section_name, kstats_dict): + """From the complete information on all sections, retrieve only those + for one section. + """ + + try: + section_data = kstats_dict[section_name] + except KeyError: + print('ERROR: Data on {0} not available'.format(section_data)) + sys.exit(1) + + section_dict = dict(cleanup_line(l) for l in section_data) + + return section_dict + + +# Formatted output helper functions + + +def prt_1(text, value): + """Print text and one value, no indent""" + spc = ' '*(LINE_LENGTH-(len(text)+len(value))) + print('{0}{spc}{1}'.format(text, value, spc=spc)) + + +def prt_i1(text, value): + """Print text and one value, with indent""" + spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(value))) + print(INDENT+'{0}{spc}{1}'.format(text, value, spc=spc)) + + +def prt_2(text, value1, value2): + """Print text and two values, no indent""" + values = '{0:>9} {1:>9}'.format(value1, value2) + spc = ' '*(LINE_LENGTH-(len(text)+len(values)+2)) + print('{0}{spc} {1}'.format(text, values, spc=spc)) + + +def prt_i2(text, value1, value2): + """Print text and two values, with indent""" + values = '{0:>9} {1:>9}'.format(value1, value2) + spc = ' '*(LINE_LENGTH-(len(INDENT)+len(text)+len(values)+2)) + print(INDENT+'{0}{spc} {1}'.format(text, values, spc=spc)) + + +# The section output concentrates on important parameters instead of +# being exhaustive (that is what the --raw parameter is for) + + +def section_arc(kstats_dict): + """Give basic information on the ARC, MRU and MFU. This is the first + and most used section. 
+ """ + + arc_stats = isolate_section('arcstats', kstats_dict) + + throttle = arc_stats['memory_throttle_count'] + + if throttle == '0': + health = 'HEALTHY' + else: + health = 'THROTTLED' + + prt_1('ARC status:', health) + prt_i1('Memory throttle count:', throttle) + print() + + arc_size = arc_stats['size'] + arc_target_size = arc_stats['c'] + arc_max = arc_stats['c_max'] + arc_min = arc_stats['c_min'] + mfu_size = arc_stats['mfu_size'] + mru_size = arc_stats['mru_size'] + meta_limit = arc_stats['arc_meta_limit'] + meta_size = arc_stats['arc_meta_used'] + dnode_limit = arc_stats['arc_dnode_limit'] + dnode_size = arc_stats['dnode_size'] + target_size_ratio = '{0}:1'.format(int(arc_max) // int(arc_min)) + + prt_2('ARC size (current):', + f_perc(arc_size, arc_max), f_bytes(arc_size)) + prt_i2('Target size (adaptive):', + f_perc(arc_target_size, arc_max), f_bytes(arc_target_size)) + prt_i2('Min size (hard limit):', + f_perc(arc_min, arc_max), f_bytes(arc_min)) + prt_i2('Max size (high water):', + target_size_ratio, f_bytes(arc_max)) + caches_size = int(mfu_size)+int(mru_size) + prt_i2('Most Frequently Used (MFU) cache size:', + f_perc(mfu_size, caches_size), f_bytes(mfu_size)) + prt_i2('Most Recently Used (MRU) cache size:', + f_perc(mru_size, caches_size), f_bytes(mru_size)) + prt_i2('Metadata cache size (hard limit):', + f_perc(meta_limit, arc_max), f_bytes(meta_limit)) + prt_i2('Metadata cache size (current):', + f_perc(meta_size, meta_limit), f_bytes(meta_size)) + prt_i2('Dnode cache size (hard limit):', + f_perc(dnode_limit, meta_limit), f_bytes(dnode_limit)) + prt_i2('Dnode cache size (current):', + f_perc(dnode_size, dnode_limit), f_bytes(dnode_size)) + print() + + print('ARC hash breakdown:') + prt_i1('Elements max:', f_hits(arc_stats['hash_elements_max'])) + prt_i2('Elements current:', + f_perc(arc_stats['hash_elements'], arc_stats['hash_elements_max']), + f_hits(arc_stats['hash_elements'])) + prt_i1('Collisions:', f_hits(arc_stats['hash_collisions'])) + + prt_i1('Chain max:', f_hits(arc_stats['hash_chain_max'])) + prt_i1('Chains:', f_hits(arc_stats['hash_chains'])) + print() + + print('ARC misc:') + prt_i1('Deleted:', f_hits(arc_stats['deleted'])) + prt_i1('Mutex misses:', f_hits(arc_stats['mutex_miss'])) + prt_i1('Eviction skips:', f_hits(arc_stats['evict_skip'])) + print() + + +def section_archits(kstats_dict): + """Print information on how the caches are accessed ("arc hits"). 
+ """ + + arc_stats = isolate_section('arcstats', kstats_dict) + all_accesses = int(arc_stats['hits'])+int(arc_stats['misses']) + actual_hits = int(arc_stats['mfu_hits'])+int(arc_stats['mru_hits']) + + prt_1('ARC total accesses (hits + misses):', f_hits(all_accesses)) + ta_todo = (('Cache hit ratio:', arc_stats['hits']), + ('Cache miss ratio:', arc_stats['misses']), + ('Actual hit ratio (MFU + MRU hits):', actual_hits)) + + for title, value in ta_todo: + prt_i2(title, f_perc(value, all_accesses), f_hits(value)) + + dd_total = int(arc_stats['demand_data_hits']) +\ + int(arc_stats['demand_data_misses']) + prt_i2('Data demand efficiency:', + f_perc(arc_stats['demand_data_hits'], dd_total), + f_hits(dd_total)) + + dp_total = int(arc_stats['prefetch_data_hits']) +\ + int(arc_stats['prefetch_data_misses']) + prt_i2('Data prefetch efficiency:', + f_perc(arc_stats['prefetch_data_hits'], dp_total), + f_hits(dp_total)) + + known_hits = int(arc_stats['mfu_hits']) +\ + int(arc_stats['mru_hits']) +\ + int(arc_stats['mfu_ghost_hits']) +\ + int(arc_stats['mru_ghost_hits']) + + anon_hits = int(arc_stats['hits'])-known_hits + + print() + print('Cache hits by cache type:') + cl_todo = (('Most frequently used (MFU):', arc_stats['mfu_hits']), + ('Most recently used (MRU):', arc_stats['mru_hits']), + ('Most frequently used (MFU) ghost:', + arc_stats['mfu_ghost_hits']), + ('Most recently used (MRU) ghost:', + arc_stats['mru_ghost_hits'])) + + for title, value in cl_todo: + prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) + + # For some reason, anon_hits can turn negative, which is weird. Until we + # have figured out why this happens, we just hide the problem, following + # the behavior of the original arc_summary. + if anon_hits >= 0: + prt_i2('Anonymously used:', + f_perc(anon_hits, arc_stats['hits']), f_hits(anon_hits)) + + print() + print('Cache hits by data type:') + dt_todo = (('Demand data:', arc_stats['demand_data_hits']), + ('Demand prefetch data:', arc_stats['prefetch_data_hits']), + ('Demand metadata:', arc_stats['demand_metadata_hits']), + ('Demand prefetch metadata:', + arc_stats['prefetch_metadata_hits'])) + + for title, value in dt_todo: + prt_i2(title, f_perc(value, arc_stats['hits']), f_hits(value)) + + print() + print('Cache misses by data type:') + dm_todo = (('Demand data:', arc_stats['demand_data_misses']), + ('Demand prefetch data:', + arc_stats['prefetch_data_misses']), + ('Demand metadata:', arc_stats['demand_metadata_misses']), + ('Demand prefetch metadata:', + arc_stats['prefetch_metadata_misses'])) + + for title, value in dm_todo: + prt_i2(title, f_perc(value, arc_stats['misses']), f_hits(value)) + + print() + + +def section_dmu(kstats_dict): + """Collect information on the DMU""" + + zfetch_stats = isolate_section('zfetchstats', kstats_dict) + + zfetch_access_total = int(zfetch_stats['hits'])+int(zfetch_stats['misses']) + + prt_1('DMU prefetch efficiency:', f_hits(zfetch_access_total)) + prt_i2('Hit ratio:', f_perc(zfetch_stats['hits'], zfetch_access_total), + f_hits(zfetch_stats['hits'])) + prt_i2('Miss ratio:', f_perc(zfetch_stats['misses'], zfetch_access_total), + f_hits(zfetch_stats['misses'])) + print() + + +def section_l2arc(kstats_dict): + """Collect information on L2ARC device if present. If not, tell user + that we're skipping the section. 
+ """ + + # The L2ARC statistics live in the same section as the normal ARC stuff + arc_stats = isolate_section('arcstats', kstats_dict) + + if arc_stats['l2_size'] == '0': + print('L2ARC not detected, skipping section\n') + return + + l2_errors = int(arc_stats['l2_writes_error']) +\ + int(arc_stats['l2_cksum_bad']) +\ + int(arc_stats['l2_io_error']) + + l2_access_total = int(arc_stats['l2_hits'])+int(arc_stats['l2_misses']) + health = 'HEALTHY' + + if l2_errors > 0: + health = 'DEGRADED' + + prt_1('L2ARC status:', health) + + l2_todo = (('Low memory aborts:', 'l2_abort_lowmem'), + ('Free on write:', 'l2_free_on_write'), + ('R/W clashes:', 'l2_rw_clash'), + ('Bad checksums:', 'l2_cksum_bad'), + ('I/O errors:', 'l2_io_error')) + + for title, value in l2_todo: + prt_i1(title, f_hits(arc_stats[value])) + + print() + prt_1('L2ARC size (adaptive):', f_bytes(arc_stats['l2_size'])) + prt_i2('Compressed:', f_perc(arc_stats['l2_asize'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_asize'])) + prt_i2('Header size:', + f_perc(arc_stats['l2_hdr_size'], arc_stats['l2_size']), + f_bytes(arc_stats['l2_hdr_size'])) + + print() + prt_1('L2ARC breakdown:', f_hits(l2_access_total)) + prt_i2('Hit ratio:', + f_perc(arc_stats['l2_hits'], l2_access_total), + f_hits(arc_stats['l2_hits'])) + prt_i2('Miss ratio:', + f_perc(arc_stats['l2_misses'], l2_access_total), + f_hits(arc_stats['l2_misses'])) + prt_i1('Feeds:', f_hits(arc_stats['l2_feeds'])) + + print() + print('L2ARC writes:') + + if arc_stats['l2_writes_done'] != arc_stats['l2_writes_sent']: + prt_i2('Writes sent:', 'FAULTED', f_hits(arc_stats['l2_writes_sent'])) + prt_i2('Done ratio:', + f_perc(arc_stats['l2_writes_done'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_done'])) + prt_i2('Error ratio:', + f_perc(arc_stats['l2_writes_error'], + arc_stats['l2_writes_sent']), + f_hits(arc_stats['l2_writes_error'])) + else: + prt_i2('Writes sent:', '100 %', f_hits(arc_stats['l2_writes_sent'])) + + print() + print('L2ARC evicts:') + prt_i1('Lock retries:', f_hits(arc_stats['l2_evict_lock_retry'])) + prt_i1('Upon reading:', f_hits(arc_stats['l2_evict_reading'])) + print() + + +def section_spl(*_): + """Print the SPL parameters, if requested with alternative format + and/or descriptions. This does not use kstats. + """ + + if sys.platform.startswith('freebsd'): + # No SPL support in FreeBSD + return + + spls = get_spl_params() + keylist = sorted(spls.keys()) + print('Solaris Porting Layer (SPL):') + + if ARGS.desc: + descriptions = get_descriptions('spl') + + for key in keylist: + value = spls[key] + + if ARGS.desc: + try: + print(INDENT+'#', descriptions[key]) + except KeyError: + print(INDENT+'# (No description found)') # paranoid + + print(format_raw_line(key, value)) + + print() + + +def section_tunables(*_): + """Print the tunables, if requested with alternative format and/or + descriptions. This does not use kstasts. + """ + + tunables = get_tunable_params() + keylist = sorted(tunables.keys()) + print('Tunables:') + + if ARGS.desc: + descriptions = get_descriptions('zfs') + + for key in keylist: + value = tunables[key] + + if ARGS.desc: + try: + print(INDENT+'#', descriptions[key]) + except KeyError: + print(INDENT+'# (No description found)') # paranoid + + print(format_raw_line(key, value)) + + print() + + +def section_vdev(kstats_dict): + """Collect information on VDEV caches""" + + # Currently [Nov 2017] the VDEV cache is disabled, because it is actually + # harmful. When this is the case, we just skip the whole entry. 
See + # https://github.com/zfsonlinux/zfs/blob/master/module/zfs/vdev_cache.c + # for details + tunables = get_vdev_params() + + if tunables[VDEV_CACHE_SIZE] == '0': + print('VDEV cache disabled, skipping section\n') + return + + vdev_stats = isolate_section('vdev_cache_stats', kstats_dict) + + vdev_cache_total = int(vdev_stats['hits']) +\ + int(vdev_stats['misses']) +\ + int(vdev_stats['delegations']) + + prt_1('VDEV cache summary:', f_hits(vdev_cache_total)) + prt_i2('Hit ratio:', f_perc(vdev_stats['hits'], vdev_cache_total), + f_hits(vdev_stats['hits'])) + prt_i2('Miss ratio:', f_perc(vdev_stats['misses'], vdev_cache_total), + f_hits(vdev_stats['misses'])) + prt_i2('Delegations:', f_perc(vdev_stats['delegations'], vdev_cache_total), + f_hits(vdev_stats['delegations'])) + print() + + +def section_zil(kstats_dict): + """Collect information on the ZFS Intent Log. Some of the information + taken from https://github.com/zfsonlinux/zfs/blob/master/include/sys/zil.h + """ + + zil_stats = isolate_section('zil', kstats_dict) + + prt_1('ZIL committed transactions:', + f_hits(zil_stats['zil_itx_count'])) + prt_i1('Commit requests:', f_hits(zil_stats['zil_commit_count'])) + prt_i1('Flushes to stable storage:', + f_hits(zil_stats['zil_commit_writer_count'])) + prt_i2('Transactions to SLOG storage pool:', + f_bytes(zil_stats['zil_itx_metaslab_slog_bytes']), + f_hits(zil_stats['zil_itx_metaslab_slog_count'])) + prt_i2('Transactions to non-SLOG storage pool:', + f_bytes(zil_stats['zil_itx_metaslab_normal_bytes']), + f_hits(zil_stats['zil_itx_metaslab_normal_count'])) + print() + + +section_calls = {'arc': section_arc, + 'archits': section_archits, + 'dmu': section_dmu, + 'l2arc': section_l2arc, + 'spl': section_spl, + 'tunables': section_tunables, + 'vdev': section_vdev, + 'zil': section_zil} + + +def main(): + """Run program. The options to draw a graph and to print all data raw are + treated separately because they come with their own call. + """ + + kstats = get_kstats() + + if ARGS.graph: + draw_graph(kstats) + sys.exit(0) + + print_header() + + if ARGS.raw: + print_raw(kstats) + + elif ARGS.section: + + try: + section_calls[ARGS.section](kstats) + except KeyError: + print('Error: Section "{0}" unknown'.format(ARGS.section)) + sys.exit(1) + + elif ARGS.page: + print('WARNING: Pages are deprecated, please use "--section"\n') + + pages_to_calls = {1: 'arc', + 2: 'archits', + 3: 'l2arc', + 4: 'dmu', + 5: 'vdev', + 6: 'tunables'} + + try: + call = pages_to_calls[ARGS.page] + except KeyError: + print('Error: Page "{0}" not supported'.format(ARGS.page)) + sys.exit(1) + else: + section_calls[call](kstats) + + else: + # If no parameters were given, we print all sections. 
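+ # (sorted() walks section_calls alphabetically: arc, archits,
+ # dmu, l2arc, spl, tunables, vdev, zil.)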
We might want to + # change the sequence by hand + calls = sorted(section_calls.keys()) + + for section in calls: + section_calls[section](kstats) + + sys.exit(0) + + +if __name__ == '__main__': + main() diff --git a/cmd/arcstat/.gitignore b/cmd/arcstat/.gitignore new file mode 100644 index 000000000000..6d6cd1ab75fc --- /dev/null +++ b/cmd/arcstat/.gitignore @@ -0,0 +1 @@ +arcstat diff --git a/cmd/arcstat/Makefile.am b/cmd/arcstat/Makefile.am new file mode 100644 index 000000000000..d1ba989a0cd8 --- /dev/null +++ b/cmd/arcstat/Makefile.am @@ -0,0 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + +bin_SCRIPTS = arcstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/cmd/arcstat/arcstat.in b/cmd/arcstat/arcstat.in new file mode 100755 index 000000000000..c83a1c74599e --- /dev/null +++ b/cmd/arcstat/arcstat.in @@ -0,0 +1,494 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ +# +# Print out ZFS ARC Statistics exported via kstat(1) +# For a definition of fields, or usage, use arcstat -v +# +# This script was originally a fork of the original arcstat.pl (0.1) +# by Neelakanth Nadgir, originally published on his Sun blog on +# 09/18/2007 +# http://blogs.sun.com/realneel/entry/zfs_arc_statistics +# +# A new version aimed to improve upon the original by adding features +# and fixing bugs as needed. This version was maintained by Mike +# Harsch and was hosted in a public open source repository: +# http://github.com/mharsch/arcstat +# +# but has since moved to the illumos-gate repository. +# +# This Python port was written by John Hixson for FreeNAS, introduced +# in commit e2c29f: +# https://github.com/freenas/freenas +# +# and has been improved by many people since. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# +# Fields have a fixed width. Every interval, we fill the "v" +# hash with its corresponding value (v[field]=value) using calculate(). +# @hdr is the array of fields that needs to be printed, so we +# just iterate over this array and print the values using our pretty printer. +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. 
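+# In short: kstat_update() refreshes the raw kstat dict, snap_stats()
+# stores the per-interval deltas in d, calculate() turns those deltas
+# into the v values, and print_values() renders one output row.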
+# + +import sys +import time +import getopt +import re +import copy + +from signal import signal, SIGINT, SIGWINCH, SIG_DFL + + +cols = { + # HDR: [Size, Scale, Description] + "time": [8, -1, "Time"], + "hits": [4, 1000, "ARC reads per second"], + "miss": [4, 1000, "ARC misses per second"], + "read": [4, 1000, "Total ARC accesses per second"], + "hit%": [4, 100, "ARC hit percentage"], + "miss%": [5, 100, "ARC miss percentage"], + "dhit": [4, 1000, "Demand hits per second"], + "dmis": [4, 1000, "Demand misses per second"], + "dh%": [3, 100, "Demand hit percentage"], + "dm%": [3, 100, "Demand miss percentage"], + "phit": [4, 1000, "Prefetch hits per second"], + "pmis": [4, 1000, "Prefetch misses per second"], + "ph%": [3, 100, "Prefetch hits percentage"], + "pm%": [3, 100, "Prefetch miss percentage"], + "mhit": [4, 1000, "Metadata hits per second"], + "mmis": [4, 1000, "Metadata misses per second"], + "mread": [5, 1000, "Metadata accesses per second"], + "mh%": [3, 100, "Metadata hit percentage"], + "mm%": [3, 100, "Metadata miss percentage"], + "arcsz": [5, 1024, "ARC size"], + "size": [4, 1024, "ARC size"], + "c": [4, 1024, "ARC target size"], + "mfu": [4, 1000, "MFU list hits per second"], + "mru": [4, 1000, "MRU list hits per second"], + "mfug": [4, 1000, "MFU ghost list hits per second"], + "mrug": [4, 1000, "MRU ghost list hits per second"], + "eskip": [5, 1000, "evict_skip per second"], + "mtxmis": [6, 1000, "mutex_miss per second"], + "dread": [5, 1000, "Demand accesses per second"], + "pread": [5, 1000, "Prefetch accesses per second"], + "l2hits": [6, 1000, "L2ARC hits per second"], + "l2miss": [6, 1000, "L2ARC misses per second"], + "l2read": [6, 1000, "Total L2ARC accesses per second"], + "l2hit%": [6, 100, "L2ARC access hit percentage"], + "l2miss%": [7, 100, "L2ARC access miss percentage"], + "l2asize": [7, 1024, "Actual (compressed) size of the L2ARC"], + "l2size": [6, 1024, "Size of the L2ARC"], + "l2bytes": [7, 1024, "Bytes read per second from the L2ARC"], + "grow": [4, 1000, "ARC grow disabled"], + "need": [4, 1024, "ARC reclaim need"], + "free": [4, 1024, "ARC free memory"], + "avail": [5, 1024, "ARC available memory"], + "waste": [5, 1024, "Wasted memory due to round up to pagesize"], +} + +v = {} +hdr = ["time", "read", "miss", "miss%", "dmis", "dm%", "pmis", "pm%", "mmis", + "mm%", "size", "c", "avail"] +xhdr = ["time", "mfu", "mru", "mfug", "mrug", "eskip", "mtxmis", "dread", + "pread", "read"] +sint = 1 # Default interval is 1 second +count = 1 # Default count is 1 +hdr_intr = 20 # Print header every 20 lines of output +opfile = None +sep = " " # Default separator is 2 spaces +version = "0.4" +l2exist = False +cmd = ("Usage: arcstat [-hvx] [-f fields] [-o file] [-s string] [interval " + "[count]]\n") +cur = {} +d = {} +out = None +kstat = None + + +if sys.platform.startswith('freebsd'): + # Requires py27-sysctl on FreeBSD + import sysctl + + def kstat_update(): + global kstat + + k = sysctl.filter('kstat.zfs.misc.arcstats') + + if not k: + sys.exit(1) + + kstat = {} + + for s in k: + if not s: + continue + + name, value = s.name, s.value + # Trims 'kstat.zfs.misc.arcstats' from the name + kstat[name[24:]] = int(value) + +elif sys.platform.startswith('linux'): + def kstat_update(): + global kstat + + k = [line.strip() for line in open('/proc/spl/kstat/zfs/arcstats')] + + if not k: + sys.exit(1) + + del k[0:2] + kstat = {} + + for s in k: + if not s: + continue + + name, unused, value = s.split() + kstat[name] = int(value) + + +def detailed_usage(): + 
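+ """List every column with its description, then exit (the -v path)."""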
sys.stderr.write("%s\n" % cmd) + sys.stderr.write("Field definitions are as follows:\n") + for key in cols: + sys.stderr.write("%11s : %s\n" % (key, cols[key][2])) + sys.stderr.write("\n") + + sys.exit(0) + + +def usage(): + sys.stderr.write("%s\n" % cmd) + sys.stderr.write("\t -h : Print this help message\n") + sys.stderr.write("\t -v : List all possible field headers and definitions" + "\n") + sys.stderr.write("\t -x : Print extended stats\n") + sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n") + sys.stderr.write("\t -o : Redirect output to the specified file\n") + sys.stderr.write("\t -s : Override default field separator with custom " + "character or string\n") + sys.stderr.write("\nExamples:\n") + sys.stderr.write("\tarcstat -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat -s \",\" -o /tmp/a.log 2 10\n") + sys.stderr.write("\tarcstat -v\n") + sys.stderr.write("\tarcstat -f time,hit%,dh%,ph%,mh% 1\n") + sys.stderr.write("\n") + + sys.exit(1) + + +def snap_stats(): + global cur + global kstat + + prev = copy.deepcopy(cur) + kstat_update() + + cur = kstat + for key in cur: + if re.match(key, "class"): + continue + if key in prev: + d[key] = cur[key] - prev[key] + else: + d[key] = cur[key] + + +def prettynum(sz, scale, num=0): + suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z'] + index = 0 + save = 0 + + # Special case for date field + if scale == -1: + return "%s" % num + + # Rounding error, return 0 + elif 0 < num < 1: + num = 0 + + while abs(num) > scale and index < 5: + save = num + num = num / scale + index += 1 + + if index == 0: + return "%*d" % (sz, num) + + if abs(save / scale) < 10: + return "%*.1f%s" % (sz - 1, num, suffix[index]) + else: + return "%*d%s" % (sz - 1, num, suffix[index]) + + +def print_values(): + global hdr + global sep + global v + + sys.stdout.write(sep.join( + prettynum(cols[col][0], cols[col][1], v[col]) for col in hdr)) + + sys.stdout.write("\n") + sys.stdout.flush() + + +def print_header(): + global hdr + global sep + + sys.stdout.write(sep.join("%*s" % (cols[col][0], col) for col in hdr)) + + sys.stdout.write("\n") + + +def get_terminal_lines(): + try: + import fcntl + import termios + import struct + data = fcntl.ioctl(sys.stdout.fileno(), termios.TIOCGWINSZ, '1234') + sz = struct.unpack('hh', data) + return sz[0] + except Exception: + pass + + +def update_hdr_intr(): + global hdr_intr + + lines = get_terminal_lines() + if lines and lines > 3: + hdr_intr = lines - 3 + + +def resize_handler(signum, frame): + update_hdr_intr() + + +def init(): + global sint + global count + global hdr + global xhdr + global opfile + global sep + global out + global l2exist + + desired_cols = None + xflag = False + hflag = False + vflag = False + i = 1 + + try: + opts, args = getopt.getopt( + sys.argv[1:], + "xo:hvs:f:", + [ + "extended", + "outfile", + "help", + "verbose", + "separator", + "columns" + ] + ) + except getopt.error as msg: + sys.stderr.write("Error: %s\n" % str(msg)) + usage() + opts = None + + for opt, arg in opts: + if opt in ('-x', '--extended'): + xflag = True + if opt in ('-o', '--outfile'): + opfile = arg + i += 1 + if opt in ('-h', '--help'): + hflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-s', '--separator'): + sep = arg + i += 1 + if opt in ('-f', '--columns'): + desired_cols = arg + i += 1 + i += 1 + + argv = sys.argv[i:] + sint = int(argv[0]) if argv else sint + count = int(argv[1]) if len(argv) > 1 else (0 if len(argv) > 0 else 1) + + if hflag or (xflag and desired_cols): + usage() + + if 
vflag: + detailed_usage() + + if xflag: + hdr = xhdr + + update_hdr_intr() + + # check if L2ARC exists + snap_stats() + l2_size = cur.get("l2_size") + if l2_size: + l2exist = True + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif not l2exist and ele.startswith("l2"): + sys.stdout.write("No L2ARC Here\n%s\n" % ele) + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! -- %s\n" % + incompat) + usage() + + if opfile: + try: + out = open(opfile, "w") + sys.stdout = out + + except IOError: + sys.stderr.write("Cannot open %s for writing\n" % opfile) + sys.exit(1) + + +def calculate(): + global d + global v + global l2exist + + v = dict() + v["time"] = time.strftime("%H:%M:%S", time.localtime()) + v["hits"] = d["hits"] / sint + v["miss"] = d["misses"] / sint + v["read"] = v["hits"] + v["miss"] + v["hit%"] = 100 * v["hits"] / v["read"] if v["read"] > 0 else 0 + v["miss%"] = 100 - v["hit%"] if v["read"] > 0 else 0 + + v["dhit"] = (d["demand_data_hits"] + d["demand_metadata_hits"]) / sint + v["dmis"] = (d["demand_data_misses"] + d["demand_metadata_misses"]) / sint + + v["dread"] = v["dhit"] + v["dmis"] + v["dh%"] = 100 * v["dhit"] / v["dread"] if v["dread"] > 0 else 0 + v["dm%"] = 100 - v["dh%"] if v["dread"] > 0 else 0 + + v["phit"] = (d["prefetch_data_hits"] + d["prefetch_metadata_hits"]) / sint + v["pmis"] = (d["prefetch_data_misses"] + + d["prefetch_metadata_misses"]) / sint + + v["pread"] = v["phit"] + v["pmis"] + v["ph%"] = 100 * v["phit"] / v["pread"] if v["pread"] > 0 else 0 + v["pm%"] = 100 - v["ph%"] if v["pread"] > 0 else 0 + + v["mhit"] = (d["prefetch_metadata_hits"] + + d["demand_metadata_hits"]) / sint + v["mmis"] = (d["prefetch_metadata_misses"] + + d["demand_metadata_misses"]) / sint + + v["mread"] = v["mhit"] + v["mmis"] + v["mh%"] = 100 * v["mhit"] / v["mread"] if v["mread"] > 0 else 0 + v["mm%"] = 100 - v["mh%"] if v["mread"] > 0 else 0 + + v["arcsz"] = cur["size"] + v["size"] = cur["size"] + v["c"] = cur["c"] + v["mfu"] = d["mfu_hits"] / sint + v["mru"] = d["mru_hits"] / sint + v["mrug"] = d["mru_ghost_hits"] / sint + v["mfug"] = d["mfu_ghost_hits"] / sint + v["eskip"] = d["evict_skip"] / sint + v["mtxmis"] = d["mutex_miss"] / sint + + if l2exist: + v["l2hits"] = d["l2_hits"] / sint + v["l2miss"] = d["l2_misses"] / sint + v["l2read"] = v["l2hits"] + v["l2miss"] + v["l2hit%"] = 100 * v["l2hits"] / v["l2read"] if v["l2read"] > 0 else 0 + + v["l2miss%"] = 100 - v["l2hit%"] if v["l2read"] > 0 else 0 + v["l2asize"] = cur["l2_asize"] + v["l2size"] = cur["l2_size"] + v["l2bytes"] = d["l2_read_bytes"] / sint + + v["grow"] = 0 if cur["arc_no_grow"] else 1 + v["need"] = cur["arc_need_free"] + v["free"] = cur["memory_free_bytes"] + v["avail"] = cur["memory_available_bytes"] + v["waste"] = cur["abd_chunk_waste_size"] + + +def main(): + global sint + global count + global hdr_intr + + i = 0 + count_flag = 0 + + init() + if count > 0: + count_flag = 1 + + signal(SIGINT, SIG_DFL) + signal(SIGWINCH, resize_handler) + while True: + if i == 0: + print_header() + + snap_stats() + calculate() + print_values() + + if count_flag == 1: + if count <= 1: + break + count -= 1 + + i = 0 if i >= hdr_intr else i + 1 + time.sleep(sint) + + if out: + out.close() + + +if __name__ == '__main__': + main() diff --git a/cmd/dbufstat/.gitignore 
b/cmd/dbufstat/.gitignore new file mode 100644 index 000000000000..2c2e913cef70 --- /dev/null +++ b/cmd/dbufstat/.gitignore @@ -0,0 +1 @@ +dbufstat diff --git a/cmd/dbufstat/Makefile.am b/cmd/dbufstat/Makefile.am new file mode 100644 index 000000000000..e672a01a4227 --- /dev/null +++ b/cmd/dbufstat/Makefile.am @@ -0,0 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + +bin_SCRIPTS = dbufstat + +SUBSTFILES += $(bin_SCRIPTS) diff --git a/cmd/dbufstat/dbufstat.in b/cmd/dbufstat/dbufstat.in new file mode 100755 index 000000000000..98eb79057388 --- /dev/null +++ b/cmd/dbufstat/dbufstat.in @@ -0,0 +1,669 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ +# +# Print out statistics for all cached dmu buffers. This information +# is available through the dbufs kstat and may be post-processed as +# needed by the script. +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright (C) 2013 Lawrence Livermore National Security, LLC. +# Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. 
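+#
+# Illustrative invocations (see usage() below for the full option list;
+# the snapshot file name is hypothetical):
+#
+#   dbufstat -d -f pool,object,dtype,cached   # one line per dnode
+#   dbufstat -b -i /tmp/dbufs.snap            # post-process a saved kstat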
+# + +import sys +import getopt +import errno +import re + +bhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize"] +bxhdr = ["pool", "objset", "object", "level", "blkid", "offset", "dbsize", + "meta", "state", "dbholds", "dbc", "list", "atype", "flags", + "count", "asize", "access", "mru", "gmru", "mfu", "gmfu", "l2", + "l2_dattr", "l2_asize", "l2_comp", "aholds", "dtype", "btype", + "data_bs", "meta_bs", "bsize", "lvls", "dholds", "blocks", "dsize"] +bincompat = ["cached", "direct", "indirect", "bonus", "spill"] + +dhdr = ["pool", "objset", "object", "dtype", "cached"] +dxhdr = ["pool", "objset", "object", "dtype", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize", "cached", "direct", + "indirect", "bonus", "spill"] +dincompat = ["level", "blkid", "offset", "dbsize", "meta", "state", "dbholds", + "dbc", "list", "atype", "flags", "count", "asize", "access", + "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize", + "l2_comp", "aholds"] + +thdr = ["pool", "objset", "dtype", "cached"] +txhdr = ["pool", "objset", "dtype", "cached", "direct", "indirect", + "bonus", "spill"] +tincompat = ["object", "level", "blkid", "offset", "dbsize", "meta", "state", + "dbc", "dbholds", "list", "atype", "flags", "count", "asize", + "access", "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", + "l2_asize", "l2_comp", "aholds", "btype", "data_bs", "meta_bs", + "bsize", "lvls", "dholds", "blocks", "dsize"] + +cols = { + # hdr: [size, scale, description] + "pool": [15, -1, "pool name"], + "objset": [6, -1, "dataset identification number"], + "object": [10, -1, "object number"], + "level": [5, -1, "indirection level of buffer"], + "blkid": [8, -1, "block number of buffer"], + "offset": [12, 1024, "offset in object of buffer"], + "dbsize": [7, 1024, "size of buffer"], + "meta": [4, -1, "is this buffer metadata?"], + "state": [5, -1, "state of buffer (read, cached, etc)"], + "dbholds": [7, 1000, "number of holds on buffer"], + "dbc": [3, -1, "in dbuf cache"], + "list": [4, -1, "which ARC list contains this buffer"], + "atype": [7, -1, "ARC header type (data or metadata)"], + "flags": [9, -1, "ARC read flags"], + "count": [5, -1, "ARC data count"], + "asize": [7, 1024, "size of this ARC buffer"], + "access": [10, -1, "time this ARC buffer was last accessed"], + "mru": [5, 1000, "hits while on the ARC's MRU list"], + "gmru": [5, 1000, "hits while on the ARC's MRU ghost list"], + "mfu": [5, 1000, "hits while on the ARC's MFU list"], + "gmfu": [5, 1000, "hits while on the ARC's MFU ghost list"], + "l2": [5, 1000, "hits while on the L2ARC"], + "l2_dattr": [8, -1, "L2ARC disk address/offset"], + "l2_asize": [8, 1024, "L2ARC alloc'd size (depending on compression)"], + "l2_comp": [21, -1, "L2ARC compression algorithm for buffer"], + "aholds": [6, 1000, "number of holds on this ARC buffer"], + "dtype": [27, -1, "dnode type"], + "btype": [27, -1, "bonus buffer type"], + "data_bs": [7, 1024, "data block size"], + "meta_bs": [7, 1024, "metadata block size"], + "bsize": [6, 1024, "bonus buffer size"], + "lvls": [6, -1, "number of indirection levels"], + "dholds": [6, 1000, "number of holds on dnode"], + "blocks": [8, 1000, "number of allocated blocks"], + "dsize": [12, 1024, "size of dnode"], + "cached": [6, 1024, "bytes cached for all blocks"], + "direct": [6, 1024, "bytes cached for direct blocks"], + "indirect": [8, 1024, "bytes cached for indirect blocks"], + "bonus": [5, 1024, "bytes cached for bonus buffer"], + "spill": [5, 1024, "bytes cached for spill block"], +} + +hdr = 
None
+xhdr = None
+sep = "  "  # Default separator is 2 spaces
+cmd = ("Usage: dbufstat [-bdhnrtvx] [-i file] [-f fields] [-o file] "
+       "[-s string] [-F filter]\n")
+raw = 0
+
+
+def print_incompat_helper(incompat):
+    cnt = 0
+    for key in sorted(incompat):
+        if cnt == 0:
+            sys.stderr.write("\t")
+        elif cnt > 8:
+            sys.stderr.write(",\n\t")
+            cnt = 0
+        else:
+            sys.stderr.write(", ")
+
+        sys.stderr.write("%s" % key)
+        cnt += 1
+
+    sys.stderr.write("\n\n")
+
+
+def detailed_usage():
+    sys.stderr.write("%s\n" % cmd)
+
+    sys.stderr.write("Field definitions incompatible with '-b' option:\n")
+    print_incompat_helper(bincompat)
+
+    sys.stderr.write("Field definitions incompatible with '-d' option:\n")
+    print_incompat_helper(dincompat)
+
+    sys.stderr.write("Field definitions incompatible with '-t' option:\n")
+    print_incompat_helper(tincompat)
+
+    sys.stderr.write("Field definitions are as follows:\n")
+    for key in sorted(cols.keys()):
+        sys.stderr.write("%11s : %s\n" % (key, cols[key][2]))
+    sys.stderr.write("\n")
+
+    sys.exit(0)
+
+
+def usage():
+    sys.stderr.write("%s\n" % cmd)
+    sys.stderr.write("\t -b : Print table of information for each dbuf\n")
+    sys.stderr.write("\t -d : Print table of information for each dnode\n")
+    sys.stderr.write("\t -h : Print this help message\n")
+    sys.stderr.write("\t -n : Exclude header from output\n")
+    sys.stderr.write("\t -r : Print raw values\n")
+    sys.stderr.write("\t -t : Print table of information for each dnode type"
+                     "\n")
+    sys.stderr.write("\t -v : List all possible field headers and definitions"
+                     "\n")
+    sys.stderr.write("\t -x : Print extended stats\n")
+    sys.stderr.write("\t -i : Redirect input from the specified file\n")
+    sys.stderr.write("\t -f : Specify specific fields to print (see -v)\n")
+    sys.stderr.write("\t -o : Redirect output to the specified file\n")
+    sys.stderr.write("\t -s : Override default field separator with custom "
+                     "character or string\n")
+    sys.stderr.write("\t -F : Filter output by value or regex\n")
+    sys.stderr.write("\nExamples:\n")
+    sys.stderr.write("\tdbufstat -d -o /tmp/d.log\n")
+    sys.stderr.write("\tdbufstat -t -s \",\" -o /tmp/t.log\n")
+    sys.stderr.write("\tdbufstat -v\n")
+    sys.stderr.write("\tdbufstat -d -f pool,object,objset,dsize,cached\n")
+    sys.stderr.write("\tdbufstat -bx -F dbc=1,objset=54,pool=testpool\n")
+    sys.stderr.write("\n")
+
+    sys.exit(1)
+
+
+def prettynum(sz, scale, num=0):
+    global raw
+
+    suffix = [' ', 'K', 'M', 'G', 'T', 'P', 'E', 'Z']
+    index = 0
+    save = 0
+
+    if raw or scale == -1:
+        return "%*s" % (sz, num)
+
+    # Rounding error, return 0
+    elif 0 < num < 1:
+        num = 0
+
+    while num > scale and index < 5:
+        save = num
+        num = num / scale
+        index += 1
+
+    if index == 0:
+        return "%*d" % (sz, num)
+
+    if (save / scale) < 10:
+        return "%*.1f%s" % (sz - 1, num, suffix[index])
+    else:
+        return "%*d%s" % (sz - 1, num, suffix[index])
+
+
+def print_values(v):
+    global hdr
+    global sep
+
+    try:
+        for col in hdr:
+            sys.stdout.write("%s%s" % (
+                prettynum(cols[col][0], cols[col][1], v[col]), sep))
+        sys.stdout.write("\n")
+    except IOError as e:
+        if e.errno == errno.EPIPE:
+            sys.exit(1)
+
+
+def print_header():
+    global hdr
+    global sep
+
+    try:
+        for col in hdr:
+            sys.stdout.write("%*s%s" % (cols[col][0], col, sep))
+        sys.stdout.write("\n")
+    except IOError as e:
+        if e.errno == errno.EPIPE:
+            sys.exit(1)
+
+
+def get_typestring(t):
+    ot_strings = [
+        "DMU_OT_NONE",
+        # general:
+        "DMU_OT_OBJECT_DIRECTORY",
+        "DMU_OT_OBJECT_ARRAY",
+        "DMU_OT_PACKED_NVLIST",
+        "DMU_OT_PACKED_NVLIST_SIZE",
+        
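# NOTE: get_typestring() below indexes this list by the numeric
+        # dmu_object_type_t value, so the order must match the enum;
+        # values beyond the end of the list fall back to otn_strings.
+        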
"DMU_OT_BPOBJ", + "DMU_OT_BPOBJ_HDR", + # spa: + "DMU_OT_SPACE_MAP_HEADER", + "DMU_OT_SPACE_MAP", + # zil: + "DMU_OT_INTENT_LOG", + # dmu: + "DMU_OT_DNODE", + "DMU_OT_OBJSET", + # dsl: + "DMU_OT_DSL_DIR", + "DMU_OT_DSL_DIR_CHILD_MAP", + "DMU_OT_DSL_DS_SNAP_MAP", + "DMU_OT_DSL_PROPS", + "DMU_OT_DSL_DATASET", + # zpl: + "DMU_OT_ZNODE", + "DMU_OT_OLDACL", + "DMU_OT_PLAIN_FILE_CONTENTS", + "DMU_OT_DIRECTORY_CONTENTS", + "DMU_OT_MASTER_NODE", + "DMU_OT_UNLINKED_SET", + # zvol: + "DMU_OT_ZVOL", + "DMU_OT_ZVOL_PROP", + # other; for testing only! + "DMU_OT_PLAIN_OTHER", + "DMU_OT_UINT64_OTHER", + "DMU_OT_ZAP_OTHER", + # new object types: + "DMU_OT_ERROR_LOG", + "DMU_OT_SPA_HISTORY", + "DMU_OT_SPA_HISTORY_OFFSETS", + "DMU_OT_POOL_PROPS", + "DMU_OT_DSL_PERMS", + "DMU_OT_ACL", + "DMU_OT_SYSACL", + "DMU_OT_FUID", + "DMU_OT_FUID_SIZE", + "DMU_OT_NEXT_CLONES", + "DMU_OT_SCAN_QUEUE", + "DMU_OT_USERGROUP_USED", + "DMU_OT_USERGROUP_QUOTA", + "DMU_OT_USERREFS", + "DMU_OT_DDT_ZAP", + "DMU_OT_DDT_STATS", + "DMU_OT_SA", + "DMU_OT_SA_MASTER_NODE", + "DMU_OT_SA_ATTR_REGISTRATION", + "DMU_OT_SA_ATTR_LAYOUTS", + "DMU_OT_SCAN_XLATE", + "DMU_OT_DEDUP", + "DMU_OT_DEADLIST", + "DMU_OT_DEADLIST_HDR", + "DMU_OT_DSL_CLONES", + "DMU_OT_BPOBJ_SUBOBJ"] + otn_strings = { + 0x80: "DMU_OTN_UINT8_DATA", + 0xc0: "DMU_OTN_UINT8_METADATA", + 0x81: "DMU_OTN_UINT16_DATA", + 0xc1: "DMU_OTN_UINT16_METADATA", + 0x82: "DMU_OTN_UINT32_DATA", + 0xc2: "DMU_OTN_UINT32_METADATA", + 0x83: "DMU_OTN_UINT64_DATA", + 0xc3: "DMU_OTN_UINT64_METADATA", + 0x84: "DMU_OTN_ZAP_DATA", + 0xc4: "DMU_OTN_ZAP_METADATA", + 0xa0: "DMU_OTN_UINT8_ENC_DATA", + 0xe0: "DMU_OTN_UINT8_ENC_METADATA", + 0xa1: "DMU_OTN_UINT16_ENC_DATA", + 0xe1: "DMU_OTN_UINT16_ENC_METADATA", + 0xa2: "DMU_OTN_UINT32_ENC_DATA", + 0xe2: "DMU_OTN_UINT32_ENC_METADATA", + 0xa3: "DMU_OTN_UINT64_ENC_DATA", + 0xe3: "DMU_OTN_UINT64_ENC_METADATA", + 0xa4: "DMU_OTN_ZAP_ENC_DATA", + 0xe4: "DMU_OTN_ZAP_ENC_METADATA"} + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % t + + try: + if t < len(ot_strings): + return ot_strings[t] + else: + return otn_strings[t] + except (IndexError, KeyError): + return "(UNKNOWN)" + + +def get_compstring(c): + comp_strings = ["ZIO_COMPRESS_INHERIT", "ZIO_COMPRESS_ON", + "ZIO_COMPRESS_OFF", "ZIO_COMPRESS_LZJB", + "ZIO_COMPRESS_EMPTY", "ZIO_COMPRESS_GZIP_1", + "ZIO_COMPRESS_GZIP_2", "ZIO_COMPRESS_GZIP_3", + "ZIO_COMPRESS_GZIP_4", "ZIO_COMPRESS_GZIP_5", + "ZIO_COMPRESS_GZIP_6", "ZIO_COMPRESS_GZIP_7", + "ZIO_COMPRESS_GZIP_8", "ZIO_COMPRESS_GZIP_9", + "ZIO_COMPRESS_ZLE", "ZIO_COMPRESS_LZ4", + "ZIO_COMPRESS_ZSTD", "ZIO_COMPRESS_FUNCTION"] + + # If "-rr" option is used, don't convert to string representation + if raw > 1: + return "%i" % c + + try: + return comp_strings[c] + except IndexError: + return "%i" % c + + +def parse_line(line, labels): + global hdr + + new = dict() + val = None + for col in hdr: + # These are "special" fields computed in the update_dict + # function, prevent KeyError exception on labels[col] for these. 
+ if col not in ['bonus', 'cached', 'direct', 'indirect', 'spill']: + val = line[labels[col]] + + if col in ['pool', 'flags']: + new[col] = str(val) + elif col in ['dtype', 'btype']: + new[col] = get_typestring(int(val)) + elif col in ['l2_comp']: + new[col] = get_compstring(int(val)) + else: + new[col] = int(val) + + return new + + +def update_dict(d, k, line, labels): + pool = line[labels['pool']] + objset = line[labels['objset']] + key = line[labels[k]] + + dbsize = int(line[labels['dbsize']]) + blkid = int(line[labels['blkid']]) + level = int(line[labels['level']]) + + if pool not in d: + d[pool] = dict() + + if objset not in d[pool]: + d[pool][objset] = dict() + + if key not in d[pool][objset]: + d[pool][objset][key] = parse_line(line, labels) + d[pool][objset][key]['bonus'] = 0 + d[pool][objset][key]['cached'] = 0 + d[pool][objset][key]['direct'] = 0 + d[pool][objset][key]['indirect'] = 0 + d[pool][objset][key]['spill'] = 0 + + d[pool][objset][key]['cached'] += dbsize + + if blkid == -1: + d[pool][objset][key]['bonus'] += dbsize + elif blkid == -2: + d[pool][objset][key]['spill'] += dbsize + else: + if level == 0: + d[pool][objset][key]['direct'] += dbsize + else: + d[pool][objset][key]['indirect'] += dbsize + + return d + + +def skip_line(vals, filters): + ''' + Determines if a line should be skipped during printing + based on a set of filters + ''' + if len(filters) == 0: + return False + + for key in vals: + if key in filters: + val = prettynum(cols[key][0], cols[key][1], vals[key]).strip() + # we want a full match here + if re.match("(?:" + filters[key] + r")\Z", val) is None: + return True + + return False + + +def print_dict(d, filters, noheader): + if not noheader: + print_header() + for pool in list(d.keys()): + for objset in list(d[pool].keys()): + for v in list(d[pool][objset].values()): + if not skip_line(v, filters): + print_values(v) + + +def dnodes_build_dict(filehandle): + labels = dict() + dnodes = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(dnodes, 'object', line.split(), labels) + + return dnodes + + +def types_build_dict(filehandle): + labels = dict() + types = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + # The rest of the file is buffer information + for line in filehandle: + update_dict(types, 'dtype', line.split(), labels) + + return types + + +def buffers_print_all(filehandle, filters, noheader): + labels = dict() + + # First 3 lines are header information, skip the first two + for i in range(2): + next(filehandle) + + # The third line contains the labels and index locations + for i, v in enumerate(next(filehandle).split()): + labels[v] = i + + if not noheader: + print_header() + + # The rest of the file is buffer information + for line in filehandle: + vals = parse_line(line.split(), labels) + if not skip_line(vals, filters): + print_values(vals) + + +def main(): + global hdr + global sep + global raw + + desired_cols = None + bflag = False + dflag = False + hflag = False + ifile = None + ofile = None + tflag = False + vflag = False + xflag = False + nflag = False + filters = dict() + + 
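# -F filters (parsed below) are applied as full-match regexes against
+    # the value as printed: because skip_line() compares prettynum()
+    # output, a 2048-byte dbsize is matched by -F dbsize=2.0K rather
+    # than -F dbsize=2048, unless raw (-r) output is in effect.
+    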
try: + opts, args = getopt.getopt( + sys.argv[1:], + "bdf:hi:o:rs:tvxF:n", + [ + "buffers", + "dnodes", + "columns", + "help", + "infile", + "outfile", + "separator", + "types", + "verbose", + "extended", + "filter" + ] + ) + except getopt.error: + usage() + opts = None + + for opt, arg in opts: + if opt in ('-b', '--buffers'): + bflag = True + if opt in ('-d', '--dnodes'): + dflag = True + if opt in ('-f', '--columns'): + desired_cols = arg + if opt in ('-h', '--help'): + hflag = True + if opt in ('-i', '--infile'): + ifile = arg + if opt in ('-o', '--outfile'): + ofile = arg + if opt in ('-r', '--raw'): + raw += 1 + if opt in ('-s', '--separator'): + sep = arg + if opt in ('-t', '--types'): + tflag = True + if opt in ('-v', '--verbose'): + vflag = True + if opt in ('-x', '--extended'): + xflag = True + if opt in ('-n', '--noheader'): + nflag = True + if opt in ('-F', '--filter'): + fils = [x.strip() for x in arg.split(",")] + + for fil in fils: + f = [x.strip() for x in fil.split("=")] + + if len(f) != 2: + sys.stderr.write("Invalid filter '%s'.\n" % fil) + sys.exit(1) + + if f[0] not in cols: + sys.stderr.write("Invalid field '%s' in filter.\n" % f[0]) + sys.exit(1) + + if f[0] in filters: + sys.stderr.write("Field '%s' specified multiple times in " + "filter.\n" % f[0]) + sys.exit(1) + + try: + re.compile("(?:" + f[1] + r")\Z") + except re.error: + sys.stderr.write("Invalid regex for field '%s' in " + "filter.\n" % f[0]) + sys.exit(1) + + filters[f[0]] = f[1] + + if hflag or (xflag and desired_cols): + usage() + + if vflag: + detailed_usage() + + # Ensure at most only one of b, d, or t flags are set + if (bflag and dflag) or (bflag and tflag) or (dflag and tflag): + usage() + + if bflag: + hdr = bxhdr if xflag else bhdr + elif tflag: + hdr = txhdr if xflag else thdr + else: # Even if dflag is False, it's the default if none set + dflag = True + hdr = dxhdr if xflag else dhdr + + if desired_cols: + hdr = desired_cols.split(",") + + invalid = [] + incompat = [] + for ele in hdr: + if ele not in cols: + invalid.append(ele) + elif ((bflag and bincompat and ele in bincompat) or + (dflag and dincompat and ele in dincompat) or + (tflag and tincompat and ele in tincompat)): + incompat.append(ele) + + if len(invalid) > 0: + sys.stderr.write("Invalid column definition! -- %s\n" % invalid) + usage() + + if len(incompat) > 0: + sys.stderr.write("Incompatible field specified! 
-- %s\n" %
+                         incompat)
+        usage()
+
+    if ofile:
+        try:
+            tmp = open(ofile, "w")
+            sys.stdout = tmp
+
+        except IOError:
+            sys.stderr.write("Cannot open %s for writing\n" % ofile)
+            sys.exit(1)
+
+    if not ifile:
+        ifile = '/proc/spl/kstat/zfs/dbufs'
+
+    if ifile != "-":
+        try:
+            tmp = open(ifile, "r")
+            sys.stdin = tmp
+        except IOError:
+            sys.stderr.write("Cannot open %s for reading\n" % ifile)
+            sys.exit(1)
+
+    if bflag:
+        buffers_print_all(sys.stdin, filters, nflag)
+
+    if dflag:
+        print_dict(dnodes_build_dict(sys.stdin), filters, nflag)
+
+    if tflag:
+        print_dict(types_build_dict(sys.stdin), filters, nflag)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/cmd/fsck_zfs/Makefile.am b/cmd/fsck_zfs/Makefile.am
new file mode 100644
index 000000000000..2380f56fa4d4
--- /dev/null
+++ b/cmd/fsck_zfs/Makefile.am
@@ -0,0 +1 @@
+dist_sbin_SCRIPTS = fsck.zfs
diff --git a/cmd/fsck_zfs/fsck.zfs b/cmd/fsck_zfs/fsck.zfs
new file mode 100755
index 000000000000..129a7f39c388
--- /dev/null
+++ b/cmd/fsck_zfs/fsck.zfs
@@ -0,0 +1,9 @@
+#!/bin/sh
+#
+# fsck.zfs: A fsck helper to accommodate distributions that expect
+# to be able to execute a fsck on all filesystem types.  Currently
+# this script does nothing, but it could be extended to act as a
+# compatibility wrapper for 'zpool scrub'.
+#
+
+exit 0
diff --git a/cmd/mount_zfs/.gitignore b/cmd/mount_zfs/.gitignore
new file mode 100644
index 000000000000..cd9254bde3da
--- /dev/null
+++ b/cmd/mount_zfs/.gitignore
@@ -0,0 +1 @@
+mount.zfs
diff --git a/cmd/mount_zfs/Makefile.am b/cmd/mount_zfs/Makefile.am
new file mode 100644
index 000000000000..6c4d6ff79f16
--- /dev/null
+++ b/cmd/mount_zfs/Makefile.am
@@ -0,0 +1,20 @@
+include $(top_srcdir)/config/Rules.am
+
+#
+# Ignore the prefix for the mount helper.  It must be installed in /sbin/
+# because this path is hardcoded in mount(8) for security reasons.
+# However, if needed, the configure option --with-mounthelperdir= can be used
+# to override the default install location.
+#
+sbindir=$(mounthelperdir)
+sbin_PROGRAMS = mount.zfs
+
+mount_zfs_SOURCES = \
+	mount_zfs.c
+
+mount_zfs_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la
+
+mount_zfs_LDADD += $(LTLIBINTL)
diff --git a/cmd/mount_zfs/mount_zfs.c b/cmd/mount_zfs/mount_zfs.c
new file mode 100644
index 000000000000..87d2ccadcded
--- /dev/null
+++ b/cmd/mount_zfs/mount_zfs.c
@@ -0,0 +1,408 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011 Lawrence Livermore National Security, LLC.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define	ZS_COMMENT	0x00000000	/* comment */
+#define	ZS_ZFSUTIL	0x00000001	/* caller is zfs(8) */
+
+libzfs_handle_t *g_zfs;
+
+/*
+ * Return the pool/dataset to mount given the name passed to mount.  This
+ * is expected to be of the form pool/dataset, however may also refer to
+ * a block device if that device contains a valid zfs label.
+ */
+static char *
+parse_dataset(char *dataset)
+{
+	char cwd[PATH_MAX];
+	struct stat64 statbuf;
+	int error;
+	int len;
+
+	/*
+	 * We expect a pool/dataset to be provided, however if we're
+	 * given a device which is a member of a zpool we attempt to
+	 * extract the pool name stored in the label.  Given the pool
+	 * name we can mount the root dataset.
+	 */
+	error = stat64(dataset, &statbuf);
+	if (error == 0) {
+		nvlist_t *config;
+		char *name;
+		int fd;
+
+		fd = open(dataset, O_RDONLY);
+		if (fd < 0)
+			goto out;
+
+		error = zpool_read_label(fd, &config, NULL);
+		(void) close(fd);
+		if (error)
+			goto out;
+
+		error = nvlist_lookup_string(config,
+		    ZPOOL_CONFIG_POOL_NAME, &name);
+		if (error) {
+			nvlist_free(config);
+		} else {
+			dataset = strdup(name);
+			nvlist_free(config);
+			return (dataset);
+		}
+	}
+out:
+	/*
+	 * If a file or directory in your current working directory is
+	 * named 'dataset' then mount(8) will prepend your current working
+	 * directory to the dataset.  There is no way to prevent this
+	 * behavior so we simply check for it and strip the prepended
+	 * path when it is added.
+	 */
+	if (getcwd(cwd, PATH_MAX) == NULL)
+		return (dataset);
+
+	len = strlen(cwd);
+
+	/* Do not add one when cwd already ends in a trailing '/' */
+	if (strncmp(cwd, dataset, len) == 0)
+		return (dataset + len + (cwd[len-1] != '/'));
+
+	return (dataset);
+}
+
+/*
+ * Update the mtab_* code to use the libmount library when it is commonly
+ * available, otherwise fall back to legacy mode.  The mount(8) utility will
+ * manage the lock file for us to prevent racing updates to /etc/mtab.
+ */
+static int
+mtab_is_writeable(void)
+{
+	struct stat st;
+	int error, fd;
+
+	error = lstat("/etc/mtab", &st);
+	if (error || S_ISLNK(st.st_mode))
+		return (0);
+
+	fd = open("/etc/mtab", O_RDWR | O_CREAT, 0644);
+	if (fd < 0)
+		return (0);
+
+	close(fd);
+	return (1);
+}
+
+static int
+mtab_update(char *dataset, char *mntpoint, char *type, char *mntopts)
+{
+	struct mntent mnt;
+	FILE *fp;
+	int error;
+
+	mnt.mnt_fsname = dataset;
+	mnt.mnt_dir = mntpoint;
+	mnt.mnt_type = type;
+	mnt.mnt_opts = mntopts ? 
mntopts : ""; + mnt.mnt_freq = 0; + mnt.mnt_passno = 0; + + fp = setmntent("/etc/mtab", "a+"); + if (!fp) { + (void) fprintf(stderr, gettext( + "filesystem '%s' was mounted, but /etc/mtab " + "could not be opened due to error %d\n"), + dataset, errno); + return (MOUNT_FILEIO); + } + + error = addmntent(fp, &mnt); + if (error) { + (void) fprintf(stderr, gettext( + "filesystem '%s' was mounted, but /etc/mtab " + "could not be updated due to error %d\n"), + dataset, errno); + return (MOUNT_FILEIO); + } + + (void) endmntent(fp); + + return (MOUNT_SUCCESS); +} + +int +main(int argc, char **argv) +{ + zfs_handle_t *zhp; + char prop[ZFS_MAXPROPLEN]; + uint64_t zfs_version = 0; + char mntopts[MNT_LINE_MAX] = { '\0' }; + char badopt[MNT_LINE_MAX] = { '\0' }; + char mtabopt[MNT_LINE_MAX] = { '\0' }; + char mntpoint[PATH_MAX]; + char *dataset; + unsigned long mntflags = 0, zfsflags = 0, remount = 0; + int sloppy = 0, fake = 0, verbose = 0, nomtab = 0, zfsutil = 0; + int error, c; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + opterr = 0; + + /* check options */ + while ((c = getopt_long(argc, argv, "sfnvo:h?", 0, 0)) != -1) { + switch (c) { + case 's': + sloppy = 1; + break; + case 'f': + fake = 1; + break; + case 'n': + nomtab = 1; + break; + case 'v': + verbose++; + break; + case 'o': + (void) strlcpy(mntopts, optarg, sizeof (mntopts)); + break; + case 'h': + case '?': + (void) fprintf(stderr, gettext("Invalid option '%c'\n"), + optopt); + (void) fprintf(stderr, gettext("Usage: mount.zfs " + "[-sfnv] [-o options] \n")); + return (MOUNT_USAGE); + } + } + + argc -= optind; + argv += optind; + + /* check that we only have two arguments */ + if (argc != 2) { + if (argc == 0) + (void) fprintf(stderr, gettext("missing dataset " + "argument\n")); + else if (argc == 1) + (void) fprintf(stderr, + gettext("missing mountpoint argument\n")); + else + (void) fprintf(stderr, gettext("too many arguments\n")); + (void) fprintf(stderr, "usage: mount \n"); + return (MOUNT_USAGE); + } + + dataset = parse_dataset(argv[0]); + + /* canonicalize the mount point */ + if (realpath(argv[1], mntpoint) == NULL) { + (void) fprintf(stderr, gettext("filesystem '%s' cannot be " + "mounted at '%s' due to canonicalization error %d.\n"), + dataset, argv[1], errno); + return (MOUNT_SYSERR); + } + + /* validate mount options and set mntflags */ + error = zfs_parse_mount_options(mntopts, &mntflags, &zfsflags, sloppy, + badopt, mtabopt); + if (error) { + switch (error) { + case ENOMEM: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to a memory allocation " + "failure.\n"), dataset); + return (MOUNT_SYSERR); + case ENOENT: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to invalid option " + "'%s'.\n"), dataset, badopt); + (void) fprintf(stderr, gettext("Use the '-s' option " + "to ignore the bad mount option.\n")); + return (MOUNT_USAGE); + default: + (void) fprintf(stderr, gettext("filesystem '%s' " + "cannot be mounted due to internal error %d.\n"), + dataset, error); + return (MOUNT_SOFTWARE); + } + } + + if (verbose) + (void) fprintf(stdout, gettext("mount.zfs:\n" + " dataset: \"%s\"\n mountpoint: \"%s\"\n" + " mountflags: 0x%lx\n zfsflags: 0x%lx\n" + " mountopts: \"%s\"\n mtabopts: \"%s\"\n"), + dataset, mntpoint, mntflags, zfsflags, mntopts, mtabopt); + + if (mntflags & MS_REMOUNT) { + nomtab = 1; + remount = 1; + } + + if (zfsflags & ZS_ZFSUTIL) + zfsutil = 1; + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", 
libzfs_error_init(errno)); + return (MOUNT_SYSERR); + } + + /* try to open the dataset to access the mount point */ + if ((zhp = zfs_open(g_zfs, dataset, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT)) == NULL) { + (void) fprintf(stderr, gettext("filesystem '%s' cannot be " + "mounted, unable to open the dataset\n"), dataset); + libzfs_fini(g_zfs); + return (MOUNT_USAGE); + } + + zfs_adjust_mount_options(zhp, mntpoint, mntopts, mtabopt); + + /* treat all snapshots as legacy mount points */ + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) + (void) strlcpy(prop, ZFS_MOUNTPOINT_LEGACY, ZFS_MAXPROPLEN); + else + (void) zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, prop, + sizeof (prop), NULL, NULL, 0, B_FALSE); + + /* + * Fetch the max supported zfs version in case we get ENOTSUP + * back from the mount command, since we need the zfs handle + * to do so. + */ + zfs_version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (zfs_version == 0) { + fprintf(stderr, gettext("unable to fetch " + "ZFS version for filesystem '%s'\n"), dataset); + return (MOUNT_SYSERR); + } + + zfs_close(zhp); + libzfs_fini(g_zfs); + + /* + * Legacy mount points may only be mounted using 'mount', never using + * 'zfs mount'. However, since 'zfs mount' actually invokes 'mount' + * we differentiate the two cases using the 'zfsutil' mount option. + * This mount option should only be supplied by the 'zfs mount' util. + * + * The only exception to the above rule is '-o remount' which is + * always allowed for non-legacy datasets. This is done because when + * using zfs as your root file system both rc.sysinit/umountroot and + * systemd depend on 'mount -o remount ' to work. + */ + if (zfsutil && (strcmp(prop, ZFS_MOUNTPOINT_LEGACY) == 0)) { + (void) fprintf(stderr, gettext( + "filesystem '%s' cannot be mounted using 'zfs mount'.\n" + "Use 'zfs set mountpoint=%s' or 'mount -t zfs %s %s'.\n" + "See zfs(8) for more information.\n"), + dataset, mntpoint, dataset, mntpoint); + return (MOUNT_USAGE); + } + + if (!zfsutil && !(remount || fake) && + strcmp(prop, ZFS_MOUNTPOINT_LEGACY)) { + (void) fprintf(stderr, gettext( + "filesystem '%s' cannot be mounted using 'mount'.\n" + "Use 'zfs set mountpoint=%s' or 'zfs mount %s'.\n" + "See zfs(8) for more information.\n"), + dataset, "legacy", dataset); + return (MOUNT_USAGE); + } + + if (!fake) { + error = mount(dataset, mntpoint, MNTTYPE_ZFS, + mntflags, mntopts); + } + + if (error) { + switch (errno) { + case ENOENT: + (void) fprintf(stderr, gettext("mount point " + "'%s' does not exist\n"), mntpoint); + return (MOUNT_SYSERR); + case EBUSY: + (void) fprintf(stderr, gettext("filesystem " + "'%s' is already mounted\n"), dataset); + return (MOUNT_BUSY); + case ENOTSUP: + if (zfs_version > ZPL_VERSION) { + (void) fprintf(stderr, + gettext("filesystem '%s' (v%d) is not " + "supported by this implementation of " + "ZFS (max v%d).\n"), dataset, + (int)zfs_version, (int)ZPL_VERSION); + } else { + (void) fprintf(stderr, + gettext("filesystem '%s' mount " + "failed for unknown reason.\n"), dataset); + } + return (MOUNT_SYSERR); +#ifdef MS_MANDLOCK + case EPERM: + if (mntflags & MS_MANDLOCK) { + (void) fprintf(stderr, gettext("filesystem " + "'%s' has the 'nbmand=on' property set, " + "this mount\noption may be disabled in " + "your kernel. 
Use 'zfs set nbmand=off'\n" + "to disable this option and try to " + "mount the filesystem again.\n"), dataset); + return (MOUNT_SYSERR); + } + /* fallthru */ +#endif + default: + (void) fprintf(stderr, gettext("filesystem " + "'%s' can not be mounted: %s\n"), dataset, + strerror(errno)); + return (MOUNT_USAGE); + } + } + + if (!nomtab && mtab_is_writeable()) { + error = mtab_update(dataset, mntpoint, MNTTYPE_ZFS, mtabopt); + if (error) + return (error); + } + + return (MOUNT_SUCCESS); +} diff --git a/cmd/raidz_test/.gitignore b/cmd/raidz_test/.gitignore new file mode 100644 index 000000000000..f8b83d9cce03 --- /dev/null +++ b/cmd/raidz_test/.gitignore @@ -0,0 +1 @@ +/raidz_test diff --git a/cmd/raidz_test/Makefile.am b/cmd/raidz_test/Makefile.am new file mode 100644 index 000000000000..72c914e641e4 --- /dev/null +++ b/cmd/raidz_test/Makefile.am @@ -0,0 +1,20 @@ +include $(top_srcdir)/config/Rules.am + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +# Unconditionally enable ASSERTs +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +bin_PROGRAMS = raidz_test + +raidz_test_SOURCES = \ + raidz_test.h \ + raidz_test.c \ + raidz_bench.c + +raidz_test_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la + +raidz_test_LDADD += -lm diff --git a/cmd/raidz_test/raidz_bench.c b/cmd/raidz_test/raidz_bench.c new file mode 100644 index 000000000000..8a2cec4ca685 --- /dev/null +++ b/cmd/raidz_test/raidz_bench.c @@ -0,0 +1,227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "raidz_test.h" + +#define GEN_BENCH_MEMORY (((uint64_t)1ULL)<<32) +#define REC_BENCH_MEMORY (((uint64_t)1ULL)<<29) +#define BENCH_ASHIFT 12 +#define MIN_CS_SHIFT BENCH_ASHIFT +#define MAX_CS_SHIFT SPA_MAXBLOCKSHIFT + +static zio_t zio_bench; +static raidz_map_t *rm_bench; +static size_t max_data_size = SPA_MAXBLOCKSIZE; + +static void +bench_init_raidz_map(void) +{ + zio_bench.io_offset = 0; + zio_bench.io_size = max_data_size; + + /* + * To permit larger column sizes these have to be done + * allocated using aligned alloc instead of zio_abd_buf_alloc + */ + zio_bench.io_abd = raidz_alloc(max_data_size); + + init_zio_abd(&zio_bench); +} + +static void +bench_fini_raidz_maps(void) +{ + /* tear down golden zio */ + raidz_free(zio_bench.io_abd, max_data_size); + bzero(&zio_bench, sizeof (zio_t)); +} + +static inline void +run_gen_bench_impl(const char *impl) +{ + int fn, ncols; + uint64_t ds, iter_cnt, iter, disksize; + hrtime_t start; + double elapsed, d_bw; + + /* Benchmark generate functions */ + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + for (ds = MIN_CS_SHIFT; ds <= MAX_CS_SHIFT; ds++) { + /* create suitable raidz_map */ + ncols = rto_opts.rto_dcols + fn + 1; + zio_bench.io_size = 1ULL << ds; + rm_bench = vdev_raidz_map_alloc(&zio_bench, + BENCH_ASHIFT, ncols, fn+1); + + /* estimate iteration count */ + iter_cnt = GEN_BENCH_MEMORY; + iter_cnt /= zio_bench.io_size; + + start = gethrtime(); + for (iter = 0; iter < iter_cnt; iter++) + vdev_raidz_generate_parity(rm_bench); + elapsed = NSEC2SEC((double)(gethrtime() - start)); + + disksize = (1ULL << ds) / rto_opts.rto_dcols; + d_bw = (double)iter_cnt * (double)disksize; + d_bw /= (1024.0 * 1024.0 * elapsed); + + LOG(D_ALL, "%10s, %8s, %zu, %10llu, %lf, %lf, %u\n", + impl, + raidz_gen_name[fn], + rto_opts.rto_dcols, + (1ULL< +#include +#include +#include +#include +#include +#include +#include +#include +#include "raidz_test.h" + +static int *rand_data; +raidz_test_opts_t rto_opts; + +static char gdb[256]; +static const char gdb_tmpl[] = "gdb -ex \"set pagination 0\" -p %d"; + +static void sig_handler(int signo) +{ + struct sigaction action; + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + + if (rto_opts.rto_gdb) + if (system(gdb)) { } + + raise(signo); +} + +static void print_opts(raidz_test_opts_t *opts, boolean_t force) +{ + char *verbose; + switch (opts->rto_v) { + case 0: + verbose = "no"; + break; + case 1: + verbose = "info"; + break; + default: + verbose = "debug"; + break; + } + + if (force || opts->rto_v >= D_INFO) { + (void) fprintf(stdout, DBLSEP "Running with options:\n" + " (-a) zio ashift : %zu\n" + " (-o) zio offset : 1 << %zu\n" + " (-d) number of raidz data columns : %zu\n" + " (-s) size of DATA : 1 << %zu\n" + " (-S) sweep parameters : %s \n" + " (-v) verbose : %s \n\n", + opts->rto_ashift, /* -a */ + ilog2(opts->rto_offset), /* -o */ + opts->rto_dcols, /* -d */ + ilog2(opts->rto_dsize), /* -s */ + opts->rto_sweep ? "yes" : "no", /* -S */ + verbose); /* -v */ + } +} + +static void usage(boolean_t requested) +{ + const raidz_test_opts_t *o = &rto_opts_defaults; + + FILE *fp = requested ? 
stdout : stderr; + + (void) fprintf(fp, "Usage:\n" + "\t[-a zio ashift (default: %zu)]\n" + "\t[-o zio offset, exponent radix 2 (default: %zu)]\n" + "\t[-d number of raidz data columns (default: %zu)]\n" + "\t[-s zio size, exponent radix 2 (default: %zu)]\n" + "\t[-S parameter sweep (default: %s)]\n" + "\t[-t timeout for parameter sweep test]\n" + "\t[-B benchmark all raidz implementations]\n" + "\t[-v increase verbosity (default: %zu)]\n" + "\t[-h (print help)]\n" + "\t[-T test the test, see if failure would be detected]\n" + "\t[-D debug (attach gdb on SIGSEGV)]\n" + "", + o->rto_ashift, /* -a */ + ilog2(o->rto_offset), /* -o */ + o->rto_dcols, /* -d */ + ilog2(o->rto_dsize), /* -s */ + rto_opts.rto_sweep ? "yes" : "no", /* -S */ + o->rto_v); /* -d */ + + exit(requested ? 0 : 1); +} + +static void process_options(int argc, char **argv) +{ + size_t value; + int opt; + + raidz_test_opts_t *o = &rto_opts; + + bcopy(&rto_opts_defaults, o, sizeof (*o)); + + while ((opt = getopt(argc, argv, "TDBSvha:o:d:s:t:")) != -1) { + value = 0; + + switch (opt) { + case 'a': + value = strtoull(optarg, NULL, 0); + o->rto_ashift = MIN(13, MAX(9, value)); + break; + case 'o': + value = strtoull(optarg, NULL, 0); + o->rto_offset = ((1ULL << MIN(12, value)) >> 9) << 9; + break; + case 'd': + value = strtoull(optarg, NULL, 0); + o->rto_dcols = MIN(255, MAX(1, value)); + break; + case 's': + value = strtoull(optarg, NULL, 0); + o->rto_dsize = 1ULL << MIN(SPA_MAXBLOCKSHIFT, + MAX(SPA_MINBLOCKSHIFT, value)); + break; + case 't': + value = strtoull(optarg, NULL, 0); + o->rto_sweep_timeout = value; + break; + case 'v': + o->rto_v++; + break; + case 'S': + o->rto_sweep = 1; + break; + case 'B': + o->rto_benchmark = 1; + break; + case 'D': + o->rto_gdb = 1; + break; + case 'T': + o->rto_sanity = 1; + break; + case 'h': + usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } +} + +#define DATA_COL(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_abd) +#define DATA_COL_SIZE(rm, i) ((rm)->rm_col[raidz_parity(rm) + (i)].rc_size) + +#define CODE_COL(rm, i) ((rm)->rm_col[(i)].rc_abd) +#define CODE_COL_SIZE(rm, i) ((rm)->rm_col[(i)].rc_size) + +static int +cmp_code(raidz_test_opts_t *opts, const raidz_map_t *rm, const int parity) +{ + int i, ret = 0; + + VERIFY(parity >= 1 && parity <= 3); + + for (i = 0; i < parity; i++) { + if (abd_cmp(CODE_COL(rm, i), CODE_COL(opts->rm_golden, i)) + != 0) { + ret++; + LOG_OPT(D_DEBUG, opts, + "\nParity block [%d] different!\n", i); + } + } + return (ret); +} + +static int +cmp_data(raidz_test_opts_t *opts, raidz_map_t *rm) +{ + int i, ret = 0; + int dcols = opts->rm_golden->rm_cols - raidz_parity(opts->rm_golden); + + for (i = 0; i < dcols; i++) { + if (abd_cmp(DATA_COL(opts->rm_golden, i), DATA_COL(rm, i)) + != 0) { + ret++; + + LOG_OPT(D_DEBUG, opts, + "\nData block [%d] different!\n", i); + } + } + return (ret); +} + +static int +init_rand(void *data, size_t size, void *private) +{ + int i; + int *dst = (int *)data; + + for (i = 0; i < size / sizeof (int); i++) + dst[i] = rand_data[i]; + + return (0); +} + +static void +corrupt_colums(raidz_map_t *rm, const int *tgts, const int cnt) +{ + int i; + raidz_col_t *col; + + for (i = 0; i < cnt; i++) { + col = &rm->rm_col[tgts[i]]; + abd_iterate_func(col->rc_abd, 0, col->rc_size, init_rand, NULL); + } +} + +void +init_zio_abd(zio_t *zio) +{ + abd_iterate_func(zio->io_abd, 0, zio->io_size, init_rand, NULL); +} + +static void +fini_raidz_map(zio_t **zio, raidz_map_t **rm) +{ + vdev_raidz_map_free(*rm); + 
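/* the map is gone; release the zio's data abd, then the zio itself */
+	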
raidz_free((*zio)->io_abd, (*zio)->io_size); + umem_free(*zio, sizeof (zio_t)); + + *zio = NULL; + *rm = NULL; +} + +static int +init_raidz_golden_map(raidz_test_opts_t *opts, const int parity) +{ + int err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + const size_t total_ncols = opts->rto_dcols + parity; + + if (opts->rm_golden) { + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + } + + opts->zio_golden = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + zio_test = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + opts->zio_golden->io_offset = zio_test->io_offset = opts->rto_offset; + opts->zio_golden->io_size = zio_test->io_size = opts->rto_dsize; + + opts->zio_golden->io_abd = raidz_alloc(opts->rto_dsize); + zio_test->io_abd = raidz_alloc(opts->rto_dsize); + + init_zio_abd(opts->zio_golden); + init_zio_abd(zio_test); + + VERIFY0(vdev_raidz_impl_set("original")); + + opts->rm_golden = vdev_raidz_map_alloc(opts->zio_golden, + opts->rto_ashift, total_ncols, parity); + rm_test = vdev_raidz_map_alloc(zio_test, + opts->rto_ashift, total_ncols, parity); + + VERIFY(opts->zio_golden); + VERIFY(opts->rm_golden); + + vdev_raidz_generate_parity(opts->rm_golden); + vdev_raidz_generate_parity(rm_test); + + /* sanity check */ + err |= cmp_data(opts, rm_test); + err |= cmp_code(opts, rm_test, parity); + + if (err) + ERR("initializing the golden copy ... [FAIL]!\n"); + + /* tear down raidz_map of test zio */ + fini_raidz_map(&zio_test, &rm_test); + + return (err); +} + +static raidz_map_t * +init_raidz_map(raidz_test_opts_t *opts, zio_t **zio, const int parity) +{ + raidz_map_t *rm = NULL; + const size_t alloc_dsize = opts->rto_dsize; + const size_t total_ncols = opts->rto_dcols + parity; + const int ccols[] = { 0, 1, 2 }; + + VERIFY(zio); + VERIFY(parity <= 3 && parity >= 1); + + *zio = umem_zalloc(sizeof (zio_t), UMEM_NOFAIL); + + (*zio)->io_offset = 0; + (*zio)->io_size = alloc_dsize; + (*zio)->io_abd = raidz_alloc(alloc_dsize); + init_zio_abd(*zio); + + rm = vdev_raidz_map_alloc(*zio, opts->rto_ashift, + total_ncols, parity); + VERIFY(rm); + + /* Make sure code columns are destroyed */ + corrupt_colums(rm, ccols, parity); + + return (rm); +} + +static int +run_gen_check(raidz_test_opts_t *opts) +{ + char **impl_name; + int fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing parity generation...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (0 != vdev_raidz_impl_set(*impl_name)) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else { + LOG(D_INFO, "[SUPPORTED]\n"); + } + + for (fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, fn+1); + VERIFY(rm_test); + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_gen_name[fn]); + + if (!opts->rto_sanity) + vdev_raidz_generate_parity(rm_test); + + if (cmp_code(opts, rm_test, fn+1) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + } else + LOG(D_INFO, "[PASS]\n"); + + fini_raidz_map(&zio_test, &rm_test); + } + } + + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + + return (err); +} + +static int +run_rec_check_impl(raidz_test_opts_t *opts, raidz_map_t *rm, const int fn) +{ + int x0, x1, x2; + int tgtidx[3]; + int err = 0; + static const int 
rec_tgts[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + memcpy(tgtidx, rec_tgts[fn], sizeof (tgtidx)); + + if (fn < RAIDZ_REC_PQ) { + /* can reconstruct 1 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d] ", x0); + + tgtidx[2] = x0 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+2, 1); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d]... [FAIL]\n", x0); + } + } + + } else if (fn < RAIDZ_REC_PQR) { + /* can reconstruct 2 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d] ", x0, x1); + + tgtidx[1] = x0 + raidz_parity(rm); + tgtidx[2] = x1 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx+1, 2); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, "\nREC D[%d %d]... " + "[FAIL]\n", x0, x1); + } + } + } + } else { + /* can reconstruct 3 failed data disk */ + for (x0 = 0; x0 < opts->rto_dcols; x0++) { + if (x0 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x1 = x0 + 1; x1 < opts->rto_dcols; x1++) { + if (x1 >= rm->rm_cols - raidz_parity(rm)) + continue; + for (x2 = x1 + 1; x2 < opts->rto_dcols; x2++) { + if (x2 >= + rm->rm_cols - raidz_parity(rm)) + continue; + + /* Check if should stop */ + if (rto_opts.rto_should_stop) + return (err); + + LOG(D_DEBUG, "[%d %d %d]", x0, x1, x2); + + tgtidx[0] = x0 + raidz_parity(rm); + tgtidx[1] = x1 + raidz_parity(rm); + tgtidx[2] = x2 + raidz_parity(rm); + + corrupt_colums(rm, tgtidx, 3); + + if (!opts->rto_sanity) + vdev_raidz_reconstruct(rm, + tgtidx, 3); + + if (cmp_data(opts, rm) != 0) { + err++; + LOG(D_DEBUG, + "\nREC D[%d %d %d]... 
" + "[FAIL]\n", x0, x1, x2); + } + } + } + } + } + return (err); +} + +static int +run_rec_check(raidz_test_opts_t *opts) +{ + char **impl_name; + unsigned fn, err = 0; + zio_t *zio_test; + raidz_map_t *rm_test; + + err = init_raidz_golden_map(opts, PARITY_PQR); + if (0 != err) + return (err); + + LOG(D_INFO, DBLSEP); + LOG(D_INFO, "Testing data reconstruction...\n"); + + for (impl_name = (char **)raidz_impl_names+1; *impl_name != NULL; + impl_name++) { + + LOG(D_INFO, SEP); + LOG(D_INFO, "\tTesting [%s] implementation...", *impl_name); + + if (vdev_raidz_impl_set(*impl_name) != 0) { + LOG(D_INFO, "[SKIP]\n"); + continue; + } else + LOG(D_INFO, "[SUPPORTED]\n"); + + + /* create suitable raidz_map */ + rm_test = init_raidz_map(opts, &zio_test, PARITY_PQR); + /* generate parity */ + vdev_raidz_generate_parity(rm_test); + + for (fn = 0; fn < RAIDZ_REC_NUM; fn++) { + + LOG(D_INFO, "\t\tTesting method [%s] ...", + raidz_rec_name[fn]); + + if (run_rec_check_impl(opts, rm_test, fn) != 0) { + LOG(D_INFO, "[FAIL]\n"); + err++; + + } else + LOG(D_INFO, "[PASS]\n"); + + } + /* tear down test raidz_map */ + fini_raidz_map(&zio_test, &rm_test); + } + + fini_raidz_map(&opts->zio_golden, &opts->rm_golden); + + return (err); +} + +static int +run_test(raidz_test_opts_t *opts) +{ + int err = 0; + + if (opts == NULL) + opts = &rto_opts; + + print_opts(opts, B_FALSE); + + err |= run_gen_check(opts); + err |= run_rec_check(opts); + + return (err); +} + +#define SWEEP_RUNNING 0 +#define SWEEP_FINISHED 1 +#define SWEEP_ERROR 2 +#define SWEEP_TIMEOUT 3 + +static int sweep_state = 0; +static raidz_test_opts_t failed_opts; + +static kmutex_t sem_mtx; +static kcondvar_t sem_cv; +static int max_free_slots; +static int free_slots; + +static void +sweep_thread(void *arg) +{ + int err = 0; + raidz_test_opts_t *opts = (raidz_test_opts_t *)arg; + VERIFY(opts != NULL); + + err = run_test(opts); + + if (rto_opts.rto_sanity) { + /* 25% chance that a sweep test fails */ + if (rand() < (RAND_MAX/4)) + err = 1; + } + + if (0 != err) { + mutex_enter(&sem_mtx); + memcpy(&failed_opts, opts, sizeof (raidz_test_opts_t)); + sweep_state = SWEEP_ERROR; + mutex_exit(&sem_mtx); + } + + umem_free(opts, sizeof (raidz_test_opts_t)); + + /* signal the next thread */ + mutex_enter(&sem_mtx); + free_slots++; + cv_signal(&sem_cv); + mutex_exit(&sem_mtx); + + thread_exit(); +} + +static int +run_sweep(void) +{ + static const size_t dcols_v[] = { 1, 2, 3, 4, 5, 6, 7, 8, 12, 15, 16 }; + static const size_t ashift_v[] = { 9, 12, 14 }; + static const size_t size_v[] = { 1 << 9, 21 * (1 << 9), 13 * (1 << 12), + 1 << 17, (1 << 20) - (1 << 12), SPA_MAXBLOCKSIZE }; + + (void) setvbuf(stdout, NULL, _IONBF, 0); + + ulong_t total_comb = ARRAY_SIZE(size_v) * ARRAY_SIZE(ashift_v) * + ARRAY_SIZE(dcols_v); + ulong_t tried_comb = 0; + hrtime_t time_diff, start_time = gethrtime(); + raidz_test_opts_t *opts; + int a, d, s; + + max_free_slots = free_slots = MAX(2, boot_ncpus); + + mutex_init(&sem_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&sem_cv, NULL, CV_DEFAULT, NULL); + + for (s = 0; s < ARRAY_SIZE(size_v); s++) + for (a = 0; a < ARRAY_SIZE(ashift_v); a++) + for (d = 0; d < ARRAY_SIZE(dcols_v); d++) { + + if (size_v[s] < (1 << ashift_v[a])) { + total_comb--; + continue; + } + + if (++tried_comb % 20 == 0) + LOG(D_ALL, "%lu/%lu... 
", tried_comb, total_comb); + + /* wait for signal to start new thread */ + mutex_enter(&sem_mtx); + while (cv_timedwait_sig(&sem_cv, &sem_mtx, + ddi_get_lbolt() + hz)) { + + /* check if should stop the test (timeout) */ + time_diff = (gethrtime() - start_time) / NANOSEC; + if (rto_opts.rto_sweep_timeout > 0 && + time_diff >= rto_opts.rto_sweep_timeout) { + sweep_state = SWEEP_TIMEOUT; + rto_opts.rto_should_stop = B_TRUE; + mutex_exit(&sem_mtx); + goto exit; + } + + /* check if should stop the test (error) */ + if (sweep_state != SWEEP_RUNNING) { + mutex_exit(&sem_mtx); + goto exit; + } + + /* exit loop if a slot is available */ + if (free_slots > 0) { + break; + } + } + + free_slots--; + mutex_exit(&sem_mtx); + + opts = umem_zalloc(sizeof (raidz_test_opts_t), UMEM_NOFAIL); + opts->rto_ashift = ashift_v[a]; + opts->rto_dcols = dcols_v[d]; + opts->rto_offset = (1 << ashift_v[a]) * rand(); + opts->rto_dsize = size_v[s]; + opts->rto_v = 0; /* be quiet */ + + VERIFY3P(thread_create(NULL, 0, sweep_thread, (void *) opts, + 0, NULL, TS_RUN, defclsyspri), !=, NULL); + } + +exit: + LOG(D_ALL, "\nWaiting for test threads to finish...\n"); + mutex_enter(&sem_mtx); + VERIFY(free_slots <= max_free_slots); + while (free_slots < max_free_slots) { + (void) cv_wait(&sem_cv, &sem_mtx); + } + mutex_exit(&sem_mtx); + + if (sweep_state == SWEEP_ERROR) { + ERR("Sweep test failed! Failed option: \n"); + print_opts(&failed_opts, B_TRUE); + } else { + if (sweep_state == SWEEP_TIMEOUT) + LOG(D_ALL, "Test timeout (%lus). Stopping...\n", + (ulong_t)rto_opts.rto_sweep_timeout); + + LOG(D_ALL, "Sweep test succeeded on %lu raidz maps!\n", + (ulong_t)tried_comb); + } + + mutex_destroy(&sem_mtx); + + return (sweep_state == SWEEP_ERROR ? SWEEP_ERROR : 0); +} + +int +main(int argc, char **argv) +{ + size_t i; + struct sigaction action; + int err = 0; + + /* init gdb string early */ + (void) sprintf(gdb, gdb_tmpl, getpid()); + + action.sa_handler = sig_handler; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + + if (sigaction(SIGSEGV, &action, NULL) < 0) { + ERR("raidz_test: cannot catch SIGSEGV: %s.\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + (void) setvbuf(stdout, NULL, _IOLBF, 0); + + dprintf_setup(&argc, argv); + + process_options(argc, argv); + + kernel_init(SPA_MODE_READ); + + /* setup random data because rand() is not reentrant */ + rand_data = (int *)umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + srand((unsigned)time(NULL) * getpid()); + for (i = 0; i < SPA_MAXBLOCKSIZE / sizeof (int); i++) + rand_data[i] = rand(); + + mprotect(rand_data, SPA_MAXBLOCKSIZE, PROT_READ); + + if (rto_opts.rto_benchmark) { + run_raidz_benchmark(); + } else if (rto_opts.rto_sweep) { + err = run_sweep(); + } else { + err = run_test(NULL); + } + + umem_free(rand_data, SPA_MAXBLOCKSIZE); + kernel_fini(); + + return (err); +} diff --git a/cmd/raidz_test/raidz_test.h b/cmd/raidz_test/raidz_test.h new file mode 100644 index 000000000000..09c825ae43c7 --- /dev/null +++ b/cmd/raidz_test/raidz_test.h @@ -0,0 +1,116 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2016 Gvozden NeÅ¡ković. All rights reserved. + */ + +#ifndef RAIDZ_TEST_H +#define RAIDZ_TEST_H + +#include + +static const char *raidz_impl_names[] = { + "original", + "scalar", + "sse2", + "ssse3", + "avx2", + "avx512f", + "avx512bw", + "aarch64_neon", + "aarch64_neonx2", + "powerpc_altivec", + NULL +}; + +typedef struct raidz_test_opts { + size_t rto_ashift; + size_t rto_offset; + size_t rto_dcols; + size_t rto_dsize; + size_t rto_v; + size_t rto_sweep; + size_t rto_sweep_timeout; + size_t rto_benchmark; + size_t rto_sanity; + size_t rto_gdb; + + /* non-user options */ + boolean_t rto_should_stop; + + zio_t *zio_golden; + raidz_map_t *rm_golden; +} raidz_test_opts_t; + +static const raidz_test_opts_t rto_opts_defaults = { + .rto_ashift = 9, + .rto_offset = 1ULL << 0, + .rto_dcols = 8, + .rto_dsize = 1<<19, + .rto_v = 0, + .rto_sweep = 0, + .rto_benchmark = 0, + .rto_sanity = 0, + .rto_gdb = 0, + .rto_should_stop = B_FALSE +}; + +extern raidz_test_opts_t rto_opts; + +static inline size_t ilog2(size_t a) +{ + return (a > 1 ? 1 + ilog2(a >> 1) : 0); +} + + +#define D_ALL 0 +#define D_INFO 1 +#define D_DEBUG 2 + +#define LOG(lvl, a...) \ +{ \ + if (rto_opts.rto_v >= lvl) \ + (void) fprintf(stdout, a); \ +} \ + +#define LOG_OPT(lvl, opt, a...) \ +{ \ + if (opt->rto_v >= lvl) \ + (void) fprintf(stdout, a); \ +} \ + +#define ERR(a...) (void) fprintf(stderr, a) + + +#define DBLSEP "================\n" +#define SEP "----------------\n" + + +#define raidz_alloc(size) abd_alloc(size, B_FALSE) +#define raidz_free(p, size) abd_free(p) + + +void init_zio_abd(zio_t *zio); + +void run_raidz_benchmark(void); + +#endif /* RAIDZ_TEST_H */ diff --git a/cmd/vdev_id/Makefile.am b/cmd/vdev_id/Makefile.am new file mode 100644 index 000000000000..fb815faad084 --- /dev/null +++ b/cmd/vdev_id/Makefile.am @@ -0,0 +1 @@ +dist_udev_SCRIPTS = vdev_id diff --git a/cmd/vdev_id/vdev_id b/cmd/vdev_id/vdev_id new file mode 100755 index 000000000000..8a75e638b67e --- /dev/null +++ b/cmd/vdev_id/vdev_id @@ -0,0 +1,605 @@ +#!/bin/sh +# +# vdev_id: udev helper to generate user-friendly names for JBOD disks +# +# This script parses the file /etc/zfs/vdev_id.conf to map a +# physical path in a storage topology to a channel name. The +# channel name is combined with a disk enclosure slot number to +# create an alias that reflects the physical location of the drive. +# This is particularly helpful when it comes to tasks like replacing +# failed drives. Slot numbers may also be re-mapped in case the +# default numbering is unsatisfactory. The drive aliases will be +# created as symbolic links in /dev/disk/by-vdev. +# +# The currently supported topologies are sas_direct and sas_switch. +# A multipath mode is supported in which dm-mpath devices are +# handled by examining the first-listed running component disk. In +# multipath mode the configuration file should contain a channel +# definition with the same name for each path to a given enclosure. +# +# The alias keyword provides a simple way to map already-existing +# device symlinks to more convenient names. 
+# small, static configurations or for sites that have some automated
+# way to generate the mapping file.
+#
+#
+# Some example configuration files are given below.
+
+# #
+# # Example vdev_id.conf - sas_direct.
+# #
+#
+# multipath no
+# topology sas_direct
+# phys_per_port 4
+# slot bay
+#
+# # PCI_ID HBA PORT CHANNEL NAME
+# channel 85:00.0 1 A
+# channel 85:00.0 0 B
+# channel 86:00.0 1 C
+# channel 86:00.0 0 D
+#
+# # Custom mapping for Channel A
+#
+# # Linux Mapped
+# # Slot Slot Channel
+# slot 1 7 A
+# slot 2 10 A
+# slot 3 3 A
+# slot 4 6 A
+#
+# # Default mapping for B, C, and D
+# slot 1 4
+# slot 2 2
+# slot 3 1
+# slot 4 3
+
+# #
+# # Example vdev_id.conf - sas_switch
+# #
+#
+# topology sas_switch
+#
+# # SWITCH PORT CHANNEL NAME
+# channel 1 A
+# channel 2 B
+# channel 3 C
+# channel 4 D
+
+# #
+# # Example vdev_id.conf - multipath
+# #
+#
+# multipath yes
+#
+# # PCI_ID HBA PORT CHANNEL NAME
+# channel 85:00.0 1 A
+# channel 85:00.0 0 B
+# channel 86:00.0 1 A
+# channel 86:00.0 0 B
+
+# #
+# # Example vdev_id.conf - alias
+# #
+#
+# # by-vdev
+# # name fully qualified or base name of device link
+# alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca
+# alias d2 wwn-0x5000c5002def789e
+
+PATH=/bin:/sbin:/usr/bin:/usr/sbin
+CONFIG=/etc/zfs/vdev_id.conf
+PHYS_PER_PORT=
+DEV=
+MULTIPATH=
+TOPOLOGY=
+BAY=
+
+usage() {
+	cat << EOF
+Usage: vdev_id [-h]
+       vdev_id <-d device> [-c config_file] [-p phys_per_port]
+               [-g sas_direct|sas_switch|scsi] [-m]
+
+  -c    specify name of an alternative config file [default=$CONFIG]
+  -d    specify basename of device (e.g. sda)
+  -e    Create enclosure device symlinks only (/dev/by-enclosure)
+  -g    Storage network topology [default="$TOPOLOGY"]
+  -m    Run in multipath mode
+  -p    number of PHYs per switch port [default=$PHYS_PER_PORT]
+  -h    show this summary
+EOF
+	exit 0
+}
+
+map_slot() {
+	LINUX_SLOT=$1
+	CHANNEL=$2
+
+	MAPPED_SLOT=`awk "\\$1 == \"slot\" && \\$2 == ${LINUX_SLOT} && \
+			\\$4 ~ /^${CHANNEL}$|^$/ { print \\$3; exit }" $CONFIG`
+	if [ -z "$MAPPED_SLOT" ] ; then
+		MAPPED_SLOT=$LINUX_SLOT
+	fi
+	printf "%d" ${MAPPED_SLOT}
+}
+
+map_channel() {
+	MAPPED_CHAN=
+	PCI_ID=$1
+	PORT=$2
+
+	case $TOPOLOGY in
+		"sas_switch")
+		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \\$2 == ${PORT} \
+			{ print \\$3; exit }" $CONFIG`
+		;;
+		"sas_direct"|"scsi")
+		MAPPED_CHAN=`awk "\\$1 == \"channel\" && \
+			\\$2 == \"${PCI_ID}\" && \\$3 == ${PORT} \
+			{ print \\$4; exit }" $CONFIG`
+		;;
+	esac
+	printf "%s" ${MAPPED_CHAN}
+}
+
+sas_handler() {
+	if [ -z "$PHYS_PER_PORT" ] ; then
+		PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+	PHYS_PER_PORT=${PHYS_PER_PORT:-4}
+	if ! echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then
+		echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric"
+		exit 1
+	fi
+
+	if [ -z "$MULTIPATH_MODE" ] ; then
+		MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \
+			{print \\$2; exit}" $CONFIG`
+	fi
+
+	# Use first running component device if we're handling a dm-mpath device
+	if [ "$MULTIPATH_MODE" = "yes" ] ; then
+		# If udev didn't tell us the UUID via DM_NAME, check /dev/mapper
+		if [ -z "$DM_NAME" ] ; then
+			DM_NAME=`ls -l --full-time /dev/mapper |
+				awk "/\/$DEV$/{print \\$9}"`
+		fi
+
+		# For raw disks udev exports DEVTYPE=partition when
+		# handling partitions, and the rules can be written to
+		# take advantage of this to append a -part suffix. For
+		# dm devices we get DEVTYPE=disk even for partitions so
+		# we have to append the -part suffix directly in the
+		# helper.
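+		# For example (hypothetical name): DM_NAME=36000c29000000001p2
+		# splits on "p" in the awk below, yielding PART="-part2"; the
+		# sed that follows then strips the "p2" so DM_NAME once again
+		# names the whole disk. This assumes the only "p" in the name
+		# is the partition separator, as in WWID-based names.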
+ if [ "$DEVTYPE" != "partition" ] ; then + PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + + # Strip off partition information. + DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + if [ -z "$DM_NAME" ] ; then + return + fi + + # Get the raw scsi device name from multipath -ll. Strip off + # leading pipe symbols to make field numbering consistent. + DEV=`multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + if [ -z "$DEV" ] ; then + return + fi + fi + + if echo $DEV | grep -q ^/devices/ ; then + sys_path=$DEV + else + sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + fi + + # Use positional parameters as an ad-hoc array + set -- $(echo "$sys_path" | tr / ' ') + num_dirs=$# + scsi_host_dir="/sys" + + # Get path up to /sys/.../hostX + i=1 + while [ $i -le $num_dirs ] ; do + d=$(eval echo \${$i}) + scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break + i=$(($i + 1)) + done + + if [ $i = $num_dirs ] ; then + return + fi + + PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + + # In sas_switch mode, the directory four levels beneath + # /sys/.../hostX contains symlinks to phy devices that reveal + # the switch port number. In sas_direct mode, the phy links one + # directory down reveal the HBA port. + port_dir=$scsi_host_dir + case $TOPOLOGY in + "sas_switch") j=$(($i + 4)) ;; + "sas_direct") j=$(($i + 1)) ;; + esac + + i=$(($i + 1)) + while [ $i -le $j ] ; do + port_dir="$port_dir/$(eval echo \${$i})" + i=$(($i + 1)) + done + + PHY=`ls -d $port_dir/phy* 2>/dev/null | head -1 | awk -F: '{print $NF}'` + if [ -z "$PHY" ] ; then + PHY=0 + fi + PORT=$(( $PHY / $PHYS_PER_PORT )) + + # Look in /sys/.../sas_device/end_device-X for the bay_identifier + # attribute. + end_device_dir=$port_dir + while [ $i -lt $num_dirs ] ; do + d=$(eval echo \${$i}) + end_device_dir="$end_device_dir/$d" + if echo $d | grep -q '^end_device' ; then + end_device_dir="$end_device_dir/sas_device/$d" + break + fi + i=$(($i + 1)) + done + + SLOT= + case $BAY in + "bay") + SLOT=`cat $end_device_dir/bay_identifier 2>/dev/null` + ;; + "phy") + SLOT=`cat $end_device_dir/phy_identifier 2>/dev/null` + ;; + "port") + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "id") + i=$(($i + 1)) + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "lun") + i=$(($i + 2)) + d=$(eval echo \${$i}) + SLOT=`echo $d | sed -e 's/^.*://'` + ;; + "ses") + # look for this SAS path in all SCSI Enclosure Services + # (SES) enclosures + sas_address=`cat $end_device_dir/sas_address 2>/dev/null` + enclosures=`lsscsi -g | \ + sed -n -e '/enclosu/s/^.* \([^ ][^ ]*\) *$/\1/p'` + for enclosure in $enclosures; do + set -- $(sg_ses -p aes $enclosure | \ + awk "/device slot number:/{slot=\$12} \ + /SAS address: $sas_address/\ + {print slot}") + SLOT=$1 + if [ -n "$SLOT" ] ; then + break + fi + done + ;; + esac + if [ -z "$SLOT" ] ; then + return + fi + + CHAN=`map_channel $PCI_ID $PORT` + SLOT=`map_slot $SLOT $CHAN` + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} +} + +scsi_handler() { + if [ -z "$FIRST_BAY_NUMBER" ] ; then + FIRST_BAY_NUMBER=`awk "\\$1 == \"first_bay_number\" \ + {print \\$2; exit}" $CONFIG` + fi + FIRST_BAY_NUMBER=${FIRST_BAY_NUMBER:-0} + + if [ -z "$PHYS_PER_PORT" ] ; then + PHYS_PER_PORT=`awk "\\$1 == \"phys_per_port\" \ + {print \\$2; exit}" $CONFIG` + fi + PHYS_PER_PORT=${PHYS_PER_PORT:-4} + if ! 
echo $PHYS_PER_PORT | grep -q -E '^[0-9]+$' ; then + echo "Error: phys_per_port value $PHYS_PER_PORT is non-numeric" + exit 1 + fi + + if [ -z "$MULTIPATH_MODE" ] ; then + MULTIPATH_MODE=`awk "\\$1 == \"multipath\" \ + {print \\$2; exit}" $CONFIG` + fi + + # Use first running component device if we're handling a dm-mpath device + if [ "$MULTIPATH_MODE" = "yes" ] ; then + # If udev didn't tell us the UUID via DM_NAME, check /dev/mapper + if [ -z "$DM_NAME" ] ; then + DM_NAME=`ls -l --full-time /dev/mapper | + awk "/\/$DEV$/{print \\$9}"` + fi + + # For raw disks udev exports DEVTYPE=partition when + # handling partitions, and the rules can be written to + # take advantage of this to append a -part suffix. For + # dm devices we get DEVTYPE=disk even for partitions so + # we have to append the -part suffix directly in the + # helper. + if [ "$DEVTYPE" != "partition" ] ; then + PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + + # Strip off partition information. + DM_NAME=`echo $DM_NAME | sed 's/p[0-9][0-9]*$//'` + if [ -z "$DM_NAME" ] ; then + return + fi + + # Get the raw scsi device name from multipath -ll. Strip off + # leading pipe symbols to make field numbering consistent. + DEV=`multipath -ll $DM_NAME | + awk '/running/{gsub("^[|]"," "); print $3 ; exit}'` + if [ -z "$DEV" ] ; then + return + fi + fi + + if echo $DEV | grep -q ^/devices/ ; then + sys_path=$DEV + else + sys_path=`udevadm info -q path -p /sys/block/$DEV 2>/dev/null` + fi + + # expect sys_path like this, for example: + # /devices/pci0000:00/0000:00:0b.0/0000:09:00.0/0000:0a:05.0/0000:0c:00.0/host3/target3:1:0/3:1:0:21/block/sdv + + # Use positional parameters as an ad-hoc array + set -- $(echo "$sys_path" | tr / ' ') + num_dirs=$# + scsi_host_dir="/sys" + + # Get path up to /sys/.../hostX + i=1 + while [ $i -le $num_dirs ] ; do + d=$(eval echo \${$i}) + scsi_host_dir="$scsi_host_dir/$d" + echo $d | grep -q -E '^host[0-9]+$' && break + i=$(($i + 1)) + done + + if [ $i = $num_dirs ] ; then + return + fi + + PCI_ID=$(eval echo \${$(($i -1))} | awk -F: '{print $2":"$3}') + + # In scsi mode, the directory two levels beneath + # /sys/.../hostX reveals the port and slot. + port_dir=$scsi_host_dir + j=$(($i + 2)) + + i=$(($i + 1)) + while [ $i -le $j ] ; do + port_dir="$port_dir/$(eval echo \${$i})" + i=$(($i + 1)) + done + + set -- $(echo $port_dir | sed -e 's/^.*:\([^:]*\):\([^:]*\)$/\1 \2/') + PORT=$1 + SLOT=$(($2 + $FIRST_BAY_NUMBER)) + + if [ -z "$SLOT" ] ; then + return + fi + + CHAN=`map_channel $PCI_ID $PORT` + SLOT=`map_slot $SLOT $CHAN` + if [ -z "$CHAN" ] ; then + return + fi + echo ${CHAN}${SLOT}${PART} +} + +# Figure out the name for the enclosure symlink +enclosure_handler () { + # We get all the info we need from udev's DEVPATH variable: + # + # DEVPATH=/sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/subsystem/devices/0:0:0:0/scsi_generic/sg0 + + # Get the enclosure ID ("0:0:0:0") + ENC=$(basename $(readlink -m "/sys/$DEVPATH/../..")) + if [ ! -d /sys/class/enclosure/$ENC ] ; then + # Not an enclosure, bail out + return + fi + + # Get the long sysfs device path to our enclosure. Looks like: + # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0/ ... 
/enclosure/0:0:0:0 + + ENC_DEVICE=$(readlink /sys/class/enclosure/$ENC) + + # Grab the full path to the hosts port dir: + # /devices/pci0000:00/0000:00:03.0/0000:05:00.0/host0/port-0:0 + PORT_DIR=$(echo $ENC_DEVICE | grep -Eo '.+host[0-9]+/port-[0-9]+:[0-9]+') + + # Get the port number + PORT_ID=$(echo $PORT_DIR | grep -Eo "[0-9]+$") + + # The PCI directory is two directories up from the port directory + # /sys/devices/pci0000:00/0000:00:03.0/0000:05:00.0 + PCI_ID_LONG=$(basename $(readlink -m "/sys/$PORT_DIR/../..")) + + # Strip down the PCI address from 0000:05:00.0 to 05:00.0 + PCI_ID=$(echo "$PCI_ID_LONG" | sed -r 's/^[0-9]+://g') + + # Name our device according to vdev_id.conf (like "L0" or "U1"). + NAME=$(awk "/channel/{if (\$1 == \"channel\" && \$2 == \"$PCI_ID\" && \ + \$3 == \"$PORT_ID\") {print \$4int(count[\$4])}; count[\$4]++}" $CONFIG) + + echo "${NAME}" +} + +alias_handler () { + # Special handling is needed to correctly append a -part suffix + # to partitions of device mapper devices. The DEVTYPE attribute + # is normally set to "disk" instead of "partition" in this case, + # so the udev rules won't handle that for us as they do for + # "plain" block devices. + # + # For example, we may have the following links for a device and its + # partitions, + # + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0 -> ../../dm-0 + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p1 -> ../../dm-1 + # /dev/disk/by-id/dm-name-isw_dibgbfcije_ARRAY0p2 -> ../../dm-3 + # + # and the following alias in vdev_id.conf. + # + # alias A0 dm-name-isw_dibgbfcije_ARRAY0 + # + # The desired outcome is for the following links to be created + # without having explicitly defined aliases for the partitions. + # + # /dev/disk/by-vdev/A0 -> ../../dm-0 + # /dev/disk/by-vdev/A0-part1 -> ../../dm-1 + # /dev/disk/by-vdev/A0-part2 -> ../../dm-3 + # + # Warning: The following grep pattern will misidentify whole-disk + # devices whose names end with 'p' followed by a string of + # digits as partitions, causing alias creation to fail. This + # ambiguity seems unavoidable, so devices using this facility + # must not use such names. + DM_PART= + if echo $DM_NAME | grep -q -E 'p[0-9][0-9]*$' ; then + if [ "$DEVTYPE" != "partition" ] ; then + DM_PART=`echo $DM_NAME | awk -Fp '/p/{print "-part"$2}'` + fi + fi + + # DEVLINKS attribute must have been populated by already-run udev rules. + for link in $DEVLINKS ; do + # Remove partition information to match key of top-level device. + if [ -n "$DM_PART" ] ; then + link=`echo $link | sed 's/p[0-9][0-9]*$//'` + fi + # Check both the fully qualified and the base name of link. + for l in $link `basename $link` ; do + alias=`awk "\\$1 == \"alias\" && \\$3 == \"${l}\" \ + { print \\$2; exit }" $CONFIG` + if [ -n "$alias" ] ; then + echo ${alias}${DM_PART} + return + fi + done + done +} + +while getopts 'c:d:eg:mp:h' OPTION; do + case ${OPTION} in + c) + CONFIG=${OPTARG} + ;; + d) + DEV=${OPTARG} + ;; + e) + # When udev sees a scsi_generic device, it calls this script with -e to + # create the enclosure device symlinks only. We also need + # "enclosure_symlinks yes" set in vdev_id.config to actually create the + # symlink. + ENCLOSURE_MODE=$(awk '{if ($1 == "enclosure_symlinks") print $2}' $CONFIG) + if [ "$ENCLOSURE_MODE" != "yes" ] ; then + exit 0 + fi + ;; + g) + TOPOLOGY=$OPTARG + ;; + p) + PHYS_PER_PORT=${OPTARG} + ;; + m) + MULTIPATH_MODE=yes + ;; + h) + usage + ;; + esac +done + +if [ ! 
-r $CONFIG ] ; then + exit 0 +fi + +if [ -z "$DEV" ] && [ -z "$ENCLOSURE_MODE" ] ; then + echo "Error: missing required option -d" + exit 1 +fi + +if [ -z "$TOPOLOGY" ] ; then + TOPOLOGY=`awk "\\$1 == \"topology\" {print \\$2; exit}" $CONFIG` +fi + +if [ -z "$BAY" ] ; then + BAY=`awk "\\$1 == \"slot\" {print \\$2; exit}" $CONFIG` +fi + +TOPOLOGY=${TOPOLOGY:-sas_direct} + +# Should we create /dev/by-enclosure symlinks? +if [ "$ENCLOSURE_MODE" = "yes" ] && [ "$TOPOLOGY" = "sas_direct" ] ; then + ID_ENCLOSURE=$(enclosure_handler) + if [ -z "$ID_ENCLOSURE" ] ; then + exit 0 + fi + + # Just create the symlinks to the enclosure devices and then exit. + ENCLOSURE_PREFIX=$(awk '/enclosure_symlinks_prefix/{print $2}' $CONFIG) + if [ -z "$ENCLOSURE_PREFIX" ] ; then + ENCLOSURE_PREFIX="enc" + fi + echo "ID_ENCLOSURE=$ID_ENCLOSURE" + echo "ID_ENCLOSURE_PATH=by-enclosure/$ENCLOSURE_PREFIX-$ID_ENCLOSURE" + exit 0 +fi + +# First check if an alias was defined for this device. +ID_VDEV=`alias_handler` + +if [ -z "$ID_VDEV" ] ; then + BAY=${BAY:-bay} + case $TOPOLOGY in + sas_direct|sas_switch) + ID_VDEV=`sas_handler` + ;; + scsi) + ID_VDEV=`scsi_handler` + ;; + *) + echo "Error: unknown topology $TOPOLOGY" + exit 1 + ;; + esac +fi + +if [ -n "$ID_VDEV" ] ; then + echo "ID_VDEV=${ID_VDEV}" + echo "ID_VDEV_PATH=disk/by-vdev/${ID_VDEV}" +fi diff --git a/cmd/zdb/.gitignore b/cmd/zdb/.gitignore new file mode 100644 index 000000000000..f64a3fc5a160 --- /dev/null +++ b/cmd/zdb/.gitignore @@ -0,0 +1 @@ +/zdb diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am new file mode 100644 index 000000000000..b325cb060bd2 --- /dev/null +++ b/cmd/zdb/Makefile.am @@ -0,0 +1,16 @@ +include $(top_srcdir)/config/Rules.am + +# Unconditionally enable debugging for zdb +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +sbin_PROGRAMS = zdb + +zdb_SOURCES = \ + zdb.c \ + zdb_il.c \ + zdb.h + +zdb_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c new file mode 100644 index 000000000000..e7211711a41c --- /dev/null +++ b/cmd/zdb/zdb.c @@ -0,0 +1,8606 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2017, 2018 Lawrence Livermore National Security, LLC. + * Copyright (c) 2015, 2017, Intel Corporation. + * Copyright (c) 2020 Datto Inc. 
+ * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zdb.h" + +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ + zio_compress_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ + zio_checksum_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ + (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ + DMU_OT_ZAP_OTHER : \ + (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ + DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) + +static char * +zdb_ot_name(dmu_object_type_t type) +{ + if (type < DMU_OT_NUMTYPES) + return (dmu_ot[type].ot_name); + else if ((type & DMU_OT_NEWTYPE) && + ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) + return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); + else + return ("UNKNOWN"); +} + +extern int reference_tracking_enable; +extern int zfs_recover; +extern unsigned long zfs_arc_meta_min, zfs_arc_meta_limit; +extern int zfs_vdev_async_read_max_active; +extern boolean_t spa_load_verify_dryrun; +extern int zfs_reconstruct_indirect_combinations_max; +extern int zfs_btree_verify_intensity; + +static const char cmdname[] = "zdb"; +uint8_t dump_opt[256]; + +typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); + +uint64_t *zopt_metaslab = NULL; +static unsigned zopt_metaslab_args = 0; + +typedef struct zopt_object_range { + uint64_t zor_obj_start; + uint64_t zor_obj_end; + uint64_t zor_flags; +} zopt_object_range_t; +zopt_object_range_t *zopt_object_ranges = NULL; +static unsigned zopt_object_args = 0; + +static int flagbits[256]; + +#define ZOR_FLAG_PLAIN_FILE 0x0001 +#define ZOR_FLAG_DIRECTORY 0x0002 +#define ZOR_FLAG_SPACE_MAP 0x0004 +#define ZOR_FLAG_ZAP 0x0008 +#define ZOR_FLAG_ALL_TYPES -1 +#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ + ZOR_FLAG_DIRECTORY | \ + ZOR_FLAG_SPACE_MAP | \ + ZOR_FLAG_ZAP) + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ +static int leaked_objects = 0; +static range_tree_t *mos_refd_objs; + +static void snprintf_blkptr_compact(char *, size_t, const blkptr_t *, + boolean_t); +static void mos_obj_refd(uint64_t); +static void mos_obj_refd_multiple(uint64_t); +static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx); + +typedef struct sublivelist_verify { + /* all ALLOC'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_allocs; + + /* all FREE'd blkptr_t in one sub-livelist */ + zfs_btree_t sv_all_frees; + + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + 
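+	/* Consumed later by the cross check in livelist_metaslab_validate(). */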
zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +static int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. + */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +static void zdb_print_blkptr(const blkptr_t *bp, int flags); + +static int +sublivelist_verify_blkptr(void *arg, const blkptr_t *bp, boolean_t free, + dmu_tx_t *tx) +{ + ASSERT3P(tx, ==, NULL); + struct sublivelist_verify *sv = arg; + char blkbuf[BP_SPRINTF_LEN]; + zfs_btree_index_t where; + if (free) { + zfs_btree_add(&sv->sv_pair, bp); + /* Check if the FREE is a duplicate */ + if (zfs_btree_find(&sv->sv_all_frees, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate FREE: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_frees, bp, &where); + } + } else { + /* Check if the ALLOC has been freed */ + if (zfs_btree_find(&sv->sv_pair, bp, &where) != NULL) { + zfs_btree_remove_idx(&sv->sv_pair, &where); + } else { + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + if (DVA_IS_EMPTY(&bp->blk_dva[i])) + break; + sublivelist_verify_block_t svb = { + .svb_dva = bp->blk_dva[i], + .svb_allocated_txg = bp->blk_birth + }; + + if (zfs_btree_find(&sv->sv_leftover, &svb, + &where) == NULL) { + zfs_btree_add_idx(&sv->sv_leftover, + &svb, &where); + } + } + } + /* Check if the ALLOC is a duplicate */ + if (zfs_btree_find(&sv->sv_all_allocs, bp, &where) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, + free); + (void) printf("\tERROR: Duplicate ALLOC: %s\n", blkbuf); + } else { + zfs_btree_add_idx(&sv->sv_all_allocs, bp, &where); + } + } + return (0); +} + +static int +sublivelist_verify_func(void *args, dsl_deadlist_entry_t *dle) +{ + int err; + char blkbuf[BP_SPRINTF_LEN]; + struct sublivelist_verify *sv = args; + + zfs_btree_create(&sv->sv_all_allocs, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_all_frees, livelist_compare, + sizeof (blkptr_t)); + + zfs_btree_create(&sv->sv_pair, livelist_compare, + sizeof (blkptr_t)); + + err = bpobj_iterate_nofree(&dle->dle_bpobj, sublivelist_verify_blkptr, + sv, NULL); + + zfs_btree_clear(&sv->sv_all_allocs); + zfs_btree_destroy(&sv->sv_all_allocs); + + zfs_btree_clear(&sv->sv_all_frees); + zfs_btree_destroy(&sv->sv_all_frees); + + blkptr_t *e; + zfs_btree_index_t *cookie = NULL; + while ((e = 
zfs_btree_destroy_nodes(&sv->sv_pair, &cookie)) != NULL) { + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), e, B_TRUE); + (void) printf("\tERROR: Unmatched FREE: %s\n", blkbuf); + } + zfs_btree_destroy(&sv->sv_pair); + + return (err); +} + +static int +livelist_block_compare(const void *larg, const void *rarg) +{ + const sublivelist_verify_block_t *l = larg; + const sublivelist_verify_block_t *r = rarg; + + if (DVA_GET_VDEV(&l->svb_dva) < DVA_GET_VDEV(&r->svb_dva)) + return (-1); + else if (DVA_GET_VDEV(&l->svb_dva) > DVA_GET_VDEV(&r->svb_dva)) + return (+1); + + if (DVA_GET_OFFSET(&l->svb_dva) < DVA_GET_OFFSET(&r->svb_dva)) + return (-1); + else if (DVA_GET_OFFSET(&l->svb_dva) > DVA_GET_OFFSET(&r->svb_dva)) + return (+1); + + if (DVA_GET_ASIZE(&l->svb_dva) < DVA_GET_ASIZE(&r->svb_dva)) + return (-1); + else if (DVA_GET_ASIZE(&l->svb_dva) > DVA_GET_ASIZE(&r->svb_dva)) + return (+1); + + return (0); +} + +/* + * Check for errors in a livelist while tracking all unfreed ALLOCs in the + * sublivelist_verify_t: sv->sv_leftover + */ +static void +livelist_verify(dsl_deadlist_t *dl, void *arg) +{ + sublivelist_verify_t *sv = arg; + dsl_deadlist_iterate(dl, sublivelist_verify_func, sv); +} + +/* + * Check for errors in the livelist entry and discard the intermediary + * data structures + */ +/* ARGSUSED */ +static int +sublivelist_verify_lightweight(void *args, dsl_deadlist_entry_t *dle) +{ + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + int err = sublivelist_verify_func(&sv, dle); + zfs_btree_clear(&sv.sv_leftover); + zfs_btree_destroy(&sv.sv_leftover); + return (err); +} + +typedef struct metaslab_verify { + /* + * Tree containing all the leftover ALLOCs from the livelists + * that are part of this metaslab. + */ + zfs_btree_t mv_livelist_allocs; + + /* + * Metaslab information. + */ + uint64_t mv_vdid; + uint64_t mv_msid; + uint64_t mv_start; + uint64_t mv_end; + + /* + * What's currently allocated for this metaslab. 
+ */ + range_tree_t *mv_allocated; +} metaslab_verify_t; + +typedef void ll_iter_t(dsl_deadlist_t *ll, void *arg); + +typedef int (*zdb_log_sm_cb_t)(spa_t *spa, space_map_entry_t *sme, uint64_t txg, + void *arg); + +typedef struct unflushed_iter_cb_arg { + spa_t *uic_spa; + uint64_t uic_txg; + void *uic_arg; + zdb_log_sm_cb_t uic_cb; +} unflushed_iter_cb_arg_t; + +static int +iterate_through_spacemap_logs_cb(space_map_entry_t *sme, void *arg) +{ + unflushed_iter_cb_arg_t *uic = arg; + return (uic->uic_cb(uic->uic_spa, sme, uic->uic_txg, uic->uic_arg)); +} + +static void +iterate_through_spacemap_logs(spa_t *spa, zdb_log_sm_cb_t cb, void *arg) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + unflushed_iter_cb_arg_t uic = { + .uic_spa = spa, + .uic_txg = sls->sls_txg, + .uic_arg = arg, + .uic_cb = cb + }; + VERIFY0(space_map_iterate(sm, space_map_length(sm), + iterate_through_spacemap_logs_cb, &uic)); + space_map_close(sm); + } + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +verify_livelist_allocs(metaslab_verify_t *mv, uint64_t txg, + uint64_t offset, uint64_t size) +{ + sublivelist_verify_block_t svb; + DVA_SET_VDEV(&svb.svb_dva, mv->mv_vdid); + DVA_SET_OFFSET(&svb.svb_dva, offset); + DVA_SET_ASIZE(&svb.svb_dva, size); + zfs_btree_index_t where; + uint64_t end_offset = offset + size; + + /* + * Look for an exact match for spacemap entry in the livelist entries. + * Then, look for other livelist entries that fall within the range + * of the spacemap entry as it may have been condensed + */ + sublivelist_verify_block_t *found = + zfs_btree_find(&mv->mv_livelist_allocs, &svb, &where); + if (found == NULL) { + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where); + } + for (; found != NULL && DVA_GET_VDEV(&found->svb_dva) == mv->mv_vdid && + DVA_GET_OFFSET(&found->svb_dva) < end_offset; + found = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + if (found->svb_allocated_txg <= txg) { + (void) printf("ERROR: Livelist ALLOC [%llx:%llx] " + "from TXG %llx FREED at TXG %llx\n", + (u_longlong_t)DVA_GET_OFFSET(&found->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&found->svb_dva), + (u_longlong_t)found->svb_allocated_txg, + (u_longlong_t)txg); + } + } +} + +static int +metaslab_spacemap_validation_cb(space_map_entry_t *sme, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t txg = sme->sme_txg; + + if (sme->sme_type == SM_ALLOC) { + if (range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE ALLOC: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_add(mv->mv_allocated, + offset, size); + } + } else { + if (!range_tree_contains(mv->mv_allocated, + offset, size)) { + (void) printf("ERROR: DOUBLE FREE: " + "%llu [%llx:%llx] " + "%llu:%llu LOG_SM\n", + (u_longlong_t)txg, (u_longlong_t)offset, + (u_longlong_t)size, (u_longlong_t)mv->mv_vdid, + (u_longlong_t)mv->mv_msid); + } else { + range_tree_remove(mv->mv_allocated, + offset, size); + } + } + + if (sme->sme_type != SM_ALLOC) { + /* + * If 
something is freed in the spacemap, verify that + * it is not listed as allocated in the livelist. + */ + verify_livelist_allocs(mv, txg, offset, size); + } + return (0); +} + +static int +spacemap_check_sm_log_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + metaslab_verify_t *mv = arg; + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + if (vdev_id != mv->mv_vdid) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + if (ms->ms_id != mv->mv_msid) + return (0); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + + ASSERT3U(txg, ==, sme->sme_txg); + return (metaslab_spacemap_validation_cb(sme, mv)); +} + +static void +spacemap_check_sm_log(spa_t *spa, metaslab_verify_t *mv) +{ + iterate_through_spacemap_logs(spa, spacemap_check_sm_log_cb, mv); +} + +static void +spacemap_check_ms_sm(space_map_t *sm, metaslab_verify_t *mv) +{ + if (sm == NULL) + return; + + VERIFY0(space_map_iterate(sm, space_map_length(sm), + metaslab_spacemap_validation_cb, mv)); +} + +static void iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg); + +/* + * Transfer blocks from sv_leftover tree to the mv_livelist_allocs if + * they are part of that metaslab (mv_msid). + */ +static void +mv_populate_livelist_allocs(metaslab_verify_t *mv, sublivelist_verify_t *sv) +{ + zfs_btree_index_t where; + sublivelist_verify_block_t *svb; + ASSERT3U(zfs_btree_numnodes(&mv->mv_livelist_allocs), ==, 0); + for (svb = zfs_btree_first(&sv->sv_leftover, &where); + svb != NULL; + svb = zfs_btree_next(&sv->sv_leftover, &where, &where)) { + if (DVA_GET_VDEV(&svb->svb_dva) != mv->mv_vdid) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start && + (DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_start) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + if (DVA_GET_OFFSET(&svb->svb_dva) < mv->mv_start) + continue; + + if (DVA_GET_OFFSET(&svb->svb_dva) >= mv->mv_end) + continue; + + if ((DVA_GET_OFFSET(&svb->svb_dva) + + DVA_GET_ASIZE(&svb->svb_dva)) > mv->mv_end) { + (void) printf("ERROR: Found block that crosses " + "metaslab boundary: <%llu:%llx:%llx>\n", + (u_longlong_t)DVA_GET_VDEV(&svb->svb_dva), + (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva)); + continue; + } + + zfs_btree_add(&mv->mv_livelist_allocs, svb); + } + + for (svb = zfs_btree_first(&mv->mv_livelist_allocs, &where); + svb != NULL; + svb = zfs_btree_next(&mv->mv_livelist_allocs, &where, &where)) { + zfs_btree_remove(&sv->sv_leftover, svb); + } +} + +/* + * [Livelist Check] + * Iterate through all the sublivelists and: + * - report leftover frees + * - report double ALLOCs/FREEs + * - record leftover ALLOCs together with their TXG [see Cross Check] + * + * [Spacemap Check] + * for each metaslab: + * - iterate over spacemap and then the metaslab's entries in the + * spacemap log, then report any double FREEs and ALLOCs (do not + * blow up). 
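 *
 *	As an illustration of the cross check described next (all numbers
 *	made up): a livelist ALLOC <0:0x8000:0x1000> recorded at TXG 100
 *	that the spacemap log later FREEs at TXG 120 is reported, since a
 *	block still marked allocated in a livelist cannot have been
 *	validly freed at a later TXG.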
+ * + * [Cross Check] + * After finishing the Livelist Check phase and while being in the + * Spacemap Check phase, we find all the recorded leftover ALLOCs + * of the livelist check that are part of the metaslab that we are + * currently looking at in the Spacemap Check. We report any entries + * that are marked as ALLOCs in the livelists but have been actually + * freed (and potentially allocated again) after their TXG stamp in + * the spacemaps. Also report any ALLOCs from the livelists that + * belong to indirect vdevs (e.g. their vdev completed removal). + * + * Note that this will miss Log Spacemap entries that cancelled each other + * out before being flushed to the metaslab, so we are not guaranteed + * to match all erroneous ALLOCs. + */ +static void +livelist_metaslab_validate(spa_t *spa) +{ + (void) printf("Verifying deleted livelist entries\n"); + + sublivelist_verify_t sv; + zfs_btree_create(&sv.sv_leftover, livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + iterate_deleted_livelists(spa, livelist_verify, &sv); + + (void) printf("Verifying metaslab entries\n"); + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (!vdev_is_concrete(vd)) + continue; + + for (uint64_t mid = 0; mid < vd->vdev_ms_count; mid++) { + metaslab_t *m = vd->vdev_ms[mid]; + + (void) fprintf(stderr, + "\rverifying concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)mid, + (longlong_t)vd->vdev_ms_count); + + uint64_t shift, start; + range_seg_type_t type = + metaslab_calculate_range_tree_type(vd, m, + &start, &shift); + metaslab_verify_t mv; + mv.mv_allocated = range_tree_create(NULL, + type, NULL, start, shift); + mv.mv_vdid = vd->vdev_id; + mv.mv_msid = m->ms_id; + mv.mv_start = m->ms_start; + mv.mv_end = m->ms_start + m->ms_size; + zfs_btree_create(&mv.mv_livelist_allocs, + livelist_block_compare, + sizeof (sublivelist_verify_block_t)); + + mv_populate_livelist_allocs(&mv, &sv); + + spacemap_check_ms_sm(m->ms_sm, &mv); + spacemap_check_sm_log(spa, &mv); + + range_tree_vacate(mv.mv_allocated, NULL, NULL); + range_tree_destroy(mv.mv_allocated); + zfs_btree_clear(&mv.mv_livelist_allocs); + zfs_btree_destroy(&mv.mv_livelist_allocs); + } + } + (void) fprintf(stderr, "\n"); + + /* + * If there are any segments in the leftover tree after we walked + * through all the metaslabs in the concrete vdevs then this means + * that we have segments in the livelists that belong to indirect + * vdevs and are marked as allocated. + */ + if (zfs_btree_numnodes(&sv.sv_leftover) == 0) { + zfs_btree_destroy(&sv.sv_leftover); + return; + } + (void) printf("ERROR: Found livelist blocks marked as allocated " + "for indirect vdevs:\n"); + + zfs_btree_index_t *where = NULL; + sublivelist_verify_block_t *svb; + while ((svb = zfs_btree_destroy_nodes(&sv.sv_leftover, &where)) != + NULL) { + int vdev_id = DVA_GET_VDEV(&svb->svb_dva); + ASSERT3U(vdev_id, <, rvd->vdev_children); + vdev_t *vd = rvd->vdev_child[vdev_id]; + ASSERT(!vdev_is_concrete(vd)); + (void) printf("<%d:%llx:%llx> TXG %llx\n", + vdev_id, (u_longlong_t)DVA_GET_OFFSET(&svb->svb_dva), + (u_longlong_t)DVA_GET_ASIZE(&svb->svb_dva), + (u_longlong_t)svb->svb_allocated_txg); + } + (void) printf("\n"); + zfs_btree_destroy(&sv.sv_leftover); +} + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. 
+ */ +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} + +static void +usage(void) +{ + (void) fprintf(stderr, + "Usage:\t%s [-AbcdDFGhikLMPsvXy] [-e [-V] [-p ...]] " + "[-I ]\n" + "\t\t[-o =]... [-t ] [-U ] [-x ]\n" + "\t\t[[/] [ ...]]\n" + "\t%s [-AdiPv] [-e [-V] [-p ...]] [-U ]\n" + "\t\t[[/] [ ...]\n" + "\t%s [-v] \n" + "\t%s -C [-A] [-U ]\n" + "\t%s -l [-Aqu] \n" + "\t%s -m [-AFLPX] [-e [-V] [-p ...]] [-t ] " + "[-U ]\n\t\t [ [ ...]]\n" + "\t%s -O \n" + "\t%s -R [-A] [-e [-V] [-p ...]] [-U ]\n" + "\t\t ::[:]\n" + "\t%s -E [-A] word0:word1:...:word15\n" + "\t%s -S [-AP] [-e [-V] [-p ...]] [-U ] " + "\n\n", + cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, + cmdname, cmdname, cmdname); + + (void) fprintf(stderr, " Dataset name must include at least one " + "separator character '/' or '@'\n"); + (void) fprintf(stderr, " If dataset name is specified, only that " + "dataset is dumped\n"); + (void) fprintf(stderr, " If object numbers or object number " + "ranges are specified, only those\n" + " objects or ranges are dumped.\n\n"); + (void) fprintf(stderr, + " Object ranges take the form :[:]\n" + " start Starting object number\n" + " end Ending object number, or -1 for no upper bound\n" + " flags Optional flags to select object types:\n" + " A All objects (this is the default)\n" + " d ZFS directories\n" + " f ZFS files \n" + " m SPA space maps\n" + " z ZAPs\n" + " - Negate effect of next flag\n\n"); + (void) fprintf(stderr, " Options to control amount of output:\n"); + (void) fprintf(stderr, " -b block statistics\n"); + (void) fprintf(stderr, " -c checksum all metadata (twice for " + "all data) blocks\n"); + (void) fprintf(stderr, " -C config (or cachefile if alone)\n"); + (void) fprintf(stderr, " -d dataset(s)\n"); + (void) fprintf(stderr, " -D dedup statistics\n"); + (void) fprintf(stderr, " -E decode and display block from an " + "embedded block pointer\n"); + (void) fprintf(stderr, " -h pool history\n"); + (void) fprintf(stderr, " -i intent logs\n"); + (void) fprintf(stderr, " -l read label contents\n"); + (void) fprintf(stderr, " -k examine the checkpointed state " + "of the pool\n"); + (void) fprintf(stderr, " -L disable leak tracking (do not " + "load spacemaps)\n"); + (void) fprintf(stderr, " -m metaslabs\n"); + (void) fprintf(stderr, " -M metaslab groups\n"); + (void) fprintf(stderr, " -O perform object lookups by path\n"); + (void) fprintf(stderr, " -R read and display block from a " + "device\n"); + (void) fprintf(stderr, " -s report stats on zdb's I/O\n"); + (void) fprintf(stderr, " -S simulate dedup to measure effect\n"); + (void) fprintf(stderr, " -v verbose (applies to all " + "others)\n"); + (void) fprintf(stderr, " -y perform livelist and metaslab " + "validation on any livelists being deleted\n\n"); + (void) fprintf(stderr, " Below options are intended for use " + "with other options:\n"); + (void) fprintf(stderr, " -A ignore assertions (-A), enable " + "panic recovery (-AA) or both (-AAA)\n"); + (void) fprintf(stderr, " -e pool is exported/destroyed/" + "has altroot/not in a cachefile\n"); + (void) fprintf(stderr, " -F attempt automatic rewind within " + "safe range of transaction groups\n"); + (void) fprintf(stderr, " -G dump zfs_dbgmsg buffer before " + "exiting\n"); + (void) fprintf(stderr, " -I -- " + "specify the maximum number of\n " + "checksumming I/Os [default is 200]\n"); + (void) 
fprintf(stderr, " -o = set global "
	    "variable to an unsigned 32-bit integer\n");
+	(void) fprintf(stderr, " -p -- use one or more with "
+	    "-e to specify path to vdev dir\n");
+	(void) fprintf(stderr, " -P print numbers in parseable form\n");
+	(void) fprintf(stderr, " -q don't print label contents\n");
+	(void) fprintf(stderr, " -t -- highest txg to use when "
+	    "searching for uberblocks\n");
+	(void) fprintf(stderr, " -u uberblock\n");
+	(void) fprintf(stderr, " -U -- use alternate "
+	    "cachefile\n");
+	(void) fprintf(stderr, " -V do verbatim import\n");
+	(void) fprintf(stderr, " -x -- "
+	    "dump all read blocks into specified directory\n");
+	(void) fprintf(stderr, " -X attempt extreme rewind (does not "
+	    "work with dataset)\n");
+	(void) fprintf(stderr, " -Y attempt all reconstruction "
+	    "combinations for split blocks\n");
+	(void) fprintf(stderr, " -Z show ZSTD headers\n");
+	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
+	    "to make only that option verbose\n");
+	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
+	exit(1);
+}
+
+static void
+dump_debug_buffer(void)
+{
+	if (dump_opt['G']) {
+		(void) printf("\n");
+		(void) fflush(stdout);
+		zfs_dbgmsg_print("zdb");
+	}
+}
+
+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
+ */
+
+static void
+fatal(const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	(void) fprintf(stderr, "%s: ", cmdname);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	(void) fprintf(stderr, "\n");
+
+	dump_debug_buffer();
+
+	exit(1);
+}
+
+/* ARGSUSED */
+static void
+dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	nvlist_t *nv;
+	size_t nvsize = *(uint64_t *)data;
+	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);
+
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));
+
+	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);
+
+	umem_free(packed, nvsize);
+
+	dump_nvlist(nv, 8);
+
+	nvlist_free(nv);
+}
+
+/* ARGSUSED */
+static void
+dump_history_offsets(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	spa_history_phys_t *shp = data;
+
+	if (shp == NULL)
+		return;
+
+	(void) printf("\t\tpool_create_len = %llu\n",
+	    (u_longlong_t)shp->sh_pool_create_len);
+	(void) printf("\t\tphys_max_off = %llu\n",
+	    (u_longlong_t)shp->sh_phys_max_off);
+	(void) printf("\t\tbof = %llu\n",
+	    (u_longlong_t)shp->sh_bof);
+	(void) printf("\t\teof = %llu\n",
+	    (u_longlong_t)shp->sh_eof);
+	(void) printf("\t\trecords_lost = %llu\n",
+	    (u_longlong_t)shp->sh_records_lost);
+}
+
+static void
+zdb_nicenum(uint64_t num, char *buf, size_t buflen)
+{
+	if (dump_opt['P'])
+		(void) snprintf(buf, buflen, "%llu", (u_longlong_t)num);
+	else
+		nicenum(num, buf, buflen);
+}
+
+static const char histo_stars[] = "****************************************";
+static const uint64_t histo_width = sizeof (histo_stars) - 1;
+
+static void
+dump_histogram(const uint64_t *histo, int size, int offset)
+{
+	int i;
+	int minidx = size - 1;
+	int maxidx = 0;
+	uint64_t max = 0;
+
+	for (i = 0; i < size; i++) {
+		if (histo[i] > max)
+			max = histo[i];
+		if (histo[i] > 0 && i > maxidx)
+			maxidx = i;
+		if (histo[i] > 0 && i < minidx)
+			minidx = i;
+	}
+
+	if (max < histo_width)
+		max = histo_width;
+
+	for (i = minidx; i <= maxidx; i++) {
+		(void) printf("\t\t\t%3u: %6llu %s\n",
+		    i + offset, (u_longlong_t)histo[i],
+		    &histo_stars[(max - histo[i]) * histo_width / max]);
+	}
+}
+
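+/*
+ * Example of dump_histogram() output, using made-up counts
+ * histo = { 0, 40, 20, 10 } with size = 4 and offset = 0. Here max is 40,
+ * each bar starts at &histo_stars[(max - histo[i]) * histo_width / max],
+ * and empty buckets outside [minidx, maxidx] are suppressed:
+ *
+ *	1:     40 ****************************************
+ *	2:     20 ********************
+ *	3:     10 **********
+ */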
+static void +dump_zap_stats(objset_t *os, uint64_t object) +{ + int error; + zap_stats_t zs; + + error = zap_get_stats(os, object, &zs); + if (error) + return; + + if (zs.zs_ptrtbl_len == 0) { + ASSERT(zs.zs_num_blocks == 1); + (void) printf("\tmicrozap: %llu bytes, %llu entries\n", + (u_longlong_t)zs.zs_blocksize, + (u_longlong_t)zs.zs_num_entries); + return; + } + + (void) printf("\tFat ZAP stats:\n"); + + (void) printf("\t\tPointer table:\n"); + (void) printf("\t\t\t%llu elements\n", + (u_longlong_t)zs.zs_ptrtbl_len); + (void) printf("\t\t\tzt_blk: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_blk); + (void) printf("\t\t\tzt_numblks: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_numblks); + (void) printf("\t\t\tzt_shift: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_zt_shift); + (void) printf("\t\t\tzt_blks_copied: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_blks_copied); + (void) printf("\t\t\tzt_nextblk: %llu\n", + (u_longlong_t)zs.zs_ptrtbl_nextblk); + + (void) printf("\t\tZAP entries: %llu\n", + (u_longlong_t)zs.zs_num_entries); + (void) printf("\t\tLeaf blocks: %llu\n", + (u_longlong_t)zs.zs_num_leafs); + (void) printf("\t\tTotal blocks: %llu\n", + (u_longlong_t)zs.zs_num_blocks); + (void) printf("\t\tzap_block_type: 0x%llx\n", + (u_longlong_t)zs.zs_block_type); + (void) printf("\t\tzap_magic: 0x%llx\n", + (u_longlong_t)zs.zs_magic); + (void) printf("\t\tzap_salt: 0x%llx\n", + (u_longlong_t)zs.zs_salt); + + (void) printf("\t\tLeafs with 2^n pointers:\n"); + dump_histogram(zs.zs_leafs_with_2n_pointers, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBlocks with n*5 entries:\n"); + dump_histogram(zs.zs_blocks_with_n5_entries, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBlocks n/10 full:\n"); + dump_histogram(zs.zs_blocks_n_tenths_full, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tEntries with n chunks:\n"); + dump_histogram(zs.zs_entries_using_n_chunks, ZAP_HISTOGRAM_SIZE, 0); + + (void) printf("\t\tBuckets with n entries:\n"); + dump_histogram(zs.zs_buckets_with_n_entries, ZAP_HISTOGRAM_SIZE, 0); +} + +/*ARGSUSED*/ +static void +dump_none(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_unknown(objset_t *os, uint64_t object, void *data, size_t size) +{ + (void) printf("\tUNKNOWN OBJECT TYPE\n"); +} + +/*ARGSUSED*/ +static void +dump_uint8(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_uint64(objset_t *os, uint64_t object, void *data, size_t size) +{ + uint64_t *arr; + uint64_t oursize; + if (dump_opt['d'] < 6) + return; + + if (data == NULL) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, object, &doi)); + size = doi.doi_max_offset; + /* + * We cap the size at 1 mebibyte here to prevent + * allocation failures and nigh-infinite printing if the + * object is extremely large. + */ + oursize = MIN(size, 1 << 20); + arr = kmem_alloc(oursize, KM_SLEEP); + + int err = dmu_read(os, object, 0, oursize, arr, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + kmem_free(arr, oursize); + return; + } + } else { + /* + * Even though the allocation is already done in this code path, + * we still cap the size to prevent excessive printing. 
+ */ + oursize = MIN(size, 1 << 20); + arr = data; + } + + if (size == 0) { + (void) printf("\t\t[]\n"); + return; + } + + (void) printf("\t\t[%0llx", (u_longlong_t)arr[0]); + for (size_t i = 1; i * sizeof (uint64_t) < oursize; i++) { + if (i % 4 != 0) + (void) printf(", %0llx", (u_longlong_t)arr[i]); + else + (void) printf(",\n\t\t%0llx", (u_longlong_t)arr[i]); + } + if (oursize != size) + (void) printf(", ... "); + (void) printf("]\n"); + + if (data == NULL) + kmem_free(arr, oursize); +} + +/*ARGSUSED*/ +static void +dump_zap(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + void *prop; + unsigned i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + prop = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + (void) zap_lookup(os, object, attr.za_name, + attr.za_integer_length, attr.za_num_integers, prop); + if (attr.za_integer_length == 1) { + (void) printf("%s", (char *)prop); + } else { + for (i = 0; i < attr.za_num_integers; i++) { + switch (attr.za_integer_length) { + case 2: + (void) printf("%u ", + ((uint16_t *)prop)[i]); + break; + case 4: + (void) printf("%u ", + ((uint32_t *)prop)[i]); + break; + case 8: + (void) printf("%lld ", + (u_longlong_t)((int64_t *)prop)[i]); + break; + } + } + } + (void) printf("\n"); + umem_free(prop, attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +static void +dump_bpobj(objset_t *os, uint64_t object, void *data, size_t size) +{ + bpobj_phys_t *bpop = data; + uint64_t i; + char bytes[32], comp[32], uncomp[32]; + + /* make sure the output won't get truncated */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + + if (bpop == NULL) + return; + + zdb_nicenum(bpop->bpo_bytes, bytes, sizeof (bytes)); + zdb_nicenum(bpop->bpo_comp, comp, sizeof (comp)); + zdb_nicenum(bpop->bpo_uncomp, uncomp, sizeof (uncomp)); + + (void) printf("\t\tnum_blkptrs = %llu\n", + (u_longlong_t)bpop->bpo_num_blkptrs); + (void) printf("\t\tbytes = %s\n", bytes); + if (size >= BPOBJ_SIZE_V1) { + (void) printf("\t\tcomp = %s\n", comp); + (void) printf("\t\tuncomp = %s\n", uncomp); + } + if (size >= BPOBJ_SIZE_V2) { + (void) printf("\t\tsubobjs = %llu\n", + (u_longlong_t)bpop->bpo_subobjs); + (void) printf("\t\tnum_subobjs = %llu\n", + (u_longlong_t)bpop->bpo_num_subobjs); + } + if (size >= sizeof (*bpop)) { + (void) printf("\t\tnum_freed = %llu\n", + (u_longlong_t)bpop->bpo_num_freed); + } + + if (dump_opt['d'] < 5) + return; + + for (i = 0; i < bpop->bpo_num_blkptrs; i++) { + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t bp; + + int err = dmu_read(os, object, + i * sizeof (bp), sizeof (bp), &bp, 0); + if (err != 0) { + (void) printf("got error %u from dmu_read\n", err); + break; + } + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), &bp, + BP_GET_FREE(&bp)); + (void) printf("\t%s\n", blkbuf); + } +} + +/* ARGSUSED */ +static void +dump_bpobj_subobjs(objset_t *os, uint64_t object, void *data, size_t size) +{ + dmu_object_info_t doi; + int64_t i; + + VERIFY0(dmu_object_info(os, object, &doi)); + uint64_t *subobjs = kmem_alloc(doi.doi_max_offset, KM_SLEEP); + + int err = dmu_read(os, object, 0, doi.doi_max_offset, subobjs, 0); + if (err != 0) { + (void) printf("got 
error %u from dmu_read\n", err); + kmem_free(subobjs, doi.doi_max_offset); + return; + } + + int64_t last_nonzero = -1; + for (i = 0; i < doi.doi_max_offset / 8; i++) { + if (subobjs[i] != 0) + last_nonzero = i; + } + + for (i = 0; i <= last_nonzero; i++) { + (void) printf("\t%llu\n", (u_longlong_t)subobjs[i]); + } + kmem_free(subobjs, doi.doi_max_offset); +} + +/*ARGSUSED*/ +static void +dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size) +{ + dump_zap_stats(os, object); + /* contents are printed elsewhere, properly decoded */ +} + +/*ARGSUSED*/ +static void +dump_sa_attrs(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = ", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + (void) printf(" %llx : [%d:%d:%d]\n", + (u_longlong_t)attr.za_first_integer, + (int)ATTR_LENGTH(attr.za_first_integer), + (int)ATTR_BSWAP(attr.za_first_integer), + (int)ATTR_NUM(attr.za_first_integer)); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_sa_layouts(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + uint16_t *layout_attrs; + unsigned i; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = [", attr.za_name); + if (attr.za_num_integers == 0) { + (void) printf("\n"); + continue; + } + + VERIFY(attr.za_integer_length == 2); + layout_attrs = umem_zalloc(attr.za_num_integers * + attr.za_integer_length, UMEM_NOFAIL); + + VERIFY(zap_lookup(os, object, attr.za_name, + attr.za_integer_length, + attr.za_num_integers, layout_attrs) == 0); + + for (i = 0; i != attr.za_num_integers; i++) + (void) printf(" %d ", (int)layout_attrs[i]); + (void) printf("]\n"); + umem_free(layout_attrs, + attr.za_num_integers * attr.za_integer_length); + } + zap_cursor_fini(&zc); +} + +/*ARGSUSED*/ +static void +dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size) +{ + zap_cursor_t zc; + zap_attribute_t attr; + const char *typenames[] = { + /* 0 */ "not specified", + /* 1 */ "FIFO", + /* 2 */ "Character Device", + /* 3 */ "3 (invalid)", + /* 4 */ "Directory", + /* 5 */ "5 (invalid)", + /* 6 */ "Block Device", + /* 7 */ "7 (invalid)", + /* 8 */ "Regular File", + /* 9 */ "9 (invalid)", + /* 10 */ "Symbolic Link", + /* 11 */ "11 (invalid)", + /* 12 */ "Socket", + /* 13 */ "Door", + /* 14 */ "Event Port", + /* 15 */ "15 (invalid)", + }; + + dump_zap_stats(os, object); + (void) printf("\n"); + + for (zap_cursor_init(&zc, os, object); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + (void) printf("\t\t%s = %lld (type: %s)\n", + attr.za_name, ZFS_DIRENT_OBJ(attr.za_first_integer), + typenames[ZFS_DIRENT_TYPE(attr.za_first_integer)]); + } + zap_cursor_fini(&zc); +} + +static int +get_dtl_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_ops->vdev_op_leaf) { + space_map_t *sm = vd->vdev_dtl_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + return (1); + return (0); + } + + for (unsigned c = 0; c < vd->vdev_children; c++) + refcount += get_dtl_refcount(vd->vdev_child[c]); + return (refcount); +} + +static int +get_metaslab_refcount(vdev_t *vd) +{ + int refcount = 0; + + 
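+	/*
+	 * A metaslab space map whose bonus buffer has grown to the full
+	 * space_map_phys_t carries the on-disk histogram and therefore
+	 * holds one reference on the SPACEMAP_HISTOGRAM feature; these
+	 * counts are compared in verify_spacemap_refcounts().
+	 */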
if (vd->vdev_top == vd) { + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + space_map_t *sm = vd->vdev_ms[m]->ms_sm; + + if (sm != NULL && + sm->sm_dbuf->db_size == sizeof (space_map_phys_t)) + refcount++; + } + } + for (unsigned c = 0; c < vd->vdev_children; c++) + refcount += get_metaslab_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +get_obsolete_refcount(vdev_t *vd) +{ + uint64_t obsolete_sm_object; + int refcount = 0; + + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (vd->vdev_top == vd && obsolete_sm_object != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(vd->vdev_spa->spa_meta_objset, + obsolete_sm_object, &doi)); + if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { + refcount++; + } + } else { + ASSERT3P(vd->vdev_obsolete_sm, ==, NULL); + ASSERT3U(obsolete_sm_object, ==, 0); + } + for (unsigned c = 0; c < vd->vdev_children; c++) { + refcount += get_obsolete_refcount(vd->vdev_child[c]); + } + + return (refcount); +} + +static int +get_prev_obsolete_spacemap_refcount(spa_t *spa) +{ + uint64_t prev_obj = + spa->spa_condensing_indirect_phys.scip_prev_obsolete_sm_object; + if (prev_obj != 0) { + dmu_object_info_t doi; + VERIFY0(dmu_object_info(spa->spa_meta_objset, prev_obj, &doi)); + if (doi.doi_bonus_size == sizeof (space_map_phys_t)) { + return (1); + } + } + return (0); +} + +static int +get_checkpoint_refcount(vdev_t *vd) +{ + int refcount = 0; + + if (vd->vdev_top == vd && vd->vdev_top_zap != 0 && + zap_contains(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) == 0) + refcount++; + + for (uint64_t c = 0; c < vd->vdev_children; c++) + refcount += get_checkpoint_refcount(vd->vdev_child[c]); + + return (refcount); +} + +static int +get_log_spacemap_refcount(spa_t *spa) +{ + return (avl_numnodes(&spa->spa_sm_logs_by_txg)); +} + +static int +verify_spacemap_refcounts(spa_t *spa) +{ + uint64_t expected_refcount = 0; + uint64_t actual_refcount; + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_SPACEMAP_HISTOGRAM], + &expected_refcount); + actual_refcount = get_dtl_refcount(spa->spa_root_vdev); + actual_refcount += get_metaslab_refcount(spa->spa_root_vdev); + actual_refcount += get_obsolete_refcount(spa->spa_root_vdev); + actual_refcount += get_prev_obsolete_spacemap_refcount(spa); + actual_refcount += get_checkpoint_refcount(spa->spa_root_vdev); + actual_refcount += get_log_spacemap_refcount(spa); + + if (expected_refcount != actual_refcount) { + (void) printf("space map refcount mismatch: expected %lld != " + "actual %lld\n", + (longlong_t)expected_refcount, + (longlong_t)actual_refcount); + return (2); + } + return (0); +} + +static void +dump_spacemap(objset_t *os, space_map_t *sm) +{ + const char *ddata[] = { "ALLOC", "FREE", "CONDENSE", "INVALID", + "INVALID", "INVALID", "INVALID", "INVALID" }; + + if (sm == NULL) + return; + + (void) printf("space map object %llu:\n", + (longlong_t)sm->sm_object); + (void) printf(" smp_length = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_length); + (void) printf(" smp_alloc = 0x%llx\n", + (longlong_t)sm->sm_phys->smp_alloc); + + if (dump_opt['d'] < 6 && dump_opt['m'] < 4) + return; + + /* + * Print out the freelist entries in both encoded and decoded form. 
+ */ + uint8_t mapshift = sm->sm_shift; + int64_t alloc = 0; + uint64_t word, entry_id = 0; + for (uint64_t offset = 0; offset < space_map_length(sm); + offset += sizeof (word)) { + + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (word), &word, DMU_READ_PREFETCH)); + + if (sm_entry_is_debug(word)) { + uint64_t de_txg = SM_DEBUG_TXG_DECODE(word); + uint64_t de_sync_pass = SM_DEBUG_SYNCPASS_DECODE(word); + if (de_txg == 0) { + (void) printf( + "\t [%6llu] PADDING\n", + (u_longlong_t)entry_id); + } else { + (void) printf( + "\t [%6llu] %s: txg %llu pass %llu\n", + (u_longlong_t)entry_id, + ddata[SM_DEBUG_ACTION_DECODE(word)], + (u_longlong_t)de_txg, + (u_longlong_t)de_sync_pass); + } + entry_id++; + continue; + } + + uint8_t words; + char entry_type; + uint64_t entry_off, entry_run, entry_vdev = SM_NO_VDEVID; + + if (sm_entry_is_single_word(word)) { + entry_type = (SM_TYPE_DECODE(word) == SM_ALLOC) ? + 'A' : 'F'; + entry_off = (SM_OFFSET_DECODE(word) << mapshift) + + sm->sm_start; + entry_run = SM_RUN_DECODE(word) << mapshift; + words = 1; + } else { + /* it is a two-word entry so we read another word */ + ASSERT(sm_entry_is_double_word(word)); + + uint64_t extra_word; + offset += sizeof (extra_word); + VERIFY0(dmu_read(os, space_map_object(sm), offset, + sizeof (extra_word), &extra_word, + DMU_READ_PREFETCH)); + + ASSERT3U(offset, <=, space_map_length(sm)); + + entry_run = SM2_RUN_DECODE(word) << mapshift; + entry_vdev = SM2_VDEV_DECODE(word); + entry_type = (SM2_TYPE_DECODE(extra_word) == SM_ALLOC) ? + 'A' : 'F'; + entry_off = (SM2_OFFSET_DECODE(extra_word) << + mapshift) + sm->sm_start; + words = 2; + } + + (void) printf("\t [%6llu] %c range:" + " %010llx-%010llx size: %06llx vdev: %06llu words: %u\n", + (u_longlong_t)entry_id, + entry_type, (u_longlong_t)entry_off, + (u_longlong_t)(entry_off + entry_run), + (u_longlong_t)entry_run, + (u_longlong_t)entry_vdev, words); + + if (entry_type == 'A') + alloc += entry_run; + else + alloc -= entry_run; + entry_id++; + } + if (alloc != space_map_allocated(sm)) { + (void) printf("space_map_object alloc (%lld) INCONSISTENT " + "with space map summary (%lld)\n", + (longlong_t)space_map_allocated(sm), (longlong_t)alloc); + } +} + +static void +dump_metaslab_stats(metaslab_t *msp) +{ + char maxbuf[32]; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *t = &msp->ms_allocatable_by_size; + int free_pct = range_tree_space(rt) * 100 / msp->ms_size; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (maxbuf) >= NN_NUMBUF_SZ); + + zdb_nicenum(metaslab_largest_allocatable(msp), maxbuf, sizeof (maxbuf)); + + (void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n", + "segments", zfs_btree_numnodes(t), "maxsize", maxbuf, + "freepct", free_pct); + (void) printf("\tIn-memory histogram:\n"); + dump_histogram(rt->rt_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void +dump_metaslab(metaslab_t *msp) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + char freebuf[32]; + + zdb_nicenum(msp->ms_size - space_map_allocated(sm), freebuf, + sizeof (freebuf)); + + (void) printf( + "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n", + (u_longlong_t)msp->ms_id, (u_longlong_t)msp->ms_start, + (u_longlong_t)space_map_object(sm), freebuf); + + if (dump_opt['m'] > 2 && !dump_opt['L']) { + mutex_enter(&msp->ms_lock); + VERIFY0(metaslab_load(msp)); + range_tree_stat_verify(msp->ms_allocatable); + dump_metaslab_stats(msp); + metaslab_unload(msp); + mutex_exit(&msp->ms_lock); + } + + if
(dump_opt['m'] > 1 && sm != NULL && + spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + /* + * The space map histogram represents free space in chunks + * of sm_shift (i.e. bucket 0 refers to 2^sm_shift). + */ + (void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n", + (u_longlong_t)msp->ms_fragmentation); + dump_histogram(sm->sm_phys->smp_histogram, + SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift); + } + + ASSERT(msp->ms_size == (1ULL << vd->vdev_ms_shift)); + dump_spacemap(spa->spa_meta_objset, msp->ms_sm); + + if (spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + (void) printf("\tFlush data:\n\tunflushed txg=%llu\n\n", + (u_longlong_t)metaslab_unflushed_txg(msp)); + } +} + +static void +print_vdev_metaslab_header(vdev_t *vd) +{ + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *bias_str = ""; + if (alloc_bias == VDEV_BIAS_LOG || vd->vdev_islog) { + bias_str = VDEV_ALLOC_BIAS_LOG; + } else if (alloc_bias == VDEV_BIAS_SPECIAL) { + bias_str = VDEV_ALLOC_BIAS_SPECIAL; + } else if (alloc_bias == VDEV_BIAS_DEDUP) { + bias_str = VDEV_ALLOC_BIAS_DEDUP; + } + + uint64_t ms_flush_data_obj = 0; + if (vd->vdev_top_zap != 0) { + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &ms_flush_data_obj); + if (error != ENOENT) { + ASSERT0(error); + } + } + + (void) printf("\tvdev %10llu %s", + (u_longlong_t)vd->vdev_id, bias_str); + + if (ms_flush_data_obj != 0) { + (void) printf(" ms_unflushed_phys object %llu", + (u_longlong_t)ms_flush_data_obj); + } + + (void) printf("\n\t%-10s%5llu %-19s %-15s %-12s\n", + "metaslabs", (u_longlong_t)vd->vdev_ms_count, + "offset", "spacemap", "free"); + (void) printf("\t%15s %19s %15s %12s\n", + "---------------", "-------------------", + "---------------", "------------"); +} + +static void +dump_metaslab_groups(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + metaslab_class_t *mc = spa_normal_class(spa); + uint64_t fragmentation; + + metaslab_class_histogram_verify(mc); + + for (unsigned c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (mg == NULL || mg->mg_class != mc) + continue; + + metaslab_group_histogram_verify(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); + + (void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t" + "fragmentation", + (u_longlong_t)tvd->vdev_id, + (u_longlong_t)tvd->vdev_ms_count); + if (mg->mg_fragmentation == ZFS_FRAG_INVALID) { + (void) printf("%3s\n", "-"); + } else { + (void) printf("%3llu%%\n", + (u_longlong_t)mg->mg_fragmentation); + } + dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); + } + + (void) printf("\tpool %s\tfragmentation", spa_name(spa)); + fragmentation = metaslab_class_fragmentation(mc); + if (fragmentation == ZFS_FRAG_INVALID) + (void) printf("\t%3s\n", "-"); + else + (void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation); + dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0); +} + +static void +print_vdev_indirect(vdev_t *vd) +{ + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + if (vim == NULL) { + ASSERT3P(vib, ==, NULL); + return; + } + + ASSERT3U(vdev_indirect_mapping_object(vim), ==, + vic->vic_mapping_object); + ASSERT3U(vdev_indirect_births_object(vib), ==, + vic->vic_births_object); + + (void) printf("indirect births obj %llu:\n", + 
(longlong_t)vic->vic_births_object); + (void) printf(" vib_count = %llu\n", + (longlong_t)vdev_indirect_births_count(vib)); + for (uint64_t i = 0; i < vdev_indirect_births_count(vib); i++) { + vdev_indirect_birth_entry_phys_t *cur_vibe = + &vib->vib_entries[i]; + (void) printf("\toffset %llx -> txg %llu\n", + (longlong_t)cur_vibe->vibe_offset, + (longlong_t)cur_vibe->vibe_phys_birth_txg); + } + (void) printf("\n"); + + (void) printf("indirect mapping obj %llu:\n", + (longlong_t)vic->vic_mapping_object); + (void) printf(" vim_max_offset = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_max_offset(vim)); + (void) printf(" vim_bytes_mapped = 0x%llx\n", + (longlong_t)vdev_indirect_mapping_bytes_mapped(vim)); + (void) printf(" vim_count = %llu\n", + (longlong_t)vdev_indirect_mapping_num_entries(vim)); + + if (dump_opt['d'] <= 5 && dump_opt['m'] <= 3) + return; + + uint32_t *counts = vdev_indirect_mapping_load_obsolete_counts(vim); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + (void) printf("\t<%llx:%llx:%llx> -> " + "<%llx:%llx:%llx> (%x obsolete)\n", + (longlong_t)vd->vdev_id, + (longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + (longlong_t)DVA_GET_VDEV(&vimep->vimep_dst), + (longlong_t)DVA_GET_OFFSET(&vimep->vimep_dst), + (longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst), + counts[i]); + } + (void) printf("\n"); + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + objset_t *mos = vd->vdev_spa->spa_meta_objset; + (void) printf("obsolete space map object %llu:\n", + (u_longlong_t)obsolete_sm_object); + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(space_map_object(vd->vdev_obsolete_sm), ==, + obsolete_sm_object); + dump_spacemap(mos, vd->vdev_obsolete_sm); + (void) printf("\n"); + } +} + +static void +dump_metaslabs(spa_t *spa) +{ + vdev_t *vd, *rvd = spa->spa_root_vdev; + uint64_t m, c = 0, children = rvd->vdev_children; + + (void) printf("\nMetaslabs:\n"); + + if (!dump_opt['d'] && zopt_metaslab_args > 0) { + c = zopt_metaslab[0]; + + if (c >= children) + (void) fatal("bad vdev id: %llu", (u_longlong_t)c); + + if (zopt_metaslab_args > 1) { + vd = rvd->vdev_child[c]; + print_vdev_metaslab_header(vd); + + for (m = 1; m < zopt_metaslab_args; m++) { + if (zopt_metaslab[m] < vd->vdev_ms_count) + dump_metaslab( + vd->vdev_ms[zopt_metaslab[m]]); + else + (void) fprintf(stderr, "bad metaslab " + "number %llu\n", + (u_longlong_t)zopt_metaslab[m]); + } + (void) printf("\n"); + return; + } + children = c + 1; + } + for (; c < children; c++) { + vd = rvd->vdev_child[c]; + print_vdev_metaslab_header(vd); + + print_vdev_indirect(vd); + + for (m = 0; m < vd->vdev_ms_count; m++) + dump_metaslab(vd->vdev_ms[m]); + (void) printf("\n"); + } +} + +static void +dump_log_spacemaps(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + (void) printf("\nLog Space Maps in Pool:\n"); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + VERIFY0(space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + (void) printf("Log Spacemap object %llu txg %llu\n", + (u_longlong_t)sls->sls_sm_obj, (u_longlong_t)sls->sls_txg); + dump_spacemap(spa->spa_meta_objset, sm); + space_map_close(sm); + } + (void) printf("\n"); +} + +static void 
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index) +{ + const ddt_phys_t *ddp = dde->dde_phys; + const ddt_key_t *ddk = &dde->dde_key; + const char *types[4] = { "ditto", "single", "double", "triple" }; + char blkbuf[BP_SPRINTF_LEN]; + blkptr_t blk; + int p; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &blk); + (void) printf("index %llx refcnt %llu %s %s\n", + (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt, + types[p], blkbuf); + } +} + +static void +dump_dedup_ratio(const ddt_stat_t *dds) +{ + double rL, rP, rD, D, dedup, compress, copies; + + if (dds->dds_blocks == 0) + return; + + rL = (double)dds->dds_ref_lsize; + rP = (double)dds->dds_ref_psize; + rD = (double)dds->dds_ref_dsize; + D = (double)dds->dds_dsize; + + dedup = rD / D; + compress = rL / rP; + copies = rD / rP; + + (void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, " + "dedup * compress / copies = %.2f\n\n", + dedup, compress, copies, dedup * compress / copies); +} + +static void +dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + char name[DDT_NAMELEN]; + ddt_entry_t dde; + uint64_t walk = 0; + dmu_object_info_t doi; + uint64_t count, dspace, mspace; + int error; + + error = ddt_object_info(ddt, type, class, &doi); + + if (error == ENOENT) + return; + ASSERT(error == 0); + + error = ddt_object_count(ddt, type, class, &count); + ASSERT(error == 0); + if (count == 0) + return; + + dspace = doi.doi_physical_blocks_512 << 9; + mspace = doi.doi_fill_count * doi.doi_data_block_size; + + ddt_object_name(ddt, type, class, name); + + (void) printf("%s: %llu entries, size %llu on disk, %llu in core\n", + name, + (u_longlong_t)count, + (u_longlong_t)(dspace / count), + (u_longlong_t)(mspace / count)); + + if (dump_opt['D'] < 3) + return; + + zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]); + + if (dump_opt['D'] < 4) + return; + + if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE) + return; + + (void) printf("%s contents:\n\n", name); + + while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0) + dump_dde(ddt, &dde, walk); + + ASSERT3U(error, ==, ENOENT); + + (void) printf("\n"); +} + +static void +dump_all_ddts(spa_t *spa) +{ + ddt_histogram_t ddh_total; + ddt_stat_t dds_total; + + bzero(&ddh_total, sizeof (ddh_total)); + bzero(&dds_total, sizeof (dds_total)); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + dump_ddt(ddt, type, class); + } + } + } + + ddt_get_dedup_stats(spa, &dds_total); + + if (dds_total.dds_blocks == 0) { + (void) printf("All DDTs are empty\n"); + return; + } + + (void) printf("\n"); + + if (dump_opt['D'] > 1) { + (void) printf("DDT histogram (aggregated over all DDTs):\n"); + ddt_get_dedup_histogram(spa, &ddh_total); + zpool_dump_ddt(&dds_total, &ddh_total); + } + + dump_dedup_ratio(&dds_total); +} + +static void +dump_dtl_seg(void *arg, uint64_t start, uint64_t size) +{ + char *prefix = arg; + + (void) printf("%s [%llu,%llu) length %llu\n", + prefix, + (u_longlong_t)start, + (u_longlong_t)(start + size), + (u_longlong_t)(size)); +} + +static void +dump_dtl(vdev_t *vd, int indent) +{ + spa_t *spa = vd->vdev_spa; + boolean_t required; + const char *name[DTL_TYPES] = { "missing", "partial", "scrub", + "outage" }; + char 
prefix[256]; + + spa_vdev_state_enter(spa, SCL_NONE); + required = vdev_dtl_required(vd); + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (indent == 0) + (void) printf("\nDirty time logs:\n\n"); + + (void) printf("\t%*s%s [%s]\n", indent, "", + vd->vdev_path ? vd->vdev_path : + vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa), + required ? "DTL-required" : "DTL-expendable"); + + for (int t = 0; t < DTL_TYPES; t++) { + range_tree_t *rt = vd->vdev_dtl[t]; + if (range_tree_space(rt) == 0) + continue; + (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", + indent + 2, "", name[t]); + range_tree_walk(rt, dump_dtl_seg, prefix); + if (dump_opt['d'] > 5 && vd->vdev_children == 0) + dump_spacemap(spa->spa_meta_objset, + vd->vdev_dtl_sm); + } + + for (unsigned c = 0; c < vd->vdev_children; c++) + dump_dtl(vd->vdev_child[c], indent + 4); +} + +static void +dump_history(spa_t *spa) +{ + nvlist_t **events = NULL; + char *buf; + uint64_t resid, len, off = 0; + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; + + if ((buf = malloc(SPA_OLD_MAXBLOCKSIZE)) == NULL) { + (void) fprintf(stderr, "%s: unable to allocate I/O buffer\n", + __func__); + return; + } + + do { + len = SPA_OLD_MAXBLOCKSIZE; + + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) fprintf(stderr, "Unable to read history: " + "error %d\n", error); + free(buf); + return; + } + + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); + for (unsigned i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + boolean_t printed = B_FALSE; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + goto next; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + goto next; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) + goto next; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + zfs_history_event_names[ievent], + (longlong_t)txg, intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + printed = B_TRUE; + +next: + if (dump_opt['h'] > 1) { + if (!printed) + (void) printf("unrecognized record:\n"); + dump_nvlist(events[i], 2); + } + } + free(buf); +} + +/*ARGSUSED*/ +static void +dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +static uint64_t +blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_phys_t *zb) +{ + if (dnp == NULL) { + ASSERT(zb->zb_level < 0); + if (zb->zb_object == 0) + return (zb->zb_blkid); + return (zb->zb_blkid * BP_GET_LSIZE(bp)); + } + + ASSERT(zb->zb_level >= 0); + + return ((zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); +} + +static void +snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, + const blkptr_t *bp) +{ + abd_t *pabd; + void *buf; + zio_t *zio; + zfs_zstdhdr_t zstd_hdr; + int error; + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_ZSTD) + return; + + if (BP_IS_HOLE(bp)) + return; + + if (BP_IS_EMBEDDED(bp)) { + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + 
(void) fprintf(stderr, "out of memory\n"); + exit(1); + } + decode_embedded_bp_compressed(bp, buf); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + free(buf); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:EMBEDDED", + zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + return; + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + zio = zio_root(spa, NULL, NULL, 0); + + /* Decrypt but don't decompress so we can read the compression header */ + zio_nowait(zio_read(zio, spa, bp, pabd, BP_GET_PSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW_COMPRESS, + NULL)); + error = zio_wait(zio); + if (error) { + (void) fprintf(stderr, "read failed: %d\n", error); + return; + } + buf = abd_borrow_buf_copy(pabd, BP_GET_LSIZE(bp)); + memcpy(&zstd_hdr, buf, sizeof (zstd_hdr)); + zstd_hdr.c_len = BE_32(zstd_hdr.c_len); + zstd_hdr.raw_version_level = BE_32(zstd_hdr.raw_version_level); + + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + " ZSTD:size=%u:version=%u:level=%u:NORMAL", + zstd_hdr.c_len, zstd_hdr.version, zstd_hdr.level); + + abd_return_buf_copy(pabd, buf, BP_GET_LSIZE(bp)); +} + +static void +snprintf_blkptr_compact(char *blkbuf, size_t buflen, const blkptr_t *bp, + boolean_t bp_freed) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = dump_opt['d'] > 5 ? BP_GET_NDVAS(bp) : 1; + int i; + + if (dump_opt['b'] >= 6) { + snprintf_blkptr(blkbuf, buflen, bp); + if (bp_freed) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + } + return; + } + + if (BP_IS_EMBEDDED(bp)) { + (void) sprintf(blkbuf, + "EMBEDDED et=%u %llxL/%llxP B=%llu", + (int)BPE_GET_ETYPE(bp), + (u_longlong_t)BPE_GET_LSIZE(bp), + (u_longlong_t)BPE_GET_PSIZE(bp), + (u_longlong_t)bp->blk_birth); + return; + } + + blkbuf[0] = '\0'; + + for (i = 0; i < ndvas; i++) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), "%llu:%llx:%llx ", + (u_longlong_t)DVA_GET_VDEV(&dva[i]), + (u_longlong_t)DVA_GET_OFFSET(&dva[i]), + (u_longlong_t)DVA_GET_ASIZE(&dva[i])); + + if (BP_IS_HOLE(bp)) { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL B=%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)bp->blk_birth); + } else { + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", + (u_longlong_t)BP_GET_LSIZE(bp), + (u_longlong_t)BP_GET_PSIZE(bp), + (u_longlong_t)BP_GET_FILL(bp), + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); + if (bp_freed) + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " %s", "FREE"); + (void) snprintf(blkbuf + strlen(blkbuf), + buflen - strlen(blkbuf), " cksum=%llx:%llx:%llx:%llx", + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + } +} + +static void +print_indirect(spa_t *spa, blkptr_t *bp, const zbookmark_phys_t *zb, + const dnode_phys_t *dnp) +{ + char blkbuf[BP_SPRINTF_LEN]; + int l; + + if (!BP_IS_EMBEDDED(bp)) { + ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); + ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); + } + + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); + + ASSERT(zb->zb_level >= 0); + + for (l = dnp->dn_nlevels - 1; l >= -1; l--) { + if (l == zb->zb_level) { + (void) printf("L%llx", 
(u_longlong_t)zb->zb_level); + } else { + (void) printf(" "); + } + } + + snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, B_FALSE); + if (dump_opt['Z'] && BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD) + snprintf_zstd_header(spa, blkbuf, sizeof (blkbuf), bp); + (void) printf("%s\n", blkbuf); +} + +static int +visit_indirect(spa_t *spa, const dnode_phys_t *dnp, + blkptr_t *bp, const zbookmark_phys_t *zb) +{ + int err = 0; + + if (bp->blk_birth == 0) + return (0); + + print_indirect(spa, bp, zb, dnp); + + if (BP_GET_LEVEL(bp) > 0 && !BP_IS_HOLE(bp)) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + uint64_t fill = 0; + ASSERT(!BP_IS_REDACTED(bp)); + + err = arc_read(NULL, spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err) + return (err); + ASSERT(buf->b_data); + + /* recursively visit blocks below this */ + cbp = buf->b_data; + for (i = 0; i < epb; i++, cbp++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = visit_indirect(spa, dnp, cbp, &czb); + if (err) + break; + fill += BP_GET_FILL(cbp); + } + if (!err) + ASSERT3U(fill, ==, BP_GET_FILL(bp)); + arc_buf_destroy(buf, &buf); + } + + return (err); +} + +/*ARGSUSED*/ +static void +dump_indirect(dnode_t *dn) +{ + dnode_phys_t *dnp = dn->dn_phys; + int j; + zbookmark_phys_t czb; + + (void) printf("Indirect blocks:\n"); + + SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), + dn->dn_object, dnp->dn_nlevels - 1, 0); + for (j = 0; j < dnp->dn_nblkptr; j++) { + czb.zb_blkid = j; + (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, + &dnp->dn_blkptr[j], &czb); + } + + (void) printf("\n"); +} + +/*ARGSUSED*/ +static void +dump_dsl_dir(objset_t *os, uint64_t object, void *data, size_t size) +{ + dsl_dir_phys_t *dd = data; + time_t crtime; + char nice[32]; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (nice) >= NN_NUMBUF_SZ); + + if (dd == NULL) + return; + + ASSERT3U(size, >=, sizeof (dsl_dir_phys_t)); + + crtime = dd->dd_creation_time; + (void) printf("\t\tcreation_time = %s", ctime(&crtime)); + (void) printf("\t\thead_dataset_obj = %llu\n", + (u_longlong_t)dd->dd_head_dataset_obj); + (void) printf("\t\tparent_dir_obj = %llu\n", + (u_longlong_t)dd->dd_parent_obj); + (void) printf("\t\torigin_obj = %llu\n", + (u_longlong_t)dd->dd_origin_obj); + (void) printf("\t\tchild_dir_zapobj = %llu\n", + (u_longlong_t)dd->dd_child_dir_zapobj); + zdb_nicenum(dd->dd_used_bytes, nice, sizeof (nice)); + (void) printf("\t\tused_bytes = %s\n", nice); + zdb_nicenum(dd->dd_compressed_bytes, nice, sizeof (nice)); + (void) printf("\t\tcompressed_bytes = %s\n", nice); + zdb_nicenum(dd->dd_uncompressed_bytes, nice, sizeof (nice)); + (void) printf("\t\tuncompressed_bytes = %s\n", nice); + zdb_nicenum(dd->dd_quota, nice, sizeof (nice)); + (void) printf("\t\tquota = %s\n", nice); + zdb_nicenum(dd->dd_reserved, nice, sizeof (nice)); + (void) printf("\t\treserved = %s\n", nice); + (void) printf("\t\tprops_zapobj = %llu\n", + (u_longlong_t)dd->dd_props_zapobj); + (void) printf("\t\tdeleg_zapobj = %llu\n", + (u_longlong_t)dd->dd_deleg_zapobj); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)dd->dd_flags); + +#define DO(which) \ + zdb_nicenum(dd->dd_used_breakdown[DD_USED_ ## which], nice, \ + sizeof (nice)); \ + (void) printf("\t\tused_breakdown[" #which "] = %s\n", nice) + DO(HEAD); + DO(SNAP); + DO(CHILD); + DO(CHILD_RSRV); + DO(REFRSRV); 
+#undef DO + (void) printf("\t\tclones = %llu\n", + (u_longlong_t)dd->dd_clones); +} + +/*ARGSUSED*/ +static void +dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) +{ + dsl_dataset_phys_t *ds = data; + time_t crtime; + char used[32], compressed[32], uncompressed[32], unique[32]; + char blkbuf[BP_SPRINTF_LEN]; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (used) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (compressed) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncompressed) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (unique) >= NN_NUMBUF_SZ); + + if (ds == NULL) + return; + + ASSERT(size == sizeof (*ds)); + crtime = ds->ds_creation_time; + zdb_nicenum(ds->ds_referenced_bytes, used, sizeof (used)); + zdb_nicenum(ds->ds_compressed_bytes, compressed, sizeof (compressed)); + zdb_nicenum(ds->ds_uncompressed_bytes, uncompressed, + sizeof (uncompressed)); + zdb_nicenum(ds->ds_unique_bytes, unique, sizeof (unique)); + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ds->ds_bp); + + (void) printf("\t\tdir_obj = %llu\n", + (u_longlong_t)ds->ds_dir_obj); + (void) printf("\t\tprev_snap_obj = %llu\n", + (u_longlong_t)ds->ds_prev_snap_obj); + (void) printf("\t\tprev_snap_txg = %llu\n", + (u_longlong_t)ds->ds_prev_snap_txg); + (void) printf("\t\tnext_snap_obj = %llu\n", + (u_longlong_t)ds->ds_next_snap_obj); + (void) printf("\t\tsnapnames_zapobj = %llu\n", + (u_longlong_t)ds->ds_snapnames_zapobj); + (void) printf("\t\tnum_children = %llu\n", + (u_longlong_t)ds->ds_num_children); + (void) printf("\t\tuserrefs_obj = %llu\n", + (u_longlong_t)ds->ds_userrefs_obj); + (void) printf("\t\tcreation_time = %s", ctime(&crtime)); + (void) printf("\t\tcreation_txg = %llu\n", + (u_longlong_t)ds->ds_creation_txg); + (void) printf("\t\tdeadlist_obj = %llu\n", + (u_longlong_t)ds->ds_deadlist_obj); + (void) printf("\t\tused_bytes = %s\n", used); + (void) printf("\t\tcompressed_bytes = %s\n", compressed); + (void) printf("\t\tuncompressed_bytes = %s\n", uncompressed); + (void) printf("\t\tunique = %s\n", unique); + (void) printf("\t\tfsid_guid = %llu\n", + (u_longlong_t)ds->ds_fsid_guid); + (void) printf("\t\tguid = %llu\n", + (u_longlong_t)ds->ds_guid); + (void) printf("\t\tflags = %llx\n", + (u_longlong_t)ds->ds_flags); + (void) printf("\t\tnext_clones_obj = %llu\n", + (u_longlong_t)ds->ds_next_clones_obj); + (void) printf("\t\tprops_obj = %llu\n", + (u_longlong_t)ds->ds_props_obj); + (void) printf("\t\tbp = %s\n", blkbuf); +} + +/* ARGSUSED */ +static int +dump_bptree_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (bp->blk_birth != 0) { + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("\t%s\n", blkbuf); + } + return (0); +} + +static void +dump_bptree(objset_t *os, uint64_t obj, const char *name) +{ + char bytes[32]; + bptree_phys_t *bt; + dmu_buf_t *db; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + zdb_nicenum(bt->bt_bytes, bytes, sizeof (bytes)); + (void) printf("\n %s: %llu datasets, %s\n", + name, (unsigned long long)(bt->bt_end - bt->bt_begin), bytes); + dmu_buf_rele(db, FTAG); + + if (dump_opt['d'] < 5) + return; + + (void) printf("\n"); + + (void) bptree_iterate(os, obj, B_FALSE, dump_bptree_cb, NULL, NULL); +} + +/* ARGSUSED */ +static int +dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) +{ + char blkbuf[BP_SPRINTF_LEN]; + + ASSERT(bp->blk_birth != 0); + 
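/* bpobj entries are never holes, so every entry has a birth txg. */ +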
snprintf_blkptr_compact(blkbuf, sizeof (blkbuf), bp, bp_freed); + (void) printf("\t%s\n", blkbuf); + return (0); +} + +static void +dump_full_bpobj(bpobj_t *bpo, const char *name, int indent) +{ + char bytes[32]; + char comp[32]; + char uncomp[32]; + uint64_t i; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + zdb_nicenum(bpo->bpo_phys->bpo_bytes, bytes, sizeof (bytes)); + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + zdb_nicenum(bpo->bpo_phys->bpo_comp, comp, sizeof (comp)); + zdb_nicenum(bpo->bpo_phys->bpo_uncomp, uncomp, sizeof (uncomp)); + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu freed, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } else { + (void) printf(" %*s: object %llu, %llu local " + "blkptrs, %llu subobjs in object %llu, " + "%s (%s/%s comp)\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_subobjs, + (u_longlong_t)bpo->bpo_phys->bpo_subobjs, + bytes, comp, uncomp); + } + + for (i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + dump_full_bpobj(&subbpo, "subobj", indent + 1); + bpobj_close(&subbpo); + } + } else { + if (bpo->bpo_havefreed) { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%llu freed, %s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + (u_longlong_t)bpo->bpo_phys->bpo_num_freed, + bytes); + } else { + (void) printf(" %*s: object %llu, %llu blkptrs, " + "%s\n", + indent * 8, name, + (u_longlong_t)bpo->bpo_object, + (u_longlong_t)bpo->bpo_phys->bpo_num_blkptrs, + bytes); + } + } + + if (dump_opt['d'] < 5) + return; + + + if (indent == 0) { + (void) bpobj_iterate_nofree(bpo, dump_bpobj_cb, NULL, NULL); + (void) printf("\n"); + } +} + +static int +dump_bookmark(dsl_pool_t *dp, char *name, boolean_t print_redact, + boolean_t print_list) +{ + int err = 0; + zfs_bookmark_phys_t prop; + objset_t *mos = dp->dp_spa->spa_meta_objset; + err = dsl_bookmark_lookup(dp, name, NULL, &prop); + + if (err != 0) { + return (err); + } + + (void) printf("\t#%s: ", strchr(name, '#') + 1); + (void) printf("{guid: %llx creation_txg: %llu creation_time: " + "%llu redaction_obj: %llu}\n", (u_longlong_t)prop.zbm_guid, + (u_longlong_t)prop.zbm_creation_txg, + (u_longlong_t)prop.zbm_creation_time, + (u_longlong_t)prop.zbm_redaction_obj); + + IMPLY(print_list, print_redact); + if (!print_redact || prop.zbm_redaction_obj == 0) + return (0); + + redaction_list_t *rl; + VERIFY0(dsl_redaction_list_hold_obj(dp, + prop.zbm_redaction_obj, FTAG, &rl)); + + redaction_list_phys_t *rlp = rl->rl_phys; + (void) printf("\tRedacted:\n\t\tProgress: "); + if (rlp->rlp_last_object != UINT64_MAX || + 
rlp->rlp_last_blkid != UINT64_MAX) { + (void) printf("%llu %llu (incomplete)\n", + (u_longlong_t)rlp->rlp_last_object, + (u_longlong_t)rlp->rlp_last_blkid); + } else { + (void) printf("complete\n"); + } + (void) printf("\t\tSnapshots: ["); + for (unsigned int i = 0; i < rlp->rlp_num_snaps; i++) { + if (i > 0) + (void) printf(", "); + (void) printf("%0llu", + (u_longlong_t)rlp->rlp_snaps[i]); + } + (void) printf("]\n\t\tLength: %llu\n", + (u_longlong_t)rlp->rlp_num_entries); + + if (!print_list) { + dsl_redaction_list_rele(rl, FTAG); + return (0); + } + + if (rlp->rlp_num_entries == 0) { + dsl_redaction_list_rele(rl, FTAG); + (void) printf("\t\tRedaction List: []\n\n"); + return (0); + } + + redact_block_phys_t *rbp_buf; + uint64_t size; + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(mos, prop.zbm_redaction_obj, &doi)); + size = doi.doi_max_offset; + rbp_buf = kmem_alloc(size, KM_SLEEP); + + err = dmu_read(mos, prop.zbm_redaction_obj, 0, size, + rbp_buf, 0); + if (err != 0) { + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + return (err); + } + + (void) printf("\t\tRedaction List: [{object: %llx, offset: " + "%llx, blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[0].rbp_object, + (u_longlong_t)rbp_buf[0].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[0])), + (u_longlong_t)redact_block_get_count(&rbp_buf[0])); + + for (size_t i = 1; i < rlp->rlp_num_entries; i++) { + (void) printf(",\n\t\t{object: %llx, offset: %llx, " + "blksz: %x, count: %llx}", + (u_longlong_t)rbp_buf[i].rbp_object, + (u_longlong_t)rbp_buf[i].rbp_blkid, + (uint_t)(redact_block_get_size(&rbp_buf[i])), + (u_longlong_t)redact_block_get_count(&rbp_buf[i])); + } + dsl_redaction_list_rele(rl, FTAG); + kmem_free(rbp_buf, size); + (void) printf("]\n\n"); + return (0); +} + +static void +dump_bookmarks(objset_t *os, int verbosity) +{ + zap_cursor_t zc; + zap_attribute_t attr; + dsl_dataset_t *ds = dmu_objset_ds(os); + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + objset_t *mos = os->os_spa->spa_meta_objset; + if (verbosity < 4) + return; + dsl_pool_config_enter(dp, FTAG); + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + zap_cursor_advance(&zc)) { + char osname[ZFS_MAX_DATASET_NAME_LEN]; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + dmu_objset_name(os, osname); + VERIFY3S(0, <=, snprintf(buf, sizeof (buf), "%s#%s", osname, + attr.za_name)); + (void) dump_bookmark(dp, buf, verbosity >= 5, verbosity >= 6); + } + zap_cursor_fini(&zc); + dsl_pool_config_exit(dp, FTAG); +} + +static void +bpobj_count_refd(bpobj_t *bpo) +{ + mos_obj_refd(bpo->bpo_object); + + if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) { + mos_obj_refd(bpo->bpo_phys->bpo_subobjs); + for (uint64_t i = 0; i < bpo->bpo_phys->bpo_num_subobjs; i++) { + uint64_t subobj; + bpobj_t subbpo; + int error; + VERIFY0(dmu_read(bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + i * sizeof (subobj), sizeof (subobj), &subobj, 0)); + error = bpobj_open(&subbpo, bpo->bpo_os, subobj); + if (error != 0) { + (void) printf("ERROR %u while trying to open " + "subobj id %llu\n", + error, (u_longlong_t)subobj); + continue; + } + bpobj_count_refd(&subbpo); + bpobj_close(&subbpo); + } + } +} + +static int +dsl_deadlist_entry_count_refd(void *arg, dsl_deadlist_entry_t *dle) +{ + spa_t *spa = arg; + uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; + if (dle->dle_bpobj.bpo_object != empty_bpobj) + bpobj_count_refd(&dle->dle_bpobj); + return (0); +} + +static int +dsl_deadlist_entry_dump(void *arg, 
dsl_deadlist_entry_t *dle) +{ + ASSERT(arg == NULL); + if (dump_opt['d'] >= 5) { + char buf[128]; + (void) snprintf(buf, sizeof (buf), + "mintxg %llu -> obj %llu", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + + dump_full_bpobj(&dle->dle_bpobj, buf, 0); + } else { + (void) printf("mintxg %llu -> obj %llu\n", + (longlong_t)dle->dle_mintxg, + (longlong_t)dle->dle_bpobj.bpo_object); + } + return (0); +} + +static void +dump_blkptr_list(dsl_deadlist_t *dl, char *name) +{ + char bytes[32]; + char comp[32]; + char uncomp[32]; + char entries[32]; + spa_t *spa = dmu_objset_spa(dl->dl_os); + uint64_t empty_bpobj = spa->spa_dsl_pool->dp_empty_bpobj; + + if (dl->dl_oldfmt) { + if (dl->dl_bpobj.bpo_object != empty_bpobj) + bpobj_count_refd(&dl->dl_bpobj); + } else { + mos_obj_refd(dl->dl_object); + dsl_deadlist_iterate(dl, dsl_deadlist_entry_count_refd, spa); + } + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (bytes) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (comp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (uncomp) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (entries) >= NN_NUMBUF_SZ); + + if (dump_opt['d'] < 3) + return; + + if (dl->dl_oldfmt) { + dump_full_bpobj(&dl->dl_bpobj, "old-format deadlist", 0); + return; + } + + zdb_nicenum(dl->dl_phys->dl_used, bytes, sizeof (bytes)); + zdb_nicenum(dl->dl_phys->dl_comp, comp, sizeof (comp)); + zdb_nicenum(dl->dl_phys->dl_uncomp, uncomp, sizeof (uncomp)); + zdb_nicenum(avl_numnodes(&dl->dl_tree), entries, sizeof (entries)); + (void) printf("\n %s: %s (%s/%s comp), %s entries\n", + name, bytes, comp, uncomp, entries); + + if (dump_opt['d'] < 4) + return; + + (void) printf("\n"); + + dsl_deadlist_iterate(dl, dsl_deadlist_entry_dump, NULL); +} + +static int +verify_dd_livelist(objset_t *os) +{ + uint64_t ll_used, used, ll_comp, comp, ll_uncomp, uncomp; + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + + ASSERT(!dmu_objset_is_snapshot(os)); + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return (0); + + /* Iterate through the livelist to check for duplicates */ + dsl_deadlist_iterate(&dd->dd_livelist, sublivelist_verify_lightweight, + NULL); + + dsl_pool_config_enter(dp, FTAG); + dsl_deadlist_space(&dd->dd_livelist, &ll_used, + &ll_comp, &ll_uncomp); + + dsl_dataset_t *origin_ds; + ASSERT(dsl_pool_config_held(dp)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin_ds)); + VERIFY0(dsl_dataset_space_written(origin_ds, os->os_dsl_dataset, + &used, &comp, &uncomp)); + dsl_dataset_rele(origin_ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + /* + * It's possible that the dataset's uncomp space is larger than the + * livelist's because livelists do not track embedded block pointers + */ + if (used != ll_used || comp != ll_comp || uncomp < ll_uncomp) { + char nice_used[32], nice_comp[32], nice_uncomp[32]; + (void) printf("Discrepancy in space accounting:\n"); + zdb_nicenum(used, nice_used, sizeof (nice_used)); + zdb_nicenum(comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("dir: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + zdb_nicenum(ll_used, nice_used, sizeof (nice_used)); + zdb_nicenum(ll_comp, nice_comp, sizeof (nice_comp)); + zdb_nicenum(ll_uncomp, nice_uncomp, sizeof (nice_uncomp)); + (void) printf("livelist: used %s, comp %s, uncomp %s\n", + nice_used, nice_comp, nice_uncomp); + return (1); + } + return (0); +} + +static avl_tree_t idx_tree; +static avl_tree_t domain_tree; 
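+ +/* + * Shared state for the ZPL object dumpers below: the FUID idx/domain + * trees are loaded on first use by dump_uidgid(), and sa_os tracks the + * objset whose SA attribute table is currently set up via open_objset(). + */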
+static boolean_t fuid_table_loaded; +static objset_t *sa_os = NULL; +static sa_attr_type_t *sa_attr_table = NULL; + +static int +open_objset(const char *path, void *tag, objset_t **osp) +{ + int err; + uint64_t sa_attrs = 0; + uint64_t version = 0; + + VERIFY3P(sa_os, ==, NULL); + /* + * We can't own an objset if it's redacted. Therefore, we do this + * dance: hold the objset, then acquire a long hold on its dataset, then + * release the pool (which is held as part of holding the objset). + */ + err = dmu_objset_hold(path, tag, osp); + if (err != 0) { + (void) fprintf(stderr, "failed to hold dataset '%s': %s\n", + path, strerror(err)); + return (err); + } + dsl_dataset_long_hold(dmu_objset_ds(*osp), tag); + dsl_pool_rele(dmu_objset_pool(*osp), tag); + + if (dmu_objset_type(*osp) == DMU_OST_ZFS && !(*osp)->os_encrypted) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &version); + if (version >= ZPL_VERSION_SA) { + (void) zap_lookup(*osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, + 8, 1, &sa_attrs); + } + err = sa_setup(*osp, sa_attrs, zfs_attr_table, ZPL_END, + &sa_attr_table); + if (err != 0) { + (void) fprintf(stderr, "sa_setup failed: %s\n", + strerror(err)); + dsl_dataset_long_rele(dmu_objset_ds(*osp), tag); + dsl_dataset_rele(dmu_objset_ds(*osp), tag); + *osp = NULL; + } + } + sa_os = *osp; + + return (0); +} + +static void +close_objset(objset_t *os, void *tag) +{ + VERIFY3P(os, ==, sa_os); + if (os->os_sa != NULL) + sa_tear_down(os); + dsl_dataset_long_rele(dmu_objset_ds(os), tag); + dsl_dataset_rele(dmu_objset_ds(os), tag); + sa_attr_table = NULL; + sa_os = NULL; +} + +static void +fuid_table_destroy(void) +{ + if (fuid_table_loaded) { + zfs_fuid_table_destroy(&idx_tree, &domain_tree); + fuid_table_loaded = B_FALSE; + } +} + +/* + * print uid or gid information. + * For normal POSIX id just the id is printed in decimal format. + * For CIFS files with FUID the fuid is printed in hex followed by + * the domain-rid string. + */ +static void +print_idstr(uint64_t id, const char *id_type) +{ + if (FUID_INDEX(id)) { + char *domain; + + domain = zfs_fuid_idx_domain(&idx_tree, FUID_INDEX(id)); + (void) printf("\t%s %llx [%s-%d]\n", id_type, + (u_longlong_t)id, domain, (int)FUID_RID(id)); + } else { + (void) printf("\t%s %llu\n", id_type, (u_longlong_t)id); + } + +} + +static void +dump_uidgid(objset_t *os, uint64_t uid, uint64_t gid) +{ + uint32_t uid_idx, gid_idx; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + + /* Load domain table, if not already loaded */ + if (!fuid_table_loaded && (uid_idx || gid_idx)) { + uint64_t fuid_obj; + + /* first find the fuid object. 
It lives in the master node */ + VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, + 8, 1, &fuid_obj) == 0); + zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); + (void) zfs_fuid_table_load(os, fuid_obj, + &idx_tree, &domain_tree); + fuid_table_loaded = B_TRUE; + } + + print_idstr(uid, "uid"); + print_idstr(gid, "gid"); +} + +static void +dump_znode_sa_xattr(sa_handle_t *hdl) +{ + nvlist_t *sa_xattr; + nvpair_t *elem = NULL; + int sa_xattr_size = 0; + int sa_xattr_entries = 0; + int error; + char *sa_xattr_packed; + + error = sa_size(hdl, sa_attr_table[ZPL_DXATTR], &sa_xattr_size); + if (error || sa_xattr_size == 0) + return; + + sa_xattr_packed = malloc(sa_xattr_size); + if (sa_xattr_packed == NULL) + return; + + error = sa_lookup(hdl, sa_attr_table[ZPL_DXATTR], + sa_xattr_packed, sa_xattr_size); + if (error) { + free(sa_xattr_packed); + return; + } + + error = nvlist_unpack(sa_xattr_packed, sa_xattr_size, &sa_xattr, 0); + if (error) { + free(sa_xattr_packed); + return; + } + + while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) + sa_xattr_entries++; + + (void) printf("\tSA xattrs: %d bytes, %d entries\n\n", + sa_xattr_size, sa_xattr_entries); + while ((elem = nvlist_next_nvpair(sa_xattr, elem)) != NULL) { + uchar_t *value; + uint_t cnt, idx; + + (void) printf("\t\t%s = ", nvpair_name(elem)); + nvpair_value_byte_array(elem, &value, &cnt); + for (idx = 0; idx < cnt; ++idx) { + if (isprint(value[idx])) + (void) putchar(value[idx]); + else + (void) printf("\\%3.3o", value[idx]); + } + (void) putchar('\n'); + } + + nvlist_free(sa_xattr); + free(sa_xattr_packed); +} + +static void +dump_znode_symlink(sa_handle_t *hdl) +{ + int sa_symlink_size = 0; + char linktarget[MAXPATHLEN]; + linktarget[0] = '\0'; + int error; + + error = sa_size(hdl, sa_attr_table[ZPL_SYMLINK], &sa_symlink_size); + if (error || sa_symlink_size == 0) { + return; + } + if (sa_lookup(hdl, sa_attr_table[ZPL_SYMLINK], + &linktarget, sa_symlink_size) == 0) + (void) printf("\ttarget %s\n", linktarget); +} + +/*ARGSUSED*/ +static void +dump_znode(objset_t *os, uint64_t object, void *data, size_t size) +{ + char path[MAXPATHLEN * 2]; /* allow for xattr and failure prefix */ + sa_handle_t *hdl; + uint64_t xattr, rdev, gen; + uint64_t uid, gid, mode, fsize, parent, links; + uint64_t pflags; + uint64_t acctm[2], modtm[2], chgtm[2], crtm[2]; + time_t z_crtime, z_atime, z_mtime, z_ctime; + sa_bulk_attr_t bulk[12]; + int idx = 0; + int error; + + VERIFY3P(os, ==, sa_os); + if (sa_handle_get(os, object, NULL, SA_HDL_PRIVATE, &hdl)) { + (void) printf("Failed to get handle for SA znode\n"); + return; + } + + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_UID], NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GID], NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_LINKS], NULL, + &links, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_GEN], NULL, &gen, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MODE], NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_PARENT], + NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_SIZE], NULL, + &fsize, 8); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_ATIME], NULL, + acctm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_MTIME], NULL, + modtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CRTIME], NULL, + crtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_CTIME], NULL, + chgtm, 16); + SA_ADD_BULK_ATTR(bulk, idx, sa_attr_table[ZPL_FLAGS], NULL, + &pflags, 8); + + if (sa_bulk_lookup(hdl, bulk, idx)) { 
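+ /* The bulk SA lookup failed; drop the handle and skip this znode. */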
+ (void) sa_handle_destroy(hdl); + return; + } + + z_crtime = (time_t)crtm[0]; + z_atime = (time_t)acctm[0]; + z_mtime = (time_t)modtm[0]; + z_ctime = (time_t)chgtm[0]; + + if (dump_opt['d'] > 4) { + error = zfs_obj_to_path(os, object, path, sizeof (path)); + if (error == ESTALE) { + (void) snprintf(path, sizeof (path), "on delete queue"); + } else if (error != 0) { + leaked_objects++; + (void) snprintf(path, sizeof (path), + "path not found, possibly leaked"); + } + (void) printf("\tpath %s\n", path); + } + + if (S_ISLNK(mode)) + dump_znode_symlink(hdl); + dump_uidgid(os, uid, gid); + (void) printf("\tatime %s", ctime(&z_atime)); + (void) printf("\tmtime %s", ctime(&z_mtime)); + (void) printf("\tctime %s", ctime(&z_ctime)); + (void) printf("\tcrtime %s", ctime(&z_crtime)); + (void) printf("\tgen %llu\n", (u_longlong_t)gen); + (void) printf("\tmode %llo\n", (u_longlong_t)mode); + (void) printf("\tsize %llu\n", (u_longlong_t)fsize); + (void) printf("\tparent %llu\n", (u_longlong_t)parent); + (void) printf("\tlinks %llu\n", (u_longlong_t)links); + (void) printf("\tpflags %llx\n", (u_longlong_t)pflags); + if (dmu_objset_projectquota_enabled(os) && (pflags & ZFS_PROJID)) { + uint64_t projid; + + if (sa_lookup(hdl, sa_attr_table[ZPL_PROJID], &projid, + sizeof (uint64_t)) == 0) + (void) printf("\tprojid %llu\n", (u_longlong_t)projid); + } + if (sa_lookup(hdl, sa_attr_table[ZPL_XATTR], &xattr, + sizeof (uint64_t)) == 0) + (void) printf("\txattr %llu\n", (u_longlong_t)xattr); + if (sa_lookup(hdl, sa_attr_table[ZPL_RDEV], &rdev, + sizeof (uint64_t)) == 0) + (void) printf("\trdev 0x%016llx\n", (u_longlong_t)rdev); + dump_znode_sa_xattr(hdl); + sa_handle_destroy(hdl); +} + +/*ARGSUSED*/ +static void +dump_acl(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +/*ARGSUSED*/ +static void +dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) +{ +} + +static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { + dump_none, /* unallocated */ + dump_zap, /* object directory */ + dump_uint64, /* object array */ + dump_none, /* packed nvlist */ + dump_packed_nvlist, /* packed nvlist size */ + dump_none, /* bpobj */ + dump_bpobj, /* bpobj header */ + dump_none, /* SPA space map header */ + dump_none, /* SPA space map */ + dump_none, /* ZIL intent log */ + dump_dnode, /* DMU dnode */ + dump_dmu_objset, /* DMU objset */ + dump_dsl_dir, /* DSL directory */ + dump_zap, /* DSL directory child map */ + dump_zap, /* DSL dataset snap map */ + dump_zap, /* DSL props */ + dump_dsl_dataset, /* DSL dataset */ + dump_znode, /* ZFS znode */ + dump_acl, /* ZFS V0 ACL */ + dump_uint8, /* ZFS plain file */ + dump_zpldir, /* ZFS directory */ + dump_zap, /* ZFS master node */ + dump_zap, /* ZFS delete queue */ + dump_uint8, /* zvol object */ + dump_zap, /* zvol prop */ + dump_uint8, /* other uint8[] */ + dump_uint64, /* other uint64[] */ + dump_zap, /* other ZAP */ + dump_zap, /* persistent error log */ + dump_uint8, /* SPA history */ + dump_history_offsets, /* SPA history offsets */ + dump_zap, /* Pool properties */ + dump_zap, /* DSL permissions */ + dump_acl, /* ZFS ACL */ + dump_uint8, /* ZFS SYSACL */ + dump_none, /* FUID nvlist */ + dump_packed_nvlist, /* FUID nvlist size */ + dump_zap, /* DSL dataset next clones */ + dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group/project used */ + dump_zap, /* ZFS user/group/project quota */ + dump_zap, /* snapshot refcount tags */ + dump_ddt_zap, /* DDT ZAP object */ + dump_zap, /* DDT statistics */ + dump_znode, /* SA object */ + 
dump_zap, /* SA Master Node */ + dump_sa_attrs, /* SA attribute registration */ + dump_sa_layouts, /* SA attribute layouts */ + dump_zap, /* DSL scrub translations */ + dump_none, /* fake dedup BP */ + dump_zap, /* deadlist */ + dump_none, /* deadlist hdr */ + dump_zap, /* dsl clones */ + dump_bpobj_subobjs, /* bpobj subobjs */ + dump_unknown, /* Unknown type, must be last */ +}; + +static boolean_t +match_object_type(dmu_object_type_t obj_type, uint64_t flags) +{ + boolean_t match = B_TRUE; + + switch (obj_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (!(flags & ZOR_FLAG_DIRECTORY)) + match = B_FALSE; + break; + case DMU_OT_PLAIN_FILE_CONTENTS: + if (!(flags & ZOR_FLAG_PLAIN_FILE)) + match = B_FALSE; + break; + case DMU_OT_SPACE_MAP: + if (!(flags & ZOR_FLAG_SPACE_MAP)) + match = B_FALSE; + break; + default: + if (strcmp(zdb_ot_name(obj_type), "zap") == 0) { + if (!(flags & ZOR_FLAG_ZAP)) + match = B_FALSE; + break; + } + + /* + * If all bits except some of the supported flags are + * set, the user combined the all-types flag (A) with + * a negated flag to exclude some types (e.g. A-f to + * show all object types except plain files). + */ + if ((flags | ZOR_SUPPORTED_FLAGS) != ZOR_FLAG_ALL_TYPES) + match = B_FALSE; + + break; + } + + return (match); +} + +static void +dump_object(objset_t *os, uint64_t object, int verbosity, + boolean_t *print_header, uint64_t *dnode_slots_used, uint64_t flags) +{ + dmu_buf_t *db = NULL; + dmu_object_info_t doi; + dnode_t *dn; + boolean_t dnode_held = B_FALSE; + void *bonus = NULL; + size_t bsize = 0; + char iblk[32], dblk[32], lsize[32], asize[32], fill[32], dnsize[32]; + char bonus_size[32]; + char aux[50]; + int error; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (iblk) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (dblk) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (bonus_size) >= NN_NUMBUF_SZ); + + if (*print_header) { + (void) printf("\n%10s %3s %5s %5s %5s %6s %5s %6s %s\n", + "Object", "lvl", "iblk", "dblk", "dsize", "dnsize", + "lsize", "%full", "type"); + *print_header = 0; + } + + if (object == 0) { + dn = DMU_META_DNODE(os); + dmu_object_info_from_dnode(dn, &doi); + } else { + /* + * Encrypted datasets will have sensitive bonus buffers + * encrypted. Therefore we cannot hold the bonus buffer and + * must hold the dnode itself instead. + */ + error = dmu_object_info(os, object, &doi); + if (error) + fatal("dmu_object_info() failed, errno %u", error); + + if (os->os_encrypted && + DMU_OT_IS_ENCRYPTED(doi.doi_bonus_type)) { + error = dnode_hold(os, object, FTAG, &dn); + if (error) + fatal("dnode_hold() failed, errno %u", error); + dnode_held = B_TRUE; + } else { + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) + fatal("dmu_bonus_hold(%llu) failed, errno %u", + object, error); + bonus = db->db_data; + bsize = db->db_size; + dn = DB_DNODE((dmu_buf_impl_t *)db); + } + } + + /* + * Default to showing all object types if no flags were specified. 
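+ * Otherwise, filter the object against the requested type flags + * using match_object_type().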
+ */ + if (flags != 0 && flags != ZOR_FLAG_ALL_TYPES && + !match_object_type(doi.doi_type, flags)) + goto out; + + if (dnode_slots_used) + *dnode_slots_used = doi.doi_dnodesize / DNODE_MIN_SIZE; + + zdb_nicenum(doi.doi_metadata_block_size, iblk, sizeof (iblk)); + zdb_nicenum(doi.doi_data_block_size, dblk, sizeof (dblk)); + zdb_nicenum(doi.doi_max_offset, lsize, sizeof (lsize)); + zdb_nicenum(doi.doi_physical_blocks_512 << 9, asize, sizeof (asize)); + zdb_nicenum(doi.doi_bonus_size, bonus_size, sizeof (bonus_size)); + zdb_nicenum(doi.doi_dnodesize, dnsize, sizeof (dnsize)); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * + doi.doi_data_block_size / (object == 0 ? DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); + + aux[0] = '\0'; + + if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (K=%s)", ZDB_CHECKSUM_NAME(doi.doi_checksum)); + } + + if (doi.doi_compress == ZIO_COMPRESS_INHERIT && + ZIO_COMPRESS_HASLEVEL(os->os_compress) && verbosity >= 6) { + const char *compname = NULL; + if (zfs_prop_index_to_string(ZFS_PROP_COMPRESSION, + ZIO_COMPRESS_RAW(os->os_compress, os->os_complevel), + &compname) == 0) { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), " (Z=inherit=%s)", + compname); + } else { + (void) snprintf(aux + strlen(aux), + sizeof (aux) - strlen(aux), + " (Z=inherit=%s-unknown)", + ZDB_COMPRESS_NAME(os->os_compress)); + } + } else if (doi.doi_compress == ZIO_COMPRESS_INHERIT && verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=inherit=%s)", ZDB_COMPRESS_NAME(os->os_compress)); + } else if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { + (void) snprintf(aux + strlen(aux), sizeof (aux) - strlen(aux), + " (Z=%s)", ZDB_COMPRESS_NAME(doi.doi_compress)); + } + + (void) printf("%10lld %3u %5s %5s %5s %6s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + asize, dnsize, lsize, fill, zdb_ot_name(doi.doi_type), aux); + + if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { + (void) printf("%10s %3s %5s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", "", bonus_size, "bonus", + zdb_ot_name(doi.doi_bonus_type)); + } + + if (verbosity >= 4) { + (void) printf("\tdnode flags: %s%s%s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? + "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) ? + "USEROBJUSED_ACCOUNTED " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? + "SPILL_BLKPTR" : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + + if (!dnode_held) { + object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, + object, bonus, bsize); + } else { + (void) printf("\t\t(bonus encrypted)\n"); + } + + if (!os->os_encrypted || !DMU_OT_IS_ENCRYPTED(doi.doi_type)) { + object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, + NULL, 0); + } else { + (void) printf("\t\t(object encrypted)\n"); + } + + *print_header = B_TRUE; + } + + if (verbosity >= 5) + dump_indirect(dn); + + if (verbosity >= 5) { + /* + * Report the list of segments that comprise the object. 
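+ * Each segment starts at the next allocated offset located by + * dnode_next_offset() and ends at the hole located by a second + * walk with DNODE_FIND_HOLE.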
+ */ + uint64_t start = 0; + uint64_t end; + uint64_t blkfill = 1; + int minlvl = 1; + + if (dn->dn_type == DMU_OT_DNODE) { + minlvl = 0; + blkfill = DNODES_PER_BLOCK; + } + + for (;;) { + char segsize[32]; + /* make sure nicenum has enough space */ + CTASSERT(sizeof (segsize) >= NN_NUMBUF_SZ); + error = dnode_next_offset(dn, + 0, &start, minlvl, blkfill, 0); + if (error) + break; + end = start; + error = dnode_next_offset(dn, + DNODE_FIND_HOLE, &end, minlvl, blkfill, 0); + zdb_nicenum(end - start, segsize, sizeof (segsize)); + (void) printf("\t\tsegment [%016llx, %016llx)" + " size %5s\n", (u_longlong_t)start, + (u_longlong_t)end, segsize); + if (error) + break; + start = end; + } + } + +out: + if (db != NULL) + dmu_buf_rele(db, FTAG); + if (dnode_held) + dnode_rele(dn, FTAG); +} + +static void +count_dir_mos_objects(dsl_dir_t *dd) +{ + mos_obj_refd(dd->dd_object); + mos_obj_refd(dsl_dir_phys(dd)->dd_child_dir_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_deleg_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_props_zapobj); + mos_obj_refd(dsl_dir_phys(dd)->dd_clones); + + /* + * The dd_crypto_obj can be referenced by multiple dsl_dir's. + * Ignore the references after the first one. + */ + mos_obj_refd_multiple(dd->dd_crypto_obj); +} + +static void +count_ds_mos_objects(dsl_dataset_t *ds) +{ + mos_obj_refd(ds->ds_object); + mos_obj_refd(dsl_dataset_phys(ds)->ds_next_clones_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_props_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_userrefs_obj); + mos_obj_refd(dsl_dataset_phys(ds)->ds_snapnames_zapobj); + mos_obj_refd(ds->ds_bookmarks_obj); + + if (!dsl_dataset_is_snapshot(ds)) { + count_dir_mos_objects(ds->ds_dir); + } +} + +static const char *objset_types[DMU_OST_NUMTYPES] = { + "NONE", "META", "ZPL", "ZVOL", "OTHER", "ANY" }; + +/* + * Parse a string denoting a range of object IDs of the form + * <start>[:<end>[:flags]], and store the results in zor. + * Return 0 on success. On error, return 1 and update the msg + * pointer to point to a descriptive error message.
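+ * Flags are single characters mapped through flagbits[]; prefixing a + * flag with '-' negates it (e.g. "A-f" selects all object types + * except plain files).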
+ */ +static int +parse_object_range(char *range, zopt_object_range_t *zor, char **msg) +{ + uint64_t flags = 0; + char *p, *s, *dup, *flagstr; + size_t len; + int i; + int rc = 0; + + if (strchr(range, ':') == NULL) { + zor->zor_obj_start = strtoull(range, &p, 0); + if (*p != '\0') { + *msg = "Invalid characters in object ID"; + rc = 1; + } + zor->zor_obj_end = zor->zor_obj_start; + return (rc); + } + + if (strchr(range, ':') == range) { + *msg = "Invalid leading colon"; + rc = 1; + return (rc); + } + + len = strlen(range); + if (range[len - 1] == ':') { + *msg = "Invalid trailing colon"; + rc = 1; + return (rc); + } + + dup = strdup(range); + s = strtok(dup, ":"); + zor->zor_obj_start = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in start object ID"; + rc = 1; + goto out; + } + + s = strtok(NULL, ":"); + zor->zor_obj_end = strtoull(s, &p, 0); + + if (*p != '\0') { + *msg = "Invalid characters in end object ID"; + rc = 1; + goto out; + } + + if (zor->zor_obj_start > zor->zor_obj_end) { + *msg = "Start object ID may not exceed end object ID"; + rc = 1; + goto out; + } + + s = strtok(NULL, ":"); + if (s == NULL) { + zor->zor_flags = ZOR_FLAG_ALL_TYPES; + goto out; + } else if (strtok(NULL, ":") != NULL) { + *msg = "Invalid colon-delimited field after flags"; + rc = 1; + goto out; + } + + flagstr = s; + for (i = 0; flagstr[i]; i++) { + int bit; + boolean_t negation = (flagstr[i] == '-'); + + if (negation) { + i++; + if (flagstr[i] == '\0') { + *msg = "Invalid trailing negation operator"; + rc = 1; + goto out; + } + } + bit = flagbits[(uchar_t)flagstr[i]]; + if (bit == 0) { + *msg = "Invalid flag"; + rc = 1; + goto out; + } + if (negation) + flags &= ~bit; + else + flags |= bit; + } + zor->zor_flags = flags; + +out: + free(dup); + return (rc); +} + +static void +dump_objset(objset_t *os) +{ + dmu_objset_stats_t dds = { 0 }; + uint64_t object, object_count; + uint64_t refdbytes, usedobjs, scratch; + char numbuf[32]; + char blkbuf[BP_SPRINTF_LEN + 20]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; + const char *type = "UNKNOWN"; + int verbosity = dump_opt['d']; + boolean_t print_header; + unsigned i; + int error; + uint64_t total_slots_used = 0; + uint64_t max_slot_used = 0; + uint64_t dnode_slots; + uint64_t obj_start; + uint64_t obj_end; + uint64_t flags; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (numbuf) >= NN_NUMBUF_SZ); + + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + + print_header = B_TRUE; + + if (dds.dds_type < DMU_OST_NUMTYPES) + type = objset_types[dds.dds_type]; + + if (dds.dds_type == DMU_OST_META) { + dds.dds_creation_txg = TXG_INITIAL; + usedobjs = BP_GET_FILL(os->os_rootbp); + refdbytes = dsl_dir_phys(os->os_spa->spa_dsl_pool->dp_mos_dir)-> + dd_used_bytes; + } else { + dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); + } + + ASSERT3U(usedobjs, ==, BP_GET_FILL(os->os_rootbp)); + + zdb_nicenum(refdbytes, numbuf, sizeof (numbuf)); + + if (verbosity >= 4) { + (void) snprintf(blkbuf, sizeof (blkbuf), ", rootbp "); + (void) snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), os->os_rootbp); + } else { + blkbuf[0] = '\0'; + } + + dmu_objset_name(os, osname); + + (void) printf("Dataset %s [%s], ID %llu, cr_txg %llu, " + "%s, %llu objects%s%s\n", + osname, type, (u_longlong_t)dmu_objset_id(os), + (u_longlong_t)dds.dds_creation_txg, + numbuf, (u_longlong_t)usedobjs, blkbuf, + (dds.dds_inconsistent) ? 
" (inconsistent)" : ""); + + for (i = 0; i < zopt_object_args; i++) { + obj_start = zopt_object_ranges[i].zor_obj_start; + obj_end = zopt_object_ranges[i].zor_obj_end; + flags = zopt_object_ranges[i].zor_flags; + + object = obj_start; + if (object == 0 || obj_start == obj_end) + dump_object(os, object, verbosity, &print_header, NULL, + flags); + else + object--; + + while ((dmu_object_next(os, &object, B_FALSE, 0) == 0) && + object <= obj_end) { + dump_object(os, object, verbosity, &print_header, NULL, + flags); + } + } + + if (zopt_object_args > 0) { + (void) printf("\n"); + return; + } + + if (dump_opt['i'] != 0 || verbosity >= 2) + dump_intent_log(dmu_objset_zil(os)); + + if (dmu_objset_ds(os) != NULL) { + dsl_dataset_t *ds = dmu_objset_ds(os); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + dump_blkptr_list(&ds->ds_dir->dd_livelist, "Livelist"); + if (verify_dd_livelist(os) != 0) + fatal("livelist is incorrect"); + } + + if (dsl_dataset_remap_deadlist_exists(ds)) { + (void) printf("ds_remap_deadlist:\n"); + dump_blkptr_list(&ds->ds_remap_deadlist, "Deadlist"); + } + count_ds_mos_objects(ds); + } + + if (dmu_objset_ds(os) != NULL) + dump_bookmarks(os, verbosity); + + if (verbosity < 2) + return; + + if (BP_IS_HOLE(os->os_rootbp)) + return; + + dump_object(os, 0, verbosity, &print_header, NULL, 0); + object_count = 0; + if (DMU_USERUSED_DNODE(os) != NULL && + DMU_USERUSED_DNODE(os)->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header, + NULL, 0); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header, + NULL, 0); + } + + if (DMU_PROJECTUSED_DNODE(os) != NULL && + DMU_PROJECTUSED_DNODE(os)->dn_type != 0) + dump_object(os, DMU_PROJECTUSED_OBJECT, verbosity, + &print_header, NULL, 0); + + object = 0; + while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { + dump_object(os, object, verbosity, &print_header, &dnode_slots, + 0); + object_count++; + total_slots_used += dnode_slots; + max_slot_used = object + dnode_slots - 1; + } + + (void) printf("\n"); + + (void) printf(" Dnode slots:\n"); + (void) printf("\tTotal used: %10llu\n", + (u_longlong_t)total_slots_used); + (void) printf("\tMax used: %10llu\n", + (u_longlong_t)max_slot_used); + (void) printf("\tPercent empty: %10lf\n", + (double)(max_slot_used - total_slots_used)*100 / + (double)max_slot_used); + (void) printf("\n"); + + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } + + ASSERT3U(object_count, ==, usedobjs); + + if (leaked_objects != 0) { + (void) printf("%d potentially leaked objects detected\n", + leaked_objects); + leaked_objects = 0; + } +} + +static void +dump_uberblock(uberblock_t *ub, const char *header, const char *footer) +{ + time_t timestamp = ub->ub_timestamp; + + (void) printf("%s", header ? 
header : ""); + (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); + (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); + (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); + (void) printf("\tguid_sum = %llu\n", (u_longlong_t)ub->ub_guid_sum); + (void) printf("\ttimestamp = %llu UTC = %s", + (u_longlong_t)ub->ub_timestamp, asctime(localtime(&timestamp))); + + (void) printf("\tmmp_magic = %016llx\n", + (u_longlong_t)ub->ub_mmp_magic); + if (MMP_VALID(ub)) { + (void) printf("\tmmp_delay = %0llu\n", + (u_longlong_t)ub->ub_mmp_delay); + if (MMP_SEQ_VALID(ub)) + (void) printf("\tmmp_seq = %u\n", + (unsigned int) MMP_SEQ(ub)); + if (MMP_FAIL_INT_VALID(ub)) + (void) printf("\tmmp_fail = %u\n", + (unsigned int) MMP_FAIL_INT(ub)); + if (MMP_INTERVAL_VALID(ub)) + (void) printf("\tmmp_write = %u\n", + (unsigned int) MMP_INTERVAL(ub)); + /* After MMP_* to make summarize_uberblock_mmp cleaner */ + (void) printf("\tmmp_valid = %x\n", + (unsigned int) ub->ub_mmp_config & 0xFF); + } + + if (dump_opt['u'] >= 4) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), &ub->ub_rootbp); + (void) printf("\trootbp = %s\n", blkbuf); + } + (void) printf("\tcheckpoint_txg = %llu\n", + (u_longlong_t)ub->ub_checkpoint_txg); + (void) printf("%s", footer ? footer : ""); +} + +static void +dump_config(spa_t *spa) +{ + dmu_buf_t *db; + size_t nvsize = 0; + int error = 0; + + + error = dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db); + + if (error == 0) { + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + (void) printf("\nMOS Configuration:\n"); + dump_packed_nvlist(spa->spa_meta_objset, + spa->spa_config_object, (void *)&nvsize, 1); + } else { + (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", + (u_longlong_t)spa->spa_config_object, error); + } +} + +static void +dump_cachefile(const char *cachefile) +{ + int fd; + struct stat64 statbuf; + char *buf; + nvlist_t *config; + + if ((fd = open64(cachefile, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if (fstat64(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", cachefile, + strerror(errno)); + exit(1); + } + + if ((buf = malloc(statbuf.st_size)) == NULL) { + (void) fprintf(stderr, "failed to allocate %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { + (void) fprintf(stderr, "failed to read %llu bytes\n", + (u_longlong_t)statbuf.st_size); + exit(1); + } + + (void) close(fd); + + if (nvlist_unpack(buf, statbuf.st_size, &config, 0) != 0) { + (void) fprintf(stderr, "failed to unpack nvlist\n"); + exit(1); + } + + free(buf); + + dump_nvlist(config, 0); + + nvlist_free(config); +} + +/* + * ZFS label nvlist stats + */ +typedef struct zdb_nvl_stats { + int zns_list_count; + int zns_leaf_count; + size_t zns_leaf_largest; + size_t zns_leaf_total; + nvlist_t *zns_string; + nvlist_t *zns_uint64; + nvlist_t *zns_boolean; +} zdb_nvl_stats_t; + +static void +collect_nvlist_stats(nvlist_t *nvl, zdb_nvl_stats_t *stats) +{ + nvlist_t *list, **array; + nvpair_t *nvp = NULL; + char *name; + uint_t i, items; + + stats->zns_list_count++; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + name = nvpair_name(nvp); + + switch (nvpair_type(nvp)) { + case DATA_TYPE_STRING: + fnvlist_add_string(stats->zns_string, name, + fnvpair_value_string(nvp)); + break; + case DATA_TYPE_UINT64: + fnvlist_add_uint64(stats->zns_uint64, name, +
fnvpair_value_uint64(nvp)); + break; + case DATA_TYPE_BOOLEAN: + fnvlist_add_boolean(stats->zns_boolean, name); + break; + case DATA_TYPE_NVLIST: + if (nvpair_value_nvlist(nvp, &list) == 0) + collect_nvlist_stats(list, stats); + break; + case DATA_TYPE_NVLIST_ARRAY: + if (nvpair_value_nvlist_array(nvp, &array, &items) != 0) + break; + + for (i = 0; i < items; i++) { + collect_nvlist_stats(array[i], stats); + + /* collect stats on leaf vdev */ + if (strcmp(name, "children") == 0) { + size_t size; + + (void) nvlist_size(array[i], &size, + NV_ENCODE_XDR); + stats->zns_leaf_total += size; + if (size > stats->zns_leaf_largest) + stats->zns_leaf_largest = size; + stats->zns_leaf_count++; + } + } + break; + default: + (void) printf("skip type %d!\n", (int)nvpair_type(nvp)); + } + } +} + +static void +dump_nvlist_stats(nvlist_t *nvl, size_t cap) +{ + zdb_nvl_stats_t stats = { 0 }; + size_t size, sum = 0, total; + size_t noise; + + /* requires nvlist with non-unique names for stat collection */ + VERIFY0(nvlist_alloc(&stats.zns_string, 0, 0)); + VERIFY0(nvlist_alloc(&stats.zns_uint64, 0, 0)); + VERIFY0(nvlist_alloc(&stats.zns_boolean, 0, 0)); + VERIFY0(nvlist_size(stats.zns_boolean, &noise, NV_ENCODE_XDR)); + + (void) printf("\n\nZFS Label NVList Config Stats:\n"); + + VERIFY0(nvlist_size(nvl, &total, NV_ENCODE_XDR)); + (void) printf(" %d bytes used, %d bytes free (using %4.1f%%)\n\n", + (int)total, (int)(cap - total), 100.0 * total / cap); + + collect_nvlist_stats(nvl, &stats); + + VERIFY0(nvlist_size(stats.zns_uint64, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "integers:", + (int)fnvlist_num_pairs(stats.zns_uint64), + (int)size, 100.0 * size / total); + + VERIFY0(nvlist_size(stats.zns_string, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "strings:", + (int)fnvlist_num_pairs(stats.zns_string), + (int)size, 100.0 * size / total); + + VERIFY0(nvlist_size(stats.zns_boolean, &size, NV_ENCODE_XDR)); + size -= noise; + sum += size; + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n", "booleans:", + (int)fnvlist_num_pairs(stats.zns_boolean), + (int)size, 100.0 * size / total); + + size = total - sum; /* treat remainder as nvlist overhead */ + (void) printf("%12s %4d %6d bytes (%5.2f%%)\n\n", "nvlists:", + stats.zns_list_count, (int)size, 100.0 * size / total); + + if (stats.zns_leaf_count > 0) { + size_t average = stats.zns_leaf_total / stats.zns_leaf_count; + + (void) printf("%12s %4d %6d bytes average\n", "leaf vdevs:", + stats.zns_leaf_count, (int)average); + (void) printf("%24d bytes largest\n", + (int)stats.zns_leaf_largest); + + if (dump_opt['l'] >= 3 && average > 0) + (void) printf(" space for %d additional leaf vdevs\n", + (int)((cap - total) / average)); + } + (void) printf("\n"); + + nvlist_free(stats.zns_string); + nvlist_free(stats.zns_uint64); + nvlist_free(stats.zns_boolean); +} + +typedef struct cksum_record { + zio_cksum_t cksum; + boolean_t labels[VDEV_LABELS]; + avl_node_t link; +} cksum_record_t; + +static int +cksum_record_compare(const void *x1, const void *x2) +{ + const cksum_record_t *l = (cksum_record_t *)x1; + const cksum_record_t *r = (cksum_record_t *)x2; + int arraysize = ARRAY_SIZE(l->cksum.zc_word); + int difference; + + for (int i = 0; i < arraysize; i++) { + difference = TREE_CMP(l->cksum.zc_word[i], r->cksum.zc_word[i]); + if (difference) + break; + } + + return (difference); +} + +static cksum_record_t * +cksum_record_alloc(zio_cksum_t *cksum, int l) 
+{ + cksum_record_t *rec; + + rec = umem_zalloc(sizeof (*rec), UMEM_NOFAIL); + rec->cksum = *cksum; + rec->labels[l] = B_TRUE; + + return (rec); +} + +static cksum_record_t * +cksum_record_lookup(avl_tree_t *tree, zio_cksum_t *cksum) +{ + cksum_record_t lookup = { .cksum = *cksum }; + avl_index_t where; + + return (avl_find(tree, &lookup, &where)); +} + +static cksum_record_t * +cksum_record_insert(avl_tree_t *tree, zio_cksum_t *cksum, int l) +{ + cksum_record_t *rec; + + rec = cksum_record_lookup(tree, cksum); + if (rec) { + rec->labels[l] = B_TRUE; + } else { + rec = cksum_record_alloc(cksum, l); + avl_add(tree, rec); + } + + return (rec); +} + +static int +first_label(cksum_record_t *rec) +{ + for (int i = 0; i < VDEV_LABELS; i++) + if (rec->labels[i]) + return (i); + + return (-1); +} + +static void +print_label_numbers(char *prefix, cksum_record_t *rec) +{ + printf("%s", prefix); + for (int i = 0; i < VDEV_LABELS; i++) + if (rec->labels[i] == B_TRUE) + printf("%d ", i); + printf("\n"); +} + +#define MAX_UBERBLOCK_COUNT (VDEV_UBERBLOCK_RING >> UBERBLOCK_SHIFT) + +typedef struct zdb_label { + vdev_label_t label; + nvlist_t *config_nv; + cksum_record_t *config; + cksum_record_t *uberblocks[MAX_UBERBLOCK_COUNT]; + boolean_t header_printed; + boolean_t read_failed; +} zdb_label_t; + +static void +print_label_header(zdb_label_t *label, int l) +{ + + if (dump_opt['q']) + return; + + if (label->header_printed == B_TRUE) + return; + + (void) printf("------------------------------------\n"); + (void) printf("LABEL %d\n", l); + (void) printf("------------------------------------\n"); + + label->header_printed = B_TRUE; +} + +static void +print_l2arc_header(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device header\n"); + (void) printf("------------------------------------\n"); +} + +static void +print_l2arc_log_blocks(void) +{ + (void) printf("------------------------------------\n"); + (void) printf("L2ARC device log blocks\n"); + (void) printf("------------------------------------\n"); +} + +static void +dump_l2arc_log_entries(uint64_t log_entries, + l2arc_log_ent_phys_t *le, uint64_t i) +{ + for (int j = 0; j < log_entries; j++) { + dva_t dva = le[j].le_dva; + (void) printf("lb[%4llu]\tle[%4d]\tDVA asize: %llu, " + "vdev: %llu, offset: %llu\n", + (u_longlong_t)i, j + 1, + (u_longlong_t)DVA_GET_ASIZE(&dva), + (u_longlong_t)DVA_GET_VDEV(&dva), + (u_longlong_t)DVA_GET_OFFSET(&dva)); + (void) printf("|\t\t\t\tbirth: %llu\n", + (u_longlong_t)le[j].le_birth); + (void) printf("|\t\t\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tpsize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcompr: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&le[j])->le_prop)); + (void) printf("|\t\t\t\tcomplevel: %llu\n", + (u_longlong_t)(&le[j])->le_complevel); + (void) printf("|\t\t\t\ttype: %llu\n", + (u_longlong_t)L2BLK_GET_TYPE((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprotected: %llu\n", + (u_longlong_t)L2BLK_GET_PROTECTED((&le[j])->le_prop)); + (void) printf("|\t\t\t\tprefetch: %llu\n", + (u_longlong_t)L2BLK_GET_PREFETCH((&le[j])->le_prop)); + (void) printf("|\t\t\t\taddress: %llu\n", + (u_longlong_t)le[j].le_daddr); + (void) printf("|\n"); + } + (void) printf("\n"); +} + +static void +dump_l2arc_log_blkptr(l2arc_log_blkptr_t lbps) +{ + (void) printf("|\t\tdaddr: %llu\n", (u_longlong_t)lbps.lbp_daddr); + (void) printf("|\t\tpayload_asize: %llu\n", + 
(u_longlong_t)lbps.lbp_payload_asize); + (void) printf("|\t\tpayload_start: %llu\n", + (u_longlong_t)lbps.lbp_payload_start); + (void) printf("|\t\tlsize: %llu\n", + (u_longlong_t)L2BLK_GET_LSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tasize: %llu\n", + (u_longlong_t)L2BLK_GET_PSIZE((&lbps)->lbp_prop)); + (void) printf("|\t\tcompralgo: %llu\n", + (u_longlong_t)L2BLK_GET_COMPRESS((&lbps)->lbp_prop)); + (void) printf("|\t\tcksumalgo: %llu\n", + (u_longlong_t)L2BLK_GET_CHECKSUM((&lbps)->lbp_prop)); + (void) printf("|\n\n"); +} + +static void +dump_l2arc_log_blocks(int fd, l2arc_dev_hdr_phys_t l2dhdr, + l2arc_dev_hdr_phys_t *rebuild) +{ + l2arc_log_blk_phys_t this_lb; + uint64_t asize; + l2arc_log_blkptr_t lbps[2]; + abd_t *abd; + zio_cksum_t cksum; + int failed = 0; + l2arc_dev_t dev; + + if (!dump_opt['q']) + print_l2arc_log_blocks(); + bcopy((&l2dhdr)->dh_start_lbps, lbps, sizeof (lbps)); + + dev.l2ad_evict = l2dhdr.dh_evict; + dev.l2ad_start = l2dhdr.dh_start; + dev.l2ad_end = l2dhdr.dh_end; + + if (l2dhdr.dh_start_lbps[0].lbp_daddr == 0) { + /* no log blocks to read */ + if (!dump_opt['q']) { + (void) printf("No log blocks to read\n"); + (void) printf("\n"); + } + return; + } else { + dev.l2ad_hand = lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + } + + dev.l2ad_first = !!(l2dhdr.dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + for (;;) { + if (!l2arc_log_blkptr_valid(&dev, &lbps[0])) + break; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop); + if (pread64(fd, &this_lb, asize, lbps[0].lbp_daddr) != asize) { + if (!dump_opt['q']) { + (void) printf("Error while reading next log " + "block\n\n"); + } + break; + } + + fletcher_4_native_varsize(&this_lb, asize, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, lbps[0].lbp_cksum)) { + failed++; + if (!dump_opt['q']) { + (void) printf("Invalid cksum\n"); + dump_l2arc_log_blkptr(lbps[0]); + } + break; + } + + switch (L2BLK_GET_COMPRESS((&lbps[0])->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + default: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, &this_lb, 0, asize); + zio_decompress_data(L2BLK_GET_COMPRESS( + (&lbps[0])->lbp_prop), abd, &this_lb, + asize, sizeof (this_lb), NULL); + abd_free(abd); + break; + } + + if (this_lb.lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(&this_lb, sizeof (this_lb)); + if (this_lb.lb_magic != L2ARC_LOG_BLK_MAGIC) { + if (!dump_opt['q']) + (void) printf("Invalid log block magic\n\n"); + break; + } + + rebuild->dh_lb_count++; + rebuild->dh_lb_asize += asize; + if (dump_opt['l'] > 1 && !dump_opt['q']) { + (void) printf("lb[%4llu]\tmagic: %llu\n", + (u_longlong_t)rebuild->dh_lb_count, + (u_longlong_t)this_lb.lb_magic); + dump_l2arc_log_blkptr(lbps[0]); + } + + if (dump_opt['l'] > 2 && !dump_opt['q']) + dump_l2arc_log_entries(l2dhdr.dh_log_entries, + this_lb.lb_entries, + rebuild->dh_lb_count); + + if (l2arc_range_check_overlap(lbps[1].lbp_payload_start, + lbps[0].lbp_payload_start, dev.l2ad_evict) && + !dev.l2ad_first) + break; + + lbps[0] = lbps[1]; + lbps[1] = this_lb.lb_prev_lbp; + } + + if (!dump_opt['q']) { + (void) printf("log_blk_count:\t %llu with valid cksum\n", + (u_longlong_t)rebuild->dh_lb_count); + (void) printf("\t\t %d with invalid cksum\n", failed); + (void) printf("log_blk_asize:\t %llu\n\n", + (u_longlong_t)rebuild->dh_lb_asize); + } +} + +static int +dump_l2arc_header(int fd) +{ + l2arc_dev_hdr_phys_t l2dhdr, rebuild; + int error = B_FALSE; + + bzero(&l2dhdr, sizeof (l2dhdr)); + 
bzero(&rebuild, sizeof (rebuild)); + + if (pread64(fd, &l2dhdr, sizeof (l2dhdr), + VDEV_LABEL_START_SIZE) != sizeof (l2dhdr)) { + error = B_TRUE; + } else { + if (l2dhdr.dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(&l2dhdr, sizeof (l2dhdr)); + + if (l2dhdr.dh_magic != L2ARC_DEV_HDR_MAGIC) + error = B_TRUE; + } + + if (error) { + (void) printf("L2ARC device header not found\n\n"); + /* Do not return an error here for backward compatibility */ + return (0); + } else if (!dump_opt['q']) { + print_l2arc_header(); + + (void) printf(" magic: %llu\n", + (u_longlong_t)l2dhdr.dh_magic); + (void) printf(" version: %llu\n", + (u_longlong_t)l2dhdr.dh_version); + (void) printf(" pool_guid: %llu\n", + (u_longlong_t)l2dhdr.dh_spa_guid); + (void) printf(" flags: %llu\n", + (u_longlong_t)l2dhdr.dh_flags); + (void) printf(" start_lbps[0]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[0].lbp_daddr); + (void) printf(" start_lbps[1]: %llu\n", + (u_longlong_t) + l2dhdr.dh_start_lbps[1].lbp_daddr); + (void) printf(" log_blk_ent: %llu\n", + (u_longlong_t)l2dhdr.dh_log_entries); + (void) printf(" start: %llu\n", + (u_longlong_t)l2dhdr.dh_start); + (void) printf(" end: %llu\n", + (u_longlong_t)l2dhdr.dh_end); + (void) printf(" evict: %llu\n", + (u_longlong_t)l2dhdr.dh_evict); + (void) printf(" lb_asize_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_asize); + (void) printf(" lb_count_refcount: %llu\n", + (u_longlong_t)l2dhdr.dh_lb_count); + (void) printf(" trim_action_time: %llu\n", + (u_longlong_t)l2dhdr.dh_trim_action_time); + (void) printf(" trim_state: %llu\n\n", + (u_longlong_t)l2dhdr.dh_trim_state); + } + + dump_l2arc_log_blocks(fd, l2dhdr, &rebuild); + /* + * The total aligned size of log blocks and the number of log blocks + * reported in the header of the device may be less than what zdb + * reports by dump_l2arc_log_blocks() which emulates l2arc_rebuild(). + * This happens because dump_l2arc_log_blocks() lacks the memory + * pressure valve that l2arc_rebuild() has. Thus, if we are on a system + * with low memory, l2arc_rebuild will exit prematurely and dh_lb_asize + * and dh_lb_count will be lower to begin with than what exists on the + * device. This is normal and zdb should not exit with an error. The + * opposite case should never happen though, the values reported in the + * header should never be higher than what dump_l2arc_log_blocks() and + * l2arc_rebuild() report. If this happens there is a leak in the + * accounting of log blocks. 
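+ *
+ * In short, the check below enforces:
+ *
+ *	l2dhdr.dh_lb_asize <= rebuild.dh_lb_asize
+ *	l2dhdr.dh_lb_count <= rebuild.dh_lb_count
+ *
+ * and only a header that over-reports makes this function fail.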
+ */ + if (l2dhdr.dh_lb_asize > rebuild.dh_lb_asize || + l2dhdr.dh_lb_count > rebuild.dh_lb_count) + return (1); + + return (0); +} + +static void +dump_config_from_label(zdb_label_t *label, size_t buflen, int l) +{ + if (dump_opt['q']) + return; + + if ((dump_opt['l'] < 3) && (first_label(label->config) != l)) + return; + + print_label_header(label, l); + dump_nvlist(label->config_nv, 4); + print_label_numbers(" labels = ", label->config); + + if (dump_opt['l'] >= 2) + dump_nvlist_stats(label->config_nv, buflen); +} + +#define ZDB_MAX_UB_HEADER_SIZE 32 + +static void +dump_label_uberblocks(zdb_label_t *label, uint64_t ashift, int label_num) +{ + + vdev_t vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; + + vd.vdev_ashift = ashift; + vd.vdev_top = &vd; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); + uberblock_t *ub = (void *)((char *)&label->label + uoff); + cksum_record_t *rec = label->uberblocks[i]; + + if (rec == NULL) { + if (dump_opt['u'] >= 2) { + print_label_header(label, label_num); + (void) printf(" Uberblock[%d] invalid\n", i); + } + continue; + } + + if ((dump_opt['u'] < 3) && (first_label(rec) != label_num)) + continue; + + if ((dump_opt['u'] < 4) && + (ub->ub_mmp_magic == MMP_MAGIC) && ub->ub_mmp_delay && + (i >= VDEV_UBERBLOCK_COUNT(&vd) - MMP_BLOCKS_PER_LABEL)) + continue; + + print_label_header(label, label_num); + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + " Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + print_label_numbers(" labels = ", rec); + } +} + +static char curpath[PATH_MAX]; + +/* + * Iterate through the path components, recursively passing + * current one's obj and remaining path until we find the obj + * for the last one. + */ +static int +dump_path_impl(objset_t *os, uint64_t obj, char *name) +{ + int err; + boolean_t header = B_TRUE; + uint64_t child_obj; + char *s; + dmu_buf_t *db; + dmu_object_info_t doi; + + if ((s = strchr(name, '/')) != NULL) + *s = '\0'; + err = zap_lookup(os, obj, name, 8, 1, &child_obj); + + (void) strlcat(curpath, name, sizeof (curpath)); + + if (err != 0) { + (void) fprintf(stderr, "failed to lookup %s: %s\n", + curpath, strerror(err)); + return (err); + } + + child_obj = ZFS_DIRENT_OBJ(child_obj); + err = sa_buf_hold(os, child_obj, FTAG, &db); + if (err != 0) { + (void) fprintf(stderr, + "failed to get SA dbuf for obj %llu: %s\n", + (u_longlong_t)child_obj, strerror(err)); + return (EINVAL); + } + dmu_object_info_from_db(db, &doi); + sa_buf_rele(db, FTAG); + + if (doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) { + (void) fprintf(stderr, "invalid bonus type %d for obj %llu\n", + doi.doi_bonus_type, (u_longlong_t)child_obj); + return (EINVAL); + } + + if (dump_opt['v'] > 6) { + (void) printf("obj=%llu %s type=%d bonustype=%d\n", + (u_longlong_t)child_obj, curpath, doi.doi_type, + doi.doi_bonus_type); + } + + (void) strlcat(curpath, "/", sizeof (curpath)); + + switch (doi.doi_type) { + case DMU_OT_DIRECTORY_CONTENTS: + if (s != NULL && *(s + 1) != '\0') + return (dump_path_impl(os, child_obj, s + 1)); + /*FALLTHROUGH*/ + case DMU_OT_PLAIN_FILE_CONTENTS: + dump_object(os, child_obj, dump_opt['v'], &header, NULL, 0); + return (0); + default: + (void) fprintf(stderr, "object %llu has non-file/directory " + "type %d\n", (u_longlong_t)obj, doi.doi_type); + break; + } + + return (EINVAL); +} + +/* + * Dump the blocks for the object specified by path inside the dataset. 
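+ *
+ * A hedged usage sketch (dataset and path names are illustrative):
+ *
+ *	char ds[] = "tank/fs", path[] = "dir/file";
+ *	if (dump_path(ds, path) != 0)
+ *		(void) fprintf(stderr, "dump_path failed\n");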
+ */ +static int +dump_path(char *ds, char *path) +{ + int err; + objset_t *os; + uint64_t root_obj; + + err = open_objset(ds, FTAG, &os); + if (err != 0) + return (err); + + err = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, &root_obj); + if (err != 0) { + (void) fprintf(stderr, "can't lookup root znode: %s\n", + strerror(err)); + close_objset(os, FTAG); + return (EINVAL); + } + + (void) snprintf(curpath, sizeof (curpath), "dataset=%s path=/", ds); + + err = dump_path_impl(os, root_obj, path); + + close_objset(os, FTAG); + return (err); +} + +static int +dump_label(const char *dev) +{ + char path[MAXPATHLEN]; + zdb_label_t labels[VDEV_LABELS]; + uint64_t psize, ashift, l2cache; + struct stat64 statbuf; + boolean_t config_found = B_FALSE; + boolean_t error = B_FALSE; + boolean_t read_l2arc_header = B_FALSE; + avl_tree_t config_tree; + avl_tree_t uberblock_tree; + void *node, *cookie; + int fd; + + bzero(labels, sizeof (labels)); + + /* + * Check if we were given absolute path and use it as is. + * Otherwise if the provided vdev name doesn't point to a file, + * try prepending expected disk paths and partition numbers. + */ + (void) strlcpy(path, dev, sizeof (path)); + if (dev[0] != '/' && stat64(path, &statbuf) != 0) { + int error; + + error = zfs_resolve_shortname(dev, path, MAXPATHLEN); + if (error == 0 && zfs_dev_is_whole_disk(path)) { + if (zfs_append_partition(path, MAXPATHLEN) == -1) + error = ENOENT; + } + + if (error || (stat64(path, &statbuf) != 0)) { + (void) printf("failed to find device %s, try " + "specifying absolute path instead\n", dev); + return (1); + } + } + + if ((fd = open64(path, O_RDONLY)) < 0) { + (void) printf("cannot open '%s': %s\n", path, strerror(errno)); + exit(1); + } + + if (fstat64_blk(fd, &statbuf) != 0) { + (void) printf("failed to stat '%s': %s\n", path, + strerror(errno)); + (void) close(fd); + exit(1); + } + + if (S_ISBLK(statbuf.st_mode) && zfs_dev_flush(fd) != 0) + (void) printf("failed to invalidate cache '%s' : %s\n", path, + strerror(errno)); + + avl_create(&config_tree, cksum_record_compare, + sizeof (cksum_record_t), offsetof(cksum_record_t, link)); + avl_create(&uberblock_tree, cksum_record_compare, + sizeof (cksum_record_t), offsetof(cksum_record_t, link)); + + psize = statbuf.st_size; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + ashift = SPA_MINBLOCKSHIFT; + + /* + * 1. Read the label from disk + * 2. Unpack the configuration and insert in config tree. + * 3. Traverse all uberblocks and insert in uberblock tree. + */ + for (int l = 0; l < VDEV_LABELS; l++) { + zdb_label_t *label = &labels[l]; + char *buf = label->label.vl_vdev_phys.vp_nvlist; + size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + nvlist_t *config; + cksum_record_t *rec; + zio_cksum_t cksum; + vdev_t vd; + + if (pread64(fd, &label->label, sizeof (label->label), + vdev_label_offset(psize, l, 0)) != sizeof (label->label)) { + if (!dump_opt['q']) + (void) printf("failed to read label %d\n", l); + label->read_failed = B_TRUE; + error = B_TRUE; + continue; + } + + label->read_failed = B_FALSE; + + if (nvlist_unpack(buf, buflen, &config, 0) == 0) { + nvlist_t *vdev_tree = NULL; + size_t size; + + if ((nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || + (nvlist_lookup_uint64(vdev_tree, + ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) + ashift = SPA_MINBLOCKSHIFT; + + if (nvlist_size(config, &size, NV_ENCODE_XDR) != 0) + size = buflen; + + /* If the device is a cache device clear the header. 
*/ + if (!read_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + read_l2arc_header = B_TRUE; + } + } + + fletcher_4_native_varsize(buf, size, &cksum); + rec = cksum_record_insert(&config_tree, &cksum, l); + + label->config = rec; + label->config_nv = config; + config_found = B_TRUE; + } else { + error = B_TRUE; + } + + vd.vdev_ashift = ashift; + vd.vdev_top = &vd; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(&vd); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(&vd, i); + uberblock_t *ub = (void *)((char *)label + uoff); + + if (uberblock_verify(ub)) + continue; + + fletcher_4_native_varsize(ub, sizeof (*ub), &cksum); + rec = cksum_record_insert(&uberblock_tree, &cksum, l); + + label->uberblocks[i] = rec; + } + } + + /* + * Dump the label and uberblocks. + */ + for (int l = 0; l < VDEV_LABELS; l++) { + zdb_label_t *label = &labels[l]; + size_t buflen = sizeof (label->label.vl_vdev_phys.vp_nvlist); + + if (label->read_failed == B_TRUE) + continue; + + if (label->config_nv) { + dump_config_from_label(label, buflen, l); + } else { + if (!dump_opt['q']) + (void) printf("failed to unpack label %d\n", l); + } + + if (dump_opt['u']) + dump_label_uberblocks(label, ashift, l); + + nvlist_free(label->config_nv); + } + + /* + * Dump the L2ARC header, if existent. + */ + if (read_l2arc_header) + error |= dump_l2arc_header(fd); + + cookie = NULL; + while ((node = avl_destroy_nodes(&config_tree, &cookie)) != NULL) + umem_free(node, sizeof (cksum_record_t)); + + cookie = NULL; + while ((node = avl_destroy_nodes(&uberblock_tree, &cookie)) != NULL) + umem_free(node, sizeof (cksum_record_t)); + + avl_destroy(&config_tree); + avl_destroy(&uberblock_tree); + + (void) close(fd); + + return (config_found == B_FALSE ? 2 : + (error == B_TRUE ? 1 : 0)); +} + +static uint64_t dataset_feature_count[SPA_FEATURES]; +static uint64_t global_feature_count[SPA_FEATURES]; +static uint64_t remap_deadlist_count = 0; + +/*ARGSUSED*/ +static int +dump_one_objset(const char *dsname, void *arg) +{ + int error; + objset_t *os; + spa_feature_t f; + + error = open_objset(dsname, FTAG, &os); + if (error != 0) + return (0); + + for (f = 0; f < SPA_FEATURES; f++) { + if (!dsl_dataset_feature_is_active(dmu_objset_ds(os), f)) + continue; + ASSERT(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET); + dataset_feature_count[f]++; + } + + if (dsl_dataset_remap_deadlist_exists(dmu_objset_ds(os))) { + remap_deadlist_count++; + } + + for (dsl_bookmark_node_t *dbn = + avl_first(&dmu_objset_ds(os)->ds_bookmarks); dbn != NULL; + dbn = AVL_NEXT(&dmu_objset_ds(os)->ds_bookmarks, dbn)) { + mos_obj_refd(dbn->dbn_phys.zbm_redaction_obj); + if (dbn->dbn_phys.zbm_redaction_obj != 0) + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS]++; + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN]++; + } + + if (dsl_deadlist_is_open(&dmu_objset_ds(os)->ds_dir->dd_livelist) && + !dmu_objset_is_snapshot(os)) { + global_feature_count[SPA_FEATURE_LIVELIST]++; + } + + dump_objset(os); + close_objset(os, FTAG); + fuid_table_destroy(); + return (0); +} + +/* + * Block statistics. 
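+ * The zdb_blkstats_t below accumulates, for one (indirection level,
+ * object type) pair, the allocated/logical/physical byte totals,
+ * block and gang counts, ditto placement counts, and a physical-size
+ * histogram in SPA_MINBLOCKSIZE buckets.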
+ */ +#define PSIZE_HISTO_SIZE (SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 2) +typedef struct zdb_blkstats { + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; + uint64_t zb_gangs; + uint64_t zb_ditto_samevdev; + uint64_t zb_ditto_same_ms; + uint64_t zb_psize_histogram[PSIZE_HISTO_SIZE]; +} zdb_blkstats_t; + +/* + * Extended object types to report deferred frees and dedup auto-ditto blocks. + */ +#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) +#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) +#define ZDB_OT_OTHER (DMU_OT_NUMTYPES + 2) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 3) + +static const char *zdb_ot_extname[] = { + "deferred free", + "dedup ditto", + "other", + "Total", +}; + +#define ZB_TOTAL DN_MAX_LEVELS +#define SPA_MAX_FOR_16M (SPA_MAXBLOCKSHIFT+1) + +typedef struct zdb_cb { + zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_removing_size; + uint64_t zcb_checkpoint_size; + uint64_t zcb_dedup_asize; + uint64_t zcb_dedup_blocks; + uint64_t zcb_psize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_count[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_lsize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_asize_len[SPA_MAX_FOR_16M]; + uint64_t zcb_psize_total; + uint64_t zcb_lsize_total; + uint64_t zcb_asize_total; + uint64_t zcb_embedded_blocks[NUM_BP_EMBEDDED_TYPES]; + uint64_t zcb_embedded_histogram[NUM_BP_EMBEDDED_TYPES] + [BPE_PAYLOAD_SIZE + 1]; + uint64_t zcb_start; + hrtime_t zcb_lastprint; + uint64_t zcb_totalasize; + uint64_t zcb_errors[256]; + int zcb_readfails; + int zcb_haderrors; + spa_t *zcb_spa; + uint32_t **zcb_vd_obsolete_counts; +} zdb_cb_t; + +/* test if two DVA offsets from same vdev are within the same metaslab */ +static boolean_t +same_metaslab(spa_t *spa, uint64_t vdev, uint64_t off1, uint64_t off2) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t ms_shift = vd->vdev_ms_shift; + + return ((off1 >> ms_shift) == (off2 >> ms_shift)); +} + +/* + * Used to simplify reporting of the histogram data. + */ +typedef struct one_histo { + char *name; + uint64_t *count; + uint64_t *len; + uint64_t cumulative; +} one_histo_t; + +/* + * The number of separate histograms processed for psize, lsize and asize. + */ +#define NUM_HISTO 3 + +/* + * This routine will create a fixed column size output of three different + * histograms showing by blocksize of 512 - 2^ SPA_MAX_FOR_16M + * the count, length and cumulative length of the psize, lsize and + * asize blocks. + * + * All three types of blocks are listed on a single line + * + * By default the table is printed in nicenumber format (e.g. 123K) but + * if the '-P' parameter is specified then the full raw number (parseable) + * is printed out. + */ +static void +dump_size_histograms(zdb_cb_t *zcb) +{ + /* + * A temporary buffer that allows us to convert a number into + * a string using zdb_nicenumber to allow either raw or human + * readable numbers to be output. + */ + char numbuf[32]; + + /* + * Define titles which are used in the headers of the tables + * printed by this routine. + */ + const char blocksize_title1[] = "block"; + const char blocksize_title2[] = "size"; + const char count_title[] = "Count"; + const char length_title[] = "Size"; + const char cumulative_title[] = "Cum."; + + /* + * Setup the histogram arrays (psize, lsize, and asize). 
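+ * Each of the three one_histo_t slots is wired to the matching count
+ * and length arrays in the zdb_cb_t, so psize, lsize and asize can be
+ * emitted by one loop, three columns per histogram.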
+ */ + one_histo_t parm_histo[NUM_HISTO]; + + parm_histo[0].name = "psize"; + parm_histo[0].count = zcb->zcb_psize_count; + parm_histo[0].len = zcb->zcb_psize_len; + parm_histo[0].cumulative = 0; + + parm_histo[1].name = "lsize"; + parm_histo[1].count = zcb->zcb_lsize_count; + parm_histo[1].len = zcb->zcb_lsize_len; + parm_histo[1].cumulative = 0; + + parm_histo[2].name = "asize"; + parm_histo[2].count = zcb->zcb_asize_count; + parm_histo[2].len = zcb->zcb_asize_len; + parm_histo[2].cumulative = 0; + + + (void) printf("\nBlock Size Histogram\n"); + /* + * Print the first line titles + */ + if (dump_opt['P']) + (void) printf("\n%s\t", blocksize_title1); + else + (void) printf("\n%7s ", blocksize_title1); + + for (int j = 0; j < NUM_HISTO; j++) { + if (dump_opt['P']) { + if (j < NUM_HISTO - 1) { + (void) printf("%s\t\t\t", parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf(" %s", parm_histo[j].name); + } + } else { + if (j < NUM_HISTO - 1) { + /* Left aligned strings in the output */ + (void) printf("%-7s ", + parm_histo[j].name); + } else { + /* Don't print trailing spaces */ + (void) printf("%s", parm_histo[j].name); + } + } + } + (void) printf("\n"); + + /* + * Print the second line titles + */ + if (dump_opt['P']) { + (void) printf("%s\t", blocksize_title2); + } else { + (void) printf("%7s ", blocksize_title2); + } + + for (int i = 0; i < NUM_HISTO; i++) { + if (dump_opt['P']) { + (void) printf("%s\t%s\t%s\t", + count_title, length_title, cumulative_title); + } else { + (void) printf("%7s%7s%7s", + count_title, length_title, cumulative_title); + } + } + (void) printf("\n"); + + /* + * Print the rows + */ + for (int i = SPA_MINBLOCKSHIFT; i < SPA_MAX_FOR_16M; i++) { + + /* + * Print the first column showing the blocksize + */ + zdb_nicenum((1ULL << i), numbuf, sizeof (numbuf)); + + if (dump_opt['P']) { + printf("%s", numbuf); + } else { + printf("%7s:", numbuf); + } + + /* + * Print the remaining set of 3 columns per size: + * for psize, lsize and asize + */ + for (int j = 0; j < NUM_HISTO; j++) { + parm_histo[j].cumulative += parm_histo[j].len[i]; + + zdb_nicenum(parm_histo[j].count[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].len[i], + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + + zdb_nicenum(parm_histo[j].cumulative, + numbuf, sizeof (numbuf)); + if (dump_opt['P']) + (void) printf("\t%s", numbuf); + else + (void) printf("%7s", numbuf); + } + (void) printf("\n"); + } +} + +static void +zdb_count_block(zdb_cb_t *zcb, zilog_t *zilog, const blkptr_t *bp, + dmu_object_type_t type) +{ + uint64_t refcnt = 0; + int i; + + ASSERT(type < ZDB_OT_TOTAL); + + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; + + spa_config_enter(zcb->zcb_spa, SCL_CONFIG, FTAG, RW_READER); + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; + int t = (i & 1) ? type : ZDB_OT_TOTAL; + int equal; + zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; + + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_count++; + + /* + * The histogram is only big enough to record blocks up to + * SPA_OLD_MAXBLOCKSIZE; larger blocks go into the last, + * "other", bucket. 
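+ * For example, a 128K (SPA_OLD_MAXBLOCKSIZE) psize lands in bucket
+ * 128K / 512 = 256, while a 1M block is clamped to bucket 257, the
+ * final "other" bucket.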
+ */ + unsigned idx = BP_GET_PSIZE(bp) >> SPA_MINBLOCKSHIFT; + idx = MIN(idx, SPA_OLD_MAXBLOCKSIZE / SPA_MINBLOCKSIZE + 1); + zb->zb_psize_histogram[idx]++; + + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) { + zb->zb_ditto_samevdev++; + + if (same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + } + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal != 0) { + zb->zb_ditto_samevdev++; + + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[1]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[0]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + else if (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2]) && + same_metaslab(zcb->zcb_spa, + DVA_GET_VDEV(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[1]), + DVA_GET_OFFSET(&bp->blk_dva[2]))) + zb->zb_ditto_same_ms++; + } + break; + } + } + + spa_config_exit(zcb->zcb_spa, SCL_CONFIG, FTAG); + + if (BP_IS_EMBEDDED(bp)) { + zcb->zcb_embedded_blocks[BPE_GET_ETYPE(bp)]++; + zcb->zcb_embedded_histogram[BPE_GET_ETYPE(bp)] + [BPE_GET_PSIZE(bp)]++; + return; + } + /* + * The binning histogram bins by powers of two up to + * SPA_MAXBLOCKSIZE rather than creating bins for + * every possible blocksize found in the pool. + */ + int bin = highbit64(BP_GET_PSIZE(bp)) - 1; + + zcb->zcb_psize_count[bin]++; + zcb->zcb_psize_len[bin] += BP_GET_PSIZE(bp); + zcb->zcb_psize_total += BP_GET_PSIZE(bp); + + bin = highbit64(BP_GET_LSIZE(bp)) - 1; + + zcb->zcb_lsize_count[bin]++; + zcb->zcb_lsize_len[bin] += BP_GET_LSIZE(bp); + zcb->zcb_lsize_total += BP_GET_LSIZE(bp); + + bin = highbit64(BP_GET_ASIZE(bp)) - 1; + + zcb->zcb_asize_count[bin]++; + zcb->zcb_asize_len[bin] += BP_GET_ASIZE(bp); + zcb->zcb_asize_total += BP_GET_ASIZE(bp); + + if (dump_opt['L']) + return; + + if (BP_GET_DEDUP(bp)) { + ddt_t *ddt; + ddt_entry_t *dde; + + ddt = ddt_select(zcb->zcb_spa, bp); + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_FALSE); + + if (dde == NULL) { + refcnt = 0; + } else { + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + refcnt = ddp->ddp_refcnt; + if (ddt_phys_total_refcnt(dde) == 0) + ddt_remove(ddt, dde); + } + ddt_exit(ddt); + } + + VERIFY3U(zio_wait(zio_claim(NULL, zcb->zcb_spa, + refcnt ? 
0 : spa_min_claim_txg(zcb->zcb_spa), + bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); +} + +static void +zdb_blkptr_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + int ioerr = zio->io_error; + zdb_cb_t *zcb = zio->io_private; + zbookmark_phys_t *zb = &zio->io_bookmark; + + abd_free(zio->io_abd); + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + + if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + char blkbuf[BP_SPRINTF_LEN]; + + zcb->zcb_haderrors = 1; + zcb->zcb_errors[ioerr]++; + + if (dump_opt['b'] >= 2) + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + else + blkbuf[0] = '\0'; + + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); + } + mutex_exit(&spa->spa_scrub_lock); +} + +static int +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + zdb_cb_t *zcb = arg; + dmu_object_type_t type; + boolean_t is_metadata; + + if (zb->zb_level == ZB_DNODE_LEVEL) + return (0); + + if (dump_opt['b'] >= 5 && bp->blk_birth > 0) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), + blkbuf); + } + + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) + return (0); + + type = BP_GET_TYPE(bp); + + zdb_count_block(zcb, zilog, bp, + (type & DMU_OT_NEWTYPE) ? ZDB_OT_OTHER : type); + + is_metadata = (BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)); + + if (!BP_IS_EMBEDDED(bp) && + (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata))) { + size_t size = BP_GET_PSIZE(bp); + abd_t *abd = abd_alloc(size, B_FALSE); + int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb->zb_level == ZB_ZIL_LEVEL) + flags |= ZIO_FLAG_SPECULATIVE; + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_load_verify_bytes > max_inflight_bytes) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_load_verify_bytes += size; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(NULL, spa, bp, abd, size, + zdb_blkptr_done, zcb, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + } + + zcb->zcb_readfails = 0; + + /* only call gethrtime() every 100 blocks */ + static int iters; + if (++iters > 100) + iters = 0; + else + return (0); + + if (dump_opt['b'] < 5 && gethrtime() > zcb->zcb_lastprint + NANOSEC) { + uint64_t now = gethrtime(); + char buf[10]; + uint64_t bytes = zcb->zcb_type[ZB_TOTAL][ZDB_OT_TOTAL].zb_asize; + int kb_per_sec = + 1 + bytes / (1 + ((now - zcb->zcb_start) / 1000 / 1000)); + int sec_remaining = + (zcb->zcb_totalasize - bytes) / 1024 / kb_per_sec; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (buf) >= NN_NUMBUF_SZ); + + zfs_nicebytes(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "\r%5s completed (%4dMB/s) " + "estimated time remaining: %uhr %02umin %02usec ", + buf, kb_per_sec / 1024, + sec_remaining / 60 / 60, + sec_remaining / 60 % 60, + sec_remaining % 60); + + zcb->zcb_lastprint = now; + } + + return (0); +} + +static void +zdb_leak(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); +} + +static metaslab_ops_t zdb_metaslab_ops = { + NULL /* alloc */ +}; + +/* ARGSUSED */ +static int +load_unflushed_svr_segs_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + spa_vdev_removal_t *svr = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + + /* skip vdevs we don't care about */ + if (sme->sme_vdev != svr->svr_vdev_id) + return (0); + + vdev_t *vd = vdev_lookup_top(spa, sme->sme_vdev); + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + ASSERT(vim != NULL); + if (offset >= vdev_indirect_mapping_max_offset(vim)) + return (0); + + if (sme->sme_type == SM_ALLOC) + range_tree_add(svr->svr_allocd_segs, offset, size); + else + range_tree_remove(svr->svr_allocd_segs, offset, size); + + return (0); +} + +/* ARGSUSED */ +static void +claim_segment_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + /* + * This callback was called through a remap from + * a device being removed. Therefore, the vdev that + * this callback is applied to is a concrete + * vdev. + */ + ASSERT(vdev_is_concrete(vd)); + + VERIFY0(metaslab_claim_impl(vd, offset, size, + spa_min_claim_txg(vd->vdev_spa))); +} + +static void +claim_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + claim_segment_impl_cb, NULL); +} + +/* + * After accounting for all allocated blocks that are directly referenced, + * we might have missed a reference to a block from a partially complete + * (and thus unused) indirect mapping object. We perform a secondary pass + * through the metaslabs we have already mapped and claim the destination + * blocks. 
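+ * The claim is issued through the removing vdev's indirect mapping
+ * (vdev_op_remap), so each still-allocated segment is re-expressed on
+ * its concrete destination vdev before metaslab_claim_impl() marks it
+ * as in use.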
+ */ +static void +zdb_claim_removing(spa_t *spa, zdb_cb_t *zcb) +{ + if (dump_opt['L']) + return; + + if (spa->spa_vdev_removal == NULL) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + range_tree_t *allocs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(allocs)); + if (msp->ms_sm != NULL) + VERIFY0(space_map_load(msp->ms_sm, allocs, SM_ALLOC)); + range_tree_vacate(allocs, range_tree_add, svr->svr_allocd_segs); + } + range_tree_destroy(allocs); + + iterate_through_spacemap_logs(spa, load_unflushed_svr_segs_cb, svr); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for + * it yet. + */ + range_tree_clear(svr->svr_allocd_segs, + vdev_indirect_mapping_max_offset(vim), + vd->vdev_asize - vdev_indirect_mapping_max_offset(vim)); + + zcb->zcb_removing_size += range_tree_space(svr->svr_allocd_segs); + range_tree_vacate(svr->svr_allocd_segs, claim_segment_cb, vd); + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* ARGSUSED */ +static int +increment_indirect_mapping_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + spa_t *spa = zcb->zcb_spa; + vdev_t *vd; + const dva_t *dva = &bp->blk_dva[0]; + + ASSERT(!bp_freed); + ASSERT(!dump_opt['L']); + ASSERT3U(BP_GET_NDVAS(bp), ==, 1); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(zcb->zcb_spa, DVA_GET_VDEV(dva)); + ASSERT3P(vd, !=, NULL); + spa_config_exit(spa, SCL_VDEV, FTAG); + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3P(zcb->zcb_vd_obsolete_counts[vd->vdev_id], !=, NULL); + + vdev_indirect_mapping_increment_obsolete_count( + vd->vdev_indirect_mapping, + DVA_GET_OFFSET(dva), DVA_GET_ASIZE(dva), + zcb->zcb_vd_obsolete_counts[vd->vdev_id]); + + return (0); +} + +static uint32_t * +zdb_load_obsolete_counts(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + uint64_t obsolete_sm_object; + uint32_t *counts; + + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + EQUIV(obsolete_sm_object != 0, vd->vdev_obsolete_sm != NULL); + counts = vdev_indirect_mapping_load_obsolete_counts(vim); + if (vd->vdev_obsolete_sm != NULL) { + vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, + vd->vdev_obsolete_sm); + } + if (scip->scip_vdev == vd->vdev_id && + scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); + vdev_indirect_mapping_load_obsolete_spacemap(vim, counts, + prev_obsolete_sm); + space_map_close(prev_obsolete_sm); + } + return (counts); +} + +static void +zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + ddt_bookmark_t ddb; + ddt_entry_t dde; + int error; + int p; + + ASSERT(!dump_opt['L']); + + bzero(&ddb, sizeof (ddb)); + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { + blkptr_t blk; + ddt_phys_t *ddp = dde.dde_phys; + + if (ddb.ddb_class == DDT_CLASS_UNIQUE) + 
return; + + ASSERT(ddt_phys_total_refcnt(&dde) > 1); + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); + if (p == DDT_PHYS_DITTO) { + zdb_count_block(zcb, NULL, &blk, ZDB_OT_DITTO); + } else { + zcb->zcb_dedup_asize += + BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_blocks++; + } + } + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); + } + + ASSERT(error == ENOENT); +} + +typedef struct checkpoint_sm_exclude_entry_arg { + vdev_t *cseea_vd; + uint64_t cseea_checkpoint_size; +} checkpoint_sm_exclude_entry_arg_t; + +static int +checkpoint_sm_exclude_entry_cb(space_map_entry_t *sme, void *arg) +{ + checkpoint_sm_exclude_entry_arg_t *cseea = arg; + vdev_t *vd = cseea->cseea_vd; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; + + ASSERT(sme->sme_type == SM_FREE); + + /* + * Since the vdev_checkpoint_sm exists in the vdev level + * and the ms_sm space maps exist in the metaslab level, + * an entry in the checkpoint space map could theoretically + * cross the boundaries of the metaslab that it belongs. + * + * In reality, because of the way that we populate and + * manipulate the checkpoint's space maps currently, + * there shouldn't be any entries that cross metaslabs. + * Hence the assertion below. + * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(sme->sme_offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * By removing the entry from the allocated segments we + * also verify that the entry is there to begin with. + */ + mutex_enter(&ms->ms_lock); + range_tree_remove(ms->ms_allocatable, sme->sme_offset, sme->sme_run); + mutex_exit(&ms->ms_lock); + + cseea->cseea_checkpoint_size += sme->sme_run; + return (0); +} + +static void +zdb_leak_init_vdev_exclude_checkpoint(vdev_t *vd, zdb_cb_t *zcb) +{ + spa_t *spa = vd->vdev_spa; + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + /* + * If there is no vdev_top_zap, we are in a pool whose + * version predates the pool checkpoint feature. + */ + if (vd->vdev_top_zap == 0) + return; + + /* + * If there is no reference of the vdev_checkpoint_sm in + * the vdev_top_zap, then one of the following scenarios + * is true: + * + * 1] There is no checkpoint + * 2] There is a checkpoint, but no checkpointed blocks + * have been freed yet + * 3] The current vdev is indirect + * + * In these cases we return immediately. 
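+ * (All three cases are detected the same way: the zap_contains() call
+ * below finds no VDEV_TOP_ZAP_POOL_CHECKPOINT_SM entry in the
+ * vdev_top_zap.)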
+ */ + if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + return; + + VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, + &checkpoint_sm_obj)); + + checkpoint_sm_exclude_entry_arg_t cseea; + cseea.cseea_vd = vd; + cseea.cseea_checkpoint_size = 0; + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa), + checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift)); + + VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), + checkpoint_sm_exclude_entry_cb, &cseea)); + space_map_close(checkpoint_sm); + + zcb->zcb_checkpoint_size += cseea.cseea_checkpoint_size; +} + +static void +zdb_leak_init_exclude_checkpoint(spa_t *spa, zdb_cb_t *zcb) +{ + ASSERT(!dump_opt['L']); + + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + ASSERT3U(c, ==, rvd->vdev_child[c]->vdev_id); + zdb_leak_init_vdev_exclude_checkpoint(rvd->vdev_child[c], zcb); + } +} + +static int +count_unflushed_space_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + int64_t *ualloc_space = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (sme->sme_type == SM_ALLOC) + *ualloc_space += sme->sme_run; + else + *ualloc_space -= sme->sme_run; + + return (0); +} + +static int64_t +get_unflushed_alloc_space(spa_t *spa) +{ + if (dump_opt['L']) + return (0); + + int64_t ualloc_space = 0; + iterate_through_spacemap_logs(spa, count_unflushed_space_cb, + &ualloc_space); + return (ualloc_space); +} + +static int +load_unflushed_cb(spa_t *spa, space_map_entry_t *sme, uint64_t txg, void *arg) +{ + maptype_t *uic_maptype = arg; + + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint64_t vdev_id = sme->sme_vdev; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* skip indirect vdevs */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + ASSERT(*uic_maptype == SM_ALLOC || *uic_maptype == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + + if (*uic_maptype == sme->sme_type) + range_tree_add(ms->ms_allocatable, offset, size); + else + range_tree_remove(ms->ms_allocatable, offset, size); + + return (0); +} + +static void +load_unflushed_to_ms_allocatables(spa_t *spa, maptype_t maptype) +{ + iterate_through_spacemap_logs(spa, load_unflushed_cb, &maptype); +} + +static void +load_concrete_ms_allocatable_trees(spa_t *spa, maptype_t maptype) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + + ASSERT3U(i, ==, vd->vdev_id); + + if (vd->vdev_ops == &vdev_indirect_ops) + continue; + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rloading concrete vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)msp->ms_id, + (longlong_t)vd->vdev_ms_count); + + mutex_enter(&msp->ms_lock); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + + /* + * We don't want to spend the CPU manipulating the + * size-ordered tree, so clear 
the range_tree ops. + */ + msp->ms_allocatable->rt_ops = NULL; + + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + msp->ms_allocatable, maptype)); + } + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; + mutex_exit(&msp->ms_lock); + } + } + + load_unflushed_to_ms_allocatables(spa, maptype); +} + +/* + * vm_idxp is an in-out parameter which (for indirect vdevs) is the + * index in vim_entries that has the first entry in this metaslab. + * On return, it will be set to the first entry after this metaslab. + */ +static void +load_indirect_ms_allocatable_tree(vdev_t *vd, metaslab_t *msp, + uint64_t *vim_idxp) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + mutex_enter(&msp->ms_lock); + range_tree_vacate(msp->ms_allocatable, NULL, NULL); + + /* + * We don't want to spend the CPU manipulating the + * size-ordered tree, so clear the range_tree ops. + */ + msp->ms_allocatable->rt_ops = NULL; + + for (; *vim_idxp < vdev_indirect_mapping_num_entries(vim); + (*vim_idxp)++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[*vim_idxp]; + uint64_t ent_offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + uint64_t ent_len = DVA_GET_ASIZE(&vimep->vimep_dst); + ASSERT3U(ent_offset, >=, msp->ms_start); + if (ent_offset >= msp->ms_start + msp->ms_size) + break; + + /* + * Mappings do not cross metaslab boundaries, + * because we create them by walking the metaslabs. + */ + ASSERT3U(ent_offset + ent_len, <=, + msp->ms_start + msp->ms_size); + range_tree_add(msp->ms_allocatable, ent_offset, ent_len); + } + + if (!msp->ms_loaded) + msp->ms_loaded = B_TRUE; + mutex_exit(&msp->ms_lock); +} + +static void +zdb_leak_init_prepare_indirect_vdevs(spa_t *spa, zdb_cb_t *zcb) +{ + ASSERT(!dump_opt['L']); + + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + ASSERT3U(c, ==, vd->vdev_id); + + if (vd->vdev_ops != &vdev_indirect_ops) + continue; + + /* + * Note: we don't check for mapping leaks on + * removing vdevs because their ms_allocatable's + * are used to look for leaks in allocated space. + */ + zcb->zcb_vd_obsolete_counts[c] = zdb_load_obsolete_counts(vd); + + /* + * Normally, indirect vdevs don't have any + * metaslabs. We want to set them up for + * zio_claim(). + */ + VERIFY0(vdev_metaslab_init(vd, 0)); + + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t vim_idx = 0; + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + + (void) fprintf(stderr, + "\rloading indirect vdev %llu, " + "metaslab %llu of %llu ...", + (longlong_t)vd->vdev_id, + (longlong_t)vd->vdev_ms[m]->ms_id, + (longlong_t)vd->vdev_ms_count); + + load_indirect_ms_allocatable_tree(vd, vd->vdev_ms[m], + &vim_idx); + } + ASSERT3U(vim_idx, ==, vdev_indirect_mapping_num_entries(vim)); + } +} + +static void +zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + zcb->zcb_spa = spa; + + if (dump_opt['L']) + return; + + dsl_pool_t *dp = spa->spa_dsl_pool; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * We are going to be changing the meaning of the metaslab's + * ms_allocatable. Ensure that the allocator doesn't try to + * use the tree. + */ + spa->spa_normal_class->mc_ops = &zdb_metaslab_ops; + spa->spa_log_class->mc_ops = &zdb_metaslab_ops; + + zcb->zcb_vd_obsolete_counts = + umem_zalloc(rvd->vdev_children * sizeof (uint32_t *), + UMEM_NOFAIL); + + /* + * For leak detection, we overload the ms_allocatable trees + * to contain allocated segments instead of free segments. 
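+	 * Concretely: after the SM_ALLOC load, a segment present in a
+	 * metaslab's ms_allocatable means "allocated on disk". Every
+	 * block we claim during traversal is removed from the tree, so
+	 * whatever remains afterwards was allocated but never referenced.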
+ * As a result, we can't use the normal metaslab_load/unload + * interfaces. + */ + zdb_leak_init_prepare_indirect_vdevs(spa, zcb); + load_concrete_ms_allocatable_trees(spa, SM_ALLOC); + + /* + * On load_concrete_ms_allocatable_trees() we loaded all the + * allocated entries from the ms_sm to the ms_allocatable for + * each metaslab. If the pool has a checkpoint or is in the + * middle of discarding a checkpoint, some of these blocks + * may have been freed but their ms_sm may not have been + * updated because they are referenced by the checkpoint. In + * order to avoid false-positives during leak-detection, we + * go through the vdev's checkpoint space map and exclude all + * its entries from their relevant ms_allocatable. + * + * We also aggregate the space held by the checkpoint and add + * it to zcb_checkpoint_size. + * + * Note that at this point we are also verifying that all the + * entries on the checkpoint_sm are marked as allocated in + * the ms_sm of their relevant metaslab. + * [see comment in checkpoint_sm_exclude_entry_cb()] + */ + zdb_leak_init_exclude_checkpoint(spa, zcb); + ASSERT3U(zcb->zcb_checkpoint_size, ==, spa_get_checkpoint_space(spa)); + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); + + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + (void) bpobj_iterate_nofree(&dp->dp_obsolete_bpobj, + increment_indirect_mapping_cb, zcb, NULL); + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + zdb_ddt_leak_init(spa, zcb); + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static boolean_t +zdb_check_for_obsolete_leaks(vdev_t *vd, zdb_cb_t *zcb) +{ + boolean_t leaks = B_FALSE; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t total_leaked = 0; + boolean_t are_precise = B_FALSE; + + ASSERT(vim != NULL); + + for (uint64_t i = 0; i < vdev_indirect_mapping_num_entries(vim); i++) { + vdev_indirect_mapping_entry_phys_t *vimep = + &vim->vim_entries[i]; + uint64_t obsolete_bytes = 0; + uint64_t offset = DVA_MAPPING_GET_SRC_OFFSET(vimep); + metaslab_t *msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + /* + * This is not very efficient but it's easy to + * verify correctness. 
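+		 * (The loop below checks the mapping one 1 << vdev_ashift
+		 * sized sector at a time, so the cost is proportional to
+		 * the mapped size rather than to the number of mapping
+		 * entries.)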
+		 */
+		for (uint64_t inner_offset = 0;
+		    inner_offset < DVA_GET_ASIZE(&vimep->vimep_dst);
+		    inner_offset += 1 << vd->vdev_ashift) {
+			if (range_tree_contains(msp->ms_allocatable,
+			    offset + inner_offset, 1 << vd->vdev_ashift)) {
+				obsolete_bytes += 1 << vd->vdev_ashift;
+			}
+		}
+
+		int64_t bytes_leaked = obsolete_bytes -
+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i];
+		ASSERT3U(DVA_GET_ASIZE(&vimep->vimep_dst), >=,
+		    zcb->zcb_vd_obsolete_counts[vd->vdev_id][i]);
+
+		VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+		if (bytes_leaked != 0 && (are_precise || dump_opt['d'] >= 5)) {
+			(void) printf("obsolete indirect mapping count "
+			    "mismatch on %llu:%llx:%llx : %llx bytes leaked\n",
+			    (u_longlong_t)vd->vdev_id,
+			    (u_longlong_t)DVA_MAPPING_GET_SRC_OFFSET(vimep),
+			    (u_longlong_t)DVA_GET_ASIZE(&vimep->vimep_dst),
+			    (u_longlong_t)bytes_leaked);
+		}
+		total_leaked += ABS(bytes_leaked);
+	}
+
+	VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise));
+	if (!are_precise && total_leaked > 0) {
+		int pct_leaked = total_leaked * 100 /
+		    vdev_indirect_mapping_bytes_mapped(vim);
+		(void) printf("cannot verify obsolete indirect mapping "
+		    "counts of vdev %llu because precise feature was not "
+		    "enabled when it was removed: %d%% (%llx bytes) of mapping "
+		    "unreferenced\n",
+		    (u_longlong_t)vd->vdev_id, pct_leaked,
+		    (u_longlong_t)total_leaked);
+	} else if (total_leaked > 0) {
+		(void) printf("obsolete indirect mapping count mismatch "
+		    "for vdev %llu -- %llx total bytes mismatched\n",
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)total_leaked);
+		leaks |= B_TRUE;
+	}
+
+	vdev_indirect_mapping_free_obsolete_counts(vim,
+	    zcb->zcb_vd_obsolete_counts[vd->vdev_id]);
+	zcb->zcb_vd_obsolete_counts[vd->vdev_id] = NULL;
+
+	return (leaks);
+}
+
+static boolean_t
+zdb_leak_fini(spa_t *spa, zdb_cb_t *zcb)
+{
+	if (dump_opt['L'])
+		return (B_FALSE);
+
+	boolean_t leaks = B_FALSE;
+	vdev_t *rvd = spa->spa_root_vdev;
+	for (unsigned c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *vd = rvd->vdev_child[c];
+		metaslab_group_t *mg __maybe_unused = vd->vdev_mg;
+
+		if (zcb->zcb_vd_obsolete_counts[c] != NULL) {
+			leaks |= zdb_check_for_obsolete_leaks(vd, zcb);
+		}
+
+		for (uint64_t m = 0; m < vd->vdev_ms_count; m++) {
+			metaslab_t *msp = vd->vdev_ms[m];
+			ASSERT3P(mg, ==, msp->ms_group);
+
+			/*
+			 * ms_allocatable has been overloaded
+			 * to contain allocated segments. Now that
+			 * we finished traversing all blocks, any
+			 * block that remains in the ms_allocatable
+			 * represents an allocated block that we
+			 * did not claim during the traversal.
+			 * Claimed blocks would have been removed
+			 * from the ms_allocatable. For indirect
+			 * vdevs, space remaining in the tree
+			 * represents parts of the mapping that are
+			 * not referenced, which is not a bug.
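+			 * range_tree_vacate() below both reports each
+			 * leftover range (via the zdb_leak callback, for
+			 * concrete vdevs) and empties the tree.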
+ */ + if (vd->vdev_ops == &vdev_indirect_ops) { + range_tree_vacate(msp->ms_allocatable, + NULL, NULL); + } else { + range_tree_vacate(msp->ms_allocatable, + zdb_leak, vd); + } + if (msp->ms_loaded) { + msp->ms_loaded = B_FALSE; + } + } + } + + umem_free(zcb->zcb_vd_obsolete_counts, + rvd->vdev_children * sizeof (uint32_t *)); + zcb->zcb_vd_obsolete_counts = NULL; + + return (leaks); +} + +/* ARGSUSED */ +static int +count_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zdb_cb_t *zcb = arg; + + if (dump_opt['b'] >= 5) { + char blkbuf[BP_SPRINTF_LEN]; + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("[%s] %s\n", + "deferred free", blkbuf); + } + zdb_count_block(zcb, NULL, bp, ZDB_OT_DEFERRED); + return (0); +} + +/* + * Iterate over livelists which have been destroyed by the user but + * are still present in the MOS, waiting to be freed + */ +static void +iterate_deleted_livelists(spa_t *spa, ll_iter_t func, void *arg) +{ + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + ASSERT0(err); + + zap_cursor_t zc; + zap_attribute_t attr; + dsl_deadlist_t ll; + /* NULL out os prior to dsl_deadlist_open in case it's garbage */ + ll.dl_os = NULL; + for (zap_cursor_init(&zc, mos, zap_obj); + zap_cursor_retrieve(&zc, &attr) == 0; + (void) zap_cursor_advance(&zc)) { + dsl_deadlist_open(&ll, mos, attr.za_first_integer); + func(&ll, arg); + dsl_deadlist_close(&ll); + } + zap_cursor_fini(&zc); +} + +static int +bpobj_count_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (count_block_cb(arg, bp, tx)); +} + +static int +livelist_entry_count_blocks_cb(void *args, dsl_deadlist_entry_t *dle) +{ + zdb_cb_t *zbc = args; + bplist_t blks; + bplist_create(&blks); + /* determine which blocks have been alloc'd but not freed */ + VERIFY0(dsl_process_sub_livelist(&dle->dle_bpobj, &blks, NULL, NULL)); + /* count those blocks */ + (void) bplist_iterate(&blks, count_block_cb, zbc, NULL); + bplist_destroy(&blks); + return (0); +} + +static void +livelist_count_blocks(dsl_deadlist_t *ll, void *arg) +{ + dsl_deadlist_iterate(ll, livelist_entry_count_blocks_cb, arg); +} + +/* + * Count the blocks in the livelists that have been destroyed by the user + * but haven't yet been freed. + */ +static void +deleted_livelists_count_blocks(spa_t *spa, zdb_cb_t *zbc) +{ + iterate_deleted_livelists(spa, livelist_count_blocks, zbc); +} + +static void +dump_livelist_cb(dsl_deadlist_t *ll, void *arg) +{ + ASSERT3P(arg, ==, NULL); + global_feature_count[SPA_FEATURE_LIVELIST]++; + dump_blkptr_list(ll, "Deleted Livelist"); + dsl_deadlist_iterate(ll, sublivelist_verify_lightweight, NULL); +} + +/* + * Print out, register object references to, and increment feature counts for + * livelists that have been destroyed by the user but haven't yet been freed. 
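+ * The DMU_POOL_DELETED_CLONES ZAP itself is registered with
+ * mos_obj_refd() here so the MOS leak check does not flag it.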
+ */ +static void +deleted_livelists_dump_mos(spa_t *spa) +{ + uint64_t zap_obj; + objset_t *mos = spa->spa_meta_objset; + int err = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (err == ENOENT) + return; + mos_obj_refd(zap_obj); + iterate_deleted_livelists(spa, dump_livelist_cb, NULL); +} + +static int +dump_block_stats(spa_t *spa) +{ + zdb_cb_t zcb; + zdb_blkstats_t *zb, *tzb; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT | TRAVERSE_HARD; + boolean_t leaks = B_FALSE; + int e, c, err; + bp_embedded_type_t i; + + bzero(&zcb, sizeof (zcb)); + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); + + /* + * When leak detection is enabled we load all space maps as SM_ALLOC + * maps, then traverse the pool claiming each block we discover. If + * the pool is perfectly consistent, the segment trees will be empty + * when we're done. Anything left over is a leak; any block we can't + * claim (because it's not part of any space map) is a double + * allocation, reference to a freed block, or an unclaimed log block. + * + * When leak detection is disabled (-L option) we still traverse the + * pool claiming each block we discover, but we skip opening any space + * maps. + */ + bzero(&zcb, sizeof (zdb_cb_t)); + zdb_leak_init(spa, &zcb); + + /* + * If there's a deferred-free bplist, process that first. + */ + (void) bpobj_iterate_nofree(&spa->spa_deferred_bpobj, + bpobj_count_block_cb, &zcb, NULL); + + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_iterate_nofree(&spa->spa_dsl_pool->dp_free_bpobj, + bpobj_count_block_cb, &zcb, NULL); + } + + zdb_claim_removing(spa, &zcb); + + if (spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + VERIFY3U(0, ==, bptree_iterate(spa->spa_meta_objset, + spa->spa_dsl_pool->dp_bptree_obj, B_FALSE, count_block_cb, + &zcb, NULL)); + } + + deleted_livelists_count_blocks(spa, &zcb); + + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_totalasize = metaslab_class_get_alloc(spa_normal_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_special_class(spa)); + zcb.zcb_totalasize += metaslab_class_get_alloc(spa_dedup_class(spa)); + zcb.zcb_start = zcb.zcb_lastprint = gethrtime(); + err = traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); + + /* + * If we've traversed the data blocks then we need to wait for those + * I/Os to complete. We leverage "The Godfather" zio to wait on + * all async I/Os to complete. + */ + if (dump_opt['c']) { + for (c = 0; c < max_ncpus; c++) { + (void) zio_wait(spa->spa_async_zio_root[c]); + spa->spa_async_zio_root[c] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + } + ASSERT0(spa->spa_load_verify_bytes); + + /* + * Done after zio_wait() since zcb_haderrors is modified in + * zdb_blkptr_done() + */ + zcb.zcb_haderrors |= err; + + if (zcb.zcb_haderrors) { + (void) printf("\nError counts:\n\n"); + (void) printf("\t%5s %s\n", "errno", "count"); + for (e = 0; e < 256; e++) { + if (zcb.zcb_errors[e] != 0) { + (void) printf("\t%5d %llu\n", + e, (u_longlong_t)zcb.zcb_errors[e]); + } + } + } + + /* + * Report any leaked segments. 
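+	 * zdb_leak_fini() also tears down the leak-detection state set
+	 * up by zdb_leak_init(), vacating the overloaded ms_allocatable
+	 * trees and freeing the obsolete-count arrays.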
+ */ + leaks |= zdb_leak_fini(spa, &zcb); + + tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; + + norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + norm_space = metaslab_class_get_space(spa_normal_class(spa)); + + total_alloc = norm_alloc + + metaslab_class_get_alloc(spa_log_class(spa)) + + metaslab_class_get_alloc(spa_special_class(spa)) + + metaslab_class_get_alloc(spa_dedup_class(spa)) + + get_unflushed_alloc_space(spa); + total_found = tzb->zb_asize - zcb.zcb_dedup_asize + + zcb.zcb_removing_size + zcb.zcb_checkpoint_size; + + if (total_found == total_alloc && !dump_opt['L']) { + (void) printf("\n\tNo leaks (block sum matches space" + " maps exactly)\n"); + } else if (!dump_opt['L']) { + (void) printf("block traversal size %llu != alloc %llu " + "(%s %lld)\n", + (u_longlong_t)total_found, + (u_longlong_t)total_alloc, + (dump_opt['L']) ? "unreachable" : "leaked", + (longlong_t)(total_alloc - total_found)); + leaks = B_TRUE; + } + + if (tzb->zb_count == 0) + return (2); + + (void) printf("\n"); + (void) printf("\t%-16s %14llu\n", "bp count:", + (u_longlong_t)tzb->zb_count); + (void) printf("\t%-16s %14llu\n", "ganged count:", + (longlong_t)tzb->zb_gangs); + (void) printf("\t%-16s %14llu avg: %6llu\n", "bp logical:", + (u_longlong_t)tzb->zb_lsize, + (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp physical:", (u_longlong_t)tzb->zb_psize, + (u_longlong_t)(tzb->zb_psize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_psize); + (void) printf("\t%-16s %14llu avg: %6llu compression: %6.2f\n", + "bp allocated:", (u_longlong_t)tzb->zb_asize, + (u_longlong_t)(tzb->zb_asize / tzb->zb_count), + (double)tzb->zb_lsize / tzb->zb_asize); + (void) printf("\t%-16s %14llu ref>1: %6llu deduplication: %6.2f\n", + "bp deduped:", (u_longlong_t)zcb.zcb_dedup_asize, + (u_longlong_t)zcb.zcb_dedup_blocks, + (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\t%-16s %14llu used: %5.2f%%\n", "Normal class:", + (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); + + if (spa_special_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_special_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_special_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Special class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + if (spa_dedup_class(spa)->mc_rotor != NULL) { + uint64_t alloc = metaslab_class_get_alloc( + spa_dedup_class(spa)); + uint64_t space = metaslab_class_get_space( + spa_dedup_class(spa)); + + (void) printf("\t%-16s %14llu used: %5.2f%%\n", + "Dedup class", (u_longlong_t)alloc, + 100.0 * alloc / space); + } + + for (i = 0; i < NUM_BP_EMBEDDED_TYPES; i++) { + if (zcb.zcb_embedded_blocks[i] == 0) + continue; + (void) printf("\n"); + (void) printf("\tadditional, non-pointer bps of type %u: " + "%10llu\n", + i, (u_longlong_t)zcb.zcb_embedded_blocks[i]); + + if (dump_opt['b'] >= 3) { + (void) printf("\t number of (compressed) bytes: " + "number of bps\n"); + dump_histogram(zcb.zcb_embedded_histogram[i], + sizeof (zcb.zcb_embedded_histogram[i]) / + sizeof (zcb.zcb_embedded_histogram[i][0]), 0); + } + } + + if (tzb->zb_ditto_samevdev != 0) { + (void) printf("\tDittoed blocks on same vdev: %llu\n", + (longlong_t)tzb->zb_ditto_samevdev); + } + if (tzb->zb_ditto_same_ms != 0) { + (void) printf("\tDittoed blocks in same metaslab: %llu\n", + (longlong_t)tzb->zb_ditto_same_ms); + } + + for (uint64_t v = 0; v < spa->spa_root_vdev->vdev_children; 
v++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[v]; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + if (vim == NULL) { + continue; + } + + char mem[32]; + zdb_nicenum(vdev_indirect_mapping_num_entries(vim), + mem, vdev_indirect_mapping_size(vim)); + + (void) printf("\tindirect vdev id %llu has %llu segments " + "(%s in memory)\n", + (longlong_t)vd->vdev_id, + (longlong_t)vdev_indirect_mapping_num_entries(vim), mem); + } + + if (dump_opt['b'] >= 2) { + int l, t, level; + (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" + "\t avg\t comp\t%%Total\tType\n"); + + for (t = 0; t <= ZDB_OT_TOTAL; t++) { + char csize[32], lsize[32], psize[32], asize[32]; + char avg[32], gang[32]; + const char *typename; + + /* make sure nicenum has enough space */ + CTASSERT(sizeof (csize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (lsize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (psize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (asize) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (avg) >= NN_NUMBUF_SZ); + CTASSERT(sizeof (gang) >= NN_NUMBUF_SZ); + + if (t < DMU_OT_NUMTYPES) + typename = dmu_ot[t].ot_name; + else + typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; + + if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { + (void) printf("%6s\t%5s\t%5s\t%5s" + "\t%5s\t%5s\t%6s\t%s\n", + "-", + "-", + "-", + "-", + "-", + "-", + "-", + typename); + continue; + } + + for (l = ZB_TOTAL - 1; l >= -1; l--) { + level = (l == -1 ? ZB_TOTAL : l); + zb = &zcb.zcb_type[level][t]; + + if (zb->zb_asize == 0) + continue; + + if (dump_opt['b'] < 3 && level != ZB_TOTAL) + continue; + + if (level == 0 && zb->zb_asize == + zcb.zcb_type[ZB_TOTAL][t].zb_asize) + continue; + + zdb_nicenum(zb->zb_count, csize, + sizeof (csize)); + zdb_nicenum(zb->zb_lsize, lsize, + sizeof (lsize)); + zdb_nicenum(zb->zb_psize, psize, + sizeof (psize)); + zdb_nicenum(zb->zb_asize, asize, + sizeof (asize)); + zdb_nicenum(zb->zb_asize / zb->zb_count, avg, + sizeof (avg)); + zdb_nicenum(zb->zb_gangs, gang, sizeof (gang)); + + (void) printf("%6s\t%5s\t%5s\t%5s\t%5s" + "\t%5.2f\t%6.2f\t", + csize, lsize, psize, asize, avg, + (double)zb->zb_lsize / zb->zb_psize, + 100.0 * zb->zb_asize / tzb->zb_asize); + + if (level == ZB_TOTAL) + (void) printf("%s\n", typename); + else + (void) printf(" L%d %s\n", + level, typename); + + if (dump_opt['b'] >= 3 && zb->zb_gangs > 0) { + (void) printf("\t number of ganged " + "blocks: %s\n", gang); + } + + if (dump_opt['b'] >= 4) { + (void) printf("psize " + "(in 512-byte sectors): " + "number of blocks\n"); + dump_histogram(zb->zb_psize_histogram, + PSIZE_HISTO_SIZE, 0); + } + } + } + + /* Output a table summarizing block sizes in the pool */ + if (dump_opt['b'] >= 2) { + dump_size_histograms(&zcb); + } + } + + (void) printf("\n"); + + if (leaks) + return (2); + + if (zcb.zcb_haderrors) + return (3); + + return (0); +} + +typedef struct zdb_ddt_entry { + ddt_key_t zdde_key; + uint64_t zdde_ref_blocks; + uint64_t zdde_ref_lsize; + uint64_t zdde_ref_psize; + uint64_t zdde_ref_dsize; + avl_node_t zdde_node; +} zdb_ddt_entry_t; + +/* ARGSUSED */ +static int +zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + avl_tree_t *t = arg; + avl_index_t where; + zdb_ddt_entry_t *zdde, zdde_search; + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) + return (0); + + if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { + (void) printf("traversing objset %llu, %llu objects, " + "%lu blocks so far\n", + (u_longlong_t)zb->zb_objset, + 
(u_longlong_t)BP_GET_FILL(bp), + avl_numnodes(t)); + } + + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + return (0); + + ddt_key_fill(&zdde_search.zdde_key, bp); + + zdde = avl_find(t, &zdde_search, &where); + + if (zdde == NULL) { + zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); + zdde->zdde_key = zdde_search.zdde_key; + avl_insert(t, zdde, where); + } + + zdde->zdde_ref_blocks += 1; + zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); + zdde->zdde_ref_psize += BP_GET_PSIZE(bp); + zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); + + return (0); +} + +static void +dump_simulated_ddt(spa_t *spa) +{ + avl_tree_t t; + void *cookie = NULL; + zdb_ddt_entry_t *zdde; + ddt_histogram_t ddh_total; + ddt_stat_t dds_total; + + bzero(&ddh_total, sizeof (ddh_total)); + bzero(&dds_total, sizeof (dds_total)); + avl_create(&t, ddt_entry_compare, + sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT, zdb_ddt_add_cb, &t); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { + ddt_stat_t dds; + uint64_t refcnt = zdde->zdde_ref_blocks; + ASSERT(refcnt != 0); + + dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; + dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; + dds.dds_psize = zdde->zdde_ref_psize / refcnt; + dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + + dds.dds_ref_blocks = zdde->zdde_ref_blocks; + dds.dds_ref_lsize = zdde->zdde_ref_lsize; + dds.dds_ref_psize = zdde->zdde_ref_psize; + dds.dds_ref_dsize = zdde->zdde_ref_dsize; + + ddt_stat_add(&ddh_total.ddh_stat[highbit64(refcnt) - 1], + &dds, 0); + + umem_free(zdde, sizeof (*zdde)); + } + + avl_destroy(&t); + + ddt_histogram_stat(&dds_total, &ddh_total); + + (void) printf("Simulated DDT histogram:\n"); + + zpool_dump_ddt(&dds_total, &ddh_total); + + dump_dedup_ratio(&dds_total); +} + +static int +verify_device_removal_feature_counts(spa_t *spa) +{ + uint64_t dr_feature_refcount = 0; + uint64_t oc_feature_refcount = 0; + uint64_t indirect_vdev_count = 0; + uint64_t precise_vdev_count = 0; + uint64_t obsolete_counts_object_count = 0; + uint64_t obsolete_sm_count = 0; + uint64_t obsolete_counts_count = 0; + uint64_t scip_count = 0; + uint64_t obsolete_bpobj_count = 0; + int ret = 0; + + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + if (scip->scip_next_mapping_object != 0) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[scip->scip_vdev]; + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + (void) printf("Condensing indirect vdev %llu: new mapping " + "object %llu, prev obsolete sm %llu\n", + (u_longlong_t)scip->scip_vdev, + (u_longlong_t)scip->scip_next_mapping_object, + (u_longlong_t)scip->scip_prev_obsolete_sm_object); + if (scip->scip_prev_obsolete_sm_object != 0) { + space_map_t *prev_obsolete_sm = NULL; + VERIFY0(space_map_open(&prev_obsolete_sm, + spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, + 0, vd->vdev_asize, 0)); + dump_spacemap(spa->spa_meta_objset, prev_obsolete_sm); + (void) printf("\n"); + space_map_close(prev_obsolete_sm); + } + + scip_count += 2; + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if 
(vic->vic_mapping_object != 0) { + ASSERT(vd->vdev_ops == &vdev_indirect_ops || + vd->vdev_removing); + indirect_vdev_count++; + + if (vd->vdev_indirect_mapping->vim_havecounts) { + obsolete_counts_count++; + } + } + + boolean_t are_precise; + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (are_precise) { + ASSERT(vic->vic_mapping_object != 0); + precise_vdev_count++; + } + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + ASSERT(vic->vic_mapping_object != 0); + obsolete_sm_count++; + } + } + + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_DEVICE_REMOVAL], + &dr_feature_refcount); + (void) feature_get_refcount(spa, + &spa_feature_table[SPA_FEATURE_OBSOLETE_COUNTS], + &oc_feature_refcount); + + if (dr_feature_refcount != indirect_vdev_count) { + ret = 1; + (void) printf("Number of indirect vdevs (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)indirect_vdev_count, + (u_longlong_t)dr_feature_refcount); + } else { + (void) printf("Verified device_removal feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)dr_feature_refcount); + } + + if (zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ) == 0) { + obsolete_bpobj_count++; + } + + + obsolete_counts_object_count = precise_vdev_count; + obsolete_counts_object_count += obsolete_sm_count; + obsolete_counts_object_count += obsolete_counts_count; + obsolete_counts_object_count += scip_count; + obsolete_counts_object_count += obsolete_bpobj_count; + obsolete_counts_object_count += remap_deadlist_count; + + if (oc_feature_refcount != obsolete_counts_object_count) { + ret = 1; + (void) printf("Number of obsolete counts objects (%llu) " \ + "does not match feature count (%llu)\n", + (u_longlong_t)obsolete_counts_object_count, + (u_longlong_t)oc_feature_refcount); + (void) printf("pv:%llu os:%llu oc:%llu sc:%llu " + "ob:%llu rd:%llu\n", + (u_longlong_t)precise_vdev_count, + (u_longlong_t)obsolete_sm_count, + (u_longlong_t)obsolete_counts_count, + (u_longlong_t)scip_count, + (u_longlong_t)obsolete_bpobj_count, + (u_longlong_t)remap_deadlist_count); + } else { + (void) printf("Verified indirect_refcount feature refcount " \ + "of %llu is correct\n", + (u_longlong_t)oc_feature_refcount); + } + return (ret); +} + +static void +zdb_set_skip_mmp(char *target) +{ + spa_t *spa; + + /* + * Disable the activity check to allow examination of + * active pools. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL) { + spa->spa_import_flags |= ZFS_IMPORT_SKIP_MMP; + } + mutex_exit(&spa_namespace_lock); +} + +#define BOGUS_SUFFIX "_CHECKPOINTED_UNIVERSE" +/* + * Import the checkpointed state of the pool specified by the target + * parameter as readonly. The function also accepts a pool config + * as an optional parameter, else it attempts to infer the config by + * the name of the target pool. + * + * Note that the checkpointed state's pool name will be the name of + * the original pool with the above suffix appended to it. In addition, + * if the target is not a pool name (e.g. a path to a dataset) then + * the new_path parameter is populated with the updated path to + * reflect the fact that we are looking into the checkpointed state. + * + * The function returns a newly-allocated copy of the name of the + * pool containing the checkpointed state. When this copy is no + * longer needed it should be freed with free(3C). 
Same thing
+ * applies to the new_path parameter if allocated.
+ */
+static char *
+import_checkpointed_state(char *target, nvlist_t *cfg, char **new_path)
+{
+	int error = 0;
+	char *poolname, *bogus_name = NULL;
+
+	/* If the target is not a pool, then extract the pool name */
+	char *path_start = strchr(target, '/');
+	if (path_start != NULL) {
+		size_t poolname_len = path_start - target;
+		poolname = strndup(target, poolname_len);
+	} else {
+		poolname = target;
+	}
+
+	if (cfg == NULL) {
+		zdb_set_skip_mmp(poolname);
+		error = spa_get_stats(poolname, &cfg, NULL, 0);
+		if (error != 0) {
+			fatal("Tried to read config of pool \"%s\" but "
+			    "spa_get_stats() failed with error %d\n",
+			    poolname, error);
+		}
+	}
+
+	if (asprintf(&bogus_name, "%s%s", poolname, BOGUS_SUFFIX) == -1)
+		return (NULL);
+	fnvlist_add_string(cfg, ZPOOL_CONFIG_POOL_NAME, bogus_name);
+
+	error = spa_import(bogus_name, cfg, NULL,
+	    ZFS_IMPORT_MISSING_LOG | ZFS_IMPORT_CHECKPOINT |
+	    ZFS_IMPORT_SKIP_MMP);
+	if (error != 0) {
+		fatal("Tried to import pool \"%s\" but spa_import() failed "
+		    "with error %d\n", bogus_name, error);
+	}
+
+	if (new_path != NULL && path_start != NULL) {
+		if (asprintf(new_path, "%s%s", bogus_name, path_start) == -1) {
+			if (path_start != NULL)
+				free(poolname);
+			return (NULL);
+		}
+	}
+
+	if (target != poolname)
+		free(poolname);
+
+	return (bogus_name);
+}
+
+typedef struct verify_checkpoint_sm_entry_cb_arg {
+	vdev_t *vcsec_vd;
+
+	/* the following fields are only used for printing progress */
+	uint64_t vcsec_entryid;
+	uint64_t vcsec_num_entries;
+} verify_checkpoint_sm_entry_cb_arg_t;
+
+#define	ENTRIES_PER_PROGRESS_UPDATE 10000
+
+static int
+verify_checkpoint_sm_entry_cb(space_map_entry_t *sme, void *arg)
+{
+	verify_checkpoint_sm_entry_cb_arg_t *vcsec = arg;
+	vdev_t *vd = vcsec->vcsec_vd;
+	metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift];
+	uint64_t end = sme->sme_offset + sme->sme_run;
+
+	ASSERT(sme->sme_type == SM_FREE);
+
+	if ((vcsec->vcsec_entryid % ENTRIES_PER_PROGRESS_UPDATE) == 0) {
+		(void) fprintf(stderr,
+		    "\rverifying vdev %llu, space map entry %llu of %llu ...",
+		    (longlong_t)vd->vdev_id,
+		    (longlong_t)vcsec->vcsec_entryid,
+		    (longlong_t)vcsec->vcsec_num_entries);
+	}
+	vcsec->vcsec_entryid++;
+
+	/*
+	 * See comment in checkpoint_sm_exclude_entry_cb()
+	 */
+	VERIFY3U(sme->sme_offset, >=, ms->ms_start);
+	VERIFY3U(end, <=, ms->ms_start + ms->ms_size);
+
+	/*
+	 * The entries in the vdev_checkpoint_sm should be marked as
+	 * allocated in the checkpointed state of the pool, therefore
+	 * their respective ms_allocatable trees should not contain them.
+	 */
+	mutex_enter(&ms->ms_lock);
+	range_tree_verify_not_present(ms->ms_allocatable,
+	    sme->sme_offset, sme->sme_run);
+	mutex_exit(&ms->ms_lock);
+
+	return (0);
+}
+
+/*
+ * Verify that all segments in the vdev_checkpoint_sm are allocated
+ * according to the checkpoint's ms_sm (i.e. are not in the checkpoint's
+ * ms_allocatable).
+ *
+ * Do so by comparing the checkpoint space maps (vdev_checkpoint_sm) of
+ * each vdev in the current state of the pool to the metaslab space maps
+ * (ms_sm) of the checkpointed state of the pool.
+ *
+ * Note that the function changes the state of the ms_allocatable
+ * trees of the checkpointed spa_t. The entries of these ms_allocatable
+ * trees are cleared out and then repopulated with the free
+ * entries of their respective ms_sm space maps.
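+ * (That is the SM_FREE load done by load_concrete_ms_allocatable_trees()
+ * at the top of the function.)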
+ */ +static void +verify_checkpoint_vdev_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_FREE); + + for (uint64_t c = 0; c < ckpoint_rvd->vdev_children; c++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[c]; + vdev_t *current_vd = current_rvd->vdev_child[c]; + + space_map_t *checkpoint_sm = NULL; + uint64_t checkpoint_sm_obj; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * Since we don't allow device removal in a pool + * that has a checkpoint, we expect that all removed + * vdevs were removed from the pool before the + * checkpoint. + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + /* + * If the checkpoint space map doesn't exist, then nothing + * here is checkpointed so there's nothing to verify. + */ + if (current_vd->vdev_top_zap == 0 || + zap_contains(spa_meta_objset(current), + current_vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0) + continue; + + VERIFY0(zap_lookup(spa_meta_objset(current), + current_vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (uint64_t), 1, &checkpoint_sm_obj)); + + VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(current), + checkpoint_sm_obj, 0, current_vd->vdev_asize, + current_vd->vdev_ashift)); + + verify_checkpoint_sm_entry_cb_arg_t vcsec; + vcsec.vcsec_vd = ckpoint_vd; + vcsec.vcsec_entryid = 0; + vcsec.vcsec_num_entries = + space_map_length(checkpoint_sm) / sizeof (uint64_t); + VERIFY0(space_map_iterate(checkpoint_sm, + space_map_length(checkpoint_sm), + verify_checkpoint_sm_entry_cb, &vcsec)); + if (dump_opt['m'] > 3) + dump_spacemap(current->spa_meta_objset, checkpoint_sm); + space_map_close(checkpoint_sm); + } + + /* + * If we've added vdevs since we took the checkpoint, ensure + * that their checkpoint space maps are empty. + */ + if (ckpoint_rvd->vdev_children < current_rvd->vdev_children) { + for (uint64_t c = ckpoint_rvd->vdev_children; + c < current_rvd->vdev_children; c++) { + vdev_t *current_vd = current_rvd->vdev_child[c]; + ASSERT3P(current_vd->vdev_checkpoint_sm, ==, NULL); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +/* + * Verifies that all space that's allocated in the checkpoint is + * still allocated in the current version, by checking that everything + * in checkpoint's ms_allocatable (which is actually allocated, not + * allocatable/free) is not present in current's ms_allocatable. + * + * Note that the function changes the state of the ms_allocatable + * trees of both spas when called. The entries of all ms_allocatable + * trees are cleared out and then repopulated from their respective + * ms_sm space maps. In the checkpointed state we load the allocated + * entries, and in the current state we load the free entries. 
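+ * A range that shows up in both trees is a block that was allocated
+ * when the checkpoint was taken but is free in the current state,
+ * i.e. checkpointed data that has been freed or overwritten; the
+ * walk below treats any such overlap as a verification failure.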
+ */ +static void +verify_checkpoint_ms_spacemaps(spa_t *checkpoint, spa_t *current) +{ + vdev_t *ckpoint_rvd = checkpoint->spa_root_vdev; + vdev_t *current_rvd = current->spa_root_vdev; + + load_concrete_ms_allocatable_trees(checkpoint, SM_ALLOC); + load_concrete_ms_allocatable_trees(current, SM_FREE); + + for (uint64_t i = 0; i < ckpoint_rvd->vdev_children; i++) { + vdev_t *ckpoint_vd = ckpoint_rvd->vdev_child[i]; + vdev_t *current_vd = current_rvd->vdev_child[i]; + + if (ckpoint_vd->vdev_ops == &vdev_indirect_ops) { + /* + * See comment in verify_checkpoint_vdev_spacemaps() + */ + ASSERT3P(current_vd->vdev_ops, ==, &vdev_indirect_ops); + continue; + } + + for (uint64_t m = 0; m < ckpoint_vd->vdev_ms_count; m++) { + metaslab_t *ckpoint_msp = ckpoint_vd->vdev_ms[m]; + metaslab_t *current_msp = current_vd->vdev_ms[m]; + + (void) fprintf(stderr, + "\rverifying vdev %llu of %llu, " + "metaslab %llu of %llu ...", + (longlong_t)current_vd->vdev_id, + (longlong_t)current_rvd->vdev_children, + (longlong_t)current_vd->vdev_ms[m]->ms_id, + (longlong_t)current_vd->vdev_ms_count); + + /* + * We walk through the ms_allocatable trees that + * are loaded with the allocated blocks from the + * ms_sm spacemaps of the checkpoint. For each + * one of these ranges we ensure that none of them + * exists in the ms_allocatable trees of the + * current state which are loaded with the ranges + * that are currently free. + * + * This way we ensure that none of the blocks that + * are part of the checkpoint were freed by mistake. + */ + range_tree_walk(ckpoint_msp->ms_allocatable, + (range_tree_func_t *)range_tree_verify_not_present, + current_msp->ms_allocatable); + } + } + + /* for cleaner progress output */ + (void) fprintf(stderr, "\n"); +} + +static void +verify_checkpoint_blocks(spa_t *spa) +{ + ASSERT(!dump_opt['L']); + + spa_t *checkpoint_spa; + char *checkpoint_pool; + nvlist_t *config = NULL; + int error = 0; + + /* + * We import the checkpointed state of the pool (under a different + * name) so we can do verification on it against the current state + * of the pool. + */ + checkpoint_pool = import_checkpointed_state(spa->spa_name, config, + NULL); + ASSERT(strcmp(spa->spa_name, checkpoint_pool) != 0); + + error = spa_open(checkpoint_pool, &checkpoint_spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but spa_open() failed with " + "error %d\n", checkpoint_pool, error); + } + + /* + * Ensure that ranges in the checkpoint space maps of each vdev + * are allocated according to the checkpointed state's metaslab + * space maps. + */ + verify_checkpoint_vdev_spacemaps(checkpoint_spa, spa); + + /* + * Ensure that allocated ranges in the checkpoint's metaslab + * space maps remain allocated in the metaslab space maps of + * the current state. + */ + verify_checkpoint_ms_spacemaps(checkpoint_spa, spa); + + /* + * Once we are done, we get rid of the checkpointed state. 
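+	 * For example, a pool "tank" was imported above as
+	 * "tank_CHECKPOINTED_UNIVERSE" (see BOGUS_SUFFIX), and only ever
+	 * read; closing the spa_t and freeing the name is sufficient.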
+	 */
+	spa_close(checkpoint_spa, FTAG);
+	free(checkpoint_pool);
+}
+
+static void
+dump_leftover_checkpoint_blocks(spa_t *spa)
+{
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	for (uint64_t i = 0; i < rvd->vdev_children; i++) {
+		vdev_t *vd = rvd->vdev_child[i];
+
+		space_map_t *checkpoint_sm = NULL;
+		uint64_t checkpoint_sm_obj;
+
+		if (vd->vdev_top_zap == 0)
+			continue;
+
+		if (zap_contains(spa_meta_objset(spa), vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM) != 0)
+			continue;
+
+		VERIFY0(zap_lookup(spa_meta_objset(spa), vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_POOL_CHECKPOINT_SM,
+		    sizeof (uint64_t), 1, &checkpoint_sm_obj));
+
+		VERIFY0(space_map_open(&checkpoint_sm, spa_meta_objset(spa),
+		    checkpoint_sm_obj, 0, vd->vdev_asize, vd->vdev_ashift));
+		dump_spacemap(spa->spa_meta_objset, checkpoint_sm);
+		space_map_close(checkpoint_sm);
+	}
+}
+
+static int
+verify_checkpoint(spa_t *spa)
+{
+	uberblock_t checkpoint;
+	int error;
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT))
+		return (0);
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+	if (error == ENOENT && !dump_opt['L']) {
+		/*
+		 * If the feature is active but the uberblock is missing
+		 * then we must be in the middle of discarding the
+		 * checkpoint.
+		 */
+		(void) printf("\nPartially discarded checkpoint "
+		    "state found:\n");
+		if (dump_opt['m'] > 3)
+			dump_leftover_checkpoint_blocks(spa);
+		return (0);
+	} else if (error != 0) {
+		(void) printf("lookup error %d when looking for "
+		    "checkpointed uberblock in MOS\n", error);
+		return (error);
+	}
+	dump_uberblock(&checkpoint, "\nCheckpointed uberblock found:\n", "\n");
+
+	if (checkpoint.ub_checkpoint_txg == 0) {
+		(void) printf("\nub_checkpoint_txg not set in checkpointed "
+		    "uberblock\n");
+		error = 3;
+	}
+
+	if (error == 0 && !dump_opt['L'])
+		verify_checkpoint_blocks(spa);
+
+	return (error);
+}
+
+/* ARGSUSED */
+static void
+mos_leaks_cb(void *arg, uint64_t start, uint64_t size)
+{
+	for (uint64_t i = start; i < start + size; i++) {
+		(void) printf("MOS object %llu referenced but not allocated\n",
+		    (u_longlong_t)i);
+	}
+}
+
+static void
+mos_obj_refd(uint64_t obj)
+{
+	if (obj != 0 && mos_refd_objs != NULL)
+		range_tree_add(mos_refd_objs, obj, 1);
+}
+
+/*
+ * Call on a MOS object that may already have been referenced.
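+ * (A plain mos_obj_refd() would try to range_tree_add() the same
+ * object twice, so this variant checks for membership first.)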
+ */ +static void +mos_obj_refd_multiple(uint64_t obj) +{ + if (obj != 0 && mos_refd_objs != NULL && + !range_tree_contains(mos_refd_objs, obj, 1)) + range_tree_add(mos_refd_objs, obj, 1); +} + +static void +mos_leak_vdev_top_zap(vdev_t *vd) +{ + uint64_t ms_flush_data_obj; + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (ms_flush_data_obj), 1, &ms_flush_data_obj); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(ms_flush_data_obj); +} + +static void +mos_leak_vdev(vdev_t *vd) +{ + mos_obj_refd(vd->vdev_dtl_object); + mos_obj_refd(vd->vdev_ms_array); + mos_obj_refd(vd->vdev_indirect_config.vic_births_object); + mos_obj_refd(vd->vdev_indirect_config.vic_mapping_object); + mos_obj_refd(vd->vdev_leaf_zap); + if (vd->vdev_checkpoint_sm != NULL) + mos_obj_refd(vd->vdev_checkpoint_sm->sm_object); + if (vd->vdev_indirect_mapping != NULL) { + mos_obj_refd(vd->vdev_indirect_mapping-> + vim_phys->vimp_counts_object); + } + if (vd->vdev_obsolete_sm != NULL) + mos_obj_refd(vd->vdev_obsolete_sm->sm_object); + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + mos_obj_refd(space_map_object(ms->ms_sm)); + } + + if (vd->vdev_top_zap != 0) { + mos_obj_refd(vd->vdev_top_zap); + mos_leak_vdev_top_zap(vd); + } + + for (uint64_t c = 0; c < vd->vdev_children; c++) { + mos_leak_vdev(vd->vdev_child[c]); + } +} + +static void +mos_leak_log_spacemaps(spa_t *spa) +{ + uint64_t spacemap_zap; + int error = zap_lookup(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_LOG_SPACEMAP_ZAP, + sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) + return; + ASSERT0(error); + + mos_obj_refd(spacemap_zap); + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) + mos_obj_refd(sls->sls_sm_obj); +} + +static int +dump_mos_leaks(spa_t *spa) +{ + int rv = 0; + objset_t *mos = spa->spa_meta_objset; + dsl_pool_t *dp = spa->spa_dsl_pool; + + /* Visit and mark all referenced objects in the MOS */ + + mos_obj_refd(DMU_POOL_DIRECTORY_OBJECT); + mos_obj_refd(spa->spa_pool_props_object); + mos_obj_refd(spa->spa_config_object); + mos_obj_refd(spa->spa_ddt_stat_object); + mos_obj_refd(spa->spa_feat_desc_obj); + mos_obj_refd(spa->spa_feat_enabled_txg_obj); + mos_obj_refd(spa->spa_feat_for_read_obj); + mos_obj_refd(spa->spa_feat_for_write_obj); + mos_obj_refd(spa->spa_history); + mos_obj_refd(spa->spa_errlog_last); + mos_obj_refd(spa->spa_errlog_scrub); + mos_obj_refd(spa->spa_all_vdev_zaps); + mos_obj_refd(spa->spa_dsl_pool->dp_bptree_obj); + mos_obj_refd(spa->spa_dsl_pool->dp_tmp_userrefs_obj); + mos_obj_refd(spa->spa_dsl_pool->dp_scan->scn_phys.scn_queue_obj); + bpobj_count_refd(&spa->spa_deferred_bpobj); + mos_obj_refd(dp->dp_empty_bpobj); + bpobj_count_refd(&dp->dp_obsolete_bpobj); + bpobj_count_refd(&dp->dp_free_bpobj); + mos_obj_refd(spa->spa_l2cache.sav_object); + mos_obj_refd(spa->spa_spares.sav_object); + + if (spa->spa_syncing_log_sm != NULL) + mos_obj_refd(spa->spa_syncing_log_sm->sm_object); + mos_leak_log_spacemaps(spa); + + mos_obj_refd(spa->spa_condensing_indirect_phys. + scip_next_mapping_object); + mos_obj_refd(spa->spa_condensing_indirect_phys. 
+ scip_prev_obsolete_sm_object); + if (spa->spa_condensing_indirect_phys.scip_next_mapping_object != 0) { + vdev_indirect_mapping_t *vim = + vdev_indirect_mapping_open(mos, + spa->spa_condensing_indirect_phys.scip_next_mapping_object); + mos_obj_refd(vim->vim_phys->vimp_counts_object); + vdev_indirect_mapping_close(vim); + } + deleted_livelists_dump_mos(spa); + + if (dp->dp_origin_snap != NULL) { + dsl_dataset_t *ds; + + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(dp->dp_origin_snap)->ds_next_snap_obj, + FTAG, &ds)); + count_ds_mos_objects(ds); + dump_blkptr_list(&ds->ds_deadlist, "Deadlist"); + dsl_dataset_rele(ds, FTAG); + dsl_pool_config_exit(dp, FTAG); + + count_ds_mos_objects(dp->dp_origin_snap); + dump_blkptr_list(&dp->dp_origin_snap->ds_deadlist, "Deadlist"); + } + count_dir_mos_objects(dp->dp_mos_dir); + if (dp->dp_free_dir != NULL) + count_dir_mos_objects(dp->dp_free_dir); + if (dp->dp_leak_dir != NULL) + count_dir_mos_objects(dp->dp_leak_dir); + + mos_leak_vdev(spa->spa_root_vdev); + + for (uint64_t class = 0; class < DDT_CLASSES; class++) { + for (uint64_t type = 0; type < DDT_TYPES; type++) { + for (uint64_t cksum = 0; + cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { + ddt_t *ddt = spa->spa_ddt[cksum]; + mos_obj_refd(ddt->ddt_object[type][class]); + } + } + } + + /* + * Visit all allocated objects and make sure they are referenced. + */ + uint64_t object = 0; + while (dmu_object_next(mos, &object, B_FALSE, 0) == 0) { + if (range_tree_contains(mos_refd_objs, object, 1)) { + range_tree_remove(mos_refd_objs, object, 1); + } else { + dmu_object_info_t doi; + const char *name; + dmu_object_info(mos, object, &doi); + if (doi.doi_type & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(doi.doi_type); + name = dmu_ot_byteswap[bswap].ob_name; + } else { + name = dmu_ot[doi.doi_type].ot_name; + } + + (void) printf("MOS object %llu (%s) leaked\n", + (u_longlong_t)object, name); + rv = 2; + } + } + (void) range_tree_walk(mos_refd_objs, mos_leaks_cb, NULL); + if (!range_tree_is_empty(mos_refd_objs)) + rv = 2; + range_tree_vacate(mos_refd_objs, NULL, NULL); + range_tree_destroy(mos_refd_objs); + return (rv); +} + +typedef struct log_sm_obsolete_stats_arg { + uint64_t lsos_current_txg; + + uint64_t lsos_total_entries; + uint64_t lsos_valid_entries; + + uint64_t lsos_sm_entries; + uint64_t lsos_valid_sm_entries; +} log_sm_obsolete_stats_arg_t; + +static int +log_spacemap_obsolete_stats_cb(spa_t *spa, space_map_entry_t *sme, + uint64_t txg, void *arg) +{ + log_sm_obsolete_stats_arg_t *lsos = arg; + + uint64_t offset = sme->sme_offset; + uint64_t vdev_id = sme->sme_vdev; + + if (lsos->lsos_current_txg == 0) { + /* this is the first log */ + lsos->lsos_current_txg = txg; + } else if (lsos->lsos_current_txg < txg) { + /* we just changed log - print stats and reset */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos->lsos_valid_sm_entries, + (u_longlong_t)lsos->lsos_sm_entries, + (u_longlong_t)lsos->lsos_current_txg); + lsos->lsos_valid_sm_entries = 0; + lsos->lsos_sm_entries = 0; + lsos->lsos_current_txg = txg; + } + ASSERT3U(lsos->lsos_current_txg, ==, txg); + + lsos->lsos_sm_entries++; + lsos->lsos_total_entries++; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(sme->sme_type == SM_ALLOC || sme->sme_type == SM_FREE); + + if (txg < metaslab_unflushed_txg(ms)) + return (0); + 
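+	/*
+	 * This entry is at or past the metaslab's unflushed txg, so it
+	 * still has effect; count it as valid.
+	 */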
lsos->lsos_valid_sm_entries++; + lsos->lsos_valid_entries++; + return (0); +} + +static void +dump_log_spacemap_obsolete_stats(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + log_sm_obsolete_stats_arg_t lsos; + bzero(&lsos, sizeof (lsos)); + + (void) printf("Log Space Map Obsolete Entry Statistics:\n"); + + iterate_through_spacemap_logs(spa, + log_spacemap_obsolete_stats_cb, &lsos); + + /* print stats for latest log */ + (void) printf("%-8llu valid entries out of %-8llu - txg %llu\n", + (u_longlong_t)lsos.lsos_valid_sm_entries, + (u_longlong_t)lsos.lsos_sm_entries, + (u_longlong_t)lsos.lsos_current_txg); + + (void) printf("%-8llu valid entries out of %-8llu - total\n\n", + (u_longlong_t)lsos.lsos_valid_entries, + (u_longlong_t)lsos.lsos_total_entries); +} + +static void +dump_zpool(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + int rc = 0; + + if (dump_opt['y']) { + livelist_metaslab_validate(spa); + } + + if (dump_opt['S']) { + dump_simulated_ddt(spa); + return; + } + + if (!dump_opt['e'] && dump_opt['C'] > 1) { + (void) printf("\nCached configuration:\n"); + dump_nvlist(spa->spa_config, 8); + } + + if (dump_opt['C']) + dump_config(spa); + + if (dump_opt['u']) + dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); + + if (dump_opt['D']) + dump_all_ddts(spa); + + if (dump_opt['d'] > 2 || dump_opt['m']) + dump_metaslabs(spa); + if (dump_opt['M']) + dump_metaslab_groups(spa); + if (dump_opt['d'] > 2 || dump_opt['m']) { + dump_log_spacemaps(spa); + dump_log_spacemap_obsolete_stats(spa); + } + + if (dump_opt['d'] || dump_opt['i']) { + spa_feature_t f; + mos_refd_objs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); + dump_objset(dp->dp_meta_objset); + + if (dump_opt['d'] >= 3) { + dsl_pool_t *dp = spa->spa_dsl_pool; + dump_full_bpobj(&spa->spa_deferred_bpobj, + "Deferred frees", 0); + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + dump_full_bpobj(&dp->dp_free_bpobj, + "Pool snapshot frees", 0); + } + if (bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_DEVICE_REMOVAL)); + dump_full_bpobj(&dp->dp_obsolete_bpobj, + "Pool obsolete blocks", 0); + } + + if (spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dump_bptree(spa->spa_meta_objset, + dp->dp_bptree_obj, + "Pool dataset frees"); + } + dump_dtl(spa->spa_root_vdev, 0); + } + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) + global_feature_count[f] = UINT64_MAX; + global_feature_count[SPA_FEATURE_REDACTION_BOOKMARKS] = 0; + global_feature_count[SPA_FEATURE_BOOKMARK_WRITTEN] = 0; + global_feature_count[SPA_FEATURE_LIVELIST] = 0; + + (void) dmu_objset_find(spa_name(spa), dump_one_objset, + NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + + if (rc == 0 && !dump_opt['L']) + rc = dump_mos_leaks(spa); + + for (f = 0; f < SPA_FEATURES; f++) { + uint64_t refcount; + + uint64_t *arr; + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + if (global_feature_count[f] == UINT64_MAX) + continue; + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(global_feature_count[f]); + continue; + } + arr = global_feature_count; + } else { + if (!spa_feature_is_enabled(spa, f)) { + ASSERT0(dataset_feature_count[f]); + continue; + } + arr = dataset_feature_count; + } + if (feature_get_refcount(spa, &spa_feature_table[f], + &refcount) == ENOTSUP) + continue; + if (arr[f] != refcount) { + (void) printf("%s feature refcount mismatch: " + "%lld consumers != %lld refcount\n", + spa_feature_table[f].fi_uname, + (longlong_t)arr[f], 
(longlong_t)refcount); + rc = 2; + } else { + (void) printf("Verified %s feature refcount " + "of %llu is correct\n", + spa_feature_table[f].fi_uname, + (longlong_t)refcount); + } + } + + if (rc == 0) + rc = verify_device_removal_feature_counts(spa); + } + + if (rc == 0 && (dump_opt['b'] || dump_opt['c'])) + rc = dump_block_stats(spa); + + if (rc == 0) + rc = verify_spacemap_refcounts(spa); + + if (dump_opt['s']) + show_pool_stats(spa); + + if (dump_opt['h']) + dump_history(spa); + + if (rc == 0) + rc = verify_checkpoint(spa); + + if (rc != 0) { + dump_debug_buffer(); + exit(rc); + } +} + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + +static int flagbits[256]; +static char flagbitstr[16]; + +static void +zdb_print_blkptr(const blkptr_t *bp, int flags) +{ + char blkbuf[BP_SPRINTF_LEN]; + + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); + + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("%s\n", blkbuf); +} + +static void +zdb_dump_indirect(blkptr_t *bp, int nbps, int flags) +{ + int i; + + for (i = 0; i < nbps; i++) + zdb_print_blkptr(&bp[i], flags); +} + +static void +zdb_dump_gbh(void *buf, int flags) +{ + zdb_dump_indirect((blkptr_t *)buf, SPA_GBH_NBLKPTRS, flags); +} + +static void +zdb_dump_block_raw(void *buf, uint64_t size, int flags) +{ + if (flags & ZDB_FLAG_BSWAP) + byteswap_uint64_array(buf, size); + VERIFY(write(fileno(stdout), buf, size) == size); +} + +static void +zdb_dump_block(char *label, void *buf, uint64_t size, int flags) +{ + uint64_t *d = (uint64_t *)buf; + unsigned nwords = size / sizeof (uint64_t); + int do_bswap = !!(flags & ZDB_FLAG_BSWAP); + unsigned i, j; + const char *hdr; + char *c; + + + if (do_bswap) + hdr = " 7 6 5 4 3 2 1 0 f e d c b a 9 8"; + else + hdr = " 0 1 2 3 4 5 6 7 8 9 a b c d e f"; + + (void) printf("\n%s\n%6s %s 0123456789abcdef\n", label, "", hdr); + +#ifdef _LITTLE_ENDIAN + /* correct the endianness */ + do_bswap = !do_bswap; +#endif + for (i = 0; i < nwords; i += 2) { + (void) printf("%06llx: %016llx %016llx ", + (u_longlong_t)(i * sizeof (uint64_t)), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i]) : d[i]), + (u_longlong_t)(do_bswap ? BSWAP_64(d[i + 1]) : d[i + 1])); + + c = (char *)&d[i]; + for (j = 0; j < 2 * sizeof (uint64_t); j++) + (void) printf("%c", isprint(c[j]) ? c[j] : '.'); + (void) printf("\n"); + } +} + +/* + * There are two acceptable formats: + * leaf_name - For example: c1t0d0 or /tmp/ztest.0a + * child[.child]* - For example: 0.1.1 + * + * The second form can be used to specify arbitrary vdevs anywhere + * in the hierarchy. For example, in a pool with a mirror of + * RAID-Zs, you can specify either RAID-Z vdev with 0.0 or 0.1 . + */ +static vdev_t * +zdb_vdev_lookup(vdev_t *vdev, const char *path) +{ + char *s, *p, *q; + unsigned i; + + if (vdev == NULL) + return (NULL); + + /* First, assume the x.x.x.x format */ + i = strtoul(path, &s, 10); + if (s == path || (s && *s != '.' 
&& *s != '\0'))
+		goto name;
+	if (i >= vdev->vdev_children)
+		return (NULL);
+
+	vdev = vdev->vdev_child[i];
+	if (s && *s == '\0')
+		return (vdev);
+	return (zdb_vdev_lookup(vdev, s+1));
+
+name:
+	for (i = 0; i < vdev->vdev_children; i++) {
+		vdev_t *vc = vdev->vdev_child[i];
+
+		if (vc->vdev_path == NULL) {
+			vc = zdb_vdev_lookup(vc, path);
+			if (vc == NULL)
+				continue;
+			else
+				return (vc);
+		}
+
+		p = strrchr(vc->vdev_path, '/');
+		p = p ? p + 1 : vc->vdev_path;
+		q = &vc->vdev_path[strlen(vc->vdev_path) - 2];
+
+		if (strcmp(vc->vdev_path, path) == 0)
+			return (vc);
+		if (strcmp(p, path) == 0)
+			return (vc);
+		if (strcmp(q, "s0") == 0 && strncmp(p, path, q - p) == 0)
+			return (vc);
+	}
+
+	return (NULL);
+}
+
+static int
+name_from_objset_id(spa_t *spa, uint64_t objset_id, char *outstr)
+{
+	dsl_dataset_t *ds;
+
+	dsl_pool_config_enter(spa->spa_dsl_pool, FTAG);
+	int error = dsl_dataset_hold_obj(spa->spa_dsl_pool, objset_id,
+	    NULL, &ds);
+	if (error != 0) {
+		(void) fprintf(stderr, "failed to hold objset %llu: %s\n",
+		    (u_longlong_t)objset_id, strerror(error));
+		dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+		return (error);
+	}
+	dsl_dataset_name(ds, outstr);
+	dsl_dataset_rele(ds, NULL);
+	dsl_pool_config_exit(spa->spa_dsl_pool, FTAG);
+	return (0);
+}
+
+static boolean_t
+zdb_parse_block_sizes(char *sizes, uint64_t *lsize, uint64_t *psize)
+{
+	char *s0, *s1;
+
+	if (sizes == NULL)
+		return (B_FALSE);
+
+	s0 = strtok(sizes, "/");
+	if (s0 == NULL)
+		return (B_FALSE);
+	s1 = strtok(NULL, "/");
+	*lsize = strtoull(s0, NULL, 16);
+	*psize = s1 ? strtoull(s1, NULL, 16) : *lsize;
+	return (*lsize >= *psize && *psize > 0);
+}
+
+#define	ZIO_COMPRESS_MASK(alg)	(1ULL << (ZIO_COMPRESS_##alg))
+
+static boolean_t
+zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize,
+    uint64_t psize, int flags)
+{
+	boolean_t exceeded = B_FALSE;
+	/*
+	 * We don't know how the data was compressed, so just try
+	 * every decompress function at every inflated blocksize.
+	 */
+	void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL);
+	int cfuncs[ZIO_COMPRESS_FUNCTIONS] = { 0 };
+	int *cfuncp = cfuncs;
+	uint64_t maxlsize = SPA_MAXBLOCKSIZE;
+	uint64_t mask = ZIO_COMPRESS_MASK(ON) | ZIO_COMPRESS_MASK(OFF) |
+	    ZIO_COMPRESS_MASK(INHERIT) | ZIO_COMPRESS_MASK(EMPTY) |
+	    (getenv("ZDB_NO_ZLE") ? ZIO_COMPRESS_MASK(ZLE) : 0);
+	*cfuncp++ = ZIO_COMPRESS_LZ4;
+	*cfuncp++ = ZIO_COMPRESS_LZJB;
+	mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB);
+	for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++)
+		if (((1ULL << c) & mask) == 0)
+			*cfuncp++ = c;
+
+	/*
+	 * On the one hand, with SPA_MAXBLOCKSIZE at 16MB, this
+	 * could take a while and we should let the user know
+	 * we are not stuck. On the other hand, printing progress
+	 * info gets old after a while. User can specify 'v' flag
+	 * to see the progression.
+	 */
+	if (lsize == psize)
+		lsize += SPA_MINBLOCKSIZE;
+	else
+		maxlsize = lsize;
+	for (; lsize <= maxlsize; lsize += SPA_MINBLOCKSIZE) {
+		for (cfuncp = cfuncs; *cfuncp; cfuncp++) {
+			if (flags & ZDB_FLAG_VERBOSE) {
+				(void) fprintf(stderr,
+				    "Trying %05llx -> %05llx (%s)\n",
+				    (u_longlong_t)psize,
+				    (u_longlong_t)lsize,
+				    zio_compress_table[*cfuncp].\
+				    ci_name);
+			}
+
+			/*
+			 * We randomize lbuf2, and decompress to both
+			 * lbuf and lbuf2. This way, we will know if
+			 * decompression fills exactly to lsize.
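+			 * A decompressor that claims success without
+			 * filling the whole buffer leaves the two random
+			 * tails different, so the bcmp() below rejects it.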
+ */ + VERIFY0(random_get_pseudo_bytes(lbuf2, lsize)); + + if (zio_decompress_data(*cfuncp, pabd, + lbuf, psize, lsize, NULL) == 0 && + zio_decompress_data(*cfuncp, pabd, + lbuf2, psize, lsize, NULL) == 0 && + bcmp(lbuf, lbuf2, lsize) == 0) + break; + } + if (*cfuncp != 0) + break; + } + umem_free(lbuf2, SPA_MAXBLOCKSIZE); + + if (lsize > maxlsize) { + exceeded = B_TRUE; + } + buf = lbuf; + if (*cfuncp == ZIO_COMPRESS_ZLE) { + printf("\nZLE decompression was selected. If you " + "suspect the results are wrong,\ntry avoiding ZLE " + "by setting and exporting ZDB_NO_ZLE=\"true\"\n"); + } + + return (exceeded); +} + +/* + * Read a block from a pool and print it out. The syntax of the + * block descriptor is: + * + * pool:vdev_specifier:offset:[lsize/]psize[:flags] + * + * pool - The name of the pool you wish to read from + * vdev_specifier - Which vdev (see comment for zdb_vdev_lookup) + * offset - offset, in hex, in bytes + * size - Amount of data to read, in hex, in bytes + * flags - A string of characters specifying options + * b: Decode a blkptr at given offset within block + * c: Calculate and display checksums + * d: Decompress data before dumping + * e: Byteswap data before dumping + * g: Display data as a gang block header + * i: Display as an indirect block + * r: Dump raw data to stdout + * v: Verbose + * + */ +static void +zdb_read_block(char *thing, spa_t *spa) +{ + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; + int flags = 0; + uint64_t offset = 0, psize = 0, lsize = 0, blkptr_offset = 0; + zio_t *zio; + vdev_t *vd; + abd_t *pabd; + void *lbuf, *buf; + char *s, *p, *dup, *vdev, *flagstr, *sizes; + int i, error; + boolean_t borrowed = B_FALSE, found = B_FALSE; + + dup = strdup(thing); + s = strtok(dup, ":"); + vdev = s ? s : ""; + s = strtok(NULL, ":"); + offset = strtoull(s ? s : "", NULL, 16); + sizes = strtok(NULL, ":"); + s = strtok(NULL, ":"); + flagstr = strdup(s ? 
s : ""); + + s = NULL; + if (!zdb_parse_block_sizes(sizes, &lsize, &psize)) + s = "invalid size(s)"; + if (!IS_P2ALIGNED(psize, DEV_BSIZE) || !IS_P2ALIGNED(lsize, DEV_BSIZE)) + s = "size must be a multiple of sector size"; + if (!IS_P2ALIGNED(offset, DEV_BSIZE)) + s = "offset must be a multiple of sector size"; + if (s) { + (void) printf("Invalid block specifier: %s - %s\n", thing, s); + goto done; + } + + for (s = strtok(flagstr, ":"); s; s = strtok(NULL, ":")) { + for (i = 0; i < strlen(flagstr); i++) { + int bit = flagbits[(uchar_t)flagstr[i]]; + + if (bit == 0) { + (void) printf("***Ignoring flag: %c\n", + (uchar_t)flagstr[i]); + continue; + } + found = B_TRUE; + flags |= bit; + + p = &flagstr[i + 1]; + if (*p != ':' && *p != '\0') { + int j = 0, nextbit = flagbits[(uchar_t)*p]; + char *end, offstr[8] = { 0 }; + if ((bit == ZDB_FLAG_PRINT_BLKPTR) && + (nextbit == 0)) { + /* look ahead to isolate the offset */ + while (nextbit == 0 && + strchr(flagbitstr, *p) == NULL) { + offstr[j] = *p; + j++; + if (i + j > strlen(flagstr)) + break; + p++; + nextbit = flagbits[(uchar_t)*p]; + } + blkptr_offset = strtoull(offstr, &end, + 16); + i += j; + } else if (nextbit == 0) { + (void) printf("***Ignoring flag arg:" + " '%c'\n", (uchar_t)*p); + } + } + } + } + if (blkptr_offset % sizeof (blkptr_t)) { + printf("Block pointer offset 0x%llx " + "must be divisible by 0x%x\n", + (longlong_t)blkptr_offset, (int)sizeof (blkptr_t)); + goto done; + } + if (found == B_FALSE && strlen(flagstr) > 0) { + printf("Invalid flag arg: '%s'\n", flagstr); + goto done; + } + + vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); + if (vd == NULL) { + (void) printf("***Invalid vdev: %s\n", vdev); + free(dup); + return; + } else { + if (vd->vdev_path) + (void) fprintf(stderr, "Found vdev: %s\n", + vd->vdev_path); + else + (void) fprintf(stderr, "Found vdev type: %s\n", + vd->vdev_ops->vdev_op_type); + } + + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + zio = zio_root(spa, NULL, NULL, 0); + + if (vd == vd->vdev_top) { + /* + * Treat this as a normal block read. + */ + zio_nowait(zio_read(zio, spa, bp, pabd, psize, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); + } else { + /* + * Treat this as a vdev child I/O. 
+ */ + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pabd, + psize, ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + + error = zio_wait(zio); + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error) { + (void) printf("Read of %s failed, error: %d\n", thing, error); + goto out; + } + + uint64_t orig_lsize = lsize; + buf = lbuf; + if (flags & ZDB_FLAG_DECOMPRESS) { + boolean_t failed = zdb_decompress_block(pabd, buf, lbuf, + lsize, psize, flags); + if (failed) { + (void) printf("Decompress of %s failed\n", thing); + goto out; + } + } else { + buf = abd_borrow_buf_copy(pabd, lsize); + borrowed = B_TRUE; + } + /* + * Try to detect invalid block pointer. If invalid, try + * decompressing. + */ + if ((flags & ZDB_FLAG_PRINT_BLKPTR || flags & ZDB_FLAG_INDIRECT) && + !(flags & ZDB_FLAG_DECOMPRESS)) { + const blkptr_t *b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (zfs_blkptr_verify(spa, b, B_FALSE, BLK_VERIFY_ONLY) == + B_FALSE) { + abd_return_buf_copy(pabd, buf, lsize); + borrowed = B_FALSE; + buf = lbuf; + boolean_t failed = zdb_decompress_block(pabd, buf, + lbuf, lsize, psize, flags); + b = (const blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset); + if (failed || zfs_blkptr_verify(spa, b, B_FALSE, + BLK_VERIFY_LOG) == B_FALSE) { + printf("invalid block pointer at this DVA\n"); + goto out; + } + } + } + + if (flags & ZDB_FLAG_PRINT_BLKPTR) + zdb_print_blkptr((blkptr_t *)(void *) + ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); + else if (flags & ZDB_FLAG_RAW) + zdb_dump_block_raw(buf, lsize, flags); + else if (flags & ZDB_FLAG_INDIRECT) + zdb_dump_indirect((blkptr_t *)buf, + orig_lsize / sizeof (blkptr_t), flags); + else if (flags & ZDB_FLAG_GBH) + zdb_dump_gbh(buf, flags); + else + zdb_dump_block(thing, buf, lsize, flags); + + /* + * If :c was specified, iterate through the checksum table to + * calculate and display each checksum for our specified + * DVA and length. 
+ */ + if ((flags & ZDB_FLAG_CHECKSUM) && !(flags & ZDB_FLAG_RAW) && + !(flags & ZDB_FLAG_GBH)) { + zio_t *czio; + (void) printf("\n"); + for (enum zio_checksum ck = ZIO_CHECKSUM_LABEL; + ck < ZIO_CHECKSUM_FUNCTIONS; ck++) { + + if ((zio_checksum_table[ck].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED) || + ck == ZIO_CHECKSUM_NOPARITY) { + continue; + } + BP_SET_CHECKSUM(bp, ck); + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + czio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + czio->io_bp = bp; + + if (vd == vd->vdev_top) { + zio_nowait(zio_read(czio, spa, bp, pabd, psize, + NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_DONT_RETRY, NULL)); + } else { + zio_nowait(zio_vdev_child_io(czio, bp, vd, + offset, pabd, psize, ZIO_TYPE_READ, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW | + ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + error = zio_wait(czio); + if (error == 0 || error == ECKSUM) { + zio_t *ck_zio = zio_root(spa, NULL, NULL, 0); + ck_zio->io_offset = + DVA_GET_OFFSET(&bp->blk_dva[0]); + ck_zio->io_bp = bp; + zio_checksum_compute(ck_zio, ck, pabd, lsize); + printf("%12s\tcksum=%llx:%llx:%llx:%llx\n", + zio_checksum_table[ck].ci_name, + (u_longlong_t)bp->blk_cksum.zc_word[0], + (u_longlong_t)bp->blk_cksum.zc_word[1], + (u_longlong_t)bp->blk_cksum.zc_word[2], + (u_longlong_t)bp->blk_cksum.zc_word[3]); + zio_wait(ck_zio); + } else { + printf("error %d reading block\n", error); + } + spa_config_exit(spa, SCL_STATE, FTAG); + } + } + + if (borrowed) + abd_return_buf_copy(pabd, buf, lsize); + +out: + abd_free(pabd); + umem_free(lbuf, SPA_MAXBLOCKSIZE); +done: + free(flagstr); + free(dup); +} + +static void +zdb_embedded_block(char *thing) +{ + blkptr_t bp; + unsigned long long *words = (void *)&bp; + char *buf; + int err; + + bzero(&bp, sizeof (bp)); + err = sscanf(thing, "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx:" + "%llx:%llx:%llx:%llx:%llx:%llx:%llx:%llx", + words + 0, words + 1, words + 2, words + 3, + words + 4, words + 5, words + 6, words + 7, + words + 8, words + 9, words + 10, words + 11, + words + 12, words + 13, words + 14, words + 15); + if (err != 16) { + (void) fprintf(stderr, "invalid input format\n"); + exit(1); + } + ASSERT3U(BPE_GET_LSIZE(&bp), <=, SPA_MAXBLOCKSIZE); + buf = malloc(SPA_MAXBLOCKSIZE); + if (buf == NULL) { + (void) fprintf(stderr, "out of memory\n"); + exit(1); + } + err = decode_embedded_bp(&bp, buf, BPE_GET_LSIZE(&bp)); + if (err != 0) { + (void) fprintf(stderr, "decode failed: %u\n", err); + exit(1); + } + zdb_dump_block_raw(buf, BPE_GET_LSIZE(&bp), 0); + free(buf); +} + +int +main(int argc, char **argv) +{ + int c; + struct rlimit rl = { 1024, 1024 }; + spa_t *spa = NULL; + objset_t *os = NULL; + int dump_all = 1; + int verbose = 0; + int error = 0; + char **searchdirs = NULL; + int nsearch = 0; + char *target, *target_pool, dsname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *policy = NULL; + uint64_t max_txg = UINT64_MAX; + int64_t objset_id = -1; + int flags = ZFS_IMPORT_MISSING_LOG; + int rewind = ZPOOL_NEVER_REWIND; + char *spa_config_path_env, *objset_str; + boolean_t target_is_spa = B_TRUE, dataset_lookup = B_FALSE; + nvlist_t *cfg = NULL; + + (void) setrlimit(RLIMIT_NOFILE, &rl); + (void) enable_extended_FILE_stdio(-1, -1); + + dprintf_setup(&argc, argv); + + /* + * If there is an environment variable SPA_CONFIG_PATH it overrides + * default spa_config_path setting. 
If -U flag is specified it will + * override this environment variable settings once again. + */ + spa_config_path_env = getenv("SPA_CONFIG_PATH"); + if (spa_config_path_env != NULL) + spa_config_path = spa_config_path_env; + + /* + * For performance reasons, we set this tunable down. We do so before + * the arg parsing section so that the user can override this value if + * they choose. + */ + zfs_btree_verify_intensity = 3; + + while ((c = getopt(argc, argv, + "AbcCdDeEFGhiI:klLmMo:Op:PqRsSt:uU:vVx:XYyZ")) != -1) { + switch (c) { + case 'b': + case 'c': + case 'C': + case 'd': + case 'D': + case 'E': + case 'G': + case 'h': + case 'i': + case 'l': + case 'm': + case 'M': + case 'O': + case 'R': + case 's': + case 'S': + case 'u': + case 'y': + case 'Z': + dump_opt[c]++; + dump_all = 0; + break; + case 'A': + case 'e': + case 'F': + case 'k': + case 'L': + case 'P': + case 'q': + case 'X': + dump_opt[c]++; + break; + case 'Y': + zfs_reconstruct_indirect_combinations_max = INT_MAX; + zfs_deadman_enabled = 0; + break; + /* NB: Sort single match options below. */ + case 'I': + max_inflight_bytes = strtoull(optarg, NULL, 0); + if (max_inflight_bytes == 0) { + (void) fprintf(stderr, "maximum number " + "of inflight bytes must be greater " + "than 0\n"); + usage(); + } + break; + case 'o': + error = set_global_var(optarg); + if (error != 0) + usage(); + break; + case 'p': + if (searchdirs == NULL) { + searchdirs = umem_alloc(sizeof (char *), + UMEM_NOFAIL); + } else { + char **tmp = umem_alloc((nsearch + 1) * + sizeof (char *), UMEM_NOFAIL); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + umem_free(searchdirs, + nsearch * sizeof (char *)); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; + break; + case 't': + max_txg = strtoull(optarg, NULL, 0); + if (max_txg < TXG_INITIAL) { + (void) fprintf(stderr, "incorrect txg " + "specified: %s\n", optarg); + usage(); + } + break; + case 'U': + spa_config_path = optarg; + if (spa_config_path[0] != '/') { + (void) fprintf(stderr, + "cachefile must be an absolute path " + "(i.e. start with a slash)\n"); + usage(); + } + break; + case 'v': + verbose++; + break; + case 'V': + flags = ZFS_IMPORT_VERBATIM; + break; + case 'x': + vn_dumpdir = optarg; + break; + default: + usage(); + break; + } + } + + if (!dump_opt['e'] && searchdirs != NULL) { + (void) fprintf(stderr, "-p option requires use of -e\n"); + usage(); + } + if (dump_opt['d']) { + /* [/ is accepted */ + if (argv[2] && (objset_str = strchr(argv[2], '/')) != NULL && + objset_str++ != NULL) { + char *endptr; + errno = 0; + objset_id = strtoull(objset_str, &endptr, 0); + /* dataset 0 is the same as opening the pool */ + if (errno == 0 && endptr != objset_str && + objset_id != 0) { + target_is_spa = B_FALSE; + dataset_lookup = B_TRUE; + } else if (objset_id != 0) { + printf("failed to open objset %s " + "%llu %s", objset_str, + (u_longlong_t)objset_id, + strerror(errno)); + exit(1); + } + /* normal dataset name not an objset ID */ + if (endptr == objset_str) { + objset_id = -1; + } + } + } + +#if defined(_LP64) + /* + * ZDB does not typically re-read blocks; therefore limit the ARC + * to 256 MB, which can be used entirely for metadata. + */ + zfs_arc_min = zfs_arc_meta_min = 2ULL << SPA_MAXBLOCKSHIFT; + zfs_arc_max = zfs_arc_meta_limit = 256 * 1024 * 1024; +#endif + + /* + * "zdb -c" uses checksum-verifying scrub i/os which are async reads. + * "zdb -b" uses traversal prefetch which uses async reads. + * For good performance, let several of them be active at once. 
+ */ + zfs_vdev_async_read_max_active = 10; + + /* + * Disable reference tracking for better performance. + */ + reference_tracking_enable = B_FALSE; + + /* + * Do not fail spa_load when spa_load_verify fails. This is needed + * to load non-idle pools. + */ + spa_load_verify_dryrun = B_TRUE; + + kernel_init(SPA_MODE_READ); + + if (dump_all) + verbose = MAX(verbose, 1); + + for (c = 0; c < 256; c++) { + if (dump_all && strchr("AeEFklLOPRSXy", c) == NULL) + dump_opt[c] = 1; + if (dump_opt[c]) + dump_opt[c] += verbose; + } + + aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); + zfs_recover = (dump_opt['A'] > 1); + + argc -= optind; + argv += optind; + if (argc < 2 && dump_opt['R']) + usage(); + + if (dump_opt['E']) { + if (argc != 1) + usage(); + zdb_embedded_block(argv[0]); + return (0); + } + + if (argc < 1) { + if (!dump_opt['e'] && dump_opt['C']) { + dump_cachefile(spa_config_path); + return (0); + } + usage(); + } + + if (dump_opt['l']) + return (dump_label(argv[0])); + + if (dump_opt['O']) { + if (argc != 2) + usage(); + dump_opt['v'] = verbose + 3; + return (dump_path(argv[0], argv[1])); + } + + if (dump_opt['X'] || dump_opt['F']) + rewind = ZPOOL_DO_REWIND | + (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); + + if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, max_txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, rewind) != 0) + fatal("internal error: %s", strerror(ENOMEM)); + + error = 0; + target = argv[0]; + + if (strpbrk(target, "/@") != NULL) { + size_t targetlen; + + target_pool = strdup(target); + *strpbrk(target_pool, "/@") = '\0'; + + target_is_spa = B_FALSE; + targetlen = strlen(target); + if (targetlen && target[targetlen - 1] == '/') + target[targetlen - 1] = '\0'; + } else { + target_pool = target; + } + + if (dump_opt['e']) { + importargs_t args = { 0 }; + + args.paths = nsearch; + args.path = searchdirs; + args.can_be_active = B_TRUE; + + error = zpool_find_config(NULL, target_pool, &cfg, &args, + &libzpool_config_ops); + + if (error == 0) { + + if (nvlist_add_nvlist(cfg, + ZPOOL_LOAD_POLICY, policy) != 0) { + fatal("can't open '%s': %s", + target, strerror(ENOMEM)); + } + + if (dump_opt['C'] > 1) { + (void) printf("\nConfiguration for import:\n"); + dump_nvlist(cfg, 8); + } + + /* + * Disable the activity check to allow examination of + * active pools. + */ + error = spa_import(target_pool, cfg, NULL, + flags | ZFS_IMPORT_SKIP_MMP); + } + } + + /* + * import_checkpointed_state makes the assumption that the + * target pool that we pass it is already part of the spa + * namespace. Because of that we need to make sure to call + * it always after the -e option has been processed, which + * imports the pool to the namespace if it's not in the + * cachefile. 
+ */ + char *checkpoint_pool = NULL; + char *checkpoint_target = NULL; + if (dump_opt['k']) { + checkpoint_pool = import_checkpointed_state(target, cfg, + &checkpoint_target); + + if (checkpoint_target != NULL) + target = checkpoint_target; + } + + if (target_pool != target) + free(target_pool); + + if (error == 0) { + if (dump_opt['k'] && (target_is_spa || dump_opt['R'])) { + ASSERT(checkpoint_pool != NULL); + ASSERT(checkpoint_target == NULL); + + error = spa_open(checkpoint_pool, &spa, FTAG); + if (error != 0) { + fatal("Tried to open pool \"%s\" but " + "spa_open() failed with error %d\n", + checkpoint_pool, error); + } + + } else if (target_is_spa || dump_opt['R'] || objset_id == 0) { + zdb_set_skip_mmp(target); + error = spa_open_rewind(target, &spa, FTAG, policy, + NULL); + if (error) { + /* + * If we're missing the log device then + * try opening the pool after clearing the + * log state. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL && + spa->spa_log_state == SPA_LOG_MISSING) { + spa->spa_log_state = SPA_LOG_CLEAR; + error = 0; + } + mutex_exit(&spa_namespace_lock); + + if (!error) { + error = spa_open_rewind(target, &spa, + FTAG, policy, NULL); + } + } + } else if (strpbrk(target, "#") != NULL) { + dsl_pool_t *dp; + error = dsl_pool_hold(target, FTAG, &dp); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + error = dump_bookmark(dp, target, B_TRUE, verbose > 1); + dsl_pool_rele(dp, FTAG); + if (error != 0) { + fatal("can't dump '%s': %s", target, + strerror(error)); + } + return (error); + } else { + zdb_set_skip_mmp(target); + if (dataset_lookup == B_TRUE) { + /* + * Use the supplied id to get the name + * for open_objset. + */ + error = spa_open(target, &spa, FTAG); + if (error == 0) { + error = name_from_objset_id(spa, + objset_id, dsname); + spa_close(spa, FTAG); + if (error == 0) + target = dsname; + } + } + if (error == 0) + error = open_objset(target, FTAG, &os); + if (error == 0) + spa = dmu_objset_spa(os); + } + } + nvlist_free(policy); + + if (error) + fatal("can't open '%s': %s", target, strerror(error)); + + /* + * Set the pool failure mode to panic in order to prevent the pool + * from suspending. A suspended I/O will have no way to resume and + * can prevent the zdb(8) command from terminating as expected. + */ + if (spa != NULL) + spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; + + argv++; + argc--; + if (!dump_opt['R']) { + flagbits['d'] = ZOR_FLAG_DIRECTORY; + flagbits['f'] = ZOR_FLAG_PLAIN_FILE; + flagbits['m'] = ZOR_FLAG_SPACE_MAP; + flagbits['z'] = ZOR_FLAG_ZAP; + flagbits['A'] = ZOR_FLAG_ALL_TYPES; + + if (argc > 0 && dump_opt['d']) { + zopt_object_args = argc; + zopt_object_ranges = calloc(zopt_object_args, + sizeof (zopt_object_range_t)); + for (unsigned i = 0; i < zopt_object_args; i++) { + int err; + char *msg = NULL; + + err = parse_object_range(argv[i], + &zopt_object_ranges[i], &msg); + if (err != 0) + fatal("Bad object or range: '%s': %s\n", + argv[i], msg ? 
msg : ""); + } + } else if (argc > 0 && dump_opt['m']) { + zopt_metaslab_args = argc; + zopt_metaslab = calloc(zopt_metaslab_args, + sizeof (uint64_t)); + for (unsigned i = 0; i < zopt_metaslab_args; i++) { + errno = 0; + zopt_metaslab[i] = strtoull(argv[i], NULL, 0); + if (zopt_metaslab[i] == 0 && errno != 0) + fatal("bad number %s: %s", argv[i], + strerror(errno)); + } + } + if (os != NULL) { + dump_objset(os); + } else if (zopt_object_args > 0 && !dump_opt['m']) { + dump_objset(spa->spa_meta_objset); + } else { + dump_zpool(spa); + } + } else { + flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; + flagbits['c'] = ZDB_FLAG_CHECKSUM; + flagbits['d'] = ZDB_FLAG_DECOMPRESS; + flagbits['e'] = ZDB_FLAG_BSWAP; + flagbits['g'] = ZDB_FLAG_GBH; + flagbits['i'] = ZDB_FLAG_INDIRECT; + flagbits['r'] = ZDB_FLAG_RAW; + flagbits['v'] = ZDB_FLAG_VERBOSE; + + for (int i = 0; i < argc; i++) + zdb_read_block(argv[i], spa); + } + + if (dump_opt['k']) { + free(checkpoint_pool); + if (!target_is_spa) + free(checkpoint_target); + } + + if (os != NULL) { + close_objset(os, FTAG); + } else { + spa_close(spa, FTAG); + } + + fuid_table_destroy(); + + dump_debug_buffer(); + + kernel_fini(); + + return (error); +} diff --git a/cmd/zdb/zdb.h b/cmd/zdb/zdb.h new file mode 100644 index 000000000000..49579811efbb --- /dev/null +++ b/cmd/zdb/zdb.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2017 Spectra Logic Corp Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _ZDB_H +#define _ZDB_H + +void dump_intent_log(zilog_t *); +extern uint8_t dump_opt[256]; + +#endif /* _ZDB_H */ diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c new file mode 100644 index 000000000000..c12178effae0 --- /dev/null +++ b/cmd/zdb/zdb_il.c @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2012 Cyril Plisko. All rights reserved. 
+ * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +/* + * Print intent log header and statistics. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zdb.h" + +extern uint8_t dump_opt[256]; + +static char tab_prefix[4] = "\t\t\t"; + +static void +print_log_bp(const blkptr_t *bp, const char *prefix) +{ + char blkbuf[BP_SPRINTF_LEN]; + + snprintf_blkptr(blkbuf, sizeof (blkbuf), bp); + (void) printf("%s%s\n", prefix, blkbuf); +} + +/* ARGSUSED */ +static void +zil_prt_rec_create(zilog_t *zilog, int txtype, void *arg) +{ + lr_create_t *lr = arg; + time_t crtime = lr->lr_crtime[0]; + char *name, *link; + lr_attr_t *lrattr; + + name = (char *)(lr + 1); + + if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || + lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { + lrattr = (lr_attr_t *)(lr + 1); + name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + if (txtype == TX_SYMLINK) { + link = name + strlen(name) + 1; + (void) printf("%s%s -> %s\n", tab_prefix, name, link); + } else if (txtype != TX_MKXATTR) { + (void) printf("%s%s\n", tab_prefix, name); + } + + (void) printf("%s%s", tab_prefix, ctime(&crtime)); + (void) printf("%sdoid %llu, foid %llu, slots %llu, mode %llo\n", + tab_prefix, (u_longlong_t)lr->lr_doid, + (u_longlong_t)LR_FOID_GET_OBJ(lr->lr_foid), + (u_longlong_t)LR_FOID_GET_SLOTS(lr->lr_foid), + (longlong_t)lr->lr_mode); + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", + tab_prefix, + (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, + (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); +} + +/* ARGSUSED */ +static void +zil_prt_rec_remove(zilog_t *zilog, int txtype, void *arg) +{ + lr_remove_t *lr = arg; + + (void) printf("%sdoid %llu, name %s\n", tab_prefix, + (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); +} + +/* ARGSUSED */ +static void +zil_prt_rec_link(zilog_t *zilog, int txtype, void *arg) +{ + lr_link_t *lr = arg; + + (void) printf("%sdoid %llu, link_obj %llu, name %s\n", tab_prefix, + (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, + (char *)(lr + 1)); +} + +/* ARGSUSED */ +static void +zil_prt_rec_rename(zilog_t *zilog, int txtype, void *arg) +{ + lr_rename_t *lr = arg; + char *snm = (char *)(lr + 1); + char *tnm = snm + strlen(snm) + 1; + + (void) printf("%ssdoid %llu, tdoid %llu\n", tab_prefix, + (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); + (void) printf("%ssrc %s tgt %s\n", tab_prefix, snm, tnm); +} + +/* ARGSUSED */ +static int +zil_prt_rec_write_cb(void *data, size_t len, void *unused) +{ + char *cdata = data; + + for (size_t i = 0; i < len; i++) { + if (isprint(*cdata)) + (void) printf("%c ", *cdata); + else + (void) printf("%2X", *cdata); + cdata++; + } + return (0); +} + +/* ARGSUSED */ +static void +zil_prt_rec_write(zilog_t *zilog, int txtype, void *arg) +{ + lr_write_t *lr = arg; + abd_t *data; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int error; + + (void) printf("%sfoid %llu, offset %llx, length %llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); + + if (txtype == TX_WRITE2 || verbose < 5) + return; + + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + (void) printf("%shas blkptr, %s\n", tab_prefix, + !BP_IS_HOLE(bp) && + bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? 
+ "will claim" : "won't claim"); + print_log_bp(bp, tab_prefix); + + if (BP_IS_HOLE(bp)) { + (void) printf("\t\t\tLSIZE 0x%llx\n", + (u_longlong_t)BP_GET_LSIZE(bp)); + (void) printf("%s\n", tab_prefix); + return; + } + if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { + (void) printf("%s\n", + tab_prefix); + return; + } + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + data = abd_alloc(BP_GET_LSIZE(bp), B_FALSE); + error = zio_wait(zio_read(NULL, zilog->zl_spa, + bp, data, BP_GET_LSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + if (error) + goto out; + } else { + /* data is stored after the end of the lr_write record */ + data = abd_alloc(lr->lr_length, B_FALSE); + abd_copy_from_buf(data, lr + 1, lr->lr_length); + } + + (void) printf("%s", tab_prefix); + (void) abd_iterate_func(data, + 0, MIN(lr->lr_length, (verbose < 6 ? 20 : SPA_MAXBLOCKSIZE)), + zil_prt_rec_write_cb, NULL); + (void) printf("\n"); + +out: + abd_free(data); +} + +/* ARGSUSED */ +static void +zil_prt_rec_truncate(zilog_t *zilog, int txtype, void *arg) +{ + lr_truncate_t *lr = arg; + + (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); +} + +/* ARGSUSED */ +static void +zil_prt_rec_setattr(zilog_t *zilog, int txtype, void *arg) +{ + lr_setattr_t *lr = arg; + time_t atime = (time_t)lr->lr_atime[0]; + time_t mtime = (time_t)lr->lr_mtime[0]; + + (void) printf("%sfoid %llu, mask 0x%llx\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); + + if (lr->lr_mask & AT_MODE) { + (void) printf("%sAT_MODE %llo\n", tab_prefix, + (longlong_t)lr->lr_mode); + } + + if (lr->lr_mask & AT_UID) { + (void) printf("%sAT_UID %llu\n", tab_prefix, + (u_longlong_t)lr->lr_uid); + } + + if (lr->lr_mask & AT_GID) { + (void) printf("%sAT_GID %llu\n", tab_prefix, + (u_longlong_t)lr->lr_gid); + } + + if (lr->lr_mask & AT_SIZE) { + (void) printf("%sAT_SIZE %llu\n", tab_prefix, + (u_longlong_t)lr->lr_size); + } + + if (lr->lr_mask & AT_ATIME) { + (void) printf("%sAT_ATIME %llu.%09llu %s", tab_prefix, + (u_longlong_t)lr->lr_atime[0], + (u_longlong_t)lr->lr_atime[1], + ctime(&atime)); + } + + if (lr->lr_mask & AT_MTIME) { + (void) printf("%sAT_MTIME %llu.%09llu %s", tab_prefix, + (u_longlong_t)lr->lr_mtime[0], + (u_longlong_t)lr->lr_mtime[1], + ctime(&mtime)); + } +} + +/* ARGSUSED */ +static void +zil_prt_rec_acl(zilog_t *zilog, int txtype, void *arg) +{ + lr_acl_t *lr = arg; + + (void) printf("%sfoid %llu, aclcnt %llu\n", tab_prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); +} + +typedef void (*zil_prt_rec_func_t)(zilog_t *, int, void *); +typedef struct zil_rec_info { + zil_prt_rec_func_t zri_print; + const char *zri_name; + uint64_t zri_count; +} zil_rec_info_t; + +static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { + {.zri_print = NULL, .zri_name = "Total "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKXATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_SYMLINK "}, + {.zri_print = zil_prt_rec_remove, .zri_name = "TX_REMOVE "}, + {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, + {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, + {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, + {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, 
+ {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, + {.zri_print = zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, + {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, + {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_CREATE_ACL_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ATTR "}, + {.zri_print = zil_prt_rec_create, .zri_name = "TX_MKDIR_ACL_ATTR "}, + {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE2 "}, +}; + +/* ARGSUSED */ +static int +print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) +{ + int txtype; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + /* reduce size of txtype to strip off TX_CI bit */ + txtype = lr->lrc_txtype; + + ASSERT(txtype != 0 && (uint_t)txtype < TX_MAX_TYPE); + ASSERT(lr->lrc_txg); + + (void) printf("\t\t%s%s len %6llu, txg %llu, seq %llu\n", + (lr->lrc_txtype & TX_CI) ? "CI-" : "", + zil_rec_info[txtype].zri_name, + (u_longlong_t)lr->lrc_reclen, + (u_longlong_t)lr->lrc_txg, + (u_longlong_t)lr->lrc_seq); + + if (txtype && verbose >= 3) { + if (!zilog->zl_os->os_encrypted) { + zil_rec_info[txtype].zri_print(zilog, txtype, lr); + } else { + (void) printf("%s(encrypted)\n", tab_prefix); + } + } + + zil_rec_info[txtype].zri_count++; + zil_rec_info[0].zri_count++; + + return (0); +} + +/* ARGSUSED */ +static int +print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + char blkbuf[BP_SPRINTF_LEN + 10]; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + const char *claim; + + if (verbose <= 3) + return (0); + + if (verbose >= 5) { + (void) strcpy(blkbuf, ", "); + snprintf_blkptr(blkbuf + strlen(blkbuf), + sizeof (blkbuf) - strlen(blkbuf), bp); + } else { + blkbuf[0] = '\0'; + } + + if (claim_txg != 0) + claim = "already claimed"; + else if (bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa)) + claim = "will claim"; + else + claim = "won't claim"; + + (void) printf("\tBlock seqno %llu, %s%s\n", + (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); + + return (0); +} + +static void +print_log_stats(int verbose) +{ + unsigned i, w, p10; + + if (verbose > 3) + (void) printf("\n"); + + if (zil_rec_info[0].zri_count == 0) + return; + + for (w = 1, p10 = 10; zil_rec_info[0].zri_count >= p10; p10 *= 10) + w++; + + for (i = 0; i < TX_MAX_TYPE; i++) + if (zil_rec_info[i].zri_count || verbose >= 3) + (void) printf("\t\t%s %*llu\n", + zil_rec_info[i].zri_name, w, + (u_longlong_t)zil_rec_info[i].zri_count); + (void) printf("\n"); +} + +/* ARGSUSED */ +void +dump_intent_log(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + int i; + + if (BP_IS_HOLE(&zh->zh_log) || verbose < 1) + return; + + (void) printf("\n ZIL header: claim_txg %llu, " + "claim_blk_seq %llu, claim_lr_seq %llu", + (u_longlong_t)zh->zh_claim_txg, + (u_longlong_t)zh->zh_claim_blk_seq, + (u_longlong_t)zh->zh_claim_lr_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); + + for (i = 0; i < TX_MAX_TYPE; i++) + zil_rec_info[i].zri_count = 0; + + /* see comment in zil_claim() or zil_check_log_chain() */ + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return; + + if 
(verbose >= 2) {
+		(void) printf("\n");
+		(void) zil_parse(zilog, print_log_block, print_log_record, NULL,
+		    zh->zh_claim_txg, B_FALSE);
+		print_log_stats(verbose);
+	}
+}
diff --git a/cmd/zed/.gitignore b/cmd/zed/.gitignore
new file mode 100644
index 000000000000..76557bb6bb3a
--- /dev/null
+++ b/cmd/zed/.gitignore
@@ -0,0 +1 @@
+/zed
diff --git a/cmd/zed/Makefile.am b/cmd/zed/Makefile.am
new file mode 100644
index 000000000000..4bd8ac4a53e6
--- /dev/null
+++ b/cmd/zed/Makefile.am
@@ -0,0 +1,49 @@
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBUDEV_CFLAGS) $(LIBUUID_CFLAGS)
+
+SUBDIRS = zed.d
+
+sbin_PROGRAMS = zed
+
+ZED_SRC = \
+	zed.c \
+	zed.h \
+	zed_conf.c \
+	zed_conf.h \
+	zed_disk_event.c \
+	zed_disk_event.h \
+	zed_event.c \
+	zed_event.h \
+	zed_exec.c \
+	zed_exec.h \
+	zed_file.c \
+	zed_file.h \
+	zed_log.c \
+	zed_log.h \
+	zed_strings.c \
+	zed_strings.h
+
+FMA_SRC = \
+	agents/zfs_agents.c \
+	agents/zfs_agents.h \
+	agents/zfs_diagnosis.c \
+	agents/zfs_mod.c \
+	agents/zfs_retire.c \
+	agents/fmd_api.c \
+	agents/fmd_api.h \
+	agents/fmd_serd.c \
+	agents/fmd_serd.h
+
+zed_SOURCES = $(ZED_SRC) $(FMA_SRC)
+
+zed_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+	$(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zed_LDADD += -lrt $(LIBUDEV_LIBS) $(LIBUUID_LIBS)
+zed_LDFLAGS = -pthread
+
+EXTRA_DIST = agents/README.md
diff --git a/cmd/zed/agents/README.md b/cmd/zed/agents/README.md
new file mode 100644
index 000000000000..e35b97668a9d
--- /dev/null
+++ b/cmd/zed/agents/README.md
@@ -0,0 +1,112 @@
+## Fault Management Logic for ZED ##
+
+The integration of Fault Management Daemon (FMD) logic from illumos
+is being deployed in three phases. This logic is encapsulated in
+several software modules inside ZED.
+
+### ZED+FM Phase 1 ###
+
+All the phase 1 work is in the current master branch. Phase 1 work includes:
+
+* Add new paths to the persistent VDEV label for device matching.
+* Add a disk monitor for generating _disk-add_ and _disk-change_ events.
+* Add support for automated VDEV auto-online, auto-replace and auto-expand.
+* Expand the statechange event to include all VDEV state transitions.
+
+### ZED+FM Phase 2 (WIP) ###
+
+The phase 2 work primarily entails the _Diagnosis Engine_ and the
+_Retire Agent_ modules. It also includes infrastructure to support a
+crude FMD environment to host these modules. For additional
+information, see the **FMD Components in ZED** and **Implementation
+Notes** sections below.
+
+### ZED+FM Phase 3 ###
+
+Future work will add additional functionality and will likely include:
+
+* Add FMD module garbage collection (periodically call `fmd_module_gc()`).
+* Add real module property retrieval (currently hard-coded in accessors).
+* Additional diagnosis telemetry (like latency outliers and SMART data).
+* Export FMD module statistics.
+* Zedlet parallel execution and resiliency (add watchdog).
+
+### ZFS Fault Management Overview ###
+
+The primary purpose of ZFS fault management is automated diagnosis
+and isolation of VDEV faults. A fault is something we can associate
+with an impact (e.g. loss of data redundancy) and a corrective action
+(e.g. offline or replace a disk). A typical ZFS fault management stack
+is composed of _error detectors_ (e.g. `zfs_ereport_post()`), a _disk
+monitor_, a _diagnosis engine_ and _response agents_.
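+
+As a rough illustration of the glue between these components, an error
+report can be thought of as an nvlist tagged with a class string plus an
+identifying payload. The sketch below is illustrative only: the class
+name and payload keys are simplified stand-ins, not the exact ereport
+schema used by `zfs_ereport_post()`.
+
+```c
+/*
+ * Illustrative sketch only: a fault-management event modeled as an
+ * nvlist carrying a class string and a vdev guid. The class name and
+ * payload keys here are simplified stand-ins for the real schema.
+ * Build with: cc -o ev ev.c -lnvpair
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <libnvpair.h>
+
+int
+main(void)
+{
+	nvlist_t *ev = NULL;
+	char *class = NULL;
+
+	/* detector side: build the event */
+	if (nvlist_alloc(&ev, NV_UNIQUE_NAME, 0) != 0)
+		return (1);
+	(void) nvlist_add_string(ev, "class", "sysevent.fs.zfs.checksum");
+	(void) nvlist_add_uint64(ev, "vdev_guid", 0x123456789abcdefULL);
+
+	/* consumer side: route the event by its class string */
+	if (nvlist_lookup_string(ev, "class", &class) == 0)
+		(void) printf("dispatching event: %s\n", class);
+
+	nvlist_free(ev);
+	return (0);
+}
+```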
+
+After detecting a software error, the ZFS kernel module sends error
+events to the ZED user daemon, which in turn routes the events to its
+internal FMA modules based on their event subscriptions. Likewise, if
+a disk is added or changed in the system, the disk monitor sends disk
+events, which are consumed by a response agent.
+
+### FMD Components in ZED ###
+
+There are three FMD modules (aka agents) that are now built into ZED.
+
+ 1. A _Diagnosis Engine_ module (`agents/zfs_diagnosis.c`)
+ 2. A _Retire Agent_ module (`agents/zfs_retire.c`)
+ 3. A _Disk Add Agent_ module (`agents/zfs_mod.c`)
+
+To begin with, a **Diagnosis Engine** consumes per-vdev I/O and checksum
+ereports and feeds them into a Soft Error Rate Discrimination (SERD)
+algorithm, which generates a corresponding fault diagnosis when the
+tracked VDEV encounters **N** events in a given **T** time window. The
+initial N and T values for the SERD algorithm are estimates inherited
+from illumos (10 errors in 10 minutes); a short illustrative sketch of
+this N-in-T behavior appears after the implementation notes below.
+
+In turn, a **Retire Agent** responds to diagnosed faults by isolating
+the faulty VDEV. It will notify the ZFS kernel module of the new VDEV
+state (degraded or faulted). The retire agent is also responsible for
+managing hot spares across all pools. When it encounters a device fault
+or a device removal, it will replace the device with an appropriate
+spare if one is available.
+
+Finally, a **Disk Add Agent** responds to events from a libudev disk
+monitor (`EC_DEV_ADD` or `EC_DEV_STATUS`) and will online, replace or
+expand the associated VDEV. This agent is also known as the `zfs_mod`
+or Sysevent Loadable Module (SLM) on the illumos platform. The added
+disk is matched to a specific VDEV using its device id, physical path
+or VDEV GUID.
+
+Note that the _auto-replace_ feature (aka hot plug) is opt-in and you
+must set the pool's `autoreplace` property to enable it. The new disk
+will be matched to the corresponding leaf VDEV by physical location
+and labeled with a GPT partition before replacing the original VDEV
+in the pool.
+
+### Implementation Notes ###
+
+* The FMD module API required for logic modules is emulated and implemented
+  in the `fmd_api.c` and `fmd_serd.c` source files. This support includes
+  module registration, memory allocation, module property accessors, basic
+  case management, one-shot timers and SERD engines.
+  For detailed information on the FMD module API, see the document --
+  _"Fault Management Daemon Programmer's Reference Manual"_.
+
+* The event subscriptions for the modules (located in a module-specific
+  configuration file on illumos) are currently hard-coded into the ZED
+  `zfs_agent_dispatch()` function.
+
+* The FMD modules are called one at a time from a single thread that
+  consumes events queued to the modules. These events are sourced from
+  the normal ZED events and also include events posted from the diagnosis
+  engine and the libudev disk event monitor.
+
+* The FMD code modules have minimal changes and were intentionally left
+  as similar as possible to their upstream source files.
+
+* The sysevent namespace in ZED differs from illumos. For example:
+  * illumos uses `"resource.sysevent.EC_zfs.ESC_ZFS_vdev_remove"`
+  * Linux uses `"sysevent.fs.zfs.vdev_remove"`
+
+* The FMD Modules port was produced by Intel Federal, LLC under award
+  number B609815 between the U.S. Department of Energy (DOE) and Intel
+  Federal, LLC.
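+
+For readers new to SERD engines, the N-events-in-T-window behavior used
+by the diagnosis engine (10 errors in 10 minutes by default, as noted
+above) can be sketched in a few lines of C. This is a self-contained
+illustration only; the `serd_t` type and `serd_record()` helper below
+are hypothetical and are not the engine implemented in `fmd_serd.c`.
+
+```c
+/*
+ * Sketch of the SERD idea: declare a fault when N events land inside
+ * a sliding window of T nanoseconds. Illustrative names throughout.
+ */
+#include <stdio.h>
+#include <stdint.h>
+#include <string.h>
+
+#define	SERD_N	10				/* 10 errors ... */
+#define	SERD_T	(600ULL * 1000000000ULL)	/* ... in 10 minutes */
+
+typedef struct serd {
+	uint64_t ev[SERD_N];	/* ring of the last N event hrtimes */
+	uint64_t count;		/* total events recorded */
+} serd_t;
+
+/* Record one event; return 1 when the engine fires. */
+static int
+serd_record(serd_t *sg, uint64_t hrt)
+{
+	sg->ev[sg->count++ % SERD_N] = hrt;
+	if (sg->count < SERD_N)
+		return (0);
+	/* the oldest of the last N events sits in the next slot */
+	return (hrt - sg->ev[sg->count % SERD_N] <= SERD_T);
+}
+
+int
+main(void)
+{
+	serd_t sg;
+
+	memset(&sg, 0, sizeof (sg));
+	/* ten synthetic ereports, one per second: fires on the 10th */
+	for (int i = 0; i < 10; i++) {
+		if (serd_record(&sg, (uint64_t)i * 1000000000ULL))
+			(void) printf("SERD engine fired: diagnose fault\n");
+	}
+	return (0);
+}
+```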
+ diff --git a/cmd/zed/agents/fmd_api.c b/cmd/zed/agents/fmd_api.c new file mode 100644 index 000000000000..607b387ca3a8 --- /dev/null +++ b/cmd/zed/agents/fmd_api.c @@ -0,0 +1,760 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * This file implements the minimal FMD module API required to support the + * fault logic modules in ZED. This support includes module registration, + * memory allocation, module property accessors, basic case management, + * one-shot timers and SERD engines. + * + * In the ZED runtime, the modules are called from a single thread so no + * locking is required in this emulated FMD environment. + */ + +#include +#include +#include +#include +#include +#include + +#include "fmd_api.h" +#include "fmd_serd.h" + +#include "zfs_agents.h" +#include "../zed_log.h" + +typedef struct fmd_modstat { + fmd_stat_t ms_accepted; /* total events accepted by module */ + fmd_stat_t ms_caseopen; /* cases currently open */ + fmd_stat_t ms_casesolved; /* total cases solved by module */ + fmd_stat_t ms_caseclosed; /* total cases closed by module */ +} fmd_modstat_t; + +typedef struct fmd_module { + const char *mod_name; /* basename of module (ro) */ + const fmd_hdl_info_t *mod_info; /* module info registered with handle */ + void *mod_spec; /* fmd_hdl_get/setspecific data value */ + fmd_stat_t *mod_ustat; /* module specific custom stats */ + uint_t mod_ustat_cnt; /* count of ustat stats */ + fmd_modstat_t mod_stats; /* fmd built-in per-module statistics */ + fmd_serd_hash_t mod_serds; /* hash of serd engs owned by module */ + char *mod_vers; /* a copy of module version string */ +} fmd_module_t; + +/* + * ZED has two FMD hardwired module instances + */ +fmd_module_t zfs_retire_module; +fmd_module_t zfs_diagnosis_module; + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +/* + * Register a module with fmd and finish module initialization. + * Returns an integer indicating whether it succeeded (zero) or + * failed (non-zero). 
+ */ +int +fmd_hdl_register(fmd_hdl_t *hdl, int version, const fmd_hdl_info_t *mip) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_info = mip; + mp->mod_name = mip->fmdi_desc + 4; /* drop 'ZFS ' prefix */ + mp->mod_spec = NULL; + + /* bare minimum module stats */ + (void) strcpy(mp->mod_stats.ms_accepted.fmds_name, "fmd.accepted"); + (void) strcpy(mp->mod_stats.ms_caseopen.fmds_name, "fmd.caseopen"); + (void) strcpy(mp->mod_stats.ms_casesolved.fmds_name, "fmd.casesolved"); + (void) strcpy(mp->mod_stats.ms_caseclosed.fmds_name, "fmd.caseclosed"); + + fmd_serd_hash_create(&mp->mod_serds); + + fmd_hdl_debug(hdl, "register module"); + + return (0); +} + +void +fmd_hdl_unregister(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_modstat_t *msp = &mp->mod_stats; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + /* dump generic module stats */ + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_accepted.fmds_name, + msp->ms_accepted.fmds_value.ui64); + if (ops->fmdo_close != NULL) { + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseopen.fmds_name, + msp->ms_caseopen.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_casesolved.fmds_name, + msp->ms_casesolved.fmds_value.ui64); + fmd_hdl_debug(hdl, "%s: %llu", msp->ms_caseclosed.fmds_name, + msp->ms_caseclosed.fmds_value.ui64); + } + + /* dump module specific stats */ + if (mp->mod_ustat != NULL) { + int i; + + for (i = 0; i < mp->mod_ustat_cnt; i++) { + fmd_hdl_debug(hdl, "%s: %llu", + mp->mod_ustat[i].fmds_name, + mp->mod_ustat[i].fmds_value.ui64); + } + } + + fmd_serd_hash_destroy(&mp->mod_serds); + + fmd_hdl_debug(hdl, "unregister module"); +} + +/* + * fmd_hdl_setspecific() is used to associate a data pointer with + * the specified handle for the duration of the module's lifetime. + * This pointer can be retrieved using fmd_hdl_getspecific(). + */ +void +fmd_hdl_setspecific(fmd_hdl_t *hdl, void *spec) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + mp->mod_spec = spec; +} + +/* + * Return the module-specific data pointer previously associated + * with the handle using fmd_hdl_setspecific(). + */ +void * +fmd_hdl_getspecific(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_spec); +} + +void * +fmd_hdl_alloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_alloc(size, flags)); +} + +void * +fmd_hdl_zalloc(fmd_hdl_t *hdl, size_t size, int flags) +{ + return (umem_zalloc(size, flags)); +} + +void +fmd_hdl_free(fmd_hdl_t *hdl, void *data, size_t size) +{ + umem_free(data, size); +} + +/* + * Record a module debug message using the specified format. + */ +void +fmd_hdl_debug(fmd_hdl_t *hdl, const char *format, ...) +{ + char message[256]; + va_list vargs; + fmd_module_t *mp = (fmd_module_t *)hdl; + + va_start(vargs, format); + (void) vsnprintf(message, sizeof (message), format, vargs); + va_end(vargs); + + /* prefix message with module name */ + zed_log_msg(LOG_INFO, "%s: %s", mp->mod_name, message); +} + +/* Property Retrieval */ + +int32_t +fmd_prop_get_int32(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. 
+ */ + if (strcmp(name, "spare_on_remove") == 0) + return (1); + + if (strcmp(name, "io_N") == 0 || strcmp(name, "checksum_N") == 0) + return (10); /* N = 10 events */ + + return (0); +} + +int64_t +fmd_prop_get_int64(fmd_hdl_t *hdl, const char *name) +{ + /* + * These can be looked up in mp->modinfo->fmdi_props + * For now we just hard code for phase 2. In the + * future, there can be a ZED based override. + */ + if (strcmp(name, "remove_timeout") == 0) + return (15ULL * 1000ULL * 1000ULL * 1000ULL); /* 15 sec */ + + if (strcmp(name, "io_T") == 0 || strcmp(name, "checksum_T") == 0) + return (1000ULL * 1000ULL * 1000ULL * 600ULL); /* 10 min */ + + return (0); +} + +/* FMD Statistics */ + +fmd_stat_t * +fmd_stat_create(fmd_hdl_t *hdl, uint_t flags, uint_t nstats, fmd_stat_t *statv) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (flags == FMD_STAT_NOALLOC) { + mp->mod_ustat = statv; + mp->mod_ustat_cnt = nstats; + } + + return (statv); +} + +/* Case Management */ + +fmd_case_t * +fmd_case_open(fmd_hdl_t *hdl, void *data) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + uuid_t uuid; + + fmd_case_t *cp; + + cp = fmd_hdl_zalloc(hdl, sizeof (fmd_case_t), FMD_SLEEP); + cp->ci_mod = hdl; + cp->ci_state = FMD_CASE_UNSOLVED; + cp->ci_flags = FMD_CF_DIRTY; + cp->ci_data = data; + cp->ci_bufptr = NULL; + cp->ci_bufsiz = 0; + + uuid_generate(uuid); + uuid_unparse(uuid, cp->ci_uuid); + + fmd_hdl_debug(hdl, "case opened (%s)", cp->ci_uuid); + mp->mod_stats.ms_caseopen.fmds_value.ui64++; + + return (cp); +} + +void +fmd_case_solve(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + /* + * For ZED, the event was already sent from fmd_case_add_suspect() + */ + + if (cp->ci_state >= FMD_CASE_SOLVED) + fmd_hdl_debug(hdl, "case is already solved or closed"); + + cp->ci_state = FMD_CASE_SOLVED; + + fmd_hdl_debug(hdl, "case solved (%s)", cp->ci_uuid); + mp->mod_stats.ms_casesolved.fmds_value.ui64++; +} + +void +fmd_case_close(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + + fmd_hdl_debug(hdl, "case closed (%s)", cp->ci_uuid); + + if (ops->fmdo_close != NULL) + ops->fmdo_close(hdl, cp); + + mp->mod_stats.ms_caseopen.fmds_value.ui64--; + mp->mod_stats.ms_caseclosed.fmds_value.ui64++; + + if (cp->ci_bufptr != NULL && cp->ci_bufsiz > 0) + fmd_hdl_free(hdl, cp->ci_bufptr, cp->ci_bufsiz); + + fmd_hdl_free(hdl, cp, sizeof (fmd_case_t)); +} + +void +fmd_case_uuresolved(fmd_hdl_t *hdl, const char *uuid) +{ + fmd_hdl_debug(hdl, "case resolved by uuid (%s)", uuid); +} + +int +fmd_case_solved(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return ((cp->ci_state >= FMD_CASE_SOLVED) ? 
FMD_B_TRUE : FMD_B_FALSE); +} + +void +fmd_case_add_ereport(fmd_hdl_t *hdl, fmd_case_t *cp, fmd_event_t *ep) +{ +} + +static void +zed_log_fault(nvlist_t *nvl, const char *uuid, const char *code) +{ + nvlist_t *rsrc; + char *strval; + uint64_t guid; + uint8_t byte; + + zed_log_msg(LOG_INFO, "\nzed_fault_event:"); + + if (uuid != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_UUID, uuid); + if (nvlist_lookup_string(nvl, FM_CLASS, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_CLASS, strval); + if (code != NULL) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_SUSPECT_DIAG_CODE, code); + if (nvlist_lookup_uint8(nvl, FM_FAULT_CERTAINTY, &byte) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FAULT_CERTAINTY, byte); + if (nvlist_lookup_nvlist(nvl, FM_FAULT_RESOURCE, &rsrc) == 0) { + if (nvlist_lookup_string(rsrc, FM_FMRI_SCHEME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", FM_FMRI_SCHEME, + strval); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_POOL, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", FM_FMRI_ZFS_POOL, + guid); + if (nvlist_lookup_uint64(rsrc, FM_FMRI_ZFS_VDEV, &guid) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu \n", FM_FMRI_ZFS_VDEV, + guid); + } +} + +static const char * +fmd_fault_mkcode(nvlist_t *fault) +{ + char *class, *code = "-"; + + /* + * Note: message codes come from: openzfs/usr/src/cmd/fm/dicts/ZFS.po + */ + if (nvlist_lookup_string(fault, FM_CLASS, &class) == 0) { + if (strcmp(class, "fault.fs.zfs.vdev.io") == 0) + code = "ZFS-8000-FD"; + else if (strcmp(class, "fault.fs.zfs.vdev.checksum") == 0) + code = "ZFS-8000-GH"; + else if (strcmp(class, "fault.fs.zfs.io_failure_wait") == 0) + code = "ZFS-8000-HC"; + else if (strcmp(class, "fault.fs.zfs.io_failure_continue") == 0) + code = "ZFS-8000-JQ"; + else if (strcmp(class, "fault.fs.zfs.log_replay") == 0) + code = "ZFS-8000-K4"; + else if (strcmp(class, "fault.fs.zfs.pool") == 0) + code = "ZFS-8000-CS"; + else if (strcmp(class, "fault.fs.zfs.device") == 0) + code = "ZFS-8000-D3"; + + } + return (code); +} + +void +fmd_case_add_suspect(fmd_hdl_t *hdl, fmd_case_t *cp, nvlist_t *fault) +{ + nvlist_t *nvl; + const char *code = fmd_fault_mkcode(fault); + int64_t tod[2]; + int err = 0; + + /* + * payload derived from fmd_protocol_list() + */ + + (void) gettimeofday(&cp->ci_tv, NULL); + tod[0] = cp->ci_tv.tv_sec; + tod[1] = cp->ci_tv.tv_usec; + + nvl = fmd_nvl_alloc(hdl, FMD_SLEEP); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_SUSPECT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, FM_LIST_SUSPECT_CLASS); + err |= nvlist_add_string(nvl, FM_SUSPECT_UUID, cp->ci_uuid); + err |= nvlist_add_string(nvl, FM_SUSPECT_DIAG_CODE, code); + err |= nvlist_add_int64_array(nvl, FM_SUSPECT_DIAG_TIME, tod, 2); + err |= nvlist_add_uint32(nvl, FM_SUSPECT_FAULT_SZ, 1); + err |= nvlist_add_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, &fault, 1); + + if (err) + zed_log_die("failed to populate nvlist"); + + zed_log_fault(fault, cp->ci_uuid, code); + zfs_agent_post_event(FM_LIST_SUSPECT_CLASS, NULL, nvl); + + nvlist_free(nvl); + nvlist_free(fault); +} + +void +fmd_case_setspecific(fmd_hdl_t *hdl, fmd_case_t *cp, void *data) +{ + cp->ci_data = data; +} + +void * +fmd_case_getspecific(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + return (cp->ci_data); +} + +void +fmd_buf_create(fmd_hdl_t *hdl, fmd_case_t *cp, const char *name, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr == NULL); + assert(size < (1024 * 1024)); + + cp->ci_bufptr = fmd_hdl_alloc(hdl, size, FMD_SLEEP); + cp->ci_bufsiz = size; +} + +void 
+fmd_buf_read(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(size <= cp->ci_bufsiz); + + bcopy(cp->ci_bufptr, buf, size); +} + +void +fmd_buf_write(fmd_hdl_t *hdl, fmd_case_t *cp, + const char *name, const void *buf, size_t size) +{ + assert(strcmp(name, "data") == 0); + assert(cp->ci_bufptr != NULL); + assert(cp->ci_bufsiz >= size); + + bcopy(buf, cp->ci_bufptr, size); +} + +/* SERD Engines */ + +void +fmd_serd_create(fmd_hdl_t *hdl, const char *name, uint_t n, hrtime_t t) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + if (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL) { + zed_log_msg(LOG_ERR, "failed to create SERD engine '%s': " + " name already exists", name); + return; + } + + (void) fmd_serd_eng_insert(&mp->mod_serds, name, n, t); +} + +void +fmd_serd_destroy(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + fmd_serd_eng_delete(&mp->mod_serds, name); + + fmd_hdl_debug(hdl, "serd_destroy %s", name); +} + +int +fmd_serd_exists(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (fmd_serd_eng_lookup(&mp->mod_serds, name) != NULL); +} + +void +fmd_serd_reset(fmd_hdl_t *hdl, const char *name) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "serd engine '%s' does not exist", name); + return; + } + + fmd_serd_eng_reset(sgp); + + fmd_hdl_debug(hdl, "serd_reset %s", name); +} + +int +fmd_serd_record(fmd_hdl_t *hdl, const char *name, fmd_event_t *ep) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + fmd_serd_eng_t *sgp; + int err; + + if ((sgp = fmd_serd_eng_lookup(&mp->mod_serds, name)) == NULL) { + zed_log_msg(LOG_ERR, "failed to add record to SERD engine '%s'", + name); + return (FMD_B_FALSE); + } + err = fmd_serd_eng_record(sgp, ep->ev_hrt); + + return (err); +} + +/* FMD Timers */ + +static void +_timer_notify(union sigval sv) +{ + fmd_timer_t *ftp = sv.sival_ptr; + fmd_hdl_t *hdl = ftp->ft_hdl; + fmd_module_t *mp = (fmd_module_t *)hdl; + const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops; + struct itimerspec its; + + fmd_hdl_debug(hdl, "timer fired (%p)", ftp->ft_tid); + + /* disarm the timer */ + bzero(&its, sizeof (struct itimerspec)); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + /* Note that the fmdo_timeout can remove this timer */ + if (ops->fmdo_timeout != NULL) + ops->fmdo_timeout(hdl, ftp, ftp->ft_arg); +} + +/* + * Install a new timer which will fire at least delta nanoseconds after the + * current time. After the timeout has expired, the module's fmdo_timeout + * entry point is called. 
+ */ +fmd_timer_t * +fmd_timer_install(fmd_hdl_t *hdl, void *arg, fmd_event_t *ep, hrtime_t delta) +{ + struct sigevent sev; + struct itimerspec its; + fmd_timer_t *ftp; + + ftp = fmd_hdl_alloc(hdl, sizeof (fmd_timer_t), FMD_SLEEP); + ftp->ft_arg = arg; + ftp->ft_hdl = hdl; + + its.it_value.tv_sec = delta / 1000000000; + its.it_value.tv_nsec = delta % 1000000000; + its.it_interval.tv_sec = its.it_value.tv_sec; + its.it_interval.tv_nsec = its.it_value.tv_nsec; + + sev.sigev_notify = SIGEV_THREAD; + sev.sigev_notify_function = _timer_notify; + sev.sigev_notify_attributes = NULL; + sev.sigev_value.sival_ptr = ftp; + + timer_create(CLOCK_REALTIME, &sev, &ftp->ft_tid); + timer_settime(ftp->ft_tid, 0, &its, NULL); + + fmd_hdl_debug(hdl, "installing timer for %d secs (%p)", + (int)its.it_value.tv_sec, ftp->ft_tid); + + return (ftp); +} + +void +fmd_timer_remove(fmd_hdl_t *hdl, fmd_timer_t *ftp) +{ + fmd_hdl_debug(hdl, "removing timer (%p)", ftp->ft_tid); + + timer_delete(ftp->ft_tid); + + fmd_hdl_free(hdl, ftp, sizeof (fmd_timer_t)); +} + +/* Name-Value Pair Lists */ + +nvlist_t * +fmd_nvl_create_fault(fmd_hdl_t *hdl, const char *class, uint8_t certainty, + nvlist_t *asru, nvlist_t *fru, nvlist_t *resource) +{ + nvlist_t *nvl; + int err = 0; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + zed_log_die("failed to xalloc fault nvlist"); + + err |= nvlist_add_uint8(nvl, FM_VERSION, FM_FAULT_VERSION); + err |= nvlist_add_string(nvl, FM_CLASS, class); + err |= nvlist_add_uint8(nvl, FM_FAULT_CERTAINTY, certainty); + + if (asru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_ASRU, asru); + if (fru != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_FRU, fru); + if (resource != NULL) + err |= nvlist_add_nvlist(nvl, FM_FAULT_RESOURCE, resource); + + if (err) + zed_log_die("failed to populate nvlist: %s\n", strerror(err)); + + return (nvl); +} + +/* + * sourced from fmd_string.c + */ +static int +fmd_strmatch(const char *s, const char *p) +{ + char c; + + if (p == NULL) + return (0); + + if (s == NULL) + s = ""; /* treat NULL string as the empty string */ + + do { + if ((c = *p++) == '\0') + return (*s == '\0'); + + if (c == '*') { + while (*p == '*') + p++; /* consecutive *'s can be collapsed */ + + if (*p == '\0') + return (1); + + while (*s != '\0') { + if (fmd_strmatch(s++, p) != 0) + return (1); + } + + return (0); + } + } while (c == *s++); + + return (0); +} + +int +fmd_nvl_class_match(fmd_hdl_t *hdl, nvlist_t *nvl, const char *pattern) +{ + char *class; + + return (nvl != NULL && + nvlist_lookup_string(nvl, FM_CLASS, &class) == 0 && + fmd_strmatch(class, pattern)); +} + +nvlist_t * +fmd_nvl_alloc(fmd_hdl_t *hdl, int flags) +{ + nvlist_t *nvl = NULL; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + return (nvl); +} + + +/* + * ZED Agent specific APIs + */ + +fmd_hdl_t * +fmd_module_hdl(const char *name) +{ + if (strcmp(name, "zfs-retire") == 0) + return ((fmd_hdl_t *)&zfs_retire_module); + if (strcmp(name, "zfs-diagnosis") == 0) + return ((fmd_hdl_t *)&zfs_diagnosis_module); + + return (NULL); +} + +boolean_t +fmd_module_initialized(fmd_hdl_t *hdl) +{ + fmd_module_t *mp = (fmd_module_t *)hdl; + + return (mp->mod_info != NULL); +} + +/* + * fmd_module_recv is called for each event that is received by + * the fault manager that has a class that matches one of the + * module's subscriptions. 
+ */
+void
+fmd_module_recv(fmd_hdl_t *hdl, nvlist_t *nvl, const char *class)
+{
+	fmd_module_t *mp = (fmd_module_t *)hdl;
+	const fmd_hdl_ops_t *ops = mp->mod_info->fmdi_ops;
+	fmd_event_t faux_event = {0};
+	int64_t *tv;
+	uint_t n;
+
+	/*
+	 * Will need to normalize this if we persistently store the case data
+	 */
+	if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0)
+		faux_event.ev_hrt = tv[0] * NANOSEC + tv[1];
+	else
+		faux_event.ev_hrt = 0;
+
+	ops->fmdo_recv(hdl, &faux_event, nvl, class);
+
+	mp->mod_stats.ms_accepted.fmds_value.ui64++;
+
+	/* TBD - should we initiate fm_module_gc() periodically? */
+}
diff --git a/cmd/zed/agents/fmd_api.h b/cmd/zed/agents/fmd_api.h
new file mode 100644
index 000000000000..4f06fb244b7b
--- /dev/null
+++ b/cmd/zed/agents/fmd_api.h
@@ -0,0 +1,246 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ *
+ * Copyright (c) 2016, Intel Corporation.
+ */ + +#ifndef _FMD_API_H +#define _FMD_API_H + +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Fault Management Daemon Client Interfaces + */ + +#define FMD_API_VERSION 5 + +typedef struct fmd_hdl fmd_hdl_t; + +typedef struct fmd_timer { + timer_t ft_tid; + void *ft_arg; + fmd_hdl_t *ft_hdl; +} fmd_timer_t; + +#define id_t fmd_timer_t * + + +typedef struct fmd_event { + hrtime_t ev_hrt; /* event time used by SERD engines */ +} fmd_event_t; + +typedef struct fmd_case { + char ci_uuid[48]; /* uuid string for this case */ + fmd_hdl_t *ci_mod; /* module that owns this case */ + void *ci_data; /* data from fmd_case_setspecific() */ + ushort_t ci_state; /* case state (see below) */ + ushort_t ci_flags; /* case flags (see below) */ + struct timeval ci_tv; /* time of original diagnosis */ + void *ci_bufptr; /* case data serialization buffer */ + size_t ci_bufsiz; +} fmd_case_t; + + +#define FMD_B_FALSE 0 /* false value for booleans as int */ +#define FMD_B_TRUE 1 /* true value for booleans as int */ + + +#define FMD_CASE_UNSOLVED 0 /* case is not yet solved (waiting) */ +#define FMD_CASE_SOLVED 1 /* case is solved (suspects added) */ +#define FMD_CASE_CLOSE_WAIT 2 /* case is executing fmdo_close() */ +#define FMD_CASE_CLOSED 3 /* case is closed (reconfig done) */ +#define FMD_CASE_REPAIRED 4 /* case is repaired */ +#define FMD_CASE_RESOLVED 5 /* case is resolved (can be freed) */ + +#define FMD_CF_DIRTY 0x01 /* case is in need of checkpoint */ +#define FMD_CF_SOLVED 0x02 /* case has been solved */ +#define FMD_CF_ISOLATED 0x04 /* case has been isolated */ +#define FMD_CF_REPAIRED 0x08 /* case has been repaired */ +#define FMD_CF_RESOLVED 0x10 /* case has been resolved */ + + +#define FMD_TYPE_BOOL 0 /* int */ +#define FMD_TYPE_INT32 1 /* int32_t */ +#define FMD_TYPE_UINT32 2 /* uint32_t */ +#define FMD_TYPE_INT64 3 /* int64_t */ +#define FMD_TYPE_UINT64 4 /* uint64_t */ +#define FMD_TYPE_TIME 5 /* uint64_t */ +#define FMD_TYPE_SIZE 6 /* uint64_t */ + +typedef struct fmd_prop { + const char *fmdp_name; /* property name */ + uint_t fmdp_type; /* property type (see above) */ + const char *fmdp_defv; /* default value */ +} fmd_prop_t; + +typedef struct fmd_stat { + char fmds_name[32]; /* statistic name */ + uint_t fmds_type; /* statistic type (see above) */ + char fmds_desc[64]; /* statistic description */ + union { + int bool; /* FMD_TYPE_BOOL */ + int32_t i32; /* FMD_TYPE_INT32 */ + uint32_t ui32; /* FMD_TYPE_UINT32 */ + int64_t i64; /* FMD_TYPE_INT64 */ + uint64_t ui64; /* FMD_TYPE_UINT64 */ + } fmds_value; +} fmd_stat_t; + +typedef struct fmd_hdl_ops { + void (*fmdo_recv)(fmd_hdl_t *, fmd_event_t *, nvlist_t *, const char *); + void (*fmdo_timeout)(fmd_hdl_t *, id_t, void *); + void (*fmdo_close)(fmd_hdl_t *, fmd_case_t *); + void (*fmdo_stats)(fmd_hdl_t *); + void (*fmdo_gc)(fmd_hdl_t *); +} fmd_hdl_ops_t; + +#define FMD_SEND_SUCCESS 0 /* fmdo_send queued event */ +#define FMD_SEND_FAILED 1 /* fmdo_send unrecoverable error */ +#define FMD_SEND_RETRY 2 /* fmdo_send requests retry */ + +typedef struct fmd_hdl_info { + const char *fmdi_desc; /* fmd client description string */ + const char *fmdi_vers; /* fmd client version string */ + const fmd_hdl_ops_t *fmdi_ops; /* ops vector for client */ + const fmd_prop_t *fmdi_props; /* array of configuration props */ +} fmd_hdl_info_t; + +extern int fmd_hdl_register(fmd_hdl_t *, int, const fmd_hdl_info_t *); +extern void fmd_hdl_unregister(fmd_hdl_t *); + +extern void 
fmd_hdl_setspecific(fmd_hdl_t *, void *); +extern void *fmd_hdl_getspecific(fmd_hdl_t *); + +#define FMD_SLEEP UMEM_NOFAIL + +extern void *fmd_hdl_alloc(fmd_hdl_t *, size_t, int); +extern void *fmd_hdl_zalloc(fmd_hdl_t *, size_t, int); +extern void fmd_hdl_free(fmd_hdl_t *, void *, size_t); + +extern char *fmd_hdl_strdup(fmd_hdl_t *, const char *, int); +extern void fmd_hdl_strfree(fmd_hdl_t *, char *); + +extern void fmd_hdl_vdebug(fmd_hdl_t *, const char *, va_list); +extern void fmd_hdl_debug(fmd_hdl_t *, const char *, ...); + +extern int32_t fmd_prop_get_int32(fmd_hdl_t *, const char *); +extern int64_t fmd_prop_get_int64(fmd_hdl_t *, const char *); + +#define FMD_STAT_NOALLOC 0x0 /* fmd should use caller's memory */ +#define FMD_STAT_ALLOC 0x1 /* fmd should allocate stats memory */ + +extern fmd_stat_t *fmd_stat_create(fmd_hdl_t *, uint_t, uint_t, fmd_stat_t *); +extern void fmd_stat_destroy(fmd_hdl_t *, uint_t, fmd_stat_t *); +extern void fmd_stat_setstr(fmd_hdl_t *, fmd_stat_t *, const char *); + +extern fmd_case_t *fmd_case_open(fmd_hdl_t *, void *); +extern void fmd_case_reset(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_solve(fmd_hdl_t *, fmd_case_t *); +extern void fmd_case_close(fmd_hdl_t *, fmd_case_t *); + +extern const char *fmd_case_uuid(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_uulookup(fmd_hdl_t *, const char *); +extern void fmd_case_uuclose(fmd_hdl_t *, const char *); +extern int fmd_case_uuclosed(fmd_hdl_t *, const char *); +extern int fmd_case_uuisresolved(fmd_hdl_t *, const char *); +extern void fmd_case_uuresolved(fmd_hdl_t *, const char *); + +extern int fmd_case_solved(fmd_hdl_t *, fmd_case_t *); +extern int fmd_case_closed(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_case_add_ereport(fmd_hdl_t *, fmd_case_t *, fmd_event_t *); +extern void fmd_case_add_serd(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_case_add_suspect(fmd_hdl_t *, fmd_case_t *, nvlist_t *); + +extern void fmd_case_setspecific(fmd_hdl_t *, fmd_case_t *, void *); +extern void *fmd_case_getspecific(fmd_hdl_t *, fmd_case_t *); + +extern fmd_case_t *fmd_case_next(fmd_hdl_t *, fmd_case_t *); +extern fmd_case_t *fmd_case_prev(fmd_hdl_t *, fmd_case_t *); + +extern void fmd_buf_create(fmd_hdl_t *, fmd_case_t *, const char *, size_t); +extern void fmd_buf_destroy(fmd_hdl_t *, fmd_case_t *, const char *); +extern void fmd_buf_read(fmd_hdl_t *, fmd_case_t *, + const char *, void *, size_t); +extern void fmd_buf_write(fmd_hdl_t *, fmd_case_t *, + const char *, const void *, size_t); +extern size_t fmd_buf_size(fmd_hdl_t *, fmd_case_t *, const char *); + +extern void fmd_serd_create(fmd_hdl_t *, const char *, uint_t, hrtime_t); +extern void fmd_serd_destroy(fmd_hdl_t *, const char *); +extern int fmd_serd_exists(fmd_hdl_t *, const char *); +extern void fmd_serd_reset(fmd_hdl_t *, const char *); +extern int fmd_serd_record(fmd_hdl_t *, const char *, fmd_event_t *); +extern int fmd_serd_fired(fmd_hdl_t *, const char *); +extern int fmd_serd_empty(fmd_hdl_t *, const char *); + +extern id_t fmd_timer_install(fmd_hdl_t *, void *, fmd_event_t *, hrtime_t); +extern void fmd_timer_remove(fmd_hdl_t *, id_t); + +extern nvlist_t *fmd_nvl_create_fault(fmd_hdl_t *, + const char *, uint8_t, nvlist_t *, nvlist_t *, nvlist_t *); + +extern int fmd_nvl_class_match(fmd_hdl_t *, nvlist_t *, const char *); + +#define FMD_HAS_FAULT_FRU 0 +#define FMD_HAS_FAULT_ASRU 1 +#define FMD_HAS_FAULT_RESOURCE 2 + +extern void fmd_repair_fru(fmd_hdl_t *, const char *); +extern int fmd_repair_asru(fmd_hdl_t *, 
const char *); + +extern nvlist_t *fmd_nvl_alloc(fmd_hdl_t *, int); +extern nvlist_t *fmd_nvl_dup(fmd_hdl_t *, nvlist_t *, int); + +/* + * ZED Specific Interfaces + */ + +extern fmd_hdl_t *fmd_module_hdl(const char *); +extern boolean_t fmd_module_initialized(fmd_hdl_t *); +extern void fmd_module_recv(fmd_hdl_t *, nvlist_t *, const char *); + +/* ZFS FMA Retire Agent */ +extern void _zfs_retire_init(fmd_hdl_t *); +extern void _zfs_retire_fini(fmd_hdl_t *); + +/* ZFS FMA Diagnosis Engine */ +extern void _zfs_diagnosis_init(fmd_hdl_t *); +extern void _zfs_diagnosis_fini(fmd_hdl_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_API_H */ diff --git a/cmd/zed/agents/fmd_serd.c b/cmd/zed/agents/fmd_serd.c new file mode 100644 index 000000000000..d4ec37fb7691 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.c @@ -0,0 +1,316 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include + +#include "fmd_api.h" +#include "fmd_serd.h" +#include "../zed_log.h" + + +#define FMD_STR_BUCKETS 211 + + +#ifdef SERD_ENG_DEBUG +#define serd_log_msg(fmt, ...) \ + zed_log_msg(LOG_INFO, fmt, __VA_ARGS__) +#else +#define serd_log_msg(fmt, ...) +#endif + + +/* + * SERD Engine Backend + */ + +/* + * Compute the delta between events in nanoseconds. To account for very old + * events which are replayed, we must handle the case where time is negative. + * We convert the hrtime_t's to unsigned 64-bit integers and then handle the + * case where 'old' is greater than 'new' (i.e. high-res time has wrapped). + */ +static hrtime_t +fmd_event_delta(hrtime_t t1, hrtime_t t2) +{ + uint64_t old = t1; + uint64_t new = t2; + + return (new >= old ? 
new - old : (UINT64_MAX - old) + new + 1); +} + +static fmd_serd_eng_t * +fmd_serd_eng_alloc(const char *name, uint64_t n, hrtime_t t) +{ + fmd_serd_eng_t *sgp; + + sgp = malloc(sizeof (fmd_serd_eng_t)); + bzero(sgp, sizeof (fmd_serd_eng_t)); + + sgp->sg_name = strdup(name); + sgp->sg_flags = FMD_SERD_DIRTY; + sgp->sg_n = n; + sgp->sg_t = t; + + list_create(&sgp->sg_list, sizeof (fmd_serd_elem_t), + offsetof(fmd_serd_elem_t, se_list)); + + return (sgp); +} + +static void +fmd_serd_eng_free(fmd_serd_eng_t *sgp) +{ + fmd_serd_eng_reset(sgp); + free(sgp->sg_name); + list_destroy(&sgp->sg_list); + free(sgp); +} + +/* + * sourced from fmd_string.c + */ +static ulong_t +fmd_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h); +} + +void +fmd_serd_hash_create(fmd_serd_hash_t *shp) +{ + shp->sh_hashlen = FMD_STR_BUCKETS; + shp->sh_hash = calloc(shp->sh_hashlen, sizeof (void *)); + shp->sh_count = 0; +} + +void +fmd_serd_hash_destroy(fmd_serd_hash_t *shp) +{ + fmd_serd_eng_t *sgp, *ngp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = ngp) { + ngp = sgp->sg_next; + fmd_serd_eng_free(sgp); + } + } + + free(shp->sh_hash); + bzero(shp, sizeof (fmd_serd_hash_t)); +} + +void +fmd_serd_hash_apply(fmd_serd_hash_t *shp, fmd_serd_eng_f *func, void *arg) +{ + fmd_serd_eng_t *sgp; + uint_t i; + + for (i = 0; i < shp->sh_hashlen; i++) { + for (sgp = shp->sh_hash[i]; sgp != NULL; sgp = sgp->sg_next) + func(sgp, arg); + } +} + +fmd_serd_eng_t * +fmd_serd_eng_insert(fmd_serd_hash_t *shp, const char *name, + uint_t n, hrtime_t t) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp = fmd_serd_eng_alloc(name, n, t); + + serd_log_msg(" SERD Engine: inserting %s N %d T %llu", + name, (int)n, (long long unsigned)t); + + sgp->sg_next = shp->sh_hash[h]; + shp->sh_hash[h] = sgp; + shp->sh_count++; + + return (sgp); +} + +fmd_serd_eng_t * +fmd_serd_eng_lookup(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp; + + for (sgp = shp->sh_hash[h]; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(name, sgp->sg_name) == 0) + return (sgp); + } + + return (NULL); +} + +void +fmd_serd_eng_delete(fmd_serd_hash_t *shp, const char *name) +{ + uint_t h = fmd_strhash(name) % shp->sh_hashlen; + fmd_serd_eng_t *sgp, **pp = &shp->sh_hash[h]; + + serd_log_msg(" SERD Engine: deleting %s", name); + + for (sgp = *pp; sgp != NULL; sgp = sgp->sg_next) { + if (strcmp(sgp->sg_name, name) != 0) + pp = &sgp->sg_next; + else + break; + } + + if (sgp != NULL) { + *pp = sgp->sg_next; + fmd_serd_eng_free(sgp); + assert(shp->sh_count != 0); + shp->sh_count--; + } +} + +static void +fmd_serd_eng_discard(fmd_serd_eng_t *sgp, fmd_serd_elem_t *sep) +{ + list_remove(&sgp->sg_list, sep); + sgp->sg_count--; + + serd_log_msg(" SERD Engine: discarding %s, %d remaining", + sgp->sg_name, (int)sgp->sg_count); + + free(sep); +} + +int +fmd_serd_eng_record(fmd_serd_eng_t *sgp, hrtime_t hrt) +{ + fmd_serd_elem_t *sep, *oep; + + /* + * If the fired flag is already set, return false and discard the + * event. This means that the caller will only see the engine "fire" + * once until fmd_serd_eng_reset() is called. The fmd_serd_eng_fired() + * function can also be used in combination with fmd_serd_eng_record(). 
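+ *
+ * As a worked example with the N/T defaults the diagnosis engine uses
+ * (N = 10, T = 10min): the engine fires when a 10th event is recorded
+ * within ten minutes of the oldest event still held on its list:
+ *
+ *     if (fmd_serd_eng_record(sgp, ev_hrt))
+ *             ... engine fired, diagnose the fault ...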
+ */ + if (sgp->sg_flags & FMD_SERD_FIRED) { + serd_log_msg(" SERD Engine: record %s already fired!", + sgp->sg_name); + return (FMD_B_FALSE); + } + + while (sgp->sg_count >= sgp->sg_n) + fmd_serd_eng_discard(sgp, list_tail(&sgp->sg_list)); + + sep = malloc(sizeof (fmd_serd_elem_t)); + sep->se_hrt = hrt; + + list_insert_head(&sgp->sg_list, sep); + sgp->sg_count++; + + serd_log_msg(" SERD Engine: recording %s of %d (%llu)", + sgp->sg_name, (int)sgp->sg_count, (long long unsigned)hrt); + + /* + * Pick up the oldest element pointer for comparison to 'sep'. We must + * do this after adding 'sep' because 'oep' and 'sep' can be the same. + */ + oep = list_tail(&sgp->sg_list); + + if (sgp->sg_count >= sgp->sg_n && + fmd_event_delta(oep->se_hrt, sep->se_hrt) <= sgp->sg_t) { + sgp->sg_flags |= FMD_SERD_FIRED | FMD_SERD_DIRTY; + serd_log_msg(" SERD Engine: fired %s", sgp->sg_name); + return (FMD_B_TRUE); + } + + sgp->sg_flags |= FMD_SERD_DIRTY; + return (FMD_B_FALSE); +} + +int +fmd_serd_eng_fired(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_flags & FMD_SERD_FIRED); +} + +int +fmd_serd_eng_empty(fmd_serd_eng_t *sgp) +{ + return (sgp->sg_count == 0); +} + +void +fmd_serd_eng_reset(fmd_serd_eng_t *sgp) +{ + serd_log_msg(" SERD Engine: resetting %s", sgp->sg_name); + + while (sgp->sg_count != 0) + fmd_serd_eng_discard(sgp, list_head(&sgp->sg_list)); + + sgp->sg_flags &= ~FMD_SERD_FIRED; + sgp->sg_flags |= FMD_SERD_DIRTY; +} + +void +fmd_serd_eng_gc(fmd_serd_eng_t *sgp) +{ + fmd_serd_elem_t *sep, *nep; + hrtime_t hrt; + + if (sgp->sg_count == 0 || (sgp->sg_flags & FMD_SERD_FIRED)) + return; /* no garbage collection needed if empty or fired */ + + sep = list_head(&sgp->sg_list); + if (sep == NULL) + return; + + hrt = sep->se_hrt - sgp->sg_t; + + for (sep = list_head(&sgp->sg_list); sep != NULL; sep = nep) { + if (sep->se_hrt >= hrt) + break; /* sep and subsequent events are all within T */ + + nep = list_next(&sgp->sg_list, sep); + fmd_serd_eng_discard(sgp, sep); + sgp->sg_flags |= FMD_SERD_DIRTY; + } +} diff --git a/cmd/zed/agents/fmd_serd.h b/cmd/zed/agents/fmd_serd.h new file mode 100644 index 000000000000..c35c9acc7785 --- /dev/null +++ b/cmd/zed/agents/fmd_serd.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef _FMD_SERD_H +#define _FMD_SERD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +typedef struct fmd_serd_elem { + list_node_t se_list; /* linked list forward/back pointers */ + hrtime_t se_hrt; /* upper bound on event hrtime */ +} fmd_serd_elem_t; + +typedef struct fmd_serd_eng { + char *sg_name; /* string name for this engine */ + struct fmd_serd_eng *sg_next; /* next engine on hash chain */ + list_t sg_list; /* list of fmd_serd_elem_t's */ + uint_t sg_count; /* count of events in sg_list */ + uint_t sg_flags; /* engine flags (see below) */ + uint_t sg_n; /* engine N parameter (event count) */ + hrtime_t sg_t; /* engine T parameter (nanoseconds) */ +} fmd_serd_eng_t; + +#define FMD_SERD_FIRED 0x1 /* error rate has exceeded threshold */ +#define FMD_SERD_DIRTY 0x2 /* engine needs to be checkpointed */ + +typedef void fmd_serd_eng_f(fmd_serd_eng_t *, void *); + +typedef struct fmd_serd_hash { + fmd_serd_eng_t **sh_hash; /* hash bucket array for buffers */ + uint_t sh_hashlen; /* length of hash bucket array */ + uint_t sh_count; /* count of engines in hash */ +} fmd_serd_hash_t; + +extern void fmd_serd_hash_create(fmd_serd_hash_t *); +extern void fmd_serd_hash_destroy(fmd_serd_hash_t *); +extern void fmd_serd_hash_apply(fmd_serd_hash_t *, fmd_serd_eng_f *, void *); + +extern fmd_serd_eng_t *fmd_serd_eng_insert(fmd_serd_hash_t *, + const char *, uint32_t, hrtime_t); + +extern fmd_serd_eng_t *fmd_serd_eng_lookup(fmd_serd_hash_t *, const char *); +extern void fmd_serd_eng_delete(fmd_serd_hash_t *, const char *); + +extern int fmd_serd_eng_record(fmd_serd_eng_t *, hrtime_t); +extern int fmd_serd_eng_fired(fmd_serd_eng_t *); +extern int fmd_serd_eng_empty(fmd_serd_eng_t *); + +extern void fmd_serd_eng_reset(fmd_serd_eng_t *); +extern void fmd_serd_eng_gc(fmd_serd_eng_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _FMD_SERD_H */ diff --git a/cmd/zed/agents/zfs_agents.c b/cmd/zed/agents/zfs_agents.c new file mode 100644 index 000000000000..006e0ab99f47 --- /dev/null +++ b/cmd/zed/agents/zfs_agents.c @@ -0,0 +1,422 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. 
+ * Copyright (c) 2018, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_agents.h" +#include "fmd_api.h" +#include "../zed_log.h" + +/* + * agent dispatch code + */ + +static pthread_mutex_t agent_lock = PTHREAD_MUTEX_INITIALIZER; +static pthread_cond_t agent_cond = PTHREAD_COND_INITIALIZER; +static list_t agent_events; /* list of pending events */ +static int agent_exiting; + +typedef struct agent_event { + char ae_class[64]; + char ae_subclass[32]; + nvlist_t *ae_nvl; + list_node_t ae_node; +} agent_event_t; + +pthread_t g_agents_tid; + +libzfs_handle_t *g_zfs_hdl; + +/* guid search data */ +typedef enum device_type { + DEVICE_TYPE_L2ARC, /* l2arc device */ + DEVICE_TYPE_SPARE, /* spare device */ + DEVICE_TYPE_PRIMARY /* any primary pool storage device */ +} device_type_t; + +typedef struct guid_search { + uint64_t gs_pool_guid; + uint64_t gs_vdev_guid; + char *gs_devid; + device_type_t gs_vdev_type; + uint64_t gs_vdev_expandtime; /* vdev expansion time */ +} guid_search_t; + +/* + * Walks the vdev tree recursively looking for a matching devid. + * Returns B_TRUE as soon as a matching device is found, B_FALSE otherwise. + */ +static boolean_t +zfs_agent_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *arg) +{ + guid_search_t *gsp = arg; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_PRIMARY; + return (B_TRUE); + } + } + } + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_L2ARC; + return (B_TRUE); + } + } + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if (zfs_agent_iter_vdev(zhp, child[c], gsp)) { + gsp->gs_vdev_type = DEVICE_TYPE_SPARE; + return (B_TRUE); + } + } + } + /* + * On a devid match, grab the vdev guid and expansion time, if any. 
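+ * The devid compared here is the identifier posted with the disk event
+ * (DEV_IDENTIFIER in the payload), e.g. a udev-derived string of the
+ * hypothetical form "ata-<model>_<serial>".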
+ */ + if (gsp->gs_devid != NULL && + (nvlist_lookup_string(nvl, ZPOOL_CONFIG_DEVID, &path) == 0) && + (strcmp(gsp->gs_devid, path) == 0)) { + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &gsp->gs_vdev_guid); + (void) nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_EXPANSION_TIME, + &gsp->gs_vdev_expandtime); + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +zfs_agent_iter_pool(zpool_handle_t *zhp, void *arg) +{ + guid_search_t *gsp = arg; + nvlist_t *config, *nvl; + + /* + * For each vdev in this pool, look for a match by devid + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvl) == 0) { + (void) zfs_agent_iter_vdev(zhp, nvl, gsp); + } + } + /* + * if a match was found then grab the pool guid + */ + if (gsp->gs_vdev_guid) { + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &gsp->gs_pool_guid); + } + + zpool_close(zhp); + return (gsp->gs_vdev_guid != 0); +} + +void +zfs_agent_post_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + agent_event_t *event; + + if (subclass == NULL) + subclass = ""; + + event = malloc(sizeof (agent_event_t)); + if (event == NULL || nvlist_dup(nvl, &event->ae_nvl, 0) != 0) { + if (event) + free(event); + return; + } + + if (strcmp(class, "sysevent.fs.zfs.vdev_check") == 0) { + class = EC_ZFS; + subclass = ESC_ZFS_VDEV_CHECK; + } + + /* + * On ZFS on Linux, we don't get the expected FM_RESOURCE_REMOVED + * ereport from vdev_disk layer after a hot unplug. Fortunately we + * get a EC_DEV_REMOVE from our disk monitor and it is a suitable + * proxy so we remap it here for the benefit of the diagnosis engine. + */ + if ((strcmp(class, EC_DEV_REMOVE) == 0) && + (strcmp(subclass, ESC_DISK) == 0) && + (nvlist_exists(nvl, ZFS_EV_VDEV_GUID) || + nvlist_exists(nvl, DEV_IDENTIFIER))) { + nvlist_t *payload = event->ae_nvl; + struct timeval tv; + int64_t tod[2]; + uint64_t pool_guid = 0, vdev_guid = 0; + guid_search_t search = { 0 }; + device_type_t devtype = DEVICE_TYPE_PRIMARY; + + class = "resource.fs.zfs.removed"; + subclass = ""; + + (void) nvlist_add_string(payload, FM_CLASS, class); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &pool_guid); + (void) nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &vdev_guid); + + (void) gettimeofday(&tv, NULL); + tod[0] = tv.tv_sec; + tod[1] = tv.tv_usec; + (void) nvlist_add_int64_array(payload, FM_EREPORT_TIME, tod, 2); + + /* + * For multipath, spare and l2arc devices ZFS_EV_VDEV_GUID or + * ZFS_EV_POOL_GUID may be missing so find them. + */ + (void) nvlist_lookup_string(nvl, DEV_IDENTIFIER, + &search.gs_devid); + (void) zpool_iter(g_zfs_hdl, zfs_agent_iter_pool, &search); + pool_guid = search.gs_pool_guid; + vdev_guid = search.gs_vdev_guid; + devtype = search.gs_vdev_type; + + /* + * We want to avoid reporting "remove" events coming from + * libudev for VDEVs which were expanded recently (10s) and + * avoid activating spares in response to partitions being + * deleted and created in rapid succession. 
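+ * For example, if a vdev recorded an expansion at time T and the remove
+ * event arrives at T + 4s, then T + 10 > T + 4 and the event is
+ * ignored; an event arriving at T + 11s is processed normally.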
+ */ + if (search.gs_vdev_expandtime != 0 && + search.gs_vdev_expandtime + 10 > tv.tv_sec) { + zed_log_msg(LOG_INFO, "agent post event: ignoring '%s' " + "for recently expanded device '%s'", EC_DEV_REMOVE, + search.gs_devid); + goto out; + } + + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, pool_guid); + (void) nvlist_add_uint64(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vdev_guid); + switch (devtype) { + case DEVICE_TYPE_L2ARC: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + VDEV_TYPE_L2CACHE); + break; + case DEVICE_TYPE_SPARE: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_SPARE); + break; + case DEVICE_TYPE_PRIMARY: + (void) nvlist_add_string(payload, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, VDEV_TYPE_DISK); + break; + } + + zed_log_msg(LOG_INFO, "agent post event: mapping '%s' to '%s'", + EC_DEV_REMOVE, class); + } + + (void) strlcpy(event->ae_class, class, sizeof (event->ae_class)); + (void) strlcpy(event->ae_subclass, subclass, + sizeof (event->ae_subclass)); + + (void) pthread_mutex_lock(&agent_lock); + list_insert_tail(&agent_events, event); + (void) pthread_mutex_unlock(&agent_lock); + +out: + (void) pthread_cond_signal(&agent_cond); +} + +static void +zfs_agent_dispatch(const char *class, const char *subclass, nvlist_t *nvl) +{ + /* + * The diagnosis engine subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-diagnosis.conf + */ + if (strstr(class, "ereport.fs.zfs.") != NULL || + strstr(class, "resource.fs.zfs.") != NULL || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove_dev") == 0 || + strcmp(class, "sysevent.fs.zfs.pool_destroy") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-diagnosis"), nvl, class); + } + + /* + * The retire agent subscribes to the following events. + * On illumos these subscriptions reside in: + * /usr/lib/fm/fmd/plugins/zfs-retire.conf + * + * NOTE: faults events come directly from our diagnosis engine + * and will not pass through the zfs kernel module. + */ + if (strcmp(class, FM_LIST_SUSPECT_CLASS) == 0 || + strcmp(class, "resource.fs.zfs.removed") == 0 || + strcmp(class, "resource.fs.zfs.statechange") == 0 || + strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + fmd_module_recv(fmd_module_hdl("zfs-retire"), nvl, class); + } + + /* + * The SLM module only consumes disk events and vdev check events + * + * NOTE: disk events come directly from disk monitor and will + * not pass through the zfs kernel module. + */ + if (strstr(class, "EC_dev_") != NULL || + strcmp(class, EC_ZFS) == 0) { + (void) zfs_slm_event(class, subclass, nvl); + } +} + +/* + * Events are consumed and dispatched from this thread + * An agent can also post an event so event list lock + * is not held when calling an agent. + * One event is consumed at a time. 
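+ *
+ * Producer-side sketch: ZED feeds this queue by calling
+ *
+ *     zfs_agent_post_event(class, subclass, nvl);
+ *
+ * which appends an agent_event_t to agent_events and signals agent_cond.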
+ */ +static void * +zfs_agent_consumer_thread(void *arg) +{ + for (;;) { + agent_event_t *event; + + (void) pthread_mutex_lock(&agent_lock); + + /* wait for an event to show up */ + while (!agent_exiting && list_is_empty(&agent_events)) + (void) pthread_cond_wait(&agent_cond, &agent_lock); + + if (agent_exiting) { + (void) pthread_mutex_unlock(&agent_lock); + zed_log_msg(LOG_INFO, "zfs_agent_consumer_thread: " + "exiting"); + return (NULL); + } + + if ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); + + (void) pthread_mutex_unlock(&agent_lock); + + /* dispatch to all event subscribers */ + zfs_agent_dispatch(event->ae_class, event->ae_subclass, + event->ae_nvl); + + nvlist_free(event->ae_nvl); + free(event); + continue; + } + + (void) pthread_mutex_unlock(&agent_lock); + } + + return (NULL); +} + +void +zfs_agent_init(libzfs_handle_t *zfs_hdl) +{ + fmd_hdl_t *hdl; + + g_zfs_hdl = zfs_hdl; + + if (zfs_slm_init() != 0) + zed_log_die("Failed to initialize zfs slm"); + zed_log_msg(LOG_INFO, "Add Agent: init"); + + hdl = fmd_module_hdl("zfs-diagnosis"); + _zfs_diagnosis_init(hdl); + if (!fmd_module_initialized(hdl)) + zed_log_die("Failed to initialize zfs diagnosis"); + + hdl = fmd_module_hdl("zfs-retire"); + _zfs_retire_init(hdl); + if (!fmd_module_initialized(hdl)) + zed_log_die("Failed to initialize zfs retire"); + + list_create(&agent_events, sizeof (agent_event_t), + offsetof(struct agent_event, ae_node)); + + if (pthread_create(&g_agents_tid, NULL, zfs_agent_consumer_thread, + NULL) != 0) { + list_destroy(&agent_events); + zed_log_die("Failed to initialize agents"); + } +} + +void +zfs_agent_fini(void) +{ + fmd_hdl_t *hdl; + agent_event_t *event; + + agent_exiting = 1; + (void) pthread_cond_signal(&agent_cond); + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_agents_tid, NULL); + + /* drain any pending events */ + while ((event = (list_head(&agent_events))) != NULL) { + list_remove(&agent_events, event); + nvlist_free(event->ae_nvl); + free(event); + } + + list_destroy(&agent_events); + + if ((hdl = fmd_module_hdl("zfs-retire")) != NULL) { + _zfs_retire_fini(hdl); + fmd_hdl_unregister(hdl); + } + if ((hdl = fmd_module_hdl("zfs-diagnosis")) != NULL) { + _zfs_diagnosis_fini(hdl); + fmd_hdl_unregister(hdl); + } + + zed_log_msg(LOG_INFO, "Add Agent: fini"); + zfs_slm_fini(); + + g_zfs_hdl = NULL; +} diff --git a/cmd/zed/agents/zfs_agents.h b/cmd/zed/agents/zfs_agents.h new file mode 100644 index 000000000000..d1a459139b1e --- /dev/null +++ b/cmd/zed/agents/zfs_agents.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. 
+ */ + +#ifndef ZFS_AGENTS_H +#define ZFS_AGENTS_H + +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Agent abstraction presented to ZED + */ +extern void zfs_agent_init(libzfs_handle_t *); +extern void zfs_agent_fini(void); +extern void zfs_agent_post_event(const char *, const char *, nvlist_t *); + +/* + * ZFS Sysevent Linkable Module (SLM) + */ +extern int zfs_slm_init(void); +extern void zfs_slm_fini(void); +extern void zfs_slm_event(const char *, const char *, nvlist_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZFS_AGENTS_H */ diff --git a/cmd/zed/agents/zfs_diagnosis.c b/cmd/zed/agents/zfs_diagnosis.c new file mode 100644 index 000000000000..0b27f6702ee8 --- /dev/null +++ b/cmd/zed/agents/zfs_diagnosis.c @@ -0,0 +1,981 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_agents.h" +#include "fmd_api.h" + +/* + * Our serd engines are named 'zfs___{checksum,io}'. This + * #define reserves enough space for two 64-bit hex values plus the length of + * the longest string. + */ +#define MAX_SERDLEN (16 * 2 + sizeof ("zfs___checksum")) + +/* + * On-disk case structure. This must maintain backwards compatibility with + * previous versions of the DE. By default, any members appended to the end + * will be filled with zeros if they don't exist in a previous version. + */ +typedef struct zfs_case_data { + uint64_t zc_version; + uint64_t zc_ena; + uint64_t zc_pool_guid; + uint64_t zc_vdev_guid; + int zc_pool_state; + char zc_serd_checksum[MAX_SERDLEN]; + char zc_serd_io[MAX_SERDLEN]; + int zc_has_remove_timer; +} zfs_case_data_t; + +/* + * Time-of-day + */ +typedef struct er_timeval { + uint64_t ertv_sec; + uint64_t ertv_nsec; +} er_timeval_t; + +/* + * In-core case structure. 
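+ *
+ * A case is keyed by the (zc_pool_guid, zc_vdev_guid) pair held in its
+ * zc_data, so the receive path below keeps at most one open case per
+ * vdev (or per pool, when zc_vdev_guid is zero).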
+ */ +typedef struct zfs_case { + boolean_t zc_present; + uint32_t zc_version; + zfs_case_data_t zc_data; + fmd_case_t *zc_case; + uu_list_node_t zc_node; + id_t zc_remove_timer; + char *zc_fru; + er_timeval_t zc_when; +} zfs_case_t; + +#define CASE_DATA "data" +#define CASE_FRU "fru" +#define CASE_DATA_VERSION_INITIAL 1 +#define CASE_DATA_VERSION_SERD 2 + +typedef struct zfs_de_stats { + fmd_stat_t old_drops; + fmd_stat_t dev_drops; + fmd_stat_t vdev_drops; + fmd_stat_t import_drops; + fmd_stat_t resource_drops; +} zfs_de_stats_t; + +zfs_de_stats_t zfs_stats = { + { "old_drops", FMD_TYPE_UINT64, "ereports dropped (from before load)" }, + { "dev_drops", FMD_TYPE_UINT64, "ereports dropped (dev during open)"}, + { "vdev_drops", FMD_TYPE_UINT64, "ereports dropped (weird vdev types)"}, + { "import_drops", FMD_TYPE_UINT64, "ereports dropped (during import)" }, + { "resource_drops", FMD_TYPE_UINT64, "resource related ereports" } +}; + +static hrtime_t zfs_remove_timeout; + +uu_list_pool_t *zfs_case_pool; +uu_list_t *zfs_cases; + +#define ZFS_MAKE_RSRC(type) \ + FM_RSRC_CLASS "." ZFS_ERROR_CLASS "." type +#define ZFS_MAKE_EREPORT(type) \ + FM_EREPORT_CLASS "." ZFS_ERROR_CLASS "." type + +/* + * Write out the persistent representation of an active case. + */ +static void +zfs_case_serialize(fmd_hdl_t *hdl, zfs_case_t *zcp) +{ + zcp->zc_data.zc_version = CASE_DATA_VERSION_SERD; +} + +/* + * Read back the persistent representation of an active case. + */ +static zfs_case_t * +zfs_case_unserialize(fmd_hdl_t *hdl, fmd_case_t *cp) +{ + zfs_case_t *zcp; + + zcp = fmd_hdl_zalloc(hdl, sizeof (zfs_case_t), FMD_SLEEP); + zcp->zc_case = cp; + + fmd_buf_read(hdl, cp, CASE_DATA, &zcp->zc_data, + sizeof (zcp->zc_data)); + + if (zcp->zc_data.zc_version > CASE_DATA_VERSION_SERD) { + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + return (NULL); + } + + /* + * fmd_buf_read() will have already zeroed out the remainder of the + * buffer, so we don't have to do anything special if the version + * doesn't include the SERD engine name. + */ + + if (zcp->zc_data.zc_has_remove_timer) + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, + NULL, zfs_remove_timeout); + + uu_list_node_init(zcp, &zcp->zc_node, zfs_case_pool); + (void) uu_list_insert_before(zfs_cases, NULL, zcp); + + fmd_case_setspecific(hdl, cp, zcp); + + return (zcp); +} + +/* + * Iterate over any active cases. If any cases are associated with a pool or + * vdev which is no longer present on the system, close the associated case. + */ +static void +zfs_mark_vdev(uint64_t pool_guid, nvlist_t *vd, er_timeval_t *loaded) +{ + uint64_t vdev_guid = 0; + uint_t c, children; + nvlist_t **child; + zfs_case_t *zcp; + + (void) nvlist_lookup_uint64(vd, ZPOOL_CONFIG_GUID, &vdev_guid); + + /* + * Mark any cases associated with this (pool, vdev) pair. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == vdev_guid) { + zcp->zc_present = B_TRUE; + zcp->zc_when = *loaded; + } + } + + /* + * Iterate over all children. 
+ */ + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } + + if (nvlist_lookup_nvlist_array(vd, ZPOOL_CONFIG_SPARES, &child, + &children) == 0) { + for (c = 0; c < children; c++) + zfs_mark_vdev(pool_guid, child[c], loaded); + } +} + +/*ARGSUSED*/ +static int +zfs_mark_pool(zpool_handle_t *zhp, void *unused) +{ + zfs_case_t *zcp; + uint64_t pool_guid; + uint64_t *tod; + er_timeval_t loaded = { 0 }; + nvlist_t *config, *vd; + uint_t nelem = 0; + int ret; + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + /* + * Mark any cases associated with just this pool. + */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) + zcp->zc_present = B_TRUE; + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + (void) nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem); + if (nelem == 2) { + loaded.ertv_sec = tod[0]; + loaded.ertv_nsec = tod[1]; + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid && + zcp->zc_data.zc_vdev_guid == 0) { + zcp->zc_when = loaded; + } + } + } + + ret = nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &vd); + if (ret) { + zpool_close(zhp); + return (-1); + } + + zfs_mark_vdev(pool_guid, vd, &loaded); + + zpool_close(zhp); + + return (0); +} + +struct load_time_arg { + uint64_t lt_guid; + er_timeval_t *lt_time; + boolean_t lt_found; +}; + +static int +zpool_find_load_time(zpool_handle_t *zhp, void *arg) +{ + struct load_time_arg *lta = arg; + uint64_t pool_guid; + uint64_t *tod; + nvlist_t *config; + uint_t nelem; + + if (lta->lt_found) { + zpool_close(zhp); + return (0); + } + + pool_guid = zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL); + if (pool_guid != lta->lt_guid) { + zpool_close(zhp); + return (0); + } + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + zpool_close(zhp); + return (-1); + } + + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_LOADED_TIME, + &tod, &nelem) == 0 && nelem == 2) { + lta->lt_found = B_TRUE; + lta->lt_time->ertv_sec = tod[0]; + lta->lt_time->ertv_nsec = tod[1]; + } + + zpool_close(zhp); + + return (0); +} + +static void +zfs_purge_cases(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + + /* + * There is no way to open a pool by GUID, or lookup a vdev by GUID. No + * matter what we do, we're going to have to stomach an O(vdevs * cases) + * algorithm. In reality, both quantities are likely so small that + * neither will matter. Given that iterating over pools is more + * expensive than iterating over the in-memory case list, we opt for a + * 'present' flag in each case that starts off cleared. We then iterate + * over all pools, marking those that are still present, and removing + * those that aren't found. + * + * Note that we could also construct an FMRI and rely on + * fmd_nvl_fmri_present(), but this would end up doing the same search. + */ + + /* + * Mark the cases as not present. 
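+ * (Step one of the sweep: the loop below clears zc_present; step two
+ * re-marks live cases via zpool_iter(zhdl, zfs_mark_pool, NULL); step
+ * three closes any case still unmarked.)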
+ */ + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) + zcp->zc_present = B_FALSE; + + /* + * Iterate over all pools and mark the pools and vdevs found. If this + * fails (most probably because we're out of memory), then don't close + * any of the cases and we cannot be sure they are accurate. + */ + if (zpool_iter(zhdl, zfs_mark_pool, NULL) != 0) + return; + + /* + * Remove those cases which were not found. + */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + if (!zcp->zc_present) + fmd_case_close(hdl, zcp->zc_case); + } + uu_list_walk_end(walk); +} + +/* + * Construct the name of a serd engine given the pool/vdev GUID and type (io or + * checksum). + */ +static void +zfs_serd_name(char *buf, uint64_t pool_guid, uint64_t vdev_guid, + const char *type) +{ + (void) snprintf(buf, MAX_SERDLEN, "zfs_%llx_%llx_%s", + (long long unsigned int)pool_guid, + (long long unsigned int)vdev_guid, type); +} + +/* + * Solve a given ZFS case. This first checks to make sure the diagnosis is + * still valid, as well as cleaning up any pending timer associated with the + * case. + */ +static void +zfs_case_solve(fmd_hdl_t *hdl, zfs_case_t *zcp, const char *faultname, + boolean_t checkunusable) +{ + nvlist_t *detector, *fault; + boolean_t serialize; + nvlist_t *fru = NULL; + fmd_hdl_debug(hdl, "solving fault '%s'", faultname); + + /* + * Construct the detector from the case data. The detector is in the + * ZFS scheme, and is either the pool or the vdev, depending on whether + * this is a vdev or pool fault. + */ + detector = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_uint8(detector, FM_VERSION, ZFS_SCHEME_VERSION0); + (void) nvlist_add_string(detector, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS); + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_POOL, + zcp->zc_data.zc_pool_guid); + if (zcp->zc_data.zc_vdev_guid != 0) { + (void) nvlist_add_uint64(detector, FM_FMRI_ZFS_VDEV, + zcp->zc_data.zc_vdev_guid); + } + + fault = fmd_nvl_create_fault(hdl, faultname, 100, detector, + fru, detector); + fmd_case_add_suspect(hdl, zcp->zc_case, fault); + + nvlist_free(fru); + + fmd_case_solve(hdl, zcp->zc_case); + + serialize = B_FALSE; + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + serialize = B_TRUE; + } + if (serialize) + zfs_case_serialize(hdl, zcp); + + nvlist_free(detector); +} + +static boolean_t +timeval_earlier(er_timeval_t *a, er_timeval_t *b) +{ + return (a->ertv_sec < b->ertv_sec || + (a->ertv_sec == b->ertv_sec && a->ertv_nsec < b->ertv_nsec)); +} + +/*ARGSUSED*/ +static void +zfs_ereport_when(fmd_hdl_t *hdl, nvlist_t *nvl, er_timeval_t *when) +{ + int64_t *tod; + uint_t nelem; + + if (nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tod, + &nelem) == 0 && nelem == 2) { + when->ertv_sec = tod[0]; + when->ertv_nsec = tod[1]; + } else { + when->ertv_sec = when->ertv_nsec = UINT64_MAX; + } +} + +/* + * Main fmd entry point. + */ +/*ARGSUSED*/ +static void +zfs_fm_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, const char *class) +{ + zfs_case_t *zcp, *dcp; + int32_t pool_state; + uint64_t ena, pool_guid, vdev_guid; + er_timeval_t pool_load; + er_timeval_t er_when; + nvlist_t *detector; + boolean_t pool_found = B_FALSE; + boolean_t isresource; + char *type; + + /* + * We subscribe to notifications for vdev or pool removal. In these + * cases, there may be cases that no longer apply. Purge any cases + * that no longer apply. 
+ */ + if (fmd_nvl_class_match(hdl, nvl, "sysevent.fs.zfs.*")) { + fmd_hdl_debug(hdl, "purging orphaned cases from %s", + strrchr(class, '.') + 1); + zfs_purge_cases(hdl); + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + isresource = fmd_nvl_class_match(hdl, nvl, "resource.fs.zfs.*"); + + if (isresource) { + /* + * For resources, we don't have a normal payload. + */ + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + pool_state = SPA_LOAD_OPEN; + else + pool_state = SPA_LOAD_NONE; + detector = NULL; + } else { + (void) nvlist_lookup_nvlist(nvl, + FM_EREPORT_DETECTOR, &detector); + (void) nvlist_lookup_int32(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, &pool_state); + } + + /* + * We also ignore all ereports generated during an import of a pool, + * since the only possible fault (.pool) would result in import failure, + * and hence no persistent fault. Some day we may want to do something + * with these ereports, so we continue generating them internally. + */ + if (pool_state == SPA_LOAD_IMPORT) { + zfs_stats.import_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "ignoring '%s' during import", class); + return; + } + + /* + * Device I/O errors are ignored during pool open. + */ + if (pool_state == SPA_LOAD_OPEN && + (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE)))) { + fmd_hdl_debug(hdl, "ignoring '%s' during pool open", class); + zfs_stats.dev_drops.fmds_value.ui64++; + return; + } + + /* + * We ignore ereports for anything except disks and files. + */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &type) == 0) { + if (strcmp(type, VDEV_TYPE_DISK) != 0 && + strcmp(type, VDEV_TYPE_FILE) != 0) { + zfs_stats.vdev_drops.fmds_value.ui64++; + return; + } + } + + /* + * Determine if this ereport corresponds to an open case. + * Each vdev or pool can have a single case. + */ + (void) nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, &pool_guid); + if (nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + vdev_guid = 0; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_ENA, &ena) != 0) + ena = 0; + + zfs_ereport_when(hdl, nvl, &er_when); + + for (zcp = uu_list_first(zfs_cases); zcp != NULL; + zcp = uu_list_next(zfs_cases, zcp)) { + if (zcp->zc_data.zc_pool_guid == pool_guid) { + pool_found = B_TRUE; + pool_load = zcp->zc_when; + } + if (zcp->zc_data.zc_vdev_guid == vdev_guid) + break; + } + + /* + * Avoid falsely accusing a pool of being faulty. Do so by + * not replaying ereports that were generated prior to the + * current import. If the failure that generated them was + * transient because the device was actually removed but we + * didn't receive the normal asynchronous notification, we + * don't want to mark it as faulted and potentially panic. If + * there is still a problem we'd expect not to be able to + * import the pool, or that new ereports will be generated + * once the pool is used. + */ + if (pool_found && timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, er_when.ertv_nsec, + pool_load.ertv_sec, pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + + if (!pool_found) { + /* + * Haven't yet seen this pool, but same situation + * may apply. 
+ */ + libzfs_handle_t *zhdl = fmd_hdl_getspecific(hdl); + struct load_time_arg la; + + la.lt_guid = pool_guid; + la.lt_time = &pool_load; + la.lt_found = B_FALSE; + + if (zhdl != NULL && + zpool_iter(zhdl, zpool_find_load_time, &la) == 0 && + la.lt_found == B_TRUE) { + pool_found = B_TRUE; + + if (timeval_earlier(&er_when, &pool_load)) { + fmd_hdl_debug(hdl, "ignoring pool %llx, " + "ereport time %lld.%lld, " + "pool load time = %lld.%lld", + pool_guid, er_when.ertv_sec, + er_when.ertv_nsec, pool_load.ertv_sec, + pool_load.ertv_nsec); + zfs_stats.old_drops.fmds_value.ui64++; + return; + } + } + } + + if (zcp == NULL) { + fmd_case_t *cs; + zfs_case_data_t data = { 0 }; + + /* + * If this is one of our 'fake' resource ereports, and there is + * no case open, simply discard it. + */ + if (isresource) { + zfs_stats.resource_drops.fmds_value.ui64++; + fmd_hdl_debug(hdl, "discarding '%s for vdev %llu", + class, vdev_guid); + return; + } + + /* + * Skip tracking some ereports + */ + if (strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DATA)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE)) == 0 || + strcmp(class, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_DELAY)) == 0) { + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Open a new case. + */ + cs = fmd_case_open(hdl, NULL); + + fmd_hdl_debug(hdl, "opening case for vdev %llu due to '%s'", + vdev_guid, class); + + /* + * Initialize the case buffer. To commonize code, we actually + * create the buffer with existing data, and then call + * zfs_case_unserialize() to instantiate the in-core structure. + */ + fmd_buf_create(hdl, cs, CASE_DATA, sizeof (zfs_case_data_t)); + + data.zc_version = CASE_DATA_VERSION_SERD; + data.zc_ena = ena; + data.zc_pool_guid = pool_guid; + data.zc_vdev_guid = vdev_guid; + data.zc_pool_state = (int)pool_state; + + fmd_buf_write(hdl, cs, CASE_DATA, &data, sizeof (data)); + + zcp = zfs_case_unserialize(hdl, cs); + assert(zcp != NULL); + if (pool_found) + zcp->zc_when = pool_load; + } + + if (isresource) { + fmd_hdl_debug(hdl, "resource event '%s'", class); + + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_AUTOREPLACE))) { + /* + * The 'resource.fs.zfs.autoreplace' event indicates + * that the pool was loaded with the 'autoreplace' + * property set. In this case, any pending device + * failures should be ignored, as the asynchronous + * autoreplace handling will take care of them. + */ + fmd_case_close(hdl, zcp->zc_case); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_REMOVED))) { + /* + * The 'resource.fs.zfs.removed' event indicates that + * device removal was detected, and the device was + * closed asynchronously. If this is the case, we + * assume that any recent I/O errors were due to the + * device removal, not any fault of the device itself. + * We reset the SERD engine, and cancel any pending + * timers. 
+ */ + if (zcp->zc_data.zc_has_remove_timer) { + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_data.zc_has_remove_timer = 0; + zfs_case_serialize(hdl, zcp); + } + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_reset(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_reset(hdl, + zcp->zc_data.zc_serd_checksum); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_RSRC(FM_RESOURCE_STATECHANGE))) { + uint64_t state = 0; + + if (zcp != NULL && + nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state) == 0 && + state == VDEV_STATE_HEALTHY) { + fmd_hdl_debug(hdl, "closing case after a " + "device statechange to healthy"); + fmd_case_close(hdl, zcp->zc_case); + } + } + zfs_stats.resource_drops.fmds_value.ui64++; + return; + } + + /* + * Associate the ereport with this case. + */ + fmd_case_add_ereport(hdl, zcp->zc_case, ep); + + /* + * Don't do anything else if this case is already solved. + */ + if (fmd_case_solved(hdl, zcp->zc_case)) + return; + + fmd_hdl_debug(hdl, "error event '%s'", class); + + /* + * Determine if we should solve the case and generate a fault. We solve + * a case if: + * + * a. A pool failed to open (ereport.fs.zfs.pool) + * b. A device failed to open (ereport.fs.zfs.pool) while a pool + * was up and running. + * + * We may see a series of ereports associated with a pool open, all + * chained together by the same ENA. If the pool open succeeds, then + * we'll see no further ereports. To detect when a pool open has + * succeeded, we associate a timer with the event. When it expires, we + * close the case. + */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_POOL))) { + /* + * Pool level fault. Before solving the case, go through and + * close any open device cases that may be pending. + */ + for (dcp = uu_list_first(zfs_cases); dcp != NULL; + dcp = uu_list_next(zfs_cases, dcp)) { + if (dcp->zc_data.zc_pool_guid == + zcp->zc_data.zc_pool_guid && + dcp->zc_data.zc_vdev_guid != 0) + fmd_case_close(hdl, dcp->zc_case); + } + + zfs_case_solve(hdl, zcp, "fault.fs.zfs.pool", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_LOG_REPLAY))) { + /* + * Pool level fault for reading the intent logs. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.log_replay", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, "ereport.fs.zfs.vdev.*")) { + /* + * Device fault. + */ + zfs_case_solve(hdl, zcp, "fault.fs.zfs.device", B_TRUE); + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) || + fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { + char *failmode = NULL; + boolean_t checkremove = B_FALSE; + + /* + * If this is a checksum or I/O error, then toss it into the + * appropriate SERD engine and check to see if it has fired. + * Ideally, we want to do something more sophisticated, + * (persistent errors for a single data block, etc). For now, + * a single SERD engine is sufficient. 
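+ *
+ * The engines are named via zfs_serd_name() above, so a hypothetical
+ * pool/vdev pair gets engines of the form
+ * "zfs_<pool_guid>_<vdev_guid>_io" and "zfs_<pool_guid>_<vdev_guid>_checksum".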
+ */ + if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO))) { + if (zcp->zc_data.zc_serd_io[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_io, + pool_guid, vdev_guid, "io"); + fmd_serd_create(hdl, zcp->zc_data.zc_serd_io, + fmd_prop_get_int32(hdl, "io_N"), + fmd_prop_get_int64(hdl, "io_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, zcp->zc_data.zc_serd_io, ep)) + checkremove = B_TRUE; + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_CHECKSUM))) { + if (zcp->zc_data.zc_serd_checksum[0] == '\0') { + zfs_serd_name(zcp->zc_data.zc_serd_checksum, + pool_guid, vdev_guid, "checksum"); + fmd_serd_create(hdl, + zcp->zc_data.zc_serd_checksum, + fmd_prop_get_int32(hdl, "checksum_N"), + fmd_prop_get_int64(hdl, "checksum_T")); + zfs_case_serialize(hdl, zcp); + } + if (fmd_serd_record(hdl, + zcp->zc_data.zc_serd_checksum, ep)) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.vdev.checksum", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_IO_FAILURE)) && + (nvlist_lookup_string(nvl, + FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, &failmode) == 0) && + failmode != NULL) { + if (strncmp(failmode, FM_EREPORT_FAILMODE_CONTINUE, + strlen(FM_EREPORT_FAILMODE_CONTINUE)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_continue", + B_FALSE); + } else if (strncmp(failmode, FM_EREPORT_FAILMODE_WAIT, + strlen(FM_EREPORT_FAILMODE_WAIT)) == 0) { + zfs_case_solve(hdl, zcp, + "fault.fs.zfs.io_failure_wait", B_FALSE); + } + } else if (fmd_nvl_class_match(hdl, nvl, + ZFS_MAKE_EREPORT(FM_EREPORT_ZFS_PROBE_FAILURE))) { +#ifndef __linux__ + /* This causes an unexpected fault diagnosis on linux */ + checkremove = B_TRUE; +#endif + } + + /* + * Because I/O errors may be due to device removal, we postpone + * any diagnosis until we're sure that we aren't about to + * receive a 'resource.fs.zfs.removed' event. + */ + if (checkremove) { + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + zcp->zc_remove_timer = fmd_timer_install(hdl, zcp, NULL, + zfs_remove_timeout); + if (!zcp->zc_data.zc_has_remove_timer) { + zcp->zc_data.zc_has_remove_timer = 1; + zfs_case_serialize(hdl, zcp); + } + } + } +} + +/* + * The timeout is fired when we diagnosed an I/O error, and it was not due to + * device removal (which would cause the timeout to be cancelled). + */ +/* ARGSUSED */ +static void +zfs_fm_timeout(fmd_hdl_t *hdl, id_t id, void *data) +{ + zfs_case_t *zcp = data; + + if (id == zcp->zc_remove_timer) + zfs_case_solve(hdl, zcp, "fault.fs.zfs.vdev.io", B_FALSE); +} + +/* + * The specified case has been closed and any case-specific + * data structures should be deallocated. + */ +static void +zfs_fm_close(fmd_hdl_t *hdl, fmd_case_t *cs) +{ + zfs_case_t *zcp = fmd_case_getspecific(hdl, cs); + + if (zcp->zc_data.zc_serd_checksum[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_checksum); + if (zcp->zc_data.zc_serd_io[0] != '\0') + fmd_serd_destroy(hdl, zcp->zc_data.zc_serd_io); + if (zcp->zc_data.zc_has_remove_timer) + fmd_timer_remove(hdl, zcp->zc_remove_timer); + + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); +} + +/* + * We use the fmd gc entry point to look for old cases that no longer apply. + * This allows us to keep our set of case data small in a long running system. 
+ */ +static void +zfs_fm_gc(fmd_hdl_t *hdl) +{ + zfs_purge_cases(hdl); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_fm_recv, /* fmdo_recv */ + zfs_fm_timeout, /* fmdo_timeout */ + zfs_fm_close, /* fmdo_close */ + NULL, /* fmdo_stats */ + zfs_fm_gc, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "checksum_N", FMD_TYPE_UINT32, "10" }, + { "checksum_T", FMD_TYPE_TIME, "10min" }, + { "io_N", FMD_TYPE_UINT32, "10" }, + { "io_T", FMD_TYPE_TIME, "10min" }, + { "remove_timeout", FMD_TYPE_TIME, "15sec" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Diagnosis Engine", "1.0", &fmd_ops, fmd_props +}; + +void +_zfs_diagnosis_init(fmd_hdl_t *hdl) +{ + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; + + if ((zfs_case_pool = uu_list_pool_create("zfs_case_pool", + sizeof (zfs_case_t), offsetof(zfs_case_t, zc_node), + NULL, UU_LIST_POOL_DEBUG)) == NULL) { + libzfs_fini(zhdl); + return; + } + + if ((zfs_cases = uu_list_create(zfs_case_pool, NULL, + UU_LIST_DEBUG)) == NULL) { + uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); + return; + } + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + libzfs_fini(zhdl); + return; + } + + fmd_hdl_setspecific(hdl, zhdl); + + (void) fmd_stat_create(hdl, FMD_STAT_NOALLOC, sizeof (zfs_stats) / + sizeof (fmd_stat_t), (fmd_stat_t *)&zfs_stats); + + zfs_remove_timeout = fmd_prop_get_int64(hdl, "remove_timeout"); +} + +void +_zfs_diagnosis_fini(fmd_hdl_t *hdl) +{ + zfs_case_t *zcp; + uu_list_walk_t *walk; + libzfs_handle_t *zhdl; + + /* + * Remove all active cases. + */ + walk = uu_list_walk_start(zfs_cases, UU_WALK_ROBUST); + while ((zcp = uu_list_walk_next(walk)) != NULL) { + fmd_hdl_debug(hdl, "removing case ena %llu", + (long long unsigned)zcp->zc_data.zc_ena); + uu_list_remove(zfs_cases, zcp); + uu_list_node_fini(zcp, &zcp->zc_node, zfs_case_pool); + fmd_hdl_free(hdl, zcp, sizeof (zfs_case_t)); + } + uu_list_walk_end(walk); + + uu_list_destroy(zfs_cases); + uu_list_pool_destroy(zfs_case_pool); + + zhdl = fmd_hdl_getspecific(hdl); + libzfs_fini(zhdl); +} diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c new file mode 100644 index 000000000000..8d0a3b420086 --- /dev/null +++ b/cmd/zed/agents/zfs_mod.c @@ -0,0 +1,956 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016, 2017, Intel Corporation. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + */ + +/* + * ZFS syseventd module. 
+ * + * file origin: openzfs/usr/src/cmd/syseventd/modules/zfs_mod/zfs_mod.c + * + * The purpose of this module is to identify when devices are added to the + * system, and appropriately online or replace the affected vdevs. + * + * When a device is added to the system: + * + * 1. Search for any vdevs whose devid matches that of the newly added + * device. + * + * 2. If no vdevs are found, then search for any vdevs whose udev path + * matches that of the new device. + * + * 3. If no vdevs match by either method, then ignore the event. + * + * 4. Attempt to online the device with a flag to indicate that it should + * be unspared when resilvering completes. If this succeeds, then the + * same device was inserted and we should continue normally. + * + * 5. If the pool does not have the 'autoreplace' property set, attempt to + * online the device again without the unspare flag, which will + * generate a FMA fault. + * + * 6. If the pool has the 'autoreplace' property set, and the matching vdev + * is a whole disk, then label the new disk and attempt a 'zpool + * replace'. + * + * The module responds to EC_DEV_ADD events. The special ESC_ZFS_VDEV_CHECK + * event indicates that a device failed to open during pool load, but the + * autoreplace property was set. In this case, we deferred the associated + * FMA fault until our module had a chance to process the autoreplace logic. + * If the device could not be replaced, then the second online attempt will + * trigger the FMA fault that we skipped earlier. + * + * ZFS on Linux porting notes: + * Linux udev provides a disk insert for both the disk and the partition + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_agents.h" +#include "../zed_log.h" + +#define DEV_BYID_PATH "/dev/disk/by-id/" +#define DEV_BYPATH_PATH "/dev/disk/by-path/" +#define DEV_BYVDEV_PATH "/dev/disk/by-vdev/" + +typedef void (*zfs_process_func_t)(zpool_handle_t *, nvlist_t *, boolean_t); + +libzfs_handle_t *g_zfshdl; +list_t g_pool_list; /* list of unavailable pools at initialization */ +list_t g_device_list; /* list of disks with asynchronous label request */ +tpool_t *g_tpool; +boolean_t g_enumeration_done; +pthread_t g_zfs_tid; /* zfs_enum_pools() thread */ + +typedef struct unavailpool { + zpool_handle_t *uap_zhp; + list_node_t uap_node; +} unavailpool_t; + +typedef struct pendingdev { + char pd_physpath[128]; + list_node_t pd_node; +} pendingdev_t; + +static int +zfs_toplevel_state(zpool_handle_t *zhp) +{ + nvlist_t *nvroot; + vdev_stat_t *vs; + unsigned int c; + + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + return (vs->vs_state); +} + +static int +zfs_unavail_pool(zpool_handle_t *zhp, void *data) +{ + zed_log_msg(LOG_INFO, "zfs_unavail_pool: examining '%s' (state %d)", + zpool_get_name(zhp), (int)zfs_toplevel_state(zhp)); + + if (zfs_toplevel_state(zhp) < VDEV_STATE_DEGRADED) { + unavailpool_t *uap; + uap = malloc(sizeof (unavailpool_t)); + uap->uap_zhp = zhp; + list_insert_tail((list_t *)data, uap); + } else { + zpool_close(zhp); + } + return (0); +} + +/* + * Two stage replace on Linux + * since we get disk notifications + * we can wait for partitioned disk slice to show up! 
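Steps 4 through 6 of the list above reduce to a short decision ladder. The sketch below is a hypothetical condensation for orientation only; the real handler, zfs_process_add() further down, also copes with device-mapper nodes, scsi_debug aliases, and the asynchronous partition events that the two-stage flow continued below relies on.

static void
example_add_outline(zpool_handle_t *zhp, const char *fullpath)
{
    vdev_state_t newstate;

    /* Step 4: try onlining the same device, undoing any spare use. */
    if (zpool_vdev_online(zhp, fullpath,
        ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 &&
        (newstate == VDEV_STATE_HEALTHY ||
        newstate == VDEV_STATE_DEGRADED))
        return; /* the original disk came back; nothing more to do */

    /* Step 5: without autoreplace, force a fault so FMA takes over. */
    if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL)) {
        (void) zpool_vdev_online(zhp, fullpath,
            ZFS_ONLINE_FORCEFAULT, &newstate);
        return;
    }

    /* Step 6: autoreplace on a whole disk: label it now, then do */
    /* the 'zpool replace' when the partition event arrives. */
}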
+ * + * First stage tags the disk, initiates async partitioning, and returns + * Second stage finds the tag and proceeds to ZFS labeling/replace + * + * disk-add --> label-disk + tag-disk --> partition-add --> zpool_vdev_attach + * + * 1. physical match with no fs, no partition + * tag it top, partition disk + * + * 2. physical match again, see partition and tag + * + */ + +/* + * The device associated with the given vdev (either by devid or physical path) + * has been added to the system. If 'isdisk' is set, then we only attempt a + * replacement if it's a whole disk. This also implies that we should label the + * disk first. + * + * First, we attempt to online the device (making sure to undo any spare + * operation when finished). If this succeeds, then we're done. If it fails, + * and the new state is VDEV_CANT_OPEN, it indicates that the device was opened, + * but that the label was not what we expected. If the 'autoreplace' property + * is enabled, then we relabel the disk (if specified), and attempt a 'zpool + * replace'. If the online is successful, but the new state is something else + * (REMOVED or FAULTED), it indicates that we're out of sync or in some sort of + * race, and we should avoid attempting to relabel the disk. + * + * Also can arrive here from a ESC_ZFS_VDEV_CHECK event + */ +static void +zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) +{ + char *path; + vdev_state_t newstate; + nvlist_t *nvroot, *newvd; + pendingdev_t *device; + uint64_t wholedisk = 0ULL; + uint64_t offline = 0ULL; + uint64_t guid = 0ULL; + char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL; + char rawpath[PATH_MAX], fullpath[PATH_MAX]; + char devpath[PATH_MAX]; + int ret; + boolean_t is_dm = B_FALSE; + boolean_t is_sd = B_FALSE; + uint_t c; + vdev_stat_t *vs; + + if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0) + return; + + /* Skip healthy disks */ + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + if (vs->vs_state == VDEV_STATE_HEALTHY) { + zed_log_msg(LOG_INFO, "%s: %s is already healthy, skip it.", + __func__, path); + return; + } + + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_GUID, &guid); + + if (offline) + return; /* don't intervene if it was taken offline */ + + is_dm = zfs_dev_is_dm(path); + zed_log_msg(LOG_INFO, "zfs_process_add: pool '%s' vdev '%s', phys '%s'" + " wholedisk %d, %s dm (guid %llu)", zpool_get_name(zhp), path, + physpath ? physpath : "NULL", wholedisk, is_dm ? "is" : "not", + (long long unsigned int)guid); + + /* + * The VDEV guid is preferred for identification (gets passed in path) + */ + if (guid != 0) { + (void) snprintf(fullpath, sizeof (fullpath), "%llu", + (long long unsigned int)guid); + } else { + /* + * otherwise use path sans partition suffix for whole disks + */ + (void) strlcpy(fullpath, path, sizeof (fullpath)); + if (wholedisk) { + char *spath = zfs_strip_partition(fullpath); + if (!spath) { + zed_log_msg(LOG_INFO, "%s: Can't alloc", + __func__); + return; + } + + (void) strlcpy(fullpath, spath, sizeof (fullpath)); + free(spath); + } + } + + /* + * Attempt to online the device. 
+ */ + if (zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE, &newstate) == 0 && + (newstate == VDEV_STATE_HEALTHY || + newstate == VDEV_STATE_DEGRADED)) { + zed_log_msg(LOG_INFO, " zpool_vdev_online: vdev %s is %s", + fullpath, (newstate == VDEV_STATE_HEALTHY) ? + "HEALTHY" : "DEGRADED"); + return; + } + + /* + * vdev_id alias rule for using scsi_debug devices (FMA automated + * testing) + */ + if (physpath != NULL && strcmp("scsidebug", physpath) == 0) + is_sd = B_TRUE; + + /* + * If the pool doesn't have the autoreplace property set, then use + * vdev online to trigger a FMA fault by posting an ereport. + */ + if (!zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOREPLACE, NULL) || + !(wholedisk || is_dm) || (physpath == NULL)) { + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + zed_log_msg(LOG_INFO, "Pool's autoreplace is not enabled or " + "not a whole disk for '%s'", fullpath); + return; + } + + /* + * Convert physical path into its current device node. Rawpath + * needs to be /dev/disk/by-vdev for a scsi_debug device since + * /dev/disk/by-path will not be present. + */ + (void) snprintf(rawpath, sizeof (rawpath), "%s%s", + is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath); + + if (realpath(rawpath, devpath) == NULL && !is_dm) { + zed_log_msg(LOG_INFO, " realpath: %s failed (%s)", + rawpath, strerror(errno)); + + (void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT, + &newstate); + + zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)", + fullpath, libzfs_error_description(g_zfshdl)); + return; + } + + /* Only autoreplace bad disks */ + if ((vs->vs_state != VDEV_STATE_DEGRADED) && + (vs->vs_state != VDEV_STATE_FAULTED) && + (vs->vs_state != VDEV_STATE_CANT_OPEN)) { + return; + } + + nvlist_lookup_string(vdev, "new_devid", &new_devid); + + if (is_dm) { + /* Don't label device mapper or multipath disks. */ + } else if (!labeled) { + /* + * we're auto-replacing a raw disk, so label it first + */ + char *leafname; + + /* + * If this is a request to label a whole disk, then attempt to + * write out the label. Before we can label the disk, we need + * to map the physical string that was matched on to the + * underlying device node. + * + * If any part of this process fails, then do a force online + * to trigger a ZFS fault for the device (and any hot spare + * replacement). + */ + leafname = strrchr(devpath, '/') + 1; + + if (zpool_label_disk(g_zfshdl, zhp, leafname) != 0) { + zed_log_msg(LOG_INFO, " zpool_label_disk: could not " + "label '%s' (%s)", leafname, + libzfs_error_description(g_zfshdl)); + + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + /* + * The disk labeling is asynchronous on Linux. Just record + * this label request and return as there will be another + * disk add event for the partition after the labeling is + * completed.
+ */ + device = malloc(sizeof (pendingdev_t)); + (void) strlcpy(device->pd_physpath, physpath, + sizeof (device->pd_physpath)); + list_insert_tail(&g_device_list, device); + + zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)", + leafname, (u_longlong_t)guid); + + return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */ + + } else /* labeled */ { + boolean_t found = B_FALSE; + /* + * match up with request above to label the disk + */ + for (device = list_head(&g_device_list); device != NULL; + device = list_next(&g_device_list, device)) { + if (strcmp(physpath, device->pd_physpath) == 0) { + list_remove(&g_device_list, device); + free(device); + found = B_TRUE; + break; + } + zed_log_msg(LOG_INFO, "zpool_label_disk: %s != %s", + physpath, device->pd_physpath); + } + if (!found) { + /* unexpected partition slice encountered */ + zed_log_msg(LOG_INFO, "labeled disk %s unexpected here", + fullpath); + (void) zpool_vdev_online(zhp, fullpath, + ZFS_ONLINE_FORCEFAULT, &newstate); + return; + } + + zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)", + physpath, (u_longlong_t)guid); + + (void) snprintf(devpath, sizeof (devpath), "%s%s", + DEV_BYID_PATH, new_devid); + } + + /* + * Construct the root vdev to pass to zpool_vdev_attach(). While adding + * the entire vdev structure is harmless, we construct a reduced set of + * path/physpath/wholedisk to keep it simple. + */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + return; + } + if (nvlist_alloc(&newvd, NV_UNIQUE_NAME, 0) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: nvlist_alloc out of memory"); + nvlist_free(nvroot); + return; + } + + if (nvlist_add_string(newvd, ZPOOL_CONFIG_TYPE, VDEV_TYPE_DISK) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_PATH, path) != 0 || + nvlist_add_string(newvd, ZPOOL_CONFIG_DEVID, new_devid) != 0 || + (physpath != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_PHYS_PATH, physpath) != 0) || + (enc_sysfs_path != NULL && nvlist_add_string(newvd, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, enc_sysfs_path) != 0) || + nvlist_add_uint64(newvd, ZPOOL_CONFIG_WHOLE_DISK, wholedisk) != 0 || + nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &newvd, + 1) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: unable to add nvlist pairs"); + nvlist_free(newvd); + nvlist_free(nvroot); + return; + } + + nvlist_free(newvd); + + /* + * Wait for udev to verify the links exist, then auto-replace + * the leaf disk at same physical location. + */ + if (zpool_label_disk_wait(path, 3000) != 0) { + zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement " + "disk %s is missing", path); + nvlist_free(nvroot); + return; + } + + ret = zpool_vdev_attach(zhp, fullpath, path, nvroot, B_TRUE, B_FALSE); + + zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)", + fullpath, path, (ret == 0) ? "no errors" : + libzfs_error_description(g_zfshdl)); + + nvlist_free(nvroot); +} + +/* + * Utility functions to find a vdev matching given criteria. 
+ */ +typedef struct dev_data { + const char *dd_compare; + const char *dd_prop; + zfs_process_func_t dd_func; + boolean_t dd_found; + boolean_t dd_islabeled; + uint64_t dd_pool_guid; + uint64_t dd_vdev_guid; + const char *dd_new_devid; +} dev_data_t; + +static void +zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data) +{ + dev_data_t *dp = data; + char *path = NULL; + uint_t c, children; + nvlist_t **child; + + /* + * First iterate over any children. + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + + /* + * Iterate over any spares and cache devices + */ + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + zfs_iter_vdev(zhp, child[c], data); + } + + /* once a vdev was matched and processed there is nothing left to do */ + if (dp->dd_found) + return; + + /* + * Match by GUID if available otherwise fallback to devid or physical + */ + if (dp->dd_vdev_guid != 0) { + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid != dp->dd_vdev_guid) { + return; + } + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched on %llu", guid); + dp->dd_found = B_TRUE; + + } else if (dp->dd_compare != NULL) { + /* + * NOTE: On Linux there is an event for partition, so unlike + * illumos, substring matching is not required to accommodate + * the partition suffix. An exact match will be present in + * the dp->dd_compare value. + */ + if (nvlist_lookup_string(nvl, dp->dd_prop, &path) != 0 || + strcmp(dp->dd_compare, path) != 0) + return; + + zed_log_msg(LOG_INFO, " zfs_iter_vdev: matched %s on %s", + dp->dd_prop, path); + dp->dd_found = B_TRUE; + + /* pass the new devid for use by replacing code */ + if (dp->dd_new_devid != NULL) { + (void) nvlist_add_string(nvl, "new_devid", + dp->dd_new_devid); + } + } + + (dp->dd_func)(zhp, nvl, dp->dd_islabeled); +} + +static void +zfs_enable_ds(void *arg) +{ + unavailpool_t *pool = (unavailpool_t *)arg; + + (void) zpool_enable_datasets(pool->uap_zhp, NULL, 0); + zpool_close(pool->uap_zhp); + free(pool); +} + +static int +zfs_iter_pool(zpool_handle_t *zhp, void *data) +{ + nvlist_t *config, *nvl; + dev_data_t *dp = data; + uint64_t pool_guid; + unavailpool_t *pool; + + zed_log_msg(LOG_INFO, "zfs_iter_pool: evaluating vdevs on %s (by %s)", + zpool_get_name(zhp), dp->dd_vdev_guid ? 
"GUID" : dp->dd_prop); + + /* + * For each vdev in this pool, look for a match to apply dd_func + */ + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + if (dp->dd_pool_guid == 0 || + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) == 0 && pool_guid == dp->dd_pool_guid)) { + (void) nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvl); + zfs_iter_vdev(zhp, nvl, data); + } + } + + /* + * if this pool was originally unavailable, + * then enable its datasets asynchronously + */ + if (g_enumeration_done) { + for (pool = list_head(&g_pool_list); pool != NULL; + pool = list_next(&g_pool_list, pool)) { + + if (strcmp(zpool_get_name(zhp), + zpool_get_name(pool->uap_zhp))) + continue; + if (zfs_toplevel_state(zhp) >= VDEV_STATE_DEGRADED) { + list_remove(&g_pool_list, pool); + (void) tpool_dispatch(g_tpool, zfs_enable_ds, + pool); + break; + } + } + } + + zpool_close(zhp); + return (dp->dd_found); /* cease iteration after a match */ +} + +/* + * Given a physical device location, iterate over all + * (pool, vdev) pairs which correspond to that location. + */ +static boolean_t +devphys_iter(const char *physical, const char *devid, zfs_process_func_t func, + boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = physical; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_PHYS_PATH; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; /* used by auto replace code */ + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Given a device identifier, find any vdevs with a matching devid. + * On Linux we can match devid directly which is always a whole disk. + */ +static boolean_t +devid_iter(const char *devid, zfs_process_func_t func, boolean_t is_slice) +{ + dev_data_t data = { 0 }; + + data.dd_compare = devid; + data.dd_func = func; + data.dd_prop = ZPOOL_CONFIG_DEVID; + data.dd_found = B_FALSE; + data.dd_islabeled = is_slice; + data.dd_new_devid = devid; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (data.dd_found); +} + +/* + * Handle a EC_DEV_ADD.ESC_DISK event. + * + * illumos + * Expects: DEV_PHYS_PATH string in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/dsk/c0t1d0s0' (persistent) + * devid: 'id1,sd@SATA_____Hitachi_HDS72101______JP2940HZ3H74MC/a' + * phys_path: '/pci@0,0/pci103c,1609@11/disk@1,0:a' + * + * linux + * provides: DEV_PHYS_PATH and DEV_IDENTIFIER strings in schema + * Matches: vdev's ZPOOL_CONFIG_PHYS_PATH or ZPOOL_CONFIG_DEVID + * + * path: '/dev/sdc1' (not persistent) + * devid: 'ata-SAMSUNG_HD204UI_S2HGJD2Z805891-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x4433221106000000-lun-0' + */ +static int +zfs_deliver_add(nvlist_t *nvl, boolean_t is_lofi) +{ + char *devpath = NULL, *devid; + boolean_t is_slice; + + /* + * Expecting a devid string and an optional physical location + */ + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &devid) != 0) + return (-1); + + (void) nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devpath); + + is_slice = (nvlist_lookup_boolean(nvl, DEV_IS_PART) == 0); + + zed_log_msg(LOG_INFO, "zfs_deliver_add: adding %s (%s) (is_slice %d)", + devid, devpath ? devpath : "NULL", is_slice); + + /* + * Iterate over all vdevs looking for a match in the following order: + * 1. ZPOOL_CONFIG_DEVID (identifies the unique disk) + * 2. ZPOOL_CONFIG_PHYS_PATH (identifies disk physical location). 
+ * + * For disks, we only want to pay attention to vdevs marked as whole + * disks or are a multipath device. + */ + if (!devid_iter(devid, zfs_process_add, is_slice) && devpath != NULL) + (void) devphys_iter(devpath, devid, zfs_process_add, is_slice); + + return (0); +} + +/* + * Called when we receive a VDEV_CHECK event, which indicates a device could not + * be opened during initial pool open, but the autoreplace property was set on + * the pool. In this case, we treat it as if it were an add event. + */ +static int +zfs_deliver_check(nvlist_t *nvl) +{ + dev_data_t data = { 0 }; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, + &data.dd_pool_guid) != 0 || + nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, + &data.dd_vdev_guid) != 0 || + data.dd_vdev_guid == 0) + return (0); + + zed_log_msg(LOG_INFO, "zfs_deliver_check: pool '%llu', vdev %llu", + data.dd_pool_guid, data.dd_vdev_guid); + + data.dd_func = zfs_process_add; + + (void) zpool_iter(g_zfshdl, zfs_iter_pool, &data); + + return (0); +} + +static int +zfsdle_vdev_online(zpool_handle_t *zhp, void *data) +{ + char *devname = data; + boolean_t avail_spare, l2cache; + nvlist_t *tgt; + int error; + + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: searching for '%s' in '%s'", + devname, zpool_get_name(zhp)); + + if ((tgt = zpool_find_vdev_by_physpath(zhp, devname, + &avail_spare, &l2cache, NULL)) != NULL) { + char *path, fullpath[MAXPATHLEN]; + uint64_t wholedisk; + + error = nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &path); + if (error) { + zpool_close(zhp); + return (0); + } + + error = nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (error) + wholedisk = 0; + + if (wholedisk) { + path = strrchr(path, '/'); + if (path != NULL) { + path = zfs_strip_partition(path + 1); + if (path == NULL) { + zpool_close(zhp); + return (0); + } + } else { + zpool_close(zhp); + return (0); + } + + (void) strlcpy(fullpath, path, sizeof (fullpath)); + free(path); + + /* + * We need to reopen the pool associated with this + * device so that the kernel can update the size of + * the expanded device. When expanding there is no + * need to restart the scrub from the beginning. + */ + boolean_t scrub_restart = B_FALSE; + (void) zpool_reopen_one(zhp, &scrub_restart); + } else { + (void) strlcpy(fullpath, path, sizeof (fullpath)); + } + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + vdev_state_t newstate; + + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL) { + error = zpool_vdev_online(zhp, fullpath, 0, + &newstate); + zed_log_msg(LOG_INFO, "zfsdle_vdev_online: " + "setting device '%s' to ONLINE state " + "in pool '%s': %d", fullpath, + zpool_get_name(zhp), error); + } + } + zpool_close(zhp); + return (1); + } + zpool_close(zhp); + return (0); +} + +/* + * This function handles the ESC_DEV_DLE device change event. Use the + * provided vdev guid when looking up a disk or partition, when the guid + * is not present assume the entire disk is owned by ZFS and append the + * expected -part1 partition information then lookup by physical path. 
+ */ +static int +zfs_deliver_dle(nvlist_t *nvl) +{ + char *devname, name[MAXPATHLEN]; + uint64_t guid; + + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &guid) == 0) { + sprintf(name, "%llu", (u_longlong_t)guid); + } else if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &devname) == 0) { + strlcpy(name, devname, MAXPATHLEN); + zfs_append_partition(name, MAXPATHLEN); + } else { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: no guid or physpath"); + /* nothing to search for; bail out before using 'name' */ + return (-1); + } + + if (zpool_iter(g_zfshdl, zfsdle_vdev_online, name) != 1) { + zed_log_msg(LOG_INFO, "zfs_deliver_dle: device '%s' not " + "found", name); + return (1); + } + + return (0); +} + +/* + * syseventd daemon module event handler + * + * Handles syseventd daemon zfs device related events: + * + * EC_DEV_ADD.ESC_DISK + * EC_DEV_STATUS.ESC_DEV_DLE + * EC_ZFS.ESC_ZFS_VDEV_CHECK + * + * Note: assumes only one thread active at a time (not thread safe) + */ +static int +zfs_slm_deliver_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + int ret; + boolean_t is_lofi = B_FALSE, is_check = B_FALSE, is_dle = B_FALSE; + + if (strcmp(class, EC_DEV_ADD) == 0) { + /* + * We're mainly interested in disk additions, but we also listen + * for new loop devices, to allow for simplified testing. + */ + if (strcmp(subclass, ESC_DISK) == 0) + is_lofi = B_FALSE; + else if (strcmp(subclass, ESC_LOFI) == 0) + is_lofi = B_TRUE; + else + return (0); + + is_check = B_FALSE; + } else if (strcmp(class, EC_ZFS) == 0 && + strcmp(subclass, ESC_ZFS_VDEV_CHECK) == 0) { + /* + * This event signifies that a device failed to open + * during pool load, but the 'autoreplace' property was + * set, so we should pretend it's just been added. + */ + is_check = B_TRUE; + } else if (strcmp(class, EC_DEV_STATUS) == 0 && + strcmp(subclass, ESC_DEV_DLE) == 0) { + is_dle = B_TRUE; + } else { + return (0); + } + + if (is_dle) + ret = zfs_deliver_dle(nvl); + else if (is_check) + ret = zfs_deliver_check(nvl); + else + ret = zfs_deliver_add(nvl, is_lofi); + + return (ret); +} + +/*ARGSUSED*/ +static void * +zfs_enum_pools(void *arg) +{ + (void) zpool_iter(g_zfshdl, zfs_unavail_pool, (void *)&g_pool_list); + /* + * Linux - instead of using a thread pool, each list entry + * will spawn a thread when an unavailable pool transitions + * to available. zfs_slm_fini will wait for these threads.
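To see how the dispatcher above is driven, here is a hypothetical harness (not part of this module) that synthesizes the kind of disk-add event udev normally delivers. It uses only the nvlist keys consumed by zfs_deliver_add() and the public zfs_slm_event() entry point defined at the end of this file; the devid and phys_path values are the illustrative strings from the EC_DEV_ADD comment earlier.

static void
example_fake_disk_add(void)
{
    nvlist_t *nvl = NULL;

    if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
        return;
    /* The two keys zfs_deliver_add() looks up. */
    (void) nvlist_add_string(nvl, DEV_IDENTIFIER,
        "ata-SAMSUNG_HD204UI_S2HGJD2Z805891");
    (void) nvlist_add_string(nvl, DEV_PHYS_PATH,
        "pci-0000:04:00.0-sas-0x4433221106000000-lun-0");
    /* Omitting DEV_IS_PART marks the event as a whole-disk add. */
    zfs_slm_event(EC_DEV_ADD, ESC_DISK, nvl);
    nvlist_free(nvl);
}

zfs_deliver_add() tries the devid against each pool's ZPOOL_CONFIG_DEVID first and only falls back to the physical path, which is why the harness supplies both.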
+ */ + g_enumeration_done = B_TRUE; + return (NULL); +} + +/* + * called from zed daemon at startup + * + * sent messages from zevents or udev monitor + * + * For now, each agent has its own libzfs instance + */ +int +zfs_slm_init() +{ + if ((g_zfshdl = libzfs_init()) == NULL) + return (-1); + + /* + * collect a list of unavailable pools (asynchronously, + * since this can take a while) + */ + list_create(&g_pool_list, sizeof (struct unavailpool), + offsetof(struct unavailpool, uap_node)); + + if (pthread_create(&g_zfs_tid, NULL, zfs_enum_pools, NULL) != 0) { + list_destroy(&g_pool_list); + libzfs_fini(g_zfshdl); + return (-1); + } + + list_create(&g_device_list, sizeof (struct pendingdev), + offsetof(struct pendingdev, pd_node)); + + return (0); +} + +void +zfs_slm_fini() +{ + unavailpool_t *pool; + pendingdev_t *device; + + /* wait for zfs_enum_pools thread to complete */ + (void) pthread_join(g_zfs_tid, NULL); + /* destroy the thread pool */ + if (g_tpool != NULL) { + tpool_wait(g_tpool); + tpool_destroy(g_tpool); + } + + while ((pool = (list_head(&g_pool_list))) != NULL) { + list_remove(&g_pool_list, pool); + zpool_close(pool->uap_zhp); + free(pool); + } + list_destroy(&g_pool_list); + + while ((device = (list_head(&g_device_list))) != NULL) { + list_remove(&g_device_list, device); + free(device); + } + list_destroy(&g_device_list); + + libzfs_fini(g_zfshdl); +} + +void +zfs_slm_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + zed_log_msg(LOG_INFO, "zfs_slm_event: %s.%s", class, subclass); + (void) zfs_slm_deliver_event(class, subclass, nvl); +} diff --git a/cmd/zed/agents/zfs_retire.c b/cmd/zed/agents/zfs_retire.c new file mode 100644 index 000000000000..9e95e20d5683 --- /dev/null +++ b/cmd/zed/agents/zfs_retire.c @@ -0,0 +1,557 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2016, Intel Corporation. + * Copyright (c) 2018, loli10K + */ + +/* + * The ZFS retire agent is responsible for managing hot spares across all pools. + * When we see a device fault or a device removal, we try to open the associated + * pool and look for any hot spares. We iterate over any available hot spares + * and attempt a 'zpool replace' for each one. + * + * For vdevs diagnosed as faulty, the agent is also responsible for proactively + * marking the vdev FAULTY (for I/O errors) or DEGRADED (for checksum errors). 
+ */ + +#include +#include +#include +#include +#include + +#include "zfs_agents.h" +#include "fmd_api.h" + + +typedef struct zfs_retire_repaired { + struct zfs_retire_repaired *zrr_next; + uint64_t zrr_pool; + uint64_t zrr_vdev; +} zfs_retire_repaired_t; + +typedef struct zfs_retire_data { + libzfs_handle_t *zrd_hdl; + zfs_retire_repaired_t *zrd_repaired; +} zfs_retire_data_t; + +static void +zfs_retire_clear_data(fmd_hdl_t *hdl, zfs_retire_data_t *zdp) +{ + zfs_retire_repaired_t *zrp; + + while ((zrp = zdp->zrd_repaired) != NULL) { + zdp->zrd_repaired = zrp->zrr_next; + fmd_hdl_free(hdl, zrp, sizeof (zfs_retire_repaired_t)); + } +} + +/* + * Find a pool with a matching GUID. + */ +typedef struct find_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; + nvlist_t *cb_vdev; +} find_cbdata_t; + +static int +find_pool(zpool_handle_t *zhp, void *data) +{ + find_cbdata_t *cbp = data; + + if (cbp->cb_guid == + zpool_get_prop_int(zhp, ZPOOL_PROP_GUID, NULL)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Find a vdev within a tree with a matching GUID. + */ +static nvlist_t * +find_vdev(libzfs_handle_t *zhdl, nvlist_t *nv, uint64_t search_guid) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + nvlist_t *ret; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + guid == search_guid) { + fmd_hdl_debug(fmd_module_hdl("zfs-retire"), + "matched vdev %llu", guid); + return (nv); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = find_vdev(zhdl, child[c], search_guid)) != NULL) + return (ret); + } + + return (NULL); +} + +/* + * Given a (pool, vdev) GUID pair, find the matching pool and vdev. + */ +static zpool_handle_t * +find_by_guid(libzfs_handle_t *zhdl, uint64_t pool_guid, uint64_t vdev_guid, + nvlist_t **vdevp) +{ + find_cbdata_t cb; + zpool_handle_t *zhp; + nvlist_t *config, *nvroot; + + /* + * Find the corresponding pool and make sure the vdev still exists. + */ + cb.cb_guid = pool_guid; + if (zpool_iter(zhdl, find_pool, &cb) != 1) + return (NULL); + + zhp = cb.cb_zhp; + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (vdev_guid != 0) { + if ((*vdevp = find_vdev(zhdl, nvroot, vdev_guid)) == NULL) { + zpool_close(zhp); + return (NULL); + } + } + + return (zhp); +} + +/* + * Given a vdev, attempt to replace it with every known spare until one + * succeeds or we run out of devices to try. + * Return whether we were successful or not in replacing the device. 
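Taken together, the lookup helpers above compose as in the following hypothetical fragment (not code from this file): resolve a (pool, vdev) GUID pair, fault the vdev, then fall back to a hot spare. Each call appears elsewhere in this agent; error handling and the surrounding declarations (hdl, zhdl, pool_guid, vdev_guid) are assumed.

    nvlist_t *vdev = NULL;
    zpool_handle_t *zhp;

    /* find_by_guid() returns an open pool handle on success. */
    if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, &vdev)) != NULL) {
        /* Mark the vdev FAULTED for "too many errors". */
        (void) zpool_vdev_fault(zhp, vdev_guid, VDEV_AUX_ERR_EXCEEDED);
        /* Walk the configured spares until one attaches. */
        (void) replace_with_spare(hdl, zhp, vdev);
        zpool_close(zhp);
    }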
+ */ +static boolean_t +replace_with_spare(fmd_hdl_t *hdl, zpool_handle_t *zhp, nvlist_t *vdev) +{ + nvlist_t *config, *nvroot, *replacement; + nvlist_t **spares; + uint_t s, nspares; + char *dev_name; + zprop_source_t source; + int ashift; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) != 0) + return (B_FALSE); + + /* + * Find out if there are any hot spares available in the pool. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) != 0) + return (B_FALSE); + + /* + * lookup "ashift" pool property, we may need it for the replacement + */ + ashift = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &source); + + replacement = fmd_nvl_alloc(hdl, FMD_SLEEP); + + (void) nvlist_add_string(replacement, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT); + + dev_name = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* + * Try to replace each spare, ending when we successfully + * replace it. + */ + for (s = 0; s < nspares; s++) { + char *spare_name; + + if (nvlist_lookup_string(spares[s], ZPOOL_CONFIG_PATH, + &spare_name) != 0) + continue; + + /* if set, add the "ashift" pool property to the spare nvlist */ + if (source != ZPROP_SRC_DEFAULT) + (void) nvlist_add_uint64(spares[s], + ZPOOL_CONFIG_ASHIFT, ashift); + + (void) nvlist_add_nvlist_array(replacement, + ZPOOL_CONFIG_CHILDREN, &spares[s], 1); + + fmd_hdl_debug(hdl, "zpool_vdev_replace '%s' with spare '%s'", + dev_name, basename(spare_name)); + + if (zpool_vdev_attach(zhp, dev_name, spare_name, + replacement, B_TRUE, B_FALSE) == 0) { + free(dev_name); + nvlist_free(replacement); + return (B_TRUE); + } + } + + free(dev_name); + nvlist_free(replacement); + + return (B_FALSE); +} + +/* + * Repair this vdev if we had diagnosed a 'fault.fs.zfs.device' and + * ASRU is now usable. ZFS has found the device to be present and + * functioning. + */ +/*ARGSUSED*/ +static void +zfs_vdev_repair(fmd_hdl_t *hdl, nvlist_t *nvl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + zfs_retire_repaired_t *zrp; + uint64_t pool_guid, vdev_guid; + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || nvlist_lookup_uint64(nvl, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, &vdev_guid) != 0) + return; + + /* + * Before checking the state of the ASRU, go through and see if we've + * already made an attempt to repair this ASRU. This list is cleared + * whenever we receive any kind of list event, and is designed to + * prevent us from generating a feedback loop when we attempt repairs + * against a faulted pool. The problem is that checking the unusable + * state of the ASRU can involve opening the pool, which can post + * statechange events but otherwise leave the pool in the faulted + * state. This list allows us to detect when a statechange event is + * due to our own request. 
+ */ + for (zrp = zdp->zrd_repaired; zrp != NULL; zrp = zrp->zrr_next) { + if (zrp->zrr_pool == pool_guid && + zrp->zrr_vdev == vdev_guid) + return; + } + + zrp = fmd_hdl_alloc(hdl, sizeof (zfs_retire_repaired_t), FMD_SLEEP); + zrp->zrr_next = zdp->zrd_repaired; + zrp->zrr_pool = pool_guid; + zrp->zrr_vdev = vdev_guid; + zdp->zrd_repaired = zrp; + + fmd_hdl_debug(hdl, "marking repaired vdev %llu on pool %llu", + vdev_guid, pool_guid); +} + +/*ARGSUSED*/ +static void +zfs_retire_recv(fmd_hdl_t *hdl, fmd_event_t *ep, nvlist_t *nvl, + const char *class) +{ + uint64_t pool_guid, vdev_guid; + zpool_handle_t *zhp; + nvlist_t *resource, *fault; + nvlist_t **faults; + uint_t f, nfaults; + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + libzfs_handle_t *zhdl = zdp->zrd_hdl; + boolean_t fault_device, degrade_device; + boolean_t is_repair; + char *scheme; + nvlist_t *vdev = NULL; + char *uuid; + int repair_done = 0; + boolean_t retire; + boolean_t is_disk; + vdev_aux_t aux; + uint64_t state = 0; + + fmd_hdl_debug(hdl, "zfs_retire_recv: '%s'", class); + + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, &state); + + /* + * If this is a resource notifying us of device removal then simply + * check for an available spare and continue unless the device is a + * l2arc vdev, in which case we just offline it. + */ + if (strcmp(class, "resource.fs.zfs.removed") == 0 || + (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_REMOVED)) { + char *devtype; + char *devname; + + if (nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + &vdev_guid) != 0) + return; + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + return; + + devname = zpool_vdev_name(NULL, zhp, vdev, B_FALSE); + + /* Can't replace l2arc with a spare: offline the device */ + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + &devtype) == 0 && strcmp(devtype, VDEV_TYPE_L2CACHE) == 0) { + fmd_hdl_debug(hdl, "zpool_vdev_offline '%s'", devname); + zpool_vdev_offline(zhp, devname, B_TRUE); + } else if (!fmd_prop_get_int32(hdl, "spare_on_remove") || + replace_with_spare(hdl, zhp, vdev) == B_FALSE) { + /* Could not handle with spare */ + fmd_hdl_debug(hdl, "no spare for '%s'", devname); + } + + free(devname); + zpool_close(zhp); + return; + } + + if (strcmp(class, FM_LIST_RESOLVED_CLASS) == 0) + return; + + /* + * Note: on zfsonlinux statechange events are more than just + * healthy ones so we need to confirm the actual state value. + */ + if (strcmp(class, "resource.fs.zfs.statechange") == 0 && + state == VDEV_STATE_HEALTHY) { + zfs_vdev_repair(hdl, nvl); + return; + } + if (strcmp(class, "sysevent.fs.zfs.vdev_remove") == 0) { + zfs_vdev_repair(hdl, nvl); + return; + } + + zfs_retire_clear_data(hdl, zdp); + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0) + is_repair = B_TRUE; + else + is_repair = B_FALSE; + + /* + * We subscribe to zfs faults as well as all repair events. + */ + if (nvlist_lookup_nvlist_array(nvl, FM_SUSPECT_FAULT_LIST, + &faults, &nfaults) != 0) + return; + + for (f = 0; f < nfaults; f++) { + fault = faults[f]; + + fault_device = B_FALSE; + degrade_device = B_FALSE; + is_disk = B_FALSE; + + if (nvlist_lookup_boolean_value(fault, FM_SUSPECT_RETIRE, + &retire) == 0 && retire == 0) + continue; + + /* + * While we subscribe to fault.fs.zfs.*, we only take action + * for faults targeting a specific vdev (open failure or SERD + * failure). 
We also subscribe to fault.io.* events, so that + * faulty disks will be faulted in the ZFS configuration. + */ + if (fmd_nvl_class_match(hdl, fault, "fault.fs.zfs.vdev.io")) { + fault_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.vdev.checksum")) { + degrade_device = B_TRUE; + } else if (fmd_nvl_class_match(hdl, fault, + "fault.fs.zfs.device")) { + fault_device = B_FALSE; + } else if (fmd_nvl_class_match(hdl, fault, "fault.io.*")) { + is_disk = B_TRUE; + fault_device = B_TRUE; + } else { + continue; + } + + if (is_disk) { + continue; + } else { + /* + * This is a ZFS fault. Lookup the resource, and + * attempt to find the matching vdev. + */ + if (nvlist_lookup_nvlist(fault, FM_FAULT_RESOURCE, + &resource) != 0 || + nvlist_lookup_string(resource, FM_FMRI_SCHEME, + &scheme) != 0) + continue; + + if (strcmp(scheme, FM_FMRI_SCHEME_ZFS) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_POOL, + &pool_guid) != 0) + continue; + + if (nvlist_lookup_uint64(resource, FM_FMRI_ZFS_VDEV, + &vdev_guid) != 0) { + if (is_repair) + vdev_guid = 0; + else + continue; + } + + if ((zhp = find_by_guid(zhdl, pool_guid, vdev_guid, + &vdev)) == NULL) + continue; + + aux = VDEV_AUX_ERR_EXCEEDED; + } + + if (vdev_guid == 0) { + /* + * For pool-level repair events, clear the entire pool. + */ + fmd_hdl_debug(hdl, "zpool_clear of pool '%s'", + zpool_get_name(zhp)); + (void) zpool_clear(zhp, NULL, NULL); + zpool_close(zhp); + continue; + } + + /* + * If this is a repair event, then mark the vdev as repaired and + * continue. + */ + if (is_repair) { + repair_done = 1; + fmd_hdl_debug(hdl, "zpool_clear of pool '%s' vdev %llu", + zpool_get_name(zhp), vdev_guid); + (void) zpool_vdev_clear(zhp, vdev_guid); + zpool_close(zhp); + continue; + } + + /* + * Actively fault the device if needed. + */ + if (fault_device) + (void) zpool_vdev_fault(zhp, vdev_guid, aux); + if (degrade_device) + (void) zpool_vdev_degrade(zhp, vdev_guid, aux); + + if (fault_device || degrade_device) + fmd_hdl_debug(hdl, "zpool_vdev_%s: vdev %llu on '%s'", + fault_device ? "fault" : "degrade", vdev_guid, + zpool_get_name(zhp)); + + /* + * Attempt to substitute a hot spare. 
+ */ + (void) replace_with_spare(hdl, zhp, vdev); + zpool_close(zhp); + } + + if (strcmp(class, FM_LIST_REPAIRED_CLASS) == 0 && repair_done && + nvlist_lookup_string(nvl, FM_SUSPECT_UUID, &uuid) == 0) + fmd_case_uuresolved(hdl, uuid); +} + +static const fmd_hdl_ops_t fmd_ops = { + zfs_retire_recv, /* fmdo_recv */ + NULL, /* fmdo_timeout */ + NULL, /* fmdo_close */ + NULL, /* fmdo_stats */ + NULL, /* fmdo_gc */ +}; + +static const fmd_prop_t fmd_props[] = { + { "spare_on_remove", FMD_TYPE_BOOL, "true" }, + { NULL, 0, NULL } +}; + +static const fmd_hdl_info_t fmd_info = { + "ZFS Retire Agent", "1.0", &fmd_ops, fmd_props +}; + +void +_zfs_retire_init(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp; + libzfs_handle_t *zhdl; + + if ((zhdl = libzfs_init()) == NULL) + return; + + if (fmd_hdl_register(hdl, FMD_API_VERSION, &fmd_info) != 0) { + libzfs_fini(zhdl); + return; + } + + zdp = fmd_hdl_zalloc(hdl, sizeof (zfs_retire_data_t), FMD_SLEEP); + zdp->zrd_hdl = zhdl; + + fmd_hdl_setspecific(hdl, zdp); +} + +void +_zfs_retire_fini(fmd_hdl_t *hdl) +{ + zfs_retire_data_t *zdp = fmd_hdl_getspecific(hdl); + + if (zdp != NULL) { + zfs_retire_clear_data(hdl, zdp); + libzfs_fini(zdp->zrd_hdl); + fmd_hdl_free(hdl, zdp, sizeof (zfs_retire_data_t)); + } +} diff --git a/cmd/zed/zed.c b/cmd/zed/zed.c new file mode 100644 index 000000000000..0784e3834733 --- /dev/null +++ b/cmd/zed/zed.c @@ -0,0 +1,306 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zed.h" +#include "zed_conf.h" +#include "zed_event.h" +#include "zed_file.h" +#include "zed_log.h" + +static volatile sig_atomic_t _got_exit = 0; +static volatile sig_atomic_t _got_hup = 0; + +/* + * Signal handler for SIGINT & SIGTERM. + */ +static void +_exit_handler(int signum) +{ + _got_exit = 1; +} + +/* + * Signal handler for SIGHUP. + */ +static void +_hup_handler(int signum) +{ + _got_hup = 1; +} + +/* + * Register signal handlers. + */ +static void +_setup_sig_handlers(void) +{ + struct sigaction sa; + + if (sigemptyset(&sa.sa_mask) < 0) + zed_log_die("Failed to initialize sigset"); + + sa.sa_flags = SA_RESTART; + sa.sa_handler = SIG_IGN; + + if (sigaction(SIGPIPE, &sa, NULL) < 0) + zed_log_die("Failed to ignore SIGPIPE"); + + sa.sa_handler = _exit_handler; + if (sigaction(SIGINT, &sa, NULL) < 0) + zed_log_die("Failed to register SIGINT handler"); + + if (sigaction(SIGTERM, &sa, NULL) < 0) + zed_log_die("Failed to register SIGTERM handler"); + + sa.sa_handler = _hup_handler; + if (sigaction(SIGHUP, &sa, NULL) < 0) + zed_log_die("Failed to register SIGHUP handler"); +} + +/* + * Lock all current and future pages in the virtual memory address space. + * Access to locked pages will never be delayed by a page fault. + * + * EAGAIN is tested up to max_tries in case this is a transient error. 
+ * + * Note that memory locks are not inherited by a child created via fork() + * and are automatically removed during an execve(). As such, this must + * be called after the daemon fork()s (when running in the background). + */ +static void +_lock_memory(void) +{ +#if HAVE_MLOCKALL + int i = 0; + const int max_tries = 10; + + for (i = 0; i < max_tries; i++) { + if (mlockall(MCL_CURRENT | MCL_FUTURE) == 0) { + zed_log_msg(LOG_INFO, "Locked all pages in memory"); + return; + } + if (errno != EAGAIN) + break; + } + zed_log_die("Failed to lock memory pages: %s", strerror(errno)); + +#else /* HAVE_MLOCKALL */ + zed_log_die("Failed to lock memory pages: mlockall() not supported"); +#endif /* HAVE_MLOCKALL */ +} + +/* + * Start daemonization of the process including the double fork(). + * + * The parent process will block here until _finish_daemonize() is called + * (in the grandchild process), at which point the parent process will exit. + * This prevents the parent process from exiting until initialization is + * complete. + */ +static void +_start_daemonize(void) +{ + pid_t pid; + struct sigaction sa; + + /* Create pipe for communicating with child during daemonization. */ + zed_log_pipe_open(); + + /* Background process and ensure child is not process group leader. */ + pid = fork(); + if (pid < 0) { + zed_log_die("Failed to create child process: %s", + strerror(errno)); + } else if (pid > 0) { + + /* Close writes since parent will only read from pipe. */ + zed_log_pipe_close_writes(); + + /* Wait for notification that daemonization is complete. */ + zed_log_pipe_wait(); + + zed_log_pipe_close_reads(); + _exit(EXIT_SUCCESS); + } + + /* Close reads since child will only write to pipe. */ + zed_log_pipe_close_reads(); + + /* Create independent session and detach from terminal. */ + if (setsid() < 0) + zed_log_die("Failed to create new session: %s", + strerror(errno)); + + /* Prevent child from terminating on HUP when session leader exits. */ + if (sigemptyset(&sa.sa_mask) < 0) + zed_log_die("Failed to initialize sigset"); + + sa.sa_flags = 0; + sa.sa_handler = SIG_IGN; + + if (sigaction(SIGHUP, &sa, NULL) < 0) + zed_log_die("Failed to ignore SIGHUP"); + + /* Ensure process cannot re-acquire terminal. */ + pid = fork(); + if (pid < 0) { + zed_log_die("Failed to create grandchild process: %s", + strerror(errno)); + } else if (pid > 0) { + _exit(EXIT_SUCCESS); + } +} + +/* + * Finish daemonization of the process by closing stdin/stdout/stderr. + * + * This must be called at the end of initialization after all external + * communication channels are established and accessible. + */ +static void +_finish_daemonize(void) +{ + int devnull; + + /* Preserve fd 0/1/2, but discard data to/from stdin/stdout/stderr. */ + devnull = open("/dev/null", O_RDWR); + if (devnull < 0) + zed_log_die("Failed to open /dev/null: %s", strerror(errno)); + + if (dup2(devnull, STDIN_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stdin: %s", + strerror(errno)); + + if (dup2(devnull, STDOUT_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stdout: %s", + strerror(errno)); + + if (dup2(devnull, STDERR_FILENO) < 0) + zed_log_die("Failed to dup /dev/null onto stderr: %s", + strerror(errno)); + + if ((devnull > STDERR_FILENO) && (close(devnull) < 0)) + zed_log_die("Failed to close /dev/null: %s", strerror(errno)); + + /* Notify parent that daemonization is complete. */ + zed_log_pipe_close_writes(); +} + +/* + * ZFS Event Daemon (ZED). 
+ */ +int +main(int argc, char *argv[]) +{ + struct zed_conf *zcp; + uint64_t saved_eid; + int64_t saved_etime[2]; + + zed_log_init(argv[0]); + zed_log_stderr_open(LOG_NOTICE); + zcp = zed_conf_create(); + zed_conf_parse_opts(zcp, argc, argv); + if (zcp->do_verbose) + zed_log_stderr_open(LOG_INFO); + + if (geteuid() != 0) + zed_log_die("Must be run as root"); + + zed_conf_parse_file(zcp); + + zed_file_close_from(STDERR_FILENO + 1); + + (void) umask(0); + + if (chdir("/") < 0) + zed_log_die("Failed to change to root directory"); + + if (zed_conf_scan_dir(zcp) < 0) + exit(EXIT_FAILURE); + + if (!zcp->do_foreground) { + _start_daemonize(); + zed_log_syslog_open(LOG_DAEMON); + } + _setup_sig_handlers(); + + if (zcp->do_memlock) + _lock_memory(); + + if ((zed_conf_write_pid(zcp) < 0) && (!zcp->do_force)) + exit(EXIT_FAILURE); + + if (!zcp->do_foreground) + _finish_daemonize(); + + zed_log_msg(LOG_NOTICE, + "ZFS Event Daemon %s-%s (PID %d)", + ZFS_META_VERSION, ZFS_META_RELEASE, (int)getpid()); + + if (zed_conf_open_state(zcp) < 0) + exit(EXIT_FAILURE); + + if (zed_conf_read_state(zcp, &saved_eid, saved_etime) < 0) + exit(EXIT_FAILURE); + +idle: + /* + * If -I is specified, attempt to open /dev/zfs repeatedly until + * successful. + */ + do { + if (!zed_event_init(zcp)) + break; + /* Wait for some time and try again. tunable? */ + sleep(30); + } while (!_got_exit && zcp->do_idle); + + if (_got_exit) + goto out; + + zed_event_seek(zcp, saved_eid, saved_etime); + + while (!_got_exit) { + int rv; + if (_got_hup) { + _got_hup = 0; + (void) zed_conf_scan_dir(zcp); + } + rv = zed_event_service(zcp); + + /* ENODEV: When kernel module is unloaded (osx) */ + if (rv == ENODEV) + break; + } + + zed_log_msg(LOG_NOTICE, "Exiting"); + zed_event_fini(zcp); + + if (zcp->do_idle && !_got_exit) + goto idle; + +out: + zed_conf_destroy(zcp); + zed_log_fini(); + exit(EXIT_SUCCESS); +} diff --git a/cmd/zed/zed.d/.gitignore b/cmd/zed/zed.d/.gitignore new file mode 100644 index 000000000000..46a00945aa7c --- /dev/null +++ b/cmd/zed/zed.d/.gitignore @@ -0,0 +1 @@ +history_event-zfs-list-cacher.sh diff --git a/cmd/zed/zed.d/Makefile.am b/cmd/zed/zed.d/Makefile.am new file mode 100644 index 000000000000..8b2d0c200286 --- /dev/null +++ b/cmd/zed/zed.d/Makefile.am @@ -0,0 +1,53 @@ +include $(top_srcdir)/config/Rules.am +include $(top_srcdir)/config/Substfiles.am + +EXTRA_DIST += README + +zedconfdir = $(sysconfdir)/zfs/zed.d + +dist_zedconf_DATA = \ + zed-functions.sh \ + zed.rc + +zedexecdir = $(zfsexecdir)/zed.d + +dist_zedexec_SCRIPTS = \ + all-debug.sh \ + all-syslog.sh \ + data-notify.sh \ + generic-notify.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh \ + trim_finish-notify.sh + +nodist_zedexec_SCRIPTS = history_event-zfs-list-cacher.sh + +SUBSTFILES += $(nodist_zedexec_SCRIPTS) + +zedconfdefaults = \ + all-syslog.sh \ + data-notify.sh \ + history_event-zfs-list-cacher.sh \ + resilver_finish-notify.sh \ + scrub_finish-notify.sh \ + statechange-led.sh \ + statechange-notify.sh \ + vdev_clear-led.sh \ + vdev_attach-led.sh \ + pool_import-led.sh \ + resilver_finish-start-scrub.sh + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zedconfdir)" + for f in $(zedconfdefaults); do \ + test -f "$(DESTDIR)$(zedconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zedconfdir)/$${f}" || \ + ln -s "$(zedexecdir)/$${f}" "$(DESTDIR)$(zedconfdir)"; \ + done + chmod 0600 
"$(DESTDIR)$(zedconfdir)/zed.rc" diff --git a/cmd/zed/zed.d/README b/cmd/zed/zed.d/README new file mode 100644 index 000000000000..7279b93704e2 --- /dev/null +++ b/cmd/zed/zed.d/README @@ -0,0 +1,30 @@ +Shell scripts are the recommended choice for ZEDLETs that mostly call +other utilities and do relatively little data manipulation. + +Shell scripts MUST work on both bash and dash. + +Shell scripts MUST run cleanly through ShellCheck: + http://www.shellcheck.net/ + +General functions reside in "zed-functions.sh". Use them where applicable. + +Additional references that may be of use: + + Google Shell Style Guide + https://github.com/google/styleguide/blob/gh-pages/shell.xml + + Dash as /bin/sh + https://wiki.ubuntu.com/DashAsBinSh + + Common shell script mistakes + http://www.pixelbeat.org/programming/shell_script_mistakes.html + + Filenames and Pathnames in Shell: How to do it Correctly + http://www.dwheeler.com/essays/filenames-in-shell.html + + Autoconf: Portable Shell Programming + https://www.gnu.org/software/autoconf/manual/autoconf.html#Portable-Shell + +Please BE CONSISTENT with the existing style, check for errors, +minimize dependencies where possible, try to be portable, +and comment anything non-obvious. Festina lente. diff --git a/cmd/zed/zed.d/all-debug.sh b/cmd/zed/zed.d/all-debug.sh new file mode 100755 index 000000000000..14b39caacd9d --- /dev/null +++ b/cmd/zed/zed.d/all-debug.sh @@ -0,0 +1,26 @@ +#!/bin/sh +# +# Log all environment variables to ZED_DEBUG_LOG. +# +# This can be a useful aid when developing/debugging ZEDLETs since it shows the +# environment variables defined for each zevent. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +: "${ZED_DEBUG_LOG:="${TMPDIR:="/tmp"}/zed.debug.log"}" + +zed_exit_if_ignoring_this_event + +lockfile="$(basename -- "${ZED_DEBUG_LOG}").lock" + +umask 077 +zed_lock "${lockfile}" +exec >> "${ZED_DEBUG_LOG}" + +printenv | sort +echo + +exec >&- +zed_unlock "${lockfile}" +exit 0 diff --git a/cmd/zed/zed.d/all-syslog.sh b/cmd/zed/zed.d/all-syslog.sh new file mode 100755 index 000000000000..cb9286500136 --- /dev/null +++ b/cmd/zed/zed.d/all-syslog.sh @@ -0,0 +1,14 @@ +#!/bin/sh +# +# Log the zevent via syslog. + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +zed_exit_if_ignoring_this_event + +zed_log_msg "eid=${ZEVENT_EID}" "class=${ZEVENT_SUBCLASS}" \ + "${ZEVENT_POOL_GUID:+"pool_guid=${ZEVENT_POOL_GUID}"}" \ + "${ZEVENT_VDEV_PATH:+"vdev_path=${ZEVENT_VDEV_PATH}"}" \ + "${ZEVENT_VDEV_STATE_STR:+"vdev_state=${ZEVENT_VDEV_STATE_STR}"}" +exit 0 diff --git a/cmd/zed/zed.d/data-notify.sh b/cmd/zed/zed.d/data-notify.sh new file mode 100755 index 000000000000..639b459bdd3b --- /dev/null +++ b/cmd/zed/zed.d/data-notify.sh @@ -0,0 +1,43 @@ +#!/bin/sh +# +# Send notification in response to a DATA error. +# +# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given +# class/pool/[vdev] combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool/[vdev]. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 +[ -n "${ZED_NOTIFY_DATA}" ] || exit 3 + +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_VDEV_GUID:-0};${ZEVENT_SUBCLASS};notify" +zed_rate_limit "${rate_limit_tag}" || exit 3 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} error for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has detected a data error:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + echo " error: ${ZEVENT_ZIO_ERR}" + echo " objid: ${ZEVENT_ZIO_OBJSET}:${ZEVENT_ZIO_OBJECT}" + echo " pool: ${ZEVENT_POOL}" +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/generic-notify.sh b/cmd/zed/zed.d/generic-notify.sh new file mode 100755 index 000000000000..e438031a088a --- /dev/null +++ b/cmd/zed/zed.d/generic-notify.sh @@ -0,0 +1,54 @@ +#!/bin/sh +# +# Send notification in response to a given zevent. +# +# This is a generic script than can be symlinked to a file in the +# enabled-zedlets directory to have a notification sent when a particular +# class of zevents occurs. The symlink filename must begin with the zevent +# (sub)class string (e.g., "probe_failure-notify.sh" for the "probe_failure" +# subclass). Refer to the zed(8) manpage for details. +# +# Only one notification per ZED_NOTIFY_INTERVAL_SECS will be sent for a given +# class/pool combination. This protects against spamming the recipient +# should multiple events occur together in time for the same pool. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +# Rate-limit the notification based in part on the filename. +# +rate_limit_tag="${ZEVENT_POOL};${ZEVENT_SUBCLASS};$(basename -- "$0")" +rate_limit_interval="${ZED_NOTIFY_INTERVAL_SECS}" +zed_rate_limit "${rate_limit_tag}" "${rate_limit_interval}" || exit 3 + +umask 077 +pool_str="${ZEVENT_POOL:+" for ${ZEVENT_POOL}"}" +host_str=" on $(hostname)" +note_subject="ZFS ${ZEVENT_SUBCLASS} event${pool_str}${host_str}" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has posted the following event:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + + [ -n "${ZEVENT_POOL}" ] && [ -x "${ZPOOL}" ] \ + && "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? 
+rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in new file mode 100755 index 000000000000..053b4414a768 --- /dev/null +++ b/cmd/zed/zed.d/history_event-zfs-list-cacher.sh.in @@ -0,0 +1,85 @@ +#!/bin/sh +# +# Track changes to enumerated pools for use in early-boot +set -ef + +FSLIST_DIR="@sysconfdir@/zfs/zfs-list.cache" +FSLIST_TMP="@runstatedir@/zfs-list.cache.new" +FSLIST="${FSLIST_DIR}/${ZEVENT_POOL}" + +# If the pool specific cache file is not writeable, abort +[ -w "${FSLIST}" ] || exit 0 + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +zed_exit_if_ignoring_this_event +zed_check_cmd "${ZFS}" sort diff grep + +# If we are acting on a snapshot, we have nothing to do +printf '%s' "${ZEVENT_HISTORY_DSNAME}" | grep '@' && exit 0 + +# We obtain a lock on zfs-list to avoid any simultaneous writes. +# If we run into trouble, log and drop the lock +abort_alter() { + zed_log_msg "Error updating zfs-list.cache!" + zed_unlock zfs-list +} + +finished() { + zed_unlock zfs-list + trap - EXIT + exit 0 +} + +case "${ZEVENT_HISTORY_INTERNAL_NAME}" in + create|"finish receiving"|import|destroy|rename) + ;; + + export) + zed_lock zfs-list + trap abort_alter EXIT + echo > "${FSLIST}" + finished + ;; + + set|inherit) + # Only act if one of the tracked properties is altered. + case "${ZEVENT_HISTORY_INTERNAL_STR%%=*}" in + canmount|mountpoint|atime|relatime|devices|exec|readonly| \ + setuid|nbmand|encroot|keylocation|org.openzfs.systemd:requires| \ + org.openzfs.systemd:requires-mounts-for| \ + org.openzfs.systemd:before|org.openzfs.systemd:after| \ + org.openzfs.systemd:wanted-by|org.openzfs.systemd:required-by| \ + org.openzfs.systemd:nofail|org.openzfs.systemd:ignore \ + ) ;; + *) exit 0 ;; + esac + ;; + + *) + # Ignore all other events. 
+ exit 0 + ;; +esac + +zed_lock zfs-list +trap abort_alter EXIT + +PROPS="name,mountpoint,canmount,atime,relatime,devices,exec\ +,readonly,setuid,nbmand,encroot,keylocation\ +,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for\ +,org.openzfs.systemd:before,org.openzfs.systemd:after\ +,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by\ +,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore" + +"${ZFS}" list -H -t filesystem -o $PROPS -r "${ZEVENT_POOL}" > "${FSLIST_TMP}" + +# Sort the output so that it is stable +sort "${FSLIST_TMP}" -o "${FSLIST_TMP}" + +# Don't modify the file if it hasn't changed +diff -q "${FSLIST_TMP}" "${FSLIST}" || mv "${FSLIST_TMP}" "${FSLIST}" +rm -f "${FSLIST_TMP}" + +finished diff --git a/cmd/zed/zed.d/pool_import-led.sh b/cmd/zed/zed.d/pool_import-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/cmd/zed/zed.d/pool_import-led.sh @@ -0,0 +1 @@ +statechange-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/resilver_finish-notify.sh b/cmd/zed/zed.d/resilver_finish-notify.sh new file mode 120000 index 000000000000..e4c56bc5f816 --- /dev/null +++ b/cmd/zed/zed.d/resilver_finish-notify.sh @@ -0,0 +1 @@ +scrub_finish-notify.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/resilver_finish-start-scrub.sh b/cmd/zed/zed.d/resilver_finish-start-scrub.sh new file mode 100755 index 000000000000..c7cfd1ddba80 --- /dev/null +++ b/cmd/zed/zed.d/resilver_finish-start-scrub.sh @@ -0,0 +1,19 @@ +#!/bin/sh +# resilver_finish-start-scrub.sh +# Run a scrub after a resilver +# +# Exit codes: +# 1: Internal error +# 2: Script wasn't enabled in zed.rc +# 3: Scrubs are automatically started for sequential resilvers +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ "${ZED_SCRUB_AFTER_RESILVER}" = "1" ] || exit 2 +[ "${ZEVENT_RESILVER_TYPE}" != "sequential" ] || exit 3 +[ -n "${ZEVENT_POOL}" ] || exit 1 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 1 +zed_check_cmd "${ZPOOL}" || exit 1 + +zed_log_msg "Starting scrub after resilver on ${ZEVENT_POOL}" +"${ZPOOL}" scrub "${ZEVENT_POOL}" diff --git a/cmd/zed/zed.d/scrub_finish-notify.sh b/cmd/zed/zed.d/scrub_finish-notify.sh new file mode 100755 index 000000000000..2145a100a3fa --- /dev/null +++ b/cmd/zed/zed.d/scrub_finish-notify.sh @@ -0,0 +1,59 @@ +#!/bin/sh +# +# Send notification in response to a RESILVER_FINISH or SCRUB_FINISH. +# +# By default, "zpool status" output will only be included for a scrub_finish +# zevent if the pool is not healthy; to always include its output, set +# ZED_NOTIFY_VERBOSE=1. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: notification suppressed +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +if [ "${ZEVENT_SUBCLASS}" = "resilver_finish" ]; then + action="resilver" +elif [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then + action="scrub" +else + zed_log_err "unsupported event class \"${ZEVENT_SUBCLASS}\"" + exit 9 +fi + +zed_check_cmd "${ZPOOL}" || exit 9 + +# For scrub, suppress notification if the pool is healthy +# and verbosity is not enabled. 
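+# (Setting ZED_NOTIFY_VERBOSE=1 in zed.rc sends the report even when
+# "zpool status -x" considers the pool healthy.)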
+# +if [ "${ZEVENT_SUBCLASS}" = "scrub_finish" ]; then + healthy="$("${ZPOOL}" status -x "${ZEVENT_POOL}" \ + | grep "'${ZEVENT_POOL}' is healthy")" + [ -n "${healthy}" ] && [ "${ZED_NOTIFY_VERBOSE}" -eq 0 ] && exit 3 +fi + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has finished a ${action}:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh new file mode 100755 index 000000000000..e656e125d378 --- /dev/null +++ b/cmd/zed/zed.d/statechange-led.sh @@ -0,0 +1,177 @@ +#!/bin/sh +# +# Turn off/on the VDEV's enclosure fault LEDs when the pool's state changes. +# +# Turn the VDEV's fault LED on if it becomes FAULTED, DEGRADED or UNAVAIL. +# Turn the LED off when it's back ONLINE again. +# +# This script run in two basic modes: +# +# 1. If $ZEVENT_VDEV_ENC_SYSFS_PATH and $ZEVENT_VDEV_STATE_STR are set, then +# only set the LED for that particular VDEV. This is the case for statechange +# events and some vdev_* events. +# +# 2. If those vars are not set, then check the state of all VDEVs in the pool +# and set the LEDs accordingly. This is the case for pool_import events. +# +# Note that this script requires that your enclosure be supported by the +# Linux SCSI enclosure services (ses) driver. The script will do nothing +# if you have no enclosure, or if your enclosure isn't supported. +# +# Exit codes: +# 0: enclosure led successfully set +# 1: enclosure leds not available +# 2: enclosure leds administratively disabled +# 3: The led sysfs path passed from ZFS does not exist +# 4: $ZPOOL not set +# 5: awk is not installed + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +if [ ! -d /sys/class/enclosure ] ; then + exit 1 +fi + +if [ "${ZED_USE_ENCLOSURE_LEDS}" != "1" ] ; then + exit 2 +fi + +zed_check_cmd "$ZPOOL" || exit 4 +zed_check_cmd awk || exit 5 + +# Global used in set_led debug print +vdev="" + +# check_and_set_led (file, val) +# +# Read an enclosure sysfs file, and write it if it's not already set to 'val' +# +# Arguments +# file: sysfs file to set (like /sys/class/enclosure/0:0:1:0/SLOT 10/fault) +# val: value to set it to +# +# Return +# 0 on success, 3 on missing sysfs path +# +check_and_set_led() +{ + file="$1" + val="$2" + + if [ ! -e "$file" ] ; then + return 3 + fi + + # If another process is accessing the LED when we attempt to update it, + # the update will be lost so retry until the LED actually changes or we + # timeout. + for _ in $(seq 1 5); do + # We want to check the current state first, since writing to the + # 'fault' entry always causes a SES command, even if the + # current state is already what you want. + current=$(cat "${file}") + + # On some enclosures if you write 1 to fault, and read it back, + # it will return 2. Treat all non-zero values as 1 for + # simplicity. 
+ if [ "$current" != "0" ] ; then + current=1 + fi + + if [ "$current" != "$val" ] ; then + echo "$val" > "$file" + zed_log_msg "vdev $vdev set '$file' LED to $val" + else + break + fi + done +} + +state_to_val() +{ + state="$1" + if [ "$state" = "FAULTED" ] || [ "$state" = "DEGRADED" ] || \ + [ "$state" = "UNAVAIL" ] ; then + echo 1 + elif [ "$state" = "ONLINE" ] ; then + echo 0 + fi +} + +# process_pool ([pool]) +# +# Iterate through a pool (or pools) and set the VDEV's enclosure slot LEDs to +# the VDEV's state. +# +# Arguments +# pool: Optional pool name. If not specified, iterate though all pools. +# +# Return +# 0 on success, 3 on missing sysfs path +# +process_pool() +{ + pool="$1" + rc=0 + + # Lookup all the current LED values and paths in parallel + #shellcheck disable=SC2016 + cmd='echo led_token=$(cat "$VDEV_ENC_SYSFS_PATH/fault"),"$VDEV_ENC_SYSFS_PATH",' + out=$($ZPOOL status -vc "$cmd" "$pool" | grep 'led_token=') + + #shellcheck disable=SC2034 + echo "$out" | while read -r vdev state read write chksum therest; do + # Read out current LED value and path + tmp=$(echo "$therest" | sed 's/^.*led_token=//g') + vdev_enc_sysfs_path=$(echo "$tmp" | awk -F ',' '{print $2}') + current_val=$(echo "$tmp" | awk -F ',' '{print $1}') + + if [ "$current_val" != "0" ] ; then + current_val=1 + fi + + if [ -z "$vdev_enc_sysfs_path" ] ; then + # Skip anything with no sysfs LED entries + continue + fi + + if [ ! -e "$vdev_enc_sysfs_path/fault" ] ; then + #shellcheck disable=SC2030 + rc=1 + zed_log_msg "vdev $vdev '$file/fault' doesn't exist" + continue; + fi + + val=$(state_to_val "$state") + + if [ "$current_val" = "$val" ] ; then + # LED is already set correctly + continue; + fi + + if ! check_and_set_led "$vdev_enc_sysfs_path/fault" "$val"; then + rc=1 + fi + + done + + #shellcheck disable=SC2031 + if [ "$rc" = "0" ] ; then + return 0 + else + # We didn't see a sysfs entry that we wanted to set + return 3 + fi +} + +if [ -n "$ZEVENT_VDEV_ENC_SYSFS_PATH" ] && [ -n "$ZEVENT_VDEV_STATE_STR" ] ; then + # Got a statechange for an individual VDEV + val=$(state_to_val "$ZEVENT_VDEV_STATE_STR") + vdev=$(basename "$ZEVENT_VDEV_PATH") + check_and_set_led "$ZEVENT_VDEV_ENC_SYSFS_PATH/fault" "$val" +else + # Process the entire pool + poolname=$(zed_guid_to_pool "$ZEVENT_POOL_GUID") + process_pool "$poolname" +fi diff --git a/cmd/zed/zed.d/statechange-notify.sh b/cmd/zed/zed.d/statechange-notify.sh new file mode 100755 index 000000000000..f46080a03239 --- /dev/null +++ b/cmd/zed/zed.d/statechange-notify.sh @@ -0,0 +1,74 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License Version 1.0 (CDDL-1.0). +# You can obtain a copy of the license from the top-level file +# "OPENSOLARIS.LICENSE" or at . +# You may not use this file except in compliance with the license. +# +# CDDL HEADER END +# + +# +# Send notification in response to a fault induced statechange +# +# ZEVENT_SUBCLASS: 'statechange' +# ZEVENT_VDEV_STATE_STR: 'DEGRADED', 'FAULTED' or 'REMOVED' +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 3: statechange not relevant +# 4: statechange string missing (unexpected) + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. 
"${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_VDEV_STATE_STR}" ] || exit 4 + +if [ "${ZEVENT_VDEV_STATE_STR}" != "FAULTED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "DEGRADED" ] \ + && [ "${ZEVENT_VDEV_STATE_STR}" != "REMOVED" ]; then + exit 3 +fi + +umask 077 +note_subject="ZFS device fault for pool ${ZEVENT_POOL_GUID} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + if [ "${ZEVENT_VDEV_STATE_STR}" = "FAULTED" ] ; then + echo "The number of I/O errors associated with a ZFS device exceeded" + echo "acceptable levels. ZFS has marked the device as faulted." + elif [ "${ZEVENT_VDEV_STATE_STR}" = "DEGRADED" ] ; then + echo "The number of checksum errors associated with a ZFS device" + echo "exceeded acceptable levels. ZFS has marked the device as" + echo "degraded." + else + echo "ZFS has detected that a device was removed." + fi + + echo + echo " impact: Fault tolerance of the pool may be compromised." + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " state: ${ZEVENT_VDEV_STATE_STR}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + [ -n "${ZEVENT_VDEV_TYPE}" ] && echo " vtype: ${ZEVENT_VDEV_TYPE}" + [ -n "${ZEVENT_VDEV_PATH}" ] && echo " vpath: ${ZEVENT_VDEV_PATH}" + [ -n "${ZEVENT_VDEV_PHYSPATH}" ] && echo " vphys: ${ZEVENT_VDEV_PHYSPATH}" + [ -n "${ZEVENT_VDEV_GUID}" ] && echo " vguid: ${ZEVENT_VDEV_GUID}" + [ -n "${ZEVENT_VDEV_DEVID}" ] && echo " devid: ${ZEVENT_VDEV_DEVID}" + + echo " pool: ${ZEVENT_POOL_GUID}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? + +rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/trim_finish-notify.sh b/cmd/zed/zed.d/trim_finish-notify.sh new file mode 100755 index 000000000000..5075302997e3 --- /dev/null +++ b/cmd/zed/zed.d/trim_finish-notify.sh @@ -0,0 +1,37 @@ +#!/bin/sh +# +# Send notification in response to a TRIM_FINISH. The event +# will be received for each vdev in the pool which was trimmed. +# +# Exit codes: +# 0: notification sent +# 1: notification failed +# 2: notification not configured +# 9: internal error + +[ -f "${ZED_ZEDLET_DIR}/zed.rc" ] && . "${ZED_ZEDLET_DIR}/zed.rc" +. "${ZED_ZEDLET_DIR}/zed-functions.sh" + +[ -n "${ZEVENT_POOL}" ] || exit 9 +[ -n "${ZEVENT_SUBCLASS}" ] || exit 9 + +zed_check_cmd "${ZPOOL}" || exit 9 + +umask 077 +note_subject="ZFS ${ZEVENT_SUBCLASS} event for ${ZEVENT_POOL} on $(hostname)" +note_pathname="${TMPDIR:="/tmp"}/$(basename -- "$0").${ZEVENT_EID}.$$" +{ + echo "ZFS has finished a trim:" + echo + echo " eid: ${ZEVENT_EID}" + echo " class: ${ZEVENT_SUBCLASS}" + echo " host: $(hostname)" + echo " time: ${ZEVENT_TIME_STRING}" + + "${ZPOOL}" status -t "${ZEVENT_POOL}" + +} > "${note_pathname}" + +zed_notify "${note_subject}" "${note_pathname}"; rv=$? 
+rm -f "${note_pathname}" +exit "${rv}" diff --git a/cmd/zed/zed.d/vdev_attach-led.sh b/cmd/zed/zed.d/vdev_attach-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/cmd/zed/zed.d/vdev_attach-led.sh @@ -0,0 +1 @@ +statechange-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/vdev_clear-led.sh b/cmd/zed/zed.d/vdev_clear-led.sh new file mode 120000 index 000000000000..7d7404398a4a --- /dev/null +++ b/cmd/zed/zed.d/vdev_clear-led.sh @@ -0,0 +1 @@ +statechange-led.sh \ No newline at end of file diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh new file mode 100755 index 000000000000..44a9b8d23303 --- /dev/null +++ b/cmd/zed/zed.d/zed-functions.sh @@ -0,0 +1,538 @@ +#!/bin/sh +# shellcheck disable=SC2039 +# zed-functions.sh +# +# ZED helper functions for use in ZEDLETs + + +# Variable Defaults +# +: "${ZED_LOCKDIR:="/var/lock"}" +: "${ZED_NOTIFY_INTERVAL_SECS:=3600}" +: "${ZED_NOTIFY_VERBOSE:=0}" +: "${ZED_RUNDIR:="/var/run"}" +: "${ZED_SYSLOG_PRIORITY:="daemon.notice"}" +: "${ZED_SYSLOG_TAG:="zed"}" + +ZED_FLOCK_FD=8 + + +# zed_check_cmd (cmd, ...) +# +# For each argument given, search PATH for the executable command [cmd]. +# Log a message if [cmd] is not found. +# +# Arguments +# cmd: name of executable command for which to search +# +# Return +# 0 if all commands are found in PATH and are executable +# n for a count of the command executables that are not found +# +zed_check_cmd() +{ + local cmd + local rv=0 + + for cmd; do + if ! command -v "${cmd}" >/dev/null 2>&1; then + zed_log_err "\"${cmd}\" not installed" + rv=$((rv + 1)) + fi + done + return "${rv}" +} + + +# zed_log_msg (msg, ...) +# +# Write all argument strings to the system log. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# +# Return +# nothing +# +zed_log_msg() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "$@" +} + + +# zed_log_err (msg, ...) +# +# Write an error message to the system log. This message will contain the +# script name, EID, and all argument strings. +# +# Globals +# ZED_SYSLOG_PRIORITY +# ZED_SYSLOG_TAG +# ZEVENT_EID +# +# Return +# nothing +# +zed_log_err() +{ + logger -p "${ZED_SYSLOG_PRIORITY}" -t "${ZED_SYSLOG_TAG}" -- "error:" \ + "$(basename -- "$0"):""${ZEVENT_EID:+" eid=${ZEVENT_EID}:"}" "$@" +} + + +# zed_lock (lockfile, [fd]) +# +# Obtain an exclusive (write) lock on [lockfile]. If the lock cannot be +# immediately acquired, wait until it becomes available. +# +# Every zed_lock() must be paired with a corresponding zed_unlock(). +# +# By default, flock-style locks associate the lockfile with file descriptor 8. +# The bash manpage warns that file descriptors >9 should be used with care as +# they may conflict with file descriptors used internally by the shell. File +# descriptor 9 is reserved for zed_rate_limit(). If concurrent locks are held +# within the same process, they must use different file descriptors (preferably +# decrementing from 8); otherwise, obtaining a new lock with a given file +# descriptor will release the previous lock associated with that descriptor. +# +# Arguments +# lockfile: pathname of the lock file; the lock will be stored in +# ZED_LOCKDIR unless the pathname contains a "/". 
+# fd: integer for the file descriptor used by flock (OPTIONAL unless holding +# concurrent locks) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_lock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local umask_bak + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + umask_bak="$(umask)" + umask 077 + + # Obtain a lock on the file bound to the given file descriptor. + # + eval "exec ${fd}> '${lockfile}'" + err="$(flock --exclusive "${fd}" 2>&1)" + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + zed_log_err "failed to lock \"${lockfile}\": ${err}" + fi + + umask "${umask_bak}" +} + + +# zed_unlock (lockfile, [fd]) +# +# Release the lock on [lockfile]. +# +# Arguments +# lockfile: pathname of the lock file +# fd: integer for the file descriptor used by flock (must match the file +# descriptor passed to the zed_lock function call) +# +# Globals +# ZED_FLOCK_FD +# ZED_LOCKDIR +# +# Return +# nothing +# +zed_unlock() +{ + local lockfile="$1" + local fd="${2:-${ZED_FLOCK_FD}}" + local err + + [ -n "${lockfile}" ] || return + if ! expr "${lockfile}" : '.*/' >/dev/null 2>&1; then + lockfile="${ZED_LOCKDIR}/${lockfile}" + fi + + # Release the lock and close the file descriptor. + err="$(flock --unlock "${fd}" 2>&1)" + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + zed_log_err "failed to unlock \"${lockfile}\": ${err}" + fi + eval "exec ${fd}>&-" +} + + +# zed_notify (subject, pathname) +# +# Send a notification via all available methods. +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Return +# 0: notification succeeded via at least one method +# 1: notification failed +# 2: no notification methods configured +# +zed_notify() +{ + local subject="$1" + local pathname="$2" + local num_success=0 + local num_failure=0 + + zed_notify_email "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_pushbullet "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + zed_notify_slack_webhook "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + + [ "${num_success}" -gt 0 ] && return 0 + [ "${num_failure}" -gt 0 ] && return 1 + return 2 +} + + +# zed_notify_email (subject, pathname) +# +# Send a notification via email to the address specified by ZED_EMAIL_ADDR. +# +# Requires the mail executable to be installed in the standard PATH, or +# ZED_EMAIL_PROG to be defined with the pathname of an executable capable of +# reading a message body from stdin. +# +# Command-line options to the mail executable can be specified in +# ZED_EMAIL_OPTS. 
This undergoes the following keyword substitutions: +# - @ADDRESS@ is replaced with the space-delimited recipient email address(es) +# - @SUBJECT@ is replaced with the notification subject +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_EMAIL_PROG +# ZED_EMAIL_OPTS +# ZED_EMAIL_ADDR +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_email() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + + : "${ZED_EMAIL_PROG:="mail"}" + : "${ZED_EMAIL_OPTS:="-s '@SUBJECT@' @ADDRESS@"}" + + # For backward compatibility with ZED_EMAIL. + if [ -n "${ZED_EMAIL}" ] && [ -z "${ZED_EMAIL_ADDR}" ]; then + ZED_EMAIL_ADDR="${ZED_EMAIL}" + fi + [ -n "${ZED_EMAIL_ADDR}" ] || return 2 + + zed_check_cmd "${ZED_EMAIL_PROG}" || return 1 + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err \ + "$(basename "${ZED_EMAIL_PROG}") cannot read \"${pathname}\"" + return 1 + fi + + ZED_EMAIL_OPTS="$(echo "${ZED_EMAIL_OPTS}" \ + | sed -e "s/@ADDRESS@/${ZED_EMAIL_ADDR}/g" \ + -e "s/@SUBJECT@/${subject}/g")" + + # shellcheck disable=SC2086 + eval "${ZED_EMAIL_PROG}" ${ZED_EMAIL_OPTS} < "${pathname}" >/dev/null 2>&1 + rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "$(basename "${ZED_EMAIL_PROG}") exit=${rv}" + return 1 + fi + return 0 +} + + +# zed_notify_pushbullet (subject, pathname) +# +# Send a notification via Pushbullet . +# The access token (ZED_PUSHBULLET_ACCESS_TOKEN) identifies this client to the +# Pushbullet server. The optional channel tag (ZED_PUSHBULLET_CHANNEL_TAG) is +# for pushing to notification feeds that can be subscribed to; if a channel is +# not defined, push notifications will instead be sent to all devices +# associated with the account specified by the access token. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. +# +# References +# https://docs.pushbullet.com/ +# https://www.pushbullet.com/security +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_PUSHBULLET_ACCESS_TOKEN +# ZED_PUSHBULLET_CHANNEL_TAG +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_pushbullet() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="https://api.pushbullet.com/v2/pushes" + + [ -n "${ZED_PUSHBULLET_ACCESS_TOKEN}" ] || return 2 + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err "pushbullet cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, vertical tab, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Push to a channel if one is configured. + # + [ -n "${ZED_PUSHBULLET_CHANNEL_TAG}" ] && msg_tag="$(printf \ + '"channel_tag": "%s", ' "${ZED_PUSHBULLET_CHANNEL_TAG}")" + + # Construct the JSON message for pushing a note. + # + msg_json="$(printf '{%s"type": "note", "title": "%s", "body": "%s"}' \ + "${msg_tag}" "${subject}" "${msg_body}")" + + # Send the POST request and check for errors. 
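+    # With a channel tag configured, the assembled JSON might look like
+    # (illustrative values only):
+    #   {"channel_tag": "ops", "type": "note",
+    #    "title": "ZFS scrub_finish event for tank on host1",
+    #    "body": "ZFS has finished a scrub:\n..."}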
+ # + msg_out="$(curl -u "${ZED_PUSHBULLET_ACCESS_TOKEN}:" -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "pushbullet \"${msg_err}"\" + return 1 + fi + return 0 +} + + +# zed_notify_slack_webhook (subject, pathname) +# +# Notification via Slack Webhook . +# The Webhook URL (ZED_SLACK_WEBHOOK_URL) identifies this client to the +# Slack channel. +# +# Requires awk, curl, and sed executables to be installed in the standard PATH. +# +# References +# https://api.slack.com/incoming-webhooks +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_SLACK_WEBHOOK_URL +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_slack_webhook() +{ + [ -n "${ZED_SLACK_WEBHOOK_URL}" ] || return 2 + + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_tag + local msg_json + local msg_out + local msg_err + local url="${ZED_SLACK_WEBHOOK_URL}" + + [ -n "${subject}" ] || return 1 + if [ ! -r "${pathname}" ]; then + zed_log_err "slack webhook cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "awk" "curl" "sed" || return 1 + + # Escape the following characters in the message body for JSON: + # newline, backslash, double quote, horizontal tab, vertical tab, + # and carriage return. + # + msg_body="$(awk '{ ORS="\\n" } { gsub(/\\/, "\\\\"); gsub(/"/, "\\\""); + gsub(/\t/, "\\t"); gsub(/\f/, "\\f"); gsub(/\r/, "\\r"); print }' \ + "${pathname}")" + + # Construct the JSON message for posting. + # + msg_json="$(printf '{"text": "*%s*\n%s"}' "${subject}" "${msg_body}" )" + + # Send the POST request and check for errors. + # + msg_out="$(curl -X POST "${url}" \ + --header "Content-Type: application/json" --data-binary "${msg_json}" \ + 2>/dev/null)"; rv=$? + if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"error" *:.*"message" *: *"\([^"]*\)".*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "slack webhook \"${msg_err}"\" + return 1 + fi + return 0 +} + +# zed_rate_limit (tag, [interval]) +# +# Check whether an event of a given type [tag] has already occurred within the +# last [interval] seconds. +# +# This function obtains a lock on the statefile using file descriptor 9. 
+# +# Arguments +# tag: arbitrary string for grouping related events to rate-limit +# interval: time interval in seconds (OPTIONAL) +# +# Globals +# ZED_NOTIFY_INTERVAL_SECS +# ZED_RUNDIR +# +# Return +# 0 if the event should be processed +# 1 if the event should be dropped +# +# State File Format +# time;tag +# +zed_rate_limit() +{ + local tag="$1" + local interval="${2:-${ZED_NOTIFY_INTERVAL_SECS}}" + local lockfile="zed.zedlet.state.lock" + local lockfile_fd=9 + local statefile="${ZED_RUNDIR}/zed.zedlet.state" + local time_now + local time_prev + local umask_bak + local rv=0 + + [ -n "${tag}" ] || return 0 + + zed_lock "${lockfile}" "${lockfile_fd}" + time_now="$(date +%s)" + time_prev="$(grep -E "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + | tail -1 | cut -d\; -f1)" + + if [ -n "${time_prev}" ] \ + && [ "$((time_now - time_prev))" -lt "${interval}" ]; then + rv=1 + else + umask_bak="$(umask)" + umask 077 + grep -E -v "^[0-9]+;${tag}\$" "${statefile}" 2>/dev/null \ + > "${statefile}.$$" + echo "${time_now};${tag}" >> "${statefile}.$$" + mv -f "${statefile}.$$" "${statefile}" + umask "${umask_bak}" + fi + + zed_unlock "${lockfile}" "${lockfile_fd}" + return "${rv}" +} + + +# zed_guid_to_pool (guid) +# +# Convert a pool GUID into its pool name (like "tank") +# Arguments +# guid: pool GUID (decimal or hex) +# +# Return +# Pool name +# +zed_guid_to_pool() +{ + if [ -z "$1" ] ; then + return + fi + + guid=$(printf "%llu" "$1") + if [ -n "$guid" ] ; then + $ZPOOL get -H -ovalue,name guid | awk '$1=='"$guid"' {print $2}' + fi +} + +# zed_exit_if_ignoring_this_event +# +# Exit the script if we should ignore this event, as determined by +# $ZED_SYSLOG_SUBCLASS_INCLUDE and $ZED_SYSLOG_SUBCLASS_EXCLUDE in zed.rc. +# This function assumes you've imported the normal zed variables. +zed_exit_if_ignoring_this_event() +{ + if [ -n "${ZED_SYSLOG_SUBCLASS_INCLUDE}" ]; then + eval "case ${ZEVENT_SUBCLASS} in + ${ZED_SYSLOG_SUBCLASS_INCLUDE});; + *) exit 0;; + esac" + elif [ -n "${ZED_SYSLOG_SUBCLASS_EXCLUDE}" ]; then + eval "case ${ZEVENT_SUBCLASS} in + ${ZED_SYSLOG_SUBCLASS_EXCLUDE}) exit 0;; + *);; + esac" + fi +} diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc new file mode 100644 index 000000000000..1b220d28db20 --- /dev/null +++ b/cmd/zed/zed.d/zed.rc @@ -0,0 +1,122 @@ +## +# zed.rc +# +# This file should be owned by root and permissioned 0600. +## + +## +# Absolute path to the debug output file. +# +#ZED_DEBUG_LOG="/tmp/zed.debug.log" + +## +# Email address of the zpool administrator for receipt of notifications; +# multiple addresses can be specified if they are delimited by whitespace. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# Disabled by default; uncomment to enable. +# +#ZED_EMAIL_ADDR="root" + +## +# Name or path of executable responsible for sending notifications via email; +# the mail program must be capable of reading a message body from stdin. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# +#ZED_EMAIL_PROG="mail" + +## +# Command-line options for ZED_EMAIL_PROG. +# The string @ADDRESS@ will be replaced with the recipient email address(es). +# The string @SUBJECT@ will be replaced with the notification subject; +# this should be protected with quotes to prevent word-splitting. +# Email will only be sent if ZED_EMAIL_ADDR is defined. +# +#ZED_EMAIL_OPTS="-s '@SUBJECT@' @ADDRESS@" + +## +# Default directory for zed lock files. +# +#ZED_LOCKDIR="/var/lock" + +## +# Minimum number of seconds between notifications for a similar event. 
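+# (With the default of 3600, at most one notification per hour is sent for
+# a given class/pool combination.)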
+# +#ZED_NOTIFY_INTERVAL_SECS=3600 + +## +# Notification verbosity. +# If set to 0, suppress notification if the pool is healthy. +# If set to 1, send notification regardless of pool health. +# +#ZED_NOTIFY_VERBOSE=0 + +## +# Send notifications for 'ereport.fs.zfs.data' events. +# Disabled by default, any non-empty value will enable the feature. +# +#ZED_NOTIFY_DATA= + +## +# Pushbullet access token. +# This grants full access to your account -- protect it accordingly! +# +# +# Disabled by default; uncomment to enable. +# +#ZED_PUSHBULLET_ACCESS_TOKEN="" + +## +# Pushbullet channel tag for push notification feeds that can be subscribed to. +# +# If not defined, push notifications will instead be sent to all devices +# associated with the account specified by the access token. +# Disabled by default; uncomment to enable. +# +#ZED_PUSHBULLET_CHANNEL_TAG="" + +## +# Slack Webhook URL. +# This allows posting to the given channel and includes an access token. +# +# Disabled by default; uncomment to enable. +# +#ZED_SLACK_WEBHOOK_URL="" + +## +# Default directory for zed state files. +# +#ZED_RUNDIR="/var/run" + +## +# Turn on/off enclosure LEDs when drives get DEGRADED/FAULTED. This works for +# device mapper and multipath devices as well. Your enclosure must be +# supported by the Linux SES driver for this to work. +# +ZED_USE_ENCLOSURE_LEDS=1 + +## +# Run a scrub after every resilver +# Disabled by default, 1 to enable and 0 to disable. +#ZED_SCRUB_AFTER_RESILVER=0 + +## +# The syslog priority (e.g., specified as a "facility.level" pair). +# +#ZED_SYSLOG_PRIORITY="daemon.notice" + +## +# The syslog tag for marking zed events. +# +#ZED_SYSLOG_TAG="zed" + +## +# Which set of event subclasses to log +# By default, events from all subclasses are logged. +# If ZED_SYSLOG_SUBCLASS_INCLUDE is set, only subclasses +# matching the pattern are logged. Use the pipe symbol (|) +# or shell wildcards (*, ?) to match multiple subclasses. +# Otherwise, if ZED_SYSLOG_SUBCLASS_EXCLUDE is set, the +# matching subclasses are excluded from logging. +#ZED_SYSLOG_SUBCLASS_INCLUDE="checksum|scrub_*|vdev.*" +#ZED_SYSLOG_SUBCLASS_EXCLUDE="statechange|config_*|history_event" + diff --git a/cmd/zed/zed.h b/cmd/zed/zed.h new file mode 100644 index 000000000000..3ac0e63141e8 --- /dev/null +++ b/cmd/zed/zed.h @@ -0,0 +1,58 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_H +#define ZED_H + +/* + * Absolute path for the default zed configuration file. + */ +#define ZED_CONF_FILE SYSCONFDIR "/zfs/zed.conf" + +/* + * Absolute path for the default zed pid file. + */ +#define ZED_PID_FILE RUNSTATEDIR "/zed.pid" + +/* + * Absolute path for the default zed state file. + */ +#define ZED_STATE_FILE RUNSTATEDIR "/zed.state" + +/* + * Absolute path for the default zed zedlet directory. + */ +#define ZED_ZEDLET_DIR SYSCONFDIR "/zfs/zed.d" + +/* + * Reserved for future use. + */ +#define ZED_MAX_EVENTS 0 + +/* + * Reserved for future use. 
+ */ +#define ZED_MIN_EVENTS 0 + +/* + * String prefix for ZED variables passed via environment variables. + */ +#define ZED_VAR_PREFIX "ZED_" + +/* + * String prefix for ZFS event names passed via environment variables. + */ +#define ZEVENT_VAR_PREFIX "ZEVENT_" + +#endif /* !ZED_H */ diff --git a/cmd/zed/zed_conf.c b/cmd/zed/zed_conf.c new file mode 100644 index 000000000000..52370eb87b29 --- /dev/null +++ b/cmd/zed/zed_conf.c @@ -0,0 +1,735 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zed.h" +#include "zed_conf.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +/* + * Return a new configuration with default values. + */ +struct zed_conf * +zed_conf_create(void) +{ + struct zed_conf *zcp; + + zcp = calloc(1, sizeof (*zcp)); + if (!zcp) + goto nomem; + + zcp->syslog_facility = LOG_DAEMON; + zcp->min_events = ZED_MIN_EVENTS; + zcp->max_events = ZED_MAX_EVENTS; + zcp->pid_fd = -1; + zcp->zedlets = NULL; /* created via zed_conf_scan_dir() */ + zcp->state_fd = -1; /* opened via zed_conf_open_state() */ + zcp->zfs_hdl = NULL; /* opened via zed_event_init() */ + zcp->zevent_fd = -1; /* opened via zed_event_init() */ + + if (!(zcp->conf_file = strdup(ZED_CONF_FILE))) + goto nomem; + + if (!(zcp->pid_file = strdup(ZED_PID_FILE))) + goto nomem; + + if (!(zcp->zedlet_dir = strdup(ZED_ZEDLET_DIR))) + goto nomem; + + if (!(zcp->state_file = strdup(ZED_STATE_FILE))) + goto nomem; + + return (zcp); + +nomem: + zed_log_die("Failed to create conf: %s", strerror(errno)); + return (NULL); +} + +/* + * Destroy the configuration [zcp]. + * + * Note: zfs_hdl & zevent_fd are destroyed via zed_event_fini(). + */ +void +zed_conf_destroy(struct zed_conf *zcp) +{ + if (!zcp) + return; + + if (zcp->state_fd >= 0) { + if (close(zcp->state_fd) < 0) + zed_log_msg(LOG_WARNING, + "Failed to close state file \"%s\": %s", + zcp->state_file, strerror(errno)); + zcp->state_fd = -1; + } + if (zcp->pid_file) { + if ((unlink(zcp->pid_file) < 0) && (errno != ENOENT)) + zed_log_msg(LOG_WARNING, + "Failed to remove PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } + if (zcp->pid_fd >= 0) { + if (close(zcp->pid_fd) < 0) + zed_log_msg(LOG_WARNING, + "Failed to close PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + zcp->pid_fd = -1; + } + if (zcp->conf_file) { + free(zcp->conf_file); + zcp->conf_file = NULL; + } + if (zcp->pid_file) { + free(zcp->pid_file); + zcp->pid_file = NULL; + } + if (zcp->zedlet_dir) { + free(zcp->zedlet_dir); + zcp->zedlet_dir = NULL; + } + if (zcp->state_file) { + free(zcp->state_file); + zcp->state_file = NULL; + } + if (zcp->zedlets) { + zed_strings_destroy(zcp->zedlets); + zcp->zedlets = NULL; + } + free(zcp); +} + +/* + * Display command-line help and exit. 
+ * + * If [got_err] is 0, output to stdout and exit normally; + * otherwise, output to stderr and exit with a failure status. + */ +static void +_zed_conf_display_help(const char *prog, int got_err) +{ + FILE *fp = got_err ? stderr : stdout; + int w1 = 4; /* width of leading whitespace */ + int w2 = 8; /* width of L-justified option field */ + + fprintf(fp, "Usage: %s [OPTION]...\n", (prog ? prog : "zed")); + fprintf(fp, "\n"); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-h", + "Display help."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-L", + "Display license information."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-V", + "Display version information."); + fprintf(fp, "\n"); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-v", + "Be verbose."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-f", + "Force daemon to run."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-F", + "Run daemon in the foreground."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-I", + "Idle daemon until kernel module is (re)loaded."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-M", + "Lock all pages in memory."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-P", + "$PATH for ZED to use (only used by ZTS)."); + fprintf(fp, "%*c%*s %s\n", w1, 0x20, -w2, "-Z", + "Zero state file."); + fprintf(fp, "\n"); +#if 0 + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-c FILE", + "Read configuration from FILE.", ZED_CONF_FILE); +#endif + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-d DIR", + "Read enabled ZEDLETs from DIR.", ZED_ZEDLET_DIR); + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-p FILE", + "Write daemon's PID to FILE.", ZED_PID_FILE); + fprintf(fp, "%*c%*s %s [%s]\n", w1, 0x20, -w2, "-s FILE", + "Write daemon's state to FILE.", ZED_STATE_FILE); + fprintf(fp, "\n"); + + exit(got_err ? EXIT_FAILURE : EXIT_SUCCESS); +} + +/* + * Display license information to stdout and exit. + */ +static void +_zed_conf_display_license(void) +{ + const char **pp; + const char *text[] = { + "The ZFS Event Daemon (ZED) is distributed under the terms of the", + " Common Development and Distribution License (CDDL-1.0)", + " .", + "", + "Developed at Lawrence Livermore National Laboratory" + " (LLNL-CODE-403049).", + "", + NULL + }; + + for (pp = text; *pp; pp++) + printf("%s\n", *pp); + + exit(EXIT_SUCCESS); +} + +/* + * Display version information to stdout and exit. + */ +static void +_zed_conf_display_version(void) +{ + printf("%s-%s-%s\n", + ZFS_META_NAME, ZFS_META_VERSION, ZFS_META_RELEASE); + + exit(EXIT_SUCCESS); +} + +/* + * Copy the [path] string to the [resultp] ptr. + * If [path] is not an absolute path, prefix it with the current working dir. + * If [resultp] is non-null, free its existing string before assignment. + */ +static void +_zed_conf_parse_path(char **resultp, const char *path) +{ + char buf[PATH_MAX]; + + assert(resultp != NULL); + assert(path != NULL); + + if (*resultp) + free(*resultp); + + if (path[0] == '/') { + *resultp = strdup(path); + } else if (!getcwd(buf, sizeof (buf))) { + zed_log_die("Failed to get current working dir: %s", + strerror(errno)); + } else if (strlcat(buf, "/", sizeof (buf)) >= sizeof (buf)) { + zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); + } else if (strlcat(buf, path, sizeof (buf)) >= sizeof (buf)) { + zed_log_die("Failed to copy path: %s", strerror(ENAMETOOLONG)); + } else { + *resultp = strdup(buf); + } + if (!*resultp) + zed_log_die("Failed to copy path: %s", strerror(ENOMEM)); +} + +/* + * Parse the command-line options into the configuration [zcp]. 
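+ * (For example, "zed -F -v" keeps the daemon in the foreground with
+ * verbose logging, per the -F and -v options described in the help text
+ * above.)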
+ */ +void +zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv) +{ + const char * const opts = ":hLVc:d:p:P:s:vfFMZI"; + int opt; + + if (!zcp || !argv || !argv[0]) + zed_log_die("Failed to parse options: Internal error"); + + opterr = 0; /* suppress default getopt err msgs */ + + while ((opt = getopt(argc, argv, opts)) != -1) { + switch (opt) { + case 'h': + _zed_conf_display_help(argv[0], EXIT_SUCCESS); + break; + case 'L': + _zed_conf_display_license(); + break; + case 'V': + _zed_conf_display_version(); + break; + case 'c': + _zed_conf_parse_path(&zcp->conf_file, optarg); + break; + case 'd': + _zed_conf_parse_path(&zcp->zedlet_dir, optarg); + break; + case 'I': + zcp->do_idle = 1; + break; + case 'p': + _zed_conf_parse_path(&zcp->pid_file, optarg); + break; + case 'P': + _zed_conf_parse_path(&zcp->path, optarg); + break; + case 's': + _zed_conf_parse_path(&zcp->state_file, optarg); + break; + case 'v': + zcp->do_verbose = 1; + break; + case 'f': + zcp->do_force = 1; + break; + case 'F': + zcp->do_foreground = 1; + break; + case 'M': + zcp->do_memlock = 1; + break; + case 'Z': + zcp->do_zero = 1; + break; + case '?': + default: + if (optopt == '?') + _zed_conf_display_help(argv[0], EXIT_SUCCESS); + + fprintf(stderr, "%s: %s '-%c'\n\n", argv[0], + "Invalid option", optopt); + _zed_conf_display_help(argv[0], EXIT_FAILURE); + break; + } + } +} + +/* + * Parse the configuration file into the configuration [zcp]. + * + * FIXME: Not yet implemented. + */ +void +zed_conf_parse_file(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed to parse config: %s", strerror(EINVAL)); +} + +/* + * Scan the [zcp] zedlet_dir for files to exec based on the event class. + * Files must be executable by user, but not writable by group or other. + * Dotfiles are ignored. + * + * Return 0 on success with an updated set of zedlets, + * or -1 on error with errno set. + * + * FIXME: Check if zedlet_dir and all parent dirs are secure. 
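+ *
+ * For example, a zedlet satisfying these checks could be installed with
+ * (hypothetical filename):
+ *   chown root /etc/zfs/zed.d/myzedlet.sh
+ *   chmod 0755 /etc/zfs/zed.d/myzedlet.sh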
+ */ +int +zed_conf_scan_dir(struct zed_conf *zcp) +{ + zed_strings_t *zedlets; + DIR *dirp; + struct dirent *direntp; + char pathname[PATH_MAX]; + struct stat st; + int n; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to scan zedlet dir: %s", + strerror(errno)); + return (-1); + } + zedlets = zed_strings_create(); + if (!zedlets) { + errno = ENOMEM; + zed_log_msg(LOG_WARNING, "Failed to scan dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + return (-1); + } + dirp = opendir(zcp->zedlet_dir); + if (!dirp) { + int errno_bak = errno; + zed_log_msg(LOG_WARNING, "Failed to open dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + zed_strings_destroy(zedlets); + errno = errno_bak; + return (-1); + } + while ((direntp = readdir(dirp))) { + if (direntp->d_name[0] == '.') + continue; + + n = snprintf(pathname, sizeof (pathname), + "%s/%s", zcp->zedlet_dir, direntp->d_name); + if ((n < 0) || (n >= sizeof (pathname))) { + zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", + direntp->d_name, strerror(ENAMETOOLONG)); + continue; + } + if (stat(pathname, &st) < 0) { + zed_log_msg(LOG_WARNING, "Failed to stat \"%s\": %s", + pathname, strerror(errno)); + continue; + } + if (!S_ISREG(st.st_mode)) { + zed_log_msg(LOG_INFO, + "Ignoring \"%s\": not a regular file", + direntp->d_name); + continue; + } + if ((st.st_uid != 0) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": not owned by root", + direntp->d_name); + continue; + } + if (!(st.st_mode & S_IXUSR)) { + zed_log_msg(LOG_INFO, + "Ignoring \"%s\": not executable by user", + direntp->d_name); + continue; + } + if ((st.st_mode & S_IWGRP) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": writable by group", + direntp->d_name); + continue; + } + if ((st.st_mode & S_IWOTH) && !zcp->do_force) { + zed_log_msg(LOG_NOTICE, + "Ignoring \"%s\": writable by other", + direntp->d_name); + continue; + } + if (zed_strings_add(zedlets, NULL, direntp->d_name) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to register \"%s\": %s", + direntp->d_name, strerror(errno)); + continue; + } + if (zcp->do_verbose) + zed_log_msg(LOG_INFO, + "Registered zedlet \"%s\"", direntp->d_name); + } + if (closedir(dirp) < 0) { + int errno_bak = errno; + zed_log_msg(LOG_WARNING, "Failed to close dir \"%s\": %s", + zcp->zedlet_dir, strerror(errno)); + zed_strings_destroy(zedlets); + errno = errno_bak; + return (-1); + } + if (zcp->zedlets) + zed_strings_destroy(zcp->zedlets); + + zcp->zedlets = zedlets; + return (0); +} + +/* + * Write the PID file specified in [zcp]. + * Return 0 on success, -1 on error. + * + * This must be called after fork()ing to become a daemon (so the correct PID + * is recorded), but before daemonization is complete and the parent process + * exits (for synchronization with systemd). + */ +int +zed_conf_write_pid(struct zed_conf *zcp) +{ + const mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + const mode_t filemode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + char buf[PATH_MAX]; + int n; + char *p; + mode_t mask; + int rv; + + if (!zcp || !zcp->pid_file) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to create PID file: %s", + strerror(errno)); + return (-1); + } + assert(zcp->pid_fd == -1); + /* + * Create PID file directory if needed. 
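+	 * (RUNSTATEDIR, e.g. /var/run, may live on a tmpfs that is empty at
+	 * boot, so the directory cannot be assumed to exist.)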
+ */ + n = strlcpy(buf, zcp->pid_file, sizeof (buf)); + if (n >= sizeof (buf)) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_ERR, "Failed to create PID file: %s", + strerror(errno)); + goto err; + } + p = strrchr(buf, '/'); + if (p) + *p = '\0'; + + if ((mkdirp(buf, dirmode) < 0) && (errno != EEXIST)) { + zed_log_msg(LOG_ERR, "Failed to create directory \"%s\": %s", + buf, strerror(errno)); + goto err; + } + /* + * Obtain PID file lock. + */ + mask = umask(0); + umask(mask | 022); + zcp->pid_fd = open(zcp->pid_file, (O_RDWR | O_CREAT), filemode); + umask(mask); + if (zcp->pid_fd < 0) { + zed_log_msg(LOG_ERR, "Failed to open PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + goto err; + } + rv = zed_file_lock(zcp->pid_fd); + if (rv < 0) { + zed_log_msg(LOG_ERR, "Failed to lock PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + goto err; + } else if (rv > 0) { + pid_t pid = zed_file_is_locked(zcp->pid_fd); + if (pid < 0) { + zed_log_msg(LOG_ERR, + "Failed to test lock on PID file \"%s\"", + zcp->pid_file); + } else if (pid > 0) { + zed_log_msg(LOG_ERR, + "Found PID %d bound to PID file \"%s\"", + pid, zcp->pid_file); + } else { + zed_log_msg(LOG_ERR, + "Inconsistent lock state on PID file \"%s\"", + zcp->pid_file); + } + goto err; + } + /* + * Write PID file. + */ + n = snprintf(buf, sizeof (buf), "%d\n", (int)getpid()); + if ((n < 0) || (n >= sizeof (buf))) { + errno = ERANGE; + zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else if (zed_file_write_n(zcp->pid_fd, buf, n) != n) { + zed_log_msg(LOG_ERR, "Failed to write PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else if (fdatasync(zcp->pid_fd) < 0) { + zed_log_msg(LOG_ERR, "Failed to sync PID file \"%s\": %s", + zcp->pid_file, strerror(errno)); + } else { + return (0); + } + +err: + if (zcp->pid_fd >= 0) { + (void) close(zcp->pid_fd); + zcp->pid_fd = -1; + } + return (-1); +} + +/* + * Open and lock the [zcp] state_file. + * Return 0 on success, -1 on error. + * + * FIXME: Move state information into kernel. 
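+ *
+ * If the lock is already held (e.g., by another zed instance), the
+ * holder's PID is logged and the open fails.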
+ */ +int +zed_conf_open_state(struct zed_conf *zcp) +{ + char dirbuf[PATH_MAX]; + mode_t dirmode = S_IRWXU | S_IRGRP | S_IXGRP | S_IROTH | S_IXOTH; + int n; + char *p; + int rv; + + if (!zcp || !zcp->state_file) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to open state file: %s", + strerror(errno)); + return (-1); + } + n = strlcpy(dirbuf, zcp->state_file, sizeof (dirbuf)); + if (n >= sizeof (dirbuf)) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_WARNING, "Failed to open state file: %s", + strerror(errno)); + return (-1); + } + p = strrchr(dirbuf, '/'); + if (p) + *p = '\0'; + + if ((mkdirp(dirbuf, dirmode) < 0) && (errno != EEXIST)) { + zed_log_msg(LOG_WARNING, + "Failed to create directory \"%s\": %s", + dirbuf, strerror(errno)); + return (-1); + } + if (zcp->state_fd >= 0) { + if (close(zcp->state_fd) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to close state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + } + if (zcp->do_zero) + (void) unlink(zcp->state_file); + + zcp->state_fd = open(zcp->state_file, + (O_RDWR | O_CREAT), (S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)); + if (zcp->state_fd < 0) { + zed_log_msg(LOG_WARNING, "Failed to open state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + rv = zed_file_lock(zcp->state_fd); + if (rv < 0) { + zed_log_msg(LOG_WARNING, "Failed to lock state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + if (rv > 0) { + pid_t pid = zed_file_is_locked(zcp->state_fd); + if (pid < 0) { + zed_log_msg(LOG_WARNING, + "Failed to test lock on state file \"%s\"", + zcp->state_file); + } else if (pid > 0) { + zed_log_msg(LOG_WARNING, + "Found PID %d bound to state file \"%s\"", + pid, zcp->state_file); + } else { + zed_log_msg(LOG_WARNING, + "Inconsistent lock state on state file \"%s\"", + zcp->state_file); + } + return (-1); + } + return (0); +} + +/* + * Read the opened [zcp] state_file to obtain the eid & etime of the last event + * processed. Write the state from the last event to the [eidp] & [etime] args + * passed by reference. Note that etime[] is an array of size 2. + * Return 0 on success, -1 on error. + */ +int +zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]) +{ + ssize_t len; + struct iovec iov[3]; + ssize_t n; + + if (!zcp || !eidp || !etime) { + errno = EINVAL; + zed_log_msg(LOG_ERR, + "Failed to read state file: %s", strerror(errno)); + return (-1); + } + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { + zed_log_msg(LOG_WARNING, + "Failed to reposition state file offset: %s", + strerror(errno)); + return (-1); + } + len = 0; + iov[0].iov_base = eidp; + len += iov[0].iov_len = sizeof (*eidp); + iov[1].iov_base = &etime[0]; + len += iov[1].iov_len = sizeof (etime[0]); + iov[2].iov_base = &etime[1]; + len += iov[2].iov_len = sizeof (etime[1]); + + n = readv(zcp->state_fd, iov, 3); + if (n == 0) { + *eidp = 0; + } else if (n < 0) { + zed_log_msg(LOG_WARNING, + "Failed to read state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } else if (n != len) { + errno = EIO; + zed_log_msg(LOG_WARNING, + "Failed to read state file \"%s\": Read %d of %d bytes", + zcp->state_file, n, len); + return (-1); + } + return (0); +} + +/* + * Write the [eid] & [etime] of the last processed event to the opened + * [zcp] state_file. Note that etime[] is an array of size 2. + * Return 0 on success, -1 on error. 
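+ *
+ * (The resulting state is a single 24-byte binary record: the uint64_t eid
+ * followed by the two int64_t etime words.)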
+ */ +int +zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]) +{ + ssize_t len; + struct iovec iov[3]; + ssize_t n; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, + "Failed to write state file: %s", strerror(errno)); + return (-1); + } + if (lseek(zcp->state_fd, 0, SEEK_SET) == (off_t)-1) { + zed_log_msg(LOG_WARNING, + "Failed to reposition state file offset: %s", + strerror(errno)); + return (-1); + } + len = 0; + iov[0].iov_base = &eid; + len += iov[0].iov_len = sizeof (eid); + iov[1].iov_base = &etime[0]; + len += iov[1].iov_len = sizeof (etime[0]); + iov[2].iov_base = &etime[1]; + len += iov[2].iov_len = sizeof (etime[1]); + + n = writev(zcp->state_fd, iov, 3); + if (n < 0) { + zed_log_msg(LOG_WARNING, + "Failed to write state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + if (n != len) { + errno = EIO; + zed_log_msg(LOG_WARNING, + "Failed to write state file \"%s\": Wrote %d of %d bytes", + zcp->state_file, n, len); + return (-1); + } + if (fdatasync(zcp->state_fd) < 0) { + zed_log_msg(LOG_WARNING, + "Failed to sync state file \"%s\": %s", + zcp->state_file, strerror(errno)); + return (-1); + } + return (0); +} diff --git a/cmd/zed/zed_conf.h b/cmd/zed/zed_conf.h new file mode 100644 index 000000000000..424cb2c01c8c --- /dev/null +++ b/cmd/zed/zed_conf.h @@ -0,0 +1,62 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. 
+ */ + +#ifndef ZED_CONF_H +#define ZED_CONF_H + +#include <libzfs.h> +#include <stdint.h> +#include "zed_strings.h" + +struct zed_conf { + unsigned do_force:1; /* true if force enabled */ + unsigned do_foreground:1; /* true if run in foreground */ + unsigned do_memlock:1; /* true if locking memory */ + unsigned do_verbose:1; /* true if verbosity enabled */ + unsigned do_zero:1; /* true if zeroing state */ + unsigned do_idle:1; /* true if idle enabled */ + int syslog_facility; /* syslog facility value */ + int min_events; /* RESERVED FOR FUTURE USE */ + int max_events; /* RESERVED FOR FUTURE USE */ + char *conf_file; /* abs path to config file */ + char *pid_file; /* abs path to pid file */ + int pid_fd; /* fd to pid file for lock */ + char *zedlet_dir; /* abs path to zedlet dir */ + zed_strings_t *zedlets; /* names of enabled zedlets */ + char *state_file; /* abs path to state file */ + int state_fd; /* fd to state file */ + libzfs_handle_t *zfs_hdl; /* handle to libzfs */ + int zevent_fd; /* fd for access to zevents */ + char *path; /* custom $PATH for zedlets to use */ +}; + +struct zed_conf *zed_conf_create(void); + +void zed_conf_destroy(struct zed_conf *zcp); + +void zed_conf_parse_opts(struct zed_conf *zcp, int argc, char **argv); + +void zed_conf_parse_file(struct zed_conf *zcp); + +int zed_conf_scan_dir(struct zed_conf *zcp); + +int zed_conf_write_pid(struct zed_conf *zcp); + +int zed_conf_open_state(struct zed_conf *zcp); + +int zed_conf_read_state(struct zed_conf *zcp, uint64_t *eidp, int64_t etime[]); + +int zed_conf_write_state(struct zed_conf *zcp, uint64_t eid, int64_t etime[]); + +#endif /* !ZED_CONF_H */ diff --git a/cmd/zed/zed_disk_event.c b/cmd/zed/zed_disk_event.c new file mode 100644 index 000000000000..174d24523253 --- /dev/null +++ b/cmd/zed/zed_disk_event.c @@ -0,0 +1,416 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>. + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017, Intel Corporation. + */ + +#ifdef HAVE_LIBUDEV + +#include <errno.h> +#include <fcntl.h> +#include <libnvpair.h> +#include <libudev.h> +#include <libzfs.h> +#include <libzutil.h> +#include <pthread.h> +#include <stdlib.h> +#include <string.h> + +#include <sys/sysevent/eventdefs.h> +#include <sys/sysevent/dev.h> + +#include "zed_log.h" +#include "zed_disk_event.h" +#include "agents/zfs_agents.h" + +/* + * Portions of ZED need to see disk events for disks belonging to ZFS pools. + * A libudev monitor is established to monitor block device actions and pass + * them on to internal ZED logic modules. Initially, zfs_mod.c is the only + * consumer and is the Linux equivalent for the illumos syseventd ZFS SLM + * module responsible for handling disk events for ZFS.
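[Editor's note: stripped to its essentials, the monitor setup described here is a handful of libudev calls. A minimal standalone sketch of the same pattern follows (link with -ludev; not part of the patch), including the switch back to a blocking socket that zed_disk_event_init() performs below.]

#include <fcntl.h>
#include <libudev.h>
#include <stdio.h>

int
main(void)
{
        struct udev *udev = udev_new();
        struct udev_monitor *mon;
        struct udev_device *dev;
        int fd, fl;

        if (udev == NULL)
                return (1);
        mon = udev_monitor_new_from_netlink(udev, "udev");
        udev_monitor_filter_add_match_subsystem_devtype(mon, "block", NULL);
        udev_monitor_enable_receiving(mon);

        /* udev delivers on a non-blocking socket by default; undo that */
        fd = udev_monitor_get_fd(mon);
        if ((fl = fcntl(fd, F_GETFL)) & O_NONBLOCK)
                (void) fcntl(fd, F_SETFL, fl & ~O_NONBLOCK);

        /* blocks in recvmsg() until the next block-device uevent */
        while ((dev = udev_monitor_receive_device(mon)) != NULL) {
                const char *node = udev_device_get_devnode(dev);

                printf("%s %s\n", udev_device_get_action(dev),
                    node ? node : "?");
                udev_device_unref(dev);
        }
        udev_monitor_unref(mon);
        udev_unref(udev);
        return (0);
}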
+ */ + +pthread_t g_mon_tid; +struct udev *g_udev; +struct udev_monitor *g_mon; + + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +/* 64MB is minimum usable disk for ZFS */ +#define MINIMUM_SECTORS 131072 + + +/* + * Post disk event to SLM module + * + * occurs in the context of monitor thread + */ +static void +zed_udev_event(const char *class, const char *subclass, nvlist_t *nvl) +{ + char *strval; + uint64_t numval; + + zed_log_msg(LOG_INFO, "zed_disk_event:"); + zed_log_msg(LOG_INFO, "\tclass: %s", class); + zed_log_msg(LOG_INFO, "\tsubclass: %s", subclass); + if (nvlist_lookup_string(nvl, DEV_NAME, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_NAME, strval); + if (nvlist_lookup_string(nvl, DEV_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PATH, strval); + if (nvlist_lookup_string(nvl, DEV_IDENTIFIER, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_IDENTIFIER, strval); + if (nvlist_lookup_string(nvl, DEV_PHYS_PATH, &strval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %s", DEV_PHYS_PATH, strval); + if (nvlist_lookup_uint64(nvl, DEV_SIZE, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", DEV_SIZE, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_POOL_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_POOL_GUID, numval); + if (nvlist_lookup_uint64(nvl, ZFS_EV_VDEV_GUID, &numval) == 0) + zed_log_msg(LOG_INFO, "\t%s: %llu", ZFS_EV_VDEV_GUID, numval); + + (void) zfs_agent_post_event(class, subclass, nvl); +} + +/* + * dev_event_nvlist: place event schema into an nv pair list + * + * NAME VALUE (example) + * -------------- -------------------------------------------------------- + * DEV_NAME /dev/sdl + * DEV_PATH /devices/pci0000:00/0000:00:03.0/0000:04:00.0/host0/... + * DEV_IDENTIFIER ata-Hitachi_HTS725050A9A362_100601PCG420VLJ37DMC + * DEV_PHYS_PATH pci-0000:04:00.0-sas-0x4433221101000000-lun-0 + * DEV_IS_PART --- + * DEV_SIZE 500107862016 + * ZFS_EV_POOL_GUID 17523635698032189180 + * ZFS_EV_VDEV_GUID 14663607734290803088 + */ +static nvlist_t * +dev_event_nvlist(struct udev_device *dev) +{ + nvlist_t *nvl; + char strval[128]; + const char *value, *path; + uint64_t guid; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + + if (zfs_device_get_devid(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_IDENTIFIER, strval); + if (zfs_device_get_physical(dev, strval, sizeof (strval)) == 0) + (void) nvlist_add_string(nvl, DEV_PHYS_PATH, strval); + if ((path = udev_device_get_devnode(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_NAME, path); + if ((value = udev_device_get_devpath(dev)) != NULL) + (void) nvlist_add_string(nvl, DEV_PATH, value); + value = udev_device_get_devtype(dev); + if ((value != NULL && strcmp("partition", value) == 0) || + (udev_device_get_property_value(dev, "ID_PART_ENTRY_NUMBER") + != NULL)) { + (void) nvlist_add_boolean(nvl, DEV_IS_PART); + } + if ((value = udev_device_get_sysattr_value(dev, "size")) != NULL) { + uint64_t numval = DEV_BSIZE; + + numval *= strtoull(value, NULL, 10); + (void) nvlist_add_uint64(nvl, DEV_SIZE, numval); + } + + /* + * Grab the pool and vdev guids from blkid cache + */ + value = udev_device_get_property_value(dev, "ID_FS_UUID"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_POOL_GUID, guid); + + value = udev_device_get_property_value(dev, "ID_FS_UUID_SUB"); + if (value != NULL && (guid = strtoull(value, NULL, 10)) != 0) + (void) nvlist_add_uint64(nvl, ZFS_EV_VDEV_GUID, guid); + + /* + * Either a vdev guid 
or a devid must be present for matching + */ + if (!nvlist_exists(nvl, DEV_IDENTIFIER) && + !nvlist_exists(nvl, ZFS_EV_VDEV_GUID)) { + nvlist_free(nvl); + return (NULL); + } + + return (nvl); +} + +/* + * Listen for block device uevents + */ +static void * +zed_udev_monitor(void *arg) +{ + struct udev_monitor *mon = arg; + char *tmp, *tmp2; + + zed_log_msg(LOG_INFO, "Waiting for new udev disk events..."); + + while (1) { + struct udev_device *dev; + const char *action, *type, *part, *sectors; + const char *bus, *uuid; + const char *class, *subclass; + nvlist_t *nvl; + boolean_t is_zfs = B_FALSE; + + /* allow a cancellation while blocked (recvmsg) */ + pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, NULL); + + /* blocks at recvmsg until an event occurs */ + if ((dev = udev_monitor_receive_device(mon)) == NULL) { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: receive " + "device error %d", errno); + continue; + } + + /* allow all steps to complete before a cancellation */ + pthread_setcancelstate(PTHREAD_CANCEL_DISABLE, NULL); + + /* + * Strongly typed device is the preferred filter + */ + type = udev_device_get_property_value(dev, "ID_FS_TYPE"); + if (type != NULL && type[0] != '\0') { + if (strcmp(type, "zfs_member") == 0) { + is_zfs = B_TRUE; + } else { + /* not ours, so skip */ + zed_log_msg(LOG_INFO, "zed_udev_monitor: skip " + "%s (in use by %s)", + udev_device_get_devnode(dev), type); + udev_device_unref(dev); + continue; + } + } + + /* + * if this is a disk and it is partitioned, then the + * zfs label will reside in a DEVTYPE=partition and + * we can skip passing this event + */ + type = udev_device_get_property_value(dev, "DEVTYPE"); + part = udev_device_get_property_value(dev, + "ID_PART_TABLE_TYPE"); + if (type != NULL && type[0] != '\0' && + strcmp(type, "disk") == 0 && + part != NULL && part[0] != '\0') { + /* skip and wait for partition event */ + udev_device_unref(dev); + continue; + } + + /* + * ignore small partitions + */ + sectors = udev_device_get_property_value(dev, + "ID_PART_ENTRY_SIZE"); + if (sectors == NULL) + sectors = udev_device_get_sysattr_value(dev, "size"); + if (sectors != NULL && + strtoull(sectors, NULL, 10) < MINIMUM_SECTORS) { + udev_device_unref(dev); + continue; + } + + /* + * If the blkid probe didn't find ZFS, then a persistent + * device id string is required in the message schema + * for matching with vdevs. Preflight here for expected + * udev information. + */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (!is_zfs && (bus == NULL && uuid == NULL)) { + zed_log_msg(LOG_INFO, "zed_udev_monitor: %s no devid " + "source", udev_device_get_devnode(dev)); + udev_device_unref(dev); + continue; + } + + action = udev_device_get_action(dev); + if (strcmp(action, "add") == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else if (strcmp(action, "remove") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } else if (strcmp(action, "change") == 0) { + class = EC_DEV_STATUS; + subclass = ESC_DEV_DLE; + } else { + zed_log_msg(LOG_WARNING, "zed_udev_monitor: %s unknown", + action); + udev_device_unref(dev); + continue; + } + + /* + * Special case an EC_DEV_ADD for multipath devices + * + * When a multipath device is created, udev reports the + * following: + * + * 1. "add" event of the dm device for the multipath device + * (like /dev/dm-3). + * 2. "change" event to create the actual multipath device + * symlink (like /dev/mapper/mpatha). 
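[Editor's note: the nvlist plumbing behind dev_event_nvlist() above is worth seeing in isolation. A hedged sketch of the same libnvpair calls follows, using illustrative key names rather than the DEV_* constants from sys/sysevent/dev.h; devnode is assumed non-NULL.]

#include <libnvpair.h>

static nvlist_t *
make_disk_event(const char *devnode, uint64_t vdev_guid)
{
        nvlist_t *nvl;

        /* uniquely-keyed list, as in dev_event_nvlist() */
        if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
                return (NULL);
        (void) nvlist_add_string(nvl, "dev_name", devnode);
        if (vdev_guid != 0)
                (void) nvlist_add_uint64(nvl, "vdev_guid", vdev_guid);

        /* as above: without an identifier the event cannot be matched */
        if (!nvlist_exists(nvl, "vdev_guid")) {
                nvlist_free(nvl);
                return (NULL);
        }
        return (nvl);
}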
The event also + * passes back the relevant DM vars we care about, like + * DM_UUID. + * 3. Another "change" event identical to #2 (that we ignore). + * + * To get the behavior we want, we treat the "change" event + * in #2 as a "add" event; as if "/dev/mapper/mpatha" was + * a new disk being added. + */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "DM_UUID") && + udev_device_get_property_value(dev, "MPATH_SBIN_PATH")) { + tmp = (char *)udev_device_get_devnode(dev); + tmp2 = zfs_get_underlying_path(tmp); + if (tmp && tmp2 && (strcmp(tmp, tmp2) != 0)) { + /* + * We have a real underlying device, which + * means that this multipath "change" event is + * an "add" event. + * + * If the multipath device and the underlying + * dev are the same name (i.e. /dev/dm-5), then + * there is no real underlying disk for this + * multipath device, and so this "change" event + * really is a multipath removal. + */ + class = EC_DEV_ADD; + subclass = ESC_DISK; + } else { + tmp = (char *) + udev_device_get_property_value(dev, + "DM_NR_VALID_PATHS"); + /* treat as a multipath remove */ + if (tmp != NULL && strcmp(tmp, "0") == 0) { + class = EC_DEV_REMOVE; + subclass = ESC_DISK; + } + } + free(tmp2); + } + + /* + * Special case an EC_DEV_ADD for scsi_debug devices + * + * These devices require a udevadm trigger command after + * creation in order to register the vdev_id scsidebug alias + * rule (adds a persistent path (phys_path) used for fault + * management automated tests in the ZFS test suite. + * + * After udevadm trigger command, event registers as a "change" + * event but needs to instead be handled as another "add" event + * to allow for disk labeling and partitioning to occur. + */ + if (strcmp(class, EC_DEV_STATUS) == 0 && + udev_device_get_property_value(dev, "ID_VDEV") && + udev_device_get_property_value(dev, "ID_MODEL")) { + const char *id_model, *id_model_sd = "scsi_debug"; + + id_model = udev_device_get_property_value(dev, + "ID_MODEL"); + if (strcmp(id_model, id_model_sd) == 0) { + class = EC_DEV_ADD; + subclass = ESC_DISK; + } + } + + if ((nvl = dev_event_nvlist(dev)) != NULL) { + zed_udev_event(class, subclass, nvl); + nvlist_free(nvl); + } + + udev_device_unref(dev); + } + + return (NULL); +} + +int +zed_disk_event_init() +{ + int fd, fflags; + + if ((g_udev = udev_new()) == NULL) { + zed_log_msg(LOG_WARNING, "udev_new failed (%d)", errno); + return (-1); + } + + /* Set up a udev monitor for block devices */ + g_mon = udev_monitor_new_from_netlink(g_udev, "udev"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", "disk"); + udev_monitor_filter_add_match_subsystem_devtype(g_mon, "block", + "partition"); + udev_monitor_enable_receiving(g_mon); + + /* Make sure monitoring socket is blocking */ + fd = udev_monitor_get_fd(g_mon); + if ((fflags = fcntl(fd, F_GETFL)) & O_NONBLOCK) + (void) fcntl(fd, F_SETFL, fflags & ~O_NONBLOCK); + + /* spawn a thread to monitor events */ + if (pthread_create(&g_mon_tid, NULL, zed_udev_monitor, g_mon) != 0) { + udev_monitor_unref(g_mon); + udev_unref(g_udev); + zed_log_msg(LOG_WARNING, "pthread_create failed"); + return (-1); + } + + zed_log_msg(LOG_INFO, "zed_disk_event_init"); + + return (0); +} + +void +zed_disk_event_fini() +{ + /* cancel monitor thread at recvmsg() */ + (void) pthread_cancel(g_mon_tid); + (void) pthread_join(g_mon_tid, NULL); + + /* cleanup udev resources */ + udev_monitor_unref(g_mon); + udev_unref(g_udev); + + zed_log_msg(LOG_INFO, "zed_disk_event_fini"); +} + +#else + +#include 
"zed_disk_event.h" + +int +zed_disk_event_init() +{ + return (0); +} + +void +zed_disk_event_fini() +{ +} + +#endif /* HAVE_LIBUDEV */ diff --git a/cmd/zed/zed_disk_event.h b/cmd/zed/zed_disk_event.h new file mode 100644 index 000000000000..ea9813d0a595 --- /dev/null +++ b/cmd/zed/zed_disk_event.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Intel Corporation. + */ + +#ifndef ZED_DISK_EVENT_H +#define ZED_DISK_EVENT_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zed_disk_event_init(void); +extern void zed_disk_event_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* !ZED_DISK_EVENT_H */ diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c new file mode 100644 index 000000000000..1c5d00e297ff --- /dev/null +++ b/cmd/zed/zed_event.c @@ -0,0 +1,965 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#include +#include +#include +#include /* FIXME: Replace with libzfs_core. */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zed.h" +#include "zed_conf.h" +#include "zed_disk_event.h" +#include "zed_event.h" +#include "zed_exec.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +#include "agents/zfs_agents.h" + +#define MAXBUF 4096 + +/* + * Open the libzfs interface. + */ +int +zed_event_init(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed zed_event_init: %s", strerror(EINVAL)); + + zcp->zfs_hdl = libzfs_init(); + if (!zcp->zfs_hdl) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to initialize libzfs"); + } + + zcp->zevent_fd = open(ZFS_DEV, O_RDWR); + if (zcp->zevent_fd < 0) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to open \"%s\": %s", + ZFS_DEV, strerror(errno)); + } + + zfs_agent_init(zcp->zfs_hdl); + + if (zed_disk_event_init() != 0) { + if (zcp->do_idle) + return (-1); + zed_log_die("Failed to initialize disk events"); + } + + return (0); +} + +/* + * Close the libzfs interface. + */ +void +zed_event_fini(struct zed_conf *zcp) +{ + if (!zcp) + zed_log_die("Failed zed_event_fini: %s", strerror(EINVAL)); + + zed_disk_event_fini(); + zfs_agent_fini(); + + if (zcp->zevent_fd >= 0) { + if (close(zcp->zevent_fd) < 0) + zed_log_msg(LOG_WARNING, "Failed to close \"%s\": %s", + ZFS_DEV, strerror(errno)); + + zcp->zevent_fd = -1; + } + if (zcp->zfs_hdl) { + libzfs_fini(zcp->zfs_hdl); + zcp->zfs_hdl = NULL; + } +} + +/* + * Seek to the event specified by [saved_eid] and [saved_etime]. + * This protects against processing a given event more than once. + * Return 0 upon a successful seek to the specified event, or -1 otherwise. 
+ * + * A zevent is considered to be uniquely specified by its (eid,time) tuple. + * The unsigned 64b eid is set to 1 when the kernel module is loaded, and + * incremented by 1 for each new event. Since the state file can persist + * across a kernel module reload, the time must be checked to ensure a match. + */ +int +zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, int64_t saved_etime[]) +{ + uint64_t eid; + int found; + nvlist_t *nvl; + int n_dropped; + int64_t *etime; + uint_t nelem; + int rv; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to seek zevent: %s", + strerror(errno)); + return (-1); + } + eid = 0; + found = 0; + while ((eid < saved_eid) && !found) { + rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, + ZEVENT_NONBLOCK, zcp->zevent_fd); + + if ((rv != 0) || !nvl) + break; + + if (n_dropped > 0) { + zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); + /* + * FIXME: Increase max size of event nvlist in + * /sys/module/zfs/parameters/zfs_zevent_len_max ? + */ + } + if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { + zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); + } else if (nvlist_lookup_int64_array(nvl, "time", + &etime, &nelem) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu)", eid); + } else if (nelem != 2) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu, nelem=%u)", + eid, nelem); + } else if ((eid != saved_eid) || + (etime[0] != saved_etime[0]) || + (etime[1] != saved_etime[1])) { + /* no-op */ + } else { + found = 1; + } + nvlist_free(nvl); + } + if (!found && (saved_eid > 0)) { + if (zpool_events_seek(zcp->zfs_hdl, ZEVENT_SEEK_START, + zcp->zevent_fd) < 0) + zed_log_msg(LOG_WARNING, "Failed to seek to eid=0"); + else + eid = 0; + } + zed_log_msg(LOG_NOTICE, "Processing events since eid=%llu", eid); + return (found ? 0 : -1); +} + +/* + * Return non-zero if nvpair [name] should be formatted in hex; otherwise, return 0. + */ +static int +_zed_event_value_is_hex(const char *name) +{ + const char *hex_suffix[] = { + "_guid", + "_guids", + NULL + }; + const char **pp; + char *p; + + if (!name) + return (0); + + for (pp = hex_suffix; *pp; pp++) { + p = strstr(name, *pp); + if (p && strlen(p) == strlen(*pp)) + return (1); + } + return (0); +} + +/* + * Add an environment variable for [eid] to the container [zsp]. + * + * The variable name is the concatenation of [prefix] and [name] converted to + * uppercase with non-alphanumeric characters converted to underscores; + * [prefix] is optional, and [name] must begin with an alphabetic character. + * If the converted variable name already exists within the container [zsp], + * its existing value will be replaced with the new value. + * + * The variable value is specified by the format string [fmt]. + * + * Returns 0 on success, and -1 on error (with errno set). + * + * All environment variables in [zsp] should be added through this function. + */ +static int +_zed_event_add_var(uint64_t eid, zed_strings_t *zsp, + const char *prefix, const char *name, const char *fmt, ...)
+{ + char keybuf[MAXBUF]; + char valbuf[MAXBUF]; + char *dstp; + const char *srcp; + const char *lastp; + int n; + int buflen; + va_list vargs; + + assert(zsp != NULL); + assert(fmt != NULL); + + if (!name) { + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: Name is empty", eid); + return (-1); + } else if (!isalpha(name[0])) { + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: " + "Name \"%s\" is invalid", eid, name); + return (-1); + } + /* + * Construct the string key by converting PREFIX (if present) and NAME. + */ + dstp = keybuf; + lastp = keybuf + sizeof (keybuf); + if (prefix) { + for (srcp = prefix; *srcp && (dstp < lastp); srcp++) + *dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_'; + } + for (srcp = name; *srcp && (dstp < lastp); srcp++) + *dstp++ = isalnum(*srcp) ? toupper(*srcp) : '_'; + + if (dstp == lastp) { + errno = ENAMETOOLONG; + zed_log_msg(LOG_WARNING, + "Failed to add variable for eid=%llu: Name too long", eid); + return (-1); + } + *dstp = '\0'; + /* + * Construct the string specified by "[PREFIX][NAME]=[FMT]". + */ + dstp = valbuf; + buflen = sizeof (valbuf); + n = strlcpy(dstp, keybuf, buflen); + if (n >= sizeof (valbuf)) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + dstp += n; + buflen -= n; + + *dstp++ = '='; + buflen--; + + if (buflen <= 0) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } + + va_start(vargs, fmt); + n = vsnprintf(dstp, buflen, fmt, vargs); + va_end(vargs); + + if ((n < 0) || (n >= buflen)) { + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, "Exceeded buffer size"); + return (-1); + } else if (zed_strings_add(zsp, keybuf, valbuf) < 0) { + zed_log_msg(LOG_WARNING, "Failed to add %s for eid=%llu: %s", + keybuf, eid, strerror(errno)); + return (-1); + } + return (0); +} + +static int +_zed_event_add_array_err(uint64_t eid, const char *name) +{ + errno = EMSGSIZE; + zed_log_msg(LOG_WARNING, + "Failed to convert nvpair \"%s\" for eid=%llu: " + "Exceeded buffer size", name, eid); + return (-1); +} + +static int +_zed_event_add_int8_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int8_t *i8p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT8_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int8_array(nvp, &i8p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i8p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint8_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint8_t *u8p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT8_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint8_array(nvp, &u8p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u8p[i]); + if ((n < 0) || (n >= buflen)) + return 
(_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int16_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int16_t *i16p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT16_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int16_array(nvp, &i16p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i16p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint16_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint16_t *u16p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT16_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint16_array(nvp, &u16p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u16p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int32_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int32_t *i32p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT32_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int32_array(nvp, &i32p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%d ", i32p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint32_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + uint32_t *u32p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT32_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_uint32_array(nvp, &u32p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%u ", u32p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_int64_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + int64_t *i64p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_INT64_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_int64_array(nvp, &i64p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, 
buflen, "%lld ", (u_longlong_t)i64p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_uint64_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + const char *fmt; + uint64_t *u64p; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_UINT64_ARRAY)); + + name = nvpair_name(nvp); + fmt = _zed_event_value_is_hex(name) ? "0x%.16llX " : "%llu "; + (void) nvpair_value_uint64_array(nvp, &u64p, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, fmt, (u_longlong_t)u64p[i]); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +static int +_zed_event_add_string_array(uint64_t eid, zed_strings_t *zsp, + const char *prefix, nvpair_t *nvp) +{ + char buf[MAXBUF]; + int buflen = sizeof (buf); + const char *name; + char **strp; + uint_t nelem; + uint_t i; + char *p; + int n; + + assert((nvp != NULL) && (nvpair_type(nvp) == DATA_TYPE_STRING_ARRAY)); + + name = nvpair_name(nvp); + (void) nvpair_value_string_array(nvp, &strp, &nelem); + for (i = 0, p = buf; (i < nelem) && (buflen > 0); i++) { + n = snprintf(p, buflen, "%s ", strp[i] ? strp[i] : ""); + if ((n < 0) || (n >= buflen)) + return (_zed_event_add_array_err(eid, name)); + p += n; + buflen -= n; + } + if (nelem > 0) + *--p = '\0'; + + return (_zed_event_add_var(eid, zsp, prefix, name, "%s", buf)); +} + +/* + * Convert the nvpair [nvp] to a string which is added to the environment + * of the child process. + * Return 0 on success, -1 on error. + * + * FIXME: Refactor with cmd/zpool/zpool_main.c:zpool_do_events_nvprint()? + */ +static void +_zed_event_add_nvpair(uint64_t eid, zed_strings_t *zsp, nvpair_t *nvp) +{ + const char *name; + data_type_t type; + const char *prefix = ZEVENT_VAR_PREFIX; + boolean_t b; + double d; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + + assert(zsp != NULL); + assert(nvp != NULL); + + name = nvpair_name(nvp); + type = nvpair_type(nvp); + + switch (type) { + case DATA_TYPE_BOOLEAN: + _zed_event_add_var(eid, zsp, prefix, name, "%s", "1"); + break; + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + _zed_event_add_var(eid, zsp, prefix, name, "%s", b ? 
"1" : "0"); + break; + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); + break; + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (int8_t *)&i8); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i8); + break; + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i8); + break; + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (int16_t *)&i16); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i16); + break; + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i16); + break; + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (int32_t *)&i32); + _zed_event_add_var(eid, zsp, prefix, name, "%d", i32); + break; + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + _zed_event_add_var(eid, zsp, prefix, name, "%u", i32); + break; + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (int64_t *)&i64); + _zed_event_add_var(eid, zsp, prefix, name, + "%lld", (longlong_t)i64); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + _zed_event_add_var(eid, zsp, prefix, name, + (_zed_event_value_is_hex(name) ? "0x%.16llX" : "%llu"), + (u_longlong_t)i64); + /* + * shadow readable strings for vdev state pairs + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_state_to_name(i64, VDEV_AUX_NONE)); + } else + /* + * shadow readable strings for pool state + */ + if (strcmp(name, FM_EREPORT_PAYLOAD_ZFS_POOL_STATE) == 0) { + char alt[32]; + + (void) snprintf(alt, sizeof (alt), "%s_str", name); + _zed_event_add_var(eid, zsp, prefix, alt, "%s", + zpool_pool_state_to_name(i64)); + } + break; + case DATA_TYPE_DOUBLE: + (void) nvpair_value_double(nvp, &d); + _zed_event_add_var(eid, zsp, prefix, name, "%g", d); + break; + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (hrtime_t *)&i64); + _zed_event_add_var(eid, zsp, prefix, name, + "%llu", (u_longlong_t)i64); + break; + case DATA_TYPE_NVLIST: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + _zed_event_add_var(eid, zsp, prefix, name, + "%s", (str ? 
str : "")); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_BYTE_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + case DATA_TYPE_INT8_ARRAY: + _zed_event_add_int8_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT8_ARRAY: + _zed_event_add_uint8_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT16_ARRAY: + _zed_event_add_int16_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT16_ARRAY: + _zed_event_add_uint16_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT32_ARRAY: + _zed_event_add_int32_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT32_ARRAY: + _zed_event_add_uint32_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_INT64_ARRAY: + _zed_event_add_int64_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_UINT64_ARRAY: + _zed_event_add_uint64_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_STRING_ARRAY: + _zed_event_add_string_array(eid, zsp, prefix, nvp); + break; + case DATA_TYPE_NVLIST_ARRAY: + _zed_event_add_var(eid, zsp, prefix, name, + "%s", "_NOT_IMPLEMENTED_"); /* FIXME */ + break; + default: + errno = EINVAL; + zed_log_msg(LOG_WARNING, + "Failed to convert nvpair \"%s\" for eid=%llu: " + "Unrecognized type=%u", name, eid, (unsigned int) type); + break; + } +} + +/* + * Restrict various environment variables to safe and sane values + * when constructing the environment for the child process, unless + * we're running with a custom $PATH (like under the ZFS test suite). + * + * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. + */ +static void +_zed_event_add_env_restrict(uint64_t eid, zed_strings_t *zsp, + const char *path) +{ + const char *env_restrict[][2] = { + { "IFS", " \t\n" }, + { "PATH", _PATH_STDPATH }, + { "ZDB", SBINDIR "/zdb" }, + { "ZED", SBINDIR "/zed" }, + { "ZFS", SBINDIR "/zfs" }, + { "ZINJECT", SBINDIR "/zinject" }, + { "ZPOOL", SBINDIR "/zpool" }, + { "ZFS_ALIAS", ZFS_META_ALIAS }, + { "ZFS_VERSION", ZFS_META_VERSION }, + { "ZFS_RELEASE", ZFS_META_RELEASE }, + { NULL, NULL } + }; + + /* + * If we have a custom $PATH, use the default ZFS binary locations + * instead of the hard-coded ones. + */ + const char *env_path[][2] = { + { "IFS", " \t\n" }, + { "PATH", NULL }, /* $PATH copied in later on */ + { "ZDB", "zdb" }, + { "ZED", "zed" }, + { "ZFS", "zfs" }, + { "ZINJECT", "zinject" }, + { "ZPOOL", "zpool" }, + { "ZFS_ALIAS", ZFS_META_ALIAS }, + { "ZFS_VERSION", ZFS_META_VERSION }, + { "ZFS_RELEASE", ZFS_META_RELEASE }, + { NULL, NULL } + }; + const char *(*pa)[2]; + + assert(zsp != NULL); + + pa = path != NULL ? env_path : env_restrict; + + for (; *(*pa); pa++) { + /* Use our custom $PATH if we have one */ + if (path != NULL && strcmp((*pa)[0], "PATH") == 0) + (*pa)[1] = path; + + _zed_event_add_var(eid, zsp, NULL, (*pa)[0], "%s", (*pa)[1]); + } +} + +/* + * Preserve specified variables from the parent environment + * when constructing the environment for the child process. + * + * Reference: Secure Programming Cookbook by Viega & Messier, Section 1.1. 
+ */ +static void +_zed_event_add_env_preserve(uint64_t eid, zed_strings_t *zsp) +{ + const char *env_preserve[] = { + "TZ", + NULL + }; + const char **keyp; + const char *val; + + assert(zsp != NULL); + + for (keyp = env_preserve; *keyp; keyp++) { + if ((val = getenv(*keyp))) + _zed_event_add_var(eid, zsp, NULL, *keyp, "%s", val); + } +} + +/* + * Compute the "subclass" by removing the first 3 components of [class] + * (which will always be of the form "*.fs.zfs"). Return a pointer inside + * the string [class], or NULL if insufficient components exist. + */ +static const char * +_zed_event_get_subclass(const char *class) +{ + const char *p; + int i; + + if (!class) + return (NULL); + + p = class; + for (i = 0; i < 3; i++) { + p = strchr(p, '.'); + if (!p) + break; + p++; + } + return (p); +} + +/* + * Convert the zevent time from a 2-element array of 64b integers + * into a more convenient form: + * - TIME_SECS is the second component of the time. + * - TIME_NSECS is the nanosecond component of the time. + * - TIME_STRING is an almost-RFC3339-compliant string representation. + */ +static void +_zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) +{ + struct tm *stp; + char buf[32]; + + assert(zsp != NULL); + assert(etime != NULL); + + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_SECS", + "%lld", (long long int) etime[0]); + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_NSECS", + "%lld", (long long int) etime[1]); + + if (!(stp = localtime((const time_t *) &etime[0]))) { + zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", + ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "localtime error"); + } else if (!strftime(buf, sizeof (buf), "%Y-%m-%d %H:%M:%S%z", stp)) { + zed_log_msg(LOG_WARNING, "Failed to add %s%s for eid=%llu: %s", + ZEVENT_VAR_PREFIX, "TIME_STRING", eid, "strftime error"); + } else { + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "TIME_STRING", + "%s", buf); + } +} + +/* + * Service the next zevent, blocking until one is available. + */ +int +zed_event_service(struct zed_conf *zcp) +{ + nvlist_t *nvl; + nvpair_t *nvp; + int n_dropped; + zed_strings_t *zsp; + uint64_t eid; + int64_t *etime; + uint_t nelem; + char *class; + const char *subclass; + int rv; + + if (!zcp) { + errno = EINVAL; + zed_log_msg(LOG_ERR, "Failed to service zevent: %s", + strerror(errno)); + return (EINVAL); + } + rv = zpool_events_next(zcp->zfs_hdl, &nvl, &n_dropped, ZEVENT_NONE, + zcp->zevent_fd); + + if ((rv != 0) || !nvl) + return (errno); + + if (n_dropped > 0) { + zed_log_msg(LOG_WARNING, "Missed %d events", n_dropped); + /* + * FIXME: Increase max size of event nvlist in + * /sys/module/zfs/parameters/zfs_zevent_len_max ? 
+ */ + } + if (nvlist_lookup_uint64(nvl, "eid", &eid) != 0) { + zed_log_msg(LOG_WARNING, "Failed to lookup zevent eid"); + } else if (nvlist_lookup_int64_array( + nvl, "time", &etime, &nelem) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu)", eid); + } else if (nelem != 2) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent time (eid=%llu, nelem=%u)", + eid, nelem); + } else if (nvlist_lookup_string(nvl, "class", &class) != 0) { + zed_log_msg(LOG_WARNING, + "Failed to lookup zevent class (eid=%llu)", eid); + } else { + /* let internal modules see this event first */ + zfs_agent_post_event(class, NULL, nvl); + + zsp = zed_strings_create(); + + nvp = NULL; + while ((nvp = nvlist_next_nvpair(nvl, nvp))) + _zed_event_add_nvpair(eid, zsp, nvp); + + _zed_event_add_env_restrict(eid, zsp, zcp->path); + _zed_event_add_env_preserve(eid, zsp); + + _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "PID", + "%d", (int)getpid()); + _zed_event_add_var(eid, zsp, ZED_VAR_PREFIX, "ZEDLET_DIR", + "%s", zcp->zedlet_dir); + subclass = _zed_event_get_subclass(class); + _zed_event_add_var(eid, zsp, ZEVENT_VAR_PREFIX, "SUBCLASS", + "%s", (subclass ? subclass : class)); + + _zed_event_add_time_strings(eid, zsp, etime); + + zed_exec_process(eid, class, subclass, + zcp->zedlet_dir, zcp->zedlets, zsp, zcp->zevent_fd); + + zed_conf_write_state(zcp, eid, etime); + + zed_strings_destroy(zsp); + } + nvlist_free(nvl); + return (0); +} diff --git a/cmd/zed/zed_event.h b/cmd/zed/zed_event.h new file mode 100644 index 000000000000..c1455c3a0629 --- /dev/null +++ b/cmd/zed/zed_event.h @@ -0,0 +1,29 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_EVENT_H +#define ZED_EVENT_H + +#include + +int zed_event_init(struct zed_conf *zcp); + +void zed_event_fini(struct zed_conf *zcp); + +int zed_event_seek(struct zed_conf *zcp, uint64_t saved_eid, + int64_t saved_etime[]); + +int zed_event_service(struct zed_conf *zcp); + +#endif /* !ZED_EVENT_H */ diff --git a/cmd/zed/zed_exec.c b/cmd/zed/zed_exec.c new file mode 100644 index 000000000000..08b7b5568362 --- /dev/null +++ b/cmd/zed/zed_exec.c @@ -0,0 +1,232 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. 
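[Editor's note: from a zedlet's point of view, the result of zed_event_service() above is plain environment variables, one per nvpair. A sketch of the consumer side; the ZEVENT_ prefix is assumed to be ZEVENT_VAR_PREFIX from zed.h, which is not shown in this hunk.]

#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
        /* set by zed before the zedlet is exec'd */
        const char *eid = getenv("ZEVENT_EID");
        const char *subclass = getenv("ZEVENT_SUBCLASS");
        const char *time_str = getenv("ZEVENT_TIME_STRING");

        printf("eid=%s subclass=%s time=%s\n",
            eid ? eid : "?", subclass ? subclass : "?",
            time_str ? time_str : "?");
        return (0);
}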
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zed_exec.h" +#include "zed_file.h" +#include "zed_log.h" +#include "zed_strings.h" + +#define ZEVENT_FILENO 3 + +/* + * Create an environment string array for passing to execve() using the + * NAME=VALUE strings in container [zsp]. + * Return a newly-allocated environment, or NULL on error. + */ +static char ** +_zed_exec_create_env(zed_strings_t *zsp) +{ + int num_ptrs; + int buflen; + char *buf; + char **pp; + char *p; + const char *q; + int i; + int len; + + num_ptrs = zed_strings_count(zsp) + 1; + buflen = num_ptrs * sizeof (char *); + for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) + buflen += strlen(q) + 1; + + buf = calloc(1, buflen); + if (!buf) + return (NULL); + + pp = (char **)buf; + p = buf + (num_ptrs * sizeof (char *)); + i = 0; + for (q = zed_strings_first(zsp); q; q = zed_strings_next(zsp)) { + pp[i] = p; + len = strlen(q) + 1; + memcpy(p, q, len); + p += len; + i++; + } + pp[i] = NULL; + assert(buf + buflen == p); + return ((char **)buf); +} + +/* + * Fork a child process to handle event [eid]. The program [prog] + * in directory [dir] is executed with the environment [env]. + * + * The file descriptor [zfd] is the zevent_fd used to track the + * current cursor location within the zevent nvlist. + */ +static void +_zed_exec_fork_child(uint64_t eid, const char *dir, const char *prog, + char *env[], int zfd) +{ + char path[PATH_MAX]; + int n; + pid_t pid; + int fd; + pid_t wpid; + int status; + + assert(dir != NULL); + assert(prog != NULL); + assert(env != NULL); + assert(zfd >= 0); + + n = snprintf(path, sizeof (path), "%s/%s", dir, prog); + if ((n < 0) || (n >= sizeof (path))) { + zed_log_msg(LOG_WARNING, + "Failed to fork \"%s\" for eid=%llu: %s", + prog, eid, strerror(ENAMETOOLONG)); + return; + } + pid = fork(); + if (pid < 0) { + zed_log_msg(LOG_WARNING, + "Failed to fork \"%s\" for eid=%llu: %s", + prog, eid, strerror(errno)); + return; + } else if (pid == 0) { + (void) umask(022); + if ((fd = open("/dev/null", O_RDWR)) != -1) { + (void) dup2(fd, STDIN_FILENO); + (void) dup2(fd, STDOUT_FILENO); + (void) dup2(fd, STDERR_FILENO); + } + (void) dup2(zfd, ZEVENT_FILENO); + zed_file_close_from(ZEVENT_FILENO + 1); + execle(path, prog, NULL, env); + _exit(127); + } + + /* parent process */ + + zed_log_msg(LOG_INFO, "Invoking \"%s\" eid=%llu pid=%d", + prog, eid, pid); + + /* FIXME: Timeout rogue child processes with sigalarm? */ + + /* + * Wait for child process using WNOHANG to limit + * the time spent waiting to 10 seconds (10,000ms). 
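[Editor's note: the single-allocation layout built by _zed_exec_create_env() above deserves a note: the pointer table and the string bytes live in one calloc() block, so the whole environment is released with a single free(). A reduced sketch of the same packing over a plain array input (hypothetical helper, not part of the patch); free(pp) releases both the table and the strings.]

#include <stdlib.h>
#include <string.h>

static char **
pack_env(const char *kv[], int n)
{
        size_t buflen = (n + 1) * sizeof (char *);
        char **pp;
        char *p;
        int i;

        for (i = 0; i < n; i++)
                buflen += strlen(kv[i]) + 1;
        if ((pp = calloc(1, buflen)) == NULL)
                return (NULL);
        /* string storage begins right after the pointer table */
        p = (char *)pp + (n + 1) * sizeof (char *);
        for (i = 0; i < n; i++) {
                size_t len = strlen(kv[i]) + 1;
                pp[i] = memcpy(p, kv[i], len);
                p += len;
        }
        pp[n] = NULL;   /* execve()-style terminator */
        return (pp);
}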
+ */ + for (n = 0; n < 1000; n++) { + wpid = waitpid(pid, &status, WNOHANG); + if (wpid == (pid_t)-1) { + if (errno == EINTR) + continue; + zed_log_msg(LOG_WARNING, + "Failed to wait for \"%s\" eid=%llu pid=%d", + prog, eid, pid); + break; + } else if (wpid == 0) { + struct timespec t; + + /* child still running */ + t.tv_sec = 0; + t.tv_nsec = 10000000; /* 10ms */ + (void) nanosleep(&t, NULL); + continue; + } + + if (WIFEXITED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d exit=%d", + prog, eid, pid, WEXITSTATUS(status)); + } else if (WIFSIGNALED(status)) { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d sig=%d/%s", + prog, eid, pid, WTERMSIG(status), + strsignal(WTERMSIG(status))); + } else { + zed_log_msg(LOG_INFO, + "Finished \"%s\" eid=%llu pid=%d status=0x%X", + prog, eid, (unsigned int) status); + } + break; + } + + /* + * kill child process after 10 seconds + */ + if (wpid == 0) { + zed_log_msg(LOG_WARNING, "Killing hung \"%s\" pid=%d", + prog, pid); + (void) kill(pid, SIGKILL); + } +} + +/* + * Process the event [eid] by synchronously invoking all zedlets with a + * matching class prefix. + * + * Each executable in [zedlets] from the directory [dir] is matched against + * the event's [class], [subclass], and the "all" class (which matches + * all events). Every zedlet with a matching class prefix is invoked. + * The NAME=VALUE strings in [envs] will be passed to the zedlet as + * environment variables. + * + * The file descriptor [zfd] is the zevent_fd used to track the + * current cursor location within the zevent nvlist. + * + * Return 0 on success, -1 on error. + */ +int +zed_exec_process(uint64_t eid, const char *class, const char *subclass, + const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, int zfd) +{ + const char *class_strings[4]; + const char *allclass = "all"; + const char **csp; + const char *z; + char **e; + int n; + + if (!dir || !zedlets || !envs || zfd < 0) + return (-1); + + csp = class_strings; + + if (class) + *csp++ = class; + + if (subclass) + *csp++ = subclass; + + if (allclass) + *csp++ = allclass; + + *csp = NULL; + + e = _zed_exec_create_env(envs); + + for (z = zed_strings_first(zedlets); z; z = zed_strings_next(zedlets)) { + for (csp = class_strings; *csp; csp++) { + n = strlen(*csp); + if ((strncmp(z, *csp, n) == 0) && !isalpha(z[n])) + _zed_exec_fork_child(eid, dir, z, e, zfd); + } + } + free(e); + return (0); +} diff --git a/cmd/zed/zed_exec.h b/cmd/zed/zed_exec.h new file mode 100644 index 000000000000..4153e5519a46 --- /dev/null +++ b/cmd/zed/zed_exec.h @@ -0,0 +1,25 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. 
+ */ + +#ifndef ZED_EXEC_H +#define ZED_EXEC_H + +#include +#include "zed_strings.h" + +int zed_exec_process(uint64_t eid, const char *class, const char *subclass, + const char *dir, zed_strings_t *zedlets, zed_strings_t *envs, + int zevent_fd); + +#endif /* !ZED_EXEC_H */ diff --git a/cmd/zed/zed_file.c b/cmd/zed/zed_file.c new file mode 100644 index 000000000000..c3cf3d421c6f --- /dev/null +++ b/cmd/zed/zed_file.c @@ -0,0 +1,217 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "zed_file.h" +#include "zed_log.h" + +/* + * Read up to [n] bytes from [fd] into [buf]. + * Return the number of bytes read, 0 on EOF, or -1 on error. + */ +ssize_t +zed_file_read_n(int fd, void *buf, size_t n) +{ + unsigned char *p; + size_t n_left; + ssize_t n_read; + + p = buf; + n_left = n; + while (n_left > 0) { + if ((n_read = read(fd, p, n_left)) < 0) { + if (errno == EINTR) + continue; + else + return (-1); + + } else if (n_read == 0) { + break; + } + n_left -= n_read; + p += n_read; + } + return (n - n_left); +} + +/* + * Write [n] bytes from [buf] out to [fd]. + * Return the number of bytes written, or -1 on error. + */ +ssize_t +zed_file_write_n(int fd, void *buf, size_t n) +{ + const unsigned char *p; + size_t n_left; + ssize_t n_written; + + p = buf; + n_left = n; + while (n_left > 0) { + if ((n_written = write(fd, p, n_left)) < 0) { + if (errno == EINTR) + continue; + else + return (-1); + + } + n_left -= n_written; + p += n_written; + } + return (n); +} + +/* + * Set an exclusive advisory lock on the open file descriptor [fd]. + * Return 0 on success, 1 if a conflicting lock is held by another process, + * or -1 on error (with errno set). + */ +int +zed_file_lock(int fd) +{ + struct flock lock; + + if (fd < 0) { + errno = EBADF; + return (-1); + } + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + + if (fcntl(fd, F_SETLK, &lock) < 0) { + if ((errno == EACCES) || (errno == EAGAIN)) + return (1); + + return (-1); + } + return (0); +} + +/* + * Release an advisory lock held on the open file descriptor [fd]. + * Return 0 on success, or -1 on error (with errno set). + */ +int +zed_file_unlock(int fd) +{ + struct flock lock; + + if (fd < 0) { + errno = EBADF; + return (-1); + } + lock.l_type = F_UNLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + + if (fcntl(fd, F_SETLK, &lock) < 0) + return (-1); + + return (0); +} + +/* + * Test whether an exclusive advisory lock could be obtained for the open + * file descriptor [fd]. + * Return 0 if the file is not locked, >0 for the PID of another process + * holding a conflicting lock, or -1 on error (with errno set). 
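[Editor's note: taken together, zed_file_lock() and zed_file_is_locked() give the usual single-instance pattern: take the write lock, and on conflict ask the kernel which PID holds it. A usage sketch against the API declared in zed_file.h, mirroring the zed_conf_open_state() sequence earlier in this patch.]

#include <stdio.h>
#include <unistd.h>
#include "zed_file.h"

static int
claim_single_instance(int fd, const char *path)
{
        int rv = zed_file_lock(fd);

        if (rv < 0)
                return (-1);            /* fcntl error */
        if (rv > 0) {                   /* conflicting lock exists */
                pid_t pid = zed_file_is_locked(fd);
                if (pid > 0)
                        fprintf(stderr, "%s held by PID %d\n",
                            path, (int)pid);
                return (-1);
        }
        return (0);     /* lock persists until close(fd) or exit */
}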
+ */ +pid_t +zed_file_is_locked(int fd) +{ + struct flock lock; + + if (fd < 0) { + errno = EBADF; + return (-1); + } + lock.l_type = F_WRLCK; + lock.l_whence = SEEK_SET; + lock.l_start = 0; + lock.l_len = 0; + + if (fcntl(fd, F_GETLK, &lock) < 0) + return (-1); + + if (lock.l_type == F_UNLCK) + return (0); + + return (lock.l_pid); +} + +/* + * Close all open file descriptors greater than or equal to [lowfd]. + * Any errors encountered while closing file descriptors are ignored. + */ +void +zed_file_close_from(int lowfd) +{ + const int maxfd_def = 256; + int errno_bak; + struct rlimit rl; + int maxfd; + int fd; + + errno_bak = errno; + + if (getrlimit(RLIMIT_NOFILE, &rl) < 0) { + maxfd = maxfd_def; + } else if (rl.rlim_max == RLIM_INFINITY) { + maxfd = maxfd_def; + } else { + maxfd = rl.rlim_max; + } + for (fd = lowfd; fd < maxfd; fd++) + (void) close(fd); + + errno = errno_bak; +} + +/* + * Set the CLOEXEC flag on file descriptor [fd] so it will be automatically + * closed upon successful execution of one of the exec functions. + * Return 0 on success, or -1 on error. + * + * FIXME: No longer needed? + */ +int +zed_file_close_on_exec(int fd) +{ + int flags; + + if (fd < 0) { + errno = EBADF; + return (-1); + } + flags = fcntl(fd, F_GETFD); + if (flags == -1) + return (-1); + + flags |= FD_CLOEXEC; + + if (fcntl(fd, F_SETFD, flags) == -1) + return (-1); + + return (0); +} diff --git a/cmd/zed/zed_file.h b/cmd/zed/zed_file.h new file mode 100644 index 000000000000..05f360d20efd --- /dev/null +++ b/cmd/zed/zed_file.h @@ -0,0 +1,35 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_FILE_H +#define ZED_FILE_H + +#include +#include + +ssize_t zed_file_read_n(int fd, void *buf, size_t n); + +ssize_t zed_file_write_n(int fd, void *buf, size_t n); + +int zed_file_lock(int fd); + +int zed_file_unlock(int fd); + +pid_t zed_file_is_locked(int fd); + +void zed_file_close_from(int fd); + +int zed_file_close_on_exec(int fd); + +#endif /* !ZED_FILE_H */ diff --git a/cmd/zed/zed_log.c b/cmd/zed/zed_log.c new file mode 100644 index 000000000000..5a3f2dbdb832 --- /dev/null +++ b/cmd/zed/zed_log.c @@ -0,0 +1,256 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zed_log.h" + +#define ZED_LOG_MAX_LOG_LEN 1024 + +static struct { + unsigned do_stderr:1; + unsigned do_syslog:1; + const char *identity; + int priority; + int pipe_fd[2]; +} _ctx; + +/* + * Initialize the logging subsystem. + */ +void +zed_log_init(const char *identity) +{ + if (identity) { + const char *p = strrchr(identity, '/'); + _ctx.identity = (p != NULL) ? p + 1 : identity; + } else { + _ctx.identity = NULL; + } + _ctx.pipe_fd[0] = -1; + _ctx.pipe_fd[1] = -1; +} + +/* + * Shutdown the logging subsystem. + */ +void +zed_log_fini(void) +{ + zed_log_stderr_close(); + zed_log_syslog_close(); +} + +/* + * Create pipe for communicating daemonization status between the parent and + * child processes across the double-fork(). + */ +void +zed_log_pipe_open(void) +{ + if ((_ctx.pipe_fd[0] != -1) || (_ctx.pipe_fd[1] != -1)) + zed_log_die("Invalid use of zed_log_pipe_open in PID %d", + (int)getpid()); + + if (pipe(_ctx.pipe_fd) < 0) + zed_log_die("Failed to create daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); +} + +/* + * Close the read-half of the daemonize pipe. + * + * This should be called by the child after fork()ing from the parent since + * the child will never read from this pipe. + */ +void +zed_log_pipe_close_reads(void) +{ + if (_ctx.pipe_fd[0] < 0) + zed_log_die( + "Invalid use of zed_log_pipe_close_reads in PID %d", + (int)getpid()); + + if (close(_ctx.pipe_fd[0]) < 0) + zed_log_die( + "Failed to close reads on daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); + + _ctx.pipe_fd[0] = -1; +} + +/* + * Close the write-half of the daemonize pipe. + * + * This should be called by the parent after fork()ing its child since the + * parent will never write to this pipe. + * + * This should also be called by the child once initialization is complete + * in order to signal the parent that it can safely exit. + */ +void +zed_log_pipe_close_writes(void) +{ + if (_ctx.pipe_fd[1] < 0) + zed_log_die( + "Invalid use of zed_log_pipe_close_writes in PID %d", + (int)getpid()); + + if (close(_ctx.pipe_fd[1]) < 0) + zed_log_die( + "Failed to close writes on daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); + + _ctx.pipe_fd[1] = -1; +} + +/* + * Block on reading from the daemonize pipe until signaled by the child + * (via zed_log_pipe_close_writes()) that initialization is complete. + * + * This should only be called by the parent while waiting to exit after + * fork()ing the child. + */ +void +zed_log_pipe_wait(void) +{ + ssize_t n; + char c; + + if (_ctx.pipe_fd[0] < 0) + zed_log_die("Invalid use of zed_log_pipe_wait in PID %d", + (int)getpid()); + + for (;;) { + n = read(_ctx.pipe_fd[0], &c, sizeof (c)); + if (n < 0) { + if (errno == EINTR) + continue; + zed_log_die( + "Failed to read from daemonize pipe in PID %d: %s", + (int)getpid(), strerror(errno)); + } + if (n == 0) { + break; + } + } +} + +/* + * Start logging messages at the syslog [priority] level or higher to stderr. + * Refer to syslog(3) for valid priority values. + */ +void +zed_log_stderr_open(int priority) +{ + _ctx.do_stderr = 1; + _ctx.priority = priority; +} + +/* + * Stop logging messages to stderr. + */ +void +zed_log_stderr_close(void) +{ + if (_ctx.do_stderr) + _ctx.do_stderr = 0; +} + +/* + * Start logging messages to syslog. + * Refer to syslog(3) for valid option/facility values. 
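[Editor's note: the daemonize pipe above is the classic readiness handshake: the parent blocks on the read end until the child closes the write end after finishing initialization, so the parent can exit knowing the daemon is live. A bare-bones standalone sketch of the same shape.]

#include <errno.h>
#include <stdlib.h>
#include <unistd.h>

int
main(void)
{
        int pfd[2];
        char c;
        ssize_t n;

        if (pipe(pfd) < 0)
                abort();
        switch (fork()) {
        case -1:
                abort();
        case 0:                                 /* child (daemon) */
                (void) close(pfd[0]);
                /* ... initialization would happen here ... */
                (void) close(pfd[1]);           /* EOF signals "ready" */
                /* ... daemon main loop ... */
                _exit(0);
        default:                                /* parent */
                (void) close(pfd[1]);
                while ((n = read(pfd[0], &c, sizeof (c))) != 0)
                        if (n < 0 && errno != EINTR)
                                abort();
                _exit(0);                       /* child reported ready */
        }
        return (0);
}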
+ */ +void +zed_log_syslog_open(int facility) +{ + _ctx.do_syslog = 1; + openlog(_ctx.identity, LOG_NDELAY | LOG_PID, facility); +} + +/* + * Stop logging messages to syslog. + */ +void +zed_log_syslog_close(void) +{ + if (_ctx.do_syslog) { + _ctx.do_syslog = 0; + closelog(); + } +} + +/* + * Auxiliary function to log a message to syslog and/or stderr. + */ +static void +_zed_log_aux(int priority, const char *fmt, va_list vargs) +{ + char buf[ZED_LOG_MAX_LOG_LEN]; + int n; + + if (!fmt) + return; + + n = vsnprintf(buf, sizeof (buf), fmt, vargs); + if ((n < 0) || (n >= sizeof (buf))) { + buf[sizeof (buf) - 2] = '+'; + buf[sizeof (buf) - 1] = '\0'; + } + + if (_ctx.do_syslog) + syslog(priority, "%s", buf); + + if (_ctx.do_stderr && (priority <= _ctx.priority)) + fprintf(stderr, "%s\n", buf); +} + +/* + * Log a message at the given [priority] level specified by the printf-style + * format string [fmt]. + */ +void +zed_log_msg(int priority, const char *fmt, ...) +{ + va_list vargs; + + if (fmt) { + va_start(vargs, fmt); + _zed_log_aux(priority, fmt, vargs); + va_end(vargs); + } +} + +/* + * Log a fatal error message specified by the printf-style format string [fmt]. + */ +void +zed_log_die(const char *fmt, ...) +{ + va_list vargs; + + if (fmt) { + va_start(vargs, fmt); + _zed_log_aux(LOG_ERR, fmt, vargs); + va_end(vargs); + } + exit(EXIT_FAILURE); +} diff --git a/cmd/zed/zed_log.h b/cmd/zed/zed_log.h new file mode 100644 index 000000000000..a03a4f53967c --- /dev/null +++ b/cmd/zed/zed_log.h @@ -0,0 +1,44 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_LOG_H +#define ZED_LOG_H + +#include + +void zed_log_init(const char *identity); + +void zed_log_fini(void); + +void zed_log_pipe_open(void); + +void zed_log_pipe_close_reads(void); + +void zed_log_pipe_close_writes(void); + +void zed_log_pipe_wait(void); + +void zed_log_stderr_open(int priority); + +void zed_log_stderr_close(void); + +void zed_log_syslog_open(int facility); + +void zed_log_syslog_close(void); + +void zed_log_msg(int priority, const char *fmt, ...); + +void zed_log_die(const char *fmt, ...); + +#endif /* !ZED_LOG_H */ diff --git a/cmd/zed/zed_strings.c b/cmd/zed/zed_strings.c new file mode 100644 index 000000000000..6b1c669d71f4 --- /dev/null +++ b/cmd/zed/zed_strings.c @@ -0,0 +1,247 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include "zed_strings.h" + +struct zed_strings { + avl_tree_t tree; + avl_node_t *iteratorp; +}; + +struct zed_strings_node { + avl_node_t node; + char *key; + char *val; +}; + +typedef struct zed_strings_node zed_strings_node_t; + +/* + * Compare zed_strings_node_t nodes [x1] and [x2]. + * As required for the AVL tree, return -1 for <, 0 for ==, and +1 for >. + */ +static int +_zed_strings_node_compare(const void *x1, const void *x2) +{ + const char *s1; + const char *s2; + int rv; + + assert(x1 != NULL); + assert(x2 != NULL); + + s1 = ((const zed_strings_node_t *) x1)->key; + assert(s1 != NULL); + s2 = ((const zed_strings_node_t *) x2)->key; + assert(s2 != NULL); + rv = strcmp(s1, s2); + + if (rv < 0) + return (-1); + + if (rv > 0) + return (1); + + return (0); +} + +/* + * Return a new string container, or NULL on error. + */ +zed_strings_t * +zed_strings_create(void) +{ + zed_strings_t *zsp; + + zsp = calloc(1, sizeof (*zsp)); + if (!zsp) + return (NULL); + + avl_create(&zsp->tree, _zed_strings_node_compare, + sizeof (zed_strings_node_t), offsetof(zed_strings_node_t, node)); + + zsp->iteratorp = NULL; + return (zsp); +} + +/* + * Destroy the string node [np]. + */ +static void +_zed_strings_node_destroy(zed_strings_node_t *np) +{ + if (!np) + return; + + if (np->key) { + if (np->key != np->val) + free(np->key); + np->key = NULL; + } + if (np->val) { + free(np->val); + np->val = NULL; + } + free(np); +} + +/* + * Return a new string node for storing the string [val], or NULL on error. + * If [key] is specified, it will be used to index the node; otherwise, + * the string [val] will be used. + */ +static zed_strings_node_t * +_zed_strings_node_create(const char *key, const char *val) +{ + zed_strings_node_t *np; + + assert(val != NULL); + + np = calloc(1, sizeof (*np)); + if (!np) + return (NULL); + + np->val = strdup(val); + if (!np->val) + goto nomem; + + if (key) { + np->key = strdup(key); + if (!np->key) + goto nomem; + } else { + np->key = np->val; + } + return (np); + +nomem: + _zed_strings_node_destroy(np); + return (NULL); +} + +/* + * Destroy the string container [zsp] and all nodes within. + */ +void +zed_strings_destroy(zed_strings_t *zsp) +{ + void *cookie; + zed_strings_node_t *np; + + if (!zsp) + return; + + cookie = NULL; + while ((np = avl_destroy_nodes(&zsp->tree, &cookie))) + _zed_strings_node_destroy(np); + + avl_destroy(&zsp->tree); + free(zsp); +} + +/* + * Add a copy of the string [s] indexed by [key] to the container [zsp]. + * If [key] already exists within the container [zsp], it will be replaced + * with the new string [s]. + * If [key] is NULL, the string [s] will be used as the key. + * Return 0 on success, or -1 on error. + */ +int +zed_strings_add(zed_strings_t *zsp, const char *key, const char *s) +{ + zed_strings_node_t *newp, *oldp; + + if (!zsp || !s) { + errno = EINVAL; + return (-1); + } + if (key == s) + key = NULL; + + newp = _zed_strings_node_create(key, s); + if (!newp) + return (-1); + + oldp = avl_find(&zsp->tree, newp, NULL); + if (oldp) { + avl_remove(&zsp->tree, oldp); + _zed_strings_node_destroy(oldp); + } + avl_add(&zsp->tree, newp); + return (0); +} + +/* + * Return the first string in container [zsp]. + * Return NULL if there are no strings, or on error. + * This can be called multiple times to re-traverse [zsp]. + * XXX: Not thread-safe. 
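+ *
+ * A typical single-threaded traversal might look like:
+ *
+ *	const char *s;
+ *
+ *	for (s = zed_strings_first(zsp); s != NULL;
+ *	    s = zed_strings_next(zsp))
+ *		(void) printf("%s\n", s);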
+ */ +const char * +zed_strings_first(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (NULL); + } + zsp->iteratorp = avl_first(&zsp->tree); + if (!zsp->iteratorp) + return (NULL); + + return (((zed_strings_node_t *)zsp->iteratorp)->val); + +} + +/* + * Return the next string in container [zsp]. + * Return NULL after the last string, or on error. + * This must be called after zed_strings_first(). + * XXX: Not thread-safe. + */ +const char * +zed_strings_next(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (NULL); + } + if (!zsp->iteratorp) + return (NULL); + + zsp->iteratorp = AVL_NEXT(&zsp->tree, zsp->iteratorp); + if (!zsp->iteratorp) + return (NULL); + + return (((zed_strings_node_t *)zsp->iteratorp)->val); +} + +/* + * Return the number of strings in container [zsp], or -1 on error. + */ +int +zed_strings_count(zed_strings_t *zsp) +{ + if (!zsp) { + errno = EINVAL; + return (-1); + } + return (avl_numnodes(&zsp->tree)); +} diff --git a/cmd/zed/zed_strings.h b/cmd/zed/zed_strings.h new file mode 100644 index 000000000000..37a84cad7ffc --- /dev/null +++ b/cmd/zed/zed_strings.h @@ -0,0 +1,27 @@ +/* + * This file is part of the ZFS Event Daemon (ZED) + * for ZFS on Linux (ZoL) . + * Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049). + * Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC. + * Refer to the ZoL git commit log for authoritative copyright attribution. + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License Version 1.0 (CDDL-1.0). + * You can obtain a copy of the license from the top-level file + * "OPENSOLARIS.LICENSE" or at . + * You may not use this file except in compliance with the license. + */ + +#ifndef ZED_STRINGS_H +#define ZED_STRINGS_H + +typedef struct zed_strings zed_strings_t; + +zed_strings_t *zed_strings_create(void); +void zed_strings_destroy(zed_strings_t *zsp); +int zed_strings_add(zed_strings_t *zsp, const char *key, const char *s); +const char *zed_strings_first(zed_strings_t *zsp); +const char *zed_strings_next(zed_strings_t *zsp); +int zed_strings_count(zed_strings_t *zsp); + +#endif /* !ZED_STRINGS_H */ diff --git a/cmd/zfs/.gitignore b/cmd/zfs/.gitignore new file mode 100644 index 000000000000..0fd9cc63af2a --- /dev/null +++ b/cmd/zfs/.gitignore @@ -0,0 +1 @@ +/zfs diff --git a/cmd/zfs/Makefile.am b/cmd/zfs/Makefile.am new file mode 100644 index 000000000000..dec5920381d5 --- /dev/null +++ b/cmd/zfs/Makefile.am @@ -0,0 +1,23 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zfs + +zfs_SOURCES = \ + zfs_iter.c \ + zfs_iter.h \ + zfs_main.c \ + zfs_util.h \ + zfs_project.c \ + zfs_projectutil.h + +zfs_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +zfs_LDADD += $(LTLIBINTL) + +if BUILD_FREEBSD +zfs_LDADD += -lgeom -ljail +endif diff --git a/cmd/zfs/zfs_iter.c b/cmd/zfs/zfs_iter.c new file mode 100644 index 000000000000..f2359508c16d --- /dev/null +++ b/cmd/zfs/zfs_iter.c @@ -0,0 +1,512 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Pawel Jakub Dawidek . + * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "zfs_util.h" +#include "zfs_iter.h" + +/* + * This is a private interface used to gather up all the datasets specified on + * the command line so that we can iterate over them in order. + * + * First, we iterate over all filesystems, gathering them together into an + * AVL tree. We report errors for any explicitly specified datasets + * that we couldn't open. + * + * When finished, we have an AVL tree of ZFS handles. We go through and execute + * the provided callback for each one, passing whatever data the user supplied. + */ + +typedef struct zfs_node { + zfs_handle_t *zn_handle; + uu_avl_node_t zn_avlnode; +} zfs_node_t; + +typedef struct callback_data { + uu_avl_t *cb_avl; + int cb_flags; + zfs_type_t cb_types; + zfs_sort_column_t *cb_sortcol; + zprop_list_t **cb_proplist; + int cb_depth_limit; + int cb_depth; + uint8_t cb_props_table[ZFS_NUM_PROPS]; +} callback_data_t; + +uu_avl_pool_t *avl_pool; + +/* + * Include snaps if they were requested or if this a zfs list where types + * were not specified and the "listsnapshots" property is set on this pool. + */ +static boolean_t +zfs_include_snapshots(zfs_handle_t *zhp, callback_data_t *cb) +{ + zpool_handle_t *zph; + + if ((cb->cb_flags & ZFS_ITER_PROP_LISTSNAPS) == 0) + return (cb->cb_types & ZFS_TYPE_SNAPSHOT); + + zph = zfs_get_pool_handle(zhp); + return (zpool_get_prop_int(zph, ZPOOL_PROP_LISTSNAPS, NULL)); +} + +/* + * Called for each dataset. If the object is of an appropriate type, + * add it to the avl tree and recurse over any children as necessary. + */ +static int +zfs_callback(zfs_handle_t *zhp, void *data) +{ + callback_data_t *cb = data; + boolean_t should_close = B_TRUE; + boolean_t include_snaps = zfs_include_snapshots(zhp, cb); + boolean_t include_bmarks = (cb->cb_types & ZFS_TYPE_BOOKMARK); + + if ((zfs_get_type(zhp) & cb->cb_types) || + ((zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) && include_snaps)) { + uu_avl_index_t idx; + zfs_node_t *node = safe_malloc(sizeof (zfs_node_t)); + + node->zn_handle = zhp; + uu_avl_node_init(node, &node->zn_avlnode, avl_pool); + if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol, + &idx) == NULL) { + if (cb->cb_proplist) { + if ((*cb->cb_proplist) && + !(*cb->cb_proplist)->pl_all) + zfs_prune_proplist(zhp, + cb->cb_props_table); + + if (zfs_expand_proplist(zhp, cb->cb_proplist, + (cb->cb_flags & ZFS_ITER_RECVD_PROPS), + (cb->cb_flags & ZFS_ITER_LITERAL_PROPS)) + != 0) { + free(node); + return (-1); + } + } + uu_avl_insert(cb->cb_avl, node, idx); + should_close = B_FALSE; + } else { + free(node); + } + } + + /* + * Recurse if necessary. 
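+	 * The depth counter is incremented around the recursive calls
+	 * below, so a limit set via ZFS_ITER_DEPTH_LIMIT stops descent
+	 * that many levels under each argument (e.g. "zfs list -d 1
+	 * pool" lists the pool dataset and its immediate children only).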
+ */ + if (cb->cb_flags & ZFS_ITER_RECURSE && + ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + cb->cb_depth < cb->cb_depth_limit)) { + cb->cb_depth++; + + /* + * If we are not looking for filesystems, we don't need to + * recurse into filesystems when we are at our depth limit. + */ + if ((cb->cb_depth < cb->cb_depth_limit || + (cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + (cb->cb_types & + (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME))) && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) zfs_iter_filesystems(zhp, zfs_callback, data); + } + + if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | + ZFS_TYPE_BOOKMARK)) == 0) && include_snaps) { + (void) zfs_iter_snapshots(zhp, + (cb->cb_flags & ZFS_ITER_SIMPLE) != 0, + zfs_callback, data, 0, 0); + } + + if (((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | + ZFS_TYPE_BOOKMARK)) == 0) && include_bmarks) { + (void) zfs_iter_bookmarks(zhp, zfs_callback, data); + } + + cb->cb_depth--; + } + + if (should_close) + zfs_close(zhp); + + return (0); +} + +int +zfs_add_sort_column(zfs_sort_column_t **sc, const char *name, + boolean_t reverse) +{ + zfs_sort_column_t *col; + zfs_prop_t prop; + + if ((prop = zfs_name_to_prop(name)) == ZPROP_INVAL && + !zfs_prop_user(name)) + return (-1); + + col = safe_malloc(sizeof (zfs_sort_column_t)); + + col->sc_prop = prop; + col->sc_reverse = reverse; + if (prop == ZPROP_INVAL) { + col->sc_user_prop = safe_malloc(strlen(name) + 1); + (void) strcpy(col->sc_user_prop, name); + } + + if (*sc == NULL) { + col->sc_last = col; + *sc = col; + } else { + (*sc)->sc_last->sc_next = col; + (*sc)->sc_last = col; + } + + return (0); +} + +void +zfs_free_sort_columns(zfs_sort_column_t *sc) +{ + zfs_sort_column_t *col; + + while (sc != NULL) { + col = sc->sc_next; + free(sc->sc_user_prop); + free(sc); + sc = col; + } +} + +int +zfs_sort_only_by_name(const zfs_sort_column_t *sc) +{ + return (sc != NULL && sc->sc_next == NULL && + sc->sc_prop == ZFS_PROP_NAME); +} + +/* ARGSUSED */ +static int +zfs_compare(const void *larg, const void *rarg, void *unused) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + const char *lname = zfs_get_name(l); + const char *rname = zfs_get_name(r); + char *lat, *rat; + uint64_t lcreate, rcreate; + int ret; + + lat = (char *)strchr(lname, '@'); + rat = (char *)strchr(rname, '@'); + + if (lat != NULL) + *lat = '\0'; + if (rat != NULL) + *rat = '\0'; + + ret = strcmp(lname, rname); + if (ret == 0 && (lat != NULL || rat != NULL)) { + /* + * If we're comparing a dataset to one of its snapshots, we + * always make the full dataset first. + */ + if (lat == NULL) { + ret = -1; + } else if (rat == NULL) { + ret = 1; + } else { + /* + * If we have two snapshots from the same dataset, then + * we want to sort them according to creation time. We + * use the hidden CREATETXG property to get an absolute + * ordering of snapshots. + */ + lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG); + rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG); + + /* + * Both lcreate and rcreate being 0 means we don't have + * properties and we should compare full name. + */ + if (lcreate == 0 && rcreate == 0) + ret = strcmp(lat + 1, rat + 1); + else if (lcreate < rcreate) + ret = -1; + else if (lcreate > rcreate) + ret = 1; + } + } + + if (lat != NULL) + *lat = '@'; + if (rat != NULL) + *rat = '@'; + + return (ret); +} + +/* + * Sort datasets by specified columns. + * + * o Numeric types sort in ascending order. + * o String types sort in alphabetical order. 
+ * o Types inappropriate for a row sort that row to the literal + * bottom, regardless of the specified ordering. + * + * If no sort columns are specified, or two datasets compare equally + * across all specified columns, they are sorted alphabetically by name + * with snapshots grouped under their parents. + */ +static int +zfs_sort(const void *larg, const void *rarg, void *data) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + zfs_sort_column_t *sc = (zfs_sort_column_t *)data; + zfs_sort_column_t *psc; + + for (psc = sc; psc != NULL; psc = psc->sc_next) { + char lbuf[ZFS_MAXPROPLEN], rbuf[ZFS_MAXPROPLEN]; + char *lstr, *rstr; + uint64_t lnum, rnum; + boolean_t lvalid, rvalid; + int ret = 0; + + /* + * We group the checks below the generic code. If 'lstr' and + * 'rstr' are non-NULL, then we do a string based comparison. + * Otherwise, we compare 'lnum' and 'rnum'. + */ + lstr = rstr = NULL; + if (psc->sc_prop == ZPROP_INVAL) { + nvlist_t *luser, *ruser; + nvlist_t *lval, *rval; + + luser = zfs_get_user_props(l); + ruser = zfs_get_user_props(r); + + lvalid = (nvlist_lookup_nvlist(luser, + psc->sc_user_prop, &lval) == 0); + rvalid = (nvlist_lookup_nvlist(ruser, + psc->sc_user_prop, &rval) == 0); + + if (lvalid) + verify(nvlist_lookup_string(lval, + ZPROP_VALUE, &lstr) == 0); + if (rvalid) + verify(nvlist_lookup_string(rval, + ZPROP_VALUE, &rstr) == 0); + } else if (psc->sc_prop == ZFS_PROP_NAME) { + lvalid = rvalid = B_TRUE; + + (void) strlcpy(lbuf, zfs_get_name(l), sizeof (lbuf)); + (void) strlcpy(rbuf, zfs_get_name(r), sizeof (rbuf)); + + lstr = lbuf; + rstr = rbuf; + } else if (zfs_prop_is_string(psc->sc_prop)) { + lvalid = (zfs_prop_get(l, psc->sc_prop, lbuf, + sizeof (lbuf), NULL, NULL, 0, B_TRUE) == 0); + rvalid = (zfs_prop_get(r, psc->sc_prop, rbuf, + sizeof (rbuf), NULL, NULL, 0, B_TRUE) == 0); + + lstr = lbuf; + rstr = rbuf; + } else { + lvalid = zfs_prop_valid_for_type(psc->sc_prop, + zfs_get_type(l), B_FALSE); + rvalid = zfs_prop_valid_for_type(psc->sc_prop, + zfs_get_type(r), B_FALSE); + + if (lvalid) + (void) zfs_prop_get_numeric(l, psc->sc_prop, + &lnum, NULL, NULL, 0); + if (rvalid) + (void) zfs_prop_get_numeric(r, psc->sc_prop, + &rnum, NULL, NULL, 0); + } + + if (!lvalid && !rvalid) + continue; + else if (!lvalid) + return (1); + else if (!rvalid) + return (-1); + + if (lstr) + ret = strcmp(lstr, rstr); + else if (lnum < rnum) + ret = -1; + else if (lnum > rnum) + ret = 1; + + if (ret != 0) { + if (psc->sc_reverse == B_TRUE) + ret = (ret < 0) ? 1 : -1; + return (ret); + } + } + + return (zfs_compare(larg, rarg, NULL)); +} + +int +zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, + zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, + zfs_iter_f callback, void *data) +{ + callback_data_t cb = {0}; + int ret = 0; + zfs_node_t *node; + uu_avl_walk_t *walk; + + avl_pool = uu_avl_pool_create("zfs_pool", sizeof (zfs_node_t), + offsetof(zfs_node_t, zn_avlnode), zfs_sort, UU_DEFAULT); + + if (avl_pool == NULL) + nomem(); + + cb.cb_sortcol = sortcol; + cb.cb_flags = flags; + cb.cb_proplist = proplist; + cb.cb_types = types; + cb.cb_depth_limit = limit; + /* + * If cb_proplist is provided then in the zfs_handles created we + * retain only those properties listed in cb_proplist and sortcol. + * The rest are pruned. So, the caller should make sure that no other + * properties other than those listed in cb_proplist/sortcol are + * accessed. 
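+	 *
+	 * For example, a hypothetical "zfs list -o name,used -s creation"
+	 * would leave only ZFS_PROP_NAME, ZFS_PROP_USED, ZFS_PROP_CREATION
+	 * and the always-retained entries noted below set in
+	 * cb_props_table; zfs_prune_proplist() then drops the rest from
+	 * each handle.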
+ * + * If cb_proplist is NULL then we retain all the properties. We + * always retain the zoned property, which some other properties + * need (userquota & friends), and the createtxg property, which + * we need to sort snapshots. + */ + if (cb.cb_proplist && *cb.cb_proplist) { + zprop_list_t *p = *cb.cb_proplist; + + while (p) { + if (p->pl_prop >= ZFS_PROP_TYPE && + p->pl_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[p->pl_prop] = B_TRUE; + } + p = p->pl_next; + } + + while (sortcol) { + if (sortcol->sc_prop >= ZFS_PROP_TYPE && + sortcol->sc_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[sortcol->sc_prop] = B_TRUE; + } + sortcol = sortcol->sc_next; + } + + cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; + cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; + } else { + (void) memset(cb.cb_props_table, B_TRUE, + sizeof (cb.cb_props_table)); + } + + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + if (argc == 0) { + /* + * If given no arguments, iterate over all datasets. + */ + cb.cb_flags |= ZFS_ITER_RECURSE; + ret = zfs_iter_root(g_zfs, zfs_callback, &cb); + } else { + int i; + zfs_handle_t *zhp; + zfs_type_t argtype; + + /* + * If we're recursive, then we always allow filesystems as + * arguments. If we also are interested in snapshots or + * bookmarks, then we can take volumes as well. + */ + argtype = types; + if (flags & ZFS_ITER_RECURSE) { + argtype |= ZFS_TYPE_FILESYSTEM; + if (types & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) + argtype |= ZFS_TYPE_VOLUME; + } + + for (i = 0; i < argc; i++) { + if (flags & ZFS_ITER_ARGS_CAN_BE_PATHS) { + zhp = zfs_path_to_zhandle(g_zfs, argv[i], + argtype); + } else { + zhp = zfs_open(g_zfs, argv[i], argtype); + } + if (zhp != NULL) + ret |= zfs_callback(zhp, &cb); + else + ret = 1; + } + } + + /* + * At this point we've got our AVL tree full of zfs handles, so iterate + * over each one and execute the real user callback. + */ + for (node = uu_avl_first(cb.cb_avl); node != NULL; + node = uu_avl_next(cb.cb_avl, node)) + ret |= callback(node->zn_handle, data); + + /* + * Finally, clean up the AVL tree. + */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + zfs_close(node->zn_handle); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(cb.cb_avl); + uu_avl_pool_destroy(avl_pool); + + return (ret); +} diff --git a/cmd/zfs/zfs_iter.h b/cmd/zfs/zfs_iter.h new file mode 100644 index 000000000000..2697fbdca1df --- /dev/null +++ b/cmd/zfs/zfs_iter.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ * Copyright 2013 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef ZFS_ITER_H +#define ZFS_ITER_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfs_sort_column { + struct zfs_sort_column *sc_next; + struct zfs_sort_column *sc_last; + zfs_prop_t sc_prop; + char *sc_user_prop; + boolean_t sc_reverse; +} zfs_sort_column_t; + +#define ZFS_ITER_RECURSE (1 << 0) +#define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) +#define ZFS_ITER_PROP_LISTSNAPS (1 << 2) +#define ZFS_ITER_DEPTH_LIMIT (1 << 3) +#define ZFS_ITER_RECVD_PROPS (1 << 4) +#define ZFS_ITER_LITERAL_PROPS (1 << 5) +#define ZFS_ITER_SIMPLE (1 << 6) + +int zfs_for_each(int, char **, int options, zfs_type_t, + zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); +int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); +void zfs_free_sort_columns(zfs_sort_column_t *); +int zfs_sort_only_by_name(const zfs_sort_column_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* ZFS_ITER_H */ diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c new file mode 100644 index 000000000000..650b4fc9b74f --- /dev/null +++ b/cmd/zfs/zfs_main.c @@ -0,0 +1,8620 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright 2012 Milan Jurik. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright 2016 Igor Kozhukhov . + * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, loli10K + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#ifdef HAVE_IDMAP +#include +#include +#endif /* HAVE_IDMAP */ + +#include "zfs_iter.h" +#include "zfs_util.h" +#include "zfs_comutil.h" +#include "libzfs_impl.h" +#include "zfs_projectutil.h" + +libzfs_handle_t *g_zfs; + +static FILE *mnttab_file; +static char history_str[HIS_MAX_RECORD_LEN]; +static boolean_t log_history = B_TRUE; + +static int zfs_do_clone(int argc, char **argv); +static int zfs_do_create(int argc, char **argv); +static int zfs_do_destroy(int argc, char **argv); +static int zfs_do_get(int argc, char **argv); +static int zfs_do_inherit(int argc, char **argv); +static int zfs_do_list(int argc, char **argv); +static int zfs_do_mount(int argc, char **argv); +static int zfs_do_rename(int argc, char **argv); +static int zfs_do_rollback(int argc, char **argv); +static int zfs_do_set(int argc, char **argv); +static int zfs_do_upgrade(int argc, char **argv); +static int zfs_do_snapshot(int argc, char **argv); +static int zfs_do_unmount(int argc, char **argv); +static int zfs_do_share(int argc, char **argv); +static int zfs_do_unshare(int argc, char **argv); +static int zfs_do_send(int argc, char **argv); +static int zfs_do_receive(int argc, char **argv); +static int zfs_do_promote(int argc, char **argv); +static int zfs_do_userspace(int argc, char **argv); +static int zfs_do_allow(int argc, char **argv); +static int zfs_do_unallow(int argc, char **argv); +static int zfs_do_hold(int argc, char **argv); +static int zfs_do_holds(int argc, char **argv); +static int zfs_do_release(int argc, char **argv); +static int zfs_do_diff(int argc, char **argv); +static int zfs_do_bookmark(int argc, char **argv); +static int zfs_do_channel_program(int argc, char **argv); +static int zfs_do_load_key(int argc, char **argv); +static int zfs_do_unload_key(int argc, char **argv); +static int zfs_do_change_key(int argc, char **argv); +static int zfs_do_project(int argc, char **argv); +static int zfs_do_version(int argc, char **argv); +static int zfs_do_redact(int argc, char **argv); +static int zfs_do_wait(int argc, char **argv); + +#ifdef __FreeBSD__ +static int zfs_do_jail(int argc, char **argv); +static int zfs_do_unjail(int argc, char **argv); +#endif + +/* + * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. 
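+ *
+ * These hooks are consulted by libumem at startup when the binary is
+ * linked against it; the strings returned below stand in for what one
+ * would otherwise export in the environment, e.g. (illustrative):
+ *
+ *	UMEM_DEBUG=default,verbose UMEM_LOGGING=fail,contents zfs list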
+ */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +typedef enum { + HELP_CLONE, + HELP_CREATE, + HELP_DESTROY, + HELP_GET, + HELP_INHERIT, + HELP_UPGRADE, + HELP_LIST, + HELP_MOUNT, + HELP_PROMOTE, + HELP_RECEIVE, + HELP_RENAME, + HELP_ROLLBACK, + HELP_SEND, + HELP_SET, + HELP_SHARE, + HELP_SNAPSHOT, + HELP_UNMOUNT, + HELP_UNSHARE, + HELP_ALLOW, + HELP_UNALLOW, + HELP_USERSPACE, + HELP_GROUPSPACE, + HELP_PROJECTSPACE, + HELP_PROJECT, + HELP_HOLD, + HELP_HOLDS, + HELP_RELEASE, + HELP_DIFF, + HELP_BOOKMARK, + HELP_CHANNEL_PROGRAM, + HELP_LOAD_KEY, + HELP_UNLOAD_KEY, + HELP_CHANGE_KEY, + HELP_VERSION, + HELP_REDACT, + HELP_JAIL, + HELP_UNJAIL, + HELP_WAIT, +} zfs_help_t; + +typedef struct zfs_command { + const char *name; + int (*func)(int argc, char **argv); + zfs_help_t usage; +} zfs_command_t; + +/* + * Master command table. Each ZFS command has a name, associated function, and + * usage message. The usage messages need to be internationalized, so we have + * to have a function to return the usage message based on a command index. + * + * These commands are organized according to how they are displayed in the usage + * message. An empty command (one with a NULL name) indicates an empty line in + * the generic usage message. + */ +static zfs_command_t command_table[] = { + { "version", zfs_do_version, HELP_VERSION }, + { NULL }, + { "create", zfs_do_create, HELP_CREATE }, + { "destroy", zfs_do_destroy, HELP_DESTROY }, + { NULL }, + { "snapshot", zfs_do_snapshot, HELP_SNAPSHOT }, + { "rollback", zfs_do_rollback, HELP_ROLLBACK }, + { "clone", zfs_do_clone, HELP_CLONE }, + { "promote", zfs_do_promote, HELP_PROMOTE }, + { "rename", zfs_do_rename, HELP_RENAME }, + { "bookmark", zfs_do_bookmark, HELP_BOOKMARK }, + { "program", zfs_do_channel_program, HELP_CHANNEL_PROGRAM }, + { NULL }, + { "list", zfs_do_list, HELP_LIST }, + { NULL }, + { "set", zfs_do_set, HELP_SET }, + { "get", zfs_do_get, HELP_GET }, + { "inherit", zfs_do_inherit, HELP_INHERIT }, + { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, + { NULL }, + { "userspace", zfs_do_userspace, HELP_USERSPACE }, + { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, + { "projectspace", zfs_do_userspace, HELP_PROJECTSPACE }, + { NULL }, + { "project", zfs_do_project, HELP_PROJECT }, + { NULL }, + { "mount", zfs_do_mount, HELP_MOUNT }, + { "unmount", zfs_do_unmount, HELP_UNMOUNT }, + { "share", zfs_do_share, HELP_SHARE }, + { "unshare", zfs_do_unshare, HELP_UNSHARE }, + { NULL }, + { "send", zfs_do_send, HELP_SEND }, + { "receive", zfs_do_receive, HELP_RECEIVE }, + { NULL }, + { "allow", zfs_do_allow, HELP_ALLOW }, + { NULL }, + { "unallow", zfs_do_unallow, HELP_UNALLOW }, + { NULL }, + { "hold", zfs_do_hold, HELP_HOLD }, + { "holds", zfs_do_holds, HELP_HOLDS }, + { "release", zfs_do_release, HELP_RELEASE }, + { "diff", zfs_do_diff, HELP_DIFF }, + { "load-key", zfs_do_load_key, HELP_LOAD_KEY }, + { "unload-key", zfs_do_unload_key, HELP_UNLOAD_KEY }, + { "change-key", zfs_do_change_key, HELP_CHANGE_KEY }, + { "redact", zfs_do_redact, HELP_REDACT }, + { "wait", zfs_do_wait, HELP_WAIT }, + +#ifdef __FreeBSD__ + { "jail", zfs_do_jail, HELP_JAIL }, + { "unjail", zfs_do_unjail, HELP_UNJAIL }, +#endif +}; + +#define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) + +zfs_command_t *current_command; + +static const char * +get_usage(zfs_help_t idx) +{ + switch 
(idx) { + case HELP_CLONE: + return (gettext("\tclone [-p] [-o property=value] ... " + " \n")); + case HELP_CREATE: + return (gettext("\tcreate [-Pnpv] [-o property=value] ... " + "\n" + "\tcreate [-Pnpsv] [-b blocksize] [-o property=value] ... " + "-V \n")); + case HELP_DESTROY: + return (gettext("\tdestroy [-fnpRrv] \n" + "\tdestroy [-dnpRrv] " + "@[%][,...]\n" + "\tdestroy #\n")); + case HELP_GET: + return (gettext("\tget [-rHp] [-d max] " + "[-o \"all\" | field[,...]]\n" + "\t [-t type[,...]] [-s source[,...]]\n" + "\t <\"all\" | property[,...]> " + "[filesystem|volume|snapshot|bookmark] ...\n")); + case HELP_INHERIT: + return (gettext("\tinherit [-rS] " + " ...\n")); + case HELP_UPGRADE: + return (gettext("\tupgrade [-v]\n" + "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); + case HELP_LIST: + return (gettext("\tlist [-Hp] [-r|-d max] [-o property[,...]] " + "[-s property]...\n\t [-S property]... [-t type[,...]] " + "[filesystem|volume|snapshot] ...\n")); + case HELP_MOUNT: + return (gettext("\tmount\n" + "\tmount [-flvO] [-o opts] <-a | filesystem>\n")); + case HELP_PROMOTE: + return (gettext("\tpromote \n")); + case HELP_RECEIVE: + return (gettext("\treceive [-vMnsFhu] " + "[-o =] ... [-x ] ...\n" + "\t \n" + "\treceive [-vMnsFhu] [-o =] ... " + "[-x ] ... \n" + "\t [-d | -e] \n" + "\treceive -A \n")); + case HELP_RENAME: + return (gettext("\trename [-f] " + "\n" + "\trename [-f] -p \n" + "\trename -r \n")); + case HELP_ROLLBACK: + return (gettext("\trollback [-rRf] \n")); + case HELP_SEND: + return (gettext("\tsend [-DnPpRvLecwhb] [-[i|I] snapshot] " + "\n" + "\tsend [-nvPLecw] [-i snapshot|bookmark] " + "\n" + "\tsend [-DnPpvLec] [-i bookmark|snapshot] " + "--redact \n" + "\tsend [-nvPe] -t \n" + "\tsend [-Pnv] --saved filesystem\n")); + case HELP_SET: + return (gettext("\tset ... " + " ...\n")); + case HELP_SHARE: + return (gettext("\tshare [-l] <-a [nfs|smb] | filesystem>\n")); + case HELP_SNAPSHOT: + return (gettext("\tsnapshot [-r] [-o property=value] ... " + "@ ...\n")); + case HELP_UNMOUNT: + return (gettext("\tunmount [-fu] " + "<-a | filesystem|mountpoint>\n")); + case HELP_UNSHARE: + return (gettext("\tunshare " + "<-a [nfs|smb] | filesystem|mountpoint>\n")); + case HELP_ALLOW: + return (gettext("\tallow \n" + "\tallow [-ldug] " + "<\"everyone\"|user|group>[,...] [,...]\n" + "\t \n" + "\tallow [-ld] -e [,...] " + "\n" + "\tallow -c [,...] \n" + "\tallow -s @setname [,...] " + "\n")); + case HELP_UNALLOW: + return (gettext("\tunallow [-rldug] " + "<\"everyone\"|user|group>[,...]\n" + "\t [[,...]] \n" + "\tunallow [-rld] -e [[,...]] " + "\n" + "\tunallow [-r] -c [[,...]] " + "\n" + "\tunallow [-r] -s @setname [[,...]] " + "\n")); + case HELP_USERSPACE: + return (gettext("\tuserspace [-Hinp] [-o field[,...]] " + "[-s field] ...\n" + "\t [-S field] ... [-t type[,...]] " + "\n")); + case HELP_GROUPSPACE: + return (gettext("\tgroupspace [-Hinp] [-o field[,...]] " + "[-s field] ...\n" + "\t [-S field] ... [-t type[,...]] " + "\n")); + case HELP_PROJECTSPACE: + return (gettext("\tprojectspace [-Hp] [-o field[,...]] " + "[-s field] ... \n" + "\t [-S field] ... 
\n")); + case HELP_PROJECT: + return (gettext("\tproject [-d|-r] \n" + "\tproject -c [-0] [-d|-r] [-p id] \n" + "\tproject -C [-k] [-r] \n" + "\tproject [-p id] [-r] [-s] \n")); + case HELP_HOLD: + return (gettext("\thold [-r] ...\n")); + case HELP_HOLDS: + return (gettext("\tholds [-rH] ...\n")); + case HELP_RELEASE: + return (gettext("\trelease [-r] ...\n")); + case HELP_DIFF: + return (gettext("\tdiff [-FHt] " + "[snapshot|filesystem]\n")); + case HELP_BOOKMARK: + return (gettext("\tbookmark " + "\n")); + case HELP_CHANNEL_PROGRAM: + return (gettext("\tprogram [-jn] [-t ] " + "[-m ]\n" + "\t [lua args...]\n")); + case HELP_LOAD_KEY: + return (gettext("\tload-key [-rn] [-L ] " + "<-a | filesystem|volume>\n")); + case HELP_UNLOAD_KEY: + return (gettext("\tunload-key [-r] " + "<-a | filesystem|volume>\n")); + case HELP_CHANGE_KEY: + return (gettext("\tchange-key [-l] [-o keyformat=]\n" + "\t [-o keylocation=] [-o pbkfd2iters=]\n" + "\t \n" + "\tchange-key -i [-l] \n")); + case HELP_VERSION: + return (gettext("\tversion\n")); + case HELP_REDACT: + return (gettext("\tredact " + " ...\n")); + case HELP_JAIL: + return (gettext("\tjail \n")); + case HELP_UNJAIL: + return (gettext("\tunjail \n")); + case HELP_WAIT: + return (gettext("\twait [-t ] \n")); + } + + abort(); + /* NOTREACHED */ +} + +void +nomem(void) +{ + (void) fprintf(stderr, gettext("internal error: out of memory\n")); + exit(1); +} + +/* + * Utility function to guarantee malloc() success. + */ + +void * +safe_malloc(size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + nomem(); + + return (data); +} + +static void * +safe_realloc(void *data, size_t size) +{ + void *newp; + if ((newp = realloc(data, size)) == NULL) { + free(data); + nomem(); + } + + return (newp); +} + +static char * +safe_strdup(char *str) +{ + char *dupstr = strdup(str); + + if (dupstr == NULL) + nomem(); + + return (dupstr); +} + +/* + * Callback routine that will print out information for each of + * the properties. + */ +static int +usage_prop_cb(int prop, void *cb) +{ + FILE *fp = cb; + + (void) fprintf(fp, "\t%-15s ", zfs_prop_to_name(prop)); + + if (zfs_prop_readonly(prop)) + (void) fprintf(fp, " NO "); + else + (void) fprintf(fp, "YES "); + + if (zfs_prop_inheritable(prop)) + (void) fprintf(fp, " YES "); + else + (void) fprintf(fp, " NO "); + + if (zfs_prop_values(prop) == NULL) + (void) fprintf(fp, "-\n"); + else + (void) fprintf(fp, "%s\n", zfs_prop_values(prop)); + + return (ZPROP_CONT); +} + +/* + * Display usage message. If we're inside a command, display only the usage for + * that command. Otherwise, iterate over the entire command table and display + * a complete usage message. + */ +static void +usage(boolean_t requested) +{ + int i; + boolean_t show_properties = B_FALSE; + FILE *fp = requested ? 
stdout : stderr; + + if (current_command == NULL) { + + (void) fprintf(fp, gettext("usage: zfs command args ...\n")); + (void) fprintf(fp, + gettext("where 'command' is one of the following:\n\n")); + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + (void) fprintf(fp, "\n"); + else + (void) fprintf(fp, "%s", + get_usage(command_table[i].usage)); + } + + (void) fprintf(fp, gettext("\nEach dataset is of the form: " + "pool/[dataset/]*dataset[@name]\n")); + } else { + (void) fprintf(fp, gettext("usage:\n")); + (void) fprintf(fp, "%s", get_usage(current_command->usage)); + } + + if (current_command != NULL && + (strcmp(current_command->name, "set") == 0 || + strcmp(current_command->name, "get") == 0 || + strcmp(current_command->name, "inherit") == 0 || + strcmp(current_command->name, "list") == 0)) + show_properties = B_TRUE; + + if (show_properties) { + (void) fprintf(fp, + gettext("\nThe following properties are supported:\n")); + + (void) fprintf(fp, "\n\t%-14s %s %s %s\n\n", + "PROPERTY", "EDIT", "INHERIT", "VALUES"); + + /* Iterate over all properties */ + (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, + ZFS_TYPE_DATASET); + + (void) fprintf(fp, "\t%-15s ", "userused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "projectused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "userobjused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "groupobjused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "projectobjused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "projectquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "userobjquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupobjquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "projectobjquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "written@"); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "written#"); + (void) fprintf(fp, " NO NO \n"); + + (void) fprintf(fp, gettext("\nSizes are specified in bytes " + "with standard units such as K, M, G, etc.\n")); + (void) fprintf(fp, gettext("\nUser-defined properties can " + "be specified by using a name containing a colon (:).\n")); + (void) fprintf(fp, gettext("\nThe {user|group|project}" + "[obj]{used|quota}@ properties must be appended with\n" + "a user|group|project specifier of one of these forms:\n" + " POSIX name (eg: \"matt\")\n" + " POSIX id (eg: \"126829\")\n" + " SMB name@domain (eg: \"matt@sun\")\n" + " SMB SID (eg: \"S-1-234-567-89\")\n")); + } else { + (void) fprintf(fp, + gettext("\nFor the property list, run: %s\n"), + "zfs set|get"); + (void) fprintf(fp, + gettext("\nFor the delegated permission list, run: %s\n"), + "zfs allow|unallow"); + } + + /* + * See comments at end of main(). + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + exit(requested ? 0 : 2); +} + +/* + * Take a property=value argument string and add it to the given nvlist. + * Modifies the argument inplace. 
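+ *
+ * For example, a writable "compression=lz4" has its '=' replaced by
+ * '\0' and the pair ("compression", "lz4") is added to [props]; a
+ * sketch of the caller side:
+ *
+ *	char arg[] = "compression=lz4";
+ *
+ *	if (!parseprop(props, arg))
+ *		return (1);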
+ */ +static boolean_t +parseprop(nvlist_t *props, char *propname) +{ + char *propval; + + if ((propval = strchr(propname, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for property=value argument\n")); + return (B_FALSE); + } + *propval = '\0'; + propval++; + if (nvlist_exists(props, propname)) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (B_FALSE); + } + if (nvlist_add_string(props, propname, propval) != 0) + nomem(); + return (B_TRUE); +} + +/* + * Take a property name argument and add it to the given nvlist. + * Modifies the argument inplace. + */ +static boolean_t +parsepropname(nvlist_t *props, char *propname) +{ + if (strchr(propname, '=') != NULL) { + (void) fprintf(stderr, gettext("invalid character " + "'=' in property argument\n")); + return (B_FALSE); + } + if (nvlist_exists(props, propname)) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (B_FALSE); + } + if (nvlist_add_boolean(props, propname) != 0) + nomem(); + return (B_TRUE); +} + +static int +parse_depth(char *opt, int *flags) +{ + char *tmp; + int depth; + + depth = (int)strtol(opt, &tmp, 0); + if (*tmp) { + (void) fprintf(stderr, + gettext("%s is not an integer\n"), optarg); + usage(B_FALSE); + } + if (depth < 0) { + (void) fprintf(stderr, + gettext("Depth can not be negative.\n")); + usage(B_FALSE); + } + *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); + return (depth); +} + +#define PROGRESS_DELAY 2 /* seconds */ + +static char *pt_reverse = "\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b\b"; +static time_t pt_begin; +static char *pt_header = NULL; +static boolean_t pt_shown; + +static void +start_progress_timer(void) +{ + pt_begin = time(NULL) + PROGRESS_DELAY; + pt_shown = B_FALSE; +} + +static void +set_progress_header(char *header) +{ + assert(pt_header == NULL); + pt_header = safe_strdup(header); + if (pt_shown) { + (void) printf("%s: ", header); + (void) fflush(stdout); + } +} + +static void +update_progress(char *update) +{ + if (!pt_shown && time(NULL) > pt_begin) { + int len = strlen(update); + + (void) printf("%s: %s%*.*s", pt_header, update, len, len, + pt_reverse); + (void) fflush(stdout); + pt_shown = B_TRUE; + } else if (pt_shown) { + int len = strlen(update); + + (void) printf("%s%*.*s", update, len, len, pt_reverse); + (void) fflush(stdout); + } +} + +static void +finish_progress(char *done) +{ + if (pt_shown) { + (void) printf("%s\n", done); + (void) fflush(stdout); + } + free(pt_header); + pt_header = NULL; +} + +static int +zfs_mount_and_share(libzfs_handle_t *hdl, const char *dataset, zfs_type_t type) +{ + zfs_handle_t *zhp = NULL; + int ret = 0; + + zhp = zfs_open(hdl, dataset, type); + if (zhp == NULL) + return (1); + + /* + * Volumes may neither be mounted or shared. Potentially in the + * future filesystems detected on these volumes could be mounted. + */ + if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) { + zfs_close(zhp); + return (0); + } + + /* + * Mount and/or share the new filesystem as appropriate. We provide a + * verbose error message to let the user know that their filesystem was + * in fact created, even if we failed to mount or share it. 
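+	 * Each failure path below still returns nonzero, but only after
+	 * printing a message that makes clear the dataset itself was
+	 * created successfully.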
+ * + * If the user doesn't want the dataset automatically mounted, then + * skip the mount/share step + */ + if (zfs_prop_valid_for_type(ZFS_PROP_CANMOUNT, type, B_FALSE) && + zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON) { + if (zfs_mount_delegation_check()) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but it may only be " + "mounted by root\n")); + ret = 1; + } else if (zfs_mount(zhp, NULL, 0) != 0) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but not mounted\n")); + ret = 1; + } else if (zfs_share(zhp) != 0) { + (void) fprintf(stderr, gettext("filesystem " + "successfully created, but not shared\n")); + ret = 1; + } + zfs_commit_all_shares(); + } + + zfs_close(zhp); + + return (ret); +} + +/* + * zfs clone [-p] [-o prop=value] ... + * + * Given an existing dataset, create a writable copy whose initial contents + * are the same as the source. The newly created dataset maintains a + * dependency on the original; the original cannot be destroyed so long as + * the clone exists. + * + * The '-p' flag creates all the non-existing ancestors of the target first. + */ +static int +zfs_do_clone(int argc, char **argv) +{ + zfs_handle_t *zhp = NULL; + boolean_t parents = B_FALSE; + nvlist_t *props; + int ret = 0; + int c; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, "o:p")) != -1) { + switch (c) { + case 'o': + if (!parseprop(props, optarg)) { + nvlist_free(props); + return (1); + } + break; + case 'p': + parents = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing source dataset " + "argument\n")); + goto usage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing target dataset " + "argument\n")); + goto usage; + } + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto usage; + } + + /* open the source dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) { + nvlist_free(props); + return (1); + } + + if (parents && zfs_name_valid(argv[1], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) { + /* + * Now create the ancestors of the target dataset. If the + * target already exists and '-p' option was used we should not + * complain. + */ + if (zfs_dataset_exists(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) { + zfs_close(zhp); + nvlist_free(props); + return (0); + } + if (zfs_create_ancestors(g_zfs, argv[1]) != 0) { + zfs_close(zhp); + nvlist_free(props); + return (1); + } + } + + /* pass to libzfs */ + ret = zfs_clone(zhp, argv[1], props); + + /* create the mountpoint if necessary */ + if (ret == 0) { + if (log_history) { + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + + ret = zfs_mount_and_share(g_zfs, argv[1], ZFS_TYPE_DATASET); + } + + zfs_close(zhp); + nvlist_free(props); + + return (!!ret); + +usage: + ASSERT3P(zhp, ==, NULL); + nvlist_free(props); + usage(B_FALSE); + return (-1); +} + +/* + * zfs create [-Pnpv] [-o prop=value] ... fs + * zfs create [-Pnpsv] [-b blocksize] [-o prop=value] ... -V vol size + * + * Create a new dataset. This command can be used to create filesystems + * and volumes. Snapshot creation is handled by 'zfs snapshot'. + * For volumes, the user must specify a size to be used. 
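+ *
+ * For example (illustrative invocations):
+ *
+ *	zfs create -o compression=lz4 pool/fs
+ *	zfs create -s -V 10G pool/vol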
+ * + * The '-s' flag applies only to volumes, and indicates that we should not try + * to set the reservation for this volume. By default we set a reservation + * equal to the size for any volume. For pools with SPA_VERSION >= + * SPA_VERSION_REFRESERVATION, we set a refreservation instead. + * + * The '-p' flag creates all the non-existing ancestors of the target first. + * + * The '-n' flag is no-op (dry run) mode. This will perform a user-space sanity + * check of arguments and properties, but does not check for permissions, + * available space, etc. + * + * The '-v' flag is for verbose output. + * + * The '-P' flag is used for parseable output. It implies '-v'. + */ +static int +zfs_do_create(int argc, char **argv) +{ + zfs_type_t type = ZFS_TYPE_FILESYSTEM; + zpool_handle_t *zpool_handle = NULL; + nvlist_t *real_props = NULL; + uint64_t volsize = 0; + int c; + boolean_t noreserve = B_FALSE; + boolean_t bflag = B_FALSE; + boolean_t parents = B_FALSE; + boolean_t dryrun = B_FALSE; + boolean_t verbose = B_FALSE; + boolean_t parseable = B_FALSE; + int ret = 1; + nvlist_t *props; + uint64_t intval; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, ":PV:b:nso:pv")) != -1) { + switch (c) { + case 'V': + type = ZFS_TYPE_VOLUME; + if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { + (void) fprintf(stderr, gettext("bad volume " + "size '%s': %s\n"), optarg, + libzfs_error_description(g_zfs)); + goto error; + } + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), intval) != 0) + nomem(); + volsize = intval; + break; + case 'P': + verbose = B_TRUE; + parseable = B_TRUE; + break; + case 'p': + parents = B_TRUE; + break; + case 'b': + bflag = B_TRUE; + if (zfs_nicestrtonum(g_zfs, optarg, &intval) != 0) { + (void) fprintf(stderr, gettext("bad volume " + "block size '%s': %s\n"), optarg, + libzfs_error_description(g_zfs)); + goto error; + } + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + intval) != 0) + nomem(); + break; + case 'n': + dryrun = B_TRUE; + break; + case 'o': + if (!parseprop(props, optarg)) + goto error; + break; + case 's': + noreserve = B_TRUE; + break; + case 'v': + verbose = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing size " + "argument\n")); + goto badusage; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto badusage; + } + } + + if ((bflag || noreserve) && type != ZFS_TYPE_VOLUME) { + (void) fprintf(stderr, gettext("'-s' and '-b' can only be " + "used when creating a volume\n")); + goto badusage; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc == 0) { + (void) fprintf(stderr, gettext("missing %s argument\n"), + zfs_type_to_name(type)); + goto badusage; + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + goto badusage; + } + + if (dryrun || (type == ZFS_TYPE_VOLUME && !noreserve)) { + char msg[ZFS_MAX_DATASET_NAME_LEN * 2]; + char *p; + + if ((p = strchr(argv[0], '/')) != NULL) + *p = '\0'; + zpool_handle = zpool_open(g_zfs, argv[0]); + if (p != NULL) + *p = '/'; + if (zpool_handle == NULL) + goto error; + + (void) snprintf(msg, sizeof (msg), + dryrun ? 
gettext("cannot verify '%s'") : + gettext("cannot create '%s'"), argv[0]); + if (props && (real_props = zfs_valid_proplist(g_zfs, type, + props, 0, NULL, zpool_handle, B_TRUE, msg)) == NULL) { + zpool_close(zpool_handle); + goto error; + } + } + + /* + * if volsize is not a multiple of volblocksize, round it up to the + * nearest multiple of the volblocksize + */ + if (type == ZFS_TYPE_VOLUME) { + uint64_t volblocksize; + + if (nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize) != 0) + volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + + if (volsize % volblocksize) { + volsize = P2ROUNDUP_TYPED(volsize, volblocksize, + uint64_t); + + if (nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), volsize) != 0) { + nvlist_free(props); + nomem(); + } + } + } + + + if (type == ZFS_TYPE_VOLUME && !noreserve) { + uint64_t spa_version; + zfs_prop_t resv_prop; + char *strval; + + spa_version = zpool_get_prop_int(zpool_handle, + ZPOOL_PROP_VERSION, NULL); + if (spa_version >= SPA_VERSION_REFRESERVATION) + resv_prop = ZFS_PROP_REFRESERVATION; + else + resv_prop = ZFS_PROP_RESERVATION; + + volsize = zvol_volsize_to_reservation(zpool_handle, volsize, + real_props); + + if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), + &strval) != 0) { + if (nvlist_add_uint64(props, + zfs_prop_to_name(resv_prop), volsize) != 0) { + nvlist_free(props); + nomem(); + } + } + } + if (zpool_handle != NULL) { + zpool_close(zpool_handle); + nvlist_free(real_props); + } + + if (parents && zfs_name_valid(argv[0], type)) { + /* + * Now create the ancestors of target dataset. If the target + * already exists and '-p' option was used we should not + * complain. + */ + if (zfs_dataset_exists(g_zfs, argv[0], type)) { + ret = 0; + goto error; + } + if (verbose) { + (void) printf(parseable ? "create_ancestors\t%s\n" : + dryrun ? "would create ancestors of %s\n" : + "create ancestors of %s\n", argv[0]); + } + if (!dryrun) { + if (zfs_create_ancestors(g_zfs, argv[0]) != 0) { + goto error; + } + } + } + + if (verbose) { + nvpair_t *nvp = NULL; + (void) printf(parseable ? "create\t%s\n" : + dryrun ? "would create %s\n" : "create %s\n", argv[0]); + while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { + uint64_t uval; + char *sval; + + switch (nvpair_type(nvp)) { + case DATA_TYPE_UINT64: + VERIFY0(nvpair_value_uint64(nvp, &uval)); + (void) printf(parseable ? + "property\t%s\t%llu\n" : "\t%s=%llu\n", + nvpair_name(nvp), (u_longlong_t)uval); + break; + case DATA_TYPE_STRING: + VERIFY0(nvpair_value_string(nvp, &sval)); + (void) printf(parseable ? + "property\t%s\t%s\n" : "\t%s=%s\n", + nvpair_name(nvp), sval); + break; + default: + (void) fprintf(stderr, "property '%s' " + "has illegal type %d\n", + nvpair_name(nvp), nvpair_type(nvp)); + abort(); + } + } + } + if (dryrun) { + ret = 0; + goto error; + } + + /* pass to libzfs */ + if (zfs_create(g_zfs, argv[0], type, props) != 0) + goto error; + + if (log_history) { + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + + ret = zfs_mount_and_share(g_zfs, argv[0], ZFS_TYPE_DATASET); +error: + nvlist_free(props); + return (ret); +badusage: + nvlist_free(props); + usage(B_FALSE); + return (2); +} + +/* + * zfs destroy [-rRf] + * zfs destroy [-rRd] + * + * -r Recursively destroy all children + * -R Recursively destroy all dependents, including clones + * -f Force unmounting of any dependents + * -d If we can't destroy now, mark for deferred destruction + * + * Destroys the given dataset. 
By default, it will unmount any filesystems, + * and refuse to destroy a dataset that has any dependents. A dependent can + * either be a child, or a clone of a child. + */ +typedef struct destroy_cbdata { + boolean_t cb_first; + boolean_t cb_force; + boolean_t cb_recurse; + boolean_t cb_error; + boolean_t cb_doclones; + zfs_handle_t *cb_target; + boolean_t cb_defer_destroy; + boolean_t cb_verbose; + boolean_t cb_parsable; + boolean_t cb_dryrun; + nvlist_t *cb_nvl; + nvlist_t *cb_batchedsnaps; + + /* first snap in contiguous run */ + char *cb_firstsnap; + /* previous snap in contiguous run */ + char *cb_prevsnap; + int64_t cb_snapused; + char *cb_snapspec; + char *cb_bookmark; + uint64_t cb_snap_count; +} destroy_cbdata_t; + +/* + * Check for any dependents based on the '-r' or '-R' flags. + */ +static int +destroy_check_dependent(zfs_handle_t *zhp, void *data) +{ + destroy_cbdata_t *cbp = data; + const char *tname = zfs_get_name(cbp->cb_target); + const char *name = zfs_get_name(zhp); + + if (strncmp(tname, name, strlen(tname)) == 0 && + (name[strlen(tname)] == '/' || name[strlen(tname)] == '@')) { + /* + * This is a direct descendant, not a clone somewhere else in + * the hierarchy. + */ + if (cbp->cb_recurse) + goto out; + + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "%s has children\n"), + zfs_get_name(cbp->cb_target), + zfs_type_to_name(zfs_get_type(cbp->cb_target))); + (void) fprintf(stderr, gettext("use '-r' to destroy " + "the following datasets:\n")); + cbp->cb_first = B_FALSE; + cbp->cb_error = B_TRUE; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + } else { + /* + * This is a clone. We only want to report this if the '-r' + * wasn't specified, or the target is a snapshot. + */ + if (!cbp->cb_recurse && + zfs_get_type(cbp->cb_target) != ZFS_TYPE_SNAPSHOT) + goto out; + + if (cbp->cb_first) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "%s has dependent clones\n"), + zfs_get_name(cbp->cb_target), + zfs_type_to_name(zfs_get_type(cbp->cb_target))); + (void) fprintf(stderr, gettext("use '-R' to destroy " + "the following datasets:\n")); + cbp->cb_first = B_FALSE; + cbp->cb_error = B_TRUE; + cbp->cb_dryrun = B_TRUE; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + } + +out: + zfs_close(zhp); + return (0); +} + +static int +destroy_batched(destroy_cbdata_t *cb) +{ + int error = zfs_destroy_snaps_nvl(g_zfs, + cb->cb_batchedsnaps, B_FALSE); + fnvlist_free(cb->cb_batchedsnaps); + cb->cb_batchedsnaps = fnvlist_alloc(); + return (error); +} + +static int +destroy_callback(zfs_handle_t *zhp, void *data) +{ + destroy_cbdata_t *cb = data; + const char *name = zfs_get_name(zhp); + int error; + + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } + + /* + * Ignore pools (which we've already flagged as an error before getting + * here). + */ + if (strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + zfs_close(zhp); + return (0); + } + if (cb->cb_dryrun) { + zfs_close(zhp); + return (0); + } + + /* + * We batch up all contiguous snapshots (even of different + * filesystems) and destroy them with one ioctl. We can't + * simply do all snap deletions and then all fs deletions, + * because we must delete a clone before its origin. 
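+	 *
+	 * As a sketch of the batching below: snapshot names accumulate
+	 * in cb_batchedsnaps and, for a deferred (-d) destroy, are
+	 * flushed in groups of 10 via zfs_destroy_snaps_nvl(); reaching
+	 * a non-snapshot flushes any pending batch first so that the
+	 * clone-before-origin ordering is preserved.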
+ */ + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT) { + cb->cb_snap_count++; + fnvlist_add_boolean(cb->cb_batchedsnaps, name); + if (cb->cb_snap_count % 10 == 0 && cb->cb_defer_destroy) + error = destroy_batched(cb); + } else { + error = destroy_batched(cb); + if (error != 0 || + zfs_unmount(zhp, NULL, cb->cb_force ? MS_FORCE : 0) != 0 || + zfs_destroy(zhp, cb->cb_defer_destroy) != 0) { + zfs_close(zhp); + /* + * When performing a recursive destroy we ignore errors + * so that the recursive destroy could continue + * destroying past problem datasets + */ + if (cb->cb_recurse) { + cb->cb_error = B_TRUE; + return (0); + } + return (-1); + } + } + + zfs_close(zhp); + return (0); +} + +static int +destroy_print_cb(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + const char *name = zfs_get_name(zhp); + int err = 0; + + if (nvlist_exists(cb->cb_nvl, name)) { + if (cb->cb_firstsnap == NULL) + cb->cb_firstsnap = strdup(name); + if (cb->cb_prevsnap != NULL) + free(cb->cb_prevsnap); + /* this snap continues the current range */ + cb->cb_prevsnap = strdup(name); + if (cb->cb_firstsnap == NULL || cb->cb_prevsnap == NULL) + nomem(); + if (cb->cb_verbose) { + if (cb->cb_parsable) { + (void) printf("destroy\t%s\n", name); + } else if (cb->cb_dryrun) { + (void) printf(gettext("would destroy %s\n"), + name); + } else { + (void) printf(gettext("will destroy %s\n"), + name); + } + } + } else if (cb->cb_firstsnap != NULL) { + /* end of this range */ + uint64_t used = 0; + err = lzc_snaprange_space(cb->cb_firstsnap, + cb->cb_prevsnap, &used); + cb->cb_snapused += used; + free(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + free(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; + } + zfs_close(zhp); + return (err); +} + +static int +destroy_print_snapshots(zfs_handle_t *fs_zhp, destroy_cbdata_t *cb) +{ + int err; + assert(cb->cb_firstsnap == NULL); + assert(cb->cb_prevsnap == NULL); + err = zfs_iter_snapshots_sorted(fs_zhp, destroy_print_cb, cb, 0, 0); + if (cb->cb_firstsnap != NULL) { + uint64_t used = 0; + if (err == 0) { + err = lzc_snaprange_space(cb->cb_firstsnap, + cb->cb_prevsnap, &used); + } + cb->cb_snapused += used; + free(cb->cb_firstsnap); + cb->cb_firstsnap = NULL; + free(cb->cb_prevsnap); + cb->cb_prevsnap = NULL; + } + return (err); +} + +static int +snapshot_to_nvl_cb(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + /* Check for clones. 
*/ + if (!cb->cb_doclones && !cb->cb_defer_destroy) { + cb->cb_target = zhp; + cb->cb_first = B_TRUE; + err = zfs_iter_dependents(zhp, B_TRUE, + destroy_check_dependent, cb); + } + + if (err == 0) { + if (nvlist_add_boolean(cb->cb_nvl, zfs_get_name(zhp))) + nomem(); + } + zfs_close(zhp); + return (err); +} + +static int +gather_snapshots(zfs_handle_t *zhp, void *arg) +{ + destroy_cbdata_t *cb = arg; + int err = 0; + + err = zfs_iter_snapspec(zhp, cb->cb_snapspec, snapshot_to_nvl_cb, cb); + if (err == ENOENT) + err = 0; + if (err != 0) + goto out; + + if (cb->cb_verbose) { + err = destroy_print_snapshots(zhp, cb); + if (err != 0) + goto out; + } + + if (cb->cb_recurse) + err = zfs_iter_filesystems(zhp, gather_snapshots, cb); + +out: + zfs_close(zhp); + return (err); +} + +static int +destroy_clones(destroy_cbdata_t *cb) +{ + nvpair_t *pair; + for (pair = nvlist_next_nvpair(cb->cb_nvl, NULL); + pair != NULL; + pair = nvlist_next_nvpair(cb->cb_nvl, pair)) { + zfs_handle_t *zhp = zfs_open(g_zfs, nvpair_name(pair), + ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + boolean_t defer = cb->cb_defer_destroy; + int err; + + /* + * We can't defer destroy non-snapshots, so set it to + * false while destroying the clones. + */ + cb->cb_defer_destroy = B_FALSE; + err = zfs_iter_dependents(zhp, B_FALSE, + destroy_callback, cb); + cb->cb_defer_destroy = defer; + zfs_close(zhp); + if (err != 0) + return (err); + } + } + return (0); +} + +static int +zfs_do_destroy(int argc, char **argv) +{ + destroy_cbdata_t cb = { 0 }; + int rv = 0; + int err = 0; + int c; + zfs_handle_t *zhp = NULL; + char *at, *pound; + zfs_type_t type = ZFS_TYPE_DATASET; + + /* check options */ + while ((c = getopt(argc, argv, "vpndfrR")) != -1) { + switch (c) { + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'p': + cb.cb_verbose = B_TRUE; + cb.cb_parsable = B_TRUE; + break; + case 'n': + cb.cb_dryrun = B_TRUE; + break; + case 'd': + cb.cb_defer_destroy = B_TRUE; + type = ZFS_TYPE_SNAPSHOT; + break; + case 'f': + cb.cb_force = B_TRUE; + break; + case 'r': + cb.cb_recurse = B_TRUE; + break; + case 'R': + cb.cb_recurse = B_TRUE; + cb.cb_doclones = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc == 0) { + (void) fprintf(stderr, gettext("missing dataset argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + at = strchr(argv[0], '@'); + pound = strchr(argv[0], '#'); + if (at != NULL) { + + /* Build the list of snaps to destroy in cb_nvl. 
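destroy_check_dependent, wired up above, tells direct descendants apart from clones by prefix-matching the target name and then requiring a '/' or '@' at the boundary, so that 'tank/fs2' is not mistaken for a child of 'tank/fs'. A standalone sketch of just that test (names are hypothetical):

    #include <stdio.h>
    #include <string.h>

    /* Is name a child or snapshot of target? */
    static int
    is_descendant(const char *target, const char *name)
    {
        size_t len = strlen(target);

        return (strncmp(target, name, len) == 0 &&
            (name[len] == '/' || name[len] == '@'));
    }

    int
    main(void)
    {
        (void) printf("%d\n", is_descendant("tank/fs", "tank/fs@snap")); /* 1 */
        (void) printf("%d\n", is_descendant("tank/fs", "tank/fs2"));     /* 0 */
        return (0);
    }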
*/ + cb.cb_nvl = fnvlist_alloc(); + + *at = '\0'; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + nvlist_free(cb.cb_nvl); + return (1); + } + + cb.cb_snapspec = at + 1; + if (gather_snapshots(zfs_handle_dup(zhp), &cb) != 0 || + cb.cb_error) { + rv = 1; + goto out; + } + + if (nvlist_empty(cb.cb_nvl)) { + (void) fprintf(stderr, gettext("could not find any " + "snapshots to destroy; check snapshot names.\n")); + rv = 1; + goto out; + } + + if (cb.cb_verbose) { + char buf[16]; + zfs_nicebytes(cb.cb_snapused, buf, sizeof (buf)); + if (cb.cb_parsable) { + (void) printf("reclaim\t%llu\n", + (u_longlong_t)cb.cb_snapused); + } else if (cb.cb_dryrun) { + (void) printf(gettext("would reclaim %s\n"), + buf); + } else { + (void) printf(gettext("will reclaim %s\n"), + buf); + } + } + + if (!cb.cb_dryrun) { + if (cb.cb_doclones) { + cb.cb_batchedsnaps = fnvlist_alloc(); + err = destroy_clones(&cb); + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, + cb.cb_batchedsnaps, B_FALSE); + } + if (err != 0) { + rv = 1; + goto out; + } + } + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, cb.cb_nvl, + cb.cb_defer_destroy); + } + } + + if (err != 0) + rv = 1; + } else if (pound != NULL) { + int err; + nvlist_t *nvl; + + if (cb.cb_dryrun) { + (void) fprintf(stderr, + "dryrun is not supported with bookmark\n"); + return (-1); + } + + if (cb.cb_defer_destroy) { + (void) fprintf(stderr, + "defer destroy is not supported with bookmark\n"); + return (-1); + } + + if (cb.cb_recurse) { + (void) fprintf(stderr, + "recursive is not supported with bookmark\n"); + return (-1); + } + + /* + * Unfortunately, zfs_bookmark() doesn't honor the + * casesensitivity setting. However, we can't simply + * remove this check, because lzc_destroy_bookmarks() + * ignores non-existent bookmarks, so this is necessary + * to get a proper error message. + */ + if (!zfs_bookmark_exists(argv[0])) { + (void) fprintf(stderr, gettext("bookmark '%s' " + "does not exist.\n"), argv[0]); + return (1); + } + + nvl = fnvlist_alloc(); + fnvlist_add_boolean(nvl, argv[0]); + + err = lzc_destroy_bookmarks(nvl, NULL); + if (err != 0) { + (void) zfs_standard_error(g_zfs, err, + "cannot destroy bookmark"); + } + + nvlist_free(nvl); + + return (err); + } else { + /* Open the given dataset */ + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) + return (1); + + cb.cb_target = zhp; + + /* + * Perform an explicit check for pools before going any further. + */ + if (!cb.cb_recurse && strchr(zfs_get_name(zhp), '/') == NULL && + zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("cannot destroy '%s': " + "operation does not apply to pools\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zfs destroy -r " + "%s' to destroy all datasets in the pool\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use 'zpool destroy %s' " + "to destroy the pool itself\n"), zfs_get_name(zhp)); + rv = 1; + goto out; + } + + /* + * Check for any dependents and/or clones. + */ + cb.cb_first = B_TRUE; + if (!cb.cb_doclones && + zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, + &cb) != 0) { + rv = 1; + goto out; + } + + if (cb.cb_error) { + rv = 1; + goto out; + } + cb.cb_batchedsnaps = fnvlist_alloc(); + if (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, + &cb) != 0) { + rv = 1; + goto out; + } + + /* + * Do the real thing. The callback will close the + * handle regardless of whether it succeeds or not. 
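The bookmark branch above drives libzfs_core directly rather than going through a zfs_handle_t. A minimal sketch of the same call sequence, assuming the vendored libzfs_core and libnvpair (the bookmark name is hypothetical; link with -lzfs_core -lnvpair):

    #include <stdio.h>
    #include <string.h>
    #include <libnvpair.h>
    #include <libzfs_core.h>

    int
    main(void)
    {
        nvlist_t *nvl;
        int err;

        if (libzfs_core_init() != 0)
            return (1);
        nvl = fnvlist_alloc();
        /* Bookmarks to destroy are passed as name-only (boolean) pairs. */
        fnvlist_add_boolean(nvl, "tank/fs#before-upgrade");
        err = lzc_destroy_bookmarks(nvl, NULL);
        if (err != 0)
            (void) fprintf(stderr, "cannot destroy bookmark: %s\n",
                strerror(err));
        fnvlist_free(nvl);
        libzfs_core_fini();
        return (err != 0);
    }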
+ */ + err = destroy_callback(zhp, &cb); + zhp = NULL; + if (err == 0) { + err = zfs_destroy_snaps_nvl(g_zfs, + cb.cb_batchedsnaps, cb.cb_defer_destroy); + } + if (err != 0 || cb.cb_error == B_TRUE) + rv = 1; + } + +out: + fnvlist_free(cb.cb_batchedsnaps); + fnvlist_free(cb.cb_nvl); + if (zhp != NULL) + zfs_close(zhp); + return (rv); +} + +static boolean_t +is_recvd_column(zprop_get_cbdata_t *cbp) +{ + int i; + zfs_get_column_t col; + + for (i = 0; i < ZFS_GET_NCOLS && + (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) + if (col == GET_COL_RECVD) + return (B_TRUE); + return (B_FALSE); +} + +/* + * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] + * < all | property[,property]... > < fs | snap | vol > ... + * + * -r recurse over any child datasets + * -H scripted mode. Headers are stripped, and fields are separated + * by tabs instead of spaces. + * -o Set of fields to display. One of "name,property,value, + * received,source". Default is "name,property,value,source". + * "all" is an alias for all five. + * -s Set of sources to allow. One of + * "local,default,inherited,received,temporary,none". Default is + * all six. + * -p Display values in parsable (literal) format. + * + * Prints properties for the given datasets. The user can control which + * columns to display as well as which property types to allow. + */ + +/* + * Invoked to display the properties for a single dataset. + */ +static int +get_callback(zfs_handle_t *zhp, void *data) +{ + char buf[ZFS_MAXPROPLEN]; + char rbuf[ZFS_MAXPROPLEN]; + zprop_source_t sourcetype; + char source[ZFS_MAX_DATASET_NAME_LEN]; + zprop_get_cbdata_t *cbp = data; + nvlist_t *user_props = zfs_get_user_props(zhp); + zprop_list_t *pl = cbp->cb_proplist; + nvlist_t *propval; + char *strval; + char *sourceval; + boolean_t received = is_recvd_column(cbp); + + for (; pl != NULL; pl = pl->pl_next) { + char *recvdval = NULL; + /* + * Skip the special fake placeholder. This will also skip over + * the name property when 'all' is specified. 
+ */ + if (pl->pl_prop == ZFS_PROP_NAME && + pl == cbp->cb_proplist) + continue; + + if (pl->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, pl->pl_prop, buf, + sizeof (buf), &sourcetype, source, + sizeof (source), + cbp->cb_literal) != 0) { + if (pl->pl_all) + continue; + if (!zfs_prop_valid_for_type(pl->pl_prop, + ZFS_TYPE_DATASET, B_FALSE)) { + (void) fprintf(stderr, + gettext("No such property '%s'\n"), + zfs_prop_to_name(pl->pl_prop)); + continue; + } + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + if (received && (zfs_prop_get_recvd(zhp, + zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + + zprop_print_one_property(zfs_get_name(zhp), cbp, + zfs_prop_to_name(pl->pl_prop), + buf, sourcetype, source, recvdval); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); + } else if (zfs_prop_written(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); + } else { + if (nvlist_lookup_nvlist(user_props, + pl->pl_user_prop, &propval) != 0) { + if (pl->pl_all) + continue; + sourcetype = ZPROP_SRC_NONE; + strval = "-"; + } else { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + verify(nvlist_lookup_string(propval, + ZPROP_SOURCE, &sourceval) == 0); + + if (strcmp(sourceval, + zfs_get_name(zhp)) == 0) { + sourcetype = ZPROP_SRC_LOCAL; + } else if (strcmp(sourceval, + ZPROP_SOURCE_VAL_RECVD) == 0) { + sourcetype = ZPROP_SRC_RECEIVED; + } else { + sourcetype = ZPROP_SRC_INHERITED; + (void) strlcpy(source, + sourceval, sizeof (source)); + } + } + + if (received && (zfs_prop_get_recvd(zhp, + pl->pl_user_prop, rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, strval, sourcetype, + source, recvdval); + } + } + + return (0); +} + +static int +zfs_do_get(int argc, char **argv) +{ + zprop_get_cbdata_t cb = { 0 }; + int i, c, flags = ZFS_ITER_ARGS_CAN_BE_PATHS; + int types = ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK; + char *value, *fields; + int ret = 0; + int limit = 0; + zprop_list_t fake_name = { 0 }; + + /* + * Set up default columns and sources. + */ + cb.cb_sources = ZPROP_SRC_ALL; + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + cb.cb_type = ZFS_TYPE_DATASET; + + /* check options */ + while ((c = getopt(argc, argv, ":d:o:s:rt:Hp")) != -1) { + switch (c) { + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'd': + limit = parse_depth(optarg, &flags); + break; + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'o': + /* + * Process the set of columns to display. We zero out + * the structure to give us a blank slate. 
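The -o handling that resumes below walks a comma-separated field list with getsubopt(3). A standalone sketch of that parsing loop, assuming POSIX getsubopt from stdlib.h (the token table here is a subset of the col_subopts table below):

    #include <stdio.h>
    #include <stdlib.h>

    int
    main(void)
    {
        char buf[] = "name,value,source";   /* getsubopt() modifies this */
        char *opts = buf, *value;
        static char *const tokens[] =
            { "name", "property", "value", "source", NULL };

        while (*opts != '\0') {
            int idx = getsubopt(&opts, tokens, &value);

            if (idx >= 0)
                (void) printf("column: %s\n", tokens[idx]);
            else
                (void) printf("unknown token\n");
        }
        return (0);
    }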
+ */ + bzero(&cb.cb_columns, sizeof (cb.cb_columns)); + i = 0; + while (*optarg != '\0') { + static char *col_subopts[] = + { "name", "property", "value", "received", + "source", "all", NULL }; + + if (i == ZFS_GET_NCOLS) { + (void) fprintf(stderr, gettext("too " + "many fields given to -o " + "option\n")); + usage(B_FALSE); + } + + switch (getsubopt(&optarg, col_subopts, + &value)) { + case 0: + cb.cb_columns[i++] = GET_COL_NAME; + break; + case 1: + cb.cb_columns[i++] = GET_COL_PROPERTY; + break; + case 2: + cb.cb_columns[i++] = GET_COL_VALUE; + break; + case 3: + cb.cb_columns[i++] = GET_COL_RECVD; + flags |= ZFS_ITER_RECVD_PROPS; + break; + case 4: + cb.cb_columns[i++] = GET_COL_SOURCE; + break; + case 5: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_RECVD; + cb.cb_columns[4] = GET_COL_SOURCE; + flags |= ZFS_ITER_RECVD_PROPS; + i = ZFS_GET_NCOLS; + break; + default: + (void) fprintf(stderr, + gettext("invalid column name " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + + case 's': + cb.cb_sources = 0; + while (*optarg != '\0') { + static char *source_subopts[] = { + "local", "default", "inherited", + "received", "temporary", "none", + NULL }; + + switch (getsubopt(&optarg, source_subopts, + &value)) { + case 0: + cb.cb_sources |= ZPROP_SRC_LOCAL; + break; + case 1: + cb.cb_sources |= ZPROP_SRC_DEFAULT; + break; + case 2: + cb.cb_sources |= ZPROP_SRC_INHERITED; + break; + case 3: + cb.cb_sources |= ZPROP_SRC_RECEIVED; + break; + case 4: + cb.cb_sources |= ZPROP_SRC_TEMPORARY; + break; + case 5: + cb.cb_sources |= ZPROP_SRC_NONE; + break; + default: + (void) fprintf(stderr, + gettext("invalid source " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + + case 't': + types = 0; + flags &= ~ZFS_ITER_PROP_LISTSNAPS; + while (*optarg != '\0') { + static char *type_subopts[] = { "filesystem", + "volume", "snapshot", "snap", "bookmark", + "all", NULL }; + + switch (getsubopt(&optarg, type_subopts, + &value)) { + case 0: + types |= ZFS_TYPE_FILESYSTEM; + break; + case 1: + types |= ZFS_TYPE_VOLUME; + break; + case 2: + case 3: + types |= ZFS_TYPE_SNAPSHOT; + break; + case 4: + types |= ZFS_TYPE_BOOKMARK; + break; + case 5: + types = ZFS_TYPE_DATASET | + ZFS_TYPE_BOOKMARK; + break; + + default: + (void) fprintf(stderr, + gettext("invalid type '%s'\n"), + value); + usage(B_FALSE); + } + } + break; + + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); + usage(B_FALSE); + } + + fields = argv[0]; + + /* + * Handle users who want to get all snapshots or bookmarks + * of a dataset (ex. 'zfs get -t snapshot refer <dataset>'). + */ + if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && + argc > 1 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { + flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); + limit = 1; + } + + if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) + != 0) + usage(B_FALSE); + + argc--; + argv++; + + /* + * As part of zfs_expand_proplist(), we keep track of the maximum column + * width for each property. For the 'NAME' (and 'SOURCE') columns, we + * need to know the maximum name length.
However, the user likely did + * not specify 'name' as one of the properties to fetch, so we need to + * make sure we always include at least this property for + * print_get_headers() to work properly. + */ + if (cb.cb_proplist != NULL) { + fake_name.pl_prop = ZFS_PROP_NAME; + fake_name.pl_width = strlen(gettext("NAME")); + fake_name.pl_next = cb.cb_proplist; + cb.cb_proplist = &fake_name; + } + + cb.cb_first = B_TRUE; + + /* run for each object */ + ret = zfs_for_each(argc, argv, flags, types, NULL, + &cb.cb_proplist, limit, get_callback, &cb); + + if (cb.cb_proplist == &fake_name) + zprop_free_list(fake_name.pl_next); + else + zprop_free_list(cb.cb_proplist); + + return (ret); +} + +/* + * inherit [-rS] <property> <fs|vol> ... + * + * -r Recurse over all children + * -S Revert to received value, if any + * + * For each dataset specified on the command line, inherit the given property + * from its parent. Inheriting a property at the pool level will cause it to + * use the default value. The '-r' flag will recurse over all children, and is + * useful for setting a property on a hierarchy-wide basis, regardless of any + * local modifications for each dataset. + */ + +typedef struct inherit_cbdata { + const char *cb_propname; + boolean_t cb_received; +} inherit_cbdata_t; + +static int +inherit_recurse_cb(zfs_handle_t *zhp, void *data) +{ + inherit_cbdata_t *cb = data; + zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); + + /* + * If we're doing it recursively, then ignore properties that + * are not valid for this type of dataset. + */ + if (prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, zfs_get_type(zhp), B_FALSE)) + return (0); + + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); +} + +static int +inherit_cb(zfs_handle_t *zhp, void *data) +{ + inherit_cbdata_t *cb = data; + + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); +} + +static int +zfs_do_inherit(int argc, char **argv) +{ + int c; + zfs_prop_t prop; + inherit_cbdata_t cb = { 0 }; + char *propname; + int ret = 0; + int flags = 0; + boolean_t received = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "rS")) != -1) { + switch (c) { + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'S': + received = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing dataset argument\n")); + usage(B_FALSE); + } + + propname = argv[0]; + argc--; + argv++; + + if ((prop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { + if (zfs_prop_readonly(prop)) { + (void) fprintf(stderr, gettext( + "%s property is read-only\n"), + propname); + return (1); + } + if (!zfs_prop_inheritable(prop) && !received) { + (void) fprintf(stderr, gettext("'%s' property cannot " + "be inherited\n"), propname); + if (prop == ZFS_PROP_QUOTA || + prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { + (void) fprintf(stderr, gettext("use 'zfs set " + "%s=none' to clear\n"), propname); + (void) fprintf(stderr, gettext("use 'zfs " + "inherit -S %s' to revert to received " + "value\n"), propname); + } + return (1); + } + if (received && (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION)) { + (void) fprintf(stderr, gettext("'%s' property cannot " + "be
reverted to a received value\n"), propname); + return (1); + } + } else if (!zfs_prop_user(propname)) { + (void) fprintf(stderr, gettext("invalid property '%s'\n"), + propname); + usage(B_FALSE); + } + + cb.cb_propname = propname; + cb.cb_received = received; + + if (flags & ZFS_ITER_RECURSE) { + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, + NULL, NULL, 0, inherit_recurse_cb, &cb); + } else { + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, + NULL, NULL, 0, inherit_cb, &cb); + } + + return (ret); +} + +typedef struct upgrade_cbdata { + uint64_t cb_numupgraded; + uint64_t cb_numsamegraded; + uint64_t cb_numfailed; + uint64_t cb_version; + boolean_t cb_newer; + boolean_t cb_foundone; + char cb_lastfs[ZFS_MAX_DATASET_NAME_LEN]; +} upgrade_cbdata_t; + +static int +same_pool(zfs_handle_t *zhp, const char *name) +{ + int len1 = strcspn(name, "/@"); + const char *zhname = zfs_get_name(zhp); + int len2 = strcspn(zhname, "/@"); + + if (len1 != len2) + return (B_FALSE); + return (strncmp(name, zhname, len1) == 0); +} + +static int +upgrade_list_callback(zfs_handle_t *zhp, void *data) +{ + upgrade_cbdata_t *cb = data; + int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + + /* list if it's old/new */ + if ((!cb->cb_newer && version < ZPL_VERSION) || + (cb->cb_newer && version > ZPL_VERSION)) { + char *str; + if (cb->cb_newer) { + str = gettext("The following filesystems are " + "formatted using a newer software version and\n" + "cannot be accessed on the current system.\n\n"); + } else { + str = gettext("The following filesystems are " + "out of date, and can be upgraded. After being\n" + "upgraded, these filesystems (and any 'zfs send' " + "streams generated from\n" + "subsequent snapshots) will no longer be " + "accessible by older software versions.\n\n"); + } + + if (!cb->cb_foundone) { + (void) puts(str); + (void) printf(gettext("VER FILESYSTEM\n")); + (void) printf(gettext("--- ------------\n")); + cb->cb_foundone = B_TRUE; + } + + (void) printf("%2u %s\n", version, zfs_get_name(zhp)); + } + + return (0); +} + +static int +upgrade_set_callback(zfs_handle_t *zhp, void *data) +{ + upgrade_cbdata_t *cb = data; + int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + int needed_spa_version; + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + needed_spa_version = zfs_spa_version_map(cb->cb_version); + + if (needed_spa_version < 0) + return (-1); + + if (spa_version < needed_spa_version) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), needed_spa_version); + cb->cb_numfailed++; + return (0); + } + + /* upgrade */ + if (version < cb->cb_version) { + char verstr[16]; + (void) snprintf(verstr, sizeof (verstr), + "%llu", (u_longlong_t)cb->cb_version); + if (cb->cb_lastfs[0] && !same_pool(zhp, cb->cb_lastfs)) { + /* + * If they did "zfs upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). 
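same_pool above compares only the pool component of two dataset names: strcspn finds the offset of the first '/' or '@', and the prefixes are compared when those lengths agree. The same test in isolation (dataset names are hypothetical):

    #include <stdio.h>
    #include <string.h>

    /* Do two dataset names live in the same pool? */
    static int
    same_pool(const char *a, const char *b)
    {
        size_t la = strcspn(a, "/@");
        size_t lb = strcspn(b, "/@");

        return (la == lb && strncmp(a, b, la) == 0);
    }

    int
    main(void)
    {
        (void) printf("%d\n", same_pool("tank/a", "tank/b@snap"));  /* 1 */
        (void) printf("%d\n", same_pool("tank/a", "tanker/b"));     /* 0 */
        return (0);
    }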
+ */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + if (zfs_prop_set(zhp, "version", verstr) == 0) + cb->cb_numupgraded++; + else + cb->cb_numfailed++; + (void) strcpy(cb->cb_lastfs, zfs_get_name(zhp)); + } else if (version > cb->cb_version) { + /* can't downgrade */ + (void) printf(gettext("%s: can not be downgraded; " + "it is already at version %u\n"), + zfs_get_name(zhp), version); + cb->cb_numfailed++; + } else { + cb->cb_numsamegraded++; + } + return (0); +} + +/* + * zfs upgrade + * zfs upgrade -v + * zfs upgrade [-r] [-V <version>] <-a | filesystem> + */ +static int +zfs_do_upgrade(int argc, char **argv) +{ + boolean_t all = B_FALSE; + boolean_t showversions = B_FALSE; + int ret = 0; + upgrade_cbdata_t cb = { 0 }; + int c; + int flags = ZFS_ITER_ARGS_CAN_BE_PATHS; + + /* check options */ + while ((c = getopt(argc, argv, "rvV:a")) != -1) { + switch (c) { + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'v': + showversions = B_TRUE; + break; + case 'V': + if (zfs_prop_string_to_index(ZFS_PROP_VERSION, + optarg, &cb.cb_version) != 0) { + (void) fprintf(stderr, + gettext("invalid version %s\n"), optarg); + usage(B_FALSE); + } + break; + case 'a': + all = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if ((!all && !argc) && ((flags & ZFS_ITER_RECURSE) | cb.cb_version)) + usage(B_FALSE); + if (showversions && (flags & ZFS_ITER_RECURSE || all || + cb.cb_version || argc)) + usage(B_FALSE); + if ((all || argc) && (showversions)) + usage(B_FALSE); + if (all && argc) + usage(B_FALSE); + + if (showversions) { + /* Show info on available versions. */ + (void) printf(gettext("The following filesystem versions are " + "supported:\n\n")); + (void) printf(gettext("VER DESCRIPTION\n")); + (void) printf("--- -----------------------------------------" + "---------------\n"); + (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); + (void) printf(gettext(" 2 Enhanced directory entries\n")); + (void) printf(gettext(" 3 Case insensitive and filesystem " + "user identifier (FUID)\n")); + (void) printf(gettext(" 4 userquota, groupquota " + "properties\n")); + (void) printf(gettext(" 5 System attributes\n")); + (void) printf(gettext("\nFor more information on a particular " + "version, including supported releases,\n")); + (void) printf("see the ZFS Administration Guide.\n\n"); + ret = 0; + } else if (argc || all) { + /* Upgrade filesystems */ + if (cb.cb_version == 0) + cb.cb_version = ZPL_VERSION; + ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, + NULL, NULL, 0, upgrade_set_callback, &cb); + (void) printf(gettext("%llu filesystems upgraded\n"), + (u_longlong_t)cb.cb_numupgraded); + if (cb.cb_numsamegraded) { + (void) printf(gettext("%llu filesystems already at " + "this version\n"), + (u_longlong_t)cb.cb_numsamegraded); + } + if (cb.cb_numfailed != 0) + ret = 1; + } else { + /* List old-version filesystems */ + boolean_t found; + (void) printf(gettext("This system is currently running " + "ZFS filesystem version %llu.\n\n"), ZPL_VERSION); + + flags |= ZFS_ITER_RECURSE; + ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, + NULL, NULL, 0, upgrade_list_callback, &cb); + + found = cb.cb_foundone; + cb.cb_foundone = B_FALSE; + cb.cb_newer = B_TRUE; + + ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, + NULL, NULL, 0, upgrade_list_callback, &cb); + + if (!cb.cb_foundone && !found) { + (void) printf(gettext("All
filesystems are " + "formatted with the current version.\n")); + } + } + + return (ret); +} + +/* + * zfs userspace [-Hinp] [-o field[,...]] [-s field [-s field]...] + * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot + * zfs groupspace [-Hinp] [-o field[,...]] [-s field [-s field]...] + * [-S field [-S field]...] [-t type[,...]] filesystem | snapshot + * zfs projectspace [-Hp] [-o field[,...]] [-s field [-s field]...] + * [-S field [-S field]...] filesystem | snapshot + * + * -H Scripted mode; elide headers and separate columns by tabs. + * -i Translate SID to POSIX ID. + * -n Print numeric ID instead of user/group name. + * -o Control which fields to display. + * -p Use exact (parsable) numeric output. + * -s Specify sort columns, descending order. + * -S Specify sort columns, ascending order. + * -t Control which object types to display. + * + * Displays space consumed by, and quotas on, each user in the specified + * filesystem or snapshot. + */ + +/* us_field_types, us_field_hdr and us_field_names should be kept in sync */ +enum us_field_types { + USFIELD_TYPE, + USFIELD_NAME, + USFIELD_USED, + USFIELD_QUOTA, + USFIELD_OBJUSED, + USFIELD_OBJQUOTA +}; +static char *us_field_hdr[] = { "TYPE", "NAME", "USED", "QUOTA", + "OBJUSED", "OBJQUOTA" }; +static char *us_field_names[] = { "type", "name", "used", "quota", + "objused", "objquota" }; +#define USFIELD_LAST (sizeof (us_field_names) / sizeof (char *)) + +#define USTYPE_PSX_GRP (1 << 0) +#define USTYPE_PSX_USR (1 << 1) +#define USTYPE_SMB_GRP (1 << 2) +#define USTYPE_SMB_USR (1 << 3) +#define USTYPE_PROJ (1 << 4) +#define USTYPE_ALL \ + (USTYPE_PSX_GRP | USTYPE_PSX_USR | USTYPE_SMB_GRP | USTYPE_SMB_USR | \ + USTYPE_PROJ) + +static int us_type_bits[] = { + USTYPE_PSX_GRP, + USTYPE_PSX_USR, + USTYPE_SMB_GRP, + USTYPE_SMB_USR, + USTYPE_ALL +}; +static char *us_type_names[] = { "posixgroup", "posixuser", "smbgroup", + "smbuser", "all" }; + +typedef struct us_node { + nvlist_t *usn_nvl; + uu_avl_node_t usn_avlnode; + uu_list_node_t usn_listnode; +} us_node_t; + +typedef struct us_cbdata { + nvlist_t **cb_nvlp; + uu_avl_pool_t *cb_avl_pool; + uu_avl_t *cb_avl; + boolean_t cb_numname; + boolean_t cb_nicenum; + boolean_t cb_sid2posix; + zfs_userquota_prop_t cb_prop; + zfs_sort_column_t *cb_sortcol; + size_t cb_width[USFIELD_LAST]; +} us_cbdata_t; + +static boolean_t us_populated = B_FALSE; + +typedef struct { + zfs_sort_column_t *si_sortcol; + boolean_t si_numname; +} us_sort_info_t; + +static int +us_field_index(char *field) +{ + int i; + + for (i = 0; i < USFIELD_LAST; i++) { + if (strcmp(field, us_field_names[i]) == 0) + return (i); + } + + return (-1); +} + +static int +us_compare(const void *larg, const void *rarg, void *unused) +{ + const us_node_t *l = larg; + const us_node_t *r = rarg; + us_sort_info_t *si = (us_sort_info_t *)unused; + zfs_sort_column_t *sortcol = si->si_sortcol; + boolean_t numname = si->si_numname; + nvlist_t *lnvl = l->usn_nvl; + nvlist_t *rnvl = r->usn_nvl; + int rc = 0; + boolean_t lvb, rvb; + + for (; sortcol != NULL; sortcol = sortcol->sc_next) { + char *lvstr = ""; + char *rvstr = ""; + uint32_t lv32 = 0; + uint32_t rv32 = 0; + uint64_t lv64 = 0; + uint64_t rv64 = 0; + zfs_prop_t prop = sortcol->sc_prop; + const char *propname = NULL; + boolean_t reverse = sortcol->sc_reverse; + + switch (prop) { + case ZFS_PROP_TYPE: + propname = "type"; + (void) nvlist_lookup_uint32(lnvl, propname, &lv32); + (void) nvlist_lookup_uint32(rnvl, propname, &rv32); + if (rv32 != lv32) + rc = (rv32 < lv32) ? 
1 : -1; + break; + case ZFS_PROP_NAME: + propname = "name"; + if (numname) { +compare_nums: + (void) nvlist_lookup_uint64(lnvl, propname, + &lv64); + (void) nvlist_lookup_uint64(rnvl, propname, + &rv64); + if (rv64 != lv64) + rc = (rv64 < lv64) ? 1 : -1; + } else { + if ((nvlist_lookup_string(lnvl, propname, + &lvstr) == ENOENT) || + (nvlist_lookup_string(rnvl, propname, + &rvstr) == ENOENT)) { + goto compare_nums; + } + rc = strcmp(lvstr, rvstr); + } + break; + case ZFS_PROP_USED: + case ZFS_PROP_QUOTA: + if (!us_populated) + break; + if (prop == ZFS_PROP_USED) + propname = "used"; + else + propname = "quota"; + (void) nvlist_lookup_uint64(lnvl, propname, &lv64); + (void) nvlist_lookup_uint64(rnvl, propname, &rv64); + if (rv64 != lv64) + rc = (rv64 < lv64) ? 1 : -1; + break; + + default: + break; + } + + if (rc != 0) { + if (rc < 0) + return (reverse ? 1 : -1); + else + return (reverse ? -1 : 1); + } + } + + /* + * If entries still seem to be the same, check if they are of the same + * type (smbentity is added only if we are doing SID to POSIX ID + * translation where we can have duplicate type/name combinations). + */ + if (nvlist_lookup_boolean_value(lnvl, "smbentity", &lvb) == 0 && + nvlist_lookup_boolean_value(rnvl, "smbentity", &rvb) == 0 && + lvb != rvb) + return (lvb < rvb ? -1 : 1); + + return (0); +} + +static boolean_t +zfs_prop_is_user(unsigned p) +{ + return (p == ZFS_PROP_USERUSED || p == ZFS_PROP_USERQUOTA || + p == ZFS_PROP_USEROBJUSED || p == ZFS_PROP_USEROBJQUOTA); +} + +static boolean_t +zfs_prop_is_group(unsigned p) +{ + return (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA || + p == ZFS_PROP_GROUPOBJUSED || p == ZFS_PROP_GROUPOBJQUOTA); +} + +static boolean_t +zfs_prop_is_project(unsigned p) +{ + return (p == ZFS_PROP_PROJECTUSED || p == ZFS_PROP_PROJECTQUOTA || + p == ZFS_PROP_PROJECTOBJUSED || p == ZFS_PROP_PROJECTOBJQUOTA); +} + +static inline const char * +us_type2str(unsigned field_type) +{ + switch (field_type) { + case USTYPE_PSX_USR: + return ("POSIX User"); + case USTYPE_PSX_GRP: + return ("POSIX Group"); + case USTYPE_SMB_USR: + return ("SMB User"); + case USTYPE_SMB_GRP: + return ("SMB Group"); + case USTYPE_PROJ: + return ("Project"); + default: + return ("Undefined"); + } +} + +static int +userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) +{ + us_cbdata_t *cb = (us_cbdata_t *)arg; + zfs_userquota_prop_t prop = cb->cb_prop; + char *name = NULL; + char *propname; + char sizebuf[32]; + us_node_t *node; + uu_avl_pool_t *avl_pool = cb->cb_avl_pool; + uu_avl_t *avl = cb->cb_avl; + uu_avl_index_t idx; + nvlist_t *props; + us_node_t *n; + zfs_sort_column_t *sortcol = cb->cb_sortcol; + unsigned type = 0; + const char *typestr; + size_t namelen; + size_t typelen; + size_t sizelen; + int typeidx, nameidx, sizeidx; + us_sort_info_t sortinfo = { sortcol, cb->cb_numname }; + boolean_t smbentity = B_FALSE; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + node = safe_malloc(sizeof (us_node_t)); + uu_avl_node_init(node, &node->usn_avlnode, avl_pool); + node->usn_nvl = props; + + if (domain != NULL && domain[0] != '\0') { +#ifdef HAVE_IDMAP + /* SMB */ + char sid[MAXNAMELEN + 32]; + uid_t id; + uint64_t classes; + int err; + directory_error_t e; + + smbentity = B_TRUE; + + (void) snprintf(sid, sizeof (sid), "%s-%u", domain, rid); + + if (prop == ZFS_PROP_GROUPUSED || prop == ZFS_PROP_GROUPQUOTA) { + type = USTYPE_SMB_GRP; + err = sid_to_id(sid, B_FALSE, &id); + } else { + type = USTYPE_SMB_USR; + err = sid_to_id(sid, B_TRUE, 
&id); + } + + if (err == 0) { + rid = id; + if (!cb->cb_sid2posix) { + e = directory_name_from_sid(NULL, sid, &name, + &classes); + if (e != NULL) + directory_error_free(e); + if (name == NULL) + name = sid; + } + } +#else + nvlist_free(props); + free(node); + + return (-1); +#endif /* HAVE_IDMAP */ + } + + if (cb->cb_sid2posix || domain == NULL || domain[0] == '\0') { + /* POSIX or -i */ + if (zfs_prop_is_group(prop)) { + type = USTYPE_PSX_GRP; + if (!cb->cb_numname) { + struct group *g; + + if ((g = getgrgid(rid)) != NULL) + name = g->gr_name; + } + } else if (zfs_prop_is_user(prop)) { + type = USTYPE_PSX_USR; + if (!cb->cb_numname) { + struct passwd *p; + + if ((p = getpwuid(rid)) != NULL) + name = p->pw_name; + } + } else { + type = USTYPE_PROJ; + } + } + + /* + * Make sure that the type/name combination is unique when doing + * SID to POSIX ID translation (hence changing the type from SMB to + * POSIX). + */ + if (cb->cb_sid2posix && + nvlist_add_boolean_value(props, "smbentity", smbentity) != 0) + nomem(); + + /* Calculate/update width of TYPE field */ + typestr = us_type2str(type); + typelen = strlen(gettext(typestr)); + typeidx = us_field_index("type"); + if (typelen > cb->cb_width[typeidx]) + cb->cb_width[typeidx] = typelen; + if (nvlist_add_uint32(props, "type", type) != 0) + nomem(); + + /* Calculate/update width of NAME field */ + if ((cb->cb_numname && cb->cb_sid2posix) || name == NULL) { + if (nvlist_add_uint64(props, "name", rid) != 0) + nomem(); + namelen = snprintf(NULL, 0, "%u", rid); + } else { + if (nvlist_add_string(props, "name", name) != 0) + nomem(); + namelen = strlen(name); + } + nameidx = us_field_index("name"); + if (nameidx >= 0 && namelen > cb->cb_width[nameidx]) + cb->cb_width[nameidx] = namelen; + + /* + * Check if this type/name combination is in the list and update it; + * otherwise add new node to the list. 
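The find-or-insert step the comment above describes (and which uu_avl_find/uu_avl_insert perform just below) is the standard libuutil idiom: a failed lookup also yields an insertion index, so the tree stays ordered without a second search. A compressed sketch, assuming the vendored libuutil (-luutil); the item type and key value are illustrative, and error checks plus teardown are elided:

    #include <stdio.h>
    #include <stdlib.h>
    #include <stddef.h>
    #include <libuutil.h>

    typedef struct item {
        int i_key;
        uu_avl_node_t i_node;
    } item_t;

    static int
    item_cmp(const void *l, const void *r, void *unused)
    {
        const item_t *a = l, *b = r;

        return (a->i_key < b->i_key ? -1 : a->i_key > b->i_key);
    }

    int
    main(void)
    {
        uu_avl_pool_t *pool = uu_avl_pool_create("items", sizeof (item_t),
            offsetof(item_t, i_node), item_cmp, UU_DEFAULT);
        uu_avl_t *avl = uu_avl_create(pool, NULL, UU_DEFAULT);
        item_t key = { .i_key = 42 };

        uu_avl_node_init(&key, &key.i_node, pool);
        for (int pass = 0; pass < 2; pass++) {
            uu_avl_index_t idx;

            if (uu_avl_find(avl, &key, NULL, &idx) == NULL) {
                item_t *it = malloc(sizeof (*it));

                it->i_key = key.i_key;
                uu_avl_node_init(it, &it->i_node, pool);
                uu_avl_insert(avl, it, idx);    /* reuse the index */
                (void) printf("inserted %d\n", it->i_key);
            } else {
                (void) printf("found %d, node reused\n", key.i_key);
            }
        }
        return (0);
    }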
+ */ + if ((n = uu_avl_find(avl, node, &sortinfo, &idx)) == NULL) { + uu_avl_insert(avl, node, idx); + } else { + nvlist_free(props); + free(node); + node = n; + props = node->usn_nvl; + } + + /* Calculate/update width of USED/QUOTA fields */ + if (cb->cb_nicenum) { + if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || + prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || + prop == ZFS_PROP_PROJECTUSED || + prop == ZFS_PROP_PROJECTQUOTA) { + zfs_nicebytes(space, sizebuf, sizeof (sizebuf)); + } else { + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + } + } else { + (void) snprintf(sizebuf, sizeof (sizebuf), "%llu", + (u_longlong_t)space); + } + sizelen = strlen(sizebuf); + if (prop == ZFS_PROP_USERUSED || prop == ZFS_PROP_GROUPUSED || + prop == ZFS_PROP_PROJECTUSED) { + propname = "used"; + if (!nvlist_exists(props, "quota")) + (void) nvlist_add_uint64(props, "quota", 0); + } else if (prop == ZFS_PROP_USERQUOTA || prop == ZFS_PROP_GROUPQUOTA || + prop == ZFS_PROP_PROJECTQUOTA) { + propname = "quota"; + if (!nvlist_exists(props, "used")) + (void) nvlist_add_uint64(props, "used", 0); + } else if (prop == ZFS_PROP_USEROBJUSED || + prop == ZFS_PROP_GROUPOBJUSED || prop == ZFS_PROP_PROJECTOBJUSED) { + propname = "objused"; + if (!nvlist_exists(props, "objquota")) + (void) nvlist_add_uint64(props, "objquota", 0); + } else if (prop == ZFS_PROP_USEROBJQUOTA || + prop == ZFS_PROP_GROUPOBJQUOTA || + prop == ZFS_PROP_PROJECTOBJQUOTA) { + propname = "objquota"; + if (!nvlist_exists(props, "objused")) + (void) nvlist_add_uint64(props, "objused", 0); + } else { + return (-1); + } + sizeidx = us_field_index(propname); + if (sizeidx >= 0 && sizelen > cb->cb_width[sizeidx]) + cb->cb_width[sizeidx] = sizelen; + + if (nvlist_add_uint64(props, propname, space) != 0) + nomem(); + + return (0); +} + +static void +print_us_node(boolean_t scripted, boolean_t parsable, int *fields, int types, + size_t *width, us_node_t *node) +{ + nvlist_t *nvl = node->usn_nvl; + char valstr[MAXNAMELEN]; + boolean_t first = B_TRUE; + int cfield = 0; + int field; + uint32_t ustype; + + /* Check type */ + (void) nvlist_lookup_uint32(nvl, "type", &ustype); + if (!(ustype & types)) + return; + + while ((field = fields[cfield]) != USFIELD_LAST) { + nvpair_t *nvp = NULL; + data_type_t type; + uint32_t val32; + uint64_t val64; + char *strval = "-"; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + if (strcmp(nvpair_name(nvp), + us_field_names[field]) == 0) + break; + } + + type = nvp == NULL ? 
DATA_TYPE_UNKNOWN : nvpair_type(nvp); + switch (type) { + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &val32); + break; + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &val64); + break; + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &strval); + break; + case DATA_TYPE_UNKNOWN: + break; + default: + (void) fprintf(stderr, "invalid data type\n"); + } + + switch (field) { + case USFIELD_TYPE: + if (type == DATA_TYPE_UINT32) + strval = (char *)us_type2str(val32); + break; + case USFIELD_NAME: + if (type == DATA_TYPE_UINT64) { + (void) sprintf(valstr, "%llu", + (u_longlong_t)val64); + strval = valstr; + } + break; + case USFIELD_USED: + case USFIELD_QUOTA: + if (type == DATA_TYPE_UINT64) { + if (parsable) { + (void) sprintf(valstr, "%llu", + (u_longlong_t)val64); + strval = valstr; + } else if (field == USFIELD_QUOTA && + val64 == 0) { + strval = "none"; + } else { + zfs_nicebytes(val64, valstr, + sizeof (valstr)); + strval = valstr; + } + } + break; + case USFIELD_OBJUSED: + case USFIELD_OBJQUOTA: + if (type == DATA_TYPE_UINT64) { + if (parsable) { + (void) sprintf(valstr, "%llu", + (u_longlong_t)val64); + strval = valstr; + } else if (field == USFIELD_OBJQUOTA && + val64 == 0) { + strval = "none"; + } else { + zfs_nicenum(val64, valstr, + sizeof (valstr)); + strval = valstr; + } + } + break; + } + + if (!first) { + if (scripted) + (void) printf("\t"); + else + (void) printf(" "); + } + if (scripted) + (void) printf("%s", strval); + else if (field == USFIELD_TYPE || field == USFIELD_NAME) + (void) printf("%-*s", (int)width[field], strval); + else + (void) printf("%*s", (int)width[field], strval); + + first = B_FALSE; + cfield++; + } + + (void) printf("\n"); +} + +static void +print_us(boolean_t scripted, boolean_t parsable, int *fields, int types, + size_t *width, boolean_t rmnode, uu_avl_t *avl) +{ + us_node_t *node; + const char *col; + int cfield = 0; + int field; + + if (!scripted) { + boolean_t first = B_TRUE; + + while ((field = fields[cfield]) != USFIELD_LAST) { + col = gettext(us_field_hdr[field]); + if (field == USFIELD_TYPE || field == USFIELD_NAME) { + (void) printf(first ? "%-*s" : " %-*s", + (int)width[field], col); + } else { + (void) printf(first ? 
"%*s" : " %*s", + (int)width[field], col); + } + first = B_FALSE; + cfield++; + } + (void) printf("\n"); + } + + for (node = uu_avl_first(avl); node; node = uu_avl_next(avl, node)) { + print_us_node(scripted, parsable, fields, types, width, node); + if (rmnode) + nvlist_free(node->usn_nvl); + } +} + +static int +zfs_do_userspace(int argc, char **argv) +{ + zfs_handle_t *zhp; + zfs_userquota_prop_t p; + uu_avl_pool_t *avl_pool; + uu_avl_t *avl_tree; + uu_avl_walk_t *walk; + char *delim; + char deffields[] = "type,name,used,quota,objused,objquota"; + char *ofield = NULL; + char *tfield = NULL; + int cfield = 0; + int fields[256]; + int i; + boolean_t scripted = B_FALSE; + boolean_t prtnum = B_FALSE; + boolean_t parsable = B_FALSE; + boolean_t sid2posix = B_FALSE; + int ret = 0; + int c; + zfs_sort_column_t *sortcol = NULL; + int types = USTYPE_PSX_USR | USTYPE_SMB_USR; + us_cbdata_t cb; + us_node_t *node; + us_node_t *rmnode; + uu_list_pool_t *listpool; + uu_list_t *list; + uu_avl_index_t idx = 0; + uu_list_index_t idx2 = 0; + + if (argc < 2) + usage(B_FALSE); + + if (strcmp(argv[0], "groupspace") == 0) { + /* Toggle default group types */ + types = USTYPE_PSX_GRP | USTYPE_SMB_GRP; + } else if (strcmp(argv[0], "projectspace") == 0) { + types = USTYPE_PROJ; + prtnum = B_TRUE; + } + + while ((c = getopt(argc, argv, "nHpo:s:S:t:i")) != -1) { + switch (c) { + case 'n': + if (types == USTYPE_PROJ) { + (void) fprintf(stderr, + gettext("invalid option 'n'\n")); + usage(B_FALSE); + } + prtnum = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 'o': + ofield = optarg; + break; + case 's': + case 'S': + if (zfs_add_sort_column(&sortcol, optarg, + c == 's' ? B_FALSE : B_TRUE) != 0) { + (void) fprintf(stderr, + gettext("invalid field '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 't': + if (types == USTYPE_PROJ) { + (void) fprintf(stderr, + gettext("invalid option 't'\n")); + usage(B_FALSE); + } + tfield = optarg; + break; + case 'i': + if (types == USTYPE_PROJ) { + (void) fprintf(stderr, + gettext("invalid option 'i'\n")); + usage(B_FALSE); + } + sid2posix = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing dataset name\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* Use default output fields if not specified using -o */ + if (ofield == NULL) + ofield = deffields; + do { + if ((delim = strchr(ofield, ',')) != NULL) + *delim = '\0'; + if ((fields[cfield++] = us_field_index(ofield)) == -1) { + (void) fprintf(stderr, gettext("invalid type '%s' " + "for -o option\n"), ofield); + return (-1); + } + if (delim != NULL) + ofield = delim + 1; + } while (delim != NULL); + fields[cfield] = USFIELD_LAST; + + /* Override output types (-t option) */ + if (tfield != NULL) { + types = 0; + + do { + boolean_t found = B_FALSE; + + if ((delim = strchr(tfield, ',')) != NULL) + *delim = '\0'; + for (i = 0; i < sizeof (us_type_bits) / sizeof (int); + i++) { + if (strcmp(tfield, us_type_names[i]) == 0) { + found = B_TRUE; + types |= us_type_bits[i]; + break; + } + } + if (!found) { + (void) fprintf(stderr, gettext("invalid type " + "'%s' for -t option\n"), tfield); + return (-1); 
+ } + if (delim != NULL) + tfield = delim + 1; + } while (delim != NULL); + } + + if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_SNAPSHOT)) == NULL) + return (1); + if (zhp->zfs_head_type != ZFS_TYPE_FILESYSTEM) { + (void) fprintf(stderr, gettext("operation is only applicable " + "to filesystems and their snapshots\n")); + zfs_close(zhp); + return (1); + } + + if ((avl_pool = uu_avl_pool_create("us_avl_pool", sizeof (us_node_t), + offsetof(us_node_t, usn_avlnode), us_compare, UU_DEFAULT)) == NULL) + nomem(); + if ((avl_tree = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) + nomem(); + + /* Always add default sorting columns */ + (void) zfs_add_sort_column(&sortcol, "type", B_FALSE); + (void) zfs_add_sort_column(&sortcol, "name", B_FALSE); + + cb.cb_sortcol = sortcol; + cb.cb_numname = prtnum; + cb.cb_nicenum = !parsable; + cb.cb_avl_pool = avl_pool; + cb.cb_avl = avl_tree; + cb.cb_sid2posix = sid2posix; + + for (i = 0; i < USFIELD_LAST; i++) + cb.cb_width[i] = strlen(gettext(us_field_hdr[i])); + + for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { + if ((zfs_prop_is_user(p) && + !(types & (USTYPE_PSX_USR | USTYPE_SMB_USR))) || + (zfs_prop_is_group(p) && + !(types & (USTYPE_PSX_GRP | USTYPE_SMB_GRP))) || + (zfs_prop_is_project(p) && types != USTYPE_PROJ)) + continue; + + cb.cb_prop = p; + if ((ret = zfs_userspace(zhp, p, userspace_cb, &cb)) != 0) { + zfs_close(zhp); + return (ret); + } + } + zfs_close(zhp); + + /* Sort the list */ + if ((node = uu_avl_first(avl_tree)) == NULL) + return (0); + + us_populated = B_TRUE; + + listpool = uu_list_pool_create("tmplist", sizeof (us_node_t), + offsetof(us_node_t, usn_listnode), NULL, UU_DEFAULT); + list = uu_list_create(listpool, NULL, UU_DEFAULT); + uu_list_node_init(node, &node->usn_listnode, listpool); + + while (node != NULL) { + rmnode = node; + node = uu_avl_next(avl_tree, node); + uu_avl_remove(avl_tree, rmnode); + if (uu_list_find(list, rmnode, NULL, &idx2) == NULL) + uu_list_insert(list, rmnode, idx2); + } + + for (node = uu_list_first(list); node != NULL; + node = uu_list_next(list, node)) { + us_sort_info_t sortinfo = { sortcol, cb.cb_numname }; + + if (uu_avl_find(avl_tree, node, &sortinfo, &idx) == NULL) + uu_avl_insert(avl_tree, node, idx); + } + + uu_list_destroy(list); + uu_list_pool_destroy(listpool); + + /* Print and free node nvlist memory */ + print_us(scripted, parsable, fields, types, cb.cb_width, B_TRUE, + cb.cb_avl); + + zfs_free_sort_columns(sortcol); + + /* Clean up the AVL tree */ + if ((walk = uu_avl_walk_start(cb.cb_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(cb.cb_avl, node); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(avl_tree); + uu_avl_pool_destroy(avl_pool); + + return (ret); +} + +/* + * list [-Hp][-r|-d max] [-o property[,...]] [-s property] ... [-S property] + * [-t type[,...]] [filesystem|volume|snapshot] ... + * + * -H Scripted mode; elide headers and separate columns by tabs + * -p Display values in parsable (literal) format. + * -r Recurse over all children + * -d Limit recursion by depth. + * -o Control which fields to display. + * -s Specify sort columns, descending order. + * -S Specify sort columns, ascending order. + * -t Control which object types to display. + * + * When given no arguments, list all filesystems in the system. + * Otherwise, list the specified datasets, optionally recursing down them if + * '-r' is specified. 
+ */ +typedef struct list_cbdata { + boolean_t cb_first; + boolean_t cb_literal; + boolean_t cb_scripted; + zprop_list_t *cb_proplist; +} list_cbdata_t; + +/* + * Given a list of columns to display, output appropriate headers for each one. + */ +static void +print_header(list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZFS_MAXPROPLEN]; + const char *header; + int i; + boolean_t first = B_TRUE; + boolean_t right_justify; + + for (; pl != NULL; pl = pl->pl_next) { + if (!first) { + (void) printf(" "); + } else { + first = B_FALSE; + } + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zfs_prop_column_name(pl->pl_prop); + right_justify = zfs_prop_align_right(pl->pl_prop); + } else { + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } + + if (pl->pl_next == NULL && !right_justify) + (void) printf("%s", header); + else if (right_justify) + (void) printf("%*s", (int)pl->pl_width, header); + else + (void) printf("%-*s", (int)pl->pl_width, header); + } + + (void) printf("\n"); +} + +/* + * Given a dataset and a list of fields, print out all the properties according + * to the described layout. + */ +static void +print_dataset(zfs_handle_t *zhp, list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + boolean_t first = B_TRUE; + char property[ZFS_MAXPROPLEN]; + nvlist_t *userprops = zfs_get_user_props(zhp); + nvlist_t *propval; + char *propstr; + boolean_t right_justify; + + for (; pl != NULL; pl = pl->pl_next) { + if (!first) { + if (cb->cb_scripted) + (void) printf("\t"); + else + (void) printf(" "); + } else { + first = B_FALSE; + } + + if (pl->pl_prop == ZFS_PROP_NAME) { + (void) strlcpy(property, zfs_get_name(zhp), + sizeof (property)); + propstr = property; + right_justify = zfs_prop_align_right(pl->pl_prop); + } else if (pl->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, pl->pl_prop, property, + sizeof (property), NULL, NULL, 0, + cb->cb_literal) != 0) + propstr = "-"; + else + propstr = property; + right_justify = zfs_prop_align_right(pl->pl_prop); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + property, sizeof (property), cb->cb_literal) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; + } else if (zfs_prop_written(pl->pl_user_prop)) { + if (zfs_prop_get_written(zhp, pl->pl_user_prop, + property, sizeof (property), cb->cb_literal) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; + } else { + if (nvlist_lookup_nvlist(userprops, + pl->pl_user_prop, &propval) != 0) + propstr = "-"; + else + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &propstr) == 0); + right_justify = B_FALSE; + } + + /* + * If this is being called in scripted mode, or if this is the + * last column and it is left-justified, don't include a width + * format specifier. + */ + if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) + (void) printf("%s", propstr); + else if (right_justify) + (void) printf("%*s", (int)pl->pl_width, propstr); + else + (void) printf("%-*s", (int)pl->pl_width, propstr); + } + + (void) printf("\n"); +} + +/* + * Generic callback function to list a dataset or snapshot. 
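print_header and print_dataset above size every column at run time with printf's '*' width specifier, left-justifying text columns and right-justifying numeric ones. The whole trick in isolation (the sample values are made up):

    #include <stdio.h>

    int
    main(void)
    {
        int width = 12;

        /* '-' left-justifies; '*' takes the field width as an argument. */
        (void) printf("%-*s %*s\n", width, "NAME", 8, "USED");
        (void) printf("%-*s %*s\n", width, "tank/home", 8, "1.2G");
        return (0);
    }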
*/ +static int +list_callback(zfs_handle_t *zhp, void *data) +{ + list_cbdata_t *cbp = data; + + if (cbp->cb_first) { + if (!cbp->cb_scripted) + print_header(cbp); + cbp->cb_first = B_FALSE; + } + + print_dataset(zhp, cbp); + + return (0); +} + +static int +zfs_do_list(int argc, char **argv) +{ + int c; + static char default_fields[] = + "name,used,available,referenced,mountpoint"; + int types = ZFS_TYPE_DATASET; + boolean_t types_specified = B_FALSE; + char *fields = NULL; + list_cbdata_t cb = { 0 }; + char *value; + int limit = 0; + int ret = 0; + zfs_sort_column_t *sortcol = NULL; + int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; + + /* check options */ + while ((c = getopt(argc, argv, "HS:d:o:prs:t:")) != -1) { + switch (c) { + case 'o': + fields = optarg; + break; + case 'p': + cb.cb_literal = B_TRUE; + flags |= ZFS_ITER_LITERAL_PROPS; + break; + case 'd': + limit = parse_depth(optarg, &flags); + break; + case 'r': + flags |= ZFS_ITER_RECURSE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 's': + if (zfs_add_sort_column(&sortcol, optarg, + B_FALSE) != 0) { + (void) fprintf(stderr, + gettext("invalid property '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 'S': + if (zfs_add_sort_column(&sortcol, optarg, + B_TRUE) != 0) { + (void) fprintf(stderr, + gettext("invalid property '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case 't': + types = 0; + types_specified = B_TRUE; + flags &= ~ZFS_ITER_PROP_LISTSNAPS; + while (*optarg != '\0') { + static char *type_subopts[] = { "filesystem", + "volume", "snapshot", "snap", "bookmark", + "all", NULL }; + + switch (getsubopt(&optarg, type_subopts, + &value)) { + case 0: + types |= ZFS_TYPE_FILESYSTEM; + break; + case 1: + types |= ZFS_TYPE_VOLUME; + break; + case 2: + case 3: + types |= ZFS_TYPE_SNAPSHOT; + break; + case 4: + types |= ZFS_TYPE_BOOKMARK; + break; + case 5: + types = ZFS_TYPE_DATASET | + ZFS_TYPE_BOOKMARK; + break; + default: + (void) fprintf(stderr, + gettext("invalid type '%s'\n"), + value); + usage(B_FALSE); + } + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (fields == NULL) + fields = default_fields; + + /* + * If we are only going to list snapshot names and sort by name, + * then we can use faster version. + */ + if (strcmp(fields, "name") == 0 && zfs_sort_only_by_name(sortcol)) + flags |= ZFS_ITER_SIMPLE; + + /* + * If "-o space" and no types were specified, don't display snapshots. + */ + if (strcmp(fields, "space") == 0 && types_specified == B_FALSE) + types &= ~ZFS_TYPE_SNAPSHOT; + + /* + * Handle users who want to list all snapshots or bookmarks + * of the current dataset (ex. 'zfs list -t snapshot <dataset>'). + */ + if ((types == ZFS_TYPE_SNAPSHOT || types == ZFS_TYPE_BOOKMARK) && + argc > 0 && (flags & ZFS_ITER_RECURSE) == 0 && limit == 0) { + flags |= (ZFS_ITER_DEPTH_LIMIT | ZFS_ITER_RECURSE); + limit = 1; + } + + /* + * If the user specifies '-o all', the zprop_get_list() doesn't + * normally include the name of the dataset. For 'zfs list', we always + * want this property to be first.
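The same name-first guarantee was arranged in zfs_do_get earlier by splicing a stack-allocated fake_name element onto the head of the property list and, at cleanup, freeing only the nodes after it. A minimal sketch of that sentinel pattern (prop_t here is illustrative, not the libzfs zprop_list_t):

    #include <stdio.h>
    #include <stdlib.h>

    typedef struct prop {
        const char *p_name;
        struct prop *p_next;
    } prop_t;

    int
    main(void)
    {
        prop_t *list = malloc(sizeof (prop_t));
        prop_t fake_name = { "name", NULL };    /* lives on the stack */

        if (list == NULL)
            return (1);
        list->p_name = "used";
        list->p_next = NULL;

        fake_name.p_next = list;    /* splice in front, no allocation */
        for (prop_t *p = &fake_name; p != NULL; p = p->p_next)
            (void) printf("%s\n", p->p_name);

        /* Free only the heap tail, as zfs_do_get frees fake_name.pl_next. */
        free(fake_name.p_next);
        return (0);
    }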
*/ + if (zprop_get_list(g_zfs, fields, &cb.cb_proplist, ZFS_TYPE_DATASET) + != 0) + usage(B_FALSE); + + cb.cb_first = B_TRUE; + + ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, + limit, list_callback, &cb); + + zprop_free_list(cb.cb_proplist); + zfs_free_sort_columns(sortcol); + + if (ret == 0 && cb.cb_first && !cb.cb_scripted) + (void) fprintf(stderr, gettext("no datasets available\n")); + + return (ret); +} + +/* + * zfs rename [-f] <fs|snap|vol> <fs|snap|vol> + * zfs rename [-f] -p <fs|vol> <fs|vol> + * zfs rename -r <snap> <snap> + * + * Renames the given dataset to another of the same type. + * + * The '-p' flag creates all the non-existing ancestors of the target first. + */ +/* ARGSUSED */ +static int +zfs_do_rename(int argc, char **argv) +{ + zfs_handle_t *zhp; + int c; + int ret = 0; + boolean_t recurse = B_FALSE; + boolean_t parents = B_FALSE; + boolean_t force_unmount = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "prf")) != -1) { + switch (c) { + case 'p': + parents = B_TRUE; + break; + case 'r': + recurse = B_TRUE; + break; + case 'f': + force_unmount = B_TRUE; + break; + case '?': + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing source dataset " + "argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing target dataset " + "argument\n")); + usage(B_FALSE); + } + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (recurse && parents) { + (void) fprintf(stderr, gettext("-p and -r options are mutually " + "exclusive\n")); + usage(B_FALSE); + } + + if (recurse && strchr(argv[0], '@') == 0) { + (void) fprintf(stderr, gettext("source dataset for recursive " + "rename must be a snapshot\n")); + usage(B_FALSE); + } + + if ((zhp = zfs_open(g_zfs, argv[0], parents ? ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME : ZFS_TYPE_DATASET)) == NULL) + return (1); + + /* If we were asked and the name looks good, try to create ancestors.
+     */
+    if (parents && zfs_name_valid(argv[1], zfs_get_type(zhp)) &&
+        zfs_create_ancestors(g_zfs, argv[1]) != 0) {
+        zfs_close(zhp);
+        return (1);
+    }
+
+    ret = (zfs_rename(zhp, argv[1], recurse, force_unmount) != 0);
+
+    zfs_close(zhp);
+    return (ret);
+}
+
+/*
+ * zfs promote <clone-filesystem>
+ *
+ * Promotes the given clone fs to be the parent
+ */
+/* ARGSUSED */
+static int
+zfs_do_promote(int argc, char **argv)
+{
+    zfs_handle_t *zhp;
+    int ret = 0;
+
+    /* check options */
+    if (argc > 1 && argv[1][0] == '-') {
+        (void) fprintf(stderr, gettext("invalid option '%c'\n"),
+            argv[1][1]);
+        usage(B_FALSE);
+    }
+
+    /* check number of arguments */
+    if (argc < 2) {
+        (void) fprintf(stderr, gettext("missing clone filesystem"
+            " argument\n"));
+        usage(B_FALSE);
+    }
+    if (argc > 2) {
+        (void) fprintf(stderr, gettext("too many arguments\n"));
+        usage(B_FALSE);
+    }
+
+    zhp = zfs_open(g_zfs, argv[1], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+    if (zhp == NULL)
+        return (1);
+
+    ret = (zfs_promote(zhp) != 0);
+
+    zfs_close(zhp);
+    return (ret);
+}
+
+static int
+zfs_do_redact(int argc, char **argv)
+{
+    char *snap = NULL;
+    char *bookname = NULL;
+    char **rsnaps = NULL;
+    int numrsnaps = 0;
+    argv++;
+    argc--;
+    if (argc < 3) {
+        (void) fprintf(stderr, gettext("too few arguments\n"));
+        usage(B_FALSE);
+    }
+
+    snap = argv[0];
+    bookname = argv[1];
+    rsnaps = argv + 2;
+    numrsnaps = argc - 2;
+
+    nvlist_t *rsnapnv = fnvlist_alloc();
+
+    for (int i = 0; i < numrsnaps; i++) {
+        fnvlist_add_boolean(rsnapnv, rsnaps[i]);
+    }
+
+    int err = lzc_redact(snap, bookname, rsnapnv);
+    fnvlist_free(rsnapnv);
+
+    switch (err) {
+    case 0:
+        break;
+    case ENOENT:
+        (void) fprintf(stderr,
+            gettext("provided snapshot %s does not exist\n"), snap);
+        break;
+    case EEXIST:
+        (void) fprintf(stderr, gettext("specified redaction bookmark "
+            "(%s) already exists\n"), bookname);
+        break;
+    case ENAMETOOLONG:
+        (void) fprintf(stderr, gettext("provided bookmark name cannot "
+            "be used, final name would be too long\n"));
+        break;
+    case E2BIG:
+        (void) fprintf(stderr, gettext("too many redaction snapshots "
+            "specified\n"));
+        break;
+    case EINVAL:
+        if (strchr(bookname, '#') != NULL)
+            (void) fprintf(stderr, gettext(
+                "redaction bookmark name must not contain '#'\n"));
+        else
+            (void) fprintf(stderr, gettext(
+                "redaction snapshot must be descendent of "
+                "snapshot being redacted\n"));
+        break;
+    case EALREADY:
+        (void) fprintf(stderr, gettext("attempted to redact redacted "
+            "dataset or with respect to redacted dataset\n"));
+        break;
+    case ENOTSUP:
+        (void) fprintf(stderr, gettext("redaction bookmarks feature "
+            "not enabled\n"));
+        break;
+    case EXDEV:
+        (void) fprintf(stderr, gettext("potentially invalid redaction "
+            "snapshot; full dataset names required\n"));
+        break;
+    default:
+        (void) fprintf(stderr, gettext("internal error: %s\n"),
+            strerror(err));
+    }
+
+    return (err);
+}
+
+/*
+ * zfs rollback [-rRf] <snapshot>
+ *
+ *    -r    Delete any intervening snapshots before doing rollback
+ *    -R    Delete any snapshots and their clones
+ *    -f    Ignored for backwards compatibility
+ *
+ * Given a filesystem, rollback to a specific snapshot, discarding any changes
+ * since then and making it the active dataset. If more recent snapshots exist,
+ * the command will complain unless the '-r' flag is given.
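+ *
+ * E.g., 'zfs rollback -r tank/home@monday' destroys any snapshots and
+ * bookmarks newer than 'monday' and makes it the active state of tank/home
+ * (names illustrative).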
+ */ +typedef struct rollback_cbdata { + uint64_t cb_create; + uint8_t cb_younger_ds_printed; + boolean_t cb_first; + int cb_doclones; + char *cb_target; + int cb_error; + boolean_t cb_recurse; +} rollback_cbdata_t; + +static int +rollback_check_dependent(zfs_handle_t *zhp, void *data) +{ + rollback_cbdata_t *cbp = data; + + if (cbp->cb_first && cbp->cb_recurse) { + (void) fprintf(stderr, gettext("cannot rollback to " + "'%s': clones of previous snapshots exist\n"), + cbp->cb_target); + (void) fprintf(stderr, gettext("use '-R' to " + "force deletion of the following clones and " + "dependents:\n")); + cbp->cb_first = 0; + cbp->cb_error = 1; + } + + (void) fprintf(stderr, "%s\n", zfs_get_name(zhp)); + + zfs_close(zhp); + return (0); +} + + +/* + * Report some snapshots/bookmarks more recent than the one specified. + * Used when '-r' is not specified. We reuse this same callback for the + * snapshot dependents - if 'cb_dependent' is set, then this is a + * dependent and we should report it without checking the transaction group. + */ +static int +rollback_check(zfs_handle_t *zhp, void *data) +{ + rollback_cbdata_t *cbp = data; + /* + * Max number of younger snapshots and/or bookmarks to display before + * we stop the iteration. + */ + const uint8_t max_younger = 32; + + if (cbp->cb_doclones) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { + if (cbp->cb_first && !cbp->cb_recurse) { + (void) fprintf(stderr, gettext("cannot " + "rollback to '%s': more recent snapshots " + "or bookmarks exist\n"), + cbp->cb_target); + (void) fprintf(stderr, gettext("use '-r' to " + "force deletion of the following " + "snapshots and bookmarks:\n")); + cbp->cb_first = 0; + cbp->cb_error = 1; + } + + if (cbp->cb_recurse) { + if (zfs_iter_dependents(zhp, B_TRUE, + rollback_check_dependent, cbp) != 0) { + zfs_close(zhp); + return (-1); + } + } else { + (void) fprintf(stderr, "%s\n", + zfs_get_name(zhp)); + cbp->cb_younger_ds_printed++; + } + } + zfs_close(zhp); + + if (cbp->cb_younger_ds_printed == max_younger) { + /* + * This non-recursive rollback is going to fail due to the + * presence of snapshots and/or bookmarks that are younger than + * the rollback target. 
+ * We printed some of the offending objects, now we stop + * zfs_iter_snapshot/bookmark iteration so we can fail fast and + * avoid iterating over the rest of the younger objects + */ + (void) fprintf(stderr, gettext("Output limited to %d " + "snapshots/bookmarks\n"), max_younger); + return (-1); + } + return (0); +} + +static int +zfs_do_rollback(int argc, char **argv) +{ + int ret = 0; + int c; + boolean_t force = B_FALSE; + rollback_cbdata_t cb = { 0 }; + zfs_handle_t *zhp, *snap; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + char *delim; + uint64_t min_txg = 0; + + /* check options */ + while ((c = getopt(argc, argv, "rRf")) != -1) { + switch (c) { + case 'r': + cb.cb_recurse = 1; + break; + case 'R': + cb.cb_recurse = 1; + cb.cb_doclones = 1; + break; + case 'f': + force = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing dataset argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* open the snapshot */ + if ((snap = zfs_open(g_zfs, argv[0], ZFS_TYPE_SNAPSHOT)) == NULL) + return (1); + + /* open the parent dataset */ + (void) strlcpy(parentname, argv[0], sizeof (parentname)); + verify((delim = strrchr(parentname, '@')) != NULL); + *delim = '\0'; + if ((zhp = zfs_open(g_zfs, parentname, ZFS_TYPE_DATASET)) == NULL) { + zfs_close(snap); + return (1); + } + + /* + * Check for more recent snapshots and/or clones based on the presence + * of '-r' and '-R'. + */ + cb.cb_target = argv[0]; + cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); + cb.cb_first = B_TRUE; + cb.cb_error = 0; + + if (cb.cb_create > 0) + min_txg = cb.cb_create; + + if ((ret = zfs_iter_snapshots(zhp, B_FALSE, rollback_check, &cb, + min_txg, 0)) != 0) + goto out; + if ((ret = zfs_iter_bookmarks(zhp, rollback_check, &cb)) != 0) + goto out; + + if ((ret = cb.cb_error) != 0) + goto out; + + /* + * Rollback parent to the given snapshot. + */ + ret = zfs_rollback(zhp, snap, force); + +out: + zfs_close(snap); + zfs_close(zhp); + + if (ret == 0) + return (0); + else + return (1); +} + +/* + * zfs set property=value ... { fs | snap | vol } ... + * + * Sets the given properties for all datasets specified on the command line. 
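+ *
+ * E.g., 'zfs set compression=on atime=off tank/home tank/www' applies both
+ * properties to both datasets (names illustrative).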
+ */ + +static int +set_callback(zfs_handle_t *zhp, void *data) +{ + nvlist_t *props = data; + + if (zfs_prop_set_list(zhp, props) != 0) { + switch (libzfs_errno(g_zfs)) { + case EZFS_MOUNTFAILED: + (void) fprintf(stderr, gettext("property may be set " + "but unable to remount filesystem\n")); + break; + case EZFS_SHARENFSFAILED: + (void) fprintf(stderr, gettext("property may be set " + "but unable to reshare filesystem\n")); + break; + } + return (1); + } + return (0); +} + +static int +zfs_do_set(int argc, char **argv) +{ + nvlist_t *props = NULL; + int ds_start = -1; /* argv idx of first dataset arg */ + int ret = 0; + int i; + + /* check for options */ + if (argc > 1 && argv[1][0] == '-') { + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + argv[1][1]); + usage(B_FALSE); + } + + /* check number of arguments */ + if (argc < 2) { + (void) fprintf(stderr, gettext("missing arguments\n")); + usage(B_FALSE); + } + if (argc < 3) { + if (strchr(argv[1], '=') == NULL) { + (void) fprintf(stderr, gettext("missing property=value " + "argument(s)\n")); + } else { + (void) fprintf(stderr, gettext("missing dataset " + "name(s)\n")); + } + usage(B_FALSE); + } + + /* validate argument order: prop=val args followed by dataset args */ + for (i = 1; i < argc; i++) { + if (strchr(argv[i], '=') != NULL) { + if (ds_start > 0) { + /* out-of-order prop=val argument */ + (void) fprintf(stderr, gettext("invalid " + "argument order\n")); + usage(B_FALSE); + } + } else if (ds_start < 0) { + ds_start = i; + } + } + if (ds_start < 0) { + (void) fprintf(stderr, gettext("missing dataset name(s)\n")); + usage(B_FALSE); + } + + /* Populate a list of property settings */ + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + for (i = 1; i < ds_start; i++) { + if (!parseprop(props, argv[i])) { + ret = -1; + goto error; + } + } + + ret = zfs_for_each(argc - ds_start, argv + ds_start, 0, + ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, props); + +error: + nvlist_free(props); + return (ret); +} + +typedef struct snap_cbdata { + nvlist_t *sd_nvl; + boolean_t sd_recursive; + const char *sd_snapname; +} snap_cbdata_t; + +static int +zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) +{ + snap_cbdata_t *sd = arg; + char *name; + int rv = 0; + int error; + + if (sd->sd_recursive && + zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) != 0) { + zfs_close(zhp); + return (0); + } + + error = asprintf(&name, "%s@%s", zfs_get_name(zhp), sd->sd_snapname); + if (error == -1) + nomem(); + fnvlist_add_boolean(sd->sd_nvl, name); + free(name); + + if (sd->sd_recursive) + rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); + zfs_close(zhp); + return (rv); +} + +/* + * zfs snapshot [-r] [-o prop=value] ... + * + * Creates a snapshot with the given name. While functionally equivalent to + * 'zfs create', it is a separate command to differentiate intent. 
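+ *
+ * E.g., 'zfs snapshot -r tank/home@friday' atomically creates a 'friday'
+ * snapshot of tank/home and of every descendant dataset (names
+ * illustrative).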
+ */ +static int +zfs_do_snapshot(int argc, char **argv) +{ + int ret = 0; + int c; + nvlist_t *props; + snap_cbdata_t sd = { 0 }; + boolean_t multiple_snaps = B_FALSE; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, "ro:")) != -1) { + switch (c) { + case 'o': + if (!parseprop(props, optarg)) { + nvlist_free(sd.sd_nvl); + nvlist_free(props); + return (1); + } + break; + case 'r': + sd.sd_recursive = B_TRUE; + multiple_snaps = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + goto usage; + } + + if (argc > 1) + multiple_snaps = B_TRUE; + for (; argc > 0; argc--, argv++) { + char *atp; + zfs_handle_t *zhp; + + atp = strchr(argv[0], '@'); + if (atp == NULL) + goto usage; + *atp = '\0'; + sd.sd_snapname = atp + 1; + zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + goto usage; + if (zfs_snapshot_cb(zhp, &sd) != 0) + goto usage; + } + + ret = zfs_snapshot_nvl(g_zfs, sd.sd_nvl, props); + nvlist_free(sd.sd_nvl); + nvlist_free(props); + if (ret != 0 && multiple_snaps) + (void) fprintf(stderr, gettext("no snapshots were created\n")); + return (ret != 0); + +usage: + nvlist_free(sd.sd_nvl); + nvlist_free(props); + usage(B_FALSE); + return (-1); +} + + +/* + * Send a backup stream to stdout. + */ +static int +zfs_do_send(int argc, char **argv) +{ + char *fromname = NULL; + char *toname = NULL; + char *resume_token = NULL; + char *cp; + zfs_handle_t *zhp; + sendflags_t flags = { 0 }; + int c, err; + nvlist_t *dbgnv = NULL; + char *redactbook = NULL; + + struct option long_options[] = { + {"replicate", no_argument, NULL, 'R'}, + {"redact", required_argument, NULL, 'd'}, + {"props", no_argument, NULL, 'p'}, + {"parsable", no_argument, NULL, 'P'}, + {"dedup", no_argument, NULL, 'D'}, + {"verbose", no_argument, NULL, 'v'}, + {"dryrun", no_argument, NULL, 'n'}, + {"large-block", no_argument, NULL, 'L'}, + {"embed", no_argument, NULL, 'e'}, + {"resume", required_argument, NULL, 't'}, + {"compressed", no_argument, NULL, 'c'}, + {"raw", no_argument, NULL, 'w'}, + {"backup", no_argument, NULL, 'b'}, + {"holds", no_argument, NULL, 'h'}, + {"saved", no_argument, NULL, 'S'}, + {0, 0, 0, 0} + }; + + /* check options */ + while ((c = getopt_long(argc, argv, ":i:I:RDpvnPLeht:cwbd:S", + long_options, NULL)) != -1) { + switch (c) { + case 'i': + if (fromname) + usage(B_FALSE); + fromname = optarg; + break; + case 'I': + if (fromname) + usage(B_FALSE); + fromname = optarg; + flags.doall = B_TRUE; + break; + case 'R': + flags.replicate = B_TRUE; + break; + case 'd': + redactbook = optarg; + break; + case 'p': + flags.props = B_TRUE; + break; + case 'b': + flags.backup = B_TRUE; + break; + case 'h': + flags.holds = B_TRUE; + break; + case 'P': + flags.parsable = B_TRUE; + break; + case 'v': + flags.verbosity++; + flags.progress = B_TRUE; + break; + case 'D': + (void) fprintf(stderr, + gettext("WARNING: deduplicated send is no " + "longer supported. 
A regular,\n" + "non-deduplicated stream will be generated.\n\n")); + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'L': + flags.largeblock = B_TRUE; + break; + case 'e': + flags.embed_data = B_TRUE; + break; + case 't': + resume_token = optarg; + break; + case 'c': + flags.compress = B_TRUE; + break; + case 'w': + flags.raw = B_TRUE; + flags.compress = B_TRUE; + flags.embed_data = B_TRUE; + flags.largeblock = B_TRUE; + break; + case 'S': + flags.saved = B_TRUE; + break; + case ':': + /* + * If a parameter was not passed, optopt contains the + * value that would normally lead us into the + * appropriate case statement. If it's > 256, then this + * must be a longopt and we should look at argv to get + * the string. Otherwise it's just the character, so we + * should use it directly. + */ + if (optopt <= UINT8_MAX) { + (void) fprintf(stderr, + gettext("missing argument for '%c' " + "option\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("missing argument for '%s' " + "option\n"), argv[optind - 1]); + } + usage(B_FALSE); + break; + case '?': + /*FALLTHROUGH*/ + default: + /* + * If an invalid flag was passed, optopt contains the + * character if it was a short flag, or 0 if it was a + * longopt. + */ + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + + } + usage(B_FALSE); + } + } + + if (flags.parsable && flags.verbosity == 0) + flags.verbosity = 1; + + argc -= optind; + argv += optind; + + if (resume_token != NULL) { + if (fromname != NULL || flags.replicate || flags.props || + flags.backup || flags.holds || + flags.saved || redactbook != NULL) { + (void) fprintf(stderr, + gettext("invalid flags combined with -t\n")); + usage(B_FALSE); + } + if (argc > 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } else { + if (argc < 1) { + (void) fprintf(stderr, + gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } + + if (flags.saved) { + if (fromname != NULL || flags.replicate || flags.props || + flags.doall || flags.backup || + flags.holds || flags.largeblock || flags.embed_data || + flags.compress || flags.raw || redactbook != NULL) { + (void) fprintf(stderr, gettext("incompatible flags " + "combined with saved send flag\n")); + usage(B_FALSE); + } + if (strchr(argv[0], '@') != NULL) { + (void) fprintf(stderr, gettext("saved send must " + "specify the dataset with partially-received " + "state\n")); + usage(B_FALSE); + } + } + + if (flags.raw && redactbook != NULL) { + (void) fprintf(stderr, + gettext("Error: raw sends may not be redacted.\n")); + return (1); + } + + if (!flags.dryrun && isatty(STDOUT_FILENO)) { + (void) fprintf(stderr, + gettext("Error: Stream can not be written to a terminal.\n" + "You must redirect standard output.\n")); + return (1); + } + + if (flags.saved) { + zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); + if (zhp == NULL) + return (1); + + err = zfs_send_saved(zhp, &flags, STDOUT_FILENO, + resume_token); + zfs_close(zhp); + return (err != 0); + } else if (resume_token != NULL) { + return (zfs_send_resume(g_zfs, &flags, STDOUT_FILENO, + resume_token)); + } + + /* + * For everything except -R and -I, use the new, cleaner code path. 
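+ * (E.g., a plain 'zfs send tank/fs@b' or an incremental
+ * 'zfs send -i @a tank/fs@b' goes through zfs_send_one() below;
+ * names illustrative.)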
+ */ + if (!(flags.replicate || flags.doall)) { + char frombuf[ZFS_MAX_DATASET_NAME_LEN]; + + if (fromname != NULL && (strchr(fromname, '#') == NULL && + strchr(fromname, '@') == NULL)) { + /* + * Neither bookmark or snapshot was specified. Print a + * warning, and assume snapshot. + */ + (void) fprintf(stderr, "Warning: incremental source " + "didn't specify type, assuming snapshot. Use '@' " + "or '#' prefix to avoid ambiguity.\n"); + (void) snprintf(frombuf, sizeof (frombuf), "@%s", + fromname); + fromname = frombuf; + } + if (fromname != NULL && + (fromname[0] == '#' || fromname[0] == '@')) { + /* + * Incremental source name begins with # or @. + * Default to same fs as target. + */ + char tmpbuf[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(tmpbuf, fromname, sizeof (tmpbuf)); + (void) strlcpy(frombuf, argv[0], sizeof (frombuf)); + cp = strchr(frombuf, '@'); + if (cp != NULL) + *cp = '\0'; + (void) strlcat(frombuf, tmpbuf, sizeof (frombuf)); + fromname = frombuf; + } + + zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET); + if (zhp == NULL) + return (1); + err = zfs_send_one(zhp, fromname, STDOUT_FILENO, &flags, + redactbook); + zfs_close(zhp); + return (err != 0); + } + + if (fromname != NULL && strchr(fromname, '#')) { + (void) fprintf(stderr, + gettext("Error: multiple snapshots cannot be " + "sent from a bookmark.\n")); + return (1); + } + + if (redactbook != NULL) { + (void) fprintf(stderr, gettext("Error: multiple snapshots " + "cannot be sent redacted.\n")); + return (1); + } + + if ((cp = strchr(argv[0], '@')) == NULL) { + (void) fprintf(stderr, gettext("Error: " + "Unsupported flag with filesystem or bookmark.\n")); + return (1); + } + *cp = '\0'; + toname = cp + 1; + zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (1); + + /* + * If they specified the full path to the snapshot, chop off + * everything except the short name of the snapshot, but special + * case if they specify the origin. + */ + if (fromname && (cp = strchr(fromname, '@')) != NULL) { + char origin[ZFS_MAX_DATASET_NAME_LEN]; + zprop_source_t src; + + (void) zfs_prop_get(zhp, ZFS_PROP_ORIGIN, + origin, sizeof (origin), &src, NULL, 0, B_FALSE); + + if (strcmp(origin, fromname) == 0) { + fromname = NULL; + flags.fromorigin = B_TRUE; + } else { + *cp = '\0'; + if (cp != fromname && strcmp(argv[0], fromname)) { + (void) fprintf(stderr, + gettext("incremental source must be " + "in same filesystem\n")); + usage(B_FALSE); + } + fromname = cp + 1; + if (strchr(fromname, '@') || strchr(fromname, '/')) { + (void) fprintf(stderr, + gettext("invalid incremental source\n")); + usage(B_FALSE); + } + } + } + + if (flags.replicate && fromname == NULL) + flags.doall = B_TRUE; + + err = zfs_send(zhp, fromname, toname, &flags, STDOUT_FILENO, NULL, 0, + flags.verbosity >= 3 ? &dbgnv : NULL); + + if (flags.verbosity >= 3 && dbgnv != NULL) { + /* + * dump_nvlist prints to stdout, but that's been + * redirected to a file. Make it print to stderr + * instead. + */ + (void) dup2(STDERR_FILENO, STDOUT_FILENO); + dump_nvlist(dbgnv, 0); + nvlist_free(dbgnv); + } + zfs_close(zhp); + + return (err != 0); +} + +/* + * Restore a backup stream from stdin. 
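+ *
+ * E.g., 'zfs send tank/fs@a | zfs receive -u pool/backup/fs' recreates
+ * the snapshot on the receiving side without mounting it (names
+ * illustrative).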
+ */ +static int +zfs_do_receive(int argc, char **argv) +{ + int c, err = 0; + recvflags_t flags = { 0 }; + boolean_t abort_resumable = B_FALSE; + nvlist_t *props; + + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + /* check options */ + while ((c = getopt(argc, argv, ":o:x:dehMnuvFsA")) != -1) { + switch (c) { + case 'o': + if (!parseprop(props, optarg)) { + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'x': + if (!parsepropname(props, optarg)) { + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'd': + if (flags.istail) { + (void) fprintf(stderr, gettext("invalid option " + "combination: -d and -e are mutually " + "exclusive\n")); + usage(B_FALSE); + } + flags.isprefix = B_TRUE; + break; + case 'e': + if (flags.isprefix) { + (void) fprintf(stderr, gettext("invalid option " + "combination: -d and -e are mutually " + "exclusive\n")); + usage(B_FALSE); + } + flags.istail = B_TRUE; + break; + case 'h': + flags.skipholds = B_TRUE; + break; + case 'M': + flags.forceunmount = B_TRUE; + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'u': + flags.nomount = B_TRUE; + break; + case 'v': + flags.verbose = B_TRUE; + break; + case 's': + flags.resumable = B_TRUE; + break; + case 'F': + flags.force = B_TRUE; + break; + case 'A': + abort_resumable = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* zfs recv -e (use "tail" name) implies -d (remove dataset "head") */ + if (flags.istail) + flags.isprefix = B_TRUE; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing snapshot argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (abort_resumable) { + if (flags.isprefix || flags.istail || flags.dryrun || + flags.resumable || flags.nomount) { + (void) fprintf(stderr, gettext("invalid option\n")); + usage(B_FALSE); + } + + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(namebuf, sizeof (namebuf), + "%s/%%recv", argv[0]); + + if (zfs_dataset_exists(g_zfs, namebuf, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) { + zfs_handle_t *zhp = zfs_open(g_zfs, + namebuf, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + nvlist_free(props); + return (1); + } + err = zfs_destroy(zhp, B_FALSE); + zfs_close(zhp); + } else { + zfs_handle_t *zhp = zfs_open(g_zfs, + argv[0], ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + usage(B_FALSE); + if (!zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) || + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == -1) { + (void) fprintf(stderr, + gettext("'%s' does not have any " + "resumable receive state to abort\n"), + argv[0]); + nvlist_free(props); + zfs_close(zhp); + return (1); + } + err = zfs_destroy(zhp, B_FALSE); + zfs_close(zhp); + } + nvlist_free(props); + return (err != 0); + } + + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + gettext("Error: Backup stream can not be read " + "from a terminal.\n" + "You must redirect standard input.\n")); + nvlist_free(props); + return (1); + } + err = zfs_receive(g_zfs, argv[0], props, &flags, STDIN_FILENO, NULL); + nvlist_free(props); + + return (err != 0); +} + +/* + * allow/unallow stuff + */ +/* copied from zfs/sys/dsl_deleg.h */ +#define 
ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" /* ??? */ +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" +#define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" +#define ZFS_DELEG_PERM_USEROBJUSED "userobjused" +#define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" + +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" +#define ZFS_DELEG_PERM_BOOKMARK "bookmark" +#define ZFS_DELEG_PERM_LOAD_KEY "load-key" +#define ZFS_DELEG_PERM_CHANGE_KEY "change-key" + +#define ZFS_DELEG_PERM_PROJECTUSED "projectused" +#define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" +#define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" +#define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" + +#define ZFS_NUM_DELEG_NOTES ZFS_DELEG_NOTE_NONE + +static zfs_deleg_perm_tab_t zfs_deleg_perm_tbl[] = { + { ZFS_DELEG_PERM_ALLOW, ZFS_DELEG_NOTE_ALLOW }, + { ZFS_DELEG_PERM_CLONE, ZFS_DELEG_NOTE_CLONE }, + { ZFS_DELEG_PERM_CREATE, ZFS_DELEG_NOTE_CREATE }, + { ZFS_DELEG_PERM_DESTROY, ZFS_DELEG_NOTE_DESTROY }, + { ZFS_DELEG_PERM_DIFF, ZFS_DELEG_NOTE_DIFF}, + { ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + { ZFS_DELEG_PERM_MOUNT, ZFS_DELEG_NOTE_MOUNT }, + { ZFS_DELEG_PERM_PROMOTE, ZFS_DELEG_NOTE_PROMOTE }, + { ZFS_DELEG_PERM_RECEIVE, ZFS_DELEG_NOTE_RECEIVE }, + { ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, + { ZFS_DELEG_PERM_RENAME, ZFS_DELEG_NOTE_RENAME }, + { ZFS_DELEG_PERM_ROLLBACK, ZFS_DELEG_NOTE_ROLLBACK }, + { ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_SEND }, + { ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, + { ZFS_DELEG_PERM_SNAPSHOT, ZFS_DELEG_NOTE_SNAPSHOT }, + { ZFS_DELEG_PERM_BOOKMARK, ZFS_DELEG_NOTE_BOOKMARK }, + { ZFS_DELEG_PERM_LOAD_KEY, ZFS_DELEG_NOTE_LOAD_KEY }, + { ZFS_DELEG_PERM_CHANGE_KEY, ZFS_DELEG_NOTE_CHANGE_KEY }, + + { ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + { ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + { ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + { ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + { ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + { ZFS_DELEG_PERM_USEROBJQUOTA, ZFS_DELEG_NOTE_USEROBJQUOTA }, + { ZFS_DELEG_PERM_USEROBJUSED, ZFS_DELEG_NOTE_USEROBJUSED }, + { ZFS_DELEG_PERM_GROUPOBJQUOTA, ZFS_DELEG_NOTE_GROUPOBJQUOTA }, + { ZFS_DELEG_PERM_GROUPOBJUSED, ZFS_DELEG_NOTE_GROUPOBJUSED }, + { ZFS_DELEG_PERM_PROJECTUSED, ZFS_DELEG_NOTE_PROJECTUSED }, + { ZFS_DELEG_PERM_PROJECTQUOTA, ZFS_DELEG_NOTE_PROJECTQUOTA }, + { ZFS_DELEG_PERM_PROJECTOBJUSED, ZFS_DELEG_NOTE_PROJECTOBJUSED }, + { ZFS_DELEG_PERM_PROJECTOBJQUOTA, ZFS_DELEG_NOTE_PROJECTOBJQUOTA }, + { NULL, ZFS_DELEG_NOTE_NONE } +}; + +/* permission structure */ +typedef struct deleg_perm { + zfs_deleg_who_type_t dp_who_type; + const char *dp_name; + boolean_t dp_local; + boolean_t dp_descend; +} deleg_perm_t; + +/* */ +typedef struct deleg_perm_node { + 
deleg_perm_t dpn_perm; + + uu_avl_node_t dpn_avl_node; +} deleg_perm_node_t; + +typedef struct fs_perm fs_perm_t; + +/* permissions set */ +typedef struct who_perm { + zfs_deleg_who_type_t who_type; + const char *who_name; /* id */ + char who_ug_name[256]; /* user/group name */ + fs_perm_t *who_fsperm; /* uplink */ + + uu_avl_t *who_deleg_perm_avl; /* permissions */ +} who_perm_t; + +/* */ +typedef struct who_perm_node { + who_perm_t who_perm; + uu_avl_node_t who_avl_node; +} who_perm_node_t; + +typedef struct fs_perm_set fs_perm_set_t; +/* fs permissions */ +struct fs_perm { + const char *fsp_name; + + uu_avl_t *fsp_sc_avl; /* sets,create */ + uu_avl_t *fsp_uge_avl; /* user,group,everyone */ + + fs_perm_set_t *fsp_set; /* uplink */ +}; + +/* */ +typedef struct fs_perm_node { + fs_perm_t fspn_fsperm; + uu_avl_t *fspn_avl; + + uu_list_node_t fspn_list_node; +} fs_perm_node_t; + +/* top level structure */ +struct fs_perm_set { + uu_list_pool_t *fsps_list_pool; + uu_list_t *fsps_list; /* list of fs_perms */ + + uu_avl_pool_t *fsps_named_set_avl_pool; + uu_avl_pool_t *fsps_who_perm_avl_pool; + uu_avl_pool_t *fsps_deleg_perm_avl_pool; +}; + +static inline const char * +deleg_perm_type(zfs_deleg_note_t note) +{ + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + case ZFS_DELEG_NOTE_GROUPUSED: + case ZFS_DELEG_NOTE_USERPROP: + case ZFS_DELEG_NOTE_USERQUOTA: + case ZFS_DELEG_NOTE_USERUSED: + case ZFS_DELEG_NOTE_USEROBJQUOTA: + case ZFS_DELEG_NOTE_USEROBJUSED: + case ZFS_DELEG_NOTE_GROUPOBJQUOTA: + case ZFS_DELEG_NOTE_GROUPOBJUSED: + case ZFS_DELEG_NOTE_PROJECTUSED: + case ZFS_DELEG_NOTE_PROJECTQUOTA: + case ZFS_DELEG_NOTE_PROJECTOBJUSED: + case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: + /* other */ + return (gettext("other")); + default: + return (gettext("subcommand")); + } +} + +static int +who_type2weight(zfs_deleg_who_type_t who_type) +{ + int res; + switch (who_type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + res = 0; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + res = 1; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + res = 2; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + res = 3; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + res = 4; + break; + default: + res = -1; + } + + return (res); +} + +/* ARGSUSED */ +static int +who_perm_compare(const void *larg, const void *rarg, void *unused) +{ + const who_perm_node_t *l = larg; + const who_perm_node_t *r = rarg; + zfs_deleg_who_type_t ltype = l->who_perm.who_type; + zfs_deleg_who_type_t rtype = r->who_perm.who_type; + int lweight = who_type2weight(ltype); + int rweight = who_type2weight(rtype); + int res = lweight - rweight; + if (res == 0) + res = strncmp(l->who_perm.who_name, r->who_perm.who_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + if (res > 0) + return (1); + else + return (-1); +} + +/* ARGSUSED */ +static int +deleg_perm_compare(const void *larg, const void *rarg, void *unused) +{ + const deleg_perm_node_t *l = larg; + const deleg_perm_node_t *r = rarg; + int res = strncmp(l->dpn_perm.dp_name, r->dpn_perm.dp_name, + ZFS_MAX_DELEG_NAME-1); + + if (res == 0) + return (0); + + if (res > 0) + return (1); + else + return (-1); +} + +static inline void +fs_perm_set_init(fs_perm_set_t *fspset) +{ + bzero(fspset, sizeof (fs_perm_set_t)); + + if ((fspset->fsps_list_pool = uu_list_pool_create("fsps_list_pool", + sizeof (fs_perm_node_t), offsetof(fs_perm_node_t, fspn_list_node), + 
NULL, UU_DEFAULT)) == NULL) + nomem(); + if ((fspset->fsps_list = uu_list_create(fspset->fsps_list_pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_named_set_avl_pool = uu_avl_pool_create( + "named_set_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_who_perm_avl_pool = uu_avl_pool_create( + "who_perm_avl_pool", sizeof (who_perm_node_t), offsetof( + who_perm_node_t, who_avl_node), who_perm_compare, + UU_DEFAULT)) == NULL) + nomem(); + + if ((fspset->fsps_deleg_perm_avl_pool = uu_avl_pool_create( + "deleg_perm_avl_pool", sizeof (deleg_perm_node_t), offsetof( + deleg_perm_node_t, dpn_avl_node), deleg_perm_compare, UU_DEFAULT)) + == NULL) + nomem(); +} + +static inline void fs_perm_fini(fs_perm_t *); +static inline void who_perm_fini(who_perm_t *); + +static inline void +fs_perm_set_fini(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = uu_list_first(fspset->fsps_list); + + while (node != NULL) { + fs_perm_node_t *next_node = + uu_list_next(fspset->fsps_list, node); + fs_perm_t *fsperm = &node->fspn_fsperm; + fs_perm_fini(fsperm); + uu_list_remove(fspset->fsps_list, node); + free(node); + node = next_node; + } + + uu_avl_pool_destroy(fspset->fsps_named_set_avl_pool); + uu_avl_pool_destroy(fspset->fsps_who_perm_avl_pool); + uu_avl_pool_destroy(fspset->fsps_deleg_perm_avl_pool); +} + +static inline void +deleg_perm_init(deleg_perm_t *deleg_perm, zfs_deleg_who_type_t type, + const char *name) +{ + deleg_perm->dp_who_type = type; + deleg_perm->dp_name = name; +} + +static inline void +who_perm_init(who_perm_t *who_perm, fs_perm_t *fsperm, + zfs_deleg_who_type_t type, const char *name) +{ + uu_avl_pool_t *pool; + pool = fsperm->fsp_set->fsps_deleg_perm_avl_pool; + + bzero(who_perm, sizeof (who_perm_t)); + + if ((who_perm->who_deleg_perm_avl = uu_avl_create(pool, NULL, + UU_DEFAULT)) == NULL) + nomem(); + + who_perm->who_type = type; + who_perm->who_name = name; + who_perm->who_fsperm = fsperm; +} + +static inline void +who_perm_fini(who_perm_t *who_perm) +{ + deleg_perm_node_t *node = uu_avl_first(who_perm->who_deleg_perm_avl); + + while (node != NULL) { + deleg_perm_node_t *next_node = + uu_avl_next(who_perm->who_deleg_perm_avl, node); + + uu_avl_remove(who_perm->who_deleg_perm_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(who_perm->who_deleg_perm_avl); +} + +static inline void +fs_perm_init(fs_perm_t *fsperm, fs_perm_set_t *fspset, const char *fsname) +{ + uu_avl_pool_t *nset_pool = fspset->fsps_named_set_avl_pool; + uu_avl_pool_t *who_pool = fspset->fsps_who_perm_avl_pool; + + bzero(fsperm, sizeof (fs_perm_t)); + + if ((fsperm->fsp_sc_avl = uu_avl_create(nset_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + if ((fsperm->fsp_uge_avl = uu_avl_create(who_pool, NULL, UU_DEFAULT)) + == NULL) + nomem(); + + fsperm->fsp_set = fspset; + fsperm->fsp_name = fsname; +} + +static inline void +fs_perm_fini(fs_perm_t *fsperm) +{ + who_perm_node_t *node = uu_avl_first(fsperm->fsp_sc_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_sc_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + uu_avl_remove(fsperm->fsp_sc_avl, node); + free(node); + node = next_node; + } + + node = uu_avl_first(fsperm->fsp_uge_avl); + while (node != NULL) { + who_perm_node_t *next_node = uu_avl_next(fsperm->fsp_uge_avl, + node); + who_perm_t *who_perm = &node->who_perm; + who_perm_fini(who_perm); + 
uu_avl_remove(fsperm->fsp_uge_avl, node); + free(node); + node = next_node; + } + + uu_avl_destroy(fsperm->fsp_sc_avl); + uu_avl_destroy(fsperm->fsp_uge_avl); +} + +static void +set_deleg_perm_node(uu_avl_t *avl, deleg_perm_node_t *node, + zfs_deleg_who_type_t who_type, const char *name, char locality) +{ + uu_avl_index_t idx = 0; + + deleg_perm_node_t *found_node = NULL; + deleg_perm_t *deleg_perm = &node->dpn_perm; + + deleg_perm_init(deleg_perm, who_type, name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) + uu_avl_insert(avl, node, idx); + else { + node = found_node; + deleg_perm = &node->dpn_perm; + } + + + switch (locality) { + case ZFS_DELEG_LOCAL: + deleg_perm->dp_local = B_TRUE; + break; + case ZFS_DELEG_DESCENDENT: + deleg_perm->dp_descend = B_TRUE; + break; + case ZFS_DELEG_NA: + break; + default: + assert(B_FALSE); /* invalid locality */ + } +} + +static inline int +parse_who_perm(who_perm_t *who_perm, nvlist_t *nvl, char locality) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = who_perm->who_fsperm->fsp_set; + uu_avl_t *avl = who_perm->who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_perm->who_type; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + uu_avl_pool_t *avl_pool = fspset->fsps_deleg_perm_avl_pool; + deleg_perm_node_t *node = + safe_malloc(sizeof (deleg_perm_node_t)); + + VERIFY(type == DATA_TYPE_BOOLEAN); + + uu_avl_node_init(node, &node->dpn_avl_node, avl_pool); + set_deleg_perm_node(avl, node, who_type, name, locality); + } + + return (0); +} + +static inline int +parse_fs_perm(fs_perm_t *fsperm, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + fs_perm_set_t *fspset = fsperm->fsp_set; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *name = nvpair_name(nvp); + uu_avl_t *avl = NULL; + uu_avl_pool_t *avl_pool = NULL; + zfs_deleg_who_type_t perm_type = name[0]; + char perm_locality = name[1]; + const char *perm_name = name + 3; + who_perm_t *who_perm = NULL; + + assert('$' == name[2]); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + switch (perm_type) { + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + avl_pool = fspset->fsps_named_set_avl_pool; + avl = fsperm->fsp_sc_avl; + break; + case ZFS_DELEG_USER: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + avl_pool = fspset->fsps_who_perm_avl_pool; + avl = fsperm->fsp_uge_avl; + break; + + default: + assert(!"unhandled zfs_deleg_who_type_t"); + } + + who_perm_node_t *found_node = NULL; + who_perm_node_t *node = safe_malloc( + sizeof (who_perm_node_t)); + who_perm = &node->who_perm; + uu_avl_index_t idx = 0; + + uu_avl_node_init(node, &node->who_avl_node, avl_pool); + who_perm_init(who_perm, fsperm, perm_type, perm_name); + + if ((found_node = uu_avl_find(avl, node, NULL, &idx)) + == NULL) { + if (avl == fsperm->fsp_uge_avl) { + uid_t rid = 0; + struct passwd *p = NULL; + struct group *g = NULL; + const char *nice_name = NULL; + + switch (perm_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + rid = atoi(perm_name); + p = getpwuid(rid); + if (p) + nice_name = p->pw_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + rid = atoi(perm_name); + g = getgrgid(rid); + if (g) + nice_name = g->gr_name; + break; + + default: + break; + } + + if (nice_name != NULL) 
{ + (void) strlcpy( + node->who_perm.who_ug_name, + nice_name, 256); + } else { + /* User or group unknown */ + (void) snprintf( + node->who_perm.who_ug_name, + sizeof (node->who_perm.who_ug_name), + "(unknown: %d)", rid); + } + } + + uu_avl_insert(avl, node, idx); + } else { + node = found_node; + who_perm = &node->who_perm; + } + + assert(who_perm != NULL); + (void) parse_who_perm(who_perm, nvl2, perm_locality); + } + + return (0); +} + +static inline int +parse_fs_perm_set(fs_perm_set_t *fspset, nvlist_t *nvl) +{ + nvpair_t *nvp = NULL; + uu_avl_index_t idx = 0; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + nvlist_t *nvl2 = NULL; + const char *fsname = nvpair_name(nvp); + data_type_t type = nvpair_type(nvp); + fs_perm_t *fsperm = NULL; + fs_perm_node_t *node = safe_malloc(sizeof (fs_perm_node_t)); + if (node == NULL) + nomem(); + + fsperm = &node->fspn_fsperm; + + VERIFY(DATA_TYPE_NVLIST == type); + + uu_list_node_init(node, &node->fspn_list_node, + fspset->fsps_list_pool); + + idx = uu_list_numnodes(fspset->fsps_list); + fs_perm_init(fsperm, fspset, fsname); + + if (nvpair_value_nvlist(nvp, &nvl2) != 0) + return (-1); + + (void) parse_fs_perm(fsperm, nvl2); + + uu_list_insert(fspset->fsps_list, node, idx); + } + + return (0); +} + +static inline const char * +deleg_perm_comment(zfs_deleg_note_t note) +{ + const char *str = ""; + + /* subcommands */ + switch (note) { + /* SUBCOMMANDS */ + case ZFS_DELEG_NOTE_ALLOW: + str = gettext("Must also have the permission that is being" + "\n\t\t\t\tallowed"); + break; + case ZFS_DELEG_NOTE_CLONE: + str = gettext("Must also have the 'create' ability and 'mount'" + "\n\t\t\t\tability in the origin file system"); + break; + case ZFS_DELEG_NOTE_CREATE: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DESTROY: + str = gettext("Must also have the 'mount' ability"); + break; + case ZFS_DELEG_NOTE_DIFF: + str = gettext("Allows lookup of paths within a dataset;" + "\n\t\t\t\tgiven an object number. Ordinary users need this" + "\n\t\t\t\tin order to use zfs diff"); + break; + case ZFS_DELEG_NOTE_HOLD: + str = gettext("Allows adding a user hold to a snapshot"); + break; + case ZFS_DELEG_NOTE_MOUNT: + str = gettext("Allows mount/umount of ZFS datasets"); + break; + case ZFS_DELEG_NOTE_PROMOTE: + str = gettext("Must also have the 'mount'\n\t\t\t\tand" + " 'promote' ability in the origin file system"); + break; + case ZFS_DELEG_NOTE_RECEIVE: + str = gettext("Must also have the 'mount' and 'create'" + " ability"); + break; + case ZFS_DELEG_NOTE_RELEASE: + str = gettext("Allows releasing a user hold which\n\t\t\t\t" + "might destroy the snapshot"); + break; + case ZFS_DELEG_NOTE_RENAME: + str = gettext("Must also have the 'mount' and 'create'" + "\n\t\t\t\tability in the new parent"); + break; + case ZFS_DELEG_NOTE_ROLLBACK: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SEND: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_SHARE: + str = gettext("Allows sharing file systems over NFS or SMB" + "\n\t\t\t\tprotocols"); + break; + case ZFS_DELEG_NOTE_SNAPSHOT: + str = gettext(""); + break; + case ZFS_DELEG_NOTE_LOAD_KEY: + str = gettext("Allows loading or unloading an encryption key"); + break; + case ZFS_DELEG_NOTE_CHANGE_KEY: + str = gettext("Allows changing or adding an encryption key"); + break; +/* + * case ZFS_DELEG_NOTE_VSCAN: + * str = gettext(""); + * break; + */ + /* OTHER */ + case ZFS_DELEG_NOTE_GROUPQUOTA: + str = gettext("Allows accessing any groupquota@... 
property"); + break; + case ZFS_DELEG_NOTE_GROUPUSED: + str = gettext("Allows reading any groupused@... property"); + break; + case ZFS_DELEG_NOTE_USERPROP: + str = gettext("Allows changing any user property"); + break; + case ZFS_DELEG_NOTE_USERQUOTA: + str = gettext("Allows accessing any userquota@... property"); + break; + case ZFS_DELEG_NOTE_USERUSED: + str = gettext("Allows reading any userused@... property"); + break; + case ZFS_DELEG_NOTE_USEROBJQUOTA: + str = gettext("Allows accessing any userobjquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPOBJQUOTA: + str = gettext("Allows accessing any \n\t\t\t\t" + "groupobjquota@... property"); + break; + case ZFS_DELEG_NOTE_GROUPOBJUSED: + str = gettext("Allows reading any groupobjused@... property"); + break; + case ZFS_DELEG_NOTE_USEROBJUSED: + str = gettext("Allows reading any userobjused@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTQUOTA: + str = gettext("Allows accessing any projectquota@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTOBJQUOTA: + str = gettext("Allows accessing any \n\t\t\t\t" + "projectobjquota@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTUSED: + str = gettext("Allows reading any projectused@... property"); + break; + case ZFS_DELEG_NOTE_PROJECTOBJUSED: + str = gettext("Allows accessing any \n\t\t\t\t" + "projectobjused@... property"); + break; + /* other */ + default: + str = ""; + } + + return (str); +} + +struct allow_opts { + boolean_t local; + boolean_t descend; + boolean_t user; + boolean_t group; + boolean_t everyone; + boolean_t create; + boolean_t set; + boolean_t recursive; /* unallow only */ + boolean_t prt_usage; + + boolean_t prt_perms; + char *who; + char *perms; + const char *dataset; +}; + +static inline int +prop_cmp(const void *a, const void *b) +{ + const char *str1 = *(const char **)a; + const char *str2 = *(const char **)b; + return (strcmp(str1, str2)); +} + +static void +allow_usage(boolean_t un, boolean_t requested, const char *msg) +{ + const char *opt_desc[] = { + "-h", gettext("show this help message and exit"), + "-l", gettext("set permission locally"), + "-d", gettext("set permission for descents"), + "-u", gettext("set permission for user"), + "-g", gettext("set permission for group"), + "-e", gettext("set permission for everyone"), + "-c", gettext("set create time permission"), + "-s", gettext("define permission set"), + /* unallow only */ + "-r", gettext("remove permissions recursively"), + }; + size_t unallow_size = sizeof (opt_desc) / sizeof (char *); + size_t allow_size = unallow_size - 2; + const char *props[ZFS_NUM_PROPS]; + int i; + size_t count = 0; + FILE *fp = requested ? stdout : stderr; + zprop_desc_t *pdtbl = zfs_prop_get_table(); + const char *fmt = gettext("%-16s %-14s\t%s\n"); + + (void) fprintf(fp, gettext("Usage: %s\n"), get_usage(un ? HELP_UNALLOW : + HELP_ALLOW)); + (void) fprintf(fp, gettext("Options:\n")); + for (i = 0; i < (un ? 
unallow_size : allow_size); i += 2) { + const char *opt = opt_desc[i]; + const char *optdsc = opt_desc[i + 1]; + (void) fprintf(fp, gettext(" %-10s %s\n"), opt, optdsc); + } + + (void) fprintf(fp, gettext("\nThe following permissions are " + "supported:\n\n")); + (void) fprintf(fp, fmt, gettext("NAME"), gettext("TYPE"), + gettext("NOTES")); + for (i = 0; i < ZFS_NUM_DELEG_NOTES; i++) { + const char *perm_name = zfs_deleg_perm_tbl[i].z_perm; + zfs_deleg_note_t perm_note = zfs_deleg_perm_tbl[i].z_note; + const char *perm_type = deleg_perm_type(perm_note); + const char *perm_comment = deleg_perm_comment(perm_note); + (void) fprintf(fp, fmt, perm_name, perm_type, perm_comment); + } + + for (i = 0; i < ZFS_NUM_PROPS; i++) { + zprop_desc_t *pd = &pdtbl[i]; + if (pd->pd_visible != B_TRUE) + continue; + + if (pd->pd_attr == PROP_READONLY) + continue; + + props[count++] = pd->pd_name; + } + props[count] = NULL; + + qsort(props, count, sizeof (char *), prop_cmp); + + for (i = 0; i < count; i++) + (void) fprintf(fp, fmt, props[i], gettext("property"), ""); + + if (msg != NULL) + (void) fprintf(fp, gettext("\nzfs: error: %s"), msg); + + exit(requested ? 0 : 2); +} + +static inline const char * +munge_args(int argc, char **argv, boolean_t un, size_t expected_argc, + char **permsp) +{ + if (un && argc == expected_argc - 1) + *permsp = NULL; + else if (argc == expected_argc) + *permsp = argv[argc - 2]; + else + allow_usage(un, B_FALSE, + gettext("wrong number of parameters\n")); + + return (argv[argc - 1]); +} + +static void +parse_allow_args(int argc, char **argv, boolean_t un, struct allow_opts *opts) +{ + int uge_sum = opts->user + opts->group + opts->everyone; + int csuge_sum = opts->create + opts->set + uge_sum; + int ldcsuge_sum = csuge_sum + opts->local + opts->descend; + int all_sum = un ? 
ldcsuge_sum + opts->recursive : ldcsuge_sum; + + if (uge_sum > 1) + allow_usage(un, B_FALSE, + gettext("-u, -g, and -e are mutually exclusive\n")); + + if (opts->prt_usage) { + if (argc == 0 && all_sum == 0) + allow_usage(un, B_TRUE, NULL); + else + usage(B_FALSE); + } + + if (opts->set) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -s\n")); + + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + if (argv[0][0] != '@') + allow_usage(un, B_FALSE, + gettext("invalid set name: missing '@' prefix\n")); + opts->who = argv[0]; + } else if (opts->create) { + if (ldcsuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -c\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (opts->everyone) { + if (csuge_sum > 1) + allow_usage(un, B_FALSE, + gettext("invalid options combined with -e\n")); + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (uge_sum == 0 && argc > 0 && strcmp(argv[0], "everyone") + == 0) { + opts->everyone = B_TRUE; + argc--; + argv++; + opts->dataset = munge_args(argc, argv, un, 2, &opts->perms); + } else if (argc == 1 && !un) { + opts->prt_perms = B_TRUE; + opts->dataset = argv[argc-1]; + } else { + opts->dataset = munge_args(argc, argv, un, 3, &opts->perms); + opts->who = argv[0]; + } + + if (!opts->local && !opts->descend) { + opts->local = B_TRUE; + opts->descend = B_TRUE; + } +} + +static void +store_allow_perm(zfs_deleg_who_type_t type, boolean_t local, boolean_t descend, + const char *who, char *perms, nvlist_t *top_nvl) +{ + int i; + char ld[2] = { '\0', '\0' }; + char who_buf[MAXNAMELEN + 32]; + char base_type = '\0'; + char set_type = '\0'; + nvlist_t *base_nvl = NULL; + nvlist_t *set_nvl = NULL; + nvlist_t *nvl; + + if (nvlist_alloc(&base_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + if (nvlist_alloc(&set_nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + switch (type) { + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + set_type = ZFS_DELEG_NAMED_SET_SETS; + base_type = ZFS_DELEG_NAMED_SET; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_CREATE_SETS: + case ZFS_DELEG_CREATE: + set_type = ZFS_DELEG_CREATE_SETS; + base_type = ZFS_DELEG_CREATE; + ld[0] = ZFS_DELEG_NA; + break; + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + set_type = ZFS_DELEG_USER_SETS; + base_type = ZFS_DELEG_USER; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + set_type = ZFS_DELEG_GROUP_SETS; + base_type = ZFS_DELEG_GROUP; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + set_type = ZFS_DELEG_EVERYONE_SETS; + base_type = ZFS_DELEG_EVERYONE; + if (local) + ld[0] = ZFS_DELEG_LOCAL; + if (descend) + ld[1] = ZFS_DELEG_DESCENDENT; + break; + + default: + assert(set_type != '\0' && base_type != '\0'); + } + + if (perms != NULL) { + char *curr = perms; + char *end = curr + strlen(perms); + + while (curr < end) { + char *delim = strchr(curr, ','); + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + if (curr[0] == '@') + nvl = set_nvl; + else + nvl = base_nvl; + + (void) nvlist_add_boolean(nvl, curr); + if (delim != end) + *delim = ','; + curr = delim + 1; + } + + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (!nvlist_empty(base_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), 
"%c%c$%s", + base_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + base_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + base_nvl); + } + + + if (!nvlist_empty(set_nvl)) { + if (who != NULL) + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$%s", + set_type, locality, who); + else + (void) snprintf(who_buf, + sizeof (who_buf), "%c%c$", + set_type, locality); + + (void) nvlist_add_nvlist(top_nvl, who_buf, + set_nvl); + } + } + } else { + for (i = 0; i < 2; i++) { + char locality = ld[i]; + if (locality == 0) + continue; + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", base_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", base_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + + if (who != NULL) + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$%s", set_type, locality, who); + else + (void) snprintf(who_buf, sizeof (who_buf), + "%c%c$", set_type, locality); + (void) nvlist_add_boolean(top_nvl, who_buf); + } + } +} + +static int +construct_fsacl_list(boolean_t un, struct allow_opts *opts, nvlist_t **nvlp) +{ + if (nvlist_alloc(nvlp, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + if (opts->set) { + store_allow_perm(ZFS_DELEG_NAMED_SET, opts->local, + opts->descend, opts->who, opts->perms, *nvlp); + } else if (opts->create) { + store_allow_perm(ZFS_DELEG_CREATE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else if (opts->everyone) { + store_allow_perm(ZFS_DELEG_EVERYONE, opts->local, + opts->descend, NULL, opts->perms, *nvlp); + } else { + char *curr = opts->who; + char *end = curr + strlen(curr); + + while (curr < end) { + const char *who; + zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; + char *endch; + char *delim = strchr(curr, ','); + char errbuf[256]; + char id[64]; + struct passwd *p = NULL; + struct group *g = NULL; + + uid_t rid; + if (delim == NULL) + delim = end; + else + *delim = '\0'; + + rid = (uid_t)strtol(curr, &endch, 0); + if (opts->user) { + who_type = ZFS_DELEG_USER; + if (*endch != '\0') + p = getpwnam(curr); + else + p = getpwuid(rid); + + if (p != NULL) + rid = p->pw_uid; + else if (*endch != '\0') { + (void) snprintf(errbuf, 256, gettext( + "invalid user %s\n"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else if (opts->group) { + who_type = ZFS_DELEG_GROUP; + if (*endch != '\0') + g = getgrnam(curr); + else + g = getgrgid(rid); + + if (g != NULL) + rid = g->gr_gid; + else if (*endch != '\0') { + (void) snprintf(errbuf, 256, gettext( + "invalid group %s\n"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } else { + if (*endch != '\0') { + p = getpwnam(curr); + } else { + p = getpwuid(rid); + } + + if (p == NULL) { + if (*endch != '\0') { + g = getgrnam(curr); + } else { + g = getgrgid(rid); + } + } + + if (p != NULL) { + who_type = ZFS_DELEG_USER; + rid = p->pw_uid; + } else if (g != NULL) { + who_type = ZFS_DELEG_GROUP; + rid = g->gr_gid; + } else { + (void) snprintf(errbuf, 256, gettext( + "invalid user/group %s\n"), curr); + allow_usage(un, B_TRUE, errbuf); + } + } + + (void) sprintf(id, "%u", rid); + who = id; + + store_allow_perm(who_type, opts->local, + opts->descend, who, opts->perms, *nvlp); + curr = delim + 1; + } + } + + return (0); +} + +static void +print_set_creat_perms(uu_avl_t *who_avl) +{ + const char *sc_title[] = { + gettext("Permission sets:\n"), + gettext("Create time permissions:\n"), + NULL + }; + who_perm_node_t *who_node = NULL; + int prev_weight = -1; + + for (who_node = 
uu_avl_first(who_avl); who_node != NULL; + who_node = uu_avl_next(who_avl, who_node)) { + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + const char *who_name = who_node->who_perm.who_name; + int weight = who_type2weight(who_type); + boolean_t first = B_TRUE; + deleg_perm_node_t *deleg_node; + + if (prev_weight != weight) { + (void) printf("%s", sc_title[weight]); + prev_weight = weight; + } + + if (who_name == NULL || strnlen(who_name, 1) == 0) + (void) printf("\t"); + else + (void) printf("\t%s ", who_name); + + for (deleg_node = uu_avl_first(avl); deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (first) { + (void) printf("%s", + deleg_node->dpn_perm.dp_name); + first = B_FALSE; + } else + (void) printf(",%s", + deleg_node->dpn_perm.dp_name); + } + + (void) printf("\n"); + } +} + +static void +print_uge_deleg_perms(uu_avl_t *who_avl, boolean_t local, boolean_t descend, + const char *title) +{ + who_perm_node_t *who_node = NULL; + boolean_t prt_title = B_TRUE; + uu_avl_walk_t *walk; + + if ((walk = uu_avl_walk_start(who_avl, UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((who_node = uu_avl_walk_next(walk)) != NULL) { + const char *who_name = who_node->who_perm.who_name; + const char *nice_who_name = who_node->who_perm.who_ug_name; + uu_avl_t *avl = who_node->who_perm.who_deleg_perm_avl; + zfs_deleg_who_type_t who_type = who_node->who_perm.who_type; + char delim = ' '; + deleg_perm_node_t *deleg_node; + boolean_t prt_who = B_TRUE; + + for (deleg_node = uu_avl_first(avl); + deleg_node != NULL; + deleg_node = uu_avl_next(avl, deleg_node)) { + if (local != deleg_node->dpn_perm.dp_local || + descend != deleg_node->dpn_perm.dp_descend) + continue; + + if (prt_who) { + const char *who = NULL; + if (prt_title) { + prt_title = B_FALSE; + (void) printf("%s", title); + } + + switch (who_type) { + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_USER: + who = gettext("user"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_GROUP_SETS: + case ZFS_DELEG_GROUP: + who = gettext("group"); + if (nice_who_name) + who_name = nice_who_name; + break; + case ZFS_DELEG_EVERYONE_SETS: + case ZFS_DELEG_EVERYONE: + who = gettext("everyone"); + who_name = NULL; + break; + + default: + assert(who != NULL); + } + + prt_who = B_FALSE; + if (who_name == NULL) + (void) printf("\t%s", who); + else + (void) printf("\t%s %s", who, who_name); + } + + (void) printf("%c%s", delim, + deleg_node->dpn_perm.dp_name); + delim = ','; + } + + if (!prt_who) + (void) printf("\n"); + } + + uu_avl_walk_end(walk); +} + +static void +print_fs_perms(fs_perm_set_t *fspset) +{ + fs_perm_node_t *node = NULL; + char buf[MAXNAMELEN + 32]; + const char *dsname = buf; + + for (node = uu_list_first(fspset->fsps_list); node != NULL; + node = uu_list_next(fspset->fsps_list, node)) { + uu_avl_t *sc_avl = node->fspn_fsperm.fsp_sc_avl; + uu_avl_t *uge_avl = node->fspn_fsperm.fsp_uge_avl; + int left = 0; + + (void) snprintf(buf, sizeof (buf), + gettext("---- Permissions on %s "), + node->fspn_fsperm.fsp_name); + (void) printf("%s", dsname); + left = 70 - strlen(buf); + while (left-- > 0) + (void) printf("-"); + (void) printf("\n"); + + print_set_creat_perms(sc_avl); + print_uge_deleg_perms(uge_avl, B_TRUE, B_FALSE, + gettext("Local permissions:\n")); + print_uge_deleg_perms(uge_avl, B_FALSE, B_TRUE, + gettext("Descendent permissions:\n")); + print_uge_deleg_perms(uge_avl, B_TRUE, B_TRUE, + gettext("Local+Descendent permissions:\n")); + } 
+} + +static fs_perm_set_t fs_perm_set = { NULL, NULL, NULL, NULL }; + +struct deleg_perms { + boolean_t un; + nvlist_t *nvl; +}; + +static int +set_deleg_perms(zfs_handle_t *zhp, void *data) +{ + struct deleg_perms *perms = (struct deleg_perms *)data; + zfs_type_t zfs_type = zfs_get_type(zhp); + + if (zfs_type != ZFS_TYPE_FILESYSTEM && zfs_type != ZFS_TYPE_VOLUME) + return (0); + + return (zfs_set_fsacl(zhp, perms->un, perms->nvl)); +} + +static int +zfs_do_allow_unallow_impl(int argc, char **argv, boolean_t un) +{ + zfs_handle_t *zhp; + nvlist_t *perm_nvl = NULL; + nvlist_t *update_perm_nvl = NULL; + int error = 1; + int c; + struct allow_opts opts = { 0 }; + + const char *optstr = un ? "ldugecsrh" : "ldugecsh"; + + /* check opts */ + while ((c = getopt(argc, argv, optstr)) != -1) { + switch (c) { + case 'l': + opts.local = B_TRUE; + break; + case 'd': + opts.descend = B_TRUE; + break; + case 'u': + opts.user = B_TRUE; + break; + case 'g': + opts.group = B_TRUE; + break; + case 'e': + opts.everyone = B_TRUE; + break; + case 's': + opts.set = B_TRUE; + break; + case 'c': + opts.create = B_TRUE; + break; + case 'r': + opts.recursive = B_TRUE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case 'h': + opts.prt_usage = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check arguments */ + parse_allow_args(argc, argv, un, &opts); + + /* try to open the dataset */ + if ((zhp = zfs_open(g_zfs, opts.dataset, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + (void) fprintf(stderr, "Failed to open dataset: %s\n", + opts.dataset); + return (-1); + } + + if (zfs_get_fsacl(zhp, &perm_nvl) != 0) + goto cleanup2; + + fs_perm_set_init(&fs_perm_set); + if (parse_fs_perm_set(&fs_perm_set, perm_nvl) != 0) { + (void) fprintf(stderr, "Failed to parse fsacl permissions\n"); + goto cleanup1; + } + + if (opts.prt_perms) + print_fs_perms(&fs_perm_set); + else { + (void) construct_fsacl_list(un, &opts, &update_perm_nvl); + if (zfs_set_fsacl(zhp, un, update_perm_nvl) != 0) + goto cleanup0; + + if (un && opts.recursive) { + struct deleg_perms data = { un, update_perm_nvl }; + if (zfs_iter_filesystems(zhp, set_deleg_perms, + &data) != 0) + goto cleanup0; + } + } + + error = 0; + +cleanup0: + nvlist_free(perm_nvl); + nvlist_free(update_perm_nvl); +cleanup1: + fs_perm_set_fini(&fs_perm_set); +cleanup2: + zfs_close(zhp); + + return (error); +} + +static int +zfs_do_allow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_FALSE)); +} + +static int +zfs_do_unallow(int argc, char **argv) +{ + return (zfs_do_allow_unallow_impl(argc, argv, B_TRUE)); +} + +static int +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) +{ + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + const char *opts = holding ? "rt" : "r"; + int c; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 2) + usage(B_FALSE); + + tag = argv[0]; + --argc; + ++argv; + + if (holding && tag[0] == '.') { + /* tags starting with '.' 
are reserved for libzfs */ + (void) fprintf(stderr, gettext("tag may not start with '.'\n")); + usage(B_FALSE); + } + + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + const char *delim; + char *path = argv[i]; + + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; + + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (holding) { + if (zfs_hold(zhp, delim+1, tag, recursive, -1) != 0) + ++errors; + } else { + if (zfs_release(zhp, delim+1, tag, recursive) != 0) + ++errors; + } + zfs_close(zhp); + } + + return (errors != 0); +} + +/* + * zfs hold [-r] [-t] <tag> <snap> ... + * + * -r Recursively hold + * + * Apply a user-hold with the given tag to the list of snapshots. + */ +static int +zfs_do_hold(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); +} + +/* + * zfs release [-r] <tag> <snap> ... + * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. + */ +static int +zfs_do_release(int argc, char **argv) +{ + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); +} + +typedef struct holds_cbdata { + boolean_t cb_recursive; + const char *cb_snapname; + nvlist_t **cb_nvlp; + size_t cb_max_namelen; + size_t cb_max_taglen; +} holds_cbdata_t; + +#define STRFTIME_FMT_STR "%a %b %e %H:%M %Y" +#define DATETIME_BUF_LEN (32) +/* + * + */ +static void +print_holds(boolean_t scripted, int nwidth, int tagwidth, nvlist_t *nvl) +{ + int i; + nvpair_t *nvp = NULL; + char *hdr_cols[] = { "NAME", "TAG", "TIMESTAMP" }; + const char *col; + + if (!scripted) { + for (i = 0; i < 3; i++) { + col = gettext(hdr_cols[i]); + if (i < 2) + (void) printf("%-*s ", i ? tagwidth : nwidth, + col); + else + (void) printf("%s\n", col); + } + } + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + char *zname = nvpair_name(nvp); + nvlist_t *nvl2; + nvpair_t *nvp2 = NULL; + (void) nvpair_value_nvlist(nvp, &nvl2); + while ((nvp2 = nvlist_next_nvpair(nvl2, nvp2)) != NULL) { + char tsbuf[DATETIME_BUF_LEN]; + char *tagname = nvpair_name(nvp2); + uint64_t val = 0; + time_t time; + struct tm t; + + (void) nvpair_value_uint64(nvp2, &val); + time = (time_t)val; + (void) localtime_r(&time, &t); + (void) strftime(tsbuf, DATETIME_BUF_LEN, + gettext(STRFTIME_FMT_STR), &t); + + if (scripted) { + (void) printf("%s\t%s\t%s\n", zname, + tagname, tsbuf); + } else { + (void) printf("%-*s %-*s %s\n", nwidth, + zname, tagwidth, tagname, tsbuf); + } + } + } +} + +/* + * Generic callback function to list a dataset or snapshot.
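+ * For each dataset visited it stores the holds nvlist under the dataset + * name and widens cb_max_namelen/cb_max_taglen as needed, so that + * print_holds() above can size its columns.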
+ */ +static int +holds_callback(zfs_handle_t *zhp, void *data) +{ + holds_cbdata_t *cbp = data; + nvlist_t *top_nvl = *cbp->cb_nvlp; + nvlist_t *nvl = NULL; + nvpair_t *nvp = NULL; + const char *zname = zfs_get_name(zhp); + size_t znamelen = strlen(zname); + + if (cbp->cb_recursive) { + const char *snapname; + char *delim = strchr(zname, '@'); + if (delim == NULL) + return (0); + + snapname = delim + 1; + if (strcmp(cbp->cb_snapname, snapname)) + return (0); + } + + if (zfs_get_holds(zhp, &nvl) != 0) + return (-1); + + if (znamelen > cbp->cb_max_namelen) + cbp->cb_max_namelen = znamelen; + + while ((nvp = nvlist_next_nvpair(nvl, nvp)) != NULL) { + const char *tag = nvpair_name(nvp); + size_t taglen = strlen(tag); + if (taglen > cbp->cb_max_taglen) + cbp->cb_max_taglen = taglen; + } + + return (nvlist_add_nvlist(top_nvl, zname, nvl)); +} + +/* + * zfs holds [-rH] <snap> ... + * + * -r Lists holds that are set on the named snapshots recursively. + * -H Scripted mode; elide headers and separate columns by tabs. + */ +static int +zfs_do_holds(int argc, char **argv) +{ + int errors = 0; + int c; + int i; + boolean_t scripted = B_FALSE; + boolean_t recursive = B_FALSE; + const char *opts = "rH"; + nvlist_t *nvl; + + int types = ZFS_TYPE_SNAPSHOT; + holds_cbdata_t cb = { 0 }; + + int limit = 0; + int ret = 0; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { + switch (c) { + case 'r': + recursive = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (recursive) { + types |= ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME; + flags |= ZFS_ITER_RECURSE; + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) + usage(B_FALSE); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + nomem(); + + for (i = 0; i < argc; ++i) { + char *snapshot = argv[i]; + const char *delim; + const char *snapname; + + delim = strchr(snapshot, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), snapshot); + ++errors; + continue; + } + snapname = delim + 1; + if (recursive) + snapshot[delim - snapshot] = '\0'; + + cb.cb_recursive = recursive; + cb.cb_snapname = snapname; + cb.cb_nvlp = &nvl; + + /* + * 1. collect holds data, set format options + */ + ret = zfs_for_each(argc, argv, flags, types, NULL, NULL, limit, + holds_callback, &cb); + if (ret != 0) + ++errors; + } + + /* + * 2. print holds data + */ + print_holds(scripted, cb.cb_max_namelen, cb.cb_max_taglen, nvl); + + if (nvlist_empty(nvl)) + (void) fprintf(stderr, gettext("no datasets available\n")); + + nvlist_free(nvl); + + return (0 != errors); +} + +#define CHECK_SPINNER 30 +#define SPINNER_TIME 3 /* seconds */ +#define MOUNT_TIME 1 /* seconds */ + +typedef struct get_all_state { + boolean_t ga_verbose; + get_all_cb_t *ga_cbp; +} get_all_state_t; + +static int +get_one_dataset(zfs_handle_t *zhp, void *data) +{ + static char *spin[] = { "-", "\\", "|", "/" }; + static int spinval = 0; + static int spincheck = 0; + static time_t last_spin_time = (time_t)0; + get_all_state_t *state = data; + zfs_type_t type = zfs_get_type(zhp); + + if (state->ga_verbose) { + if (--spincheck < 0) { + time_t now = time(NULL); + if (last_spin_time + SPINNER_TIME < now) { + update_progress(spin[spinval++ % 4]); + last_spin_time = now; + } + spincheck = CHECK_SPINNER; + } + } + + /* + * Iterate over any nested datasets.
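+ * (get_one_dataset() passes itself to zfs_iter_filesystems(), so the + * single zfs_iter_root() call in get_all_datasets() below walks the + * entire dataset hierarchy.)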
+ */ + if (zfs_iter_filesystems(zhp, get_one_dataset, data) != 0) { + zfs_close(zhp); + return (1); + } + + /* + * Skip any datasets whose type does not match. + */ + if ((type & ZFS_TYPE_FILESYSTEM) == 0) { + zfs_close(zhp); + return (0); + } + libzfs_add_handle(state->ga_cbp, zhp); + assert(state->ga_cbp->cb_used <= state->ga_cbp->cb_alloc); + + return (0); +} + +static void +get_all_datasets(get_all_cb_t *cbp, boolean_t verbose) +{ + get_all_state_t state = { + .ga_verbose = verbose, + .ga_cbp = cbp + }; + + if (verbose) + set_progress_header(gettext("Reading ZFS config")); + (void) zfs_iter_root(g_zfs, get_one_dataset, &state); + + if (verbose) + finish_progress(gettext("done.")); +} + +/* + * Generic callback for sharing or mounting filesystems. Because the code is so + * similar, we have a common function with an extra parameter to determine which + * mode we are using. + */ +typedef enum { OP_SHARE, OP_MOUNT } share_mount_op_t; + +typedef struct share_mount_state { + share_mount_op_t sm_op; + boolean_t sm_verbose; + int sm_flags; + char *sm_options; + char *sm_proto; /* only valid for OP_SHARE */ + pthread_mutex_t sm_lock; /* protects the remaining fields */ + uint_t sm_total; /* number of filesystems to process */ + uint_t sm_done; /* number of filesystems processed */ + int sm_status; /* -1 if any of the share/mount operations failed */ +} share_mount_state_t; + +/* + * Share or mount a dataset. + */ +static int +share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, + boolean_t explicit, const char *options) +{ + char mountpoint[ZFS_MAXPROPLEN]; + char shareopts[ZFS_MAXPROPLEN]; + char smbshareopts[ZFS_MAXPROPLEN]; + const char *cmdname = op == OP_SHARE ? "share" : "mount"; + struct mnttab mnt; + uint64_t zoned, canmount; + boolean_t shared_nfs, shared_smb; + + assert(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM); + + /* + * Check to make sure we can mount/share this dataset. If we + * are in the global zone and the filesystem is exported to a + * local zone, or if we are in a local zone and the + * filesystem is not exported, then it is an error. + */ + zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + + if (zoned && getzoneid() == GLOBAL_ZONEID) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "dataset is exported to a local zone\n"), cmdname, + zfs_get_name(zhp)); + return (1); + + } else if (!zoned && getzoneid() != GLOBAL_ZONEID) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "permission denied\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + + /* + * Ignore any filesystems which don't apply to us. This + * includes those with a legacy mountpoint, or those with + * legacy share options. + */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, shareopts, + sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, + sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); + + if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && + strcmp(smbshareopts, "off") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot share '%s': " + "legacy share\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use share(1M) to " + "share this filesystem, or set " + "sharenfs property on\n")); + return (1); + } + + /* + * We cannot share or mount legacy filesystems. 
If the + * shareopts is non-legacy but the mountpoint is legacy, we + * treat it as a legacy share. + */ + if (strcmp(mountpoint, "legacy") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "legacy mountpoint\n"), cmdname, zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use %s(1M) to " + "%s this filesystem\n"), cmdname, cmdname); + return (1); + } + + if (strcmp(mountpoint, "none") == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': no " + "mountpoint set\n"), cmdname, zfs_get_name(zhp)); + return (1); + } + + /* + * canmount explicit outcome + * on no pass through + * on yes pass through + * off no return 0 + * off yes display error, return 1 + * noauto no return 0 + * noauto yes pass through + */ + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); + if (canmount == ZFS_CANMOUNT_OFF) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "'canmount' property is set to 'off'\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } else if (canmount == ZFS_CANMOUNT_NOAUTO && !explicit) { + /* + * When performing a 'zfs mount -a', we skip any mounts for + * datasets that have 'noauto' set. Sharing a dataset with + * 'noauto' set is only allowed if it's mounted. + */ + if (op == OP_MOUNT) + return (0); + if (op == OP_SHARE && !zfs_is_mounted(zhp, NULL)) { + /* also purge it from existing exports */ + zfs_unshareall_bypath(zhp, mountpoint); + return (0); + } + } + + /* + * If this filesystem is encrypted and does not have + * a loaded key, we can not mount it. + */ + if ((flags & MS_CRYPT) == 0 && + zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF && + zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_UNAVAILABLE) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "encryption key not loaded\n"), cmdname, zfs_get_name(zhp)); + return (1); + } + + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Contains partially-completed state from " + "\"zfs receive -s\", which can be resumed with " + "\"zfs send -t\"\n"), + cmdname, zfs_get_name(zhp)); + return (1); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot %s '%s': " + "Dataset is not complete, was created by receiving " + "a redacted zfs send stream.\n"), cmdname, + zfs_get_name(zhp)); + return (1); + } + + /* + * At this point, we have verified that the mountpoint and/or + * shareopts are appropriate for auto management. If the + * filesystem is already mounted or shared, return (failing + * for explicit requests); otherwise mount or share the + * filesystem. 
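+ * + * For example, a filesystem with sharenfs=on that is not yet mounted is + * first mounted and then passed to zfs_shareall() in the OP_SHARE case + * below.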
+ */ + switch (op) { + case OP_SHARE: + + shared_nfs = zfs_is_shared_nfs(zhp, NULL); + shared_smb = zfs_is_shared_smb(zhp, NULL); + + if ((shared_nfs && shared_smb) || + (shared_nfs && strcmp(shareopts, "on") == 0 && + strcmp(smbshareopts, "off") == 0) || + (shared_smb && strcmp(smbshareopts, "on") == 0 && + strcmp(shareopts, "off") == 0)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot share " + "'%s': filesystem already shared\n"), + zfs_get_name(zhp)); + return (1); + } + + if (!zfs_is_mounted(zhp, NULL) && + zfs_mount(zhp, NULL, flags) != 0) + return (1); + + if (protocol == NULL) { + if (zfs_shareall(zhp) != 0) + return (1); + } else if (strcmp(protocol, "nfs") == 0) { + if (zfs_share_nfs(zhp)) + return (1); + } else if (strcmp(protocol, "smb") == 0) { + if (zfs_share_smb(zhp)) + return (1); + } else { + (void) fprintf(stderr, gettext("cannot share " + "'%s': invalid share type '%s' " + "specified\n"), + zfs_get_name(zhp), protocol); + return (1); + } + + break; + + case OP_MOUNT: + if (options == NULL) + mnt.mnt_mntopts = ""; + else + mnt.mnt_mntopts = (char *)options; + + if (!hasmntopt(&mnt, MNTOPT_REMOUNT) && + zfs_is_mounted(zhp, NULL)) { + if (!explicit) + return (0); + + (void) fprintf(stderr, gettext("cannot mount " + "'%s': filesystem already mounted\n"), + zfs_get_name(zhp)); + return (1); + } + + if (zfs_mount(zhp, options, flags) != 0) + return (1); + break; + } + + return (0); +} + +/* + * Reports progress in the form "(current/total)". Not thread-safe. + */ +static void +report_mount_progress(int current, int total) +{ + static time_t last_progress_time = 0; + time_t now = time(NULL); + char info[32]; + + /* report 1..n instead of 0..n-1 */ + ++current; + + /* display header if we're here for the first time */ + if (current == 1) { + set_progress_header(gettext("Mounting ZFS filesystems")); + } else if (current != total && last_progress_time + MOUNT_TIME >= now) { + /* too soon to report again */ + return; + } + + last_progress_time = now; + + (void) sprintf(info, "(%d/%d)", current, total); + + if (current == total) + finish_progress(info); + else + update_progress(info); +} + +/* + * zfs_foreach_mountpoint() callback that mounts or shares one filesystem and + * updates the progress meter. + */ +static int +share_mount_one_cb(zfs_handle_t *zhp, void *arg) +{ + share_mount_state_t *sms = arg; + int ret; + + ret = share_mount_one(zhp, sms->sm_op, sms->sm_flags, sms->sm_proto, + B_FALSE, sms->sm_options); + + pthread_mutex_lock(&sms->sm_lock); + if (ret != 0) + sms->sm_status = ret; + sms->sm_done++; + if (sms->sm_verbose) + report_mount_progress(sms->sm_done, sms->sm_total); + pthread_mutex_unlock(&sms->sm_lock); + return (ret); +} + +static void +append_options(char *mntopts, char *newopts) +{ + int len = strlen(mntopts); + + /* original length plus new string to append plus 1 for the comma */ + if (len + 1 + strlen(newopts) >= MNT_LINE_MAX) { + (void) fprintf(stderr, gettext("the opts argument for " + "'%s' option is too long (more than %d chars)\n"), + "-o", MNT_LINE_MAX); + usage(B_FALSE); + } + + if (*mntopts) + mntopts[len++] = ','; + + (void) strcpy(&mntopts[len], newopts); +} + +static int +share_mount(int op, int argc, char **argv) +{ + int do_all = 0; + boolean_t verbose = B_FALSE; + int c, ret = 0; + char *options = NULL; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, op == OP_MOUNT ? 
":alvo:Of" : "al")) + != -1) { + switch (c) { + case 'a': + do_all = 1; + break; + case 'v': + verbose = B_TRUE; + break; + case 'l': + flags |= MS_CRYPT; + break; + case 'o': + if (*optarg == '\0') { + (void) fprintf(stderr, gettext("empty mount " + "options (-o) specified\n")); + usage(B_FALSE); + } + + if (options == NULL) + options = safe_malloc(MNT_LINE_MAX + 1); + + /* option validation is done later */ + append_options(options, optarg); + break; + case 'O': + flags |= MS_OVERLAY; + break; + case 'f': + flags |= MS_FORCE; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (do_all) { + char *protocol = NULL; + + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { + (void) fprintf(stderr, gettext("share type " + "must be 'nfs' or 'smb'\n")); + usage(B_FALSE); + } + protocol = argv[0]; + argc--; + argv++; + } + + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + start_progress_timer(); + get_all_cb_t cb = { 0 }; + get_all_datasets(&cb, verbose); + + if (cb.cb_used == 0) { + if (options != NULL) + free(options); + return (0); + } + + share_mount_state_t share_mount_state = { 0 }; + share_mount_state.sm_op = op; + share_mount_state.sm_verbose = verbose; + share_mount_state.sm_flags = flags; + share_mount_state.sm_options = options; + share_mount_state.sm_proto = protocol; + share_mount_state.sm_total = cb.cb_used; + pthread_mutex_init(&share_mount_state.sm_lock, NULL); + + /* + * libshare isn't mt-safe, so only do the operation in parallel + * if we're mounting. Additionally, the key-loading option must + * be serialized so that we can prompt the user for their keys + * in a consistent manner. + */ + zfs_foreach_mountpoint(g_zfs, cb.cb_handles, cb.cb_used, + share_mount_one_cb, &share_mount_state, + op == OP_MOUNT && !(flags & MS_CRYPT)); + zfs_commit_all_shares(); + + ret = share_mount_state.sm_status; + + for (int i = 0; i < cb.cb_used; i++) + zfs_close(cb.cb_handles[i]); + free(cb.cb_handles); + } else if (argc == 0) { + struct mnttab entry; + + if ((op == OP_SHARE) || (options != NULL)) { + (void) fprintf(stderr, gettext("missing filesystem " + "argument (specify -a for all)\n")); + usage(B_FALSE); + } + + /* + * When mount is given no arguments, go through + * /proc/self/mounts and display any active ZFS mounts. + * We hide any snapshots, since they are controlled + * automatically. 
+ */ + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", mnttab_file) == NULL) { + if (options != NULL) + free(options); + return (ENOENT); + } + + while (getmntent(mnttab_file, &entry) == 0) { + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0 || + strchr(entry.mnt_special, '@') != NULL) + continue; + + (void) printf("%-30s %s\n", entry.mnt_special, + entry.mnt_mountp); + } + + } else { + zfs_handle_t *zhp; + + if (argc > 1) { + (void) fprintf(stderr, + gettext("too many arguments\n")); + usage(B_FALSE); + } + + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) { + ret = 1; + } else { + ret = share_mount_one(zhp, op, flags, NULL, B_TRUE, + options); + zfs_commit_all_shares(); + zfs_close(zhp); + } + } + + if (options != NULL) + free(options); + + return (ret); +} + +/* + * zfs mount -a [nfs] + * zfs mount filesystem + * + * Mount all filesystems, or mount the given filesystem. + */ +static int +zfs_do_mount(int argc, char **argv) +{ + return (share_mount(OP_MOUNT, argc, argv)); +} + +/* + * zfs share -a [nfs | smb] + * zfs share filesystem + * + * Share all filesystems, or share the given filesystem. + */ +static int +zfs_do_share(int argc, char **argv) +{ + return (share_mount(OP_SHARE, argc, argv)); +} + +typedef struct unshare_unmount_node { + zfs_handle_t *un_zhp; + char *un_mountp; + uu_avl_node_t un_avlnode; +} unshare_unmount_node_t; + +/* ARGSUSED */ +static int +unshare_unmount_compare(const void *larg, const void *rarg, void *unused) +{ + const unshare_unmount_node_t *l = larg; + const unshare_unmount_node_t *r = rarg; + + return (strcmp(l->un_mountp, r->un_mountp)); +} + +/* + * Convenience routine used by zfs_do_umount() and manual_unmount(). Given an + * absolute path, find the entry /proc/self/mounts, verify that it's a + * ZFS filesystem, and unmount it appropriately. + */ +static int +unshare_unmount_path(int op, char *path, int flags, boolean_t is_manual) +{ + zfs_handle_t *zhp; + int ret = 0; + struct stat64 statbuf; + struct extmnttab entry; + const char *cmdname = (op == OP_SHARE) ? "unshare" : "unmount"; + ino_t path_inode; + + /* + * Search for the given (major,minor) pair in the mount table. 
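+ * The inode of the mountpoint is also saved (path_inode below) so we + * can later verify that the supplied path is the mountpoint itself and + * not merely a file that lives underneath it.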
+ */ + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", mnttab_file) == NULL) + return (ENOENT); + + if (getextmntent(path, &entry, &statbuf) != 0) { + if (op == OP_SHARE) { + (void) fprintf(stderr, gettext("cannot %s '%s': not " + "currently mounted\n"), cmdname, path); + return (1); + } + (void) fprintf(stderr, gettext("warning: %s not in " + "/proc/self/mounts\n"), path); + if ((ret = umount2(path, flags)) != 0) + (void) fprintf(stderr, gettext("%s: %s\n"), path, + strerror(errno)); + return (ret != 0); + } + path_inode = statbuf.st_ino; + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': not a ZFS " + "filesystem\n"), cmdname, path); + return (1); + } + + if ((zhp = zfs_open(g_zfs, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + ret = 1; + if (stat64(entry.mnt_mountp, &statbuf) != 0) { + (void) fprintf(stderr, gettext("cannot %s '%s': %s\n"), + cmdname, path, strerror(errno)); + goto out; + } else if (statbuf.st_ino != path_inode) { + (void) fprintf(stderr, gettext("cannot " + "%s '%s': not a mountpoint\n"), cmdname, path); + goto out; + } + + if (op == OP_SHARE) { + char nfs_mnt_prop[ZFS_MAXPROPLEN]; + char smbshare_prop[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, nfs_mnt_prop, + sizeof (nfs_mnt_prop), NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshare_prop, + sizeof (smbshare_prop), NULL, NULL, 0, B_FALSE) == 0); + + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(smbshare_prop, "off") == 0) { + (void) fprintf(stderr, gettext("cannot unshare " + "'%s': legacy share\n"), path); + (void) fprintf(stderr, gettext("use exportfs(8) " + "or smbcontrol(1) to unshare this filesystem\n")); + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot unshare '%s': " + "not currently shared\n"), path); + } else { + ret = zfs_unshareall_bypath(zhp, path); + zfs_commit_all_shares(); + } + } else { + char mtpt_prop[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, mtpt_prop, + sizeof (mtpt_prop), NULL, NULL, 0, B_FALSE) == 0); + + if (is_manual) { + ret = zfs_unmount(zhp, NULL, flags); + } else if (strcmp(mtpt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot unmount " + "'%s': legacy mountpoint\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use umount(8) " + "to unmount this filesystem\n")); + } else { + ret = zfs_unmountall(zhp, flags); + } + } + +out: + zfs_close(zhp); + + return (ret != 0); +} + +/* + * Generic callback for unsharing or unmounting a filesystem. + */ +static int +unshare_unmount(int op, int argc, char **argv) +{ + int do_all = 0; + int flags = 0; + int ret = 0; + int c; + zfs_handle_t *zhp; + char nfs_mnt_prop[ZFS_MAXPROPLEN]; + char sharesmb[ZFS_MAXPROPLEN]; + + /* check options */ + while ((c = getopt(argc, argv, op == OP_SHARE ?
":a" : "afu")) != -1) { + switch (c) { + case 'a': + do_all = 1; + break; + case 'f': + flags |= MS_FORCE; + break; + case 'u': + flags |= MS_CRYPT; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (do_all) { + /* + * We could make use of zfs_for_each() to walk all datasets in + * the system, but this would be very inefficient, especially + * since we would have to linearly search /proc/self/mounts for + * each one. Instead, do one pass through /proc/self/mounts + * looking for zfs entries and call zfs_unmount() for each one. + * + * Things get a little tricky if the administrator has created + * mountpoints beneath other ZFS filesystems. In this case, we + * have to unmount the deepest filesystems first. To accomplish + * this, we place all the mountpoints in an AVL tree sorted by + * the special type (dataset name), and walk the result in + * reverse to make sure to get any snapshots first. + */ + struct mnttab entry; + uu_avl_pool_t *pool; + uu_avl_t *tree = NULL; + unshare_unmount_node_t *node; + uu_avl_index_t idx; + uu_avl_walk_t *walk; + char *protocol = NULL; + + if (op == OP_SHARE && argc > 0) { + if (strcmp(argv[0], "nfs") != 0 && + strcmp(argv[0], "smb") != 0) { + (void) fprintf(stderr, gettext("share type " + "must be 'nfs' or 'smb'\n")); + usage(B_FALSE); + } + protocol = argv[0]; + argc--; + argv++; + } + + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (((pool = uu_avl_pool_create("unmount_pool", + sizeof (unshare_unmount_node_t), + offsetof(unshare_unmount_node_t, un_avlnode), + unshare_unmount_compare, UU_DEFAULT)) == NULL) || + ((tree = uu_avl_create(pool, NULL, UU_DEFAULT)) == NULL)) + nomem(); + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", mnttab_file) == NULL) + return (ENOENT); + + while (getmntent(mnttab_file, &entry) == 0) { + + /* ignore non-ZFS entries */ + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + /* ignore snapshots */ + if (strchr(entry.mnt_special, '@') != NULL) + continue; + + if ((zhp = zfs_open(g_zfs, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) { + ret = 1; + continue; + } + + /* + * Ignore datasets that are excluded/restricted by + * parent pool name. 
+ */ + if (zpool_skip_pool(zfs_get_pool_name(zhp))) { + zfs_close(zhp); + continue; + } + + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "off") != 0) + break; + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "off") == 0) + continue; + break; + case OP_MOUNT: + /* Ignore legacy mounts */ + verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(nfs_mnt_prop, "legacy") == 0) + continue; + /* Ignore canmount=noauto mounts */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == + ZFS_CANMOUNT_NOAUTO) + continue; + default: + break; + } + + node = safe_malloc(sizeof (unshare_unmount_node_t)); + node->un_zhp = zhp; + node->un_mountp = safe_strdup(entry.mnt_mountp); + + uu_avl_node_init(node, &node->un_avlnode, pool); + + if (uu_avl_find(tree, node, NULL, &idx) == NULL) { + uu_avl_insert(tree, node, idx); + } else { + zfs_close(node->un_zhp); + free(node->un_mountp); + free(node); + } + } + + /* + * Walk the AVL tree in reverse, unmounting each filesystem and + * removing it from the AVL tree in the process. + */ + if ((walk = uu_avl_walk_start(tree, + UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) + nomem(); + + while ((node = uu_avl_walk_next(walk)) != NULL) { + const char *mntarg = NULL; + + uu_avl_remove(tree, node); + switch (op) { + case OP_SHARE: + if (zfs_unshareall_bytype(node->un_zhp, + node->un_mountp, protocol) != 0) + ret = 1; + break; + + case OP_MOUNT: + if (zfs_unmount(node->un_zhp, + mntarg, flags) != 0) + ret = 1; + break; + } + + zfs_close(node->un_zhp); + free(node->un_mountp); + free(node); + } + + if (op == OP_SHARE) + zfs_commit_shares(protocol); + + uu_avl_walk_end(walk); + uu_avl_destroy(tree); + uu_avl_pool_destroy(pool); + + } else { + if (argc != 1) { + if (argc == 0) + (void) fprintf(stderr, + gettext("missing filesystem argument\n")); + else + (void) fprintf(stderr, + gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * We have an argument, but it may be a full path or a ZFS + * filesystem. Pass full paths off to unmount_path() (shared by + * manual_unmount), otherwise open the filesystem and pass to + * zfs_unmount(). + */ + if (argv[0][0] == '/') + return (unshare_unmount_path(op, argv[0], + flags, B_FALSE)); + + if ((zhp = zfs_open(g_zfs, argv[0], + ZFS_TYPE_FILESYSTEM)) == NULL) + return (1); + + verify(zfs_prop_get(zhp, op == OP_SHARE ? 
+ ZFS_PROP_SHARENFS : ZFS_PROP_MOUNTPOINT, + nfs_mnt_prop, sizeof (nfs_mnt_prop), NULL, + NULL, 0, B_FALSE) == 0); + + switch (op) { + case OP_SHARE: + verify(zfs_prop_get(zhp, ZFS_PROP_SHARENFS, + nfs_mnt_prop, + sizeof (nfs_mnt_prop), + NULL, NULL, 0, B_FALSE) == 0); + verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, + sharesmb, sizeof (sharesmb), NULL, NULL, + 0, B_FALSE) == 0); + + if (strcmp(nfs_mnt_prop, "off") == 0 && + strcmp(sharesmb, "off") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': legacy share\n"), + zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "unshare(1M) to unshare this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_shared(zhp)) { + (void) fprintf(stderr, gettext("cannot " + "unshare '%s': not currently " + "shared\n"), zfs_get_name(zhp)); + ret = 1; + } else if (zfs_unshareall(zhp) != 0) { + ret = 1; + } + break; + + case OP_MOUNT: + if (strcmp(nfs_mnt_prop, "legacy") == 0) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': legacy " + "mountpoint\n"), zfs_get_name(zhp)); + (void) fprintf(stderr, gettext("use " + "umount(1M) to unmount this " + "filesystem\n")); + ret = 1; + } else if (!zfs_is_mounted(zhp, NULL)) { + (void) fprintf(stderr, gettext("cannot " + "unmount '%s': not currently " + "mounted\n"), + zfs_get_name(zhp)); + ret = 1; + } else if (zfs_unmountall(zhp, flags) != 0) { + ret = 1; + } + break; + } + + zfs_close(zhp); + } + + return (ret); +} + +/* + * zfs unmount [-fu] -a + * zfs unmount [-fu] filesystem + * + * Unmount all filesystems, or a specific ZFS filesystem. + */ +static int +zfs_do_unmount(int argc, char **argv) +{ + return (unshare_unmount(OP_MOUNT, argc, argv)); +} + +/* + * zfs unshare -a + * zfs unshare filesystem + * + * Unshare all filesystems, or a specific ZFS filesystem. + */ +static int +zfs_do_unshare(int argc, char **argv) +{ + return (unshare_unmount(OP_SHARE, argc, argv)); +} + +static int +find_command_idx(char *command, int *idx) +{ + int i; + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + continue; + + if (strcmp(command, command_table[i].name) == 0) { + *idx = i; + return (0); + } + } + return (1); +} + +static int +zfs_do_diff(int argc, char **argv) +{ + zfs_handle_t *zhp; + int flags = 0; + char *tosnap = NULL; + char *fromsnap = NULL; + char *atp, *copy; + int err = 0; + int c; + struct sigaction sa; + + while ((c = getopt(argc, argv, "FHt")) != -1) { + switch (c) { + case 'F': + flags |= ZFS_DIFF_CLASSIFY; + break; + case 'H': + flags |= ZFS_DIFF_PARSEABLE; + break; + case 't': + flags |= ZFS_DIFF_TIMESTAMP; + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, + gettext("must provide at least one snapshot name\n")); + usage(B_FALSE); + } + + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + fromsnap = argv[0]; + tosnap = (argc == 2) ? 
argv[1] : NULL; + + copy = NULL; + if (*fromsnap != '@') + copy = strdup(fromsnap); + else if (tosnap) + copy = strdup(tosnap); + if (copy == NULL) + usage(B_FALSE); + + if ((atp = strchr(copy, '@')) != NULL) + *atp = '\0'; + + if ((zhp = zfs_open(g_zfs, copy, ZFS_TYPE_FILESYSTEM)) == NULL) { + free(copy); + return (1); + } + free(copy); + + /* + * Ignore SIGPIPE so that the library can give us + * information on any failure + */ + if (sigemptyset(&sa.sa_mask) == -1) { + err = errno; + goto out; + } + sa.sa_flags = 0; + sa.sa_handler = SIG_IGN; + if (sigaction(SIGPIPE, &sa, NULL) == -1) { + err = errno; + goto out; + } + + err = zfs_show_diffs(zhp, STDOUT_FILENO, fromsnap, tosnap, flags); +out: + zfs_close(zhp); + + return (err != 0); +} + +/* + * zfs bookmark <fs@source>|<fs#source> <fs#bookmark> + * + * Creates a bookmark with the given name from the source snapshot + * or creates a copy of an existing source bookmark. + */ +static int +zfs_do_bookmark(int argc, char **argv) +{ + char *source, *bookname; + char expbuf[ZFS_MAX_DATASET_NAME_LEN]; + int source_type; + nvlist_t *nvl; + int ret = 0; + int c; + + /* check options */ + while ((c = getopt(argc, argv, "")) != -1) { + switch (c) { + case '?': + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + /* check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing source argument\n")); + goto usage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing bookmark argument\n")); + goto usage; + } + + source = argv[0]; + bookname = argv[1]; + + if (strchr(source, '@') == NULL && strchr(source, '#') == NULL) { + (void) fprintf(stderr, + gettext("invalid source name '%s': " + "must contain a '@' or '#'\n"), source); + goto usage; + } + if (strchr(bookname, '#') == NULL) { + (void) fprintf(stderr, + gettext("invalid bookmark name '%s': " + "must contain a '#'\n"), bookname); + goto usage; + } + + /* + * expand source or bookname to full path: + * one of them may be specified as short name + */ + { + char **expand; + char *source_short, *bookname_short; + source_short = strpbrk(source, "@#"); + bookname_short = strpbrk(bookname, "#"); + if (source_short == source && + bookname_short == bookname) { + (void) fprintf(stderr, gettext( + "either source or bookmark must be specified as " + "full dataset paths")); + goto usage; + } else if (source_short != source && + bookname_short != bookname) { + expand = NULL; + } else if (source_short != source) { + strlcpy(expbuf, source, sizeof (expbuf)); + expand = &bookname; + } else if (bookname_short != bookname) { + strlcpy(expbuf, bookname, sizeof (expbuf)); + expand = &source; + } else { + abort(); + } + if (expand != NULL) { + *strpbrk(expbuf, "@#") = '\0'; /* dataset name in buf */ + (void) strlcat(expbuf, *expand, sizeof (expbuf)); + *expand = expbuf; + } + } + + /* determine source type */ + switch (*strpbrk(source, "@#")) { + case '@': source_type = ZFS_TYPE_SNAPSHOT; break; + case '#': source_type = ZFS_TYPE_BOOKMARK; break; + default: abort(); + } + + /* test the source exists */ + zfs_handle_t *zhp; + zhp = zfs_open(g_zfs, source, source_type); + if (zhp == NULL) + goto usage; + zfs_close(zhp); + + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, bookname, source); + ret = lzc_bookmark(nvl, NULL); + fnvlist_free(nvl); + + if (ret != 0) { + const char *err_msg = NULL; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create bookmark '%s'"), bookname); + + switch (ret)
{ + case EXDEV: + err_msg = "bookmark is in a different pool"; + break; + case ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: + err_msg = "source is not an ancestor of the " + "new bookmark's dataset"; + break; + case EEXIST: + err_msg = "bookmark exists"; + break; + case EINVAL: + err_msg = "invalid argument"; + break; + case ENOTSUP: + err_msg = "bookmark feature not enabled"; + break; + case ENOSPC: + err_msg = "out of space"; + break; + case ENOENT: + err_msg = "dataset does not exist"; + break; + default: + (void) zfs_standard_error(g_zfs, ret, errbuf); + break; + } + if (err_msg != NULL) { + (void) fprintf(stderr, "%s: %s\n", errbuf, + dgettext(TEXT_DOMAIN, err_msg)); + } + } + + return (ret != 0); + +usage: + usage(B_FALSE); + return (-1); +} + +static int +zfs_do_channel_program(int argc, char **argv) +{ + int ret, fd, c; + char *progbuf, *filename, *poolname; + size_t progsize, progread; + nvlist_t *outnvl = NULL; + uint64_t instrlimit = ZCP_DEFAULT_INSTRLIMIT; + uint64_t memlimit = ZCP_DEFAULT_MEMLIMIT; + boolean_t sync_flag = B_TRUE, json_output = B_FALSE; + zpool_handle_t *zhp; + + /* check options */ + while ((c = getopt(argc, argv, "nt:m:j")) != -1) { + switch (c) { + case 't': + case 'm': { + uint64_t arg; + char *endp; + + errno = 0; + arg = strtoull(optarg, &endp, 0); + if (errno != 0 || *endp != '\0') { + (void) fprintf(stderr, gettext( + "invalid argument " + "'%s': expected integer\n"), optarg); + goto usage; + } + + if (c == 't') { + instrlimit = arg; + } else { + ASSERT3U(c, ==, 'm'); + memlimit = arg; + } + break; + } + case 'n': { + sync_flag = B_FALSE; + break; + } + case 'j': { + json_output = B_TRUE; + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto usage; + } + } + + argc -= optind; + argv += optind; + + if (argc < 2) { + (void) fprintf(stderr, + gettext("invalid number of arguments\n")); + goto usage; + } + + poolname = argv[0]; + filename = argv[1]; + if (strcmp(filename, "-") == 0) { + fd = 0; + filename = "standard input"; + } else if ((fd = open(filename, O_RDONLY)) < 0) { + (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), + filename, strerror(errno)); + return (1); + } + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) { + (void) fprintf(stderr, gettext("cannot open pool '%s'\n"), + poolname); + if (fd != 0) + (void) close(fd); + return (1); + } + zpool_close(zhp); + + /* + * Read in the channel program, expanding the program buffer as + * necessary. + */ + progread = 0; + progsize = 1024; + progbuf = safe_malloc(progsize); + do { + ret = read(fd, progbuf + progread, progsize - progread); + progread += ret; + if (progread == progsize && ret > 0) { + progsize *= 2; + progbuf = safe_realloc(progbuf, progsize); + } + } while (ret > 0); + + if (fd != 0) + (void) close(fd); + if (ret < 0) { + free(progbuf); + (void) fprintf(stderr, + gettext("cannot read '%s': %s\n"), + filename, strerror(errno)); + return (1); + } + progbuf[progread] = '\0'; + + /* + * Any remaining arguments are passed as arguments to the lua script as + * a string array: + * { + * "argv" -> [ "arg 1", ... 
"arg n" ], + * } + */ + nvlist_t *argnvl = fnvlist_alloc(); + fnvlist_add_string_array(argnvl, ZCP_ARG_CLIARGV, argv + 2, argc - 2); + + if (sync_flag) { + ret = lzc_channel_program(poolname, progbuf, + instrlimit, memlimit, argnvl, &outnvl); + } else { + ret = lzc_channel_program_nosync(poolname, progbuf, + instrlimit, memlimit, argnvl, &outnvl); + } + + if (ret != 0) { + /* + * On error, report the error message handed back by lua if one + * exists. Otherwise, generate an appropriate error message, + * falling back on strerror() for an unexpected return code. + */ + char *errstring = NULL; + const char *msg = gettext("Channel program execution failed"); + uint64_t instructions = 0; + if (outnvl != NULL && nvlist_exists(outnvl, ZCP_RET_ERROR)) { + (void) nvlist_lookup_string(outnvl, + ZCP_RET_ERROR, &errstring); + if (errstring == NULL) + errstring = strerror(ret); + if (ret == ETIME) { + (void) nvlist_lookup_uint64(outnvl, + ZCP_ARG_INSTRLIMIT, &instructions); + } + } else { + switch (ret) { + case EINVAL: + errstring = + "Invalid instruction or memory limit."; + break; + case ENOMEM: + errstring = "Return value too large."; + break; + case ENOSPC: + errstring = "Memory limit exhausted."; + break; + case ETIME: + errstring = "Timed out."; + break; + case EPERM: + errstring = "Permission denied. Channel " + "programs must be run as root."; + break; + default: + (void) zfs_standard_error(g_zfs, ret, msg); + } + } + if (errstring != NULL) + (void) fprintf(stderr, "%s:\n%s\n", msg, errstring); + + if (ret == ETIME && instructions != 0) + (void) fprintf(stderr, + gettext("%llu Lua instructions\n"), + (u_longlong_t)instructions); + } else { + if (json_output) { + (void) nvlist_print_json(stdout, outnvl); + } else if (nvlist_empty(outnvl)) { + (void) fprintf(stdout, gettext("Channel program fully " + "executed and did not produce output.\n")); + } else { + (void) fprintf(stdout, gettext("Channel program fully " + "executed and produced output:\n")); + dump_nvlist(outnvl, 4); + } + } + + free(progbuf); + fnvlist_free(outnvl); + fnvlist_free(argnvl); + return (ret != 0); + +usage: + usage(B_FALSE); + return (-1); +} + + +typedef struct loadkey_cbdata { + boolean_t cb_loadkey; + boolean_t cb_recursive; + boolean_t cb_noop; + char *cb_keylocation; + uint64_t cb_numfailed; + uint64_t cb_numattempted; +} loadkey_cbdata_t; + +static int +load_key_callback(zfs_handle_t *zhp, void *data) +{ + int ret; + boolean_t is_encroot; + loadkey_cbdata_t *cb = data; + uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + + /* + * If we are working recursively, we want to skip loading / unloading + * keys for non-encryption roots and datasets whose keys are already + * in the desired end-state. 
+ */ + if (cb->cb_recursive) { + ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); + if (ret != 0) + return (ret); + if (!is_encroot) + return (0); + + if ((cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_AVAILABLE) || + (!cb->cb_loadkey && keystatus == ZFS_KEYSTATUS_UNAVAILABLE)) + return (0); + } + + cb->cb_numattempted++; + + if (cb->cb_loadkey) + ret = zfs_crypto_load_key(zhp, cb->cb_noop, cb->cb_keylocation); + else + ret = zfs_crypto_unload_key(zhp); + + if (ret != 0) { + cb->cb_numfailed++; + return (ret); + } + + return (0); +} + +static int +load_unload_keys(int argc, char **argv, boolean_t loadkey) +{ + int c, ret = 0, flags = 0; + boolean_t do_all = B_FALSE; + loadkey_cbdata_t cb = { 0 }; + + cb.cb_loadkey = loadkey; + + while ((c = getopt(argc, argv, "anrL:")) != -1) { + /* noop and alternate keylocations only apply to zfs load-key */ + if (loadkey) { + switch (c) { + case 'n': + cb.cb_noop = B_TRUE; + continue; + case 'L': + cb.cb_keylocation = optarg; + continue; + default: + break; + } + } + + switch (c) { + case 'a': + do_all = B_TRUE; + cb.cb_recursive = B_TRUE; + break; + case 'r': + flags |= ZFS_ITER_RECURSE; + cb.cb_recursive = B_TRUE; + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (!do_all && argc == 0) { + (void) fprintf(stderr, + gettext("Missing dataset argument or -a option\n")); + usage(B_FALSE); + } + + if (do_all && argc != 0) { + (void) fprintf(stderr, + gettext("Cannot specify dataset with -a option\n")); + usage(B_FALSE); + } + + if (cb.cb_recursive && cb.cb_keylocation != NULL && + strcmp(cb.cb_keylocation, "prompt") != 0) { + (void) fprintf(stderr, gettext("alternate keylocation may only " + "be 'prompt' with -r or -a\n")); + usage(B_FALSE); + } + + ret = zfs_for_each(argc, argv, flags, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, NULL, NULL, 0, + load_key_callback, &cb); + + if (cb.cb_noop || (cb.cb_recursive && cb.cb_numattempted != 0)) { + (void) printf(gettext("%llu / %llu key(s) successfully %s\n"), + (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), + (u_longlong_t)cb.cb_numattempted, + loadkey ? (cb.cb_noop ? 
"verified" : "loaded") : + "unloaded"); + } + + if (cb.cb_numfailed != 0) + ret = -1; + + return (ret); +} + +static int +zfs_do_load_key(int argc, char **argv) +{ + return (load_unload_keys(argc, argv, B_TRUE)); +} + + +static int +zfs_do_unload_key(int argc, char **argv) +{ + return (load_unload_keys(argc, argv, B_FALSE)); +} + +static int +zfs_do_change_key(int argc, char **argv) +{ + int c, ret; + uint64_t keystatus; + boolean_t loadkey = B_FALSE, inheritkey = B_FALSE; + zfs_handle_t *zhp = NULL; + nvlist_t *props = fnvlist_alloc(); + + while ((c = getopt(argc, argv, "lio:")) != -1) { + switch (c) { + case 'l': + loadkey = B_TRUE; + break; + case 'i': + inheritkey = B_TRUE; + break; + case 'o': + if (!parseprop(props, optarg)) { + nvlist_free(props); + return (1); + } + break; + default: + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + usage(B_FALSE); + } + } + + if (inheritkey && !nvlist_empty(props)) { + (void) fprintf(stderr, + gettext("Properties not allowed for inheriting\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing dataset argument\n")); + usage(B_FALSE); + } + + if (argc > 1) { + (void) fprintf(stderr, gettext("Too many arguments\n")); + usage(B_FALSE); + } + + zhp = zfs_open(g_zfs, argv[argc - 1], + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + usage(B_FALSE); + + if (loadkey) { + keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + if (keystatus != ZFS_KEYSTATUS_AVAILABLE) { + ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); + if (ret != 0) { + nvlist_free(props); + zfs_close(zhp); + return (-1); + } + } + + /* refresh the properties so the new keystatus is visible */ + zfs_refresh_properties(zhp); + } + + ret = zfs_crypto_rewrap(zhp, props, inheritkey); + if (ret != 0) { + nvlist_free(props); + zfs_close(zhp); + return (-1); + } + + nvlist_free(props); + zfs_close(zhp); + return (0); +} + +/* + * 1) zfs project [-d|-r] + * List project ID and inherit flag of file(s) or directories. + * -d: List the directory itself, not its children. + * -r: List subdirectories recursively. + * + * 2) zfs project -C [-k] [-r] + * Clear project inherit flag and/or ID on the file(s) or directories. + * -k: Keep the project ID unchanged. If not specified, the project ID + * will be reset as zero. + * -r: Clear on subdirectories recursively. + * + * 3) zfs project -c [-0] [-d|-r] [-p id] + * Check project ID and inherit flag on the file(s) or directories, + * report the outliers. + * -0: Print file name followed by a NUL instead of newline. + * -d: Check the directory itself, not its children. + * -p: Specify the referenced ID for comparing with the target file(s) + * or directories' project IDs. If not specified, the target (top) + * directory's project ID will be used as the referenced one. + * -r: Check subdirectories recursively. + * + * 4) zfs project [-p id] [-r] [-s] + * Set project ID and/or inherit flag on the file(s) or directories. + * -p: Set the project ID as the given id. + * -r: Set on subdirectories recursively. If not specify "-p" option, + * it will use top-level directory's project ID as the given id, + * then set both project ID and inherit flag on all descendants + * of the top-level directory. + * -s: Set project inherit flag. 
+ */ +static int +zfs_do_project(int argc, char **argv) +{ + zfs_project_control_t zpc = { + .zpc_expected_projid = ZFS_INVALID_PROJID, + .zpc_op = ZFS_PROJECT_OP_DEFAULT, + .zpc_dironly = B_FALSE, + .zpc_keep_projid = B_FALSE, + .zpc_newline = B_TRUE, + .zpc_recursive = B_FALSE, + .zpc_set_flag = B_FALSE, + }; + int ret = 0, c; + + if (argc < 2) + usage(B_FALSE); + + while ((c = getopt(argc, argv, "0Ccdkp:rs")) != -1) { + switch (c) { + case '0': + zpc.zpc_newline = B_FALSE; + break; + case 'C': + if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { + (void) fprintf(stderr, gettext("cannot " + "specify '-C' '-c' '-s' together\n")); + usage(B_FALSE); + } + + zpc.zpc_op = ZFS_PROJECT_OP_CLEAR; + break; + case 'c': + if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { + (void) fprintf(stderr, gettext("cannot " + "specify '-C' '-c' '-s' together\n")); + usage(B_FALSE); + } + + zpc.zpc_op = ZFS_PROJECT_OP_CHECK; + break; + case 'd': + zpc.zpc_dironly = B_TRUE; + /* overwrite "-r" option */ + zpc.zpc_recursive = B_FALSE; + break; + case 'k': + zpc.zpc_keep_projid = B_TRUE; + break; + case 'p': { + char *endptr; + + errno = 0; + zpc.zpc_expected_projid = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("project ID must be less than " + "%u\n"), UINT32_MAX); + usage(B_FALSE); + } + if (zpc.zpc_expected_projid >= UINT32_MAX) { + (void) fprintf(stderr, + gettext("invalid project ID\n")); + usage(B_FALSE); + } + break; + } + case 'r': + zpc.zpc_recursive = B_TRUE; + /* overwrite "-d" option */ + zpc.zpc_dironly = B_FALSE; + break; + case 's': + if (zpc.zpc_op != ZFS_PROJECT_OP_DEFAULT) { + (void) fprintf(stderr, gettext("cannot " + "specify '-C' '-c' '-s' together\n")); + usage(B_FALSE); + } + + zpc.zpc_set_flag = B_TRUE; + zpc.zpc_op = ZFS_PROJECT_OP_SET; + break; + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + if (zpc.zpc_op == ZFS_PROJECT_OP_DEFAULT) { + if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) + zpc.zpc_op = ZFS_PROJECT_OP_SET; + else + zpc.zpc_op = ZFS_PROJECT_OP_LIST; + } + + switch (zpc.zpc_op) { + case ZFS_PROJECT_OP_LIST: + if (zpc.zpc_keep_projid) { + (void) fprintf(stderr, + gettext("'-k' is only valid together with '-C'\n")); + usage(B_FALSE); + } + if (!zpc.zpc_newline) { + (void) fprintf(stderr, + gettext("'-0' is only valid together with '-c'\n")); + usage(B_FALSE); + } + break; + case ZFS_PROJECT_OP_CHECK: + if (zpc.zpc_keep_projid) { + (void) fprintf(stderr, + gettext("'-k' is only valid together with '-C'\n")); + usage(B_FALSE); + } + break; + case ZFS_PROJECT_OP_CLEAR: + if (zpc.zpc_dironly) { + (void) fprintf(stderr, + gettext("'-d' is useless together with '-C'\n")); + usage(B_FALSE); + } + if (!zpc.zpc_newline) { + (void) fprintf(stderr, + gettext("'-0' is only valid together with '-c'\n")); + usage(B_FALSE); + } + if (zpc.zpc_expected_projid != ZFS_INVALID_PROJID) { + (void) fprintf(stderr, + gettext("'-p' is useless together with '-C'\n")); + usage(B_FALSE); + } + break; + case ZFS_PROJECT_OP_SET: + if (zpc.zpc_dironly) { + (void) fprintf(stderr, + gettext("'-d' is useless for set project ID and/or " + "inherit flag\n")); + usage(B_FALSE); + } + if (zpc.zpc_keep_projid) { + (void) fprintf(stderr, + gettext("'-k' is only valid together with '-C'\n")); + usage(B_FALSE); + } + if (!zpc.zpc_newline) { + (void) fprintf(stderr, + gettext("'-0' is only valid together with '-c'\n")); + usage(B_FALSE); + } + break; + default: + ASSERT(0); + break; + } + + argv += optind; + 
argc -= optind; + if (argc == 0) { + (void) fprintf(stderr, + gettext("missing file or directory target(s)\n")); + usage(B_FALSE); + } + + for (int i = 0; i < argc; i++) { + int err; + + err = zfs_project_handle(argv[i], &zpc); + if (err && !ret) + ret = err; + } + + return (ret); +} + +static int +zfs_do_wait(int argc, char **argv) +{ + boolean_t enabled[ZFS_WAIT_NUM_ACTIVITIES]; + int error, i; + char c; + + /* By default, wait for all types of activity. */ + for (i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) + enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "t:")) != -1) { + switch (c) { + case 't': + { + static char *col_subopts[] = { "deleteq", NULL }; + char *value; + + /* Reset activities array */ + bzero(&enabled, sizeof (enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argv += optind; + argc -= optind; + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'filesystem' " + "argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + zfs_handle_t *zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_FILESYSTEM); + if (zhp == NULL) + return (1); + + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (int i = 0; i < ZFS_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!enabled[i]) + continue; + + error = zfs_wait_status(zhp, i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zfs_close(zhp); + + return (error); +} + +/* + * Display version message + */ +static int +zfs_do_version(int argc, char **argv) +{ + if (zfs_version_print() == -1) + return (1); + + return (0); +} + +int +main(int argc, char **argv) +{ + int ret = 0; + int i = 0; + char *cmdname; + char **newargv; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + + opterr = 0; + + /* + * Make sure the user has specified some command. + */ + if (argc < 2) { + (void) fprintf(stderr, gettext("missing command\n")); + usage(B_FALSE); + } + + cmdname = argv[1]; + + /* + * The 'umount' command is an alias for 'unmount' + */ + if (strcmp(cmdname, "umount") == 0) + cmdname = "unmount"; + + /* + * The 'recv' command is an alias for 'receive' + */ + if (strcmp(cmdname, "recv") == 0) + cmdname = "receive"; + + /* + * The 'snap' command is an alias for 'snapshot' + */ + if (strcmp(cmdname, "snap") == 0) + cmdname = "snapshot"; + + /* + * Special case '-?' + */ + if ((strcmp(cmdname, "-?") == 0) || + (strcmp(cmdname, "--help") == 0)) + usage(B_TRUE); + + /* + * Special case '-V|--version' + */ + if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) + return (zfs_do_version(argc, argv)); + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (1); + } + + mnttab_file = g_zfs->libzfs_mnttab; + + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); + + libzfs_print_on_error(g_zfs, B_TRUE); + + /* + * Many commands modify input strings for string parsing reasons. + * We create a copy to protect the original argv. 
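+ * (Both the duplicated strings and the array itself are freed in the + * cleanup loop once the command has run.)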
+	 */
+	newargv = malloc((argc + 1) * sizeof (newargv[0]));
+	for (i = 0; i < argc; i++)
+		newargv[i] = strdup(argv[i]);
+	newargv[argc] = NULL;
+
+	/*
+	 * Run the appropriate command.
+	 */
+	libzfs_mnttab_cache(g_zfs, B_TRUE);
+	if (find_command_idx(cmdname, &i) == 0) {
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc - 1, newargv + 1);
+	} else if (strchr(cmdname, '=') != NULL) {
+		verify(find_command_idx("set", &i) == 0);
+		current_command = &command_table[i];
+		ret = command_table[i].func(argc, newargv);
+	} else {
+		(void) fprintf(stderr, gettext("unrecognized "
+		    "command '%s'\n"), cmdname);
+		usage(B_FALSE);
+		ret = 1;
+	}
+
+	for (i = 0; i < argc; i++)
+		free(newargv[i]);
+	free(newargv);
+
+	if (ret == 0 && log_history)
+		(void) zpool_log_history(g_zfs, history_str);
+
+	libzfs_fini(g_zfs);
+
+	/*
+	 * The 'ZFS_ABORT' environment variable causes us to dump core on exit
+	 * for the purposes of running ::findleaks.
+	 */
+	if (getenv("ZFS_ABORT") != NULL) {
+		(void) printf("dumping core by request\n");
+		abort();
+	}
+
+	return (ret);
+}
+
+#ifdef __FreeBSD__
+#include <sys/jail.h>
+#include <jail.h>
+/*
+ * Attach/detach the given dataset to/from the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_jail_impl(int argc, char **argv, boolean_t attach)
+{
+	zfs_handle_t *zhp;
+	int jailid, ret;
+
+	/* check number of arguments */
+	if (argc < 3) {
+		(void) fprintf(stderr, gettext("missing argument(s)\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 3) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	jailid = jail_getid(argv[1]);
+	if (jailid < 0) {
+		(void) fprintf(stderr, gettext("invalid jail id or name\n"));
+		usage(B_FALSE);
+	}
+
+	zhp = zfs_open(g_zfs, argv[2], ZFS_TYPE_FILESYSTEM);
+	if (zhp == NULL)
+		return (1);
+
+	ret = (zfs_jail(zhp, jailid, attach) != 0);
+
+	zfs_close(zhp);
+	return (ret);
+}
+
+/*
+ * zfs jail jailid filesystem
+ *
+ * Attach the given dataset to the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_jail(int argc, char **argv)
+{
+	return (zfs_do_jail_impl(argc, argv, B_TRUE));
+}
+
+/*
+ * zfs unjail jailid filesystem
+ *
+ * Detach the given dataset from the given jail
+ */
+/* ARGSUSED */
+static int
+zfs_do_unjail(int argc, char **argv)
+{
+	return (zfs_do_jail_impl(argc, argv, B_FALSE));
+}
+#endif
diff --git a/cmd/zfs/zfs_project.c b/cmd/zfs/zfs_project.c
new file mode 100644
index 000000000000..341cc005de48
--- /dev/null
+++ b/cmd/zfs/zfs_project.c
@@ -0,0 +1,295 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_util.h" +#include "zfs_projectutil.h" + +typedef struct zfs_project_item { + list_node_t zpi_list; + char zpi_name[0]; +} zfs_project_item_t; + +static void +zfs_project_item_alloc(list_t *head, const char *name) +{ + zfs_project_item_t *zpi; + + zpi = safe_malloc(sizeof (zfs_project_item_t) + strlen(name) + 1); + strcpy(zpi->zpi_name, name); + list_insert_tail(head, zpi); +} + +static int +zfs_project_sanity_check(const char *name, zfs_project_control_t *zpc, + struct stat *st) +{ + int ret; + + ret = stat(name, st); + if (ret) { + (void) fprintf(stderr, gettext("failed to stat %s: %s\n"), + name, strerror(errno)); + return (ret); + } + + if (!S_ISREG(st->st_mode) && !S_ISDIR(st->st_mode)) { + (void) fprintf(stderr, gettext("only support project quota on " + "regular file or directory\n")); + return (-1); + } + + if (!S_ISDIR(st->st_mode)) { + if (zpc->zpc_dironly) { + (void) fprintf(stderr, gettext( + "'-d' option on non-dir target %s\n"), name); + return (-1); + } + + if (zpc->zpc_recursive) { + (void) fprintf(stderr, gettext( + "'-r' option on non-dir target %s\n"), name); + return (-1); + } + } + + return (0); +} + +static int +zfs_project_load_projid(const char *name, zfs_project_control_t *zpc) +{ + zfsxattr_t fsx; + int ret, fd; + + fd = open(name, O_RDONLY | O_NOCTTY); + if (fd < 0) { + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + name, strerror(errno)); + return (fd); + } + + ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx); + if (ret) + (void) fprintf(stderr, + gettext("failed to get xattr for %s: %s\n"), + name, strerror(errno)); + else + zpc->zpc_expected_projid = fsx.fsx_projid; + + close(fd); + return (ret); +} + +static int +zfs_project_handle_one(const char *name, zfs_project_control_t *zpc) +{ + zfsxattr_t fsx; + int ret, fd; + + fd = open(name, O_RDONLY | O_NOCTTY); + if (fd < 0) { + if (errno == ENOENT && zpc->zpc_ignore_noent) + return (0); + + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + name, strerror(errno)); + return (fd); + } + + ret = ioctl(fd, ZFS_IOC_FSGETXATTR, &fsx); + if (ret) { + (void) fprintf(stderr, + gettext("failed to get xattr for %s: %s\n"), + name, strerror(errno)); + goto out; + } + + switch (zpc->zpc_op) { + case ZFS_PROJECT_OP_LIST: + (void) printf("%5u %c %s\n", fsx.fsx_projid, + (fsx.fsx_xflags & ZFS_PROJINHERIT_FL) ? 
'P' : '-', name); + goto out; + case ZFS_PROJECT_OP_CHECK: + if (fsx.fsx_projid == zpc->zpc_expected_projid && + fsx.fsx_xflags & ZFS_PROJINHERIT_FL) + goto out; + + if (!zpc->zpc_newline) { + char c = '\0'; + + (void) printf("%s%c", name, c); + goto out; + } + + if (fsx.fsx_projid != zpc->zpc_expected_projid) + (void) printf("%s - project ID is not set properly " + "(%u/%u)\n", name, fsx.fsx_projid, + (uint32_t)zpc->zpc_expected_projid); + + if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL)) + (void) printf("%s - project inherit flag is not set\n", + name); + + goto out; + case ZFS_PROJECT_OP_CLEAR: + if (!(fsx.fsx_xflags & ZFS_PROJINHERIT_FL) && + (zpc->zpc_keep_projid || + fsx.fsx_projid == ZFS_DEFAULT_PROJID)) + goto out; + + fsx.fsx_xflags &= ~ZFS_PROJINHERIT_FL; + if (!zpc->zpc_keep_projid) + fsx.fsx_projid = ZFS_DEFAULT_PROJID; + break; + case ZFS_PROJECT_OP_SET: + if (fsx.fsx_projid == zpc->zpc_expected_projid && + (!zpc->zpc_set_flag || fsx.fsx_xflags & ZFS_PROJINHERIT_FL)) + goto out; + + fsx.fsx_projid = zpc->zpc_expected_projid; + if (zpc->zpc_set_flag) + fsx.fsx_xflags |= ZFS_PROJINHERIT_FL; + break; + default: + ASSERT(0); + break; + } + + ret = ioctl(fd, ZFS_IOC_FSSETXATTR, &fsx); + if (ret) + (void) fprintf(stderr, + gettext("failed to set xattr for %s: %s\n"), + name, strerror(errno)); + +out: + close(fd); + return (ret); +} + +static int +zfs_project_handle_dir(const char *name, zfs_project_control_t *zpc, + list_t *head) +{ + char fullname[PATH_MAX]; + struct dirent *ent; + DIR *dir; + int ret = 0; + + dir = opendir(name); + if (dir == NULL) { + if (errno == ENOENT && zpc->zpc_ignore_noent) + return (0); + + ret = -errno; + (void) fprintf(stderr, gettext("failed to opendir %s: %s\n"), + name, strerror(errno)); + return (ret); + } + + /* Non-top item, ignore the case of being removed or renamed by race. */ + zpc->zpc_ignore_noent = B_TRUE; + errno = 0; + while (!ret && (ent = readdir(dir)) != NULL) { + /* skip "." and ".." 
*/
+		if (strcmp(ent->d_name, ".") == 0 ||
+		    strcmp(ent->d_name, "..") == 0)
+			continue;
+
+		if (strlen(ent->d_name) + strlen(name) >=
+		    sizeof (fullname) - 1) {
+			errno = ENAMETOOLONG;
+			break;
+		}
+
+		sprintf(fullname, "%s/%s", name, ent->d_name);
+		ret = zfs_project_handle_one(fullname, zpc);
+		if (!ret && zpc->zpc_recursive && ent->d_type == DT_DIR)
+			zfs_project_item_alloc(head, fullname);
+	}
+
+	if (errno && !ret) {
+		ret = -errno;
+		(void) fprintf(stderr, gettext("failed to readdir %s: %s\n"),
+		    name, strerror(errno));
+	}
+
+	closedir(dir);
+	return (ret);
+}
+
+int
+zfs_project_handle(const char *name, zfs_project_control_t *zpc)
+{
+	zfs_project_item_t *zpi;
+	struct stat st;
+	list_t head;
+	int ret;
+
+	ret = zfs_project_sanity_check(name, zpc, &st);
+	if (ret)
+		return (ret);
+
+	if ((zpc->zpc_op == ZFS_PROJECT_OP_SET ||
+	    zpc->zpc_op == ZFS_PROJECT_OP_CHECK) &&
+	    zpc->zpc_expected_projid == ZFS_INVALID_PROJID) {
+		ret = zfs_project_load_projid(name, zpc);
+		if (ret)
+			return (ret);
+	}
+
+	zpc->zpc_ignore_noent = B_FALSE;
+	ret = zfs_project_handle_one(name, zpc);
+	if (ret || !S_ISDIR(st.st_mode) || zpc->zpc_dironly ||
+	    (!zpc->zpc_recursive &&
+	    zpc->zpc_op != ZFS_PROJECT_OP_LIST &&
+	    zpc->zpc_op != ZFS_PROJECT_OP_CHECK))
+		return (ret);
+
+	/*
+	 * Breadth-first traversal: directories discovered by
+	 * zfs_project_handle_dir() are appended to 'head' and drained
+	 * iteratively, avoiding unbounded recursion on deep trees.
+	 */
+	list_create(&head, sizeof (zfs_project_item_t),
+	    offsetof(zfs_project_item_t, zpi_list));
+	zfs_project_item_alloc(&head, name);
+	while ((zpi = list_remove_head(&head)) != NULL) {
+		if (!ret)
+			ret = zfs_project_handle_dir(zpi->zpi_name, zpc, &head);
+		free(zpi);
+	}
+
+	return (ret);
+}
diff --git a/cmd/zfs/zfs_projectutil.h b/cmd/zfs/zfs_projectutil.h
new file mode 100644
index 000000000000..1792a3383a03
--- /dev/null
+++ b/cmd/zfs/zfs_projectutil.h
@@ -0,0 +1,49 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Intel Corporation. All rights reserved.
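zfs_project_handle() above walks directory trees without recursing: each directory found by zfs_project_handle_dir() goes onto a list_t used as a FIFO work queue, which the loop then drains, so deep trees cannot exhaust the stack. A minimal self-contained sketch of the same pattern, with illustrative names only (struct work and friends are not part of the ZFS sources):

	#include <dirent.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/queue.h>

	/*
	 * Queue-driven directory walk: directories are appended to a FIFO
	 * and drained in a loop instead of recursing, mirroring the
	 * list_t-based traversal in zfs_project_handle(). Like the
	 * original, it trusts d_type to spot subdirectories.
	 */
	struct work {
		STAILQ_ENTRY(work) link;
		char path[4096];
	};
	STAILQ_HEAD(workq, work);

	int
	main(int argc, char **argv)
	{
		struct workq q = STAILQ_HEAD_INITIALIZER(q);
		struct work *w;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <dir>\n", argv[0]);
			return (1);
		}

		w = calloc(1, sizeof (*w));
		if (w == NULL)
			return (1);
		snprintf(w->path, sizeof (w->path), "%s", argv[1]);
		STAILQ_INSERT_TAIL(&q, w, link);

		while ((w = STAILQ_FIRST(&q)) != NULL) {
			STAILQ_REMOVE_HEAD(&q, link);

			DIR *dir = opendir(w->path);
			struct dirent *ent;

			while (dir != NULL && (ent = readdir(dir)) != NULL) {
				if (strcmp(ent->d_name, ".") == 0 ||
				    strcmp(ent->d_name, "..") == 0)
					continue;

				printf("%s/%s\n", w->path, ent->d_name);

				/* queue subdirectories instead of recursing */
				if (ent->d_type == DT_DIR) {
					struct work *sub =
					    calloc(1, sizeof (*sub));
					if (sub == NULL)
						continue;
					snprintf(sub->path, sizeof (sub->path),
					    "%s/%s", w->path, ent->d_name);
					STAILQ_INSERT_TAIL(&q, sub, link);
				}
			}
			if (dir != NULL)
				closedir(dir);
			free(w);
		}
		return (0);
	}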
+ */ + +#ifndef _ZFS_PROJECTUTIL_H +#define _ZFS_PROJECTUTIL_H + +typedef enum { + ZFS_PROJECT_OP_DEFAULT = 0, + ZFS_PROJECT_OP_LIST = 1, + ZFS_PROJECT_OP_CHECK = 2, + ZFS_PROJECT_OP_CLEAR = 3, + ZFS_PROJECT_OP_SET = 4, +} zfs_project_ops_t; + +typedef struct zfs_project_control { + uint64_t zpc_expected_projid; + zfs_project_ops_t zpc_op; + boolean_t zpc_dironly; + boolean_t zpc_ignore_noent; + boolean_t zpc_keep_projid; + boolean_t zpc_newline; + boolean_t zpc_recursive; + boolean_t zpc_set_flag; +} zfs_project_control_t; + +int zfs_project_handle(const char *name, zfs_project_control_t *zpc); + +#endif /* _ZFS_PROJECTUTIL_H */ diff --git a/cmd/zfs/zfs_util.h b/cmd/zfs/zfs_util.h new file mode 100644 index 000000000000..a56af59adb15 --- /dev/null +++ b/cmd/zfs/zfs_util.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _ZFS_UTIL_H +#define _ZFS_UTIL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void * safe_malloc(size_t size); +void nomem(void); +extern libzfs_handle_t *g_zfs; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_UTIL_H */ diff --git a/cmd/zfs_ids_to_path/.gitignore b/cmd/zfs_ids_to_path/.gitignore new file mode 100644 index 000000000000..f95f853e48c2 --- /dev/null +++ b/cmd/zfs_ids_to_path/.gitignore @@ -0,0 +1 @@ +zfs_ids_to_path diff --git a/cmd/zfs_ids_to_path/Makefile.am b/cmd/zfs_ids_to_path/Makefile.am new file mode 100644 index 000000000000..176eeb3c72c5 --- /dev/null +++ b/cmd/zfs_ids_to_path/Makefile.am @@ -0,0 +1,9 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zfs_ids_to_path + +zfs_ids_to_path_SOURCES = \ + zfs_ids_to_path.c + +zfs_ids_to_path_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la diff --git a/cmd/zfs_ids_to_path/zfs_ids_to_path.c b/cmd/zfs_ids_to_path/zfs_ids_to_path.c new file mode 100644 index 000000000000..6cfaa6f41fa5 --- /dev/null +++ b/cmd/zfs_ids_to_path/zfs_ids_to_path.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+libzfs_handle_t *g_zfs;
+
+static void
+usage(int err)
+{
+	fprintf(stderr, "Usage: zfs_ids_to_path [-v] "
+	    "<pool> <objset id> <object id>\n");
+	exit(err);
+}
+
+int
+main(int argc, char **argv)
+{
+	boolean_t verbose = B_FALSE;
+	int c;
+	while ((c = getopt(argc, argv, "v")) != -1) {
+		switch (c) {
+		case 'v':
+			verbose = B_TRUE;
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (argc != 3) {
+		(void) fprintf(stderr, "Incorrect number of arguments: %d\n",
+		    argc);
+		usage(1);
+	}
+
+	uint64_t objset, object;
+	if (sscanf(argv[1], "%llu", (u_longlong_t *)&objset) != 1) {
+		(void) fprintf(stderr, "Invalid objset id: %s\n", argv[1]);
+		usage(2);
+	}
+	if (sscanf(argv[2], "%llu", (u_longlong_t *)&object) != 1) {
+		(void) fprintf(stderr, "Invalid object id: %s\n", argv[2]);
+		usage(3);
+	}
+	if ((g_zfs = libzfs_init()) == NULL) {
+		(void) fprintf(stderr, "%s\n", libzfs_error_init(errno));
+		return (4);
+	}
+	zpool_handle_t *pool = zpool_open(g_zfs, argv[0]);
+	if (pool == NULL) {
+		fprintf(stderr, "Could not open pool %s\n", argv[0]);
+		libzfs_fini(g_zfs);
+		return (5);
+	}
+
+	char pathname[PATH_MAX * 2];
+	if (verbose) {
+		zpool_obj_to_path_ds(pool, objset, object, pathname,
+		    sizeof (pathname));
+	} else {
+		zpool_obj_to_path(pool, objset, object, pathname,
+		    sizeof (pathname));
+	}
+	printf("%s\n", pathname);
+	zpool_close(pool);
+	libzfs_fini(g_zfs);
+	return (0);
+}
diff --git a/cmd/zgenhostid/Makefile.am b/cmd/zgenhostid/Makefile.am
new file mode 100644
index 000000000000..69c99ca9d828
--- /dev/null
+++ b/cmd/zgenhostid/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zgenhostid
diff --git a/cmd/zgenhostid/zgenhostid b/cmd/zgenhostid/zgenhostid
new file mode 100755
index 000000000000..8b468740c72b
--- /dev/null
+++ b/cmd/zgenhostid/zgenhostid
@@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+
+# Emulate genhostid(1) available on RHEL/CENTOS, for use on distros
+# which do not provide that utility.
+#
+# Usage:
+#	zgenhostid
+#	zgenhostid <value>
+#
+# If /etc/hostid already exists and its size is > 0, the script exits
+# immediately and changes nothing. Unlike genhostid, this generates an
+# error message.
+#
+# The first form generates a random hostid and stores it in /etc/hostid.
+# The second form checks that the provided value is between 0x1 and 0xFFFFFFFF
+# and if so, stores it in /etc/hostid. This form is not supported by
+# genhostid(1).
+
+hostid_file=/etc/hostid
+
+function usage {
+	echo "$0 [value]"
+	echo "If $hostid_file is not present, store a hostid in it." >&2
+	echo "The optional value must be an 8-digit hex number between" >&2
+	echo "1 and 2^32-1. If no value is provided, a random one will" >&2
+	echo "be generated. The value must be unique among your systems." >&2
+}

+# hostid(1) ignores the contents of /etc/hostid if its size is < 4 bytes.
+# It would be better if this checked for size >= 4 bytes, but the method
+# must be widely portable.
+if [ -s $hostid_file ]; then
+	echo "$hostid_file already exists. No change made."
>&2 + exit 1 +fi + +if [ -n "$1" ]; then + host_id=$1 +else + # $RANDOM goes from 0..32k-1 + number=$((((RANDOM % 4) * 32768 + RANDOM) * 32768 + RANDOM)) + host_id=$(printf "%08x" $number) +fi + +if egrep -o '^0{8}$' <<< $host_id >/dev/null 2>&1; then + usage + exit 2 +fi + +if ! egrep -o '^[a-fA-F0-9]{8}$' <<< $host_id >/dev/null 2>&1; then + usage + exit 3 +fi + +a=${host_id:6:2} +b=${host_id:4:2} +c=${host_id:2:2} +d=${host_id:0:2} + +echo -ne \\x$a\\x$b\\x$c\\x$d > $hostid_file + +exit 0 diff --git a/cmd/zhack/.gitignore b/cmd/zhack/.gitignore new file mode 100644 index 000000000000..763a18898b88 --- /dev/null +++ b/cmd/zhack/.gitignore @@ -0,0 +1 @@ +/zhack diff --git a/cmd/zhack/Makefile.am b/cmd/zhack/Makefile.am new file mode 100644 index 000000000000..5cddac32b5ac --- /dev/null +++ b/cmd/zhack/Makefile.am @@ -0,0 +1,14 @@ +include $(top_srcdir)/config/Rules.am + +# Unconditionally enable debugging for zhack +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +sbin_PROGRAMS = zhack + +zhack_SOURCES = \ + zhack.c + +zhack_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/cmd/zhack/zhack.c b/cmd/zhack/zhack.c new file mode 100644 index 000000000000..4d958fe4365a --- /dev/null +++ b/cmd/zhack/zhack.c @@ -0,0 +1,532 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + */ + +/* + * zhack is a debugging tool that can write changes to ZFS pool using libzpool + * for testing purposes. Altering pools with zhack is unsupported and may + * result in corrupted pools. 
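The zgenhostid script above writes the hostid least-significant byte first: ${host_id:6:2}, the low byte, is emitted before ${host_id:0:2}. A purely illustrative C reader that decodes /etc/hostid with that byte order:

	#include <stdint.h>
	#include <stdio.h>

	/*
	 * Read the four hostid bytes the way the script stores them
	 * (little-endian) and print the 32-bit value.
	 */
	int
	main(void)
	{
		unsigned char b[4];
		FILE *fp = fopen("/etc/hostid", "r");

		if (fp == NULL || fread(b, 1, sizeof (b), fp) != sizeof (b)) {
			fprintf(stderr, "cannot read 4 bytes from /etc/hostid\n");
			return (1);
		}
		fclose(fp);

		uint32_t hostid = (uint32_t)b[0] | ((uint32_t)b[1] << 8) |
		    ((uint32_t)b[2] << 16) | ((uint32_t)b[3] << 24);
		(void) printf("%08x\n", hostid);
		return (0);
	}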
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+extern boolean_t zfeature_checks_disable;
+
+const char cmdname[] = "zhack";
+static importargs_t g_importargs;
+static char *g_pool;
+static boolean_t g_readonly;
+
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "Usage: %s [-c cachefile] [-d dir] <subcommand> <args> ...\n"
+	    "where <subcommand> <args> is one of the following:\n"
+	    "\n", cmdname);
+
+	(void) fprintf(stderr,
+	    "    feature stat <pool>\n"
+	    "        print information about enabled features\n"
+	    "    feature enable [-r] [-d desc] <pool> <feature>\n"
+	    "        add a new enabled feature to the pool\n"
+	    "        -d sets the feature's description\n"
+	    "        -r set read-only compatible flag for feature\n"
+	    "    feature ref [-md] <pool> <feature>\n"
+	    "        change the refcount on the given feature\n"
+	    "        -d decrease instead of increase the refcount\n"
+	    "        -m add the feature to the label if increasing refcount\n"
+	    "\n"
+	    "    <feature> : should be a feature guid\n");
+	exit(1);
+}
+
+
+static void
+fatal(spa_t *spa, void *tag, const char *fmt, ...)
+{
+	va_list ap;
+
+	if (spa != NULL) {
+		spa_close(spa, tag);
+		(void) spa_export(g_pool, NULL, B_TRUE, B_FALSE);
+	}
+
+	va_start(ap, fmt);
+	(void) fprintf(stderr, "%s: ", cmdname);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+	(void) fprintf(stderr, "\n");
+
+	exit(1);
+}
+
+/* ARGSUSED */
+static int
+space_delta_cb(dmu_object_type_t bonustype, const void *data,
+    zfs_file_info_t *zoi)
+{
+	/*
+	 * Is it a valid type of object to track?
+	 */
+	if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA)
+		return (ENOENT);
+	(void) fprintf(stderr, "modifying object that needs user accounting");
+	abort();
+	/* NOTREACHED */
+}
+
+/*
+ * Target is the dataset whose pool we want to open.
+ */
+static void
+zhack_import(char *target, boolean_t readonly)
+{
+	nvlist_t *config;
+	nvlist_t *props;
+	int error;
+
+	kernel_init(readonly ? SPA_MODE_READ :
+	    (SPA_MODE_READ | SPA_MODE_WRITE));
+
+	dmu_objset_register_type(DMU_OST_ZFS, space_delta_cb);
+
+	g_readonly = readonly;
+	g_importargs.can_be_active = readonly;
+	g_pool = strdup(target);
+
+	error = zpool_find_config(NULL, target, &config, &g_importargs,
+	    &libzpool_config_ops);
+	if (error)
+		fatal(NULL, FTAG, "cannot import '%s'", target);
+
+	props = NULL;
+	if (readonly) {
+		VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0);
+		VERIFY(nvlist_add_uint64(props,
+		    zpool_prop_to_name(ZPOOL_PROP_READONLY), 1) == 0);
+	}
+
+	zfeature_checks_disable = B_TRUE;
+	error = spa_import(target, config, props,
+	    (readonly ?
ZFS_IMPORT_SKIP_MMP : ZFS_IMPORT_NORMAL)); + zfeature_checks_disable = B_FALSE; + if (error == EEXIST) + error = 0; + + if (error) + fatal(NULL, FTAG, "can't import '%s': %s", target, + strerror(error)); +} + +static void +zhack_spa_open(char *target, boolean_t readonly, void *tag, spa_t **spa) +{ + int err; + + zhack_import(target, readonly); + + zfeature_checks_disable = B_TRUE; + err = spa_open(target, spa, tag); + zfeature_checks_disable = B_FALSE; + + if (err != 0) + fatal(*spa, FTAG, "cannot open '%s': %s", target, + strerror(err)); + if (spa_version(*spa) < SPA_VERSION_FEATURES) { + fatal(*spa, FTAG, "'%s' has version %d, features not enabled", + target, (int)spa_version(*spa)); + } +} + +static void +dump_obj(objset_t *os, uint64_t obj, const char *name) +{ + zap_cursor_t zc; + zap_attribute_t za; + + (void) printf("%s_obj:\n", name); + + for (zap_cursor_init(&zc, os, obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + if (za.za_integer_length == 8) { + ASSERT(za.za_num_integers == 1); + (void) printf("\t%s = %llu\n", + za.za_name, (u_longlong_t)za.za_first_integer); + } else { + ASSERT(za.za_integer_length == 1); + char val[1024]; + VERIFY(zap_lookup(os, obj, za.za_name, + 1, sizeof (val), val) == 0); + (void) printf("\t%s = %s\n", za.za_name, val); + } + } + zap_cursor_fini(&zc); +} + +static void +dump_mos(spa_t *spa) +{ + nvlist_t *nv = spa->spa_label_features; + nvpair_t *pair; + + (void) printf("label config:\n"); + for (pair = nvlist_next_nvpair(nv, NULL); + pair != NULL; + pair = nvlist_next_nvpair(nv, pair)) { + (void) printf("\t%s\n", nvpair_name(pair)); + } +} + +static void +zhack_do_feature_stat(int argc, char **argv) +{ + spa_t *spa; + objset_t *os; + char *target; + + argc--; + argv++; + + if (argc < 1) { + (void) fprintf(stderr, "error: missing pool name\n"); + usage(); + } + target = argv[0]; + + zhack_spa_open(target, B_TRUE, FTAG, &spa); + os = spa->spa_meta_objset; + + dump_obj(os, spa->spa_feat_for_read_obj, "for_read"); + dump_obj(os, spa->spa_feat_for_write_obj, "for_write"); + dump_obj(os, spa->spa_feat_desc_obj, "descriptions"); + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + dump_obj(os, spa->spa_feat_enabled_txg_obj, "enabled_txg"); + } + dump_mos(spa); + + spa_close(spa, FTAG); +} + +static void +zhack_feature_enable_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + zfeature_info_t *feature = arg; + + feature_enable_sync(spa, feature, tx); + + spa_history_log_internal(spa, "zhack enable feature", tx, + "name=%s flags=%u", + feature->fi_guid, feature->fi_flags); +} + +static void +zhack_do_feature_enable(int argc, char **argv) +{ + int c; + char *desc, *target; + spa_t *spa; + objset_t *mos; + zfeature_info_t feature; + spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; + + /* + * Features are not added to the pool's label until their refcounts + * are incremented, so fi_mos can just be left as false for now. 
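+	 *
+	 * A fake feature is therefore enabled first, e.g.
+	 * "zhack feature enable -d desc <pool> <feature>", and it only
+	 * shows up in the label once a later "zhack feature ref -m"
+	 * increments its refcount.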
+	 */
+	desc = NULL;
+	feature.fi_uname = "zhack";
+	feature.fi_flags = 0;
+	feature.fi_depends = nodeps;
+	feature.fi_feature = SPA_FEATURE_NONE;
+
+	optind = 1;
+	while ((c = getopt(argc, argv, "+rd:")) != -1) {
+		switch (c) {
+		case 'r':
+			feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
+			break;
+		case 'd':
+			desc = strdup(optarg);
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	if (desc == NULL)
+		desc = strdup("zhack injected");
+	feature.fi_desc = desc;
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "error: missing feature or pool name\n");
+		usage();
+	}
+	target = argv[0];
+	feature.fi_guid = argv[1];
+
+	if (!zfeature_is_valid_guid(feature.fi_guid))
+		fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
+
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+	mos = spa->spa_meta_objset;
+
+	if (zfeature_is_supported(feature.fi_guid))
+		fatal(spa, FTAG, "'%s' is a real feature, will not enable",
+		    feature.fi_guid);
+	if (0 == zap_contains(mos, spa->spa_feat_desc_obj, feature.fi_guid))
+		fatal(spa, FTAG, "feature already enabled: %s",
+		    feature.fi_guid);
+
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    zhack_feature_enable_sync, &feature, 5, ZFS_SPACE_CHECK_NORMAL));
+
+	spa_close(spa, FTAG);
+
+	free(desc);
+}
+
+static void
+feature_incr_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	zfeature_info_t *feature = arg;
+	uint64_t refcount;
+
+	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
+	feature_sync(spa, feature, refcount + 1, tx);
+	spa_history_log_internal(spa, "zhack feature incr", tx,
+	    "name=%s", feature->fi_guid);
+}
+
+static void
+feature_decr_sync(void *arg, dmu_tx_t *tx)
+{
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	zfeature_info_t *feature = arg;
+	uint64_t refcount;
+
+	VERIFY0(feature_get_refcount_from_disk(spa, feature, &refcount));
+	feature_sync(spa, feature, refcount - 1, tx);
+	spa_history_log_internal(spa, "zhack feature decr", tx,
+	    "name=%s", feature->fi_guid);
+}
+
+static void
+zhack_do_feature_ref(int argc, char **argv)
+{
+	int c;
+	char *target;
+	boolean_t decr = B_FALSE;
+	spa_t *spa;
+	objset_t *mos;
+	zfeature_info_t feature;
+	spa_feature_t nodeps[] = { SPA_FEATURE_NONE };
+
+	/*
+	 * fi_desc does not matter here because it was written to disk
+	 * when the feature was enabled, but we need to properly set the
+	 * feature for read or write based on the information we read off
+	 * disk later.
+	 */
+	feature.fi_uname = "zhack";
+	feature.fi_flags = 0;
+	feature.fi_desc = NULL;
+	feature.fi_depends = nodeps;
+	feature.fi_feature = SPA_FEATURE_NONE;
+
+	optind = 1;
+	while ((c = getopt(argc, argv, "+md")) != -1) {
+		switch (c) {
+		case 'm':
+			feature.fi_flags |= ZFEATURE_FLAG_MOS;
+			break;
+		case 'd':
+			decr = B_TRUE;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 2) {
+		(void) fprintf(stderr, "error: missing feature or pool name\n");
+		usage();
+	}
+	target = argv[0];
+	feature.fi_guid = argv[1];
+
+	if (!zfeature_is_valid_guid(feature.fi_guid))
+		fatal(NULL, FTAG, "invalid feature guid: %s", feature.fi_guid);
+
+	zhack_spa_open(target, B_FALSE, FTAG, &spa);
+	mos = spa->spa_meta_objset;
+
+	if (zfeature_is_supported(feature.fi_guid)) {
+		fatal(spa, FTAG,
+		    "'%s' is a real feature, will not change refcount",
+		    feature.fi_guid);
+	}
+
+	if (0 == zap_contains(mos, spa->spa_feat_for_read_obj,
+	    feature.fi_guid)) {
+		feature.fi_flags &= ~ZFEATURE_FLAG_READONLY_COMPAT;
+	} else if (0 == zap_contains(mos, spa->spa_feat_for_write_obj,
+	    feature.fi_guid)) {
+		feature.fi_flags |= ZFEATURE_FLAG_READONLY_COMPAT;
+	} else {
+		fatal(spa, FTAG, "feature is not enabled: %s", feature.fi_guid);
+	}
+
+	if (decr) {
+		uint64_t count;
+		if (feature_get_refcount_from_disk(spa, &feature,
+		    &count) == 0 && count == 0) {
+			fatal(spa, FTAG, "feature refcount already 0: %s",
+			    feature.fi_guid);
+		}
+	}
+
+	VERIFY0(dsl_sync_task(spa_name(spa), NULL,
+	    decr ? feature_decr_sync : feature_incr_sync, &feature,
+	    5, ZFS_SPACE_CHECK_NORMAL));
+
+	spa_close(spa, FTAG);
+}
+
+static int
+zhack_do_feature(int argc, char **argv)
+{
+	char *subcommand;
+
+	argc--;
+	argv++;
+	if (argc == 0) {
+		(void) fprintf(stderr,
+		    "error: no feature operation specified\n");
+		usage();
+	}
+
+	subcommand = argv[0];
+	if (strcmp(subcommand, "stat") == 0) {
+		zhack_do_feature_stat(argc, argv);
+	} else if (strcmp(subcommand, "enable") == 0) {
+		zhack_do_feature_enable(argc, argv);
+	} else if (strcmp(subcommand, "ref") == 0) {
+		zhack_do_feature_ref(argc, argv);
+	} else {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    subcommand);
+		usage();
+	}
+
+	return (0);
+}
+
+#define MAX_NUM_PATHS 1024
+
+int
+main(int argc, char **argv)
+{
+	extern void zfs_prop_init(void);
+
+	char *path[MAX_NUM_PATHS];
+	const char *subcommand;
+	int rv = 0;
+	int c;
+
+	g_importargs.path = path;
+
+	dprintf_setup(&argc, argv);
+	zfs_prop_init();
+
+	while ((c = getopt(argc, argv, "+c:d:")) != -1) {
+		switch (c) {
+		case 'c':
+			g_importargs.cachefile = optarg;
+			break;
+		case 'd':
+			assert(g_importargs.paths < MAX_NUM_PATHS);
+			g_importargs.path[g_importargs.paths++] = optarg;
+			break;
+		default:
+			usage();
+			break;
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+	optind = 1;
+
+	if (argc == 0) {
+		(void) fprintf(stderr, "error: no command specified\n");
+		usage();
+	}
+
+	subcommand = argv[0];
+
+	if (strcmp(subcommand, "feature") == 0) {
+		rv = zhack_do_feature(argc, argv);
+	} else {
+		(void) fprintf(stderr, "error: unknown subcommand: %s\n",
+		    subcommand);
+		usage();
+	}
+
+	if (!g_readonly && spa_export(g_pool, NULL, B_TRUE, B_FALSE) != 0) {
+		fatal(NULL, FTAG, "pool export failed; "
+		    "changes may not be committed to disk\n");
+	}
+
+	kernel_fini();
+
+	return (rv);
+}
diff --git a/cmd/zinject/.gitignore b/cmd/zinject/.gitignore
new file mode 100644
index 000000000000..bded8400996c
--- /dev/null
+++ b/cmd/zinject/.gitignore
@@ -0,0 +1 @@
+/zinject
diff --git a/cmd/zinject/Makefile.am
b/cmd/zinject/Makefile.am new file mode 100644 index 000000000000..091d92cd6026 --- /dev/null +++ b/cmd/zinject/Makefile.am @@ -0,0 +1,13 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zinject + +zinject_SOURCES = \ + translate.c \ + zinject.c \ + zinject.h + +zinject_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/cmd/zinject/translate.c b/cmd/zinject/translate.c new file mode 100644 index 000000000000..4939c0b85b5f --- /dev/null +++ b/cmd/zinject/translate.c @@ -0,0 +1,397 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +#include "zinject.h" + +static int debug; + +static void +ziprintf(const char *fmt, ...) +{ + va_list ap; + + if (!debug) + return; + + va_start(ap, fmt); + (void) vprintf(fmt, ap); + va_end(ap); +} + +static void +compress_slashes(const char *src, char *dest) +{ + while (*src != '\0') { + *dest = *src++; + while (*dest == '/' && *src == '/') + ++src; + ++dest; + } + *dest = '\0'; +} + +/* + * Given a full path to a file, translate into a dataset name and a relative + * path within the dataset. 'dataset' must be at least MAXNAMELEN characters, + * and 'relpath' must be at least MAXPATHLEN characters. We also pass a stat64 + * buffer, which we need later to get the object ID. 
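+ *
+ * For example, if the dataset tank/home is mounted at /tank/home, the
+ * input path /tank/home/user/file is split into the dataset name
+ * "tank/home" and the relative path "user/file".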
+ */ +static int +parse_pathname(const char *inpath, char *dataset, char *relpath, + struct stat64 *statbuf) +{ + struct extmnttab mp; + const char *rel; + char fullpath[MAXPATHLEN]; + + compress_slashes(inpath, fullpath); + + if (fullpath[0] != '/') { + (void) fprintf(stderr, "invalid object '%s': must be full " + "path\n", fullpath); + usage(); + return (-1); + } + + if (getextmntent(fullpath, &mp, statbuf) != 0) { + (void) fprintf(stderr, "cannot find mountpoint for '%s'\n", + fullpath); + return (-1); + } + + if (strcmp(mp.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, "invalid path '%s': not a ZFS " + "filesystem\n", fullpath); + return (-1); + } + + if (strncmp(fullpath, mp.mnt_mountp, strlen(mp.mnt_mountp)) != 0) { + (void) fprintf(stderr, "invalid path '%s': mountpoint " + "doesn't match path\n", fullpath); + return (-1); + } + + (void) strcpy(dataset, mp.mnt_special); + + rel = fullpath + strlen(mp.mnt_mountp); + if (rel[0] == '/') + rel++; + (void) strcpy(relpath, rel); + + return (0); +} + +/* + * Convert from a dataset to a objset id. Note that + * we grab the object number from the inode number. + */ +static int +object_from_path(const char *dataset, uint64_t object, zinject_record_t *record) +{ + zfs_handle_t *zhp; + + if ((zhp = zfs_open(g_zfs, dataset, ZFS_TYPE_DATASET)) == NULL) + return (-1); + + record->zi_objset = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + record->zi_object = object; + + zfs_close(zhp); + + return (0); +} + +/* + * Initialize the range based on the type, level, and range given. + */ +static int +initialize_range(err_type_t type, int level, char *range, + zinject_record_t *record) +{ + /* + * Determine the numeric range from the string. + */ + if (range == NULL) { + /* + * If range is unspecified, set the range to [0,-1], which + * indicates that the whole object should be treated as an + * error. + */ + record->zi_start = 0; + record->zi_end = -1ULL; + } else { + char *end; + + /* XXX add support for suffixes */ + record->zi_start = strtoull(range, &end, 10); + + + if (*end == '\0') + record->zi_end = record->zi_start + 1; + else if (*end == ',') + record->zi_end = strtoull(end + 1, &end, 10); + + if (*end != '\0') { + (void) fprintf(stderr, "invalid range '%s': must be " + "a numeric range of the form 'start[,end]'\n", + range); + return (-1); + } + } + + switch (type) { + default: + break; + case TYPE_DATA: + break; + + case TYPE_DNODE: + /* + * If this is a request to inject faults into the dnode, then we + * must translate the current (objset,object) pair into an + * offset within the metadnode for the objset. Specifying any + * kind of range with type 'dnode' is illegal. + */ + if (range != NULL) { + (void) fprintf(stderr, "range cannot be specified when " + "type is 'dnode'\n"); + return (-1); + } + + record->zi_start = record->zi_object * sizeof (dnode_phys_t); + record->zi_end = record->zi_start + sizeof (dnode_phys_t); + record->zi_object = 0; + break; + } + + record->zi_level = level; + + return (0); +} + +int +translate_record(err_type_t type, const char *object, const char *range, + int level, zinject_record_t *record, char *poolname, char *dataset) +{ + char path[MAXPATHLEN]; + char *slash; + struct stat64 statbuf; + int ret = -1; + + debug = (getenv("ZINJECT_DEBUG") != NULL); + + ziprintf("translating: %s\n", object); + + if (MOS_TYPE(type)) { + /* + * MOS objects are treated specially. 
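+		 * Rather than naming a single object, the record is keyed
+		 * on a DMU object type below, so the fault applies to every
+		 * MOS object of that type.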
+ */ + switch (type) { + default: + break; + case TYPE_MOS: + record->zi_type = 0; + break; + case TYPE_MOSDIR: + record->zi_type = DMU_OT_OBJECT_DIRECTORY; + break; + case TYPE_METASLAB: + record->zi_type = DMU_OT_OBJECT_ARRAY; + break; + case TYPE_CONFIG: + record->zi_type = DMU_OT_PACKED_NVLIST; + break; + case TYPE_BPOBJ: + record->zi_type = DMU_OT_BPOBJ; + break; + case TYPE_SPACEMAP: + record->zi_type = DMU_OT_SPACE_MAP; + break; + case TYPE_ERRLOG: + record->zi_type = DMU_OT_ERROR_LOG; + break; + } + + dataset[0] = '\0'; + (void) strcpy(poolname, object); + return (0); + } + + /* + * Convert a full path into a (dataset, file) pair. + */ + if (parse_pathname(object, dataset, path, &statbuf) != 0) + goto err; + + ziprintf(" dataset: %s\n", dataset); + ziprintf(" path: %s\n", path); + + /* + * Convert (dataset, file) into (objset, object) + */ + if (object_from_path(dataset, statbuf.st_ino, record) != 0) + goto err; + + ziprintf("raw objset: %llu\n", record->zi_objset); + ziprintf("raw object: %llu\n", record->zi_object); + + /* + * For the given object, initialize the range in bytes + */ + if (initialize_range(type, level, (char *)range, record) != 0) + goto err; + + ziprintf(" objset: %llu\n", record->zi_objset); + ziprintf(" object: %llu\n", record->zi_object); + if (record->zi_start == 0 && + record->zi_end == -1ULL) + ziprintf(" range: all\n"); + else + ziprintf(" range: [%llu, %llu]\n", record->zi_start, + record->zi_end); + + /* + * Copy the pool name + */ + (void) strcpy(poolname, dataset); + if ((slash = strchr(poolname, '/')) != NULL) + *slash = '\0'; + + ret = 0; + +err: + return (ret); +} + +int +translate_raw(const char *str, zinject_record_t *record) +{ + /* + * A raw bookmark of the form objset:object:level:blkid, where each + * number is a hexadecimal value. + */ + if (sscanf(str, "%llx:%llx:%x:%llx", (u_longlong_t *)&record->zi_objset, + (u_longlong_t *)&record->zi_object, &record->zi_level, + (u_longlong_t *)&record->zi_start) != 4) { + (void) fprintf(stderr, "bad raw spec '%s': must be of the form " + "'objset:object:level:blkid'\n", str); + return (-1); + } + + record->zi_end = record->zi_start; + + return (0); +} + +int +translate_device(const char *pool, const char *device, err_type_t label_type, + zinject_record_t *record) +{ + char *end; + zpool_handle_t *zhp; + nvlist_t *tgt; + boolean_t isspare, iscache; + + /* + * Given a device name or GUID, create an appropriate injection record + * with zi_guid set. + */ + if ((zhp = zpool_open(g_zfs, pool)) == NULL) + return (-1); + + record->zi_guid = strtoull(device, &end, 0); + if (record->zi_guid == 0 || *end != '\0') { + tgt = zpool_find_vdev(zhp, device, &isspare, &iscache, NULL); + + if (tgt == NULL) { + (void) fprintf(stderr, "cannot find device '%s' in " + "pool '%s'\n", device, pool); + zpool_close(zhp); + return (-1); + } + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, + &record->zi_guid) == 0); + } + + /* + * Device faults can take on three different forms: + * 1). delayed or hanging I/O + * 2). zfs label faults + * 3). 
generic disk faults + */ + if (record->zi_timer != 0) { + record->zi_cmd = ZINJECT_DELAY_IO; + } else if (label_type != TYPE_INVAL) { + record->zi_cmd = ZINJECT_LABEL_FAULT; + } else { + record->zi_cmd = ZINJECT_DEVICE_FAULT; + } + + switch (label_type) { + default: + break; + case TYPE_LABEL_UBERBLOCK: + record->zi_start = offsetof(vdev_label_t, vl_uberblock[0]); + record->zi_end = record->zi_start + VDEV_UBERBLOCK_RING - 1; + break; + case TYPE_LABEL_NVLIST: + record->zi_start = offsetof(vdev_label_t, vl_vdev_phys); + record->zi_end = record->zi_start + VDEV_PHYS_SIZE - 1; + break; + case TYPE_LABEL_PAD1: + record->zi_start = offsetof(vdev_label_t, vl_pad1); + record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; + break; + case TYPE_LABEL_PAD2: + record->zi_start = offsetof(vdev_label_t, vl_be); + record->zi_end = record->zi_start + VDEV_PAD_SIZE - 1; + break; + } + zpool_close(zhp); + return (0); +} diff --git a/cmd/zinject/zinject.c b/cmd/zinject/zinject.c new file mode 100644 index 000000000000..bf97b0d68713 --- /dev/null +++ b/cmd/zinject/zinject.c @@ -0,0 +1,1287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + */ + +/* + * ZFS Fault Injector + * + * This userland component takes a set of options and uses libzpool to translate + * from a user-visible object type and name to an internal representation. + * There are two basic types of faults: device faults and data faults. + * + * + * DEVICE FAULTS + * + * Errors can be injected into a particular vdev using the '-d' option. This + * option takes a path or vdev GUID to uniquely identify the device within a + * pool. There are four types of errors that can be injected, IO, ENXIO, + * ECHILD, and EILSEQ. These can be controlled through the '-e' option and the + * default is ENXIO. For EIO failures, any attempt to read data from the device + * will return EIO, but a subsequent attempt to reopen the device will succeed. + * For ENXIO failures, any attempt to read from the device will return EIO, but + * any attempt to reopen the device will also return ENXIO. The EILSEQ failures + * only apply to read operations (-T read) and will flip a bit after the device + * has read the original data. + * + * For label faults, the -L option must be specified. This allows faults + * to be injected into either the nvlist, uberblock, pad1, or pad2 region + * of all the labels for the specified device. 
+ *
+ * This form of the command looks like:
+ *
+ *	zinject -d device [-e errno] [-L <nvlist|uber|pad1|pad2>] pool
+ *
+ *
+ * DATA FAULTS
+ *
+ * We begin with a tuple of the form:
+ *
+ *	<type, level, range, object>
+ *
+ * type		A string describing the type of data to target. Each type
+ *		implicitly describes how to interpret 'object'. Currently,
+ *		the following values are supported:
+ *
+ *		data		User data for a file
+ *		dnode		Dnode for a file or directory
+ *
+ *		The following MOS objects are special. Instead of injecting
+ *		errors on a particular object or blkid, we inject errors across
+ *		all objects of the given type.
+ *
+ *		mos		Any data in the MOS
+ *		mosdir		object directory
+ *		config		pool configuration
+ *		bpobj		blkptr list
+ *		spacemap	spacemap
+ *		metaslab	metaslab
+ *		errlog		persistent error log
+ *
+ * level	Object level. Defaults to '0', not applicable to all types. If
+ *		a range is given, this is the level of the indirect block that
+ *		covers the specified range.
+ *
+ * range	A numerical range [start,end) within the object. Defaults to
+ *		the full size of the file.
+ *
+ * object	A string describing the logical location of the object. For
+ *		files and directories (currently the only supported types),
+ *		this is the path of the object on disk.
+ *
+ * This is translated, via libzpool, into the following internal
+ * representation:
+ *
+ *	<type, objset, object, level, range>
+ *
+ * These types should be self-explanatory. This tuple is then passed to the
+ * kernel via a special ioctl() to initiate fault injection for the given
+ * object. Note that 'type' is not strictly necessary for fault injection, but
+ * is used when translating existing faults into a human-readable string.
+ *
+ *
+ * The command itself takes one of the forms:
+ *
+ *	zinject
+ *	zinject <-a | -u pool>
+ *	zinject -c <id|all>
+ *	zinject [-q] <-t type> [-f freq] [-u] [-a] [-m] [-e errno] [-l level]
+ *	    [-r range] <object>
+ *	zinject [-f freq] [-a] [-m] [-u] -b objset:object:level:blkid pool
+ *
+ * With no arguments, the command prints all currently registered injection
+ * handlers, with their numeric identifiers.
+ *
+ * The '-c' option will clear the given handler, or all handlers if 'all' is
+ * specified.
+ *
+ * The '-e' option takes a string describing the errno to simulate. This must
+ * be one of 'io', 'checksum', 'decompress', or 'decrypt'. In most cases this
+ * will result in the same behavior, but RAID-Z will produce a different set of
+ * ereports for this situation.
+ *
+ * The '-a', '-u', and '-m' flags toggle internal flush behavior. If '-a' is
+ * specified, then the ARC cache is flushed appropriately. If '-u' is
+ * specified, then the underlying SPA is unloaded. Either of these flags can be
+ * specified independently of any other handlers. The '-m' flag automatically
+ * does an unmount and remount of the underlying dataset to aid in flushing the
+ * cache.
+ *
+ * The '-f' flag controls the frequency of errors injected, expressed as a
+ * real number percentage between 0.0001 and 100. The default is 100.
+ *
+ * The fourth form is responsible for actually injecting the handler into the
+ * framework. It takes the arguments described above, translates them to the
+ * internal tuple using libzpool, and then issues an ioctl() to register the
+ * handler.
+ *
+ * The final form can target a specific bookmark, regardless of whether a
+ * human-readable interface has been designed. It allows developers to specify
+ * a particular block by number.
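+ *
+ * As parsed by translate_raw() in translate.c, the '-b' bookmark is four
+ * colon-separated hexadecimal fields, objset:object:level:blkid, with the
+ * start and end of the range both set to blkid. For instance,
+ * "zinject -b 15:7d:0:0 tank" targets level-0 block 0 of object 0x7d in
+ * objset 0x15 of pool 'tank'.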
+ */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +#undef verify /* both libzfs.h and zfs_context.h want to define this */ + +#include "zinject.h" + +libzfs_handle_t *g_zfs; +int zfs_fd; + +static const char *errtable[TYPE_INVAL] = { + "data", + "dnode", + "mos", + "mosdir", + "metaslab", + "config", + "bpobj", + "spacemap", + "errlog", + "uber", + "nvlist", + "pad1", + "pad2" +}; + +static err_type_t +name_to_type(const char *arg) +{ + int i; + for (i = 0; i < TYPE_INVAL; i++) + if (strcmp(errtable[i], arg) == 0) + return (i); + + return (TYPE_INVAL); +} + +static const char * +type_to_name(uint64_t type) +{ + switch (type) { + case DMU_OT_OBJECT_DIRECTORY: + return ("mosdir"); + case DMU_OT_OBJECT_ARRAY: + return ("metaslab"); + case DMU_OT_PACKED_NVLIST: + return ("config"); + case DMU_OT_BPOBJ: + return ("bpobj"); + case DMU_OT_SPACE_MAP: + return ("spacemap"); + case DMU_OT_ERROR_LOG: + return ("errlog"); + default: + return ("-"); + } +} + + +/* + * Print usage message. + */ +void +usage(void) +{ + (void) printf( + "usage:\n" + "\n" + "\tzinject\n" + "\n" + "\t\tList all active injection records.\n" + "\n" + "\tzinject -c \n" + "\n" + "\t\tClear the particular record (if given a numeric ID), or\n" + "\t\tall records if 'all' is specified.\n" + "\n" + "\tzinject -p pool\n" + "\t\tInject a panic fault at the specified function. Only \n" + "\t\tfunctions which call spa_vdev_config_exit(), or \n" + "\t\tspa_vdev_exit() will trigger a panic.\n" + "\n" + "\tzinject -d device [-e errno] [-L ] [-F]\n" + "\t\t[-T ] [-f frequency] pool\n\n" + "\t\tInject a fault into a particular device or the device's\n" + "\t\tlabel. Label injection can either be 'nvlist', 'uber',\n " + "\t\t'pad1', or 'pad2'.\n" + "\t\t'errno' can be 'nxio' (the default), 'io', 'dtl', or\n" + "\t\t'corrupt' (bit flip).\n" + "\t\t'frequency' is a value between 0.0001 and 100.0 that limits\n" + "\t\tdevice error injection to a percentage of the IOs.\n" + "\n" + "\tzinject -d device -A -D pool\n" + "\t\tPerform a specific action on a particular device.\n" + "\n" + "\tzinject -d device -D latency:lanes pool\n" + "\n" + "\t\tAdd an artificial delay to IO requests on a particular\n" + "\t\tdevice, such that the requests take a minimum of 'latency'\n" + "\t\tmilliseconds to complete. Each delay has an associated\n" + "\t\tnumber of 'lanes' which defines the number of concurrent\n" + "\t\tIO requests that can be processed.\n" + "\n" + "\t\tFor example, with a single lane delay of 10 ms (-D 10:1),\n" + "\t\tthe device will only be able to service a single IO request\n" + "\t\tat a time with each request taking 10 ms to complete. So,\n" + "\t\tif only a single request is submitted every 10 ms, the\n" + "\t\taverage latency will be 10 ms; but if more than one request\n" + "\t\tis submitted every 10 ms, the average latency will be more\n" + "\t\tthan 10 ms.\n" + "\n" + "\t\tSimilarly, if a delay of 10 ms is specified to have two\n" + "\t\tlanes (-D 10:2), then the device will be able to service\n" + "\t\ttwo requests at a time, each with a minimum latency of\n" + "\t\t10 ms. So, if two requests are submitted every 10 ms, then\n" + "\t\tthe average latency will be 10 ms; but if more than two\n" + "\t\trequests are submitted every 10 ms, the average latency\n" + "\t\twill be more than 10 ms.\n" + "\n" + "\t\tAlso note, these delays are additive. So two invocations\n" + "\t\tof '-D 10:1', is roughly equivalent to a single invocation\n" + "\t\tof '-D 10:2'. 
This also means, one can specify multiple\n" + "\t\tlanes with differing target latencies. For example, an\n" + "\t\tinvocation of '-D 10:1' followed by '-D 25:2' will\n" + "\t\tcreate 3 lanes on the device; one lane with a latency\n" + "\t\tof 10 ms and two lanes with a 25 ms latency.\n" + "\n" + "\tzinject -I [-s | -g ] pool\n" + "\t\tCause the pool to stop writing blocks yet not\n" + "\t\treport errors for a duration. Simulates buggy hardware\n" + "\t\tthat fails to honor cache flush requests.\n" + "\t\tDefault duration is 30 seconds. The machine is panicked\n" + "\t\tat the end of the duration.\n" + "\n" + "\tzinject -b objset:object:level:blkid pool\n" + "\n" + "\t\tInject an error into pool 'pool' with the numeric bookmark\n" + "\t\tspecified by the remaining tuple. Each number is in\n" + "\t\thexadecimal, and only one block can be specified.\n" + "\n" + "\tzinject [-q] <-t type> [-C dvas] [-e errno] [-l level]\n" + "\t\t[-r range] [-a] [-m] [-u] [-f freq] \n" + "\n" + "\t\tInject an error into the object specified by the '-t' option\n" + "\t\tand the object descriptor. The 'object' parameter is\n" + "\t\tinterpreted depending on the '-t' option.\n" + "\n" + "\t\t-q\tQuiet mode. Only print out the handler number added.\n" + "\t\t-e\tInject a specific error. Must be one of 'io',\n" + "\t\t\t'checksum', 'decompress', or 'decrypt'. Default is 'io'.\n" + "\t\t-C\tInject the given error only into specific DVAs. The\n" + "\t\t\tDVAs should be specified as a list of 0-indexed DVAs\n" + "\t\t\tseparated by commas (ex. '0,2').\n" + "\t\t-l\tInject error at a particular block level. Default is " + "0.\n" + "\t\t-m\tAutomatically remount underlying filesystem.\n" + "\t\t-r\tInject error over a particular logical range of an\n" + "\t\t\tobject. Will be translated to the appropriate blkid\n" + "\t\t\trange according to the object's properties.\n" + "\t\t-a\tFlush the ARC cache. Can be specified without any\n" + "\t\t\tassociated object.\n" + "\t\t-u\tUnload the associated pool. Can be specified with only\n" + "\t\t\ta pool object.\n" + "\t\t-f\tOnly inject errors a fraction of the time. Expressed as\n" + "\t\t\ta percentage between 0.0001 and 100.\n" + "\n" + "\t-t data\t\tInject an error into the plain file contents of a\n" + "\t\t\tfile. The object must be specified as a complete path\n" + "\t\t\tto a file on a ZFS filesystem.\n" + "\n" + "\t-t dnode\tInject an error into the metadnode in the block\n" + "\t\t\tcorresponding to the dnode for a file or directory. The\n" + "\t\t\t'-r' option is incompatible with this mode. The object\n" + "\t\t\tis specified as a complete path to a file or directory\n" + "\t\t\ton a ZFS filesystem.\n" + "\n" + "\t-t \tInject errors into the MOS for objects of the given\n" + "\t\t\ttype. Valid types are: mos, mosdir, config, bpobj,\n" + "\t\t\tspacemap, metaslab, errlog. 
The only valid is\n" + "\t\t\tthe poolname.\n"); +} + +static int +iter_handlers(int (*func)(int, const char *, zinject_record_t *, void *), + void *data) +{ + zfs_cmd_t zc = {"\0"}; + int ret; + + while (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_LIST_NEXT, &zc) == 0) + if ((ret = func((int)zc.zc_guid, zc.zc_name, + &zc.zc_inject_record, data)) != 0) + return (ret); + + if (errno != ENOENT) { + (void) fprintf(stderr, "Unable to list handlers: %s\n", + strerror(errno)); + return (-1); + } + + return (0); +} + +static int +print_data_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid != 0 || record->zi_func[0] != '\0') + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-6s %-6s %-8s %3s %-4s " + "%-15s\n", "ID", "POOL", "OBJSET", "OBJECT", "TYPE", + "LVL", "DVAs", "RANGE"); + (void) printf("--- --------------- ------ " + "------ -------- --- ---- ---------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-6llu %-6llu %-8s %-3d 0x%02x ", + id, pool, (u_longlong_t)record->zi_objset, + (u_longlong_t)record->zi_object, type_to_name(record->zi_type), + record->zi_level, record->zi_dvas); + + + if (record->zi_start == 0 && + record->zi_end == -1ULL) + (void) printf("all\n"); + else + (void) printf("[%llu, %llu]\n", (u_longlong_t)record->zi_start, + (u_longlong_t)record->zi_end); + + return (0); +} + +static int +print_device_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd == ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %s\n", "ID", "POOL", "GUID"); + (void) printf("--- --------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %llx\n", id, pool, + (u_longlong_t)record->zi_guid); + + return (0); +} + +static int +print_delay_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_guid == 0 || record->zi_func[0] != '\0') + return (0); + + if (record->zi_cmd != ZINJECT_DELAY_IO) + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %-15s %-15s %s\n", + "ID", "POOL", "DELAY (ms)", "LANES", "GUID"); + (void) printf("--- --------------- --------------- " + "--------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %-15llu %-15llu %llx\n", id, pool, + (u_longlong_t)NSEC2MSEC(record->zi_timer), + (u_longlong_t)record->zi_nlanes, + (u_longlong_t)record->zi_guid); + + return (0); +} + +static int +print_panic_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + int *count = data; + + if (record->zi_func[0] == '\0') + return (0); + + if (*count == 0) { + (void) printf("%3s %-15s %s\n", "ID", "POOL", "FUNCTION"); + (void) printf("--- --------------- ----------------\n"); + } + + *count += 1; + + (void) printf("%3d %-15s %s\n", id, pool, record->zi_func); + + return (0); +} + +/* + * Print all registered error handlers. Returns the number of handlers + * registered. 
+ */ +static int +print_all_handlers(void) +{ + int count = 0, total = 0; + + (void) iter_handlers(print_device_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + + (void) iter_handlers(print_delay_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + + (void) iter_handlers(print_data_handler, &count); + if (count > 0) { + total += count; + (void) printf("\n"); + count = 0; + } + + (void) iter_handlers(print_panic_handler, &count); + + return (count + total); +} + +/* ARGSUSED */ +static int +cancel_one_handler(int id, const char *pool, zinject_record_t *record, + void *data) +{ + zfs_cmd_t zc = {"\0"}; + + zc.zc_guid = (uint64_t)id; + + if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { + (void) fprintf(stderr, "failed to remove handler %d: %s\n", + id, strerror(errno)); + return (1); + } + + return (0); +} + +/* + * Remove all fault injection handlers. + */ +static int +cancel_all_handlers(void) +{ + int ret = iter_handlers(cancel_one_handler, NULL); + + if (ret == 0) + (void) printf("removed all registered handlers\n"); + + return (ret); +} + +/* + * Remove a specific fault injection handler. + */ +static int +cancel_handler(int id) +{ + zfs_cmd_t zc = {"\0"}; + + zc.zc_guid = (uint64_t)id; + + if (zfs_ioctl(g_zfs, ZFS_IOC_CLEAR_FAULT, &zc) != 0) { + (void) fprintf(stderr, "failed to remove handler %d: %s\n", + id, strerror(errno)); + return (1); + } + + (void) printf("removed handler %d\n", id); + + return (0); +} + +/* + * Register a new fault injection handler. + */ +static int +register_handler(const char *pool, int flags, zinject_record_t *record, + int quiet) +{ + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + zc.zc_inject_record = *record; + zc.zc_guid = flags; + + if (zfs_ioctl(g_zfs, ZFS_IOC_INJECT_FAULT, &zc) != 0) { + (void) fprintf(stderr, "failed to add handler: %s\n", + errno == EDOM ? 
"block level exceeds max level of object" : + strerror(errno)); + return (1); + } + + if (flags & ZINJECT_NULL) + return (0); + + if (quiet) { + (void) printf("%llu\n", (u_longlong_t)zc.zc_guid); + } else { + (void) printf("Added handler %llu with the following " + "properties:\n", (u_longlong_t)zc.zc_guid); + (void) printf(" pool: %s\n", pool); + if (record->zi_guid) { + (void) printf(" vdev: %llx\n", + (u_longlong_t)record->zi_guid); + } else if (record->zi_func[0] != '\0') { + (void) printf(" panic function: %s\n", + record->zi_func); + } else if (record->zi_duration > 0) { + (void) printf(" time: %lld seconds\n", + (u_longlong_t)record->zi_duration); + } else if (record->zi_duration < 0) { + (void) printf(" txgs: %lld \n", + (u_longlong_t)-record->zi_duration); + } else { + (void) printf("objset: %llu\n", + (u_longlong_t)record->zi_objset); + (void) printf("object: %llu\n", + (u_longlong_t)record->zi_object); + (void) printf(" type: %llu\n", + (u_longlong_t)record->zi_type); + (void) printf(" level: %d\n", record->zi_level); + if (record->zi_start == 0 && + record->zi_end == -1ULL) + (void) printf(" range: all\n"); + else + (void) printf(" range: [%llu, %llu)\n", + (u_longlong_t)record->zi_start, + (u_longlong_t)record->zi_end); + (void) printf(" dvas: 0x%x\n", record->zi_dvas); + } + } + + return (0); +} + +static int +perform_action(const char *pool, zinject_record_t *record, int cmd) +{ + zfs_cmd_t zc = {"\0"}; + + ASSERT(cmd == VDEV_STATE_DEGRADED || cmd == VDEV_STATE_FAULTED); + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + zc.zc_guid = record->zi_guid; + zc.zc_cookie = cmd; + + if (zfs_ioctl(g_zfs, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + return (1); +} + +static int +parse_delay(char *str, uint64_t *delay, uint64_t *nlanes) +{ + unsigned long scan_delay; + unsigned long scan_nlanes; + + if (sscanf(str, "%lu:%lu", &scan_delay, &scan_nlanes) != 2) + return (1); + + /* + * We explicitly disallow a delay of zero here, because we key + * off this value being non-zero in translate_device(), to + * determine if the fault is a ZINJECT_DELAY_IO fault or not. + */ + if (scan_delay == 0) + return (1); + + /* + * The units for the CLI delay parameter is milliseconds, but + * the data passed to the kernel is interpreted as nanoseconds. + * Thus we scale the milliseconds to nanoseconds here, and this + * nanosecond value is used to pass the delay to the kernel. + */ + *delay = MSEC2NSEC(scan_delay); + *nlanes = scan_nlanes; + + return (0); +} + +static int +parse_frequency(const char *str, uint32_t *percent) +{ + double val; + char *post; + + val = strtod(str, &post); + if (post == NULL || *post != '\0') + return (EINVAL); + + /* valid range is [0.0001, 100.0] */ + val /= 100.0f; + if (val < 0.000001f || val > 1.0f) + return (ERANGE); + + /* convert to an integer for use by kernel */ + *percent = ((uint32_t)(val * ZI_PERCENTAGE_MAX)); + + return (0); +} + +/* + * This function converts a string specifier for DVAs into a bit mask. + * The dva's provided by the user should be 0 indexed and separated by + * a comma. 
For example: + * "1" -> 0b0010 (0x2) + * "0,1" -> 0b0011 (0x3) + * "0,1,2" -> 0b0111 (0x7) + */ +static int +parse_dvas(const char *str, uint32_t *dvas_out) +{ + const char *c = str; + uint32_t mask = 0; + boolean_t need_delim = B_FALSE; + + /* max string length is 5 ("0,1,2") */ + if (strlen(str) > 5 || strlen(str) == 0) + return (EINVAL); + + while (*c != '\0') { + switch (*c) { + case '0': + case '1': + case '2': + /* check for pipe between DVAs */ + if (need_delim) + return (EINVAL); + + /* check if this DVA has been set already */ + if (mask & (1 << ((*c) - '0'))) + return (EINVAL); + + mask |= (1 << ((*c) - '0')); + need_delim = B_TRUE; + break; + case ',': + need_delim = B_FALSE; + break; + default: + /* check for invalid character */ + return (EINVAL); + } + c++; + } + + /* check for dangling delimiter */ + if (!need_delim) + return (EINVAL); + + *dvas_out = mask; + return (0); +} + +int +main(int argc, char **argv) +{ + int c; + char *range = NULL; + char *cancel = NULL; + char *end; + char *raw = NULL; + char *device = NULL; + int level = 0; + int quiet = 0; + int error = 0; + int domount = 0; + int io_type = ZIO_TYPES; + int action = VDEV_STATE_UNKNOWN; + err_type_t type = TYPE_INVAL; + err_type_t label = TYPE_INVAL; + zinject_record_t record = { 0 }; + char pool[MAXNAMELEN] = ""; + char dataset[MAXNAMELEN] = ""; + zfs_handle_t *zhp = NULL; + int nowrites = 0; + int dur_txg = 0; + int dur_secs = 0; + int ret; + int flags = 0; + uint32_t dvas = 0; + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (1); + } + + libzfs_print_on_error(g_zfs, B_TRUE); + + if ((zfs_fd = open(ZFS_DEV, O_RDWR)) < 0) { + (void) fprintf(stderr, "failed to open ZFS device\n"); + libzfs_fini(g_zfs); + return (1); + } + + if (argc == 1) { + /* + * No arguments. Print the available handlers. If there are no + * available handlers, direct the user to '-h' for help + * information. 
+ */ + if (print_all_handlers() == 0) { + (void) printf("No handlers registered.\n"); + (void) printf("Run 'zinject -h' for usage " + "information.\n"); + } + libzfs_fini(g_zfs); + return (0); + } + + while ((c = getopt(argc, argv, + ":aA:b:C:d:D:f:Fg:qhIc:t:T:l:mr:s:e:uL:p:")) != -1) { + switch (c) { + case 'a': + flags |= ZINJECT_FLUSH_ARC; + break; + case 'A': + if (strcasecmp(optarg, "degrade") == 0) { + action = VDEV_STATE_DEGRADED; + } else if (strcasecmp(optarg, "fault") == 0) { + action = VDEV_STATE_FAULTED; + } else { + (void) fprintf(stderr, "invalid action '%s': " + "must be 'degrade' or 'fault'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'b': + raw = optarg; + break; + case 'c': + cancel = optarg; + break; + case 'C': + ret = parse_dvas(optarg, &dvas); + if (ret != 0) { + (void) fprintf(stderr, "invalid DVA list '%s': " + "DVAs should be 0 indexed and separated by " + "commas.\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'd': + device = optarg; + break; + case 'D': + errno = 0; + ret = parse_delay(optarg, &record.zi_timer, + &record.zi_nlanes); + if (ret != 0) { + + (void) fprintf(stderr, "invalid i/o delay " + "value: '%s'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'e': + if (strcasecmp(optarg, "io") == 0) { + error = EIO; + } else if (strcasecmp(optarg, "checksum") == 0) { + error = ECKSUM; + } else if (strcasecmp(optarg, "decompress") == 0) { + error = EINVAL; + } else if (strcasecmp(optarg, "decrypt") == 0) { + error = EACCES; + } else if (strcasecmp(optarg, "nxio") == 0) { + error = ENXIO; + } else if (strcasecmp(optarg, "dtl") == 0) { + error = ECHILD; + } else if (strcasecmp(optarg, "corrupt") == 0) { + error = EILSEQ; + } else { + (void) fprintf(stderr, "invalid error type " + "'%s': must be 'io', 'checksum' or " + "'nxio'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'f': + ret = parse_frequency(optarg, &record.zi_freq); + if (ret != 0) { + (void) fprintf(stderr, "%sfrequency value must " + "be in the range [0.0001, 100.0]\n", + ret == EINVAL ? "invalid value: " : + ret == ERANGE ? 
"out of range: " : ""); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'F': + record.zi_failfast = B_TRUE; + break; + case 'g': + dur_txg = 1; + record.zi_duration = (int)strtol(optarg, &end, 10); + if (record.zi_duration <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid duration '%s': " + "must be a positive integer\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + /* store duration of txgs as its negative */ + record.zi_duration *= -1; + break; + case 'h': + usage(); + libzfs_fini(g_zfs); + return (0); + case 'I': + /* default duration, if one hasn't yet been defined */ + nowrites = 1; + if (dur_secs == 0 && dur_txg == 0) + record.zi_duration = 30; + break; + case 'l': + level = (int)strtol(optarg, &end, 10); + if (*end != '\0') { + (void) fprintf(stderr, "invalid level '%s': " + "must be an integer\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'm': + domount = 1; + break; + case 'p': + (void) strlcpy(record.zi_func, optarg, + sizeof (record.zi_func)); + record.zi_cmd = ZINJECT_PANIC; + break; + case 'q': + quiet = 1; + break; + case 'r': + range = optarg; + flags |= ZINJECT_CALC_RANGE; + break; + case 's': + dur_secs = 1; + record.zi_duration = (int)strtol(optarg, &end, 10); + if (record.zi_duration <= 0 || *end != '\0') { + (void) fprintf(stderr, "invalid duration '%s': " + "must be a positive integer\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'T': + if (strcasecmp(optarg, "read") == 0) { + io_type = ZIO_TYPE_READ; + } else if (strcasecmp(optarg, "write") == 0) { + io_type = ZIO_TYPE_WRITE; + } else if (strcasecmp(optarg, "free") == 0) { + io_type = ZIO_TYPE_FREE; + } else if (strcasecmp(optarg, "claim") == 0) { + io_type = ZIO_TYPE_CLAIM; + } else if (strcasecmp(optarg, "all") == 0) { + io_type = ZIO_TYPES; + } else { + (void) fprintf(stderr, "invalid I/O type " + "'%s': must be 'read', 'write', 'free', " + "'claim' or 'all'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 't': + if ((type = name_to_type(optarg)) == TYPE_INVAL && + !MOS_TYPE(type)) { + (void) fprintf(stderr, "invalid type '%s'\n", + optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case 'u': + flags |= ZINJECT_UNLOAD_SPA; + break; + case 'L': + if ((label = name_to_type(optarg)) == TYPE_INVAL && + !LABEL_TYPE(type)) { + (void) fprintf(stderr, "invalid label type " + "'%s'\n", optarg); + usage(); + libzfs_fini(g_zfs); + return (1); + } + break; + case ':': + (void) fprintf(stderr, "option -%c requires an " + "operand\n", optopt); + usage(); + libzfs_fini(g_zfs); + return (1); + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + usage(); + libzfs_fini(g_zfs); + return (2); + } + } + + argc -= optind; + argv += optind; + + if (record.zi_duration != 0) + record.zi_cmd = ZINJECT_IGNORED_WRITES; + + if (cancel != NULL) { + /* + * '-c' is invalid with any other options. 
+ */ + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "cancel (-c) incompatible with " + "any other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + if (argc != 0) { + (void) fprintf(stderr, "extraneous argument to '-c'\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (strcmp(cancel, "all") == 0) { + return (cancel_all_handlers()); + } else { + int id = (int)strtol(cancel, &end, 10); + if (*end != '\0') { + (void) fprintf(stderr, "invalid handle id '%s':" + " must be an integer or 'all'\n", cancel); + usage(); + libzfs_fini(g_zfs); + return (1); + } + return (cancel_handler(id)); + } + } + + if (device != NULL) { + /* + * Device (-d) injection uses a completely different mechanism + * for doing injection, so handle it separately here. + */ + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_cmd != ZINJECT_UNINITIALIZED || + dvas != 0) { + (void) fprintf(stderr, "device (-d) incompatible with " + "data error injection\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1) { + (void) fprintf(stderr, "device (-d) injection requires " + "a single pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + + if (error == ECKSUM) { + (void) fprintf(stderr, "device error type must be " + "'io', 'nxio' or 'corrupt'\n"); + libzfs_fini(g_zfs); + return (1); + } + + if (error == EILSEQ && + (record.zi_freq == 0 || io_type != ZIO_TYPE_READ)) { + (void) fprintf(stderr, "device corrupt errors require " + "io type read and a frequency value\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_iotype = io_type; + if (translate_device(pool, device, label, &record) != 0) { + libzfs_fini(g_zfs); + return (1); + } + if (!error) + error = ENXIO; + + if (action != VDEV_STATE_UNKNOWN) + return (perform_action(pool, &record, action)); + + } else if (raw != NULL) { + if (range != NULL || type != TYPE_INVAL || level != 0 || + record.zi_cmd != ZINJECT_UNINITIALIZED || + record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "raw (-b) format with " + "any other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc != 1) { + (void) fprintf(stderr, "raw (-b) format expects a " + "single pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + + if (error == ENXIO) { + (void) fprintf(stderr, "data error type must be " + "'checksum' or 'io'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_cmd = ZINJECT_DATA_FAULT; + if (translate_raw(raw, &record) != 0) { + libzfs_fini(g_zfs); + return (1); + } + if (!error) + error = EIO; + } else if (record.zi_cmd == ZINJECT_PANIC) { + if (raw != NULL || range != NULL || type != TYPE_INVAL || + level != 0 || device != NULL || record.zi_freq > 0 || + dvas != 0) { + (void) fprintf(stderr, "panic (-p) incompatible with " + "other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc < 1 || argc > 2) { + (void) fprintf(stderr, "panic (-p) injection requires " + "a single pool name and an optional id\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + if (argv[1] != NULL) + record.zi_type = atoi(argv[1]); + dataset[0] = '\0'; + } else if (record.zi_cmd == ZINJECT_IGNORED_WRITES) { + if (raw != NULL 
|| range != NULL || type != TYPE_INVAL || + level != 0 || record.zi_freq > 0 || dvas != 0) { + (void) fprintf(stderr, "hardware failure (-I) " + "incompatible with other options\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (nowrites == 0) { + (void) fprintf(stderr, "-s or -g meaningless " + "without -I (ignore writes)\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } else if (dur_secs && dur_txg) { + (void) fprintf(stderr, "choose a duration either " + "in seconds (-s) or a number of txgs (-g) " + "but not both\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } else if (argc != 1) { + (void) fprintf(stderr, "ignore writes (-I) " + "injection requires a single pool name\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + } else if (type == TYPE_INVAL) { + if (flags == 0) { + (void) fprintf(stderr, "at least one of '-b', '-d', " + "'-t', '-a', '-p', '-I' or '-u' " + "must be specified\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (argc == 1 && (flags & ZINJECT_UNLOAD_SPA)) { + (void) strlcpy(pool, argv[0], sizeof (pool)); + dataset[0] = '\0'; + } else if (argc != 0) { + (void) fprintf(stderr, "extraneous argument for " + "'-f'\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + flags |= ZINJECT_NULL; + } else { + if (argc != 1) { + (void) fprintf(stderr, "missing object\n"); + usage(); + libzfs_fini(g_zfs); + return (2); + } + + if (error == ENXIO || error == EILSEQ) { + (void) fprintf(stderr, "data error type must be " + "'checksum' or 'io'\n"); + libzfs_fini(g_zfs); + return (1); + } + + if (dvas != 0) { + if (error == EACCES || error == EINVAL) { + (void) fprintf(stderr, "the '-C' option may " + "not be used with logical data errors " + "'decrypt' and 'decompress'\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_dvas = dvas; + } + + if (error == EACCES) { + if (type != TYPE_DATA) { + (void) fprintf(stderr, "decryption errors " + "may only be injected for 'data' types\n"); + libzfs_fini(g_zfs); + return (1); + } + + record.zi_cmd = ZINJECT_DECRYPT_FAULT; + /* + * Internally, ZFS actually uses ECKSUM for decryption + * errors since EACCES is used to indicate the key was + * not found. + */ + error = ECKSUM; + } else { + record.zi_cmd = ZINJECT_DATA_FAULT; + } + + if (translate_record(type, argv[0], range, level, &record, pool, + dataset) != 0) { + libzfs_fini(g_zfs); + return (1); + } + if (!error) + error = EIO; + } + + /* + * If this is pool-wide metadata, unmount everything. The ioctl() will + * unload the pool, so that we trigger spa-wide reopen of metadata next + * time we access the pool. + */ + if (dataset[0] != '\0' && domount) { + if ((zhp = zfs_open(g_zfs, dataset, + ZFS_TYPE_DATASET)) == NULL) { + libzfs_fini(g_zfs); + return (1); + } + if (zfs_unmount(zhp, NULL, 0) != 0) { + libzfs_fini(g_zfs); + return (1); + } + } + + record.zi_error = error; + + ret = register_handler(pool, flags, &record, quiet); + + if (dataset[0] != '\0' && domount) + ret = (zfs_mount(zhp, NULL, 0) != 0); + + libzfs_fini(g_zfs); + + return (ret); +} diff --git a/cmd/zinject/zinject.h b/cmd/zinject/zinject.h new file mode 100644 index 000000000000..46fdcad8b31f --- /dev/null +++ b/cmd/zinject/zinject.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _ZINJECT_H
+#define _ZINJECT_H
+
+#include <libzfs.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef enum {
+	TYPE_DATA,		/* plain file contents */
+	TYPE_DNODE,		/* metadnode contents */
+	TYPE_MOS,		/* all MOS data */
+	TYPE_MOSDIR,		/* MOS object directory */
+	TYPE_METASLAB,		/* metaslab objects */
+	TYPE_CONFIG,		/* MOS config */
+	TYPE_BPOBJ,		/* block pointer list */
+	TYPE_SPACEMAP,		/* space map objects */
+	TYPE_ERRLOG,		/* persistent error log */
+	TYPE_LABEL_UBERBLOCK,	/* label specific uberblock */
+	TYPE_LABEL_NVLIST,	/* label specific nvlist */
+	TYPE_LABEL_PAD1,	/* label specific 8K pad1 area */
+	TYPE_LABEL_PAD2,	/* label specific 8K pad2 area */
+	TYPE_INVAL
+} err_type_t;
+
+#define MOS_TYPE(t) \
+	((t) >= TYPE_MOS && (t) < TYPE_LABEL_UBERBLOCK)
+
+#define LABEL_TYPE(t) \
+	((t) >= TYPE_LABEL_UBERBLOCK && (t) < TYPE_INVAL)
+
+int translate_record(err_type_t type, const char *object, const char *range,
+    int level, zinject_record_t *record, char *poolname, char *dataset);
+int translate_raw(const char *raw, zinject_record_t *record);
+int translate_device(const char *pool, const char *device,
+    err_type_t label_type, zinject_record_t *record);
+void usage(void);
+
+extern libzfs_handle_t *g_zfs;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _ZINJECT_H */
diff --git a/cmd/zpool/.gitignore b/cmd/zpool/.gitignore
new file mode 100644
index 000000000000..8ea518af78e5
--- /dev/null
+++ b/cmd/zpool/.gitignore
@@ -0,0 +1 @@
+/zpool
diff --git a/cmd/zpool/Makefile.am b/cmd/zpool/Makefile.am
new file mode 100644
index 000000000000..c0378b136901
--- /dev/null
+++ b/cmd/zpool/Makefile.am
@@ -0,0 +1,136 @@
+include $(top_srcdir)/config/Rules.am
+
+AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUUID_CFLAGS)
+
+DEFAULT_INCLUDES += -I$(srcdir)
+
+sbin_PROGRAMS = zpool
+
+zpool_SOURCES = \
+	zpool_iter.c \
+	zpool_main.c \
+	zpool_util.c \
+	zpool_util.h \
+	zpool_vdev.c
+
+if BUILD_FREEBSD
+zpool_SOURCES += os/freebsd/zpool_vdev_os.c
+endif
+
+if BUILD_LINUX
+zpool_SOURCES += os/linux/zpool_vdev_os.c
+endif
+
+zpool_LDADD = \
+	$(abs_top_builddir)/lib/libzfs/libzfs.la \
+	$(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \
+	$(abs_top_builddir)/lib/libnvpair/libnvpair.la \
+	$(abs_top_builddir)/lib/libuutil/libuutil.la
+
+zpool_LDADD += $(LTLIBINTL)
+
+if BUILD_FREEBSD
+zpool_LDADD += -lgeom
+endif
+zpool_LDADD += -lm $(LIBBLKID_LIBS) $(LIBUUID_LIBS)
+
+zpoolconfdir = $(sysconfdir)/zfs/zpool.d
+zpoolexecdir = $(zfsexecdir)/zpool.d
+
+EXTRA_DIST = zpool.d/README
+
+dist_zpoolexec_SCRIPTS = \
+	zpool.d/dm-deps \
+	zpool.d/enc \
+	zpool.d/encdev \
+	zpool.d/fault_led \
+	zpool.d/iostat \
+	zpool.d/iostat-1s \
+	zpool.d/iostat-10s \
+	zpool.d/label \
+	zpool.d/locate_led \
+	zpool.d/lsblk \
+	zpool.d/media \
+	zpool.d/model \
+	zpool.d/serial \
+	zpool.d/ses \
+	zpool.d/size \
+	zpool.d/slot \
+	zpool.d/smart
\ + zpool.d/smartx \ + zpool.d/temp \ + zpool.d/health \ + zpool.d/r_proc \ + zpool.d/w_proc \ + zpool.d/r_ucor \ + zpool.d/w_ucor \ + zpool.d/nonmed \ + zpool.d/defect \ + zpool.d/hours_on \ + zpool.d/realloc \ + zpool.d/rep_ucor \ + zpool.d/cmd_to \ + zpool.d/pend_sec \ + zpool.d/off_ucor \ + zpool.d/ata_err \ + zpool.d/nvme_err \ + zpool.d/pwr_cyc \ + zpool.d/upath \ + zpool.d/vendor \ + zpool.d/smart_test \ + zpool.d/test_type \ + zpool.d/test_status \ + zpool.d/test_progress \ + zpool.d/test_ended + +zpoolconfdefaults = \ + dm-deps \ + enc \ + encdev \ + fault_led \ + iostat \ + iostat-1s \ + iostat-10s \ + label \ + locate_led \ + lsblk \ + media \ + model \ + serial \ + ses \ + size \ + slot \ + smart \ + smartx \ + temp \ + health \ + r_proc \ + w_proc \ + r_ucor \ + w_ucor \ + nonmed \ + defect \ + hours_on \ + realloc \ + rep_ucor \ + cmd_to \ + pend_sec \ + off_ucor \ + ata_err \ + nvme_err \ + pwr_cyc \ + upath \ + vendor \ + smart_test \ + test_type \ + test_status \ + test_progress \ + test_ended + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(zpoolconfdir)" + for f in $(zpoolconfdefaults); do \ + test -f "$(DESTDIR)$(zpoolconfdir)/$${f}" -o \ + -L "$(DESTDIR)$(zpoolconfdir)/$${f}" || \ + ln -s "$(zpoolexecdir)/$${f}" "$(DESTDIR)$(zpoolconfdir)"; \ + done diff --git a/cmd/zpool/os/freebsd/zpool_vdev_os.c b/cmd/zpool/os/freebsd/zpool_vdev_os.c new file mode 100644 index 000000000000..7d48f61a0ee7 --- /dev/null +++ b/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov . + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. 
+ * 2. Check for devices in use. Using libdiskmgt, makes sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. + * validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zpool_util.h" +#include + +int +check_device(const char *name, boolean_t force, boolean_t isspare, + boolean_t iswholedisk) +{ + char path[MAXPATHLEN]; + + if (strncmp(name, _PATH_DEV, sizeof (_PATH_DEV) - 1) != 0) + snprintf(path, sizeof (path), "%s%s", _PATH_DEV, name); + else + strlcpy(path, name, sizeof (path)); + + return (check_file(path, force, isspare)); +} + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + return (0); +} diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c new file mode 100644 index 000000000000..d087c4c14dac --- /dev/null +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -0,0 +1,410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016, 2017 Intel Corporation. + * Copyright 2016 Igor Kozhukhov . + */ + +/* + * Functions to convert between a list of vdevs and an nvlist representing the + * configuration. Each entry in the list can be one of: + * + * Device vdevs + * disk=(path=..., devid=...) + * file=(path=...) + * + * Group vdevs + * raidz[1|2]=(...) + * mirror=(...) + * + * Hot spares + * + * While the underlying implementation supports it, group vdevs cannot contain + * other group vdevs. All userland verification of devices is contained within + * this file. If successful, the nvlist returned can be passed directly to the + * kernel; we've done as much verification as possible in userland. + * + * Hot spares are a special case, and passed down as an array of disk vdevs, at + * the same level as the root of the vdev tree. + * + * The only function exported by this file is 'make_root_vdev'. The + * function performs several passes: + * + * 1. Construct the vdev specification. Performs syntax validation and + * makes sure each device is valid. + * 2. Check for devices in use. Using libblkid to make sure that no + * devices are also in use. Some can be overridden using the 'force' + * flag, others cannot. + * 3. Check for replication errors if the 'force' flag is not specified. 
+ * validates that the replication level is consistent across the + * entire pool. + * 4. Call libzfs to label any whole disks with an EFI label. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zpool_util.h" +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct vdev_disk_db_entry +{ + char id[24]; + int sector_size; +} vdev_disk_db_entry_t; + +/* + * Database of block devices that lie about physical sector sizes. The + * identification string must be precisely 24 characters to avoid false + * negatives + */ +static vdev_disk_db_entry_t vdev_disk_database[] = { + {"ATA ADATA SSD S396 3", 8192}, + {"ATA APPLE SSD SM128E", 8192}, + {"ATA APPLE SSD SM256E", 8192}, + {"ATA APPLE SSD SM512E", 8192}, + {"ATA APPLE SSD SM768E", 8192}, + {"ATA C400-MTFDDAC064M", 8192}, + {"ATA C400-MTFDDAC128M", 8192}, + {"ATA C400-MTFDDAC256M", 8192}, + {"ATA C400-MTFDDAC512M", 8192}, + {"ATA Corsair Force 3 ", 8192}, + {"ATA Corsair Force GS", 8192}, + {"ATA INTEL SSDSA2CT04", 8192}, + {"ATA INTEL SSDSA2BZ10", 8192}, + {"ATA INTEL SSDSA2BZ20", 8192}, + {"ATA INTEL SSDSA2BZ30", 8192}, + {"ATA INTEL SSDSA2CW04", 8192}, + {"ATA INTEL SSDSA2CW08", 8192}, + {"ATA INTEL SSDSA2CW12", 8192}, + {"ATA INTEL SSDSA2CW16", 8192}, + {"ATA INTEL SSDSA2CW30", 8192}, + {"ATA INTEL SSDSA2CW60", 8192}, + {"ATA INTEL SSDSC2CT06", 8192}, + {"ATA INTEL SSDSC2CT12", 8192}, + {"ATA INTEL SSDSC2CT18", 8192}, + {"ATA INTEL SSDSC2CT24", 8192}, + {"ATA INTEL SSDSC2CW06", 8192}, + {"ATA INTEL SSDSC2CW12", 8192}, + {"ATA INTEL SSDSC2CW18", 8192}, + {"ATA INTEL SSDSC2CW24", 8192}, + {"ATA INTEL SSDSC2CW48", 8192}, + {"ATA KINGSTON SH100S3", 8192}, + {"ATA KINGSTON SH103S3", 8192}, + {"ATA M4-CT064M4SSD2 ", 8192}, + {"ATA M4-CT128M4SSD2 ", 8192}, + {"ATA M4-CT256M4SSD2 ", 8192}, + {"ATA M4-CT512M4SSD2 ", 8192}, + {"ATA OCZ-AGILITY2 ", 8192}, + {"ATA OCZ-AGILITY3 ", 8192}, + {"ATA OCZ-VERTEX2 3.5 ", 8192}, + {"ATA OCZ-VERTEX3 ", 8192}, + {"ATA OCZ-VERTEX3 LT ", 8192}, + {"ATA OCZ-VERTEX3 MI ", 8192}, + {"ATA OCZ-VERTEX4 ", 8192}, + {"ATA SAMSUNG MZ7WD120", 8192}, + {"ATA SAMSUNG MZ7WD240", 8192}, + {"ATA SAMSUNG MZ7WD480", 8192}, + {"ATA SAMSUNG MZ7WD960", 8192}, + {"ATA SAMSUNG SSD 830 ", 8192}, + {"ATA Samsung SSD 840 ", 8192}, + {"ATA SanDisk SSD U100", 8192}, + {"ATA TOSHIBA THNSNH06", 8192}, + {"ATA TOSHIBA THNSNH12", 8192}, + {"ATA TOSHIBA THNSNH25", 8192}, + {"ATA TOSHIBA THNSNH51", 8192}, + {"ATA APPLE SSD TS064C", 4096}, + {"ATA APPLE SSD TS128C", 4096}, + {"ATA APPLE SSD TS256C", 4096}, + {"ATA APPLE SSD TS512C", 4096}, + {"ATA INTEL SSDSA2M040", 4096}, + {"ATA INTEL SSDSA2M080", 4096}, + {"ATA INTEL SSDSA2M160", 4096}, + {"ATA INTEL SSDSC2MH12", 4096}, + {"ATA INTEL SSDSC2MH25", 4096}, + {"ATA OCZ CORE_SSD ", 4096}, + {"ATA OCZ-VERTEX ", 4096}, + {"ATA SAMSUNG MCCOE32G", 4096}, + {"ATA SAMSUNG MCCOE64G", 4096}, + {"ATA SAMSUNG SSD PM80", 4096}, + /* Flash drives optimized for 4KB IOs on larger pages */ + {"ATA INTEL SSDSC2BA10", 4096}, + {"ATA INTEL SSDSC2BA20", 4096}, + {"ATA INTEL SSDSC2BA40", 4096}, + {"ATA INTEL SSDSC2BA80", 4096}, + {"ATA INTEL SSDSC2BB08", 4096}, + {"ATA INTEL SSDSC2BB12", 4096}, + {"ATA INTEL SSDSC2BB16", 4096}, + {"ATA INTEL SSDSC2BB24", 4096}, + {"ATA INTEL SSDSC2BB30", 4096}, + {"ATA INTEL SSDSC2BB40", 4096}, + {"ATA INTEL SSDSC2BB48", 4096}, + {"ATA INTEL SSDSC2BB60", 4096}, + {"ATA INTEL SSDSC2BB80", 4096}, + {"ATA INTEL SSDSC2BW24", 
4096}, + {"ATA INTEL SSDSC2BW48", 4096}, + {"ATA INTEL SSDSC2BP24", 4096}, + {"ATA INTEL SSDSC2BP48", 4096}, + {"NA SmrtStorSDLKAE9W", 4096}, + {"NVMe Amazon EC2 NVMe ", 4096}, + /* Imported from Open Solaris */ + {"ATA MARVELL SD88SA02", 4096}, + /* Advanced format Hard drives */ + {"ATA Hitachi HDS5C303", 4096}, + {"ATA SAMSUNG HD204UI ", 4096}, + {"ATA ST2000DL004 HD20", 4096}, + {"ATA WDC WD10EARS-00M", 4096}, + {"ATA WDC WD10EARS-00S", 4096}, + {"ATA WDC WD10EARS-00Z", 4096}, + {"ATA WDC WD15EARS-00M", 4096}, + {"ATA WDC WD15EARS-00S", 4096}, + {"ATA WDC WD15EARS-00Z", 4096}, + {"ATA WDC WD20EARS-00M", 4096}, + {"ATA WDC WD20EARS-00S", 4096}, + {"ATA WDC WD20EARS-00Z", 4096}, + {"ATA WDC WD1600BEVT-0", 4096}, + {"ATA WDC WD2500BEVT-0", 4096}, + {"ATA WDC WD3200BEVT-0", 4096}, + {"ATA WDC WD5000BEVT-0", 4096}, +}; + + +#define INQ_REPLY_LEN 96 +#define INQ_CMD_LEN 6 + +static const int vdev_disk_database_size = + sizeof (vdev_disk_database) / sizeof (vdev_disk_database[0]); + +boolean_t +check_sector_size_database(char *path, int *sector_size) +{ + unsigned char inq_buff[INQ_REPLY_LEN]; + unsigned char sense_buffer[32]; + unsigned char inq_cmd_blk[INQ_CMD_LEN] = + {INQUIRY, 0, 0, 0, INQ_REPLY_LEN, 0}; + sg_io_hdr_t io_hdr; + int error; + int fd; + int i; + + /* Prepare INQUIRY command */ + memset(&io_hdr, 0, sizeof (sg_io_hdr_t)); + io_hdr.interface_id = 'S'; + io_hdr.cmd_len = sizeof (inq_cmd_blk); + io_hdr.mx_sb_len = sizeof (sense_buffer); + io_hdr.dxfer_direction = SG_DXFER_FROM_DEV; + io_hdr.dxfer_len = INQ_REPLY_LEN; + io_hdr.dxferp = inq_buff; + io_hdr.cmdp = inq_cmd_blk; + io_hdr.sbp = sense_buffer; + io_hdr.timeout = 10; /* 10 milliseconds is ample time */ + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (B_FALSE); + + error = ioctl(fd, SG_IO, (unsigned long) &io_hdr); + + (void) close(fd); + + if (error < 0) + return (B_FALSE); + + if ((io_hdr.info & SG_INFO_OK_MASK) != SG_INFO_OK) + return (B_FALSE); + + for (i = 0; i < vdev_disk_database_size; i++) { + if (memcmp(inq_buff + 8, vdev_disk_database[i].id, 24)) + continue; + + *sector_size = vdev_disk_database[i].sector_size; + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +check_slice(const char *path, blkid_cache cache, int force, boolean_t isspare) +{ + int err; + char *value; + + /* No valid type detected device is safe to use */ + value = blkid_get_tag_value(cache, "TYPE", path); + if (value == NULL) + return (0); + + /* + * If libblkid detects a ZFS device, we check the device + * using check_file() to see if it's safe. The one safe + * case is a spare device shared between multiple pools. + */ + if (strcmp(value, "zfs_member") == 0) { + err = check_file(path, force, isspare); + } else { + if (force) { + err = 0; + } else { + err = -1; + vdev_error(gettext("%s contains a filesystem of " + "type '%s'\n"), path, value); + } + } + + free(value); + + return (err); +} + +/* + * Validate that a disk including all partitions are safe to use. + * + * For EFI labeled disks this can done relatively easily with the libefi + * library. The partition numbers are extracted from the label and used + * to generate the expected /dev/ paths. Each partition can then be + * checked for conflicts. + * + * For non-EFI labeled disks (MBR/EBR/etc) the same process is possible + * but due to the lack of a readily available libraries this scanning is + * not implemented. Instead only the device path as given is checked. 
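+ * (For a hypothetical MBR-partitioned /dev/sdb passed by name, only
+ * /dev/sdb itself is checked; its partitions are not enumerated.)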
+ */ +static int +check_disk(const char *path, blkid_cache cache, int force, + boolean_t isspare, boolean_t iswholedisk) +{ + struct dk_gpt *vtoc; + char slice_path[MAXPATHLEN]; + int err = 0; + int fd, i; + int flags = O_RDONLY|O_DIRECT; + + if (!iswholedisk) + return (check_slice(path, cache, force, isspare)); + + /* only spares can be shared, other devices require exclusive access */ + if (!isspare) + flags |= O_EXCL; + + if ((fd = open(path, flags)) < 0) { + char *value = blkid_get_tag_value(cache, "TYPE", path); + (void) fprintf(stderr, gettext("%s is in use and contains " + "a %s filesystem.\n"), path, value ? value : "unknown"); + free(value); + return (-1); + } + + /* + * Expected to fail for non-EFI labeled disks. Just check the device + * as given and do not attempt to detect and scan partitions. + */ + err = efi_alloc_and_read(fd, &vtoc); + if (err) { + (void) close(fd); + return (check_slice(path, cache, force, isspare)); + } + + /* + * The primary efi partition label is damaged however the secondary + * label at the end of the device is intact. Rather than use this + * label we should play it safe and treat this as a non efi device. + */ + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + + if (force) { + /* Partitions will now be created using the backup */ + return (0); + } else { + vdev_error(gettext("%s contains a corrupt primary " + "EFI label.\n"), path); + return (-1); + } + } + + for (i = 0; i < vtoc->efi_nparts; i++) { + + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED || + uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) + continue; + + if (strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, "-part", i+1); + else + (void) snprintf(slice_path, sizeof (slice_path), + "%s%s%d", path, isdigit(path[strlen(path)-1]) ? + "p" : "", i+1); + + err = check_slice(slice_path, cache, force, isspare); + if (err) + break; + } + + efi_free(vtoc); + (void) close(fd); + + return (err); +} + +int +check_device(const char *path, boolean_t force, + boolean_t isspare, boolean_t iswholedisk) +{ + blkid_cache cache; + int error; + + error = blkid_get_cache(&cache, NULL); + if (error != 0) { + (void) fprintf(stderr, gettext("unable to access the blkid " + "cache.\n")); + return (-1); + } + + error = check_disk(path, cache, force, isspare, iswholedisk); + blkid_put_cache(cache); + + return (error); +} diff --git a/cmd/zpool/zpool.d/README b/cmd/zpool/zpool.d/README new file mode 100644 index 000000000000..033b7c363f5a --- /dev/null +++ b/cmd/zpool/zpool.d/README @@ -0,0 +1,9 @@ +This directory contains scripts that can be run the zpool status/iostat +-c option: + + zpool status -c script1,script2, ... + + zpool iostat -vc script1,script2, ... + +Some scripts output different values depending on the symlink name that is +used to run them. See the zpool(8) man page for more details. 
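A hedged sketch of how these scripts are consumed (the pool and script
names below are examples only): each script is run once per vdev and
prints key=value pairs on stdout, which zpool renders as extra columns.

    # Add the 'iostat' and 'smart' columns to status output for pool 'tank':
    zpool status -c iostat,smart tank

    # The same columns alongside per-vdev I/O statistics:
    zpool iostat -vc iostat,smart tank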
diff --git a/cmd/zpool/zpool.d/ata_err b/cmd/zpool/zpool.d/ata_err new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/ata_err @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/cmd_to b/cmd/zpool/zpool.d/cmd_to new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/cmd_to @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/defect b/cmd/zpool/zpool.d/defect new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/defect @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/dm-deps b/cmd/zpool/zpool.d/dm-deps new file mode 100755 index 000000000000..ee39514e4d92 --- /dev/null +++ b/cmd/zpool/zpool.d/dm-deps @@ -0,0 +1,29 @@ +#!/bin/sh +# +# Show device mapper dependent / underlying devices. This is useful for +# looking up the /dev/sd* devices associated with a dm or multipath device. +# + +if [ "$1" = "-h" ] ; then + echo "Show device mapper dependent (underlying) devices." + exit +fi + +dev="$VDEV_PATH" + +# If the VDEV path is a symlink, resolve it to a real device +if [ -L "$dev" ] ; then + dev=$(readlink "$dev") +fi + +dev=$(basename "$dev") +val="" +if [ -d "/sys/class/block/$dev/slaves" ] ; then + # ls -C: output in columns, no newlines + val=$(ls -C "/sys/class/block/$dev/slaves") + + # ls -C will print two spaces between files; change to one space. + val=$(echo "$val" | sed -r 's/[[:blank:]]+/ /g') +fi + +echo "dm-deps=$val" diff --git a/cmd/zpool/zpool.d/enc b/cmd/zpool/zpool.d/enc new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/cmd/zpool/zpool.d/enc @@ -0,0 +1 @@ +ses \ No newline at end of file diff --git a/cmd/zpool/zpool.d/encdev b/cmd/zpool/zpool.d/encdev new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/cmd/zpool/zpool.d/encdev @@ -0,0 +1 @@ +ses \ No newline at end of file diff --git a/cmd/zpool/zpool.d/fault_led b/cmd/zpool/zpool.d/fault_led new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/cmd/zpool/zpool.d/fault_led @@ -0,0 +1 @@ +ses \ No newline at end of file diff --git a/cmd/zpool/zpool.d/health b/cmd/zpool/zpool.d/health new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/health @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/hours_on b/cmd/zpool/zpool.d/hours_on new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/hours_on @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/iostat b/cmd/zpool/zpool.d/iostat new file mode 100755 index 000000000000..41a3acfae7a4 --- /dev/null +++ b/cmd/zpool/zpool.d/iostat @@ -0,0 +1,77 @@ +#!/bin/sh +# +# Display most relevant iostat bandwidth/latency numbers. The output is +# dependent on the name of the script/symlink used to call it. +# + +helpstr=" +iostat: Show iostat values since boot (summary page). +iostat-1s: Do a single 1-second iostat sample and show values. +iostat-10s: Do a single 10-second iostat sample and show values." 
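+#
+# Whichever name is used, the script emits one key=value pair per iostat
+# column for the vdev in $VDEV_UPATH, e.g. (illustrative): "r/s=0.00".
+#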
+ +script=$(basename "$0") +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +if [ "$script" = "iostat-1s" ] ; then + # Do a single one-second sample + interval=1 + # Don't show summary stats + brief="yes" +elif [ "$script" = "iostat-10s" ] ; then + # Do a single ten-second sample + interval=10 + # Don't show summary stats + brief="yes" +fi + +if [ -f "$VDEV_UPATH" ] ; then + # We're a file-based vdev, iostat doesn't work on us. Do nothing. + exit +fi + +if [ "$(uname)" = "FreeBSD" ]; then + out=$(iostat -dKx \ + ${interval:+"-w $interval"} \ + ${interval:+"-c 1"} \ + "$VDEV_UPATH" | tail -n 2) +else + out=$(iostat -kx \ + ${brief:+"-y"} \ + ${interval:+"$interval"} \ + ${interval:+"1"} \ + "$VDEV_UPATH" | awk NF | tail -n 2) +fi + + +# Sample output (we want the last two lines): +# +# Linux 2.6.32-642.13.1.el6.x86_64 (centos68) 03/09/2017 _x86_64_ (6 CPU) +# +# avg-cpu: %user %nice %system %iowait %steal %idle +# 0.00 0.00 0.00 0.00 0.00 100.00 +# +# Device: rrqm/s wrqm/s r/s w/s rkB/s wkB/s avgrq-sz avgqu-sz await r_await w_await svctm %util +# sdb 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 +# + +# Get the column names +cols=$(echo "$out" | head -n 1) + +# Get the values and tab separate them to make them cut-able. +vals=$(echo "$out" | tail -n 1 | sed -r 's/[[:blank:]]+/\t/g') + +i=0 +for col in $cols ; do + i=$((i+1)) + # Skip the first column since it's just the device name + if [ $i -eq 1 ]; then + continue + fi + + # Get i'th value + val=$(echo "$vals" | cut -f "$i") + echo "$col=$val" +done diff --git a/cmd/zpool/zpool.d/iostat-10s b/cmd/zpool/zpool.d/iostat-10s new file mode 120000 index 000000000000..084278d99f0f --- /dev/null +++ b/cmd/zpool/zpool.d/iostat-10s @@ -0,0 +1 @@ +iostat \ No newline at end of file diff --git a/cmd/zpool/zpool.d/iostat-1s b/cmd/zpool/zpool.d/iostat-1s new file mode 120000 index 000000000000..084278d99f0f --- /dev/null +++ b/cmd/zpool/zpool.d/iostat-1s @@ -0,0 +1 @@ +iostat \ No newline at end of file diff --git a/cmd/zpool/zpool.d/label b/cmd/zpool/zpool.d/label new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/cmd/zpool/zpool.d/label @@ -0,0 +1 @@ +lsblk \ No newline at end of file diff --git a/cmd/zpool/zpool.d/locate_led b/cmd/zpool/zpool.d/locate_led new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/cmd/zpool/zpool.d/locate_led @@ -0,0 +1 @@ +ses \ No newline at end of file diff --git a/cmd/zpool/zpool.d/lsblk b/cmd/zpool/zpool.d/lsblk new file mode 100755 index 000000000000..1cdef40494fe --- /dev/null +++ b/cmd/zpool/zpool.d/lsblk @@ -0,0 +1,83 @@ +#!/bin/sh +# +# Print some common lsblk values +# +# Any (lowercased) name symlinked to the lsblk script will be passed to lsblk +# as one of its --output names. 
Here's a partial list of --output names +# from the lsblk binary: +# +# Available columns (for --output): +# NAME device name +# KNAME internal kernel device name +# MAJ:MIN major:minor device number +# FSTYPE filesystem type +# MOUNTPOINT where the device is mounted +# LABEL filesystem LABEL +# UUID filesystem UUID +# RA read-ahead of the device +# RO read-only device +# RM removable device +# MODEL device identifier +# SIZE size of the device +# STATE state of the device +# OWNER user name +# GROUP group name +# MODE device node permissions +# ALIGNMENT alignment offset +# MIN-IO minimum I/O size +# OPT-IO optimal I/O size +# PHY-SEC physical sector size +# LOG-SEC logical sector size +# ROTA rotational device +# SCHED I/O scheduler name +# RQ-SIZE request queue size +# TYPE device type +# DISC-ALN discard alignment offset +# DISC-GRAN discard granularity +# DISC-MAX discard max bytes +# DISC-ZERO discard zeroes data +# +# If the script is run as just 'lsblk' then print out disk size, vendor, +# and model number. + + +helpstr=" +label: Show filesystem label. +model: Show disk model number. +size: Show the disk capacity. +vendor: Show the disk vendor. +lsblk: Show the disk size, vendor, and model number." + +script=$(basename "$0") + +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +if [ "$script" = "lsblk" ] ; then + list="size vendor model" +else + list=$(echo "$script" | tr '[:upper:]' '[:lower:]') +fi + +# Older versions of lsblk don't support all these values (like SERIAL). +for i in $list ; do + + # Special case: Looking up the size of a file-based vdev can't + # be done with lsblk. + if [ "$i" = "size" ] && [ -f "$VDEV_UPATH" ] ; then + size=$(du -h --apparent-size "$VDEV_UPATH" | cut -f 1) + echo "size=$size" + continue + fi + + + val="" + if val=$(eval "lsblk -dl -n -o $i $VDEV_UPATH 2>/dev/null") ; then + # Remove leading/trailing whitespace from value + val=$(echo "$val" | sed -e 's/^[[:space:]]*//' \ + -e 's/[[:space:]]*$//') + fi + echo "$i=$val" +done diff --git a/cmd/zpool/zpool.d/media b/cmd/zpool/zpool.d/media new file mode 100755 index 000000000000..05bc15918bc9 --- /dev/null +++ b/cmd/zpool/zpool.d/media @@ -0,0 +1,27 @@ +#!/bin/sh +# +# Print out the type of device +# + +if [ "$1" = "-h" ] ; then + echo "Show whether a vdev is a file, hdd, or ssd." 
+ exit +fi + +if [ -b "$VDEV_UPATH" ]; then + device=$(basename "$VDEV_UPATH") + val=$(cat "/sys/block/$device/queue/rotational" 2>/dev/null) + if [ "$val" = "0" ]; then + MEDIA="ssd" + fi + + if [ "$val" = "1" ]; then + MEDIA="hdd" + fi +else + if [ -f "$VDEV_UPATH" ]; then + MEDIA="file" + fi +fi + +echo "media=$MEDIA" diff --git a/cmd/zpool/zpool.d/model b/cmd/zpool/zpool.d/model new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/cmd/zpool/zpool.d/model @@ -0,0 +1 @@ +lsblk \ No newline at end of file diff --git a/cmd/zpool/zpool.d/nonmed b/cmd/zpool/zpool.d/nonmed new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/nonmed @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/nvme_err b/cmd/zpool/zpool.d/nvme_err new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/nvme_err @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/off_ucor b/cmd/zpool/zpool.d/off_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/off_ucor @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/pend_sec b/cmd/zpool/zpool.d/pend_sec new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/pend_sec @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/pwr_cyc b/cmd/zpool/zpool.d/pwr_cyc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/pwr_cyc @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/r_proc b/cmd/zpool/zpool.d/r_proc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/r_proc @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/r_ucor b/cmd/zpool/zpool.d/r_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/r_ucor @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/realloc b/cmd/zpool/zpool.d/realloc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/realloc @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/rep_ucor b/cmd/zpool/zpool.d/rep_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/rep_ucor @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/serial b/cmd/zpool/zpool.d/serial new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/serial @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses new file mode 100755 index 000000000000..f6b7520dfb6c --- /dev/null +++ b/cmd/zpool/zpool.d/ses @@ -0,0 +1,52 @@ +#!/bin/sh +# +# Print SCSI Enclosure Services (SES) info. The output is dependent on the name +# of the script/symlink used to call it. +# +helpstr=" +enc: Show disk enclosure w:x:y:z value. +slot: Show disk slot number as reported by the enclosure. +encdev: Show /dev/sg* device associated with the enclosure disk slot. +fault_led: Show value of the disk enclosure slot fault LED. +locate_led: Show value of the disk enclosure slot locate LED. +ses: Show disk's enc, enc device, slot, and fault/locate LED values." 
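+#
+# Every value is read from the enclosure's sysfs tree under
+# $VDEV_ENC_SYSFS_PATH; the fault and locate LEDs, for example, are just
+# the contents of the 'fault' and 'locate' attribute files.
+#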
+ +script=$(basename "$0") +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +if [ "$script" = "ses" ] ; then + scripts='enc encdev slot fault_led locate_led' +else + scripts="$script" +fi + +for i in $scripts ; do + if [ -z "$VDEV_ENC_SYSFS_PATH" ] ; then + echo "$i=" + continue + fi + + val="" + case $i in + enc) + val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) + ;; + slot) + val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) + ;; + encdev) + val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) + ;; + fault_led) + val=$(cat "$VDEV_ENC_SYSFS_PATH/fault" 2>/dev/null) + ;; + locate_led) + val=$(cat "$VDEV_ENC_SYSFS_PATH/locate" 2>/dev/null) + ;; + esac + echo "$i=$val" +done + diff --git a/cmd/zpool/zpool.d/size b/cmd/zpool/zpool.d/size new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/cmd/zpool/zpool.d/size @@ -0,0 +1 @@ +lsblk \ No newline at end of file diff --git a/cmd/zpool/zpool.d/slot b/cmd/zpool/zpool.d/slot new file mode 120000 index 000000000000..478d1e8967a1 --- /dev/null +++ b/cmd/zpool/zpool.d/slot @@ -0,0 +1 @@ +ses \ No newline at end of file diff --git a/cmd/zpool/zpool.d/smart b/cmd/zpool/zpool.d/smart new file mode 100755 index 000000000000..f8854b75227c --- /dev/null +++ b/cmd/zpool/zpool.d/smart @@ -0,0 +1,243 @@ +#!/bin/sh +# +# Show SMART stats +# + +helpstr=" +smart: Show SMART temperature and error stats (specific to drive type) +smartx: Show SMART extended drive stats (specific to drive type). +temp: Show SMART drive temperature in celsius (all drives). +health: Show reported SMART status (all drives). +r_proc: Show SMART read GBytes processed over drive lifetime (SAS). +w_proc: Show SMART write GBytes processed over drive lifetime (SAS). +r_ucor: Show SMART read uncorrectable errors (SAS). +w_ucor: Show SMART write uncorrectable errors (SAS). +nonmed: Show SMART non-medium errors (SAS). +defect: Show SMART grown defect list (SAS). +hours_on: Show number of hours drive powered on (all drives). +realloc: Show SMART reallocated sectors count (ATA). +rep_ucor: Show SMART reported uncorrectable count (ATA). +cmd_to: Show SMART command timeout count (ATA). +pend_sec: Show SMART current pending sector count (ATA). +off_ucor: Show SMART offline uncorrectable errors (ATA). +ata_err: Show SMART ATA errors (ATA). +pwr_cyc: Show SMART power cycle count (ATA). +serial: Show disk serial number. +nvme_err: Show SMART NVMe errors (NVMe). +smart_test: Show SMART self-test results summary. +test_type: Show SMART self-test type (short, long... ). +test_status: Show SMART self-test status. +test_progress: Show SMART self-test percentage done. +test_ended: Show when the last SMART self-test ended (if supported). +" + +# Hack for developer testing +# +# If you set $samples to a directory containing smartctl output text files, +# we will use them instead of running smartctl on the vdevs. This can be +# useful if you want to test a bunch of different smartctl outputs. Also, if +# $samples is set, and additional 'file' column is added to the zpool output +# showing the filename. +samples= + +# get_filename_from_dir DIR +# +# Look in directory DIR and return a filename from it. The filename returned +# is chosen quasi-sequentially (based off our PID). This allows us to return +# a different filename every time this script is invoked (which we do for each +# vdev), without having to maintain state. 
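+#
+# Worked example (hypothetical numbers): with 4 sample files and PID
+# 12342, mod = 12342 % 4 = 2, so the third file listed (i=2) is chosen.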
+get_filename_from_dir() +{ + dir=$1 + pid="$$" + num_files=$(find "$dir" -maxdepth 1 -type f | wc -l) + mod=$((pid % num_files)) + i=0 + find "$dir" -type f -printf "%f\n" | while read -r file ; do + if [ "$mod" = "$i" ] ; then + echo "$file" + break + fi + i=$((i+1)) + done +} + +script=$(basename "$0") + +if [ "$1" = "-h" ] ; then + echo "$helpstr" | grep "$script:" | tr -s '\t' | cut -f 2- + exit +fi + +smartctl_path=$(command -v smartctl) + +# shellcheck disable=SC2015 +if [ -b "$VDEV_UPATH" ] && [ -x "$smartctl_path" ] || [ -n "$samples" ] ; then + if [ -n "$samples" ] ; then + # cat a smartctl output text file instead of running smartctl + # on a vdev (only used for developer testing). + file=$(get_filename_from_dir "$samples") + echo "file=$file" + raw_out=$(cat "$samples/$file") + else + raw_out=$(eval "sudo $smartctl_path -a $VDEV_UPATH") + fi + + # What kind of drive are we? Look for the right line in smartctl: + # + # SAS: + # Transport protocol: SAS + # + # SATA: + # ATA Version is: 8 + # + # NVMe: + # SMART/Health Information (NVMe Log 0xnn, NSID 0xnn) + # + out=$(echo "$raw_out" | awk ' +# SAS specific +/read:/{print "rrd="$4"\nr_cor="$5"\nr_proc="$7"\nr_ucor="$8} +/write:/{print "rwr="$4"\nw_cor="$5"\nw_proc="$7"\nw_ucor="$8} +/Non-medium error count/{print "nonmed="$4} +/Elements in grown defect list/{print "defect="$6} + +# SAS common +/SAS/{type="sas"} +/Drive Temperature:/{print "temp="$4} +# Status can be a long string, substitute spaces for '_' +/SMART Health Status:/{printf "health="; for(i=4;i<=NF-1;i++){printf "%s_", $i}; printf "%s\n", $i} +/number of hours powered up/{print "hours_on="$7; hours_on=int($7)} +/Serial number:/{print "serial="$3} + +# SATA specific +/Reallocated_Sector_Ct/{print "realloc="$10} +/Reported_Uncorrect/{print "rep_ucor="$10} +/Command_Timeout/{print "cmd_to="$10} +/Current_Pending_Sector/{print "pend_sec="$10} +/Offline_Uncorrectable/{print "off_ucor="$10} +/ATA Error Count:/{print "ata_err="$4} +/Power_Cycle_Count/{print "pwr_cyc="$10} + +# SATA common +/SATA/{type="sata"} +/Temperature_Celsius/{print "temp="$10} +/Airflow_Temperature_Cel/{print "temp="$10} +/Current Temperature:/{print "temp="$3} +/SMART overall-health self-assessment test result:/{print "health="$6} +/Power_On_Hours/{print "hours_on="$10; hours_on=int($10)} +/Serial Number:/{print "serial="$3} + +# NVMe common +/NVMe/{type="nvme"} +/Temperature:/{print "temp="$2} +/SMART overall-health self-assessment test result:/{print "health="$6} +/Power On Hours:/{gsub("[^0-9]","",$4); print "hours_on="$4} +/Serial Number:/{print "serial="$3} +/Power Cycles:/{print "pwr_cyc="$3} + +# NVMe specific +/Media and Data Integrity Errors:/{print "nvme_err="$6} + +# SMART self-test info +/Self-test execution status:/{progress=tolower($4)} # SAS +/SMART Self-test log/{test_seen=1} # SAS +/SMART Extended Self-test Log/{test_seen=1} # SATA +/# 1/{ + test_type=tolower($3"_"$4); + # Status could be one word ("Completed") or multiple ("Completed: read + # failure"). Look for the ":" to see if we need to grab more words. + + if ($5 ~ ":") + status=tolower($5""$6"_"$7) + else + status=tolower($5) + if (status=="self") + status="running"; + + if (type == "sas") { + hours=int($(NF-4)) + } else { + hours=int($(NF-1)) + # SATA reports percent remaining, rather than percent done + # Convert it to percent done. + progress=(100-int($(NF-2)))"%" + } + # When we int()-ify "hours", it converts stuff like "NOW" and "-" into + # 0. 
In those cases, set it to hours_on, so they will cancel out in + # the "hours_ago" calculation later on. + if (hours == 0) + hours=hours_on + + if (test_seen) { + print "test="hours_on + print "test_type="test_type + print "test_status="status + print "test_progress="progress + } + # Not all drives report hours_on + if (hours_on && hours) { + total_hours_ago=(hours_on-hours) + days_ago=int(total_hours_ago/24) + hours_ago=(total_hours_ago % 24) + if (days_ago != 0) + ago_str=days_ago"d" + if (hours_ago !=0) + ago_str=ago_str""hours_ago"h" + print "test_ended="ago_str + } +} + +END {print "type="type; ORS="\n"; print ""} +'); +fi +type=$(echo "$out" | grep '^type=' | cut -d '=' -f 2) + +# If type is not set by now, either we don't have a block device +# or smartctl failed. Either way, default to ATA and set $out to +# nothing. +if [ -z "$type" ]; then + type="sata" + out= +fi + +case $script in +smart) + # Print temperature plus common predictors of drive failure + if [ "$type" = "sas" ] ; then + scripts="temp|health|r_ucor|w_ucor" + elif [ "$type" = "sata" ] ; then + scripts="temp|health|ata_err|realloc|rep_ucor|cmd_to|pend_sec|off_ucor" + elif [ "$type" = "nvme" ] ; then + scripts="temp|health|nvme_err" + fi + ;; +smartx) + # Print some other interesting stats + if [ "$type" = "sas" ] ; then + scripts="hours_on|defect|nonmed|r_proc|w_proc" + elif [ "$type" = "sata" ] ; then + scripts="hours_on|pwr_cyc" + elif [ "$type" = "nvme" ] ; then + scripts="hours_on|pwr_cyc" + fi + ;; +smart_test) + scripts="test_type|test_status|test_progress|test_ended" + ;; +*) + scripts="$script" +esac + +with_vals=$(echo "$out" | grep -E "$scripts") +if [ -n "$with_vals" ]; then + echo "$with_vals" + without_vals=$(echo "$scripts" | tr "|" "\n" | + grep -v -E "$(echo "$with_vals" | + awk -F "=" '{print $1}')" | awk '{print $0"="}') +else + without_vals=$(echo "$scripts" | tr "|" "\n" | awk '{print $0"="}') +fi + +if [ -n "$without_vals" ]; then + echo "$without_vals" +fi diff --git a/cmd/zpool/zpool.d/smart_test b/cmd/zpool/zpool.d/smart_test new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/smart_test @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/smartx b/cmd/zpool/zpool.d/smartx new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/smartx @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/temp b/cmd/zpool/zpool.d/temp new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/temp @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_ended b/cmd/zpool/zpool.d/test_ended new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/test_ended @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_progress b/cmd/zpool/zpool.d/test_progress new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/test_progress @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_status b/cmd/zpool/zpool.d/test_status new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/test_status @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/test_type b/cmd/zpool/zpool.d/test_type new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/test_type @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git 
a/cmd/zpool/zpool.d/upath b/cmd/zpool/zpool.d/upath new file mode 100755 index 000000000000..16a4327d4850 --- /dev/null +++ b/cmd/zpool/zpool.d/upath @@ -0,0 +1,7 @@ +#!/bin/sh +if [ "$1" = "-h" ] ; then + echo "Show the underlying path for a device." + exit +fi + +echo upath="$VDEV_UPATH" diff --git a/cmd/zpool/zpool.d/vendor b/cmd/zpool/zpool.d/vendor new file mode 120000 index 000000000000..7d1e766add99 --- /dev/null +++ b/cmd/zpool/zpool.d/vendor @@ -0,0 +1 @@ +lsblk \ No newline at end of file diff --git a/cmd/zpool/zpool.d/w_proc b/cmd/zpool/zpool.d/w_proc new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/w_proc @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool.d/w_ucor b/cmd/zpool/zpool.d/w_ucor new file mode 120000 index 000000000000..94f22861f0ce --- /dev/null +++ b/cmd/zpool/zpool.d/w_ucor @@ -0,0 +1 @@ +smart \ No newline at end of file diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c new file mode 100644 index 000000000000..5f3153bca2c2 --- /dev/null +++ b/cmd/zpool/zpool_iter.c @@ -0,0 +1,757 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2016 Igor Kozhukhov . + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "zpool_util.h" + +/* + * Private interface for iterating over pools specified on the command line. + * Most consumers will call for_each_pool, but in order to support iostat, we + * allow fined grained control through the zpool_list_t interface. + */ + +typedef struct zpool_node { + zpool_handle_t *zn_handle; + uu_avl_node_t zn_avlnode; + int zn_mark; +} zpool_node_t; + +struct zpool_list { + boolean_t zl_findall; + uu_avl_t *zl_avl; + uu_avl_pool_t *zl_pool; + zprop_list_t **zl_proplist; +}; + +/* ARGSUSED */ +static int +zpool_compare(const void *larg, const void *rarg, void *unused) +{ + zpool_handle_t *l = ((zpool_node_t *)larg)->zn_handle; + zpool_handle_t *r = ((zpool_node_t *)rarg)->zn_handle; + const char *lname = zpool_get_name(l); + const char *rname = zpool_get_name(r); + + return (strcmp(lname, rname)); +} + +/* + * Callback function for pool_list_get(). Adds the given pool to the AVL tree + * of known pools. 
+ */ +static int +add_pool(zpool_handle_t *zhp, void *data) +{ + zpool_list_t *zlp = data; + zpool_node_t *node = safe_malloc(sizeof (zpool_node_t)); + uu_avl_index_t idx; + + node->zn_handle = zhp; + uu_avl_node_init(node, &node->zn_avlnode, zlp->zl_pool); + if (uu_avl_find(zlp->zl_avl, node, NULL, &idx) == NULL) { + if (zlp->zl_proplist && + zpool_expand_proplist(zhp, zlp->zl_proplist) != 0) { + zpool_close(zhp); + free(node); + return (-1); + } + uu_avl_insert(zlp->zl_avl, node, idx); + } else { + zpool_close(zhp); + free(node); + return (-1); + } + + return (0); +} + +/* + * Create a list of pools based on the given arguments. If we're given no + * arguments, then iterate over all pools in the system and add them to the AVL + * tree. Otherwise, add only those pools explicitly specified on the command + * line. + */ +zpool_list_t * +pool_list_get(int argc, char **argv, zprop_list_t **proplist, int *err) +{ + zpool_list_t *zlp; + + zlp = safe_malloc(sizeof (zpool_list_t)); + + zlp->zl_pool = uu_avl_pool_create("zfs_pool", sizeof (zpool_node_t), + offsetof(zpool_node_t, zn_avlnode), zpool_compare, UU_DEFAULT); + + if (zlp->zl_pool == NULL) + zpool_no_memory(); + + if ((zlp->zl_avl = uu_avl_create(zlp->zl_pool, NULL, + UU_DEFAULT)) == NULL) + zpool_no_memory(); + + zlp->zl_proplist = proplist; + + if (argc == 0) { + (void) zpool_iter(g_zfs, add_pool, zlp); + zlp->zl_findall = B_TRUE; + } else { + int i; + + for (i = 0; i < argc; i++) { + zpool_handle_t *zhp; + + if ((zhp = zpool_open_canfail(g_zfs, argv[i])) != + NULL) { + if (add_pool(zhp, zlp) != 0) + *err = B_TRUE; + } else { + *err = B_TRUE; + } + } + } + + return (zlp); +} + +/* + * Search for any new pools, adding them to the list. We only add pools when no + * arguments were given on the command line. Otherwise, we keep the list fixed + * as those that were explicitly specified. + */ +void +pool_list_update(zpool_list_t *zlp) +{ + if (zlp->zl_findall) + (void) zpool_iter(g_zfs, add_pool, zlp); +} + +/* + * Iterate over all pools in the list, executing the callback for each. + */ +int +pool_list_iter(zpool_list_t *zlp, int unavail, zpool_iter_f func, + void *data) +{ + zpool_node_t *node, *next_node; + int ret = 0; + + for (node = uu_avl_first(zlp->zl_avl); node != NULL; node = next_node) { + next_node = uu_avl_next(zlp->zl_avl, node); + if (zpool_get_state(node->zn_handle) != POOL_STATE_UNAVAIL || + unavail) + ret |= func(node->zn_handle, data); + } + + return (ret); +} + +/* + * Remove the given pool from the list. When running iostat, we want to remove + * those pools that no longer exist. + */ +void +pool_list_remove(zpool_list_t *zlp, zpool_handle_t *zhp) +{ + zpool_node_t search, *node; + + search.zn_handle = zhp; + if ((node = uu_avl_find(zlp->zl_avl, &search, NULL, NULL)) != NULL) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } +} + +/* + * Free all the handles associated with this list. + */ +void +pool_list_free(zpool_list_t *zlp) +{ + uu_avl_walk_t *walk; + zpool_node_t *node; + + if ((walk = uu_avl_walk_start(zlp->zl_avl, UU_WALK_ROBUST)) == NULL) { + (void) fprintf(stderr, + gettext("internal error: out of memory")); + exit(1); + } + + while ((node = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(zlp->zl_avl, node); + zpool_close(node->zn_handle); + free(node); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(zlp->zl_avl); + uu_avl_pool_destroy(zlp->zl_pool); + + free(zlp); +} + +/* + * Returns the number of elements in the pool list.
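(For orientation, a consumer is expected to pair pool_list_get() with pool_list_iter() and pool_list_free(), much as the iostat path does. A minimal sketch, not part of the import; print_pool_cb and print_pools are invented, and g_zfs is assumed initialized as elsewhere in this file:

/* Callback matching zpool_iter_f: print one pool name per line. */
static int
print_pool_cb(zpool_handle_t *zhp, void *data)
{
	(void) data;
	(void) printf("%s\n", zpool_get_name(zhp));
	return (0);
}

static int
print_pools(int argc, char **argv)
{
	zpool_list_t *list;
	int err = 0;

	/* argc == 0 means every imported pool; otherwise only those named. */
	if ((list = pool_list_get(argc, argv, NULL, &err)) == NULL)
		return (1);

	/* B_FALSE: skip pools whose state is POOL_STATE_UNAVAIL. */
	err |= pool_list_iter(list, B_FALSE, print_pool_cb, NULL);
	pool_list_free(list);
	return (err);
}

End of sketch.)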
+ */ +int +pool_list_count(zpool_list_t *zlp) +{ + return (uu_avl_numnodes(zlp->zl_avl)); +} + +/* + * High level function which iterates over all pools given on the command line, + * using the pool_list_* interfaces. + */ +int +for_each_pool(int argc, char **argv, boolean_t unavail, + zprop_list_t **proplist, zpool_iter_f func, void *data) +{ + zpool_list_t *list; + int ret = 0; + + if ((list = pool_list_get(argc, argv, proplist, &ret)) == NULL) + return (1); + + if (pool_list_iter(list, unavail, func, data) != 0) + ret = 1; + + pool_list_free(list); + + return (ret); +} + +static int +for_each_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, pool_vdev_iter_f func, + void *data) +{ + nvlist_t **child; + uint_t c, children; + int ret = 0; + int i; + char *type; + + const char *list[] = { + ZPOOL_CONFIG_SPARES, + ZPOOL_CONFIG_L2CACHE, + ZPOOL_CONFIG_CHILDREN + }; + + for (i = 0; i < ARRAY_SIZE(list); i++) { + if (nvlist_lookup_nvlist_array(nv, list[i], &child, + &children) == 0) { + for (c = 0; c < children; c++) { + uint64_t ishole = 0; + + (void) nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole); + + if (ishole) + continue; + + ret |= for_each_vdev_cb(zhp, child[c], func, + data); + } + } + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (ret); + + /* Don't run our function on root vdevs */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0) { + ret |= func(zhp, nv, data); + } + + return (ret); +} + +/* + * This is the equivalent of for_each_pool() for vdevs. It iterates through + * all vdevs in the pool, ignoring root vdevs and holes, calling func() on + * each one. + * + * @zhp: Zpool handle + * @func: Function to call on each vdev + * @data: Custom data to pass to the function + */ +int +for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data) +{ + nvlist_t *config, *nvroot = NULL; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + } + return (for_each_vdev_cb(zhp, nvroot, func, data)); +} + +/* + * Process the vcdl->data[] array to figure out all the unique column + * names and their widths. When this function is done, vcdl->uniq_cols, + * vcdl->uniq_cols_cnt, and vcdl->uniq_cols_width will be filled in. + */ +static void +process_unique_cmd_columns(vdev_cmd_data_list_t *vcdl) +{ + char **uniq_cols = NULL, **tmp = NULL; + int *uniq_cols_width; + vdev_cmd_data_t *data; + int cnt = 0; + int k; + + /* For each vdev */ + for (int i = 0; i < vcdl->count; i++) { + data = &vcdl->data[i]; + /* For each column the vdev reported */ + for (int j = 0; j < data->cols_cnt; j++) { + /* Is this column in our list of unique column names? */ + for (k = 0; k < cnt; k++) { + if (strcmp(data->cols[j], uniq_cols[k]) == 0) + break; /* yes it is */ + } + if (k == cnt) { + /* No entry for column, add to list */ + tmp = realloc(uniq_cols, sizeof (*uniq_cols) * + (cnt + 1)); + if (tmp == NULL) + break; /* Nothing we can do... */ + uniq_cols = tmp; + uniq_cols[cnt] = data->cols[j]; + cnt++; + } + } + } + + /* + * We now have a list of all the unique column names. Figure out the + * max width of each column by looking at the column name and all its + * values.
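(A usage sketch for for_each_vdev(), not part of the import; print_vdev_cb is invented and pool_vdev_iter_f is the callback typedef assumed to come from zpool_util.h:

/* Callback matching pool_vdev_iter_f: print each vdev's display name. */
static int
print_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *data)
{
	char *name = zpool_vdev_name(g_zfs, zhp, nv, VDEV_NAME_PATH);

	(void) data;
	(void) printf("  %s\n", name);
	free(name);
	return (0);
}

Given an open handle, (void) for_each_vdev(zhp, print_vdev_cb, NULL); then visits every non-root, non-hole vdev, including spares and L2ARC devices.)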
+ */ + uniq_cols_width = safe_malloc(sizeof (*uniq_cols_width) * cnt); + for (int i = 0; i < cnt; i++) { + /* Start off with the column title's width */ + uniq_cols_width[i] = strlen(uniq_cols[i]); + /* For each vdev */ + for (int j = 0; j < vcdl->count; j++) { + /* For each of the vdev's values in a column */ + data = &vcdl->data[j]; + for (k = 0; k < data->cols_cnt; k++) { + /* Does this vdev have a value for this col? */ + if (strcmp(data->cols[k], uniq_cols[i]) == 0) { + /* Is the value width larger? */ + uniq_cols_width[i] = + MAX(uniq_cols_width[i], + strlen(data->lines[k])); + } + } + } + } + + vcdl->uniq_cols = uniq_cols; + vcdl->uniq_cols_cnt = cnt; + vcdl->uniq_cols_width = uniq_cols_width; +} + + +/* + * Process a line of command output + * + * When running 'zpool iostat|status -c' the lines of output can either be + * in the form of: + * + * column_name=value + * + * Or just: + * + * value + * + * Process the column_name (if any) and value. + * + * Returns 0 if the line was processed and more lines can still be + * processed. + * + * Returns 1 if this was the last line to process, or error. + */ +static int +vdev_process_cmd_output(vdev_cmd_data_t *data, char *line) +{ + char *col = NULL; + char *val = line; + char *equals; + char **tmp; + + if (line == NULL) + return (1); + + equals = strchr(line, '='); + if (equals != NULL) { + /* + * We have a 'column=value' type line. Split it into the + * column and value strings by turning the '=' into a '\0'. + */ + *equals = '\0'; + col = line; + val = equals + 1; + } else { + val = line; + } + + /* Do we already have a column by this name? If so, skip it. */ + if (col != NULL) { + for (int i = 0; i < data->cols_cnt; i++) { + if (strcmp(col, data->cols[i]) == 0) + return (0); /* Duplicate, skip */ + } + } + + if (val != NULL) { + tmp = realloc(data->lines, + (data->lines_cnt + 1) * sizeof (*data->lines)); + if (tmp == NULL) + return (1); + + data->lines = tmp; + data->lines[data->lines_cnt] = strdup(val); + data->lines_cnt++; + } + + if (col != NULL) { + tmp = realloc(data->cols, + (data->cols_cnt + 1) * sizeof (*data->cols)); + if (tmp == NULL) + return (1); + + data->cols = tmp; + data->cols[data->cols_cnt] = strdup(col); + data->cols_cnt++; + } + + if (val != NULL && col == NULL) + return (1); + + return (0); +} + +/* + * Run the cmd and store results in *data. + */ +static void +vdev_run_cmd(vdev_cmd_data_t *data, char *cmd) +{ + int rc; + char *argv[2] = {cmd, 0}; + char *env[5] = {"PATH=/bin:/sbin:/usr/bin:/usr/sbin", NULL, NULL, NULL, + NULL}; + char **lines = NULL; + int lines_cnt = 0; + int i; + + /* Set up our custom environment variables */ + rc = asprintf(&env[1], "VDEV_PATH=%s", + data->path ? data->path : ""); + if (rc == -1) + goto out; + + rc = asprintf(&env[2], "VDEV_UPATH=%s", + data->upath ? data->upath : ""); + if (rc == -1) + goto out; + + rc = asprintf(&env[3], "VDEV_ENC_SYSFS_PATH=%s", + data->vdev_enc_sysfs_path ?
+ data->vdev_enc_sysfs_path : ""); + if (rc == -1) + goto out; + + /* Run the command */ + rc = libzfs_run_process_get_stdout_nopath(cmd, argv, env, &lines, + &lines_cnt); + if (rc != 0) + goto out; + + /* Process the output we got */ + for (i = 0; i < lines_cnt; i++) + if (vdev_process_cmd_output(data, lines[i]) != 0) + break; + +out: + if (lines != NULL) + libzfs_free_str_array(lines, lines_cnt); + + /* Start with i = 1 since env[0] was statically allocated */ + for (i = 1; i < ARRAY_SIZE(env); i++) + if (env[i] != NULL) + free(env[i]); +} + +/* + * Generate the search path for zpool iostat/status -c scripts. + * The string returned must be freed. + */ +char * +zpool_get_cmd_search_path(void) +{ + const char *env; + char *sp = NULL; + + env = getenv("ZPOOL_SCRIPTS_PATH"); + if (env != NULL) + return (strdup(env)); + + env = getenv("HOME"); + if (env != NULL) { + if (asprintf(&sp, "%s/.zpool.d:%s", + env, ZPOOL_SCRIPTS_DIR) != -1) { + return (sp); + } + } + + if (asprintf(&sp, "%s", ZPOOL_SCRIPTS_DIR) != -1) + return (sp); + + return (NULL); +} + +/* Thread function run for each vdev */ +static void +vdev_run_cmd_thread(void *cb_cmd_data) +{ + vdev_cmd_data_t *data = cb_cmd_data; + char *cmd = NULL, *cmddup, *cmdrest; + + cmddup = strdup(data->cmd); + if (cmddup == NULL) + return; + + cmdrest = cmddup; + while ((cmd = strtok_r(cmdrest, ",", &cmdrest))) { + char *dir = NULL, *sp, *sprest; + char fullpath[MAXPATHLEN]; + + if (strchr(cmd, '/') != NULL) + continue; + + sp = zpool_get_cmd_search_path(); + if (sp == NULL) + continue; + + sprest = sp; + while ((dir = strtok_r(sprest, ":", &sprest))) { + if (snprintf(fullpath, sizeof (fullpath), + "%s/%s", dir, cmd) == -1) + continue; + + if (access(fullpath, X_OK) == 0) { + vdev_run_cmd(data, fullpath); + break; + } + } + free(sp); + } + free(cmddup); +} + +/* For each vdev in the pool run a command */ +static int +for_each_vdev_run_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_vcdl) +{ + vdev_cmd_data_list_t *vcdl = cb_vcdl; + vdev_cmd_data_t *data; + char *path = NULL; + char *vname = NULL; + char *vdev_enc_sysfs_path = NULL; + int i, match = 0; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return (1); + + nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &vdev_enc_sysfs_path); + + /* Spares show more than once if they're in use, so skip if exists */ + for (i = 0; i < vcdl->count; i++) { + if ((strcmp(vcdl->data[i].path, path) == 0) && + (strcmp(vcdl->data[i].pool, zpool_get_name(zhp)) == 0)) { + /* vdev already exists, skip it */ + return (0); + } + } + + /* Check for selected vdevs here, if any */ + for (i = 0; i < vcdl->vdev_names_count; i++) { + vname = zpool_vdev_name(g_zfs, zhp, nv, vcdl->cb_name_flags); + if (strcmp(vcdl->vdev_names[i], vname) == 0) { + free(vname); + match = 1; + break; /* match */ + } + free(vname); + } + + /* If we selected vdevs, and this isn't one of them, then bail out */ + if (!match && vcdl->vdev_names_count) + return (0); + + /* + * Resize our array and add in the new element. 
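(One subtlety in vdev_run_cmd_thread() above: strtok_r() is called with the same variable as both the string and the save-pointer argument. That works because after each call the save pointer refers to the unparsed remainder, so feeding it back in advances one token per iteration; POSIX's canonical pattern passes NULL after the first call, but common libcs store the remainder as described. A self-contained sketch with an invented input:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char buf[] = "iostat,smart,upath";	/* hypothetical -c argument */
	char *rest = buf;
	char *cmd;

	/* Each pass consumes one comma-separated token from 'buf'. */
	while ((cmd = strtok_r(rest, ",", &rest)) != NULL)
		(void) printf("token: %s\n", cmd);

	return (0);
}

End of sketch.)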
+ */ + if (!(vcdl->data = realloc(vcdl->data, + sizeof (*vcdl->data) * (vcdl->count + 1)))) + return (ENOMEM); /* couldn't realloc */ + + data = &vcdl->data[vcdl->count]; + + data->pool = strdup(zpool_get_name(zhp)); + data->path = strdup(path); + data->upath = zfs_get_underlying_path(path); + data->cmd = vcdl->cmd; + data->lines = data->cols = NULL; + data->lines_cnt = data->cols_cnt = 0; + if (vdev_enc_sysfs_path) + data->vdev_enc_sysfs_path = strdup(vdev_enc_sysfs_path); + else + data->vdev_enc_sysfs_path = NULL; + + vcdl->count++; + + return (0); +} + +/* Get the names and count of the vdevs */ +static int +all_pools_for_each_vdev_gather_cb(zpool_handle_t *zhp, void *cb_vcdl) +{ + return (for_each_vdev(zhp, for_each_vdev_run_cb, cb_vcdl)); +} + +/* + * Now that vcdl is populated with our complete list of vdevs, spawn + * off the commands. + */ +static void +all_pools_for_each_vdev_run_vcdl(vdev_cmd_data_list_t *vcdl) +{ + tpool_t *t; + + t = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); + if (t == NULL) + return; + + /* Spawn off the command for each vdev */ + for (int i = 0; i < vcdl->count; i++) { + (void) tpool_dispatch(t, vdev_run_cmd_thread, + (void *) &vcdl->data[i]); + } + + /* Wait for threads to finish */ + tpool_wait(t); + tpool_destroy(t); +} + +/* + * Run command 'cmd' on all vdevs in all pools in argv. Saves the first line of + * output from the command in vcdl->data[].lines for all vdevs. If you want + * to run the command on only certain vdevs, fill in g_zfs, vdev_names, + * vdev_names_count, and cb_name_flags. Otherwise leave them as zero. + * + * Returns a vdev_cmd_data_list_t that must be freed with + * free_vdev_cmd_data_list(); + */ +vdev_cmd_data_list_t * +all_pools_for_each_vdev_run(int argc, char **argv, char *cmd, + libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count, + int cb_name_flags) +{ + vdev_cmd_data_list_t *vcdl; + vcdl = safe_malloc(sizeof (vdev_cmd_data_list_t)); + vcdl->cmd = cmd; + + vcdl->vdev_names = vdev_names; + vcdl->vdev_names_count = vdev_names_count; + vcdl->cb_name_flags = cb_name_flags; + vcdl->g_zfs = g_zfs; + + /* Gather our list of all vdevs in all pools */ + for_each_pool(argc, argv, B_TRUE, NULL, + all_pools_for_each_vdev_gather_cb, vcdl); + + /* Run command on all vdevs in all pools */ + all_pools_for_each_vdev_run_vcdl(vcdl); + + /* + * vcdl->data[] now contains all the column names and values for each + * vdev. We need to process that into a master list of unique column + * names, and figure out the width of each column.
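(The fan-out in all_pools_for_each_vdev_run_vcdl() above uses the Illumos-style thread pool that OpenZFS ships via thread_pool.h. A minimal sketch of the same create/dispatch/wait/destroy lifecycle, with invented jobs; not part of the import:

#include <stdio.h>
#include <unistd.h>
#include <thread_pool.h>

static void
work(void *arg)
{
	(void) printf("processing %s\n", (char *)arg);
}

int
main(void)
{
	char *jobs[] = { "sda", "sdb", "sdc" };	/* hypothetical vdevs */

	/* At most 5 threads per CPU; idle threads do not linger. */
	tpool_t *t = tpool_create(1, 5 * sysconf(_SC_NPROCESSORS_ONLN),
	    0, NULL);

	if (t == NULL)
		return (1);

	for (int i = 0; i < 3; i++)
		(void) tpool_dispatch(t, work, jobs[i]);

	tpool_wait(t);		/* block until every dispatched job is done */
	tpool_destroy(t);
	return (0);
}

End of sketch.)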
+ */ + process_unique_cmd_columns(vcdl); + + return (vcdl); +} + +/* + * Free the vdev_cmd_data_list_t created by all_pools_for_each_vdev_run() + */ +void +free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl) +{ + free(vcdl->uniq_cols); + free(vcdl->uniq_cols_width); + + for (int i = 0; i < vcdl->count; i++) { + free(vcdl->data[i].path); + free(vcdl->data[i].pool); + free(vcdl->data[i].upath); + + for (int j = 0; j < vcdl->data[i].lines_cnt; j++) + free(vcdl->data[i].lines[j]); + + free(vcdl->data[i].lines); + + for (int j = 0; j < vcdl->data[i].cols_cnt; j++) + free(vcdl->data[i].cols[j]); + + free(vcdl->data[i].cols); + free(vcdl->data[i].vdev_enc_sysfs_path); + } + free(vcdl->data); + free(vcdl); +} diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c new file mode 100644 index 000000000000..f3756a5d9547 --- /dev/null +++ b/cmd/zpool/zpool_main.c @@ -0,0 +1,10326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012 by Frederik Wessels. All rights reserved. + * Copyright (c) 2012 by Cyril Plisko. All rights reserved. + * Copyright (c) 2013 by Prasad Joshi (sTec). All rights reserved. + * Copyright 2016 Igor Kozhukhov . + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. 
+ * Copyright (c) 2019, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#include "zpool_util.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +#include "statcommon.h" + +libzfs_handle_t *g_zfs; + +static int zpool_do_create(int, char **); +static int zpool_do_destroy(int, char **); + +static int zpool_do_add(int, char **); +static int zpool_do_remove(int, char **); +static int zpool_do_labelclear(int, char **); + +static int zpool_do_checkpoint(int, char **); + +static int zpool_do_list(int, char **); +static int zpool_do_iostat(int, char **); +static int zpool_do_status(int, char **); + +static int zpool_do_online(int, char **); +static int zpool_do_offline(int, char **); +static int zpool_do_clear(int, char **); +static int zpool_do_reopen(int, char **); + +static int zpool_do_reguid(int, char **); + +static int zpool_do_attach(int, char **); +static int zpool_do_detach(int, char **); +static int zpool_do_replace(int, char **); +static int zpool_do_split(int, char **); + +static int zpool_do_initialize(int, char **); +static int zpool_do_scrub(int, char **); +static int zpool_do_resilver(int, char **); +static int zpool_do_trim(int, char **); + +static int zpool_do_import(int, char **); +static int zpool_do_export(int, char **); + +static int zpool_do_upgrade(int, char **); + +static int zpool_do_history(int, char **); +static int zpool_do_events(int, char **); + +static int zpool_do_get(int, char **); +static int zpool_do_set(int, char **); + +static int zpool_do_sync(int, char **); + +static int zpool_do_version(int, char **); + +static int zpool_do_wait(int, char **); + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ + +#ifdef DEBUG +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} +#endif + +typedef enum { + HELP_ADD, + HELP_ATTACH, + HELP_CLEAR, + HELP_CREATE, + HELP_CHECKPOINT, + HELP_DESTROY, + HELP_DETACH, + HELP_EXPORT, + HELP_HISTORY, + HELP_IMPORT, + HELP_IOSTAT, + HELP_LABELCLEAR, + HELP_LIST, + HELP_OFFLINE, + HELP_ONLINE, + HELP_REPLACE, + HELP_REMOVE, + HELP_INITIALIZE, + HELP_SCRUB, + HELP_RESILVER, + HELP_TRIM, + HELP_STATUS, + HELP_UPGRADE, + HELP_EVENTS, + HELP_GET, + HELP_SET, + HELP_SPLIT, + HELP_SYNC, + HELP_REGUID, + HELP_REOPEN, + HELP_VERSION, + HELP_WAIT +} zpool_help_t; + + +/* + * Flags for stats to display with "zpool iostats" + */ +enum iostat_type { + IOS_DEFAULT = 0, + IOS_LATENCY = 1, + IOS_QUEUES = 2, + IOS_L_HISTO = 3, + IOS_RQ_HISTO = 4, + IOS_COUNT, /* always last element */ +}; + +/* iostat_type entries as bitmasks */ +#define IOS_DEFAULT_M (1ULL << IOS_DEFAULT) +#define IOS_LATENCY_M (1ULL << IOS_LATENCY) +#define IOS_QUEUES_M (1ULL << IOS_QUEUES) +#define IOS_L_HISTO_M (1ULL << IOS_L_HISTO) +#define IOS_RQ_HISTO_M (1ULL << IOS_RQ_HISTO) + +/* Mask of all the histo bits */ +#define IOS_ANYHISTO_M (IOS_L_HISTO_M | IOS_RQ_HISTO_M) + +/* + * Lookup table for iostat flags to nvlist names. Basically a list + * of all the nvlists a flag requires. Also specifies the order in + * which data gets printed in zpool iostat. 
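(To make the bitmask plumbing concrete: an editorial worked example, where highbit64_demo stands in for the real highbit64() from the ZFS headers, which returns the 1-based index of the highest set bit. IOS_HISTO_IDX(), defined just below, relies on exactly one histogram bit being set:

#include <stdio.h>
#include <stdint.h>

static int
highbit64_demo(uint64_t v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	uint64_t l_histo = 1ULL << 3;			/* IOS_L_HISTO_M */
	uint64_t anyhisto = (1ULL << 3) | (1ULL << 4);	/* IOS_ANYHISTO_M */

	/* IOS_HISTO_IDX(l_histo) == highbit64(0x8) - 1 == 3 == IOS_L_HISTO */
	(void) printf("%d\n", highbit64_demo(l_histo & anyhisto) - 1);
	return (0);
}

End of example.)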
+ */ +static const char *vsx_type_to_nvlist[IOS_COUNT][13] = { + [IOS_L_HISTO] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + NULL}, + [IOS_LATENCY] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + NULL}, + [IOS_QUEUES] = { + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + NULL}, + [IOS_RQ_HISTO] = { + ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, + ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, + ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, + NULL}, +}; + + +/* + * Given a cb->cb_flags with a histogram bit set, return the iostat_type. + * Right now, only one histo bit is ever set at one time, so we can + * just do a highbit64(a) + */ +#define IOS_HISTO_IDX(a) (highbit64(a & IOS_ANYHISTO_M) - 1) + +typedef struct zpool_command { + const char *name; + int (*func)(int, char **); + zpool_help_t usage; +} zpool_command_t; + +/* + * Master command table. Each ZFS command has a name, associated function, and + * usage message. The usage messages need to be internationalized, so we have + * to have a function to return the usage message based on a command index. + * + * These commands are organized according to how they are displayed in the usage + * message. An empty command (one with a NULL name) indicates an empty line in + * the generic usage message. 
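(For context, the table defined just below is consumed by a simple linear lookup in main(), which lies outside this hunk; the pattern is roughly the following sketch, with run_command_sketch invented for illustration:

/* Find a command by name, skipping the NULL spacer entries, and run it. */
static int
run_command_sketch(const char *name, int argc, char **argv)
{
	for (size_t i = 0; i < NCOMMAND; i++) {
		if (command_table[i].name != NULL &&
		    strcmp(name, command_table[i].name) == 0) {
			current_command = &command_table[i];
			return (command_table[i].func(argc, argv));
		}
	}
	return (2);	/* unknown command */
}

End of sketch.)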
+ */ +static zpool_command_t command_table[] = { + { "version", zpool_do_version, HELP_VERSION }, + { NULL }, + { "create", zpool_do_create, HELP_CREATE }, + { "destroy", zpool_do_destroy, HELP_DESTROY }, + { NULL }, + { "add", zpool_do_add, HELP_ADD }, + { "remove", zpool_do_remove, HELP_REMOVE }, + { NULL }, + { "labelclear", zpool_do_labelclear, HELP_LABELCLEAR }, + { NULL }, + { "checkpoint", zpool_do_checkpoint, HELP_CHECKPOINT }, + { NULL }, + { "list", zpool_do_list, HELP_LIST }, + { "iostat", zpool_do_iostat, HELP_IOSTAT }, + { "status", zpool_do_status, HELP_STATUS }, + { NULL }, + { "online", zpool_do_online, HELP_ONLINE }, + { "offline", zpool_do_offline, HELP_OFFLINE }, + { "clear", zpool_do_clear, HELP_CLEAR }, + { "reopen", zpool_do_reopen, HELP_REOPEN }, + { NULL }, + { "attach", zpool_do_attach, HELP_ATTACH }, + { "detach", zpool_do_detach, HELP_DETACH }, + { "replace", zpool_do_replace, HELP_REPLACE }, + { "split", zpool_do_split, HELP_SPLIT }, + { NULL }, + { "initialize", zpool_do_initialize, HELP_INITIALIZE }, + { "resilver", zpool_do_resilver, HELP_RESILVER }, + { "scrub", zpool_do_scrub, HELP_SCRUB }, + { "trim", zpool_do_trim, HELP_TRIM }, + { NULL }, + { "import", zpool_do_import, HELP_IMPORT }, + { "export", zpool_do_export, HELP_EXPORT }, + { "upgrade", zpool_do_upgrade, HELP_UPGRADE }, + { "reguid", zpool_do_reguid, HELP_REGUID }, + { NULL }, + { "history", zpool_do_history, HELP_HISTORY }, + { "events", zpool_do_events, HELP_EVENTS }, + { NULL }, + { "get", zpool_do_get, HELP_GET }, + { "set", zpool_do_set, HELP_SET }, + { "sync", zpool_do_sync, HELP_SYNC }, + { NULL }, + { "wait", zpool_do_wait, HELP_WAIT }, +}; + +#define NCOMMAND (ARRAY_SIZE(command_table)) + +#define VDEV_ALLOC_CLASS_LOGS "logs" + +static zpool_command_t *current_command; +static char history_str[HIS_MAX_RECORD_LEN]; +static boolean_t log_history = B_TRUE; +static uint_t timestamp_fmt = NODATE; + +static const char * +get_usage(zpool_help_t idx) +{ + switch (idx) { + case HELP_ADD: + return (gettext("\tadd [-fgLnP] [-o property=value] " + " ...\n")); + case HELP_ATTACH: + return (gettext("\tattach [-fsw] [-o property=value] " + " \n")); + case HELP_CLEAR: + return (gettext("\tclear [-nF] [device]\n")); + case HELP_CREATE: + return (gettext("\tcreate [-fnd] [-o property=value] ... \n" + "\t [-O file-system-property=value] ... \n" + "\t [-m mountpoint] [-R root] ...\n")); + case HELP_CHECKPOINT: + return (gettext("\tcheckpoint [-d [-w]] ...\n")); + case HELP_DESTROY: + return (gettext("\tdestroy [-f] \n")); + case HELP_DETACH: + return (gettext("\tdetach \n")); + case HELP_EXPORT: + return (gettext("\texport [-af] ...\n")); + case HELP_HISTORY: + return (gettext("\thistory [-il] [] ...\n")); + case HELP_IMPORT: + return (gettext("\timport [-d dir] [-D]\n" + "\timport [-o mntopts] [-o property=value] ... \n" + "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " + "[-R root] [-F [-n]] -a\n" + "\timport [-o mntopts] [-o property=value] ... \n" + "\t [-d dir | -c cachefile] [-D] [-l] [-f] [-m] [-N] " + "[-R root] [-F [-n]]\n" + "\t [--rewind-to-checkpoint] [newpool]\n")); + case HELP_IOSTAT: + return (gettext("\tiostat [[[-c [script1,script2,...]" + "[-lq]]|[-rw]] [-T d | u] [-ghHLpPvy]\n" + "\t [[pool ...]|[pool vdev ...]|[vdev ...]]" + " [[-n] interval [count]]\n")); + case HELP_LABELCLEAR: + return (gettext("\tlabelclear [-f] \n")); + case HELP_LIST: + return (gettext("\tlist [-gHLpPv] [-o property[,...]] " + "[-T d|u] [pool] ... 
\n" + "\t [interval [count]]\n")); + case HELP_OFFLINE: + return (gettext("\toffline [-f] [-t] ...\n")); + case HELP_ONLINE: + return (gettext("\tonline [-e] ...\n")); + case HELP_REPLACE: + return (gettext("\treplace [-fsw] [-o property=value] " + " [new-device]\n")); + case HELP_REMOVE: + return (gettext("\tremove [-npsw] ...\n")); + case HELP_REOPEN: + return (gettext("\treopen [-n] \n")); + case HELP_INITIALIZE: + return (gettext("\tinitialize [-c | -s] [-w] " + "[ ...]\n")); + case HELP_SCRUB: + return (gettext("\tscrub [-s | -p] [-w] ...\n")); + case HELP_RESILVER: + return (gettext("\tresilver ...\n")); + case HELP_TRIM: + return (gettext("\ttrim [-dw] [-r ] [-c | -s] " + "[ ...]\n")); + case HELP_STATUS: + return (gettext("\tstatus [-c [script1,script2,...]] " + "[-igLpPstvxD] [-T d|u] [pool] ... \n" + "\t [interval [count]]\n")); + case HELP_UPGRADE: + return (gettext("\tupgrade\n" + "\tupgrade -v\n" + "\tupgrade [-V version] <-a | pool ...>\n")); + case HELP_EVENTS: + return (gettext("\tevents [-vHf [pool] | -c]\n")); + case HELP_GET: + return (gettext("\tget [-Hp] [-o \"all\" | field[,...]] " + "<\"all\" | property[,...]> ...\n")); + case HELP_SET: + return (gettext("\tset \n")); + case HELP_SPLIT: + return (gettext("\tsplit [-gLnPl] [-R altroot] [-o mntopts]\n" + "\t [-o property=value] " + "[ ...]\n")); + case HELP_REGUID: + return (gettext("\treguid \n")); + case HELP_SYNC: + return (gettext("\tsync [pool] ...\n")); + case HELP_VERSION: + return (gettext("\tversion\n")); + case HELP_WAIT: + return (gettext("\twait [-Hp] [-T d|u] [-t [,...]] " + " [interval]\n")); + } + + abort(); + /* NOTREACHED */ +} + +static void +zpool_collect_leaves(zpool_handle_t *zhp, nvlist_t *nvroot, nvlist_t *res) +{ + uint_t children = 0; + nvlist_t **child; + uint_t i; + + (void) nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (children == 0) { + char *path = zpool_vdev_name(g_zfs, zhp, nvroot, + VDEV_NAME_PATH); + + if (strcmp(path, VDEV_TYPE_INDIRECT) != 0 && + strcmp(path, VDEV_TYPE_HOLE) != 0) + fnvlist_add_boolean(res, path); + + free(path); + return; + } + + for (i = 0; i < children; i++) { + zpool_collect_leaves(zhp, child[i], res); + } +} + +/* + * Callback routine that will print out a pool property value. + */ +static int +print_prop_cb(int prop, void *cb) +{ + FILE *fp = cb; + + (void) fprintf(fp, "\t%-19s ", zpool_prop_to_name(prop)); + + if (zpool_prop_readonly(prop)) + (void) fprintf(fp, " NO "); + else + (void) fprintf(fp, " YES "); + + if (zpool_prop_values(prop) == NULL) + (void) fprintf(fp, "-\n"); + else + (void) fprintf(fp, "%s\n", zpool_prop_values(prop)); + + return (ZPROP_CONT); +} + +/* + * Display usage message. If we're inside a command, display only the usage for + * that command. Otherwise, iterate over the entire command table and display + * a complete usage message. + */ +static void +usage(boolean_t requested) +{ + FILE *fp = requested ? 
stdout : stderr; + + if (current_command == NULL) { + int i; + + (void) fprintf(fp, gettext("usage: zpool command args ...\n")); + (void) fprintf(fp, + gettext("where 'command' is one of the following:\n\n")); + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + (void) fprintf(fp, "\n"); + else + (void) fprintf(fp, "%s", + get_usage(command_table[i].usage)); + } + } else { + (void) fprintf(fp, gettext("usage:\n")); + (void) fprintf(fp, "%s", get_usage(current_command->usage)); + } + + if (current_command != NULL && + ((strcmp(current_command->name, "set") == 0) || + (strcmp(current_command->name, "get") == 0) || + (strcmp(current_command->name, "list") == 0))) { + + (void) fprintf(fp, + gettext("\nthe following properties are supported:\n")); + + (void) fprintf(fp, "\n\t%-19s %s %s\n\n", + "PROPERTY", "EDIT", "VALUES"); + + /* Iterate over all properties */ + (void) zprop_iter(print_prop_cb, fp, B_FALSE, B_TRUE, + ZFS_TYPE_POOL); + + (void) fprintf(fp, "\t%-19s ", "feature@..."); + (void) fprintf(fp, "YES disabled | enabled | active\n"); + + (void) fprintf(fp, gettext("\nThe feature@ properties must be " + "appended with a feature name.\nSee zpool-features(5).\n")); + } + + /* + * See comments at end of main(). + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + exit(requested ? 0 : 2); +} + +/* + * zpool initialize [-c | -s] [-w] [ ...] + * Initialize all unused blocks in the specified vdevs, or all vdevs in the pool + * if none specified. + * + * -c Cancel. Ends active initializing. + * -s Suspend. Initializing can then be restarted with no flags. + * -w Wait. Blocks until initializing has completed. + */ +int +zpool_do_initialize(int argc, char **argv) +{ + int c; + char *poolname; + zpool_handle_t *zhp; + nvlist_t *vdevs; + int err = 0; + boolean_t wait = B_FALSE; + + struct option long_options[] = { + {"cancel", no_argument, NULL, 'c'}, + {"suspend", no_argument, NULL, 's'}, + {"wait", no_argument, NULL, 'w'}, + {0, 0, 0, 0} + }; + + pool_initialize_func_t cmd_type = POOL_INITIALIZE_START; + while ((c = getopt_long(argc, argv, "csw", long_options, NULL)) != -1) { + switch (c) { + case 'c': + if (cmd_type != POOL_INITIALIZE_START && + cmd_type != POOL_INITIALIZE_CANCEL) { + (void) fprintf(stderr, gettext("-c cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_CANCEL; + break; + case 's': + if (cmd_type != POOL_INITIALIZE_START && + cmd_type != POOL_INITIALIZE_SUSPEND) { + (void) fprintf(stderr, gettext("-s cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_INITIALIZE_SUSPEND; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + if (wait && (cmd_type != POOL_INITIALIZE_START)) { + (void) fprintf(stderr, gettext("-w cannot be used with -c or " + "-s\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + vdevs = fnvlist_alloc(); + if (argc == 1) { + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = 
fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + } else { + for (int i = 1; i < argc; i++) { + fnvlist_add_boolean(vdevs, argv[i]); + } + } + + if (wait) + err = zpool_initialize_wait(zhp, cmd_type, vdevs); + else + err = zpool_initialize(zhp, cmd_type, vdevs); + + fnvlist_free(vdevs); + zpool_close(zhp); + + return (err); +} + +/* + * print a pool vdev config for dry runs + */ +static void +print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, + const char *match, int name_flags) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + boolean_t printed = B_FALSE; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + if (name != NULL) + (void) printf("\t%*s%s\n", indent, "", name); + return; + } + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *class = ""; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + class = VDEV_ALLOC_BIAS_LOG; + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &class); + if (strcmp(match, class) != 0) + continue; + + if (!printed && name != NULL) { + (void) printf("\t%*s%s\n", indent, "", name); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], name_flags); + print_vdev_tree(zhp, vname, child[c], indent + 2, "", + name_flags); + free(vname); + } +} + +static boolean_t +prop_list_contains_feature(nvlist_t *proplist) +{ + nvpair_t *nvp; + for (nvp = nvlist_next_nvpair(proplist, NULL); NULL != nvp; + nvp = nvlist_next_nvpair(proplist, nvp)) { + if (zpool_prop_feature(nvpair_name(nvp))) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Add a property pair (name, string-value) into a property nvlist. + */ +static int +add_prop_list(const char *propname, char *propval, nvlist_t **props, + boolean_t poolprop) +{ + zpool_prop_t prop = ZPOOL_PROP_INVAL; + nvlist_t *proplist; + const char *normnm; + char *strval; + + if (*props == NULL && + nvlist_alloc(props, NV_UNIQUE_NAME, 0) != 0) { + (void) fprintf(stderr, + gettext("internal error: out of memory\n")); + return (1); + } + + proplist = *props; + + if (poolprop) { + const char *vname = zpool_prop_to_name(ZPOOL_PROP_VERSION); + + if ((prop = zpool_name_to_prop(propname)) == ZPOOL_PROP_INVAL && + !zpool_prop_feature(propname)) { + (void) fprintf(stderr, gettext("property '%s' is " + "not a valid pool property\n"), propname); + return (2); + } + + /* + * feature@ properties and version should not be specified + * at the same time. 
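(To make the calling convention concrete, a hedged sketch of how callers accumulate properties; build_create_props is invented and the property choices are arbitrary. Pool properties and feature@ flags pass poolprop as B_TRUE, root-filesystem properties pass B_FALSE, and combining a version property with any feature@ property trips the exclusivity rule stated just above:

/* Accumulate "-o"/"-O" style properties the way zpool_do_create() does. */
static int
build_create_props(nvlist_t **props, nvlist_t **fsprops)
{
	/* Pool property, as if "-o ashift=12" were given. */
	if (add_prop_list("ashift", "12", props, B_TRUE) != 0)
		return (1);

	/* Feature flag, as if "-o feature@encryption=enabled" were given. */
	if (add_prop_list("feature@encryption", "enabled", props,
	    B_TRUE) != 0)
		return (1);

	/* Root filesystem property, as if "-O compression=on" were given. */
	if (add_prop_list("compression", "on", fsprops, B_FALSE) != 0)
		return (1);

	return (0);
}

End of sketch.)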
+ */ + if ((prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname) && + nvlist_exists(proplist, vname)) || + (prop == ZPOOL_PROP_VERSION && + prop_list_contains_feature(proplist))) { + (void) fprintf(stderr, gettext("'feature@' and " + "'version' properties cannot be specified " + "together\n")); + return (2); + } + + + if (zpool_prop_feature(propname)) + normnm = propname; + else + normnm = zpool_prop_to_name(prop); + } else { + zfs_prop_t fsprop = zfs_name_to_prop(propname); + + if (zfs_prop_valid_for_type(fsprop, ZFS_TYPE_FILESYSTEM, + B_FALSE)) { + normnm = zfs_prop_to_name(fsprop); + } else if (zfs_prop_user(propname) || + zfs_prop_userquota(propname)) { + normnm = propname; + } else { + (void) fprintf(stderr, gettext("property '%s' is " + "not a valid filesystem property\n"), propname); + return (2); + } + } + + if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && + prop != ZPOOL_PROP_CACHEFILE) { + (void) fprintf(stderr, gettext("property '%s' " + "specified multiple times\n"), propname); + return (2); + } + + if (nvlist_add_string(proplist, normnm, propval) != 0) { + (void) fprintf(stderr, gettext("internal " + "error: out of memory\n")); + return (1); + } + + return (0); +} + +/* + * Set a default property pair (name, string-value) in a property nvlist + */ +static int +add_prop_list_default(const char *propname, char *propval, nvlist_t **props, + boolean_t poolprop) +{ + char *pval; + + if (nvlist_lookup_string(*props, propname, &pval) == 0) + return (0); + + return (add_prop_list(propname, propval, props, B_TRUE)); +} + +/* + * zpool add [-fgLnP] [-o property=value] ... + * + * -f Force addition of devices, even if they appear in use + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -n Do not add the devices, but display the resulting layout if + * they were to be added. + * -o Set property=value. + * -P Display full path for vdev name. + * + * Adds the given vdevs to 'pool'. As with create, the bulk of this work is + * handled by make_root_vdev(), which constructs the nvlist needed to pass to + * libzfs. 
+ */ +int +zpool_do_add(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t dryrun = B_FALSE; + int name_flags = 0; + int c; + nvlist_t *nvroot; + char *poolname; + int ret; + zpool_handle_t *zhp; + nvlist_t *config; + nvlist_t *props = NULL; + char *propval; + + /* check options */ + while ((c = getopt(argc, argv, "fgLno:P")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'g': + name_flags |= VDEV_NAME_GUID; + break; + case 'L': + name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + usage(B_FALSE); + } + *propval = '\0'; + propval++; + + if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) || + (add_prop_list(optarg, propval, &props, B_TRUE))) + usage(B_FALSE); + break; + case 'P': + name_flags |= VDEV_NAME_PATH; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing vdev specification\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + argc--; + argv++; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("pool '%s' is unavailable\n"), + poolname); + zpool_close(zhp); + return (1); + } + + /* unless manually specified use "ashift" pool property (if set) */ + if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) { + int intval; + zprop_source_t src; + char strval[ZPOOL_MAXPROPLEN]; + + intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src); + if (src != ZPROP_SRC_DEFAULT) { + (void) sprintf(strval, "%" PRId32, intval); + verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval, + &props, B_TRUE) == 0); + } + } + + /* pass off to make_root_vdev for processing */ + nvroot = make_root_vdev(zhp, props, force, !force, B_FALSE, dryrun, + argc, argv); + if (nvroot == NULL) { + zpool_close(zhp); + return (1); + } + + if (dryrun) { + nvlist_t *poolnvroot; + nvlist_t **l2child; + uint_t l2children, c; + char *vname; + boolean_t hadcache = B_FALSE; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &poolnvroot) == 0); + + (void) printf(gettext("would update '%s' to the following " + "configuration:\n"), zpool_get_name(zhp)); + + /* print original main pool and new tree */ + print_vdev_tree(zhp, poolname, poolnvroot, 0, "", + name_flags | VDEV_NAME_TYPE_ID); + print_vdev_tree(zhp, NULL, nvroot, 0, "", name_flags); + + /* print other classes: 'dedup', 'special', and 'log' */ + if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_DEDUP)) { + print_vdev_tree(zhp, "dedup", poolnvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_DEDUP)) { + print_vdev_tree(zhp, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, name_flags); + } + + if (zfs_special_devs(poolnvroot, VDEV_ALLOC_BIAS_SPECIAL)) { + print_vdev_tree(zhp, "special", poolnvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, name_flags); + } else if (zfs_special_devs(nvroot, VDEV_ALLOC_BIAS_SPECIAL)) { + print_vdev_tree(zhp, "special", nvroot, 0, 
+ VDEV_ALLOC_BIAS_SPECIAL, name_flags); + } + + if (num_logs(poolnvroot) > 0) { + print_vdev_tree(zhp, "logs", poolnvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + print_vdev_tree(zhp, NULL, nvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + } else if (num_logs(nvroot) > 0) { + print_vdev_tree(zhp, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, name_flags); + } + + /* Do the same for the caches */ + if (nvlist_lookup_nvlist_array(poolnvroot, ZPOOL_CONFIG_L2CACHE, + &l2child, &l2children) == 0 && l2children) { + hadcache = B_TRUE; + (void) printf(gettext("\tcache\n")); + for (c = 0; c < l2children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + l2child[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2child, &l2children) == 0 && l2children) { + if (!hadcache) + (void) printf(gettext("\tcache\n")); + for (c = 0; c < l2children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, + l2child[c], name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + + ret = 0; + } else { + ret = (zpool_add(zhp, nvroot) != 0); + } + + nvlist_free(props); + nvlist_free(nvroot); + zpool_close(zhp); + + return (ret); +} + +/* + * zpool remove [-npsw] ... + * + * Removes the given vdev from the pool. + */ +int +zpool_do_remove(int argc, char **argv) +{ + char *poolname; + int i, ret = 0; + zpool_handle_t *zhp = NULL; + boolean_t stop = B_FALSE; + int c; + boolean_t noop = B_FALSE; + boolean_t parsable = B_FALSE; + boolean_t wait = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "npsw")) != -1) { + switch (c) { + case 'n': + noop = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 's': + stop = B_TRUE; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + } + + poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, poolname)) == NULL) + return (1); + + if (stop && noop) { + (void) fprintf(stderr, gettext("stop request ignored\n")); + return (0); + } + + if (stop) { + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + if (zpool_vdev_remove_cancel(zhp) != 0) + ret = 1; + if (wait) { + (void) fprintf(stderr, gettext("invalid option " + "combination: -w cannot be used with -s\n")); + usage(B_FALSE); + } + } else { + if (argc < 2) { + (void) fprintf(stderr, gettext("missing device\n")); + usage(B_FALSE); + } + + for (i = 1; i < argc; i++) { + if (noop) { + uint64_t size; + + if (zpool_vdev_indirect_size(zhp, argv[i], + &size) != 0) { + ret = 1; + break; + } + if (parsable) { + (void) printf("%s %llu\n", + argv[i], (unsigned long long)size); + } else { + char valstr[32]; + zfs_nicenum(size, valstr, + sizeof (valstr)); + (void) printf("Memory that will be " + "used after removing %s: %s\n", + argv[i], valstr); + } + } else { + if (zpool_vdev_remove(zhp, argv[i]) != 0) + ret = 1; + } + } + + if (ret == 0 && wait) + ret = zpool_wait(zhp, ZPOOL_WAIT_REMOVE); + } + zpool_close(zhp); + + return (ret); +} + +/* + * zpool labelclear [-f] + * + * -f Force clearing the label for the vdevs which are members of + * the exported or foreign pools. + * + * Verifies that the vdev is not active and zeros out the label information + * on the device. 
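(Looking back at the dry-run message in zpool_do_remove() above: the human-readable size comes from zfs_nicenum(), the shared suffixing helper. An editorial example, assuming the declaration lives in this tree's libzutil.h; the byte count is arbitrary:

#include <stdio.h>
#include <libzutil.h>	/* assumed home of zfs_nicenum() in this tree */

int
main(void)
{
	char valstr[32];

	zfs_nicenum(8589934592ULL, valstr, sizeof (valstr));	/* 8 GiB */
	(void) printf("Memory that will be used after removing sda: %s\n",
	    valstr);	/* prints "... sda: 8G" */
	return (0);
}

End of example.)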
+ */ +int +zpool_do_labelclear(int argc, char **argv) +{ + char vdev[MAXPATHLEN]; + char *name = NULL; + struct stat st; + int c, fd = -1, ret = 0; + nvlist_t *config; + pool_state_t state; + boolean_t inuse = B_FALSE; + boolean_t force = B_FALSE; + + /* check options */ + while ((c = getopt(argc, argv, "f")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + default: + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* get vdev name */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing vdev name\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + /* + * Check if we were given absolute path and use it as is. + * Otherwise if the provided vdev name doesn't point to a file, + * try prepending expected disk paths and partition numbers. + */ + (void) strlcpy(vdev, argv[0], sizeof (vdev)); + if (vdev[0] != '/' && stat(vdev, &st) != 0) { + int error; + + error = zfs_resolve_shortname(argv[0], vdev, MAXPATHLEN); + if (error == 0 && zfs_dev_is_whole_disk(vdev)) { + if (zfs_append_partition(vdev, MAXPATHLEN) == -1) + error = ENOENT; + } + + if (error || (stat(vdev, &st) != 0)) { + (void) fprintf(stderr, gettext( + "failed to find device %s, try specifying absolute " + "path instead\n"), argv[0]); + return (1); + } + } + + if ((fd = open(vdev, O_RDWR)) < 0) { + (void) fprintf(stderr, gettext("failed to open %s: %s\n"), + vdev, strerror(errno)); + return (1); + } + + /* + * Flush all dirty pages for the block device. This should not be + * fatal when the device does not support BLKFLSBUF as would be the + * case for a file vdev. + */ + if ((zfs_dev_flush(fd) != 0) && (errno != ENOTTY)) + (void) fprintf(stderr, gettext("failed to invalidate " + "cache for %s: %s\n"), vdev, strerror(errno)); + + if (zpool_read_label(fd, &config, NULL) != 0) { + (void) fprintf(stderr, + gettext("failed to read label from %s\n"), vdev); + ret = 1; + goto errout; + } + nvlist_free(config); + + ret = zpool_in_use(g_zfs, fd, &state, &name, &inuse); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to check state for %s\n"), vdev); + ret = 1; + goto errout; + } + + if (!inuse) + goto wipe_label; + + switch (state) { + default: + case POOL_STATE_ACTIVE: + case POOL_STATE_SPARE: + case POOL_STATE_L2CACHE: + (void) fprintf(stderr, gettext( + "%s is a member (%s) of pool \"%s\"\n"), + vdev, zpool_pool_state_to_name(state), name); + ret = 1; + goto errout; + + case POOL_STATE_EXPORTED: + if (force) + break; + (void) fprintf(stderr, gettext( + "use '-f' to override the following error:\n" + "%s is a member of exported pool \"%s\"\n"), + vdev, name); + ret = 1; + goto errout; + + case POOL_STATE_POTENTIALLY_ACTIVE: + if (force) + break; + (void) fprintf(stderr, gettext( + "use '-f' to override the following error:\n" + "%s is a member of potentially active pool \"%s\"\n"), + vdev, name); + ret = 1; + goto errout; + + case POOL_STATE_DESTROYED: + /* inuse should never be set for a destroyed pool */ + assert(0); + break; + } + +wipe_label: + ret = zpool_clear_label(fd); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to clear label for %s\n"), vdev); + } + +errout: + free(name); + (void) close(fd); + + return (ret); +} + +/* + * zpool create [-fnd] [-o property=value] ... + * [-O file-system-property=value] ... + * [-R root] [-m mountpoint] ... 
+ * + * -f Force creation, even if devices appear in use + * -n Do not create the pool, but display the resulting layout if it + * were to be created. + * -R Create a pool under an alternate root + * -m Set default mountpoint for the root dataset. By default it's + * '/' + * -o Set property=value. + * -o Set feature@feature=enabled|disabled. + * -d Don't automatically enable all supported pool features + * (individual features can be enabled with -o). + * -O Set fsproperty=value in the pool's root file system + * + * Creates the named pool according to the given vdev specification. The + * bulk of the vdev processing is done in make_root_vdev() in zpool_vdev.c. + * Once we get the nvlist back from make_root_vdev(), we either print out the + * contents (if '-n' was specified), or pass it to libzfs to do the creation. + */ +int +zpool_do_create(int argc, char **argv) +{ + boolean_t force = B_FALSE; + boolean_t dryrun = B_FALSE; + boolean_t enable_all_pool_feat = B_TRUE; + int c; + nvlist_t *nvroot = NULL; + char *poolname; + char *tname = NULL; + int ret = 1; + char *altroot = NULL; + char *mountpoint = NULL; + nvlist_t *fsprops = NULL; + nvlist_t *props = NULL; + char *propval; + + /* check options */ + while ((c = getopt(argc, argv, ":fndR:m:o:O:t:")) != -1) { + switch (c) { + case 'f': + force = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'd': + enable_all_pool_feat = B_FALSE; + break; + case 'R': + altroot = optarg; + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE)) + goto errout; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto errout; + break; + case 'm': + /* Equivalent to -O mountpoint=optarg */ + mountpoint = optarg; + break; + case 'o': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -o option\n")); + goto errout; + } + *propval = '\0'; + propval++; + + if (add_prop_list(optarg, propval, &props, B_TRUE)) + goto errout; + + /* + * If the user is creating a pool that doesn't support + * feature flags, don't enable any features. + */ + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_VERSION) { + char *end; + u_longlong_t ver; + + ver = strtoull(propval, &end, 10); + if (*end == '\0' && + ver < SPA_VERSION_FEATURES) { + enable_all_pool_feat = B_FALSE; + } + } + if (zpool_name_to_prop(optarg) == ZPOOL_PROP_ALTROOT) + altroot = propval; + break; + case 'O': + if ((propval = strchr(optarg, '=')) == NULL) { + (void) fprintf(stderr, gettext("missing " + "'=' for -O option\n")); + goto errout; + } + *propval = '\0'; + propval++; + + /* + * Mountpoints are checked and then added later. + * Uniquely among properties, they can be specified + * more than once, to avoid conflict with -m. + */ + if (0 == strcmp(optarg, + zfs_prop_to_name(ZFS_PROP_MOUNTPOINT))) { + mountpoint = propval; + } else if (add_prop_list(optarg, propval, &fsprops, + B_FALSE)) { + goto errout; + } + break; + case 't': + /* + * Sanity check temporary pool name. 
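(The '=' handling in the -o and -O cases above is the classic split-in-place idiom: find the '=', overwrite it with a NUL to terminate the name, then point past it for the value. A self-contained sketch with an invented input string; not part of the import:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	char arg[] = "mountpoint=/tank";	/* hypothetical -O argument */
	char *propval = strchr(arg, '=');

	if (propval == NULL)
		return (1);	/* the "missing '='" error path */

	*propval = '\0';	/* terminate the property name in place... */
	propval++;		/* ...and step past the '=' to the value */

	(void) printf("name=%s value=%s\n", arg, propval);
	return (0);
}

End of sketch.)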
+ */ + if (strchr(optarg, '/') != NULL) { + (void) fprintf(stderr, gettext("cannot create " + "'%s': invalid character '/' in temporary " + "name\n"), optarg); + (void) fprintf(stderr, gettext("use 'zfs " + "create' to create a dataset\n")); + goto errout; + } + + if (add_prop_list(zpool_prop_to_name( + ZPOOL_PROP_TNAME), optarg, &props, B_TRUE)) + goto errout; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto errout; + tname = optarg; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + goto badusage; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + goto badusage; + } + } + + argc -= optind; + argv += optind; + + /* get pool name and check number of arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + goto badusage; + } + if (argc < 2) { + (void) fprintf(stderr, gettext("missing vdev specification\n")); + goto badusage; + } + + poolname = argv[0]; + + /* + * As a special case, check for use of '/' in the name, and direct the + * user to use 'zfs create' instead. + */ + if (strchr(poolname, '/') != NULL) { + (void) fprintf(stderr, gettext("cannot create '%s': invalid " + "character '/' in pool name\n"), poolname); + (void) fprintf(stderr, gettext("use 'zfs create' to " + "create a dataset\n")); + goto errout; + } + + /* pass off to make_root_vdev for bulk processing */ + nvroot = make_root_vdev(NULL, props, force, !force, B_FALSE, dryrun, + argc - 1, argv + 1); + if (nvroot == NULL) + goto errout; + + /* make_root_vdev() allows 0 toplevel children if there are spares */ + if (!zfs_allocatable_devs(nvroot)) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: at least one toplevel vdev must be " + "specified\n")); + goto errout; + } + + if (altroot != NULL && altroot[0] != '/') { + (void) fprintf(stderr, gettext("invalid alternate root '%s': " + "must be an absolute path\n"), altroot); + goto errout; + } + + /* + * Check the validity of the mountpoint and direct the user to use the + * '-m' mountpoint option if it looks like its in use. 
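(The test that follows counts directory entries: "." and ".." always exist, so seeing a third entry means the mountpoint is already in use. Pulled out as a standalone sketch; dir_is_empty is invented for illustration:

#include <dirent.h>
#include <stdio.h>

/* Return 1 if 'path' holds nothing besides "." and "..", else 0. */
static int
dir_is_empty(const char *path)
{
	DIR *dirp = opendir(path);
	int count = 0;

	if (dirp == NULL)
		return (-1);	/* caller decides how to treat the error */

	while (count < 3 && readdir(dirp) != NULL)
		count++;
	(void) closedir(dirp);

	return (count <= 2);
}

int
main(void)
{
	(void) printf("empty: %d\n", dir_is_empty("/tmp"));
	return (0);
}

End of sketch.)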
+ */ + if (mountpoint == NULL || + (strcmp(mountpoint, ZFS_MOUNTPOINT_LEGACY) != 0 && + strcmp(mountpoint, ZFS_MOUNTPOINT_NONE) != 0)) { + char buf[MAXPATHLEN]; + DIR *dirp; + + if (mountpoint && mountpoint[0] != '/') { + (void) fprintf(stderr, gettext("invalid mountpoint " + "'%s': must be an absolute path, 'legacy', or " + "'none'\n"), mountpoint); + goto errout; + } + + if (mountpoint == NULL) { + if (altroot != NULL) + (void) snprintf(buf, sizeof (buf), "%s/%s", + altroot, poolname); + else + (void) snprintf(buf, sizeof (buf), "/%s", + poolname); + } else { + if (altroot != NULL) + (void) snprintf(buf, sizeof (buf), "%s%s", + altroot, mountpoint); + else + (void) snprintf(buf, sizeof (buf), "%s", + mountpoint); + } + + if ((dirp = opendir(buf)) == NULL && errno != ENOENT) { + (void) fprintf(stderr, gettext("mountpoint '%s' : " + "%s\n"), buf, strerror(errno)); + (void) fprintf(stderr, gettext("use '-m' " + "option to provide a different default\n")); + goto errout; + } else if (dirp) { + int count = 0; + + while (count < 3 && readdir(dirp) != NULL) + count++; + (void) closedir(dirp); + + if (count > 2) { + (void) fprintf(stderr, gettext("mountpoint " + "'%s' exists and is not empty\n"), buf); + (void) fprintf(stderr, gettext("use '-m' " + "option to provide a " + "different default\n")); + goto errout; + } + } + } + + /* + * Now that the mountpoint's validity has been checked, ensure that + * the property is set appropriately prior to creating the pool. + */ + if (mountpoint != NULL) { + ret = add_prop_list(zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), + mountpoint, &fsprops, B_FALSE); + if (ret != 0) + goto errout; + } + + ret = 1; + if (dryrun) { + /* + * For a dry run invocation, print out a basic message and run + * through all the vdevs in the list and print out in an + * appropriate hierarchy. + */ + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), poolname); + + print_vdev_tree(NULL, poolname, nvroot, 0, "", 0); + print_vdev_tree(NULL, "dedup", nvroot, 0, + VDEV_ALLOC_BIAS_DEDUP, 0); + print_vdev_tree(NULL, "special", nvroot, 0, + VDEV_ALLOC_BIAS_SPECIAL, 0); + print_vdev_tree(NULL, "logs", nvroot, 0, + VDEV_ALLOC_BIAS_LOG, 0); + + ret = 0; + } else { + /* + * Hand off to libzfs. + */ + spa_feature_t i; + for (i = 0; i < SPA_FEATURES; i++) { + char propname[MAXPATHLEN]; + char *propval; + zfeature_info_t *feat = &spa_feature_table[i]; + + (void) snprintf(propname, sizeof (propname), + "feature@%s", feat->fi_uname); + + /* + * Only features contained in props will be enabled: + * remove from the nvlist every ZFS_FEATURE_DISABLED + * value and add every missing ZFS_FEATURE_ENABLED if + * enable_all_pool_feat is set. + */ + if (!nvlist_lookup_string(props, propname, &propval)) { + if (strcmp(propval, ZFS_FEATURE_DISABLED) == 0) + (void) nvlist_remove_all(props, + propname); + } else if (enable_all_pool_feat) { + ret = add_prop_list(propname, + ZFS_FEATURE_ENABLED, &props, B_TRUE); + if (ret != 0) + goto errout; + } + } + + ret = 1; + if (zpool_create(g_zfs, poolname, + nvroot, props, fsprops) == 0) { + zfs_handle_t *pool = zfs_open(g_zfs, + tname ? 
tname : poolname, ZFS_TYPE_FILESYSTEM);
+			if (pool != NULL) {
+				if (zfs_mount(pool, NULL, 0) == 0) {
+					ret = zfs_shareall(pool);
+					zfs_commit_all_shares();
+				}
+				zfs_close(pool);
+			}
+		} else if (libzfs_errno(g_zfs) == EZFS_INVALIDNAME) {
+			(void) fprintf(stderr, gettext("pool name may have "
+			    "been omitted\n"));
+		}
+	}
+
+errout:
+	nvlist_free(nvroot);
+	nvlist_free(fsprops);
+	nvlist_free(props);
+	return (ret);
+badusage:
+	nvlist_free(fsprops);
+	nvlist_free(props);
+	usage(B_FALSE);
+	return (2);
+}
+
+/*
+ * zpool destroy <pool>
+ *
+ *	-f	Forcefully unmount any datasets
+ *
+ * Destroy the given pool.  Automatically unmounts any datasets in the pool.
+ */
+int
+zpool_do_destroy(int argc, char **argv)
+{
+	boolean_t force = B_FALSE;
+	int c;
+	char *pool;
+	zpool_handle_t *zhp;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "f")) != -1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* check arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	}
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	pool = argv[0];
+
+	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+		/*
+		 * As a special case, check for use of '/' in the name, and
+		 * direct the user to use 'zfs destroy' instead.
+		 */
+		if (strchr(pool, '/') != NULL)
+			(void) fprintf(stderr, gettext("use 'zfs destroy' to "
+			    "destroy a dataset\n"));
+		return (1);
+	}
+
+	if (zpool_disable_datasets(zhp, force) != 0) {
+		(void) fprintf(stderr, gettext("could not destroy '%s': "
+		    "could not unmount datasets\n"), zpool_get_name(zhp));
+		zpool_close(zhp);
+		return (1);
+	}
+
+	/* The history must be logged as part of the destroy */
+	log_history = B_FALSE;
+
+	ret = (zpool_destroy(zhp, history_str) != 0);
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+typedef struct export_cbdata {
+	boolean_t force;
+	boolean_t hardforce;
+} export_cbdata_t;
+
+/*
+ * Export one pool
+ */
+static int
+zpool_export_one(zpool_handle_t *zhp, void *data)
+{
+	export_cbdata_t *cb = data;
+
+	if (zpool_disable_datasets(zhp, cb->force) != 0)
+		return (1);
+
+	/* The history must be logged as part of the export */
+	log_history = B_FALSE;
+
+	if (cb->hardforce) {
+		if (zpool_export_force(zhp, history_str) != 0)
+			return (1);
+	} else if (zpool_export(zhp, cb->force, history_str) != 0) {
+		return (1);
+	}
+
+	return (0);
+}
+
+/*
+ * zpool export [-f] <pool> ...
+ *
+ *	-a	Export all pools
+ *	-f	Forcefully unmount datasets
+ *
+ * Export the given pools.  By default, the command will attempt to cleanly
+ * unmount any active datasets within the pool.  If the '-f' flag is specified,
+ * then the datasets will be forcefully unmounted.
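+ *
+ * Illustrative invocations (the pool name 'tank' is assumed for exposition,
+ * not taken from this file):
+ *
+ *	zpool export tank	cleanly unmount and export one pool
+ *	zpool export -f tank	force-unmount busy datasets, then export
+ *	zpool export -a		export every imported pool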
+ */ +int +zpool_do_export(int argc, char **argv) +{ + export_cbdata_t cb; + boolean_t do_all = B_FALSE; + boolean_t force = B_FALSE; + boolean_t hardforce = B_FALSE; + int c, ret; + + /* check options */ + while ((c = getopt(argc, argv, "afF")) != -1) { + switch (c) { + case 'a': + do_all = B_TRUE; + break; + case 'f': + force = B_TRUE; + break; + case 'F': + hardforce = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.force = force; + cb.hardforce = hardforce; + argc -= optind; + argv += optind; + + if (do_all) { + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + return (for_each_pool(argc, argv, B_TRUE, NULL, + zpool_export_one, &cb)); + } + + /* check arguments */ + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool argument\n")); + usage(B_FALSE); + } + + ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_export_one, &cb); + + return (ret); +} + +/* + * Given a vdev configuration, determine the maximum width needed for the device + * name column. + */ +static int +max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max, + int name_flags) +{ + char *name; + nvlist_t **child; + uint_t c, children; + int ret; + + name = zpool_vdev_name(g_zfs, zhp, nv, name_flags); + if (strlen(name) + depth > max) + max = strlen(name) + depth; + + free(name); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max, name_flags)) > max) + max = ret; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max, name_flags)) > max) + max = ret; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if ((ret = max_width(zhp, child[c], depth + 2, + max, name_flags)) > max) + max = ret; + } + + return (max); +} + +typedef struct spare_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} spare_cbdata_t; + +static boolean_t +find_vdev(nvlist_t *nv, uint64_t search) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + search == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev(child[c], search)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +find_spare(zpool_handle_t *zhp, void *data) +{ + spare_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (find_vdev(nvroot, cbp->cb_guid)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +typedef struct status_cbdata { + int cb_count; + int cb_name_flags; + int cb_namewidth; + boolean_t cb_allpools; + boolean_t cb_verbose; + boolean_t cb_literal; + boolean_t cb_explain; + boolean_t cb_first; + boolean_t cb_dedup_stats; + boolean_t cb_print_status; + boolean_t cb_print_slow_ios; + boolean_t cb_print_vdev_init; + boolean_t cb_print_vdev_trim; + vdev_cmd_data_list_t *vcdl; +} status_cbdata_t; + +/* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. 
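+ * For example (illustrative values): is_blank_str(NULL) and is_blank_str(" ")
+ * both return 1, while is_blank_str("val1") returns 0.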
*/ +static int +is_blank_str(char *str) +{ + while (str != NULL && *str != '\0') { + if (!isblank(*str)) + return (0); + str++; + } + return (1); +} + +/* Print command output lines for specific vdev in a specific pool */ +static void +zpool_print_cmd(vdev_cmd_data_list_t *vcdl, const char *pool, char *path) +{ + vdev_cmd_data_t *data; + int i, j; + char *val; + + for (i = 0; i < vcdl->count; i++) { + if ((strcmp(vcdl->data[i].path, path) != 0) || + (strcmp(vcdl->data[i].pool, pool) != 0)) { + /* Not the vdev we're looking for */ + continue; + } + + data = &vcdl->data[i]; + /* Print out all the output values for this vdev */ + for (j = 0; j < vcdl->uniq_cols_cnt; j++) { + val = NULL; + /* Does this vdev have values for this column? */ + for (int k = 0; k < data->cols_cnt; k++) { + if (strcmp(data->cols[k], + vcdl->uniq_cols[j]) == 0) { + /* yes it does, record the value */ + val = data->lines[k]; + break; + } + } + /* + * Mark empty values with dashes to make output + * awk-able. + */ + if (is_blank_str(val)) + val = "-"; + + printf("%*s", vcdl->uniq_cols_width[j], val); + if (j < vcdl->uniq_cols_cnt - 1) + printf(" "); + } + + /* Print out any values that aren't in a column at the end */ + for (j = data->cols_cnt; j < data->lines_cnt; j++) { + /* Did we have any columns? If so print a spacer. */ + if (vcdl->uniq_cols_cnt > 0) + printf(" "); + + val = data->lines[j]; + printf("%s", val ? val : ""); + } + break; + } +} + +/* + * Print vdev initialization status for leaves + */ +static void +print_status_initialize(vdev_stat_t *vs, boolean_t verbose) +{ + if (verbose) { + if ((vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE || + vs->vs_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vs->vs_initialize_state == VDEV_INITIALIZE_COMPLETE) && + !vs->vs_scan_removing) { + char zbuf[1024]; + char tbuf[256]; + struct tm zaction_ts; + + time_t t = vs->vs_initialize_action_time; + int initialize_pct = 100; + if (vs->vs_initialize_state != + VDEV_INITIALIZE_COMPLETE) { + initialize_pct = (vs->vs_initialize_bytes_done * + 100 / (vs->vs_initialize_bytes_est + 1)); + } + + (void) localtime_r(&t, &zaction_ts); + (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + + switch (vs->vs_initialize_state) { + case VDEV_INITIALIZE_SUSPENDED: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("suspended, started at"), tbuf); + break; + case VDEV_INITIALIZE_ACTIVE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("started at"), tbuf); + break; + case VDEV_INITIALIZE_COMPLETE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("completed at"), tbuf); + break; + } + + (void) printf(gettext(" (%d%% initialized%s)"), + initialize_pct, zbuf); + } else { + (void) printf(gettext(" (uninitialized)")); + } + } else if (vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) printf(gettext(" (initializing)")); + } +} + +/* + * Print vdev TRIM status for leaves + */ +static void +print_status_trim(vdev_stat_t *vs, boolean_t verbose) +{ + if (verbose) { + if ((vs->vs_trim_state == VDEV_TRIM_ACTIVE || + vs->vs_trim_state == VDEV_TRIM_SUSPENDED || + vs->vs_trim_state == VDEV_TRIM_COMPLETE) && + !vs->vs_scan_removing) { + char zbuf[1024]; + char tbuf[256]; + struct tm zaction_ts; + + time_t t = vs->vs_trim_action_time; + int trim_pct = 100; + if (vs->vs_trim_state != VDEV_TRIM_COMPLETE) { + trim_pct = (vs->vs_trim_bytes_done * + 100 / (vs->vs_trim_bytes_est + 1)); + } + + (void) localtime_r(&t, &zaction_ts); + (void) strftime(tbuf, sizeof (tbuf), "%c", &zaction_ts); + + switch 
(vs->vs_trim_state) { + case VDEV_TRIM_SUSPENDED: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("suspended, started at"), tbuf); + break; + case VDEV_TRIM_ACTIVE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("started at"), tbuf); + break; + case VDEV_TRIM_COMPLETE: + (void) snprintf(zbuf, sizeof (zbuf), ", %s %s", + gettext("completed at"), tbuf); + break; + } + + (void) printf(gettext(" (%d%% trimmed%s)"), + trim_pct, zbuf); + } else if (vs->vs_trim_notsup) { + (void) printf(gettext(" (trim unsupported)")); + } else { + (void) printf(gettext(" (untrimmed)")); + } + } else if (vs->vs_trim_state == VDEV_TRIM_ACTIVE) { + (void) printf(gettext(" (trimming)")); + } +} + +/* + * Return the color associated with a health string. This includes returning + * NULL for no color change. + */ +static char * +health_str_to_color(const char *health) +{ + if (strcmp(health, gettext("FAULTED")) == 0 || + strcmp(health, gettext("SUSPENDED")) == 0 || + strcmp(health, gettext("UNAVAIL")) == 0) { + return (ANSI_RED); + } + + if (strcmp(health, gettext("OFFLINE")) == 0 || + strcmp(health, gettext("DEGRADED")) == 0 || + strcmp(health, gettext("REMOVED")) == 0) { + return (ANSI_YELLOW); + } + + return (NULL); +} + +/* + * Print out configuration state as requested by status_callback. + */ +static void +print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, + nvlist_t *nv, int depth, boolean_t isspare, vdev_rebuild_stat_t *vrs) +{ + nvlist_t **child, *root; + uint_t c, i, vsc, children; + pool_scan_stat_t *ps = NULL; + vdev_stat_t *vs; + char rbuf[6], wbuf[6], cbuf[6]; + char *vname; + uint64_t notpresent; + spare_cbdata_t spare_cb; + const char *state; + char *type; + char *path = NULL; + char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) + return; + + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + + if (isspare) { + /* + * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for + * online drives. 
+	 */
+	if (vs->vs_aux == VDEV_AUX_SPARED)
+		state = gettext("INUSE");
+	else if (vs->vs_state == VDEV_STATE_HEALTHY)
+		state = gettext("AVAIL");
+
+	printf_color(health_str_to_color(state),
+	    "\t%*s%-*s  %-8s", depth, "", cb->cb_namewidth - depth,
+	    name, state);
+
+	if (!isspare) {
+		if (vs->vs_read_errors)
+			rcolor = ANSI_RED;
+
+		if (vs->vs_write_errors)
+			wcolor = ANSI_RED;
+
+		if (vs->vs_checksum_errors)
+			ccolor = ANSI_RED;
+
+		if (cb->cb_literal) {
+			printf(" ");
+			printf_color(rcolor, "%5llu",
+			    (u_longlong_t)vs->vs_read_errors);
+			printf(" ");
+			printf_color(wcolor, "%5llu",
+			    (u_longlong_t)vs->vs_write_errors);
+			printf(" ");
+			printf_color(ccolor, "%5llu",
+			    (u_longlong_t)vs->vs_checksum_errors);
+		} else {
+			zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf));
+			zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf));
+			zfs_nicenum(vs->vs_checksum_errors, cbuf,
+			    sizeof (cbuf));
+			printf(" ");
+			printf_color(rcolor, "%5s", rbuf);
+			printf(" ");
+			printf_color(wcolor, "%5s", wbuf);
+			printf(" ");
+			printf_color(ccolor, "%5s", cbuf);
+		}
+		if (cb->cb_print_slow_ios) {
+			if (children == 0) {
+				/* Only leaf vdevs have slow IOs */
+				zfs_nicenum(vs->vs_slow_ios, rbuf,
+				    sizeof (rbuf));
+			} else {
+				snprintf(rbuf, sizeof (rbuf), "-");
+			}
+
+			if (cb->cb_literal)
+				printf(" %5llu", (u_longlong_t)vs->vs_slow_ios);
+			else
+				printf(" %5s", rbuf);
+		}
+	}
+
+	if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT,
+	    &notpresent) == 0) {
+		verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0);
+		(void) printf("  %s %s", gettext("was"), path);
+	} else if (vs->vs_aux != 0) {
+		(void) printf("  ");
+		color_start(ANSI_RED);
+		switch (vs->vs_aux) {
+		case VDEV_AUX_OPEN_FAILED:
+			(void) printf(gettext("cannot open"));
+			break;
+
+		case VDEV_AUX_BAD_GUID_SUM:
+			(void) printf(gettext("missing device"));
+			break;
+
+		case VDEV_AUX_NO_REPLICAS:
+			(void) printf(gettext("insufficient replicas"));
+			break;
+
+		case VDEV_AUX_VERSION_NEWER:
+			(void) printf(gettext("newer version"));
+			break;
+
+		case VDEV_AUX_UNSUP_FEAT:
+			(void) printf(gettext("unsupported feature(s)"));
+			break;
+
+		case VDEV_AUX_ASHIFT_TOO_BIG:
+			(void) printf(gettext("unsupported minimum blocksize"));
+			break;
+
+		case VDEV_AUX_SPARED:
+			verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID,
+			    &spare_cb.cb_guid) == 0);
+			if (zpool_iter(g_zfs, find_spare, &spare_cb) == 1) {
+				if (strcmp(zpool_get_name(spare_cb.cb_zhp),
+				    zpool_get_name(zhp)) == 0)
+					(void) printf(gettext("currently in "
+					    "use"));
+				else
+					(void) printf(gettext("in use by "
+					    "pool '%s'"),
+					    zpool_get_name(spare_cb.cb_zhp));
+				zpool_close(spare_cb.cb_zhp);
+			} else {
+				(void) printf(gettext("currently in use"));
+			}
+			break;
+
+		case VDEV_AUX_ERR_EXCEEDED:
+			(void) printf(gettext("too many errors"));
+			break;
+
+		case VDEV_AUX_IO_FAILURE:
+			(void) printf(gettext("experienced I/O failures"));
+			break;
+
+		case VDEV_AUX_BAD_LOG:
+			(void) printf(gettext("bad intent log"));
+			break;
+
+		case VDEV_AUX_EXTERNAL:
+			(void) printf(gettext("external device fault"));
+			break;
+
+		case VDEV_AUX_SPLIT_POOL:
+			(void) printf(gettext("split into new pool"));
+			break;
+
+		case VDEV_AUX_ACTIVE:
+			(void) printf(gettext("currently in use"));
+			break;
+
+		case VDEV_AUX_CHILDREN_OFFLINE:
+			(void) printf(gettext("all children offline"));
+			break;
+
+		default:
+			(void) printf(gettext("corrupted data"));
+			break;
+		}
+		color_end();
+	}
+
+	/* The root vdev has the scrub/resilver stats */
+	root = fnvlist_lookup_nvlist(zpool_get_config(zhp, NULL),
+	    ZPOOL_CONFIG_VDEV_TREE);
+	(void)
nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c); + + if (ps != NULL && ps->pss_state == DSS_SCANNING && children == 0) { + if (vs->vs_scan_processed != 0) { + (void) printf(gettext(" (%s)"), + (ps->pss_func == POOL_SCAN_RESILVER) ? + "resilvering" : "repairing"); + } else if (vs->vs_resilver_deferred) { + (void) printf(gettext(" (awaiting resilver)")); + } + } + + /* The top-level vdevs have the rebuild stats */ + if (vrs != NULL && vrs->vrs_state == VDEV_REBUILD_ACTIVE && + children == 0) { + if (vs->vs_rebuild_processed != 0) { + (void) printf(gettext(" (resilvering)")); + } + } + + if (cb->vcdl != NULL) { + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + printf(" "); + zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); + } + } + + /* Display vdev initialization and trim status for leaves */ + if (children == 0) { + print_status_initialize(vs, cb->cb_print_vdev_init); + print_status_trim(vs, cb->cb_print_vdev_trim); + } + + (void) printf("\n"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE, ishole = B_FALSE; + + /* Don't print logs or holes here */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + if (islog || ishole) + continue; + /* Only print normal classes here */ + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + /* Provide vdev_rebuild_stats to children if available */ + if (vrs == NULL) { + (void) nvlist_lookup_uint64_array(nv, + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i); + } + + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + print_status_config(zhp, cb, vname, child[c], depth + 2, + isspare, vrs); + free(vname); + } +} + +/* + * Print the configuration of an exported pool. Iterate over all vdevs in the + * pool, printing out the name and status for each one. 
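+ *
+ * To make the layout concrete, the output is an indented tree along these
+ * lines (pool and device names assumed for illustration):
+ *
+ *	tank          ONLINE
+ *	  mirror-0    ONLINE
+ *	    sda       ONLINE
+ *	    sdb       ONLINE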
+ */ +static void +print_import_config(status_cbdata_t *cb, const char *name, nvlist_t *nv, + int depth) +{ + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + char *type, *vname; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_MISSING) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0) + return; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + (void) printf("\t%*s%-*s", depth, "", cb->cb_namewidth - depth, name); + (void) printf(" %s", zpool_state_to_name(vs->vs_state, vs->vs_aux)); + + if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_UNSUP_FEAT: + (void) printf(gettext("unsupported feature(s)")); + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_ACTIVE: + (void) printf(gettext("currently in use")); + break; + + case VDEV_AUX_CHILDREN_OFFLINE: + (void) printf(gettext("all children offline")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } + (void) printf("\n"); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + continue; + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + print_import_config(cb, vname, child[c], depth + 2); + free(vname); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) { + (void) printf(gettext("\tcache\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) { + (void) printf(gettext("\tspares\n")); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, NULL, child[c], + cb->cb_name_flags); + (void) printf("\t %s\n", vname); + free(vname); + } + } +} + +/* + * Print specialized class vdevs. + * + * These are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1 or an "alloc_bias" string. We use either + * print_status_config() or print_import_config() to print the top level + * class vdevs then any of their children (eg mirrored slogs) are printed + * recursively - which works because only the top level vdev is marked. 
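+ *
+ * For example (illustrative, assuming a pool whose log class is a two-way
+ * mirror), the extra section printed after the normal vdevs would be:
+ *
+ *	logs
+ *	  mirror-1    ONLINE
+ *	    sdc       ONLINE
+ *	    sdd       ONLINE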
+ */ +static void +print_class_vdevs(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t *nv, + const char *class) +{ + uint_t c, children; + nvlist_t **child; + boolean_t printed = B_FALSE; + + assert(zhp != NULL || !cb->cb_verbose); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *bias = NULL; + char *type = NULL; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + + if (is_log) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + + if (bias == NULL || strcmp(bias, class) != 0) + continue; + if (!is_log && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + (void) printf("\t%s\t\n", gettext(class)); + printed = B_TRUE; + } + + char *name = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags | VDEV_NAME_TYPE_ID); + if (cb->cb_print_status) + print_status_config(zhp, cb, name, child[c], 2, + B_FALSE, NULL); + else + print_import_config(cb, name, child[c], 2); + free(name); + } +} + +/* + * Display the status for the given pool. + */ +static void +show_import(nvlist_t *config) +{ + uint64_t pool_state; + vdev_stat_t *vs; + char *name; + uint64_t guid; + uint64_t hostid = 0; + char *msgid; + char *hostname = "unknown"; + nvlist_t *nvroot, *nvinfo; + zpool_status_t reason; + zpool_errata_t errata; + const char *health; + uint_t vsc; + char *comment; + status_cbdata_t cb = { 0 }; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &pool_state) == 0); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + health = zpool_state_to_name(vs->vs_state, vs->vs_aux); + + reason = zpool_import_status(config, &msgid, &errata); + + (void) printf(gettext(" pool: %s\n"), name); + (void) printf(gettext(" id: %llu\n"), (u_longlong_t)guid); + (void) printf(gettext(" state: %s"), health); + if (pool_state == POOL_STATE_DESTROYED) + (void) printf(gettext(" (DESTROYED)")); + (void) printf("\n"); + + switch (reason) { + case ZPOOL_STATUS_MISSING_DEV_R: + case ZPOOL_STATUS_MISSING_DEV_NR: + case ZPOOL_STATUS_BAD_GUID_SUM: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "missing from the system.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_R: + case ZPOOL_STATUS_CORRUPT_LABEL_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices contains" + " corrupted data.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_DATA: + (void) printf( + gettext(" status: The pool data is corrupted.\n")); + break; + + case ZPOOL_STATUS_OFFLINE_DEV: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices " + "are offlined.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_POOL: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool metadata is " + "corrupted.\n")); + break; + + case ZPOOL_STATUS_VERSION_OLDER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + 
"a legacy on-disk version.\n")); + break; + + case ZPOOL_STATUS_VERSION_NEWER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is formatted using " + "an incompatible version.\n")); + break; + + case ZPOOL_STATUS_FEAT_DISABLED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Some supported features are " + "not enabled on the pool.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool uses the following " + "feature(s) not supported on this system:\n")); + color_start(ANSI_YELLOW); + zpool_print_unsup_feat(config); + color_end(); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool can only be " + "accessed in read-only mode on this system. It\n\tcannot be" + " accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + color_start(ANSI_YELLOW); + zpool_print_unsup_feat(config); + color_end(); + break; + + case ZPOOL_STATUS_HOSTID_ACTIVE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is currently " + "imported by another system.\n")); + break; + + case ZPOOL_STATUS_HOSTID_REQUIRED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool has the " + "multihost property on. It cannot\n\tbe safely imported " + "when the system hostid is not set.\n")); + break; + + case ZPOOL_STATUS_HOSTID_MISMATCH: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool was last accessed " + "by another system.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_R: + case ZPOOL_STATUS_FAULTED_DEV_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted.\n")); + break; + + case ZPOOL_STATUS_BAD_LOG: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("An intent log record cannot " + "be read.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: + case ZPOOL_STATUS_REBUILDING: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices were " + "being resilvered.\n")); + break; + + case ZPOOL_STATUS_ERRATA: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), + errata); + break; + + default: + /* + * No other status can be seen when importing pools. + */ + assert(reason == ZPOOL_STATUS_OK); + } + + /* + * Print out an action according to the overall state of the pool. 
+ */ + if (vs->vs_state == VDEV_STATE_HEALTHY) { + if (reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_FEAT_DISABLED) { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric identifier, " + "though\n\tsome features will not be available " + "without an explicit 'zpool upgrade'.\n")); + } else if (reason == ZPOOL_STATUS_HOSTID_MISMATCH) { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric " + "identifier and\n\tthe '-f' flag.\n")); + } else if (reason == ZPOOL_STATUS_ERRATA) { + switch (errata) { + case ZPOOL_ERRATA_NONE: + break; + + case ZPOOL_ERRATA_ZOL_2094_SCRUB: + (void) printf(gettext(" action: The pool can " + "be imported using its name or numeric " + "identifier,\n\thowever there is a compat" + "ibility issue which should be corrected" + "\n\tby running 'zpool scrub'\n")); + break; + + case ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY: + (void) printf(gettext(" action: The pool can" + "not be imported with this version of ZFS " + "due to\n\tan active asynchronous destroy. " + "Revert to an earlier version\n\tand " + "allow the destroy to complete before " + "updating.\n")); + break; + + case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: + (void) printf(gettext(" action: Existing " + "encrypted datasets contain an on-disk " + "incompatibility, which\n\tneeds to be " + "corrected. Backup these datasets to new " + "encrypted datasets\n\tand destroy the " + "old ones.\n")); + break; + + case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: + (void) printf(gettext(" action: Existing " + "encrypted snapshots and bookmarks contain " + "an on-disk\n\tincompatibility. This may " + "cause on-disk corruption if they are used" + "\n\twith 'zfs recv'. To correct the " + "issue, enable the bookmark_v2 feature.\n\t" + "No additional action is needed if there " + "are no encrypted snapshots or\n\t" + "bookmarks. If preserving the encrypted " + "snapshots and bookmarks is\n\trequired, " + "use a non-raw send to backup and restore " + "them. Alternately,\n\tthey may be removed" + " to resolve the incompatibility.\n")); + break; + default: + /* + * All errata must contain an action message. + */ + assert(0); + } + } else { + (void) printf(gettext(" action: The pool can be " + "imported using its name or numeric " + "identifier.\n")); + } + } else if (vs->vs_state == VDEV_STATE_DEGRADED) { + (void) printf(gettext(" action: The pool can be imported " + "despite missing or damaged devices. The\n\tfault " + "tolerance of the pool may be compromised if imported.\n")); + } else { + switch (reason) { + case ZPOOL_STATUS_VERSION_NEWER: + (void) printf(gettext(" action: The pool cannot be " + "imported. Access the pool on a system running " + "newer\n\tsoftware, or recreate the pool from " + "backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be " + "imported. Access the pool on a system that " + "supports\n\tthe required feature(s), or recreate " + "the pool from backup.\n")); + break; + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be " + "imported in read-write mode. 
Import the pool " + "with\n" + "\t\"-o readonly=on\", access the pool on a system " + "that supports the\n\trequired feature(s), or " + "recreate the pool from backup.\n")); + break; + case ZPOOL_STATUS_MISSING_DEV_R: + case ZPOOL_STATUS_MISSING_DEV_NR: + case ZPOOL_STATUS_BAD_GUID_SUM: + (void) printf(gettext(" action: The pool cannot be " + "imported. Attach the missing\n\tdevices and try " + "again.\n")); + break; + case ZPOOL_STATUS_HOSTID_ACTIVE: + VERIFY0(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo)); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_MMP_HOSTNAME); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_HOSTID); + + (void) printf(gettext(" action: The pool must be " + "exported from %s (hostid=%lx)\n\tbefore it " + "can be safely imported.\n"), hostname, + (unsigned long) hostid); + break; + case ZPOOL_STATUS_HOSTID_REQUIRED: + (void) printf(gettext(" action: Set a unique system " + "hostid with the zgenhostid(8) command.\n")); + break; + default: + (void) printf(gettext(" action: The pool cannot be " + "imported due to damaged devices or data.\n")); + } + } + + /* Print the comment attached to the pool. */ + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + (void) printf(gettext("comment: %s\n"), comment); + + /* + * If the state is "closed" or "can't open", and the aux state + * is "corrupt data": + */ + if (((vs->vs_state == VDEV_STATE_CLOSED) || + (vs->vs_state == VDEV_STATE_CANT_OPEN)) && + (vs->vs_aux == VDEV_AUX_CORRUPT_DATA)) { + if (pool_state == POOL_STATE_DESTROYED) + (void) printf(gettext("\tThe pool was destroyed, " + "but can be imported using the '-Df' flags.\n")); + else if (pool_state != POOL_STATE_EXPORTED) + (void) printf(gettext("\tThe pool may be active on " + "another system, but can be imported using\n\t" + "the '-f' flag.\n")); + } + + if (msgid != NULL) { + (void) printf(gettext( + " see: https://zfsonlinux.org/msg/%s\n"), msgid); + } + + (void) printf(gettext(" config:\n\n")); + + cb.cb_namewidth = max_width(NULL, nvroot, 0, strlen(name), + VDEV_NAME_TYPE_ID); + if (cb.cb_namewidth < 10) + cb.cb_namewidth = 10; + + print_import_config(&cb, name, nvroot, 0); + + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(NULL, &cb, nvroot, VDEV_ALLOC_CLASS_LOGS); + + if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { + (void) printf(gettext("\n\tAdditional devices are known to " + "be part of this pool, though their\n\texact " + "configuration cannot be determined.\n")); + } +} + +static boolean_t +zfs_force_import_required(nvlist_t *config) +{ + uint64_t state; + uint64_t hostid = 0; + nvlist_t *nvinfo; + + state = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + + if (state != POOL_STATE_EXPORTED && hostid != get_system_hostid()) + return (B_TRUE); + + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE)) { + mmp_state_t mmp_state = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_STATE); + + if (mmp_state != MMP_STATE_INACTIVE) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Perform the import for the given configuration. This passes the heavy + * lifting off to zpool_import_props(), and then mounts the datasets contained + * within the pool. 
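+ *
+ * Sketch of the normal flow (assuming no version, hostid, or MMP conflicts):
+ * look up name and version -> zpool_import_props() -> zpool_open_canfail()
+ * -> optionally load encryption keys -> zpool_enable_datasets() to mount
+ * and share the datasets.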
+ */
+static int
+do_import(nvlist_t *config, const char *newname, const char *mntopts,
+    nvlist_t *props, int flags)
+{
+	int ret = 0;
+	zpool_handle_t *zhp;
+	char *name;
+	uint64_t version;
+
+	name = fnvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME);
+	version = fnvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION);
+
+	if (!SPA_VERSION_IS_SUPPORTED(version)) {
+		(void) fprintf(stderr, gettext("cannot import '%s': pool "
+		    "is formatted using an unsupported ZFS version\n"), name);
+		return (1);
+	} else if (zfs_force_import_required(config) &&
+	    !(flags & ZFS_IMPORT_ANY_HOST)) {
+		mmp_state_t mmp_state = MMP_STATE_INACTIVE;
+		nvlist_t *nvinfo;
+
+		nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO);
+		if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_STATE))
+			mmp_state = fnvlist_lookup_uint64(nvinfo,
+			    ZPOOL_CONFIG_MMP_STATE);
+
+		if (mmp_state == MMP_STATE_ACTIVE) {
+			char *hostname = "<unknown>";
+			uint64_t hostid = 0;
+
+			if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTNAME))
+				hostname = fnvlist_lookup_string(nvinfo,
+				    ZPOOL_CONFIG_MMP_HOSTNAME);
+
+			if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_HOSTID))
+				hostid = fnvlist_lookup_uint64(nvinfo,
+				    ZPOOL_CONFIG_MMP_HOSTID);
+
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "pool is imported on %s (hostid: "
+			    "0x%lx)\nExport the pool on the other system, "
+			    "then run 'zpool import'.\n"),
+			    name, hostname, (unsigned long) hostid);
+		} else if (mmp_state == MMP_STATE_NO_HOSTID) {
+			(void) fprintf(stderr, gettext("Cannot import '%s': "
+			    "pool has the multihost property on and the\n"
+			    "system's hostid is not set. Set a unique hostid "
+			    "with the zgenhostid(8) command.\n"), name);
+		} else {
+			char *hostname = "<unknown>";
+			uint64_t timestamp = 0;
+			uint64_t hostid = 0;
+
+			if (nvlist_exists(config, ZPOOL_CONFIG_HOSTNAME))
+				hostname = fnvlist_lookup_string(config,
+				    ZPOOL_CONFIG_HOSTNAME);
+
+			if (nvlist_exists(config, ZPOOL_CONFIG_TIMESTAMP))
+				timestamp = fnvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_TIMESTAMP);
+
+			if (nvlist_exists(config, ZPOOL_CONFIG_HOSTID))
+				hostid = fnvlist_lookup_uint64(config,
+				    ZPOOL_CONFIG_HOSTID);
+
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "pool was previously in use from another system.\n"
+			    "Last accessed by %s (hostid=%lx) at %s"
+			    "The pool can be imported, use 'zpool import -f' "
+			    "to import the pool.\n"), name, hostname,
+			    (unsigned long)hostid, ctime((time_t *)&timestamp));
+		}
+
+		return (1);
+	}
+
+	if (zpool_import_props(g_zfs, config, newname, props, flags) != 0)
+		return (1);
+
+	if (newname != NULL)
+		name = (char *)newname;
+
+	if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL)
+		return (1);
+
+	/*
+	 * Loading keys is best effort. We don't want to return immediately
+	 * if it fails but we do want to give the error to the caller.
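+	 * (Illustrative consequence: when 'zpool import -l' fails to load a
+	 * key, the import itself still completes and the failure is only
+	 * reflected in the command's non-zero exit status.)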
+	 */
+	if (flags & ZFS_IMPORT_LOAD_KEYS) {
+		ret = zfs_crypto_attempt_load_keys(g_zfs, name);
+		if (ret != 0)
+			ret = 1;
+	}
+
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+	    !(flags & ZFS_IMPORT_ONLY) &&
+	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+		zpool_close(zhp);
+		return (1);
+	}
+
+	zpool_close(zhp);
+	return (ret);
+}
+
+typedef struct target_exists_args {
+	const char	*poolname;
+	uint64_t	poolguid;
+} target_exists_args_t;
+
+static int
+name_or_guid_exists(zpool_handle_t *zhp, void *data)
+{
+	target_exists_args_t *args = data;
+	nvlist_t *config = zpool_get_config(zhp, NULL);
+	int found = 0;
+
+	if (config == NULL)
+		return (0);
+
+	if (args->poolname != NULL) {
+		char *pool_name;
+
+		verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME,
+		    &pool_name) == 0);
+		if (strcmp(pool_name, args->poolname) == 0)
+			found = 1;
+	} else {
+		uint64_t pool_guid;
+
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+		    &pool_guid) == 0);
+		if (pool_guid == args->poolguid)
+			found = 1;
+	}
+	zpool_close(zhp);
+
+	return (found);
+}
+
+/*
+ * zpool checkpoint <pool>
+ *       checkpoint --discard <pool>
+ *
+ *	-d	Discard the checkpoint from a checkpointed
+ *	--discard	pool.
+ *
+ *	-w	Wait for discarding a checkpoint to complete.
+ *	--wait
+ *
+ * Checkpoints the specified pool, by taking a "snapshot" of its
+ * current state. A pool can only have one checkpoint at a time.
+ */
+int
+zpool_do_checkpoint(int argc, char **argv)
+{
+	boolean_t discard, wait;
+	char *pool;
+	zpool_handle_t *zhp;
+	int c, err;
+
+	struct option long_options[] = {
+		{"discard", no_argument, NULL, 'd'},
+		{"wait", no_argument, NULL, 'w'},
+		{0, 0, 0, 0}
+	};
+
+	discard = B_FALSE;
+	wait = B_FALSE;
+	while ((c = getopt_long(argc, argv, ":dw", long_options, NULL)) != -1) {
+		switch (c) {
+		case 'd':
+			discard = B_TRUE;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (wait && !discard) {
+		(void) fprintf(stderr, gettext("--wait only valid when "
+		    "--discard also specified\n"));
+		usage(B_FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	pool = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, pool)) == NULL) {
+		/* As a special case, check for use of '/' in the name */
+		if (strchr(pool, '/') != NULL)
+			(void) fprintf(stderr, gettext("'zpool checkpoint' "
+			    "doesn't work on datasets. To save the state "
+			    "of a dataset from a specific point in time "
+			    "please use 'zfs snapshot'\n"));
+		return (1);
+	}
+
+	if (discard) {
+		err = (zpool_discard_checkpoint(zhp) != 0);
+		if (err == 0 && wait)
+			err = zpool_wait(zhp, ZPOOL_WAIT_CKPT_DISCARD);
+	} else {
+		err = (zpool_checkpoint(zhp) != 0);
+	}
+
+	zpool_close(zhp);
+
+	return (err);
+}
+
+#define	CHECKPOINT_OPT	1024
+
+/*
+ * zpool import [-d dir] [-D]
+ *       import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
+ *              [-d dir | -c cachefile] [-f] -a
+ *       import [-o mntopts] [-o prop=value] ... [-R root] [-D] [-l]
+ *              [-d dir | -c cachefile] [-f] [-n] [-F] <pool | id> [newpool]
+ *
+ *	-c	Read pool information from a cachefile instead of searching
+ *		devices.
+ *
+ *	-d	Scan in a specific directory, other than /dev/.  More than
+ *		one directory can be specified using multiple '-d' options.
+ *
+ *	-D	Scan for previously destroyed pools or import all or only
+ *		specified destroyed pools.
+ *
+ *	-R	Temporarily import the pool, with all mountpoints relative to
+ *		the given root.  The pool will remain exported when the machine
+ *		is rebooted.
+ *
+ *	-V	Import even in the presence of faulted vdevs.  This is an
+ *		intentionally undocumented option for testing purposes, and
+ *		treats the pool configuration as complete, leaving any bad
+ *		vdevs in the FAULTED state. In other words, it does verbatim
+ *		import.
+ *
+ *	-f	Force import, even if it appears that the pool is active.
+ *
+ *	-F	Attempt rewind if necessary.
+ *
+ *	-n	See if rewind would work, but don't actually rewind.
+ *
+ *	-N	Import the pool but don't mount datasets.
+ *
+ *	-T	Specify a starting txg to use for import.  This is an
+ *		intentionally undocumented option for testing purposes.
+ *
+ *	-a	Import all pools found.
+ *
+ *	-l	Load encryption keys while importing.
+ *
+ *	-o	Set property=value and/or temporary mount options (without '=').
+ *
+ *	-s	Scan using the default search path, the libblkid cache will
+ *		not be consulted.
+ *
+ *	--rewind-to-checkpoint
+ *		Import the pool and revert back to the checkpoint.
+ *
+ * The import command scans for pools to import, and imports pools based on
+ * pool name and GUID.  The pool can also be renamed as part of the import
+ * process.
+ */
+int
+zpool_do_import(int argc, char **argv)
+{
+	char **searchdirs = NULL;
+	char *env, *envdup = NULL;
+	int nsearch = 0;
+	int c;
+	int err = 0;
+	nvlist_t *pools = NULL;
+	boolean_t do_all = B_FALSE;
+	boolean_t do_destroyed = B_FALSE;
+	char *mntopts = NULL;
+	nvpair_t *elem;
+	nvlist_t *config;
+	uint64_t searchguid = 0;
+	char *searchname = NULL;
+	char *propval;
+	nvlist_t *found_config;
+	nvlist_t *policy = NULL;
+	nvlist_t *props = NULL;
+	boolean_t first;
+	int flags = ZFS_IMPORT_NORMAL;
+	uint32_t rewind_policy = ZPOOL_NO_REWIND;
+	boolean_t dryrun = B_FALSE;
+	boolean_t do_rewind = B_FALSE;
+	boolean_t xtreme_rewind = B_FALSE;
+	boolean_t do_scan = B_FALSE;
+	boolean_t pool_exists = B_FALSE;
+	uint64_t pool_state, txg = -1ULL;
+	char *cachefile = NULL;
+	importargs_t idata = { 0 };
+	char *endptr;
+
+	struct option long_options[] = {
+		{"rewind-to-checkpoint", no_argument, NULL, CHECKPOINT_OPT},
+		{0, 0, 0, 0}
+	};
+
+	/* check options */
+	while ((c = getopt_long(argc, argv, ":aCc:d:DEfFlmnNo:R:stT:VX",
+	    long_options, NULL)) != -1) {
+		switch (c) {
+		case 'a':
+			do_all = B_TRUE;
+			break;
+		case 'c':
+			cachefile = optarg;
+			break;
+		case 'd':
+			if (searchdirs == NULL) {
+				searchdirs = safe_malloc(sizeof (char *));
+			} else {
+				char **tmp = safe_malloc((nsearch + 1) *
+				    sizeof (char *));
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				free(searchdirs);
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = optarg;
+			break;
+		case 'D':
+			do_destroyed = B_TRUE;
+			break;
+		case 'f':
+			flags |= ZFS_IMPORT_ANY_HOST;
+			break;
+		case 'F':
+			do_rewind = B_TRUE;
+			break;
+		case 'l':
+			flags |= ZFS_IMPORT_LOAD_KEYS;
+			break;
+		case 'm':
+			flags |= ZFS_IMPORT_MISSING_LOG;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'N':
+			flags |= ZFS_IMPORT_ONLY;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) != NULL) {
+				*propval = '\0';
+				propval++;
+				if (add_prop_list(optarg, propval,
+				    &props, B_TRUE))
+					goto error;
+			} else {
+				mntopts = optarg;
+			}
+			break;
+		case 'R':
+			if (add_prop_list(zpool_prop_to_name(
+			    ZPOOL_PROP_ALTROOT), optarg, &props, B_TRUE))
+				goto error;
+			if
(add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto error; + break; + case 's': + do_scan = B_TRUE; + break; + case 't': + flags |= ZFS_IMPORT_TEMP_NAME; + if (add_prop_list_default(zpool_prop_to_name( + ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) + goto error; + break; + + case 'T': + errno = 0; + txg = strtoull(optarg, &endptr, 0); + if (errno != 0 || *endptr != '\0') { + (void) fprintf(stderr, + gettext("invalid txg value\n")); + usage(B_FALSE); + } + rewind_policy = ZPOOL_DO_REWIND | ZPOOL_EXTREME_REWIND; + break; + case 'V': + flags |= ZFS_IMPORT_VERBATIM; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case CHECKPOINT_OPT: + flags |= ZFS_IMPORT_CHECKPOINT; + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (cachefile && nsearch != 0) { + (void) fprintf(stderr, gettext("-c is incompatible with -d\n")); + usage(B_FALSE); + } + + if ((flags & ZFS_IMPORT_LOAD_KEYS) && (flags & ZFS_IMPORT_ONLY)) { + (void) fprintf(stderr, gettext("-l is incompatible with -N\n")); + usage(B_FALSE); + } + + if ((flags & ZFS_IMPORT_LOAD_KEYS) && !do_all && argc == 0) { + (void) fprintf(stderr, gettext("-l is only meaningful during " + "an import\n")); + usage(B_FALSE); + } + + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In the future, we can capture further policy and include it here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_LOAD_REQUEST_TXG, txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY, + rewind_policy) != 0) + goto error; + + /* check argument count */ + if (do_all) { + if (argc != 0) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } else { + if (argc > 2) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + } + + /* + * Check for the effective uid. We do this explicitly here because + * otherwise any attempt to discover pools will silently fail. + */ + if (argc == 0 && geteuid() != 0) { + (void) fprintf(stderr, gettext("cannot " + "discover pools: permission denied\n")); + if (searchdirs != NULL) + free(searchdirs); + + nvlist_free(props); + nvlist_free(policy); + return (1); + } + + /* + * Depending on the arguments given, we do one of the following: + * + * Iterate through all pools and display information about + * each one. + * + * -a Iterate through all pools and try to import each one. + * + * Find the pool that corresponds to the given GUID/pool + * name and import that one. + * + * -D Above options applies only to destroyed pools. + */ + if (argc != 0) { + char *endptr; + + errno = 0; + searchguid = strtoull(argv[0], &endptr, 10); + if (errno != 0 || *endptr != '\0') { + searchname = argv[0]; + searchguid = 0; + } + found_config = NULL; + + /* + * User specified a name or guid. Ensure it's unique. + */ + target_exists_args_t search = {searchname, searchguid}; + pool_exists = zpool_iter(g_zfs, name_or_guid_exists, &search); + } + + /* + * Check the environment for the preferred search path. 
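+	 *
+	 * Illustrative example (directory values assumed): with
+	 * ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/disk/by-id", the two
+	 * directories are scanned in that order, as if each had been given
+	 * with '-d'.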
+	 */
+	if ((searchdirs == NULL) && (env = getenv("ZPOOL_IMPORT_PATH"))) {
+		char *dir;
+
+		envdup = strdup(env);
+
+		dir = strtok(envdup, ":");
+		while (dir != NULL) {
+			if (searchdirs == NULL) {
+				searchdirs = safe_malloc(sizeof (char *));
+			} else {
+				char **tmp = safe_malloc((nsearch + 1) *
+				    sizeof (char *));
+				bcopy(searchdirs, tmp, nsearch *
+				    sizeof (char *));
+				free(searchdirs);
+				searchdirs = tmp;
+			}
+			searchdirs[nsearch++] = dir;
+			dir = strtok(NULL, ":");
+		}
+	}
+
+	idata.path = searchdirs;
+	idata.paths = nsearch;
+	idata.poolname = searchname;
+	idata.guid = searchguid;
+	idata.cachefile = cachefile;
+	idata.scan = do_scan;
+	idata.policy = policy;
+
+	pools = zpool_search_import(g_zfs, &idata, &libzfs_config_ops);
+
+	if (pools != NULL && pool_exists &&
+	    (argc == 1 || strcmp(argv[0], argv[1]) == 0)) {
+		(void) fprintf(stderr, gettext("cannot import '%s': "
+		    "a pool with that name already exists\n"),
+		    argv[0]);
+		(void) fprintf(stderr, gettext("use the form '%s "
+		    "<pool | id> <newpool>' to give it a new name\n"),
+		    "zpool import");
+		err = 1;
+	} else if (pools == NULL && pool_exists) {
+		(void) fprintf(stderr, gettext("cannot import '%s': "
+		    "a pool with that name is already created/imported,\n"),
+		    argv[0]);
+		(void) fprintf(stderr, gettext("and no additional pools "
+		    "with that name were found\n"));
+		err = 1;
+	} else if (pools == NULL) {
+		if (argc != 0) {
+			(void) fprintf(stderr, gettext("cannot import '%s': "
+			    "no such pool available\n"), argv[0]);
+		}
+		err = 1;
+	}
+
+	if (err == 1) {
+		if (searchdirs != NULL)
+			free(searchdirs);
+		if (envdup != NULL)
+			free(envdup);
+		nvlist_free(policy);
+		nvlist_free(pools);
+		nvlist_free(props);
+		return (1);
+	}
+
+	/*
+	 * At this point we have a list of import candidate configs. Even if
+	 * we were searching by pool name or guid, we still need to
+	 * post-process the list to deal with pool state and possible
+	 * duplicate names.
+	 */
+	err = 0;
+	elem = NULL;
+	first = B_TRUE;
+	while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) {
+
+		verify(nvpair_value_nvlist(elem, &config) == 0);
+
+		verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+		    &pool_state) == 0);
+		if (!do_destroyed && pool_state == POOL_STATE_DESTROYED)
+			continue;
+		if (do_destroyed && pool_state != POOL_STATE_DESTROYED)
+			continue;
+
+		verify(nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY,
+		    policy) == 0);
+
+		if (argc == 0) {
+			if (first)
+				first = B_FALSE;
+			else if (!do_all)
+				(void) printf("\n");
+
+			if (do_all) {
+				err |= do_import(config, NULL, mntopts,
+				    props, flags);
+			} else {
+				show_import(config);
+			}
+		} else if (searchname != NULL) {
+			char *name;
+
+			/*
+			 * We are searching for a pool based on name.
+			 */
+			verify(nvlist_lookup_string(config,
+			    ZPOOL_CONFIG_POOL_NAME, &name) == 0);
+
+			if (strcmp(name, searchname) == 0) {
+				if (found_config != NULL) {
+					(void) fprintf(stderr, gettext(
+					    "cannot import '%s': more than "
+					    "one matching pool\n"), searchname);
+					(void) fprintf(stderr, gettext(
+					    "import by numeric ID instead\n"));
+					err = B_TRUE;
+				}
+				found_config = config;
+			}
+		} else {
+			uint64_t guid;
+
+			/*
+			 * Search for a pool by guid.
+			 */
+			verify(nvlist_lookup_uint64(config,
+			    ZPOOL_CONFIG_POOL_GUID, &guid) == 0);
+
+			if (guid == searchguid)
+				found_config = config;
+		}
+	}
+
+	/*
+	 * If we were searching for a specific pool, verify that we found a
+	 * pool, and then do the import.
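+	 *
+	 * For example (names assumed for illustration): 'zpool import tank'
+	 * imports by name, while 'zpool import 11809215114195894163 tank2'
+	 * imports the pool with that GUID and renames it to 'tank2'.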
+ */ + if (argc != 0 && err == 0) { + if (found_config == NULL) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "no such pool available\n"), argv[0]); + err = B_TRUE; + } else { + err |= do_import(found_config, argc == 1 ? NULL : + argv[1], mntopts, props, flags); + } + } + + /* + * If we were just looking for pools, report an error if none were + * found. + */ + if (argc == 0 && first) + (void) fprintf(stderr, + gettext("no pools available to import\n")); + +error: + nvlist_free(props); + nvlist_free(pools); + nvlist_free(policy); + if (searchdirs != NULL) + free(searchdirs); + if (envdup != NULL) + free(envdup); + + return (err ? 1 : 0); +} + +/* + * zpool sync [-f] [pool] ... + * + * -f (undocumented) force uberblock (and config including zpool cache file) + * update. + * + * Sync the specified pool(s). + * Without arguments "zpool sync" will sync all pools. + * This command initiates TXG sync(s) and will return after the TXG(s) commit. + * + */ +static int +zpool_do_sync(int argc, char **argv) +{ + int ret; + boolean_t force = B_FALSE; + + /* check options */ + while ((ret = getopt(argc, argv, "f")) != -1) { + switch (ret) { + case 'f': + force = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + /* if argc == 0 we will execute zpool_sync_one on all pools */ + ret = for_each_pool(argc, argv, B_FALSE, NULL, zpool_sync_one, &force); + + return (ret); +} + +typedef struct iostat_cbdata { + uint64_t cb_flags; + int cb_name_flags; + int cb_namewidth; + int cb_iteration; + char **cb_vdev_names; /* Only show these vdevs */ + unsigned int cb_vdev_names_count; + boolean_t cb_verbose; + boolean_t cb_literal; + boolean_t cb_scripted; + zpool_list_t *cb_list; + vdev_cmd_data_list_t *vcdl; +} iostat_cbdata_t; + +/* iostat labels */ +typedef struct name_and_columns { + const char *name; /* Column name */ + unsigned int columns; /* Center name to this number of columns */ +} name_and_columns_t; + +#define IOSTAT_MAX_LABELS 13 /* Max number of labels on one line */ + +static const name_and_columns_t iostat_top_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"capacity", 2}, {"operations", 2}, {"bandwidth", 2}, + {NULL}}, + [IOS_LATENCY] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, + {"asyncq_wait", 2}, {"scrub", 1}, {"trim", 1}, {NULL}}, + [IOS_QUEUES] = {{"syncq_read", 2}, {"syncq_write", 2}, + {"asyncq_read", 2}, {"asyncq_write", 2}, {"scrubq_read", 2}, + {"trimq_write", 2}, {NULL}}, + [IOS_L_HISTO] = {{"total_wait", 2}, {"disk_wait", 2}, {"syncq_wait", 2}, + {"asyncq_wait", 2}, {NULL}}, + [IOS_RQ_HISTO] = {{"sync_read", 2}, {"sync_write", 2}, + {"async_read", 2}, {"async_write", 2}, {"scrub", 2}, + {"trim", 2}, {NULL}}, +}; + +/* Shorthand - if "columns" field not set, default to 1 column */ +static const name_and_columns_t iostat_bottom_labels[][IOSTAT_MAX_LABELS] = +{ + [IOS_DEFAULT] = {{"alloc"}, {"free"}, {"read"}, {"write"}, {"read"}, + {"write"}, {NULL}}, + [IOS_LATENCY] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, + {"write"}, {"read"}, {"write"}, {"wait"}, {"wait"}, {NULL}}, + [IOS_QUEUES] = {{"pend"}, {"activ"}, {"pend"}, {"activ"}, {"pend"}, + {"activ"}, {"pend"}, {"activ"}, {"pend"}, {"activ"}, + {"pend"}, {"activ"}, {NULL}}, + [IOS_L_HISTO] = {{"read"}, {"write"}, {"read"}, {"write"}, {"read"}, + {"write"}, {"read"}, {"write"}, {"scrub"}, {"trim"}, {NULL}}, + [IOS_RQ_HISTO] = {{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, + 
{"ind"}, {"agg"}, {"ind"}, {"agg"}, {"ind"}, {"agg"}, {NULL}}, +}; + +static const char *histo_to_title[] = { + [IOS_L_HISTO] = "latency", + [IOS_RQ_HISTO] = "req_size", +}; + +/* + * Return the number of labels in a null-terminated name_and_columns_t + * array. + * + */ +static unsigned int +label_array_len(const name_and_columns_t *labels) +{ + int i = 0; + + while (labels[i].name) + i++; + + return (i); +} + +/* + * Return the number of strings in a null-terminated string array. + * For example: + * + * const char foo[] = {"bar", "baz", NULL} + * + * returns 2 + */ +static uint64_t +str_array_len(const char *array[]) +{ + uint64_t i = 0; + while (array[i]) + i++; + + return (i); +} + + +/* + * Return a default column width for default/latency/queue columns. This does + * not include histograms, which have their columns autosized. + */ +static unsigned int +default_column_width(iostat_cbdata_t *cb, enum iostat_type type) +{ + unsigned long column_width = 5; /* Normal niceprint */ + static unsigned long widths[] = { + /* + * Choose some sane default column sizes for printing the + * raw numbers. + */ + [IOS_DEFAULT] = 15, /* 1PB capacity */ + [IOS_LATENCY] = 10, /* 1B ns = 10sec */ + [IOS_QUEUES] = 6, /* 1M queue entries */ + [IOS_L_HISTO] = 10, /* 1B ns = 10sec */ + [IOS_RQ_HISTO] = 6, /* 1M queue entries */ + }; + + if (cb->cb_literal) + column_width = widths[type]; + + return (column_width); +} + +/* + * Print the column labels, i.e: + * + * capacity operations bandwidth + * alloc free read write read write ... + * + * If force_column_width is set, use it for the column width. If not set, use + * the default column width. + */ +static void +print_iostat_labels(iostat_cbdata_t *cb, unsigned int force_column_width, + const name_and_columns_t labels[][IOSTAT_MAX_LABELS]) +{ + int i, idx, s; + int text_start, rw_column_width, spaces_to_end; + uint64_t flags = cb->cb_flags; + uint64_t f; + unsigned int column_width = force_column_width; + + /* For each bit set in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + if (!force_column_width) + column_width = default_column_width(cb, idx); + /* Print our top labels centered over "read write" label. */ + for (i = 0; i < label_array_len(labels[idx]); i++) { + const char *name = labels[idx][i].name; + /* + * We treat labels[][].columns == 0 as shorthand + * for one column. It makes writing out the label + * tables more concise. + */ + unsigned int columns = MAX(1, labels[idx][i].columns); + unsigned int slen = strlen(name); + + rw_column_width = (column_width * columns) + + (2 * (columns - 1)); + + text_start = (int)((rw_column_width) / columns - + slen / columns); + if (text_start < 0) + text_start = 0; + + printf(" "); /* Two spaces between columns */ + + /* Space from beginning of column to label */ + for (s = 0; s < text_start; s++) + printf(" "); + + printf("%s", name); + + /* Print space after label to end of column */ + spaces_to_end = rw_column_width - text_start - slen; + if (spaces_to_end < 0) + spaces_to_end = 0; + + for (s = 0; s < spaces_to_end; s++) + printf(" "); + } + } +} + + +/* + * print_cmd_columns - Print custom column titles from -c + * + * If the user specified the "zpool status|iostat -c" then print their custom + * column titles in the header. For example, print_cmd_columns() would print + * the " col1 col2" part of this: + * + * $ zpool iostat -vc 'echo col1=val1; echo col2=val2' + * ... 
+ * capacity operations bandwidth + * pool alloc free read write read write col1 col2 + * ---------- ----- ----- ----- ----- ----- ----- ---- ---- + * mypool 269K 1008M 0 0 107 946 + * mirror 269K 1008M 0 0 107 946 + * sdb - - 0 0 102 473 val1 val2 + * sdc - - 0 0 5 473 val1 val2 + * ---------- ----- ----- ----- ----- ----- ----- ---- ---- + */ +static void +print_cmd_columns(vdev_cmd_data_list_t *vcdl, int use_dashes) +{ + int i, j; + vdev_cmd_data_t *data = &vcdl->data[0]; + + if (vcdl->count == 0 || data == NULL) + return; + + /* + * Each vdev cmd should have the same column names unless the user did + * something weird with their cmd. Just take the column names from the + * first vdev and assume it works for all of them. + */ + for (i = 0; i < vcdl->uniq_cols_cnt; i++) { + printf(" "); + if (use_dashes) { + for (j = 0; j < vcdl->uniq_cols_width[i]; j++) + printf("-"); + } else { + printf_color(ANSI_BOLD, "%*s", vcdl->uniq_cols_width[i], + vcdl->uniq_cols[i]); + } + } +} + + +/* + * Utility function to print out a line of dashes like: + * + * -------------------------------- ----- ----- ----- ----- ----- + * + * ...or a dashed named-row line like: + * + * logs - - - - - + * + * @cb: iostat data + * + * @force_column_width If non-zero, use the value as the column width. + * Otherwise use the default column widths. + * + * @name: Print a dashed named-row line starting + * with @name. Otherwise, print a regular + * dashed line. + */ +static void +print_iostat_dashes(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *name) +{ + int i; + unsigned int namewidth; + uint64_t flags = cb->cb_flags; + uint64_t f; + int idx; + const name_and_columns_t *labels; + const char *title; + + + if (cb->cb_flags & IOS_ANYHISTO_M) { + title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; + } else if (cb->cb_vdev_names_count) { + title = "vdev"; + } else { + title = "pool"; + } + + namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), + name ? strlen(name) : 0); + + + if (name) { + printf("%-*s", namewidth, name); + } else { + for (i = 0; i < namewidth; i++) + (void) printf("-"); + } + + /* For each bit in flags */ + for (f = flags; f; f &= ~(1ULL << idx)) { + unsigned int column_width; + idx = lowbit64(f) - 1; + if (force_column_width) + column_width = force_column_width; + else + column_width = default_column_width(cb, idx); + + labels = iostat_bottom_labels[idx]; + for (i = 0; i < label_array_len(labels); i++) { + if (name) + printf(" %*s-", column_width - 1, " "); + else + printf(" %.*s", column_width, + "--------------------"); + } + } +} + + +static void +print_iostat_separator_impl(iostat_cbdata_t *cb, + unsigned int force_column_width) +{ + print_iostat_dashes(cb, force_column_width, NULL); +} + +static void +print_iostat_separator(iostat_cbdata_t *cb) +{ + print_iostat_separator_impl(cb, 0); +} + +static void +print_iostat_header_impl(iostat_cbdata_t *cb, unsigned int force_column_width, + const char *histo_vdev_name) +{ + unsigned int namewidth; + const char *title; + + if (cb->cb_flags & IOS_ANYHISTO_M) { + title = histo_to_title[IOS_HISTO_IDX(cb->cb_flags)]; + } else if (cb->cb_vdev_names_count) { + title = "vdev"; + } else { + title = "pool"; + } + + namewidth = MAX(MAX(strlen(title), cb->cb_namewidth), + histo_vdev_name ? 
strlen(histo_vdev_name) : 0); + + if (histo_vdev_name) + printf("%-*s", namewidth, histo_vdev_name); + else + printf("%*s", namewidth, ""); + + + print_iostat_labels(cb, force_column_width, iostat_top_labels); + printf("\n"); + + printf("%-*s", namewidth, title); + + print_iostat_labels(cb, force_column_width, iostat_bottom_labels); + if (cb->vcdl != NULL) + print_cmd_columns(cb->vcdl, 0); + + printf("\n"); + + print_iostat_separator_impl(cb, force_column_width); + + if (cb->vcdl != NULL) + print_cmd_columns(cb->vcdl, 1); + + printf("\n"); +} + +static void +print_iostat_header(iostat_cbdata_t *cb) +{ + print_iostat_header_impl(cb, 0, NULL); +} + + +/* + * Display a single statistic. + */ +static void +print_one_stat(uint64_t value, enum zfs_nicenum_format format, + unsigned int column_size, boolean_t scripted) +{ + char buf[64]; + + zfs_nicenum_format(value, buf, sizeof (buf), format); + + if (scripted) + printf("\t%s", buf); + else + printf(" %*s", column_size, buf); +} + +/* + * Calculate the default vdev stats + * + * Subtract oldvs from newvs, apply a scaling factor, and save the resulting + * stats into calcvs. + */ +static void +calc_default_iostats(vdev_stat_t *oldvs, vdev_stat_t *newvs, + vdev_stat_t *calcvs) +{ + int i; + + memcpy(calcvs, newvs, sizeof (*calcvs)); + for (i = 0; i < ARRAY_SIZE(calcvs->vs_ops); i++) + calcvs->vs_ops[i] = (newvs->vs_ops[i] - oldvs->vs_ops[i]); + + for (i = 0; i < ARRAY_SIZE(calcvs->vs_bytes); i++) + calcvs->vs_bytes[i] = (newvs->vs_bytes[i] - oldvs->vs_bytes[i]); +} + +/* + * Internal representation of the extended iostats data. + * + * The extended iostat stats are exported in nvlists as either uint64_t arrays + * or single uint64_t's. We make both look like arrays to make them easier + * to process. In order to make single uint64_t's look like arrays, we set + * __data to the stat data, and then set *data = &__data with count = 1. Then, + * we can just use *data and count. + */ +struct stat_array { + uint64_t *data; + uint_t count; /* Number of entries in data[] */ + uint64_t __data; /* Only used when data is a single uint64_t */ +}; + +static uint64_t +stat_histo_max(struct stat_array *nva, unsigned int len) +{ + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array64_max(nva[i].data, nva[i].count)); + + return (max); +} + +/* + * Helper function to lookup a uint64_t array or uint64_t value and store its + * data as a stat_array. If the nvpair is a single uint64_t value, then we make + * it look like a one element array to make it easier to process. + */ +static int +nvpair64_to_stat_array(nvlist_t *nvl, const char *name, + struct stat_array *nva) +{ + nvpair_t *tmp; + int ret; + + verify(nvlist_lookup_nvpair(nvl, name, &tmp) == 0); + switch (nvpair_type(tmp)) { + case DATA_TYPE_UINT64_ARRAY: + ret = nvpair_value_uint64_array(tmp, &nva->data, &nva->count); + break; + case DATA_TYPE_UINT64: + ret = nvpair_value_uint64(tmp, &nva->__data); + nva->data = &nva->__data; + nva->count = 1; + break; + default: + /* Not a uint64_t */ + ret = EINVAL; + break; + } + + return (ret); +} + +/* + * Given a list of nvlist names, look up the extended stats in newnv and oldnv, + * subtract them, and return the results in a newly allocated stat_array. + * You must free the returned array after you are done with it with + * free_calc_stats(). + * + * Additionally, you can set "oldnv" to NULL if you simply want the newnv + * values. 
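+ *
+ * A minimal usage sketch (hypothetical caller; "names" and "len" stand
+ * for whatever nvlist names the caller selected):
+ *
+ *	struct stat_array *nva;
+ *
+ *	nva = calc_and_alloc_stats_ex(names, len, oldnv, newnv);
+ *	for (int i = 0; i < len; i++)
+ *		(void) printf("%s[0] delta = %llu\n", names[i],
+ *		    (u_longlong_t)nva[i].data[0]);
+ *	free_calc_stats(nva, len);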
+ */ +static struct stat_array * +calc_and_alloc_stats_ex(const char **names, unsigned int len, nvlist_t *oldnv, + nvlist_t *newnv) +{ + nvlist_t *oldnvx = NULL, *newnvx; + struct stat_array *oldnva, *newnva, *calcnva; + int i, j; + unsigned int alloc_size = (sizeof (struct stat_array)) * len; + + /* Extract our extended stats nvlist from the main list */ + verify(nvlist_lookup_nvlist(newnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &newnvx) == 0); + if (oldnv) { + verify(nvlist_lookup_nvlist(oldnv, ZPOOL_CONFIG_VDEV_STATS_EX, + &oldnvx) == 0); + } + + newnva = safe_malloc(alloc_size); + oldnva = safe_malloc(alloc_size); + calcnva = safe_malloc(alloc_size); + + for (j = 0; j < len; j++) { + verify(nvpair64_to_stat_array(newnvx, names[j], + &newnva[j]) == 0); + calcnva[j].count = newnva[j].count; + alloc_size = calcnva[j].count * sizeof (calcnva[j].data[0]); + calcnva[j].data = safe_malloc(alloc_size); + memcpy(calcnva[j].data, newnva[j].data, alloc_size); + + if (oldnvx) { + verify(nvpair64_to_stat_array(oldnvx, names[j], + &oldnva[j]) == 0); + for (i = 0; i < oldnva[j].count; i++) + calcnva[j].data[i] -= oldnva[j].data[i]; + } + } + free(newnva); + free(oldnva); + return (calcnva); +} + +static void +free_calc_stats(struct stat_array *nva, unsigned int len) +{ + int i; + for (i = 0; i < len; i++) + free(nva[i].data); + + free(nva); +} + +static void +print_iostat_histo(struct stat_array *nva, unsigned int len, + iostat_cbdata_t *cb, unsigned int column_width, unsigned int namewidth, + double scale) +{ + int i, j; + char buf[6]; + uint64_t val; + enum zfs_nicenum_format format; + unsigned int buckets; + unsigned int start_bucket; + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + /* All these histos are the same size, so just use nva[0].count */ + buckets = nva[0].count; + + if (cb->cb_flags & IOS_RQ_HISTO_M) { + /* Start at 512 - req size should never be lower than this */ + start_bucket = 9; + } else { + start_bucket = 0; + } + + for (j = start_bucket; j < buckets; j++) { + /* Print histogram bucket label */ + if (cb->cb_flags & IOS_L_HISTO_M) { + /* Ending range of this bucket */ + val = (1UL << (j + 1)) - 1; + zfs_nicetime(val, buf, sizeof (buf)); + } else { + /* Request size (starting range of bucket) */ + val = (1UL << j); + zfs_nicenum(val, buf, sizeof (buf)); + } + + if (cb->cb_scripted) + printf("%llu", (u_longlong_t)val); + else + printf("%-*s", namewidth, buf); + + /* Print the values on the line */ + for (i = 0; i < len; i++) { + print_one_stat(nva[i].data[j] * scale, format, + column_width, cb->cb_scripted); + } + printf("\n"); + } +} + +static void +print_solid_separator(unsigned int length) +{ + while (length--) + printf("-"); + printf("\n"); +} + +static void +print_iostat_histos(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv, double scale, const char *name) +{ + unsigned int column_width; + unsigned int namewidth; + unsigned int entire_width; + enum iostat_type type; + struct stat_array *nva; + const char **names; + unsigned int names_len; + + /* What type of histo are we? 
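+	 * Exactly one histogram bit is set in cb_flags here (-w and -r are
+	 * mutually exclusive); e.g. with cb_flags == IOS_L_HISTO_M,
+	 * IOS_HISTO_IDX() yields IOS_L_HISTO and "names" below becomes the
+	 * set of latency-histogram nvlist names.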
*/ + type = IOS_HISTO_IDX(cb->cb_flags); + + /* Get NULL-terminated array of nvlist names for our histo */ + names = vsx_type_to_nvlist[type]; + names_len = str_array_len(names); /* num of names */ + + nva = calc_and_alloc_stats_ex(names, names_len, oldnv, newnv); + + if (cb->cb_literal) { + column_width = MAX(5, + (unsigned int) log10(stat_histo_max(nva, names_len)) + 1); + } else { + column_width = 5; + } + + namewidth = MAX(cb->cb_namewidth, + strlen(histo_to_title[IOS_HISTO_IDX(cb->cb_flags)])); + + /* + * Calculate the entire line width of what we're printing. The + * +2 is for the two spaces between columns: + */ + /* read write */ + /* ----- ----- */ + /* |___| <---------- column_width */ + /* */ + /* |__________| <--- entire_width */ + /* */ + entire_width = namewidth + (column_width + 2) * + label_array_len(iostat_bottom_labels[type]); + + if (cb->cb_scripted) + printf("%s\n", name); + else + print_iostat_header_impl(cb, column_width, name); + + print_iostat_histo(nva, names_len, cb, column_width, + namewidth, scale); + + free_calc_stats(nva, names_len); + if (!cb->cb_scripted) + print_solid_separator(entire_width); +} + +/* + * Calculate the average latency of a power-of-two latency histogram + */ +static uint64_t +single_histo_average(uint64_t *histo, unsigned int buckets) +{ + int i; + uint64_t count = 0, total = 0; + + for (i = 0; i < buckets; i++) { + /* + * Our buckets are power-of-two latency ranges. Use the + * midpoint latency of each bucket to calculate the average. + * For example: + * + * Bucket Midpoint + * 8ns-15ns: 12ns + * 16ns-31ns: 24ns + * ... + */ + if (histo[i] != 0) { + total += histo[i] * (((1UL << i) + ((1UL << i)/2))); + count += histo[i]; + } + } + + /* Prevent divide by zero */ + return (count == 0 ? 0 : total / count); +} + +static void +print_iostat_queues(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, + ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + }; + + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_QUEUES); + enum zfs_nicenum_format format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), NULL, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + for (i = 0; i < ARRAY_SIZE(names); i++) { + val = nva[i].data[0]; + print_one_stat(val, format, column_width, cb->cb_scripted); + } + + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +static void +print_iostat_latency(iostat_cbdata_t *cb, nvlist_t *oldnv, + nvlist_t *newnv) +{ + int i; + uint64_t val; + const char *names[] = { + ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + }; + struct stat_array *nva; + + unsigned int column_width = default_column_width(cb, IOS_LATENCY); + enum zfs_nicenum_format 
format; + + nva = calc_and_alloc_stats_ex(names, ARRAY_SIZE(names), oldnv, newnv); + + if (cb->cb_literal) + format = ZFS_NICENUM_RAWTIME; + else + format = ZFS_NICENUM_TIME; + + /* Print our avg latencies on the line */ + for (i = 0; i < ARRAY_SIZE(names); i++) { + /* Compute average latency for a latency histo */ + val = single_histo_average(nva[i].data, nva[i].count); + print_one_stat(val, format, column_width, cb->cb_scripted); + } + free_calc_stats(nva, ARRAY_SIZE(names)); +} + +/* + * Print default statistics (capacity/operations/bandwidth) + */ +static void +print_iostat_default(vdev_stat_t *vs, iostat_cbdata_t *cb, double scale) +{ + unsigned int column_width = default_column_width(cb, IOS_DEFAULT); + enum zfs_nicenum_format format; + char na; /* char to print for "not applicable" values */ + + if (cb->cb_literal) { + format = ZFS_NICENUM_RAW; + na = '0'; + } else { + format = ZFS_NICENUM_1024; + na = '-'; + } + + /* only toplevel vdevs have capacity stats */ + if (vs->vs_space == 0) { + if (cb->cb_scripted) + printf("\t%c\t%c", na, na); + else + printf(" %*c %*c", column_width, na, column_width, + na); + } else { + print_one_stat(vs->vs_alloc, format, column_width, + cb->cb_scripted); + print_one_stat(vs->vs_space - vs->vs_alloc, format, + column_width, cb->cb_scripted); + } + + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_ops[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_READ] * scale), + format, column_width, cb->cb_scripted); + print_one_stat((uint64_t)(vs->vs_bytes[ZIO_TYPE_WRITE] * scale), + format, column_width, cb->cb_scripted); +} + +static const char *class_name[] = { + VDEV_ALLOC_BIAS_DEDUP, + VDEV_ALLOC_BIAS_SPECIAL, + VDEV_ALLOC_CLASS_LOGS +}; + +/* + * Print out all the statistics for the given vdev. This can either be the + * toplevel configuration, or called recursively. If 'name' is NULL, then this + * is a verbose output, and we don't want to display the toplevel pool stats. + * + * Returns the number of stat lines printed. + */ +static unsigned int +print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, + nvlist_t *newnv, iostat_cbdata_t *cb, int depth) +{ + nvlist_t **oldchild, **newchild; + uint_t c, children, oldchildren; + vdev_stat_t *oldvs, *newvs, *calcvs; + vdev_stat_t zerovs = { 0 }; + char *vname; + int i; + int ret = 0; + uint64_t tdelta; + double scale; + + if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) + return (ret); + + calcvs = safe_malloc(sizeof (*calcvs)); + + if (oldnv != NULL) { + verify(nvlist_lookup_uint64_array(oldnv, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&oldvs, &c) == 0); + } else { + oldvs = &zerovs; + } + + /* Do we only want to see a specific vdev? */ + for (i = 0; i < cb->cb_vdev_names_count; i++) { + /* Yes we do. Is this the vdev? */ + if (strcmp(name, cb->cb_vdev_names[i]) == 0) { + /* + * This is our vdev. Since it is the only vdev we + * will be displaying, make depth = 0 so that it + * doesn't get indented. + */ + depth = 0; + break; + } + } + + if (cb->cb_vdev_names_count && (i == cb->cb_vdev_names_count)) { + /* Couldn't match the name */ + goto children; + } + + + verify(nvlist_lookup_uint64_array(newnv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&newvs, &c) == 0); + + /* + * Print the vdev name unless it's is a histogram. Histograms + * display the vdev name in the header itself. 
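+	 *
+	 * Formatting example (hypothetical values): with cb_namewidth == 12,
+	 * depth == 2 and name == "sda", the unscripted printf below emits
+	 * two leading spaces, "sda", then seven trailing spaces, so the
+	 * stat columns line up at the same offset for every nesting depth.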
+ */ + if (!(cb->cb_flags & IOS_ANYHISTO_M)) { + if (cb->cb_scripted) { + printf("%s", name); + } else { + if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - + depth), ""); + } + } + + /* Calculate our scaling factor */ + tdelta = newvs->vs_timestamp - oldvs->vs_timestamp; + if ((oldvs->vs_timestamp == 0) && (cb->cb_flags & IOS_ANYHISTO_M)) { + /* + * If we specify printing histograms with no time interval, then + * print the histogram numbers over the entire lifetime of the + * vdev. + */ + scale = 1; + } else { + if (tdelta == 0) + scale = 1.0; + else + scale = (double)NANOSEC / tdelta; + } + + if (cb->cb_flags & IOS_DEFAULT_M) { + calc_default_iostats(oldvs, newvs, calcvs); + print_iostat_default(calcvs, cb, scale); + } + if (cb->cb_flags & IOS_LATENCY_M) + print_iostat_latency(cb, oldnv, newnv); + if (cb->cb_flags & IOS_QUEUES_M) + print_iostat_queues(cb, oldnv, newnv); + if (cb->cb_flags & IOS_ANYHISTO_M) { + printf("\n"); + print_iostat_histos(cb, oldnv, newnv, scale, name); + } + + if (cb->vcdl != NULL) { + char *path; + if (nvlist_lookup_string(newnv, ZPOOL_CONFIG_PATH, + &path) == 0) { + printf(" "); + zpool_print_cmd(cb->vcdl, zpool_get_name(zhp), path); + } + } + + if (!(cb->cb_flags & IOS_ANYHISTO_M)) + printf("\n"); + + ret++; + +children: + + free(calcvs); + + if (!cb->cb_verbose) + return (ret); + + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_CHILDREN, + &newchild, &children) != 0) + return (ret); + + if (oldnv) { + if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_CHILDREN, + &oldchild, &oldchildren) != 0) + return (ret); + + children = MIN(oldchildren, children); + } + + /* + * print normal top-level devices + */ + for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE, islog = B_FALSE; + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + + (void) nvlist_lookup_uint64(newchild[c], ZPOOL_CONFIG_IS_LOG, + &islog); + + if (ishole || islog) + continue; + + if (nvlist_exists(newchild[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, + newchild[c], cb, depth + 2); + free(vname); + } + + /* + * print all other top-level devices + */ + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE; + char *bias = NULL; + char *type = NULL; + + (void) nvlist_lookup_uint64(newchild[c], + ZPOOL_CONFIG_IS_LOG, &islog); + if (islog) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(newchild[c], + ZPOOL_CONFIG_TYPE, &type); + } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) + continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && + !cb->cb_scripted && !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, + class_name[n]); + } + printf("\n"); + printed = B_TRUE; + } + + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? 
+ oldchild[c] : NULL, newchild[c], cb, depth + 2); + free(vname); + } + } + + /* + * Include level 2 ARC devices in iostat output + */ + if (nvlist_lookup_nvlist_array(newnv, ZPOOL_CONFIG_L2CACHE, + &newchild, &children) != 0) + return (ret); + + if (oldnv) { + if (nvlist_lookup_nvlist_array(oldnv, ZPOOL_CONFIG_L2CACHE, + &oldchild, &oldchildren) != 0) + return (ret); + + children = MIN(oldchildren, children); + } + + if (children > 0) { + if ((!(cb->cb_flags & IOS_ANYHISTO_M)) && !cb->cb_scripted && + !cb->cb_vdev_names) { + print_iostat_dashes(cb, 0, "cache"); + } + printf("\n"); + + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + cb->cb_name_flags); + ret += print_vdev_stats(zhp, vname, oldnv ? oldchild[c] + : NULL, newchild[c], cb, depth + 2); + free(vname); + } + } + + return (ret); +} + +static int +refresh_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + boolean_t missing; + + /* + * If the pool has disappeared, remove it from the list and continue. + */ + if (zpool_refresh_stats(zhp, &missing) != 0) + return (-1); + + if (missing) + pool_list_remove(cb->cb_list, zhp); + + return (0); +} + +/* + * Callback to print out the iostats for the given pool. + */ +static int +print_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + nvlist_t *oldconfig, *newconfig; + nvlist_t *oldnvroot, *newnvroot; + int ret; + + newconfig = zpool_get_config(zhp, &oldconfig); + + if (cb->cb_iteration == 1) + oldconfig = NULL; + + verify(nvlist_lookup_nvlist(newconfig, ZPOOL_CONFIG_VDEV_TREE, + &newnvroot) == 0); + + if (oldconfig == NULL) + oldnvroot = NULL; + else + verify(nvlist_lookup_nvlist(oldconfig, ZPOOL_CONFIG_VDEV_TREE, + &oldnvroot) == 0); + + ret = print_vdev_stats(zhp, zpool_get_name(zhp), oldnvroot, newnvroot, + cb, 0); + if ((ret != 0) && !(cb->cb_flags & IOS_ANYHISTO_M) && + !cb->cb_scripted && cb->cb_verbose && !cb->cb_vdev_names_count) { + print_iostat_separator(cb); + if (cb->vcdl != NULL) { + print_cmd_columns(cb->vcdl, 1); + } + printf("\n"); + } + + return (ret); +} + +static int +get_columns(void) +{ + struct winsize ws; + int columns = 80; + int error; + + if (isatty(STDOUT_FILENO)) { + error = ioctl(STDOUT_FILENO, TIOCGWINSZ, &ws); + if (error == 0) + columns = ws.ws_col; + } else { + columns = 999; + } + + return (columns); +} + +/* + * Return the required length of the pool/vdev name column. The minimum + * allowed width and output formatting flags must be provided. + */ +static int +get_namewidth(zpool_handle_t *zhp, int min_width, int flags, boolean_t verbose) +{ + nvlist_t *config, *nvroot; + int width = min_width; + + if ((config = zpool_get_config(zhp, NULL)) != NULL) { + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + unsigned int poolname_len = strlen(zpool_get_name(zhp)); + if (verbose == B_FALSE) { + width = MAX(poolname_len, min_width); + } else { + width = MAX(poolname_len, + max_width(zhp, nvroot, 0, min_width, flags)); + } + } + + return (width); +} + +/* + * Parse the input string, get the 'interval' and 'count' value if there is one. 
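+ *
+ * Worked example (hypothetical command line): for "zpool iostat tank 2 5"
+ * the trailing "5" is parsed first and provisionally taken as the
+ * interval; when "2" also parses as a number, the 5 is shifted into the
+ * count and 2 becomes the interval, i.e. report every 2 seconds, 5 times.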
+ */ +static void +get_interval_count(int *argcp, char **argv, float *iv, + unsigned long *cnt) +{ + float interval = 0; + unsigned long count = 0; + int argc = *argcp; + + /* + * Determine if the last argument is an integer or a pool name + */ + if (argc > 0 && zfs_isnumber(argv[argc - 1])) { + char *end; + + errno = 0; + interval = strtof(argv[argc - 1], &end); + + if (*end == '\0' && errno == 0) { + if (interval == 0) { + (void) fprintf(stderr, gettext("interval " + "cannot be zero\n")); + usage(B_FALSE); + } + /* + * Ignore the last parameter + */ + argc--; + } else { + /* + * If this is not a valid number, just plow on. The + * user will get a more informative error message later + * on. + */ + interval = 0; + } + } + + /* + * If the last argument is also an integer, then we have both a count + * and an interval. + */ + if (argc > 0 && zfs_isnumber(argv[argc - 1])) { + char *end; + + errno = 0; + count = interval; + interval = strtof(argv[argc - 1], &end); + + if (*end == '\0' && errno == 0) { + if (interval == 0) { + (void) fprintf(stderr, gettext("interval " + "cannot be zero\n")); + usage(B_FALSE); + } + + /* + * Ignore the last parameter + */ + argc--; + } else { + interval = 0; + } + } + + *iv = interval; + *cnt = count; + *argcp = argc; +} + +static void +get_timestamp_arg(char c) +{ + if (c == 'u') + timestamp_fmt = UDATE; + else if (c == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); +} + +/* + * Return stat flags that are supported by all pools by both the module and + * zpool iostat. "*data" should be initialized to all 0xFFs before running. + * It will get ANDed down until only the flags that are supported on all pools + * remain. + */ +static int +get_stat_flags_cb(zpool_handle_t *zhp, void *data) +{ + uint64_t *mask = data; + nvlist_t *config, *nvroot, *nvx; + uint64_t flags = 0; + int i, j; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + /* Default stats are always supported, but for completeness.. */ + if (nvlist_exists(nvroot, ZPOOL_CONFIG_VDEV_STATS)) + flags |= IOS_DEFAULT_M; + + /* Get our extended stats nvlist from the main list */ + if (nvlist_lookup_nvlist(nvroot, ZPOOL_CONFIG_VDEV_STATS_EX, + &nvx) != 0) { + /* + * No extended stats; they're probably running an older + * module. No big deal, we support that too. + */ + goto end; + } + + /* For each extended stat, make sure all its nvpairs are supported */ + for (j = 0; j < ARRAY_SIZE(vsx_type_to_nvlist); j++) { + if (!vsx_type_to_nvlist[j][0]) + continue; + + /* Start off by assuming the flag is supported, then check */ + flags |= (1ULL << j); + for (i = 0; vsx_type_to_nvlist[j][i]; i++) { + if (!nvlist_exists(nvx, vsx_type_to_nvlist[j][i])) { + /* flag isn't supported */ + flags = flags & ~(1ULL << j); + break; + } + } + } +end: + *mask = *mask & flags; + return (0); +} + +/* + * Return a bitmask of stats that are supported on all pools by both the module + * and zpool iostat. + */ +static uint64_t +get_stat_flags(zpool_list_t *list) +{ + uint64_t mask = -1; + + /* + * get_stat_flags_cb() will lop off bits from "mask" until only the + * flags that are supported on all pools remain. + */ + pool_list_iter(list, B_FALSE, get_stat_flags_cb, &mask); + return (mask); +} + +/* + * Return 1 if cb_data->cb_vdev_names[0] is this vdev's name, 0 otherwise. 
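+ *
+ * E.g. (hypothetical pool layout): for a pool whose leaves are "sda" and
+ * "sdb", setting cb_vdev_names[0] = "sdb" produces a match when the
+ * iteration reaches that child, provided cb_name_flags generate the same
+ * display name the user typed.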
+ */ +static int +is_vdev_cb(zpool_handle_t *zhp, nvlist_t *nv, void *cb_data) +{ + iostat_cbdata_t *cb = cb_data; + char *name = NULL; + int ret = 0; + + name = zpool_vdev_name(g_zfs, zhp, nv, cb->cb_name_flags); + + if (strcmp(name, cb->cb_vdev_names[0]) == 0) + ret = 1; /* match */ + free(name); + + return (ret); +} + +/* + * Returns 1 if cb_data->cb_vdev_names[0] is a vdev name, 0 otherwise. + */ +static int +is_vdev(zpool_handle_t *zhp, void *cb_data) +{ + return (for_each_vdev(zhp, is_vdev_cb, cb_data)); +} + +/* + * Check if vdevs are in a pool + * + * Return 1 if all argv[] strings are vdev names in pool "pool_name". Otherwise + * return 0. If pool_name is NULL, then search all pools. + */ +static int +are_vdevs_in_pool(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + char **tmp_name; + int ret = 0; + int i; + int pool_count = 0; + + if ((argc == 0) || !*argv) + return (0); + + if (pool_name) + pool_count = 1; + + /* Temporarily hijack cb_vdev_names for a second... */ + tmp_name = cb->cb_vdev_names; + + /* Go though our list of prospective vdev names */ + for (i = 0; i < argc; i++) { + cb->cb_vdev_names = argv + i; + + /* Is this name a vdev in our pools? */ + ret = for_each_pool(pool_count, &pool_name, B_TRUE, NULL, + is_vdev, cb); + if (!ret) { + /* No match */ + break; + } + } + + cb->cb_vdev_names = tmp_name; + + return (ret); +} + +static int +is_pool_cb(zpool_handle_t *zhp, void *data) +{ + char *name = data; + if (strcmp(name, zpool_get_name(zhp)) == 0) + return (1); + + return (0); +} + +/* + * Do we have a pool named *name? If so, return 1, otherwise 0. + */ +static int +is_pool(char *name) +{ + return (for_each_pool(0, NULL, B_TRUE, NULL, is_pool_cb, name)); +} + +/* Are all our argv[] strings pool names? If so return 1, 0 otherwise. */ +static int +are_all_pools(int argc, char **argv) +{ + if ((argc == 0) || !*argv) + return (0); + + while (--argc >= 0) + if (!is_pool(argv[argc])) + return (0); + + return (1); +} + +/* + * Helper function to print out vdev/pool names we can't resolve. Used for an + * error message. + */ +static void +error_list_unresolved_vdevs(int argc, char **argv, char *pool_name, + iostat_cbdata_t *cb) +{ + int i; + char *name; + char *str; + for (i = 0; i < argc; i++) { + name = argv[i]; + + if (is_pool(name)) + str = gettext("pool"); + else if (are_vdevs_in_pool(1, &name, pool_name, cb)) + str = gettext("vdev in this pool"); + else if (are_vdevs_in_pool(1, &name, NULL, cb)) + str = gettext("vdev in another pool"); + else + str = gettext("unknown"); + + fprintf(stderr, "\t%s (%s)\n", name, str); + } +} + +/* + * Same as get_interval_count(), but with additional checks to not misinterpret + * guids as interval/count values. Assumes VDEV_NAME_GUID is set in + * cb.cb_name_flags. + */ +static void +get_interval_count_filter_guids(int *argc, char **argv, float *interval, + unsigned long *count, iostat_cbdata_t *cb) +{ + char **tmpargv = argv; + int argc_for_interval = 0; + + /* Is the last arg an interval value? Or a guid? */ + if (*argc >= 1 && !are_vdevs_in_pool(1, &argv[*argc - 1], NULL, cb)) { + /* + * The last arg is not a guid, so it's probably an + * interval value. + */ + argc_for_interval++; + + if (*argc >= 2 && + !are_vdevs_in_pool(1, &argv[*argc - 2], NULL, cb)) { + /* + * The 2nd to last arg is not a guid, so it's probably + * an interval value. 
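+			 * E.g. (hypothetical): in "zpool iostat -g
+			 * 1234567890123456 2" the trailing "2" is not a
+			 * known guid, so it counts toward the interval,
+			 * while the guid before it is recognized by
+			 * are_vdevs_in_pool() and left alone.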
+ */ + argc_for_interval++; + } + } + + /* Point to our list of possible intervals */ + tmpargv = &argv[*argc - argc_for_interval]; + + *argc = *argc - argc_for_interval; + get_interval_count(&argc_for_interval, tmpargv, + interval, count); +} + +/* + * Floating point sleep(). Allows you to pass in a floating point value for + * seconds. + */ +static void +fsleep(float sec) +{ + struct timespec req; + req.tv_sec = floor(sec); + req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; + nanosleep(&req, NULL); +} + +/* + * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or + * if we were unable to determine its size. + */ +static int +terminal_height(void) +{ + struct winsize win; + + if (isatty(STDOUT_FILENO) == 0) + return (-1); + + if (ioctl(STDOUT_FILENO, TIOCGWINSZ, &win) != -1 && win.ws_row > 0) + return (win.ws_row); + + return (-1); +} + +/* + * Run one of the zpool status/iostat -c scripts with the help (-h) option and + * print the result. + * + * name: Short name of the script ('iostat'). + * path: Full path to the script ('/usr/local/etc/zfs/zpool.d/iostat'); + */ +static void +print_zpool_script_help(char *name, char *path) +{ + char *argv[] = {path, "-h", NULL}; + char **lines = NULL; + int lines_cnt = 0; + int rc; + + rc = libzfs_run_process_get_stdout_nopath(path, argv, NULL, &lines, + &lines_cnt); + if (rc != 0 || lines == NULL || lines_cnt <= 0) { + if (lines != NULL) + libzfs_free_str_array(lines, lines_cnt); + return; + } + + for (int i = 0; i < lines_cnt; i++) + if (!is_blank_str(lines[i])) + printf(" %-14s %s\n", name, lines[i]); + + libzfs_free_str_array(lines, lines_cnt); +} + +/* + * Go though the zpool status/iostat -c scripts in the user's path, run their + * help option (-h), and print out the results. + */ +static void +print_zpool_dir_scripts(char *dirpath) +{ + DIR *dir; + struct dirent *ent; + char fullpath[MAXPATHLEN]; + struct stat dir_stat; + + if ((dir = opendir(dirpath)) != NULL) { + /* print all the files and directories within directory */ + while ((ent = readdir(dir)) != NULL) { + sprintf(fullpath, "%s/%s", dirpath, ent->d_name); + + /* Print the scripts */ + if (stat(fullpath, &dir_stat) == 0) + if (dir_stat.st_mode & S_IXUSR && + S_ISREG(dir_stat.st_mode)) + print_zpool_script_help(ent->d_name, + fullpath); + } + closedir(dir); + } +} + +/* + * Print out help text for all zpool status/iostat -c scripts. + */ +static void +print_zpool_script_list(char *subcommand) +{ + char *dir, *sp; + + printf(gettext("Available 'zpool %s -c' commands:\n"), subcommand); + + sp = zpool_get_cmd_search_path(); + if (sp == NULL) + return; + + dir = strtok(sp, ":"); + while (dir != NULL) { + print_zpool_dir_scripts(dir); + dir = strtok(NULL, ":"); + } + + free(sp); +} + +/* + * Set the minimum pool/vdev name column width. The width must be at least 10, + * but may be as large as the column width - 42 so it still fits on one line. + * NOTE: 42 is the width of the default capacity/operations/bandwidth output + */ +static int +get_namewidth_iostat(zpool_handle_t *zhp, void *data) +{ + iostat_cbdata_t *cb = data; + int width, available_width; + + /* + * get_namewidth() returns the maximum width of any name in that column + * for any pool/vdev/device line that will be output. + */ + width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, + cb->cb_verbose); + + /* + * The width we are calculating is the width of the header and also the + * padding width for names that are less than maximum width. 
The stats
+	 * take up 42 characters, so the width available for names is:
+	 */
+	available_width = get_columns() - 42;
+
+	/*
+	 * If the maximum width fits on a screen, then great! Make everything
+	 * line up by justifying all lines to the same width. If that max
+	 * width is larger than what's available, the name plus stats won't fit
+	 * on one line, and justifying to that width would cause every line to
+	 * wrap on the screen. We only want lines with long names to wrap.
+	 * Limit the padding to what won't wrap.
+	 */
+	if (width > available_width)
+		width = available_width;
+
+	/*
+	 * And regardless of whatever the screen width is (get_columns can
+	 * return 0 if the width is not known or less than 42 for a narrow
+	 * terminal) have the width be a minimum of 10.
+	 */
+	if (width < 10)
+		width = 10;
+
+	/* Save the calculated width */
+	cb->cb_namewidth = width;
+
+	return (0);
+}
+
+/*
+ * zpool iostat [[-c [script1,script2,...]] [-lq]|[-rw]] [-ghHLpPvy] [-n]
+ *	[-T d|u] [[ pool ...]|[pool vdev ...]|[vdev ...]]
+ *	[interval [count]]
+ *
+ *	-c CMD	For each vdev, run command CMD
+ *	-g	Display guid for individual vdev name.
+ *	-L	Follow links when resolving vdev path name.
+ *	-P	Display full path for vdev name.
+ *	-v	Display statistics for individual vdevs
+ *	-h	Display help
+ *	-p	Display values in parsable (exact) format.
+ *	-H	Scripted mode. Don't display headers, and separate properties
+ *		by a single tab.
+ *	-l	Display average latency
+ *	-q	Display queue depths
+ *	-w	Display latency histograms
+ *	-r	Display request size histogram
+ *	-T	Display a timestamp in date(1) or Unix format
+ *	-n	Only print headers once
+ *
+ * This command can be tricky because we want to be able to deal with pool
+ * creation/destruction as well as vdev configuration changes. The bulk of this
+ * processing is handled by the pool_list_* routines in zpool_iter.c. We rely
+ * on pool_list_update() to detect the addition of new pools. Configuration
+ * changes are all handled within libzfs.
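+ *
+ * Example invocation (hypothetical pool): "zpool iostat -lv tank 5 3"
+ * prints per-vdev average latencies for pool "tank" every 5 seconds,
+ * three times, while "zpool iostat -w" prints latency histograms once
+ * for every imported pool.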
+ */ +int +zpool_do_iostat(int argc, char **argv) +{ + int c; + int ret; + int npools; + float interval = 0; + unsigned long count = 0; + int winheight = 24; + zpool_list_t *list; + boolean_t verbose = B_FALSE; + boolean_t latency = B_FALSE, l_histo = B_FALSE, rq_histo = B_FALSE; + boolean_t queues = B_FALSE, parsable = B_FALSE, scripted = B_FALSE; + boolean_t omit_since_boot = B_FALSE; + boolean_t guid = B_FALSE; + boolean_t follow_links = B_FALSE; + boolean_t full_name = B_FALSE; + boolean_t headers_once = B_FALSE; + iostat_cbdata_t cb = { 0 }; + char *cmd = NULL; + + /* Used for printing error message */ + const char flag_to_arg[] = {[IOS_LATENCY] = 'l', [IOS_QUEUES] = 'q', + [IOS_L_HISTO] = 'w', [IOS_RQ_HISTO] = 'r'}; + + uint64_t unsupported_flags; + + /* check options */ + while ((c = getopt(argc, argv, "c:gLPT:vyhplqrwnH")) != -1) { + switch (c) { + case 'c': + if (cmd != NULL) { + fprintf(stderr, + gettext("Can't set -c flag twice\n")); + exit(1); + } + + if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { + fprintf(stderr, gettext( + "Can't run -c, disabled by " + "ZPOOL_SCRIPTS_ENABLED.\n")); + exit(1); + } + + if ((getuid() <= 0 || geteuid() <= 0) && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { + fprintf(stderr, gettext( + "Can't run -c with root privileges " + "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); + exit(1); + } + cmd = optarg; + verbose = B_TRUE; + break; + case 'g': + guid = B_TRUE; + break; + case 'L': + follow_links = B_TRUE; + break; + case 'P': + full_name = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + verbose = B_TRUE; + break; + case 'p': + parsable = B_TRUE; + break; + case 'l': + latency = B_TRUE; + break; + case 'q': + queues = B_TRUE; + break; + case 'H': + scripted = B_TRUE; + break; + case 'w': + l_histo = B_TRUE; + break; + case 'r': + rq_histo = B_TRUE; + break; + case 'y': + omit_since_boot = B_TRUE; + break; + case 'n': + headers_once = B_TRUE; + break; + case 'h': + usage(B_FALSE); + break; + case '?': + if (optopt == 'c') { + print_zpool_script_list("iostat"); + exit(0); + } else { + fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + cb.cb_literal = parsable; + cb.cb_scripted = scripted; + + if (guid) + cb.cb_name_flags |= VDEV_NAME_GUID; + if (follow_links) + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + if (full_name) + cb.cb_name_flags |= VDEV_NAME_PATH; + cb.cb_iteration = 0; + cb.cb_namewidth = 0; + cb.cb_verbose = verbose; + + /* Get our interval and count values (if any) */ + if (guid) { + get_interval_count_filter_guids(&argc, argv, &interval, + &count, &cb); + } else { + get_interval_count(&argc, argv, &interval, &count); + } + + if (argc == 0) { + /* No args, so just print the defaults. 
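+		 * E.g., a bare "zpool iostat" reports on every imported
+		 * pool. With arguments, the chain below classifies them:
+		 * for a hypothetical "zpool iostat tank sdb 5" the interval
+		 * "5" was already stripped above, "tank" resolves as a pool
+		 * and "sdb" as one of its vdevs, leaving
+		 * cb_vdev_names = {"sdb"} and argc = 1.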
*/ + } else if (are_all_pools(argc, argv)) { + /* All the args are pool names */ + } else if (are_vdevs_in_pool(argc, argv, NULL, &cb)) { + /* All the args are vdevs */ + cb.cb_vdev_names = argv; + cb.cb_vdev_names_count = argc; + argc = 0; /* No pools to process */ + } else if (are_all_pools(1, argv)) { + /* The first arg is a pool name */ + if (are_vdevs_in_pool(argc - 1, argv + 1, argv[0], &cb)) { + /* ...and the rest are vdev names */ + cb.cb_vdev_names = argv + 1; + cb.cb_vdev_names_count = argc - 1; + argc = 1; /* One pool to process */ + } else { + fprintf(stderr, gettext("Expected either a list of ")); + fprintf(stderr, gettext("pools, or list of vdevs in")); + fprintf(stderr, " \"%s\", ", argv[0]); + fprintf(stderr, gettext("but got:\n")); + error_list_unresolved_vdevs(argc - 1, argv + 1, + argv[0], &cb); + fprintf(stderr, "\n"); + usage(B_FALSE); + return (1); + } + } else { + /* + * The args don't make sense. The first arg isn't a pool name, + * nor are all the args vdevs. + */ + fprintf(stderr, gettext("Unable to parse pools/vdevs list.\n")); + fprintf(stderr, "\n"); + return (1); + } + + if (cb.cb_vdev_names_count != 0) { + /* + * If user specified vdevs, it implies verbose. + */ + cb.cb_verbose = B_TRUE; + } + + /* + * Construct the list of all interesting pools. + */ + ret = 0; + if ((list = pool_list_get(argc, argv, NULL, &ret)) == NULL) + return (1); + + if (pool_list_count(list) == 0 && argc != 0) { + pool_list_free(list); + return (1); + } + + if (pool_list_count(list) == 0 && interval == 0) { + pool_list_free(list); + (void) fprintf(stderr, gettext("no pools available\n")); + return (1); + } + + if ((l_histo || rq_histo) && (cmd != NULL || latency || queues)) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("[-r|-w] isn't allowed with [-c|-l|-q]\n")); + usage(B_FALSE); + return (1); + } + + if (l_histo && rq_histo) { + pool_list_free(list); + (void) fprintf(stderr, + gettext("Only one of [-r|-w] can be passed at a time\n")); + usage(B_FALSE); + return (1); + } + + /* + * Enter the main iostat loop. + */ + cb.cb_list = list; + + if (l_histo) { + /* + * Histograms tables look out of place when you try to display + * them with the other stats, so make a rule that you can only + * print histograms by themselves. + */ + cb.cb_flags = IOS_L_HISTO_M; + } else if (rq_histo) { + cb.cb_flags = IOS_RQ_HISTO_M; + } else { + cb.cb_flags = IOS_DEFAULT_M; + if (latency) + cb.cb_flags |= IOS_LATENCY_M; + if (queues) + cb.cb_flags |= IOS_QUEUES_M; + } + + /* + * See if the module supports all the stats we want to display. + */ + unsupported_flags = cb.cb_flags & ~get_stat_flags(list); + if (unsupported_flags) { + uint64_t f; + int idx; + fprintf(stderr, + gettext("The loaded zfs module doesn't support:")); + + /* for each bit set in unsupported_flags */ + for (f = unsupported_flags; f; f &= ~(1ULL << idx)) { + idx = lowbit64(f) - 1; + fprintf(stderr, " -%c", flag_to_arg[idx]); + } + + fprintf(stderr, ". Try running a newer module.\n"); + pool_list_free(list); + + return (1); + } + + for (;;) { + if ((npools = pool_list_count(list)) == 0) + (void) fprintf(stderr, gettext("no pools available\n")); + else { + /* + * If this is the first iteration and -y was supplied + * we skip any printing. + */ + boolean_t skip = (omit_since_boot && + cb.cb_iteration == 0); + + /* + * Refresh all statistics. This is done as an + * explicit step before calculating the maximum name + * width, so that any * configuration changes are + * properly accounted for. 
+ */ + (void) pool_list_iter(list, B_FALSE, refresh_iostat, + &cb); + + /* + * Iterate over all pools to determine the maximum width + * for the pool / device name column across all pools. + */ + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, + get_namewidth_iostat, &cb); + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + if (cmd != NULL && cb.cb_verbose && + !(cb.cb_flags & IOS_ANYHISTO_M)) { + cb.vcdl = all_pools_for_each_vdev_run(argc, + argv, cmd, g_zfs, cb.cb_vdev_names, + cb.cb_vdev_names_count, cb.cb_name_flags); + } else { + cb.vcdl = NULL; + } + + + /* + * Check terminal size so we can print headers + * even when terminal window has its height + * changed. + */ + winheight = terminal_height(); + /* + * Are we connected to TTY? If not, headers_once + * should be true, to avoid breaking scripts. + */ + if (winheight < 0) + headers_once = B_TRUE; + + /* + * If it's the first time and we're not skipping it, + * or either skip or verbose mode, print the header. + * + * The histogram code explicitly prints its header on + * every vdev, so skip this for histograms. + */ + if (((++cb.cb_iteration == 1 && !skip) || + (skip != verbose) || + (!headers_once && + (cb.cb_iteration % winheight) == 0)) && + (!(cb.cb_flags & IOS_ANYHISTO_M)) && + !cb.cb_scripted) + print_iostat_header(&cb); + + if (skip) { + (void) fsleep(interval); + continue; + } + + pool_list_iter(list, B_FALSE, print_iostat, &cb); + + /* + * If there's more than one pool, and we're not in + * verbose mode (which prints a separator for us), + * then print a separator. + * + * In addition, if we're printing specific vdevs then + * we also want an ending separator. + */ + if (((npools > 1 && !verbose && + !(cb.cb_flags & IOS_ANYHISTO_M)) || + (!(cb.cb_flags & IOS_ANYHISTO_M) && + cb.cb_vdev_names_count)) && + !cb.cb_scripted) { + print_iostat_separator(&cb); + if (cb.vcdl != NULL) + print_cmd_columns(cb.vcdl, 1); + printf("\n"); + } + + if (cb.vcdl != NULL) + free_vdev_cmd_data_list(cb.vcdl); + + } + + /* + * Flush the output so that redirection to a file isn't buffered + * indefinitely. + */ + (void) fflush(stdout); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) fsleep(interval); + } + + pool_list_free(list); + + return (ret); +} + +typedef struct list_cbdata { + boolean_t cb_verbose; + int cb_name_flags; + int cb_namewidth; + boolean_t cb_scripted; + zprop_list_t *cb_proplist; + boolean_t cb_literal; +} list_cbdata_t; + + +/* + * Given a list of columns to display, output appropriate headers for each one. + */ +static void +print_header(list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + char headerbuf[ZPOOL_MAXPROPLEN]; + const char *header; + boolean_t first = B_TRUE; + boolean_t right_justify; + size_t width = 0; + + for (; pl != NULL; pl = pl->pl_next) { + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. 
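+			 * E.g. (hypothetical): "zpool list -o
+			 * name,size,capacity" yields a header line of the
+			 * form
+			 *
+			 *	NAME   SIZE    CAP
+			 *
+			 * where built-in columns take their text from
+			 * zpool_prop_column_name() and feature/user
+			 * properties are upper-cased via toupper().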
+ */ + width = cb->cb_namewidth; + } + + if (!first) + (void) printf(" "); + else + first = B_FALSE; + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + header = zpool_prop_column_name(pl->pl_prop); + right_justify = zpool_prop_align_right(pl->pl_prop); + } else { + int i; + + for (i = 0; pl->pl_user_prop[i] != '\0'; i++) + headerbuf[i] = toupper(pl->pl_user_prop[i]); + headerbuf[i] = '\0'; + header = headerbuf; + } + + if (pl->pl_next == NULL && !right_justify) + (void) printf("%s", header); + else if (right_justify) + (void) printf("%*s", (int)width, header); + else + (void) printf("%-*s", (int)width, header); + } + + (void) printf("\n"); +} + +/* + * Given a pool and a list of properties, print out all the properties according + * to the described layout. Used by zpool_do_list(). + */ +static void +print_pool(zpool_handle_t *zhp, list_cbdata_t *cb) +{ + zprop_list_t *pl = cb->cb_proplist; + boolean_t first = B_TRUE; + char property[ZPOOL_MAXPROPLEN]; + char *propstr; + boolean_t right_justify; + size_t width; + + for (; pl != NULL; pl = pl->pl_next) { + + width = pl->pl_width; + if (first && cb->cb_verbose) { + /* + * Reset the width to accommodate the verbose listing + * of devices. + */ + width = cb->cb_namewidth; + } + + if (!first) { + if (cb->cb_scripted) + (void) printf("\t"); + else + (void) printf(" "); + } else { + first = B_FALSE; + } + + right_justify = B_FALSE; + if (pl->pl_prop != ZPROP_INVAL) { + if (zpool_get_prop(zhp, pl->pl_prop, property, + sizeof (property), NULL, cb->cb_literal) != 0) + propstr = "-"; + else + propstr = property; + + right_justify = zpool_prop_align_right(pl->pl_prop); + } else if ((zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop)) && + zpool_prop_get_feature(zhp, pl->pl_user_prop, property, + sizeof (property)) == 0) { + propstr = property; + } else { + propstr = "-"; + } + + + /* + * If this is being called in scripted mode, or if this is the + * last column and it is left-justified, don't include a width + * format specifier. + */ + if (cb->cb_scripted || (pl->pl_next == NULL && !right_justify)) + (void) printf("%s", propstr); + else if (right_justify) + (void) printf("%*s", (int)width, propstr); + else + (void) printf("%-*s", (int)width, propstr); + } + + (void) printf("\n"); +} + +static void +print_one_column(zpool_prop_t prop, uint64_t value, const char *str, + boolean_t scripted, boolean_t valid, enum zfs_nicenum_format format) +{ + char propval[64]; + boolean_t fixed; + size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL); + + switch (prop) { + case ZPOOL_PROP_EXPANDSZ: + case ZPOOL_PROP_CHECKPOINT: + case ZPOOL_PROP_DEDUPRATIO: + if (value == 0) + (void) strlcpy(propval, "-", sizeof (propval)); + else + zfs_nicenum_format(value, propval, sizeof (propval), + format); + break; + case ZPOOL_PROP_FRAGMENTATION: + if (value == ZFS_FRAG_INVALID) { + (void) strlcpy(propval, "-", sizeof (propval)); + } else if (format == ZFS_NICENUM_RAW) { + (void) snprintf(propval, sizeof (propval), "%llu", + (unsigned long long)value); + } else { + (void) snprintf(propval, sizeof (propval), "%llu%%", + (unsigned long long)value); + } + break; + case ZPOOL_PROP_CAPACITY: + /* capacity value is in parts-per-10,000 (aka permyriad) */ + if (format == ZFS_NICENUM_RAW) + (void) snprintf(propval, sizeof (propval), "%llu", + (unsigned long long)value / 100); + else + (void) snprintf(propval, sizeof (propval), + value < 1000 ? "%1.2f%%" : value < 10000 ? 
+ "%2.1f%%" : "%3.0f%%", value / 100.0); + break; + case ZPOOL_PROP_HEALTH: + width = 8; + snprintf(propval, sizeof (propval), "%-*s", (int)width, str); + break; + default: + zfs_nicenum_format(value, propval, sizeof (propval), format); + } + + if (!valid) + (void) strlcpy(propval, "-", sizeof (propval)); + + if (scripted) + (void) printf("\t%s", propval); + else + (void) printf(" %*s", (int)width, propval); +} + +/* + * print static default line per vdev + * not compatible with '-o' option + */ +static void +print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + list_cbdata_t *cb, int depth, boolean_t isspare) +{ + nvlist_t **child; + vdev_stat_t *vs; + uint_t c, children; + char *vname; + boolean_t scripted = cb->cb_scripted; + uint64_t islog = B_FALSE; + char *dashes = "%-*s - - - - " + "- - - - -\n"; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (name != NULL) { + boolean_t toplevel = (vs->vs_space != 0); + uint64_t cap; + enum zfs_nicenum_format format; + const char *state; + + if (cb->cb_literal) + format = ZFS_NICENUM_RAW; + else + format = ZFS_NICENUM_1024; + + if (strcmp(name, VDEV_TYPE_INDIRECT) == 0) + return; + + if (scripted) + (void) printf("\t%s", name); + else if (strlen(name) + depth > cb->cb_namewidth) + (void) printf("%*s%s", depth, "", name); + else + (void) printf("%*s%s%*s", depth, "", name, + (int)(cb->cb_namewidth - strlen(name) - depth), ""); + + /* + * Print the properties for the individual vdevs. Some + * properties are only applicable to toplevel vdevs. The + * 'toplevel' boolean value is passed to the print_one_column() + * to indicate that the value is valid. + */ + print_one_column(ZPOOL_PROP_SIZE, vs->vs_space, NULL, scripted, + toplevel, format); + print_one_column(ZPOOL_PROP_ALLOCATED, vs->vs_alloc, NULL, + scripted, toplevel, format); + print_one_column(ZPOOL_PROP_FREE, vs->vs_space - vs->vs_alloc, + NULL, scripted, toplevel, format); + print_one_column(ZPOOL_PROP_CHECKPOINT, + vs->vs_checkpoint_space, NULL, scripted, toplevel, format); + print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize, NULL, + scripted, B_TRUE, format); + print_one_column(ZPOOL_PROP_FRAGMENTATION, + vs->vs_fragmentation, NULL, scripted, + (vs->vs_fragmentation != ZFS_FRAG_INVALID && toplevel), + format); + cap = (vs->vs_space == 0) ? 
0 : + (vs->vs_alloc * 10000 / vs->vs_space); + print_one_column(ZPOOL_PROP_CAPACITY, cap, NULL, + scripted, toplevel, format); + print_one_column(ZPOOL_PROP_DEDUPRATIO, 0, NULL, + scripted, toplevel, format); + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { + if (vs->vs_aux == VDEV_AUX_SPARED) + state = "INUSE"; + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = "AVAIL"; + } + print_one_column(ZPOOL_PROP_HEALTH, 0, state, scripted, + B_TRUE, format); + (void) printf("\n"); + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + /* list the normal vdevs first */ + for (c = 0; c < children; c++) { + uint64_t ishole = B_FALSE; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_HOLE, &ishole) == 0 && ishole) + continue; + + if (nvlist_lookup_uint64(child[c], + ZPOOL_CONFIG_IS_LOG, &islog) == 0 && islog) + continue; + + if (nvlist_exists(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS)) + continue; + + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, B_FALSE); + free(vname); + } + + /* list the classes: 'logs', 'dedup', and 'special' */ + for (uint_t n = 0; n < 3; n++) { + boolean_t printed = B_FALSE; + + for (c = 0; c < children; c++) { + char *bias = NULL; + char *type = NULL; + + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog) == 0 && islog) { + bias = VDEV_ALLOC_CLASS_LOGS; + } else { + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias); + (void) nvlist_lookup_string(child[c], + ZPOOL_CONFIG_TYPE, &type); + } + if (bias == NULL || strcmp(bias, class_name[n]) != 0) + continue; + if (!islog && strcmp(type, VDEV_TYPE_INDIRECT) == 0) + continue; + + if (!printed) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, + class_name[n]); + printed = B_TRUE; + } + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, + B_FALSE); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0 && children > 0) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "cache"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, + B_FALSE); + free(vname); + } + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, + &children) == 0 && children > 0) { + /* LINTED E_SEC_PRINTF_VAR_FMT */ + (void) printf(dashes, cb->cb_namewidth, "spare"); + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(g_zfs, zhp, child[c], + cb->cb_name_flags); + print_list_stats(zhp, vname, child[c], cb, depth + 2, + B_TRUE); + free(vname); + } + } +} + +/* + * Generic callback function to list a pool. + */ +static int +list_callback(zpool_handle_t *zhp, void *data) +{ + list_cbdata_t *cbp = data; + + print_pool(zhp, cbp); + + if (cbp->cb_verbose) { + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + print_list_stats(zhp, NULL, nvroot, cbp, 0, B_FALSE); + } + + return (0); +} + +/* + * Set the minimum pool/vdev name column width. The width must be at least 9, + * but may be as large as needed. 
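+ *
+ * E.g. (hypothetical pools): a short name such as "tank" still gets the
+ * 9-character minimum, while a 13-character name such as "mybigdatapool"
+ * widens the column to 13 so nothing is truncated.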
+ */ +static int +get_namewidth_list(zpool_handle_t *zhp, void *data) +{ + list_cbdata_t *cb = data; + int width; + + width = get_namewidth(zhp, cb->cb_namewidth, cb->cb_name_flags, + cb->cb_verbose); + + if (width < 9) + width = 9; + + cb->cb_namewidth = width; + + return (0); +} + +/* + * zpool list [-gHLpP] [-o prop[,prop]*] [-T d|u] [pool] ... [interval [count]] + * + * -g Display guid for individual vdev name. + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -L Follow links when resolving vdev path name. + * -o List of properties to display. Defaults to + * "name,size,allocated,free,expandsize,fragmentation,capacity," + * "dedupratio,health,altroot" + * -p Display values in parsable (exact) format. + * -P Display full path for vdev name. + * -T Display a timestamp in date(1) or Unix format + * + * List all pools in the system, whether or not they're healthy. Output space + * statistics for each one, as well as health status summary. + */ +int +zpool_do_list(int argc, char **argv) +{ + int c; + int ret = 0; + list_cbdata_t cb = { 0 }; + static char default_props[] = + "name,size,allocated,free,checkpoint,expandsize,fragmentation," + "capacity,dedupratio,health,altroot"; + char *props = default_props; + float interval = 0; + unsigned long count = 0; + zpool_list_t *list; + boolean_t first = B_TRUE; + + /* check options */ + while ((c = getopt(argc, argv, ":gHLo:pPT:v")) != -1) { + switch (c) { + case 'g': + cb.cb_name_flags |= VDEV_NAME_GUID; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 'L': + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'o': + props = optarg; + break; + case 'P': + cb.cb_name_flags |= VDEV_NAME_PATH; + break; + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 'v': + cb.cb_verbose = B_TRUE; + cb.cb_namewidth = 8; /* 8 until precalc is avail */ + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + if (zprop_get_list(g_zfs, props, &cb.cb_proplist, ZFS_TYPE_POOL) != 0) + usage(B_FALSE); + + for (;;) { + if ((list = pool_list_get(argc, argv, &cb.cb_proplist, + &ret)) == NULL) + return (1); + + if (pool_list_count(list) == 0) + break; + + cb.cb_namewidth = 0; + (void) pool_list_iter(list, B_FALSE, get_namewidth_list, &cb); + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + if (!cb.cb_scripted && (first || cb.cb_verbose)) { + print_header(&cb); + first = B_FALSE; + } + ret = pool_list_iter(list, B_TRUE, list_callback, &cb); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + pool_list_free(list); + (void) fsleep(interval); + } + + if (argc == 0 && !cb.cb_scripted && pool_list_count(list) == 0) { + (void) printf(gettext("no pools available\n")); + ret = 0; + } + + pool_list_free(list); + zprop_free_list(cb.cb_proplist); + return (ret); +} + +static int +zpool_do_attach_or_replace(int argc, char **argv, int replacing) +{ + boolean_t force = B_FALSE; + boolean_t rebuild = B_FALSE; + boolean_t wait = B_FALSE; + int c; + nvlist_t *nvroot; + char *poolname, *old_disk, *new_disk; + zpool_handle_t *zhp; + nvlist_t *props = NULL; + char *propval; + int ret; + + /* check options */ + while ((c = getopt(argc, argv, "fo:sw")) != 
-1) {
+		switch (c) {
+		case 'f':
+			force = B_TRUE;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) == NULL) {
+				(void) fprintf(stderr, gettext("missing "
+				    "'=' for -o option\n"));
+				usage(B_FALSE);
+			}
+			*propval = '\0';
+			propval++;
+
+			if ((strcmp(optarg, ZPOOL_CONFIG_ASHIFT) != 0) ||
+			    (add_prop_list(optarg, propval, &props, B_TRUE)))
+				usage(B_FALSE);
+			break;
+		case 's':
+			rebuild = B_TRUE;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(B_FALSE);
+	}
+
+	old_disk = argv[1];
+
+	if (argc < 3) {
+		if (!replacing) {
+			(void) fprintf(stderr,
+			    gettext("missing <new_device> specification\n"));
+			usage(B_FALSE);
+		}
+		new_disk = old_disk;
+		argc -= 1;
+		argv += 1;
+	} else {
+		new_disk = argv[2];
+		argc -= 2;
+		argv += 2;
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL) {
+		nvlist_free(props);
+		return (1);
+	}
+
+	if (zpool_get_config(zhp, NULL) == NULL) {
+		(void) fprintf(stderr, gettext("pool '%s' is unavailable\n"),
+		    poolname);
+		zpool_close(zhp);
+		nvlist_free(props);
+		return (1);
+	}
+
+	/* unless manually specified use "ashift" pool property (if set) */
+	if (!nvlist_exists(props, ZPOOL_CONFIG_ASHIFT)) {
+		int intval;
+		zprop_source_t src;
+		char strval[ZPOOL_MAXPROPLEN];
+
+		intval = zpool_get_prop_int(zhp, ZPOOL_PROP_ASHIFT, &src);
+		if (src != ZPROP_SRC_DEFAULT) {
+			(void) sprintf(strval, "%" PRId32, intval);
+			verify(add_prop_list(ZPOOL_CONFIG_ASHIFT, strval,
+			    &props, B_TRUE) == 0);
+		}
+	}
+
+	nvroot = make_root_vdev(zhp, props, force, B_FALSE, replacing, B_FALSE,
+	    argc, argv);
+	if (nvroot == NULL) {
+		zpool_close(zhp);
+		nvlist_free(props);
+		return (1);
+	}
+
+	ret = zpool_vdev_attach(zhp, old_disk, new_disk, nvroot, replacing,
+	    rebuild);
+
+	if (ret == 0 && wait)
+		ret = zpool_wait(zhp,
+		    replacing ? ZPOOL_WAIT_REPLACE : ZPOOL_WAIT_RESILVER);
+
+	nvlist_free(props);
+	nvlist_free(nvroot);
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool replace [-fsw] [-o property=value] <pool> <device> [new_device]
+ *
+ *	-f	Force attach, even if <new_device> appears to be in use.
+ *	-s	Use sequential instead of healing reconstruction for resilver.
+ *	-o	Set property=value.
+ *	-w	Wait for replacing to complete before returning
+ *
+ * Replace <device> with <new_device>.
+ */
+/* ARGSUSED */
+int
+zpool_do_replace(int argc, char **argv)
+{
+	return (zpool_do_attach_or_replace(argc, argv, B_TRUE));
+}
+
+/*
+ * zpool attach [-fsw] [-o property=value] <pool> <device> <new_device>
+ *
+ *	-f	Force attach, even if <new_device> appears to be in use.
+ *	-s	Use sequential instead of healing reconstruction for resilver.
+ *	-o	Set property=value.
+ *	-w	Wait for resilvering to complete before returning
+ *
+ * Attach <new_device> to the mirror containing <device>. If <device> is not
+ * part of a mirror, then <device> will be transformed into a mirror of
+ * <device> and <new_device>. In either case, <new_device> will begin life
+ * with a DTL of [0, now], and will immediately begin to resilver itself.
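+ *
+ * Illustrative invocations (hypothetical pool and device names):
+ *
+ *	# zpool attach tank sda sdb	attach sdb, turning sda into a mirror
+ *	# zpool attach -w tank sda sdb	as above, but wait for the resilver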
+ */
+int
+zpool_do_attach(int argc, char **argv)
+{
+	return (zpool_do_attach_or_replace(argc, argv, B_FALSE));
+}
+
+/*
+ * zpool detach [-f] <pool> <device>
+ *
+ *	-f	Force detach of <device>, even if DTLs argue against it
+ *		(not supported yet)
+ *
+ * Detach a device from a mirror. The operation will be refused if <device>
+ * is the last device in the mirror, or if the DTLs indicate that this device
+ * has the only valid copy of some data.
+ */
+/* ARGSUSED */
+int
+zpool_do_detach(int argc, char **argv)
+{
+	int c;
+	char *poolname, *path;
+	zpool_handle_t *zhp;
+	int ret;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc < 2) {
+		(void) fprintf(stderr,
+		    gettext("missing <device> specification\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+	path = argv[1];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	ret = zpool_vdev_detach(zhp, path);
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool split [-gLnP] [-o prop=val] ...
+ *		[-o mntopt] ...
+ *		[-R altroot] <pool> <newpool> [<device> ...]
+ *
+ *	-g	Display guid for individual vdev name.
+ *	-L	Follow links when resolving vdev path name.
+ *	-n	Do not split the pool, but display the resulting layout if
+ *		it were to be split.
+ *	-o	Set property=value, or set mount options.
+ *	-P	Display full path for vdev name.
+ *	-R	Mount the split-off pool under an alternate root.
+ *	-l	Load encryption keys while importing.
+ *
+ * Splits the named pool and gives it the new pool name. Devices to be split
+ * off may be listed, provided that no more than one device is specified
+ * per top-level vdev mirror. The newly split pool is left in an exported
+ * state unless -R is specified.
+ *
+ * Restrictions: the top-level of the pool must only be made up of
+ * mirrors; all devices in the pool must be healthy; no device may be
+ * undergoing a resilvering operation.
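+ *
+ * Illustrative invocations (hypothetical pool names):
+ *
+ *	# zpool split tank newtank		split off one half of each mirror
+ *	# zpool split -R /mnt tank newtank	split and import under /mnt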
+ */
+int
+zpool_do_split(int argc, char **argv)
+{
+	char *srcpool, *newpool, *propval;
+	char *mntopts = NULL;
+	splitflags_t flags;
+	int c, ret = 0;
+	boolean_t loadkeys = B_FALSE;
+	zpool_handle_t *zhp;
+	nvlist_t *config, *props = NULL;
+
+	flags.dryrun = B_FALSE;
+	flags.import = B_FALSE;
+	flags.name_flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, ":gLR:lno:P")) != -1) {
+		switch (c) {
+		case 'g':
+			flags.name_flags |= VDEV_NAME_GUID;
+			break;
+		case 'L':
+			flags.name_flags |= VDEV_NAME_FOLLOW_LINKS;
+			break;
+		case 'R':
+			flags.import = B_TRUE;
+			if (add_prop_list(
+			    zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg,
+			    &props, B_TRUE) != 0) {
+				nvlist_free(props);
+				usage(B_FALSE);
+			}
+			break;
+		case 'l':
+			loadkeys = B_TRUE;
+			break;
+		case 'n':
+			flags.dryrun = B_TRUE;
+			break;
+		case 'o':
+			if ((propval = strchr(optarg, '=')) != NULL) {
+				*propval = '\0';
+				propval++;
+				if (add_prop_list(optarg, propval,
+				    &props, B_TRUE) != 0) {
+					nvlist_free(props);
+					usage(B_FALSE);
+				}
+			} else {
+				mntopts = optarg;
+			}
+			break;
+		case 'P':
+			flags.name_flags |= VDEV_NAME_PATH;
+			break;
+		case ':':
+			(void) fprintf(stderr, gettext("missing argument for "
+			    "'%c' option\n"), optopt);
+			usage(B_FALSE);
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+			break;
+		}
+	}
+
+	if (!flags.import && mntopts != NULL) {
+		(void) fprintf(stderr, gettext("setting mntopts is only "
+		    "valid when importing the pool\n"));
+		usage(B_FALSE);
+	}
+
+	if (!flags.import && loadkeys) {
+		(void) fprintf(stderr, gettext("loading keys is only "
+		    "valid when importing the pool\n"));
+		usage(B_FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("Missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("Missing new pool name\n"));
+		usage(B_FALSE);
+	}
+
+	srcpool = argv[0];
+	newpool = argv[1];
+
+	argc -= 2;
+	argv += 2;
+
+	if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) {
+		nvlist_free(props);
+		return (1);
+	}
+
+	config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv);
+	if (config == NULL) {
+		ret = 1;
+	} else {
+		if (flags.dryrun) {
+			(void) printf(gettext("would create '%s' with the "
+			    "following layout:\n\n"), newpool);
+			print_vdev_tree(NULL, newpool, config, 0, "",
+			    flags.name_flags);
+		}
+	}
+
+	zpool_close(zhp);
+
+	if (ret != 0 || flags.dryrun || !flags.import) {
+		nvlist_free(config);
+		nvlist_free(props);
+		return (ret);
+	}
+
+	/*
+	 * The split was successful. Now we need to open the new
+	 * pool and import it.
+	 */
+	if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) {
+		nvlist_free(config);
+		nvlist_free(props);
+		return (1);
+	}
+
+	if (loadkeys) {
+		ret = zfs_crypto_attempt_load_keys(g_zfs, newpool);
+		if (ret != 0)
+			ret = 1;
+	}
+
+	if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL &&
+	    zpool_enable_datasets(zhp, mntopts, 0) != 0) {
+		ret = 1;
+		(void) fprintf(stderr, gettext("Split was successful, but "
+		    "the datasets could not all be mounted\n"));
+		(void) fprintf(stderr, gettext("Try doing '%s' with a "
+		    "different altroot\n"), "zpool import");
+	}
+	zpool_close(zhp);
+	nvlist_free(config);
+	nvlist_free(props);
+
+	return (ret);
+}
+
+
+/*
+ * zpool online <pool> <device> ...
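+ *
+ * Illustrative invocations (hypothetical names):
+ *
+ *	# zpool online tank sda		bring sda back online
+ *	# zpool online -e tank sda	online sda and expand it to use all
+ *					available space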
+ */
+int
+zpool_do_online(int argc, char **argv)
+{
+	int c, i;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+	vdev_state_t newstate;
+	int flags = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "e")) != -1) {
+		switch (c) {
+		case 'e':
+			flags |= ZFS_ONLINE_EXPAND;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	for (i = 1; i < argc; i++) {
+		if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) {
+			if (newstate != VDEV_STATE_HEALTHY) {
+				(void) printf(gettext("warning: device '%s' "
+				    "onlined, but remains in faulted state\n"),
+				    argv[i]);
+				if (newstate == VDEV_STATE_FAULTED)
+					(void) printf(gettext("use 'zpool "
+					    "clear' to restore a faulted "
+					    "device\n"));
+				else
+					(void) printf(gettext("use 'zpool "
+					    "replace' to replace devices "
+					    "that are no longer present\n"));
+			}
+		} else {
+			ret = 1;
+		}
+	}
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool offline [-ft] <pool> <device> ...
+ *
+ *	-f	Force the device into a faulted state.
+ *
+ *	-t	Only take the device off-line temporarily. The offline/faulted
+ *		state will not be persistent across reboots.
+ */
+/* ARGSUSED */
+int
+zpool_do_offline(int argc, char **argv)
+{
+	int c, i;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+	boolean_t istmp = B_FALSE;
+	boolean_t fault = B_FALSE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "ft")) != -1) {
+		switch (c) {
+		case 'f':
+			fault = B_TRUE;
+			break;
+		case 't':
+			istmp = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+	if (argc < 2) {
+		(void) fprintf(stderr, gettext("missing device name\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	for (i = 1; i < argc; i++) {
+		if (fault) {
+			uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]);
+			vdev_aux_t aux;
+			if (istmp == B_FALSE) {
+				/* Force the fault to persist across imports */
+				aux = VDEV_AUX_EXTERNAL_PERSIST;
+			} else {
+				aux = VDEV_AUX_EXTERNAL;
+			}
+
+			if (guid == 0 || zpool_vdev_fault(zhp, guid, aux) != 0)
+				ret = 1;
+		} else {
+			if (zpool_vdev_offline(zhp, argv[i], istmp) != 0)
+				ret = 1;
+		}
+	}
+
+	zpool_close(zhp);
+
+	return (ret);
+}
+
+/*
+ * zpool clear <pool> [device]
+ *
+ * Clear all errors associated with a pool or a particular device.
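+ *
+ * Illustrative invocations (hypothetical names):
+ *
+ *	# zpool clear tank		clear error counts for the whole pool
+ *	# zpool clear tank sda		clear error counts for sda only
+ *	# zpool clear -F tank		attempt rewind recovery of a
+ *					damaged pool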
+ */
+int
+zpool_do_clear(int argc, char **argv)
+{
+	int c;
+	int ret = 0;
+	boolean_t dryrun = B_FALSE;
+	boolean_t do_rewind = B_FALSE;
+	boolean_t xtreme_rewind = B_FALSE;
+	uint32_t rewind_policy = ZPOOL_NO_REWIND;
+	nvlist_t *policy = NULL;
+	zpool_handle_t *zhp;
+	char *pool, *device;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "FnX")) != -1) {
+		switch (c) {
+		case 'F':
+			do_rewind = B_TRUE;
+			break;
+		case 'n':
+			dryrun = B_TRUE;
+			break;
+		case 'X':
+			xtreme_rewind = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 2) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	if ((dryrun || xtreme_rewind) && !do_rewind) {
+		(void) fprintf(stderr,
+		    gettext("-n or -X only meaningful with -F\n"));
+		usage(B_FALSE);
+	}
+	if (dryrun)
+		rewind_policy = ZPOOL_TRY_REWIND;
+	else if (do_rewind)
+		rewind_policy = ZPOOL_DO_REWIND;
+	if (xtreme_rewind)
+		rewind_policy |= ZPOOL_EXTREME_REWIND;
+
+	/* In future, further rewind policy choices can be passed along here */
+	if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 ||
+	    nvlist_add_uint32(policy, ZPOOL_LOAD_REWIND_POLICY,
+	    rewind_policy) != 0) {
+		return (1);
+	}
+
+	pool = argv[0];
+	device = argc == 2 ? argv[1] : NULL;
+
+	if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) {
+		nvlist_free(policy);
+		return (1);
+	}
+
+	if (zpool_clear(zhp, device, policy) != 0)
+		ret = 1;
+
+	zpool_close(zhp);
+
+	nvlist_free(policy);
+
+	return (ret);
+}
+
+/*
+ * zpool reguid <pool>
+ */
+int
+zpool_do_reguid(int argc, char **argv)
+{
+	int c;
+	char *poolname;
+	zpool_handle_t *zhp;
+	int ret = 0;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* get pool name and check number of arguments */
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name\n"));
+		usage(B_FALSE);
+	}
+
+	if (argc > 1) {
+		(void) fprintf(stderr, gettext("too many arguments\n"));
+		usage(B_FALSE);
+	}
+
+	poolname = argv[0];
+	if ((zhp = zpool_open(g_zfs, poolname)) == NULL)
+		return (1);
+
+	ret = zpool_reguid(zhp);
+
+	zpool_close(zhp);
+	return (ret);
+}
+
+
+/*
+ * zpool reopen <pool>
+ *
+ * Reopen the pool so that the kernel can update the sizes of all vdevs.
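+ *
+ * Illustrative invocations (hypothetical name):
+ *
+ *	# zpool reopen tank		reopen every vdev in the pool
+ *	# zpool reopen -n tank		reopen without restarting an
+ *					in-progress scrub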
+ */
+int
+zpool_do_reopen(int argc, char **argv)
+{
+	int c;
+	int ret = 0;
+	boolean_t scrub_restart = B_TRUE;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "n")) != -1) {
+		switch (c) {
+		case 'n':
+			scrub_restart = B_FALSE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	/* if argc == 0 we will execute zpool_reopen_one on all pools */
+	ret = for_each_pool(argc, argv, B_TRUE, NULL, zpool_reopen_one,
+	    &scrub_restart);
+
+	return (ret);
+}
+
+typedef struct scrub_cbdata {
+	int cb_type;
+	pool_scrub_cmd_t cb_scrub_cmd;
+} scrub_cbdata_t;
+
+static boolean_t
+zpool_has_checkpoint(zpool_handle_t *zhp)
+{
+	nvlist_t *config, *nvroot;
+
+	config = zpool_get_config(zhp, NULL);
+
+	if (config != NULL) {
+		pool_checkpoint_stat_t *pcs = NULL;
+		uint_t c;
+
+		nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE);
+		(void) nvlist_lookup_uint64_array(nvroot,
+		    ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c);
+
+		if (pcs == NULL || pcs->pcs_state == CS_NONE)
+			return (B_FALSE);
+
+		assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS ||
+		    pcs->pcs_state == CS_CHECKPOINT_DISCARDING);
+		return (B_TRUE);
+	}
+
+	return (B_FALSE);
+}
+
+static int
+scrub_callback(zpool_handle_t *zhp, void *data)
+{
+	scrub_cbdata_t *cb = data;
+	int err;
+
+	/*
+	 * Ignore faulted pools.
+	 */
+	if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) {
+		(void) fprintf(stderr, gettext("cannot scan '%s': pool is "
+		    "currently unavailable\n"), zpool_get_name(zhp));
+		return (1);
+	}
+
+	err = zpool_scan(zhp, cb->cb_type, cb->cb_scrub_cmd);
+
+	if (err == 0 && zpool_has_checkpoint(zhp) &&
+	    cb->cb_type == POOL_SCAN_SCRUB) {
+		(void) printf(gettext("warning: will not scrub state that "
+		    "belongs to the checkpoint of pool '%s'\n"),
+		    zpool_get_name(zhp));
+	}
+
+	return (err != 0);
+}
+
+static int
+wait_callback(zpool_handle_t *zhp, void *data)
+{
+	zpool_wait_activity_t *act = data;
+	return (zpool_wait(zhp, *act));
+}
+
+/*
+ * zpool scrub [-s | -p] [-w] <pool> ...
+ *
+ *	-s	Stop. Stops any in-progress scrub.
+ *	-p	Pause. Pause in-progress scrub.
+ *	-w	Wait. Blocks until scrub has completed.
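+ *
+ * Illustrative invocations (hypothetical name):
+ *
+ *	# zpool scrub tank		start (or resume) a scrub
+ *	# zpool scrub -p tank		pause the running scrub
+ *	# zpool scrub -s tank		stop the running scrub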
+ */
+int
+zpool_do_scrub(int argc, char **argv)
+{
+	int c;
+	scrub_cbdata_t cb;
+	boolean_t wait = B_FALSE;
+	int error;
+
+	cb.cb_type = POOL_SCAN_SCRUB;
+	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "spw")) != -1) {
+		switch (c) {
+		case 's':
+			cb.cb_type = POOL_SCAN_NONE;
+			break;
+		case 'p':
+			cb.cb_scrub_cmd = POOL_SCRUB_PAUSE;
+			break;
+		case 'w':
+			wait = B_TRUE;
+			break;
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	if (cb.cb_type == POOL_SCAN_NONE &&
+	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE) {
+		(void) fprintf(stderr, gettext("invalid option combination: "
+		    "-s and -p are mutually exclusive\n"));
+		usage(B_FALSE);
+	}
+
+	if (wait && (cb.cb_type == POOL_SCAN_NONE ||
+	    cb.cb_scrub_cmd == POOL_SCRUB_PAUSE)) {
+		(void) fprintf(stderr, gettext("invalid option combination: "
+		    "-w cannot be used with -p or -s\n"));
+		usage(B_FALSE);
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	error = for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb);
+
+	if (wait && !error) {
+		zpool_wait_activity_t act = ZPOOL_WAIT_SCRUB;
+		error = for_each_pool(argc, argv, B_TRUE, NULL, wait_callback,
+		    &act);
+	}
+
+	return (error);
+}
+
+/*
+ * zpool resilver <pool> ...
+ *
+ * Restarts any in-progress resilver
+ */
+int
+zpool_do_resilver(int argc, char **argv)
+{
+	int c;
+	scrub_cbdata_t cb;
+
+	cb.cb_type = POOL_SCAN_RESILVER;
+	cb.cb_scrub_cmd = POOL_SCRUB_NORMAL;
+
+	/* check options */
+	while ((c = getopt(argc, argv, "")) != -1) {
+		switch (c) {
+		case '?':
+			(void) fprintf(stderr, gettext("invalid option '%c'\n"),
+			    optopt);
+			usage(B_FALSE);
+		}
+	}
+
+	argc -= optind;
+	argv += optind;
+
+	if (argc < 1) {
+		(void) fprintf(stderr, gettext("missing pool name argument\n"));
+		usage(B_FALSE);
+	}
+
+	return (for_each_pool(argc, argv, B_TRUE, NULL, scrub_callback, &cb));
+}
+
+/*
+ * zpool trim [-d] [-r <rate>] [-c | -s] <pool> [<device> ...]
+ *
+ *	-c		Cancel. Ends any in-progress trim.
+ *	-d		Secure trim. Requires kernel and device support.
+ *	-r <rate>	Sets the TRIM rate in bytes (per second). Supports
+ *			adding a multiplier suffix such as 'k' or 'm'.
+ *	-s		Suspend. TRIM can then be restarted with no flags.
+ *	-w		Wait. Blocks until trimming has completed.
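+ *
+ * Illustrative invocations (hypothetical names):
+ *
+ *	# zpool trim tank		trim all leaf vdevs in the pool
+ *	# zpool trim -r 500M tank sda	trim sda at 500MB/s
+ *	# zpool trim -s tank		suspend the in-progress trim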
+ */ +int +zpool_do_trim(int argc, char **argv) +{ + struct option long_options[] = { + {"cancel", no_argument, NULL, 'c'}, + {"secure", no_argument, NULL, 'd'}, + {"rate", required_argument, NULL, 'r'}, + {"suspend", no_argument, NULL, 's'}, + {"wait", no_argument, NULL, 'w'}, + {0, 0, 0, 0} + }; + + pool_trim_func_t cmd_type = POOL_TRIM_START; + uint64_t rate = 0; + boolean_t secure = B_FALSE; + boolean_t wait = B_FALSE; + + int c; + while ((c = getopt_long(argc, argv, "cdr:sw", long_options, NULL)) + != -1) { + switch (c) { + case 'c': + if (cmd_type != POOL_TRIM_START && + cmd_type != POOL_TRIM_CANCEL) { + (void) fprintf(stderr, gettext("-c cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_TRIM_CANCEL; + break; + case 'd': + if (cmd_type != POOL_TRIM_START) { + (void) fprintf(stderr, gettext("-d cannot be " + "combined with the -c or -s options\n")); + usage(B_FALSE); + } + secure = B_TRUE; + break; + case 'r': + if (cmd_type != POOL_TRIM_START) { + (void) fprintf(stderr, gettext("-r cannot be " + "combined with the -c or -s options\n")); + usage(B_FALSE); + } + if (zfs_nicestrtonum(NULL, optarg, &rate) == -1) { + (void) fprintf(stderr, + gettext("invalid value for rate\n")); + usage(B_FALSE); + } + break; + case 's': + if (cmd_type != POOL_TRIM_START && + cmd_type != POOL_TRIM_SUSPEND) { + (void) fprintf(stderr, gettext("-s cannot be " + "combined with other options\n")); + usage(B_FALSE); + } + cmd_type = POOL_TRIM_SUSPEND; + break; + case 'w': + wait = B_TRUE; + break; + case '?': + if (optopt != 0) { + (void) fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } else { + (void) fprintf(stderr, + gettext("invalid option '%s'\n"), + argv[optind - 1]); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing pool name argument\n")); + usage(B_FALSE); + return (-1); + } + + if (wait && (cmd_type != POOL_TRIM_START)) { + (void) fprintf(stderr, gettext("-w cannot be used with -c or " + "-s\n")); + usage(B_FALSE); + } + + char *poolname = argv[0]; + zpool_handle_t *zhp = zpool_open(g_zfs, poolname); + if (zhp == NULL) + return (-1); + + trimflags_t trim_flags = { + .secure = secure, + .rate = rate, + .wait = wait, + }; + + nvlist_t *vdevs = fnvlist_alloc(); + if (argc == 1) { + /* no individual leaf vdevs specified, so add them all */ + nvlist_t *config = zpool_get_config(zhp, NULL); + nvlist_t *nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + zpool_collect_leaves(zhp, nvroot, vdevs); + trim_flags.fullpool = B_TRUE; + } else { + trim_flags.fullpool = B_FALSE; + for (int i = 1; i < argc; i++) { + fnvlist_add_boolean(vdevs, argv[i]); + } + } + + int error = zpool_trim(zhp, cmd_type, vdevs, &trim_flags); + + fnvlist_free(vdevs); + zpool_close(zhp); + + return (error); +} + +/* + * Converts a total number of seconds to a human readable string broken + * down in to days/hours/minutes/seconds. + */ +static void +secs_to_dhms(uint64_t total, char *buf) +{ + uint64_t days = total / 60 / 60 / 24; + uint64_t hours = (total / 60 / 60) % 24; + uint64_t mins = (total / 60) % 60; + uint64_t secs = (total % 60); + + if (days > 0) { + (void) sprintf(buf, "%llu days %02llu:%02llu:%02llu", + (u_longlong_t)days, (u_longlong_t)hours, + (u_longlong_t)mins, (u_longlong_t)secs); + } else { + (void) sprintf(buf, "%02llu:%02llu:%02llu", + (u_longlong_t)hours, (u_longlong_t)mins, + (u_longlong_t)secs); + } +} + +/* + * Print out detailed scrub status. 
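+ *
+ * The output looks roughly like the following (values are illustrative,
+ * not from a real pool):
+ *
+ *	scan: scrub in progress since Mon Aug 24 12:00:00 2020
+ *		1.31T scanned at 289M/s, 1.02T issued at 225M/s, 4.50T total
+ *		0B repaired, 22.67% done, 04:30:11 to go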
+ */ +static void +print_scan_scrub_resilver_status(pool_scan_stat_t *ps) +{ + time_t start, end, pause; + uint64_t pass_scanned, scanned, pass_issued, issued, total; + uint64_t elapsed, scan_rate, issue_rate; + double fraction_done; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; + char srate_buf[7], irate_buf[7], time_buf[32]; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + /* If there's never been a scan, there's not much to say. */ + if (ps == NULL || ps->pss_func == POOL_SCAN_NONE || + ps->pss_func >= POOL_SCAN_FUNCS) { + (void) printf(gettext("none requested\n")); + return; + } + + start = ps->pss_start_time; + end = ps->pss_end_time; + pause = ps->pss_pass_scrub_pause; + + zfs_nicebytes(ps->pss_processed, processed_buf, sizeof (processed_buf)); + + assert(ps->pss_func == POOL_SCAN_SCRUB || + ps->pss_func == POOL_SCAN_RESILVER); + + /* Scan is finished or canceled. */ + if (ps->pss_state == DSS_FINISHED) { + secs_to_dhms(end - start, time_buf); + + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub repaired %s " + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilvered %s " + "in %s with %llu errors on %s"), processed_buf, + time_buf, (u_longlong_t)ps->pss_errors, + ctime(&end)); + } + return; + } else if (ps->pss_state == DSS_CANCELED) { + if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("scrub canceled on %s"), + ctime(&end)); + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver canceled on %s"), + ctime(&end)); + } + return; + } + + assert(ps->pss_state == DSS_SCANNING); + + /* Scan is in progress. Resilvers can't be paused. */ + if (ps->pss_func == POOL_SCAN_SCRUB) { + if (pause == 0) { + (void) printf(gettext("scrub in progress since %s"), + ctime(&start)); + } else { + (void) printf(gettext("scrub paused since %s"), + ctime(&pause)); + (void) printf(gettext("\tscrub started on %s"), + ctime(&start)); + } + } else if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("resilver in progress since %s"), + ctime(&start)); + } + + scanned = ps->pss_examined; + pass_scanned = ps->pss_pass_exam; + issued = ps->pss_issued; + pass_issued = ps->pss_pass_issued; + total = ps->pss_to_examine; + + /* we are only done with a block once we have issued the IO for it */ + fraction_done = (double)issued / total; + + /* elapsed time for this pass, rounding up to 1 if it's 0 */ + elapsed = time(NULL) - ps->pss_pass_start; + elapsed -= ps->pss_pass_scrub_spent_paused; + elapsed = (elapsed != 0) ? elapsed : 1; + + scan_rate = pass_scanned / elapsed; + issue_rate = pass_issued / elapsed; + uint64_t total_secs_left = (issue_rate != 0 && total >= issued) ? 
+ ((total - issued) / issue_rate) : UINT64_MAX; + secs_to_dhms(total_secs_left, time_buf); + + /* format all of the numbers we will be reporting */ + zfs_nicebytes(scanned, scanned_buf, sizeof (scanned_buf)); + zfs_nicebytes(issued, issued_buf, sizeof (issued_buf)); + zfs_nicebytes(total, total_buf, sizeof (total_buf)); + zfs_nicebytes(scan_rate, srate_buf, sizeof (srate_buf)); + zfs_nicebytes(issue_rate, irate_buf, sizeof (irate_buf)); + + /* do not print estimated time if we have a paused scrub */ + if (pause == 0) { + (void) printf(gettext("\t%s scanned at %s/s, " + "%s issued at %s/s, %s total\n"), + scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); + } else { + (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), + scanned_buf, issued_buf, total_buf); + } + + if (ps->pss_func == POOL_SCAN_RESILVER) { + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + processed_buf, 100 * fraction_done); + } else if (ps->pss_func == POOL_SCAN_SCRUB) { + (void) printf(gettext("\t%s repaired, %.2f%% done"), + processed_buf, 100 * fraction_done); + } + + if (pause == 0) { + if (total_secs_left != UINT64_MAX && + issue_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +static void +print_rebuild_status_impl(vdev_rebuild_stat_t *vrs, char *vdev_name) +{ + if (vrs == NULL || vrs->vrs_state == VDEV_REBUILD_NONE) + return; + + printf(" "); + printf_color(ANSI_BOLD, gettext("scan:")); + printf(" "); + + uint64_t bytes_scanned = vrs->vrs_bytes_scanned; + uint64_t bytes_issued = vrs->vrs_bytes_issued; + uint64_t bytes_rebuilt = vrs->vrs_bytes_rebuilt; + uint64_t bytes_est = vrs->vrs_bytes_est; + uint64_t scan_rate = (vrs->vrs_pass_bytes_scanned / + (vrs->vrs_pass_time_ms + 1)) * 1000; + uint64_t issue_rate = (vrs->vrs_pass_bytes_issued / + (vrs->vrs_pass_time_ms + 1)) * 1000; + double scan_pct = MIN((double)bytes_scanned * 100 / + (bytes_est + 1), 100); + + /* Format all of the numbers we will be reporting */ + char bytes_scanned_buf[7], bytes_issued_buf[7]; + char bytes_rebuilt_buf[7], bytes_est_buf[7]; + char scan_rate_buf[7], issue_rate_buf[7], time_buf[32]; + zfs_nicebytes(bytes_scanned, bytes_scanned_buf, + sizeof (bytes_scanned_buf)); + zfs_nicebytes(bytes_issued, bytes_issued_buf, + sizeof (bytes_issued_buf)); + zfs_nicebytes(bytes_rebuilt, bytes_rebuilt_buf, + sizeof (bytes_rebuilt_buf)); + zfs_nicebytes(bytes_est, bytes_est_buf, sizeof (bytes_est_buf)); + zfs_nicebytes(scan_rate, scan_rate_buf, sizeof (scan_rate_buf)); + zfs_nicebytes(issue_rate, issue_rate_buf, sizeof (issue_rate_buf)); + + time_t start = vrs->vrs_start_time; + time_t end = vrs->vrs_end_time; + + /* Rebuild is finished or canceled. 
*/ + if (vrs->vrs_state == VDEV_REBUILD_COMPLETE) { + secs_to_dhms(vrs->vrs_scan_time_ms / 1000, time_buf); + (void) printf(gettext("resilvered (%s) %s in %s " + "with %llu errors on %s"), vdev_name, bytes_rebuilt_buf, + time_buf, (u_longlong_t)vrs->vrs_errors, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_CANCELED) { + (void) printf(gettext("resilver (%s) canceled on %s"), + vdev_name, ctime(&end)); + return; + } else if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + (void) printf(gettext("resilver (%s) in progress since %s"), + vdev_name, ctime(&start)); + } + + assert(vrs->vrs_state == VDEV_REBUILD_ACTIVE); + + secs_to_dhms(MAX((int64_t)bytes_est - (int64_t)bytes_scanned, 0) / + MAX(scan_rate, 1), time_buf); + + (void) printf(gettext("\t%s scanned at %s/s, %s issued %s/s, " + "%s total\n"), bytes_scanned_buf, scan_rate_buf, + bytes_issued_buf, issue_rate_buf, bytes_est_buf); + (void) printf(gettext("\t%s resilvered, %.2f%% done"), + bytes_rebuilt_buf, scan_pct); + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + if (scan_rate >= 10 * 1024 * 1024) { + (void) printf(gettext(", %s to go\n"), time_buf); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } +} + +/* + * Print rebuild status for top-level vdevs. + */ +static void +print_rebuild_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + char *name = zpool_vdev_name(g_zfs, zhp, + child[c], VDEV_NAME_TYPE_ID); + print_rebuild_status_impl(vrs, name); + free(name); + } + } +} + +/* + * As we don't scrub checkpointed blocks, we want to warn the user that we + * skipped scanning some blocks if a checkpoint exists or existed at any + * time during the scan. If a sequential instead of healing reconstruction + * was performed then the blocks were reconstructed. However, their checksums + * have not been verified so we still print the warning. + */ +static void +print_checkpoint_scan_warning(pool_scan_stat_t *ps, pool_checkpoint_stat_t *pcs) +{ + if (ps == NULL || pcs == NULL) + return; + + if (pcs->pcs_state == CS_NONE || + pcs->pcs_state == CS_CHECKPOINT_DISCARDING) + return; + + assert(pcs->pcs_state == CS_CHECKPOINT_EXISTS); + + if (ps->pss_state == DSS_NONE) + return; + + if ((ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) && + ps->pss_end_time < pcs->pcs_start_time) + return; + + if (ps->pss_state == DSS_FINISHED || ps->pss_state == DSS_CANCELED) { + (void) printf(gettext(" scan warning: skipped blocks " + "that are only referenced by the checkpoint.\n")); + } else { + assert(ps->pss_state == DSS_SCANNING); + (void) printf(gettext(" scan warning: skipping blocks " + "that are only referenced by the checkpoint.\n")); + } +} + +/* + * Returns B_TRUE if there is an active rebuild in progress. Otherwise, + * B_FALSE is returned and 'rebuild_end_time' is set to the end time for + * the last completed (or cancelled) rebuild. 
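+ *
+ * Every top-level vdev is inspected: the largest vrs_end_time seen is
+ * reported, and a vdev still in VDEV_REBUILD_ACTIVE state marks the
+ * pool as actively rebuilding (in which case the end time is reset).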
+ */ +static boolean_t +check_rebuilding(nvlist_t *nvroot, uint64_t *rebuild_end_time) +{ + nvlist_t **child; + uint_t children; + boolean_t rebuilding = B_FALSE; + uint64_t end_time = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + if (nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i) == 0) { + + if (vrs->vrs_end_time > end_time) + end_time = vrs->vrs_end_time; + + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + rebuilding = B_TRUE; + end_time = 0; + break; + } + } + } + + if (rebuild_end_time != NULL) + *rebuild_end_time = end_time; + + return (rebuilding); +} + +/* + * Print the scan status. + */ +static void +print_scan_status(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + uint64_t rebuild_end_time = 0, resilver_end_time = 0; + boolean_t have_resilver = B_FALSE, have_scrub = B_FALSE; + boolean_t active_resilver = B_FALSE; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *ps = NULL; + uint_t c; + + if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &c) == 0) { + if (ps->pss_func == POOL_SCAN_RESILVER) { + resilver_end_time = ps->pss_end_time; + active_resilver = (ps->pss_state == DSS_SCANNING); + } + + have_resilver = (ps->pss_func == POOL_SCAN_RESILVER); + have_scrub = (ps->pss_func == POOL_SCAN_SCRUB); + } + + boolean_t active_rebuild = check_rebuilding(nvroot, &rebuild_end_time); + boolean_t have_rebuild = (active_rebuild || (rebuild_end_time > 0)); + + /* Always print the scrub status when available. */ + if (have_scrub) + print_scan_scrub_resilver_status(ps); + + /* + * When there is an active resilver or rebuild print its status. + * Otherwise print the status of the last resilver or rebuild. + */ + if (active_resilver || (!active_rebuild && have_resilver && + resilver_end_time && resilver_end_time > rebuild_end_time)) { + print_scan_scrub_resilver_status(ps); + } else if (active_rebuild || (!active_resilver && have_rebuild && + rebuild_end_time && rebuild_end_time > resilver_end_time)) { + print_rebuild_status(zhp, nvroot); + } + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_scan_warning(ps, pcs); +} + +/* + * Print out detailed removal status. + */ +static void +print_removal_status(zpool_handle_t *zhp, pool_removal_stat_t *prs) +{ + char copied_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; + time_t start, end; + nvlist_t *config, *nvroot; + nvlist_t **child; + uint_t children; + char *vdev_name; + + if (prs == NULL || prs->prs_state == DSS_NONE) + return; + + /* + * Determine name of vdev. + */ + config = zpool_get_config(zhp, NULL); + nvroot = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE); + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0); + assert(prs->prs_removing_vdev < children); + vdev_name = zpool_vdev_name(g_zfs, zhp, + child[prs->prs_removing_vdev], B_TRUE); + + (void) printf(gettext("remove: ")); + + start = prs->prs_start_time; + end = prs->prs_end_time; + zfs_nicenum(prs->prs_copied, copied_buf, sizeof (copied_buf)); + + /* + * Removal is finished or canceled. 
+ */ + if (prs->prs_state == DSS_FINISHED) { + uint64_t minutes_taken = (end - start) / 60; + + (void) printf(gettext("Removal of vdev %llu copied %s " + "in %lluh%um, completed on %s"), + (longlong_t)prs->prs_removing_vdev, + copied_buf, + (u_longlong_t)(minutes_taken / 60), + (uint_t)(minutes_taken % 60), + ctime((time_t *)&end)); + } else if (prs->prs_state == DSS_CANCELED) { + (void) printf(gettext("Removal of %s canceled on %s"), + vdev_name, ctime(&end)); + } else { + uint64_t copied, total, elapsed, mins_left, hours_left; + double fraction_done; + uint_t rate; + + assert(prs->prs_state == DSS_SCANNING); + + /* + * Removal is in progress. + */ + (void) printf(gettext( + "Evacuation of %s in progress since %s"), + vdev_name, ctime(&start)); + + copied = prs->prs_copied > 0 ? prs->prs_copied : 1; + total = prs->prs_to_copy; + fraction_done = (double)copied / total; + + /* elapsed time for this pass */ + elapsed = time(NULL) - prs->prs_start_time; + elapsed = elapsed > 0 ? elapsed : 1; + rate = copied / elapsed; + rate = rate > 0 ? rate : 1; + mins_left = ((total - copied) / rate) / 60; + hours_left = mins_left / 60; + + zfs_nicenum(copied, examined_buf, sizeof (examined_buf)); + zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); + + /* + * do not print estimated time if hours_left is more than + * 30 days + */ + (void) printf(gettext(" %s copied out of %s at %s/s, " + "%.2f%% done"), + examined_buf, total_buf, rate_buf, 100 * fraction_done); + if (hours_left < (30 * 24)) { + (void) printf(gettext(", %lluh%um to go\n"), + (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); + } else { + (void) printf(gettext( + ", (copy is slow, no estimated time)\n")); + } + } + free(vdev_name); + + if (prs->prs_mapping_memory > 0) { + char mem_buf[7]; + zfs_nicenum(prs->prs_mapping_memory, mem_buf, sizeof (mem_buf)); + (void) printf(gettext(" %s memory used for " + "removed device mappings\n"), + mem_buf); + } +} + +static void +print_checkpoint_status(pool_checkpoint_stat_t *pcs) +{ + time_t start; + char space_buf[7]; + + if (pcs == NULL || pcs->pcs_state == CS_NONE) + return; + + (void) printf(gettext("checkpoint: ")); + + start = pcs->pcs_start_time; + zfs_nicenum(pcs->pcs_space, space_buf, sizeof (space_buf)); + + if (pcs->pcs_state == CS_CHECKPOINT_EXISTS) { + char *date = ctime(&start); + + /* + * ctime() adds a newline at the end of the generated + * string, thus the weird format specifier and the + * strlen() call used to chop it off from the output. 
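+		 * For example, given "Mon Aug 24 12:00:00 2020\n", the
+		 * (hypothetical) call printf("%.*s", (int)(strlen(date) - 1),
+		 * date) would print the date without the trailing newline.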
+ */ + (void) printf(gettext("created %.*s, consumes %s\n"), + (int)(strlen(date) - 1), date, space_buf); + return; + } + + assert(pcs->pcs_state == CS_CHECKPOINT_DISCARDING); + + (void) printf(gettext("discarding, %s remaining.\n"), + space_buf); +} + +static void +print_error_log(zpool_handle_t *zhp) +{ + nvlist_t *nverrlist = NULL; + nvpair_t *elem; + char *pathname; + size_t len = MAXPATHLEN * 2; + + if (zpool_get_errlog(zhp, &nverrlist) != 0) + return; + + (void) printf("errors: Permanent errors have been " + "detected in the following files:\n\n"); + + pathname = safe_malloc(len); + elem = NULL; + while ((elem = nvlist_next_nvpair(nverrlist, elem)) != NULL) { + nvlist_t *nv; + uint64_t dsobj, obj; + + verify(nvpair_value_nvlist(elem, &nv) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_DATASET, + &dsobj) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_ERR_OBJECT, + &obj) == 0); + zpool_obj_to_path(zhp, dsobj, obj, pathname, len); + (void) printf("%7s %s\n", "", pathname); + } + free(pathname); + nvlist_free(nverrlist); +} + +static void +print_spares(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **spares, + uint_t nspares) +{ + uint_t i; + char *name; + + if (nspares == 0) + return; + + (void) printf(gettext("\tspares\n")); + + for (i = 0; i < nspares; i++) { + name = zpool_vdev_name(g_zfs, zhp, spares[i], + cb->cb_name_flags); + print_status_config(zhp, cb, name, spares[i], 2, B_TRUE, NULL); + free(name); + } +} + +static void +print_l2cache(zpool_handle_t *zhp, status_cbdata_t *cb, nvlist_t **l2cache, + uint_t nl2cache) +{ + uint_t i; + char *name; + + if (nl2cache == 0) + return; + + (void) printf(gettext("\tcache\n")); + + for (i = 0; i < nl2cache; i++) { + name = zpool_vdev_name(g_zfs, zhp, l2cache[i], + cb->cb_name_flags); + print_status_config(zhp, cb, name, l2cache[i], 2, + B_FALSE, NULL); + free(name); + } +} + +static void +print_dedup_stats(nvlist_t *config) +{ + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + uint_t c; + char dspace[6], mspace[6]; + + /* + * If the pool was faulted then we may not have been able to + * obtain the config. Otherwise, if we have anything in the dedup + * table continue processing the stats. + */ + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t **)&ddo, &c) != 0) + return; + + (void) printf("\n"); + (void) printf(gettext(" dedup: ")); + if (ddo->ddo_count == 0) { + (void) printf(gettext("no DDT entries\n")); + return; + } + + zfs_nicebytes(ddo->ddo_dspace, dspace, sizeof (dspace)); + zfs_nicebytes(ddo->ddo_mspace, mspace, sizeof (mspace)); + (void) printf("DDT entries %llu, size %s on disk, %s in core\n", + (u_longlong_t)ddo->ddo_count, + dspace, + mspace); + + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, + (uint64_t **)&dds, &c) == 0); + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t **)&ddh, &c) == 0); + zpool_dump_ddt(dds, ddh); +} + +/* + * Display a summary of pool status. Displays a summary such as: + * + * pool: tank + * status: DEGRADED + * reason: One or more devices ... + * see: https://zfsonlinux.org/msg/ZFS-xxxx-01 + * config: + * mirror DEGRADED + * c1t0d0 OK + * c2t0d0 UNAVAIL + * + * When given the '-v' option, we print out the complete config. If the '-e' + * option is specified, then we print out error rate information as well. 
+ */ +static int +status_callback(zpool_handle_t *zhp, void *data) +{ + status_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + char *msgid; + zpool_status_t reason; + zpool_errata_t errata; + const char *health; + uint_t c; + vdev_stat_t *vs; + + config = zpool_get_config(zhp, NULL); + reason = zpool_get_status(zhp, &msgid, &errata); + + cbp->cb_count++; + + /* + * If we were given 'zpool status -x', only report those pools with + * problems. + */ + if (cbp->cb_explain && + (reason == ZPOOL_STATUS_OK || + reason == ZPOOL_STATUS_VERSION_OLDER || + reason == ZPOOL_STATUS_FEAT_DISABLED)) { + if (!cbp->cb_allpools) { + (void) printf(gettext("pool '%s' is healthy\n"), + zpool_get_name(zhp)); + if (cbp->cb_first) + cbp->cb_first = B_FALSE; + } + return (0); + } + + if (cbp->cb_first) + cbp->cb_first = B_FALSE; + else + (void) printf("\n"); + + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + health = zpool_get_state_str(zhp); + + printf(" "); + printf_color(ANSI_BOLD, gettext("pool:")); + printf(" %s\n", zpool_get_name(zhp)); + printf(" "); + printf_color(ANSI_BOLD, gettext("state: ")); + + printf_color(health_str_to_color(health), "%s", health); + + printf("\n"); + + switch (reason) { + case ZPOOL_STATUS_MISSING_DEV_R: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be opened. Sufficient replicas exist for\n\tthe pool " + "to continue functioning in a degraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Attach the missing device " + "and online it using 'zpool online'.\n")); + break; + + case ZPOOL_STATUS_MISSING_DEV_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be opened. There are insufficient\n\treplicas for the" + " pool to continue functioning.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Attach the missing device " + "and online it using 'zpool online'.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_R: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be used because the label is missing or\n\tinvalid. " + "Sufficient replicas exist for the pool to continue\n\t" + "functioning in a degraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Replace the device using " + "'zpool replace'.\n")); + break; + + case ZPOOL_STATUS_CORRUPT_LABEL_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices could " + "not be used because the label is missing \n\tor invalid. " + "There are insufficient replicas for the pool to " + "continue\n\tfunctioning.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); + break; + + case ZPOOL_STATUS_FAILING_DEV: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices has " + "experienced an unrecoverable error. An\n\tattempt was " + "made to correct the error. 
Applications are "
+		    "unaffected.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Determine if the "
+		    "device needs to be replaced, and clear the errors\n\tusing"
+		    " 'zpool clear' or replace the device with 'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_OFFLINE_DEV:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "been taken offline by the administrator.\n\tSufficient "
+		    "replicas exist for the pool to continue functioning in "
+		    "a\n\tdegraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Online the device "
+		    "using 'zpool online' or replace the device with\n\t'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_REMOVED_DEV:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "been removed by the administrator.\n\tSufficient "
+		    "replicas exist for the pool to continue functioning in "
+		    "a\n\tdegraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Online the device "
+		    "using 'zpool online' or replace the device with\n\t'zpool "
+		    "replace'.\n"));
+		break;
+
+	case ZPOOL_STATUS_RESILVERING:
+	case ZPOOL_STATUS_REBUILDING:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices is "
+		    "currently being resilvered. The pool will\n\tcontinue "
+		    "to function, possibly in a degraded state.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Wait for the resilver to "
+		    "complete.\n"));
+		break;
+
+	case ZPOOL_STATUS_REBUILD_SCRUB:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices have "
+		    "been sequentially resilvered, scrubbing\n\tthe pool "
+		    "is recommended.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Use 'zpool scrub' to "
+		    "verify all data checksums.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_DATA:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("One or more devices has "
+		    "experienced an error resulting in data\n\tcorruption. "
+		    "Applications may be affected.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Restore the file in question"
+		    " if possible. Otherwise restore the\n\tentire pool from "
+		    "backup.\n"));
+		break;
+
+	case ZPOOL_STATUS_CORRUPT_POOL:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool metadata is "
+		    "corrupted and the pool cannot be opened.\n"));
+		zpool_explain_recover(zpool_get_handle(zhp),
+		    zpool_get_name(zhp), reason, config);
+		break;
+
+	case ZPOOL_STATUS_VERSION_OLDER:
+		printf_color(ANSI_BOLD, gettext("status: "));
+		printf_color(ANSI_YELLOW, gettext("The pool is formatted using "
+		    "a legacy on-disk format. The pool can\n\tstill be used, "
+		    "but some features are unavailable.\n"));
+		printf_color(ANSI_BOLD, gettext("action: "));
+		printf_color(ANSI_YELLOW, gettext("Upgrade the pool using "
+		    "'zpool upgrade'. 
Once this is done, the\n\tpool will no " + "longer be accessible on software that does not support\n\t" + "feature flags.\n")); + break; + + case ZPOOL_STATUS_VERSION_NEWER: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool has been upgraded " + "to a newer, incompatible on-disk version.\n\tThe pool " + "cannot be accessed on this system.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Access the pool from a " + "system running more recent software, or\n\trestore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_FEAT_DISABLED: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Some supported features are " + "not enabled on the pool. The pool can\n\tstill be used, " + "but some features are unavailable.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Enable all features using " + "'zpool upgrade'. Once this is done,\n\tthe pool may no " + "longer be accessible by software that does not support\n\t" + "the features. See zpool-features(5) for details.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_READ: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " + "on this system because it uses the\n\tfollowing feature(s)" + " not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Access the pool from a " + "system that supports the required feature(s),\n\tor " + "restore the pool from backup.\n")); + break; + + case ZPOOL_STATUS_UNSUP_FEAT_WRITE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool can only be " + "accessed in read-only mode on this system. It\n\tcannot be" + " accessed in read-write mode because it uses the " + "following\n\tfeature(s) not supported on this system:\n")); + zpool_print_unsup_feat(config); + (void) printf("\n"); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("The pool cannot be accessed " + "in read-write mode. Import the pool with\n" + "\t\"-o readonly=on\", access the pool from a system that " + "supports the\n\trequired feature(s), or restore the " + "pool from backup.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_R: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted in response to persistent errors.\n\tSufficient " + "replicas exist for the pool to continue functioning " + "in a\n\tdegraded state.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Replace the faulted device, " + "or use 'zpool clear' to mark the device\n\trepaired.\n")); + break; + + case ZPOOL_STATUS_FAULTED_DEV_NR: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted in response to persistent errors. There are " + "insufficient replicas for the pool to\n\tcontinue " + "functioning.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Destroy and re-create the " + "pool from a backup source. 
Manually marking the device\n" + "\trepaired using 'zpool clear' may allow some data " + "to be recovered.\n")); + break; + + case ZPOOL_STATUS_IO_FAILURE_MMP: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("The pool is suspended " + "because multihost writes failed or were delayed;\n\t" + "another system could import the pool undetected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Make sure the pool's devices" + " are connected, then reboot your system and\n\timport the " + "pool.\n")); + break; + + case ZPOOL_STATUS_IO_FAILURE_WAIT: + case ZPOOL_STATUS_IO_FAILURE_CONTINUE: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("One or more devices are " + "faulted in response to IO failures.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Make sure the affected " + "devices are connected, then run 'zpool clear'.\n")); + break; + + case ZPOOL_STATUS_BAD_LOG: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("An intent log record " + "could not be read.\n" + "\tWaiting for administrator intervention to fix the " + "faulted pool.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Either restore the affected " + "device(s) and run 'zpool online',\n" + "\tor ignore the intent log records by running " + "'zpool clear'.\n")); + break; + + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + (void) printf(gettext("action: Replace affected devices with " + "devices that support the\n\tconfigured block size, or " + "migrate data to a properly configured\n\tpool.\n")); + break; + + case ZPOOL_STATUS_HOSTID_MISMATCH: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Mismatch between pool hostid" + " and system hostid on imported pool.\n\tThis pool was " + "previously imported into a system with a different " + "hostid,\n\tand then was verbatim imported into this " + "system.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("Export this pool on all " + "systems on which it is imported.\n" + "\tThen import it to correct the mismatch.\n")); + break; + + case ZPOOL_STATUS_ERRATA: + printf_color(ANSI_BOLD, gettext("status: ")); + printf_color(ANSI_YELLOW, gettext("Errata #%d detected.\n"), + errata); + + switch (errata) { + case ZPOOL_ERRATA_NONE: + break; + + case ZPOOL_ERRATA_ZOL_2094_SCRUB: + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the issue" + " run 'zpool scrub'.\n")); + break; + + case ZPOOL_ERRATA_ZOL_6845_ENCRYPTION: + (void) printf(gettext("\tExisting encrypted datasets " + "contain an on-disk incompatibility\n\twhich " + "needs to be corrected.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the issue" + " backup existing encrypted datasets to new\n\t" + "encrypted datasets and destroy the old ones. " + "'zfs mount -o ro' can\n\tbe used to temporarily " + "mount existing encrypted datasets readonly.\n")); + break; + + case ZPOOL_ERRATA_ZOL_8308_ENCRYPTION: + (void) printf(gettext("\tExisting encrypted snapshots " + "and bookmarks contain an on-disk\n\tincompat" + "ibility. 
This may cause on-disk corruption if " + "they are used\n\twith 'zfs recv'.\n")); + printf_color(ANSI_BOLD, gettext("action: ")); + printf_color(ANSI_YELLOW, gettext("To correct the " + "issue, enable the bookmark_v2 feature. No " + "additional\n\taction is needed if there are no " + "encrypted snapshots or bookmarks.\n\tIf preserving " + "the encrypted snapshots and bookmarks is required," + " use\n\ta non-raw send to backup and restore them." + " Alternately, they may be\n\tremoved to resolve " + "the incompatibility.\n")); + break; + + default: + /* + * All errata which allow the pool to be imported + * must contain an action message. + */ + assert(0); + } + break; + + default: + /* + * The remaining errors can't actually be generated, yet. + */ + assert(reason == ZPOOL_STATUS_OK); + } + + if (msgid != NULL) { + printf(" "); + printf_color(ANSI_BOLD, gettext("see:")); + printf(gettext(" https://zfsonlinux.org/msg/%s\n"), msgid); + } + + if (config != NULL) { + uint64_t nerr; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + pool_checkpoint_stat_t *pcs = NULL; + pool_removal_stat_t *prs = NULL; + + print_scan_status(zhp, nvroot); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + print_removal_status(zhp, prs); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + print_checkpoint_status(pcs); + + cbp->cb_namewidth = max_width(zhp, nvroot, 0, 0, + cbp->cb_name_flags | VDEV_NAME_TYPE_ID); + if (cbp->cb_namewidth < 10) + cbp->cb_namewidth = 10; + + color_start(ANSI_BOLD); + (void) printf(gettext("config:\n\n")); + (void) printf(gettext("\t%-*s %-8s %5s %5s %5s"), + cbp->cb_namewidth, "NAME", "STATE", "READ", "WRITE", + "CKSUM"); + color_end(); + + if (cbp->cb_print_slow_ios) { + printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); + } + + if (cbp->vcdl != NULL) + print_cmd_columns(cbp->vcdl, 0); + + printf("\n"); + + print_status_config(zhp, cbp, zpool_get_name(zhp), nvroot, 0, + B_FALSE, NULL); + + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_DEDUP); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_BIAS_SPECIAL); + print_class_vdevs(zhp, cbp, nvroot, VDEV_ALLOC_CLASS_LOGS); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) + print_l2cache(zhp, cbp, l2cache, nl2cache); + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) + print_spares(zhp, cbp, spares, nspares); + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, + &nerr) == 0) { + nvlist_t *nverrlist = NULL; + + /* + * If the approximate error count is small, get a + * precise count by fetching the entire log and + * uniquifying the results. + */ + if (nerr > 0 && nerr < 100 && !cbp->cb_verbose && + zpool_get_errlog(zhp, &nverrlist) == 0) { + nvpair_t *elem; + + elem = NULL; + nerr = 0; + while ((elem = nvlist_next_nvpair(nverrlist, + elem)) != NULL) { + nerr++; + } + } + nvlist_free(nverrlist); + + (void) printf("\n"); + + if (nerr == 0) + (void) printf(gettext("errors: No known data " + "errors\n")); + else if (!cbp->cb_verbose) + (void) printf(gettext("errors: %llu data " + "errors, use '-v' for a list\n"), + (u_longlong_t)nerr); + else + print_error_log(zhp); + } + + if (cbp->cb_dedup_stats) + print_dedup_stats(config); + } else { + (void) printf(gettext("config: The configuration cannot be " + "determined.\n")); + } + + return (0); +} + +/* + * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ...
+ * [interval [count]] + * + * -c CMD For each vdev, run command CMD + * -i Display vdev initialization status. + * -g Display guid for individual vdev name. + * -L Follow links when resolving vdev path name. + * -p Display values in parsable (exact) format. + * -P Display full path for vdev name. + * -s Display slow IOs column. + * -v Display complete error logs + * -x Display only pools with potential problems + * -D Display dedup status (undocumented) + * -t Display vdev TRIM status. + * -T Display a timestamp in date(1) or Unix format + * + * Describes the health status of all pools or some subset. + */ +int +zpool_do_status(int argc, char **argv) +{ + int c; + int ret; + float interval = 0; + unsigned long count = 0; + status_cbdata_t cb = { 0 }; + char *cmd = NULL; + + /* check options */ + while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { + switch (c) { + case 'c': + if (cmd != NULL) { + fprintf(stderr, + gettext("Can't set -c flag twice\n")); + exit(1); + } + + if (getenv("ZPOOL_SCRIPTS_ENABLED") != NULL && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_ENABLED")) { + fprintf(stderr, gettext( + "Can't run -c, disabled by " + "ZPOOL_SCRIPTS_ENABLED.\n")); + exit(1); + } + + if ((getuid() <= 0 || geteuid() <= 0) && + !libzfs_envvar_is_set("ZPOOL_SCRIPTS_AS_ROOT")) { + fprintf(stderr, gettext( + "Can't run -c with root privileges " + "unless ZPOOL_SCRIPTS_AS_ROOT is set.\n")); + exit(1); + } + cmd = optarg; + break; + case 'i': + cb.cb_print_vdev_init = B_TRUE; + break; + case 'g': + cb.cb_name_flags |= VDEV_NAME_GUID; + break; + case 'L': + cb.cb_name_flags |= VDEV_NAME_FOLLOW_LINKS; + break; + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'P': + cb.cb_name_flags |= VDEV_NAME_PATH; + break; + case 's': + cb.cb_print_slow_ios = B_TRUE; + break; + case 'v': + cb.cb_verbose = B_TRUE; + break; + case 'x': + cb.cb_explain = B_TRUE; + break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; + case 't': + cb.cb_print_vdev_trim = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case '?': + if (optopt == 'c') { + print_zpool_script_list("status"); + exit(0); + } else { + fprintf(stderr, + gettext("invalid option '%c'\n"), optopt); + } + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &interval, &count); + + if (argc == 0) + cb.cb_allpools = B_TRUE; + + cb.cb_first = B_TRUE; + cb.cb_print_status = B_TRUE; + + for (;;) { + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + if (cmd != NULL) + cb.vcdl = all_pools_for_each_vdev_run(argc, argv, cmd, + NULL, NULL, 0, 0); + + ret = for_each_pool(argc, argv, B_TRUE, NULL, + status_callback, &cb); + + if (cb.vcdl != NULL) + free_vdev_cmd_data_list(cb.vcdl); + + if (argc == 0 && cb.cb_count == 0) + (void) fprintf(stderr, gettext("no pools available\n")); + else if (cb.cb_explain && cb.cb_first && cb.cb_allpools) + (void) printf(gettext("all pools are healthy\n")); + + if (ret != 0) + return (ret); + + if (interval == 0) + break; + + if (count != 0 && --count == 0) + break; + + (void) fsleep(interval); + } + + return (0); +} + +typedef struct upgrade_cbdata { + int cb_first; + int cb_argc; + uint64_t cb_version; + char **cb_argv; +} upgrade_cbdata_t; + +static int +check_unsupp_fs(zfs_handle_t *zhp, void *unsupp_fs) +{ + int zfs_version = (int)zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + int *count = (int *)unsupp_fs; + + if (zfs_version > ZPL_VERSION) { + (void) printf(gettext("%s (v%d) is not supported by this " + "implementation of ZFS.\n"), + 
zfs_get_name(zhp), zfs_version); + (*count)++; + } + + zfs_iter_filesystems(zhp, check_unsupp_fs, unsupp_fs); + + zfs_close(zhp); + + return (0); +} + +static int +upgrade_version(zpool_handle_t *zhp, uint64_t version) +{ + int ret; + nvlist_t *config; + uint64_t oldversion; + int unsupp_fs = 0; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &oldversion) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(oldversion)); + assert(oldversion < version); + + ret = zfs_iter_root(zpool_get_handle(zhp), check_unsupp_fs, &unsupp_fs); + if (ret != 0) + return (ret); + + if (unsupp_fs) { + (void) fprintf(stderr, gettext("Upgrade not performed due " + "to %d unsupported filesystems (max v%d).\n"), + unsupp_fs, (int)ZPL_VERSION); + return (1); + } + + ret = zpool_upgrade(zhp, version); + if (ret != 0) + return (ret); + + if (version >= SPA_VERSION_FEATURES) { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to feature flags.\n"), + zpool_get_name(zhp), (u_longlong_t)oldversion); + } else { + (void) printf(gettext("Successfully upgraded " + "'%s' from version %llu to version %llu.\n"), + zpool_get_name(zhp), (u_longlong_t)oldversion, + (u_longlong_t)version); + } + + return (0); +} + +static int +upgrade_enable_all(zpool_handle_t *zhp, int *countp) +{ + int i, ret, count; + boolean_t firstff = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + count = 0; + for (i = 0; i < SPA_FEATURES; i++) { + const char *fname = spa_feature_table[i].fi_uname; + const char *fguid = spa_feature_table[i].fi_guid; + if (!nvlist_exists(enabled, fguid)) { + char *propname; + verify(-1 != asprintf(&propname, "feature@%s", fname)); + ret = zpool_set_prop(zhp, propname, + ZFS_FEATURE_ENABLED); + if (ret != 0) { + free(propname); + return (ret); + } + count++; + + if (firstff) { + (void) printf(gettext("Enabled the " + "following features on '%s':\n"), + zpool_get_name(zhp)); + firstff = B_FALSE; + } + (void) printf(gettext(" %s\n"), fname); + free(propname); + } + } + + if (countp != NULL) + *countp = count; + return (0); +} + +static int +upgrade_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + boolean_t printnl = B_FALSE; + int ret; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(version)); + + if (version < cbp->cb_version) { + cbp->cb_first = B_FALSE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); + printnl = B_TRUE; + + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). 
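+ * The zpool_log_history() call below writes the entry to this pool
+ * directly, and clearing log_history keeps main() from logging the
+ * command a second time.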
+ */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count > 0) { + cbp->cb_first = B_FALSE; + printnl = B_TRUE; + } + } + + if (printnl) { + (void) printf(gettext("\n")); + } + + return (0); +} + +static int +upgrade_list_older_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + assert(SPA_VERSION_IS_SUPPORTED(version)); + + if (version < SPA_VERSION_FEATURES) { + if (cbp->cb_first) { + (void) printf(gettext("The following pools are " + "formatted with legacy version numbers and can\n" + "be upgraded to use feature flags. After " + "being upgraded, these pools\nwill no " + "longer be accessible by software that does not " + "support feature\nflags.\n\n")); + (void) printf(gettext("VER POOL\n")); + (void) printf(gettext("--- ------------\n")); + cbp->cb_first = B_FALSE; + } + + (void) printf("%2llu %s\n", (u_longlong_t)version, + zpool_get_name(zhp)); + } + + return (0); +} + +static int +upgrade_list_disabled_cb(zpool_handle_t *zhp, void *arg) +{ + upgrade_cbdata_t *cbp = arg; + nvlist_t *config; + uint64_t version; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if (version >= SPA_VERSION_FEATURES) { + int i; + boolean_t poolfirst = B_TRUE; + nvlist_t *enabled = zpool_get_features(zhp); + + for (i = 0; i < SPA_FEATURES; i++) { + const char *fguid = spa_feature_table[i].fi_guid; + const char *fname = spa_feature_table[i].fi_uname; + if (!nvlist_exists(enabled, fguid)) { + if (cbp->cb_first) { + (void) printf(gettext("\nSome " + "supported features are not " + "enabled on the following pools. " + "Once a\nfeature is enabled the " + "pool may become incompatible with " + "software\nthat does not support " + "the feature. See " + "zpool-features(5) for " + "details.\n\n")); + (void) printf(gettext("POOL " + "FEATURE\n")); + (void) printf(gettext("------" + "---------\n")); + cbp->cb_first = B_FALSE; + } + + if (poolfirst) { + (void) printf(gettext("%s\n"), + zpool_get_name(zhp)); + poolfirst = B_FALSE; + } + + (void) printf(gettext(" %s\n"), fname); + } + /* + * If they did "zpool upgrade -a", then we could + * be doing ioctls to different pools. We need + * to log this history once to each pool, and bypass + * the normal history logging that happens in main(). 
+ */ + (void) zpool_log_history(g_zfs, history_str); + log_history = B_FALSE; + } + } + + return (0); +} + +/* ARGSUSED */ +static int +upgrade_one(zpool_handle_t *zhp, void *data) +{ + boolean_t printnl = B_FALSE; + upgrade_cbdata_t *cbp = data; + uint64_t cur_version; + int ret; + + if (strcmp("log", zpool_get_name(zhp)) == 0) { + (void) fprintf(stderr, gettext("'log' is now a reserved word\n" + "Pool 'log' must be renamed using export and import" + " to upgrade.\n")); + return (1); + } + + cur_version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if (cur_version > cbp->cb_version) { + (void) printf(gettext("Pool '%s' is already formatted " + "using more current version '%llu'.\n\n"), + zpool_get_name(zhp), (u_longlong_t)cur_version); + return (0); + } + + if (cbp->cb_version != SPA_VERSION && cur_version == cbp->cb_version) { + (void) printf(gettext("Pool '%s' is already formatted " + "using version %llu.\n\n"), zpool_get_name(zhp), + (u_longlong_t)cbp->cb_version); + return (0); + } + + if (cur_version != cbp->cb_version) { + printnl = B_TRUE; + ret = upgrade_version(zhp, cbp->cb_version); + if (ret != 0) + return (ret); + } + + if (cbp->cb_version >= SPA_VERSION_FEATURES) { + int count = 0; + ret = upgrade_enable_all(zhp, &count); + if (ret != 0) + return (ret); + + if (count != 0) { + printnl = B_TRUE; + } else if (cur_version == SPA_VERSION) { + (void) printf(gettext("Pool '%s' already has all " + "supported features enabled.\n"), + zpool_get_name(zhp)); + } + } + + if (printnl) { + (void) printf(gettext("\n")); + } + + return (0); +} + +/* + * zpool upgrade + * zpool upgrade -v + * zpool upgrade [-V version] <-a | pool ...> + * + * With no arguments, display downrev'd ZFS pool available for upgrade. + * Individual pools can be upgraded by specifying the pool, and '-a' will + * upgrade all pools. 
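+ * e.g. 'zpool upgrade -a' enables every supported feature on all
+ * imported pools, while 'zpool upgrade -V 28 tank' (pool name
+ * hypothetical) takes that pool only as far as legacy version 28
+ * and enables no feature flags.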
+ */ +int +zpool_do_upgrade(int argc, char **argv) +{ + int c; + upgrade_cbdata_t cb = { 0 }; + int ret = 0; + boolean_t showversions = B_FALSE; + boolean_t upgradeall = B_FALSE; + char *end; + + + /* check options */ + while ((c = getopt(argc, argv, ":avV:")) != -1) { + switch (c) { + case 'a': + upgradeall = B_TRUE; + break; + case 'v': + showversions = B_TRUE; + break; + case 'V': + cb.cb_version = strtoll(optarg, &end, 10); + if (*end != '\0' || + !SPA_VERSION_IS_SUPPORTED(cb.cb_version)) { + (void) fprintf(stderr, + gettext("invalid version '%s'\n"), optarg); + usage(B_FALSE); + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + cb.cb_argc = argc; + cb.cb_argv = argv; + argc -= optind; + argv += optind; + + if (cb.cb_version == 0) { + cb.cb_version = SPA_VERSION; + } else if (!upgradeall && argc == 0) { + (void) fprintf(stderr, gettext("-V option is " + "incompatible with other arguments\n")); + usage(B_FALSE); + } + + if (showversions) { + if (upgradeall || argc != 0) { + (void) fprintf(stderr, gettext("-v option is " + "incompatible with other arguments\n")); + usage(B_FALSE); + } + } else if (upgradeall) { + if (argc != 0) { + (void) fprintf(stderr, gettext("-a option should not " + "be used along with a pool name\n")); + usage(B_FALSE); + } + } + + (void) printf(gettext("This system supports ZFS pool feature " + "flags.\n\n")); + if (showversions) { + int i; + + (void) printf(gettext("The following features are " + "supported:\n\n")); + (void) printf(gettext("FEAT DESCRIPTION\n")); + (void) printf("----------------------------------------------" + "---------------\n"); + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + const char *ro = + (fi->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? 
+ " (read-only compatible)" : ""; + + (void) printf("%-37s%s\n", fi->fi_uname, ro); + (void) printf(" %s\n", fi->fi_desc); + } + (void) printf("\n"); + + (void) printf(gettext("The following legacy versions are also " + "supported:\n\n")); + (void) printf(gettext("VER DESCRIPTION\n")); + (void) printf("--- -----------------------------------------" + "---------------\n"); + (void) printf(gettext(" 1 Initial ZFS version\n")); + (void) printf(gettext(" 2 Ditto blocks " + "(replicated metadata)\n")); + (void) printf(gettext(" 3 Hot spares and double parity " + "RAID-Z\n")); + (void) printf(gettext(" 4 zpool history\n")); + (void) printf(gettext(" 5 Compression using the gzip " + "algorithm\n")); + (void) printf(gettext(" 6 bootfs pool property\n")); + (void) printf(gettext(" 7 Separate intent log devices\n")); + (void) printf(gettext(" 8 Delegated administration\n")); + (void) printf(gettext(" 9 refquota and refreservation " + "properties\n")); + (void) printf(gettext(" 10 Cache devices\n")); + (void) printf(gettext(" 11 Improved scrub performance\n")); + (void) printf(gettext(" 12 Snapshot properties\n")); + (void) printf(gettext(" 13 snapused property\n")); + (void) printf(gettext(" 14 passthrough-x aclinherit\n")); + (void) printf(gettext(" 15 user/group space accounting\n")); + (void) printf(gettext(" 16 stmf property support\n")); + (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); + (void) printf(gettext(" 18 Snapshot user holds\n")); + (void) printf(gettext(" 19 Log device removal\n")); + (void) printf(gettext(" 20 Compression using zle " + "(zero-length encoding)\n")); + (void) printf(gettext(" 21 Deduplication\n")); + (void) printf(gettext(" 22 Received properties\n")); + (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext(" 24 System attributes\n")); + (void) printf(gettext(" 25 Improved scrub stats\n")); + (void) printf(gettext(" 26 Improved snapshot deletion " + "performance\n")); + (void) printf(gettext(" 27 Improved snapshot creation " + "performance\n")); + (void) printf(gettext(" 28 Multiple vdev replacements\n")); + (void) printf(gettext("\nFor more information on a particular " + "version, including supported releases,\n")); + (void) printf(gettext("see the ZFS Administration Guide.\n\n")); + } else if (argc == 0 && upgradeall) { + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_cb, &cb); + if (ret == 0 && cb.cb_first) { + if (cb.cb_version == SPA_VERSION) { + (void) printf(gettext("All pools are already " + "formatted using feature flags.\n\n")); + (void) printf(gettext("Every feature flags " + "pool already has all supported features " + "enabled.\n")); + } else { + (void) printf(gettext("All pools are already " + "formatted with version %llu or higher.\n"), + (u_longlong_t)cb.cb_version); + } + } + } else if (argc == 0) { + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_older_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("All pools are formatted " + "using feature flags.\n\n")); + } else { + (void) printf(gettext("\nUse 'zpool upgrade -v' " + "for a list of available legacy versions.\n")); + } + + cb.cb_first = B_TRUE; + ret = zpool_iter(g_zfs, upgrade_list_disabled_cb, &cb); + assert(ret == 0); + + if (cb.cb_first) { + (void) printf(gettext("Every feature flags pool has " + "all supported features enabled.\n")); + } else { + (void) printf(gettext("\n")); + } + } else { + ret = for_each_pool(argc, argv, B_FALSE, NULL, + upgrade_one, &cb); + } + + return (ret); +} + +typedef struct hist_cbdata { + 
boolean_t first; + boolean_t longfmt; + boolean_t internal; +} hist_cbdata_t; + +static void +print_history_records(nvlist_t *nvhis, hist_cbdata_t *cb) +{ + nvlist_t **records; + uint_t numrecords; + int i; + + verify(nvlist_lookup_nvlist_array(nvhis, ZPOOL_HIST_RECORD, + &records, &numrecords) == 0); + for (i = 0; i < numrecords; i++) { + nvlist_t *rec = records[i]; + char tbuf[30] = ""; + + if (nvlist_exists(rec, ZPOOL_HIST_TIME)) { + time_t tsec; + struct tm t; + + tsec = fnvlist_lookup_uint64(records[i], + ZPOOL_HIST_TIME); + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + } + + if (nvlist_exists(rec, ZPOOL_HIST_CMD)) { + (void) printf("%s %s", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_EVENT)) { + int ievent = + fnvlist_lookup_uint64(rec, ZPOOL_HIST_INT_EVENT); + if (!cb->internal) + continue; + if (ievent >= ZFS_NUM_LEGACY_HISTORY_EVENTS) { + (void) printf("%s unrecognized record:\n", + tbuf); + dump_nvlist(rec, 4); + continue; + } + (void) printf("%s [internal %s txg:%lld] %s", tbuf, + zfs_history_event_names[ievent], + (longlong_t)fnvlist_lookup_uint64( + rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_INT_NAME)) { + if (!cb->internal) + continue; + (void) printf("%s [txg:%lld] %s", tbuf, + (longlong_t)fnvlist_lookup_uint64( + rec, ZPOOL_HIST_TXG), + fnvlist_lookup_string(rec, ZPOOL_HIST_INT_NAME)); + if (nvlist_exists(rec, ZPOOL_HIST_DSNAME)) { + (void) printf(" %s (%llu)", + fnvlist_lookup_string(rec, + ZPOOL_HIST_DSNAME), + (u_longlong_t)fnvlist_lookup_uint64(rec, + ZPOOL_HIST_DSID)); + } + (void) printf(" %s", fnvlist_lookup_string(rec, + ZPOOL_HIST_INT_STR)); + } else if (nvlist_exists(rec, ZPOOL_HIST_IOCTL)) { + if (!cb->internal) + continue; + (void) printf("%s ioctl %s\n", tbuf, + fnvlist_lookup_string(rec, ZPOOL_HIST_IOCTL)); + if (nvlist_exists(rec, ZPOOL_HIST_INPUT_NVL)) { + (void) printf(" input:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_INPUT_NVL), 8); + } + if (nvlist_exists(rec, ZPOOL_HIST_OUTPUT_NVL)) { + (void) printf(" output:\n"); + dump_nvlist(fnvlist_lookup_nvlist(rec, + ZPOOL_HIST_OUTPUT_NVL), 8); + } + if (nvlist_exists(rec, ZPOOL_HIST_ERRNO)) { + (void) printf(" errno: %lld\n", + (longlong_t)fnvlist_lookup_int64(rec, + ZPOOL_HIST_ERRNO)); + } + } else { + if (!cb->internal) + continue; + (void) printf("%s unrecognized record:\n", tbuf); + dump_nvlist(rec, 4); + } + + if (!cb->longfmt) { + (void) printf("\n"); + continue; + } + (void) printf(" ["); + if (nvlist_exists(rec, ZPOOL_HIST_WHO)) { + uid_t who = fnvlist_lookup_uint64(rec, ZPOOL_HIST_WHO); + struct passwd *pwd = getpwuid(who); + (void) printf("user %d ", (int)who); + if (pwd != NULL) + (void) printf("(%s) ", pwd->pw_name); + } + if (nvlist_exists(rec, ZPOOL_HIST_HOST)) { + (void) printf("on %s", + fnvlist_lookup_string(rec, ZPOOL_HIST_HOST)); + } + if (nvlist_exists(rec, ZPOOL_HIST_ZONE)) { + (void) printf(":%s", + fnvlist_lookup_string(rec, ZPOOL_HIST_ZONE)); + } + + (void) printf("]"); + (void) printf("\n"); + } +} + +/* + * Print out the command history for a specific pool. 
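+ * The kernel hands the history back a buffer at a time, so
+ * get_history_one() below keeps calling zpool_get_history() with a
+ * byte offset until the handle reports end-of-file.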
+ */ +static int +get_history_one(zpool_handle_t *zhp, void *data) +{ + nvlist_t *nvhis; + int ret; + hist_cbdata_t *cb = (hist_cbdata_t *)data; + uint64_t off = 0; + boolean_t eof = B_FALSE; + + cb->first = B_FALSE; + + (void) printf(gettext("History for '%s':\n"), zpool_get_name(zhp)); + + while (!eof) { + if ((ret = zpool_get_history(zhp, &nvhis, &off, &eof)) != 0) + return (ret); + + print_history_records(nvhis, cb); + nvlist_free(nvhis); + } + (void) printf("\n"); + + return (ret); +} + +/* + * zpool history + * + * Displays the history of commands that modified pools. + */ +int +zpool_do_history(int argc, char **argv) +{ + hist_cbdata_t cbdata = { 0 }; + int ret; + int c; + + cbdata.first = B_TRUE; + /* check options */ + while ((c = getopt(argc, argv, "li")) != -1) { + switch (c) { + case 'l': + cbdata.longfmt = B_TRUE; + break; + case 'i': + cbdata.internal = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + ret = for_each_pool(argc, argv, B_FALSE, NULL, get_history_one, + &cbdata); + + if (argc == 0 && cbdata.first == B_TRUE) { + (void) fprintf(stderr, gettext("no pools available\n")); + return (0); + } + + return (ret); +} + +typedef struct ev_opts { + int verbose; + int scripted; + int follow; + int clear; + char poolname[ZFS_MAX_DATASET_NAME_LEN]; +} ev_opts_t; + +static void +zpool_do_events_short(nvlist_t *nvl, ev_opts_t *opts) +{ + char ctime_str[26], str[32], *ptr; + int64_t *tv; + uint_t n; + + verify(nvlist_lookup_int64_array(nvl, FM_EREPORT_TIME, &tv, &n) == 0); + memset(str, ' ', 32); + (void) ctime_r((const time_t *)&tv[0], ctime_str); + (void) memcpy(str, ctime_str+4, 6); /* 'Jun 30' */ + (void) memcpy(str+7, ctime_str+20, 4); /* '1993' */ + (void) memcpy(str+12, ctime_str+11, 8); /* '21:49:08' */ + (void) sprintf(str+20, ".%09lld", (longlong_t)tv[1]); /* '.123456789' */ + if (opts->scripted) + (void) printf(gettext("%s\t"), str); + else + (void) printf(gettext("%s "), str); + + verify(nvlist_lookup_string(nvl, FM_CLASS, &ptr) == 0); + (void) printf(gettext("%s\n"), ptr); +} + +static void +zpool_do_events_nvprint(nvlist_t *nvl, int depth) +{ + nvpair_t *nvp; + + for (nvp = nvlist_next_nvpair(nvl, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { + + data_type_t type = nvpair_type(nvp); + const char *name = nvpair_name(nvp); + + boolean_t b; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + nvlist_t *cnv; + + printf(gettext("%*s%s = "), depth, "", name); + + switch (type) { + case DATA_TYPE_BOOLEAN: + printf(gettext("%s"), "1"); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + printf(gettext("%s"), b ? 
"1" : "0"); + break; + + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (void *)&i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + printf(gettext("0x%x"), i8); + break; + + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (void *)&i16); + printf(gettext("0x%x"), i16); + break; + + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + printf(gettext("0x%x"), i16); + break; + + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (void *)&i32); + printf(gettext("0x%x"), i32); + break; + + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + printf(gettext("0x%x"), i32); + break; + + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (void *)&i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + /* + * translate vdev state values to readable + * strings to aide zpool events consumers + */ + if (strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE) == 0 || + strcmp(name, + FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE) == 0) { + printf(gettext("\"%s\" (0x%llx)"), + zpool_state_to_name(i64, VDEV_AUX_NONE), + (u_longlong_t)i64); + } else { + printf(gettext("0x%llx"), (u_longlong_t)i64); + } + break; + + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (void *)&i64); + printf(gettext("0x%llx"), (u_longlong_t)i64); + break; + + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + printf(gettext("\"%s\""), str ? str : ""); + break; + + case DATA_TYPE_NVLIST: + printf(gettext("(embedded nvlist)\n")); + (void) nvpair_value_nvlist(nvp, &cnv); + zpool_do_events_nvprint(cnv, depth + 8); + printf(gettext("%*s(end %s)"), depth, "", name); + break; + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t i, nelem; + + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + printf(gettext("(%d embedded nvlists)\n"), nelem); + for (i = 0; i < nelem; i++) { + printf(gettext("%*s%s[%d] = %s\n"), + depth, "", name, i, "(embedded nvlist)"); + zpool_do_events_nvprint(val[i], depth + 8); + printf(gettext("%*s(end %s[%i])\n"), + depth, "", name, i); + } + printf(gettext("%*s(end %s)\n"), depth, "", name); + } + break; + + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t i, nelem; + + (void) nvpair_value_int8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t i, nelem; + + (void) nvpair_value_int16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t i, nelem; + + (void) nvpair_value_int32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%x "), val[i]); + 
+ break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + uint_t i, nelem; + + (void) nvpair_value_int64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%llx "), + (u_longlong_t)val[i]); + + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t i, nelem; + + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("0x%llx "), + (u_longlong_t)val[i]); + + break; + } + + case DATA_TYPE_STRING_ARRAY: { + char **str; + uint_t i, nelem; + + (void) nvpair_value_string_array(nvp, &str, &nelem); + for (i = 0; i < nelem; i++) + printf(gettext("\"%s\" "), + str[i] ? str[i] : ""); + + break; + } + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_DOUBLE: + case DATA_TYPE_DONTCARE: + case DATA_TYPE_UNKNOWN: + printf(gettext("")); + break; + } + + printf(gettext("\n")); + } +} + +static int +zpool_do_events_next(ev_opts_t *opts) +{ + nvlist_t *nvl; + int zevent_fd, ret, dropped; + char *pool; + + zevent_fd = open(ZFS_DEV, O_RDWR); + VERIFY(zevent_fd >= 0); + + if (!opts->scripted) + (void) printf(gettext("%-30s %s\n"), "TIME", "CLASS"); + + while (1) { + ret = zpool_events_next(g_zfs, &nvl, &dropped, + (opts->follow ? ZEVENT_NONE : ZEVENT_NONBLOCK), zevent_fd); + if (ret || nvl == NULL) + break; + + if (dropped > 0) + (void) printf(gettext("dropped %d events\n"), dropped); + + if (strlen(opts->poolname) > 0 && + nvlist_lookup_string(nvl, FM_FMRI_ZFS_POOL, &pool) == 0 && + strcmp(opts->poolname, pool) != 0) + continue; + + zpool_do_events_short(nvl, opts); + + if (opts->verbose) { + zpool_do_events_nvprint(nvl, 8); + printf(gettext("\n")); + } + (void) fflush(stdout); + + nvlist_free(nvl); + } + + VERIFY(0 == close(zevent_fd)); + + return (ret); +} + +static int +zpool_do_events_clear(ev_opts_t *opts) +{ + int count, ret; + + ret = zpool_events_clear(g_zfs, &count); + if (!ret) + (void) printf(gettext("cleared %d events\n"), count); + + return (ret); +} + +/* + * zpool events [-vHf [pool] | -c] + * + * Displays events logs by ZFS. + */ +int +zpool_do_events(int argc, char **argv) +{ + ev_opts_t opts = { 0 }; + int ret; + int c; + + /* check options */ + while ((c = getopt(argc, argv, "vHfc")) != -1) { + switch (c) { + case 'v': + opts.verbose = 1; + break; + case 'H': + opts.scripted = 1; + break; + case 'f': + opts.follow = 1; + break; + case 'c': + opts.clear = 1; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + argc -= optind; + argv += optind; + + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } else if (argc == 1) { + (void) strlcpy(opts.poolname, argv[0], sizeof (opts.poolname)); + if (!zfs_name_valid(opts.poolname, ZFS_TYPE_POOL)) { + (void) fprintf(stderr, + gettext("invalid pool name '%s'\n"), opts.poolname); + usage(B_FALSE); + } + } + + if ((argc == 1 || opts.verbose || opts.scripted || opts.follow) && + opts.clear) { + (void) fprintf(stderr, + gettext("invalid options combined with -c\n")); + usage(B_FALSE); + } + + if (opts.clear) + ret = zpool_do_events_clear(&opts); + else + ret = zpool_do_events_next(&opts); + + return (ret); +} + +static int +get_callback(zpool_handle_t *zhp, void *data) +{ + zprop_get_cbdata_t *cbp = (zprop_get_cbdata_t *)data; + char value[MAXNAMELEN]; + zprop_source_t srctype; + zprop_list_t *pl; + + for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { + + /* + * Skip the special fake placeholder. 
This will also skip + * over the name property when 'all' is specified. + */ + if (pl->pl_prop == ZPOOL_PROP_NAME && + pl == cbp->cb_proplist) + continue; + + if (pl->pl_prop == ZPROP_INVAL && + (zpool_prop_feature(pl->pl_user_prop) || + zpool_prop_unsupported(pl->pl_user_prop))) { + srctype = ZPROP_SRC_LOCAL; + + if (zpool_prop_get_feature(zhp, pl->pl_user_prop, + value, sizeof (value)) == 0) { + zprop_print_one_property(zpool_get_name(zhp), + cbp, pl->pl_user_prop, value, srctype, + NULL, NULL); + } + } else { + if (zpool_get_prop(zhp, pl->pl_prop, value, + sizeof (value), &srctype, cbp->cb_literal) != 0) + continue; + + zprop_print_one_property(zpool_get_name(zhp), cbp, + zpool_prop_to_name(pl->pl_prop), value, srctype, + NULL, NULL); + } + } + return (0); +} + +/* + * zpool get [-Hp] [-o "all" | field[,...]] <"all" | property[,...]> ... + * + * -H Scripted mode. Don't display headers, and separate properties + * by a single tab. + * -o List of columns to display. Defaults to + * "name,property,value,source". + * -p Display values in parsable (exact) format. + * + * Get properties of pools in the system. Output space statistics + * for each one as well as other attributes. + */ +int +zpool_do_get(int argc, char **argv) +{ + zprop_get_cbdata_t cb = { 0 }; + zprop_list_t fake_name = { 0 }; + int ret; + int c, i; + char *value; + + cb.cb_first = B_TRUE; + + /* + * Set up default columns and sources. + */ + cb.cb_sources = ZPROP_SRC_ALL; + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + cb.cb_type = ZFS_TYPE_POOL; + + /* check options */ + while ((c = getopt(argc, argv, ":Hpo:")) != -1) { + switch (c) { + case 'p': + cb.cb_literal = B_TRUE; + break; + case 'H': + cb.cb_scripted = B_TRUE; + break; + case 'o': + bzero(&cb.cb_columns, sizeof (cb.cb_columns)); + i = 0; + while (*optarg != '\0') { + static char *col_subopts[] = + { "name", "property", "value", "source", + "all", NULL }; + + if (i == ZFS_GET_NCOLS) { + (void) fprintf(stderr, gettext("too " + "many fields given to -o " + "option\n")); + usage(B_FALSE); + } + + switch (getsubopt(&optarg, col_subopts, + &value)) { + case 0: + cb.cb_columns[i++] = GET_COL_NAME; + break; + case 1: + cb.cb_columns[i++] = GET_COL_PROPERTY; + break; + case 2: + cb.cb_columns[i++] = GET_COL_VALUE; + break; + case 3: + cb.cb_columns[i++] = GET_COL_SOURCE; + break; + case 4: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_SOURCE; + i = ZFS_GET_NCOLS; + break; + default: + (void) fprintf(stderr, + gettext("invalid column name " + "'%s'\n"), value); + usage(B_FALSE); + } + } + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing property " + "argument\n")); + usage(B_FALSE); + } + + if (zprop_get_list(g_zfs, argv[0], &cb.cb_proplist, + ZFS_TYPE_POOL) != 0) + usage(B_FALSE); + + argc--; + argv++; + + if (cb.cb_proplist != NULL) { + fake_name.pl_prop = ZPOOL_PROP_NAME; + fake_name.pl_width = strlen(gettext("NAME")); + fake_name.pl_next = cb.cb_proplist; + cb.cb_proplist = &fake_name; + } + + ret = for_each_pool(argc, argv, B_TRUE, &cb.cb_proplist, + get_callback, &cb); + 
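+ /*
+ * fake_name lives on the stack, so free only the list entries that
+ * zprop_get_list() allocated.
+ */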
+ if (cb.cb_proplist == &fake_name) + zprop_free_list(fake_name.pl_next); + else + zprop_free_list(cb.cb_proplist); + + return (ret); +} + +typedef struct set_cbdata { + char *cb_propname; + char *cb_value; + boolean_t cb_any_successful; +} set_cbdata_t; + +static int +set_callback(zpool_handle_t *zhp, void *data) +{ + int error; + set_cbdata_t *cb = (set_cbdata_t *)data; + + error = zpool_set_prop(zhp, cb->cb_propname, cb->cb_value); + + if (!error) + cb->cb_any_successful = B_TRUE; + + return (error); +} + +int +zpool_do_set(int argc, char **argv) +{ + set_cbdata_t cb = { 0 }; + int error; + + if (argc > 1 && argv[1][0] == '-') { + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + argv[1][1]); + usage(B_FALSE); + } + + if (argc < 2) { + (void) fprintf(stderr, gettext("missing property=value " + "argument\n")); + usage(B_FALSE); + } + + if (argc < 3) { + (void) fprintf(stderr, gettext("missing pool name\n")); + usage(B_FALSE); + } + + if (argc > 3) { + (void) fprintf(stderr, gettext("too many pool names\n")); + usage(B_FALSE); + } + + cb.cb_propname = argv[1]; + cb.cb_value = strchr(cb.cb_propname, '='); + if (cb.cb_value == NULL) { + (void) fprintf(stderr, gettext("missing value in " + "property=value argument\n")); + usage(B_FALSE); + } + + *(cb.cb_value) = '\0'; + cb.cb_value++; + + error = for_each_pool(argc - 2, argv + 2, B_TRUE, NULL, + set_callback, &cb); + + return (error); +} + +/* Add up the total number of bytes left to initialize/trim across all vdevs */ +static uint64_t +vdev_activity_remaining(nvlist_t *nv, zpool_wait_activity_t activity) +{ + uint64_t bytes_remaining; + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + + assert(activity == ZPOOL_WAIT_INITIALIZE || + activity == ZPOOL_WAIT_TRIM); + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) == 0); + + if (activity == ZPOOL_WAIT_INITIALIZE && + vs->vs_initialize_state == VDEV_INITIALIZE_ACTIVE) + bytes_remaining = vs->vs_initialize_bytes_est - + vs->vs_initialize_bytes_done; + else if (activity == ZPOOL_WAIT_TRIM && + vs->vs_trim_state == VDEV_TRIM_ACTIVE) + bytes_remaining = vs->vs_trim_bytes_est - + vs->vs_trim_bytes_done; + else + bytes_remaining = 0; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (c = 0; c < children; c++) + bytes_remaining += vdev_activity_remaining(child[c], activity); + + return (bytes_remaining); +} + +/* Add up the total number of bytes left to rebuild across top-level vdevs */ +static uint64_t +vdev_activity_top_remaining(nvlist_t *nv) +{ + uint64_t bytes_remaining = 0; + nvlist_t **child; + uint_t children; + int error; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (uint_t c = 0; c < children; c++) { + vdev_rebuild_stat_t *vrs; + uint_t i; + + error = nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t **)&vrs, &i); + if (error == 0) { + if (vrs->vrs_state == VDEV_REBUILD_ACTIVE) { + bytes_remaining += (vrs->vrs_bytes_est - + vrs->vrs_bytes_rebuilt); + } + } + } + + return (bytes_remaining); +} + +/* Whether any vdevs are 'spare' or 'replacing' vdevs */ +static boolean_t +vdev_any_spare_replacing(nvlist_t *nv) +{ + nvlist_t **child; + uint_t c, children; + char *vdev_type; + + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &vdev_type); + + if (strcmp(vdev_type, VDEV_TYPE_REPLACING) == 0 || + strcmp(vdev_type, VDEV_TYPE_SPARE) == 0) { + return (B_TRUE); + } + + if 
(nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + for (c = 0; c < children; c++) { + if (vdev_any_spare_replacing(child[c])) + return (B_TRUE); + } + + return (B_FALSE); +} + +typedef struct wait_data { + char *wd_poolname; + boolean_t wd_scripted; + boolean_t wd_exact; + boolean_t wd_headers_once; + boolean_t wd_should_exit; + /* Which activities to wait for */ + boolean_t wd_enabled[ZPOOL_WAIT_NUM_ACTIVITIES]; + float wd_interval; + pthread_cond_t wd_cv; + pthread_mutex_t wd_mutex; +} wait_data_t; + +/* + * Print to stdout a single line, containing one column for each activity that + * we are waiting for specifying how many bytes of work are left for that + * activity. + */ +static void +print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) +{ + nvlist_t *config, *nvroot; + uint_t c; + int i; + pool_checkpoint_stat_t *pcs = NULL; + pool_scan_stat_t *pss = NULL; + pool_removal_stat_t *prs = NULL; + char *headers[] = {"DISCARD", "FREE", "INITIALIZE", "REPLACE", + "REMOVE", "RESILVER", "SCRUB", "TRIM"}; + int col_widths[ZPOOL_WAIT_NUM_ACTIVITIES]; + + /* Calculate the width of each column */ + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + /* + * Make sure we have enough space in the col for pretty-printed + * numbers and for the column header, and then leave a couple + * spaces between cols for readability. + */ + col_widths[i] = MAX(strlen(headers[i]), 6) + 2; + } + + /* Print header if appropriate */ + int term_height = terminal_height(); + boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && + row % (term_height-1) == 0); + if (!wd->wd_scripted && (row == 0 || reprint_header)) { + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + if (wd->wd_enabled[i]) + (void) printf("%*s", col_widths[i], headers[i]); + } + (void) printf("\n"); + } + + /* Bytes of work remaining in each activity */ + int64_t bytes_rem[ZPOOL_WAIT_NUM_ACTIVITIES] = {0}; + + bytes_rem[ZPOOL_WAIT_FREE] = + zpool_get_prop_int(zhp, ZPOOL_PROP_FREEING, NULL); + + config = zpool_get_config(zhp, NULL); + nvroot = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE); + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t **)&pcs, &c); + if (pcs != NULL && pcs->pcs_state == CS_CHECKPOINT_DISCARDING) + bytes_rem[ZPOOL_WAIT_CKPT_DISCARD] = pcs->pcs_space; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t **)&prs, &c); + if (prs != NULL && prs->prs_state == DSS_SCANNING) + bytes_rem[ZPOOL_WAIT_REMOVE] = prs->prs_to_copy - + prs->prs_copied; + + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&pss, &c); + if (pss != NULL && pss->pss_state == DSS_SCANNING && + pss->pss_pass_scrub_pause == 0) { + int64_t rem = pss->pss_to_examine - pss->pss_issued; + if (pss->pss_func == POOL_SCAN_SCRUB) + bytes_rem[ZPOOL_WAIT_SCRUB] = rem; + else + bytes_rem[ZPOOL_WAIT_RESILVER] = rem; + } else if (check_rebuilding(nvroot, NULL)) { + bytes_rem[ZPOOL_WAIT_RESILVER] = + vdev_activity_top_remaining(nvroot); + } + + bytes_rem[ZPOOL_WAIT_INITIALIZE] = + vdev_activity_remaining(nvroot, ZPOOL_WAIT_INITIALIZE); + bytes_rem[ZPOOL_WAIT_TRIM] = + vdev_activity_remaining(nvroot, ZPOOL_WAIT_TRIM); + + /* + * A replace finishes after resilvering finishes, so the amount of work + * left for a replace is the same as for resilvering. 
+ * + * It isn't quite correct to say that if we have any 'spare' or + * 'replacing' vdevs and a resilver is happening, then a replace is in + * progress, like we do here. When a hot spare is used, the faulted vdev + * is not removed after the hot spare is resilvered, so parent 'spare' + * vdev is not removed either. So we could have a 'spare' vdev, but be + * resilvering for a different reason. However, we use it as a heuristic + * because we don't have access to the DTLs, which could tell us whether + * or not we have really finished resilvering a hot spare. + */ + if (vdev_any_spare_replacing(nvroot)) + bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; + + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + char buf[64]; + if (!wd->wd_enabled[i]) + continue; + + if (wd->wd_exact) + (void) snprintf(buf, sizeof (buf), "%" PRIi64, + bytes_rem[i]); + else + zfs_nicenum(bytes_rem[i], buf, sizeof (buf)); + + if (wd->wd_scripted) + (void) printf(i == 0 ? "%s" : "\t%s", buf); + else + (void) printf(" %*s", col_widths[i] - 1, buf); + } + (void) printf("\n"); + (void) fflush(stdout); +} + +static void * +wait_status_thread(void *arg) +{ + wait_data_t *wd = (wait_data_t *)arg; + zpool_handle_t *zhp; + + if ((zhp = zpool_open(g_zfs, wd->wd_poolname)) == NULL) + return (void *)(1); + + for (int row = 0; ; row++) { + boolean_t missing; + struct timespec timeout; + int ret = 0; + (void) clock_gettime(CLOCK_REALTIME, &timeout); + + if (zpool_refresh_stats(zhp, &missing) != 0 || missing || + zpool_props_refresh(zhp) != 0) { + zpool_close(zhp); + return (void *)(uintptr_t)(missing ? 0 : 1); + } + + print_wait_status_row(wd, zhp, row); + + timeout.tv_sec += floor(wd->wd_interval); + long nanos = timeout.tv_nsec + + (wd->wd_interval - floor(wd->wd_interval)) * NANOSEC; + if (nanos >= NANOSEC) { + timeout.tv_sec++; + timeout.tv_nsec = nanos - NANOSEC; + } else { + timeout.tv_nsec = nanos; + } + pthread_mutex_lock(&wd->wd_mutex); + if (!wd->wd_should_exit) + ret = pthread_cond_timedwait(&wd->wd_cv, &wd->wd_mutex, + &timeout); + pthread_mutex_unlock(&wd->wd_mutex); + if (ret == 0) { + break; /* signaled by main thread */ + } else if (ret != ETIMEDOUT) { + (void) fprintf(stderr, gettext("pthread_cond_timedwait " + "failed: %s\n"), strerror(ret)); + zpool_close(zhp); + return (void *)(uintptr_t)(1); + } + } + + zpool_close(zhp); + return (void *)(0); +} + +int +zpool_do_wait(int argc, char **argv) +{ + boolean_t verbose = B_FALSE; + char c; + char *value; + int i; + unsigned long count; + pthread_t status_thr; + int error = 0; + zpool_handle_t *zhp; + + wait_data_t wd; + wd.wd_scripted = B_FALSE; + wd.wd_exact = B_FALSE; + wd.wd_headers_once = B_FALSE; + wd.wd_should_exit = B_FALSE; + + pthread_mutex_init(&wd.wd_mutex, NULL); + pthread_cond_init(&wd.wd_cv, NULL); + + /* By default, wait for all types of activity. 
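+ * The -t option handled below replaces this set with only the
+ * activities that were explicitly named.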
*/ + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) + wd.wd_enabled[i] = B_TRUE; + + while ((c = getopt(argc, argv, "HnpT:t:")) != -1) { + switch (c) { + case 'H': + wd.wd_scripted = B_TRUE; + break; + case 'n': + wd.wd_headers_once = B_TRUE; + break; + case 'p': + wd.wd_exact = B_TRUE; + break; + case 'T': + get_timestamp_arg(*optarg); + break; + case 't': + { + static char *col_subopts[] = { "discard", "free", + "initialize", "replace", "remove", "resilver", + "scrub", "trim", NULL }; + + /* Reset activities array */ + bzero(&wd.wd_enabled, sizeof (wd.wd_enabled)); + while (*optarg != '\0') { + int activity = getsubopt(&optarg, col_subopts, + &value); + + if (activity < 0) { + (void) fprintf(stderr, + gettext("invalid activity '%s'\n"), + value); + usage(B_FALSE); + } + + wd.wd_enabled[activity] = B_TRUE; + } + break; + } + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + get_interval_count(&argc, argv, &wd.wd_interval, &count); + if (count != 0) { + /* This subcmd only accepts an interval, not a count */ + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + if (wd.wd_interval != 0) + verbose = B_TRUE; + + if (argc < 1) { + (void) fprintf(stderr, gettext("missing 'pool' argument\n")); + usage(B_FALSE); + } + if (argc > 1) { + (void) fprintf(stderr, gettext("too many arguments\n")); + usage(B_FALSE); + } + + wd.wd_poolname = argv[0]; + + if ((zhp = zpool_open(g_zfs, wd.wd_poolname)) == NULL) + return (1); + + if (verbose) { + /* + * We use a separate thread for printing status updates because + * the main thread will call lzc_wait(), which blocks as long + * as an activity is in progress, which can be a long time. + */ + if (pthread_create(&status_thr, NULL, wait_status_thread, &wd) + != 0) { + (void) fprintf(stderr, gettext("failed to create status " + "thread: %s\n"), strerror(errno)); + zpool_close(zhp); + return (1); + } + } + + /* + * Loop over all activities that we are supposed to wait for until none + * of them are in progress. Note that this means we can end up waiting + * for more activities to complete than just those that were in progress + * when we began waiting; if an activity we are interested in begins + * while we are waiting for another activity, we will wait for both to + * complete before exiting.
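+ *
+ * zpool_wait_status() blocks until the activity it is asked about is
+ * no longer in progress, and reports through 'waited' whether it had
+ * to wait at all; once one full pass over the table waits for
+ * nothing, the loop exits.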
+ */ + for (;;) { + boolean_t missing = B_FALSE; + boolean_t any_waited = B_FALSE; + + for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { + boolean_t waited; + + if (!wd.wd_enabled[i]) + continue; + + error = zpool_wait_status(zhp, i, &missing, &waited); + if (error != 0 || missing) + break; + + any_waited = (any_waited || waited); + } + + if (error != 0 || missing || !any_waited) + break; + } + + zpool_close(zhp); + + if (verbose) { + uintptr_t status; + pthread_mutex_lock(&wd.wd_mutex); + wd.wd_should_exit = B_TRUE; + pthread_cond_signal(&wd.wd_cv); + pthread_mutex_unlock(&wd.wd_mutex); + (void) pthread_join(status_thr, (void *)&status); + if (status != 0) + error = status; + } + + pthread_mutex_destroy(&wd.wd_mutex); + pthread_cond_destroy(&wd.wd_cv); + return (error); +} + +static int +find_command_idx(char *command, int *idx) +{ + int i; + + for (i = 0; i < NCOMMAND; i++) { + if (command_table[i].name == NULL) + continue; + + if (strcmp(command, command_table[i].name) == 0) { + *idx = i; + return (0); + } + } + return (1); +} + +/* + * Display version message + */ +static int +zpool_do_version(int argc, char **argv) +{ + if (zfs_version_print() == -1) + return (1); + + return (0); +} + +int +main(int argc, char **argv) +{ + int ret = 0; + int i = 0; + char *cmdname; + char **newargv; + + (void) setlocale(LC_ALL, ""); + (void) textdomain(TEXT_DOMAIN); + srand(time(NULL)); + + opterr = 0; + + /* + * Make sure the user has specified some command. + */ + if (argc < 2) { + (void) fprintf(stderr, gettext("missing command\n")); + usage(B_FALSE); + } + + cmdname = argv[1]; + + /* + * Special case '-?' + */ + if ((strcmp(cmdname, "-?") == 0) || strcmp(cmdname, "--help") == 0) + usage(B_TRUE); + + /* + * Special case '-V|--version' + */ + if ((strcmp(cmdname, "-V") == 0) || (strcmp(cmdname, "--version") == 0)) + return (zpool_do_version(argc, argv)); + + if ((g_zfs = libzfs_init()) == NULL) { + (void) fprintf(stderr, "%s\n", libzfs_error_init(errno)); + return (1); + } + + libzfs_print_on_error(g_zfs, B_TRUE); + + zfs_save_arguments(argc, argv, history_str, sizeof (history_str)); + + /* + * Many commands modify input strings for string parsing reasons. + * We create a copy to protect the original argv. + */ + newargv = malloc((argc + 1) * sizeof (newargv[0])); + for (i = 0; i < argc; i++) + newargv[i] = strdup(argv[i]); + newargv[argc] = NULL; + + /* + * Run the appropriate command. + */ + if (find_command_idx(cmdname, &i) == 0) { + current_command = &command_table[i]; + ret = command_table[i].func(argc - 1, newargv + 1); + } else if (strchr(cmdname, '=')) { + verify(find_command_idx("set", &i) == 0); + current_command = &command_table[i]; + ret = command_table[i].func(argc, newargv); + } else if (strcmp(cmdname, "freeze") == 0 && argc == 3) { + /* + * 'freeze' is a vile debugging abomination, so we treat + * it as such. 
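+ * Freezing stops transaction group syncing on the pool, so anything
+ * written afterwards reaches stable storage only through the intent
+ * log; the test suite relies on this to exercise ZIL replay.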
+ */ + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, argv[2], sizeof (zc.zc_name)); + ret = zfs_ioctl(g_zfs, ZFS_IOC_POOL_FREEZE, &zc); + if (ret != 0) { + (void) fprintf(stderr, + gettext("failed to freeze pool: %d\n"), errno); + ret = 1; + } + + log_history = 0; + } else { + (void) fprintf(stderr, gettext("unrecognized " + "command '%s'\n"), cmdname); + usage(B_FALSE); + ret = 1; + } + + for (i = 0; i < argc; i++) + free(newargv[i]); + free(newargv); + + if (ret == 0 && log_history) + (void) zpool_log_history(g_zfs, history_str); + + libzfs_fini(g_zfs); + + /* + * The 'ZFS_ABORT' environment variable causes us to dump core on exit + * for the purposes of running ::findleaks. + */ + if (getenv("ZFS_ABORT") != NULL) { + (void) printf("dumping core by request\n"); + abort(); + } + + return (ret); +} diff --git a/cmd/zpool/zpool_util.c b/cmd/zpool/zpool_util.c new file mode 100644 index 000000000000..1c1eb024f365 --- /dev/null +++ b/cmd/zpool/zpool_util.c @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zpool_util.h" + +/* + * Utility function to guarantee malloc() success. + */ +void * +safe_malloc(size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) { + (void) fprintf(stderr, "internal error: out of memory\n"); + exit(1); + } + + return (data); +} + +/* + * Display an out of memory error message and abort the current program. + */ +void +zpool_no_memory(void) +{ + assert(errno == ENOMEM); + (void) fprintf(stderr, + gettext("internal error: out of memory\n")); + exit(1); +} + +/* + * Return the number of logs in supplied nvlist + */ +uint_t +num_logs(nvlist_t *nv) +{ + uint_t nlogs = 0; + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (0); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (is_log) + nlogs++; + } + return (nlogs); +} + +/* Find the max element in an array of uint64_t values */ +uint64_t +array64_max(uint64_t array[], unsigned int len) +{ + uint64_t max = 0; + int i; + for (i = 0; i < len; i++) + max = MAX(max, array[i]); + + return (max); +} + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + */ +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} + +/* + * Find lowest one bit set. 
+ * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + */ +int +lowbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (__builtin_ffsll(i)); +} diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h new file mode 100644 index 000000000000..265aa58953a0 --- /dev/null +++ b/cmd/zpool/zpool_util.h @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef ZPOOL_UTIL_H +#define ZPOOL_UTIL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* Path to scripts you can run with "zpool status/iostat -c" */ +#define ZPOOL_SCRIPTS_DIR SYSCONFDIR"/zfs/zpool.d" + +/* + * Basic utility functions + */ +void *safe_malloc(size_t); +void zpool_no_memory(void); +uint_t num_logs(nvlist_t *nv); +uint64_t array64_max(uint64_t array[], unsigned int len); +int highbit64(uint64_t i); +int lowbit64(uint64_t i); + +/* + * Misc utility functions + */ +char *zpool_get_cmd_search_path(void); + +/* + * Virtual device functions + */ + +nvlist_t *make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, + int check_rep, boolean_t replacing, boolean_t dryrun, int argc, + char **argv); +nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, + nvlist_t *props, splitflags_t flags, int argc, char **argv); + +/* + * Pool list functions + */ +int for_each_pool(int, char **, boolean_t unavail, zprop_list_t **, + zpool_iter_f, void *); + +/* Vdev list functions */ +typedef int (*pool_vdev_iter_f)(zpool_handle_t *, nvlist_t *, void *); +int for_each_vdev(zpool_handle_t *zhp, pool_vdev_iter_f func, void *data); + +typedef struct zpool_list zpool_list_t; + +zpool_list_t *pool_list_get(int, char **, zprop_list_t **, int *); +void pool_list_update(zpool_list_t *); +int pool_list_iter(zpool_list_t *, int unavail, zpool_iter_f, void *); +void pool_list_free(zpool_list_t *); +int pool_list_count(zpool_list_t *); +void pool_list_remove(zpool_list_t *, zpool_handle_t *); + +extern libzfs_handle_t *g_zfs; + + +typedef struct vdev_cmd_data +{ + char **lines; /* Array of lines of output, minus the column name */ + int lines_cnt; /* Number of lines in the array */ + + char **cols; /* Array of column names */ + int cols_cnt; /* Number of column names */ + + + char *path; /* vdev path */ + char *upath; /* vdev underlying path */ + char *pool; /* Pool name */ + char *cmd; /* backpointer to cmd */ + char *vdev_enc_sysfs_path; /* enclosure sysfs path (if any) */ +} vdev_cmd_data_t; + +typedef struct vdev_cmd_data_list +{ + char *cmd; /* Command to run */ + unsigned int count; /* Number of vdev_cmd_data items (vdevs) */ + + /* fields used to select only certain vdevs, if requested */ + 
libzfs_handle_t *g_zfs;
+	char **vdev_names;
+	int vdev_names_count;
+	int cb_name_flags;
+
+	vdev_cmd_data_t *data;	/* Array of vdevs */
+
+	/* List of unique column names and widths */
+	char **uniq_cols;
+	int uniq_cols_cnt;
+	int *uniq_cols_width;
+
+} vdev_cmd_data_list_t;
+
+vdev_cmd_data_list_t *all_pools_for_each_vdev_run(int argc, char **argv,
+    char *cmd, libzfs_handle_t *g_zfs, char **vdev_names, int vdev_names_count,
+    int cb_name_flags);
+
+void free_vdev_cmd_data_list(vdev_cmd_data_list_t *vcdl);
+
+int check_device(const char *path, boolean_t force,
+    boolean_t isspare, boolean_t iswholedisk);
+boolean_t check_sector_size_database(char *path, int *sector_size);
+void vdev_error(const char *fmt, ...);
+int check_file(const char *file, boolean_t force, boolean_t isspare);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* ZPOOL_UTIL_H */
diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c
new file mode 100644
index 000000000000..9aa09b18c4ae
--- /dev/null
+++ b/cmd/zpool/zpool_vdev.c
@@ -0,0 +1,1581 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2017 Intel Corporation.
+ * Copyright 2016 Igor Kozhukhov.
+ */
+
+/*
+ * Functions to convert between a list of vdevs and an nvlist representing the
+ * configuration. Each entry in the list can be one of:
+ *
+ *	Device vdevs
+ *		disk=(path=..., devid=...)
+ *		file=(path=...)
+ *
+ *	Group vdevs
+ *		raidz[1|2]=(...)
+ *		mirror=(...)
+ *
+ *	Hot spares
+ *
+ * While the underlying implementation supports it, group vdevs cannot contain
+ * other group vdevs. All userland verification of devices is contained within
+ * this file. If successful, the nvlist returned can be passed directly to the
+ * kernel; we've done as much verification as possible in userland.
+ *
+ * Hot spares are a special case, and passed down as an array of disk vdevs, at
+ * the same level as the root of the vdev tree.
+ *
+ * The only function exported by this file is 'make_root_vdev'. The
+ * function performs several passes:
+ *
+ *	1. Construct the vdev specification. Performs syntax validation and
+ *	   makes sure each device is valid.
+ *	2. Check for devices in use. Uses libblkid to make sure that no
+ *	   devices are already in use. Some checks can be overridden using the
+ *	   'force' flag, others cannot.
+ *	3. Check for replication errors if the 'force' flag is not specified.
+ *	   Validates that the replication level is consistent across the
+ *	   entire pool.
+ *	4. Call libzfs to label any whole disks with an EFI label.
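+ *
+ * As a rough usage sketch (hypothetical caller, error handling elided):
+ *
+ *	nvlist_t *nvroot = make_root_vdev(zhp, props, 0, 1, B_FALSE,
+ *	    B_FALSE, argc, argv);
+ *	if (nvroot != NULL) {
+ *		(void) zpool_create(g_zfs, "tank", nvroot, NULL, NULL);
+ *		nvlist_free(nvroot);
+ *	}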
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "zpool_util.h"
+#include
+#include
+
+/*
+ * For any given vdev specification, we can have multiple errors. The
+ * vdev_error() function keeps track of whether we have seen an error yet, and
+ * prints out a header if it's the first error we've seen.
+ */
+boolean_t error_seen;
+boolean_t is_force;
+
+
+
+
+/*PRINTFLIKE1*/
+void
+vdev_error(const char *fmt, ...)
+{
+	va_list ap;
+
+	if (!error_seen) {
+		(void) fprintf(stderr, gettext("invalid vdev specification\n"));
+		if (!is_force)
+			(void) fprintf(stderr, gettext("use '-f' to override "
+			    "the following errors:\n"));
+		else
+			(void) fprintf(stderr, gettext("the following errors "
+			    "must be manually repaired:\n"));
+		error_seen = B_TRUE;
+	}
+
+	va_start(ap, fmt);
+	(void) vfprintf(stderr, fmt, ap);
+	va_end(ap);
+}
+
+/*
+ * Check that a file is valid. All we can do in this case is check that it's
+ * not in use by another pool, and not in use by swap.
+ */
+int
+check_file(const char *file, boolean_t force, boolean_t isspare)
+{
+	char *name;
+	int fd;
+	int ret = 0;
+	pool_state_t state;
+	boolean_t inuse;
+
+	if ((fd = open(file, O_RDONLY)) < 0)
+		return (0);
+
+	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) == 0 && inuse) {
+		const char *desc;
+
+		switch (state) {
+		case POOL_STATE_ACTIVE:
+			desc = gettext("active");
+			break;
+
+		case POOL_STATE_EXPORTED:
+			desc = gettext("exported");
+			break;
+
+		case POOL_STATE_POTENTIALLY_ACTIVE:
+			desc = gettext("potentially active");
+			break;
+
+		default:
+			desc = gettext("unknown");
+			break;
+		}
+
+		/*
+		 * Allow hot spares to be shared between pools.
+		 */
+		if (state == POOL_STATE_SPARE && isspare) {
+			free(name);
+			(void) close(fd);
+			return (0);
+		}
+
+		if (state == POOL_STATE_ACTIVE ||
+		    state == POOL_STATE_SPARE || !force) {
+			switch (state) {
+			case POOL_STATE_SPARE:
+				vdev_error(gettext("%s is reserved as a hot "
+				    "spare for pool %s\n"), file, name);
+				break;
+			default:
+				vdev_error(gettext("%s is part of %s pool "
+				    "'%s'\n"), file, desc, name);
+				break;
+			}
+			ret = -1;
+		}
+
+		free(name);
+	}
+
+	(void) close(fd);
+	return (ret);
+}
+
+/*
+ * This may be a shorthand device path or it could be total gibberish.
+ * Check to see if it is a known device available in zfs_vdev_paths.
+ * As part of this check, see if we've been given an entire disk
+ * (minus the slice number).
+ */
+static int
+is_shorthand_path(const char *arg, char *path, size_t path_size,
+    struct stat64 *statbuf, boolean_t *wholedisk)
+{
+	int error;
+
+	error = zfs_resolve_shortname(arg, path, path_size);
+	if (error == 0) {
+		*wholedisk = zfs_dev_is_whole_disk(path);
+		if (*wholedisk || (stat64(path, statbuf) == 0))
+			return (0);
+	}
+
+	strlcpy(path, arg, path_size);
+	memset(statbuf, 0, sizeof (*statbuf));
+	*wholedisk = B_FALSE;
+
+	return (error);
+}
+
+/*
+ * Determine if the given path is a hot spare within the given configuration.
+ * If no configuration is given we rely solely on the label.
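+ *
+ * The check is two-step: zpool_in_use() must report POOL_STATE_SPARE
+ * for the device's label and, when a pool configuration is supplied,
+ * the label's GUID must also appear in that configuration's
+ * ZPOOL_CONFIG_SPARES array.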
+ */
+static boolean_t
+is_spare(nvlist_t *config, const char *path)
+{
+	int fd;
+	pool_state_t state;
+	char *name = NULL;
+	nvlist_t *label;
+	uint64_t guid, spareguid;
+	nvlist_t *nvroot;
+	nvlist_t **spares;
+	uint_t i, nspares;
+	boolean_t inuse;
+
+	if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0)
+		return (B_FALSE);
+
+	if (zpool_in_use(g_zfs, fd, &state, &name, &inuse) != 0 ||
+	    !inuse ||
+	    state != POOL_STATE_SPARE ||
+	    zpool_read_label(fd, &label, NULL) != 0) {
+		free(name);
+		(void) close(fd);
+		return (B_FALSE);
+	}
+	free(name);
+	(void) close(fd);
+
+	if (config == NULL) {
+		nvlist_free(label);
+		return (B_TRUE);
+	}
+
+	verify(nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) == 0);
+	nvlist_free(label);
+
+	verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE,
+	    &nvroot) == 0);
+	if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+	    &spares, &nspares) == 0) {
+		for (i = 0; i < nspares; i++) {
+			verify(nvlist_lookup_uint64(spares[i],
+			    ZPOOL_CONFIG_GUID, &spareguid) == 0);
+			if (spareguid == guid)
+				return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Create a leaf vdev. Determine if this is a file or a device. If it's a
+ * device, fill in the device id to make a complete nvlist. Valid forms for a
+ * leaf vdev are:
+ *
+ *	/dev/xxx	Complete disk path
+ *	/xxx		Full path to file
+ *	xxx		Shorthand for /xxx
+ */
+static nvlist_t *
+make_leaf_vdev(nvlist_t *props, const char *arg, uint64_t is_log)
+{
+	char path[MAXPATHLEN];
+	struct stat64 statbuf;
+	nvlist_t *vdev = NULL;
+	char *type = NULL;
+	boolean_t wholedisk = B_FALSE;
+	uint64_t ashift = 0;
+	int err;
+
+	/*
+	 * Determine what type of vdev this is, and put the full path into
+	 * 'path'. We detect whether this is a device or file afterwards by
+	 * checking the st_mode of the file.
+	 */
+	if (arg[0] == '/') {
+		/*
+		 * Complete device or file path. Exact type is determined by
+		 * examining the file descriptor afterwards. Symbolic links
+		 * are resolved to their real paths to determine whole disk
+		 * and S_ISBLK/S_ISREG type checks. However, we are careful
+		 * to store the given path as ZPOOL_CONFIG_PATH to ensure we
+		 * can leverage udev's persistent device labels.
+		 */
+		if (realpath(arg, path) == NULL) {
+			(void) fprintf(stderr,
+			    gettext("cannot resolve path '%s'\n"), arg);
+			return (NULL);
+		}
+
+		wholedisk = zfs_dev_is_whole_disk(path);
+		if (!wholedisk && (stat64(path, &statbuf) != 0)) {
+			(void) fprintf(stderr,
+			    gettext("cannot open '%s': %s\n"),
+			    path, strerror(errno));
+			return (NULL);
+		}
+
+		/* After whole disk check restore original passed path */
+		strlcpy(path, arg, sizeof (path));
+	} else {
+		err = is_shorthand_path(arg, path, sizeof (path),
+		    &statbuf, &wholedisk);
+		if (err != 0) {
+			/*
+			 * If we got ENOENT, then the user gave us
+			 * gibberish, so try to direct them with a
+			 * reasonable error message. Otherwise,
+			 * regurgitate strerror() since it's the best we
+			 * can do.
+			 */
+			if (err == ENOENT) {
+				(void) fprintf(stderr,
+				    gettext("cannot open '%s': no such "
+				    "device in %s\n"), arg, DISK_ROOT);
+				(void) fprintf(stderr,
+				    gettext("must be a full path or "
+				    "shorthand device name\n"));
+				return (NULL);
+			} else {
+				(void) fprintf(stderr,
+				    gettext("cannot open '%s': %s\n"),
+				    path, strerror(errno));
+				return (NULL);
+			}
+		}
+	}
+
+	/*
+	 * Determine whether this is a device or a file.
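+	 * For example (hypothetical names): the shorthand "sda" would be
+	 * resolved by is_shorthand_path() to "/dev/sda" and become a
+	 * VDEV_TYPE_DISK leaf, while a regular file such as "/tank/vdev0"
+	 * would become a VDEV_TYPE_FILE leaf.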
+	 */
+	if (wholedisk || S_ISBLK(statbuf.st_mode)) {
+		type = VDEV_TYPE_DISK;
+	} else if (S_ISREG(statbuf.st_mode)) {
+		type = VDEV_TYPE_FILE;
+	} else {
+		(void) fprintf(stderr, gettext("cannot use '%s': must be a "
+		    "block device or regular file\n"), path);
+		return (NULL);
+	}
+
+	/*
+	 * Finally, we have the complete device or file, and we know that it is
+	 * acceptable to use. Construct the nvlist to describe this vdev. All
+	 * vdevs have a 'path' element, and devices also have a 'devid' element.
+	 */
+	verify(nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0);
+	verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0);
+	verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_LOG, is_log) == 0);
+	if (is_log)
+		verify(nvlist_add_string(vdev, ZPOOL_CONFIG_ALLOCATION_BIAS,
+		    VDEV_ALLOC_BIAS_LOG) == 0);
+	if (strcmp(type, VDEV_TYPE_DISK) == 0)
+		verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK,
+		    (uint64_t)wholedisk) == 0);
+
+	/*
+	 * Override defaults if custom properties are provided.
+	 */
+	if (props != NULL) {
+		char *value = NULL;
+
+		if (nvlist_lookup_string(props,
+		    zpool_prop_to_name(ZPOOL_PROP_ASHIFT), &value) == 0) {
+			if (zfs_nicestrtonum(NULL, value, &ashift) != 0) {
+				(void) fprintf(stderr,
+				    gettext("ashift must be a number.\n"));
+				return (NULL);
+			}
+			if (ashift != 0 &&
+			    (ashift < ASHIFT_MIN || ashift > ASHIFT_MAX)) {
+				(void) fprintf(stderr,
+				    gettext("invalid 'ashift=%" PRIu64 "' "
+				    "property: only values between %" PRId32 " "
+				    "and %" PRId32 " are allowed.\n"),
+				    ashift, ASHIFT_MIN, ASHIFT_MAX);
+				return (NULL);
+			}
+		}
+	}
+
+	/*
+	 * If the device is known to incorrectly report its physical sector
+	 * size, explicitly provide the known correct value.
+	 */
+	if (ashift == 0) {
+		int sector_size;
+
+		if (check_sector_size_database(path, &sector_size) == B_TRUE)
+			ashift = highbit64(sector_size) - 1;
+	}
+
+	if (ashift > 0)
+		(void) nvlist_add_uint64(vdev, ZPOOL_CONFIG_ASHIFT, ashift);
+
+	return (vdev);
+}
+
+/*
+ * Go through and verify that the replication level of the pool is consistent.
+ * Performs the following checks:
+ *
+ *	For the new spec, verifies that devices in mirrors and raidz are the
+ *	same size.
+ *
+ *	If the current configuration already has inconsistent replication
+ *	levels, ignore any other potential problems in the new spec.
+ *
+ *	Otherwise, make sure that the current spec (if there is one) and the
+ *	new spec have consistent replication levels.
+ *
+ *	If there is no current spec (create), make sure the new spec has at
+ *	least one general purpose vdev.
+ */
+typedef struct replication_level {
+	char *zprl_type;
+	uint64_t zprl_children;
+	uint64_t zprl_parity;
+} replication_level_t;
+
+#define	ZPOOL_FUZZ	(16 * 1024 * 1024)
+
+static boolean_t
+is_raidz_mirror(replication_level_t *a, replication_level_t *b,
+    replication_level_t **raidz, replication_level_t **mirror)
+{
+	if (strcmp(a->zprl_type, "raidz") == 0 &&
+	    strcmp(b->zprl_type, "mirror") == 0) {
+		*raidz = a;
+		*mirror = b;
+		return (B_TRUE);
+	}
+	return (B_FALSE);
+}
+
+/*
+ * Given a list of toplevel vdevs, return the current replication level. If
+ * the config is inconsistent, then NULL is returned. If 'fatal' is set, then
+ * an error message will be displayed for each self-inconsistent vdev.
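+ *
+ * For example (illustrative values), a top-level 2-way mirror is
+ * summarized as
+ *
+ *	rep.zprl_type = "mirror";
+ *	rep.zprl_children = 2;
+ *	rep.zprl_parity = 0;
+ *
+ * while a six-disk raidz2 yields type "raidz", children 6, parity 2.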
+ */ +static replication_level_t * +get_replication(nvlist_t *nvroot, boolean_t fatal) +{ + nvlist_t **top; + uint_t t, toplevels; + nvlist_t **child; + uint_t c, children; + nvlist_t *nv; + char *type; + replication_level_t lastrep = {0}; + replication_level_t rep; + replication_level_t *ret; + replication_level_t *raidz, *mirror; + boolean_t dontreport; + + ret = safe_malloc(sizeof (replication_level_t)); + + verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &top, &toplevels) == 0); + + for (t = 0; t < toplevels; t++) { + uint64_t is_log = B_FALSE; + + nv = top[t]; + + /* + * For separate logs we ignore the top level vdev replication + * constraints. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); + if (is_log) + continue; + + /* Ignore holes introduced by removing aux devices */ + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (strcmp(type, VDEV_TYPE_HOLE) == 0) + continue; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + /* + * This is a 'file' or 'disk' vdev. + */ + rep.zprl_type = type; + rep.zprl_children = 1; + rep.zprl_parity = 0; + } else { + int64_t vdev_size; + + /* + * This is a mirror or RAID-Z vdev. Go through and make + * sure the contents are all the same (files vs. disks), + * keeping track of the number of elements in the + * process. + * + * We also check that the size of each vdev (if it can + * be determined) is the same. + */ + rep.zprl_type = type; + rep.zprl_children = 0; + + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_NPARITY, + &rep.zprl_parity) == 0); + assert(rep.zprl_parity != 0); + } else { + rep.zprl_parity = 0; + } + + /* + * The 'dontreport' variable indicates that we've + * already reported an error for this spec, so don't + * bother doing it again. + */ + type = NULL; + dontreport = 0; + vdev_size = -1LL; + for (c = 0; c < children; c++) { + nvlist_t *cnv = child[c]; + char *path; + struct stat64 statbuf; + int64_t size = -1LL; + char *childtype; + int fd, err; + + rep.zprl_children++; + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_TYPE, &childtype) == 0); + + /* + * If this is a replacing or spare vdev, then + * get the real first child of the vdev: do this + * in a loop because replacing and spare vdevs + * can be nested. + */ + while (strcmp(childtype, + VDEV_TYPE_REPLACING) == 0 || + strcmp(childtype, VDEV_TYPE_SPARE) == 0) { + nvlist_t **rchild; + uint_t rchildren; + + verify(nvlist_lookup_nvlist_array(cnv, + ZPOOL_CONFIG_CHILDREN, &rchild, + &rchildren) == 0); + assert(rchildren == 2); + cnv = rchild[0]; + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_TYPE, + &childtype) == 0); + } + + verify(nvlist_lookup_string(cnv, + ZPOOL_CONFIG_PATH, &path) == 0); + + /* + * If we have a raidz/mirror that combines disks + * with files, report it as an error. + */ + if (!dontreport && type != NULL && + strcmp(type, childtype) != 0) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: %s contains both " + "files and devices\n"), + rep.zprl_type); + else + return (NULL); + dontreport = B_TRUE; + } + + /* + * According to stat(2), the value of 'st_size' + * is undefined for block devices and character + * devices. But there is no effective way to + * determine the real size in userland. + * + * Instead, we'll take advantage of an + * implementation detail of spec_size(). 
If the + * device is currently open, then we (should) + * return a valid size. + * + * If we still don't get a valid size (indicated + * by a size of 0 or MAXOFFSET_T), then ignore + * this device altogether. + */ + if ((fd = open(path, O_RDONLY)) >= 0) { + err = fstat64_blk(fd, &statbuf); + (void) close(fd); + } else { + err = stat64(path, &statbuf); + } + + if (err != 0 || + statbuf.st_size == 0 || + statbuf.st_size == MAXOFFSET_T) + continue; + + size = statbuf.st_size; + + /* + * Also make sure that devices and + * slices have a consistent size. If + * they differ by a significant amount + * (~16MB) then report an error. + */ + if (!dontreport && + (vdev_size != -1LL && + (llabs(size - vdev_size) > + ZPOOL_FUZZ))) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "%s contains devices of " + "different sizes\n"), + rep.zprl_type); + else + return (NULL); + dontreport = B_TRUE; + } + + type = childtype; + vdev_size = size; + } + } + + /* + * At this point, we have the replication of the last toplevel + * vdev in 'rep'. Compare it to 'lastrep' to see if it is + * different. + */ + if (lastrep.zprl_type != NULL) { + if (is_raidz_mirror(&lastrep, &rep, &raidz, &mirror) || + is_raidz_mirror(&rep, &lastrep, &raidz, &mirror)) { + /* + * Accepted raidz and mirror when they can + * handle the same number of disk failures. + */ + if (raidz->zprl_parity != + mirror->zprl_children - 1) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication " + "level: " + "%s and %s vdevs with " + "different redundancy, " + "%llu vs. %llu (%llu-way) " + "are present\n"), + raidz->zprl_type, + mirror->zprl_type, + raidz->zprl_parity, + mirror->zprl_children - 1, + mirror->zprl_children); + else + return (NULL); + } + } else if (strcmp(lastrep.zprl_type, rep.zprl_type) != + 0) { + if (ret != NULL) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %s and %s vdevs are " + "present\n"), + lastrep.zprl_type, rep.zprl_type); + else + return (NULL); + } else if (lastrep.zprl_parity != rep.zprl_parity) { + if (ret) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %llu and %llu device parity " + "%s vdevs are present\n"), + lastrep.zprl_parity, + rep.zprl_parity, + rep.zprl_type); + else + return (NULL); + } else if (lastrep.zprl_children != rep.zprl_children) { + if (ret) + free(ret); + ret = NULL; + if (fatal) + vdev_error(gettext( + "mismatched replication level: " + "both %llu-way and %llu-way %s " + "vdevs are present\n"), + lastrep.zprl_children, + rep.zprl_children, + rep.zprl_type); + else + return (NULL); + } + } + lastrep = rep; + } + + if (ret != NULL) + *ret = rep; + + return (ret); +} + +/* + * Check the replication level of the vdev spec against the current pool. Calls + * get_replication() to make sure the new spec is self-consistent. If the pool + * has a consistent replication level, then we ignore any errors. Otherwise, + * report any difference between the two. + */ +static int +check_replication(nvlist_t *config, nvlist_t *newroot) +{ + nvlist_t **child; + uint_t children; + replication_level_t *current = NULL, *new; + replication_level_t *raidz, *mirror; + int ret; + + /* + * If we have a current pool configuration, check to see if it's + * self-consistent. If not, simply return success. 
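+	 *
+	 * For example (hypothetical pool), adding a raidz2 top-level vdev
+	 * to a pool built from 2-way mirrors is rejected, because the
+	 * raidz parity (2) does not match zprl_children - 1 of the
+	 * mirrors (1); adding raidz1 instead would pass, since both
+	 * tolerate a single device failure.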
+ */ + if (config != NULL) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if ((current = get_replication(nvroot, B_FALSE)) == NULL) + return (0); + } + /* + * for spares there may be no children, and therefore no + * replication level to check + */ + if ((nvlist_lookup_nvlist_array(newroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) || (children == 0)) { + free(current); + return (0); + } + + /* + * If all we have is logs then there's no replication level to check. + */ + if (num_logs(newroot) == children) { + free(current); + return (0); + } + + /* + * Get the replication level of the new vdev spec, reporting any + * inconsistencies found. + */ + if ((new = get_replication(newroot, B_TRUE)) == NULL) { + free(current); + return (-1); + } + + /* + * Check to see if the new vdev spec matches the replication level of + * the current pool. + */ + ret = 0; + if (current != NULL) { + if (is_raidz_mirror(current, new, &raidz, &mirror) || + is_raidz_mirror(new, current, &raidz, &mirror)) { + if (raidz->zprl_parity != mirror->zprl_children - 1) { + vdev_error(gettext( + "mismatched replication level: pool and " + "new vdev with different redundancy, %s " + "and %s vdevs, %llu vs. %llu (%llu-way)\n"), + raidz->zprl_type, + mirror->zprl_type, + raidz->zprl_parity, + mirror->zprl_children - 1, + mirror->zprl_children); + ret = -1; + } + } else if (strcmp(current->zprl_type, new->zprl_type) != 0) { + vdev_error(gettext( + "mismatched replication level: pool uses %s " + "and new vdev is %s\n"), + current->zprl_type, new->zprl_type); + ret = -1; + } else if (current->zprl_parity != new->zprl_parity) { + vdev_error(gettext( + "mismatched replication level: pool uses %llu " + "device parity and new vdev uses %llu\n"), + current->zprl_parity, new->zprl_parity); + ret = -1; + } else if (current->zprl_children != new->zprl_children) { + vdev_error(gettext( + "mismatched replication level: pool uses %llu-way " + "%s and new vdev uses %llu-way %s\n"), + current->zprl_children, current->zprl_type, + new->zprl_children, new->zprl_type); + ret = -1; + } + } + + free(new); + if (current != NULL) + free(current); + + return (ret); +} + +static int +zero_label(char *path) +{ + const int size = 4096; + char buf[size]; + int err, fd; + + if ((fd = open(path, O_WRONLY|O_EXCL)) < 0) { + (void) fprintf(stderr, gettext("cannot open '%s': %s\n"), + path, strerror(errno)); + return (-1); + } + + memset(buf, 0, size); + err = write(fd, buf, size); + (void) fdatasync(fd); + (void) close(fd); + + if (err == -1) { + (void) fprintf(stderr, gettext("cannot zero first %d bytes " + "of '%s': %s\n"), size, path, strerror(errno)); + return (-1); + } + + if (err != size) { + (void) fprintf(stderr, gettext("could only zero %d/%d bytes " + "of '%s'\n"), err, size, path); + return (-1); + } + + return (0); +} + +/* + * Go through and find any whole disks in the vdev specification, labelling them + * as appropriate. When constructing the vdev spec, we were unable to open this + * device in order to provide a devid. Now that we have labelled the disk and + * know that slice 0 is valid, we can construct the devid now. + * + * If the disk was already labeled with an EFI label, we will have gotten the + * devid already (because we were able to open the whole disk). Otherwise, we + * need to get the devid after we label the disk. 
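+ *
+ * For example (hypothetical path), a whole disk given as
+ * /dev/disk/by-id/wwn-0x5000c500a0b1c2d3 is labeled here and its
+ * ZPOOL_CONFIG_PATH is then updated via zfs_append_partition() to
+ * point at the newly created first partition of that link.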
+ */ +static int +make_disks(zpool_handle_t *zhp, nvlist_t *nv) +{ + nvlist_t **child; + uint_t c, children; + char *type, *path; + char devpath[MAXPATHLEN]; + char udevpath[MAXPATHLEN]; + uint64_t wholedisk; + struct stat64 statbuf; + int is_exclusive = 0; + int fd; + int ret; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + + if (strcmp(type, VDEV_TYPE_DISK) != 0) + return (0); + + /* + * We have a disk device. If this is a whole disk write + * out the efi partition table, otherwise write zero's to + * the first 4k of the partition. This is to ensure that + * libblkid will not misidentify the partition due to a + * magic value left by the previous filesystem. + */ + verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); + verify(!nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk)); + + if (!wholedisk) { + /* + * Update device id string for mpath nodes (Linux only) + */ + if (is_mpath_whole_disk(path)) + update_vdev_config_dev_strs(nv); + + if (!is_spare(NULL, path)) + (void) zero_label(path); + return (0); + } + + if (realpath(path, devpath) == NULL) { + ret = errno; + (void) fprintf(stderr, + gettext("cannot resolve path '%s'\n"), path); + return (ret); + } + + /* + * Remove any previously existing symlink from a udev path to + * the device before labeling the disk. This ensures that + * only newly created links are used. Otherwise there is a + * window between when udev deletes and recreates the link + * during which access attempts will fail with ENOENT. + */ + strlcpy(udevpath, path, MAXPATHLEN); + (void) zfs_append_partition(udevpath, MAXPATHLEN); + + fd = open(devpath, O_RDWR|O_EXCL); + if (fd == -1) { + if (errno == EBUSY) + is_exclusive = 1; +#ifdef __FreeBSD__ + if (errno == EPERM) + is_exclusive = 1; +#endif + } else { + (void) close(fd); + } + + /* + * If the partition exists, contains a valid spare label, + * and is opened exclusively there is no need to partition + * it. Hot spares have already been partitioned and are + * held open exclusively by the kernel as a safety measure. + * + * If the provided path is for a /dev/disk/ device its + * symbolic link will be removed, partition table created, + * and then block until udev creates the new link. + */ + if (!is_exclusive && !is_spare(NULL, udevpath)) { + char *devnode = strrchr(devpath, '/') + 1; + + ret = strncmp(udevpath, UDISK_ROOT, strlen(UDISK_ROOT)); + if (ret == 0) { + ret = lstat64(udevpath, &statbuf); + if (ret == 0 && S_ISLNK(statbuf.st_mode)) + (void) unlink(udevpath); + } + + /* + * When labeling a pool the raw device node name + * is provided as it appears under /dev/. + */ + if (zpool_label_disk(g_zfs, zhp, devnode) == -1) + return (-1); + + /* + * Wait for udev to signal the device is available + * by the provided path. + */ + ret = zpool_label_disk_wait(udevpath, DISK_LABEL_WAIT); + if (ret) { + (void) fprintf(stderr, + gettext("missing link: %s was " + "partitioned but %s is missing\n"), + devnode, udevpath); + return (ret); + } + + ret = zero_label(udevpath); + if (ret) + return (ret); + } + + /* + * Update the path to refer to the partition. The presence of + * the 'whole_disk' field indicates to the CLI that we should + * chop off the partition number when displaying the device in + * future output. 
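+	 *
+	 * For example, a leaf stored with ZPOOL_CONFIG_PATH of /dev/sda1
+	 * and whole_disk set is displayed simply as "sda" by the CLI.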
+ */ + verify(nvlist_add_string(nv, ZPOOL_CONFIG_PATH, udevpath) == 0); + + /* + * Update device id strings for whole disks (Linux only) + */ + update_vdev_config_dev_strs(nv); + + return (0); + } + + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) + for (c = 0; c < children; c++) + if ((ret = make_disks(zhp, child[c])) != 0) + return (ret); + + return (0); +} + +/* + * Go through and find any devices that are in use. We rely on libdiskmgt for + * the majority of this task. + */ +static boolean_t +is_device_in_use(nvlist_t *config, nvlist_t *nv, boolean_t force, + boolean_t replacing, boolean_t isspare) +{ + nvlist_t **child; + uint_t c, children; + char *type, *path; + int ret = 0; + char buf[MAXPATHLEN]; + uint64_t wholedisk = B_FALSE; + boolean_t anyinuse = B_FALSE; + + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + + verify(!nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path)); + if (strcmp(type, VDEV_TYPE_DISK) == 0) + verify(!nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_WHOLE_DISK, &wholedisk)); + + /* + * As a generic check, we look to see if this is a replace of a + * hot spare within the same pool. If so, we allow it + * regardless of what libblkid or zpool_in_use() says. + */ + if (replacing) { + (void) strlcpy(buf, path, sizeof (buf)); + if (wholedisk) { + ret = zfs_append_partition(buf, sizeof (buf)); + if (ret == -1) + return (-1); + } + + if (is_spare(config, buf)) + return (B_FALSE); + } + + if (strcmp(type, VDEV_TYPE_DISK) == 0) + ret = check_device(path, force, isspare, wholedisk); + + else if (strcmp(type, VDEV_TYPE_FILE) == 0) + ret = check_file(path, force, isspare); + + return (ret != 0); + } + + for (c = 0; c < children; c++) + if (is_device_in_use(config, child[c], force, replacing, + B_FALSE)) + anyinuse = B_TRUE; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, + &child, &children) == 0) + for (c = 0; c < children; c++) + if (is_device_in_use(config, child[c], force, replacing, + B_TRUE)) + anyinuse = B_TRUE; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, + &child, &children) == 0) + for (c = 0; c < children; c++) + if (is_device_in_use(config, child[c], force, replacing, + B_FALSE)) + anyinuse = B_TRUE; + + return (anyinuse); +} + +static const char * +is_grouping(const char *type, int *mindev, int *maxdev) +{ + if (strncmp(type, "raidz", 5) == 0) { + const char *p = type + 5; + char *end; + long nparity; + + if (*p == '\0') { + nparity = 1; + } else if (*p == '0') { + return (NULL); /* no zero prefixes allowed */ + } else { + errno = 0; + nparity = strtol(p, &end, 10); + if (errno != 0 || nparity < 1 || nparity >= 255 || + *end != '\0') + return (NULL); + } + + if (mindev != NULL) + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; + return (VDEV_TYPE_RAIDZ); + } + + if (maxdev != NULL) + *maxdev = INT_MAX; + + if (strcmp(type, "mirror") == 0) { + if (mindev != NULL) + *mindev = 2; + return (VDEV_TYPE_MIRROR); + } + + if (strcmp(type, "spare") == 0) { + if (mindev != NULL) + *mindev = 1; + return (VDEV_TYPE_SPARE); + } + + if (strcmp(type, "log") == 0) { + if (mindev != NULL) + *mindev = 1; + return 
(VDEV_TYPE_LOG); + } + + if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0 || + strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { + if (mindev != NULL) + *mindev = 1; + return (type); + } + + if (strcmp(type, "cache") == 0) { + if (mindev != NULL) + *mindev = 1; + return (VDEV_TYPE_L2CACHE); + } + + return (NULL); +} + +/* + * Construct a syntactically valid vdev specification, + * and ensure that all devices and files exist and can be opened. + * Note: we don't bother freeing anything in the error paths + * because the program is just going to exit anyway. + */ +static nvlist_t * +construct_spec(nvlist_t *props, int argc, char **argv) +{ + nvlist_t *nvroot, *nv, **top, **spares, **l2cache; + int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; + const char *type; + uint64_t is_log, is_special, is_dedup; + boolean_t seen_logs; + + top = NULL; + toplevels = 0; + spares = NULL; + l2cache = NULL; + nspares = 0; + nlogs = 0; + nl2cache = 0; + is_log = is_special = is_dedup = B_FALSE; + seen_logs = B_FALSE; + nvroot = NULL; + + while (argc > 0) { + nv = NULL; + + /* + * If it's a mirror or raidz, the subsequent arguments are + * its leaves -- until we encounter the next mirror or raidz. + */ + if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { + nvlist_t **child = NULL; + int c, children = 0; + + if (strcmp(type, VDEV_TYPE_SPARE) == 0) { + if (spares != NULL) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'spare' can be " + "specified only once\n")); + goto spec_out; + } + is_log = is_special = is_dedup = B_FALSE; + } + + if (strcmp(type, VDEV_TYPE_LOG) == 0) { + if (seen_logs) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'log' can be " + "specified only once\n")); + goto spec_out; + } + seen_logs = B_TRUE; + is_log = B_TRUE; + is_special = B_FALSE; + is_dedup = B_FALSE; + argc--; + argv++; + /* + * A log is not a real grouping device. + * We just set is_log and continue. + */ + continue; + } + + if (strcmp(type, VDEV_ALLOC_BIAS_SPECIAL) == 0) { + is_special = B_TRUE; + is_log = B_FALSE; + is_dedup = B_FALSE; + argc--; + argv++; + continue; + } + + if (strcmp(type, VDEV_ALLOC_BIAS_DEDUP) == 0) { + is_dedup = B_TRUE; + is_log = B_FALSE; + is_special = B_FALSE; + argc--; + argv++; + continue; + } + + if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { + if (l2cache != NULL) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: 'cache' can be " + "specified only once\n")); + goto spec_out; + } + is_log = is_special = is_dedup = B_FALSE; + } + + if (is_log || is_special || is_dedup) { + if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + (void) fprintf(stderr, + gettext("invalid vdev " + "specification: unsupported '%s' " + "device: %s\n"), is_log ? 
"log" : + "special", type); + goto spec_out; + } + nlogs++; + } + + for (c = 1; c < argc; c++) { + if (is_grouping(argv[c], NULL, NULL) != NULL) + break; + children++; + child = realloc(child, + children * sizeof (nvlist_t *)); + if (child == NULL) + zpool_no_memory(); + if ((nv = make_leaf_vdev(props, argv[c], + B_FALSE)) == NULL) { + for (c = 0; c < children - 1; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + child[children - 1] = nv; + } + + if (children < mindev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s requires at least %d " + "devices\n"), argv[0], mindev); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + if (children > maxdev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s supports no more than " + "%d devices\n"), argv[0], maxdev); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + goto spec_out; + } + + argc -= c; + argv += c; + + if (strcmp(type, VDEV_TYPE_SPARE) == 0) { + spares = child; + nspares = children; + continue; + } else if (strcmp(type, VDEV_TYPE_L2CACHE) == 0) { + l2cache = child; + nl2cache = children; + continue; + } else { + /* create a top-level vdev with children */ + verify(nvlist_alloc(&nv, NV_UNIQUE_NAME, + 0) == 0); + verify(nvlist_add_string(nv, ZPOOL_CONFIG_TYPE, + type) == 0); + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_IS_LOG, is_log) == 0); + if (is_log) + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_LOG) == 0); + if (is_special) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) == 0); + } + if (is_dedup) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) == 0); + } + if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) { + verify(nvlist_add_uint64(nv, + ZPOOL_CONFIG_NPARITY, + mindev - 1) == 0); + } + verify(nvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, + children) == 0); + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + } + } else { + /* + * We have a device. Pass off to make_leaf_vdev() to + * construct the appropriate nvlist describing the vdev. + */ + if ((nv = make_leaf_vdev(props, argv[0], + is_log)) == NULL) + goto spec_out; + + if (is_log) + nlogs++; + if (is_special) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_SPECIAL) == 0); + } + if (is_dedup) { + verify(nvlist_add_string(nv, + ZPOOL_CONFIG_ALLOCATION_BIAS, + VDEV_ALLOC_BIAS_DEDUP) == 0); + } + argc--; + argv++; + } + + toplevels++; + top = realloc(top, toplevels * sizeof (nvlist_t *)); + if (top == NULL) + zpool_no_memory(); + top[toplevels - 1] = nv; + } + + if (toplevels == 0 && nspares == 0 && nl2cache == 0) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: at least one toplevel vdev must be " + "specified\n")); + goto spec_out; + } + + if (seen_logs && nlogs == 0) { + (void) fprintf(stderr, gettext("invalid vdev specification: " + "log requires at least 1 device\n")); + goto spec_out; + } + + /* + * Finally, create nvroot and add all top-level vdevs to it. 
+	 */
+	verify(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) == 0);
+	verify(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE,
+	    VDEV_TYPE_ROOT) == 0);
+	verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    top, toplevels) == 0);
+	if (nspares != 0)
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES,
+		    spares, nspares) == 0);
+	if (nl2cache != 0)
+		verify(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE,
+		    l2cache, nl2cache) == 0);
+
+spec_out:
+	for (t = 0; t < toplevels; t++)
+		nvlist_free(top[t]);
+	for (t = 0; t < nspares; t++)
+		nvlist_free(spares[t]);
+	for (t = 0; t < nl2cache; t++)
+		nvlist_free(l2cache[t]);
+
+	free(spares);
+	free(l2cache);
+	free(top);
+
+	return (nvroot);
+}
+
+nvlist_t *
+split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props,
+    splitflags_t flags, int argc, char **argv)
+{
+	nvlist_t *newroot = NULL, **child;
+	uint_t c, children;
+
+	if (argc > 0) {
+		if ((newroot = construct_spec(props, argc, argv)) == NULL) {
+			(void) fprintf(stderr, gettext("Unable to build a "
+			    "pool from the specified devices\n"));
+			return (NULL);
+		}
+
+		if (!flags.dryrun && make_disks(zhp, newroot) != 0) {
+			nvlist_free(newroot);
+			return (NULL);
+		}
+
+		/* avoid any tricks in the spec */
+		verify(nvlist_lookup_nvlist_array(newroot,
+		    ZPOOL_CONFIG_CHILDREN, &child, &children) == 0);
+		for (c = 0; c < children; c++) {
+			char *path;
+			const char *type;
+			int min, max;
+
+			verify(nvlist_lookup_string(child[c],
+			    ZPOOL_CONFIG_PATH, &path) == 0);
+			if ((type = is_grouping(path, &min, &max)) != NULL) {
+				(void) fprintf(stderr, gettext("Cannot use "
+				    "'%s' as a device for splitting\n"), type);
+				nvlist_free(newroot);
+				return (NULL);
+			}
+		}
+	}
+
+	if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) {
+		nvlist_free(newroot);
+		return (NULL);
+	}
+
+	return (newroot);
+}
+
+/*
+ * Count the "normal" (non-log, non-allocation-biased) top-level vdevs
+ * in the given root nvlist.
+ */
+static int
+num_normal_vdevs(nvlist_t *nvroot)
+{
+	nvlist_t **top;
+	uint_t t, toplevels, normal = 0;
+
+	verify(nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
+	    &top, &toplevels) == 0);
+
+	for (t = 0; t < toplevels; t++) {
+		uint64_t log = B_FALSE;
+
+		(void) nvlist_lookup_uint64(top[t], ZPOOL_CONFIG_IS_LOG, &log);
+		if (log)
+			continue;
+		if (nvlist_exists(top[t], ZPOOL_CONFIG_ALLOCATION_BIAS))
+			continue;
+
+		normal++;
+	}
+
+	return (normal);
+}
+
+/*
+ * Get and validate the contents of the given vdev specification. This ensures
+ * that the nvlist returned is well-formed, that all the devices exist, and
+ * that they are not currently in use by any other known consumer. The
+ * 'poolconfig' parameter is the current configuration of the pool when adding
+ * devices to an existing pool, and is used to perform additional checks, such
+ * as detecting attempts to change the replication level of the pool. It can
+ * be 'NULL' to indicate that this is a new pool. The 'force' flag controls
+ * whether devices should be forcefully added, even if they appear in use.
+ */
+nvlist_t *
+make_root_vdev(zpool_handle_t *zhp, nvlist_t *props, int force, int check_rep,
+    boolean_t replacing, boolean_t dryrun, int argc, char **argv)
+{
+	nvlist_t *newroot;
+	nvlist_t *poolconfig = NULL;
+	is_force = force;
+
+	/*
+	 * Construct the vdev specification. If this is successful, we know
+	 * that we have a valid specification, and that all devices can be
+	 * opened.
+ */ + if ((newroot = construct_spec(props, argc, argv)) == NULL) + return (NULL); + + if (zhp && ((poolconfig = zpool_get_config(zhp, NULL)) == NULL)) { + nvlist_free(newroot); + return (NULL); + } + + /* + * Validate each device to make sure that it's not shared with another + * subsystem. We do this even if 'force' is set, because there are some + * uses (such as a dedicated dump device) that even '-f' cannot + * override. + */ + if (is_device_in_use(poolconfig, newroot, force, replacing, B_FALSE)) { + nvlist_free(newroot); + return (NULL); + } + + /* + * Check the replication level of the given vdevs and report any errors + * found. We include the existing pool spec, if any, as we need to + * catch changes against the existing replication level. + */ + if (check_rep && check_replication(poolconfig, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* + * On pool create the new vdev spec must have one normal vdev. + */ + if (poolconfig == NULL && num_normal_vdevs(newroot) == 0) { + vdev_error(gettext("at least one general top-level vdev must " + "be specified\n")); + nvlist_free(newroot); + return (NULL); + } + + /* + * Run through the vdev specification and label any whole disks found. + */ + if (!dryrun && make_disks(zhp, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + return (newroot); +} diff --git a/cmd/zstream/.gitignore b/cmd/zstream/.gitignore new file mode 100644 index 000000000000..fd1240d55c4b --- /dev/null +++ b/cmd/zstream/.gitignore @@ -0,0 +1 @@ +zstream diff --git a/cmd/zstream/Makefile.am b/cmd/zstream/Makefile.am new file mode 100644 index 000000000000..5e2ac5d69f1a --- /dev/null +++ b/cmd/zstream/Makefile.am @@ -0,0 +1,15 @@ +include $(top_srcdir)/config/Rules.am + +sbin_PROGRAMS = zstream + +zstream_SOURCES = \ + zstream.c \ + zstream.h \ + zstream_dump.c \ + zstream_redup.c \ + zstream_token.c + +zstream_LDADD = \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/cmd/zstream/zstream.c b/cmd/zstream/zstream.c new file mode 100644 index 000000000000..cbcb560a8638 --- /dev/null +++ b/cmd/zstream/zstream.c @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + * Copyright (c) 2020 by Datto Inc. All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zstream.h" + +void +zstream_usage(void) +{ + (void) fprintf(stderr, + "usage: zstream command args ...\n" + "Available commands are:\n" + "\n" + "\tzstream dump [-vCd] FILE\n" + "\t... 
| zstream dump [-vCd]\n" + "\n" + "\tzstream token resume_token\n" + "\n" + "\tzstream redup [-v] FILE | ...\n"); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + if (argc < 2) + zstream_usage(); + + char *subcommand = argv[1]; + + if (strcmp(subcommand, "dump") == 0) { + return (zstream_do_dump(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "token") == 0) { + return (zstream_do_token(argc - 1, argv + 1)); + } else if (strcmp(subcommand, "redup") == 0) { + return (zstream_do_redup(argc - 1, argv + 1)); + } else { + zstream_usage(); + } +} diff --git a/cmd/zstream/zstream.h b/cmd/zstream/zstream.h new file mode 100644 index 000000000000..319fecb2876b --- /dev/null +++ b/cmd/zstream/zstream.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#ifndef _ZSTREAM_H +#define _ZSTREAM_H + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zstream_do_redup(int, char *[]); +extern int zstream_do_dump(int, char *[]); +extern int zstream_do_token(int, char *[]); +extern void zstream_usage(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTREAM_H */ diff --git a/cmd/zstream/zstream_dump.c b/cmd/zstream/zstream_dump.c new file mode 100644 index 000000000000..45cf7b97a147 --- /dev/null +++ b/cmd/zstream/zstream_dump.c @@ -0,0 +1,799 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2012 Martin Matuska + */ + +/* + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include "zstream.h" + +/* + * If dump mode is enabled, the number of bytes to print per line + */ +#define BYTES_PER_LINE 16 +/* + * If dump mode is enabled, the number of bytes to group together, separated + * by newlines or spaces + */ +#define DUMP_GROUPING 4 + +uint64_t total_stream_len = 0; +FILE *send_stream = 0; +boolean_t do_byteswap = B_FALSE; +boolean_t do_cksum = B_TRUE; + +static void * +safe_malloc(size_t size) +{ + void *rv = malloc(size); + if (rv == NULL) { + (void) fprintf(stderr, "ERROR; failed to allocate %zu bytes\n", + size); + abort(); + } + return (rv); +} + +/* + * ssread - send stream read. + * + * Read while computing incremental checksum + */ +static size_t +ssread(void *buf, size_t len, zio_cksum_t *cksum) +{ + size_t outlen; + + if ((outlen = fread(buf, len, 1, send_stream)) == 0) + return (0); + + if (do_cksum) { + if (do_byteswap) + fletcher_4_incremental_byteswap(buf, len, cksum); + else + fletcher_4_incremental_native(buf, len, cksum); + } + total_stream_len += len; + return (outlen); +} + +static size_t +read_hdr(dmu_replay_record_t *drr, zio_cksum_t *cksum) +{ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + size_t r = ssread(drr, sizeof (*drr) - sizeof (zio_cksum_t), cksum); + if (r == 0) + return (0); + zio_cksum_t saved_cksum = *cksum; + r = ssread(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), cksum); + if (r == 0) + return (0); + if (do_cksum && + !ZIO_CHECKSUM_IS_ZERO(&drr->drr_u.drr_checksum.drr_checksum) && + !ZIO_CHECKSUM_EQUAL(saved_cksum, + drr->drr_u.drr_checksum.drr_checksum)) { + fprintf(stderr, "invalid checksum\n"); + (void) printf("Incorrect checksum in record header.\n"); + (void) printf("Expected checksum = %llx/%llx/%llx/%llx\n", + (longlong_t)saved_cksum.zc_word[0], + (longlong_t)saved_cksum.zc_word[1], + (longlong_t)saved_cksum.zc_word[2], + (longlong_t)saved_cksum.zc_word[3]); + return (0); + } + return (sizeof (*drr)); +} + +/* + * Print part of a block in ASCII characters + */ +static void +print_ascii_block(char *subbuf, int length) +{ + int i; + + for (i = 0; i < length; i++) { + char char_print = isprint(subbuf[i]) ? subbuf[i] : '.'; + if (i != 0 && i % DUMP_GROUPING == 0) { + (void) printf(" "); + } + (void) printf("%c", char_print); + } + (void) printf("\n"); +} + +/* + * print_block - Dump the contents of a modified block to STDOUT + * + * Assume that buf has capacity evenly divisible by BYTES_PER_LINE + */ +static void +print_block(char *buf, int length) +{ + int i; + /* + * Start printing ASCII characters at a constant offset, after + * the hex prints. Leave 3 characters per byte on a line (2 digit + * hex number plus 1 space) plus spaces between characters and + * groupings. + */ + int ascii_start = BYTES_PER_LINE * 3 + + BYTES_PER_LINE / DUMP_GROUPING + 2; + + for (i = 0; i < length; i += BYTES_PER_LINE) { + int j; + int this_line_length = MIN(BYTES_PER_LINE, length - i); + int print_offset = 0; + + for (j = 0; j < this_line_length; j++) { + int buf_offset = i + j; + + /* + * Separate every DUMP_GROUPING bytes by a space. + */ + if (buf_offset % DUMP_GROUPING == 0) { + print_offset += printf(" "); + } + + /* + * Print the two-digit hex value for this byte. 
+ */ + unsigned char hex_print = buf[buf_offset]; + print_offset += printf("%02x ", hex_print); + } + + (void) printf("%*s", ascii_start - print_offset, " "); + + print_ascii_block(buf + i, this_line_length); + } +} + +/* + * Print an array of bytes to stdout as hexadecimal characters. str must + * have buf_len * 2 + 1 bytes of space. + */ +static void +sprintf_bytes(char *str, uint8_t *buf, uint_t buf_len) +{ + int i, n; + + for (i = 0; i < buf_len; i++) { + n = sprintf(str, "%02x", buf[i] & 0xff); + str += n; + } + + str[0] = '\0'; +} + +int +zstream_do_dump(int argc, char *argv[]) +{ + char *buf = safe_malloc(SPA_MAXBLOCKSIZE); + uint64_t drr_record_count[DRR_NUMTYPES] = { 0 }; + uint64_t total_payload_size = 0; + uint64_t total_overhead_size = 0; + uint64_t drr_byte_count[DRR_NUMTYPES] = { 0 }; + char salt[ZIO_DATA_SALT_LEN * 2 + 1]; + char iv[ZIO_DATA_IV_LEN * 2 + 1]; + char mac[ZIO_DATA_MAC_LEN * 2 + 1]; + uint64_t total_records = 0; + uint64_t payload_size; + dmu_replay_record_t thedrr; + dmu_replay_record_t *drr = &thedrr; + struct drr_begin *drrb = &thedrr.drr_u.drr_begin; + struct drr_end *drre = &thedrr.drr_u.drr_end; + struct drr_object *drro = &thedrr.drr_u.drr_object; + struct drr_freeobjects *drrfo = &thedrr.drr_u.drr_freeobjects; + struct drr_write *drrw = &thedrr.drr_u.drr_write; + struct drr_write_byref *drrwbr = &thedrr.drr_u.drr_write_byref; + struct drr_free *drrf = &thedrr.drr_u.drr_free; + struct drr_spill *drrs = &thedrr.drr_u.drr_spill; + struct drr_write_embedded *drrwe = &thedrr.drr_u.drr_write_embedded; + struct drr_object_range *drror = &thedrr.drr_u.drr_object_range; + struct drr_redact *drrr = &thedrr.drr_u.drr_redact; + struct drr_checksum *drrc = &thedrr.drr_u.drr_checksum; + int c; + boolean_t verbose = B_FALSE; + boolean_t very_verbose = B_FALSE; + boolean_t first = B_TRUE; + /* + * dump flag controls whether the contents of any modified data blocks + * are printed to the console during processing of the stream. Warning: + * for large streams, this can obviously lead to massive prints. + */ + boolean_t dump = B_FALSE; + int err; + zio_cksum_t zc = { { 0 } }; + zio_cksum_t pcksum = { { 0 } }; + + while ((c = getopt(argc, argv, ":vCd")) != -1) { + switch (c) { + case 'C': + do_cksum = B_FALSE; + break; + case 'v': + if (verbose) + very_verbose = B_TRUE; + verbose = B_TRUE; + break; + case 'd': + dump = B_TRUE; + verbose = B_TRUE; + very_verbose = B_TRUE; + break; + case ':': + (void) fprintf(stderr, + "missing argument for '%c' option\n", optopt); + zstream_usage(); + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + zstream_usage(); + break; + } + } + + if (argc > optind) { + const char *filename = argv[optind]; + send_stream = fopen(filename, "r"); + if (send_stream == NULL) { + (void) fprintf(stderr, + "Error while opening file '%s': %s\n", + filename, strerror(errno)); + exit(1); + } + } else { + if (isatty(STDIN_FILENO)) { + (void) fprintf(stderr, + "Error: The send stream is a binary format " + "and can not be read from a\n" + "terminal. Standard input must be redirected, " + "or a file must be\n" + "specified as a command-line argument.\n"); + exit(1); + } + send_stream = stdin; + } + + fletcher_4_init(); + while (read_hdr(drr, &zc)) { + + /* + * If this is the first DMU record being processed, check for + * the magic bytes and figure out the endian-ness based on them. 
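+		 *
+		 * (A byteswapped stream comes from a sender of the opposite
+		 * endianness: its drr_magic reads as
+		 * BSWAP_64(DMU_BACKUP_MAGIC), and every multi-byte field of
+		 * each record must then be byteswapped before use.)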
+ */ + if (first) { + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + do_byteswap = B_TRUE; + if (do_cksum) { + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + /* + * recalculate header checksum now + * that we know it needs to be + * byteswapped. + */ + fletcher_4_incremental_byteswap(drr, + sizeof (dmu_replay_record_t), &zc); + } + } else if (drrb->drr_magic != DMU_BACKUP_MAGIC) { + (void) fprintf(stderr, "Invalid stream " + "(bad magic number)\n"); + exit(1); + } + first = B_FALSE; + } + if (do_byteswap) { + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = + BSWAP_32(drr->drr_payloadlen); + } + + /* + * At this point, the leading fields of the replay record + * (drr_type and drr_payloadlen) have been byte-swapped if + * necessary, but the rest of the data structure (the + * union of type-specific structures) is still in its + * original state. + */ + if (drr->drr_type >= DRR_NUMTYPES) { + (void) printf("INVALID record found: type 0x%x\n", + drr->drr_type); + (void) printf("Aborting.\n"); + exit(1); + } + + drr_record_count[drr->drr_type]++; + total_overhead_size += sizeof (*drr); + total_records++; + payload_size = 0; + + switch (drr->drr_type) { + case DRR_BEGIN: + if (do_byteswap) { + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = + BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = + BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_flags = BSWAP_32(drrb->drr_flags); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = + BSWAP_64(drrb->drr_fromguid); + } + + (void) printf("BEGIN record\n"); + (void) printf("\thdrtype = %lld\n", + DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo)); + (void) printf("\tfeatures = %llx\n", + DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo)); + (void) printf("\tmagic = %llx\n", + (u_longlong_t)drrb->drr_magic); + (void) printf("\tcreation_time = %llx\n", + (u_longlong_t)drrb->drr_creation_time); + (void) printf("\ttype = %u\n", drrb->drr_type); + (void) printf("\tflags = 0x%x\n", drrb->drr_flags); + (void) printf("\ttoguid = %llx\n", + (u_longlong_t)drrb->drr_toguid); + (void) printf("\tfromguid = %llx\n", + (u_longlong_t)drrb->drr_fromguid); + (void) printf("\ttoname = %s\n", drrb->drr_toname); + (void) printf("\tpayloadlen = %u\n", + drr->drr_payloadlen); + if (verbose) + (void) printf("\n"); + + if (drr->drr_payloadlen != 0) { + nvlist_t *nv; + int sz = drr->drr_payloadlen; + + if (sz > SPA_MAXBLOCKSIZE) { + free(buf); + buf = safe_malloc(sz); + } + (void) ssread(buf, sz, &zc); + if (ferror(send_stream)) + perror("fread"); + err = nvlist_unpack(buf, sz, &nv, 0); + if (err) { + perror(strerror(err)); + } else { + nvlist_print(stdout, nv); + nvlist_free(nv); + } + payload_size = sz; + } + break; + + case DRR_END: + if (do_byteswap) { + drre->drr_checksum.zc_word[0] = + BSWAP_64(drre->drr_checksum.zc_word[0]); + drre->drr_checksum.zc_word[1] = + BSWAP_64(drre->drr_checksum.zc_word[1]); + drre->drr_checksum.zc_word[2] = + BSWAP_64(drre->drr_checksum.zc_word[2]); + drre->drr_checksum.zc_word[3] = + BSWAP_64(drre->drr_checksum.zc_word[3]); + } + /* + * We compare against the *previous* checksum + * value, because the stored checksum is of + * everything before the DRR_END record. 
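+			 *
+			 * (pcksum is the running checksum saved at the
+			 * bottom of the previous loop iteration, i.e. the
+			 * checksum over everything up to, but not including,
+			 * this record's header.)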
+ */ + if (do_cksum && !ZIO_CHECKSUM_EQUAL(drre->drr_checksum, + pcksum)) { + (void) printf("Expected checksum differs from " + "checksum in stream.\n"); + (void) printf("Expected checksum = " + "%llx/%llx/%llx/%llx\n", + (long long unsigned int)pcksum.zc_word[0], + (long long unsigned int)pcksum.zc_word[1], + (long long unsigned int)pcksum.zc_word[2], + (long long unsigned int)pcksum.zc_word[3]); + } + (void) printf("END checksum = %llx/%llx/%llx/%llx\n", + (long long unsigned int) + drre->drr_checksum.zc_word[0], + (long long unsigned int) + drre->drr_checksum.zc_word[1], + (long long unsigned int) + drre->drr_checksum.zc_word[2], + (long long unsigned int) + drre->drr_checksum.zc_word[3]); + + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + break; + + case DRR_OBJECT: + if (do_byteswap) { + drro->drr_object = BSWAP_64(drro->drr_object); + drro->drr_type = BSWAP_32(drro->drr_type); + drro->drr_bonustype = + BSWAP_32(drro->drr_bonustype); + drro->drr_blksz = BSWAP_32(drro->drr_blksz); + drro->drr_bonuslen = + BSWAP_32(drro->drr_bonuslen); + drro->drr_raw_bonuslen = + BSWAP_32(drro->drr_raw_bonuslen); + drro->drr_toguid = BSWAP_64(drro->drr_toguid); + drro->drr_maxblkid = + BSWAP_64(drro->drr_maxblkid); + } + + payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); + + if (verbose) { + (void) printf("OBJECT object = %llu type = %u " + "bonustype = %u blksz = %u bonuslen = %u " + "dn_slots = %u raw_bonuslen = %u " + "flags = %u maxblkid = %llu " + "indblkshift = %u nlevels = %u " + "nblkptr = %u\n", + (u_longlong_t)drro->drr_object, + drro->drr_type, + drro->drr_bonustype, + drro->drr_blksz, + drro->drr_bonuslen, + drro->drr_dn_slots, + drro->drr_raw_bonuslen, + drro->drr_flags, + (u_longlong_t)drro->drr_maxblkid, + drro->drr_indblkshift, + drro->drr_nlevels, + drro->drr_nblkptr); + } + if (drro->drr_bonuslen > 0) { + (void) ssread(buf, payload_size, &zc); + if (dump) + print_block(buf, payload_size); + } + break; + + case DRR_FREEOBJECTS: + if (do_byteswap) { + drrfo->drr_firstobj = + BSWAP_64(drrfo->drr_firstobj); + drrfo->drr_numobjs = + BSWAP_64(drrfo->drr_numobjs); + drrfo->drr_toguid = BSWAP_64(drrfo->drr_toguid); + } + if (verbose) { + (void) printf("FREEOBJECTS firstobj = %llu " + "numobjs = %llu\n", + (u_longlong_t)drrfo->drr_firstobj, + (u_longlong_t)drrfo->drr_numobjs); + } + break; + + case DRR_WRITE: + if (do_byteswap) { + drrw->drr_object = BSWAP_64(drrw->drr_object); + drrw->drr_type = BSWAP_32(drrw->drr_type); + drrw->drr_offset = BSWAP_64(drrw->drr_offset); + drrw->drr_logical_size = + BSWAP_64(drrw->drr_logical_size); + drrw->drr_toguid = BSWAP_64(drrw->drr_toguid); + drrw->drr_key.ddk_prop = + BSWAP_64(drrw->drr_key.ddk_prop); + drrw->drr_compressed_size = + BSWAP_64(drrw->drr_compressed_size); + } + + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + + /* + * If this is verbose and/or dump output, + * print info on the modified block + */ + if (verbose) { + sprintf_bytes(salt, drrw->drr_salt, + ZIO_DATA_SALT_LEN); + sprintf_bytes(iv, drrw->drr_iv, + ZIO_DATA_IV_LEN); + sprintf_bytes(mac, drrw->drr_mac, + ZIO_DATA_MAC_LEN); + + (void) printf("WRITE object = %llu type = %u " + "checksum type = %u compression type = %u " + "flags = %u offset = %llu " + "logical_size = %llu " + "compressed_size = %llu " + "payload_size = %llu props = %llx " + "salt = %s iv = %s mac = %s\n", + (u_longlong_t)drrw->drr_object, + drrw->drr_type, + drrw->drr_checksumtype, + drrw->drr_compressiontype, + drrw->drr_flags, + (u_longlong_t)drrw->drr_offset, + (u_longlong_t)drrw->drr_logical_size, + 
(u_longlong_t)drrw->drr_compressed_size, + (u_longlong_t)payload_size, + (u_longlong_t)drrw->drr_key.ddk_prop, + salt, + iv, + mac); + } + + /* + * Read the contents of the block in from STDIN to buf + */ + (void) ssread(buf, payload_size, &zc); + /* + * If in dump mode + */ + if (dump) { + print_block(buf, payload_size); + } + break; + + case DRR_WRITE_BYREF: + if (do_byteswap) { + drrwbr->drr_object = + BSWAP_64(drrwbr->drr_object); + drrwbr->drr_offset = + BSWAP_64(drrwbr->drr_offset); + drrwbr->drr_length = + BSWAP_64(drrwbr->drr_length); + drrwbr->drr_toguid = + BSWAP_64(drrwbr->drr_toguid); + drrwbr->drr_refguid = + BSWAP_64(drrwbr->drr_refguid); + drrwbr->drr_refobject = + BSWAP_64(drrwbr->drr_refobject); + drrwbr->drr_refoffset = + BSWAP_64(drrwbr->drr_refoffset); + drrwbr->drr_key.ddk_prop = + BSWAP_64(drrwbr->drr_key.ddk_prop); + } + if (verbose) { + (void) printf("WRITE_BYREF object = %llu " + "checksum type = %u props = %llx " + "offset = %llu length = %llu " + "toguid = %llx refguid = %llx " + "refobject = %llu refoffset = %llu\n", + (u_longlong_t)drrwbr->drr_object, + drrwbr->drr_checksumtype, + (u_longlong_t)drrwbr->drr_key.ddk_prop, + (u_longlong_t)drrwbr->drr_offset, + (u_longlong_t)drrwbr->drr_length, + (u_longlong_t)drrwbr->drr_toguid, + (u_longlong_t)drrwbr->drr_refguid, + (u_longlong_t)drrwbr->drr_refobject, + (u_longlong_t)drrwbr->drr_refoffset); + } + break; + + case DRR_FREE: + if (do_byteswap) { + drrf->drr_object = BSWAP_64(drrf->drr_object); + drrf->drr_offset = BSWAP_64(drrf->drr_offset); + drrf->drr_length = BSWAP_64(drrf->drr_length); + } + if (verbose) { + (void) printf("FREE object = %llu " + "offset = %llu length = %lld\n", + (u_longlong_t)drrf->drr_object, + (u_longlong_t)drrf->drr_offset, + (longlong_t)drrf->drr_length); + } + break; + case DRR_SPILL: + if (do_byteswap) { + drrs->drr_object = BSWAP_64(drrs->drr_object); + drrs->drr_length = BSWAP_64(drrs->drr_length); + drrs->drr_compressed_size = + BSWAP_64(drrs->drr_compressed_size); + drrs->drr_type = BSWAP_32(drrs->drr_type); + } + + payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); + + if (verbose) { + sprintf_bytes(salt, drrs->drr_salt, + ZIO_DATA_SALT_LEN); + sprintf_bytes(iv, drrs->drr_iv, + ZIO_DATA_IV_LEN); + sprintf_bytes(mac, drrs->drr_mac, + ZIO_DATA_MAC_LEN); + + (void) printf("SPILL block for object = %llu " + "length = %llu flags = %u " + "compression type = %u " + "compressed_size = %llu " + "payload_size = %llu " + "salt = %s iv = %s mac = %s\n", + (u_longlong_t)drrs->drr_object, + (u_longlong_t)drrs->drr_length, + drrs->drr_flags, + drrs->drr_compressiontype, + (u_longlong_t)drrs->drr_compressed_size, + (u_longlong_t)payload_size, + salt, + iv, + mac); + } + (void) ssread(buf, payload_size, &zc); + if (dump) { + print_block(buf, payload_size); + } + break; + case DRR_WRITE_EMBEDDED: + if (do_byteswap) { + drrwe->drr_object = + BSWAP_64(drrwe->drr_object); + drrwe->drr_offset = + BSWAP_64(drrwe->drr_offset); + drrwe->drr_length = + BSWAP_64(drrwe->drr_length); + drrwe->drr_toguid = + BSWAP_64(drrwe->drr_toguid); + drrwe->drr_lsize = + BSWAP_32(drrwe->drr_lsize); + drrwe->drr_psize = + BSWAP_32(drrwe->drr_psize); + } + if (verbose) { + (void) printf("WRITE_EMBEDDED object = %llu " + "offset = %llu length = %llu " + "toguid = %llx comp = %u etype = %u " + "lsize = %u psize = %u\n", + (u_longlong_t)drrwe->drr_object, + (u_longlong_t)drrwe->drr_offset, + (u_longlong_t)drrwe->drr_length, + (u_longlong_t)drrwe->drr_toguid, + drrwe->drr_compression, + drrwe->drr_etype, + drrwe->drr_lsize, + 
drrwe->drr_psize); + } + (void) ssread(buf, + P2ROUNDUP(drrwe->drr_psize, 8), &zc); + if (dump) { + print_block(buf, + P2ROUNDUP(drrwe->drr_psize, 8)); + } + payload_size = P2ROUNDUP(drrwe->drr_psize, 8); + break; + case DRR_OBJECT_RANGE: + if (do_byteswap) { + drror->drr_firstobj = + BSWAP_64(drror->drr_firstobj); + drror->drr_numslots = + BSWAP_64(drror->drr_numslots); + drror->drr_toguid = BSWAP_64(drror->drr_toguid); + } + if (verbose) { + sprintf_bytes(salt, drror->drr_salt, + ZIO_DATA_SALT_LEN); + sprintf_bytes(iv, drror->drr_iv, + ZIO_DATA_IV_LEN); + sprintf_bytes(mac, drror->drr_mac, + ZIO_DATA_MAC_LEN); + + (void) printf("OBJECT_RANGE firstobj = %llu " + "numslots = %llu flags = %u " + "salt = %s iv = %s mac = %s\n", + (u_longlong_t)drror->drr_firstobj, + (u_longlong_t)drror->drr_numslots, + drror->drr_flags, + salt, + iv, + mac); + } + break; + case DRR_REDACT: + if (do_byteswap) { + drrr->drr_object = BSWAP_64(drrr->drr_object); + drrr->drr_offset = BSWAP_64(drrr->drr_offset); + drrr->drr_length = BSWAP_64(drrr->drr_length); + drrr->drr_toguid = BSWAP_64(drrr->drr_toguid); + } + if (verbose) { + (void) printf("REDACT object = %llu offset = " + "%llu length = %llu\n", + (u_longlong_t)drrr->drr_object, + (u_longlong_t)drrr->drr_offset, + (u_longlong_t)drrr->drr_length); + } + break; + case DRR_NUMTYPES: + /* should never be reached */ + exit(1); + } + if (drr->drr_type != DRR_BEGIN && very_verbose) { + (void) printf(" checksum = %llx/%llx/%llx/%llx\n", + (longlong_t)drrc->drr_checksum.zc_word[0], + (longlong_t)drrc->drr_checksum.zc_word[1], + (longlong_t)drrc->drr_checksum.zc_word[2], + (longlong_t)drrc->drr_checksum.zc_word[3]); + } + pcksum = zc; + drr_byte_count[drr->drr_type] += payload_size; + total_payload_size += payload_size; + } + free(buf); + fletcher_4_fini(); + + /* Print final summary */ + + (void) printf("SUMMARY:\n"); + (void) printf("\tTotal DRR_BEGIN records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_BEGIN], + (u_longlong_t)drr_byte_count[DRR_BEGIN]); + (void) printf("\tTotal DRR_END records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_END], + (u_longlong_t)drr_byte_count[DRR_END]); + (void) printf("\tTotal DRR_OBJECT records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_OBJECT], + (u_longlong_t)drr_byte_count[DRR_OBJECT]); + (void) printf("\tTotal DRR_FREEOBJECTS records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREEOBJECTS], + (u_longlong_t)drr_byte_count[DRR_FREEOBJECTS]); + (void) printf("\tTotal DRR_WRITE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE], + (u_longlong_t)drr_byte_count[DRR_WRITE]); + (void) printf("\tTotal DRR_WRITE_BYREF records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_WRITE_BYREF], + (u_longlong_t)drr_byte_count[DRR_WRITE_BYREF]); + (void) printf("\tTotal DRR_WRITE_EMBEDDED records = %lld (%llu " + "bytes)\n", (u_longlong_t)drr_record_count[DRR_WRITE_EMBEDDED], + (u_longlong_t)drr_byte_count[DRR_WRITE_EMBEDDED]); + (void) printf("\tTotal DRR_FREE records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_FREE], + (u_longlong_t)drr_byte_count[DRR_FREE]); + (void) printf("\tTotal DRR_SPILL records = %lld (%llu bytes)\n", + (u_longlong_t)drr_record_count[DRR_SPILL], + (u_longlong_t)drr_byte_count[DRR_SPILL]); + (void) printf("\tTotal records = %lld\n", + (u_longlong_t)total_records); + (void) printf("\tTotal payload size = %lld (0x%llx)\n", + (u_longlong_t)total_payload_size, (u_longlong_t)total_payload_size); + (void) 
printf("\tTotal header overhead = %lld (0x%llx)\n", + (u_longlong_t)total_overhead_size, + (u_longlong_t)total_overhead_size); + (void) printf("\tTotal stream length = %lld (0x%llx)\n", + (u_longlong_t)total_stream_len, (u_longlong_t)total_stream_len); + return (0); +} diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c new file mode 100644 index 000000000000..379025ce59e5 --- /dev/null +++ b/cmd/zstream/zstream_redup.c @@ -0,0 +1,469 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_fletcher.h" +#include "zstream.h" + + +#define MAX_RDT_PHYSMEM_PERCENT 20 +#define SMALLEST_POSSIBLE_MAX_RDT_MB 128 + +typedef struct redup_entry { + struct redup_entry *rde_next; + uint64_t rde_guid; + uint64_t rde_object; + uint64_t rde_offset; + uint64_t rde_stream_offset; +} redup_entry_t; + +typedef struct redup_table { + redup_entry_t **redup_hash_array; + umem_cache_t *ddecache; + uint64_t ddt_count; + int numhashbits; +} redup_table_t; + +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} + +static void * +safe_calloc(size_t n) +{ + void *rv = calloc(1, n); + if (rv == NULL) { + fprintf(stderr, + "Error: could not allocate %u bytes of memory\n", + (int)n); + exit(1); + } + return (rv); +} + +/* + * Safe version of fread(), exits on error. + */ +static int +sfread(void *buf, size_t size, FILE *fp) +{ + int rv = fread(buf, size, 1, fp); + if (rv == 0 && ferror(fp)) { + (void) fprintf(stderr, "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } + return (rv); +} + +/* + * Safe version of pread(), exits on error. + */ +static void +spread(int fd, void *buf, size_t count, off_t offset) +{ + ssize_t err = pread(fd, buf, count, offset); + if (err == -1) { + (void) fprintf(stderr, + "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } else if (err != count) { + (void) fprintf(stderr, + "Error while reading file: short read\n"); + exit(1); + } +} + +static int +dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, + zio_cksum_t *zc, int outfd) +{ + assert(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum) + == sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); + if (drr->drr_type != DRR_BEGIN) { + assert(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. 
+ drr_checksum.drr_checksum)); + drr->drr_u.drr_checksum.drr_checksum = *zc; + } + fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), zc); + if (write(outfd, drr, sizeof (*drr)) == -1) + return (errno); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, zc); + if (write(outfd, payload, payload_len) == -1) + return (errno); + } + return (0); +} + +static void +rdt_insert(redup_table_t *rdt, + uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) +{ + uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); + redup_entry_t **rdepp; + + rdepp = &(rdt->redup_hash_array[hashcode]); + redup_entry_t *rde = umem_cache_alloc(rdt->ddecache, UMEM_NOFAIL); + rde->rde_next = *rdepp; + rde->rde_guid = guid; + rde->rde_object = object; + rde->rde_offset = offset; + rde->rde_stream_offset = stream_offset; + *rdepp = rde; + rdt->ddt_count++; +} + +static void +rdt_lookup(redup_table_t *rdt, + uint64_t guid, uint64_t object, uint64_t offset, + uint64_t *stream_offsetp) +{ + uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); + + for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; + rde != NULL; rde = rde->rde_next) { + if (rde->rde_guid == guid && + rde->rde_object == object && + rde->rde_offset == offset) { + *stream_offsetp = rde->rde_stream_offset; + return; + } + } + assert(!"could not find expected redup table entry"); +} + +/* + * Convert a dedup stream (generated by "zfs send -D") to a + * non-deduplicated stream. The entire infd will be converted, including + * any substreams in a stream package (generated by "zfs send -RD"). The + * infd must be seekable. + */ +static void +zfs_redup_stream(int infd, int outfd, boolean_t verbose) +{ + int bufsz = SPA_MAXBLOCKSIZE; + dmu_replay_record_t thedrr = { 0 }; + dmu_replay_record_t *drr = &thedrr; + redup_table_t rdt; + zio_cksum_t stream_cksum; + uint64_t numbuckets; + uint64_t num_records = 0; + uint64_t num_write_byref_records = 0; + +#ifdef _ILP32 + uint64_t max_rde_size = SMALLEST_POSSIBLE_MAX_RDT_MB << 20; +#else + uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); + uint64_t max_rde_size = + MAX((physmem * MAX_RDT_PHYSMEM_PERCENT) / 100, + SMALLEST_POSSIBLE_MAX_RDT_MB << 20); +#endif + + numbuckets = max_rde_size / (sizeof (redup_entry_t)); + + /* + * numbuckets must be a power of 2. Increase number to + * a power of 2 if necessary. + */ + if (!ISP2(numbuckets)) + numbuckets = 1ULL << highbit64(numbuckets); + + rdt.redup_hash_array = + safe_calloc(numbuckets * sizeof (redup_entry_t *)); + rdt.ddecache = umem_cache_create("rde", sizeof (redup_entry_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + rdt.numhashbits = highbit64(numbuckets) - 1; + rdt.ddt_count = 0; + + char *buf = safe_calloc(bufsz); + FILE *ofp = fdopen(infd, "r"); + long offset = ftell(ofp); + while (sfread(drr, sizeof (*drr), ofp) != 0) { + num_records++; + + /* + * We need to regenerate the checksum. 
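The redup table above is a fixed-size, power-of-two chained hash table keyed by (guid, object, offset), mapping each WRITE record to its byte offset in the input stream. Below is a stripped-down sketch of the same insert/lookup pattern; the mix() hash and malloc() allocation are illustrative stand-ins for cityhash4() and the umem object cache, and all toy_* names are invented.

#include <stdint.h>
#include <stdlib.h>

typedef struct entry {
        struct entry *next;
        uint64_t guid, object, offset;
        uint64_t stream_offset;
} entry_t;

#define NBUCKETS        (1ULL << 16)    /* must stay a power of two */

/* Toy stand-in for cityhash4(); any decent 64-bit mixer would do. */
static uint64_t
mix(uint64_t guid, uint64_t object, uint64_t offset)
{
        uint64_t h = guid ^ (object * 0x9e3779b97f4a7c15ULL) ^ offset;
        h ^= h >> 33;
        h *= 0xff51afd7ed558ccdULL;
        h ^= h >> 33;
        return (h);
}

static void
toy_insert(entry_t **table, uint64_t guid, uint64_t object,
    uint64_t offset, uint64_t stream_offset)
{
        uint64_t b = mix(guid, object, offset) & (NBUCKETS - 1);
        entry_t *e = malloc(sizeof (*e));

        if (e == NULL)
                abort();                /* the real code uses UMEM_NOFAIL */
        e->guid = guid;
        e->object = object;
        e->offset = offset;
        e->stream_offset = stream_offset;
        e->next = table[b];             /* push onto the chain head */
        table[b] = e;
}

static entry_t *
toy_lookup(entry_t **table, uint64_t guid, uint64_t object, uint64_t offset)
{
        uint64_t b = mix(guid, object, offset) & (NBUCKETS - 1);

        for (entry_t *e = table[b]; e != NULL; e = e->next) {
                if (e->guid == guid && e->object == object &&
                    e->offset == offset)
                        return (e);
        }
        return (NULL);
}

A NULL result from toy_lookup() corresponds to the assertion failure in rdt_lookup(): a valid dedup stream never emits a WRITE_BYREF before the WRITE record it references.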
+ */ + if (drr->drr_type != DRR_BEGIN) { + bzero(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (drr->drr_u.drr_checksum.drr_checksum)); + } + + uint64_t payload_size = 0; + switch (drr->drr_type) { + case DRR_BEGIN: + { + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int fflags; + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + + assert(drrb->drr_magic == DMU_BACKUP_MAGIC); + + /* clear the DEDUP feature flag for this stream */ + fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + fflags &= ~(DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); + + int sz = drr->drr_payloadlen; + if (sz != 0) { + if (sz > bufsz) { + free(buf); + buf = safe_calloc(sz); + bufsz = sz; + } + (void) sfread(buf, sz, ofp); + } + payload_size = sz; + break; + } + + case DRR_END: + { + struct drr_end *drre = &drr->drr_u.drr_end; + /* + * Use the recalculated checksum, unless this is + * the END record of a stream package, which has + * no checksum. + */ + if (!ZIO_CHECKSUM_IS_ZERO(&drre->drr_checksum)) + drre->drr_checksum = stream_cksum; + break; + } + + case DRR_OBJECT: + { + struct drr_object *drro = &drr->drr_u.drr_object; + + if (drro->drr_bonuslen > 0) { + payload_size = DRR_OBJECT_PAYLOAD_SIZE(drro); + (void) sfread(buf, payload_size, ofp); + } + break; + } + + case DRR_SPILL: + { + struct drr_spill *drrs = &drr->drr_u.drr_spill; + payload_size = DRR_SPILL_PAYLOAD_SIZE(drrs); + (void) sfread(buf, payload_size, ofp); + break; + } + + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwb = + drr->drr_u.drr_write_byref; + + num_write_byref_records++; + + /* + * Look up in hash table by drrwb->drr_refguid, + * drr_refobject, drr_refoffset. Replace this + * record with the found WRITE record, but with + * drr_object,drr_offset,drr_toguid replaced with ours. + */ + uint64_t stream_offset = 0; + rdt_lookup(&rdt, drrwb.drr_refguid, + drrwb.drr_refobject, drrwb.drr_refoffset, + &stream_offset); + + spread(infd, drr, sizeof (*drr), stream_offset); + + assert(drr->drr_type == DRR_WRITE); + struct drr_write *drrw = &drr->drr_u.drr_write; + assert(drrw->drr_toguid == drrwb.drr_refguid); + assert(drrw->drr_object == drrwb.drr_refobject); + assert(drrw->drr_offset == drrwb.drr_refoffset); + + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + spread(infd, buf, payload_size, + stream_offset + sizeof (*drr)); + + drrw->drr_toguid = drrwb.drr_toguid; + drrw->drr_object = drrwb.drr_object; + drrw->drr_offset = drrwb.drr_offset; + break; + } + + case DRR_WRITE: + { + struct drr_write *drrw = &drr->drr_u.drr_write; + payload_size = DRR_WRITE_PAYLOAD_SIZE(drrw); + (void) sfread(buf, payload_size, ofp); + + rdt_insert(&rdt, drrw->drr_toguid, + drrw->drr_object, drrw->drr_offset, offset); + break; + } + + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &drr->drr_u.drr_write_embedded; + payload_size = + P2ROUNDUP((uint64_t)drrwe->drr_psize, 8); + (void) sfread(buf, payload_size, ofp); + break; + } + + case DRR_FREEOBJECTS: + case DRR_FREE: + case DRR_OBJECT_RANGE: + break; + + default: + (void) fprintf(stderr, "INVALID record type 0x%x\n", + drr->drr_type); + /* should never happen, so assert */ + assert(B_FALSE); + } + + if (feof(ofp)) { + fprintf(stderr, "Error: unexpected end-of-file\n"); + exit(1); + } + if (ferror(ofp)) { + fprintf(stderr, "Error while reading file: %s\n", + strerror(errno)); + exit(1); + } + + /* + * We need to recalculate the checksum, and it needs to be + * initially zero to do that. 
BEGIN records don't have + * a checksum. + */ + if (drr->drr_type != DRR_BEGIN) { + bzero(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (drr->drr_u.drr_checksum.drr_checksum)); + } + if (dump_record(drr, buf, payload_size, + &stream_cksum, outfd) != 0) + break; + if (drr->drr_type == DRR_END) { + /* + * Typically the END record is either the last + * thing in the stream, or it is followed + * by a BEGIN record (which also zeros the checksum). + * However, a stream package ends with two END + * records. The last END record's checksum starts + * from zero. + */ + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + } + offset = ftell(ofp); + } + + if (verbose) { + char mem_str[16]; + zfs_nicenum(rdt.ddt_count * sizeof (redup_entry_t), + mem_str, sizeof (mem_str)); + fprintf(stderr, "converted stream with %llu total records, " + "including %llu dedup records, using %sB memory.\n", + (long long)num_records, + (long long)num_write_byref_records, + mem_str); + } + + umem_cache_destroy(rdt.ddecache); + free(rdt.redup_hash_array); + free(buf); + (void) fclose(ofp); +} + +int +zstream_do_redup(int argc, char *argv[]) +{ + boolean_t verbose = B_FALSE; + char c; + + while ((c = getopt(argc, argv, "v")) != -1) { + switch (c) { + case 'v': + verbose = B_TRUE; + break; + case '?': + (void) fprintf(stderr, "invalid option '%c'\n", + optopt); + zstream_usage(); + break; + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + zstream_usage(); + + const char *filename = argv[0]; + + if (isatty(STDOUT_FILENO)) { + (void) fprintf(stderr, + "Error: Stream can not be written to a terminal.\n" + "You must redirect standard output.\n"); + return (1); + } + + int fd = open(filename, O_RDONLY); + if (fd == -1) { + (void) fprintf(stderr, + "Error while opening file '%s': %s\n", + filename, strerror(errno)); + exit(1); + } + + fletcher_4_init(); + zfs_redup_stream(fd, STDOUT_FILENO, verbose); + fletcher_4_fini(); + + close(fd); + + return (0); +} diff --git a/cmd/zstream/zstream_token.c b/cmd/zstream/zstream_token.c new file mode 100644 index 000000000000..36a76a4bb851 --- /dev/null +++ b/cmd/zstream/zstream_token.c @@ -0,0 +1,78 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2012 Martin Matuska + */ + +/* + * Copyright (c) 2020 by Datto Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include "zstream.h" + +int +zstream_do_token(int argc, char *argv[]) +{ + char *resume_token = NULL; + + if (argc < 2) { + (void) fprintf(stderr, "Need to pass the resume token\n"); + zstream_usage(); + } + + resume_token = argv[1]; + + libzfs_handle_t *hdl = libzfs_init(); + + nvlist_t *resume_nvl = + zfs_send_resume_token_to_nvlist(hdl, resume_token); + + if (resume_nvl == NULL) { + (void) fprintf(stderr, + "Unable to parse resume token: %s\n", + libzfs_error_description(hdl)); + libzfs_fini(hdl); + return (1); + } + + dump_nvlist(resume_nvl, 5); + nvlist_free(resume_nvl); + + libzfs_fini(hdl); + return (0); +} diff --git a/cmd/zstreamdump/Makefile.am b/cmd/zstreamdump/Makefile.am new file mode 100644 index 000000000000..2c04d8513150 --- /dev/null +++ b/cmd/zstreamdump/Makefile.am @@ -0,0 +1 @@ +dist_sbin_SCRIPTS = zstreamdump diff --git a/cmd/zstreamdump/zstreamdump b/cmd/zstreamdump/zstreamdump new file mode 100755 index 000000000000..fbf02ee687f6 --- /dev/null +++ b/cmd/zstreamdump/zstreamdump @@ -0,0 +1,3 @@ +#!/bin/sh + +zstream dump "$@" diff --git a/cmd/ztest/.gitignore b/cmd/ztest/.gitignore new file mode 100644 index 000000000000..d3d498dae693 --- /dev/null +++ b/cmd/ztest/.gitignore @@ -0,0 +1 @@ +/ztest diff --git a/cmd/ztest/Makefile.am b/cmd/ztest/Makefile.am new file mode 100644 index 000000000000..6042b44d1dde --- /dev/null +++ b/cmd/ztest/Makefile.am @@ -0,0 +1,23 @@ +include $(top_srcdir)/config/Rules.am + +# Get rid of compiler warning for unchecked truncating snprintfs on gcc 7.1.1 +AM_CFLAGS += $(NO_FORMAT_TRUNCATION) + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +# Unconditionally enable ASSERTs +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +sbin_PROGRAMS = ztest + +ztest_SOURCES = \ + ztest.c + +ztest_LDADD = \ + $(abs_top_builddir)/lib/libzpool/libzpool.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +ztest_LDADD += -lm +ztest_LDFLAGS = -pthread diff --git a/cmd/ztest/ztest.c b/cmd/ztest/ztest.c new file mode 100644 index 000000000000..31205a5bf8cf --- /dev/null +++ b/cmd/ztest/ztest.c @@ -0,0 +1,7818 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. 
+ */ + +/* + * The objective of this program is to provide a DMU/ZAP/SPA stress test + * that runs entirely in userland, is easy to use, and easy to extend. + * + * The overall design of the ztest program is as follows: + * + * (1) For each major functional area (e.g. adding vdevs to a pool, + * creating and destroying datasets, reading and writing objects, etc) + * we have a simple routine to test that functionality. These + * individual routines do not have to do anything "stressful". + * + * (2) We turn these simple functionality tests into a stress test by + * running them all in parallel, with as many threads as desired, + * and spread across as many datasets, objects, and vdevs as desired. + * + * (3) While all this is happening, we inject faults into the pool to + * verify that self-healing data really works. + * + * (4) Every time we open a dataset, we change its checksum and compression + * functions. Thus even individual objects vary from block to block + * in which checksum they use and whether they're compressed. + * + * (5) To verify that we never lose on-disk consistency after a crash, + * we run the entire test in a child of the main process. + * At random times, the child self-immolates with a SIGKILL. + * This is the software equivalent of pulling the power cord. + * The parent then runs the test again, using the existing + * storage pool, as many times as desired. If backwards compatibility + * testing is enabled ztest will sometimes run the "older" version + * of ztest after a SIGKILL. + * + * (6) To verify that we don't have future leaks or temporal incursions, + * many of the functional tests record the transaction group number + * as part of their data. When reading old data, they verify that + * the transaction group number is less than the current, open txg. + * If you add a new test, please do this if applicable. + * + * (7) Threads are created with a reduced stack size, for sanity checking. + * Therefore, it's important not to allocate huge buffers on the stack. + * + * When run with no arguments, ztest runs for about five minutes and + * produces no output if successful. To get a little bit of information, + * specify -V. To get more information, specify -VV, and so on. + * + * To turn this into an overnight stress test, use -T to specify run time. + * + * You can ask more vdevs [-v], datasets [-d], or threads [-t] + * to increase the pool capacity, fanout, and overall stress level. + * + * Use the -k option to set the desired frequency of kills. + * + * When ztest invokes itself it passes all relevant information through a + * temporary file which is mmap-ed in the child process. This allows shared + * memory to survive the exec syscall. The ztest_shared_hdr_t struct is always + * stored at offset 0 of this file and contains information on the size and + * number of shared structures in the file. The information stored in this file + * must remain backwards compatible with older versions of ztest so that + * ztest can invoke them during backwards compatibility testing (-B). 
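Point (5) above is worth a sketch: crash consistency is exercised by running the workload in a child process, delivering SIGKILL at an arbitrary moment, and then re-opening the same pool. The harness below is a hedged, self-contained illustration only; run_workload() is a stub standing in for the actual test functions.

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>

static void
run_workload(void)
{
        /* Stand-in for the real stress tests: block until killed. */
        for (;;)
                (void) pause();
}

static void
run_one_pass(unsigned int kill_after_seconds)
{
        pid_t pid = fork();

        if (pid == -1) {
                perror("fork");
                exit(1);
        }
        if (pid == 0) {
                run_workload();
                _exit(0);
        }
        /* Parent: pull the power cord, then reap the child. */
        (void) sleep(kill_after_seconds);
        (void) kill(pid, SIGKILL);
        (void) waitpid(pid, NULL, 0);
        /*
         * The caller would now re-open the existing pool and verify
         * its on-disk state, as a subsequent ztest run does with -E.
         */
}

int
main(void)
{
        run_one_pass(1);
        return (0);
}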
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef __GLIBC__ +#include /* for backtrace() */ +#endif + +static int ztest_fd_data = -1; +static int ztest_fd_rand = -1; + +typedef struct ztest_shared_hdr { + uint64_t zh_hdr_size; + uint64_t zh_opts_size; + uint64_t zh_size; + uint64_t zh_stats_size; + uint64_t zh_stats_count; + uint64_t zh_ds_size; + uint64_t zh_ds_count; +} ztest_shared_hdr_t; + +static ztest_shared_hdr_t *ztest_shared_hdr; + +enum ztest_class_state { + ZTEST_VDEV_CLASS_OFF, + ZTEST_VDEV_CLASS_ON, + ZTEST_VDEV_CLASS_RND +}; + +typedef struct ztest_shared_opts { + char zo_pool[ZFS_MAX_DATASET_NAME_LEN]; + char zo_dir[ZFS_MAX_DATASET_NAME_LEN]; + char zo_alt_ztest[MAXNAMELEN]; + char zo_alt_libpath[MAXNAMELEN]; + uint64_t zo_vdevs; + uint64_t zo_vdevtime; + size_t zo_vdev_size; + int zo_ashift; + int zo_mirrors; + int zo_raidz; + int zo_raidz_parity; + int zo_datasets; + int zo_threads; + uint64_t zo_passtime; + uint64_t zo_killrate; + int zo_verbose; + int zo_init; + uint64_t zo_time; + uint64_t zo_maxloops; + uint64_t zo_metaslab_force_ganging; + int zo_mmp_test; + int zo_special_vdevs; + int zo_dump_dbgmsg; +} ztest_shared_opts_t; + +static const ztest_shared_opts_t ztest_opts_defaults = { + .zo_pool = "ztest", + .zo_dir = "/tmp", + .zo_alt_ztest = { '\0' }, + .zo_alt_libpath = { '\0' }, + .zo_vdevs = 5, + .zo_ashift = SPA_MINBLOCKSHIFT, + .zo_mirrors = 2, + .zo_raidz = 4, + .zo_raidz_parity = 1, + .zo_vdev_size = SPA_MINDEVSIZE * 4, /* 256m default size */ + .zo_datasets = 7, + .zo_threads = 23, + .zo_passtime = 60, /* 60 seconds */ + .zo_killrate = 70, /* 70% kill rate */ + .zo_verbose = 0, + .zo_mmp_test = 0, + .zo_init = 1, + .zo_time = 300, /* 5 minutes */ + .zo_maxloops = 50, /* max loops during spa_freeze() */ + .zo_metaslab_force_ganging = 64 << 10, + .zo_special_vdevs = ZTEST_VDEV_CLASS_RND, +}; + +extern uint64_t metaslab_force_ganging; +extern uint64_t metaslab_df_alloc_threshold; +extern unsigned long zfs_deadman_synctime_ms; +extern int metaslab_preload_limit; +extern boolean_t zfs_compressed_arc_enabled; +extern int zfs_abd_scatter_enabled; +extern int dmu_object_alloc_chunk_shift; +extern boolean_t zfs_force_some_double_word_sm_entries; +extern unsigned long zio_decompress_fail_fraction; +extern unsigned long zfs_reconstruct_indirect_damage_fraction; + + +static ztest_shared_opts_t *ztest_shared_opts; +static ztest_shared_opts_t ztest_opts; +static char *ztest_wkeydata = "abcdefghijklmnopqrstuvwxyz012345"; + +typedef struct ztest_shared_ds { + uint64_t zd_seq; +} ztest_shared_ds_t; + +static ztest_shared_ds_t *ztest_shared_ds; +#define ZTEST_GET_SHARED_DS(d) (&ztest_shared_ds[d]) + +#define BT_MAGIC 0x123456789abcdefULL +#define MAXFAULTS(zs) \ + (MAX((zs)->zs_mirrors, 1) * (ztest_opts.zo_raidz_parity + 1) - 1) + +enum ztest_io_type { + ZTEST_IO_WRITE_TAG, + ZTEST_IO_WRITE_PATTERN, + ZTEST_IO_WRITE_ZEROES, + ZTEST_IO_TRUNCATE, + ZTEST_IO_SETATTR, + ZTEST_IO_REWRITE, + ZTEST_IO_TYPES +}; + +typedef struct ztest_block_tag { + uint64_t bt_magic; + uint64_t bt_objset; + uint64_t bt_object; + uint64_t bt_dnodesize; + uint64_t bt_offset; + uint64_t bt_gen; + uint64_t 
bt_txg; + uint64_t bt_crtxg; +} ztest_block_tag_t; + +typedef struct bufwad { + uint64_t bw_index; + uint64_t bw_txg; + uint64_t bw_data; +} bufwad_t; + +/* + * It would be better to use a rangelock_t per object. Unfortunately + * the rangelock_t is not a drop-in replacement for rl_t, because we + * still need to map from object ID to rangelock_t. + */ +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rll { + void *rll_writer; + int rll_readers; + kmutex_t rll_lock; + kcondvar_t rll_cv; +} rll_t; + +typedef struct rl { + uint64_t rl_object; + uint64_t rl_offset; + uint64_t rl_size; + rll_t *rl_lock; +} rl_t; + +#define ZTEST_RANGE_LOCKS 64 +#define ZTEST_OBJECT_LOCKS 64 + +/* + * Object descriptor. Used as a template for object lookup/create/remove. + */ +typedef struct ztest_od { + uint64_t od_dir; + uint64_t od_object; + dmu_object_type_t od_type; + dmu_object_type_t od_crtype; + uint64_t od_blocksize; + uint64_t od_crblocksize; + uint64_t od_crdnodesize; + uint64_t od_gen; + uint64_t od_crgen; + char od_name[ZFS_MAX_DATASET_NAME_LEN]; +} ztest_od_t; + +/* + * Per-dataset state. + */ +typedef struct ztest_ds { + ztest_shared_ds_t *zd_shared; + objset_t *zd_os; + pthread_rwlock_t zd_zilog_lock; + zilog_t *zd_zilog; + ztest_od_t *zd_od; /* debugging aid */ + char zd_name[ZFS_MAX_DATASET_NAME_LEN]; + kmutex_t zd_dirobj_lock; + rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; + rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; +} ztest_ds_t; + +/* + * Per-iteration state. + */ +typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); + +typedef struct ztest_info { + ztest_func_t *zi_func; /* test function */ + uint64_t zi_iters; /* iterations per execution */ + uint64_t *zi_interval; /* execute every seconds */ + const char *zi_funcname; /* name of test function */ +} ztest_info_t; + +typedef struct ztest_shared_callstate { + uint64_t zc_count; /* per-pass count */ + uint64_t zc_time; /* per-pass time */ + uint64_t zc_next; /* next time to call this function */ +} ztest_shared_callstate_t; + +static ztest_shared_callstate_t *ztest_shared_callstate; +#define ZTEST_GET_SHARED_CALLSTATE(c) (&ztest_shared_callstate[c]) + +ztest_func_t ztest_dmu_read_write; +ztest_func_t ztest_dmu_write_parallel; +ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_object_next_chunk; +ztest_func_t ztest_dmu_commit_callbacks; +ztest_func_t ztest_zap; +ztest_func_t ztest_zap_parallel; +ztest_func_t ztest_zil_commit; +ztest_func_t ztest_zil_remount; +ztest_func_t ztest_dmu_read_write_zcopy; +ztest_func_t ztest_dmu_objset_create_destroy; +ztest_func_t ztest_dmu_prealloc; +ztest_func_t ztest_fzap; +ztest_func_t ztest_dmu_snapshot_create_destroy; +ztest_func_t ztest_dsl_prop_get_set; +ztest_func_t ztest_spa_prop_get_set; +ztest_func_t ztest_spa_create_destroy; +ztest_func_t ztest_fault_inject; +ztest_func_t ztest_dmu_snapshot_hold; +ztest_func_t ztest_mmp_enable_disable; +ztest_func_t ztest_scrub; +ztest_func_t ztest_dsl_dataset_promote_busy; +ztest_func_t ztest_vdev_attach_detach; +ztest_func_t ztest_vdev_LUN_growth; +ztest_func_t ztest_vdev_add_remove; +ztest_func_t ztest_vdev_class_add; +ztest_func_t ztest_vdev_aux_add_remove; +ztest_func_t ztest_split_pool; +ztest_func_t ztest_reguid; +ztest_func_t ztest_spa_upgrade; +ztest_func_t ztest_device_removal; +ztest_func_t ztest_spa_checkpoint_create_discard; +ztest_func_t ztest_initialize; +ztest_func_t ztest_trim; +ztest_func_t ztest_fletcher; +ztest_func_t ztest_fletcher_incr; +ztest_func_t ztest_verify_dnode_bt; + +uint64_t zopt_always 
= 0ULL * NANOSEC; /* all the time */ +uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ +uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ +uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ +uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ + +#define ZTI_INIT(func, iters, interval) \ + { .zi_func = (func), \ + .zi_iters = (iters), \ + .zi_interval = (interval), \ + .zi_funcname = # func } + +ztest_info_t ztest_info[] = { + ZTI_INIT(ztest_dmu_read_write, 1, &zopt_always), + ZTI_INIT(ztest_dmu_write_parallel, 10, &zopt_always), + ZTI_INIT(ztest_dmu_object_alloc_free, 1, &zopt_always), + ZTI_INIT(ztest_dmu_object_next_chunk, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_commit_callbacks, 1, &zopt_always), + ZTI_INIT(ztest_zap, 30, &zopt_always), + ZTI_INIT(ztest_zap_parallel, 100, &zopt_always), + ZTI_INIT(ztest_split_pool, 1, &zopt_always), + ZTI_INIT(ztest_zil_commit, 1, &zopt_incessant), + ZTI_INIT(ztest_zil_remount, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_read_write_zcopy, 1, &zopt_often), + ZTI_INIT(ztest_dmu_objset_create_destroy, 1, &zopt_often), + ZTI_INIT(ztest_dsl_prop_get_set, 1, &zopt_often), + ZTI_INIT(ztest_spa_prop_get_set, 1, &zopt_sometimes), +#if 0 + ZTI_INIT(ztest_dmu_prealloc, 1, &zopt_sometimes), +#endif + ZTI_INIT(ztest_fzap, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes), + ZTI_INIT(ztest_spa_create_destroy, 1, &zopt_sometimes), + ZTI_INIT(ztest_fault_inject, 1, &zopt_sometimes), + ZTI_INIT(ztest_dmu_snapshot_hold, 1, &zopt_sometimes), + ZTI_INIT(ztest_mmp_enable_disable, 1, &zopt_sometimes), + ZTI_INIT(ztest_reguid, 1, &zopt_rarely), + ZTI_INIT(ztest_scrub, 1, &zopt_rarely), + ZTI_INIT(ztest_spa_upgrade, 1, &zopt_rarely), + ZTI_INIT(ztest_dsl_dataset_promote_busy, 1, &zopt_rarely), + ZTI_INIT(ztest_vdev_attach_detach, 1, &zopt_sometimes), + ZTI_INIT(ztest_vdev_LUN_growth, 1, &zopt_rarely), + ZTI_INIT(ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime), + ZTI_INIT(ztest_vdev_class_add, 1, &ztest_opts.zo_vdevtime), + ZTI_INIT(ztest_vdev_aux_add_remove, 1, &ztest_opts.zo_vdevtime), + ZTI_INIT(ztest_device_removal, 1, &zopt_sometimes), + ZTI_INIT(ztest_spa_checkpoint_create_discard, 1, &zopt_rarely), + ZTI_INIT(ztest_initialize, 1, &zopt_sometimes), + ZTI_INIT(ztest_trim, 1, &zopt_sometimes), + ZTI_INIT(ztest_fletcher, 1, &zopt_rarely), + ZTI_INIT(ztest_fletcher_incr, 1, &zopt_rarely), + ZTI_INIT(ztest_verify_dnode_bt, 1, &zopt_sometimes), +}; + +#define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) + +/* + * The following struct is used to hold a list of uncalled commit callbacks. + * The callbacks are ordered by txg number. + */ +typedef struct ztest_cb_list { + kmutex_t zcl_callbacks_lock; + list_t zcl_callbacks; +} ztest_cb_list_t; + +/* + * Stuff we need to share writably between parent and child. 
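Before the shared structure itself, a sketch of the sharing mechanism: as described in the design notes above, state that must survive fork() and exec() lives in a temporary file mapped MAP_SHARED. The minimal example below uses hypothetical names and a single counter in place of ztest_shared_t. The mapping itself does not survive exec(), but the open descriptor does, so a re-invoked child can simply mmap() the file again.

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/mman.h>
#include <sys/wait.h>
#include <unistd.h>

typedef struct shared_state {
        uint64_t passes_done;   /* example field only */
} shared_state_t;

int
main(void)
{
        char path[] = "/tmp/shared.XXXXXX";
        int fd = mkstemp(path);

        if (fd == -1 || ftruncate(fd, sizeof (shared_state_t)) != 0) {
                perror("mkstemp/ftruncate");
                return (1);
        }
        shared_state_t *ss = mmap(NULL, sizeof (*ss),
            PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
        if (ss == MAP_FAILED) {
                perror("mmap");
                return (1);
        }
        (void) memset(ss, 0, sizeof (*ss));

        if (fork() == 0) {
                ss->passes_done++;      /* visible to the parent */
                _exit(0);
        }
        (void) wait(NULL);
        (void) printf("passes_done = %llu\n",
            (unsigned long long)ss->passes_done);
        (void) unlink(path);
        return (0);
}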
+ */ +typedef struct ztest_shared { + boolean_t zs_do_init; + hrtime_t zs_proc_start; + hrtime_t zs_proc_stop; + hrtime_t zs_thread_start; + hrtime_t zs_thread_stop; + hrtime_t zs_thread_kill; + uint64_t zs_enospc_count; + uint64_t zs_vdev_next_leaf; + uint64_t zs_vdev_aux; + uint64_t zs_alloc; + uint64_t zs_space; + uint64_t zs_splits; + uint64_t zs_mirrors; + uint64_t zs_metaslab_sz; + uint64_t zs_metaslab_df_alloc_threshold; + uint64_t zs_guid; +} ztest_shared_t; + +#define ID_PARALLEL -1ULL + +static char ztest_dev_template[] = "%s/%s.%llua"; +static char ztest_aux_template[] = "%s/%s.%s.%llu"; +ztest_shared_t *ztest_shared; + +static spa_t *ztest_spa = NULL; +static ztest_ds_t *ztest_ds; + +static kmutex_t ztest_vdev_lock; +static boolean_t ztest_device_removal_active = B_FALSE; +static boolean_t ztest_pool_scrubbed = B_FALSE; +static kmutex_t ztest_checkpoint_lock; + +/* + * The ztest_name_lock protects the pool and dataset namespace used by + * the individual tests. To modify the namespace, consumers must grab + * this lock as writer. Grabbing the lock as reader will ensure that the + * namespace does not change while the lock is held. + */ +static pthread_rwlock_t ztest_name_lock; + +static boolean_t ztest_dump_core = B_TRUE; +static boolean_t ztest_exiting; + +/* Global commit callback list */ +static ztest_cb_list_t zcl; +/* Commit cb delay */ +static uint64_t zc_min_txg_delay = UINT64_MAX; +static int zc_cb_counter = 0; + +/* + * Minimum number of commit callbacks that need to be registered for us to check + * whether the minimum txg delay is acceptable. + */ +#define ZTEST_COMMIT_CB_MIN_REG 100 + +/* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ +#define ZTEST_COMMIT_CB_THRESH (TXG_CONCURRENT_STATES + 1000) + +enum ztest_object { + ZTEST_META_DNODE = 0, + ZTEST_DIROBJ, + ZTEST_OBJECTS +}; + +static void usage(boolean_t) __NORETURN; +static int ztest_scrub_impl(spa_t *spa); + +/* + * These libumem hooks provide a reasonable set of defaults for the allocator's + * debugging facilities. + */ +const char * +_umem_debug_init(void) +{ + return ("default,verbose"); /* $UMEM_DEBUG setting */ +} + +const char * +_umem_logging_init(void) +{ + return ("fail,contents"); /* $UMEM_LOGGING setting */ +} + +static void +dump_debug_buffer(void) +{ + ssize_t ret __attribute__((unused)); + + if (!ztest_opts.zo_dump_dbgmsg) + return; + + /* + * We use write() instead of printf() so that this function + * is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "\n", 1); + zfs_dbgmsg_print("ztest"); +} + +#define BACKTRACE_SZ 100 + +static void sig_handler(int signo) +{ + struct sigaction action; +#ifdef __GLIBC__ /* backtrace() is a GNU extension */ + int nptrs; + void *buffer[BACKTRACE_SZ]; + + nptrs = backtrace(buffer, BACKTRACE_SZ); + backtrace_symbols_fd(buffer, nptrs, STDERR_FILENO); +#endif + dump_debug_buffer(); + + /* + * Restore default action and re-raise signal so SIGSEGV and + * SIGABRT can trigger a core dump. + */ + action.sa_handler = SIG_DFL; + sigemptyset(&action.sa_mask); + action.sa_flags = 0; + (void) sigaction(signo, &action, NULL); + raise(signo); +} + +#define FATAL_MSG_SZ 1024 + +char *fatal_msg; + +static void +fatal(int do_perror, char *message, ...) 
+{ + va_list args; + int save_errno = errno; + char *buf; + + (void) fflush(stdout); + buf = umem_alloc(FATAL_MSG_SZ, UMEM_NOFAIL); + + va_start(args, message); + (void) sprintf(buf, "ztest: "); + /* LINTED */ + (void) vsprintf(buf + strlen(buf), message, args); + va_end(args); + if (do_perror) { + (void) snprintf(buf + strlen(buf), FATAL_MSG_SZ - strlen(buf), + ": %s", strerror(save_errno)); + } + (void) fprintf(stderr, "%s\n", buf); + fatal_msg = buf; /* to ease debugging */ + + if (ztest_dump_core) + abort(); + else + dump_debug_buffer(); + + exit(3); +} + +static int +str2shift(const char *buf) +{ + const char *ends = "BKMGTPEZ"; + int i; + + if (buf[0] == '\0') + return (0); + for (i = 0; i < strlen(ends); i++) { + if (toupper(buf[0]) == ends[i]) + break; + } + if (i == strlen(ends)) { + (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", + buf); + usage(B_FALSE); + } + if (buf[1] == '\0' || (toupper(buf[1]) == 'B' && buf[2] == '\0')) { + return (10*i); + } + (void) fprintf(stderr, "ztest: invalid bytes suffix: %s\n", buf); + usage(B_FALSE); + /* NOTREACHED */ +} + +static uint64_t +nicenumtoull(const char *buf) +{ + char *end; + uint64_t val; + + val = strtoull(buf, &end, 0); + if (end == buf) { + (void) fprintf(stderr, "ztest: bad numeric value: %s\n", buf); + usage(B_FALSE); + } else if (end[0] == '.') { + double fval = strtod(buf, &end); + fval *= pow(2, str2shift(end)); + /* + * UINT64_MAX is not exactly representable as a double. + * The closest representation is UINT64_MAX + 1, so we + * use a >= comparison instead of > for the bounds check. + */ + if (fval >= (double)UINT64_MAX) { + (void) fprintf(stderr, "ztest: value too large: %s\n", + buf); + usage(B_FALSE); + } + val = (uint64_t)fval; + } else { + int shift = str2shift(end); + if (shift >= 64 || (val << shift) >> shift != val) { + (void) fprintf(stderr, "ztest: value too large: %s\n", + buf); + usage(B_FALSE); + } + val <<= shift; + } + return (val); +} + +static void +usage(boolean_t requested) +{ + const ztest_shared_opts_t *zo = &ztest_opts_defaults; + + char nice_vdev_size[NN_NUMBUF_SZ]; + char nice_force_ganging[NN_NUMBUF_SZ]; + FILE *fp = requested ? stdout : stderr; + + nicenum(zo->zo_vdev_size, nice_vdev_size, sizeof (nice_vdev_size)); + nicenum(zo->zo_metaslab_force_ganging, nice_force_ganging, + sizeof (nice_force_ganging)); + + (void) fprintf(fp, "Usage: %s\n" + "\t[-v vdevs (default: %llu)]\n" + "\t[-s size_of_each_vdev (default: %s)]\n" + "\t[-a alignment_shift (default: %d)] use 0 for random\n" + "\t[-m mirror_copies (default: %d)]\n" + "\t[-r raidz_disks (default: %d)]\n" + "\t[-R raidz_parity (default: %d)]\n" + "\t[-d datasets (default: %d)]\n" + "\t[-t threads (default: %d)]\n" + "\t[-g gang_block_threshold (default: %s)]\n" + "\t[-i init_count (default: %d)] initialize pool i times\n" + "\t[-k kill_percentage (default: %llu%%)]\n" + "\t[-p pool_name (default: %s)]\n" + "\t[-f dir (default: %s)] file directory for vdev files\n" + "\t[-M] Multi-host simulate pool imported on remote host\n" + "\t[-V] verbose (use multiple times for ever more blather)\n" + "\t[-E] use existing pool instead of creating new one\n" + "\t[-T time (default: %llu sec)] total run time\n" + "\t[-F freezeloops (default: %llu)] max loops in spa_freeze()\n" + "\t[-P passtime (default: %llu sec)] time per pass\n" + "\t[-B alt_ztest (default: )] alternate ztest path\n" + "\t[-C vdev class state (default: random)] special=on|off|random\n" + "\t[-o variable=value] ... 
set global variable to an unsigned\n" + "\t 32-bit integer value\n" + "\t[-G dump zfs_dbgmsg buffer before exiting due to an error\n" + "\t[-h] (print help)\n" + "", + zo->zo_pool, + (u_longlong_t)zo->zo_vdevs, /* -v */ + nice_vdev_size, /* -s */ + zo->zo_ashift, /* -a */ + zo->zo_mirrors, /* -m */ + zo->zo_raidz, /* -r */ + zo->zo_raidz_parity, /* -R */ + zo->zo_datasets, /* -d */ + zo->zo_threads, /* -t */ + nice_force_ganging, /* -g */ + zo->zo_init, /* -i */ + (u_longlong_t)zo->zo_killrate, /* -k */ + zo->zo_pool, /* -p */ + zo->zo_dir, /* -f */ + (u_longlong_t)zo->zo_time, /* -T */ + (u_longlong_t)zo->zo_maxloops, /* -F */ + (u_longlong_t)zo->zo_passtime); + exit(requested ? 0 : 1); +} + + +static void +ztest_parse_name_value(const char *input, ztest_shared_opts_t *zo) +{ + char name[32]; + char *value; + int state = ZTEST_VDEV_CLASS_RND; + + (void) strlcpy(name, input, sizeof (name)); + + value = strchr(name, '='); + if (value == NULL) { + (void) fprintf(stderr, "missing value in property=value " + "'-C' argument (%s)\n", input); + usage(B_FALSE); + } + *(value) = '\0'; + value++; + + if (strcmp(value, "on") == 0) { + state = ZTEST_VDEV_CLASS_ON; + } else if (strcmp(value, "off") == 0) { + state = ZTEST_VDEV_CLASS_OFF; + } else if (strcmp(value, "random") == 0) { + state = ZTEST_VDEV_CLASS_RND; + } else { + (void) fprintf(stderr, "invalid property value '%s'\n", value); + usage(B_FALSE); + } + + if (strcmp(name, "special") == 0) { + zo->zo_special_vdevs = state; + } else { + (void) fprintf(stderr, "invalid property name '%s'\n", name); + usage(B_FALSE); + } + if (zo->zo_verbose >= 3) + (void) printf("%s vdev state is '%s'\n", name, value); +} + +static void +process_options(int argc, char **argv) +{ + char *path; + ztest_shared_opts_t *zo = &ztest_opts; + + int opt; + uint64_t value; + char altdir[MAXNAMELEN] = { 0 }; + + bcopy(&ztest_opts_defaults, zo, sizeof (*zo)); + + while ((opt = getopt(argc, argv, + "v:s:a:m:r:R:d:t:g:i:k:p:f:MVET:P:hF:B:C:o:G")) != EOF) { + value = 0; + switch (opt) { + case 'v': + case 's': + case 'a': + case 'm': + case 'r': + case 'R': + case 'd': + case 't': + case 'g': + case 'i': + case 'k': + case 'T': + case 'P': + case 'F': + value = nicenumtoull(optarg); + } + switch (opt) { + case 'v': + zo->zo_vdevs = value; + break; + case 's': + zo->zo_vdev_size = MAX(SPA_MINDEVSIZE, value); + break; + case 'a': + zo->zo_ashift = value; + break; + case 'm': + zo->zo_mirrors = value; + break; + case 'r': + zo->zo_raidz = MAX(1, value); + break; + case 'R': + zo->zo_raidz_parity = MIN(MAX(value, 1), 3); + break; + case 'd': + zo->zo_datasets = MAX(1, value); + break; + case 't': + zo->zo_threads = MAX(1, value); + break; + case 'g': + zo->zo_metaslab_force_ganging = + MAX(SPA_MINBLOCKSIZE << 1, value); + break; + case 'i': + zo->zo_init = value; + break; + case 'k': + zo->zo_killrate = value; + break; + case 'p': + (void) strlcpy(zo->zo_pool, optarg, + sizeof (zo->zo_pool)); + break; + case 'f': + path = realpath(optarg, NULL); + if (path == NULL) { + (void) fprintf(stderr, "error: %s: %s\n", + optarg, strerror(errno)); + usage(B_FALSE); + } else { + (void) strlcpy(zo->zo_dir, path, + sizeof (zo->zo_dir)); + free(path); + } + break; + case 'M': + zo->zo_mmp_test = 1; + break; + case 'V': + zo->zo_verbose++; + break; + case 'E': + zo->zo_init = 0; + break; + case 'T': + zo->zo_time = value; + break; + case 'P': + zo->zo_passtime = MAX(1, value); + break; + case 'F': + zo->zo_maxloops = MAX(1, value); + break; + case 'B': + (void) strlcpy(altdir, optarg, sizeof 
(altdir)); + break; + case 'C': + ztest_parse_name_value(optarg, zo); + break; + case 'o': + if (set_global_var(optarg) != 0) + usage(B_FALSE); + break; + case 'G': + zo->zo_dump_dbgmsg = 1; + break; + case 'h': + usage(B_TRUE); + break; + case '?': + default: + usage(B_FALSE); + break; + } + } + + zo->zo_raidz_parity = MIN(zo->zo_raidz_parity, zo->zo_raidz - 1); + + zo->zo_vdevtime = + (zo->zo_vdevs > 0 ? zo->zo_time * NANOSEC / zo->zo_vdevs : + UINT64_MAX >> 2); + + if (strlen(altdir) > 0) { + char *cmd; + char *realaltdir; + char *bin; + char *ztest; + char *isa; + int isalen; + + cmd = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + realaltdir = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + VERIFY(NULL != realpath(getexecname(), cmd)); + if (0 != access(altdir, F_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest path: %s", + altdir); + } + VERIFY(NULL != realpath(altdir, realaltdir)); + + /* + * 'cmd' should be of the form "/usr/bin//ztest". + * We want to extract to determine if we should use + * 32 or 64 bit binaries. + */ + bin = strstr(cmd, "/usr/bin/"); + ztest = strstr(bin, "/ztest"); + isa = bin + 9; + isalen = ztest - isa; + (void) snprintf(zo->zo_alt_ztest, sizeof (zo->zo_alt_ztest), + "%s/usr/bin/%.*s/ztest", realaltdir, isalen, isa); + (void) snprintf(zo->zo_alt_libpath, sizeof (zo->zo_alt_libpath), + "%s/usr/lib/%.*s", realaltdir, isalen, isa); + + if (0 != access(zo->zo_alt_ztest, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate ztest: %s", + zo->zo_alt_ztest); + } else if (0 != access(zo->zo_alt_libpath, X_OK)) { + ztest_dump_core = B_FALSE; + fatal(B_TRUE, "invalid alternate lib directory %s", + zo->zo_alt_libpath); + } + + umem_free(cmd, MAXPATHLEN); + umem_free(realaltdir, MAXPATHLEN); + } +} + +static void +ztest_kill(ztest_shared_t *zs) +{ + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(ztest_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(ztest_spa)); + + /* + * Before we kill off ztest, make sure that the config is updated. + * See comment above spa_write_cachefile(). + */ + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(ztest_spa, B_FALSE, B_FALSE); + mutex_exit(&spa_namespace_lock); + + (void) kill(getpid(), SIGKILL); +} + +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + ASSERT3S(ztest_fd_rand, >=, 0); + + if (range == 0) + return (0); + + if (read(ztest_fd_rand, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} + +/* ARGSUSED */ +static void +ztest_record_enospc(const char *s) +{ + ztest_shared->zs_enospc_count++; +} + +static uint64_t +ztest_get_ashift(void) +{ + if (ztest_opts.zo_ashift == 0) + return (SPA_MINBLOCKSHIFT + ztest_random(5)); + return (ztest_opts.zo_ashift); +} + +static nvlist_t * +make_vdev_file(char *path, char *aux, char *pool, size_t size, uint64_t ashift) +{ + char *pathbuf; + uint64_t vdev; + nvlist_t *file; + + pathbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + if (ashift == 0) + ashift = ztest_get_ashift(); + + if (path == NULL) { + path = pathbuf; + + if (aux != NULL) { + vdev = ztest_shared->zs_vdev_aux; + (void) snprintf(path, MAXPATHLEN, + ztest_aux_template, ztest_opts.zo_dir, + pool == NULL ? ztest_opts.zo_pool : pool, + aux, vdev); + } else { + vdev = ztest_shared->zs_vdev_next_leaf++; + (void) snprintf(path, MAXPATHLEN, + ztest_dev_template, ztest_opts.zo_dir, + pool == NULL ? 
ztest_opts.zo_pool : pool, vdev); + } + } + + if (size != 0) { + int fd = open(path, O_RDWR | O_CREAT | O_TRUNC, 0666); + if (fd == -1) + fatal(1, "can't open %s", path); + if (ftruncate(fd, size) != 0) + fatal(1, "can't ftruncate %s", path); + (void) close(fd); + } + + VERIFY(nvlist_alloc(&file, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_TYPE, VDEV_TYPE_FILE) == 0); + VERIFY(nvlist_add_string(file, ZPOOL_CONFIG_PATH, path) == 0); + VERIFY(nvlist_add_uint64(file, ZPOOL_CONFIG_ASHIFT, ashift) == 0); + umem_free(pathbuf, MAXPATHLEN); + + return (file); +} + +static nvlist_t * +make_vdev_raidz(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r) +{ + nvlist_t *raidz, **child; + int c; + + if (r < 2) + return (make_vdev_file(path, aux, pool, size, ashift)); + child = umem_alloc(r * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < r; c++) + child[c] = make_vdev_file(path, aux, pool, size, ashift); + + VERIFY(nvlist_alloc(&raidz, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(raidz, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_RAIDZ) == 0); + VERIFY(nvlist_add_uint64(raidz, ZPOOL_CONFIG_NPARITY, + ztest_opts.zo_raidz_parity) == 0); + VERIFY(nvlist_add_nvlist_array(raidz, ZPOOL_CONFIG_CHILDREN, + child, r) == 0); + + for (c = 0; c < r; c++) + nvlist_free(child[c]); + + umem_free(child, r * sizeof (nvlist_t *)); + + return (raidz); +} + +static nvlist_t * +make_vdev_mirror(char *path, char *aux, char *pool, size_t size, + uint64_t ashift, int r, int m) +{ + nvlist_t *mirror, **child; + int c; + + if (m < 1) + return (make_vdev_raidz(path, aux, pool, size, ashift, r)); + + child = umem_alloc(m * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < m; c++) + child[c] = make_vdev_raidz(path, aux, pool, size, ashift, r); + + VERIFY(nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MIRROR) == 0); + VERIFY(nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN, + child, m) == 0); + + for (c = 0; c < m; c++) + nvlist_free(child[c]); + + umem_free(child, m * sizeof (nvlist_t *)); + + return (mirror); +} + +static nvlist_t * +make_vdev_root(char *path, char *aux, char *pool, size_t size, uint64_t ashift, + const char *class, int r, int m, int t) +{ + nvlist_t *root, **child; + int c; + boolean_t log; + + ASSERT(t > 0); + + log = (class != NULL && strcmp(class, "log") == 0); + + child = umem_alloc(t * sizeof (nvlist_t *), UMEM_NOFAIL); + + for (c = 0; c < t; c++) { + child[c] = make_vdev_mirror(path, aux, pool, size, ashift, + r, m); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + log) == 0); + + if (class != NULL && class[0] != '\0') { + ASSERT(m > 1 || log); /* expecting a mirror */ + VERIFY(nvlist_add_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, class) == 0); + } + } + + VERIFY(nvlist_alloc(&root, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(root, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(root, aux ? aux : ZPOOL_CONFIG_CHILDREN, + child, t) == 0); + + for (c = 0; c < t; c++) + nvlist_free(child[c]); + + umem_free(child, t * sizeof (nvlist_t *)); + + return (root); +} + +/* + * Find a random spa version. Returns back a random spa version in the + * range [initial_version, SPA_VERSION_FEATURES]. 
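The make_vdev_*() helpers above compose the pool layout bottom-up, each layer wrapping its children in a fresh nvlist. As a sketch of just that nesting, here is a two-way mirror of file vdevs, assuming the libnvpair API and the ZPOOL_CONFIG_*/VDEV_TYPE_* string constants (from sys/fs/zfs.h) already used throughout this file; error handling is elided where the real code wraps every call in VERIFY().

#include <libnvpair.h>
#include <sys/fs/zfs.h>

static nvlist_t *
toy_mirror_of_files(char *path0, char *path1)
{
        nvlist_t *file[2], *mirror;

        for (int i = 0; i < 2; i++) {
                (void) nvlist_alloc(&file[i], NV_UNIQUE_NAME, 0);
                (void) nvlist_add_string(file[i], ZPOOL_CONFIG_TYPE,
                    VDEV_TYPE_FILE);
                (void) nvlist_add_string(file[i], ZPOOL_CONFIG_PATH,
                    i == 0 ? path0 : path1);
        }
        (void) nvlist_alloc(&mirror, NV_UNIQUE_NAME, 0);
        (void) nvlist_add_string(mirror, ZPOOL_CONFIG_TYPE,
            VDEV_TYPE_MIRROR);
        (void) nvlist_add_nvlist_array(mirror, ZPOOL_CONFIG_CHILDREN,
            file, 2);
        nvlist_free(file[0]);
        nvlist_free(file[1]);
        return (mirror);
}

make_vdev_mirror() and make_vdev_root() above repeat the same wrapping at each level, so an entire pool topology ends up as one nested nvlist.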
+ */ +static uint64_t +ztest_random_spa_version(uint64_t initial_version) +{ + uint64_t version = initial_version; + + if (version <= SPA_VERSION_BEFORE_FEATURES) { + version = version + + ztest_random(SPA_VERSION_BEFORE_FEATURES - version + 1); + } + + if (version > SPA_VERSION_BEFORE_FEATURES) + version = SPA_VERSION_FEATURES; + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + return (version); +} + +static int +ztest_random_blocksize(void) +{ + ASSERT(ztest_spa->spa_max_ashift != 0); + + /* + * Choose a block size >= the ashift. + * If the SPA supports new MAXBLOCKSIZE, test up to 1MB blocks. + */ + int maxbs = SPA_OLD_MAXBLOCKSHIFT; + if (spa_maxblocksize(ztest_spa) == SPA_MAXBLOCKSIZE) + maxbs = 20; + uint64_t block_shift = + ztest_random(maxbs - ztest_spa->spa_max_ashift + 1); + return (1 << (SPA_MINBLOCKSHIFT + block_shift)); +} + +static int +ztest_random_dnodesize(void) +{ + int slots; + int max_slots = spa_maxdnodesize(ztest_spa) >> DNODE_SHIFT; + + if (max_slots == DNODE_MIN_SLOTS) + return (DNODE_MIN_SIZE); + + /* + * Weight the random distribution more heavily toward smaller + * dnode sizes since that is more likely to reflect real-world + * usage. + */ + ASSERT3U(max_slots, >, 4); + switch (ztest_random(10)) { + case 0: + slots = 5 + ztest_random(max_slots - 4); + break; + case 1 ... 4: + slots = 2 + ztest_random(3); + break; + default: + slots = 1; + break; + } + + return (slots << DNODE_SHIFT); +} + +static int +ztest_random_ibshift(void) +{ + return (DN_MIN_INDBLKSHIFT + + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); +} + +static uint64_t +ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) +{ + uint64_t top; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + do { + top = ztest_random(rvd->vdev_children); + tvd = rvd->vdev_child[top]; + } while (!vdev_is_concrete(tvd) || (tvd->vdev_islog && !log_ok) || + tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); + + return (top); +} + +static uint64_t +ztest_random_dsl_prop(zfs_prop_t prop) +{ + uint64_t value; + + do { + value = zfs_prop_random_value(prop, ztest_random(-1ULL)); + } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); + + return (value); +} + +static int +ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, + boolean_t inherit) +{ + const char *propname = zfs_prop_to_name(prop); + const char *valname; + char *setpoint; + uint64_t curval; + int error; + + error = dsl_prop_set_int(osname, propname, + (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), value); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (error); + } + ASSERT0(error); + + setpoint = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + VERIFY0(dsl_prop_get_integer(osname, propname, &curval, setpoint)); + + if (ztest_opts.zo_verbose >= 6) { + int err; + + err = zfs_prop_index_to_string(prop, curval, &valname); + if (err) + (void) printf("%s %s = %llu at '%s'\n", osname, + propname, (unsigned long long)curval, setpoint); + else + (void) printf("%s %s = %s at '%s'\n", + osname, propname, valname, setpoint); + } + umem_free(setpoint, MAXPATHLEN); + + return (error); +} + +static int +ztest_spa_prop_set_uint64(zpool_prop_t prop, uint64_t value) +{ + spa_t *spa = ztest_spa; + nvlist_t *props = NULL; + int error; + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); + + error = spa_prop_set(spa, props); + + nvlist_free(props); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (error); + } + ASSERT0(error); + + return (error); +} + +static int +ztest_dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) +{ + int err; + char *cp = NULL; + char ddname[ZFS_MAX_DATASET_NAME_LEN]; + + strcpy(ddname, name); + cp = strchr(ddname, '@'); + if (cp != NULL) + *cp = '\0'; + + err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); + while (decrypt && err == EACCES) { + dsl_crypto_params_t *dcp; + nvlist_t *crypto_args = fnvlist_alloc(); + + fnvlist_add_uint8_array(crypto_args, "wkeydata", + (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); + VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL, + crypto_args, &dcp)); + err = spa_keystore_load_wkey(ddname, dcp, B_FALSE); + dsl_crypto_params_free(dcp, B_FALSE); + fnvlist_free(crypto_args); + + if (err == EINVAL) { + /* + * We couldn't load a key for this dataset so try + * the parent. This loop will eventually hit the + * encryption root since ztest only makes clones + * as children of their origin datasets. 
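The retry loop above navigates dataset names textually: it strips any '@snapshot' suffix, then truncates at the last '/' to climb one level per round until it reaches a dataset whose wrapping key loads. A standalone illustration of just that string walk, with hypothetical names and plain libc:

#include <stdio.h>
#include <string.h>

/* Print a dataset name and its ancestors, e.g. for "pool/a/b@snap". */
static void
walk_to_root(const char *name)
{
        char buf[256];
        char *cp;

        (void) strncpy(buf, name, sizeof (buf) - 1);
        buf[sizeof (buf) - 1] = '\0';

        if ((cp = strchr(buf, '@')) != NULL)
                *cp = '\0';             /* drop the snapshot part */

        (void) printf("%s\n", buf);
        while ((cp = strrchr(buf, '/')) != NULL) {
                *cp = '\0';             /* climb one level */
                (void) printf("%s\n", buf);
        }
}

walk_to_root("pool/a/b@snap") prints pool/a/b, then pool/a, then pool -- the same order in which the EACCES retry loop above searches for a loadable encryption root.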
+ */ + cp = strrchr(ddname, '/'); + if (cp == NULL) + return (err); + + *cp = '\0'; + err = EACCES; + continue; + } else if (err != 0) { + break; + } + + err = dmu_objset_own(name, type, readonly, decrypt, tag, osp); + break; + } + + return (err); +} + +static void +ztest_rll_init(rll_t *rll) +{ + rll->rll_writer = NULL; + rll->rll_readers = 0; + mutex_init(&rll->rll_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rll->rll_cv, NULL, CV_DEFAULT, NULL); +} + +static void +ztest_rll_destroy(rll_t *rll) +{ + ASSERT(rll->rll_writer == NULL); + ASSERT(rll->rll_readers == 0); + mutex_destroy(&rll->rll_lock); + cv_destroy(&rll->rll_cv); +} + +static void +ztest_rll_lock(rll_t *rll, rl_type_t type) +{ + mutex_enter(&rll->rll_lock); + + if (type == RL_READER) { + while (rll->rll_writer != NULL) + (void) cv_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_readers++; + } else { + while (rll->rll_writer != NULL || rll->rll_readers) + (void) cv_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_writer = curthread; + } + + mutex_exit(&rll->rll_lock); +} + +static void +ztest_rll_unlock(rll_t *rll) +{ + mutex_enter(&rll->rll_lock); + + if (rll->rll_writer) { + ASSERT(rll->rll_readers == 0); + rll->rll_writer = NULL; + } else { + ASSERT(rll->rll_readers != 0); + ASSERT(rll->rll_writer == NULL); + rll->rll_readers--; + } + + if (rll->rll_writer == NULL && rll->rll_readers == 0) + cv_broadcast(&rll->rll_cv); + + mutex_exit(&rll->rll_lock); +} + +static void +ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; + + ztest_rll_lock(rll, type); +} + +static void +ztest_object_unlock(ztest_ds_t *zd, uint64_t object) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; + + ztest_rll_unlock(rll); +} + +static rl_t * +ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, + uint64_t size, rl_type_t type) +{ + uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); + rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; + rl_t *rl; + + rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); + rl->rl_object = object; + rl->rl_offset = offset; + rl->rl_size = size; + rl->rl_lock = rll; + + ztest_rll_lock(rll, type); + + return (rl); +} + +static void +ztest_range_unlock(rl_t *rl) +{ + rll_t *rll = rl->rl_lock; + + ztest_rll_unlock(rll); + + umem_free(rl, sizeof (*rl)); +} + +static void +ztest_zd_init(ztest_ds_t *zd, ztest_shared_ds_t *szd, objset_t *os) +{ + zd->zd_os = os; + zd->zd_zilog = dmu_objset_zil(os); + zd->zd_shared = szd; + dmu_objset_name(os, zd->zd_name); + int l; + + if (zd->zd_shared != NULL) + zd->zd_shared->zd_seq = 0; + + VERIFY0(pthread_rwlock_init(&zd->zd_zilog_lock, NULL)); + mutex_init(&zd->zd_dirobj_lock, NULL, MUTEX_DEFAULT, NULL); + + for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_init(&zd->zd_object_lock[l]); + + for (l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_init(&zd->zd_range_lock[l]); +} + +static void +ztest_zd_fini(ztest_ds_t *zd) +{ + int l; + + mutex_destroy(&zd->zd_dirobj_lock); + (void) pthread_rwlock_destroy(&zd->zd_zilog_lock); + + for (l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_destroy(&zd->zd_object_lock[l]); + + for (l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_destroy(&zd->zd_range_lock[l]); +} + +#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) + +static uint64_t +ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) +{ + uint64_t txg; + int error; + + /* + * Attempt to assign tx to some transaction group. 
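+ *
+ * (Editorial note, not part of the original change: a return of
+ * 0 means the tx was already aborted here, either after
+ * dmu_tx_wait() on an ERESTART from TXG_NOWAIT, or after
+ * recording ENOSPC.  A typical caller in this file therefore
+ * looks like:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, offset, size);
+ *	txg = ztest_tx_assign(tx, TXG_WAIT, FTAG);
+ *	if (txg == 0)
+ *		return (ENOSPC);	-- tx already aborted
+ *	...
+ *	dmu_tx_commit(tx);
+ *
+ * as in ztest_replay_write() below.)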
+ */ + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ASSERT3U(error, ==, ENOSPC); + ztest_record_enospc(tag); + } + dmu_tx_abort(tx); + return (0); + } + txg = dmu_tx_get_txg(tx); + ASSERT(txg != 0); + return (txg); +} + +static void +ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, + uint64_t crtxg) +{ + bt->bt_magic = BT_MAGIC; + bt->bt_objset = dmu_objset_id(os); + bt->bt_object = object; + bt->bt_dnodesize = dnodesize; + bt->bt_offset = offset; + bt->bt_gen = gen; + bt->bt_txg = txg; + bt->bt_crtxg = crtxg; +} + +static void +ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t dnodesize, uint64_t offset, uint64_t gen, uint64_t txg, + uint64_t crtxg) +{ + ASSERT3U(bt->bt_magic, ==, BT_MAGIC); + ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); + ASSERT3U(bt->bt_object, ==, object); + ASSERT3U(bt->bt_dnodesize, ==, dnodesize); + ASSERT3U(bt->bt_offset, ==, offset); + ASSERT3U(bt->bt_gen, <=, gen); + ASSERT3U(bt->bt_txg, <=, txg); + ASSERT3U(bt->bt_crtxg, ==, crtxg); +} + +static ztest_block_tag_t * +ztest_bt_bonus(dmu_buf_t *db) +{ + dmu_object_info_t doi; + ztest_block_tag_t *bt; + + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, <=, db->db_size); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); + bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + + return (bt); +} + +/* + * Generate a token to fill up unused bonus buffer space. Try to make + * it unique to the object, generation, and offset to verify that data + * is not getting overwritten by data from other dnodes. + */ +#define ZTEST_BONUS_FILL_TOKEN(obj, ds, gen, offset) \ + (((ds) << 48) | ((gen) << 32) | ((obj) << 8) | (offset)) + +/* + * Fill up the unused bonus buffer region before the block tag with a + * verifiable pattern. Filling the whole bonus area with non-zero data + * helps ensure that all dnode traversal code properly skips the + * interior regions of large dnodes. + */ +static void +ztest_fill_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, + objset_t *os, uint64_t gen) +{ + uint64_t *bonusp; + + ASSERT(IS_P2ALIGNED((char *)end - (char *)db->db_data, 8)); + + for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { + uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), + gen, bonusp - (uint64_t *)db->db_data); + *bonusp = token; + } +} + +/* + * Verify that the unused area of a bonus buffer is filled with the + * expected tokens. 
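+ *
+ * (Editorial worked example of ZTEST_BONUS_FILL_TOKEN above: for
+ * objset 2, generation 3, object 5 and word offset 1, the
+ * expected fill word is
+ *
+ *	(2ULL << 48) | (3ULL << 32) | (5ULL << 8) | 1
+ *	    == 0x0002000300000501
+ *
+ * so any mismatch pinpoints which field was clobbered.)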
+ */ +static void +ztest_verify_unused_bonus(dmu_buf_t *db, void *end, uint64_t obj, + objset_t *os, uint64_t gen) +{ + uint64_t *bonusp; + + for (bonusp = db->db_data; bonusp < (uint64_t *)end; bonusp++) { + uint64_t token = ZTEST_BONUS_FILL_TOKEN(obj, dmu_objset_id(os), + gen, bonusp - (uint64_t *)db->db_data); + VERIFY3U(*bonusp, ==, token); + } +} + +/* + * ZIL logging ops + */ + +#define lrz_type lr_mode +#define lrz_blocksize lr_uid +#define lrz_ibshift lr_gid +#define lrz_bonustype lr_rdev +#define lrz_dnodesize lr_crtime[1] + +static void +ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr, uint64_t object) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + itx->itx_oid = object; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) +{ + itx_t *itx; + itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + if (lr->lr_length > zil_max_log_data(zd->zd_zilog)) + write_state = WR_INDIRECT; + + itx = zil_itx_create(TX_WRITE, + sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); + + if (write_state == WR_COPIED && + dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, + ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + write_state = WR_NEED_COPY; + } + itx->itx_private = zd; + itx->itx_wr_state = write_state; + itx->itx_sync = (ztest_random(8) == 0); + + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +static void +ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return; + + itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + itx->itx_sync = B_FALSE; + zil_itx_assign(zd->zd_zilog, itx, tx); +} + +/* + * ZIL replay ops + */ +static int +ztest_replay_create(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_create_t *lr = arg2; + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + ztest_block_tag_t *bbt; + dmu_buf_t *db; + dmu_tx_t *tx; + uint64_t txg; + int error = 0; + int bonuslen; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } else { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + } + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) + return (ENOSPC); + + ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + bonuslen = DN_BONUS_SIZE(lr->lrz_dnodesize); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + if (lr->lr_foid == 0) { + lr->lr_foid = zap_create_dnsize(os, + lr->lrz_type, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } else { + error = zap_create_claim_dnsize(os, lr->lr_foid, + lr->lrz_type, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } + } else { + if (lr->lr_foid == 0) { + lr->lr_foid = dmu_object_alloc_dnsize(os, + lr->lrz_type, 0, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } else { + error = dmu_object_claim_dnsize(os, lr->lr_foid, + lr->lrz_type, 0, lr->lrz_bonustype, + bonuslen, lr->lrz_dnodesize, tx); + } + } + + if (error) { + ASSERT3U(error, ==, EEXIST); + ASSERT(zd->zd_zilog->zl_replay); + dmu_tx_commit(tx); + return (error); + } + + ASSERT(lr->lr_foid != 0); + + if (lr->lrz_type != DMU_OT_ZAP_OTHER) + VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + lr->lrz_blocksize, lr->lrz_ibshift, tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + bbt = ztest_bt_bonus(db); + dmu_buf_will_dirty(db, tx); + ztest_bt_generate(bbt, os, lr->lr_foid, lr->lrz_dnodesize, -1ULL, + lr->lr_gen, txg, txg); + ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, lr->lr_gen); + dmu_buf_rele(db, FTAG); + + VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + &lr->lr_foid, tx)); + + (void) ztest_log_create(zd, tx, lr); + + dmu_tx_commit(tx); + + return (0); +} + +static int 
+ztest_replay_remove(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_remove_t *lr = arg2; + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + dmu_object_info_t doi; + dmu_tx_t *tx; + uint64_t object, txg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + VERIFY3U(0, ==, + zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); + ASSERT(object != 0); + + ztest_object_lock(zd, object, RL_WRITER); + + VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_object_unlock(zd, object); + return (ENOSPC); + } + + if (doi.doi_type == DMU_OT_ZAP_OTHER) { + VERIFY3U(0, ==, zap_destroy(os, object, tx)); + } else { + VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + } + + VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + + (void) ztest_log_remove(zd, tx, lr, object); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, object); + + return (0); +} + +static int +ztest_replay_write(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_write_t *lr = arg2; + objset_t *os = zd->zd_os; + void *data = lr + 1; /* data follows lr */ + uint64_t offset, length; + ztest_block_tag_t *bt = data; + ztest_block_tag_t *bbt; + uint64_t gen, txg, lrtxg, crtxg; + dmu_object_info_t doi; + dmu_tx_t *tx; + dmu_buf_t *db; + arc_buf_t *abuf = NULL; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + if (bt->bt_magic == BSWAP_64(BT_MAGIC)) + byteswap_uint64_array(bt, sizeof (*bt)); + + if (bt->bt_magic != BT_MAGIC) + bt = NULL; + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + dmu_object_info_from_db(db, &doi); + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + gen = bbt->bt_gen; + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, lr->lr_foid, offset, length); + + if (ztest_random(8) == 0 && length == doi.doi_data_block_size && + P2PHASE(offset, length) == 0) + abuf = dmu_request_arcbuf(db, length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + dmu_buf_rele(db, FTAG); + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + if (bt != NULL) { + /* + * Usually, verify the old data before writing new data -- + * but not always, because we also want to verify correct + * behavior when the data was not recently read into cache. + */ + ASSERT(offset % doi.doi_data_block_size == 0); + if (ztest_random(4) != 0) { + int prefetch = ztest_random(2) ? 
+ DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + ztest_block_tag_t rbt; + + VERIFY(dmu_read(os, lr->lr_foid, offset, + sizeof (rbt), &rbt, prefetch) == 0); + if (rbt.bt_magic == BT_MAGIC) { + ztest_bt_verify(&rbt, os, lr->lr_foid, 0, + offset, gen, txg, crtxg); + } + } + + /* + * Writes can appear to be newer than the bonus buffer because + * the ztest_get_data() callback does a dmu_read() of the + * open-context data, which may be different than the data + * as it was when the write was generated. + */ + if (zd->zd_zilog->zl_replay) { + ztest_bt_verify(bt, os, lr->lr_foid, 0, offset, + MAX(gen, bt->bt_gen), MAX(txg, lrtxg), + bt->bt_crtxg); + } + + /* + * Set the bt's gen/txg to the bonus buffer's gen/txg + * so that all of the usual ASSERTs will work. + */ + ztest_bt_generate(bt, os, lr->lr_foid, 0, offset, gen, txg, + crtxg); + } + + if (abuf == NULL) { + dmu_write(os, lr->lr_foid, offset, length, data, tx); + } else { + bcopy(data, abuf->b_data, length); + dmu_assign_arcbuf_by_dbuf(db, offset, abuf, tx); + } + + (void) ztest_log_write(zd, tx, lr); + + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_truncate_t *lr = arg2; + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, + RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, + lr->lr_length, tx) == 0); + + (void) ztest_log_truncate(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) +{ + ztest_ds_t *zd = arg1; + lr_setattr_t *lr = arg2; + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + dmu_buf_t *db; + ztest_block_tag_t *bbt; + uint64_t txg, lrtxg, crtxg, dnodesize; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, lr->lr_foid); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + dnodesize = bbt->bt_dnodesize; + + if (zd->zd_zilog->zl_replay) { + ASSERT(lr->lr_size != 0); + ASSERT(lr->lr_mode != 0); + ASSERT(lrtxg != 0); + } else { + /* + * Randomly change the size and increment the generation. + */ + lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * + sizeof (*bbt); + lr->lr_mode = bbt->bt_gen + 1; + ASSERT(lrtxg == 0); + } + + /* + * Verify that the current bonus buffer is not newer than our txg. 
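+ *
+ * (Editorial note: ztest_bt_verify() checks bt_txg <= txg, so
+ * passing MAX(txg, lrtxg) accepts both cases: during replay,
+ * lrtxg carries the original commit txg; in open context, lrtxg
+ * is 0 and the freshly assigned txg is the upper bound.)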
+ */ + ztest_bt_verify(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, + MAX(txg, lrtxg), crtxg); + + dmu_buf_will_dirty(db, tx); + + ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); + ASSERT3U(lr->lr_size, <=, db->db_size); + VERIFY0(dmu_set_bonus(db, lr->lr_size, tx)); + bbt = ztest_bt_bonus(db); + + ztest_bt_generate(bbt, os, lr->lr_foid, dnodesize, -1ULL, lr->lr_mode, + txg, crtxg); + ztest_fill_unused_bonus(db, bbt, lr->lr_foid, os, bbt->bt_gen); + dmu_buf_rele(db, FTAG); + + (void) ztest_log_setattr(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { + NULL, /* 0 no such transaction type */ + ztest_replay_create, /* TX_CREATE */ + NULL, /* TX_MKDIR */ + NULL, /* TX_MKXATTR */ + NULL, /* TX_SYMLINK */ + ztest_replay_remove, /* TX_REMOVE */ + NULL, /* TX_RMDIR */ + NULL, /* TX_LINK */ + NULL, /* TX_RENAME */ + ztest_replay_write, /* TX_WRITE */ + ztest_replay_truncate, /* TX_TRUNCATE */ + ztest_replay_setattr, /* TX_SETATTR */ + NULL, /* TX_ACL */ + NULL, /* TX_CREATE_ACL */ + NULL, /* TX_CREATE_ATTR */ + NULL, /* TX_CREATE_ACL_ATTR */ + NULL, /* TX_MKDIR_ACL */ + NULL, /* TX_MKDIR_ATTR */ + NULL, /* TX_MKDIR_ACL_ATTR */ + NULL, /* TX_WRITE2 */ +}; + +/* + * ZIL get_data callbacks + */ + +/* ARGSUSED */ +static void +ztest_get_done(zgd_t *zgd, int error) +{ + ztest_ds_t *zd = zgd->zgd_private; + uint64_t object = ((rl_t *)zgd->zgd_lr)->rl_object; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + ztest_range_unlock((rl_t *)zgd->zgd_lr); + ztest_object_unlock(zd, object); + + umem_free(zgd, sizeof (*zgd)); +} + +static int +ztest_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, + zio_t *zio) +{ + ztest_ds_t *zd = arg; + objset_t *os = zd->zd_os; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + uint64_t txg = lr->lr_common.lrc_txg; + uint64_t crtxg; + dmu_object_info_t doi; + dmu_buf_t *db; + zgd_t *zgd; + int error; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + ztest_object_lock(zd, object, RL_READER); + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) { + ztest_object_unlock(zd, object); + return (error); + } + + crtxg = ztest_bt_bonus(db)->bt_crtxg; + + if (crtxg == 0 || crtxg > txg) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, object); + return (ENOENT); + } + + dmu_object_info_from_db(db, &doi); + dmu_buf_rele(db, FTAG); + db = NULL; + + zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zd; + + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); + + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + ASSERT(error == 0); + } else { + size = doi.doi_data_block_size; + if (ISP2(size)) { + offset = P2ALIGN(offset, size); + } else { + ASSERT(offset < size); + offset = 0; + } + + zgd->zgd_lr = (struct zfs_locked_range *)ztest_range_lock(zd, + object, offset, size, RL_READER); + + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + ztest_get_done, zgd); + + if (error == 0) + return (0); + } + } + + ztest_get_done(zgd, error); + + return (error); +} + +static void * 
+ztest_lr_alloc(size_t lrsize, char *name) +{ + char *lr; + size_t namesize = name ? strlen(name) + 1 : 0; + + lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); + + if (name) + bcopy(name, lr + lrsize, namesize); + + return (lr); +} + +static void +ztest_lr_free(void *lr, size_t lrsize, char *name) +{ + size_t namesize = name ? strlen(name) + 1 : 0; + + umem_free(lr, lrsize + namesize); +} + +/* + * Lookup a bunch of objects. Returns the number of objects not found. + */ +static int +ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + int i; + + ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); + + for (i = 0; i < count; i++, od++) { + od->od_object = 0; + error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, + sizeof (uint64_t), 1, &od->od_object); + if (error) { + ASSERT(error == ENOENT); + ASSERT(od->od_object == 0); + missing++; + } else { + dmu_buf_t *db; + ztest_block_tag_t *bbt; + dmu_object_info_t doi; + + ASSERT(od->od_object != 0); + ASSERT(missing == 0); /* there should be no gaps */ + + ztest_object_lock(zd, od->od_object, RL_READER); + VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, + od->od_object, FTAG, &db)); + dmu_object_info_from_db(db, &doi); + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + od->od_type = doi.doi_type; + od->od_blocksize = doi.doi_data_block_size; + od->od_gen = bbt->bt_gen; + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, od->od_object); + } + } + + return (missing); +} + +static int +ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int i; + + ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); + + for (i = 0; i < count; i++, od++) { + if (missing) { + od->od_object = 0; + missing++; + continue; + } + + lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ + lr->lrz_type = od->od_crtype; + lr->lrz_blocksize = od->od_crblocksize; + lr->lrz_ibshift = ztest_random_ibshift(); + lr->lrz_bonustype = DMU_OT_UINT64_OTHER; + lr->lrz_dnodesize = od->od_crdnodesize; + lr->lr_gen = od->od_crgen; + lr->lr_crtime[0] = time(NULL); + + if (ztest_replay_create(zd, lr, B_FALSE) != 0) { + ASSERT(missing == 0); + od->od_object = 0; + missing++; + } else { + od->od_object = lr->lr_foid; + od->od_type = od->od_crtype; + od->od_blocksize = od->od_crblocksize; + od->od_gen = od->od_crgen; + ASSERT(od->od_object != 0); + } + + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + int i; + + ASSERT(MUTEX_HELD(&zd->zd_dirobj_lock)); + + od += count - 1; + + for (i = count - 1; i >= 0; i--, od--) { + if (missing) { + missing++; + continue; + } + + /* + * No object was found. 
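+ * (A failed lookup or create leaves od_object at 0, meaning
+ * there is nothing to remove for this slot -- editorial note.)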
+ */ + if (od->od_object == 0) + continue; + + lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + + if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { + ASSERT3U(error, ==, ENOSPC); + missing++; + } else { + od->od_object = 0; + } + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, + void *data) +{ + lr_write_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + bcopy(data, lr + 1, size); + + error = ztest_replay_write(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr) + size, NULL); + + return (error); +} + +static int +ztest_truncate(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + lr_truncate_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + + error = ztest_replay_truncate(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static int +ztest_setattr(ztest_ds_t *zd, uint64_t object) +{ + lr_setattr_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_size = 0; + lr->lr_mode = 0; + + error = ztest_replay_setattr(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static void +ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + txg_wait_synced(dmu_objset_pool(os), 0); + + ztest_object_lock(zd, object, RL_READER); + rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, object, offset, size); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + + if (txg != 0) { + dmu_prealloc(os, object, offset, size, tx); + dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + } else { + (void) dmu_free_long_range(os, object, offset, size); + } + + ztest_range_unlock(rl); + ztest_object_unlock(zd, object); +} + +static void +ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) +{ + int err; + ztest_block_tag_t wbt; + dmu_object_info_t doi; + enum ztest_io_type io_type; + uint64_t blocksize; + void *data; + + VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + blocksize = doi.doi_data_block_size; + data = umem_alloc(blocksize, UMEM_NOFAIL); + + /* + * Pick an i/o type at random, biased toward writing block tags. + */ + io_type = ztest_random(ZTEST_IO_TYPES); + if (ztest_random(2) == 0) + io_type = ZTEST_IO_WRITE_TAG; + + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + + switch (io_type) { + + case ZTEST_IO_WRITE_TAG: + ztest_bt_generate(&wbt, zd->zd_os, object, doi.doi_dnodesize, + offset, 0, 0, 0); + (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); + break; + + case ZTEST_IO_WRITE_PATTERN: + (void) memset(data, 'a' + (object + offset) % 5, blocksize); + if (ztest_random(2) == 0) { + /* + * Induce fletcher2 collisions to ensure that + * zio_ddt_collision() detects and resolves them + * when using fletcher2-verify for deduplication. 
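+ *
+ * (Editorial derivation: fletcher_2 folds even-indexed 64-bit
+ * words into one accumulator pair and odd-indexed words into
+ * another, all mod 2^64.  Words 0 and 4 land in the same even
+ * stream, so flipping bit 63 in both changes that stream's
+ * simple sum by 2 * 2^63 == 0 (mod 2^64) and its running sum by
+ * an even multiple of 2^63, which is also 0; the result is two
+ * different blocks with identical fletcher2 checksums.)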
+ */ + ((uint64_t *)data)[0] ^= 1ULL << 63; + ((uint64_t *)data)[4] ^= 1ULL << 63; + } + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_WRITE_ZEROES: + bzero(data, blocksize); + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_TRUNCATE: + (void) ztest_truncate(zd, object, offset, blocksize); + break; + + case ZTEST_IO_SETATTR: + (void) ztest_setattr(zd, object); + break; + default: + break; + + case ZTEST_IO_REWRITE: + (void) pthread_rwlock_rdlock(&ztest_name_lock); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_CHECKSUM, spa_dedup_checksum(ztest_spa), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + err = ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_COMPRESSION, + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), + B_FALSE); + VERIFY(err == 0 || err == ENOSPC); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + VERIFY0(dmu_read(zd->zd_os, object, offset, blocksize, data, + DMU_READ_NO_PREFETCH)); + + (void) ztest_write(zd, object, offset, blocksize, data); + break; + } + + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); + + umem_free(data, blocksize); +} + +/* + * Initialize an object description template. + */ +static void +ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, + dmu_object_type_t type, uint64_t blocksize, uint64_t dnodesize, + uint64_t gen) +{ + od->od_dir = ZTEST_DIROBJ; + od->od_object = 0; + + od->od_crtype = type; + od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crdnodesize = dnodesize ? dnodesize : ztest_random_dnodesize(); + od->od_crgen = gen; + + od->od_type = DMU_OT_NONE; + od->od_blocksize = 0; + od->od_gen = 0; + + (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", + tag, (longlong_t)id, (u_longlong_t)index); +} + +/* + * Lookup or create the objects for a test using the od template. + * If the objects do not all exist, or if 'remove' is specified, + * remove any existing objects and create new ones. Otherwise, + * use the existing objects. + */ +static int +ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) +{ + int count = size / sizeof (*od); + int rv = 0; + + mutex_enter(&zd->zd_dirobj_lock); + if ((ztest_lookup(zd, od, count) != 0 || remove) && + (ztest_remove(zd, od, count) != 0 || + ztest_create(zd, od, count) != 0)) + rv = -1; + zd->zd_od = od; + mutex_exit(&zd->zd_dirobj_lock); + + return (rv); +} + +/* ARGSUSED */ +void +ztest_zil_commit(ztest_ds_t *zd, uint64_t id) +{ + zilog_t *zilog = zd->zd_zilog; + + (void) pthread_rwlock_rdlock(&zd->zd_zilog_lock); + + zil_commit(zilog, ztest_random(ZTEST_OBJECTS)); + + /* + * Remember the committed values in zd, which is in parent/child + * shared memory. If we die, the next iteration of ztest_run() + * will verify that the log really does contain this record. + */ + mutex_enter(&zilog->zl_lock); + ASSERT(zd->zd_shared != NULL); + ASSERT3U(zd->zd_shared->zd_seq, <=, zilog->zl_commit_lr_seq); + zd->zd_shared->zd_seq = zilog->zl_commit_lr_seq; + mutex_exit(&zilog->zl_lock); + + (void) pthread_rwlock_unlock(&zd->zd_zilog_lock); +} + +/* + * This function is designed to simulate the operations that occur during a + * mount/unmount operation. We hold the dataset across these operations in an + * attempt to expose any implicit assumptions about ZIL management. 
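+ *
+ * (Editorial note: the zil_close()/zil_open()/zil_replay()
+ * sequence below is the same one zfsvfs_teardown() and
+ * zfsvfs_setup() perform on a real unmount/mount, except that
+ * the dataset itself is never disowned.)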
+ */
+/* ARGSUSED */
+void
+ztest_zil_remount(ztest_ds_t *zd, uint64_t id)
+{
+ objset_t *os = zd->zd_os;
+
+ /*
+ * We hold the ztest_vdev_lock so we don't cause problems with
+ * other threads that wish to remove a log device, such as
+ * ztest_device_removal().
+ */
+ mutex_enter(&ztest_vdev_lock);
+
+ /*
+ * We grab the zd_dirobj_lock to ensure that no other thread is
+ * updating the zil (i.e. adding in-memory log records) and the
+ * zd_zilog_lock to block any I/O.
+ */
+ mutex_enter(&zd->zd_dirobj_lock);
+ (void) pthread_rwlock_wrlock(&zd->zd_zilog_lock);
+
+ /* zfsvfs_teardown() */
+ zil_close(zd->zd_zilog);
+
+ /* zfsvfs_setup() */
+ VERIFY(zil_open(os, ztest_get_data) == zd->zd_zilog);
+ zil_replay(os, zd, ztest_replay_vector);
+
+ (void) pthread_rwlock_unlock(&zd->zd_zilog_lock);
+ mutex_exit(&zd->zd_dirobj_lock);
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Verify that we can't destroy an active pool, create an existing pool,
+ * or create a pool with a bad vdev spec.
+ */
+/* ARGSUSED */
+void
+ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_opts_t *zo = &ztest_opts;
+ spa_t *spa;
+ nvlist_t *nvroot;
+
+ if (zo->zo_mmp_test)
+ return;
+
+ /*
+ * Attempt to create using a bad file.
+ */
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * Attempt to create using a bad mirror.
+ */
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 2, 1);
+ VERIFY3U(ENOENT, ==,
+ spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * Attempt to create an existing pool. It shouldn't matter
+ * what's in the nvroot; we should fail with EEXIST.
+ */
+ (void) pthread_rwlock_rdlock(&ztest_name_lock);
+ nvroot = make_vdev_root("/dev/bogus", NULL, NULL, 0, 0, NULL, 0, 0, 1);
+ VERIFY3U(EEXIST, ==,
+ spa_create(zo->zo_pool, nvroot, NULL, NULL, NULL));
+ nvlist_free(nvroot);
+
+ /*
+ * We open a reference to the spa and then we try to export it
+ * expecting one of the following errors:
+ *
+ * EBUSY
+ * Because of the reference we just opened.
+ *
+ * ZFS_ERR_EXPORT_IN_PROGRESS
+ * For the case that there is another ztest thread doing
+ * an export concurrently.
+ */
+ VERIFY3U(0, ==, spa_open(zo->zo_pool, &spa, FTAG));
+ int error = spa_destroy(zo->zo_pool);
+ if (error != EBUSY && error != ZFS_ERR_EXPORT_IN_PROGRESS) {
+ fatal(0, "spa_destroy(%s) returned unexpected value %d",
+ spa->spa_name, error);
+ }
+ spa_close(spa, FTAG);
+
+ (void) pthread_rwlock_unlock(&ztest_name_lock);
+}
+
+/*
+ * Start and then stop the MMP threads to ensure the startup and shutdown code
+ * works properly. Actual protection and property-related code is tested via
+ * ZTS.
+ */
+/* ARGSUSED */
+void
+ztest_mmp_enable_disable(ztest_ds_t *zd, uint64_t id)
+{
+ ztest_shared_opts_t *zo = &ztest_opts;
+ spa_t *spa = ztest_spa;
+
+ if (zo->zo_mmp_test)
+ return;
+
+ /*
+ * Since enabling MMP involves setting a property, it cannot be done
+ * while the pool is suspended.
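+ * (Editorial note: property changes are pushed through a synced
+ * txg, and the txg_wait_synced() calls below would block
+ * indefinitely on a suspended pool, so the test simply bails
+ * out in that case.)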
+ */ + if (spa_suspended(spa)) + return; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&spa->spa_props_lock); + + zfs_multihost_fail_intervals = 0; + + if (!spa_multihost(spa)) { + spa->spa_multihost = B_TRUE; + mmp_thread_start(spa); + } + + mutex_exit(&spa->spa_props_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); + mmp_signal_all_threads(); + txg_wait_synced(spa_get_dsl(spa), 0); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&spa->spa_props_lock); + + if (spa_multihost(spa)) { + mmp_thread_stop(spa); + spa->spa_multihost = B_FALSE; + } + + mutex_exit(&spa->spa_props_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +/* ARGSUSED */ +void +ztest_spa_upgrade(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa; + uint64_t initial_version = SPA_VERSION_INITIAL; + uint64_t version, newversion; + nvlist_t *nvroot, *props; + char *name; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + name = kmem_asprintf("%s_upgrade", ztest_opts.zo_pool); + + /* + * Clean up from previous runs. + */ + (void) spa_destroy(name); + + nvroot = make_vdev_root(NULL, NULL, name, ztest_opts.zo_vdev_size, 0, + NULL, ztest_opts.zo_raidz, ztest_opts.zo_mirrors, 1); + + /* + * If we're configuring a RAIDZ device then make sure that the + * initial version is capable of supporting that feature. + */ + switch (ztest_opts.zo_raidz_parity) { + case 0: + case 1: + initial_version = SPA_VERSION_INITIAL; + break; + case 2: + initial_version = SPA_VERSION_RAIDZ2; + break; + case 3: + initial_version = SPA_VERSION_RAIDZ3; + break; + } + + /* + * Create a pool with a spa version that can be upgraded. Pick + * a value between initial_version and SPA_VERSION_BEFORE_FEATURES. + */ + do { + version = ztest_random_spa_version(initial_version); + } while (version > SPA_VERSION_BEFORE_FEATURES); + + props = fnvlist_alloc(); + fnvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), version); + VERIFY3S(spa_create(name, nvroot, props, NULL, NULL), ==, 0); + fnvlist_free(nvroot); + fnvlist_free(props); + + VERIFY3S(spa_open(name, &spa, FTAG), ==, 0); + VERIFY3U(spa_version(spa), ==, version); + newversion = ztest_random_spa_version(version + 1); + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("upgrading spa version from %llu to %llu\n", + (u_longlong_t)version, (u_longlong_t)newversion); + } + + spa_upgrade(spa, newversion); + VERIFY3U(spa_version(spa), >, version); + VERIFY3U(spa_version(spa), ==, fnvlist_lookup_uint64(spa->spa_config, + zpool_prop_to_name(ZPOOL_PROP_VERSION))); + spa_close(spa, FTAG); + + kmem_strfree(name); + mutex_exit(&ztest_vdev_lock); +} + +static void +ztest_spa_checkpoint(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); + + int error = spa_checkpoint(spa->spa_name); + + switch (error) { + case 0: + case ZFS_ERR_DEVRM_IN_PROGRESS: + case ZFS_ERR_DISCARDING_CHECKPOINT: + case ZFS_ERR_CHECKPOINT_EXISTS: + break; + case ENOSPC: + ztest_record_enospc(FTAG); + break; + default: + fatal(0, "spa_checkpoint(%s) = %d", spa->spa_name, error); + } +} + +static void +ztest_spa_discard_checkpoint(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&ztest_checkpoint_lock)); + + int error = spa_checkpoint_discard(spa->spa_name); + + switch (error) { + case 0: + case ZFS_ERR_DISCARDING_CHECKPOINT: + case ZFS_ERR_NO_CHECKPOINT: + break; + default: + fatal(0, "spa_discard_checkpoint(%s) = %d", + spa->spa_name, error); + } + +} + +/* ARGSUSED */ +void +ztest_spa_checkpoint_create_discard(ztest_ds_t *zd, 
uint64_t id) +{ + spa_t *spa = ztest_spa; + + mutex_enter(&ztest_checkpoint_lock); + if (ztest_random(2) == 0) { + ztest_spa_checkpoint(spa); + } else { + ztest_spa_discard_checkpoint(spa); + } + mutex_exit(&ztest_checkpoint_lock); +} + + +static vdev_t * +vdev_lookup_by_path(vdev_t *vd, const char *path) +{ + vdev_t *mvd; + int c; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + + for (c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); +} + +static int +spa_num_top_vdevs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + ASSERT3U(spa_config_held(spa, SCL_VDEV, RW_READER), ==, SCL_VDEV); + return (rvd->vdev_children); +} + +/* + * Verify that vdev_add() works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves; + uint64_t guid; + nvlist_t *nvroot; + int error; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + + /* + * If we have slogs then remove them 1/4 of the time. + */ + if (spa_has_slogs(spa) && ztest_random(4) == 0) { + metaslab_group_t *mg; + + /* + * find the first real slog in log allocation class + */ + mg = spa_log_class(spa)->mc_rotor; + while (!mg->mg_vd->vdev_islog) + mg = mg->mg_next; + + guid = mg->mg_vd->vdev_guid; + + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between removing a slog (dmu_objset_find) + * and destroying a dataset. Removing the slog will + * grab a reference on the dataset which may cause + * dsl_destroy_head() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + pthread_rwlock_wrlock(&ztest_name_lock); + error = spa_vdev_remove(spa, guid, B_FALSE); + pthread_rwlock_unlock(&ztest_name_lock); + + switch (error) { + case 0: + case EEXIST: /* Generic zil_reset() error */ + case EBUSY: /* Replay required */ + case EACCES: /* Crypto key not loaded */ + case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_DISCARDING_CHECKPOINT: + break; + default: + fatal(0, "spa_vdev_remove() = %d", error); + } + } else { + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * Make 1/4 of the devices be log devices + */ + nvroot = make_vdev_root(NULL, NULL, NULL, + ztest_opts.zo_vdev_size, 0, (ztest_random(4) == 0) ? + "log" : NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + switch (error) { + case 0: + break; + case ENOSPC: + ztest_record_enospc("spa_vdev_add"); + break; + default: + fatal(0, "spa_vdev_add() = %d", error); + } + } + + mutex_exit(&ztest_vdev_lock); +} + +/* ARGSUSED */ +void +ztest_vdev_class_add(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + uint64_t leaves; + nvlist_t *nvroot; + const char *class = (ztest_random(2) == 0) ? 
+ VDEV_ALLOC_BIAS_SPECIAL : VDEV_ALLOC_BIAS_DEDUP; + int error; + + /* + * By default add a special vdev 50% of the time + */ + if ((ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_OFF) || + (ztest_opts.zo_special_vdevs == ZTEST_VDEV_CLASS_RND && + ztest_random(2) == 0)) { + return; + } + + mutex_enter(&ztest_vdev_lock); + + /* Only test with mirrors */ + if (zs->zs_mirrors < 2) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* requires feature@allocation_classes */ + if (!spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)) { + mutex_exit(&ztest_vdev_lock); + return; + } + + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_shared->zs_vdev_next_leaf = spa_num_top_vdevs(spa) * leaves; + spa_config_exit(spa, SCL_VDEV, FTAG); + + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + class, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + + /* + * 50% of the time allow small blocks in the special class + */ + if (error == 0 && + spa_special_class(spa)->mc_groups == 1 && ztest_random(2) == 0) { + if (ztest_opts.zo_verbose >= 3) + (void) printf("Enabling special VDEV small blocks\n"); + (void) ztest_dsl_prop_set_uint64(zd->zd_name, + ZFS_PROP_SPECIAL_SMALL_BLOCKS, 32768, B_FALSE); + } + + mutex_exit(&ztest_vdev_lock); + + if (ztest_opts.zo_verbose >= 3) { + metaslab_class_t *mc; + + if (strcmp(class, VDEV_ALLOC_BIAS_SPECIAL) == 0) + mc = spa_special_class(spa); + else + mc = spa_dedup_class(spa); + (void) printf("Added a %s mirrored vdev (of %d)\n", + class, (int)mc->mc_groups); + } +} + +/* + * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + vdev_t *rvd = spa->spa_root_vdev; + spa_aux_vdev_t *sav; + char *aux; + char *path; + uint64_t guid = 0; + int error; + + if (ztest_opts.zo_mmp_test) + return; + + path = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + if (ztest_random(2) == 0) { + sav = &spa->spa_spares; + aux = ZPOOL_CONFIG_SPARES; + } else { + sav = &spa->spa_l2cache; + aux = ZPOOL_CONFIG_L2CACHE; + } + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + if (sav->sav_count != 0 && ztest_random(4) == 0) { + /* + * Pick a random device to remove. + */ + guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + } else { + /* + * Find an unused device we can add. + */ + zs->zs_vdev_aux = 0; + for (;;) { + int c; + (void) snprintf(path, MAXPATHLEN, ztest_aux_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, aux, + zs->zs_vdev_aux); + for (c = 0; c < sav->sav_count; c++) + if (strcmp(sav->sav_vdevs[c]->vdev_path, + path) == 0) + break; + if (c == sav->sav_count && + vdev_lookup_by_path(rvd, path) == NULL) + break; + zs->zs_vdev_aux++; + } + } + + spa_config_exit(spa, SCL_VDEV, FTAG); + + if (guid == 0) { + /* + * Add a new device. + */ + nvlist_t *nvroot = make_vdev_root(NULL, aux, NULL, + (ztest_opts.zo_vdev_size * 5) / 4, 0, NULL, 0, 0, 1); + error = spa_vdev_add(spa, nvroot); + + switch (error) { + case 0: + break; + default: + fatal(0, "spa_vdev_add(%p) = %d", nvroot, error); + } + nvlist_free(nvroot); + } else { + /* + * Remove an existing device. 
Sometimes, dirty its + * vdev state first to make sure we handle removal + * of devices that have pending state changes. + */ + if (ztest_random(2) == 0) + (void) vdev_online(spa, guid, 0, NULL); + + error = spa_vdev_remove(spa, guid, B_FALSE); + + switch (error) { + case 0: + case EBUSY: + case ZFS_ERR_CHECKPOINT_EXISTS: + case ZFS_ERR_DISCARDING_CHECKPOINT: + break; + default: + fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); + } + } + + mutex_exit(&ztest_vdev_lock); + + umem_free(path, MAXPATHLEN); +} + +/* + * split a pool if it has mirror tlvdevs + */ +/* ARGSUSED */ +void +ztest_split_pool(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *tree, **child, *config, *split, **schild; + uint_t c, children, schildren = 0, lastlogid = 0; + int error = 0; + + if (ztest_opts.zo_mmp_test) + return; + + mutex_enter(&ztest_vdev_lock); + + /* ensure we have a usable config; mirrors of raidz aren't supported */ + if (zs->zs_mirrors < 3 || ztest_opts.zo_raidz > 1) { + mutex_exit(&ztest_vdev_lock); + return; + } + + /* clean up the old pool, if any */ + (void) spa_destroy("splitp"); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* generate a config from the existing config */ + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, + &tree) == 0); + mutex_exit(&spa->spa_props_lock); + + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0); + + schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + nvlist_t **mchild; + uint_t mchildren; + + if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { + VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, + 0) == 0); + VERIFY(nvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1) == 0); + if (lastlogid == 0) + lastlogid = schildren; + ++schildren; + continue; + } + lastlogid = 0; + VERIFY(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + } + + /* OK, create a config that can be used to split */ + VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? lastlogid : schildren) == 0); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + + for (c = 0; c < schildren; c++) + nvlist_free(schild[c]); + free(schild); + nvlist_free(split); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) pthread_rwlock_wrlock(&ztest_name_lock); + error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + nvlist_free(config); + + if (error == 0) { + (void) printf("successful split - results:\n"); + mutex_enter(&spa_namespace_lock); + show_pool_stats(spa); + show_pool_stats(spa_lookup("splitp")); + mutex_exit(&spa_namespace_lock); + ++zs->zs_splits; + --zs->zs_mirrors; + } + mutex_exit(&ztest_vdev_lock); +} + +/* + * Verify that we can attach and detach devices. 
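+ *
+ * (Editorial summary of the expected-error matrix exercised
+ * below, not part of the original change:
+ *
+ *	ENOTSUP    attach to a non-mirror/non-root parent, or a
+ *	           spare used where that is not allowed
+ *	EBUSY      newvd is already an active part of the pool
+ *	EOVERFLOW  newvd is smaller than oldvd
+ *	EDOM       ashift exceeds the top-level vdev's ashift
+ *
+ * plus 0 for the plain success paths.)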
+ */ +/* ARGSUSED */ +void +ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + spa_aux_vdev_t *sav = &spa->spa_spares; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *pvd; + nvlist_t *root; + uint64_t leaves; + uint64_t leaf, top; + uint64_t ashift = ztest_get_ashift(); + uint64_t oldguid, pguid; + uint64_t oldsize, newsize; + char *oldpath, *newpath; + int replacing; + int oldvd_has_siblings = B_FALSE; + int newvd_is_spare = B_FALSE; + int oldvd_is_log; + int error, expected_error; + + if (ztest_opts.zo_mmp_test) + return; + + oldpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + newpath = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + mutex_enter(&ztest_vdev_lock); + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * If a vdev is in the process of being removed, its removal may + * finish while we are in progress, leading to an unexpected error + * value. Don't bother trying to attach while we are in the middle + * of removal. + */ + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * Decide whether to do an attach or a replace. + */ + replacing = ztest_random(2); + + /* + * Pick a random top-level vdev. + */ + top = ztest_random_vdev_top(spa, B_TRUE); + + /* + * Pick a random leaf within it. + */ + leaf = ztest_random(leaves); + + /* + * Locate this vdev. + */ + oldvd = rvd->vdev_child[top]; + + /* pick a child from the mirror */ + if (zs->zs_mirrors >= 1) { + ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); + ASSERT(oldvd->vdev_children >= zs->zs_mirrors); + oldvd = oldvd->vdev_child[leaf / ztest_opts.zo_raidz]; + } + + /* pick a child out of the raidz group */ + if (ztest_opts.zo_raidz > 1) { + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); + ASSERT(oldvd->vdev_children == ztest_opts.zo_raidz); + oldvd = oldvd->vdev_child[leaf % ztest_opts.zo_raidz]; + } + + /* + * If we're already doing an attach or replace, oldvd may be a + * mirror vdev -- in which case, pick a random child. + */ + while (oldvd->vdev_children != 0) { + oldvd_has_siblings = B_TRUE; + ASSERT(oldvd->vdev_children >= 2); + oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; + } + + oldguid = oldvd->vdev_guid; + oldsize = vdev_get_min_asize(oldvd); + oldvd_is_log = oldvd->vdev_top->vdev_islog; + (void) strcpy(oldpath, oldvd->vdev_path); + pvd = oldvd->vdev_parent; + pguid = pvd->vdev_guid; + + /* + * If oldvd has siblings, then half of the time, detach it. Prior + * to the detach the pool is scrubbed in order to prevent creating + * unrepairable blocks as a result of the data corruption injection. + */ + if (oldvd_has_siblings && ztest_random(2) == 0) { + spa_config_exit(spa, SCL_ALL, FTAG); + + error = ztest_scrub_impl(spa); + if (error) + goto out; + + error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); + if (error != 0 && error != ENODEV && error != EBUSY && + error != ENOTSUP && error != ZFS_ERR_CHECKPOINT_EXISTS && + error != ZFS_ERR_DISCARDING_CHECKPOINT) + fatal(0, "detach (%s) returned %d", oldpath, error); + goto out; + } + + /* + * For the new vdev, choose with equal probability between the two + * standard paths (ending in either 'a' or 'b') or a random hot spare. 
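+ *
+ * (Editorial note: concretely, an existing spare is chosen 1/3
+ * of the time when any exist; otherwise the generated path keeps
+ * its trailing 'a' or has it flipped to 'b', 1/2 each.)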
+ */ + if (sav->sav_count != 0 && ztest_random(3) == 0) { + newvd = sav->sav_vdevs[ztest_random(sav->sav_count)]; + newvd_is_spare = B_TRUE; + (void) strcpy(newpath, newvd->vdev_path); + } else { + (void) snprintf(newpath, MAXPATHLEN, ztest_dev_template, + ztest_opts.zo_dir, ztest_opts.zo_pool, + top * leaves + leaf); + if (ztest_random(2) == 0) + newpath[strlen(newpath) - 1] = 'b'; + newvd = vdev_lookup_by_path(rvd, newpath); + } + + if (newvd) { + /* + * Reopen to ensure the vdev's asize field isn't stale. + */ + vdev_reopen(newvd); + newsize = vdev_get_min_asize(newvd); + } else { + /* + * Make newsize a little bigger or smaller than oldsize. + * If it's smaller, the attach should fail. + * If it's larger, and we're doing a replace, + * we should get dynamic LUN growth when we're done. + */ + newsize = 10 * oldsize / (9 + ztest_random(3)); + } + + /* + * If pvd is not a mirror or root, the attach should fail with ENOTSUP, + * unless it's a replace; in that case any non-replacing parent is OK. + * + * If newvd is already part of the pool, it should fail with EBUSY. + * + * If newvd is too small, it should fail with EOVERFLOW. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops && (!replacing || + pvd->vdev_ops == &vdev_replacing_ops || + pvd->vdev_ops == &vdev_spare_ops)) + expected_error = ENOTSUP; + else if (newvd_is_spare && (!replacing || oldvd_is_log)) + expected_error = ENOTSUP; + else if (newvd == oldvd) + expected_error = replacing ? 0 : EBUSY; + else if (vdev_lookup_by_path(rvd, newpath) != NULL) + expected_error = EBUSY; + else if (newsize < oldsize) + expected_error = EOVERFLOW; + else if (ashift > oldvd->vdev_top->vdev_ashift) + expected_error = EDOM; + else + expected_error = 0; + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Build the nvlist describing newpath. + */ + root = make_vdev_root(newpath, NULL, NULL, newvd == NULL ? newsize : 0, + ashift, NULL, 0, 0, 1); + + /* + * When supported select either a healing or sequential resilver. + */ + boolean_t rebuilding = B_FALSE; + if (pvd->vdev_ops == &vdev_mirror_ops || + pvd->vdev_ops == &vdev_root_ops) { + rebuilding = !!ztest_random(2); + } + + error = spa_vdev_attach(spa, oldguid, root, replacing, rebuilding); + + nvlist_free(root); + + /* + * If our parent was the replacing vdev, but the replace completed, + * then instead of failing with ENOTSUP we may either succeed, + * fail with ENODEV, or fail with EOVERFLOW. + */ + if (expected_error == ENOTSUP && + (error == 0 || error == ENODEV || error == EOVERFLOW)) + expected_error = error; + + /* + * If someone grew the LUN, the replacement may be too small. 
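+ * (Concurrent ztest activity can likewise make the new device
+ * busy, so EBUSY is accepted as well -- editorial note.)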
+ */
+ if (error == EOVERFLOW || error == EBUSY)
+ expected_error = error;
+
+ if (error == ZFS_ERR_CHECKPOINT_EXISTS ||
+ error == ZFS_ERR_DISCARDING_CHECKPOINT ||
+ error == ZFS_ERR_RESILVER_IN_PROGRESS ||
+ error == ZFS_ERR_REBUILD_IN_PROGRESS)
+ expected_error = error;
+
+ if (error != expected_error && expected_error != EBUSY) {
+ fatal(0, "attach (%s %llu, %s %llu, %d) "
+ "returned %d, expected %d",
+ oldpath, oldsize, newpath,
+ newsize, replacing, error, expected_error);
+ }
+out:
+ mutex_exit(&ztest_vdev_lock);
+
+ umem_free(oldpath, MAXPATHLEN);
+ umem_free(newpath, MAXPATHLEN);
+}
+
+/* ARGSUSED */
+void
+ztest_device_removal(ztest_ds_t *zd, uint64_t id)
+{
+ spa_t *spa = ztest_spa;
+ vdev_t *vd;
+ uint64_t guid;
+ int error;
+
+ mutex_enter(&ztest_vdev_lock);
+
+ if (ztest_device_removal_active) {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * Remove a random top-level vdev and wait for removal to finish.
+ */
+ spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+ vd = vdev_lookup_top(spa, ztest_random_vdev_top(spa, B_FALSE));
+ guid = vd->vdev_guid;
+ spa_config_exit(spa, SCL_VDEV, FTAG);
+
+ error = spa_vdev_remove(spa, guid, B_FALSE);
+ if (error == 0) {
+ ztest_device_removal_active = B_TRUE;
+ mutex_exit(&ztest_vdev_lock);
+
+ /*
+ * spa->spa_vdev_removal is created in a sync task that
+ * is initiated via dsl_sync_task_nowait(). Since the
+ * task may not run before spa_vdev_remove() returns, we
+ * must wait at least 1 txg to ensure that the removal
+ * struct has been created.
+ */
+ txg_wait_synced(spa_get_dsl(spa), 0);
+
+ while (spa->spa_removing_phys.sr_state == DSS_SCANNING)
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ } else {
+ mutex_exit(&ztest_vdev_lock);
+ return;
+ }
+
+ /*
+ * The pool needs to be scrubbed after completing device removal.
+ * Failure to do so may result in checksum errors due to the
+ * strategy employed by ztest_fault_inject() when selecting which
+ * offsets are redundant and can be damaged.
+ */
+ error = spa_scan(spa, POOL_SCAN_SCRUB);
+ if (error == 0) {
+ while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+ txg_wait_synced(spa_get_dsl(spa), 0);
+ }
+
+ mutex_enter(&ztest_vdev_lock);
+ ztest_device_removal_active = B_FALSE;
+ mutex_exit(&ztest_vdev_lock);
+}
+
+/*
+ * Callback function which expands the physical size of the vdev.
+ */
+static vdev_t *
+grow_vdev(vdev_t *vd, void *arg)
+{
+ spa_t *spa __maybe_unused = vd->vdev_spa;
+ size_t *newsize = arg;
+ size_t fsize;
+ int fd;
+
+ ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE);
+ ASSERT(vd->vdev_ops->vdev_op_leaf);
+
+ if ((fd = open(vd->vdev_path, O_RDWR)) == -1)
+ return (vd);
+
+ fsize = lseek(fd, 0, SEEK_END);
+ VERIFY(ftruncate(fd, *newsize) == 0);
+
+ if (ztest_opts.zo_verbose >= 6) {
+ (void) printf("%s grew from %lu to %lu bytes\n",
+ vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize);
+ }
+ (void) close(fd);
+ return (NULL);
+}
+
+/*
+ * Callback function which expands a given vdev by calling vdev_online().
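+ *
+ * (Editorial note on the vdev_walk_tree() callback contract used
+ * by this function and by grow_vdev() above: return NULL to keep
+ * walking, or return the vdev to abort the walk and report
+ * failure, e.g.
+ *
+ *	if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL)
+ *		-- expansion failed, leave the vdev alone
+ *
+ * as done in ztest_vdev_LUN_growth() below.)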
+ */ +/* ARGSUSED */ +static vdev_t * +online_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint64_t guid = vd->vdev_guid; + uint64_t generation = spa->spa_config_generation + 1; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + int error; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* Calling vdev_online will initialize the new metaslabs */ + spa_config_exit(spa, SCL_STATE, spa); + error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * If vdev_online returned an error or the underlying vdev_open + * failed then we abort the expand. The only way to know that + * vdev_open failed is by checking the returned newstate. + */ + if (error || newstate != VDEV_STATE_HEALTHY) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("Unable to expand vdev, state %llu, " + "error %d\n", (u_longlong_t)newstate, error); + } + return (vd); + } + ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); + + /* + * Since we dropped the lock, we need to ensure that we're + * still talking to the original vdev. It's possible this + * vdev may have been detached/replaced while we were + * trying to online it. + */ + if (generation != spa->spa_config_generation) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("vdev configuration has changed, " + "guid %llu, state %llu, expected gen %llu, " + "got gen %llu\n", + (u_longlong_t)guid, + (u_longlong_t)tvd->vdev_state, + (u_longlong_t)generation, + (u_longlong_t)spa->spa_config_generation); + } + return (vd); + } + return (NULL); +} + +/* + * Traverse the vdev tree, calling the supplied function. + * We continue to walk the tree until we either have walked all + * children or we receive a non-NULL return from the callback. + * If a NULL callback is passed, then we just return the first + * leaf vdev we encounter. + */ +static vdev_t * +vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) +{ + uint_t c; + + if (vd->vdev_ops->vdev_op_leaf) { + if (func == NULL) + return (vd); + else + return (func(vd, arg)); + } + + for (c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) + return (cvd); + } + return (NULL); +} + +/* + * Verify that dynamic LUN growth works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + vdev_t *vd, *tvd; + metaslab_class_t *mc; + metaslab_group_t *mg; + size_t psize, newsize; + uint64_t top; + uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; + + mutex_enter(&ztest_checkpoint_lock); + mutex_enter(&ztest_vdev_lock); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * If there is a vdev removal in progress, it could complete while + * we are running, in which case we would not be able to verify + * that the metaslab_class space increased (because it decreases + * when the device removal completes). + */ + if (ztest_device_removal_active) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + + top = ztest_random_vdev_top(spa, B_TRUE); + + tvd = spa->spa_root_vdev->vdev_child[top]; + mg = tvd->vdev_mg; + mc = mg->mg_class; + old_ms_count = tvd->vdev_ms_count; + old_class_space = metaslab_class_get_space(mc); + + /* + * Determine the size of the first leaf vdev associated with + * our top-level device.
+ */ + vd = vdev_walk_tree(tvd, NULL, NULL); + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + psize = vd->vdev_psize; + + /* + * We only try to expand the vdev if it's healthy, less than 4x its + * original size, and it has a valid psize. + */ + if (tvd->vdev_state != VDEV_STATE_HEALTHY || + psize == 0 || psize >= 4 * ztest_opts.zo_vdev_size) { + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + ASSERT(psize > 0); + newsize = psize + MAX(psize / 8, SPA_MAXBLOCKSIZE); + ASSERT3U(newsize, >, psize); + + if (ztest_opts.zo_verbose >= 6) { + (void) printf("Expanding LUN %s from %lu to %lu\n", + vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); + } + + /* + * Growing the vdev is a two-step process: + * 1) expand the physical size (i.e. relabel) + * 2) online the vdev to create the new metaslabs + */ + if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || + vdev_walk_tree(tvd, online_vdev, NULL) != NULL || + tvd->vdev_state != VDEV_STATE_HEALTHY) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("Could not expand LUN because " + "the vdev configuration changed.\n"); + } + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + + spa_config_exit(spa, SCL_STATE, spa); + + /* + * Expanding the LUN will update the config asynchronously, + * so we must wait for the async thread to complete any + * pending tasks before proceeding. + */ + for (;;) { + boolean_t done; + mutex_enter(&spa->spa_async_lock); + done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); + mutex_exit(&spa->spa_async_lock); + if (done) + break; + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); + } + + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + tvd = spa->spa_root_vdev->vdev_child[top]; + new_ms_count = tvd->vdev_ms_count; + new_class_space = metaslab_class_get_space(mc); + + if (tvd->vdev_mg != mg || mg->mg_class != mc) { + if (ztest_opts.zo_verbose >= 5) { + (void) printf("Could not verify LUN expansion due to " + "intervening vdev offline or remove.\n"); + } + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); + return; + } + + /* + * Make sure we were able to grow the vdev. + */ + if (new_ms_count <= old_ms_count) { + fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n", + new_ms_count, old_ms_count); + } + + /* + * Make sure we were able to grow the pool. + */ + if (new_class_space <= old_class_space) { + fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", + new_class_space, old_class_space); + } + + if (ztest_opts.zo_verbose >= 5) { + char oldnumbuf[NN_NUMBUF_SZ], newnumbuf[NN_NUMBUF_SZ]; + + nicenum(old_class_space, oldnumbuf, sizeof (oldnumbuf)); + nicenum(new_class_space, newnumbuf, sizeof (newnumbuf)); + (void) printf("%s grew from %s to %s\n", + spa->spa_name, oldnumbuf, newnumbuf); + } + + spa_config_exit(spa, SCL_STATE, spa); + mutex_exit(&ztest_vdev_lock); + mutex_exit(&ztest_checkpoint_lock); +} + +/* + * Verify that dmu_objset_{create,destroy,open,close} work as expected. + */ +/* ARGSUSED */ +static void +ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + /* + * Create the objects common to all ztest datasets.
+ */ + VERIFY(zap_create_claim(os, ZTEST_DIROBJ, + DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); +} + +static int +ztest_dataset_create(char *dsname) +{ + int err; + uint64_t rand; + dsl_crypto_params_t *dcp = NULL; + + /* + * 50% of the time, we create encrypted datasets + * using a random cipher suite and a hard-coded + * wrapping key. + */ + rand = ztest_random(2); + if (rand != 0) { + nvlist_t *crypto_args = fnvlist_alloc(); + nvlist_t *props = fnvlist_alloc(); + + /* slight bias towards the default cipher suite */ + rand = ztest_random(ZIO_CRYPT_FUNCTIONS); + if (rand < ZIO_CRYPT_AES_128_CCM) + rand = ZIO_CRYPT_ON; + + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), rand); + fnvlist_add_uint8_array(crypto_args, "wkeydata", + (uint8_t *)ztest_wkeydata, WRAPPING_KEY_LEN); + + /* + * These parameters aren't really used by the kernel. They + * are simply stored so that userspace knows how to load + * the wrapping key. + */ + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), ZFS_KEYFORMAT_RAW); + fnvlist_add_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), "prompt"); + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 0ULL); + fnvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 0ULL); + + VERIFY0(dsl_crypto_params_create_nvlist(DCP_CMD_NONE, props, + crypto_args, &dcp)); + + /* + * Cycle through all available encryption implementations + * to verify interoperability. + */ + VERIFY0(gcm_impl_set("cycle")); + VERIFY0(aes_impl_set("cycle")); + + fnvlist_free(crypto_args); + fnvlist_free(props); + } + + err = dmu_objset_create(dsname, DMU_OST_OTHER, 0, dcp, + ztest_objset_create_cb, NULL); + dsl_crypto_params_free(dcp, !!err); + + rand = ztest_random(100); + if (err || rand < 80) + return (err); + + if (ztest_opts.zo_verbose >= 5) + (void) printf("Setting dataset %s to sync always\n", dsname); + return (ztest_dsl_prop_set_uint64(dsname, ZFS_PROP_SYNC, + ZFS_SYNC_ALWAYS, B_FALSE)); +} + +/* ARGSUSED */ +static int +ztest_objset_destroy_cb(const char *name, void *arg) +{ + objset_t *os; + dmu_object_info_t doi; + int error; + + /* + * Verify that the dataset contains a directory object. + */ + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, + B_TRUE, FTAG, &os)); + error = dmu_object_info(os, ZTEST_DIROBJ, &doi); + if (error != ENOENT) { + /* We could have crashed in the middle of destroying it */ + ASSERT0(error); + ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); + ASSERT3S(doi.doi_physical_blocks_512, >=, 0); + } + dmu_objset_disown(os, B_TRUE, FTAG); + + /* + * Destroy the dataset. 
+ */ + if (strchr(name, '@') != NULL) { + VERIFY0(dsl_destroy_snapshot(name, B_TRUE)); + } else { + error = dsl_destroy_head(name); + if (error == ENOSPC) { + /* There could be a checkpoint or insufficient slop */ + ztest_record_enospc(FTAG); + } else if (error != EBUSY) { + /* There could be a hold on this dataset */ + ASSERT0(error); + } + } + return (0); +} + +static boolean_t +ztest_snapshot_create(char *osname, uint64_t id) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + (void) snprintf(snapname, sizeof (snapname), "%llu", (u_longlong_t)id); + + error = dmu_objset_snapshot_one(osname, snapname); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (B_FALSE); + } + if (error != 0 && error != EEXIST) { + fatal(0, "ztest_snapshot_create(%s@%s) = %d", osname, + snapname, error); + } + return (B_TRUE); +} + +static boolean_t +ztest_snapshot_destroy(char *osname, uint64_t id) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + (void) snprintf(snapname, sizeof (snapname), "%s@%llu", osname, + (u_longlong_t)id); + + error = dsl_destroy_snapshot(snapname, B_FALSE); + if (error != 0 && error != ENOENT) + fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); + return (B_TRUE); +} + +/* ARGSUSED */ +void +ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + ztest_ds_t *zdtmp; + int iters; + int error; + objset_t *os, *os2; + char name[ZFS_MAX_DATASET_NAME_LEN]; + zilog_t *zilog; + int i; + + zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + (void) snprintf(name, sizeof (name), "%s/temp_%llu", + ztest_opts.zo_pool, (u_longlong_t)id); + + /* + * If this dataset exists from a previous run, process its replay log + * half of the time. If we don't replay it, then dsl_destroy_head() + * (invoked from ztest_objset_destroy_cb()) should just throw it away. + */ + if (ztest_random(2) == 0 && + ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, + B_TRUE, FTAG, &os) == 0) { + ztest_zd_init(zdtmp, NULL, os); + zil_replay(os, zdtmp, ztest_replay_vector); + ztest_zd_fini(zdtmp); + dmu_objset_disown(os, B_TRUE, FTAG); + } + + /* + * There may be an old instance of the dataset we're about to + * create lying around from a previous run. If so, destroy it + * and all of its snapshots. + */ + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + + /* + * Verify that the destroyed dataset is no longer in the namespace. + */ + VERIFY3U(ENOENT, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, B_TRUE, + B_TRUE, FTAG, &os)); + + /* + * Verify that we can create a new dataset. + */ + error = ztest_dataset_create(name); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_create(%s) = %d", name, error); + } + + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, B_TRUE, + FTAG, &os)); + + ztest_zd_init(zdtmp, NULL, os); + + /* + * Open the intent log for it. + */ + zilog = zil_open(os, ztest_get_data); + + /* + * Put some objects in there, do a little I/O to them, + * and randomly take a couple of snapshots along the way. + */ + iters = ztest_random(5); + for (i = 0; i < iters; i++) { + ztest_dmu_object_alloc_free(zdtmp, id); + if (ztest_random(iters) == 0) + (void) ztest_snapshot_create(name, i); + } + + /* + * Verify that we cannot create an existing dataset.
+ */ + VERIFY3U(EEXIST, ==, + dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL, NULL)); + + /* + * Verify that we can hold an objset that is also owned. + */ + VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); + dmu_objset_rele(os2, FTAG); + + /* + * Verify that we cannot own an objset that is already owned. + */ + VERIFY3U(EBUSY, ==, ztest_dmu_objset_own(name, DMU_OST_OTHER, + B_FALSE, B_TRUE, FTAG, &os2)); + + zil_close(zilog); + dmu_objset_disown(os, B_TRUE, FTAG); + ztest_zd_fini(zdtmp); +out: + (void) pthread_rwlock_unlock(&ztest_name_lock); + + umem_free(zdtmp, sizeof (ztest_ds_t)); +} + +/* + * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. + */ +void +ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + (void) pthread_rwlock_rdlock(&ztest_name_lock); + (void) ztest_snapshot_destroy(zd->zd_name, id); + (void) ztest_snapshot_create(zd->zd_name, id); + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* + * Clean up non-standard snapshots and clones. + */ +static void +ztest_dsl_dataset_cleanup(char *osname, uint64_t id) +{ + char *snap1name; + char *clone1name; + char *snap2name; + char *clone2name; + char *snap3name; + int error; + + snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + + (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s1_%llu", osname, (u_longlong_t)id); + (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c1_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s2_%llu", clone1name, (u_longlong_t)id); + (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c2_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s3_%llu", clone1name, (u_longlong_t)id); + + error = dsl_destroy_head(clone2name); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_head(%s) = %d", clone2name, error); + error = dsl_destroy_snapshot(snap3name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap3name, error); + error = dsl_destroy_snapshot(snap2name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap2name, error); + error = dsl_destroy_head(clone1name); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_head(%s) = %d", clone1name, error); + error = dsl_destroy_snapshot(snap1name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dsl_destroy_snapshot(%s) = %d", snap1name, error); + + umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); +} + +/* + * Verify dsl_dataset_promote handles EBUSY. + */ +void +ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os; + char *snap1name; + char *clone1name; + char *snap2name; + char *clone2name; + char *snap3name; + char *osname = zd->zd_name; + int error; + + snap1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone1name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + snap2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + clone2name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); +
snap3name = umem_alloc(ZFS_MAX_DATASET_NAME_LEN, UMEM_NOFAIL); + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + ztest_dsl_dataset_cleanup(osname, id); + + (void) snprintf(snap1name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s1_%llu", osname, (u_longlong_t)id); + (void) snprintf(clone1name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c1_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap2name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s2_%llu", clone1name, (u_longlong_t)id); + (void) snprintf(clone2name, ZFS_MAX_DATASET_NAME_LEN, + "%s/c2_%llu", osname, (u_longlong_t)id); + (void) snprintf(snap3name, ZFS_MAX_DATASET_NAME_LEN, + "%s@s3_%llu", clone1name, (u_longlong_t)id); + + error = dmu_objset_snapshot_one(osname, strchr(snap1name, '@') + 1); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap1name, error); + } + + error = dmu_objset_clone(clone1name, snap1name); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_clone(%s) = %d", clone1name, error); + } + + error = dmu_objset_snapshot_one(clone1name, strchr(snap2name, '@') + 1); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap2name, error); + } + + error = dmu_objset_snapshot_one(clone1name, strchr(snap3name, '@') + 1); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_snapshot_one(%s) = %d", snap3name, error); + } + + error = dmu_objset_clone(clone2name, snap3name); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; + } + fatal(0, "dmu_objset_clone(%s) = %d", clone2name, error); + } + + error = ztest_dmu_objset_own(snap2name, DMU_OST_ANY, B_TRUE, B_TRUE, + FTAG, &os); + if (error) + fatal(0, "dmu_objset_own(%s) = %d", snap2name, error); + error = dsl_dataset_promote(clone2name, NULL); + if (error == ENOSPC) { + dmu_objset_disown(os, B_TRUE, FTAG); + ztest_record_enospc(FTAG); + goto out; + } + if (error != EBUSY) + fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, + error); + dmu_objset_disown(os, B_TRUE, FTAG); + +out: + ztest_dsl_dataset_cleanup(osname, id); + + (void) pthread_rwlock_unlock(&ztest_name_lock); + + umem_free(snap1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone1name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(clone2name, ZFS_MAX_DATASET_NAME_LEN); + umem_free(snap3name, ZFS_MAX_DATASET_NAME_LEN); +} + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 4 + +/* + * Verify that dmu_object_{alloc,free} work as expected. + */ +void +ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t *od; + int batchsize; + int size; + int b; + + size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(size, UMEM_NOFAIL); + batchsize = OD_ARRAY_SIZE; + + for (b = 0; b < batchsize; b++) + ztest_od_init(od + b, id, FTAG, b, DMU_OT_UINT64_OTHER, + 0, 0, 0); + + /* + * Destroy the previous batch of objects, create a new batch, + * and do some I/O on the new objects. + */ + if (ztest_object_init(zd, od, size, B_TRUE) != 0) { + umem_free(od, size); + return; + } + + while (ztest_random(4 * batchsize) != 0) + ztest_io(zd, od[ztest_random(batchsize)].od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + + umem_free(od, size); +} + +/* + * Rewind the global allocator to verify object allocation backfilling.
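+ * For example, assuming dmu_object_alloc_chunk_shift is at its usual + * default of 7 (128 dnodes per chunk; the default is an assumption + * here, not something this test sets), a random cursor of 1000 snaps + * back to P2ALIGN(1000, 128) = 896, so later allocations must + * backfill the freed range.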
+ */ +void +ztest_dmu_object_next_chunk(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + uint64_t object; + + /* + * Rewind the global allocator randomly back to a lower object number + * to force backfilling and reclamation of recently freed dnodes. + */ + mutex_enter(&os->os_obj_lock); + object = ztest_random(os->os_obj_next_chunk); + os->os_obj_next_chunk = P2ALIGN(object, dnodes_per_chunk); + mutex_exit(&os->os_obj_lock); +} + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 2 + +/* + * Verify that dmu_{read,write} work as expected. + */ +void +ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) +{ + int size; + ztest_od_t *od; + + objset_t *os = zd->zd_os; + size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(size, UMEM_NOFAIL); + dmu_tx_t *tx; + int i, freeit, error; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 40; + int free_percent = 5; + + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is arbitrary. It doesn't have to be a power of two, + * and it doesn't have any relation to the object blocksize. + * The only requirement is that it can hold at least two bufwads. + * + * Normally, we write the bufwad to each of these locations. + * However, free_percent of the time we instead write zeroes to + * packobj and perform a dmu_free_range() on bigobj. By comparing + * bigobj to packobj, we can verify that the DMU is correctly + * tracking which parts of an object are allocated and free, + * and that the contents of the allocated blocks are correct. + */ + + /* + * Read the directory info. If it's the first time, set things up. + */ + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, chunksize); + ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); + + if (ztest_object_init(zd, od, size, B_FALSE) != 0) { + umem_free(od, size); + return; + } + + bigobj = od[0].od_object; + packobj = od[1].od_object; + chunksize = od[0].od_gen; + ASSERT(chunksize == od[1].od_gen); + + /* + * Prefetch a random chunk of the big object. + * Our aim here is to get some async reads in flight + * for blocks that we may free below; the DMU should + * handle this race correctly. + */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(2 * width - 1); + dmu_prefetch(os, bigobj, 0, n * chunksize, s * chunksize, + ZIO_PRIORITY_SYNC_READ); + + /* + * Pick a random index and compute the offsets into packobj and bigobj. 
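+ * + * A worked example with illustrative values, assuming the three-word + * (24-byte) bufwad_t: chunksize = 8000, n = 10 and s = 3 give + * packoff = 240, packsize = 72, bigoff = 80000 and bigsize = 24000 -- + * the same three bufwads per index, dense in packobj and sparse in + * bigobj.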
+ */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); + + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); + + bigoff = n * chunksize; + bigsize = s * chunksize; + + packbuf = umem_alloc(packsize, UMEM_NOFAIL); + bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); + + /* + * free_percent of the time, free a range of bigobj rather than + * overwriting it. + */ + freeit = (ztest_random(100) < free_percent); + + /* + * Read the current contents of our objects. + */ + error = dmu_read(os, packobj, packoff, packsize, packbuf, + DMU_READ_PREFETCH); + ASSERT0(error); + error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, + DMU_READ_PREFETCH); + ASSERT0(error); + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, packobj, packoff, packsize); + + if (freeit) + dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); + else + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + + /* This accounts for setting the checksum/compression. */ + dmu_tx_hold_bonus(tx, bigobj); + + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(od, size); + return; + } + + enum zio_checksum cksum; + do { + cksum = (enum zio_checksum) + ztest_random_dsl_prop(ZFS_PROP_CHECKSUM); + } while (cksum >= ZIO_CHECKSUM_LEGACY_FUNCTIONS); + dmu_object_set_checksum(os, bigobj, cksum, tx); + + enum zio_compress comp; + do { + comp = (enum zio_compress) + ztest_random_dsl_prop(ZFS_PROP_COMPRESSION); + } while (comp >= ZIO_COMPRESS_LEGACY_FUNCTIONS); + dmu_object_set_compress(os, bigobj, comp, tx); + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. + */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + if (freeit) { + bzero(pack, sizeof (bufwad_t)); + } else { + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + } + *bigH = *pack; + *bigT = *pack; + } + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. 
+ */ + dmu_write(os, packobj, packoff, packsize, packbuf, tx); + + if (freeit) { + if (ztest_opts.zo_verbose >= 7) { + (void) printf("freeing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); + } else { + if (ztest_opts.zo_verbose >= 7) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); + } + + dmu_tx_commit(tx); + + /* + * Sanity check the stuff we just wrote. + */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); + + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(od, size); +} + +static void +compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, + uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) +{ + uint64_t i; + bufwad_t *pack; + bufwad_t *bigH; + bufwad_t *bigT; + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. + */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + + *bigH = *pack; + *bigT = *pack; + } +} + +#undef OD_ARRAY_SIZE +#define OD_ARRAY_SIZE 2 + +void +ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + dmu_tx_t *tx; + uint64_t i; + int error; + int size; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t blocksize = ztest_random_blocksize(); + uint64_t chunksize = blocksize; + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 9; + dmu_buf_t *bonus_db; + arc_buf_t **bigbuf_arcbufs; + dmu_object_info_t doi; + + size = sizeof (ztest_od_t) * OD_ARRAY_SIZE; + od = umem_alloc(size, UMEM_NOFAIL); + + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. 
Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is set equal to bigobj block size so that + * dmu_assign_arcbuf_by_dbuf() can be tested for object updates. + */ + + /* + * Read the directory info. If it's the first time, set things up. + */ + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + ztest_od_init(od + 1, id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, 0, + chunksize); + + + if (ztest_object_init(zd, od, size, B_FALSE) != 0) { + umem_free(od, size); + return; + } + + bigobj = od[0].od_object; + packobj = od[1].od_object; + blocksize = od[0].od_blocksize; + chunksize = blocksize; + ASSERT(chunksize == od[1].od_gen); + + VERIFY(dmu_object_info(os, bigobj, &doi) == 0); + VERIFY(ISP2(doi.doi_data_block_size)); + VERIFY(chunksize == doi.doi_data_block_size); + VERIFY(chunksize >= 2 * sizeof (bufwad_t)); + + /* + * Pick a random index and compute the offsets into packobj and bigobj. + */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); + + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); + + bigoff = n * chunksize; + bigsize = s * chunksize; + + packbuf = umem_zalloc(packsize, UMEM_NOFAIL); + bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); + + VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); + + bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); + + /* + * Iteration 0 tests zcopy for DB_UNCACHED dbufs. + * Iteration 1 tests zcopy to already referenced dbufs. + * Iteration 2 tests zcopy to dirty dbuf in the same txg. + * Iteration 3 tests zcopy to dbuf dirty in previous txg. + * Iteration 4 tests zcopy when dbuf is no longer dirty. + * Iteration 5 tests zcopy when it can't be done. + * Iteration 6 does one more zcopy write. + */ + for (i = 0; i < 7; i++) { + uint64_t j; + uint64_t off; + + /* + * In iteration 5 (i == 5) use arcbufs + * that don't match bigobj blksz to test + * dmu_assign_arcbuf_by_dbuf() when it can't directly + * assign an arcbuf to a dbuf. + */ + for (j = 0; j < s; j++) { + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + bigbuf_arcbufs[j] = + dmu_request_arcbuf(bonus_db, chunksize); + } else { + bigbuf_arcbufs[2 * j] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + bigbuf_arcbufs[2 * j + 1] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + } + } + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, packobj, packoff, packsize); + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + for (j = 0; j < s; j++) { + if (i != 5 || + chunksize < (SPA_MINBLOCKSIZE * 2)) { + dmu_return_arcbuf(bigbuf_arcbufs[j]); + } else { + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j]); + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j + 1]); + } + } + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + umem_free(od, size); + dmu_buf_rele(bonus_db, FTAG); + return; + } + + /* + * 50% of the time don't read objects in the 1st iteration to + * test dmu_assign_arcbuf_by_dbuf() for the case when there are + * no existing dbufs for the specified offsets.
+ */ + if (i != 0 || ztest_random(2) != 0) { + error = dmu_read(os, packobj, packoff, + packsize, packbuf, DMU_READ_PREFETCH); + ASSERT0(error); + error = dmu_read(os, bigobj, bigoff, bigsize, + bigbuf, DMU_READ_PREFETCH); + ASSERT0(error); + } + compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, + n, chunksize, txg); + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. + */ + dmu_write(os, packobj, packoff, packsize, packbuf, tx); + if (ztest_opts.zo_verbose >= 7) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + for (off = bigoff, j = 0; j < s; j++, off += chunksize) { + dmu_buf_t *dbt; + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[j]->b_data, chunksize); + } else { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[2 * j]->b_data, + chunksize / 2); + bcopy((caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, + bigbuf_arcbufs[2 * j + 1]->b_data, + chunksize / 2); + } + + if (i == 1) { + VERIFY(dmu_buf_hold(os, bigobj, off, + FTAG, &dbt, DMU_READ_NO_PREFETCH) == 0); + } + if (i != 5 || chunksize < (SPA_MINBLOCKSIZE * 2)) { + VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, + off, bigbuf_arcbufs[j], tx)); + } else { + VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, + off, bigbuf_arcbufs[2 * j], tx)); + VERIFY0(dmu_assign_arcbuf_by_dbuf(bonus_db, + off + chunksize / 2, + bigbuf_arcbufs[2 * j + 1], tx)); + } + if (i == 1) { + dmu_buf_rele(dbt, FTAG); + } + } + dmu_tx_commit(tx); + + /* + * Sanity check the stuff we just wrote. + */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); + + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); + + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); + + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + if (i == 2) { + txg_wait_open(dmu_objset_pool(os), 0, B_TRUE); + } else if (i == 3) { + txg_wait_synced(dmu_objset_pool(os), 0); + } + } + + dmu_buf_rele(bonus_db, FTAG); + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + umem_free(od, size); +} + +/* ARGSUSED */ +void +ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t *od; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + uint64_t offset = (1ULL << (ztest_random(20) + 43)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + + /* + * Have multiple threads write to large offsets in an object + * to verify that parallel writes to an object -- even to the + * same blocks within the object -- don't cause any trouble.
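+ * + * The base offset is a random power of two between 2^43 and 2^62 + * bytes, plus a random multiple of SPA_MAXBLOCKSIZE, so concurrent + * callers frequently collide on the same handful of huge offsets.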
+ */ + ztest_od_init(od, ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + while (ztest_random(10) != 0) + ztest_io(zd, od->od_object, offset); + + umem_free(od, sizeof (ztest_od_t)); +} + +void +ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t *od; + uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + uint64_t count = ztest_random(20) + 1; + uint64_t blocksize = ztest_random_blocksize(); + void *data; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), + !ztest_random(2)) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + if (ztest_truncate(zd, od->od_object, offset, count * blocksize) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + ztest_prealloc(zd, od->od_object, offset, count * blocksize); + + data = umem_zalloc(blocksize, UMEM_NOFAIL); + + while (ztest_random(count) != 0) { + uint64_t randoff = offset + (ztest_random(count) * blocksize); + if (ztest_write(zd, od->od_object, randoff, blocksize, + data) != 0) + break; + while (ztest_random(4) != 0) + ztest_io(zd, od->od_object, randoff); + } + + umem_free(data, blocksize); + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Verify that zap_{create,destroy,add,remove,update} work as expected. + */ +#define ZTEST_ZAP_MIN_INTS 1 +#define ZTEST_ZAP_MAX_INTS 4 +#define ZTEST_ZAP_MAX_PROPS 1000 + +void +ztest_zap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + uint64_t object; + uint64_t txg, last_txg; + uint64_t value[ZTEST_ZAP_MAX_INTS]; + uint64_t zl_ints, zl_intsize, prop; + int i, ints; + dmu_tx_t *tx; + char propname[100], txgname[100]; + int error; + char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), + !ztest_random(2)) != 0) + goto out; + + object = od->od_object; + + /* + * Generate a known hash collision, and verify that + * we can look up and remove both entries. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + for (i = 0; i < 2; i++) { + value[i] = i; + VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), + 1, &value[i], tx)); + } + for (i = 0; i < 2; i++) { + VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], + sizeof (uint64_t), 1, &value[i], tx)); + VERIFY3U(0, ==, + zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + } + for (i = 0; i < 2; i++) { + VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); + } + dmu_tx_commit(tx); + + /* + * Generate a bunch of random entries. + */ + ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); + + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + bzero(value, sizeof (value)); + last_txg = 0; + + /* + * If these zap entries already exist, validate their contents.
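+ * The invariant checked below mirrors the update rule that follows: + * the nth element of prop_%llu must equal last_txg + object + n, + * where last_txg is the value stored under txg_%llu.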
+ */ + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + if (error == 0) { + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + + VERIFY(zap_lookup(os, object, txgname, zl_intsize, + zl_ints, &last_txg) == 0); + + VERIFY(zap_length(os, object, propname, &zl_intsize, + &zl_ints) == 0); + + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, ints); + + VERIFY(zap_lookup(os, object, propname, zl_intsize, + zl_ints, value) == 0); + + for (i = 0; i < ints; i++) { + ASSERT3U(value[i], ==, last_txg + object + i); + } + } else { + ASSERT3U(error, ==, ENOENT); + } + + /* + * Atomically update two entries in our zap object. + * The first is named txg_%llu, and contains the txg + * in which the property was last updated. The second + * is named prop_%llu, and the nth element of its value + * should be txg + object + n. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + + if (last_txg > txg) + fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); + + for (i = 0; i < ints; i++) + value[i] = txg + object + i; + + VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), + 1, &txg, tx)); + VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), + ints, value, tx)); + + dmu_tx_commit(tx); + + /* + * Remove a random pair of entries. + */ + prop = ztest_random(ZTEST_ZAP_MAX_PROPS); + (void) sprintf(propname, "prop_%llu", (u_longlong_t)prop); + (void) sprintf(txgname, "txg_%llu", (u_longlong_t)prop); + + error = zap_length(os, object, txgname, &zl_intsize, &zl_ints); + + if (error == ENOENT) + goto out; + + ASSERT0(error); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); + VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + dmu_tx_commit(tx); +out: + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Test the upgrade of a microzap to a fatzap. + */ +void +ztest_fzap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + uint64_t object, txg; + int i; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), + !ztest_random(2)) != 0) + goto out; + object = od->od_object; + + /* + * Add entries to this ZAP and make sure it spills over + * and gets upgraded to a fatzap. Also, since we are adding + * 2050 entries, we should see ptrtbl growth and a leaf-block split.
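+ * (Back-of-the-envelope, assuming 64-byte microzap entries and a + * 128K maximum microzap block -- both assumptions about the on-disk + * format, not values used in this file: at most 128K / 64 - 1 = 2047 + * entries fit before the upgrade, so 2050 insertions must cross it.)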
+ */ + for (i = 0; i < 2050; i++) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t value = i; + dmu_tx_t *tx; + int error; + + (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", + (u_longlong_t)id, (u_longlong_t)value); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, name); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + goto out; + error = zap_add(os, object, name, sizeof (uint64_t), 1, + &value, tx); + ASSERT(error == 0 || error == EEXIST); + dmu_tx_commit(tx); + } +out: + umem_free(od, sizeof (ztest_od_t)); +} + +/* ARGSUSED */ +void +ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; + dmu_tx_t *tx; + int i, namelen, error; + int micro = ztest_random(2); + char name[20], string_value[20]; + void *data; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + object = od->od_object; + + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. + * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' + 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if ((namelen & 1) || micro) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY0(zap_count(os, object, &count)); + ASSERT(count != -1ULL); + + /* + * Select an operation: length, lookup, add, update, remove. + */ + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; + } + + if (tx != NULL) + dmu_tx_commit(tx); + + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Commit callback data. 
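+ * + * Lifecycle sketch, matching the code below: a callback is registered + * on a tx, linked onto zcl.zcl_callbacks once the tx has an assigned + * txg, and must fire with error == 0 only after that txg has synced -- + * or with ECANCELED, never having been added, if the tx is aborted.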
+ */ +typedef struct ztest_cb_data { + list_node_t zcd_node; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; +} ztest_cb_data_t; + +/* This is the actual commit callback function */ +static void +ztest_commit_callback(void *arg, int error) +{ + ztest_cb_data_t *data = arg; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %" PRIu64 " called prematurely" + ", last synced txg = %" PRIu64 "\n", data->zcd_txg, + synced_txg); + + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT0(data->zcd_txg); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + ASSERT(data->zcd_added); + ASSERT3U(data->zcd_txg, !=, 0); + + (void) mutex_enter(&zcl.zcl_callbacks_lock); + + /* Track the smallest txg delay we have seen for any callback */ + if ((synced_txg - data->zcd_txg) < zc_min_txg_delay) + zc_min_txg_delay = synced_txg - data->zcd_txg; + + /* Remove our callback from the list */ + list_remove(&zcl.zcl_callbacks, data); + + (void) mutex_exit(&zcl.zcl_callbacks_lock); + + umem_free(data, sizeof (ztest_cb_data_t)); +} + +/* Allocate and initialize callback data structure */ +static ztest_cb_data_t * +ztest_create_cb_data(objset_t *os, uint64_t txg) +{ + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_txg = txg; + cb_data->zcd_spa = dmu_objset_spa(os); + list_link_init(&cb_data->zcd_node); + + return (cb_data); +} + +/* + * Commit callback test. + */ +void +ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t *od; + dmu_tx_t *tx; + ztest_cb_data_t *cb_data[3], *tmp_cb; + uint64_t old_txg, txg; + int i, error = 0; + + od = umem_alloc(sizeof (ztest_od_t), UMEM_NOFAIL); + ztest_od_init(od, id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + + if (ztest_object_init(zd, od, sizeof (ztest_od_t), B_FALSE) != 0) { + umem_free(od, sizeof (ztest_od_t)); + return; + } + + tx = dmu_tx_create(os); + + cb_data[0] = ztest_create_cb_data(os, 0); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); + + dmu_tx_hold_write(tx, od->od_object, 0, sizeof (uint64_t)); + + /* Every once in a while, abort the transaction on purpose */ + if (ztest_random(100) == 0) + error = -1; + + if (!error) + error = dmu_tx_assign(tx, TXG_NOWAIT); + + txg = error ? 0 : dmu_tx_get_txg(tx); + + cb_data[0]->zcd_txg = txg; + cb_data[1] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); + + if (error) { + /* + * It's not a strict requirement to call the registered + * callbacks from inside dmu_tx_abort(), but that is what + * happens in the current implementation, so we will + * check for that.
+ */ + for (i = 0; i < 2; i++) { + cb_data[i]->zcd_expected_err = ECANCELED; + VERIFY(!cb_data[i]->zcd_called); + } + + dmu_tx_abort(tx); + + for (i = 0; i < 2; i++) { + VERIFY(cb_data[i]->zcd_called); + umem_free(cb_data[i], sizeof (ztest_cb_data_t)); + } + + umem_free(od, sizeof (ztest_od_t)); + return; + } + + cb_data[2] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); + + /* + * Read existing data to make sure there isn't a future leak. + */ + VERIFY(0 == dmu_read(os, od->od_object, 0, sizeof (uint64_t), + &old_txg, DMU_READ_PREFETCH)); + + if (old_txg > txg) + fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, + old_txg, txg); + + dmu_write(os, od->od_object, 0, sizeof (uint64_t), &txg, tx); + + (void) mutex_enter(&zcl.zcl_callbacks_lock); + + /* + * Since commit callbacks don't have any ordering requirement and since + * it is theoretically possible for a commit callback to be called + * after an arbitrary amount of time has elapsed since its txg has been + * synced, it is difficult to reliably determine whether a commit + * callback hasn't been called due to high load or due to a flawed + * implementation. + * + * In practice, we will assume that if after a certain number of txgs a + * commit callback hasn't been called, then most likely there's an + * implementation bug. + */ + tmp_cb = list_head(&zcl.zcl_callbacks); + if (tmp_cb != NULL && + tmp_cb->zcd_txg + ZTEST_COMMIT_CB_THRESH < txg) { + fatal(0, "Commit callback threshold exceeded, oldest txg: %" + PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); + } + + /* + * Let's find the place to insert our callbacks. + * + * Even though the list is ordered by txg, it is possible for the + * insertion point to not be the end because our txg may already be + * quiescing at this point and other callbacks in the open txg + * (from other objsets) may have sneaked in. + */ + tmp_cb = list_tail(&zcl.zcl_callbacks); + while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) + tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); + + /* Add the 3 callbacks to the list */ + for (i = 0; i < 3; i++) { + if (tmp_cb == NULL) + list_insert_head(&zcl.zcl_callbacks, cb_data[i]); + else + list_insert_after(&zcl.zcl_callbacks, tmp_cb, + cb_data[i]); + + cb_data[i]->zcd_added = B_TRUE; + VERIFY(!cb_data[i]->zcd_called); + + tmp_cb = cb_data[i]; + } + + zc_cb_counter += 3; + + (void) mutex_exit(&zcl.zcl_callbacks_lock); + + dmu_tx_commit(tx); + + umem_free(od, sizeof (ztest_od_t)); +} + +/* + * Visit each object in the dataset. Verify that its properties + * are consistent with what was stored in the block tag when it was + * created, and that its unused bonus buffer space has not been + * overwritten.
+ */ +/* ARGSUSED */ +void +ztest_verify_dnode_bt(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + uint64_t obj; + int err = 0; + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + ztest_block_tag_t *bt = NULL; + dmu_object_info_t doi; + dmu_buf_t *db; + + ztest_object_lock(zd, obj, RL_READER); + if (dmu_bonus_hold(os, obj, FTAG, &db) != 0) { + ztest_object_unlock(zd, obj); + continue; + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_size >= sizeof (*bt)) + bt = ztest_bt_bonus(db); + + if (bt && bt->bt_magic == BT_MAGIC) { + ztest_bt_verify(bt, os, obj, doi.doi_dnodesize, + bt->bt_offset, bt->bt_gen, bt->bt_txg, + bt->bt_crtxg); + ztest_verify_unused_bonus(db, bt, obj, os, bt->bt_gen); + } + + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, obj); + } +} + +/* ARGSUSED */ +void +ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + zfs_prop_t proplist[] = { + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_COPIES, + ZFS_PROP_DEDUP + }; + int p; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + for (p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) + (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], + ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); + + VERIFY0(ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_RECORDSIZE, + ztest_random_blocksize(), (int)ztest_random(2))); + + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* ARGSUSED */ +void +ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + nvlist_t *props = NULL; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + (void) ztest_spa_prop_set_uint64(ZPOOL_PROP_AUTOTRIM, ztest_random(2)); + + VERIFY0(spa_prop_get(ztest_spa, &props)); + + if (ztest_opts.zo_verbose >= 6) + dump_nvlist(props, 4); + + nvlist_free(props); + + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +static int +user_release_one(const char *snapname, const char *holdname) +{ + nvlist_t *snaps, *holds; + int error; + + snaps = fnvlist_alloc(); + holds = fnvlist_alloc(); + fnvlist_add_boolean(holds, holdname); + fnvlist_add_nvlist(snaps, snapname, holds); + fnvlist_free(holds); + error = dsl_dataset_user_release(snaps, NULL); + fnvlist_free(snaps); + return (error); +} + +/* + * Test snapshot hold/release and deferred destroy. + */ +void +ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) +{ + int error; + objset_t *os = zd->zd_os; + objset_t *origin; + char snapname[100]; + char fullname[100]; + char clonename[100]; + char tag[100]; + char osname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *holds; + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + dmu_objset_name(os, osname); + + (void) snprintf(snapname, sizeof (snapname), "sh1_%llu", + (u_longlong_t)id); + (void) snprintf(fullname, sizeof (fullname), "%s@%s", osname, snapname); + (void) snprintf(clonename, sizeof (clonename), + "%s/ch1_%llu", osname, (u_longlong_t)id); + (void) snprintf(tag, sizeof (tag), "tag_%llu", (u_longlong_t)id); + + /* + * Clean up from any previous run. + */ + error = dsl_destroy_head(clonename); + if (error != ENOENT) + ASSERT0(error); + error = user_release_one(fullname, tag); + if (error != ESRCH && error != ENOENT) + ASSERT0(error); + error = dsl_destroy_snapshot(fullname, B_FALSE); + if (error != ENOENT) + ASSERT0(error); + + /* + * Create snapshot, clone it, mark snap for deferred destroy, + * destroy clone, verify snap was also destroyed. 
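+ * + * (Roughly the CLI sequence: zfs snapshot; zfs clone; + * zfs destroy -d <snap>; zfs destroy <clone> -- the deferred snapshot + * destroy completes once its last clone goes away.)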
+ */ + error = dmu_objset_snapshot_one(osname, snapname); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; + } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + error = dmu_objset_clone(clonename, fullname); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_clone"); + goto out; + } + fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); + } + + error = dsl_destroy_snapshot(fullname, B_TRUE); + if (error) { + fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", + fullname, error); + } + + error = dsl_destroy_head(clonename); + if (error) + fatal(0, "dsl_destroy_head(%s) = %d", clonename, error); + + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error != ENOENT) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + + /* + * Create snapshot, add temporary hold, verify that we can't + * destroy a held snapshot, mark for deferred destroy, + * release hold, verify snapshot was destroyed. + */ + error = dmu_objset_snapshot_one(osname, snapname); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; + } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + holds = fnvlist_alloc(); + fnvlist_add_string(holds, fullname, tag); + error = dsl_dataset_user_hold(holds, 0, NULL); + fnvlist_free(holds); + + if (error == ENOSPC) { + ztest_record_enospc("dsl_dataset_user_hold"); + goto out; + } else if (error) { + fatal(0, "dsl_dataset_user_hold(%s, %s) = %u", + fullname, tag, error); + } + + error = dsl_destroy_snapshot(fullname, B_FALSE); + if (error != EBUSY) { + fatal(0, "dsl_destroy_snapshot(%s, B_FALSE) = %d", + fullname, error); + } + + error = dsl_destroy_snapshot(fullname, B_TRUE); + if (error) { + fatal(0, "dsl_destroy_snapshot(%s, B_TRUE) = %d", + fullname, error); + } + + error = user_release_one(fullname, tag); + if (error) + fatal(0, "user_release_one(%s, %s) = %d", fullname, tag, error); + + VERIFY3U(dmu_objset_hold(fullname, FTAG, &origin), ==, ENOENT); + +out: + (void) pthread_rwlock_unlock(&ztest_name_lock); +} + +/* + * Inject random faults into the on-disk data. + */ +/* ARGSUSED */ +void +ztest_fault_inject(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = ztest_spa; + int fd; + uint64_t offset; + uint64_t leaves; + uint64_t bad = 0x1990c0ffeedecadeull; + uint64_t top, leaf; + char *path0; + char *pathrand; + size_t fsize; + int bshift = SPA_MAXBLOCKSHIFT + 2; + int iters = 1000; + int maxfaults; + int mirror_save; + vdev_t *vd0 = NULL; + uint64_t guid0 = 0; + boolean_t islog = B_FALSE; + + path0 = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + pathrand = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + + mutex_enter(&ztest_vdev_lock); + + /* + * While device removal is in progress, fault injection must be + * disabled until it completes and the pool is scrubbed. The fault + * injection strategy for damaging blocks does not take into account + * evacuated blocks which may have already been damaged. + */ + if (ztest_device_removal_active) { + mutex_exit(&ztest_vdev_lock); + goto out; + } + + maxfaults = MAXFAULTS(zs); + leaves = MAX(zs->zs_mirrors, 1) * ztest_opts.zo_raidz; + mirror_save = zs->zs_mirrors; + mutex_exit(&ztest_vdev_lock); + + ASSERT(leaves >= 1); + + /* + * While ztest is running, the number of leaves will not change. This + * is critical for the fault injection logic as it determines where + * errors can be safely injected such that they are always repairable.
+	 *
+	 * When restarting ztest a different number of leaves may be requested
+	 * which will shift the regions to be damaged.  This is fine as long
+	 * as the pool has been scrubbed prior to using the new mapping.
+	 * Failure to do so can result in non-repairable damage being injected.
+	 */
+	if (ztest_pool_scrubbed == B_FALSE)
+		goto out;
+
+	/*
+	 * Grab the name lock as reader.  There are some operations
+	 * which don't like to have their vdevs changed while
+	 * they are in progress (e.g. spa_change_guid).  Those
+	 * operations will have grabbed the name lock as writer.
+	 */
+	(void) pthread_rwlock_rdlock(&ztest_name_lock);
+
+	/*
+	 * We need SCL_STATE here because we're going to look at vd0->vdev_tsd.
+	 */
+	spa_config_enter(spa, SCL_STATE, FTAG, RW_READER);
+
+	if (ztest_random(2) == 0) {
+		/*
+		 * Inject errors on a normal data device or slog device.
+		 */
+		top = ztest_random_vdev_top(spa, B_TRUE);
+		leaf = ztest_random(leaves) + zs->zs_splits;
+
+		/*
+		 * Generate paths to the first leaf in this top-level vdev,
+		 * and to the random leaf we selected.  We'll induce transient
+		 * write failures and random online/offline activity on leaf 0,
+		 * and we'll write random garbage to the randomly chosen leaf.
+		 */
+		(void) snprintf(path0, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + zs->zs_splits);
+		(void) snprintf(pathrand, MAXPATHLEN, ztest_dev_template,
+		    ztest_opts.zo_dir, ztest_opts.zo_pool,
+		    top * leaves + leaf);
+
+		vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0);
+		if (vd0 != NULL && vd0->vdev_top->vdev_islog)
+			islog = B_TRUE;
+
+		/*
+		 * If the top-level vdev needs to be resilvered
+		 * then we only allow faults on the device that is
+		 * resilvering.
+		 */
+		if (vd0 != NULL && maxfaults != 1 &&
+		    (!vdev_resilver_needed(vd0->vdev_top, NULL, NULL) ||
+		    vd0->vdev_resilver_txg != 0)) {
+			/*
+			 * Make vd0 explicitly claim to be unreadable,
+			 * or unwriteable, or reach behind its back
+			 * and close the underlying fd.  We can do this if
+			 * maxfaults == 0 because we'll fail and reexecute,
+			 * and we can do it if maxfaults >= 2 because we'll
+			 * have enough redundancy.  If maxfaults == 1, the
+			 * combination of this with injection of random data
+			 * corruption below exceeds the pool's fault tolerance.
+			 */
+			vdev_file_t *vf = vd0->vdev_tsd;
+
+			zfs_dbgmsg("injecting fault to vdev %llu; maxfaults=%d",
+			    (long long)vd0->vdev_id, (int)maxfaults);
+
+			if (vf != NULL && ztest_random(3) == 0) {
+				(void) close(vf->vf_file->f_fd);
+				vf->vf_file->f_fd = -1;
+			} else if (ztest_random(2) == 0) {
+				vd0->vdev_cant_read = B_TRUE;
+			} else {
+				vd0->vdev_cant_write = B_TRUE;
+			}
+			guid0 = vd0->vdev_guid;
+		}
+	} else {
+		/*
+		 * Inject errors on an l2cache device.
+		 */
+		spa_aux_vdev_t *sav = &spa->spa_l2cache;
+
+		if (sav->sav_count == 0) {
+			spa_config_exit(spa, SCL_STATE, FTAG);
+			(void) pthread_rwlock_unlock(&ztest_name_lock);
+			goto out;
+		}
+		vd0 = sav->sav_vdevs[ztest_random(sav->sav_count)];
+		guid0 = vd0->vdev_guid;
+		(void) strcpy(path0, vd0->vdev_path);
+		(void) strcpy(pathrand, vd0->vdev_path);
+
+		leaf = 0;
+		leaves = 1;
+		maxfaults = INT_MAX;	/* no limit on cache devices */
+	}
+
+	spa_config_exit(spa, SCL_STATE, FTAG);
+	(void) pthread_rwlock_unlock(&ztest_name_lock);
+
+	/*
+	 * If we can tolerate two or more faults, or we're dealing
+	 * with a slog, randomly online/offline vd0.
+	 */
+	if ((maxfaults >= 2 || islog) && guid0 != 0) {
+		if (ztest_random(10) < 6) {
+			int flags = (ztest_random(2) == 0 ?
+ ZFS_OFFLINE_TEMPORARY : 0); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between offlining a slog and + * destroying a dataset. Offlining the slog will + * grab a reference on the dataset which may cause + * dsl_destroy_head() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + if (islog) + (void) pthread_rwlock_wrlock(&ztest_name_lock); + + VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + + if (islog) + (void) pthread_rwlock_unlock(&ztest_name_lock); + } else { + /* + * Ideally we would like to be able to randomly + * call vdev_[on|off]line without holding locks + * to force unpredictable failures but the side + * effects of vdev_[on|off]line prevent us from + * doing so. We grab the ztest_vdev_lock here to + * prevent a race between injection testing and + * aux_vdev removal. + */ + mutex_enter(&ztest_vdev_lock); + (void) vdev_online(spa, guid0, 0, NULL); + mutex_exit(&ztest_vdev_lock); + } + } + + if (maxfaults == 0) + goto out; + + /* + * We have at least single-fault tolerance, so inject data corruption. + */ + fd = open(pathrand, O_RDWR); + + if (fd == -1) /* we hit a gap in the device namespace */ + goto out; + + fsize = lseek(fd, 0, SEEK_END); + + while (--iters != 0) { + /* + * The offset must be chosen carefully to ensure that + * we do not inject a given logical block with errors + * on two different leaf devices, because ZFS can not + * tolerate that (if maxfaults==1). + * + * To achieve this we divide each leaf device into + * chunks of size (# leaves * SPA_MAXBLOCKSIZE * 4). + * Each chunk is further divided into error-injection + * ranges (can accept errors) and clear ranges (we do + * not inject errors in those). Each error-injection + * range can accept errors only for a single leaf vdev. + * Error-injection ranges are separated by clear ranges. + * + * For example, with 3 leaves, each chunk looks like: + * 0 to 32M: injection range for leaf 0 + * 32M to 64M: clear range - no injection allowed + * 64M to 96M: injection range for leaf 1 + * 96M to 128M: clear range - no injection allowed + * 128M to 160M: injection range for leaf 2 + * 160M to 192M: clear range - no injection allowed + * + * Each clear range must be large enough such that a + * single block cannot straddle it. This way a block + * can't be a target in two different injection ranges + * (on different leaf vdevs). + */ + offset = ztest_random(fsize / (leaves << bshift)) * + (leaves << bshift) + (leaf << bshift) + + (ztest_random(1ULL << (bshift - 1)) & -8ULL); + + /* + * Only allow damage to the labels at one end of the vdev. + * + * If all labels are damaged, the device will be totally + * inaccessible, which will result in loss of data, + * because we also damage (parts of) the other side of + * the mirror/raidz. + * + * Additionally, we will always have both an even and an + * odd label, so that we can handle crashes in the + * middle of vdev_config_sync(). + */ + if ((leaf & 1) == 0 && offset < VDEV_LABEL_START_SIZE) + continue; + + /* + * The two end labels are stored at the "end" of the disk, but + * the end of the disk (vdev_psize) is aligned to + * sizeof (vdev_label_t). 
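+		 *
+		 * (P2ALIGN(fsize, sizeof (vdev_label_t)) below recovers that
+		 * aligned size; for odd leaves any offset whose injected word
+		 * would reach into the trailing VDEV_LABEL_END_SIZE bytes is
+		 * skipped.)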
+		 */
+		uint64_t psize = P2ALIGN(fsize, sizeof (vdev_label_t));
+		if ((leaf & 1) == 1 &&
+		    offset + sizeof (bad) > psize - VDEV_LABEL_END_SIZE)
+			continue;
+
+		mutex_enter(&ztest_vdev_lock);
+		if (mirror_save != zs->zs_mirrors) {
+			mutex_exit(&ztest_vdev_lock);
+			(void) close(fd);
+			goto out;
+		}
+
+		if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad))
+			fatal(1, "can't inject bad word at 0x%llx in %s",
+			    offset, pathrand);
+
+		mutex_exit(&ztest_vdev_lock);
+
+		if (ztest_opts.zo_verbose >= 7)
+			(void) printf("injected bad word into %s,"
+			    " offset 0x%llx\n", pathrand, (u_longlong_t)offset);
+	}
+
+	(void) close(fd);
+out:
+	umem_free(path0, MAXPATHLEN);
+	umem_free(pathrand, MAXPATHLEN);
+}
+
+/*
+ * By design ztest will never inject uncorrectable damage into the pool.
+ * Issue a scrub, wait for it to complete, and verify there is never
+ * any persistent damage.
+ *
+ * Only after a full scrub has been completed is it safe to start injecting
+ * data corruption.  See the comment in ztest_fault_inject().
+ */
+static int
+ztest_scrub_impl(spa_t *spa)
+{
+	int error = spa_scan(spa, POOL_SCAN_SCRUB);
+	if (error)
+		return (error);
+
+	while (dsl_scan_scrubbing(spa_get_dsl(spa)))
+		txg_wait_synced(spa_get_dsl(spa), 0);
+
+	if (spa_get_errlog_size(spa) > 0)
+		return (ECKSUM);
+
+	ztest_pool_scrubbed = B_TRUE;
+
+	return (0);
+}
+
+/*
+ * Scrub the pool.
+ */
+/* ARGSUSED */
+void
+ztest_scrub(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	int error;
+
+	/*
+	 * A scrub is already in progress as part of device removal.
+	 */
+	if (ztest_device_removal_active)
+		return;
+
+	/*
+	 * Start a scrub, wait a moment, then force a restart.
+	 */
+	(void) spa_scan(spa, POOL_SCAN_SCRUB);
+	(void) poll(NULL, 0, 100);
+
+	error = ztest_scrub_impl(spa);
+	if (error == EBUSY)
+		error = 0;
+	ASSERT0(error);
+}
+
+/*
+ * Change the guid for the pool.
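+ *
+ * The change is done under ztest_name_lock held as writer so it cannot
+ * race with readers such as ztest_fault_inject(); on success the pool
+ * guid must change while spa_load_guid() stays the same, which the
+ * VERIFYs below check.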
+ */ +/* ARGSUSED */ +void +ztest_reguid(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + uint64_t orig, load; + int error; + + if (ztest_opts.zo_mmp_test) + return; + + orig = spa_guid(spa); + load = spa_load_guid(spa); + + (void) pthread_rwlock_wrlock(&ztest_name_lock); + error = spa_change_guid(spa); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + if (error != 0) + return; + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Changed guid old %llu -> %llu\n", + (u_longlong_t)orig, (u_longlong_t)spa_guid(spa)); + } + + VERIFY3U(orig, !=, spa_guid(spa)); + VERIFY3U(load, ==, spa_load_guid(spa)); +} + +void +ztest_fletcher(ztest_ds_t *zd, uint64_t id) +{ + hrtime_t end = gethrtime() + NANOSEC; + + while (gethrtime() <= end) { + int run_count = 100; + void *buf; + struct abd *abd_data, *abd_meta; + uint32_t size; + int *ptr; + int i; + zio_cksum_t zc_ref; + zio_cksum_t zc_ref_byteswap; + + size = ztest_random_blocksize(); + + buf = umem_alloc(size, UMEM_NOFAIL); + abd_data = abd_alloc(size, B_FALSE); + abd_meta = abd_alloc(size, B_TRUE); + + for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) + *ptr = ztest_random(UINT_MAX); + + abd_copy_from_buf_off(abd_data, buf, 0, size); + abd_copy_from_buf_off(abd_meta, buf, 0, size); + + VERIFY0(fletcher_4_impl_set("scalar")); + fletcher_4_native(buf, size, NULL, &zc_ref); + fletcher_4_byteswap(buf, size, NULL, &zc_ref_byteswap); + + VERIFY0(fletcher_4_impl_set("cycle")); + while (run_count-- > 0) { + zio_cksum_t zc; + zio_cksum_t zc_byteswap; + + fletcher_4_byteswap(buf, size, NULL, &zc_byteswap); + fletcher_4_native(buf, size, NULL, &zc); + + VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); + VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, + sizeof (zc_byteswap))); + + /* Test ABD - data */ + abd_fletcher_4_byteswap(abd_data, size, NULL, + &zc_byteswap); + abd_fletcher_4_native(abd_data, size, NULL, &zc); + + VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); + VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, + sizeof (zc_byteswap))); + + /* Test ABD - metadata */ + abd_fletcher_4_byteswap(abd_meta, size, NULL, + &zc_byteswap); + abd_fletcher_4_native(abd_meta, size, NULL, &zc); + + VERIFY0(bcmp(&zc, &zc_ref, sizeof (zc))); + VERIFY0(bcmp(&zc_byteswap, &zc_ref_byteswap, + sizeof (zc_byteswap))); + + } + + umem_free(buf, size); + abd_free(abd_data); + abd_free(abd_meta); + } +} + +void +ztest_fletcher_incr(ztest_ds_t *zd, uint64_t id) +{ + void *buf; + size_t size; + int *ptr; + int i; + zio_cksum_t zc_ref; + zio_cksum_t zc_ref_bswap; + + hrtime_t end = gethrtime() + NANOSEC; + + while (gethrtime() <= end) { + int run_count = 100; + + size = ztest_random_blocksize(); + buf = umem_alloc(size, UMEM_NOFAIL); + + for (i = 0, ptr = buf; i < size / sizeof (*ptr); i++, ptr++) + *ptr = ztest_random(UINT_MAX); + + VERIFY0(fletcher_4_impl_set("scalar")); + fletcher_4_native(buf, size, NULL, &zc_ref); + fletcher_4_byteswap(buf, size, NULL, &zc_ref_bswap); + + VERIFY0(fletcher_4_impl_set("cycle")); + + while (run_count-- > 0) { + zio_cksum_t zc; + zio_cksum_t zc_bswap; + size_t pos = 0; + + ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0); + ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0); + + while (pos < size) { + size_t inc = 64 * ztest_random(size / 67); + /* sometimes add few bytes to test non-simd */ + if (ztest_random(100) < 10) + inc += P2ALIGN(ztest_random(64), + sizeof (uint32_t)); + + if (inc > (size - pos)) + inc = size - pos; + + fletcher_4_incremental_native(buf + pos, inc, + &zc); + fletcher_4_incremental_byteswap(buf + pos, inc, + &zc_bswap); + + pos += inc; + } + 
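+			/*
+			 * inc was clamped to the remaining bytes on every
+			 * pass, so the increments must sum to exactly the
+			 * buffer size.
+			 */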
+			VERIFY3U(pos, ==, size);
+
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
+
+			/*
+			 * Verify that an incremental checksum over the whole
+			 * buffer is equivalent to the non-incremental version.
+			 */
+			ZIO_SET_CHECKSUM(&zc, 0, 0, 0, 0);
+			ZIO_SET_CHECKSUM(&zc_bswap, 0, 0, 0, 0);
+
+			fletcher_4_incremental_native(buf, size, &zc);
+			fletcher_4_incremental_byteswap(buf, size, &zc_bswap);
+
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc, zc_ref));
+			VERIFY(ZIO_CHECKSUM_EQUAL(zc_bswap, zc_ref_bswap));
+		}
+
+		umem_free(buf, size);
+	}
+}
+
+static int
+ztest_check_path(char *path)
+{
+	struct stat s;
+	/* return true on success */
+	return (!stat(path, &s));
+}
+
+static void
+ztest_get_zdb_bin(char *bin, int len)
+{
+	char *zdb_path;
+	/*
+	 * Try the ZDB_PATH environment variable, then the in-tree zdb path.
+	 * If neither works, let popen() search the PATH.
+	 */
+	if ((zdb_path = getenv("ZDB_PATH"))) {
+		strlcpy(bin, zdb_path, len); /* In env */
+		if (!ztest_check_path(bin)) {
+			ztest_dump_core = 0;
+			fatal(1, "invalid ZDB_PATH '%s'", bin);
+		}
+		return;
+	}
+
+	VERIFY(realpath(getexecname(), bin) != NULL);
+	if (strstr(bin, "/ztest/")) {
+		strstr(bin, "/ztest/")[0] = '\0'; /* In-tree */
+		strcat(bin, "/zdb/zdb");
+		if (ztest_check_path(bin))
+			return;
+	}
+	strcpy(bin, "zdb");
+}
+
+static vdev_t *
+ztest_random_concrete_vdev_leaf(vdev_t *vd)
+{
+	if (vd == NULL)
+		return (NULL);
+
+	if (vd->vdev_children == 0)
+		return (vd);
+
+	vdev_t *eligible[vd->vdev_children];
+	int eligible_idx = 0, i;
+	for (i = 0; i < vd->vdev_children; i++) {
+		vdev_t *cvd = vd->vdev_child[i];
+		if (cvd->vdev_top->vdev_removing)
+			continue;
+		if (cvd->vdev_children > 0 ||
+		    (vdev_is_concrete(cvd) && !cvd->vdev_detached)) {
+			eligible[eligible_idx++] = cvd;
+		}
+	}
+	VERIFY(eligible_idx > 0);
+
+	uint64_t child_no = ztest_random(eligible_idx);
+	return (ztest_random_concrete_vdev_leaf(eligible[child_no]));
+}
+
+/* ARGSUSED */
+void
+ztest_initialize(ztest_ds_t *zd, uint64_t id)
+{
+	spa_t *spa = ztest_spa;
+	int error = 0;
+
+	mutex_enter(&ztest_vdev_lock);
+
+	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
+
+	/* Random leaf vdev */
+	vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev);
+	if (rand_vd == NULL) {
+		spa_config_exit(spa, SCL_VDEV, FTAG);
+		mutex_exit(&ztest_vdev_lock);
+		return;
+	}
+
+	/*
+	 * The random vdev we've selected may change as soon as we
+	 * drop the spa_config_lock.  We create local copies of things
+	 * we're interested in.
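+	 * (Once SCL_VDEV is dropped the vdev may be removed or reopened,
+	 * so we stash the guid, path, and initialize-thread state rather
+	 * than holding on to the vdev_t pointer.)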
+ */ + uint64_t guid = rand_vd->vdev_guid; + char *path = strdup(rand_vd->vdev_path); + boolean_t active = rand_vd->vdev_initialize_thread != NULL; + + zfs_dbgmsg("vd %px, guid %llu", rand_vd, guid); + spa_config_exit(spa, SCL_VDEV, FTAG); + + uint64_t cmd = ztest_random(POOL_INITIALIZE_FUNCS); + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *vdev_errlist = fnvlist_alloc(); + fnvlist_add_uint64(vdev_guids, path, guid); + error = spa_vdev_initialize(spa, vdev_guids, cmd, vdev_errlist); + fnvlist_free(vdev_guids); + fnvlist_free(vdev_errlist); + + switch (cmd) { + case POOL_INITIALIZE_CANCEL: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Cancel initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_START: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Start initialize %s", path); + if (active && error == 0) + (void) printf(" failed (already active)"); + else if (error != 0) + (void) printf(" failed (error %d)", error); + (void) printf("\n"); + } + break; + case POOL_INITIALIZE_SUSPEND: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Suspend initialize %s", path); + if (!active) + (void) printf(" failed (no initialize active)"); + (void) printf("\n"); + } + break; + } + free(path); + mutex_exit(&ztest_vdev_lock); +} + +/* ARGSUSED */ +void +ztest_trim(ztest_ds_t *zd, uint64_t id) +{ + spa_t *spa = ztest_spa; + int error = 0; + + mutex_enter(&ztest_vdev_lock); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* Random leaf vdev */ + vdev_t *rand_vd = ztest_random_concrete_vdev_leaf(spa->spa_root_vdev); + if (rand_vd == NULL) { + spa_config_exit(spa, SCL_VDEV, FTAG); + mutex_exit(&ztest_vdev_lock); + return; + } + + /* + * The random vdev we've selected may change as soon as we + * drop the spa_config_lock. We create local copies of things + * we're interested in. + */ + uint64_t guid = rand_vd->vdev_guid; + char *path = strdup(rand_vd->vdev_path); + boolean_t active = rand_vd->vdev_trim_thread != NULL; + + zfs_dbgmsg("vd %p, guid %llu", rand_vd, guid); + spa_config_exit(spa, SCL_VDEV, FTAG); + + uint64_t cmd = ztest_random(POOL_TRIM_FUNCS); + uint64_t rate = 1 << ztest_random(30); + boolean_t partial = (ztest_random(5) > 0); + boolean_t secure = (ztest_random(5) > 0); + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *vdev_errlist = fnvlist_alloc(); + fnvlist_add_uint64(vdev_guids, path, guid); + error = spa_vdev_trim(spa, vdev_guids, cmd, rate, partial, + secure, vdev_errlist); + fnvlist_free(vdev_guids); + fnvlist_free(vdev_errlist); + + switch (cmd) { + case POOL_TRIM_CANCEL: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Cancel TRIM %s", path); + if (!active) + (void) printf(" failed (no TRIM active)"); + (void) printf("\n"); + } + break; + case POOL_TRIM_START: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Start TRIM %s", path); + if (active && error == 0) + (void) printf(" failed (already active)"); + else if (error != 0) + (void) printf(" failed (error %d)", error); + (void) printf("\n"); + } + break; + case POOL_TRIM_SUSPEND: + if (ztest_opts.zo_verbose >= 4) { + (void) printf("Suspend TRIM %s", path); + if (!active) + (void) printf(" failed (no TRIM active)"); + (void) printf("\n"); + } + break; + } + free(path); + mutex_exit(&ztest_vdev_lock); +} + +/* + * Verify pool integrity by running zdb. 
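+ * The zdb binary is located via ZDB_PATH, the in-tree build next to
+ * ztest, or by letting popen() search the PATH (see ztest_get_zdb_bin()
+ * above); a non-zero exit status or abnormal termination is fatal.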
+ */ +static void +ztest_run_zdb(char *pool) +{ + int status; + char *bin; + char *zdb; + char *zbuf; + const int len = MAXPATHLEN + MAXNAMELEN + 20; + FILE *fp; + + bin = umem_alloc(len, UMEM_NOFAIL); + zdb = umem_alloc(len, UMEM_NOFAIL); + zbuf = umem_alloc(1024, UMEM_NOFAIL); + + ztest_get_zdb_bin(bin, len); + + (void) sprintf(zdb, + "%s -bcc%s%s -G -d -Y -e -y -p %s %s", + bin, + ztest_opts.zo_verbose >= 3 ? "s" : "", + ztest_opts.zo_verbose >= 4 ? "v" : "", + ztest_opts.zo_dir, + pool); + + if (ztest_opts.zo_verbose >= 5) + (void) printf("Executing %s\n", strstr(zdb, "zdb ")); + + fp = popen(zdb, "r"); + + while (fgets(zbuf, 1024, fp) != NULL) + if (ztest_opts.zo_verbose >= 3) + (void) printf("%s", zbuf); + + status = pclose(fp); + + if (status == 0) + goto out; + + ztest_dump_core = 0; + if (WIFEXITED(status)) + fatal(0, "'%s' exit code %d", zdb, WEXITSTATUS(status)); + else + fatal(0, "'%s' died with signal %d", zdb, WTERMSIG(status)); +out: + umem_free(bin, len); + umem_free(zdb, len); + umem_free(zbuf, 1024); +} + +static void +ztest_walk_pool_directory(char *header) +{ + spa_t *spa = NULL; + + if (ztest_opts.zo_verbose >= 6) + (void) printf("%s\n", header); + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + if (ztest_opts.zo_verbose >= 6) + (void) printf("\t%s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); +} + +static void +ztest_spa_import_export(char *oldname, char *newname) +{ + nvlist_t *config, *newconfig; + uint64_t pool_guid; + spa_t *spa; + int error; + + if (ztest_opts.zo_verbose >= 4) { + (void) printf("import/export: old = %s, new = %s\n", + oldname, newname); + } + + /* + * Clean up from previous runs. + */ + (void) spa_destroy(newname); + + /* + * Get the pool's configuration and guid. + */ + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Kick off a scrub to tickle scrub/export races. + */ + if (ztest_random(2) == 0) + (void) spa_scan(spa, POOL_SCAN_SCRUB); + + pool_guid = spa_guid(spa); + spa_close(spa, FTAG); + + ztest_walk_pool_directory("pools before export"); + + /* + * Export it. + */ + VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); + + ztest_walk_pool_directory("pools after export"); + + /* + * Try to import it. + */ + newconfig = spa_tryimport(config); + ASSERT(newconfig != NULL); + nvlist_free(newconfig); + + /* + * Import it under the new name. + */ + error = spa_import(newname, config, NULL, 0); + if (error != 0) { + dump_nvlist(config, 0); + fatal(B_FALSE, "couldn't import pool %s as %s: error %u", + oldname, newname, error); + } + + ztest_walk_pool_directory("pools after import"); + + /* + * Try to import it again -- should fail with EEXIST. + */ + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL, 0)); + + /* + * Try to import it under a different name -- should fail with EEXIST. + */ + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL, 0)); + + /* + * Verify that the pool is no longer visible under the old name. + */ + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Verify that we can open and close the pool using the new name. 
+ */ + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); + ASSERT(pool_guid == spa_guid(spa)); + spa_close(spa, FTAG); + + nvlist_free(config); +} + +static void +ztest_resume(spa_t *spa) +{ + if (spa_suspended(spa) && ztest_opts.zo_verbose >= 6) + (void) printf("resuming from suspended state\n"); + spa_vdev_state_enter(spa, SCL_NONE); + vdev_clear(spa, NULL); + (void) spa_vdev_state_exit(spa, NULL, 0); + (void) zio_resume(spa); +} + +static void +ztest_resume_thread(void *arg) +{ + spa_t *spa = arg; + + while (!ztest_exiting) { + if (spa_suspended(spa)) + ztest_resume(spa); + (void) poll(NULL, 0, 100); + + /* + * Periodically change the zfs_compressed_arc_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_compressed_arc_enabled = ztest_random(2); + + /* + * Periodically change the zfs_abd_scatter_enabled setting. + */ + if (ztest_random(10) == 0) + zfs_abd_scatter_enabled = ztest_random(2); + } + + thread_exit(); +} + +static void +ztest_deadman_thread(void *arg) +{ + ztest_shared_t *zs = arg; + spa_t *spa = ztest_spa; + hrtime_t delay, overdue, last_run = gethrtime(); + + delay = (zs->zs_thread_stop - zs->zs_thread_start) + + MSEC2NSEC(zfs_deadman_synctime_ms); + + while (!ztest_exiting) { + /* + * Wait for the delay timer while checking occasionally + * if we should stop. + */ + if (gethrtime() < last_run + delay) { + (void) poll(NULL, 0, 1000); + continue; + } + + /* + * If the pool is suspended then fail immediately. Otherwise, + * check to see if the pool is making any progress. If + * vdev_deadman() discovers that there hasn't been any recent + * I/Os then it will end up aborting the tests. + */ + if (spa_suspended(spa) || spa->spa_root_vdev == NULL) { + fatal(0, "aborting test after %llu seconds because " + "pool has transitioned to a suspended state.", + zfs_deadman_synctime_ms / 1000); + } + vdev_deadman(spa->spa_root_vdev, FTAG); + + /* + * If the process doesn't complete within a grace period of + * zfs_deadman_synctime_ms over the expected finish time, + * then it may be hung and is terminated. + */ + overdue = zs->zs_proc_stop + MSEC2NSEC(zfs_deadman_synctime_ms); + if (gethrtime() > overdue) { + fatal(0, "aborting test after %llu seconds because " + "the process is overdue for termination.", + (gethrtime() - zs->zs_proc_start) / NANOSEC); + } + + (void) printf("ztest has been running for %lld seconds\n", + (gethrtime() - zs->zs_proc_start) / NANOSEC); + + last_run = gethrtime(); + delay = MSEC2NSEC(zfs_deadman_checktime_ms); + } + + thread_exit(); +} + +static void +ztest_execute(int test, ztest_info_t *zi, uint64_t id) +{ + ztest_ds_t *zd = &ztest_ds[id % ztest_opts.zo_datasets]; + ztest_shared_callstate_t *zc = ZTEST_GET_SHARED_CALLSTATE(test); + hrtime_t functime = gethrtime(); + int i; + + for (i = 0; i < zi->zi_iters; i++) + zi->zi_func(zd, id); + + functime = gethrtime() - functime; + + atomic_add_64(&zc->zc_count, 1); + atomic_add_64(&zc->zc_time, functime); + + if (ztest_opts.zo_verbose >= 4) + (void) printf("%6.2f sec in %s\n", + (double)functime / NANOSEC, zi->zi_funcname); +} + +static void +ztest_thread(void *arg) +{ + int rand; + uint64_t id = (uintptr_t)arg; + ztest_shared_t *zs = ztest_shared; + uint64_t call_next; + hrtime_t now; + ztest_info_t *zi; + ztest_shared_callstate_t *zc; + + while ((now = gethrtime()) < zs->zs_thread_stop) { + /* + * See if it's time to force a crash. + */ + if (now > zs->zs_thread_kill) + ztest_kill(zs); + + /* + * If we're getting ENOSPC with some regularity, stop. 
+ */ + if (zs->zs_enospc_count > 10) + break; + + /* + * Pick a random function to execute. + */ + rand = ztest_random(ZTEST_FUNCS); + zi = &ztest_info[rand]; + zc = ZTEST_GET_SHARED_CALLSTATE(rand); + call_next = zc->zc_next; + + if (now >= call_next && + atomic_cas_64(&zc->zc_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) { + ztest_execute(rand, zi, id); + } + } + + thread_exit(); +} + +static void +ztest_dataset_name(char *dsname, char *pool, int d) +{ + (void) snprintf(dsname, ZFS_MAX_DATASET_NAME_LEN, "%s/ds_%d", pool, d); +} + +static void +ztest_dataset_destroy(int d) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + int t; + + ztest_dataset_name(name, ztest_opts.zo_pool, d); + + if (ztest_opts.zo_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); + + /* + * Cleanup any non-standard clones and snapshots. In general, + * ztest thread t operates on dataset (t % zopt_datasets), + * so there may be more than one thing to clean up. + */ + for (t = d; t < ztest_opts.zo_threads; + t += ztest_opts.zo_datasets) + ztest_dsl_dataset_cleanup(name, t); + + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); +} + +static void +ztest_dataset_dirobj_verify(ztest_ds_t *zd) +{ + uint64_t usedobjs, dirobjs, scratch; + + /* + * ZTEST_DIROBJ is the object directory for the entire dataset. + * Therefore, the number of objects in use should equal the + * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. + * If not, we have an object leak. + * + * Note that we can only check this in ztest_dataset_open(), + * when the open-context and syncing-context values agree. + * That's because zap_count() returns the open-context value, + * while dmu_objset_space() returns the rootbp fill count. 
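+	 * Equivalently, usedobjs must equal dirobjs + 1; the ASSERT3U()
+	 * below checks exactly that.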
+ */ + VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); + ASSERT3U(dirobjs + 1, ==, usedobjs); +} + +static int +ztest_dataset_open(int d) +{ + ztest_ds_t *zd = &ztest_ds[d]; + uint64_t committed_seq = ZTEST_GET_SHARED_DS(d)->zd_seq; + objset_t *os; + zilog_t *zilog; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + ztest_dataset_name(name, ztest_opts.zo_pool, d); + + (void) pthread_rwlock_rdlock(&ztest_name_lock); + + error = ztest_dataset_create(name); + if (error == ENOSPC) { + (void) pthread_rwlock_unlock(&ztest_name_lock); + ztest_record_enospc(FTAG); + return (error); + } + ASSERT(error == 0 || error == EEXIST); + + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, + B_TRUE, zd, &os)); + (void) pthread_rwlock_unlock(&ztest_name_lock); + + ztest_zd_init(zd, ZTEST_GET_SHARED_DS(d), os); + + zilog = zd->zd_zilog; + + if (zilog->zl_header->zh_claim_lr_seq != 0 && + zilog->zl_header->zh_claim_lr_seq < committed_seq) + fatal(0, "missing log records: claimed %llu < committed %llu", + zilog->zl_header->zh_claim_lr_seq, committed_seq); + + ztest_dataset_dirobj_verify(zd); + + zil_replay(os, zd, ztest_replay_vector); + + ztest_dataset_dirobj_verify(zd); + + if (ztest_opts.zo_verbose >= 6) + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + zd->zd_name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + + zilog = zil_open(os, ztest_get_data); + + if (zilog->zl_replaying_seq != 0 && + zilog->zl_replaying_seq < committed_seq) + fatal(0, "missing log records: replayed %llu < committed %llu", + zilog->zl_replaying_seq, committed_seq); + + return (0); +} + +static void +ztest_dataset_close(int d) +{ + ztest_ds_t *zd = &ztest_ds[d]; + + zil_close(zd->zd_zilog); + dmu_objset_disown(zd->zd_os, B_TRUE, zd); + + ztest_zd_fini(zd); +} + +/* ARGSUSED */ +static int +ztest_replay_zil_cb(const char *name, void *arg) +{ + objset_t *os; + ztest_ds_t *zdtmp; + + VERIFY0(ztest_dmu_objset_own(name, DMU_OST_ANY, B_TRUE, + B_TRUE, FTAG, &os)); + + zdtmp = umem_alloc(sizeof (ztest_ds_t), UMEM_NOFAIL); + + ztest_zd_init(zdtmp, NULL, os); + zil_replay(os, zdtmp, ztest_replay_vector); + ztest_zd_fini(zdtmp); + + if (dmu_objset_zil(os)->zl_parse_lr_count != 0 && + ztest_opts.zo_verbose >= 6) { + zilog_t *zilog = dmu_objset_zil(os); + + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + } + + umem_free(zdtmp, sizeof (ztest_ds_t)); + + dmu_objset_disown(os, B_TRUE, FTAG); + return (0); +} + +static void +ztest_freeze(void) +{ + ztest_ds_t *zd = &ztest_ds[0]; + spa_t *spa; + int numloops = 0; + + if (ztest_opts.zo_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(0)); + ztest_spa = spa; + + /* + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. + */ + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, 0); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Freeze the pool. 
This stops spa_sync() from doing anything, + * so that the only way to record changes from now on is the ZIL. + */ + spa_freeze(spa); + + /* + * Because it is hard to predict how much space a write will actually + * require beforehand, we leave ourselves some fudge space to write over + * capacity. + */ + uint64_t capacity = metaslab_class_get_space(spa_normal_class(spa)) / 2; + + /* + * Run tests that generate log records but don't alter the pool config + * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). + * We do a txg_wait_synced() after each iteration to force the txg + * to increase well beyond the last synced value in the uberblock. + * The ZIL should be OK with that. + * + * Run a random number of times less than zo_maxloops and ensure we do + * not run out of space on the pool. + */ + while (ztest_random(10) != 0 && + numloops++ < ztest_opts.zo_maxloops && + metaslab_class_get_alloc(spa_normal_class(spa)) < capacity) { + ztest_od_t od; + ztest_od_init(&od, 0, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0, 0); + VERIFY0(ztest_object_init(zd, &od, sizeof (od), B_FALSE)); + ztest_io(zd, od.od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + txg_wait_synced(spa_get_dsl(spa), 0); + } + + /* + * Commit all of the changes we just generated. + */ + zil_commit(zd->zd_zilog, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * Close our dataset and close the pool. + */ + ztest_dataset_close(0); + spa_close(spa, FTAG); + kernel_fini(); + + /* + * Open and close the pool and dataset to induce log replay. + */ + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + ASSERT(spa_freeze_txg(spa) == UINT64_MAX); + VERIFY3U(0, ==, ztest_dataset_open(0)); + ztest_spa = spa; + txg_wait_synced(spa_get_dsl(spa), 0); + ztest_dataset_close(0); + ztest_reguid(NULL, 0); + + spa_close(spa, FTAG); + kernel_fini(); +} + +static void +ztest_import_impl(ztest_shared_t *zs) +{ + importargs_t args = { 0 }; + nvlist_t *cfg = NULL; + int nsearch = 1; + char *searchdirs[nsearch]; + int flags = ZFS_IMPORT_MISSING_LOG; + + searchdirs[0] = ztest_opts.zo_dir; + args.paths = nsearch; + args.path = searchdirs; + args.can_be_active = B_FALSE; + + VERIFY0(zpool_find_config(NULL, ztest_opts.zo_pool, &cfg, &args, + &libzpool_config_ops)); + VERIFY0(spa_import(ztest_opts.zo_pool, cfg, NULL, flags)); +} + +/* + * Import a storage pool with the given name. + */ +static void +ztest_import(ztest_shared_t *zs) +{ + spa_t *spa; + + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + + ztest_import_impl(zs); + + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); + + kernel_fini(); + + if (!ztest_opts.zo_mmp_test) { + ztest_run_zdb(ztest_opts.zo_pool); + ztest_freeze(); + ztest_run_zdb(ztest_opts.zo_pool); + } + + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + +/* + * Kick off threads to run tests on all datasets in parallel. 
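+ *
+ * This opens the pool (importing it first if necessary), starts the
+ * resume and deadman helper threads, replays all dataset logs, runs
+ * zo_threads worker threads until zs_thread_stop, joins them, tears
+ * everything down, and sometimes exercises export/reimport under a
+ * new name on the way out.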
+ */ +static void +ztest_run(ztest_shared_t *zs) +{ + spa_t *spa; + objset_t *os; + kthread_t *resume_thread, *deadman_thread; + kthread_t **run_threads; + uint64_t object; + int error; + int t, d; + + ztest_exiting = B_FALSE; + + /* + * Initialize parent/child shared state. + */ + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = + zs->zs_thread_start + ztest_opts.zo_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < ztest_opts.zo_killrate) { + zs->zs_thread_kill -= + ztest_random(ztest_opts.zo_passtime * NANOSEC); + } + + mutex_init(&zcl.zcl_callbacks_lock, NULL, MUTEX_DEFAULT, NULL); + + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); + + /* + * Open our pool. It may need to be imported first depending on + * what tests were running when the previous pass was terminated. + */ + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + error = spa_open(ztest_opts.zo_pool, &spa, FTAG); + if (error) { + VERIFY3S(error, ==, ENOENT); + ztest_import_impl(zs); + VERIFY0(spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + } + + metaslab_preload_limit = ztest_random(20) + 1; + ztest_spa = spa; + + VERIFY0(vdev_raidz_impl_set("cycle")); + + dmu_objset_stats_t dds; + VERIFY0(ztest_dmu_objset_own(ztest_opts.zo_pool, + DMU_OST_ANY, B_TRUE, B_TRUE, FTAG, &os)); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + dmu_objset_fast_stat(os, &dds); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + zs->zs_guid = dds.dds_guid; + dmu_objset_disown(os, B_TRUE, FTAG); + + /* + * Create a thread to periodically resume suspended I/O. + */ + resume_thread = thread_create(NULL, 0, ztest_resume_thread, + spa, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + /* + * Create a deadman thread and set to panic if we hang. + */ + deadman_thread = thread_create(NULL, 0, ztest_deadman_thread, + zs, 0, NULL, TS_RUN | TS_JOINABLE, defclsyspri); + + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + + /* + * Verify that we can safely inquire about any object, + * whether it's allocated or not. To make it interesting, + * we probe a 5-wide window around each power of two. + * This hits all edge cases, including zero and the max. + */ + for (t = 0; t < 64; t++) { + for (d = -5; d <= 5; d++) { + error = dmu_object_info(spa->spa_meta_objset, + (1ULL << t) + d, NULL); + ASSERT(error == 0 || error == ENOENT || + error == EINVAL); + } + } + + /* + * If we got any ENOSPC errors on the previous run, destroy something. + */ + if (zs->zs_enospc_count != 0) { + int d = ztest_random(ztest_opts.zo_datasets); + ztest_dataset_destroy(d); + } + zs->zs_enospc_count = 0; + + /* + * If we were in the middle of ztest_device_removal() and were killed + * we need to ensure the removal and scrub complete before running + * any tests that check ztest_device_removal_active. The removal will + * be restarted automatically when the spa is opened, but we need to + * initiate the scrub manually if it is not already in progress. Note + * that we always run the scrub whenever an indirect vdev exists + * because we have no way of knowing for sure if ztest_device_removal() + * fully completed its scrub before the pool was reimported. 
+ */ + if (spa->spa_removing_phys.sr_state == DSS_SCANNING || + spa->spa_removing_phys.sr_prev_indirect_vdev != -1) { + while (spa->spa_removing_phys.sr_state == DSS_SCANNING) + txg_wait_synced(spa_get_dsl(spa), 0); + + error = ztest_scrub_impl(spa); + if (error == EBUSY) + error = 0; + ASSERT0(error); + } + + run_threads = umem_zalloc(ztest_opts.zo_threads * sizeof (kthread_t *), + UMEM_NOFAIL); + + if (ztest_opts.zo_verbose >= 4) + (void) printf("starting main threads...\n"); + + /* + * Replay all logs of all datasets in the pool. This is primarily for + * temporary datasets which wouldn't otherwise get replayed, which + * can trigger failures when attempting to offline a SLOG in + * ztest_fault_inject(). + */ + (void) dmu_objset_find(ztest_opts.zo_pool, ztest_replay_zil_cb, + NULL, DS_FIND_CHILDREN); + + /* + * Kick off all the tests that run in parallel. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets && ztest_dataset_open(t) != 0) { + umem_free(run_threads, ztest_opts.zo_threads * + sizeof (kthread_t *)); + return; + } + + run_threads[t] = thread_create(NULL, 0, ztest_thread, + (void *)(uintptr_t)t, 0, NULL, TS_RUN | TS_JOINABLE, + defclsyspri); + } + + /* + * Wait for all of the tests to complete. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) + VERIFY0(thread_join(run_threads[t])); + + /* + * Close all datasets. This must be done after all the threads + * are joined so we can be sure none of the datasets are in-use + * by any of the threads. + */ + for (t = 0; t < ztest_opts.zo_threads; t++) { + if (t < ztest_opts.zo_datasets) + ztest_dataset_close(t); + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(run_threads, ztest_opts.zo_threads * sizeof (kthread_t *)); + + /* Kill the resume and deadman threads */ + ztest_exiting = B_TRUE; + VERIFY0(thread_join(resume_thread)); + VERIFY0(thread_join(deadman_thread)); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (object = 1; object < 50; object++) { + dmu_prefetch(spa->spa_meta_objset, object, 0, 0, 1ULL << 20, + ZIO_PRIORITY_SYNC_READ); + } + + /* Verify that at least one commit cb was called in a timely fashion */ + if (zc_cb_counter >= ZTEST_COMMIT_CB_MIN_REG) + VERIFY0(zc_min_txg_delay); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. + */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (ztest_opts.zo_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. 
+ */ + if ((ztest_random(2) == 0) && !ztest_opts.zo_mmp_test) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(name, sizeof (name), "%s_import", + ztest_opts.zo_pool); + ztest_spa_import_export(ztest_opts.zo_pool, name); + ztest_spa_import_export(name, ztest_opts.zo_pool); + } + + kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + mutex_destroy(&zcl.zcl_callbacks_lock); + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + +static void +print_time(hrtime_t t, char *timebuf) +{ + hrtime_t s = t / NANOSEC; + hrtime_t m = s / 60; + hrtime_t h = m / 60; + hrtime_t d = h / 24; + + s -= m * 60; + m -= h * 60; + h -= d * 24; + + timebuf[0] = '\0'; + + if (d) + (void) sprintf(timebuf, + "%llud%02lluh%02llum%02llus", d, h, m, s); + else if (h) + (void) sprintf(timebuf, "%lluh%02llum%02llus", h, m, s); + else if (m) + (void) sprintf(timebuf, "%llum%02llus", m, s); + else + (void) sprintf(timebuf, "%llus", s); +} + +static nvlist_t * +make_random_props(void) +{ + nvlist_t *props; + + VERIFY0(nvlist_alloc(&props, NV_UNIQUE_NAME, 0)); + + if (ztest_random(2) == 0) + return (props); + + VERIFY0(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), 1)); + + return (props); +} + +/* + * Create a storage pool with the given name and initial vdev size. + * Then test spa_freeze() functionality. + */ +static void +ztest_init(ztest_shared_t *zs) +{ + spa_t *spa; + nvlist_t *nvroot, *props; + int i; + + mutex_init(&ztest_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ztest_checkpoint_lock, NULL, MUTEX_DEFAULT, NULL); + VERIFY0(pthread_rwlock_init(&ztest_name_lock, NULL)); + + kernel_init(SPA_MODE_READ | SPA_MODE_WRITE); + + /* + * Create the storage pool. + */ + (void) spa_destroy(ztest_opts.zo_pool); + ztest_shared->zs_vdev_next_leaf = 0; + zs->zs_splits = 0; + zs->zs_mirrors = ztest_opts.zo_mirrors; + nvroot = make_vdev_root(NULL, NULL, NULL, ztest_opts.zo_vdev_size, 0, + NULL, ztest_opts.zo_raidz, zs->zs_mirrors, 1); + props = make_random_props(); + + /* + * We don't expect the pool to suspend unless maxfaults == 0, + * in which case ztest_fault_inject() temporarily takes away + * the only valid replica. + */ + VERIFY0(nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), + MAXFAULTS(zs) ? ZIO_FAILURE_MODE_PANIC : ZIO_FAILURE_MODE_WAIT)); + + for (i = 0; i < SPA_FEATURES; i++) { + char *buf; + + /* + * 75% chance of using the log space map feature. We want ztest + * to exercise both the code paths that use the log space map + * feature and the ones that don't. 
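+		 * (ztest_random(4) == 0 is true one time in four, so the
+		 * continue below disables the feature for 25% of runs.)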
+ */ + if (i == SPA_FEATURE_LOG_SPACEMAP && ztest_random(4) == 0) + continue; + + VERIFY3S(-1, !=, asprintf(&buf, "feature@%s", + spa_feature_table[i].fi_uname)); + VERIFY3U(0, ==, nvlist_add_uint64(props, buf, 0)); + free(buf); + } + + VERIFY0(spa_create(ztest_opts.zo_pool, nvroot, props, NULL, NULL)); + nvlist_free(nvroot); + nvlist_free(props); + + VERIFY3U(0, ==, spa_open(ztest_opts.zo_pool, &spa, FTAG)); + zs->zs_metaslab_sz = + 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; + spa_close(spa, FTAG); + + kernel_fini(); + + if (!ztest_opts.zo_mmp_test) { + ztest_run_zdb(ztest_opts.zo_pool); + ztest_freeze(); + ztest_run_zdb(ztest_opts.zo_pool); + } + + (void) pthread_rwlock_destroy(&ztest_name_lock); + mutex_destroy(&ztest_vdev_lock); + mutex_destroy(&ztest_checkpoint_lock); +} + +static void +setup_data_fd(void) +{ + static char ztest_name_data[] = "/tmp/ztest.data.XXXXXX"; + + ztest_fd_data = mkstemp(ztest_name_data); + ASSERT3S(ztest_fd_data, >=, 0); + (void) unlink(ztest_name_data); +} + +static int +shared_data_size(ztest_shared_hdr_t *hdr) +{ + int size; + + size = hdr->zh_hdr_size; + size += hdr->zh_opts_size; + size += hdr->zh_size; + size += hdr->zh_stats_size * hdr->zh_stats_count; + size += hdr->zh_ds_size * hdr->zh_ds_count; + + return (size); +} + +static void +setup_hdr(void) +{ + int size; + ztest_shared_hdr_t *hdr; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + + VERIFY3U(0, ==, ftruncate(ztest_fd_data, sizeof (ztest_shared_hdr_t))); + + hdr->zh_hdr_size = sizeof (ztest_shared_hdr_t); + hdr->zh_opts_size = sizeof (ztest_shared_opts_t); + hdr->zh_size = sizeof (ztest_shared_t); + hdr->zh_stats_size = sizeof (ztest_shared_callstate_t); + hdr->zh_stats_count = ZTEST_FUNCS; + hdr->zh_ds_size = sizeof (ztest_shared_ds_t); + hdr->zh_ds_count = ztest_opts.zo_datasets; + + size = shared_data_size(hdr); + VERIFY3U(0, ==, ftruncate(ztest_fd_data, size)); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); +} + +static void +setup_data(void) +{ + int size, offset; + ztest_shared_hdr_t *hdr; + uint8_t *buf; + + hdr = (void *)mmap(0, P2ROUNDUP(sizeof (*hdr), getpagesize()), + PROT_READ, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + + size = shared_data_size(hdr); + + (void) munmap((caddr_t)hdr, P2ROUNDUP(sizeof (*hdr), getpagesize())); + hdr = ztest_shared_hdr = (void *)mmap(0, P2ROUNDUP(size, getpagesize()), + PROT_READ | PROT_WRITE, MAP_SHARED, ztest_fd_data, 0); + ASSERT(hdr != MAP_FAILED); + buf = (uint8_t *)hdr; + + offset = hdr->zh_hdr_size; + ztest_shared_opts = (void *)&buf[offset]; + offset += hdr->zh_opts_size; + ztest_shared = (void *)&buf[offset]; + offset += hdr->zh_size; + ztest_shared_callstate = (void *)&buf[offset]; + offset += hdr->zh_stats_size * hdr->zh_stats_count; + ztest_shared_ds = (void *)&buf[offset]; +} + +static boolean_t +exec_child(char *cmd, char *libpath, boolean_t ignorekill, int *statusp) +{ + pid_t pid; + int status; + char *cmdbuf = NULL; + + pid = fork(); + + if (cmd == NULL) { + cmdbuf = umem_alloc(MAXPATHLEN, UMEM_NOFAIL); + (void) strlcpy(cmdbuf, getexecname(), MAXPATHLEN); + cmd = cmdbuf; + } + + if (pid == -1) + fatal(1, "fork failed"); + + if (pid == 0) { /* child */ + char *emptyargv[2] = { cmd, NULL }; + char fd_data_str[12]; + + struct rlimit rl = { 1024, 1024 }; + (void) setrlimit(RLIMIT_NOFILE, &rl); + + (void) close(ztest_fd_rand); + VERIFY(11 >= snprintf(fd_data_str, 12, "%d", 
+		    ztest_fd_data));
+		VERIFY(0 == setenv("ZTEST_FD_DATA", fd_data_str, 1));
+
+		(void) enable_extended_FILE_stdio(-1, -1);
+		if (libpath != NULL)
+			VERIFY(0 == setenv("LD_LIBRARY_PATH", libpath, 1));
+		(void) execv(cmd, emptyargv);
+		ztest_dump_core = B_FALSE;
+		fatal(B_TRUE, "exec failed: %s", cmd);
+	}
+
+	if (cmdbuf != NULL) {
+		umem_free(cmdbuf, MAXPATHLEN);
+		cmd = NULL;
+	}
+
+	while (waitpid(pid, &status, 0) != pid)
+		continue;
+	if (statusp != NULL)
+		*statusp = status;
+
+	if (WIFEXITED(status)) {
+		if (WEXITSTATUS(status) != 0) {
+			(void) fprintf(stderr, "child exited with code %d\n",
+			    WEXITSTATUS(status));
+			exit(2);
+		}
+		return (B_FALSE);
+	} else if (WIFSIGNALED(status)) {
+		if (!ignorekill || WTERMSIG(status) != SIGKILL) {
+			(void) fprintf(stderr, "child died with signal %d\n",
+			    WTERMSIG(status));
+			exit(3);
+		}
+		return (B_TRUE);
+	} else {
+		(void) fprintf(stderr, "something strange happened to child\n");
+		exit(4);
+		/* NOTREACHED */
+	}
+}
+
+static void
+ztest_run_init(void)
+{
+	int i;
+
+	ztest_shared_t *zs = ztest_shared;
+
+	/*
+	 * Blow away any existing copy of zpool.cache
+	 */
+	(void) remove(spa_config_path);
+
+	if (ztest_opts.zo_init == 0) {
+		if (ztest_opts.zo_verbose >= 1)
+			(void) printf("Importing pool %s\n",
+			    ztest_opts.zo_pool);
+		ztest_import(zs);
+		return;
+	}
+
+	/*
+	 * Create and initialize our storage pool.
+	 */
+	for (i = 1; i <= ztest_opts.zo_init; i++) {
+		bzero(zs, sizeof (ztest_shared_t));
+		if (ztest_opts.zo_verbose >= 3 &&
+		    ztest_opts.zo_init != 1) {
+			(void) printf("ztest_init(), pass %d\n", i);
+		}
+		ztest_init(zs);
+	}
+}
+
+int
+main(int argc, char **argv)
+{
+	int kills = 0;
+	int iters = 0;
+	int older = 0;
+	int newer = 0;
+	ztest_shared_t *zs;
+	ztest_info_t *zi;
+	ztest_shared_callstate_t *zc;
+	char timebuf[100];
+	char numbuf[NN_NUMBUF_SZ];
+	char *cmd;
+	boolean_t hasalt;
+	int f;
+	char *fd_data_str = getenv("ZTEST_FD_DATA");
+	struct sigaction action;
+
+	(void) setvbuf(stdout, NULL, _IOLBF, 0);
+
+	dprintf_setup(&argc, argv);
+	zfs_deadman_synctime_ms = 300000;
+	zfs_deadman_checktime_ms = 30000;
+	/*
+	 * As two-word space map entries may not come up often (especially
+	 * if pool and vdev sizes are small) we want to force at least some
+	 * of them so the feature gets tested.
+	 */
+	zfs_force_some_double_word_sm_entries = B_TRUE;
+
+	/*
+	 * Verify that even extensively damaged split blocks with many
+	 * segments can be reconstructed in a reasonable amount of time
+	 * when reconstruction is known to be possible.
+	 *
+	 * Note: the lower this value is, the more damage we inflict, and
+	 * the more time ztest spends in recovering that damage.  We chose
+	 * to induce damage 1/100th of the time so recovery is tested but
+	 * not so frequently that ztest doesn't get to test other code paths.
+	 */
+	zfs_reconstruct_indirect_damage_fraction = 100;
+
+	action.sa_handler = sig_handler;
+	sigemptyset(&action.sa_mask);
+	action.sa_flags = 0;
+
+	if (sigaction(SIGSEGV, &action, NULL) < 0) {
+		(void) fprintf(stderr, "ztest: cannot catch SIGSEGV: %s.\n",
+		    strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	if (sigaction(SIGABRT, &action, NULL) < 0) {
+		(void) fprintf(stderr, "ztest: cannot catch SIGABRT: %s.\n",
+		    strerror(errno));
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Force random_get_bytes() to use /dev/urandom in order to prevent
+	 * ztest from needlessly depleting the system entropy pool.
+ */ + random_path = "/dev/urandom"; + ztest_fd_rand = open(random_path, O_RDONLY); + ASSERT3S(ztest_fd_rand, >=, 0); + + if (!fd_data_str) { + process_options(argc, argv); + + setup_data_fd(); + setup_hdr(); + setup_data(); + bcopy(&ztest_opts, ztest_shared_opts, + sizeof (*ztest_shared_opts)); + } else { + ztest_fd_data = atoi(fd_data_str); + setup_data(); + bcopy(ztest_shared_opts, &ztest_opts, sizeof (ztest_opts)); + } + ASSERT3U(ztest_opts.zo_datasets, ==, ztest_shared_hdr->zh_ds_count); + + /* Override location of zpool.cache */ + VERIFY(asprintf((char **)&spa_config_path, "%s/zpool.cache", + ztest_opts.zo_dir) != -1); + + ztest_ds = umem_alloc(ztest_opts.zo_datasets * sizeof (ztest_ds_t), + UMEM_NOFAIL); + zs = ztest_shared; + + if (fd_data_str) { + metaslab_force_ganging = ztest_opts.zo_metaslab_force_ganging; + metaslab_df_alloc_threshold = + zs->zs_metaslab_df_alloc_threshold; + + if (zs->zs_do_init) + ztest_run_init(); + else + ztest_run(zs); + exit(0); + } + + hasalt = (strlen(ztest_opts.zo_alt_ztest) != 0); + + if (ztest_opts.zo_verbose >= 1) { + (void) printf("%llu vdevs, %d datasets, %d threads," + " %llu seconds...\n", + (u_longlong_t)ztest_opts.zo_vdevs, + ztest_opts.zo_datasets, + ztest_opts.zo_threads, + (u_longlong_t)ztest_opts.zo_time); + } + + cmd = umem_alloc(MAXNAMELEN, UMEM_NOFAIL); + (void) strlcpy(cmd, getexecname(), MAXNAMELEN); + + zs->zs_do_init = B_TRUE; + if (strlen(ztest_opts.zo_alt_ztest) != 0) { + if (ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest for " + "initialization: %s\n", ztest_opts.zo_alt_ztest); + } + VERIFY(!exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_FALSE, NULL)); + } else { + VERIFY(!exec_child(NULL, NULL, B_FALSE, NULL)); + } + zs->zs_do_init = B_FALSE; + + zs->zs_proc_start = gethrtime(); + zs->zs_proc_stop = zs->zs_proc_start + ztest_opts.zo_time * NANOSEC; + + for (f = 0; f < ZTEST_FUNCS; f++) { + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) + zc->zc_next = UINT64_MAX; + else + zc->zc_next = zs->zs_proc_start + + ztest_random(2 * zi->zi_interval[0] + 1); + } + + /* + * Run the tests in a loop. These tests include fault injection + * to verify that self-healing data works, and forced crashes + * to verify that we never lose on-disk consistency. + */ + while (gethrtime() < zs->zs_proc_stop) { + int status; + boolean_t killed; + + /* + * Initialize the workload counters for each function. 
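+		 * Each pass starts from zeroed zc_count and zc_time so the
+		 * workload summary printed after the child exits reflects
+		 * only that pass.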
+ */ + for (f = 0; f < ZTEST_FUNCS; f++) { + zc = ZTEST_GET_SHARED_CALLSTATE(f); + zc->zc_count = 0; + zc->zc_time = 0; + } + + /* Set the allocation switch size */ + zs->zs_metaslab_df_alloc_threshold = + ztest_random(zs->zs_metaslab_sz / 4) + 1; + + if (!hasalt || ztest_random(2) == 0) { + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing newer ztest: %s\n", + cmd); + } + newer++; + killed = exec_child(cmd, NULL, B_TRUE, &status); + } else { + if (hasalt && ztest_opts.zo_verbose >= 1) { + (void) printf("Executing older ztest: %s\n", + ztest_opts.zo_alt_ztest); + } + older++; + killed = exec_child(ztest_opts.zo_alt_ztest, + ztest_opts.zo_alt_libpath, B_TRUE, &status); + } + + if (killed) + kills++; + iters++; + + if (ztest_opts.zo_verbose >= 1) { + hrtime_t now = gethrtime(); + + now = MIN(now, zs->zs_proc_stop); + print_time(zs->zs_proc_stop - now, timebuf); + nicenum(zs->zs_space, numbuf, sizeof (numbuf)); + + (void) printf("Pass %3d, %8s, %3llu ENOSPC, " + "%4.1f%% of %5s used, %3.0f%% done, %8s to go\n", + iters, + WIFEXITED(status) ? "Complete" : "SIGKILL", + (u_longlong_t)zs->zs_enospc_count, + 100.0 * zs->zs_alloc / zs->zs_space, + numbuf, + 100.0 * (now - zs->zs_proc_start) / + (ztest_opts.zo_time * NANOSEC), timebuf); + } + + if (ztest_opts.zo_verbose >= 2) { + (void) printf("\nWorkload summary:\n\n"); + (void) printf("%7s %9s %s\n", + "Calls", "Time", "Function"); + (void) printf("%7s %9s %s\n", + "-----", "----", "--------"); + for (f = 0; f < ZTEST_FUNCS; f++) { + zi = &ztest_info[f]; + zc = ZTEST_GET_SHARED_CALLSTATE(f); + print_time(zc->zc_time, timebuf); + (void) printf("%7llu %9s %s\n", + (u_longlong_t)zc->zc_count, timebuf, + zi->zi_funcname); + } + (void) printf("\n"); + } + + if (!ztest_opts.zo_mmp_test) + ztest_run_zdb(ztest_opts.zo_pool); + } + + if (ztest_opts.zo_verbose >= 1) { + if (hasalt) { + (void) printf("%d runs of older ztest: %s\n", older, + ztest_opts.zo_alt_ztest); + (void) printf("%d runs of newer ztest: %s\n", newer, + cmd); + } + (void) printf("%d killed, %d completed, %.0f%% kill rate\n", + kills, iters - kills, (100.0 * kills) / MAX(1, iters)); + } + + umem_free(cmd, MAXNAMELEN); + + return (0); +} diff --git a/cmd/zvol_id/.gitignore b/cmd/zvol_id/.gitignore new file mode 100644 index 000000000000..8b757a2d6781 --- /dev/null +++ b/cmd/zvol_id/.gitignore @@ -0,0 +1 @@ +zvol_id diff --git a/cmd/zvol_id/Makefile.am b/cmd/zvol_id/Makefile.am new file mode 100644 index 000000000000..a584875081eb --- /dev/null +++ b/cmd/zvol_id/Makefile.am @@ -0,0 +1,10 @@ +include $(top_srcdir)/config/Rules.am + +# Disable GCC stack protection for zvol_id. This is a kludge and should be +# removed once https://github.com/zfsonlinux/zfs/issues/569 is resolved. +AM_CFLAGS += -fno-stack-protector + +udev_PROGRAMS = zvol_id + +zvol_id_SOURCES = \ + zvol_id_main.c diff --git a/cmd/zvol_id/zvol_id_main.c b/cmd/zvol_id/zvol_id_main.c new file mode 100644 index 000000000000..4a2d74cc203c --- /dev/null +++ b/cmd/zvol_id/zvol_id_main.c @@ -0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2011, Fajar A. Nugraha.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <ctype.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/ioctl.h>
+#include <sys/stat.h>
+#include <sys/sysmacros.h>
+#include <linux/fs.h>
+#include <sys/fs/zfs.h>
+
+static int
+ioctl_get_msg(char *var, int fd)
+{
+	int error = 0;
+	char msg[ZFS_MAX_DATASET_NAME_LEN];
+
+	error = ioctl(fd, BLKZNAME, msg);
+	if (error < 0) {
+		return (error);
+	}
+
+	snprintf(var, ZFS_MAX_DATASET_NAME_LEN, "%s", msg);
+	return (error);
+}
+
+int
+main(int argc, char **argv)
+{
+	int fd, error = 0;
+	char zvol_name[ZFS_MAX_DATASET_NAME_LEN];
+	char *zvol_name_part = NULL;
+	char *dev_name;
+	struct stat64 statbuf;
+	int dev_minor, dev_part;
+	int i;
+	int rc;
+
+	if (argc < 2) {
+		printf("Usage: %s /dev/zvol_device_node\n", argv[0]);
+		return (EINVAL);
+	}
+
+	dev_name = argv[1];
+	error = stat64(dev_name, &statbuf);
+	if (error != 0) {
+		printf("Unable to access device file: %s\n", dev_name);
+		return (errno);
+	}
+
+	dev_minor = minor(statbuf.st_rdev);
+	dev_part = dev_minor % ZVOL_MINORS;
+
+	fd = open(dev_name, O_RDONLY);
+	if (fd < 0) {
+		printf("Unable to open device file: %s\n", dev_name);
+		return (errno);
+	}
+
+	error = ioctl_get_msg(zvol_name, fd);
+	if (error < 0) {
+		printf("ioctl_get_msg failed: %s\n", strerror(errno));
+		return (errno);
+	}
+	if (dev_part > 0)
+		rc = asprintf(&zvol_name_part, "%s-part%d", zvol_name,
+		    dev_part);
+	else
+		rc = asprintf(&zvol_name_part, "%s", zvol_name);
+
+	if (rc == -1 || zvol_name_part == NULL)
+		goto error;
+
+	for (i = 0; i < strlen(zvol_name_part); i++) {
+		if (isblank(zvol_name_part[i]))
+			zvol_name_part[i] = '+';
+	}
+
+	printf("%s\n", zvol_name_part);
+	free(zvol_name_part);
+error:
+	close(fd);
+	return (error);
+}
diff --git a/cmd/zvol_wait/Makefile.am b/cmd/zvol_wait/Makefile.am
new file mode 100644
index 000000000000..564031c9799d
--- /dev/null
+++ b/cmd/zvol_wait/Makefile.am
@@ -0,0 +1 @@
+dist_bin_SCRIPTS = zvol_wait
diff --git a/cmd/zvol_wait/zvol_wait b/cmd/zvol_wait/zvol_wait
new file mode 100755
index 000000000000..9a3948da5564
--- /dev/null
+++ b/cmd/zvol_wait/zvol_wait
@@ -0,0 +1,116 @@
+#!/bin/sh
+
+count_zvols() {
+	if [ -z "$zvols" ]; then
+		echo 0
+	else
+		echo "$zvols" | wc -l
+	fi
+}
+
+filter_out_zvols_with_links() {
+	while read -r zvol; do
+		if [ ! -L "/dev/zvol/$zvol" ]; then
+			echo "$zvol"
+		fi
+	done
+}
+
+filter_out_deleted_zvols() {
+	while read -r zvol; do
+		if zfs list "$zvol" >/dev/null 2>&1; then
+			echo "$zvol"
+		fi
+	done
+}
+
+list_zvols() {
+	zfs list -t volume -H -o \
+	    name,volmode,receive_resume_token,redact_snaps |
+	while read -r zvol_line; do
+		name=$(echo "$zvol_line" | awk '{print $1}')
+		volmode=$(echo "$zvol_line" | awk '{print $2}')
+		token=$(echo "$zvol_line" | awk '{print $3}')
+		redacted=$(echo "$zvol_line" | awk '{print $4}')
+		#
+		# /dev links are not created for zvols with volmode = "none"
+		# or for redacted zvols.
+		#
+		[ "$volmode" = "none" ] && continue
+		[ "$redacted" = "-" ] || continue
+		#
+		# We also ignore partially received zvols unless the receive
+		# is incremental, as a fresh receive won't even have a block
+		# device minor node created yet.
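+		# (receive_resume_token is "-" unless a receive was
+		# interrupted; the $name/%recv check below keeps interrupted
+		# incremental receives, whose target zvol already exists.)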
+ # + if [ "$token" != "-" ]; then + # + # Incremental receives create an invisible clone that + # is not automatically displayed by zfs list. + # + if ! zfs list "$name/%recv" >/dev/null 2>&1; then + continue + fi + fi + echo "$name" + done +} + +zvols=$(list_zvols) +zvols_count=$(count_zvols) +if [ "$zvols_count" -eq 0 ]; then + echo "No zvols found, nothing to do." + exit 0 +fi + +echo "Testing $zvols_count zvol links" + +outer_loop=0 +while [ "$outer_loop" -lt 20 ]; do + outer_loop=$((outer_loop + 1)) + + old_zvols_count=$(count_zvols) + + inner_loop=0 + while [ "$inner_loop" -lt 30 ]; do + inner_loop=$((inner_loop + 1)) + + zvols="$(echo "$zvols" | filter_out_zvols_with_links)" + + zvols_count=$(count_zvols) + if [ "$zvols_count" -eq 0 ]; then + echo "All zvol links are now present." + exit 0 + fi + sleep 1 + done + + echo "Still waiting on $zvols_count zvol links ..." + # + # Although zvols should normally not be deleted at boot time, + # if that is the case then their links will be missing and + # we would stall. + # + if [ "$old_zvols_count" -eq "$zvols_count" ]; then + echo "No progress since last loop." + echo "Checking if any zvols were deleted." + + zvols=$(echo "$zvols" | filter_out_deleted_zvols) + zvols_count=$(count_zvols) + + if [ "$old_zvols_count" -ne "$zvols_count" ]; then + echo "$((old_zvols_count - zvols_count)) zvol(s) deleted." + fi + + if [ "$zvols_count" -ne 0 ]; then + echo "Remaining zvols:" + echo "$zvols" + else + echo "All zvol links are now present." + exit 0 + fi + fi +done + +echo "Timed out waiting on zvol links" +exit 1 diff --git a/config/.gitignore b/config/.gitignore new file mode 100644 index 000000000000..cd811a0a956a --- /dev/null +++ b/config/.gitignore @@ -0,0 +1,9 @@ +/compile +/config.guess +/config.sub +/depcomp +/install-sh +/ltmain.sh +/missing +/libtool.m4 +/lt*.m4 diff --git a/config/Rules.am b/config/Rules.am new file mode 100644 index 000000000000..b02511a05298 --- /dev/null +++ b/config/Rules.am @@ -0,0 +1,62 @@ +# +# Default build rules for all user space components, every Makefile.am +# should include these rules and override or extend them as needed. 
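+#
+# A minimal sketch of a consumer Makefile.am (the program name here is
+# illustrative only, not a file from this import):
+#
+#	include $(top_srcdir)/config/Rules.am
+#
+#	AM_CFLAGS += $(FRAME_LARGER_THAN)
+#
+#	sbin_PROGRAMS = zexample
+#	zexample_SOURCES = zexample.c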
+# + +DEFAULT_INCLUDES = \ + -include $(top_builddir)/zfs_config.h \ + -I$(top_builddir)/include \ + -I$(top_srcdir)/include \ + -I$(top_srcdir)/module/icp/include \ + -I$(top_srcdir)/lib/libspl/include + +if BUILD_LINUX +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/lib/libspl/include/os/linux +endif + +if BUILD_FREEBSD +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/lib/libspl/include/os/freebsd +endif + +AM_LIBTOOLFLAGS = --silent + +AM_CFLAGS = -std=gnu99 -Wall -Wstrict-prototypes -Wmissing-prototypes +AM_CFLAGS += -fno-strict-aliasing +AM_CFLAGS += $(NO_OMIT_FRAME_POINTER) +AM_CFLAGS += $(DEBUG_CFLAGS) +AM_CFLAGS += $(ASAN_CFLAGS) +AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) $(NO_FORMAT_ZERO_LENGTH) +if BUILD_FREEBSD +AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion +AM_CFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h +AM_CFLAGS += -I/usr/include -I/usr/local/include +endif + +AM_CPPFLAGS = -D_GNU_SOURCE +AM_CPPFLAGS += -D_REENTRANT +AM_CPPFLAGS += -D_FILE_OFFSET_BITS=64 +AM_CPPFLAGS += -D_LARGEFILE64_SOURCE +AM_CPPFLAGS += -DHAVE_LARGE_STACKS=1 +AM_CPPFLAGS += -DLIBEXECDIR=\"$(libexecdir)\" +AM_CPPFLAGS += -DRUNSTATEDIR=\"$(runstatedir)\" +AM_CPPFLAGS += -DSBINDIR=\"$(sbindir)\" +AM_CPPFLAGS += -DSYSCONFDIR=\"$(sysconfdir)\" +AM_CPPFLAGS += $(DEBUG_CPPFLAGS) +AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) +if BUILD_LINUX +AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-linux-user\" +endif +if BUILD_FREEBSD +AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-freebsd-user\" +endif + +AM_LDFLAGS = $(DEBUG_LDFLAGS) +AM_LDFLAGS += $(ASAN_LDFLAGS) + +if BUILD_FREEBSD +AM_LDFLAGS += -fstack-protector-strong -shared +AM_LDFLAGS += -Wl,-x -Wl,--fatal-warnings -Wl,--warn-shared-textrel +AM_LDFLAGS += -lm +endif diff --git a/config/Substfiles.am b/config/Substfiles.am new file mode 100644 index 000000000000..63697bfa2b6a --- /dev/null +++ b/config/Substfiles.am @@ -0,0 +1,34 @@ +subst_sed_cmd = \ + -e 's|@bindir[@]|$(bindir)|g' \ + -e 's|@sbindir[@]|$(sbindir)|g' \ + -e 's|@datadir[@]|$(datadir)|g' \ + -e 's|@sysconfdir[@]|$(sysconfdir)|g' \ + -e 's|@runstatedir[@]|$(runstatedir)|g' \ + -e 's|@initconfdir[@]|$(initconfdir)|g' \ + -e 's|@initdir[@]|$(initdir)|g' \ + -e 's|@mounthelperdir[@]|$(mounthelperdir)|g' \ + -e 's|@systemdgeneratordir[@]|$(systemdgeneratordir)|g' \ + -e 's|@systemdunitdir[@]|$(systemdunitdir)|g' \ + -e 's|@udevdir[@]|$(udevdir)|g' \ + -e 's|@udevruledir[@]|$(udevruledir)|g' \ + -e 's|@zfsexecdir[@]|$(zfsexecdir)|g' \ + -e 's|@PYTHON[@]|$(PYTHON)|g' \ + -e 's|@PYTHON_SHEBANG[@]|$(PYTHON_SHEBANG)|g' \ + -e 's|@DEFAULT_INIT_NFS_SERVER[@]|$(DEFAULT_INIT_NFS_SERVER)|g' \ + -e 's|@DEFAULT_INIT_SHELL[@]|$(DEFAULT_INIT_SHELL)|g' + +SUBSTFILES = +CLEANFILES = $(SUBSTFILES) +EXTRA_DIST = $(SUBSTFILES:=.in) + +$(SUBSTFILES):%:%.in Makefile + $(AM_V_GEN)set -e; \ + $(MKDIR_P) $$(dirname $@); \ + $(RM) $@~; \ + $(SED) $(subst_sed_cmd) $< >$@~; \ + if grep -E '@[a-zA-Z0-9_]+@' $@~ >&2; then \ + echo "Undefined substitution" >&2; \ + exit 1; \ + else test $$? -eq 1; fi; \ + test -x $< && chmod +x $@~; \ + mv -f $@~ $@ diff --git a/config/always-arch.m4 b/config/always-arch.m4 new file mode 100644 index 000000000000..25e8c963a4b4 --- /dev/null +++ b/config/always-arch.m4 @@ -0,0 +1,41 @@ +dnl # +dnl # Set the target cpu architecture. This allows the +dnl # following syntax to be used in a Makefile.am. +dnl # +dnl # ifeq ($(TARGET_CPU),x86_64) +dnl # ... +dnl # endif +dnl # +dnl # if TARGET_CPU_POWERPC +dnl # ... +dnl # else +dnl # ... 
+dnl # endif
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_ARCH], [
+	case $target_cpu in
+	i?86)
+		TARGET_CPU=i386
+		;;
+	amd64|x86_64)
+		TARGET_CPU=x86_64
+		;;
+	powerpc*)
+		TARGET_CPU=powerpc
+		;;
+	aarch64*)
+		TARGET_CPU=aarch64
+		;;
+	sparc64)
+		TARGET_CPU=sparc64
+		;;
+	esac
+
+	AC_SUBST(TARGET_CPU)
+
+	AM_CONDITIONAL([TARGET_CPU_I386], test $TARGET_CPU = i386)
+	AM_CONDITIONAL([TARGET_CPU_X86_64], test $TARGET_CPU = x86_64)
+	AM_CONDITIONAL([TARGET_CPU_POWERPC], test $TARGET_CPU = powerpc)
+	AM_CONDITIONAL([TARGET_CPU_AARCH64], test $TARGET_CPU = aarch64)
+	AM_CONDITIONAL([TARGET_CPU_SPARC64], test $TARGET_CPU = sparc64)
+])
diff --git a/config/always-compiler-options.m4 b/config/always-compiler-options.m4
new file mode 100644
index 000000000000..a84123317989
--- /dev/null
+++ b/config/always-compiler-options.m4
@@ -0,0 +1,204 @@
+dnl #
+dnl # Enable -fsanitize=address if supported by gcc.
+dnl #
+dnl # LDFLAGS needs -fsanitize=address at all times so libraries compiled with
+dnl # it will be linked successfully. CFLAGS will vary by binary being built.
+dnl #
+dnl # The ASAN_OPTIONS environment variable can be used to further control
+dnl # the behavior of binaries and libraries built with -fsanitize=address.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_ASAN], [
+	AC_MSG_CHECKING([whether to build with -fsanitize=address support])
+	AC_ARG_ENABLE([asan],
+		[AS_HELP_STRING([--enable-asan],
+		[Enable -fsanitize=address support @<:@default=no@:>@])],
+		[],
+		[enable_asan=no])
+
+	AM_CONDITIONAL([ASAN_ENABLED], [test x$enable_asan = xyes])
+	AC_SUBST([ASAN_ENABLED], [$enable_asan])
+	AC_MSG_RESULT($enable_asan)
+
+	AS_IF([ test "$enable_asan" = "yes" ], [
+		AC_MSG_CHECKING([whether $CC supports -fsanitize=address])
+		saved_cflags="$CFLAGS"
+		CFLAGS="$CFLAGS -Werror -fsanitize=address"
+		AC_LINK_IFELSE([
+			AC_LANG_SOURCE([[ int main() { return 0; } ]])
+		], [
+			ASAN_CFLAGS="-fsanitize=address"
+			ASAN_LDFLAGS="-fsanitize=address"
+			ASAN_ZFS="_with_asan"
+			AC_MSG_RESULT([yes])
+		], [
+			AC_MSG_ERROR([$CC does not support -fsanitize=address])
+		])
+		CFLAGS="$saved_cflags"
+	], [
+		ASAN_CFLAGS=""
+		ASAN_LDFLAGS=""
+		ASAN_ZFS="_without_asan"
+	])
+
+	AC_SUBST([ASAN_CFLAGS])
+	AC_SUBST([ASAN_LDFLAGS])
+	AC_SUBST([ASAN_ZFS])
+])
+
+dnl #
+dnl # Check if gcc supports -Wframe-larger-than= option.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN], [
+	AC_MSG_CHECKING([whether $CC supports -Wframe-larger-than=])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -Wframe-larger-than=4096"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		FRAME_LARGER_THAN="-Wframe-larger-than=4096"
+		AC_MSG_RESULT([yes])
+	], [
+		FRAME_LARGER_THAN=""
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([FRAME_LARGER_THAN])
+])
+
+dnl #
+dnl # Check if gcc supports -Wno-format-truncation option.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION], [
+	AC_MSG_CHECKING([whether $CC supports -Wno-format-truncation])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -Wno-format-truncation"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		NO_FORMAT_TRUNCATION=-Wno-format-truncation
+		AC_MSG_RESULT([yes])
+	], [
+		NO_FORMAT_TRUNCATION=
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([NO_FORMAT_TRUNCATION])
+])
+
+dnl #
+dnl # Check if gcc supports -Wno-format-zero-length option.
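+dnl #
+dnl # Like the other flag probes in this file, the check temporarily
+dnl # appends the candidate flag (plus -Werror) to CFLAGS, compiles an
+dnl # empty program, and substitutes the flag only on success.  Roughly
+dnl # the shell equivalent (a sketch, not part of the build):
+dnl #
+dnl #	echo 'int main(void) { return 0; }' >conftest.c
+dnl #	if $CC -Werror -Wno-format-zero-length -c conftest.c -o conftest.o
+dnl #	then NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length
+dnl #	else NO_FORMAT_ZERO_LENGTH=
+dnl #	fi
+dnl #	rm -f conftest.c conftest.o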
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH], [
+	AC_MSG_CHECKING([whether $CC supports -Wno-format-zero-length])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -Wno-format-zero-length"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		NO_FORMAT_ZERO_LENGTH=-Wno-format-zero-length
+		AC_MSG_RESULT([yes])
+	], [
+		NO_FORMAT_ZERO_LENGTH=
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([NO_FORMAT_ZERO_LENGTH])
+])
+
+
+dnl #
+dnl # Check if gcc supports -Wno-bool-compare option.
+dnl #
+dnl # We actually invoke gcc with the -Wbool-compare option
+dnl # and infer whether the 'no-' version exists based upon
+dnl # the results.  This is required because gcc always returns
+dnl # success when checking any 'no-' prefixed option.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE], [
+	AC_MSG_CHECKING([whether $CC supports -Wno-bool-compare])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -Wbool-compare"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		NO_BOOL_COMPARE=-Wno-bool-compare
+		AC_MSG_RESULT([yes])
+	], [
+		NO_BOOL_COMPARE=
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([NO_BOOL_COMPARE])
+])
+
+dnl #
+dnl # Check if gcc supports -Wno-unused-but-set-variable option.
+dnl #
+dnl # We actually invoke gcc with the -Wunused-but-set-variable option
+dnl # and infer whether the 'no-' version exists based upon
+dnl # the results.  This is required because gcc always returns
+dnl # success when checking any 'no-' prefixed option.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE], [
+	AC_MSG_CHECKING([whether $CC supports -Wno-unused-but-set-variable])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -Wunused-but-set-variable"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		NO_UNUSED_BUT_SET_VARIABLE=-Wno-unused-but-set-variable
+		AC_MSG_RESULT([yes])
+	], [
+		NO_UNUSED_BUT_SET_VARIABLE=
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([NO_UNUSED_BUT_SET_VARIABLE])
+])
+
+dnl #
+dnl # Check if gcc supports -fno-omit-frame-pointer option.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER], [
+	AC_MSG_CHECKING([whether $CC supports -fno-omit-frame-pointer])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -fno-omit-frame-pointer"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		NO_OMIT_FRAME_POINTER=-fno-omit-frame-pointer
+		AC_MSG_RESULT([yes])
+	], [
+		NO_OMIT_FRAME_POINTER=
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([NO_OMIT_FRAME_POINTER])
+])
+
+dnl #
+dnl # Check if cc supports -fno-ipa-sra option.
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA], [
+	AC_MSG_CHECKING([whether $CC supports -fno-ipa-sra])
+
+	saved_flags="$CFLAGS"
+	CFLAGS="$CFLAGS -Werror -fno-ipa-sra"
+
+	AC_COMPILE_IFELSE([AC_LANG_PROGRAM([], [])], [
+		NO_IPA_SRA=-fno-ipa-sra
+		AC_MSG_RESULT([yes])
+	], [
+		NO_IPA_SRA=
+		AC_MSG_RESULT([no])
+	])
+
+	CFLAGS="$saved_flags"
+	AC_SUBST([NO_IPA_SRA])
+])
diff --git a/config/always-python.m4 b/config/always-python.m4
new file mode 100644
index 000000000000..c01e631a8f4f
--- /dev/null
+++ b/config/always-python.m4
@@ -0,0 +1,70 @@
+dnl #
+dnl # The majority of the python scripts are written to be compatible
+dnl # with Python 2.6 and Python 3.4.  Therefore, they may be installed
+dnl # and used with either interpreter.  This option is intended to
+dnl # provide a method to specify the default system version, and
+dnl # set the PYTHON environment variable accordingly.
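+dnl #
+dnl # For example (invocations illustrative):
+dnl #
+dnl #	./configure --with-python=3	# use "python3"
+dnl #	./configure --with-python=2.7	# use "python2.7"
+dnl #	./configure --with-python=no	# disable the python scripts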
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYTHON], [ + AC_ARG_WITH([python], + AC_HELP_STRING([--with-python[=VERSION]], + [default system python version @<:@default=check@:>@]), + [with_python=$withval], + [with_python=check]) + + AS_CASE([$with_python], + [check], [AC_CHECK_PROGS([PYTHON], [python3 python2], [:])], + [2*], [PYTHON="python${with_python}"], + [*python2*], [PYTHON="${with_python}"], + [3*], [PYTHON="python${with_python}"], + [*python3*], [PYTHON="${with_python}"], + [no], [PYTHON=":"], + [AC_MSG_ERROR([Unknown --with-python value '$with_python'])] + ) + + dnl # + dnl # Minimum supported Python versions for utilities: + dnl # Python 2.6 or Python 3.4 + dnl # + AM_PATH_PYTHON([], [], [:]) + AS_IF([test -z "$PYTHON_VERSION"], [ + PYTHON_VERSION=$(basename $PYTHON | tr -cd 0-9.) + ]) + PYTHON_MINOR=${PYTHON_VERSION#*\.} + + AS_CASE([$PYTHON_VERSION], + [2.*], [ + AS_IF([test $PYTHON_MINOR -lt 6], + [AC_MSG_ERROR("Python >= 2.6 is required")]) + ], + [3.*], [ + AS_IF([test $PYTHON_MINOR -lt 4], + [AC_MSG_ERROR("Python >= 3.4 is required")]) + ], + [:|2|3], [], + [PYTHON_VERSION=3] + ) + + AM_CONDITIONAL([USING_PYTHON], [test "$PYTHON" != :]) + AM_CONDITIONAL([USING_PYTHON_2], [test "x${PYTHON_VERSION%%\.*}" = x2]) + AM_CONDITIONAL([USING_PYTHON_3], [test "x${PYTHON_VERSION%%\.*}" = x3]) + + AM_COND_IF([USING_PYTHON_2], + [AC_SUBST([PYTHON_SHEBANG], [python2])], + [AC_SUBST([PYTHON_SHEBANG], [python3])]) + + dnl # + dnl # Request that packages be built for a specific Python version. + dnl # + AS_IF([test "x$with_python" != xcheck], [ + PYTHON_PKG_VERSION=$(echo $PYTHON_VERSION | tr -d .) + DEFINE_PYTHON_PKG_VERSION='--define "__use_python_pkg_version '${PYTHON_PKG_VERSION}'"' + DEFINE_PYTHON_VERSION='--define "__use_python '${PYTHON}'"' + ], [ + DEFINE_PYTHON_VERSION='' + DEFINE_PYTHON_PKG_VERSION='' + ]) + + AC_SUBST(DEFINE_PYTHON_VERSION) + AC_SUBST(DEFINE_PYTHON_PKG_VERSION) +]) diff --git a/config/always-pyzfs.m4 b/config/always-pyzfs.m4 new file mode 100644 index 000000000000..f620a8f9a18b --- /dev/null +++ b/config/always-pyzfs.m4 @@ -0,0 +1,105 @@ +dnl # +dnl # ZFS_AC_PYTHON_MODULE(module_name, [action-if-true], [action-if-false]) +dnl # +dnl # Checks for Python module. Freely inspired by AX_PYTHON_MODULE +dnl # https://www.gnu.org/software/autoconf-archive/ax_python_module.html +dnl # Required by ZFS_AC_CONFIG_ALWAYS_PYZFS. +dnl # +AC_DEFUN([ZFS_AC_PYTHON_MODULE], [ + PYTHON_NAME=$(basename $PYTHON) + AC_MSG_CHECKING([for $PYTHON_NAME module: $1]) + AS_IF([$PYTHON -c "import $1" 2>/dev/null], [ + AC_MSG_RESULT(yes) + m4_ifvaln([$2], [$2]) + ], [ + AC_MSG_RESULT(no) + m4_ifvaln([$3], [$3]) + ]) +]) + +dnl # +dnl # Determines if pyzfs can be built, requires Python 2.7 or later. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_PYZFS], [ + AC_ARG_ENABLE([pyzfs], + AC_HELP_STRING([--enable-pyzfs], + [install libzfs_core python bindings @<:@default=check@:>@]), + [enable_pyzfs=$enableval], + [enable_pyzfs=check]) + + dnl # + dnl # Packages for pyzfs specifically enabled/disabled. 
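+	dnl # (Illustratively: `./configure --enable-pyzfs` makes the later
+	dnl # python-devel/setuptools/cffi checks hard failures, while
+	dnl # `--disable-pyzfs` skips the bindings entirely.)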
+ dnl # + AS_IF([test "x$enable_pyzfs" != xcheck], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + DEFINE_PYZFS='--with pyzfs' + ], [ + DEFINE_PYZFS='--without pyzfs' + ]) + ], [ + AS_IF([test "$PYTHON" != :], [ + DEFINE_PYZFS='' + ], [ + enable_pyzfs=no + DEFINE_PYZFS='--without pyzfs' + ]) + ]) + AC_SUBST(DEFINE_PYZFS) + + dnl # + dnl # Require python-devel libraries + dnl # + AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ + AS_CASE([$PYTHON_VERSION], + [3.*], [PYTHON_REQUIRED_VERSION=">= '3.4.0'"], + [2.*], [PYTHON_REQUIRED_VERSION=">= '2.7.0'"], + [AC_MSG_ERROR("Python $PYTHON_VERSION unknown")] + ) + + AX_PYTHON_DEVEL([$PYTHON_REQUIRED_VERSION], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AC_MSG_ERROR("Python $PYTHON_REQUIRED_VERSION development library is not installed") + ], [test "x$enable_pyzfs" != xno], [ + enable_pyzfs=no + ]) + ]) + ]) + + dnl # + dnl # Python "setuptools" module is required to build and install pyzfs + dnl # + AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ + ZFS_AC_PYTHON_MODULE([setuptools], [], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AC_MSG_ERROR("Python $PYTHON_VERSION setuptools is not installed") + ], [test "x$enable_pyzfs" != xno], [ + enable_pyzfs=no + ]) + ]) + ]) + + dnl # + dnl # Python "cffi" module is required to run pyzfs + dnl # + AS_IF([test "x$enable_pyzfs" = xcheck -o "x$enable_pyzfs" = xyes], [ + ZFS_AC_PYTHON_MODULE([cffi], [], [ + AS_IF([test "x$enable_pyzfs" = xyes], [ + AC_MSG_ERROR("Python $PYTHON_VERSION cffi is not installed") + ], [test "x$enable_pyzfs" != xno], [ + enable_pyzfs=no + ]) + ]) + ]) + + dnl # + dnl # Set enable_pyzfs to 'yes' if every check passed + dnl # + AS_IF([test "x$enable_pyzfs" = xcheck], [enable_pyzfs=yes]) + + AM_CONDITIONAL([PYZFS_ENABLED], [test "x$enable_pyzfs" = xyes]) + AC_SUBST([PYZFS_ENABLED], [$enable_pyzfs]) + AC_SUBST(pythonsitedir, [$PYTHON_SITE_PKG]) + + AC_MSG_CHECKING([whether to enable pyzfs: ]) + AC_MSG_RESULT($enable_pyzfs) +]) diff --git a/config/always-sed.m4 b/config/always-sed.m4 new file mode 100644 index 000000000000..19633e118aed --- /dev/null +++ b/config/always-sed.m4 @@ -0,0 +1,16 @@ +dnl # +dnl # Set the flags used for sed in-place edits. 
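+dnl #
+dnl # GNU sed spells this --in-place, while BSD sed wants -i '' with an
+dnl # explicit empty backup suffix, so the check below probes both forms:
+dnl #
+dnl #	sed --in-place 's#foo#bar#' file	# GNU sed
+dnl #	sed -i '' 's#foo#bar#' file		# BSD sed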
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_SED], [ + AC_REQUIRE([AC_PROG_SED])dnl + AC_CACHE_CHECK([for sed --in-place], [ac_cv_inplace], [ + tmpfile=$(mktemp conftest.XXX) + echo foo >$tmpfile + AS_IF([$SED --in-place 's#foo#bar#' $tmpfile 2>/dev/null], + [ac_cv_inplace="--in-place"], + [$SED -i '' 's#foo#bar#' $tmpfile 2>/dev/null], + [ac_cv_inplace="-i ''"], + [AC_MSG_ERROR([$SED does not support in-place])]) + ]) + AC_SUBST([ac_inplace], [$ac_cv_inplace]) +]) diff --git a/config/always-system.m4 b/config/always-system.m4 new file mode 100644 index 000000000000..3225a52af8ae --- /dev/null +++ b/config/always-system.m4 @@ -0,0 +1,26 @@ +dnl # +dnl # Set the target system +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_SYSTEM], [ + AC_MSG_CHECKING([for system type ($host_os)]) + case $host_os in + *linux*) + AC_DEFINE([SYSTEM_LINUX], [1], + [True if ZFS is to be compiled for a Linux system]) + ac_system="Linux" + ;; + *freebsd*) + AC_DEFINE([SYSTEM_FREEBSD], [1], + [True if ZFS is to be compiled for a FreeBSD system]) + ac_system="FreeBSD" + ;; + *) + ac_system="unknown" + ;; + esac + AC_MSG_RESULT([$ac_system]) + AC_SUBST([ac_system]) + + AM_CONDITIONAL([BUILD_LINUX], [test "x$ac_system" = "xLinux"]) + AM_CONDITIONAL([BUILD_FREEBSD], [test "x$ac_system" = "xFreeBSD"]) +]) diff --git a/config/ax_code_coverage.m4 b/config/ax_code_coverage.m4 new file mode 100644 index 000000000000..3e3c666f3c54 --- /dev/null +++ b/config/ax_code_coverage.m4 @@ -0,0 +1,268 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_code_coverage.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_CODE_COVERAGE() +# +# DESCRIPTION +# +# Defines CODE_COVERAGE_CPPFLAGS, CODE_COVERAGE_CFLAGS, +# CODE_COVERAGE_CXXFLAGS and CODE_COVERAGE_LIBS which should be included +# in the CPPFLAGS, CFLAGS CXXFLAGS and LIBS/LIBADD variables of every +# build target (program or library) which should be built with code +# coverage support. Also defines CODE_COVERAGE_RULES which should be +# substituted in your Makefile; and $enable_code_coverage which can be +# used in subsequent configure output. CODE_COVERAGE_ENABLED is defined +# and substituted, and corresponds to the value of the +# --enable-code-coverage option, which defaults to being disabled. +# +# Test also for gcov program and create GCOV variable that could be +# substituted. +# +# Note that all optimization flags in CFLAGS must be disabled when code +# coverage is enabled. +# +# Usage example: +# +# configure.ac: +# +# AX_CODE_COVERAGE +# +# Makefile.am: +# +# @CODE_COVERAGE_RULES@ +# my_program_LIBS = ... $(CODE_COVERAGE_LIBS) ... +# my_program_CPPFLAGS = ... $(CODE_COVERAGE_CPPFLAGS) ... +# my_program_CFLAGS = ... $(CODE_COVERAGE_CFLAGS) ... +# my_program_CXXFLAGS = ... $(CODE_COVERAGE_CXXFLAGS) ... +# +# This results in a "check-code-coverage" rule being added to any +# Makefile.am which includes "@CODE_COVERAGE_RULES@" (assuming the module +# has been configured with --enable-code-coverage). Running `make +# check-code-coverage` in that directory will run the module's test suite +# (`make check`) and build a code coverage report detailing the code which +# was touched, then print the URI for the report. +# +# In earlier versions of this macro, CODE_COVERAGE_LDFLAGS was defined +# instead of CODE_COVERAGE_LIBS. They are both still defined, but use of +# CODE_COVERAGE_LIBS is preferred for clarity; CODE_COVERAGE_LDFLAGS is +# deprecated. 
They have the same value. +# +# This code was derived from Makefile.decl in GLib, originally licensed +# under LGPLv2.1+. +# +# LICENSE +# +# Copyright (c) 2012, 2016 Philip Withnall +# Copyright (c) 2012 Xan Lopez +# Copyright (c) 2012 Christian Persch +# Copyright (c) 2012 Paolo Borelli +# Copyright (c) 2012 Dan Winship +# Copyright (c) 2015 Bastien ROUCARIES +# +# This library is free software; you can redistribute it and/or modify it +# under the terms of the GNU Lesser General Public License as published by +# the Free Software Foundation; either version 2.1 of the License, or (at +# your option) any later version. +# +# This library is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser +# General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public License +# along with this program. If not, see . + +#serial 25 + +AC_DEFUN([AX_CODE_COVERAGE],[ + dnl Check for --enable-code-coverage + AC_REQUIRE([AC_PROG_SED]) + + # allow to override gcov location + AC_ARG_WITH([gcov], + [AS_HELP_STRING([--with-gcov[=GCOV]], [use given GCOV for coverage (GCOV=gcov).])], + [_AX_CODE_COVERAGE_GCOV_PROG_WITH=$with_gcov], + [_AX_CODE_COVERAGE_GCOV_PROG_WITH=gcov]) + + AC_MSG_CHECKING([whether to build with code coverage support]) + AC_ARG_ENABLE([code-coverage], + AS_HELP_STRING([--enable-code-coverage], + [Whether to enable code coverage support]),, + enable_code_coverage=no) + + AM_CONDITIONAL([CODE_COVERAGE_ENABLED], [test x$enable_code_coverage = xyes]) + AC_SUBST([CODE_COVERAGE_ENABLED], [$enable_code_coverage]) + AC_MSG_RESULT($enable_code_coverage) + + AS_IF([ test "$enable_code_coverage" = "yes" ], [ + # check for gcov + AC_CHECK_TOOL([GCOV], + [$_AX_CODE_COVERAGE_GCOV_PROG_WITH], + [:]) + AS_IF([test "X$GCOV" = "X:"], + [AC_MSG_ERROR([gcov is needed to do coverage])]) + AC_SUBST([GCOV]) + + dnl Check if gcc is being used + AS_IF([ test "$GCC" = "no" ], [ + AC_MSG_ERROR([not compiling with gcc, which is required for gcov code coverage]) + ]) + + AC_CHECK_PROG([LCOV], [lcov], [lcov]) + AC_CHECK_PROG([GENHTML], [genhtml], [genhtml]) + + AS_IF([ test -z "$LCOV" ], [ + AC_MSG_ERROR([To enable code coverage reporting you must have lcov installed]) + ]) + + AS_IF([ test -z "$GENHTML" ], [ + AC_MSG_ERROR([Could not find genhtml from the lcov package]) + ]) + + dnl Build the code coverage flags + dnl Define CODE_COVERAGE_LDFLAGS for backwards compatibility + CODE_COVERAGE_CPPFLAGS="" + CODE_COVERAGE_CFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" + CODE_COVERAGE_CXXFLAGS="-O0 -g -fprofile-arcs -ftest-coverage" + CODE_COVERAGE_LIBS="-lgcov" + CODE_COVERAGE_LDFLAGS="$CODE_COVERAGE_LIBS" + + AC_SUBST([CODE_COVERAGE_CPPFLAGS]) + AC_SUBST([CODE_COVERAGE_CFLAGS]) + AC_SUBST([CODE_COVERAGE_CXXFLAGS]) + AC_SUBST([CODE_COVERAGE_LIBS]) + AC_SUBST([CODE_COVERAGE_LDFLAGS]) + + [CODE_COVERAGE_RULES_CHECK=' + -$(A''M_V_at)$(MAKE) $(AM_MAKEFLAGS) -k check + $(A''M_V_at)$(MAKE) $(AM_MAKEFLAGS) code-coverage-capture +'] + [CODE_COVERAGE_RULES_CAPTURE=' + $(code_coverage_v_lcov_cap)$(LCOV) $(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --capture --output-file "$(CODE_COVERAGE_OUTPUT_FILE).tmp" --test-name "$(call code_coverage_sanitize,$(PACKAGE_NAME)-$(PACKAGE_VERSION))" --no-checksum --compat-libtool $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_OPTIONS) + $(code_coverage_v_lcov_ign)$(LCOV) 
$(code_coverage_quiet) $(addprefix --directory ,$(CODE_COVERAGE_DIRECTORY)) --remove "$(CODE_COVERAGE_OUTPUT_FILE).tmp" $(CODE_COVERAGE_IGNORE_PATTERN) --output-file "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_LCOV_SHOPTS) $(CODE_COVERAGE_LCOV_RMOPTS) + -@rm -f $(CODE_COVERAGE_OUTPUT_FILE).tmp + $(code_coverage_v_genhtml)LANG=C $(GENHTML) $(code_coverage_quiet) $(addprefix --prefix ,$(CODE_COVERAGE_DIRECTORY)) --output-directory "$(CODE_COVERAGE_OUTPUT_DIRECTORY)" --title "$(PACKAGE_NAME)-$(PACKAGE_VERSION) Code Coverage" --legend --show-details "$(CODE_COVERAGE_OUTPUT_FILE)" $(CODE_COVERAGE_GENHTML_OPTIONS) + @echo "file://$(abs_builddir)/$(CODE_COVERAGE_OUTPUT_DIRECTORY)/index.html" +'] + [CODE_COVERAGE_RULES_CLEAN=' +clean: code-coverage-clean +distclean: code-coverage-clean +code-coverage-clean: + -$(LCOV) --directory $(top_builddir) -z + -rm -rf $(CODE_COVERAGE_OUTPUT_FILE) $(CODE_COVERAGE_OUTPUT_FILE).tmp $(CODE_COVERAGE_OUTPUT_DIRECTORY) + -find . \( -name "*.gcda" -o -name "*.gcno" -o -name "*.gcov" \) -delete +'] + ], [ + [CODE_COVERAGE_RULES_CHECK=' + @echo "Need to reconfigure with --enable-code-coverage" +'] + CODE_COVERAGE_RULES_CAPTURE="$CODE_COVERAGE_RULES_CHECK" + CODE_COVERAGE_RULES_CLEAN='' + ]) + +[CODE_COVERAGE_RULES=' +# Code coverage +# +# Optional: +# - CODE_COVERAGE_DIRECTORY: Top-level directory for code coverage reporting. +# Multiple directories may be specified, separated by whitespace. +# (Default: $(top_builddir)) +# - CODE_COVERAGE_OUTPUT_FILE: Filename and path for the .info file generated +# by lcov for code coverage. (Default: +# $(PACKAGE_NAME)-$(PACKAGE_VERSION)-coverage.info) +# - CODE_COVERAGE_OUTPUT_DIRECTORY: Directory for generated code coverage +# reports to be created. (Default: +# $(PACKAGE_NAME)-$(PACKAGE_VERSION)-coverage) +# - CODE_COVERAGE_BRANCH_COVERAGE: Set to 1 to enforce branch coverage, +# set to 0 to disable it and leave empty to stay with the default. +# (Default: empty) +# - CODE_COVERAGE_LCOV_SHOPTS_DEFAULT: Extra options shared between both lcov +# instances. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE) +# - CODE_COVERAGE_LCOV_SHOPTS: Extra options to shared between both lcov +# instances. (Default: $CODE_COVERAGE_LCOV_SHOPTS_DEFAULT) +# - CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH: --gcov-tool pathtogcov +# - CODE_COVERAGE_LCOV_OPTIONS_DEFAULT: Extra options to pass to the +# collecting lcov instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH) +# - CODE_COVERAGE_LCOV_OPTIONS: Extra options to pass to the collecting lcov +# instance. (Default: $CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) +# - CODE_COVERAGE_LCOV_RMOPTS_DEFAULT: Extra options to pass to the filtering +# lcov instance. (Default: empty) +# - CODE_COVERAGE_LCOV_RMOPTS: Extra options to pass to the filtering lcov +# instance. (Default: $CODE_COVERAGE_LCOV_RMOPTS_DEFAULT) +# - CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT: Extra options to pass to the +# genhtml instance. (Default: based on $CODE_COVERAGE_BRANCH_COVERAGE) +# - CODE_COVERAGE_GENHTML_OPTIONS: Extra options to pass to the genhtml +# instance. (Default: $CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT) +# - CODE_COVERAGE_IGNORE_PATTERN: Extra glob pattern of files to ignore +# +# The generated report will be titled using the $(PACKAGE_NAME) and +# $(PACKAGE_VERSION). In order to add the current git hash to the title, +# use the git-version-gen script, available online. 
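+#
+# Typical flow once configured with --enable-code-coverage (illustrative):
+#
+#	make && make check-code-coverage
+#
+# which runs the test suite and prints the file:// URI of the HTML report.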
+ +# Optional variables +CODE_COVERAGE_DIRECTORY ?= $(top_builddir) +CODE_COVERAGE_OUTPUT_FILE ?= $(PACKAGE_NAME)-$(PACKAGE_VERSION)-coverage.info +CODE_COVERAGE_OUTPUT_DIRECTORY ?= $(PACKAGE_NAME)-$(PACKAGE_VERSION)-coverage +CODE_COVERAGE_BRANCH_COVERAGE ?= +CODE_COVERAGE_LCOV_SHOPTS_DEFAULT ?= $(if $(CODE_COVERAGE_BRANCH_COVERAGE),\ +--rc lcov_branch_coverage=$(CODE_COVERAGE_BRANCH_COVERAGE)) +CODE_COVERAGE_LCOV_SHOPTS ?= $(CODE_COVERAGE_LCOV_SHOPTS_DEFAULT) +CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH ?= --gcov-tool "$(GCOV)" +CODE_COVERAGE_LCOV_OPTIONS_DEFAULT ?= $(CODE_COVERAGE_LCOV_OPTIONS_GCOVPATH) +CODE_COVERAGE_LCOV_OPTIONS ?= $(CODE_COVERAGE_LCOV_OPTIONS_DEFAULT) +CODE_COVERAGE_LCOV_RMOPTS_DEFAULT ?= +CODE_COVERAGE_LCOV_RMOPTS ?= $(CODE_COVERAGE_LCOV_RMOPTS_DEFAULT) +CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT ?=\ +$(if $(CODE_COVERAGE_BRANCH_COVERAGE),\ +--rc genhtml_branch_coverage=$(CODE_COVERAGE_BRANCH_COVERAGE)) +CODE_COVERAGE_GENHTML_OPTIONS ?= $(CODE_COVERAGE_GENHTML_OPTIONS_DEFAULT) + +# Add any folders you want to ignore here +# Ignore tmp and tests themselves +CODE_COVERAGE_IGNORE_PATTERN ?= "/tmp/*" "*/tests/*" +CODE_COVERAGE_IGNORE_PATTERN += "*/module/zstd/lib/*" + +GITIGNOREFILES ?= +GITIGNOREFILES += $(CODE_COVERAGE_OUTPUT_FILE) $(CODE_COVERAGE_OUTPUT_DIRECTORY) + +code_coverage_v_lcov_cap = $(code_coverage_v_lcov_cap_$(V)) +code_coverage_v_lcov_cap_ = $(code_coverage_v_lcov_cap_$(AM_DEFAULT_VERBOSITY)) +code_coverage_v_lcov_cap_0 = @echo " LCOV --capture"\ + $(CODE_COVERAGE_OUTPUT_FILE); +code_coverage_v_lcov_ign = $(code_coverage_v_lcov_ign_$(V)) +code_coverage_v_lcov_ign_ = $(code_coverage_v_lcov_ign_$(AM_DEFAULT_VERBOSITY)) +code_coverage_v_lcov_ign_0 = @echo " LCOV --remove /tmp/*"\ + $(CODE_COVERAGE_IGNORE_PATTERN); +code_coverage_v_genhtml = $(code_coverage_v_genhtml_$(V)) +code_coverage_v_genhtml_ = $(code_coverage_v_genhtml_$(AM_DEFAULT_VERBOSITY)) +code_coverage_v_genhtml_0 = @echo " GEN " $(CODE_COVERAGE_OUTPUT_DIRECTORY); +code_coverage_quiet = $(code_coverage_quiet_$(V)) +code_coverage_quiet_ = $(code_coverage_quiet_$(AM_DEFAULT_VERBOSITY)) +code_coverage_quiet_0 = --quiet + +# sanitizes the test-name: replaces with underscores: dashes and dots +code_coverage_sanitize = $(subst -,_,$(subst .,_,$(1))) + +# Use recursive makes in order to ignore errors during check +check-code-coverage:'"$CODE_COVERAGE_RULES_CHECK"' + +# Capture code coverage data +code-coverage-capture: code-coverage-capture-hook'"$CODE_COVERAGE_RULES_CAPTURE"' + +# Hook rule executed before code-coverage-capture, overridable by the user +code-coverage-capture-hook: + +'"$CODE_COVERAGE_RULES_CLEAN"' + +A''M_DISTCHECK_CONFIGURE_FLAGS ?= +A''M_DISTCHECK_CONFIGURE_FLAGS += --disable-code-coverage + +.PHONY: check-code-coverage code-coverage-capture code-coverage-capture-hook code-coverage-clean +'] + + AC_SUBST([CODE_COVERAGE_RULES]) + m4_ifdef([_AM_SUBST_NOTMAKE], [_AM_SUBST_NOTMAKE([CODE_COVERAGE_RULES])]) +]) diff --git a/config/ax_python_devel.m4 b/config/ax_python_devel.m4 new file mode 100644 index 000000000000..c51b45b7d54d --- /dev/null +++ b/config/ax_python_devel.m4 @@ -0,0 +1,345 @@ +# =========================================================================== +# https://www.gnu.org/software/autoconf-archive/ax_python_devel.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_PYTHON_DEVEL([version], [action-if-not-found]) +# +# DESCRIPTION +# +# Note: Defines as a precious variable "PYTHON_VERSION". 
Don't override it
+# in your configure.ac.
+#
+# Note: this is a slightly modified version of the original AX_PYTHON_DEVEL
+# macro which accepts an additional [action-if-not-found] argument. This
+# allows detecting whether Python development support is available without
+# aborting the configure phase with a hard error when it is not.
+#
+# This macro checks for Python and tries to get the include path to
+# 'Python.h'. It provides the $(PYTHON_CPPFLAGS) and $(PYTHON_LIBS) output
+# variables. It also exports $(PYTHON_EXTRA_LIBS) and
+# $(PYTHON_EXTRA_LDFLAGS) for embedding Python in your code.
+#
+# You can search for some particular version of Python by passing a
+# parameter to this macro, for example ">= '2.3.1'", or "== '2.4'". Please
+# note that you *have* to pass also an operator along with the version to
+# match, and pay special attention to the single quotes surrounding the
+# version number. Don't use "PYTHON_VERSION" for this: that environment
+# variable is declared as precious and thus reserved for the end-user.
+#
+# This macro should work for all versions of Python >= 2.1.0. As an end
+# user, you can disable the check for the python version by setting the
+# PYTHON_NOVERSIONCHECK environment variable to something else than the
+# empty string.
+#
+# If you need to use this macro for an older Python version, please
+# contact the authors. We're always open for feedback.
+#
+# LICENSE
+#
+# Copyright (c) 2009 Sebastian Huber
+# Copyright (c) 2009 Alan W. Irwin
+# Copyright (c) 2009 Rafael Laboissiere
+# Copyright (c) 2009 Andrew Collier
+# Copyright (c) 2009 Matteo Settenvini
+# Copyright (c) 2009 Horst Knorr
+# Copyright (c) 2013 Daniel Mullner
+# Copyright (c) 2018 loli10K
+#
+# This program is free software: you can redistribute it and/or modify it
+# under the terms of the GNU General Public License as published by the
+# Free Software Foundation, either version 3 of the License, or (at your
+# option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General
+# Public License for more details.
+#
+# You should have received a copy of the GNU General Public License along
+# with this program. If not, see <https://www.gnu.org/licenses/>.
+#
+# As a special exception, the respective Autoconf Macro's copyright owner
+# gives unlimited permission to copy, distribute and modify the configure
+# scripts that are the output of Autoconf when processing the Macro. You
+# need not follow the terms of the GNU General Public License when using
+# or distributing such scripts, even though portions of the text of the
+# Macro appear in them. The GNU General Public License (GPL) does govern
+# all other use of the material that constitutes the Autoconf Macro.
+#
+# This special exception to the GPL applies to versions of the Autoconf
+# Macro released by the Autoconf Archive. When you make and distribute a
+# modified version of the Autoconf Macro, you may extend this special
+# exception to the GPL to apply to your modified version as well.
+
+#serial 21
+
+AU_ALIAS([AC_PYTHON_DEVEL], [AX_PYTHON_DEVEL])
+AC_DEFUN([AX_PYTHON_DEVEL],[
+	#
+	# Allow the use of a (user set) custom python version
+	#
+	AC_ARG_VAR([PYTHON_VERSION],[The installed Python
+		version to use, for example '2.3'.
This string + will be appended to the Python interpreter + canonical name.]) + + AC_PATH_PROG([PYTHON],[python[$PYTHON_VERSION]]) + if test -z "$PYTHON"; then + m4_ifvaln([$2],[$2],[ + AC_MSG_ERROR([Cannot find python$PYTHON_VERSION in your system path]) + PYTHON_VERSION="" + ]) + fi + + # + # Check for a version of Python >= 2.1.0 + # + AC_MSG_CHECKING([for a version of Python >= '2.1.0']) + ac_supports_python_ver=`$PYTHON -c "import sys; \ + ver = sys.version.split ()[[0]]; \ + print (ver >= '2.1.0')"` + if test "$ac_supports_python_ver" != "True"; then + if test -z "$PYTHON_NOVERSIONCHECK"; then + AC_MSG_RESULT([no]) + m4_ifvaln([$2],[$2],[ + AC_MSG_FAILURE([ +This version of the AC@&t@_PYTHON_DEVEL macro +doesn't work properly with versions of Python before +2.1.0. You may need to re-run configure, setting the +variables PYTHON_CPPFLAGS, PYTHON_LIBS, PYTHON_SITE_PKG, +PYTHON_EXTRA_LIBS and PYTHON_EXTRA_LDFLAGS by hand. +Moreover, to disable this check, set PYTHON_NOVERSIONCHECK +to something else than an empty string. +]) + ]) + else + AC_MSG_RESULT([skip at user request]) + fi + else + AC_MSG_RESULT([yes]) + fi + + # + # if the macro parameter ``version'' is set, honour it + # + if test -n "$1"; then + AC_MSG_CHECKING([for a version of Python $1]) + ac_supports_python_ver=`$PYTHON -c "import sys; \ + ver = sys.version.split ()[[0]]; \ + print (ver $1)"` + if test "$ac_supports_python_ver" = "True"; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + m4_ifvaln([$2],[$2],[ + AC_MSG_ERROR([this package requires Python $1. +If you have it installed, but it isn't the default Python +interpreter in your system path, please pass the PYTHON_VERSION +variable to configure. See ``configure --help'' for reference. +]) + PYTHON_VERSION="" + ]) + fi + fi + + # + # Check if you have distutils, else fail + # + AC_MSG_CHECKING([for the distutils Python package]) + ac_distutils_result=`$PYTHON -c "import distutils" 2>&1` + if test $? -eq 0; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + m4_ifvaln([$2],[$2],[ + AC_MSG_ERROR([cannot import Python module "distutils". +Please check your Python installation. The error was: +$ac_distutils_result]) + PYTHON_VERSION="" + ]) + fi + + # + # Check for Python include path + # + AC_MSG_CHECKING([for Python include path]) + if test -z "$PYTHON_CPPFLAGS"; then + python_path=`$PYTHON -c "import distutils.sysconfig; \ + print (distutils.sysconfig.get_python_inc ());"` + plat_python_path=`$PYTHON -c "import distutils.sysconfig; \ + print (distutils.sysconfig.get_python_inc (plat_specific=1));"` + if test -n "${python_path}"; then + if test "${plat_python_path}" != "${python_path}"; then + python_path="-I$python_path -I$plat_python_path" + else + python_path="-I$python_path" + fi + fi + PYTHON_CPPFLAGS=$python_path + fi + AC_MSG_RESULT([$PYTHON_CPPFLAGS]) + AC_SUBST([PYTHON_CPPFLAGS]) + + # + # Check for Python library path + # + AC_MSG_CHECKING([for Python library path]) + if test -z "$PYTHON_LIBS"; then + # (makes two attempts to ensure we've got a version number + # from the interpreter) + ac_python_version=`cat<]], + [[Py_Initialize();]]) + ],[pythonexists=yes],[pythonexists=no]) + AC_LANG_POP([C]) + # turn back to default flags + CPPFLAGS="$ac_save_CPPFLAGS" + LIBS="$ac_save_LIBS" + LDFLAGS="$ac_save_LDFLAGS" + + AC_MSG_RESULT([$pythonexists]) + + if test ! "x$pythonexists" = "xyes"; then + m4_ifvaln([$2],[$2],[ + AC_MSG_FAILURE([ + Could not link test program to Python. 
Maybe the main Python library has been + installed in some non-standard library path. If so, pass it to configure, + via the LIBS environment variable. + Example: ./configure LIBS="-L/usr/non-standard-path/python/lib" + ============================================================================ + ERROR! + You probably have to install the development version of the Python package + for your distribution. The exact name of this package varies among them. + ============================================================================ + ]) + PYTHON_VERSION="" + ]) + fi + + # + # all done! + # +]) diff --git a/config/ax_restore_flags.m4 b/config/ax_restore_flags.m4 new file mode 100644 index 000000000000..cf03cae79015 --- /dev/null +++ b/config/ax_restore_flags.m4 @@ -0,0 +1,31 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_restore_flags.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_RESTORE_FLAGS() +# +# DESCRIPTION +# +# Restore common compilation flags from temporary variables +# +# LICENSE +# +# Copyright (c) 2009 Filippo Giunchedi +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 3 + +AC_DEFUN([AX_RESTORE_FLAGS], [ + CPPFLAGS="${CPPFLAGS_save}" + CFLAGS="${CFLAGS_save}" + CXXFLAGS="${CXXFLAGS_save}" + OBJCFLAGS="${OBJCFLAGS_save}" + LDFLAGS="${LDFLAGS_save}" + LIBS="${LIBS_save}" +]) diff --git a/config/ax_save_flags.m4 b/config/ax_save_flags.m4 new file mode 100644 index 000000000000..d2a054223b91 --- /dev/null +++ b/config/ax_save_flags.m4 @@ -0,0 +1,31 @@ +# =========================================================================== +# http://www.gnu.org/software/autoconf-archive/ax_save_flags.html +# =========================================================================== +# +# SYNOPSIS +# +# AX_SAVE_FLAGS() +# +# DESCRIPTION +# +# Save common compilation flags into temporary variables +# +# LICENSE +# +# Copyright (c) 2009 Filippo Giunchedi +# +# Copying and distribution of this file, with or without modification, are +# permitted in any medium without royalty provided the copyright notice +# and this notice are preserved. This file is offered as-is, without any +# warranty. + +#serial 3 + +AC_DEFUN([AX_SAVE_FLAGS], [ + CPPFLAGS_save="${CPPFLAGS}" + CFLAGS_save="${CFLAGS}" + CXXFLAGS_save="${CXXFLAGS}" + OBJCFLAGS_save="${OBJCFLAGS}" + LDFLAGS_save="${LDFLAGS}" + LIBS_save="${LIBS}" +]) diff --git a/config/config.awk b/config/config.awk new file mode 100644 index 000000000000..cc4b7cc265cd --- /dev/null +++ b/config/config.awk @@ -0,0 +1,15 @@ +# Remove default preprocessor define's from config.h +# PACKAGE +# PACKAGE_BUGREPORT +# PACKAGE_NAME +# PACKAGE_STRING +# PACKAGE_TARNAME +# PACKAGE_VERSION +# STDC_HEADERS +# VERSION + +BEGIN { RS = "" ; FS = "\n" } \ + !/.#define PACKAGE./ && \ + !/.#define VERSION./ && \ + !/.#define STDC_HEADERS./ \ + { print $0"\n" } diff --git a/config/config.rpath b/config/config.rpath new file mode 100755 index 000000000000..be202c1a9e79 --- /dev/null +++ b/config/config.rpath @@ -0,0 +1,684 @@ +#! /bin/sh +# Output a system dependent set of variables, describing how to set the +# run time search path of shared libraries in an executable. +# +# Copyright 1996-2019 Free Software Foundation, Inc. 
+# Taken from GNU libtool, 2001 +# Originally by Gordon Matzigkeit , 1996 +# +# This file is free software; the Free Software Foundation gives +# unlimited permission to copy and/or distribute it, with or without +# modifications, as long as this notice is preserved. +# +# The first argument passed to this file is the canonical host specification, +# CPU_TYPE-MANUFACTURER-OPERATING_SYSTEM +# or +# CPU_TYPE-MANUFACTURER-KERNEL-OPERATING_SYSTEM +# The environment variables CC, GCC, LDFLAGS, LD, with_gnu_ld +# should be set by the caller. +# +# The set of defined variables is at the end of this script. + +# Known limitations: +# - On IRIX 6.5 with CC="cc", the run time search patch must not be longer +# than 256 bytes, otherwise the compiler driver will dump core. The only +# known workaround is to choose shorter directory names for the build +# directory and/or the installation directory. + +# All known linkers require a '.a' archive for static linking (except MSVC, +# which needs '.lib'). +libext=a +shrext=.so + +host="$1" +host_cpu=`echo "$host" | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\1/'` +host_vendor=`echo "$host" | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\2/'` +host_os=`echo "$host" | sed 's/^\([^-]*\)-\([^-]*\)-\(.*\)$/\3/'` + +# Code taken from libtool.m4's _LT_CC_BASENAME. + +for cc_temp in $CC""; do + case $cc_temp in + compile | *[\\/]compile | ccache | *[\\/]ccache ) ;; + distcc | *[\\/]distcc | purify | *[\\/]purify ) ;; + \-*) ;; + *) break;; + esac +done +cc_basename=`echo "$cc_temp" | sed -e 's%^.*/%%'` + +# Code taken from libtool.m4's _LT_COMPILER_PIC. + +wl= +if test "$GCC" = yes; then + wl='-Wl,' +else + case "$host_os" in + aix*) + wl='-Wl,' + ;; + mingw* | cygwin* | pw32* | os2* | cegcc*) + ;; + hpux9* | hpux10* | hpux11*) + wl='-Wl,' + ;; + irix5* | irix6* | nonstopux*) + wl='-Wl,' + ;; + linux* | k*bsd*-gnu | kopensolaris*-gnu) + case $cc_basename in + ecc*) + wl='-Wl,' + ;; + icc* | ifort*) + wl='-Wl,' + ;; + lf95*) + wl='-Wl,' + ;; + nagfor*) + wl='-Wl,-Wl,,' + ;; + pgcc* | pgf77* | pgf90* | pgf95* | pgfortran*) + wl='-Wl,' + ;; + ccc*) + wl='-Wl,' + ;; + xl* | bgxl* | bgf* | mpixl*) + wl='-Wl,' + ;; + como) + wl='-lopt=' + ;; + *) + case `$CC -V 2>&1 | sed 5q` in + *Sun\ F* | *Sun*Fortran*) + wl= + ;; + *Sun\ C*) + wl='-Wl,' + ;; + esac + ;; + esac + ;; + newsos6) + ;; + *nto* | *qnx*) + ;; + osf3* | osf4* | osf5*) + wl='-Wl,' + ;; + rdos*) + ;; + solaris*) + case $cc_basename in + f77* | f90* | f95* | sunf77* | sunf90* | sunf95*) + wl='-Qoption ld ' + ;; + *) + wl='-Wl,' + ;; + esac + ;; + sunos4*) + wl='-Qoption ld ' + ;; + sysv4 | sysv4.2uw2* | sysv4.3*) + wl='-Wl,' + ;; + sysv4*MP*) + ;; + sysv5* | unixware* | sco3.2v5* | sco5v6* | OpenUNIX*) + wl='-Wl,' + ;; + unicos*) + wl='-Wl,' + ;; + uts4*) + ;; + esac +fi + +# Code taken from libtool.m4's _LT_LINKER_SHLIBS. + +hardcode_libdir_flag_spec= +hardcode_libdir_separator= +hardcode_direct=no +hardcode_minus_L=no + +case "$host_os" in + cygwin* | mingw* | pw32* | cegcc*) + # FIXME: the MSVC++ port hasn't been tested in a loooong time + # When not using gcc, we currently assume that we are using + # Microsoft Visual C++. + if test "$GCC" != yes; then + with_gnu_ld=no + fi + ;; + interix*) + # we just hope/assume this is gcc and not c89 (= MSVC++) + with_gnu_ld=yes + ;; + openbsd*) + with_gnu_ld=no + ;; +esac + +ld_shlibs=yes +if test "$with_gnu_ld" = yes; then + # Set some defaults for GNU ld with shared library support. These + # are reset later if shared libraries are not supported. 
Putting them + # here allows them to be overridden if necessary. + # Unlike libtool, we use -rpath here, not --rpath, since the documented + # option of GNU ld is called -rpath, not --rpath. + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + case "$host_os" in + aix[3-9]*) + # On AIX/PPC, the GNU linker is very broken + if test "$host_cpu" != ia64; then + ld_shlibs=no + fi + ;; + amigaos*) + case "$host_cpu" in + powerpc) + ;; + m68k) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + ;; + esac + ;; + beos*) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + cygwin* | mingw* | pw32* | cegcc*) + # hardcode_libdir_flag_spec is actually meaningless, as there is + # no search path for DLLs. + hardcode_libdir_flag_spec='-L$libdir' + if $LD --help 2>&1 | grep 'auto-import' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + haiku*) + ;; + interix[3-9]*) + hardcode_direct=no + hardcode_libdir_flag_spec='${wl}-rpath,$libdir' + ;; + gnu* | linux* | tpf* | k*bsd*-gnu | kopensolaris*-gnu) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + netbsd*) + ;; + solaris*) + if $LD -v 2>&1 | grep 'BFD 2\.8' > /dev/null; then + ld_shlibs=no + elif $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX*) + case `$LD -v 2>&1` in + *\ [01].* | *\ 2.[0-9].* | *\ 2.1[0-5].*) + ld_shlibs=no + ;; + *) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + hardcode_libdir_flag_spec='`test -z "$SCOABSPATH" && echo ${wl}-rpath,$libdir`' + else + ld_shlibs=no + fi + ;; + esac + ;; + sunos4*) + hardcode_direct=yes + ;; + *) + if $LD --help 2>&1 | grep ': supported targets:.* elf' > /dev/null; then + : + else + ld_shlibs=no + fi + ;; + esac + if test "$ld_shlibs" = no; then + hardcode_libdir_flag_spec= + fi +else + case "$host_os" in + aix3*) + # Note: this linker hardcodes the directories in LIBPATH if there + # are no directories specified by -L. + hardcode_minus_L=yes + if test "$GCC" = yes; then + # Neither direct hardcoding nor static linking is supported with a + # broken collect2. + hardcode_direct=unsupported + fi + ;; + aix[4-9]*) + if test "$host_cpu" = ia64; then + # On IA64, the linker does run time linking by default, so we don't + # have to do anything special. + aix_use_runtimelinking=no + else + aix_use_runtimelinking=no + # Test if we are trying to use run time linking or normal + # AIX style linking. If -brtl is somewhere in LDFLAGS, we + # need to do runtime linking. + case $host_os in aix4.[23]|aix4.[23].*|aix[5-9]*) + for ld_flag in $LDFLAGS; do + if (test $ld_flag = "-brtl" || test $ld_flag = "-Wl,-brtl"); then + aix_use_runtimelinking=yes + break + fi + done + ;; + esac + fi + hardcode_direct=yes + hardcode_libdir_separator=':' + if test "$GCC" = yes; then + case $host_os in aix4.[012]|aix4.[012].*) + collect2name=`${CC} -print-prog-name=collect2` + if test -f "$collect2name" && \ + strings "$collect2name" | grep resolve_lib_name >/dev/null + then + # We have reworked collect2 + : + else + # We have old collect2 + hardcode_direct=unsupported + hardcode_minus_L=yes + hardcode_libdir_flag_spec='-L$libdir' + hardcode_libdir_separator= + fi + ;; + esac + fi + # Begin _LT_AC_SYS_LIBPATH_AIX. 
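+      # (The probe links a trivial program and reads the default library
+      # search path from its loader section via `dump -H`, falling back
+      # to /usr/lib:/lib when nothing can be extracted.)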
+ echo 'int main () { return 0; }' > conftest.c + ${CC} ${LDFLAGS} conftest.c -o conftest + aix_libpath=`dump -H conftest 2>/dev/null | sed -n -e '/Import File Strings/,/^$/ { /^0/ { s/^0 *\(.*\)$/\1/; p; } +}'` + if test -z "$aix_libpath"; then + aix_libpath=`dump -HX64 conftest 2>/dev/null | sed -n -e '/Import File Strings/,/^$/ { /^0/ { s/^0 *\(.*\)$/\1/; p; } +}'` + fi + if test -z "$aix_libpath"; then + aix_libpath="/usr/lib:/lib" + fi + rm -f conftest.c conftest + # End _LT_AC_SYS_LIBPATH_AIX. + if test "$aix_use_runtimelinking" = yes; then + hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:'"$aix_libpath" + else + if test "$host_cpu" = ia64; then + hardcode_libdir_flag_spec='${wl}-R $libdir:/usr/lib:/lib' + else + hardcode_libdir_flag_spec='${wl}-blibpath:$libdir:'"$aix_libpath" + fi + fi + ;; + amigaos*) + case "$host_cpu" in + powerpc) + ;; + m68k) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + ;; + esac + ;; + bsdi[45]*) + ;; + cygwin* | mingw* | pw32* | cegcc*) + # When not using gcc, we currently assume that we are using + # Microsoft Visual C++. + # hardcode_libdir_flag_spec is actually meaningless, as there is + # no search path for DLLs. + hardcode_libdir_flag_spec=' ' + libext=lib + ;; + darwin* | rhapsody*) + hardcode_direct=no + if { case $cc_basename in ifort*) true;; *) test "$GCC" = yes;; esac; }; then + : + else + ld_shlibs=no + fi + ;; + dgux*) + hardcode_libdir_flag_spec='-L$libdir' + ;; + freebsd2.[01]*) + hardcode_direct=yes + hardcode_minus_L=yes + ;; + freebsd* | dragonfly*) + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + ;; + hpux9*) + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + hardcode_direct=yes + # hardcode_minus_L: Not really in the search PATH, + # but as the default location of the library. + hardcode_minus_L=yes + ;; + hpux10*) + if test "$with_gnu_ld" = no; then + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + hardcode_direct=yes + # hardcode_minus_L: Not really in the search PATH, + # but as the default location of the library. + hardcode_minus_L=yes + fi + ;; + hpux11*) + if test "$with_gnu_ld" = no; then + hardcode_libdir_flag_spec='${wl}+b ${wl}$libdir' + hardcode_libdir_separator=: + case $host_cpu in + hppa*64*|ia64*) + hardcode_direct=no + ;; + *) + hardcode_direct=yes + # hardcode_minus_L: Not really in the search PATH, + # but as the default location of the library. 
+ hardcode_minus_L=yes + ;; + esac + fi + ;; + irix5* | irix6* | nonstopux*) + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + netbsd*) + hardcode_libdir_flag_spec='-R$libdir' + hardcode_direct=yes + ;; + newsos6) + hardcode_direct=yes + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + *nto* | *qnx*) + ;; + openbsd*) + if test -f /usr/libexec/ld.so; then + hardcode_direct=yes + if test -z "`echo __ELF__ | $CC -E - | grep __ELF__`" || test "$host_os-$host_cpu" = "openbsd2.8-powerpc"; then + hardcode_libdir_flag_spec='${wl}-rpath,$libdir' + else + case "$host_os" in + openbsd[01].* | openbsd2.[0-7] | openbsd2.[0-7].*) + hardcode_libdir_flag_spec='-R$libdir' + ;; + *) + hardcode_libdir_flag_spec='${wl}-rpath,$libdir' + ;; + esac + fi + else + ld_shlibs=no + fi + ;; + os2*) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_minus_L=yes + ;; + osf3*) + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + hardcode_libdir_separator=: + ;; + osf4* | osf5*) + if test "$GCC" = yes; then + hardcode_libdir_flag_spec='${wl}-rpath ${wl}$libdir' + else + # Both cc and cxx compiler support -rpath directly + hardcode_libdir_flag_spec='-rpath $libdir' + fi + hardcode_libdir_separator=: + ;; + solaris*) + hardcode_libdir_flag_spec='-R$libdir' + ;; + sunos4*) + hardcode_libdir_flag_spec='-L$libdir' + hardcode_direct=yes + hardcode_minus_L=yes + ;; + sysv4) + case $host_vendor in + sni) + hardcode_direct=yes # is this really true??? + ;; + siemens) + hardcode_direct=no + ;; + motorola) + hardcode_direct=no #Motorola manual says yes, but my tests say they lie + ;; + esac + ;; + sysv4.3*) + ;; + sysv4*MP*) + if test -d /usr/nec; then + ld_shlibs=yes + fi + ;; + sysv4*uw2* | sysv5OpenUNIX* | sysv5UnixWare7.[01].[10]* | unixware7* | sco3.2v5.0.[024]*) + ;; + sysv5* | sco3.2v5* | sco5v6*) + hardcode_libdir_flag_spec='`test -z "$SCOABSPATH" && echo ${wl}-R,$libdir`' + hardcode_libdir_separator=':' + ;; + uts4*) + hardcode_libdir_flag_spec='-L$libdir' + ;; + *) + ld_shlibs=no + ;; + esac +fi + +# Check dynamic linker characteristics +# Code taken from libtool.m4's _LT_SYS_DYNAMIC_LINKER. +# Unlike libtool.m4, here we don't care about _all_ names of the library, but +# only about the one the linker finds when passed -lNAME. This is the last +# element of library_names_spec in libtool.m4, or possibly two of them if the +# linker has special search rules. 
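+# For instance, the table below yields 'lib$name.so' on linux-gnu and
+# 'lib$name.dll.a lib$name.lib' on cygwin/mingw.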
+library_names_spec= # the last element of library_names_spec in libtool.m4 +libname_spec='lib$name' +case "$host_os" in + aix3*) + library_names_spec='$libname.a' + ;; + aix[4-9]*) + library_names_spec='$libname$shrext' + ;; + amigaos*) + case "$host_cpu" in + powerpc*) + library_names_spec='$libname$shrext' ;; + m68k) + library_names_spec='$libname.a' ;; + esac + ;; + beos*) + library_names_spec='$libname$shrext' + ;; + bsdi[45]*) + library_names_spec='$libname$shrext' + ;; + cygwin* | mingw* | pw32* | cegcc*) + shrext=.dll + library_names_spec='$libname.dll.a $libname.lib' + ;; + darwin* | rhapsody*) + shrext=.dylib + library_names_spec='$libname$shrext' + ;; + dgux*) + library_names_spec='$libname$shrext' + ;; + freebsd[23].*) + library_names_spec='$libname$shrext$versuffix' + ;; + freebsd* | dragonfly*) + library_names_spec='$libname$shrext' + ;; + gnu*) + library_names_spec='$libname$shrext' + ;; + haiku*) + library_names_spec='$libname$shrext' + ;; + hpux9* | hpux10* | hpux11*) + case $host_cpu in + ia64*) + shrext=.so + ;; + hppa*64*) + shrext=.sl + ;; + *) + shrext=.sl + ;; + esac + library_names_spec='$libname$shrext' + ;; + interix[3-9]*) + library_names_spec='$libname$shrext' + ;; + irix5* | irix6* | nonstopux*) + library_names_spec='$libname$shrext' + case "$host_os" in + irix5* | nonstopux*) + libsuff= shlibsuff= + ;; + *) + case $LD in + *-32|*"-32 "|*-melf32bsmip|*"-melf32bsmip ") libsuff= shlibsuff= ;; + *-n32|*"-n32 "|*-melf32bmipn32|*"-melf32bmipn32 ") libsuff=32 shlibsuff=N32 ;; + *-64|*"-64 "|*-melf64bmip|*"-melf64bmip ") libsuff=64 shlibsuff=64 ;; + *) libsuff= shlibsuff= ;; + esac + ;; + esac + ;; + linux*oldld* | linux*aout* | linux*coff*) + ;; + linux* | k*bsd*-gnu | kopensolaris*-gnu) + library_names_spec='$libname$shrext' + ;; + knetbsd*-gnu) + library_names_spec='$libname$shrext' + ;; + netbsd*) + library_names_spec='$libname$shrext' + ;; + newsos6) + library_names_spec='$libname$shrext' + ;; + *nto* | *qnx*) + library_names_spec='$libname$shrext' + ;; + openbsd*) + library_names_spec='$libname$shrext$versuffix' + ;; + os2*) + libname_spec='$name' + shrext=.dll + library_names_spec='$libname.a' + ;; + osf3* | osf4* | osf5*) + library_names_spec='$libname$shrext' + ;; + rdos*) + ;; + solaris*) + library_names_spec='$libname$shrext' + ;; + sunos4*) + library_names_spec='$libname$shrext$versuffix' + ;; + sysv4 | sysv4.3*) + library_names_spec='$libname$shrext' + ;; + sysv4*MP*) + library_names_spec='$libname$shrext' + ;; + sysv5* | sco3.2v5* | sco5v6* | unixware* | OpenUNIX* | sysv4*uw2*) + library_names_spec='$libname$shrext' + ;; + tpf*) + library_names_spec='$libname$shrext' + ;; + uts4*) + library_names_spec='$libname$shrext' + ;; +esac + +sed_quote_subst='s/\(["`$\\]\)/\\\1/g' +escaped_wl=`echo "X$wl" | sed -e 's/^X//' -e "$sed_quote_subst"` +shlibext=`echo "$shrext" | sed -e 's,^\.,,'` +escaped_libname_spec=`echo "X$libname_spec" | sed -e 's/^X//' -e "$sed_quote_subst"` +escaped_library_names_spec=`echo "X$library_names_spec" | sed -e 's/^X//' -e "$sed_quote_subst"` +escaped_hardcode_libdir_flag_spec=`echo "X$hardcode_libdir_flag_spec" | sed -e 's/^X//' -e "$sed_quote_subst"` + +LC_ALL=C sed -e 's/^\([a-zA-Z0-9_]*\)=/acl_cv_\1=/' < $${path_prepend}/dh_shlibdeps; \ + echo "`which dh_shlibdeps` -- \ + -xlibuutil1linux -xlibnvpair1linux -xlibzfs2linux -xlibzpool2linux" \ + >> $${path_prepend}/dh_shlibdeps; \ +## These -x arguments are passed to dpkg-shlibdeps, which exclude the +## Debianized packages from the auto-generated dependencies of the new debs, +## 
which should NOT be mixed with the alien-generated debs created here + chmod +x $${path_prepend}/dh_shlibdeps; \ + env PATH=$${path_prepend}:$${PATH} \ + fakeroot $(ALIEN) --bump=0 --scripts --to-deb --target=$$debarch \ + $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \ + $$pkg8 $$pkg9 $$pkg10 || exit 1; \ + $(RM) $${path_prepend}/dh_shlibdeps; \ + rmdir $${path_prepend}; \ + $(RM) $$pkg1 $$pkg2 $$pkg3 $$pkg4 $$pkg5 $$pkg6 $$pkg7 \ + $$pkg8 $$pkg9 $$pkg10; + +deb: deb-kmod deb-dkms deb-utils diff --git a/config/find_system_library.m4 b/config/find_system_library.m4 new file mode 100644 index 000000000000..9a95d6a15033 --- /dev/null +++ b/config/find_system_library.m4 @@ -0,0 +1,93 @@ +# find_system_lib.m4 - Macros to search for a system library. -*- Autoconf -*- + +dnl requires pkg.m4 from pkg-config +dnl requires ax_save_flags.m4 from autoconf-archive +dnl requires ax_restore_flags.m4 from autoconf-archive + +dnl ZFS_AC_FIND_SYSTEM_LIBRARY(VARIABLE-PREFIX, MODULE, HEADER, HEADER-PREFIXES, LIBRARY, FUNCTIONS, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) + +AC_DEFUN([ZFS_AC_FIND_SYSTEM_LIBRARY], [ + AC_REQUIRE([PKG_PROG_PKG_CONFIG]) + + _header_found= + _library_found= + + AS_IF([test -n "$2"], [PKG_CHECK_MODULES([$1], [$2], [ + _header_found=1 + _library_found=1 + ], [:])]) + + # set _header_found/_library_found if the user passed in CFLAGS/LIBS + AS_IF([test "x$[$1][_CFLAGS]" != x], [_header_found=1]) + AS_IF([test "x$[$1][_LIBS]" != x], [_library_found=1]) + + AX_SAVE_FLAGS + + orig_CFLAGS="$CFLAGS" + + for _prefixdir in /usr /usr/local + do + AS_VAR_PUSHDEF([header_cache], [ac_cv_header_$3]) + AS_IF([test "x$_prefixdir" != "x/usr"], [ + [$1][_CFLAGS]="-I$lt_sysroot$_prefixdir/include" + AS_IF([test "x$_library_found" = x], [ + [$1][_LIBS]="-L$lt_sysroot$_prefixdir/lib" + ]) + ]) + CFLAGS="$orig_CFLAGS $[$1][_CFLAGS]" + AS_UNSET([header_cache]) + AC_CHECK_HEADER([$3], [ + _header_found=1 + break + ], [AS_IF([test "x$_header_found" = "x1"], [ + # if pkg-config or the user set CFLAGS, fail if the header is unusable + AC_MSG_FAILURE([header [$3] for library [$5] is not usable]) + ])], [AC_INCLUDES_DEFAULT]) + # search for header under HEADER-PREFIXES + m4_foreach_w([prefix], [$4], [ + [$1][_CFLAGS]=["-I$lt_sysroot$_prefixdir/include/]prefix["] + CFLAGS="$orig_CFLAGS $[$1][_CFLAGS]" + AS_UNSET([header_cache]) + AC_CHECK_HEADER([$3], [ + _header_found=1 + break + ], [], [AC_INCLUDES_DEFAULT]) + ]) + AS_VAR_POPDEF([header_cache]) + done + + AS_IF([test "x$_header_found" = "x1"], [ + AS_IF([test "x$_library_found" = x], [ + [$1][_LIBS]="$[$1]_LIBS -l[$5]" + ]) + LDFLAGS="$LDFLAGS $[$1][_LIBS]" + + _libcheck=1 + m4_ifval([$6], + [m4_foreach_w([func], [$6], [AC_CHECK_LIB([$5], func, [:], [_libcheck=])])], + [AC_CHECK_LIB([$5], [main], [:], [_libcheck=])]) + + AS_IF([test "x$_libcheck" = "x1"], [_library_found=1], + [test "x$_library_found" = "x1"], [ + # if pkg-config or the user set LIBS, fail if the library is unusable + AC_MSG_FAILURE([library [$5] is not usable]) + ]) + ], [test "x$_library_found" = "x1"], [ + # if the user set LIBS, fail if we didn't find the header + AC_MSG_FAILURE([cannot find header [$3] for library [$5]]) + ]) + + AX_RESTORE_FLAGS + + AS_IF([test "x$_header_found" = "x1" && test "x$_library_found" = "x1"], [ + AC_SUBST([$1]_CFLAGS) + AC_SUBST([$1]_LIBS) + AC_DEFINE([HAVE_][$1], [1], [Define if you have [$5]]) + $7 + ],[dnl ELSE + AC_SUBST([$1]_CFLAGS, []) + AC_SUBST([$1]_LIBS, []) + AC_MSG_WARN([cannot find [$5] via pkg-config or in the standard locations]) + $8 + ]) +]) 
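+
+dnl Illustrative sketch (an assumed example, not part of the macro itself):
+dnl a configure.ac that wants libtirpc could invoke
+dnl
+dnl   ZFS_AC_FIND_SYSTEM_LIBRARY([LIBTIRPC], [libtirpc], [rpc/xdr.h],
+dnl       [tirpc], [tirpc], [xdr_void], [], [
+dnl       AC_MSG_FAILURE([xdr support requires libtirpc])])
+dnl
+dnl On success LIBTIRPC_CFLAGS and LIBTIRPC_LIBS are AC_SUBSTed and
+dnl HAVE_LIBTIRPC is defined; MODULE is tried via pkg-config first, then the
+dnl /usr and /usr/local prefixes are searched, with the header also looked
+dnl up under the given HEADER-PREFIXES (here include/tirpc).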
diff --git a/config/gettext.m4 b/config/gettext.m4 new file mode 100644 index 000000000000..e7832418ea16 --- /dev/null +++ b/config/gettext.m4 @@ -0,0 +1,386 @@ +# gettext.m4 serial 70 (gettext-0.20) +dnl Copyright (C) 1995-2014, 2016, 2018 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. +dnl +dnl This file can be used in projects which are not available under +dnl the GNU General Public License or the GNU Library General Public +dnl License but which still want to provide support for the GNU gettext +dnl functionality. +dnl Please note that the actual code of the GNU gettext library is covered +dnl by the GNU Library General Public License, and the rest of the GNU +dnl gettext package is covered by the GNU General Public License. +dnl They are *not* in the public domain. + +dnl Authors: +dnl Ulrich Drepper <drepper@cygnus.com>, 1995-2000. +dnl Bruno Haible <haible@clisp.cons.org>, 2000-2006, 2008-2010. + +dnl Macro to add for using GNU gettext. + +dnl Usage: AM_GNU_GETTEXT([INTLSYMBOL], [NEEDSYMBOL], [INTLDIR]). +dnl INTLSYMBOL must be one of 'external', 'use-libtool'. +dnl INTLSYMBOL should be 'external' for packages other than GNU gettext, and +dnl 'use-libtool' for the packages 'gettext-runtime' and 'gettext-tools'. +dnl If INTLSYMBOL is 'use-libtool', then a libtool library +dnl $(top_builddir)/intl/libintl.la will be created (shared and/or static, +dnl depending on --{enable,disable}-{shared,static} and on the presence of +dnl AM-DISABLE-SHARED). +dnl If NEEDSYMBOL is specified and is 'need-ngettext', then GNU gettext +dnl implementations (in libc or libintl) without the ngettext() function +dnl will be ignored. If NEEDSYMBOL is specified and is +dnl 'need-formatstring-macros', then GNU gettext implementations that don't +dnl support the ISO C 99 formatstring macros will be ignored. +dnl INTLDIR is used to find the intl libraries. If empty, +dnl the value '$(top_builddir)/intl/' is used. +dnl +dnl The result of the configuration is one of three cases: +dnl 1) GNU gettext, as included in the intl subdirectory, will be compiled +dnl and used. +dnl Catalog format: GNU --> install in $(datadir) +dnl Catalog extension: .mo after installation, .gmo in source tree +dnl 2) GNU gettext has been found in the system's C library. +dnl Catalog format: GNU --> install in $(datadir) +dnl Catalog extension: .mo after installation, .gmo in source tree +dnl 3) No internationalization, always use English msgid. +dnl Catalog format: none +dnl Catalog extension: none +dnl If INTLSYMBOL is 'external', only cases 2 and 3 can occur. +dnl The use of .gmo is historical (it was needed to avoid overwriting the +dnl GNU format catalogs when building on a platform with an X/Open gettext), +dnl but we keep it in order not to force irrelevant filename changes on the +dnl maintainers. +dnl +AC_DEFUN([AM_GNU_GETTEXT], +[ + dnl Argument checking. + ifelse([$1], [], , [ifelse([$1], [external], , [ifelse([$1], [use-libtool], , + [errprint([ERROR: invalid first argument to AM_GNU_GETTEXT +])])])]) + ifelse(ifelse([$1], [], [old])[]ifelse([$1], [no-libtool], [old]), [old], + [errprint([ERROR: Use of AM_GNU_GETTEXT without [external] argument is no longer supported. 
+])]) + ifelse([$2], [], , [ifelse([$2], [need-ngettext], , [ifelse([$2], [need-formatstring-macros], , + [errprint([ERROR: invalid second argument to AM_GNU_GETTEXT +])])])]) + define([gt_included_intl], + ifelse([$1], [external], [no], [yes])) + gt_NEEDS_INIT + AM_GNU_GETTEXT_NEED([$2]) + + AC_REQUIRE([AM_PO_SUBDIRS])dnl + ifelse(gt_included_intl, yes, [ + AC_REQUIRE([AM_INTL_SUBDIR])dnl + ]) + + dnl Prerequisites of AC_LIB_LINKFLAGS_BODY. + AC_REQUIRE([AC_LIB_PREPARE_PREFIX]) + AC_REQUIRE([AC_LIB_RPATH]) + + dnl Sometimes libintl requires libiconv, so first search for libiconv. + dnl Ideally we would do this search only after the + dnl if test "$USE_NLS" = "yes"; then + dnl if { eval "gt_val=\$$gt_func_gnugettext_libc"; test "$gt_val" != "yes"; }; then + dnl tests. But if configure.in invokes AM_ICONV after AM_GNU_GETTEXT + dnl the configure script would need to contain the same shell code + dnl again, outside any 'if'. There are two solutions: + dnl - Invoke AM_ICONV_LINKFLAGS_BODY here, outside any 'if'. + dnl - Control the expansions in more detail using AC_PROVIDE_IFELSE. + dnl Since AC_PROVIDE_IFELSE is not documented, we avoid it. + ifelse(gt_included_intl, yes, , [ + AC_REQUIRE([AM_ICONV_LINKFLAGS_BODY]) + ]) + + dnl Sometimes, on Mac OS X, libintl requires linking with CoreFoundation. + gt_INTL_MACOSX + + dnl Set USE_NLS. + AC_REQUIRE([AM_NLS]) + + ifelse(gt_included_intl, yes, [ + BUILD_INCLUDED_LIBINTL=no + USE_INCLUDED_LIBINTL=no + ]) + LIBINTL= + LTLIBINTL= + POSUB= + + dnl Add a version number to the cache macros. + case " $gt_needs " in + *" need-formatstring-macros "*) gt_api_version=3 ;; + *" need-ngettext "*) gt_api_version=2 ;; + *) gt_api_version=1 ;; + esac + gt_func_gnugettext_libc="gt_cv_func_gnugettext${gt_api_version}_libc" + gt_func_gnugettext_libintl="gt_cv_func_gnugettext${gt_api_version}_libintl" + + dnl If we use NLS figure out what method + if test "$USE_NLS" = "yes"; then + gt_use_preinstalled_gnugettext=no + ifelse(gt_included_intl, yes, [ + AC_MSG_CHECKING([whether included gettext is requested]) + AC_ARG_WITH([included-gettext], + [ --with-included-gettext use the GNU gettext library included here], + nls_cv_force_use_gnu_gettext=$withval, + nls_cv_force_use_gnu_gettext=no) + AC_MSG_RESULT([$nls_cv_force_use_gnu_gettext]) + + nls_cv_use_gnu_gettext="$nls_cv_force_use_gnu_gettext" + if test "$nls_cv_force_use_gnu_gettext" != "yes"; then + ]) + dnl User does not insist on using GNU NLS library. Figure out what + dnl to use. If GNU gettext is available we use this. Else we have + dnl to fall back to GNU NLS library. + + if test $gt_api_version -ge 3; then + gt_revision_test_code=' +#ifndef __GNU_GETTEXT_SUPPORTED_REVISION +#define __GNU_GETTEXT_SUPPORTED_REVISION(major) ((major) == 0 ? 
0 : -1) +#endif +changequote(,)dnl +typedef int array [2 * (__GNU_GETTEXT_SUPPORTED_REVISION(0) >= 1) - 1]; +changequote([,])dnl +' + else + gt_revision_test_code= + fi + if test $gt_api_version -ge 2; then + gt_expression_test_code=' + * ngettext ("", "", 0)' + else + gt_expression_test_code= + fi + + AC_CACHE_CHECK([for GNU gettext in libc], [$gt_func_gnugettext_libc], + [AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[ +#include <libintl.h> +#ifndef __GNU_GETTEXT_SUPPORTED_REVISION +extern int _nl_msg_cat_cntr; +extern int *_nl_domain_bindings; +#define __GNU_GETTEXT_SYMBOL_EXPRESSION (_nl_msg_cat_cntr + *_nl_domain_bindings) +#else +#define __GNU_GETTEXT_SYMBOL_EXPRESSION 0 +#endif +$gt_revision_test_code + ]], + [[ +bindtextdomain ("", ""); +return * gettext ("")$gt_expression_test_code + __GNU_GETTEXT_SYMBOL_EXPRESSION + ]])], + [eval "$gt_func_gnugettext_libc=yes"], + [eval "$gt_func_gnugettext_libc=no"])]) + + if { eval "gt_val=\$$gt_func_gnugettext_libc"; test "$gt_val" != "yes"; }; then + dnl Sometimes libintl requires libiconv, so first search for libiconv. + ifelse(gt_included_intl, yes, , [ + AM_ICONV_LINK + ]) + dnl Search for libintl and define LIBINTL, LTLIBINTL and INCINTL + dnl accordingly. Don't use AC_LIB_LINKFLAGS_BODY([intl],[iconv]) + dnl because that would add "-liconv" to LIBINTL and LTLIBINTL + dnl even if libiconv doesn't exist. + AC_LIB_LINKFLAGS_BODY([intl]) + AC_CACHE_CHECK([for GNU gettext in libintl], + [$gt_func_gnugettext_libintl], + [gt_save_CPPFLAGS="$CPPFLAGS" + CPPFLAGS="$CPPFLAGS $INCINTL" + gt_save_LIBS="$LIBS" + LIBS="$LIBS $LIBINTL" + dnl Now see whether libintl exists and does not depend on libiconv. + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[ +#include <libintl.h> +#ifndef __GNU_GETTEXT_SUPPORTED_REVISION +extern int _nl_msg_cat_cntr; +extern +#ifdef __cplusplus +"C" +#endif +const char *_nl_expand_alias (const char *); +#define __GNU_GETTEXT_SYMBOL_EXPRESSION (_nl_msg_cat_cntr + *_nl_expand_alias ("")) +#else +#define __GNU_GETTEXT_SYMBOL_EXPRESSION 0 +#endif +$gt_revision_test_code + ]], + [[ +bindtextdomain ("", ""); +return * gettext ("")$gt_expression_test_code + __GNU_GETTEXT_SYMBOL_EXPRESSION + ]])], + [eval "$gt_func_gnugettext_libintl=yes"], + [eval "$gt_func_gnugettext_libintl=no"]) + dnl Now see whether libintl exists and depends on libiconv. + if { eval "gt_val=\$$gt_func_gnugettext_libintl"; test "$gt_val" != yes; } && test -n "$LIBICONV"; then + LIBS="$LIBS $LIBICONV" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[ +#include <libintl.h> +#ifndef __GNU_GETTEXT_SUPPORTED_REVISION +extern int _nl_msg_cat_cntr; +extern +#ifdef __cplusplus +"C" +#endif +const char *_nl_expand_alias (const char *); +#define __GNU_GETTEXT_SYMBOL_EXPRESSION (_nl_msg_cat_cntr + *_nl_expand_alias ("")) +#else +#define __GNU_GETTEXT_SYMBOL_EXPRESSION 0 +#endif +$gt_revision_test_code + ]], + [[ +bindtextdomain ("", ""); +return * gettext ("")$gt_expression_test_code + __GNU_GETTEXT_SYMBOL_EXPRESSION + ]])], + [LIBINTL="$LIBINTL $LIBICONV" + LTLIBINTL="$LTLIBINTL $LTLIBICONV" + eval "$gt_func_gnugettext_libintl=yes" + ]) + fi + CPPFLAGS="$gt_save_CPPFLAGS" + LIBS="$gt_save_LIBS"]) + fi + + dnl If an already present or preinstalled GNU gettext() is found, + dnl use it. But if this macro is used in GNU gettext, and GNU + dnl gettext is already preinstalled in libintl, we update this + dnl libintl. (Cf. the install rule in intl/Makefile.in.) 
+ if { eval "gt_val=\$$gt_func_gnugettext_libc"; test "$gt_val" = "yes"; } \ + || { { eval "gt_val=\$$gt_func_gnugettext_libintl"; test "$gt_val" = "yes"; } \ + && test "$PACKAGE" != gettext-runtime \ + && test "$PACKAGE" != gettext-tools; }; then + gt_use_preinstalled_gnugettext=yes + else + dnl Reset the values set by searching for libintl. + LIBINTL= + LTLIBINTL= + INCINTL= + fi + + ifelse(gt_included_intl, yes, [ + if test "$gt_use_preinstalled_gnugettext" != "yes"; then + dnl GNU gettext is not found in the C library. + dnl Fall back on included GNU gettext library. + nls_cv_use_gnu_gettext=yes + fi + fi + + if test "$nls_cv_use_gnu_gettext" = "yes"; then + dnl Mark actions used to generate GNU NLS library. + BUILD_INCLUDED_LIBINTL=yes + USE_INCLUDED_LIBINTL=yes + LIBINTL="ifelse([$3],[],\${top_builddir}/intl,[$3])/libintl.la $LIBICONV $LIBTHREAD" + LTLIBINTL="ifelse([$3],[],\${top_builddir}/intl,[$3])/libintl.la $LTLIBICONV $LTLIBTHREAD" + LIBS=`echo " $LIBS " | sed -e 's/ -lintl / /' -e 's/^ //' -e 's/ $//'` + fi + + CATOBJEXT= + if test "$gt_use_preinstalled_gnugettext" = "yes" \ + || test "$nls_cv_use_gnu_gettext" = "yes"; then + dnl Mark actions to use GNU gettext tools. + CATOBJEXT=.gmo + fi + ]) + + if test -n "$INTL_MACOSX_LIBS"; then + if test "$gt_use_preinstalled_gnugettext" = "yes" \ + || test "$nls_cv_use_gnu_gettext" = "yes"; then + dnl Some extra flags are needed during linking. + LIBINTL="$LIBINTL $INTL_MACOSX_LIBS" + LTLIBINTL="$LTLIBINTL $INTL_MACOSX_LIBS" + fi + fi + + if test "$gt_use_preinstalled_gnugettext" = "yes" \ + || test "$nls_cv_use_gnu_gettext" = "yes"; then + AC_DEFINE([ENABLE_NLS], [1], + [Define to 1 if translation of program messages to the user's native language + is requested.]) + else + USE_NLS=no + fi + fi + + AC_MSG_CHECKING([whether to use NLS]) + AC_MSG_RESULT([$USE_NLS]) + if test "$USE_NLS" = "yes"; then + AC_MSG_CHECKING([where the gettext function comes from]) + if test "$gt_use_preinstalled_gnugettext" = "yes"; then + if { eval "gt_val=\$$gt_func_gnugettext_libintl"; test "$gt_val" = "yes"; }; then + gt_source="external libintl" + else + gt_source="libc" + fi + else + gt_source="included intl directory" + fi + AC_MSG_RESULT([$gt_source]) + fi + + if test "$USE_NLS" = "yes"; then + + if test "$gt_use_preinstalled_gnugettext" = "yes"; then + if { eval "gt_val=\$$gt_func_gnugettext_libintl"; test "$gt_val" = "yes"; }; then + AC_MSG_CHECKING([how to link with libintl]) + AC_MSG_RESULT([$LIBINTL]) + AC_LIB_APPENDTOVAR([CPPFLAGS], [$INCINTL]) + fi + + dnl For backward compatibility. Some packages may be using this. + AC_DEFINE([HAVE_GETTEXT], [1], + [Define if the GNU gettext() function is already present or preinstalled.]) + AC_DEFINE([HAVE_DCGETTEXT], [1], + [Define if the GNU dcgettext() function is already present or preinstalled.]) + fi + + dnl We need to process the po/ directory. + POSUB=po + fi + + ifelse(gt_included_intl, yes, [ + dnl In GNU gettext we have to set BUILD_INCLUDED_LIBINTL to 'yes' + dnl because some of the testsuite requires it. + BUILD_INCLUDED_LIBINTL=yes + + dnl Make all variables we use known to autoconf. + AC_SUBST([BUILD_INCLUDED_LIBINTL]) + AC_SUBST([USE_INCLUDED_LIBINTL]) + AC_SUBST([CATOBJEXT]) + ]) + + dnl For backward compatibility. Some Makefiles may be using this. + INTLLIBS="$LIBINTL" + AC_SUBST([INTLLIBS]) + + dnl Make all documented variables known to autoconf. + AC_SUBST([LIBINTL]) + AC_SUBST([LTLIBINTL]) + AC_SUBST([POSUB]) +]) + + +dnl gt_NEEDS_INIT ensures that the gt_needs variable is initialized. 
+m4_define([gt_NEEDS_INIT], +[ + m4_divert_text([DEFAULTS], [gt_needs=]) + m4_define([gt_NEEDS_INIT], []) +]) + + +dnl Usage: AM_GNU_GETTEXT_NEED([NEEDSYMBOL]) +AC_DEFUN([AM_GNU_GETTEXT_NEED], +[ + m4_divert_text([INIT_PREPARE], [gt_needs="$gt_needs $1"]) +]) + + +dnl Usage: AM_GNU_GETTEXT_VERSION([gettext-version]) +AC_DEFUN([AM_GNU_GETTEXT_VERSION], []) + + +dnl Usage: AM_GNU_GETTEXT_REQUIRE_VERSION([gettext-version]) +AC_DEFUN([AM_GNU_GETTEXT_REQUIRE_VERSION], []) diff --git a/config/host-cpu-c-abi.m4 b/config/host-cpu-c-abi.m4 new file mode 100644 index 000000000000..4407296d0849 --- /dev/null +++ b/config/host-cpu-c-abi.m4 @@ -0,0 +1,644 @@ +# host-cpu-c-abi.m4 serial 11 +dnl Copyright (C) 2002-2019 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl From Bruno Haible and Sam Steingold. + +dnl Sets the HOST_CPU variable to the canonical name of the CPU. +dnl Sets the HOST_CPU_C_ABI variable to the canonical name of the CPU with its +dnl C language ABI (application binary interface). +dnl Also defines __${HOST_CPU}__ and __${HOST_CPU_C_ABI}__ as C macros in +dnl config.h. +dnl +dnl This canonical name can be used to select a particular assembly language +dnl source file that will interoperate with C code on the given host. +dnl +dnl For example: +dnl * 'i386' and 'sparc' are different canonical names, because code for i386 +dnl will not run on SPARC CPUs and vice versa. They have different +dnl instruction sets. +dnl * 'sparc' and 'sparc64' are different canonical names, because code for +dnl 'sparc' and code for 'sparc64' cannot be linked together: 'sparc' code +dnl contains 32-bit instructions, whereas 'sparc64' code contains 64-bit +dnl instructions. A process on a SPARC CPU can be in 32-bit mode or in 64-bit +dnl mode, but not both. +dnl * 'mips' and 'mipsn32' are different canonical names, because they use +dnl different argument passing and return conventions for C functions, and +dnl although the instruction set of 'mips' is a large subset of the +dnl instruction set of 'mipsn32'. +dnl * 'mipsn32' and 'mips64' are different canonical names, because they use +dnl different sizes for the C types like 'int' and 'void *', and although +dnl the instruction sets of 'mipsn32' and 'mips64' are the same. +dnl * The same canonical name is used for different endiannesses. You can +dnl determine the endianness through preprocessor symbols: +dnl - 'arm': test __ARMEL__. +dnl - 'mips', 'mipsn32', 'mips64': test _MIPSEB vs. _MIPSEL. +dnl - 'powerpc64': test _BIG_ENDIAN vs. _LITTLE_ENDIAN. +dnl * The same name 'i386' is used for CPUs of type i386, i486, i586 +dnl (Pentium), AMD K7, Pentium II, Pentium IV, etc., because +dnl - Instructions that do not exist on all of these CPUs (cmpxchg, +dnl MMX, SSE, SSE2, 3DNow! etc.) are not frequently used. If your +dnl assembly language source files use such instructions, you will +dnl need to make the distinction. +dnl - Speed of execution of the common instruction set is reasonable across +dnl the entire family of CPUs. If you have assembly language source files +dnl that are optimized for particular CPU types (like GNU gmp has), you +dnl will need to make the distinction. +dnl See . 
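+dnl
+dnl Minimal usage sketch (illustrative; ASMSRC and the file names are
+dnl hypothetical): a configure.ac selecting an assembly source per ABI
+dnl could do
+dnl
+dnl   gl_HOST_CPU_C_ABI
+dnl   AS_CASE([$HOST_CPU_C_ABI],
+dnl     [x86_64], [ASMSRC=cpu_x86_64.S],
+dnl     [i386], [ASMSRC=cpu_i386.S],
+dnl     [ASMSRC=cpu_generic.S])
+dnl   AC_SUBST([ASMSRC])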
+AC_DEFUN([gl_HOST_CPU_C_ABI], +[ + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_REQUIRE([gl_C_ASM]) + AC_CACHE_CHECK([host CPU and C ABI], [gl_cv_host_cpu_c_abi], + [case "$host_cpu" in + +changequote(,)dnl + i[4567]86 ) +changequote([,])dnl + gl_cv_host_cpu_c_abi=i386 + ;; + + x86_64 ) + # On x86_64 systems, the C compiler may be generating code in one of + # these ABIs: + # - 64-bit instruction set, 64-bit pointers, 64-bit 'long': x86_64. + # - 64-bit instruction set, 64-bit pointers, 32-bit 'long': x86_64 + # with native Windows (mingw, MSVC). + # - 64-bit instruction set, 32-bit pointers, 32-bit 'long': x86_64-x32. + # - 32-bit instruction set, 32-bit pointers, 32-bit 'long': i386. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if (defined __x86_64__ || defined __amd64__ \ + || defined _M_X64 || defined _M_AMD64) + int ok; + #else + error fail + #endif + ]])], + [AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __ILP32__ || defined _ILP32 + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=x86_64-x32], + [gl_cv_host_cpu_c_abi=x86_64])], + [gl_cv_host_cpu_c_abi=i386]) + ;; + +changequote(,)dnl + alphaev[4-8] | alphaev56 | alphapca5[67] | alphaev6[78] ) +changequote([,])dnl + gl_cv_host_cpu_c_abi=alpha + ;; + + arm* | aarch64 ) + # Assume arm with EABI. + # On arm64 systems, the C compiler may be generating code in one of + # these ABIs: + # - aarch64 instruction set, 64-bit pointers, 64-bit 'long': arm64. + # - aarch64 instruction set, 32-bit pointers, 32-bit 'long': arm64-ilp32. + # - 32-bit instruction set, 32-bit pointers, 32-bit 'long': arm or armhf. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#ifdef __aarch64__ + int ok; + #else + error fail + #endif + ]])], + [AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __ILP32__ || defined _ILP32 + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=arm64-ilp32], + [gl_cv_host_cpu_c_abi=arm64])], + [# Don't distinguish little-endian and big-endian arm, since they + # don't require different machine code for simple operations and + # since the user can distinguish them through the preprocessor + # defines __ARMEL__ vs. __ARMEB__. + # But distinguish arm which passes floating-point arguments and + # return values in integer registers (r0, r1, ...) - this is + # gcc -mfloat-abi=soft or gcc -mfloat-abi=softfp - from arm which + # passes them in float registers (s0, s1, ...) and double registers + # (d0, d1, ...) - this is gcc -mfloat-abi=hard. GCC 4.6 or newer + # sets the preprocessor defines __ARM_PCS (for the first case) and + # __ARM_PCS_VFP (for the second case), but older GCC does not. + echo 'double ddd; void func (double dd) { ddd = dd; }' > conftest.c + # Look for a reference to the register d0 in the .s file. + AC_TRY_COMMAND(${CC-cc} $CFLAGS $CPPFLAGS $gl_c_asm_opt conftest.c) >/dev/null 2>&1 + if LC_ALL=C grep 'd0,' conftest.$gl_asmext >/dev/null; then + gl_cv_host_cpu_c_abi=armhf + else + gl_cv_host_cpu_c_abi=arm + fi + rm -f conftest* + ]) + ;; + + hppa1.0 | hppa1.1 | hppa2.0* | hppa64 ) + # On hppa, the C compiler may be generating 32-bit code or 64-bit + # code. In the latter case, it defines _LP64 and __LP64__. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#ifdef __LP64__ + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=hppa64], + [gl_cv_host_cpu_c_abi=hppa]) + ;; + + ia64* ) + # On ia64 on HP-UX, the C compiler may be generating 64-bit code or + # 32-bit code. In the latter case, it defines _ILP32. 
+ AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#ifdef _ILP32 + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=ia64-ilp32], + [gl_cv_host_cpu_c_abi=ia64]) + ;; + + mips* ) + # We should also check for (_MIPS_SZPTR == 64), but gcc keeps this + # at 32. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined _MIPS_SZLONG && (_MIPS_SZLONG == 64) + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=mips64], + [# In the n32 ABI, _ABIN32 is defined, _ABIO32 is not defined (but + # may later get defined by <sgidefs.h>), and _MIPS_SIM == _ABIN32. + # In the 32 ABI, _ABIO32 is defined, _ABIN32 is not defined (but + # may later get defined by <sgidefs.h>), and _MIPS_SIM == _ABIO32. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if (_MIPS_SIM == _ABIN32) + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=mipsn32], + [gl_cv_host_cpu_c_abi=mips])]) + ;; + + powerpc* ) + # Different ABIs are in use on AIX vs. Mac OS X vs. Linux,*BSD. + # No need to distinguish them here; the caller may distinguish + # them based on the OS. + # On powerpc64 systems, the C compiler may still be generating + # 32-bit code. And on powerpc-ibm-aix systems, the C compiler may + # be generating 64-bit code. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __powerpc64__ || defined _ARCH_PPC64 + int ok; + #else + error fail + #endif + ]])], + [# On powerpc64, there are two ABIs on Linux: The AIX compatible + # one and the ELFv2 one. The latter defines _CALL_ELF=2. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined _CALL_ELF && _CALL_ELF == 2 + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=powerpc64-elfv2], + [gl_cv_host_cpu_c_abi=powerpc64]) + ], + [gl_cv_host_cpu_c_abi=powerpc]) + ;; + + rs6000 ) + gl_cv_host_cpu_c_abi=powerpc + ;; + + riscv32 | riscv64 ) + # There are 2 architectures (with variants): rv32* and rv64*. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if __riscv_xlen == 64 + int ok; + #else + error fail + #endif + ]])], + [cpu=riscv64], + [cpu=riscv32]) + # There are 6 ABIs: ilp32, ilp32f, ilp32d, lp64, lp64f, lp64d. + # Size of 'long' and 'void *': + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __LP64__ + int ok; + #else + error fail + #endif + ]])], + [main_abi=lp64], + [main_abi=ilp32]) + # Float ABIs: + # __riscv_float_abi_double: + # 'float' and 'double' are passed in floating-point registers. + # __riscv_float_abi_single: + # 'float' are passed in floating-point registers. + # __riscv_float_abi_soft: + # No values are passed in floating-point registers. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __riscv_float_abi_double + int ok; + #else + error fail + #endif + ]])], + [float_abi=d], + [AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __riscv_float_abi_single + int ok; + #else + error fail + #endif + ]])], + [float_abi=f], + [float_abi='']) + ]) + gl_cv_host_cpu_c_abi="${cpu}-${main_abi}${float_abi}" + ;; + + s390* ) + # On s390x, the C compiler may be generating 64-bit (= s390x) code + # or 31-bit (= s390) code. + AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __LP64__ || defined __s390x__ + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=s390x], + [gl_cv_host_cpu_c_abi=s390]) + ;; + + sparc | sparc64 ) + # UltraSPARCs running Linux have `uname -m` = "sparc64", but the + # C compiler still generates 32-bit code. 
+ AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#if defined __sparcv9 || defined __arch64__ + int ok; + #else + error fail + #endif + ]])], + [gl_cv_host_cpu_c_abi=sparc64], + [gl_cv_host_cpu_c_abi=sparc]) + ;; + + *) + gl_cv_host_cpu_c_abi="$host_cpu" + ;; + esac + ]) + + dnl In most cases, $HOST_CPU and $HOST_CPU_C_ABI are the same. + HOST_CPU=`echo "$gl_cv_host_cpu_c_abi" | sed -e 's/-.*//'` + HOST_CPU_C_ABI="$gl_cv_host_cpu_c_abi" + AC_SUBST([HOST_CPU]) + AC_SUBST([HOST_CPU_C_ABI]) + + # This was + # AC_DEFINE_UNQUOTED([__${HOST_CPU}__]) + # AC_DEFINE_UNQUOTED([__${HOST_CPU_C_ABI}__]) + # earlier, but KAI C++ 3.2d doesn't like this. + sed -e 's/-/_/g' >> confdefs.h < +#include <stdlib.h> +#include <iconv.h> + ]], + [[iconv_t cd = iconv_open("",""); + iconv(cd,NULL,NULL,NULL,NULL); + iconv_close(cd);]])], + [am_cv_func_iconv=yes]) + if test "$am_cv_func_iconv" != yes; then + am_save_LIBS="$LIBS" + LIBS="$LIBS $LIBICONV" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[ +#include <stdlib.h> +#include <iconv.h> + ]], + [[iconv_t cd = iconv_open("",""); + iconv(cd,NULL,NULL,NULL,NULL); + iconv_close(cd);]])], + [am_cv_lib_iconv=yes] + [am_cv_func_iconv=yes]) + LIBS="$am_save_LIBS" + fi + ]) + if test "$am_cv_func_iconv" = yes; then + AC_CACHE_CHECK([for working iconv], [am_cv_func_iconv_works], [ + dnl This tests against bugs in AIX 5.1, AIX 6.1..7.1, HP-UX 11.11, + dnl Solaris 10. + am_save_LIBS="$LIBS" + if test $am_cv_lib_iconv = yes; then + LIBS="$LIBS $LIBICONV" + fi + am_cv_func_iconv_works=no + for ac_iconv_const in '' 'const'; do + AC_RUN_IFELSE( + [AC_LANG_PROGRAM( + [[ +#include <iconv.h> +#include <string.h> + +#ifndef ICONV_CONST +# define ICONV_CONST $ac_iconv_const +#endif + ]], + [[int result = 0; + /* Test against AIX 5.1 bug: Failures are not distinguishable from successful + returns. */ + { + iconv_t cd_utf8_to_88591 = iconv_open ("ISO8859-1", "UTF-8"); + if (cd_utf8_to_88591 != (iconv_t)(-1)) + { + static ICONV_CONST char input[] = "\342\202\254"; /* EURO SIGN */ + char buf[10]; + ICONV_CONST char *inptr = input; + size_t inbytesleft = strlen (input); + char *outptr = buf; + size_t outbytesleft = sizeof (buf); + size_t res = iconv (cd_utf8_to_88591, + &inptr, &inbytesleft, + &outptr, &outbytesleft); + if (res == 0) + result |= 1; + iconv_close (cd_utf8_to_88591); + } + } + /* Test against Solaris 10 bug: Failures are not distinguishable from + successful returns. */ + { + iconv_t cd_ascii_to_88591 = iconv_open ("ISO8859-1", "646"); + if (cd_ascii_to_88591 != (iconv_t)(-1)) + { + static ICONV_CONST char input[] = "\263"; + char buf[10]; + ICONV_CONST char *inptr = input; + size_t inbytesleft = strlen (input); + char *outptr = buf; + size_t outbytesleft = sizeof (buf); + size_t res = iconv (cd_ascii_to_88591, + &inptr, &inbytesleft, + &outptr, &outbytesleft); + if (res == 0) + result |= 2; + iconv_close (cd_ascii_to_88591); + } + } + /* Test against AIX 6.1..7.1 bug: Buffer overrun. */ + { + iconv_t cd_88591_to_utf8 = iconv_open ("UTF-8", "ISO-8859-1"); + if (cd_88591_to_utf8 != (iconv_t)(-1)) + { + static ICONV_CONST char input[] = "\304"; + static char buf[2] = { (char)0xDE, (char)0xAD }; + ICONV_CONST char *inptr = input; + size_t inbytesleft = 1; + char *outptr = buf; + size_t outbytesleft = 1; + size_t res = iconv (cd_88591_to_utf8, + &inptr, &inbytesleft, + &outptr, &outbytesleft); + if (res != (size_t)(-1) || outptr - buf > 1 || buf[1] != (char)0xAD) + result |= 4; + iconv_close (cd_88591_to_utf8); + } + } +#if 0 /* This bug could be worked around by the caller. */ + /* Test against HP-UX 11.11 bug: Positive return value instead of 0. 
*/ + { + iconv_t cd_88591_to_utf8 = iconv_open ("utf8", "iso88591"); + if (cd_88591_to_utf8 != (iconv_t)(-1)) + { + static ICONV_CONST char input[] = "\304rger mit b\366sen B\374bchen ohne Augenma\337"; + char buf[50]; + ICONV_CONST char *inptr = input; + size_t inbytesleft = strlen (input); + char *outptr = buf; + size_t outbytesleft = sizeof (buf); + size_t res = iconv (cd_88591_to_utf8, + &inptr, &inbytesleft, + &outptr, &outbytesleft); + if ((int)res > 0) + result |= 8; + iconv_close (cd_88591_to_utf8); + } + } +#endif + /* Test against HP-UX 11.11 bug: No converter from EUC-JP to UTF-8 is + provided. */ + { + /* Try standardized names. */ + iconv_t cd1 = iconv_open ("UTF-8", "EUC-JP"); + /* Try IRIX, OSF/1 names. */ + iconv_t cd2 = iconv_open ("UTF-8", "eucJP"); + /* Try AIX names. */ + iconv_t cd3 = iconv_open ("UTF-8", "IBM-eucJP"); + /* Try HP-UX names. */ + iconv_t cd4 = iconv_open ("utf8", "eucJP"); + if (cd1 == (iconv_t)(-1) && cd2 == (iconv_t)(-1) + && cd3 == (iconv_t)(-1) && cd4 == (iconv_t)(-1)) + result |= 16; + if (cd1 != (iconv_t)(-1)) + iconv_close (cd1); + if (cd2 != (iconv_t)(-1)) + iconv_close (cd2); + if (cd3 != (iconv_t)(-1)) + iconv_close (cd3); + if (cd4 != (iconv_t)(-1)) + iconv_close (cd4); + } + return result; +]])], + [am_cv_func_iconv_works=yes], , + [case "$host_os" in + aix* | hpux*) am_cv_func_iconv_works="guessing no" ;; + *) am_cv_func_iconv_works="guessing yes" ;; + esac]) + test "$am_cv_func_iconv_works" = no || break + done + LIBS="$am_save_LIBS" + ]) + case "$am_cv_func_iconv_works" in + *no) am_func_iconv=no am_cv_lib_iconv=no ;; + *) am_func_iconv=yes ;; + esac + else + am_func_iconv=no am_cv_lib_iconv=no + fi + if test "$am_func_iconv" = yes; then + AC_DEFINE([HAVE_ICONV], [1], + [Define if you have the iconv() function and it works.]) + fi + if test "$am_cv_lib_iconv" = yes; then + AC_MSG_CHECKING([how to link with libiconv]) + AC_MSG_RESULT([$LIBICONV]) + else + dnl If $LIBICONV didn't lead to a usable library, we don't need $INCICONV + dnl either. + CPPFLAGS="$am_save_CPPFLAGS" + LIBICONV= + LTLIBICONV= + fi + AC_SUBST([LIBICONV]) + AC_SUBST([LTLIBICONV]) +]) + +dnl Define AM_ICONV using AC_DEFUN_ONCE for Autoconf >= 2.64, in order to +dnl avoid warnings like +dnl "warning: AC_REQUIRE: `AM_ICONV' was expanded before it was required". +dnl This is tricky because of the way 'aclocal' is implemented: +dnl - It requires defining an auxiliary macro whose name ends in AC_DEFUN. +dnl Otherwise aclocal's initial scan pass would miss the macro definition. +dnl - It requires a line break inside the AC_DEFUN_ONCE and AC_DEFUN expansions. +dnl Otherwise aclocal would emit many "Use of uninitialized value $1" +dnl warnings. 
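+
+dnl Usage sketch (illustrative, not part of the upstream file): a package
+dnl normally just invokes
+dnl
+dnl   AM_ICONV
+dnl   LIBS="$LIBS $LIBICONV"
+dnl
+dnl and guards conversion code with HAVE_ICONV; am_cv_func_iconv, LIBICONV
+dnl and ICONV_CONST are provided by AM_ICONV_LINK above and AM_ICONV below.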
+m4_define([gl_iconv_AC_DEFUN], + m4_version_prereq([2.64], + [[AC_DEFUN_ONCE( + [$1], [$2])]], + [m4_ifdef([gl_00GNULIB], + [[AC_DEFUN_ONCE( + [$1], [$2])]], + [[AC_DEFUN( + [$1], [$2])]])])) +gl_iconv_AC_DEFUN([AM_ICONV], +[ + AM_ICONV_LINK + if test "$am_cv_func_iconv" = yes; then + AC_MSG_CHECKING([for iconv declaration]) + AC_CACHE_VAL([am_cv_proto_iconv], [ + AC_COMPILE_IFELSE( + [AC_LANG_PROGRAM( + [[ +#include <stdlib.h> +#include <iconv.h> +extern +#ifdef __cplusplus +"C" +#endif +#if defined(__STDC__) || defined(_MSC_VER) || defined(__cplusplus) +size_t iconv (iconv_t cd, char * *inbuf, size_t *inbytesleft, char * *outbuf, size_t *outbytesleft); +#else +size_t iconv(); +#endif + ]], + [[]])], + [am_cv_proto_iconv_arg1=""], + [am_cv_proto_iconv_arg1="const"]) + am_cv_proto_iconv="extern size_t iconv (iconv_t cd, $am_cv_proto_iconv_arg1 char * *inbuf, size_t *inbytesleft, char * *outbuf, size_t *outbytesleft);"]) + am_cv_proto_iconv=`echo "[$]am_cv_proto_iconv" | tr -s ' ' | sed -e 's/( /(/'` + AC_MSG_RESULT([$am_cv_proto_iconv]) + else + dnl When compiling GNU libiconv on a system that does not have iconv yet, + dnl pick the POSIX compliant declaration without 'const'. + am_cv_proto_iconv_arg1="" + fi + AC_DEFINE_UNQUOTED([ICONV_CONST], [$am_cv_proto_iconv_arg1], + [Define as const if the declaration of iconv() needs const.]) + dnl Also substitute ICONV_CONST in the gnulib generated <iconv.h>. + m4_ifdef([gl_ICONV_H_DEFAULTS], + [AC_REQUIRE([gl_ICONV_H_DEFAULTS]) + if test -n "$am_cv_proto_iconv_arg1"; then + ICONV_CONST="const" + fi + ]) +]) diff --git a/config/intlmacosx.m4 b/config/intlmacosx.m4 new file mode 100644 index 000000000000..30e6f50e0ac6 --- /dev/null +++ b/config/intlmacosx.m4 @@ -0,0 +1,72 @@ +# intlmacosx.m4 serial 6 (gettext-0.20) +dnl Copyright (C) 2004-2014, 2016, 2019 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. +dnl +dnl This file can be used in projects which are not available under +dnl the GNU General Public License or the GNU Library General Public +dnl License but which still want to provide support for the GNU gettext +dnl functionality. +dnl Please note that the actual code of the GNU gettext library is covered +dnl by the GNU Library General Public License, and the rest of the GNU +dnl gettext package is covered by the GNU General Public License. +dnl They are *not* in the public domain. + +dnl Checks for special options needed on Mac OS X. +dnl Defines INTL_MACOSX_LIBS. +AC_DEFUN([gt_INTL_MACOSX], +[ + dnl Check for API introduced in Mac OS X 10.4. + AC_CACHE_CHECK([for CFPreferencesCopyAppValue], + [gt_cv_func_CFPreferencesCopyAppValue], + [gt_save_LIBS="$LIBS" + LIBS="$LIBS -Wl,-framework -Wl,CoreFoundation" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <CoreFoundation/CFPreferences.h>]], + [[CFPreferencesCopyAppValue(NULL, NULL)]])], + [gt_cv_func_CFPreferencesCopyAppValue=yes], + [gt_cv_func_CFPreferencesCopyAppValue=no]) + LIBS="$gt_save_LIBS"]) + if test $gt_cv_func_CFPreferencesCopyAppValue = yes; then + AC_DEFINE([HAVE_CFPREFERENCESCOPYAPPVALUE], [1], + [Define to 1 if you have the Mac OS X function CFPreferencesCopyAppValue in the CoreFoundation framework.]) + fi + dnl Check for API introduced in Mac OS X 10.5. 
+ AC_CACHE_CHECK([for CFLocaleCopyCurrent], [gt_cv_func_CFLocaleCopyCurrent], + [gt_save_LIBS="$LIBS" + LIBS="$LIBS -Wl,-framework -Wl,CoreFoundation" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <CoreFoundation/CFLocale.h>]], + [[CFLocaleCopyCurrent();]])], + [gt_cv_func_CFLocaleCopyCurrent=yes], + [gt_cv_func_CFLocaleCopyCurrent=no]) + LIBS="$gt_save_LIBS"]) + if test $gt_cv_func_CFLocaleCopyCurrent = yes; then + AC_DEFINE([HAVE_CFLOCALECOPYCURRENT], [1], + [Define to 1 if you have the Mac OS X function CFLocaleCopyCurrent in the CoreFoundation framework.]) + fi + AC_CACHE_CHECK([for CFLocaleCopyPreferredLanguages], [gt_cv_func_CFLocaleCopyPreferredLanguages], + [gt_save_LIBS="$LIBS" + LIBS="$LIBS -Wl,-framework -Wl,CoreFoundation" + AC_LINK_IFELSE( + [AC_LANG_PROGRAM( + [[#include <CoreFoundation/CFLocale.h>]], + [[CFLocaleCopyPreferredLanguages();]])], + [gt_cv_func_CFLocaleCopyPreferredLanguages=yes], + [gt_cv_func_CFLocaleCopyPreferredLanguages=no]) + LIBS="$gt_save_LIBS"]) + if test $gt_cv_func_CFLocaleCopyPreferredLanguages = yes; then + AC_DEFINE([HAVE_CFLOCALECOPYPREFERREDLANGUAGES], [1], + [Define to 1 if you have the Mac OS X function CFLocaleCopyPreferredLanguages in the CoreFoundation framework.]) + fi + INTL_MACOSX_LIBS= + if test $gt_cv_func_CFPreferencesCopyAppValue = yes \ + || test $gt_cv_func_CFLocaleCopyCurrent = yes \ + || test $gt_cv_func_CFLocaleCopyPreferredLanguages = yes; then + INTL_MACOSX_LIBS="-Wl,-framework -Wl,CoreFoundation" + fi + AC_SUBST([INTL_MACOSX_LIBS]) +]) diff --git a/config/kernel-access-ok-type.m4 b/config/kernel-access-ok-type.m4 new file mode 100644 index 000000000000..dc9433458703 --- /dev/null +++ b/config/kernel-access-ok-type.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 5.0: access_ok() drops 'type' parameter: +dnl # +dnl # - access_ok(type, addr, size) +dnl # + access_ok(addr, size) +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE], [ + ZFS_LINUX_TEST_SRC([access_ok_type], [ + #include <linux/uaccess.h> + ],[ + const void __user __attribute__((unused)) *addr = + (void *) 0xdeadbeef; + unsigned long __attribute__((unused)) size = 1; + int error __attribute__((unused)) = access_ok(0, addr, size); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_ACCESS_OK_TYPE], [ + AC_MSG_CHECKING([whether access_ok() has 'type' parameter]) + ZFS_LINUX_TEST_RESULT([access_ok_type], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ACCESS_OK_TYPE, 1, + [kernel has access_ok with 'type' parameter]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-acl.m4 b/config/kernel-acl.m4 new file mode 100644 index 000000000000..0f1c24656730 --- /dev/null +++ b/config/kernel-acl.m4 @@ -0,0 +1,289 @@ +dnl # +dnl # Check if posix_acl_release can be used from a ZFS_META_LICENSED +dnl # module. 
The is_owner_or_cap macro was replaced by +dnl # inode_owner_or_capable +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_RELEASE], [ + ZFS_LINUX_TEST_SRC([posix_acl_release], [ + #include <linux/cred.h> + #include <linux/fs.h> + #include <linux/posix_acl.h> + ], [ + struct posix_acl *tmp = posix_acl_alloc(1, 0); + posix_acl_release(tmp); + ], [], [$ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_RELEASE], [ + AC_MSG_CHECKING([whether posix_acl_release() is available]) + ZFS_LINUX_TEST_RESULT([posix_acl_release], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_POSIX_ACL_RELEASE, 1, + [posix_acl_release() is available]) + + AC_MSG_CHECKING([whether posix_acl_release() is GPL-only]) + ZFS_LINUX_TEST_RESULT([posix_acl_release_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_POSIX_ACL_RELEASE_GPL_ONLY, 1, + [posix_acl_release() is GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 3.14 API change, +dnl # set_cached_acl() and forget_cached_acl() changed from inline to +dnl # EXPORT_SYMBOL. In the former case, they may not be usable because of +dnl # posix_acl_release. In the latter case, we can always use them. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_CACHED_ACL_USABLE], [ + ZFS_LINUX_TEST_SRC([set_cached_acl], [ + #include <linux/cred.h> + #include <linux/fs.h> + #include <linux/posix_acl.h> + ], [ + struct inode *ip = NULL; + struct posix_acl *acl = posix_acl_alloc(1, 0); + set_cached_acl(ip, ACL_TYPE_ACCESS, acl); + forget_cached_acl(ip, ACL_TYPE_ACCESS); + ], [], [$ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE], [ + AC_MSG_CHECKING([whether set_cached_acl() is usable]) + ZFS_LINUX_TEST_RESULT([set_cached_acl_license], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_CACHED_ACL_USABLE, 1, + [set_cached_acl() is usable]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 3.1 API change, +dnl # posix_acl_chmod() was added as the preferred interface. +dnl # +dnl # 3.14 API change, +dnl # posix_acl_chmod() was changed to __posix_acl_chmod() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_CHMOD], [ + ZFS_LINUX_TEST_SRC([posix_acl_chmod], [ + #include <linux/fs.h> + #include <linux/posix_acl.h> + ],[ + posix_acl_chmod(NULL, 0, 0) + ]) + + ZFS_LINUX_TEST_SRC([__posix_acl_chmod], [ + #include <linux/fs.h> + #include <linux/posix_acl.h> + ],[ + __posix_acl_chmod(NULL, 0, 0) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_CHMOD], [ + AC_MSG_CHECKING([whether __posix_acl_chmod exists]) + ZFS_LINUX_TEST_RESULT([__posix_acl_chmod], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE___POSIX_ACL_CHMOD, 1, + [__posix_acl_chmod() exists]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether posix_acl_chmod exists]) + ZFS_LINUX_TEST_RESULT([posix_acl_chmod], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_POSIX_ACL_CHMOD, 1, + [posix_acl_chmod() exists]) + ],[ + ZFS_LINUX_TEST_ERROR([posix_acl_chmod()]) + ]) + ]) +]) + +dnl # +dnl # 3.1 API change, +dnl # posix_acl_equiv_mode now wants an umode_t instead of a mode_t +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ + ZFS_LINUX_TEST_SRC([posix_acl_equiv_mode], [ + #include <linux/fs.h> + #include <linux/posix_acl.h> + ],[ + umode_t tmp; + posix_acl_equiv_mode(NULL, &tmp); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T], [ + AC_MSG_CHECKING([whether posix_acl_equiv_mode() wants umode_t]) + ZFS_LINUX_TEST_RESULT([posix_acl_equiv_mode], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([posix_acl_equiv_mode()]) + ]) +]) + +dnl # +dnl # 4.8 API change, +dnl # The function posix_acl_valid now must be passed a namespace. 
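+dnl # In the before/after notation used elsewhere in this file:
+dnl #
+dnl #   - error = posix_acl_valid(acl);
+dnl #   + error = posix_acl_valid(user_ns, acl);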
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_VALID_WITH_NS], [ + ZFS_LINUX_TEST_SRC([posix_acl_valid_with_ns], [ + #include <linux/fs.h> + #include <linux/posix_acl.h> + ],[ + struct user_namespace *user_ns = NULL; + const struct posix_acl *acl = NULL; + int error; + + error = posix_acl_valid(user_ns, acl); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS], [ + AC_MSG_CHECKING([whether posix_acl_valid() wants user namespace]) + ZFS_LINUX_TEST_RESULT([posix_acl_valid_with_ns], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_POSIX_ACL_VALID_WITH_NS, 1, + [posix_acl_valid() wants user namespace]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 3.1 API change, +dnl # Check if inode_operations contains the function get_acl +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [ + ZFS_LINUX_TEST_SRC([inode_operations_get_acl], [ + #include <linux/fs.h> + + struct posix_acl *get_acl_fn(struct inode *inode, int type) + { return NULL; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .get_acl = get_acl_fn, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL], [ + AC_MSG_CHECKING([whether iops->get_acl() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_get_acl], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([iops->get_acl()]) + ]) +]) + +dnl # +dnl # 3.14 API change, +dnl # Check if inode_operations contains the function set_acl +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ + ZFS_LINUX_TEST_SRC([inode_operations_set_acl], [ + #include <linux/fs.h> + + int set_acl_fn(struct inode *inode, struct posix_acl *acl, + int type) { return 0; } + + static const struct inode_operations + iops __attribute__ ((unused)) = { + .set_acl = set_acl_fn, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL], [ + AC_MSG_CHECKING([whether iops->set_acl() exists]) + ZFS_LINUX_TEST_RESULT([inode_operations_set_acl], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SET_ACL, 1, [iops->set_acl() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.7 API change, +dnl # The kernel get_acl will now check cache before calling i_op->get_acl and +dnl # do set_cached_acl after that, so i_op->get_acl don't need to do that +dnl # anymore. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_ACL_HANDLE_CACHE], [ + ZFS_LINUX_TEST_SRC([get_acl_handle_cache], [ + #include <linux/posix_acl.h> + ],[ + void *sentinel __attribute__ ((unused)) = + uncached_acl_sentinel(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE], [ + AC_MSG_CHECKING([whether uncached_acl_sentinel() exists]) + ZFS_LINUX_TEST_RESULT([get_acl_handle_cache], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_KERNEL_GET_ACL_HANDLE_CACHE, 1, + [uncached_acl_sentinel() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.16 kernel: check if struct posix_acl acl.a_refcount is a refcount_t. +dnl # It's an atomic_t on older kernels. 
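+dnl # (Hypothetical consumer sketch, not part of this check: code reading the
+dnl # count would branch on the HAVE_ACL_REFCOUNT define below, e.g.
+dnl #
+dnl #   #ifdef HAVE_ACL_REFCOUNT
+dnl #       refcount_read(&acl->a_refcount);
+dnl #   #else
+dnl #       atomic_read(&acl->a_refcount);
+dnl #   #endif
+dnl #
+dnl # for a struct posix_acl *acl.)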
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_ACL_HAS_REFCOUNT], [ + ZFS_LINUX_TEST_SRC([acl_refcount], [ + #include <linux/backing-dev.h> + #include <linux/refcount.h> + #include <linux/posix_acl.h> + ],[ + struct posix_acl acl; + refcount_t *r __attribute__ ((unused)) = &acl.a_refcount; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_ACL_HAS_REFCOUNT], [ + AC_MSG_CHECKING([whether posix_acl has refcount_t]) + ZFS_LINUX_TEST_RESULT([acl_refcount], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ACL_REFCOUNT, 1, [posix_acl has refcount_t]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_ACL], [ + ZFS_AC_KERNEL_SRC_POSIX_ACL_RELEASE + ZFS_AC_KERNEL_SRC_SET_CACHED_ACL_USABLE + ZFS_AC_KERNEL_SRC_POSIX_ACL_CHMOD + ZFS_AC_KERNEL_SRC_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T + ZFS_AC_KERNEL_SRC_POSIX_ACL_VALID_WITH_NS + ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL + ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL + ZFS_AC_KERNEL_SRC_GET_ACL_HANDLE_CACHE + ZFS_AC_KERNEL_SRC_ACL_HAS_REFCOUNT +]) + +AC_DEFUN([ZFS_AC_KERNEL_ACL], [ + ZFS_AC_KERNEL_POSIX_ACL_RELEASE + ZFS_AC_KERNEL_SET_CACHED_ACL_USABLE + ZFS_AC_KERNEL_POSIX_ACL_CHMOD + ZFS_AC_KERNEL_POSIX_ACL_EQUIV_MODE_WANTS_UMODE_T + ZFS_AC_KERNEL_POSIX_ACL_VALID_WITH_NS + ZFS_AC_KERNEL_INODE_OPERATIONS_GET_ACL + ZFS_AC_KERNEL_INODE_OPERATIONS_SET_ACL + ZFS_AC_KERNEL_GET_ACL_HANDLE_CACHE + ZFS_AC_KERNEL_ACL_HAS_REFCOUNT +]) diff --git a/config/kernel-aio-fsync.m4 b/config/kernel-aio-fsync.m4 new file mode 100644 index 000000000000..b4dbf29ba781 --- /dev/null +++ b/config/kernel-aio-fsync.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # Linux 4.9-rc5+ ABI, removal of the .aio_fsync field +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_AIO_FSYNC], [ + ZFS_LINUX_TEST_SRC([aio_fsync], [ + #include <linux/fs.h> + + static const struct file_operations + fops __attribute__ ((unused)) = { + .aio_fsync = NULL, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_AIO_FSYNC], [ + AC_MSG_CHECKING([whether fops->aio_fsync() exists]) + ZFS_LINUX_TEST_RESULT([aio_fsync], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_AIO_FSYNC, 1, [fops->aio_fsync() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-automount.m4 b/config/kernel-automount.m4 new file mode 100644 index 000000000000..f7bb63c68154 --- /dev/null +++ b/config/kernel-automount.m4 @@ -0,0 +1,25 @@ +dnl # +dnl # 2.6.37 API change +dnl # The dops->d_automount() dentry operation was added as a clean +dnl # solution to handling automounts. Prior to this, cifs/nfs clients +dnl # which required automount support would abuse the follow_link() +dnl # operation on directories for this purpose. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_AUTOMOUNT], [ + ZFS_LINUX_TEST_SRC([dentry_operations_d_automount], [ + #include <linux/dcache.h> + struct vfsmount *d_automount(struct path *p) { return NULL; } + struct dentry_operations dops __attribute__ ((unused)) = { + .d_automount = d_automount, + }; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_AUTOMOUNT], [ + AC_MSG_CHECKING([whether dops->d_automount() exists]) + ZFS_LINUX_TEST_RESULT([dentry_operations_d_automount], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([dops->d_automount()]) + ]) +]) diff --git a/config/kernel-bdi.m4 b/config/kernel-bdi.m4 new file mode 100644 index 000000000000..9351df71b4b8 --- /dev/null +++ b/config/kernel-bdi.m4 @@ -0,0 +1,79 @@ +dnl # +dnl # Check available BDI interfaces. 
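+dnl # For orientation, the three generations probed below are (ranges taken
+dnl # from the inline comments that follow):
+dnl #
+dnl #   super_setup_bdi_name(sb, fmt, ...)       4.12+
+dnl #   bdi_setup_and_register(bdi, name)        4.0 - 4.11
+dnl #   bdi_setup_and_register(bdi, name, cap)   2.6.34 - 3.19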
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BDI], [ + ZFS_LINUX_TEST_SRC([super_setup_bdi_name], [ + #include <linux/fs.h> + struct super_block sb; + ], [ + char *name = "bdi"; + atomic_long_t zfs_bdi_seq; + int error __attribute__((unused)) = + super_setup_bdi_name(&sb, "%.28s-%ld", name, + atomic_long_inc_return(&zfs_bdi_seq)); + ]) + + ZFS_LINUX_TEST_SRC([bdi_setup_and_register], [ + #include <linux/backing-dev.h> + struct backing_dev_info bdi; + ], [ + char *name = "bdi"; + int error __attribute__((unused)) = + bdi_setup_and_register(&bdi, name); + ]) + + ZFS_LINUX_TEST_SRC([bdi_setup_and_register_3args], [ + #include <linux/backing-dev.h> + struct backing_dev_info bdi; + ], [ + char *name = "bdi"; + unsigned int cap = BDI_CAP_MAP_COPY; + int error __attribute__((unused)) = + bdi_setup_and_register(&bdi, name, cap); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BDI], [ + dnl # + dnl # 4.12, super_setup_bdi_name() introduced. + dnl # + AC_MSG_CHECKING([whether super_setup_bdi_name() exists]) + ZFS_LINUX_TEST_RESULT_SYMBOL([super_setup_bdi_name], + [super_setup_bdi_name], [fs/super.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SUPER_SETUP_BDI_NAME, 1, + [super_setup_bdi_name() exists]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. + dnl # + AC_MSG_CHECKING( + [whether bdi_setup_and_register() wants 2 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([bdi_setup_and_register], + [bdi_setup_and_register], [mm/backing-dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_BDI_SETUP_AND_REGISTER, 1, + [bdi_setup_and_register() wants 2 args]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # 2.6.34 - 3.19, bdi_setup_and_register() + dnl # takes 3 arguments. + dnl # + AC_MSG_CHECKING( + [whether bdi_setup_and_register() wants 3 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL( + [bdi_setup_and_register_3args], + [bdi_setup_and_register], [mm/backing-dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_3ARGS_BDI_SETUP_AND_REGISTER, 1, + [bdi_setup_and_register() wants 3 args]) + ], [ + ZFS_LINUX_TEST_ERROR([bdi_setup]) + ]) + ]) + ]) +]) diff --git a/config/kernel-bio.m4 b/config/kernel-bio.m4 new file mode 100644 index 000000000000..afa1f1cabeb0 --- /dev/null +++ b/config/kernel-bio.m4 @@ -0,0 +1,403 @@ +dnl # +dnl # 2.6.36 API change, +dnl # REQ_FAILFAST_{DEV|TRANSPORT|DRIVER} +dnl # REQ_DISCARD +dnl # REQ_FLUSH +dnl # +dnl # 4.8 - 4.9 API, +dnl # REQ_FLUSH was renamed to REQ_PREFLUSH +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REQ], [ + ZFS_LINUX_TEST_SRC([req_failfast_mask], [ + #include <linux/bio.h> + ],[ + int flags __attribute__ ((unused)); + flags = REQ_FAILFAST_MASK; + ]) + + ZFS_LINUX_TEST_SRC([req_discard], [ + #include <linux/bio.h> + ],[ + int flags __attribute__ ((unused)); + flags = REQ_DISCARD; + ]) + + ZFS_LINUX_TEST_SRC([req_flush], [ + #include <linux/bio.h> + ],[ + int flags __attribute__ ((unused)); + flags = REQ_FLUSH; + ]) + + ZFS_LINUX_TEST_SRC([req_preflush], [ + #include <linux/bio.h> + ],[ + int flags __attribute__ ((unused)); + flags = REQ_PREFLUSH; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_FAILFAST_MASK], [ + AC_MSG_CHECKING([whether REQ_FAILFAST_MASK is defined]) + ZFS_LINUX_TEST_RESULT([req_failfast_mask], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([REQ_FAILFAST_MASK]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_DISCARD], [ + AC_MSG_CHECKING([whether REQ_DISCARD is defined]) + ZFS_LINUX_TEST_RESULT([req_discard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_DISCARD, 1, [REQ_DISCARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_FLUSH], [ + AC_MSG_CHECKING([whether REQ_FLUSH is defined]) + ZFS_LINUX_TEST_RESULT([req_flush], [ + 
AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_FLUSH, 1, [REQ_FLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_PREFLUSH], [ + AC_MSG_CHECKING([whether REQ_PREFLUSH is defined]) + ZFS_LINUX_TEST_RESULT([req_preflush], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_PREFLUSH, 1, [REQ_PREFLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # Linux 4.8 API, +dnl # +dnl # The bio_op() helper was introduced as a replacement for explicitly +dnl # checking the bio->bi_rw flags. The following checks are used to +dnl # detect if a specific operation is supported. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_OPS], [ + ZFS_LINUX_TEST_SRC([req_op_discard], [ + #include <linux/bio.h> + ],[ + int op __attribute__ ((unused)) = REQ_OP_DISCARD; + ]) + + ZFS_LINUX_TEST_SRC([req_op_secure_erase], [ + #include <linux/bio.h> + ],[ + int op __attribute__ ((unused)) = REQ_OP_SECURE_ERASE; + ]) + + ZFS_LINUX_TEST_SRC([req_op_flush], [ + #include <linux/bio.h> + ],[ + int op __attribute__ ((unused)) = REQ_OP_FLUSH; + ]) + + ZFS_LINUX_TEST_SRC([bio_bi_opf], [ + #include <linux/bio.h> + ],[ + struct bio bio __attribute__ ((unused)); + bio.bi_opf = 0; + ]) + + ZFS_LINUX_TEST_SRC([bio_set_op_attrs], [ + #include <linux/bio.h> + ],[ + struct bio *bio __attribute__ ((unused)) = NULL; + bio_set_op_attrs(bio, 0, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_OP_DISCARD], [ + AC_MSG_CHECKING([whether REQ_OP_DISCARD is defined]) + ZFS_LINUX_TEST_RESULT([req_op_discard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_OP_DISCARD, 1, [REQ_OP_DISCARD is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_OP_SECURE_ERASE], [ + AC_MSG_CHECKING([whether REQ_OP_SECURE_ERASE is defined]) + ZFS_LINUX_TEST_RESULT([req_op_secure_erase], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_OP_SECURE_ERASE, 1, + [REQ_OP_SECURE_ERASE is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_REQ_OP_FLUSH], [ + AC_MSG_CHECKING([whether REQ_OP_FLUSH is defined]) + ZFS_LINUX_TEST_RESULT([req_op_flush], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_REQ_OP_FLUSH, 1, [REQ_OP_FLUSH is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_BI_OPF], [ + AC_MSG_CHECKING([whether bio->bi_opf is defined]) + ZFS_LINUX_TEST_RESULT([bio_bi_opf], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BI_OPF, 1, [bio->bi_opf is defined]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_OP_ATTRS], [ + AC_MSG_CHECKING([whether bio_set_op_attrs is available]) + ZFS_LINUX_TEST_RESULT([bio_set_op_attrs], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_OP_ATTRS, 1, + [bio_set_op_attrs is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # Linux 4.14 API, +dnl # +dnl # The bio_set_dev() helper macro was introduced as part of the transition +dnl # to have struct gendisk in struct bio. +dnl # +dnl # Linux 5.0 API, +dnl # +dnl # The bio_set_dev() helper macro was updated to internally depend on +dnl # bio_associate_blkg() symbol which is exported GPL-only. 
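+dnl # (Sketch of the fallback this probe enables -- an assumption, not code
+dnl # from this file: when HAVE_BIO_SET_DEV_GPL_ONLY is defined below, a CDDL
+dnl # module cannot call the helper and would open-code the plain assignment,
+dnl # e.g. bio->bi_disk = bdev->bd_disk, forgoing the GPL-only blkg
+dnl # association.)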
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_SET_DEV], [ + ZFS_LINUX_TEST_SRC([bio_set_dev], [ + #include <linux/bio.h> + #include <linux/fs.h> + ],[ + struct block_device *bdev = NULL; + struct bio *bio = NULL; + bio_set_dev(bio, bdev); + ], [], [$ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_SET_DEV], [ + AC_MSG_CHECKING([whether bio_set_dev() is available]) + ZFS_LINUX_TEST_RESULT([bio_set_dev], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_DEV, 1, [bio_set_dev() is available]) + + AC_MSG_CHECKING([whether bio_set_dev() is GPL-only]) + ZFS_LINUX_TEST_RESULT([bio_set_dev_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_SET_DEV_GPL_ONLY, 1, + [bio_set_dev() GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.3 API change +dnl # Error argument dropped from bio_endio in favor of newly introduced +dnl # bio->bi_error. This also replaces bio->bi_flags value BIO_UPTODATE. +dnl # Introduced by torvalds/linux@4246a0b63bd8f56a1469b12eafeb875b1041a451 +dnl # ("block: add a bi_error field to struct bio"). +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_END_IO_T_ARGS], [ + ZFS_LINUX_TEST_SRC([bio_end_io_t_args], [ + #include <linux/bio.h> + void wanted_end_io(struct bio *bio) { return; } + bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io; + ], []) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_END_IO_T_ARGS], [ + AC_MSG_CHECKING([whether bio_end_io_t wants 1 arg]) + ZFS_LINUX_TEST_RESULT([bio_end_io_t_args], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_BIO_END_IO_T, 1, + [bio_end_io_t wants 1 arg]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.13 API change +dnl # The bio->bi_error field was replaced with bio->bi_status which is an +dnl # enum which describes all possible error types. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_BI_STATUS], [ + ZFS_LINUX_TEST_SRC([bio_bi_status], [ + #include <linux/bio.h> + ], [ + struct bio bio __attribute__ ((unused)); + blk_status_t status __attribute__ ((unused)) = BLK_STS_OK; + bio.bi_status = status; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_BI_STATUS], [ + AC_MSG_CHECKING([whether bio->bi_status exists]) + ZFS_LINUX_TEST_RESULT([bio_bi_status], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BI_STATUS, 1, [bio->bi_status exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 3.14 API change, +dnl # Immutable biovecs. A number of fields of struct bio are moved to +dnl # struct bvec_iter. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_BVEC_ITER], [ + ZFS_LINUX_TEST_SRC([bio_bvec_iter], [ + #include <linux/bio.h> + ],[ + struct bio bio; + bio.bi_iter.bi_sector = 0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_BVEC_ITER], [ + AC_MSG_CHECKING([whether bio has bi_iter]) + ZFS_LINUX_TEST_RESULT([bio_bvec_iter], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BIO_BVEC_ITER, 1, [bio has bi_iter]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.8 API change +dnl # The rw argument has been removed from submit_bio/submit_bio_wait. +dnl # Callers are now expected to set bio->bi_rw instead of passing it in. 
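+dnl # In the before/after notation used by these checks:
+dnl #
+dnl #   - submit_bio(rw, bio);
+dnl #   + submit_bio(bio);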
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_SUBMIT_BIO], [ + ZFS_LINUX_TEST_SRC([submit_bio], [ + #include + ],[ + blk_qc_t blk_qc; + struct bio *bio = NULL; + blk_qc = submit_bio(bio); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_SUBMIT_BIO], [ + AC_MSG_CHECKING([whether submit_bio() wants 1 arg]) + ZFS_LINUX_TEST_RESULT([submit_bio], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_SUBMIT_BIO, 1, [submit_bio() wants 1 arg]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.34 API change +dnl # current->bio_list +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_CURRENT_BIO_LIST], [ + ZFS_LINUX_TEST_SRC([current_bio_list], [ + #include + ], [ + current->bio_list = (struct bio_list *) NULL; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO_CURRENT_BIO_LIST], [ + AC_MSG_CHECKING([whether current->bio_list exists]) + ZFS_LINUX_TEST_RESULT([current_bio_list], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bio_list]) + ]) +]) + +dnl # +dnl # Linux 5.5 API, +dnl # +dnl # The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by +dnl # blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). +dnl # As a side effect the function was converted to GPL-only. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKG_TRYGET], [ + ZFS_LINUX_TEST_SRC([blkg_tryget], [ + #include + #include + #include + ],[ + struct blkcg_gq blkg __attribute__ ((unused)); + bool rc __attribute__ ((unused)); + rc = blkg_tryget(&blkg); + ], [], [$ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKG_TRYGET], [ + AC_MSG_CHECKING([whether blkg_tryget() is available]) + ZFS_LINUX_TEST_RESULT([blkg_tryget], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKG_TRYGET, 1, [blkg_tryget() is available]) + + AC_MSG_CHECKING([whether blkg_tryget() is GPL-only]) + ZFS_LINUX_TEST_RESULT([blkg_tryget_license], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKG_TRYGET_GPL_ONLY, 1, + [blkg_tryget() GPL-only]) + ]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO], [ + ZFS_AC_KERNEL_SRC_REQ + ZFS_AC_KERNEL_SRC_BIO_OPS + ZFS_AC_KERNEL_SRC_BIO_SET_DEV + ZFS_AC_KERNEL_SRC_BIO_END_IO_T_ARGS + ZFS_AC_KERNEL_SRC_BIO_BI_STATUS + ZFS_AC_KERNEL_SRC_BIO_BVEC_ITER + ZFS_AC_KERNEL_SRC_BIO_SUBMIT_BIO + ZFS_AC_KERNEL_SRC_BIO_CURRENT_BIO_LIST + ZFS_AC_KERNEL_SRC_BLKG_TRYGET +]) + +AC_DEFUN([ZFS_AC_KERNEL_BIO], [ + ZFS_AC_KERNEL_BIO_REQ_FAILFAST_MASK + ZFS_AC_KERNEL_BIO_REQ_DISCARD + ZFS_AC_KERNEL_BIO_REQ_FLUSH + ZFS_AC_KERNEL_BIO_REQ_PREFLUSH + + ZFS_AC_KERNEL_BIO_REQ_OP_DISCARD + ZFS_AC_KERNEL_BIO_REQ_OP_SECURE_ERASE + ZFS_AC_KERNEL_BIO_REQ_OP_FLUSH + ZFS_AC_KERNEL_BIO_BI_OPF + ZFS_AC_KERNEL_BIO_SET_OP_ATTRS + + ZFS_AC_KERNEL_BIO_SET_DEV + ZFS_AC_KERNEL_BIO_END_IO_T_ARGS + ZFS_AC_KERNEL_BIO_BI_STATUS + ZFS_AC_KERNEL_BIO_BVEC_ITER + ZFS_AC_KERNEL_BIO_SUBMIT_BIO + ZFS_AC_KERNEL_BIO_CURRENT_BIO_LIST + ZFS_AC_KERNEL_BLKG_TRYGET +]) diff --git a/config/kernel-blk-queue.m4 b/config/kernel-blk-queue.m4 new file mode 100644 index 000000000000..382ebefd34a3 --- /dev/null +++ b/config/kernel-blk-queue.m4 @@ -0,0 +1,302 @@ +dnl # +dnl # 2.6.39 API change, +dnl # blk_start_plug() and blk_finish_plug() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG], [ + ZFS_LINUX_TEST_SRC([blk_plug], [ + #include + ],[ + struct blk_plug plug __attribute__ ((unused)); + + blk_start_plug(&plug); + blk_finish_plug(&plug); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_PLUG], [ + AC_MSG_CHECKING([whether struct blk_plug is available]) + ZFS_LINUX_TEST_RESULT([blk_plug], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([blk_plug]) + ]) +]) + +dnl # +dnl # 2.6.32 
- 4.11, statically allocated bdi in request_queue +dnl # 4.12 - x.y, dynamically allocated bdi in request_queue +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI], [ + ZFS_LINUX_TEST_SRC([blk_queue_bdi], [ + #include + ],[ + struct request_queue q; + struct backing_dev_info bdi; + q.backing_dev_info = &bdi; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_BDI], [ + AC_MSG_CHECKING([whether blk_queue bdi is dynamic]) + ZFS_LINUX_TEST_RESULT([blk_queue_bdi], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_BDI_DYNAMIC, 1, + [blk queue backing_dev_info is dynamic]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.32 - 4.x API, +dnl # blk_queue_discard() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD], [ + ZFS_LINUX_TEST_SRC([blk_queue_discard], [ + #include + ],[ + struct request_queue *q __attribute__ ((unused)) = NULL; + int value __attribute__ ((unused)); + value = blk_queue_discard(q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_DISCARD], [ + AC_MSG_CHECKING([whether blk_queue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_discard], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([blk_queue_discard]) + ]) +]) + +dnl # +dnl # 4.8 - 4.x API, +dnl # blk_queue_secure_erase() +dnl # +dnl # 2.6.36 - 4.7 API, +dnl # blk_queue_secdiscard() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE], [ + ZFS_LINUX_TEST_SRC([blk_queue_secure_erase], [ + #include + ],[ + struct request_queue *q __attribute__ ((unused)) = NULL; + int value __attribute__ ((unused)); + value = blk_queue_secure_erase(q); + ]) + + ZFS_LINUX_TEST_SRC([blk_queue_secdiscard], [ + #include + ],[ + struct request_queue *q __attribute__ ((unused)) = NULL; + int value __attribute__ ((unused)); + value = blk_queue_secdiscard(q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE], [ + AC_MSG_CHECKING([whether blk_queue_secure_erase() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_secure_erase], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_SECURE_ERASE, 1, + [blk_queue_secure_erase() is available]) + ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blk_queue_secdiscard() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_secdiscard], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_SECDISCARD, 1, + [blk_queue_secdiscard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blk_queue_secure_erase]) + ]) + ]) +]) + +dnl # +dnl # 4.16 API change, +dnl # Introduction of blk_queue_flag_set and blk_queue_flag_clear +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET], [ + ZFS_LINUX_TEST_SRC([blk_queue_flag_set], [ + #include + #include + ],[ + struct request_queue *q = NULL; + blk_queue_flag_set(0, q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET], [ + AC_MSG_CHECKING([whether blk_queue_flag_set() exists]) + ZFS_LINUX_TEST_RESULT([blk_queue_flag_set], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_FLAG_SET, 1, + [blk_queue_flag_set() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_CLEAR], [ + ZFS_LINUX_TEST_SRC([blk_queue_flag_clear], [ + #include + #include + ],[ + struct request_queue *q = NULL; + blk_queue_flag_clear(0, q); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR], [ + AC_MSG_CHECKING([whether blk_queue_flag_clear() exists]) + ZFS_LINUX_TEST_RESULT([blk_queue_flag_clear], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_QUEUE_FLAG_CLEAR, 1, + [blk_queue_flag_clear() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.36 API change, +dnl # Added blk_queue_flush() interface, while the previous interface +dnl # 
was available to all modules, the new one is GPL-only. Thus, in
+dnl # addition to detecting if this function is available, we determine
+dnl # if it is GPL-only. If the GPL-only interface is present we
+dnl # implement our own compatibility function; otherwise we use the
+dnl # kernel's. The hope is that long term this function will be
+dnl # opened up.
+dnl #
+dnl # 4.7 API change,
+dnl # Replace blk_queue_flush with blk_queue_write_cache
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH], [
+        ZFS_LINUX_TEST_SRC([blk_queue_flush], [
+                #include <linux/blkdev.h>
+        ], [
+                struct request_queue *q = NULL;
+                (void) blk_queue_flush(q, REQ_FLUSH);
+        ], [$NO_UNUSED_BUT_SET_VARIABLE], [$ZFS_META_LICENSE])
+
+        ZFS_LINUX_TEST_SRC([blk_queue_write_cache], [
+                #include <linux/kernel.h>
+                #include <linux/blkdev.h>
+        ], [
+                struct request_queue *q = NULL;
+                blk_queue_write_cache(q, true, true);
+        ], [$NO_UNUSED_BUT_SET_VARIABLE], [$ZFS_META_LICENSE])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_FLUSH], [
+        AC_MSG_CHECKING([whether blk_queue_flush() is available])
+        ZFS_LINUX_TEST_RESULT([blk_queue_flush], [
+                AC_MSG_RESULT(yes)
+                AC_DEFINE(HAVE_BLK_QUEUE_FLUSH, 1,
+                    [blk_queue_flush() is available])
+
+                AC_MSG_CHECKING([whether blk_queue_flush() is GPL-only])
+                ZFS_LINUX_TEST_RESULT([blk_queue_flush_license], [
+                        AC_MSG_RESULT(no)
+                ],[
+                        AC_MSG_RESULT(yes)
+                        AC_DEFINE(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY, 1,
+                            [blk_queue_flush() is GPL-only])
+                ])
+        ],[
+                AC_MSG_RESULT(no)
+        ])
+
+        dnl #
+        dnl # 4.7 API change
+        dnl # Replace blk_queue_flush with blk_queue_write_cache
+        dnl #
+        AC_MSG_CHECKING([whether blk_queue_write_cache() exists])
+        ZFS_LINUX_TEST_RESULT([blk_queue_write_cache], [
+                AC_MSG_RESULT(yes)
+                AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE, 1,
+                    [blk_queue_write_cache() exists])
+
+                AC_MSG_CHECKING([whether blk_queue_write_cache() is GPL-only])
+                ZFS_LINUX_TEST_RESULT([blk_queue_write_cache_license], [
+                        AC_MSG_RESULT(no)
+                ],[
+                        AC_MSG_RESULT(yes)
+                        AC_DEFINE(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY, 1,
+                            [blk_queue_write_cache() is GPL-only])
+                ])
+        ],[
+                AC_MSG_RESULT(no)
+        ])
+])
+
+dnl #
+dnl # 2.6.34 API change
+dnl # blk_queue_max_hw_sectors() replaces blk_queue_max_sectors().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS], [
+        ZFS_LINUX_TEST_SRC([blk_queue_max_hw_sectors], [
+                #include <linux/blkdev.h>
+        ], [
+                struct request_queue *q = NULL;
+                (void) blk_queue_max_hw_sectors(q, BLK_SAFE_MAX_SECTORS);
+        ], [$NO_UNUSED_BUT_SET_VARIABLE])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS], [
+        AC_MSG_CHECKING([whether blk_queue_max_hw_sectors() is available])
+        ZFS_LINUX_TEST_RESULT([blk_queue_max_hw_sectors], [
+                AC_MSG_RESULT(yes)
+        ],[
+                ZFS_LINUX_TEST_ERROR([blk_queue_max_hw_sectors])
+        ])
+])
+
+dnl #
+dnl # 2.6.34 API change
+dnl # blk_queue_max_segments() consolidates blk_queue_max_hw_segments()
+dnl # and blk_queue_max_phys_segments().
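Stepping back to the flush/write-cache checks above: a call-site shim might look like the following sketch. The wrapper name is hypothetical, and the real code would additionally have to account for the GPL-only variants detected above.

        #include <linux/blkdev.h>

        /*
         * Sketch only: enable a volatile write cache plus FUA using
         * whichever interface the configure checks detected.
         */
        static inline void
        zfs_blk_queue_set_write_cache(struct request_queue *q)
        {
        #if defined(HAVE_BLK_QUEUE_WRITE_CACHE)         /* Linux 4.7+ */
                blk_queue_write_cache(q, true, true);
        #elif defined(HAVE_BLK_QUEUE_FLUSH)             /* 2.6.36 - 4.6 */
                blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
        #endif
        }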
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS], [ + ZFS_LINUX_TEST_SRC([blk_queue_max_segments], [ + #include + ], [ + struct request_queue *q = NULL; + (void) blk_queue_max_segments(q, BLK_MAX_SEGMENTS); + ], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS], [ + AC_MSG_CHECKING([whether blk_queue_max_segments() is available]) + ZFS_LINUX_TEST_RESULT([blk_queue_max_segments], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blk_queue_max_segments]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLK_QUEUE], [ + ZFS_AC_KERNEL_SRC_BLK_QUEUE_PLUG + ZFS_AC_KERNEL_SRC_BLK_QUEUE_BDI + ZFS_AC_KERNEL_SRC_BLK_QUEUE_DISCARD + ZFS_AC_KERNEL_SRC_BLK_QUEUE_SECURE_ERASE + ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_SET + ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLAG_CLEAR + ZFS_AC_KERNEL_SRC_BLK_QUEUE_FLUSH + ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_HW_SECTORS + ZFS_AC_KERNEL_SRC_BLK_QUEUE_MAX_SEGMENTS +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLK_QUEUE], [ + ZFS_AC_KERNEL_BLK_QUEUE_PLUG + ZFS_AC_KERNEL_BLK_QUEUE_BDI + ZFS_AC_KERNEL_BLK_QUEUE_DISCARD + ZFS_AC_KERNEL_BLK_QUEUE_SECURE_ERASE + ZFS_AC_KERNEL_BLK_QUEUE_FLAG_SET + ZFS_AC_KERNEL_BLK_QUEUE_FLAG_CLEAR + ZFS_AC_KERNEL_BLK_QUEUE_FLUSH + ZFS_AC_KERNEL_BLK_QUEUE_MAX_HW_SECTORS + ZFS_AC_KERNEL_BLK_QUEUE_MAX_SEGMENTS +]) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 new file mode 100644 index 000000000000..2644555f5524 --- /dev/null +++ b/config/kernel-blkdev.m4 @@ -0,0 +1,212 @@ +dnl # +dnl # 2.6.38 API change, +dnl # Added blkdev_get_by_path() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ + ZFS_LINUX_TEST_SRC([blkdev_get_by_path], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + + bdev = blkdev_get_by_path(path, mode, holder); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ + AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) + ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) +]) + +dnl # +dnl # 2.6.38 API change, +dnl # Added blkdev_put() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ + ZFS_LINUX_TEST_SRC([blkdev_put], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + fmode_t mode = 0; + + blkdev_put(bdev, mode); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ + AC_MSG_CHECKING([whether blkdev_put() exists]) + ZFS_LINUX_TEST_RESULT([blkdev_put], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) +]) + +dnl # +dnl # 4.1 API, exported blkdev_reread_part() symbol, back ported to the +dnl # 3.10.0 CentOS 7.x enterprise kernels. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART], [ + ZFS_LINUX_TEST_SRC([blkdev_reread_part], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + int error; + + error = blkdev_reread_part(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_REREAD_PART], [ + AC_MSG_CHECKING([whether blkdev_reread_part() exists]) + ZFS_LINUX_TEST_RESULT([blkdev_reread_part], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_REREAD_PART, 1, + [blkdev_reread_part() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.22 API change +dnl # Single argument invalidate_bdev() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV], [ + ZFS_LINUX_TEST_SRC([invalidate_bdev], [ + #include + #include + ],[ + struct block_device *bdev = NULL; + invalidate_bdev(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV], [ + AC_MSG_CHECKING([whether invalidate_bdev() exists]) + ZFS_LINUX_TEST_RESULT([invalidate_bdev], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([invalidate_bdev()]) + ]) +]) + +dnl # +dnl # 2.6.27, lookup_bdev() was exported. +dnl # 4.4.0-6.21 - lookup_bdev() takes 2 arguments. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV], [ + ZFS_LINUX_TEST_SRC([lookup_bdev_1arg], [ + #include + #include + ], [ + lookup_bdev(NULL); + ]) + + ZFS_LINUX_TEST_SRC([lookup_bdev_2args], [ + #include + ], [ + lookup_bdev(NULL, FMODE_READ); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV], [ + AC_MSG_CHECKING([whether lookup_bdev() wants 1 arg]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_1arg], + [lookup_bdev], [fs/block_dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_1ARG_LOOKUP_BDEV, 1, + [lookup_bdev() wants 1 arg]) + ], [ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether lookup_bdev() wants 2 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([lookup_bdev_2args], + [lookup_bdev], [fs/block_dev.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_2ARGS_LOOKUP_BDEV, 1, + [lookup_bdev() wants 2 args]) + ], [ + ZFS_LINUX_TEST_ERROR([lookup_bdev()]) + ]) + ]) +]) + +dnl # +dnl # 2.6.30 API change +dnl # +dnl # The bdev_physical_block_size() interface was added to provide a way +dnl # to determine the smallest write which can be performed without a +dnl # read-modify-write operation. +dnl # +dnl # Unfortunately, this interface isn't entirely reliable because +dnl # drives are sometimes known to misreport this value. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ + ZFS_LINUX_TEST_SRC([bdev_physical_block_size], [ + #include + ],[ + struct block_device *bdev __attribute__ ((unused)) = NULL; + bdev_physical_block_size(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_physical_block_size() is available]) + ZFS_LINUX_TEST_RESULT([bdev_physical_block_size], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bdev_physical_block_size()]) + ]) +]) + +dnl # +dnl # 2.6.30 API change +dnl # Added bdev_logical_block_size(). 
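A hedged sketch of how the one- versus two-argument lookup_bdev() results above might be consumed; the zfs_lookup_bdev() wrapper name is an assumption, not part of this patch.

        #include <linux/fs.h>
        #include <linux/blkdev.h>

        /* Sketch only: resolve a path to its block device. */
        static inline struct block_device *
        zfs_lookup_bdev(const char *path)
        {
        #if defined(HAVE_1ARG_LOOKUP_BDEV)
                return (lookup_bdev(path));
        #elif defined(HAVE_2ARGS_LOOKUP_BDEV)
                return (lookup_bdev(path, FMODE_READ));
        #endif
        }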
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ + ZFS_LINUX_TEST_SRC([bdev_logical_block_size], [ + #include + ],[ + struct block_device *bdev __attribute__ ((unused)) = NULL; + bdev_logical_block_size(bdev); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE], [ + AC_MSG_CHECKING([whether bdev_logical_block_size() is available]) + ZFS_LINUX_TEST_RESULT([bdev_logical_block_size], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bdev_logical_block_size()]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ + ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_SRC_BLKDEV_PUT + ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART + ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV + ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ + ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_BLKDEV_PUT + ZFS_AC_KERNEL_BLKDEV_REREAD_PART + ZFS_AC_KERNEL_BLKDEV_INVALIDATE_BDEV + ZFS_AC_KERNEL_BLKDEV_LOOKUP_BDEV + ZFS_AC_KERNEL_BLKDEV_BDEV_LOGICAL_BLOCK_SIZE + ZFS_AC_KERNEL_BLKDEV_BDEV_PHYSICAL_BLOCK_SIZE +]) diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 new file mode 100644 index 000000000000..8e64ecca909b --- /dev/null +++ b/config/kernel-block-device-operations.m4 @@ -0,0 +1,63 @@ +dnl # +dnl # 2.6.38 API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ + ZFS_LINUX_TEST_SRC([block_device_operations_check_events], [ + #include + + unsigned int blk_check_events(struct gendisk *disk, + unsigned int clearing) { return (0); } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .check_events = blk_check_events, + }; + ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ + AC_MSG_CHECKING([whether bops->check_events() exists]) + ZFS_LINUX_TEST_RESULT([block_device_operations_check_events], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bops->check_events()]) + ]) +]) + +dnl # +dnl # 3.10.x API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ + ZFS_LINUX_TEST_SRC([block_device_operations_release_void], [ + #include + + void blk_release(struct gendisk *g, fmode_t mode) { return; } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .open = NULL, + .release = blk_release, + .ioctl = NULL, + .compat_ioctl = NULL, + }; + ], [], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ + AC_MSG_CHECKING([whether bops->release() is void]) + ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([bops->release()]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS], [ + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID +]) diff --git a/config/kernel-clear-inode.m4 b/config/kernel-clear-inode.m4 new file mode 100644 index 000000000000..3f454d7ec0d3 --- /dev/null +++ b/config/kernel-clear-inode.m4 @@ -0,0 +1,39 @@ +dnl # +dnl # 3.5.0 API change +dnl # torvalds/linux@dbd5768f87ff6fb0a4fe09c4d7b6c4a24de99430 and +dnl # torvalds/linux@7994e6f7254354e03028a11f98a27bd67dace9f1 reworked +dnl 
# where inode_sync_wait() is called.
+dnl #
+dnl # Prior to these changes it would occur in end_writeback() but due
+dnl # to various issues (described in the above commits) it has been
+dnl # moved to evict(). This changes the ordering in which sync occurs
+dnl # but otherwise doesn't impact the zpl implementation.
+dnl #
+dnl # The major impact here is the renaming of end_writeback() to
+dnl # clear_inode(). However, care must be taken when detecting this
+dnl # API change because as recently as 2.6.35 there was a clear_inode()
+dnl # function. However, it was made obsolete by the evict_inode() API
+dnl # change at the same time.
+dnl #
+dnl # Therefore, to ensure we have the correct API we only allow the
+dnl # clear_inode() compatibility code to be defined iff the evict_inode()
+dnl # functionality is also detected.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_CLEAR_INODE], [
+        ZFS_LINUX_TEST_SRC([clear_inode], [
+                #include <linux/fs.h>
+        ], [
+                clear_inode(NULL);
+        ])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_CLEAR_INODE], [
+        AC_MSG_CHECKING([whether clear_inode() is available])
+        ZFS_LINUX_TEST_RESULT_SYMBOL([clear_inode],
+            [clear_inode], [fs/inode.c], [
+                AC_MSG_RESULT(yes)
+                AC_DEFINE(HAVE_CLEAR_INODE, 1, [clear_inode() is available])
+        ], [
+                AC_MSG_RESULT(no)
+        ])
+])
diff --git a/config/kernel-commit-metadata.m4 b/config/kernel-commit-metadata.m4
new file mode 100644
index 000000000000..7df9b980290e
--- /dev/null
+++ b/config/kernel-commit-metadata.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 2.6.33 API change
+dnl # Added eops->commit_metadata() callback to allow the underlying
+dnl # filesystem to determine the most efficient way to commit the inode.
+dnl # Prior to this the NFS server would issue an explicit fsync().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_COMMIT_METADATA], [
+        ZFS_LINUX_TEST_SRC([export_operations_commit_metadata], [
+                #include <linux/exportfs.h>
+                int commit_metadata(struct inode *inode) { return 0; }
+                static struct export_operations eops __attribute__ ((unused)) = {
+                        .commit_metadata = commit_metadata,
+                };
+        ],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_COMMIT_METADATA], [
+        AC_MSG_CHECKING([whether eops->commit_metadata() exists])
+        ZFS_LINUX_TEST_RESULT([export_operations_commit_metadata], [
+                AC_MSG_RESULT(yes)
+        ],[
+                ZFS_LINUX_TEST_ERROR([eops->commit_metadata()])
+        ])
+])
diff --git a/config/kernel-config-defined.m4 b/config/kernel-config-defined.m4
new file mode 100644
index 000000000000..0ee4231cc2db
--- /dev/null
+++ b/config/kernel-config-defined.m4
@@ -0,0 +1,183 @@
+dnl #
+dnl # Certain kernel build options are not supported. These must be
+dnl # detected at configure time and cause a build failure. Otherwise
+dnl # modules may be successfully built that behave incorrectly.
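Returning to the clear_inode() detection above: a plausible compat mapping for kernels where the 3.5+ name was not found is the sketch below. This is an assumption modeled on the tree's compat style, not code from this patch.

        /*
         * Sketch only: on older kernels where the 3.5+ clear_inode()
         * was not detected, end_writeback() provides the same service.
         */
        #if !defined(HAVE_CLEAR_INODE)
        #define clear_inode(ip)         end_writeback(ip)
        #endif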
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEFINED], [ + AS_IF([test "x$cross_compiling" != xyes], [ + AC_RUN_IFELSE([ + AC_LANG_PROGRAM([ + #include "$LINUX/include/linux/license.h" + ], [ + return !license_is_gpl_compatible( + "$ZFS_META_LICENSE"); + ]) + ], [ + AC_DEFINE([ZFS_IS_GPL_COMPATIBLE], [1], + [Define to 1 if GPL-only symbols can be used]) + ], [ + ]) + ]) + + ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE + ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC + ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS + ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE + ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_DEFLATE + + AC_MSG_CHECKING([for kernel config option compatibility]) + ZFS_LINUX_TEST_COMPILE_ALL([config]) + AC_MSG_RESULT([done]) + + ZFS_AC_KERNEL_CONFIG_THREAD_SIZE + ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC + ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS + ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE + ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE +]) + +dnl # +dnl # Check configured THREAD_SIZE +dnl # +dnl # The stack size will vary by architecture, but as of Linux 3.15 on x86_64 +dnl # the default thread stack size was increased to 16K from 8K. Therefore, +dnl # on newer kernels and some architectures stack usage optimizations can be +dnl # conditionally applied to improve performance without negatively impacting +dnl # stability. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_THREAD_SIZE], [ + ZFS_LINUX_TEST_SRC([config_thread_size], [ + #include + ],[ + #if (THREAD_SIZE < 16384) + #error "THREAD_SIZE is less than 16K" + #endif + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_THREAD_SIZE], [ + AC_MSG_CHECKING([whether kernel was built with 16K or larger stacks]) + ZFS_LINUX_TEST_RESULT([config_thread_size], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_LARGE_STACKS, 1, [kernel has large stacks]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # Check CONFIG_DEBUG_LOCK_ALLOC +dnl # +dnl # This is typically only set for debug kernels because it comes with +dnl # a performance penalty. However, when it is set it maps the non-GPL +dnl # symbol mutex_lock() to the GPL-only mutex_lock_nested() symbol. +dnl # This will cause a failure at link time which we'd rather know about +dnl # at compile time. +dnl # +dnl # Since we plan to pursue making mutex_lock_nested() a non-GPL symbol +dnl # with the upstream community we add a check to detect this case. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_DEBUG_LOCK_ALLOC], [ + ZFS_LINUX_TEST_SRC([config_debug_lock_alloc], [ + #include + ],[ + struct mutex lock; + + mutex_init(&lock); + mutex_lock(&lock); + mutex_unlock(&lock); + ], [], [$ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_DEBUG_LOCK_ALLOC], [ + AC_MSG_CHECKING([whether mutex_lock() is GPL-only]) + ZFS_LINUX_TEST_RESULT([config_debug_lock_alloc], [ + AC_MSG_RESULT(no) + ],[ + AC_MSG_RESULT(yes) + AC_MSG_ERROR([ + *** Kernel built with CONFIG_DEBUG_LOCK_ALLOC which is incompatible + *** with the CDDL license and will prevent the module linking stage + *** from succeeding. You must rebuild your kernel without this + *** option enabled.]) + ]) +]) + +dnl # +dnl # Check CONFIG_TRIM_UNUSED_KSYMS +dnl # +dnl # Verify the kernel has CONFIG_TRIM_UNUSED_KSYMS disabled. 
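For illustration: HAVE_LARGE_STACKS, defined by the THREAD_SIZE check above, is the sort of result that might gate a stack-hungry code path. Both function-like names below are purely illustrative assumptions.

        /*
         * Sketch only: run deep call chains inline on 16K-stack
         * kernels, otherwise hand them off to a worker thread.
         */
        #ifdef HAVE_LARGE_STACKS
        #define ZFS_EXECUTE(work)       do_work_inline(work)
        #else
        #define ZFS_EXECUTE(work)       dispatch_to_taskq(work)
        #endif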
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_TRIM_UNUSED_KSYMS], [ + ZFS_LINUX_TEST_SRC([config_trim_unusued_ksyms], [ + #if defined(CONFIG_TRIM_UNUSED_KSYMS) + #error CONFIG_TRIM_UNUSED_KSYMS not defined + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_TRIM_UNUSED_KSYMS], [ + AC_MSG_CHECKING([whether CONFIG_TRIM_UNUSED_KSYM is disabled]) + ZFS_LINUX_TEST_RESULT([config_trim_unusued_ksyms], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AS_IF([test "x$enable_linux_builtin" != xyes], [ + AC_MSG_ERROR([ + *** This kernel has unused symbols trimming enabled, please disable. + *** Rebuild the kernel with CONFIG_TRIM_UNUSED_KSYMS=n set.]) + ]) + ]) +]) + +dnl # +dnl # Check CONFIG_ZLIB_INFLATE +dnl # +dnl # Verify the kernel has CONFIG_ZLIB_INFLATE support enabled. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_INFLATE], [ + ZFS_LINUX_TEST_SRC([config_zlib_inflate], [ + #if !defined(CONFIG_ZLIB_INFLATE) && \ + !defined(CONFIG_ZLIB_INFLATE_MODULE) + #error CONFIG_ZLIB_INFLATE not defined + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_ZLIB_INFLATE], [ + AC_MSG_CHECKING([whether CONFIG_ZLIB_INFLATE is defined]) + ZFS_LINUX_TEST_RESULT([config_zlib_inflate], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel does not include the required zlib inflate support. + *** Rebuild the kernel with CONFIG_ZLIB_INFLATE=y|m set.]) + ]) +]) + +dnl # +dnl # Check CONFIG_ZLIB_DEFLATE +dnl # +dnl # Verify the kernel has CONFIG_ZLIB_DEFLATE support enabled. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONFIG_ZLIB_DEFLATE], [ + ZFS_LINUX_TEST_SRC([config_zlib_deflate], [ + #if !defined(CONFIG_ZLIB_DEFLATE) && \ + !defined(CONFIG_ZLIB_DEFLATE_MODULE) + #error CONFIG_ZLIB_DEFLATE not defined + #endif + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONFIG_ZLIB_DEFLATE], [ + AC_MSG_CHECKING([whether CONFIG_ZLIB_DEFLATE is defined]) + ZFS_LINUX_TEST_RESULT([config_zlib_deflate], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + AC_MSG_ERROR([ + *** This kernel does not include the required zlib deflate support. + *** Rebuild the kernel with CONFIG_ZLIB_DEFLATE=y|m set.]) + ]) +]) diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4 new file mode 100644 index 000000000000..3ceb5f63efa9 --- /dev/null +++ b/config/kernel-current-time.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # 4.9, current_time() added +dnl # 4.18, return type changed from timespec to timespec64 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [ + ZFS_LINUX_TEST_SRC([current_time], [ + #include + ], [ + struct inode ip __attribute__ ((unused)); + ip.i_atime = current_time(&ip); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CURRENT_TIME], [ + AC_MSG_CHECKING([whether current_time() exists]) + ZFS_LINUX_TEST_RESULT_SYMBOL([current_time], + [current_time], [fs/inode.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CURRENT_TIME, 1, [current_time() exists]) + ], [ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-declare-event-class.m4 b/config/kernel-declare-event-class.m4 new file mode 100644 index 000000000000..6c78ee858d7d --- /dev/null +++ b/config/kernel-declare-event-class.m4 @@ -0,0 +1,55 @@ +dnl # +dnl # Ensure the DECLARE_EVENT_CLASS macro is available to non-GPL modules. 
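A hedged sketch of how the DECLARE_EVENT_CLASS() result below might be consumed: call sites can compile tracepoints down to no-ops when the macro is unusable from a non-GPL module. The ZFS_TRACE macro name is hypothetical.

        /* Sketch only: guard tracepoint call sites. */
        #ifdef HAVE_DECLARE_EVENT_CLASS
        #define ZFS_TRACE(fn, ...)      trace_##fn(__VA_ARGS__)
        #else
        #define ZFS_TRACE(fn, ...)      ((void)0)
        #endif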
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_DECLARE_EVENT_CLASS], [ + AC_MSG_CHECKING([whether DECLARE_EVENT_CLASS() is available]) + ZFS_LINUX_TRY_COMPILE_HEADER([ + #include + MODULE_LICENSE("$ZFS_META_LICENSE"); + + #define CREATE_TRACE_POINTS + #include "conftest.h" + ],[ + trace_zfs_autoconf_event_one(1UL); + trace_zfs_autoconf_event_two(2UL); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_DECLARE_EVENT_CLASS, 1, + [DECLARE_EVENT_CLASS() is available]) + ],[ + AC_MSG_RESULT(no) + ],[ + #if !defined(_CONFTEST_H) || defined(TRACE_HEADER_MULTI_READ) + #define _CONFTEST_H + + #undef TRACE_SYSTEM + #define TRACE_SYSTEM zfs + #include + + DECLARE_EVENT_CLASS(zfs_autoconf_event_class, + TP_PROTO(unsigned long i), + TP_ARGS(i), + TP_STRUCT__entry( + __field(unsigned long, i) + ), + TP_fast_assign( + __entry->i = i; + ), + TP_printk("i = %lu", __entry->i) + ); + + #define DEFINE_AUTOCONF_EVENT(name) \ + DEFINE_EVENT(zfs_autoconf_event_class, name, \ + TP_PROTO(unsigned long i), \ + TP_ARGS(i)) + DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_one); + DEFINE_AUTOCONF_EVENT(zfs_autoconf_event_two); + + #endif /* _CONFTEST_H */ + + #undef TRACE_INCLUDE_PATH + #define TRACE_INCLUDE_PATH . + #define TRACE_INCLUDE_FILE conftest + #include + ]) +]) diff --git a/config/kernel-dentry-operations.m4 b/config/kernel-dentry-operations.m4 new file mode 100644 index 000000000000..dd470d7607b4 --- /dev/null +++ b/config/kernel-dentry-operations.m4 @@ -0,0 +1,190 @@ +dnl # +dnl # 3.4.0 API change +dnl # Added d_make_root() to replace previous d_alloc_root() function. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_MAKE_ROOT], [ + ZFS_LINUX_TEST_SRC([d_make_root], [ + #include + ], [ + d_make_root(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_MAKE_ROOT], [ + AC_MSG_CHECKING([whether d_make_root() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_make_root], + [d_make_root], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_MAKE_ROOT, 1, [d_make_root() is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.28 API change +dnl # Added d_obtain_alias() helper function. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS], [ + ZFS_LINUX_TEST_SRC([d_obtain_alias], [ + #include + ], [ + d_obtain_alias(NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_OBTAIN_ALIAS], [ + AC_MSG_CHECKING([whether d_obtain_alias() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_obtain_alias], + [d_obtain_alias], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([d_obtain_alias()]) + ]) +]) + +dnl # +dnl # 2.6.12 API change +dnl # d_prune_aliases() helper function available. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_PRUNE_ALIASES], [ + ZFS_LINUX_TEST_SRC([d_prune_aliases], [ + #include + ], [ + struct inode *ip = NULL; + d_prune_aliases(ip); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_PRUNE_ALIASES], [ + AC_MSG_CHECKING([whether d_prune_aliases() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_prune_aliases], + [d_prune_aliases], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_PRUNE_ALIASES, 1, + [d_prune_aliases() is available]) + ], [ + ZFS_LINUX_TEST_ERROR([d_prune_aliases()]) + ]) +]) + +dnl # +dnl # 2.6.38 API change +dnl # Added d_set_d_op() helper function. 
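As a sketch of how the d_make_root() detection above would typically be used (an assumption on my part, in the tree's usual compat style): the pre-3.4 d_alloc_root() can stand in when the newer helper is absent.

        /* Sketch only: fall back to the pre-3.4 interface. */
        #if !defined(HAVE_D_MAKE_ROOT)
        #define d_make_root(inode)      d_alloc_root(inode)
        #endif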
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_SET_D_OP], [ + ZFS_LINUX_TEST_SRC([d_set_d_op], [ + #include + ], [ + d_set_d_op(NULL, NULL); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_SET_D_OP], [ + AC_MSG_CHECKING([whether d_set_d_op() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([d_set_d_op], + [d_set_d_op], [fs/dcache.c], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([d_set_d_op]) + ]) +]) + +dnl # +dnl # 3.6 API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_D_REVALIDATE_NAMEIDATA], [ + ZFS_LINUX_TEST_SRC([dentry_operations_revalidate], [ + #include + #include + + int revalidate (struct dentry *dentry, + struct nameidata *nidata) { return 0; } + + static const struct dentry_operations + dops __attribute__ ((unused)) = { + .d_revalidate = revalidate, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA], [ + AC_MSG_CHECKING([whether dops->d_revalidate() takes struct nameidata]) + ZFS_LINUX_TEST_RESULT([dentry_operations_revalidate], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_D_REVALIDATE_NAMEIDATA, 1, + [dops->d_revalidate() operation takes nameidata]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 2.6.30 API change +dnl # The 'struct dentry_operations' was constified in the dentry structure. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_CONST_DENTRY_OPERATIONS], [ + ZFS_LINUX_TEST_SRC([dentry_operations_const], [ + #include + + const struct dentry_operations test_d_op = { + .d_revalidate = NULL, + }; + ],[ + struct dentry d __attribute__ ((unused)); + d.d_op = &test_d_op; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS], [ + AC_MSG_CHECKING([whether dentry uses const struct dentry_operations]) + ZFS_LINUX_TEST_RESULT([dentry_operations_const], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_CONST_DENTRY_OPERATIONS, 1, + [dentry uses const struct dentry_operations]) + ],[ + ZFS_LINUX_TEST_ERROR([const dentry_operations]) + ]) +]) + +dnl # +dnl # 2.6.38 API change +dnl # Added sb->s_d_op default dentry_operations member +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_S_D_OP], [ + ZFS_LINUX_TEST_SRC([super_block_s_d_op], [ + #include + ],[ + struct super_block sb __attribute__ ((unused)); + sb.s_d_op = NULL; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_S_D_OP], [ + AC_MSG_CHECKING([whether super_block has s_d_op]) + ZFS_LINUX_TEST_RESULT([super_block_s_d_op], [ + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([super_block s_d_op]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_DENTRY], [ + ZFS_AC_KERNEL_SRC_D_MAKE_ROOT + ZFS_AC_KERNEL_SRC_D_OBTAIN_ALIAS + ZFS_AC_KERNEL_SRC_D_PRUNE_ALIASES + ZFS_AC_KERNEL_SRC_D_SET_D_OP + ZFS_AC_KERNEL_SRC_D_REVALIDATE_NAMEIDATA + ZFS_AC_KERNEL_SRC_CONST_DENTRY_OPERATIONS + ZFS_AC_KERNEL_SRC_S_D_OP +]) + +AC_DEFUN([ZFS_AC_KERNEL_DENTRY], [ + ZFS_AC_KERNEL_D_MAKE_ROOT + ZFS_AC_KERNEL_D_OBTAIN_ALIAS + ZFS_AC_KERNEL_D_PRUNE_ALIASES + ZFS_AC_KERNEL_D_SET_D_OP + ZFS_AC_KERNEL_D_REVALIDATE_NAMEIDATA + ZFS_AC_KERNEL_CONST_DENTRY_OPERATIONS + ZFS_AC_KERNEL_S_D_OP +]) diff --git a/config/kernel-dirty-inode.m4 b/config/kernel-dirty-inode.m4 new file mode 100644 index 000000000000..dc7667fa4881 --- /dev/null +++ b/config/kernel-dirty-inode.m4 @@ -0,0 +1,29 @@ +dnl # +dnl # 3.0 API change +dnl # The sops->dirty_inode() callbacks were updated to take a flags +dnl # argument. This allows the greater control over whether the +dnl # filesystem needs to push out a transaction or not. 
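A sketch of how the flag detected by the dirty_inode check that follows might be consumed; zpl_dirty_inode() is an illustrative name, and the bodies are stubs.

        #include <linux/fs.h>

        /* Sketch only: declare the callback for either signature. */
        #ifdef HAVE_DIRTY_INODE_WITH_FLAGS
        static void
        zpl_dirty_inode(struct inode *ip, int flags)
        {
                /* push out a transaction, honoring flags */
        }
        #else
        static void
        zpl_dirty_inode(struct inode *ip)
        {
                /* push out a transaction */
        }
        #endif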
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_DIRTY_INODE], [
+        ZFS_LINUX_TEST_SRC([dirty_inode_with_flags], [
+                #include <linux/fs.h>
+
+                void dirty_inode(struct inode *a, int b) { return; }
+
+                static const struct super_operations
+                    sops __attribute__ ((unused)) = {
+                        .dirty_inode = dirty_inode,
+                };
+        ],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_DIRTY_INODE], [
+        AC_MSG_CHECKING([whether sops->dirty_inode() wants flags])
+        ZFS_LINUX_TEST_RESULT([dirty_inode_with_flags], [
+                AC_MSG_RESULT([yes])
+                AC_DEFINE(HAVE_DIRTY_INODE_WITH_FLAGS, 1,
+                    [sops->dirty_inode() wants flags])
+        ],[
+                AC_MSG_RESULT([no])
+        ])
+])
diff --git a/config/kernel-discard-granularity.m4 b/config/kernel-discard-granularity.m4
new file mode 100644
index 000000000000..61326e67732b
--- /dev/null
+++ b/config/kernel-discard-granularity.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 2.6.33 API change
+dnl # Discard granularity and alignment restrictions may now be set.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY], [
+        ZFS_LINUX_TEST_SRC([discard_granularity], [
+                #include <linux/blkdev.h>
+        ],[
+                struct queue_limits ql __attribute__ ((unused));
+                ql.discard_granularity = 0;
+        ])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_DISCARD_GRANULARITY], [
+        AC_MSG_CHECKING([whether ql->discard_granularity is available])
+        ZFS_LINUX_TEST_RESULT([discard_granularity], [
+                AC_MSG_RESULT(yes)
+        ],[
+                ZFS_LINUX_TEST_ERROR([ql->discard_granularity])
+        ])
+])
diff --git a/config/kernel-encode-fh-inode.m4 b/config/kernel-encode-fh-inode.m4
new file mode 100644
index 000000000000..9d4ba5f0f61f
--- /dev/null
+++ b/config/kernel-encode-fh-inode.m4
@@ -0,0 +1,27 @@
+dnl #
+dnl # 3.5.0 API change
+dnl # torvalds/linux@b0b0382bb4904965a9e9fca77ad87514dfda0d1c changed the
+dnl # ->encode_fh() callback to pass the child inode and its parent's
+dnl # inode rather than a dentry and a boolean saying whether we want
+dnl # the parent.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE], [
+        ZFS_LINUX_TEST_SRC([export_operations_encode_fh], [
+                #include <linux/exportfs.h>
+                int encode_fh(struct inode *inode, __u32 *fh, int *max_len,
+                    struct inode *parent) { return 0; }
+                static struct export_operations eops __attribute__ ((unused)) = {
+                        .encode_fh = encode_fh,
+                };
+        ],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE], [
+        AC_MSG_CHECKING([whether eops->encode_fh() wants inode])
+        ZFS_LINUX_TEST_RESULT([export_operations_encode_fh], [
+                AC_MSG_RESULT(yes)
+                AC_DEFINE(HAVE_ENCODE_FH_WITH_INODE, 1,
+                    [eops->encode_fh() wants child and parent inodes])
+        ],[
+                AC_MSG_RESULT(no)
+        ])
+])
diff --git a/config/kernel-evict-inode.m4 b/config/kernel-evict-inode.m4
new file mode 100644
index 000000000000..66f10492de54
--- /dev/null
+++ b/config/kernel-evict-inode.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 2.6.36 API change
+dnl # The sops->delete_inode() and sops->clear_inode() callbacks have
+dnl # been replaced by a single sops->evict_inode() callback.
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_EVICT_INODE], [ + ZFS_LINUX_TEST_SRC([evict_inode], [ + #include + void evict_inode (struct inode * t) { return; } + static struct super_operations sops __attribute__ ((unused)) = { + .evict_inode = evict_inode, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_EVICT_INODE], [ + AC_MSG_CHECKING([whether sops->evict_inode() exists]) + ZFS_LINUX_TEST_RESULT([evict_inode], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_EVICT_INODE, 1, [sops->evict_inode() exists]) + ],[ + ZFS_LINUX_TEST_ERROR([evict_inode]) + ]) +]) diff --git a/config/kernel-fallocate.m4 b/config/kernel-fallocate.m4 new file mode 100644 index 000000000000..7a8550f7e760 --- /dev/null +++ b/config/kernel-fallocate.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 2.6.38 - 3.x API +dnl # The fallocate callback was moved from the inode_operations +dnl # structure to the file_operations structure. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FALLOCATE], [ + ZFS_LINUX_TEST_SRC([file_fallocate], [ + #include + + long test_fallocate(struct file *file, int mode, + loff_t offset, loff_t len) { return 0; } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .fallocate = test_fallocate, + }; + ], []) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FALLOCATE], [ + AC_MSG_CHECKING([whether fops->fallocate() exists]) + ZFS_LINUX_TEST_RESULT([file_fallocate], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([file_fallocate]) + ]) +]) diff --git a/config/kernel-file-dentry.m4 b/config/kernel-file-dentry.m4 new file mode 100644 index 000000000000..9cb5869c3821 --- /dev/null +++ b/config/kernel-file-dentry.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # 4.1 API change +dnl # struct access file->f_path.dentry was replaced by accessor function +dnl # since fix torvalds/linux@4bacc9c9234c ("overlayfs: Make f_path always +dnl # point to the overlay and f_inode to the underlay"). 
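A sketch of how the file_dentry() result detected by the following check might be consumed; the zfs_file_dentry() wrapper name is illustrative.

        #include <linux/fs.h>

        /*
         * Sketch only: resolve a file's dentry whether or not the
         * 4.1 accessor exists.
         */
        static inline struct dentry *
        zfs_file_dentry(struct file *f)
        {
        #ifdef HAVE_FILE_DENTRY
                return (file_dentry(f));
        #else
                return (f->f_path.dentry);
        #endif
        }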
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILE_DENTRY], [ + ZFS_LINUX_TEST_SRC([file_dentry], [ + #include + ],[ + struct file *f = NULL; + file_dentry(f); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILE_DENTRY], [ + AC_MSG_CHECKING([whether file_dentry() is available]) + ZFS_LINUX_TEST_RESULT([file_dentry], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_DENTRY, 1, [file_dentry() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-file-inode.m4 b/config/kernel-file-inode.m4 new file mode 100644 index 000000000000..00a3621657ad --- /dev/null +++ b/config/kernel-file-inode.m4 @@ -0,0 +1,23 @@ +dnl # +dnl # 3.19 API change +dnl # struct access f->f_dentry->d_inode was replaced by accessor function +dnl # file_inode(f) +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILE_INODE], [ + ZFS_LINUX_TEST_SRC([file_inode], [ + #include + ],[ + struct file *f = NULL; + file_inode(f); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILE_INODE], [ + AC_MSG_CHECKING([whether file_inode() is available]) + ZFS_LINUX_TEST_RESULT([file_inode], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILE_INODE, 1, [file_inode() is available]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-fmode-t.m4 b/config/kernel-fmode-t.m4 new file mode 100644 index 000000000000..5f111e21b443 --- /dev/null +++ b/config/kernel-fmode-t.m4 @@ -0,0 +1,20 @@ +dnl # +dnl # 2.6.28 API change, +dnl # check if fmode_t typedef is defined +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FMODE_T], [ + ZFS_LINUX_TEST_SRC([type_fmode_t], [ + #include + ],[ + fmode_t *ptr __attribute__ ((unused)); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FMODE_T], [ + AC_MSG_CHECKING([whether kernel defines fmode_t]) + ZFS_LINUX_TEST_RESULT([type_fmode_t], [ + AC_MSG_RESULT([yes]) + ],[ + ZFS_LINUX_TEST_ERROR([type_fmode_t]) + ]) +]) diff --git a/config/kernel-follow-down-one.m4 b/config/kernel-follow-down-one.m4 new file mode 100644 index 000000000000..38c460d3506e --- /dev/null +++ b/config/kernel-follow-down-one.m4 @@ -0,0 +1,22 @@ +dnl # +dnl # 2.6.38 API change +dnl # follow_down() renamed follow_down_one(). The original follow_down() +dnl # symbol still exists but will traverse down all the layers. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE], [ + ZFS_LINUX_TEST_SRC([follow_down_one], [ + #include + ],[ + struct path *p = NULL; + follow_down_one(p); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FOLLOW_DOWN_ONE], [ + AC_MSG_CHECKING([whether follow_down_one() is available]) + ZFS_LINUX_TEST_RESULT([follow_down_one], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([follow_down_one()]) + ]) +]) diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 new file mode 100644 index 000000000000..3c7933413d18 --- /dev/null +++ b/config/kernel-fpu.m4 @@ -0,0 +1,131 @@ +dnl # +dnl # Handle differences in kernel FPU code. +dnl # +dnl # Kernel +dnl # 5.0: Wrappers have been introduced to save/restore the FPU state. +dnl # This change was made to the 4.19.38 and 4.14.120 LTS kernels. +dnl # HAVE_KERNEL_FPU_INTERNAL +dnl # +dnl # 4.2: Use __kernel_fpu_{begin,end}() +dnl # HAVE_UNDERSCORE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU +dnl # +dnl # Pre-4.2: Use kernel_fpu_{begin,end}() +dnl # HAVE_KERNEL_FPU & KERNEL_EXPORTS_X86_FPU +dnl # +dnl # N.B. The header check is performed before all other checks since it +dnl # depends on HAVE_KERNEL_FPU_API_HEADER being set in confdefs.h. 
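A hedged sketch of how the FPU interfaces detected by the checks that follow might be wrapped. The kfpu_begin() name is illustrative, and the more involved HAVE_KERNEL_FPU_INTERNAL save/restore path is omitted here.

        /* Sketch only: enter an FPU-using section. */
        static inline void
        kfpu_begin(void)
        {
        #if defined(HAVE_KERNEL_FPU)
                kernel_fpu_begin();
        #elif defined(HAVE_UNDERSCORE_KERNEL_FPU)
                preempt_disable();
                __kernel_fpu_begin();
        #endif
        }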
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_FPU_HEADER], [ + AC_MSG_CHECKING([whether fpu headers are available]) + ZFS_LINUX_TRY_COMPILE([ + #include + #include + ],[ + ],[ + AC_DEFINE(HAVE_KERNEL_FPU_API_HEADER, 1, + [kernel has asm/fpu/api.h]) + AC_MSG_RESULT(asm/fpu/api.h) + ],[ + AC_MSG_RESULT(i387.h & xcr.h) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ + ZFS_LINUX_TEST_SRC([kernel_fpu], [ + #include + #ifdef HAVE_KERNEL_FPU_API_HEADER + #include + #else + #include + #include + #endif + ], [ + kernel_fpu_begin(); + kernel_fpu_end(); + ], [], [$ZFS_META_LICENSE]) + + ZFS_LINUX_TEST_SRC([__kernel_fpu], [ + #include + #ifdef HAVE_KERNEL_FPU_API_HEADER + #include + #else + #include + #include + #endif + ], [ + __kernel_fpu_begin(); + __kernel_fpu_end(); + ], [], [$ZFS_META_LICENSE]) + + ZFS_LINUX_TEST_SRC([fpu_internal], [ + #if defined(__x86_64) || defined(__x86_64__) || \ + defined(__i386) || defined(__i386__) + #if !defined(__x86) + #define __x86 + #endif + #endif + + #if !defined(__x86) + #error Unsupported architecture + #endif + + #include + #ifdef HAVE_KERNEL_FPU_API_HEADER + #include + #include + #else + #include + #include + #endif + + #if !defined(XSTATE_XSAVE) + #error XSTATE_XSAVE not defined + #endif + + #if !defined(XSTATE_XRESTORE) + #error XSTATE_XRESTORE not defined + #endif + ],[ + struct fpu *fpu = ¤t->thread.fpu; + union fpregs_state *st = &fpu->state; + struct fregs_state *fr __attribute__ ((unused)) = &st->fsave; + struct fxregs_state *fxr __attribute__ ((unused)) = &st->fxsave; + struct xregs_state *xr __attribute__ ((unused)) = &st->xsave; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FPU], [ + dnl # + dnl # Legacy kernel + dnl # + AC_MSG_CHECKING([whether kernel fpu is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([kernel_fpu_license], + [kernel_fpu_begin], [arch/x86/kernel/fpu/core.c], [ + AC_MSG_RESULT(kernel_fpu_*) + AC_DEFINE(HAVE_KERNEL_FPU, 1, + [kernel has kernel_fpu_* functions]) + AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, + [kernel exports FPU functions]) + ],[ + dnl # + dnl # Linux 4.2 kernel + dnl # + ZFS_LINUX_TEST_RESULT_SYMBOL([__kernel_fpu_license], + [__kernel_fpu_begin], + [arch/x86/kernel/fpu/core.c arch/x86/kernel/i387.c], [ + AC_MSG_RESULT(__kernel_fpu_*) + AC_DEFINE(HAVE_UNDERSCORE_KERNEL_FPU, 1, + [kernel has __kernel_fpu_* functions]) + AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, + [kernel exports FPU functions]) + ],[ + ZFS_LINUX_TEST_RESULT([fpu_internal], [ + AC_MSG_RESULT(internal) + AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, + [kernel fpu internal]) + ],[ + AC_MSG_RESULT(unavailable) + ]) + ]) + ]) +]) diff --git a/config/kernel-fst-mount.m4 b/config/kernel-fst-mount.m4 new file mode 100644 index 000000000000..576f5f0129c5 --- /dev/null +++ b/config/kernel-fst-mount.m4 @@ -0,0 +1,30 @@ +dnl # +dnl # 2.6.38 API change +dnl # The .get_sb callback has been replaced by a .mount callback +dnl # in the file_system_type structure. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FST_MOUNT], [ + ZFS_LINUX_TEST_SRC([file_system_type_mount], [ + #include + + static struct dentry * + mount(struct file_system_type *fs_type, int flags, + const char *osname, void *data) { + struct dentry *d = NULL; + return (d); + } + + static struct file_system_type fst __attribute__ ((unused)) = { + .mount = mount, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FST_MOUNT], [ + AC_MSG_CHECKING([whether fst->mount() exists]) + ZFS_LINUX_TEST_RESULT([file_system_type_mount], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([fst->mount()]) + ]) +]) diff --git a/config/kernel-fsync.m4 b/config/kernel-fsync.m4 new file mode 100644 index 000000000000..d198191d3ab9 --- /dev/null +++ b/config/kernel-fsync.m4 @@ -0,0 +1,53 @@ +dnl # +dnl # Check file_operations->fsync interface. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_FSYNC], [ + ZFS_LINUX_TEST_SRC([fsync_without_dentry], [ + #include + + int test_fsync(struct file *f, int x) { return 0; } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .fsync = test_fsync, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([fsync_range], [ + #include + + int test_fsync(struct file *f, loff_t a, loff_t b, int c) + { return 0; } + + static const struct file_operations + fops __attribute__ ((unused)) = { + .fsync = test_fsync, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FSYNC], [ + dnl # + dnl # Linux 2.6.35 - Linux 3.0 API + dnl # + AC_MSG_CHECKING([whether fops->fsync() wants no dentry]) + ZFS_LINUX_TEST_RESULT([fsync_without_dentry], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_FSYNC_WITHOUT_DENTRY, 1, + [fops->fsync() without dentry]) + ],[ + AC_MSG_RESULT([no]) + + dnl # + dnl # Linux 3.1 - 3.x API + dnl # + AC_MSG_CHECKING([whether fops->fsync() wants range]) + ZFS_LINUX_TEST_RESULT([fsync_range], [ + AC_MSG_RESULT([range]) + AC_DEFINE(HAVE_FSYNC_RANGE, 1, + [fops->fsync() with range]) + ],[ + ZFS_LINUX_TEST_ERROR([fops->fsync]) + ]) + ]) +]) diff --git a/config/kernel-generic_io_acct.m4 b/config/kernel-generic_io_acct.m4 new file mode 100644 index 000000000000..423b3e5a3521 --- /dev/null +++ b/config/kernel-generic_io_acct.m4 @@ -0,0 +1,64 @@ +dnl # +dnl # Check for generic io accounting interface. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT], [ + ZFS_LINUX_TEST_SRC([generic_acct_3args], [ + #include + + void (*generic_start_io_acct_f)(int, unsigned long, + struct hd_struct *) = &generic_start_io_acct; + void (*generic_end_io_acct_f)(int, struct hd_struct *, + unsigned long) = &generic_end_io_acct; + ], [ + generic_start_io_acct(0, 0, NULL); + generic_end_io_acct(0, NULL, 0); + ]) + + ZFS_LINUX_TEST_SRC([generic_acct_4args], [ + #include + + void (*generic_start_io_acct_f)(struct request_queue *, int, + unsigned long, struct hd_struct *) = &generic_start_io_acct; + void (*generic_end_io_acct_f)(struct request_queue *, int, + struct hd_struct *, unsigned long) = &generic_end_io_acct; + ], [ + generic_start_io_acct(NULL, 0, 0, NULL); + generic_end_io_acct(NULL, 0, NULL, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_IO_ACCT], [ + dnl # + dnl # 3.19 API addition + dnl # + dnl # torvalds/linux@394ffa50 allows us to increment iostat + dnl # counters without generic_make_request(). 
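+        dnl #
+        dnl # As a rough sketch (wrapper and argument names are
+        dnl # illustrative, not part of this patch), a call site might
+        dnl # consume the results of these accounting checks as:
+        dnl #
+        dnl #   #if defined(HAVE_GENERIC_IO_ACCT_3ARG)
+        dnl #           generic_start_io_acct(rw, sectors, part);
+        dnl #   #elif defined(HAVE_GENERIC_IO_ACCT_4ARG)
+        dnl #           generic_start_io_acct(q, rw, sectors, part);
+        dnl #   #endif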
+ dnl # + AC_MSG_CHECKING([whether generic IO accounting wants 3 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_3args], + [generic_start_io_acct], [block/bio.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_3ARG, 1, + [generic_start_io_acct()/generic_end_io_acct() available]) + ], [ + AC_MSG_RESULT(no) + + dnl # + dnl # Linux 4.14 API, + dnl # + dnl # generic_start_io_acct/generic_end_io_acct now require + dnl # request_queue to be provided. No functional changes, + dnl # but preparation for inflight accounting. + dnl # + AC_MSG_CHECKING([whether generic IO accounting wants 4 args]) + ZFS_LINUX_TEST_RESULT_SYMBOL([generic_acct_4args], + [generic_start_io_acct], [block/bio.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GENERIC_IO_ACCT_4ARG, 1, + [generic_start_io_acct()/generic_end_io_acct() ] + [4 arg available]) + ], [ + AC_MSG_RESULT(no) + ]) + ]) +]) diff --git a/config/kernel-generic_readlink.m4 b/config/kernel-generic_readlink.m4 new file mode 100644 index 000000000000..a7a33b408abd --- /dev/null +++ b/config/kernel-generic_readlink.m4 @@ -0,0 +1,25 @@ +dnl # +dnl # 4.10 API +dnl # +dnl # NULL inode_operations.readlink implies generic_readlink(), which +dnl # has been made static. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL], [ + ZFS_LINUX_TEST_SRC([generic_readlink_global], [ + #include + ],[ + int i __attribute__ ((unused)); + i = generic_readlink(NULL, NULL, 0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL], [ + AC_MSG_CHECKING([whether generic_readlink is global]) + ZFS_LINUX_TEST_RESULT([generic_readlink_global], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_GENERIC_READLINK, 1, + [generic_readlink is global]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel-get-disk-and-module.m4 b/config/kernel-get-disk-and-module.m4 new file mode 100644 index 000000000000..51cf7743cf0b --- /dev/null +++ b/config/kernel-get-disk-and-module.m4 @@ -0,0 +1,24 @@ +dnl # +dnl # 4.16 API change +dnl # Verify if get_disk_and_module() symbol is available. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_DISK_AND_MODULE], [ + ZFS_LINUX_TEST_SRC([get_disk_and_module], [ + #include + ], [ + struct gendisk *disk = NULL; + (void) get_disk_and_module(disk); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GET_DISK_AND_MODULE], [ + AC_MSG_CHECKING([whether get_disk_and_module() is available]) + ZFS_LINUX_TEST_RESULT_SYMBOL([get_disk_and_module], + [get_disk_and_module], [block/genhd.c], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_DISK_AND_MODULE, + 1, [get_disk_and_module() is available]) + ], [ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-get-disk-ro.m4 b/config/kernel-get-disk-ro.m4 new file mode 100644 index 000000000000..8a379c7669fa --- /dev/null +++ b/config/kernel-get-disk-ro.m4 @@ -0,0 +1,20 @@ +dnl # +dnl # 2.6.x API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_DISK_RO], [ + ZFS_LINUX_TEST_SRC([get_disk_ro], [ + #include + ],[ + struct gendisk *disk = NULL; + (void) get_disk_ro(disk); + ], [$NO_UNUSED_BUT_SET_VARIABLE]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GET_DISK_RO], [ + AC_MSG_CHECKING([whether get_disk_ro() is available]) + ZFS_LINUX_TEST_RESULT([get_disk_ro], [ + AC_MSG_RESULT(yes) + ],[ + ZFS_LINUX_TEST_ERROR([get_disk_ro()]) + ]) +]) diff --git a/config/kernel-get-link.m4 b/config/kernel-get-link.m4 new file mode 100644 index 000000000000..e4f478e37c18 --- /dev/null +++ b/config/kernel-get-link.m4 @@ -0,0 +1,104 @@ +dnl # +dnl # Supported get_link() interfaces checked newest to oldest. +dnl # Note this interface used to be named follow_link. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_LINK], [ + ZFS_LINUX_TEST_SRC([inode_operations_get_link], [ + #include + const char *get_link(struct dentry *de, struct inode *ip, + struct delayed_call *done) { return "symlink"; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .get_link = get_link, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_get_link_cookie], [ + #include + const char *get_link(struct dentry *de, struct + inode *ip, void **cookie) { return "symlink"; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .get_link = get_link, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_follow_link], [ + #include + const char *follow_link(struct dentry *de, + void **cookie) { return "symlink"; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .follow_link = follow_link, + }; + ],[]) + + ZFS_LINUX_TEST_SRC([inode_operations_follow_link_nameidata], [ + #include + void *follow_link(struct dentry *de, struct + nameidata *nd) { return (void *)NULL; } + static struct inode_operations + iops __attribute__ ((unused)) = { + .follow_link = follow_link, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GET_LINK], [ + dnl # + dnl # 4.5 API change + dnl # The get_link interface has added a delayed done call and + dnl # used it to retire the put_link() interface. + dnl # + AC_MSG_CHECKING([whether iops->get_link() passes delayed]) + ZFS_LINUX_TEST_RESULT([inode_operations_get_link], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_LINK_DELAYED, 1, [iops->get_link() delayed]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.5 API change + dnl # The follow_link() interface has been replaced by + dnl # get_link() which behaves the same as before except: + dnl # - An inode is passed as a separate argument + dnl # - When called in RCU mode a NULL dentry is passed. + dnl # + AC_MSG_CHECKING([whether iops->get_link() passes cookie]) + ZFS_LINUX_TEST_RESULT([inode_operations_get_link_cookie], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_GET_LINK_COOKIE, 1, + [iops->get_link() cookie]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # 4.2 API change + dnl # This kernel retired the nameidata structure. 
+ dnl # + AC_MSG_CHECKING( + [whether iops->follow_link() passes cookie]) + ZFS_LINUX_TEST_RESULT([inode_operations_follow_link], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FOLLOW_LINK_COOKIE, 1, + [iops->follow_link() cookie]) + ],[ + AC_MSG_RESULT(no) + + dnl # + dnl # 2.6.32 API + dnl # + AC_MSG_CHECKING( + [whether iops->follow_link() passes nameidata]) + ZFS_LINUX_TEST_RESULT( + [inode_operations_follow_link_nameidata],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FOLLOW_LINK_NAMEIDATA, 1, + [iops->follow_link() nameidata]) + ],[ + ZFS_LINUX_TEST_ERROR([get_link]) + ]) + ]) + ]) + ]) +]) diff --git a/config/kernel-global_page_state.m4 b/config/kernel-global_page_state.m4 new file mode 100644 index 000000000000..75043f40beca --- /dev/null +++ b/config/kernel-global_page_state.m4 @@ -0,0 +1,137 @@ +dnl # +dnl # 4.8 API change +dnl # +dnl # 75ef71840539 mm, vmstat: add infrastructure for per-node vmstats +dnl # 599d0c954f91 mm, vmscan: move LRU lists to node +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_NODE_PAGE_STATE], [ + ZFS_LINUX_TEST_SRC([global_node_page_state], [ + #include + #include + ],[ + (void) global_node_page_state(0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_NODE_PAGE_STATE], [ + AC_MSG_CHECKING([whether global_node_page_state() exists]) + ZFS_LINUX_TEST_RESULT([global_node_page_state], [ + AC_MSG_RESULT(yes) + AC_DEFINE(ZFS_GLOBAL_NODE_PAGE_STATE, 1, + [global_node_page_state() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # 4.14 API change +dnl # +dnl # c41f012ade0b mm: rename global_page_state to global_zone_page_state +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_GLOBAL_ZONE_PAGE_STATE], [ + ZFS_LINUX_TEST_SRC([global_zone_page_state], [ + #include + #include + ],[ + (void) global_zone_page_state(0); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_ZONE_PAGE_STATE], [ + AC_MSG_CHECKING([whether global_zone_page_state() exists]) + ZFS_LINUX_TEST_RESULT([global_zone_page_state], [ + AC_MSG_RESULT(yes) + AC_DEFINE(ZFS_GLOBAL_ZONE_PAGE_STATE, 1, + [global_zone_page_state() exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # Create a define and autoconf variable for an enum member +dnl # +AC_DEFUN([ZFS_AC_KERNEL_ENUM_MEMBER], [ + AC_MSG_CHECKING([whether enum $2 contains $1]) + AS_IF([AC_TRY_COMMAND( + "${srcdir}/scripts/enum-extract.pl" "$2" "$3" | egrep -qx $1)],[ + AC_MSG_RESULT([yes]) + AC_DEFINE(m4_join([_], [ZFS_ENUM], m4_toupper($2), $1), 1, + [enum $2 contains $1]) + m4_join([_], [ZFS_ENUM], m4_toupper($2), $1)=1 + ],[ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # Sanity check helpers +dnl # +AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR],[ + AC_MSG_RESULT(no) + AC_MSG_RESULT([$1 in either node_stat_item or zone_stat_item: $2]) + ZFS_LINUX_TEST_ERROR([global page state]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_CHECK], [ + enum_check_a="m4_join([_], [$ZFS_ENUM_NODE_STAT_ITEM], $1)" + enum_check_b="m4_join([_], [$ZFS_ENUM_ZONE_STAT_ITEM], $1)" + AS_IF([test -n "$enum_check_a" -a -n "$enum_check_b"],[ + ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR([$1], [DUPLICATE]) + ]) + AS_IF([test -z "$enum_check_a" -a -z "$enum_check_b"],[ + ZFS_AC_KERNEL_GLOBAL_PAGE_STATE_ENUM_ERROR([$1], [NOT FOUND]) + ]) +]) + +dnl # +dnl # Ensure the config tests are finding one and only one of each enum. 
diff --git a/config/kernel-group-info.m4 b/config/kernel-group-info.m4
new file mode 100644
index 000000000000..0fee1d36d50d
--- /dev/null
+++ b/config/kernel-group-info.m4
@@ -0,0 +1,22 @@
+dnl #
+dnl # 4.9 API change
+dnl # group_info changed from a 2d array via ->blocks to a 1d array via ->gid
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_GROUP_INFO_GID], [
+	ZFS_LINUX_TEST_SRC([group_info_gid], [
+		#include <linux/cred.h>
+	],[
+		struct group_info *gi = groups_alloc(1);
+		gi->gid[0] = KGIDT_INIT(0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_GROUP_INFO_GID], [
+	AC_MSG_CHECKING([whether group_info->gid exists])
+	ZFS_LINUX_TEST_RESULT([group_info_gid], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_GROUP_INFO_GID, 1, [group_info->gid exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-in-compat-syscall.m4 b/config/kernel-in-compat-syscall.m4
new file mode 100644
index 000000000000..baaac8c4fda2
--- /dev/null
+++ b/config/kernel-in-compat-syscall.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 4.5 API change
+dnl # Added in_compat_syscall() which can be overridden on a per-
+dnl # architecture basis.  Prior to this, is_compat_task() was the
+dnl # provided interface.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL], [
+	ZFS_LINUX_TEST_SRC([in_compat_syscall], [
+		#include <linux/compat.h>
+	],[
+		in_compat_syscall();
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_IN_COMPAT_SYSCALL], [
+	AC_MSG_CHECKING([whether in_compat_syscall() is available])
+	ZFS_LINUX_TEST_RESULT([in_compat_syscall], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_IN_COMPAT_SYSCALL, 1,
+		    [in_compat_syscall() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
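For context, a hedged sketch of the consumer side of the group_info check: the cr_group_at() accessor name is hypothetical, while GROUP_AT() is the stock pre-4.9 kernel macro over the two-dimensional ->blocks layout.

	#include <linux/cred.h>

	/* Hypothetical accessor: return the gid at index i of a group_info. */
	static inline kgid_t
	cr_group_at(const struct group_info *gi, int i)
	{
	#if defined(HAVE_GROUP_INFO_GID)
		return (gi->gid[i]);		/* 4.9+: flat array */
	#else
		return (GROUP_AT(gi, i));	/* older: 2-D ->blocks via GROUP_AT() */
	#endif
	}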
diff --git a/config/kernel-inode-create.m4 b/config/kernel-inode-create.m4
new file mode 100644
index 000000000000..9f28bcbd4f7f
--- /dev/null
+++ b/config/kernel-inode-create.m4
@@ -0,0 +1,26 @@
+dnl #
+dnl # 3.6 API change
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE_FLAGS], [
+	ZFS_LINUX_TEST_SRC([create_flags], [
+		#include <linux/fs.h>
+		#include <linux/sched.h>
+
+		int inode_create(struct inode *inode, struct dentry *dentry,
+		    umode_t umode, bool flag) { return 0; }
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.create = inode_create,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_CREATE_FLAGS], [
+	AC_MSG_CHECKING([whether iops->create() passes flags])
+	ZFS_LINUX_TEST_RESULT([create_flags], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([iops->create()])
+	])
+])
diff --git a/config/kernel-inode-getattr.m4 b/config/kernel-inode-getattr.m4
new file mode 100644
index 000000000000..48391d66f8bd
--- /dev/null
+++ b/config/kernel-inode-getattr.m4
@@ -0,0 +1,53 @@
+dnl #
+dnl # Linux 4.11 API
+dnl # See torvalds/linux@a528d35
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [
+	ZFS_LINUX_TEST_SRC([inode_operations_getattr_path], [
+		#include <linux/fs.h>
+
+		int test_getattr(
+		    const struct path *p, struct kstat *k,
+		    u32 request_mask, unsigned int query_flags)
+		    { return 0; }
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.getattr = test_getattr,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([inode_operations_getattr_vfsmount], [
+		#include <linux/fs.h>
+
+		int test_getattr(
+		    struct vfsmount *mnt, struct dentry *d,
+		    struct kstat *k)
+		    { return 0; }
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.getattr = test_getattr,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_GETATTR], [
+	AC_MSG_CHECKING([whether iops->getattr() takes a path])
+	ZFS_LINUX_TEST_RESULT([inode_operations_getattr_path], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_PATH_IOPS_GETATTR, 1,
+		    [iops->getattr() takes a path])
+	],[
+		AC_MSG_RESULT(no)
+
+		AC_MSG_CHECKING([whether iops->getattr() takes a vfsmount])
+		ZFS_LINUX_TEST_RESULT([inode_operations_getattr_vfsmount], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_VFSMOUNT_IOPS_GETATTR, 1,
+			    [iops->getattr() takes a vfsmount])
+		],[
+			AC_MSG_RESULT(no)
+		])
+	])
+])
diff --git a/config/kernel-inode-lock.m4 b/config/kernel-inode-lock.m4
new file mode 100644
index 000000000000..5eb04af78771
--- /dev/null
+++ b/config/kernel-inode-lock.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 4.7 API change
+dnl # i_mutex is changed to i_rwsem.  Instead of directly using
+dnl # i_mutex/i_rwsem, we should use inode_lock() and inode_lock_shared().
+dnl # We test inode_lock_shared because inode_lock is introduced earlier.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_LOCK], [
+	ZFS_LINUX_TEST_SRC([inode_lock], [
+		#include <linux/fs.h>
+	],[
+		struct inode *inode = NULL;
+		inode_lock_shared(inode);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_LOCK], [
+	AC_MSG_CHECKING([whether inode_lock_shared() exists])
+	ZFS_LINUX_TEST_RESULT([inode_lock], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_LOCK_SHARED, 1, [yes])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
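For context, a hedged sketch of how a filesystem might provide ->getattr() under both probed signatures; the zpl_getattr() name is hypothetical, and generic_fillattr() is the stock kernel helper of that era.

	#include <linux/fs.h>
	#include <linux/stat.h>

	#if defined(HAVE_PATH_IOPS_GETATTR)
	/* 4.11+: a struct path plus a mask describing the wanted fields. */
	static int
	zpl_getattr(const struct path *path, struct kstat *stat,
	    u32 request_mask, unsigned int query_flags)
	{
		generic_fillattr(d_inode(path->dentry), stat);
		return (0);
	}
	#elif defined(HAVE_VFSMOUNT_IOPS_GETATTR)
	/* pre-4.11: vfsmount + dentry, no mask. */
	static int
	zpl_getattr(struct vfsmount *mnt, struct dentry *dentry,
	    struct kstat *stat)
	{
		generic_fillattr(d_inode(dentry), stat);
		return (0);
	}
	#endif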
diff --git a/config/kernel-inode-lookup.m4 b/config/kernel-inode-lookup.m4
new file mode 100644
index 000000000000..1a56e69b04aa
--- /dev/null
+++ b/config/kernel-inode-lookup.m4
@@ -0,0 +1,26 @@
+dnl #
+dnl # 3.6 API change
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS], [
+	ZFS_LINUX_TEST_SRC([lookup_flags], [
+		#include <linux/fs.h>
+		#include <linux/sched.h>
+
+		struct dentry *inode_lookup(struct inode *inode,
+		    struct dentry *dentry, unsigned int flags) { return NULL; }
+
+		static const struct inode_operations iops
+		    __attribute__ ((unused)) = {
+			.lookup = inode_lookup,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_LOOKUP_FLAGS], [
+	AC_MSG_CHECKING([whether iops->lookup() passes flags])
+	ZFS_LINUX_TEST_RESULT([lookup_flags], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([iops->lookup()])
+	])
+])
diff --git a/config/kernel-inode-set-flags.m4 b/config/kernel-inode-set-flags.m4
new file mode 100644
index 000000000000..133f666a9517
--- /dev/null
+++ b/config/kernel-inode-set-flags.m4
@@ -0,0 +1,22 @@
+dnl #
+dnl # 3.15 API change
+dnl # inode_set_flags() introduced to set i_flags
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS], [
+	ZFS_LINUX_TEST_SRC([inode_set_flags], [
+		#include <linux/fs.h>
+	],[
+		struct inode inode;
+		inode_set_flags(&inode, S_IMMUTABLE, S_IMMUTABLE);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_FLAGS], [
+	AC_MSG_CHECKING([whether inode_set_flags() exists])
+	ZFS_LINUX_TEST_RESULT([inode_set_flags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_SET_FLAGS, 1, [inode_set_flags() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-inode-set-iversion.m4 b/config/kernel-inode-set-iversion.m4
new file mode 100644
index 000000000000..dd415de324a7
--- /dev/null
+++ b/config/kernel-inode-set-iversion.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # 4.16 API change
+dnl # inode_set_iversion() introduced to set i_version
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION], [
+	ZFS_LINUX_TEST_SRC([inode_set_iversion], [
+		#include <linux/iversion.h>
+	],[
+		struct inode inode;
+		inode_set_iversion(&inode, 1);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_SET_IVERSION], [
+	AC_MSG_CHECKING([whether inode_set_iversion() exists])
+	ZFS_LINUX_TEST_RESULT([inode_set_iversion], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_SET_IVERSION, 1,
+		    [inode_set_iversion() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4
new file mode 100644
index 000000000000..9c016c790081
--- /dev/null
+++ b/config/kernel-inode-times.m4
@@ -0,0 +1,50 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [
+
+	dnl #
+	dnl # 5.6 API change
+	dnl # timespec64_trunc() replaced by timestamp_truncate() interface.
+	dnl #
+	ZFS_LINUX_TEST_SRC([timestamp_truncate], [
+		#include <linux/fs.h>
+	],[
+		struct timespec64 ts;
+		struct inode ip;
+
+		memset(&ts, 0, sizeof(ts));
+		ts = timestamp_truncate(ts, &ip);
+	])
+
+	dnl #
+	dnl # 4.18 API change
+	dnl # i_atime, i_mtime, and i_ctime changed from timespec to timespec64.
+	dnl #
+	ZFS_LINUX_TEST_SRC([inode_times], [
+		#include <linux/fs.h>
+	],[
+		struct inode ip;
+		struct timespec ts;
+
+		memset(&ip, 0, sizeof(ip));
+		ts = ip.i_mtime;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [
+	AC_MSG_CHECKING([whether timestamp_truncate() exists])
+	ZFS_LINUX_TEST_RESULT([timestamp_truncate], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_TIMESTAMP_TRUNCATE, 1,
+		    [timestamp_truncate() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
+	AC_MSG_CHECKING([whether inode->i_*time's are timespec64])
+	ZFS_LINUX_TEST_RESULT([inode_times], [
+		AC_MSG_RESULT(no)
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_INODE_TIMESPEC64_TIMES, 1,
+		    [inode->i_*time's are timespec64])
+	])
+])
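For context, a hedged sketch of how the two inode-time results combine on the consumer side; the inode_timespec_t typedef and zpl_trunc_time() helper are hypothetical names, and the pre-4.18 branch assumes the old timespec_trunc() helper.

	#include <linux/fs.h>

	/*
	 * Hypothetical compat typedef so callers can hold an inode timestamp
	 * without caring about the 4.18 timespec64 conversion.
	 */
	#if defined(HAVE_INODE_TIMESPEC64_TIMES)
	typedef struct timespec64 inode_timespec_t;
	#else
	typedef struct timespec inode_timespec_t;
	#endif

	/* Truncate a timestamp to the filesystem's granularity. */
	static inline inode_timespec_t
	zpl_trunc_time(struct inode *ip, inode_timespec_t ts)
	{
	#if defined(HAVE_INODE_TIMESTAMP_TRUNCATE)
		return (timestamp_truncate(ts, ip));		/* 5.6+ */
	#elif defined(HAVE_INODE_TIMESPEC64_TIMES)
		return (timespec64_trunc(ts, ip->i_sb->s_time_gran));
	#else
		return (timespec_trunc(ts, ip->i_sb->s_time_gran));
	#endif
	}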
diff --git a/config/kernel-insert-inode-locked.m4 b/config/kernel-insert-inode-locked.m4
new file mode 100644
index 000000000000..348aff9a5770
--- /dev/null
+++ b/config/kernel-insert-inode-locked.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 2.6.28 API change
+dnl # Added insert_inode_locked() helper function.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED], [
+	ZFS_LINUX_TEST_SRC([insert_inode_locked], [
+		#include <linux/fs.h>
+	], [
+		insert_inode_locked(NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INSERT_INODE_LOCKED], [
+	AC_MSG_CHECKING([whether insert_inode_locked() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([insert_inode_locked],
+	    [insert_inode_locked], [fs/inode.c], [
+		AC_MSG_RESULT(yes)
+	], [
+		ZFS_LINUX_TEST_ERROR([insert_inode_locked()])
+	])
+])
diff --git a/config/kernel-is_owner_or_cap.m4 b/config/kernel-is_owner_or_cap.m4
new file mode 100644
index 000000000000..3df6163da270
--- /dev/null
+++ b/config/kernel-is_owner_or_cap.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # 2.6.39 API change,
+dnl # The is_owner_or_cap() macro was renamed to inode_owner_or_capable().
+dnl # This is used for permission checks in the xattr and file attribute call
+dnl # paths.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE], [
+	ZFS_LINUX_TEST_SRC([inode_owner_or_capable], [
+		#include <linux/fs.h>
+	],[
+		struct inode *ip = NULL;
+		(void) inode_owner_or_capable(ip);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE], [
+	AC_MSG_CHECKING([whether inode_owner_or_capable() exists])
+	ZFS_LINUX_TEST_RESULT([inode_owner_or_capable], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([capability])
+	])
+])
diff --git a/config/kernel-kmap-atomic-args.m4 b/config/kernel-kmap-atomic-args.m4
new file mode 100644
index 000000000000..1172505afc68
--- /dev/null
+++ b/config/kernel-kmap-atomic-args.m4
@@ -0,0 +1,22 @@
+dnl #
+dnl # 2.6.37 API change
+dnl # kmap_atomic changed from assigning hard-coded named slots to using
+dnl # push/pop based dynamic allocation.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS], [
+	ZFS_LINUX_TEST_SRC([kmap_atomic], [
+		#include <linux/pagemap.h>
+	],[
+		struct page page;
+		kmap_atomic(&page);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS], [
+	AC_MSG_CHECKING([whether kmap_atomic wants 1 arg])
+	ZFS_LINUX_TEST_RESULT([kmap_atomic], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([kmap_atomic()])
+	])
+])
diff --git a/config/kernel-kmem-cache.m4 b/config/kernel-kmem-cache.m4
new file mode 100644
index 000000000000..0e9fe9eb2a90
--- /dev/null
+++ b/config/kernel-kmem-cache.m4
@@ -0,0 +1,41 @@
+dnl #
+dnl # grsecurity API change,
+dnl # kmem_cache_create() with SLAB_USERCOPY flag replaced by
+dnl # kmem_cache_create_usercopy().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KMEM_CACHE_CREATE_USERCOPY], [
+	ZFS_LINUX_TEST_SRC([kmem_cache_create_usercopy], [
+		#include <linux/slab.h>
+		static void ctor(void *foo) { /* fake ctor */ }
+	],[
+		struct kmem_cache *skc_linux_cache;
+		const char *name = "test";
+		size_t size = 4096;
+		size_t align = 8;
+		unsigned long flags = 0;
+		size_t useroffset = 0;
+		size_t usersize = size - useroffset;
+
+		skc_linux_cache = kmem_cache_create_usercopy(
+		    name, size, align, flags, useroffset, usersize, ctor);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY], [
+	AC_MSG_CHECKING([whether kmem_cache_create_usercopy() exists])
+	ZFS_LINUX_TEST_RESULT([kmem_cache_create_usercopy], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KMEM_CACHE_CREATE_USERCOPY, 1,
+		    [kmem_cache_create_usercopy() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KMEM_CACHE], [
+	ZFS_AC_KERNEL_SRC_KMEM_CACHE_CREATE_USERCOPY
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KMEM_CACHE], [
+	ZFS_AC_KERNEL_KMEM_CACHE_CREATE_USERCOPY
+])
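For context, a hedged sketch of a cache-creation wrapper keyed to this check; the spl_cache_create() name is hypothetical, and for simplicity it whitelists the whole object for user copies when the usercopy API exists.

	#include <linux/slab.h>

	static struct kmem_cache *
	spl_cache_create(const char *name, size_t size, void (*ctor)(void *))
	{
	#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY)
		/* grsecurity-style kernels: whitelist [0, size) for usercopy. */
		return (kmem_cache_create_usercopy(name, size, 0, 0,
		    0, size, ctor));
	#else
		/* classic API: no usercopy whitelisting. */
		return (kmem_cache_create(name, size, 0, 0, ctor));
	#endif
	}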
diff --git a/config/kernel-kmem.m4 b/config/kernel-kmem.m4
new file mode 100644
index 000000000000..43f9e72f88d8
--- /dev/null
+++ b/config/kernel-kmem.m4
@@ -0,0 +1,108 @@
+dnl #
+dnl # Enabled by default, it provides a minimal level of memory tracking.
+dnl # A total count of bytes allocated is kept for each alloc and free.
+dnl # Then at module unload time a report to the console will be printed
+dnl # if memory was leaked.
+dnl #
+AC_DEFUN([SPL_AC_DEBUG_KMEM], [
+	AC_ARG_ENABLE([debug-kmem],
+	    [AS_HELP_STRING([--enable-debug-kmem],
+	    [Enable basic kmem accounting @<:@default=no@:>@])],
+	    [],
+	    [enable_debug_kmem=no])
+
+	AS_IF([test "x$enable_debug_kmem" = xyes],
+	[
+		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM"
+		DEBUG_KMEM="_with_debug_kmem"
+		AC_DEFINE([DEBUG_KMEM], [1],
+		    [Define to 1 to enable basic kmem accounting])
+	], [
+		DEBUG_KMEM="_without_debug_kmem"
+	])
+
+	AC_SUBST(DEBUG_KMEM)
+	AC_MSG_CHECKING([whether basic kmem accounting is enabled])
+	AC_MSG_RESULT([$enable_debug_kmem])
+])
+
+dnl #
+dnl # Disabled by default, it provides detailed memory tracking.  This
+dnl # feature also requires --enable-debug-kmem to be set.  When enabled
+dnl # not only will total bytes be tracked but also the location of every
+dnl # alloc and free.  When the SPL module is unloaded a list of all leaked
+dnl # addresses and where they were allocated will be dumped to the console.
+dnl # Enabling this feature has a significant impact on performance but it
+dnl # makes finding memory leaks straightforward.
+dnl #
+AC_DEFUN([SPL_AC_DEBUG_KMEM_TRACKING], [
+	AC_ARG_ENABLE([debug-kmem-tracking],
+	    [AS_HELP_STRING([--enable-debug-kmem-tracking],
+	    [Enable detailed kmem tracking @<:@default=no@:>@])],
+	    [],
+	    [enable_debug_kmem_tracking=no])
+
+	AS_IF([test "x$enable_debug_kmem_tracking" = xyes],
+	[
+		KERNELCPPFLAGS="${KERNELCPPFLAGS} -DDEBUG_KMEM_TRACKING"
+		DEBUG_KMEM_TRACKING="_with_debug_kmem_tracking"
+		AC_DEFINE([DEBUG_KMEM_TRACKING], [1],
+		    [Define to 1 to enable detailed kmem tracking])
+	], [
+		DEBUG_KMEM_TRACKING="_without_debug_kmem_tracking"
+	])
+
+	AC_SUBST(DEBUG_KMEM_TRACKING)
+	AC_MSG_CHECKING([whether detailed kmem tracking is enabled])
+	AC_MSG_RESULT([$enable_debug_kmem_tracking])
+])
+
+dnl #
+dnl # 4.12 API,
+dnl # Added kvmalloc allocation strategy
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KVMALLOC], [
+	ZFS_LINUX_TEST_SRC([kvmalloc], [
+		#include <linux/mm.h>
+	],[
+		void *p __attribute__ ((unused));
+
+		p = kvmalloc(0, GFP_KERNEL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KVMALLOC], [
+	AC_MSG_CHECKING([whether kvmalloc(size, flags) is available])
+	ZFS_LINUX_TEST_RESULT([kvmalloc], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KVMALLOC, 1, [kvmalloc exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 5.8 API,
+dnl # __vmalloc PAGE_KERNEL removal
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL], [
+	ZFS_LINUX_TEST_SRC([__vmalloc], [
+		#include <linux/mm.h>
+		#include <linux/vmalloc.h>
+	],[
+		void *p __attribute__ ((unused));
+
+		p = __vmalloc(0, GFP_KERNEL, PAGE_KERNEL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL], [
+	AC_MSG_CHECKING([whether __vmalloc(size, flags, pageflags) is available])
+	ZFS_LINUX_TEST_RESULT([__vmalloc], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_VMALLOC_PAGE_KERNEL, 1, [__vmalloc page flags exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
\ No newline at end of file
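For context, a hedged sketch of the shim this check enables: Linux 5.8 dropped the pgprot_t argument from __vmalloc(), so callers that previously passed PAGE_KERNEL need a wrapper along these lines. The spl_vmalloc() name is hypothetical.

	#include <linux/mm.h>
	#include <linux/vmalloc.h>

	static void *
	spl_vmalloc(unsigned long size, gfp_t flags)
	{
	#if defined(HAVE_VMALLOC_PAGE_KERNEL)
		return (__vmalloc(size, flags, PAGE_KERNEL));	/* pre-5.8 */
	#else
		return (__vmalloc(size, flags));		/* 5.8+ */
	#endif
	}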
diff --git a/config/kernel-kstrtoul.m4 b/config/kernel-kstrtoul.m4
new file mode 100644
index 000000000000..8e4b542978a9
--- /dev/null
+++ b/config/kernel-kstrtoul.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 2.6.39 API change
+dnl # Added kstrtoul()
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KSTRTOUL], [
+	ZFS_LINUX_TEST_SRC([kstrtoul], [
+		#include <linux/kernel.h>
+	],[
+		int ret __attribute__ ((unused)) = kstrtoul(NULL, 10, NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KSTRTOUL], [
+	AC_MSG_CHECKING([whether kstrtoul() exists])
+	ZFS_LINUX_TEST_RESULT([kstrtoul], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KSTRTOUL, 1, [kstrtoul() exists])
+	],[
+		ZFS_LINUX_TEST_ERROR([kstrtoul()])
+	])
+])
diff --git a/config/kernel-ktime.m4 b/config/kernel-ktime.m4
new file mode 100644
index 000000000000..64c3b5f90328
--- /dev/null
+++ b/config/kernel-ktime.m4
@@ -0,0 +1,55 @@
+dnl #
+dnl # 4.18: ktime_get_coarse_real_ts64() replaces current_kernel_time64().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KTIME_GET_COARSE_REAL_TS64], [
+	ZFS_LINUX_TEST_SRC([ktime_get_coarse_real_ts64], [
+		#include <linux/ktime.h>
+	], [
+		struct timespec64 ts;
+		ktime_get_coarse_real_ts64(&ts);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64], [
+	AC_MSG_CHECKING([whether ktime_get_coarse_real_ts64() exists])
+	ZFS_LINUX_TEST_RESULT([ktime_get_coarse_real_ts64], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KTIME_GET_COARSE_REAL_TS64, 1,
+		    [ktime_get_coarse_real_ts64() exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 4.18: ktime_get_raw_ts64() replaces getrawmonotonic64().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KTIME_GET_RAW_TS64], [
+	ZFS_LINUX_TEST_SRC([ktime_get_raw_ts64], [
+		#include <linux/ktime.h>
+	], [
+		struct timespec64 ts;
+		ktime_get_raw_ts64(&ts);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KTIME_GET_RAW_TS64], [
+	AC_MSG_CHECKING([whether ktime_get_raw_ts64() exists])
+	ZFS_LINUX_TEST_RESULT([ktime_get_raw_ts64], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KTIME_GET_RAW_TS64, 1,
+		    [ktime_get_raw_ts64() exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KTIME], [
+	ZFS_AC_KERNEL_SRC_KTIME_GET_COARSE_REAL_TS64
+	ZFS_AC_KERNEL_SRC_KTIME_GET_RAW_TS64
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KTIME], [
+	ZFS_AC_KERNEL_KTIME_GET_COARSE_REAL_TS64
+	ZFS_AC_KERNEL_KTIME_GET_RAW_TS64
+])
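For context, a hedged sketch of a coarse wall-clock helper built on this check; the spl_gethrestime() name is hypothetical, and the fallback assumes a 4.x kernel where current_kernel_time64() still exists.

	#include <linux/ktime.h>

	static inline void
	spl_gethrestime(struct timespec64 *ts)
	{
	#if defined(HAVE_KTIME_GET_COARSE_REAL_TS64)
		ktime_get_coarse_real_ts64(ts);		/* 4.18+ */
	#else
		*ts = current_kernel_time64();		/* deprecated in 4.18 */
	#endif
	}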
diff --git a/config/kernel-kuid-helpers.m4 b/config/kernel-kuid-helpers.m4
new file mode 100644
index 000000000000..38a439fa6ea9
--- /dev/null
+++ b/config/kernel-kuid-helpers.m4
@@ -0,0 +1,24 @@
+dnl #
+dnl # 3.5 API change,
+dnl # Since user namespaces were introduced in kernel version 3.5, it
+dnl # became necessary to go through one more level of indirection
+dnl # when dealing with uid/gid - namely the kuid type.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KUID_HELPERS], [
+	ZFS_LINUX_TEST_SRC([i_uid_read], [
+		#include <linux/fs.h>
+	],[
+		struct inode *ip = NULL;
+		(void) i_uid_read(ip);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KUID_HELPERS], [
+	AC_MSG_CHECKING([whether i_(uid|gid)_(read|write) exist])
+	ZFS_LINUX_TEST_RESULT([i_uid_read], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([i_uid_read])
+	])
+])
diff --git a/config/kernel-kuidgid.m4 b/config/kernel-kuidgid.m4
new file mode 100644
index 000000000000..b7e441408cb9
--- /dev/null
+++ b/config/kernel-kuidgid.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 3.8 API change,
+dnl # User namespaces, use kuid_t in place of uid_t where available.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KUIDGID_T], [
+	ZFS_LINUX_TEST_SRC([kuidgid_t], [
+		#include <linux/uidgid.h>
+	], [
+		kuid_t userid __attribute__ ((unused)) = KUIDT_INIT(0);
+		kgid_t groupid __attribute__ ((unused)) = KGIDT_INIT(0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KUIDGID_T], [
+	AC_MSG_CHECKING([whether kuid_t/kgid_t is available])
+	ZFS_LINUX_TEST_RESULT([kuidgid_t], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([kuid_t/kgid_t])
+	])
+])
diff --git a/config/kernel-lseek-execute.m4 b/config/kernel-lseek-execute.m4
new file mode 100644
index 000000000000..652f611f8da4
--- /dev/null
+++ b/config/kernel-lseek-execute.m4
@@ -0,0 +1,27 @@
+dnl #
+dnl # 3.11 API change
+dnl # lseek_execute helper exported
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE], [
+	ZFS_LINUX_TEST_SRC([lseek_execute], [
+		#include <linux/fs.h>
+	], [
+		struct file *fp __attribute__ ((unused)) = NULL;
+		struct inode *ip __attribute__ ((unused)) = NULL;
+		loff_t offset __attribute__ ((unused)) = 0;
+		loff_t maxsize __attribute__ ((unused)) = 0;
+
+		lseek_execute(fp, ip, offset, maxsize);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_LSEEK_EXECUTE], [
+	AC_MSG_CHECKING([whether lseek_execute() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([lseek_execute],
+	    [lseek_execute], [fs/read_write.c], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_LSEEK_EXECUTE, 1, [lseek_execute() is available])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4
new file mode 100644
index 000000000000..1576fece1368
--- /dev/null
+++ b/config/kernel-make-request-fn.m4
@@ -0,0 +1,101 @@
+dnl #
+dnl # Check for the make_request_fn interface.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [
+	ZFS_LINUX_TEST_SRC([make_request_fn_void], [
+		#include <linux/blkdev.h>
+		void make_request(struct request_queue *q,
+		    struct bio *bio) { return; }
+	],[
+		blk_queue_make_request(NULL, &make_request);
+	])
+
+	ZFS_LINUX_TEST_SRC([make_request_fn_blk_qc_t], [
+		#include <linux/blkdev.h>
+		blk_qc_t make_request(struct request_queue *q,
+		    struct bio *bio) { return (BLK_QC_T_NONE); }
+	],[
+		blk_queue_make_request(NULL, &make_request);
+	])
+
+	ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn], [
+		#include <linux/blkdev.h>
+		blk_qc_t make_request(struct request_queue *q,
+		    struct bio *bio) { return (BLK_QC_T_NONE); }
+	],[
+		struct request_queue *q __attribute__ ((unused));
+		q = blk_alloc_queue(make_request, NUMA_NO_NODE);
+	])
+
+	ZFS_LINUX_TEST_SRC([block_device_operations_submit_bio], [
+		#include <linux/blkdev.h>
+	],[
+		struct block_device_operations o;
+		o.submit_bio = NULL;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_MAKE_REQUEST_FN], [
+	dnl #
+	dnl # Linux 5.9 API Change
+	dnl # make_request_fn was moved into block_device_operations->submit_bio
+	dnl #
+	AC_MSG_CHECKING([whether submit_bio is member of struct block_device_operations])
+	ZFS_LINUX_TEST_RESULT([block_device_operations_submit_bio], [
+		AC_MSG_RESULT(yes)
+
+		AC_DEFINE(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS, 1,
+		    [submit_bio is member of struct block_device_operations])
+	],[
+		AC_MSG_RESULT(no)
+
+		dnl #
+		dnl # Linux 5.7 API Change
+		dnl # blk_alloc_queue() expects request function.
+		dnl #
+		AC_MSG_CHECKING([whether blk_alloc_queue() expects request function])
+		ZFS_LINUX_TEST_RESULT([blk_alloc_queue_request_fn], [
+			AC_MSG_RESULT(yes)
+
+			dnl # Checked as part of the blk_alloc_queue_request_fn test
+			AC_MSG_CHECKING([whether make_request_fn() returns blk_qc_t])
+			AC_MSG_RESULT(yes)
+
+			AC_DEFINE(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN, 1,
+			    [blk_alloc_queue() expects request function])
+			AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t,
+			    [make_request_fn() return type])
+			AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1,
+			    [Noting that make_request_fn() returns blk_qc_t])
+		],[
+			AC_MSG_RESULT(no)
+
+			dnl #
+			dnl # Linux 3.2 API Change
+			dnl # make_request_fn returns void.
+			dnl #
+			AC_MSG_CHECKING([whether make_request_fn() returns void])
+			ZFS_LINUX_TEST_RESULT([make_request_fn_void], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(MAKE_REQUEST_FN_RET, void,
+				    [make_request_fn() return type])
+				AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_VOID, 1,
+				    [Noting that make_request_fn() returns void])
+			],[
+				AC_MSG_RESULT(no)
+
+				dnl #
+				dnl # Linux 4.4 API Change
+				dnl # make_request_fn returns blk_qc_t.
+				dnl #
+				AC_MSG_CHECKING(
+				    [whether make_request_fn() returns blk_qc_t])
+				ZFS_LINUX_TEST_RESULT([make_request_fn_blk_qc_t], [
+					AC_MSG_RESULT(yes)
+					AC_DEFINE(MAKE_REQUEST_FN_RET, blk_qc_t,
+					    [make_request_fn() return type])
+					AC_DEFINE(HAVE_MAKE_REQUEST_FN_RET_QC, 1,
+					    [Noting that make_request_fn() ]
+					    [returns blk_qc_t])
+				],[
+					ZFS_LINUX_TEST_ERROR([make_request_fn])
+				])
+			])
+		])
+	])
+])
diff --git a/config/kernel-misc-minor.m4 b/config/kernel-misc-minor.m4
new file mode 100644
index 000000000000..20fe2cd2f3cd
--- /dev/null
+++ b/config/kernel-misc-minor.m4
@@ -0,0 +1,26 @@
+dnl #
+dnl # Determine an available miscellaneous minor number which can be used
+dnl # for the /dev/zfs device.  This is needed because kernel module
+dnl # auto-loading depends on registering a reserved non-conflicting minor
+dnl # number.  Start with a large known available unreserved minor and work
+dnl # our way down to lower values if a collision is detected.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_MISC_MINOR], [
+	AC_MSG_CHECKING([whether /dev/zfs minor is available])
+
+	for i in $(seq 249 -1 200); do
+		if ! grep -q "^#define\s\+.*_MINOR\s\+.*$i" \
+		    ${LINUX}/include/linux/miscdevice.h; then
+			ZFS_DEVICE_MINOR="$i"
+			AC_MSG_RESULT($ZFS_DEVICE_MINOR)
+			AC_DEFINE_UNQUOTED([ZFS_DEVICE_MINOR],
+			    [$ZFS_DEVICE_MINOR], [/dev/zfs minor])
+			break
+		fi
+	done
+
+	AS_IF([test -z "$ZFS_DEVICE_MINOR"], [
+		AC_MSG_ERROR([
+	*** No available misc minor numbers for use.])
+	])
+])
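For context, a hedged sketch of queue creation across the three probed block-layer APIs; the zvol_alloc_queue(), zvol_request(), and zvol_submit_bio() names are illustrative only, not the actual OpenZFS code.

	#include <linux/blkdev.h>

	#if defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS)
	/* 5.9+: the handler lives in block_device_operations->submit_bio. */
	static blk_qc_t zvol_submit_bio(struct bio *bio);

	static struct request_queue *
	zvol_alloc_queue(void)
	{
		return (blk_alloc_queue(NUMA_NO_NODE));
	}
	#elif defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN)
	/* 5.7 - 5.8: the handler is passed straight to blk_alloc_queue(). */
	static blk_qc_t zvol_request(struct request_queue *q, struct bio *bio);

	static struct request_queue *
	zvol_alloc_queue(void)
	{
		return (blk_alloc_queue(zvol_request, NUMA_NO_NODE));
	}
	#else
	/* <= 5.6: allocate first, then attach with blk_queue_make_request(). */
	static MAKE_REQUEST_FN_RET zvol_request(struct request_queue *q,
	    struct bio *bio);

	static struct request_queue *
	zvol_alloc_queue(void)
	{
		struct request_queue *q = blk_alloc_queue(GFP_KERNEL);

		if (q != NULL)
			blk_queue_make_request(q, zvol_request);
		return (q);
	}
	#endif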
diff --git a/config/kernel-mkdir-umode-t.m4 b/config/kernel-mkdir-umode-t.m4
new file mode 100644
index 000000000000..19599670df3b
--- /dev/null
+++ b/config/kernel-mkdir-umode-t.m4
@@ -0,0 +1,32 @@
+dnl #
+dnl # 3.3 API change
+dnl # The VFS .create, .mkdir and .mknod callbacks were updated to take a
+dnl # umode_t type rather than an int.  The expectation is that any backport
+dnl # would also change all three prototypes.  However, if it turns out that
+dnl # some distribution doesn't backport the whole thing this could be
+dnl # broken apart into three separate checks.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR_UMODE_T], [
+	ZFS_LINUX_TEST_SRC([inode_operations_mkdir], [
+		#include <linux/fs.h>
+
+		int mkdir(struct inode *inode, struct dentry *dentry,
+		    umode_t umode) { return 0; }
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.mkdir = mkdir,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_MKDIR_UMODE_T], [
+	AC_MSG_CHECKING([whether iops->create()/mkdir()/mknod() take umode_t])
+	ZFS_LINUX_TEST_RESULT([inode_operations_mkdir], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_MKDIR_UMODE_T, 1,
+		    [iops->create()/mkdir()/mknod() take umode_t])
+	],[
+		ZFS_LINUX_TEST_ERROR([mkdir()])
+	])
+])
diff --git a/config/kernel-mod-param.m4 b/config/kernel-mod-param.m4
new file mode 100644
index 000000000000..e00f19d61e7d
--- /dev/null
+++ b/config/kernel-mod-param.m4
@@ -0,0 +1,33 @@
+dnl #
+dnl # Grsecurity kernel API change
+dnl # constified parameters of module_param_call() methods
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST], [
+	ZFS_LINUX_TEST_SRC([module_param_call], [
+		#include <linux/module.h>
+		#include <linux/moduleparam.h>
+
+		int param_get(char *b, const struct kernel_param *kp)
+		{
+			return (0);
+		}
+
+		int param_set(const char *b, const struct kernel_param *kp)
+		{
+			return (0);
+		}
+
+		module_param_call(p, param_set, param_get, NULL, 0644);
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST], [
+	AC_MSG_CHECKING([whether module_param_call() is hardened])
+	ZFS_LINUX_TEST_RESULT([module_param_call], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(MODULE_PARAM_CALL_CONST, 1,
+		    [hardened module_param_call])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-objtool.m4 b/config/kernel-objtool.m4
new file mode 100644
index 000000000000..bf60e7869213
--- /dev/null
+++ b/config/kernel-objtool.m4
@@ -0,0 +1,45 @@
+dnl #
+dnl # Check for objtool support.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_OBJTOOL], [
+
+	dnl # 4.6 API for compile-time stack validation
+	ZFS_LINUX_TEST_SRC([objtool], [
+		#undef __ASSEMBLY__
+		#include <asm/frame.h>
+	],[
+		#if !defined(FRAME_BEGIN)
+		CTASSERT(1);
+		#endif
+	])
+
+	dnl # 4.6 API added STACK_FRAME_NON_STANDARD macro
+	ZFS_LINUX_TEST_SRC([stack_frame_non_standard], [
+		#include <linux/frame.h>
+	],[
+		#if !defined(STACK_FRAME_NON_STANDARD)
+		CTASSERT(1);
+		#endif
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_OBJTOOL], [
+	AC_MSG_CHECKING(
+	    [whether compile-time stack validation (objtool) is available])
+	ZFS_LINUX_TEST_RESULT([objtool], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_OBJTOOL, 1,
+		    [kernel does stack verification])
+
+		AC_MSG_CHECKING([whether STACK_FRAME_NON_STANDARD is defined])
+		ZFS_LINUX_TEST_RESULT([stack_frame_non_standard], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_STACK_FRAME_NON_STANDARD, 1,
+			    [STACK_FRAME_NON_STANDARD is defined])
+		],[
+			AC_MSG_RESULT(no)
+		])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-pde-data.m4 b/config/kernel-pde-data.m4
new file mode 100644
index 000000000000..f866d77a11df
--- /dev/null
+++ b/config/kernel-pde-data.m4
@@ -0,0 +1,20 @@
+dnl #
+dnl # 3.10 API change,
+dnl # PDE is replaced by PDE_DATA
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PDE_DATA], [
+	ZFS_LINUX_TEST_SRC([pde_data], [
+		#include <linux/proc_fs.h>
+	], [
+		PDE_DATA(NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PDE_DATA], [
+	AC_MSG_CHECKING([whether PDE_DATA() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([pde_data], [PDE_DATA], [], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([PDE_DATA])
+	])
+])
diff --git a/config/kernel-percpu.m4 b/config/kernel-percpu.m4
new file mode 100644
index 000000000000..e9654a69ee0a
--- /dev/null
+++ b/config/kernel-percpu.m4
@@ -0,0 +1,34 @@
+dnl #
+dnl # 3.18 API change,
+dnl # The function percpu_counter_init now must be passed a GFP mask.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_INIT], [
+	ZFS_LINUX_TEST_SRC([percpu_counter_init_with_gfp], [
+		#include <linux/percpu_counter.h>
+		#include <linux/gfp.h>
+	],[
+		struct percpu_counter counter;
+		int error;
+
+		error = percpu_counter_init(&counter, 0, GFP_KERNEL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PERCPU_COUNTER_INIT], [
+	AC_MSG_CHECKING([whether percpu_counter_init() wants gfp_t])
+	ZFS_LINUX_TEST_RESULT([percpu_counter_init_with_gfp], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_PERCPU_COUNTER_INIT_WITH_GFP, 1,
+		    [percpu_counter_init() wants gfp_t])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PERCPU], [
+	ZFS_AC_KERNEL_SRC_PERCPU_COUNTER_INIT
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PERCPU], [
+	ZFS_AC_KERNEL_PERCPU_COUNTER_INIT
+])
diff --git a/config/kernel-proc-operations.m4 b/config/kernel-proc-operations.m4
new file mode 100644
index 000000000000..df216222ecc2
--- /dev/null
+++ b/config/kernel-proc-operations.m4
@@ -0,0 +1,41 @@
+dnl #
+dnl # 5.6 API Change
+dnl # The proc_ops structure was introduced to replace the use of
+dnl # the file_operations structure when registering proc handlers.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_OPERATIONS], [
+	ZFS_LINUX_TEST_SRC([proc_ops_struct], [
+		#include <linux/proc_fs.h>
+
+		int test_open(struct inode *ip, struct file *fp) { return 0; }
+		ssize_t test_read(struct file *fp, char __user *ptr,
+		    size_t size, loff_t *offp) { return 0; }
+		ssize_t test_write(struct file *fp, const char __user *ptr,
+		    size_t size, loff_t *offp) { return 0; }
+		loff_t test_lseek(struct file *fp, loff_t off, int flag)
+		    { return 0; }
+		int test_release(struct inode *ip, struct file *fp)
+		    { return 0; }
+
+		const struct proc_ops test_ops __attribute__ ((unused)) = {
+			.proc_open	= test_open,
+			.proc_read	= test_read,
+			.proc_write	= test_write,
+			.proc_lseek	= test_lseek,
+			.proc_release	= test_release,
+		};
+	], [
+		struct proc_dir_entry *entry __attribute__ ((unused)) =
+		    proc_create_data("test", 0444, NULL, &test_ops, NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PROC_OPERATIONS], [
+	AC_MSG_CHECKING([whether proc_ops structure exists])
+	ZFS_LINUX_TEST_RESULT([proc_ops_struct], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_PROC_OPS_STRUCT, 1, [proc_ops structure exists])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-put-link.m4 b/config/kernel-put-link.m4
new file mode 100644
index 000000000000..4234861f3347
--- /dev/null
+++ b/config/kernel-put-link.m4
@@ -0,0 +1,61 @@
+dnl #
+dnl # Supported symlink APIs
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_PUT_LINK], [
+	ZFS_LINUX_TEST_SRC([put_link_cookie], [
+		#include <linux/fs.h>
+		void put_link(struct inode *ip, void *cookie)
+		    { return; }
+		static struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.put_link = put_link,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([put_link_nameidata], [
+		#include <linux/fs.h>
+		void put_link(struct dentry *de, struct
+		    nameidata *nd, void *ptr) { return; }
+		static struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.put_link = put_link,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_PUT_LINK], [
+	dnl #
+	dnl # 4.5 API change
+	dnl # get_link() uses delayed done, there is no put_link() interface.
+	dnl # This check initially uses the inode_operations_get_link result
+	dnl #
+	ZFS_LINUX_TEST_RESULT([inode_operations_get_link], [
+		AC_DEFINE(HAVE_PUT_LINK_DELAYED, 1, [iops->put_link() delayed])
+	],[
+		dnl #
+		dnl # 4.2 API change
+		dnl # This kernel retired the nameidata structure.
+		dnl #
+		AC_MSG_CHECKING([whether iops->put_link() passes cookie])
+		ZFS_LINUX_TEST_RESULT([put_link_cookie], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_PUT_LINK_COOKIE, 1,
+			    [iops->put_link() cookie])
+		],[
+			AC_MSG_RESULT(no)
+
+			dnl #
+			dnl # 2.6.32 API
+			dnl #
+			AC_MSG_CHECKING(
+			    [whether iops->put_link() passes nameidata])
+			ZFS_LINUX_TEST_RESULT([put_link_nameidata], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(HAVE_PUT_LINK_NAMEIDATA, 1,
+				    [iops->put_link() nameidata])
+			],[
+				ZFS_LINUX_TEST_ERROR([put_link])
+			])
+		])
+	])
+])
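For context, a hedged sketch of registering the same proc handlers under both ABIs probed above; zfs_proc_read()/zfs_proc_write() are hypothetical handlers, and default_llseek() is the stock kernel helper.

	#include <linux/fs.h>
	#include <linux/proc_fs.h>

	/* Hypothetical handlers, assumed to be defined elsewhere. */
	static ssize_t zfs_proc_read(struct file *, char __user *,
	    size_t, loff_t *);
	static ssize_t zfs_proc_write(struct file *, const char __user *,
	    size_t, loff_t *);

	#if defined(HAVE_PROC_OPS_STRUCT)
	/* 5.6+: handlers registered through struct proc_ops. */
	static const struct proc_ops zfs_proc_ops = {
		.proc_read	= zfs_proc_read,
		.proc_write	= zfs_proc_write,
		.proc_lseek	= default_llseek,
	};
	#else
	/* pre-5.6: the same handlers go in a file_operations table. */
	static const struct file_operations zfs_proc_ops = {
		.read		= zfs_proc_read,
		.write		= zfs_proc_write,
		.llseek		= default_llseek,
	};
	#endif

Either table is then passed to proc_create_data() on the matching kernel, so callers need no further conditionals.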
diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4
new file mode 100644
index 000000000000..f707391539d8
--- /dev/null
+++ b/config/kernel-rename.m4
@@ -0,0 +1,29 @@
+dnl #
+dnl # 4.9 API change,
+dnl # iops->rename2() merged into iops->rename(), and iops->rename() now wants
+dnl # flags.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS], [
+	ZFS_LINUX_TEST_SRC([inode_operations_rename], [
+		#include <linux/fs.h>
+		int rename_fn(struct inode *sip, struct dentry *sdp,
+		    struct inode *tip, struct dentry *tdp,
+		    unsigned int flags) { return 0; }
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.rename = rename_fn,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RENAME_WANTS_FLAGS], [
+	AC_MSG_CHECKING([whether iops->rename() wants flags])
+	ZFS_LINUX_TEST_RESULT([inode_operations_rename], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_RENAME_WANTS_FLAGS, 1,
+		    [iops->rename() wants flags])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-rw.m4 b/config/kernel-rw.m4
new file mode 100644
index 000000000000..85b47d5c6fc2
--- /dev/null
+++ b/config/kernel-rw.m4
@@ -0,0 +1,69 @@
+dnl #
+dnl # 4.14 API change
+dnl # kernel_write() which was introduced in 3.9 was updated to take
+dnl # the offset as a pointer which is needed by vn_rdwr().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITE], [
+	ZFS_LINUX_TEST_SRC([kernel_write], [
+		#include <linux/fs.h>
+	],[
+		struct file *file = NULL;
+		const void *buf = NULL;
+		size_t count = 0;
+		loff_t *pos = NULL;
+		ssize_t ret;
+
+		ret = kernel_write(file, buf, count, pos);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WRITE], [
+	AC_MSG_CHECKING([whether kernel_write() takes loff_t pointer])
+	ZFS_LINUX_TEST_RESULT([kernel_write], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_WRITE_PPOS, 1,
+		    [kernel_write() takes loff_t pointer])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 4.14 API change
+dnl # kernel_read() which has existed for forever was updated to take
+dnl # the offset as a pointer which is needed by vn_rdwr().
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_READ], [
+	ZFS_LINUX_TEST_SRC([kernel_read], [
+		#include <linux/fs.h>
+	],[
+		struct file *file = NULL;
+		void *buf = NULL;
+		size_t count = 0;
+		loff_t *pos = NULL;
+		ssize_t ret;
+
+		ret = kernel_read(file, buf, count, pos);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_READ], [
+	AC_MSG_CHECKING([whether kernel_read() takes loff_t pointer])
+	ZFS_LINUX_TEST_RESULT([kernel_read], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_READ_PPOS, 1,
+		    [kernel_read() takes loff_t pointer])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RW], [
+	ZFS_AC_KERNEL_SRC_WRITE
+	ZFS_AC_KERNEL_SRC_READ
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RW], [
+	ZFS_AC_KERNEL_WRITE
+	ZFS_AC_KERNEL_READ
+])
diff --git a/config/kernel-rwsem.m4 b/config/kernel-rwsem.m4
new file mode 100644
index 000000000000..824f4a3ffd41
--- /dev/null
+++ b/config/kernel-rwsem.m4
@@ -0,0 +1,88 @@
+dnl #
+dnl # 3.1 API Change
+dnl #
+dnl # The rw_semaphore.wait_lock member was changed from spinlock_t to
+dnl # raw_spinlock_t at commit ddb6c9b58a19edcfac93ac670b066c836ff729f1.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM_SPINLOCK_IS_RAW], [
+	ZFS_LINUX_TEST_SRC([rwsem_spinlock_is_raw], [
+		#include <linux/rwsem.h>
+	],[
+		struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+		raw_spinlock_t dummy_lock __attribute__ ((unused)) =
+		    __RAW_SPIN_LOCK_INITIALIZER(dummy_lock);
+		dummy_semaphore.wait_lock = dummy_lock;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RWSEM_SPINLOCK_IS_RAW], [
+	AC_MSG_CHECKING([whether struct rw_semaphore member wait_lock is raw])
+	ZFS_LINUX_TEST_RESULT([rwsem_spinlock_is_raw], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([rwsem_spinlock_is_raw])
+	])
+])
+
+dnl #
+dnl # 3.16 API Change
+dnl #
+dnl # rwsem-spinlock "->activity" changed to "->count"
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM_ACTIVITY], [
+	ZFS_LINUX_TEST_SRC([rwsem_activity], [
+		#include <linux/rwsem.h>
+	],[
+		struct rw_semaphore dummy_semaphore __attribute__ ((unused));
+		dummy_semaphore.activity = 0;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ACTIVITY], [
+	AC_MSG_CHECKING([whether struct rw_semaphore has member activity])
+	ZFS_LINUX_TEST_RESULT([rwsem_activity], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_RWSEM_ACTIVITY, 1,
+		    [struct rw_semaphore has member activity])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 4.8 API Change
+dnl #
+dnl # rwsem "->count" changed to atomic_long_t type
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM_ATOMIC_LONG_COUNT], [
+	ZFS_LINUX_TEST_SRC([rwsem_atomic_long_count], [
+		#include <linux/rwsem.h>
+	],[
+		DECLARE_RWSEM(dummy_semaphore);
+		(void) atomic_long_read(&dummy_semaphore.count);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT], [
+	AC_MSG_CHECKING(
+	    [whether struct rw_semaphore has atomic_long_t member count])
+	ZFS_LINUX_TEST_RESULT([rwsem_atomic_long_count], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_RWSEM_ATOMIC_LONG_COUNT, 1,
+		    [struct rw_semaphore has atomic_long_t member count])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_RWSEM], [
+	ZFS_AC_KERNEL_SRC_RWSEM_SPINLOCK_IS_RAW
+	ZFS_AC_KERNEL_SRC_RWSEM_ACTIVITY
+	ZFS_AC_KERNEL_SRC_RWSEM_ATOMIC_LONG_COUNT
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_RWSEM], [
+	ZFS_AC_KERNEL_RWSEM_SPINLOCK_IS_RAW
+	ZFS_AC_KERNEL_RWSEM_ACTIVITY
+	ZFS_AC_KERNEL_RWSEM_ATOMIC_LONG_COUNT
+])
diff --git a/config/kernel-sched.m4 b/config/kernel-sched.m4
new file mode 100644
index 000000000000..17e49fbdf472
--- /dev/null
+++ b/config/kernel-sched.m4
@@ -0,0 +1,82 @@
+dnl #
+dnl # 3.9 API change,
+dnl # Moved things from linux/sched.h to linux/sched/rt.h
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SCHED_RT_HEADER], [
+	ZFS_LINUX_TEST_SRC([sched_rt_header], [
+		#include <linux/sched.h>
+		#include <linux/sched/rt.h>
+	],[
+		return 0;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SCHED_RT_HEADER], [
+	AC_MSG_CHECKING([whether header linux/sched/rt.h exists])
+	ZFS_LINUX_TEST_RESULT([sched_rt_header], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([sched_rt_header])
+	])
+])
+
+dnl #
+dnl # 4.11 API change,
+dnl # Moved things from linux/sched.h to linux/sched/signal.h
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SCHED_SIGNAL_HEADER], [
+	ZFS_LINUX_TEST_SRC([sched_signal_header], [
+		#include <linux/sched.h>
+		#include <linux/sched/signal.h>
+	],[
+		return 0;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SCHED_SIGNAL_HEADER], [
+	AC_MSG_CHECKING([whether header linux/sched/signal.h exists])
+	ZFS_LINUX_TEST_RESULT([sched_signal_header], [
+		AC_DEFINE(HAVE_SCHED_SIGNAL_HEADER, 1,
+		    [linux/sched/signal.h exists])
+		AC_MSG_RESULT(yes)
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 3.19 API change
+dnl # The io_schedule_timeout() function is present in all 2.6.32 kernels
+dnl # but it was not exported until Linux 3.19.  The RHEL 7.x kernels, which
+dnl # are based on a 3.10 kernel, do export this symbol.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_IO_SCHEDULE_TIMEOUT], [
+	ZFS_LINUX_TEST_SRC([io_schedule_timeout], [
+		#include <linux/sched.h>
+	], [
+		(void) io_schedule_timeout(1);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_IO_SCHEDULE_TIMEOUT], [
+	AC_MSG_CHECKING([whether io_schedule_timeout() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([io_schedule_timeout],
+	    [io_schedule_timeout], [], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_IO_SCHEDULE_TIMEOUT, 1, [yes])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SCHED], [
+	ZFS_AC_KERNEL_SRC_SCHED_RT_HEADER
+	ZFS_AC_KERNEL_SRC_SCHED_SIGNAL_HEADER
+	ZFS_AC_KERNEL_SRC_IO_SCHEDULE_TIMEOUT
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SCHED], [
+	ZFS_AC_KERNEL_SCHED_RT_HEADER
+	ZFS_AC_KERNEL_SCHED_SIGNAL_HEADER
+	ZFS_AC_KERNEL_IO_SCHEDULE_TIMEOUT
+])
diff --git a/config/kernel-security-inode-init.m4 b/config/kernel-security-inode-init.m4
new file mode 100644
index 000000000000..4e4bfd29b2ff
--- /dev/null
+++ b/config/kernel-security-inode-init.m4
@@ -0,0 +1,36 @@
+dnl #
+dnl # 3.2 API change
+dnl # The security_inode_init_security() API has been changed to include
+dnl # a filesystem specific callback to write security extended attributes.
+dnl # This was done to support the initialization of multiple LSM xattrs
+dnl # and the EVM xattr.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SECURITY_INODE_INIT_SECURITY_CALLBACK], [
+	ZFS_LINUX_TEST_SRC([security_inode_init_security], [
+		#include <linux/security.h>
+	],[
+		struct inode *ip __attribute__ ((unused)) = NULL;
+		struct inode *dip __attribute__ ((unused)) = NULL;
+		const struct qstr *str __attribute__ ((unused)) = NULL;
+		initxattrs func __attribute__ ((unused)) = NULL;
+
+		security_inode_init_security(ip, dip, str, func, NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SECURITY_INODE_INIT_SECURITY_CALLBACK], [
+	AC_MSG_CHECKING([whether security_inode_init_security wants callback])
+	ZFS_LINUX_TEST_RESULT([security_inode_init_security], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([security_inode_init_security callback])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SECURITY_INODE], [
+	ZFS_AC_KERNEL_SRC_SECURITY_INODE_INIT_SECURITY_CALLBACK
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SECURITY_INODE], [
+	ZFS_AC_KERNEL_SECURITY_INODE_INIT_SECURITY_CALLBACK
+])
diff --git a/config/kernel-set-nlink.m4 b/config/kernel-set-nlink.m4
new file mode 100644
index 000000000000..fa4f928b27d5
--- /dev/null
+++ b/config/kernel-set-nlink.m4
@@ -0,0 +1,22 @@
+dnl #
+dnl # Linux 3.2 API change
+dnl # set_nlink()
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SET_NLINK], [
+	ZFS_LINUX_TEST_SRC([set_nlink], [
+		#include <linux/fs.h>
+	],[
+		struct inode node;
+		unsigned int link = 0;
+		(void) set_nlink(&node, link);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SET_NLINK], [
+	AC_MSG_CHECKING([whether set_nlink() is available])
+	ZFS_LINUX_TEST_RESULT([set_nlink], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([set_nlink()])
+	])
+])
diff --git a/config/kernel-setattr-prepare.m4 b/config/kernel-setattr-prepare.m4
new file mode 100644
index 000000000000..45408c45c69b
--- /dev/null
+++ b/config/kernel-setattr-prepare.m4
@@ -0,0 +1,27 @@
+dnl #
+dnl # 4.9 API change
+dnl # The inode_change_ok() function has been renamed setattr_prepare()
+dnl # and updated to take a dentry rather than an inode.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SETATTR_PREPARE], [
+	ZFS_LINUX_TEST_SRC([setattr_prepare], [
+		#include <linux/fs.h>
+	], [
+		struct dentry *dentry = NULL;
+		struct iattr *attr = NULL;
+		int error __attribute__ ((unused)) =
+		    setattr_prepare(dentry, attr);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SETATTR_PREPARE], [
+	AC_MSG_CHECKING([whether setattr_prepare() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([setattr_prepare],
+	    [setattr_prepare], [fs/attr.c], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SETATTR_PREPARE, 1,
+		    [setattr_prepare() is available])
+	], [
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-sget-args.m4 b/config/kernel-sget-args.m4
new file mode 100644
index 000000000000..afa62c797d75
--- /dev/null
+++ b/config/kernel-sget-args.m4
@@ -0,0 +1,25 @@
+dnl #
+dnl # 3.6 API change,
+dnl # 'sget' now takes the mount flags as an argument.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SGET], [
+	ZFS_LINUX_TEST_SRC([sget_5args], [
+		#include <linux/fs.h>
+	],[
+		struct file_system_type *type = NULL;
+		int (*test)(struct super_block *,void *) = NULL;
+		int (*set)(struct super_block *,void *) = NULL;
+		int flags = 0;
+		void *data = NULL;
+		(void) sget(type, test, set, flags, data);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SGET], [
+	AC_MSG_CHECKING([whether sget() wants 5 args])
+	ZFS_LINUX_TEST_RESULT([sget_5args], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([sget()])
+	])
+])
diff --git a/config/kernel-show-options.m4 b/config/kernel-show-options.m4
new file mode 100644
index 000000000000..93bd5fbfbb24
--- /dev/null
+++ b/config/kernel-show-options.m4
@@ -0,0 +1,25 @@
+dnl #
+dnl # Linux 3.3 API
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SHOW_OPTIONS], [
+	ZFS_LINUX_TEST_SRC([super_operations_show_options], [
+		#include <linux/fs.h>
+
+		int show_options(struct seq_file *x, struct dentry *y) {
+			return 0;
+		};
+
+		static struct super_operations sops __attribute__ ((unused)) = {
+			.show_options = show_options,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SHOW_OPTIONS], [
+	AC_MSG_CHECKING([whether sops->show_options() wants dentry])
+	ZFS_LINUX_TEST_RESULT([super_operations_show_options], [
+		AC_MSG_RESULT([yes])
+	],[
+		ZFS_LINUX_TEST_ERROR([sops->show_options()])
+	])
+])
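For context, a hedged sketch of validating an attribute change under either spelling probed by the setattr check above; the zpl_validate_setattr() name is hypothetical, and inode_change_ok() is the pre-4.9 kernel interface this check supersedes.

	#include <linux/fs.h>

	static int
	zpl_validate_setattr(struct dentry *dentry, struct iattr *ia)
	{
	#if defined(HAVE_SETATTR_PREPARE)
		return (setattr_prepare(dentry, ia));		/* 4.9+ */
	#else
		return (inode_change_ok(d_inode(dentry), ia));	/* pre-4.9 */
	#endif
	}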
diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4
new file mode 100644
index 000000000000..a40c86d5c57f
--- /dev/null
+++ b/config/kernel-shrink.m4
@@ -0,0 +1,151 @@
+dnl #
+dnl # 3.1 API change
+dnl # The super_block structure now stores a per-filesystem shrinker.
+dnl # This interface is preferable because it can be used to specifically
+dnl # target only the zfs filesystem for pruning.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [
+	ZFS_LINUX_TEST_SRC([super_block_s_shrink], [
+		#include <linux/fs.h>
+
+		int shrink(struct shrinker *s, struct shrink_control *sc)
+		    { return 0; }
+
+		static const struct super_block
+		    sb __attribute__ ((unused)) = {
+			.s_shrink.seeks = DEFAULT_SEEKS,
+			.s_shrink.batch = 0,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [
+	AC_MSG_CHECKING([whether super_block has s_shrink])
+	ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([sb->s_shrink()])
+	])
+])
+
+dnl #
+dnl # 3.12 API change
+dnl # The nid member was added to struct shrink_control to support
+dnl # NUMA-aware shrinkers.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID], [
+	ZFS_LINUX_TEST_SRC([shrink_control_nid], [
+		#include <linux/fs.h>
+	],[
+		struct shrink_control sc __attribute__ ((unused));
+		unsigned long scnidsize __attribute__ ((unused)) =
+		    sizeof(sc.nid);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID], [
+	AC_MSG_CHECKING([whether shrink_control has nid])
+	ZFS_LINUX_TEST_RESULT([shrink_control_nid], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(SHRINK_CONTROL_HAS_NID, 1,
+		    [struct shrink_control has nid])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [
+	ZFS_LINUX_TEST_SRC([shrinker_cb_shrink_control], [
+		#include <linux/mm.h>
+		int shrinker_cb(struct shrinker *shrink,
+		    struct shrink_control *sc) { return 0; }
+	],[
+		struct shrinker cache_shrinker = {
+			.shrink = shrinker_cb,
+			.seeks = DEFAULT_SEEKS,
+		};
+		register_shrinker(&cache_shrinker);
+	])
+
+	ZFS_LINUX_TEST_SRC([shrinker_cb_shrink_control_split], [
+		#include <linux/mm.h>
+		unsigned long shrinker_cb(struct shrinker *shrink,
+		    struct shrink_control *sc) { return 0; }
+	],[
+		struct shrinker cache_shrinker = {
+			.count_objects = shrinker_cb,
+			.scan_objects = shrinker_cb,
+			.seeks = DEFAULT_SEEKS,
+		};
+		register_shrinker(&cache_shrinker);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[
+	dnl #
+	dnl # 3.0 - 3.11 API change
+	dnl # ->shrink(struct shrinker *, struct shrink_control *sc)
+	dnl #
+	AC_MSG_CHECKING([whether new 2-argument shrinker exists])
+	ZFS_LINUX_TEST_RESULT([shrinker_cb_shrink_control], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SINGLE_SHRINKER_CALLBACK, 1,
+		    [new shrinker callback wants 2 args])
+	],[
+		AC_MSG_RESULT(no)
+
+		dnl #
+		dnl # 3.12 API change,
+		dnl # ->shrink() is logically split into
+		dnl # ->count_objects() and ->scan_objects()
+		dnl #
+		AC_MSG_CHECKING([whether ->count_objects callback exists])
+		ZFS_LINUX_TEST_RESULT([shrinker_cb_shrink_control_split], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1,
+			    [->count_objects exists])
+		],[
+			ZFS_LINUX_TEST_ERROR([shrinker])
+		])
+	])
+])
+
+dnl #
+dnl # 2.6.39 API change,
+dnl # Shrinkers adjusted to use the common shrink_control structure.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT], [
+	ZFS_LINUX_TEST_SRC([shrink_control_struct], [
+		#include <linux/mm.h>
+	],[
+		struct shrink_control sc __attribute__ ((unused));
+
+		sc.nr_to_scan = 0;
+		sc.gfp_mask = GFP_KERNEL;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [
+	AC_MSG_CHECKING([whether struct shrink_control exists])
+	ZFS_LINUX_TEST_RESULT([shrink_control_struct], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SHRINK_CONTROL_STRUCT, 1,
+		    [struct shrink_control exists])
+	],[
+		ZFS_LINUX_TEST_ERROR([shrink_control])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [
+	ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK
+	ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID
+	ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK
+	ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [
+	ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK
+	ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID
+	ZFS_AC_KERNEL_SHRINKER_CALLBACK
+	ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT
+])
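For context, a hedged sketch of one cache shrinker expressed against both probed ABIs; the zfs_count()/zfs_prune() helpers and the shrinker itself are hypothetical, not the actual OpenZFS code.

	#include <linux/mm.h>

	/* Hypothetical cache accounting, assumed to be defined elsewhere. */
	static unsigned long zfs_count(void);
	static unsigned long zfs_prune(unsigned long nr_to_scan);

	#if defined(HAVE_SPLIT_SHRINKER_CALLBACK)
	/* 3.12+: bookkeeping and reclaim are separate callbacks. */
	static unsigned long
	zfs_shrinker_count(struct shrinker *s, struct shrink_control *sc)
	{
		return (zfs_count());
	}

	static unsigned long
	zfs_shrinker_scan(struct shrinker *s, struct shrink_control *sc)
	{
		return (zfs_prune(sc->nr_to_scan));
	}

	static struct shrinker zfs_shrinker = {
		.count_objects	= zfs_shrinker_count,
		.scan_objects	= zfs_shrinker_scan,
		.seeks		= DEFAULT_SEEKS,
	};
	#else
	/* 3.0 - 3.11: a single ->shrink() both counts and reclaims. */
	static int
	zfs_shrinker_shrink(struct shrinker *s, struct shrink_control *sc)
	{
		if (sc->nr_to_scan != 0)
			(void) zfs_prune(sc->nr_to_scan);
		return ((int)zfs_count());
	}

	static struct shrinker zfs_shrinker = {
		.shrink	= zfs_shrinker_shrink,
		.seeks	= DEFAULT_SEEKS,
	};
	#endif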
diff --git a/config/kernel-super-userns.m4 b/config/kernel-super-userns.m4
new file mode 100644
index 000000000000..1ad35f2d19ba
--- /dev/null
+++ b/config/kernel-super-userns.m4
@@ -0,0 +1,25 @@
+dnl #
+dnl # 4.8 API change
+dnl # struct user_namespace was added to struct super_block as
+dnl # super->s_user_ns member
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_USER_NS], [
+	ZFS_LINUX_TEST_SRC([super_user_ns], [
+		#include <linux/fs.h>
+		#include <linux/user_namespace.h>
+	], [
+		struct super_block super;
+		super.s_user_ns = (struct user_namespace *)NULL;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SUPER_USER_NS], [
+	AC_MSG_CHECKING([whether super_block->s_user_ns exists])
+	ZFS_LINUX_TEST_RESULT([super_user_ns], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_SUPER_USER_NS, 1,
+		    [super_block->s_user_ns exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4
new file mode 100644
index 000000000000..403cff3f4189
--- /dev/null
+++ b/config/kernel-timer.m4
@@ -0,0 +1,75 @@
+dnl # 4.14-rc3 API change
+dnl # https://lwn.net/Articles/735887/
+dnl #
+dnl # Check if timer_list.func gets passed a timer_list or an unsigned long
+dnl # (older kernels).  Also sanity check that the from_timer() and
+dnl # timer_setup() macros are available, since they will be used in the
+dnl # same newer kernels that support the new timer_list.func signature.
+dnl #
+dnl # Also check for the existence of flags in struct timer_list; they were
+dnl # added in 4.1-rc8 via 0eeda71bc30d.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_TIMER_SETUP], [
+	ZFS_LINUX_TEST_SRC([timer_setup], [
+		#include <linux/timer.h>
+
+		struct my_task_timer {
+			struct timer_list timer;
+			int data;
+		};
+
+		void task_expire(struct timer_list *tl)
+		{
+			struct my_task_timer *task_timer =
+			    from_timer(task_timer, tl, timer);
+			task_timer->data = 42;
+		}
+	],[
+		struct my_task_timer task_timer;
+		timer_setup(&task_timer.timer, task_expire, 0);
+	])
+
+	ZFS_LINUX_TEST_SRC([timer_list_function], [
+		#include <linux/timer.h>
+		void task_expire(struct timer_list *tl) {}
+	],[
+		struct timer_list tl;
+		tl.function = task_expire;
+	])
+
+	ZFS_LINUX_TEST_SRC([timer_list_flags], [
+		#include <linux/timer.h>
+	],[
+		struct timer_list tl;
+		tl.flags = 2;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_TIMER_SETUP], [
+	AC_MSG_CHECKING([whether timer_setup() is available])
+	ZFS_LINUX_TEST_RESULT([timer_setup], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_TIMER_SETUP, 1,
+		    [timer_setup() is available])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
+	AC_MSG_CHECKING([whether timer function expects timer_list])
+	ZFS_LINUX_TEST_RESULT([timer_list_function], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST, 1,
+		    [timer_list.function gets a timer_list])
+	],[
+		AC_MSG_RESULT(no)
+	])
+
+	AC_MSG_CHECKING([whether struct timer_list has flags])
+	ZFS_LINUX_TEST_RESULT([timer_list_flags], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_KERNEL_TIMER_LIST_FLAGS, 1,
+		    [struct timer_list has a flags member])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-tmpfile.m4 b/config/kernel-tmpfile.m4
new file mode 100644
index 000000000000..f510bfe6ba03
--- /dev/null
+++ b/config/kernel-tmpfile.m4
@@ -0,0 +1,25 @@
+dnl #
+dnl # 3.11 API change
+dnl # Add support for i_op->tmpfile
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [
+	ZFS_LINUX_TEST_SRC([inode_operations_tmpfile], [
+		#include <linux/fs.h>
+		int tmpfile(struct inode *inode, struct dentry *dentry,
+		    umode_t mode) { return 0; }
+		static struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.tmpfile = tmpfile,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_TMPFILE], [
+	AC_MSG_CHECKING([whether i_op->tmpfile() exists])
+	ZFS_LINUX_TEST_RESULT([inode_operations_tmpfile], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_TMPFILE, 1, [i_op->tmpfile() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-totalhigh_pages.m4 b/config/kernel-totalhigh_pages.m4
new file mode 100644
index 000000000000..4ecb03a50a51
--- /dev/null
+++ b/config/kernel-totalhigh_pages.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 5.0 API change
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES], [
+	ZFS_LINUX_TEST_SRC([totalhigh_pages], [
+		#include <linux/highmem.h>
+	],[
+		unsigned long pages __attribute__ ((unused));
+		pages = totalhigh_pages();
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_TOTALHIGH_PAGES], [
+	AC_MSG_CHECKING([whether totalhigh_pages() exists])
+	ZFS_LINUX_TEST_RESULT([totalhigh_pages], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_TOTALHIGH_PAGES, 1, [totalhigh_pages() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
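For context, a hedged sketch of arming a one-shot timer under both probed timer ABIs; the my_obj/my_expire/my_arm names are hypothetical, while timer_setup(), from_timer(), and setup_timer() are the stock kernel interfaces on their respective sides of the 4.14-rc3 change.

	#include <linux/timer.h>
	#include <linux/jiffies.h>

	struct my_obj {
		struct timer_list timer;
		int ticks;
	};

	#if defined(HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST)
	/* 4.14-rc3+: callback receives the timer_list; from_timer() recovers it. */
	static void
	my_expire(struct timer_list *tl)
	{
		struct my_obj *obj = from_timer(obj, tl, timer);

		obj->ticks++;
	}

	static void
	my_arm(struct my_obj *obj)
	{
		timer_setup(&obj->timer, my_expire, 0);
		mod_timer(&obj->timer, jiffies + HZ);
	}
	#else
	/* older: callback receives an unsigned long of caller-supplied data. */
	static void
	my_expire(unsigned long data)
	{
		((struct my_obj *)data)->ticks++;
	}

	static void
	my_arm(struct my_obj *obj)
	{
		setup_timer(&obj->timer, my_expire, (unsigned long)obj);
		mod_timer(&obj->timer, jiffies + HZ);
	}
	#endif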
diff --git a/config/kernel-totalram-pages-func.m4 b/config/kernel-totalram-pages-func.m4
new file mode 100644
index 000000000000..d0e812a8d2d2
--- /dev/null
+++ b/config/kernel-totalram-pages-func.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # Linux 5.0: totalram_pages is no longer a global variable, and must be
+dnl # read via the totalram_pages() helper function.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC], [
+	ZFS_LINUX_TEST_SRC([totalram_pages], [
+		#include <linux/mm.h>
+	],[
+		unsigned long pages __attribute__ ((unused));
+		pages = totalram_pages();
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC], [
+	AC_MSG_CHECKING([whether totalram_pages() exists])
+	ZFS_LINUX_TEST_RESULT([totalram_pages], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_TOTALRAM_PAGES_FUNC, 1,
+		    [kernel has totalram_pages()])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-truncate-setsize.m4 b/config/kernel-truncate-setsize.m4
new file mode 100644
index 000000000000..76c82ef3021b
--- /dev/null
+++ b/config/kernel-truncate-setsize.m4
@@ -0,0 +1,21 @@
+dnl #
+dnl # 2.6.35 API change
+dnl # Added truncate_setsize() helper function.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE], [
+	ZFS_LINUX_TEST_SRC([truncate_setsize], [
+		#include <linux/mm.h>
+	], [
+		truncate_setsize(NULL, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_TRUNCATE_SETSIZE], [
+	AC_MSG_CHECKING([whether truncate_setsize() is available])
+	ZFS_LINUX_TEST_RESULT_SYMBOL([truncate_setsize],
+	    [truncate_setsize], [mm/truncate.c], [
+		AC_MSG_RESULT(yes)
+	], [
+		ZFS_LINUX_TEST_ERROR([truncate_setsize])
+	])
+])
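For context, a hedged sketch of a single accessor that works on both sides of the Linux 5.0 totalram_pages change; the spl_totalram_pages() name is hypothetical.

	#include <linux/mm.h>

	static inline unsigned long
	spl_totalram_pages(void)
	{
	#if defined(HAVE_TOTALRAM_PAGES_FUNC)
		return (totalram_pages());	/* 5.0+: helper function */
	#else
		return (totalram_pages);	/* older: global variable */
	#endif
	}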
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_KUID_HAS_MAPPING], [
+	ZFS_LINUX_TEST_SRC([kuid_has_mapping], [
+		#include <linux/uidgid.h>
+	],[
+		kuid_has_mapping((struct user_namespace *)NULL, KUIDT_INIT(0));
+		kgid_has_mapping((struct user_namespace *)NULL, KGIDT_INIT(0));
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_KUID_HAS_MAPPING], [
+	AC_MSG_CHECKING([whether kuid_has_mapping/kgid_has_mapping exist])
+	ZFS_LINUX_TEST_RESULT([kuid_has_mapping], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([kuid_has_mapping()])
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES], [
+	ZFS_AC_KERNEL_SRC_NS_CAPABLE
+	ZFS_AC_KERNEL_SRC_HAS_CAPABILITY
+	ZFS_AC_KERNEL_SRC_CRED_USER_NS
+	ZFS_AC_KERNEL_SRC_KUID_HAS_MAPPING
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_USERNS_CAPABILITIES], [
+	ZFS_AC_KERNEL_NS_CAPABLE
+	ZFS_AC_KERNEL_HAS_CAPABILITY
+	ZFS_AC_KERNEL_CRED_USER_NS
+	ZFS_AC_KERNEL_KUID_HAS_MAPPING
+])
diff --git a/config/kernel-usleep_range.m4 b/config/kernel-usleep_range.m4
new file mode 100644
index 000000000000..06eb381a3c38
--- /dev/null
+++ b/config/kernel-usleep_range.m4
@@ -0,0 +1,23 @@
+dnl #
+dnl # 2.6.36 API compatibility - added usleep_range timer.
+dnl #
+dnl # usleep_range is a finer precision implementation of msleep
+dnl # designed to be a drop-in replacement for udelay where a precise
+dnl # sleep / busy-wait is unnecessary.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_USLEEP_RANGE], [
+	ZFS_LINUX_TEST_SRC([usleep_range], [
+		#include <linux/delay.h>
+	],[
+		usleep_range(0, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_USLEEP_RANGE], [
+	AC_MSG_CHECKING([whether usleep_range() is available])
+	ZFS_LINUX_TEST_RESULT([usleep_range], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([usleep_range()])
+	])
+])
diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4
new file mode 100644
index 000000000000..82583d52fcbc
--- /dev/null
+++ b/config/kernel-vfs-direct_IO.m4
@@ -0,0 +1,109 @@
+dnl #
+dnl # Check for direct IO interfaces.
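+dnl #
+dnl # The checks below establish which of the historical ->direct_IO()
+dnl # signatures the target kernel uses; a consumer then declares the
+dnl # matching prototype, e.g. (sketch; zpl_direct_IO is an illustrative
+dnl # name):
+dnl #
+dnl #	#if defined(HAVE_VFS_DIRECT_IO_ITER)
+dnl #	ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter);
+dnl #	#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET)
+dnl #	ssize_t zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter,
+dnl #	    loff_t offset);
+dnl #	#endif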
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [
+	ZFS_LINUX_TEST_SRC([direct_io_iter], [
+		#include <linux/fs.h>
+
+		ssize_t test_direct_IO(struct kiocb *kiocb,
+		    struct iov_iter *iter) { return 0; }
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.direct_IO = test_direct_IO,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([direct_io_iter_offset], [
+		#include <linux/fs.h>
+
+		ssize_t test_direct_IO(struct kiocb *kiocb,
+		    struct iov_iter *iter, loff_t offset) { return 0; }
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.direct_IO = test_direct_IO,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([direct_io_iter_rw_offset], [
+		#include <linux/fs.h>
+
+		ssize_t test_direct_IO(int rw, struct kiocb *kiocb,
+		    struct iov_iter *iter, loff_t offset) { return 0; }
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.direct_IO = test_direct_IO,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([direct_io_iovec], [
+		#include <linux/fs.h>
+
+		ssize_t test_direct_IO(int rw, struct kiocb *kiocb,
+		    const struct iovec *iov, loff_t offset,
+		    unsigned long nr_segs) { return 0; }
+
+		static const struct address_space_operations
+		    aops __attribute__ ((unused)) = {
+			.direct_IO = test_direct_IO,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_DIRECT_IO], [
+	dnl #
+	dnl # Linux 4.6.x API change
+	dnl #
+	AC_MSG_CHECKING([whether aops->direct_IO() uses iov_iter])
+	ZFS_LINUX_TEST_RESULT([direct_io_iter], [
+		AC_MSG_RESULT([yes])
+		AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER, 1,
+		    [aops->direct_IO() uses iov_iter without rw])
+	],[
+		AC_MSG_RESULT([no])
+
+		dnl #
+		dnl # Linux 4.1.x API change
+		dnl #
+		AC_MSG_CHECKING(
+		    [whether aops->direct_IO() uses offset])
+		ZFS_LINUX_TEST_RESULT([direct_io_iter_offset], [
+			AC_MSG_RESULT([yes])
+			AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_OFFSET, 1,
+			    [aops->direct_IO() uses iov_iter with offset])
+		],[
+			AC_MSG_RESULT([no])
+
+			dnl #
+			dnl # Linux 3.16.x API change
+			dnl #
+			AC_MSG_CHECKING(
+			    [whether aops->direct_IO() uses rw and offset])
+			ZFS_LINUX_TEST_RESULT([direct_io_iter_rw_offset], [
+				AC_MSG_RESULT([yes])
+				AC_DEFINE(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET, 1,
+				    [aops->direct_IO() uses iov_iter with ]
+				    [rw and offset])
+			],[
+				AC_MSG_RESULT([no])
+
+				dnl #
+				dnl # Ancient Linux API (predates git)
+				dnl #
+				AC_MSG_CHECKING(
+				    [whether aops->direct_IO() uses iovec])
+				ZFS_LINUX_TEST_RESULT([direct_io_iovec], [
+					AC_MSG_RESULT([yes])
+					AC_DEFINE(HAVE_VFS_DIRECT_IO_IOVEC, 1,
+					    [aops->direct_IO() uses iovec])
+				],[
+					ZFS_LINUX_TEST_ERROR([direct IO])
+					AC_MSG_RESULT([no])
+				])
+			])
+		])
+	])
+])
diff --git a/config/kernel-vfs-fsync.m4 b/config/kernel-vfs-fsync.m4
new file mode 100644
index 000000000000..159efca4532f
--- /dev/null
+++ b/config/kernel-vfs-fsync.m4
@@ -0,0 +1,20 @@
+dnl #
+dnl # 2.6.35 API change,
+dnl # Unused 'struct dentry *' removed from vfs_fsync() prototype.
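+dnl #
+dnl # With the two-argument form a caller passes only the file and a
+dnl # datasync flag, e.g. (sketch):
+dnl #
+dnl #	error = -vfs_fsync(filp, 0);	/* 0 == full sync, 1 == datasync */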
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS], [
+	ZFS_LINUX_TEST_SRC([vfs_fsync_2args], [
+		#include <linux/fs.h>
+	],[
+		vfs_fsync(NULL, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_FSYNC_2ARGS], [
+	AC_MSG_CHECKING([whether vfs_fsync() wants 2 args])
+	ZFS_LINUX_TEST_RESULT([vfs_fsync_2args], [
+		AC_MSG_RESULT(yes)
+	],[
+		ZFS_LINUX_TEST_ERROR([vfs_fsync()])
+	])
+])
diff --git a/config/kernel-vfs-getattr.m4 b/config/kernel-vfs-getattr.m4
new file mode 100644
index 000000000000..eb07853cc4b9
--- /dev/null
+++ b/config/kernel-vfs-getattr.m4
@@ -0,0 +1,86 @@
+dnl #
+dnl # 4.11 API, a528d35e@torvalds/linux
+dnl # vfs_getattr(const struct path *p, struct kstat *s, u32 m, unsigned int f)
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR_4ARGS], [
+	ZFS_LINUX_TEST_SRC([vfs_getattr_4args], [
+		#include <linux/fs.h>
+	],[
+		vfs_getattr((const struct path *)NULL,
+		    (struct kstat *)NULL,
+		    (u32)0,
+		    (unsigned int)0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR_4ARGS], [
+	AC_MSG_CHECKING([whether vfs_getattr() wants 4 args])
+	ZFS_LINUX_TEST_RESULT([vfs_getattr_4args], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_4ARGS_VFS_GETATTR, 1,
+		    [vfs_getattr wants 4 args])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 3.9 API
+dnl # vfs_getattr(struct path *p, struct kstat *s)
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR_2ARGS], [
+	ZFS_LINUX_TEST_SRC([vfs_getattr_2args], [
+		#include <linux/fs.h>
+	],[
+		vfs_getattr((struct path *)NULL,
+		    (struct kstat *)NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR_2ARGS], [
+	AC_MSG_CHECKING([whether vfs_getattr() wants 2 args])
+	ZFS_LINUX_TEST_RESULT([vfs_getattr_2args], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_2ARGS_VFS_GETATTR, 1,
+		    [vfs_getattr wants 2 args])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # <3.9 API
+dnl # vfs_getattr(struct vfsmount *v, struct dentry *d, struct kstat *k)
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR_3ARGS], [
+	ZFS_LINUX_TEST_SRC([vfs_getattr_3args], [
+		#include <linux/fs.h>
+	],[
+		vfs_getattr((struct vfsmount *)NULL,
+		    (struct dentry *)NULL,
+		    (struct kstat *)NULL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR_3ARGS], [
+	AC_MSG_CHECKING([whether vfs_getattr() wants 3 args])
+	ZFS_LINUX_TEST_RESULT([vfs_getattr_3args], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_3ARGS_VFS_GETATTR, 1,
+		    [vfs_getattr wants 3 args])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GETATTR], [
+	ZFS_AC_KERNEL_SRC_VFS_GETATTR_4ARGS
+	ZFS_AC_KERNEL_SRC_VFS_GETATTR_2ARGS
+	ZFS_AC_KERNEL_SRC_VFS_GETATTR_3ARGS
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GETATTR], [
+	ZFS_AC_KERNEL_VFS_GETATTR_4ARGS
+	ZFS_AC_KERNEL_VFS_GETATTR_2ARGS
+	ZFS_AC_KERNEL_VFS_GETATTR_3ARGS
+])
diff --git a/config/kernel-vfs-iterate.m4 b/config/kernel-vfs-iterate.m4
new file mode 100644
index 000000000000..172118eac87b
--- /dev/null
+++ b/config/kernel-vfs-iterate.m4
@@ -0,0 +1,83 @@
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_ITERATE], [
+	ZFS_LINUX_TEST_SRC([file_operations_iterate_shared], [
+		#include <linux/fs.h>
+		int iterate(struct file *filp, struct dir_context *context)
+		    { return 0; }
+
+		static const struct file_operations fops
+		    __attribute__ ((unused)) = {
+			.iterate_shared = iterate,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([file_operations_iterate], [
+		#include <linux/fs.h>
+		int iterate(struct file *filp,
+		    struct dir_context *context) { return 0; }
+
+		static const struct file_operations fops
+		    __attribute__ ((unused)) = {
+			.iterate = iterate,
+		};
+
+		#if defined(FMODE_KABI_ITERATE)
+		#error "RHEL 7.5, FMODE_KABI_ITERATE interface"
+		#endif
+	],[])
+
+	ZFS_LINUX_TEST_SRC([file_operations_readdir], [
+		#include <linux/fs.h>
+		int readdir(struct file *filp, void *entry,
+		    filldir_t func) { return 0; }
+
+		static const struct file_operations fops
+		    __attribute__ ((unused)) = {
+			.readdir = readdir,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_ITERATE], [
+	dnl #
+	dnl # 4.7 API change
+	dnl #
+	AC_MSG_CHECKING([whether fops->iterate_shared() is available])
+	ZFS_LINUX_TEST_RESULT([file_operations_iterate_shared], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_VFS_ITERATE_SHARED, 1,
+		    [fops->iterate_shared() is available])
+	],[
+		AC_MSG_RESULT(no)
+
+		dnl #
+		dnl # 3.11 API change
+		dnl #
+		dnl # RHEL 7.5 compatibility; the fops.iterate() method was
+		dnl # added to the file_operations structure but in order to
+		dnl # maintain KABI compatibility all callers must set
+		dnl # FMODE_KABI_ITERATE which is checked in iterate_dir().
+		dnl # When detected, ignore this interface and fall back
+		dnl # to using fops.readdir() to retain KABI compatibility.
+		dnl #
+		AC_MSG_CHECKING([whether fops->iterate() is available])
+		ZFS_LINUX_TEST_RESULT([file_operations_iterate], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_VFS_ITERATE, 1,
+			    [fops->iterate() is available])
+		],[
+			AC_MSG_RESULT(no)
+
+			dnl #
+			dnl # readdir interface introduced
+			dnl #
+			AC_MSG_CHECKING([whether fops->readdir() is available])
+			ZFS_LINUX_TEST_RESULT([file_operations_readdir], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(HAVE_VFS_READDIR, 1,
+				    [fops->readdir() is available])
+			],[
+				ZFS_LINUX_TEST_ERROR([vfs_iterate])
+			])
+		])
+	])
+])
diff --git a/config/kernel-vfs-rw-iterate.m4 b/config/kernel-vfs-rw-iterate.m4
new file mode 100644
index 000000000000..000353ec15b0
--- /dev/null
+++ b/config/kernel-vfs-rw-iterate.m4
@@ -0,0 +1,80 @@
+dnl #
+dnl # Linux 3.16 API
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE], [
+	ZFS_LINUX_TEST_SRC([file_operations_rw], [
+		#include <linux/fs.h>
+
+		ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to)
+		    { return 0; }
+		ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from)
+		    { return 0; }
+
+		static const struct file_operations
+		    fops __attribute__ ((unused)) = {
+			.read_iter = test_read,
+			.write_iter = test_write,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([new_sync_rw], [
+		#include <linux/fs.h>
+	],[
+		ssize_t ret __attribute__ ((unused));
+		struct file *filp = NULL;
+		char __user *rbuf = NULL;
+		const char __user *wbuf = NULL;
+		size_t len = 0;
+		loff_t ppos;
+
+		ret = new_sync_read(filp, rbuf, len, &ppos);
+		ret = new_sync_write(filp, wbuf, len, &ppos);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_RW_ITERATE], [
+	AC_MSG_CHECKING([whether fops->read/write_iter() are available])
+	ZFS_LINUX_TEST_RESULT([file_operations_rw], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_VFS_RW_ITERATE, 1,
+		    [fops->read/write_iter() are available])
+
+		dnl #
+		dnl # Linux 4.1 API
+		dnl #
+		AC_MSG_CHECKING([whether new_sync_read/write() are available])
+		ZFS_LINUX_TEST_RESULT([new_sync_rw], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_NEW_SYNC_READ, 1,
+			    [new_sync_read()/new_sync_write() are available])
+		],[
+			AC_MSG_RESULT(no)
+		])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # Linux 4.1.x API
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS], [
+	ZFS_LINUX_TEST_SRC([generic_write_checks], [
+		#include <linux/fs.h>
+	],[
+		struct kiocb *iocb = NULL;
+		struct iov_iter *iov = NULL;
+		generic_write_checks(iocb, iov);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS], [
+	AC_MSG_CHECKING([whether generic_write_checks() takes kiocb])
+	ZFS_LINUX_TEST_RESULT([generic_write_checks], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_GENERIC_WRITE_CHECKS_KIOCB, 1,
+		    [generic_write_checks() takes kiocb])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
diff --git a/config/kernel-wait.m4 b/config/kernel-wait.m4
new file mode 100644
index 000000000000..0414242bf6d4
--- /dev/null
+++ b/config/kernel-wait.m4
@@ -0,0 +1,99 @@
+dnl #
+dnl # 4.13 API change
+dnl # Renamed struct wait_queue -> struct wait_queue_entry.
+dnl #
+dnl # N.B. The type check is performed before all other checks
+dnl # since ZFS_AC_KERNEL_SRC_WAIT_QUEUE_HEAD_ENTRY depends on
+dnl # HAVE_WAIT_QUEUE_ENTRY_T being set in confdefs.h.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T], [
+	AC_MSG_CHECKING([whether wait_queue_entry_t exists])
+	ZFS_LINUX_TRY_COMPILE([
+		#include <linux/wait.h>
+	],[
+		wait_queue_entry_t *entry __attribute__ ((unused));
+	],[
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_WAIT_QUEUE_ENTRY_T, 1,
+		    [wait_queue_entry_t exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 3.17 API change,
+dnl # wait_on_bit() no longer requires an action argument. The former
+dnl # "wait_on_bit" interface required an 'action' function to be provided
+dnl # which does the actual waiting. There were over 20 such functions in the
+dnl # kernel, many of them identical, though most cases can be satisfied by one
+dnl # of just two functions: one which uses io_schedule() and one which just
+dnl # uses schedule(). This API change was made to consolidate all of those
+dnl # redundant wait functions.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WAIT_ON_BIT], [
+	ZFS_LINUX_TEST_SRC([wait_on_bit], [
+		#include <linux/wait.h>
+	],[
+		int (*action)(void *) = NULL;
+		wait_on_bit(NULL, 0, action, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WAIT_ON_BIT], [
+	AC_MSG_CHECKING([whether wait_on_bit() takes an action])
+	ZFS_LINUX_TEST_RESULT([wait_on_bit], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_WAIT_ON_BIT_ACTION, 1, [yes])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # 4.13 API change
+dnl # Renamed wait_queue_head::task_list -> wait_queue_head::head
+dnl # Renamed wait_queue_entry::task_list -> wait_queue_entry::entry
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WAIT_QUEUE_HEAD_ENTRY], [
+	ZFS_LINUX_TEST_SRC([wait_queue_head_entry], [
+		#include <linux/wait.h>
+
+		#ifdef HAVE_WAIT_QUEUE_ENTRY_T
+		typedef wait_queue_head_t	spl_wait_queue_head_t;
+		typedef wait_queue_entry_t	spl_wait_queue_entry_t;
+		#else
+		typedef wait_queue_head_t	spl_wait_queue_head_t;
+		typedef wait_queue_t		spl_wait_queue_entry_t;
+		#endif
+	],[
+		spl_wait_queue_head_t wq_head;
+		spl_wait_queue_entry_t wq_entry;
+		struct list_head *head __attribute__ ((unused));
+		struct list_head *entry __attribute__ ((unused));
+
+		head = &wq_head.head;
+		entry = &wq_entry.entry;
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY], [
+	AC_MSG_CHECKING([whether wq_head->head and wq_entry->entry exist])
+	ZFS_LINUX_TEST_RESULT([wait_queue_head_entry], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_WAIT_QUEUE_HEAD_ENTRY, 1,
+		    [wq_head->head and wq_entry->entry exist])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_WAIT], [
+	ZFS_AC_KERNEL_SRC_WAIT_ON_BIT
+	ZFS_AC_KERNEL_SRC_WAIT_QUEUE_HEAD_ENTRY
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_WAIT], [
+	ZFS_AC_KERNEL_WAIT_ON_BIT
+	ZFS_AC_KERNEL_WAIT_QUEUE_HEAD_ENTRY
+])
diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4
new file mode 100644
index 000000000000..137bf4a8aff0
--- /dev/null
+++ b/config/kernel-xattr-handler.m4
@@ -0,0 +1,397 @@
+dnl #
+dnl # 2.6.35 API change,
+dnl # The 'struct xattr_handler' was constified in the generic
+dnl # super_block structure.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_CONST_XATTR_HANDLER], [
+	ZFS_LINUX_TEST_SRC([const_xattr_handler], [
+		#include <linux/fs.h>
+		#include <linux/xattr.h>
+
+		const struct xattr_handler xattr_test_handler = {
+			.prefix	= "test",
+			.get	= NULL,
+			.set	= NULL,
+		};
+
+		const struct xattr_handler *xattr_handlers[] = {
+			&xattr_test_handler,
+		};
+
+		const struct super_block sb __attribute__ ((unused)) = {
+			.s_xattr = xattr_handlers,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_CONST_XATTR_HANDLER], [
+	AC_MSG_CHECKING([whether super_block uses const struct xattr_handler])
+	ZFS_LINUX_TEST_RESULT([const_xattr_handler], [
+		AC_MSG_RESULT([yes])
+	],[
+		ZFS_LINUX_TEST_ERROR([const xattr_handler])
+	])
+])
+
+dnl #
+dnl # 4.5 API change,
+dnl # struct xattr_handler added new member "name".
+dnl # xattr_handler which matches to whole name rather than prefix should use
+dnl # "name" instead of "prefix", e.g. "system.posix_acl_access"
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_NAME], [
+	ZFS_LINUX_TEST_SRC([xattr_handler_name], [
+		#include <linux/xattr.h>
+
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.name = XATTR_NAME_POSIX_ACL_ACCESS,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_NAME], [
+	AC_MSG_CHECKING([whether xattr_handler has name])
+	ZFS_LINUX_TEST_RESULT([xattr_handler_name], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_XATTR_HANDLER_NAME, 1,
+		    [xattr_handler has name])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+dnl #
+dnl # Supported xattr handler get() interfaces checked newest to oldest.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [
+	ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode], [
+		#include <linux/xattr.h>
+
+		int get(const struct xattr_handler *handler,
+		    struct dentry *dentry, struct inode *inode,
+		    const char *name, void *buffer, size_t size) { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.get = get,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([xattr_handler_get_xattr_handler], [
+		#include <linux/xattr.h>
+
+		int get(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    void *buffer, size_t size) { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.get = get,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry], [
+		#include <linux/xattr.h>
+
+		int get(struct dentry *dentry, const char *name,
+		    void *buffer, size_t size, int handler_flags)
+		    { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.get = get,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_GET], [
+	dnl #
+	dnl # 4.7 API change,
+	dnl # The xattr_handler->get() callback was changed to take both
+	dnl # dentry and inode.
+	dnl #
+	AC_MSG_CHECKING([whether xattr_handler->get() wants dentry and inode])
+	ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry_inode], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_XATTR_GET_DENTRY_INODE, 1,
+		    [xattr_handler->get() wants both dentry and inode])
+	],[
+		dnl #
+		dnl # 4.4 API change,
+		dnl # The xattr_handler->get() callback was changed to take an
+		dnl # xattr_handler; the handler_flags argument was removed and
+		dnl # should be accessed via handler->flags.
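+		dnl #
+		dnl # Under this form a callback reads its flags through the
+		dnl # handler itself, e.g. (sketch; zpl_xattr_get and
+		dnl # zpl_xattr_get_common are illustrative names):
+		dnl #
+		dnl #	static int
+		dnl #	zpl_xattr_get(const struct xattr_handler *handler,
+		dnl #	    struct dentry *dentry, const char *name,
+		dnl #	    void *buffer, size_t size)
+		dnl #	{
+		dnl #		/* handler->flags replaces handler_flags */
+		dnl #		return (zpl_xattr_get_common(dentry, name,
+		dnl #		    buffer, size, handler->flags));
+		dnl #	}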
+		dnl #
+		AC_MSG_RESULT(no)
+		AC_MSG_CHECKING(
+		    [whether xattr_handler->get() wants xattr_handler])
+		ZFS_LINUX_TEST_RESULT([xattr_handler_get_xattr_handler], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_XATTR_GET_HANDLER, 1,
+			    [xattr_handler->get() wants xattr_handler])
+		],[
+			dnl #
+			dnl # 2.6.33 API change,
+			dnl # The xattr_handler->get() callback was changed
+			dnl # to take a dentry instead of an inode, and a
+			dnl # handler_flags argument was added.
+			dnl #
+			AC_MSG_RESULT(no)
+			AC_MSG_CHECKING(
+			    [whether xattr_handler->get() wants dentry])
+			ZFS_LINUX_TEST_RESULT([xattr_handler_get_dentry], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(HAVE_XATTR_GET_DENTRY, 1,
+				    [xattr_handler->get() wants dentry])
+			],[
+				ZFS_LINUX_TEST_ERROR([xattr get()])
+			])
+		])
+	])
+])
+
+dnl #
+dnl # Supported xattr handler set() interfaces checked newest to oldest.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [
+	ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry_inode], [
+		#include <linux/xattr.h>
+
+		int set(const struct xattr_handler *handler,
+		    struct dentry *dentry, struct inode *inode,
+		    const char *name, const void *buffer,
+		    size_t size, int flags)
+		    { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.set = set,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([xattr_handler_set_xattr_handler], [
+		#include <linux/xattr.h>
+
+		int set(const struct xattr_handler *handler,
+		    struct dentry *dentry, const char *name,
+		    const void *buffer, size_t size, int flags)
+		    { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.set = set,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry], [
+		#include <linux/xattr.h>
+
+		int set(struct dentry *dentry, const char *name,
+		    const void *buffer, size_t size, int flags,
+		    int handler_flags) { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.set = set,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_SET], [
+	dnl #
+	dnl # 4.7 API change,
+	dnl # The xattr_handler->set() callback was changed to take both
+	dnl # dentry and inode.
+	dnl #
+	AC_MSG_CHECKING([whether xattr_handler->set() wants dentry and inode])
+	ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry_inode], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_XATTR_SET_DENTRY_INODE, 1,
+		    [xattr_handler->set() wants both dentry and inode])
+	],[
+		dnl #
+		dnl # 4.4 API change,
+		dnl # The xattr_handler->set() callback was changed to take an
+		dnl # xattr_handler; the handler_flags argument was removed and
+		dnl # should be accessed via handler->flags.
+		dnl #
+		AC_MSG_RESULT(no)
+		AC_MSG_CHECKING(
+		    [whether xattr_handler->set() wants xattr_handler])
+		ZFS_LINUX_TEST_RESULT([xattr_handler_set_xattr_handler], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_XATTR_SET_HANDLER, 1,
+			    [xattr_handler->set() wants xattr_handler])
+		],[
+			dnl #
+			dnl # 2.6.33 API change,
+			dnl # The xattr_handler->set() callback was changed
+			dnl # to take a dentry instead of an inode, and a
+			dnl # handler_flags argument was added.
+			dnl #
+			AC_MSG_RESULT(no)
+			AC_MSG_CHECKING(
+			    [whether xattr_handler->set() wants dentry])
+			ZFS_LINUX_TEST_RESULT([xattr_handler_set_dentry], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(HAVE_XATTR_SET_DENTRY, 1,
+				    [xattr_handler->set() wants dentry])
+			],[
+				ZFS_LINUX_TEST_ERROR([xattr set()])
+			])
+		])
+	])
+])
+
+dnl #
+dnl # Supported xattr handler list() interfaces checked newest to oldest.
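+dnl #
+dnl # Under the newest (4.5+) form, list() only reports whether the
+dnl # attribute is visible for the given dentry, e.g. (sketch;
+dnl # zpl_xattr_list is an illustrative name):
+dnl #
+dnl #	static bool
+dnl #	zpl_xattr_list(struct dentry *dentry)
+dnl #	{
+dnl #		return (true);
+dnl #	}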
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST], [
+	ZFS_LINUX_TEST_SRC([xattr_handler_list_simple], [
+		#include <linux/xattr.h>
+
+		bool list(struct dentry *dentry) { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.list = list,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([xattr_handler_list_xattr_handler], [
+		#include <linux/xattr.h>
+
+		size_t list(const struct xattr_handler *handler,
+		    struct dentry *dentry, char *list, size_t list_size,
+		    const char *name, size_t name_len) { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.list = list,
+		};
+	],[])
+
+	ZFS_LINUX_TEST_SRC([xattr_handler_list_dentry], [
+		#include <linux/xattr.h>
+
+		size_t list(struct dentry *dentry,
+		    char *list, size_t list_size,
+		    const char *name, size_t name_len,
+		    int handler_flags) { return 0; }
+		static const struct xattr_handler
+		    xops __attribute__ ((unused)) = {
+			.list = list,
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_XATTR_HANDLER_LIST], [
+	dnl # 4.5 API change,
+	dnl # The xattr_handler->list() callback was changed to take only a
+	dnl # dentry, and it only needs to return whether it's accessible.
+	AC_MSG_CHECKING([whether xattr_handler->list() wants simple])
+	ZFS_LINUX_TEST_RESULT([xattr_handler_list_simple], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_XATTR_LIST_SIMPLE, 1,
+		    [xattr_handler->list() wants simple])
+	],[
+		dnl #
+		dnl # 4.4 API change,
+		dnl # The xattr_handler->list() callback was changed to take an
+		dnl # xattr_handler; the handler_flags argument was removed
+		dnl # and should be accessed via handler->flags.
+		dnl #
+		AC_MSG_RESULT(no)
+		AC_MSG_CHECKING(
+		    [whether xattr_handler->list() wants xattr_handler])
+		ZFS_LINUX_TEST_RESULT([xattr_handler_list_xattr_handler], [
+			AC_MSG_RESULT(yes)
+			AC_DEFINE(HAVE_XATTR_LIST_HANDLER, 1,
+			    [xattr_handler->list() wants xattr_handler])
+		],[
+			dnl #
+			dnl # 2.6.33 API change,
+			dnl # The xattr_handler->list() callback was changed
+			dnl # to take a dentry instead of an inode, and a
+			dnl # handler_flags argument was added.
+			dnl #
+			AC_MSG_RESULT(no)
+			AC_MSG_CHECKING(
+			    [whether xattr_handler->list() wants dentry])
+			ZFS_LINUX_TEST_RESULT([xattr_handler_list_dentry], [
+				AC_MSG_RESULT(yes)
+				AC_DEFINE(HAVE_XATTR_LIST_DENTRY, 1,
+				    [xattr_handler->list() wants dentry])
+			],[
+				ZFS_LINUX_TEST_ERROR([xattr list()])
+			])
+		])
+	])
+])
+
+dnl #
+dnl # 3.7 API change,
+dnl # The posix_acl_{from,to}_xattr functions gained a new
+dnl # parameter: user_ns
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_POSIX_ACL_FROM_XATTR_USERNS], [
+	ZFS_LINUX_TEST_SRC([posix_acl_from_xattr_userns], [
+		#include <linux/cred.h>
+		#include <linux/fs.h>
+		#include <linux/posix_acl_xattr.h>
+	],[
+		posix_acl_from_xattr(&init_user_ns, NULL, 0);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS], [
+	AC_MSG_CHECKING([whether posix_acl_from_xattr() needs user_ns])
+	ZFS_LINUX_TEST_RESULT([posix_acl_from_xattr_userns], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_POSIX_ACL_FROM_XATTR_USERNS, 1,
+		    [posix_acl_from_xattr() needs user_ns])
+	],[
+		ZFS_LINUX_TEST_ERROR([posix_acl_from_xattr()])
+	])
+])
+
+dnl #
+dnl # 4.9 API change,
+dnl # iops->{set,get,remove}xattr and generic_{set,get,remove}xattr are
+dnl # removed. xattr operations will directly go through sb->s_xattr.
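+dnl #
+dnl # Consumers therefore only wire up the legacy inode_operations
+dnl # methods when they exist, e.g. (sketch):
+dnl #
+dnl #	#ifdef HAVE_GENERIC_SETXATTR
+dnl #		.setxattr	= generic_setxattr,
+dnl #		.getxattr	= generic_getxattr,
+dnl #		.removexattr	= generic_removexattr,
+dnl #	#endif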
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_GENERIC_SETXATTR], [
+	ZFS_LINUX_TEST_SRC([have_generic_setxattr], [
+		#include <linux/fs.h>
+		#include <linux/xattr.h>
+
+		static const struct inode_operations
+		    iops __attribute__ ((unused)) = {
+			.setxattr = generic_setxattr
+		};
+	],[])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_GENERIC_SETXATTR], [
+	AC_MSG_CHECKING([whether generic_setxattr() exists])
+	ZFS_LINUX_TEST_RESULT([have_generic_setxattr], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_GENERIC_SETXATTR, 1,
+		    [generic_setxattr() exists])
+	],[
+		AC_MSG_RESULT(no)
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR], [
+	ZFS_AC_KERNEL_SRC_CONST_XATTR_HANDLER
+	ZFS_AC_KERNEL_SRC_XATTR_HANDLER_NAME
+	ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET
+	ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET
+	ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST
+	ZFS_AC_KERNEL_SRC_POSIX_ACL_FROM_XATTR_USERNS
+	ZFS_AC_KERNEL_SRC_GENERIC_SETXATTR
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_XATTR], [
+	ZFS_AC_KERNEL_CONST_XATTR_HANDLER
+	ZFS_AC_KERNEL_XATTR_HANDLER_NAME
+	ZFS_AC_KERNEL_XATTR_HANDLER_GET
+	ZFS_AC_KERNEL_XATTR_HANDLER_SET
+	ZFS_AC_KERNEL_XATTR_HANDLER_LIST
+	ZFS_AC_KERNEL_POSIX_ACL_FROM_XATTR_USERNS
+	ZFS_AC_KERNEL_GENERIC_SETXATTR
+])
diff --git a/config/kernel-zlib.m4 b/config/kernel-zlib.m4
new file mode 100644
index 000000000000..752d388389cb
--- /dev/null
+++ b/config/kernel-zlib.m4
@@ -0,0 +1,26 @@
+dnl #
+dnl # 2.6.39 API compat,
+dnl #
+dnl # The function zlib_deflate_workspacesize() now takes 2 arguments.
+dnl # This was done to avoid always having to allocate the maximum size
+dnl # workspace (268K). The caller can now specify the windowBits and
+dnl # memLevel compression parameters to get a smaller workspace.
+dnl #
+AC_DEFUN([ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE], [
+	ZFS_LINUX_TEST_SRC([2args_zlib_deflate_workspacesize], [
+		#include <linux/zlib.h>
+	],[
+		return zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL);
+	])
+])
+
+AC_DEFUN([ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE], [
+	AC_MSG_CHECKING([whether zlib_deflate_workspacesize() wants 2 args])
+	ZFS_LINUX_TEST_RESULT([2args_zlib_deflate_workspacesize], [
+		AC_MSG_RESULT(yes)
+		AC_DEFINE(HAVE_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE, 1,
+		    [zlib_deflate_workspacesize() wants 2 args])
+	],[
+		ZFS_LINUX_TEST_ERROR([zlib_deflate_workspacesize()])
+	])
+])
diff --git a/config/kernel.m4 b/config/kernel.m4
new file mode 100644
index 000000000000..ec52f014a7a3
--- /dev/null
+++ b/config/kernel.m4
@@ -0,0 +1,861 @@
+dnl #
+dnl # Default ZFS kernel configuration
+dnl #
+AC_DEFUN([ZFS_AC_CONFIG_KERNEL], [
+	AM_COND_IF([BUILD_LINUX], [
+		dnl # Setup the kernel build environment.
+		ZFS_AC_KERNEL
+		ZFS_AC_QAT
+
+		dnl # Sanity checks for module building and CONFIG_* defines
+		ZFS_AC_KERNEL_TEST_MODULE
+		ZFS_AC_KERNEL_CONFIG_DEFINED
+
+		dnl # Sequential ZFS_LINUX_TRY_COMPILE tests
+		ZFS_AC_KERNEL_FPU_HEADER
+		ZFS_AC_KERNEL_WAIT_QUEUE_ENTRY_T
+		ZFS_AC_KERNEL_MISC_MINOR
+		ZFS_AC_KERNEL_DECLARE_EVENT_CLASS
+
+		dnl # Parallel ZFS_LINUX_TEST_SRC / ZFS_LINUX_TEST_RESULT tests
+		ZFS_AC_KERNEL_TEST_SRC
+		ZFS_AC_KERNEL_TEST_RESULT
+
+		AS_IF([test "$LINUX_OBJ" != "$LINUX"], [
+			KERNEL_MAKE="$KERNEL_MAKE O=$LINUX_OBJ"
+		])
+
+		AC_SUBST(KERNEL_MAKE)
+	])
+])
+
+dnl #
+dnl # Generate and compile all of the kernel API test cases to determine
+dnl # which interfaces are available. By invoking the kernel build system
+dnl # only once, the compilation can be done in parallel, significantly
+dnl # speeding up the process.
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ + ZFS_AC_KERNEL_SRC_OBJTOOL + ZFS_AC_KERNEL_SRC_GLOBAL_PAGE_STATE + ZFS_AC_KERNEL_SRC_ACCESS_OK_TYPE + ZFS_AC_KERNEL_SRC_PDE_DATA + ZFS_AC_KERNEL_SRC_FALLOCATE + ZFS_AC_KERNEL_SRC_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE + ZFS_AC_KERNEL_SRC_RWSEM + ZFS_AC_KERNEL_SRC_SCHED + ZFS_AC_KERNEL_SRC_USLEEP_RANGE + ZFS_AC_KERNEL_SRC_KMEM_CACHE + ZFS_AC_KERNEL_SRC_KVMALLOC + ZFS_AC_KERNEL_SRC_VMALLOC_PAGE_KERNEL + ZFS_AC_KERNEL_SRC_WAIT + ZFS_AC_KERNEL_SRC_INODE_TIMES + ZFS_AC_KERNEL_SRC_INODE_LOCK + ZFS_AC_KERNEL_SRC_GROUP_INFO_GID + ZFS_AC_KERNEL_SRC_RW + ZFS_AC_KERNEL_SRC_TIMER_SETUP + ZFS_AC_KERNEL_SRC_SUPER_USER_NS + ZFS_AC_KERNEL_SRC_PROC_OPERATIONS + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS + ZFS_AC_KERNEL_SRC_BIO + ZFS_AC_KERNEL_SRC_BLKDEV + ZFS_AC_KERNEL_SRC_BLK_QUEUE + ZFS_AC_KERNEL_SRC_GET_DISK_AND_MODULE + ZFS_AC_KERNEL_SRC_GET_DISK_RO + ZFS_AC_KERNEL_SRC_GENERIC_READLINK_GLOBAL + ZFS_AC_KERNEL_SRC_DISCARD_GRANULARITY + ZFS_AC_KERNEL_SRC_INODE_OWNER_OR_CAPABLE + ZFS_AC_KERNEL_SRC_XATTR + ZFS_AC_KERNEL_SRC_ACL + ZFS_AC_KERNEL_SRC_INODE_GETATTR + ZFS_AC_KERNEL_SRC_INODE_SET_FLAGS + ZFS_AC_KERNEL_SRC_INODE_SET_IVERSION + ZFS_AC_KERNEL_SRC_SHOW_OPTIONS + ZFS_AC_KERNEL_SRC_FILE_INODE + ZFS_AC_KERNEL_SRC_FILE_DENTRY + ZFS_AC_KERNEL_SRC_FSYNC + ZFS_AC_KERNEL_SRC_AIO_FSYNC + ZFS_AC_KERNEL_SRC_EVICT_INODE + ZFS_AC_KERNEL_SRC_DIRTY_INODE + ZFS_AC_KERNEL_SRC_SHRINKER + ZFS_AC_KERNEL_SRC_MKDIR_UMODE_T + ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS + ZFS_AC_KERNEL_SRC_CREATE_FLAGS + ZFS_AC_KERNEL_SRC_GET_LINK + ZFS_AC_KERNEL_SRC_PUT_LINK + ZFS_AC_KERNEL_SRC_TMPFILE + ZFS_AC_KERNEL_SRC_AUTOMOUNT + ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE + ZFS_AC_KERNEL_SRC_COMMIT_METADATA + ZFS_AC_KERNEL_SRC_CLEAR_INODE + ZFS_AC_KERNEL_SRC_SETATTR_PREPARE + ZFS_AC_KERNEL_SRC_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_SRC_DENTRY + ZFS_AC_KERNEL_SRC_TRUNCATE_SETSIZE + ZFS_AC_KERNEL_SRC_SECURITY_INODE + ZFS_AC_KERNEL_SRC_FST_MOUNT + ZFS_AC_KERNEL_SRC_BDI + ZFS_AC_KERNEL_SRC_SET_NLINK + ZFS_AC_KERNEL_SRC_SGET + ZFS_AC_KERNEL_SRC_LSEEK_EXECUTE + ZFS_AC_KERNEL_SRC_VFS_GETATTR + ZFS_AC_KERNEL_SRC_VFS_FSYNC_2ARGS + ZFS_AC_KERNEL_SRC_VFS_ITERATE + ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO + ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE + ZFS_AC_KERNEL_SRC_VFS_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_SRC_KMAP_ATOMIC_ARGS + ZFS_AC_KERNEL_SRC_FOLLOW_DOWN_ONE + ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN + ZFS_AC_KERNEL_SRC_GENERIC_IO_ACCT + ZFS_AC_KERNEL_SRC_FPU + ZFS_AC_KERNEL_SRC_FMODE_T + ZFS_AC_KERNEL_SRC_KUIDGID_T + ZFS_AC_KERNEL_SRC_KUID_HELPERS + ZFS_AC_KERNEL_SRC_MODULE_PARAM_CALL_CONST + ZFS_AC_KERNEL_SRC_RENAME_WANTS_FLAGS + ZFS_AC_KERNEL_SRC_CURRENT_TIME + ZFS_AC_KERNEL_SRC_USERNS_CAPABILITIES + ZFS_AC_KERNEL_SRC_IN_COMPAT_SYSCALL + ZFS_AC_KERNEL_SRC_KTIME + ZFS_AC_KERNEL_SRC_TOTALRAM_PAGES_FUNC + ZFS_AC_KERNEL_SRC_TOTALHIGH_PAGES + ZFS_AC_KERNEL_SRC_KSTRTOUL + ZFS_AC_KERNEL_SRC_PERCPU + + AC_MSG_CHECKING([for available kernel interfaces]) + ZFS_LINUX_TEST_COMPILE_ALL([kabi]) + AC_MSG_RESULT([done]) +]) + +dnl # +dnl # Check results of kernel interface tests. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ + ZFS_AC_KERNEL_ACCESS_OK_TYPE + ZFS_AC_KERNEL_GLOBAL_PAGE_STATE + ZFS_AC_KERNEL_OBJTOOL + ZFS_AC_KERNEL_PDE_DATA + ZFS_AC_KERNEL_FALLOCATE + ZFS_AC_KERNEL_2ARGS_ZLIB_DEFLATE_WORKSPACESIZE + ZFS_AC_KERNEL_RWSEM + ZFS_AC_KERNEL_SCHED + ZFS_AC_KERNEL_USLEEP_RANGE + ZFS_AC_KERNEL_KMEM_CACHE + ZFS_AC_KERNEL_KVMALLOC + ZFS_AC_KERNEL_VMALLOC_PAGE_KERNEL + ZFS_AC_KERNEL_WAIT + ZFS_AC_KERNEL_INODE_TIMES + ZFS_AC_KERNEL_INODE_LOCK + ZFS_AC_KERNEL_GROUP_INFO_GID + ZFS_AC_KERNEL_RW + ZFS_AC_KERNEL_TIMER_SETUP + ZFS_AC_KERNEL_SUPER_USER_NS + ZFS_AC_KERNEL_PROC_OPERATIONS + ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS + ZFS_AC_KERNEL_BIO + ZFS_AC_KERNEL_BLKDEV + ZFS_AC_KERNEL_BLK_QUEUE + ZFS_AC_KERNEL_GET_DISK_AND_MODULE + ZFS_AC_KERNEL_GET_DISK_RO + ZFS_AC_KERNEL_GENERIC_READLINK_GLOBAL + ZFS_AC_KERNEL_DISCARD_GRANULARITY + ZFS_AC_KERNEL_INODE_OWNER_OR_CAPABLE + ZFS_AC_KERNEL_XATTR + ZFS_AC_KERNEL_ACL + ZFS_AC_KERNEL_INODE_GETATTR + ZFS_AC_KERNEL_INODE_SET_FLAGS + ZFS_AC_KERNEL_INODE_SET_IVERSION + ZFS_AC_KERNEL_SHOW_OPTIONS + ZFS_AC_KERNEL_FILE_INODE + ZFS_AC_KERNEL_FILE_DENTRY + ZFS_AC_KERNEL_FSYNC + ZFS_AC_KERNEL_AIO_FSYNC + ZFS_AC_KERNEL_EVICT_INODE + ZFS_AC_KERNEL_DIRTY_INODE + ZFS_AC_KERNEL_SHRINKER + ZFS_AC_KERNEL_MKDIR_UMODE_T + ZFS_AC_KERNEL_LOOKUP_FLAGS + ZFS_AC_KERNEL_CREATE_FLAGS + ZFS_AC_KERNEL_GET_LINK + ZFS_AC_KERNEL_PUT_LINK + ZFS_AC_KERNEL_TMPFILE + ZFS_AC_KERNEL_AUTOMOUNT + ZFS_AC_KERNEL_ENCODE_FH_WITH_INODE + ZFS_AC_KERNEL_COMMIT_METADATA + ZFS_AC_KERNEL_CLEAR_INODE + ZFS_AC_KERNEL_SETATTR_PREPARE + ZFS_AC_KERNEL_INSERT_INODE_LOCKED + ZFS_AC_KERNEL_DENTRY + ZFS_AC_KERNEL_TRUNCATE_SETSIZE + ZFS_AC_KERNEL_SECURITY_INODE + ZFS_AC_KERNEL_FST_MOUNT + ZFS_AC_KERNEL_BDI + ZFS_AC_KERNEL_SET_NLINK + ZFS_AC_KERNEL_SGET + ZFS_AC_KERNEL_LSEEK_EXECUTE + ZFS_AC_KERNEL_VFS_GETATTR + ZFS_AC_KERNEL_VFS_FSYNC_2ARGS + ZFS_AC_KERNEL_VFS_ITERATE + ZFS_AC_KERNEL_VFS_DIRECT_IO + ZFS_AC_KERNEL_VFS_RW_ITERATE + ZFS_AC_KERNEL_VFS_GENERIC_WRITE_CHECKS + ZFS_AC_KERNEL_KMAP_ATOMIC_ARGS + ZFS_AC_KERNEL_FOLLOW_DOWN_ONE + ZFS_AC_KERNEL_MAKE_REQUEST_FN + ZFS_AC_KERNEL_GENERIC_IO_ACCT + ZFS_AC_KERNEL_FPU + ZFS_AC_KERNEL_FMODE_T + ZFS_AC_KERNEL_KUIDGID_T + ZFS_AC_KERNEL_KUID_HELPERS + ZFS_AC_KERNEL_MODULE_PARAM_CALL_CONST + ZFS_AC_KERNEL_RENAME_WANTS_FLAGS + ZFS_AC_KERNEL_CURRENT_TIME + ZFS_AC_KERNEL_USERNS_CAPABILITIES + ZFS_AC_KERNEL_IN_COMPAT_SYSCALL + ZFS_AC_KERNEL_KTIME + ZFS_AC_KERNEL_TOTALRAM_PAGES_FUNC + ZFS_AC_KERNEL_TOTALHIGH_PAGES + ZFS_AC_KERNEL_KSTRTOUL + ZFS_AC_KERNEL_PERCPU +]) + +dnl # +dnl # Detect name used for Module.symvers file in kernel +dnl # +AC_DEFUN([ZFS_AC_MODULE_SYMVERS], [ + modpost=$LINUX/scripts/Makefile.modpost + AC_MSG_CHECKING([kernel file name for module symbols]) + AS_IF([test "x$enable_linux_builtin" != xyes -a -f "$modpost"], [ + AS_IF([grep -q Modules.symvers $modpost], [ + LINUX_SYMBOLS=Modules.symvers + ], [ + LINUX_SYMBOLS=Module.symvers + ]) + + AS_IF([test ! -f "$LINUX_OBJ/$LINUX_SYMBOLS"], [ + AC_MSG_ERROR([ + *** Please make sure the kernel devel package for your distribution + *** is installed. If you are building with a custom kernel, make sure + *** the kernel is configured, built, and the '--with-linux=PATH' + *** configure option refers to the location of the kernel source. 
+ ]) + ]) + ], [ + LINUX_SYMBOLS=NONE + ]) + AC_MSG_RESULT($LINUX_SYMBOLS) + AC_SUBST(LINUX_SYMBOLS) +]) + +dnl # +dnl # Detect the kernel to be built against +dnl # +AC_DEFUN([ZFS_AC_KERNEL], [ + AC_ARG_WITH([linux], + AS_HELP_STRING([--with-linux=PATH], + [Path to kernel source]), + [kernelsrc="$withval"]) + + AC_ARG_WITH(linux-obj, + AS_HELP_STRING([--with-linux-obj=PATH], + [Path to kernel build objects]), + [kernelbuild="$withval"]) + + AC_MSG_CHECKING([kernel source directory]) + AS_IF([test -z "$kernelsrc"], [ + AS_IF([test -e "/lib/modules/$(uname -r)/source"], [ + headersdir="/lib/modules/$(uname -r)/source" + sourcelink=$(readlink -f "$headersdir") + ], [test -e "/lib/modules/$(uname -r)/build"], [ + headersdir="/lib/modules/$(uname -r)/build" + sourcelink=$(readlink -f "$headersdir") + ], [ + sourcelink=$(ls -1d /usr/src/kernels/* \ + /usr/src/linux-* \ + 2>/dev/null | grep -v obj | tail -1) + ]) + + AS_IF([test -n "$sourcelink" && test -e ${sourcelink}], [ + kernelsrc=`readlink -f ${sourcelink}` + ], [ + kernelsrc="[Not found]" + ]) + ], [ + AS_IF([test "$kernelsrc" = "NONE"], [ + kernsrcver=NONE + ]) + withlinux=yes + ]) + + AC_MSG_RESULT([$kernelsrc]) + AS_IF([test ! -d "$kernelsrc"], [ + AC_MSG_ERROR([ + *** Please make sure the kernel devel package for your distribution + *** is installed and then try again. If that fails, you can specify the + *** location of the kernel source with the '--with-linux=PATH' option.]) + ]) + + AC_MSG_CHECKING([kernel build directory]) + AS_IF([test -z "$kernelbuild"], [ + AS_IF([test x$withlinux != xyes -a -e "/lib/modules/$(uname -r)/build"], [ + kernelbuild=`readlink -f /lib/modules/$(uname -r)/build` + ], [test -d ${kernelsrc}-obj/${target_cpu}/${target_cpu}], [ + kernelbuild=${kernelsrc}-obj/${target_cpu}/${target_cpu} + ], [test -d ${kernelsrc}-obj/${target_cpu}/default], [ + kernelbuild=${kernelsrc}-obj/${target_cpu}/default + ], [test -d `dirname ${kernelsrc}`/build-${target_cpu}], [ + kernelbuild=`dirname ${kernelsrc}`/build-${target_cpu} + ], [ + kernelbuild=${kernelsrc} + ]) + ]) + AC_MSG_RESULT([$kernelbuild]) + + AC_MSG_CHECKING([kernel source version]) + utsrelease1=$kernelbuild/include/linux/version.h + utsrelease2=$kernelbuild/include/linux/utsrelease.h + utsrelease3=$kernelbuild/include/generated/utsrelease.h + AS_IF([test -r $utsrelease1 && fgrep -q UTS_RELEASE $utsrelease1], [ + utsrelease=linux/version.h + ], [test -r $utsrelease2 && fgrep -q UTS_RELEASE $utsrelease2], [ + utsrelease=linux/utsrelease.h + ], [test -r $utsrelease3 && fgrep -q UTS_RELEASE $utsrelease3], [ + utsrelease=generated/utsrelease.h + ]) + + AS_IF([test "$utsrelease"], [ + kernsrcver=`(echo "#include <$utsrelease>"; + echo "kernsrcver=UTS_RELEASE") | + ${CPP} -I $kernelbuild/include - | + grep "^kernsrcver=" | cut -d \" -f 2` + + AS_IF([test -z "$kernsrcver"], [ + AC_MSG_RESULT([Not found]) + AC_MSG_ERROR([ + *** Cannot determine kernel version. + ]) + ]) + ], [ + AC_MSG_RESULT([Not found]) + if test "x$enable_linux_builtin" != xyes; then + AC_MSG_ERROR([ + *** Cannot find UTS_RELEASE definition. + ]) + else + AC_MSG_ERROR([ + *** Cannot find UTS_RELEASE definition. + *** Please run 'make prepare' inside the kernel source tree.]) + fi + ]) + + AC_MSG_RESULT([$kernsrcver]) + + AS_VERSION_COMPARE([$kernsrcver], [$ZFS_META_KVER_MIN], [ + AC_MSG_ERROR([ + *** Cannot build against kernel version $kernsrcver. + *** The minimum supported kernel version is $ZFS_META_KVER_MIN. 
+ ]) + ]) + + LINUX=${kernelsrc} + LINUX_OBJ=${kernelbuild} + LINUX_VERSION=${kernsrcver} + + AC_SUBST(LINUX) + AC_SUBST(LINUX_OBJ) + AC_SUBST(LINUX_VERSION) + + ZFS_AC_MODULE_SYMVERS +]) + +dnl # +dnl # Detect the QAT module to be built against, QAT provides hardware +dnl # acceleration for data compression: +dnl # +dnl # https://01.org/intel-quickassist-technology +dnl # +dnl # 1) Download and install QAT driver from the above link +dnl # 2) Start QAT driver in your system: +dnl # service qat_service start +dnl # 3) Enable QAT in ZFS, e.g.: +dnl # ./configure --with-qat=/QAT1.6 +dnl # make +dnl # 4) Set GZIP compression in ZFS dataset: +dnl # zfs set compression = gzip +dnl # +dnl # Then the data written to this ZFS pool is compressed by QAT accelerator +dnl # automatically, and de-compressed by QAT when read from the pool. +dnl # +dnl # 1) Get QAT hardware statistics with: +dnl # cat /proc/icp_dh895xcc_dev/qat +dnl # 2) To disable QAT: +dnl # insmod zfs.ko zfs_qat_disable=1 +dnl # +AC_DEFUN([ZFS_AC_QAT], [ + AC_ARG_WITH([qat], + AS_HELP_STRING([--with-qat=PATH], + [Path to qat source]), + AS_IF([test "$withval" = "yes"], + AC_MSG_ERROR([--with-qat=PATH requires a PATH]), + [qatsrc="$withval"])) + + AC_ARG_WITH([qat-obj], + AS_HELP_STRING([--with-qat-obj=PATH], + [Path to qat build objects]), + [qatbuild="$withval"]) + + AS_IF([test ! -z "${qatsrc}"], [ + AC_MSG_CHECKING([qat source directory]) + AC_MSG_RESULT([$qatsrc]) + QAT_SRC="${qatsrc}/quickassist" + AS_IF([ test ! -e "$QAT_SRC/include/cpa.h"], [ + AC_MSG_ERROR([ + *** Please make sure the qat driver package is installed + *** and specify the location of the qat source with the + *** '--with-qat=PATH' option then try again. Failed to + *** find cpa.h in: + ${QAT_SRC}/include]) + ]) + ]) + + AS_IF([test ! -z "${qatsrc}"], [ + AC_MSG_CHECKING([qat build directory]) + AS_IF([test -z "$qatbuild"], [ + qatbuild="${qatsrc}/build" + ]) + + AC_MSG_RESULT([$qatbuild]) + QAT_OBJ=${qatbuild} + AS_IF([ ! test -e "$QAT_OBJ/icp_qa_al.ko" && ! test -e "$QAT_OBJ/qat_api.ko"], [ + AC_MSG_ERROR([ + *** Please make sure the qat driver is installed then try again. + *** Failed to find icp_qa_al.ko or qat_api.ko in: + $QAT_OBJ]) + ]) + + AC_SUBST(QAT_SRC) + AC_SUBST(QAT_OBJ) + + AC_DEFINE(HAVE_QAT, 1, + [qat is enabled and existed]) + ]) + + dnl # + dnl # Detect the name used for the QAT Module.symvers file. + dnl # + AS_IF([test ! -z "${qatsrc}"], [ + AC_MSG_CHECKING([qat file for module symbols]) + QAT_SYMBOLS=$QAT_SRC/lookaside/access_layer/src/Module.symvers + + AS_IF([test -r $QAT_SYMBOLS], [ + AC_MSG_RESULT([$QAT_SYMBOLS]) + AC_SUBST(QAT_SYMBOLS) + ],[ + AC_MSG_ERROR([ + *** Please make sure the qat driver is installed then try again. + *** Failed to find Module.symvers in: + $QAT_SYMBOLS + ]) + ]) + ]) +]) + +dnl # +dnl # Basic toolchain sanity check. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_TEST_MODULE], [ + AC_MSG_CHECKING([whether modules can be built]) + ZFS_LINUX_TRY_COMPILE([], [], [ + AC_MSG_RESULT([yes]) + ],[ + AC_MSG_RESULT([no]) + if test "x$enable_linux_builtin" != xyes; then + AC_MSG_ERROR([ + *** Unable to build an empty module. + ]) + else + AC_MSG_ERROR([ + *** Unable to build an empty module. 
+ *** Please run 'make scripts' inside the kernel source tree.]) + fi + ]) +]) + +dnl # +dnl # ZFS_LINUX_CONFTEST_H +dnl # +AC_DEFUN([ZFS_LINUX_CONFTEST_H], [ +test -d build/$2 || mkdir -p build/$2 +cat - <<_ACEOF >build/$2/$2.h +$1 +_ACEOF +]) + +dnl # +dnl # ZFS_LINUX_CONFTEST_C +dnl # +AC_DEFUN([ZFS_LINUX_CONFTEST_C], [ +test -d build/$2 || mkdir -p build/$2 +cat confdefs.h - <<_ACEOF >build/$2/$2.c +$1 +_ACEOF +]) + +dnl # +dnl # ZFS_LINUX_CONFTEST_MAKEFILE +dnl # +dnl # $1 - test case name +dnl # $2 - add to top-level Makefile +dnl # $3 - additional build flags +dnl # +AC_DEFUN([ZFS_LINUX_CONFTEST_MAKEFILE], [ + test -d build || mkdir -p build + test -d build/$1 || mkdir -p build/$1 + + file=build/$1/Makefile + + dnl # Example command line to manually build source. + cat - <<_ACEOF >$file +# Example command line to manually build source +# make modules -C $LINUX_OBJ $ARCH_UM M=$PWD/build/$1 + +ccflags-y := -Werror $FRAME_LARGER_THAN +_ACEOF + + dnl # Additional custom CFLAGS as requested. + m4_ifval($3, [echo "ccflags-y += $3" >>$file], []) + + dnl # Test case source + echo "obj-m := $1.o" >>$file + + AS_IF([test "x$2" = "xyes"], [echo "obj-m += $1/" >>build/Makefile], []) +]) + +dnl # +dnl # ZFS_LINUX_TEST_PROGRAM(C)([PROLOGUE], [BODY]) +dnl # +m4_define([ZFS_LINUX_TEST_PROGRAM], [ +$1 +int +main (void) +{ +$2 + ; + return 0; +} +]) + +dnl # +dnl # ZFS_LINUX_TEST_REMOVE +dnl # +dnl # Removes the specified test source and results. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_REMOVE], [ + test -d build/$1 && rm -Rf build/$1 + test -f build/Makefile && sed '/$1/d' build/Makefile +]) + +dnl # +dnl # ZFS_LINUX_COMPILE +dnl # +dnl # $1 - build dir +dnl # $2 - test command +dnl # $3 - pass command +dnl # $4 - fail command +dnl # $5 - set KBUILD_MODPOST_NOFINAL='yes' +dnl # $6 - set KBUILD_MODPOST_WARN='yes' +dnl # +dnl # Used internally by ZFS_LINUX_TEST_{COMPILE,MODPOST} +dnl # +AC_DEFUN([ZFS_LINUX_COMPILE], [ + AC_TRY_COMMAND([ + KBUILD_MODPOST_NOFINAL="$5" KBUILD_MODPOST_WARN="$6" + make modules -k -j$TEST_JOBS -C $LINUX_OBJ $ARCH_UM + M=$PWD/$1 >$1/build.log 2>&1]) + AS_IF([AC_TRY_COMMAND([$2])], [$3], [$4]) +]) + +dnl # +dnl # ZFS_LINUX_TEST_COMPILE +dnl # +dnl # Perform a full compile excluding the final modpost phase. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_COMPILE], [ + ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ + mv $2/Makefile $2/Makefile.compile.$1 + mv $2/build.log $2/build.log.$1 + ],[ + AC_MSG_ERROR([ + *** Unable to compile test source to determine kernel interfaces.]) + ], [yes], []) +]) + +dnl # +dnl # ZFS_LINUX_TEST_MODPOST +dnl # +dnl # Perform a full compile including the modpost phase. This may +dnl # be an incremental build if the objects have already been built. +dnl # +AC_DEFUN([ZFS_LINUX_TEST_MODPOST], [ + ZFS_LINUX_COMPILE([$2], [test -f $2/build.log], [ + mv $2/Makefile $2/Makefile.modpost.$1 + cat $2/build.log >>build/build.log.$1 + ],[ + AC_MSG_ERROR([ + *** Unable to modpost test source to determine kernel interfaces.]) + ], [], [yes]) +]) + +dnl # +dnl # Perform the compilation of the test cases in two phases. +dnl # +dnl # Phase 1) attempt to build the object files for all of the tests +dnl # defined by the ZFS_LINUX_TEST_SRC macro. But do not +dnl # perform the final modpost stage. +dnl # +dnl # Phase 2) disable all tests which failed the initial compilation, +dnl # then invoke the final modpost step for the remaining tests. 
+dnl #
+dnl # This allows us to efficiently build the test cases in parallel while
+dnl # remaining resilient to build failures, which are expected when
+dnl # detecting the available kernel interfaces.
+dnl #
+dnl # The maximum allowed parallelism can be controlled by setting the
+dnl # TEST_JOBS environment variable. Otherwise, it defaults to $(nproc).
+dnl #
+AC_DEFUN([ZFS_LINUX_TEST_COMPILE_ALL], [
+	dnl # Phase 1 - Compilation only, final linking is skipped.
+	ZFS_LINUX_TEST_COMPILE([$1], [build])
+
+	dnl #
+	dnl # Phase 2 - When building external modules disable test cases
+	dnl # which failed to compile and invoke modpost to verify the
+	dnl # final linking.
+	dnl #
+	dnl # Test names suffixed with '_license' call modpost independently
+	dnl # to ensure that a single incompatibility does not result in the
+	dnl # modpost phase exiting early. This check is not performed on
+	dnl # every symbol since the majority are compatible and doing so
+	dnl # would significantly slow down this phase.
+	dnl #
+	dnl # When configuring for builtin (--enable-linux-builtin), fake
+	dnl # the linking step by artificially creating the expected .ko
+	dnl # files for tests which did compile. This is required for
+	dnl # kernels which do not have loadable module support or have
+	dnl # not yet been built.
+	dnl #
+	AS_IF([test "x$enable_linux_builtin" = "xno"], [
+		for dir in $(awk '/^obj-m/ { print [$]3 }' \
+		    build/Makefile.compile.$1); do
+			name=${dir%/}
+			AS_IF([test -f build/$name/$name.o], [
+				AS_IF([test "${name##*_}" = "license"], [
+					ZFS_LINUX_TEST_MODPOST([$1],
+					    [build/$name])
+					echo "obj-n += $dir" >>build/Makefile
+				], [
+					echo "obj-m += $dir" >>build/Makefile
+				])
+			], [
+				echo "obj-n += $dir" >>build/Makefile
+			])
+		done
+
+		ZFS_LINUX_TEST_MODPOST([$1], [build])
+	], [
+		for dir in $(awk '/^obj-m/ { print [$]3 }' \
+		    build/Makefile.compile.$1); do
+			name=${dir%/}
+			AS_IF([test -f build/$name/$name.o], [
+				touch build/$name/$name.ko
+			])
+		done
+	])
+])
+
+dnl #
+dnl # ZFS_LINUX_TEST_SRC
+dnl #
+dnl # $1 - name
+dnl # $2 - global
+dnl # $3 - source
+dnl # $4 - extra cflags
+dnl # $5 - check license-compatibility
+dnl #
+dnl # N.B. Because all of the test cases are compiled in parallel they
+dnl # must never depend on the results of previous tests. Each test
+dnl # needs to be entirely independent.
+dnl #
+AC_DEFUN([ZFS_LINUX_TEST_SRC], [
+	ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[$2]], [[$3]])], [$1])
+	ZFS_LINUX_CONFTEST_MAKEFILE([$1], [yes], [$4])
+
+	AS_IF([test -n "$5"], [
+		ZFS_LINUX_CONFTEST_C([ZFS_LINUX_TEST_PROGRAM([[
+		    #include <linux/module.h>
+		    MODULE_LICENSE("$5");
+		    $2]], [[$3]])], [$1_license])
+		ZFS_LINUX_CONFTEST_MAKEFILE([$1_license], [yes], [$4])
+	])
+])
+
+dnl #
+dnl # ZFS_LINUX_TEST_RESULT
+dnl #
+dnl # $1 - name of a test source (ZFS_LINUX_TEST_SRC)
+dnl # $2 - run on success (valid .ko generated)
+dnl # $3 - run on failure (unable to compile)
+dnl #
+AC_DEFUN([ZFS_LINUX_TEST_RESULT], [
+	AS_IF([test -d build/$1], [
+		AS_IF([test -f build/$1/$1.ko], [$2], [$3])
+	], [
+		AC_MSG_ERROR([
+	*** No matching source for the "$1" test, check that
+	*** both the test source and result macros refer to the same name.
+		])
+	])
+])
+
+dnl #
+dnl # ZFS_LINUX_TEST_ERROR
+dnl #
+dnl # Generic error message which can be used when none of the expected
+dnl # kernel interfaces were detected.
+dnl #
+AC_DEFUN([ZFS_LINUX_TEST_ERROR], [
+	AC_MSG_ERROR([
+	*** None of the expected "$1" interfaces were detected.
+	*** This may be because your kernel version is newer than what is
+	*** supported, or you are using a patched custom kernel with
+	*** incompatible modifications.
+	***
+	*** ZFS Version: $ZFS_META_ALIAS
+	*** Compatible Kernels: $ZFS_META_KVER_MIN - $ZFS_META_KVER_MAX
+	])
+])
+
+dnl #
+dnl # ZFS_LINUX_TEST_RESULT_SYMBOL
+dnl #
+dnl # Like ZFS_LINUX_TEST_RESULT except ZFS_CHECK_SYMBOL_EXPORT is called to
+dnl # verify symbol exports, unless --enable-linux-builtin was provided to
+dnl # configure.
+dnl #
+AC_DEFUN([ZFS_LINUX_TEST_RESULT_SYMBOL], [
+	AS_IF([! test -f build/$1/$1.ko], [
+		$5
+	], [
+		AS_IF([test "x$enable_linux_builtin" != "xyes"], [
+			ZFS_CHECK_SYMBOL_EXPORT([$2], [$3], [$4], [$5])
+		], [
+			$4
+		])
+	])
+])
+
+dnl #
+dnl # ZFS_LINUX_COMPILE_IFELSE
+dnl #
+AC_DEFUN([ZFS_LINUX_COMPILE_IFELSE], [
+	ZFS_LINUX_TEST_REMOVE([conftest])
+
+	m4_ifvaln([$1], [ZFS_LINUX_CONFTEST_C([$1], [conftest])])
+	m4_ifvaln([$5], [ZFS_LINUX_CONFTEST_H([$5], [conftest])],
+	    [ZFS_LINUX_CONFTEST_H([], [conftest])])
+
+	ZFS_LINUX_CONFTEST_MAKEFILE([conftest], [no],
+	    [m4_ifvaln([$5], [-I$PWD/build/conftest], [])])
+	ZFS_LINUX_COMPILE([build/conftest], [$2], [$3], [$4], [], [])
+])
+
+dnl #
+dnl # ZFS_LINUX_TRY_COMPILE
+dnl #
+dnl # $1 - global
+dnl # $2 - source
+dnl # $3 - run on success (valid .ko generated)
+dnl # $4 - run on failure (unable to compile)
+dnl #
+dnl # When configuring as builtin (--enable-linux-builtin) for kernels
+dnl # without loadable module support (CONFIG_MODULES=n) only the object
+dnl # file is created. See ZFS_LINUX_TEST_COMPILE_ALL for details.
+dnl #
+AC_DEFUN([ZFS_LINUX_TRY_COMPILE], [
+	AS_IF([test "x$enable_linux_builtin" = "xyes"], [
+		ZFS_LINUX_COMPILE_IFELSE(
+		    [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]])],
+		    [test -f build/conftest/conftest.o], [$3], [$4])
+	], [
+		ZFS_LINUX_COMPILE_IFELSE(
+		    [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]])],
+		    [test -f build/conftest/conftest.ko], [$3], [$4])
+	])
+])
+
+dnl #
+dnl # ZFS_CHECK_SYMBOL_EXPORT
+dnl #
+dnl # Check if a symbol is exported or not by consulting the symbols
+dnl # file, or optionally the source code.
+dnl #
+AC_DEFUN([ZFS_CHECK_SYMBOL_EXPORT], [
+	grep -q -E '[[[:space:]]]$1[[[:space:]]]' \
+	    $LINUX_OBJ/$LINUX_SYMBOLS 2>/dev/null
+	rc=$?
+	if test $rc -ne 0; then
+		export=0
+		for file in $2; do
+			grep -q -E "EXPORT_SYMBOL.*($1)" \
+			    "$LINUX/$file" 2>/dev/null
+			rc=$?
+			if test $rc -eq 0; then
+				export=1
+				break;
+			fi
+		done
+		if test $export -eq 0; then :
+			$4
+		else :
+			$3
+		fi
+	else :
+		$3
+	fi
+])
+
+dnl #
+dnl # ZFS_LINUX_TRY_COMPILE_SYMBOL
+dnl #
+dnl # Like ZFS_LINUX_TRY_COMPILE except ZFS_CHECK_SYMBOL_EXPORT is called
+dnl # to verify symbol exports, unless --enable-linux-builtin was provided
+dnl # to configure.
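+dnl #
+dnl # An invocation looks like the following (illustrative only, not an
+dnl # actual caller in this change):
+dnl #
+dnl #	ZFS_LINUX_TRY_COMPILE_SYMBOL([
+dnl #		#include <linux/mm.h>
+dnl #	],[
+dnl #		truncate_setsize(NULL, 0);
+dnl #	], [truncate_setsize], [mm/truncate.c],
+dnl #	    [AC_MSG_RESULT(yes)], [AC_MSG_RESULT(no)])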
+dnl # +AC_DEFUN([ZFS_LINUX_TRY_COMPILE_SYMBOL], [ + ZFS_LINUX_TRY_COMPILE([$1], [$2], [rc=0], [rc=1]) + if test $rc -ne 0; then : + $6 + else + if test "x$enable_linux_builtin" != xyes; then + ZFS_CHECK_SYMBOL_EXPORT([$3], [$4], [rc=0], [rc=1]) + fi + if test $rc -ne 0; then : + $6 + else : + $5 + fi + fi +]) + +dnl # +dnl # ZFS_LINUX_TRY_COMPILE_HEADER +dnl # like ZFS_LINUX_TRY_COMPILE, except the contents conftest.h are +dnl # provided via the fifth parameter +dnl # +AC_DEFUN([ZFS_LINUX_TRY_COMPILE_HEADER], [ + ZFS_LINUX_COMPILE_IFELSE( + [ZFS_LINUX_TEST_PROGRAM([[$1]], [[$2]])], + [test -f build/conftest/conftest.ko], + [$3], [$4], [$5]) +]) diff --git a/config/lib-ld.m4 b/config/lib-ld.m4 new file mode 100644 index 000000000000..a18719630d59 --- /dev/null +++ b/config/lib-ld.m4 @@ -0,0 +1,168 @@ +# lib-ld.m4 serial 9 +dnl Copyright (C) 1996-2003, 2009-2019 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl Subroutines of libtool.m4, +dnl with replacements s/_*LT_PATH/AC_LIB_PROG/ and s/lt_/acl_/ to avoid +dnl collision with libtool.m4. + +dnl From libtool-2.4. Sets the variable with_gnu_ld to yes or no. +AC_DEFUN([AC_LIB_PROG_LD_GNU], +[AC_CACHE_CHECK([if the linker ($LD) is GNU ld], [acl_cv_prog_gnu_ld], +[# I'd rather use --version here, but apparently some GNU lds only accept -v. +case `$LD -v 2>&1 /dev/null 2>&1 \ + && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 \ + || PATH_SEPARATOR=';' + } +fi + +if test -n "$LD"; then + AC_MSG_CHECKING([for ld]) +elif test "$GCC" = yes; then + AC_MSG_CHECKING([for ld used by $CC]) +elif test "$with_gnu_ld" = yes; then + AC_MSG_CHECKING([for GNU ld]) +else + AC_MSG_CHECKING([for non-GNU ld]) +fi +if test -n "$LD"; then + # Let the user override the test with a path. + : +else + AC_CACHE_VAL([acl_cv_path_LD], + [ + acl_cv_path_LD= # Final result of this test + ac_prog=ld # Program to search in $PATH + if test "$GCC" = yes; then + # Check if gcc -print-prog-name=ld gives a path. + case $host in + *-*-mingw*) + # gcc leaves a trailing carriage return which upsets mingw + acl_output=`($CC -print-prog-name=ld) 2>&5 | tr -d '\015'` ;; + *) + acl_output=`($CC -print-prog-name=ld) 2>&5` ;; + esac + case $acl_output in + # Accept absolute paths. + [[\\/]]* | ?:[[\\/]]*) + re_direlt='/[[^/]][[^/]]*/\.\./' + # Canonicalize the pathname of ld + acl_output=`echo "$acl_output" | sed 's%\\\\%/%g'` + while echo "$acl_output" | grep "$re_direlt" > /dev/null 2>&1; do + acl_output=`echo $acl_output | sed "s%$re_direlt%/%"` + done + # Got the pathname. No search in PATH is needed. + acl_cv_path_LD="$acl_output" + ac_prog= + ;; + "") + # If it fails, then pretend we aren't using GCC. + ;; + *) + # If it is relative, then search for the first ld in PATH. + with_gnu_ld=unknown + ;; + esac + fi + if test -n "$ac_prog"; then + # Search for $ac_prog in $PATH. + acl_save_ifs="$IFS"; IFS=$PATH_SEPARATOR + for ac_dir in $PATH; do + IFS="$acl_save_ifs" + test -z "$ac_dir" && ac_dir=. + if test -f "$ac_dir/$ac_prog" || test -f "$ac_dir/$ac_prog$ac_exeext"; then + acl_cv_path_LD="$ac_dir/$ac_prog" + # Check to see if the program is GNU ld. I'd rather use --version, + # but apparently some variants of GNU ld only accept -v. + # Break only if it was the GNU/non-GNU ld that we prefer. + case `"$acl_cv_path_LD" -v 2>&1 conftest.sh + . 
./conftest.sh + rm -f ./conftest.sh + acl_cv_rpath=done + ]) + wl="$acl_cv_wl" + acl_libext="$acl_cv_libext" + acl_shlibext="$acl_cv_shlibext" + acl_libname_spec="$acl_cv_libname_spec" + acl_library_names_spec="$acl_cv_library_names_spec" + acl_hardcode_libdir_flag_spec="$acl_cv_hardcode_libdir_flag_spec" + acl_hardcode_libdir_separator="$acl_cv_hardcode_libdir_separator" + acl_hardcode_direct="$acl_cv_hardcode_direct" + acl_hardcode_minus_L="$acl_cv_hardcode_minus_L" + dnl Determine whether the user wants rpath handling at all. + AC_ARG_ENABLE([rpath], + [ --disable-rpath do not hardcode runtime library paths], + :, enable_rpath=yes) +]) + +dnl AC_LIB_FROMPACKAGE(name, package) +dnl declares that libname comes from the given package. The configure file +dnl will then not have a --with-libname-prefix option but a +dnl --with-package-prefix option. Several libraries can come from the same +dnl package. This declaration must occur before an AC_LIB_LINKFLAGS or similar +dnl macro call that searches for libname. +AC_DEFUN([AC_LIB_FROMPACKAGE], +[ + pushdef([NAME],[m4_translit([$1],[abcdefghijklmnopqrstuvwxyz./+-], + [ABCDEFGHIJKLMNOPQRSTUVWXYZ____])]) + define([acl_frompackage_]NAME, [$2]) + popdef([NAME]) + pushdef([PACK],[$2]) + pushdef([PACKUP],[m4_translit(PACK,[abcdefghijklmnopqrstuvwxyz./+-], + [ABCDEFGHIJKLMNOPQRSTUVWXYZ____])]) + define([acl_libsinpackage_]PACKUP, + m4_ifdef([acl_libsinpackage_]PACKUP, [m4_defn([acl_libsinpackage_]PACKUP)[, ]],)[lib$1]) + popdef([PACKUP]) + popdef([PACK]) +]) + +dnl AC_LIB_LINKFLAGS_BODY(name [, dependencies]) searches for libname and +dnl the libraries corresponding to explicit and implicit dependencies. +dnl Sets the LIB${NAME}, LTLIB${NAME} and INC${NAME} variables. +dnl Also, sets the LIB${NAME}_PREFIX variable to nonempty if libname was found +dnl in ${LIB${NAME}_PREFIX}/$acl_libdirstem. +AC_DEFUN([AC_LIB_LINKFLAGS_BODY], +[ + AC_REQUIRE([AC_LIB_PREPARE_MULTILIB]) + pushdef([NAME],[m4_translit([$1],[abcdefghijklmnopqrstuvwxyz./+-], + [ABCDEFGHIJKLMNOPQRSTUVWXYZ____])]) + pushdef([PACK],[m4_ifdef([acl_frompackage_]NAME, [acl_frompackage_]NAME, lib[$1])]) + pushdef([PACKUP],[m4_translit(PACK,[abcdefghijklmnopqrstuvwxyz./+-], + [ABCDEFGHIJKLMNOPQRSTUVWXYZ____])]) + pushdef([PACKLIBS],[m4_ifdef([acl_frompackage_]NAME, [acl_libsinpackage_]PACKUP, lib[$1])]) + dnl By default, look in $includedir and $libdir. + use_additional=yes + AC_LIB_WITH_FINAL_PREFIX([ + eval additional_includedir=\"$includedir\" + eval additional_libdir=\"$libdir\" + ]) + AC_ARG_WITH(PACK[-prefix], +[[ --with-]]PACK[[-prefix[=DIR] search for ]PACKLIBS[ in DIR/include and DIR/lib + --without-]]PACK[[-prefix don't search for ]PACKLIBS[ in includedir and libdir]], +[ + if test "X$withval" = "Xno"; then + use_additional=no + else + if test "X$withval" = "X"; then + AC_LIB_WITH_FINAL_PREFIX([ + eval additional_includedir=\"$includedir\" + eval additional_libdir=\"$libdir\" + ]) + else + additional_includedir="$withval/include" + additional_libdir="$withval/$acl_libdirstem" + if test "$acl_libdirstem2" != "$acl_libdirstem" \ + && test ! -d "$withval/$acl_libdirstem"; then + additional_libdir="$withval/$acl_libdirstem2" + fi + fi + fi +]) + dnl Search the library and its dependencies in $additional_libdir and + dnl $LDFLAGS. Using breadth-first-search. + LIB[]NAME= + LTLIB[]NAME= + INC[]NAME= + LIB[]NAME[]_PREFIX= + dnl HAVE_LIB${NAME} is an indicator that LIB${NAME}, LTLIB${NAME} have been + dnl computed. So it has to be reset here. 
+ HAVE_LIB[]NAME= + rpathdirs= + ltrpathdirs= + names_already_handled= + names_next_round='$1 $2' + while test -n "$names_next_round"; do + names_this_round="$names_next_round" + names_next_round= + for name in $names_this_round; do + already_handled= + for n in $names_already_handled; do + if test "$n" = "$name"; then + already_handled=yes + break + fi + done + if test -z "$already_handled"; then + names_already_handled="$names_already_handled $name" + dnl See if it was already located by an earlier AC_LIB_LINKFLAGS + dnl or AC_LIB_HAVE_LINKFLAGS call. + uppername=`echo "$name" | sed -e 'y|abcdefghijklmnopqrstuvwxyz./+-|ABCDEFGHIJKLMNOPQRSTUVWXYZ____|'` + eval value=\"\$HAVE_LIB$uppername\" + if test -n "$value"; then + if test "$value" = yes; then + eval value=\"\$LIB$uppername\" + test -z "$value" || LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$value" + eval value=\"\$LTLIB$uppername\" + test -z "$value" || LTLIB[]NAME="${LTLIB[]NAME}${LTLIB[]NAME:+ }$value" + else + dnl An earlier call to AC_LIB_HAVE_LINKFLAGS has determined + dnl that this library doesn't exist. So just drop it. + : + fi + else + dnl Search the library lib$name in $additional_libdir and $LDFLAGS + dnl and the already constructed $LIBNAME/$LTLIBNAME. + found_dir= + found_la= + found_so= + found_a= + eval libname=\"$acl_libname_spec\" # typically: libname=lib$name + if test -n "$acl_shlibext"; then + shrext=".$acl_shlibext" # typically: shrext=.so + else + shrext= + fi + if test $use_additional = yes; then + dir="$additional_libdir" + dnl The same code as in the loop below: + dnl First look for a shared library. + if test -n "$acl_shlibext"; then + if test -f "$dir/$libname$shrext"; then + found_dir="$dir" + found_so="$dir/$libname$shrext" + else + if test "$acl_library_names_spec" = '$libname$shrext$versuffix'; then + ver=`(cd "$dir" && \ + for f in "$libname$shrext".*; do echo "$f"; done \ + | sed -e "s,^$libname$shrext\\\\.,," \ + | sort -t '.' -n -r -k1,1 -k2,2 -k3,3 -k4,4 -k5,5 \ + | sed 1q ) 2>/dev/null` + if test -n "$ver" && test -f "$dir/$libname$shrext.$ver"; then + found_dir="$dir" + found_so="$dir/$libname$shrext.$ver" + fi + else + eval library_names=\"$acl_library_names_spec\" + for f in $library_names; do + if test -f "$dir/$f"; then + found_dir="$dir" + found_so="$dir/$f" + break + fi + done + fi + fi + fi + dnl Then look for a static library. + if test "X$found_dir" = "X"; then + if test -f "$dir/$libname.$acl_libext"; then + found_dir="$dir" + found_a="$dir/$libname.$acl_libext" + fi + fi + if test "X$found_dir" != "X"; then + if test -f "$dir/$libname.la"; then + found_la="$dir/$libname.la" + fi + fi + fi + if test "X$found_dir" = "X"; then + for x in $LDFLAGS $LTLIB[]NAME; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + case "$x" in + -L*) + dir=`echo "X$x" | sed -e 's/^X-L//'` + dnl First look for a shared library. + if test -n "$acl_shlibext"; then + if test -f "$dir/$libname$shrext"; then + found_dir="$dir" + found_so="$dir/$libname$shrext" + else + if test "$acl_library_names_spec" = '$libname$shrext$versuffix'; then + ver=`(cd "$dir" && \ + for f in "$libname$shrext".*; do echo "$f"; done \ + | sed -e "s,^$libname$shrext\\\\.,," \ + | sort -t '.' 
-n -r -k1,1 -k2,2 -k3,3 -k4,4 -k5,5 \ + | sed 1q ) 2>/dev/null` + if test -n "$ver" && test -f "$dir/$libname$shrext.$ver"; then + found_dir="$dir" + found_so="$dir/$libname$shrext.$ver" + fi + else + eval library_names=\"$acl_library_names_spec\" + for f in $library_names; do + if test -f "$dir/$f"; then + found_dir="$dir" + found_so="$dir/$f" + break + fi + done + fi + fi + fi + dnl Then look for a static library. + if test "X$found_dir" = "X"; then + if test -f "$dir/$libname.$acl_libext"; then + found_dir="$dir" + found_a="$dir/$libname.$acl_libext" + fi + fi + if test "X$found_dir" != "X"; then + if test -f "$dir/$libname.la"; then + found_la="$dir/$libname.la" + fi + fi + ;; + esac + if test "X$found_dir" != "X"; then + break + fi + done + fi + if test "X$found_dir" != "X"; then + dnl Found the library. + LTLIB[]NAME="${LTLIB[]NAME}${LTLIB[]NAME:+ }-L$found_dir -l$name" + if test "X$found_so" != "X"; then + dnl Linking with a shared library. We attempt to hardcode its + dnl directory into the executable's runpath, unless it's the + dnl standard /usr/lib. + if test "$enable_rpath" = no \ + || test "X$found_dir" = "X/usr/$acl_libdirstem" \ + || test "X$found_dir" = "X/usr/$acl_libdirstem2"; then + dnl No hardcoding is needed. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$found_so" + else + dnl Use an explicit option to hardcode DIR into the resulting + dnl binary. + dnl Potentially add DIR to ltrpathdirs. + dnl The ltrpathdirs will be appended to $LTLIBNAME at the end. + haveit= + for x in $ltrpathdirs; do + if test "X$x" = "X$found_dir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + ltrpathdirs="$ltrpathdirs $found_dir" + fi + dnl The hardcoding into $LIBNAME is system dependent. + if test "$acl_hardcode_direct" = yes; then + dnl Using DIR/libNAME.so during linking hardcodes DIR into the + dnl resulting binary. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$found_so" + else + if test -n "$acl_hardcode_libdir_flag_spec" && test "$acl_hardcode_minus_L" = no; then + dnl Use an explicit option to hardcode DIR into the resulting + dnl binary. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$found_so" + dnl Potentially add DIR to rpathdirs. + dnl The rpathdirs will be appended to $LIBNAME at the end. + haveit= + for x in $rpathdirs; do + if test "X$x" = "X$found_dir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + rpathdirs="$rpathdirs $found_dir" + fi + else + dnl Rely on "-L$found_dir". + dnl But don't add it if it's already contained in the LDFLAGS + dnl or the already constructed $LIBNAME + haveit= + for x in $LDFLAGS $LIB[]NAME; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X-L$found_dir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }-L$found_dir" + fi + if test "$acl_hardcode_minus_L" != no; then + dnl FIXME: Not sure whether we should use + dnl "-L$found_dir -l$name" or "-L$found_dir $found_so" + dnl here. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$found_so" + else + dnl We cannot use $acl_hardcode_runpath_var and LD_RUN_PATH + dnl here, because this doesn't fit in flags passed to the + dnl compiler. So give up. No hardcoding. This affects only + dnl very old systems. + dnl FIXME: Not sure whether we should use + dnl "-L$found_dir -l$name" or "-L$found_dir $found_so" + dnl here. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }-l$name" + fi + fi + fi + fi + else + if test "X$found_a" != "X"; then + dnl Linking with a static library. 
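+            dnl The archive is appended by its full pathname (for example
+            dnl a hypothetical /opt/foo/lib/libfoo.a), so no separate
+            dnl -L/-l pair is needed for it.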
+ LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$found_a" + else + dnl We shouldn't come here, but anyway it's good to have a + dnl fallback. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }-L$found_dir -l$name" + fi + fi + dnl Assume the include files are nearby. + additional_includedir= + case "$found_dir" in + */$acl_libdirstem | */$acl_libdirstem/) + basedir=`echo "X$found_dir" | sed -e 's,^X,,' -e "s,/$acl_libdirstem/"'*$,,'` + if test "$name" = '$1'; then + LIB[]NAME[]_PREFIX="$basedir" + fi + additional_includedir="$basedir/include" + ;; + */$acl_libdirstem2 | */$acl_libdirstem2/) + basedir=`echo "X$found_dir" | sed -e 's,^X,,' -e "s,/$acl_libdirstem2/"'*$,,'` + if test "$name" = '$1'; then + LIB[]NAME[]_PREFIX="$basedir" + fi + additional_includedir="$basedir/include" + ;; + esac + if test "X$additional_includedir" != "X"; then + dnl Potentially add $additional_includedir to $INCNAME. + dnl But don't add it + dnl 1. if it's the standard /usr/include, + dnl 2. if it's /usr/local/include and we are using GCC on Linux, + dnl 3. if it's already present in $CPPFLAGS or the already + dnl constructed $INCNAME, + dnl 4. if it doesn't exist as a directory. + if test "X$additional_includedir" != "X/usr/include"; then + haveit= + if test "X$additional_includedir" = "X/usr/local/include"; then + if test -n "$GCC"; then + case $host_os in + linux* | gnu* | k*bsd*-gnu) haveit=yes;; + esac + fi + fi + if test -z "$haveit"; then + for x in $CPPFLAGS $INC[]NAME; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X-I$additional_includedir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + if test -d "$additional_includedir"; then + dnl Really add $additional_includedir to $INCNAME. + INC[]NAME="${INC[]NAME}${INC[]NAME:+ }-I$additional_includedir" + fi + fi + fi + fi + fi + dnl Look for dependencies. + if test -n "$found_la"; then + dnl Read the .la file. It defines the variables + dnl dlname, library_names, old_library, dependency_libs, current, + dnl age, revision, installed, dlopen, dlpreopen, libdir. + save_libdir="$libdir" + case "$found_la" in + */* | *\\*) . "$found_la" ;; + *) . "./$found_la" ;; + esac + libdir="$save_libdir" + dnl We use only dependency_libs. + for dep in $dependency_libs; do + case "$dep" in + -L*) + additional_libdir=`echo "X$dep" | sed -e 's/^X-L//'` + dnl Potentially add $additional_libdir to $LIBNAME and $LTLIBNAME. + dnl But don't add it + dnl 1. if it's the standard /usr/lib, + dnl 2. if it's /usr/local/lib and we are using GCC on Linux, + dnl 3. if it's already present in $LDFLAGS or the already + dnl constructed $LIBNAME, + dnl 4. if it doesn't exist as a directory. + if test "X$additional_libdir" != "X/usr/$acl_libdirstem" \ + && test "X$additional_libdir" != "X/usr/$acl_libdirstem2"; then + haveit= + if test "X$additional_libdir" = "X/usr/local/$acl_libdirstem" \ + || test "X$additional_libdir" = "X/usr/local/$acl_libdirstem2"; then + if test -n "$GCC"; then + case $host_os in + linux* | gnu* | k*bsd*-gnu) haveit=yes;; + esac + fi + fi + if test -z "$haveit"; then + haveit= + for x in $LDFLAGS $LIB[]NAME; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X-L$additional_libdir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + if test -d "$additional_libdir"; then + dnl Really add $additional_libdir to $LIBNAME. 
+ LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }-L$additional_libdir" + fi + fi + haveit= + for x in $LDFLAGS $LTLIB[]NAME; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X-L$additional_libdir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + if test -d "$additional_libdir"; then + dnl Really add $additional_libdir to $LTLIBNAME. + LTLIB[]NAME="${LTLIB[]NAME}${LTLIB[]NAME:+ }-L$additional_libdir" + fi + fi + fi + fi + ;; + -R*) + dir=`echo "X$dep" | sed -e 's/^X-R//'` + if test "$enable_rpath" != no; then + dnl Potentially add DIR to rpathdirs. + dnl The rpathdirs will be appended to $LIBNAME at the end. + haveit= + for x in $rpathdirs; do + if test "X$x" = "X$dir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + rpathdirs="$rpathdirs $dir" + fi + dnl Potentially add DIR to ltrpathdirs. + dnl The ltrpathdirs will be appended to $LTLIBNAME at the end. + haveit= + for x in $ltrpathdirs; do + if test "X$x" = "X$dir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + ltrpathdirs="$ltrpathdirs $dir" + fi + fi + ;; + -l*) + dnl Handle this in the next round. + names_next_round="$names_next_round "`echo "X$dep" | sed -e 's/^X-l//'` + ;; + *.la) + dnl Handle this in the next round. Throw away the .la's + dnl directory; it is already contained in a preceding -L + dnl option. + names_next_round="$names_next_round "`echo "X$dep" | sed -e 's,^X.*/,,' -e 's,^lib,,' -e 's,\.la$,,'` + ;; + *) + dnl Most likely an immediate library name. + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$dep" + LTLIB[]NAME="${LTLIB[]NAME}${LTLIB[]NAME:+ }$dep" + ;; + esac + done + fi + else + dnl Didn't find the library; assume it is in the system directories + dnl known to the linker and runtime loader. (All the system + dnl directories known to the linker should also be known to the + dnl runtime loader, otherwise the system is severely misconfigured.) + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }-l$name" + LTLIB[]NAME="${LTLIB[]NAME}${LTLIB[]NAME:+ }-l$name" + fi + fi + fi + done + done + if test "X$rpathdirs" != "X"; then + if test -n "$acl_hardcode_libdir_separator"; then + dnl Weird platform: only the last -rpath option counts, the user must + dnl pass all path elements in one option. We can arrange that for a + dnl single library, but not when more than one $LIBNAMEs are used. + alldirs= + for found_dir in $rpathdirs; do + alldirs="${alldirs}${alldirs:+$acl_hardcode_libdir_separator}$found_dir" + done + dnl Note: acl_hardcode_libdir_flag_spec uses $libdir and $wl. + acl_save_libdir="$libdir" + libdir="$alldirs" + eval flag=\"$acl_hardcode_libdir_flag_spec\" + libdir="$acl_save_libdir" + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$flag" + else + dnl The -rpath options are cumulative. + for found_dir in $rpathdirs; do + acl_save_libdir="$libdir" + libdir="$found_dir" + eval flag=\"$acl_hardcode_libdir_flag_spec\" + libdir="$acl_save_libdir" + LIB[]NAME="${LIB[]NAME}${LIB[]NAME:+ }$flag" + done + fi + fi + if test "X$ltrpathdirs" != "X"; then + dnl When using libtool, the option that works for both libraries and + dnl executables is -R. The -R options are cumulative. + for found_dir in $ltrpathdirs; do + LTLIB[]NAME="${LTLIB[]NAME}${LTLIB[]NAME:+ }-R$found_dir" + done + fi + popdef([PACKLIBS]) + popdef([PACKUP]) + popdef([PACK]) + popdef([NAME]) +]) + +dnl AC_LIB_APPENDTOVAR(VAR, CONTENTS) appends the elements of CONTENTS to VAR, +dnl unless already present in VAR. 
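+dnl Example (directories hypothetical):
+dnl   AC_LIB_APPENDTOVAR([CPPFLAGS], [-I/opt/foo/include -I/opt/bar/include])
+dnl appends only those -I options that are not yet contained in $CPPFLAGS.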
+dnl Works only for CPPFLAGS, not for LIB* variables because that sometimes +dnl contains two or three consecutive elements that belong together. +AC_DEFUN([AC_LIB_APPENDTOVAR], +[ + for element in [$2]; do + haveit= + for x in $[$1]; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X$element"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + [$1]="${[$1]}${[$1]:+ }$element" + fi + done +]) + +dnl For those cases where a variable contains several -L and -l options +dnl referring to unknown libraries and directories, this macro determines the +dnl necessary additional linker options for the runtime path. +dnl AC_LIB_LINKFLAGS_FROM_LIBS([LDADDVAR], [LIBSVALUE], [USE-LIBTOOL]) +dnl sets LDADDVAR to linker options needed together with LIBSVALUE. +dnl If USE-LIBTOOL evaluates to non-empty, linking with libtool is assumed, +dnl otherwise linking without libtool is assumed. +AC_DEFUN([AC_LIB_LINKFLAGS_FROM_LIBS], +[ + AC_REQUIRE([AC_LIB_RPATH]) + AC_REQUIRE([AC_LIB_PREPARE_MULTILIB]) + $1= + if test "$enable_rpath" != no; then + if test -n "$acl_hardcode_libdir_flag_spec" && test "$acl_hardcode_minus_L" = no; then + dnl Use an explicit option to hardcode directories into the resulting + dnl binary. + rpathdirs= + next= + for opt in $2; do + if test -n "$next"; then + dir="$next" + dnl No need to hardcode the standard /usr/lib. + if test "X$dir" != "X/usr/$acl_libdirstem" \ + && test "X$dir" != "X/usr/$acl_libdirstem2"; then + rpathdirs="$rpathdirs $dir" + fi + next= + else + case $opt in + -L) next=yes ;; + -L*) dir=`echo "X$opt" | sed -e 's,^X-L,,'` + dnl No need to hardcode the standard /usr/lib. + if test "X$dir" != "X/usr/$acl_libdirstem" \ + && test "X$dir" != "X/usr/$acl_libdirstem2"; then + rpathdirs="$rpathdirs $dir" + fi + next= ;; + *) next= ;; + esac + fi + done + if test "X$rpathdirs" != "X"; then + if test -n ""$3""; then + dnl libtool is used for linking. Use -R options. + for dir in $rpathdirs; do + $1="${$1}${$1:+ }-R$dir" + done + else + dnl The linker is used for linking directly. + if test -n "$acl_hardcode_libdir_separator"; then + dnl Weird platform: only the last -rpath option counts, the user + dnl must pass all path elements in one option. + alldirs= + for dir in $rpathdirs; do + alldirs="${alldirs}${alldirs:+$acl_hardcode_libdir_separator}$dir" + done + acl_save_libdir="$libdir" + libdir="$alldirs" + eval flag=\"$acl_hardcode_libdir_flag_spec\" + libdir="$acl_save_libdir" + $1="$flag" + else + dnl The -rpath options are cumulative. + for dir in $rpathdirs; do + acl_save_libdir="$libdir" + libdir="$dir" + eval flag=\"$acl_hardcode_libdir_flag_spec\" + libdir="$acl_save_libdir" + $1="${$1}${$1:+ }$flag" + done + fi + fi + fi + fi + fi + AC_SUBST([$1]) +]) diff --git a/config/lib-prefix.m4 b/config/lib-prefix.m4 new file mode 100644 index 000000000000..f7db2371db45 --- /dev/null +++ b/config/lib-prefix.m4 @@ -0,0 +1,248 @@ +# lib-prefix.m4 serial 14 +dnl Copyright (C) 2001-2005, 2008-2019 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. + +dnl From Bruno Haible. + +dnl AC_LIB_PREFIX adds to the CPPFLAGS and LDFLAGS the flags that are needed +dnl to access previously installed libraries. The basic assumption is that +dnl packages should use other packages that are installed with the same +dnl --prefix option. 
This macro is not needed if only AC_LIB_LINKFLAGS is +dnl used to locate libraries, but is otherwise very convenient. +AC_DEFUN([AC_LIB_PREFIX], +[ + AC_BEFORE([$0], [AC_LIB_LINKFLAGS]) + AC_REQUIRE([AC_PROG_CC]) + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_REQUIRE([AC_LIB_PREPARE_MULTILIB]) + AC_REQUIRE([AC_LIB_PREPARE_PREFIX]) + dnl By default, look in $includedir and $libdir. + use_additional=yes + AC_LIB_WITH_FINAL_PREFIX([ + eval additional_includedir=\"$includedir\" + eval additional_libdir=\"$libdir\" + ]) + AC_ARG_WITH([lib-prefix], +[[ --with-lib-prefix[=DIR] search for libraries in DIR/include and DIR/lib + --without-lib-prefix don't search for libraries in includedir and libdir]], +[ + if test "X$withval" = "Xno"; then + use_additional=no + else + if test "X$withval" = "X"; then + AC_LIB_WITH_FINAL_PREFIX([ + eval additional_includedir=\"$includedir\" + eval additional_libdir=\"$libdir\" + ]) + else + additional_includedir="$withval/include" + additional_libdir="$withval/$acl_libdirstem" + fi + fi +]) + if test $use_additional = yes; then + dnl Potentially add $additional_includedir to $CPPFLAGS. + dnl But don't add it + dnl 1. if it's the standard /usr/include, + dnl 2. if it's already present in $CPPFLAGS, + dnl 3. if it's /usr/local/include and we are using GCC on Linux, + dnl 4. if it doesn't exist as a directory. + if test "X$additional_includedir" != "X/usr/include"; then + haveit= + for x in $CPPFLAGS; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X-I$additional_includedir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + if test "X$additional_includedir" = "X/usr/local/include"; then + if test -n "$GCC"; then + case $host_os in + linux* | gnu* | k*bsd*-gnu) haveit=yes;; + esac + fi + fi + if test -z "$haveit"; then + if test -d "$additional_includedir"; then + dnl Really add $additional_includedir to $CPPFLAGS. + CPPFLAGS="${CPPFLAGS}${CPPFLAGS:+ }-I$additional_includedir" + fi + fi + fi + fi + dnl Potentially add $additional_libdir to $LDFLAGS. + dnl But don't add it + dnl 1. if it's the standard /usr/lib, + dnl 2. if it's already present in $LDFLAGS, + dnl 3. if it's /usr/local/lib and we are using GCC on Linux, + dnl 4. if it doesn't exist as a directory. + if test "X$additional_libdir" != "X/usr/$acl_libdirstem"; then + haveit= + for x in $LDFLAGS; do + AC_LIB_WITH_FINAL_PREFIX([eval x=\"$x\"]) + if test "X$x" = "X-L$additional_libdir"; then + haveit=yes + break + fi + done + if test -z "$haveit"; then + if test "X$additional_libdir" = "X/usr/local/$acl_libdirstem"; then + if test -n "$GCC"; then + case $host_os in + linux*) haveit=yes;; + esac + fi + fi + if test -z "$haveit"; then + if test -d "$additional_libdir"; then + dnl Really add $additional_libdir to $LDFLAGS. + LDFLAGS="${LDFLAGS}${LDFLAGS:+ }-L$additional_libdir" + fi + fi + fi + fi + fi +]) + +dnl AC_LIB_PREPARE_PREFIX creates variables acl_final_prefix, +dnl acl_final_exec_prefix, containing the values to which $prefix and +dnl $exec_prefix will expand at the end of the configure script. +AC_DEFUN([AC_LIB_PREPARE_PREFIX], +[ + dnl Unfortunately, prefix and exec_prefix get only finally determined + dnl at the end of configure. 
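+  dnl For example, under the stock defaults ($prefix = NONE, $exec_prefix =
+  dnl NONE), acl_final_prefix becomes $ac_default_prefix (typically
+  dnl /usr/local) and acl_final_exec_prefix evaluates to the same directory.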
+ if test "X$prefix" = "XNONE"; then + acl_final_prefix="$ac_default_prefix" + else + acl_final_prefix="$prefix" + fi + if test "X$exec_prefix" = "XNONE"; then + acl_final_exec_prefix='${prefix}' + else + acl_final_exec_prefix="$exec_prefix" + fi + acl_save_prefix="$prefix" + prefix="$acl_final_prefix" + eval acl_final_exec_prefix=\"$acl_final_exec_prefix\" + prefix="$acl_save_prefix" +]) + +dnl AC_LIB_WITH_FINAL_PREFIX([statement]) evaluates statement, with the +dnl variables prefix and exec_prefix bound to the values they will have +dnl at the end of the configure script. +AC_DEFUN([AC_LIB_WITH_FINAL_PREFIX], +[ + acl_save_prefix="$prefix" + prefix="$acl_final_prefix" + acl_save_exec_prefix="$exec_prefix" + exec_prefix="$acl_final_exec_prefix" + $1 + exec_prefix="$acl_save_exec_prefix" + prefix="$acl_save_prefix" +]) + +dnl AC_LIB_PREPARE_MULTILIB creates +dnl - a variable acl_libdirstem, containing the basename of the libdir, either +dnl "lib" or "lib64" or "lib/64", +dnl - a variable acl_libdirstem2, as a secondary possible value for +dnl acl_libdirstem, either the same as acl_libdirstem or "lib/sparcv9" or +dnl "lib/amd64". +AC_DEFUN([AC_LIB_PREPARE_MULTILIB], +[ + dnl There is no formal standard regarding lib and lib64. + dnl On glibc systems, the current practice is that on a system supporting + dnl 32-bit and 64-bit instruction sets or ABIs, 64-bit libraries go under + dnl $prefix/lib64 and 32-bit libraries go under $prefix/lib. We determine + dnl the compiler's default mode by looking at the compiler's library search + dnl path. If at least one of its elements ends in /lib64 or points to a + dnl directory whose absolute pathname ends in /lib64, we assume a 64-bit ABI. + dnl Otherwise we use the default, namely "lib". + dnl On Solaris systems, the current practice is that on a system supporting + dnl 32-bit and 64-bit instruction sets or ABIs, 64-bit libraries go under + dnl $prefix/lib/64 (which is a symlink to either $prefix/lib/sparcv9 or + dnl $prefix/lib/amd64) and 32-bit libraries go under $prefix/lib. + AC_REQUIRE([AC_CANONICAL_HOST]) + AC_REQUIRE([gl_HOST_CPU_C_ABI_32BIT]) + + case "$host_os" in + solaris*) + AC_CACHE_CHECK([for 64-bit host], [gl_cv_solaris_64bit], + [AC_COMPILE_IFELSE( + [AC_LANG_SOURCE( + [[#ifdef _LP64 + int ok; + #else + error fail + #endif + ]])], + [gl_cv_solaris_64bit=yes], + [gl_cv_solaris_64bit=no]) + ]);; + esac + + dnl Allow the user to override the result by setting acl_cv_libdirstems. + AC_CACHE_CHECK([for the common suffixes of directories in the library search path], + [acl_cv_libdirstems], + [acl_libdirstem=lib + acl_libdirstem2= + case "$host_os" in + solaris*) + dnl See Solaris 10 Software Developer Collection > Solaris 64-bit Developer's Guide > The Development Environment + dnl . + dnl "Portable Makefiles should refer to any library directories using the 64 symbolic link." + dnl But we want to recognize the sparcv9 or amd64 subdirectory also if the + dnl symlink is missing, so we set acl_libdirstem2 too. + if test $gl_cv_solaris_64bit = yes; then + acl_libdirstem=lib/64 + case "$host_cpu" in + sparc*) acl_libdirstem2=lib/sparcv9 ;; + i*86 | x86_64) acl_libdirstem2=lib/amd64 ;; + esac + fi + ;; + *) + dnl If $CC generates code for a 32-bit ABI, the libraries are + dnl surely under $prefix/lib, not $prefix/lib64. + if test "$HOST_CPU_C_ABI_32BIT" != yes; then + dnl The result is a property of the system. However, non-system + dnl compilers sometimes have odd library search paths. 
Therefore + dnl prefer asking /usr/bin/gcc, if available, rather than $CC. + searchpath=`(if test -f /usr/bin/gcc \ + && LC_ALL=C /usr/bin/gcc -print-search-dirs >/dev/null 2>/dev/null; then \ + LC_ALL=C /usr/bin/gcc -print-search-dirs; \ + else \ + LC_ALL=C $CC -print-search-dirs; \ + fi) 2>/dev/null \ + | sed -n -e 's,^libraries: ,,p' | sed -e 's,^=,,'` + if test -n "$searchpath"; then + acl_save_IFS="${IFS= }"; IFS=":" + for searchdir in $searchpath; do + if test -d "$searchdir"; then + case "$searchdir" in + */lib64/ | */lib64 ) acl_libdirstem=lib64 ;; + */../ | */.. ) + # Better ignore directories of this form. They are misleading. + ;; + *) searchdir=`cd "$searchdir" && pwd` + case "$searchdir" in + */lib64 ) acl_libdirstem=lib64 ;; + esac ;; + esac + fi + done + IFS="$acl_save_IFS" + fi + fi + ;; + esac + test -n "$acl_libdirstem2" || acl_libdirstem2="$acl_libdirstem" + acl_cv_libdirstems="$acl_libdirstem,$acl_libdirstem2" + ]) + # Decompose acl_cv_libdirstems into acl_libdirstem and acl_libdirstem2. + acl_libdirstem=`echo "$acl_cv_libdirstems" | sed -e 's/,.*//'` + acl_libdirstem2=`echo "$acl_cv_libdirstems" | sed -e '/,/s/.*,//'` +]) diff --git a/config/mount-helper.m4 b/config/mount-helper.m4 new file mode 100644 index 000000000000..0a6c7670840b --- /dev/null +++ b/config/mount-helper.m4 @@ -0,0 +1,8 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_MOUNT_HELPER], [ + AC_ARG_WITH(mounthelperdir, + AC_HELP_STRING([--with-mounthelperdir=DIR], + [install mount.zfs in dir [[/sbin]]]), + mounthelperdir=$withval,mounthelperdir=/sbin) + + AC_SUBST(mounthelperdir) +]) diff --git a/config/nls.m4 b/config/nls.m4 new file mode 100644 index 000000000000..b62f61485700 --- /dev/null +++ b/config/nls.m4 @@ -0,0 +1,32 @@ +# nls.m4 serial 5 (gettext-0.18) +dnl Copyright (C) 1995-2003, 2005-2006, 2008-2014, 2016, 2019 Free Software +dnl Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. +dnl +dnl This file can be used in projects which are not available under +dnl the GNU General Public License or the GNU Library General Public +dnl License but which still want to provide support for the GNU gettext +dnl functionality. +dnl Please note that the actual code of the GNU gettext library is covered +dnl by the GNU Library General Public License, and the rest of the GNU +dnl gettext package is covered by the GNU General Public License. +dnl They are *not* in the public domain. + +dnl Authors: +dnl Ulrich Drepper , 1995-2000. +dnl Bruno Haible , 2000-2003. + +AC_PREREQ([2.50]) + +AC_DEFUN([AM_NLS], +[ + AC_MSG_CHECKING([whether NLS is requested]) + dnl Default is enabled NLS + AC_ARG_ENABLE([nls], + [ --disable-nls do not use Native Language Support], + USE_NLS=$enableval, USE_NLS=yes) + AC_MSG_RESULT([$USE_NLS]) + AC_SUBST([USE_NLS]) +]) diff --git a/config/pkg.m4 b/config/pkg.m4 new file mode 100644 index 000000000000..f9075e56c87a --- /dev/null +++ b/config/pkg.m4 @@ -0,0 +1,275 @@ +# pkg.m4 - Macros to locate and utilise pkg-config. -*- Autoconf -*- +# serial 12 (pkg-config-0.29.2) + +dnl Copyright © 2004 Scott James Remnant . +dnl Copyright © 2012-2015 Dan Nicholson +dnl +dnl This program is free software; you can redistribute it and/or modify +dnl it under the terms of the GNU General Public License as published by +dnl the Free Software Foundation; either version 2 of the License, or +dnl (at your option) any later version. 
+dnl +dnl This program is distributed in the hope that it will be useful, but +dnl WITHOUT ANY WARRANTY; without even the implied warranty of +dnl MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +dnl General Public License for more details. +dnl +dnl You should have received a copy of the GNU General Public License +dnl along with this program; if not, write to the Free Software +dnl Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA +dnl 02111-1307, USA. +dnl +dnl As a special exception to the GNU General Public License, if you +dnl distribute this file as part of a program that contains a +dnl configuration script generated by Autoconf, you may include it under +dnl the same distribution terms that you use for the rest of that +dnl program. + +dnl PKG_PREREQ(MIN-VERSION) +dnl ----------------------- +dnl Since: 0.29 +dnl +dnl Verify that the version of the pkg-config macros are at least +dnl MIN-VERSION. Unlike PKG_PROG_PKG_CONFIG, which checks the user's +dnl installed version of pkg-config, this checks the developer's version +dnl of pkg.m4 when generating configure. +dnl +dnl To ensure that this macro is defined, also add: +dnl m4_ifndef([PKG_PREREQ], +dnl [m4_fatal([must install pkg-config 0.29 or later before running autoconf/autogen])]) +dnl +dnl See the "Since" comment for each macro you use to see what version +dnl of the macros you require. +m4_defun([PKG_PREREQ], +[m4_define([PKG_MACROS_VERSION], [0.29.2]) +m4_if(m4_version_compare(PKG_MACROS_VERSION, [$1]), -1, + [m4_fatal([pkg.m4 version $1 or higher is required but ]PKG_MACROS_VERSION[ found])]) +])dnl PKG_PREREQ + +dnl PKG_PROG_PKG_CONFIG([MIN-VERSION]) +dnl ---------------------------------- +dnl Since: 0.16 +dnl +dnl Search for the pkg-config tool and set the PKG_CONFIG variable to +dnl first found in the path. Checks that the version of pkg-config found +dnl is at least MIN-VERSION. If MIN-VERSION is not specified, 0.9.0 is +dnl used since that's the first version where most current features of +dnl pkg-config existed. +AC_DEFUN([PKG_PROG_PKG_CONFIG], +[m4_pattern_forbid([^_?PKG_[A-Z_]+$]) +m4_pattern_allow([^PKG_CONFIG(_(PATH|LIBDIR|SYSROOT_DIR|ALLOW_SYSTEM_(CFLAGS|LIBS)))?$]) +m4_pattern_allow([^PKG_CONFIG_(DISABLE_UNINSTALLED|TOP_BUILD_DIR|DEBUG_SPEW)$]) +AC_ARG_VAR([PKG_CONFIG], [path to pkg-config utility]) +AC_ARG_VAR([PKG_CONFIG_PATH], [directories to add to pkg-config's search path]) +AC_ARG_VAR([PKG_CONFIG_LIBDIR], [path overriding pkg-config's built-in search path]) + +if test "x$ac_cv_env_PKG_CONFIG_set" != "xset"; then + AC_PATH_TOOL([PKG_CONFIG], [pkg-config]) +fi +if test -n "$PKG_CONFIG"; then + _pkg_min_version=m4_default([$1], [0.9.0]) + AC_MSG_CHECKING([pkg-config is at least version $_pkg_min_version]) + if $PKG_CONFIG --atleast-pkgconfig-version $_pkg_min_version; then + AC_MSG_RESULT([yes]) + else + AC_MSG_RESULT([no]) + PKG_CONFIG="" + fi +fi[]dnl +])dnl PKG_PROG_PKG_CONFIG + +dnl PKG_CHECK_EXISTS(MODULES, [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +dnl ------------------------------------------------------------------- +dnl Since: 0.18 +dnl +dnl Check to see whether a particular set of modules exists. Similar to +dnl PKG_CHECK_MODULES(), but does not set variables or print errors. 
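+dnl
+dnl A typical guard looks like this (module name and version illustrative):
+dnl   PKG_CHECK_EXISTS([libcurl >= 7.20], [have_curl=yes], [have_curl=no])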
+dnl +dnl Please remember that m4 expands AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +dnl only at the first occurrence in configure.ac, so if the first place +dnl it's called might be skipped (such as if it is within an "if", you +dnl have to call PKG_CHECK_EXISTS manually +AC_DEFUN([PKG_CHECK_EXISTS], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +if test -n "$PKG_CONFIG" && \ + AC_RUN_LOG([$PKG_CONFIG --exists --print-errors "$1"]); then + m4_default([$2], [:]) +m4_ifvaln([$3], [else + $3])dnl +fi]) + +dnl _PKG_CONFIG([VARIABLE], [COMMAND], [MODULES]) +dnl --------------------------------------------- +dnl Internal wrapper calling pkg-config via PKG_CONFIG and setting +dnl pkg_failed based on the result. +m4_define([_PKG_CONFIG], +[if test -n "$$1"; then + pkg_cv_[]$1="$$1" + elif test -n "$PKG_CONFIG"; then + PKG_CHECK_EXISTS([$3], + [pkg_cv_[]$1=`$PKG_CONFIG --[]$2 "$3" 2>/dev/null` + test "x$?" != "x0" && pkg_failed=yes ], + [pkg_failed=yes]) + else + pkg_failed=untried +fi[]dnl +])dnl _PKG_CONFIG + +dnl _PKG_SHORT_ERRORS_SUPPORTED +dnl --------------------------- +dnl Internal check to see if pkg-config supports short errors. +AC_DEFUN([_PKG_SHORT_ERRORS_SUPPORTED], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG]) +if $PKG_CONFIG --atleast-pkgconfig-version 0.20; then + _pkg_short_errors_supported=yes +else + _pkg_short_errors_supported=no +fi[]dnl +])dnl _PKG_SHORT_ERRORS_SUPPORTED + + +dnl PKG_CHECK_MODULES(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +dnl [ACTION-IF-NOT-FOUND]) +dnl -------------------------------------------------------------- +dnl Since: 0.4.0 +dnl +dnl Note that if there is a possibility the first call to +dnl PKG_CHECK_MODULES might not happen, you should be sure to include an +dnl explicit call to PKG_PROG_PKG_CONFIG in your configure.ac +AC_DEFUN([PKG_CHECK_MODULES], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +AC_ARG_VAR([$1][_CFLAGS], [C compiler flags for $1, overriding pkg-config])dnl +AC_ARG_VAR([$1][_LIBS], [linker flags for $1, overriding pkg-config])dnl + +pkg_failed=no +AC_MSG_CHECKING([for $2]) + +_PKG_CONFIG([$1][_CFLAGS], [cflags], [$2]) +_PKG_CONFIG([$1][_LIBS], [libs], [$2]) + +m4_define([_PKG_TEXT], [Alternatively, you may set the environment variables $1[]_CFLAGS +and $1[]_LIBS to avoid the need to call pkg-config. +See the pkg-config man page for more details.]) + +if test $pkg_failed = yes; then + AC_MSG_RESULT([no]) + _PKG_SHORT_ERRORS_SUPPORTED + if test $_pkg_short_errors_supported = yes; then + $1[]_PKG_ERRORS=`$PKG_CONFIG --short-errors --print-errors --cflags --libs "$2" 2>&1` + else + $1[]_PKG_ERRORS=`$PKG_CONFIG --print-errors --cflags --libs "$2" 2>&1` + fi + # Put the nasty error message in config.log where it belongs + echo "$$1[]_PKG_ERRORS" >&AS_MESSAGE_LOG_FD + + m4_default([$4], [AC_MSG_ERROR( +[Package requirements ($2) were not met: + +$$1_PKG_ERRORS + +Consider adjusting the PKG_CONFIG_PATH environment variable if you +installed software in a non-standard prefix. + +_PKG_TEXT])[]dnl + ]) +elif test $pkg_failed = untried; then + AC_MSG_RESULT([no]) + m4_default([$4], [AC_MSG_FAILURE( +[The pkg-config script could not be found or is too old. Make sure it +is in your PATH or set the PKG_CONFIG environment variable to the full +path to pkg-config. 
+ +_PKG_TEXT + +To get pkg-config, see .])[]dnl + ]) +else + $1[]_CFLAGS=$pkg_cv_[]$1[]_CFLAGS + $1[]_LIBS=$pkg_cv_[]$1[]_LIBS + AC_MSG_RESULT([yes]) + $3 +fi[]dnl +])dnl PKG_CHECK_MODULES + + +dnl PKG_CHECK_MODULES_STATIC(VARIABLE-PREFIX, MODULES, [ACTION-IF-FOUND], +dnl [ACTION-IF-NOT-FOUND]) +dnl --------------------------------------------------------------------- +dnl Since: 0.29 +dnl +dnl Checks for existence of MODULES and gathers its build flags with +dnl static libraries enabled. Sets VARIABLE-PREFIX_CFLAGS from --cflags +dnl and VARIABLE-PREFIX_LIBS from --libs. +dnl +dnl Note that if there is a possibility the first call to +dnl PKG_CHECK_MODULES_STATIC might not happen, you should be sure to +dnl include an explicit call to PKG_PROG_PKG_CONFIG in your +dnl configure.ac. +AC_DEFUN([PKG_CHECK_MODULES_STATIC], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +_save_PKG_CONFIG=$PKG_CONFIG +PKG_CONFIG="$PKG_CONFIG --static" +PKG_CHECK_MODULES($@) +PKG_CONFIG=$_save_PKG_CONFIG[]dnl +])dnl PKG_CHECK_MODULES_STATIC + + +dnl PKG_INSTALLDIR([DIRECTORY]) +dnl ------------------------- +dnl Since: 0.27 +dnl +dnl Substitutes the variable pkgconfigdir as the location where a module +dnl should install pkg-config .pc files. By default the directory is +dnl $libdir/pkgconfig, but the default can be changed by passing +dnl DIRECTORY. The user can override through the --with-pkgconfigdir +dnl parameter. +AC_DEFUN([PKG_INSTALLDIR], +[m4_pushdef([pkg_default], [m4_default([$1], ['${libdir}/pkgconfig'])]) +m4_pushdef([pkg_description], + [pkg-config installation directory @<:@]pkg_default[@:>@]) +AC_ARG_WITH([pkgconfigdir], + [AS_HELP_STRING([--with-pkgconfigdir], pkg_description)],, + [with_pkgconfigdir=]pkg_default) +AC_SUBST([pkgconfigdir], [$with_pkgconfigdir]) +m4_popdef([pkg_default]) +m4_popdef([pkg_description]) +])dnl PKG_INSTALLDIR + + +dnl PKG_NOARCH_INSTALLDIR([DIRECTORY]) +dnl -------------------------------- +dnl Since: 0.27 +dnl +dnl Substitutes the variable noarch_pkgconfigdir as the location where a +dnl module should install arch-independent pkg-config .pc files. By +dnl default the directory is $datadir/pkgconfig, but the default can be +dnl changed by passing DIRECTORY. The user can override through the +dnl --with-noarch-pkgconfigdir parameter. +AC_DEFUN([PKG_NOARCH_INSTALLDIR], +[m4_pushdef([pkg_default], [m4_default([$1], ['${datadir}/pkgconfig'])]) +m4_pushdef([pkg_description], + [pkg-config arch-independent installation directory @<:@]pkg_default[@:>@]) +AC_ARG_WITH([noarch-pkgconfigdir], + [AS_HELP_STRING([--with-noarch-pkgconfigdir], pkg_description)],, + [with_noarch_pkgconfigdir=]pkg_default) +AC_SUBST([noarch_pkgconfigdir], [$with_noarch_pkgconfigdir]) +m4_popdef([pkg_default]) +m4_popdef([pkg_description]) +])dnl PKG_NOARCH_INSTALLDIR + + +dnl PKG_CHECK_VAR(VARIABLE, MODULE, CONFIG-VARIABLE, +dnl [ACTION-IF-FOUND], [ACTION-IF-NOT-FOUND]) +dnl ------------------------------------------- +dnl Since: 0.28 +dnl +dnl Retrieves the value of the pkg-config variable for the given module. 
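+dnl
+dnl For instance (variable and module names illustrative):
+dnl   PKG_CHECK_VAR([BASHCOMPDIR], [bash-completion], [completionsdir])
+dnl copies the output of "pkg-config --variable=completionsdir
+dnl bash-completion" into BASHCOMPDIR.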
+AC_DEFUN([PKG_CHECK_VAR], +[AC_REQUIRE([PKG_PROG_PKG_CONFIG])dnl +AC_ARG_VAR([$1], [value of $3 for $2, overriding pkg-config])dnl + +_PKG_CONFIG([$1], [variable="][$3]["], [$2]) +AS_VAR_COPY([$1], [pkg_cv_][$1]) + +AS_VAR_IF([$1], [""], [$5], [$4])dnl +])dnl PKG_CHECK_VAR diff --git a/config/po.m4 b/config/po.m4 new file mode 100644 index 000000000000..143792dba560 --- /dev/null +++ b/config/po.m4 @@ -0,0 +1,450 @@ +# po.m4 serial 30 (gettext-0.20) +dnl Copyright (C) 1995-2014, 2016, 2018-2019 Free Software Foundation, Inc. +dnl This file is free software; the Free Software Foundation +dnl gives unlimited permission to copy and/or distribute it, +dnl with or without modifications, as long as this notice is preserved. +dnl +dnl This file can be used in projects which are not available under +dnl the GNU General Public License or the GNU Library General Public +dnl License but which still want to provide support for the GNU gettext +dnl functionality. +dnl Please note that the actual code of the GNU gettext library is covered +dnl by the GNU Library General Public License, and the rest of the GNU +dnl gettext package is covered by the GNU General Public License. +dnl They are *not* in the public domain. + +dnl Authors: +dnl Ulrich Drepper , 1995-2000. +dnl Bruno Haible , 2000-2003. + +AC_PREREQ([2.60]) + +dnl Checks for all prerequisites of the po subdirectory. +AC_DEFUN([AM_PO_SUBDIRS], +[ + AC_REQUIRE([AC_PROG_MAKE_SET])dnl + AC_REQUIRE([AC_PROG_INSTALL])dnl + AC_REQUIRE([AC_PROG_MKDIR_P])dnl + AC_REQUIRE([AC_PROG_SED])dnl + AC_REQUIRE([AM_NLS])dnl + + dnl Release version of the gettext macros. This is used to ensure that + dnl the gettext macros and po/Makefile.in.in are in sync. + AC_SUBST([GETTEXT_MACRO_VERSION], [0.20]) + + dnl Perform the following tests also if --disable-nls has been given, + dnl because they are needed for "make dist" to work. + + dnl Search for GNU msgfmt in the PATH. + dnl The first test excludes Solaris msgfmt and early GNU msgfmt versions. + dnl The second test excludes FreeBSD msgfmt. + AM_PATH_PROG_WITH_TEST(MSGFMT, msgfmt, + [$ac_dir/$ac_word --statistics /dev/null >&]AS_MESSAGE_LOG_FD[ 2>&1 && + (if $ac_dir/$ac_word --statistics /dev/null 2>&1 >/dev/null | grep usage >/dev/null; then exit 1; else exit 0; fi)], + :) + AC_PATH_PROG([GMSGFMT], [gmsgfmt], [$MSGFMT]) + + dnl Test whether it is GNU msgfmt >= 0.15. +changequote(,)dnl + case `$GMSGFMT --version | sed 1q | sed -e 's,^[^0-9]*,,'` in + '' | 0.[0-9] | 0.[0-9].* | 0.1[0-4] | 0.1[0-4].*) GMSGFMT_015=: ;; + *) GMSGFMT_015=$GMSGFMT ;; + esac +changequote([,])dnl + AC_SUBST([GMSGFMT_015]) + + dnl Search for GNU xgettext 0.12 or newer in the PATH. + dnl The first test excludes Solaris xgettext and early GNU xgettext versions. + dnl The second test excludes FreeBSD xgettext. + AM_PATH_PROG_WITH_TEST(XGETTEXT, xgettext, + [$ac_dir/$ac_word --omit-header --copyright-holder= --msgid-bugs-address= /dev/null >&]AS_MESSAGE_LOG_FD[ 2>&1 && + (if $ac_dir/$ac_word --omit-header --copyright-holder= --msgid-bugs-address= /dev/null 2>&1 >/dev/null | grep usage >/dev/null; then exit 1; else exit 0; fi)], + :) + dnl Remove leftover from FreeBSD xgettext call. + rm -f messages.po + + dnl Test whether it is GNU xgettext >= 0.15. +changequote(,)dnl + case `$XGETTEXT --version | sed 1q | sed -e 's,^[^0-9]*,,'` in + '' | 0.[0-9] | 0.[0-9].* | 0.1[0-4] | 0.1[0-4].*) XGETTEXT_015=: ;; + *) XGETTEXT_015=$XGETTEXT ;; + esac +changequote([,])dnl + AC_SUBST([XGETTEXT_015]) + + dnl Search for GNU msgmerge 0.11 or newer in the PATH. 
+ AM_PATH_PROG_WITH_TEST(MSGMERGE, msgmerge, + [$ac_dir/$ac_word --update -q /dev/null /dev/null >&]AS_MESSAGE_LOG_FD[ 2>&1], :) + + dnl Test whether it is GNU msgmerge >= 0.20. + if LC_ALL=C $MSGMERGE --help | grep ' --for-msgfmt ' >/dev/null; then + MSGMERGE_FOR_MSGFMT_OPTION='--for-msgfmt' + else + dnl Test whether it is GNU msgmerge >= 0.12. + if LC_ALL=C $MSGMERGE --help | grep ' --no-fuzzy-matching ' >/dev/null; then + MSGMERGE_FOR_MSGFMT_OPTION='--no-fuzzy-matching --no-location --quiet' + else + dnl With these old versions, $(MSGMERGE) $(MSGMERGE_FOR_MSGFMT_OPTION) is + dnl slow. But this is not a big problem, as such old gettext versions are + dnl hardly in use any more. + MSGMERGE_FOR_MSGFMT_OPTION='--no-location --quiet' + fi + fi + AC_SUBST([MSGMERGE_FOR_MSGFMT_OPTION]) + + dnl Support for AM_XGETTEXT_OPTION. + test -n "${XGETTEXT_EXTRA_OPTIONS+set}" || XGETTEXT_EXTRA_OPTIONS= + AC_SUBST([XGETTEXT_EXTRA_OPTIONS]) + + AC_CONFIG_COMMANDS([po-directories], [[ + for ac_file in $CONFIG_FILES; do + # Support "outfile[:infile[:infile...]]" + case "$ac_file" in + *:*) ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;; + esac + # PO directories have a Makefile.in generated from Makefile.in.in. + case "$ac_file" in */Makefile.in) + # Adjust a relative srcdir. + ac_dir=`echo "$ac_file"|sed 's%/[^/][^/]*$%%'` + ac_dir_suffix=/`echo "$ac_dir"|sed 's%^\./%%'` + ac_dots=`echo "$ac_dir_suffix"|sed 's%/[^/]*%../%g'` + # In autoconf-2.13 it is called $ac_given_srcdir. + # In autoconf-2.50 it is called $srcdir. + test -n "$ac_given_srcdir" || ac_given_srcdir="$srcdir" + case "$ac_given_srcdir" in + .) top_srcdir=`echo $ac_dots|sed 's%/$%%'` ;; + /*) top_srcdir="$ac_given_srcdir" ;; + *) top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + # Treat a directory as a PO directory if and only if it has a + # POTFILES.in file. This allows packages to have multiple PO + # directories under different names or in different locations. + if test -f "$ac_given_srcdir/$ac_dir/POTFILES.in"; then + rm -f "$ac_dir/POTFILES" + test -n "$as_me" && echo "$as_me: creating $ac_dir/POTFILES" || echo "creating $ac_dir/POTFILES" + gt_tab=`printf '\t'` + cat "$ac_given_srcdir/$ac_dir/POTFILES.in" | sed -e "/^#/d" -e "/^[ ${gt_tab}]*\$/d" -e "s,.*, $top_srcdir/& \\\\," | sed -e "\$s/\(.*\) \\\\/\1/" > "$ac_dir/POTFILES" + POMAKEFILEDEPS="POTFILES.in" + # ALL_LINGUAS, POFILES, UPDATEPOFILES, DUMMYPOFILES, GMOFILES depend + # on $ac_dir but don't depend on user-specified configuration + # parameters. + if test -f "$ac_given_srcdir/$ac_dir/LINGUAS"; then + # The LINGUAS file contains the set of available languages. + if test -n "$OBSOLETE_ALL_LINGUAS"; then + test -n "$as_me" && echo "$as_me: setting ALL_LINGUAS in configure.in is obsolete" || echo "setting ALL_LINGUAS in configure.in is obsolete" + fi + ALL_LINGUAS=`sed -e "/^#/d" -e "s/#.*//" "$ac_given_srcdir/$ac_dir/LINGUAS"` + POMAKEFILEDEPS="$POMAKEFILEDEPS LINGUAS" + else + # The set of available languages was given in configure.in. + ALL_LINGUAS=$OBSOLETE_ALL_LINGUAS + fi + # Compute POFILES + # as $(foreach lang, $(ALL_LINGUAS), $(srcdir)/$(lang).po) + # Compute UPDATEPOFILES + # as $(foreach lang, $(ALL_LINGUAS), $(lang).po-update) + # Compute DUMMYPOFILES + # as $(foreach lang, $(ALL_LINGUAS), $(lang).nop) + # Compute GMOFILES + # as $(foreach lang, $(ALL_LINGUAS), $(srcdir)/$(lang).gmo) + case "$ac_given_srcdir" in + .) 
srcdirpre= ;; + *) srcdirpre='$(srcdir)/' ;; + esac + POFILES= + UPDATEPOFILES= + DUMMYPOFILES= + GMOFILES= + for lang in $ALL_LINGUAS; do + POFILES="$POFILES $srcdirpre$lang.po" + UPDATEPOFILES="$UPDATEPOFILES $lang.po-update" + DUMMYPOFILES="$DUMMYPOFILES $lang.nop" + GMOFILES="$GMOFILES $srcdirpre$lang.gmo" + done + # CATALOGS depends on both $ac_dir and the user's LINGUAS + # environment variable. + INST_LINGUAS= + if test -n "$ALL_LINGUAS"; then + for presentlang in $ALL_LINGUAS; do + useit=no + if test "%UNSET%" != "$LINGUAS"; then + desiredlanguages="$LINGUAS" + else + desiredlanguages="$ALL_LINGUAS" + fi + for desiredlang in $desiredlanguages; do + # Use the presentlang catalog if desiredlang is + # a. equal to presentlang, or + # b. a variant of presentlang (because in this case, + # presentlang can be used as a fallback for messages + # which are not translated in the desiredlang catalog). + case "$desiredlang" in + "$presentlang"*) useit=yes;; + esac + done + if test $useit = yes; then + INST_LINGUAS="$INST_LINGUAS $presentlang" + fi + done + fi + CATALOGS= + if test -n "$INST_LINGUAS"; then + for lang in $INST_LINGUAS; do + CATALOGS="$CATALOGS $lang.gmo" + done + fi + test -n "$as_me" && echo "$as_me: creating $ac_dir/Makefile" || echo "creating $ac_dir/Makefile" + sed -e "/^POTFILES =/r $ac_dir/POTFILES" -e "/^# Makevars/r $ac_given_srcdir/$ac_dir/Makevars" -e "s|@POFILES@|$POFILES|g" -e "s|@UPDATEPOFILES@|$UPDATEPOFILES|g" -e "s|@DUMMYPOFILES@|$DUMMYPOFILES|g" -e "s|@GMOFILES@|$GMOFILES|g" -e "s|@CATALOGS@|$CATALOGS|g" -e "s|@POMAKEFILEDEPS@|$POMAKEFILEDEPS|g" "$ac_dir/Makefile.in" > "$ac_dir/Makefile" + for f in "$ac_given_srcdir/$ac_dir"/Rules-*; do + if test -f "$f"; then + case "$f" in + *.orig | *.bak | *~) ;; + *) cat "$f" >> "$ac_dir/Makefile" ;; + esac + fi + done + fi + ;; + esac + done]], + [# Capture the value of obsolete ALL_LINGUAS because we need it to compute + # POFILES, UPDATEPOFILES, DUMMYPOFILES, GMOFILES, CATALOGS. + OBSOLETE_ALL_LINGUAS="$ALL_LINGUAS" + # Capture the value of LINGUAS because we need it to compute CATALOGS. + LINGUAS="${LINGUAS-%UNSET%}" + ]) +]) + +dnl Postprocesses a Makefile in a directory containing PO files. +AC_DEFUN([AM_POSTPROCESS_PO_MAKEFILE], +[ + # When this code is run, in config.status, two variables have already been + # set: + # - OBSOLETE_ALL_LINGUAS is the value of LINGUAS set in configure.in, + # - LINGUAS is the value of the environment variable LINGUAS at configure + # time. + +changequote(,)dnl + # Adjust a relative srcdir. + ac_dir=`echo "$ac_file"|sed 's%/[^/][^/]*$%%'` + ac_dir_suffix=/`echo "$ac_dir"|sed 's%^\./%%'` + ac_dots=`echo "$ac_dir_suffix"|sed 's%/[^/]*%../%g'` + # In autoconf-2.13 it is called $ac_given_srcdir. + # In autoconf-2.50 it is called $srcdir. + test -n "$ac_given_srcdir" || ac_given_srcdir="$srcdir" + case "$ac_given_srcdir" in + .) top_srcdir=`echo $ac_dots|sed 's%/$%%'` ;; + /*) top_srcdir="$ac_given_srcdir" ;; + *) top_srcdir="$ac_dots$ac_given_srcdir" ;; + esac + + # Find a way to echo strings without interpreting backslash. + if test "X`(echo '\t') 2>/dev/null`" = 'X\t'; then + gt_echo='echo' + else + if test "X`(printf '%s\n' '\t') 2>/dev/null`" = 'X\t'; then + gt_echo='printf %s\n' + else + echo_func () { + cat < "$ac_file.tmp" + tab=`printf '\t'` + if grep -l '@TCLCATALOGS@' "$ac_file" > /dev/null; then + # Add dependencies that cannot be formulated as a simple suffix rule. 
+ for lang in $ALL_LINGUAS; do + frobbedlang=`echo $lang | sed -e 's/\..*$//' -e 'y/ABCDEFGHIJKLMNOPQRSTUVWXYZ/abcdefghijklmnopqrstuvwxyz/'` + cat >> "$ac_file.tmp" < /dev/null; then + # Add dependencies that cannot be formulated as a simple suffix rule. + for lang in $ALL_LINGUAS; do + frobbedlang=`echo $lang | sed -e 's/_/-/g' -e 's/^sr-CS/sr-SP/' -e 's/@latin$/-Latn/' -e 's/@cyrillic$/-Cyrl/' -e 's/^sr-SP$/sr-SP-Latn/' -e 's/^uz-UZ$/uz-UZ-Latn/'` + cat >> "$ac_file.tmp" <> "$ac_file.tmp" <, 1996. + +AC_PREREQ([2.50]) + +# Search path for a program which passes the given test. + +dnl AM_PATH_PROG_WITH_TEST(VARIABLE, PROG-TO-CHECK-FOR, +dnl TEST-PERFORMED-ON-FOUND_PROGRAM [, VALUE-IF-NOT-FOUND [, PATH]]) +AC_DEFUN([AM_PATH_PROG_WITH_TEST], +[ +# Prepare PATH_SEPARATOR. +# The user is always right. +if test "${PATH_SEPARATOR+set}" != set; then + # Determine PATH_SEPARATOR by trying to find /bin/sh in a PATH which + # contains only /bin. Note that ksh looks also at the FPATH variable, + # so we have to set that as well for the test. + PATH_SEPARATOR=: + (PATH='/bin;/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 \ + && { (PATH='/bin:/bin'; FPATH=$PATH; sh -c :) >/dev/null 2>&1 \ + || PATH_SEPARATOR=';' + } +fi + +# Find out how to test for executable files. Don't use a zero-byte file, +# as systems may use methods other than mode bits to determine executability. +cat >conf$$.file <<_ASEOF +#! /bin/sh +exit 0 +_ASEOF +chmod +x conf$$.file +if test -x conf$$.file >/dev/null 2>&1; then + ac_executable_p="test -x" +else + ac_executable_p="test -f" +fi +rm -f conf$$.file + +# Extract the first word of "$2", so it can be a program name with args. +set dummy $2; ac_word=[$]2 +AC_MSG_CHECKING([for $ac_word]) +AC_CACHE_VAL([ac_cv_path_$1], +[case "[$]$1" in + [[\\/]]* | ?:[[\\/]]*) + ac_cv_path_$1="[$]$1" # Let the user override the test with a path. + ;; + *) + ac_save_IFS="$IFS"; IFS=$PATH_SEPARATOR + for ac_dir in ifelse([$5], , $PATH, [$5]); do + IFS="$ac_save_IFS" + test -z "$ac_dir" && ac_dir=. + for ac_exec_ext in '' $ac_executable_extensions; do + if $ac_executable_p "$ac_dir/$ac_word$ac_exec_ext"; then + echo "$as_me: trying $ac_dir/$ac_word..." >&AS_MESSAGE_LOG_FD + if [$3]; then + ac_cv_path_$1="$ac_dir/$ac_word$ac_exec_ext" + break 2 + fi + fi + done + done + IFS="$ac_save_IFS" +dnl If no 4th arg is given, leave the cache variable unset, +dnl so AC_PATH_PROGS will keep looking. +ifelse([$4], , , [ test -z "[$]ac_cv_path_$1" && ac_cv_path_$1="$4" +])dnl + ;; +esac])dnl +$1="$ac_cv_path_$1" +if test ifelse([$4], , [-n "[$]$1"], ["[$]$1" != "$4"]); then + AC_MSG_RESULT([$][$1]) +else + AC_MSG_RESULT([no]) +fi +AC_SUBST([$1])dnl +]) diff --git a/config/rpm.am b/config/rpm.am new file mode 100644 index 000000000000..9dd69ade333e --- /dev/null +++ b/config/rpm.am @@ -0,0 +1,93 @@ +############################################################################### +# Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. +# Copyright (C) 2007 The Regents of the University of California. +# Written by Brian Behlendorf . +############################################################################### +# Build targets for RPM packages. 
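+# For example, "make rpm-utils" first builds a source RPM from the dist
+# tarball (srpm-utils) and then rebuilds it into binary packages with
+# "rpmbuild --rebuild" inside a temporary build root (see rpm-common).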
+############################################################################### + +PHONY += srpm srpms srpm-kmod srpm-dkms srpm-utils +PHONY += rpm rpms rpm-kmod rpm-dkms rpm-utils +PHONY += srpm-common rpm-common rpm-local + +srpm-kmod srpm-dkms srpm-utils: dist + +srpm-kmod: + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}-kmod" \ + def='${SRPM_DEFINE_COMMON} ${SRPM_DEFINE_KMOD}' srpm-common + +srpm-dkms: + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}-dkms" \ + def='${SRPM_DEFINE_COMMON} ${SRPM_DEFINE_DKMS}' srpm-common + +srpm-utils: + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}" \ + def='${SRPM_DEFINE_COMMON} ${SRPM_DEFINE_UTIL}' srpm-common + +srpm: srpm-kmod srpm-dkms srpm-utils +srpms: srpm-kmod srpm-dkms srpm-utils + +rpm-kmod: srpm-kmod + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}-kmod" \ + def='${RPM_DEFINE_COMMON} ${RPM_DEFINE_KMOD}' rpm-common + +rpm-dkms: srpm-dkms + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}-dkms" \ + def='${RPM_DEFINE_COMMON} ${RPM_DEFINE_DKMS}' rpm-common + +rpm-utils: srpm-utils + $(MAKE) $(AM_MAKEFLAGS) pkg="${PACKAGE}" \ + def='${RPM_DEFINE_COMMON} ${RPM_DEFINE_UTIL}' rpm-common + +rpm: rpm-kmod rpm-dkms rpm-utils +rpms: rpm-kmod rpm-dkms rpm-utils + +rpm-local: + @(if test "${HAVE_RPMBUILD}" = "no"; then \ + echo -e "\n" \ + "*** Required util ${RPMBUILD} missing. Please install the\n" \ + "*** package for your distribution which provides ${RPMBUILD},\n" \ + "*** re-run configure, and try again.\n"; \ + exit 1; \ + fi; \ + mkdir -p $(rpmbuild)/TMP && \ + mkdir -p $(rpmbuild)/BUILD && \ + mkdir -p $(rpmbuild)/RPMS && \ + mkdir -p $(rpmbuild)/SRPMS && \ + mkdir -p $(rpmbuild)/SPECS && \ + cp ${RPM_SPEC_DIR}/$(rpmspec) $(rpmbuild)/SPECS && \ + mkdir -p $(rpmbuild)/SOURCES && \ + cp $(top_srcdir)/scripts/kmodtool $(rpmbuild)/SOURCES && \ + cp $(distdir).tar.gz $(rpmbuild)/SOURCES) + +srpm-common: + @(dist=`$(RPM) --eval %{?dist}`; \ + rpmpkg=$(pkg)-$(VERSION)-$(RELEASE)$$dist*src.rpm; \ + rpmspec=$(pkg).spec; \ + rpmbuild=`mktemp -t -d $(PACKAGE)-build-$$USER-XXXXXXXX`; \ + $(MAKE) $(AM_MAKEFLAGS) \ + rpmbuild="$$rpmbuild" \ + rpmspec="$$rpmspec" \ + rpm-local || exit 1; \ + LANG=C $(RPMBUILD) \ + --define "_tmppath $$rpmbuild/TMP" \ + --define "_topdir $$rpmbuild" \ + $(def) -bs $$rpmbuild/SPECS/$$rpmspec || exit 1; \ + cp $$rpmbuild/SRPMS/$$rpmpkg . || exit 1; \ + rm -R $$rpmbuild) + +rpm-common: + @(dist=`$(RPM) --eval %{?dist}`; \ + rpmpkg=$(pkg)-$(VERSION)-$(RELEASE)$$dist*src.rpm; \ + rpmspec=$(pkg).spec; \ + rpmbuild=`mktemp -t -d $(PACKAGE)-build-$$USER-XXXXXXXX`; \ + $(MAKE) $(AM_MAKEFLAGS) \ + rpmbuild="$$rpmbuild" \ + rpmspec="$$rpmspec" \ + rpm-local || exit 1; \ + LANG=C ${RPMBUILD} \ + --define "_tmppath $$rpmbuild/TMP" \ + --define "_topdir $$rpmbuild" \ + $(def) --rebuild $$rpmpkg || exit 1; \ + cp $$rpmbuild/RPMS/*/* . || exit 1; \ + rm -R $$rpmbuild) diff --git a/config/tgz.am b/config/tgz.am new file mode 100644 index 000000000000..2499ba42305b --- /dev/null +++ b/config/tgz.am @@ -0,0 +1,30 @@ +PHONY += tgz tgz-kmod tgz-utils tgz-local + +tgz-local: + @(if test "${HAVE_ALIEN}" = "no"; then \ + echo -e "\n" \ + "*** Required util ${ALIEN} missing. 
Please install the\n" \ + "*** package for your distribution which provides ${ALIEN},\n" \ + "*** re-run configure, and try again.\n"; \ + exit 1; \ + fi) + +tgz-kmod: tgz-local rpm-kmod + name=${PACKAGE}; \ + version=${VERSION}-${RELEASE}; \ + arch=`$(RPM) -qp $${name}-kmod-$${version}.src.rpm --qf %{arch} | tail -1`; \ + pkg1=kmod-$${name}*$${version}.$${arch}.rpm; \ + fakeroot $(ALIEN) --scripts --to-tgz $$pkg1; \ + $(RM) $$pkg1 + +tgz-utils: tgz-local rpm-utils + name=${PACKAGE}; \ + version=${VERSION}-${RELEASE}; \ + arch=`$(RPM) -qp $${name}-$${version}.src.rpm --qf %{arch} | tail -1`; \ + pkg1=$${name}-$${version}.$${arch}.rpm; \ + pkg2=$${name}-devel-$${version}.$${arch}.rpm; \ + pkg3=$${name}-test-$${version}.$${arch}.rpm; \ + fakeroot $(ALIEN) --scripts --to-tgz $$pkg1 $$pkg2 $$pkg3; \ + $(RM) $$pkg1 $$pkg2 $$pkg3 + +tgz: tgz-kmod tgz-utils diff --git a/config/toolchain-simd.m4 b/config/toolchain-simd.m4 new file mode 100644 index 000000000000..1153cd6941a8 --- /dev/null +++ b/config/toolchain-simd.m4 @@ -0,0 +1,424 @@ +dnl # +dnl # Checks if host toolchain supports SIMD instructions +dnl # +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD], [ + case "$host_cpu" in + amd64 | x86_64 | x86 | i686) + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2 + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ + ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE + ;; + esac +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE], [ + AC_MSG_CHECKING([whether host toolchain supports SSE]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + __asm__ __volatile__("xorps %xmm0, %xmm1"); + } + ]])], [ + AC_DEFINE([HAVE_SSE], 1, [Define if host toolchain supports SSE]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE2], [ + AC_MSG_CHECKING([whether host toolchain supports SSE2]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + __asm__ __volatile__("pxor %xmm0, %xmm1"); + } + ]])], [ + AC_DEFINE([HAVE_SSE2], 1, [Define if host toolchain supports SSE2]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE3], [ + AC_MSG_CHECKING([whether host toolchain supports SSE3]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + char v[16]; + __asm__ __volatile__("lddqu %0,%%xmm0" :: "m"(v[0])); + } + ]])], [ + AC_DEFINE([HAVE_SSE3], 1, [Define if host toolchain supports SSE3]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSSE3], [ + AC_MSG_CHECKING([whether host toolchain supports 
SSSE3]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + __asm__ __volatile__("pshufb %xmm0,%xmm1"); + } + ]])], [ + AC_DEFINE([HAVE_SSSE3], 1, [Define if host toolchain supports SSSE3]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_1], [ + AC_MSG_CHECKING([whether host toolchain supports SSE4.1]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + __asm__ __volatile__("pmaxsb %xmm0,%xmm1"); + } + ]])], [ + AC_DEFINE([HAVE_SSE4_1], 1, [Define if host toolchain supports SSE4.1]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_SSE4_2], [ + AC_MSG_CHECKING([whether host toolchain supports SSE4.2]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + __asm__ __volatile__("pcmpgtq %xmm0, %xmm1"); + } + ]])], [ + AC_DEFINE([HAVE_SSE4_2], 1, [Define if host toolchain supports SSE4.2]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX], [ + AC_MSG_CHECKING([whether host toolchain supports AVX]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([[ + void main() + { + char v[32]; + __asm__ __volatile__("vmovdqa %0,%%ymm0" :: "m"(v[0])); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX], 1, [Define if host toolchain supports AVX]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX2], [ + AC_MSG_CHECKING([whether host toolchain supports AVX2]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpshufb %ymm0,%ymm1,%ymm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX2], 1, [Define if host toolchain supports AVX2]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512F], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512F]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpandd %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512F], 1, [Define if host toolchain supports AVX512F]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512CD], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512CD]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vplzcntd %zmm0,%zmm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512CD], 1, [Define if host toolchain supports AVX512CD]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512DQ], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512DQ]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vandpd %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512DQ], 1, [Define if host toolchain supports AVX512DQ]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512BW], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512BW]) + + 
AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpshufb %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512BW], 1, [Define if host toolchain supports AVX512BW]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512IFMA], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512IFMA]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpmadd52luq %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512IFMA], 1, [Define if host toolchain supports AVX512IFMA]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VBMI], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512VBMI]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpermb %zmm0,%zmm1,%zmm2"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512VBMI], 1, [Define if host toolchain supports AVX512VBMI]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512PF], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512PF]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vgatherpf0dps (%rsi,%zmm0,4){%k1}"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512PF], 1, [Define if host toolchain supports AVX512PF]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512ER], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512ER]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vexp2pd %zmm0,%zmm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512ER], 1, [Define if host toolchain supports AVX512ER]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AVX512VL], [ + AC_MSG_CHECKING([whether host toolchain supports AVX512VL]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("vpabsq %zmm0,%zmm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AVX512VL], 1, [Define if host toolchain supports AVX512VL]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_AES], [ + AC_MSG_CHECKING([whether host toolchain supports AES]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("aesenc %xmm0, %xmm1"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_AES], 1, [Define if host toolchain supports AES]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_PCLMULQDQ], [ + AC_MSG_CHECKING([whether host toolchain supports PCLMULQDQ]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("pclmulqdq %0, %%xmm0, %%xmm1" :: "i"(0)); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_PCLMULQDQ], 1, [Define if host toolchain supports PCLMULQDQ]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +dnl # +dnl # ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE +dnl # +AC_DEFUN([ZFS_AC_CONFIG_TOOLCHAIN_CAN_BUILD_MOVBE], [ + 
AC_MSG_CHECKING([whether host toolchain supports MOVBE]) + + AC_LINK_IFELSE([AC_LANG_SOURCE([ + [ + void main() + { + __asm__ __volatile__("movbe 0(%eax), %eax"); + } + ]])], [ + AC_MSG_RESULT([yes]) + AC_DEFINE([HAVE_MOVBE], 1, [Define if host toolchain supports MOVBE]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/user-clock_gettime.m4 b/config/user-clock_gettime.m4 new file mode 100644 index 000000000000..c96024da797b --- /dev/null +++ b/config/user-clock_gettime.m4 @@ -0,0 +1,12 @@ +dnl # +dnl # Check if librt is required for clock_gettime. +dnl # clock_gettime is generally available in libc on modern systems. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_CLOCK_GETTIME], [ + AC_CHECK_FUNC([clock_gettime], [], [ + AC_CHECK_LIB([rt], [clock_gettime], [ + AC_SUBST([LIBCLOCK_GETTIME], [-lrt])], [ + AC_MSG_FAILURE([*** clock_gettime is missing in libc and librt]) + ]) + ]) +]) diff --git a/config/user-dracut.m4 b/config/user-dracut.m4 new file mode 100644 index 000000000000..95f800bda47a --- /dev/null +++ b/config/user-dracut.m4 @@ -0,0 +1,22 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_DRACUT], [ + AC_MSG_CHECKING(for dracut directory) + AC_ARG_WITH([dracutdir], + AC_HELP_STRING([--with-dracutdir=DIR], + [install dracut helpers @<:@default=check@:>@]), + [dracutdir=$withval], + [dracutdir=check]) + + AS_IF([test "x$dracutdir" = xcheck], [ + path1=/usr/share/dracut + path2=/usr/lib/dracut + default=$path2 + + AS_IF([test -d "$path1"], [dracutdir="$path1"], [ + AS_IF([test -d "$path2"], [dracutdir="$path2"], + [dracutdir="$default"]) + ]) + ]) + + AC_SUBST(dracutdir) + AC_MSG_RESULT([$dracutdir]) +]) diff --git a/config/user-gettext.m4 b/config/user-gettext.m4 new file mode 100644 index 000000000000..824318eab960 --- /dev/null +++ b/config/user-gettext.m4 @@ -0,0 +1,6 @@ +dnl # +dnl # Check if libintl and possibly libiconv are needed for gettext() functionality +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_GETTEXT], [ + AM_GNU_GETTEXT([external]) +]) diff --git a/config/user-libaio.m4 b/config/user-libaio.m4 new file mode 100644 index 000000000000..95c144d76b4f --- /dev/null +++ b/config/user-libaio.m4 @@ -0,0 +1,6 @@ +dnl # +dnl # Check for libaio - only used for libaiot test cases. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBAIO], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBAIO, [], [libaio.h], [], [aio], [], [user_libaio=yes], [user_libaio=no]) +]) diff --git a/config/user-libblkid.m4 b/config/user-libblkid.m4 new file mode 100644 index 000000000000..f2016dcb15cd --- /dev/null +++ b/config/user-libblkid.m4 @@ -0,0 +1,9 @@ +dnl # +dnl # Check for libblkid. Basic support for detecting ZFS pools +dnl # has existing in blkid since 2008. +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBBLKID], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBBLKID, [blkid], [blkid/blkid.h], [], [blkid], [], [], [ + AC_MSG_FAILURE([ + *** blkid.h missing, libblkid-devel package required])]) +]) diff --git a/config/user-libcrypto.m4 b/config/user-libcrypto.m4 new file mode 100644 index 000000000000..7293e1b0b42c --- /dev/null +++ b/config/user-libcrypto.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check for libcrypto. Used for userspace password derivation via PBKDF2. 
+dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBCRYPTO], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBCRYPTO, [libcrypto], [openssl/evp.h], [], [crypto], [PKCS5_PBKDF2_HMAC_SHA1], [], [ + AC_MSG_FAILURE([ + *** evp.h missing, libssl-devel package required])]) +]) diff --git a/config/user-libexec.m4 b/config/user-libexec.m4 new file mode 100644 index 000000000000..31bcea3fcfd3 --- /dev/null +++ b/config/user-libexec.m4 @@ -0,0 +1,9 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_ZFSEXEC], [ + AC_ARG_WITH(zfsexecdir, + AC_HELP_STRING([--with-zfsexecdir=DIR], + [install scripts [[@<:@libexecdir@:>@/zfs]]]), + [zfsexecdir=$withval], + [zfsexecdir="${libexecdir}/zfs"]) + + AC_SUBST([zfsexecdir]) +]) diff --git a/config/user-libtirpc.m4 b/config/user-libtirpc.m4 new file mode 100644 index 000000000000..aa7ab4a1fd32 --- /dev/null +++ b/config/user-libtirpc.m4 @@ -0,0 +1,30 @@ +dnl # +dnl # Check for libtirpc - may be needed for xdr functionality +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBTIRPC], [ + AC_ARG_WITH([tirpc], + [AS_HELP_STRING([--with-tirpc], + [use tirpc for xdr encoding @<:@default=check@:>@])], + [], + [with_tirpc=check]) + + have_xdr= + + AS_IF([test "x$with_tirpc" != "xyes"], [ + AC_SEARCH_LIBS([xdrmem_create], [], [have_xdr=1], [ + AS_IF([test "x$with_tirpc" = "xno"], [ + AC_MSG_FAILURE([xdrmem_create() requires sunrpc support in libc if not using libtirpc]) + ]) + ]) + ]) + + AS_IF([test "x$have_xdr" = "x"], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBTIRPC, [libtirpc], [rpc/xdr.h], [tirpc], [tirpc], [xdrmem_create], [], [ + AS_IF([test "x$with_tirpc" = "xyes"], [ + AC_MSG_FAILURE([--with-tirpc was given, but libtirpc is not available, try installing libtirpc-devel]) + ],[dnl ELSE + AC_MSG_FAILURE([neither libc sunrpc support nor libtirpc is available, try installing libtirpc-devel]) + ]) + ]) + ]) +]) diff --git a/config/user-libudev.m4 b/config/user-libudev.m4 new file mode 100644 index 000000000000..8c3c1d7e0087 --- /dev/null +++ b/config/user-libudev.m4 @@ -0,0 +1,17 @@ +dnl # +dnl # Check for libudev - needed for vdev auto-online and auto-replace +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUDEV], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUDEV, [libudev], [libudev.h], [], [udev], [], [user_libudev=yes], [user_libudev=no]) + + AS_IF([test "x$user_libudev" = xyes], [ + AX_SAVE_FLAGS + + CFLAGS="$CFLAGS $LIBUDEV_CFLAGS" + LIBS="$LIBUDEV_LIBS $LIBS" + + AC_CHECK_FUNCS([udev_device_get_is_initialized]) + + AX_RESTORE_FLAGS + ]) +]) diff --git a/config/user-libuuid.m4 b/config/user-libuuid.m4 new file mode 100644 index 000000000000..0cfa83c9926a --- /dev/null +++ b/config/user-libuuid.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check for libuuid +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_LIBUUID], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(LIBUUID, [uuid], [uuid/uuid.h], [], [uuid], [uuid_generate uuid_is_null], [], [ + AC_MSG_FAILURE([*** libuuid-devel package required]) + ]) +]) diff --git a/config/user-makedev.m4 b/config/user-makedev.m4 new file mode 100644 index 000000000000..4383681a8f4c --- /dev/null +++ b/config/user-makedev.m4 @@ -0,0 +1,39 @@ +dnl # +dnl # glibc 2.25 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS], [ + AC_MSG_CHECKING([makedev() is declared in sys/sysmacros.h]) + AC_TRY_COMPILE( + [ + #include <sys/sysmacros.h> + ],[ + int k; + k = makedev(0,0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MAKEDEV_IN_SYSMACROS, 1, + [makedev() is declared in sys/sysmacros.h]) + ],[ + AC_MSG_RESULT(no) + ]) +]) + +dnl # +dnl # glibc X < Y < 2.25 +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV], [ + AC_MSG_CHECKING([makedev() is declared in sys/mkdev.h]) + 
AC_TRY_COMPILE( + [ + #include <sys/mkdev.h> + ],[ + int k; + k = makedev(0,0); + ],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_MAKEDEV_IN_MKDEV, 1, + [makedev() is declared in sys/mkdev.h]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/user-pam.m4 b/config/user-pam.m4 new file mode 100644 index 000000000000..9db35808c340 --- /dev/null +++ b/config/user-pam.m4 @@ -0,0 +1,38 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_PAM], [ + AC_ARG_ENABLE([pam], + AS_HELP_STRING([--enable-pam], + [install pam_zfs_key module [[default: check]]]), + [enable_pam=$enableval], + [enable_pam=check]) + + AC_ARG_WITH(pammoduledir, + AS_HELP_STRING([--with-pammoduledir=DIR], + [install pam module in dir [[$libdir/security]]]), + [pammoduledir="$withval"],[pammoduledir=$libdir/security]) + + AC_ARG_WITH(pamconfigsdir, + AS_HELP_STRING([--with-pamconfigsdir=DIR], + [install pam-config files in dir [DATADIR/pam-configs]]), + [pamconfigsdir="$withval"], + [pamconfigsdir='${datadir}/pam-configs']) + + AS_IF([test "x$enable_pam" != "xno"], [ + AC_CHECK_HEADERS([security/pam_modules.h], [ + enable_pam=yes + ], [ + AS_IF([test "x$enable_pam" = "xyes"], [ + AC_MSG_FAILURE([ + *** security/pam_modules.h missing, libpam0g-dev package required + ]) + ],[ + enable_pam=no + ]) + ]) + ]) + AS_IF([test "x$enable_pam" = "xyes"], [ + DEFINE_PAM='--with pam' + ]) + AC_SUBST(DEFINE_PAM) + AC_SUBST(pammoduledir) + AC_SUBST(pamconfigsdir) +]) diff --git a/config/user-runstatedir.m4 b/config/user-runstatedir.m4 new file mode 100644 index 000000000000..ded1362c7b22 --- /dev/null +++ b/config/user-runstatedir.m4 @@ -0,0 +1,6 @@ +dnl For backwards compatibility; runstatedir added in autoconf 2.70. +AC_DEFUN([ZFS_AC_CONFIG_USER_RUNSTATEDIR], [ + if test "x$runstatedir" = x; then + AC_SUBST([runstatedir], ['${localstatedir}/run']) + fi +]) diff --git a/config/user-systemd.m4 b/config/user-systemd.m4 new file mode 100644 index 000000000000..3e6a4a281f3c --- /dev/null +++ b/config/user-systemd.m4 @@ -0,0 +1,53 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_SYSTEMD], [ + AC_ARG_ENABLE(systemd, + AC_HELP_STRING([--enable-systemd], + [install systemd unit/preset files [[default: yes]]]), + [enable_systemd=$enableval], + [enable_systemd=check]) + + AC_ARG_WITH(systemdunitdir, + AC_HELP_STRING([--with-systemdunitdir=DIR], + [install systemd unit files in dir [[/usr/lib/systemd/system]]]), + systemdunitdir=$withval,systemdunitdir=/usr/lib/systemd/system) + + AC_ARG_WITH(systemdpresetdir, + AC_HELP_STRING([--with-systemdpresetdir=DIR], + [install systemd preset files in dir [[/usr/lib/systemd/system-preset]]]), + systemdpresetdir=$withval,systemdpresetdir=/usr/lib/systemd/system-preset) + + AC_ARG_WITH(systemdmodulesloaddir, + AC_HELP_STRING([--with-systemdmodulesloaddir=DIR], + [install systemd module load files into dir [[/usr/lib/modules-load.d]]]), + systemdmodulesloaddir=$withval,systemdmodulesloaddir=/usr/lib/modules-load.d) + + AC_ARG_WITH(systemdgeneratordir, + AC_HELP_STRING([--with-systemdgeneratordir=DIR], + [install systemd generators in dir [[/usr/lib/systemd/system-generators]]]), + systemdgeneratordir=$withval,systemdgeneratordir=/usr/lib/systemd/system-generators) + + AS_IF([test "x$enable_systemd" = xcheck], [ + AS_IF([systemctl --version >/dev/null 2>&1], + [enable_systemd=yes], + [enable_systemd=no]) + ]) + + AC_MSG_CHECKING(for systemd support) + AC_MSG_RESULT([$enable_systemd]) + + AS_IF([test "x$enable_systemd" = xyes], [ + ZFS_INIT_SYSTEMD=systemd + ZFS_MODULE_LOAD=modules-load.d + DEFINE_SYSTEMD='--with systemd --define "_unitdir $(systemdunitdir)" --define 
"_presetdir $(systemdpresetdir)" --define "_generatordir $(systemdgeneratordir)"' + modulesloaddir=$systemdmodulesloaddir + ],[ + DEFINE_SYSTEMD='--without systemd' + ]) + + AC_SUBST(ZFS_INIT_SYSTEMD) + AC_SUBST(ZFS_MODULE_LOAD) + AC_SUBST(DEFINE_SYSTEMD) + AC_SUBST(systemdunitdir) + AC_SUBST(systemdpresetdir) + AC_SUBST(systemdgeneratordir) + AC_SUBST(modulesloaddir) +]) diff --git a/config/user-sysvinit.m4 b/config/user-sysvinit.m4 new file mode 100644 index 000000000000..65dcc3819231 --- /dev/null +++ b/config/user-sysvinit.m4 @@ -0,0 +1,11 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_SYSVINIT], [ + AC_ARG_ENABLE(sysvinit, + AC_HELP_STRING([--enable-sysvinit], + [install SysV init scripts [default: yes]]), + [],enable_sysvinit=yes) + + AS_IF([test "x$enable_sysvinit" = xyes], + [ZFS_INIT_SYSV=init.d]) + + AC_SUBST(ZFS_INIT_SYSV) +]) diff --git a/config/user-udev.m4 b/config/user-udev.m4 new file mode 100644 index 000000000000..65dc79fb4847 --- /dev/null +++ b/config/user-udev.m4 @@ -0,0 +1,29 @@ +AC_DEFUN([ZFS_AC_CONFIG_USER_UDEV], [ + AC_MSG_CHECKING(for udev directories) + AC_ARG_WITH(udevdir, + AC_HELP_STRING([--with-udevdir=DIR], + [install udev helpers @<:@default=check@:>@]), + [udevdir=$withval], + [udevdir=check]) + + AS_IF([test "x$udevdir" = xcheck], [ + path1=/lib/udev + path2=/usr/lib/udev + default=$path2 + + AS_IF([test -d "$path1"], [udevdir="$path1"], [ + AS_IF([test -d "$path2"], [udevdir="$path2"], + [udevdir="$default"]) + ]) + ]) + + AC_ARG_WITH(udevruledir, + AC_HELP_STRING([--with-udevruledir=DIR], + [install udev rules [[UDEVDIR/rules.d]]]), + [udevruledir=$withval], + [udevruledir="${udevdir}/rules.d"]) + + AC_SUBST(udevdir) + AC_SUBST(udevruledir) + AC_MSG_RESULT([$udevdir;$udevruledir]) +]) diff --git a/config/user-zlib.m4 b/config/user-zlib.m4 new file mode 100644 index 000000000000..1f3792829bb6 --- /dev/null +++ b/config/user-zlib.m4 @@ -0,0 +1,8 @@ +dnl # +dnl # Check for zlib +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER_ZLIB], [ + ZFS_AC_FIND_SYSTEM_LIBRARY(ZLIB, [zlib], [zlib.h], [], [z], [compress2 uncompress crc32], [], [ + AC_MSG_FAILURE([*** zlib-devel package required]) + ]) +]) diff --git a/config/user.m4 b/config/user.m4 new file mode 100644 index 000000000000..c220675514e6 --- /dev/null +++ b/config/user.m4 @@ -0,0 +1,45 @@ +dnl # +dnl # Default ZFS user configuration +dnl # +AC_DEFUN([ZFS_AC_CONFIG_USER], [ + ZFS_AC_CONFIG_USER_GETTEXT + ZFS_AC_CONFIG_USER_MOUNT_HELPER + ZFS_AC_CONFIG_USER_SYSVINIT + ZFS_AC_CONFIG_USER_DRACUT + AM_COND_IF([BUILD_FREEBSD], [ + PKG_INSTALLDIR(['${prefix}/libdata/pkgconfig'])], [ + PKG_INSTALLDIR + ]) + ZFS_AC_CONFIG_USER_ZLIB + AM_COND_IF([BUILD_LINUX], [ + ZFS_AC_CONFIG_USER_UDEV + ZFS_AC_CONFIG_USER_SYSTEMD + ZFS_AC_CONFIG_USER_LIBUUID + ZFS_AC_CONFIG_USER_LIBBLKID + ]) + ZFS_AC_CONFIG_USER_LIBTIRPC + ZFS_AC_CONFIG_USER_LIBUDEV + ZFS_AC_CONFIG_USER_LIBCRYPTO + ZFS_AC_CONFIG_USER_LIBAIO + ZFS_AC_CONFIG_USER_CLOCK_GETTIME + ZFS_AC_CONFIG_USER_PAM + ZFS_AC_CONFIG_USER_RUNSTATEDIR + ZFS_AC_CONFIG_USER_MAKEDEV_IN_SYSMACROS + ZFS_AC_CONFIG_USER_MAKEDEV_IN_MKDEV + ZFS_AC_CONFIG_USER_ZFSEXEC + ZFS_AC_TEST_FRAMEWORK + + AC_CHECK_FUNCS([issetugid mlockall strlcat strlcpy]) +]) + +dnl # +dnl # Setup the environment for the ZFS Test Suite. Currently only +dnl # Linux style systems are supported but this infrastructure can +dnl # be extended to support other platforms if needed. 
+dnl # +AC_DEFUN([ZFS_AC_TEST_FRAMEWORK], [ + ZONENAME="echo global" + AC_SUBST(ZONENAME) + + AC_SUBST(RM) +]) diff --git a/config/zfs-build.m4 b/config/zfs-build.m4 new file mode 100644 index 000000000000..54b61de090e7 --- /dev/null +++ b/config/zfs-build.m4 @@ -0,0 +1,554 @@ +AC_DEFUN([ZFS_AC_LICENSE], [ + AC_MSG_CHECKING([zfs author]) + AC_MSG_RESULT([$ZFS_META_AUTHOR]) + + AC_MSG_CHECKING([zfs license]) + AC_MSG_RESULT([$ZFS_META_LICENSE]) +]) + +AC_DEFUN([ZFS_AC_DEBUG_ENABLE], [ + DEBUG_CFLAGS="-Werror" + DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" + DEBUG_LDFLAGS="" + DEBUG_ZFS="_with_debug" + AC_DEFINE(ZFS_DEBUG, 1, [zfs debugging enabled]) + + KERNEL_DEBUG_CFLAGS="-Werror" + KERNEL_DEBUG_CPPFLAGS="-DDEBUG -UNDEBUG" +]) + +AC_DEFUN([ZFS_AC_DEBUG_DISABLE], [ + DEBUG_CFLAGS="" + DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" + DEBUG_LDFLAGS="" + DEBUG_ZFS="_without_debug" + + KERNEL_DEBUG_CFLAGS="" + KERNEL_DEBUG_CPPFLAGS="-UDEBUG -DNDEBUG" +]) + +dnl # +dnl # When debugging is enabled: +dnl # - Enable all ASSERTs (-DDEBUG) +dnl # - Promote all compiler warnings to errors (-Werror) +dnl # +AC_DEFUN([ZFS_AC_DEBUG], [ + AC_MSG_CHECKING([whether assertion support will be enabled]) + AC_ARG_ENABLE([debug], + [AS_HELP_STRING([--enable-debug], + [Enable compiler and code assertions @<:@default=no@:>@])], + [], + [enable_debug=no]) + + AS_CASE(["x$enable_debug"], + ["xyes"], + [ZFS_AC_DEBUG_ENABLE], + ["xno"], + [ZFS_AC_DEBUG_DISABLE], + [AC_MSG_ERROR([Unknown option $enable_debug])]) + + AC_SUBST(DEBUG_CFLAGS) + AC_SUBST(DEBUG_CPPFLAGS) + AC_SUBST(DEBUG_LDFLAGS) + AC_SUBST(DEBUG_ZFS) + + AC_SUBST(KERNEL_DEBUG_CFLAGS) + AC_SUBST(KERNEL_DEBUG_CPPFLAGS) + + AC_MSG_RESULT([$enable_debug]) +]) + +AC_DEFUN([ZFS_AC_DEBUGINFO_ENABLE], [ + DEBUG_CFLAGS="$DEBUG_CFLAGS -g -fno-inline $NO_IPA_SRA" + + KERNEL_DEBUG_CFLAGS="$KERNEL_DEBUG_CFLAGS -fno-inline $NO_IPA_SRA" + KERNEL_MAKE="$KERNEL_MAKE CONFIG_DEBUG_INFO=y" + + DEBUGINFO_ZFS="_with_debuginfo" +]) + +AC_DEFUN([ZFS_AC_DEBUGINFO_DISABLE], [ + DEBUGINFO_ZFS="_without_debuginfo" +]) + +AC_DEFUN([ZFS_AC_DEBUGINFO], [ + AC_MSG_CHECKING([whether debuginfo support will be forced]) + AC_ARG_ENABLE([debuginfo], + [AS_HELP_STRING([--enable-debuginfo], + [Force generation of debuginfo @<:@default=no@:>@])], + [], + [enable_debuginfo=no]) + + AS_CASE(["x$enable_debuginfo"], + ["xyes"], + [ZFS_AC_DEBUGINFO_ENABLE], + ["xno"], + [ZFS_AC_DEBUGINFO_DISABLE], + [AC_MSG_ERROR([Unknown option $enable_debuginfo])]) + + AC_SUBST(DEBUG_CFLAGS) + AC_SUBST(DEBUGINFO_ZFS) + + AC_SUBST(KERNEL_DEBUG_CFLAGS) + AC_SUBST(KERNEL_MAKE) + + AC_MSG_RESULT([$enable_debuginfo]) +]) + +dnl # +dnl # Disabled by default, provides basic memory tracking. Track the total +dnl # number of bytes allocated with kmem_alloc() and freed with kmem_free(). +dnl # Then at module unload time if any bytes were leaked it will be reported +dnl # on the console. +dnl # +AC_DEFUN([ZFS_AC_DEBUG_KMEM], [ + AC_MSG_CHECKING([whether basic kmem accounting is enabled]) + AC_ARG_ENABLE([debug-kmem], + [AS_HELP_STRING([--enable-debug-kmem], + [Enable basic kmem accounting @<:@default=no@:>@])], + [], + [enable_debug_kmem=no]) + + AS_IF([test "x$enable_debug_kmem" = xyes], [ + KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM" + DEBUG_KMEM_ZFS="_with_debug_kmem" + ], [ + DEBUG_KMEM_ZFS="_without_debug_kmem" + ]) + + AC_SUBST(KERNEL_DEBUG_CPPFLAGS) + AC_SUBST(DEBUG_KMEM_ZFS) + + AC_MSG_RESULT([$enable_debug_kmem]) +]) + +dnl # +dnl # Disabled by default, provides detailed memory tracking. 
This feature +dnl # also requires --enable-debug-kmem to be set. When enabled not only will +dnl # total bytes be tracked but also the location of every kmem_alloc() and +dnl # kmem_free(). When the module is unloaded a list of all leaked addresses +dnl # and where they were allocated will be dumped to the console. Enabling +dnl # this feature has a significant impact on performance but it makes finding +dnl # memory leaks straight forward. +dnl # +AC_DEFUN([ZFS_AC_DEBUG_KMEM_TRACKING], [ + AC_MSG_CHECKING([whether detailed kmem tracking is enabled]) + AC_ARG_ENABLE([debug-kmem-tracking], + [AS_HELP_STRING([--enable-debug-kmem-tracking], + [Enable detailed kmem tracking @<:@default=no@:>@])], + [], + [enable_debug_kmem_tracking=no]) + + AS_IF([test "x$enable_debug_kmem_tracking" = xyes], [ + KERNEL_DEBUG_CPPFLAGS="${KERNEL_DEBUG_CPPFLAGS} -DDEBUG_KMEM_TRACKING" + DEBUG_KMEM_TRACKING_ZFS="_with_debug_kmem_tracking" + ], [ + DEBUG_KMEM_TRACKING_ZFS="_without_debug_kmem_tracking" + ]) + + AC_SUBST(KERNEL_DEBUG_CPPFLAGS) + AC_SUBST(DEBUG_KMEM_TRACKING_ZFS) + + AC_MSG_RESULT([$enable_debug_kmem_tracking]) +]) + +AC_DEFUN([ZFS_AC_CONFIG_ALWAYS], [ + ZFS_AC_CONFIG_ALWAYS_CC_NO_UNUSED_BUT_SET_VARIABLE + ZFS_AC_CONFIG_ALWAYS_CC_NO_BOOL_COMPARE + ZFS_AC_CONFIG_ALWAYS_CC_FRAME_LARGER_THAN + ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_TRUNCATION + ZFS_AC_CONFIG_ALWAYS_CC_NO_FORMAT_ZERO_LENGTH + ZFS_AC_CONFIG_ALWAYS_CC_NO_OMIT_FRAME_POINTER + ZFS_AC_CONFIG_ALWAYS_CC_NO_IPA_SRA + ZFS_AC_CONFIG_ALWAYS_CC_ASAN + ZFS_AC_CONFIG_ALWAYS_TOOLCHAIN_SIMD + ZFS_AC_CONFIG_ALWAYS_SYSTEM + ZFS_AC_CONFIG_ALWAYS_ARCH + ZFS_AC_CONFIG_ALWAYS_PYTHON + ZFS_AC_CONFIG_ALWAYS_PYZFS + ZFS_AC_CONFIG_ALWAYS_SED +]) + +AC_DEFUN([ZFS_AC_CONFIG], [ + + dnl # Remove the previous build test directory. + rm -Rf build + + ZFS_CONFIG=all + AC_ARG_WITH([config], + AS_HELP_STRING([--with-config=CONFIG], + [Config file 'kernel|user|all|srpm']), + [ZFS_CONFIG="$withval"]) + AC_ARG_ENABLE([linux-builtin], + [AC_HELP_STRING([--enable-linux-builtin], + [Configure for builtin in-tree kernel modules @<:@default=no@:>@])], + [], + [enable_linux_builtin=no]) + + AC_MSG_CHECKING([zfs config]) + AC_MSG_RESULT([$ZFS_CONFIG]); + AC_SUBST(ZFS_CONFIG) + + ZFS_AC_CONFIG_ALWAYS + + + AM_COND_IF([BUILD_LINUX], [ + AC_ARG_VAR([TEST_JOBS], + [simultaneous jobs during configure (defaults to $(nproc))]) + if test "x$ac_cv_env_TEST_JOBS_set" != "xset"; then + TEST_JOBS=$(nproc) + fi + AC_SUBST(TEST_JOBS) + ]) + + case "$ZFS_CONFIG" in + kernel) ZFS_AC_CONFIG_KERNEL ;; + user) ZFS_AC_CONFIG_USER ;; + all) ZFS_AC_CONFIG_USER + ZFS_AC_CONFIG_KERNEL ;; + srpm) ;; + *) + AC_MSG_RESULT([Error!]) + AC_MSG_ERROR([Bad value "$ZFS_CONFIG" for --with-config, + user kernel|user|all|srpm]) ;; + esac + + AM_CONDITIONAL([CONFIG_USER], + [test "$ZFS_CONFIG" = user -o "$ZFS_CONFIG" = all]) + AM_CONDITIONAL([CONFIG_KERNEL], + [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && + [test "x$enable_linux_builtin" != xyes ]) + AM_CONDITIONAL([CONFIG_QAT], + [test "$ZFS_CONFIG" = kernel -o "$ZFS_CONFIG" = all] && + [test "x$qatsrc" != x ]) + AM_CONDITIONAL([WANT_DEVNAME2DEVID], [test "x$user_libudev" = xyes ]) + AM_CONDITIONAL([WANT_MMAP_LIBAIO], [test "x$user_libaio" = xyes ]) + AM_CONDITIONAL([PAM_ZFS_ENABLED], [test "x$enable_pam" = xyes]) +]) + +dnl # +dnl # Check for rpm+rpmbuild to build RPM packages. If these tools +dnl # are missing it is non-fatal but you will not be able to build +dnl # RPM packages and will be warned if you try too. 
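The tool detection in ZFS_AC_RPM below is plain shell once the AS_IF wrappers are peeled away; the awk field index assumes the usual "RPM version X.Y.Z" banner printed by rpm 4.x, and the version shown is illustrative:

    RPM=rpm
    if tmp=$($RPM --version 2>/dev/null); then
            # "RPM version 4.16.1.3" -> "4.16.1.3"
            RPM_VERSION=$(echo "$tmp" | awk '/RPM/ { print $3 }')
            echo "whether $RPM is available... yes ($RPM_VERSION)"
    else
            # Non-fatal: the rpm 'make' targets simply become unusable.
            echo "whether $RPM is available... no"
    fi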
+dnl # +dnl # By default the generic spec file will be used because it requires +dnl # minimal dependencies. Distribution specific spec files can be +dnl # placed under the 'rpm/' directory and enabled using +dnl # the --with-spec= configure option. +dnl # +AC_DEFUN([ZFS_AC_RPM], [ + RPM=rpm + RPMBUILD=rpmbuild + + AC_MSG_CHECKING([whether $RPM is available]) + AS_IF([tmp=$($RPM --version 2>/dev/null)], [ + RPM_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') + HAVE_RPM=yes + AC_MSG_RESULT([$HAVE_RPM ($RPM_VERSION)]) + ],[ + HAVE_RPM=no + AC_MSG_RESULT([$HAVE_RPM]) + ]) + + AC_MSG_CHECKING([whether $RPMBUILD is available]) + AS_IF([tmp=$($RPMBUILD --version 2>/dev/null)], [ + RPMBUILD_VERSION=$(echo $tmp | $AWK '/RPM/ { print $[3] }') + HAVE_RPMBUILD=yes + AC_MSG_RESULT([$HAVE_RPMBUILD ($RPMBUILD_VERSION)]) + ],[ + HAVE_RPMBUILD=no + AC_MSG_RESULT([$HAVE_RPMBUILD]) + ]) + + RPM_DEFINE_COMMON='--define "$(DEBUG_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUGINFO_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(DEBUG_KMEM_TRACKING_ZFS) 1"' + RPM_DEFINE_COMMON=${RPM_DEFINE_COMMON}' --define "$(ASAN_ZFS) 1"' + + RPM_DEFINE_UTIL=' --define "_initconfdir $(initconfdir)"' + + dnl # Make the next three RPM_DEFINE_UTIL additions conditional, since + dnl # their values may not be set when running: + dnl # + dnl # ./configure --with-config=srpm + dnl # + AS_IF([test -n "$dracutdir" ], [ + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_dracutdir $(dracutdir)"' + ]) + AS_IF([test -n "$udevdir" ], [ + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevdir $(udevdir)"' + ]) + AS_IF([test -n "$udevruledir" ], [ + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_udevruledir $(udevruledir)"' + ]) + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_INITRAMFS)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_SYSTEMD)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYZFS)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PAM)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_VERSION)' + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' $(DEFINE_PYTHON_PKG_VERSION)' + + dnl # Override default lib directory on Debian/Ubuntu systems. The + dnl # provided /usr/lib/rpm/platform//macros files do not + dnl # specify the correct path for multiarch systems as described + dnl # by the packaging guidelines. + dnl # + dnl # https://wiki.ubuntu.com/MultiarchSpec + dnl # https://wiki.debian.org/Multiarch/Implementation + dnl # + AS_IF([test "$DEFAULT_PACKAGE" = "deb"], [ + MULTIARCH_LIBDIR="lib/$(dpkg-architecture -qDEB_HOST_MULTIARCH)" + RPM_DEFINE_UTIL=${RPM_DEFINE_UTIL}' --define "_lib $(MULTIARCH_LIBDIR)"' + AC_SUBST(MULTIARCH_LIBDIR) + ]) + + dnl # Make RPM_DEFINE_KMOD additions conditional on CONFIG_KERNEL, + dnl # since the values will not be set otherwise. The spec files + dnl # provide defaults for them. 
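At package-build time the accumulated RPM_DEFINE_COMMON and RPM_DEFINE_UTIL fragments expand into one long --define list on the rpmbuild command line. A hypothetical expansion for a non-debug build on a systemd host with udev under /usr/lib/udev (all values, including the package version, are illustrative):

    rpmbuild --rebuild \
        --define "_without_debug 1" \
        --define "_without_debuginfo 1" \
        --define "_without_debug_kmem 1" \
        --define "_initconfdir /etc/sysconfig" \
        --define "_udevdir /usr/lib/udev" \
        --define "_udevruledir /usr/lib/udev/rules.d" \
        --define "_unitdir /usr/lib/systemd/system" \
        zfs-2.0.0-1.src.rpm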
+ dnl # + RPM_DEFINE_KMOD='--define "_wrong_version_format_terminate_build 0"' + AM_COND_IF([CONFIG_KERNEL], [ + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kernels $(LINUX_VERSION)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "ksrc $(LINUX)"' + RPM_DEFINE_KMOD=${RPM_DEFINE_KMOD}' --define "kobj $(LINUX_OBJ)"' + ]) + + RPM_DEFINE_DKMS='' + + SRPM_DEFINE_COMMON='--define "build_src_rpm 1"' + SRPM_DEFINE_UTIL= + SRPM_DEFINE_KMOD= + SRPM_DEFINE_DKMS= + + RPM_SPEC_DIR="rpm/generic" + AC_ARG_WITH([spec], + AS_HELP_STRING([--with-spec=SPEC], + [Spec files 'generic|redhat']), + [RPM_SPEC_DIR="rpm/$withval"]) + + AC_MSG_CHECKING([whether spec files are available]) + AC_MSG_RESULT([yes ($RPM_SPEC_DIR/*.spec.in)]) + + AC_SUBST(HAVE_RPM) + AC_SUBST(RPM) + AC_SUBST(RPM_VERSION) + + AC_SUBST(HAVE_RPMBUILD) + AC_SUBST(RPMBUILD) + AC_SUBST(RPMBUILD_VERSION) + + AC_SUBST(RPM_SPEC_DIR) + AC_SUBST(RPM_DEFINE_UTIL) + AC_SUBST(RPM_DEFINE_KMOD) + AC_SUBST(RPM_DEFINE_DKMS) + AC_SUBST(RPM_DEFINE_COMMON) + AC_SUBST(SRPM_DEFINE_UTIL) + AC_SUBST(SRPM_DEFINE_KMOD) + AC_SUBST(SRPM_DEFINE_DKMS) + AC_SUBST(SRPM_DEFINE_COMMON) +]) + +dnl # +dnl # Check for dpkg+dpkg-buildpackage to build DEB packages. If these +dnl # tools are missing it is non-fatal but you will not be able to build +dnl # DEB packages and will be warned if you try too. +dnl # +AC_DEFUN([ZFS_AC_DPKG], [ + DPKG=dpkg + DPKGBUILD=dpkg-buildpackage + + AC_MSG_CHECKING([whether $DPKG is available]) + AS_IF([tmp=$($DPKG --version 2>/dev/null)], [ + DPKG_VERSION=$(echo $tmp | $AWK '/Debian/ { print $[7] }') + HAVE_DPKG=yes + AC_MSG_RESULT([$HAVE_DPKG ($DPKG_VERSION)]) + ],[ + HAVE_DPKG=no + AC_MSG_RESULT([$HAVE_DPKG]) + ]) + + AC_MSG_CHECKING([whether $DPKGBUILD is available]) + AS_IF([tmp=$($DPKGBUILD --version 2>/dev/null)], [ + DPKGBUILD_VERSION=$(echo $tmp | \ + $AWK '/Debian/ { print $[4] }' | cut -f-4 -d'.') + HAVE_DPKGBUILD=yes + AC_MSG_RESULT([$HAVE_DPKGBUILD ($DPKGBUILD_VERSION)]) + ],[ + HAVE_DPKGBUILD=no + AC_MSG_RESULT([$HAVE_DPKGBUILD]) + ]) + + AC_SUBST(HAVE_DPKG) + AC_SUBST(DPKG) + AC_SUBST(DPKG_VERSION) + + AC_SUBST(HAVE_DPKGBUILD) + AC_SUBST(DPKGBUILD) + AC_SUBST(DPKGBUILD_VERSION) +]) + +dnl # +dnl # Until native packaging for various different packing systems +dnl # can be added the least we can do is attempt to use alien to +dnl # convert the RPM packages to the needed package type. This is +dnl # a hack but so far it has worked reasonable well. 
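This conversion is exactly what the tgz-kmod and tgz-utils rules near the top of this patch perform; done by hand it looks roughly like the following, with the package file names being illustrative:

    # Ask rpm for the architecture of the freshly built package, then
    # let alien repack the binary RPM as a tarball, preserving the
    # maintainer scripts with --scripts as the tgz-kmod rule does.
    arch=$(rpm -qp zfs-kmod-2.0.0-1.src.rpm --qf %{arch} | tail -1)
    fakeroot alien --scripts --to-tgz kmod-zfs-2.0.0-1.$arch.rpm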
+dnl # +AC_DEFUN([ZFS_AC_ALIEN], [ + ALIEN=alien + + AC_MSG_CHECKING([whether $ALIEN is available]) + AS_IF([tmp=$($ALIEN --version 2>/dev/null)], [ + ALIEN_VERSION=$(echo $tmp | $AWK '{ print $[3] }') + HAVE_ALIEN=yes + AC_MSG_RESULT([$HAVE_ALIEN ($ALIEN_VERSION)]) + ],[ + HAVE_ALIEN=no + AC_MSG_RESULT([$HAVE_ALIEN]) + ]) + + AC_SUBST(HAVE_ALIEN) + AC_SUBST(ALIEN) + AC_SUBST(ALIEN_VERSION) +]) + +dnl # +dnl # Using the VENDOR tag from config.guess set the default +dnl # package type for 'make pkg': (rpm | deb | tgz) +dnl # +AC_DEFUN([ZFS_AC_DEFAULT_PACKAGE], [ + AC_MSG_CHECKING([os distribution]) + if test -f /etc/toss-release ; then + VENDOR=toss ; + elif test -f /etc/fedora-release ; then + VENDOR=fedora ; + elif test -f /etc/redhat-release ; then + VENDOR=redhat ; + elif test -f /etc/gentoo-release ; then + VENDOR=gentoo ; + elif test -f /etc/arch-release ; then + VENDOR=arch ; + elif test -f /etc/SuSE-release ; then + VENDOR=sles ; + elif test -f /etc/slackware-version ; then + VENDOR=slackware ; + elif test -f /etc/lunar.release ; then + VENDOR=lunar ; + elif test -f /etc/lsb-release ; then + VENDOR=ubuntu ; + elif test -f /etc/debian_version ; then + VENDOR=debian ; + elif test -f /etc/alpine-release ; then + VENDOR=alpine ; + elif test -f /bin/freebsd-version ; then + VENDOR=freebsd ; + else + VENDOR= ; + fi + AC_MSG_RESULT([$VENDOR]) + AC_SUBST(VENDOR) + + AC_MSG_CHECKING([default package type]) + case "$VENDOR" in + toss) DEFAULT_PACKAGE=rpm ;; + redhat) DEFAULT_PACKAGE=rpm ;; + fedora) DEFAULT_PACKAGE=rpm ;; + gentoo) DEFAULT_PACKAGE=tgz ;; + alpine) DEFAULT_PACKAGE=tgz ;; + arch) DEFAULT_PACKAGE=tgz ;; + sles) DEFAULT_PACKAGE=rpm ;; + slackware) DEFAULT_PACKAGE=tgz ;; + lunar) DEFAULT_PACKAGE=tgz ;; + ubuntu) DEFAULT_PACKAGE=deb ;; + debian) DEFAULT_PACKAGE=deb ;; + freebsd) DEFAULT_PACKAGE=pkg ;; + *) DEFAULT_PACKAGE=rpm ;; + esac + AC_MSG_RESULT([$DEFAULT_PACKAGE]) + AC_SUBST(DEFAULT_PACKAGE) + + AC_MSG_CHECKING([default init directory]) + case "$VENDOR" in + freebsd) initdir=$sysconfdir/rc.d ;; + *) initdir=$sysconfdir/init.d;; + esac + AC_MSG_RESULT([$initdir]) + AC_SUBST(initdir) + + AC_MSG_CHECKING([default init script type and shell]) + case "$VENDOR" in + toss) DEFAULT_INIT_SCRIPT=redhat ;; + redhat) DEFAULT_INIT_SCRIPT=redhat ;; + fedora) DEFAULT_INIT_SCRIPT=fedora ;; + gentoo) DEFAULT_INIT_SCRIPT=openrc ;; + alpine) DEFAULT_INIT_SCRIPT=openrc ;; + arch) DEFAULT_INIT_SCRIPT=lsb ;; + sles) DEFAULT_INIT_SCRIPT=lsb ;; + slackware) DEFAULT_INIT_SCRIPT=lsb ;; + lunar) DEFAULT_INIT_SCRIPT=lunar ;; + ubuntu) DEFAULT_INIT_SCRIPT=lsb ;; + debian) DEFAULT_INIT_SCRIPT=lsb ;; + freebsd) DEFAULT_INIT_SCRIPT=freebsd;; + *) DEFAULT_INIT_SCRIPT=lsb ;; + esac + + # On gentoo, it's possible that OpenRC isn't installed. Check if + # /sbin/openrc-run exists, and if not, fall back to generic defaults. 
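Standalone, that fallback probe is a one-liner; run on any host it shows which init shell configure would settle on. A sketch mirroring the AS_IF that follows:

    if [ -x /sbin/openrc-run ]; then
            DEFAULT_INIT_SHELL=/sbin/openrc-run
    else
            DEFAULT_INIT_SHELL=/bin/sh   # and the script type drops back to lsb
    fi
    echo "$DEFAULT_INIT_SHELL"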
+ + DEFAULT_INIT_SHELL="/bin/sh" + AS_IF([test "$DEFAULT_INIT_SCRIPT" = "openrc"], [ + AS_IF([test -x "/sbin/openrc-run"], + [DEFAULT_INIT_SHELL="/sbin/openrc-run"], + [DEFAULT_INIT_SCRIPT=lsb]) + ]) + + AC_MSG_RESULT([$DEFAULT_INIT_SCRIPT:$DEFAULT_INIT_SHELL]) + AC_SUBST(DEFAULT_INIT_SCRIPT) + AC_SUBST(DEFAULT_INIT_SHELL) + + AC_MSG_CHECKING([default nfs server init script]) + AS_IF([test "$VENDOR" = "debian"], + [DEFAULT_INIT_NFS_SERVER="nfs-kernel-server"], + [DEFAULT_INIT_NFS_SERVER="nfs"] + ) + AC_MSG_RESULT([$DEFAULT_INIT_NFS_SERVER]) + AC_SUBST(DEFAULT_INIT_NFS_SERVER) + + AC_MSG_CHECKING([default init config directory]) + case "$VENDOR" in + alpine) initconfdir=/etc/conf.d ;; + gentoo) initconfdir=/etc/conf.d ;; + toss) initconfdir=/etc/sysconfig ;; + redhat) initconfdir=/etc/sysconfig ;; + fedora) initconfdir=/etc/sysconfig ;; + sles) initconfdir=/etc/sysconfig ;; + ubuntu) initconfdir=/etc/default ;; + debian) initconfdir=/etc/default ;; + freebsd) initconfdir=$sysconfdir/rc.conf.d;; + *) initconfdir=/etc/default ;; + esac + AC_MSG_RESULT([$initconfdir]) + AC_SUBST(initconfdir) + + AC_MSG_CHECKING([whether initramfs-tools is available]) + if test -d /usr/share/initramfs-tools ; then + DEFINE_INITRAMFS='--define "_initramfs 1"' + AC_MSG_RESULT([yes]) + else + DEFINE_INITRAMFS='' + AC_MSG_RESULT([no]) + fi + AC_SUBST(DEFINE_INITRAMFS) +]) + +dnl # +dnl # Default ZFS package configuration +dnl # +AC_DEFUN([ZFS_AC_PACKAGE], [ + ZFS_AC_DEFAULT_PACKAGE + AS_IF([test x$VENDOR != xfreebsd], [ + ZFS_AC_RPM + ZFS_AC_DPKG + ZFS_AC_ALIEN + ]) +]) diff --git a/config/zfs-meta.m4 b/config/zfs-meta.m4 new file mode 100644 index 000000000000..b3c1befaac5d --- /dev/null +++ b/config/zfs-meta.m4 @@ -0,0 +1,207 @@ +dnl # +dnl # DESCRIPTION: +dnl # Read meta data from the META file. When building from a git repository +dnl # the ZFS_META_RELEASE field will be overwritten if there is an annotated +dnl # tag matching the form ZFS_META_NAME-ZFS_META_VERSION-*. This allows +dnl # for working builds to be uniquely identified using the git commit hash. +dnl # +dnl # The META file format is as follows: +dnl # ^[ ]*KEY:[ \t]+VALUE$ +dnl # +dnl # In other words: +dnl # - KEY is separated from VALUE by a colon and one or more spaces/tabs. +dnl # - KEY and VALUE are case sensitive. +dnl # - Leading spaces are ignored. +dnl # - First match wins for duplicate keys. +dnl # +dnl # A line can be commented out by preceding it with a '#' (or technically +dnl # any non-space character since that will prevent the regex from +dnl # matching). +dnl # +dnl # WARNING: +dnl # Placing a colon followed by a space or tab (ie, ":[ \t]+") within the +dnl # VALUE will prematurely terminate the string since that sequence is +dnl # used as the awk field separator. +dnl # +dnl # KEYS: +dnl # The following META keys are recognized: +dnl # Name, Version, Release, Date, Author, LT_Current, LT_Revision, LT_Age +dnl # +dnl # Written by Chris Dunlap . +dnl # Modified by Brian Behlendorf . 
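Decoded from its quadrigraphs, the _ZFS_AC_META_GETVAL helper at the end of this file is an awk one-liner, and the release override described above is a short git pipeline. As plain shell, assuming a checkout whose META names the project zfs at version 2.0.0 (the tag and hash shown are illustrative):

    # KEY and VALUE are split on ':' plus one or more spaces/tabs;
    # 'exit' makes the first match win, as documented above.
    NAME=$(awk -F ':[ \t]+' '$1 ~ /^ *Name$/ { print $2; exit }' META)
    VERSION=$(awk -F ':[ \t]+' '$1 ~ /^ *Version$/ { print $2; exit }' META)

    # In a git work tree an annotated tag such as "zfs-2.0.0" turns
    # "git describe" output like "zfs-2.0.0-5-gdeadbee" into the
    # release "5_gdeadbee" (field 3 onward, '-' mapped to '_').
    alias=$(git describe --match="${NAME}-${VERSION}" 2>/dev/null)
    release=$(echo "$alias" | cut -f3- -d'-' | sed 's/-/_/g')
    test -n "$release" && echo "ZFS_META_RELEASE overridden to $release"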
+dnl # +AC_DEFUN([ZFS_AC_META], [ + + AH_BOTTOM([ +#undef PACKAGE +#undef PACKAGE_BUGREPORT +#undef PACKAGE_NAME +#undef PACKAGE_STRING +#undef PACKAGE_TARNAME +#undef PACKAGE_VERSION +#undef STDC_HEADERS +#undef VERSION]) + + AC_PROG_AWK + AC_MSG_CHECKING([metadata]) + + META="$srcdir/META" + _zfs_ac_meta_type="none" + if test -f "$META"; then + _zfs_ac_meta_type="META file" + + ZFS_META_NAME=_ZFS_AC_META_GETVAL([(Name|Project|Package)]); + if test -n "$ZFS_META_NAME"; then + AC_DEFINE_UNQUOTED([ZFS_META_NAME], ["$ZFS_META_NAME"], + [Define the project name.] + ) + AC_SUBST([ZFS_META_NAME]) + fi + + ZFS_META_VERSION=_ZFS_AC_META_GETVAL([Version]); + if test -n "$ZFS_META_VERSION"; then + AC_DEFINE_UNQUOTED([ZFS_META_VERSION], + ["$ZFS_META_VERSION"], + [Define the project version.]) + AC_DEFINE_UNQUOTED([SPL_META_VERSION], + [ZFS_META_VERSION], + [Defined for legacy compatibility.]) + AC_SUBST([ZFS_META_VERSION]) + fi + + ZFS_META_RELEASE=_ZFS_AC_META_GETVAL([Release]); + if test ! -f ".nogitrelease" && git rev-parse --git-dir > /dev/null 2>&1; then + _match="${ZFS_META_NAME}-${ZFS_META_VERSION}" + _alias=$(git describe --match=${_match} 2>/dev/null) + _release=$(echo ${_alias}|cut -f3- -d'-'|sed 's/-/_/g') + if test -n "${_release}"; then + ZFS_META_RELEASE=${_release} + _zfs_ac_meta_type="git describe" + else + _match="${ZFS_META_NAME}-${ZFS_META_VERSION}-${ZFS_META_RELEASE}" + _alias=$(git describe --match=${_match} 2>/dev/null) + _release=$(echo ${_alias}|cut -f3- -d'-'|sed 's/-/_/g') + if test -n "${_release}"; then + ZFS_META_RELEASE=${_release} + _zfs_ac_meta_type="git describe" + fi + fi + fi + + if test -n "$ZFS_META_RELEASE"; then + AC_DEFINE_UNQUOTED([ZFS_META_RELEASE], + ["$ZFS_META_RELEASE"], + [Define the project release.]) + AC_DEFINE_UNQUOTED([SPL_META_RELEASE], + [ZFS_META_RELEASE], + [Defined for legacy compatibility.]) + AC_SUBST([ZFS_META_RELEASE]) + + RELEASE="$ZFS_META_RELEASE" + AC_SUBST([RELEASE]) + fi + + ZFS_META_LICENSE=_ZFS_AC_META_GETVAL([License]); + if test -n "$ZFS_META_LICENSE"; then + AC_DEFINE_UNQUOTED([ZFS_META_LICENSE], ["$ZFS_META_LICENSE"], + [Define the project license.] + ) + AC_SUBST([ZFS_META_LICENSE]) + fi + + if test -n "$ZFS_META_NAME" -a -n "$ZFS_META_VERSION"; then + ZFS_META_ALIAS="$ZFS_META_NAME-$ZFS_META_VERSION" + test -n "$ZFS_META_RELEASE" && + ZFS_META_ALIAS="$ZFS_META_ALIAS-$ZFS_META_RELEASE" + AC_DEFINE_UNQUOTED([ZFS_META_ALIAS], + ["$ZFS_META_ALIAS"], + [Define the project alias string.]) + AC_DEFINE_UNQUOTED([SPL_META_ALIAS], + [ZFS_META_ALIAS], + [Defined for legacy compatibility.]) + AC_SUBST([ZFS_META_ALIAS]) + fi + + ZFS_META_DATA=_ZFS_AC_META_GETVAL([Date]); + if test -n "$ZFS_META_DATA"; then + AC_DEFINE_UNQUOTED([ZFS_META_DATA], ["$ZFS_META_DATA"], + [Define the project release date.] + ) + AC_SUBST([ZFS_META_DATA]) + fi + + ZFS_META_AUTHOR=_ZFS_AC_META_GETVAL([Author]); + if test -n "$ZFS_META_AUTHOR"; then + AC_DEFINE_UNQUOTED([ZFS_META_AUTHOR], ["$ZFS_META_AUTHOR"], + [Define the project author.] + ) + AC_SUBST([ZFS_META_AUTHOR]) + fi + + ZFS_META_KVER_MIN=_ZFS_AC_META_GETVAL([Linux-Minimum]); + if test -n "$ZFS_META_KVER_MIN"; then + AC_DEFINE_UNQUOTED([ZFS_META_KVER_MIN], + ["$ZFS_META_KVER_MIN"], + [Define the minimum compatible kernel version.] + ) + AC_SUBST([ZFS_META_KVER_MIN]) + fi + + ZFS_META_KVER_MAX=_ZFS_AC_META_GETVAL([Linux-Maximum]); + if test -n "$ZFS_META_KVER_MAX"; then + AC_DEFINE_UNQUOTED([ZFS_META_KVER_MAX], + ["$ZFS_META_KVER_MAX"], + [Define the maximum compatible kernel version.] 
+ ) + AC_SUBST([ZFS_META_KVER_MAX]) + fi + + m4_pattern_allow([^LT_(CURRENT|REVISION|AGE)$]) + ZFS_META_LT_CURRENT=_ZFS_AC_META_GETVAL([LT_Current]); + ZFS_META_LT_REVISION=_ZFS_AC_META_GETVAL([LT_Revision]); + ZFS_META_LT_AGE=_ZFS_AC_META_GETVAL([LT_Age]); + if test -n "$ZFS_META_LT_CURRENT" \ + -o -n "$ZFS_META_LT_REVISION" \ + -o -n "$ZFS_META_LT_AGE"; then + test -n "$ZFS_META_LT_CURRENT" || ZFS_META_LT_CURRENT="0" + test -n "$ZFS_META_LT_REVISION" || ZFS_META_LT_REVISION="0" + test -n "$ZFS_META_LT_AGE" || ZFS_META_LT_AGE="0" + AC_DEFINE_UNQUOTED([ZFS_META_LT_CURRENT], + ["$ZFS_META_LT_CURRENT"], + [Define the libtool library 'current' + version information.] + ) + AC_DEFINE_UNQUOTED([ZFS_META_LT_REVISION], + ["$ZFS_META_LT_REVISION"], + [Define the libtool library 'revision' + version information.] + ) + AC_DEFINE_UNQUOTED([ZFS_META_LT_AGE], ["$ZFS_META_LT_AGE"], + [Define the libtool library 'age' + version information.] + ) + AC_SUBST([ZFS_META_LT_CURRENT]) + AC_SUBST([ZFS_META_LT_REVISION]) + AC_SUBST([ZFS_META_LT_AGE]) + fi + fi + + AC_MSG_RESULT([$_zfs_ac_meta_type]) + ] +) + +dnl # _ZFS_AC_META_GETVAL (KEY_NAME_OR_REGEX) +dnl # +dnl # Returns the META VALUE associated with the given KEY_NAME_OR_REGEX expr. +dnl # +dnl # Despite their resemblance to line noise, +dnl # the "@<:@" and "@:>@" constructs are quadrigraphs for "[" and "]". +dnl # +dnl # +dnl # The "$[]1" and "$[]2" constructs prevent M4 parameter expansion +dnl # so a literal $1 and $2 will be passed to the resulting awk script, +dnl # whereas the "$1" will undergo M4 parameter expansion for the META key. +dnl # +AC_DEFUN([_ZFS_AC_META_GETVAL], + [`$AWK -F ':@<:@ \t@:>@+' '$[]1 ~ /^ *$1$/ { print $[]2; exit }' $META`]dnl +) diff --git a/configure.ac b/configure.ac new file mode 100644 index 000000000000..199187ce51bb --- /dev/null +++ b/configure.ac @@ -0,0 +1,409 @@ +/* + * This file is part of the ZFS Linux port. + * + * Copyright (c) 2009 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory + * Written by: + * Brian Behlendorf , + * Herb Wartens , + * Jim Garlick + * LLNL-CODE-403049 + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +AC_INIT(m4_esyscmd(grep ^Name: META | cut -d ':' -f 2 | tr -d ' \n'), + m4_esyscmd(grep ^Version: META | cut -d ':' -f 2 | tr -d ' \n')) +AC_LANG(C) +ZFS_AC_META +AC_CONFIG_AUX_DIR([config]) +AC_CONFIG_MACRO_DIR([config]) +AC_CANONICAL_SYSTEM +AM_MAINTAINER_MODE +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) +AM_INIT_AUTOMAKE([subdir-objects]) +AC_CONFIG_HEADERS([zfs_config.h], [ + (mv zfs_config.h zfs_config.h.tmp && + awk -f ${ac_srcdir}/config/config.awk zfs_config.h.tmp >zfs_config.h && + rm zfs_config.h.tmp) || exit 1]) + +AC_PROG_INSTALL +AC_PROG_CC +AC_PROG_LIBTOOL +PKG_PROG_PKG_CONFIG +AM_PROG_AS +AM_PROG_CC_C_O +AX_CODE_COVERAGE +_AM_PROG_TAR(pax) + +ZFS_AC_LICENSE +ZFS_AC_CONFIG +ZFS_AC_PACKAGE +ZFS_AC_DEBUG +ZFS_AC_DEBUGINFO +ZFS_AC_DEBUG_KMEM +ZFS_AC_DEBUG_KMEM_TRACKING + +AC_CONFIG_FILES([ + Makefile + cmd/Makefile + cmd/arc_summary/Makefile + cmd/arcstat/Makefile + cmd/dbufstat/Makefile + cmd/fsck_zfs/Makefile + cmd/mount_zfs/Makefile + cmd/raidz_test/Makefile + cmd/vdev_id/Makefile + cmd/zdb/Makefile + cmd/zed/Makefile + cmd/zed/zed.d/Makefile + cmd/zfs/Makefile + cmd/zfs_ids_to_path/Makefile + cmd/zgenhostid/Makefile + cmd/zhack/Makefile + cmd/zinject/Makefile + cmd/zpool/Makefile + cmd/zstream/Makefile + cmd/zstreamdump/Makefile + cmd/ztest/Makefile + cmd/zvol_id/Makefile + cmd/zvol_wait/Makefile + contrib/Makefile + contrib/bash_completion.d/Makefile + contrib/bpftrace/Makefile + contrib/dracut/02zfsexpandknowledge/Makefile + contrib/dracut/90zfs/Makefile + contrib/dracut/Makefile + contrib/initramfs/Makefile + contrib/initramfs/conf.d/Makefile + contrib/initramfs/conf-hooks.d/Makefile + contrib/initramfs/hooks/Makefile + contrib/initramfs/scripts/Makefile + contrib/initramfs/scripts/local-top/Makefile + contrib/pam_zfs_key/Makefile + contrib/pyzfs/Makefile + contrib/pyzfs/setup.py + contrib/zcp/Makefile + etc/Makefile + etc/default/Makefile + etc/init.d/Makefile + etc/modules-load.d/Makefile + etc/sudoers.d/Makefile + etc/systemd/Makefile + etc/systemd/system-generators/Makefile + etc/systemd/system/Makefile + etc/zfs/Makefile + include/Makefile + include/os/Makefile + include/os/freebsd/Makefile + include/os/freebsd/linux/Makefile + include/os/freebsd/spl/Makefile + include/os/freebsd/spl/acl/Makefile + include/os/freebsd/spl/rpc/Makefile + include/os/freebsd/spl/sys/Makefile + include/os/freebsd/zfs/Makefile + include/os/freebsd/zfs/sys/Makefile + include/os/linux/Makefile + include/os/linux/kernel/Makefile + include/os/linux/kernel/linux/Makefile + include/os/linux/spl/Makefile + include/os/linux/spl/rpc/Makefile + include/os/linux/spl/sys/Makefile + include/os/linux/zfs/Makefile + include/os/linux/zfs/sys/Makefile + include/sys/Makefile + include/sys/crypto/Makefile + include/sys/fm/Makefile + include/sys/fm/fs/Makefile + include/sys/fs/Makefile + include/sys/lua/Makefile + include/sys/sysevent/Makefile + include/sys/zstd/Makefile + lib/Makefile + lib/libavl/Makefile + lib/libefi/Makefile + lib/libicp/Makefile + lib/libnvpair/Makefile + lib/libshare/Makefile + lib/libspl/Makefile + lib/libspl/include/Makefile + lib/libspl/include/ia32/Makefile + lib/libspl/include/ia32/sys/Makefile + lib/libspl/include/os/Makefile + lib/libspl/include/os/freebsd/Makefile + lib/libspl/include/os/freebsd/sys/Makefile + 
lib/libspl/include/os/linux/Makefile + lib/libspl/include/os/linux/sys/Makefile + lib/libspl/include/rpc/Makefile + lib/libspl/include/sys/Makefile + lib/libspl/include/sys/dktp/Makefile + lib/libspl/include/util/Makefile + lib/libtpool/Makefile + lib/libunicode/Makefile + lib/libuutil/Makefile + lib/libzfs/Makefile + lib/libzfs/libzfs.pc + lib/libzfs_core/Makefile + lib/libzfs_core/libzfs_core.pc + lib/libzpool/Makefile + lib/libzstd/Makefile + lib/libzutil/Makefile + man/Makefile + man/man1/Makefile + man/man5/Makefile + man/man8/Makefile + module/Kbuild + module/Makefile + module/avl/Makefile + module/icp/Makefile + module/lua/Makefile + module/nvpair/Makefile + module/os/linux/spl/Makefile + module/os/linux/zfs/Makefile + module/spl/Makefile + module/unicode/Makefile + module/zcommon/Makefile + module/zfs/Makefile + module/zstd/Makefile + rpm/Makefile + rpm/generic/Makefile + rpm/generic/zfs-dkms.spec + rpm/generic/zfs-kmod.spec + rpm/generic/zfs.spec + rpm/redhat/Makefile + rpm/redhat/zfs-dkms.spec + rpm/redhat/zfs-kmod.spec + rpm/redhat/zfs.spec + scripts/Makefile + tests/Makefile + tests/runfiles/Makefile + tests/test-runner/Makefile + tests/test-runner/bin/Makefile + tests/test-runner/include/Makefile + tests/test-runner/man/Makefile + tests/zfs-tests/Makefile + tests/zfs-tests/callbacks/Makefile + tests/zfs-tests/cmd/Makefile + tests/zfs-tests/cmd/btree_test/Makefile + tests/zfs-tests/cmd/chg_usr_exec/Makefile + tests/zfs-tests/cmd/devname2devid/Makefile + tests/zfs-tests/cmd/dir_rd_update/Makefile + tests/zfs-tests/cmd/file_check/Makefile + tests/zfs-tests/cmd/file_trunc/Makefile + tests/zfs-tests/cmd/file_write/Makefile + tests/zfs-tests/cmd/get_diff/Makefile + tests/zfs-tests/cmd/largest_file/Makefile + tests/zfs-tests/cmd/libzfs_input_check/Makefile + tests/zfs-tests/cmd/mkbusy/Makefile + tests/zfs-tests/cmd/mkfile/Makefile + tests/zfs-tests/cmd/mkfiles/Makefile + tests/zfs-tests/cmd/mktree/Makefile + tests/zfs-tests/cmd/mmap_exec/Makefile + tests/zfs-tests/cmd/mmap_libaio/Makefile + tests/zfs-tests/cmd/mmapwrite/Makefile + tests/zfs-tests/cmd/nvlist_to_lua/Makefile + tests/zfs-tests/cmd/randfree_file/Makefile + tests/zfs-tests/cmd/randwritecomp/Makefile + tests/zfs-tests/cmd/readmmap/Makefile + tests/zfs-tests/cmd/rename_dir/Makefile + tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile + tests/zfs-tests/cmd/stride_dd/Makefile + tests/zfs-tests/cmd/threadsappend/Makefile + tests/zfs-tests/cmd/user_ns_exec/Makefile + tests/zfs-tests/cmd/xattrtest/Makefile + tests/zfs-tests/include/Makefile + tests/zfs-tests/tests/Makefile + tests/zfs-tests/tests/functional/Makefile + tests/zfs-tests/tests/functional/acl/Makefile + tests/zfs-tests/tests/functional/acl/posix/Makefile + tests/zfs-tests/tests/functional/alloc_class/Makefile + tests/zfs-tests/tests/functional/arc/Makefile + tests/zfs-tests/tests/functional/atime/Makefile + tests/zfs-tests/tests/functional/bootfs/Makefile + tests/zfs-tests/tests/functional/btree/Makefile + tests/zfs-tests/tests/functional/cache/Makefile + tests/zfs-tests/tests/functional/cachefile/Makefile + tests/zfs-tests/tests/functional/casenorm/Makefile + tests/zfs-tests/tests/functional/channel_program/Makefile + tests/zfs-tests/tests/functional/channel_program/lua_core/Makefile + tests/zfs-tests/tests/functional/channel_program/synctask_core/Makefile + tests/zfs-tests/tests/functional/chattr/Makefile + tests/zfs-tests/tests/functional/checksum/Makefile + tests/zfs-tests/tests/functional/clean_mirror/Makefile + tests/zfs-tests/tests/functional/cli_root/Makefile + 
tests/zfs-tests/tests/functional/cli_root/zdb/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_bookmark/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_change-key/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_clone/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_copies/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_create/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_destroy/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_diff/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_get/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_ids_to_path/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_inherit/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_jail/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_load-key/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_mount/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_program/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_promote/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_property/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_receive/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_rename/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_reservation/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_rollback/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_send/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_set/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_share/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_snapshot/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_sysfs/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_unload-key/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_unmount/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_unshare/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_upgrade/Makefile + tests/zfs-tests/tests/functional/cli_root/zfs_wait/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_add/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_attach/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_clear/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_create/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_destroy/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_detach/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_events/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_expand/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_export/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_get/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_history/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_import/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_import/blockfiles/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_initialize/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_labelclear/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_offline/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_online/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_remove/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_reopen/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_replace/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_resilver/Makefile + 
tests/zfs-tests/tests/functional/cli_root/zpool_scrub/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_set/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_split/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_status/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_sync/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_trim/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_upgrade/blockfiles/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_wait/Makefile + tests/zfs-tests/tests/functional/cli_root/zpool_wait/scan/Makefile + tests/zfs-tests/tests/functional/cli_user/Makefile + tests/zfs-tests/tests/functional/cli_user/misc/Makefile + tests/zfs-tests/tests/functional/cli_user/zfs_list/Makefile + tests/zfs-tests/tests/functional/cli_user/zpool_iostat/Makefile + tests/zfs-tests/tests/functional/cli_user/zpool_list/Makefile + tests/zfs-tests/tests/functional/cli_user/zpool_status/Makefile + tests/zfs-tests/tests/functional/compression/Makefile + tests/zfs-tests/tests/functional/cp_files/Makefile + tests/zfs-tests/tests/functional/ctime/Makefile + tests/zfs-tests/tests/functional/deadman/Makefile + tests/zfs-tests/tests/functional/delegate/Makefile + tests/zfs-tests/tests/functional/devices/Makefile + tests/zfs-tests/tests/functional/events/Makefile + tests/zfs-tests/tests/functional/exec/Makefile + tests/zfs-tests/tests/functional/fallocate/Makefile + tests/zfs-tests/tests/functional/fault/Makefile + tests/zfs-tests/tests/functional/features/Makefile + tests/zfs-tests/tests/functional/features/async_destroy/Makefile + tests/zfs-tests/tests/functional/features/large_dnode/Makefile + tests/zfs-tests/tests/functional/grow/Makefile + tests/zfs-tests/tests/functional/history/Makefile + tests/zfs-tests/tests/functional/hkdf/Makefile + tests/zfs-tests/tests/functional/inheritance/Makefile + tests/zfs-tests/tests/functional/inuse/Makefile + tests/zfs-tests/tests/functional/io/Makefile + tests/zfs-tests/tests/functional/large_files/Makefile + tests/zfs-tests/tests/functional/largest_pool/Makefile + tests/zfs-tests/tests/functional/libzfs/Makefile + tests/zfs-tests/tests/functional/limits/Makefile + tests/zfs-tests/tests/functional/link_count/Makefile + tests/zfs-tests/tests/functional/log_spacemap/Makefile + tests/zfs-tests/tests/functional/migration/Makefile + tests/zfs-tests/tests/functional/mmap/Makefile + tests/zfs-tests/tests/functional/mmp/Makefile + tests/zfs-tests/tests/functional/mount/Makefile + tests/zfs-tests/tests/functional/mv_files/Makefile + tests/zfs-tests/tests/functional/nestedfs/Makefile + tests/zfs-tests/tests/functional/no_space/Makefile + tests/zfs-tests/tests/functional/nopwrite/Makefile + tests/zfs-tests/tests/functional/online_offline/Makefile + tests/zfs-tests/tests/functional/pam/Makefile + tests/zfs-tests/tests/functional/persist_l2arc/Makefile + tests/zfs-tests/tests/functional/pool_checkpoint/Makefile + tests/zfs-tests/tests/functional/pool_names/Makefile + tests/zfs-tests/tests/functional/poolversion/Makefile + tests/zfs-tests/tests/functional/privilege/Makefile + tests/zfs-tests/tests/functional/procfs/Makefile + tests/zfs-tests/tests/functional/projectquota/Makefile + tests/zfs-tests/tests/functional/pyzfs/Makefile + tests/zfs-tests/tests/functional/quota/Makefile + tests/zfs-tests/tests/functional/raidz/Makefile + tests/zfs-tests/tests/functional/redacted_send/Makefile + tests/zfs-tests/tests/functional/redundancy/Makefile + 
tests/zfs-tests/tests/functional/refquota/Makefile + tests/zfs-tests/tests/functional/refreserv/Makefile + tests/zfs-tests/tests/functional/removal/Makefile + tests/zfs-tests/tests/functional/rename_dirs/Makefile + tests/zfs-tests/tests/functional/replacement/Makefile + tests/zfs-tests/tests/functional/reservation/Makefile + tests/zfs-tests/tests/functional/rootpool/Makefile + tests/zfs-tests/tests/functional/rsend/Makefile + tests/zfs-tests/tests/functional/scrub_mirror/Makefile + tests/zfs-tests/tests/functional/slog/Makefile + tests/zfs-tests/tests/functional/snapshot/Makefile + tests/zfs-tests/tests/functional/snapused/Makefile + tests/zfs-tests/tests/functional/sparse/Makefile + tests/zfs-tests/tests/functional/suid/Makefile + tests/zfs-tests/tests/functional/threadsappend/Makefile + tests/zfs-tests/tests/functional/tmpfile/Makefile + tests/zfs-tests/tests/functional/trim/Makefile + tests/zfs-tests/tests/functional/truncate/Makefile + tests/zfs-tests/tests/functional/upgrade/Makefile + tests/zfs-tests/tests/functional/user_namespace/Makefile + tests/zfs-tests/tests/functional/userquota/Makefile + tests/zfs-tests/tests/functional/vdev_zaps/Makefile + tests/zfs-tests/tests/functional/write_dirs/Makefile + tests/zfs-tests/tests/functional/xattr/Makefile + tests/zfs-tests/tests/functional/zvol/Makefile + tests/zfs-tests/tests/functional/zvol/zvol_ENOSPC/Makefile + tests/zfs-tests/tests/functional/zvol/zvol_cli/Makefile + tests/zfs-tests/tests/functional/zvol/zvol_misc/Makefile + tests/zfs-tests/tests/functional/zvol/zvol_swap/Makefile + tests/zfs-tests/tests/perf/Makefile + tests/zfs-tests/tests/perf/fio/Makefile + tests/zfs-tests/tests/perf/regression/Makefile + tests/zfs-tests/tests/perf/scripts/Makefile + tests/zfs-tests/tests/stress/Makefile + udev/Makefile + udev/rules.d/Makefile + zfs.release +]) + + +AC_OUTPUT diff --git a/contrib/Makefile.am b/contrib/Makefile.am new file mode 100644 index 000000000000..9547878d07d0 --- /dev/null +++ b/contrib/Makefile.am @@ -0,0 +1,8 @@ +SUBDIRS = bash_completion.d pyzfs zcp +if BUILD_LINUX +SUBDIRS += bpftrace dracut initramfs +endif +if PAM_ZFS_ENABLED +SUBDIRS += pam_zfs_key +endif +DIST_SUBDIRS = bash_completion.d bpftrace dracut initramfs pam_zfs_key pyzfs zcp diff --git a/contrib/bash_completion.d/Makefile.am b/contrib/bash_completion.d/Makefile.am new file mode 100644 index 000000000000..4f13af6b3c35 --- /dev/null +++ b/contrib/bash_completion.d/Makefile.am @@ -0,0 +1,5 @@ +bashcompletiondir = $(sysconfdir)/bash_completion.d + +noinst_DATA = zfs + +EXTRA_DIST = $(noinst_DATA) diff --git a/contrib/bash_completion.d/zfs b/contrib/bash_completion.d/zfs new file mode 100644 index 000000000000..094527340c80 --- /dev/null +++ b/contrib/bash_completion.d/zfs @@ -0,0 +1,481 @@ +# Copyright (c) 2010-2016, Aneurin Price + +# Permission is hereby granted, free of charge, to any person +# obtaining a copy of this software and associated documentation +# files (the "Software"), to deal in the Software without +# restriction, including without limitation the rights to use, +# copy, modify, merge, publish, distribute, sublicense, and/or sell +# copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following +# conditions: + +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. 
+ +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES +# OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT +# HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, +# WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +# OTHER DEALINGS IN THE SOFTWARE. + +if [[ -w /dev/zfs ]]; then + __ZFS_CMD="zfs" + __ZPOOL_CMD="zpool" +else + __ZFS_CMD="sudo zfs" + __ZPOOL_CMD="sudo zpool" +fi + +# Disable bash's built-in hostname completion, as this makes it impossible to +# provide completions containing an @-sign, which is necessary for completing +# snapshot names. If bash_completion is in use, this will already be disabled +# and replaced with better completions anyway. +shopt -u hostcomplete + +__zfs_get_commands() +{ + $__ZFS_CMD 2>&1 | awk '/^\t[a-z]/ {print $1}' | cut -f1 -d '|' | uniq +} + +__zfs_get_properties() +{ + $__ZFS_CMD get 2>&1 | awk '$2 == "YES" || $2 == "NO" {print $1}'; echo all name space +} + +__zfs_get_editable_properties() +{ + $__ZFS_CMD get 2>&1 | awk '$2 == "YES" {print $1"="}' +} + +__zfs_get_inheritable_properties() +{ + $__ZFS_CMD get 2>&1 | awk '$3 == "YES" {print $1}' +} + +__zfs_list_datasets() +{ + $__ZFS_CMD list -H -o name -s name -t filesystem,volume "$@" +} + +__zfs_list_filesystems() +{ + $__ZFS_CMD list -H -o name -s name -t filesystem +} + +__zfs_match_snapshot() +{ + local base_dataset=${cur%@*} + if [[ $base_dataset != $cur ]] + then + $__ZFS_CMD list -H -o name -s name -t snapshot -d 1 $base_dataset + else + if [[ $cur != "" ]] && __zfs_list_datasets $cur &> /dev/null + then + $__ZFS_CMD list -H -o name -s name -t filesystem -r $cur | tail -n +2 + # We output the base dataset name even though we might be + # completing a command that can only take a snapshot, because it + # prevents bash from considering the completion finished when it + # ends in the bare @. + echo $cur + echo $cur@ + else + local datasets=$(__zfs_list_datasets) + # As above + echo $datasets + if [[ "$cur" == */ ]] + then + # If the current command ends with a slash, then the only way + # it can be completed with a single tab press (ie. in this pass) + # is if it has exactly one child, so that's the only time we + # need to offer a suggestion with an @ appended. 
+ local num_children + # This is actually off by one as zfs list includes the named + # dataset in addition to its children + num_children=$(__zfs_list_datasets -d 1 ${cur%/} 2> /dev/null | wc -l) + if [[ $num_children != 2 ]] + then + return 0 + fi + fi + echo "$datasets" | awk '{print $1"@"}' + fi + fi +} + +__zfs_match_snapshot_or_bookmark() +{ + local base_dataset=${cur%[#@]*} + if [[ $base_dataset != $cur ]] + then + if [[ $cur == *@* ]] + then + $__ZFS_CMD list -H -o name -s name -t snapshot -d 1 $base_dataset + else + $__ZFS_CMD list -H -o name -s name -t bookmark -d 1 $base_dataset + fi + else + $__ZFS_CMD list -H -o name -s name -t filesystem,volume + if [[ $cur != "" ]] && $__ZFS_CMD list -H -o name -s name -t filesystem,volume $cur &> /dev/null + then + echo $cur@ + echo $cur# + fi + fi +} + +__zfs_match_multiple_snapshots() +{ + local existing_opts=$(expr "$cur" : '\(.*\)[%,]') + if [[ $existing_opts ]] + then + local base_dataset=${cur%@*} + if [[ $base_dataset != $cur ]] + then + local cur=${cur##*,} + if [[ $cur =~ ^%|%.*% ]] + then + # correct range syntax is start%end + return 1 + fi + local range_start=$(expr "$cur" : '\(.*%\)') + $__ZFS_CMD list -H -o name -s name -t snapshot -d 1 $base_dataset | sed 's$.*@$'$range_start'$g' + fi + else + __zfs_match_snapshot_or_bookmark + fi +} + +__zfs_list_volumes() +{ + $__ZFS_CMD list -H -o name -s name -t volume +} + +__zfs_argument_chosen() +{ + local word property + for word in $(seq $((COMP_CWORD-1)) -1 2) + do + local prev="${COMP_WORDS[$word]}" + if [[ ${COMP_WORDS[$word-1]} != -[tos] ]] + then + if [[ "$prev" == [^,]*,* ]] || [[ "$prev" == *[@:\#]* ]] + then + return 0 + fi + for property in $@ + do + if [[ $prev == "$property"* ]] + then + return 0 + fi + done + fi + done + return 1 +} + +__zfs_complete_ordered_arguments() +{ + local list1=$1 + local list2=$2 + local cur=$3 + local extra=$4 + if __zfs_argument_chosen $list1 + then + COMPREPLY=($(compgen -W "$list2 $extra" -- "$cur")) + else + COMPREPLY=($(compgen -W "$list1 $extra" -- "$cur")) + fi +} + +__zfs_complete_multiple_options() +{ + local options=$1 + local cur=$2 + + COMPREPLY=($(compgen -W "$options" -- "${cur##*,}")) + local existing_opts=$(expr "$cur" : '\(.*,\)') + if [[ $existing_opts ]] + then + COMPREPLY=( "${COMPREPLY[@]/#/${existing_opts}}" ) + fi +} + +__zfs_complete_switch() +{ + local options=$1 + if [[ ${cur:0:1} == - ]] + then + COMPREPLY=($(compgen -W "-{$options}" -- "$cur")) + return 0 + else + return 1 + fi +} + +__zfs_complete_nospace() +{ + # Google indicates that there may still be bash versions out there that + # don't have compopt. + if type compopt &> /dev/null + then + compopt -o nospace + fi +} + +__zfs_complete() +{ + local cur prev cmd cmds + COMPREPLY=() + if type _get_comp_words_by_ref &> /dev/null + then + # Don't split on colon + _get_comp_words_by_ref -n : -c cur -p prev -w COMP_WORDS -i COMP_CWORD + else + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + fi + cmd="${COMP_WORDS[1]}" + + if [[ ${prev##*/} == zfs ]] + then + cmds=$(__zfs_get_commands) + COMPREPLY=($(compgen -W "$cmds -?" -- "$cur")) + return 0 + fi + + case "${cmd}" in + bookmark) + if __zfs_argument_chosen + then + COMPREPLY=($(compgen -W "${prev%@*}# ${prev/@/#}" -- "$cur")) + else + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + ;; + clone) + case "${prev}" in + -o) + COMPREPLY=($(compgen -W "$(__zfs_get_editable_properties)" -- "$cur")) + __zfs_complete_nospace + ;; + *) + if ! 
__zfs_complete_switch "o,p" + then + if __zfs_argument_chosen + then + COMPREPLY=($(compgen -W "$(__zfs_list_datasets)" -- "$cur")) + else + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + fi + ;; + esac + ;; + get) + case "${prev}" in + -d) + COMPREPLY=($(compgen -W "" -- "$cur")) + ;; + -t) + __zfs_complete_multiple_options "filesystem volume snapshot bookmark all" "$cur" + ;; + -s) + __zfs_complete_multiple_options "local default inherited temporary received none" "$cur" + ;; + -o) + __zfs_complete_multiple_options "name property value source received all" "$cur" + ;; + *) + if ! __zfs_complete_switch "H,r,p,d,o,t,s" + then + if __zfs_argument_chosen $(__zfs_get_properties) + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + else + __zfs_complete_multiple_options "$(__zfs_get_properties)" "$cur" + fi + fi + ;; + esac + ;; + inherit) + if ! __zfs_complete_switch "r" + then + __zfs_complete_ordered_arguments "$(__zfs_get_inheritable_properties)" "$(__zfs_match_snapshot)" $cur + fi + ;; + list) + case "${prev}" in + -d) + COMPREPLY=($(compgen -W "" -- "$cur")) + ;; + -t) + __zfs_complete_multiple_options "filesystem volume snapshot bookmark all" "$cur" + ;; + -o) + __zfs_complete_multiple_options "$(__zfs_get_properties)" "$cur" + ;; + -s|-S) + COMPREPLY=($(compgen -W "$(__zfs_get_properties)" -- "$cur")) + ;; + *) + if ! __zfs_complete_switch "H,r,d,o,t,s,S" + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + ;; + esac + ;; + promote) + COMPREPLY=($(compgen -W "$(__zfs_list_filesystems)" -- "$cur")) + ;; + rollback) + if ! __zfs_complete_switch "r,R,f" + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + ;; + send) + if ! __zfs_complete_switch "D,n,P,p,R,v,e,L,i,I" + then + if __zfs_argument_chosen + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + else + if [[ $prev == -*i* ]] + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot_or_bookmark)" -- "$cur")) + else + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + fi + fi + fi + ;; + snapshot) + case "${prev}" in + -o) + COMPREPLY=($(compgen -W "$(__zfs_get_editable_properties)" -- "$cur")) + __zfs_complete_nospace + ;; + *) + if ! __zfs_complete_switch "o,r" + then + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + __zfs_complete_nospace + fi + ;; + esac + ;; + set) + __zfs_complete_ordered_arguments "$(__zfs_get_editable_properties)" "$(__zfs_match_snapshot)" $cur + __zfs_complete_nospace + ;; + upgrade) + case "${prev}" in + -a|-V|-v) + COMPREPLY=($(compgen -W "" -- "$cur")) + ;; + *) + if ! __zfs_complete_switch "a,V,v,r" + then + COMPREPLY=($(compgen -W "$(__zfs_list_filesystems)" -- "$cur")) + fi + ;; + esac + ;; + destroy) + if ! 
__zfs_complete_switch "d,f,n,p,R,r,v" + then + __zfs_complete_multiple_options "$(__zfs_match_multiple_snapshots)" $cur + __zfs_complete_nospace + fi + ;; + *) + COMPREPLY=($(compgen -W "$(__zfs_match_snapshot)" -- "$cur")) + ;; + esac + if type __ltrim_colon_completions &> /dev/null + then + __ltrim_colon_completions "$cur" + fi + return 0 +} + +__zpool_get_commands() +{ + $__ZPOOL_CMD 2>&1 | awk '/^\t[a-z]/ {print $1}' | uniq +} + +__zpool_get_properties() +{ + $__ZPOOL_CMD get 2>&1 | awk '$2 == "YES" || $2 == "NO" {print $1}'; echo all +} + +__zpool_get_editable_properties() +{ + $__ZPOOL_CMD get 2>&1 | awk '$2 == "YES" {print $1"="}' +} + +__zpool_list_pools() +{ + $__ZPOOL_CMD list -H -o name +} + +__zpool_complete() +{ + local cur prev cmd cmds + COMPREPLY=() + cur="${COMP_WORDS[COMP_CWORD]}" + prev="${COMP_WORDS[COMP_CWORD-1]}" + cmd="${COMP_WORDS[1]}" + + if [[ ${prev##*/} == zpool ]] + then + cmds=$(__zpool_get_commands) + COMPREPLY=($(compgen -W "$cmds" -- "$cur")) + return 0 + fi + + case "${cmd}" in + get) + __zfs_complete_ordered_arguments "$(__zpool_get_properties)" "$(__zpool_list_pools)" $cur + return 0 + ;; + import) + if [[ $prev == -d ]] + then + _filedir -d + else + COMPREPLY=($(compgen -W "$(__zpool_list_pools) -d" -- "$cur")) + fi + return 0 + ;; + set) + __zfs_complete_ordered_arguments "$(__zpool_get_editable_properties)" "$(__zpool_list_pools)" $cur + __zfs_complete_nospace + return 0 + ;; + add|attach|clear|create|detach|offline|online|remove|replace) + local pools="$(__zpool_list_pools)" + if __zfs_argument_chosen $pools + then + _filedir + else + COMPREPLY=($(compgen -W "$pools" -- "$cur")) + fi + return 0 + ;; + *) + COMPREPLY=($(compgen -W "$(__zpool_list_pools)" -- "$cur")) + return 0 + ;; + esac + +} + +complete -F __zfs_complete zfs +complete -F __zpool_complete zpool diff --git a/contrib/bpftrace/Makefile.am b/contrib/bpftrace/Makefile.am new file mode 100644 index 000000000000..327bc64425d8 --- /dev/null +++ b/contrib/bpftrace/Makefile.am @@ -0,0 +1,3 @@ +EXTRA_DIST = \ + taskqlatency.bt \ + zfs-trace.sh diff --git a/contrib/bpftrace/taskqlatency.bt b/contrib/bpftrace/taskqlatency.bt new file mode 100644 index 000000000000..598f9882b30d --- /dev/null +++ b/contrib/bpftrace/taskqlatency.bt @@ -0,0 +1,54 @@ +#include + +kprobe:trace_zfs_taskq_ent__birth +{ + $tqent = (struct taskq_ent *)arg0; + + $tqent_id = $tqent->tqent_id; + $tq_name = str($tqent->tqent_taskq->tq_name); + + @birth[$tq_name, $tqent_id] = nsecs; +} + +kprobe:trace_zfs_taskq_ent__start +{ + $tqent = (struct taskq_ent *)arg0; + + @tqent_id[tid] = $tqent->tqent_id; + @tq_name[tid] = str($tqent->tqent_taskq->tq_name); + + @start[@tq_name[tid], @tqent_id[tid]] = nsecs; +} + +kprobe:trace_zfs_taskq_ent__start +/ @birth[@tq_name[tid], @tqent_id[tid]] / +{ + @queue_lat_us[@tq_name[tid]] = + hist((nsecs - @birth[@tq_name[tid], @tqent_id[tid]])/1000); + delete(@birth[@tq_name[tid], @tqent_id[tid]]); +} + +kprobe:trace_zfs_taskq_ent__finish +/ @start[@tq_name[tid], @tqent_id[tid]] / +{ + $tqent = (struct taskq_ent *)arg0; + + @exec_lat_us[@tq_name[tid], ksym($tqent->tqent_func)] = + hist((nsecs - @start[@tq_name[tid], @tqent_id[tid]])/1000); + delete(@start[@tq_name[tid], @tqent_id[tid]]); +} + +kprobe:trace_zfs_taskq_ent__finish +{ + delete(@tq_name[tid]); + delete(@tqent_id[tid]); +} + +END +{ + clear(@birth); + clear(@start); + + clear(@tq_name); + clear(@tqent_id); +} diff --git a/contrib/bpftrace/zfs-trace.sh b/contrib/bpftrace/zfs-trace.sh new file mode 100755 index 000000000000..13230b78c3c3 
--- /dev/null +++ b/contrib/bpftrace/zfs-trace.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +ZVER=$(cat /sys/module/zfs/version | cut -f 1 -d '-') +KVER=$(uname -r) + +bpftrace \ + --include "/usr/src/zfs-$ZVER/$KVER/zfs_config.h" \ + -I "/usr/src/zfs-$ZVER/include" \ + -I "/usr/src/zfs-$ZVER/include/spl" \ + "$@" diff --git a/contrib/dracut/02zfsexpandknowledge/.gitignore b/contrib/dracut/02zfsexpandknowledge/.gitignore new file mode 100644 index 000000000000..7fb6b964f058 --- /dev/null +++ b/contrib/dracut/02zfsexpandknowledge/.gitignore @@ -0,0 +1 @@ +module-setup.sh diff --git a/contrib/dracut/02zfsexpandknowledge/Makefile.am b/contrib/dracut/02zfsexpandknowledge/Makefile.am new file mode 100644 index 000000000000..d31d389a0e5e --- /dev/null +++ b/contrib/dracut/02zfsexpandknowledge/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Substfiles.am + +pkgdracutdir = $(dracutdir)/modules.d/02zfsexpandknowledge +pkgdracut_SCRIPTS = \ + module-setup.sh + +SUBSTFILES += $(pkgdracut_SCRIPTS) diff --git a/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in b/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in new file mode 100755 index 000000000000..8429bd109411 --- /dev/null +++ b/contrib/dracut/02zfsexpandknowledge/module-setup.sh.in @@ -0,0 +1,136 @@ +#!/usr/bin/env bash + +get_devtype() { + local typ + typ=$(udevadm info --query=property --name="$1" | grep "^ID_FS_TYPE=" | sed 's|^ID_FS_TYPE=||') + if [ "$typ" = "" ] ; then + typ=$(blkid -c /dev/null "$1" -o value -s TYPE) + fi + echo "$typ" +} + +get_pool_devices() { + # also present in 99zfssystemd + local poolconfigtemp + local poolconfigoutput + local pooldev + local prefix + local resolved + poolconfigtemp=`mktemp` + @sbindir@/zpool list -v -H -P "$1" > "$poolconfigtemp" 2>&1 + if [ "$?" != "0" ] ; then + poolconfigoutput=$(cat "$poolconfigtemp") + dinfo "zfsexpandknowledge: pool $1 cannot be listed: $poolconfigoutput" + else + cat "$poolconfigtemp" | awk -F '\t' '/\t\/dev/ { print $2 }' | \ + while read pooldev ; do + if [ -n "$pooldev" -a -e "$pooldev" ] ; then + if [ -h "$pooldev" ] ; then + resolved=`readlink -f "$pooldev"` + else + resolved="$pooldev" + fi + dinfo "zfsexpandknowledge: pool $1 has device $pooldev (which resolves to $resolved)" + echo "$resolved" + fi + done + fi + rm -f "$poolconfigtemp" +} + +find_zfs_block_devices() { + local dev + local blockdev + local mp + local fstype + local pool + local key + local n + local poolconfigoutput + numfields=`head -1 /proc/self/mountinfo | awk '{print NF}'` + if [ "$numfields" == "10" ] ; then + fields="n n n n mp n n fstype dev n" + else + fields="n n n n mp n n n fstype dev n" + fi + while read $fields ; do + if [ "$fstype" != "zfs" ]; then continue ; fi + if [ "$mp" == "$1" ]; then + pool=$(echo "$dev" | cut -d / -f 1) + get_pool_devices "$pool" + fi + done < /proc/self/mountinfo +} + +array_contains () { + local e + for e in "${@:2}"; do [[ "$e" == "$1" ]] && return 0; done + return 1 +} + +check() { + local mp + local dev + local blockdevs + local fstype + local majmin + local _depdev + local _depdevname + local _depdevtype + local _depmajmin + local _dev + +if [[ $hostonly ]]; then + + for mp in \ + "/" \ + "/etc" \ + "/bin" \ + "/sbin" \ + "/lib" \ + "/lib64" \ + "/usr" \ + "/usr/bin" \ + "/usr/sbin" \ + "/usr/lib" \ + "/usr/lib64" \ + "/boot"; + do + mp=$(readlink -f "$mp") + mountpoint "$mp" >/dev/null 2>&1 || continue + blockdevs=$(find_zfs_block_devices "$mp") + if [ -z "$blockdevs" ] ; then continue ; fi + dinfo "zfsexpandknowledge: block devices backing ZFS 
dataset $mp: $blockdevs" + for dev in $blockdevs + do + array_contains "$dev" "${host_devs[@]}" || host_devs+=("$dev") + fstype=$(get_devtype "$dev") + host_fs_types["$dev"]="$fstype" + majmin=$(get_maj_min "$dev") + if [[ -d /sys/dev/block/$majmin/slaves ]] ; then + for _depdev in /sys/dev/block/$majmin/slaves/*; do + [[ -f $_depdev/dev ]] || continue + _depdev=/dev/$(basename "$_depdev") + _depdevname=$(udevadm info --query=property --name="$_depdev" | grep "^DEVNAME=" | sed 's|^DEVNAME=||') + _depdevtype=$(get_devtype "$_depdevname") + _depmajmin=$(get_maj_min "$_depdevname") + dinfo "zfsexpandknowledge: underlying block device backing ZFS dataset $mp: $_depdevname" + array_contains "$_depdevname" "${host_devs[@]}" || host_devs+=("$_depdevname") + host_fs_types["$_depdevname"]="$_depdevtype" + done + fi + done + done + for a in "${host_devs[@]}" + do + dinfo "zfsexpandknowledge: host device $a" + done + for a in "${!host_fs_types[@]}" + do + dinfo "zfsexpandknowledge: device $a of type ${host_fs_types[$a]}" + done + +fi + +return 1 +} diff --git a/contrib/dracut/90zfs/.gitignore b/contrib/dracut/90zfs/.gitignore new file mode 100644 index 000000000000..dce24393479b --- /dev/null +++ b/contrib/dracut/90zfs/.gitignore @@ -0,0 +1,11 @@ +export-zfs.sh +module-setup.sh +mount-zfs.sh +parse-zfs.sh +zfs-generator.sh +zfs-lib.sh +zfs-load-key.sh +zfs-needshutdown.sh +zfs-env-bootfs.service +zfs-snapshot-bootfs.service +zfs-rollback-bootfs.service diff --git a/contrib/dracut/90zfs/Makefile.am b/contrib/dracut/90zfs/Makefile.am new file mode 100644 index 000000000000..a4843827ede3 --- /dev/null +++ b/contrib/dracut/90zfs/Makefile.am @@ -0,0 +1,19 @@ +include $(top_srcdir)/config/Substfiles.am + +pkgdracutdir = $(dracutdir)/modules.d/90zfs +pkgdracut_SCRIPTS = \ + export-zfs.sh \ + module-setup.sh \ + mount-zfs.sh \ + parse-zfs.sh \ + zfs-generator.sh \ + zfs-load-key.sh \ + zfs-needshutdown.sh \ + zfs-lib.sh + +pkgdracut_DATA = \ + zfs-env-bootfs.service \ + zfs-snapshot-bootfs.service \ + zfs-rollback-bootfs.service + +SUBSTFILES += $(pkgdracut_SCRIPTS) $(pkgdracut_DATA) diff --git a/contrib/dracut/90zfs/export-zfs.sh.in b/contrib/dracut/90zfs/export-zfs.sh.in new file mode 100755 index 000000000000..09e4a3cc0e5e --- /dev/null +++ b/contrib/dracut/90zfs/export-zfs.sh.in @@ -0,0 +1,30 @@ +#!/bin/bash + +. /lib/dracut-zfs-lib.sh + +_do_zpool_export() { + ret=0 + errs="" + final="${1}" + + info "ZFS: Exporting ZFS storage pools..." + errs=$(export_all -F 2>&1) + ret=$? + [ -z "${errs}" ] || echo "${errs}" | vwarn + if [ "x${ret}" != "x0" ]; then + info "ZFS: There was a problem exporting pools." 
+ fi + + if [ "x${final}" != "x" ]; then + info "ZFS: pool list" + zpool list 2>&1 | vinfo + fi + + return ${ret} +} + +if command -v zpool >/dev/null; then + _do_zpool_export "${1}" +else + : +fi diff --git a/contrib/dracut/90zfs/module-setup.sh.in b/contrib/dracut/90zfs/module-setup.sh.in new file mode 100755 index 000000000000..7e7a96d6e0a9 --- /dev/null +++ b/contrib/dracut/90zfs/module-setup.sh.in @@ -0,0 +1,119 @@ +#!/usr/bin/env bash + +check() { + # We depend on udev-rules being loaded + [ "${1}" = "-d" ] && return 0 + + # Verify the zfs tool chain + for tool in "@sbindir@/zpool" "@sbindir@/zfs" "@mounthelperdir@/mount.zfs" ; do + test -x "$tool" || return 1 + done + # Verify grep exists + which grep >/dev/null 2>&1 || return 1 + + return 0 +} + +depends() { + echo udev-rules + return 0 +} + +installkernel() { + instmods zfs + instmods zcommon + instmods znvpair + instmods zavl + instmods zunicode + instmods zlua + instmods icp + instmods spl + instmods zlib_deflate + instmods zlib_inflate +} + +install() { + inst_rules @udevruledir@/90-zfs.rules + inst_rules @udevruledir@/69-vdev.rules + inst_rules @udevruledir@/60-zvol.rules + dracut_install hostid + dracut_install grep + dracut_install @sbindir@/zfs + dracut_install @sbindir@/zpool + # Workaround for zfsonlinux/zfs#4749 by ensuring libgcc_s.so(.1) is included + if [[ -n "$(ldd @sbindir@/zpool | grep -F 'libgcc_s.so')" ]]; then + # Dracut will have already tracked and included it + :; + elif command -v gcc-config 2>&1 1>/dev/null; then + # On systems with gcc-config (Gentoo, Funtoo, etc.): + # Use the current profile to resolve the appropriate path + dracut_install "/usr/lib/gcc/$(s=$(gcc-config -c); echo ${s%-*}/${s##*-})/libgcc_s.so.1" + elif [[ -n "$(ls /usr/lib/libgcc_s.so* 2>/dev/null)" ]]; then + # Try a simple path first + dracut_install /usr/lib/libgcc_s.so* + else + # Fallback: Guess the path and include all matches + dracut_install /usr/lib/gcc/*/*/libgcc_s.so* + fi + dracut_install @mounthelperdir@/mount.zfs + dracut_install @udevdir@/vdev_id + dracut_install awk + dracut_install basename + dracut_install cut + dracut_install head + dracut_install @udevdir@/zvol_id + inst_hook cmdline 95 "${moddir}/parse-zfs.sh" + if [ -n "$systemdutildir" ] ; then + inst_script "${moddir}/zfs-generator.sh" "$systemdutildir"/system-generators/dracut-zfs-generator + fi + inst_hook pre-mount 90 "${moddir}/zfs-load-key.sh" + inst_hook mount 98 "${moddir}/mount-zfs.sh" + inst_hook cleanup 99 "${moddir}/zfs-needshutdown.sh" + inst_hook shutdown 20 "${moddir}/export-zfs.sh" + + inst_simple "${moddir}/zfs-lib.sh" "/lib/dracut-zfs-lib.sh" + if [ -e @sysconfdir@/zfs/zpool.cache ]; then + inst @sysconfdir@/zfs/zpool.cache + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @sysconfdir@/zfs/zpool.cache + fi + + if [ -e @sysconfdir@/zfs/vdev_id.conf ]; then + inst @sysconfdir@/zfs/vdev_id.conf + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @sysconfdir@/zfs/vdev_id.conf + fi + + # Synchronize initramfs and system hostid + AA=`hostid | cut -b 1,2` + BB=`hostid | cut -b 3,4` + CC=`hostid | cut -b 5,6` + DD=`hostid | cut -b 7,8` + echo -ne "\\x${DD}\\x${CC}\\x${BB}\\x${AA}" > "${initdir}/etc/hostid" + + if dracut_module_included "systemd"; then + mkdir -p "${initdir}/$systemdsystemunitdir/zfs-import.target.wants" + for _item in scan cache ; do + dracut_install @systemdunitdir@/zfs-import-$_item.service + if ! 
[ -L "${initdir}/$systemdsystemunitdir/zfs-import.target.wants"/zfs-import-$_item.service ]; then + ln -s ../zfs-import-$_item.service "${initdir}/$systemdsystemunitdir/zfs-import.target.wants"/zfs-import-$_item.service + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @systemdunitdir@/zfs-import-$_item.service + fi + done + inst "${moddir}"/zfs-env-bootfs.service "${systemdsystemunitdir}"/zfs-env-bootfs.service + ln -s ../zfs-env-bootfs.service "${initdir}/${systemdsystemunitdir}/zfs-import.target.wants"/zfs-env-bootfs.service + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @systemdunitdir@/zfs-env-bootfs.service + dracut_install systemd-ask-password + dracut_install systemd-tty-ask-password-agent + mkdir -p "${initdir}/$systemdsystemunitdir/initrd.target.wants" + dracut_install @systemdunitdir@/zfs-import.target + if ! [ -L "${initdir}/$systemdsystemunitdir/initrd.target.wants"/zfs-import.target ]; then + ln -s ../zfs-import.target "${initdir}/$systemdsystemunitdir/initrd.target.wants"/zfs-import.target + type mark_hostonly >/dev/null 2>&1 && mark_hostonly @systemdunitdir@/zfs-import.target + fi + for _service in zfs-snapshot-bootfs.service zfs-rollback-bootfs.service ; do + inst "${moddir}"/$_service "${systemdsystemunitdir}"/$_service + if ! [ -L "${initdir}/$systemdsystemunitdir/initrd.target.wants"/$_service ]; then + ln -s ../$_service "${initdir}/$systemdsystemunitdir/initrd.target.wants"/$_service + fi + done + fi +} diff --git a/contrib/dracut/90zfs/mount-zfs.sh.in b/contrib/dracut/90zfs/mount-zfs.sh.in new file mode 100755 index 000000000000..f5b3d9056c17 --- /dev/null +++ b/contrib/dracut/90zfs/mount-zfs.sh.in @@ -0,0 +1,86 @@ +#!/bin/bash + +. /lib/dracut-zfs-lib.sh + +ZFS_DATASET="" +ZFS_POOL="" + +case "${root}" in + zfs:*) ;; + *) return ;; +esac + +GENERATOR_FILE=/run/systemd/generator/sysroot.mount +GENERATOR_EXTENSION=/run/systemd/generator/sysroot.mount.d/zfs-enhancement.conf + +if [ -e "$GENERATOR_FILE" ] && [ -e "$GENERATOR_EXTENSION" ] ; then + # If the ZFS sysroot.mount flag exists, the initial RAM disk configured + # it to mount ZFS on root. In that case, we bail early. This flag + # file gets created by the zfs-generator program upon successful run. + info "ZFS: There is a sysroot.mount and zfs-generator has extended it." + info "ZFS: Delegating root mount to sysroot.mount." + # Let us tell the initrd to run on shutdown. + # We have a shutdown hook to run + # because we imported the pool. + # We now prevent Dracut from running this thing again. + for zfsmounthook in "$hookdir"/mount/*zfs* ; do + if [ -f "$zfsmounthook" ] ; then + rm -f "$zfsmounthook" + fi + done + return +fi +info "ZFS: No sysroot.mount exists or zfs-generator did not extend it." +info "ZFS: Mounting root with the traditional mount-zfs.sh instead." + +# Delay until all required block devices are present. +modprobe zfs 2>/dev/null +udevadm settle + +if [ "${root}" = "zfs:AUTO" ] ; then + ZFS_DATASET="$(find_bootfs)" + if [ $? -ne 0 ] ; then + zpool import -N -a ${ZPOOL_IMPORT_OPTS} + ZFS_DATASET="$(find_bootfs)" + if [ $? -ne 0 ] ; then + warn "ZFS: No bootfs attribute found in importable pools." + export_all -F + + rootok=0 + return 1 + fi + fi + info "ZFS: Using ${ZFS_DATASET} as root." 
+fi + +ZFS_DATASET="${ZFS_DATASET:-${root#zfs:}}" +ZFS_POOL="${ZFS_DATASET%%/*}" + +if import_pool "${ZFS_POOL}" ; then + # Load keys if we can or if we need to + if [ $(zpool list -H -o feature@encryption $(echo "${ZFS_POOL}" | awk -F\/ '{print $1}')) = 'active' ]; then + # if the root dataset has encryption enabled + ENCRYPTIONROOT="$(zfs get -H -o value encryptionroot "${ZFS_DATASET}")" + if ! [ "${ENCRYPTIONROOT}" = "-" ]; then + KEYSTATUS="$(zfs get -H -o value keystatus "${ENCRYPTIONROOT}")" + # if the key needs to be loaded + if [ "$KEYSTATUS" = "unavailable" ]; then + # decrypt them + ask_for_password \ + --tries 5 \ + --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}: " \ + --cmd "zfs load-key '${ENCRYPTIONROOT}'" + fi + fi + fi + # Let us tell the initrd to run on shutdown. + # We have a shutdown hook to run + # because we imported the pool. + info "ZFS: Mounting dataset ${ZFS_DATASET}..." + if mount_dataset "${ZFS_DATASET}" ; then + ROOTFS_MOUNTED=yes + return 0 + fi +fi + +rootok=0 diff --git a/contrib/dracut/90zfs/parse-zfs.sh.in b/contrib/dracut/90zfs/parse-zfs.sh.in new file mode 100755 index 000000000000..6a805ae24a5a --- /dev/null +++ b/contrib/dracut/90zfs/parse-zfs.sh.in @@ -0,0 +1,66 @@ +#!/bin/bash + +. /lib/dracut-lib.sh + +# Let the command line override our host id. +spl_hostid=$(getarg spl_hostid=) +if [ -n "${spl_hostid}" ] ; then + info "ZFS: Using hostid from command line: ${spl_hostid}" + AA=$(echo "${spl_hostid}" | cut -b 1,2) + BB=$(echo "${spl_hostid}" | cut -b 3,4) + CC=$(echo "${spl_hostid}" | cut -b 5,6) + DD=$(echo "${spl_hostid}" | cut -b 7,8) + echo -ne "\\x${DD}\\x${CC}\\x${BB}\\x${AA}" >/etc/hostid +elif [ -f "/etc/hostid" ] ; then + info "ZFS: Using hostid from /etc/hostid: $(hostid)" +else + warn "ZFS: No hostid found on kernel command line or /etc/hostid." + warn "ZFS: Pools may not import correctly." +fi + +wait_for_zfs=0 +case "${root}" in + ""|zfs|zfs:) + # We'll take root unset, root=zfs, or root=zfs: + # No root set, so we want to read the bootfs attribute. We + # can't do that until udev settles so we'll set dummy values + # and hope for the best later on. + root="zfs:AUTO" + rootok=1 + wait_for_zfs=1 + + info "ZFS: Enabling autodetection of bootfs after udev settles." + ;; + + ZFS\=*|zfs:*|zfs:FILESYSTEM\=*|FILESYSTEM\=*) + # root is explicit ZFS root. Parse it now. We can handle + # a root=... param in any of the following formats: + # root=ZFS=rpool/ROOT + # root=zfs:rpool/ROOT + # root=zfs:FILESYSTEM=rpool/ROOT + # root=FILESYSTEM=rpool/ROOT + # root=ZFS=pool+with+space/ROOT+WITH+SPACE (translates to root=ZFS=pool with space/ROOT WITH SPACE) + + # Strip down to just the pool/fs + root="${root#zfs:}" + root="${root#FILESYSTEM=}" + root="zfs:${root#ZFS=}" + # switch + with spaces because kernel cmdline does not allow us to quote parameters + root=$(printf '%s\n' "$root" | sed "s/+/ /g") + rootok=1 + wait_for_zfs=1 + + info "ZFS: Set ${root} as bootfs." + ;; +esac + +# Make sure Dracut is happy that we have a root and will wait for ZFS +# modules to settle before mounting. 
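+# (A note on the mechanism: the hook registered below is the one-line test
+# '[ -e /dev/zfs ]'; Dracut's initqueue re-runs it until it succeeds, i.e.
+# until the ZFS module is loaded and udev has created /dev/zfs.)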
+if [ ${wait_for_zfs} -eq 1 ]; then + ln -s /dev/null /dev/root 2>/dev/null + initqueuedir="${hookdir}/initqueue/finished" + test -d "${initqueuedir}" || { + initqueuedir="${hookdir}/initqueue-finished" + } + echo '[ -e /dev/zfs ]' > "${initqueuedir}/zfs.sh" +fi diff --git a/contrib/dracut/90zfs/zfs-env-bootfs.service.in b/contrib/dracut/90zfs/zfs-env-bootfs.service.in new file mode 100644 index 000000000000..3cdf69100d4b --- /dev/null +++ b/contrib/dracut/90zfs/zfs-env-bootfs.service.in @@ -0,0 +1,14 @@ +[Unit] +Description=Set BOOTFS environment for dracut +Documentation=man:zpool(8) +DefaultDependencies=no +After=zfs-import-cache.service +After=zfs-import-scan.service +Before=zfs-import.target + +[Service] +Type=oneshot +ExecStart=/bin/sh -c "/bin/systemctl set-environment BOOTFS=$(@sbindir@/zpool list -H -o bootfs | grep -m1 -v '^-$')" + +[Install] +WantedBy=zfs-import.target diff --git a/contrib/dracut/90zfs/zfs-generator.sh.in b/contrib/dracut/90zfs/zfs-generator.sh.in new file mode 100755 index 000000000000..120b9ecf957e --- /dev/null +++ b/contrib/dracut/90zfs/zfs-generator.sh.in @@ -0,0 +1,61 @@ +#!/usr/bin/env bash + +echo "zfs-generator: starting" >> /dev/kmsg + +GENERATOR_DIR="$1" +[ -n "$GENERATOR_DIR" ] || { + echo "zfs-generator: no generator directory specified, exiting" >> /dev/kmsg + exit 1 +} + +[ -f /lib/dracut-lib.sh ] && dracutlib=/lib/dracut-lib.sh +[ -f /usr/lib/dracut/modules.d/99base/dracut-lib.sh ] && dracutlib=/usr/lib/dracut/modules.d/99base/dracut-lib.sh + +type getarg >/dev/null 2>&1 || { + echo "zfs-generator: loading Dracut library from $dracutlib" >> /dev/kmsg + . "$dracutlib" +} + +[ -z "$root" ] && root=$(getarg root=) +[ -z "$rootfstype" ] && rootfstype=$(getarg rootfstype=) +[ -z "$rootflags" ] && rootflags=$(getarg rootflags=) + +# If root is not ZFS= or zfs: or rootfstype is not zfs +# then we are not supposed to handle it. +[ "${root##zfs:}" = "${root}" -a "${root##ZFS=}" = "${root}" -a "$rootfstype" != "zfs" ] && exit 0 + +rootfstype=zfs +if echo "${rootflags}" | grep -Eq '^zfsutil$|^zfsutil,|,zfsutil$|,zfsutil,' ; then + true +elif test -n "${rootflags}" ; then + rootflags="zfsutil,${rootflags}" +else + rootflags=zfsutil +fi + +echo "zfs-generator: writing extension for sysroot.mount to $GENERATOR_DIR"/sysroot.mount.d/zfs-enhancement.conf >> /dev/kmsg + +[ -d "$GENERATOR_DIR" ] || mkdir "$GENERATOR_DIR" +[ -d "$GENERATOR_DIR"/sysroot.mount.d ] || mkdir "$GENERATOR_DIR"/sysroot.mount.d + +{ + echo "[Unit]" + echo "Before=initrd-root-fs.target" + echo "After=zfs-import.target" + echo "[Mount]" + if [ "${root}" = "zfs:AUTO" ] ; then + echo "PassEnvironment=BOOTFS" + echo 'What=${BOOTFS}' + else + root="${root##zfs:}" + root="${root##ZFS=}" + echo "What=${root}" + fi + echo "Type=${rootfstype}" + echo "Options=${rootflags}" +} > "$GENERATOR_DIR"/sysroot.mount.d/zfs-enhancement.conf + +[ -d "$GENERATOR_DIR"/initrd-root-fs.target.requires ] || mkdir -p "$GENERATOR_DIR"/initrd-root-fs.target.requires +ln -s ../sysroot.mount "$GENERATOR_DIR"/initrd-root-fs.target.requires/sysroot.mount + +echo "zfs-generator: finished" >> /dev/kmsg \ No newline at end of file diff --git a/contrib/dracut/90zfs/zfs-lib.sh.in b/contrib/dracut/90zfs/zfs-lib.sh.in new file mode 100755 index 000000000000..f470bfcc54ae --- /dev/null +++ b/contrib/dracut/90zfs/zfs-lib.sh.in @@ -0,0 +1,174 @@ +#!/bin/bash + +command -v getarg >/dev/null || . /lib/dracut-lib.sh +command -v getargbool >/dev/null || { + # Compatibility with older Dracut versions. 
+ # With apologies to the Dracut developers. + getargbool() { + if ! [ -z "$_b" ]; then + unset _b + fi + _default="$1"; shift + _b=$(getarg "$@") + [ $? -ne 0 ] && [ -z "$_b" ] && _b="$_default" + if [ -n "$_b" ]; then + [ "$_b" = "0" ] && return 1 + [ "$_b" = "no" ] && return 1 + [ "$_b" = "off" ] && return 1 + fi + return 0 + } +} + +OLDIFS="${IFS}" +NEWLINE=" +" + +ZPOOL_IMPORT_OPTS="" +if getargbool 0 zfs_force -y zfs.force -y zfsforce ; then + warn "ZFS: Will force-import pools if necessary." + ZPOOL_IMPORT_OPTS="${ZPOOL_IMPORT_OPTS} -f" +fi + +# find_bootfs +# returns the first dataset with the bootfs attribute. +find_bootfs() { + IFS="${NEWLINE}" + for dataset in $(zpool list -H -o bootfs); do + case "${dataset}" in + "" | "-") + continue + ;; + "no pools available") + IFS="${OLDIFS}" + return 1 + ;; + *) + IFS="${OLDIFS}" + echo "${dataset}" + return 0 + ;; + esac + done + + IFS="${OLDIFS}" + return 1 +} + +# import_pool POOL +# imports the given zfs pool if it isn't imported already. +import_pool() { + pool="${1}" + + if ! zpool list -H "${pool}" > /dev/null 2>&1; then + info "ZFS: Importing pool ${pool}..." + if ! zpool import -N ${ZPOOL_IMPORT_OPTS} "${pool}" ; then + warn "ZFS: Unable to import pool ${pool}" + return 1 + fi + fi + + return 0 +} + +# mount_dataset DATASET +# mounts the given zfs dataset. +mount_dataset() { + dataset="${1}" + mountpoint="$(zfs get -H -o value mountpoint "${dataset}")" + + # We need zfsutil for non-legacy mounts and not for legacy mounts. + if [ "${mountpoint}" = "legacy" ] ; then + mount -t zfs "${dataset}" "${NEWROOT}" + else + mount -o zfsutil -t zfs "${dataset}" "${NEWROOT}" + fi + + return $? +} + +# export_all OPTS +# exports all imported zfs pools. +export_all() { + opts="${@}" + ret=0 + + IFS="${NEWLINE}" + for pool in $(zpool list -H -o name) ; do + if zpool list -H "${pool}" > /dev/null 2>&1; then + zpool export "${pool}" ${opts} || ret=$? + fi + done + IFS="${OLDIFS}" + + return ${ret} +} + +# ask_for_password +# +# Wraps around plymouth ask-for-password and adds fallback to tty password ask +# if plymouth is not present. +# +# --cmd command +# Command to execute. Required. +# --prompt prompt +# Password prompt. Note that function already adds ':' at the end. +# Recommended. +# --tries n +# How many times repeat command on its failure. Default is 3. +# --ply-[cmd|prompt|tries] +# Command/prompt/tries specific for plymouth password ask only. +# --tty-[cmd|prompt|tries] +# Command/prompt/tries specific for tty password ask only. +# --tty-echo-off +# Turn off input echo before tty command is executed and turn on after. +# It's useful when password is read from stdin. +ask_for_password() { + ply_tries=3 + tty_tries=3 + while [ "$#" -gt 0 ]; do + case "$1" in + --cmd) ply_cmd="$2"; tty_cmd="$2"; shift;; + --ply-cmd) ply_cmd="$2"; shift;; + --tty-cmd) tty_cmd="$2"; shift;; + --prompt) ply_prompt="$2"; tty_prompt="$2"; shift;; + --ply-prompt) ply_prompt="$2"; shift;; + --tty-prompt) tty_prompt="$2"; shift;; + --tries) ply_tries="$2"; tty_tries="$2"; shift;; + --ply-tries) ply_tries="$2"; shift;; + --tty-tries) tty_tries="$2"; shift;; + --tty-echo-off) tty_echo_off=yes;; + esac + shift + done + + { flock -s 9; + # Prompt for password with plymouth, if installed and running. + if type plymouth >/dev/null 2>&1 && plymouth --ping 2>/dev/null; then + plymouth ask-for-password \ + --prompt "$ply_prompt" --number-of-tries="$ply_tries" \ + --command="$ply_cmd" + ret=$? 
+ else + if [ "$tty_echo_off" = yes ]; then + stty_orig="$(stty -g)" + stty -echo + fi + + i=1 + while [ "$i" -le "$tty_tries" ]; do + [ -n "$tty_prompt" ] && \ + printf "%s [%i/%i]:" "$tty_prompt" "$i" "$tty_tries" >&2 + eval "$tty_cmd" && ret=0 && break + ret=$? + i=$((i+1)) + [ -n "$tty_prompt" ] && printf '\n' >&2 + done + unset i + [ "$tty_echo_off" = yes ] && stty "$stty_orig" + fi + } 9>/.console_lock + + [ $ret -ne 0 ] && echo "Wrong password" >&2 + return $ret +} diff --git a/contrib/dracut/90zfs/zfs-load-key.sh.in b/contrib/dracut/90zfs/zfs-load-key.sh.in new file mode 100755 index 000000000000..ff586ef654b8 --- /dev/null +++ b/contrib/dracut/90zfs/zfs-load-key.sh.in @@ -0,0 +1,58 @@ +#!/bin/bash + +# only run this on systemd systems, we handle the decrypt in mount-zfs.sh in the mount hook otherwise +[ -e /bin/systemctl ] || return 0 + +# This script only gets executed on systemd systems, see mount-zfs.sh for non-systemd systems + +# import the libs now that we know the pool imported +[ -f /lib/dracut-lib.sh ] && dracutlib=/lib/dracut-lib.sh +[ -f /usr/lib/dracut/modules.d/99base/dracut-lib.sh ] && dracutlib=/usr/lib/dracut/modules.d/99base/dracut-lib.sh +# shellcheck source=./lib-zfs.sh.in +. "$dracutlib" + +# load the kernel command line vars +[ -z "$root" ] && root="$(getarg root=)" +# If root is not ZFS= or zfs: or rootfstype is not zfs then we are not supposed to handle it. +[ "${root##zfs:}" = "${root}" ] && [ "${root##ZFS=}" = "${root}" ] && [ "$rootfstype" != "zfs" ] && exit 0 + +# There is a race between the zpool import and the pre-mount hooks, so we wait for a pool to be imported +while true; do + zpool list -H | grep -q -v '^$' && break + [ "$(systemctl is-failed zfs-import-cache.service)" = 'failed' ] && exit 1 + [ "$(systemctl is-failed zfs-import-scan.service)" = 'failed' ] && exit 1 + sleep 0.1s +done + +# run this after import as zfs-import-cache/scan service is confirmed good +# we do not overwrite the ${root} variable, but create a new one, BOOTFS, to hold the dataset +if [ "${root}" = "zfs:AUTO" ] ; then + BOOTFS="$(zpool list -H -o bootfs | awk '$1 != "-" {print; exit}')" +else + BOOTFS="${root##zfs:}" + BOOTFS="${BOOTFS##ZFS=}" +fi + +# if pool encryption is active and the zfs command understands '-o encryption' +if [ "$(zpool list -H -o feature@encryption $(echo "${BOOTFS}" | awk -F\/ '{print $1}'))" = 'active' ]; then + # if the root dataset has encryption enabled + ENCRYPTIONROOT=$(zfs get -H -o value encryptionroot "${BOOTFS}") + # where the key is stored (in a file or loaded via prompt) + KEYLOCATION=$(zfs get -H -o value keylocation "${ENCRYPTIONROOT}") + if ! [ "${ENCRYPTIONROOT}" = "-" ]; then + KEYSTATUS="$(zfs get -H -o value keystatus "${ENCRYPTIONROOT}")" + # continue only if the key needs to be loaded + [ "$KEYSTATUS" = "unavailable" ] || exit 0 + # if key is stored in a file, do not prompt + if ! [ "${KEYLOCATION}" = "prompt" ]; then + zfs load-key "${ENCRYPTIONROOT}" + else + # decrypt them + TRY_COUNT=5 + while [ $TRY_COUNT -gt 0 ]; do + systemd-ask-password "Encrypted ZFS password for ${BOOTFS}" --no-tty | zfs load-key "${ENCRYPTIONROOT}" && break + TRY_COUNT=$((TRY_COUNT - 1)) + done + fi + fi +fi diff --git a/contrib/dracut/90zfs/zfs-needshutdown.sh.in b/contrib/dracut/90zfs/zfs-needshutdown.sh.in new file mode 100755 index 000000000000..ddd3edae0014 --- /dev/null +++ b/contrib/dracut/90zfs/zfs-needshutdown.sh.in @@ -0,0 +1,10 @@ +#!/bin/bash + +type getarg >/dev/null 2>&1 || . 
/lib/dracut-lib.sh
+
+if zpool list 2>&1 | grep -q 'no pools available' ; then
+	info "ZFS: No active pools, no need to export anything."
+else
+	info "ZFS: There is an active pool, will export it."
+	need_shutdown
+fi
diff --git a/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
new file mode 100644
index 000000000000..4b058c1b8c9a
--- /dev/null
+++ b/contrib/dracut/90zfs/zfs-rollback-bootfs.service.in
@@ -0,0 +1,14 @@
+[Unit]
+Description=Rollback bootfs just before it is mounted
+Requisite=zfs-import.target
+After=zfs-import.target zfs-snapshot-bootfs.service
+Before=dracut-mount.service
+DefaultDependencies=no
+ConditionKernelCommandLine=bootfs.rollback
+
+[Service]
+# ${BOOTFS} should have been set by zfs-env-bootfs.service
+Type=oneshot
+ExecStartPre=/bin/sh -c 'test -n "${BOOTFS}"'
+ExecStart=/bin/sh -c '. /lib/dracut-lib.sh; SNAPNAME="$(getarg bootfs.rollback)"; /sbin/zfs rollback -Rf "${BOOTFS}@${SNAPNAME:-%v}"'
+RemainAfterExit=yes
diff --git a/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in b/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in
new file mode 100644
index 000000000000..cfd5f7029f06
--- /dev/null
+++ b/contrib/dracut/90zfs/zfs-snapshot-bootfs.service.in
@@ -0,0 +1,14 @@
+[Unit]
+Description=Snapshot bootfs just before it is mounted
+Requisite=zfs-import.target
+After=zfs-import.target
+Before=dracut-mount.service
+DefaultDependencies=no
+ConditionKernelCommandLine=bootfs.snapshot
+
+[Service]
+# ${BOOTFS} should have been set by zfs-env-bootfs.service
+Type=oneshot
+ExecStartPre=/bin/sh -c 'test -n "${BOOTFS}"'
+ExecStart=-/bin/sh -c '. /lib/dracut-lib.sh; SNAPNAME="$(getarg bootfs.snapshot)"; /sbin/zfs snapshot "${BOOTFS}@${SNAPNAME:-%v}"'
+RemainAfterExit=yes
diff --git a/contrib/dracut/Makefile.am b/contrib/dracut/Makefile.am
new file mode 100644
index 000000000000..1065e5e94f0e
--- /dev/null
+++ b/contrib/dracut/Makefile.am
@@ -0,0 +1,3 @@
+SUBDIRS = 02zfsexpandknowledge 90zfs
+
+EXTRA_DIST = README.dracut.markdown
diff --git a/contrib/dracut/README.dracut.markdown b/contrib/dracut/README.dracut.markdown
new file mode 100644
index 000000000000..f31543c3cf6b
--- /dev/null
+++ b/contrib/dracut/README.dracut.markdown
@@ -0,0 +1,225 @@
+How to set up a ZFS root filesystem using dracut
+-----------------------------------------------
+
+1) Install the zfs-dracut package. This package adds a ZFS dracut module
+to the /usr/share/dracut/modules.d/ directory, which allows dracut to
+create an initramfs that is ZFS-aware.
+
+2) Set the bootfs property for the bootable dataset in the pool. Then set
+the dataset mountpoint property to '/'.
+
+    $ zpool set bootfs=pool/dataset pool
+    $ zfs set mountpoint=/ pool/dataset
+
+Alternatively, legacy mountpoints can be used by setting the 'root=' option
+on the kernel line of your grub.conf/menu.lst configuration file. Then
+set the dataset mountpoint property to 'legacy'.
+
+    $ grub.conf/menu.lst: kernel ... root=ZFS=pool/dataset
+    $ zfs set mountpoint=legacy pool/dataset
+
+3) To set ZFS module options, put them in the /etc/modprobe.d/zfs.conf file.
+The complete list of ZFS module options is available by running the
+_modinfo zfs_ command. Commonly set options include: zfs_arc_min,
+zfs_arc_max, zfs_prefetch_disable, and zfs_vdev_max_pending.
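+
+For example, a minimal /etc/modprobe.d/zfs.conf that caps the ARC at 4 GiB
+(an illustrative value only; choose one appropriate for your system):
+
+    options zfs zfs_arc_max=4294967296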
+
+4) Finally, create your new initramfs by running dracut.
+
+    $ dracut --force /path/to/initramfs kernel_version
+
+Kernel Command Line
+-------------------
+
+The initramfs' behavior is influenced by the following kernel command line
+parameters passed in from the boot loader:
+
+* `root=...`: If not set, importable pools are searched for a bootfs
+attribute. If an explicitly set root is desired, you may use
+`root=ZFS=pool/dataset` or `root=zfs:pool/dataset`.
+
+* `zfs_force=0|1`: If set to 1, the initramfs will run `zpool import -f` when
+attempting to import pools if the required pool isn't automatically imported
+by the zfs module. This can save you a trip to a boot CD if the hostid has
+changed, but is dangerous and can lead to zpool corruption, particularly in
+cases where storage is on a shared fabric such as iSCSI where multiple hosts
+can access storage devices concurrently. _Please understand the implications
+of force-importing a pool before enabling this option!_
+
+* `spl_hostid`: By default, the hostid used by the SPL module is read from
+/etc/hostid inside the initramfs. This file is placed there from the host
+system when the initramfs is built, which effectively ties the ramdisk to the
+host that builds it. If a different hostid is desired, one may be set in
+this attribute and will override any file present in the ramdisk. The
+format should be hex exactly as found in the `/etc/hostid` file, e.g.
+`spl_hostid=0x00bab10c`.
+
+Note that changing the hostid between boots will most likely lead to an
+un-importable pool since the last importing hostid won't match. In order
+to recover from this, you may use the `zfs_force` option or boot from a
+different filesystem and `zpool import -f` then `zpool export` the pool
+before rebooting with the new hostid.
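+
+A minimal sketch of that recovery procedure, run from rescue media (the pool
+name rpool is an assumption; substitute your own):
+
+    $ zpool import -f rpool
+    $ zpool export rpool
+    $ reboot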
+
+* `bootfs.snapshot`: If listed, enables the zfs-snapshot-bootfs service on a Dracut system. The zfs-snapshot-bootfs service simply runs `zfs snapshot $BOOTFS@%v` after the pool has been imported but before the bootfs is mounted. `$BOOTFS` is substituted with the value of the bootfs setting on the pool. `%v` is substituted with the version string of the kernel currently being booted (e.g. 5.6.6-200.fc31.x86\_64). Failure to create the snapshot (e.g. because one with the same name already exists) will be logged, but will not otherwise interrupt the boot process.
+
+  It is safe to leave the bootfs.snapshot flag set persistently on your kernel command line so that a new snapshot of your bootfs will be created on every kernel update. If you leave bootfs.snapshot set persistently on your kernel command line, you may find the script below helpful for automatically removing old snapshots of the bootfs along with their associated kernel.
+
+        #!/bin/sh
+
+        if [ "$1" = "remove" ] && grep -q "\bbootfs.snapshot\b" /proc/cmdline; then
+            zfs destroy "$(findmnt -n -o source /)@$2" > /dev/null 2>&1
+        fi
+
+        exit 0
+
+  To use the above script, place it in a plain text file named /etc/kernel/install.d/99-zfs-cleanup.install and mark it executable with the following command:
+
+        $ chmod +x /etc/kernel/install.d/99-zfs-cleanup.install
+
+  On Red Hat based systems, you can change the value of `installonly_limit` in /etc/dnf/dnf.conf to adjust the number of kernels and their associated snapshots that are kept.
+
+* `bootfs.snapshot=<snapname>`: Identical to the bootfs.snapshot parameter explained above except that the value substituted for `<snapname>` will be used when creating the snapshot instead of the version string of the kernel currently being booted.
+
+* `bootfs.rollback`: If listed, enables the zfs-rollback-bootfs service on a Dracut system. The zfs-rollback-bootfs service simply runs `zfs rollback -Rf $BOOTFS@%v` after the pool has been imported but before the bootfs is mounted. If the rollback operation fails, the boot process will be interrupted with a Dracut rescue shell. __Use this parameter with caution. Intermediate snapshots of the bootfs will be destroyed!__ TIP: Keep your user data (e.g. /home) on separate file systems (it can be in the same pool though).
+
+* `bootfs.rollback=<snapname>`: Identical to the bootfs.rollback parameter explained above except that the value substituted for `<snapname>` will be used when rolling back the bootfs instead of the version string of the kernel currently being booted. If you use this form, choose a snapshot that is new enough to contain the needed kernel modules under /lib/modules or use a kernel that has all the needed modules built-in.
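+
+For example, a hypothetical kernel command line combining several of the
+parameters above (the pool name and hostid are placeholders):
+
+    root=ZFS=rpool/ROOT spl_hostid=0x00bab10c bootfs.snapshot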
+
+How it Works
+============
+
+The Dracut module consists of the following files (less Makefiles):
+
+* `module-setup.sh`: Script run by the initramfs builder to create the
+ramdisk. Contains instructions on which files are required by the modules
+and z* programs. Also triggers inclusion of `/etc/hostid` and the zpool
+cache. This file is not included in the initramfs.
+
+* `90-zfs.rules`: udev rules which trigger loading of the ZFS modules at boot.
+
+* `zfs-lib.sh`: Utility functions used by the other files.
+
+* `parse-zfs.sh`: Run early in the initramfs boot process to parse the kernel
+command line and determine if ZFS is the active root filesystem.
+
+* `mount-zfs.sh`: Run later in the initramfs boot process after udev has
+settled to mount the root dataset.
+
+* `export-zfs.sh`: Run on shutdown after dracut has restored the initramfs
+and pivoted to it, allowing for a clean unmount and export of the ZFS root.
+
+`zfs-lib.sh`
+------------
+
+This file provides a few handy functions for working with ZFS. Those
+functions are used by the `mount-zfs.sh` and `export-zfs.sh` files.
+However, they could be used by any other file as well, as long as the file
+sources `/lib/dracut-zfs-lib.sh`.
+
+`module-setup.sh`
+-----------------
+
+This file is run by the Dracut script within the live system, not at boot
+time. It's not included in the final initramfs. Functions in this script
+describe which files are needed by ZFS at boot time.
+
+Currently all the various z* and spl modules are included, a dependency is
+asserted on udev-rules, and the various zfs, zpool, etc. helpers are included.
+Dracut provides library functions which automatically gather the shared libs
+necessary to run each of these binaries, so statically built binaries are
+not required.
+
+The zpool and zvol udev rules files are copied from where they are
+installed by the ZFS build. __PACKAGERS TAKE NOTE__: If you move
+`/etc/udev/rules.d/60-z*.rules`, you'll need to update this file to match.
+
+Currently this file also includes `/etc/hostid` and `/etc/zfs/zpool.cache`,
+which means the generated ramdisk is specific to the host system that built
+it. If a generic initramfs is required, it may be preferable to omit these
+files and specify the `spl_hostid` from the boot loader instead.
+
+`parse-zfs.sh`
+--------------
+
+Run during the cmdline phase of the initramfs boot process, this script
+performs some basic sanity checks on kernel command line parameters to
+determine if booting from ZFS is likely to be what is desired. Dracut
+requires this script to adjust the `root` variable if required and to set
+`rootok=1` if a mountable root filesystem is available. Unfortunately this
+script must run before udev is settled and kernel modules are known to be
+loaded, so accessing the zpool and zfs commands is unsafe.
+
+If the root=ZFS... parameter is set on the command line, then it's at least
+certain that ZFS is what is desired, though this script is unable to
+determine if ZFS is in fact available. This script will alter the `root`
+parameter to replace several historical forms of specifying the pool and
+dataset name with the canonical form of `zfs:pool/dataset`.
+
+If no root= parameter is set, the best this script can do is guess that
+ZFS is desired. At present, no other known filesystems will work with no
+root= parameter, though this might possibly interfere with using the
+compiled-in default root in the kernel image. It's considered unlikely
+that would ever be the case when an initramfs is in use, so this script
+sets `root=zfs:AUTO` and hopes for the best.
+
+Once the root=... (or lack thereof) parameter is parsed, a dummy symlink
+is created from `/dev/root` -> `/dev/null` to satisfy parts of the Dracut
+process which check for presence of a single root device node.
+
+Finally, an initqueue/finished hook is registered which causes the initqueue
+phase of Dracut to wait for `/dev/zfs` to become available before attempting
+to mount anything.
+
+`mount-zfs.sh`
+--------------
+
+This script is run after udev has settled and all tasks in the initqueue
+have succeeded. This ensures that `/dev/zfs` is available and that the
+various ZFS modules are successfully loaded. As it is now safe to call
+zpool and friends, we can proceed to find the bootfs attribute if necessary.
+
+If the root parameter was explicitly set on the command line, no parsing is
+necessary. The list of imported pools is checked to see if the desired pool
+is already imported. If it's not, an attempt is made to import the pool
+explicitly, though no force is attempted. Finally, the specified dataset
+is mounted on `$NEWROOT`, first using the `-o zfsutil` option to handle
+non-legacy mounts, then if that fails, without zfsutil to handle legacy
+mount points.
+
+If no root parameter was specified, this script attempts to find a pool with
+its bootfs attribute set. First, already-imported pools are scanned and if
+an appropriate pool is found, no additional pools are imported. If no pool
+with bootfs is found, any additional pools in the system are imported with
+`zpool import -N -a`, and the scan for bootfs is tried again. If no bootfs
+is found with all pools imported, all pools are re-exported, and boot fails.
+Assuming a bootfs is found, an attempt is made to mount it to `$NEWROOT`,
+first with, then without the zfsutil option as above.
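+
+A compact sketch of that fallback strategy (the dataset name is an
+assumption; `$NEWROOT` is Dracut's mount target):
+
+    mount -o zfsutil -t zfs rpool/ROOT "$NEWROOT" || \
+        mount -t zfs rpool/ROOT "$NEWROOT"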
+
+Ordinarily pools are imported _without_ the force option which may cause
+boot to fail if the hostid has changed or a pool has been physically moved
+between servers. The `zfs_force` kernel parameter is provided which when
+set to `1` causes `zpool import` to be run with the `-f` flag. Forcing pool
+import can lead to serious data corruption and loss of pools, so this option
+should be used with extreme caution. Note that even with this flag set, if
+the required zpool was auto-imported by the kernel module, no additional
+`zpool import` commands are run, so nothing is forced.
+
+`export-zfs.sh`
+---------------
+
+Normally the zpool containing the root dataset cannot be exported on
+shutdown as it is still in use by the init process. To work around this,
+Dracut is able to restore the initramfs on shutdown and pivot to it.
+All remaining processes are then running from a ramdisk, allowing for a
+clean unmount and export of the ZFS root. The theory of operation is
+described in detail in the [Dracut manual](https://www.kernel.org/pub/linux/utils/boot/dracut/dracut.html#_dracut_on_shutdown).
+
+This script will try to export all remaining zpools after Dracut has
+pivoted to the initramfs. If an initial regular export is not successful,
+Dracut will call this script once more with the `final` option,
+in which case a forceful export is attempted.
+
+Other Dracut modules include similar shutdown scripts and Dracut
+invokes these scripts round-robin until they succeed. In particular,
+the `90dm` module installs a script which tries to close and remove
+all device mapper targets. Thus, if there are ZVOLs containing
+dm-crypt volumes or if the zpool itself is backed by a dm-crypt
+volume, the shutdown scripts will try to untangle this.
diff --git a/contrib/initramfs/Makefile.am b/contrib/initramfs/Makefile.am
new file mode 100644
index 000000000000..5eac6e2155db
--- /dev/null
+++ b/contrib/initramfs/Makefile.am
@@ -0,0 +1,9 @@
+initrddir = /usr/share/initramfs-tools
+
+dist_initrd_SCRIPTS = \
+	zfsunlock
+
+SUBDIRS = conf.d conf-hooks.d hooks scripts
+
+EXTRA_DIST = \
+	README.initramfs.markdown
diff --git a/contrib/initramfs/README.initramfs.markdown b/contrib/initramfs/README.initramfs.markdown
new file mode 100644
index 000000000000..be3ec71ef785
--- /dev/null
+++ b/contrib/initramfs/README.initramfs.markdown
@@ -0,0 +1,84 @@
+## Description
+
+These scripts are intended to be used with `initramfs-tools`, which is a
+similar software product to `dracut` (which is used in Red Hat based
+distributions), and is mainly used by Debian GNU/Linux and derivatives.
+
+These scripts share some common functionality with the SysV init scripts,
+primarily the `/etc/zfs/zfs-functions` script.
+
+## Configuration
+
+### Root pool/filesystem
+
+Different distributions have their own standard on what to specify on the
+kernel command line to boot off a ZFS filesystem.
+
+This script supports the following kernel command line argument combinations
+(in this order - first match wins):
+
+* `rpool=<pool>`
+* `bootfs=<pool>/<dataset>`
+* `rpool=<pool> bootfs=<pool>/<dataset>`
+* `-B zfs-bootfs=<pool>/<dataset>`
+* `root=<pool>/<dataset>`
+* `root=ZFS=<pool>/<dataset>`
+* `root=zfs:AUTO`
+* `root=zfs:<pool>/<dataset>`
+* `rpool=rpool`
+
+If a pool is specified, it will be used. Otherwise, in `AUTO` mode, all pools
+will be searched. Pools may be excluded from the search by listing them in
+`ZFS_POOL_EXCEPTIONS` in `/etc/default/zfs`.
+
+Pools will be imported as follows:
+
+* Try `/dev/disk/by-vdev` if it exists; see `/etc/zfs/vdev_id.conf`.
+* Try `/dev/disk/by-id` and any other `/dev/disk/by-*` directories.
+* Try `/dev`.
+* Use the cache file if nothing else worked.
+
+This order may be modified by setting `ZPOOL_IMPORT_PATH` in
+`/etc/default/zfs`.
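+
+For example, a hypothetical /etc/default/zfs snippet narrowing the search to
+stable device names and excluding a scratch pool (both values are
+assumptions):
+
+    ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/disk/by-id"
+    ZFS_POOL_EXCEPTIONS="scratchpool"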
+Note:
+
+* If the snapshot does not exist, the base dataset (the part before `@`) is
+  used as the boot filesystem instead.
+* If the resulting clone dataset already exists, it is destroyed.
+* The clone is created with `mountpoint=none` and `canmount=noauto`. The root
+  filesystem is mounted manually by the initramfs script.
+* If no snapshot is specified on the `root=` kernel command line, but
+  there is an `@`, the user will be prompted to choose a snapshot to use.
+
+### Extra options
+
+The following kernel command line arguments are supported:
+
+* `zfsdebug=(on,yes,1)`: Show extra debugging information
+* `zfsforce=(on,yes,1)`: Force import of the pool
+* `rollback=(on,yes,1)`: Roll back to the snapshot instead of cloning it
+
+### Unlocking a ZFS encrypted root over SSH
+
+To use this feature:
+
+1. Install the `dropbear-initramfs` package. You may wish to uninstall the
+   `cryptsetup-initramfs` package to avoid warnings.
+2. Add your SSH key(s) to `/etc/dropbear-initramfs/authorized_keys`. Note
+   that Dropbear does not support ed25519 keys; use RSA (2048-bit or more)
+   instead.
+3. Rebuild the initramfs with your keys: `update-initramfs -u`
+4. During system boot, log in via SSH and run: `zfsunlock`
diff --git a/contrib/initramfs/conf-hooks.d/Makefile.am b/contrib/initramfs/conf-hooks.d/Makefile.am
new file mode 100644
index 000000000000..f84ba5cc7e37
--- /dev/null
+++ b/contrib/initramfs/conf-hooks.d/Makefile.am
@@ -0,0 +1,4 @@
+confhooksddir = /usr/share/initramfs-tools/conf-hooks.d
+
+dist_confhooksd_DATA = \
+	zfs
diff --git a/contrib/initramfs/conf-hooks.d/zfs b/contrib/initramfs/conf-hooks.d/zfs
new file mode 100644
index 000000000000..b86d36223e3c
--- /dev/null
+++ b/contrib/initramfs/conf-hooks.d/zfs
@@ -0,0 +1,9 @@
+# Force the inclusion of Busybox in the initramfs.
+BUSYBOX=y
+
+# Set up the keyboard mapping so passphrases can be entered correctly.
+KEYMAP=y
+
+# Require the plymouth script to guarantee working video for the passphrase
+# prompting.
+FRAMEBUFFER=y
diff --git a/contrib/initramfs/conf.d/Makefile.am b/contrib/initramfs/conf.d/Makefile.am
new file mode 100644
index 000000000000..5ef27e0aa1ce
--- /dev/null
+++ b/contrib/initramfs/conf.d/Makefile.am
@@ -0,0 +1,4 @@
+confddir = /usr/share/initramfs-tools/conf.d
+
+dist_confd_DATA = \
+	zfs
diff --git a/contrib/initramfs/conf.d/zfs b/contrib/initramfs/conf.d/zfs
new file mode 100644
index 000000000000..c67d75ba8672
--- /dev/null
+++ b/contrib/initramfs/conf.d/zfs
@@ -0,0 +1,8 @@
+for x in $(cat /proc/cmdline)
+do
+	case $x in
+		root=ZFS=*|root=zfs:*)
+			BOOT=zfs
+			;;
+	esac
+done
diff --git a/contrib/initramfs/hooks/.gitignore b/contrib/initramfs/hooks/.gitignore
new file mode 100644
index 000000000000..4e1604e18899
--- /dev/null
+++ b/contrib/initramfs/hooks/.gitignore
@@ -0,0 +1,2 @@
+zfs
+zfsunlock
diff --git a/contrib/initramfs/hooks/Makefile.am b/contrib/initramfs/hooks/Makefile.am
new file mode 100644
index 000000000000..f303e995b9c7
--- /dev/null
+++ b/contrib/initramfs/hooks/Makefile.am
@@ -0,0 +1,9 @@
+include $(top_srcdir)/config/Substfiles.am
+
+hooksdir = /usr/share/initramfs-tools/hooks
+
+hooks_SCRIPTS = \
+	zfs \
+	zfsunlock
+
+SUBSTFILES += $(hooks_SCRIPTS)
diff --git a/contrib/initramfs/hooks/zfs.in b/contrib/initramfs/hooks/zfs.in
new file mode 100755
index 000000000000..67d27a7649b2
--- /dev/null
+++ b/contrib/initramfs/hooks/zfs.in
@@ -0,0 +1,109 @@
+#!/bin/sh
+#
+# Add ZoL filesystem capabilities to an initrd, usually for a native ZFS root.
+#
+
+# This hook installs udev rules for ZoL.
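+#
+# Background: initramfs-tools invokes each hook twice -- first with the
+# single argument "prereqs", where the hook must print its dependencies
+# and exit, and then again for real, with helpers such as copy_exec and
+# manual_add_modules available once
+# /usr/share/initramfs-tools/hook-functions has been sourced. The case
+# statement below implements the "prereqs" half of that protocol.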
+PREREQ="udev" + +# These prerequisites are provided by the zfsutils package. The zdb utility is +# not strictly required, but it can be useful at the initramfs recovery prompt. +COPY_EXEC_LIST="@sbindir@/zdb @sbindir@/zpool @sbindir@/zfs" +COPY_EXEC_LIST="$COPY_EXEC_LIST @mounthelperdir@/mount.zfs @udevdir@/vdev_id" +COPY_EXEC_LIST="$COPY_EXEC_LIST @udevdir@/zvol_id" +COPY_FILE_LIST="/etc/hostid @sysconfdir@/zfs/zpool.cache" +COPY_FILE_LIST="$COPY_FILE_LIST @initconfdir@/zfs" +COPY_FILE_LIST="$COPY_FILE_LIST @sysconfdir@/zfs/zfs-functions" +COPY_FILE_LIST="$COPY_FILE_LIST @sysconfdir@/zfs/vdev_id.conf" +COPY_FILE_LIST="$COPY_FILE_LIST @udevruledir@/60-zvol.rules" +COPY_FILE_LIST="$COPY_FILE_LIST @udevruledir@/69-vdev.rules" + +# These prerequisites are provided by the base system. +COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/dirname /bin/hostname /sbin/blkid" +COPY_EXEC_LIST="$COPY_EXEC_LIST /usr/bin/env" +COPY_EXEC_LIST="$COPY_EXEC_LIST $(which systemd-ask-password)" + +# Explicitly specify all kernel modules because automatic dependency resolution +# is unreliable on many systems. +BASE_MODULES="zlib_deflate spl zavl zcommon znvpair zunicode zlua zfs icp" +CRPT_MODULES="sun-ccm sun-gcm sun-ctr" +MANUAL_ADD_MODULES_LIST="$BASE_MODULES" + +# Generic result code. +RC=0 + +case $1 in +prereqs) + echo "$PREREQ" + exit 0 + ;; +esac + +for ii in $COPY_EXEC_LIST +do + if [ ! -x "$ii" ] + then + echo "Error: $ii is not executable." + RC=2 + fi +done + +if [ "$RC" -ne 0 ] +then + exit "$RC" +fi + +. /usr/share/initramfs-tools/hook-functions + +mkdir -p "$DESTDIR/etc/" + +# ZDB uses pthreads for some functions, but the library dependency is not +# automatically detected. The `find` utility and extended `cp` options are +# used here because libgcc_s.so could be in a subdirectory of /lib for +# multi-arch installations. +cp --target-directory="$DESTDIR" --parents $(find /lib/ -type f -name libgcc_s.so.1) + +for ii in $COPY_EXEC_LIST +do + copy_exec "$ii" +done + +for ii in $COPY_FILE_LIST +do + dir=$(dirname "$ii") + [ -d "$dir" ] && mkdir -p "$DESTDIR/$dir" + [ -f "$ii" ] && cp -p "$ii" "$DESTDIR/$ii" +done + +for ii in $MANUAL_ADD_MODULES_LIST +do + manual_add_modules "$ii" +done + +if [ -f "/etc/hostname" ] +then + cp -p "/etc/hostname" "$DESTDIR/etc/" +else + hostname >"$DESTDIR/etc/hostname" +fi + +for ii in zfs zfs.conf spl spl.conf +do + if [ -f "/etc/modprobe.d/$ii" ]; then + if [ ! -d "$DESTDIR/etc/modprobe.d" ]; then + mkdir -p $DESTDIR/etc/modprobe.d + fi + cp -p "/etc/modprobe.d/$ii" $DESTDIR/etc/modprobe.d/ + fi +done + +# With pull request #1476 (not yet merged) comes a verbose warning +# if /usr/bin/net doesn't exist or isn't executable. Just create +# a dummy... +[ ! -d "$DESTDIR/usr/bin" ] && mkdir -p "$DESTDIR/usr/bin" +if [ ! -x "$DESTDIR/usr/bin/net" ]; then + touch "$DESTDIR/usr/bin/net" + chmod +x "$DESTDIR/usr/bin/net" +fi + +exit 0 diff --git a/contrib/initramfs/hooks/zfsunlock.in b/contrib/initramfs/hooks/zfsunlock.in new file mode 100644 index 000000000000..c8ae86363981 --- /dev/null +++ b/contrib/initramfs/hooks/zfsunlock.in @@ -0,0 +1,18 @@ +#!/bin/sh + +PREREQ="dropbear" + +prereqs() { + echo "$PREREQ" +} + +case "$1" in + prereqs) + prereqs + exit 0 + ;; +esac + +. 
/usr/share/initramfs-tools/hook-functions + +copy_exec /usr/share/initramfs-tools/zfsunlock /usr/bin diff --git a/contrib/initramfs/scripts/.gitignore b/contrib/initramfs/scripts/.gitignore new file mode 100644 index 000000000000..73304bc2cd4a --- /dev/null +++ b/contrib/initramfs/scripts/.gitignore @@ -0,0 +1 @@ +zfs diff --git a/contrib/initramfs/scripts/Makefile.am b/contrib/initramfs/scripts/Makefile.am new file mode 100644 index 000000000000..2a142096e449 --- /dev/null +++ b/contrib/initramfs/scripts/Makefile.am @@ -0,0 +1,6 @@ +scriptsdir = /usr/share/initramfs-tools/scripts + +dist_scripts_DATA = \ + zfs + +SUBDIRS = local-top diff --git a/contrib/initramfs/scripts/local-top/Makefile.am b/contrib/initramfs/scripts/local-top/Makefile.am new file mode 100644 index 000000000000..1523a907c860 --- /dev/null +++ b/contrib/initramfs/scripts/local-top/Makefile.am @@ -0,0 +1,4 @@ +localtopdir = /usr/share/initramfs-tools/scripts/local-top + +dist_localtop_SCRIPTS = \ + zfs diff --git a/contrib/initramfs/scripts/local-top/zfs b/contrib/initramfs/scripts/local-top/zfs new file mode 100755 index 000000000000..e8e5cd26451c --- /dev/null +++ b/contrib/initramfs/scripts/local-top/zfs @@ -0,0 +1,60 @@ +#!/bin/sh +PREREQ="mdadm mdrun multipath" + +prereqs() +{ + echo "$PREREQ" +} + +case $1 in +# get pre-requisites +prereqs) + prereqs + exit 0 + ;; +esac + + +# +# Helper functions +# +message() +{ + if [ -x /bin/plymouth ] && plymouth --ping; then + plymouth message --text="$@" + else + echo "$@" >&2 + fi + return 0 +} + +udev_settle() +{ + # Wait for udev to be ready, see https://launchpad.net/bugs/85640 + if [ -x /sbin/udevadm ]; then + /sbin/udevadm settle --timeout=30 + elif [ -x /sbin/udevsettle ]; then + /sbin/udevsettle --timeout=30 + fi + return 0 +} + + +activate_vg() +{ + # Sanity checks + if [ ! -x /sbin/lvm ]; then + [ "$quiet" != "y" ] && message "lvm is not available" + return 1 + fi + + # Detect and activate available volume groups + /sbin/lvm vgscan + /sbin/lvm vgchange -a y --sysinit + return $? +} + +udev_settle +activate_vg + +exit 0 diff --git a/contrib/initramfs/scripts/zfs b/contrib/initramfs/scripts/zfs new file mode 100644 index 000000000000..a795fd39f605 --- /dev/null +++ b/contrib/initramfs/scripts/zfs @@ -0,0 +1,1011 @@ +# ZFS boot stub for initramfs-tools. +# +# In the initramfs environment, the /init script sources this stub to +# override the default functions in the /scripts/local script. +# +# Enable this by passing boot=zfs on the kernel command line. +# + +# Source the common functions +. /etc/zfs/zfs-functions + +# Start interactive shell. +# Use debian's panic() if defined, because it allows to prevent shell access +# by setting panic in cmdline (e.g. panic=0 or panic=15). +# See "4.5 Disable root prompt on the initramfs" of Securing Debian Manual: +# https://www.debian.org/doc/manuals/securing-debian-howto/ch4.en.html +shell() { + if type panic > /dev/null 2>&1; then + panic $@ + else + /bin/sh + fi +} + +# This runs any scripts that should run before we start importing +# pools and mounting any filesystems. 
+pre_mountroot() +{ + if type run_scripts > /dev/null 2>&1 && \ + [ -f "/scripts/local-top" -o -d "/scripts/local-top" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Running /scripts/local-top" + run_scripts /scripts/local-top + [ "$quiet" != "y" ] && zfs_log_end_msg + fi + + if type run_scripts > /dev/null 2>&1 && \ + [ -f "/scripts/local-premount" -o -d "/scripts/local-premount" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Running /scripts/local-premount" + run_scripts /scripts/local-premount + [ "$quiet" != "y" ] && zfs_log_end_msg + fi +} + +# If plymouth is available, hide the splash image. +disable_plymouth() +{ + if [ -x /bin/plymouth ] && /bin/plymouth --ping + then + /bin/plymouth hide-splash >/dev/null 2>&1 + fi +} + +# Get a ZFS filesystem property value. +get_fs_value() +{ + local fs="$1" + local value=$2 + + "${ZFS}" get -H -ovalue $value "$fs" 2> /dev/null +} + +# Find the 'bootfs' property on pool $1. +# If the property does not contain '/', then ignore this +# pool by exporting it again. +find_rootfs() +{ + local pool="$1" + + # If 'POOL_IMPORTED' isn't set, no pool imported and therefore + # we won't be able to find a root fs. + [ -z "${POOL_IMPORTED}" ] && return 1 + + # If it's already specified, just keep it mounted and exit + # User (kernel command line) must be correct. + [ -n "${ZFS_BOOTFS}" ] && return 0 + + # Not set, try to find it in the 'bootfs' property of the pool. + # NOTE: zpool does not support 'get -H -ovalue bootfs'... + ZFS_BOOTFS=$("${ZPOOL}" list -H -obootfs "$pool") + + # Make sure it's not '-' and that it starts with /. + if [ "${ZFS_BOOTFS}" != "-" ] && \ + $(get_fs_value "${ZFS_BOOTFS}" mountpoint | grep -q '^/$') + then + # Keep it mounted + POOL_IMPORTED=1 + return 0 + fi + + # Not boot fs here, export it and later try again.. + "${ZPOOL}" export "$pool" + POOL_IMPORTED="" + + return 1 +} + +# Support function to get a list of all pools, separated with ';' +find_pools() +{ + local CMD="$*" + local pools pool + + pools=$($CMD 2> /dev/null | \ + grep -E "pool:|^[a-zA-Z0-9]" | \ + sed 's@.*: @@' | \ + while read pool; do \ + echo -n "$pool;" + done) + + echo "${pools%%;}" # Return without the last ';'. +} + +# Get a list of all available pools +get_pools() +{ + local available_pools npools + + if [ -n "${ZFS_POOL_IMPORT}" ]; then + echo "$ZFS_POOL_IMPORT" + return 0 + fi + + # Get the base list of available pools. + available_pools=$(find_pools "$ZPOOL" import) + + # Just in case - seen it happen (that a pool isn't visible/found + # with a simple "zpool import" but only when using the "-d" + # option or setting ZPOOL_IMPORT_PATH). + if [ -d "/dev/disk/by-id" ] + then + npools=$(find_pools "$ZPOOL" import -d /dev/disk/by-id) + if [ -n "$npools" ] + then + # Because we have found extra pool(s) here, which wasn't + # found 'normally', we need to force USE_DISK_BY_ID to + # make sure we're able to actually import it/them later. + USE_DISK_BY_ID='yes' + + if [ -n "$available_pools" ] + then + # Filter out duplicates (pools found with the simple + # "zpool import" but which is also found with the + # "zpool import -d ..."). + npools=$(echo "$npools" | sed "s,$available_pools,,") + + # Add the list to the existing list of + # available pools + available_pools="$available_pools;$npools" + else + available_pools="$npools" + fi + fi + fi + + # Filter out any exceptions... 
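+	# (ZFS_POOL_EXCEPTIONS is read from /etc/default/zfs; since the loop
+	# below iterates with IFS=";", it is a semicolon-separated list of
+	# pool names, e.g. ZFS_POOL_EXCEPTIONS="scratch;backup" -- the names
+	# here are examples only.)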
+ if [ -n "$ZFS_POOL_EXCEPTIONS" ] + then + local found="" + local apools="" + local pool exception + OLD_IFS="$IFS" ; IFS=";" + + for pool in $available_pools + do + for exception in $ZFS_POOL_EXCEPTIONS + do + [ "$pool" = "$exception" ] && continue 2 + found="$pool" + done + + if [ -n "$found" ] + then + if [ -n "$apools" ] + then + apools="$apools;$pool" + else + apools="$pool" + fi + fi + done + + IFS="$OLD_IFS" + available_pools="$apools" + fi + + # Return list of available pools. + echo "$available_pools" +} + +# Import given pool $1 +import_pool() +{ + local pool="$1" + local dirs dir + + # Verify that the pool isn't already imported + # Make as sure as we can to not require '-f' to import. + "${ZPOOL}" get name,guid -o value -H 2>/dev/null | grep -Fxq "$pool" && return 0 + + # For backwards compatibility, make sure that ZPOOL_IMPORT_PATH is set + # to something we can use later with the real import(s). We want to + # make sure we find all by* dirs, BUT by-vdev should be first (if it + # exists). + if [ -n "$USE_DISK_BY_ID" -a -z "$ZPOOL_IMPORT_PATH" ] + then + dirs="$(for dir in $(echo /dev/disk/by-*) + do + # Ignore by-vdev here - we want it first! + echo "$dir" | grep -q /by-vdev && continue + [ ! -d "$dir" ] && continue + + echo -n "$dir:" + done | sed 's,:$,,g')" + + if [ -d "/dev/disk/by-vdev" ] + then + # Add by-vdev at the beginning. + ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:" + fi + + # ... and /dev at the very end, just for good measure. + ZPOOL_IMPORT_PATH="$ZPOOL_IMPORT_PATH$dirs:/dev" + fi + + # Needs to be exported for "zpool" to catch it. + [ -n "$ZPOOL_IMPORT_PATH" ] && export ZPOOL_IMPORT_PATH + + + [ "$quiet" != "y" ] && zfs_log_begin_msg \ + "Importing pool '${pool}' using defaults" + + ZFS_CMD="${ZPOOL} import -N ${ZPOOL_FORCE} ${ZPOOL_IMPORT_OPTS}" + ZFS_STDERR="$($ZFS_CMD "$pool" 2>&1)" + ZFS_ERROR="$?" + if [ "${ZFS_ERROR}" != 0 ] + then + [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" + + if [ -f "${ZPOOL_CACHE}" ] + then + [ "$quiet" != "y" ] && zfs_log_begin_msg \ + "Importing pool '${pool}' using cachefile." + + ZFS_CMD="${ZPOOL} import -c ${ZPOOL_CACHE} -N ${ZPOOL_FORCE} ${ZPOOL_IMPORT_OPTS}" + ZFS_STDERR="$($ZFS_CMD "$pool" 2>&1)" + ZFS_ERROR="$?" + fi + + if [ "${ZFS_ERROR}" != 0 ] + then + [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" + + disable_plymouth + echo "" + echo "Command: ${ZFS_CMD} '$pool'" + echo "Message: $ZFS_STDERR" + echo "Error: $ZFS_ERROR" + echo "" + echo "Failed to import pool '$pool'." + echo "Manually import the pool and exit." + shell + fi + fi + + [ "$quiet" != "y" ] && zfs_log_end_msg + + POOL_IMPORTED=1 + return 0 +} + +# Load ZFS modules +# Loading a module in a initrd require a slightly different approach, +# with more logging etc. +load_module_initrd() +{ + if [ "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" > 0 ] + then + if [ "$quiet" != "y" ]; then + zfs_log_begin_msg "Sleeping for" \ + "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP seconds..." + fi + sleep "$ZFS_INITRD_PRE_MOUNTROOT_SLEEP" + [ "$quiet" != "y" ] && zfs_log_end_msg + fi + + # Wait for all of the /dev/{hd,sd}[a-z] device nodes to appear. + if type wait_for_udev > /dev/null 2>&1 ; then + wait_for_udev 10 + elif type wait_for_dev > /dev/null 2>&1 ; then + wait_for_dev + fi + + # zpool import refuse to import without a valid /proc/self/mounts + [ ! 
+	[ ! -f /proc/self/mounts ] && mount proc /proc
+
+	# Load the module
+	load_module "zfs" || return 1
+
+	if [ "$ZFS_INITRD_POST_MODPROBE_SLEEP" -gt 0 ] 2>/dev/null
+	then
+		if [ "$quiet" != "y" ]; then
+			zfs_log_begin_msg "Sleeping for" \
+				"$ZFS_INITRD_POST_MODPROBE_SLEEP seconds..."
+		fi
+		sleep "$ZFS_INITRD_POST_MODPROBE_SLEEP"
+		[ "$quiet" != "y" ] && zfs_log_end_msg
+	fi
+
+	return 0
+}
+
+# Mount a given filesystem
+mount_fs()
+{
+	local fs="$1"
+	local mountpoint
+
+	# Check that the filesystem exists
+	"${ZFS}" list -oname -tfilesystem -H "${fs}" > /dev/null 2>&1
+	[ "$?" -ne 0 ] && return 1
+
+	# Skip filesystems with canmount=off. The root fs should not have
+	# canmount=off, but ignore it for backwards compatibility just in case.
+	if [ "$fs" != "${ZFS_BOOTFS}" ]
+	then
+		canmount=$(get_fs_value "$fs" canmount)
+		[ "$canmount" = "off" ] && return 0
+	fi
+
+	# Need the _original_ dataset's mountpoint!
+	mountpoint=$(get_fs_value "$fs" mountpoint)
+	if [ "$mountpoint" = "legacy" -o "$mountpoint" = "none" ]; then
+		# Can't use the mountpoint property. Might be one of our
+		# clones. Check the 'org.zol:mountpoint' property set in
+		# clone_snap() if that's usable.
+		mountpoint=$(get_fs_value "$fs" org.zol:mountpoint)
+		if [ "$mountpoint" = "legacy" -o \
+		    "$mountpoint" = "none" -o \
+		    "$mountpoint" = "-" ]
+		then
+			if [ "$fs" != "${ZFS_BOOTFS}" ]; then
+				# We don't have a proper mountpoint and this
+				# isn't the root fs.
+				return 0
+			else
+				# Last hail-mary: Hope 'rootmnt' is set!
+				mountpoint=""
+			fi
+		fi
+
+		if [ "$mountpoint" = "legacy" ]; then
+			ZFS_CMD="mount -t zfs"
+		else
+			# If it's not a legacy filesystem, it can only be a
+			# native one...
+			ZFS_CMD="mount -o zfsutil -t zfs"
+		fi
+	else
+		ZFS_CMD="mount -o zfsutil -t zfs"
+	fi
+
+	# Possibly decrypt a filesystem using native encryption.
+	decrypt_fs "$fs"
+
+	[ "$quiet" != "y" ] && \
+	    zfs_log_begin_msg "Mounting '${fs}' on '${rootmnt}/${mountpoint}'"
+	[ -n "${ZFS_DEBUG}" ] && \
+	    zfs_log_begin_msg "CMD: '$ZFS_CMD ${fs} ${rootmnt}/${mountpoint}'"
+
+	ZFS_STDERR=$(${ZFS_CMD} "${fs}" "${rootmnt}/${mountpoint}" 2>&1)
+	ZFS_ERROR=$?
+	if [ "${ZFS_ERROR}" != 0 ]
+	then
+		[ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}"
+
+		disable_plymouth
+		echo ""
+		echo "Command: ${ZFS_CMD} ${fs} ${rootmnt}/${mountpoint}"
+		echo "Message: $ZFS_STDERR"
+		echo "Error: $ZFS_ERROR"
+		echo ""
+		echo "Failed to mount ${fs} on ${rootmnt}/${mountpoint}."
+		echo "Manually mount the filesystem and exit."
+		shell
+	else
+		[ "$quiet" != "y" ] && zfs_log_end_msg
+	fi
+
+	return 0
+}
+
+# Unlock a ZFS native encrypted filesystem.
+decrypt_fs()
+{
+	local fs="$1"
+
+	# If pool encryption is active and the zfs command understands '-o encryption'
+	if [ "$(zpool list -H -o feature@encryption $(echo "${fs}" | awk -F\/ '{print $1}'))" = 'active' ]; then
+
+		# Determine dataset that holds key for root dataset
+		ENCRYPTIONROOT="$(get_fs_value "${fs}" encryptionroot)"
+		KEYLOCATION="$(get_fs_value "${ENCRYPTIONROOT}" keylocation)"
+
+		echo "${ENCRYPTIONROOT}" > /run/zfs_fs_name
+
+		# If root dataset is encrypted...
+		if ! [ "${ENCRYPTIONROOT}" = "-" ]; then
+			KEYSTATUS="$(get_fs_value "${ENCRYPTIONROOT}" keystatus)"
+			# Continue only if the key needs to be loaded
+			[ "$KEYSTATUS" = "unavailable" ] || return 0
+			TRY_COUNT=3
+
+			# If key is stored in a file, do not prompt
+			if !
[ "${KEYLOCATION}" = "prompt" ]; then + $ZFS load-key "${ENCRYPTIONROOT}" + + # Prompt with plymouth, if active + elif [ -e /bin/plymouth ] && /bin/plymouth --ping 2>/dev/null; then + echo "plymouth" > /run/zfs_console_askpwd_cmd + while [ $TRY_COUNT -gt 0 ]; do + plymouth ask-for-password --prompt "Encrypted ZFS password for ${ENCRYPTIONROOT}" | \ + $ZFS load-key "${ENCRYPTIONROOT}" && break + TRY_COUNT=$((TRY_COUNT - 1)) + done + + # Prompt with systemd, if active + elif [ -e /run/systemd/system ]; then + echo "systemd-ask-password" > /run/zfs_console_askpwd_cmd + while [ $TRY_COUNT -gt 0 ]; do + systemd-ask-password "Encrypted ZFS password for ${ENCRYPTIONROOT}" --no-tty | \ + $ZFS load-key "${ENCRYPTIONROOT}" && break + TRY_COUNT=$((TRY_COUNT - 1)) + done + + # Prompt with ZFS tty, otherwise + else + # Temporarily setting "printk" to "7" allows the prompt to appear even when the "quiet" kernel option has been used + echo "load-key" > /run/zfs_console_askpwd_cmd + storeprintk="$(awk '{print $1}' /proc/sys/kernel/printk)" + echo 7 > /proc/sys/kernel/printk + $ZFS load-key "${ENCRYPTIONROOT}" + echo "$storeprintk" > /proc/sys/kernel/printk + fi + fi + fi + + return 0 +} + +# Destroy a given filesystem. +destroy_fs() +{ + local fs="$1" + + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Destroying '$fs'" + + ZFS_CMD="${ZFS} destroy $fs" + ZFS_STDERR="$(${ZFS_CMD} 2>&1)" + ZFS_ERROR="$?" + if [ "${ZFS_ERROR}" != 0 ] + then + [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" + + disable_plymouth + echo "" + echo "Command: $ZFS_CMD" + echo "Message: $ZFS_STDERR" + echo "Error: $ZFS_ERROR" + echo "" + echo "Failed to destroy '$fs'. Please make sure that '$fs' is not available." + echo "Hint: Try: zfs destroy -Rfn $fs" + echo "If this dryrun looks good, then remove the 'n' from '-Rfn' and try again." + shell + else + [ "$quiet" != "y" ] && zfs_log_end_msg + fi + + return 0 +} + +# Clone snapshot $1 to destination filesystem $2 +# Set 'canmount=noauto' and 'mountpoint=none' so that we get to keep +# manual control over it's mounting (i.e., make sure it's not automatically +# mounted with a 'zfs mount -a' in the init/systemd scripts). +clone_snap() +{ + local snap="$1" + local destfs="$2" + local mountpoint="$3" + + [ "$quiet" != "y" ] && zfs_log_begin_msg "Cloning '$snap' to '$destfs'" + + # Clone the snapshot into a dataset we can boot from + # + We don't want this filesystem to be automatically mounted, we + # want control over this here and nowhere else. + # + We don't need any mountpoint set for the same reason. + # We use the 'org.zol:mountpoint' property to remember the mountpoint. + ZFS_CMD="${ZFS} clone -o canmount=noauto -o mountpoint=none" + ZFS_CMD="${ZFS_CMD} -o org.zol:mountpoint=${mountpoint}" + ZFS_CMD="${ZFS_CMD} $snap $destfs" + ZFS_STDERR="$(${ZFS_CMD} 2>&1)" + ZFS_ERROR="$?" + if [ "${ZFS_ERROR}" != 0 ] + then + [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" + + disable_plymouth + echo "" + echo "Command: $ZFS_CMD" + echo "Message: $ZFS_STDERR" + echo "Error: $ZFS_ERROR" + echo "" + echo "Failed to clone snapshot." + echo "Make sure that the any problems are corrected and then make sure" + echo "that the dataset '$destfs' exists and is bootable." + shell + else + [ "$quiet" != "y" ] && zfs_log_end_msg + fi + + return 0 +} + +# Rollback a given snapshot. +rollback_snap() +{ + local snap="$1" + + [ "$quiet" != "y" ] && zfs_log_begin_msg "Rollback $snap" + + ZFS_CMD="${ZFS} rollback -Rf $snap" + ZFS_STDERR="$(${ZFS_CMD} 2>&1)" + ZFS_ERROR="$?" 
+ if [ "${ZFS_ERROR}" != 0 ] + then + [ "$quiet" != "y" ] && zfs_log_failure_msg "${ZFS_ERROR}" + + disable_plymouth + echo "" + echo "Command: $ZFS_CMD" + echo "Message: $ZFS_STDERR" + echo "Error: $ZFS_ERROR" + echo "" + echo "Failed to rollback snapshot." + shell + else + [ "$quiet" != "y" ] && zfs_log_end_msg + fi + + return 0 +} + +# Get a list of snapshots, give them as a numbered list +# to the user to choose from. +ask_user_snap() +{ + local fs="$1" + local i=1 + local SNAP snapnr snap debug + + # We need to temporarily disable debugging. Set 'debug' so we + # remember to enabled it again. + if [ -n "${ZFS_DEBUG}" ]; then + unset ZFS_DEBUG + set +x + debug=1 + fi + + # Because we need the resulting snapshot, which is sent on + # stdout to the caller, we use stderr for our questions. + echo "What snapshot do you want to boot from?" > /dev/stderr + while read snap; do + echo " $i: ${snap}" > /dev/stderr + eval `echo SNAP_$i=$snap` + i=$((i + 1)) + done < /dev/stderr + read snapnr + + # Re-enable debugging. + if [ -n "${debug}" ]; then + ZFS_DEBUG=1 + set -x + fi + + echo "$(eval echo "$"SNAP_$snapnr)" +} + +setup_snapshot_booting() +{ + local snap="$1" + local s destfs subfs mountpoint retval=0 filesystems fs + + # Make sure that the snapshot specified actually exist. + if [ ! $(get_fs_value "${snap}" type) ] + then + # Snapshot does not exist (...@ ?) + # ask the user for a snapshot to use. + snap="$(ask_user_snap "${snap%%@*}")" + fi + + # Separate the full snapshot ('$snap') into it's filesystem and + # snapshot names. Would have been nice with a split() function.. + rootfs="${snap%%@*}" + snapname="${snap##*@}" + ZFS_BOOTFS="${rootfs}_${snapname}" + + if ! grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline + then + # If the destination dataset for the clone + # already exists, destroy it. Recursively + if [ $(get_fs_value "${rootfs}_${snapname}" type) ]; then + filesystems=$("${ZFS}" list -oname -tfilesystem -H \ + -r -Sname "${ZFS_BOOTFS}") + for fs in $filesystems; do + destroy_fs "${fs}" + done + fi + fi + + # Get all snapshots, recursively (might need to clone /usr, /var etc + # as well). + for s in $("${ZFS}" list -H -oname -tsnapshot -r "${rootfs}" | \ + grep "${snapname}") + do + if grep -qiE '(^|[^\\](\\\\)* )(rollback)=(on|yes|1)( |$)' /proc/cmdline + then + # Rollback snapshot + rollback_snap "$s" || retval=$((retval + 1)) + else + # Setup a destination filesystem name. + # Ex: Called with 'rpool/ROOT/debian@snap2' + # rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2 + # rpool/ROOT/debian/boot@snap2 => rpool/ROOT/debian_snap2/boot + # rpool/ROOT/debian/usr@snap2 => rpool/ROOT/debian_snap2/usr + # rpool/ROOT/debian/var@snap2 => rpool/ROOT/debian_snap2/var + subfs="${s##$rootfs}" + subfs="${subfs%%@$snapname}" + + destfs="${rootfs}_${snapname}" # base fs. + [ -n "$subfs" ] && destfs="${destfs}$subfs" # + sub fs. + + # Get the mountpoint of the filesystem, to be used + # with clone_snap(). If legacy or none, then use + # the sub fs value. + mountpoint=$(get_fs_value "${s%%@*}" mountpoint) + if [ "$mountpoint" = "legacy" -o \ + "$mountpoint" = "none" ] + then + if [ -n "${subfs}" ]; then + mountpoint="${subfs}" + else + mountpoint="/" + fi + fi + + # Clone the snapshot into its own + # filesystem + clone_snap "$s" "${destfs}" "${mountpoint}" || \ + retval=$((retval + 1)) + fi + done + + # If we haven't return yet, we have a problem... 
+ return "${retval}" +} + +# ================================================================ + +# This is the main function. +mountroot() +{ + local snaporig snapsub destfs pool POOLS + + # ---------------------------------------------------------------- + # I N I T I A L S E T U P + + # ------------ + # Run the pre-mount scripts from /scripts/local-top. + pre_mountroot + + # ------------ + # Source the default setup variables. + [ -r '/etc/default/zfs' ] && . /etc/default/zfs + + # ------------ + # Support debug option + if grep -qiE '(^|[^\\](\\\\)* )(zfs_debug|zfs\.debug|zfsdebug)=(on|yes|1)( |$)' /proc/cmdline + then + ZFS_DEBUG=1 + mkdir /var/log + #exec 2> /var/log/boot.debug + set -x + fi + + # ------------ + # Load ZFS module etc. + if ! load_module_initrd; then + disable_plymouth + echo "" + echo "Failed to load ZFS modules." + echo "Manually load the modules and exit." + shell + fi + + # ------------ + # Look for the cache file (if any). + [ ! -f ${ZPOOL_CACHE} ] && unset ZPOOL_CACHE + + # ------------ + # Compatibility: 'ROOT' is for Debian GNU/Linux (etc), + # 'root' is for Redhat/Fedora (etc), + # 'REAL_ROOT' is for Gentoo + if [ -z "$ROOT" ] + then + [ -n "$root" ] && ROOT=${root} + + [ -n "$REAL_ROOT" ] && ROOT=${REAL_ROOT} + fi + + # ------------ + # Where to mount the root fs in the initrd - set outside this script + # Compatibility: 'rootmnt' is for Debian GNU/Linux (etc), + # 'NEWROOT' is for RedHat/Fedora (etc), + # 'NEW_ROOT' is for Gentoo + if [ -z "$rootmnt" ] + then + [ -n "$NEWROOT" ] && rootmnt=${NEWROOT} + + [ -n "$NEW_ROOT" ] && rootmnt=${NEW_ROOT} + fi + + # ------------ + # No longer set in the defaults file, but it could have been set in + # get_pools() in some circumstances. If it's something, but not 'yes', + # it's no good to us. + [ -n "$USE_DISK_BY_ID" -a "$USE_DISK_BY_ID" != 'yes' ] && \ + unset USE_DISK_BY_ID + + # ---------------------------------------------------------------- + # P A R S E C O M M A N D L I N E O P T I O N S + + # This part is the really ugly part - there's so many options and permutations + # 'out there', and if we should make this the 'primary' source for ZFS initrd + # scripting, we need/should support them all. + # + # Supports the following kernel command line argument combinations + # (in this order - first match win): + # + # rpool= (tries to finds bootfs automatically) + # bootfs=/ (uses this for rpool - first part) + # rpool= bootfs=/ + # -B zfs-bootfs=/ (uses this for rpool - first part) + # rpool=rpool (default if none of the above is used) + # root=/ (uses this for rpool - first part) + # root=ZFS=/ (uses this for rpool - first part, without 'ZFS=') + # root=zfs:AUTO (tries to detect both pool and rootfs + # root=zfs:/ (uses this for rpool - first part, without 'zfs:') + # + # Option could also be + # Option could also be + + # ------------ + # Support force option + # In addition, setting one of zfs_force, zfs.force or zfsforce to + # 'yes', 'on' or '1' will make sure we force import the pool. + # This should (almost) never be needed, but it's here for + # completeness. 
+ ZPOOL_FORCE="" + if grep -qiE '(^|[^\\](\\\\)* )(zfs_force|zfs\.force|zfsforce)=(on|yes|1)( |$)' /proc/cmdline + then + ZPOOL_FORCE="-f" + fi + + # ------------ + # Look for 'rpool' and 'bootfs' parameter + [ -n "$rpool" ] && ZFS_RPOOL="${rpool#rpool=}" + [ -n "$bootfs" ] && ZFS_BOOTFS="${bootfs#bootfs=}" + + # ------------ + # If we have 'ROOT' (see above), but not 'ZFS_BOOTFS', then use + # 'ROOT' + [ -n "$ROOT" -a -z "${ZFS_BOOTFS}" ] && ZFS_BOOTFS="$ROOT" + + # ------------ + # Check for the `-B zfs-bootfs=%s/%u,...` kind of parameter. + # NOTE: Only use the pool name and dataset. The rest is not + # supported by ZoL (whatever it's for). + if [ -z "$ZFS_RPOOL" ] + then + # The ${zfs-bootfs} variable is set at the kernel command + # line, usually by GRUB, but it cannot be referenced here + # directly because bourne variable names cannot contain a + # hyphen. + # + # Reassign the variable by dumping the environment and + # stripping the zfs-bootfs= prefix. Let the shell handle + # quoting through the eval command. + eval ZFS_RPOOL=$(set | sed -n -e 's,^zfs-bootfs=,,p') + fi + + # ------------ + # No root fs or pool specified - do auto detect. + if [ -z "$ZFS_RPOOL" -a -z "${ZFS_BOOTFS}" ] + then + # Do auto detect. Do this by 'cheating' - set 'root=zfs:AUTO' + # which will be caught later + ROOT=zfs:AUTO + fi + + # ---------------------------------------------------------------- + # F I N D A N D I M P O R T C O R R E C T P O O L + + # ------------ + if [ "$ROOT" = "zfs:AUTO" ] + then + # Try to detect both pool and root fs. + + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Attempting to import additional pools." + + # Get a list of pools available for import + if [ -n "$ZFS_RPOOL" ] + then + # We've specified a pool - check only that + POOLS=$ZFS_RPOOL + else + POOLS=$(get_pools) + fi + + OLD_IFS="$IFS" ; IFS=";" + for pool in $POOLS + do + [ -z "$pool" ] && continue + + import_pool "$pool" + find_rootfs "$pool" + done + IFS="$OLD_IFS" + + [ "$quiet" != "y" ] && zfs_log_end_msg $ZFS_ERROR + else + # No auto - use value from the command line option. + + # Strip 'zfs:' and 'ZFS='. + ZFS_BOOTFS="${ROOT#*[:=]}" + + # Strip everything after the first slash. + ZFS_RPOOL="${ZFS_BOOTFS%%/*}" + fi + + # Import the pool (if not already done so in the AUTO check above). + if [ -n "$ZFS_RPOOL" -a -z "${POOL_IMPORTED}" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Importing ZFS root pool '$ZFS_RPOOL'" + + import_pool "${ZFS_RPOOL}" + find_rootfs "${ZFS_RPOOL}" + + [ "$quiet" != "y" ] && zfs_log_end_msg + fi + + if [ -z "${POOL_IMPORTED}" ] + then + # No pool imported, this is serious! + disable_plymouth + echo "" + echo "Command: $ZFS_CMD" + echo "Message: $ZFS_STDERR" + echo "Error: $ZFS_ERROR" + echo "" + echo "No pool imported. Manually import the root pool" + echo "at the command prompt and then exit." + echo "Hint: Try: zpool import -R ${rootmnt} -N ${ZFS_RPOOL}" + shell + fi + + # In case the pool was specified as guid, resolve guid to name + pool="$("${ZPOOL}" get name,guid -o name,value -H | \ + awk -v pool="${ZFS_RPOOL}" '$2 == pool { print $1 }')" + if [ -n "$pool" ]; then + # If $ZFS_BOOTFS contains guid, replace the guid portion with $pool + ZFS_BOOTFS=$(echo "$ZFS_BOOTFS" | \ + sed -e "s/$("${ZPOOL}" get guid -o value "$pool" -H)/$pool/g") + ZFS_RPOOL="${pool}" + fi + + # Set the no-op scheduler on the disks containing the vdevs of + # the root pool. For single-queue devices, this scheduler is + # "noop", for multi-queue devices, it is "none". 
+ # ZFS already does this for wholedisk vdevs (for all pools), so this + # is only important for partitions. + "${ZPOOL}" status -L "${ZFS_RPOOL}" 2> /dev/null | + awk '/^\t / && !/(mirror|raidz)/ { + dev=$1; + sub(/[0-9]+$/, "", dev); + print dev + }' | + while read -r i + do + SCHEDULER=/sys/block/$i/queue/scheduler + if [ -e "${SCHEDULER}" ] + then + # Query to see what schedulers are available + case "$(cat "${SCHEDULER}")" in + *noop*) echo noop > "${SCHEDULER}" ;; + *none*) echo none > "${SCHEDULER}" ;; + esac + fi + done + + + # ---------------------------------------------------------------- + # P R E P A R E R O O T F I L E S Y S T E M + + if [ -n "${ZFS_BOOTFS}" ] + then + # Booting from a snapshot? + # Will overwrite the ZFS_BOOTFS variable like so: + # rpool/ROOT/debian@snap2 => rpool/ROOT/debian_snap2 + echo "${ZFS_BOOTFS}" | grep -q '@' && \ + setup_snapshot_booting "${ZFS_BOOTFS}" + fi + + if [ -z "${ZFS_BOOTFS}" ] + then + # Still nothing! Let the user sort this out. + disable_plymouth + echo "" + echo "Error: Unknown root filesystem - no 'bootfs' pool property and" + echo " not specified on the kernel command line." + echo "" + echo "Manually mount the root filesystem on $rootmnt and then exit." + echo "Hint: Try: mount -o zfsutil -t zfs ${ZFS_RPOOL-rpool}/ROOT/system $rootmnt" + shell + fi + + # ---------------------------------------------------------------- + # M O U N T F I L E S Y S T E M S + + # * Ideally, the root filesystem would be mounted like this: + # + # zpool import -R "$rootmnt" -N "$ZFS_RPOOL" + # zfs mount -o mountpoint=/ "${ZFS_BOOTFS}" + # + # but the MOUNTPOINT prefix is preserved on descendent filesystem + # after the pivot into the regular root, which later breaks things + # like `zfs mount -a` and the /proc/self/mounts refresh. + # + # * Mount additional filesystems required + # Such as /usr, /var, /usr/local etc. + # NOTE: Mounted in the order specified in the + # ZFS_INITRD_ADDITIONAL_DATASETS variable so take care! + + # Go through the complete list (recursively) of all filesystems below + # the real root dataset + filesystems=$("${ZFS}" list -oname -tfilesystem -H -r "${ZFS_BOOTFS}") + for fs in $filesystems $ZFS_INITRD_ADDITIONAL_DATASETS + do + mount_fs "$fs" + done + + touch /run/zfs_unlock_complete + if [ -e /run/zfs_unlock_complete_notify ]; then + read zfs_unlock_complete_notify < /run/zfs_unlock_complete_notify + fi + + # ------------ + # Debugging information + if [ -n "${ZFS_DEBUG}" ] + then + #exec 2>&1- + + echo "DEBUG: imported pools:" + "${ZPOOL}" list -H + echo + + echo "DEBUG: mounted ZFS filesystems:" + mount | grep zfs + echo + + echo "=> waiting for ENTER before continuing because of 'zfsdebug=1'. " + echo -n " 'c' for shell, 'r' for reboot, 'ENTER' to continue. " + read b + + [ "$b" = "c" ] && /bin/sh + [ "$b" = "r" ] && reboot -f + + set +x + fi + + # ------------ + # Run local bottom script + if type run_scripts > /dev/null 2>&1 && \ + [ -f "/scripts/local-bottom" -o -d "/scripts/local-bottom" ] + then + [ "$quiet" != "y" ] && \ + zfs_log_begin_msg "Running /scripts/local-bottom" + run_scripts /scripts/local-bottom + [ "$quiet" != "y" ] && zfs_log_end_msg + fi +} diff --git a/contrib/initramfs/zfsunlock b/contrib/initramfs/zfsunlock new file mode 100755 index 000000000000..cf8e45249024 --- /dev/null +++ b/contrib/initramfs/zfsunlock @@ -0,0 +1,42 @@ +#!/bin/sh + +set -eu +if [ ! -e /run/zfs_fs_name ]; then + echo "Wait for the root pool to be imported or press Ctrl-C to exit." +fi +while [ ! 
-e /run/zfs_fs_name ]; do + if [ -e /run/zfs_unlock_complete ]; then + exit 0 + fi + sleep 1 +done +echo +echo "Unlocking encrypted ZFS filesystems..." +echo "Enter the password or press Ctrl-C to exit." +echo +zfs_fs_name="" +if [ ! -e /run/zfs_unlock_complete_notify ]; then + mkfifo /run/zfs_unlock_complete_notify +fi +while [ ! -e /run/zfs_unlock_complete ]; do + zfs_fs_name=$(cat /run/zfs_fs_name) + zfs_console_askpwd_cmd=$(cat /run/zfs_console_askpwd_cmd) + systemd-ask-password "Encrypted ZFS password for ${zfs_fs_name}:" | \ + /sbin/zfs load-key "$zfs_fs_name" || true + if [ "$(/sbin/zfs get -H -ovalue keystatus "$zfs_fs_name" 2> /dev/null)" = "available" ]; then + echo "Password for $zfs_fs_name accepted." + zfs_console_askpwd_pid=$(ps | awk '!'"/awk/ && /$zfs_console_askpwd_cmd/ { print \$1; exit }") + if [ -n "$zfs_console_askpwd_pid" ]; then + kill "$zfs_console_askpwd_pid" + fi + # Wait for another filesystem to unlock. + while [ "$(cat /run/zfs_fs_name)" = "$zfs_fs_name" ] && [ ! -e /run/zfs_unlock_complete ]; do + sleep 1 + done + else + echo "Wrong password. Try again." + fi +done +echo "Unlocking complete. Resuming boot sequence..." +echo "Please reconnect in a while." +echo "ok" > /run/zfs_unlock_complete_notify diff --git a/contrib/pam_zfs_key/Makefile.am b/contrib/pam_zfs_key/Makefile.am new file mode 100644 index 000000000000..f0f2550afccb --- /dev/null +++ b/contrib/pam_zfs_key/Makefile.am @@ -0,0 +1,19 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBCRYPTO_CFLAGS) + +pammodule_LTLIBRARIES=pam_zfs_key.la + +pam_zfs_key_la_SOURCES = pam_zfs_key.c + +pam_zfs_key_la_LIBADD = \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la \ + $(abs_top_builddir)/lib/libzfs/libzfs.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la + +pam_zfs_key_la_LDFLAGS = -version-info 1:0:0 -avoid-version -module -shared + +pam_zfs_key_la_LIBADD += -lpam $(LIBCRYPTO_LIBS) + +dist_pamconfigs_DATA = zfs_key diff --git a/contrib/pam_zfs_key/pam_zfs_key.c b/contrib/pam_zfs_key/pam_zfs_key.c new file mode 100644 index 000000000000..0a96f19a3cd0 --- /dev/null +++ b/contrib/pam_zfs_key/pam_zfs_key.c @@ -0,0 +1,741 @@ +/* + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * * Neither the name of the nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL BE LIABLE FOR ANY + * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * Copyright (c) 2020, Felix Dörre + * All rights reserved. + */ + +#include +#include +#include + +#include + +#include +#include + +#define PAM_SM_AUTH +#define PAM_SM_PASSWORD +#define PAM_SM_SESSION +#include + +#if defined(__linux__) +#include +#elif defined(__FreeBSD__) +#include +static void +pam_syslog(pam_handle_t *pamh, int loglevel, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + vsyslog(loglevel, fmt, args); + va_end(args); +} +#endif + +#include + +#include +#include +#include +#include +#include + +#include + +static const char PASSWORD_VAR_NAME[] = "pam_zfs_key_authtok"; + +static libzfs_handle_t *g_zfs; + +static void destroy_pw(pam_handle_t *pamh, void *data, int errcode); + +typedef struct { + size_t len; + char *value; +} pw_password_t; + +static pw_password_t * +alloc_pw_size(size_t len) +{ + pw_password_t *pw = malloc(sizeof (pw_password_t)); + if (!pw) { + return (NULL); + } + pw->len = len; + pw->value = malloc(len); + if (!pw->value) { + free(pw); + return (NULL); + } + mlock(pw->value, pw->len); + return (pw); +} + +static pw_password_t * +alloc_pw_string(const char *source) +{ + pw_password_t *pw = malloc(sizeof (pw_password_t)); + if (!pw) { + return (NULL); + } + pw->len = strlen(source) + 1; + pw->value = malloc(pw->len); + if (!pw->value) { + free(pw); + return (NULL); + } + mlock(pw->value, pw->len); + memcpy(pw->value, source, pw->len); + return (pw); +} + +static void +pw_free(pw_password_t *pw) +{ + bzero(pw->value, pw->len); + munlock(pw->value, pw->len); + free(pw->value); + free(pw); +} + +static pw_password_t * +pw_fetch(pam_handle_t *pamh) +{ + const char *token; + if (pam_get_authtok(pamh, PAM_AUTHTOK, &token, NULL) != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, + "couldn't get password from PAM stack"); + return (NULL); + } + if (!token) { + pam_syslog(pamh, LOG_ERR, + "token from PAM stack is null"); + return (NULL); + } + return (alloc_pw_string(token)); +} + +static const pw_password_t * +pw_fetch_lazy(pam_handle_t *pamh) +{ + pw_password_t *pw = pw_fetch(pamh); + if (pw == NULL) { + return (NULL); + } + int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, pw, destroy_pw); + if (ret != PAM_SUCCESS) { + pw_free(pw); + pam_syslog(pamh, LOG_ERR, "pam_set_data failed"); + return (NULL); + } + return (pw); +} + +static const pw_password_t * +pw_get(pam_handle_t *pamh) +{ + const pw_password_t *authtok = NULL; + int ret = pam_get_data(pamh, PASSWORD_VAR_NAME, + (const void**)(&authtok)); + if (ret == PAM_SUCCESS) + return (authtok); + if (ret == PAM_NO_MODULE_DATA) + return (pw_fetch_lazy(pamh)); + pam_syslog(pamh, LOG_ERR, "password not available"); + return (NULL); +} + +static int +pw_clear(pam_handle_t *pamh) +{ + int ret = pam_set_data(pamh, PASSWORD_VAR_NAME, NULL, NULL); + if (ret != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, "clearing password failed"); + return (-1); + } + return (0); +} + +static void +destroy_pw(pam_handle_t *pamh, void *data, int errcode) +{ + if (data != NULL) { + pw_free((pw_password_t *)data); + } +} + +static 
int +pam_zfs_init(pam_handle_t *pamh) +{ + int error = 0; + if ((g_zfs = libzfs_init()) == NULL) { + error = errno; + pam_syslog(pamh, LOG_ERR, "Zfs initialization error: %s", + libzfs_error_init(error)); + } + return (error); +} + +static void +pam_zfs_free(void) +{ + libzfs_fini(g_zfs); +} + +static pw_password_t * +prepare_passphrase(pam_handle_t *pamh, zfs_handle_t *ds, + const char *passphrase, nvlist_t *nvlist) +{ + pw_password_t *key = alloc_pw_size(WRAPPING_KEY_LEN); + if (!key) { + return (NULL); + } + uint64_t salt; + uint64_t iters; + if (nvlist != NULL) { + int fd = open("/dev/urandom", O_RDONLY); + if (fd < 0) { + pw_free(key); + return (NULL); + } + int bytes_read = 0; + char *buf = (char *)&salt; + size_t bytes = sizeof (uint64_t); + while (bytes_read < bytes) { + ssize_t len = read(fd, buf + bytes_read, bytes + - bytes_read); + if (len < 0) { + close(fd); + pw_free(key); + return (NULL); + } + bytes_read += len; + } + close(fd); + + if (nvlist_add_uint64(nvlist, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt)) { + pam_syslog(pamh, LOG_ERR, + "failed to add salt to nvlist"); + pw_free(key); + return (NULL); + } + iters = DEFAULT_PBKDF2_ITERATIONS; + if (nvlist_add_uint64(nvlist, zfs_prop_to_name( + ZFS_PROP_PBKDF2_ITERS), iters)) { + pam_syslog(pamh, LOG_ERR, + "failed to add iters to nvlist"); + pw_free(key); + return (NULL); + } + } else { + salt = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_SALT); + iters = zfs_prop_get_int(ds, ZFS_PROP_PBKDF2_ITERS); + } + + salt = LE_64(salt); + if (!PKCS5_PBKDF2_HMAC_SHA1((char *)passphrase, + strlen(passphrase), (uint8_t *)&salt, + sizeof (uint64_t), iters, WRAPPING_KEY_LEN, + (uint8_t *)key->value)) { + pam_syslog(pamh, LOG_ERR, "pbkdf failed"); + pw_free(key); + return (NULL); + } + return (key); +} + +static int +is_key_loaded(pam_handle_t *pamh, const char *ds_name) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + int keystatus = zfs_prop_get_int(ds, ZFS_PROP_KEYSTATUS); + zfs_close(ds); + return (keystatus != ZFS_KEYSTATUS_UNAVAILABLE); +} + +static int +change_key(pam_handle_t *pamh, const char *ds_name, + const char *passphrase) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + nvlist_t *nvlist = fnvlist_alloc(); + pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, nvlist); + if (key == NULL) { + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + if (nvlist_add_string(nvlist, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + "prompt")) { + pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keylocation"); + pw_free(key); + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + if (nvlist_add_uint64(nvlist, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + ZFS_KEYFORMAT_PASSPHRASE)) { + pam_syslog(pamh, LOG_ERR, "nvlist_add failed for keyformat"); + pw_free(key); + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + int ret = lzc_change_key(ds_name, DCP_CMD_NEW_KEY, nvlist, + (uint8_t *)key->value, WRAPPING_KEY_LEN); + pw_free(key); + if (ret) { + pam_syslog(pamh, LOG_ERR, "change_key failed: %d", ret); + nvlist_free(nvlist); + zfs_close(ds); + return (-1); + } + nvlist_free(nvlist); + zfs_close(ds); + return (0); +} + +static int +decrypt_mount(pam_handle_t *pamh, const char *ds_name, + const char *passphrase) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, 
ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + pw_password_t *key = prepare_passphrase(pamh, ds, passphrase, NULL); + if (key == NULL) { + zfs_close(ds); + return (-1); + } + int ret = lzc_load_key(ds_name, B_FALSE, (uint8_t *)key->value, + WRAPPING_KEY_LEN); + pw_free(key); + if (ret) { + pam_syslog(pamh, LOG_ERR, "load_key failed: %d", ret); + zfs_close(ds); + return (-1); + } + ret = zfs_mount(ds, NULL, 0); + if (ret) { + pam_syslog(pamh, LOG_ERR, "mount failed: %d", ret); + zfs_close(ds); + return (-1); + } + zfs_close(ds); + return (0); +} + +static int +unmount_unload(pam_handle_t *pamh, const char *ds_name) +{ + zfs_handle_t *ds = zfs_open(g_zfs, ds_name, ZFS_TYPE_FILESYSTEM); + if (ds == NULL) { + pam_syslog(pamh, LOG_ERR, "dataset %s not found", ds_name); + return (-1); + } + int ret = zfs_unmount(ds, NULL, 0); + if (ret) { + pam_syslog(pamh, LOG_ERR, "zfs_unmount failed with: %d", ret); + zfs_close(ds); + return (-1); + } + + ret = lzc_unload_key(ds_name); + if (ret) { + pam_syslog(pamh, LOG_ERR, "unload_key failed with: %d", ret); + zfs_close(ds); + return (-1); + } + zfs_close(ds); + return (0); +} + +typedef struct { + char *homes_prefix; + char *runstatedir; + uid_t uid; + const char *username; + int unmount_and_unload; +} zfs_key_config_t; + +static int +zfs_key_config_load(pam_handle_t *pamh, zfs_key_config_t *config, + int argc, const char **argv) +{ + config->homes_prefix = strdup("rpool/home"); + if (config->homes_prefix == NULL) { + pam_syslog(pamh, LOG_ERR, "strdup failure"); + return (-1); + } + config->runstatedir = strdup(RUNSTATEDIR "/pam_zfs_key"); + if (config->runstatedir == NULL) { + pam_syslog(pamh, LOG_ERR, "strdup failure"); + free(config->homes_prefix); + return (-1); + } + const char *name; + if (pam_get_user(pamh, &name, NULL) != PAM_SUCCESS) { + pam_syslog(pamh, LOG_ERR, + "couldn't get username from PAM stack"); + free(config->runstatedir); + free(config->homes_prefix); + return (-1); + } + struct passwd *entry = getpwnam(name); + if (!entry) { + free(config->runstatedir); + free(config->homes_prefix); + return (-1); + } + config->uid = entry->pw_uid; + config->username = name; + config->unmount_and_unload = 1; + for (int c = 0; c < argc; c++) { + if (strncmp(argv[c], "homes=", 6) == 0) { + free(config->homes_prefix); + config->homes_prefix = strdup(argv[c] + 6); + } else if (strncmp(argv[c], "runstatedir=", 12) == 0) { + free(config->runstatedir); + config->runstatedir = strdup(argv[c] + 12); + } else if (strcmp(argv[c], "nounmount") == 0) { + config->unmount_and_unload = 0; + } + } + return (0); +} + +static void +zfs_key_config_free(zfs_key_config_t *config) +{ + free(config->homes_prefix); +} + +static char * +zfs_key_config_get_dataset(zfs_key_config_t *config) +{ + size_t len = ZFS_MAX_DATASET_NAME_LEN; + size_t total_len = strlen(config->homes_prefix) + 1 + + strlen(config->username); + if (total_len > len) { + return (NULL); + } + char *ret = malloc(len + 1); + if (!ret) { + return (NULL); + } + ret[0] = 0; + strcat(ret, config->homes_prefix); + strcat(ret, "/"); + strcat(ret, config->username); + return (ret); +} + +static int +zfs_key_config_modify_session_counter(pam_handle_t *pamh, + zfs_key_config_t *config, int delta) +{ + const char *runtime_path = config->runstatedir; + if (mkdir(runtime_path, S_IRWXU) != 0 && errno != EEXIST) { + pam_syslog(pamh, LOG_ERR, "Can't create runtime path: %d", + errno); + return (-1); + } + if (chown(runtime_path, 0, 0) != 0) { + 
pam_syslog(pamh, LOG_ERR, "Can't chown runtime path: %d", + errno); + return (-1); + } + if (chmod(runtime_path, S_IRWXU) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't chmod runtime path: %d", + errno); + return (-1); + } + size_t runtime_path_len = strlen(runtime_path); + size_t counter_path_len = runtime_path_len + 1 + 10; + char *counter_path = malloc(counter_path_len + 1); + if (!counter_path) { + return (-1); + } + counter_path[0] = 0; + strcat(counter_path, runtime_path); + snprintf(counter_path + runtime_path_len, counter_path_len, "/%d", + config->uid); + const int fd = open(counter_path, + O_RDWR | O_CLOEXEC | O_CREAT | O_NOFOLLOW, + S_IRUSR | S_IWUSR); + free(counter_path); + if (fd < 0) { + pam_syslog(pamh, LOG_ERR, "Can't open counter file: %d", errno); + return (-1); + } + if (flock(fd, LOCK_EX) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't lock counter file: %d", errno); + close(fd); + return (-1); + } + char counter[20]; + char *pos = counter; + int remaining = sizeof (counter) - 1; + int ret; + counter[sizeof (counter) - 1] = 0; + while (remaining > 0 && (ret = read(fd, pos, remaining)) > 0) { + remaining -= ret; + pos += ret; + } + *pos = 0; + long int counter_value = strtol(counter, NULL, 10); + counter_value += delta; + if (counter_value < 0) { + counter_value = 0; + } + lseek(fd, 0, SEEK_SET); + if (ftruncate(fd, 0) != 0) { + pam_syslog(pamh, LOG_ERR, "Can't truncate counter file: %d", + errno); + close(fd); + return (-1); + } + snprintf(counter, sizeof (counter), "%ld", counter_value); + remaining = strlen(counter); + pos = counter; + while (remaining > 0 && (ret = write(fd, pos, remaining)) > 0) { + remaining -= ret; + pos += ret; + } + close(fd); + return (counter_value); +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_authenticate(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (pw_fetch_lazy(pamh) == NULL) { + return (PAM_AUTH_ERR); + } + + return (PAM_SUCCESS); +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_setcred(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + return (PAM_SUCCESS); +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_chauthtok(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_PERM_DENIED); + } + zfs_key_config_t config; + if (zfs_key_config_load(pamh, &config, argc, argv) == -1) { + return (PAM_SERVICE_ERR); + } + if (config.uid < 1000) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + { + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + int key_loaded = is_key_loaded(pamh, dataset); + if (key_loaded == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + free(dataset); + pam_zfs_free(); + if (! 
key_loaded) { + pam_syslog(pamh, LOG_ERR, + "key not loaded, returning try_again"); + zfs_key_config_free(&config); + return (PAM_PERM_DENIED); + } + } + + if ((flags & PAM_UPDATE_AUTHTOK) != 0) { + const pw_password_t *token = pw_get(pamh); + if (token == NULL) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (change_key(pamh, dataset, token->value) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + if (pw_clear(pamh) == -1) { + return (PAM_SERVICE_ERR); + } + } else { + zfs_key_config_free(&config); + } + return (PAM_SUCCESS); +} + +PAM_EXTERN int +pam_sm_open_session(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_SUCCESS); + } + zfs_key_config_t config; + zfs_key_config_load(pamh, &config, argc, argv); + if (config.uid < 1000) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + int counter = zfs_key_config_modify_session_counter(pamh, &config, 1); + if (counter != 1) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + const pw_password_t *token = pw_get(pamh); + if (token == NULL) { + zfs_key_config_free(&config); + return (PAM_SESSION_ERR); + } + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + if (decrypt_mount(pamh, dataset, token->value) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + if (pw_clear(pamh) == -1) { + return (PAM_SERVICE_ERR); + } + return (PAM_SUCCESS); + +} + +__attribute__((visibility("default"))) +PAM_EXTERN int +pam_sm_close_session(pam_handle_t *pamh, int flags, + int argc, const char **argv) +{ + if (geteuid() != 0) { + pam_syslog(pamh, LOG_ERR, + "Cannot zfs_mount when not being root."); + return (PAM_SUCCESS); + } + zfs_key_config_t config; + zfs_key_config_load(pamh, &config, argc, argv); + if (config.uid < 1000) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + int counter = zfs_key_config_modify_session_counter(pamh, &config, -1); + if (counter != 0) { + zfs_key_config_free(&config); + return (PAM_SUCCESS); + } + + if (config.unmount_and_unload) { + if (pam_zfs_init(pamh) != 0) { + zfs_key_config_free(&config); + return (PAM_SERVICE_ERR); + } + char *dataset = zfs_key_config_get_dataset(&config); + if (!dataset) { + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SESSION_ERR); + } + if (unmount_unload(pamh, dataset) == -1) { + free(dataset); + pam_zfs_free(); + zfs_key_config_free(&config); + return (PAM_SESSION_ERR); + } + free(dataset); + pam_zfs_free(); + } + + zfs_key_config_free(&config); + return (PAM_SUCCESS); +} diff --git a/contrib/pam_zfs_key/zfs_key b/contrib/pam_zfs_key/zfs_key new file mode 100644 index 000000000000..e3ed5c4f2fa7 --- /dev/null +++ b/contrib/pam_zfs_key/zfs_key @@ -0,0 +1,13 @@ +Name: Unlock zfs datasets for user +Default: yes 
+Priority: 128 +Auth-Type: Additional +Auth: + optional pam_zfs_key.so +Session-Interactive-Only: yes +Session-Type: Additional +Session: + optional pam_zfs_key.so +Password-Type: Additional +Password: + optional pam_zfs_key.so diff --git a/contrib/pyzfs/.gitignore b/contrib/pyzfs/.gitignore new file mode 100644 index 000000000000..f2e63db0fcf5 --- /dev/null +++ b/contrib/pyzfs/.gitignore @@ -0,0 +1,3 @@ +build +pyzfs.egg-info +setup.py diff --git a/contrib/pyzfs/LICENSE b/contrib/pyzfs/LICENSE new file mode 100644 index 000000000000..d64569567334 --- /dev/null +++ b/contrib/pyzfs/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/contrib/pyzfs/Makefile.am b/contrib/pyzfs/Makefile.am new file mode 100644 index 000000000000..fa1bb32ce2eb --- /dev/null +++ b/contrib/pyzfs/Makefile.am @@ -0,0 +1,40 @@ +EXTRA_DIST = libzfs_core setup.py.in README LICENSE docs + +if PYZFS_ENABLED +all: + +all-local: + $(PYTHON) setup.py build + +# +# On Debian (Ubuntu, and other downstream distros) the install location of +# Python packages is "../dist-packages" instead of "../site-packages" [1]. +# The install location used by "$(PYTHON) setup.py install" must match the +# location specified in the ZFS specfile (RPM macro "%{python_sitelib}") to +# avoid errors during the rpmbuild process. +# However we cannot pass "--install-layout=deb" to the setup script here because +# it is not supported on RPM-based distros; we use the combination of +# "--prefix", "--root" and "--install-lib" parameters instead which should work +# on every supported system. +# +# [1] https://wiki.debian.org/Python#Deviations_from_upstream +# +# Using "--no-compile" will not generate .pyc files which, in turn, will not be +# packaged: this could result in failures during the uninstall phase if these +# files are later created by manually loading the Python modules. +# +install-exec-local: + $(PYTHON) $(builddir)/setup.py install \ + --prefix $(prefix) \ + --root $(DESTDIR)/ \ + --install-lib $(pythonsitedir) \ + --single-version-externally-managed \ + --verbose + +clean: clean-local + rm -rf build/ pyzfs.egg-info/ + +clean-local: + +check-local: all +endif diff --git a/contrib/pyzfs/README b/contrib/pyzfs/README new file mode 100644 index 000000000000..52983e5a90e0 --- /dev/null +++ b/contrib/pyzfs/README @@ -0,0 +1,28 @@ +This package provides a wrapper for libzfs_core C library. + +libzfs_core is intended to be a stable interface for programmatic +administration of ZFS. +This wrapper provides one-to-one wrappers for libzfs_core API functions, +but the signatures and types are more natural to Python. +nvlists are wrapped as dictionaries or lists depending on their usage. +Some parameters have default values depending on typical use for +increased convenience. +Enumerations and bit flags become strings and lists of strings in Python. +Errors are reported as exceptions rather than integer errno-style +error codes. 
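+For illustration, a failed batch operation raises a compound exception
+that carries the per-name causes (a sketch; the dataset name below is
+made up):
+
+    import libzfs_core
+
+    try:
+        libzfs_core.lzc_snapshot([b'pool/fs@snap'])
+    except libzfs_core.exceptions.SnapshotFailure as e:
+        print(e.errors)  # one underlying exception per failed name
+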
+The wrapper takes care to provide one-to-many mapping
+of the error codes to the exceptions by interpreting a context
+in which the error code is produced.
+
+Unit tests and automated tests for the libzfs_core API are provided
+with this package.
+Please note that the API tests perform lots of ZFS dataset level
+operations and ZFS tries hard to ensure that any modifications
+do reach stable storage. That means that the operations are done
+synchronously and that, for example, disk caches are flushed.
+Thus, the tests can be very slow on real hardware.
+It is recommended to place the default temporary directory or
+a temporary directory specified by, for instance, the TMP environment
+variable on a memory-backed filesystem.
+
+Package documentation: http://pyzfs.readthedocs.org
+Package development: https://github.com/zfsonlinux/zfs
diff --git a/contrib/pyzfs/docs/source/conf.py b/contrib/pyzfs/docs/source/conf.py
new file mode 100644
index 000000000000..4bbb938b6296
--- /dev/null
+++ b/contrib/pyzfs/docs/source/conf.py
@@ -0,0 +1,304 @@
+# -*- coding: utf-8 -*-
+# flake8: noqa
+#
+# pyzfs documentation build configuration file, created by
+# sphinx-quickstart on Mon Apr 6 23:48:40 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+sys.path.insert(0, os.path.abspath('../..'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = [
+    'sphinx.ext.autodoc',
+]
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = ['_templates']
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffixes as a list of strings:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'pyzfs'
+copyright = u'2015, ClusterHQ'
+author = u'ClusterHQ'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = '0.2.3'
+# The full version, including alpha/beta/rc tags.
+release = '0.2.3'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y' + +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +exclude_patterns = [] + +# The reST default role (used for this markup: `text`) to use for all +# documents. +#default_role = None + +# If true, '()' will be appended to :func: etc. cross-reference text. +#add_function_parentheses = True + +# If true, the current module name will be prepended to all description +# unit titles (such as .. function::). +#add_module_names = True + +# If true, sectionauthor and moduleauthor directives will be shown in the +# output. They are ignored by default. +#show_authors = False + +# The name of the Pygments (syntax highlighting) style to use. +pygments_style = 'sphinx' + +# A list of ignored prefixes for module index sorting. +#modindex_common_prefix = [] + +# If true, keep warnings as "system message" paragraphs in the built documents. +#keep_warnings = False + +# If true, `todo` and `todoList` produce output, else they produce nothing. +todo_include_todos = False + + +# -- Options for HTML output ---------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +html_theme = 'classic' + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. +#html_theme_options = {} + +# Add any paths that contain custom themes here, relative to this directory. +#html_theme_path = [] + +# The name for this set of Sphinx documents. If None, it defaults to +# " v documentation". +#html_title = None + +# A shorter title for the navigation bar. Default is the same as html_title. +#html_short_title = None + +# The name of an image file (relative to this directory) to place at the top +# of the sidebar. +#html_logo = None + +# The name of an image file (within the static path) to use as favicon of the +# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32 +# pixels large. +#html_favicon = None + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] + +# Add any extra paths that contain custom files (such as robots.txt or +# .htaccess) here, relative to this directory. These files are copied +# directly to the root of the documentation. +#html_extra_path = [] + +# If not '', a 'Last updated on:' timestamp is inserted at every page bottom, +# using the given strftime format. +#html_last_updated_fmt = '%b %d, %Y' + +# If true, SmartyPants will be used to convert quotes and dashes to +# typographically correct entities. +#html_use_smartypants = True + +# Custom sidebar templates, maps document names to template names. +#html_sidebars = {} + +# Additional templates that should be rendered to pages, maps page names to +# template names. +#html_additional_pages = {} + +# If false, no module index is generated. +#html_domain_indices = True + +# If false, no index is generated. +#html_use_index = True + +# If true, the index is split into individual pages for each letter. +#html_split_index = False + +# If true, links to the reST sources are added to the pages. +#html_show_sourcelink = True + +# If true, "Created using Sphinx" is shown in the HTML footer. Default is True. 
+#html_show_sphinx = True + +# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True. +#html_show_copyright = True + +# If true, an OpenSearch description file will be output, and all pages will +# contain a tag referring to it. The value of this option must be the +# base URL from which the finished HTML is served. +#html_use_opensearch = '' + +# This is the file name suffix for HTML files (e.g. ".xhtml"). +#html_file_suffix = None + +# Language to be used for generating the HTML full-text search index. +# Sphinx supports the following languages: +# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja' +# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr' +#html_search_language = 'en' + +# A dictionary with options for the search language support, empty by default. +# Now only 'ja' uses this config value +#html_search_options = {'type': 'default'} + +# The name of a javascript file (relative to the configuration directory) that +# implements a search results scorer. If empty, the default will be used. +#html_search_scorer = 'scorer.js' + +# Output file base name for HTML help builder. +htmlhelp_basename = 'pyzfsdoc' + +# -- Options for LaTeX output --------------------------------------------- + +latex_elements = { +# The paper size ('letterpaper' or 'a4paper'). +#'papersize': 'letterpaper', + +# The font size ('10pt', '11pt' or '12pt'). +#'pointsize': '10pt', + +# Additional stuff for the LaTeX preamble. +#'preamble': '', + +# Latex figure (float) alignment +#'figure_align': 'htbp', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (master_doc, 'pyzfs.tex', u'pyzfs Documentation', + u'ClusterHQ', 'manual'), +] + +# The name of an image file (relative to this directory) to place at the top of +# the title page. +#latex_logo = None + +# For "manual" documents, if this is true, then toplevel headings are parts, +# not chapters. +#latex_use_parts = False + +# If true, show page references after internal links. +#latex_show_pagerefs = False + +# If true, show URL addresses after external links. +#latex_show_urls = False + +# Documents to append as an appendix to all manuals. +#latex_appendices = [] + +# If false, no module index is generated. +#latex_domain_indices = True + + +# -- Options for manual page output --------------------------------------- + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [ + (master_doc, 'pyzfs', u'pyzfs Documentation', + [author], 1) +] + +# If true, show URL addresses after external links. +#man_show_urls = False + + +# -- Options for Texinfo output ------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (master_doc, 'pyzfs', u'pyzfs Documentation', + author, 'pyzfs', 'One line description of project.', + 'Miscellaneous'), +] + +# Documents to append as an appendix to all manuals. +#texinfo_appendices = [] + +# If false, no module index is generated. +#texinfo_domain_indices = True + +# How to display URL addresses: 'footnote', 'no', or 'inline'. +#texinfo_show_urls = 'footnote' + +# If true, do not generate a @detailmenu in the "Top" node's menu. +#texinfo_no_detailmenu = False + +# Sort documentation in the same order as the source files. 
+autodoc_member_order = 'bysource' + + +####################### +# Neutralize effects of function wrapping on documented signatures. +# The affected signatures could be explicitly placed into the +# documentation (either in .rst files or as a first line of a +# docstring). +import functools + +def no_op_wraps(func): + def wrapper(decorator): + return func + return wrapper + +functools.wraps = no_op_wraps diff --git a/contrib/pyzfs/docs/source/index.rst b/contrib/pyzfs/docs/source/index.rst new file mode 100644 index 000000000000..36c227a49971 --- /dev/null +++ b/contrib/pyzfs/docs/source/index.rst @@ -0,0 +1,44 @@ +.. pyzfs documentation master file, created by + sphinx-quickstart on Mon Apr 6 23:48:40 2015. + You can adapt this file completely to your liking, but it should at least + contain the root `toctree` directive. + +Welcome to pyzfs's documentation! +================================= + +Contents: + +.. toctree:: + :maxdepth: 2 + + + +Indices and tables +================== + +* :ref:`genindex` +* :ref:`modindex` +* :ref:`search` + +Documentation for the libzfs_core +********************************* + +.. automodule:: libzfs_core + :members: + :exclude-members: lzc_snap, lzc_recv, lzc_destroy_one, + lzc_inherit, lzc_set_props, lzc_list + +Documentation for the libzfs_core exceptions +******************************************** + +.. automodule:: libzfs_core.exceptions + :members: + :undoc-members: + +Documentation for the miscellaneous types that correspond to specific width C types +*********************************************************************************** + +.. automodule:: libzfs_core.ctypes + :members: + :undoc-members: + diff --git a/contrib/pyzfs/libzfs_core/__init__.py b/contrib/pyzfs/libzfs_core/__init__.py new file mode 100644 index 000000000000..78e96738e29e --- /dev/null +++ b/contrib/pyzfs/libzfs_core/__init__.py @@ -0,0 +1,154 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +''' +Python wrappers for **libzfs_core** library. + +*libzfs_core* is intended to be a stable, committed interface for programmatic +administration of ZFS. +This wrapper provides one-to-one wrappers for libzfs_core API functions, +but the signatures and types are more natural to Python. +nvlists are wrapped as dictionaries or lists depending on their usage. +Some parameters have default values depending on typical use for +increased convenience. +Output parameters are not used and return values are directly returned. +Enumerations and bit flags become strings and lists of strings in Python. +Errors are reported as exceptions rather than integer errno-style +error codes. The wrapper takes care to provide one-to-many mapping +of the error codes to the exceptions by interpreting a context +in which the error code is produced. + +To submit an issue or contribute to development of this package +please visit its `GitHub repository `_. + +.. data:: MAXNAMELEN + + Maximum length of any ZFS name. 
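+
+A minimal usage sketch of creating a file system and snapshotting it
+(the pool and dataset names here are hypothetical)::
+
+    import libzfs_core
+    from libzfs_core import exceptions
+
+    try:
+        libzfs_core.lzc_create(b'tank/home/jdoe')
+    except exceptions.FilesystemExists:
+        pass  # created on an earlier run; safe to continue
+    libzfs_core.lzc_snapshot([b'tank/home/jdoe@backup'])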
+''' +from __future__ import absolute_import, division, print_function + +from ._constants import ( + MAXNAMELEN, + ZCP_DEFAULT_INSTRLIMIT, + ZCP_DEFAULT_MEMLIMIT, + WRAPPING_KEY_LEN, + zfs_key_location, + zfs_keyformat, + zio_encrypt +) + +from ._libzfs_core import ( + lzc_bookmark, + lzc_change_key, + lzc_channel_program, + lzc_channel_program_nosync, + lzc_clone, + lzc_create, + lzc_destroy_bookmarks, + lzc_destroy_snaps, + lzc_exists, + lzc_get_bookmarks, + lzc_get_holds, + lzc_hold, + lzc_load_key, + lzc_pool_checkpoint, + lzc_pool_checkpoint_discard, + lzc_promote, + lzc_receive, + lzc_receive_one, + lzc_receive_resumable, + lzc_receive_with_cmdprops, + lzc_receive_with_header, + lzc_release, + lzc_reopen, + lzc_rollback, + lzc_rollback_to, + lzc_send, + lzc_send_resume, + lzc_send_space, + lzc_snaprange_space, + lzc_snapshot, + lzc_sync, + lzc_unload_key, + is_supported, + lzc_recv, + lzc_snap, + lzc_rename, + lzc_destroy, + lzc_inherit_prop, + lzc_get_props, + lzc_set_props, + lzc_list_children, + lzc_list_snaps, + receive_header, +) + +__all__ = [ + 'ctypes', + 'exceptions', + 'MAXNAMELEN', + 'ZCP_DEFAULT_INSTRLIMIT', + 'ZCP_DEFAULT_MEMLIMIT', + 'WRAPPING_KEY_LEN', + 'zfs_key_location', + 'zfs_keyformat', + 'zio_encrypt', + 'lzc_bookmark', + 'lzc_change_key', + 'lzc_channel_program', + 'lzc_channel_program_nosync', + 'lzc_clone', + 'lzc_create', + 'lzc_destroy_bookmarks', + 'lzc_destroy_snaps', + 'lzc_exists', + 'lzc_get_bookmarks', + 'lzc_get_holds', + 'lzc_hold', + 'lzc_load_key', + 'lzc_pool_checkpoint', + 'lzc_pool_checkpoint_discard', + 'lzc_promote', + 'lzc_receive', + 'lzc_receive_one', + 'lzc_receive_resumable', + 'lzc_receive_with_cmdprops', + 'lzc_receive_with_header', + 'lzc_release', + 'lzc_reopen', + 'lzc_rollback', + 'lzc_rollback_to', + 'lzc_send', + 'lzc_send_resume', + 'lzc_send_space', + 'lzc_snaprange_space', + 'lzc_snapshot', + 'lzc_sync', + 'lzc_unload_key', + 'is_supported', + 'lzc_recv', + 'lzc_snap', + 'lzc_rename', + 'lzc_destroy', + 'lzc_inherit_prop', + 'lzc_get_props', + 'lzc_set_props', + 'lzc_list_children', + 'lzc_list_snaps', + 'receive_header', +] + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_constants.py b/contrib/pyzfs/libzfs_core/_constants.py new file mode 100644 index 000000000000..2dfed224c29d --- /dev/null +++ b/contrib/pyzfs/libzfs_core/_constants.py @@ -0,0 +1,114 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Important `libzfs_core` constants. 
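+
+The ``enum`` helpers defined below assign consecutive integer values to
+a sequence of names, optionally starting at a fixed offset; the first
+ZFS-specific error code, for example, is pinned at 1024::
+
+    >>> zfs_errno.ZFS_ERR_CHECKPOINT_EXISTS
+    1024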
+""" + +from __future__ import absolute_import, division, print_function +import errno +import sys + + +# Compat for platform-specific errnos +if sys.platform.startswith('freebsd'): + ECHRNG = errno.ENXIO + ECKSUM = 97 # EINTEGRITY + ETIME = errno.ETIMEDOUT +else: + ECHRNG = errno.ECHRNG + ECKSUM = errno.EBADE + ETIME = errno.ETIME + + +# https://stackoverflow.com/a/1695250 +def enum_with_offset(offset, sequential, named): + enums = dict(((b, a + offset) for a, b in enumerate(sequential)), **named) + return type('Enum', (), enums) + + +def enum(*sequential, **named): + return enum_with_offset(0, sequential, named) + + +#: Maximum length of any ZFS name. +MAXNAMELEN = 255 +#: Default channel program limits +ZCP_DEFAULT_INSTRLIMIT = 10 * 1000 * 1000 +ZCP_DEFAULT_MEMLIMIT = 10 * 1024 * 1024 +#: Encryption wrapping key length +WRAPPING_KEY_LEN = 32 +#: Encryption key location enum +zfs_key_location = enum( + 'ZFS_KEYLOCATION_NONE', + 'ZFS_KEYLOCATION_PROMPT', + 'ZFS_KEYLOCATION_URI' +) +#: Encryption key format enum +zfs_keyformat = enum( + 'ZFS_KEYFORMAT_NONE', + 'ZFS_KEYFORMAT_RAW', + 'ZFS_KEYFORMAT_HEX', + 'ZFS_KEYFORMAT_PASSPHRASE' +) +# Encryption algorithms enum +zio_encrypt = enum( + 'ZIO_CRYPT_INHERIT', + 'ZIO_CRYPT_ON', + 'ZIO_CRYPT_OFF', + 'ZIO_CRYPT_AES_128_CCM', + 'ZIO_CRYPT_AES_192_CCM', + 'ZIO_CRYPT_AES_256_CCM', + 'ZIO_CRYPT_AES_128_GCM', + 'ZIO_CRYPT_AES_192_GCM', + 'ZIO_CRYPT_AES_256_GCM' +) +# ZFS-specific error codes +zfs_errno = enum_with_offset(1024, [ + 'ZFS_ERR_CHECKPOINT_EXISTS', + 'ZFS_ERR_DISCARDING_CHECKPOINT', + 'ZFS_ERR_NO_CHECKPOINT', + 'ZFS_ERR_DEVRM_IN_PROGRESS', + 'ZFS_ERR_VDEV_TOO_BIG', + 'ZFS_ERR_IOC_CMD_UNAVAIL', + 'ZFS_ERR_IOC_ARG_UNAVAIL', + 'ZFS_ERR_IOC_ARG_REQUIRED', + 'ZFS_ERR_IOC_ARG_BADTYPE', + 'ZFS_ERR_WRONG_PARENT', + 'ZFS_ERR_FROM_IVSET_GUID_MISSING', + 'ZFS_ERR_FROM_IVSET_GUID_MISMATCH', + 'ZFS_ERR_SPILL_BLOCK_FLAG_MISSING', + 'ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE', + 'ZFS_ERR_EXPORT_IN_PROGRESS', + 'ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR', + 'ZFS_ERR_STREAM_TRUNCATED', + 'ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH', + 'ZFS_ERR_RESILVER_IN_PROGRESS', + 'ZFS_ERR_REBUILD_IN_PROGRESS', + 'ZFS_ERR_BADPROP', + ], + {} +) +# compat before we used the enum helper for these values +ZFS_ERR_CHECKPOINT_EXISTS = zfs_errno.ZFS_ERR_CHECKPOINT_EXISTS +assert(ZFS_ERR_CHECKPOINT_EXISTS == 1024) +ZFS_ERR_DISCARDING_CHECKPOINT = zfs_errno.ZFS_ERR_DISCARDING_CHECKPOINT +ZFS_ERR_NO_CHECKPOINT = zfs_errno.ZFS_ERR_NO_CHECKPOINT +ZFS_ERR_DEVRM_IN_PROGRESS = zfs_errno.ZFS_ERR_DEVRM_IN_PROGRESS +ZFS_ERR_VDEV_TOO_BIG = zfs_errno.ZFS_ERR_VDEV_TOO_BIG +ZFS_ERR_WRONG_PARENT = zfs_errno.ZFS_ERR_WRONG_PARENT + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_error_translation.py b/contrib/pyzfs/libzfs_core/_error_translation.py new file mode 100644 index 000000000000..f494461f63b2 --- /dev/null +++ b/contrib/pyzfs/libzfs_core/_error_translation.py @@ -0,0 +1,840 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +""" +Helper routines for converting ``errno`` style error codes from C functions +to Python exceptions defined by `libzfs_core` API. + +The conversion heavily depends on the context of the error: the attempted +operation and the input parameters. For this reason, there is a conversion +routine for each `libzfs_core` interface function. The conversion routines +have the return code as a parameter as well as all the parameters of the +corresponding interface functions. + +The parameters and exceptions are documented in the `libzfs_core` interfaces. +""" +from __future__ import absolute_import, division, print_function + +import errno +import re +import string +from . import exceptions as lzc_exc +from ._constants import ( + ECHRNG, + ECKSUM, + ETIME, + MAXNAMELEN, + ZFS_ERR_CHECKPOINT_EXISTS, + ZFS_ERR_DISCARDING_CHECKPOINT, + ZFS_ERR_NO_CHECKPOINT, + ZFS_ERR_DEVRM_IN_PROGRESS, + ZFS_ERR_VDEV_TOO_BIG, + ZFS_ERR_WRONG_PARENT, + zfs_errno +) + + +def lzc_create_translate_error(ret, name, ds_type, props): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.PropertyInvalid(name) + if ret == errno.EEXIST: + raise lzc_exc.FilesystemExists(name) + if ret == errno.ENOENT: + raise lzc_exc.ParentNotFound(name) + if ret == ZFS_ERR_WRONG_PARENT: + raise lzc_exc.WrongParent(_fs_name(name)) + if ret == zfs_errno.ZFS_ERR_BADPROP: + raise lzc_exc.PropertyInvalid(name) + raise _generic_exception(ret, name, "Failed to create filesystem") + + +def lzc_clone_translate_error(ret, name, origin, props): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + _validate_snap_name(origin) + raise lzc_exc.PropertyInvalid(name) + if ret == errno.EXDEV: + raise lzc_exc.PoolsDiffer(name) + if ret == errno.EEXIST: + raise lzc_exc.FilesystemExists(name) + if ret == errno.ENOENT: + if not _is_valid_snap_name(origin): + raise lzc_exc.SnapshotNameInvalid(origin) + raise lzc_exc.DatasetNotFound(name) + raise _generic_exception(ret, name, "Failed to create clone") + + +def lzc_rollback_translate_error(ret, name): + if ret == 0: + return + if ret == errno.ESRCH: + raise lzc_exc.SnapshotNotFound(name) + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.NameInvalid(name) + if ret == errno.ENOENT: + if not _is_valid_fs_name(name): + raise lzc_exc.NameInvalid(name) + else: + raise lzc_exc.FilesystemNotFound(name) + raise _generic_exception(ret, name, "Failed to rollback") + + +def lzc_rollback_to_translate_error(ret, name, snap): + if ret == errno.EEXIST: + raise lzc_exc.SnapshotNotLatest(snap) + else: + lzc_rollback_translate_error(ret, name) + + +def lzc_snapshot_translate_errors(ret, errlist, snaps, props): + if ret == 0: + return + + def _map(ret, name): + if ret == errno.EXDEV: + pool_names = iter(map(_pool_name, snaps)) + pool_name = next(pool_names, None) + same_pool = all(x == pool_name for x in pool_names) + if same_pool: + return lzc_exc.DuplicateSnapshots(name) + else: + return lzc_exc.PoolsDiffer(name) + elif ret == errno.EINVAL: + if any(not _is_valid_snap_name(s) for s in snaps): + return lzc_exc.NameInvalid(name) + elif any(len(s) > MAXNAMELEN for s in snaps): + return lzc_exc.NameTooLong(name) + else: + return lzc_exc.PropertyInvalid(name) + + if ret == errno.EEXIST: + return lzc_exc.SnapshotExists(name) + if ret == errno.ENOENT: + return lzc_exc.FilesystemNotFound(name) + return _generic_exception(ret, name, "Failed to create snapshot") + + _handle_err_list(ret, errlist, snaps, lzc_exc.SnapshotFailure, _map) + + +def 
lzc_destroy_snaps_translate_errors(ret, errlist, snaps, defer): + if ret == 0: + return + + def _map(ret, name): + if ret == errno.EEXIST: + return lzc_exc.SnapshotIsCloned(name) + if ret == errno.ENOENT: + return lzc_exc.PoolNotFound(name) + if ret == errno.EBUSY: + return lzc_exc.SnapshotIsHeld(name) + return _generic_exception(ret, name, "Failed to destroy snapshot") + + _handle_err_list( + ret, errlist, snaps, lzc_exc.SnapshotDestructionFailure, _map) + + +def lzc_bookmark_translate_errors(ret, errlist, bookmarks): + + if ret == 0: + return + + def _map(ret, name): + source = bookmarks[name] + if ret == errno.EINVAL: + if name: + pool_names = map(_pool_name, bookmarks.keys()) + + # use _validate* functions for MAXNAMELEN check + try: + _validate_bmark_name(name) + except lzc_exc.ZFSError as e: + return e + + try: + _validate_snap_name(source) + source_is_snap = True + except lzc_exc.ZFSError: + source_is_snap = False + try: + _validate_bmark_name(source) + source_is_bmark = True + except lzc_exc.ZFSError: + source_is_bmark = False + if not source_is_snap and not source_is_bmark: + return lzc_exc.BookmarkSourceInvalid(source) + + if any(x != _pool_name(name) for x in pool_names): + return lzc_exc.PoolsDiffer(name) + else: + invalid_names = [ + b for b in bookmarks.keys() if not _is_valid_bmark_name(b)] + if invalid_names: + return lzc_exc.BookmarkNameInvalid(invalid_names[0]) + if ret == errno.EEXIST: + return lzc_exc.BookmarkExists(name) + if ret == errno.ENOENT: + return lzc_exc.SnapshotNotFound(name) + if ret == errno.ENOTSUP: + return lzc_exc.BookmarkNotSupported(name) + if ret == zfs_errno.ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR: + return lzc_exc.BookmarkMismatch(source) + return _generic_exception(ret, name, "Failed to create bookmark") + + _handle_err_list( + ret, errlist, bookmarks.keys(), lzc_exc.BookmarkFailure, _map) + + +def lzc_get_bookmarks_translate_error(ret, fsname, props): + if ret == 0: + return + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(fsname) + raise _generic_exception(ret, fsname, "Failed to list bookmarks") + + +def lzc_destroy_bookmarks_translate_errors(ret, errlist, bookmarks): + if ret == 0: + return + + def _map(ret, name): + if ret == errno.EINVAL: + return lzc_exc.NameInvalid(name) + return _generic_exception(ret, name, "Failed to destroy bookmark") + + _handle_err_list( + ret, errlist, bookmarks, lzc_exc.BookmarkDestructionFailure, _map) + + +def lzc_snaprange_space_translate_error(ret, firstsnap, lastsnap): + if ret == 0: + return + if ret == errno.EXDEV and firstsnap is not None: + if _pool_name(firstsnap) != _pool_name(lastsnap): + raise lzc_exc.PoolsDiffer(lastsnap) + else: + raise lzc_exc.SnapshotMismatch(lastsnap) + if ret == errno.EINVAL: + if not _is_valid_snap_name(firstsnap): + raise lzc_exc.NameInvalid(firstsnap) + elif not _is_valid_snap_name(lastsnap): + raise lzc_exc.NameInvalid(lastsnap) + elif len(firstsnap) > MAXNAMELEN: + raise lzc_exc.NameTooLong(firstsnap) + elif len(lastsnap) > MAXNAMELEN: + raise lzc_exc.NameTooLong(lastsnap) + elif _pool_name(firstsnap) != _pool_name(lastsnap): + raise lzc_exc.PoolsDiffer(lastsnap) + else: + raise lzc_exc.SnapshotMismatch(lastsnap) + if ret == errno.ENOENT: + raise lzc_exc.SnapshotNotFound(lastsnap) + raise _generic_exception( + ret, lastsnap, "Failed to calculate space used by range of snapshots") + + +def lzc_hold_translate_errors(ret, errlist, holds, fd): + if ret == 0: + return + + def _map(ret, name): + if ret == errno.EXDEV: + return lzc_exc.PoolsDiffer(name) + elif ret == 
errno.EINVAL: + if name: + pool_names = map(_pool_name, holds.keys()) + if not _is_valid_snap_name(name): + return lzc_exc.NameInvalid(name) + elif len(name) > MAXNAMELEN: + return lzc_exc.NameTooLong(name) + elif any(x != _pool_name(name) for x in pool_names): + return lzc_exc.PoolsDiffer(name) + else: + invalid_names = [ + b for b in holds.keys() if not _is_valid_snap_name(b)] + if invalid_names: + return lzc_exc.NameInvalid(invalid_names[0]) + fs_name = None + hold_name = None + pool_name = None + if name is not None: + fs_name = _fs_name(name) + pool_name = _pool_name(name) + hold_name = holds[name] + if ret == errno.ENOENT: + return lzc_exc.FilesystemNotFound(fs_name) + if ret == errno.EEXIST: + return lzc_exc.HoldExists(name) + if ret == errno.E2BIG: + return lzc_exc.NameTooLong(hold_name) + if ret == errno.ENOTSUP: + return lzc_exc.FeatureNotSupported(pool_name) + return _generic_exception(ret, name, "Failed to hold snapshot") + + if ret == errno.EBADF: + raise lzc_exc.BadHoldCleanupFD() + _handle_err_list(ret, errlist, holds.keys(), lzc_exc.HoldFailure, _map) + + +def lzc_release_translate_errors(ret, errlist, holds): + if ret == 0: + return + for snap in holds: + hold_list = holds[snap] + if not isinstance(hold_list, list): + raise lzc_exc.TypeError('holds must be in a list') + + def _map(ret, name): + if ret == errno.EXDEV: + return lzc_exc.PoolsDiffer(name) + elif ret == errno.EINVAL: + if name: + pool_names = map(_pool_name, holds.keys()) + if not _is_valid_snap_name(name): + return lzc_exc.NameInvalid(name) + elif len(name) > MAXNAMELEN: + return lzc_exc.NameTooLong(name) + elif any(x != _pool_name(name) for x in pool_names): + return lzc_exc.PoolsDiffer(name) + else: + invalid_names = [ + b for b in holds.keys() if not _is_valid_snap_name(b)] + if invalid_names: + return lzc_exc.NameInvalid(invalid_names[0]) + elif ret == errno.ENOENT: + return lzc_exc.HoldNotFound(name) + elif ret == errno.E2BIG: + tag_list = holds[name] + too_long_tags = [t for t in tag_list if len(t) > MAXNAMELEN] + return lzc_exc.NameTooLong(too_long_tags[0]) + elif ret == errno.ENOTSUP: + pool_name = None + if name is not None: + pool_name = _pool_name(name) + return lzc_exc.FeatureNotSupported(pool_name) + else: + return _generic_exception( + ret, name, "Failed to release snapshot hold") + + _handle_err_list( + ret, errlist, holds.keys(), lzc_exc.HoldReleaseFailure, _map) + + +def lzc_get_holds_translate_error(ret, snapname): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_snap_name(snapname) + if ret == errno.ENOENT: + raise lzc_exc.SnapshotNotFound(snapname) + if ret == errno.ENOTSUP: + raise lzc_exc.FeatureNotSupported(_pool_name(snapname)) + raise _generic_exception(ret, snapname, "Failed to get holds on snapshot") + + +def lzc_send_translate_error(ret, snapname, fromsnap, fd, flags): + if ret == 0: + return + if ret == errno.EXDEV and fromsnap is not None: + if _pool_name(fromsnap) != _pool_name(snapname): + raise lzc_exc.PoolsDiffer(snapname) + else: + raise lzc_exc.SnapshotMismatch(snapname) + elif ret == errno.EINVAL: + if (fromsnap is not None and not _is_valid_snap_name(fromsnap) and + not _is_valid_bmark_name(fromsnap)): + raise lzc_exc.NameInvalid(fromsnap) + elif (not _is_valid_snap_name(snapname) and + not _is_valid_fs_name(snapname)): + raise lzc_exc.NameInvalid(snapname) + elif fromsnap is not None and len(fromsnap) > MAXNAMELEN: + raise lzc_exc.NameTooLong(fromsnap) + elif len(snapname) > MAXNAMELEN: + raise lzc_exc.NameTooLong(snapname) + elif (fromsnap is not None and + 
_pool_name(fromsnap) != _pool_name(snapname)): + raise lzc_exc.PoolsDiffer(snapname) + elif ret == errno.ENOENT: + if (fromsnap is not None and not _is_valid_snap_name(fromsnap) and + not _is_valid_bmark_name(fromsnap)): + raise lzc_exc.NameInvalid(fromsnap) + raise lzc_exc.SnapshotNotFound(snapname) + elif ret == errno.ENAMETOOLONG: + if fromsnap is not None and len(fromsnap) > MAXNAMELEN: + raise lzc_exc.NameTooLong(fromsnap) + else: + raise lzc_exc.NameTooLong(snapname) + raise lzc_exc.StreamIOError(ret) + + +def lzc_send_space_translate_error(ret, snapname, fromsnap): + if ret == 0: + return + if ret == errno.EXDEV and fromsnap is not None: + if _pool_name(fromsnap) != _pool_name(snapname): + raise lzc_exc.PoolsDiffer(snapname) + else: + raise lzc_exc.SnapshotMismatch(snapname) + elif ret == errno.EINVAL: + if fromsnap is not None and not _is_valid_snap_name(fromsnap): + raise lzc_exc.NameInvalid(fromsnap) + elif not _is_valid_snap_name(snapname): + raise lzc_exc.NameInvalid(snapname) + elif fromsnap is not None and len(fromsnap) > MAXNAMELEN: + raise lzc_exc.NameTooLong(fromsnap) + elif len(snapname) > MAXNAMELEN: + raise lzc_exc.NameTooLong(snapname) + elif (fromsnap is not None and + _pool_name(fromsnap) != _pool_name(snapname)): + raise lzc_exc.PoolsDiffer(snapname) + elif ret == errno.ENOENT and fromsnap is not None: + if not _is_valid_snap_name(fromsnap): + raise lzc_exc.NameInvalid(fromsnap) + if ret == errno.ENOENT: + raise lzc_exc.SnapshotNotFound(snapname) + raise _generic_exception( + ret, snapname, "Failed to estimate backup stream size") + + +def lzc_receive_translate_errors( + ret, snapname, fd, force, raw, resumable, embedded, origin, properrs +): + if ret == 0: + if properrs is not None and len(properrs) > 0: + def _map(ret, name): + if ret == errno.EINVAL: + return lzc_exc.PropertyInvalid(name) + if ret == zfs_errno.ZFS_ERR_BADPROP: + return lzc_exc.PropertyInvalid(name) + return _generic_exception(ret, name, "Failed to set property") + _handle_err_list( + errno.EINVAL, properrs, [snapname], + lzc_exc.ReceivePropertyFailure, _map) + else: + return + if ret == errno.EINVAL: + if (not _is_valid_snap_name(snapname) and + not _is_valid_fs_name(snapname)): + raise lzc_exc.NameInvalid(snapname) + elif len(snapname) > MAXNAMELEN: + raise lzc_exc.NameTooLong(snapname) + elif origin is not None and not _is_valid_snap_name(origin): + raise lzc_exc.NameInvalid(origin) + elif resumable: + raise lzc_exc.StreamFeatureInvalid() + elif embedded and not raw: + raise lzc_exc.StreamFeatureIncompatible() + else: + raise lzc_exc.BadStream() + if ret == errno.ENOENT: + if not _is_valid_snap_name(snapname): + raise lzc_exc.NameInvalid(snapname) + else: + raise lzc_exc.DatasetNotFound(snapname) + if ret == errno.EEXIST: + raise lzc_exc.DatasetExists(snapname) + if ret == errno.ENOTSUP: + raise lzc_exc.StreamFeatureNotSupported() + if ret == errno.ENODEV: + raise lzc_exc.StreamMismatch(_fs_name(snapname)) + if ret == errno.ETXTBSY: + raise lzc_exc.DestinationModified(_fs_name(snapname)) + if ret == errno.EBUSY: + raise lzc_exc.DatasetBusy(_fs_name(snapname)) + if ret == errno.ENOSPC: + raise lzc_exc.NoSpace(_fs_name(snapname)) + if ret == errno.EDQUOT: + raise lzc_exc.QuotaExceeded(_fs_name(snapname)) + if ret == errno.ENAMETOOLONG: + raise lzc_exc.NameTooLong(snapname) + if ret == errno.EROFS: + raise lzc_exc.ReadOnlyPool(_pool_name(snapname)) + if ret == errno.EAGAIN: + raise lzc_exc.SuspendedPool(_pool_name(snapname)) + if ret == ECKSUM: + raise lzc_exc.BadStream() + if ret == 
ZFS_ERR_WRONG_PARENT: + raise lzc_exc.WrongParent(_fs_name(snapname)) + if ret == zfs_errno.ZFS_ERR_STREAM_TRUNCATED: + raise lzc_exc.StreamTruncated() + if ret == zfs_errno.ZFS_ERR_BADPROP: + raise lzc_exc.PropertyInvalid(snapname) + + raise lzc_exc.StreamIOError(ret) + + +def lzc_promote_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.NotClone(name) + if ret == errno.ENOTSOCK: + raise lzc_exc.NotClone(name) + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(name) + if ret == errno.EEXIST: + raise lzc_exc.SnapshotExists(name) + raise _generic_exception(ret, name, "Failed to promote dataset") + + +def lzc_change_key_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.PropertyInvalid(name) + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(name) + if ret == errno.EACCES: + raise lzc_exc.EncryptionKeyNotLoaded() + raise _generic_exception(ret, name, "Failed to change encryption key") + + +def lzc_load_key_translate_error(ret, name, noop): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.PropertyInvalid(name) + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(name) + if ret == errno.EACCES: + raise lzc_exc.EncryptionKeyInvalid() + if ret == errno.EEXIST: + raise lzc_exc.EncryptionKeyAlreadyLoaded() + if noop: + raise _generic_exception(ret, name, "Failed to load encryption key") + else: + raise _generic_exception(ret, name, "Failed to verify encryption key") + + +def lzc_unload_key_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.PropertyInvalid(name) + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(name) + if ret == errno.EACCES: + raise lzc_exc.EncryptionKeyNotLoaded() + raise _generic_exception(ret, name, "Failed to unload encryption key") + + +def lzc_sync_translate_error(ret, name): + if ret == 0: + return + if ret == errno.ENOENT: + raise lzc_exc.PoolNotFound(name) + raise _generic_exception(ret, name, "Failed to sync pool") + + +def lzc_reopen_translate_error(ret, name): + if ret == 0: + return + if ret == errno.ENOENT: + raise lzc_exc.PoolNotFound(name) + raise _generic_exception(ret, name, "Failed to reopen pool") + + +def lzc_channel_program_translate_error(ret, name, error): + if ret == 0: + return + if ret == errno.ENOENT: + raise lzc_exc.PoolNotFound(name) + if ret == ETIME: + raise lzc_exc.ZCPTimeout() + if ret == errno.ENOMEM: + raise lzc_exc.ZCPMemoryError() + if ret == errno.ENOSPC: + raise lzc_exc.ZCPSpaceError() + if ret == errno.EPERM: + raise lzc_exc.ZCPPermissionError() + if ret == ECHRNG: + raise lzc_exc.ZCPRuntimeError(error) + if ret == errno.EINVAL: + if error is None: + raise lzc_exc.ZCPLimitInvalid() + else: + raise lzc_exc.ZCPSyntaxError(error) + raise _generic_exception(ret, name, "Failed to execute channel program") + + +def lzc_pool_checkpoint_translate_error(ret, name, discard=False): + if ret == 0: + return + if ret == errno.ENOENT: + raise lzc_exc.PoolNotFound(name) + if ret == ZFS_ERR_CHECKPOINT_EXISTS: + raise lzc_exc.CheckpointExists() + if ret == ZFS_ERR_NO_CHECKPOINT: + raise lzc_exc.CheckpointNotFound() + if ret == ZFS_ERR_DISCARDING_CHECKPOINT: + raise lzc_exc.CheckpointDiscarding() + if ret == ZFS_ERR_DEVRM_IN_PROGRESS: + raise lzc_exc.DeviceRemovalRunning() + if ret == ZFS_ERR_VDEV_TOO_BIG: + raise lzc_exc.DeviceTooBig() + if discard: + raise 
_generic_exception( + ret, name, "Failed to discard pool checkpoint") + else: + raise _generic_exception(ret, name, "Failed to create pool checkpoint") + + +def lzc_pool_checkpoint_discard_translate_error(ret, name): + lzc_pool_checkpoint_translate_error(ret, name, discard=True) + + +def lzc_rename_translate_error(ret, source, target): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(source) + _validate_fs_name(target) + if _pool_name(source) != _pool_name(target): + raise lzc_exc.PoolsDiffer(source) + if ret == errno.EEXIST: + raise lzc_exc.FilesystemExists(target) + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(source) + if ret == ZFS_ERR_WRONG_PARENT: + raise lzc_exc.WrongParent(target) + raise _generic_exception(ret, source, "Failed to rename dataset") + + +def lzc_destroy_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + if ret == errno.ENOENT: + raise lzc_exc.FilesystemNotFound(name) + raise _generic_exception(ret, name, "Failed to destroy dataset") + + +def lzc_inherit_prop_translate_error(ret, name, prop): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise lzc_exc.PropertyInvalid(prop) + if ret == errno.ENOENT: + raise lzc_exc.DatasetNotFound(name) + raise _generic_exception(ret, name, "Failed to inherit a property") + + +def lzc_set_prop_translate_error(ret, name, prop, val): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_or_snap_name(name) + raise lzc_exc.PropertyInvalid(prop) + if ret == errno.ENOENT: + raise lzc_exc.DatasetNotFound(name) + raise _generic_exception(ret, name, "Failed to set a property") + + +def lzc_get_props_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_or_snap_name(name) + if ret == errno.ENOENT: + raise lzc_exc.DatasetNotFound(name) + raise _generic_exception(ret, name, "Failed to get properties") + + +def lzc_list_children_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise _generic_exception(ret, name, "Error while iterating children") + + +def lzc_list_snaps_translate_error(ret, name): + if ret == 0: + return + if ret == errno.EINVAL: + _validate_fs_name(name) + raise _generic_exception(ret, name, "Error while iterating snapshots") + + +def lzc_list_translate_error(ret, name, opts): + if ret == 0: + return + if ret == errno.ENOENT: + raise lzc_exc.DatasetNotFound(name) + if ret == errno.EINVAL: + _validate_fs_or_snap_name(name) + raise _generic_exception(ret, name, "Error obtaining a list") + + +def _handle_err_list(ret, errlist, names, exception, mapper): + ''' + Convert one or more errors from an operation into the requested exception. + + :param int ret: the overall return code. + :param errlist: the dictionary that maps entity names to their specific + error codes. + :type errlist: dict of bytes:int + :param names: the list of all names of the entities on which the operation + was attempted. + :param type exception: the type of the exception to raise if an error + occurred. The exception should be a subclass of + ``MultipleOperationsFailure``. + :param function mapper: the function that maps an error code and a name to + a Python exception. + + Unless ``ret`` is zero this function will raise the ``exception``. + If the ``errlist`` is not empty, then the compound exception will contain + a list of exceptions corresponding to each individual error code in the + ``errlist``. 
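+    (For instance, a hypothetical attempt to destroy several held
+    snapshots could surface as a ``SnapshotDestructionFailure`` whose
+    ``errors`` list holds one ``SnapshotIsHeld`` exception per name.)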
+ Otherwise, the ``exception`` will contain a list with a single exception + corresponding to the ``ret`` value. If the ``names`` list contains only one + element, that is, the operation was attempted on a single entity, then the + name of that entity is passed to the ``mapper``. + If the operation was attempted on multiple entities, but the ``errlist`` + is empty, then we can not know which entity caused the error and, thus, + ``None`` is used as a name to signify that fact. + + .. note:: + Note that the ``errlist`` can contain a special element with a key of + "N_MORE_ERRORS". + That element means that there were too many errors to place on the + ``errlist``. + Those errors are suppressed and only their count is provided as a + value of the special ``N_MORE_ERRORS`` element. + ''' + if ret == 0: + return + + if len(errlist) == 0: + suppressed_count = 0 + names = list(zip(names, range(2))) + if len(names) == 1: + name, _ = names[0] + else: + name = None + errors = [mapper(ret, name)] + else: + errors = [] + suppressed_count = errlist.pop('N_MORE_ERRORS', 0) + for name in errlist: + err = errlist[name] + errors.append(mapper(err, name)) + + raise exception(errors, suppressed_count) + + +def _pool_name(name): + ''' + Extract a pool name from the given dataset or bookmark name. + + '/' separates dataset name components. + '@' separates a snapshot name from the rest of the dataset name. + '#' separates a bookmark name from the rest of the dataset name. + ''' + return re.split(b'[/@#]', name, 1)[0] + + +def _fs_name(name): + ''' + Extract a dataset name from the given snapshot or bookmark name. + + '@' separates a snapshot name from the rest of the dataset name. + '#' separates a bookmark name from the rest of the dataset name. + ''' + return re.split(b'[@#]', name, 1)[0] + + +def _is_valid_name_component(component): + allowed = string.ascii_letters + string.digits + u'-_.: ' + return component and all(x in allowed.encode() for x in component) + + +def _is_valid_fs_name(name): + return name and all(_is_valid_name_component(c) for c in name.split(b'/')) + + +def _is_valid_snap_name(name): + parts = name.split(b'@') + return (len(parts) == 2 and _is_valid_fs_name(parts[0]) and + _is_valid_name_component(parts[1])) + + +def _is_valid_bmark_name(name): + parts = name.split(b'#') + return (len(parts) == 2 and _is_valid_fs_name(parts[0]) and + _is_valid_name_component(parts[1])) + + +def _validate_fs_name(name): + if not _is_valid_fs_name(name): + raise lzc_exc.FilesystemNameInvalid(name) + elif len(name) > MAXNAMELEN: + raise lzc_exc.NameTooLong(name) + + +def _validate_snap_name(name): + if not _is_valid_snap_name(name): + raise lzc_exc.SnapshotNameInvalid(name) + elif len(name) > MAXNAMELEN: + raise lzc_exc.NameTooLong(name) + + +def _validate_bmark_name(name): + if not _is_valid_bmark_name(name): + raise lzc_exc.BookmarkNameInvalid(name) + elif len(name) > MAXNAMELEN: + raise lzc_exc.NameTooLong(name) + + +def _validate_fs_or_snap_name(name): + if not _is_valid_fs_name(name) and not _is_valid_snap_name(name): + raise lzc_exc.NameInvalid(name) + elif len(name) > MAXNAMELEN: + raise lzc_exc.NameTooLong(name) + + +def _generic_exception(err, name, message): + if err in _error_to_exception: + return _error_to_exception[err](name) + else: + return lzc_exc.ZFSGenericError(err, message, name) + + +_error_to_exception = {e.errno: e for e in [ + lzc_exc.ZIOError, + lzc_exc.NoSpace, + lzc_exc.QuotaExceeded, + lzc_exc.DatasetBusy, + lzc_exc.NameTooLong, + lzc_exc.ReadOnlyPool, + lzc_exc.SuspendedPool, + 
lzc_exc.PoolsDiffer, + lzc_exc.PropertyNotSupported, +]} + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_libzfs_core.py b/contrib/pyzfs/libzfs_core/_libzfs_core.py new file mode 100644 index 000000000000..fcfa5be31b1f --- /dev/null +++ b/contrib/pyzfs/libzfs_core/_libzfs_core.py @@ -0,0 +1,1955 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Python wrappers for libzfs_core interfaces. + +As a rule, there is a Python function for each C function. +The signatures of the Python functions generally follow those of the +functions, but the argument types are natural to Python. +nvlists are wrapped as dictionaries or lists depending on their usage. +Some parameters have default values depending on typical use for +increased convenience. Output parameters are not used and return values +are directly returned. Error conditions are signalled by exceptions +rather than by integer error codes. +""" +from __future__ import absolute_import, division, print_function + +import errno +import functools +import fcntl +import os +import struct +import threading +from . import exceptions +from . import _error_translation as errors +from .bindings import libzfs_core +from ._constants import ( # noqa: F401 + MAXNAMELEN, + ZCP_DEFAULT_INSTRLIMIT, + ZCP_DEFAULT_MEMLIMIT, + WRAPPING_KEY_LEN, + zfs_key_location, + zfs_keyformat, + zio_encrypt +) +from .ctypes import ( + int32_t, + uint64_t +) +from ._nvlist import nvlist_in, nvlist_out + + +def _uncommitted(depends_on=None): + ''' + Mark an API function as being an uncommitted extension that might not be + available. + + :param function depends_on: the function that would be checked instead of + a decorated function. For example, if the decorated function uses + another uncommitted function. + + This decorator transforms a decorated function to raise + :exc:`NotImplementedError` if the C libzfs_core library does not provide + a function with the same name as the decorated function. + + The optional `depends_on` parameter can be provided if the decorated + function does not directly call the C function but instead calls another + Python function that follows the typical convention. + One example is :func:`lzc_list_snaps` that calls :func:`lzc_list` that + calls ``lzc_list`` in libzfs_core. + + This decorator is implemented using :func:`is_supported`. + ''' + def _uncommitted_decorator(func, depends_on=depends_on): + @functools.wraps(func) + def _f(*args, **kwargs): + if not is_supported(_f): + raise NotImplementedError(func.__name__) + return func(*args, **kwargs) + if depends_on is not None: + _f._check_func = depends_on + return _f + return _uncommitted_decorator + + +def lzc_create(name, ds_type='zfs', props=None, key=None): + ''' + Create a ZFS filesystem or a ZFS volume ("zvol"). + + :param bytes name: a name of the dataset to be created. 
+ :param str ds_type: the type of the dataset to be created, + currently supported types are "zfs" (the default) for a filesystem and + "zvol" for a volume. + :param props: a `dict` of ZFS dataset property name-value pairs + (empty by default). + :type props: dict of bytes:Any + :param key: dataset encryption key data (empty by default). + :type key: bytes + + :raises FilesystemExists: if a dataset with the given name already exists. + :raises ParentNotFound: if a parent dataset of the requested dataset does + not exist. + :raises PropertyInvalid: if one or more of the specified properties is + invalid or has an invalid type or value. + :raises NameInvalid: if the name is not a valid dataset name. + :raises NameTooLong: if the name is too long. + :raises WrongParent: if the parent dataset of the requested dataset is not + a filesystem (e.g. ZVOL) + ''' + if props is None: + props = {} + if key is None: + key = b"" + else: + key = bytes(key) + if ds_type == 'zfs': + ds_type = _lib.DMU_OST_ZFS + elif ds_type == 'zvol': + ds_type = _lib.DMU_OST_ZVOL + else: + raise exceptions.DatasetTypeInvalid(ds_type) + nvlist = nvlist_in(props) + ret = _lib.lzc_create(name, ds_type, nvlist, key, len(key)) + errors.lzc_create_translate_error(ret, name, ds_type, props) + + +def lzc_clone(name, origin, props=None): + ''' + Clone a ZFS filesystem or a ZFS volume ("zvol") from a given snapshot. + + :param bytes name: a name of the dataset to be created. + :param bytes origin: a name of the origin snapshot. + :param props: a `dict` of ZFS dataset property name-value pairs + (empty by default). + :type props: dict of bytes:Any + + :raises FilesystemExists: if a dataset with the given name already exists. + :raises DatasetNotFound: if either a parent dataset of the requested + dataset or the origin snapshot does not exist. + :raises PropertyInvalid: if one or more of the specified properties is + invalid or has an invalid type or value. + :raises FilesystemNameInvalid: if the name is not a valid dataset name. + :raises SnapshotNameInvalid: if the origin is not a valid snapshot name. + :raises NameTooLong: if the name or the origin name is too long. + :raises PoolsDiffer: if the clone and the origin have different pool names. + + .. note:: + Because of a deficiency of the underlying C interface + :exc:`.DatasetNotFound` can mean that either a parent filesystem of + the target or the origin snapshot does not exist. + It is currently impossible to distinguish between the cases. + :func:`lzc_hold` can be used to check that the snapshot exists and + ensure that it is not destroyed before cloning. + ''' + if props is None: + props = {} + nvlist = nvlist_in(props) + ret = _lib.lzc_clone(name, origin, nvlist) + errors.lzc_clone_translate_error(ret, name, origin, props) + + +def lzc_rollback(name): + ''' + Roll back a filesystem or volume to its most recent snapshot. + + Note that the latest snapshot may change if a new one is concurrently + created or the current one is destroyed. lzc_rollback_to can be used + to roll back to a specific latest snapshot. + + :param bytes name: a name of the dataset to be rolled back. + :return: a name of the most recent snapshot. + :rtype: bytes + + :raises FilesystemNotFound: if the dataset does not exist. + :raises SnapshotNotFound: if the dataset does not have any snapshots. + :raises NameInvalid: if the dataset name is invalid. + :raises NameTooLong: if the dataset name is too long. + ''' + # Account for terminating NUL in C strings. 
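+    # The buffer below is sized to hold the longest possible snapshot name
+    # plus the NUL terminator; _ffi.string() further down stops at that NUL.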
+    snapnamep = _ffi.new('char[]', MAXNAMELEN + 1)
+    ret = _lib.lzc_rollback(name, snapnamep, MAXNAMELEN + 1)
+    errors.lzc_rollback_translate_error(ret, name)
+    return _ffi.string(snapnamep)
+
+
+def lzc_rollback_to(name, snap):
+    '''
+    Roll back this filesystem or volume to the specified snapshot, if possible.
+
+    :param bytes name: a name of the dataset to be rolled back.
+    :param bytes snap: the name of the snapshot to roll back to.
+
+    :raises FilesystemNotFound: if the dataset does not exist.
+    :raises SnapshotNotFound: if the dataset does not have any snapshots.
+    :raises NameInvalid: if the dataset name is invalid.
+    :raises NameTooLong: if the dataset name is too long.
+    :raises SnapshotNotLatest: if the snapshot is not the latest.
+    '''
+    ret = _lib.lzc_rollback_to(name, snap)
+    errors.lzc_rollback_to_translate_error(ret, name, snap)
+
+
+def lzc_snapshot(snaps, props=None):
+    '''
+    Create snapshots.
+
+    All snapshots must be in the same pool.
+
+    Optionally snapshot properties can be set on all snapshots.
+    Currently only user properties (prefixed with "user:") are supported.
+
+    Either all snapshots are successfully created, or none are created and
+    an exception is raised.
+
+    :param snaps: a list of names of snapshots to be created.
+    :type snaps: list of bytes
+    :param props: a `dict` of ZFS dataset property name-value pairs
+        (empty by default).
+    :type props: dict of bytes:bytes
+
+    :raises SnapshotFailure: if one or more snapshots could not be created.
+
+    .. note::
+        :exc:`.SnapshotFailure` is a compound exception that provides at least
+        one detailed error object in :attr:`SnapshotFailure.errors` `list`.
+
+    .. warning::
+        The underlying implementation reports an individual, per-snapshot error
+        only for the :exc:`.SnapshotExists` condition and *sometimes* for
+        :exc:`.NameTooLong`.
+        In all other cases a single error is reported without connection to any
+        specific snapshot name(s).
+
+        This has the following implications:
+
+        * if multiple error conditions are encountered only one of them is
+          reported
+
+        * unless only one snapshot is requested, it is impossible to tell
+          how many snapshots are problematic and which ones they are
+
+        * only if there are no other error conditions is :exc:`.SnapshotExists`
+          reported for all affected snapshots
+
+        * :exc:`.NameTooLong` can behave either in the same way as
+          :exc:`.SnapshotExists` or as all other exceptions.
+          The former is the case where the full snapshot name exceeds the
+          maximum allowed length but the short snapshot name (after '@') is
+          within the limit.
+          The latter is the case when the short name alone exceeds the maximum
+          allowed length.
+    '''
+    snaps_dict = {name: None for name in snaps}
+    errlist = {}
+    snaps_nvlist = nvlist_in(snaps_dict)
+    if props is None:
+        props = {}
+    props_nvlist = nvlist_in(props)
+    with nvlist_out(errlist) as errlist_nvlist:
+        ret = _lib.lzc_snapshot(snaps_nvlist, props_nvlist, errlist_nvlist)
+    errors.lzc_snapshot_translate_errors(ret, errlist, snaps, props)
+
+
+lzc_snap = lzc_snapshot
+
+
+def lzc_destroy_snaps(snaps, defer):
+    '''
+    Destroy snapshots.
+
+    They must all be in the same pool.
+    Snapshots that do not exist will be silently ignored.
+
+    If 'defer' is not set, and a snapshot has user holds or clones, the
+    destroy operation will fail and none of the snapshots will be
+    destroyed.
+
+    If 'defer' is set, and a snapshot has user holds or clones, it will be
+    marked for deferred destruction, and will be destroyed when the last hold
+    or clone is removed/destroyed.
+
+    The operation succeeds if all snapshots were destroyed (or marked for
+    later destruction if 'defer' is set) or didn't exist to begin with.
+
+    :param snaps: a list of names of snapshots to be destroyed.
+    :type snaps: list of bytes
+    :param bool defer: whether to mark busy snapshots for deferred destruction
+        rather than immediately failing.
+
+    :raises SnapshotDestructionFailure: if one or more snapshots could not be
+        destroyed.
+
+    .. note::
+        :exc:`.SnapshotDestructionFailure` is a compound exception that
+        provides at least one detailed error object in
+        :attr:`SnapshotDestructionFailure.errors` `list`.
+
+        A typical error is :exc:`SnapshotIsCloned` if `defer` is `False`.
+        The snapshot names are validated quite loosely and invalid names are
+        typically ignored as nonexistent snapshots.
+
+        A snapshot name referring to a filesystem that doesn't exist is
+        ignored.
+        However, a non-existent pool name causes :exc:`PoolNotFound`.
+    '''
+    snaps_dict = {name: None for name in snaps}
+    errlist = {}
+    snaps_nvlist = nvlist_in(snaps_dict)
+    with nvlist_out(errlist) as errlist_nvlist:
+        ret = _lib.lzc_destroy_snaps(snaps_nvlist, defer, errlist_nvlist)
+    errors.lzc_destroy_snaps_translate_errors(ret, errlist, snaps, defer)
+
+
+def lzc_bookmark(bookmarks):
+    '''
+    Create bookmarks.
+
+    :param bookmarks: a dict that maps names of wanted bookmarks to names of
+        existing snapshots or bookmarks.
+    :type bookmarks: dict of bytes to bytes
+    :raises BookmarkFailure: if any of the bookmarks cannot be created for any
+        reason.
+
+    The bookmarks `dict` maps from the name of the bookmark
+    (e.g. :file:`{pool}/{fs}#{bmark}`) to the name of a snapshot
+    (e.g. :file:`{pool}/{fs}@{snap}`) or of an existing bookmark
+    (e.g. :file:`{pool}/{fs}#{bmark}`). All the bookmarks and snapshots must
+    be in the same pool.
+    '''
+    errlist = {}
+    nvlist = nvlist_in(bookmarks)
+    with nvlist_out(errlist) as errlist_nvlist:
+        ret = _lib.lzc_bookmark(nvlist, errlist_nvlist)
+    errors.lzc_bookmark_translate_errors(ret, errlist, bookmarks)
+
+
+def lzc_get_bookmarks(fsname, props=None):
+    '''
+    Retrieve a listing of bookmarks for the given file system.
+
+    :param bytes fsname: a name of the filesystem.
+    :param props: a `list` of properties that will be returned for each
+        bookmark.
+    :type props: list of bytes
+    :return: a `dict` that maps the bookmarks' short names to their properties.
+    :rtype: dict of bytes:dict
+
+    :raises FilesystemNotFound: if the filesystem is not found.
+
+    The following are valid properties on bookmarks:
+
+    guid : integer
+        globally unique identifier of the snapshot the bookmark refers to
+    createtxg : integer
+        txg when the snapshot the bookmark refers to was created
+    creation : integer
+        timestamp when the snapshot the bookmark refers to was created
+
+    Any other properties passed in ``props`` are ignored without reporting
+    any error.
+    Values in the returned dictionary map the names of the requested properties
+    to their respective values.
+    '''
+    bmarks = {}
+    if props is None:
+        props = []
+    props_dict = {name: None for name in props}
+    nvlist = nvlist_in(props_dict)
+    with nvlist_out(bmarks) as bmarks_nvlist:
+        ret = _lib.lzc_get_bookmarks(fsname, nvlist, bmarks_nvlist)
+    errors.lzc_get_bookmarks_translate_error(ret, fsname, props)
+    return bmarks
+
+
+def lzc_destroy_bookmarks(bookmarks):
+    '''
+    Destroy bookmarks.
+
+    :param bookmarks: a list of the bookmarks to be destroyed. The bookmarks
+        are specified as :file:`{fs}#{bmark}`.
+    :type bookmarks: list of bytes
+
+    :raises BookmarkDestructionFailure: if any of the bookmarks could not be
+        destroyed.
+
+    The bookmarks must all be in the same pool.
+    Bookmarks that do not exist will be silently ignored.
+    This also includes the case where the filesystem component of the bookmark
+    name does not exist.
+    However, an invalid bookmark name will cause a :exc:`.NameInvalid` error
+    reported in :attr:`BookmarkDestructionFailure.errors`.
+
+    Either all bookmarks that existed are destroyed or an exception is raised.
+    '''
+    errlist = {}
+    bmarks_dict = {name: None for name in bookmarks}
+    nvlist = nvlist_in(bmarks_dict)
+    with nvlist_out(errlist) as errlist_nvlist:
+        ret = _lib.lzc_destroy_bookmarks(nvlist, errlist_nvlist)
+    errors.lzc_destroy_bookmarks_translate_errors(ret, errlist, bookmarks)
+
+
+def lzc_snaprange_space(firstsnap, lastsnap):
+    '''
+    Calculate the size of data referenced by snapshots in the inclusive range
+    between the ``firstsnap`` and the ``lastsnap`` and not shared with any
+    other datasets.
+
+    :param bytes firstsnap: the name of the first snapshot in the range.
+    :param bytes lastsnap: the name of the last snapshot in the range.
+    :return: the calculated stream size, in bytes.
+    :rtype: `int` or `long`
+
+    :raises SnapshotNotFound: if either of the snapshots does not exist.
+    :raises NameInvalid: if the name of either snapshot is invalid.
+    :raises NameTooLong: if the name of either snapshot is too long.
+    :raises SnapshotMismatch: if ``firstsnap`` is not an ancestor snapshot of
+        ``lastsnap``.
+    :raises PoolsDiffer: if the snapshots belong to different pools.
+
+    ``lzc_snaprange_space`` calculates the total size of blocks that exist
+    because they are referenced only by one or more snapshots in the given
+    range but no other dataset.
+    In other words, this is the set of blocks that were born after the snap
+    before ``firstsnap``, and died before the snap after ``lastsnap``.
+    Yet another interpretation is that the result of ``lzc_snaprange_space``
+    is the size of the space that would be freed if the snapshots in the range
+    were destroyed.
+
+    The same snapshot may be given as both the ``firstsnap`` and the
+    ``lastsnap``.
+    In that case ``lzc_snaprange_space`` calculates the space used by that
+    single snapshot.
+    '''
+    valp = _ffi.new('uint64_t *')
+    ret = _lib.lzc_snaprange_space(firstsnap, lastsnap, valp)
+    errors.lzc_snaprange_space_translate_error(ret, firstsnap, lastsnap)
+    return int(valp[0])
+
+
+def lzc_hold(holds, fd=None):
+    '''
+    Create *user holds* on snapshots. If there is a hold on a snapshot,
+    the snapshot cannot be destroyed. (However, it can be marked for
+    deletion by :func:`lzc_destroy_snaps` ( ``defer`` = `True` ).)
+
+    :param holds: the dictionary of names of the snapshots to hold mapped to
+        the hold names.
+    :type holds: dict of bytes : bytes
+    :param fd: if not None then it must be the result of :func:`os.open`
+        called as ``os.open("/dev/zfs", O_EXCL)``.
+    :type fd: int or None
+    :return: a list of the snapshots that do not exist.
+    :rtype: list of bytes
+
+    :raises HoldFailure: if a hold was impossible on one or more of the
+        snapshots.
+    :raises BadHoldCleanupFD: if ``fd`` is not a valid file descriptor
+        associated with :file:`/dev/zfs`.
+
+    The snapshots must all be in the same pool.
+
+    If ``fd`` is not None, then when the ``fd`` is closed (including on process
+    termination), the holds will be released. If the system is shut down
+    uncleanly, the holds will be released when the pool is next opened
+    or imported.
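+
+    A minimal usage sketch; the snapshot and hold-tag names below are
+    illustrative and the snapshot is assumed to exist::
+
+        missing = lzc_hold({b'pool/fs@snap': b'backup-tag'})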
+
+    Holds for snapshots which don't exist will be skipped and have an entry
+    added to the return value, but will not cause an overall failure.
+    No exception is raised if all holds, for snapshots that existed, were
+    successfully created.
+    Otherwise the :exc:`.HoldFailure` exception is raised and no holds will be
+    created.
+    :attr:`.HoldFailure.errors` may contain a single element for an error that
+    is not specific to any hold / snapshot, or it may contain one or more
+    elements detailing a specific error for each affected hold.
+    '''
+    errlist = {}
+    if fd is None:
+        fd = -1
+    nvlist = nvlist_in(holds)
+    with nvlist_out(errlist) as errlist_nvlist:
+        ret = _lib.lzc_hold(nvlist, fd, errlist_nvlist)
+    errors.lzc_hold_translate_errors(ret, errlist, holds, fd)
+    # If there is no error (no exception raised by _handle_err_list), but
+    # errlist is not empty, then it contains missing snapshots.
+    assert all(errlist[x] == errno.ENOENT for x in errlist)
+    return list(errlist.keys())
+
+
+def lzc_release(holds):
+    '''
+    Release *user holds* on snapshots.
+
+    If the snapshot has been marked for deferred destruction (by
+    :func:`lzc_destroy_snaps` ( ``defer`` = `True` )), does not have
+    any clones, and all the user holds are removed, then the snapshot will be
+    destroyed.
+
+    The snapshots must all be in the same pool.
+
+    :param holds: a ``dict`` where keys are snapshot names and values are
+        lists of hold tags to remove.
+    :type holds: dict of bytes : list of bytes
+    :return: a list of any snapshots that do not exist and of any tags that do
+        not exist for existing snapshots.
+        Such tags are qualified with a corresponding snapshot name using the
+        following format :file:`{pool}/{fs}@{snap}#{tag}`
+    :rtype: list of bytes
+
+    :raises HoldReleaseFailure: if one or more existing holds could not be
+        released.
+
+    Holds which failed to release because they didn't exist will have an entry
+    added to the return value, but will not cause an overall failure.
+
+    This call succeeds if ``holds`` was empty or if all holds that
+    existed were successfully removed.
+    Otherwise an exception will be raised.
+    '''
+    errlist = {}
+    holds_dict = {}
+    for snap in holds:
+        hold_list = holds[snap]
+        if not isinstance(hold_list, list):
+            raise TypeError('holds must be in a list')
+        holds_dict[snap] = {hold: None for hold in hold_list}
+    nvlist = nvlist_in(holds_dict)
+    with nvlist_out(errlist) as errlist_nvlist:
+        ret = _lib.lzc_release(nvlist, errlist_nvlist)
+    errors.lzc_release_translate_errors(ret, errlist, holds)
+    # If there is no error (no exception raised by _handle_err_list), but
+    # errlist is not empty, then it contains missing snapshots and tags.
+    assert all(errlist[x] == errno.ENOENT for x in errlist)
+    return list(errlist.keys())
+
+
+def lzc_get_holds(snapname):
+    '''
+    Retrieve the list of *user holds* on the specified snapshot.
+
+    :param bytes snapname: the name of the snapshot.
+    :return: holds on the snapshot along with their creation times
+        in seconds since the epoch
+    :rtype: dict of bytes : int
+    '''
+    holds = {}
+    with nvlist_out(holds) as nvlist:
+        ret = _lib.lzc_get_holds(snapname, nvlist)
+    errors.lzc_get_holds_translate_error(ret, snapname)
+    return holds
+
+
+def lzc_send(snapname, fromsnap, fd, flags=None):
+    '''
+    Generate a zfs send stream for the specified snapshot and write it to
+    the specified file descriptor.
+
+    :param bytes snapname: the name of the snapshot to send.
+    :param fromsnap: if not None, the name of the starting snapshot
+        for the incremental stream.
+    :type fromsnap: bytes or None
+    :param int fd: the file descriptor to write the send stream to.
+    :param flags: the flags that control what enhanced features can be used in
+        the stream.
+    :type flags: list of bytes
+
+    :raises SnapshotNotFound: if either the starting snapshot is not `None` and
+        does not exist, or if the ending snapshot does not exist.
+    :raises NameInvalid: if the name of either snapshot is invalid.
+    :raises NameTooLong: if the name of either snapshot is too long.
+    :raises SnapshotMismatch: if ``fromsnap`` is not an ancestor snapshot of
+        ``snapname``.
+    :raises PoolsDiffer: if the snapshots belong to different pools.
+    :raises IOError: if an input / output error occurs while writing to ``fd``.
+    :raises UnknownStreamFeature: if the ``flags`` contain an unknown flag
+        name.
+
+    If ``fromsnap`` is None, a full (non-incremental) stream will be sent.
+    If ``fromsnap`` is not None, it must be the full name of a snapshot or
+    bookmark to send an incremental from, e.g.
+    :file:`{pool}/{fs}@{earlier_snap}` or :file:`{pool}/{fs}#{earlier_bmark}`.
+
+    The specified snapshot or bookmark must represent an earlier point in the
+    history of ``snapname``.
+    It can be an earlier snapshot in the same filesystem or zvol as
+    ``snapname``, or it can be the origin of ``snapname``'s filesystem, or an
+    earlier snapshot in the origin, etc.
+    ``fromsnap`` must be strictly an earlier snapshot; specifying the same
+    snapshot as both ``fromsnap`` and ``snapname`` is an error.
+
+    If ``flags`` contains *"large_blocks"*, the stream is permitted
+    to contain ``DRR_WRITE`` records with ``drr_length`` > 128K,
+    and ``DRR_OBJECT`` records with ``drr_blksz`` > 128K.
+
+    If ``flags`` contains *"embedded_data"*, the stream is permitted
+    to contain ``DRR_WRITE_EMBEDDED`` records with
+    ``drr_etype`` == ``BP_EMBEDDED_TYPE_DATA``,
+    which the receiving system must support (as indicated by support
+    for the *embedded_data* feature).
+
+    If ``flags`` contains *"compress"*, the stream is generated by using
+    compressed WRITE records for blocks which are compressed on disk and
+    in memory. If the *lz4_compress* feature is active on the sending
+    system, then the receiving system must have that feature enabled as well.
+
+    If ``flags`` contains *"raw"*, the stream is generated, for encrypted
+    datasets, by sending data exactly as it exists on disk. This allows
+    backups to be taken even if encryption keys are not currently loaded.
+
+    .. note::
+        ``lzc_send`` can actually accept a filesystem name as the ``snapname``.
+        In that case ``lzc_send`` acts as if a temporary snapshot was created
+        after the start of the call and before the stream starts being
+        produced.
+
+    .. note::
+        ``lzc_send`` does not return until all of the stream is written to
+        ``fd``.
+
+    .. note::
+        ``lzc_send`` does *not* close ``fd`` upon returning.
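+
+    A minimal usage sketch; the snapshot names are illustrative and the
+    streams are written to ordinary files opened by the caller::
+
+        with open('/tmp/full.zstream', 'wb') as f:
+            # Full stream of the snapshot.
+            lzc_send(b'pool/fs@snap2', None, f.fileno())
+
+        with open('/tmp/incr.zstream', 'wb') as f:
+            # Incremental stream from an earlier snapshot.
+            lzc_send(b'pool/fs@snap2', b'pool/fs@snap1', f.fileno())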
+ ''' + if fromsnap is not None: + c_fromsnap = fromsnap + else: + c_fromsnap = _ffi.NULL + c_flags = 0 + if flags is None: + flags = [] + for flag in flags: + c_flag = { + 'embedded_data': _lib.LZC_SEND_FLAG_EMBED_DATA, + 'large_blocks': _lib.LZC_SEND_FLAG_LARGE_BLOCK, + 'compress': _lib.LZC_SEND_FLAG_COMPRESS, + 'raw': _lib.LZC_SEND_FLAG_RAW, + }.get(flag) + if c_flag is None: + raise exceptions.UnknownStreamFeature(flag) + c_flags |= c_flag + + ret = _lib.lzc_send(snapname, c_fromsnap, fd, c_flags) + errors.lzc_send_translate_error(ret, snapname, fromsnap, fd, flags) + + +def lzc_send_space(snapname, fromsnap=None, flags=None): + ''' + Estimate size of a full or incremental backup stream + given the optional starting snapshot and the ending snapshot. + + :param bytes snapname: the name of the snapshot for which the estimate + should be done. + :param fromsnap: the optional starting snapshot name. + If not `None` then an incremental stream size is estimated, otherwise + a full stream is estimated. + :type fromsnap: `bytes` or `None` + :param flags: the flags that control what enhanced features can be used + in the stream. + :type flags: list of bytes + + :return: the estimated stream size, in bytes. + :rtype: `int` or `long` + + :raises SnapshotNotFound: if either the starting snapshot is not `None` and + does not exist, or if the ending snapshot does not exist. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. + :raises SnapshotMismatch: if ``fromsnap`` is not an ancestor snapshot of + ``snapname``. + :raises PoolsDiffer: if the snapshots belong to different pools. + + ``fromsnap``, if not ``None``, must be strictly an earlier snapshot, + specifying the same snapshot as both ``fromsnap`` and ``snapname`` is an + error. + ''' + if fromsnap is not None: + c_fromsnap = fromsnap + else: + c_fromsnap = _ffi.NULL + c_flags = 0 + if flags is None: + flags = [] + for flag in flags: + c_flag = { + 'embedded_data': _lib.LZC_SEND_FLAG_EMBED_DATA, + 'large_blocks': _lib.LZC_SEND_FLAG_LARGE_BLOCK, + 'compress': _lib.LZC_SEND_FLAG_COMPRESS, + 'raw': _lib.LZC_SEND_FLAG_RAW, + }.get(flag) + if c_flag is None: + raise exceptions.UnknownStreamFeature(flag) + c_flags |= c_flag + valp = _ffi.new('uint64_t *') + + ret = _lib.lzc_send_space(snapname, c_fromsnap, c_flags, valp) + errors.lzc_send_space_translate_error(ret, snapname, fromsnap) + return int(valp[0]) + + +def lzc_receive(snapname, fd, force=False, raw=False, origin=None, props=None): + ''' + Receive from the specified ``fd``, creating the specified snapshot. + + :param bytes snapname: the name of the snapshot to create. + :param int fd: the file descriptor from which to read the stream. + :param bool force: whether to roll back or destroy the target filesystem + if that is required to receive the stream. + :param bool raw: whether this is a "raw" stream. + :param origin: the optional origin snapshot name if the stream is for a + clone. + :type origin: bytes or None + :param props: the properties to set on the snapshot as *received* + properties. + :type props: dict of bytes : Any + + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + :raises DatasetExists: if the snapshot named ``snapname`` already exists. + :raises DatasetExists: if the stream is a full stream and the destination + filesystem already exists. 
+ :raises DatasetExists: if ``force`` is `True` but the destination + filesystem could not be rolled back to a matching snapshot because a + newer snapshot exists and it is an origin of a cloned filesystem. + :raises StreamMismatch: if an incremental stream is received and the latest + snapshot of the destination filesystem does not match the source + snapshot of the stream. + :raises StreamMismatch: if a full stream is received and the destination + filesystem already exists and it has at least one snapshot, and + ``force`` is `False`. + :raises StreamMismatch: if an incremental clone stream is received but the + specified ``origin`` is not the actual received origin. + :raises DestinationModified: if an incremental stream is received and the + destination filesystem has been modified since the last snapshot and + ``force`` is `False`. + :raises DestinationModified: if a full stream is received and the + destination filesystem already exists and it does not have any + snapshots, and ``force`` is `False`. + :raises DatasetNotFound: if the destination filesystem and its parent do + not exist. + :raises DatasetNotFound: if the ``origin`` is not `None` and does not + exist. + :raises DatasetBusy: if ``force`` is `True` but the destination filesystem + could not be rolled back to a matching snapshot because a newer + snapshot is held and could not be destroyed. + :raises DatasetBusy: if another receive operation is being performed on the + destination filesystem. + :raises BadStream: if the stream is corrupt or it is not recognized or it + is a compound stream or it is a clone stream, but ``origin`` is `None`. + :raises BadStream: if a clone stream is received and the destination + filesystem already exists. + :raises StreamFeatureNotSupported: if the stream has a feature that is not + supported on this side. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. + :raises WrongParent: if the parent dataset of the received destination is + not a filesystem (e.g. ZVOL) + + .. note:: + The ``origin`` is ignored if the actual stream is an incremental stream + that is not a clone stream and the destination filesystem exists. + If the stream is a full stream and the destination filesystem does not + exist then the ``origin`` is checked for existence: if it does not + exist :exc:`.DatasetNotFound` is raised, otherwise + :exc:`.StreamMismatch` is raised, because that snapshot can not have + any relation to the stream. + + .. note:: + If ``force`` is `True` and the stream is incremental then the + destination filesystem is rolled back to a matching source snapshot if + necessary. Intermediate snapshots are destroyed in that case. + + However, none of the existing snapshots may have the same name as + ``snapname`` even if such a snapshot were to be destroyed. + The existing ``snapname`` snapshot always causes + :exc:`.SnapshotExists` to be raised. + + If ``force`` is `True` and the stream is a full stream then the + destination filesystem is replaced with the received filesystem unless + the former has any snapshots. This prevents the destination filesystem + from being rolled back / replaced. + + .. note:: + This interface does not work on dedup'd streams + (those with ``DMU_BACKUP_FEATURE_DEDUP``). + + .. note:: + ``lzc_receive`` does not return until all of the stream is read from + ``fd`` and applied to the pool. + + .. note:: + ``lzc_receive`` does *not* close ``fd`` upon returning. 
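+
+    A minimal usage sketch; the snapshot name is illustrative and the file
+    is assumed to contain a stream produced by :func:`lzc_send`::
+
+        with open('/tmp/full.zstream', 'rb') as f:
+            lzc_receive(b'backup/fs@snap2', f.fileno(), force=True)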
+ ''' + + if origin is not None: + c_origin = origin + else: + c_origin = _ffi.NULL + if props is None: + props = {} + nvlist = nvlist_in(props) + ret = _lib.lzc_receive(snapname, nvlist, c_origin, force, raw, fd) + errors.lzc_receive_translate_errors( + ret, snapname, fd, force, raw, False, False, origin, None + ) + + +lzc_recv = lzc_receive + + +def lzc_exists(name): + ''' + Check if a dataset (a filesystem, or a volume, or a snapshot) + with the given name exists. + + :param bytes name: the dataset name to check. + :return: `True` if the dataset exists, `False` otherwise. + :rtype: bool + + .. note:: + ``lzc_exists`` can not be used to check for existence of bookmarks. + ''' + ret = _lib.lzc_exists(name) + return bool(ret) + + +@_uncommitted() +def lzc_change_key(fsname, crypt_cmd, props=None, key=None): + ''' + Change encryption key on the specified dataset. + + :param bytes fsname: the name of the dataset. + :param str crypt_cmd: the encryption "command" to be executed, currently + supported values are "new_key", "inherit", "force_new_key" and + "force_inherit". + :param props: a `dict` of encryption-related property name-value pairs; + only "keyformat", "keylocation" and "pbkdf2iters" are supported + (empty by default). + :type props: dict of bytes:Any + :param key: dataset encryption key data (empty by default). + :type key: bytes + + :raises PropertyInvalid: if ``props`` contains invalid values. + :raises FilesystemNotFound: if the dataset does not exist. + :raises UnknownCryptCommand: if ``crypt_cmd`` is invalid. + :raises EncryptionKeyNotLoaded: if the encryption key is not currently + loaded and therefore cannot be changed. + ''' + if props is None: + props = {} + if key is None: + key = b"" + else: + key = bytes(key) + cmd = { + 'new_key': _lib.DCP_CMD_NEW_KEY, + 'inherit': _lib.DCP_CMD_INHERIT, + 'force_new_key': _lib.DCP_CMD_FORCE_NEW_KEY, + 'force_inherit': _lib.DCP_CMD_FORCE_INHERIT, + }.get(crypt_cmd) + if cmd is None: + raise exceptions.UnknownCryptCommand(crypt_cmd) + nvlist = nvlist_in(props) + ret = _lib.lzc_change_key(fsname, cmd, nvlist, key, len(key)) + errors.lzc_change_key_translate_error(ret, fsname) + + +@_uncommitted() +def lzc_load_key(fsname, noop, key): + ''' + Load or verify encryption key on the specified dataset. + + :param bytes fsname: the name of the dataset. + :param bool noop: if `True` the encryption key will only be verified, + not loaded. + :param key: dataset encryption key data. + :type key: bytes + + :raises FilesystemNotFound: if the dataset does not exist. + :raises EncryptionKeyAlreadyLoaded: if the encryption key is already + loaded. + :raises EncryptionKeyInvalid: if the encryption key provided is incorrect. + ''' + ret = _lib.lzc_load_key(fsname, noop, key, len(key)) + errors.lzc_load_key_translate_error(ret, fsname, noop) + + +@_uncommitted() +def lzc_unload_key(fsname): + ''' + Unload encryption key from the specified dataset. + + :param bytes fsname: the name of the dataset. + + :raises FilesystemNotFound: if the dataset does not exist. + :raises DatasetBusy: if the encryption key is still being used. This + usually occurs when the dataset is mounted. + :raises EncryptionKeyNotLoaded: if the encryption key is not currently + loaded. + ''' + ret = _lib.lzc_unload_key(fsname) + errors.lzc_unload_key_translate_error(ret, fsname) + + +def lzc_channel_program( + poolname, program, instrlimit=ZCP_DEFAULT_INSTRLIMIT, + memlimit=ZCP_DEFAULT_MEMLIMIT, params=None +): + ''' + Executes a script as a ZFS channel program on pool ``poolname``. 
+
+    :param bytes poolname: the name of the pool.
+    :param bytes program: channel program text.
+    :param int instrlimit: limit on the number of Lua instructions the
+        channel program may execute.
+    :param int memlimit: execution memory limit, in bytes.
+    :param params: a `list` of parameters passed to the channel program
+        (empty by default).
+    :type params: list
+    :return: a dictionary of result values produced by the channel program,
+        if any.
+    :rtype: dict
+
+    :raises PoolNotFound: if the pool does not exist.
+    :raises ZCPLimitInvalid: if either instruction or memory limit is invalid.
+    :raises ZCPSyntaxError: if the channel program contains syntax errors.
+    :raises ZCPTimeout: if the channel program took too long to execute.
+    :raises ZCPSpaceError: if the channel program exhausted the memory limit.
+    :raises ZCPMemoryError: if the channel program return value was too large.
+    :raises ZCPPermissionError: if the user lacks the permission to run the
+        channel program. Channel programs must be run as root.
+    :raises ZCPRuntimeError: if the channel program encountered a runtime
+        error.
+    '''
+    output = {}
+    if params is None:
+        # Match the documented default of an empty parameter list.
+        params = []
+    params_nv = nvlist_in({b"argv": params})
+    with nvlist_out(output) as outnvl:
+        ret = _lib.lzc_channel_program(
+            poolname, program, instrlimit, memlimit, params_nv, outnvl)
+    errors.lzc_channel_program_translate_error(
+        ret, poolname, output.get(b"error"))
+    return output.get(b"return")
+
+
+def lzc_channel_program_nosync(
+    poolname, program, instrlimit=ZCP_DEFAULT_INSTRLIMIT,
+    memlimit=ZCP_DEFAULT_MEMLIMIT, params=None
+):
+    '''
+    Executes a script as a read-only ZFS channel program on pool ``poolname``.
+    A read-only channel program works programmatically the same way as a
+    normal channel program executed with
+    :func:`lzc_channel_program`. The only difference is it runs exclusively in
+    open-context and therefore can return faster.
+    The downside is that the program cannot change on-disk state by
+    calling functions from the zfs.sync submodule.
+
+    :param bytes poolname: the name of the pool.
+    :param bytes program: channel program text.
+    :param int instrlimit: limit on the number of Lua instructions the
+        channel program may execute.
+    :param int memlimit: execution memory limit, in bytes.
+    :param params: a `list` of parameters passed to the channel program
+        (empty by default).
+    :type params: list
+    :return: a dictionary of result values produced by the channel program,
+        if any.
+    :rtype: dict
+
+    :raises PoolNotFound: if the pool does not exist.
+    :raises ZCPLimitInvalid: if either instruction or memory limit is invalid.
+    :raises ZCPSyntaxError: if the channel program contains syntax errors.
+    :raises ZCPTimeout: if the channel program took too long to execute.
+    :raises ZCPSpaceError: if the channel program exhausted the memory limit.
+    :raises ZCPMemoryError: if the channel program return value was too large.
+    :raises ZCPPermissionError: if the user lacks the permission to run the
+        channel program. Channel programs must be run as root.
+    :raises ZCPRuntimeError: if the channel program encountered a runtime
+        error.
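+
+    A hedged usage sketch; the pool and filesystem names are illustrative,
+    and ``zfs.list.snapshots`` belongs to the channel program's Lua
+    environment, not to this module::
+
+        prog = (
+            b'ret = {} '
+            b'for snap in zfs.list.snapshots("pool/fs") do '
+            b'    table.insert(ret, snap) '
+            b'end '
+            b'return ret')
+        snapshots = lzc_channel_program_nosync(b'pool', prog)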
+ ''' + output = {} + params_nv = nvlist_in({b"argv": params}) + with nvlist_out(output) as outnvl: + ret = _lib.lzc_channel_program_nosync( + poolname, program, instrlimit, memlimit, params_nv, outnvl) + errors.lzc_channel_program_translate_error( + ret, poolname, output.get(b"error")) + return output.get(b"return") + + +def lzc_receive_resumable( + snapname, fd, force=False, raw=False, origin=None, props=None +): + ''' + Like :func:`lzc_receive`, but if the receive fails due to premature stream + termination, the intermediate state will be preserved on disk. In this + case, ECKSUM will be returned. The receive may subsequently be resumed + with a resuming send stream generated by lzc_send_resume(). + + :param bytes snapname: the name of the snapshot to create. + :param int fd: the file descriptor from which to read the stream. + :param bool force: whether to roll back or destroy the target filesystem + if that is required to receive the stream. + :param bool raw: whether this is a "raw" stream. + :param origin: the optional origin snapshot name if the stream is for a + clone. + :type origin: bytes or None + :param props: the properties to set on the snapshot as *received* + properties. + :type props: dict of bytes : Any + + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + :raises DatasetExists: if the snapshot named ``snapname`` already exists. + :raises DatasetExists: if the stream is a full stream and the destination + filesystem already exists. + :raises DatasetExists: if ``force`` is `True` but the destination + filesystem could not be rolled back to a matching snapshot because a + newer snapshot exists and it is an origin of a cloned filesystem. + :raises StreamMismatch: if an incremental stream is received and the latest + snapshot of the destination filesystem does not match the source + snapshot of the stream. + :raises StreamMismatch: if a full stream is received and the destination + filesystem already exists and it has at least one snapshot, and + ``force`` is `False`. + :raises StreamMismatch: if an incremental clone stream is received but the + specified ``origin`` is not the actual received origin. + :raises DestinationModified: if an incremental stream is received and the + destination filesystem has been modified since the last snapshot and + ``force`` is `False`. + :raises DestinationModified: if a full stream is received and the + destination filesystem already exists and it does not have any + snapshots, and ``force`` is `False`. + :raises DatasetNotFound: if the destination filesystem and its parent do + not exist. + :raises DatasetNotFound: if the ``origin`` is not `None` and does not + exist. + :raises DatasetBusy: if ``force`` is `True` but the destination filesystem + could not be rolled back to a matching snapshot because a newer + snapshot is held and could not be destroyed. + :raises DatasetBusy: if another receive operation is being performed on the + destination filesystem. + :raises BadStream: if the stream is corrupt or it is not recognized or it + is a compound stream or it is a clone stream, but ``origin`` is `None`. + :raises BadStream: if a clone stream is received and the destination + filesystem already exists. + :raises StreamFeatureNotSupported: if the stream has a feature that is not + supported on this side. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. 
+ ''' + + if origin is not None: + c_origin = origin + else: + c_origin = _ffi.NULL + if props is None: + props = {} + nvlist = nvlist_in(props) + ret = _lib.lzc_receive_resumable( + snapname, nvlist, c_origin, force, raw, fd) + errors.lzc_receive_translate_errors( + ret, snapname, fd, force, raw, False, False, origin, None) + + +def lzc_receive_with_header( + snapname, fd, begin_record, force=False, resumable=False, raw=False, + origin=None, props=None +): + ''' + Like :func:`lzc_receive`, but allows the caller to read the begin record + and then to pass it in. + + That could be useful if the caller wants to derive, for example, + the snapname or the origin parameters based on the information contained in + the begin record. + :func:`receive_header` can be used to receive the begin record from the + file descriptor. + + :param bytes snapname: the name of the snapshot to create. + :param int fd: the file descriptor from which to read the stream. + :param begin_record: the stream's begin record. + :type begin_record: ``cffi`` `CData` representing the dmu_replay_record_t + structure. + :param bool force: whether to roll back or destroy the target filesystem + if that is required to receive the stream. + :param bool resumable: whether this stream should be treated as resumable. + If the receive fails due to premature stream termination, the + intermediate state will be preserved on disk and may subsequently be + resumed with :func:`lzc_send_resume`. + :param bool raw: whether this is a "raw" stream. + :param origin: the optional origin snapshot name if the stream is for a + clone. + :type origin: bytes or None + :param props: the properties to set on the snapshot as *received* + properties. + :type props: dict of bytes : Any + + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + :raises DatasetExists: if the snapshot named ``snapname`` already exists. + :raises DatasetExists: if the stream is a full stream and the destination + filesystem already exists. + :raises DatasetExists: if ``force`` is `True` but the destination + filesystem could not be rolled back to a matching snapshot because a + newer snapshot exists and it is an origin of a cloned filesystem. + :raises StreamMismatch: if an incremental stream is received and the latest + snapshot of the destination filesystem does not match the source + snapshot of the stream. + :raises StreamMismatch: if a full stream is received and the destination + filesystem already exists and it has at least one snapshot, and + ``force`` is `False`. + :raises StreamMismatch: if an incremental clone stream is received but the + specified ``origin`` is not the actual received origin. + :raises DestinationModified: if an incremental stream is received and the + destination filesystem has been modified since the last snapshot and + ``force`` is `False`. + :raises DestinationModified: if a full stream is received and the + destination filesystem already exists and it does not have any + snapshots, and ``force`` is `False`. + :raises DatasetNotFound: if the destination filesystem and its parent do + not exist. + :raises DatasetNotFound: if the ``origin`` is not `None` and does not + exist. + :raises DatasetBusy: if ``force`` is `True` but the destination filesystem + could not be rolled back to a matching snapshot because a newer + snapshot is held and could not be destroyed. + :raises DatasetBusy: if another receive operation is being performed on the + destination filesystem. 
+ :raises BadStream: if the stream is corrupt or it is not recognized or it + is a compound stream or it is a clone stream, but ``origin`` is `None`. + :raises BadStream: if a clone stream is received and the destination + filesystem already exists. + :raises StreamFeatureNotSupported: if the stream has a feature that is not + supported on this side. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. + ''' + + if origin is not None: + c_origin = origin + else: + c_origin = _ffi.NULL + if props is None: + props = {} + nvlist = nvlist_in(props) + ret = _lib.lzc_receive_with_header( + snapname, nvlist, c_origin, force, resumable, raw, fd, begin_record) + errors.lzc_receive_translate_errors( + ret, snapname, fd, force, raw, False, False, origin, None) + + +def receive_header(fd): + ''' + Read the begin record of the ZFS backup stream from the given file + descriptor. + + This is a helper function for :func:`lzc_receive_with_header`. + + :param int fd: the file descriptor from which to read the stream. + :return: a tuple with two elements where the first one is a Python `dict` + representing the fields of the begin record and the second one is an + opaque object suitable for passing to :func:`lzc_receive_with_header`. + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + + At present the following fields can be of interest in the header: + + drr_toname : bytes + the name of the snapshot for which the stream has been created + drr_toguid : integer + the GUID of the snapshot for which the stream has been created + drr_fromguid : integer + the GUID of the starting snapshot in the case the stream is + incremental, zero otherwise + drr_flags : integer + the flags describing the stream's properties + drr_type : integer + the type of the dataset for which the stream has been created + (volume, filesystem) + ''' + # read sizeof(dmu_replay_record_t) bytes directly into the memory backing + # 'record' + record = _ffi.new("dmu_replay_record_t *") + _ffi.buffer(record)[:] = os.read(fd, _ffi.sizeof(record[0])) + # get drr_begin member and its representation as a Python dict + drr_begin = record.drr_u.drr_begin + header = {} + for field, descr in _ffi.typeof(drr_begin).fields: + if descr.type.kind == 'primitive': + header[field] = getattr(drr_begin, field) + elif descr.type.kind == 'enum': + header[field] = getattr(drr_begin, field) + elif descr.type.kind == 'array' and descr.type.item.cname == 'char': + header[field] = _ffi.string(getattr(drr_begin, field)) + else: + raise TypeError( + 'Unexpected field type in drr_begin: ' + str(descr.type)) + return (header, record) + + +@_uncommitted() +def lzc_receive_one( + snapname, fd, begin_record, force=False, resumable=False, raw=False, + origin=None, props=None, cleanup_fd=-1, action_handle=0 +): + ''' + Like :func:`lzc_receive`, but allows the caller to pass all supported + arguments and retrieve all values returned. The only additional input + parameter is 'cleanup_fd' which is used to set a cleanup-on-exit file + descriptor. + + :param bytes snapname: the name of the snapshot to create. + :param int fd: the file descriptor from which to read the stream. + :param begin_record: the stream's begin record. + :type begin_record: ``cffi`` `CData` representing the dmu_replay_record_t + structure. + :param bool force: whether to roll back or destroy the target filesystem + if that is required to receive the stream. 
+ :param bool resumable: whether this stream should be treated as resumable. + If the receive fails due to premature stream termination, the + intermediate state will be preserved on disk and may subsequently be + resumed with :func:`lzc_send_resume`. + :param bool raw: whether this is a "raw" stream. + :param origin: the optional origin snapshot name if the stream is for a + clone. + :type origin: bytes or None + :param props: the properties to set on the snapshot as *received* + properties. + :type props: dict of bytes : Any + :param int cleanup_fd: file descriptor used to set a cleanup-on-exit file + descriptor. + :param int action_handle: variable used to pass the handle for guid/ds + mapping: this should be set to zero on first call and will contain an + updated handle on success, which should be passed in subsequent calls. + + :return: a tuple with two elements where the first one is the number of + bytes read from the file descriptor and the second one is the + action_handle return value. + + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + :raises DatasetExists: if the snapshot named ``snapname`` already exists. + :raises DatasetExists: if the stream is a full stream and the destination + filesystem already exists. + :raises DatasetExists: if ``force`` is `True` but the destination + filesystem could not be rolled back to a matching snapshot because a + newer snapshot exists and it is an origin of a cloned filesystem. + :raises StreamMismatch: if an incremental stream is received and the latest + snapshot of the destination filesystem does not match the source + snapshot of the stream. + :raises StreamMismatch: if a full stream is received and the destination + filesystem already exists and it has at least one snapshot, and + ``force`` is `False`. + :raises StreamMismatch: if an incremental clone stream is received but the + specified ``origin`` is not the actual received origin. + :raises DestinationModified: if an incremental stream is received and the + destination filesystem has been modified since the last snapshot and + ``force`` is `False`. + :raises DestinationModified: if a full stream is received and the + destination filesystem already exists and it does not have any + snapshots, and ``force`` is `False`. + :raises DatasetNotFound: if the destination filesystem and its parent do + not exist. + :raises DatasetNotFound: if the ``origin`` is not `None` and does not + exist. + :raises DatasetBusy: if ``force`` is `True` but the destination filesystem + could not be rolled back to a matching snapshot because a newer + snapshot is held and could not be destroyed. + :raises DatasetBusy: if another receive operation is being performed on the + destination filesystem. + :raises BadStream: if the stream is corrupt or it is not recognized or it + is a compound stream or it is a clone stream, but ``origin`` is `None`. + :raises BadStream: if a clone stream is received and the destination + filesystem already exists. + :raises StreamFeatureNotSupported: if the stream has a feature that is not + supported on this side. + :raises ReceivePropertyFailure: if one or more of the specified properties + is invalid or has an invalid type or value. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. 
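+
+    A sketch of the intended call sequence; ``fd`` is assumed to carry a send
+    stream, and note that this function is an uncommitted extension that may
+    be unavailable (see :func:`is_supported`)::
+
+        (header, c_header) = receive_header(fd)
+        (read_bytes, handle) = lzc_receive_one(
+            header['drr_toname'], fd, c_header)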
+ ''' + + if origin is not None: + c_origin = origin + else: + c_origin = _ffi.NULL + if action_handle is not None: + c_action_handle = _ffi.new("uint64_t *") + else: + c_action_handle = _ffi.NULL + c_read_bytes = _ffi.new("uint64_t *") + c_errflags = _ffi.new("uint64_t *") + if props is None: + props = {} + nvlist = nvlist_in(props) + properrs = {} + with nvlist_out(properrs) as c_errors: + ret = _lib.lzc_receive_one( + snapname, nvlist, c_origin, force, resumable, raw, fd, + begin_record, cleanup_fd, c_read_bytes, c_errflags, + c_action_handle, c_errors) + errors.lzc_receive_translate_errors( + ret, snapname, fd, force, raw, False, False, origin, properrs) + return (int(c_read_bytes[0]), action_handle) + + +@_uncommitted() +def lzc_receive_with_cmdprops( + snapname, fd, begin_record, force=False, resumable=False, raw=False, + origin=None, props=None, cmdprops=None, key=None, cleanup_fd=-1, + action_handle=0 +): + ''' + Like :func:`lzc_receive_one`, but allows the caller to pass an additional + 'cmdprops' argument. The 'cmdprops' nvlist contains both override + ('zfs receive -o') and exclude ('zfs receive -x') properties. + + :param bytes snapname: the name of the snapshot to create. + :param int fd: the file descriptor from which to read the stream. + :param begin_record: the stream's begin record. + :type begin_record: ``cffi`` `CData` representing the dmu_replay_record_t + structure. + :param bool force: whether to roll back or destroy the target filesystem + if that is required to receive the stream. + :param bool resumable: whether this stream should be treated as resumable. + If the receive fails due to premature stream termination, the + intermediate state will be preserved on disk and may subsequently be + resumed with :func:`lzc_send_resume`. + :param bool raw: whether this is a "raw" stream. + :param origin: the optional origin snapshot name if the stream is for a + clone. + :type origin: bytes or None + :param props: the properties to set on the snapshot as *received* + properties. + :type props: dict of bytes : Any + :param cmdprops: the properties to set on the snapshot as local overrides + to *received* properties. `bool` values are forcefully inherited while + every other value is set locally as if the command "zfs set" was + invoked immediately before the receive. + :type cmdprops: dict of bytes : Any + :param key: raw bytes representing user's wrapping key + :type key: bytes + :param int cleanup_fd: file descriptor used to set a cleanup-on-exit file + descriptor. + :param int action_handle: variable used to pass the handle for guid/ds + mapping: this should be set to zero on first call and will contain an + updated handle on success, it should be passed in subsequent calls. + + :return: a tuple with two elements where the first one is the number of + bytes read from the file descriptor and the second one is the + action_handle return value. + + :raises IOError: if an input / output error occurs while reading from the + ``fd``. + :raises DatasetExists: if the snapshot named ``snapname`` already exists. + :raises DatasetExists: if the stream is a full stream and the destination + filesystem already exists. + :raises DatasetExists: if ``force`` is `True` but the destination + filesystem could not be rolled back to a matching snapshot because a + newer snapshot exists and it is an origin of a cloned filesystem. 
+    :raises StreamMismatch: if an incremental stream is received and the latest
+        snapshot of the destination filesystem does not match the source
+        snapshot of the stream.
+    :raises StreamMismatch: if a full stream is received and the destination
+        filesystem already exists and it has at least one snapshot, and
+        ``force`` is `False`.
+    :raises StreamMismatch: if an incremental clone stream is received but the
+        specified ``origin`` is not the actual received origin.
+    :raises DestinationModified: if an incremental stream is received and the
+        destination filesystem has been modified since the last snapshot and
+        ``force`` is `False`.
+    :raises DestinationModified: if a full stream is received and the
+        destination filesystem already exists and it does not have any
+        snapshots, and ``force`` is `False`.
+    :raises DatasetNotFound: if the destination filesystem and its parent do
+        not exist.
+    :raises DatasetNotFound: if the ``origin`` is not `None` and does not
+        exist.
+    :raises DatasetBusy: if ``force`` is `True` but the destination filesystem
+        could not be rolled back to a matching snapshot because a newer
+        snapshot is held and could not be destroyed.
+    :raises DatasetBusy: if another receive operation is being performed on the
+        destination filesystem.
+    :raises BadStream: if the stream is corrupt or it is not recognized or it
+        is a compound stream or it is a clone stream, but ``origin`` is `None`.
+    :raises BadStream: if a clone stream is received and the destination
+        filesystem already exists.
+    :raises StreamFeatureNotSupported: if the stream has a feature that is not
+        supported on this side.
+    :raises ReceivePropertyFailure: if one or more of the specified properties
+        is invalid or has an invalid type or value.
+    :raises NameInvalid: if the name of either snapshot is invalid.
+    :raises NameTooLong: if the name of either snapshot is too long.
+    '''
+
+    if origin is not None:
+        c_origin = origin
+    else:
+        c_origin = _ffi.NULL
+    if action_handle is not None:
+        # Pass the current handle in so that an updated one can be
+        # returned, as described in the docstring.
+        c_action_handle = _ffi.new("uint64_t *", action_handle)
+    else:
+        c_action_handle = _ffi.NULL
+    c_read_bytes = _ffi.new("uint64_t *")
+    c_errflags = _ffi.new("uint64_t *")
+    if props is None:
+        props = {}
+    if cmdprops is None:
+        cmdprops = {}
+    if key is None:
+        key = b""
+    else:
+        key = bytes(key)
+
+    nvlist = nvlist_in(props)
+    cmdnvlist = nvlist_in(cmdprops)
+    properrs = {}
+    with nvlist_out(properrs) as c_errors:
+        ret = _lib.lzc_receive_with_cmdprops(
+            snapname, nvlist, cmdnvlist, key, len(key), c_origin,
+            force, resumable, raw, fd, begin_record, cleanup_fd, c_read_bytes,
+            c_errflags, c_action_handle, c_errors)
+    errors.lzc_receive_translate_errors(
+        ret, snapname, fd, force, raw, False, False, origin, properrs)
+    if action_handle is not None:
+        action_handle = int(c_action_handle[0])
+    return (int(c_read_bytes[0]), action_handle)
+
+
+@_uncommitted()
+def lzc_reopen(poolname, restart=True):
+    '''
+    Reopen a pool.
+
+    :param bytes poolname: the name of the pool.
+    :param bool restart: whether to restart an in-progress scrub operation.
+
+    :raises PoolNotFound: if the pool does not exist.
+    '''
+    ret = _lib.lzc_reopen(poolname, restart)
+    errors.lzc_reopen_translate_error(ret, poolname)
+
+
+def lzc_send_resume(
+    snapname, fromsnap, fd, flags=None, resumeobj=0, resumeoff=0
+):
+    '''
+    Resume a previously interrupted send operation, generating a zfs send
+    stream for the specified snapshot and writing it to the specified file
+    descriptor.
+
+    :param bytes snapname: the name of the snapshot to send.
+    :param fromsnap: if not None, the name of the starting snapshot
+        for the incremental stream.
+ :type fromsnap: bytes or None + :param int fd: the file descriptor to write the send stream to. + :param flags: the flags that control what enhanced features can be used in + the stream. + :type flags: list of bytes + :param int resumeobj: the object number where this send stream should + resume from. + :param int resumeoff: the offset where this send stream should resume from. + + :raises SnapshotNotFound: if either the starting snapshot is not `None` and + does not exist, or if the ending snapshot does not exist. + :raises NameInvalid: if the name of either snapshot is invalid. + :raises NameTooLong: if the name of either snapshot is too long. + :raises SnapshotMismatch: if ``fromsnap`` is not an ancestor snapshot of + ``snapname``. + :raises PoolsDiffer: if the snapshots belong to different pools. + :raises IOError: if an input / output error occurs while writing to ``fd``. + :raises UnknownStreamFeature: if the ``flags`` contain an unknown flag + name. + + .. note:: + See :func:`lzc_send` for more information. + ''' + if fromsnap is not None: + c_fromsnap = fromsnap + else: + c_fromsnap = _ffi.NULL + c_flags = 0 + if flags is None: + flags = [] + for flag in flags: + c_flag = { + 'embedded_data': _lib.LZC_SEND_FLAG_EMBED_DATA, + 'large_blocks': _lib.LZC_SEND_FLAG_LARGE_BLOCK, + 'compress': _lib.LZC_SEND_FLAG_COMPRESS, + 'raw': _lib.LZC_SEND_FLAG_RAW, + }.get(flag) + if c_flag is None: + raise exceptions.UnknownStreamFeature(flag) + c_flags |= c_flag + + ret = _lib.lzc_send_resume( + snapname, c_fromsnap, fd, c_flags, uint64_t(resumeobj), + uint64_t(resumeoff)) + errors.lzc_send_translate_error(ret, snapname, fromsnap, fd, flags) + + +@_uncommitted() +def lzc_sync(poolname, force=False): + ''' + Forces all in-core dirty data to be written to the primary pool storage + and not the ZIL. + + :param bytes poolname: the name of the pool. + :param bool force: whether to force uberblock update even if there is no + dirty data. + + :raises PoolNotFound: if the pool does not exist. + + .. note:: + This method signature is different from its C libzfs_core counterpart: + `innvl` has been replaced by the `force` boolean and `outnvl` has been + conveniently removed since it's not used. + ''' + innvl = nvlist_in({b"force": force}) + with nvlist_out({}) as outnvl: + ret = _lib.lzc_sync(poolname, innvl, outnvl) + errors.lzc_sync_translate_error(ret, poolname) + + +def is_supported(func): + ''' + Check whether C *libzfs_core* provides implementation required + for the given Python wrapper. + + If `is_supported` returns ``False`` for the function, then + calling the function would result in :exc:`NotImplementedError`. + + :param function func: the function to check. + :return bool: whether the function can be used. + ''' + fname = func.__name__ + if fname not in globals(): + raise ValueError(fname + ' is not from libzfs_core') + if not callable(func): + raise ValueError(fname + ' is not a function') + if not fname.startswith("lzc_"): + raise ValueError(fname + ' is not a libzfs_core API function') + check_func = getattr(func, "_check_func", None) + if check_func is not None: + return is_supported(check_func) + return getattr(_lib, fname, None) is not None + + +@_uncommitted() +def lzc_promote(name): + ''' + Promotes the ZFS dataset. + + :param bytes name: the name of the dataset to promote. + :raises NameInvalid: if the dataset name is invalid. + :raises NameTooLong: if the dataset name is too long. 
+    :raises NameTooLong: if the dataset's origin has a snapshot that, if
+        transferred to the dataset, would end up with a name that is too long.
+    :raises NotClone: if the dataset is not a clone.
+    :raises FilesystemNotFound: if the dataset does not exist.
+    :raises SnapshotExists: if the dataset already has a snapshot with the same
+        name as one of the origin's snapshots.
+    '''
+    ret = _lib.lzc_promote(name, _ffi.NULL, _ffi.NULL)
+    errors.lzc_promote_translate_error(ret, name)
+
+
+@_uncommitted()
+def lzc_pool_checkpoint(name):
+    '''
+    Create a checkpoint for the specified pool.
+
+    :param bytes name: the name of the pool to create a checkpoint for.
+    :raises CheckpointExists: if the pool already has a checkpoint.
+    :raises CheckpointDiscarding: if ZFS is in the middle of discarding a
+        checkpoint for this pool.
+    :raises DeviceRemovalRunning: if a vdev is currently being removed.
+    :raises DeviceTooBig: if one or more top-level vdevs exceed the maximum
+        vdev size.
+    '''
+    ret = _lib.lzc_pool_checkpoint(name)
+    errors.lzc_pool_checkpoint_translate_error(ret, name)
+
+
+@_uncommitted()
+def lzc_pool_checkpoint_discard(name):
+    '''
+    Discard the checkpoint from the specified pool.
+
+    :param bytes name: the name of the pool to discard the checkpoint from.
+    :raises CheckpointNotFound: if the pool does not have a checkpoint.
+    :raises CheckpointDiscarding: if ZFS is in the middle of discarding a
+        checkpoint for this pool.
+    '''
+    ret = _lib.lzc_pool_checkpoint_discard(name)
+    errors.lzc_pool_checkpoint_discard_translate_error(ret, name)
+
+
+def lzc_rename(source, target):
+    '''
+    Rename the ZFS dataset.
+
+    :param bytes source: the current name of the dataset to rename.
+    :param bytes target: the new name of the dataset.
+    :raises NameInvalid: if either the source or target name is invalid.
+    :raises NameTooLong: if either the source or target name is too long.
+    :raises NameTooLong: if a snapshot of the source would end up with a name
+        that is too long after renaming.
+    :raises FilesystemNotFound: if the source does not exist.
+    :raises FilesystemNotFound: if the target's parent does not exist.
+    :raises FilesystemExists: if the target already exists.
+    :raises PoolsDiffer: if the source and target belong to different pools.
+    :raises WrongParent: if the "new" parent dataset is not a filesystem
+        (e.g. a ZVOL).
+    '''
+    ret = _lib.lzc_rename(source, target)
+    errors.lzc_rename_translate_error(ret, source, target)
+
+
+def lzc_destroy(name):
+    '''
+    Destroy the ZFS dataset.
+
+    :param bytes name: the name of the dataset to destroy.
+    :raises NameInvalid: if the dataset name is invalid.
+    :raises NameTooLong: if the dataset name is too long.
+    :raises FilesystemNotFound: if the dataset does not exist.
+    '''
+    ret = _lib.lzc_destroy(name)
+    errors.lzc_destroy_translate_error(ret, name)
+
+
+@_uncommitted()
+def lzc_inherit(name, prop):
+    '''
+    Inherit properties from a parent dataset of the given ZFS dataset.
+
+    :param bytes name: the name of the dataset.
+    :param bytes prop: the name of the property to inherit.
+    :raises NameInvalid: if the dataset name is invalid.
+    :raises NameTooLong: if the dataset name is too long.
+    :raises DatasetNotFound: if the dataset does not exist.
+    :raises PropertyInvalid: if one or more of the specified properties is
+        invalid or has an invalid type or value.
+
+    Inheriting a property actually resets it to its default value
+    or removes it if it's a user property, so that the property can then be
+    inherited if it's inheritable. If the property is not inheritable
+    then it just keeps its default value.
+
+    This function can be used on snapshots to inherit user defined properties.
+    '''
+    ret = _lib.lzc_inherit(name, prop, _ffi.NULL)
+    errors.lzc_inherit_prop_translate_error(ret, name, prop)
+
+
+# As the extended API is not committed yet, the names of the new interfaces
+# are not settled yet.
+# lzc_inherit_prop makes it clearer what is to be inherited.
+lzc_inherit_prop = lzc_inherit
+
+
+@_uncommitted()
+def lzc_set_props(name, prop, val):
+    '''
+    Set properties of the ZFS dataset.
+
+    :param bytes name: the name of the dataset.
+    :param bytes prop: the name of the property.
+    :param Any val: the value of the property.
+    :raises NameInvalid: if the dataset name is invalid.
+    :raises NameTooLong: if the dataset name is too long.
+    :raises DatasetNotFound: if the dataset does not exist.
+    :raises NoSpace: if the property controls a quota and the value is too
+        small for that quota.
+    :raises PropertyInvalid: if one or more of the specified properties is
+        invalid or has an invalid type or value.
+
+    This function can be used on snapshots to set user defined properties.
+
+    .. note::
+        An attempt to set a readonly / statistic property is ignored
+        without reporting any error.
+    '''
+    props = {prop: val}
+    props_nv = nvlist_in(props)
+    ret = _lib.lzc_set_props(name, props_nv, _ffi.NULL, _ffi.NULL)
+    errors.lzc_set_prop_translate_error(ret, name, prop, val)
+
+
+# As the extended API is not committed yet, the names of the new interfaces
+# are not settled yet.
+# It's not clear if atomically setting multiple properties is an achievable
+# goal, and by convention an interface acting on multiple entities must do
+# so atomically.
+# Being able to set a single property at a time is sufficient for ClusterHQ.
+lzc_set_prop = lzc_set_props
+
+
+@_uncommitted()
+def lzc_list(name, options):
+    '''
+    List subordinate elements of the given dataset.
+
+    This function can be used to list child datasets and snapshots of the
+    given dataset. The listed elements can be filtered by their type and by
+    their depth relative to the starting dataset.
+
+    :param bytes name: the name of the dataset to be listed, could be a
+        snapshot or a dataset.
+    :param options: a `dict` of the options that control the listing behavior.
+    :type options: dict of bytes:Any
+    :return: a pair of file descriptors, the first of which can be used to
+        read the listing.
+    :rtype: tuple of (int, int)
+    :raises DatasetNotFound: if the dataset does not exist.
+
+    Two options are currently available:
+
+    recurse : integer or None
+        specifies the depth of the recursive listing. If ``None`` the depth
+        is not limited.
+        Absence of this option means that only the given dataset is listed.
+
+    type : dict of bytes:None
+        specifies the dataset types to include in the listing.
+        Currently allowed keys are "filesystem", "volume", "snapshot".
+        Absence of this option implies all types.
+
+    The first of the returned file descriptors can be used to
+    read the listing in a binary encoded format. The data is
+    a series of variable sized records, each starting with a fixed
+    size header that is followed by a serialized ``nvlist``.
+    Each record describes a single element and contains the element's
+    name as well as its properties.
+    The file descriptor must be closed after reading from it.
+
+    The second file descriptor represents a pipe end to which the
+    kernel driver is writing information.
It should not be closed + until all interesting information has been read and it must + be explicitly closed afterwards. + ''' + (rfd, wfd) = os.pipe() + fcntl.fcntl(rfd, fcntl.F_SETFD, fcntl.FD_CLOEXEC) + fcntl.fcntl(wfd, fcntl.F_SETFD, fcntl.FD_CLOEXEC) + options = options.copy() + options['fd'] = int32_t(wfd) + opts_nv = nvlist_in(options) + ret = _lib.lzc_list(name, opts_nv) + if ret == errno.ESRCH: + return (None, None) + errors.lzc_list_translate_error(ret, name, options) + return (rfd, wfd) + + +# Description of the binary format used to pass data from the kernel. +_PIPE_RECORD_FORMAT = 'IBBBB' +_PIPE_RECORD_SIZE = struct.calcsize(_PIPE_RECORD_FORMAT) + + +def _list(name, recurse=None, types=None): + ''' + A wrapper for :func:`lzc_list` that hides details of working + with the file descriptors and provides data in an easy to + consume format. + + :param bytes name: the name of the dataset to be listed, could be a + snapshot, a volume or a filesystem. + :param recurse: specifies depth of the recursive listing. If ``None`` the + depth is not limited. + :param types: specifies dataset types to include into the listing. + Currently allowed keys are "filesystem", "volume", "snapshot". ``None`` + is equivalent to specifying the type of the dataset named by `name`. + :type types: list of bytes or None + :type recurse: integer or None + :return: a list of dictionaries each describing a single listed element. + :rtype: list of dict + ''' + options = {} + + # Convert types to a dict suitable for mapping to an nvlist. + if types is not None: + types = {x: None for x in types} + options['type'] = types + if recurse is None or recurse > 0: + options['recurse'] = recurse + + # Note that other_fd is used by the kernel side to write + # the data, so we have to keep that descriptor open until + # we are done. + # Also, we have to explicitly close the descriptor as the + # kernel doesn't do that. + (fd, other_fd) = lzc_list(name, options) + if fd is None: + return + + try: + while True: + record_bytes = os.read(fd, _PIPE_RECORD_SIZE) + if not record_bytes: + break + (size, _, err, _, _) = struct.unpack( + _PIPE_RECORD_FORMAT, record_bytes) + if err == errno.ESRCH: + break + errors.lzc_list_translate_error(err, name, options) + if size == 0: + break + data_bytes = os.read(fd, size) + result = {} + with nvlist_out(result) as nvp: + ret = _lib.nvlist_unpack(data_bytes, size, nvp, 0) + if ret != 0: + raise exceptions.ZFSGenericError( + ret, None, "Failed to unpack list data") + yield result + finally: + os.close(other_fd) + os.close(fd) + + +@_uncommitted(lzc_list) +def lzc_get_props(name): + ''' + Get properties of the ZFS dataset. + + :param bytes name: the name of the dataset. + :raises DatasetNotFound: if the dataset does not exist. + :raises NameInvalid: if the dataset name is invalid. + :raises NameTooLong: if the dataset name is too long. + :return: a dictionary mapping the property names to their values. + :rtype: dict of bytes:Any + + .. note:: + The value of ``clones`` property is a `list` of clone names as byte + strings. + + .. warning:: + The returned dictionary does not contain entries for properties + with default values. One exception is the ``mountpoint`` property + for which the default value is derived from the dataset name. + ''' + result = next(_list(name, recurse=0)) + is_snapshot = result['dmu_objset_stats']['dds_is_snapshot'] + result = result['properties'] + # In most cases the source of the property is uninteresting and the + # value alone is sufficient. 
One exception is the 'mountpoint' + # property the final value of which is not the same as the inherited + # value. + mountpoint = result.get('mountpoint') + if mountpoint is not None: + mountpoint_src = mountpoint['source'] + mountpoint_val = mountpoint['value'] + # 'source' is the name of the dataset that has 'mountpoint' set + # to a non-default value and from which the current dataset inherits + # the property. 'source' can be the current dataset if its + # 'mountpoint' is explicitly set. + # 'source' can also be a special value like '$recvd', that case + # is equivalent to the property being set on the current dataset. + # Note that a normal mountpoint value should start with '/' + # unlike the special values "none" and "legacy". + if (mountpoint_val.startswith('/') and + not mountpoint_src.startswith('$')): + mountpoint_val = mountpoint_val + name[len(mountpoint_src):] + elif not is_snapshot: + mountpoint_val = '/' + name + else: + mountpoint_val = None + result = {k: result[k]['value'] for k in result} + if 'clones' in result: + result['clones'] = list(result['clones'].keys()) + if mountpoint_val is not None: + result['mountpoint'] = mountpoint_val + return result + + +@_uncommitted(lzc_list) +def lzc_list_children(name): + ''' + List the children of the ZFS dataset. + + :param bytes name: the name of the dataset. + :return: an iterator that produces the names of the children. + :raises NameInvalid: if the dataset name is invalid. + :raises NameTooLong: if the dataset name is too long. + :raises DatasetNotFound: if the dataset does not exist. + + .. warning:: + If the dataset does not exist, then the returned iterator would produce + no results and no error is reported. + That case is indistinguishable from the dataset having no children. + + An attempt to list children of a snapshot is silently ignored as well. + ''' + children = [] + for entry in _list(name, recurse=1, types=['filesystem', 'volume']): + child = entry['name'] + if child != name: + children.append(child) + + return iter(children) + + +@_uncommitted(lzc_list) +def lzc_list_snaps(name): + ''' + List the snapshots of the ZFS dataset. + + :param bytes name: the name of the dataset. + :return: an iterator that produces the names of the snapshots. + :raises NameInvalid: if the dataset name is invalid. + :raises NameTooLong: if the dataset name is too long. + :raises DatasetNotFound: if the dataset does not exist. + + .. warning:: + If the dataset does not exist, then the returned iterator would produce + no results and no error is reported. + That case is indistinguishable from the dataset having no snapshots. + + An attempt to list snapshots of a snapshot is silently ignored as well. 
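+
+    A minimal usage sketch (``b'pool/fs'`` is a hypothetical dataset name)::
+
+        for snap in lzc_list_snaps(b'pool/fs'):
+            print(snap)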
+ ''' + snaps = [] + for entry in _list(name, recurse=1, types=['snapshot']): + snap = entry['name'] + if snap != name: + snaps.append(snap) + + return iter(snaps) + + +# TODO: a better way to init and uninit the library +def _initialize(): + class LazyInit(object): + + def __init__(self, lib): + self._lib = lib + self._inited = False + self._lock = threading.Lock() + + def __getattr__(self, name): + if not self._inited: + with self._lock: + if not self._inited: + ret = self._lib.libzfs_core_init() + if ret != 0: + raise exceptions.ZFSInitializationFailed(ret) + self._inited = True + return getattr(self._lib, name) + + return LazyInit(libzfs_core.lib) + + +_ffi = libzfs_core.ffi +_lib = _initialize() + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/_nvlist.py b/contrib/pyzfs/libzfs_core/_nvlist.py new file mode 100644 index 000000000000..dc6d820bdea3 --- /dev/null +++ b/contrib/pyzfs/libzfs_core/_nvlist.py @@ -0,0 +1,295 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +nvlist_in and nvlist_out provide support for converting between +a dictionary on the Python side and an nvlist_t on the C side +with the automatic memory management for C memory allocations. + +nvlist_in takes a dictionary and produces a CData object corresponding +to a C nvlist_t pointer suitable for passing as an input parameter. +The nvlist_t is populated based on the dictionary. + +nvlist_out takes a dictionary and produces a CData object corresponding +to a C nvlist_t pointer to pointer suitable for passing as an output parameter. +Upon exit from a with-block the dictionary is populated based on the nvlist_t. + +The dictionary must follow a certain format to be convertible +to the nvlist_t. The dictionary produced from the nvlist_t +will follow the same format. + +Format: +- keys are always byte strings +- a value can be None in which case it represents boolean truth by its mere + presence +- a value can be a bool +- a value can be a byte string +- a value can be an integer +- a value can be a CFFI CData object representing one of the following C types: + int8_t, uint8_t, int16_t, uint16_t, int32_t, uint32_t, int64_t, uint64_t, + boolean_t, uchar_t +- a value can be a dictionary that recursively adheres to this format +- a value can be a list of bools, byte strings, integers or CData objects of + types specified above +- a value can be a list of dictionaries that adhere to this format +- all elements of a list value must be of the same type +""" +from __future__ import absolute_import, division, print_function + +import numbers +from collections import namedtuple +from contextlib import contextmanager +from .bindings import libnvpair +from .ctypes import _type_to_suffix + +_ffi = libnvpair.ffi +_lib = libnvpair.lib + + +def nvlist_in(props): + """ + This function converts a python dictionary to a C nvlist_t + and provides automatic memory management for the latter. + + :param dict props: the dictionary to be converted. 
+    :return: an FFI CData object representing the nvlist_t pointer.
+    :rtype: CData
+    """
+    nvlistp = _ffi.new("nvlist_t **")
+    res = _lib.nvlist_alloc(nvlistp, 1, 0)  # UNIQUE_NAME == 1
+    if res != 0:
+        raise MemoryError('nvlist_alloc failed')
+    nvlist = _ffi.gc(nvlistp[0], _lib.nvlist_free)
+    _dict_to_nvlist(props, nvlist)
+    return nvlist
+
+
+@contextmanager
+def nvlist_out(props):
+    """
+    A context manager that allocates a pointer to a C nvlist_t and yields
+    a CData object representing a pointer to the pointer via the 'as' target.
+    The caller can pass that pointer-to-pointer to a C function that
+    creates a new nvlist_t object.
+    The context manager takes care of memory management for the nvlist_t
+    and also populates the 'props' dictionary with data from the nvlist_t
+    upon leaving the 'with' block.
+
+    :param dict props: the dictionary to be populated with data from the
+        nvlist.
+    :return: an FFI CData object representing the pointer to nvlist_t pointer.
+    :rtype: CData
+    """
+    nvlistp = _ffi.new("nvlist_t **")
+    nvlistp[0] = _ffi.NULL  # to be sure
+    try:
+        yield nvlistp
+        # clear old entries, if any
+        props.clear()
+        _nvlist_to_dict(nvlistp[0], props)
+    finally:
+        if nvlistp[0] != _ffi.NULL:
+            _lib.nvlist_free(nvlistp[0])
+            nvlistp[0] = _ffi.NULL
+
+
+def packed_nvlist_out(packed_nvlist, packed_size):
+    """
+    This function converts a packed C nvlist_t to a Python dictionary and
+    provides automatic memory management for the former.
+
+    :param bytes packed_nvlist: packed nvlist_t.
+    :param int packed_size: nvlist_t packed size.
+    :return: a `dict` of values representing the data contained by nvlist_t.
+    :rtype: dict
+    """
+    props = {}
+    with nvlist_out(props) as nvp:
+        ret = _lib.nvlist_unpack(packed_nvlist, packed_size, nvp, 0)
+        if ret != 0:
+            raise MemoryError('nvlist_unpack failed')
+    return props
+
+
+_TypeInfo = namedtuple('_TypeInfo', ['suffix', 'ctype', 'is_array', 'convert'])
+
+
+def _type_info(typeid):
+    return {
+        _lib.DATA_TYPE_BOOLEAN: _TypeInfo(None, None, None, None),
+        _lib.DATA_TYPE_BOOLEAN_VALUE: _TypeInfo("boolean_value", "boolean_t *", False, bool),  # noqa: E501
+        _lib.DATA_TYPE_BYTE: _TypeInfo("byte", "uchar_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_INT8: _TypeInfo("int8", "int8_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_UINT8: _TypeInfo("uint8", "uint8_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_INT16: _TypeInfo("int16", "int16_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_UINT16: _TypeInfo("uint16", "uint16_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_INT32: _TypeInfo("int32", "int32_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_UINT32: _TypeInfo("uint32", "uint32_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_INT64: _TypeInfo("int64", "int64_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_UINT64: _TypeInfo("uint64", "uint64_t *", False, int),  # noqa: E501
+        _lib.DATA_TYPE_STRING: _TypeInfo("string", "char **", False, _ffi.string),  # noqa: E501
+        _lib.DATA_TYPE_NVLIST: _TypeInfo("nvlist", "nvlist_t **", False, lambda x: _nvlist_to_dict(x, {})),  # noqa: E501
+        _lib.DATA_TYPE_BOOLEAN_ARRAY: _TypeInfo("boolean_array", "boolean_t **", True, bool),  # noqa: E501
+        # XXX use bytearray ?
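+        # For array types, _nvlist_to_dict applies 'convert' to each element
+        # in turn, so a C array always comes back as a Python list of
+        # converted values.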
+ _lib.DATA_TYPE_BYTE_ARRAY: _TypeInfo("byte_array", "uchar_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_INT8_ARRAY: _TypeInfo("int8_array", "int8_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_UINT8_ARRAY: _TypeInfo("uint8_array", "uint8_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_INT16_ARRAY: _TypeInfo("int16_array", "int16_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_UINT16_ARRAY: _TypeInfo("uint16_array", "uint16_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_INT32_ARRAY: _TypeInfo("int32_array", "int32_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_UINT32_ARRAY: _TypeInfo("uint32_array", "uint32_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_INT64_ARRAY: _TypeInfo("int64_array", "int64_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_UINT64_ARRAY: _TypeInfo("uint64_array", "uint64_t **", True, int), # noqa: E501 + _lib.DATA_TYPE_STRING_ARRAY: _TypeInfo("string_array", "char ***", True, _ffi.string), # noqa: E501 + _lib.DATA_TYPE_NVLIST_ARRAY: _TypeInfo("nvlist_array", "nvlist_t ***", True, lambda x: _nvlist_to_dict(x, {})), # noqa: E501 + }[typeid] + + +# only integer properties need to be here +_prop_name_to_type_str = { + b"rewind-request": "uint32", + b"type": "uint32", + b"N_MORE_ERRORS": "int32", + b"pool_context": "int32", +} + + +def _nvlist_add_array(nvlist, key, array): + def _is_integer(x): + return isinstance(x, numbers.Integral) and not isinstance(x, bool) + + ret = 0 + specimen = array[0] + is_integer = _is_integer(specimen) + specimen_ctype = None + if isinstance(specimen, _ffi.CData): + specimen_ctype = _ffi.typeof(specimen) + + for element in array[1:]: + if is_integer and _is_integer(element): + pass + elif type(element) is not type(specimen): + raise TypeError('Array has elements of different types: ' + + type(specimen).__name__ + + ' and ' + + type(element).__name__) + elif specimen_ctype is not None: + ctype = _ffi.typeof(element) + if ctype is not specimen_ctype: + raise TypeError('Array has elements of different C types: ' + + _ffi.typeof(specimen).cname + + ' and ' + + _ffi.typeof(element).cname) + + if isinstance(specimen, dict): + # NB: can't use automatic memory management via nvlist_in() here, + # we have a loop, but 'with' would require recursion + c_array = [] + for dictionary in array: + nvlistp = _ffi.new('nvlist_t **') + res = _lib.nvlist_alloc(nvlistp, 1, 0) # UNIQUE_NAME == 1 + if res != 0: + raise MemoryError('nvlist_alloc failed') + nested_nvlist = _ffi.gc(nvlistp[0], _lib.nvlist_free) + _dict_to_nvlist(dictionary, nested_nvlist) + c_array.append(nested_nvlist) + ret = _lib.nvlist_add_nvlist_array(nvlist, key, c_array, len(c_array)) + elif isinstance(specimen, bytes): + c_array = [] + for string in array: + c_array.append(_ffi.new('char[]', string)) + ret = _lib.nvlist_add_string_array(nvlist, key, c_array, len(c_array)) + elif isinstance(specimen, bool): + ret = _lib.nvlist_add_boolean_array(nvlist, key, array, len(array)) + elif isinstance(specimen, numbers.Integral): + suffix = _prop_name_to_type_str.get(key, "uint64") + cfunc = getattr(_lib, "nvlist_add_%s_array" % (suffix,)) + ret = cfunc(nvlist, key, array, len(array)) + elif isinstance( + specimen, _ffi.CData) and _ffi.typeof(specimen) in _type_to_suffix: + suffix = _type_to_suffix[_ffi.typeof(specimen)][True] + cfunc = getattr(_lib, "nvlist_add_%s_array" % (suffix,)) + ret = cfunc(nvlist, key, array, len(array)) + else: + raise TypeError('Unsupported value type ' + type(specimen).__name__) + if ret != 0: + raise MemoryError('nvlist_add failed, err = %d' % ret) + + +def 
_nvlist_to_dict(nvlist, props): + pair = _lib.nvlist_next_nvpair(nvlist, _ffi.NULL) + while pair != _ffi.NULL: + name = _ffi.string(_lib.nvpair_name(pair)) + typeid = int(_lib.nvpair_type(pair)) + typeinfo = _type_info(typeid) + is_array = bool(_lib.nvpair_type_is_array(pair)) + cfunc = getattr(_lib, "nvpair_value_%s" % (typeinfo.suffix,), None) + val = None + ret = 0 + if is_array: + valptr = _ffi.new(typeinfo.ctype) + lenptr = _ffi.new("uint_t *") + ret = cfunc(pair, valptr, lenptr) + if ret != 0: + raise RuntimeError('nvpair_value failed') + length = int(lenptr[0]) + val = [] + for i in range(length): + val.append(typeinfo.convert(valptr[0][i])) + else: + if typeid == _lib.DATA_TYPE_BOOLEAN: + val = None # XXX or should it be True ? + else: + valptr = _ffi.new(typeinfo.ctype) + ret = cfunc(pair, valptr) + if ret != 0: + raise RuntimeError('nvpair_value failed') + val = typeinfo.convert(valptr[0]) + props[name] = val + pair = _lib.nvlist_next_nvpair(nvlist, pair) + return props + + +def _dict_to_nvlist(props, nvlist): + for k, v in props.items(): + if not isinstance(k, bytes): + raise TypeError('Unsupported key type ' + type(k).__name__) + ret = 0 + if isinstance(v, dict): + ret = _lib.nvlist_add_nvlist(nvlist, k, nvlist_in(v)) + elif isinstance(v, list): + _nvlist_add_array(nvlist, k, v) + elif isinstance(v, bytes): + ret = _lib.nvlist_add_string(nvlist, k, v) + elif isinstance(v, bool): + ret = _lib.nvlist_add_boolean_value(nvlist, k, v) + elif v is None: + ret = _lib.nvlist_add_boolean(nvlist, k) + elif isinstance(v, numbers.Integral): + suffix = _prop_name_to_type_str.get(k, "uint64") + cfunc = getattr(_lib, "nvlist_add_%s" % (suffix,)) + ret = cfunc(nvlist, k, v) + elif isinstance(v, _ffi.CData) and _ffi.typeof(v) in _type_to_suffix: + suffix = _type_to_suffix[_ffi.typeof(v)][False] + cfunc = getattr(_lib, "nvlist_add_%s" % (suffix,)) + ret = cfunc(nvlist, k, v) + else: + raise TypeError('Unsupported value type ' + type(v).__name__) + if ret != 0: + raise MemoryError('nvlist_add failed') + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/bindings/__init__.py b/contrib/pyzfs/libzfs_core/bindings/__init__.py new file mode 100644 index 000000000000..4bdd9ea3115e --- /dev/null +++ b/contrib/pyzfs/libzfs_core/bindings/__init__.py @@ -0,0 +1,60 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +The package that contains a module per each C library that +`libzfs_core` uses. The modules expose CFFI objects required +to make calls to functions in the libraries. 
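+
+A typical access pattern (a sketch; the ``ffi`` and ``lib`` attributes are
+attached to each module by ``_setup_cffi`` below, and the library is only
+``dlopen``-ed on first use)::
+
+    from .bindings import libnvpair
+
+    nvlistp = libnvpair.ffi.new("nvlist_t **")
+    ret = libnvpair.lib.nvlist_alloc(nvlistp, 1, 0)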
+""" +from __future__ import absolute_import, division, print_function + +import threading +import importlib + +from cffi import FFI + + +def _setup_cffi(): + class LazyLibrary(object): + + def __init__(self, ffi, libname): + self._ffi = ffi + self._libname = libname + self._lib = None + self._lock = threading.Lock() + + def __getattr__(self, name): + if self._lib is None: + with self._lock: + if self._lib is None: + self._lib = self._ffi.dlopen(self._libname) + + return getattr(self._lib, name) + + MODULES = ["libnvpair", "libzfs_core"] + ffi = FFI() + + for module_name in MODULES: + module = importlib.import_module("." + module_name, __name__) + ffi.cdef(module.CDEF) + lib = LazyLibrary(ffi, module.LIBRARY) + setattr(module, "ffi", ffi) + setattr(module, "lib", lib) + + +_setup_cffi() + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/bindings/libnvpair.py b/contrib/pyzfs/libzfs_core/bindings/libnvpair.py new file mode 100644 index 000000000000..3cd72d4908de --- /dev/null +++ b/contrib/pyzfs/libzfs_core/bindings/libnvpair.py @@ -0,0 +1,134 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Python bindings for ``libnvpair``. +""" +from __future__ import absolute_import, division, print_function + +CDEF = """ + typedef ... nvlist_t; + typedef ... 
nvpair_t; + + + typedef enum { + DATA_TYPE_UNKNOWN = 0, + DATA_TYPE_BOOLEAN, + DATA_TYPE_BYTE, + DATA_TYPE_INT16, + DATA_TYPE_UINT16, + DATA_TYPE_INT32, + DATA_TYPE_UINT32, + DATA_TYPE_INT64, + DATA_TYPE_UINT64, + DATA_TYPE_STRING, + DATA_TYPE_BYTE_ARRAY, + DATA_TYPE_INT16_ARRAY, + DATA_TYPE_UINT16_ARRAY, + DATA_TYPE_INT32_ARRAY, + DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_INT64_ARRAY, + DATA_TYPE_UINT64_ARRAY, + DATA_TYPE_STRING_ARRAY, + DATA_TYPE_HRTIME, + DATA_TYPE_NVLIST, + DATA_TYPE_NVLIST_ARRAY, + DATA_TYPE_BOOLEAN_VALUE, + DATA_TYPE_INT8, + DATA_TYPE_UINT8, + DATA_TYPE_BOOLEAN_ARRAY, + DATA_TYPE_INT8_ARRAY, + DATA_TYPE_UINT8_ARRAY + } data_type_t; + typedef enum { B_FALSE, B_TRUE } boolean_t; + + typedef unsigned char uchar_t; + typedef unsigned int uint_t; + + int nvlist_alloc(nvlist_t **, uint_t, int); + void nvlist_free(nvlist_t *); + + int nvlist_unpack(char *, size_t, nvlist_t **, int); + + void dump_nvlist(nvlist_t *, int); + int nvlist_dup(nvlist_t *, nvlist_t **, int); + + int nvlist_add_boolean(nvlist_t *, const char *); + int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); + int nvlist_add_byte(nvlist_t *, const char *, uchar_t); + int nvlist_add_int8(nvlist_t *, const char *, int8_t); + int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); + int nvlist_add_int16(nvlist_t *, const char *, int16_t); + int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); + int nvlist_add_int32(nvlist_t *, const char *, int32_t); + int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); + int nvlist_add_int64(nvlist_t *, const char *, int64_t); + int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); + int nvlist_add_string(nvlist_t *, const char *, const char *); + int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); + int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, + uint_t); + int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); + int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); + int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); + int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); + int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); + int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); + int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); + int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); + int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); + int nvlist_add_string_array(nvlist_t *, const char *, char *const *, + uint_t); + int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); + + nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); + nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); + char *nvpair_name(nvpair_t *); + data_type_t nvpair_type(nvpair_t *); + int nvpair_type_is_array(nvpair_t *); + int nvpair_value_boolean_value(nvpair_t *, boolean_t *); + int nvpair_value_byte(nvpair_t *, uchar_t *); + int nvpair_value_int8(nvpair_t *, int8_t *); + int nvpair_value_uint8(nvpair_t *, uint8_t *); + int nvpair_value_int16(nvpair_t *, int16_t *); + int nvpair_value_uint16(nvpair_t *, uint16_t *); + int nvpair_value_int32(nvpair_t *, int32_t *); + int nvpair_value_uint32(nvpair_t *, uint32_t *); + int nvpair_value_int64(nvpair_t *, int64_t *); + int nvpair_value_uint64(nvpair_t *, uint64_t *); + int nvpair_value_string(nvpair_t *, char **); + int nvpair_value_nvlist(nvpair_t *, nvlist_t **); + int 
nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *);
+    int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *);
+    int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *);
+    int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *);
+    int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *);
+    int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *);
+    int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *);
+    int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *);
+    int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *);
+    int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *);
+    int nvpair_value_string_array(nvpair_t *, char ***, uint_t *);
+    int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *);
+"""
+
+SOURCE = """
+#include <libnvpair.h>
+"""
+
+LIBRARY = "nvpair"
+
+# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4
diff --git a/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
new file mode 100644
index 000000000000..1b46a0891944
--- /dev/null
+++ b/contrib/pyzfs/libzfs_core/bindings/libzfs_core.py
@@ -0,0 +1,146 @@
+#
+# Copyright 2015 ClusterHQ
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+"""
+Python bindings for ``libzfs_core``.
+"""
+from __future__ import absolute_import, division, print_function
+
+CDEF = """
+
+    enum lzc_send_flags {
+        LZC_SEND_FLAG_EMBED_DATA = 1,
+        LZC_SEND_FLAG_LARGE_BLOCK = 2,
+        LZC_SEND_FLAG_COMPRESS = 4,
+        LZC_SEND_FLAG_RAW = 8
+    };
+
+    typedef enum {
+        DMU_OST_NONE,
+        DMU_OST_META,
+        DMU_OST_ZFS,
+        DMU_OST_ZVOL,
+        DMU_OST_OTHER,
+        DMU_OST_ANY,
+        DMU_OST_NUMTYPES
+    } dmu_objset_type_t;
+
+    #define MAXNAMELEN 256
+
+    struct drr_begin {
+        uint64_t drr_magic;
+        uint64_t drr_versioninfo; /* was drr_version */
+        uint64_t drr_creation_time;
+        dmu_objset_type_t drr_type;
+        uint32_t drr_flags;
+        uint64_t drr_toguid;
+        uint64_t drr_fromguid;
+        char drr_toname[MAXNAMELEN];
+    };
+
+    typedef struct zio_cksum {
+        uint64_t zc_word[4];
+    } zio_cksum_t;
+
+    typedef struct dmu_replay_record {
+        enum {
+            DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS,
+            DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF,
+            DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_NUMTYPES
+        } drr_type;
+        uint32_t drr_payloadlen;
+        union {
+            struct drr_begin drr_begin;
+            /* ... */
+            struct drr_checksum {
+                uint64_t drr_pad[34];
+                zio_cksum_t drr_checksum;
+            } drr_checksum;
+        } drr_u;
+    } dmu_replay_record_t;
+
+    typedef enum {
+        DCP_CMD_NONE,
+        DCP_CMD_RAW_RECV,
+        DCP_CMD_NEW_KEY,
+        DCP_CMD_INHERIT,
+        DCP_CMD_FORCE_NEW_KEY,
+        DCP_CMD_FORCE_INHERIT
+    } dcp_cmd_t;
+
+    int libzfs_core_init(void);
+    void libzfs_core_fini(void);
+
+    int lzc_bookmark(nvlist_t *, nvlist_t **);
+    int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t);
+    int lzc_channel_program(const char *, const char *, uint64_t, uint64_t,
+        nvlist_t *, nvlist_t **);
+    int lzc_channel_program_nosync(const char *, const char *, uint64_t,
+        uint64_t, nvlist_t *, nvlist_t **);
+    int lzc_clone(const char *, const char *, nvlist_t *);
+    int lzc_create(const char *, dmu_objset_type_t, nvlist_t *, uint8_t *,
+        uint_t);
+    int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **);
+    int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **);
+    boolean_t lzc_exists(const char *);
+    int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **);
+    int lzc_get_holds(const char *, nvlist_t **);
+    int lzc_hold(nvlist_t *, int, nvlist_t **);
+    int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t);
+    int lzc_promote(const char *, nvlist_t *, nvlist_t **);
+    int lzc_receive(const char *, nvlist_t *, const char *, boolean_t,
+        boolean_t, int);
+    int lzc_receive_one(const char *, nvlist_t *, const char *, boolean_t,
+        boolean_t, boolean_t, int, const dmu_replay_record_t *, int,
+        uint64_t *, uint64_t *, uint64_t *, nvlist_t **);
+    int lzc_receive_resumable(const char *, nvlist_t *, const char *,
+        boolean_t, boolean_t, int);
+    int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *,
+        uint8_t *, uint_t, const char *, boolean_t, boolean_t,
+        boolean_t, int, const dmu_replay_record_t *, int, uint64_t *,
+        uint64_t *, uint64_t *, nvlist_t **);
+    int lzc_receive_with_header(const char *, nvlist_t *, const char *,
+        boolean_t, boolean_t, boolean_t, int, const dmu_replay_record_t *);
+    int lzc_release(nvlist_t *, nvlist_t **);
+    int lzc_reopen(const char *, boolean_t);
+    int lzc_rollback(const char *, char *, int);
+    int lzc_rollback_to(const char *, const char *);
+    int lzc_send(const char *, const char *, int, enum lzc_send_flags);
+    int lzc_send_resume(const char *, const char *, int, enum lzc_send_flags,
+        uint64_t, uint64_t);
+    int lzc_send_space(const char *, const char *, enum lzc_send_flags,
+        uint64_t *);
+    int lzc_snaprange_space(const char *, const char *, uint64_t *);
+    int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **);
+    int lzc_sync(const char *, nvlist_t *, nvlist_t **);
+    int lzc_unload_key(const char *);
+    int lzc_pool_checkpoint(const char *);
+    int lzc_pool_checkpoint_discard(const char *);
+    int lzc_rename(const char *, const char *);
+    int lzc_destroy(const char *fsname);
+
+    int lzc_inherit(const char *fsname, const char *name, nvlist_t *);
+    int lzc_set_props(const char *, nvlist_t *, nvlist_t *, nvlist_t *);
+    int lzc_list(const char *, nvlist_t *);
+"""
+
+SOURCE = """
+#include <libzfs_core.h>
+"""
+
+LIBRARY = "zfs_core"
+
+# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4
diff --git a/contrib/pyzfs/libzfs_core/ctypes.py b/contrib/pyzfs/libzfs_core/ctypes.py
new file mode 100644
index 000000000000..d337f46ed643
--- /dev/null
+++ b/contrib/pyzfs/libzfs_core/ctypes.py
@@ -0,0 +1,71 @@
+#
+# Copyright 2015 ClusterHQ
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Utility functions for casting to a specific C type. +""" +from __future__ import absolute_import, division, print_function + +from .bindings.libnvpair import ffi as _ffi + + +def _ffi_cast(type_name): + type_info = _ffi.typeof(type_name) + + def _func(value): + # this is for overflow / underflow checking only + if type_info.kind == 'enum': + try: + type_info.elements[value] + except KeyError as e: + raise OverflowError('Invalid enum <%s> value %s: %s' % + (type_info.cname, value, e)) + else: + _ffi.new(type_name + '*', value) + return _ffi.cast(type_name, value) + _func.__name__ = type_name + return _func + + +uint8_t = _ffi_cast('uint8_t') +int8_t = _ffi_cast('int8_t') +uint16_t = _ffi_cast('uint16_t') +int16_t = _ffi_cast('int16_t') +uint32_t = _ffi_cast('uint32_t') +int32_t = _ffi_cast('int32_t') +uint64_t = _ffi_cast('uint64_t') +int64_t = _ffi_cast('int64_t') +boolean_t = _ffi_cast('boolean_t') +uchar_t = _ffi_cast('uchar_t') + + +# First element of the value tuple is a suffix for a single value function +# while the second element is for an array function +_type_to_suffix = { + _ffi.typeof('uint8_t'): ('uint8', 'uint8'), + _ffi.typeof('int8_t'): ('int8', 'int8'), + _ffi.typeof('uint16_t'): ('uint16', 'uint16'), + _ffi.typeof('int16_t'): ('int16', 'int16'), + _ffi.typeof('uint32_t'): ('uint32', 'uint32'), + _ffi.typeof('int32_t'): ('int32', 'int32'), + _ffi.typeof('uint64_t'): ('uint64', 'uint64'), + _ffi.typeof('int64_t'): ('int64', 'int64'), + _ffi.typeof('boolean_t'): ('boolean_value', 'boolean'), + _ffi.typeof('uchar_t'): ('byte', 'byte'), +} + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/exceptions.py b/contrib/pyzfs/libzfs_core/exceptions.py new file mode 100644 index 000000000000..e484b07b6450 --- /dev/null +++ b/contrib/pyzfs/libzfs_core/exceptions.py @@ -0,0 +1,601 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Exceptions that can be raised by libzfs_core operations. 
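+
+Most exceptions derive from :exc:`ZFSError` and carry an ``errno`` value and
+a message; operations on multiple entities raise a
+:exc:`MultipleOperationsFailure` subclass whose ``errors`` attribute holds
+the individual failures. A sketch of the intended handling (the imports and
+the dataset name are illustrative)::
+
+    from libzfs_core import lzc_snapshot
+    from libzfs_core.exceptions import SnapshotFailure
+
+    try:
+        lzc_snapshot([b'pool/fs@snap'])
+    except SnapshotFailure as e:
+        for err in e.errors:
+            print(err.errno, err.name)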
+""" +from __future__ import absolute_import, division, print_function + +import errno +from ._constants import ( + ECHRNG, + ECKSUM, + ETIME, + ZFS_ERR_CHECKPOINT_EXISTS, + ZFS_ERR_DISCARDING_CHECKPOINT, + ZFS_ERR_NO_CHECKPOINT, + ZFS_ERR_DEVRM_IN_PROGRESS, + ZFS_ERR_VDEV_TOO_BIG, + ZFS_ERR_WRONG_PARENT, + zfs_errno +) + + +class ZFSError(Exception): + errno = None + message = None + name = None + + def __str__(self): + if self.name is not None: + return "[Errno %d] %s: '%s'" % ( + self.errno, self.message, self.name) + else: + return "[Errno %d] %s" % (self.errno, self.message) + + def __repr__(self): + return "%s(%r, %r)" % ( + self.__class__.__name__, self.errno, self.message) + + +class ZFSGenericError(ZFSError): + + def __init__(self, errno, name, message): + self.errno = errno + self.message = message + self.name = name + + +class ZFSInitializationFailed(ZFSError): + message = "Failed to initialize libzfs_core" + + def __init__(self, errno): + self.errno = errno + + +class MultipleOperationsFailure(ZFSError): + + def __init__(self, errors, suppressed_count): + # Use first of the individual error codes + # as an overall error code. This is more consistent. + self.errno = errors[0].errno + self.errors = errors + # this many errors were encountered but not placed on the `errors` list + self.suppressed_count = suppressed_count + + def __str__(self): + return "%s, %d errors included, %d suppressed" % ( + ZFSError.__str__(self), len(self.errors), self.suppressed_count) + + def __repr__(self): + return "%s(%r, %r, errors=%r, suppressed=%r)" % ( + self.__class__.__name__, self.errno, self.message, self.errors, + self.suppressed_count) + + +class DatasetNotFound(ZFSError): + + """ + This exception is raised when an operation failure can be caused by a + missing snapshot or a missing filesystem and it is impossible to + distinguish between the causes. + """ + errno = errno.ENOENT + message = "Dataset not found" + + def __init__(self, name): + self.name = name + + +class DatasetExists(ZFSError): + + """ + This exception is raised when an operation failure can be caused by an + existing snapshot or filesystem and it is impossible to distinguish between + the causes. 
+ """ + errno = errno.EEXIST + message = "Dataset already exists" + + def __init__(self, name): + self.name = name + + +class NotClone(ZFSError): + errno = errno.EINVAL + message = "Filesystem is not a clone, can not promote" + + def __init__(self, name): + self.name = name + + +class FilesystemExists(DatasetExists): + message = "Filesystem already exists" + + def __init__(self, name): + self.name = name + + +class FilesystemNotFound(DatasetNotFound): + message = "Filesystem not found" + + def __init__(self, name): + self.name = name + + +class ParentNotFound(ZFSError): + errno = errno.ENOENT + message = "Parent not found" + + def __init__(self, name): + self.name = name + + +class WrongParent(ZFSError): + errno = ZFS_ERR_WRONG_PARENT + message = "Parent dataset is not a filesystem" + + def __init__(self, name): + self.name = name + + +class SnapshotExists(DatasetExists): + message = "Snapshot already exists" + + def __init__(self, name): + self.name = name + + +class SnapshotNotFound(DatasetNotFound): + message = "Snapshot not found" + + def __init__(self, name): + self.name = name + + +class SnapshotNotLatest(ZFSError): + errno = errno.EEXIST + message = "Snapshot is not the latest" + + def __init__(self, name): + self.name = name + + +class SnapshotIsCloned(ZFSError): + errno = errno.EEXIST + message = "Snapshot is cloned" + + def __init__(self, name): + self.name = name + + +class SnapshotIsHeld(ZFSError): + errno = errno.EBUSY + message = "Snapshot is held" + + def __init__(self, name): + self.name = name + + +class DuplicateSnapshots(ZFSError): + errno = errno.EXDEV + message = "Requested multiple snapshots of the same filesystem" + + def __init__(self, name): + self.name = name + + +class SnapshotFailure(MultipleOperationsFailure): + message = "Creation of snapshot(s) failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(SnapshotFailure, self).__init__(errors, suppressed_count) + + +class SnapshotDestructionFailure(MultipleOperationsFailure): + message = "Destruction of snapshot(s) failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(SnapshotDestructionFailure, self).__init__( + errors, suppressed_count) + + +class BookmarkExists(ZFSError): + errno = errno.EEXIST + message = "Bookmark already exists" + + def __init__(self, name): + self.name = name + + +class BookmarkNotFound(ZFSError): + errno = errno.ENOENT + message = "Bookmark not found" + + def __init__(self, name): + self.name = name + + +class BookmarkMismatch(ZFSError): + errno = errno.EINVAL + message = "source is not an ancestor of the new bookmark's dataset" + + def __init__(self, name): + self.name = name + + +class BookmarkSourceInvalid(ZFSError): + errno = errno.EINVAL + message = "Bookmark source is not a valid snapshot or existing bookmark" + + def __init__(self, name): + self.name = name + + +class BookmarkNotSupported(ZFSError): + errno = errno.ENOTSUP + message = "Bookmark feature is not supported" + + def __init__(self, name): + self.name = name + + +class BookmarkFailure(MultipleOperationsFailure): + message = "Creation of bookmark(s) failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(BookmarkFailure, self).__init__(errors, suppressed_count) + + +class BookmarkDestructionFailure(MultipleOperationsFailure): + message = "Destruction of bookmark(s) failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(BookmarkDestructionFailure, self).__init__( + errors, 
suppressed_count) + + +class BadHoldCleanupFD(ZFSError): + errno = errno.EBADF + message = "Bad file descriptor as cleanup file descriptor" + + +class HoldExists(ZFSError): + errno = errno.EEXIST + message = "Hold with a given tag already exists on snapshot" + + def __init__(self, name): + self.name = name + + +class HoldNotFound(ZFSError): + errno = errno.ENOENT + message = "Hold with a given tag does not exist on snapshot" + + def __init__(self, name): + self.name = name + + +class HoldFailure(MultipleOperationsFailure): + message = "Placement of hold(s) failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(HoldFailure, self).__init__(errors, suppressed_count) + + +class HoldReleaseFailure(MultipleOperationsFailure): + message = "Release of hold(s) failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(HoldReleaseFailure, self).__init__(errors, suppressed_count) + + +class SnapshotMismatch(ZFSError): + errno = errno.ENODEV + message = "Snapshot is not descendant of source snapshot" + + def __init__(self, name): + self.name = name + + +class StreamMismatch(ZFSError): + errno = errno.ENODEV + message = "Stream is not applicable to destination dataset" + + def __init__(self, name): + self.name = name + + +class DestinationModified(ZFSError): + errno = errno.ETXTBSY + message = "Destination dataset has modifications that can not be undone" + + def __init__(self, name): + self.name = name + + +class BadStream(ZFSError): + errno = ECKSUM + message = "Bad backup stream" + + +class StreamFeatureNotSupported(ZFSError): + errno = errno.ENOTSUP + message = "Stream contains unsupported feature" + + +class UnknownStreamFeature(ZFSError): + errno = errno.ENOTSUP + message = "Unknown feature requested for stream" + + +class StreamFeatureInvalid(ZFSError): + errno = errno.EINVAL + message = "Kernel modules must be upgraded to receive this stream" + + +class StreamFeatureIncompatible(ZFSError): + errno = errno.EINVAL + message = "Incompatible embedded feature with encrypted receive" + + +class StreamTruncated(ZFSError): + errno = zfs_errno.ZFS_ERR_STREAM_TRUNCATED + message = "incomplete stream" + + +class ReceivePropertyFailure(MultipleOperationsFailure): + message = "Receiving of properties failed for one or more reasons" + + def __init__(self, errors, suppressed_count): + super(ReceivePropertyFailure, self).__init__(errors, suppressed_count) + + +class StreamIOError(ZFSError): + message = "I/O error while writing or reading stream" + + def __init__(self, errno): + self.errno = errno + + +class ZIOError(ZFSError): + errno = errno.EIO + message = "I/O error" + + def __init__(self, name): + self.name = name + + +class NoSpace(ZFSError): + errno = errno.ENOSPC + message = "No space left" + + def __init__(self, name): + self.name = name + + +class QuotaExceeded(ZFSError): + errno = errno.EDQUOT + message = "Quota exceeded" + + def __init__(self, name): + self.name = name + + +class DatasetBusy(ZFSError): + errno = errno.EBUSY + message = "Dataset is busy" + + def __init__(self, name): + self.name = name + + +class NameTooLong(ZFSError): + errno = errno.ENAMETOOLONG + message = "Dataset name is too long" + + def __init__(self, name): + self.name = name + + +class NameInvalid(ZFSError): + errno = errno.EINVAL + message = "Invalid name" + + def __init__(self, name): + self.name = name + + +class SnapshotNameInvalid(NameInvalid): + message = "Invalid name for snapshot" + + def __init__(self, name): + self.name = name + + +class 
FilesystemNameInvalid(NameInvalid): + message = "Invalid name for filesystem or volume" + + def __init__(self, name): + self.name = name + + +class BookmarkNameInvalid(NameInvalid): + message = "Invalid name for bookmark" + + def __init__(self, name): + self.name = name + + +class ReadOnlyPool(ZFSError): + errno = errno.EROFS + message = "Pool is read-only" + + def __init__(self, name): + self.name = name + + +class SuspendedPool(ZFSError): + errno = errno.EAGAIN + message = "Pool is suspended" + + def __init__(self, name): + self.name = name + + +class PoolNotFound(ZFSError): + errno = errno.EXDEV + message = "No such pool" + + def __init__(self, name): + self.name = name + + +class PoolsDiffer(ZFSError): + errno = errno.EXDEV + message = "Source and target belong to different pools" + + def __init__(self, name): + self.name = name + + +class FeatureNotSupported(ZFSError): + errno = errno.ENOTSUP + message = "Feature is not supported in this version" + + def __init__(self, name): + self.name = name + + +class PropertyNotSupported(ZFSError): + errno = errno.ENOTSUP + message = "Property is not supported in this version" + + def __init__(self, name): + self.name = name + + +class PropertyInvalid(ZFSError): + errno = errno.EINVAL + message = "Invalid property or property value" + + def __init__(self, name): + self.name = name + + +class DatasetTypeInvalid(ZFSError): + errno = errno.EINVAL + message = "Specified dataset type is unknown" + + def __init__(self, name): + self.name = name + + +class UnknownCryptCommand(ZFSError): + errno = errno.EINVAL + message = "Specified crypt command is invalid" + + def __init__(self, name): + self.name = name + + +class EncryptionKeyNotLoaded(ZFSError): + errno = errno.EACCES + message = "Encryption key is not currently loaded" + + +class EncryptionKeyAlreadyLoaded(ZFSError): + errno = errno.EEXIST + message = "Encryption key is already loaded" + + +class EncryptionKeyInvalid(ZFSError): + errno = errno.EACCES + message = "Incorrect encryption key provided" + + +class ZCPError(ZFSError): + errno = None + message = None + + +class ZCPSyntaxError(ZCPError): + errno = errno.EINVAL + message = "Channel program contains syntax errors" + + def __init__(self, details): + self.details = details + + +class ZCPRuntimeError(ZCPError): + errno = ECHRNG + message = "Channel programs encountered a runtime error" + + def __init__(self, details): + self.details = details + + +class ZCPLimitInvalid(ZCPError): + errno = errno.EINVAL + message = "Channel program called with invalid limits" + + +class ZCPTimeout(ZCPError): + errno = ETIME + message = "Channel program timed out" + + +class ZCPSpaceError(ZCPError): + errno = errno.ENOSPC + message = "Channel program exhausted the memory limit" + + +class ZCPMemoryError(ZCPError): + errno = errno.ENOMEM + message = "Channel program return value too large" + + +class ZCPPermissionError(ZCPError): + errno = errno.EPERM + message = "Channel programs must be run as root" + + +class CheckpointExists(ZFSError): + errno = ZFS_ERR_CHECKPOINT_EXISTS + message = "Pool already has a checkpoint" + + +class CheckpointNotFound(ZFSError): + errno = ZFS_ERR_NO_CHECKPOINT + message = "Pool does not have a checkpoint" + + +class CheckpointDiscarding(ZFSError): + errno = ZFS_ERR_DISCARDING_CHECKPOINT + message = "Pool checkpoint is being discarded" + + +class DeviceRemovalRunning(ZFSError): + errno = ZFS_ERR_DEVRM_IN_PROGRESS + message = "A vdev is currently being removed" + + +class DeviceTooBig(ZFSError): + errno = ZFS_ERR_VDEV_TOO_BIG + message = 
"One or more top-level vdevs exceed the maximum vdev size" + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/test/__init__.py b/contrib/pyzfs/libzfs_core/test/__init__.py new file mode 100644 index 000000000000..617cef7aa2b4 --- /dev/null +++ b/contrib/pyzfs/libzfs_core/test/__init__.py @@ -0,0 +1 @@ +# `make distclean` deletes files with size 0. This comment is to avoid that. diff --git a/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py new file mode 100644 index 000000000000..a841f96af36f --- /dev/null +++ b/contrib/pyzfs/libzfs_core/test/test_libzfs_core.py @@ -0,0 +1,4380 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Tests for `libzfs_core` operations. + +These are mostly functional and conformance tests that validate +that the operations produce expected effects or fail with expected +exceptions. +""" +from __future__ import absolute_import, division, print_function + +import unittest +import contextlib +import errno +import filecmp +import os +import platform +import resource +import shutil +import stat +import subprocess +import sys +import tempfile +import time +import uuid +import itertools +import zlib +from .. import _libzfs_core as lzc +from .. import exceptions as lzc_exc +from .._nvlist import packed_nvlist_out + + +def _print(*args): + for arg in args: + print(arg, end=' ') + print() + + +@contextlib.contextmanager +def suppress(exceptions=None): + try: + yield + except BaseException as e: + if exceptions is None or isinstance(e, exceptions): + pass + else: + raise + + +@contextlib.contextmanager +def _zfs_mount(fs): + mntdir = tempfile.mkdtemp() + if platform.system() == 'SunOS': + mount_cmd = ['mount', '-F', 'zfs', fs, mntdir] + else: + mount_cmd = ['mount', '-t', 'zfs', fs, mntdir] + unmount_cmd = ['umount', '-f', mntdir] + + try: + subprocess.check_output(mount_cmd, stderr=subprocess.STDOUT) + try: + yield mntdir + finally: + with suppress(): + subprocess.check_output(unmount_cmd, stderr=subprocess.STDOUT) + except subprocess.CalledProcessError as e: + print('failed to mount %s @ %s : %s' % (fs, mntdir, e.output)) + raise + finally: + os.rmdir(mntdir) + + +# XXX On illumos it is impossible to explicitly mount a snapshot. +# So, either we need to implicitly mount it using .zfs/snapshot/ +# or we need to create a clone and mount it readonly (and discard +# it afterwards). +# At the moment the former approach is implemented. + +# This dictionary is used to keep track of mounted filesystems +# (not snapshots), so that we do not try to mount a filesystem +# more than once in the case more than one snapshot of the +# filesystem is accessed from the same context or the filesystem +# and its snapshot are accessed. 
+_mnttab = {} + + +@contextlib.contextmanager +def _illumos_mount_fs(fs): + if fs in _mnttab: + yield _mnttab[fs] + else: + with _zfs_mount(fs) as mntdir: + _mnttab[fs] = mntdir + try: + yield mntdir + finally: + _mnttab.pop(fs, None) + + +@contextlib.contextmanager +def _illumos_mount_snap(fs): + (base, snap) = fs.split('@', 1) + with _illumos_mount_fs(base) as mntdir: + yield os.path.join(mntdir, '.zfs', 'snapshot', snap) + + +@contextlib.contextmanager +def _zfs_mount_illumos(fs): + if '@' not in fs: + with _illumos_mount_fs(fs) as mntdir: + yield mntdir + else: + with _illumos_mount_snap(fs) as mntdir: + yield mntdir + + +if platform.system() == 'SunOS': + zfs_mount = _zfs_mount_illumos +else: + zfs_mount = _zfs_mount + + +@contextlib.contextmanager +def cleanup_fd(): + fd = os.open('/dev/zfs', os.O_EXCL) + try: + yield fd + finally: + os.close(fd) + + +@contextlib.contextmanager +def os_open(name, mode): + fd = os.open(name, mode) + try: + yield fd + finally: + os.close(fd) + + +@contextlib.contextmanager +def dev_null(): + with os_open('/dev/null', os.O_WRONLY) as fd: + yield fd + + +@contextlib.contextmanager +def dev_zero(): + with os_open('/dev/zero', os.O_RDONLY) as fd: + yield fd + + +@contextlib.contextmanager +def temp_file_in_fs(fs): + with zfs_mount(fs) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + yield f.name + + +def make_snapshots(fs, before, modified, after): + def _maybe_snap(snap): + if snap is not None: + if not snap.startswith(fs): + snap = fs + b'@' + snap + lzc.lzc_snapshot([snap]) + return snap + + before = _maybe_snap(before) + with temp_file_in_fs(fs) as name: + modified = _maybe_snap(modified) + after = _maybe_snap(after) + + return (name, (before, modified, after)) + + +@contextlib.contextmanager +def streams(fs, first, second): + (filename, snaps) = make_snapshots(fs, None, first, second) + with tempfile.TemporaryFile(suffix='.zstream') as full: + lzc.lzc_send(snaps[1], None, full.fileno()) + full.seek(0) + if snaps[2] is not None: + with tempfile.TemporaryFile(suffix='.zstream') as incremental: + lzc.lzc_send(snaps[2], snaps[1], incremental.fileno()) + incremental.seek(0) + yield (filename, (full, incremental)) + else: + yield (filename, (full, None)) + + +@contextlib.contextmanager +def encrypted_filesystem(): + fs = ZFSTest.pool.getFilesystem(b"encrypted") + name = fs.getName() + filename = None + key = os.urandom(lzc.WRAPPING_KEY_LEN) + with tempfile.NamedTemporaryFile() as f: + filename = "file://" + f.name + props = { + b"encryption": lzc.zio_encrypt.ZIO_CRYPT_AES_256_CCM, + b"keylocation": filename.encode(), + b"keyformat": lzc.zfs_keyformat.ZFS_KEYFORMAT_RAW, + } + lzc.lzc_create(name, 'zfs', props=props, key=key) + yield (name, key) + + +def runtimeSkipIf(check_method, message): + def _decorator(f): + def _f(_self, *args, **kwargs): + if check_method(_self): + return _self.skipTest(message) + else: + return f(_self, *args, **kwargs) + _f.__name__ = f.__name__ + return _f + return _decorator + + +def skipIfFeatureAvailable(feature, message): + return runtimeSkipIf( + lambda _self: _self.__class__.pool.isPoolFeatureAvailable(feature), + message) + + +def skipUnlessFeatureEnabled(feature, message): + return runtimeSkipIf( + lambda _self: not _self.__class__.pool.isPoolFeatureEnabled(feature), + message) + + +def skipUnlessBookmarksSupported(f): + return skipUnlessFeatureEnabled( + 'bookmarks', 'bookmarks are not enabled')(f) + + +def 
snap_always_unmounted_before_destruction(): + # Apparently ZoL automatically unmounts the snapshot + # only if it is mounted at its default .zfs/snapshot + # mountpoint. + return ( + platform.system() != 'Linux', 'snapshot is not auto-unmounted') + + +def illumos_bug_6379(): + # zfs_ioc_hold() panics on a bad cleanup fd + return ( + platform.system() == 'SunOS', + 'see https://www.illumos.org/issues/6379') + + +def needs_support(function): + return unittest.skipUnless( + lzc.is_supported(function), + '{} not available'.format(function.__name__)) + + +class ZFSTest(unittest.TestCase): + POOL_FILE_SIZE = 128 * 1024 * 1024 + FILESYSTEMS = [b'fs1', b'fs2', b'fs1/fs'] + + pool = None + misc_pool = None + readonly_pool = None + + @classmethod + def setUpClass(cls): + try: + cls.pool = _TempPool(filesystems=cls.FILESYSTEMS) + cls.misc_pool = _TempPool() + cls.readonly_pool = _TempPool( + filesystems=cls.FILESYSTEMS, readonly=True) + cls.pools = [cls.pool, cls.misc_pool, cls.readonly_pool] + except Exception: + cls._cleanUp() + raise + + @classmethod + def tearDownClass(cls): + cls._cleanUp() + + @classmethod + def _cleanUp(cls): + for pool in [cls.pool, cls.misc_pool, cls.readonly_pool]: + if pool is not None: + pool.cleanUp() + + def setUp(self): + pass + + def tearDown(self): + for pool in ZFSTest.pools: + pool.reset() + + def assertExists(self, name): + self.assertTrue( + lzc.lzc_exists(name), 'ZFS dataset %s does not exist' % (name, )) + + def assertNotExists(self, name): + self.assertFalse( + lzc.lzc_exists(name), 'ZFS dataset %s exists' % (name, )) + + def test_exists(self): + self.assertExists(ZFSTest.pool.makeName()) + + def test_exists_in_ro_pool(self): + self.assertExists(ZFSTest.readonly_pool.makeName()) + + def test_exists_failure(self): + self.assertNotExists(ZFSTest.pool.makeName(b'nonexistent')) + + def test_create_fs(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test1") + + lzc.lzc_create(name) + self.assertExists(name) + + def test_create_zvol(self): + name = ZFSTest.pool.makeName(b"fs1/fs/zvol") + props = {b"volsize": 1024 * 1024} + + lzc.lzc_create(name, ds_type='zvol', props=props) + self.assertExists(name) + # On Gentoo with ZFS 0.6.5.4 the volume is busy + # and can not be destroyed right after its creation. + # A reason for this is unknown at the moment. + # Because of that the post-test clean up could fail. 
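+ # Hence the short, best-effort delay below before the clean up runs.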
+ time.sleep(0.1) + + def test_create_fs_with_prop(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test2") + props = {b"atime": 0} + + lzc.lzc_create(name, props=props) + self.assertExists(name) + + def test_create_fs_wrong_ds_type(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test1") + + with self.assertRaises(lzc_exc.DatasetTypeInvalid): + lzc.lzc_create(name, ds_type='wrong') + + def test_create_fs_below_zvol(self): + name = ZFSTest.pool.makeName(b"fs1/fs/zvol") + props = {b"volsize": 1024 * 1024} + + lzc.lzc_create(name, ds_type='zvol', props=props) + with self.assertRaises(lzc_exc.WrongParent): + lzc.lzc_create(name + b'/fs') + + def test_create_zvol_below_zvol(self): + name = ZFSTest.pool.makeName(b"fs1/fs/zvol") + props = {b"volsize": 1024 * 1024} + + lzc.lzc_create(name, ds_type='zvol', props=props) + with self.assertRaises(lzc_exc.WrongParent): + lzc.lzc_create(name + b'/zvol', ds_type='zvol', props=props) + + def test_create_fs_duplicate(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test6") + + lzc.lzc_create(name) + + with self.assertRaises(lzc_exc.FilesystemExists): + lzc.lzc_create(name) + + def test_create_fs_in_ro_pool(self): + name = ZFSTest.readonly_pool.makeName(b"fs") + + with self.assertRaises(lzc_exc.ReadOnlyPool): + lzc.lzc_create(name) + + def test_create_fs_without_parent(self): + name = ZFSTest.pool.makeName(b"fs1/nonexistent/test") + + with self.assertRaises(lzc_exc.ParentNotFound): + lzc.lzc_create(name) + self.assertNotExists(name) + + def test_create_fs_in_nonexistent_pool(self): + name = b"no-such-pool/fs" + + with self.assertRaises(lzc_exc.ParentNotFound): + lzc.lzc_create(name) + self.assertNotExists(name) + + def test_create_fs_with_invalid_prop(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test3") + props = {b"BOGUS": 0} + + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_create(name, 'zfs', props) + self.assertNotExists(name) + + def test_create_fs_with_invalid_prop_type(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test4") + props = {b"recordsize": b"128k"} + + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_create(name, 'zfs', props) + self.assertNotExists(name) + + def test_create_fs_with_invalid_prop_val(self): + name = ZFSTest.pool.makeName(b"fs1/fs/test5") + props = {b"atime": 20} + + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_create(name, 'zfs', props) + self.assertNotExists(name) + + def test_create_fs_with_invalid_name(self): + name = ZFSTest.pool.makeName(b"@badname") + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_create(name) + self.assertNotExists(name) + + def test_create_fs_with_invalid_pool_name(self): + name = b"bad!pool/fs" + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_create(name) + self.assertNotExists(name) + + def test_create_encrypted_fs(self): + fs = ZFSTest.pool.getFilesystem(b"encrypted") + name = fs.getName() + filename = None + with tempfile.NamedTemporaryFile() as f: + filename = "file://" + f.name + props = { + b"encryption": lzc.zio_encrypt.ZIO_CRYPT_AES_256_CCM, + b"keylocation": filename.encode(), + b"keyformat": lzc.zfs_keyformat.ZFS_KEYFORMAT_RAW, + } + key = os.urandom(lzc.WRAPPING_KEY_LEN) + lzc.lzc_create(name, 'zfs', props=props, key=key) + self.assertEqual(fs.getProperty("encryption"), b"aes-256-ccm") + self.assertEqual(fs.getProperty("encryptionroot"), name) + self.assertEqual(fs.getProperty("keylocation"), filename.encode()) + self.assertEqual(fs.getProperty("keyformat"), b"raw") + + def test_snapshot(self): + snapname = 
ZFSTest.pool.makeName(b"@snap") + snaps = [snapname] + + lzc.lzc_snapshot(snaps) + self.assertExists(snapname) + + def test_snapshot_empty_list(self): + lzc.lzc_snapshot([]) + + def test_snapshot_user_props(self): + snapname = ZFSTest.pool.makeName(b"@snap") + snaps = [snapname] + props = {b"user:foo": b"bar"} + + lzc.lzc_snapshot(snaps, props) + self.assertExists(snapname) + + def test_snapshot_invalid_props(self): + snapname = ZFSTest.pool.makeName(b"@snap") + snaps = [snapname] + props = {b"foo": b"bar"} + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps, props) + + self.assertEqual(len(ctx.exception.errors), len(snaps)) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PropertyInvalid) + self.assertNotExists(snapname) + + def test_snapshot_ro_pool(self): + snapname1 = ZFSTest.readonly_pool.makeName(b"@snap") + snapname2 = ZFSTest.readonly_pool.makeName(b"fs1@snap") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + # NB: one common error is reported. + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.ReadOnlyPool) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_snapshot_nonexistent_pool(self): + snapname = b"no-such-pool@snap" + snaps = [snapname] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.FilesystemNotFound) + + def test_snapshot_nonexistent_fs(self): + snapname = ZFSTest.pool.makeName(b"nonexistent@snap") + snaps = [snapname] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.FilesystemNotFound) + + def test_snapshot_nonexistent_and_existent_fs(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.pool.makeName(b"nonexistent@snap") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.FilesystemNotFound) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_multiple_snapshots_nonexistent_fs(self): + snapname1 = ZFSTest.pool.makeName(b"nonexistent@snap1") + snapname2 = ZFSTest.pool.makeName(b"nonexistent@snap2") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + # XXX two errors should be reported but alas + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.DuplicateSnapshots) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_multiple_snapshots_multiple_nonexistent_fs(self): + snapname1 = ZFSTest.pool.makeName(b"nonexistent1@snap") + snapname2 = ZFSTest.pool.makeName(b"nonexistent2@snap") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 2) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.FilesystemNotFound) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_snapshot_already_exists(self): + snapname = 
ZFSTest.pool.makeName(b"@snap") + snaps = [snapname] + + lzc.lzc_snapshot(snaps) + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotExists) + + def test_multiple_snapshots_for_same_fs(self): + snapname1 = ZFSTest.pool.makeName(b"@snap1") + snapname2 = ZFSTest.pool.makeName(b"@snap2") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.DuplicateSnapshots) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_multiple_snapshots(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.pool.makeName(b"fs1@snap") + snaps = [snapname1, snapname2] + + lzc.lzc_snapshot(snaps) + self.assertExists(snapname1) + self.assertExists(snapname2) + + def test_multiple_existing_snapshots(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.pool.makeName(b"fs1@snap") + snaps = [snapname1, snapname2] + + lzc.lzc_snapshot(snaps) + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 2) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotExists) + + def test_multiple_new_and_existing_snapshots(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.pool.makeName(b"fs1@snap") + snapname3 = ZFSTest.pool.makeName(b"fs2@snap") + snaps = [snapname1, snapname2] + more_snaps = snaps + [snapname3] + + lzc.lzc_snapshot(snaps) + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(more_snaps) + + self.assertEqual(len(ctx.exception.errors), 2) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotExists) + self.assertNotExists(snapname3) + + def test_snapshot_multiple_errors(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.pool.makeName(b"nonexistent@snap") + snapname3 = ZFSTest.pool.makeName(b"fs1@snap") + snaps = [snapname1] + more_snaps = [snapname1, snapname2, snapname3] + + # create 'snapname1' snapshot + lzc.lzc_snapshot(snaps) + + # attempt to create 3 snapshots: + # 1. duplicate snapshot name + # 2. refers to filesystem that doesn't exist + # 3. could have succeeded if not for 1 and 2 + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(more_snaps) + + # It seems that FilesystemNotFound overrides the other error, + # but it doesn't have to. + self.assertGreater(len(ctx.exception.errors), 0) + for e in ctx.exception.errors: + self.assertIsInstance( + e, (lzc_exc.SnapshotExists, lzc_exc.FilesystemNotFound)) + self.assertNotExists(snapname2) + self.assertNotExists(snapname3) + + def test_snapshot_different_pools(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.misc_pool.makeName(b"@snap") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + # NB: one common error is reported. 
+ self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PoolsDiffer) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_snapshot_different_pools_ro_pool(self): + snapname1 = ZFSTest.pool.makeName(b"@snap") + snapname2 = ZFSTest.readonly_pool.makeName(b"@snap") + snaps = [snapname1, snapname2] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + # NB: one common error is reported. + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + # NB: depending on whether the first attempted snapshot is + # for the read-only pool a different error is reported. + self.assertIsInstance( + e, (lzc_exc.PoolsDiffer, lzc_exc.ReadOnlyPool)) + self.assertNotExists(snapname1) + self.assertNotExists(snapname2) + + def test_snapshot_invalid_name(self): + snapname1 = ZFSTest.pool.makeName(b"@bad&name") + snapname2 = ZFSTest.pool.makeName(b"fs1@bad*name") + snapname3 = ZFSTest.pool.makeName(b"fs2@snap") + snaps = [snapname1, snapname2, snapname3] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + # NB: one common error is reported. + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + self.assertIsNone(e.name) + + def test_snapshot_too_long_complete_name(self): + snapname1 = ZFSTest.pool.makeTooLongName(b"fs1@") + snapname2 = ZFSTest.pool.makeTooLongName(b"fs2@") + snapname3 = ZFSTest.pool.makeName(b"@snap") + snaps = [snapname1, snapname2, snapname3] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + self.assertEqual(len(ctx.exception.errors), 2) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + self.assertIsNotNone(e.name) + + def test_snapshot_too_long_snap_name(self): + snapname1 = ZFSTest.pool.makeTooLongComponent(b"fs1@") + snapname2 = ZFSTest.pool.makeTooLongComponent(b"fs2@") + snapname3 = ZFSTest.pool.makeName(b"@snap") + snaps = [snapname1, snapname2, snapname3] + + with self.assertRaises(lzc_exc.SnapshotFailure) as ctx: + lzc.lzc_snapshot(snaps) + + # NB: one common error is reported. + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + self.assertIsNone(e.name) + + def test_destroy_nonexistent_snapshot(self): + lzc.lzc_destroy_snaps([ZFSTest.pool.makeName(b"@nonexistent")], False) + lzc.lzc_destroy_snaps([ZFSTest.pool.makeName(b"@nonexistent")], True) + + def test_destroy_snapshot_of_nonexistent_pool(self): + with self.assertRaises(lzc_exc.SnapshotDestructionFailure) as ctx: + lzc.lzc_destroy_snaps([b"no-such-pool@snap"], False) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PoolNotFound) + + with self.assertRaises(lzc_exc.SnapshotDestructionFailure) as ctx: + lzc.lzc_destroy_snaps([b"no-such-pool@snap"], True) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PoolNotFound) + + # NB: note the difference from the nonexistent pool test. + def test_destroy_snapshot_of_nonexistent_fs(self): + lzc.lzc_destroy_snaps( + [ZFSTest.pool.makeName(b"nonexistent@snap")], False) + lzc.lzc_destroy_snaps( + [ZFSTest.pool.makeName(b"nonexistent@snap")], True) + + # Apparently the name is not checked for validity. 
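+ # The test below is therefore marked as an expected failure: no + # SnapshotDestructionFailure is currently raised for the bad name.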
+ @unittest.expectedFailure + def test_destroy_invalid_snap_name(self): + with self.assertRaises(lzc_exc.SnapshotDestructionFailure): + lzc.lzc_destroy_snaps( + [ZFSTest.pool.makeName(b"@non$&*existent")], False) + with self.assertRaises(lzc_exc.SnapshotDestructionFailure): + lzc.lzc_destroy_snaps( + [ZFSTest.pool.makeName(b"@non$&*existent")], True) + + # Apparently the full name is not checked for length. + @unittest.expectedFailure + def test_destroy_too_long_full_snap_name(self): + snapname1 = ZFSTest.pool.makeTooLongName(b"fs1@") + snaps = [snapname1] + + with self.assertRaises(lzc_exc.SnapshotDestructionFailure): + lzc.lzc_destroy_snaps(snaps, False) + with self.assertRaises(lzc_exc.SnapshotDestructionFailure): + lzc.lzc_destroy_snaps(snaps, True) + + def test_destroy_too_long_short_snap_name(self): + snapname1 = ZFSTest.pool.makeTooLongComponent(b"fs1@") + snapname2 = ZFSTest.pool.makeTooLongComponent(b"fs2@") + snapname3 = ZFSTest.pool.makeName(b"@snap") + snaps = [snapname1, snapname2, snapname3] + + with self.assertRaises(lzc_exc.SnapshotDestructionFailure) as ctx: + lzc.lzc_destroy_snaps(snaps, False) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + + @unittest.skipUnless(*snap_always_unmounted_before_destruction()) + def test_destroy_mounted_snap(self): + snap = ZFSTest.pool.getRoot().getSnap() + + lzc.lzc_snapshot([snap]) + with zfs_mount(snap): + # the snapshot should be force-unmounted + lzc.lzc_destroy_snaps([snap], defer=False) + self.assertNotExists(snap) + + def test_clone(self): + # NB: note the special name for the snapshot. + # Since currently we cannot destroy filesystems, + # it would be impossible to destroy the snapshot, + # so there is no point in attempting to clean it up. + snapname = ZFSTest.pool.makeName(b"fs2@origin1") + name = ZFSTest.pool.makeName(b"fs1/fs/clone1") + + lzc.lzc_snapshot([snapname]) + + lzc.lzc_clone(name, snapname) + self.assertExists(name) + + def test_clone_nonexistent_snapshot(self): + snapname = ZFSTest.pool.makeName(b"fs2@nonexistent") + name = ZFSTest.pool.makeName(b"fs1/fs/clone2") + + # XXX The error should be SnapshotNotFound, + # but limitations of the C interface do not allow + # us to differentiate between the errors.
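+ # For now the broader DatasetNotFound is checked instead.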
+ with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_nonexistent_parent_fs(self): + snapname = ZFSTest.pool.makeName(b"fs2@origin3") + name = ZFSTest.pool.makeName(b"fs1/nonexistent/clone3") + + lzc.lzc_snapshot([snapname]) + + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_to_nonexistent_pool(self): + snapname = ZFSTest.pool.makeName(b"fs2@snap") + name = b"no-such-pool/fs" + + lzc.lzc_snapshot([snapname]) + + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_invalid_snap_name(self): + # Use a valid filesystem name of filesystem that + # exists as a snapshot name + snapname = ZFSTest.pool.makeName(b"fs1/fs") + name = ZFSTest.pool.makeName(b"fs2/clone") + + with self.assertRaises(lzc_exc.SnapshotNameInvalid): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_invalid_snap_name_2(self): + # Use a valid filesystem name of filesystem that + # doesn't exist as a snapshot name + snapname = ZFSTest.pool.makeName(b"fs1/nonexistent") + name = ZFSTest.pool.makeName(b"fs2/clone") + + with self.assertRaises(lzc_exc.SnapshotNameInvalid): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_invalid_name(self): + snapname = ZFSTest.pool.makeName(b"fs2@snap") + name = ZFSTest.pool.makeName(b"fs1/bad#name") + + lzc.lzc_snapshot([snapname]) + + with self.assertRaises(lzc_exc.FilesystemNameInvalid): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_invalid_pool_name(self): + snapname = ZFSTest.pool.makeName(b"fs2@snap") + name = b"bad!pool/fs1" + + lzc.lzc_snapshot([snapname]) + + with self.assertRaises(lzc_exc.FilesystemNameInvalid): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_across_pools(self): + snapname = ZFSTest.pool.makeName(b"fs2@snap") + name = ZFSTest.misc_pool.makeName(b"clone1") + + lzc.lzc_snapshot([snapname]) + + with self.assertRaises(lzc_exc.PoolsDiffer): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_clone_across_pools_to_ro_pool(self): + snapname = ZFSTest.pool.makeName(b"fs2@snap") + name = ZFSTest.readonly_pool.makeName(b"fs1/clone1") + + lzc.lzc_snapshot([snapname]) + + # it's legal to report either of the conditions + with self.assertRaises((lzc_exc.ReadOnlyPool, lzc_exc.PoolsDiffer)): + lzc.lzc_clone(name, snapname) + self.assertNotExists(name) + + def test_destroy_cloned_fs(self): + snapname1 = ZFSTest.pool.makeName(b"fs2@origin4") + snapname2 = ZFSTest.pool.makeName(b"fs1@snap") + clonename = ZFSTest.pool.makeName(b"fs1/fs/clone4") + snaps = [snapname1, snapname2] + + lzc.lzc_snapshot(snaps) + lzc.lzc_clone(clonename, snapname1) + + with self.assertRaises(lzc_exc.SnapshotDestructionFailure) as ctx: + lzc.lzc_destroy_snaps(snaps, False) + + self.assertEqual(len(ctx.exception.errors), 1) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotIsCloned) + for snap in snaps: + self.assertExists(snap) + + def test_deferred_destroy_cloned_fs(self): + snapname1 = ZFSTest.pool.makeName(b"fs2@origin5") + snapname2 = ZFSTest.pool.makeName(b"fs1@snap") + clonename = ZFSTest.pool.makeName(b"fs1/fs/clone5") + snaps = [snapname1, snapname2] + + lzc.lzc_snapshot(snaps) + lzc.lzc_clone(clonename, snapname1) + + lzc.lzc_destroy_snaps(snaps, defer=True) + + self.assertExists(snapname1) + 
self.assertNotExists(snapname2) + + def test_rollback(self): + name = ZFSTest.pool.makeName(b"fs1") + snapname = name + b"@snap" + + lzc.lzc_snapshot([snapname]) + ret = lzc.lzc_rollback(name) + self.assertEqual(ret, snapname) + + def test_rollback_2(self): + name = ZFSTest.pool.makeName(b"fs1") + snapname1 = name + b"@snap1" + snapname2 = name + b"@snap2" + + lzc.lzc_snapshot([snapname1]) + lzc.lzc_snapshot([snapname2]) + ret = lzc.lzc_rollback(name) + self.assertEqual(ret, snapname2) + + def test_rollback_no_snaps(self): + name = ZFSTest.pool.makeName(b"fs1") + + with self.assertRaises(lzc_exc.SnapshotNotFound): + lzc.lzc_rollback(name) + + def test_rollback_non_existent_fs(self): + name = ZFSTest.pool.makeName(b"nonexistent") + + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_rollback(name) + + def test_rollback_invalid_fs_name(self): + name = ZFSTest.pool.makeName(b"bad~name") + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_rollback(name) + + def test_rollback_snap_name(self): + name = ZFSTest.pool.makeName(b"fs1@snap") + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_rollback(name) + + def test_rollback_snap_name_2(self): + name = ZFSTest.pool.makeName(b"fs1@snap") + + lzc.lzc_snapshot([name]) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_rollback(name) + + def test_rollback_too_long_fs_name(self): + name = ZFSTest.pool.makeTooLongName() + + with self.assertRaises(lzc_exc.NameTooLong): + lzc.lzc_rollback(name) + + def test_rollback_to_snap_name(self): + name = ZFSTest.pool.makeName(b"fs1") + snap = name + b"@snap" + + lzc.lzc_snapshot([snap]) + lzc.lzc_rollback_to(name, snap) + + def test_rollback_to_not_latest(self): + fsname = ZFSTest.pool.makeName(b'fs1') + snap1 = fsname + b"@snap1" + snap2 = fsname + b"@snap2" + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + with self.assertRaises(lzc_exc.SnapshotNotLatest): + lzc.lzc_rollback_to(fsname, fsname + b"@snap1") + + @skipUnlessBookmarksSupported + def test_bookmarks(self): + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + lzc.lzc_bookmark(bmark_dict) + + @skipUnlessBookmarksSupported + def test_bookmarks_2(self): + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + lzc.lzc_snapshot(snaps) + lzc.lzc_bookmark(bmark_dict) + lzc.lzc_destroy_snaps(snaps, defer=False) + + @skipUnlessBookmarksSupported + def test_bookmark_copying(self): + snaps = [ZFSTest.pool.makeName(s) for s in [ + b'fs1@snap1', b'fs1@snap2', b'fs2@snap1']] + bmarks = [ZFSTest.pool.makeName(x) for x in [ + b'fs1#bmark1', b'fs1#bmark2', b'fs2#bmark1']] + bmarks_copies = [ZFSTest.pool.makeName(x) for x in [ + b'fs1#bmark1_copy', b'fs1#bmark2_copy', b'fs2#bmark1_copy']] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + bmark_copies_dict = {x: y for x, y in zip(bmarks_copies, bmarks)} + + for snap in snaps: + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict) + + lzc.lzc_bookmark(bmark_copies_dict) + lzc.lzc_destroy_bookmarks(bmarks_copies) + + lzc.lzc_destroy_bookmarks(bmarks) + lzc.lzc_destroy_snaps(snaps, defer=False) + + @skipUnlessBookmarksSupported + def test_bookmarks_empty(self): + lzc.lzc_bookmark({}) + + 
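+ # A minimal sketch of the mapping lzc_bookmark() expects, with + # illustrative names only (each new bookmark name maps to its + # source snapshot): + # + # lzc.lzc_snapshot([b'pool/fs@snap1']) + # lzc.lzc_bookmark({b'pool/fs#bmark1': b'pool/fs@snap1'}) + # + # The tests below exercise the error paths of that interface.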
@skipUnlessBookmarksSupported + def test_bookmarks_foreign_source(self): + snaps = [ZFSTest.pool.makeName(b'fs1@snap1')] + bmarks = [ZFSTest.pool.makeName(b'fs2#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.BookmarkMismatch) + + @skipUnlessBookmarksSupported + def test_bookmarks_invalid_name(self): + snaps = [ZFSTest.pool.makeName(b'fs1@snap1')] + bmarks = [ZFSTest.pool.makeName(b'fs1#bmark!')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + + @skipUnlessBookmarksSupported + def test_bookmarks_invalid_name_2(self): + snaps = [ZFSTest.pool.makeName(b'fs1@snap1')] + bmarks = [ZFSTest.pool.makeName(b'fs1@bmark')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + + @skipUnlessBookmarksSupported + def test_bookmarks_too_long_name(self): + snaps = [ZFSTest.pool.makeName(b'fs1@snap1')] + bmarks = [ZFSTest.pool.makeTooLongName(b'fs1#')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + + @skipUnlessBookmarksSupported + def test_bookmarks_too_long_name_2(self): + snaps = [ZFSTest.pool.makeName(b'fs1@snap1')] + bmarks = [ZFSTest.pool.makeTooLongComponent(b'fs1#')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + + @skipUnlessBookmarksSupported + def test_bookmarks_foreign_sources(self): + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs2#bmark1'), ZFSTest.pool.makeName(b'fs1#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.BookmarkMismatch) + + @skipUnlessBookmarksSupported + def test_bookmarks_partially_foreign_sources(self): + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs2#bmark'), ZFSTest.pool.makeName(b'fs2#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.BookmarkMismatch) + + @skipUnlessBookmarksSupported + def test_bookmarks_cross_pool(self): + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.misc_pool.makeName(b'@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs1#bmark1'), ZFSTest.misc_pool.makeName(b'#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps[0:1]) +
lzc.lzc_snapshot(snaps[1:2]) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PoolsDiffer) + + @skipUnlessBookmarksSupported + def test_bookmarks_missing_snap(self): + fss = [ZFSTest.pool.makeName(b'fs1'), ZFSTest.pool.makeName(b'fs2')] + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + lzc.lzc_snapshot(snaps[0:1]) # only create fs1@snap1 + + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotNotFound) + + # no new bookmarks are created if one or more sources do not exist + for fs in fss: + fsbmarks = lzc.lzc_get_bookmarks(fs) + self.assertEqual(len(fsbmarks), 0) + + @skipUnlessBookmarksSupported + def test_bookmarks_missing_snaps(self): + fss = [ZFSTest.pool.makeName(b'fs1'), ZFSTest.pool.makeName(b'fs2')] + snaps = [ZFSTest.pool.makeName( + b'fs1@snap1'), ZFSTest.pool.makeName(b'fs2@snap1')] + bmarks = [ZFSTest.pool.makeName( + b'fs1#bmark1'), ZFSTest.pool.makeName(b'fs2#bmark1')] + bmark_dict = {x: y for x, y in zip(bmarks, snaps)} + + # do not create any snapshots + + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotNotFound) + + # no new bookmarks are created if one or more sources do not exist + for fs in fss: + fsbmarks = lzc.lzc_get_bookmarks(fs) + self.assertEqual(len(fsbmarks), 0) + + @skipUnlessBookmarksSupported + def test_bookmarks_for_the_same_snap(self): + snap = ZFSTest.pool.makeName(b'fs1@snap1') + bmark1 = ZFSTest.pool.makeName(b'fs1#bmark1') + bmark2 = ZFSTest.pool.makeName(b'fs1#bmark2') + bmark_dict = {bmark1: snap, bmark2: snap} + + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict) + + @skipUnlessBookmarksSupported + def test_bookmarks_for_the_same_snap_2(self): + snap = ZFSTest.pool.makeName(b'fs1@snap1') + bmark1 = ZFSTest.pool.makeName(b'fs1#bmark1') + bmark2 = ZFSTest.pool.makeName(b'fs1#bmark2') + bmark_dict1 = {bmark1: snap} + bmark_dict2 = {bmark2: snap} + + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict1) + lzc.lzc_bookmark(bmark_dict2) + + @skipUnlessBookmarksSupported + def test_bookmarks_duplicate_name(self): + snap1 = ZFSTest.pool.makeName(b'fs1@snap1') + snap2 = ZFSTest.pool.makeName(b'fs1@snap2') + bmark = ZFSTest.pool.makeName(b'fs1#bmark') + bmark_dict1 = {bmark: snap1} + bmark_dict2 = {bmark: snap2} + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + lzc.lzc_bookmark(bmark_dict1) + with self.assertRaises(lzc_exc.BookmarkFailure) as ctx: + lzc.lzc_bookmark(bmark_dict2) + + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.BookmarkExists) + + @skipUnlessBookmarksSupported + def test_get_bookmarks(self): + snap1 = ZFSTest.pool.makeName(b'fs1@snap1') + snap2 = ZFSTest.pool.makeName(b'fs1@snap2') + bmark = ZFSTest.pool.makeName(b'fs1#bmark') + bmark1 = ZFSTest.pool.makeName(b'fs1#bmark1') + bmark2 = ZFSTest.pool.makeName(b'fs1#bmark2') + bmark_dict1 = {bmark1: snap1, bmark2: snap2} + bmark_dict2 = {bmark: snap2} + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + lzc.lzc_bookmark(bmark_dict1) + lzc.lzc_bookmark(bmark_dict2) + lzc.lzc_destroy_snaps([snap1, snap2], defer=False) + + bmarks = 
lzc.lzc_get_bookmarks(ZFSTest.pool.makeName(b'fs1')) + self.assertEqual(len(bmarks), 3) + for b in b'bmark', b'bmark1', b'bmark2': + self.assertIn(b, bmarks) + self.assertIsInstance(bmarks[b], dict) + self.assertEqual(len(bmarks[b]), 0) + + bmarks = lzc.lzc_get_bookmarks(ZFSTest.pool.makeName(b'fs1'), + [b'guid', b'createtxg', b'creation']) + self.assertEqual(len(bmarks), 3) + for b in b'bmark', b'bmark1', b'bmark2': + self.assertIn(b, bmarks) + self.assertIsInstance(bmarks[b], dict) + self.assertEqual(len(bmarks[b]), 3) + + @skipUnlessBookmarksSupported + def test_get_bookmarks_invalid_property(self): + snap = ZFSTest.pool.makeName(b'fs1@snap') + bmark = ZFSTest.pool.makeName(b'fs1#bmark') + bmark_dict = {bmark: snap} + + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict) + + bmarks = lzc.lzc_get_bookmarks( + ZFSTest.pool.makeName(b'fs1'), [b'badprop']) + self.assertEqual(len(bmarks), 1) + for b in (b'bmark', ): + self.assertIn(b, bmarks) + self.assertIsInstance(bmarks[b], dict) + self.assertEqual(len(bmarks[b]), 0) + + @skipUnlessBookmarksSupported + def test_get_bookmarks_nonexistent_fs(self): + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_get_bookmarks(ZFSTest.pool.makeName(b'nonexistent')) + + @skipUnlessBookmarksSupported + def test_destroy_bookmarks(self): + snap = ZFSTest.pool.makeName(b'fs1@snap') + bmark = ZFSTest.pool.makeName(b'fs1#bmark') + bmark_dict = {bmark: snap} + + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict) + + lzc.lzc_destroy_bookmarks( + [bmark, ZFSTest.pool.makeName(b'fs1#nonexistent')]) + bmarks = lzc.lzc_get_bookmarks(ZFSTest.pool.makeName(b'fs1')) + self.assertEqual(len(bmarks), 0) + + @skipUnlessBookmarksSupported + def test_destroy_bookmarks_invalid_name(self): + snap = ZFSTest.pool.makeName(b'fs1@snap') + bmark = ZFSTest.pool.makeName(b'fs1#bmark') + bmark_dict = {bmark: snap} + + lzc.lzc_snapshot([snap]) + lzc.lzc_bookmark(bmark_dict) + + with self.assertRaises(lzc_exc.BookmarkDestructionFailure) as ctx: + lzc.lzc_destroy_bookmarks( + [bmark, ZFSTest.pool.makeName(b'fs1/nonexistent')]) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + + bmarks = lzc.lzc_get_bookmarks(ZFSTest.pool.makeName(b'fs1')) + self.assertEqual(len(bmarks), 1) + self.assertIn(b'bmark', bmarks) + + @skipUnlessBookmarksSupported + def test_destroy_bookmark_nonexistent_fs(self): + lzc.lzc_destroy_bookmarks( + [ZFSTest.pool.makeName(b'nonexistent#bmark')]) + + @skipUnlessBookmarksSupported + def test_destroy_bookmarks_empty(self): + lzc.lzc_destroy_bookmarks([]) + + def test_snaprange_space(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + snap3 = ZFSTest.pool.makeName(b"fs1@snap") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + lzc.lzc_snapshot([snap3]) + + space = lzc.lzc_snaprange_space(snap1, snap2) + self.assertIsInstance(space, int) + space = lzc.lzc_snaprange_space(snap2, snap3) + self.assertIsInstance(space, int) + space = lzc.lzc_snaprange_space(snap1, snap3) + self.assertIsInstance(space, int) + + def test_snaprange_space_2(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + snap3 = ZFSTest.pool.makeName(b"fs1@snap") + + lzc.lzc_snapshot([snap1]) + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + lzc.lzc_snapshot([snap2]) + lzc.lzc_snapshot([snap3]) + + space =
lzc.lzc_snaprange_space(snap1, snap2) + self.assertGreater(space, 1024 * 1024) + space = lzc.lzc_snaprange_space(snap2, snap3) + self.assertGreater(space, 1024 * 1024) + space = lzc.lzc_snaprange_space(snap1, snap3) + self.assertGreater(space, 1024 * 1024) + + def test_snaprange_space_same_snap(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + lzc.lzc_snapshot([snap]) + + space = lzc.lzc_snaprange_space(snap, snap) + self.assertGreater(space, 1024 * 1024) + self.assertAlmostEqual(space, 1024 * 1024, delta=1024 * 1024 // 20) + + def test_snaprange_space_wrong_order(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_snaprange_space(snap2, snap1) + + def test_snaprange_space_unrelated(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs2@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_snaprange_space(snap1, snap2) + + def test_snaprange_space_across_pools(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.misc_pool.makeName(b"@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with self.assertRaises(lzc_exc.PoolsDiffer): + lzc.lzc_snaprange_space(snap1, snap2) + + def test_snaprange_space_nonexistent(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_snaprange_space(snap1, snap2) + self.assertEqual(ctx.exception.name, snap2) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_snaprange_space(snap2, snap1) + self.assertEqual(ctx.exception.name, snap1) + + def test_snaprange_space_invalid_name(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@sn#p") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_snaprange_space(snap1, snap2) + + def test_snaprange_space_not_snap(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_snaprange_space(snap1, snap2) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_snaprange_space(snap2, snap1) + + def test_snaprange_space_not_snap_2(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1#bmark") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_snaprange_space(snap1, snap2) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_snaprange_space(snap2, snap1) + + def test_send_space(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + snap3 = ZFSTest.pool.makeName(b"fs1@snap") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + lzc.lzc_snapshot([snap3]) + + space = lzc.lzc_send_space(snap2, snap1) + self.assertIsInstance(space, int) + space = lzc.lzc_send_space(snap3, snap2) + self.assertIsInstance(space, int) + space = lzc.lzc_send_space(snap3, snap1) + self.assertIsInstance(space, int) + space = lzc.lzc_send_space(snap1) +
self.assertIsInstance(space, int) + space = lzc.lzc_send_space(snap2) + self.assertIsInstance(space, int) + space = lzc.lzc_send_space(snap3) + self.assertIsInstance(space, int) + + def test_send_space_2(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + snap3 = ZFSTest.pool.makeName(b"fs1@snap") + + lzc.lzc_snapshot([snap1]) + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + lzc.lzc_snapshot([snap2]) + lzc.lzc_snapshot([snap3]) + + space = lzc.lzc_send_space(snap2, snap1) + self.assertGreater(space, 1024 * 1024) + + space = lzc.lzc_send_space(snap3, snap2) + + space = lzc.lzc_send_space(snap3, snap1) + + space_empty = lzc.lzc_send_space(snap1) + + space = lzc.lzc_send_space(snap2) + self.assertGreater(space, 1024 * 1024) + + space = lzc.lzc_send_space(snap3) + self.assertEqual(space, space_empty) + + def test_send_space_same_snap(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + lzc.lzc_snapshot([snap1]) + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_send_space(snap1, snap1) + + def test_send_space_wrong_order(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_send_space(snap1, snap2) + + def test_send_space_unrelated(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs2@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_send_space(snap1, snap2) + + def test_send_space_across_pools(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.misc_pool.makeName(b"@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with self.assertRaises(lzc_exc.PoolsDiffer): + lzc.lzc_send_space(snap1, snap2) + + def test_send_space_nonexistent(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs2@snap2") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_send_space(snap1, snap2) + self.assertEqual(ctx.exception.name, snap1) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_send_space(snap2, snap1) + self.assertEqual(ctx.exception.name, snap2) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_send_space(snap2) + self.assertEqual(ctx.exception.name, snap2) + + def test_send_space_invalid_name(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@sn!p") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.NameInvalid) as ctx: + lzc.lzc_send_space(snap2, snap1) + self.assertEqual(ctx.exception.name, snap2) + with self.assertRaises(lzc_exc.NameInvalid) as ctx: + lzc.lzc_send_space(snap2) + self.assertEqual(ctx.exception.name, snap2) + with self.assertRaises(lzc_exc.NameInvalid) as ctx: + lzc.lzc_send_space(snap1, snap2) + self.assertEqual(ctx.exception.name, snap2) + + def test_send_space_not_snap(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send_space(snap1, snap2) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send_space(snap2, snap1) + with 
self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send_space(snap2) + + def test_send_space_not_snap_2(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1#bmark") + + lzc.lzc_snapshot([snap1]) + + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send_space(snap2, snap1) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send_space(snap2) + + def test_send_full(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + estimate = lzc.lzc_send_space(snap) + + fd = output.fileno() + lzc.lzc_send(snap, None, fd) + st = os.fstat(fd) + # 5%, arbitrary. + self.assertAlmostEqual(st.st_size, estimate, delta=estimate // 20) + + def test_send_incremental(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + + lzc.lzc_snapshot([snap1]) + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + lzc.lzc_snapshot([snap2]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + estimate = lzc.lzc_send_space(snap2, snap1) + + fd = output.fileno() + lzc.lzc_send(snap2, snap1, fd) + st = os.fstat(fd) + # 5%, arbitrary. + self.assertAlmostEqual(st.st_size, estimate, delta=estimate // 20) + + def test_send_flags(self): + flags = ['embedded_data', 'large_blocks', 'compress', 'raw'] + snap = ZFSTest.pool.makeName(b"fs1@snap") + lzc.lzc_snapshot([snap]) + + for c in range(len(flags)): + for flag in itertools.permutations(flags, c + 1): + with dev_null() as fd: + lzc.lzc_send(snap, None, fd, list(flag)) + + def test_send_unknown_flags(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + lzc.lzc_snapshot([snap]) + with dev_null() as fd: + with self.assertRaises(lzc_exc.UnknownStreamFeature): + lzc.lzc_send(snap, None, fd, ['embedded_data', 'UNKNOWN']) + + def test_send_same_snap(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + lzc.lzc_snapshot([snap1]) + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_send(snap1, snap1, fd) + + def test_send_wrong_order(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_send(snap1, snap2, fd) + + def test_send_unrelated(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs2@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.SnapshotMismatch): + lzc.lzc_send(snap1, snap2, fd) + + def test_send_across_pools(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.misc_pool.makeName(b"@snap2") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.PoolsDiffer): + lzc.lzc_send(snap1, snap2, fd) + + def test_send_nonexistent(self): + snap1 = 
ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + + lzc.lzc_snapshot([snap1]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_send(snap1, snap2, fd) + self.assertEqual(ctx.exception.name, snap1) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_send(snap2, snap1, fd) + self.assertEqual(ctx.exception.name, snap2) + + with self.assertRaises(lzc_exc.SnapshotNotFound) as ctx: + lzc.lzc_send(snap2, None, fd) + self.assertEqual(ctx.exception.name, snap2) + + def test_send_invalid_name(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@sn!p") + + lzc.lzc_snapshot([snap1]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.NameInvalid) as ctx: + lzc.lzc_send(snap2, snap1, fd) + self.assertEqual(ctx.exception.name, snap2) + with self.assertRaises(lzc_exc.NameInvalid) as ctx: + lzc.lzc_send(snap2, None, fd) + self.assertEqual(ctx.exception.name, snap2) + with self.assertRaises(lzc_exc.NameInvalid) as ctx: + lzc.lzc_send(snap1, snap2, fd) + self.assertEqual(ctx.exception.name, snap2) + + # XXX Although undocumented the API allows to create an incremental + # or full stream for a filesystem as if a temporary unnamed snapshot + # is taken at some time after the call is made and before the stream + # starts being produced. + def test_send_filesystem(self): + snap = ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.makeName(b"fs1") + + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + lzc.lzc_send(fs, snap, fd) + lzc.lzc_send(fs, None, fd) + + def test_send_from_filesystem(self): + snap = ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.makeName(b"fs1") + + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send(snap, fs, fd) + + @skipUnlessBookmarksSupported + def test_send_bookmark(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + bmark = ZFSTest.pool.makeName(b"fs1#bmark") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + lzc.lzc_bookmark({bmark: snap2}) + lzc.lzc_destroy_snaps([snap2], defer=False) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send(bmark, snap1, fd) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_send(bmark, None, fd) + + @skipUnlessBookmarksSupported + def test_send_from_bookmark(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + bmark = ZFSTest.pool.makeName(b"fs1#bmark") + + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + lzc.lzc_bookmark({bmark: snap1}) + lzc.lzc_destroy_snaps([snap1], defer=False) + + with tempfile.TemporaryFile(suffix='.zstream') as output: + fd = output.fileno() + lzc.lzc_send(snap2, bmark, fd) + + def test_send_bad_fd(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile() as tmp: + bad_fd = tmp.fileno() + + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, bad_fd) + self.assertEqual(ctx.exception.errno, errno.EBADF) + + def test_send_bad_fd_2(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + 
lzc.lzc_snapshot([snap]) + + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, -2) + self.assertEqual(ctx.exception.errno, errno.EBADF) + + def test_send_bad_fd_3(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile() as tmp: + bad_fd = tmp.fileno() + + (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE) + bad_fd = hard + 1 + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, bad_fd) + self.assertEqual(ctx.exception.errno, errno.EBADF) + + def test_send_to_broken_pipe(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + lzc.lzc_snapshot([snap]) + + if sys.version_info < (3, 0): + proc = subprocess.Popen(['true'], stdin=subprocess.PIPE) + proc.wait() + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, proc.stdin.fileno()) + self.assertEqual(ctx.exception.errno, errno.EPIPE) + else: + with subprocess.Popen(['true'], stdin=subprocess.PIPE) as proc: + proc.wait() + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, proc.stdin.fileno()) + self.assertEqual(ctx.exception.errno, errno.EPIPE) + + def test_send_to_broken_pipe_2(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + for i in range(1024): + f.write(b'x' * 1024) + f.flush() + lzc.lzc_snapshot([snap]) + + if sys.version_info < (3, 0): + p = subprocess.Popen(['sleep', '2'], stdin=subprocess.PIPE) + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, p.stdin.fileno()) + self.assertTrue(ctx.exception.errno == errno.EPIPE or + ctx.exception.errno == errno.EINTR) + else: + with subprocess.Popen(['sleep', '2'], stdin=subprocess.PIPE) as p: + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, p.stdin.fileno()) + self.assertTrue(ctx.exception.errno == errno.EPIPE or + ctx.exception.errno == errno.EINTR) + + def test_send_to_ro_file(self): + snap = ZFSTest.pool.makeName(b"fs1@snap") + lzc.lzc_snapshot([snap]) + + with tempfile.NamedTemporaryFile( + suffix='.zstream', delete=False) as output: + # tempfile always opens a temporary file in read-write mode + # regardless of the specified mode, so we have to open it again. 
+ os.chmod(output.name, stat.S_IRUSR) + fd = os.open(output.name, os.O_RDONLY) + with self.assertRaises(lzc_exc.StreamIOError) as ctx: + lzc.lzc_send(snap, None, fd) + os.close(fd) + self.assertEqual(ctx.exception.errno, errno.EBADF) + + def test_recv_full(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dst = ZFSTest.pool.makeName(b"fs2/received-1@snap") + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")) as name: + lzc.lzc_snapshot([src]) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(dst, stream.fileno()) + + name = os.path.basename(name) + with zfs_mount(src) as mnt1, zfs_mount(dst) as mnt2: + self.assertTrue( + filecmp.cmp( + os.path.join(mnt1, name), os.path.join(mnt2, name), False)) + + def test_recv_incremental(self): + src1 = ZFSTest.pool.makeName(b"fs1@snap1") + src2 = ZFSTest.pool.makeName(b"fs1@snap2") + dst1 = ZFSTest.pool.makeName(b"fs2/received-2@snap1") + dst2 = ZFSTest.pool.makeName(b"fs2/received-2@snap2") + + lzc.lzc_snapshot([src1]) + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")) as name: + lzc.lzc_snapshot([src2]) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src1, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(dst1, stream.fileno()) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src2, src1, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(dst2, stream.fileno()) + + name = os.path.basename(name) + with zfs_mount(src2) as mnt1, zfs_mount(dst2) as mnt2: + self.assertTrue( + filecmp.cmp( + os.path.join(mnt1, name), os.path.join(mnt2, name), False)) + + # This test case fails unless a patch from + # https://clusterhq.atlassian.net/browse/ZFS-20 + # is applied to libzfs_core, otherwise it succeeds. 
+ @unittest.skip("fails with unpatched libzfs_core") + def test_recv_without_explicit_snap_name(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-100") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dstfs, full.fileno()) + lzc.lzc_receive(dstfs, incr.fileno()) + self.assertExists(dst1) + self.assertExists(dst2) + + def test_recv_clone(self): + orig_src = ZFSTest.pool.makeName(b"fs2@send-origin") + clone = ZFSTest.pool.makeName(b"fs1/fs/send-clone") + clone_snap = clone + b"@snap" + orig_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-origin@snap") + clone_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-clone@snap") + + lzc.lzc_snapshot([orig_src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(orig_src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(orig_dst, stream.fileno()) + + lzc.lzc_clone(clone, orig_src) + lzc.lzc_snapshot([clone_snap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(clone_snap, orig_src, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(clone_dst, stream.fileno(), origin=orig_dst) + + def test_recv_full_already_existing_empty_fs(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-3") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + lzc.lzc_create(dstfs) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(( + lzc_exc.DestinationModified, lzc_exc.DatasetExists)): + lzc.lzc_receive(dst, stream.fileno()) + + def test_recv_full_into_root_empty_pool(self): + empty_pool = None + try: + srcfs = ZFSTest.pool.makeName(b"fs1") + empty_pool = _TempPool() + dst = empty_pool.makeName(b'@snap') + + with streams(srcfs, b"snap", None) as (_, (stream, _)): + with self.assertRaises(( + lzc_exc.DestinationModified, lzc_exc.DatasetExists)): + lzc.lzc_receive(dst, stream.fileno()) + finally: + if empty_pool is not None: + empty_pool.cleanUp() + + def test_recv_full_into_ro_pool(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + dst = ZFSTest.readonly_pool.makeName(b'fs2/received@snap') + + with streams(srcfs, b"snap", None) as (_, (stream, _)): + with self.assertRaises(lzc_exc.ReadOnlyPool): + lzc.lzc_receive(dst, stream.fileno()) + + def test_recv_full_already_existing_modified_fs(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-5") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + lzc.lzc_create(dstfs) + with temp_file_in_fs(dstfs): + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(( + lzc_exc.DestinationModified, lzc_exc.DatasetExists)): + lzc.lzc_receive(dst, stream.fileno()) + + def test_recv_full_already_existing_with_snapshots(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-4") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + lzc.lzc_create(dstfs) + lzc.lzc_snapshot([dstfs + b"@snap1"]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(( + 
lzc_exc.StreamMismatch, lzc_exc.DatasetExists)): + lzc.lzc_receive(dst, stream.fileno()) + + def test_recv_full_already_existing_snapshot(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-6") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + lzc.lzc_create(dstfs) + lzc.lzc_snapshot([dst]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.DatasetExists): + lzc.lzc_receive(dst, stream.fileno()) + + def test_recv_full_missing_parent_fs(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dst = ZFSTest.pool.makeName(b"fs2/nonexistent/fs@snap") + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_receive(dst, stream.fileno()) + + def test_recv_full_but_specify_origin(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src = srcfs + b"@snap" + dstfs = ZFSTest.pool.makeName(b"fs2/received-30") + dst = dstfs + b'@snap' + origin1 = ZFSTest.pool.makeName(b"fs2@snap1") + origin2 = ZFSTest.pool.makeName(b"fs2@snap2") + + lzc.lzc_snapshot([origin1]) + with streams(srcfs, src, None) as (_, (stream, _)): + lzc.lzc_receive(dst, stream.fileno(), origin=origin1) + origin = ZFSTest.pool.getFilesystem( + b"fs2/received-30").getProperty('origin') + self.assertEqual(origin, origin1) + stream.seek(0) + # because origin snap does not exist can't receive as a clone of it + with self.assertRaises(( + lzc_exc.DatasetNotFound, + lzc_exc.BadStream)): + lzc.lzc_receive(dst, stream.fileno(), origin=origin2) + + def test_recv_full_existing_empty_fs_and_origin(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src = srcfs + b"@snap" + dstfs = ZFSTest.pool.makeName(b"fs2/received-31") + dst = dstfs + b'@snap' + origin = dstfs + b'@dummy' + + lzc.lzc_create(dstfs) + with streams(srcfs, src, None) as (_, (stream, _)): + # because the destination fs already exists and has no snaps + with self.assertRaises(( + lzc_exc.DestinationModified, + lzc_exc.DatasetExists, + lzc_exc.BadStream)): + lzc.lzc_receive(dst, stream.fileno(), origin=origin) + lzc.lzc_snapshot([origin]) + stream.seek(0) + # because the destination fs already exists and has the snap + with self.assertRaises(( + lzc_exc.StreamMismatch, + lzc_exc.DatasetExists, + lzc_exc.BadStream)): + lzc.lzc_receive(dst, stream.fileno(), origin=origin) + + def test_recv_incremental_mounted_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-7") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with zfs_mount(dstfs): + lzc.lzc_receive(dst2, incr.fileno()) + + def test_recv_incremental_modified_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-15") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + with self.assertRaises(lzc_exc.DestinationModified): + lzc.lzc_receive(dst2, incr.fileno()) + + def test_recv_incremental_snapname_used(self): + srcfs 
= ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-8") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + lzc.lzc_snapshot([dst2]) + with self.assertRaises(lzc_exc.DatasetExists): + lzc.lzc_receive(dst2, incr.fileno()) + + def test_recv_incremental_more_recent_snap_with_no_changes(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-9") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + lzc.lzc_snapshot([dst_snap]) + lzc.lzc_receive(dst2, incr.fileno()) + + def test_recv_incremental_non_clone_but_set_origin(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-20") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + lzc.lzc_snapshot([dst_snap]) + # because cannot receive incremental and set origin on a non-clone + with self.assertRaises(lzc_exc.BadStream): + lzc.lzc_receive(dst2, incr.fileno(), origin=dst1) + + def test_recv_incremental_non_clone_but_set_random_origin(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-21") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + lzc.lzc_snapshot([dst_snap]) + # because origin snap does not exist can't receive as a clone of it + with self.assertRaises(( + lzc_exc.DatasetNotFound, + lzc_exc.BadStream)): + lzc.lzc_receive( + dst2, incr.fileno(), + origin=ZFSTest.pool.makeName(b"fs2/fs@snap")) + + def test_recv_incremental_more_recent_snap(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-10") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + lzc.lzc_snapshot([dst_snap]) + with self.assertRaises(lzc_exc.DestinationModified): + lzc.lzc_receive(dst2, incr.fileno()) + + def test_recv_incremental_duplicate(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-11") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + lzc.lzc_receive(dst2, incr.fileno()) + incr.seek(0) + with self.assertRaises(lzc_exc.DestinationModified): + lzc.lzc_receive(dst_snap, incr.fileno()) + + def test_recv_incremental_unrelated_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-12") + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (_, incr)): + lzc.lzc_create(dstfs) + with self.assertRaises(lzc_exc.StreamMismatch): + 
lzc.lzc_receive(dst_snap, incr.fileno()) + + def test_recv_incremental_nonexistent_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-13") + dst_snap = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (_, incr)): + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_receive(dst_snap, incr.fileno()) + + def test_recv_incremental_same_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + src_snap = srcfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (_, incr)): + with self.assertRaises(lzc_exc.DestinationModified): + lzc.lzc_receive(src_snap, incr.fileno()) + + def test_recv_clone_without_specifying_origin(self): + orig_src = ZFSTest.pool.makeName(b"fs2@send-origin-2") + clone = ZFSTest.pool.makeName(b"fs1/fs/send-clone-2") + clone_snap = clone + b"@snap" + orig_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-origin-2@snap") + clone_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-clone-2@snap") + + lzc.lzc_snapshot([orig_src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(orig_src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(orig_dst, stream.fileno()) + + lzc.lzc_clone(clone, orig_src) + lzc.lzc_snapshot([clone_snap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(clone_snap, orig_src, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.BadStream): + lzc.lzc_receive(clone_dst, stream.fileno()) + + def test_recv_clone_invalid_origin(self): + orig_src = ZFSTest.pool.makeName(b"fs2@send-origin-3") + clone = ZFSTest.pool.makeName(b"fs1/fs/send-clone-3") + clone_snap = clone + b"@snap" + orig_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-origin-3@snap") + clone_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-clone-3@snap") + + lzc.lzc_snapshot([orig_src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(orig_src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(orig_dst, stream.fileno()) + + lzc.lzc_clone(clone, orig_src) + lzc.lzc_snapshot([clone_snap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(clone_snap, orig_src, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_receive( + clone_dst, stream.fileno(), + origin=ZFSTest.pool.makeName(b"fs1/fs")) + + def test_recv_clone_wrong_origin(self): + orig_src = ZFSTest.pool.makeName(b"fs2@send-origin-4") + clone = ZFSTest.pool.makeName(b"fs1/fs/send-clone-4") + clone_snap = clone + b"@snap" + orig_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-origin-4@snap") + clone_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-clone-4@snap") + wrong_origin = ZFSTest.pool.makeName(b"fs1/fs@snap") + + lzc.lzc_snapshot([orig_src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(orig_src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(orig_dst, stream.fileno()) + + lzc.lzc_clone(clone, orig_src) + lzc.lzc_snapshot([clone_snap]) + lzc.lzc_snapshot([wrong_origin]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(clone_snap, orig_src, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.StreamMismatch): + lzc.lzc_receive( + clone_dst, stream.fileno(), origin=wrong_origin) + + def test_recv_clone_nonexistent_origin(self): + orig_src = ZFSTest.pool.makeName(b"fs2@send-origin-5") + clone = ZFSTest.pool.makeName(b"fs1/fs/send-clone-5") + clone_snap = clone + b"@snap" + 
orig_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-origin-5@snap") + clone_dst = ZFSTest.pool.makeName(b"fs1/fs/recv-clone-5@snap") + wrong_origin = ZFSTest.pool.makeName(b"fs1/fs@snap") + + lzc.lzc_snapshot([orig_src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(orig_src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(orig_dst, stream.fileno()) + + lzc.lzc_clone(clone, orig_src) + lzc.lzc_snapshot([clone_snap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(clone_snap, orig_src, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_receive( + clone_dst, stream.fileno(), origin=wrong_origin) + + def test_force_recv_full_existing_fs(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-50") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + + lzc.lzc_create(dstfs) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(dst, stream.fileno(), force=True) + + def test_force_recv_full_existing_modified_mounted_fs(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-53") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + + lzc.lzc_create(dstfs) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with zfs_mount(dstfs) as mntdir: + f = tempfile.NamedTemporaryFile(dir=mntdir, delete=False) + for i in range(1024): + f.write(b'x' * 1024) + lzc.lzc_receive(dst, stream.fileno(), force=True) + # The temporary file disappears and any access, even close(), + # results in EIO. + self.assertFalse(os.path.exists(f.name)) + with self.assertRaises(IOError): + f.close() + + # This test-case expects the behavior that should be there, + # at the moment it may fail with DatasetExists or StreamMismatch + # depending on the implementation. 
+ def test_force_recv_full_already_existing_with_snapshots(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-51") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + + lzc.lzc_create(dstfs) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_snapshot([dstfs + b"@snap1"]) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(dst, stream.fileno(), force=True) + + def test_force_recv_full_already_existing_with_same_snap(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.makeName(b"fs2/received-52") + dst = dstfs + b'@snap' + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + + lzc.lzc_create(dstfs) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_snapshot([dst]) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.DatasetExists): + lzc.lzc_receive(dst, stream.fileno(), force=True) + + def test_force_recv_full_missing_parent_fs(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dst = ZFSTest.pool.makeName(b"fs2/nonexistent/fs@snap") + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")): + lzc.lzc_snapshot([src]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_receive(dst, stream.fileno(), force=True) + + def test_force_recv_incremental_modified_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-60") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_receive(dst2, incr.fileno(), force=True) + + def test_force_recv_incremental_modified_mounted_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-64") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with zfs_mount(dstfs) as mntdir: + f = tempfile.NamedTemporaryFile(dir=mntdir, delete=False) + for i in range(1024): + f.write(b'x' * 1024) + lzc.lzc_receive(dst2, incr.fileno(), force=True) + # The temporary file disappears and any access, even close(), + # results in EIO. 
+ self.assertFalse(os.path.exists(f.name)) + with self.assertRaises(IOError): + f.close() + + def test_force_recv_incremental_modified_fs_plus_later_snap(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-61") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst3 = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_snapshot([dst3]) + lzc.lzc_receive(dst2, incr.fileno(), force=True) + self.assertExists(dst1) + self.assertExists(dst2) + self.assertNotExists(dst3) + + def test_force_recv_incremental_modified_fs_plus_same_name_snap(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-62") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_snapshot([dst2]) + with self.assertRaises(lzc_exc.DatasetExists): + lzc.lzc_receive(dst2, incr.fileno(), force=True) + + def test_force_recv_incremental_modified_fs_plus_held_snap(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-63") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst3 = dstfs + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_snapshot([dst3]) + with cleanup_fd() as cfd: + lzc.lzc_hold({dst3: b'tag'}, cfd) + with self.assertRaises(lzc_exc.DatasetBusy): + lzc.lzc_receive(dst2, incr.fileno(), force=True) + self.assertExists(dst1) + self.assertNotExists(dst2) + self.assertExists(dst3) + + def test_force_recv_incremental_modified_fs_plus_cloned_snap(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-70") + dst1 = dstfs + b'@snap1' + dst2 = dstfs + b'@snap2' + dst3 = dstfs + b'@snap' + cloned = ZFSTest.pool.makeName(b"fs2/received-cloned-70") + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + with temp_file_in_fs(dstfs): + pass # enough to taint the fs + lzc.lzc_snapshot([dst3]) + lzc.lzc_clone(cloned, dst3) + with self.assertRaises(lzc_exc.DatasetExists): + lzc.lzc_receive(dst2, incr.fileno(), force=True) + self.assertExists(dst1) + self.assertNotExists(dst2) + self.assertExists(dst3) + + def test_recv_incremental_into_cloned_fs(self): + srcfs = ZFSTest.pool.makeName(b"fs1") + src1 = srcfs + b"@snap1" + src2 = srcfs + b"@snap2" + dstfs = ZFSTest.pool.makeName(b"fs2/received-71") + dst1 = dstfs + b'@snap1' + cloned = ZFSTest.pool.makeName(b"fs2/received-cloned-71") + dst2 = cloned + b'@snap' + + with streams(srcfs, src1, src2) as (_, (full, incr)): + lzc.lzc_receive(dst1, full.fileno()) + lzc.lzc_clone(cloned, dst1) + # test both graceful and with-force attempts + with self.assertRaises(lzc_exc.StreamMismatch): + lzc.lzc_receive(dst2, incr.fileno()) + incr.seek(0) + with self.assertRaises(lzc_exc.StreamMismatch): + lzc.lzc_receive(dst2, incr.fileno(), force=True) + self.assertExists(dst1) + self.assertNotExists(dst2) + + def test_recv_with_header_full(self): + src = 
ZFSTest.pool.makeName(b"fs1@snap") + dst = ZFSTest.pool.makeName(b"fs2/received") + + with temp_file_in_fs(ZFSTest.pool.makeName(b"fs1")) as name: + lzc.lzc_snapshot([src]) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + + (header, c_header) = lzc.receive_header(stream.fileno()) + self.assertEqual(src, header['drr_toname']) + snap = header['drr_toname'].split(b'@', 1)[1] + lzc.lzc_receive_with_header( + dst + b'@' + snap, stream.fileno(), c_header) + + name = os.path.basename(name) + with zfs_mount(src) as mnt1, zfs_mount(dst) as mnt2: + self.assertTrue( + filecmp.cmp( + os.path.join(mnt1, name), os.path.join(mnt2, name), False)) + + def test_recv_fs_below_zvol(self): + send = ZFSTest.pool.makeName(b"fs1@snap") + zvol = ZFSTest.pool.makeName(b"fs1/zvol") + dest = zvol + b"/fs@snap" + props = {b"volsize": 1024 * 1024} + + lzc.lzc_snapshot([send]) + lzc.lzc_create(zvol, ds_type='zvol', props=props) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(send, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.WrongParent): + lzc.lzc_receive(dest, stream.fileno()) + + def test_recv_zvol_over_fs_with_children(self): + parent = ZFSTest.pool.makeName(b"fs1") + child = parent + b"subfs" + zvol = ZFSTest.pool.makeName(b"fs1/zvol") + send = zvol + b"@snap" + props = {b"volsize": 1024 * 1024} + + lzc.lzc_create(child) + lzc.lzc_create(zvol, ds_type='zvol', props=props) + lzc.lzc_snapshot([send]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(send, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.WrongParent): + lzc.lzc_receive(parent + b"@snap", stream.fileno(), force=True) + + def test_recv_zvol_overwrite_rootds(self): + zvol = ZFSTest.pool.makeName(b"fs1/zvol") + snap = zvol + b"@snap" + rootds = ZFSTest.pool.getRoot().getName() + props = {b"volsize": 1024 * 1024} + + lzc.lzc_create(zvol, ds_type='zvol', props=props) + lzc.lzc_snapshot([snap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(snap, None, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.WrongParent): + lzc.lzc_receive(rootds + b"@snap", stream.fileno(), force=True) + + def test_send_full_across_clone_branch_point(self): + origfs = ZFSTest.pool.makeName(b"fs2") + + (_, (fromsnap, origsnap, _)) = make_snapshots( + origfs, b"snap1", b"send-origin-20", None) + + clonefs = ZFSTest.pool.makeName(b"fs1/fs/send-clone-20") + lzc.lzc_clone(clonefs, origsnap) + + (_, (_, tosnap, _)) = make_snapshots(clonefs, None, b"snap", None) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(tosnap, None, stream.fileno()) + + def test_send_incr_across_clone_branch_point(self): + origfs = ZFSTest.pool.makeName(b"fs2") + + (_, (fromsnap, origsnap, _)) = make_snapshots( + origfs, b"snap1", b"send-origin-21", None) + + clonefs = ZFSTest.pool.makeName(b"fs1/fs/send-clone-21") + lzc.lzc_clone(clonefs, origsnap) + + (_, (_, tosnap, _)) = make_snapshots(clonefs, None, b"snap", None) + + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(tosnap, fromsnap, stream.fileno()) + + def test_send_resume_token_full(self): + src = ZFSTest.pool.makeName(b"fs1@snap") + dstfs = ZFSTest.pool.getFilesystem(b"fs2/received") + dst = dstfs.getSnap() + + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + for i in range(1, 10): + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + f.write(b'x' * 1024 * i) + f.flush() + 
lzc.lzc_snapshot([src]) + + with tempfile.NamedTemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(src, None, stream.fileno()) + stream.seek(0) + stream.truncate(1024 * 3) + with self.assertRaises(lzc_exc.StreamTruncated): + lzc.lzc_receive_resumable(dst, stream.fileno()) + # Resume token code from zfs_send_resume_token_to_nvlist() + # XXX: if used more than twice move this code into an external func + # format: <version>-<cksum>-<packed-size>-<compressed payload> + token = dstfs.getProperty("receive_resume_token") + self.assertNotEqual(token, b'-') + tokens = token.split(b'-') + self.assertEqual(len(tokens), 4) + version = tokens[0] + packed_size = int(tokens[2], 16) + compressed_nvs = tokens[3] + # Validate resume token + self.assertEqual(version, b'1') # ZFS_SEND_RESUME_TOKEN_VERSION + if sys.version_info < (3, 0): + payload = ( + zlib.decompress(str(bytearray.fromhex(compressed_nvs))) + ) + else: + payload = ( + zlib.decompress(bytearray.fromhex(compressed_nvs.decode())) + ) + self.assertEqual(len(payload), packed_size) + # Unpack + resume_values = packed_nvlist_out(payload, packed_size) + resumeobj = resume_values.get(b'object') + resumeoff = resume_values.get(b'offset') + with tempfile.NamedTemporaryFile(suffix='.zstream') as rstream: + lzc.lzc_send_resume( + src, None, rstream.fileno(), None, resumeobj, resumeoff) + rstream.seek(0) + lzc.lzc_receive_resumable(dst, rstream.fileno()) + + def test_send_resume_token_incremental(self): + snap1 = ZFSTest.pool.makeName(b"fs1@snap1") + snap2 = ZFSTest.pool.makeName(b"fs1@snap2") + dstfs = ZFSTest.pool.getFilesystem(b"fs2/received") + dst1 = dstfs.getSnap() + dst2 = dstfs.getSnap() + + lzc.lzc_snapshot([snap1]) + with tempfile.NamedTemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(snap1, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(dst1, stream.fileno()) + + with zfs_mount(ZFSTest.pool.makeName(b"fs1")) as mntdir: + for i in range(1, 10): + with tempfile.NamedTemporaryFile(dir=mntdir) as f: + f.write(b'x' * 1024 * i) + f.flush() + lzc.lzc_snapshot([snap2]) + + with tempfile.NamedTemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(snap2, snap1, stream.fileno()) + stream.seek(0) + stream.truncate(1024 * 3) + with self.assertRaises(lzc_exc.StreamTruncated): + lzc.lzc_receive_resumable(dst2, stream.fileno()) + # Resume token code from zfs_send_resume_token_to_nvlist() + # format: <version>-<cksum>-<packed-size>-<compressed payload> + token = dstfs.getProperty("receive_resume_token") + self.assertNotEqual(token, b'-') + tokens = token.split(b'-') + self.assertEqual(len(tokens), 4) + version = tokens[0] + packed_size = int(tokens[2], 16) + compressed_nvs = tokens[3] + # Validate resume token + self.assertEqual(version, b'1') # ZFS_SEND_RESUME_TOKEN_VERSION + if sys.version_info < (3, 0): + payload = ( + zlib.decompress(str(bytearray.fromhex(compressed_nvs))) + ) + else: + payload = ( + zlib.decompress(bytearray.fromhex(compressed_nvs.decode())) + ) + self.assertEqual(len(payload), packed_size) + # Unpack + resume_values = packed_nvlist_out(payload, packed_size) + resumeobj = resume_values.get(b'object') + resumeoff = resume_values.get(b'offset') + with tempfile.NamedTemporaryFile(suffix='.zstream') as rstream: + lzc.lzc_send_resume( + snap2, snap1, rstream.fileno(), None, resumeobj, resumeoff) + rstream.seek(0) + lzc.lzc_receive_resumable(dst2, rstream.fileno()) + + def test_recv_full_across_clone_branch_point(self): + origfs = ZFSTest.pool.makeName(b"fs2") + + (_, (fromsnap, origsnap, _)) = make_snapshots( + origfs, b"snap1", b"send-origin-30", None) + + clonefs = 
ZFSTest.pool.makeName(b"fs1/fs/send-clone-30") + lzc.lzc_clone(clonefs, origsnap) + + (_, (_, tosnap, _)) = make_snapshots(clonefs, None, b"snap", None) + + recvfs = ZFSTest.pool.makeName(b"fs1/recv-clone-30") + recvsnap = recvfs + b"@snap" + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(tosnap, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(recvsnap, stream.fileno()) + + def test_recv_one(self): + fromsnap = ZFSTest.pool.makeName(b"fs1@snap1") + tosnap = ZFSTest.pool.makeName(b"recv@snap1") + + lzc.lzc_snapshot([fromsnap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + lzc.lzc_receive_one(tosnap, stream.fileno(), c_header) + + def test_recv_one_size(self): + fromsnap = ZFSTest.pool.makeName(b"fs1@snap1") + tosnap = ZFSTest.pool.makeName(b"recv@snap1") + + lzc.lzc_snapshot([fromsnap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + size = os.fstat(stream.fileno()).st_size + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + (read, _) = lzc.lzc_receive_one(tosnap, stream.fileno(), c_header) + self.assertAlmostEqual(read, size, delta=read * 0.05) + + def test_recv_one_props(self): + fromsnap = ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.getFilesystem(b"recv") + tosnap = fs.getName() + b"@snap1" + props = { + b"compression": 0x01, + b"ns:prop": b"val" + } + + lzc.lzc_snapshot([fromsnap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + lzc.lzc_receive_one(tosnap, stream.fileno(), c_header, props=props) + self.assertExists(tosnap) + self.assertEqual(fs.getProperty("compression", "received"), b"on") + self.assertEqual(fs.getProperty("ns:prop", "received"), b"val") + + def test_recv_one_invalid_prop(self): + fromsnap = ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.getFilesystem(b"recv") + tosnap = fs.getName() + b"@snap1" + props = { + b"exec": 0xff, + b"atime": 0x00 + } + + lzc.lzc_snapshot([fromsnap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + with self.assertRaises(lzc_exc.ReceivePropertyFailure) as ctx: + lzc.lzc_receive_one( + tosnap, stream.fileno(), c_header, props=props) + self.assertExists(tosnap) + self.assertEqual(fs.getProperty("atime", "received"), b"off") + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PropertyInvalid) + self.assertEqual(e.name, b"exec") + + def test_recv_with_cmdprops(self): + fromsnap = ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.getFilesystem(b"recv") + tosnap = fs.getName() + b"@snap1" + props = {} + cmdprops = { + b"compression": 0x01, + b"ns:prop": b"val" + } + + lzc.lzc_snapshot([fromsnap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + lzc.lzc_receive_with_cmdprops( + tosnap, stream.fileno(), c_header, props=props, + cmdprops=cmdprops) + self.assertExists(tosnap) + self.assertEqual(fs.getProperty("compression"), b"on") + self.assertEqual(fs.getProperty("ns:prop"), b"val") + + def test_recv_with_cmdprops_and_recvprops(self): + fromsnap 
= ZFSTest.pool.makeName(b"fs1@snap1") + fs = ZFSTest.pool.getFilesystem(b"recv") + tosnap = fs.getName() + b"@snap1" + props = { + b"atime": 0x01, + b"exec": 0x00, + b"ns:prop": b"abc" + } + cmdprops = { + b"compression": 0x01, + b"ns:prop": b"def", + b"exec": None, + } + + lzc.lzc_snapshot([fromsnap]) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + (header, c_header) = lzc.receive_header(stream.fileno()) + lzc.lzc_receive_with_cmdprops( + tosnap, stream.fileno(), c_header, props=props, + cmdprops=cmdprops) + self.assertExists(tosnap) + self.assertEqual(fs.getProperty("atime", True), b"on") + self.assertEqual(fs.getProperty("exec", True), b"off") + self.assertEqual(fs.getProperty("ns:prop", True), b"abc") + self.assertEqual(fs.getProperty("compression"), b"on") + self.assertEqual(fs.getProperty("ns:prop"), b"def") + self.assertEqual(fs.getProperty("exec"), b"on") + + def test_recv_incr_across_clone_branch_point_no_origin(self): + origfs = ZFSTest.pool.makeName(b"fs2") + + (_, (fromsnap, origsnap, _)) = make_snapshots( + origfs, b"snap1", b"send-origin-32", None) + + clonefs = ZFSTest.pool.makeName(b"fs1/fs/send-clone-32") + lzc.lzc_clone(clonefs, origsnap) + + (_, (_, tosnap, _)) = make_snapshots(clonefs, None, b"snap", None) + + recvfs = ZFSTest.pool.makeName(b"fs1/recv-clone-32") + recvsnap1 = recvfs + b"@snap1" + recvsnap2 = recvfs + b"@snap2" + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(recvsnap1, stream.fileno()) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(tosnap, fromsnap, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.BadStream): + lzc.lzc_receive(recvsnap2, stream.fileno()) + + def test_recv_incr_across_clone_branch_point(self): + origfs = ZFSTest.pool.makeName(b"fs2") + + (_, (fromsnap, origsnap, _)) = make_snapshots( + origfs, b"snap1", b"send-origin-31", None) + + clonefs = ZFSTest.pool.makeName(b"fs1/fs/send-clone-31") + lzc.lzc_clone(clonefs, origsnap) + + (_, (_, tosnap, _)) = make_snapshots(clonefs, None, b"snap", None) + + recvfs = ZFSTest.pool.makeName(b"fs1/recv-clone-31") + recvsnap1 = recvfs + b"@snap1" + recvsnap2 = recvfs + b"@snap2" + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(recvsnap1, stream.fileno()) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(tosnap, fromsnap, stream.fileno()) + stream.seek(0) + with self.assertRaises(lzc_exc.BadStream): + lzc.lzc_receive(recvsnap2, stream.fileno(), origin=recvsnap1) + + def test_recv_incr_across_clone_branch_point_new_fs(self): + origfs = ZFSTest.pool.makeName(b"fs2") + + (_, (fromsnap, origsnap, _)) = make_snapshots( + origfs, b"snap1", b"send-origin-33", None) + + clonefs = ZFSTest.pool.makeName(b"fs1/fs/send-clone-33") + lzc.lzc_clone(clonefs, origsnap) + + (_, (_, tosnap, _)) = make_snapshots(clonefs, None, b"snap", None) + + recvfs1 = ZFSTest.pool.makeName(b"fs1/recv-clone-33") + recvsnap1 = recvfs1 + b"@snap" + recvfs2 = ZFSTest.pool.makeName(b"fs1/recv-clone-33_2") + recvsnap2 = recvfs2 + b"@snap" + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(fromsnap, None, stream.fileno()) + stream.seek(0) + lzc.lzc_receive(recvsnap1, stream.fileno()) + with tempfile.TemporaryFile(suffix='.zstream') as stream: + lzc.lzc_send(tosnap, fromsnap, 
stream.fileno()) + stream.seek(0) + lzc.lzc_receive(recvsnap2, stream.fileno(), origin=recvsnap1) + + def test_recv_bad_stream(self): + dstfs = ZFSTest.pool.makeName(b"fs2/received") + dst_snap = dstfs + b'@snap' + + with dev_zero() as fd: + with self.assertRaises(lzc_exc.BadStream): + lzc.lzc_receive(dst_snap, fd) + + @needs_support(lzc.lzc_promote) + def test_promote(self): + origfs = ZFSTest.pool.makeName(b"fs2") + snap = b"@promote-snap-1" + origsnap = origfs + snap + lzc.lzc_snap([origsnap]) + + clonefs = ZFSTest.pool.makeName(b"fs1/fs/promote-clone-1") + lzc.lzc_clone(clonefs, origsnap) + + lzc.lzc_promote(clonefs) + # the snapshot now should belong to the promoted fs + self.assertExists(clonefs + snap) + + @needs_support(lzc.lzc_promote) + def test_promote_too_long_snapname(self): + # origfs name must be shorter than clonefs name + origfs = ZFSTest.pool.makeName(b"fs2") + clonefs = ZFSTest.pool.makeName(b"fs1/fs/promote-clone-2") + snapprefix = b"@promote-snap-2-" + pad_len = 1 + lzc.MAXNAMELEN - len(clonefs) - len(snapprefix) + snap = snapprefix + b'x' * pad_len + origsnap = origfs + snap + + lzc.lzc_snap([origsnap]) + lzc.lzc_clone(clonefs, origsnap) + + # This may fail on older buggy systems. + # See: https://www.illumos.org/issues/5909 + with self.assertRaises(lzc_exc.NameTooLong): + lzc.lzc_promote(clonefs) + + @needs_support(lzc.lzc_promote) + def test_promote_not_cloned(self): + fs = ZFSTest.pool.makeName(b"fs2") + with self.assertRaises(lzc_exc.NotClone): + lzc.lzc_promote(fs) + + @unittest.skipIf(*illumos_bug_6379()) + def test_hold_bad_fd(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile() as tmp: + bad_fd = tmp.fileno() + + with self.assertRaises(lzc_exc.BadHoldCleanupFD): + lzc.lzc_hold({snap: b'tag'}, bad_fd) + + @unittest.skipIf(*illumos_bug_6379()) + def test_hold_bad_fd_2(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with self.assertRaises(lzc_exc.BadHoldCleanupFD): + lzc.lzc_hold({snap: b'tag'}, -2) + + @unittest.skipIf(*illumos_bug_6379()) + def test_hold_bad_fd_3(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + (soft, hard) = resource.getrlimit(resource.RLIMIT_NOFILE) + bad_fd = hard + 1 + with self.assertRaises(lzc_exc.BadHoldCleanupFD): + lzc.lzc_hold({snap: b'tag'}, bad_fd) + + @unittest.skipIf(*illumos_bug_6379()) + def test_hold_wrong_fd(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with tempfile.TemporaryFile() as tmp: + fd = tmp.fileno() + with self.assertRaises(lzc_exc.BadHoldCleanupFD): + lzc.lzc_hold({snap: b'tag'}, fd) + + def test_hold_fd(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag'}, fd) + + def test_hold_empty(self): + with cleanup_fd() as fd: + lzc.lzc_hold({}, fd) + + def test_hold_empty_2(self): + lzc.lzc_hold({}) + + def test_hold_vs_snap_destroy(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag'}, fd) + + with self.assertRaises(lzc_exc.SnapshotDestructionFailure) as ctx: + lzc.lzc_destroy_snaps([snap], defer=False) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.SnapshotIsHeld) + + lzc.lzc_destroy_snaps([snap], defer=True) + self.assertExists(snap) + + # after automatic hold cleanup and deferred destruction + self.assertNotExists(snap) + + def test_hold_many_tags(self): + snap = 
ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag1'}, fd) + lzc.lzc_hold({snap: b'tag2'}, fd) + + def test_hold_many_snaps(self): + snap1 = ZFSTest.pool.getRoot().getSnap() + snap2 = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap1: b'tag', snap2: b'tag'}, fd) + + def test_hold_many_with_one_missing(self): + snap1 = ZFSTest.pool.getRoot().getSnap() + snap2 = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap1]) + + with cleanup_fd() as fd: + missing = lzc.lzc_hold({snap1: b'tag', snap2: b'tag'}, fd) + self.assertEqual(len(missing), 1) + self.assertEqual(missing[0], snap2) + + def test_hold_many_with_all_missing(self): + snap1 = ZFSTest.pool.getRoot().getSnap() + snap2 = ZFSTest.pool.getRoot().getSnap() + + with cleanup_fd() as fd: + missing = lzc.lzc_hold({snap1: b'tag', snap2: b'tag'}, fd) + self.assertEqual(len(missing), 2) + self.assertEqual(sorted(missing), sorted([snap1, snap2])) + + def test_hold_missing_fs(self): + # XXX skip pre-created filesystems + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + snap = ZFSTest.pool.getRoot().getFilesystem().getSnap() + + snaps = lzc.lzc_hold({snap: b'tag'}) + self.assertEqual([snap], snaps) + + def test_hold_missing_fs_auto_cleanup(self): + # XXX skip pre-created filesystems + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + ZFSTest.pool.getRoot().getFilesystem() + snap = ZFSTest.pool.getRoot().getFilesystem().getSnap() + + with cleanup_fd() as fd: + snaps = lzc.lzc_hold({snap: b'tag'}, fd) + self.assertEqual([snap], snaps) + + def test_hold_duplicate(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag'}, fd) + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap: b'tag'}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.HoldExists) + + def test_hold_across_pools(self): + snap1 = ZFSTest.pool.getRoot().getSnap() + snap2 = ZFSTest.misc_pool.getRoot().getSnap() + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2]) + + with cleanup_fd() as fd: + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap1: b'tag', snap2: b'tag'}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PoolsDiffer) + + def test_hold_too_long_tag(self): + snap = ZFSTest.pool.getRoot().getSnap() + tag = b't' * 256 + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap: tag}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + self.assertEqual(e.name, tag) + + # Apparently the full snapshot name is not checked for length + # and this snapshot is treated as simply missing. 
+ @unittest.expectedFailure + def test_hold_too_long_snap_name(self): + snap = ZFSTest.pool.getRoot().getTooLongSnap(False) + with cleanup_fd() as fd: + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap: b'tag'}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + self.assertEqual(e.name, snap) + + def test_hold_too_long_snap_name_2(self): + snap = ZFSTest.pool.getRoot().getTooLongSnap(True) + with cleanup_fd() as fd: + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap: b'tag'}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + self.assertEqual(e.name, snap) + + def test_hold_invalid_snap_name(self): + snap = ZFSTest.pool.getRoot().getSnap() + b'@bad' + with cleanup_fd() as fd: + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap: b'tag'}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + self.assertEqual(e.name, snap) + + def test_hold_invalid_snap_name_2(self): + snap = ZFSTest.pool.getRoot().getFilesystem().getName() + with cleanup_fd() as fd: + with self.assertRaises(lzc_exc.HoldFailure) as ctx: + lzc.lzc_hold({snap: b'tag'}, fd) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + self.assertEqual(e.name, snap) + + def test_get_holds(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag1'}, fd) + lzc.lzc_hold({snap: b'tag2'}, fd) + + holds = lzc.lzc_get_holds(snap) + self.assertEqual(len(holds), 2) + self.assertIn(b'tag1', holds) + self.assertIn(b'tag2', holds) + self.assertIsInstance(holds[b'tag1'], int) + + def test_get_holds_after_auto_cleanup(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag1'}, fd) + lzc.lzc_hold({snap: b'tag2'}, fd) + + holds = lzc.lzc_get_holds(snap) + self.assertEqual(len(holds), 0) + self.assertIsInstance(holds, dict) + + def test_get_holds_nonexistent_snap(self): + snap = ZFSTest.pool.getRoot().getSnap() + with self.assertRaises(lzc_exc.SnapshotNotFound): + lzc.lzc_get_holds(snap) + + def test_get_holds_too_long_snap_name(self): + snap = ZFSTest.pool.getRoot().getTooLongSnap(False) + with self.assertRaises(lzc_exc.NameTooLong): + lzc.lzc_get_holds(snap) + + def test_get_holds_too_long_snap_name_2(self): + snap = ZFSTest.pool.getRoot().getTooLongSnap(True) + with self.assertRaises(lzc_exc.NameTooLong): + lzc.lzc_get_holds(snap) + + def test_get_holds_invalid_snap_name(self): + snap = ZFSTest.pool.getRoot().getSnap() + b'@bad' + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_get_holds(snap) + + # A filesystem-like snapshot name is not recognized as + # an invalid name. 
+ @unittest.expectedFailure + def test_get_holds_invalid_snap_name_2(self): + snap = ZFSTest.pool.getRoot().getFilesystem().getName() + with self.assertRaises(lzc_exc.NameInvalid): + lzc.lzc_get_holds(snap) + + def test_release_hold(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + lzc.lzc_hold({snap: b'tag'}) + ret = lzc.lzc_release({snap: [b'tag']}) + self.assertEqual(len(ret), 0) + + def test_release_hold_empty(self): + ret = lzc.lzc_release({}) + self.assertEqual(len(ret), 0) + + def test_release_hold_complex(self): + snap1 = ZFSTest.pool.getRoot().getSnap() + snap2 = ZFSTest.pool.getRoot().getSnap() + snap3 = ZFSTest.pool.getRoot().getFilesystem().getSnap() + lzc.lzc_snapshot([snap1]) + lzc.lzc_snapshot([snap2, snap3]) + + lzc.lzc_hold({snap1: b'tag1'}) + lzc.lzc_hold({snap1: b'tag2'}) + lzc.lzc_hold({snap2: b'tag'}) + lzc.lzc_hold({snap3: b'tag1'}) + lzc.lzc_hold({snap3: b'tag2'}) + + holds = lzc.lzc_get_holds(snap1) + self.assertEqual(len(holds), 2) + holds = lzc.lzc_get_holds(snap2) + self.assertEqual(len(holds), 1) + holds = lzc.lzc_get_holds(snap3) + self.assertEqual(len(holds), 2) + + release = { + snap1: [b'tag1', b'tag2'], + snap2: [b'tag'], + snap3: [b'tag2'], + } + ret = lzc.lzc_release(release) + self.assertEqual(len(ret), 0) + + holds = lzc.lzc_get_holds(snap1) + self.assertEqual(len(holds), 0) + holds = lzc.lzc_get_holds(snap2) + self.assertEqual(len(holds), 0) + holds = lzc.lzc_get_holds(snap3) + self.assertEqual(len(holds), 1) + + ret = lzc.lzc_release({snap3: [b'tag1']}) + self.assertEqual(len(ret), 0) + holds = lzc.lzc_get_holds(snap3) + self.assertEqual(len(holds), 0) + + def test_release_hold_before_auto_cleanup(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag'}, fd) + ret = lzc.lzc_release({snap: [b'tag']}) + self.assertEqual(len(ret), 0) + + def test_release_hold_and_snap_destruction(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag1'}, fd) + lzc.lzc_hold({snap: b'tag2'}, fd) + + lzc.lzc_destroy_snaps([snap], defer=True) + self.assertExists(snap) + + lzc.lzc_release({snap: [b'tag1']}) + self.assertExists(snap) + + lzc.lzc_release({snap: [b'tag2']}) + self.assertNotExists(snap) + + def test_release_hold_and_multiple_snap_destruction(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap: b'tag'}, fd) + + lzc.lzc_destroy_snaps([snap], defer=True) + self.assertExists(snap) + + lzc.lzc_destroy_snaps([snap], defer=True) + self.assertExists(snap) + + lzc.lzc_release({snap: [b'tag']}) + self.assertNotExists(snap) + + def test_release_hold_missing_tag(self): + snap = ZFSTest.pool.getRoot().getSnap() + lzc.lzc_snapshot([snap]) + + ret = lzc.lzc_release({snap: [b'tag']}) + self.assertEqual(len(ret), 1) + self.assertEqual(ret[0], snap + b'#tag') + + def test_release_hold_missing_snap(self): + snap = ZFSTest.pool.getRoot().getSnap() + + ret = lzc.lzc_release({snap: [b'tag']}) + self.assertEqual(len(ret), 1) + self.assertEqual(ret[0], snap) + + def test_release_hold_missing_snap_2(self): + snap = ZFSTest.pool.getRoot().getSnap() + + ret = lzc.lzc_release({snap: [b'tag', b'another']}) + self.assertEqual(len(ret), 1) + self.assertEqual(ret[0], snap) + + def test_release_hold_across_pools(self): + snap1 = ZFSTest.pool.getRoot().getSnap() + snap2 = ZFSTest.misc_pool.getRoot().getSnap() + lzc.lzc_snapshot([snap1]) + 
lzc.lzc_snapshot([snap2]) + + with cleanup_fd() as fd: + lzc.lzc_hold({snap1: b'tag'}, fd) + lzc.lzc_hold({snap2: b'tag'}, fd) + with self.assertRaises(lzc_exc.HoldReleaseFailure) as ctx: + lzc.lzc_release({snap1: [b'tag'], snap2: [b'tag']}) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.PoolsDiffer) + + # Apparently the tag name is not verified, + # only its existence is checked. + @unittest.expectedFailure + def test_release_hold_too_long_tag(self): + snap = ZFSTest.pool.getRoot().getSnap() + tag = b't' * 256 + lzc.lzc_snapshot([snap]) + + with self.assertRaises(lzc_exc.HoldReleaseFailure): + lzc.lzc_release({snap: [tag]}) + + # Apparently the full snapshot name is not checked for length + # and this snapshot is treated as simply missing. + @unittest.expectedFailure + def test_release_hold_too_long_snap_name(self): + snap = ZFSTest.pool.getRoot().getTooLongSnap(False) + + with self.assertRaises(lzc_exc.HoldReleaseFailure): + lzc.lzc_release({snap: [b'tag']}) + + def test_release_hold_too_long_snap_name_2(self): + snap = ZFSTest.pool.getRoot().getTooLongSnap(True) + with self.assertRaises(lzc_exc.HoldReleaseFailure) as ctx: + lzc.lzc_release({snap: [b'tag']}) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameTooLong) + self.assertEqual(e.name, snap) + + def test_release_hold_invalid_snap_name(self): + snap = ZFSTest.pool.getRoot().getSnap() + b'@bad' + with self.assertRaises(lzc_exc.HoldReleaseFailure) as ctx: + lzc.lzc_release({snap: [b'tag']}) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + self.assertEqual(e.name, snap) + + def test_release_hold_invalid_snap_name_2(self): + snap = ZFSTest.pool.getRoot().getFilesystem().getName() + with self.assertRaises(lzc_exc.HoldReleaseFailure) as ctx: + lzc.lzc_release({snap: [b'tag']}) + for e in ctx.exception.errors: + self.assertIsInstance(e, lzc_exc.NameInvalid) + self.assertEqual(e.name, snap) + + def test_sync_missing_pool(self): + pool = b"nonexistent" + with self.assertRaises(lzc_exc.PoolNotFound): + lzc.lzc_sync(pool) + + def test_sync_pool_forced(self): + pool = ZFSTest.pool.getRoot().getName() + lzc.lzc_sync(pool, True) + + def test_reopen_missing_pool(self): + pool = b"nonexistent" + with self.assertRaises(lzc_exc.PoolNotFound): + lzc.lzc_reopen(pool) + + def test_reopen_pool_no_restart(self): + pool = ZFSTest.pool.getRoot().getName() + lzc.lzc_reopen(pool, False) + + def test_channel_program_missing_pool(self): + pool = b"nonexistent" + with self.assertRaises(lzc_exc.PoolNotFound): + lzc.lzc_channel_program(pool, b"return {}") + + def test_channel_program_timeout(self): + pool = ZFSTest.pool.getRoot().getName() + zcp = b""" +for i = 1,10000 do + zfs.sync.snapshot('""" + pool + b"""@zcp' .. i) +end +""" + with self.assertRaises(lzc_exc.ZCPTimeout): + lzc.lzc_channel_program(pool, zcp, instrlimit=1) + + def test_channel_program_memory_limit(self): + pool = ZFSTest.pool.getRoot().getName() + zcp = b""" +for i = 1,10000 do + zfs.sync.snapshot('""" + pool + b"""@zcp' .. 
i) +end +""" + with self.assertRaises(lzc_exc.ZCPSpaceError): + lzc.lzc_channel_program(pool, zcp, memlimit=1) + + def test_channel_program_invalid_limits(self): + pool = ZFSTest.pool.getRoot().getName() + zcp = b""" +return {} +""" + with self.assertRaises(lzc_exc.ZCPLimitInvalid): + lzc.lzc_channel_program(pool, zcp, instrlimit=0) + with self.assertRaises(lzc_exc.ZCPLimitInvalid): + lzc.lzc_channel_program(pool, zcp, memlimit=0) + + def test_channel_program_syntax_error(self): + pool = ZFSTest.pool.getRoot().getName() + zcp = b""" +inv+val:id +""" + with self.assertRaises(lzc_exc.ZCPSyntaxError) as ctx: + lzc.lzc_channel_program(pool, zcp) + self.assertTrue(b"syntax error" in ctx.exception.details) + + def test_channel_program_sync_snapshot(self): + pool = ZFSTest.pool.getRoot().getName() + snapname = ZFSTest.pool.makeName(b"@zcp") + zcp = b""" +zfs.sync.snapshot('""" + snapname + b"""') +""" + lzc.lzc_channel_program(pool, zcp) + self.assertExists(snapname) + + def test_channel_program_runtime_error(self): + pool = ZFSTest.pool.getRoot().getName() + + # failing an assertion raises a runtime error + with self.assertRaises(lzc_exc.ZCPRuntimeError) as ctx: + lzc.lzc_channel_program(pool, b"assert(1 == 2)") + self.assertTrue( + b"assertion failed" in ctx.exception.details) + # invoking the error() function raises a runtime error + with self.assertRaises(lzc_exc.ZCPRuntimeError) as ctx: + lzc.lzc_channel_program(pool, b"error()") + + def test_channel_program_nosync_runtime_error(self): + pool = ZFSTest.pool.getRoot().getName() + zcp = b""" +zfs.sync.snapshot('""" + pool + b"""@zcp') +""" + # lzc_channel_program_nosync() allows only "read-only" operations + with self.assertRaises(lzc_exc.ZCPRuntimeError) as ctx: + lzc.lzc_channel_program_nosync(pool, zcp) + self.assertTrue( + b"running functions from the zfs.sync" in ctx.exception.details) + + def test_change_key_new(self): + with encrypted_filesystem() as (fs, _): + lzc.lzc_change_key( + fs, 'new_key', + props={b"keyformat": lzc.zfs_keyformat.ZFS_KEYFORMAT_RAW}, + key=os.urandom(lzc.WRAPPING_KEY_LEN)) + + def test_change_key_missing_fs(self): + name = b"nonexistent" + + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_change_key( + name, 'new_key', + props={b"keyformat": lzc.zfs_keyformat.ZFS_KEYFORMAT_RAW}, + key=os.urandom(lzc.WRAPPING_KEY_LEN)) + + def test_change_key_not_loaded(self): + with encrypted_filesystem() as (fs, _): + lzc.lzc_unload_key(fs) + with self.assertRaises(lzc_exc.EncryptionKeyNotLoaded): + lzc.lzc_change_key( + fs, 'new_key', + props={b"keyformat": lzc.zfs_keyformat.ZFS_KEYFORMAT_RAW}, + key=os.urandom(lzc.WRAPPING_KEY_LEN)) + + def test_change_key_invalid_property(self): + with encrypted_filesystem() as (fs, _): + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_change_key(fs, 'new_key', props={b"invalid": b"prop"}) + + def test_change_key_invalid_crypt_command(self): + with encrypted_filesystem() as (fs, _): + with self.assertRaises(lzc_exc.UnknownCryptCommand): + lzc.lzc_change_key(fs, 'duplicate_key') + + def test_load_key(self): + with encrypted_filesystem() as (fs, key): + lzc.lzc_unload_key(fs) + lzc.lzc_load_key(fs, False, key) + + def test_load_key_invalid(self): + with encrypted_filesystem() as (fs, key): + lzc.lzc_unload_key(fs) + with self.assertRaises(lzc_exc.EncryptionKeyInvalid): + lzc.lzc_load_key(fs, False, os.urandom(lzc.WRAPPING_KEY_LEN)) + + def test_load_key_already_loaded(self): + with encrypted_filesystem() as (fs, key): + lzc.lzc_unload_key(fs) + lzc.lzc_load_key(fs, 
False, key) + with self.assertRaises(lzc_exc.EncryptionKeyAlreadyLoaded): + lzc.lzc_load_key(fs, False, key) + + def test_load_key_missing_fs(self): + name = b"nonexistent" + + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_load_key(name, False, key=os.urandom(lzc.WRAPPING_KEY_LEN)) + + def test_unload_key(self): + with encrypted_filesystem() as (fs, _): + lzc.lzc_unload_key(fs) + + def test_unload_key_missing_fs(self): + name = b"nonexistent" + + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_unload_key(name) + + def test_unload_key_busy(self): + with encrypted_filesystem() as (fs, _): + with zfs_mount(fs): + with self.assertRaises(lzc_exc.DatasetBusy): + lzc.lzc_unload_key(fs) + + def test_unload_key_not_loaded(self): + with encrypted_filesystem() as (fs, _): + lzc.lzc_unload_key(fs) + with self.assertRaises(lzc_exc.EncryptionKeyNotLoaded): + lzc.lzc_unload_key(fs) + + def test_checkpoint(self): + pool = ZFSTest.pool.getRoot().getName() + + lzc.lzc_pool_checkpoint(pool) + + def test_checkpoint_missing_pool(self): + pool = b"nonexistent" + + with self.assertRaises(lzc_exc.PoolNotFound): + lzc.lzc_pool_checkpoint(pool) + + def test_checkpoint_already_exists(self): + pool = ZFSTest.pool.getRoot().getName() + + lzc.lzc_pool_checkpoint(pool) + with self.assertRaises(lzc_exc.CheckpointExists): + lzc.lzc_pool_checkpoint(pool) + + def test_checkpoint_discard(self): + pool = ZFSTest.pool.getRoot().getName() + + lzc.lzc_pool_checkpoint(pool) + lzc.lzc_pool_checkpoint_discard(pool) + + def test_checkpoint_discard_missing_pool(self): + pool = b"nonexistent" + + with self.assertRaises(lzc_exc.PoolNotFound): + lzc.lzc_pool_checkpoint_discard(pool) + + def test_checkpoint_discard_missing_checkpoint(self): + pool = ZFSTest.pool.getRoot().getName() + + with self.assertRaises(lzc_exc.CheckpointNotFound): + lzc.lzc_pool_checkpoint_discard(pool) + + @needs_support(lzc.lzc_list_children) + def test_list_children(self): + name = ZFSTest.pool.makeName(b"fs1/fs") + names = [ZFSTest.pool.makeName(b"fs1/fs/test1"), + ZFSTest.pool.makeName(b"fs1/fs/test2"), + ZFSTest.pool.makeName(b"fs1/fs/test3"), ] + # and one snap to see that it is not listed + snap = ZFSTest.pool.makeName(b"fs1/fs@test") + + for fs in names: + lzc.lzc_create(fs) + lzc.lzc_snapshot([snap]) + + children = list(lzc.lzc_list_children(name)) + self.assertItemsEqual(children, names) + + @needs_support(lzc.lzc_list_children) + def test_list_children_nonexistent(self): + fs = ZFSTest.pool.makeName(b"nonexistent") + + with self.assertRaises(lzc_exc.DatasetNotFound): + list(lzc.lzc_list_children(fs)) + + @needs_support(lzc.lzc_list_children) + def test_list_children_of_snap(self): + snap = ZFSTest.pool.makeName(b"@newsnap") + + lzc.lzc_snapshot([snap]) + children = list(lzc.lzc_list_children(snap)) + self.assertEqual(children, []) + + @needs_support(lzc.lzc_list_snaps) + def test_list_snaps(self): + name = ZFSTest.pool.makeName(b"fs1/fs") + names = [ZFSTest.pool.makeName(b"fs1/fs@test1"), + ZFSTest.pool.makeName(b"fs1/fs@test2"), + ZFSTest.pool.makeName(b"fs1/fs@test3"), ] + # and one filesystem to see that it is not listed + fs = ZFSTest.pool.makeName(b"fs1/fs/test") + + for snap in names: + lzc.lzc_snapshot([snap]) + lzc.lzc_create(fs) + + snaps = list(lzc.lzc_list_snaps(name)) + self.assertItemsEqual(snaps, names) + + @needs_support(lzc.lzc_list_snaps) + def test_list_snaps_nonexistent(self): + fs = ZFSTest.pool.makeName(b"nonexistent") + + with self.assertRaises(lzc_exc.DatasetNotFound): + list(lzc.lzc_list_snaps(fs)) + 
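+    # A minimal usage sketch of the two listing helpers exercised here
+    # (illustrative only; assumes a hypothetical pool with a filesystem
+    # b"tank/fs" that has child filesystems and snapshots):
+    #
+    #   import libzfs_core as lzc
+    #   for child in lzc.lzc_list_children(b"tank/fs"):
+    #       print(child)    # e.g. b"tank/fs/test1"
+    #   for snap in lzc.lzc_list_snaps(b"tank/fs"):
+    #       print(snap)     # e.g. b"tank/fs@test1"
+    #
+    # Both iterators raise lzc_exc.DatasetNotFound for a missing dataset,
+    # as the tests above and below assert.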
+ @needs_support(lzc.lzc_list_snaps) + def test_list_snaps_of_snap(self): + snap = ZFSTest.pool.makeName(b"@newsnap") + + lzc.lzc_snapshot([snap]) + snaps = list(lzc.lzc_list_snaps(snap)) + self.assertEqual(snaps, []) + + @needs_support(lzc.lzc_get_props) + def test_get_fs_props(self): + fs = ZFSTest.pool.makeName(b"new") + props = {b"user:foo": b"bar"} + + lzc.lzc_create(fs, props=props) + actual_props = lzc.lzc_get_props(fs) + self.assertDictContainsSubset(props, actual_props) + + @needs_support(lzc.lzc_get_props) + def test_get_fs_props_with_child(self): + parent = ZFSTest.pool.makeName(b"parent") + child = ZFSTest.pool.makeName(b"parent/child") + parent_props = {b"user:foo": b"parent"} + child_props = {b"user:foo": b"child"} + + lzc.lzc_create(parent, props=parent_props) + lzc.lzc_create(child, props=child_props) + actual_parent_props = lzc.lzc_get_props(parent) + actual_child_props = lzc.lzc_get_props(child) + self.assertDictContainsSubset(parent_props, actual_parent_props) + self.assertDictContainsSubset(child_props, actual_child_props) + + @needs_support(lzc.lzc_get_props) + def test_get_snap_props(self): + snapname = ZFSTest.pool.makeName(b"@snap") + snaps = [snapname] + props = {b"user:foo": b"bar"} + + lzc.lzc_snapshot(snaps, props) + actual_props = lzc.lzc_get_props(snapname) + self.assertDictContainsSubset(props, actual_props) + + @needs_support(lzc.lzc_get_props) + def test_get_props_nonexistent(self): + fs = ZFSTest.pool.makeName(b"nonexistent") + + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_get_props(fs) + + @needs_support(lzc.lzc_get_props) + def test_get_mountpoint_none(self): + ''' + If the *mountpoint* property is set to none, then its + value is returned as `bytes` "none". + Also, a child filesystem inherits that value. + ''' + fs = ZFSTest.pool.makeName(b"new") + child = ZFSTest.pool.makeName(b"new/child") + props = {b"mountpoint": b"none"} + + lzc.lzc_create(fs, props=props) + lzc.lzc_create(child) + actual_props = lzc.lzc_get_props(fs) + self.assertDictContainsSubset(props, actual_props) + # check that mountpoint value is correctly inherited + child_props = lzc.lzc_get_props(child) + self.assertDictContainsSubset(props, child_props) + + @needs_support(lzc.lzc_get_props) + def test_get_mountpoint_legacy(self): + ''' + If the *mountpoint* property is set to legacy, then its + value is returned as `bytes` "legacy". + Also, a child filesystem inherits that value. + ''' + fs = ZFSTest.pool.makeName(b"new") + child = ZFSTest.pool.makeName(b"new/child") + props = {b"mountpoint": b"legacy"} + + lzc.lzc_create(fs, props=props) + lzc.lzc_create(child) + actual_props = lzc.lzc_get_props(fs) + self.assertDictContainsSubset(props, actual_props) + # check that mountpoint value is correctly inherited + child_props = lzc.lzc_get_props(child) + self.assertDictContainsSubset(props, child_props) + + @needs_support(lzc.lzc_get_props) + def test_get_mountpoint_path(self): + ''' + If the *mountpoint* property is set to a path and the property + is not explicitly set on a child filesystem, then its + value is that of the parent filesystem with the child's + name appended using the '/' separator. 
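+        For example, a parent with *mountpoint* set to ``/mnt`` and a
+        child filesystem named ``child`` results in the child reporting
+        a *mountpoint* of ``/mnt/child``, which is what this test checks.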
+ ''' + fs = ZFSTest.pool.makeName(b"new") + child = ZFSTest.pool.makeName(b"new/child") + props = {b"mountpoint": b"/mnt"} + + lzc.lzc_create(fs, props=props) + lzc.lzc_create(child) + actual_props = lzc.lzc_get_props(fs) + self.assertDictContainsSubset(props, actual_props) + # check that mountpoint value is correctly inherited + child_props = lzc.lzc_get_props(child) + self.assertDictContainsSubset( + {b"mountpoint": b"/mnt/child"}, child_props) + + @needs_support(lzc.lzc_get_props) + def test_get_snap_clones(self): + fs = ZFSTest.pool.makeName(b"new") + snap = ZFSTest.pool.makeName(b"@snap") + clone1 = ZFSTest.pool.makeName(b"clone1") + clone2 = ZFSTest.pool.makeName(b"clone2") + + lzc.lzc_create(fs) + lzc.lzc_snapshot([snap]) + lzc.lzc_clone(clone1, snap) + lzc.lzc_clone(clone2, snap) + + clones_prop = lzc.lzc_get_props(snap)["clones"] + self.assertItemsEqual(clones_prop, [clone1, clone2]) + + @needs_support(lzc.lzc_rename) + def test_rename(self): + src = ZFSTest.pool.makeName(b"source") + tgt = ZFSTest.pool.makeName(b"target") + + lzc.lzc_create(src) + lzc.lzc_rename(src, tgt) + self.assertNotExists(src) + self.assertExists(tgt) + + @needs_support(lzc.lzc_rename) + def test_rename_nonexistent(self): + src = ZFSTest.pool.makeName(b"source") + tgt = ZFSTest.pool.makeName(b"target") + + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_rename(src, tgt) + + @needs_support(lzc.lzc_rename) + def test_rename_existing_target(self): + src = ZFSTest.pool.makeName(b"source") + tgt = ZFSTest.pool.makeName(b"target") + + lzc.lzc_create(src) + lzc.lzc_create(tgt) + with self.assertRaises(lzc_exc.FilesystemExists): + lzc.lzc_rename(src, tgt) + + @needs_support(lzc.lzc_rename) + def test_rename_nonexistent_target_parent(self): + src = ZFSTest.pool.makeName(b"source") + tgt = ZFSTest.pool.makeName(b"parent/target") + + lzc.lzc_create(src) + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_rename(src, tgt) + + @needs_support(lzc.lzc_rename) + def test_rename_parent_is_zvol(self): + src = ZFSTest.pool.makeName(b"source") + zvol = ZFSTest.pool.makeName(b"parent") + tgt = zvol + b"/target" + props = {b"volsize": 1024 * 1024} + + lzc.lzc_create(src) + lzc.lzc_create(zvol, ds_type='zvol', props=props) + with self.assertRaises(lzc_exc.WrongParent): + lzc.lzc_rename(src, tgt) + + @needs_support(lzc.lzc_destroy) + def test_destroy(self): + fs = ZFSTest.pool.makeName(b"test-fs") + + lzc.lzc_create(fs) + lzc.lzc_destroy(fs) + self.assertNotExists(fs) + + @needs_support(lzc.lzc_destroy) + def test_destroy_nonexistent(self): + fs = ZFSTest.pool.makeName(b"test-fs") + + with self.assertRaises(lzc_exc.FilesystemNotFound): + lzc.lzc_destroy(fs) + + @needs_support(lzc.lzc_inherit_prop) + def test_inherit_prop(self): + parent = ZFSTest.pool.makeName(b"parent") + child = ZFSTest.pool.makeName(b"parent/child") + the_prop = b"user:foo" + parent_props = {the_prop: b"parent"} + child_props = {the_prop: b"child"} + + lzc.lzc_create(parent, props=parent_props) + lzc.lzc_create(child, props=child_props) + lzc.lzc_inherit_prop(child, the_prop) + actual_props = lzc.lzc_get_props(child) + self.assertDictContainsSubset(parent_props, actual_props) + + @needs_support(lzc.lzc_inherit_prop) + def test_inherit_missing_prop(self): + parent = ZFSTest.pool.makeName(b"parent") + child = ZFSTest.pool.makeName(b"parent/child") + the_prop = "user:foo" + child_props = {the_prop: "child"} + + lzc.lzc_create(parent) + lzc.lzc_create(child, props=child_props) + lzc.lzc_inherit_prop(child, the_prop) + actual_props = 
lzc.lzc_get_props(child) + self.assertNotIn(the_prop, actual_props) + + @needs_support(lzc.lzc_inherit_prop) + def test_inherit_readonly_prop(self): + parent = ZFSTest.pool.makeName(b"parent") + child = ZFSTest.pool.makeName(b"parent/child") + the_prop = b"createtxg" + + lzc.lzc_create(parent) + lzc.lzc_create(child) + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_inherit_prop(child, the_prop) + + @needs_support(lzc.lzc_inherit_prop) + def test_inherit_unknown_prop(self): + parent = ZFSTest.pool.makeName(b"parent") + child = ZFSTest.pool.makeName(b"parent/child") + the_prop = b"nosuchprop" + + lzc.lzc_create(parent) + lzc.lzc_create(child) + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_inherit_prop(child, the_prop) + + @needs_support(lzc.lzc_inherit_prop) + def test_inherit_prop_on_snap(self): + fs = ZFSTest.pool.makeName(b"new") + snapname = ZFSTest.pool.makeName(b"new@snap") + prop = b"user:foo" + fs_val = b"fs" + snap_val = b"snap" + + lzc.lzc_create(fs, props={prop: fs_val}) + lzc.lzc_snapshot([snapname], props={prop: snap_val}) + + actual_props = lzc.lzc_get_props(snapname) + self.assertDictContainsSubset({prop: snap_val}, actual_props) + + lzc.lzc_inherit_prop(snapname, prop) + actual_props = lzc.lzc_get_props(snapname) + self.assertDictContainsSubset({prop: fs_val}, actual_props) + + @needs_support(lzc.lzc_set_prop) + def test_set_fs_prop(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"user:foo" + val = b"bar" + + lzc.lzc_create(fs) + lzc.lzc_set_prop(fs, prop, val) + actual_props = lzc.lzc_get_props(fs) + self.assertDictContainsSubset({prop: val}, actual_props) + + @needs_support(lzc.lzc_set_prop) + def test_set_snap_prop(self): + snapname = ZFSTest.pool.makeName(b"@snap") + prop = b"user:foo" + val = b"bar" + + lzc.lzc_snapshot([snapname]) + lzc.lzc_set_prop(snapname, prop, val) + actual_props = lzc.lzc_get_props(snapname) + self.assertDictContainsSubset({prop: val}, actual_props) + + @needs_support(lzc.lzc_set_prop) + def test_set_prop_nonexistent(self): + fs = ZFSTest.pool.makeName(b"nonexistent") + prop = b"user:foo" + val = b"bar" + + with self.assertRaises(lzc_exc.DatasetNotFound): + lzc.lzc_set_prop(fs, prop, val) + + @needs_support(lzc.lzc_set_prop) + def test_set_sys_prop(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"recordsize" + val = 4096 + + lzc.lzc_create(fs) + lzc.lzc_set_prop(fs, prop, val) + actual_props = lzc.lzc_get_props(fs) + self.assertDictContainsSubset({prop: val}, actual_props) + + @needs_support(lzc.lzc_set_prop) + def test_set_invalid_prop(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"nosuchprop" + val = 0 + + lzc.lzc_create(fs) + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_set_prop(fs, prop, val) + + @needs_support(lzc.lzc_set_prop) + def test_set_invalid_value_prop(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"atime" + val = 100 + + lzc.lzc_create(fs) + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_set_prop(fs, prop, val) + + @needs_support(lzc.lzc_set_prop) + def test_set_invalid_value_prop_2(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"readonly" + val = 100 + + lzc.lzc_create(fs) + with self.assertRaises(lzc_exc.PropertyInvalid): + lzc.lzc_set_prop(fs, prop, val) + + @needs_support(lzc.lzc_set_prop) + def test_set_prop_too_small_quota(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"refquota" + val = 1 + + lzc.lzc_create(fs) + with self.assertRaises(lzc_exc.NoSpace): + lzc.lzc_set_prop(fs, prop, val) + + @needs_support(lzc.lzc_set_prop) + def 
test_set_readonly_prop(self): + fs = ZFSTest.pool.makeName(b"new") + prop = b"creation" + val = 0 + + lzc.lzc_create(fs) + lzc.lzc_set_prop(fs, prop, val) + actual_props = lzc.lzc_get_props(fs) + # the change is silently ignored + self.assertTrue(actual_props[prop] != val) + + +class _TempPool(object): + SNAPSHOTS = [b'snap', b'snap1', b'snap2'] + BOOKMARKS = [b'bmark', b'bmark1', b'bmark2'] + + _cachefile_suffix = ".cachefile" + + # XXX Whether to do a sloppy but much faster cleanup + # or a proper but slower one. + _recreate_pools = True + + def __init__(self, size=128 * 1024 * 1024, readonly=False, filesystems=[]): + self._filesystems = filesystems + self._readonly = readonly + if sys.version_info < (3, 0): + self._pool_name = b'pool.' + bytes(uuid.uuid4()) + else: + self._pool_name = b'pool.' + bytes(str(uuid.uuid4()), + encoding='utf-8') + self._root = _Filesystem(self._pool_name) + (fd, self._pool_file_path) = tempfile.mkstemp( + suffix='.zpool', prefix='tmp-') + if readonly: + cachefile = self._pool_file_path + _TempPool._cachefile_suffix + else: + cachefile = 'none' + self._zpool_create = [ + 'zpool', 'create', '-o', 'cachefile=' + cachefile, + '-O', 'mountpoint=legacy', self._pool_name, self._pool_file_path] + try: + os.ftruncate(fd, size) + os.close(fd) + + subprocess.check_output( + self._zpool_create, stderr=subprocess.STDOUT) + + for fs in filesystems: + lzc.lzc_create(self.makeName(fs)) + + self._bmarks_supported = self.isPoolFeatureEnabled('bookmarks') + + if readonly: + # To make a pool read-only it must be exported and re-imported + # with the readonly option. + # The most deterministic way to re-import the pool is by using + # a cache file. + # But the cache file has to be stashed away before the pool is + # exported, because otherwise the pool is removed from the + # cache.
+ shutil.copyfile(cachefile, cachefile + '.tmp') + subprocess.check_output( + ['zpool', 'export', '-f', self._pool_name], + stderr=subprocess.STDOUT) + os.rename(cachefile + '.tmp', cachefile) + subprocess.check_output( + ['zpool', 'import', '-f', '-N', '-c', cachefile, + '-o', 'readonly=on', self._pool_name], + stderr=subprocess.STDOUT) + os.remove(cachefile) + + except subprocess.CalledProcessError as e: + self.cleanUp() + if b'permission denied' in e.output: + raise unittest.SkipTest( + 'insufficient privileges to run libzfs_core tests') + print('command failed: ', e.output) + raise + except Exception: + self.cleanUp() + raise + + def reset(self): + if self._readonly: + return + + if not self.__class__._recreate_pools: + snaps = [] + for fs in [b''] + self._filesystems: + for snap in self.__class__.SNAPSHOTS: + snaps.append(self.makeName(fs + b'@' + snap)) + self.getRoot().visitSnaps(lambda snap: snaps.append(snap)) + lzc.lzc_destroy_snaps(snaps, defer=False) + + if self._bmarks_supported: + bmarks = [] + for fs in [b''] + self._filesystems: + for bmark in self.__class__.BOOKMARKS: + bmarks.append(self.makeName(fs + b'#' + bmark)) + self.getRoot().visitBookmarks( + lambda bmark: bmarks.append(bmark)) + lzc.lzc_destroy_bookmarks(bmarks) + self.getRoot().reset() + return + + # On the Buildbot builders this may fail with "pool is busy". + # Retry 5 times before raising an error. + retry = 0 + while True: + try: + subprocess.check_output( + ['zpool', 'destroy', '-f', self._pool_name], + stderr=subprocess.STDOUT) + subprocess.check_output( + self._zpool_create, stderr=subprocess.STDOUT) + break + except subprocess.CalledProcessError as e: + if b'pool is busy' in e.output and retry < 5: + retry += 1 + time.sleep(1) + continue + else: + print('command failed: ', e.output) + raise + for fs in self._filesystems: + lzc.lzc_create(self.makeName(fs)) + self.getRoot().reset() + + def cleanUp(self): + try: + subprocess.check_output( + ['zpool', 'destroy', '-f', self._pool_name], + stderr=subprocess.STDOUT) + except Exception: + pass + try: + os.remove(self._pool_file_path) + except Exception: + pass + try: + os.remove(self._pool_file_path + _TempPool._cachefile_suffix) + except Exception: + pass + try: + os.remove( + self._pool_file_path + _TempPool._cachefile_suffix + '.tmp') + except Exception: + pass + + def makeName(self, relative=None): + if not relative: + return self._pool_name + if relative.startswith((b'@', b'#')): + return self._pool_name + relative + return self._pool_name + b'/' + relative + + def makeTooLongName(self, prefix=None): + if not prefix: + prefix = b'x' + prefix = self.makeName(prefix) + pad_len = lzc.MAXNAMELEN + 1 - len(prefix) + if pad_len > 0: + return prefix + b'x' * pad_len + else: + return prefix + + def makeTooLongComponent(self, prefix=None): + padding = b'x' * (lzc.MAXNAMELEN + 1) + if not prefix: + prefix = padding + else: + prefix = prefix + padding + return self.makeName(prefix) + + def getRoot(self): + return self._root + + def getFilesystem(self, fsname): + return _Filesystem(self._pool_name + b'/' + fsname) + + def isPoolFeatureAvailable(self, feature): + output = subprocess.check_output( + ['zpool', 'get', '-H', 'feature@' + feature, self._pool_name]) + output = output.strip() + return output != b'' + + def isPoolFeatureEnabled(self, feature): + output = subprocess.check_output( + ['zpool', 'get', '-H', 'feature@' + feature, self._pool_name]) + output = output.split()[2] + return output in [b'active', b'enabled'] + + +class _Filesystem(object): + + def __init__(self, 
name): + self._name = name + self.reset() + + def getName(self): + return self._name + + def reset(self): + self._children = [] + self._fs_id = 0 + self._snap_id = 0 + self._bmark_id = 0 + + def getFilesystem(self): + self._fs_id += 1 + fsname = self._name + b'/fs' + str(self._fs_id).encode() + fs = _Filesystem(fsname) + self._children.append(fs) + return fs + + def getProperty(self, propname, received=False): + if received: + output = subprocess.check_output( + ['zfs', 'get', '-pH', '-o', 'received', propname, self._name]) + else: + output = subprocess.check_output( + ['zfs', 'get', '-pH', '-o', 'value', propname, self._name]) + return output.strip() + + def _makeSnapName(self, i): + return self._name + b'@snap' + str(i).encode() + + def getSnap(self): + self._snap_id += 1 + return self._makeSnapName(self._snap_id) + + def _makeBookmarkName(self, i): + return self._name + b'#bmark' + str(i).encode() + + def getBookmark(self): + self._bmark_id += 1 + return self._makeBookmarkName(self._bmark_id) + + def _makeTooLongName(self, too_long_component): + if too_long_component: + return b'x' * (lzc.MAXNAMELEN + 1) + + # Note that another character is used for one of '/', '@', '#'. + comp_len = lzc.MAXNAMELEN - len(self._name) + if comp_len > 0: + return b'x' * comp_len + else: + return b'x' + + def getTooLongFilesystemName(self, too_long_component): + return self._name + b'/' + self._makeTooLongName(too_long_component) + + def getTooLongSnap(self, too_long_component): + return self._name + b'@' + self._makeTooLongName(too_long_component) + + def getTooLongBookmark(self, too_long_component): + return self._name + b'#' + self._makeTooLongName(too_long_component) + + def _visitFilesystems(self, visitor): + for child in self._children: + child._visitFilesystems(visitor) + visitor(self) + + def visitFilesystems(self, visitor): + def _fsVisitor(fs): + visitor(fs._name) + + self._visitFilesystems(_fsVisitor) + + def visitSnaps(self, visitor): + def _snapVisitor(fs): + for i in range(1, fs._snap_id + 1): + visitor(fs._makeSnapName(i)) + + self._visitFilesystems(_snapVisitor) + + def visitBookmarks(self, visitor): + def _bmarkVisitor(fs): + for i in range(1, fs._bmark_id + 1): + visitor(fs._makeBookmarkName(i)) + + self._visitFilesystems(_bmarkVisitor) + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/libzfs_core/test/test_nvlist.py b/contrib/pyzfs/libzfs_core/test/test_nvlist.py new file mode 100644 index 000000000000..c3c61142da50 --- /dev/null +++ b/contrib/pyzfs/libzfs_core/test/test_nvlist.py @@ -0,0 +1,636 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +""" +Tests for _nvlist module. +The tests convert from a `dict` to C ``nvlist_t`` and back to a `dict` +and verify that no information is lost and value types are correct. +The tests also check that various error conditions like unsupported +value types or out of bounds values are detected.
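+
+For example, the round trip used throughout this module boils down to
+the following (sketch only; ``nvlist_in``, ``nvlist_out`` and ``_lib``
+are the names imported below):
+
+    props = {b"key": 1, b"nested": {b"flag": True}}
+    out = {}
+    with nvlist_out(out) as nv_out:
+        _lib.nvlist_dup(nvlist_in(props), nv_out, 0)
+    assert out == props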
+""" +from __future__ import absolute_import, division, print_function + +import unittest + +from .._nvlist import nvlist_in, nvlist_out, _lib +from ..ctypes import ( + uint8_t, int8_t, uint16_t, int16_t, uint32_t, int32_t, + uint64_t, int64_t, boolean_t, uchar_t +) + + +class TestNVList(unittest.TestCase): + + def _dict_to_nvlist_to_dict(self, props): + res = {} + nv_in = nvlist_in(props) + with nvlist_out(res) as nv_out: + _lib.nvlist_dup(nv_in, nv_out, 0) + return res + + def _assertIntDictsEqual(self, dict1, dict2): + self.assertEqual( + len(dict1), len(dict1), + b"resulting dictionary is of different size") + for key in dict1.keys(): + self.assertEqual(int(dict1[key]), int(dict2[key])) + + def _assertIntArrayDictsEqual(self, dict1, dict2): + self.assertEqual( + len(dict1), len(dict1), + b"resulting dictionary is of different size") + for key in dict1.keys(): + val1 = dict1[key] + val2 = dict2[key] + self.assertEqual( + len(val1), len(val2), b"array values of different sizes") + for x, y in zip(val1, val2): + self.assertEqual(int(x), int(y)) + + def test_empty(self): + res = self._dict_to_nvlist_to_dict({}) + self.assertEqual(len(res), 0, b"expected empty dict") + + def test_invalid_key_type(self): + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict({1: None}) + + def test_invalid_val_type__tuple(self): + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict({b"key": (1, 2)}) + + def test_invalid_val_type__set(self): + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict({b"key": set(1, 2)}) + + def test_invalid_array_val_type(self): + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict({b"key": [(1, 2), (3, 4)]}) + + def test_invalid_array_of_arrays_val_type(self): + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict({b"key": [[1, 2], [3, 4]]}) + + def test_string_value(self): + props = {b"key": b"value"} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_implicit_boolean_value(self): + props = {b"key": None} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_boolean_values(self): + props = {b"key1": True, b"key2": False} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_explicit_boolean_true_value(self): + props = {b"key": boolean_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_boolean_false_value(self): + props = {b"key": boolean_t(0)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_boolean_invalid_value(self): + with self.assertRaises(OverflowError): + props = {b"key": boolean_t(2)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_boolean_another_invalid_value(self): + with self.assertRaises(OverflowError): + props = {b"key": boolean_t(-1)} + self._dict_to_nvlist_to_dict(props) + + def test_uint64_value(self): + props = {b"key": 1} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_uint64_max_value(self): + props = {b"key": 2 ** 64 - 1} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_uint64_too_large_value(self): + props = {b"key": 2 ** 64} + with self.assertRaises(OverflowError): + self._dict_to_nvlist_to_dict(props) + + def test_uint64_negative_value(self): + props = {b"key": -1} + with self.assertRaises(OverflowError): + self._dict_to_nvlist_to_dict(props) + + def 
test_explicit_uint64_value(self): + props = {b"key": uint64_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint64_max_value(self): + props = {b"key": uint64_t(2 ** 64 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint64_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint64_t(2 ** 64)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint64_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint64_t(-1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint32_value(self): + props = {b"key": uint32_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint32_max_value(self): + props = {b"key": uint32_t(2 ** 32 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint32_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint32_t(2 ** 32)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint32_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint32_t(-1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint16_value(self): + props = {b"key": uint16_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint16_max_value(self): + props = {b"key": uint16_t(2 ** 16 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint16_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint16_t(2 ** 16)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint16_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint16_t(-1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint8_value(self): + props = {b"key": uint8_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint8_max_value(self): + props = {b"key": uint8_t(2 ** 8 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_uint8_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint8_t(2 ** 8)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_uint8_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uint8_t(-1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_byte_value(self): + props = {b"key": uchar_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_byte_max_value(self): + props = {b"key": uchar_t(2 ** 8 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_byte_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uchar_t(2 ** 8)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_byte_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": uchar_t(-1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int64_value(self): + props = {b"key": int64_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int64_max_value(self): + props = {b"key": int64_t(2 ** 63 - 1)} + res = 
self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int64_min_value(self): + props = {b"key": int64_t(-(2 ** 63))} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int64_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int64_t(2 ** 63)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int64_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int64_t(-(2 ** 63) - 1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int32_value(self): + props = {b"key": int32_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int32_max_value(self): + props = {b"key": int32_t(2 ** 31 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int32_min_value(self): + props = {b"key": int32_t(-(2 ** 31))} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int32_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int32_t(2 ** 31)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int32_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int32_t(-(2 ** 31) - 1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int16_value(self): + props = {b"key": int16_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int16_max_value(self): + props = {b"key": int16_t(2 ** 15 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int16_min_value(self): + props = {b"key": int16_t(-(2 ** 15))} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int16_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int16_t(2 ** 15)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int16_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int16_t(-(2 ** 15) - 1)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int8_value(self): + props = {b"key": int8_t(1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int8_max_value(self): + props = {b"key": int8_t(2 ** 7 - 1)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int8_min_value(self): + props = {b"key": int8_t(-(2 ** 7))} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_explicit_int8_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int8_t(2 ** 7)} + self._dict_to_nvlist_to_dict(props) + + def test_explicit_int8_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": int8_t(-(2 ** 7) - 1)} + self._dict_to_nvlist_to_dict(props) + + def test_nested_dict(self): + props = {b"key": {}} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_nested_nested_dict(self): + props = {b"key": {b"key": {}}} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_mismatching_values_array(self): + props = {b"key": [1, b"string"]} + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict(props) + + def 
test_mismatching_values_array2(self): + props = {b"key": [True, 10]} + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict(props) + + def test_mismatching_values_array3(self): + props = {b"key": [1, False]} + with self.assertRaises(TypeError): + self._dict_to_nvlist_to_dict(props) + + def test_string_array(self): + props = {b"key": [b"value", b"value2"]} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_boolean_array(self): + props = {b"key": [True, False]} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_explicit_boolean_array(self): + props = {b"key": [boolean_t(False), boolean_t(True)]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_uint64_array(self): + props = {b"key": [0, 1, 2 ** 64 - 1]} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_uint64_array_too_large_value(self): + props = {b"key": [0, 2 ** 64]} + with self.assertRaises(OverflowError): + self._dict_to_nvlist_to_dict(props) + + def test_uint64_array_negative_value(self): + props = {b"key": [0, -1]} + with self.assertRaises(OverflowError): + self._dict_to_nvlist_to_dict(props) + + def test_mixed_explict_int_array(self): + with self.assertRaises(TypeError): + props = {b"key": [uint64_t(0), uint32_t(0)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint64_array(self): + props = {b"key": [uint64_t(0), uint64_t(1), uint64_t(2 ** 64 - 1)]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_uint64_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint64_t(0), uint64_t(2 ** 64)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint64_array_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint64_t(0), uint64_t(-1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint32_array(self): + props = {b"key": [uint32_t(0), uint32_t(1), uint32_t(2 ** 32 - 1)]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_uint32_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint32_t(0), uint32_t(2 ** 32)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint32_array_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint32_t(0), uint32_t(-1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint16_array(self): + props = {b"key": [uint16_t(0), uint16_t(1), uint16_t(2 ** 16 - 1)]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_uint16_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint16_t(0), uint16_t(2 ** 16)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint16_array_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint16_t(0), uint16_t(-1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_uint8_array(self): + props = {b"key": [uint8_t(0), uint8_t(1), uint8_t(2 ** 8 - 1)]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_uint8_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint8_t(0), uint8_t(2 ** 8)]} + self._dict_to_nvlist_to_dict(props) + + def 
test_explict_uint8_array_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uint8_t(0), uint8_t(-1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_byte_array(self): + props = {b"key": [uchar_t(0), uchar_t(1), uchar_t(2 ** 8 - 1)]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_byte_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uchar_t(0), uchar_t(2 ** 8)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_byte_array_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [uchar_t(0), uchar_t(-1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int64_array(self): + props = {b"key": [ + int64_t(0), int64_t(1), int64_t(2 ** 63 - 1), int64_t(-(2 ** 63))]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_int64_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int64_t(0), int64_t(2 ** 63)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int64_array_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int64_t(0), int64_t(-(2 ** 63) - 1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int32_array(self): + props = {b"key": [ + int32_t(0), int32_t(1), int32_t(2 ** 31 - 1), int32_t(-(2 ** 31))]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_int32_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int32_t(0), int32_t(2 ** 31)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int32_array_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int32_t(0), int32_t(-(2 ** 31) - 1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int16_array(self): + props = {b"key": [ + int16_t(0), int16_t(1), int16_t(2 ** 15 - 1), int16_t(-(2 ** 15))]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_int16_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int16_t(0), int16_t(2 ** 15)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int16_array_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int16_t(0), int16_t(-(2 ** 15) - 1)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int8_array(self): + props = {b"key": [ + int8_t(0), int8_t(1), int8_t(2 ** 7 - 1), int8_t(-(2 ** 7))]} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntArrayDictsEqual(props, res) + + def test_explict_int8_array_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int8_t(0), int8_t(2 ** 7)]} + self._dict_to_nvlist_to_dict(props) + + def test_explict_int8_array_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"key": [int8_t(0), int8_t(-(2 ** 7) - 1)]} + self._dict_to_nvlist_to_dict(props) + + def test_dict_array(self): + props = {b"key": [{b"key": 1}, {b"key": None}, {b"key": {}}]} + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + def test_implicit_uint32_value(self): + props = {b"rewind-request": 1} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_implicit_uint32_max_value(self): + props = {b"rewind-request": 2 ** 32 - 1} + res = 
self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_implicit_uint32_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"rewind-request": 2 ** 32} + self._dict_to_nvlist_to_dict(props) + + def test_implicit_uint32_negative_value(self): + with self.assertRaises(OverflowError): + props = {b"rewind-request": -1} + self._dict_to_nvlist_to_dict(props) + + def test_implicit_int32_value(self): + props = {b"pool_context": 1} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_implicit_int32_max_value(self): + props = {b"pool_context": 2 ** 31 - 1} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_implicit_int32_min_value(self): + props = {b"pool_context": -(2 ** 31)} + res = self._dict_to_nvlist_to_dict(props) + self._assertIntDictsEqual(props, res) + + def test_implicit_int32_too_large_value(self): + with self.assertRaises(OverflowError): + props = {b"pool_context": 2 ** 31} + self._dict_to_nvlist_to_dict(props) + + def test_implicit_int32_too_small_value(self): + with self.assertRaises(OverflowError): + props = {b"pool_context": -(2 ** 31) - 1} + self._dict_to_nvlist_to_dict(props) + + def test_complex_dict(self): + props = { + b"key1": b"str", + b"key2": 10, + b"key3": { + b"skey1": True, + b"skey2": None, + b"skey3": [ + True, + False, + True + ] + }, + b"key4": [ + b"ab", + b"bc" + ], + b"key5": [ + 2 ** 64 - 1, + 1, + 2, + 3 + ], + b"key6": [ + { + b"skey71": b"a", + b"skey72": b"b", + }, + { + b"skey71": b"c", + b"skey72": b"d", + }, + { + b"skey71": b"e", + b"skey72": b"f", + } + + ], + b"type": 2 ** 32 - 1, + b"pool_context": -(2 ** 31) + } + res = self._dict_to_nvlist_to_dict(props) + self.assertEqual(props, res) + + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/pyzfs/setup.py.in b/contrib/pyzfs/setup.py.in new file mode 100644 index 000000000000..bd8ffc728fa6 --- /dev/null +++ b/contrib/pyzfs/setup.py.in @@ -0,0 +1,61 @@ +# +# Copyright 2015 ClusterHQ +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +from __future__ import absolute_import, division, print_function + +from setuptools import setup, find_packages + +setup( + name="pyzfs", + version="@VERSION@", + description="Wrapper for libzfs_core", + author="ClusterHQ", + author_email="support@clusterhq.com", + url="http://pyzfs.readthedocs.org", + license="Apache License, Version 2.0", + classifiers=[ + "Development Status :: 4 - Beta", + "Intended Audience :: Developers", + "License :: OSI Approved :: Apache Software License", + "Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.7", + "Programming Language :: Python :: 3", + "Programming Language :: Python :: 3.4", + "Programming Language :: Python :: 3.5", + "Programming Language :: Python :: 3.6", + "Programming Language :: Python :: 3.7", + "Topic :: System :: Filesystems", + "Topic :: Software Development :: Libraries", + ], + keywords=[ + "ZFS", + "OpenZFS", + "libzfs_core", + ], + + packages=find_packages(), + include_package_data=True, + install_requires=[ + "cffi", + ], + setup_requires=[ + "cffi", + ], + python_requires='>=2.7,!=3.0.*,!=3.1.*,!=3.2.*,!=3.3.*,<4', + zip_safe=False, + test_suite="libzfs_core.test", +) + +# vim: softtabstop=4 tabstop=4 expandtab shiftwidth=4 diff --git a/contrib/zcp/Makefile.am b/contrib/zcp/Makefile.am new file mode 100644 index 000000000000..e6a777ad7ba7 --- /dev/null +++ b/contrib/zcp/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = autosnap.lua diff --git a/contrib/zcp/autosnap.lua b/contrib/zcp/autosnap.lua new file mode 100644 index 000000000000..d9ae32ce458a --- /dev/null +++ b/contrib/zcp/autosnap.lua @@ -0,0 +1,75 @@ +-- Recursively snapshot every dataset with a given property +-- +-- Usage: zfs program <pool> autosnap.lua -- [-n] [-p <property>] <snapshot> + +results = {} + +args = ... +argv = args["argv"] +usage = [[ + + +usage: zfs program <pool> autosnap.lua -- [-n] [-p <property>] <snapshot> + + -n: performs checks only, does not take snapshots + -p <property>: property to check. [default: com.sun:auto-snapshot] + <snapshot>: root snapshot to create [example: tank/data@backup] +]] + +property = "com.sun:auto-snapshot" +noop = false +root_snap = nil + +for i, arg in ipairs(argv) do + if arg == "-n" then + noop = true + elseif arg == "-p" then -- the property name is consumed by the next branch + elseif argv[i-1] == "-p" then + property = arg + else + root_snap = arg + end +end + +if root_snap == nil or property == nil then + error(usage) +end + +root_ds_name = "" +snap_name = "" +for i = 1, #root_snap do + if root_snap:sub(i, i) == "@" then + root_ds_name = root_snap:sub(1, i-1) + snap_name = root_snap:sub(i+1, root_snap:len()) + end +end + +function auto_snap(root) + auto, source = zfs.get_prop(root, property) + if auto == "true" then + ds_snap_name = root .. "@" .. snap_name + err = 0 + if noop then + err = zfs.check.snapshot(ds_snap_name) + else + err = zfs.sync.snapshot(ds_snap_name) + end + results[ds_snap_name] = err + end + for child in zfs.list.children(root) do + auto_snap(child) + end +end + +auto_snap(root_ds_name) +err_txt = "" +for ds, err in pairs(results) do + if err ~= 0 then + err_txt = err_txt .. "failed to create " .. ds .. ": " .. err .. "\n" + end +end +if err_txt ~= "" then + error(err_txt) +end + +return results diff --git a/copy-builtin b/copy-builtin new file mode 100755 index 000000000000..f42f4d1a4828 --- /dev/null +++ b/copy-builtin @@ -0,0 +1,79 @@ +#!/usr/bin/env bash + +set -e + +usage() +{ + echo "usage: $0 <kernel source tree>" >&2 + exit 1 +} + +[ "$#" -eq 1 ] || usage +KERNEL_DIR="$(readlink --canonicalize-existing "$1")" + +if ! 
[ -e 'zfs_config.h' ] +then + echo >&2 + echo " $0: you did not run configure, or you're not in the ZFS source directory." >&2 + echo " $0: run configure with --with-linux=$KERNEL_DIR and --enable-linux-builtin." >&2 + echo >&2 + exit 1 +fi + +make clean || true +make gitrev + +rm -rf "$KERNEL_DIR/include/zfs" "$KERNEL_DIR/fs/zfs" +cp --recursive include "$KERNEL_DIR/include/zfs" +cp --recursive module "$KERNEL_DIR/fs/zfs" +cp zfs_config.h "$KERNEL_DIR/include/zfs/" + +cat > "$KERNEL_DIR/fs/zfs/Kconfig" <<"EOF" +config ZFS + tristate "ZFS filesystem support" + depends on EFI_PARTITION + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + This is the ZFS filesystem from the ZFS On Linux project. + + See https://zfsonlinux.org/ + + To compile this file system support as a module, choose M here. + + If unsure, say N. +EOF + +add_after() +{ + local FILE="$1" + local MARKER="$2" + local NEW="$3" + local LINE + + while IFS='' read -r LINE + do + echo "$LINE" + + if [ -n "$MARKER" -a "$LINE" = "$MARKER" ] + then + echo "$NEW" + MARKER='' + if IFS='' read -r LINE + then + [ "$LINE" != "$NEW" ] && echo "$LINE" + fi + fi + done < "$FILE" > "$FILE.new" + + mv "$FILE.new" "$FILE" +} + +add_after "$KERNEL_DIR/fs/Kconfig" 'if BLOCK' 'source "fs/zfs/Kconfig"' +add_after "$KERNEL_DIR/fs/Makefile" 'endif' 'obj-$(CONFIG_ZFS) += zfs/' + +echo >&2 +echo " $0: done." >&2 +echo " $0: now you can build the kernel with ZFS support." >&2 +echo " $0: make sure you enable ZFS support (CONFIG_ZFS) before building." >&2 +echo >&2 diff --git a/cppcheck-suppressions.txt b/cppcheck-suppressions.txt new file mode 100644 index 000000000000..8a0a1b830e3b --- /dev/null +++ b/cppcheck-suppressions.txt @@ -0,0 +1,8 @@ +preprocessorErrorDirective:./module/zfs/vdev_raidz_math_avx512f.c:243 +preprocessorErrorDirective:./module/zfs/vdev_raidz_math_sse2.c:266 +uninitvar:module/os/freebsd/zfs/vdev_geom.c +uninitvar:module/os/freebsd/zfs/zfs_vfsops.c +uninitvar:module/os/freebsd/spl/spl_zone.c +uninitvar:lib/libzutil/os/freebsd/zutil_import_os.c +*:module/zstd/lib/zstd.c +*:module/zstd/lib/zstd.h diff --git a/etc/Makefile.am b/etc/Makefile.am new file mode 100644 index 000000000000..ac71da9445d8 --- /dev/null +++ b/etc/Makefile.am @@ -0,0 +1,5 @@ +SUBDIRS = zfs sudoers.d +if BUILD_LINUX +SUBDIRS += default $(ZFS_INIT_SYSTEMD) $(ZFS_INIT_SYSV) $(ZFS_MODULE_LOAD) +endif +DIST_SUBDIRS = default init.d zfs systemd modules-load.d sudoers.d diff --git a/etc/default/.gitignore b/etc/default/.gitignore new file mode 100644 index 000000000000..73304bc2cd4a --- /dev/null +++ b/etc/default/.gitignore @@ -0,0 +1 @@ +zfs diff --git a/etc/default/Makefile.am b/etc/default/Makefile.am new file mode 100644 index 000000000000..0ec868e13484 --- /dev/null +++ b/etc/default/Makefile.am @@ -0,0 +1,5 @@ +include $(top_srcdir)/config/Substfiles.am + +initconf_SCRIPTS = zfs + +SUBSTFILES += $(initconf_SCRIPTS) diff --git a/etc/default/zfs.in b/etc/default/zfs.in new file mode 100644 index 000000000000..3b6e5486dd33 --- /dev/null +++ b/etc/default/zfs.in @@ -0,0 +1,103 @@ +# ZoL userland configuration. + +# NOTE: This file is intended for sysv init and initramfs. +# Changing some of these settings may not make any difference on +# systemd-based setup, e.g. setting ZFS_MOUNT=no will not prevent systemd +# from launching zfs-mount.service during boot. +# See: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=901436 + +# To enable a boolean setting, set it to yes, on, true, or 1. +# Anything else will be interpreted as unset. 
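+#
+# (Illustrative sketch only: a script consuming this file would
+# typically normalize such a flag along these lines; the shipped
+# zfs-functions helpers perform the actual check.
+#     case "$ZFS_MOUNT" in
+#         [Yy][Ee][Ss]|[Oo][Nn]|[Tt][Rr][Uu][Ee]|1) do_mount=1 ;;
+#         *) do_mount=0 ;;
+#     esac
+# Here "do_mount" is a hypothetical variable, not one read by the scripts.)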
+ +# Run `zfs mount -a` during system start? +ZFS_MOUNT='yes' + +# Run `zfs unmount -a` during system stop? +ZFS_UNMOUNT='yes' + +# Run `zfs share -a` during system start? +# nb: This honors the shareiscsi, sharenfs, and sharesmb dataset properties. +ZFS_SHARE='yes' + +# Run `zfs unshare -a` during system stop? +ZFS_UNSHARE='yes' + +# By default, a verbatim import of all pools is performed at boot based on the +# contents of the default zpool cache file. The contents of the cache are +# managed automatically by the 'zpool import' and 'zpool export' commands. +# +# By setting this to 'yes', the system will instead search all devices for +# pools and attempt to import them all at boot, even those that have been +# exported. Under this mode, the search path can be controlled by the +# ZPOOL_IMPORT_PATH variable and a list of pools that should not be imported +# can be listed in the ZFS_POOL_EXCEPTIONS variable. +# +# Note that importing all visible pools may include pools that you don't +# expect, such as those on removable devices and SANs, and those pools may +# proceed to mount themselves in places you do not want them to. The results +# can be unpredictable and possibly dangerous. Only enable this option if you +# understand this risk and have complete physical control over your system and +# SAN to prevent the insertion of malicious pools. +ZPOOL_IMPORT_ALL_VISIBLE='no' + +# Specify the path(s) to look for device nodes and/or links for the +# pool import(s). See zpool(8) for more information about this variable. +# It supersedes the old USE_DISK_BY_ID which indicated that it would only +# try '/dev/disk/by-id'. +# The old variable will still work in the code, but is deprecated. +#ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:/dev/disk/by-id" + +# List of pools that should NOT be imported at boot +# when ZPOOL_IMPORT_ALL_VISIBLE is 'yes'. +# This is a space separated list. +#ZFS_POOL_EXCEPTIONS="test2" + +# Should the datasets be mounted verbosely? +# A mount counter will be used when mounting if set to 'yes'. +VERBOSE_MOUNT='no' + +# Should we allow overlay mounts? +# This is standard in Linux, but not in ZFS, which comes from Solaris where +# it is not allowed. +DO_OVERLAY_MOUNTS='no' + +# Any additional option to the 'zpool import' command line? +# Include '-o' for each option wanted. +# You don't need to put '-f' in here, unless you want it ALL the time. +# Using the option 'zfsforce=1' on the grub/kernel command line will +# do the same, but on a case-by-case basis. +ZPOOL_IMPORT_OPTS="" + +# Full path to the ZFS cache file? +# See "cachefile" in zpool(8). +# The default is "@sysconfdir@/zfs/zpool.cache". +#ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache" +# +# Setting ZPOOL_CACHE to an empty string ('') AND setting ZPOOL_IMPORT_OPTS to +# "-c @sysconfdir@/zfs/zpool.cache" will _enforce_ the use of a cache file. +# This is needed in some cases (very large numbers of VDEVs, multipath, etc.). +# Generally, the use of a cache file is not recommended on Linux +# because it sometimes is more trouble than it's worth (laptops with external +# devices or when/if device nodes change names). +#ZPOOL_IMPORT_OPTS="-c @sysconfdir@/zfs/zpool.cache" +#ZPOOL_CACHE="" + +# Any additional option to the 'zfs mount' command line? +# Include '-o' for each option wanted. +MOUNT_EXTRA_OPTIONS="" + +# Build kernel modules with the --enable-debug switch? +# Only applicable for Debian GNU/Linux {dkms,initramfs}. +ZFS_DKMS_ENABLE_DEBUG='no' + +# Build kernel modules with the --enable-debuginfo switch? 
+# Only applicable for Debian GNU/Linux {dkms,initramfs}.
+ZFS_DKMS_ENABLE_DEBUGINFO='no'
+
+# Keep debugging symbols in kernel modules?
+# Only applicable for Debian GNU/Linux {dkms,initramfs}.
+ZFS_DKMS_DISABLE_STRIP='no'
+
+# Optional arguments for the ZFS Event Daemon (ZED).
+# See zed(8) for more information on available options.
+#ZED_ARGS="-M"
diff --git a/etc/init.d/.gitignore b/etc/init.d/.gitignore
new file mode 100644
index 000000000000..43a673d55343
--- /dev/null
+++ b/etc/init.d/.gitignore
@@ -0,0 +1,5 @@
+zfs-import
+zfs-mount
+zfs-share
+zfs-zed
+zfs
diff --git a/etc/init.d/Makefile.am b/etc/init.d/Makefile.am
new file mode 100644
index 000000000000..9285a995a1cf
--- /dev/null
+++ b/etc/init.d/Makefile.am
@@ -0,0 +1,7 @@
+include $(top_srcdir)/config/Substfiles.am
+
+EXTRA_DIST += README.md
+
+init_SCRIPTS = zfs-import zfs-mount zfs-share zfs-zed
+
+SUBSTFILES += $(init_SCRIPTS)
diff --git a/etc/init.d/README.md b/etc/init.d/README.md
new file mode 100644
index 000000000000..ad7c053aacab
--- /dev/null
+++ b/etc/init.d/README.md
@@ -0,0 +1,72 @@
+DESCRIPTION
+  These scripts were written with the primary intention of being portable and
+  usable on as many systems as possible.
+
+  This is, in practice, usually not possible. But the intention is there.
+  And it is a good one.
+
+  They have been tested successfully on:
+
+    * Debian GNU/Linux Wheezy
+    * Debian GNU/Linux Jessie
+    * Ubuntu Trusty
+    * CentOS 6.0
+    * CentOS 6.6
+    * Gentoo
+
+SUPPORT
+  If you find that they don't work for your platform, please report this
+  to the ZFS On Linux issue tracker at https://github.com/zfsonlinux/zfs/issues.
+
+  Please include:
+
+    * Distribution name
+    * Distribution version
+    * Where to find an install CD image
+    * Architecture
+
+  If you have code to share that fixes the problem, that is much better.
+  But please remember to try your best to keep portability in mind. If you
+  suspect that what you're writing/modifying won't work on anything other
+  than your distribution, please make sure to wrap that code in appropriate
+  if/else/fi blocks.
+
+  It currently MUST be bash (or fully compatible) for this to work.
+
+  If you're making your own distribution and you want the scripts to
+  work on that, the biggest problem you'll (probably) have is the part
+  at the beginning of the "zfs-functions" file which sets up the
+  logging output.
+
+INSTALLING INIT SCRIPT LINKS
+  To set up the init script links in /etc/rc?.d manually on a Debian GNU/Linux
+  (or derived) system, run the following commands (the order is important!):
+
+    update-rc.d zfs-import start 07 S . stop 07 0 1 6 .
+    update-rc.d zfs-mount start 02 2 3 4 5 . stop 06 0 1 6 .
+    update-rc.d zfs-zed start 07 2 3 4 5 . stop 08 0 1 6 .
+    update-rc.d zfs-share start 27 2 3 4 5 . stop 05 0 1 6 .
+
+  To do the same on RedHat, Fedora and/or CentOS:
+
+    chkconfig zfs-import
+    chkconfig zfs-mount
+    chkconfig zfs-zed
+    chkconfig zfs-share
+
+  On Gentoo:
+
+    rc-update add zfs-import boot
+    rc-update add zfs-mount boot
+    rc-update add zfs-zed default
+    rc-update add zfs-share default
+
+  The idea here is to make sure all of the ZFS filesystems, including possibly
+  separate datasets like /var, are mounted before anything else is started.
+
+  Then, ZED, which depends on /var, can be started. It will consume and act
+  on events that occurred before it started. ZED may also play a role in
+  sharing filesystems in the future, so it is important to start it before
+  the 'share' service.
+
+  Finally, we share filesystems configured with the share\* property.
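+
+  As a rough sanity check after installing the links (a Debian-style rc
+  layout is assumed here; the runlevel directories differ between
+  distributions), listing them should show zfs-import ordered before
+  zfs-mount, and zfs-share last:
+
+    ls /etc/rcS.d/ /etc/rc2.d/ | grep zfs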
diff --git a/etc/init.d/zfs-import.in b/etc/init.d/zfs-import.in
new file mode 100755
index 000000000000..714cc6c089d7
--- /dev/null
+++ b/etc/init.d/zfs-import.in
@@ -0,0 +1,340 @@
+#!@DEFAULT_INIT_SHELL@
+#
+# zfs-import    This script will import ZFS pools
+#
+# chkconfig: 2345 01 99
+# description: This script will perform a verbatim import of ZFS pools
+#              during system boot.
+# probe: true
+#
+### BEGIN INIT INFO
+# Provides:          zfs-import
+# Required-Start:    mtab
+# Required-Stop:     $local_fs mtab
+# Default-Start:     S
+# Default-Stop:      0 1 6
+# X-Start-Before:    checkfs
+# X-Stop-After:      zfs-mount
+# Short-Description: Import ZFS pools
+# Description: Run the `zpool import` command.
+### END INIT INFO
+#
+# NOTE: Not having '$local_fs' on Required-Start but only on Required-Stop
+# is on purpose. If we have '$local_fs' in both (and X-Start-Before=checkfs)
+# we get conflicts - import needs to be started extremely early,
+# but not stopped too late.
+#
+# Released under the 2-clause BSD license.
+#
+# The original script that acted as a template for this script came from
+# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a
+# licensing stanza) in the commit dated Mar 24, 2011:
+# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a

# Source the common init script
. @sysconfdir@/zfs/zfs-functions

+# ----------------------------------------------------
+
+do_depend()
+{
+	before swap
+	after sysfs udev
+	keyword -lxc -openvz -prefix -vserver
+}
+
+# Use the zpool cache file to import pools
+do_verbatim_import()
+{
+	if [ -f "$ZPOOL_CACHE" ]
+	then
+		zfs_action "Importing ZFS pool(s)" \
+		    "$ZPOOL" import -c "$ZPOOL_CACHE" -N -a
+	fi
+}
+
+# Support function to get a list of all pools, separated by ';'
+find_pools()
+{
+	local CMD="$*"
+	local pools
+
+	pools=$($CMD 2> /dev/null | \
+		grep -E "pool:|^[a-zA-Z0-9]" | \
+		sed 's@.*: @@' | \
+		sort | \
+		while read pool; do \
+			echo -n "$pool;"
+		done)
+
+	echo "${pools%%;}" # Return without the last ';'.
+}
+
+# Find and import all visible pools, even exported ones
+do_import_all_visible()
+{
+	local already_imported available_pools pool npools
+	local exception dir ZPOOL_IMPORT_PATH RET=0 r=1
+
+	# In case we were not shut down cleanly.
+	[ -n "$init" ] && rm -f /etc/dfs/sharetab
+
+	# Just to simplify the code later on.
+	if [ -n "$USE_DISK_BY_ID" -a "$USE_DISK_BY_ID" != 'yes' ]
+	then
+		# It's something, but not 'yes', so it's no good to us.
+		unset USE_DISK_BY_ID
+	fi
+
+	# Find the list of already imported pools.
+	already_imported=$(find_pools "$ZPOOL" list -H -oname)
+	available_pools=$(find_pools "$ZPOOL" import)
+
+	# Just in case - we have seen it happen that a pool isn't visible/found
+	# with a simple "zpool import", but only when using the "-d"
+	# option or setting ZPOOL_IMPORT_PATH.
+	if [ -d "/dev/disk/by-id" ]
+	then
+		npools=$(find_pools "$ZPOOL" import -d /dev/disk/by-id)
+		if [ -n "$npools" ]
+		then
+			# Because we have found extra pool(s) here, which weren't
+			# found 'normally', we need to force USE_DISK_BY_ID to
+			# make sure we're able to actually import it/them later.
+			USE_DISK_BY_ID='yes'
+
+			if [ -n "$available_pools" ]
+			then
+				# Filter out duplicates (pools found with the simple
+				# "zpool import" but which are also found with
+				# "zpool import -d ...").
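+				# (The sed expression below removes the first match of
+				# the whole $available_pools string, treated as a regex,
+				# from $npools - a best-effort de-duplication rather
+				# than exact per-pool matching.)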
+ npools=$(echo "$npools" | sed "s,$available_pools,,") + + # Add the list to the existing list of + # available pools + available_pools="$available_pools;$npools" + else + available_pools="$npools" + fi + fi + fi + + # Filter out any exceptions... + if [ -n "$ZFS_POOL_EXCEPTIONS" ] + then + local found="" + local apools="" + OLD_IFS="$IFS" ; IFS=";" + + for pool in $available_pools + do + for exception in $ZFS_POOL_EXCEPTIONS + do + [ "$pool" = "$exception" ] && continue 2 + found="$pool" + done + + if [ -n "$found" ] + then + if [ -n "$apools" ] + then + apools="$apools;$pool" + else + apools="$pool" + fi + fi + done + + IFS="$OLD_IFS" + available_pools="$apools" + fi + + # For backwards compatibility, make sure that ZPOOL_IMPORT_PATH is set + # to something we can use later with the real import(s). We want to + # make sure we find all by* dirs, BUT by-vdev should be first (if it + # exists). + if [ -n "$USE_DISK_BY_ID" -a -z "$ZPOOL_IMPORT_PATH" ] + then + local dirs + dirs="$(for dir in $(echo /dev/disk/by-*) + do + # Ignore by-vdev here - we want it first! + echo "$dir" | grep -q /by-vdev && continue + [ ! -d "$dir" ] && continue + + echo -n "$dir:" + done | sed 's,:$,,g')" + + if [ -d "/dev/disk/by-vdev" ] + then + # Add by-vdev at the beginning. + ZPOOL_IMPORT_PATH="/dev/disk/by-vdev:" + fi + + # Help with getting LUKS partitions etc imported. + if [ -d "/dev/mapper" ]; then + if [ -n "$ZPOOL_IMPORT_PATH" ]; then + ZPOOL_IMPORT_PATH="$ZPOOL_IMPORT_PATH:/dev/mapper:" + else + ZPOOL_IMPORT_PATH="/dev/mapper:" + fi + fi + + # ... and /dev at the very end, just for good measure. + ZPOOL_IMPORT_PATH="$ZPOOL_IMPORT_PATH$dirs:/dev" + fi + + # Needs to be exported for "zpool" to catch it. + [ -n "$ZPOOL_IMPORT_PATH" ] && export ZPOOL_IMPORT_PATH + + # Mount all available pools (except those set in ZFS_POOL_EXCEPTIONS. + # + # If not interactive (run from init - variable init='/sbin/init') + # we get ONE line for all pools being imported, with just a dot + # as status for each pool. + # Example: Importing ZFS pool(s)... [OK] + # + # If it IS interactive (started from the shell manually), then we + # get one line per pool importing. + # Example: Importing ZFS pool pool1 [OK] + # Importing ZFS pool pool2 [OK] + # [etc] + [ -n "$init" ] && zfs_log_begin_msg "Importing ZFS pool(s)" + OLD_IFS="$IFS" ; IFS=";" + for pool in $available_pools + do + [ -z "$pool" ] && continue + + # We have pools that haven't been imported - import them + if [ -n "$init" ] + then + # Not interactive - a dot for each pool. + # Except on Gentoo where this doesn't work. + zfs_log_progress_msg "." + else + # Interactive - one 'Importing ...' line per pool + zfs_log_begin_msg "Importing ZFS pool $pool" + fi + + # Import by using ZPOOL_IMPORT_PATH (either set above or in + # the config file) _or_ with the 'built in' default search + # paths. This is the preferred way. + "$ZPOOL" import -N ${ZPOOL_IMPORT_OPTS} "$pool" 2> /dev/null + r="$?" ; RET=$((RET + r)) + if [ "$r" -eq 0 ] + then + # Output success and process the next pool + [ -z "$init" ] && zfs_log_end_msg 0 + continue + fi + # We don't want a fail msg here, we're going to try import + # using the cache file soon and that might succeed. + [ ! -f "$ZPOOL_CACHE" ] && zfs_log_end_msg "$RET" + + if [ "$r" -gt 0 -a -f "$ZPOOL_CACHE" ] + then + # Failed to import without a cache file. Try WITH... 
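+			# ($RET accumulates the exit codes of all import attempts
+			# so far, so the log calls that print "$RET" report failure
+			# if any pool has failed to import, not just this one.)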
+ if [ -z "$init" ] && check_boolean "$VERBOSE_MOUNT" + then + # Interactive + Verbose = more information + zfs_log_progress_msg " using cache file" + fi + + "$ZPOOL" import -c "$ZPOOL_CACHE" -N ${ZPOOL_IMPORT_OPTS} \ + "$pool" 2> /dev/null + r="$?" ; RET=$((RET + r)) + if [ "$r" -eq 0 ] + then + [ -z "$init" ] && zfs_log_end_msg 0 + continue 3 # Next pool + fi + zfs_log_end_msg "$RET" + fi + done + [ -n "$init" ] && zfs_log_end_msg "$RET" + + IFS="$OLD_IFS" + [ -n "$already_imported" -a -z "$available_pools" ] && return 0 + + return "$RET" +} + +do_import() +{ + if check_boolean "$ZPOOL_IMPORT_ALL_VISIBLE" + then + do_import_all_visible + else + # This is the default option + do_verbatim_import + fi +} + +# Output the status and list of pools +do_status() +{ + check_module_loaded "zfs" || exit 0 + + "$ZPOOL" status && echo "" && "$ZPOOL" list +} + +do_start() +{ + if check_boolean "$VERBOSE_MOUNT" + then + zfs_log_begin_msg "Checking if ZFS userspace tools present" + fi + + if checksystem + then + check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 0 + + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_begin_msg "Loading kernel ZFS infrastructure" + + if ! load_module "zfs" + then + check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 1 + return 5 + fi + check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 0 + + do_import && udev_trigger # just to make sure we get zvols. + + return 0 + else + return 1 + fi +} + +# ---------------------------------------------------- + +if [ ! -e /sbin/openrc-run ] +then + case "$1" in + start) + do_start + ;; + stop) + # no-op + ;; + status) + do_status + ;; + force-reload|condrestart|reload|restart) + # no-op + ;; + *) + [ -n "$1" ] && echo "Error: Unknown command $1." + echo "Usage: $0 {start|status}" + exit 3 + ;; + esac + + exit $? +else + # Create wrapper functions since Gentoo don't use the case part. + depend() { do_depend; } + start() { do_start; } + status() { do_status; } +fi diff --git a/etc/init.d/zfs-mount.in b/etc/init.d/zfs-mount.in new file mode 100755 index 000000000000..9b400916f42e --- /dev/null +++ b/etc/init.d/zfs-mount.in @@ -0,0 +1,227 @@ +#!@DEFAULT_INIT_SHELL@ +# +# zfs-mount This script will mount/umount the zfs filesystems. +# +# chkconfig: 2345 06 99 +# description: This script will mount/umount the zfs filesystems during +# system boot/shutdown. Configuration of which filesystems +# should be mounted is handled by the zfs 'mountpoint' and +# 'canmount' properties. See the zfs(8) man page for details. +# It is also responsible for all userspace zfs services. +# probe: true +# +### BEGIN INIT INFO +# Provides: zfs-mount +# Required-Start: $local_fs zfs-import +# Required-Stop: $local_fs zfs-import +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# X-Stop-After: zfs-zed +# Short-Description: Mount ZFS filesystems and volumes +# Description: Run the `zfs mount -a` or `zfs umount -a` commands. +### END INIT INFO +# +# Released under the 2-clause BSD license. +# +# The original script that acted as a template for this script came from +# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a +# licensing stansa) in the commit dated Mar 24, 2011: +# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a + +# Source the common init script +. 
@sysconfdir@/zfs/zfs-functions + +# ---------------------------------------------------- + +chkroot() { + while read line; do + set -- $line + if [ "$2" = "/" ]; then + return 0 + fi + done < /proc/self/mounts + + return 1 +} + +do_depend() +{ + # Try to allow people to mix and match fstab with ZFS in a way that makes sense. + if [ "$(mountinfo -s /)" = 'zfs' ] + then + before localmount + else + after localmount + fi + + # bootmisc will log to /var which may be a different zfs than root. + before bootmisc logger + + after zfs-import sysfs + use mtab + keyword -lxc -openvz -prefix -vserver +} + +# Mount all datasets/filesystems +do_mount() +{ + local verbose overlay i mntpt val + + check_boolean "$VERBOSE_MOUNT" && verbose=v + check_boolean "$DO_OVERLAY_MOUNTS" && overlay=O + + zfs_action "Mounting ZFS filesystem(s)" \ + "$ZFS" mount -a$verbose$overlay "$MOUNT_EXTRA_OPTIONS" + + # Require each volume/filesystem to have 'noauto' and no fsck + # option. This shouldn't really be necessary, as long as one + # can get zfs-import to run sufficiently early on in the boot + # process - before local mounts. This is just here in case/if + # this isn't possible. + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_begin_msg "Mounting volumes and filesystems registered in fstab" + + read_mtab "^/dev/(zd|zvol)" + read_fstab "^/dev/(zd|zvol)" + i=0; var=$(eval echo FSTAB_$i) + while [ -n "$(eval echo "$""$var")" ] + do + mntpt=$(eval echo "$""$var") + dev=$(eval echo "$"FSTAB_dev_$i) + if ! in_mtab "$mntpt" && ! is_mounted "$mntpt" && [ -e "$dev" ] + then + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_progress_msg "$mntpt " + fsck "$dev" && mount "$mntpt" + fi + + i=$((i + 1)) + var=$(eval echo FSTAB_$i) + done + + read_mtab "[[:space:]]zfs[[:space:]]" + read_fstab "[[:space:]]zfs[[:space:]]" + i=0; var=$(eval echo FSTAB_$i) + while [ -n "$(eval echo "$""$var")" ] + do + mntpt=$(eval echo "$""$var") + if ! in_mtab "$mntpt" && ! is_mounted "$mntpt" + then + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_progress_msg "$mntpt " + mount "$mntpt" + fi + + i=$((i + 1)) + var=$(eval echo FSTAB_$i) + done + check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 0 + + return 0 +} + +# Unmount all filesystems +do_unmount() +{ + local i var mntpt + + # This shouldn't really be necessary, as long as one can get + # zfs-import to run sufficiently late in the shutdown/reboot process + # - after unmounting local filesystems. This is just here in case/if + # this isn't possible. 
+ zfs_action "Unmounting ZFS filesystems" "$ZFS" unmount -a + + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_begin_msg "Unmounting volumes and filesystems registered in fstab" + + read_mtab "^/dev/(zd|zvol)" + read_fstab "^/dev/(zd|zvol)" + i=0; var=$(eval echo FSTAB_$i) + while [ -n "$(eval echo "$""$var")" ] + do + mntpt=$(eval echo "$""$var") + dev=$(eval echo "$"FSTAB_dev_$i) + if in_mtab "$mntpt" + then + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_progress_msg "$mntpt " + umount "$mntpt" + fi + + i=$((i + 1)) + var=$(eval echo FSTAB_$i) + done + + read_mtab "[[:space:]]zfs[[:space:]]" + read_fstab "[[:space:]]zfs[[:space:]]" + i=0; var=$(eval echo FSTAB_$i) + while [ -n "$(eval echo "$""$var")" ] + do + mntpt=$(eval echo "$""$var") + if in_mtab "$mntpt"; then + check_boolean "$VERBOSE_MOUNT" && \ + zfs_log_progress_msg "$mntpt " + umount "$mntpt" + fi + + i=$((i + 1)) + var=$(eval echo FSTAB_$i) + done + check_boolean "$VERBOSE_MOUNT" && zfs_log_end_msg 0 + + return 0 +} + +do_start() +{ + check_boolean "$ZFS_MOUNT" || exit 0 + + check_module_loaded "zfs" || exit 0 + + # Ensure / exists in /proc/self/mounts. + # This should be handled by rc.sysinit but lets be paranoid. + if ! chkroot + then + mount -f / + fi + + do_mount +} + +do_stop() +{ + check_boolean "$ZFS_UNMOUNT" || exit 0 + + check_module_loaded "zfs" || exit 0 + + do_unmount +} + +# ---------------------------------------------------- + +if [ ! -e /sbin/openrc-run ] +then + case "$1" in + start) + do_start + ;; + stop) + do_stop + ;; + force-reload|condrestart|reload|restart|status) + # no-op + ;; + *) + [ -n "$1" ] && echo "Error: Unknown command $1." + echo "Usage: $0 {start|stop}" + exit 3 + ;; + esac + + exit $? +else + # Create wrapper functions since Gentoo don't use the case part. + depend() { do_depend; } + start() { do_start; } + stop() { do_stop; } +fi diff --git a/etc/init.d/zfs-share.in b/etc/init.d/zfs-share.in new file mode 100755 index 000000000000..3256d1d067f1 --- /dev/null +++ b/etc/init.d/zfs-share.in @@ -0,0 +1,85 @@ +#!@DEFAULT_INIT_SHELL@ +# +# zfs-share This script will network share zfs filesystems and volumes. +# +# chkconfig: 2345 30 99 +# description: Run the `zfs share -a` or `zfs unshare -a` commands +# for controlling iSCSI, NFS, or CIFS network shares. +# probe: true +# +### BEGIN INIT INFO +# Provides: zfs-share +# Required-Start: $local_fs $network $remote_fs zfs-mount +# Required-Stop: $local_fs $network $remote_fs zfs-mount +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Should-Start: iscsi iscsitarget istgt scst @DEFAULT_INIT_NFS_SERVER@ samba samba4 zfs-mount zfs-zed +# Should-Stop: iscsi iscsitarget istgt scst @DEFAULT_INIT_NFS_SERVER@ samba samba4 zfs-mount zfs-zed +# Short-Description: Network share ZFS datasets and volumes. +# Description: Run the `zfs share -a` or `zfs unshare -a` commands +# for controlling iSCSI, NFS, or CIFS network shares. +### END INIT INFO +# +# Released under the 2-clause BSD license. +# +# The original script that acted as a template for this script came from +# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a +# licensing stansa) in the commit dated Mar 24, 2011: +# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a + +# Source the common init script +. 
@sysconfdir@/zfs/zfs-functions + +# ---------------------------------------------------- + +do_depend() +{ + after sysfs zfs-mount zfs-zed + keyword -lxc -openvz -prefix -vserver +} + +do_start() +{ + check_boolean "$ZFS_SHARE" || exit 0 + + check_module_loaded "zfs" || exit 0 + + zfs_action "Sharing ZFS filesystems" "$ZFS" share -a +} + +do_stop() +{ + check_boolean "$ZFS_UNSHARE" || exit 0 + + check_module_loaded "zfs" || exit 0 + + zfs_action "Unsharing ZFS filesystems" "$ZFS" unshare -a +} + +# ---------------------------------------------------- + +if [ ! -e /sbin/openrc-run ]; then + case "$1" in + start) + do_start + ;; + stop) + do_stop + ;; + force-reload|reload|restart|status) + # no-op + ;; + *) + [ -n "$1" ] && echo "Error: Unknown command $1." + echo "Usage: $0 {start|stop}" + exit 3 + ;; + esac + + exit $? +else + # Create wrapper functions since Gentoo don't use the case part. + depend() { do_depend; } + start() { do_start; } + stop() { do_stop; } +fi diff --git a/etc/init.d/zfs-zed.in b/etc/init.d/zfs-zed.in new file mode 100755 index 000000000000..6af9ee60c8c1 --- /dev/null +++ b/etc/init.d/zfs-zed.in @@ -0,0 +1,134 @@ +#!@DEFAULT_INIT_SHELL@ +# +# zfs-zed +# +# chkconfig: 2345 29 99 +# description: This script will start and stop the ZFS Event Daemon. +# probe: true +# +### BEGIN INIT INFO +# Provides: zfs-zed +# Required-Start: zfs-mount +# Required-Stop: zfs-mount +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# X-Stop-After: zfs-share +# Short-Description: ZFS Event Daemon +# Description: zed monitors ZFS events. When a zevent is posted, zed +# will run any scripts that have been enabled for the +# corresponding zevent class. +### END INIT INFO +# +# Released under the 2-clause BSD license. +# +# The original script that acted as a template for this script came from +# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a +# licensing stansa) in the commit dated Mar 24, 2011: +# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a + +# Source the common init script +. @sysconfdir@/zfs/zfs-functions + +ZED_NAME="zed" +ZED_PIDFILE="@runstatedir@/$ZED_NAME.pid" + +extra_started_commands="reload" + +# Exit if the package is not installed +[ -x "$ZED" ] || exit 0 + +# ---------------------------------------------------- + +do_depend() +{ + after zfs-mount localmount +} + +do_start() +{ + check_module_loaded "zfs" || exit 0 + + ZED_ARGS="$ZED_ARGS -p $ZED_PIDFILE" + + zfs_action "Starting ZFS Event Daemon" zfs_daemon_start \ + "$ZED_PIDFILE" "$ZED" "$ZED_ARGS" + return "$?" +} + +do_stop() +{ + local pools RET + check_module_loaded "zfs" || exit 0 + + zfs_action "Stopping ZFS Event Daemon" zfs_daemon_stop \ + "$ZED_PIDFILE" "$ZED" "$ZED_NAME" + if [ "$?" -eq "0" ] + then + # Let's see if we have any pools imported + pools=$("$ZPOOL" list -H -oname) + if [ -z "$pools" ] + then + # No pools imported, it is/should be safe/possible to + # unload modules. + zfs_action "Unloading modules" rmmod zfs zunicode \ + zavl zcommon znvpair zlua spl + return "$?" + fi + else + return "$?" + fi +} + +do_status() +{ + check_module_loaded "zfs" || exit 0 + + zfs_daemon_status "$ZED_PIDFILE" "$ZED" "$ZED_NAME" + return "$?" +} + +do_reload() +{ + check_module_loaded "zfs" || exit 0 + + zfs_action "Reloading ZFS Event Daemon" zfs_daemon_reload \ + "$ZED_PIDFILE" "$ZED_NAME" + return "$?" +} + +# ---------------------------------------------------- + +if [ ! 
-e /sbin/openrc-run ]; then + case "$1" in + start) + do_start + ;; + stop) + do_stop + ;; + status) + do_status + ;; + reload|force-reload) + do_reload + ;; + restart) + do_stop + do_start + ;; + *) + [ -n "$1" ] && echo "Error: Unknown command $1." + echo "Usage: $0 {start|stop|status|reload|restart}" + exit 1 + ;; + esac + + exit $? +else + # Create wrapper functions since Gentoo don't use the case part. + depend() { do_depend; } + start() { do_start; } + stop() { do_stop; } + status() { do_status; } + reload() { do_reload; } +fi diff --git a/etc/modules-load.d/.gitignore b/etc/modules-load.d/.gitignore new file mode 100644 index 000000000000..fee921708337 --- /dev/null +++ b/etc/modules-load.d/.gitignore @@ -0,0 +1 @@ +*.conf diff --git a/etc/modules-load.d/Makefile.am b/etc/modules-load.d/Makefile.am new file mode 100644 index 000000000000..8a2955767b1e --- /dev/null +++ b/etc/modules-load.d/Makefile.am @@ -0,0 +1,2 @@ +dist_modulesload_DATA = \ + zfs.conf diff --git a/etc/modules-load.d/zfs.conf b/etc/modules-load.d/zfs.conf new file mode 100644 index 000000000000..44e1bb3ed906 --- /dev/null +++ b/etc/modules-load.d/zfs.conf @@ -0,0 +1,3 @@ +# The default behavior is to allow udev to load the kernel modules on demand. +# Uncomment the following line to unconditionally load them at boot. +#zfs diff --git a/etc/sudoers.d/Makefile.am b/etc/sudoers.d/Makefile.am new file mode 100644 index 000000000000..6f7ac8dbfd61 --- /dev/null +++ b/etc/sudoers.d/Makefile.am @@ -0,0 +1,5 @@ +sudoersddir = $(sysconfdir)/sudoers.d +sudoersd_DATA = zfs + +EXTRA_DIST = \ + zfs diff --git a/etc/sudoers.d/zfs b/etc/sudoers.d/zfs new file mode 100644 index 000000000000..82a25ba81ec7 --- /dev/null +++ b/etc/sudoers.d/zfs @@ -0,0 +1,9 @@ +## +## Allow any user to run `zpool iostat/status -c smart` in order +## to read basic SMART health statistics for a pool. +## +## CAUTION: Any syntax error introduced here will break sudo. 
+## Editing with 'visudo' is recommended: visudo -f /etc/sudoers.d/zfs +## + +# ALL ALL = (root) NOPASSWD: /usr/sbin/smartctl -a /dev/[hsv]d[a-z0-9]* diff --git a/etc/systemd/Makefile.am b/etc/systemd/Makefile.am new file mode 100644 index 000000000000..7b47b93fc105 --- /dev/null +++ b/etc/systemd/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = system system-generators diff --git a/etc/systemd/system-generators/.gitignore b/etc/systemd/system-generators/.gitignore new file mode 100644 index 000000000000..fc2ebc1a2950 --- /dev/null +++ b/etc/systemd/system-generators/.gitignore @@ -0,0 +1 @@ +zfs-mount-generator diff --git a/etc/systemd/system-generators/Makefile.am b/etc/systemd/system-generators/Makefile.am new file mode 100644 index 000000000000..fee88dad8ca1 --- /dev/null +++ b/etc/systemd/system-generators/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Substfiles.am + +systemdgenerator_SCRIPTS = \ + zfs-mount-generator + +SUBSTFILES += $(systemdgenerator_SCRIPTS) diff --git a/etc/systemd/system-generators/zfs-mount-generator.in b/etc/systemd/system-generators/zfs-mount-generator.in new file mode 100755 index 000000000000..fdef13cfa95a --- /dev/null +++ b/etc/systemd/system-generators/zfs-mount-generator.in @@ -0,0 +1,450 @@ +#!/bin/sh + +# zfs-mount-generator - generates systemd mount units for zfs +# Copyright (c) 2017 Antonio Russo +# Copyright (c) 2020 InsanePrawn +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +set -e + +FSLIST="@sysconfdir@/zfs/zfs-list.cache" + +[ -d "${FSLIST}" ] || exit 0 + +do_fail() { + printf 'zfs-mount-generator: %s\n' "$*" > /dev/kmsg + exit 1 +} + +# test if $1 is in space-separated list $2 +is_known() { + query="$1" + IFS=' ' + # protect against special characters + set -f + for element in $2 ; do + if [ "$query" = "$element" ] ; then + return 0 + fi + done + return 1 +} + +# create dependency on unit file $1 +# of type $2, i.e. 
"wants" or "requires" +# in the target units from space-separated list $3 +create_dependencies() { + unitfile="$1" + suffix="$2" + # protect against special characters + set -f + for target in $3 ; do + target_dir="${dest_norm}/${target}.${suffix}/" + mkdir -p "${target_dir}" + ln -s "../${unitfile}" "${target_dir}" + done +} + +# see systemd.generator +if [ $# -eq 0 ] ; then + dest_norm="/tmp" +elif [ $# -eq 3 ] ; then + dest_norm="${1}" +else + do_fail "zero or three arguments required" +fi + + +# All needed information about each ZFS is available from +# zfs list -H -t filesystem -o +# cached in $FSLIST, and each line is processed by the following function: +# See the list below for the properties and their order + +process_line() { + + # zfs list -H -o name,... + # fields are tab separated + IFS="$(printf '\t')" + # protect against special characters in, e.g., mountpoints + set -f + # shellcheck disable=SC2086 + set -- $1 + dataset="${1}" + p_mountpoint="${2}" + p_canmount="${3}" + p_atime="${4}" + p_relatime="${5}" + p_devices="${6}" + p_exec="${7}" + p_readonly="${8}" + p_setuid="${9}" + p_nbmand="${10}" + p_encroot="${11}" + p_keyloc="${12}" + p_systemd_requires="${13}" + p_systemd_requiresmountsfor="${14}" + p_systemd_before="${15}" + p_systemd_after="${16}" + p_systemd_wantedby="${17}" + p_systemd_requiredby="${18}" + p_systemd_nofail="${19}" + p_systemd_ignore="${20}" + + # Minimal pre-requisites to mount a ZFS dataset + # By ordering before zfs-mount.service, we avoid race conditions. + after="zfs-import.target" + before="zfs-mount.service" + wants="zfs-import.target" + requires="" + requiredmounts="" + bindsto="" + wantedby="" + requiredby="" + noauto="off" + + if [ -n "${p_systemd_after}" ] && \ + [ "${p_systemd_after}" != "-" ] ; then + after="${p_systemd_after} ${after}" + fi + + if [ -n "${p_systemd_before}" ] && \ + [ "${p_systemd_before}" != "-" ] ; then + before="${p_systemd_before} ${before}" + fi + + if [ -n "${p_systemd_requires}" ] && \ + [ "${p_systemd_requires}" != "-" ] ; then + requires="Requires=${p_systemd_requires}" + fi + + if [ -n "${p_systemd_requiresmountsfor}" ] && \ + [ "${p_systemd_requiresmountsfor}" != "-" ] ; then + requiredmounts="RequiresMountsFor=${p_systemd_requiresmountsfor}" + fi + + # Handle encryption + if [ -n "${p_encroot}" ] && + [ "${p_encroot}" != "-" ] ; then + keyloadunit="zfs-load-key-$(systemd-escape "${p_encroot}").service" + if [ "${p_encroot}" = "${dataset}" ] ; then + keymountdep="" + if [ "${p_keyloc%%://*}" = "file" ] ; then + if [ -n "${requiredmounts}" ] ; then + keymountdep="${requiredmounts} '${p_keyloc#file://}'" + else + keymountdep="RequiresMountsFor='${p_keyloc#file://}'" + fi + keyloadscript="@sbindir@/zfs load-key \"${dataset}\"" + elif [ "${p_keyloc}" = "prompt" ] ; then + keyloadscript="\ +count=0;\ +while [ \$\$count -lt 3 ];do\ + systemd-ask-password --id=\"zfs:${dataset}\"\ + \"Enter passphrase for ${dataset}:\"|\ + @sbindir@/zfs load-key \"${dataset}\" && exit 0;\ + count=\$\$((count + 1));\ +done;\ +exit 1" + else + printf 'zfs-mount-generator: (%s) invalid keylocation\n' \ + "${dataset}" >/dev/kmsg + fi + keyloadcmd="\ +/bin/sh -c '\ +set -eu;\ +keystatus=\"\$\$(@sbindir@/zfs get -H -o value keystatus \"${dataset}\")\";\ +[ \"\$\$keystatus\" = \"unavailable\" ] || exit 0;\ +${keyloadscript}'" + keyunloadcmd="\ +/bin/sh -c '\ +set -eu;\ +keystatus=\"\$\$(@sbindir@/zfs get -H -o value keystatus \"${dataset}\")\";\ +[ \"\$\$keystatus\" = \"available\" ] || exit 0;\ +@sbindir@/zfs unload-key \"${dataset}\"'" + + + 
+ # Generate the key-load .service unit + # + # Note: It is tempting to use a `< "${dest_norm}/${keyloadunit}" + fi + # Update the dependencies for the mount file to want the + # key-loading unit. + wants="${wants}" + bindsto="BindsTo=${keyloadunit}" + after="${after} ${keyloadunit}" + fi + + # Prepare the .mount unit + + # skip generation of the mount unit if org.openzfs.systemd:ignore is "on" + if [ -n "${p_systemd_ignore}" ] ; then + if [ "${p_systemd_ignore}" = "on" ] ; then + return + elif [ "${p_systemd_ignore}" = "-" ] \ + || [ "${p_systemd_ignore}" = "off" ] ; then + : # This is OK + else + do_fail "invalid org.openzfs.systemd:ignore for ${dataset}" + fi + fi + + # Check for canmount=off . + if [ "${p_canmount}" = "off" ] ; then + return + elif [ "${p_canmount}" = "noauto" ] ; then + noauto="on" + elif [ "${p_canmount}" = "on" ] ; then + : # This is OK + else + do_fail "invalid canmount for ${dataset}" + fi + + # Check for legacy and blank mountpoints. + if [ "${p_mountpoint}" = "legacy" ] ; then + return + elif [ "${p_mountpoint}" = "none" ] ; then + return + elif [ "${p_mountpoint%"${p_mountpoint#?}"}" != "/" ] ; then + do_fail "invalid mountpoint for ${dataset}" + fi + + # Escape the mountpoint per systemd policy. + mountfile="$(systemd-escape --path --suffix=mount "${p_mountpoint}")" + + # Parse options + # see lib/libzfs/libzfs_mount.c:zfs_add_options + opts="" + + # atime + if [ "${p_atime}" = on ] ; then + # relatime + if [ "${p_relatime}" = on ] ; then + opts="${opts},atime,relatime" + elif [ "${p_relatime}" = off ] ; then + opts="${opts},atime,strictatime" + else + printf 'zfs-mount-generator: (%s) invalid relatime\n' \ + "${dataset}" >/dev/kmsg + fi + elif [ "${p_atime}" = off ] ; then + opts="${opts},noatime" + else + printf 'zfs-mount-generator: (%s) invalid atime\n' \ + "${dataset}" >/dev/kmsg + fi + + # devices + if [ "${p_devices}" = on ] ; then + opts="${opts},dev" + elif [ "${p_devices}" = off ] ; then + opts="${opts},nodev" + else + printf 'zfs-mount-generator: (%s) invalid devices\n' \ + "${dataset}" >/dev/kmsg + fi + + # exec + if [ "${p_exec}" = on ] ; then + opts="${opts},exec" + elif [ "${p_exec}" = off ] ; then + opts="${opts},noexec" + else + printf 'zfs-mount-generator: (%s) invalid exec\n' \ + "${dataset}" >/dev/kmsg + fi + + # readonly + if [ "${p_readonly}" = on ] ; then + opts="${opts},ro" + elif [ "${p_readonly}" = off ] ; then + opts="${opts},rw" + else + printf 'zfs-mount-generator: (%s) invalid readonly\n' \ + "${dataset}" >/dev/kmsg + fi + + # setuid + if [ "${p_setuid}" = on ] ; then + opts="${opts},suid" + elif [ "${p_setuid}" = off ] ; then + opts="${opts},nosuid" + else + printf 'zfs-mount-generator: (%s) invalid setuid\n' \ + "${dataset}" >/dev/kmsg + fi + + # nbmand + if [ "${p_nbmand}" = on ] ; then + opts="${opts},mand" + elif [ "${p_nbmand}" = off ] ; then + opts="${opts},nomand" + else + printf 'zfs-mount-generator: (%s) invalid nbmand\n' \ + "${dataset}" >/dev/kmsg + fi + + if [ -n "${p_systemd_wantedby}" ] && \ + [ "${p_systemd_wantedby}" != "-" ] ; then + noauto="on" + if [ "${p_systemd_wantedby}" = "none" ] ; then + wantedby="" + else + wantedby="${p_systemd_wantedby}" + before="${before} ${wantedby}" + fi + fi + + if [ -n "${p_systemd_requiredby}" ] && \ + [ "${p_systemd_requiredby}" != "-" ] ; then + noauto="on" + if [ "${p_systemd_requiredby}" = "none" ] ; then + requiredby="" + else + requiredby="${p_systemd_requiredby}" + before="${before} ${requiredby}" + fi + fi + + # For datasets with canmount=on, a dependency is created for 
+ # local-fs.target by default. To avoid regressions, this dependency + # is reduced to "wants" rather than "requires" when nofail is not "off". + # **THIS MAY CHANGE** + # noauto=on disables this behavior completely. + if [ "${noauto}" != "on" ] ; then + if [ "${p_systemd_nofail}" = "off" ] ; then + requiredby="local-fs.target" + before="${before} local-fs.target" + else + wantedby="local-fs.target" + if [ "${p_systemd_nofail}" != "on" ] ; then + before="${before} local-fs.target" + fi + fi + fi + + # Handle existing files: + # 1. We never overwrite existing files, although we may delete + # files if we're sure they were created by us. (see 5.) + # 2. We handle files differently based on canmount. Units with canmount=on + # always have precedence over noauto. This is enforced by the sort pipe + # in the loop around this function. + # It is important to use $p_canmount and not $noauto here, since we + # sort by canmount while other properties also modify $noauto, e.g. + # org.openzfs.systemd:wanted-by. + # 3. If no unit file exists for a noauto dataset, we create one. + # Additionally, we use $noauto_files to track the unit file names + # (which are the systemd-escaped mountpoints) of all (exclusively) + # noauto datasets that had a file created. + # 4. If the file to be created is found in the tracking variable, + # we do NOT create it. + # 5. If a file exists for a noauto dataset, we check whether the file + # name is in the variable. If it is, we have multiple noauto datasets + # for the same mountpoint. In such cases, we remove the file for safety. + # To avoid further noauto datasets creating a file for this path again, + # we leave the file name in the tracking variable. + if [ -e "${dest_norm}/${mountfile}" ] ; then + if is_known "$mountfile" "$noauto_files" ; then + # if it's in $noauto_files, we must be noauto too. See 2. + printf 'zfs-mount-generator: removing duplicate noauto %s\n' \ + "${mountfile}" >/dev/kmsg + # See 5. + rm "${dest_norm}/${mountfile}" + else + # don't log for canmount=noauto + if [ "${p_canmount}" = "on" ] ; then + printf 'zfs-mount-generator: %s already exists. Skipping.\n' \ + "${mountfile}" >/dev/kmsg + fi + fi + # file exists; Skip current dataset. + return + else + if is_known "${mountfile}" "${noauto_files}" ; then + # See 4. + return + elif [ "${p_canmount}" = "noauto" ] ; then + noauto_files="${mountfile} ${noauto_files}" + fi + fi + + # Create the .mount unit file. 
+ # + # (Do not use `< "${dest_norm}/${mountfile}" + + # Finally, create the appropriate dependencies + create_dependencies "${mountfile}" "wants" "$wantedby" + create_dependencies "${mountfile}" "requires" "$requiredby" + +} + +for cachefile in "${FSLIST}/"* ; do + # Sort cachefile's lines by canmount, "on" before "noauto" + # and feed each line into process_line + sort -t "$(printf '\t')" -k 3 -r "${cachefile}" | \ + ( # subshell is necessary for `sort|while read` and $noauto_files + noauto_files="" + while read -r fs ; do + process_line "${fs}" + done + ) +done diff --git a/etc/systemd/system/.gitignore b/etc/systemd/system/.gitignore new file mode 100644 index 000000000000..efada54ad932 --- /dev/null +++ b/etc/systemd/system/.gitignore @@ -0,0 +1,3 @@ +*.service +*.target +*.preset diff --git a/etc/systemd/system/50-zfs.preset.in b/etc/systemd/system/50-zfs.preset.in new file mode 100644 index 000000000000..e4056a92cd98 --- /dev/null +++ b/etc/systemd/system/50-zfs.preset.in @@ -0,0 +1,9 @@ +# ZFS is enabled by default +enable zfs-import-cache.service +disable zfs-import-scan.service +enable zfs-import.target +enable zfs-mount.service +enable zfs-share.service +enable zfs-zed.service +enable zfs-volume-wait.service +enable zfs.target diff --git a/etc/systemd/system/Makefile.am b/etc/systemd/system/Makefile.am new file mode 100644 index 000000000000..c374a52ac7db --- /dev/null +++ b/etc/systemd/system/Makefile.am @@ -0,0 +1,21 @@ +include $(top_srcdir)/config/Substfiles.am + +systemdpreset_DATA = \ + 50-zfs.preset + +systemdunit_DATA = \ + zfs-zed.service \ + zfs-import-cache.service \ + zfs-import-scan.service \ + zfs-mount.service \ + zfs-share.service \ + zfs-volume-wait.service \ + zfs-import.target \ + zfs-volumes.target \ + zfs.target + +SUBSTFILES += $(systemdpreset_DATA) $(systemdunit_DATA) + +install-data-hook: + $(MKDIR_P) "$(DESTDIR)$(systemdunitdir)" + ln -sf /dev/null "$(DESTDIR)$(systemdunitdir)/zfs-import.service" diff --git a/etc/systemd/system/zfs-import-cache.service.in b/etc/systemd/system/zfs-import-cache.service.in new file mode 100644 index 000000000000..47c5b07f8ff0 --- /dev/null +++ b/etc/systemd/system/zfs-import-cache.service.in @@ -0,0 +1,20 @@ +[Unit] +Description=Import ZFS pools by cache file +Documentation=man:zpool(8) +DefaultDependencies=no +Requires=systemd-udev-settle.service +After=systemd-udev-settle.service +After=cryptsetup.target +After=multipathd.target +After=systemd-remount-fs.service +Before=zfs-import.target +ConditionPathExists=@sysconfdir@/zfs/zpool.cache +ConditionPathIsDirectory=/sys/module/zfs + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@sbindir@/zpool import -c @sysconfdir@/zfs/zpool.cache -aN + +[Install] +WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import-scan.service.in b/etc/systemd/system/zfs-import-scan.service.in new file mode 100644 index 000000000000..6520f32463dd --- /dev/null +++ b/etc/systemd/system/zfs-import-scan.service.in @@ -0,0 +1,19 @@ +[Unit] +Description=Import ZFS pools by device scanning +Documentation=man:zpool(8) +DefaultDependencies=no +Requires=systemd-udev-settle.service +After=systemd-udev-settle.service +After=cryptsetup.target +After=multipathd.target +Before=zfs-import.target +ConditionPathExists=!@sysconfdir@/zfs/zpool.cache +ConditionPathIsDirectory=/sys/module/zfs + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@sbindir@/zpool import -aN -o cachefile=none + +[Install] +WantedBy=zfs-import.target diff --git a/etc/systemd/system/zfs-import.target.in 
b/etc/systemd/system/zfs-import.target.in new file mode 100644 index 000000000000..8d78a7a96030 --- /dev/null +++ b/etc/systemd/system/zfs-import.target.in @@ -0,0 +1,6 @@ +[Unit] +Description=ZFS pool import target +Before=dracut-mount.service + +[Install] +WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-mount.service.in b/etc/systemd/system/zfs-mount.service.in new file mode 100644 index 000000000000..480f39a49769 --- /dev/null +++ b/etc/systemd/system/zfs-mount.service.in @@ -0,0 +1,18 @@ +[Unit] +Description=Mount ZFS filesystems +Documentation=man:zfs(8) +DefaultDependencies=no +After=systemd-udev-settle.service +After=zfs-import.target +After=systemd-remount-fs.service +Before=local-fs.target +Before=systemd-random-seed.service +ConditionPathIsDirectory=/sys/module/zfs + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@sbindir@/zfs mount -a + +[Install] +WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-share.service.in b/etc/systemd/system/zfs-share.service.in new file mode 100644 index 000000000000..b720085874e5 --- /dev/null +++ b/etc/systemd/system/zfs-share.service.in @@ -0,0 +1,18 @@ +[Unit] +Description=ZFS file system shares +Documentation=man:zfs(8) +After=nfs-server.service nfs-kernel-server.service +After=smb.service +Before=rpc-statd-notify.service +Wants=zfs-mount.service +After=zfs-mount.service +PartOf=nfs-server.service nfs-kernel-server.service +PartOf=smb.service + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@sbindir@/zfs share -a + +[Install] +WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-volume-wait.service.in b/etc/systemd/system/zfs-volume-wait.service.in new file mode 100644 index 000000000000..75bd9fcdd56c --- /dev/null +++ b/etc/systemd/system/zfs-volume-wait.service.in @@ -0,0 +1,13 @@ +[Unit] +Description=Wait for ZFS Volume (zvol) links in /dev +DefaultDependencies=no +After=systemd-udev-settle.service +After=zfs-import.target + +[Service] +Type=oneshot +RemainAfterExit=yes +ExecStart=@bindir@/zvol_wait + +[Install] +WantedBy=zfs-volumes.target diff --git a/etc/systemd/system/zfs-volumes.target.in b/etc/systemd/system/zfs-volumes.target.in new file mode 100644 index 000000000000..5cb9a10f49c5 --- /dev/null +++ b/etc/systemd/system/zfs-volumes.target.in @@ -0,0 +1,7 @@ +[Unit] +Description=ZFS volumes are ready +After=zfs-volume-wait.service +Requires=zfs-volume-wait.service + +[Install] +WantedBy=zfs.target diff --git a/etc/systemd/system/zfs-zed.service.in b/etc/systemd/system/zfs-zed.service.in new file mode 100644 index 000000000000..f4313625ee5e --- /dev/null +++ b/etc/systemd/system/zfs-zed.service.in @@ -0,0 +1,11 @@ +[Unit] +Description=ZFS Event Daemon (zed) +Documentation=man:zed(8) + +[Service] +ExecStart=@sbindir@/zed -F +Restart=on-abort + +[Install] +Alias=zed.service +WantedBy=zfs.target diff --git a/etc/systemd/system/zfs.target.in b/etc/systemd/system/zfs.target.in new file mode 100644 index 000000000000..4699463b0ddf --- /dev/null +++ b/etc/systemd/system/zfs.target.in @@ -0,0 +1,5 @@ +[Unit] +Description=ZFS startup target + +[Install] +WantedBy=multi-user.target diff --git a/etc/zfs/.gitignore b/etc/zfs/.gitignore new file mode 100644 index 000000000000..1b2d752debda --- /dev/null +++ b/etc/zfs/.gitignore @@ -0,0 +1 @@ +zfs-functions diff --git a/etc/zfs/Makefile.am b/etc/zfs/Makefile.am new file mode 100644 index 000000000000..b9123c176b93 --- /dev/null +++ b/etc/zfs/Makefile.am @@ -0,0 +1,15 @@ +include $(top_srcdir)/config/Substfiles.am + +pkgsysconfdir = $(sysconfdir)/zfs + 
+dist_pkgsysconf_DATA = \ + vdev_id.conf.alias.example \ + vdev_id.conf.sas_direct.example \ + vdev_id.conf.sas_switch.example \ + vdev_id.conf.multipath.example \ + vdev_id.conf.scsi.example + +pkgsysconf_SCRIPTS = \ + zfs-functions + +SUBSTFILES += $(pkgsysconf_SCRIPTS) diff --git a/etc/zfs/vdev_id.conf.alias.example b/etc/zfs/vdev_id.conf.alias.example new file mode 100644 index 000000000000..33735b05b956 --- /dev/null +++ b/etc/zfs/vdev_id.conf.alias.example @@ -0,0 +1,4 @@ +# by-vdev +# name fully qualified or base name of device link +alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca +alias d2 wwn-0x5000c5002def789e diff --git a/etc/zfs/vdev_id.conf.multipath.example b/etc/zfs/vdev_id.conf.multipath.example new file mode 100644 index 000000000000..c1359d37d59c --- /dev/null +++ b/etc/zfs/vdev_id.conf.multipath.example @@ -0,0 +1,7 @@ +multipath yes + +# PCI_ID HBA PORT CHANNEL NAME +channel 85:00.0 1 A +channel 85:00.0 0 B +channel 86:00.0 1 A +channel 86:00.0 0 B diff --git a/etc/zfs/vdev_id.conf.sas_direct.example b/etc/zfs/vdev_id.conf.sas_direct.example new file mode 100644 index 000000000000..d17ed149d89b --- /dev/null +++ b/etc/zfs/vdev_id.conf.sas_direct.example @@ -0,0 +1,28 @@ +multipath no +topology sas_direct +phys_per_port 4 + +# Additionally create /dev/by-enclosure/ symlinks for enclosure devices +enclosure_symlinks yes + +# PCI_ID HBA PORT CHANNEL NAME +channel 85:00.0 1 A +channel 85:00.0 0 B +channel 86:00.0 1 C +channel 86:00.0 0 D + + +# Custom mapping for Channel A + +# Linux Mapped +# Slot Slot Channel +slot 1 7 A +slot 2 10 A +slot 3 3 A +slot 4 6 A + +# Default mapping for B, C, and D +slot 1 4 +slot 2 2 +slot 3 1 +slot 4 3 diff --git a/etc/zfs/vdev_id.conf.sas_switch.example b/etc/zfs/vdev_id.conf.sas_switch.example new file mode 100644 index 000000000000..b87d65527452 --- /dev/null +++ b/etc/zfs/vdev_id.conf.sas_switch.example @@ -0,0 +1,7 @@ +topology sas_switch + +# SWITCH PORT CHANNEL NAME +channel 1 A +channel 2 B +channel 3 C +channel 4 D diff --git a/etc/zfs/vdev_id.conf.scsi.example b/etc/zfs/vdev_id.conf.scsi.example new file mode 100644 index 000000000000..b8c0ab2bf67d --- /dev/null +++ b/etc/zfs/vdev_id.conf.scsi.example @@ -0,0 +1,9 @@ +multipath no +topology scsi +phys_per_port 1 +# Usually scsi disks are numbered from 0, but this can be offset, to +# match the physical bay numbers, as follows: +first_bay_number 1 + +# PCI_ID HBA PORT CHANNEL NAME +channel 0c:00.0 0 Y diff --git a/etc/zfs/zfs-functions.in b/etc/zfs/zfs-functions.in new file mode 100644 index 000000000000..c2ce6157c6e0 --- /dev/null +++ b/etc/zfs/zfs-functions.in @@ -0,0 +1,434 @@ +# This is a script with common functions etc used by zfs-import, zfs-mount, +# zfs-share and zfs-zed. +# +# It is _NOT_ to be called independently +# +# Released under the 2-clause BSD license. +# +# The original script that acted as a template for this script came from +# the Debian GNU/Linux kFreeBSD ZFS packages (which did not include a +# licensing stansa) in the commit dated Mar 24, 2011: +# https://github.com/zfsonlinux/pkg-zfs/commit/80a3ae582b59c0250d7912ba794dca9e669e605a + +PATH=/sbin:/bin:/usr/bin:/usr/sbin + +# Source function library +if [ -f /etc/rc.d/init.d/functions ]; then + # RedHat and derivates + . /etc/rc.d/init.d/functions +elif [ -L /etc/init.d/functions.sh ]; then + # Gentoo + . /etc/init.d/functions.sh +elif [ -f /lib/lsb/init-functions ]; then + # LSB, Debian GNU/Linux and derivates + . 
/lib/lsb/init-functions +fi + +# Of course the functions we need are called differently +# on different distributions - it would be way too easy +# otherwise!! +if type log_failure_msg > /dev/null 2>&1 ; then + # LSB functions - fall through + zfs_log_begin_msg() { log_begin_msg "$1"; } + zfs_log_end_msg() { log_end_msg "$1"; } + zfs_log_failure_msg() { log_failure_msg "$1"; } + zfs_log_progress_msg() { log_progress_msg "$1"; } +elif type success > /dev/null 2>&1 ; then + # Fedora/RedHat functions + zfs_set_ifs() { + # For some reason, the init function library have a problem + # with a changed IFS, so this function goes around that. + local tIFS="$1" + if [ -n "$tIFS" ] + then + TMP_IFS="$IFS" + IFS="$tIFS" + fi + } + + zfs_log_begin_msg() { echo -n "$1 "; } + zfs_log_end_msg() { + zfs_set_ifs "$OLD_IFS" + if [ "$1" -eq 0 ]; then + success + else + failure + fi + echo + zfs_set_ifs "$TMP_IFS" + } + zfs_log_failure_msg() { + zfs_set_ifs "$OLD_IFS" + failure + echo + zfs_set_ifs "$TMP_IFS" + } + zfs_log_progress_msg() { echo -n $"$1"; } +elif type einfo > /dev/null 2>&1 ; then + # Gentoo functions + zfs_log_begin_msg() { ebegin "$1"; } + zfs_log_end_msg() { eend "$1"; } + zfs_log_failure_msg() { eend "$1"; } +# zfs_log_progress_msg() { echo -n "$1"; } + zfs_log_progress_msg() { echo -n; } +else + # Unknown - simple substitutes. + zfs_log_begin_msg() { echo -n "$1"; } + zfs_log_end_msg() { + ret=$1 + if [ "$ret" -ge 1 ]; then + echo " failed!" + else + echo " success" + fi + return "$ret" + } + zfs_log_failure_msg() { echo "$1"; } + zfs_log_progress_msg() { echo -n "$1"; } +fi + +# Paths to what we need +ZFS="@sbindir@/zfs" +ZED="@sbindir@/zed" +ZPOOL="@sbindir@/zpool" +ZPOOL_CACHE="@sysconfdir@/zfs/zpool.cache" + +# Sensible defaults +ZFS_MOUNT='yes' +ZFS_UNMOUNT='yes' +ZFS_SHARE='yes' +ZFS_UNSHARE='yes' + +# Source zfs configuration, overriding the defaults +if [ -f @initconfdir@/zfs ]; then + . @initconfdir@/zfs +fi + +# ---------------------------------------------------- + +export ZFS ZED ZPOOL ZPOOL_CACHE ZFS_MOUNT ZFS_UNMOUNT ZFS_SHARE ZFS_UNSHARE + +zfs_action() +{ + local MSG="$1"; shift + local CMD="$*" + local ret + + zfs_log_begin_msg "$MSG " + $CMD + ret=$? + if [ "$ret" -eq 0 ]; then + zfs_log_end_msg $ret + else + zfs_log_failure_msg $ret + fi + + return $ret +} + +# Returns +# 0 if daemon has been started +# 1 if daemon was already running +# 2 if daemon could not be started +# 3 if unsupported +# +zfs_daemon_start() +{ + local PIDFILE="$1"; shift + local DAEMON_BIN="$1"; shift + local DAEMON_ARGS="$*" + + if type start-stop-daemon > /dev/null 2>&1 ; then + # LSB functions + start-stop-daemon --start --quiet --pidfile "$PIDFILE" \ + --exec "$DAEMON_BIN" --test > /dev/null || return 1 + + start-stop-daemon --start --quiet --exec "$DAEMON_BIN" -- \ + $DAEMON_ARGS || return 2 + + # On Debian GNU/Linux, there's a 'sendsigs' script that will + # kill basically everything quite early and zed is stopped + # much later than that. We don't want zed to be among them, + # so add the zed pid to list of pids to ignore. + if [ -f "$PIDFILE" -a -d /run/sendsigs.omit.d ] + then + ln -sf "$PIDFILE" /run/sendsigs.omit.d/zed + fi + elif type daemon > /dev/null 2>&1 ; then + # Fedora/RedHat functions + daemon --pidfile "$PIDFILE" "$DAEMON_BIN" $DAEMON_ARGS + return $? 
+ else + # Unsupported + return 3 + fi + + return 0 +} + +# Returns +# 0 if daemon has been stopped +# 1 if daemon was already stopped +# 2 if daemon could not be stopped +# 3 if unsupported +# +zfs_daemon_stop() +{ + local PIDFILE="$1" + local DAEMON_BIN="$2" + local DAEMON_NAME="$3" + + if type start-stop-daemon > /dev/null 2>&1 ; then + # LSB functions + start-stop-daemon --stop --quiet --retry=TERM/30/KILL/5 \ + --pidfile "$PIDFILE" --name "$DAEMON_NAME" + [ "$?" = 0 ] && rm -f "$PIDFILE" + + return $? + elif type killproc > /dev/null 2>&1 ; then + # Fedora/RedHat functions + killproc -p "$PIDFILE" "$DAEMON_NAME" + [ "$?" = 0 ] && rm -f "$PIDFILE" + + return $? + else + # Unsupported + return 3 + fi + + return 0 +} + +# Returns status +zfs_daemon_status() +{ + local PIDFILE="$1" + local DAEMON_BIN="$2" + local DAEMON_NAME="$3" + + if type status_of_proc > /dev/null 2>&1 ; then + # LSB functions + status_of_proc "$DAEMON_NAME" "$DAEMON_BIN" + return $? + elif type status > /dev/null 2>&1 ; then + # Fedora/RedHat functions + status -p "$PIDFILE" "$DAEMON_NAME" + return $? + else + # Unsupported + return 3 + fi + + return 0 +} + +zfs_daemon_reload() +{ + local PIDFILE="$1" + local DAEMON_NAME="$2" + + if type start-stop-daemon > /dev/null 2>&1 ; then + # LSB functions + start-stop-daemon --stop --signal 1 --quiet \ + --pidfile "$PIDFILE" --name "$DAEMON_NAME" + return $? + elif type killproc > /dev/null 2>&1 ; then + # Fedora/RedHat functions + killproc -p "$PIDFILE" "$DAEMON_NAME" -HUP + return $? + else + # Unsupported + return 3 + fi + + return 0 +} + +zfs_installed() +{ + if [ ! -x "$ZPOOL" ]; then + return 1 + else + # Test if it works (will catch missing/broken libs etc) + "$ZPOOL" -? > /dev/null 2>&1 + return $? + fi + + if [ ! -x "$ZFS" ]; then + return 2 + else + # Test if it works (will catch missing/broken libs etc) + "$ZFS" -? > /dev/null 2>&1 + return $? + fi + + return 0 +} + +# Trigger udev and wait for it to settle. +udev_trigger() +{ + if [ -x /sbin/udevadm ]; then + /sbin/udevadm trigger --action=change --subsystem-match=block + /sbin/udevadm settle + elif [ -x /sbin/udevsettle ]; then + /sbin/udevtrigger + /sbin/udevsettle + fi +} + +# Do a lot of checks to make sure it's 'safe' to continue with the import. +checksystem() +{ + if grep -qiE '(^|[^\\](\\\\)* )zfs=(off|no|0)( |$)' /proc/cmdline; + then + # Called with zfs=(off|no|0) - bail because we don't + # want anything import, mounted or shared. + # HOWEVER, only do this if we're called at the boot up + # (from init), not if we're running interactively (as in + # from the shell - we know what we're doing). + [ -n "$init" ] && exit 3 + fi + + # Check if ZFS is installed. + zfs_installed || return 5 + + # Just make sure that /dev/zfs is created. + udev_trigger + + return 0 +} + +get_root_pool() +{ + set -- $(mount | grep ' on / ') + [ "$5" = "zfs" ] && echo "${1%%/*}" +} + +# Check if a variable is 'yes' (any case) or '1' +# Returns TRUE if set. +check_boolean() +{ + local var="$1" + + echo "$var" | grep -Eiq "^yes$|^on$|^true$|^1$" && return 0 || return 1 +} + +check_module_loaded() +{ + module="$1" + + [ -r "/sys/module/${module}/version" ] && return 0 || return 1 +} + +load_module() +{ + module="$1" + + # Load the zfs module stack + if ! check_module_loaded "$module"; then + if ! 
/sbin/modprobe "$module"; then + return 5 + fi + fi + return 0 +} + +# first parameter is a regular expression that filters mtab +read_mtab() +{ + local match="$1" + local fs mntpnt fstype opts rest TMPFILE + + # Unset all MTAB_* variables + unset $(env | grep ^MTAB_ | sed 's,=.*,,') + + while read -r fs mntpnt fstype opts rest; do + if echo "$fs $mntpnt $fstype $opts" | grep -qE "$match"; then + # * Fix problems (!?) in the mounts file. It will record + # 'rpool 1' as 'rpool\0401' instead of 'rpool\00401' + # which seems to be the correct (at least as far as + # 'printf' is concerned). + # * We need to use the external echo, because the + # internal one would interpret the backslash code + # (incorrectly), giving us a  instead. + mntpnt=$(/bin/echo "$mntpnt" | sed "s,\\\0,\\\00,g") + fs=$(/bin/echo "$fs" | sed "s,\\\0,\\\00,") + + # Remove 'unwanted' characters. + mntpnt=$(printf '%b\n' "$mntpnt" | sed -e 's,/,,g' \ + -e 's,-,,g' -e 's,\.,,g' -e 's, ,,g') + fs=$(printf '%b\n' "$fs") + + # Set the variable. + eval export MTAB_$mntpnt=\"$fs\" + fi + done < /proc/self/mounts +} + +in_mtab() +{ + local mntpnt="$1" + # Remove 'unwanted' characters. + mntpnt=$(printf '%b\n' "$mntpnt" | sed -e 's,/,,g' \ + -e 's,-,,g' -e 's,\.,,g' -e 's, ,,g') + local var + + var="$(eval echo MTAB_$mntpnt)" + [ "$(eval echo "$""$var")" != "" ] + return "$?" +} + +# first parameter is a regular expression that filters fstab +read_fstab() +{ + local match="$1" + local i var TMPFILE + + # Unset all FSTAB_* variables + unset $(env | grep ^FSTAB_ | sed 's,=.*,,') + + i=0 + while read -r fs mntpnt fstype opts; do + echo "$fs" | egrep -qE '^#|^$' && continue + echo "$mntpnt" | egrep -qE '^none|^swap' && continue + echo "$fstype" | egrep -qE '^swap' && continue + + if echo "$fs $mntpnt $fstype $opts" | grep -qE "$match"; then + eval export FSTAB_dev_$i="$fs" + fs=$(printf '%b\n' "$fs" | sed 's,/,_,g') + eval export FSTAB_$i="$mntpnt" + + i=$((i + 1)) + fi + done < /etc/fstab +} + +in_fstab() +{ + local var + + var="$(eval echo FSTAB_$1)" + [ "${var}" != "" ] + return $? +} + +is_mounted() +{ + local mntpt="$1" + local line + + mount | \ + while read line; do + if echo "$line" | grep -q " on $mntpt "; then + # returns: + # 0 on unsuccessful match + # 1 on a successful match + return 1 + fi + done + + # The negation will flip the subshell return result where the default + # return value is 0 when a match is not found. + return $(( !$? )) +} diff --git a/include/.gitignore b/include/.gitignore new file mode 100644 index 000000000000..e6eb2116fa29 --- /dev/null +++ b/include/.gitignore @@ -0,0 +1 @@ +/zfs_gitrev.h diff --git a/include/Makefile.am b/include/Makefile.am new file mode 100644 index 000000000000..0e997deaf173 --- /dev/null +++ b/include/Makefile.am @@ -0,0 +1,33 @@ +SUBDIRS = sys os + +COMMON_H = \ + cityhash.h \ + zfeature_common.h \ + zfs_comutil.h \ + zfs_deleg.h \ + zfs_fletcher.h \ + zfs_namecheck.h \ + zfs_prop.h + +USER_H = \ + libnvpair.h \ + libuutil_common.h \ + libuutil.h \ + libuutil_impl.h \ + libzfs.h \ + libzfs_core.h \ + libzfs_impl.h \ + libzutil.h \ + thread_pool.h + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs +libzfs_HEADERS = $(COMMON_H) $(USER_H) +endif + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include +kernel_HEADERS = $(COMMON_H) +endif +endif diff --git a/include/cityhash.h b/include/cityhash.h new file mode 100644 index 000000000000..33c3b7bc2532 --- /dev/null +++ b/include/cityhash.h @@ -0,0 +1,41 @@ +// Copyright (c) 2011 Google, Inc. 
+//
+// Permission is hereby granted, free of charge, to any person obtaining a copy
+// of this software and associated documentation files (the "Software"), to deal
+// in the Software without restriction, including without limitation the rights
+// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+// copies of the Software, and to permit persons to whom the Software is
+// furnished to do so, subject to the following conditions:
+//
+// The above copyright notice and this permission notice shall be included in
+// all copies or substantial portions of the Software.
+//
+// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+// THE SOFTWARE.
+
+
+/*
+ * Copyright (c) 2017 by Delphix. All rights reserved.
+ */
+
+#ifndef	_SYS_CITYHASH_H
+#define	_SYS_CITYHASH_H
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_CITYHASH_H */
diff --git a/include/libnvpair.h b/include/libnvpair.h
new file mode 100644
index 000000000000..5277f9574ddf
--- /dev/null
+++ b/include/libnvpair.h
@@ -0,0 +1,196 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef	_LIBNVPAIR_H
+#define	_LIBNVPAIR_H
+
+#include <sys/nvpair.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <regex.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * All interfaces described in this file are private to Solaris, and
+ * are subject to change at any time and without notice. The public
+ * nvlist/nvpair interfaces, as documented in manpage sections 3NVPAIR,
+ * are all imported from <sys/nvpair.h>, included above.
+ */
+
+extern int nvpair_value_match(nvpair_t *, int, char *, char **);
+extern int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *,
+    char **);
+
+extern void nvlist_print(FILE *, nvlist_t *);
+int nvlist_print_json(FILE *, nvlist_t *);
+extern void dump_nvlist(nvlist_t *, int);
+
+/*
+ * Private nvlist printing interface that allows the caller some control
+ * over output rendering (as opposed to nvlist_print and dump_nvlist).
+ * + * Obtain an opaque nvlist_prtctl_t cookie using nvlist_prtctl_alloc + * (NULL on failure); on return the cookie is set up for default formatting + * and rendering. Quote the cookie in subsequent customisation functions and + * then pass the cookie to nvlist_prt to render the nvlist. Finally, + * use nvlist_prtctl_free to release the cookie. + * + * For all nvlist_lookup_xxx and nvlist_lookup_xxx_array functions + * we have a corresponding brace of functions that appoint replacement + * rendering functions: + * + * extern void nvlist_prtctl_xxx(nvlist_prtctl_t, + * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, + * xxxtype value)) + * + * and + * + * extern void nvlist_prtctl_xxx_array(nvlist_prtctl_t, + * void (*)(nvlist_prtctl_t ctl, void *private, const char *name, + * xxxtype value, uint_t count)) + * + * where xxxtype is the C datatype corresponding to xxx, eg int8_t for "int8" + * and char * for "string". The function that is appointed to render the + * specified datatype receives as arguments the cookie, the nvlist + * member name, the value of that member (or a pointer for array function), + * and (for array rendering functions) a count of the number of elements. + */ + +typedef struct nvlist_prtctl *nvlist_prtctl_t; /* opaque */ + +enum nvlist_indent_mode { + NVLIST_INDENT_ABS, /* Absolute indentation */ + NVLIST_INDENT_TABBED /* Indent with tabstops */ +}; + +extern nvlist_prtctl_t nvlist_prtctl_alloc(void); +extern void nvlist_prtctl_free(nvlist_prtctl_t); +extern void nvlist_prt(nvlist_t *, nvlist_prtctl_t); + +/* Output stream */ +extern void nvlist_prtctl_setdest(nvlist_prtctl_t, FILE *); +extern FILE *nvlist_prtctl_getdest(nvlist_prtctl_t); + +/* Indentation mode, start indent, indent increment; default tabbed/0/1 */ +extern void nvlist_prtctl_setindent(nvlist_prtctl_t, enum nvlist_indent_mode, + int, int); +extern void nvlist_prtctl_doindent(nvlist_prtctl_t, int); + +enum nvlist_prtctl_fmt { + NVLIST_FMT_MEMBER_NAME, /* name fmt; default "%s = " */ + NVLIST_FMT_MEMBER_POSTAMBLE, /* after nvlist member; default "\n" */ + NVLIST_FMT_BTWN_ARRAY /* between array members; default " " */ +}; + +extern void nvlist_prtctl_setfmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, + const char *); +extern void nvlist_prtctl_dofmt(nvlist_prtctl_t, enum nvlist_prtctl_fmt, ...); + +/* + * Function prototypes for interfaces that appoint a new rendering function + * for single-valued nvlist members. + * + * A replacement function receives arguments as follows: + * + * nvlist_prtctl_t Print control structure; do not change preferences + * for this object from a print callback function. + * + * void * The function-private cookie argument registered + * when the replacement function was appointed. + * + * nvlist_t * The full nvlist that is being processed. The + * rendering function is called to render a single + * member (name and value passed as below) but it may + * want to reference or incorporate other aspects of + * the full nvlist. + * + * const char * Member name to render + * + * valtype Value of the member to render + * + * The function must return non-zero if it has rendered output for this + * member, or 0 if it wants to default to standard rendering for this + * one member. 
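+ *
+ * As an illustrative sketch (the function name is invented here and is
+ * not part of this interface), a renderer that prints "uint64" members
+ * in hexadecimal might look like:
+ *
+ *	static int
+ *	hex_uint64(nvlist_prtctl_t ctl, void *priv, nvlist_t *nvl,
+ *	    const char *name, uint64_t val)
+ *	{
+ *		(void) fprintf(nvlist_prtctl_getdest(ctl), "%s = 0x%llx",
+ *		    name, (u_longlong_t)val);
+ *		return (1);	/* nonzero: rendered, skip the default */
+ *	}
+ *
+ * appointed with nvlist_prtctlop_uint64(ctl, hex_uint64, NULL) before
+ * calling nvlist_prt(nvl, ctl).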
+ */ + +#define NVLIST_PRINTCTL_SVDECL(funcname, valtype) \ + extern void funcname(nvlist_prtctl_t, \ + int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, valtype), \ + void *) + +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean, int); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_boolean_value, boolean_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_byte, uchar_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int8, int8_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint8, uint8_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int16, int16_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint16, uint16_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int32, int32_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint32, uint32_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_int64, int64_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_uint64, uint64_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_double, double); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_string, char *); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_hrtime, hrtime_t); +NVLIST_PRINTCTL_SVDECL(nvlist_prtctlop_nvlist, nvlist_t *); + +#undef NVLIST_PRINTCTL_SVDECL /* was just for "clarity" above */ + +/* + * Function prototypes for interfaces that appoint a new rendering function + * for array-valued nvlist members. + * + * One additional argument is taken: uint_t for the number of array elements + * + * Return values as above. + */ +#define NVLIST_PRINTCTL_AVDECL(funcname, vtype) \ + extern void funcname(nvlist_prtctl_t, \ + int (*)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, uint_t), \ + void *) + +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_boolean_array, boolean_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_byte_array, uchar_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int8_array, int8_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint8_array, uint8_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int16_array, int16_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint16_array, uint16_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int32_array, int32_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint32_array, uint32_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_int64_array, int64_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_uint64_array, uint64_t *); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_string_array, char **); +NVLIST_PRINTCTL_AVDECL(nvlist_prtctlop_nvlist_array, nvlist_t **); + +#undef NVLIST_PRINTCTL_AVDECL /* was just for "clarity" above */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBNVPAIR_H */ diff --git a/include/libuutil.h b/include/libuutil.h new file mode 100644 index 000000000000..d0248901b4e7 --- /dev/null +++ b/include/libuutil.h @@ -0,0 +1,384 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. 
All rights reserved.
+ */
+
+#ifndef	_LIBUUTIL_H
+#define	_LIBUUTIL_H
+
+#include <sys/types.h>
+#include <stdarg.h>
+#include <stdio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Standard flags codes.
+ */
+#define	UU_DEFAULT		0
+
+/*
+ * Standard error codes.
+ */
+#define	UU_ERROR_NONE		0	/* no error */
+#define	UU_ERROR_INVALID_ARGUMENT 1	/* invalid argument */
+#define	UU_ERROR_UNKNOWN_FLAG	2	/* passed flag invalid */
+#define	UU_ERROR_NO_MEMORY	3	/* out of memory */
+#define	UU_ERROR_CALLBACK_FAILED 4	/* callback-initiated error */
+#define	UU_ERROR_NOT_SUPPORTED	5	/* operation not supported */
+#define	UU_ERROR_EMPTY		6	/* no value provided */
+#define	UU_ERROR_UNDERFLOW	7	/* value is too small */
+#define	UU_ERROR_OVERFLOW	8	/* value is too large */
+#define	UU_ERROR_INVALID_CHAR	9	/* value contains unexpected char */
+#define	UU_ERROR_INVALID_DIGIT	10	/* value contains digit not in base */
+
+#define	UU_ERROR_SYSTEM		99	/* underlying system error */
+#define	UU_ERROR_UNKNOWN	100	/* error status not known */
+
+/*
+ * Standard program exit codes.
+ */
+#define	UU_EXIT_OK	(*(uu_exit_ok()))
+#define	UU_EXIT_FATAL	(*(uu_exit_fatal()))
+#define	UU_EXIT_USAGE	(*(uu_exit_usage()))
+
+/*
+ * Exit status profiles.
+ */
+#define	UU_PROFILE_DEFAULT	0
+#define	UU_PROFILE_LAUNCHER	1
+
+/*
+ * Error reporting functions.
+ */
+uint32_t uu_error(void);
+const char *uu_strerror(uint32_t);
+
+/*
+ * Program notification functions.
+ */
+extern void uu_alt_exit(int);
+extern const char *uu_setpname(char *);
+extern const char *uu_getpname(void);
+/*PRINTFLIKE1*/
+extern void uu_warn(const char *, ...);
+extern void uu_vwarn(const char *, va_list);
+/*PRINTFLIKE1*/
+extern void uu_die(const char *, ...) __NORETURN;
+extern void uu_vdie(const char *, va_list) __NORETURN;
+/*PRINTFLIKE2*/
+extern void uu_xdie(int, const char *, ...) __NORETURN;
+extern void uu_vxdie(int, const char *, va_list) __NORETURN;
+
+/*
+ * Exit status functions (not to be used directly)
+ */
+extern int *uu_exit_ok(void);
+extern int *uu_exit_fatal(void);
+extern int *uu_exit_usage(void);
+
+/*
+ * Debug print facility functions.
+ */
+typedef struct uu_dprintf uu_dprintf_t;
+
+typedef enum {
+	UU_DPRINTF_SILENT,
+	UU_DPRINTF_FATAL,
+	UU_DPRINTF_WARNING,
+	UU_DPRINTF_NOTICE,
+	UU_DPRINTF_INFO,
+	UU_DPRINTF_DEBUG
+} uu_dprintf_severity_t;
+
+extern uu_dprintf_t *uu_dprintf_create(const char *, uu_dprintf_severity_t,
+    uint_t);
+/*PRINTFLIKE3*/
+extern void uu_dprintf(uu_dprintf_t *, uu_dprintf_severity_t,
+    const char *, ...);
+extern void uu_dprintf_destroy(uu_dprintf_t *);
+extern const char *uu_dprintf_getname(uu_dprintf_t *);
+
+/*
+ * Identifier test flags and function.
+ */
+#define	UU_NAME_DOMAIN		0x1	/* allow SUNW, or com.sun, prefix */
+#define	UU_NAME_PATH		0x2	/* allow '/'-delimited paths */
+
+int uu_check_name(const char *, uint_t);
+
+/*
+ * File creation functions.
+ */
+extern int uu_open_tmp(const char *dir, uint_t uflags);
+
+/*
+ * Convenience functions.
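+ *
+ * For example (an illustrative sketch only):
+ *
+ *	char *msg = uu_msprintf("%d of %d", done, total);
+ *	if (msg == NULL)
+ *		uu_die("out of memory\n");
+ *	...
+ *	uu_free(msg);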
+ */ +#define UU_NELEM(a) (sizeof (a) / sizeof ((a)[0])) + +/*PRINTFLIKE1*/ +extern char *uu_msprintf(const char *format, ...); +extern void *uu_zalloc(size_t); +extern char *uu_strdup(const char *); +extern void uu_free(void *); + +extern boolean_t uu_strcaseeq(const char *a, const char *b); +extern boolean_t uu_streq(const char *a, const char *b); +extern char *uu_strndup(const char *s, size_t n); +extern boolean_t uu_strbw(const char *a, const char *b); +extern void *uu_memdup(const void *buf, size_t sz); +extern void uu_dump(FILE *out, const char *prefix, const void *buf, size_t len); + +/* + * Comparison function type definition. + * Developers should be careful in their use of the _private argument. If you + * break interface guarantees, you get undefined behavior. + */ +typedef int uu_compare_fn_t(const void *__left, const void *__right, + void *__private); + +/* + * Walk variant flags. + * A data structure need not provide support for all variants and + * combinations. Refer to the appropriate documentation. + */ +#define UU_WALK_ROBUST 0x00000001 /* walk can survive removes */ +#define UU_WALK_REVERSE 0x00000002 /* reverse walk order */ + +#define UU_WALK_PREORDER 0x00000010 /* walk tree in pre-order */ +#define UU_WALK_POSTORDER 0x00000020 /* walk tree in post-order */ + +/* + * Walk callback function return codes. + */ +#define UU_WALK_ERROR -1 +#define UU_WALK_NEXT 0 +#define UU_WALK_DONE 1 + +/* + * Walk callback function type definition. + */ +typedef int uu_walk_fn_t(void *_elem, void *_private); + +/* + * lists: opaque structures + */ +typedef struct uu_list_pool uu_list_pool_t; +typedef struct uu_list uu_list_t; + +typedef struct uu_list_node { + uintptr_t uln_opaque[2]; +} uu_list_node_t; + +typedef struct uu_list_walk uu_list_walk_t; + +typedef uintptr_t uu_list_index_t; + +/* + * lists: interface + * + * basic usage: + * typedef struct foo { + * ... + * uu_list_node_t foo_node; + * ... + * } foo_t; + * + * static int + * foo_compare(void *l_arg, void *r_arg, void *private) + * { + * foo_t *l = l_arg; + * foo_t *r = r_arg; + * + * if (... l greater than r ...) + * return (1); + * if (... l less than r ...) + * return (-1); + * return (0); + * } + * + * ... + * // at initialization time + * foo_pool = uu_list_pool_create("foo_pool", + * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, + * debugging? 0 : UU_AVL_POOL_DEBUG); + * ... + */ +uu_list_pool_t *uu_list_pool_create(const char *, size_t, size_t, + uu_compare_fn_t *, uint32_t); +#define UU_LIST_POOL_DEBUG 0x00000001 + +void uu_list_pool_destroy(uu_list_pool_t *); + +/* + * usage: + * + * foo_t *a; + * a = malloc(sizeof (*a)); + * uu_list_node_init(a, &a->foo_list, pool); + * ... 
+ * uu_list_node_fini(a, &a->foo_list, pool); + * free(a); + */ +void uu_list_node_init(void *, uu_list_node_t *, uu_list_pool_t *); +void uu_list_node_fini(void *, uu_list_node_t *, uu_list_pool_t *); + +uu_list_t *uu_list_create(uu_list_pool_t *, void *_parent, uint32_t); +#define UU_LIST_DEBUG 0x00000001 +#define UU_LIST_SORTED 0x00000002 /* list is sorted */ + +void uu_list_destroy(uu_list_t *); /* list must be empty */ + +size_t uu_list_numnodes(uu_list_t *); + +void *uu_list_first(uu_list_t *); +void *uu_list_last(uu_list_t *); + +void *uu_list_next(uu_list_t *, void *); +void *uu_list_prev(uu_list_t *, void *); + +int uu_list_walk(uu_list_t *, uu_walk_fn_t *, void *, uint32_t); + +uu_list_walk_t *uu_list_walk_start(uu_list_t *, uint32_t); +void *uu_list_walk_next(uu_list_walk_t *); +void uu_list_walk_end(uu_list_walk_t *); + +void *uu_list_find(uu_list_t *, void *, void *, uu_list_index_t *); +void uu_list_insert(uu_list_t *, void *, uu_list_index_t); + +void *uu_list_nearest_next(uu_list_t *, uu_list_index_t); +void *uu_list_nearest_prev(uu_list_t *, uu_list_index_t); + +void *uu_list_teardown(uu_list_t *, void **); + +void uu_list_remove(uu_list_t *, void *); + +/* + * lists: interfaces for non-sorted lists only + */ +int uu_list_insert_before(uu_list_t *, void *_target, void *_elem); +int uu_list_insert_after(uu_list_t *, void *_target, void *_elem); + +/* + * avl trees: opaque structures + */ +typedef struct uu_avl_pool uu_avl_pool_t; +typedef struct uu_avl uu_avl_t; + +typedef struct uu_avl_node { +#ifdef _LP64 + uintptr_t uan_opaque[3]; +#else + uintptr_t uan_opaque[4]; +#endif +} uu_avl_node_t; + +typedef struct uu_avl_walk uu_avl_walk_t; + +typedef uintptr_t uu_avl_index_t; + +/* + * avl trees: interface + * + * basic usage: + * typedef struct foo { + * ... + * uu_avl_node_t foo_node; + * ... + * } foo_t; + * + * static int + * foo_compare(void *l_arg, void *r_arg, void *private) + * { + * foo_t *l = l_arg; + * foo_t *r = r_arg; + * + * if (... l greater than r ...) + * return (1); + * if (... l less than r ...) + * return (-1); + * return (0); + * } + * + * ... + * // at initialization time + * foo_pool = uu_avl_pool_create("foo_pool", + * sizeof (foo_t), offsetof(foo_t, foo_node), foo_compare, + * debugging? 0 : UU_AVL_POOL_DEBUG); + * ... + */ +uu_avl_pool_t *uu_avl_pool_create(const char *, size_t, size_t, + uu_compare_fn_t *, uint32_t); +#define UU_AVL_POOL_DEBUG 0x00000001 + +void uu_avl_pool_destroy(uu_avl_pool_t *); + +/* + * usage: + * + * foo_t *a; + * a = malloc(sizeof (*a)); + * uu_avl_node_init(a, &a->foo_avl, pool); + * ... 
+ * uu_avl_node_fini(a, &a->foo_avl, pool); + * free(a); + */ +void uu_avl_node_init(void *, uu_avl_node_t *, uu_avl_pool_t *); +void uu_avl_node_fini(void *, uu_avl_node_t *, uu_avl_pool_t *); + +uu_avl_t *uu_avl_create(uu_avl_pool_t *, void *_parent, uint32_t); +#define UU_AVL_DEBUG 0x00000001 + +void uu_avl_destroy(uu_avl_t *); /* list must be empty */ + +size_t uu_avl_numnodes(uu_avl_t *); + +void *uu_avl_first(uu_avl_t *); +void *uu_avl_last(uu_avl_t *); + +void *uu_avl_next(uu_avl_t *, void *); +void *uu_avl_prev(uu_avl_t *, void *); + +int uu_avl_walk(uu_avl_t *, uu_walk_fn_t *, void *, uint32_t); + +uu_avl_walk_t *uu_avl_walk_start(uu_avl_t *, uint32_t); +void *uu_avl_walk_next(uu_avl_walk_t *); +void uu_avl_walk_end(uu_avl_walk_t *); + +void *uu_avl_find(uu_avl_t *, void *, void *, uu_avl_index_t *); +void uu_avl_insert(uu_avl_t *, void *, uu_avl_index_t); + +void *uu_avl_nearest_next(uu_avl_t *, uu_avl_index_t); +void *uu_avl_nearest_prev(uu_avl_t *, uu_avl_index_t); + +void *uu_avl_teardown(uu_avl_t *, void **); + +void uu_avl_remove(uu_avl_t *, void *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBUUTIL_H */ diff --git a/include/libuutil_common.h b/include/libuutil_common.h new file mode 100644 index 000000000000..52ac4887f75c --- /dev/null +++ b/include/libuutil_common.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBUUTIL_COMMON_H +#define _LIBUUTIL_COMMON_H + + + +#include +#include + +#endif /* _LIBUUTIL_COMMON_H */ diff --git a/include/libuutil_impl.h b/include/libuutil_impl.h new file mode 100644 index 000000000000..f978b475eff8 --- /dev/null +++ b/include/libuutil_impl.h @@ -0,0 +1,181 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LIBUUTIL_IMPL_H +#define _LIBUUTIL_IMPL_H + + + +#include +#include + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void uu_set_error(uint_t); + + +/*PRINTFLIKE1*/ +void uu_panic(const char *format, ...); + + +struct uu_dprintf { + char *uud_name; + uu_dprintf_severity_t uud_severity; + uint_t uud_flags; +}; + +/* + * For debugging purposes, libuutil keeps around linked lists of all uu_lists + * and uu_avls, along with pointers to their parents. These can cause false + * negatives when looking for memory leaks, so we encode the pointers by + * storing them with swapped endianness; this is not perfect, but it's about + * the best we can do without wasting a lot of space. + */ +#ifdef _LP64 +#define UU_PTR_ENCODE(ptr) BSWAP_64((uintptr_t)(void *)(ptr)) +#else +#define UU_PTR_ENCODE(ptr) BSWAP_32((uintptr_t)(void *)(ptr)) +#endif + +#define UU_PTR_DECODE(ptr) ((void *)UU_PTR_ENCODE(ptr)) + +/* + * uu_list structures + */ +typedef struct uu_list_node_impl { + struct uu_list_node_impl *uln_next; + struct uu_list_node_impl *uln_prev; +} uu_list_node_impl_t; + +struct uu_list_walk { + uu_list_walk_t *ulw_next; + uu_list_walk_t *ulw_prev; + + uu_list_t *ulw_list; + int8_t ulw_dir; + uint8_t ulw_robust; + uu_list_node_impl_t *ulw_next_result; +}; + +struct uu_list { + uintptr_t ul_next_enc; + uintptr_t ul_prev_enc; + + uu_list_pool_t *ul_pool; + uintptr_t ul_parent_enc; /* encoded parent pointer */ + size_t ul_offset; + size_t ul_numnodes; + uint8_t ul_debug; + uint8_t ul_sorted; + uint8_t ul_index; /* mark for uu_list_index_ts */ + + uu_list_node_impl_t ul_null_node; + uu_list_walk_t ul_null_walk; /* for robust walkers */ +}; + +#define UU_LIST_PTR(ptr) ((uu_list_t *)UU_PTR_DECODE(ptr)) + +#define UU_LIST_POOL_MAXNAME 64 + +struct uu_list_pool { + uu_list_pool_t *ulp_next; + uu_list_pool_t *ulp_prev; + + char ulp_name[UU_LIST_POOL_MAXNAME]; + size_t ulp_nodeoffset; + size_t ulp_objsize; + uu_compare_fn_t *ulp_cmp; + uint8_t ulp_debug; + uint8_t ulp_last_index; + pthread_mutex_t ulp_lock; /* protects null_list */ + uu_list_t ulp_null_list; +}; + +/* + * uu_avl structures + */ +typedef struct avl_node uu_avl_node_impl_t; + +struct uu_avl_walk { + uu_avl_walk_t *uaw_next; + uu_avl_walk_t *uaw_prev; + + uu_avl_t *uaw_avl; + void *uaw_next_result; + int8_t uaw_dir; + uint8_t uaw_robust; +}; + +struct uu_avl { + uintptr_t ua_next_enc; + uintptr_t ua_prev_enc; + + uu_avl_pool_t *ua_pool; + uintptr_t ua_parent_enc; + uint8_t ua_debug; + uint8_t ua_index; /* mark for uu_avl_index_ts */ + + struct avl_tree ua_tree; + uu_avl_walk_t ua_null_walk; +}; + +#define UU_AVL_PTR(x) ((uu_avl_t *)UU_PTR_DECODE(x)) + +#define UU_AVL_POOL_MAXNAME 64 + +struct uu_avl_pool { + uu_avl_pool_t *uap_next; + uu_avl_pool_t *uap_prev; + + char uap_name[UU_AVL_POOL_MAXNAME]; + size_t uap_nodeoffset; + size_t uap_objsize; + uu_compare_fn_t *uap_cmp; + uint8_t uap_debug; + uint8_t uap_last_index; + pthread_mutex_t uap_lock; /* protects null_avl */ + uu_avl_t uap_null_avl; +}; + +/* + * atfork() handlers + */ +void uu_avl_lockup(void); +void uu_avl_release(void); + +void uu_list_lockup(void); +void uu_list_release(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBUUTIL_IMPL_H */ diff --git a/include/libzfs.h b/include/libzfs.h new file mode 100644 index 000000000000..4e6336180648 --- /dev/null +++ b/include/libzfs.h @@ -0,0 +1,920 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright Joyent, Inc. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2016, Intel Corporation. + * Copyright 2016 Nexenta Systems, Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. + */ + +#ifndef _LIBZFS_H +#define _LIBZFS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Miscellaneous ZFS constants + */ +#define ZFS_MAXPROPLEN MAXPATHLEN +#define ZPOOL_MAXPROPLEN MAXPATHLEN + +/* + * libzfs errors + */ +typedef enum zfs_error { + EZFS_SUCCESS = 0, /* no error -- success */ + EZFS_NOMEM = 2000, /* out of memory */ + EZFS_BADPROP, /* invalid property value */ + EZFS_PROPREADONLY, /* cannot set readonly property */ + EZFS_PROPTYPE, /* property does not apply to dataset type */ + EZFS_PROPNONINHERIT, /* property is not inheritable */ + EZFS_PROPSPACE, /* bad quota or reservation */ + EZFS_BADTYPE, /* dataset is not of appropriate type */ + EZFS_BUSY, /* pool or dataset is busy */ + EZFS_EXISTS, /* pool or dataset already exists */ + EZFS_NOENT, /* no such pool or dataset */ + EZFS_BADSTREAM, /* bad backup stream */ + EZFS_DSREADONLY, /* dataset is readonly */ + EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ + EZFS_INVALIDNAME, /* invalid dataset name */ + EZFS_BADRESTORE, /* unable to restore to destination */ + EZFS_BADBACKUP, /* backup failed */ + EZFS_BADTARGET, /* bad attach/detach/replace target */ + EZFS_NODEVICE, /* no such device in pool */ + EZFS_BADDEV, /* invalid device to add */ + EZFS_NOREPLICAS, /* no valid replicas */ + EZFS_RESILVERING, /* resilvering (healing reconstruction) */ + EZFS_BADVERSION, /* unsupported version */ + EZFS_POOLUNAVAIL, /* pool is currently unavailable */ + EZFS_DEVOVERFLOW, /* too many devices in one vdev */ + EZFS_BADPATH, /* must be an absolute path */ + EZFS_CROSSTARGET, /* rename or clone across pool or dataset */ + EZFS_ZONED, /* used improperly in local zone */ + EZFS_MOUNTFAILED, /* failed to mount dataset */ + EZFS_UMOUNTFAILED, /* failed to unmount dataset */ + EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ + EZFS_SHARENFSFAILED, /* share(1M) failed */ + EZFS_PERM, /* permission denied */ + EZFS_NOSPC, /* out of space */ + EZFS_FAULT, /* bad address */ + EZFS_IO, /* I/O error */ + EZFS_INTR, /* signal received */ + EZFS_ISSPARE, /* device is a hot spare */ + EZFS_INVALCONFIG, /* invalid vdev configuration */ + EZFS_RECURSIVE, /* recursive dependency */ + EZFS_NOHISTORY, /* no history object */ + EZFS_POOLPROPS, /* couldn't retrieve pool props */ + EZFS_POOL_NOTSUP, /* ops not supported for this type of pool */ + 
EZFS_POOL_INVALARG,	/* invalid argument for this pool operation */
+	EZFS_NAMETOOLONG,	/* dataset name is too long */
+	EZFS_OPENFAILED,	/* open of device failed */
+	EZFS_NOCAP,		/* couldn't get capacity */
+	EZFS_LABELFAILED,	/* write of label failed */
+	EZFS_BADWHO,		/* invalid permission who */
+	EZFS_BADPERM,		/* invalid permission */
+	EZFS_BADPERMSET,	/* invalid permission set name */
+	EZFS_NODELEGATION,	/* delegated administration is disabled */
+	EZFS_UNSHARESMBFAILED,	/* failed to unshare over smb */
+	EZFS_SHARESMBFAILED,	/* failed to share over smb */
+	EZFS_BADCACHE,		/* bad cache file */
+	EZFS_ISL2CACHE,		/* device is for the level 2 ARC */
+	EZFS_VDEVNOTSUP,	/* unsupported vdev type */
+	EZFS_NOTSUP,		/* ops not supported on this dataset */
+	EZFS_ACTIVE_SPARE,	/* pool has active shared spare devices */
+	EZFS_UNPLAYED_LOGS,	/* log device has unplayed logs */
+	EZFS_REFTAG_RELE,	/* snapshot release: tag not found */
+	EZFS_REFTAG_HOLD,	/* snapshot hold: tag already exists */
+	EZFS_TAGTOOLONG,	/* snapshot hold/rele: tag too long */
+	EZFS_PIPEFAILED,	/* pipe create failed */
+	EZFS_THREADCREATEFAILED, /* thread create failed */
+	EZFS_POSTSPLIT_ONLINE,	/* onlining a disk after splitting it */
+	EZFS_SCRUBBING,		/* currently scrubbing */
+	EZFS_NO_SCRUB,		/* no active scrub */
+	EZFS_DIFF,		/* general failure of zfs diff */
+	EZFS_DIFFDATA,		/* bad zfs diff data */
+	EZFS_POOLREADONLY,	/* pool is in read-only mode */
+	EZFS_SCRUB_PAUSED,	/* scrub currently paused */
+	EZFS_ACTIVE_POOL,	/* pool is imported on a different system */
+	EZFS_CRYPTOFAILED,	/* failed to setup encryption */
+	EZFS_NO_PENDING,	/* cannot cancel, no operation is pending */
+	EZFS_CHECKPOINT_EXISTS,	/* checkpoint exists */
+	EZFS_DISCARDING_CHECKPOINT,	/* currently discarding a checkpoint */
+	EZFS_NO_CHECKPOINT,	/* pool has no checkpoint */
+	EZFS_DEVRM_IN_PROGRESS,	/* a device is currently being removed */
+	EZFS_VDEV_TOO_BIG,	/* a device is too big to be used */
+	EZFS_IOC_NOTSUPPORTED,	/* operation not supported by zfs module */
+	EZFS_TOOMANY,		/* argument list too long */
+	EZFS_INITIALIZING,	/* currently initializing */
+	EZFS_NO_INITIALIZE,	/* no active initialize */
+	EZFS_WRONG_PARENT,	/* invalid parent dataset (e.g. ZVOL) */
+	EZFS_TRIMMING,		/* currently trimming */
+	EZFS_NO_TRIM,		/* no active trim */
+	EZFS_TRIM_NOTSUP,	/* device does not support trim */
+	EZFS_NO_RESILVER_DEFER,	/* pool doesn't support resilver_defer */
+	EZFS_EXPORT_IN_PROGRESS,	/* currently exporting the pool */
+	EZFS_REBUILDING,	/* resilvering (sequential reconstruction) */
+	EZFS_UNKNOWN
+} zfs_error_t;
+
+/*
+ * The following data structures are all part of the zfs_allow_t data
+ * structure, which is used for printing 'allow' permissions.
+ * It is a linked list of zfs_allow_t's, each of which contains AVL
+ * trees for users/groups/sets/..., and each entry in those trees in
+ * turn has AVL trees for the permissions it holds and for whether
+ * those permissions are local, descendent, or local+descendent.
+ * The AVL trees are used primarily for sorting purposes, but also
+ * so that we can quickly find a given user and/or permission.
+ */ +typedef struct zfs_perm_node { + avl_node_t z_node; + char z_pname[MAXPATHLEN]; +} zfs_perm_node_t; + +typedef struct zfs_allow_node { + avl_node_t z_node; + char z_key[MAXPATHLEN]; /* name, such as joe */ + avl_tree_t z_localdescend; /* local+descendent perms */ + avl_tree_t z_local; /* local permissions */ + avl_tree_t z_descend; /* descendent permissions */ +} zfs_allow_node_t; + +typedef struct zfs_allow { + struct zfs_allow *z_next; + char z_setpoint[MAXPATHLEN]; + avl_tree_t z_sets; + avl_tree_t z_crperms; + avl_tree_t z_user; + avl_tree_t z_group; + avl_tree_t z_everyone; +} zfs_allow_t; + +/* + * Basic handle types + */ +typedef struct zfs_handle zfs_handle_t; +typedef struct zpool_handle zpool_handle_t; +typedef struct libzfs_handle libzfs_handle_t; + +extern int zpool_wait(zpool_handle_t *, zpool_wait_activity_t); +extern int zpool_wait_status(zpool_handle_t *, zpool_wait_activity_t, + boolean_t *, boolean_t *); + +/* + * Library initialization + */ +extern libzfs_handle_t *libzfs_init(void); +extern void libzfs_fini(libzfs_handle_t *); + +extern libzfs_handle_t *zpool_get_handle(zpool_handle_t *); +extern libzfs_handle_t *zfs_get_handle(zfs_handle_t *); + +extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); + +extern void zfs_save_arguments(int argc, char **, char *, int); +extern int zpool_log_history(libzfs_handle_t *, const char *); + +extern int libzfs_errno(libzfs_handle_t *); +extern const char *libzfs_error_init(int); +extern const char *libzfs_error_action(libzfs_handle_t *); +extern const char *libzfs_error_description(libzfs_handle_t *); +extern int zfs_standard_error(libzfs_handle_t *, int, const char *); +extern void libzfs_mnttab_init(libzfs_handle_t *); +extern void libzfs_mnttab_fini(libzfs_handle_t *); +extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); +extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, + struct mnttab *); +extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, + const char *, const char *); +extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); + +/* + * Basic handle functions + */ +extern zpool_handle_t *zpool_open(libzfs_handle_t *, const char *); +extern zpool_handle_t *zpool_open_canfail(libzfs_handle_t *, const char *); +extern void zpool_close(zpool_handle_t *); +extern const char *zpool_get_name(zpool_handle_t *); +extern int zpool_get_state(zpool_handle_t *); +extern const char *zpool_state_to_name(vdev_state_t, vdev_aux_t); +extern const char *zpool_pool_state_to_name(pool_state_t); +extern void zpool_free_handles(libzfs_handle_t *); + +/* + * Iterate over all active pools in the system. 
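+ *
+ * A minimal callback sketch (illustrative only; the function name is
+ * invented here):
+ *
+ *	static int
+ *	show_pool(zpool_handle_t *zhp, void *unused)
+ *	{
+ *		(void) printf("%s\n", zpool_get_name(zhp));
+ *		return (0);	/* nonzero would abort the iteration */
+ *	}
+ *
+ * invoked for each imported pool via zpool_iter(hdl, show_pool, NULL).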
+ */ +typedef int (*zpool_iter_f)(zpool_handle_t *, void *); +extern int zpool_iter(libzfs_handle_t *, zpool_iter_f, void *); +extern boolean_t zpool_skip_pool(const char *); + +/* + * Functions to create and destroy pools + */ +extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, + nvlist_t *, nvlist_t *); +extern int zpool_destroy(zpool_handle_t *, const char *); +extern int zpool_add(zpool_handle_t *, nvlist_t *); + +typedef struct splitflags { + /* do not split, but return the config that would be split off */ + int dryrun : 1; + + /* after splitting, import the pool */ + int import : 1; + int name_flags; +} splitflags_t; + +typedef struct trimflags { + /* requested vdevs are for the entire pool */ + boolean_t fullpool; + + /* request a secure trim, requires support from device */ + boolean_t secure; + + /* after starting trim, block until trim completes */ + boolean_t wait; + + /* trim at the requested rate in bytes/second */ + uint64_t rate; +} trimflags_t; + +/* + * Functions to manipulate pool and vdev state + */ +extern int zpool_scan(zpool_handle_t *, pool_scan_func_t, pool_scrub_cmd_t); +extern int zpool_initialize(zpool_handle_t *, pool_initialize_func_t, + nvlist_t *); +extern int zpool_initialize_wait(zpool_handle_t *, pool_initialize_func_t, + nvlist_t *); +extern int zpool_trim(zpool_handle_t *, pool_trim_func_t, nvlist_t *, + trimflags_t *); + +extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); +extern int zpool_reguid(zpool_handle_t *); +extern int zpool_reopen_one(zpool_handle_t *, void *); + +extern int zpool_sync_one(zpool_handle_t *, void *); + +extern int zpool_vdev_online(zpool_handle_t *, const char *, int, + vdev_state_t *); +extern int zpool_vdev_offline(zpool_handle_t *, const char *, boolean_t); +extern int zpool_vdev_attach(zpool_handle_t *, const char *, + const char *, nvlist_t *, int, boolean_t); +extern int zpool_vdev_detach(zpool_handle_t *, const char *); +extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_remove_cancel(zpool_handle_t *); +extern int zpool_vdev_indirect_size(zpool_handle_t *, const char *, uint64_t *); +extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, + splitflags_t); + +extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); + +extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, + boolean_t *, boolean_t *); +extern nvlist_t *zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, + boolean_t *, boolean_t *, boolean_t *); +extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, const char *); +extern uint64_t zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path); + +const char *zpool_get_state_str(zpool_handle_t *); + +/* + * Functions to manage pool properties + */ +extern int zpool_set_prop(zpool_handle_t *, const char *, const char *); +extern int zpool_get_prop(zpool_handle_t *, zpool_prop_t, char *, + size_t proplen, zprop_source_t *, boolean_t literal); +extern uint64_t zpool_get_prop_int(zpool_handle_t *, zpool_prop_t, + zprop_source_t *); +extern int zpool_props_refresh(zpool_handle_t *); + +extern const char *zpool_prop_to_name(zpool_prop_t); +extern const char *zpool_prop_values(zpool_prop_t); + +/* + * Pool health statistics. + */ +typedef enum { + /* + * The following correspond to faults as defined in the (fault.fs.zfs.*) + * event namespace. 
Each is associated with a corresponding message ID. + * This must be kept in sync with the zfs_msgid_table in + * lib/libzfs/libzfs_status.c. + */ + ZPOOL_STATUS_CORRUPT_CACHE, /* corrupt /kernel/drv/zpool.cache */ + ZPOOL_STATUS_MISSING_DEV_R, /* missing device with replicas */ + ZPOOL_STATUS_MISSING_DEV_NR, /* missing device with no replicas */ + ZPOOL_STATUS_CORRUPT_LABEL_R, /* bad device label with replicas */ + ZPOOL_STATUS_CORRUPT_LABEL_NR, /* bad device label with no replicas */ + ZPOOL_STATUS_BAD_GUID_SUM, /* sum of device guids didn't match */ + ZPOOL_STATUS_CORRUPT_POOL, /* pool metadata is corrupted */ + ZPOOL_STATUS_CORRUPT_DATA, /* data errors in user (meta)data */ + ZPOOL_STATUS_FAILING_DEV, /* device experiencing errors */ + ZPOOL_STATUS_VERSION_NEWER, /* newer on-disk version */ + ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ + ZPOOL_STATUS_HOSTID_ACTIVE, /* currently active on another system */ + ZPOOL_STATUS_HOSTID_REQUIRED, /* multihost=on and hostid=0 */ + ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ + ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ + ZPOOL_STATUS_IO_FAILURE_MMP, /* failed MMP, failmode not 'panic' */ + ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ + ZPOOL_STATUS_ERRATA, /* informational errata available */ + + /* + * If the pool has unsupported features but can still be opened in + * read-only mode, its status is ZPOOL_STATUS_UNSUP_FEAT_WRITE. If the + * pool has unsupported features but cannot be opened at all, its + * status is ZPOOL_STATUS_UNSUP_FEAT_READ. + */ + ZPOOL_STATUS_UNSUP_FEAT_READ, /* unsupported features for read */ + ZPOOL_STATUS_UNSUP_FEAT_WRITE, /* unsupported features for write */ + + /* + * These faults have no corresponding message ID. At the time we are + * checking the status, the original reason for the FMA fault (I/O or + * checksum errors) has been lost. + */ + ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ + ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ + + /* + * The following are not faults per se, but still an error possibly + * requiring administrative attention. There is no corresponding + * message ID. + */ + ZPOOL_STATUS_VERSION_OLDER, /* older legacy on-disk version */ + ZPOOL_STATUS_FEAT_DISABLED, /* supported features are disabled */ + ZPOOL_STATUS_RESILVERING, /* device being resilvered */ + ZPOOL_STATUS_OFFLINE_DEV, /* device offline */ + ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_REBUILDING, /* device being rebuilt */ + ZPOOL_STATUS_REBUILD_SCRUB, /* recommend scrubbing the pool */ + ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ + + /* + * Finally, the following indicates a healthy pool. + */ + ZPOOL_STATUS_OK +} zpool_status_t; + +extern zpool_status_t zpool_get_status(zpool_handle_t *, char **, + zpool_errata_t *); +extern zpool_status_t zpool_import_status(nvlist_t *, char **, + zpool_errata_t *); + +/* + * Statistics and configuration functions. 
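+ *
+ * Typical use (an illustrative sketch; the boolean_t out-parameter is
+ * set when the pool has disappeared):
+ *
+ *	boolean_t missing;
+ *	if (zpool_refresh_stats(zhp, &missing) == 0 && !missing) {
+ *		nvlist_t *config = zpool_get_config(zhp, NULL);
+ *		...
+ *	}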
+ */ +extern nvlist_t *zpool_get_config(zpool_handle_t *, nvlist_t **); +extern nvlist_t *zpool_get_features(zpool_handle_t *); +extern int zpool_refresh_stats(zpool_handle_t *, boolean_t *); +extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); + +/* + * Import and export functions + */ +extern int zpool_export(zpool_handle_t *, boolean_t, const char *); +extern int zpool_export_force(zpool_handle_t *, const char *); +extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, + char *altroot); +extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, + nvlist_t *, int); +extern void zpool_print_unsup_feat(nvlist_t *config); + +/* + * Miscellaneous pool functions + */ +struct zfs_cmd; + +extern const char *zfs_history_event_names[]; + +typedef enum { + VDEV_NAME_PATH = 1 << 0, + VDEV_NAME_GUID = 1 << 1, + VDEV_NAME_FOLLOW_LINKS = 1 << 2, + VDEV_NAME_TYPE_ID = 1 << 3, +} vdev_name_t; + +extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, + int name_flags); +extern int zpool_upgrade(zpool_handle_t *, uint64_t); +extern int zpool_get_history(zpool_handle_t *, nvlist_t **, uint64_t *, + boolean_t *); +extern int zpool_events_next(libzfs_handle_t *, nvlist_t **, int *, unsigned, + int); +extern int zpool_events_clear(libzfs_handle_t *, int *); +extern int zpool_events_seek(libzfs_handle_t *, uint64_t, int); +extern void zpool_obj_to_path_ds(zpool_handle_t *, uint64_t, uint64_t, char *, + size_t); +extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, + size_t); +extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); +extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); +extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, + nvlist_t *); +extern int zpool_checkpoint(zpool_handle_t *); +extern int zpool_discard_checkpoint(zpool_handle_t *); + +/* + * Basic handle manipulations. These functions do not create or destroy the + * underlying datasets, only the references to them. + */ +extern zfs_handle_t *zfs_open(libzfs_handle_t *, const char *, int); +extern zfs_handle_t *zfs_handle_dup(zfs_handle_t *); +extern void zfs_close(zfs_handle_t *); +extern zfs_type_t zfs_get_type(const zfs_handle_t *); +extern const char *zfs_get_name(const zfs_handle_t *); +extern zpool_handle_t *zfs_get_pool_handle(const zfs_handle_t *); +extern const char *zfs_get_pool_name(const zfs_handle_t *); + +/* + * Property management functions. Some functions are shared with the kernel, + * and are found in sys/fs/zfs.h. 
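+ *
+ * As a brief sketch (illustrative only), reading a dataset property
+ * into a string buffer looks like:
+ *
+ *	char buf[ZFS_MAXPROPLEN];
+ *	if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, buf, sizeof (buf),
+ *	    NULL, NULL, 0, B_FALSE) == 0)
+ *		(void) printf("mountpoint: %s\n", buf);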
+ */ + +/* + * zfs dataset property management + */ +extern const char *zfs_prop_default_string(zfs_prop_t); +extern uint64_t zfs_prop_default_numeric(zfs_prop_t); +extern const char *zfs_prop_column_name(zfs_prop_t); +extern boolean_t zfs_prop_align_right(zfs_prop_t); + +extern nvlist_t *zfs_valid_proplist(libzfs_handle_t *, zfs_type_t, nvlist_t *, + uint64_t, zfs_handle_t *, zpool_handle_t *, boolean_t, const char *); + +extern const char *zfs_prop_to_name(zfs_prop_t); +extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); +extern int zfs_prop_set_list(zfs_handle_t *, nvlist_t *); +extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, + zprop_source_t *, char *, size_t, boolean_t); +extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, + boolean_t); +extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, + zprop_source_t *, char *, size_t); +extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); +extern int zfs_prop_get_feature(zfs_handle_t *zhp, const char *propname, + char *buf, size_t len); +extern uint64_t getprop_uint64(zfs_handle_t *, zfs_prop_t, char **); +extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); +extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); +extern const char *zfs_prop_values(zfs_prop_t); +extern int zfs_prop_is_string(zfs_prop_t prop); +extern nvlist_t *zfs_get_all_props(zfs_handle_t *); +extern nvlist_t *zfs_get_user_props(zfs_handle_t *); +extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); +extern nvlist_t *zfs_get_clones_nvl(zfs_handle_t *); + +extern int zfs_wait_status(zfs_handle_t *, zfs_wait_activity_t, + boolean_t *, boolean_t *); + +/* + * zfs encryption management + */ +extern int zfs_crypto_get_encryption_root(zfs_handle_t *, boolean_t *, char *); +extern int zfs_crypto_create(libzfs_handle_t *, char *, nvlist_t *, nvlist_t *, + boolean_t stdin_available, uint8_t **, uint_t *); +extern int zfs_crypto_clone_check(libzfs_handle_t *, zfs_handle_t *, char *, + nvlist_t *); +extern int zfs_crypto_attempt_load_keys(libzfs_handle_t *, char *); +extern int zfs_crypto_load_key(zfs_handle_t *, boolean_t, char *); +extern int zfs_crypto_unload_key(zfs_handle_t *); +extern int zfs_crypto_rewrap(zfs_handle_t *, nvlist_t *, boolean_t); + +typedef struct zprop_list { + int pl_prop; + char *pl_user_prop; + struct zprop_list *pl_next; + boolean_t pl_all; + size_t pl_width; + size_t pl_recvd_width; + boolean_t pl_fixed; +} zprop_list_t; + +extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t, + boolean_t); +extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); + +#define ZFS_MOUNTPOINT_NONE "none" +#define ZFS_MOUNTPOINT_LEGACY "legacy" + +#define ZFS_FEATURE_DISABLED "disabled" +#define ZFS_FEATURE_ENABLED "enabled" +#define ZFS_FEATURE_ACTIVE "active" + +#define ZFS_UNSUPPORTED_INACTIVE "inactive" +#define ZFS_UNSUPPORTED_READONLY "readonly" + +/* + * zpool property management + */ +extern int zpool_expand_proplist(zpool_handle_t *, zprop_list_t **); +extern int zpool_prop_get_feature(zpool_handle_t *, const char *, char *, + size_t); +extern const 
char *zpool_prop_default_string(zpool_prop_t); +extern uint64_t zpool_prop_default_numeric(zpool_prop_t); +extern const char *zpool_prop_column_name(zpool_prop_t); +extern boolean_t zpool_prop_align_right(zpool_prop_t); + +/* + * Functions shared by zfs and zpool property management. + */ +extern int zprop_iter(zprop_func func, void *cb, boolean_t show_all, + boolean_t ordered, zfs_type_t type); +extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, + zfs_type_t); +extern void zprop_free_list(zprop_list_t *); + +#define ZFS_GET_NCOLS 5 + +typedef enum { + GET_COL_NONE, + GET_COL_NAME, + GET_COL_PROPERTY, + GET_COL_VALUE, + GET_COL_RECVD, + GET_COL_SOURCE +} zfs_get_column_t; + +/* + * Functions for printing zfs or zpool properties + */ +typedef struct zprop_get_cbdata { + int cb_sources; + zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; + int cb_colwidths[ZFS_GET_NCOLS + 1]; + boolean_t cb_scripted; + boolean_t cb_literal; + boolean_t cb_first; + zprop_list_t *cb_proplist; + zfs_type_t cb_type; +} zprop_get_cbdata_t; + +void zprop_print_one_property(const char *, zprop_get_cbdata_t *, + const char *, const char *, zprop_source_t, const char *, + const char *); + +/* + * Iterator functions. + */ +typedef int (*zfs_iter_f)(zfs_handle_t *, void *); +extern int zfs_iter_root(libzfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); +extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapshots(zfs_handle_t *, boolean_t, zfs_iter_f, void *, + uint64_t, uint64_t); +extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *, + uint64_t, uint64_t); +extern int zfs_iter_snapspec(zfs_handle_t *, const char *, zfs_iter_f, void *); +extern int zfs_iter_bookmarks(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_mounted(zfs_handle_t *, zfs_iter_f, void *); + +typedef struct get_all_cb { + zfs_handle_t **cb_handles; + size_t cb_alloc; + size_t cb_used; +} get_all_cb_t; + +void zfs_foreach_mountpoint(libzfs_handle_t *, zfs_handle_t **, size_t, + zfs_iter_f, void *, boolean_t); +void libzfs_add_handle(get_all_cb_t *, zfs_handle_t *); + +/* + * Functions to create and destroy datasets. + */ +extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, + nvlist_t *); +extern int zfs_create_ancestors(libzfs_handle_t *, const char *); +extern int zfs_destroy(zfs_handle_t *, boolean_t); +extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); +extern int zfs_destroy_snaps_nvl(libzfs_handle_t *, nvlist_t *, boolean_t); +extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); +extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); +extern int zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, + nvlist_t *props); +extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); +extern int zfs_rename(zfs_handle_t *, const char *, boolean_t, boolean_t); + +typedef struct sendflags { + /* Amount of extra information to print. */ + int verbosity; + + /* recursive send (ie, -R) */ + boolean_t replicate; + + /* for incrementals, do all intermediate snapshots */ + boolean_t doall; + + /* if dataset is a clone, do incremental from its origin */ + boolean_t fromorigin; + + /* field no longer used, maintained for backwards compatibility */ + boolean_t pad; + + /* send properties (ie, -p) */ + boolean_t props; + + /* do not send (no-op, ie. 
-n) */ + boolean_t dryrun; + + /* parsable verbose output (ie. -P) */ + boolean_t parsable; + + /* show progress (ie. -v) */ + boolean_t progress; + + /* large blocks (>128K) are permitted */ + boolean_t largeblock; + + /* WRITE_EMBEDDED records of type DATA are permitted */ + boolean_t embed_data; + + /* compressed WRITE records are permitted */ + boolean_t compress; + + /* raw encrypted records are permitted */ + boolean_t raw; + + /* only send received properties (ie. -b) */ + boolean_t backup; + + /* include snapshot holds in send stream */ + boolean_t holds; + + /* stream represents a partially received dataset */ + boolean_t saved; +} sendflags_t; + +typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); + +extern int zfs_send(zfs_handle_t *, const char *, const char *, + sendflags_t *, int, snapfilter_cb_t, void *, nvlist_t **); +extern int zfs_send_one(zfs_handle_t *, const char *, int, sendflags_t *, + const char *); +extern int zfs_send_progress(zfs_handle_t *, int, uint64_t *, uint64_t *); +extern int zfs_send_resume(libzfs_handle_t *, sendflags_t *, int outfd, + const char *); +extern int zfs_send_saved(zfs_handle_t *, sendflags_t *, int, const char *); +extern nvlist_t *zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, + const char *token); + +extern int zfs_promote(zfs_handle_t *); +extern int zfs_hold(zfs_handle_t *, const char *, const char *, + boolean_t, int); +extern int zfs_hold_nvl(zfs_handle_t *, int, nvlist_t *); +extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); +extern int zfs_get_holds(zfs_handle_t *, nvlist_t **); +extern uint64_t zvol_volsize_to_reservation(zpool_handle_t *, uint64_t, + nvlist_t *); + +typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, + uid_t rid, uint64_t space); + +extern int zfs_userspace(zfs_handle_t *, zfs_userquota_prop_t, + zfs_userspace_cb_t, void *); + +extern int zfs_get_fsacl(zfs_handle_t *, nvlist_t **); +extern int zfs_set_fsacl(zfs_handle_t *, boolean_t, nvlist_t *); + +typedef struct recvflags { + /* print informational messages (ie, -v was specified) */ + boolean_t verbose; + + /* the destination is a prefix, not the exact fs (ie, -d) */ + boolean_t isprefix; + + /* + * Only the tail of the sent snapshot path is appended to the + * destination to determine the received snapshot name (ie, -e). + */ + boolean_t istail; + + /* do not actually do the recv, just check if it would work (ie, -n) */ + boolean_t dryrun; + + /* rollback/destroy filesystems as necessary (eg, -F) */ + boolean_t force; + + /* set "canmount=off" on all modified filesystems */ + boolean_t canmountoff; + + /* + * Mark the file systems as "resumable" and do not destroy them if the + * receive is interrupted + */ + boolean_t resumable; + + /* byteswap flag is used internally; callers need not specify */ + boolean_t byteswap; + + /* do not mount file systems as they are extracted (private) */ + boolean_t nomount; + + /* Was holds flag set in the compound header? 
*/ + boolean_t holds; + + /* skip receive of snapshot holds */ + boolean_t skipholds; + + /* mount the filesystem unless nomount is specified */ + boolean_t domount; + + /* force unmount while recv snapshot (private) */ + boolean_t forceunmount; +} recvflags_t; + +extern int zfs_receive(libzfs_handle_t *, const char *, nvlist_t *, + recvflags_t *, int, avl_tree_t *); + +typedef enum diff_flags { + ZFS_DIFF_PARSEABLE = 0x1, + ZFS_DIFF_TIMESTAMP = 0x2, + ZFS_DIFF_CLASSIFY = 0x4 +} diff_flags_t; + +extern int zfs_show_diffs(zfs_handle_t *, int, const char *, const char *, + int); + +/* + * Miscellaneous functions. + */ +extern const char *zfs_type_to_name(zfs_type_t); +extern void zfs_refresh_properties(zfs_handle_t *); +extern int zfs_name_valid(const char *, zfs_type_t); +extern zfs_handle_t *zfs_path_to_zhandle(libzfs_handle_t *, const char *, + zfs_type_t); +extern int zfs_parent_name(zfs_handle_t *, char *, size_t); +extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, + zfs_type_t); +extern int zfs_spa_version(zfs_handle_t *, int *); +extern boolean_t zfs_bookmark_exists(const char *path); + +/* + * Mount support functions. + */ +extern boolean_t is_mounted(libzfs_handle_t *, const char *special, char **); +extern boolean_t zfs_is_mounted(zfs_handle_t *, char **); +extern int zfs_mount(zfs_handle_t *, const char *, int); +extern int zfs_mount_at(zfs_handle_t *, const char *, int, const char *); +extern int zfs_unmount(zfs_handle_t *, const char *, int); +extern int zfs_unmountall(zfs_handle_t *, int); + +#if defined(__linux__) +extern int zfs_parse_mount_options(char *mntopts, unsigned long *mntflags, + unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt); +extern void zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint, + char *mntopts, char *mtabopt); +#endif + +/* + * Share support functions. + */ +extern boolean_t zfs_is_shared(zfs_handle_t *); +extern int zfs_share(zfs_handle_t *); +extern int zfs_unshare(zfs_handle_t *); + +/* + * Protocol-specific share support functions. + */ +extern boolean_t zfs_is_shared_nfs(zfs_handle_t *, char **); +extern boolean_t zfs_is_shared_smb(zfs_handle_t *, char **); +extern int zfs_share_nfs(zfs_handle_t *); +extern int zfs_share_smb(zfs_handle_t *); +extern int zfs_shareall(zfs_handle_t *); +extern int zfs_unshare_nfs(zfs_handle_t *, const char *); +extern int zfs_unshare_smb(zfs_handle_t *, const char *); +extern int zfs_unshareall_nfs(zfs_handle_t *); +extern int zfs_unshareall_smb(zfs_handle_t *); +extern int zfs_unshareall_bypath(zfs_handle_t *, const char *); +extern int zfs_unshareall_bytype(zfs_handle_t *, const char *, const char *); +extern int zfs_unshareall(zfs_handle_t *); +extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, + void *, void *, int, zfs_share_op_t); +extern void zfs_commit_nfs_shares(void); +extern void zfs_commit_smb_shares(void); +extern void zfs_commit_all_shares(void); +extern void zfs_commit_shares(const char *); + +extern int zfs_nicestrtonum(libzfs_handle_t *, const char *, uint64_t *); + +/* + * Utility functions to run an external process. 
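+ *
+ * Sketch (illustrative): run a command and discard its output; pass the
+ * STDOUT_VERBOSE/STDERR_VERBOSE flags below to see it instead:
+ *
+ *	char *argv[] = { "/bin/true", NULL };
+ *	int rc = libzfs_run_process(argv[0], argv, 0);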
+ */ +#define STDOUT_VERBOSE 0x01 +#define STDERR_VERBOSE 0x02 +#define NO_DEFAULT_PATH 0x04 /* Don't use $PATH to lookup the command */ + +int libzfs_run_process(const char *, char **, int); +int libzfs_run_process_get_stdout(const char *, char *[], char *[], + char **[], int *); +int libzfs_run_process_get_stdout_nopath(const char *, char *[], char *[], + char **[], int *); + +void libzfs_free_str_array(char **, int); + +int libzfs_envvar_is_set(char *); + +/* + * Utility functions for zfs version + */ +extern void zfs_version_userland(char *, int); +extern int zfs_version_kernel(char *, int); +extern int zfs_version_print(void); + +/* + * Given a device or file, determine if it is part of a pool. + */ +extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, + boolean_t *); + +/* + * Label manipulation. + */ +extern int zpool_clear_label(int); +extern int zpool_set_bootenv(zpool_handle_t *, const char *); +extern int zpool_get_bootenv(zpool_handle_t *, char *, size_t, off_t); + +/* + * Management interfaces for SMB ACL files + */ + +int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); +int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); + +/* + * Enable and disable datasets within a pool by mounting/unmounting and + * sharing/unsharing them. + */ +extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); +extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); + +#ifdef __FreeBSD__ + +/* + * Attach/detach the given filesystem to/from the given jail. + */ +extern int zfs_jail(zfs_handle_t *zhp, int jailid, int attach); + +/* + * Set loader options for next boot. + */ +extern int zpool_nextboot(libzfs_handle_t *, uint64_t, uint64_t, const char *); + +#endif /* __FreeBSD__ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_H */ diff --git a/include/libzfs_core.h b/include/libzfs_core.h new file mode 100644 index 000000000000..e69fe32cd0a1 --- /dev/null +++ b/include/libzfs_core.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright 2017 RackTop Systems. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + */ + +#ifndef _LIBZFS_CORE_H +#define _LIBZFS_CORE_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +int libzfs_core_init(void); +void libzfs_core_fini(void); + +/* + * NB: this type should be kept binary compatible with dmu_objset_type_t. 
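
The libzfs_run_process_get_stdout() helper declared above hands back the child's stdout as an array of strings that must be released with libzfs_free_str_array(). A hedged usage sketch follows; the command, path, and function name are arbitrary examples, not anything shipped in this import:

#include <libzfs.h>
#include <stdio.h>

static void
print_kernel_release(void)
{
        char *argv[] = { "uname", "-r", NULL };
        char *env[] = { NULL };
        char **lines = NULL;
        int nlines = 0;

        if (libzfs_run_process_get_stdout("/usr/bin/uname", argv, env,
            &lines, &nlines) == 0) {
                for (int i = 0; i < nlines; i++)
                        printf("%s\n", lines[i]);
                /* the helper allocated the array; release it here */
                libzfs_free_str_array(lines, nlines);
        }
}
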
+ */ +enum lzc_dataset_type { + LZC_DATSET_TYPE_ZFS = 2, + LZC_DATSET_TYPE_ZVOL +}; + +int lzc_snapshot(nvlist_t *, nvlist_t *, nvlist_t **); +int lzc_create(const char *, enum lzc_dataset_type, nvlist_t *, uint8_t *, + uint_t); +int lzc_clone(const char *, const char *, nvlist_t *); +int lzc_promote(const char *, char *, int); +int lzc_destroy_snaps(nvlist_t *, boolean_t, nvlist_t **); +int lzc_bookmark(nvlist_t *, nvlist_t **); +int lzc_get_bookmarks(const char *, nvlist_t *, nvlist_t **); +int lzc_get_bookmark_props(const char *, nvlist_t **); +int lzc_destroy_bookmarks(nvlist_t *, nvlist_t **); +int lzc_load_key(const char *, boolean_t, uint8_t *, uint_t); +int lzc_unload_key(const char *); +int lzc_change_key(const char *, uint64_t, nvlist_t *, uint8_t *, uint_t); +int lzc_initialize(const char *, pool_initialize_func_t, nvlist_t *, + nvlist_t **); +int lzc_trim(const char *, pool_trim_func_t, uint64_t, boolean_t, + nvlist_t *, nvlist_t **); +int lzc_redact(const char *, const char *, nvlist_t *); + +int lzc_snaprange_space(const char *, const char *, uint64_t *); + +int lzc_hold(nvlist_t *, int, nvlist_t **); +int lzc_release(nvlist_t *, nvlist_t **); +int lzc_get_holds(const char *, nvlist_t **); + +enum lzc_send_flags { + LZC_SEND_FLAG_EMBED_DATA = 1 << 0, + LZC_SEND_FLAG_LARGE_BLOCK = 1 << 1, + LZC_SEND_FLAG_COMPRESS = 1 << 2, + LZC_SEND_FLAG_RAW = 1 << 3, + LZC_SEND_FLAG_SAVED = 1 << 4, +}; + +int lzc_send(const char *, const char *, int, enum lzc_send_flags); +int lzc_send_resume(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t); +int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); + +struct dmu_replay_record; + +int lzc_send_redacted(const char *, const char *, int, enum lzc_send_flags, + const char *); +int lzc_send_resume_redacted(const char *, const char *, int, + enum lzc_send_flags, uint64_t, uint64_t, const char *); +int lzc_receive(const char *, nvlist_t *, const char *, boolean_t, boolean_t, + int); +int lzc_receive_resumable(const char *, nvlist_t *, const char *, boolean_t, + boolean_t, int); +int lzc_receive_with_header(const char *, nvlist_t *, const char *, boolean_t, + boolean_t, boolean_t, int, const struct dmu_replay_record *); +int lzc_receive_one(const char *, nvlist_t *, const char *, boolean_t, + boolean_t, boolean_t, int, const struct dmu_replay_record *, int, + uint64_t *, uint64_t *, uint64_t *, nvlist_t **); +int lzc_receive_with_cmdprops(const char *, nvlist_t *, nvlist_t *, + uint8_t *, uint_t, const char *, boolean_t, boolean_t, boolean_t, int, + const struct dmu_replay_record *, int, uint64_t *, uint64_t *, + uint64_t *, nvlist_t **); +int lzc_send_space(const char *, const char *, enum lzc_send_flags, uint64_t *); +int lzc_send_space_resume_redacted(const char *, const char *, + enum lzc_send_flags, uint64_t, uint64_t, uint64_t, const char *, + int, uint64_t *); +uint64_t lzc_send_progress(int); + +boolean_t lzc_exists(const char *); + +int lzc_rollback(const char *, char *, int); +int lzc_rollback_to(const char *, const char *); + +int lzc_rename(const char *, const char *); +int lzc_destroy(const char *); + +int lzc_channel_program(const char *, const char *, uint64_t, + uint64_t, nvlist_t *, nvlist_t **); +int lzc_channel_program_nosync(const char *, const char *, uint64_t, + uint64_t, nvlist_t *, nvlist_t **); + +int lzc_sync(const char *, nvlist_t *, nvlist_t **); +int lzc_reopen(const char *, boolean_t); + +int lzc_pool_checkpoint(const char *); +int lzc_pool_checkpoint_discard(const char 
*); + +int lzc_wait(const char *, zpool_wait_activity_t, boolean_t *); +int lzc_wait_tag(const char *, zpool_wait_activity_t, uint64_t, boolean_t *); +int lzc_wait_fs(const char *, zfs_wait_activity_t, boolean_t *); + +int lzc_set_bootenv(const char *, const char *); +int lzc_get_bootenv(const char *, nvlist_t **); +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_CORE_H */ diff --git a/include/libzfs_impl.h b/include/libzfs_impl.h new file mode 100644 index 000000000000..0f111679764f --- /dev/null +++ b/include/libzfs_impl.h @@ -0,0 +1,261 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2018 Datto Inc. + * Copyright 2020 Joyent, Inc. + */ + +#ifndef _LIBZFS_IMPL_H +#define _LIBZFS_IMPL_H + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct libzfs_handle { + int libzfs_error; + int libzfs_fd; + FILE *libzfs_mnttab; + zpool_handle_t *libzfs_pool_handles; + uu_avl_pool_t *libzfs_ns_avlpool; + uu_avl_t *libzfs_ns_avl; + uint64_t libzfs_ns_gen; + int libzfs_desc_active; + char libzfs_action[1024]; + char libzfs_desc[1024]; + int libzfs_printerr; + int libzfs_storeerr; /* stuff error messages into buffer */ + boolean_t libzfs_mnttab_enable; + /* + * We need a lock to handle the case where parallel mount + * threads are populating the mnttab cache simultaneously. The + * lock only protects the integrity of the avl tree, and does + * not protect the contents of the mnttab entries themselves. + */ + pthread_mutex_t libzfs_mnttab_cache_lock; + avl_tree_t libzfs_mnttab_cache; + int libzfs_pool_iter; + char libzfs_chassis_id[256]; + boolean_t libzfs_prop_debug; + regex_t libzfs_urire; +}; + +struct zfs_handle { + libzfs_handle_t *zfs_hdl; + zpool_handle_t *zpool_hdl; + char zfs_name[ZFS_MAX_DATASET_NAME_LEN]; + zfs_type_t zfs_type; /* type including snapshot */ + zfs_type_t zfs_head_type; /* type excluding snapshot */ + dmu_objset_stats_t zfs_dmustats; + nvlist_t *zfs_props; + nvlist_t *zfs_user_props; + nvlist_t *zfs_recvd_props; + boolean_t zfs_mntcheck; + char *zfs_mntopts; + uint8_t *zfs_props_table; +}; + +/* + * This is different from checking zfs_type, because it will also catch + * snapshots of volumes. 
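
The lzc_* entry points above are thin, nvlist-driven wrappers around the ZFS ioctl interface: requests are batched into one nvlist and per-name errors come back in another. A minimal sketch, assuming invented pool and snapshot names:

#include <libzfs_core.h>
#include <libnvpair.h>

static int
snapshot_pair(void)
{
        nvlist_t *snaps, *errlist = NULL;
        int err;

        if (libzfs_core_init() != 0)
                return (-1);

        /* each snapshot to create is a boolean-valued nvlist name */
        snaps = fnvlist_alloc();
        fnvlist_add_boolean(snaps, "tank/a@backup");
        fnvlist_add_boolean(snaps, "tank/b@backup");

        /* NULL props; per-snapshot errors land in errlist on failure */
        err = lzc_snapshot(snaps, NULL, &errlist);

        fnvlist_free(snaps);
        if (errlist != NULL)
                fnvlist_free(errlist);
        libzfs_core_fini();
        return (err);
}
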
+ */ +#define ZFS_IS_VOLUME(zhp) ((zhp)->zfs_head_type == ZFS_TYPE_VOLUME) + +struct zpool_handle { + libzfs_handle_t *zpool_hdl; + zpool_handle_t *zpool_next; + char zpool_name[ZFS_MAX_DATASET_NAME_LEN]; + int zpool_state; + size_t zpool_config_size; + nvlist_t *zpool_config; + nvlist_t *zpool_old_config; + nvlist_t *zpool_props; + diskaddr_t zpool_start_block; +}; + +typedef enum { + PROTO_NFS = 0, + PROTO_SMB = 1, + PROTO_END = 2 +} zfs_share_proto_t; + +/* + * The following can be used as a bitmask and any new values + * added must preserve that capability. + */ +typedef enum { + SHARED_NOT_SHARED = 0x0, + SHARED_NFS = 0x2, + SHARED_SMB = 0x4 +} zfs_share_type_t; + +typedef int (*zfs_uri_handler_fn_t)(struct libzfs_handle *, const char *, + const char *, zfs_keyformat_t, boolean_t, uint8_t **, size_t *); + +typedef struct zfs_uri_handler { + const char *zuh_scheme; + zfs_uri_handler_fn_t zuh_handler; +} zfs_uri_handler_t; + +#define CONFIG_BUF_MINSIZE 262144 + +int zfs_error(libzfs_handle_t *, int, const char *); +int zfs_error_fmt(libzfs_handle_t *, int, const char *, ...); +void zfs_error_aux(libzfs_handle_t *, const char *, ...); +void *zfs_alloc(libzfs_handle_t *, size_t); +void *zfs_realloc(libzfs_handle_t *, void *, size_t, size_t); +char *zfs_asprintf(libzfs_handle_t *, const char *, ...); +char *zfs_strdup(libzfs_handle_t *, const char *); +int no_memory(libzfs_handle_t *); + +int zfs_standard_error(libzfs_handle_t *, int, const char *); +int zfs_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); +void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); +int zpool_standard_error(libzfs_handle_t *, int, const char *); +int zpool_standard_error_fmt(libzfs_handle_t *, int, const char *, ...); + +zfs_handle_t *make_dataset_handle_zc(libzfs_handle_t *, zfs_cmd_t *); +zfs_handle_t *make_dataset_simple_handle_zc(zfs_handle_t *, zfs_cmd_t *); + +int zprop_parse_value(libzfs_handle_t *, nvpair_t *, int, zfs_type_t, + nvlist_t *, char **, uint64_t *, const char *); +int zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, + zfs_type_t type); + +/* + * Use this changelist_gather() flag to force attempting mounts + * on each change node regardless of whether or not it is currently + * mounted. 
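
changelist_gather() and its companions, declared further below, implement the unmount/remount bracketing that mount-affecting property changes require. A hedged sketch of the internal calling pattern, assuming gather flags of 0/0 cover the simple case (the function name is invented):

static int
set_mountpoint_bracketed(zfs_handle_t *zhp)
{
        prop_changelist_t *clp;
        int err;

        clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT, 0, 0);
        if (clp == NULL)
                return (-1);

        err = changelist_prefix(clp);   /* unmount/unshare affected datasets */
        if (err == 0) {
                /* ... perform the actual property update here ... */
                err = changelist_postfix(clp);  /* remount/reshare */
        }
        changelist_free(clp);
        return (err);
}
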
+ */ +#define CL_GATHER_MOUNT_ALWAYS 1 +/* + * changelist_gather() flag to force it to iterate on mounted datasets only + */ +#define CL_GATHER_ITER_MOUNTED 2 + +typedef struct prop_changelist prop_changelist_t; + +int zcmd_alloc_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, size_t); +int zcmd_write_src_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); +int zcmd_write_conf_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t *); +int zcmd_expand_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *); +int zcmd_read_dst_nvlist(libzfs_handle_t *, zfs_cmd_t *, nvlist_t **); +void zcmd_free_nvlists(zfs_cmd_t *); + +int changelist_prefix(prop_changelist_t *); +int changelist_postfix(prop_changelist_t *); +void changelist_rename(prop_changelist_t *, const char *, const char *); +void changelist_remove(prop_changelist_t *, const char *); +void changelist_free(prop_changelist_t *); +prop_changelist_t *changelist_gather(zfs_handle_t *, zfs_prop_t, int, int); +int changelist_unshare(prop_changelist_t *, zfs_share_proto_t *); +int changelist_haszonedchild(prop_changelist_t *); + +void remove_mountpoint(zfs_handle_t *); +int create_parents(libzfs_handle_t *, char *, int); +boolean_t isa_child_of(const char *dataset, const char *parent); + +zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); +zfs_handle_t *make_bookmark_handle(zfs_handle_t *, const char *, + nvlist_t *props); + +int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); + +boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); + +int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying); + +void namespace_clear(libzfs_handle_t *); + +extern int zfs_parse_options(char *, zfs_share_proto_t); + +extern int zfs_unshare_proto(zfs_handle_t *, + const char *, zfs_share_proto_t *); + +typedef struct { + zfs_prop_t p_prop; + char *p_name; + int p_share_err; + int p_unshare_err; +} proto_table_t; + +typedef struct differ_info { + zfs_handle_t *zhp; + char *fromsnap; + char *frommnt; + char *tosnap; + char *tomnt; + char *ds; + char *dsmnt; + char *tmpsnap; + char errbuf[1024]; + boolean_t isclone; + boolean_t scripted; + boolean_t classify; + boolean_t timestamped; + uint64_t shares; + int zerr; + int cleanupfd; + int outputfd; + int datafd; +} differ_info_t; + +extern proto_table_t proto_table[PROTO_END]; + +extern int do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, + int flags); +extern int do_unmount(const char *mntpt, int flags); +extern int zfs_mount_delegation_check(void); +extern int zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto); +extern int unshare_one(libzfs_handle_t *hdl, const char *name, + const char *mountpoint, zfs_share_proto_t proto); +extern boolean_t zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen, + zprop_source_t *source, int flags); +extern zfs_share_type_t is_shared(const char *mountpoint, + zfs_share_proto_t proto); +extern int libzfs_load_module(void); +extern int zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, + const char *msg); +extern int find_shares_object(differ_info_t *di); +extern void libzfs_set_pipe_max(int infd); +extern void zfs_commit_proto(zfs_share_proto_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZFS_IMPL_H */ diff --git a/include/libzutil.h b/include/libzutil.h new file mode 100644 index 000000000000..82a802678903 --- /dev/null +++ b/include/libzutil.h @@ -0,0 +1,166 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common 
Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _LIBZUTIL_H +#define _LIBZUTIL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Default wait time for a device name to be created. + */ +#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */ + + +/* + * Pool Config Operations + * + * These are specific to the library libzfs or libzpool instance. + */ +typedef nvlist_t *refresh_config_func_t(void *, nvlist_t *); + +typedef int pool_active_func_t(void *, const char *, uint64_t, boolean_t *); + +typedef const struct pool_config_ops { + refresh_config_func_t *pco_refresh_config; + pool_active_func_t *pco_pool_active; +} pool_config_ops_t; + +/* + * An instance of pool_config_ops_t is expected in the caller's binary. + */ +extern const pool_config_ops_t libzfs_config_ops; +extern const pool_config_ops_t libzpool_config_ops; + +typedef struct importargs { + char **path; /* a list of paths to search */ + int paths; /* number of paths to search */ + const char *poolname; /* name of a pool to find */ + uint64_t guid; /* guid of a pool to find */ + const char *cachefile; /* cachefile to use for import */ + boolean_t can_be_active; /* can the pool be active? */ + boolean_t scan; /* prefer scanning to libblkid cache */ + nvlist_t *policy; /* load policy (max txg, rewind, etc.) 
*/ +} importargs_t; + +extern nvlist_t *zpool_search_import(void *, importargs_t *, + const pool_config_ops_t *); +extern int zpool_find_config(void *, const char *, nvlist_t **, importargs_t *, + const pool_config_ops_t *); + +extern const char * const * zpool_default_search_paths(size_t *count); +extern int zpool_read_label(int, nvlist_t **, int *); +extern int zpool_label_disk_wait(const char *, int); + +struct udev_device; + +extern int zfs_device_get_devid(struct udev_device *, char *, size_t); +extern int zfs_device_get_physical(struct udev_device *, char *, size_t); + +extern void update_vdev_config_dev_strs(nvlist_t *); + +/* + * Default device paths + */ +#define DISK_ROOT "/dev" +#define UDISK_ROOT "/dev/disk" +#define ZVOL_ROOT "/dev/zvol" + +extern int zfs_append_partition(char *path, size_t max_len); +extern int zfs_resolve_shortname(const char *name, char *path, size_t pathlen); + +extern char *zfs_strip_partition(char *); +extern char *zfs_strip_path(char *); + +extern int zfs_strcmp_pathname(const char *, const char *, int); + +extern boolean_t zfs_dev_is_dm(const char *); +extern boolean_t zfs_dev_is_whole_disk(const char *); +extern int zfs_dev_flush(int); +extern char *zfs_get_underlying_path(const char *); +extern char *zfs_get_enclosure_sysfs_path(const char *); + +extern boolean_t is_mpath_whole_disk(const char *); + +extern boolean_t zfs_isnumber(const char *); + +/* + * Formats for iostat numbers. Examples: "12K", "30ms", "4B", "2321234", "-". + * + * ZFS_NICENUM_1024: Print kilo, mega, tera, peta, exa.. + * ZFS_NICENUM_BYTES: Print single bytes ("13B"), kilo, mega, tera... + * ZFS_NICENUM_TIME: Print nanosecs, microsecs, millisecs, seconds... + * ZFS_NICENUM_RAW: Print the raw number without any formatting + * ZFS_NICENUM_RAWTIME: Same as RAW, but print dashes ('-') for zero. + */ +enum zfs_nicenum_format { + ZFS_NICENUM_1024 = 0, + ZFS_NICENUM_BYTES = 1, + ZFS_NICENUM_TIME = 2, + ZFS_NICENUM_RAW = 3, + ZFS_NICENUM_RAWTIME = 4 +}; + +/* + * Convert a number to a human-readable form. 
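
The importargs_t structure above parameterizes pool discovery, and zpool_search_import() pairs it with a pool_config_ops_t so the same scanner serves both libzfs and libzpool callers. A sketch, restricting the search to DISK_ROOT (the wrapper name discover_pools is invented):

#include <libzutil.h>

static nvlist_t *
discover_pools(void *hdl)
{
        static char *dirs[] = { DISK_ROOT };
        importargs_t args = { 0 };

        args.path = dirs;       /* search /dev only */
        args.paths = 1;
        args.scan = B_TRUE;     /* prefer scanning to the blkid cache */

        return (zpool_search_import(hdl, &args, &libzfs_config_ops));
}
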
+ */ +extern void zfs_nicebytes(uint64_t, char *, size_t); +extern void zfs_nicenum(uint64_t, char *, size_t); +extern void zfs_nicenum_format(uint64_t, char *, size_t, + enum zfs_nicenum_format); +extern void zfs_nicetime(uint64_t, char *, size_t); +extern void zfs_niceraw(uint64_t, char *, size_t); + +#define nicenum(num, buf, size) zfs_nicenum(num, buf, size) + +extern void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); +extern int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, + uint_t *); + +struct zfs_cmd; +int zfs_ioctl_fd(int fd, unsigned long request, struct zfs_cmd *zc); + +/* + * List of colors to use + */ +#define ANSI_RED "\033[0;31m" +#define ANSI_YELLOW "\033[0;33m" +#define ANSI_RESET "\033[0m" +#define ANSI_BOLD "\033[1m" + +void color_start(char *color); +void color_end(void); +int printf_color(char *color, char *format, ...); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBZUTIL_H */ diff --git a/include/os/Makefile.am b/include/os/Makefile.am new file mode 100644 index 000000000000..7eab1abde984 --- /dev/null +++ b/include/os/Makefile.am @@ -0,0 +1,6 @@ +if BUILD_LINUX +SUBDIRS = linux +endif +if BUILD_FREEBSD +SUBDIRS = freebsd +endif diff --git a/include/os/freebsd/Makefile.am b/include/os/freebsd/Makefile.am new file mode 100644 index 000000000000..3c87d4a0e791 --- /dev/null +++ b/include/os/freebsd/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = linux spl zfs diff --git a/include/os/freebsd/linux/Makefile.am b/include/os/freebsd/linux/Makefile.am new file mode 100644 index 000000000000..00cff7f5dc65 --- /dev/null +++ b/include/os/freebsd/linux/Makefile.am @@ -0,0 +1,5 @@ +KERNEL_H = \ + compiler.h \ + types.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/linux/compiler.h b/include/os/freebsd/linux/compiler.h new file mode 100644 index 000000000000..d76050378e83 --- /dev/null +++ b/include/os/freebsd/linux/compiler.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iXsystems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * Copyright (c) 2013-2016 Mellanox Technologies, Ltd. + * Copyright (c) 2015 François Tigeot + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
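
The zfs_nicenum_format() family above produces the iostat-style humanized numbers documented in the enum comment. A small usage sketch; the printed strings in the comments are indicative, not exact:

#include <libzutil.h>
#include <stdio.h>

int
main(void)
{
        char buf[32];

        zfs_nicenum_format(1536, buf, sizeof (buf), ZFS_NICENUM_BYTES);
        printf("%s\n", buf);    /* e.g. "1.50K" */

        zfs_nicenum_format(0, buf, sizeof (buf), ZFS_NICENUM_RAWTIME);
        printf("%s\n", buf);    /* "-" for a zero in RAWTIME */
        return (0);
}
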
+ * + * $FreeBSD$ + */ +#ifndef _LINUX_COMPILER_H_ +#define _LINUX_COMPILER_H_ + +#include + +#define __user +#define __kernel +#define __safe +#define __force +#define __nocast +#define __iomem +#define __chk_user_ptr(x) ((void)0) +#define __chk_io_ptr(x) ((void)0) +#define __builtin_warning(x, y...) (1) +#define __acquires(x) +#define __releases(x) +#define __acquire(x) do { } while (0) +#define __release(x) do { } while (0) +#define __cond_lock(x, c) (c) +#define __bitwise +#define __devinitdata +#define __deprecated +#define __init +#define __initconst +#define __devinit +#define __devexit +#define __exit +#define __rcu +#define __percpu +#define __weak __weak_symbol +#define __malloc +#define ___stringify(...) #__VA_ARGS__ +#define __stringify(...) ___stringify(__VA_ARGS__) +#define __attribute_const__ __attribute__((__const__)) +#undef __always_inline +#define __always_inline inline +#define noinline __noinline +#define ____cacheline_aligned __aligned(CACHE_LINE_SIZE) + +#ifndef _KERNEL +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif +#define typeof(x) __typeof(x) + +#define uninitialized_var(x) x = x +#define __maybe_unused __unused +#define __always_unused __unused +#define __must_check __result_use_check + +#define __printf(a, b) __printflike(a, b) + +#define barrier() __asm__ __volatile__("": : :"memory") +#define smp_rmb() rmb() +#define ___PASTE(a, b) a##b +#define __PASTE(a, b) ___PASTE(a, b) + +#define ACCESS_ONCE(x) (*(volatile __typeof(x) *)&(x)) + +#define WRITE_ONCE(x, v) do { \ + barrier(); \ + ACCESS_ONCE(x) = (v); \ + barrier(); \ +} while (0) + +#define lockless_dereference(p) READ_ONCE(p) + +#define _AT(T, X) ((T)(X)) + +#endif /* _LINUX_COMPILER_H_ */ diff --git a/include/os/freebsd/linux/types.h b/include/os/freebsd/linux/types.h new file mode 100644 index 000000000000..d290317cc0dd --- /dev/null +++ b/include/os/freebsd/linux/types.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2010 Isilon Systems, Inc. + * Copyright (c) 2010 iXsystems, Inc. + * Copyright (c) 2010 Panasas, Inc. + * Copyright (c) 2013-2017 Mellanox Technologies, Ltd. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
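
The compiler.h shims above exist so Linux-idiom sources compile unchanged on FreeBSD. A sketch of code relying on them; note likely()/unlikely() are only defined here outside _KERNEL, so this is userland-flavored, and the names are invented:

static int counter;

static void
bump(int delta, int tag __maybe_unused)
{
        if (unlikely(delta == 0))
                return;
        /* WRITE_ONCE() is a barrier-bracketed volatile store, per above */
        WRITE_ONCE(counter, counter + delta);
}
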
+ * + * $FreeBSD$ + */ +#ifndef _LINUX_TYPES_H_ +#define _LINUX_TYPES_H_ + +#include + + +#ifndef __bitwise__ +#ifdef __CHECKER__ +#define __bitwise__ __attribute__((bitwise)) +#else +#define __bitwise__ +#endif +#endif + +typedef uint16_t __le16; +typedef uint16_t __be16; +typedef uint32_t __le32; +typedef uint32_t __be32; +typedef uint64_t __le64; +typedef uint64_t __be64; + +typedef unsigned gfp_t; +typedef off_t loff_t; +typedef vm_paddr_t resource_size_t; +typedef uint16_t __bitwise__ __sum16; +typedef unsigned long pgoff_t; +typedef unsigned __poll_t; + +typedef uint64_t u64; +typedef u64 phys_addr_t; + +typedef size_t __kernel_size_t; + +#define DECLARE_BITMAP(n, bits) \ + unsigned long n[howmany(bits, sizeof (long) * 8)] + +typedef unsigned long irq_hw_number_t; + +struct rcu_head { + void *raw[2]; +} __aligned(sizeof (void *)); + +typedef void (*rcu_callback_t)(struct rcu_head *head); +typedef void (*call_rcu_func_t)(struct rcu_head *head, rcu_callback_t func); +typedef int linux_task_fn_t(void *data); + +#endif /* _LINUX_TYPES_H_ */ diff --git a/include/os/freebsd/spl/Makefile.am b/include/os/freebsd/spl/Makefile.am new file mode 100644 index 000000000000..b321825cb77e --- /dev/null +++ b/include/os/freebsd/spl/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = acl rpc sys diff --git a/include/os/freebsd/spl/acl/Makefile.am b/include/os/freebsd/spl/acl/Makefile.am new file mode 100644 index 000000000000..5c0698d02ea6 --- /dev/null +++ b/include/os/freebsd/spl/acl/Makefile.am @@ -0,0 +1,4 @@ +KERNEL_H = \ + acl_common.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/spl/acl/acl_common.h b/include/os/freebsd/spl/acl/acl_common.h new file mode 100644 index 000000000000..44f5bed592f6 --- /dev/null +++ b/include/os/freebsd/spl/acl/acl_common.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
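
DECLARE_BITMAP() above expands to an unsigned long array sized by howmany() to hold the requested bit count. A sketch (bitmap and helper names invented):

/* 128 bits of state; howmany() rounds the array size up */
static DECLARE_BITMAP(busy_map, 128);

static void
mark_busy(unsigned int bit)
{
        busy_map[bit / (sizeof (long) * 8)] |=
            1UL << (bit % (sizeof (long) * 8));
}
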
+ */ + +#ifndef _ACL_COMMON_H +#define _ACL_COMMON_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct trivial_acl { + uint32_t allow0; /* allow mask for bits only in owner */ + uint32_t deny1; /* deny mask for bits not in owner */ + uint32_t deny2; /* deny mask for bits not in group */ + uint32_t owner; /* allow mask matching mode */ + uint32_t group; /* allow mask matching mode */ + uint32_t everyone; /* allow mask matching mode */ +} trivial_acl_t; + +extern int acltrivial(const char *); +extern void adjust_ace_pair(ace_t *pair, mode_t mode); +extern void adjust_ace_pair_common(void *, size_t, size_t, mode_t); +extern int ace_trivial_common(void *, int, + uint64_t (*walk)(void *, uint64_t, int aclcnt, uint16_t *, uint16_t *, + uint32_t *mask)); +#if !defined(_KERNEL) +extern acl_t *acl_alloc(acl_type_t); +extern void acl_free(acl_t *aclp); +extern int acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, + uid_t owner, gid_t group); +#endif /* !_KERNEL */ +int cmp2acls(void *a, void *b); +int acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count); +void acl_trivial_access_masks(mode_t mode, boolean_t isdir, + trivial_acl_t *masks); + +#ifdef __cplusplus +} +#endif + +#endif /* _ACL_COMMON_H */ diff --git a/include/os/freebsd/spl/rpc/Makefile.am b/include/os/freebsd/spl/rpc/Makefile.am new file mode 100644 index 000000000000..f6faf4b188be --- /dev/null +++ b/include/os/freebsd/spl/rpc/Makefile.am @@ -0,0 +1,4 @@ +KERNEL_H = \ + xdr.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/spl/rpc/xdr.h b/include/os/freebsd/spl/rpc/xdr.h new file mode 100644 index 000000000000..59d5e425bedb --- /dev/null +++ b/include/os/freebsd/spl/rpc/xdr.h @@ -0,0 +1,71 @@ +/* + * Sun RPC is a product of Sun Microsystems, Inc. and is provided for + * unrestricted use provided that this legend is included on all tape + * media and as a part of the software program in whole or part. Users + * may copy or modify Sun RPC without charge, but are not authorized + * to license or distribute it to anyone else except as part of a product or + * program developed by the user. + * + * SUN RPC IS PROVIDED AS IS WITH NO WARRANTIES OF ANY KIND INCLUDING THE + * WARRANTIES OF DESIGN, MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE, OR ARISING FROM A COURSE OF DEALING, USAGE OR TRADE PRACTICE. + * + * Sun RPC is provided with no support and without any obligation on the + * part of Sun Microsystems, Inc. to assist in its use, correction, + * modification or enhancement. + * + * SUN MICROSYSTEMS, INC. SHALL HAVE NO LIABILITY WITH RESPECT TO THE + * INFRINGEMENT OF COPYRIGHTS, TRADE SECRETS OR ANY PATENTS BY SUN RPC + * OR ANY PART THEREOF. + * + * In no event will Sun Microsystems, Inc. be liable for any lost revenue + * or profits or other special, indirect and consequential damages, even if + * Sun has been advised of the possibility of such damages. + * + * Sun Microsystems, Inc. + * 2550 Garcia Avenue + * Mountain View, California 94043 + */ + +#ifndef _OPENSOLARIS_RPC_XDR_H_ +#define _OPENSOLARIS_RPC_XDR_H_ + +#include +#include_next + +#ifndef _KERNEL + +#include + +/* + * Taken from sys/xdr/xdr_mem.c. + * + * FreeBSD's userland XDR doesn't implement control method (only the kernel), + * but OpenSolaris nvpair still depend on it, so we have to implement it here. 
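
acl_trivial_create() above synthesizes the trivial ACE representation of a plain POSIX mode. A hedged sketch; the wrapper name is invented and the allocator used to free the result depends on the caller's context, so it is left as a comment:

static int
aces_from_mode(mode_t mode)
{
        ace_t *aces = NULL;
        int count = 0;

        if (acl_trivial_create(mode, B_FALSE, &aces, &count) != 0)
                return (-1);

        /* ... consume the synthesized allow/deny entries ... */

        /* free 'aces' with the allocator matching this context */
        return (count);
}
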
+ */ +static __inline bool_t +xdrmem_control(XDR *xdrs, int request, void *info) +{ + xdr_bytesrec *xptr; + + switch (request) { + case XDR_GET_BYTES_AVAIL: + xptr = (xdr_bytesrec *)info; + xptr->xc_is_last_record = TRUE; + xptr->xc_num_avail = xdrs->x_handy; + return (TRUE); + default: + assert(!"unexpected request"); + } + return (FALSE); +} + +#undef XDR_CONTROL +#define XDR_CONTROL(xdrs, req, op) \ + (((xdrs)->x_ops->x_control == NULL) ? \ + xdrmem_control((xdrs), (req), (op)) : \ + (*(xdrs)->x_ops->x_control)(xdrs, req, op)) + +#endif /* !_KERNEL */ + +#endif /* !_OPENSOLARIS_RPC_XDR_H_ */ diff --git a/include/os/freebsd/spl/sys/Makefile.am b/include/os/freebsd/spl/sys/Makefile.am new file mode 100644 index 000000000000..7d82e2d6d4e8 --- /dev/null +++ b/include/os/freebsd/spl/sys/Makefile.am @@ -0,0 +1,71 @@ +KERNEL_H = \ + acl_impl.h \ + acl.h \ + atomic.h \ + byteorder.h \ + callb.h \ + ccompile.h \ + cmn_err.h \ + condvar.h \ + console.h \ + cred.h \ + ctype.h \ + debug.h \ + dirent.h \ + disp.h \ + dkio.h \ + extdirent.h \ + file.h \ + freebsd_rwlock.h \ + inttypes.h \ + isa_defs.h \ + kmem_cache.h \ + kmem.h \ + kstat.h \ + list_impl.h \ + list.h \ + lock.h \ + Makefile.am \ + misc.h \ + mod_os.h \ + mode.h \ + mount.h \ + mutex.h \ + param.h \ + policy.h \ + proc.h \ + processor.h \ + procfs_list.h \ + random.h \ + rwlock.h \ + sdt.h \ + sid.h \ + sig.h \ + simd_x86.h \ + simd.h \ + spl_condvar.h \ + string.h \ + strings.h \ + sunddi.h \ + sysmacros.h \ + systeminfo.h \ + systm.h \ + taskq.h \ + thread.h \ + time.h \ + timer.h \ + trace_zfs.h \ + trace.h \ + types.h \ + types32.h \ + uio.h \ + uuid.h \ + vfs.h \ + vm.h \ + vmsystm.h \ + vnode_impl.h \ + vnode.h \ + zmod.h \ + zone.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/spl/sys/acl.h b/include/os/freebsd/spl/sys/acl.h new file mode 100644 index 000000000000..ee50b0a18368 --- /dev/null +++ b/include/os/freebsd/spl/sys/acl.h @@ -0,0 +1,216 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Garrett D'Amore + * + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright 2017 RackTop Systems. + */ + +#ifndef _SYS_ACL_H +#define _SYS_ACL_H + +#include +#include + +/* + * When compiling OpenSolaris kernel code, this file is included instead of the + * FreeBSD one. Include the original sys/acl.h as well. 
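
With the xdrmem_control() fallback above, XDR_CONTROL() works in userland even though FreeBSD's libc XDR lacks a control method. A sketch of the one supported request (the wrapper name is invented):

static int
xdr_bytes_avail(XDR *xdrs)
{
        xdr_bytesrec rec;

        if (!XDR_CONTROL(xdrs, XDR_GET_BYTES_AVAIL, &rec))
                return (-1);
        return (rec.xc_num_avail);
}
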
+ */ +#undef _SYS_ACL_H +#include_next +#define _SYS_ACL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ +typedef struct { + int a_type; /* the type of ACL entry */ + uid_t a_id; /* the entry in -uid or gid */ + o_mode_t a_perm; /* the permission field */ +} aclent_t; + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +/* + * The following are Defined types for an aclent_t. + */ +#define USER_OBJ (0x01) /* object owner */ +#define USER (0x02) /* additional users */ +#define GROUP_OBJ (0x04) /* owning group of the object */ +#define GROUP (0x08) /* additional groups */ +#define CLASS_OBJ (0x10) /* file group class and mask entry */ +#define OTHER_OBJ (0x20) /* other entry for the object */ +#define ACL_DEFAULT (0x1000) /* default flag */ +/* default object owner */ +#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) +/* default additional users */ +#define DEF_USER (ACL_DEFAULT | USER) +/* default owning group */ +#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) +/* default additional groups */ +#define DEF_GROUP (ACL_DEFAULT | GROUP) +/* default mask entry */ +#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) +/* default other entry */ +#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) + +/* + * The following are defined for ace_t. + */ +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ + ACL_DEFAULTED) + +/* + * These are only applicable in a CIFS context. 
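
Combining the ace_t structure with the mask and flag constants above, a single allow entry might be built like this sketch (the variable name is invented):

/* Allow the owning group to read data and attributes. */
static const ace_t group_read_ace = {
        .a_who = 0,     /* ignored when ACE_GROUP is set */
        .a_access_mask = ACE_READ_DATA | ACE_READ_ATTRIBUTES,
        .a_flags = ACE_GROUP | ACE_IDENTIFIER_GROUP,
        .a_type = ACE_ACCESS_ALLOWED_ACE_TYPE,
};
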
+ */ +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + +#define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ + ACE_READ_NAMED_ATTRS) + +#define ACE_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS) + +#define ACE_MODIFY_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_SYNCHRONIZE) +/* + * The following flags are supported by both NFSv4 ACLs and ace_t. + */ +#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | \ + ACE_INHERIT_ONLY_ACE | \ + ACE_INHERITED_ACE | \ + ACE_IDENTIFIER_GROUP) + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ + ACE_IDENTIFIER_GROUP) +#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| ACL_INHERITED_ACE| \ + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + +/* cmd args to acl(2) for aclent_t */ +#define GETACL 1 +#define SETACL 2 +#define GETACLCNT 3 + +/* cmd's to manipulate ace acls. 
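
The composite masks above make membership tests one-liners. For instance, a sketch of a validity check that rejects flags outside the NFSv4-supported set (the helper name is invented):

static boolean_t
ace_flags_representable(const ace_t *ace)
{
        /* type selectors aside, only NFSv4-supported flags may be set */
        return ((ace->a_flags &
            ~(ACE_NFSV4_SUP_FLAGS | ACE_TYPE_FLAGS)) == 0);
}
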
*/ +#define ACE_GETACL 4 +#define ACE_SETACL 5 +#define ACE_GETACLCNT 6 + +/* minimal acl entries from GETACLCNT */ +#define MIN_ACL_ENTRIES 4 + +extern void aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp); +extern int acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries); +extern void ksort(caddr_t, int, int, int (*)(void *, void *)); +extern int cmp2acls(void *, void *); + +extern int acl(const char *path, int cmd, int cnt, void *buf); +extern int facl(int fd, int cmd, int cnt, void *buf); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_H */ diff --git a/include/os/freebsd/spl/sys/acl_impl.h b/include/os/freebsd/spl/sys/acl_impl.h new file mode 100644 index 000000000000..8718f5bcf63f --- /dev/null +++ b/include/os/freebsd/spl/sys/acl_impl.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ACL_IMPL_H +#define _SYS_ACL_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * acl flags + * + * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED + * flags can also be stored in this field. + */ +#define ACL_IS_TRIVIAL 0x10000 +#define ACL_IS_DIR 0x20000 + +typedef enum acl_type { + ACLENT_T = 0, + ACE_T = 1 +} zfs_acl_type_t; + +struct acl_info { + zfs_acl_type_t acl_type; /* style of acl */ + int acl_cnt; /* number of acl entries */ + int acl_entry_size; /* sizeof acl entry */ + int acl_flags; /* special flags about acl */ + void *acl_aclp; /* the acl */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/atomic.h b/include/os/freebsd/spl/sys/atomic.h new file mode 100644 index 000000000000..4227e5f7d3ec --- /dev/null +++ b/include/os/freebsd/spl/sys/atomic.h @@ -0,0 +1,182 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
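
The acl()/facl() entry points above appear to follow the Solaris acl(2) convention, where ACE_GETACLCNT sizes the buffer and ACE_GETACL fills it; the exact return values are an assumption here. A userland sketch under that assumption (the function name is invented):

#include <stdlib.h>

static int
dump_aces(const char *path)
{
        ace_t *aces;
        int cnt;

        cnt = acl(path, ACE_GETACLCNT, 0, NULL);
        if (cnt <= 0)
                return (cnt);

        aces = calloc(cnt, sizeof (ace_t));
        if (aces == NULL)
                return (-1);

        if (acl(path, ACE_GETACL, cnt, aces) >= 0) {
                /* ... inspect aces[0..cnt-1] ... */
        }
        free(aces);
        return (0);
}
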
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_ATOMIC_H_ +#define _OPENSOLARIS_SYS_ATOMIC_H_ + +#include +#include + +#define atomic_sub_64 atomic_subtract_64 + +#if defined(__i386__) && (defined(_KERNEL) || defined(KLD_MODULE)) +#define I386_HAVE_ATOMIC64 +#endif + +#if defined(__i386__) || defined(__amd64__) || defined(__arm__) +/* No spurious failures from fcmpset. */ +#define STRONG_FCMPSET +#endif + +#if !defined(__LP64__) && !defined(__mips_n32) && \ + !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \ + !defined(HAS_EMULATED_ATOMIC64) +extern void atomic_add_64(volatile uint64_t *target, int64_t delta); +extern void atomic_dec_64(volatile uint64_t *target); +extern uint64_t atomic_swap_64(volatile uint64_t *a, uint64_t value); +extern uint64_t atomic_load_64(volatile uint64_t *a); +extern uint64_t atomic_add_64_nv(volatile uint64_t *target, int64_t delta); +extern uint64_t atomic_cas_64(volatile uint64_t *target, uint64_t cmp, + uint64_t newval); +#endif + +#define membar_producer atomic_thread_fence_rel + +static __inline uint32_t +atomic_add_32_nv(volatile uint32_t *target, int32_t delta) +{ + return (atomic_fetchadd_32(target, delta) + delta); +} + +static __inline uint_t +atomic_add_int_nv(volatile uint_t *target, int delta) +{ + return (atomic_add_32_nv(target, delta)); +} + +static __inline void +atomic_inc_32(volatile uint32_t *target) +{ + atomic_add_32(target, 1); +} + +static __inline uint32_t +atomic_inc_32_nv(volatile uint32_t *target) +{ + return (atomic_add_32_nv(target, 1)); +} + +static __inline void +atomic_dec_32(volatile uint32_t *target) +{ + atomic_subtract_32(target, 1); +} + +static __inline uint32_t +atomic_dec_32_nv(volatile uint32_t *target) +{ + return (atomic_add_32_nv(target, -1)); +} + +#ifndef __sparc64__ +static inline uint32_t +atomic_cas_32(volatile uint32_t *target, uint32_t cmp, uint32_t newval) +{ +#ifdef STRONG_FCMPSET + (void) atomic_fcmpset_32(target, &cmp, newval); +#else + uint32_t expected = cmp; + + do { + if (atomic_fcmpset_32(target, &cmp, newval)) + break; + } while (cmp == expected); +#endif + return (cmp); +} +#endif + +#if defined(__LP64__) || defined(__mips_n32) || \ + defined(ARM_HAVE_ATOMIC64) || defined(I386_HAVE_ATOMIC64) || \ + defined(HAS_EMULATED_ATOMIC64) +static __inline void +atomic_dec_64(volatile uint64_t *target) +{ + atomic_subtract_64(target, 1); +} + +static inline uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + return (atomic_fetchadd_64(target, delta) + delta); +} + +#ifndef __sparc64__ +static inline uint64_t +atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) +{ +#ifdef STRONG_FCMPSET + (void) atomic_fcmpset_64(target, &cmp, newval); +#else + uint64_t expected = cmp; + + do { + if (atomic_fcmpset_64(target, &cmp, newval)) + break; + } while (cmp == expected); +#endif + return (cmp); +} +#endif +#endif + +static __inline void +atomic_inc_64(volatile uint64_t *target) +{ + atomic_add_64(target, 1); +} + +static __inline 
uint64_t +atomic_inc_64_nv(volatile uint64_t *target) +{ + return (atomic_add_64_nv(target, 1)); +} + +static __inline uint64_t +atomic_dec_64_nv(volatile uint64_t *target) +{ + return (atomic_add_64_nv(target, -1)); +} + +#if !defined(COMPAT_32BIT) && defined(__LP64__) +static __inline void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_64((volatile uint64_t *)target, + (uint64_t)cmp, (uint64_t)newval)); +} +#else +static __inline void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_32((volatile uint32_t *)target, + (uint32_t)cmp, (uint32_t)newval)); +} +#endif /* !defined(COMPAT_32BIT) && defined(__LP64__) */ + +#endif /* !_OPENSOLARIS_SYS_ATOMIC_H_ */ diff --git a/include/os/freebsd/spl/sys/byteorder.h b/include/os/freebsd/spl/sys/byteorder.h new file mode 100644 index 000000000000..ae767242bd18 --- /dev/null +++ b/include/os/freebsd/spl/sys/byteorder.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
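
atomic_cas_64() above returns the value observed before the exchange, the usual shape for compare-and-swap retry loops. A sketch of a saturating 64-bit increment built on it (the function name is invented):

static uint64_t
atomic_inc_64_sat(volatile uint64_t *target)
{
        uint64_t old, new;

        do {
                old = *target;
                if (old == UINT64_MAX)
                        return (old);   /* saturated; do not wrap */
                new = old + 1;
        } while (atomic_cas_64(target, old, new) != old);

        return (new);
}
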
+ */ + +#ifndef _OPENSOLARIS_SYS_BYTEORDER_H_ +#define _OPENSOLARIS_SYS_BYTEORDER_H_ + +#include + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#if BYTE_ORDER == _BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#if BYTE_ORDER == _BIG_ENDIAN +#define htonll(x) BMASK_64(x) +#define ntohll(x) BMASK_64(x) +#else +#ifndef __LP64__ +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#else +#define htonll(x) BSWAP_64(x) +#define ntohll(x) BSWAP_64(x) +#endif +#endif + +#define BE_IN32(xa) htonl(*((uint32_t *)(void *)(xa))) + +#endif /* _OPENSOLARIS_SYS_BYTEORDER_H_ */ diff --git a/include/os/freebsd/spl/sys/callb.h b/include/os/freebsd/spl/sys/callb.h new file mode 100644 index 000000000000..cc67b0263c51 --- /dev/null +++ b/include/os/freebsd/spl/sys/callb.h @@ -0,0 +1,213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CALLB_H +#define _SYS_CALLB_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * definitions of callback classes (c_class) + * + * Callbacks belong in the same class if (1) their callback routines + * do the same kind of processing (ideally, using the same callback function) + * and (2) they can/should be executed at the same time in a cpr + * suspend/resume operation. + * + * Note: The DAEMON class, in particular, is for stopping kernel threads + * and nothing else. The CALLB_* macros below should be used to deal + * with kernel threads, and the callback function should be callb_generic_cpr. 
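
The byte-order macros above reduce to plain masks on matching hosts and to BSWAPs otherwise, so conversions round-trip everywhere. A userland-flavored sketch (the function name is invented):

#include <assert.h>

static void
byteorder_roundtrip(void)
{
        uint64_t host = 0x0102030405060708ULL;
        uint64_t wire = htonll(host);   /* big-endian on the wire */

        /* identity on big-endian hosts, BSWAP_64 twice elsewhere */
        assert(ntohll(wire) == host);
}
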
+ * Another idiosyncrasy of the DAEMON class is that if a suspend operation + * fails, some of the callback functions may be called with the RESUME + * code which were never called with SUSPEND. Not a problem currently, + * but see bug 4201851. + */ +#define CB_CL_CPR_DAEMON 0 +#define CB_CL_CPR_VM 1 +#define CB_CL_CPR_CALLOUT 2 +#define CB_CL_CPR_OBP 3 +#define CB_CL_CPR_FB 4 +#define CB_CL_PANIC 5 +#define CB_CL_CPR_RPC 6 +#define CB_CL_CPR_PROMPRINTF 7 +#define CB_CL_UADMIN 8 +#define CB_CL_CPR_PM 9 +#define CB_CL_HALT 10 +#define CB_CL_CPR_DMA 11 +#define CB_CL_CPR_POST_USER 12 +#define CB_CL_UADMIN_PRE_VFS 13 +#define CB_CL_MDBOOT CB_CL_UADMIN +#define CB_CL_ENTER_DEBUGGER 14 +#define CB_CL_CPR_POST_KERNEL 15 +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ + +/* + * CB_CL_CPR_DAEMON class specific definitions are given below: + */ + +/* + * code for CPR callb_execute_class + */ +#define CB_CODE_CPR_CHKPT 0 +#define CB_CODE_CPR_RESUME 1 + +typedef void * callb_id_t; +/* + * Per kernel thread structure for CPR daemon callbacks. + * Must be protected by either a existing lock in the daemon or + * a new lock created for such a purpose. + */ +typedef struct callb_cpr { + kmutex_t *cc_lockp; /* lock to protect this struct */ + char cc_events; /* various events for CPR */ + callb_id_t cc_id; /* callb id address */ + kcondvar_t cc_callb_cv; /* cv for callback waiting */ + kcondvar_t cc_stop_cv; /* cv to checkpoint block */ +} callb_cpr_t; + +/* + * cc_events definitions + */ +#define CALLB_CPR_START 1 /* a checkpoint request's started */ +#define CALLB_CPR_SAFE 2 /* thread is safe for CPR */ +#define CALLB_CPR_ALWAYS_SAFE 4 /* thread is ALWAYS safe for CPR */ + +/* + * Used when checking that all kernel threads are stopped. + */ +#define CALLB_MAX_RETRY 3 /* when waiting for kthread to sleep */ +#define CALLB_THREAD_DELAY 10 /* ticks allowed to reach sleep */ +#define CPR_KTHREAD_TIMEOUT_SEC 90 /* secs before callback times out -- */ + /* due to pwr mgmt of disks, make -- */ + /* big enough for worst spinup time */ + +/* + * + * CALLB_CPR_INIT macro is used by kernel threads to add their entry to + * the callback table and perform other initialization. It automatically + * adds the thread as being in the callback class CB_CL_CPR_DAEMON. + * + * cp - ptr to the callb_cpr_t structure for this kernel thread + * + * lockp - pointer to mutex protecting the callb_cpr_t struct + * + * func - pointer to the callback function for this kernel thread. + * It has the prototype boolean_t (void *arg, int code) + * where: arg - ptr to the callb_cpr_t structure + * code - not used for this type of callback + * returns: B_TRUE if successful; B_FALSE if unsuccessful. + * + * name - a string giving the name of the kernel thread + * + * Note: lockp is the lock to protect the callb_cpr_t (cp) structure + * later on. No lock held is needed for this initialization. 
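
Putting the protocol documented above together, a CPR-aware daemon thread typically takes the following shape; this mirrors how ZFS kernel threads use the CALLB_CPR macros defined just below, with invented lock, cv, and thread names:

static kmutex_t work_lock;
static kcondvar_t work_cv;
static boolean_t work_stop;

static void
worker_thread(void *arg)
{
        callb_cpr_t cpr;

        CALLB_CPR_INIT(&cpr, &work_lock, callb_generic_cpr, "worker");

        mutex_enter(&work_lock);
        while (!work_stop) {
                /* safe for checkpoint only while blocked */
                CALLB_CPR_SAFE_BEGIN(&cpr);
                cv_wait(&work_cv, &work_lock);
                CALLB_CPR_SAFE_END(&cpr, &work_lock);

                /* ... service pending work under work_lock ... */
        }
        CALLB_CPR_EXIT(&cpr);   /* also drops work_lock */
        thread_exit();
}
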
+ */ +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + strlcpy(curthread->td_name, (name), \ + sizeof (curthread->td_name)); \ + bzero((caddr_t)(cp), sizeof (callb_cpr_t)); \ + (cp)->cc_lockp = lockp; \ + (cp)->cc_id = callb_add(func, (void *)(cp), \ + CB_CL_CPR_DAEMON, name); \ + cv_init(&(cp)->cc_callb_cv, NULL, CV_DEFAULT, NULL); \ + cv_init(&(cp)->cc_stop_cv, NULL, CV_DEFAULT, NULL); \ + } + +#ifndef __lock_lint +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); +#else +#define CALLB_CPR_ASSERT(cp) +#endif +/* + * Some threads (like the idle threads) do not adhere to the callback + * protocol and are always considered safe. Such threads must never exit. + * They register their presence by calling this macro during their + * initialization. + * + * Args: + * t - thread pointer of the client kernel thread + * name - a string giving the name of the kernel thread + */ +#define CALLB_CPR_INIT_SAFE(t, name) { \ + (void) callb_add_thread(callb_generic_cpr_safe, \ + (void *) &callb_cprinfo_safe, CB_CL_CPR_DAEMON, \ + name, t); \ + } +/* + * The lock to protect cp's content must be held before + * calling the following two macros. + * + * Any code region between CALLB_CPR_SAFE_BEGIN and CALLB_CPR_SAFE_END + * is safe for checkpoint/resume. + */ +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + } +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp) \ + while ((cp)->cc_events & CALLB_CPR_START) \ + cv_wait(&(cp)->cc_stop_cv, lockp); \ + (cp)->cc_events &= ~CALLB_CPR_SAFE; \ + } +/* + * cv_destroy is nop right now but may be needed in the future. + */ +#define CALLB_CPR_EXIT(cp) { \ + CALLB_CPR_ASSERT(cp) \ + (cp)->cc_events |= CALLB_CPR_SAFE; \ + if ((cp)->cc_events & CALLB_CPR_START) \ + cv_signal(&(cp)->cc_callb_cv); \ + mutex_exit((cp)->cc_lockp); \ + (void) callb_delete((cp)->cc_id); \ + cv_destroy(&(cp)->cc_callb_cv); \ + cv_destroy(&(cp)->cc_stop_cv); \ + } + +extern callb_cpr_t callb_cprinfo_safe; +extern callb_id_t callb_add(boolean_t (*)(void *, int), void *, int, char *); +extern callb_id_t callb_add_thread(boolean_t (*)(void *, int), + void *, int, char *, kthread_id_t); +extern int callb_delete(callb_id_t); +extern void callb_execute(callb_id_t, int); +extern void *callb_execute_class(int, int); +extern boolean_t callb_generic_cpr(void *, int); +extern boolean_t callb_generic_cpr_safe(void *, int); +extern boolean_t callb_is_stopped(kthread_id_t, caddr_t *); +extern void callb_lock_table(void); +extern void callb_unlock_table(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CALLB_H */ diff --git a/include/os/freebsd/spl/sys/ccompat.h b/include/os/freebsd/spl/sys/ccompat.h new file mode 100644 index 000000000000..59abe921dba9 --- /dev/null +++ b/include/os/freebsd/spl/sys/ccompat.h @@ -0,0 +1,153 @@ +/* + * SPDX-License-Identifier: BSD-2-Clause-FreeBSD + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_CCOMPAT_H +#define _SYS_CCOMPAT_H + +#if __FreeBSD_version < 1300051 +#define vm_page_valid(m) (m)->valid = VM_PAGE_BITS_ALL +#define vm_page_do_sunbusy(m) +#define vm_page_none_valid(m) ((m)->valid == 0) +#else +#define vm_page_do_sunbusy(m) vm_page_sunbusy(m) +#endif + +#if __FreeBSD_version < 1300074 +#define VOP_UNLOCK1(x) VOP_UNLOCK(x, 0) +#else +#define VOP_UNLOCK1(x) VOP_UNLOCK(x) +#endif + +#if __FreeBSD_version < 1300064 +#define VN_IS_DOOMED(vp) ((vp)->v_iflag & VI_DOOMED) +#endif + +#if __FreeBSD_version < 1300068 +#define VFS_VOP_VECTOR_REGISTER(x) +#endif + +#if __FreeBSD_version >= 1300076 +#define getnewvnode_reserve_() getnewvnode_reserve() +#else +#define getnewvnode_reserve_() getnewvnode_reserve(1) +#endif + +#if __FreeBSD_version < 1300102 +#define ASSERT_VOP_IN_SEQC(zp) +#define MNTK_FPLOOKUP 0 +#define vn_seqc_write_begin(vp) +#define vn_seqc_write_end(vp) + +#ifndef VFS_SMR_DECLARE +#define VFS_SMR_DECLARE +#endif +#ifndef VFS_SMR_ZONE_SET +#define VFS_SMR_ZONE_SET(zone) +#endif +#endif + +struct hlist_node { + struct hlist_node *next, **pprev; +}; + +struct hlist_head { + struct hlist_node *first; +}; + +typedef struct { + volatile int counter; +} atomic_t; + + /* BEGIN CSTYLED */ +#define hlist_for_each(p, head) \ + for (p = (head)->first; p; p = (p)->next) + +#define hlist_entry(ptr, type, field) container_of(ptr, type, field) + +#define container_of(ptr, type, member) \ +({ \ + const __typeof(((type *)0)->member) *__p = (ptr); \ + (type *)((uintptr_t)__p - offsetof(type, member)); \ +}) + /* END CSTYLED */ + +static inline void +hlist_add_head(struct hlist_node *n, struct hlist_head *h) +{ + n->next = h->first; + if (h->first != NULL) + h->first->pprev = &n->next; + WRITE_ONCE(h->first, n); + n->pprev = &h->first; +} + +static inline void +hlist_del(struct hlist_node *n) +{ + WRITE_ONCE(*(n->pprev), n->next); + if (n->next != NULL) + n->next->pprev = n->pprev; +} + /* BEGIN CSTYLED */ +#define READ_ONCE(x) ({ \ + __typeof(x) __var = ({ \ + barrier(); \ + ACCESS_ONCE(x); \ + }); \ + barrier(); \ + __var; \ +}) + +#define HLIST_HEAD_INIT { } +#define HLIST_HEAD(name) struct hlist_head name = HLIST_HEAD_INIT +#define INIT_HLIST_HEAD(head) (head)->first = NULL + +#define INIT_HLIST_NODE(node) \ + do { \ + (node)->next = NULL; \ + (node)->pprev = NULL; \ + } while (0) + +/* END CSTYLED */ +static inline int +atomic_read(const atomic_t *v) +{ + return (READ_ONCE(v->counter)); +} + +static inline int +atomic_inc(atomic_t *v) +{ + return (atomic_fetchadd_int(&v->counter, 1) + 1); +} + +static inline int +atomic_dec(atomic_t *v) +{ + return (atomic_fetchadd_int(&v->counter, -1) - 1); +} +#endif diff --git a/include/os/freebsd/spl/sys/ccompile.h 
b/include/os/freebsd/spl/sys/ccompile.h new file mode 100644 index 000000000000..7268bd1d73a6 --- /dev/null +++ b/include/os/freebsd/spl/sys/ccompile.h @@ -0,0 +1,271 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CCOMPILE_H +#define _SYS_CCOMPILE_H + +/* + * This file contains definitions designed to enable different compilers + * to be used harmoniously on Solaris systems. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Allow for version tests for compiler bugs and features. + */ +#if defined(__GNUC__) +#define __GNUC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#else +#define __GNUC_VERSION 0 +#endif + +#if defined(__ATTRIBUTE_IMPLEMENTED) || defined(__GNUC__) + +/* + * analogous to lint's PRINTFLIKEn + */ +#define __sun_attr___PRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, (__n)+1))) +#define __sun_attr___VPRINTFLIKE__(__n) \ + __attribute__((__format__(printf, __n, 0))) + +/* + * Handle the kernel printf routines that can take '%b' too + */ +#if __GNUC_VERSION < 30402 +/* + * XX64 at least this doesn't work correctly yet with 3.4.1 anyway! + */ +#define __sun_attr___KPRINTFLIKE__ __sun_attr___PRINTFLIKE__ +#define __sun_attr___KVPRINTFLIKE__ __sun_attr___VPRINTFLIKE__ +#else +#define __sun_attr___KPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, (__n)+1))) +#define __sun_attr___KVPRINTFLIKE__(__n) \ + __attribute__((__format__(cmn_err, __n, 0))) +#endif + +/* + * This one's pretty obvious -- the function never returns + */ +#define __sun_attr___noreturn__ __attribute__((__noreturn__)) + + +/* + * This is an appropriate label for functions that do not + * modify their arguments, e.g. strlen() + */ +#define __sun_attr___pure__ __attribute__((__pure__)) + +/* + * This is a stronger form of __pure__. Can be used for functions + * that do not modify their arguments and don't depend on global + * memory. 
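+ *
+ * For example (an illustrative note, not from the original header):
+ * strlen() is only __pure__ because it reads memory through its
+ * argument, whereas abs() could be __const__ since its result depends
+ * solely on its argument value.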
+ */ +#define __sun_attr___const__ __attribute__((__const__)) + +/* + * structure packing like #pragma pack(1) + */ +#define __sun_attr___packed__ __attribute__((__packed__)) + +#define ___sun_attr_inner(__a) __sun_attr_##__a +#define __sun_attr__(__a) ___sun_attr_inner __a + +#else /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +#define __sun_attr__(__a) + +#endif /* __ATTRIBUTE_IMPLEMENTED || __GNUC__ */ + +/* + * Shorthand versions for readability + */ + +#define __PRINTFLIKE(__n) __sun_attr__((__PRINTFLIKE__(__n))) +#define __VPRINTFLIKE(__n) __sun_attr__((__VPRINTFLIKE__(__n))) +#define __KPRINTFLIKE(__n) __sun_attr__((__KPRINTFLIKE__(__n))) +#define __KVPRINTFLIKE(__n) __sun_attr__((__KVPRINTFLIKE__(__n))) +#ifdef _KERNEL +#define __NORETURN __sun_attr__((__noreturn__)) +#endif +#define __CONST __sun_attr__((__const__)) +#define __PURE __sun_attr__((__pure__)) + +#if defined(INVARIANTS) && !defined(ZFS_DEBUG) +#define ZFS_DEBUG +#undef NDEBUG +#endif + +#define EXPORT_SYMBOL(x) +#define MODULE_AUTHOR(s) +#define MODULE_DESCRIPTION(s) +#define MODULE_LICENSE(s) +#define module_param(a, b, c) +#define module_param_call(a, b, c, d, e) +#define module_param_named(a, b, c, d) +#define MODULE_PARM_DESC(a, b) +#define asm __asm +#ifdef ZFS_DEBUG +#undef NDEBUG +#endif +#if !defined(ZFS_DEBUG) && !defined(NDEBUG) +#define NDEBUG +#endif + +#ifndef EINTEGRITY +#define EINTEGRITY 97 /* EINTEGRITY is new in 13 */ +#endif + +/* + * These are bespoke errnos used in ZFS. We map them to their closest FreeBSD + * equivalents. This gives us more useful error messages from strerror(3). + */ +#define ECKSUM EINTEGRITY +#define EFRAGS ENOSPC + +/* Similar for ENOACTIVE */ +#define ENOTACTIVE ECANCELED + +#define EREMOTEIO EREMOTE +#define ECHRNG ENXIO +#define ETIME ETIMEDOUT + +#define O_LARGEFILE 0 +#define O_RSYNC 0 +#define O_DSYNC 0 + +#ifndef LOCORE +#ifndef HAVE_RPC_TYPES +typedef int bool_t; +typedef int enum_t; +#endif +#endif + +#ifndef __cplusplus +#define __init +#define __exit +#endif + +#ifdef _KERNEL +#define param_set_charp(a, b) (0) +#define ATTR_UID AT_UID +#define ATTR_GID AT_GID +#define ATTR_MODE AT_MODE +#define ATTR_XVATTR AT_XVATTR +#define ATTR_CTIME AT_CTIME +#define ATTR_MTIME AT_MTIME +#define ATTR_ATIME AT_ATIME +#define vmem_free zfs_kmem_free +#define vmem_zalloc(size, flags) zfs_kmem_alloc(size, flags | M_ZERO) +#define vmem_alloc zfs_kmem_alloc +#define MUTEX_NOLOCKDEP 0 +#define RW_NOLOCKDEP 0 + +#else +#define FALSE 0 +#define TRUE 1 + /* + * XXX We really need to consolidate on standard + * error codes in the common code + */ +#define ENOSTR ENOTCONN +#define ENODATA EINVAL + + +#define __BSD_VISIBLE 1 +#ifndef IN_BASE +#define __POSIX_VISIBLE 201808 +#define __XSI_VISIBLE 1000 +#endif +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#define open64 open +#define mmap64 mmap +#define pwrite64 pwrite +#define ftruncate64 ftruncate +#define lseek64 lseek +#define pread64 pread +#define stat64 stat +#define lstat64 lstat +#define statfs64 statfs +#define readdir64 readdir +#define dirent64 dirent +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * 
Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + ((((type)(x) - 1) | ((type)(align) - 1)) + 1) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#define RLIM64_INFINITY RLIM_INFINITY +#ifndef HAVE_ERESTART +#define ERESTART EAGAIN +#endif +#define ABS(a) ((a) < 0 ? -(a) : (a)) + +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CCOMPILE_H */ diff --git a/include/os/freebsd/spl/sys/cmn_err.h b/include/os/freebsd/spl/sys/cmn_err.h new file mode 100644 index 000000000000..a75471f647eb --- /dev/null +++ b/include/os/freebsd/spl/sys/cmn_err.h @@ -0,0 +1,100 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CMN_ERR_H +#define _SYS_CMN_ERR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#if !defined(_ASM) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* Common error handling severity levels */ + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +#ifndef _ASM + +/*PRINTFLIKE2*/ +extern void cmn_err(int, const char *, ...) + __KPRINTFLIKE(2); +#pragma rarely_called(cmn_err) + +extern void vzcmn_err(zoneid_t, int, const char *, __va_list) + __KVPRINTFLIKE(3); +#pragma rarely_called(vzcmn_err) + +extern void vcmn_err(int, const char *, __va_list) + __KVPRINTFLIKE(2); +#pragma rarely_called(vcmn_err) + +/*PRINTFLIKE3*/ +extern void zcmn_err(zoneid_t, int, const char *, ...) 
+	__KPRINTFLIKE(3);
+#pragma rarely_called(zcmn_err)
+
+extern void vzprintf(zoneid_t, const char *, __va_list)
+	__KVPRINTFLIKE(2);
+#pragma rarely_called(vzprintf)
+
+/*PRINTFLIKE2*/
+extern void zprintf(zoneid_t, const char *, ...)
+	__KPRINTFLIKE(2);
+#pragma rarely_called(zprintf)
+
+extern void vuprintf(const char *, __va_list)
+	__KVPRINTFLIKE(1);
+#pragma rarely_called(vuprintf)
+
+/*PRINTFLIKE1*/
+extern void panic(const char *, ...)
+	__KPRINTFLIKE(1) __NORETURN;
+#pragma rarely_called(panic)
+
+extern void vpanic(const char *, __va_list)
+	__KVPRINTFLIKE(1) __NORETURN;
+#pragma rarely_called(vpanic)
+
+#endif /* !_ASM */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CMN_ERR_H */
diff --git a/include/os/freebsd/spl/sys/condvar.h b/include/os/freebsd/spl/sys/condvar.h
new file mode 100644
index 000000000000..a42995793bc2
--- /dev/null
+++ b/include/os/freebsd/spl/sys/condvar.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * Copyright (c) 2013 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_CONDVAR_H_
+#define _OPENSOLARIS_SYS_CONDVAR_H_
+
+#include <sys/param.h>
+#include <sys/systm.h>
+
+#include <sys/spl_condvar.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+
+/*
+ * cv_timedwait() is similar to cv_wait() except that it additionally expects
+ * a timeout value specified in ticks. When woken by cv_signal() or
+ * cv_broadcast() it returns 1, otherwise when the timeout is reached -1 is
+ * returned.
+ *
+ * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks
+ * interruptibly and can be woken by a signal (EINTR, ERESTART). When
+ * this occurs 0 is returned.
+ *
+ * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait()
+ * and cv_timedwait_sig() which should be used when waiting for outstanding
+ * IO to complete. They are responsible for updating the iowait accounting
+ * when this is supported by the platform.
+ *
+ * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution
+ * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout
+ * to be specified as a hrtime_t allowing for timeouts of less than a tick.
+ *
+ * N.B.
The return values differ slightly from the illumos implementation + * which returns the time remaining, instead of 1, when woken. They both + * return -1 on timeout. Consumers which need to know the time remaining + * are responsible for tracking it themselves. + */ + +static __inline sbintime_t +zfs_nstosbt(int64_t _ns) +{ + sbintime_t sb = 0; + +#ifdef KASSERT + KASSERT(_ns >= 0, ("Negative values illegal for nstosbt: %jd", _ns)); +#endif + if (_ns >= SBT_1S) { + sb = (_ns / 1000000000) * SBT_1S; + _ns = _ns % 1000000000; + } + /* 9223372037 = ceil(2^63 / 1000000000) */ + sb += ((_ns * 9223372037ull) + 0x7fffffff) >> 31; + return (sb); +} + + +typedef struct cv kcondvar_t; +#define CALLOUT_FLAG_ABSOLUTE C_ABSOLUTE + +typedef enum { + CV_DEFAULT, + CV_DRIVER +} kcv_type_t; + +#define zfs_cv_init(cv, name, type, arg) do { \ + const char *_name; \ + ASSERT((type) == CV_DEFAULT); \ + for (_name = #cv; *_name != '\0'; _name++) { \ + if (*_name >= 'a' && *_name <= 'z') \ + break; \ + } \ + if (*_name == '\0') \ + _name = #cv; \ + cv_init((cv), _name); \ +} while (0) +#define cv_init(cv, name, type, arg) zfs_cv_init(cv, name, type, arg) + + +static inline int +cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + + return (_cv_wait_sig(cvp, &(mp)->lock_object) == 0); +} + +static inline int +cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) +{ + int rc; + + timo -= ddi_get_lbolt(); + if (timo <= 0) + return (-1); + rc = _cv_timedwait_sbt((cvp), &(mp)->lock_object, \ + tick_sbt * (timo), 0, C_HARDCLOCK); + if (rc == EWOULDBLOCK) + return (-1); + return (1); +} + +static inline int +cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t timo) +{ + int rc; + + timo -= ddi_get_lbolt(); + if (timo <= 0) + return (-1); + rc = _cv_timedwait_sig_sbt(cvp, &(mp)->lock_object, \ + tick_sbt * (timo), 0, C_HARDCLOCK); + if (rc == EWOULDBLOCK) + return (-1); + if (rc == EINTR || rc == ERESTART) + return (0); + + return (1); +} + +#define cv_timedwait_io cv_timedwait +#define cv_timedwait_sig_io cv_timedwait_sig + +static inline int +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + hrtime_t hrtime; + int rc; + + ASSERT(tim >= res); + + hrtime = gethrtime(); + if (flag == 0) + tim += hrtime; + + if (hrtime >= tim) + return (-1); + rc = cv_timedwait_sbt(cvp, mp, zfs_nstosbt(tim), + zfs_nstosbt(res), C_ABSOLUTE); + + if (rc == EWOULDBLOCK) + return (-1); + + KASSERT(rc == 0, ("unexpected rc value %d", rc)); + return (1); +} + +static inline int +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + sbintime_t sbt; + hrtime_t hrtime; + int rc; + + ASSERT(tim >= res); + + hrtime = gethrtime(); + if (flag == 0) + tim += hrtime; + + if (hrtime >= tim) + return (-1); + + sbt = zfs_nstosbt(tim); + rc = cv_timedwait_sig_sbt(cvp, mp, sbt, zfs_nstosbt(res), C_ABSOLUTE); + + switch (rc) { + case EWOULDBLOCK: + return (-1); + case EINTR: + case ERESTART: + return (0); + default: + KASSERT(rc == 0, ("unexpected rc value %d", rc)); + return (1); + } +} + +#endif /* _OPENSOLARIS_SYS_CONDVAR_H_ */ diff --git a/include/os/freebsd/spl/sys/console.h b/include/os/freebsd/spl/sys/console.h new file mode 100644 index 000000000000..a0bf8175e6ce --- /dev/null +++ b/include/os/freebsd/spl/sys/console.h @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. 
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPL_CONSOLE_H
+#define _SPL_CONSOLE_H
+
+#define console_vprintf vprintf
+#define console_printf printf
+
+#endif /* _SPL_CONSOLE_H */
diff --git a/include/os/freebsd/spl/sys/cred.h b/include/os/freebsd/spl/sys/cred.h
new file mode 100644
index 000000000000..e32910e0efab
--- /dev/null
+++ b/include/os/freebsd/spl/sys/cred.h
@@ -0,0 +1,188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+/*
+ * Portions of this source code were derived from Berkeley 4.3 BSD
+ * under license from the Regents of the University of California.
+ */
+
+#ifndef _SYS_CRED_H
+#define _SYS_CRED_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * The credential is an opaque kernel private data structure defined in
+ * <sys/cred_impl.h>.
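+ *
+ * On FreeBSD the opaque type is simply struct ucred, so most of the
+ * accessor macros below reduce to direct field reads, e.g. (an
+ * illustrative sketch, not part of the original header):
+ *
+ *	uid_t uid = crgetuid(CRED());	(effective uid of curthread)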
+ */ + +typedef struct ucred cred_t; + +#define CRED() curthread->td_ucred +#define kcred (thread0.td_ucred) + +#define KUID_TO_SUID(x) (x) +#define KGID_TO_SGID(x) (x) +#define crgetuid(cred) ((cred)->cr_uid) +#define crgetruid(cred) ((cred)->cr_ruid) +#define crgetgid(cred) ((cred)->cr_gid) +#define crgetgroups(cred) ((cred)->cr_groups) +#define crgetngroups(cred) ((cred)->cr_ngroups) +#define crgetsid(cred, i) (NULL) + +struct proc; /* cred.h is included in proc.h */ +struct prcred; +struct ksid; +struct ksidlist; +struct credklpd; +struct credgrp; + +struct auditinfo_addr; /* cred.h is included in audit.h */ + +extern int ngroups_max; +/* + * kcred is used when you need all privileges. + */ + +extern void cred_init(void); +extern void crfree(cred_t *); +extern cred_t *cralloc(void); /* all but ref uninitialized */ +extern cred_t *cralloc_ksid(void); /* cralloc() + ksid alloc'ed */ +extern cred_t *crget(void); /* initialized */ +extern void crcopy_to(cred_t *, cred_t *); +extern cred_t *crdup(cred_t *); +extern void crdup_to(cred_t *, cred_t *); +extern cred_t *crgetcred(void); +extern void crset(struct proc *, cred_t *); +extern void crset_zone_privall(cred_t *); +extern int supgroupmember(gid_t, const cred_t *); +extern int hasprocperm(const cred_t *, const cred_t *); +extern int prochasprocperm(struct proc *, struct proc *, const cred_t *); +extern int crcmp(const cred_t *, const cred_t *); +extern cred_t *zone_kcred(void); + +extern gid_t crgetrgid(const cred_t *); +extern gid_t crgetsgid(const cred_t *); + +#define crgetzoneid(x) (0) +extern projid_t crgetprojid(const cred_t *); + +extern cred_t *crgetmapped(const cred_t *); + + +extern const struct auditinfo_addr *crgetauinfo(const cred_t *); +extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); + +extern uint_t crgetref(const cred_t *); + +extern const gid_t *crgetggroups(const struct credgrp *); + + +/* + * Sets real, effective and/or saved uid/gid; + * -1 argument accepted as "no change". + */ +extern int crsetresuid(cred_t *, uid_t, uid_t, uid_t); +extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); + +/* + * Sets real, effective and saved uids/gids all to the same + * values. Both values must be non-negative and <= MAXUID + */ +extern int crsetugid(cred_t *, uid_t, gid_t); + +/* + * Functions to handle the supplemental group list. + */ +extern struct credgrp *crgrpcopyin(int, gid_t *); +extern void crgrprele(struct credgrp *); +extern void crsetcredgrp(cred_t *, struct credgrp *); + +/* + * Private interface for setting zone association of credential. + */ +struct zone; +extern void crsetzone(cred_t *, struct zone *); +extern struct zone *crgetzone(const cred_t *); + +/* + * Private interface for setting project id in credential. + */ +extern void crsetprojid(cred_t *, projid_t); + +/* + * Private interface for nfs. + */ +extern cred_t *crnetadjust(cred_t *); + +/* + * Private interface for procfs. + */ +extern void cred2prcred(const cred_t *, struct prcred *); + +/* + * Private interfaces for Rampart Trusted Solaris. + */ +struct ts_label_s; +extern struct ts_label_s *crgetlabel(const cred_t *); +extern boolean_t crisremote(const cred_t *); + +/* + * Private interfaces for ephemeral uids. 
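+ *
+ * An id passes VALID_UID()/VALID_GID() if it fits the traditional
+ * range (<= MAXUID) or is a zone-ephemeral id. A hedged sketch of a
+ * typical check (the surrounding error handling is hypothetical):
+ *
+ *	if (!VALID_UID(uid, crgetzone(cr)))
+ *		return (EINVAL);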
+ */
+#define VALID_UID(id, zn) \
+	((id) <= MAXUID || valid_ephemeral_uid((zn), (id)))
+
+#define VALID_GID(id, zn) \
+	((id) <= MAXUID || valid_ephemeral_gid((zn), (id)))
+
+extern boolean_t valid_ephemeral_uid(struct zone *, uid_t);
+extern boolean_t valid_ephemeral_gid(struct zone *, gid_t);
+
+extern int eph_uid_alloc(struct zone *, int, uid_t *, int);
+extern int eph_gid_alloc(struct zone *, int, gid_t *, int);
+
+extern void crsetsid(cred_t *, struct ksid *, int);
+extern void crsetsidlist(cred_t *, struct ksidlist *);
+
+extern struct ksidlist *crgetsidlist(const cred_t *);
+
+extern int crsetpriv(cred_t *, ...);
+
+extern struct credklpd *crgetcrklpd(const cred_t *);
+extern void crsetcrklpd(cred_t *, struct credklpd *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRED_H */
diff --git a/include/os/freebsd/spl/sys/ctype.h b/include/os/freebsd/spl/sys/ctype.h
new file mode 100644
index 000000000000..f225858072ab
--- /dev/null
+++ b/include/os/freebsd/spl/sys/ctype.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#ifndef _SPL_SYS_CTYPE_H_
+#define _SPL_SYS_CTYPE_H_
+#include_next <sys/ctype.h>
+
+#define isalnum(ch) (isalpha(ch) || isdigit(ch))
+#define iscntrl(C) (uchar(C) <= 0x1f || uchar(C) == 0x7f)
+#define isgraph(C) ((C) >= 0x21 && (C) <= 0x7E)
+/* BEGIN CSTYLED */
+#define ispunct(C) \
+	(((C) >= 0x21 && (C) <= 0x2F) || \
+	((C) >= 0x3A && (C) <= 0x40) || \
+	((C) >= 0x5B && (C) <= 0x60) || \
+	((C) >= 0x7B && (C) <= 0x7E))
+/* END CSTYLED */
+
+#endif
diff --git a/include/os/freebsd/spl/sys/debug.h b/include/os/freebsd/spl/sys/debug.h
new file mode 100644
index 000000000000..2751f57801f7
--- /dev/null
+++ b/include/os/freebsd/spl/sys/debug.h
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + + +/* + * Common DEBUG functionality. + */ +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +#ifndef expect +#define expect(expr, value) (__builtin_expect((expr), (value))) +#endif +#define likely(expr) expect((expr) != 0, 1) +#define unlikely(expr) expect((expr) != 0, 0) + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) 
\ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + boolean_t _verify3_left = (boolean_t)(LEFT); \ + boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ + } while (0) + +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + int64_t _verify3_left = (int64_t)(LEFT); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + uint64_t _verify3_left = (uint64_t)(LEFT); \ + uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + int64_t _verify3_left = (int64_t)(0); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left == _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) +#define CTASSERT_GLOBAL(x) CTASSERT(x) + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define ASSERT(x) ((void)0) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + +#endif /* SPL_DEBUG_H */ diff --git a/include/os/freebsd/spl/sys/dirent.h b/include/os/freebsd/spl/sys/dirent.h new file mode 100644 index 000000000000..2403766a427d --- /dev/null +++ b/include/os/freebsd/spl/sys/dirent.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_DIRENT_H_
+#define _OPENSOLARIS_SYS_DIRENT_H_
+
+#include <sys/types.h>
+
+#include_next <sys/dirent.h>
+
+typedef struct dirent dirent64_t;
+typedef ino_t ino64_t;
+
+#define dirent64 dirent
+
+#define d_ino d_fileno
+
+#define DIRENT64_RECLEN(len) _GENERIC_DIRLEN(len)
+
+#endif /* !_OPENSOLARIS_SYS_DIRENT_H_ */
diff --git a/include/os/freebsd/spl/sys/disp.h b/include/os/freebsd/spl/sys/disp.h
new file mode 100644
index 000000000000..2be1b76e4334
--- /dev/null
+++ b/include/os/freebsd/spl/sys/disp.h
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2013 Andriy Gapon <avg@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_DISP_H_
+#define _OPENSOLARIS_SYS_DISP_H_
+
+#include <sys/proc.h>
+
+#define kpreempt(x) kern_yield(PRI_USER)
+
+#endif /* _OPENSOLARIS_SYS_DISP_H_ */
diff --git a/include/os/freebsd/spl/sys/dkio.h b/include/os/freebsd/spl/sys/dkio.h
new file mode 100644
index 000000000000..aed54ba50893
--- /dev/null
+++ b/include/os/freebsd/spl/sys/dkio.h
@@ -0,0 +1,494 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * $FreeBSD$
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _OPENSOLARIS_SYS_DKIO_H_
+#define _OPENSOLARIS_SYS_DKIO_H_
+
+#include <sys/dklabel.h> /* Needed for NDKMAP define */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(_SUNOS_VTOC_16)
+#define NDKMAP 16 /* # of logical partitions */
+#define DK_LABEL_LOC 1 /* location of disk label */
+#elif defined(_SUNOS_VTOC_8)
+#define NDKMAP 8 /* # of logical partitions */
+#define DK_LABEL_LOC 0 /* location of disk label */
+#else
+#error "No VTOC format defined."
+#endif
+
+/*
+ * Structures and definitions for disk io control commands
+ */
+
+/*
+ * Structures used as data by ioctl calls.
+ */
+
+#define DK_DEVLEN 16 /* device name max length, including */
+	/* unit # & NULL (ie - "xyc1") */
+
+/*
+ * Used for controller info
+ */
+struct dk_cinfo {
+	char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */
+	ushort_t dki_ctype; /* controller type */
+	ushort_t dki_flags; /* flags */
+	ushort_t dki_cnum; /* controller number */
+	uint_t dki_addr; /* controller address */
+	uint_t dki_space; /* controller bus type */
+	uint_t dki_prio; /* interrupt priority */
+	uint_t dki_vec; /* interrupt vector */
+	char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */
+	uint_t dki_unit; /* unit number */
+	ushort_t dki_partition; /* partition number */
+	ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */
+};
+
+/*
+ * Controller types
+ */
+#define DKC_UNKNOWN 0
+#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */
+#define DKC_WDC2880 2
+#define DKC_XXX_0 3 /* unassigned */
+#define DKC_XXX_1 4 /* unassigned */
+#define DKC_DSD5215 5
+#define DKC_ACB4000 7
+#define DKC_MD21 8
+#define DKC_XXX_2 9 /* unassigned */
+#define DKC_NCRFLOPPY 10
+#define DKC_SMSFLOPPY 12
+#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */
+#define DKC_INTEL82072 14 /* native floppy chip */
+#define DKC_MD 16 /* meta-disk (virtual-disk) driver */
+#define DKC_INTEL82077 19 /* 82077 floppy disk controller */
+#define DKC_DIRECT 20 /* Intel direct attached device i.e.
IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ +#define DKC_VBD 23 /* virtual block device */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * partition headers: section 1 + * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I)) + */ +struct dk_map { + uint64_t dkl_cylno; /* starting cylinder */ + uint64_t dkl_nblk; /* number of blocks; if == 0, */ + /* partition is undefined */ +}; + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * supported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. + * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. 
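+ *
+ * A user-mode caller would issue the flush roughly as follows (an
+ * illustrative sketch; "fd" is an open disk device descriptor):
+ *
+ *	if (ioctl(fd, DKIOCFLUSHWRITECACHE, NULL) != 0)
+ *		warn("write cache flush failed");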
+ * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. + * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. + * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. + * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +/* bit flag definitions for dkc_flag */ +#define FLUSH_VOLATILE 0x1 /* Bit 0: if set, only flush */ + /* volatile cache; otherwise, flush */ + /* volatile and non-volatile cache */ + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. + */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + uint32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + uint64_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. + */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. 
+ */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. + */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 +#define DKT_INVALID_TEMP 0xFFFF + + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. + */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. + * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. + */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. 
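+ *
+ * A hedged usage sketch (not from the original header; "fd" is an open
+ * disk device descriptor):
+ *
+ *	dk_disk_id_t id = { 0 };
+ *	if (ioctl(fd, DKIOC_GETDISKID, &id) == 0 &&
+ *	    id.dkd_dtype == DKD_SCSI_TYPE)
+ *		printf("%.8s\n", id.disk_id.scsi_disk_id.dkd_svendor);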
+ * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. + */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. + */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _OPENSOLARIS_SYS_DKIO_H_ */ diff --git a/include/os/freebsd/spl/sys/extdirent.h b/include/os/freebsd/spl/sys/extdirent.h new file mode 100644 index 000000000000..65ba11f345d2 --- /dev/null +++ b/include/os/freebsd/spl/sys/extdirent.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_EXTDIRENT_H +#define _SYS_EXTDIRENT_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* + * Extended file-system independent directory entry. This style of + * dirent provides additional informational flag bits for each + * directory entry. This dirent will be returned instead of the + * standard dirent if a VOP_READDIR() requests dirent flags via + * V_RDDIR_ENTFLAGS, and if the file system supports the flags. + */ +typedef struct edirent { + ino64_t ed_ino; /* "inode number" of entry */ + off64_t ed_off; /* offset of disk directory entry */ + uint32_t ed_eflags; /* per-entry flags */ + unsigned short ed_reclen; /* length of this record */ + char ed_name[1]; /* name of file */ +} edirent_t; + +#define EDIRENT_RECLEN(namelen) \ + ((offsetof(edirent_t, ed_name[0]) + 1 + (namelen) + 7) & ~ 7) +#define EDIRENT_NAMELEN(reclen) \ + ((reclen) - (offsetof(edirent_t, ed_name[0]))) + +/* + * Extended entry flags + * Extended entries include a bitfield of extra information + * regarding that entry. + */ +#define ED_CASE_CONFLICT 0x10 /* Disconsidering case, entry is not unique */ + +/* + * Extended flags accessor function + */ +#define ED_CASE_CONFLICTS(x) ((x)->ed_eflags & ED_CASE_CONFLICT) +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EXTDIRENT_H */ diff --git a/include/os/freebsd/spl/sys/file.h b/include/os/freebsd/spl/sys/file.h new file mode 100644 index 000000000000..51e59b1133c8 --- /dev/null +++ b/include/os/freebsd/spl/sys/file.h @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
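A small sketch of the record-length arithmetic in EDIRENT_RECLEN() above: the macro rounds the header plus the name and its terminating NUL up to the next multiple of 8, so successive edirent_t records stay 8-byte aligned. Compiling this outside the kernel is an assumption made purely for illustration.

#include <stddef.h>
#include <stdio.h>

/* Illustrative only: print the 8-byte rounding done by EDIRENT_RECLEN(). */
int
main(void)
{
	size_t namelen;

	for (namelen = 1; namelen <= 16; namelen++)
		(void) printf("namelen %2zu -> reclen %zu\n", namelen,
		    (size_t)EDIRENT_RECLEN(namelen));
	return (0);
}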
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FILE_H_ +#define _OPENSOLARIS_SYS_FILE_H_ + +#include +#include_next + +#define FKIOCTL 0x80000000 /* ioctl addresses are from kernel */ + +typedef struct file file_t; + +#include + +static __inline file_t * +getf_caps(int fd, cap_rights_t *rightsp) +{ + struct file *fp; + + if (fget(curthread, fd, rightsp, &fp) == 0) + return (fp); + return (NULL); +} + +#endif /* !_OPENSOLARIS_SYS_FILE_H_ */ diff --git a/include/os/freebsd/spl/sys/freebsd_rwlock.h b/include/os/freebsd/spl/sys/freebsd_rwlock.h new file mode 100644 index 000000000000..b760f8cf23d4 --- /dev/null +++ b/include/os/freebsd/spl/sys/freebsd_rwlock.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2013 EMC Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ +#define _OPENSOLARIS_SYS_FREEBSD_RWLOCK_H_ + +#include_next + +#endif diff --git a/include/os/freebsd/spl/sys/idmap.h b/include/os/freebsd/spl/sys/idmap.h new file mode 100644 index 000000000000..39eeb905c72b --- /dev/null +++ b/include/os/freebsd/spl/sys/idmap.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
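A kernel-side sketch of the getf_caps() helper above: it resolves a descriptor to a file_t only if the descriptor carries the requested Capsicum rights. The rights set and error value are illustrative, and the caller owns the reference fget() took, releasing it with fdrop(9) when done.

#include <sys/param.h>
#include <sys/capsicum.h>
#include <sys/errno.h>
#include <sys/file.h>

/* Sketch: look up an fd that must at least be readable. */
static int
example_lookup(int fd, file_t **fpp)
{
	cap_rights_t rights;

	*fpp = getf_caps(fd, cap_rights_init(&rights, CAP_READ));
	return (*fpp != NULL ? 0 : EBADF);
}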
+ */ + +#ifndef _SYS_IDMAP_H +#define _SYS_IDMAP_H + + +/* Idmap status codes */ +#define IDMAP_SUCCESS 0 +#define IDMAP_NEXT 1 +#define IDMAP_ERR_OTHER -10000 +#define IDMAP_ERR_INTERNAL -9999 +#define IDMAP_ERR_MEMORY -9998 +#define IDMAP_ERR_NORESULT -9997 +#define IDMAP_ERR_NOTUSER -9996 +#define IDMAP_ERR_NOTGROUP -9995 +#define IDMAP_ERR_NOTSUPPORTED -9994 +#define IDMAP_ERR_W2U_NAMERULE -9993 +#define IDMAP_ERR_U2W_NAMERULE -9992 +#define IDMAP_ERR_CACHE -9991 +#define IDMAP_ERR_DB -9990 +#define IDMAP_ERR_ARG -9989 +#define IDMAP_ERR_SID -9988 +#define IDMAP_ERR_IDTYPE -9987 +#define IDMAP_ERR_RPC_HANDLE -9986 +#define IDMAP_ERR_RPC -9985 +#define IDMAP_ERR_CLIENT_HANDLE -9984 +#define IDMAP_ERR_BUSY -9983 +#define IDMAP_ERR_PERMISSION_DENIED -9982 +#define IDMAP_ERR_NOMAPPING -9981 +#define IDMAP_ERR_NEW_ID_ALLOC_REQD -9980 +#define IDMAP_ERR_DOMAIN -9979 +#define IDMAP_ERR_SECURITY -9978 +#define IDMAP_ERR_NOTFOUND -9977 +#define IDMAP_ERR_DOMAIN_NOTFOUND -9976 +#define IDMAP_ERR_UPDATE_NOTALLOWED -9975 +#define IDMAP_ERR_CFG -9974 +#define IDMAP_ERR_CFG_CHANGE -9973 +#define IDMAP_ERR_NOTMAPPED_WELLKNOWN -9972 +#define IDMAP_ERR_RETRIABLE_NET_ERR -9971 +#define IDMAP_ERR_W2U_NAMERULE_CONFLICT -9970 +#define IDMAP_ERR_U2W_NAMERULE_CONFLICT -9969 +#define IDMAP_ERR_BAD_UTF8 -9968 +#define IDMAP_ERR_NONE_GENERATED -9967 +#define IDMAP_ERR_PROP_UNKNOWN -9966 +#define IDMAP_ERR_NS_LDAP_OP_FAILED -9965 +#define IDMAP_ERR_NS_LDAP_PARTIAL -9964 +#define IDMAP_ERR_NS_LDAP_CFG -9963 +#define IDMAP_ERR_NS_LDAP_BAD_WINNAME -9962 +#define IDMAP_ERR_NO_ACTIVEDIRECTORY -9961 + +/* Reserved GIDs for some well-known SIDs */ +#define IDMAP_WK_LOCAL_SYSTEM_GID 2147483648U /* 0x80000000 */ +#define IDMAP_WK_CREATOR_GROUP_GID 2147483649U +#define IDMAP_WK__MAX_GID 2147483649U + +/* Reserved UIDs for some well-known SIDs */ +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U +#define IDMAP_WK__MAX_UID 2147483648U + +/* Reserved SIDs */ +#define IDMAP_WK_CREATOR_SID_AUTHORITY "S-1-3" + +/* + * Max door RPC size for ID mapping (can't be too large relative to the + * default user-land thread stack size, since clnt_door_call() + * alloca()s). See libidmap:idmap_init(). + */ +#define IDMAP_MAX_DOOR_RPC (256 * 1024) + +#define IDMAP_SENTINEL_PID UINT32_MAX +#define IDMAP_ID_IS_EPHEMERAL(pid) \ + (((pid) > INT32_MAX) && ((pid) != IDMAP_SENTINEL_PID)) + +#endif /* _SYS_IDMAP_H */ diff --git a/include/os/freebsd/spl/sys/inttypes.h b/include/os/freebsd/spl/sys/inttypes.h new file mode 100644 index 000000000000..651685d30473 --- /dev/null +++ b/include/os/freebsd/spl/sys/inttypes.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/isa_defs.h b/include/os/freebsd/spl/sys/isa_defs.h new file mode 100644 index 000000000000..399d510b5f9b --- /dev/null +++ b/include/os/freebsd/spl/sys/isa_defs.h @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
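The ephemeral-ID test above reads: any ID above INT32_MAX is ephemeral except the UINT32_MAX sentinel. A minimal sketch (the wrapper name is invented):

#include <stdbool.h>
#include <stdint.h>

/*
 * Sketch: IDMAP_WK_CREATOR_OWNER_UID (0x80000000) tests as ephemeral;
 * IDMAP_SENTINEL_PID (UINT32_MAX) and ordinary POSIX IDs do not.
 */
static bool
uid_is_ephemeral(uint32_t uid)
{
	return (IDMAP_ID_IS_EPHEMERAL(uid));
}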
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H +#include + +/* + * This header file serves to group a set of well known defines and to + * set these for each instruction set architecture. These defines may + * be divided into two groups; characteristics of the processor and + * implementation choices for Solaris on a processor. + * + * Processor Characteristics: + * + * _LITTLE_ENDIAN / _BIG_ENDIAN: + * The natural byte order of the processor. A pointer to an int points + * to the least/most significant byte of that int. + * + * _STACK_GROWS_UPWARD / _STACK_GROWS_DOWNWARD: + * The processor specific direction of stack growth. A push onto the + * stack increases/decreases the stack pointer, so it stores data at + * successively higher/lower addresses. (Stackless machines ignored + * without regrets). + * + * _LONG_LONG_HTOL / _LONG_LONG_LTOH: + * A pointer to a long long points to the most/least significant long + * within that long long. + * + * _BIT_FIELDS_HTOL / _BIT_FIELDS_LTOH: + * The C compiler assigns bit fields from the high/low to the low/high end + * of an int (most to least significant vs. least to most significant). + * + * _IEEE_754: + * The processor (or supported implementations of the processor) + * supports the ieee-754 floating point standard. No other floating + * point standards are supported (or significant). Any other supported + * floating point formats are expected to be cased on the ISA processor + * symbol. + * + * _CHAR_IS_UNSIGNED / _CHAR_IS_SIGNED: + * The C Compiler implements objects of type `char' as `unsigned' or + * `signed' respectively. This is really an implementation choice of + * the compiler writer, but it is specified in the ABI and tends to + * be uniform across compilers for an instruction set architecture. + * Hence, it has the properties of a processor characteristic. + * + * _CHAR_ALIGNMENT / _SHORT_ALIGNMENT / _INT_ALIGNMENT / _LONG_ALIGNMENT / + * _LONG_LONG_ALIGNMENT / _DOUBLE_ALIGNMENT / _LONG_DOUBLE_ALIGNMENT / + * _POINTER_ALIGNMENT / _FLOAT_ALIGNMENT: + * The ABI defines alignment requirements of each of the primitive + * object types. Some, if not all, may be hardware requirements as + * well. The values are expressed in "byte-alignment" units. + * + * _MAX_ALIGNMENT: + * The most stringent alignment requirement as specified by the ABI. + * Equal to the maximum of all the above _XXX_ALIGNMENT values. + * + * _ALIGNMENT_REQUIRED: + * True or false (1 or 0) whether or not the hardware requires the ABI + * alignment. + * + * _LONG_LONG_ALIGNMENT_32 + * The 32-bit ABI supported by a 64-bit kernel may have different + * alignment requirements for primitive object types. The value of this + * identifier is expressed in "byte-alignment" units. + * + * _HAVE_CPUID_INSN + * This indicates that the architecture supports the 'cpuid' + * instruction as defined by Intel. (Intel allows other vendors + * to extend the instruction for their own purposes.) + * + * + * Implementation Choices: + * + * _ILP32 / _LP64: + * This specifies the compiler data type implementation as specified in + * the relevant ABI. 
The choice between these is strongly influenced + * by the underlying hardware, but is not absolutely tied to it. + * Currently only two data type models are supported: + * + * _ILP32: + * Int/Long/Pointer are 32 bits. This is the historical UNIX + * and Solaris implementation. Due to its historical standing, + * this is the default case. + * + * _LP64: + * Long/Pointer are 64 bits, Int is 32 bits. This is the chosen + * implementation for 64-bit ABIs such as SPARC V9. + * + * _I32LPx: + * A compilation environment where 'int' is 32-bit, and + * longs and pointers are simply the same size. + * + * In all cases, Char is 8 bits and Short is 16 bits. + * + * _SUNOS_VTOC_8 / _SUNOS_VTOC_16 / _SVR4_VTOC_16: + * This specifies the form of the disk VTOC (or label): + * + * _SUNOS_VTOC_8: + * This is a VTOC form which is upwardly compatible with the + * SunOS 4.x disk label and allows 8 partitions per disk. + * + * _SUNOS_VTOC_16: + * In this format the incore vtoc image matches the ondisk + * version. It allows 16 slices per disk, and is not + * compatible with the SunOS 4.x disk label. + * + * Note that these are not the only two VTOC forms possible and + * additional forms may be added. One possible form would be the + * SVr4 VTOC form. The symbol for that is reserved now, although + * it is not implemented. + * + * _SVR4_VTOC_16: + * This VTOC form is compatible with the System V Release 4 + * VTOC (as implemented on the SVr4 Intel and 3b ports) with + * 16 partitions per disk. + * + * + * _DMA_USES_PHYSADDR / _DMA_USES_VIRTADDR + * This describes the type of addresses used by system DMA: + * + * _DMA_USES_PHYSADDR: + * This type of DMA, used in the x86 implementation, + * requires physical addresses for DMA buffers. The 24-bit + * addresses used by some legacy boards is the source of the + * "low-memory" (<16MB) requirement for some devices using DMA. + * + * _DMA_USES_VIRTADDR: + * This method of DMA allows the use of virtual addresses for + * DMA transfers. + * + * _FIRMWARE_NEEDS_FDISK / _NO_FDISK_PRESENT + * This indicates the presence/absence of an fdisk table. + * + * _FIRMWARE_NEEDS_FDISK + * The fdisk table is required by system firmware. If present, + * it allows a disk to be subdivided into multiple fdisk + * partitions, each of which is equivalent to a separate, + * virtual disk. This enables the co-existence of multiple + * operating systems on a shared hard disk. + * + * _NO_FDISK_PRESENT + * If the fdisk table is absent, it is assumed that the entire + * media is allocated for a single operating system. + * + * _HAVE_TEM_FIRMWARE + * Defined if this architecture has the (fallback) option of + * using prom_* calls for doing I/O if a suitable kernel driver + * is not available to do it. + * + * _DONT_USE_1275_GENERIC_NAMES + * Controls whether or not device tree node names should + * comply with the IEEE 1275 "Generic Names" Recommended + * Practice. With _DONT_USE_GENERIC_NAMES, device-specific + * names identifying the particular device will be used. + * + * __i386_COMPAT + * This indicates whether the i386 ABI is supported as a *non-native* + * mode for the platform. When this symbol is defined: + * - 32-bit xstat-style system calls are enabled + * - 32-bit xmknod-style system calls are enabled + * - 32-bit system calls use i386 sizes -and- alignments + * + * Note that this is NOT defined for the i386 native environment! 
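As a sketch of how the _ILP32/_LP64 data models described above are typically consumed (the type and macro names here are invented for illustration):

/*
 * Sketch: choose a pointer-sized unsigned type from the data model.
 * example_uintptr_t and EXAMPLE_PRIxPTR are illustrative names.
 */
#if defined(_LP64)
typedef unsigned long example_uintptr_t;	/* long and pointers are 64-bit */
#define	EXAMPLE_PRIxPTR	"lx"
#elif defined(_ILP32)
typedef unsigned int example_uintptr_t;		/* int, long, pointers are 32-bit */
#define	EXAMPLE_PRIxPTR	"x"
#else
#error "unknown data model"
#endif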
+ * + * __x86 + * This is ONLY a synonym for defined(__i386) || defined(__amd64) + * which is useful only insofar as these two architectures share + * common attributes. Analogous to __sparc. + * + * _PSM_MODULES + * This indicates whether or not the implementation uses PSM + * modules for processor support, reading /etc/mach from inside + * the kernel to extract a list. + * + * _RTC_CONFIG + * This indicates whether or not the implementation uses /etc/rtc_config + * to configure the real-time clock in the kernel. + * + * _UNIX_KRTLD + * This indicates that the implementation uses a dynamically + * linked unix + krtld to form the core kernel image at boot + * time, or (in the absence of this symbol) a prelinked kernel image. + * + * _OBP + * This indicates the firmware interface is OBP. + * + * _SOFT_HOSTID + * This indicates that the implementation obtains the hostid + * from the file /etc/hostid, rather than from hardware. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The following set of definitions characterize Solaris on AMD's + * 64-bit systems. + */ +#if defined(__x86_64) || defined(__amd64) + +#if !defined(__amd64) +#define __amd64 /* preferred guard */ +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Different alignment constraints for the i386 ABI in compatibility mode + */ +#define _LONG_LONG_ALIGNMENT_32 4 + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define __i386_COMPAT +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +/* + * The feature test macro __i386 is generic for all processors implementing + * the Intel 386 instruction set or a superset of it. Specifically, this + * includes all members of the 386, 486, and Pentium family of processors. 
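The amd64 alignment characteristics above can be cross-checked against the compiler; a C11 sketch, assuming _Static_assert is available in the build environment:

#if defined(__amd64)
/* Sketch: the ABI constants should agree with the compiler's view. */
_Static_assert(_Alignof(long long) == _LONG_LONG_ALIGNMENT,
    "long long alignment mismatch");
_Static_assert(_Alignof(double) == _DOUBLE_ALIGNMENT,
    "double alignment mismatch");
_Static_assert(_Alignof(void *) == _POINTER_ALIGNMENT,
    "pointer alignment mismatch");
#endif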
+ */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _SOFT_HOSTID +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__aarch64__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__riscv) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_UNSIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 +#define _ALIGNMENT_REQUIRED 1 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#define _LP64 +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__arm__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 
+#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__mips__) + +/* + * Define the appropriate "processor characteristics" + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_LTOH +#define _BIT_FIELDS_LTOH +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#if defined(__mips_n64) +#define _LONG_ALIGNMENT 8 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _INT_ALIGNMENT +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#else +#define _LONG_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 4 +#define _DOUBLE_ALIGNMENT 4 +#define _DOUBLE_COMPLEX_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 4 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 4 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 4 +#define _ALIGNMENT_REQUIRED 0 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices". + */ +#if !defined(_ILP32) +#define _ILP32 +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#endif +#define _SUNOS_VTOC_16 +#define _DMA_USES_PHYSADDR +#define _FIRMWARE_NEEDS_FDISK +#define _PSM_MODULES +#define _RTC_CONFIG +#define _DONT_USE_1275_GENERIC_NAMES +#define _HAVE_CPUID_INSN + +#elif defined(__powerpc__) + +#if defined(__BIG_ENDIAN__) +#define _BIT_FIELDS_HTOL +#else +#define _BIT_FIELDS_LTOH +#endif + +#if !defined(__powerpc) +#define __powerpc +#endif + +#if defined(__powerpc64__) +#define _LONG_LONG_ALIGNMENT 8 +#define _MULTI_DATAMODEL +#else +#define _LONG_LONG_ALIGNMENT 4 +#endif +#define _LONG_LONG_ALIGNMENT_32 4 +#define _ALIGNMENT_REQUIRED 1 + +#define _SUNOS_VTOC_16 1 + +/* + * The following set of definitions characterize the Solaris on SPARC systems. + * + * The symbol __sparc indicates any of the SPARC family of processor + * architectures. This includes SPARC V7, SPARC V8 and SPARC V9. + * + * The symbol __sparcv8 indicates the 32-bit SPARC V8 architecture as defined + * by Version 8 of the SPARC Architecture Manual. (SPARC V7 is close enough + * to SPARC V8 for the former to be subsumed into the latter definition.) + * + * The symbol __sparcv9 indicates the 64-bit SPARC V9 architecture as defined + * by Version 9 of the SPARC Architecture Manual. + * + * The symbols __sparcv8 and __sparcv9 are mutually exclusive, and are only + * relevant when the symbol __sparc is defined. 
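A sketch of what the _LONG_LONG_HTOL/_LONG_LONG_LTOH distinction above means in practice: viewed as two 32-bit words, a 64-bit value stores its most significant word first under HTOL (e.g. big-endian SPARC) and last under LTOH (e.g. x86).

#include <stdint.h>
#include <string.h>

/* Sketch: extract the most significant 32-bit word of a 64-bit value. */
static uint32_t
high_word(uint64_t v)
{
	uint32_t w[2];

	(void) memcpy(w, &v, sizeof (w));
#if defined(_LONG_LONG_HTOL)
	return (w[0]);		/* MSW stored first */
#else	/* _LONG_LONG_LTOH */
	return (w[1]);		/* MSW stored last */
#endif
}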
+ */ +/* + * XXX Due to the existence of 5110166, "defined(__sparcv9)" needs to be added + * to support backwards builds. This workaround should be removed in s10_71. + */ +#elif defined(__sparc) || defined(__sparcv9) || defined(__sparc__) +#if !defined(__sparc) +#define __sparc +#endif + +/* + * You can be 32-bit or 64-bit, but not both at the same time. + */ +#if defined(__sparcv8) && defined(__sparcv9) +#error "SPARC Versions 8 and 9 are mutually exclusive choices" +#endif + +/* + * Existing compilers do not set __sparcv8. Years will transpire before + * the compilers can be depended on to set the feature test macro. In + * the interim, we'll set it here on the basis of historical behaviour; + * if you haven't asked for SPARC V9, then you must've meant SPARC V8. + */ +#if !defined(__sparcv9) && !defined(__sparcv8) +#define __sparcv8 +#endif + +/* + * Define the appropriate "processor characteristics" shared between + * all Solaris on SPARC systems. + */ +#define _STACK_GROWS_DOWNWARD +#define _LONG_LONG_HTOL +#define _BIT_FIELDS_HTOL +#define _IEEE_754 +#define _CHAR_IS_SIGNED +#define _BOOL_ALIGNMENT 1 +#define _CHAR_ALIGNMENT 1 +#define _SHORT_ALIGNMENT 2 +#define _INT_ALIGNMENT 4 +#define _FLOAT_ALIGNMENT 4 +#define _FLOAT_COMPLEX_ALIGNMENT 4 +#define _LONG_LONG_ALIGNMENT 8 +#define _DOUBLE_ALIGNMENT 8 +#define _DOUBLE_COMPLEX_ALIGNMENT 8 +#define _ALIGNMENT_REQUIRED 1 + +/* + * Define the appropriate "implementation choices" shared between versions. + */ +#define _SUNOS_VTOC_8 +#define _DMA_USES_VIRTADDR +#define _NO_FDISK_PRESENT +#define _HAVE_TEM_FIRMWARE +#define _OBP + +/* + * The following set of definitions characterize the implementation of + * 32-bit Solaris on SPARC V8 systems. + */ +#if defined(__sparcv8) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 4 +#define _LONG_DOUBLE_ALIGNMENT 8 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 8 +#define _POINTER_ALIGNMENT 4 +#define _MAX_ALIGNMENT 8 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#define _ILP32 +#if !defined(_I32LPx) +#define _I32LPx +#endif + +/* + * The following set of definitions characterize the implementation of + * 64-bit Solaris on SPARC V9 systems. + */ +#elif defined(__sparcv9) + +/* + * Define the appropriate "processor characteristics" + */ +#define _LONG_ALIGNMENT 8 +#define _LONG_DOUBLE_ALIGNMENT 16 +#define _LONG_DOUBLE_COMPLEX_ALIGNMENT 16 +#define _POINTER_ALIGNMENT 8 +#define _MAX_ALIGNMENT 16 + +#define _LONG_LONG_ALIGNMENT_32 _LONG_LONG_ALIGNMENT + +/* + * Define the appropriate "implementation choices" + */ +#if !defined(_LP64) +#error "_LP64 not defined" +#endif +#if !defined(_I32LPx) +#define _I32LPx +#endif +#define _MULTI_DATAMODEL + +#else +#error "unknown SPARC version" +#endif + +/* + * #error is strictly ansi-C, but works as well as anything for K&R systems. 
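The SPARC section above sets _BIT_FIELDS_HTOL where x86 sets _BIT_FIELDS_LTOH; a sketch of why code that overlays bit-fields on external formats must consult these macros:

/*
 * Sketch: the same 4-bit field lands in opposite ends of the int
 * depending on the compiler's bit-field allocation order.
 */
struct example_flags {
#if defined(_BIT_FIELDS_HTOL)
	unsigned int ef_hi : 4;		/* most significant bits */
	unsigned int ef_rest : 28;
#else	/* _BIT_FIELDS_LTOH */
	unsigned int ef_rest : 28;
	unsigned int ef_hi : 4;		/* most significant bits */
#endif
};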
+ */ +#else +#error "ISA not supported" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#if BYTE_ORDER == _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN +#elif BYTE_ORDER == _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN +#else +#error "unknown byte order" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/include/os/freebsd/spl/sys/kidmap.h b/include/os/freebsd/spl/sys/kidmap.h new file mode 100644 index 000000000000..dc0cf5988a42 --- /dev/null +++ b/include/os/freebsd/spl/sys/kidmap.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_KIDMAP_H_ +#define _OPENSOLARIS_SYS_KIDMAP_H_ + +#include + +typedef int32_t idmap_stat; +typedef void idmap_get_handle_t; + +#define kidmap_get_create() (NULL) +#define kidmap_get_destroy(hdl) do { } while (0) +#define kidmap_get_mappings(hdl) (NULL) + +#endif /* _OPENSOLARIS_SYS_KIDMAP_H_ */ diff --git a/include/os/freebsd/spl/sys/kmem.h b/include/os/freebsd/spl/sys/kmem.h new file mode 100644 index 000000000000..28c65e74a250 --- /dev/null +++ b/include/os/freebsd/spl/sys/kmem.h @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_KMEM_H_ +#define _OPENSOLARIS_SYS_KMEM_H_ + +#include +#include +#include +#include + +#include +#include +#include + +MALLOC_DECLARE(M_SOLARIS); + +#define POINTER_IS_VALID(p) (!((uintptr_t)(p) & 0x3)) +#define POINTER_INVALIDATE(pp) (*(pp) = (void *)((uintptr_t)(*(pp)) | 0x1)) + +#define KM_SLEEP M_WAITOK +#define KM_PUSHPAGE M_WAITOK +#define KM_NOSLEEP M_NOWAIT +#define KM_NORMALPRI 0 +#define KMC_NODEBUG UMA_ZONE_NODUMP + +typedef struct vmem vmem_t; + +extern char *kmem_asprintf(const char *, ...); +extern char *kmem_vasprintf(const char *fmt, va_list ap); + +typedef struct kmem_cache { + char kc_name[32]; +#if !defined(KMEM_DEBUG) + uma_zone_t kc_zone; +#else + size_t kc_size; +#endif + int (*kc_constructor)(void *, void *, int); + void (*kc_destructor)(void *, void *); + void *kc_private; +} kmem_cache_t; + +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); + +void *zfs_kmem_alloc(size_t size, int kmflags); +void zfs_kmem_free(void *buf, size_t size); +uint64_t kmem_size(void); +kmem_cache_t *kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), + void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags); +void kmem_cache_destroy(kmem_cache_t *cache); +void *kmem_cache_alloc(kmem_cache_t *cache, int flags); +void kmem_cache_free(kmem_cache_t *cache, void *buf); +boolean_t kmem_cache_reap_active(void); +void kmem_cache_reap_soon(kmem_cache_t *); +void kmem_reap(void); +int kmem_debugging(void); +void *calloc(size_t n, size_t s); + + +#define kmem_cache_reap_now kmem_cache_reap_soon +#define freemem vm_free_count() +#define minfree vm_cnt.v_free_min +#define kmem_alloc(size, kmflags) zfs_kmem_alloc((size), (kmflags)) +#define kmem_zalloc(size, kmflags) \ + zfs_kmem_alloc((size), (kmflags) | M_ZERO) +#define kmem_free(buf, size) zfs_kmem_free((buf), (size)) + + +#endif /* _OPENSOLARIS_SYS_KMEM_H_ */ diff --git a/include/os/freebsd/spl/sys/kmem_cache.h b/include/os/freebsd/spl/sys/kmem_cache.h new file mode 100644 index 000000000000..d8e0349e4ca3 --- /dev/null +++ b/include/os/freebsd/spl/sys/kmem_cache.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
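A kernel-side sketch of the allocation wrappers above, which map the Solaris kmem API onto FreeBSD malloc(9) flags (KM_SLEEP becomes M_WAITOK, KM_NOSLEEP becomes M_NOWAIT); the function names are illustrative:

#include <sys/kmem.h>

/* Sketch: Solaris-style sized allocation. */
static void *
example_alloc(size_t len)
{
	/* KM_SLEEP maps to M_WAITOK, so the allocation cannot fail. */
	return (kmem_zalloc(len, KM_SLEEP));
}

static void
example_free(void *buf, size_t len)
{
	/* Unlike free(9), the Solaris interface passes the size back. */
	kmem_free(buf, len);
}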
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + + +#ifndef _SPL_KMEM_CACHE_H +#define _SPL_KMEM_CACHE_H + +#include + +/* kmem move callback return values */ +typedef enum kmem_cbrc { + KMEM_CBRC_YES = 0, /* Object moved */ + KMEM_CBRC_NO = 1, /* Object not moved */ + KMEM_CBRC_LATER = 2, /* Object not moved, try again later */ + KMEM_CBRC_DONT_NEED = 3, /* Neither object is needed */ + KMEM_CBRC_DONT_KNOW = 4, /* Object unknown */ +} kmem_cbrc_t; + +extern void spl_kmem_cache_set_move(kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); + +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) + +#endif diff --git a/include/os/freebsd/spl/sys/kstat.h b/include/os/freebsd/spl/sys/kstat.h new file mode 100644 index 000000000000..74c3da8ec376 --- /dev/null +++ b/include/os/freebsd/spl/sys/kstat.h @@ -0,0 +1,207 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
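A sketch of the kmem_cbrc_t protocol above: a cache whose objects cannot be relocated can register a move callback that always declines. The callback name is invented.

#include <sys/kmem.h>
#include <sys/kmem_cache.h>

/* Sketch: decline every defragmentation move request. */
static kmem_cbrc_t
example_move(void *old, void *new, size_t size, void *priv)
{
	(void) old; (void) new; (void) size; (void) priv;
	return (KMEM_CBRC_NO);		/* object not moved */
}

/* Registered as: kmem_cache_set_move(cache, example_move); */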
+ */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H +#include +#include +struct list_head {}; +#include +#include + +#define KSTAT_STRLEN 255 +#define KSTAT_RAW_MAX (128*1024) + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 5 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 +#define KSTAT_FLAG_NO_HEADERS 0x80 + +#define KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; +typedef struct kstat_s kstat_t; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +typedef struct kstat_module { + char ksm_name[KSTAT_STRLEN+1]; /* module name */ + struct list_head ksm_module_list; /* module linkage */ + struct list_head ksm_kstat_list; /* list of kstat entries */ + struct proc_dir_entry *ksm_proc; /* proc entry */ +} kstat_module_t; + +typedef struct kstat_raw_ops { + int (*headers)(char *buf, size_t size); + int (*data)(char *buf, size_t size, void *data); + void *(*addr)(kstat_t *ksp, loff_t index); +} kstat_raw_ops_t; + +struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + char ks_module[KSTAT_STRLEN+1]; /* provider module name */ + int ks_instance; /* provider module instance */ + char ks_name[KSTAT_STRLEN+1]; /* kstat name */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of data records */ + size_t ks_data_size; /* size of kstat data section */ + kstat_update_t *ks_update; /* dynamic updates */ + void *ks_private; /* private data */ + kmutex_t ks_private_lock; /* kstat private data lock */ + kmutex_t *ks_lock; /* kstat data lock */ + struct list_head ks_list; /* kstat linkage */ + kstat_module_t *ks_owner; /* kstat module linkage */ + kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ + char *ks_raw_buf; /* buf used for raw ops */ + size_t ks_raw_bufsize; /* size of raw ops buffer */ + struct sysctl_ctx_list ks_sysctl_ctx; + struct sysctl_oid *ks_sysctl_root; + +}; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int 
*/ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len*time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +int spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, + const char *ks_name, const char *ks_class, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags); + +extern void __kstat_install(kstat_t *ksp); +extern void __kstat_delete(kstat_t *ksp); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); + +#define kstat_set_raw_ops(k, h, d, a) \ + __kstat_set_raw_ops(k, h, d, a) +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) + +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/freebsd/spl/sys/list.h b/include/os/freebsd/spl/sys/list.h new file mode 100644 index 000000000000..8339b6226d11 --- /dev/null +++ b/include/os/freebsd/spl/sys/list.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
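A sketch of publishing one named counter through the kstat interfaces above; the module and statistic names are invented for illustration.

#include <sys/kstat.h>

static kstat_t *example_ksp;
static kstat_named_t example_data = {
	.name = "example_count",
	.data_type = KSTAT_DATA_UINT64,
};

static void
example_kstat_init(void)
{
	/* KSTAT_FLAG_VIRTUAL: ks_data is supplied by the caller. */
	example_ksp = kstat_create("example", 0, "stats", "misc",
	    KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
	if (example_ksp != NULL) {
		example_ksp->ks_data = &example_data;
		kstat_install(example_ksp);
	}
}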
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_H +#define _SYS_LIST_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct list_node list_node_t; +typedef struct list list_t; + +void list_create(list_t *, size_t, size_t); +void list_destroy(list_t *); + +void list_insert_after(list_t *, void *, void *); +void list_insert_before(list_t *, void *, void *); +void list_insert_head(list_t *, void *); +void list_insert_tail(list_t *, void *); +void list_remove(list_t *, void *); +void *list_remove_head(list_t *); +void *list_remove_tail(list_t *); +void list_move_tail(list_t *, list_t *); + +void *list_head(list_t *); +void *list_tail(list_t *); +void *list_next(list_t *, void *); +void *list_prev(list_t *, void *); +int list_is_empty(list_t *); + +void list_link_init(list_node_t *); +void list_link_replace(list_node_t *, list_node_t *); + +int list_link_active(list_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_H */ diff --git a/include/os/freebsd/spl/sys/list_impl.h b/include/os/freebsd/spl/sys/list_impl.h new file mode 100644 index 000000000000..9c42f8832023 --- /dev/null +++ b/include/os/freebsd/spl/sys/list_impl.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_LIST_IMPL_H +#define _SYS_LIST_IMPL_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct list_node { + struct list_node *list_next; + struct list_node *list_prev; +}; + +struct list { + size_t list_size; + size_t list_offset; + struct list_node list_head; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_LIST_IMPL_H */ diff --git a/include/os/freebsd/spl/sys/lock.h b/include/os/freebsd/spl/sys/lock.h new file mode 100644 index 000000000000..7d5dc26abc74 --- /dev/null +++ b/include/os/freebsd/spl/sys/lock.h @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_LOCK_H_ +#define _OPENSOLARIS_SYS_LOCK_H_ + +#include_next + +#define LO_ALLMASK (LO_INITIALIZED | LO_WITNESS | LO_QUIET | \ + LO_RECURSABLE | LO_SLEEPABLE | LO_UPGRADABLE | \ + LO_DUPOK | LO_CLASSMASK | LO_NOPROFILE) +#define LO_EXPECTED (LO_INITIALIZED | LO_WITNESS | LO_RECURSABLE | \ + LO_SLEEPABLE | LO_UPGRADABLE | LO_DUPOK | (2 << LO_CLASSSHIFT)) + +#endif /* _OPENSOLARIS_SYS_LOCK_H_ */ diff --git a/include/os/freebsd/spl/sys/misc.h b/include/os/freebsd/spl/sys/misc.h new file mode 100644 index 000000000000..e39bb07b2f4c --- /dev/null +++ b/include/os/freebsd/spl/sys/misc.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
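A sketch of the list_t interface declared in list.h above; the embedding type is illustrative. list_create() is told the node size and the offset of the embedded list_node_t so the list code can convert between nodes and their linkage.

#include <sys/list.h>
#include <stddef.h>

typedef struct example_node {
	int		en_value;
	list_node_t	en_link;	/* embedded linkage */
} example_node_t;

static void
example_list(example_node_t *a, example_node_t *b)
{
	list_t l;
	example_node_t *n;

	list_create(&l, sizeof (example_node_t),
	    offsetof(example_node_t, en_link));
	list_insert_tail(&l, a);
	list_insert_tail(&l, b);
	for (n = list_head(&l); n != NULL; n = list_next(&l, n))
		(void) n->en_value;	/* visit in insertion order */
	while (list_remove_head(&l) != NULL)
		continue;
	list_destroy(&l);
}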
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_MISC_H_ +#define _OPENSOLARIS_SYS_MISC_H_ + +#include + +#define MAXUID UID_MAX + +#define _ACL_ACLENT_ENABLED 0x1 +#define _ACL_ACE_ENABLED 0x2 + +#define _FIOFFS (INT_MIN) +#define _FIOGDIO (INT_MIN+1) +#define _FIOSDIO (INT_MIN+2) + +#define _FIO_SEEK_DATA FIOSEEKDATA +#define _FIO_SEEK_HOLE FIOSEEKHOLE + +struct opensolaris_utsname { + char *sysname; + char *nodename; + char *release; + char version[32]; + char *machine; +}; + +extern char hw_serial[11]; + +#endif /* _OPENSOLARIS_SYS_MISC_H_ */ diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h new file mode 100644 index 000000000000..ec1da1a46ae6 --- /dev/null +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
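A userland sketch of the _FIO_SEEK_DATA/_FIO_SEEK_HOLE aliases above, which map to FreeBSD's FIOSEEKDATA/FIOSEEKHOLE; writing the example against the compatibility name is an illustrative assumption.

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/filio.h>

/*
 * Sketch: on success *offp is advanced to the start of the next
 * data region at or after the given offset.
 */
static int
next_data(int fd, off_t *offp)
{
	return (ioctl(fd, _FIO_SEEK_DATA, offp));
}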
+ * + * $FreeBSD$ + */ + +#ifndef _SPL_MOD_H +#define _SPL_MOD_H + +#include + +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) + +#define EXPORT_SYMBOL(x) +#define module_param(a, b, c) +#define MODULE_PARM_DESC(a, b) + +#define ZMOD_RW CTLFLAG_RWTUN +#define ZMOD_RD CTLFLAG_RDTUN + +/* BEGIN CSTYLED */ +#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) \ + SYSCTL_DECL(_vfs_ ## scope_prefix); \ + SYSCTL_##type(_vfs_ ## scope_prefix, OID_AUTO, name, perm, &name_prefix ## name, 0, desc) + +#define ZFS_MODULE_PARAM_ARGS SYSCTL_HANDLER_ARGS + +#define ZFS_MODULE_PARAM_CALL_IMPL(parent, name, perm, args, desc) \ + SYSCTL_DECL(parent); \ + SYSCTL_PROC(parent, OID_AUTO, name, perm | args, desc) + +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, func, _, perm, desc) \ + ZFS_MODULE_PARAM_CALL_IMPL(_vfs_ ## scope_prefix, name, perm, func ## _args(name_prefix ## name), desc) + +#define param_set_arc_long_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_arc_long, "LU" + +#define param_set_arc_int_args(var) \ + CTLTYPE_INT, &var, 0, param_set_arc_int, "I" + +#define param_set_deadman_failmode_args(var) \ + CTLTYPE_STRING, NULL, 0, param_set_deadman_failmode, "A" + +#define param_set_deadman_synctime_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_deadman_synctime, "LU" + +#define param_set_deadman_ziotime_args(var) \ + CTLTYPE_ULONG, NULL, 0, param_set_deadman_ziotime, "LU" + +#define param_set_multihost_interval_args(var) \ + CTLTYPE_ULONG, &var, 0, param_set_multihost_interval, "LU" + +#define param_set_slop_shift_args(var) \ + CTLTYPE_INT, &var, 0, param_set_slop_shift, "I" + +#define param_set_min_auto_ashift_args(var) \ + CTLTYPE_U64, &var, 0, param_set_min_auto_ashift, "QU" + +#define param_set_max_auto_ashift_args(var) \ + CTLTYPE_U64, &var, 0, param_set_max_auto_ashift, "QU" + +#include +#define module_init(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) + + +#define module_exit(fn) \ +static void \ +wrap_ ## fn(void *dummy __unused) \ +{ \ + fn(); \ +} \ +SYSUNINIT(zfs_ ## fn, SI_SUB_LAST, SI_ORDER_FIRST, wrap_ ## fn, NULL) +/* END CSTYLED */ + +#endif /* SPL_MOD_H */ diff --git a/include/os/freebsd/spl/sys/mode.h b/include/os/freebsd/spl/sys/mode.h new file mode 100644 index 000000000000..651685d30473 --- /dev/null +++ b/include/os/freebsd/spl/sys/mode.h @@ -0,0 +1 @@ +/* do not delete */ diff --git a/include/os/freebsd/spl/sys/mount.h b/include/os/freebsd/spl/sys/mount.h new file mode 100644 index 000000000000..42614e4739f1 --- /dev/null +++ b/include/os/freebsd/spl/sys/mount.h @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
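To make the ZFS_MODULE_PARAM() mapping above concrete, a sketch of what one invocation expands to; the tunable name and description are invented.

/*
 * Sketch of the expansion (names invented):
 *
 *	static int zfs_example_enabled = 1;
 *	ZFS_MODULE_PARAM(zfs, zfs_, example_enabled, INT, ZMOD_RW,
 *	    "Example tunable");
 *
 * becomes, approximately:
 *
 *	SYSCTL_DECL(_vfs_zfs);
 *	SYSCTL_INT(_vfs_zfs, OID_AUTO, example_enabled, CTLFLAG_RWTUN,
 *	    &zfs_example_enabled, 0, "Example tunable");
 *
 * so the tunable surfaces as vfs.zfs.example_enabled and, because of
 * CTLFLAG_RWTUN, can also be set from loader.conf at boot.
 */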
diff --git a/include/os/freebsd/spl/sys/mode.h b/include/os/freebsd/spl/sys/mode.h
new file mode 100644
index 000000000000..651685d30473
--- /dev/null
+++ b/include/os/freebsd/spl/sys/mode.h
@@ -0,0 +1 @@
+/* do not delete */
diff --git a/include/os/freebsd/spl/sys/mount.h b/include/os/freebsd/spl/sys/mount.h
new file mode 100644
index 000000000000..42614e4739f1
--- /dev/null
+++ b/include/os/freebsd/spl/sys/mount.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_MOUNT_H_
+#define _OPENSOLARIS_SYS_MOUNT_H_
+
+#include <sys/mntent.h>
+#include_next <sys/mount.h>
+#ifdef BUILDING_ZFS
+#include
+#endif
+#define MS_FORCE MNT_FORCE
+#define MS_REMOUNT MNT_UPDATE
+
+typedef struct fid fid_t;
+
+#endif /* !_OPENSOLARIS_SYS_MOUNT_H_ */
diff --git a/include/os/freebsd/spl/sys/mutex.h b/include/os/freebsd/spl/sys/mutex.h
new file mode 100644
index 000000000000..e757d12c1502
--- /dev/null
+++ b/include/os/freebsd/spl/sys/mutex.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_MUTEX_H_
+#define _OPENSOLARIS_SYS_MUTEX_H_
+
+typedef struct sx kmutex_t;
+
+#include <sys/param.h>
+#include <sys/lock.h>
+#include_next <sys/mutex.h>
+#include_next <sys/sx.h>
+#include <sys/libkern.h>
+#include <sys/kdb.h>
+
+typedef enum {
+ MUTEX_DEFAULT = 0 /* kernel default mutex */
+} kmutex_type_t;
+
+#define MUTEX_HELD(x) (mutex_owned(x))
+#define MUTEX_NOT_HELD(x) (!mutex_owned(x) || panicstr)
+
+#ifndef OPENSOLARIS_WITNESS
+#define MUTEX_FLAGS (SX_DUPOK | SX_NEW | SX_NOWITNESS)
+#else
+#define MUTEX_FLAGS (SX_DUPOK | SX_NEW)
+#endif
+
+#define mutex_init(lock, desc, type, arg) do { \
+ const char *_name; \
+ ASSERT((type) == MUTEX_DEFAULT); \
+ for (_name = #lock; *_name != '\0'; _name++) { \
+ if (*_name >= 'a' && *_name <= 'z') \
+ break; \
+ } \
+ if (*_name == '\0') \
+ _name = #lock; \
+ sx_init_flags((lock), _name, MUTEX_FLAGS); \
+} while (0)
+#define mutex_destroy(lock) sx_destroy(lock)
+#define mutex_enter(lock) sx_xlock(lock)
+#define mutex_enter_nested(lock, type) sx_xlock(lock)
+#define mutex_tryenter(lock) sx_try_xlock(lock)
+#define mutex_exit(lock) sx_xunlock(lock)
+#define mutex_owned(lock) sx_xlocked(lock)
+#define mutex_owner(lock) sx_xholder(lock)
+#endif /* _OPENSOLARIS_SYS_MUTEX_H_ */
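[Editor's illustration — not part of the imported diff.] A Solaris kmutex_t is simply an sx(9) lock here, so mutex_enter() takes the lock exclusively and sleeping while held is legal. Note how mutex_init() derives the lock name: it scans the stringified first argument for the first lowercase letter, so "&zp->z_lock" is reported as "zp->z_lock". A sketch of the usual consumer pattern (names invented):

static kmutex_t example_lock;
static int example_count;

static void
example_init(void)
{
    mutex_init(&example_lock, NULL, MUTEX_DEFAULT, NULL);
}

static void
example_increment(void)
{
    mutex_enter(&example_lock);        /* sx_xlock() underneath */
    ASSERT(MUTEX_HELD(&example_lock));
    example_count++;
    mutex_exit(&example_lock);         /* sx_xunlock() underneath */
}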
diff --git a/include/os/freebsd/spl/sys/param.h b/include/os/freebsd/spl/sys/param.h
new file mode 100644
index 000000000000..92724e332d68
--- /dev/null
+++ b/include/os/freebsd/spl/sys/param.h
@@ -0,0 +1,41 @@
+/*
+ * Copyright (C) 2007 John Birrell <jb@freebsd.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ *
+ */
+
+#ifndef _COMPAT_OPENSOLARIS_SYS_PARAM_H_
+#define _COMPAT_OPENSOLARIS_SYS_PARAM_H_
+
+#include
+#include_next <sys/param.h>
+#define PAGESIZE PAGE_SIZE
+#define ptob(x) ((uint64_t)(x) << PAGE_SHIFT)
+#ifdef _KERNEL
+#include
+#include
+#endif
+#endif
diff --git a/include/os/freebsd/spl/sys/policy.h b/include/os/freebsd/spl/sys/policy.h
new file mode 100644
index 000000000000..3a05da12b3aa
--- /dev/null
+++ b/include/os/freebsd/spl/sys/policy.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_POLICY_H_
+#define _OPENSOLARIS_SYS_POLICY_H_
+
+#include
+#include
+#include
+struct mount;
+struct vattr;
+
+int secpolicy_nfs(cred_t *cr);
+int secpolicy_zfs(cred_t *cr);
+int secpolicy_zfs_proc(cred_t *cr, proc_t *proc);
+int secpolicy_sys_config(cred_t *cr, int checkonly);
+int secpolicy_zinject(cred_t *cr);
+int secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp);
+int secpolicy_basic_link(vnode_t *vp, cred_t *cr);
+int secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner);
+int secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner);
+int secpolicy_vnode_stky_modify(cred_t *cr);
+int secpolicy_vnode_remove(vnode_t *vp, cred_t *cr);
+int secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t accmode);
+int secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner,
+ accmode_t curmode, accmode_t wantmode);
+int secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner);
+int secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner);
+int secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, int flags,
+ int unlocked_access(void *, int, cred_t *), void *node);
+int secpolicy_vnode_create_gid(cred_t *cr);
+int secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid);
+int secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr,
+ boolean_t issuidroot);
+void secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr);
+int secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap,
+ const struct vattr *ovap, cred_t *cr);
+int secpolicy_fs_owner(struct mount *vfsp, cred_t *cr);
+int secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp);
+void secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp);
+int secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr,
+ vtype_t vtype);
+int secpolicy_smb(cred_t *cr);
+
+
+#if __FreeBSD_version >= 1300005
+#define spl_priv_check_cred(a, b) priv_check_cred((a), (b))
+#else
+#define spl_priv_check_cred(a, b) priv_check_cred((a), (b), 0)
+#endif
+#endif /* _OPENSOLARIS_SYS_POLICY_H_ */
diff --git a/include/os/freebsd/spl/sys/proc.h b/include/os/freebsd/spl/sys/proc.h
new file mode 100644
index 000000000000..a455cfbab66f
--- /dev/null
+++ b/include/os/freebsd/spl/sys/proc.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_PROC_H_
+#define _OPENSOLARIS_SYS_PROC_H_
+
+#include
+#include
+#include_next <sys/proc.h>
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#define CPU curcpu
+#define minclsyspri PRIBIO
+#define defclsyspri minclsyspri
+#define maxclsyspri PVM
+#define max_ncpus (mp_maxid + 1)
+#define boot_max_ncpus (mp_maxid + 1)
+
+#define TS_RUN 0
+
+#define p0 proc0
+
+#define t_tid td_tid
+
+typedef short pri_t;
+typedef struct thread _kthread;
+typedef struct thread kthread_t;
+typedef struct thread *kthread_id_t;
+typedef struct proc proc_t;
+
+extern proc_t *system_proc;
+
+static __inline kthread_t *
+do_thread_create(caddr_t stk, size_t stksize, void (*proc)(void *), void *arg,
+ size_t len, proc_t *pp, int state, pri_t pri, const char *name)
+{
+ kthread_t *td = NULL;
+ proc_t **ppp;
+ int error;
+
+ /*
+ * Be sure there are no surprises.
+ */
+ ASSERT(stk == NULL);
+ ASSERT(len == 0);
+ ASSERT(state == TS_RUN);
+
+ if (pp == &p0)
+ ppp = &system_proc;
+ else
+ ppp = &pp;
+ error = kproc_kthread_add(proc, arg, ppp, &td, RFSTOPPED,
+ stksize / PAGE_SIZE, "zfskern", "%s", name);
+ if (error == 0) {
+ thread_lock(td);
+ sched_prio(td, pri);
+ sched_add(td, SRQ_BORING);
+#if __FreeBSD_version < 1300068
+ thread_unlock(td);
+#endif
+ }
+ return (td);
+}
+
+#define thread_create_named(name, stk, stksize, proc, arg, len, \
+ pp, state, pri) \
+ do_thread_create(stk, stksize, proc, arg, len, pp, state, pri, name)
+#define thread_create(stk, stksize, proc, arg, len, pp, state, pri) \
+ do_thread_create(stk, stksize, proc, arg, len, pp, state, pri, #proc)
+#define thread_exit() kthread_exit()
+
+int uread(proc_t *, void *, size_t, uintptr_t);
+int uwrite(proc_t *, void *, size_t, uintptr_t);
+
+static inline boolean_t
+zfs_proc_is_caller(proc_t *p)
+{
+ return (p == curproc);
+}
+
+#endif /* _OPENSOLARIS_SYS_PROC_H_ */
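[Editor's illustration — not part of the imported diff.] thread_create() funnels into kproc_kthread_add(), parking new threads in the shared "zfskern" kernel process unless a specific proc_t is supplied; the stack, length, and state arguments must be NULL/0/TS_RUN, as the ASSERTs above enforce. A sketch (worker and names invented):

static void
example_worker(void *arg)
{
    /* ... do work ... */
    thread_exit();            /* kthread_exit() underneath */
}

static void
example_start(void)
{
    kthread_t *td;

    td = thread_create(NULL, 0, example_worker, NULL, 0, &p0,
        TS_RUN, minclsyspri);
    /* td is now runnable inside the "zfskern" process */
}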
diff --git a/include/os/freebsd/spl/sys/processor.h b/include/os/freebsd/spl/sys/processor.h
new file mode 100644
index 000000000000..53149840f21f
--- /dev/null
+++ b/include/os/freebsd/spl/sys/processor.h
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ * All Rights Reserved
+ *
+ */
+
+/*
+ * Copyright 2014 Garrett D'Amore <garrett@damore.org>
+ *
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_PROCESSOR_H
+#define _SYS_PROCESSOR_H
+
+#include <sys/types.h>
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Definitions for p_online, processor_info & lgrp system calls.
+ */
+
+/*
+ * Type for an lgrpid
+ */
+typedef uint16_t lgrpid_t;
+
+/*
+ * Type for processor name (CPU number).
+ */
+typedef int processorid_t;
+typedef int chipid_t;
+
+#define getcpuid() curcpu
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_PROCESSOR_H */
diff --git a/include/os/freebsd/spl/sys/procfs_list.h b/include/os/freebsd/spl/sys/procfs_list.h
new file mode 100644
index 000000000000..5d623c369c4c
--- /dev/null
+++ b/include/os/freebsd/spl/sys/procfs_list.h
@@ -0,0 +1,64 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef _SPL_PROCFS_LIST_H
+#define _SPL_PROCFS_LIST_H
+
+#include <sys/list.h>
+#include <sys/mutex.h>
+
+
+/*
+ * procfs list manipulation
+ */
+
+struct seq_file { };
+void seq_printf(struct seq_file *m, const char *fmt, ...);
+
+typedef struct procfs_list {
+ void *pl_private;
+ kmutex_t pl_lock;
+ list_t pl_list;
+ uint64_t pl_next_id;
+ size_t pl_node_offset;
+} procfs_list_t;
+
+typedef struct procfs_list_node {
+ list_node_t pln_link;
+ uint64_t pln_id;
+} procfs_list_node_t;
+
+void procfs_list_install(const char *module,
+ const char *name,
+ mode_t mode,
+ procfs_list_t *procfs_list,
+ int (*show)(struct seq_file *f, void *p),
+ int (*show_header)(struct seq_file *f),
+ int (*clear)(procfs_list_t *procfs_list),
+ size_t procfs_list_node_off);
+void procfs_list_uninstall(procfs_list_t *procfs_list);
+void procfs_list_destroy(procfs_list_t *procfs_list);
+void procfs_list_add(procfs_list_t *procfs_list, void *p);
+
+#endif /* _SPL_PROCFS_LIST_H */
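[Editor's illustration — not part of the imported diff.] Consumers embed a procfs_list_node_t in their record type, register the list with a show callback, and append records with procfs_list_add() under pl_lock; the node offset tells the framework where the link lives. A sketch (record type and callbacks invented):

typedef struct example_entry {
    procfs_list_node_t  ee_node;
    uint64_t            ee_value;
} example_entry_t;

static int
example_show(struct seq_file *f, void *p)
{
    example_entry_t *ee = p;

    seq_printf(f, "%llu\n", (u_longlong_t)ee->ee_value);
    return (0);
}

static void
example_register(procfs_list_t *pl)
{
    procfs_list_install("zfs", "example", 0444, pl, example_show,
        NULL, NULL, offsetof(example_entry_t, ee_node));
}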
diff --git a/include/os/freebsd/spl/sys/random.h b/include/os/freebsd/spl/sys/random.h
new file mode 100644
index 000000000000..b3c9115f5305
--- /dev/null
+++ b/include/os/freebsd/spl/sys/random.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_RANDOM_H_
+#define _OPENSOLARIS_SYS_RANDOM_H_
+
+#include_next <sys/random.h>
+
+static inline int
+random_get_bytes(uint8_t *p, size_t s)
+{
+ arc4rand(p, (int)s, 0);
+ return (0);
+}
+
+static inline int
+random_get_pseudo_bytes(uint8_t *p, size_t s)
+{
+ arc4rand(p, (int)s, 0);
+ return (0);
+}
+
+#endif /* !_OPENSOLARIS_SYS_RANDOM_H_ */
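[Editor's illustration — not part of the imported diff.] Both helpers delegate to arc4rand() and can never fail, hence the unconditional zero return; portable callers that check the result work unchanged. For instance (sketch):

static uint64_t
example_random_u64(void)
{
    uint64_t v;

    (void) random_get_bytes((uint8_t *)&v, sizeof (v));
    return (v);
}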
diff --git a/include/os/freebsd/spl/sys/rwlock.h b/include/os/freebsd/spl/sys/rwlock.h
new file mode 100644
index 000000000000..10107a9bee84
--- /dev/null
+++ b/include/os/freebsd/spl/sys/rwlock.h
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_RWLOCK_H_
+#define _OPENSOLARIS_SYS_RWLOCK_H_
+
+#include <sys/param.h>
+#include <sys/proc.h>
+#include <sys/lock.h>
+#include <sys/sx.h>
+
+typedef enum {
+ RW_DEFAULT = 4 /* kernel default rwlock */
+} krw_type_t;
+
+
+typedef enum {
+ RW_NONE = 0,
+ RW_WRITER = 1,
+ RW_READER = 2
+} krw_t;
+
+typedef struct sx krwlock_t;
+
+#ifndef OPENSOLARIS_WITNESS
+#define RW_FLAGS (SX_DUPOK | SX_NOWITNESS)
+#else
+#define RW_FLAGS (SX_DUPOK)
+#endif
+
+#define RW_READ_HELD(x) (rw_read_held((x)))
+#define RW_WRITE_HELD(x) (rw_write_held((x)))
+#define RW_LOCK_HELD(x) (rw_lock_held((x)))
+#define RW_ISWRITER(x) (rw_iswriter(x))
+/* BEGIN CSTYLED */
+#define rw_init(lock, desc, type, arg) do { \
+ const char *_name; \
+ ASSERT((type) == 0 || (type) == RW_DEFAULT); \
+ KASSERT(((lock)->lock_object.lo_flags & LO_ALLMASK) != \
+ LO_EXPECTED, ("lock %s already initialized", #lock)); \
+ bzero((lock), sizeof(struct sx)); \
+ for (_name = #lock; *_name != '\0'; _name++) { \
+ if (*_name >= 'a' && *_name <= 'z') \
+ break; \
+ } \
+ if (*_name == '\0') \
+ _name = #lock; \
+ sx_init_flags((lock), _name, RW_FLAGS); \
+} while (0)
+#define rw_destroy(lock) sx_destroy(lock)
+#define rw_enter(lock, how) do { \
+ if ((how) == RW_READER) \
+ sx_slock(lock); \
+ else /* if ((how) == RW_WRITER) */ \
+ sx_xlock(lock); \
+ } while (0)
+
+#define rw_tryenter(lock, how) \
+ ((how) == RW_READER ? sx_try_slock(lock) : sx_try_xlock(lock))
+#define rw_exit(lock) sx_unlock(lock)
+#define rw_downgrade(lock) sx_downgrade(lock)
+#define rw_tryupgrade(lock) sx_try_upgrade(lock)
+#define rw_read_held(lock) \
+ ((lock)->sx_lock != SX_LOCK_UNLOCKED && \
+ ((lock)->sx_lock & SX_LOCK_SHARED))
+#define rw_write_held(lock) sx_xlocked(lock)
+#define rw_lock_held(lock) (rw_read_held(lock) || rw_write_held(lock))
+#define rw_iswriter(lock) sx_xlocked(lock)
+#define rw_owner(lock) sx_xholder(lock)
+
+/* END CSTYLED */
+#endif /* _OPENSOLARIS_SYS_RWLOCK_H_ */
diff --git a/include/os/freebsd/spl/sys/sdt.h b/include/os/freebsd/spl/sys/sdt.h
new file mode 100644
index 000000000000..496fc58d7c7b
--- /dev/null
+++ b/include/os/freebsd/spl/sys/sdt.h
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_SDT_H_
+#define _OPENSOLARIS_SYS_SDT_H_
+
+#include_next <sys/sdt.h>
+/* BEGIN CSTYLED */
+#ifdef KDTRACE_HOOKS
+SDT_PROBE_DECLARE(sdt, , , set__error);
+
+#define SET_ERROR(err) \
+ ((sdt_sdt___set__error->id ? \
+ (*sdt_probe_func)(sdt_sdt___set__error->id, \
+ (uintptr_t)err, 0, 0, 0, 0) : 0), err)
+#else
+#define SET_ERROR(err) (err)
+#endif
+
+#endif /* _OPENSOLARIS_SYS_SDT_H_ */
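[Editor's illustration — not part of the imported diff.] SET_ERROR() returns its argument unchanged, but with KDTRACE_HOOKS it also fires the sdt:::set-error probe, so the first assignment of an errno can be traced back to its origin, e.g. with `dtrace -n 'sdt:::set-error { stack(); }'`. Typical use (sketch):

static int
example_validate(int flags)
{
    if (flags != 0)
        return (SET_ERROR(EINVAL));    /* probe fires here */
    return (0);
}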
diff --git a/include/os/freebsd/spl/sys/sid.h b/include/os/freebsd/spl/sys/sid.h
new file mode 100644
index 000000000000..d3fab8b24744
--- /dev/null
+++ b/include/os/freebsd/spl/sys/sid.h
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_SID_H_
+#define _OPENSOLARIS_SYS_SID_H_
+#include <sys/param.h>
+#include <sys/kmem.h>
+
+typedef struct ksiddomain {
+ char *kd_name; /* Domain part of SID */
+ uint_t kd_len;
+} ksiddomain_t;
+typedef void ksid_t;
+
+static __inline ksiddomain_t *
+ksid_lookupdomain(const char *domain)
+{
+ ksiddomain_t *kd;
+ size_t len;
+
+ len = strlen(domain) + 1;
+ kd = kmem_alloc(sizeof (*kd), KM_SLEEP);
+ kd->kd_len = (uint_t)len;
+ kd->kd_name = kmem_alloc(len, KM_SLEEP);
+ strcpy(kd->kd_name, domain);
+ return (kd);
+}
+
+static __inline void
+ksiddomain_rele(ksiddomain_t *kd)
+{
+
+ kmem_free(kd->kd_name, kd->kd_len);
+ kmem_free(kd, sizeof (*kd));
+}
+
+static __inline uint_t
+ksid_getid(ksid_t *ks)
+{
+
+ panic("%s has been unexpectedly called", __func__);
+}
+
+static __inline const char *
+ksid_getdomain(ksid_t *ks)
+{
+
+ panic("%s has been unexpectedly called", __func__);
+}
+
+static __inline uint_t
+ksid_getrid(ksid_t *ks)
+{
+
+ panic("%s has been unexpectedly called", __func__);
+}
+
+#define kidmap_getsidbyuid(zone, uid, sid_prefix, rid) (1)
+#define kidmap_getsidbygid(zone, gid, sid_prefix, rid) (1)
+
+#endif /* _OPENSOLARIS_SYS_SID_H_ */
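[Editor's illustration — not part of the imported diff.] Only the domain-string bookkeeping is real on FreeBSD; the ksid_get*() accessors panic because no FreeBSD code path should reach them. The expected usage (sketch, invented SID string):

static void
example_domain(void)
{
    ksiddomain_t *kd;

    kd = ksid_lookupdomain("S-1-5-21-1-2-3");   /* copies the string */
    /* ... use kd->kd_name as the stable domain string ... */
    ksiddomain_rele(kd);                        /* frees name and struct */
}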
diff --git a/include/os/freebsd/spl/sys/sig.h b/include/os/freebsd/spl/sys/sig.h
new file mode 100644
index 000000000000..426a9e827ecb
--- /dev/null
+++ b/include/os/freebsd/spl/sys/sig.h
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2008 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_SIG_H_
+#define _OPENSOLARIS_SYS_SIG_H_
+
+#include_next
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define FORREAL 0
+#define JUSTLOOKING 1
+
+static __inline int
+issig(int why)
+{
+ struct thread *td = curthread;
+ struct proc *p;
+ int sig;
+
+ ASSERT(why == FORREAL || why == JUSTLOOKING);
+ if (SIGPENDING(td)) {
+ if (why == JUSTLOOKING)
+ return (1);
+ p = td->td_proc;
+ PROC_LOCK(p);
+ mtx_lock(&p->p_sigacts->ps_mtx);
+ sig = cursig(td);
+ mtx_unlock(&p->p_sigacts->ps_mtx);
+ PROC_UNLOCK(p);
+ if (sig != 0)
+ return (1);
+ }
+ return (0);
+}
+#endif /* _OPENSOLARIS_SYS_SIG_H_ */
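[Editor's illustration — not part of the imported diff.] issig() preserves the Solaris polling idiom for long-running kernel loops: a cheap JUSTLOOKING check for a pending signal, then a FORREAL check that takes the proc locks and consults cursig(). Sketch:

static int
example_copy_chunks(int nchunks)
{
    int i;

    for (i = 0; i < nchunks; i++) {
        /* ... copy one chunk ... */
        if (issig(JUSTLOOKING) && issig(FORREAL))
            return (EINTR);    /* deliverable signal pending */
    }
    return (0);
}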
diff --git a/include/os/freebsd/spl/sys/simd.h b/include/os/freebsd/spl/sys/simd.h
new file mode 100644
index 000000000000..53503e838912
--- /dev/null
+++ b/include/os/freebsd/spl/sys/simd.h
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+
+#ifndef _FREEBSD_SIMD_H
+#define _FREEBSD_SIMD_H
+#if defined(__amd64__) || defined(__i386__)
+#include <sys/simd_x86.h>
+#else
+
+#define kfpu_allowed() 0
+#define kfpu_initialize(tsk) do {} while (0)
+#define kfpu_begin() do {} while (0)
+#define kfpu_end() do {} while (0)
+#define kfpu_init() (0)
+#define kfpu_fini() do {} while (0)
+#endif
+#endif
diff --git a/include/os/freebsd/spl/sys/simd_x86.h b/include/os/freebsd/spl/sys/simd_x86.h
new file mode 100644
index 000000000000..a35e205d5a3b
--- /dev/null
+++ b/include/os/freebsd/spl/sys/simd_x86.h
@@ -0,0 +1,300 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#ifdef __i386__
+#include <machine/npx.h>
+#else
+#include <machine/fpu.h>
+#endif
+#include <machine/md_var.h>
+#include <machine/specialreg.h>
+
+#define kfpu_init() (0)
+#define kfpu_fini() do {} while (0)
+#define kfpu_allowed() 1
+#define kfpu_initialize(tsk) do {} while (0)
+
+#define kfpu_begin() { \
+ critical_enter(); \
+ fpu_kern_enter(curthread, NULL, FPU_KERN_NOCTX); \
+}
+
+#define kfpu_end() \
+ { \
+ fpu_kern_leave(curthread, NULL); \
+ critical_exit(); \
+ }
+
+/*
+ * Check if OS supports AVX and AVX2 by checking XCR0
+ * Only call this function if CPUID indicates that AVX feature is
+ * supported by the CPU, otherwise it might be an illegal instruction.
+ */
+static inline uint64_t
+xgetbv(uint32_t index)
+{
+ uint32_t eax, edx;
+ /* xgetbv - instruction byte code */
+ __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0"
+ : "=a" (eax), "=d" (edx)
+ : "c" (index));
+
+ return ((((uint64_t)edx)<<32) | (uint64_t)eax);
+}
+
+
+/*
+ * Detect register set support
+ */
+static inline boolean_t
+__simd_state_enabled(const uint64_t state)
+{
+ boolean_t has_osxsave;
+ uint64_t xcr0;
+
+ has_osxsave = !!(cpu_feature2 & CPUID2_OSXSAVE);
+
+ if (!has_osxsave)
+ return (B_FALSE);
+
+ xcr0 = xgetbv(0);
+ return ((xcr0 & state) == state);
+}
+
+#define _XSTATE_SSE_AVX (0x2 | 0x4)
+#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX)
+
+#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX)
+#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512)
+
+
+/*
+ * Check if SSE instruction set is available
+ */
+static inline boolean_t
+zfs_sse_available(void)
+{
+ return (!!(cpu_feature & CPUID_SSE));
+}
+
+/*
+ * Check if SSE2 instruction set is available
+ */
+static inline boolean_t
+zfs_sse2_available(void)
+{
+ return (!!(cpu_feature & CPUID_SSE2));
+}
+
+/*
+ * Check if SSE3 instruction set is available
+ */
+static inline boolean_t
+zfs_sse3_available(void)
+{
+ return (!!(cpu_feature2 & CPUID2_SSE3));
+}
+
+/*
+ * Check if SSSE3 instruction set is available
+ */
+static inline boolean_t
+zfs_ssse3_available(void)
+{
+ return (!!(cpu_feature2 & CPUID2_SSSE3));
+}
+
+/*
+ * Check if SSE4.1 instruction set is available
+ */
+static inline boolean_t
+zfs_sse4_1_available(void)
+{
+ return (!!(cpu_feature2 & CPUID2_SSE41));
+}
+
+/*
+ * Check if SSE4.2 instruction set is available
+ */
+static inline boolean_t
+zfs_sse4_2_available(void)
+{
+ return (!!(cpu_feature2 & CPUID2_SSE42));
+}
+
+/*
+ * Check if AVX instruction set is available
+ */
+static inline boolean_t
+zfs_avx_available(void)
+{
+ boolean_t has_avx;
+
+ has_avx = !!(cpu_feature2 & CPUID2_AVX);
+
+ return (has_avx && __ymm_enabled());
+}
+
+/*
+ * Check if AVX2 instruction set is available
+ */
+static inline boolean_t
+zfs_avx2_available(void)
+{
+ boolean_t has_avx2;
+
+ has_avx2 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX2);
+
+ return (has_avx2 && __ymm_enabled());
+}
+
+/*
+ * AVX-512 family of instruction sets:
+ *
+ * AVX512F Foundation
+ * AVX512CD Conflict Detection Instructions
+ * AVX512ER Exponential and Reciprocal Instructions
+ * AVX512PF Prefetch Instructions
+ *
+ * AVX512BW Byte and Word Instructions
+ * AVX512DQ Double-word and Quadword Instructions
+ * AVX512VL Vector Length Extensions
+ *
+ * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4)
+ * AVX512VBMI Vector Byte Manipulation Instructions
+ */
+
+
+/* Check if AVX512F instruction set is available */
+static inline boolean_t
+zfs_avx512f_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512CD instruction set is available */
+static inline boolean_t
+zfs_avx512cd_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_AVX512CD);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512ER instruction set is available */
+static inline boolean_t
+zfs_avx512er_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_AVX512CD);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512PF instruction set is available */
+static inline boolean_t
+zfs_avx512pf_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_AVX512PF);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512BW instruction set is available */
+static inline boolean_t
+zfs_avx512bw_available(void)
+{
+ boolean_t has_avx512 = B_FALSE;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512BW);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512DQ instruction set is available */
+static inline boolean_t
+zfs_avx512dq_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_AVX512DQ);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512VL instruction set is available */
+static inline boolean_t
+zfs_avx512vl_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_AVX512VL);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512IFMA instruction set is available */
+static inline boolean_t
+zfs_avx512ifma_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_AVX512IFMA);
+
+ return (has_avx512 && __zmm_enabled());
+}
+
+/* Check if AVX512VBMI instruction set is available */
+static inline boolean_t
+zfs_avx512vbmi_available(void)
+{
+ boolean_t has_avx512;
+
+ has_avx512 = !!(cpu_stdext_feature & CPUID_STDEXT_AVX512F) &&
+ !!(cpu_stdext_feature & CPUID_STDEXT_BMI1);
+
+ return (has_avx512 && __zmm_enabled());
+}
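[Editor's illustration — not part of the imported diff.] The intended pattern is to pick an implementation from the zfs_*_available() predicates (a CPUID feature bit plus the XCR0 state check above) and bracket any vector code with kfpu_begin()/kfpu_end(), which pin the thread and borrow the FPU via fpu_kern_enter(..., FPU_KERN_NOCTX). Sketch:

static void
example_checksum(const void *buf, size_t size)
{
    if (kfpu_allowed() && zfs_avx2_available()) {
        kfpu_begin();
        /* ... AVX2 inner loop over buf/size ... */
        kfpu_end();
    } else {
        /* ... scalar fallback ... */
    }
}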
diff --git a/include/os/freebsd/spl/sys/spl_condvar.h b/include/os/freebsd/spl/sys/spl_condvar.h
new file mode 100644
index 000000000000..7405f647d59a
--- /dev/null
+++ b/include/os/freebsd/spl/sys/spl_condvar.h
@@ -0,0 +1,81 @@
+/*
+ * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
+ *
+ * Copyright (c) 2000 Jake Burkholder <jake@freebsd.org>.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _SPL_SYS_CONDVAR_H_
+#define _SPL_SYS_CONDVAR_H_
+
+#ifndef LOCORE
+#include <sys/queue.h>
+
+struct lock_object;
+struct thread;
+
+TAILQ_HEAD(cv_waitq, thread);
+
+/*
+ * Condition variable. The waiters count is protected by the mutex that
+ * protects the condition; that is, the mutex that is passed to cv_wait*()
+ * and is held across calls to cv_signal() and cv_broadcast(). It is an
+ * optimization to avoid looking up the sleep queue if there are no waiters.
+ */
+struct cv {
+ const char *cv_description;
+ int cv_waiters;
+};
+
+void cv_init(struct cv *cvp, const char *desc);
+void cv_destroy(struct cv *cvp);
+
+void _cv_wait(struct cv *cvp, struct lock_object *lock);
+void _cv_wait_unlock(struct cv *cvp, struct lock_object *lock);
+int _cv_wait_sig(struct cv *cvp, struct lock_object *lock);
+int _cv_timedwait_sbt(struct cv *cvp, struct lock_object *lock,
+ sbintime_t sbt, sbintime_t pr, int flags);
+int _cv_timedwait_sig_sbt(struct cv *cvp, struct lock_object *lock,
+ sbintime_t sbt, sbintime_t pr, int flags);
+
+void cv_signal(struct cv *cvp);
+void cv_broadcastpri(struct cv *cvp, int pri);
+
+#define cv_wait(cvp, lock) \
+ _cv_wait((cvp), &(lock)->lock_object)
+#define cv_wait_unlock(cvp, lock) \
+ _cv_wait_unlock((cvp), &(lock)->lock_object)
+#define cv_timedwait_sbt(cvp, lock, sbt, pr, flags) \
+ _cv_timedwait_sbt((cvp), &(lock)->lock_object, (sbt), (pr), (flags))
+#define cv_timedwait_sig_sbt(cvp, lock, sbt, pr, flags) \
+ _cv_timedwait_sig_sbt((cvp), &(lock)->lock_object, (sbt), (pr), (flags))
+
+#define cv_broadcast(cvp) cv_broadcastpri(cvp, 0)
+
+#define cv_wmesg(cvp) ((cvp)->cv_description)
+
+#endif /* !LOCORE */
+#endif /* _SYS_CONDVAR_H_ */
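[Editor's illustration — not part of the imported diff.] The lock passed to the cv_wait() wrappers is any FreeBSD lock with a lock_object, which is what lets the sx-backed kmutex_t from mutex.h plug straight in. The classic pattern, after cv_init(&example_cv, "example") (names invented):

static struct cv example_cv;
static struct sx example_sx;
static int example_ready;

static void
example_wait_ready(void)
{
    sx_xlock(&example_sx);
    while (!example_ready)
        cv_wait(&example_cv, &example_sx);    /* drops, then retakes */
    sx_xunlock(&example_sx);
}

static void
example_set_ready(void)
{
    sx_xlock(&example_sx);
    example_ready = 1;
    cv_broadcast(&example_cv);
    sx_xunlock(&example_sx);
}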
diff --git a/include/os/freebsd/spl/sys/string.h b/include/os/freebsd/spl/sys/string.h
new file mode 100644
index 000000000000..859b40285a94
--- /dev/null
+++ b/include/os/freebsd/spl/sys/string.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2007 Pawel Jakub Dawidek <pjd@FreeBSD.org>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef _OPENSOLARIS_SYS_STRING_H_
+#define _OPENSOLARIS_SYS_STRING_H_
+
+#include
+
+char *strpbrk(const char *, const char *);
+void strident_canon(char *, size_t);
+void kmem_strfree(char *);
+char *kmem_strdup(const char *s);
+
+#endif /* _OPENSOLARIS_SYS_STRING_H_ */
diff --git a/include/os/freebsd/spl/sys/strings.h b/include/os/freebsd/spl/sys/strings.h
new file mode 100644
index 000000000000..651685d30473
--- /dev/null
+++ b/include/os/freebsd/spl/sys/strings.h
@@ -0,0 +1 @@
+/* do not delete */
diff --git a/include/os/freebsd/spl/sys/sunddi.h b/include/os/freebsd/spl/sys/sunddi.h
new file mode 100644
index 000000000000..41d0f4512977
--- /dev/null
+++ b/include/os/freebsd/spl/sys/sunddi.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <https://github.com/zfsonlinux/zfs>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_SUNDDI_H
+#define _SPL_SUNDDI_H
+
+#include
+#include
+#include
+#include
+#ifdef BUILDING_ZFS
+#include
+#endif
+
+typedef int ddi_devid_t;
+
+#define DDI_DEV_T_NONE ((dev_t)-1)
+#define DDI_DEV_T_ANY ((dev_t)-2)
+#define DI_MAJOR_T_UNKNOWN ((major_t)0)
+
+#define DDI_PROP_DONTPASS 0x0001
+#define DDI_PROP_CANSLEEP 0x0002
+
+#define DDI_SUCCESS 0
+#define DDI_FAILURE -1
+
+#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL)
+#define ddi_prop_free(x) (void)0
+#define ddi_root_node() (void)0
+
+extern int ddi_strtoul(const char *, char **, int, unsigned long *);
+extern int ddi_strtol(const char *, char **, int, long *);
+extern int ddi_strtoull(const char *, char **, int, unsigned long long *);
+extern int ddi_strtoll(const char *, char **, int, long long *);
+
+extern int ddi_copyin(const void *from, void *to, size_t len, int flags);
+extern int ddi_copyout(const void *from, void *to, size_t len, int flags);
+extern void ddi_sysevent_init(void);
+
+
+int ddi_soft_state_init(void **statep, size_t size, size_t nitems);
+void ddi_soft_state_fini(void **statep);
+
+void *ddi_get_soft_state(void *state, int item);
+int ddi_soft_state_zalloc(void *state, int item);
+void ddi_soft_state_free(void *state, int item);
+
+#endif /* SPL_SUNDDI_H */
diff --git a/include/os/freebsd/spl/sys/sysmacros.h b/include/os/freebsd/spl/sys/sysmacros.h
new file mode 100644
index 000000000000..5afca10447e7
--- /dev/null
+++ b/include/os/freebsd/spl/sys/sysmacros.h
@@ -0,0 +1,404 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
+/* All Rights Reserved */
+
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_SYSMACROS_H
+#define _SYS_SYSMACROS_H
+
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Some macros for units conversion
+ */
+/*
+ * Disk blocks (sectors) and bytes.
+ */
+#define dtob(DD) ((DD) << DEV_BSHIFT)
+#define btod(BB) (((BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+#define btodt(BB) ((BB) >> DEV_BSHIFT)
+#define lbtod(BB) (((offset_t)(BB) + DEV_BSIZE - 1) >> DEV_BSHIFT)
+
+/* common macros */
+#ifndef MIN
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef MAX
+#define MAX(a, b) ((a) < (b) ? (b) : (a))
+#endif
+#ifndef ABS
+#define ABS(a) ((a) < 0 ? -(a) : (a))
+#endif
+#ifndef SIGNOF
+#define SIGNOF(a) ((a) < 0 ? -1 : (a) > 0)
+#endif
+#ifndef ARRAY_SIZE
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0]))
+#endif
+#ifndef DIV_ROUND_UP
+#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d))
+#endif
+
+#define boot_ncpus mp_ncpus
+#define kpreempt_disable() critical_enter()
+#define kpreempt_enable() critical_exit()
+#define CPU_SEQID curcpu
+#define is_system_labeled() 0
+/*
+ * Convert a single byte to/from binary-coded decimal (BCD).
+ */
+extern unsigned char byte_to_bcd[256];
+extern unsigned char bcd_to_byte[256];
+
+#define BYTE_TO_BCD(x) byte_to_bcd[(x) & 0xff]
+#define BCD_TO_BYTE(x) bcd_to_byte[(x) & 0xff]
+
+/*
+ * WARNING: The device number macros defined here should not be used by device
+ * drivers or user software. Device drivers should use the device functions
+ * defined in the DDI/DKI interface (see also ddi.h). Application software
+ * should make use of the library routines available in makedev(3). A set of
+ * new device macros are provided to operate on the expanded device number
+ * format supported in SVR4. Macro versions of the DDI device functions are
+ * provided for use by kernel proper routines only. Macro routines bmajor(),
+ * major(), minor(), emajor(), eminor(), and makedev() will be removed or
+ * their definitions changed at the next major release following SVR4.
+ */
+
+#define O_BITSMAJOR 7 /* # of SVR3 major device bits */
+#define O_BITSMINOR 8 /* # of SVR3 minor device bits */
+#define O_MAXMAJ 0x7f /* SVR3 max major value */
+#define O_MAXMIN 0xff /* SVR3 max minor value */
+
+
+#define L_BITSMAJOR32 14 /* # of SVR4 major device bits */
+#define L_BITSMINOR32 18 /* # of SVR4 minor device bits */
+#define L_MAXMAJ32 0x3fff /* SVR4 max major value */
+#define L_MAXMIN32 0x3ffff /* MAX minor for 3b2 software drivers. */
+ /* For 3b2 hardware devices the minor is */
+ /* restricted to 256 (0-255) */
+
+#ifdef _LP64
+#define L_BITSMAJOR 32 /* # of major device bits in 64-bit Solaris */
+#define L_BITSMINOR 32 /* # of minor device bits in 64-bit Solaris */
+#define L_MAXMAJ 0xfffffffful /* max major value */
+#define L_MAXMIN 0xfffffffful /* max minor value */
+#else
+#define L_BITSMAJOR L_BITSMAJOR32
+#define L_BITSMINOR L_BITSMINOR32
+#define L_MAXMAJ L_MAXMAJ32
+#define L_MAXMIN L_MAXMIN32
+#endif
+
+/*
+ * These are versions of the kernel routines for compressing and
+ * expanding long device numbers that don't return errors.
+ */
+#if (L_BITSMAJOR32 == L_BITSMAJOR) && (L_BITSMINOR32 == L_BITSMINOR)
+
+#define DEVCMPL(x) (x)
+#define DEVEXPL(x) (x)
+
+#else
+
+#define DEVCMPL(x) \
+ (dev32_t)((((x) >> L_BITSMINOR) > L_MAXMAJ32 || \
+ ((x) & L_MAXMIN) > L_MAXMIN32) ? NODEV32 : \
+ ((((x) >> L_BITSMINOR) << L_BITSMINOR32) | ((x) & L_MAXMIN32)))
+
+#define DEVEXPL(x) \
+ (((x) == NODEV32) ? NODEV : \
+ makedevice(((x) >> L_BITSMINOR32) & L_MAXMAJ32, (x) & L_MAXMIN32))
+
+#endif /* L_BITSMAJOR32 ... */
+
+/* convert to old (SVR3.2) dev format */
+
+#define cmpdev(x) \
+ (o_dev_t)((((x) >> L_BITSMINOR) > O_MAXMAJ || \
+ ((x) & L_MAXMIN) > O_MAXMIN) ? NODEV : \
+ ((((x) >> L_BITSMINOR) << O_BITSMINOR) | ((x) & O_MAXMIN)))
+
+/* convert to new (SVR4) dev format */
+
+#define expdev(x) \
+ (dev_t)(((dev_t)(((x) >> O_BITSMINOR) & O_MAXMAJ) << L_BITSMINOR) | \
+ ((x) & O_MAXMIN))
+
+/*
+ * Macro for checking power of 2 address alignment.
+ */
+#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
+
+/*
+ * Macros for counting and rounding.
+ */
+#define howmany(x, y) (((x)+((y)-1))/(y))
+#define roundup(x, y) ((((x)+((y)-1))/(y))*(y))
+
+/*
+ * Macro to determine if value is a power of 2
+ */
+#define ISP2(x) (((x) & ((x) - 1)) == 0)
+
+/*
+ * Macros for various sorts of alignment and rounding. The "align" must
+ * be a power of 2. Often times it is a block, sector, or page.
+ */
+
+/*
+ * return x rounded down to an align boundary
+ * eg, P2ALIGN(1200, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(1024, 1024) == 1024 (1*align)
+ * eg, P2ALIGN(0x1234, 0x100) == 0x1200 (0x12*align)
+ * eg, P2ALIGN(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define P2ALIGN(x, align) ((x) & -(align))
+
+/*
+ * return x % (mod) align
+ * eg, P2PHASE(0x1234, 0x100) == 0x34 (x-0x12*align)
+ * eg, P2PHASE(0x5600, 0x100) == 0x00 (x-0x56*align)
+ */
+#define P2PHASE(x, align) ((x) & ((align) - 1))
+
+/*
+ * return how much space is left in this block (but if it's perfectly
+ * aligned, return 0).
+ * eg, P2NPHASE(0x1234, 0x100) == 0xcc (0x13*align-x)
+ * eg, P2NPHASE(0x5600, 0x100) == 0x00 (0x56*align-x)
+ */
+#define P2NPHASE(x, align) (-(x) & ((align) - 1))
+
+/*
+ * return x rounded up to an align boundary
+ * eg, P2ROUNDUP(0x1234, 0x100) == 0x1300 (0x13*align)
+ * eg, P2ROUNDUP(0x5600, 0x100) == 0x5600 (0x56*align)
+ */
+#define P2ROUNDUP(x, align) (-(-(x) & -(align)))
+
+/*
+ * return the ending address of the block that x is in
+ * eg, P2END(0x1234, 0x100) == 0x12ff (0x13*align - 1)
+ * eg, P2END(0x5600, 0x100) == 0x56ff (0x57*align - 1)
+ */
+#define P2END(x, align) (-(~(x) & -(align)))
+
+/*
+ * return x rounded up to the next phase (offset) within align.
+ * phase should be < align.
+ * eg, P2PHASEUP(0x1234, 0x100, 0x10) == 0x1310 (0x13*align + phase)
+ * eg, P2PHASEUP(0x5600, 0x100, 0x10) == 0x5610 (0x56*align + phase)
+ */
+#define P2PHASEUP(x, align, phase) ((phase) - (((phase) - (x)) & -(align)))
+
+/*
+ * return TRUE if adding len to off would cause it to cross an align
+ * boundary.
+ * eg, P2BOUNDARY(0x1234, 0xe0, 0x100) == TRUE (0x1234 + 0xe0 == 0x1314)
+ * eg, P2BOUNDARY(0x1234, 0x50, 0x100) == FALSE (0x1234 + 0x50 == 0x1284)
+ */
+#define P2BOUNDARY(off, len, align) \
+ (((off) ^ ((off) + (len) - 1)) > (align) - 1)
+
+/*
+ * Return TRUE if they have the same highest bit set.
+ * eg, P2SAMEHIGHBIT(0x1234, 0x1001) == TRUE (the high bit is 0x1000)
+ * eg, P2SAMEHIGHBIT(0x1234, 0x3010) == FALSE (high bit of 0x3010 is 0x2000)
+ */
+#define P2SAMEHIGHBIT(x, y) (((x) ^ (y)) < ((x) & (y)))
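[Editor's illustration — not part of the imported diff.] A short worked set of values for the power-of-two helpers above, for align = 512 (a power of two, as required), plus a typical use:

/*
 *	ISP2(512)            != 0    (512 is a power of two)
 *	P2ALIGN(1000, 512)   == 512  (round down to boundary)
 *	P2ROUNDUP(1000, 512) == 1024 (round up to boundary)
 *	P2PHASE(1000, 512)   == 488  (offset within the block)
 *	P2NPHASE(1000, 512)  == 24   (distance to next boundary)
 */
static uint64_t
example_asize(uint64_t size, uint64_t ashift)
{
    /* e.g. round an I/O size up to the vdev sector size */
    return (P2ROUNDUP(size, 1ULL << ashift));
}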
+
+/*
+ * Typed version of the P2* macros. These macros should be used to ensure
+ * that the result is correctly calculated based on the data type of (x),
+ * which is passed in as the last argument, regardless of the data
+ * type of the alignment. For example, if (x) is of type uint64_t,
+ * and we want to round it up to a page boundary using "PAGESIZE" as
+ * the alignment, we can do either
+ * P2ROUNDUP(x, (uint64_t)PAGESIZE)
+ * or
+ * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t)
+ */
+#define P2ALIGN_TYPED(x, align, type) \
+ ((type)(x) & -(type)(align))
+#define P2PHASE_TYPED(x, align, type) \
+ ((type)(x) & ((type)(align) - 1))
+#define P2NPHASE_TYPED(x, align, type) \
+ (-(type)(x) & ((type)(align) - 1))
+#define P2ROUNDUP_TYPED(x, align, type) \
+ (-(-(type)(x) & -(type)(align)))
+#define P2END_TYPED(x, align, type) \
+ (-(~(type)(x) & -(type)(align)))
+#define P2PHASEUP_TYPED(x, align, phase, type) \
+ ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align)))
+#define P2CROSS_TYPED(x, y, align, type) \
+ (((type)(x) ^ (type)(y)) > (type)(align) - 1)
+#define P2SAMEHIGHBIT_TYPED(x, y, type) \
+ (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y)))
+
+/*
+ * Macros to atomically increment/decrement a variable. mutex and var
+ * must be pointers.
+ */
+#define INCR_COUNT(var, mutex) mutex_enter(mutex), (*(var))++, mutex_exit(mutex)
+#define DECR_COUNT(var, mutex) mutex_enter(mutex), (*(var))--, mutex_exit(mutex)
+
+/*
+ * Macros to declare bitfields - the order in the parameter list is
+ * Low to High - that is, declare bit 0 first. We only support 8-bit bitfields
+ * because if a field crosses a byte boundary it's not likely to be meaningful
+ * without reassembly in its nonnative endianness.
+ */
+#if defined(_BIT_FIELDS_LTOH)
+#define DECL_BITFIELD2(_a, _b) \
+ uint8_t _a, _b
+#define DECL_BITFIELD3(_a, _b, _c) \
+ uint8_t _a, _b, _c
+#define DECL_BITFIELD4(_a, _b, _c, _d) \
+ uint8_t _a, _b, _c, _d
+#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \
+ uint8_t _a, _b, _c, _d, _e
+#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \
+ uint8_t _a, _b, _c, _d, _e, _f
+#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \
+ uint8_t _a, _b, _c, _d, _e, _f, _g
+#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \
+ uint8_t _a, _b, _c, _d, _e, _f, _g, _h
+#elif defined(_BIT_FIELDS_HTOL)
+#define DECL_BITFIELD2(_a, _b) \
+ uint8_t _b, _a
+#define DECL_BITFIELD3(_a, _b, _c) \
+ uint8_t _c, _b, _a
+#define DECL_BITFIELD4(_a, _b, _c, _d) \
+ uint8_t _d, _c, _b, _a
+#define DECL_BITFIELD5(_a, _b, _c, _d, _e) \
+ uint8_t _e, _d, _c, _b, _a
+#define DECL_BITFIELD6(_a, _b, _c, _d, _e, _f) \
+ uint8_t _f, _e, _d, _c, _b, _a
+#define DECL_BITFIELD7(_a, _b, _c, _d, _e, _f, _g) \
+ uint8_t _g, _f, _e, _d, _c, _b, _a
+#define DECL_BITFIELD8(_a, _b, _c, _d, _e, _f, _g, _h) \
+ uint8_t _h, _g, _f, _e, _d, _c, _b, _a
+#else
+#error One of _BIT_FIELDS_LTOH or _BIT_FIELDS_HTOL must be defined
+#endif /* _BIT_FIELDS_LTOH */
+
+#if !defined(_KMEMUSER) && !defined(offsetof)
+
+/* avoid any possibility of clashing with <stddef.h> version */
+
+#define offsetof(s, m) ((size_t)(&(((s *)0)->m)))
+#endif
+
+/*
+ * Find highest one bit set.
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0.
+ * High order bit is 31 (or 63 in _LP64 kernel).
+ */
+static __inline int
+highbit(ulong_t i)
+{
+#if defined(HAVE_INLINE_FLSL)
+ return (flsl(i));
+#else
+ int h = 1;
+
+ if (i == 0)
+ return (0);
+#ifdef _LP64
+ if (i & 0xffffffff00000000ul) {
+ h += 32; i >>= 32;
+ }
+#endif
+ if (i & 0xffff0000) {
+ h += 16; i >>= 16;
+ }
+ if (i & 0xff00) {
+ h += 8; i >>= 8;
+ }
+ if (i & 0xf0) {
+ h += 4; i >>= 4;
+ }
+ if (i & 0xc) {
+ h += 2; i >>= 2;
+ }
+ if (i & 0x2) {
+ h += 1;
+ }
+ return (h);
+#endif
+}
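[Editor's illustration — not part of the imported diff.] highbit() returns the 1-based index of the most significant set bit, and 0 when no bit is set: highbit(1) == 1, highbit(0x80) == 8, highbit(4096) == 13. That makes highbit(x) - 1 a floor of log2 for nonzero x (sketch):

static int
example_log2_floor(ulong_t x)
{
    ASSERT(x != 0);
    return (highbit(x) - 1);    /* e.g. 4096 -> 12 */
}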
+ * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + */ +static __inline int +highbit64(uint64_t i) +{ +#if defined(HAVE_INLINE_FLSLL) + return (flsll(i)); +#else + int h = 1; + + if (i == 0) + return (0); + if (i & 0xffffffff00000000ULL) { + h += 32; i >>= 32; + } + if (i & 0xffff0000) { + h += 16; i >>= 16; + } + if (i & 0xff00) { + h += 8; i >>= 8; + } + if (i & 0xf0) { + h += 4; i >>= 4; + } + if (i & 0xc) { + h += 2; i >>= 2; + } + if (i & 0x2) { + h += 1; + } + return (h); +#endif +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSMACROS_H */ diff --git a/include/os/freebsd/spl/sys/systeminfo.h b/include/os/freebsd/spl/sys/systeminfo.h new file mode 100644 index 000000000000..4028cd7cc6fd --- /dev/null +++ b/include/os/freebsd/spl/sys/systeminfo.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_SYSTEMINFO_H_ +#define _SYS_SYSTEMINFO_H_ + +#define HW_HOSTID_LEN 11 + +#endif /* !_SYS_SYSTEMINFO_H_ */ diff --git a/include/os/freebsd/spl/sys/systm.h b/include/os/freebsd/spl/sys/systm.h new file mode 100644 index 000000000000..98ee9557527c --- /dev/null +++ b/include/os/freebsd/spl/sys/systm.h @@ -0,0 +1,43 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_SYSTM_H_ +#define _OPENSOLARIS_SYS_SYSTM_H_ + +#include +#include_next + +#include + +#define PAGESIZE PAGE_SIZE +#define PAGEOFFSET (PAGESIZE - 1) +#define PAGEMASK (~PAGEOFFSET) + +#define delay(x) pause("soldelay", (x)) + +#endif /* _OPENSOLARIS_SYS_SYSTM_H_ */ diff --git a/include/os/freebsd/spl/sys/taskq.h b/include/os/freebsd/spl/sys/taskq.h new file mode 100644 index 000000000000..200f9e838b8d --- /dev/null +++ b/include/os/freebsd/spl/sys/taskq.h @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TASKQ_H +#define _SYS_TASKQ_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define TASKQ_NAMELEN 31 + +typedef struct taskq { + struct taskqueue *tq_queue; +} taskq_t; + +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +typedef struct taskq_ent { + struct task tqent_task; + struct timeout_task tqent_timeout_task; + task_func_t *tqent_func; + void *tqent_arg; + taskqid_t tqent_id; + CK_LIST_ENTRY(taskq_ent) tqent_hash; + uint8_t tqent_type; + uint8_t tqent_registered; + uint8_t tqent_cancelled; + volatile uint32_t tqent_rc; +} taskq_ent_t; + +/* + * Public flags for taskq_create(): bit range 0-15 + */ +#define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. 
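+ *
+ * A minimal dispatch sketch (illustrative only; my_cb and my_arg are
+ * hypothetical):
+ *
+ *	taskqid_t id = taskq_dispatch(tq, my_cb, my_arg, TQ_NOSLEEP);
+ *	if (id == TASKQID_INVALID)
+ *		... handle dispatch failure (possible with TQ_NOSLEEP) ...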
+ */ +#define TQ_SLEEP 0x00 /* Can block for memory */ +#define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ + +#define TASKQID_INVALID ((taskqid_t)0) + +#define taskq_init_ent(x) +extern taskq_t *system_taskq; +/* Global dynamic task queue for long delay */ +extern taskq_t *system_delay_taskq; + +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, + uint_t, clock_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); +taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + struct proc *, uint_t); +taskq_t *taskq_create_sysdc(const char *, int, int, int, + struct proc *, uint_t, uint_t); +void nulltask(void *); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_wait_outstanding(taskq_t *, taskqid_t); +extern void taskq_wait(taskq_t *); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_member(taskq_t *, kthread_t *); +extern taskq_t *taskq_of_curthread(void); +void taskq_suspend(taskq_t *); +int taskq_suspended(taskq_t *); +void taskq_resume(taskq_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TASKQ_H */ diff --git a/include/os/freebsd/spl/sys/thread.h b/include/os/freebsd/spl/sys/thread.h new file mode 100644 index 000000000000..4fb1a542f55f --- /dev/null +++ b/include/os/freebsd/spl/sys/thread.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SPL_THREAD_H_ +#define _SPL_THREAD_H_ + +#define getcomm() curthread->td_name +#define getpid() curthread->td_tid +#endif diff --git a/include/os/freebsd/spl/sys/time.h b/include/os/freebsd/spl/sys/time.h new file mode 100644 index 000000000000..fbc679aacf93 --- /dev/null +++ b/include/os/freebsd/spl/sys/time.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_TIME_H_ +#define _OPENSOLARIS_SYS_TIME_H_ +#pragma once +#include_next +#include +#ifndef _SYS_KERNEL_H_ +extern int hz; +#endif + +#define SEC 1 +#define MILLISEC 1000UL +#define MICROSEC 1000000UL +#define NANOSEC 1000000000UL +#define TIME_MAX LLONG_MAX + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + +typedef longlong_t hrtime_t; + +#if defined(__i386__) || defined(__powerpc__) +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < INT32_MIN || (ts)->tv_sec > INT32_MAX) +#else +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < INT64_MIN || (ts)->tv_sec > INT64_MAX) +#endif + +#define SEC_TO_TICK(sec) ((sec) * hz) +#define NSEC_TO_TICK(nsec) ((nsec) / (NANOSEC / hz)) + +static __inline hrtime_t +gethrtime(void) +{ + struct timespec ts; + hrtime_t nsec; + + nanouptime(&ts); + nsec = ((hrtime_t)ts.tv_sec * NANOSEC) + ts.tv_nsec; + return (nsec); +} + +#define gethrestime_sec() (time_second) +#define gethrestime(ts) getnanotime(ts) +#define gethrtime_waitfree() gethrtime() + +extern int nsec_per_tick; /* nanoseconds per clock tick */ + +#define ddi_get_lbolt64() \ + (int64_t)(((getsbinuptime() >> 16) * hz) >> 16) +#define ddi_get_lbolt() (clock_t)ddi_get_lbolt64() + +#else + +static __inline hrtime_t +gethrtime(void) +{ + struct timespec ts; + clock_gettime(CLOCK_UPTIME, &ts); + return (((u_int64_t)ts.tv_sec) * NANOSEC + ts.tv_nsec); +} +#endif /* !_OPENSOLARIS_SYS_TIME_H_ */ diff --git a/include/os/freebsd/spl/sys/timer.h b/include/os/freebsd/spl/sys/timer.h new 
file mode 100644 index 000000000000..d4694bb7c09c --- /dev/null +++ b/include/os/freebsd/spl/sys/timer.h @@ -0,0 +1,38 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_TIMER_H_ +#define _SPL_TIMER_H_ +#define ddi_time_after(a, b) ((a) > (b)) +#define ddi_time_after64(a, b) ((a) > (b)) +#define usleep_range(wakeup, wakeupepsilon) \ + pause_sbt("usleep_range", ustosbt(wakeup), \ + ustosbt(wakeupepsilon - wakeup), 0) + +#define schedule() pause("schedule", 1) +#endif diff --git a/include/os/freebsd/spl/sys/trace.h b/include/os/freebsd/spl/sys/trace.h new file mode 100644 index 000000000000..d9639d27b60e --- /dev/null +++ b/include/os/freebsd/spl/sys/trace.h @@ -0,0 +1 @@ +/* keep me */ diff --git a/include/os/freebsd/spl/sys/trace_zfs.h b/include/os/freebsd/spl/sys/trace_zfs.h new file mode 100644 index 000000000000..d9639d27b60e --- /dev/null +++ b/include/os/freebsd/spl/sys/trace_zfs.h @@ -0,0 +1 @@ +/* keep me */ diff --git a/include/os/freebsd/spl/sys/types.h b/include/os/freebsd/spl/sys/types.h new file mode 100644 index 000000000000..3f895362881a --- /dev/null +++ b/include/os/freebsd/spl/sys/types.h @@ -0,0 +1,108 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_SYS_TYPES_H_ +#define _SPL_SYS_TYPES_H_ + +#pragma once +/* + * This is a bag of dirty hacks to keep things compiling. + */ +#include_next + +#ifdef __ILP32__ +typedef __uint64_t u_longlong_t; +typedef __int64_t longlong_t; +#else +typedef unsigned long long u_longlong_t; +typedef long long longlong_t; +#endif +#include + +#define _CLOCK_T_DECLARED + +#include +#include +#include + +#define MAXNAMELEN 256 + + + +typedef void zfs_kernel_param_t; + +typedef struct timespec timestruc_t; +typedef struct timespec timespec_t; +typedef struct timespec inode_timespec_t; +/* BEGIN CSTYLED */ +typedef u_int uint_t; +typedef u_char uchar_t; +typedef u_short ushort_t; +typedef u_long ulong_t; +typedef u_int minor_t; +/* END CSTYLED */ +#ifndef _OFF64_T_DECLARED +#define _OFF64_T_DECLARED +typedef off_t off64_t; +#endif +typedef id_t taskid_t; +typedef id_t projid_t; +typedef id_t poolid_t; +typedef uint_t zoneid_t; +typedef id_t ctid_t; +typedef mode_t o_mode_t; +typedef uint64_t pgcnt_t; + +#define B_FALSE 0 +#define B_TRUE 1 + +typedef short index_t; +typedef off_t offset_t; +#ifndef _PTRDIFF_T_DECLARED +typedef __ptrdiff_t ptrdiff_t; /* pointer difference */ +#define _PTRDIFF_T_DECLARED +#endif +typedef int64_t rlim64_t; +typedef int major_t; + +#else +#ifdef NEED_SOLARIS_BOOLEAN +#if defined(__XOPEN_OR_POSIX) +typedef enum { _B_FALSE, _B_TRUE } boolean_t; +#else +typedef enum { B_FALSE, B_TRUE } boolean_t; +#endif /* defined(__XOPEN_OR_POSIX) */ +#endif + +typedef u_longlong_t u_offset_t; +typedef u_longlong_t len_t; + +typedef longlong_t diskaddr_t; + +#include +#endif /* !_OPENSOLARIS_SYS_TYPES_H_ */ diff --git a/include/os/freebsd/spl/sys/types32.h b/include/os/freebsd/spl/sys/types32.h new file mode 100644 index 000000000000..907b667e5d80 --- /dev/null +++ b/include/os/freebsd/spl/sys/types32.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* _SPL_TYPES32_H */ diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h new file mode 100644 index 000000000000..2b4357a30b09 --- /dev/null +++ b/include/os/freebsd/spl/sys/uio.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_UIO_H_ +#define _OPENSOLARIS_SYS_UIO_H_ + +#include_next +#include +#include + + + +#define uio_loffset uio_offset + +typedef struct uio uio_t; +typedef struct iovec iovec_t; +typedef enum uio_seg uio_seg_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY +} xuio_type_t; + +typedef struct xuio { + uio_t xu_uio; + + /* Extended uio fields */ + enum xuio_type xu_type; /* What kind of uio structure? 
*/ + union { + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +static __inline int +zfs_uiomove(void *cp, size_t n, enum uio_rw dir, uio_t *uio) +{ + + ASSERT(uio->uio_rw == dir); + return (uiomove(cp, (int)n, uio)); +} +#define uiomove(cp, n, dir, uio) zfs_uiomove((cp), (n), (dir), (uio)) + +int uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes); +void uioskip(uio_t *uiop, size_t n); + +#define uio_segflg(uio) (uio)->uio_segflg +#define uio_offset(uio) (uio)->uio_loffset +#define uio_resid(uio) (uio)->uio_resid +#define uio_iovcnt(uio) (uio)->uio_iovcnt +#define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len +#define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base + +static inline void +uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) +{ + *base = uio_iovbase(uio, idx); + *len = uio_iovlen(uio, idx); +} + +static inline void +uio_advance(uio_t *uio, size_t size) +{ + uio->uio_resid -= size; + uio->uio_loffset += size; +} + +static inline offset_t +uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) +{ + *vec_idx = 0; + while (*vec_idx < uio_iovcnt(uio) && off >= uio_iovlen(uio, *vec_idx)) { + off -= uio_iovlen(uio, *vec_idx); + (*vec_idx)++; + } + + return (off); +} + +#endif /* !_OPENSOLARIS_SYS_UIO_H_ */ diff --git a/include/os/freebsd/spl/sys/uuid.h b/include/os/freebsd/spl/sys/uuid.h new file mode 100644 index 000000000000..26d46e8d6214 --- /dev/null +++ b/include/os/freebsd/spl/sys/uuid.h @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UUID_H +#define _SYS_UUID_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The copyright in this file is taken from the original Leach + * & Salz UUID specification, from which this implementation + * is derived. + */ + +/* + * Copyright (c) 1990- 1993, 1996 Open Software Foundation, Inc. + * Copyright (c) 1989 by Hewlett-Packard Company, Palo Alto, Ca. & + * Digital Equipment Corporation, Maynard, Mass. Copyright (c) 1998 + * Microsoft. 
To anyone who acknowledges that this file is provided + * "AS IS" without any express or implied warranty: permission to use, + * copy, modify, and distribute this file for any purpose is hereby + * granted without fee, provided that the above copyright notices and + * this notice appears in all source code copies, and that none of the + * names of Open Software Foundation, Inc., Hewlett-Packard Company, + * or Digital Equipment Corporation be used in advertising or + * publicity pertaining to distribution of the software without + * specific, written prior permission. Neither Open Software + * Foundation, Inc., Hewlett-Packard Company, Microsoft, nor Digital + * Equipment Corporation makes any representations about the + * suitability of this software for any purpose. + */ + +#include +#include + +typedef struct { + uint8_t nodeID[6]; +} uuid_node_t; + +/* + * The uuid type used throughout when referencing uuids themselves + */ +typedef struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint8_t clock_seq_hi_and_reserved; + uint8_t clock_seq_low; + uint8_t node_addr[6]; +} uuid_t; + +#define UUID_PRINTABLE_STRING_LENGTH 37 + +/* + * Convert a uuid to/from little-endian format + */ +#define UUID_LE_CONVERT(dest, src) \ +{ \ + (dest) = (src); \ + (dest).time_low = LE_32((dest).time_low); \ + (dest).time_mid = LE_16((dest).time_mid); \ + (dest).time_hi_and_version = LE_16((dest).time_hi_and_version); \ +} + +static __inline int +uuid_is_null(const caddr_t uuid) +{ + return (0); +} +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UUID_H */ diff --git a/include/os/freebsd/spl/sys/vfs.h b/include/os/freebsd/spl/sys/vfs.h new file mode 100644 index 000000000000..a432f6c56739 --- /dev/null +++ b/include/os/freebsd/spl/sys/vfs.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VFS_H_ +#define _OPENSOLARIS_SYS_VFS_H_ + +#include +#include +#include + +#define rootdir rootvnode + +struct thread; +struct vnode; +typedef struct mount vfs_t; + +typedef int umode_t; + +#define vfs_flag mnt_flag +#define vfs_data mnt_data +#define vfs_count mnt_ref +#define vfs_fsid mnt_stat.f_fsid +#define vfs_bsize mnt_stat.f_bsize +#define vfs_resource mnt_stat.f_mntfromname + +#define v_flag v_vflag +#define v_vfsp v_mount + +#define VFS_RDONLY MNT_RDONLY +#define VFS_NOSETUID MNT_NOSUID +#define VFS_NOEXEC MNT_NOEXEC + +#define fs_vscan(vp, cr, async) (0) + +#define VROOT VV_ROOT + +#define XU_NGROUPS 16 + +/* + * Structure defining a mount option for a filesystem. + * option names are found in mntent.h + */ +typedef struct mntopt { + char *mo_name; /* option name */ + char **mo_cancel; /* list of options cancelled by this one */ + char *mo_arg; /* argument string for this option */ + int mo_flags; /* flags for this mount option */ + void *mo_data; /* filesystem specific data */ +} mntopt_t; + +/* + * Flags that apply to mount options + */ + +#define MO_SET 0x01 /* option is set */ +#define MO_NODISPLAY 0x02 /* option not listed in mnttab */ +#define MO_HASVALUE 0x04 /* option takes a value */ +#define MO_IGNORE 0x08 /* option ignored by parser */ +#define MO_DEFAULT MO_SET /* option is on by default */ +#define MO_TAG 0x10 /* flags a tag set by user program */ +#define MO_EMPTY 0x20 /* empty space in option table */ + +#define VFS_NOFORCEOPT 0x01 /* honor MO_IGNORE (don't set option) */ +#define VFS_DISPLAY 0x02 /* Turn off MO_NODISPLAY bit for opt */ +#define VFS_NODISPLAY 0x04 /* Turn on MO_NODISPLAY bit for opt */ +#define VFS_CREATEOPT 0x08 /* Create the opt if it's not there */ + +/* + * Structure holding mount option strings for the mounted file system. + */ +typedef struct mntopts { + uint_t mo_count; /* number of entries in table */ + mntopt_t *mo_list; /* list of mount options */ +} mntopts_t; + +void vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused); +void vfs_clearmntopt(vfs_t *vfsp, const char *name); +int vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp); +int mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, + char *fspath, char *fspec, int fsflags); + +typedef uint64_t vfs_feature_t; + +#define VFSFT_XVATTR 0x100000001 /* Supports xvattr for attrs */ +#define VFSFT_CASEINSENSITIVE 0x100000002 /* Supports case-insensitive */ +#define VFSFT_NOCASESENSITIVE 0x100000004 /* NOT case-sensitive */ +#define VFSFT_DIRENTFLAGS 0x100000008 /* Supports dirent flags */ +#define VFSFT_ACLONCREATE 0x100000010 /* Supports ACL on create */ +#define VFSFT_ACEMASKONACCESS 0x100000020 /* Can use ACEMASK for access */ +#define VFSFT_SYSATTR_VIEWS 0x100000040 /* Supports sysattr view i/f */ +#define VFSFT_ACCESS_FILTER 0x100000080 /* dirents filtered by access */ +#define VFSFT_REPARSE 0x100000100 /* Supports reparse point */ +#define VFSFT_ZEROCOPY_SUPPORTED 0x100000200 + /* Support loaning /returning cache buffer */ + +#define vfs_set_feature(vfsp, feature) do { } while (0) +#define vfs_clear_feature(vfsp, feature) do { } while (0) +#define vfs_has_feature(vfsp, feature) (0) + +#include +#endif /* _OPENSOLARIS_SYS_VFS_H_ */ diff --git a/include/os/freebsd/spl/sys/vm.h b/include/os/freebsd/spl/sys/vm.h new file mode 100644 index 000000000000..7b3830be8a57 --- /dev/null +++ b/include/os/freebsd/spl/sys/vm.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2013 EMC Corp. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VM_H_ +#define _OPENSOLARIS_SYS_VM_H_ + +#include + +extern const int zfs_vm_pagerret_bad; +extern const int zfs_vm_pagerret_error; +extern const int zfs_vm_pagerret_ok; +extern const int zfs_vm_pagerput_sync; +extern const int zfs_vm_pagerput_inval; + +void zfs_vmobject_assert_wlocked(vm_object_t object); +void zfs_vmobject_wlock(vm_object_t object); +void zfs_vmobject_wunlock(vm_object_t object); + +#if __FreeBSD_version >= 1300081 +#define zfs_vmobject_assert_wlocked_12(x) +#define zfs_vmobject_wlock_12(x) +#define zfs_vmobject_wunlock_12(x) +#else +#define zfs_vmobject_assert_wlocked_12(x) \ + zfs_vmobject_assert_wlocked((x)) +#define zfs_vmobject_wlock_12(x) \ + zfs_vmobject_wlock(x) +#define zfs_vmobject_wunlock_12(x) \ + zfs_vmobject_wunlock(x) +#define vm_page_grab_unlocked(obj, idx, flags) \ + vm_page_grab((obj), (idx), (flags)) +#define vm_page_grab_valid_unlocked(m, obj, idx, flags) \ + vm_page_grab_valid((m), (obj), (idx), (flags)) +#endif +static inline caddr_t +zfs_map_page(vm_page_t pp, struct sf_buf **sfp) +{ + *sfp = sf_buf_alloc(pp, 0); + return ((caddr_t)sf_buf_kva(*sfp)); +} + +static inline void +zfs_unmap_page(struct sf_buf *sf) +{ + sf_buf_free(sf); +} + +#endif /* _OPENSOLARIS_SYS_VM_H_ */ diff --git a/include/os/freebsd/spl/sys/vmsystm.h b/include/os/freebsd/spl/sys/vmsystm.h new file mode 100644 index 000000000000..0db34bbe438b --- /dev/null +++ b/include/os/freebsd/spl/sys/vmsystm.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SPL_VMSYSTM_H_ +#define _SPL_VMSYSTM_H_ + +#define xcopyout copyout + +#endif diff --git a/include/os/freebsd/spl/sys/vnode.h b/include/os/freebsd/spl/sys/vnode.h new file mode 100644 index 000000000000..6a6146132765 --- /dev/null +++ b/include/os/freebsd/spl/sys/vnode.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_VNODE_H_ +#define _OPENSOLARIS_SYS_VNODE_H_ + +struct vnode; +struct vattr; +struct xucred; + +typedef struct flock flock64_t; +typedef struct vnode vnode_t; +typedef struct vattr vattr_t; +typedef enum vtype vtype_t; + +#include +#include +#include_next +#include +enum symfollow { NO_FOLLOW = NOFOLLOW }; + +#define NOCRED ((struct ucred *)0) /* no credential available */ +#define F_FREESP 11 /* Free file space */ + +#include +#include +#ifndef IN_BASE +#include_next +#endif +#include +#include +#include +#include +#include +#include +#include + +typedef struct vop_vector vnodeops_t; +#define VOP_FID VOP_VPTOFH +#define vop_fid vop_vptofh +#define vop_fid_args vop_vptofh_args +#define a_fid a_fhp + +#define IS_XATTRDIR(dvp) (0) + +#define v_count v_usecount + +#define rootvfs (rootvnode == NULL ? 
NULL : rootvnode->v_mount) + + +#ifndef IN_BASE +static __inline int +vn_is_readonly(vnode_t *vp) +{ + return (vp->v_mount->mnt_flag & MNT_RDONLY); +} +#endif +#define vn_vfswlock(vp) (0) +#define vn_vfsunlock(vp) do { } while (0) +#define vn_ismntpt(vp) \ + ((vp)->v_type == VDIR && (vp)->v_mountedhere != NULL) +#define vn_mountedvfs(vp) ((vp)->v_mountedhere) +#define vn_has_cached_data(vp) \ + ((vp)->v_object != NULL && \ + (vp)->v_object->resident_page_count > 0) +#define vn_exists(vp) do { } while (0) +#define vn_invalid(vp) do { } while (0) +#define vn_renamepath(tdvp, svp, tnm, lentnm) do { } while (0) +#define vn_free(vp) do { } while (0) +#define vn_matchops(vp, vops) ((vp)->v_op == &(vops)) + +#define VN_HOLD(v) vref(v) +#define VN_RELE(v) vrele(v) +#define VN_URELE(v) vput(v) + +#define vnevent_create(vp, ct) do { } while (0) +#define vnevent_link(vp, ct) do { } while (0) +#define vnevent_remove(vp, dvp, name, ct) do { } while (0) +#define vnevent_rmdir(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_src(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest(vp, dvp, name, ct) do { } while (0) +#define vnevent_rename_dest_dir(vp, ct) do { } while (0) + +#define specvp(vp, rdev, type, cr) (VN_HOLD(vp), (vp)) +#define MANDLOCK(vp, mode) (0) + +/* + * We will use va_spare is place of Solaris' va_mask. + * This field is initialized in zfs_setattr(). + */ +#define va_mask va_spare +/* TODO: va_fileid is shorter than va_nodeid !!! */ +#define va_nodeid va_fileid +/* TODO: This field needs conversion! */ +#define va_nblocks va_bytes +#define va_blksize va_blocksize +#define va_seq va_gen + +#define MAXOFFSET_T OFF_MAX +#define EXCL 0 + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FEXCL O_EXCL +#define FDSYNC FFSYNC +#define FRSYNC FFSYNC +#define FSYNC FFSYNC +#define FOFFMAX 0x00 +#define FIGNORECASE 0x00 + +/* + * Attributes of interest to the caller of setattr or getattr. + */ +#define AT_MODE 0x00002 +#define AT_UID 0x00004 +#define AT_GID 0x00008 +#define AT_FSID 0x00010 +#define AT_NODEID 0x00020 +#define AT_NLINK 0x00040 +#define AT_SIZE 0x00080 +#define AT_ATIME 0x00100 +#define AT_MTIME 0x00200 +#define AT_CTIME 0x00400 +#define AT_RDEV 0x00800 +#define AT_BLKSIZE 0x01000 +#define AT_NBLOCKS 0x02000 +/* 0x04000 */ /* unused */ +#define AT_SEQ 0x08000 +/* + * If AT_XVATTR is set then there are additional bits to process in + * the xvattr_t's attribute bitmap. If this is not set then the bitmap + * MUST be ignored. Note that this bit must be set/cleared explicitly. + * That is, setting AT_ALL will NOT set AT_XVATTR. 
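+ *
+ * A consumer-side check is typically along these lines (sketch):
+ *
+ *	if (vap->va_mask & AT_XVATTR) {
+ *		xvattr_t *xvap = (xvattr_t *)vap;
+ *		... consult xvap's attribute bitmaps ...
+ *	}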
+ */ +#define AT_XVATTR 0x10000 + +#define AT_ALL (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|\ + AT_NLINK|AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|\ + AT_RDEV|AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) + +#define AT_STAT (AT_MODE|AT_UID|AT_GID|AT_FSID|AT_NODEID|AT_NLINK|\ + AT_SIZE|AT_ATIME|AT_MTIME|AT_CTIME|AT_RDEV) + +#define AT_TIMES (AT_ATIME|AT_MTIME|AT_CTIME) + +#define AT_NOSET (AT_NLINK|AT_RDEV|AT_FSID|AT_NODEID|\ + AT_BLKSIZE|AT_NBLOCKS|AT_SEQ) + +#ifndef IN_BASE +static __inline void +vattr_init_mask(vattr_t *vap) +{ + + vap->va_mask = 0; + + if (vap->va_uid != (uid_t)VNOVAL) + vap->va_mask |= AT_UID; + if (vap->va_gid != (gid_t)VNOVAL) + vap->va_mask |= AT_GID; + if (vap->va_size != (u_quad_t)VNOVAL) + vap->va_mask |= AT_SIZE; + if (vap->va_atime.tv_sec != VNOVAL) + vap->va_mask |= AT_ATIME; + if (vap->va_mtime.tv_sec != VNOVAL) + vap->va_mask |= AT_MTIME; + if (vap->va_mode != (uint16_t)VNOVAL) + vap->va_mask |= AT_MODE; + if (vap->va_flags != VNOVAL) + vap->va_mask |= AT_XVATTR; +} +#endif + +#define RLIM64_INFINITY 0 + +static __inline int +vn_rename(char *from, char *to, enum uio_seg seg) +{ + + ASSERT(seg == UIO_SYSSPACE); + + return (kern_renameat(curthread, AT_FDCWD, from, AT_FDCWD, to, seg)); +} + +#include + +#endif /* _OPENSOLARIS_SYS_VNODE_H_ */ diff --git a/include/os/freebsd/spl/sys/vnode_impl.h b/include/os/freebsd/spl/sys/vnode_impl.h new file mode 100644 index 000000000000..94ec1ad4ef6f --- /dev/null +++ b/include/os/freebsd/spl/sys/vnode_impl.h @@ -0,0 +1,268 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2017 RackTop Systems. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_VNODE_IMPL_H +#define _SYS_VNODE_IMPL_H + + +#define IS_DEVVP(vp) \ + ((vp)->v_type == VCHR || (vp)->v_type == VBLK || (vp)->v_type == VFIFO) + +#define V_XATTRDIR 0x0000 /* attribute unnamed directory */ + +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +/* + * The xvattr structure is really a variable length structure that + * is made up of: + * - The classic vattr_t (xva_vattr) + * - a 32 bit quantity (xva_mapsize) that specifies the size of the + * attribute bitmaps in 32 bit words. + * - A pointer to the returned attribute bitmap (needed because the + * previous element, the requested attribute bitmap) is variable length. 
+ * - The requested attribute bitmap, which is an array of 32 bit words. + * Callers use the XVA_SET_REQ() macro to set the bits corresponding to + * the attributes that are being requested. + * - The returned attribute bitmap, which is an array of 32 bit words. + * File systems that support optional attributes use the XVA_SET_RTN() + * macro to set the bits corresponding to the attributes that are being + * returned. + * - The xoptattr_t structure which contains the attribute values + * + * xva_mapsize determines how many words in the attribute bitmaps. + * Immediately following the attribute bitmaps is the xoptattr_t. + * xva_getxoptattr() is used to get the pointer to the xoptattr_t + * section. + */ + +#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ +#define XVA_MAGIC 0x78766174 /* Magic # for verification */ + +/* + * The xvattr structure is an extensible structure which permits optional + * attributes to be requested/returned. File systems may or may not support + * optional attributes. They do so at their own discretion but if they do + * support optional attributes, they must register the VFSFT_XVATTR feature + * so that the optional attributes can be set/retrieved. + * + * The fields of the xvattr structure are: + * + * xva_vattr - The first element of an xvattr is a legacy vattr structure + * which includes the common attributes. If AT_XVATTR is set in the va_mask + * then the entire structure is treated as an xvattr. If AT_XVATTR is not + * set, then only the xva_vattr structure can be used. + * + * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. + * + * xva_mapsize - Size of requested and returned attribute bitmaps. + * + * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the + * size of the array before it, xva_reqattrmap[], could change which means + * the location of xva_rtnattrmap[] could change. This will allow unbundled + * file systems to find the location of xva_rtnattrmap[] when the sizes change. + * + * xva_reqattrmap[] - Array of requested attributes. Attributes are + * represented by a specific bit in a specific element of the attribute + * map array. Callers set the bits corresponding to the attributes + * that the caller wants to get/set. + * + * xva_rtnattrmap[] - Array of attributes that the file system was able to + * process. Not all file systems support all optional attributes. This map + * informs the caller which attributes the underlying file system was able + * to set/get. (Same structure as the requested attributes array in terms + * of each attribute corresponding to specific bits and array elements.) + * + * xva_xoptattrs - Structure containing values of optional attributes. + * These values are only valid if the corresponding bits in xva_reqattrmap + * are set and the underlying file system supports those attributes. + */ + + + +/* + * Attribute bits used in the extensible attribute's (xva's) attribute + * bitmaps. Note that the bitmaps are made up of a variable length number + * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" + * is the element in the bitmap (starting at 1). This convention is for + * the convenience of the maintainer to keep track of which element each + * attribute belongs to. + * + * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS + * MUST USE THE XAT_* DEFINES. 
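+ *
+ * Each XAT_* define below packs the bitmap index into its upper 32 bits
+ * and the attribute bit into its lower 32 bits, so that, e.g.,
+ * XVA_INDEX(XAT_READONLY) == 0 and
+ * XVA_ATTRBIT(XAT_READONLY) == XAT0_READONLY.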
+ */ +#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ +#define XAT0_CREATETIME 0x00000001 /* Create time of file */ +#define XAT0_ARCHIVE 0x00000002 /* Archive */ +#define XAT0_SYSTEM 0x00000004 /* System */ +#define XAT0_READONLY 0x00000008 /* Readonly */ +#define XAT0_HIDDEN 0x00000010 /* Hidden */ +#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ +#define XAT0_IMMUTABLE 0x00000040 /* immutable */ +#define XAT0_APPENDONLY 0x00000080 /* appendonly */ +#define XAT0_NODUMP 0x00000100 /* nodump */ +#define XAT0_OPAQUE 0x00000200 /* opaque */ +#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ +#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ +#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ +#define XAT0_REPARSE 0x00002000 /* FS reparse point */ +#define XAT0_GEN 0x00004000 /* object generation number */ +#define XAT0_OFFLINE 0x00008000 /* offline */ +#define XAT0_SPARSE 0x00010000 /* sparse */ + +/* Support for XAT_* optional attributes */ +#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ +#define XVA_SHFT 32 /* Used to shift index */ + +/* + * Used to pry out the index and attribute bits from the XAT_* attributes + * defined below. Note that we're masking things down to 32 bits then + * casting to uint32_t. + */ +#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) +#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) + +/* + * The following defines present a "flat namespace" so that consumers don't + * need to keep track of which element belongs to which bitmap entry. + * + * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER + */ +#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) +#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) +#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) +#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) +#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) +#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) +#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) +#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) +#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) +#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) +#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) +#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) +#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) +#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) +#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) +#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) +#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) + +/* + * The returned attribute map array (xva_rtnattrmap[]) is located past the + * requested attribute map array (xva_reqattrmap[]). Its location changes + * when the array sizes change. We use a separate pointer in a known location + * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is + * set in xva_init() + */ +#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) + +#define MODEMASK 07777 /* mode bits plus permission bits */ +#define PERMMASK 00777 /* permission bits */ + +/* + * VOP_ACCESS flags + */ +#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ + +/* + * Flags for vnode operations. 
+ */ +enum rm { RMFILE, RMDIRECTORY }; /* rm or rmdir (remove) */ +enum create { CRCREAT, CRMKNOD, CRMKDIR }; /* reason for create */ + +/* + * Structure used by various vnode operations to determine + * the context (pid, host, identity) of a caller. + * + * The cc_caller_id is used to identify one or more callers who invoke + * operations, possibly on behalf of others. For example, the NFS + * server could have it's own cc_caller_id which can be detected by + * vnode/vfs operations or (FEM) monitors on those operations. New + * caller IDs are generated by fs_new_caller_id(). + */ +typedef struct caller_context { + pid_t cc_pid; /* Process ID of the caller */ + int cc_sysid; /* System ID, used for remote calls */ + u_longlong_t cc_caller_id; /* Identifier for (set of) caller(s) */ + ulong_t cc_flags; +} caller_context_t; + +struct taskq; + +/* + * Flags for VOP_LOOKUP + * + * Defined in file.h, but also possible, FIGNORECASE and FSEARCH + * + */ +#define LOOKUP_DIR 0x01 /* want parent dir vp */ +#define LOOKUP_XATTR 0x02 /* lookup up extended attr dir */ +#define CREATE_XATTR_DIR 0x04 /* Create extended attr dir */ +#define LOOKUP_HAVE_SYSATTR_DIR 0x08 /* Already created virtual GFS dir */ + +/* + * Flags for VOP_READDIR + */ +#define V_RDDIR_ENTFLAGS 0x01 /* request dirent flags */ +#define V_RDDIR_ACCFILTER 0x02 /* filter out inaccessible dirents */ + +/* + * Public vnode manipulation functions. + */ + +void vn_rele_async(struct vnode *vp, struct taskq *taskq); + +#define VN_RELE_ASYNC(vp, taskq) { \ + vn_rele_async(vp, taskq); \ +} + +/* + * Flags to VOP_SETATTR/VOP_GETATTR. + */ +#define ATTR_UTIME 0x01 /* non-default utime(2) request */ +#define ATTR_EXEC 0x02 /* invocation from exec(2) */ +#define ATTR_COMM 0x04 /* yield common vp attributes */ +#define ATTR_HINT 0x08 /* information returned will be `hint' */ +#define ATTR_REAL 0x10 /* yield attributes of the real vp */ +#define ATTR_NOACLCHECK 0x20 /* Don't check ACL when checking permissions */ +#define ATTR_TRIGGER 0x40 /* Mount first if vnode is a trigger mount */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VNODE_H */ diff --git a/include/os/freebsd/spl/sys/zmod.h b/include/os/freebsd/spl/sys/zmod.h new file mode 100644 index 000000000000..ba0267203ce3 --- /dev/null +++ b/include/os/freebsd/spl/sys/zmod.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _ZMOD_H +#define _ZMOD_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * zmod - RFC-1950-compatible decompression routines + * + * This file provides the public interfaces to zmod, an in-kernel RFC 1950 + * decompression library. More information about the implementation of these + * interfaces can be found in the usr/src/uts/common/zmod/ directory. + */ + +#define Z_OK 0 +#define Z_STREAM_END 1 +#define Z_NEED_DICT 2 +#define Z_ERRNO (-1) +#define Z_STREAM_ERROR (-2) +#define Z_DATA_ERROR (-3) +#define Z_MEM_ERROR (-4) +#define Z_BUF_ERROR (-5) +#define Z_VERSION_ERROR (-6) + +#define Z_NO_COMPRESSION 0 +#define Z_BEST_SPEED 1 +#define Z_BEST_COMPRESSION 9 +#define Z_DEFAULT_COMPRESSION (-1) + +extern int z_uncompress(void *, size_t *, const void *, size_t); +extern int z_compress(void *, size_t *, const void *, size_t); +extern int z_compress_level(void *, size_t *, const void *, size_t, int); +extern const char *z_strerror(int); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZMOD_H */ diff --git a/include/os/freebsd/spl/sys/zone.h b/include/os/freebsd/spl/sys/zone.h new file mode 100644 index 000000000000..71a28adaf26e --- /dev/null +++ b/include/os/freebsd/spl/sys/zone.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _OPENSOLARIS_SYS_ZONE_H_ +#define _OPENSOLARIS_SYS_ZONE_H_ + +/* + * Macros to help with zone visibility restrictions. + */ + +#define GLOBAL_ZONEID 0 + +/* + * Is thread in the global zone? + */ +#define INGLOBALZONE(p) in_globalzone((p)) + + +extern boolean_t in_globalzone(struct proc *); + +/* + * Attach the given dataset to the given jail. + */ +extern int zone_dataset_attach(struct ucred *, const char *, int); + +/* + * Detach the given dataset to the given jail. + */ +extern int zone_dataset_detach(struct ucred *, const char *, int); + +/* + * Returns true if the named pool/dataset is visible in the current zone. + */ +extern int zone_dataset_visible(const char *, int *); + +/* + * Safely get the hostid of the specified zone (defaults to machine's hostid + * if the specified zone doesn't emulate a hostid). 
Passing NULL retrieves + * the global zone's (i.e., physical system's) hostid. + */ +extern uint32_t zone_get_hostid(void *); + +#endif /* !_OPENSOLARIS_SYS_ZONE_H_ */ diff --git a/include/os/freebsd/zfs/Makefile.am b/include/os/freebsd/zfs/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/include/os/freebsd/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/freebsd/zfs/sys/Makefile.am b/include/os/freebsd/zfs/sys/Makefile.am new file mode 100644 index 000000000000..a8cfa39f3543 --- /dev/null +++ b/include/os/freebsd/zfs/sys/Makefile.am @@ -0,0 +1,14 @@ +KERNEL_H = \ + freebsd_crypto.h \ + sha2.h \ + vdev_os.h \ + zfs_context_os.h \ + zfs_ctldir.h \ + zfs_dir.h \ + zfs_ioctl_compat.h \ + zfs_vfsops.h \ + zfs_vnops.h \ + zfs_znode_impl.h \ + zpl.h + +noinst_HEADERS = $(KERNEL_H) diff --git a/include/os/freebsd/zfs/sys/freebsd_crypto.h b/include/os/freebsd/zfs/sys/freebsd_crypto.h new file mode 100644 index 000000000000..08e058d6affa --- /dev/null +++ b/include/os/freebsd/zfs/sys/freebsd_crypto.h @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2018 Sean Eric Fagan + * Portions Copyright (c) 2005-2011 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Portions of this file were taken from GELI's implementation of hmac. + * + * $FreeBSD$ + */ + +#ifndef _ZFS_FREEBSD_CRYPTO_H +#define _ZFS_FREEBSD_CRYPTO_H + +#include +#include +#include +#include +#include + +#define SUN_CKM_AES_CCM "CKM_AES_CCM" +#define SUN_CKM_AES_GCM "CKM_AES_GCM" +#define SUN_CKM_SHA512_HMAC "CKM_SHA512_HMAC" + +#define CRYPTO_KEY_RAW 1 + +#define CRYPTO_BITS2BYTES(n) ((n) == 0 ? 0 : (((n) - 1) >> 3) + 1) +#define CRYPTO_BYTES2BITS(n) ((n) << 3) + +struct zio_crypt_info; + +typedef struct freebsd_crypt_session { + struct mtx fs_lock; + crypto_session_t fs_sid; + boolean_t fs_done; +} freebsd_crypt_session_t; + +/* + * Unused types to minimize code differences. + */ +typedef void *crypto_mechanism_t; +typedef void *crypto_ctx_template_t; +/* + * Unlike the ICP crypto_key type, this only + * supports (the equivalent of + * CRYPTO_KEY_RAW). 
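The CRYPTO_BITS2BYTES()/CRYPTO_BYTES2BITS() macros in freebsd_crypto.h above round a bit count up to whole bytes and back. A quick sanity check of the arithmetic, using C11 _Static_assert purely for illustration:

    _Static_assert(CRYPTO_BITS2BYTES(0) == 0, "zero stays zero");
    _Static_assert(CRYPTO_BITS2BYTES(1) == 1, "partial byte rounds up");
    _Static_assert(CRYPTO_BITS2BYTES(256) == 32, "256-bit key is 32 bytes");
    _Static_assert(CRYPTO_BYTES2BITS(32) == 256, "inverse direction");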
+ */ +typedef struct crypto_key { + int ck_format; /* Unused, but minimizes code diff */ + void *ck_data; + size_t ck_length; +} crypto_key_t; + +typedef struct hmac_ctx { + SHA512_CTX innerctx; + SHA512_CTX outerctx; +} *crypto_context_t; + +/* + * The only algorithm ZFS uses for hashing is SHA512_HMAC. + */ +void crypto_mac(const crypto_key_t *key, const void *in_data, + size_t in_data_size, void *out_data, size_t out_data_size); +void crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *key); +void crypto_mac_update(struct hmac_ctx *ctx, const void *data, + size_t data_size); +void crypto_mac_final(struct hmac_ctx *ctx, void *out_data, + size_t out_data_size); + +int freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *, crypto_key_t *); +void freebsd_crypt_freesession(freebsd_crypt_session_t *sessp); + +int freebsd_crypt_uio(boolean_t, freebsd_crypt_session_t *, + struct zio_crypt_info *, uio_t *, crypto_key_t *, uint8_t *, + size_t, size_t); + +#endif /* _ZFS_FREEBSD_CRYPTO_H */ diff --git a/include/os/freebsd/zfs/sys/sha2.h b/include/os/freebsd/zfs/sys/sha2.h new file mode 100644 index 000000000000..e3923e4ca37c --- /dev/null +++ b/include/os/freebsd/zfs/sys/sha2.h @@ -0,0 +1,200 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. */ + +#ifndef _SYS_SHA2_H +#define _SYS_SHA2_H + +#include /* for uint_* */ + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA2_HMAC_MIN_KEY_LEN 1 /* SHA2-HMAC min key length in bytes */ +#define SHA2_HMAC_MAX_KEY_LEN INT_MAX /* SHA2-HMAC max key length in bytes */ + +#define SHA256_DIGEST_LENGTH 32 /* SHA256 digest length in bytes */ +#define SHA384_DIGEST_LENGTH 48 /* SHA384 digest length in bytes */ +#define SHA512_DIGEST_LENGTH 64 /* SHA512 digest length in bytes */ + +/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */ +#define SHA512_224_DIGEST_LENGTH 28 /* SHA512/224 digest length */ +#define SHA512_256_DIGEST_LENGTH 32 /* SHA512/256 digest length */ + +#define SHA256_HMAC_BLOCK_SIZE 64 /* SHA256-HMAC block size */ +#define SHA512_HMAC_BLOCK_SIZE 128 /* SHA512-HMAC block size */ + +#define SHA256 0 +#define SHA256_HMAC 1 +#define SHA256_HMAC_GEN 2 +#define SHA384 3 +#define SHA384_HMAC 4 +#define SHA384_HMAC_GEN 5 +#define SHA512 6 +#define SHA512_HMAC 7 +#define SHA512_HMAC_GEN 8 +#define SHA512_224 9 +#define SHA512_256 10 + +/* + * SHA2 context. + * The contents of this structure are a private interface between the + * Init/Update/Final calls of the functions defined below. 
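The crypto_mac_init()/crypto_mac_update()/crypto_mac_final() trio declared in freebsd_crypto.h above is the usual streaming-HMAC pattern. A sketch, assuming caller-supplied keybuf/keylen and two data buffers, and assuming ck_length is expressed in bits as in the ICP convention:

    struct hmac_ctx ctx;
    uint8_t digest[SHA512_DIGEST_LENGTH];
    crypto_key_t key = {
        .ck_format = CRYPTO_KEY_RAW,
        .ck_data = keybuf,                      /* assumed */
        .ck_length = CRYPTO_BYTES2BITS(keylen), /* bits, not bytes */
    };

    crypto_mac_init(&ctx, &key);
    crypto_mac_update(&ctx, hdr, hdrlen);       /* discontiguous input is fine */
    crypto_mac_update(&ctx, payload, paylen);
    crypto_mac_final(&ctx, digest, sizeof (digest));

Since the only hashing algorithm ZFS uses here is SHA512-HMAC, no mechanism argument is needed.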
+ * Callers must never attempt to read or write any of the fields + * in this structure directly. + */ + +#include +#include +#include +#include +typedef struct { + uint32_t algotype; /* Algorithm Type */ + union { + SHA256_CTX SHA256_ctx; + SHA384_CTX SHA384_ctx; + SHA512_CTX SHA512_ctx; + }; +} SHA2_CTX; + +extern void SHA256Init(SHA256_CTX *); + +extern void SHA256Update(SHA256_CTX *, const void *, size_t); + +extern void SHA256Final(void *, SHA256_CTX *); + +extern void SHA384Init(SHA384_CTX *); + +extern void SHA384Update(SHA384_CTX *, const void *, size_t); + +extern void SHA384Final(void *, SHA384_CTX *); + +extern void SHA512Init(SHA512_CTX *); + +extern void SHA512Update(SHA512_CTX *, const void *, size_t); + +extern void SHA512Final(void *, SHA512_CTX *); + + +static inline void +SHA2Init(uint64_t mech, SHA2_CTX *c) +{ + switch (mech) { + case SHA256: + SHA256_Init(&c->SHA256_ctx); + break; + case SHA384: + SHA384_Init(&c->SHA384_ctx); + break; + case SHA512: + SHA512_Init(&c->SHA512_ctx); + break; + case SHA512_256: + SHA512_256_Init(&c->SHA512_ctx); + break; + default: + panic("unknown mechanism %ju", (uintmax_t)mech); + } + c->algotype = (uint32_t)mech; +} + +static inline void +SHA2Update(SHA2_CTX *c, const void *p, size_t s) +{ + switch (c->algotype) { + case SHA256: + SHA256_Update(&c->SHA256_ctx, p, s); + break; + case SHA384: + SHA384_Update(&c->SHA384_ctx, p, s); + break; + case SHA512: + SHA512_Update(&c->SHA512_ctx, p, s); + break; + case SHA512_256: + SHA512_256_Update(&c->SHA512_ctx, p, s); + break; + default: + panic("unknown mechanism %d", c->algotype); + } +} + +static inline void +SHA2Final(void *p, SHA2_CTX *c) +{ + switch (c->algotype) { + case SHA256: + SHA256_Final(p, &c->SHA256_ctx); + break; + case SHA384: + SHA384_Final(p, &c->SHA384_ctx); + break; + case SHA512: + SHA512_Final(p, &c->SHA512_ctx); + break; + case SHA512_256: + SHA512_256_Final(p, &c->SHA512_ctx); + break; + default: + panic("unknown mechanism %d", c->algotype); + } +} + +#ifdef _SHA2_IMPL +/* + * The following types/functions are all private to the implementation + * of the SHA2 functions and must not be used by consumers of the interface + */ + +/* + * List of support mechanisms in this module. 
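The SHA2Init()/SHA2Update()/SHA2Final() wrappers above dispatch on the mechanism recorded in the context, so a one-shot digest looks the same for every supported algorithm. A minimal sketch, with buf/buflen assumed:

    SHA2_CTX ctx;
    uint8_t digest[SHA512_256_DIGEST_LENGTH];

    SHA2Init(SHA512_256, &ctx);
    SHA2Update(&ctx, buf, buflen);  /* may be called any number of times */
    SHA2Final(digest, &ctx);

Passing a mechanism outside the switch (for example SHA384_HMAC) panics, which is intentional: only the plain digest mechanisms are routed through these wrappers.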
+ * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or additional mechanisms should be done + * carefully + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/os/freebsd/zfs/sys/vdev_os.h b/include/os/freebsd/zfs/sys/vdev_os.h new file mode 100644 index 000000000000..59da954b90e6 --- /dev/null +++ b/include/os/freebsd/zfs/sys/vdev_os.h @@ -0,0 +1,33 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_VDEV_OS_H +#define _SYS_VDEV_OS_H + +extern int vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size); +extern int vdev_geom_read_pool_label(const char *name, nvlist_t ***configs, + uint64_t *count); + +#endif diff --git a/include/os/freebsd/zfs/sys/zfs_context_os.h b/include/os/freebsd/zfs/sys/zfs_context_os.h new file mode 100644 index 000000000000..0a2f0bfaaa70 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_context_os.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $FreeBSD$
+ */
+
+#ifndef ZFS_CONTEXT_OS_H_
+#define ZFS_CONTEXT_OS_H_
+
+#include
+#include
+#include
+#include_next
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define cv_wait_io(cv, mp) cv_wait(cv, mp)
+#define cv_wait_io_sig(cv, mp) cv_wait_sig(cv, mp)
+
+#define cond_resched() kern_yield(PRI_USER)
+
+#define taskq_create_sysdc(a, b, d, e, p, dc, f) \
+ (taskq_create(a, b, maxclsyspri, d, e, f))
+
+#define tsd_create(keyp, destructor) do { \
+ *(keyp) = osd_thread_register((destructor)); \
+ KASSERT(*(keyp) > 0, ("cannot register OSD")); \
+} while (0)
+
+#define tsd_destroy(keyp) osd_thread_deregister(*(keyp))
+#define tsd_get(key) osd_thread_get(curthread, (key))
+#define tsd_set(key, value) osd_thread_set(curthread, (key), (value))
+#define fm_panic panic
+
+extern int zfs_debug_level;
+extern struct mtx zfs_debug_mtx;
+#define ZFS_LOG(lvl, ...) do { \
+ if (((lvl) & 0xff) <= zfs_debug_level) { \
+ mtx_lock(&zfs_debug_mtx); \
+ printf("%s:%u[%d]: ", \
+ __func__, __LINE__, (lvl)); \
+ printf(__VA_ARGS__); \
+ printf("\n"); \
+ if ((lvl) & 0x100) \
+ kdb_backtrace(); \
+ mtx_unlock(&zfs_debug_mtx); \
+ } \
+} while (0)
+
+#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz))
+extern int hz;
+extern int tick;
+typedef int fstrans_cookie_t;
+#define spl_fstrans_mark() (0)
+#define spl_fstrans_unmark(x) (x = 0)
+#define signal_pending(x) SIGPENDING(x)
+#define current curthread
+#define thread_join(x)
+typedef struct opensolaris_utsname utsname_t;
+extern utsname_t *utsname(void);
+extern int spa_import_rootpool(const char *name, bool checkpointrewind);
+#endif
diff --git a/include/os/freebsd/zfs/sys/zfs_ctldir.h b/include/os/freebsd/zfs/sys/zfs_ctldir.h
new file mode 100644
index 000000000000..28a026603f07
--- /dev/null
+++ b/include/os/freebsd/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,65 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ */ + +#ifndef _ZFS_CTLDIR_H +#define _ZFS_CTLDIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_CTLDIR_NAME ".zfs" + +#define zfs_has_ctldir(zdp) \ + ((zdp)->z_id == (zdp)->z_zfsvfs->z_root && \ + ((zdp)->z_zfsvfs->z_ctldir != NULL)) +#define zfs_show_ctldir(zdp) \ + (zfs_has_ctldir(zdp) && \ + ((zdp)->z_zfsvfs->z_show_ctldir)) + +void zfsctl_create(zfsvfs_t *); +void zfsctl_destroy(zfsvfs_t *); +int zfsctl_root(zfsvfs_t *, int, vnode_t **); +void zfsctl_init(void); +void zfsctl_fini(void); +boolean_t zfsctl_is_node(vnode_t *); +int zfsctl_snapshot_unmount(char *snapname, int flags); +int zfsctl_rename_snapshot(const char *from, const char *to); +int zfsctl_destroy_snapshot(const char *snapname, int force); +int zfsctl_umount_snapshots(vfs_t *, int, cred_t *); + +int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); + +#define ZFSCTL_INO_ROOT 0x1 +#define ZFSCTL_INO_SNAPDIR 0x2 + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_CTLDIR_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_dir.h b/include/os/freebsd/zfs/sys/zfs_dir.h new file mode 100644 index 000000000000..f6f8ab5c4e69 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_dir.h @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
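The zfs_has_ctldir()/zfs_show_ctldir() macros in zfs_ctldir.h above separate "the .zfs node exists for this directory" from "it should actually be listed". A hedged sketch of both call sites (dzp is the directory znode; the LK_EXCLUSIVE lock flag and the dirent emission are assumptions, not part of this header):

    /* lookup side: route the ".zfs" name to the control directory */
    if (zfs_has_ctldir(dzp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0)
        error = zfsctl_root(dzp->z_zfsvfs, LK_EXCLUSIVE, &vp);

    /* readdir side: only list ".zfs" when the snapdir property says so */
    if (zfs_show_ctldir(dzp)) {
        /* synthesize a ".zfs" entry with inode ZFSCTL_INO_ROOT */
    }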
+ */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ + +extern int zfs_dirent_lookup(znode_t *, const char *, znode_t **, int); +extern int zfs_link_create(znode_t *, const char *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(znode_t *, const char *, znode_t *, dmu_tx_t *, int, + boolean_t *); +#if 0 +extern int zfs_dirlook(vnode_t *, const char *, vnode_t **, int); +#else +extern int zfs_dirlook(znode_t *, const char *name, znode_t **); +#endif +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, zfs_acl_ids_t *); +extern void zfs_rmnode(znode_t *); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, znode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h b/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h new file mode 100644 index 000000000000..91bc48effe3f --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_ioctl_compat.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2014 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Use is subject to license terms. 
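The zfs_dirent_lock() flag bits in zfs_dir.h above are OR-combined to describe what the caller expects of the entry. For example, a create path wants "must not exist yet" semantics, optionally case-insensitive; a sketch, with case_insensitive assumed to be a local derived from the filesystem's casesensitivity setting:

    znode_t *zp;
    int flags = ZNEW | (case_insensitive ? ZCILOOK : 0);
    int error = zfs_dirent_lookup(dzp, name, &zp, flags);
    /* with ZNEW, a typical failure is EEXIST: the name is already taken */

The complementary ZEXISTS flag inverts the check, failing with ENOENT when the entry is missing.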
+ */ + +#ifndef _SYS_ZFS_IOCTL_COMPAT_H +#define _SYS_ZFS_IOCTL_COMPAT_H + +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Backwards ioctl compatibility + */ + +/* ioctl versions for vfs.zfs.version.ioctl */ +#define ZFS_IOCVER_UNDEF -1 +#define ZFS_IOCVER_NONE 0 +#define ZFS_IOCVER_DEADMAN 1 +#define ZFS_IOCVER_LZC 2 +#define ZFS_IOCVER_ZCMD 3 +#define ZFS_IOCVER_EDBP 4 +#define ZFS_IOCVER_RESUME 5 +#define ZFS_IOCVER_INLANES 6 +#define ZFS_IOCVER_PAD 7 +#define ZFS_IOCVER_LEGACY ZFS_IOCVER_PAD +#define ZFS_IOCVER_OZFS 15 + +/* compatibility conversion flag */ +#define ZFS_CMD_COMPAT_NONE 0 +#define ZFS_CMD_COMPAT_V15 1 +#define ZFS_CMD_COMPAT_V28 2 +#define ZFS_CMD_COMPAT_DEADMAN 3 +#define ZFS_CMD_COMPAT_LZC 4 +#define ZFS_CMD_COMPAT_ZCMD 5 +#define ZFS_CMD_COMPAT_EDBP 6 +#define ZFS_CMD_COMPAT_RESUME 7 +#define ZFS_CMD_COMPAT_INLANES 8 +#define ZFS_CMD_COMPAT_LEGACY 9 + +#define ZFS_IOC_COMPAT_PASS 254 +#define ZFS_IOC_COMPAT_FAIL 255 + +#define ZFS_IOCREQ(ioreq) ((ioreq) & 0xff) + +typedef struct zfs_iocparm { + uint32_t zfs_ioctl_version; + uint64_t zfs_cmd; + uint64_t zfs_cmd_size; +} zfs_iocparm_t; + + +#define LEGACY_MAXPATHLEN 1024 +#define LEGACY_MAXNAMELEN 256 + +/* + * Note: this struct must have the same layout in 32-bit and 64-bit, so + * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit + * kernel. Therefore, we add padding to it so that no "hidden" padding + * is automatically added on 64-bit (but not on 32-bit). + */ +typedef struct zfs_cmd_legacy { + char zc_name[LEGACY_MAXPATHLEN]; /* pool|dataset name */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. 
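The zfs_iocparm_t wrapper above is what actually crosses the user/kernel boundary on FreeBSD; the kernel branches on the declared ioctl version before it knows which command structure to copy in. A simplified sketch of that dispatch (zc_iocparm, legacy, and zc are assumed locals; the copyin/copyout steps are elided):

    if (zc_iocparm.zfs_ioctl_version == ZFS_IOCVER_OZFS) {
        /* modern interface: fetch a zfs_cmd_t of zfs_cmd_size bytes */
    } else if (zc_iocparm.zfs_ioctl_version == ZFS_IOCVER_LEGACY) {
        /* fetch a zfs_cmd_legacy_t, then translate it */
        zfs_cmd_legacy_to_ozfs(&legacy, &zc);
    }

The fixed-width zfs_cmd/zfs_cmd_size fields are what make the wrapper itself layout-stable between 32-bit callers and a 64-bit kernel.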
+ */ + uint64_t zc_history; /* really (char *) */ + char zc_value[LEGACY_MAXPATHLEN * 2]; + char zc_string[LEGACY_MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + uint64_t zc_jailid; + dmu_objset_stats_t zc_objset_stats; + dmu_replay_record_t zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad3[3]; + boolean_t zc_resumable; + uint32_t zc_pad4; + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; +} zfs_cmd_legacy_t; + + +#ifdef _KERNEL +int zfs_ioctl_compat_pre(zfs_cmd_t *, int *, const int); +void zfs_ioctl_compat_post(zfs_cmd_t *, const int, const int); +nvlist_t *zfs_ioctl_compat_innvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +nvlist_t *zfs_ioctl_compat_outnvl(zfs_cmd_t *, nvlist_t *, const int, + const int); +#endif /* _KERNEL */ +int zfs_ioctl_legacy_to_ozfs(int request); +int zfs_ioctl_ozfs_to_legacy(int request); +void zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst); +void zfs_cmd_compat_get(zfs_cmd_t *, caddr_t, const int); +void zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst); + +void zfs_cmd_compat_put(zfs_cmd_t *, caddr_t, const int, const int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_COMPAT_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops.h b/include/os/freebsd/zfs/sys/zfs_vfsops.h new file mode 100644 index 000000000000..70ada204a292 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_vfsops.h @@ -0,0 +1,177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. 
+ */ + +#ifndef _SYS_FS_ZFS_VFSOPS_H +#define _SYS_FS_ZFS_VFSOPS_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zfsvfs zfsvfs_t; +struct znode; + +struct zfsvfs { + vfs_t *z_vfs; /* generic fs struct */ + zfsvfs_t *z_parent; /* parent fs */ + objset_t *z_os; /* objset reference */ + uint64_t z_flags; /* super_block flags */ + uint64_t z_root; /* id of root znode */ + uint64_t z_unlinkedobj; /* id of unlinked zapobj */ + uint64_t z_max_blksz; /* maximum block size for files */ + uint64_t z_fuid_obj; /* fuid table object number */ + uint64_t z_fuid_size; /* fuid table size */ + avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ + avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ + krwlock_t z_fuid_lock; /* fuid lock */ + boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? */ + struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ + zilog_t *z_log; /* intent log pointer */ + uint_t z_acl_mode; /* acl chmod/mode behavior */ + uint_t z_acl_inherit; /* acl inheritance behavior */ + zfs_case_t z_case; /* case-sense */ + boolean_t z_utf8; /* utf8-only */ + int z_norm; /* normalization flags */ + boolean_t z_atime; /* enable atimes mount option */ + boolean_t z_unmounted; /* unmounted */ + rrmlock_t z_teardown_lock; + krwlock_t z_teardown_inactive_lock; + list_t z_all_znodes; /* all vnodes in the fs */ + uint64_t z_nr_znodes; /* number of znodes in the fs */ + kmutex_t z_znodes_lock; /* lock for z_all_znodes */ + struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ + boolean_t z_show_ctldir; /* expose .zfs in the root dir */ + boolean_t z_issnap; /* true if this is a snapshot */ + boolean_t z_vscan; /* virus scan on/off */ + boolean_t z_use_fuids; /* version allows fuids */ + boolean_t z_replay; /* set during ZIL replay */ + boolean_t z_use_sa; /* version allow system attributes */ + boolean_t z_xattr_sa; /* allow xattrs to be stores as SA */ + boolean_t z_use_namecache; /* make use of FreeBSD name cache */ + uint8_t z_xattr; /* xattr type in use */ + uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + dataset_kstats_t z_kstat; /* fs kstats */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; + uint64_t z_userobjquota_obj; + uint64_t z_groupobjquota_obj; + uint64_t z_projectquota_obj; + uint64_t z_projectobjquota_obj; + uint64_t z_replay_eof; /* New end of file - replay only */ + sa_attr_type_t *z_attr_table; /* SA attr mapping->id */ +#define ZFS_OBJ_MTX_SZ 64 + kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ + struct task z_unlinked_drain_task; +}; + +#define ZSB_XATTR 0x0001 /* Enable user xattrs */ +/* + * Normal filesystems (those not under .zfs/snapshot) have a total + * file ID size limited to 12 bytes (including the length field) due to + * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical + * reasons, this same limit is being imposed by the Solaris NFSv3 implementation + * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It + * is not possible to expand beyond 12 bytes without abandoning support + * of NFSv2. 
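The file ID layout described in this comment packs the 48-bit object number one byte at a time, per the obj[i] = obj >> (8 * i) convention annotated on the zfid structures just below. A worked sketch of the encoding (the concrete object number is illustrative):

    zfid_short_t zfid;
    uint64_t obj = 0xAABBCCDDEEFFULL;   /* any value that fits in 48 bits */
    int i;

    zfid.zf_len = SHORT_FID_LEN;
    for (i = 0; i < (int)sizeof (zfid.zf_object); i++)
        zfid.zf_object[i] = obj >> (8 * i);
    /* zf_object[] now holds FF EE DD CC BB AA: lowest byte first, */
    /* independent of host byte order */

The generation number is packed the same way into zf_gen, so decoding is the mirror-image shift-and-OR loop.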
+ * + * For normal filesystems, we partition up the available space as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * + * We reserve only 48 bits for the object number, as this is the limit + * currently defined and imposed by the DMU. + */ +typedef struct zfid_short { + uint16_t zf_len; + uint8_t zf_object[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_gen[4]; /* gen[i] = gen >> (8 * i) */ +} zfid_short_t; + +/* + * Filesystems under .zfs/snapshot have a total file ID size of 22[*] bytes + * (including the length field). This makes files under .zfs/snapshot + * accessible by NFSv3 and NFSv4, but not NFSv2. + * + * For files under .zfs/snapshot, we partition up the available space + * as follows: + * 2 bytes fid length (required) + * 6 bytes object number (48 bits) + * 4 bytes generation number (32 bits) + * 6 bytes objset id (48 bits) + * 4 bytes[**] currently just zero (32 bits) + * + * We reserve only 48 bits for the object number and objset id, as these are + * the limits currently defined and imposed by the DMU. + * + * [*] 20 bytes on FreeBSD to fit into the size of struct fid. + * [**] 2 bytes on FreeBSD for the above reason. + */ +typedef struct zfid_long { + zfid_short_t z_fid; + uint8_t zf_setid[6]; /* obj[i] = obj >> (8 * i) */ + uint8_t zf_setgen[2]; /* gen[i] = gen >> (8 * i) */ +} zfid_long_t; + +#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t)) +#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) + +extern uint_t zfs_fsyncer_key; +extern int zfs_super_owner; + +extern void zfs_init(void); +extern void zfs_fini(void); + +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp); +extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); +extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs); +extern int zfs_get_temporary_prop(struct dsl_dataset *ds, zfs_prop_t zfs_prop, + uint64_t *val, char *setpoint); +extern int zfs_busy(void); +extern void zfsvfs_update_fromname(const char *oldname, const char *newname); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VFSOPS_H */ diff --git a/include/os/freebsd/zfs/sys/zfs_vnops.h b/include/os/freebsd/zfs/sys/zfs_vnops.h new file mode 100644 index 000000000000..6237372b905f --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_vnops.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SYS_ZFS_VNOPS_H_ +#define _SYS_ZFS_VNOPS_H_ +int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, struct vm_page **ppa, dmu_tx_t *tx); +int dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size); +extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, + char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap, + const char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_link(znode_t *tdzp, znode_t *sp, + char *name, cred_t *cr, int flags); +extern int zfs_space(znode_t *zp, int cmd, struct flock *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); + +#endif diff --git a/include/os/freebsd/zfs/sys/zfs_znode_impl.h b/include/os/freebsd/zfs/sys/zfs_znode_impl.h new file mode 100644 index 000000000000..b1fe4aa86816 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,183 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. 
All rights reserved. + */ + +#ifndef _FREEBSD_ZFS_SYS_ZNODE_IMPL_H +#define _FREEBSD_ZFS_SYS_ZNODE_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +#define ZNODE_OS_FIELDS \ + struct zfsvfs *z_zfsvfs; \ + vnode_t *z_vnode; \ + uint64_t z_uid; \ + uint64_t z_gid; \ + uint64_t z_gen; \ + uint64_t z_atime[2]; \ + uint64_t z_links; + +#define ZFS_LINK_MAX UINT64_MAX + +/* + * ZFS minor numbers can refer to either a control device instance or + * a zvol. Depending on the value of zss_type, zss_data points to either + * a zvol_state_t or a zfs_onexit_t. + */ +enum zfs_soft_state_type { + ZSST_ZVOL, + ZSST_CTLDEV +}; + +typedef struct zfs_soft_state { + enum zfs_soft_state_type zss_type; + void *zss_data; +} zfs_soft_state_t; + +extern minor_t zfsdev_minor_alloc(void); + +/* + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed etc and zp_size reset. zp_size must be set within range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. 
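Rule 1 above in code form, as a hedged sketch against the shared rangelock interface (zfs_rangelock_enter(), zfs_rangelock_exit(), and the zp->z_rangelock field are assumed from the common OpenZFS code, not defined in this header; the page-freeing work is elided):

    zfs_locked_range_t *lr;

    /* truncate: lock the whole file before touching pages or z_size */
    lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
    /* ... free/invalidate pages, then reset zp->z_size under the lock ... */
    zfs_rangelock_exit(lr);

Rules 2 and 3 differ only in the offset/length pair and in taking RL_WRITER or RL_READER for the exact range being written or read.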
+ */ + +/* + * Convert between znode pointers and vnode pointers + */ +#define ZTOV(ZP) ((ZP)->z_vnode) +#define ZTOI(ZP) ((ZP)->z_vnode) +#define VTOZ(VP) ((struct znode *)(VP)->v_data) +#define VTOZ_SMR(VP) ((znode_t *)vn_load_v_data_smr(VP)) +#define ITOZ(VP) ((struct znode *)(VP)->v_data) +#define zhold(zp) vhold(ZTOV((zp))) +#define zrele(zp) vrele(ZTOV((zp))) + +#define ZTOZSB(zp) ((zp)->z_zfsvfs) +#define ITOZSB(vp) (VTOZ(vp)->z_zfsvfs) +#define ZTOTYPE(zp) (ZTOV(zp)->v_type) +#define ZTOGID(zp) ((zp)->z_gid) +#define ZTOUID(zp) ((zp)->z_uid) +#define ZTONLNK(zp) ((zp)->z_links) +#define Z_ISBLK(type) ((type) == VBLK) +#define Z_ISCHR(type) ((type) == VCHR) +#define Z_ISLNK(type) ((type) == VLNK) + + +/* Called on entry to each ZFS vnode and vfs operation */ +#define ZFS_ENTER(zfsvfs) \ + { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (EIO); \ + } \ + } + +/* Must be called before exiting the vop */ +#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG) + +/* Verifies the znode is valid */ +#define ZFS_VERIFY_ZP(zp) \ + if ((zp)->z_sa_hdl == NULL) { \ + ZFS_EXIT((zp)->z_zfsvfs); \ + return (EIO); \ + } \ + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_HASH(obj_num) ((obj_num) & (ZFS_OBJ_MTX_SZ - 1)) +#define ZFS_OBJ_MUTEX(zfsvfs, obj_num) \ + (&(zfsvfs)->z_hold_mtx[ZFS_OBJ_HASH(obj_num)]) +#define ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num) \ + mutex_enter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_TRYENTER(zfsvfs, obj_num) \ + mutex_tryenter(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) +#define ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num) \ + mutex_exit(ZFS_OBJ_MUTEX((zfsvfs), (obj_num))) + +/* Encode ZFS stored time values from a struct timespec */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +{ \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} + +/* Decode ZFS stored time values to a struct timespec */ +#define ZFS_TIME_DECODE(tp, stmp) \ +{ \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} +#define ZFS_ACCESSTIME_STAMP(zfsvfs, zp) \ + if ((zfsvfs)->z_atime && !((zfsvfs)->z_vfs->vfs_flag & VFS_RDONLY)) \ + zfs_tstamp_update_setup_ext(zp, ACCESSED, NULL, NULL, B_FALSE); + +extern void zfs_tstamp_update_setup_ext(struct znode *, + uint_t, uint64_t [2], uint64_t [2], boolean_t have_tx); +extern void zfs_znode_free(struct znode *); + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +extern int zfs_znode_parent_and_name(struct znode *zp, struct znode **dzpp, + char *buf); + +#ifdef __cplusplus +} +#endif + +#endif /* _FREEBSD_SYS_FS_ZFS_ZNODE_H */ diff --git a/include/os/freebsd/zfs/sys/zpl.h b/include/os/freebsd/zfs/sys/zpl.h new file mode 100644 index 000000000000..fb2b4e02d441 --- /dev/null +++ b/include/os/freebsd/zfs/sys/zpl.h @@ -0,0 +1 @@ +/* Don't remove */ diff --git a/include/os/linux/Makefile.am b/include/os/linux/Makefile.am new file mode 100644 index 000000000000..605a1fcb7506 --- /dev/null +++ b/include/os/linux/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = kernel spl zfs diff --git a/include/os/linux/kernel/Makefile.am b/include/os/linux/kernel/Makefile.am new file mode 100644 index 000000000000..08b2f5fc5c99 --- /dev/null +++ b/include/os/linux/kernel/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = linux diff --git a/include/os/linux/kernel/linux/Makefile.am b/include/os/linux/kernel/linux/Makefile.am new file mode 100644 index 000000000000..6ff0df506d9c --- 
/dev/null +++ b/include/os/linux/kernel/linux/Makefile.am @@ -0,0 +1,22 @@ +KERNEL_H = \ + dcache_compat.h \ + xattr_compat.h \ + vfs_compat.h \ + blkdev_compat.h \ + utsname_compat.h \ + kmap_compat.h \ + percpu_compat.h \ + simd.h \ + simd_x86.h \ + simd_aarch64.h \ + simd_powerpc.h \ + mod_compat.h \ + page_compat.h \ + compiler_compat.h + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/linux +kernel_HEADERS = $(KERNEL_H) +endif +endif diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h new file mode 100644 index 000000000000..1cdc300a6f85 --- /dev/null +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -0,0 +1,506 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * LLNL-CODE-403049. + */ + +#ifndef _ZFS_BLKDEV_H +#define _ZFS_BLKDEV_H + +#include +#include +#include +#include +#include /* for SECTOR_* */ + +#ifndef HAVE_BLK_QUEUE_FLAG_SET +static inline void +blk_queue_flag_set(unsigned int flag, struct request_queue *q) +{ + queue_flag_set(flag, q); +} +#endif + +#ifndef HAVE_BLK_QUEUE_FLAG_CLEAR +static inline void +blk_queue_flag_clear(unsigned int flag, struct request_queue *q) +{ + queue_flag_clear(flag, q); +} +#endif + +/* + * 4.7 - 4.x API, + * The blk_queue_write_cache() interface has replaced blk_queue_flush() + * interface. However, the new interface is GPL-only thus we implement + * our own trivial wrapper when the GPL-only version is detected. + * + * 2.6.36 - 4.6 API, + * The blk_queue_flush() interface has replaced blk_queue_ordered() + * interface. However, while the old interface was available to all the + * new one is GPL-only. Thus if the GPL-only version is detected we + * implement our own trivial helper. + */ +static inline void +blk_queue_set_write_cache(struct request_queue *q, bool wc, bool fua) +{ +#if defined(HAVE_BLK_QUEUE_WRITE_CACHE_GPL_ONLY) + if (wc) + blk_queue_flag_set(QUEUE_FLAG_WC, q); + else + blk_queue_flag_clear(QUEUE_FLAG_WC, q); + if (fua) + blk_queue_flag_set(QUEUE_FLAG_FUA, q); + else + blk_queue_flag_clear(QUEUE_FLAG_FUA, q); +#elif defined(HAVE_BLK_QUEUE_WRITE_CACHE) + blk_queue_write_cache(q, wc, fua); +#elif defined(HAVE_BLK_QUEUE_FLUSH_GPL_ONLY) + if (wc) + q->flush_flags |= REQ_FLUSH; + if (fua) + q->flush_flags |= REQ_FUA; +#elif defined(HAVE_BLK_QUEUE_FLUSH) + blk_queue_flush(q, (wc ? REQ_FLUSH : 0) | (fua ? 
REQ_FUA : 0)); +#else +#error "Unsupported kernel" +#endif +} + +static inline void +blk_queue_set_read_ahead(struct request_queue *q, unsigned long ra_pages) +{ +#ifdef HAVE_BLK_QUEUE_BDI_DYNAMIC + q->backing_dev_info->ra_pages = ra_pages; +#else + q->backing_dev_info.ra_pages = ra_pages; +#endif +} + +#if !defined(HAVE_GET_DISK_AND_MODULE) +static inline struct kobject * +get_disk_and_module(struct gendisk *disk) +{ + return (get_disk(disk)); +} +#endif + +#ifdef HAVE_BIO_BVEC_ITER +#define BIO_BI_SECTOR(bio) (bio)->bi_iter.bi_sector +#define BIO_BI_SIZE(bio) (bio)->bi_iter.bi_size +#define BIO_BI_IDX(bio) (bio)->bi_iter.bi_idx +#define BIO_BI_SKIP(bio) (bio)->bi_iter.bi_bvec_done +#define bio_for_each_segment4(bv, bvp, b, i) \ + bio_for_each_segment((bv), (b), (i)) +typedef struct bvec_iter bvec_iterator_t; +#else +#define BIO_BI_SECTOR(bio) (bio)->bi_sector +#define BIO_BI_SIZE(bio) (bio)->bi_size +#define BIO_BI_IDX(bio) (bio)->bi_idx +#define BIO_BI_SKIP(bio) (0) +#define bio_for_each_segment4(bv, bvp, b, i) \ + bio_for_each_segment((bvp), (b), (i)) +typedef int bvec_iterator_t; +#endif + +static inline void +bio_set_flags_failfast(struct block_device *bdev, int *flags) +{ +#ifdef CONFIG_BUG + /* + * Disable FAILFAST for loopback devices because of the + * following incorrect BUG_ON() in loop_make_request(). + * This support is also disabled for md devices because the + * test suite layers md devices on top of loopback devices. + * This may be removed when the loopback driver is fixed. + * + * BUG_ON(!lo || (rw != READ && rw != WRITE)); + */ + if ((MAJOR(bdev->bd_dev) == LOOP_MAJOR) || + (MAJOR(bdev->bd_dev) == MD_MAJOR)) + return; + +#ifdef BLOCK_EXT_MAJOR + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + return; +#endif /* BLOCK_EXT_MAJOR */ +#endif /* CONFIG_BUG */ + + *flags |= REQ_FAILFAST_MASK; +} + +/* + * Maximum disk label length, it may be undefined for some kernels. + */ +#if !defined(DISK_NAME_LEN) +#define DISK_NAME_LEN 32 +#endif /* DISK_NAME_LEN */ + +#ifdef HAVE_BIO_BI_STATUS +static inline int +bi_status_to_errno(blk_status_t status) +{ + switch (status) { + case BLK_STS_OK: + return (0); + case BLK_STS_NOTSUPP: + return (EOPNOTSUPP); + case BLK_STS_TIMEOUT: + return (ETIMEDOUT); + case BLK_STS_NOSPC: + return (ENOSPC); + case BLK_STS_TRANSPORT: + return (ENOLINK); + case BLK_STS_TARGET: + return (EREMOTEIO); + case BLK_STS_NEXUS: + return (EBADE); + case BLK_STS_MEDIUM: + return (ENODATA); + case BLK_STS_PROTECTION: + return (EILSEQ); + case BLK_STS_RESOURCE: + return (ENOMEM); + case BLK_STS_AGAIN: + return (EAGAIN); + case BLK_STS_IOERR: + return (EIO); + default: + return (EIO); + } +} + +static inline blk_status_t +errno_to_bi_status(int error) +{ + switch (error) { + case 0: + return (BLK_STS_OK); + case EOPNOTSUPP: + return (BLK_STS_NOTSUPP); + case ETIMEDOUT: + return (BLK_STS_TIMEOUT); + case ENOSPC: + return (BLK_STS_NOSPC); + case ENOLINK: + return (BLK_STS_TRANSPORT); + case EREMOTEIO: + return (BLK_STS_TARGET); + case EBADE: + return (BLK_STS_NEXUS); + case ENODATA: + return (BLK_STS_MEDIUM); + case EILSEQ: + return (BLK_STS_PROTECTION); + case ENOMEM: + return (BLK_STS_RESOURCE); + case EAGAIN: + return (BLK_STS_AGAIN); + case EIO: + return (BLK_STS_IOERR); + default: + return (BLK_STS_IOERR); + } +} +#endif /* HAVE_BIO_BI_STATUS */ + +/* + * 4.3 API change + * The bio_endio() prototype changed slightly. These are helper + * macro's to ensure the prototype and invocation are handled. 
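The BIO_END_IO_PROTO()/BIO_END_IO_ERROR() wrappers defined just below are typically consumed as a completion callback, in the style of the vdev_disk code. A sketch; the zio bookkeeping via bi_private is illustrative rather than required:

    BIO_END_IO_PROTO(example_bio_done, bio, error)
    {
        zio_t *zio = bio->bi_private;   /* assumed per-bio state */

    #ifdef HAVE_1ARG_BIO_END_IO_T
        /* modern kernels: error lives in the bio itself */
        zio->io_error = BIO_END_IO_ERROR(bio);
    #else
        /* older kernels: a negative errno arrives as the second argument */
        zio->io_error = -error;
    #endif
        /* ... hand the zio back to the pipeline ... */
    }

The macro pair hides the 4.3-era prototype change, so the same callback body compiles against both kernel generations.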
+ */ +#ifdef HAVE_1ARG_BIO_END_IO_T +#ifdef HAVE_BIO_BI_STATUS +#define BIO_END_IO_ERROR(bio) bi_status_to_errno(bio->bi_status) +#define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x) +#define BIO_END_IO(bio, error) bio_set_bi_status(bio, error) +static inline void +bio_set_bi_status(struct bio *bio, int error) +{ + ASSERT3S(error, <=, 0); + bio->bi_status = errno_to_bi_status(-error); + bio_endio(bio); +} +#else +#define BIO_END_IO_ERROR(bio) (-(bio->bi_error)) +#define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x) +#define BIO_END_IO(bio, error) bio_set_bi_error(bio, error) +static inline void +bio_set_bi_error(struct bio *bio, int error) +{ + ASSERT3S(error, <=, 0); + bio->bi_error = error; + bio_endio(bio); +} +#endif /* HAVE_BIO_BI_STATUS */ + +#else +#define BIO_END_IO_PROTO(fn, x, z) static void fn(struct bio *x, int z) +#define BIO_END_IO(bio, error) bio_endio(bio, error); +#endif /* HAVE_1ARG_BIO_END_IO_T */ + +/* + * 4.1 - x.y.z API, + * 3.10.0 CentOS 7.x API, + * blkdev_reread_part() + * + * For older kernels trigger a re-reading of the partition table by calling + * check_disk_change() which calls flush_disk() to invalidate the device. + */ +#ifdef HAVE_BLKDEV_REREAD_PART +#define vdev_bdev_reread_part(bdev) blkdev_reread_part(bdev) +#else +#define vdev_bdev_reread_part(bdev) check_disk_change(bdev) +#endif /* HAVE_BLKDEV_REREAD_PART */ + +/* + * 2.6.27 API change + * The function was exported for use, prior to this it existed but the + * symbol was not exported. + * + * 4.4.0-6.21 API change for Ubuntu + * lookup_bdev() gained a second argument, FMODE_*, to check inode permissions. + */ +#ifdef HAVE_1ARG_LOOKUP_BDEV +#define vdev_lookup_bdev(path) lookup_bdev(path) +#else +#ifdef HAVE_2ARGS_LOOKUP_BDEV +#define vdev_lookup_bdev(path) lookup_bdev(path, 0) +#else +#error "Unsupported kernel" +#endif /* HAVE_2ARGS_LOOKUP_BDEV */ +#endif /* HAVE_1ARG_LOOKUP_BDEV */ + +/* + * Kernels without bio_set_op_attrs use bi_rw for the bio flags. + */ +#if !defined(HAVE_BIO_SET_OP_ATTRS) +static inline void +bio_set_op_attrs(struct bio *bio, unsigned rw, unsigned flags) +{ + bio->bi_rw |= rw | flags; +} +#endif + +/* + * bio_set_flush - Set the appropriate flags in a bio to guarantee + * data are on non-volatile media on completion. + * + * 2.6.37 - 4.8 API, + * Introduce WRITE_FLUSH, WRITE_FUA, and WRITE_FLUSH_FUA flags as a + * replacement for WRITE_BARRIER to allow expressing richer semantics + * to the block layer. It's up to the block layer to implement the + * semantics correctly. Use the WRITE_FLUSH_FUA flag combination. + * + * 4.8 - 4.9 API, + * REQ_FLUSH was renamed to REQ_PREFLUSH. For consistency with previous + * ZoL releases, prefer the WRITE_FLUSH_FUA flag set if it's available. + * + * 4.10 API, + * The read/write flags and their modifiers, including WRITE_FLUSH, + * WRITE_FUA and WRITE_FLUSH_FUA were removed from fs.h in + * torvalds/linux@70fd7614 and replaced by direct flag modification + * of the REQ_ flags in bio->bi_opf. Use REQ_PREFLUSH. + */ +static inline void +bio_set_flush(struct bio *bio) +{ +#if defined(HAVE_REQ_PREFLUSH) /* >= 4.10 */ + bio_set_op_attrs(bio, 0, REQ_PREFLUSH); +#elif defined(WRITE_FLUSH_FUA) /* >= 2.6.37 and <= 4.9 */ + bio_set_op_attrs(bio, 0, WRITE_FLUSH_FUA); +#else +#error "Allowing the build will cause bio_set_flush requests to be ignored." 
+#endif +} + +/* + * 4.8 - 4.x API, + * REQ_OP_FLUSH + * + * 4.8-rc0 - 4.8-rc1, + * REQ_PREFLUSH + * + * 2.6.36 - 4.7 API, + * REQ_FLUSH + * + * in all cases but may have a performance impact for some kernels. It + * has the advantage of minimizing kernel specific changes in the zvol code. + * + */ +static inline boolean_t +bio_is_flush(struct bio *bio) +{ +#if defined(HAVE_REQ_OP_FLUSH) && defined(HAVE_BIO_BI_OPF) + return ((bio_op(bio) == REQ_OP_FLUSH) || (bio->bi_opf & REQ_PREFLUSH)); +#elif defined(HAVE_REQ_PREFLUSH) && defined(HAVE_BIO_BI_OPF) + return (bio->bi_opf & REQ_PREFLUSH); +#elif defined(HAVE_REQ_PREFLUSH) && !defined(HAVE_BIO_BI_OPF) + return (bio->bi_rw & REQ_PREFLUSH); +#elif defined(HAVE_REQ_FLUSH) + return (bio->bi_rw & REQ_FLUSH); +#else +#error "Unsupported kernel" +#endif +} + +/* + * 4.8 - 4.x API, + * REQ_FUA flag moved to bio->bi_opf + * + * 2.6.x - 4.7 API, + * REQ_FUA + */ +static inline boolean_t +bio_is_fua(struct bio *bio) +{ +#if defined(HAVE_BIO_BI_OPF) + return (bio->bi_opf & REQ_FUA); +#elif defined(REQ_FUA) + return (bio->bi_rw & REQ_FUA); +#else +#error "Allowing the build will cause fua requests to be ignored." +#endif +} + +/* + * 4.8 - 4.x API, + * REQ_OP_DISCARD + * + * 2.6.36 - 4.7 API, + * REQ_DISCARD + * + * In all cases the normal I/O path is used for discards. The only + * difference is how the kernel tags individual I/Os as discards. + */ +static inline boolean_t +bio_is_discard(struct bio *bio) +{ +#if defined(HAVE_REQ_OP_DISCARD) + return (bio_op(bio) == REQ_OP_DISCARD); +#elif defined(HAVE_REQ_DISCARD) + return (bio->bi_rw & REQ_DISCARD); +#else +#error "Unsupported kernel" +#endif +} + +/* + * 4.8 - 4.x API, + * REQ_OP_SECURE_ERASE + * + * 2.6.36 - 4.7 API, + * REQ_SECURE + */ +static inline boolean_t +bio_is_secure_erase(struct bio *bio) +{ +#if defined(HAVE_REQ_OP_SECURE_ERASE) + return (bio_op(bio) == REQ_OP_SECURE_ERASE); +#elif defined(REQ_SECURE) + return (bio->bi_rw & REQ_SECURE); +#else + return (0); +#endif +} + +/* + * 2.6.33 API change + * Discard granularity and alignment restrictions may now be set. For + * older kernels which do not support this it is safe to skip it. + */ +static inline void +blk_queue_discard_granularity(struct request_queue *q, unsigned int dg) +{ + q->limits.discard_granularity = dg; +} + +/* + * 4.8 - 4.x API, + * blk_queue_secure_erase() + * + * 2.6.36 - 4.7 API, + * blk_queue_secdiscard() + */ +static inline int +blk_queue_discard_secure(struct request_queue *q) +{ +#if defined(HAVE_BLK_QUEUE_SECURE_ERASE) + return (blk_queue_secure_erase(q)); +#elif defined(HAVE_BLK_QUEUE_SECDISCARD) + return (blk_queue_secdiscard(q)); +#else + return (0); +#endif +} + +/* + * A common holder for vdev_bdev_open() is used to relax the exclusive open + * semantics slightly. Internal vdev disk callers may pass VDEV_HOLDER to + * allow them to open the device multiple times. Other kernel callers and + * user space processes which don't pass this value will get EBUSY. This is + * currently required for the correct operation of hot spares. 
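Taken together, the bio_is_*() predicates above let a request function classify traffic without kernel-version ifdefs at each call site. A condensed sketch of the usual zvol-style dispatch (the handling in each branch is elided):

    if (bio_is_flush(bio) && BIO_BI_SIZE(bio) == 0) {
        /* empty preflush: flush the cache/log, then complete the bio */
    } else if (bio_is_discard(bio) || bio_is_secure_erase(bio)) {
        /* punch a hole over the range at BIO_BI_SECTOR(bio) */
    } else {
        /* regular read/write; bio_is_fua(bio) requests stable storage */
    }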
+ */ +#define VDEV_HOLDER ((void *)0x2401de7) + +static inline void +blk_generic_start_io_acct(struct request_queue *q, int rw, + unsigned long sectors, struct hd_struct *part) +{ +#if defined(HAVE_GENERIC_IO_ACCT_3ARG) + generic_start_io_acct(rw, sectors, part); +#elif defined(HAVE_GENERIC_IO_ACCT_4ARG) + generic_start_io_acct(q, rw, sectors, part); +#endif +} + +static inline void +blk_generic_end_io_acct(struct request_queue *q, int rw, + struct hd_struct *part, unsigned long start_time) +{ +#if defined(HAVE_GENERIC_IO_ACCT_3ARG) + generic_end_io_acct(rw, part, start_time); +#elif defined(HAVE_GENERIC_IO_ACCT_4ARG) + generic_end_io_acct(q, rw, part, start_time); +#endif +} + +#ifndef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +static inline struct request_queue * +blk_generic_alloc_queue(make_request_fn make_request, int node_id) +{ +#if defined(HAVE_BLK_ALLOC_QUEUE_REQUEST_FN) + return (blk_alloc_queue(make_request, node_id)); +#else + struct request_queue *q = blk_alloc_queue(GFP_KERNEL); + if (q != NULL) + blk_queue_make_request(q, make_request); + + return (q); +#endif +} +#endif /* !HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS */ + +#endif /* _ZFS_BLKDEV_H */ diff --git a/include/os/linux/kernel/linux/compiler_compat.h b/include/os/linux/kernel/linux/compiler_compat.h new file mode 100644 index 000000000000..921d32f246c5 --- /dev/null +++ b/include/os/linux/kernel/linux/compiler_compat.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2018 Lawrence Livermore National Security, LLC. + */ + +#ifndef _ZFS_COMPILER_COMPAT_H +#define _ZFS_COMPILER_COMPAT_H + +#include + +#if !defined(READ_ONCE) +#define READ_ONCE(x) ACCESS_ONCE(x) +#endif + +#endif /* _ZFS_COMPILER_COMPAT_H */ diff --git a/include/os/linux/kernel/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h new file mode 100644 index 000000000000..d0588a82e9ad --- /dev/null +++ b/include/os/linux/kernel/linux/dcache_compat.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + */ + +#ifndef _ZFS_DCACHE_H +#define _ZFS_DCACHE_H + +#include + +#define dname(dentry) ((char *)((dentry)->d_name.name)) +#define dlen(dentry) ((int)((dentry)->d_name.len)) + +#ifndef HAVE_D_MAKE_ROOT +#define d_make_root(inode) d_alloc_root(inode) +#endif /* HAVE_D_MAKE_ROOT */ + +/* + * 2.6.30 API change, + * The const keyword was added to the 'struct dentry_operations' in + * the dentry structure. To handle this we define an appropriate + * dentry_operations_t typedef which can be used. + */ +typedef const struct dentry_operations dentry_operations_t; + +/* + * 2.6.38 API addition, + * Added d_clear_d_op() helper function which clears some flags and the + * registered dentry->d_op table. This is required because d_set_d_op() + * issues a warning when the dentry operations table is already set. + * For the .zfs control directory to work properly we must be able to + * override the default operations table and register custom .d_automount + * and .d_revalidate callbacks. + */ +static inline void +d_clear_d_op(struct dentry *dentry) +{ + dentry->d_op = NULL; + dentry->d_flags &= ~( + DCACHE_OP_HASH | DCACHE_OP_COMPARE | + DCACHE_OP_REVALIDATE | DCACHE_OP_DELETE); +} + +#endif /* _ZFS_DCACHE_H */ diff --git a/include/os/linux/kernel/linux/kmap_compat.h b/include/os/linux/kernel/linux/kmap_compat.h new file mode 100644 index 000000000000..a7e63944ea16 --- /dev/null +++ b/include/os/linux/kernel/linux/kmap_compat.h @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + */ + +#ifndef _ZFS_KMAP_H +#define _ZFS_KMAP_H + +#include +#include + +/* 2.6.37 API change */ +#define zfs_kmap_atomic(page, km_type) kmap_atomic(page) +#define zfs_kunmap_atomic(addr, km_type) kunmap_atomic(addr) + +/* 5.0 API change - no more 'type' argument for access_ok() */ +#ifdef HAVE_ACCESS_OK_TYPE +#define zfs_access_ok(type, addr, size) access_ok(type, addr, size) +#else +#define zfs_access_ok(type, addr, size) access_ok(addr, size) +#endif + +#endif /* _ZFS_KMAP_H */ diff --git a/include/os/linux/kernel/linux/mod_compat.h b/include/os/linux/kernel/linux/mod_compat.h new file mode 100644 index 000000000000..4b83fe413334 --- /dev/null +++ b/include/os/linux/kernel/linux/mod_compat.h @@ -0,0 +1,151 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Neskovic .
+ */
+
+#ifndef _MOD_COMPAT_H
+#define _MOD_COMPAT_H
+
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+
+/* Grsecurity kernel API change */
+#ifdef MODULE_PARAM_CALL_CONST
+typedef const struct kernel_param zfs_kernel_param_t;
+#else
+typedef struct kernel_param zfs_kernel_param_t;
+#endif
+
+#define ZMOD_RW 0644
+#define ZMOD_RD 0444
+
+/* BEGIN CSTYLED */
+#define INT int
+#define UINT uint
+#define ULONG ulong
+#define LONG long
+#define STRING charp
+/* END CSTYLED */
+
+enum scope_prefix_types {
+	zfs,
+	zfs_arc,
+	zfs_condense,
+	zfs_dbuf,
+	zfs_dbuf_cache,
+	zfs_deadman,
+	zfs_dedup,
+	zfs_l2arc,
+	zfs_livelist,
+	zfs_livelist_condense,
+	zfs_lua,
+	zfs_metaslab,
+	zfs_mg,
+	zfs_multihost,
+	zfs_prefetch,
+	zfs_reconstruct,
+	zfs_recv,
+	zfs_send,
+	zfs_spa,
+	zfs_trim,
+	zfs_txg,
+	zfs_vdev,
+	zfs_vdev_cache,
+	zfs_vdev_mirror,
+	zfs_zevent,
+	zfs_zio,
+	zfs_zil
+};
+
+/*
+ * Declare a module parameter / sysctl node
+ *
+ * "scope_prefix" the part of the sysctl / sysfs tree the node resides under
+ *   (currently a no-op on Linux)
+ * "name_prefix" the part of the variable name that will be excluded from the
+ *   exported names on platforms with a hierarchical namespace
+ * "name" the part of the variable that will be exposed on platforms with a
+ *   hierarchical namespace, or as name_prefix ## name on Linux
+ * "type" the variable type
+ * "perm" the permissions (read/write or read only)
+ * "desc" a brief description of the option
+ *
+ * Examples:
+ * ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, UINT,
+ *	ZMOD_RW, "Rotating media load increment for non-seeking I/O's");
+ * on FreeBSD:
+ *   vfs.zfs.vdev.mirror.rotating_inc
+ * on Linux:
+ *   zfs_vdev_mirror_rotating_inc
+ *
+ * ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, UINT, ZMOD_RW,
+ *	"Limit one prefetch call to this size");
+ * on FreeBSD:
+ *   vfs.zfs.dmu_prefetch_max
+ * on Linux:
+ *   dmu_prefetch_max
+ */
+/* BEGIN CSTYLED */
+#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) \
+    CTASSERT_GLOBAL((sizeof (scope_prefix) == sizeof (enum scope_prefix_types))); \
+    module_param(name_prefix ## name, type, perm); \
+    MODULE_PARM_DESC(name_prefix ## name, desc)
+/* END CSTYLED */
+
+/*
+ * Declare a module parameter / sysctl node
+ *
+ * "scope_prefix" the part of the sysctl / sysfs tree the node resides under
+ *   (currently a no-op on Linux)
+ * "name_prefix" the part of the variable name that will be excluded from the
+ *   exported names on platforms with a hierarchical namespace
+ * "name" the part of the variable that will be exposed on platforms with a
+ *   hierarchical namespace, or as name_prefix ## name on Linux
+ * "setfunc" setter function
+ * "getfunc" getter function
+ * "perm" the permissions (read/write or read only)
+ * "desc" a brief description of the option
+ *
+ * Examples:
+ * ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift,
+ *	param_get_int, ZMOD_RW, "Reserved free space in pool");
+ * on FreeBSD:
+ *   vfs.zfs.spa_slop_shift
+ * on Linux:
+ *   spa_slop_shift
+ */
+/* BEGIN CSTYLED */
+#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, getfunc, perm, desc) \
+    CTASSERT_GLOBAL((sizeof (scope_prefix) == sizeof (enum scope_prefix_types))); \
+    module_param_call(name_prefix ## name, setfunc, getfunc, &name_prefix ## name, perm); \
+    MODULE_PARM_DESC(name_prefix ## name, desc)
+/* END CSTYLED */
+
+#define ZFS_MODULE_PARAM_ARGS	const char *buf, zfs_kernel_param_t *kp
+
+#define ZFS_MODULE_DESCRIPTION(s)	MODULE_DESCRIPTION(s)
+#define ZFS_MODULE_AUTHOR(s)		MODULE_AUTHOR(s)
+#define ZFS_MODULE_LICENSE(s)		MODULE_LICENSE(s)
+#define ZFS_MODULE_VERSION(s)		MODULE_VERSION(s)
+
+#endif /* _MOD_COMPAT_H */
diff --git a/include/os/linux/kernel/linux/page_compat.h b/include/os/linux/kernel/linux/page_compat.h
new file mode 100644
index 000000000000..8ad04f9bbfc6
--- /dev/null
+++ b/include/os/linux/kernel/linux/page_compat.h
@@ -0,0 +1,92 @@
+#ifndef _ZFS_PAGE_COMPAT_H
+#define _ZFS_PAGE_COMPAT_H
+
+/*
+ * We have various enum members moving between two separate enum types,
+ * and accessed by different functions at various times.  Centralise the
+ * insanity.
+ *
+ * < v4.8: all enums in zone_stat_item, via global_page_state()
+ * v4.8: some enums moved to node_stat_item, global_node_page_state() introduced
+ * v4.13: some enums moved from zone_stat_item to node_stat_item
+ * v4.14: global_page_state() renamed to global_zone_page_state()
+ *
+ * The defines used here are created by config/kernel-global_page_state.m4
+ */
+
+/*
+ * Create our own accessor functions to follow the Linux API changes
+ */
+#if defined(ZFS_GLOBAL_ZONE_PAGE_STATE)
+
+/* global_zone_page_state() introduced */
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES)
+#define nr_file_pages() global_node_page_state(NR_FILE_PAGES)
+#else
+#define nr_file_pages() global_zone_page_state(NR_FILE_PAGES)
+#endif
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON)
+#define nr_inactive_anon_pages() global_node_page_state(NR_INACTIVE_ANON)
+#else
+#define nr_inactive_anon_pages() global_zone_page_state(NR_INACTIVE_ANON)
+#endif
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE)
+#define nr_inactive_file_pages() global_node_page_state(NR_INACTIVE_FILE)
+#else
+#define nr_inactive_file_pages() global_zone_page_state(NR_INACTIVE_FILE)
+#endif
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B)
+#define nr_slab_reclaimable_pages() \
+	global_node_page_state(NR_SLAB_RECLAIMABLE_B)
+#else
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE)
+#define nr_slab_reclaimable_pages() global_node_page_state(NR_SLAB_RECLAIMABLE)
+#else
+#define nr_slab_reclaimable_pages() global_zone_page_state(NR_SLAB_RECLAIMABLE)
+#endif
+#endif
+
+#elif defined(ZFS_GLOBAL_NODE_PAGE_STATE)
+
+/* global_node_page_state() introduced */
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_FILE_PAGES)
+#define nr_file_pages() global_node_page_state(NR_FILE_PAGES)
+#else
+#define nr_file_pages() global_page_state(NR_FILE_PAGES)
+#endif
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_ANON)
+#define nr_inactive_anon_pages() global_node_page_state(NR_INACTIVE_ANON)
+#else
+#define nr_inactive_anon_pages() global_page_state(NR_INACTIVE_ANON)
+#endif
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_INACTIVE_FILE)
+#define nr_inactive_file_pages() global_node_page_state(NR_INACTIVE_FILE)
+#else
+#define nr_inactive_file_pages() global_page_state(NR_INACTIVE_FILE)
+#endif
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B)
+#define nr_slab_reclaimable_pages() \
+	global_node_page_state(NR_SLAB_RECLAIMABLE_B)
+#else
+#if defined(ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE)
+#define nr_slab_reclaimable_pages() global_node_page_state(NR_SLAB_RECLAIMABLE)
+#else
+#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE)
+#endif
+#endif
+
+#else
+
+/* global_page_state() only */
+#define nr_file_pages() global_page_state(NR_FILE_PAGES)
+#define nr_inactive_anon_pages() global_page_state(NR_INACTIVE_ANON)
+#define nr_inactive_file_pages() global_page_state(NR_INACTIVE_FILE)
+#ifdef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B
+#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE_B)
+#else
+#define nr_slab_reclaimable_pages() global_page_state(NR_SLAB_RECLAIMABLE)
+#endif /* ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B */
+
+#endif /* ZFS_GLOBAL_ZONE_PAGE_STATE */
+
+#endif /* _ZFS_PAGE_COMPAT_H */
diff --git a/include/os/linux/kernel/linux/percpu_compat.h b/include/os/linux/kernel/linux/percpu_compat.h
new file mode 100644
index 000000000000..e7a4242c466c
--- /dev/null
+++ b/include/os/linux/kernel/linux/percpu_compat.h
@@ -0,0 +1,44 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2020 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZFS_PERCPU_H
+#define _ZFS_PERCPU_H
+
+#include <linux/percpu_counter.h>
+
+/*
+ * 3.18 API change,
+ * percpu_counter_init() now must be passed a gfp mask which will be
+ * used for the dynamic allocation of the actual counter.
+ */
+#ifdef HAVE_PERCPU_COUNTER_INIT_WITH_GFP
+#define percpu_counter_init_common(counter, n, gfp) \
+	percpu_counter_init(counter, n, gfp)
+#else
+#define percpu_counter_init_common(counter, n, gfp) \
+	percpu_counter_init(counter, n)
+#endif
+
+#endif /* _ZFS_PERCPU_H */
diff --git a/include/os/linux/kernel/linux/simd.h b/include/os/linux/kernel/linux/simd.h
new file mode 100644
index 000000000000..4cde248e200c
--- /dev/null
+++ b/include/os/linux/kernel/linux/simd.h
@@ -0,0 +1,45 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Lawrence Livermore National Security, LLC. + */ + +#ifndef _LINUX_SIMD_H +#define _LINUX_SIMD_H + +#if defined(__x86) +#include + +#elif defined(__aarch64__) +#include + +#elif defined(__powerpc__) +#include +#else + +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif +#endif /* _LINUX_SIMD_H */ diff --git a/include/os/linux/kernel/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h new file mode 100644 index 000000000000..50937e97ced1 --- /dev/null +++ b/include/os/linux/kernel/linux/simd_aarch64.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau . + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + */ + +#ifndef _LINUX_SIMD_AARCH64_H +#define _LINUX_SIMD_AARCH64_H + +#include + +#if defined(__aarch64__) + +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_begin() kernel_neon_begin() +#define kfpu_end() kernel_neon_end() +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif /* __aarch64__ */ + +#endif /* _LINUX_SIMD_AARCH64_H */ diff --git a/include/os/linux/kernel/linux/simd_powerpc.h b/include/os/linux/kernel/linux/simd_powerpc.h new file mode 100644 index 000000000000..108cef22f56f --- /dev/null +++ b/include/os/linux/kernel/linux/simd_powerpc.h @@ -0,0 +1,109 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Romain Dolbeau + * + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. + * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_altivec_available() + */ + +#ifndef _LINUX_SIMD_POWERPC_H +#define _LINUX_SIMD_POWERPC_H + +/* only for __powerpc__ */ +#if defined(__powerpc__) + +#include +#include +#include +#include +#include +#include + +#define kfpu_allowed() 1 +#define kfpu_begin() \ + { \ + preempt_disable(); \ + enable_kernel_altivec(); \ + } +#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 5, 0) +#define kfpu_end() \ + { \ + disable_kernel_altivec(); \ + preempt_enable(); \ + } +#else +/* seems that before 4.5 no-one bothered disabling ... */ +#define kfpu_end() preempt_enable() +#endif +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +/* + * Check if AltiVec instruction set is available + */ +static inline boolean_t +zfs_altivec_available(void) +{ + boolean_t res; + /* suggested by macallan at netbsd dot org */ +#if defined(__powerpc64__) + u64 msr; +#else + u32 msr; +#endif + kfpu_begin(); + __asm volatile("mfmsr %0" : "=r"(msr)); + /* + * 64 bits -> need to check bit 38 + * Power ISA Version 3.0B + * p944 + * 32 bits -> Need to check bit 6 + * AltiVec Technology Programming Environments Manual + * p49 (2-9) + * They are the same, as ppc counts 'backward' ... + */ + res = (msr & 0x2000000) != 0; + kfpu_end(); + return (res); +} +#endif /* defined(__powerpc) */ + +#endif /* _LINUX_SIMD_POWERPC_H */ diff --git a/include/os/linux/kernel/linux/simd_x86.h b/include/os/linux/kernel/linux/simd_x86.h new file mode 100644 index 000000000000..cdd3286d2147 --- /dev/null +++ b/include/os/linux/kernel/linux/simd_x86.h @@ -0,0 +1,646 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Neskovic . + */ + +/* + * USER API: + * + * Kernel fpu methods: + * kfpu_allowed() + * kfpu_begin() + * kfpu_end() + * kfpu_init() + * kfpu_fini() + * + * SIMD support: + * + * Following functions should be called to determine whether CPU feature + * is supported. All functions are usable in kernel and user space. 
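+ *
+ * An illustrative call pattern for a SIMD code path, using only the
+ * helpers declared in this header, is:
+ *
+ *	if (kfpu_allowed() && zfs_avx2_available()) {
+ *		kfpu_begin();
+ *		...	(vectorized work)
+ *		kfpu_end();
+ *	}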
+ * If a SIMD algorithm is using more than one instruction set + * all relevant feature test functions should be called. + * + * Supported features: + * zfs_sse_available() + * zfs_sse2_available() + * zfs_sse3_available() + * zfs_ssse3_available() + * zfs_sse4_1_available() + * zfs_sse4_2_available() + * + * zfs_avx_available() + * zfs_avx2_available() + * + * zfs_bmi1_available() + * zfs_bmi2_available() + * + * zfs_avx512f_available() + * zfs_avx512cd_available() + * zfs_avx512er_available() + * zfs_avx512pf_available() + * zfs_avx512bw_available() + * zfs_avx512dq_available() + * zfs_avx512vl_available() + * zfs_avx512ifma_available() + * zfs_avx512vbmi_available() + * + * NOTE(AVX-512VL): If using AVX-512 instructions with 128Bit registers + * also add zfs_avx512vl_available() to feature check. + */ + +#ifndef _LINUX_SIMD_X86_H +#define _LINUX_SIMD_X86_H + +/* only for __x86 */ +#if defined(__x86) + +#include +#include + +/* + * Disable the WARN_ON_FPU() macro to prevent additional dependencies + * when providing the kfpu_* functions. Relevant warnings are included + * as appropriate and are unconditionally enabled. + */ +#if defined(CONFIG_X86_DEBUG_FPU) && !defined(KERNEL_EXPORTS_X86_FPU) +#undef CONFIG_X86_DEBUG_FPU +#endif + +#if defined(HAVE_KERNEL_FPU_API_HEADER) +#include +#include +#else +#include +#include +#endif + +/* + * The following cases are for kernels which export either the + * kernel_fpu_* or __kernel_fpu_* functions. + */ +#if defined(KERNEL_EXPORTS_X86_FPU) + +#define kfpu_allowed() 1 +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#if defined(HAVE_UNDERSCORE_KERNEL_FPU) +#define kfpu_begin() \ +{ \ + preempt_disable(); \ + __kernel_fpu_begin(); \ +} +#define kfpu_end() \ +{ \ + __kernel_fpu_end(); \ + preempt_enable(); \ +} + +#elif defined(HAVE_KERNEL_FPU) +#define kfpu_begin() kernel_fpu_begin() +#define kfpu_end() kernel_fpu_end() + +#else +/* + * This case is unreachable. When KERNEL_EXPORTS_X86_FPU is defined then + * either HAVE_UNDERSCORE_KERNEL_FPU or HAVE_KERNEL_FPU must be defined. + */ +#error "Unreachable kernel configuration" +#endif + +#else /* defined(KERNEL_EXPORTS_X86_FPU) */ + +/* + * When the kernel_fpu_* symbols are unavailable then provide our own + * versions which allow the FPU to be safely used. + */ +#if defined(HAVE_KERNEL_FPU_INTERNAL) + +#include + +extern union fpregs_state **zfs_kfpu_fpregs; + +/* + * Initialize per-cpu variables to store FPU state. + */ +static inline void +kfpu_fini(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + if (zfs_kfpu_fpregs[cpu] != NULL) { + free_pages((unsigned long)zfs_kfpu_fpregs[cpu], + get_order(sizeof (union fpregs_state))); + } + } + + kfree(zfs_kfpu_fpregs); +} + +static inline int +kfpu_init(void) +{ + zfs_kfpu_fpregs = kzalloc(num_possible_cpus() * + sizeof (union fpregs_state *), GFP_KERNEL); + if (zfs_kfpu_fpregs == NULL) + return (-ENOMEM); + + /* + * The fxsave and xsave operations require 16-/64-byte alignment of + * the target memory. Since kmalloc() provides no alignment + * guarantee instead use alloc_pages_node(). + */ + unsigned int order = get_order(sizeof (union fpregs_state)); + int cpu; + + for_each_possible_cpu(cpu) { + struct page *page = alloc_pages_node(cpu_to_node(cpu), + GFP_KERNEL | __GFP_ZERO, order); + if (page == NULL) { + kfpu_fini(); + return (-ENOMEM); + } + + zfs_kfpu_fpregs[cpu] = page_address(page); + } + + return (0); +} + +#define kfpu_allowed() 1 +#define ex_handler_fprestore ex_handler_default + +/* + * FPU save and restore instructions. 
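+ * An illustrative round trip on an FXSR capable CPU, using the
+ * kfpu_save_fxsr()/kfpu_restore_fxsr() helpers defined below, is:
+ *
+ *	kfpu_save_fxsr(&state->fxsave);
+ *	...
+ *	kfpu_restore_fxsr(&state->fxsave);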
+ */ +#define __asm __asm__ __volatile__ +#define kfpu_fxsave(addr) __asm("fxsave %0" : "=m" (*(addr))) +#define kfpu_fxsaveq(addr) __asm("fxsaveq %0" : "=m" (*(addr))) +#define kfpu_fnsave(addr) __asm("fnsave %0; fwait" : "=m" (*(addr))) +#define kfpu_fxrstor(addr) __asm("fxrstor %0" : : "m" (*(addr))) +#define kfpu_fxrstorq(addr) __asm("fxrstorq %0" : : "m" (*(addr))) +#define kfpu_frstor(addr) __asm("frstor %0" : : "m" (*(addr))) +#define kfpu_fxsr_clean(rval) __asm("fnclex; emms; fildl %P[addr]" \ + : : [addr] "m" (rval)); + +static inline void +kfpu_save_xsave(struct xregs_state *addr, uint64_t mask) +{ + uint32_t low, hi; + int err; + + low = mask; + hi = mask >> 32; + XSTATE_XSAVE(addr, low, hi, err); + WARN_ON_ONCE(err); +} + +static inline void +kfpu_save_fxsr(struct fxregs_state *addr) +{ + if (IS_ENABLED(CONFIG_X86_32)) + kfpu_fxsave(addr); + else + kfpu_fxsaveq(addr); +} + +static inline void +kfpu_save_fsave(struct fregs_state *addr) +{ + kfpu_fnsave(addr); +} + +static inline void +kfpu_begin(void) +{ + /* + * Preemption and interrupts must be disabled for the critical + * region where the FPU state is being modified. + */ + preempt_disable(); + local_irq_disable(); + + /* + * The current FPU registers need to be preserved by kfpu_begin() + * and restored by kfpu_end(). They are stored in a dedicated + * per-cpu variable, not in the task struct, this allows any user + * FPU state to be correctly preserved and restored. + */ + union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()]; + + if (static_cpu_has(X86_FEATURE_XSAVE)) { + kfpu_save_xsave(&state->xsave, ~0); + } else if (static_cpu_has(X86_FEATURE_FXSR)) { + kfpu_save_fxsr(&state->fxsave); + } else { + kfpu_save_fsave(&state->fsave); + } +} + +static inline void +kfpu_restore_xsave(struct xregs_state *addr, uint64_t mask) +{ + uint32_t low, hi; + + low = mask; + hi = mask >> 32; + XSTATE_XRESTORE(addr, low, hi); +} + +static inline void +kfpu_restore_fxsr(struct fxregs_state *addr) +{ + /* + * On AuthenticAMD K7 and K8 processors the fxrstor instruction only + * restores the _x87 FOP, FIP, and FDP registers when an exception + * is pending. Clean the _x87 state to force the restore. + */ + if (unlikely(static_cpu_has_bug(X86_BUG_FXSAVE_LEAK))) + kfpu_fxsr_clean(addr); + + if (IS_ENABLED(CONFIG_X86_32)) { + kfpu_fxrstor(addr); + } else { + kfpu_fxrstorq(addr); + } +} + +static inline void +kfpu_restore_fsave(struct fregs_state *addr) +{ + kfpu_frstor(addr); +} + +static inline void +kfpu_end(void) +{ + union fpregs_state *state = zfs_kfpu_fpregs[smp_processor_id()]; + + if (static_cpu_has(X86_FEATURE_XSAVE)) { + kfpu_restore_xsave(&state->xsave, ~0); + } else if (static_cpu_has(X86_FEATURE_FXSR)) { + kfpu_restore_fxsr(&state->fxsave); + } else { + kfpu_restore_fsave(&state->fsave); + } + + local_irq_enable(); + preempt_enable(); +} + +#else + +/* + * FPU support is unavailable. + */ +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#define kfpu_init() 0 +#define kfpu_fini() ((void) 0) + +#endif /* defined(HAVE_KERNEL_FPU_INTERNAL) */ +#endif /* defined(KERNEL_EXPORTS_X86_FPU) */ + +/* + * Linux kernel provides an interface for CPU feature testing. 
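+ * Feature bits are queried with boot_cpu_has(); for example
+ * (illustrative):
+ *
+ *	if (boot_cpu_has(X86_FEATURE_XMM2))
+ *		...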
+ */ + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + +#if defined(X86_FEATURE_OSXSAVE) + has_osxsave = !!boot_cpu_has(X86_FEATURE_OSXSAVE); +#else + has_osxsave = B_FALSE; +#endif + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM)); +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM2)); +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM3)); +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_SSSE3)); +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM4_1)); +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ + return (!!boot_cpu_has(X86_FEATURE_XMM4_2)); +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + return (boot_cpu_has(X86_FEATURE_AVX) && __ymm_enabled()); +} + +/* + * Check if AVX2 instruction set is available + */ +static inline boolean_t +zfs_avx2_available(void) +{ + return (boot_cpu_has(X86_FEATURE_AVX2) && __ymm_enabled()); +} + +/* + * Check if BMI1 instruction set is available + */ +static inline boolean_t +zfs_bmi1_available(void) +{ +#if defined(X86_FEATURE_BMI1) + return (!!boot_cpu_has(X86_FEATURE_BMI1)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if BMI2 instruction set is available + */ +static inline boolean_t +zfs_bmi2_available(void) +{ +#if defined(X86_FEATURE_BMI2) + return (!!boot_cpu_has(X86_FEATURE_BMI2)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if AES instruction set is available + */ +static inline boolean_t +zfs_aes_available(void) +{ +#if defined(X86_FEATURE_AES) + return (!!boot_cpu_has(X86_FEATURE_AES)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if PCLMULQDQ instruction set is available + */ +static inline boolean_t +zfs_pclmulqdq_available(void) +{ +#if defined(X86_FEATURE_PCLMULQDQ) + return (!!boot_cpu_has(X86_FEATURE_PCLMULQDQ)); +#else + return (B_FALSE); +#endif +} + +/* + * Check if MOVBE instruction is available + */ +static inline boolean_t +zfs_movbe_available(void) +{ +#if defined(X86_FEATURE_MOVBE) + return (!!boot_cpu_has(X86_FEATURE_MOVBE)); +#else + return (B_FALSE); +#endif +} + +/* + * AVX-512 family of instruction sets: + * + * AVX512F Foundation + * AVX512CD Conflict Detection Instructions + * AVX512ER Exponential and Reciprocal Instructions + * AVX512PF Prefetch Instructions + * + * AVX512BW Byte and Word Instructions + * AVX512DQ Double-word and Quadword Instructions + * AVX512VL Vector Length Extensions + * + * AVX512IFMA Integer Fused Multiply Add (Not supported by kernel 4.4) + * AVX512VBMI Vector Byte Manipulation Instructions + */ + +/* 
+ * Check if AVX512F instruction set is available + */ +static inline boolean_t +zfs_avx512f_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512F) + has_avx512 = !!boot_cpu_has(X86_FEATURE_AVX512F); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512CD instruction set is available + */ +static inline boolean_t +zfs_avx512cd_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512CD) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512CD); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512ER instruction set is available + */ +static inline boolean_t +zfs_avx512er_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512ER) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512ER); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512PF instruction set is available + */ +static inline boolean_t +zfs_avx512pf_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512PF) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512PF); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512BW instruction set is available + */ +static inline boolean_t +zfs_avx512bw_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512BW) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512BW); +#endif + + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512DQ instruction set is available + */ +static inline boolean_t +zfs_avx512dq_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512DQ) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512DQ); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512VL instruction set is available + */ +static inline boolean_t +zfs_avx512vl_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512VL) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VL); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512IFMA instruction set is available + */ +static inline boolean_t +zfs_avx512ifma_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512IFMA) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512IFMA); +#endif + return (has_avx512 && __zmm_enabled()); +} + +/* + * Check if AVX512VBMI instruction set is available + */ +static inline boolean_t +zfs_avx512vbmi_available(void) +{ + boolean_t has_avx512 = B_FALSE; + +#if defined(X86_FEATURE_AVX512VBMI) + has_avx512 = boot_cpu_has(X86_FEATURE_AVX512F) && + boot_cpu_has(X86_FEATURE_AVX512VBMI); +#endif + return (has_avx512 && __zmm_enabled()); +} + +#endif /* defined(__x86) */ + +#endif /* _LINUX_SIMD_X86_H */ diff --git a/include/os/linux/kernel/linux/utsname_compat.h b/include/os/linux/kernel/linux/utsname_compat.h new file mode 100644 index 000000000000..88da45cf504e --- /dev/null +++ b/include/os/linux/kernel/linux/utsname_compat.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _ZFS_UTSNAME_H +#define _ZFS_UTSNAME_H + +#include + +typedef struct new_utsname utsname_t; + +#endif /* _ZFS_UTSNAME_H */ diff --git a/include/os/linux/kernel/linux/vfs_compat.h b/include/os/linux/kernel/linux/vfs_compat.h new file mode 100644 index 000000000000..c35e80d31cd7 --- /dev/null +++ b/include/os/linux/kernel/linux/vfs_compat.h @@ -0,0 +1,439 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Copyright (C) 2015 Jörg Thalheim. + */ + +#ifndef _ZFS_VFS_H +#define _ZFS_VFS_H + +#include +#include +#include +#include + +/* + * 2.6.34 - 3.19, bdi_setup_and_register() takes 3 arguments. + * 4.0 - 4.11, bdi_setup_and_register() takes 2 arguments. + * 4.12 - x.y, super_setup_bdi_name() new interface. 
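+ *
+ * All three cases are hidden behind the zpl_bdi_setup() and
+ * zpl_bdi_destroy() wrappers below; an illustrative mount/umount
+ * sequence is:
+ *
+ *	error = zpl_bdi_setup(sb, "zfs");
+ *	...
+ *	zpl_bdi_destroy(sb);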
+ */
+#if defined(HAVE_SUPER_SETUP_BDI_NAME)
+extern atomic_long_t zfs_bdi_seq;
+
+static inline int
+zpl_bdi_setup(struct super_block *sb, char *name)
+{
+	return super_setup_bdi_name(sb, "%.28s-%ld", name,
+	    atomic_long_inc_return(&zfs_bdi_seq));
+}
+static inline void
+zpl_bdi_destroy(struct super_block *sb)
+{
+}
+#elif defined(HAVE_2ARGS_BDI_SETUP_AND_REGISTER)
+static inline int
+zpl_bdi_setup(struct super_block *sb, char *name)
+{
+	struct backing_dev_info *bdi;
+	int error;
+
+	bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP);
+	error = bdi_setup_and_register(bdi, name);
+	if (error) {
+		kmem_free(bdi, sizeof (struct backing_dev_info));
+		return (error);
+	}
+
+	sb->s_bdi = bdi;
+
+	return (0);
+}
+static inline void
+zpl_bdi_destroy(struct super_block *sb)
+{
+	struct backing_dev_info *bdi = sb->s_bdi;
+
+	bdi_destroy(bdi);
+	kmem_free(bdi, sizeof (struct backing_dev_info));
+	sb->s_bdi = NULL;
+}
+#elif defined(HAVE_3ARGS_BDI_SETUP_AND_REGISTER)
+static inline int
+zpl_bdi_setup(struct super_block *sb, char *name)
+{
+	struct backing_dev_info *bdi;
+	int error;
+
+	bdi = kmem_zalloc(sizeof (struct backing_dev_info), KM_SLEEP);
+	error = bdi_setup_and_register(bdi, name, BDI_CAP_MAP_COPY);
+	if (error) {
+		kmem_free(bdi, sizeof (struct backing_dev_info));
+		return (error);
+	}
+
+	sb->s_bdi = bdi;
+
+	return (0);
+}
+static inline void
+zpl_bdi_destroy(struct super_block *sb)
+{
+	struct backing_dev_info *bdi = sb->s_bdi;
+
+	bdi_destroy(bdi);
+	kmem_free(bdi, sizeof (struct backing_dev_info));
+	sb->s_bdi = NULL;
+}
+#else
+#error "Unsupported kernel"
+#endif
+
+/*
+ * 4.14 adds SB_* flag definitions, define them to MS_* equivalents
+ * if not set.
+ */
+#ifndef SB_RDONLY
+#define SB_RDONLY	MS_RDONLY
+#endif
+
+#ifndef SB_SILENT
+#define SB_SILENT	MS_SILENT
+#endif
+
+#ifndef SB_ACTIVE
+#define SB_ACTIVE	MS_ACTIVE
+#endif
+
+#ifndef SB_POSIXACL
+#define SB_POSIXACL	MS_POSIXACL
+#endif
+
+#ifndef SB_MANDLOCK
+#define SB_MANDLOCK	MS_MANDLOCK
+#endif
+
+#ifndef SB_NOATIME
+#define SB_NOATIME	MS_NOATIME
+#endif
+
+/*
+ * 3.5 API change,
+ * The clear_inode() function replaces end_writeback() and introduces an
+ * ordering change regarding when the inode_sync_wait() occurs.  See the
+ * configure check in config/kernel-clear-inode.m4 for full details.
+ */
+#if defined(HAVE_EVICT_INODE) && !defined(HAVE_CLEAR_INODE)
+#define clear_inode(ip)	end_writeback(ip)
+#endif /* HAVE_EVICT_INODE && !HAVE_CLEAR_INODE */
+
+#if defined(SEEK_HOLE) && defined(SEEK_DATA) && !defined(HAVE_LSEEK_EXECUTE)
+static inline loff_t
+lseek_execute(
+	struct file *filp,
+	struct inode *inode,
+	loff_t offset,
+	loff_t maxsize)
+{
+	if (offset < 0 && !(filp->f_mode & FMODE_UNSIGNED_OFFSET))
+		return (-EINVAL);
+
+	if (offset > maxsize)
+		return (-EINVAL);
+
+	if (offset != filp->f_pos) {
+		spin_lock(&filp->f_lock);
+		filp->f_pos = offset;
+		filp->f_version = 0;
+		spin_unlock(&filp->f_lock);
+	}
+
+	return (offset);
+}
+#endif /* SEEK_HOLE && SEEK_DATA && !HAVE_LSEEK_EXECUTE */
+
+#if defined(CONFIG_FS_POSIX_ACL)
+/*
+ * These functions safely approximate the behavior of posix_acl_release()
+ * which cannot be used because it calls the GPL-only symbol kfree_rcu().
+ * The in-kernel version, which can access the RCU, frees the ACLs after
+ * the grace period expires.  Because we're unsure how long that grace
+ * period may be, this implementation conservatively delays for 60 seconds.
+ * This is several orders of magnitude larger than the expected grace period.
+ * At 60 seconds the kernel will also begin issuing RCU stall warnings.
+ */
+
+#include <linux/posix_acl.h>
+
+#if defined(HAVE_POSIX_ACL_RELEASE) && !defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY)
+#define zpl_posix_acl_release(arg)	posix_acl_release(arg)
+#else
+void zpl_posix_acl_release_impl(struct posix_acl *);
+
+static inline void
+zpl_posix_acl_release(struct posix_acl *acl)
+{
+	if ((acl == NULL) || (acl == ACL_NOT_CACHED))
+		return;
+#ifdef HAVE_ACL_REFCOUNT
+	if (refcount_dec_and_test(&acl->a_refcount))
+		zpl_posix_acl_release_impl(acl);
+#else
+	if (atomic_dec_and_test(&acl->a_refcount))
+		zpl_posix_acl_release_impl(acl);
+#endif
+}
+#endif /* HAVE_POSIX_ACL_RELEASE */
+
+#ifdef HAVE_SET_CACHED_ACL_USABLE
+#define zpl_set_cached_acl(ip, ty, n)	set_cached_acl(ip, ty, n)
+#define zpl_forget_cached_acl(ip, ty)	forget_cached_acl(ip, ty)
+#else
+static inline void
+zpl_set_cached_acl(struct inode *ip, int type, struct posix_acl *newer)
+{
+	struct posix_acl *older = NULL;
+
+	spin_lock(&ip->i_lock);
+
+	if ((newer != ACL_NOT_CACHED) && (newer != NULL))
+		posix_acl_dup(newer);
+
+	switch (type) {
+	case ACL_TYPE_ACCESS:
+		older = ip->i_acl;
+		rcu_assign_pointer(ip->i_acl, newer);
+		break;
+	case ACL_TYPE_DEFAULT:
+		older = ip->i_default_acl;
+		rcu_assign_pointer(ip->i_default_acl, newer);
+		break;
+	}
+
+	spin_unlock(&ip->i_lock);
+
+	zpl_posix_acl_release(older);
+}
+
+static inline void
+zpl_forget_cached_acl(struct inode *ip, int type)
+{
+	zpl_set_cached_acl(ip, type, (struct posix_acl *)ACL_NOT_CACHED);
+}
+#endif /* HAVE_SET_CACHED_ACL_USABLE */
+
+/*
+ * 3.1 API change,
+ * posix_acl_chmod() was added as the preferred interface.
+ *
+ * 3.14 API change,
+ * posix_acl_chmod() was changed to __posix_acl_chmod()
+ */
+#ifndef HAVE___POSIX_ACL_CHMOD
+#ifdef HAVE_POSIX_ACL_CHMOD
+#define __posix_acl_chmod(acl, gfp, mode)	posix_acl_chmod(acl, gfp, mode)
+#define __posix_acl_create(acl, gfp, mode)	posix_acl_create(acl, gfp, mode)
+#else
+#error "Unsupported kernel"
+#endif /* HAVE_POSIX_ACL_CHMOD */
+#endif /* HAVE___POSIX_ACL_CHMOD */
+
+/*
+ * 4.8 API change,
+ * posix_acl_valid() now must be passed a namespace; the namespace from
+ * the super block associated with the given inode is used for this purpose.
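+ *
+ * An illustrative call via the wrapper defined below:
+ *
+ *	error = zpl_posix_acl_valid(ip, acl);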
+ */ +#ifdef HAVE_POSIX_ACL_VALID_WITH_NS +#define zpl_posix_acl_valid(ip, acl) posix_acl_valid(ip->i_sb->s_user_ns, acl) +#else +#define zpl_posix_acl_valid(ip, acl) posix_acl_valid(acl) +#endif + +#endif /* CONFIG_FS_POSIX_ACL */ + +/* + * 3.19 API change + * struct access f->f_dentry->d_inode was replaced by accessor function + * file_inode(f) + */ +#ifndef HAVE_FILE_INODE +static inline struct inode *file_inode(const struct file *f) +{ + return (f->f_dentry->d_inode); +} +#endif /* HAVE_FILE_INODE */ + +/* + * 4.1 API change + * struct access file->f_path.dentry was replaced by accessor function + * file_dentry(f) + */ +#ifndef HAVE_FILE_DENTRY +static inline struct dentry *file_dentry(const struct file *f) +{ + return (f->f_path.dentry); +} +#endif /* HAVE_FILE_DENTRY */ + +static inline uid_t zfs_uid_read_impl(struct inode *ip) +{ +#ifdef HAVE_SUPER_USER_NS + return (from_kuid(ip->i_sb->s_user_ns, ip->i_uid)); +#else + return (from_kuid(kcred->user_ns, ip->i_uid)); +#endif +} + +static inline uid_t zfs_uid_read(struct inode *ip) +{ + return (zfs_uid_read_impl(ip)); +} + +static inline gid_t zfs_gid_read_impl(struct inode *ip) +{ +#ifdef HAVE_SUPER_USER_NS + return (from_kgid(ip->i_sb->s_user_ns, ip->i_gid)); +#else + return (from_kgid(kcred->user_ns, ip->i_gid)); +#endif +} + +static inline gid_t zfs_gid_read(struct inode *ip) +{ + return (zfs_gid_read_impl(ip)); +} + +static inline void zfs_uid_write(struct inode *ip, uid_t uid) +{ +#ifdef HAVE_SUPER_USER_NS + ip->i_uid = make_kuid(ip->i_sb->s_user_ns, uid); +#else + ip->i_uid = make_kuid(kcred->user_ns, uid); +#endif +} + +static inline void zfs_gid_write(struct inode *ip, gid_t gid) +{ +#ifdef HAVE_SUPER_USER_NS + ip->i_gid = make_kgid(ip->i_sb->s_user_ns, gid); +#else + ip->i_gid = make_kgid(kcred->user_ns, gid); +#endif +} + +/* + * 4.9 API change + */ +#ifndef HAVE_SETATTR_PREPARE +static inline int +setattr_prepare(struct dentry *dentry, struct iattr *ia) +{ + return (inode_change_ok(dentry->d_inode, ia)); +} +#endif + +/* + * 4.11 API change + * These macros are defined by kernel 4.11. We define them so that the same + * code builds under kernels < 4.11 and >= 4.11. The macros are set to 0 so + * that it will create obvious failures if they are accidentally used when built + * against a kernel >= 4.11. + */ + +#ifndef STATX_BASIC_STATS +#define STATX_BASIC_STATS 0 +#endif + +#ifndef AT_STATX_SYNC_AS_STAT +#define AT_STATX_SYNC_AS_STAT 0 +#endif + +/* + * 4.11 API change + * 4.11 takes struct path *, < 4.11 takes vfsmount * + */ + +#ifdef HAVE_VFSMOUNT_IOPS_GETATTR +#define ZPL_GETATTR_WRAPPER(func) \ +static int \ +func(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) \ +{ \ + struct path path = { .mnt = mnt, .dentry = dentry }; \ + return func##_impl(&path, stat, STATX_BASIC_STATS, \ + AT_STATX_SYNC_AS_STAT); \ +} +#elif defined(HAVE_PATH_IOPS_GETATTR) +#define ZPL_GETATTR_WRAPPER(func) \ +static int \ +func(const struct path *path, struct kstat *stat, u32 request_mask, \ + unsigned int query_flags) \ +{ \ + return (func##_impl(path, stat, request_mask, query_flags)); \ +} +#else +#error +#endif + +/* + * 4.9 API change + * Preferred interface to get the current FS time. + */ +#if !defined(HAVE_CURRENT_TIME) +static inline struct timespec +current_time(struct inode *ip) +{ + return (timespec_trunc(current_kernel_time(), ip->i_sb->s_time_gran)); +} +#endif + +/* + * 4.16 API change + * Added iversion interface for managing inode version field. 
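+ *
+ * An illustrative use when a new inode is initialized:
+ *
+ *	inode_set_iversion(ip, 1);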
+ */ +#ifdef HAVE_INODE_SET_IVERSION +#include +#else +static inline void +inode_set_iversion(struct inode *ip, u64 val) +{ + ip->i_version = val; +} +#endif + +/* + * Returns true when called in the context of a 32-bit system call. + */ +static inline int +zpl_is_32bit_api(void) +{ +#ifdef CONFIG_COMPAT +#ifdef HAVE_IN_COMPAT_SYSCALL + return (in_compat_syscall()); +#else + return (is_compat_task()); +#endif +#else + return (BITS_PER_LONG == 32); +#endif +} + +#endif /* _ZFS_VFS_H */ diff --git a/include/os/linux/kernel/linux/xattr_compat.h b/include/os/linux/kernel/linux/xattr_compat.h new file mode 100644 index 000000000000..8348e99198af --- /dev/null +++ b/include/os/linux/kernel/linux/xattr_compat.h @@ -0,0 +1,184 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + */ + +#ifndef _ZFS_XATTR_H +#define _ZFS_XATTR_H + +#include + +/* + * 2.6.35 API change, + * The const keyword was added to the 'struct xattr_handler' in the + * generic Linux super_block structure. To handle this we define an + * appropriate xattr_handler_t typedef which can be used. This was + * the preferred solution because it keeps the code clean and readable. + */ +typedef const struct xattr_handler xattr_handler_t; + +/* + * 4.5 API change, + */ +#if defined(HAVE_XATTR_LIST_SIMPLE) +#define ZPL_XATTR_LIST_WRAPPER(fn) \ +static bool \ +fn(struct dentry *dentry) \ +{ \ + return (!!__ ## fn(dentry->d_inode, NULL, 0, NULL, 0)); \ +} +/* + * 4.4 API change, + */ +#elif defined(HAVE_XATTR_LIST_DENTRY) +#define ZPL_XATTR_LIST_WRAPPER(fn) \ +static size_t \ +fn(struct dentry *dentry, char *list, size_t list_size, \ + const char *name, size_t name_len, int type) \ +{ \ + return (__ ## fn(dentry->d_inode, \ + list, list_size, name, name_len)); \ +} +/* + * 2.6.33 API change, + */ +#elif defined(HAVE_XATTR_LIST_HANDLER) +#define ZPL_XATTR_LIST_WRAPPER(fn) \ +static size_t \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + char *list, size_t list_size, const char *name, size_t name_len) \ +{ \ + return (__ ## fn(dentry->d_inode, \ + list, list_size, name, name_len)); \ +} +#else +#error "Unsupported kernel" +#endif + +/* + * 4.7 API change, + * The xattr_handler->get() callback was changed to take a both dentry and + * inode, because the dentry might not be attached to an inode yet. 
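+ *
+ * The ZPL_XATTR_GET_WRAPPER() variants below generate the signature the
+ * running kernel expects from a common __fn(inode, name, buffer, size)
+ * implementation; an illustrative (hypothetical) consumer is:
+ *
+ *	ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get);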
+ */ +#if defined(HAVE_XATTR_GET_DENTRY_INODE) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + struct inode *inode, const char *name, void *buffer, size_t size) \ +{ \ + return (__ ## fn(inode, name, buffer, size)); \ +} +/* + * 4.4 API change, + * The xattr_handler->get() callback was changed to take a xattr_handler, + * and handler_flags argument was removed and should be accessed by + * handler->flags. + */ +#elif defined(HAVE_XATTR_GET_HANDLER) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + const char *name, void *buffer, size_t size) \ +{ \ + return (__ ## fn(dentry->d_inode, name, buffer, size)); \ +} +/* + * 2.6.33 API change, + * The xattr_handler->get() callback was changed to take a dentry + * instead of an inode, and a handler_flags argument was added. + */ +#elif defined(HAVE_XATTR_GET_DENTRY) +#define ZPL_XATTR_GET_WRAPPER(fn) \ +static int \ +fn(struct dentry *dentry, const char *name, void *buffer, size_t size, \ + int unused_handler_flags) \ +{ \ + return (__ ## fn(dentry->d_inode, name, buffer, size)); \ +} +#else +#error "Unsupported kernel" +#endif + +/* + * 4.7 API change, + * The xattr_handler->set() callback was changed to take a both dentry and + * inode, because the dentry might not be attached to an inode yet. + */ +#if defined(HAVE_XATTR_SET_DENTRY_INODE) +#define ZPL_XATTR_SET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + struct inode *inode, const char *name, const void *buffer, \ + size_t size, int flags) \ +{ \ + return (__ ## fn(inode, name, buffer, size, flags)); \ +} +/* + * 4.4 API change, + * The xattr_handler->set() callback was changed to take a xattr_handler, + * and handler_flags argument was removed and should be accessed by + * handler->flags. + */ +#elif defined(HAVE_XATTR_SET_HANDLER) +#define ZPL_XATTR_SET_WRAPPER(fn) \ +static int \ +fn(const struct xattr_handler *handler, struct dentry *dentry, \ + const char *name, const void *buffer, size_t size, int flags) \ +{ \ + return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ +} +/* + * 2.6.33 API change, + * The xattr_handler->set() callback was changed to take a dentry + * instead of an inode, and a handler_flags argument was added. + */ +#elif defined(HAVE_XATTR_SET_DENTRY) +#define ZPL_XATTR_SET_WRAPPER(fn) \ +static int \ +fn(struct dentry *dentry, const char *name, const void *buffer, \ + size_t size, int flags, int unused_handler_flags) \ +{ \ + return (__ ## fn(dentry->d_inode, name, buffer, size, flags)); \ +} +#else +#error "Unsupported kernel" +#endif + +/* + * Linux 3.7 API change. posix_acl_{from,to}_xattr gained the user_ns + * parameter. All callers are expected to pass the &init_user_ns which + * is available through the init credential (kcred). 
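+ *
+ * An illustrative round trip using the wrappers defined below:
+ *
+ *	acl = zpl_acl_from_xattr(value, size);
+ *	error = zpl_acl_to_xattr(acl, value, size);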
+ */ +static inline struct posix_acl * +zpl_acl_from_xattr(const void *value, int size) +{ + return (posix_acl_from_xattr(kcred->user_ns, value, size)); +} + +static inline int +zpl_acl_to_xattr(struct posix_acl *acl, void *value, int size) +{ + return (posix_acl_to_xattr(kcred->user_ns, acl, value, size)); +} + +#endif /* _ZFS_XATTR_H */ diff --git a/include/os/linux/spl/Makefile.am b/include/os/linux/spl/Makefile.am new file mode 100644 index 000000000000..bd781c08f143 --- /dev/null +++ b/include/os/linux/spl/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = rpc sys diff --git a/include/os/linux/spl/rpc/Makefile.am b/include/os/linux/spl/rpc/Makefile.am new file mode 100644 index 000000000000..13d804fce9b6 --- /dev/null +++ b/include/os/linux/spl/rpc/Makefile.am @@ -0,0 +1,7 @@ +KERNEL_H = \ + xdr.h + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/spl/rpc +kernel_HEADERS = $(KERNEL_H) +endif diff --git a/include/os/linux/spl/rpc/xdr.h b/include/os/linux/spl/rpc/xdr.h new file mode 100644 index 000000000000..0b39b46cf6a2 --- /dev/null +++ b/include/os/linux/spl/rpc/xdr.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2008 Sun Microsystems, Inc. + * Written by Ricardo Correia + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_RPC_XDR_H +#define _SPL_RPC_XDR_H + +#include + +typedef int bool_t; + +/* + * XDR enums and types. + */ +enum xdr_op { + XDR_ENCODE, + XDR_DECODE +}; + +struct xdr_ops; + +typedef struct { + struct xdr_ops *x_ops; /* Let caller know xdrmem_create() succeeds */ + caddr_t x_addr; /* Current buffer addr */ + caddr_t x_addr_end; /* End of the buffer */ + enum xdr_op x_op; /* Stream direction */ +} XDR; + +typedef bool_t (*xdrproc_t)(XDR *xdrs, void *ptr); + +struct xdr_ops { + bool_t (*xdr_control)(XDR *, int, void *); + + bool_t (*xdr_char)(XDR *, char *); + bool_t (*xdr_u_short)(XDR *, unsigned short *); + bool_t (*xdr_u_int)(XDR *, unsigned *); + bool_t (*xdr_u_longlong_t)(XDR *, u_longlong_t *); + + bool_t (*xdr_opaque)(XDR *, caddr_t, const uint_t); + bool_t (*xdr_string)(XDR *, char **, const uint_t); + bool_t (*xdr_array)(XDR *, caddr_t *, uint_t *, const uint_t, + const uint_t, const xdrproc_t); +}; + +/* + * XDR control operator. + */ +#define XDR_GET_BYTES_AVAIL 1 + +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; + +/* + * XDR functions. + */ +void xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op); + +/* Currently not needed. If needed later, we'll add it to struct xdr_ops */ +#define xdr_destroy(xdrs) ((void) 0) + +#define xdr_control(xdrs, req, info) \ + (xdrs)->x_ops->xdr_control((xdrs), (req), (info)) + +/* + * For precaution, the following are defined as static inlines instead of macros + * to get some amount of type safety. 
+ * + * Also, macros wouldn't work in the case where typecasting is done, because it + * must be possible to reference the functions' addresses by these names. + */ +static inline bool_t xdr_char(XDR *xdrs, char *cp) +{ + return (xdrs->x_ops->xdr_char(xdrs, cp)); +} + +static inline bool_t xdr_u_short(XDR *xdrs, unsigned short *usp) +{ + return (xdrs->x_ops->xdr_u_short(xdrs, usp)); +} + +static inline bool_t xdr_short(XDR *xdrs, short *sp) +{ + BUILD_BUG_ON(sizeof (short) != 2); + return (xdrs->x_ops->xdr_u_short(xdrs, (unsigned short *) sp)); +} + +static inline bool_t xdr_u_int(XDR *xdrs, unsigned *up) +{ + return (xdrs->x_ops->xdr_u_int(xdrs, up)); +} + +static inline bool_t xdr_int(XDR *xdrs, int *ip) +{ + BUILD_BUG_ON(sizeof (int) != 4); + return (xdrs->x_ops->xdr_u_int(xdrs, (unsigned *)ip)); +} + +static inline bool_t xdr_u_longlong_t(XDR *xdrs, u_longlong_t *ullp) +{ + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, ullp)); +} + +static inline bool_t xdr_longlong_t(XDR *xdrs, longlong_t *llp) +{ + BUILD_BUG_ON(sizeof (longlong_t) != 8); + return (xdrs->x_ops->xdr_u_longlong_t(xdrs, (u_longlong_t *)llp)); +} + +/* + * Fixed-length opaque data. + */ +static inline bool_t xdr_opaque(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + return (xdrs->x_ops->xdr_opaque(xdrs, cp, cnt)); +} + +/* + * Variable-length string. + * The *sp buffer must have (maxsize + 1) bytes. + */ +static inline bool_t xdr_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + return (xdrs->x_ops->xdr_string(xdrs, sp, maxsize)); +} + +/* + * Variable-length arrays. + */ +static inline bool_t xdr_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, + const uint_t maxsize, const uint_t elsize, const xdrproc_t elproc) +{ + return xdrs->x_ops->xdr_array(xdrs, arrp, sizep, maxsize, elsize, + elproc); +} + +#endif /* SPL_RPC_XDR_H */ diff --git a/include/os/linux/spl/sys/Makefile.am b/include/os/linux/spl/sys/Makefile.am new file mode 100644 index 000000000000..0fd4cd37a7ec --- /dev/null +++ b/include/os/linux/spl/sys/Makefile.am @@ -0,0 +1,64 @@ +KERNEL_H = \ + acl.h \ + atomic.h \ + byteorder.h \ + callb.h \ + callo.h \ + cmn_err.h \ + condvar.h \ + console.h \ + cred.h \ + ctype.h \ + debug.h \ + disp.h \ + dkio.h \ + errno.h \ + fcntl.h \ + file.h \ + inttypes.h \ + isa_defs.h \ + kmem_cache.h \ + kmem.h \ + kstat.h \ + list.h \ + mod_os.h \ + mutex.h \ + param.h \ + processor.h \ + proc.h \ + procfs_list.h \ + random.h \ + rwlock.h \ + shrinker.h \ + sid.h \ + signal.h \ + simd.h \ + stat.h \ + strings.h \ + sunddi.h \ + sysmacros.h \ + systeminfo.h \ + taskq.h \ + thread.h \ + time.h \ + timer.h \ + trace.h \ + trace_spl.h \ + trace_taskq.h \ + tsd.h \ + types32.h \ + types.h \ + uio.h \ + user.h \ + vfs.h \ + vmem.h \ + vmsystm.h \ + vnode.h \ + wait.h \ + zmod.h \ + zone.h + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/spl/sys +kernel_HEADERS = $(KERNEL_H) +endif diff --git a/include/os/linux/spl/sys/acl.h b/include/os/linux/spl/sys/acl.h new file mode 100644 index 000000000000..9fc79c025caf --- /dev/null +++ b/include/os/linux/spl/sys/acl.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . 
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_ACL_H +#define _SPL_ACL_H + +#include + +typedef struct ace { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; +} ace_t; + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#define MAX_ACL_ENTRIES 1024 + +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED|ACL_DEFAULTED) + +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE|ACE_IDENTIFIER_GROUP) + +/* BEGIN CSTYLED */ +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + 
ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) +/* END CSTYLED */ + +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 + +#endif /* _SPL_ACL_H */ diff --git a/include/os/linux/spl/sys/atomic.h b/include/os/linux/spl/sys/atomic.h new file mode 100644 index 000000000000..51b5479235ab --- /dev/null +++ b/include/os/linux/spl/sys/atomic.h @@ -0,0 +1,79 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_ATOMIC_H +#define _SPL_ATOMIC_H + +#include +#include +#include + +/* + * Map the atomic_* functions to the Linux counterparts. This relies on the + * fact that the atomic types are internally really a uint32 or uint64. If + * this were to change an alternate approach would be needed. + * + * N.B. Due to the limitations of the original API atomicity is not strictly + * preserved when using the 64-bit functions on a 32-bit system. In order + * to support this all consumers would need to be updated to use the Linux + * provided atomic_t and atomic64_t types. 
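+ *
+ * An illustrative sketch of the intended use on a plain integer (the
+ * `refcount' variable and the cleanup call are hypothetical, not part
+ * of this header):
+ *
+ *	volatile uint64_t refcount = 0;
+ *
+ *	atomic_inc_64(&refcount);
+ *	if (atomic_dec_64_nv(&refcount) == 0)
+ *		cleanup();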
+ */ +#define atomic_inc_32(v) atomic_inc((atomic_t *)(v)) +#define atomic_dec_32(v) atomic_dec((atomic_t *)(v)) +#define atomic_add_32(v, i) atomic_add((i), (atomic_t *)(v)) +#define atomic_sub_32(v, i) atomic_sub((i), (atomic_t *)(v)) +#define atomic_inc_32_nv(v) atomic_inc_return((atomic_t *)(v)) +#define atomic_dec_32_nv(v) atomic_dec_return((atomic_t *)(v)) +#define atomic_add_32_nv(v, i) atomic_add_return((i), (atomic_t *)(v)) +#define atomic_sub_32_nv(v, i) atomic_sub_return((i), (atomic_t *)(v)) +#define atomic_cas_32(v, x, y) atomic_cmpxchg((atomic_t *)(v), x, y) +#define atomic_swap_32(v, x) atomic_xchg((atomic_t *)(v), x) +#define atomic_inc_64(v) atomic64_inc((atomic64_t *)(v)) +#define atomic_dec_64(v) atomic64_dec((atomic64_t *)(v)) +#define atomic_add_64(v, i) atomic64_add((i), (atomic64_t *)(v)) +#define atomic_sub_64(v, i) atomic64_sub((i), (atomic64_t *)(v)) +#define atomic_inc_64_nv(v) atomic64_inc_return((atomic64_t *)(v)) +#define atomic_dec_64_nv(v) atomic64_dec_return((atomic64_t *)(v)) +#define atomic_add_64_nv(v, i) atomic64_add_return((i), (atomic64_t *)(v)) +#define atomic_sub_64_nv(v, i) atomic64_sub_return((i), (atomic64_t *)(v)) +#define atomic_cas_64(v, x, y) atomic64_cmpxchg((atomic64_t *)(v), x, y) +#define atomic_swap_64(v, x) atomic64_xchg((atomic64_t *)(v), x) + +#ifdef _LP64 +static __inline__ void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_64((volatile uint64_t *)target, + (uint64_t)cmp, (uint64_t)newval)); +} +#else /* _LP64 */ +static __inline__ void * +atomic_cas_ptr(volatile void *target, void *cmp, void *newval) +{ + return ((void *)atomic_cas_32((volatile uint32_t *)target, + (uint32_t)cmp, (uint32_t)newval)); +} +#endif /* _LP64 */ + +#endif /* _SPL_ATOMIC_H */ diff --git a/include/os/linux/spl/sys/byteorder.h b/include/os/linux/spl/sys/byteorder.h new file mode 100644 index 000000000000..70847edbc8a9 --- /dev/null +++ b/include/os/linux/spl/sys/byteorder.h @@ -0,0 +1,87 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_BYTEORDER_H +#define _SPL_BYTEORDER_H + +#include + +#if defined(__BIG_ENDIAN) && !defined(_ZFS_BIG_ENDIAN) +#define _ZFS_BIG_ENDIAN +#endif + +#if defined(__LITTLE_ENDIAN) && !defined(_ZFS_LITTLE_ENDIAN) +#define _ZFS_LITTLE_ENDIAN +#endif + +#include + +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define LE_16(x) cpu_to_le16(x) +#define LE_32(x) cpu_to_le32(x) +#define LE_64(x) cpu_to_le64(x) +#define BE_16(x) cpu_to_be16(x) +#define BE_32(x) cpu_to_be32(x) +#define BE_64(x) cpu_to_be64(x) + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#ifdef _ZFS_BIG_ENDIAN +static __inline__ uint64_t +htonll(uint64_t n) +{ + return (n); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return (n); +} +#else +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#endif + +#endif /* SPL_BYTEORDER_H */ diff --git a/include/os/linux/spl/sys/callb.h b/include/os/linux/spl/sys/callb.h new file mode 100644 index 000000000000..f1826bfd353a --- /dev/null +++ b/include/os/linux/spl/sys/callb.h @@ -0,0 +1,54 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_CALLB_H +#define _SPL_CALLB_H + +#include +#include + +#define CALLB_CPR_ASSERT(cp) ASSERT(MUTEX_HELD((cp)->cc_lockp)); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + CALLB_CPR_ASSERT(cp); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + +#endif /* _SPL_CALLB_H */ diff --git a/include/os/linux/spl/sys/callo.h b/include/os/linux/spl/sys/callo.h new file mode 100644 index 000000000000..c43ac92e7c32 --- /dev/null +++ b/include/os/linux/spl/sys/callo.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2007-2013 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . 
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CALLO_H
+#define _SPL_CALLO_H
+
+/*
+ * Callout flags:
+ *
+ * CALLOUT_FLAG_ROUNDUP
+ *	Round up the expiration time to the next resolution boundary.
+ *	If this flag is not specified, the expiration time is rounded down.
+ * CALLOUT_FLAG_ABSOLUTE
+ *	Normally, the expiration passed to the timeout API functions is an
+ *	expiration interval. If this flag is specified, then it is
+ *	interpreted as the expiration time itself.
+ * CALLOUT_FLAG_HRESTIME
+ *	Normally, callouts are not affected by changes to system time
+ *	(hrestime). This flag is used to create a callout that is affected
+ *	by system time. If system time changes, these timers must be
+ *	handled in a special way (see callout.c). These are used by condition
+ *	variables and LWP timers that need this behavior.
+ * CALLOUT_FLAG_32BIT
+ *	Legacy interfaces timeout() and realtime_timeout() pass this flag
+ *	to timeout_generic() to indicate that a 32-bit ID should be allocated.
+ */
+#define CALLOUT_FLAG_ROUNDUP 0x1
+#define CALLOUT_FLAG_ABSOLUTE 0x2
+#define CALLOUT_FLAG_HRESTIME 0x4
+#define CALLOUT_FLAG_32BIT 0x8
+
+#endif /* _SPL_CALLO_H */
diff --git a/include/os/linux/spl/sys/cmn_err.h b/include/os/linux/spl/sys/cmn_err.h
new file mode 100644
index 000000000000..be57358b0a8a
--- /dev/null
+++ b/include/os/linux/spl/sys/cmn_err.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CMN_ERR_H
+#define _SPL_CMN_ERR_H
+
+#include
+
+#define CE_CONT 0 /* continuation */
+#define CE_NOTE 1 /* notice */
+#define CE_WARN 2 /* warning */
+#define CE_PANIC 3 /* panic */
+#define CE_IGNORE 4 /* print nothing */
+
+extern void cmn_err(int, const char *, ...);
+extern void vcmn_err(int, const char *, va_list);
+extern void vpanic(const char *, va_list);
+
+#define fm_panic panic
+
+#endif /* _SPL_CMN_ERR_H */
diff --git a/include/os/linux/spl/sys/condvar.h b/include/os/linux/spl/sys/condvar.h
new file mode 100644
index 000000000000..22408824f85b
--- /dev/null
+++ b/include/os/linux/spl/sys/condvar.h
@@ -0,0 +1,113 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_CONDVAR_H
+#define _SPL_CONDVAR_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * cv_timedwait() is similar to cv_wait() except that it additionally expects
+ * a timeout value specified in ticks. When woken by cv_signal() or
+ * cv_broadcast() it returns 1; otherwise, when the timeout is reached, -1 is
+ * returned.
+ *
+ * cv_timedwait_sig() behaves the same as cv_timedwait() but blocks
+ * interruptibly and can be woken by a signal (EINTR, ERESTART). When
+ * this occurs 0 is returned.
+ *
+ * cv_timedwait_io() and cv_timedwait_sig_io() are variants of cv_timedwait()
+ * and cv_timedwait_sig() which should be used when waiting for outstanding
+ * IO to complete. They are responsible for updating the iowait accounting
+ * when this is supported by the platform.
+ *
+ * cv_timedwait_hires() and cv_timedwait_sig_hires() are high resolution
+ * versions of cv_timedwait() and cv_timedwait_sig(). They expect the timeout
+ * to be specified as a hrtime_t, allowing for timeouts of less than a tick.
+ *
+ * N.B. The return values differ slightly from the illumos implementation,
+ * which returns the time remaining, instead of 1, when woken. Both
+ * return -1 on timeout. Consumers which need to know the time remaining
+ * are responsible for tracking it themselves.
+ */
+
+
+/*
+ * The kcondvar_t structure is protected by a mutex which the caller must
+ * acquire before invoking any of the wait or signal functions; the same
+ * mutex is passed in to the wait functions.
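+ *
+ * A minimal wait-loop sketch (hypothetical consumer code; `lock', `cv',
+ * and `condition' are assumed to be initialized elsewhere):
+ *
+ *	mutex_enter(&lock);
+ *	while (!condition)
+ *		cv_wait(&cv, &lock);
+ *	mutex_exit(&lock);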
+ */ +#define CV_MAGIC 0x346545f4 +#define CV_DESTROY 0x346545f5 + +typedef struct { + int cv_magic; + spl_wait_queue_head_t cv_event; + spl_wait_queue_head_t cv_destroy; + atomic_t cv_refs; + atomic_t cv_waiters; + kmutex_t *cv_mutex; +} kcondvar_t; + +typedef enum { CV_DEFAULT = 0, CV_DRIVER } kcv_type_t; + +extern void __cv_init(kcondvar_t *, char *, kcv_type_t, void *); +extern void __cv_destroy(kcondvar_t *); +extern void __cv_wait(kcondvar_t *, kmutex_t *); +extern void __cv_wait_io(kcondvar_t *, kmutex_t *); +extern int __cv_wait_io_sig(kcondvar_t *, kmutex_t *); +extern int __cv_wait_sig(kcondvar_t *, kmutex_t *); +extern int __cv_timedwait(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_io(kcondvar_t *, kmutex_t *, clock_t); +extern int __cv_timedwait_sig(kcondvar_t *, kmutex_t *, clock_t); +extern int cv_timedwait_hires(kcondvar_t *, kmutex_t *, hrtime_t, + hrtime_t res, int flag); +extern int cv_timedwait_sig_hires(kcondvar_t *, kmutex_t *, hrtime_t, + hrtime_t res, int flag); +extern void __cv_signal(kcondvar_t *); +extern void __cv_broadcast(kcondvar_t *c); + +#define cv_init(cvp, name, type, arg) __cv_init(cvp, name, type, arg) +#define cv_destroy(cvp) __cv_destroy(cvp) +#define cv_wait(cvp, mp) __cv_wait(cvp, mp) +#define cv_wait_io(cvp, mp) __cv_wait_io(cvp, mp) +#define cv_wait_io_sig(cvp, mp) __cv_wait_io_sig(cvp, mp) +#define cv_wait_sig(cvp, mp) __cv_wait_sig(cvp, mp) +#define cv_signal(cvp) __cv_signal(cvp) +#define cv_broadcast(cvp) __cv_broadcast(cvp) + +/* + * NB: There is no way to reliably distinguish between having been signalled + * and having timed out on Linux. If the client code needs to reliably + * distinguish between the two it should use the hires variant. + */ +#define cv_timedwait(cvp, mp, t) __cv_timedwait(cvp, mp, t) +#define cv_timedwait_io(cvp, mp, t) __cv_timedwait_io(cvp, mp, t) +#define cv_timedwait_sig(cvp, mp, t) __cv_timedwait_sig(cvp, mp, t) + +#endif /* _SPL_CONDVAR_H */ diff --git a/include/os/linux/spl/sys/console.h b/include/os/linux/spl/sys/console.h new file mode 100644 index 000000000000..33c8b3c6b4d7 --- /dev/null +++ b/include/os/linux/spl/sys/console.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_CONSOLE_H +#define _SPL_CONSOLE_H + +#define console_vprintf vprintk +#define console_printf printk + +#endif /* _SPL_CONSOLE_H */ diff --git a/include/os/linux/spl/sys/cred.h b/include/os/linux/spl/sys/cred.h new file mode 100644 index 000000000000..0b07c4369940 --- /dev/null +++ b/include/os/linux/spl/sys/cred.h @@ -0,0 +1,63 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_CRED_H +#define _SPL_CRED_H + +#include +#include +#include +#include + +typedef struct cred cred_t; + +#define kcred ((cred_t *)(init_task.cred)) +#define CRED() ((cred_t *)current_cred()) + +/* Linux 4.9 API change, GROUP_AT was removed */ +#ifndef GROUP_AT +#define GROUP_AT(gi, i) ((gi)->gid[i]) +#endif + +#define KUID_TO_SUID(x) (__kuid_val(x)) +#define KGID_TO_SGID(x) (__kgid_val(x)) +#define SUID_TO_KUID(x) (KUIDT_INIT(x)) +#define SGID_TO_KGID(x) (KGIDT_INIT(x)) +#define KGIDP_TO_SGIDP(x) (&(x)->val) + +extern void crhold(cred_t *cr); +extern void crfree(cred_t *cr); +extern uid_t crgetuid(const cred_t *cr); +extern uid_t crgetruid(const cred_t *cr); +extern uid_t crgetsuid(const cred_t *cr); +extern uid_t crgetfsuid(const cred_t *cr); +extern gid_t crgetgid(const cred_t *cr); +extern gid_t crgetrgid(const cred_t *cr); +extern gid_t crgetsgid(const cred_t *cr); +extern gid_t crgetfsgid(const cred_t *cr); +extern int crgetngroups(const cred_t *cr); +extern gid_t *crgetgroups(const cred_t *cr); +extern int groupmember(gid_t gid, const cred_t *cr); + +#endif /* _SPL_CRED_H */ diff --git a/include/os/linux/spl/sys/ctype.h b/include/os/linux/spl/sys/ctype.h new file mode 100644 index 000000000000..18beb1daa5d9 --- /dev/null +++ b/include/os/linux/spl/sys/ctype.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_CTYPE_H +#define _SPL_CTYPE_H + +#include + +#endif /* SPL_CTYPE_H */ diff --git a/include/os/linux/spl/sys/debug.h b/include/os/linux/spl/sys/debug.h new file mode 100644 index 000000000000..e2dbd6804056 --- /dev/null +++ b/include/os/linux/spl/sys/debug.h @@ -0,0 +1,169 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +/* + * Available Solaris debug functions. All of the ASSERT() macros will be + * compiled out when NDEBUG is defined, this is the default behavior for + * the SPL. To enable assertions use the --enable-debug with configure. + * The VERIFY() functions are never compiled out and cannot be disabled. + * + * PANIC() - Panic the node and print message. + * ASSERT() - Assert X is true, if not panic. + * ASSERT3B() - Assert boolean X OP Y is true, if not panic. + * ASSERT3S() - Assert signed X OP Y is true, if not panic. + * ASSERT3U() - Assert unsigned X OP Y is true, if not panic. + * ASSERT3P() - Assert pointer X OP Y is true, if not panic. + * ASSERT0() - Assert value is zero, if not panic. + * VERIFY() - Verify X is true, if not panic. + * VERIFY3B() - Verify boolean X OP Y is true, if not panic. + * VERIFY3S() - Verify signed X OP Y is true, if not panic. + * VERIFY3U() - Verify unsigned X OP Y is true, if not panic. + * VERIFY3P() - Verify pointer X OP Y is true, if not panic. + * VERIFY0() - Verify value is zero, if not panic. + */ + +#ifndef _SPL_DEBUG_H +#define _SPL_DEBUG_H + +/* + * Common DEBUG functionality. + */ +#define __printflike(a, b) __printf(a, b) + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + +int spl_panic(const char *file, const char *func, int line, + const char *fmt, ...); +void spl_dumpstack(void); + +/* BEGIN CSTYLED */ +#define PANIC(fmt, a...) 
\ + spl_panic(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + +#define VERIFY(cond) \ + (void) (unlikely(!(cond)) && \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "%s", "VERIFY(" #cond ") failed\n")) + +#define VERIFY3B(LEFT, OP, RIGHT) do { \ + boolean_t _verify3_left = (boolean_t)(LEFT); \ + boolean_t _verify3_right = (boolean_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%d " #OP " %d)\n", \ + (boolean_t) (_verify3_left), \ + (boolean_t) (_verify3_right)); \ + } while (0) + +#define VERIFY3S(LEFT, OP, RIGHT) do { \ + int64_t _verify3_left = (int64_t)(LEFT); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%lld " #OP " %lld)\n", \ + (long long) (_verify3_left), \ + (long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3U(LEFT, OP, RIGHT) do { \ + uint64_t _verify3_left = (uint64_t)(LEFT); \ + uint64_t _verify3_right = (uint64_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%llu " #OP " %llu)\n", \ + (unsigned long long) (_verify3_left), \ + (unsigned long long) (_verify3_right)); \ + } while (0) + +#define VERIFY3P(LEFT, OP, RIGHT) do { \ + uintptr_t _verify3_left = (uintptr_t)(LEFT); \ + uintptr_t _verify3_right = (uintptr_t)(RIGHT); \ + if (!(_verify3_left OP _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(" #LEFT " " #OP " " #RIGHT ") " \ + "failed (%px " #OP " %px)\n", \ + (void *) (_verify3_left), \ + (void *) (_verify3_right)); \ + } while (0) + +#define VERIFY0(RIGHT) do { \ + int64_t _verify3_left = (int64_t)(0); \ + int64_t _verify3_right = (int64_t)(RIGHT); \ + if (!(_verify3_left == _verify3_right)) \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "VERIFY3(0 == " #RIGHT ") " \ + "failed (0 == %lld)\n", \ + (long long) (_verify3_right)); \ + } while (0) + +#define CTASSERT_GLOBAL(x) _CTASSERT(x, __LINE__) +#define CTASSERT(x) { _CTASSERT(x, __LINE__); } +#define _CTASSERT(x, y) __CTASSERT(x, y) +#define __CTASSERT(x, y) \ + typedef char __attribute__ ((unused)) \ + __compile_time_assertion__ ## y[(x) ? 1 : -1] + +/* + * Debugging disabled (--disable-debug) + */ +#ifdef NDEBUG + +#define ASSERT(x) ((void)0) +#define ASSERT3B(x,y,z) ((void)0) +#define ASSERT3S(x,y,z) ((void)0) +#define ASSERT3U(x,y,z) ((void)0) +#define ASSERT3P(x,y,z) ((void)0) +#define ASSERT0(x) ((void)0) +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) + +/* + * Debugging enabled (--enable-debug) + */ +#else + +#define ASSERT3B VERIFY3B +#define ASSERT3S VERIFY3S +#define ASSERT3U VERIFY3U +#define ASSERT3P VERIFY3P +#define ASSERT0 VERIFY0 +#define ASSERT VERIFY +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") implies (" #B ")"))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + spl_panic(__FILE__, __FUNCTION__, __LINE__, \ + "(" #A ") is equivalent to (" #B ")"))) +/* END CSTYLED */ + +#endif /* NDEBUG */ + +#endif /* SPL_DEBUG_H */ diff --git a/include/os/linux/spl/sys/disp.h b/include/os/linux/spl/sys/disp.h new file mode 100644 index 000000000000..413b623c8145 --- /dev/null +++ b/include/os/linux/spl/sys/disp.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
+ * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_DISP_H +#define _SPL_DISP_H + +#include + +#define kpreempt(unused) schedule() +#define kpreempt_disable() preempt_disable() +#define kpreempt_enable() preempt_enable() + +#endif /* SPL_DISP_H */ diff --git a/include/os/linux/spl/sys/dkio.h b/include/os/linux/spl/sys/dkio.h new file mode 100644 index 000000000000..49f166a9c4aa --- /dev/null +++ b/include/os/linux/spl/sys/dkio.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_DKIO_H +#define _SPL_DKIO_H + +#define DFL_SZ(num_exts) \ + (sizeof (dkioc_free_list_t) + (num_exts - 1) * 16) + +#define DKIOC (0x04 << 8) +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +/* + * ioctl to free space (e.g. SCSI UNMAP) off a disk. + * Pass a dkioc_free_list_t containing a list of extents to be freed. + */ +#define DKIOCFREE (DKIOC|50) + +#endif /* _SPL_DKIO_H */ diff --git a/include/os/linux/spl/sys/errno.h b/include/os/linux/spl/sys/errno.h new file mode 100644 index 000000000000..f6d9212a613f --- /dev/null +++ b/include/os/linux/spl/sys/errno.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2000 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_ERRNO_H +#define _SYS_ERRNO_H + +#include + +#define ENOTSUP EOPNOTSUPP + +/* + * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent + * graveyard) to indicate checksum errors and fragmentation. + */ +#define ECKSUM EBADE +#define EFRAGS EBADR + +/* Similar for ENOACTIVE */ +#define ENOTACTIVE ENOANO + +#endif /* _SYS_ERRNO_H */ diff --git a/include/os/linux/spl/sys/fcntl.h b/include/os/linux/spl/sys/fcntl.h new file mode 100644 index 000000000000..3faa5dad78cb --- /dev/null +++ b/include/os/linux/spl/sys/fcntl.h @@ -0,0 +1,37 @@ +/* + * Copyright (C) 2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_FCNTL_H +#define _SPL_FCNTL_H + +#include + +#define F_FREESP 11 + +#ifdef CONFIG_64BIT +typedef struct flock flock64_t; +#else +typedef struct flock64 flock64_t; +#endif /* CONFIG_64BIT */ + +#endif /* _SPL_FCNTL_H */ diff --git a/include/os/linux/spl/sys/file.h b/include/os/linux/spl/sys/file.h new file mode 100644 index 000000000000..05dbc0814296 --- /dev/null +++ b/include/os/linux/spl/sys/file.h @@ -0,0 +1,52 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_FILE_H +#define _SPL_FILE_H + +#define FIGNORECASE 0x00080000 +#define FKIOCTL 0x80000000 +#define ED_CASE_CONFLICT 0x10 + +#ifdef HAVE_INODE_LOCK_SHARED +#define spl_inode_lock(ip) inode_lock(ip) +#define spl_inode_unlock(ip) inode_unlock(ip) +#define spl_inode_lock_shared(ip) inode_lock_shared(ip) +#define spl_inode_unlock_shared(ip) inode_unlock_shared(ip) +#define spl_inode_trylock(ip) inode_trylock(ip) +#define spl_inode_trylock_shared(ip) inode_trylock_shared(ip) +#define spl_inode_is_locked(ip) inode_is_locked(ip) +#define spl_inode_lock_nested(ip, s) inode_lock_nested(ip, s) +#else +#define spl_inode_lock(ip) mutex_lock(&(ip)->i_mutex) +#define spl_inode_unlock(ip) mutex_unlock(&(ip)->i_mutex) +#define spl_inode_lock_shared(ip) mutex_lock(&(ip)->i_mutex) +#define spl_inode_unlock_shared(ip) mutex_unlock(&(ip)->i_mutex) +#define spl_inode_trylock(ip) mutex_trylock(&(ip)->i_mutex) +#define spl_inode_trylock_shared(ip) mutex_trylock(&(ip)->i_mutex) +#define spl_inode_is_locked(ip) mutex_is_locked(&(ip)->i_mutex) +#define spl_inode_lock_nested(ip, s) mutex_lock_nested(&(ip)->i_mutex, s) +#endif + +#endif /* SPL_FILE_H */ diff --git a/include/os/linux/spl/sys/inttypes.h b/include/os/linux/spl/sys/inttypes.h new file mode 100644 index 000000000000..92e76206ba52 --- /dev/null +++ b/include/os/linux/spl/sys/inttypes.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_INTTYPES_H +#define _SPL_INTTYPES_H + +#endif /* SPL_INTTYPES_H */ diff --git a/include/os/linux/spl/sys/isa_defs.h b/include/os/linux/spl/sys/isa_defs.h new file mode 100644 index 000000000000..af064e567e13 --- /dev/null +++ b/include/os/linux/spl/sys/isa_defs.h @@ -0,0 +1,254 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_ISA_DEFS_H +#define _SPL_ISA_DEFS_H + +/* x86_64 arch specific defines */ +#if defined(__x86_64) || defined(__x86_64__) + +#if !defined(__x86_64) +#define __x86_64 +#endif + +#if !defined(__amd64) +#define __amd64 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if defined(_ILP32) +/* x32-specific defines; careful to *not* define _LP64 here */ +#else +#if !defined(_LP64) +#define _LP64 +#endif +#endif + +#define _ALIGNMENT_REQUIRED 1 + + +/* i386 arch specific defines */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if !defined(_ILP32) +#define _ILP32 +#endif + +#define _ALIGNMENT_REQUIRED 0 + +/* powerpc (ppc64) arch specific defines */ +#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) + +#if !defined(__powerpc) +#define __powerpc +#endif + +#if !defined(__powerpc__) +#define __powerpc__ +#endif + +#if defined(__powerpc64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for PPC, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* arm arch specific defines */ +#elif defined(__arm) || defined(__arm__) || defined(__aarch64__) + +#if !defined(__arm) +#define __arm +#endif + +#if !defined(__arm__) +#define __arm__ +#endif + +#if defined(__aarch64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#if defined(__ARMEL__) || defined(__AARCH64EL__) +#define _ZFS_LITTLE_ENDIAN +#else +#define _ZFS_BIG_ENDIAN +#endif + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for ARM, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* sparc arch specific defines */ +#elif defined(__sparc) || defined(__sparc__) + +#if !defined(__sparc) +#define __sparc +#endif + +#if !defined(__sparc__) +#define __sparc__ +#endif + +#if defined(__arch64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#define _ZFS_BIG_ENDIAN +#define _SUNOS_VTOC_16 +#define _ALIGNMENT_REQUIRED 1 + +/* s390 arch specific defines */ +#elif defined(__s390__) +#if defined(__s390x__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#define _ZFS_BIG_ENDIAN + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for s390, so default to 1 + * out of paranoia. + */ +#define _ALIGNMENT_REQUIRED 1 + +/* MIPS arch specific defines */ +#elif defined(__mips__) + +#if defined(__MIPSEB__) +#define _ZFS_BIG_ENDIAN +#elif defined(__MIPSEL__) +#define _ZFS_LITTLE_ENDIAN +#else +#error MIPS no endian specified +#endif + +#ifndef _LP64 +#define _ILP32 +#endif + +#define _SUNOS_VTOC_16 + +/* + * Illumos doesn't define _ALIGNMENT_REQUIRED for MIPS, so default to 1 + * out of paranoia. 
+ */ +#define _ALIGNMENT_REQUIRED 1 + +/* + * RISC-V arch specific defines + * only RV64G (including atomic) LP64 is supported yet + */ +#elif defined(__riscv) && defined(_LP64) && _LP64 && \ + defined(__riscv_atomic) && __riscv_atomic + +#ifndef __riscv__ +#define __riscv__ +#endif + +#ifndef __rv64g__ +#define __rv64g__ +#endif + +#define _ZFS_LITTLE_ENDIAN + +#define _SUNOS_VTOC_16 + +#define _ALIGNMENT_REQUIRED 1 + +#else +/* + * Currently supported: + * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, and RV64G + */ +#error "Unsupported ISA type" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#if !defined(_ILP32) && !defined(_LP64) +#error "Neither _ILP32 or _LP64 are defined" +#endif + +#include + +/* + * CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS will be defined by the Linux + * kernel for architectures which support efficient unaligned access. + */ +#if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) +#define HAVE_EFFICIENT_UNALIGNED_ACCESS +#endif + +#if defined(_ZFS_LITTLE_ENDIAN) && defined(_ZFS_BIG_ENDIAN) +#error "Both _ZFS_LITTLE_ENDIAN and _ZFS_BIG_ENDIAN are defined" +#endif + +#if !defined(_ZFS_LITTLE_ENDIAN) && !defined(_ZFS_BIG_ENDIAN) +#error "Neither _ZFS_LITTLE_ENDIAN or _ZFS_BIG_ENDIAN are defined" +#endif + +#endif /* _SPL_ISA_DEFS_H */ diff --git a/include/os/linux/spl/sys/kmem.h b/include/os/linux/spl/sys/kmem.h new file mode 100644 index 000000000000..c09c40fa34b9 --- /dev/null +++ b/include/os/linux/spl/sys/kmem.h @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_KMEM_H +#define _SPL_KMEM_H + +#include +#include +#include +#include +#include + +extern int kmem_debugging(void); +extern char *kmem_vasprintf(const char *fmt, va_list ap); +extern char *kmem_asprintf(const char *fmt, ...); +extern char *kmem_strdup(const char *str); +extern void kmem_strfree(char *str); + +/* + * Memory allocation interfaces + */ +#define KM_SLEEP 0x0000 /* can block for memory; success guaranteed */ +#define KM_NOSLEEP 0x0001 /* cannot block for memory; may fail */ +#define KM_PUSHPAGE 0x0004 /* can block for memory; may use reserve */ +#define KM_ZERO 0x1000 /* zero the allocation */ +#define KM_VMEM 0x2000 /* caller is vmem_* wrapper */ + +#define KM_PUBLIC_MASK (KM_SLEEP | KM_NOSLEEP | KM_PUSHPAGE) + +static int spl_fstrans_check(void); +void *spl_kvmalloc(size_t size, gfp_t flags); + +/* + * Convert a KM_* flags mask to its Linux GFP_* counterpart. 
The conversion + * function is context aware which means that KM_SLEEP allocations can be + * safely used in syncing contexts which have set PF_FSTRANS. + */ +static inline gfp_t +kmem_flags_convert(int flags) +{ + gfp_t lflags = __GFP_NOWARN | __GFP_COMP; + + if (flags & KM_NOSLEEP) { + lflags |= GFP_ATOMIC | __GFP_NORETRY; + } else { + lflags |= GFP_KERNEL; + if (spl_fstrans_check()) + lflags &= ~(__GFP_IO|__GFP_FS); + } + + if (flags & KM_PUSHPAGE) + lflags |= __GFP_HIGH; + + if (flags & KM_ZERO) + lflags |= __GFP_ZERO; + + return (lflags); +} + +typedef struct { + struct task_struct *fstrans_thread; + unsigned int saved_flags; +} fstrans_cookie_t; + +/* + * Introduced in Linux 3.9, however this cannot be solely relied on before + * Linux 3.18 as it doesn't turn off __GFP_FS as it should. + */ +#ifdef PF_MEMALLOC_NOIO +#define __SPL_PF_MEMALLOC_NOIO (PF_MEMALLOC_NOIO) +#else +#define __SPL_PF_MEMALLOC_NOIO (0) +#endif + +/* + * PF_FSTRANS is removed from Linux 4.12 + */ +#ifdef PF_FSTRANS +#define __SPL_PF_FSTRANS (PF_FSTRANS) +#else +#define __SPL_PF_FSTRANS (0) +#endif + +#define SPL_FSTRANS (__SPL_PF_FSTRANS|__SPL_PF_MEMALLOC_NOIO) + +static inline fstrans_cookie_t +spl_fstrans_mark(void) +{ + fstrans_cookie_t cookie; + + BUILD_BUG_ON(SPL_FSTRANS == 0); + + cookie.fstrans_thread = current; + cookie.saved_flags = current->flags & SPL_FSTRANS; + current->flags |= SPL_FSTRANS; + + return (cookie); +} + +static inline void +spl_fstrans_unmark(fstrans_cookie_t cookie) +{ + ASSERT3P(cookie.fstrans_thread, ==, current); + ASSERT((current->flags & SPL_FSTRANS) == SPL_FSTRANS); + + current->flags &= ~SPL_FSTRANS; + current->flags |= cookie.saved_flags; +} + +static inline int +spl_fstrans_check(void) +{ + return (current->flags & SPL_FSTRANS); +} + +/* + * specifically used to check PF_FSTRANS flag, cannot be relied on for + * checking spl_fstrans_mark(). 
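+ *
+ * For reference, the mark/unmark pair defined above is used as a scoped
+ * guard around syncing-context work (an illustrative sketch; the I/O
+ * call is hypothetical):
+ *
+ *	fstrans_cookie_t cookie;
+ *
+ *	cookie = spl_fstrans_mark();
+ *	error = do_sync_io();
+ *	spl_fstrans_unmark(cookie);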
+ */ +static inline int +__spl_pf_fstrans_check(void) +{ + return (current->flags & __SPL_PF_FSTRANS); +} + +/* + * Kernel compatibility for GFP flags + */ +/* < 4.13 */ +#ifndef __GFP_RETRY_MAYFAIL +#define __GFP_RETRY_MAYFAIL __GFP_REPEAT +#endif +/* < 4.4 */ +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +#ifdef HAVE_ATOMIC64_T +#define kmem_alloc_used_add(size) atomic64_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic64_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic64_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic64_set(&kmem_alloc_used, size) +extern atomic64_t kmem_alloc_used; +extern unsigned long long kmem_alloc_max; +#else /* HAVE_ATOMIC64_T */ +#define kmem_alloc_used_add(size) atomic_add(size, &kmem_alloc_used) +#define kmem_alloc_used_sub(size) atomic_sub(size, &kmem_alloc_used) +#define kmem_alloc_used_read() atomic_read(&kmem_alloc_used) +#define kmem_alloc_used_set(size) atomic_set(&kmem_alloc_used, size) +extern atomic_t kmem_alloc_used; +extern unsigned long long kmem_alloc_max; +#endif /* HAVE_ATOMIC64_T */ + +extern unsigned int spl_kmem_alloc_warn; +extern unsigned int spl_kmem_alloc_max; + +#define kmem_alloc(sz, fl) spl_kmem_alloc((sz), (fl), __func__, __LINE__) +#define kmem_zalloc(sz, fl) spl_kmem_zalloc((sz), (fl), __func__, __LINE__) +#define kmem_free(ptr, sz) spl_kmem_free((ptr), (sz)) +#define kmem_cache_reap_active spl_kmem_cache_reap_active + +extern void *spl_kmem_alloc(size_t sz, int fl, const char *func, int line); +extern void *spl_kmem_zalloc(size_t sz, int fl, const char *func, int line); +extern void spl_kmem_free(const void *ptr, size_t sz); + +/* + * 5.8 API change, pgprot_t argument removed. + */ +#ifdef HAVE_VMALLOC_PAGE_KERNEL +#define spl_vmalloc(size, flags) __vmalloc(size, flags, PAGE_KERNEL) +#else +#define spl_vmalloc(size, flags) __vmalloc(size, flags) +#endif + +/* + * The following functions are only available for internal use. + */ +extern void *spl_kmem_alloc_impl(size_t size, int flags, int node); +extern void *spl_kmem_alloc_debug(size_t size, int flags, int node); +extern void *spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node); +extern void spl_kmem_free_impl(const void *buf, size_t size); +extern void spl_kmem_free_debug(const void *buf, size_t size); +extern void spl_kmem_free_track(const void *buf, size_t size); + +extern int spl_kmem_init(void); +extern void spl_kmem_fini(void); +extern int spl_kmem_cache_reap_active(void); + +#endif /* _SPL_KMEM_H */ diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h new file mode 100644 index 000000000000..ffb8c97c9c91 --- /dev/null +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_KMEM_CACHE_H
+#define _SPL_KMEM_CACHE_H
+
+#include
+
+/*
+ * Slab allocation interfaces. The SPL slab differs from the standard
+ * Linux SLAB or SLUB primarily in that each cache may be backed by slabs
+ * allocated from the physical or virtual memory address space. The virtual
+ * slabs allow for good behavior when allocating large objects of identical
+ * size. This slab implementation also supports both constructors and
+ * destructors, which the Linux slab does not.
+ */
+typedef enum kmc_bit {
+	KMC_BIT_NODEBUG = 1,	/* Default behavior */
+	KMC_BIT_KVMEM = 7,	/* Use kvmalloc linux allocator */
+	KMC_BIT_SLAB = 8,	/* Use Linux slab cache */
+	KMC_BIT_DEADLOCKED = 14,	/* Deadlock detected */
+	KMC_BIT_GROWING = 15,	/* Growing in progress */
+	KMC_BIT_REAPING = 16,	/* Reaping in progress */
+	KMC_BIT_DESTROY = 17,	/* Destroy in progress */
+	KMC_BIT_TOTAL = 18,	/* Proc handler helper bit */
+	KMC_BIT_ALLOC = 19,	/* Proc handler helper bit */
+	KMC_BIT_MAX = 20,	/* Proc handler helper bit */
+} kmc_bit_t;
+
+/* kmem move callback return values */
+typedef enum kmem_cbrc {
+	KMEM_CBRC_YES = 0,	/* Object moved */
+	KMEM_CBRC_NO = 1,	/* Object not moved */
+	KMEM_CBRC_LATER = 2,	/* Object not moved, try again later */
+	KMEM_CBRC_DONT_NEED = 3,	/* Neither object is needed */
+	KMEM_CBRC_DONT_KNOW = 4,	/* Object unknown */
+} kmem_cbrc_t;
+
+#define KMC_NODEBUG (1 << KMC_BIT_NODEBUG)
+#define KMC_KVMEM (1 << KMC_BIT_KVMEM)
+#define KMC_SLAB (1 << KMC_BIT_SLAB)
+#define KMC_DEADLOCKED (1 << KMC_BIT_DEADLOCKED)
+#define KMC_GROWING (1 << KMC_BIT_GROWING)
+#define KMC_REAPING (1 << KMC_BIT_REAPING)
+#define KMC_DESTROY (1 << KMC_BIT_DESTROY)
+#define KMC_TOTAL (1 << KMC_BIT_TOTAL)
+#define KMC_ALLOC (1 << KMC_BIT_ALLOC)
+#define KMC_MAX (1 << KMC_BIT_MAX)
+
+#define KMC_REAP_CHUNK INT_MAX
+#define KMC_DEFAULT_SEEKS 1
+
+#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */
+
+extern struct list_head spl_kmem_cache_list;
+extern struct rw_semaphore spl_kmem_cache_sem;
+
+#define SKM_MAGIC 0x2e2e2e2e
+#define SKO_MAGIC 0x20202020
+#define SKS_MAGIC 0x22222222
+#define SKC_MAGIC 0x2c2c2c2c
+
+#define SPL_KMEM_CACHE_OBJ_PER_SLAB 8 /* Target objects per slab */
+#define SPL_KMEM_CACHE_ALIGN 8 /* Default object alignment */
+#ifdef _LP64
+#define SPL_KMEM_CACHE_MAX_SIZE 32 /* Max slab size in MB */
+#else
+#define SPL_KMEM_CACHE_MAX_SIZE 4 /* Max slab size in MB */
+#endif
+
+#define SPL_MAX_ORDER (MAX_ORDER - 3)
+#define SPL_MAX_ORDER_NR_PAGES (1 << (SPL_MAX_ORDER - 1))
+
+#ifdef CONFIG_SLUB
+#define SPL_MAX_KMEM_CACHE_ORDER PAGE_ALLOC_COSTLY_ORDER
+#define SPL_MAX_KMEM_ORDER_NR_PAGES (1 << (SPL_MAX_KMEM_CACHE_ORDER - 1))
+#else
+#define SPL_MAX_KMEM_ORDER_NR_PAGES (KMALLOC_MAX_SIZE >> PAGE_SHIFT)
+#endif
+
+#define POINTER_IS_VALID(p) 0 /* Unimplemented */
+#define POINTER_INVALIDATE(pp) /* Unimplemented */
+
+typedef int (*spl_kmem_ctor_t)(void *, void *, int);
+typedef void (*spl_kmem_dtor_t)(void *, void *);
+
+typedef struct spl_kmem_magazine {
+	uint32_t skm_magic;	/* Sanity magic */
+	uint32_t skm_avail;	/* Available objects */
+	uint32_t skm_size;	/* Magazine size */
+	uint32_t skm_refill;	/* Batch refill
size */ + struct spl_kmem_cache *skm_cache; /* Owned by cache */ + unsigned int skm_cpu; /* Owned by cpu */ + void *skm_objs[0]; /* Object pointers */ +} spl_kmem_magazine_t; + +typedef struct spl_kmem_obj { + uint32_t sko_magic; /* Sanity magic */ + void *sko_addr; /* Buffer address */ + struct spl_kmem_slab *sko_slab; /* Owned by slab */ + struct list_head sko_list; /* Free object list linkage */ +} spl_kmem_obj_t; + +typedef struct spl_kmem_slab { + uint32_t sks_magic; /* Sanity magic */ + uint32_t sks_objs; /* Objects per slab */ + struct spl_kmem_cache *sks_cache; /* Owned by cache */ + struct list_head sks_list; /* Slab list linkage */ + struct list_head sks_free_list; /* Free object list */ + unsigned long sks_age; /* Last modify jiffie */ + uint32_t sks_ref; /* Ref count used objects */ +} spl_kmem_slab_t; + +typedef struct spl_kmem_alloc { + struct spl_kmem_cache *ska_cache; /* Owned by cache */ + int ska_flags; /* Allocation flags */ + taskq_ent_t ska_tqe; /* Task queue entry */ +} spl_kmem_alloc_t; + +typedef struct spl_kmem_emergency { + struct rb_node ske_node; /* Emergency tree linkage */ + unsigned long ske_obj; /* Buffer address */ +} spl_kmem_emergency_t; + +typedef struct spl_kmem_cache { + uint32_t skc_magic; /* Sanity magic */ + uint32_t skc_name_size; /* Name length */ + char *skc_name; /* Name string */ + spl_kmem_magazine_t **skc_mag; /* Per-CPU warm cache */ + uint32_t skc_mag_size; /* Magazine size */ + uint32_t skc_mag_refill; /* Magazine refill count */ + spl_kmem_ctor_t skc_ctor; /* Constructor */ + spl_kmem_dtor_t skc_dtor; /* Destructor */ + void *skc_private; /* Private data */ + void *skc_vmp; /* Unused */ + struct kmem_cache *skc_linux_cache; /* Linux slab cache if used */ + unsigned long skc_flags; /* Flags */ + uint32_t skc_obj_size; /* Object size */ + uint32_t skc_obj_align; /* Object alignment */ + uint32_t skc_slab_objs; /* Objects per slab */ + uint32_t skc_slab_size; /* Slab size */ + atomic_t skc_ref; /* Ref count callers */ + taskqid_t skc_taskqid; /* Slab reclaim task */ + struct list_head skc_list; /* List of caches linkage */ + struct list_head skc_complete_list; /* Completely alloc'ed */ + struct list_head skc_partial_list; /* Partially alloc'ed */ + struct rb_root skc_emergency_tree; /* Min sized objects */ + spinlock_t skc_lock; /* Cache lock */ + spl_wait_queue_head_t skc_waitq; /* Allocation waiters */ + uint64_t skc_slab_fail; /* Slab alloc failures */ + uint64_t skc_slab_create; /* Slab creates */ + uint64_t skc_slab_destroy; /* Slab destroys */ + uint64_t skc_slab_total; /* Slab total current */ + uint64_t skc_slab_alloc; /* Slab alloc current */ + uint64_t skc_slab_max; /* Slab max historic */ + uint64_t skc_obj_total; /* Obj total current */ + uint64_t skc_obj_alloc; /* Obj alloc current */ + struct percpu_counter skc_linux_alloc; /* Linux-backed Obj alloc */ + uint64_t skc_obj_max; /* Obj max historic */ + uint64_t skc_obj_deadlock; /* Obj emergency deadlocks */ + uint64_t skc_obj_emergency; /* Obj emergency current */ + uint64_t skc_obj_emergency_max; /* Obj emergency max */ +} spl_kmem_cache_t; +#define kmem_cache_t spl_kmem_cache_t + +extern spl_kmem_cache_t *spl_kmem_cache_create(char *name, size_t size, + size_t align, spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, + void *reclaim, void *priv, void *vmp, int flags); +extern void spl_kmem_cache_set_move(spl_kmem_cache_t *, + kmem_cbrc_t (*)(void *, void *, size_t, void *)); +extern void spl_kmem_cache_destroy(spl_kmem_cache_t *skc); +extern void 
*spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags); +extern void spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj); +extern void spl_kmem_cache_set_allocflags(spl_kmem_cache_t *skc, gfp_t flags); +extern void spl_kmem_cache_reap_now(spl_kmem_cache_t *skc); +extern void spl_kmem_reap(void); +extern uint64_t spl_kmem_cache_inuse(kmem_cache_t *cache); +extern uint64_t spl_kmem_cache_entry_size(kmem_cache_t *cache); + +#define kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) \ + spl_kmem_cache_create(name, size, align, ctor, dtor, rclm, priv, vmp, fl) +#define kmem_cache_set_move(skc, move) spl_kmem_cache_set_move(skc, move) +#define kmem_cache_destroy(skc) spl_kmem_cache_destroy(skc) +#define kmem_cache_alloc(skc, flags) spl_kmem_cache_alloc(skc, flags) +#define kmem_cache_free(skc, obj) spl_kmem_cache_free(skc, obj) +#define kmem_cache_reap_now(skc) spl_kmem_cache_reap_now(skc) +#define kmem_reap() spl_kmem_reap() + +/* + * The following functions are only available for internal use. + */ +extern int spl_kmem_cache_init(void); +extern void spl_kmem_cache_fini(void); + +#endif /* _SPL_KMEM_CACHE_H */ diff --git a/include/os/linux/spl/sys/kstat.h b/include/os/linux/spl/sys/kstat.h new file mode 100644 index 000000000000..c93c53171d88 --- /dev/null +++ b/include/os/linux/spl/sys/kstat.h @@ -0,0 +1,223 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
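/*
 * Illustrative sketch (editor's example, not part of the patch): creating
 * and using an SPL object cache through the kmem_cache_* macros above.
 * Unlike the native Linux slab, per-object constructors and destructors
 * are supported.  All my_* names are hypothetical.
 */
#include <sys/kmem.h>
#include <sys/kmem_cache.h>

typedef struct my_obj {
	uint64_t mo_refs;
} my_obj_t;

static kmem_cache_t *my_cache;

static int
my_obj_ctor(void *buf, void *priv, int kmflags)
{
	my_obj_t *obj = buf;

	obj->mo_refs = 0;	/* one-time init when a slab is grown */
	return (0);
}

static void
my_obj_dtor(void *buf, void *priv)
{
	/* teardown before the backing slab is released */
}

static void
my_cache_demo(void)
{
	my_obj_t *obj;

	my_cache = kmem_cache_create("my_obj_cache", sizeof (my_obj_t), 0,
	    my_obj_ctor, my_obj_dtor, NULL, NULL, NULL, 0);

	obj = kmem_cache_alloc(my_cache, KM_SLEEP);
	kmem_cache_free(my_cache, obj);
	kmem_cache_destroy(my_cache);
}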
+ */ + +#ifndef _SPL_KSTAT_H +#define _SPL_KSTAT_H + +#include +#include +#include +#include +#include +#include + +#define KSTAT_STRLEN 255 +#define KSTAT_RAW_MAX (128*1024) + +/* + * For reference valid classes are: + * disk, tape, net, controller, vm, kvm, hat, streams, kstat, misc + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything; ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair; ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt stats; ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O stats; ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer; ks_ndata >= 1 */ +#define KSTAT_NUM_TYPES 5 + +#define KSTAT_DATA_CHAR 0 +#define KSTAT_DATA_INT32 1 +#define KSTAT_DATA_UINT32 2 +#define KSTAT_DATA_INT64 3 +#define KSTAT_DATA_UINT64 4 +#define KSTAT_DATA_LONG 5 +#define KSTAT_DATA_ULONG 6 +#define KSTAT_DATA_STRING 7 +#define KSTAT_NUM_DATAS 8 + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 +#define KSTAT_NUM_INTRS 5 + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 +#define KSTAT_FLAG_NO_HEADERS 0x80 + +#define KS_MAGIC 0x9d9d9d9d + +/* Dynamic updates */ +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +struct kstat_s; +typedef struct kstat_s kstat_t; + +typedef int kid_t; /* unique kstat id */ +typedef int kstat_update_t(struct kstat_s *, int); /* dynamic update cb */ + +typedef struct kstat_module { + char ksm_name[KSTAT_STRLEN+1]; /* module name */ + struct list_head ksm_module_list; /* module linkage */ + struct list_head ksm_kstat_list; /* list of kstat entries */ + struct proc_dir_entry *ksm_proc; /* proc entry */ +} kstat_module_t; + +typedef struct kstat_raw_ops { + int (*headers)(char *buf, size_t size); + int (*data)(char *buf, size_t size, void *data); + void *(*addr)(kstat_t *ksp, loff_t index); +} kstat_raw_ops_t; + +typedef struct kstat_proc_entry { + char kpe_name[KSTAT_STRLEN+1]; /* kstat name */ + char kpe_module[KSTAT_STRLEN+1]; /* provider module name */ + kstat_module_t *kpe_owner; /* kstat module linkage */ + struct list_head kpe_list; /* kstat linkage */ + struct proc_dir_entry *kpe_proc; /* procfs entry */ +} kstat_proc_entry_t; + +struct kstat_s { + int ks_magic; /* magic value */ + kid_t ks_kid; /* unique kstat ID */ + hrtime_t ks_crtime; /* creation time */ + hrtime_t ks_snaptime; /* last access time */ + int ks_instance; /* provider module instance */ + char ks_class[KSTAT_STRLEN+1]; /* kstat class */ + uchar_t ks_type; /* kstat data type */ + uchar_t ks_flags; /* kstat flags */ + void *ks_data; /* kstat type-specific data */ + uint_t ks_ndata; /* # of data records */ + size_t ks_data_size; /* size of kstat data section */ + kstat_update_t *ks_update; /* dynamic updates */ + void *ks_private; /* private data */ + kmutex_t ks_private_lock; /* kstat private data lock */ + kmutex_t *ks_lock; /* kstat data lock */ + kstat_raw_ops_t ks_raw_ops; /* ops table for raw type */ + char *ks_raw_buf; /* buf used for raw ops */ + size_t ks_raw_bufsize; /* size of raw ops buffer */ + kstat_proc_entry_t ks_proc; /* data for procfs entry */ +}; + +typedef struct kstat_named_s { + char name[KSTAT_STRLEN]; /* name of counter */ + uchar_t data_type; /* data type */ + union { + char c[16]; /* 128-bit int */ + int32_t i32; /* 32-bit signed int */ + uint32_t ui32; /* 
32-bit unsigned int */ + int64_t i64; /* 64-bit signed int */ + uint64_t ui64; /* 64-bit unsigned int */ + long l; /* native signed long */ + ulong_t ul; /* native unsigned long */ + struct { + union { + char *ptr; /* NULL-term string */ + char __pad[8]; /* 64-bit padding */ + } addr; + uint32_t len; /* # bytes for strlen + '\0' */ + } string; + } value; +} kstat_named_t; + +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.string.addr.ptr) +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.string.len) + +#ifdef HAVE_PROC_OPS_STRUCT +typedef struct proc_ops kstat_proc_op_t; +#else +typedef struct file_operations kstat_proc_op_t; +#endif + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; +} kstat_intr_t; + +typedef struct kstat_io { + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait len*time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ +} kstat_io_t; + +typedef struct kstat_timer { + char name[KSTAT_STRLEN+1]; /* event name */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +int spl_kstat_init(void); +void spl_kstat_fini(void); + +extern void __kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void* (*addr)(kstat_t *ksp, loff_t index)); + +extern kstat_t *__kstat_create(const char *ks_module, int ks_instance, + const char *ks_name, const char *ks_class, uchar_t ks_type, + uint_t ks_ndata, uchar_t ks_flags); + +extern void kstat_proc_entry_init(kstat_proc_entry_t *kpep, + const char *module, const char *name); +extern void kstat_proc_entry_delete(kstat_proc_entry_t *kpep); +extern void kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, + const kstat_proc_op_t *file_ops, void *data); + +extern void __kstat_install(kstat_t *ksp); +extern void __kstat_delete(kstat_t *ksp); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); + +#define kstat_set_raw_ops(k, h, d, a) \ + __kstat_set_raw_ops(k, h, d, a) +#define kstat_create(m, i, n, c, t, s, f) \ + __kstat_create(m, i, n, c, t, s, f) + +#define kstat_install(k) __kstat_install(k) +#define kstat_delete(k) __kstat_delete(k) + +#endif /* _SPL_KSTAT_H */ diff --git a/include/os/linux/spl/sys/list.h b/include/os/linux/spl/sys/list.h new file mode 100644 index 000000000000..be38f328fc65 --- /dev/null +++ b/include/os/linux/spl/sys/list.h @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). 
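/*
 * Illustrative sketch (editor's example, not part of the patch): the
 * common pattern for publishing a KSTAT_TYPE_NAMED kstat with the
 * interfaces above; it surfaces under /proc/spl/kstat/<module>/<name>.
 * All my_* names are hypothetical.
 */
typedef struct my_stats {
	kstat_named_t ms_hits;
	kstat_named_t ms_misses;
} my_stats_t;

static my_stats_t my_stats = {
	{ "hits",	KSTAT_DATA_UINT64 },
	{ "misses",	KSTAT_DATA_UINT64 },
};

static kstat_t *my_ksp;

static void
my_kstat_init(void)
{
	my_ksp = kstat_create("my_module", 0, "my_stats", "misc",
	    KSTAT_TYPE_NAMED, sizeof (my_stats) / sizeof (kstat_named_t),
	    KSTAT_FLAG_VIRTUAL);
	if (my_ksp != NULL) {
		/* KSTAT_FLAG_VIRTUAL: the provider owns the data buffer. */
		my_ksp->ks_data = &my_stats;
		kstat_install(my_ksp);
	}
	my_stats.ms_hits.value.ui64++;	/* counters update in place */
}

static void
my_kstat_fini(void)
{
	if (my_ksp != NULL)
		kstat_delete(my_ksp);
}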
+ * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_LIST_H +#define _SPL_LIST_H + +#include +#include +#include + +/* + * NOTE: I have implemented the Solaris list API in terms of the native + * linux API. This has certain advantages in terms of leveraging the linux + * list debugging infrastructure, but it also means that the internals of a + * list differ slightly from those on Solaris. This is not a problem as long + * as all callers stick to the published API. The two major differences are: + * + * 1) A list_node_t is mapped to a linux list_head struct, which changes + * the name of the list_next/list_prev pointers to next/prev respectively. + * + * 2) A list_node_t which is not attached to a list on Solaris is denoted + * by having its list_next/list_prev pointers set to NULL. Under linux + * the next/prev pointers are set to LIST_POISON1 and LIST_POISON2 + * respectively. At the moment this only impacts the implementation + * of the list_link_init() and list_link_active() functions. + */ + +typedef struct list_head list_node_t; + +typedef struct list { + size_t list_size; + size_t list_offset; + list_node_t list_head; +} list_t; + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) + +static inline int +list_is_empty(list_t *list) +{ + return (list_empty(&list->list_head)); +} + +static inline void +list_link_init(list_node_t *node) +{ + node->next = LIST_POISON1; + node->prev = LIST_POISON2; +} + +static inline void +list_create(list_t *list, size_t size, size_t offset) +{ + list->list_size = size; + list->list_offset = offset; + INIT_LIST_HEAD(&list->list_head); +} + +static inline void +list_destroy(list_t *list) +{ + list_del(&list->list_head); +} + +static inline void +list_insert_head(list_t *list, void *object) +{ + list_add(list_d2l(list, object), &list->list_head); +} + +static inline void +list_insert_tail(list_t *list, void *object) +{ + list_add_tail(list_d2l(list, object), &list->list_head); +} + +static inline void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) + list_insert_head(list, nobject); + else + list_add(list_d2l(list, nobject), list_d2l(list, object)); +} + +static inline void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) + list_insert_tail(list, nobject); + else + list_add_tail(list_d2l(list, nobject), list_d2l(list, object)); +} + +static inline void +list_remove(list_t *list, void *object) +{ + list_del(list_d2l(list, object)); +} + +static inline void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.next; + if (head == &list->list_head) + return (NULL); + + list_del(head); + return (list_object(list, head)); +} + +static inline void *
+list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.prev; + if (tail == &list->list_head) + return (NULL); + + list_del(tail); + return (list_object(list, tail)); +} + +static inline void * +list_head(list_t *list) +{ + if (list_is_empty(list)) + return (NULL); + + return (list_object(list, list->list_head.next)); +} + +static inline void * +list_tail(list_t *list) +{ + if (list_is_empty(list)) + return (NULL); + + return (list_object(list, list->list_head.prev)); +} + +static inline void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->next != &list->list_head) + return (list_object(list, node->next)); + + return (NULL); +} + +static inline void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->prev != &list->list_head) + return (list_object(list, node->prev)); + + return (NULL); +} + +static inline int +list_link_active(list_node_t *node) +{ + EQUIV(node->next == LIST_POISON1, node->prev == LIST_POISON2); + return (node->next != LIST_POISON1); +} + +static inline void +spl_list_move_tail(list_t *dst, list_t *src) +{ + list_splice_init(&src->list_head, dst->list_head.prev); +} + +#define list_move_tail(dst, src) spl_list_move_tail(dst, src) + +static inline void +list_link_replace(list_node_t *old_node, list_node_t *new_node) +{ + new_node->next = old_node->next; + new_node->prev = old_node->prev; + old_node->prev->next = new_node; + old_node->next->prev = new_node; + list_link_init(old_node); +} + +#endif /* SPL_LIST_H */ diff --git a/include/os/linux/spl/sys/mod_os.h b/include/os/linux/spl/sys/mod_os.h new file mode 100644 index 000000000000..8adf6212907f --- /dev/null +++ b/include/os/linux/spl/sys/mod_os.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ +#ifndef _SPL_MOD_H +#define _SPL_MOD_H +#include + +#endif /* SPL_MOD_H */ diff --git a/include/os/linux/spl/sys/mutex.h b/include/os/linux/spl/sys/mutex.h new file mode 100644 index 000000000000..93f3af8fe016 --- /dev/null +++ b/include/os/linux/spl/sys/mutex.h @@ -0,0 +1,185 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . 
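/*
 * Illustrative sketch (editor's example, not part of the patch): typical
 * use of the Solaris-style list above.  The offset of the embedded
 * list_node_t is given at list_create() time, so the iterators return
 * object pointers directly.  All my_* names are hypothetical.
 */
typedef struct my_item {
	int mi_value;
	list_node_t mi_node;		/* embedded linkage */
} my_item_t;

static void
my_list_demo(void)
{
	list_t l;
	my_item_t a, b, *it;

	list_create(&l, sizeof (my_item_t), offsetof(my_item_t, mi_node));
	list_link_init(&a.mi_node);
	list_link_init(&b.mi_node);

	list_insert_tail(&l, &a);
	list_insert_tail(&l, &b);

	/* Walk head to tail; list_next() returns NULL past the end. */
	for (it = list_head(&l); it != NULL; it = list_next(&l, it))
		it->mi_value = 0;

	while (list_remove_head(&l) != NULL)
		continue;
	list_destroy(&l);
}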
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_MUTEX_H +#define _SPL_MUTEX_H + +#include +#include +#include +#include +#include + +typedef enum { + MUTEX_DEFAULT = 0, + MUTEX_SPIN = 1, + MUTEX_ADAPTIVE = 2, + MUTEX_NOLOCKDEP = 3 +} kmutex_type_t; + +typedef struct { + struct mutex m_mutex; + spinlock_t m_lock; /* used for serializing mutex_exit */ + kthread_t *m_owner; +#ifdef CONFIG_LOCKDEP + kmutex_type_t m_type; +#endif /* CONFIG_LOCKDEP */ +} kmutex_t; + +#define MUTEX(mp) (&((mp)->m_mutex)) + +static inline void +spl_mutex_set_owner(kmutex_t *mp) +{ + mp->m_owner = current; +} + +static inline void +spl_mutex_clear_owner(kmutex_t *mp) +{ + mp->m_owner = NULL; +} + +#define mutex_owner(mp) (READ_ONCE((mp)->m_owner)) +#define mutex_owned(mp) (mutex_owner(mp) == current) +#define MUTEX_HELD(mp) mutex_owned(mp) +#define MUTEX_NOT_HELD(mp) (!MUTEX_HELD(mp)) + +#ifdef CONFIG_LOCKDEP +static inline void +spl_mutex_set_type(kmutex_t *mp, kmutex_type_t type) +{ + mp->m_type = type; +} +static inline void +spl_mutex_lockdep_off_maybe(kmutex_t *mp) +{ + if (mp && mp->m_type == MUTEX_NOLOCKDEP) + lockdep_off(); +} +static inline void +spl_mutex_lockdep_on_maybe(kmutex_t *mp) +{ + if (mp && mp->m_type == MUTEX_NOLOCKDEP) + lockdep_on(); +} +#else /* CONFIG_LOCKDEP */ +#define spl_mutex_set_type(mp, type) +#define spl_mutex_lockdep_off_maybe(mp) +#define spl_mutex_lockdep_on_maybe(mp) +#endif /* CONFIG_LOCKDEP */ + +/* + * The following functions must be #defines and not static inline + * functions. This ensures that the native linux mutex functions + * (lock/unlock) will be correctly located in the user's code, which + * is important for the built-in kernel lock analysis tools. + */ +#undef mutex_init +#define mutex_init(mp, name, type, ibc) \ +{ \ + static struct lock_class_key __key; \ + ASSERT(type == MUTEX_DEFAULT || type == MUTEX_NOLOCKDEP); \ + \ + __mutex_init(MUTEX(mp), (name) ?
(#name) : (#mp), &__key); \ + spin_lock_init(&(mp)->m_lock); \ + spl_mutex_clear_owner(mp); \ + spl_mutex_set_type(mp, type); \ +} + +#undef mutex_destroy +#define mutex_destroy(mp) \ +{ \ + VERIFY3P(mutex_owner(mp), ==, NULL); \ +} + +/* BEGIN CSTYLED */ +#define mutex_tryenter(mp) \ +({ \ + int _rc_; \ + \ + spl_mutex_lockdep_off_maybe(mp); \ + if ((_rc_ = mutex_trylock(MUTEX(mp))) == 1) \ + spl_mutex_set_owner(mp); \ + spl_mutex_lockdep_on_maybe(mp); \ + \ + _rc_; \ +}) +/* END CSTYLED */ + +#define NESTED_SINGLE 1 + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +#define mutex_enter_nested(mp, subclass) \ +{ \ + ASSERT3P(mutex_owner(mp), !=, current); \ + spl_mutex_lockdep_off_maybe(mp); \ + mutex_lock_nested(MUTEX(mp), (subclass)); \ + spl_mutex_lockdep_on_maybe(mp); \ + spl_mutex_set_owner(mp); \ +} +#else /* CONFIG_DEBUG_LOCK_ALLOC */ +#define mutex_enter_nested(mp, subclass) \ +{ \ + ASSERT3P(mutex_owner(mp), !=, current); \ + spl_mutex_lockdep_off_maybe(mp); \ + mutex_lock(MUTEX(mp)); \ + spl_mutex_lockdep_on_maybe(mp); \ + spl_mutex_set_owner(mp); \ +} +#endif /* CONFIG_DEBUG_LOCK_ALLOC */ + +#define mutex_enter(mp) mutex_enter_nested((mp), 0) + +/* + * The reason for the spinlock: + * + * The Linux mutex is designed with a fast-path/slow-path design such that it + * does not guarantee serialization upon itself, allowing a race where latter + * acquirers finish mutex_unlock before former ones. + * + * The race renders it unsafe to be used for serializing the freeing of an + * object in which the mutex is embedded, where the latter acquirer could go + * on to free the object while the former one is still doing mutex_unlock and + * causing memory corruption. + * + * However, there are many places in ZFS where the mutex is used for + * serializing object freeing, and the code is shared among other OSes without + * this issue. Thus, we need the spinlock to force the serialization on + * mutex_exit(). + * + * See http://lwn.net/Articles/575477/ for the information about the race. + */ +#define mutex_exit(mp) \ +{ \ + spl_mutex_clear_owner(mp); \ + spin_lock(&(mp)->m_lock); \ + spl_mutex_lockdep_off_maybe(mp); \ + mutex_unlock(MUTEX(mp)); \ + spl_mutex_lockdep_on_maybe(mp); \ + spin_unlock(&(mp)->m_lock); \ + /* NOTE: do not dereference mp after this point */ \ +} + +#endif /* _SPL_MUTEX_H */ diff --git a/include/os/linux/spl/sys/param.h b/include/os/linux/spl/sys/param.h new file mode 100644 index 000000000000..4ef929151ae4 --- /dev/null +++ b/include/os/linux/spl/sys/param.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
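/*
 * Illustrative sketch (editor's example, not part of the patch): the
 * kmutex_t life cycle using the macros above.  mutex_exit() is safe
 * against the owner-free race thanks to the embedded spinlock described
 * in the comment.  All my_* names are hypothetical.
 */
typedef struct my_counter {
	kmutex_t mc_lock;
	uint64_t mc_count;
} my_counter_t;

static void
my_counter_init(my_counter_t *mc)
{
	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
	mc->mc_count = 0;
}

static void
my_counter_bump(my_counter_t *mc)
{
	mutex_enter(&mc->mc_lock);
	ASSERT(MUTEX_HELD(&mc->mc_lock));
	mc->mc_count++;
	mutex_exit(&mc->mc_lock);
}

static void
my_counter_fini(my_counter_t *mc)
{
	/* Must be unowned at this point; mutex_destroy() verifies it. */
	mutex_destroy(&mc->mc_lock);
}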
+ */ + +#ifndef _SPL_PARAM_H +#define _SPL_PARAM_H + +#include + +/* Pages to bytes and back */ +#define ptob(pages) ((pages) << PAGE_SHIFT) +#define btop(bytes) ((bytes) >> PAGE_SHIFT) + +#define MAXUID UINT32_MAX + +#endif /* SPL_PARAM_H */ diff --git a/include/os/linux/spl/sys/proc.h b/include/os/linux/spl/sys/proc.h new file mode 100644 index 000000000000..fefce515eb24 --- /dev/null +++ b/include/os/linux/spl/sys/proc.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_PROC_H +#define _SPL_PROC_H + +#include +#include + +extern struct proc_dir_entry *proc_spl_kstat; + +int spl_proc_init(void); +void spl_proc_fini(void); + +static inline boolean_t +zfs_proc_is_caller(struct task_struct *t) +{ + return (t->group_leader == current->group_leader); +} + +#endif /* SPL_PROC_H */ diff --git a/include/os/linux/spl/sys/processor.h b/include/os/linux/spl/sys/processor.h new file mode 100644 index 000000000000..a70101fa2f90 --- /dev/null +++ b/include/os/linux/spl/sys/processor.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_PROCESSOR_H +#define _SPL_PROCESSOR_H + +#define getcpuid() smp_processor_id() + +typedef int processorid_t; + +#endif /* _SPL_PROCESSOR_H */ diff --git a/include/os/linux/spl/sys/procfs_list.h b/include/os/linux/spl/sys/procfs_list.h new file mode 100644 index 000000000000..eb1519c0ad63 --- /dev/null +++ b/include/os/linux/spl/sys/procfs_list.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SPL_PROCFS_LIST_H +#define _SPL_PROCFS_LIST_H + +#include +#include +#include +#include + +typedef struct procfs_list procfs_list_t; +struct procfs_list { + /* Accessed only by user of a procfs_list */ + void *pl_private; + + /* + * Accessed both by user of a procfs_list and by procfs_list + * implementation + */ + kmutex_t pl_lock; + list_t pl_list; + + /* Accessed only by procfs_list implementation */ + uint64_t pl_next_id; + int (*pl_show)(struct seq_file *f, void *p); + int (*pl_show_header)(struct seq_file *f); + int (*pl_clear)(procfs_list_t *procfs_list); + size_t pl_node_offset; + kstat_proc_entry_t pl_kstat_entry; +}; + +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); + +void procfs_list_add(procfs_list_t *procfs_list, void *p); + +#endif /* _SPL_PROCFS_LIST_H */ diff --git a/include/os/linux/spl/sys/random.h b/include/os/linux/spl/sys/random.h new file mode 100644 index 000000000000..93e244f566be --- /dev/null +++ b/include/os/linux/spl/sys/random.h @@ -0,0 +1,40 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
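/*
 * Illustrative sketch (editor's example, not part of the patch): exposing
 * a kernel-maintained list through procfs_list.  Entries embed a
 * procfs_list_node_t, and the show callback renders one entry per line.
 * Holding pl_lock around procfs_list_add() follows the usage seen in the
 * OpenZFS callers; all my_* names are hypothetical.
 */
typedef struct my_rec {
	procfs_list_node_t mr_node;
	uint64_t mr_value;
} my_rec_t;

static procfs_list_t my_pl;

static int
my_show(struct seq_file *f, void *p)
{
	my_rec_t *rec = p;

	seq_printf(f, "%llu\n", (u_longlong_t)rec->mr_value);
	return (0);
}

static int
my_show_header(struct seq_file *f)
{
	seq_puts(f, "value\n");
	return (0);
}

static int
my_clear(procfs_list_t *pl)
{
	/* a real consumer would free and unlink its entries here */
	return (0);
}

static void
my_pl_init(my_rec_t *rec)
{
	procfs_list_install("my_module", "my_list", 0644, &my_pl,
	    my_show, my_show_header, my_clear, offsetof(my_rec_t, mr_node));

	mutex_enter(&my_pl.pl_lock);
	procfs_list_add(&my_pl, rec);
	mutex_exit(&my_pl.pl_lock);
}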
+ */ + +#ifndef _SPL_RANDOM_H +#define _SPL_RANDOM_H + +#include +#include + +static __inline__ int +random_get_bytes(uint8_t *ptr, size_t len) +{ + get_random_bytes((void *)ptr, (int)len); + return (0); +} + +extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); + +#endif /* _SPL_RANDOM_H */ diff --git a/include/os/linux/spl/sys/rwlock.h b/include/os/linux/spl/sys/rwlock.h new file mode 100644 index 000000000000..60f5bfd986b4 --- /dev/null +++ b/include/os/linux/spl/sys/rwlock.h @@ -0,0 +1,201 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_RWLOCK_H +#define _SPL_RWLOCK_H + +#include +#include +#include + +typedef enum { + RW_DRIVER = 2, + RW_DEFAULT = 4, + RW_NOLOCKDEP = 5 +} krw_type_t; + +typedef enum { + RW_NONE = 0, + RW_WRITER = 1, + RW_READER = 2 +} krw_t; + +typedef struct { + struct rw_semaphore rw_rwlock; + kthread_t *rw_owner; +#ifdef CONFIG_LOCKDEP + krw_type_t rw_type; +#endif /* CONFIG_LOCKDEP */ +} krwlock_t; + +#define SEM(rwp) (&(rwp)->rw_rwlock) + +static inline void +spl_rw_set_owner(krwlock_t *rwp) +{ + rwp->rw_owner = current; +} + +static inline void +spl_rw_clear_owner(krwlock_t *rwp) +{ + rwp->rw_owner = NULL; +} + +static inline kthread_t * +rw_owner(krwlock_t *rwp) +{ + return (rwp->rw_owner); +} + +#ifdef CONFIG_LOCKDEP +static inline void +spl_rw_set_type(krwlock_t *rwp, krw_type_t type) +{ + rwp->rw_type = type; +} +static inline void +spl_rw_lockdep_off_maybe(krwlock_t *rwp) +{ + if (rwp && rwp->rw_type == RW_NOLOCKDEP) + lockdep_off(); +} +static inline void +spl_rw_lockdep_on_maybe(krwlock_t *rwp) +{ + if (rwp && rwp->rw_type == RW_NOLOCKDEP) + lockdep_on(); +} +#else /* CONFIG_LOCKDEP */ +#define spl_rw_set_type(rwp, type) +#define spl_rw_lockdep_off_maybe(rwp) +#define spl_rw_lockdep_on_maybe(rwp) +#endif /* CONFIG_LOCKDEP */ + +static inline int +RW_LOCK_HELD(krwlock_t *rwp) +{ + return (rwsem_is_locked(SEM(rwp))); +} + +static inline int +RW_WRITE_HELD(krwlock_t *rwp) +{ + return (rw_owner(rwp) == current); +} + +static inline int +RW_READ_HELD(krwlock_t *rwp) +{ + return (RW_LOCK_HELD(rwp) && rw_owner(rwp) == NULL); +} + +/* + * The following functions must be #defines and not static inline + * functions. This ensures that the native linux semaphore functions + * (down/up) will be correctly located in the user's code, which is + * important for the built-in kernel lock analysis tools. + */ +/* BEGIN CSTYLED */ +#define rw_init(rwp, name, type, arg) \ +({ \ + static struct lock_class_key __key; \ + ASSERT(type == RW_DEFAULT || type == RW_NOLOCKDEP); \ + \ + __init_rwsem(SEM(rwp), #rwp, &__key); \ + spl_rw_clear_owner(rwp); \ + spl_rw_set_type(rwp, type); \ +}) + +/* + * The Linux rwsem implementation does not require a matching destroy. + */ +#define rw_destroy(rwp) ((void) 0) + +/* + * Upgrading a rwsem from a reader to a writer is not supported by the + * Linux kernel. The lock must be dropped and reacquired as a writer. + */ +#define rw_tryupgrade(rwp) RW_WRITE_HELD(rwp) + +#define rw_tryenter(rwp, rw) \ +({ \ + int _rc_ = 0; \ + \ + spl_rw_lockdep_off_maybe(rwp); \ + switch (rw) { \ + case RW_READER: \ + _rc_ = down_read_trylock(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + if ((_rc_ = down_write_trylock(SEM(rwp)))) \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ + _rc_; \ +}) + +#define rw_enter(rwp, rw) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + switch (rw) { \ + case RW_READER: \ + down_read(SEM(rwp)); \ + break; \ + case RW_WRITER: \ + down_write(SEM(rwp)); \ + spl_rw_set_owner(rwp); \ + break; \ + default: \ + VERIFY(0); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_exit(rwp) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + if (RW_WRITE_HELD(rwp)) { \ + spl_rw_clear_owner(rwp); \ + up_write(SEM(rwp)); \ + } else { \ + ASSERT(RW_READ_HELD(rwp)); \ + up_read(SEM(rwp)); \ + } \ + spl_rw_lockdep_on_maybe(rwp); \ +}) + +#define rw_downgrade(rwp) \ +({ \ + spl_rw_lockdep_off_maybe(rwp); \ + spl_rw_clear_owner(rwp); \ + downgrade_write(SEM(rwp)); \ + spl_rw_lockdep_on_maybe(rwp); \ +}) +/* END CSTYLED */ + +#endif /* _SPL_RWLOCK_H */ diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h new file mode 100644 index 000000000000..cc34d8ab1931 --- /dev/null +++ b/include/os/linux/spl/sys/shrinker.h @@ -0,0 +1,110 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SHRINKER_H +#define _SPL_SHRINKER_H + +#include +#include + +/* + * Due to frequent changes in the shrinker API the following + * compatibility wrappers should be used.
They are as follows: + * + * SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost); + * + * SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname, + * which is passed to spl_register_shrinker()/spl_unregister_shrinker(). + * The countfunc returns the number of free-able objects. + * The scanfunc returns the number of objects that were freed. + * The callbacks can return SHRINK_STOP if further calls can't make any more + * progress. Note that a return value of SHRINK_EMPTY is currently not + * supported. + * + * Example: + * + * static unsigned long + * my_count(struct shrinker *shrink, struct shrink_control *sc) + * { + * ...calculate number of objects in the cache... + * + * return (number of objects in the cache); + * } + * + * static unsigned long + * my_scan(struct shrinker *shrink, struct shrink_control *sc) + * { + * ...scan objects in the cache and reclaim them... + * } + * + * SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS); + * + * void my_init_func(void) { + * spl_register_shrinker(&my_shrinker); + * } + */ + +#define spl_register_shrinker(x) register_shrinker(x) +#define spl_unregister_shrinker(x) unregister_shrinker(x) + +/* + * Linux 3.0 to 3.11 Shrinker API Compatibility. + */ +#if defined(HAVE_SINGLE_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ +static int \ +__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\ +{ \ + if (sc->nr_to_scan != 0) { \ + (void) scanfunc(shrink, sc); \ + } \ + return (countfunc(shrink, sc)); \ +} \ + \ +static struct shrinker varname = { \ + .shrink = __ ## varname ## _wrapper, \ + .seeks = seek_cost, \ +} + +#define SHRINK_STOP (-1) + +/* + * Linux 3.12 and later Shrinker API Compatibility. + */ +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) +#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ +static struct shrinker varname = { \ + .count_objects = countfunc, \ + .scan_objects = scanfunc, \ + .seeks = seek_cost, \ +} + +#else +/* + * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. + */ +#error "Unknown shrinker callback" +#endif + +#endif /* SPL_SHRINKER_H */ diff --git a/include/os/linux/spl/sys/sid.h b/include/os/linux/spl/sys/sid.h new file mode 100644 index 000000000000..731b62c47e70 --- /dev/null +++ b/include/os/linux/spl/sys/sid.h @@ -0,0 +1,61 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_SID_H +#define _SPL_SID_H + +typedef struct ksiddomain { + char *kd_name; +} ksiddomain_t; + +typedef enum ksid_index { + KSID_USER, + KSID_GROUP, + KSID_OWNER, + KSID_COUNT +} ksid_index_t; + +typedef int ksid_t; + +static inline ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + int len = strlen(dom); + + kd = kmem_zalloc(sizeof (ksiddomain_t), KM_SLEEP); + kd->kd_name = kmem_zalloc(len + 1, KM_SLEEP); + memcpy(kd->kd_name, dom, len); + + return (kd); +} + +static inline void +ksiddomain_rele(ksiddomain_t *ksid) +{ + kmem_free(ksid->kd_name, strlen(ksid->kd_name) + 1); + kmem_free(ksid, sizeof (ksiddomain_t)); +} + +#endif /* _SPL_SID_H */ diff --git a/include/os/linux/spl/sys/signal.h b/include/os/linux/spl/sys/signal.h new file mode 100644 index 000000000000..36b8b5d985a9 --- /dev/null +++ b/include/os/linux/spl/sys/signal.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SIGNAL_H +#define _SPL_SIGNAL_H + +#include + +#ifdef HAVE_SCHED_SIGNAL_HEADER +#include +#endif + +#define FORREAL 0 /* Usual side-effects */ +#define JUSTLOOKING 1 /* Don't stop the process */ + +/* + * The "why" argument indicates the allowable side-effects of the call: + * + * FORREAL: Extract the next pending signal from p_sig into p_cursig; + * stop the process if a stop has been requested or if a traced signal + * is pending. + * + * JUSTLOOKING: Don't stop the process, just indicate whether or not + * a signal might be pending (FORREAL is needed to tell for sure). + */ +static __inline__ int +issig(int why) +{ + ASSERT(why == FORREAL || why == JUSTLOOKING); + + return (signal_pending(current)); +} + +#endif /* SPL_SIGNAL_H */ diff --git a/include/os/linux/spl/sys/simd.h b/include/os/linux/spl/sys/simd.h new file mode 100644 index 000000000000..f2048d9e121c --- /dev/null +++ b/include/os/linux/spl/sys/simd.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
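/*
 * Illustrative sketch (editor's example, not part of the patch): the
 * two-step signal check idiom used throughout ZFS.  On Linux both calls
 * reduce to signal_pending(current), but the JUSTLOOKING/FORREAL pair
 * preserves the Solaris semantics.  The my_* helpers are hypothetical.
 */
extern boolean_t my_more_work(void);
extern void my_do_chunk(void);

static int
my_interruptible_loop(void)
{
	while (my_more_work()) {
		if (issig(JUSTLOOKING) && issig(FORREAL))
			return (EINTR);	/* caller unwinds on signal */
		my_do_chunk();
	}
	return (0);
}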
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SYS_SIMD_H +#define _SPL_SYS_SIMD_H + +#include +#include + +#endif /* _SPL_SYS_SIMD_H */ diff --git a/include/os/linux/spl/sys/stat.h b/include/os/linux/spl/sys/stat.h new file mode 100644 index 000000000000..83018e89442f --- /dev/null +++ b/include/os/linux/spl/sys/stat.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_STAT_H +#define _SPL_STAT_H + +#include + +#endif /* SPL_STAT_H */ diff --git a/include/os/linux/spl/sys/strings.h b/include/os/linux/spl/sys/strings.h new file mode 100644 index 000000000000..4fb80320635c --- /dev/null +++ b/include/os/linux/spl/sys/strings.h @@ -0,0 +1,31 @@ +/* + * Copyright (C) 2018 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ +#ifndef _SPL_SYS_STRINGS_H +#define _SPL_SYS_STRINGS_H + +#include + +#define bzero(ptr, size) memset(ptr, 0, size) +#define bcopy(src, dest, size) memmove(dest, src, size) +#define bcmp(src, dest, size) memcmp((src), (dest), (size_t)(size)) + +#endif /* _SPL_SYS_STRINGS_H */ diff --git a/include/os/linux/spl/sys/sunddi.h b/include/os/linux/spl/sys/sunddi.h new file mode 100644 index 000000000000..29a6fe00d1f4 --- /dev/null +++ b/include/os/linux/spl/sys/sunddi.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . 
+ * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SUNDDI_H +#define _SPL_SUNDDI_H + +#include +#include +#include +#include +#include + +typedef int ddi_devid_t; + +#define DDI_DEV_T_NONE ((dev_t)-1) +#define DDI_DEV_T_ANY ((dev_t)-2) +#define DI_MAJOR_T_UNKNOWN ((major_t)0) + +#define DDI_PROP_DONTPASS 0x0001 +#define DDI_PROP_CANSLEEP 0x0002 + +#define DDI_SUCCESS 0 +#define DDI_FAILURE -1 + +#define ddi_prop_lookup_string(x1, x2, x3, x4, x5) (*x5 = NULL) +#define ddi_prop_free(x) (void)0 +#define ddi_root_node() (void)0 + +extern int ddi_strtoul(const char *, char **, int, unsigned long *); +extern int ddi_strtol(const char *, char **, int, long *); +extern int ddi_strtoull(const char *, char **, int, unsigned long long *); +extern int ddi_strtoll(const char *, char **, int, long long *); + +extern int ddi_copyin(const void *from, void *to, size_t len, int flags); +extern int ddi_copyout(const void *from, void *to, size_t len, int flags); + +#endif /* SPL_SUNDDI_H */ diff --git a/include/os/linux/spl/sys/sysmacros.h b/include/os/linux/spl/sys/sysmacros.h new file mode 100644 index 000000000000..7314588bcf82 --- /dev/null +++ b/include/os/linux/spl/sys/sysmacros.h @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
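/*
 * Illustrative sketch (editor's example, not part of the patch): the
 * ddi_strto*() helpers declared above return 0 on success and an error
 * number on failure, writing the converted value through the last
 * argument.  The my_parse() wrapper is hypothetical.
 */
static int
my_parse(const char *buf, unsigned long *valp)
{
	char *end;
	int error;

	error = ddi_strtoul(buf, &end, 10, valp);	/* base 10 */
	if (error != 0)
		return (error);

	return (0);
}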
+ */ + +#ifndef _SPL_SYSMACROS_H +#define _SPL_SYSMACROS_H + +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifndef _KERNEL +#define _KERNEL __KERNEL__ +#endif + +#define FALSE 0 +#define TRUE 1 + +#define INT8_MAX (127) +#define INT8_MIN (-128) +#define UINT8_MAX (255) +#define UINT8_MIN (0) + +#define INT16_MAX (32767) +#define INT16_MIN (-32768) +#define UINT16_MAX (65535) +#define UINT16_MIN (0) + +#define INT32_MAX INT_MAX +#define INT32_MIN INT_MIN +#define UINT32_MAX UINT_MAX +#define UINT32_MIN UINT_MIN + +#define INT64_MAX LLONG_MAX +#define INT64_MIN LLONG_MIN +#define UINT64_MAX ULLONG_MAX +#define UINT64_MIN ULLONG_MIN + +#define NBBY 8 + +#define MAXMSGLEN 256 +#define MAXNAMELEN 256 +#define MAXPATHLEN 4096 +#define MAXOFFSET_T LLONG_MAX +#define MAXBSIZE 8192 +#define DEV_BSIZE 512 +#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ + +#define proc_pageout NULL +#define curproc current +#define max_ncpus num_possible_cpus() +#define boot_ncpus num_online_cpus() +#define CPU_SEQID smp_processor_id() +#define is_system_labeled() 0 + +#ifndef RLIM64_INFINITY +#define RLIM64_INFINITY (~0ULL) +#endif + +/* + * 0..MAX_PRIO-1: Process priority + * 0..MAX_RT_PRIO-1: RT priority tasks + * MAX_RT_PRIO..MAX_PRIO-1: SCHED_NORMAL tasks + * + * Treat shim tasks as SCHED_NORMAL tasks + */ +#define minclsyspri (MAX_PRIO-1) +#define maxclsyspri (MAX_RT_PRIO) +#define defclsyspri (DEFAULT_PRIO) + +#ifndef NICE_TO_PRIO +#define NICE_TO_PRIO(nice) (MAX_RT_PRIO + (nice) + 20) +#endif +#ifndef PRIO_TO_NICE +#define PRIO_TO_NICE(prio) ((prio) - MAX_RT_PRIO - 20) +#endif + +/* + * Missing macros + */ +#ifndef PAGESIZE +#define PAGESIZE PAGE_SIZE +#endif + +#ifndef PAGESHIFT +#define PAGESHIFT PAGE_SHIFT +#endif + +/* Missing globals */ +extern char spl_gitrev[64]; +extern unsigned long spl_hostid; + +/* Missing misc functions */ +extern uint32_t zone_get_hostid(void *zone); +extern void spl_setup(void); +extern void spl_cleanup(void); + +#define highbit(x) __fls(x) +#define lowbit(x) __ffs(x) + +#define highbit64(x) fls64(x) +#define makedevice(maj, min) makedev(maj, min) + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif +#ifndef roundup +#define roundup(x, y) ((((x) + ((y) - 1)) / (y)) * (y)) +#endif +#ifndef howmany +#define howmany(x, y) (((x) + ((y) - 1)) / (y)) +#endif + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. 
For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + ((((type)(x) - 1) | ((type)(align) - 1)) + 1) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + +#define SET_ERROR(err) \ + (__set_error(__FILE__, __func__, __LINE__, err), err) + +#include +#define qsort(base, num, size, cmp) \ + sort(base, num, size, cmp, NULL) + +#if !defined(_KMEMUSER) && !defined(offsetof) + +/* avoid any possibility of clashing with version */ + +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#endif /* _SPL_SYSMACROS_H */ diff --git a/include/os/linux/spl/sys/systeminfo.h b/include/os/linux/spl/sys/systeminfo.h new file mode 100644 index 000000000000..2255691580f9 --- /dev/null +++ b/include/os/linux/spl/sys/systeminfo.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_SYSTEMINFO_H +#define _SPL_SYSTEMINFO_H + +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ + +/* Supplemental definitions for Linux. */ +#define HW_HOSTID_PATH "/etc/hostid" /* binary configuration file */ +#define HW_HOSTID_MASK 0xFFFFFFFF /* significant hostid bits */ + +#endif /* SPL_SYSTEMINFO_H */ diff --git a/include/os/linux/spl/sys/taskq.h b/include/os/linux/spl/sys/taskq.h new file mode 100644 index 000000000000..7a1ee9ec4f1b --- /dev/null +++ b/include/os/linux/spl/sys/taskq.h @@ -0,0 +1,164 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . 
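/*
 * Illustrative sketch (editor's example, not part of the patch): the
 * power-of-two helpers above in action.  Alignments must themselves be
 * powers of two; the ASSERT3U macro is assumed from the SPL debug
 * headers.
 */
static void
my_p2_demo(void)
{
	ASSERT(ISP2(4096));

	/* Round down and up to a 4 KiB boundary. */
	ASSERT3U(P2ALIGN(0x1234, 4096), ==, 0x1000);
	ASSERT3U(P2ROUNDUP(0x1234, 4096), ==, 0x2000);

	/* Offset into, and bytes remaining to, the boundary. */
	ASSERT3U(P2PHASE(0x1234, 4096), ==, 0x234);
	ASSERT3U(P2NPHASE(0x1234, 4096), ==, 4096 - 0x234);

	/* Typed variant: force 64-bit arithmetic regardless of inputs. */
	ASSERT3U(P2ROUNDUP_TYPED(0x1234, PAGESIZE, uint64_t), ==,
	    P2ROUNDUP((uint64_t)0x1234, (uint64_t)PAGESIZE));
}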
+ * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_TASKQ_H +#define _SPL_TASKQ_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define TASKQ_NAMELEN 31 + +#define TASKQ_PREPOPULATE 0x00000001 +#define TASKQ_CPR_SAFE 0x00000002 +#define TASKQ_DYNAMIC 0x00000004 +#define TASKQ_THREADS_CPU_PCT 0x00000008 +#define TASKQ_DC_BATCH 0x00000010 +#define TASKQ_ACTIVE 0x80000000 + +/* + * Flags for taskq_dispatch. TQ_SLEEP/TQ_NOSLEEP should be same as + * KM_SLEEP/KM_NOSLEEP. TQ_NOQUEUE/TQ_NOALLOC are set particularly + * large so as not to conflict with already used GFP_* defines. + */ +#define TQ_SLEEP 0x00000000 +#define TQ_NOSLEEP 0x00000001 +#define TQ_PUSHPAGE 0x00000002 +#define TQ_NOQUEUE 0x01000000 +#define TQ_NOALLOC 0x02000000 +#define TQ_NEW 0x04000000 +#define TQ_FRONT 0x08000000 + +/* + * Reserved taskqid values. + */ +#define TASKQID_INVALID ((taskqid_t)0) +#define TASKQID_INITIAL ((taskqid_t)1) + +/* + * spin_lock(lock) and spin_lock_nested(lock,0) are equivalent, + * so TQ_LOCK_DYNAMIC must not evaluate to 0 + */ +typedef enum tq_lock_role { + TQ_LOCK_GENERAL = 0, + TQ_LOCK_DYNAMIC = 1, +} tq_lock_role_t; + +typedef unsigned long taskqid_t; +typedef void (task_func_t)(void *); + +typedef struct taskq { + spinlock_t tq_lock; /* protects taskq_t */ + char *tq_name; /* taskq name */ + int tq_instance; /* instance of tq_name */ + struct list_head tq_thread_list; /* list of all threads */ + struct list_head tq_active_list; /* list of active threads */ + int tq_nactive; /* # of active threads */ + int tq_nthreads; /* # of existing threads */ + int tq_nspawn; /* # of threads being spawned */ + int tq_maxthreads; /* # of threads maximum */ + int tq_pri; /* priority */ + int tq_minalloc; /* min taskq_ent_t pool size */ + int tq_maxalloc; /* max taskq_ent_t pool size */ + int tq_nalloc; /* cur taskq_ent_t pool size */ + uint_t tq_flags; /* flags */ + taskqid_t tq_next_id; /* next pend/work id */ + taskqid_t tq_lowest_id; /* lowest pend/work id */ + struct list_head tq_free_list; /* free taskq_ent_t's */ + struct list_head tq_pend_list; /* pending taskq_ent_t's */ + struct list_head tq_prio_list; /* priority taskq_ent_t's */ + struct list_head tq_delay_list; /* delayed taskq_ent_t's */ + struct list_head tq_taskqs; /* all taskq_t's */ + spl_wait_queue_head_t tq_work_waitq; /* new work waitq */ + spl_wait_queue_head_t tq_wait_waitq; /* wait waitq */ + tq_lock_role_t tq_lock_class; /* class when taking tq_lock */ +} taskq_t; + +typedef struct taskq_ent { + spinlock_t tqent_lock; + spl_wait_queue_head_t tqent_waitq; + struct timer_list tqent_timer; + struct list_head tqent_list; + taskqid_t tqent_id; + task_func_t *tqent_func; + void *tqent_arg; + taskq_t *tqent_taskq; + uintptr_t tqent_flags; + unsigned long tqent_birth; +} taskq_ent_t; + +#define TQENT_FLAG_PREALLOC 0x1 +#define TQENT_FLAG_CANCEL 0x2 + +typedef struct taskq_thread { + struct list_head tqt_thread_list; + 
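/* linked onto tq_active_list while the thread is servicing a task */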
struct list_head tqt_active_list; + struct task_struct *tqt_thread; + taskq_t *tqt_tq; + taskqid_t tqt_id; + taskq_ent_t *tqt_task; + uintptr_t tqt_flags; +} taskq_thread_t; + +/* Global system-wide dynamic task queue available for all consumers */ +extern taskq_t *system_taskq; +/* Global dynamic task queue for long delay */ +extern taskq_t *system_delay_taskq; + +/* List of all taskqs */ +extern struct list_head tq_list; +extern struct rw_semaphore tq_list_sem; + +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, + uint_t, clock_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_wait_outstanding(taskq_t *, taskqid_t); +extern void taskq_wait(taskq_t *); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern int taskq_member(taskq_t *, kthread_t *); +extern taskq_t *taskq_of_curthread(void); + +#define taskq_create_proc(name, nthreads, pri, min, max, proc, flags) \ + taskq_create(name, nthreads, pri, min, max, flags) +#define taskq_create_sysdc(name, nthreads, min, max, proc, dc, flags) \ + taskq_create(name, nthreads, maxclsyspri, min, max, flags) + +int spl_taskq_init(void); +void spl_taskq_fini(void); + +#endif /* _SPL_TASKQ_H */ diff --git a/include/os/linux/spl/sys/thread.h b/include/os/linux/spl/sys/thread.h new file mode 100644 index 000000000000..72dcf9f05d0d --- /dev/null +++ b/include/os/linux/spl/sys/thread.h @@ -0,0 +1,74 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_THREAD_H +#define _SPL_THREAD_H + +#include +#include +#include +#include +#include +#include +#include + +/* + * Thread interfaces + */ +#define TP_MAGIC 0x53535353 + +#define TS_SLEEP TASK_INTERRUPTIBLE +#define TS_RUN TASK_RUNNING +#define TS_ZOMB EXIT_ZOMBIE +#define TS_STOPPED TASK_STOPPED + +typedef void (*thread_func_t)(void *); + +#define thread_create_named(name, stk, stksize, func, arg, len, \ + pp, state, pri) \ + __thread_create(stk, stksize, (thread_func_t)func, \ + name, arg, len, pp, state, pri) + +/* BEGIN CSTYLED */ +#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ + __thread_create(stk, stksize, (thread_func_t)func, \ + #func, arg, len, pp, state, pri) +/* END CSTYLED */ + +#define thread_exit() __thread_exit() +#define thread_join(t) VERIFY(0) +#define curthread current +#define getcomm() current->comm +#define getpid() current->pid + +extern kthread_t *__thread_create(caddr_t stk, size_t stksize, + thread_func_t func, const char *name, void *args, size_t len, proc_t *pp, + int state, pri_t pri); +extern void __thread_exit(void); +extern struct task_struct *spl_kthread_create(int (*func)(void *), + void *data, const char namefmt[], ...); + +extern proc_t p0; + +#endif /* _SPL_THREAD_H */ diff --git a/include/os/linux/spl/sys/time.h b/include/os/linux/spl/sys/time.h new file mode 100644 index 000000000000..4309c300b268 --- /dev/null +++ b/include/os/linux/spl/sys/time.h @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_TIME_H +#define _SPL_TIME_H + +#include +#include +#include +#include + +#if defined(CONFIG_64BIT) +#define TIME_MAX INT64_MAX +#define TIME_MIN INT64_MIN +#else +#define TIME_MAX INT32_MAX +#define TIME_MIN INT32_MIN +#endif + +#define SEC 1 +#define MILLISEC 1000 +#define MICROSEC 1000000 +#define NANOSEC 1000000000 + +#define MSEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MILLISEC)) +#define NSEC2MSEC(n) ((n) / (NANOSEC / MILLISEC)) + +#define USEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / MICROSEC)) +#define NSEC2USEC(n) ((n) / (NANOSEC / MICROSEC)) + +#define NSEC2SEC(n) ((n) / (NANOSEC / SEC)) +#define SEC2NSEC(m) ((hrtime_t)(m) * (NANOSEC / SEC)) + +static const int hz = HZ; + +typedef longlong_t hrtime_t; +typedef struct timespec timespec_t; + +#define TIMESPEC_OVERFLOW(ts) \ + ((ts)->tv_sec < TIME_MIN || (ts)->tv_sec > TIME_MAX) + +#if defined(HAVE_INODE_TIMESPEC64_TIMES) +typedef struct timespec64 inode_timespec_t; +#else +typedef struct timespec inode_timespec_t; +#endif + +/* Include for Lustre compatibility */ +#define timestruc_t inode_timespec_t + +static inline void +gethrestime(inode_timespec_t *ts) +{ +#if defined(HAVE_INODE_TIMESPEC64_TIMES) + +#if defined(HAVE_KTIME_GET_COARSE_REAL_TS64) + ktime_get_coarse_real_ts64(ts); +#else + *ts = current_kernel_time64(); +#endif /* HAVE_KTIME_GET_COARSE_REAL_TS64 */ + +#else + *ts = current_kernel_time(); +#endif +} + +static inline uint64_t +gethrestime_sec(void) +{ +#if defined(HAVE_INODE_TIMESPEC64_TIMES) +#if defined(HAVE_KTIME_GET_COARSE_REAL_TS64) + inode_timespec_t ts; + ktime_get_coarse_real_ts64(&ts); +#else + inode_timespec_t ts = current_kernel_time64(); +#endif /* HAVE_KTIME_GET_COARSE_REAL_TS64 */ + +#else + inode_timespec_t ts = current_kernel_time(); +#endif + return (ts.tv_sec); +} + +static inline hrtime_t +gethrtime(void) +{ +#if defined(HAVE_KTIME_GET_RAW_TS64) + struct timespec64 ts; + ktime_get_raw_ts64(&ts); +#else + struct timespec ts; + getrawmonotonic(&ts); +#endif + return (((hrtime_t)ts.tv_sec * NSEC_PER_SEC) + ts.tv_nsec); +} + +#endif /* _SPL_TIME_H */ diff --git a/include/os/linux/spl/sys/timer.h b/include/os/linux/spl/sys/timer.h new file mode 100644 index 000000000000..40be12047ae4 --- /dev/null +++ b/include/os/linux/spl/sys/timer.h @@ -0,0 +1,86 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */
+
+#ifndef _SPL_TIMER_H
+#define _SPL_TIMER_H
+
+#include
+#include
+#include
+#include
+#include
+
+#define lbolt ((clock_t)jiffies)
+#define lbolt64 ((int64_t)get_jiffies_64())
+
+#define ddi_get_lbolt() ((clock_t)jiffies)
+#define ddi_get_lbolt64() ((int64_t)get_jiffies_64())
+
+#define ddi_time_before(a, b) (typecheck(clock_t, a) && \
+ typecheck(clock_t, b) && \
+ ((a) - (b) < 0))
+#define ddi_time_after(a, b) ddi_time_before(b, a)
+#define ddi_time_before_eq(a, b) (!ddi_time_after(a, b))
+#define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a)
+
+#define ddi_time_before64(a, b) (typecheck(int64_t, a) && \
+ typecheck(int64_t, b) && \
+ ((a) - (b) < 0))
+#define ddi_time_after64(a, b) ddi_time_before64(b, a)
+#define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b))
+#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a)
+
+#define delay(ticks) schedule_timeout_uninterruptible(ticks)
+
+#define SEC_TO_TICK(sec) ((sec) * HZ)
+#define MSEC_TO_TICK(ms) msecs_to_jiffies(ms)
+#define USEC_TO_TICK(us) usecs_to_jiffies(us)
+#define NSEC_TO_TICK(ns) usecs_to_jiffies(ns / NSEC_PER_USEC)
+
+#ifndef from_timer
+#define from_timer(var, timer, timer_field) \
+ container_of(timer, typeof(*var), timer_field)
+#endif
+
+#ifdef HAVE_KERNEL_TIMER_FUNCTION_TIMER_LIST
+typedef struct timer_list *spl_timer_list_t;
+#else
+typedef unsigned long spl_timer_list_t;
+#endif
+
+#ifndef HAVE_KERNEL_TIMER_SETUP
+
+static inline void
+timer_setup(struct timer_list *timer, void (*func)(spl_timer_list_t), u32 fl)
+{
+#ifdef HAVE_KERNEL_TIMER_LIST_FLAGS
+ (timer)->flags = fl;
+#endif
+ init_timer(timer);
+ setup_timer(timer, func, (spl_timer_list_t)(timer));
+}
+
+#endif /* HAVE_KERNEL_TIMER_SETUP */
+
+#endif /* _SPL_TIMER_H */
diff --git a/include/os/linux/spl/sys/trace.h b/include/os/linux/spl/sys/trace.h
new file mode 100644
index 000000000000..b148ace6abd3
--- /dev/null
+++ b/include/os/linux/spl/sys/trace.h
@@ -0,0 +1,175 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL)
+
+/*
+ * Calls to DTRACE_PROBE* are mapped to standard Linux kernel trace points
+ * when they are available (when HAVE_DECLARE_EVENT_CLASS is defined). The
+ * tracepoint event class definitions are found in the general tracing
+ * header file: include/sys/trace_*.h. See include/sys/trace_vdev.h for
+ * a good example.
+ *
+ * If tracepoints are not available, stub functions are generated which can
+ * be traced using kprobes. In this case, the DEFINE_DTRACE_PROBE* macros
+ * are used to provide the stub functions and also the prototypes for
+ * those functions. The mechanism to do this relies on DEFINE_DTRACE_PROBE
+ * macros defined in the general tracing headers (see trace_vdev.h) and
+ * CREATE_TRACE_POINTS being defined only in module/zfs/trace.c. When ZFS
+ * source files include the general tracing headers, e.g.
+ * module/zfs/vdev_removal.c including trace_vdev.h, DTRACE_PROBE calls
+ * are mapped to stub function calls and prototypes for those calls are
+ * declared via DEFINE_DTRACE_PROBE*. Only module/zfs/trace.c defines
+ * CREATE_TRACE_POINTS. That is followed by includes of all the general
+ * tracing headers thereby defining all stub functions in one place via
+ * the DEFINE_DTRACE_PROBE macros.
+ *
+ * When adding new DTRACE_PROBEs to zfs source, both a tracepoint event
+ * class definition and a DEFINE_DTRACE_PROBE definition are needed to
+ * avoid undefined function errors.
+ */
+
+#if defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#if !defined(_TRACE_ZFS_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ZFS_H
+
+#include
+#include
+
+/*
+ * DTRACE_PROBE with 0 arguments is not currently available with
+ * tracepoint events
+ */
+#define DTRACE_PROBE(name) \
+ ((void)0)
+
+#define DTRACE_PROBE1(name, t1, arg1) \
+ trace_zfs_##name((arg1))
+
+#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \
+ trace_zfs_##name((arg1), (arg2))
+
+#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \
+ trace_zfs_##name((arg1), (arg2), (arg3))
+
+#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \
+ trace_zfs_##name((arg1), (arg2), (arg3), (arg4))
+
+#endif /* _TRACE_ZFS_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace
+#include
+
+#else /* HAVE_DECLARE_EVENT_CLASS */
+
+#define DTRACE_PROBE(name) \
+ trace_zfs_##name()
+
+#define DTRACE_PROBE1(name, t1, arg1) \
+ trace_zfs_##name((uintptr_t)(arg1))
+
+#define DTRACE_PROBE2(name, t1, arg1, t2, arg2) \
+ trace_zfs_##name((uintptr_t)(arg1), (uintptr_t)(arg2))
+
+#define DTRACE_PROBE3(name, t1, arg1, t2, arg2, t3, arg3) \
+ trace_zfs_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \
+ (uintptr_t)(arg3))
+
+#define DTRACE_PROBE4(name, t1, arg1, t2, arg2, t3, arg3, t4, arg4) \
+ trace_zfs_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \
+ (uintptr_t)(arg3), (uintptr_t)(arg4))
+
+#define PROTO_DTRACE_PROBE(name) \
+ noinline void trace_zfs_##name(void)
+#define PROTO_DTRACE_PROBE1(name) \
+ noinline void trace_zfs_##name(uintptr_t)
+#define PROTO_DTRACE_PROBE2(name) \
+ noinline void trace_zfs_##name(uintptr_t, uintptr_t)
+#define PROTO_DTRACE_PROBE3(name) \
+ noinline void trace_zfs_##name(uintptr_t, uintptr_t, \
+ uintptr_t)
+#define PROTO_DTRACE_PROBE4(name) \
+ noinline void trace_zfs_##name(uintptr_t, uintptr_t, \
+ uintptr_t, uintptr_t)
+
+#if defined(CREATE_TRACE_POINTS)
+
+#define FUNC_DTRACE_PROBE(name) \
+PROTO_DTRACE_PROBE(name); \
+noinline void trace_zfs_##name(void) { } \
+EXPORT_SYMBOL(trace_zfs_##name)
+
+#define FUNC_DTRACE_PROBE1(name) \
+PROTO_DTRACE_PROBE1(name); \
+noinline void trace_zfs_##name(uintptr_t arg1) { } \
+EXPORT_SYMBOL(trace_zfs_##name)
+
+#define FUNC_DTRACE_PROBE2(name) \
+PROTO_DTRACE_PROBE2(name); \
+noinline void trace_zfs_##name(uintptr_t arg1, \
+ uintptr_t arg2) { } \
+EXPORT_SYMBOL(trace_zfs_##name)
+
+#define FUNC_DTRACE_PROBE3(name) \
+PROTO_DTRACE_PROBE3(name); \
+noinline void trace_zfs_##name(uintptr_t arg1, \
+ uintptr_t arg2, uintptr_t arg3) { } \
+EXPORT_SYMBOL(trace_zfs_##name)
+
+#define FUNC_DTRACE_PROBE4(name) \
+PROTO_DTRACE_PROBE4(name); \ +noinline void trace_zfs_##name(uintptr_t arg1, \ + uintptr_t arg2, uintptr_t arg3, uintptr_t arg4) { } \ +EXPORT_SYMBOL(trace_zfs_##name) + +#undef DEFINE_DTRACE_PROBE +#define DEFINE_DTRACE_PROBE(name) FUNC_DTRACE_PROBE(name) + +#undef DEFINE_DTRACE_PROBE1 +#define DEFINE_DTRACE_PROBE1(name) FUNC_DTRACE_PROBE1(name) + +#undef DEFINE_DTRACE_PROBE2 +#define DEFINE_DTRACE_PROBE2(name) FUNC_DTRACE_PROBE2(name) + +#undef DEFINE_DTRACE_PROBE3 +#define DEFINE_DTRACE_PROBE3(name) FUNC_DTRACE_PROBE3(name) + +#undef DEFINE_DTRACE_PROBE4 +#define DEFINE_DTRACE_PROBE4(name) FUNC_DTRACE_PROBE4(name) + +#else /* CREATE_TRACE_POINTS */ + +#define DEFINE_DTRACE_PROBE(name) PROTO_DTRACE_PROBE(name) +#define DEFINE_DTRACE_PROBE1(name) PROTO_DTRACE_PROBE1(name) +#define DEFINE_DTRACE_PROBE2(name) PROTO_DTRACE_PROBE2(name) +#define DEFINE_DTRACE_PROBE3(name) PROTO_DTRACE_PROBE3(name) +#define DEFINE_DTRACE_PROBE4(name) PROTO_DTRACE_PROBE4(name) + +#endif /* CREATE_TRACE_POINTS */ +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/spl/sys/trace_spl.h b/include/os/linux/spl/sys/trace_spl.h new file mode 100644 index 000000000000..bffd91d9129b --- /dev/null +++ b/include/os/linux/spl/sys/trace_spl.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _OS_LINUX_SPL_TRACE_H +#define _OS_LINUX_SPL_TRACE_H + +#include + +#include +#include + +#endif diff --git a/include/os/linux/spl/sys/trace_taskq.h b/include/os/linux/spl/sys/trace_taskq.h new file mode 100644 index 000000000000..dbbb3c4c7960 --- /dev/null +++ b/include/os/linux/spl/sys/trace_taskq.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_taskq + +#if !defined(_TRACE_TASKQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TASKQ_H + +#include +#include + +/* + * Generic support for single argument tracepoints of the form: + * + * DTRACE_PROBE1(..., + * taskq_ent_t *, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_taskq_ent_class, + TP_PROTO(taskq_ent_t *taskq_ent), + TP_ARGS(taskq_ent), + TP_STRUCT__entry( + __field(taskq_ent_t *, taskq_ent) + ), + TP_fast_assign( + __entry->taskq_ent = taskq_ent; + ), + TP_printk("taskq_ent %p", __entry->taskq_ent) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_TASKQ_EVENT(name) \ +DEFINE_EVENT(zfs_taskq_ent_class, name, \ + TP_PROTO(taskq_ent_t *taskq_ent), \ + TP_ARGS(taskq_ent)) +/* END CSTYLED */ +DEFINE_TASKQ_EVENT(zfs_taskq_ent__birth); +DEFINE_TASKQ_EVENT(zfs_taskq_ent__start); +DEFINE_TASKQ_EVENT(zfs_taskq_ent__finish); + +#endif /* _TRACE_TASKQ_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_taskq +#include + +#else + +/* + * When tracepoints are not available, a DEFINE_DTRACE_PROBE* macro is + * needed for each DTRACE_PROBE. These will be used to generate stub + * tracing functions and prototypes for those functions. See + * include/os/linux/spl/sys/trace.h. + */ + +DEFINE_DTRACE_PROBE1(taskq_ent__birth); +DEFINE_DTRACE_PROBE1(taskq_ent__start); +DEFINE_DTRACE_PROBE1(taskq_ent__finish); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/spl/sys/tsd.h b/include/os/linux/spl/sys/tsd.h new file mode 100644 index 000000000000..39a291bf3dee --- /dev/null +++ b/include/os/linux/spl/sys/tsd.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */ + +#ifndef _SPL_TSD_H +#define _SPL_TSD_H + +#include + +#define TSD_HASH_TABLE_BITS_DEFAULT 9 +#define TSD_KEYS_MAX 32768 +#define DTOR_PID (PID_MAX_LIMIT+1) +#define PID_KEY (TSD_KEYS_MAX+1) + +typedef void (*dtor_func_t)(void *); + +extern int tsd_set(uint_t, void *); +extern void *tsd_get(uint_t); +extern void *tsd_get_by_thread(uint_t, kthread_t *); +extern void tsd_create(uint_t *, dtor_func_t); +extern void tsd_destroy(uint_t *); +extern void tsd_exit(void); + +int spl_tsd_init(void); +void spl_tsd_fini(void); + +#endif /* _SPL_TSD_H */ diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h new file mode 100644 index 000000000000..719a44646e6b --- /dev/null +++ b/include/os/linux/spl/sys/types.h @@ -0,0 +1,58 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_TYPES_H +#define _SPL_TYPES_H + +#include + +typedef enum { + B_FALSE = 0, + B_TRUE = 1 +} boolean_t; + +typedef unsigned char uchar_t; +typedef unsigned short ushort_t; +typedef unsigned int uint_t; +typedef unsigned long ulong_t; +typedef unsigned long long u_longlong_t; +typedef long long longlong_t; + +typedef unsigned long intptr_t; +typedef unsigned long long rlim64_t; + +typedef struct task_struct kthread_t; +typedef struct task_struct proc_t; + +typedef int id_t; +typedef short pri_t; +typedef short index_t; +typedef longlong_t offset_t; +typedef u_longlong_t u_offset_t; +typedef ulong_t pgcnt_t; + +typedef int major_t; +typedef int minor_t; + +#endif /* _SPL_TYPES_H */ diff --git a/include/os/linux/spl/sys/types32.h b/include/os/linux/spl/sys/types32.h new file mode 100644 index 000000000000..c60ba8c97019 --- /dev/null +++ b/include/os/linux/spl/sys/types32.h @@ -0,0 +1,35 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. 
+ * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_TYPES32_H +#define _SPL_TYPES32_H + +#include + +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t time32_t; +typedef uint32_t size32_t; + +#endif /* _SPL_TYPES32_H */ diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h new file mode 100644 index 000000000000..0e631d6779a2 --- /dev/null +++ b/include/os/linux/spl/sys/uio.h @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_UIO_H +#define _SPL_UIO_H + +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct iovec iovec_t; + +typedef enum uio_rw { + UIO_READ = 0, + UIO_WRITE = 1, +} uio_rw_t; + +typedef enum uio_seg { + UIO_USERSPACE = 0, + UIO_SYSSPACE = 1, + UIO_USERISPACE = 2, + UIO_BVEC = 3, +} uio_seg_t; + +typedef struct uio { + union { + const struct iovec *uio_iov; + const struct bio_vec *uio_bvec; + }; + int uio_iovcnt; + offset_t uio_loffset; + uio_seg_t uio_segflg; + boolean_t uio_fault_disable; + uint16_t uio_fmode; + uint16_t uio_extflg; + offset_t uio_limit; + ssize_t uio_resid; + size_t uio_skip; +} uio_t; + +typedef struct aio_req { + uio_t *aio_uio; + void *aio_private; +} aio_req_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { + int uioa_pfncnt; + void **uioa_ppp; + caddr_t uioa_base; + size_t uioa_len; +} uioa_page_t; + +typedef struct xuio { + uio_t xu_uio; + enum xuio_type xu_type; + union { + struct { + uint32_t xu_a_state; + ssize_t xu_a_mbytes; + uioa_page_t *xu_a_lcur; + void **xu_a_lppp; + void *xu_a_hwst[4]; + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + struct { + int xu_zc_rw; + void *xu_zc_priv; + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +#define uio_segflg(uio) (uio)->uio_segflg +#define uio_offset(uio) (uio)->uio_loffset +#define uio_resid(uio) (uio)->uio_resid +#define uio_iovcnt(uio) (uio)->uio_iovcnt +#define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len +#define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base + +static inline void +uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) +{ + *base = uio_iovbase(uio, idx); + *len = uio_iovlen(uio, idx); +} + +static inline void +uio_advance(uio_t *uio, size_t size) +{ + uio->uio_resid -= size; + 
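/* advance the logical file offset in step with the bytes just consumed */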
uio->uio_loffset += size; +} + +static inline offset_t +uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) +{ + *vec_idx = 0; + while (*vec_idx < uio_iovcnt(uio) && off >= uio_iovlen(uio, *vec_idx)) { + off -= uio_iovlen(uio, *vec_idx); + (*vec_idx)++; + } + + return (off); +} + +#endif /* SPL_UIO_H */ diff --git a/include/os/linux/spl/sys/user.h b/include/os/linux/spl/sys/user.h new file mode 100644 index 000000000000..b12cb240e39b --- /dev/null +++ b/include/os/linux/spl/sys/user.h @@ -0,0 +1,42 @@ +/* + * Copyright (C) 2015 Cluster Inc. + * Produced at ClusterHQ Inc (cf, DISCLAIMER). + * Written by Richard Yao . + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_USER_H +#define _SPL_USER_H + +/* + * We have uf_info_t for areleasef(). We implement areleasef() using a global + * linked list of all open file descriptors with the task structs referenced, + * so accessing the correct descriptor from areleasef() only requires knowing + * about the Linux task_struct. Since this is internal to our compatibility + * layer, we make it an opaque type. + * + * XXX: If the descriptor changes under us and we do not do a getf() between + * the change and using it, we would get an incorrect reference. + */ + +struct uf_info; +typedef struct uf_info uf_info_t; + +#define P_FINFO(x) ((uf_info_t *)x) + +#endif /* SPL_USER_H */ diff --git a/include/os/linux/spl/sys/vfs.h b/include/os/linux/spl/sys/vfs.h new file mode 100644 index 000000000000..0d5e1d51d7aa --- /dev/null +++ b/include/os/linux/spl/sys/vfs.h @@ -0,0 +1,51 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */
+
+#ifndef _SPL_ZFS_H
+#define _SPL_ZFS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define MAXFIDSZ 64
+
+typedef struct spl_fid {
+ union {
+ long fid_pad;
+ struct {
+ ushort_t len; /* length of data in bytes */
+ char data[MAXFIDSZ]; /* data (variable len) */
+ } _fid;
+ } un;
+} fid_t;
+
+#define fid_len un._fid.len
+#define fid_data un._fid.data
+
+#endif /* SPL_ZFS_H */
diff --git a/include/os/linux/spl/sys/vmem.h b/include/os/linux/spl/sys/vmem.h
new file mode 100644
index 000000000000..a31b4728c367
--- /dev/null
+++ b/include/os/linux/spl/sys/vmem.h
@@ -0,0 +1,102 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <https://github.com/openzfs/zfs>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMEM_H
+#define _SPL_VMEM_H
+
+#include
+#include
+#include
+
+typedef struct vmem { } vmem_t;
+
+/*
+ * Memory allocation interfaces
+ */
+#define VMEM_ALLOC 0x01
+#define VMEM_FREE 0x02
+
+#ifndef VMALLOC_TOTAL
+#define VMALLOC_TOTAL (VMALLOC_END - VMALLOC_START)
+#endif
+
+/*
+ * vmem_* is an interface to a low level arena-based memory allocator on
+ * Illumos that is used to allocate virtual address space. The kmem SLAB
+ * allocator allocates slabs from it. Then the generic allocation functions
+ * kmem_{alloc,zalloc,free}() are layered on top of SLAB allocators.
+ *
+ * On Linux, the primary means of doing allocations is via kmalloc(), which
+ * is similarly layered on top of something called the buddy allocator. The
+ * buddy allocator is not available to kernel modules; it uses physical
+ * memory addresses rather than virtual memory addresses and is prone to
+ * fragmentation.
+ *
+ * Linux sets aside a relatively small address space for in-kernel virtual
+ * memory from which allocations can be done using vmalloc(). It might seem
+ * like a good idea to use vmalloc() to implement something similar to
+ * Illumos' allocator. However, this has the following problems:
+ *
+ * 1. Page directory table allocations are hard coded to use GFP_KERNEL.
+ * Consequently, any KM_PUSHPAGE or KM_NOSLEEP allocations done using
+ * vmalloc() will not have proper semantics.
+ *
+ * 2. Address space exhaustion is a real issue on 32-bit platforms where
+ * only a few hundred MB are available. The kernel will handle it by
+ * spinning when it runs out of address space.
+ *
+ * 3. All vmalloc() allocations and frees are protected by a single global
+ * lock which serializes all allocations.
+ *
+ * 4. Accessing /proc/meminfo and /proc/vmallocinfo will iterate the entire
+ * list. The former will sum the allocations while the latter will print
+ * them to user space in a way that user space can keep the lock held
+ * indefinitely. When the total number of mapped allocations is large
+ * (several hundred thousand), a large amount of time will be spent waiting
+ * on locks.
+ *
+ * 5. Linux has a wait_on_bit() locking primitive that assumes physical
+ * memory is used; it simply does not work on virtual memory. Certain
+ * Linux structures (e.g. the superblock) use them and might be embedded
+ * into a structure from Illumos. This makes using Linux virtual memory
+ * unsafe in certain situations.
+ *
+ * It follows that we cannot obtain identical semantics to those on Illumos.
+ * Consequently, we implement the kmem_{alloc,zalloc,free}() functions in
+ * such a way that they can be used as drop-in replacements for small vmem_*
+ * allocations (8MB in size or smaller) and map vmem_{alloc,zalloc,free}()
+ * to them.
+ */
+
+#define vmem_alloc(sz, fl) spl_vmem_alloc((sz), (fl), __func__, __LINE__)
+#define vmem_zalloc(sz, fl) spl_vmem_zalloc((sz), (fl), __func__, __LINE__)
+#define vmem_free(ptr, sz) spl_vmem_free((ptr), (sz))
+
+extern void *spl_vmem_alloc(size_t sz, int fl, const char *func, int line);
+extern void *spl_vmem_zalloc(size_t sz, int fl, const char *func, int line);
+extern void spl_vmem_free(const void *ptr, size_t sz);
+
+int spl_vmem_init(void);
+void spl_vmem_fini(void);
+
+#endif /* _SPL_VMEM_H */
diff --git a/include/os/linux/spl/sys/vmsystm.h b/include/os/linux/spl/sys/vmsystm.h
new file mode 100644
index 000000000000..705339486bc8
--- /dev/null
+++ b/include/os/linux/spl/sys/vmsystm.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <https://github.com/openzfs/zfs>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VMSYSTM_H
+#define _SPL_VMSYSTM_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef HAVE_TOTALRAM_PAGES_FUNC
+#define zfs_totalram_pages totalram_pages()
+#else
+#define zfs_totalram_pages totalram_pages
+#endif
+
+#ifdef HAVE_TOTALHIGH_PAGES
+#define zfs_totalhigh_pages totalhigh_pages()
+#else
+#define zfs_totalhigh_pages totalhigh_pages
+#endif
+
+#define membar_producer() smp_wmb()
+#define physmem zfs_totalram_pages
+#ifdef ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B
+#define freemem (nr_free_pages() + \
+ global_page_state(NR_INACTIVE_FILE) + \
+ global_page_state(NR_INACTIVE_ANON) + \
+ global_page_state(NR_SLAB_RECLAIMABLE_B))
+#else
+#define freemem (nr_free_pages() + \
+ global_page_state(NR_INACTIVE_FILE) + \
+ global_page_state(NR_INACTIVE_ANON) + \
+ global_page_state(NR_SLAB_RECLAIMABLE))
+#endif /* ZFS_ENUM_NODE_STAT_ITEM_NR_SLAB_RECLAIMABLE_B */
+
+#define xcopyin(from, to, size) copy_from_user(to, from, size)
+#define xcopyout(from, to, size) copy_to_user(to, from, size)
+
+static __inline__ int
+copyin(const void *from, void *to, size_t len)
+{
+ /* On error copyin routine returns -1 */
+ if (xcopyin(from, to, len))
+ return (-1);
+
+ return (0);
+}
+
+static __inline__ int
+copyout(const void *from, void *to, size_t len)
+{
+ /* On error copyout routine returns -1 */
+ if (xcopyout(from, to, len))
+ return (-1);
+
+ return (0);
+}
+
+static __inline__ int
+copyinstr(const void *from, void *to, size_t len, size_t *done)
+{
+ size_t rc;
+
+ if (len == 0)
+ return (-ENAMETOOLONG);
+
+ /* XXX: Should return ENAMETOOLONG if 'strlen(from) > len' */
+
+ memset(to, 0, len);
+ rc = copyin(from, to, len - 1);
+ if (done != NULL)
+ *done = rc;
+
+ return (0);
+}
+
+#endif /* SPL_VMSYSTM_H */
diff --git a/include/os/linux/spl/sys/vnode.h b/include/os/linux/spl/sys/vnode.h
new file mode 100644
index 000000000000..07eac8e44173
--- /dev/null
+++ b/include/os/linux/spl/sys/vnode.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <https://github.com/openzfs/zfs>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _SPL_VNODE_H
+#define _SPL_VNODE_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * Prior to linux-2.6.33 only O_DSYNC semantics were implemented and
+ * they used the O_SYNC flag. As of linux-2.6.33 this behavior
+ * was properly split into O_SYNC and O_DSYNC respectively.
+ */
+#ifndef O_DSYNC
+#define O_DSYNC O_SYNC
+#endif
+
+#define F_FREESP 11 /* Free file space */
+
+/*
+ * The vnode AT_ flags are mapped to the Linux ATTR_* flags.
+ * This allows them to be used safely with an iattr structure. + * The AT_XVATTR flag has been added and mapped to the upper + * bit range to avoid conflicting with the standard Linux set. + */ +#undef AT_UID +#undef AT_GID + +#define AT_MODE ATTR_MODE +#define AT_UID ATTR_UID +#define AT_GID ATTR_GID +#define AT_SIZE ATTR_SIZE +#define AT_ATIME ATTR_ATIME +#define AT_MTIME ATTR_MTIME +#define AT_CTIME ATTR_CTIME + +#define ATTR_XVATTR (1U << 31) +#define AT_XVATTR ATTR_XVATTR + +#define ATTR_IATTR_MASK (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_SIZE | \ + ATTR_ATIME | ATTR_MTIME | ATTR_CTIME | ATTR_FILE) + +#define CRCREAT 0x01 +#define RMFILE 0x02 + +#define B_INVAL 0x01 +#define B_TRUNC 0x02 + +#define LOOKUP_DIR 0x01 +#define LOOKUP_XATTR 0x02 +#define CREATE_XATTR_DIR 0x04 +#define ATTR_NOACLCHECK 0x20 + +typedef struct vattr { + uint32_t va_mask; /* attribute bit-mask */ + ushort_t va_mode; /* acc mode */ + uid_t va_uid; /* owner uid */ + gid_t va_gid; /* owner gid */ + long va_fsid; /* fs id */ + long va_nodeid; /* node # */ + uint32_t va_nlink; /* # links */ + uint64_t va_size; /* file size */ + inode_timespec_t va_atime; /* last acc */ + inode_timespec_t va_mtime; /* last mod */ + inode_timespec_t va_ctime; /* last chg */ + dev_t va_rdev; /* dev */ + uint64_t va_nblocks; /* space used */ + uint32_t va_blksize; /* block size */ + struct dentry *va_dentry; /* dentry to wire */ +} vattr_t; +#endif /* SPL_VNODE_H */ diff --git a/include/os/linux/spl/sys/wait.h b/include/os/linux/spl/sys/wait.h new file mode 100644 index 000000000000..5311ff8b971b --- /dev/null +++ b/include/os/linux/spl/sys/wait.h @@ -0,0 +1,55 @@ +/* + * Copyright (C) 2007-2014 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . 
+ */
+
+#ifndef _SPL_WAIT_H
+#define _SPL_WAIT_H
+
+#include
+#include
+
+#ifndef HAVE_WAIT_ON_BIT_ACTION
+#define spl_wait_on_bit(word, bit, mode) wait_on_bit(word, bit, mode)
+#else
+
+static inline int
+spl_bit_wait(void *word)
+{
+ schedule();
+ return (0);
+}
+
+#define spl_wait_on_bit(word, bit, mode) \
+ wait_on_bit(word, bit, spl_bit_wait, mode)
+
+#endif /* HAVE_WAIT_ON_BIT_ACTION */
+
+#ifdef HAVE_WAIT_QUEUE_ENTRY_T
+typedef wait_queue_head_t spl_wait_queue_head_t;
+typedef wait_queue_entry_t spl_wait_queue_entry_t;
+#else
+typedef wait_queue_head_t spl_wait_queue_head_t;
+typedef wait_queue_t spl_wait_queue_entry_t;
+#endif
+
+#endif /* SPL_WAIT_H */
diff --git a/include/os/linux/spl/sys/zmod.h b/include/os/linux/spl/sys/zmod.h
new file mode 100644
index 000000000000..5380bd6fd022
--- /dev/null
+++ b/include/os/linux/spl/sys/zmod.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <https://github.com/openzfs/zfs>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made were to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ * + * Jean-loup Gailly + * Mark Adler + */ + +#ifndef _SPL_ZMOD_H +#define _SPL_ZMOD_H + +#include +#include + +extern int z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level); +extern int z_uncompress(void *dest, size_t *destLen, const void *source, + size_t sourceLen); + +int spl_zlib_init(void); +void spl_zlib_fini(void); + +#endif /* SPL_ZMOD_H */ diff --git a/include/os/linux/spl/sys/zone.h b/include/os/linux/spl/sys/zone.h new file mode 100644 index 000000000000..b2efd13b8e0d --- /dev/null +++ b/include/os/linux/spl/sys/zone.h @@ -0,0 +1,36 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_ZONE_H +#define _SPL_ZONE_H + +#include + +#define GLOBAL_ZONEID 0 + +#define zone_dataset_visible(x, y) (1) +#define crgetzoneid(x) (GLOBAL_ZONEID) +#define INGLOBALZONE(z) (1) + +#endif /* SPL_ZONE_H */ diff --git a/include/os/linux/zfs/Makefile.am b/include/os/linux/zfs/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/include/os/linux/zfs/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/include/os/linux/zfs/sys/Makefile.am b/include/os/linux/zfs/sys/Makefile.am new file mode 100644 index 000000000000..732d94ee804e --- /dev/null +++ b/include/os/linux/zfs/sys/Makefile.am @@ -0,0 +1,30 @@ +KERNEL_H = \ + policy.h \ + sha2.h \ + trace_acl.h \ + trace_arc.h \ + trace_common.h \ + trace_zfs.h \ + trace_dbgmsg.h \ + trace_dbuf.h \ + trace_dmu.h \ + trace_dnode.h \ + trace_multilist.h \ + trace_rrwlock.h \ + trace_txg.h \ + trace_vdev.h \ + trace_zil.h \ + trace_zio.h \ + trace_zrlock.h \ + zfs_context_os.h \ + zfs_ctldir.h \ + zfs_dir.h \ + zfs_vfsops.h \ + zfs_vnops.h \ + zfs_znode_impl.h \ + zpl.h + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys +kernel_HEADERS = $(KERNEL_H) +endif diff --git a/include/os/linux/zfs/sys/policy.h b/include/os/linux/zfs/sys/policy.h new file mode 100644 index 000000000000..77a73ad149c5 --- /dev/null +++ b/include/os/linux/zfs/sys/policy.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015, Joyent, Inc. All rights reserved. + * Copyright (c) 2016, Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_POLICY_H +#define _SYS_POLICY_H + +#ifdef _KERNEL + +#include +#include +#include +#include + +int secpolicy_nfs(const cred_t *); +int secpolicy_sys_config(const cred_t *, boolean_t); +int secpolicy_vnode_access2(const cred_t *, struct inode *, + uid_t, mode_t, mode_t); +int secpolicy_vnode_any_access(const cred_t *, struct inode *, uid_t); +int secpolicy_vnode_chown(const cred_t *, uid_t); +int secpolicy_vnode_create_gid(const cred_t *); +int secpolicy_vnode_remove(const cred_t *); +int secpolicy_vnode_setdac(const cred_t *, uid_t); +int secpolicy_vnode_setid_retain(const cred_t *, boolean_t); +int secpolicy_vnode_setids_setgids(const cred_t *, gid_t); +int secpolicy_zinject(const cred_t *); +int secpolicy_zfs(const cred_t *); +int secpolicy_zfs_proc(const cred_t *, proc_t *); +void secpolicy_setid_clear(vattr_t *, cred_t *); +int secpolicy_setid_setsticky_clear(struct inode *, vattr_t *, + const vattr_t *, cred_t *); +int secpolicy_xvattr(xvattr_t *, uid_t, cred_t *, mode_t); +int secpolicy_vnode_setattr(cred_t *, struct inode *, struct vattr *, + const struct vattr *, int, int (void *, int, cred_t *), void *); +int secpolicy_basic_link(const cred_t *); + +#endif /* _KERNEL */ +#endif /* _SYS_POLICY_H */ diff --git a/include/os/linux/zfs/sys/sha2.h b/include/os/linux/zfs/sys/sha2.h new file mode 100644 index 000000000000..4dd966b6cab3 --- /dev/null +++ b/include/os/linux/zfs/sys/sha2.h @@ -0,0 +1,151 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2013 Saso Kiselkov. All rights reserved. 
 */
+
+#ifndef _SYS_SHA2_H
+#define _SYS_SHA2_H
+
+#include <sys/types.h>		/* for uint_* */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define SHA2_HMAC_MIN_KEY_LEN 1		/* SHA2-HMAC min key length in bytes */
+#define SHA2_HMAC_MAX_KEY_LEN INT_MAX	/* SHA2-HMAC max key length in bytes */
+
+#define SHA256_DIGEST_LENGTH 32		/* SHA256 digest length in bytes */
+#define SHA384_DIGEST_LENGTH 48		/* SHA384 digest length in bytes */
+#define SHA512_DIGEST_LENGTH 64		/* SHA512 digest length in bytes */
+
+/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */
+#define SHA512_224_DIGEST_LENGTH 28	/* SHA512/224 digest length */
+#define SHA512_256_DIGEST_LENGTH 32	/* SHA512/256 digest length */
+
+#define SHA256_HMAC_BLOCK_SIZE 64	/* SHA256-HMAC block size */
+#define SHA512_HMAC_BLOCK_SIZE 128	/* SHA512-HMAC block size */
+
+#define SHA256 0
+#define SHA256_HMAC 1
+#define SHA256_HMAC_GEN 2
+#define SHA384 3
+#define SHA384_HMAC 4
+#define SHA384_HMAC_GEN 5
+#define SHA512 6
+#define SHA512_HMAC 7
+#define SHA512_HMAC_GEN 8
+#define SHA512_224 9
+#define SHA512_256 10
+
+/*
+ * SHA2 context.
+ * The contents of this structure are a private interface between the
+ * Init/Update/Final calls of the functions defined below.
+ * Callers must never attempt to read or write any of the fields
+ * in this structure directly.
+ */
+typedef struct {
+	uint32_t algotype;		/* Algorithm Type */
+
+	/* state (ABCDEFGH) */
+	union {
+		uint32_t s32[8];	/* for SHA256 */
+		uint64_t s64[8];	/* for SHA384/512 */
+	} state;
+	/* number of bits */
+	union {
+		uint32_t c32[2];	/* for SHA256, modulo 2^64 */
+		uint64_t c64[2];	/* for SHA384/512, modulo 2^128 */
+	} count;
+	union {
+		uint8_t buf8[128];	/* undigested input */
+		uint32_t buf32[32];	/* realigned input */
+		uint64_t buf64[16];	/* realigned input */
+	} buf_un;
+} SHA2_CTX;
+
+typedef SHA2_CTX SHA256_CTX;
+typedef SHA2_CTX SHA384_CTX;
+typedef SHA2_CTX SHA512_CTX;
+
+extern void SHA2Init(uint64_t mech, SHA2_CTX *);
+
+extern void SHA2Update(SHA2_CTX *, const void *, size_t);
+
+extern void SHA2Final(void *, SHA2_CTX *);
+
+extern void SHA256Init(SHA256_CTX *);
+
+extern void SHA256Update(SHA256_CTX *, const void *, size_t);
+
+extern void SHA256Final(void *, SHA256_CTX *);
+
+extern void SHA384Init(SHA384_CTX *);
+
+extern void SHA384Update(SHA384_CTX *, const void *, size_t);
+
+extern void SHA384Final(void *, SHA384_CTX *);
+
+extern void SHA512Init(SHA512_CTX *);
+
+extern void SHA512Update(SHA512_CTX *, const void *, size_t);
+
+extern void SHA512Final(void *, SHA512_CTX *);
+
+#ifdef _SHA2_IMPL
+/*
+ * The following types/functions are all private to the implementation
+ * of the SHA2 functions and must not be used by consumers of the interface.
+ */
+
+/*
+ * List of support mechanisms in this module.
+ * + * It is important to note that in the module, division or modulus calculations + * are used on the enumerated type to determine which mechanism is being used; + * therefore, changing the order or additional mechanisms should be done + * carefully + */ +typedef enum sha2_mech_type { + SHA256_MECH_INFO_TYPE, /* SUN_CKM_SHA256 */ + SHA256_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC */ + SHA256_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA256_HMAC_GENERAL */ + SHA384_MECH_INFO_TYPE, /* SUN_CKM_SHA384 */ + SHA384_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC */ + SHA384_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA384_HMAC_GENERAL */ + SHA512_MECH_INFO_TYPE, /* SUN_CKM_SHA512 */ + SHA512_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC */ + SHA512_HMAC_GEN_MECH_INFO_TYPE, /* SUN_CKM_SHA512_HMAC_GENERAL */ + SHA512_224_MECH_INFO_TYPE, /* SUN_CKM_SHA512_224 */ + SHA512_256_MECH_INFO_TYPE /* SUN_CKM_SHA512_256 */ +} sha2_mech_type_t; + +#endif /* _SHA2_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_H */ diff --git a/include/os/linux/zfs/sys/trace_acl.h b/include/os/linux/zfs/sys/trace_acl.h new file mode 100644 index 000000000000..083560952f0b --- /dev/null +++ b/include/os/linux/zfs/sys/trace_acl.h @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_acl + +#if !defined(_TRACE_ACL_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ACL_H + +#include +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * znode_t *, ..., + * zfs_ace_hdr_t *, ..., + * uint32_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_ace_class, + TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), + TP_ARGS(zn, ace, mask_matched), + TP_STRUCT__entry( + __field(uint64_t, z_id) + __field(uint8_t, z_unlinked) + __field(uint8_t, z_atime_dirty) + __field(uint8_t, z_zn_prefetch) + __field(uint8_t, z_moved) + __field(uint_t, z_blksz) + __field(uint_t, z_seq) + __field(uint64_t, z_mapcnt) + __field(uint64_t, z_size) + __field(uint64_t, z_pflags) + __field(uint32_t, z_sync_cnt) + __field(mode_t, z_mode) + __field(boolean_t, z_is_sa) + __field(boolean_t, z_is_mapped) + __field(boolean_t, z_is_ctldir) + __field(boolean_t, z_is_stale) + + __field(uint32_t, i_uid) + __field(uint32_t, i_gid) + __field(unsigned long, i_ino) + __field(unsigned int, i_nlink) + __field(loff_t, i_size) + __field(unsigned int, i_blkbits) + __field(unsigned short, i_bytes) + __field(umode_t, i_mode) + __field(__u32, i_generation) + + __field(uint16_t, z_type) + __field(uint16_t, z_flags) + __field(uint32_t, z_access_mask) + + __field(uint32_t, mask_matched) + ), + TP_fast_assign( + __entry->z_id = zn->z_id; + __entry->z_unlinked = zn->z_unlinked; + __entry->z_atime_dirty = zn->z_atime_dirty; + __entry->z_zn_prefetch = zn->z_zn_prefetch; + __entry->z_moved = zn->z_moved; + __entry->z_blksz = zn->z_blksz; + __entry->z_seq = zn->z_seq; + __entry->z_mapcnt = zn->z_mapcnt; + __entry->z_size = zn->z_size; + __entry->z_pflags = zn->z_pflags; + __entry->z_sync_cnt = zn->z_sync_cnt; + __entry->z_mode = zn->z_mode; + __entry->z_is_sa = zn->z_is_sa; + __entry->z_is_mapped = zn->z_is_mapped; + __entry->z_is_ctldir = zn->z_is_ctldir; + __entry->z_is_stale = zn->z_is_stale; + + __entry->i_uid = KUID_TO_SUID(ZTOI(zn)->i_uid); + __entry->i_gid = KGID_TO_SGID(ZTOI(zn)->i_gid); + __entry->i_ino = zn->z_inode.i_ino; + __entry->i_nlink = zn->z_inode.i_nlink; + __entry->i_size = zn->z_inode.i_size; + __entry->i_blkbits = zn->z_inode.i_blkbits; + __entry->i_bytes = zn->z_inode.i_bytes; + __entry->i_mode = zn->z_inode.i_mode; + __entry->i_generation = zn->z_inode.i_generation; + + __entry->z_type = ace->z_type; + __entry->z_flags = ace->z_flags; + __entry->z_access_mask = ace->z_access_mask; + + __entry->mask_matched = mask_matched; + ), + TP_printk("zn { id %llu unlinked %u atime_dirty %u " + "zn_prefetch %u moved %u blksz %u seq %u " + "mapcnt %llu size %llu pflags %llu " + "sync_cnt %u mode 0x%x is_sa %d " + "is_mapped %d is_ctldir %d is_stale %d inode { " + "uid %u gid %u ino %lu nlink %u size %lli " + "blkbits %u bytes %u mode 0x%x generation %x } } " + "ace { type %u flags %u access_mask %u } mask_matched %u", + __entry->z_id, __entry->z_unlinked, __entry->z_atime_dirty, + __entry->z_zn_prefetch, __entry->z_moved, __entry->z_blksz, + __entry->z_seq, __entry->z_mapcnt, __entry->z_size, + __entry->z_pflags, __entry->z_sync_cnt, 
__entry->z_mode, + __entry->z_is_sa, __entry->z_is_mapped, + __entry->z_is_ctldir, __entry->z_is_stale, __entry->i_uid, + __entry->i_gid, __entry->i_ino, __entry->i_nlink, + __entry->i_size, __entry->i_blkbits, + __entry->i_bytes, __entry->i_mode, __entry->i_generation, + __entry->z_type, __entry->z_flags, __entry->z_access_mask, + __entry->mask_matched) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_ACE_EVENT(name) \ +DEFINE_EVENT(zfs_ace_class, name, \ + TP_PROTO(znode_t *zn, zfs_ace_hdr_t *ace, uint32_t mask_matched), \ + TP_ARGS(zn, ace, mask_matched)) +/* END CSTYLED */ +DEFINE_ACE_EVENT(zfs_zfs__ace__denies); +DEFINE_ACE_EVENT(zfs_zfs__ace__allows); + +#endif /* _TRACE_ACL_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_acl +#include + +#else + +DEFINE_DTRACE_PROBE3(zfs__ace__denies); +DEFINE_DTRACE_PROBE3(zfs__ace__allows); +DEFINE_DTRACE_PROBE(zfs__fastpath__execute__access__miss); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_arc.h b/include/os/linux/zfs/sys/trace_arc.h new file mode 100644 index 000000000000..3df491f8b392 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_arc.h @@ -0,0 +1,419 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_arc + +#if !defined(_TRACE_ARC_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_ARC_H + +#include +#include +#include /* For ZIO macros */ + +/* + * Generic support for one argument tracepoints of the form: + * + * DTRACE_PROBE1(..., + * arc_buf_hdr_t *, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_arc_buf_hdr_class, + TP_PROTO(arc_buf_hdr_t *ab), + TP_ARGS(ab), + TP_STRUCT__entry( + __array(uint64_t, hdr_dva_word, 2) + __field(uint64_t, hdr_birth) + __field(uint32_t, hdr_flags) + __field(uint32_t, hdr_bufcnt) + __field(arc_buf_contents_t, hdr_type) + __field(uint16_t, hdr_psize) + __field(uint16_t, hdr_lsize) + __field(uint64_t, hdr_spa) + __field(arc_state_type_t, hdr_state_type) + __field(clock_t, hdr_access) + __field(uint32_t, hdr_mru_hits) + __field(uint32_t, hdr_mru_ghost_hits) + __field(uint32_t, hdr_mfu_hits) + __field(uint32_t, hdr_mfu_ghost_hits) + __field(uint32_t, hdr_l2_hits) + __field(int64_t, hdr_refcount) + ), + TP_fast_assign( + __entry->hdr_dva_word[0] = ab->b_dva.dva_word[0]; + __entry->hdr_dva_word[1] = ab->b_dva.dva_word[1]; + __entry->hdr_birth = ab->b_birth; + __entry->hdr_flags = ab->b_flags; + __entry->hdr_bufcnt = ab->b_l1hdr.b_bufcnt; + __entry->hdr_psize = ab->b_psize; + __entry->hdr_lsize = ab->b_lsize; + __entry->hdr_spa = ab->b_spa; + __entry->hdr_state_type = ab->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = ab->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = ab->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = ab->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = ab->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = ab->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = ab->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = ab->b_l1hdr.b_refcnt.rc_count; + ), + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x bufcnt %u type %u psize %u lsize %u spa %llu " + "state_type %u access %lu mru_hits %u mru_ghost_hits %u " + "mfu_hits %u mfu_ghost_hits %u l2_hits %u refcount %lli }", + __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_bufcnt, __entry->hdr_type, __entry->hdr_psize, + __entry->hdr_lsize, __entry->hdr_spa, __entry->hdr_state_type, + __entry->hdr_access, __entry->hdr_mru_hits, + __entry->hdr_mru_ghost_hits, __entry->hdr_mfu_hits, + __entry->hdr_mfu_ghost_hits, __entry->hdr_l2_hits, + __entry->hdr_refcount) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_ARC_BUF_HDR_EVENT(name) \ +DEFINE_EVENT(zfs_arc_buf_hdr_class, name, \ + TP_PROTO(arc_buf_hdr_t *ab), \ + TP_ARGS(ab)) +/* END CSTYLED */ +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__evict); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__delete); +DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mru); +DEFINE_ARC_BUF_HDR_EVENT(zfs_new_state__mfu); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__async__upgrade__sync); +DEFINE_ARC_BUF_HDR_EVENT(zfs_arc__demand__hit__predictive__prefetch); +DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__hit); +DEFINE_ARC_BUF_HDR_EVENT(zfs_l2arc__miss); + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * vdev_t *, ..., 
+ * zio_t *, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_l2arc_rw_class, + TP_PROTO(vdev_t *vd, zio_t *zio), + TP_ARGS(vd, zio), + TP_STRUCT__entry( + __field(uint64_t, vdev_id) + __field(uint64_t, vdev_guid) + __field(uint64_t, vdev_state) + ZIO_TP_STRUCT_ENTRY + ), + TP_fast_assign( + __entry->vdev_id = vd->vdev_id; + __entry->vdev_guid = vd->vdev_guid; + __entry->vdev_state = vd->vdev_state; + ZIO_TP_FAST_ASSIGN + ), + TP_printk("vdev { id %llu guid %llu state %llu } " + ZIO_TP_PRINTK_FMT, __entry->vdev_id, __entry->vdev_guid, + __entry->vdev_state, ZIO_TP_PRINTK_ARGS) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_L2ARC_RW_EVENT(name) \ +DEFINE_EVENT(zfs_l2arc_rw_class, name, \ + TP_PROTO(vdev_t *vd, zio_t *zio), \ + TP_ARGS(vd, zio)) +/* END CSTYLED */ +DEFINE_L2ARC_RW_EVENT(zfs_l2arc__read); +DEFINE_L2ARC_RW_EVENT(zfs_l2arc__write); + + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * zio_t *, ..., + * l2arc_write_callback_t *, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_l2arc_iodone_class, + TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), + TP_ARGS(zio, cb), + TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY), + TP_fast_assign(ZIO_TP_FAST_ASSIGN), + TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_L2ARC_IODONE_EVENT(name) \ +DEFINE_EVENT(zfs_l2arc_iodone_class, name, \ + TP_PROTO(zio_t *zio, l2arc_write_callback_t *cb), \ + TP_ARGS(zio, cb)) +/* END CSTYLED */ +DEFINE_L2ARC_IODONE_EVENT(zfs_l2arc__iodone); + + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * arc_buf_hdr_t *, ..., + * const blkptr_t *, + * uint64_t, + * const zbookmark_phys_t *); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_arc_miss_class, + TP_PROTO(arc_buf_hdr_t *hdr, + const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), + TP_ARGS(hdr, bp, size, zb), + TP_STRUCT__entry( + __array(uint64_t, hdr_dva_word, 2) + __field(uint64_t, hdr_birth) + __field(uint32_t, hdr_flags) + __field(uint32_t, hdr_bufcnt) + __field(arc_buf_contents_t, hdr_type) + __field(uint16_t, hdr_psize) + __field(uint16_t, hdr_lsize) + __field(uint64_t, hdr_spa) + __field(arc_state_type_t, hdr_state_type) + __field(clock_t, hdr_access) + __field(uint32_t, hdr_mru_hits) + __field(uint32_t, hdr_mru_ghost_hits) + __field(uint32_t, hdr_mfu_hits) + __field(uint32_t, hdr_mfu_ghost_hits) + __field(uint32_t, hdr_l2_hits) + __field(int64_t, hdr_refcount) + + __array(uint64_t, bp_dva0, 2) + __array(uint64_t, bp_dva1, 2) + __array(uint64_t, bp_dva2, 2) + __array(uint64_t, bp_cksum, 4) + + __field(uint64_t, bp_lsize) + + __field(uint64_t, zb_objset) + __field(uint64_t, zb_object) + __field(int64_t, zb_level) + __field(uint64_t, zb_blkid) + ), + TP_fast_assign( + __entry->hdr_dva_word[0] = hdr->b_dva.dva_word[0]; + __entry->hdr_dva_word[1] = hdr->b_dva.dva_word[1]; + __entry->hdr_birth = hdr->b_birth; + __entry->hdr_flags = hdr->b_flags; + __entry->hdr_bufcnt = hdr->b_l1hdr.b_bufcnt; + __entry->hdr_psize = hdr->b_psize; + __entry->hdr_lsize = hdr->b_lsize; + __entry->hdr_spa = hdr->b_spa; + __entry->hdr_state_type = hdr->b_l1hdr.b_state->arcs_state; + __entry->hdr_access = hdr->b_l1hdr.b_arc_access; + __entry->hdr_mru_hits = hdr->b_l1hdr.b_mru_hits; + __entry->hdr_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + __entry->hdr_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + __entry->hdr_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + __entry->hdr_l2_hits = 
hdr->b_l1hdr.b_l2_hits; + __entry->hdr_refcount = hdr->b_l1hdr.b_refcnt.rc_count; + + __entry->bp_dva0[0] = bp->blk_dva[0].dva_word[0]; + __entry->bp_dva0[1] = bp->blk_dva[0].dva_word[1]; + __entry->bp_dva1[0] = bp->blk_dva[1].dva_word[0]; + __entry->bp_dva1[1] = bp->blk_dva[1].dva_word[1]; + __entry->bp_dva2[0] = bp->blk_dva[2].dva_word[0]; + __entry->bp_dva2[1] = bp->blk_dva[2].dva_word[1]; + __entry->bp_cksum[0] = bp->blk_cksum.zc_word[0]; + __entry->bp_cksum[1] = bp->blk_cksum.zc_word[1]; + __entry->bp_cksum[2] = bp->blk_cksum.zc_word[2]; + __entry->bp_cksum[3] = bp->blk_cksum.zc_word[3]; + + __entry->bp_lsize = size; + + __entry->zb_objset = zb->zb_objset; + __entry->zb_object = zb->zb_object; + __entry->zb_level = zb->zb_level; + __entry->zb_blkid = zb->zb_blkid; + ), + TP_printk("hdr { dva 0x%llx:0x%llx birth %llu " + "flags 0x%x bufcnt %u psize %u lsize %u spa %llu state_type %u " + "access %lu mru_hits %u mru_ghost_hits %u mfu_hits %u " + "mfu_ghost_hits %u l2_hits %u refcount %lli } " + "bp { dva0 0x%llx:0x%llx dva1 0x%llx:0x%llx dva2 " + "0x%llx:0x%llx cksum 0x%llx:0x%llx:0x%llx:0x%llx " + "lsize %llu } zb { objset %llu object %llu level %lli " + "blkid %llu }", + __entry->hdr_dva_word[0], __entry->hdr_dva_word[1], + __entry->hdr_birth, __entry->hdr_flags, + __entry->hdr_bufcnt, __entry->hdr_psize, __entry->hdr_lsize, + __entry->hdr_spa, __entry->hdr_state_type, __entry->hdr_access, + __entry->hdr_mru_hits, __entry->hdr_mru_ghost_hits, + __entry->hdr_mfu_hits, __entry->hdr_mfu_ghost_hits, + __entry->hdr_l2_hits, __entry->hdr_refcount, + __entry->bp_dva0[0], __entry->bp_dva0[1], + __entry->bp_dva1[0], __entry->bp_dva1[1], + __entry->bp_dva2[0], __entry->bp_dva2[1], + __entry->bp_cksum[0], __entry->bp_cksum[1], + __entry->bp_cksum[2], __entry->bp_cksum[3], + __entry->bp_lsize, __entry->zb_objset, __entry->zb_object, + __entry->zb_level, __entry->zb_blkid) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_ARC_MISS_EVENT(name) \ +DEFINE_EVENT(zfs_arc_miss_class, name, \ + TP_PROTO(arc_buf_hdr_t *hdr, \ + const blkptr_t *bp, uint64_t size, const zbookmark_phys_t *zb), \ + TP_ARGS(hdr, bp, size, zb)) +/* END CSTYLED */ +DEFINE_ARC_MISS_EVENT(zfs_arc__miss); + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * l2arc_dev_t *, ..., + * list_t *, ..., + * uint64_t, ..., + * boolean_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_l2arc_evict_class, + TP_PROTO(l2arc_dev_t *dev, + list_t *buflist, uint64_t taddr, boolean_t all), + TP_ARGS(dev, buflist, taddr, all), + TP_STRUCT__entry( + __field(uint64_t, vdev_id) + __field(uint64_t, vdev_guid) + __field(uint64_t, vdev_state) + + __field(uint64_t, l2ad_hand) + __field(uint64_t, l2ad_start) + __field(uint64_t, l2ad_end) + __field(boolean_t, l2ad_first) + __field(boolean_t, l2ad_writing) + + __field(uint64_t, taddr) + __field(boolean_t, all) + ), + TP_fast_assign( + __entry->vdev_id = dev->l2ad_vdev->vdev_id; + __entry->vdev_guid = dev->l2ad_vdev->vdev_guid; + __entry->vdev_state = dev->l2ad_vdev->vdev_state; + + __entry->l2ad_hand = dev->l2ad_hand; + __entry->l2ad_start = dev->l2ad_start; + __entry->l2ad_end = dev->l2ad_end; + __entry->l2ad_first = dev->l2ad_first; + __entry->l2ad_writing = dev->l2ad_writing; + + __entry->taddr = taddr; + __entry->all = all; + ), + TP_printk("l2ad { vdev { id %llu guid %llu state %llu } " + "hand %llu start %llu end %llu " + "first %d writing %d } taddr %llu all %d", + __entry->vdev_id, __entry->vdev_guid, __entry->vdev_state, + 
__entry->l2ad_hand, __entry->l2ad_start, + __entry->l2ad_end, __entry->l2ad_first, __entry->l2ad_writing, + __entry->taddr, __entry->all) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_L2ARC_EVICT_EVENT(name) \ +DEFINE_EVENT(zfs_l2arc_evict_class, name, \ + TP_PROTO(l2arc_dev_t *dev, \ + list_t *buflist, uint64_t taddr, boolean_t all), \ + TP_ARGS(dev, buflist, taddr, all)) +/* END CSTYLED */ +DEFINE_L2ARC_EVICT_EVENT(zfs_l2arc__evict); + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * uint64_t, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_arc_wait_for_eviction_class, + TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), + TP_ARGS(amount, arc_evict_count, aew_count), + TP_STRUCT__entry( + __field(uint64_t, amount) + __field(uint64_t, arc_evict_count) + __field(uint64_t, aew_count) + ), + TP_fast_assign( + __entry->amount = amount; + __entry->arc_evict_count = arc_evict_count; + __entry->aew_count = aew_count; + ), + TP_printk("amount %llu arc_evict_count %llu aew_count %llu", + __entry->amount, __entry->arc_evict_count, __entry->aew_count) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(name) \ +DEFINE_EVENT(zfs_arc_wait_for_eviction_class, name, \ + TP_PROTO(uint64_t amount, uint64_t arc_evict_count, uint64_t aew_count), \ + TP_ARGS(amount, arc_evict_count, aew_count)) +/* END CSTYLED */ +DEFINE_ARC_WAIT_FOR_EVICTION_EVENT(zfs_arc__wait__for__eviction); + +#endif /* _TRACE_ARC_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_arc +#include + +#else + +DEFINE_DTRACE_PROBE1(arc__hit); +DEFINE_DTRACE_PROBE1(arc__evict); +DEFINE_DTRACE_PROBE1(arc__delete); +DEFINE_DTRACE_PROBE1(new_state__mru); +DEFINE_DTRACE_PROBE1(new_state__mfu); +DEFINE_DTRACE_PROBE1(arc__async__upgrade__sync); +DEFINE_DTRACE_PROBE1(arc__demand__hit__predictive__prefetch); +DEFINE_DTRACE_PROBE1(l2arc__hit); +DEFINE_DTRACE_PROBE1(l2arc__miss); +DEFINE_DTRACE_PROBE2(l2arc__read); +DEFINE_DTRACE_PROBE2(l2arc__write); +DEFINE_DTRACE_PROBE2(l2arc__iodone); +DEFINE_DTRACE_PROBE3(arc__wait__for__eviction); +DEFINE_DTRACE_PROBE4(arc__miss); +DEFINE_DTRACE_PROBE4(l2arc__evict); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_common.h b/include/os/linux/zfs/sys/trace_common.h new file mode 100644 index 000000000000..6922d1a1810a --- /dev/null +++ b/include/os/linux/zfs/sys/trace_common.h @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This file contains commonly used trace macros. 
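+ * As an illustration of how they compose (an editorial sketch, not part
+ * of the original file; the event-class name below is hypothetical), a
+ * consumer such as trace_arc.h can build a complete zio event class out
+ * of the ZIO_TP_* macros defined below:
+ *
+ *	DECLARE_EVENT_CLASS(zfs_example_zio_class,
+ *	    TP_PROTO(zio_t *zio),
+ *	    TP_ARGS(zio),
+ *	    TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY),
+ *	    TP_fast_assign(ZIO_TP_FAST_ASSIGN),
+ *	    TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS));
+ *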
Feel free to add and use + * them in your tracepoint headers. + */ + +#ifndef _SYS_TRACE_COMMON_H +#define _SYS_TRACE_COMMON_H +#include + +/* ZIO macros */ +#define ZIO_TP_STRUCT_ENTRY \ + __field(zio_type_t, zio_type) \ + __field(int, zio_cmd) \ + __field(zio_priority_t, zio_priority) \ + __field(uint64_t, zio_size) \ + __field(uint64_t, zio_orig_size) \ + __field(uint64_t, zio_offset) \ + __field(hrtime_t, zio_timestamp) \ + __field(hrtime_t, zio_delta) \ + __field(uint64_t, zio_delay) \ + __field(enum zio_flag, zio_flags) \ + __field(enum zio_stage, zio_stage) \ + __field(enum zio_stage, zio_pipeline) \ + __field(enum zio_flag, zio_orig_flags) \ + __field(enum zio_stage, zio_orig_stage) \ + __field(enum zio_stage, zio_orig_pipeline) \ + __field(uint8_t, zio_reexecute) \ + __field(uint64_t, zio_txg) \ + __field(int, zio_error) \ + __field(uint64_t, zio_ena) \ + \ + __field(enum zio_checksum, zp_checksum) \ + __field(enum zio_compress, zp_compress) \ + __field(dmu_object_type_t, zp_type) \ + __field(uint8_t, zp_level) \ + __field(uint8_t, zp_copies) \ + __field(boolean_t, zp_dedup) \ + __field(boolean_t, zp_dedup_verify) \ + __field(boolean_t, zp_nopwrite) + +#define ZIO_TP_FAST_ASSIGN \ + __entry->zio_type = zio->io_type; \ + __entry->zio_cmd = zio->io_cmd; \ + __entry->zio_priority = zio->io_priority; \ + __entry->zio_size = zio->io_size; \ + __entry->zio_orig_size = zio->io_orig_size; \ + __entry->zio_offset = zio->io_offset; \ + __entry->zio_timestamp = zio->io_timestamp; \ + __entry->zio_delta = zio->io_delta; \ + __entry->zio_delay = zio->io_delay; \ + __entry->zio_flags = zio->io_flags; \ + __entry->zio_stage = zio->io_stage; \ + __entry->zio_pipeline = zio->io_pipeline; \ + __entry->zio_orig_flags = zio->io_orig_flags; \ + __entry->zio_orig_stage = zio->io_orig_stage; \ + __entry->zio_orig_pipeline = zio->io_orig_pipeline; \ + __entry->zio_reexecute = zio->io_reexecute; \ + __entry->zio_txg = zio->io_txg; \ + __entry->zio_error = zio->io_error; \ + __entry->zio_ena = zio->io_ena; \ + \ + __entry->zp_checksum = zio->io_prop.zp_checksum; \ + __entry->zp_compress = zio->io_prop.zp_compress; \ + __entry->zp_type = zio->io_prop.zp_type; \ + __entry->zp_level = zio->io_prop.zp_level; \ + __entry->zp_copies = zio->io_prop.zp_copies; \ + __entry->zp_dedup = zio->io_prop.zp_dedup; \ + __entry->zp_nopwrite = zio->io_prop.zp_nopwrite; \ + __entry->zp_dedup_verify = zio->io_prop.zp_dedup_verify; + +#define ZIO_TP_PRINTK_FMT \ + "zio { type %u cmd %i prio %u size %llu orig_size %llu " \ + "offset %llu timestamp %llu delta %llu delay %llu " \ + "flags 0x%x stage 0x%x pipeline 0x%x orig_flags 0x%x " \ + "orig_stage 0x%x orig_pipeline 0x%x reexecute %u " \ + "txg %llu error %d ena %llu prop { checksum %u compress %u " \ + "type %u level %u copies %u dedup %u dedup_verify %u nopwrite %u } }" + +#define ZIO_TP_PRINTK_ARGS \ + __entry->zio_type, __entry->zio_cmd, __entry->zio_priority, \ + __entry->zio_size, __entry->zio_orig_size, __entry->zio_offset, \ + __entry->zio_timestamp, __entry->zio_delta, __entry->zio_delay, \ + __entry->zio_flags, __entry->zio_stage, __entry->zio_pipeline, \ + __entry->zio_orig_flags, __entry->zio_orig_stage, \ + __entry->zio_orig_pipeline, __entry->zio_reexecute, \ + __entry->zio_txg, __entry->zio_error, __entry->zio_ena, \ + __entry->zp_checksum, __entry->zp_compress, __entry->zp_type, \ + __entry->zp_level, __entry->zp_copies, __entry->zp_dedup, \ + __entry->zp_dedup_verify, __entry->zp_nopwrite + +#endif /* _SYS_TRACE_COMMON_H */ diff --git 
a/include/os/linux/zfs/sys/trace_dbgmsg.h b/include/os/linux/zfs/sys/trace_dbgmsg.h
new file mode 100644
index 000000000000..513918d004a4
--- /dev/null
+++ b/include/os/linux/zfs/sys/trace_dbgmsg.h
@@ -0,0 +1,89 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL)
+#if defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR zfs_dbgmsg
+
+#if !defined(_TRACE_DBGMSG_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_DBGMSG_H
+
+#include <linux/tracepoint.h>
+
+/*
+ * This file defines tracepoint events for use by the dbgmsg(),
+ * dprintf(), and SET_ERROR() interfaces. These are grouped here because
+ * they all provide a way to store simple messages in the debug log (as
+ * opposed to events used by the DTRACE_PROBE interfaces which typically
+ * dump structured data).
+ *
+ * This header is included inside the trace.h multiple inclusion guard,
+ * and it is guarded above against direct inclusion, so it need not
+ * be guarded separately.
+ */
+
+/*
+ * Generic support for one argument tracepoints of the form:
+ *
+ * DTRACE_PROBE1(...,
+ *     const char *, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_dprintf_class,
+	TP_PROTO(const char *msg),
+	TP_ARGS(msg),
+	TP_STRUCT__entry(
+	    __string(msg, msg)
+	),
+	TP_fast_assign(
+	    __assign_str(msg, msg);
+	),
+	TP_printk("%s", __get_str(msg))
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define DEFINE_DPRINTF_EVENT(name) \
+DEFINE_EVENT(zfs_dprintf_class, name, \
+	TP_PROTO(const char *msg), \
+	TP_ARGS(msg))
+/* END CSTYLED */
+DEFINE_DPRINTF_EVENT(zfs_zfs__dprintf);
+
+#endif /* _TRACE_DBGMSG_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace_dbgmsg
+#include <trace/define_trace.h>
+
+#else
+
+DEFINE_DTRACE_PROBE1(zfs__dprintf);
+
+#endif /* HAVE_DECLARE_EVENT_CLASS */
+#endif /* _KERNEL */
diff --git a/include/os/linux/zfs/sys/trace_dbuf.h b/include/os/linux/zfs/sys/trace_dbuf.h
new file mode 100644
index 000000000000..bd7d791a46e7
--- /dev/null
+++ b/include/os/linux/zfs/sys/trace_dbuf.h
@@ -0,0 +1,169 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_dbuf + +#if !defined(_TRACE_DBUF_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DBUF_H + +#include +#include + +#ifndef TRACE_DBUF_MSG_MAX +#define TRACE_DBUF_MSG_MAX 512 +#endif + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * dmu_buf_impl_t *, ..., + * zio_t *, ...); + */ + +#define DBUF_TP_STRUCT_ENTRY \ + __dynamic_array(char, os_spa, TRACE_DBUF_MSG_MAX) \ + __field(uint64_t, ds_object) \ + __field(uint64_t, db_object) \ + __field(uint64_t, db_level) \ + __field(uint64_t, db_blkid) \ + __field(uint64_t, db_offset) \ + __field(uint64_t, db_size) \ + __field(uint64_t, db_state) \ + __field(int64_t, db_holds) \ + __dynamic_array(char, msg, TRACE_DBUF_MSG_MAX) + +#define DBUF_TP_FAST_ASSIGN \ + if (db != NULL) { \ + __assign_str(os_spa, \ + spa_name(DB_DNODE(db)->dn_objset->os_spa)); \ + \ + __entry->ds_object = db->db_objset->os_dsl_dataset ? \ + db->db_objset->os_dsl_dataset->ds_object : 0; \ + \ + __entry->db_object = db->db.db_object; \ + __entry->db_level = db->db_level; \ + __entry->db_blkid = db->db_blkid; \ + __entry->db_offset = db->db.db_offset; \ + __entry->db_size = db->db.db_size; \ + __entry->db_state = db->db_state; \ + __entry->db_holds = zfs_refcount_count(&db->db_holds); \ + snprintf(__get_str(msg), TRACE_DBUF_MSG_MAX, \ + DBUF_TP_PRINTK_FMT, DBUF_TP_PRINTK_ARGS); \ + } else { \ + __assign_str(os_spa, "NULL") \ + __entry->ds_object = 0; \ + __entry->db_object = 0; \ + __entry->db_level = 0; \ + __entry->db_blkid = 0; \ + __entry->db_offset = 0; \ + __entry->db_size = 0; \ + __entry->db_state = 0; \ + __entry->db_holds = 0; \ + snprintf(__get_str(msg), TRACE_DBUF_MSG_MAX, \ + "dbuf { NULL }"); \ + } + +#define DBUF_TP_PRINTK_FMT \ + "dbuf { spa \"%s\" objset %llu object %llu level %llu " \ + "blkid %llu offset %llu size %llu state %llu holds %lld }" + +#define DBUF_TP_PRINTK_ARGS \ + __get_str(os_spa), __entry->ds_object, \ + __entry->db_object, __entry->db_level, \ + __entry->db_blkid, __entry->db_offset, \ + __entry->db_size, __entry->db_state, __entry->db_holds + +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_dbuf_class, + TP_PROTO(dmu_buf_impl_t *db, zio_t *zio), + TP_ARGS(db, zio), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk("%s", __get_str(msg)) +); + +DECLARE_EVENT_CLASS(zfs_dbuf_state_class, + TP_PROTO(dmu_buf_impl_t *db, const char *why), + TP_ARGS(db, why), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk("%s", __get_str(msg)) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_DBUF_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, zio_t *zio), \ + TP_ARGS(db, zio)) +/* END CSTYLED */ +DEFINE_DBUF_EVENT(zfs_blocked__read); + +/* BEGIN CSTYLED */ +#define DEFINE_DBUF_STATE_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_state_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, const char *why), \ + TP_ARGS(db, why)) +/* END CSTYLED */ +DEFINE_DBUF_STATE_EVENT(zfs_dbuf__state_change); + +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_dbuf_evict_one_class, + TP_PROTO(dmu_buf_impl_t *db, 
multilist_sublist_t *mls), + TP_ARGS(db, mls), + TP_STRUCT__entry(DBUF_TP_STRUCT_ENTRY), + TP_fast_assign(DBUF_TP_FAST_ASSIGN), + TP_printk("%s", __get_str(msg)) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_DBUF_EVICT_ONE_EVENT(name) \ +DEFINE_EVENT(zfs_dbuf_evict_one_class, name, \ + TP_PROTO(dmu_buf_impl_t *db, multilist_sublist_t *mls), \ + TP_ARGS(db, mls)) +/* END CSTYLED */ +DEFINE_DBUF_EVICT_ONE_EVENT(zfs_dbuf__evict__one); + +#endif /* _TRACE_DBUF_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_dbuf +#include + +#else + +DEFINE_DTRACE_PROBE2(blocked__read); +DEFINE_DTRACE_PROBE2(dbuf__evict__one); +DEFINE_DTRACE_PROBE2(dbuf__state_change); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_dmu.h b/include/os/linux/zfs/sys/trace_dmu.h new file mode 100644 index 000000000000..3c64a370f842 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_dmu.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_dmu + +#if !defined(_TRACE_DMU_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DMU_H + +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * dmu_tx_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_delay_mintime_class, + TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), + TP_ARGS(tx, dirty, min_tx_time), + TP_STRUCT__entry( + __field(uint64_t, tx_txg) + __field(uint64_t, tx_lastsnap_txg) + __field(uint64_t, tx_lasttried_txg) + __field(boolean_t, tx_anyobj) + __field(boolean_t, tx_dirty_delayed) + __field(hrtime_t, tx_start) + __field(boolean_t, tx_wait_dirty) + __field(int, tx_err) + __field(uint64_t, min_tx_time) + __field(uint64_t, dirty) + ), + TP_fast_assign( + __entry->tx_txg = tx->tx_txg; + __entry->tx_lastsnap_txg = tx->tx_lastsnap_txg; + __entry->tx_lasttried_txg = tx->tx_lasttried_txg; + __entry->tx_anyobj = tx->tx_anyobj; + __entry->tx_dirty_delayed = tx->tx_dirty_delayed; + __entry->tx_start = tx->tx_start; + __entry->tx_wait_dirty = tx->tx_wait_dirty; + __entry->tx_err = tx->tx_err; + __entry->dirty = dirty; + __entry->min_tx_time = min_tx_time; + ), + TP_printk("tx { txg %llu lastsnap_txg %llu tx_lasttried_txg %llu " + "anyobj %d dirty_delayed %d start %llu wait_dirty %d err %i " + "} dirty %llu min_tx_time %llu", + __entry->tx_txg, __entry->tx_lastsnap_txg, + __entry->tx_lasttried_txg, __entry->tx_anyobj, + __entry->tx_dirty_delayed, 
__entry->tx_start, + __entry->tx_wait_dirty, __entry->tx_err, + __entry->dirty, __entry->min_tx_time) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_DELAY_MINTIME_EVENT(name) \ +DEFINE_EVENT(zfs_delay_mintime_class, name, \ + TP_PROTO(dmu_tx_t *tx, uint64_t dirty, uint64_t min_tx_time), \ + TP_ARGS(tx, dirty, min_tx_time)) +/* END CSTYLED */ +DEFINE_DELAY_MINTIME_EVENT(zfs_delay__mintime); + +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_free_long_range_class, + TP_PROTO(uint64_t long_free_dirty_all_txgs, uint64_t chunk_len, \ + uint64_t txg), + TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg), + TP_STRUCT__entry( + __field(uint64_t, long_free_dirty_all_txgs) + __field(uint64_t, chunk_len) + __field(uint64_t, txg) + ), + TP_fast_assign( + __entry->long_free_dirty_all_txgs = long_free_dirty_all_txgs; + __entry->chunk_len = chunk_len; + __entry->txg = txg; + ), + TP_printk("long_free_dirty_all_txgs %llu chunk_len %llu txg %llu", + __entry->long_free_dirty_all_txgs, + __entry->chunk_len, __entry->txg) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_FREE_LONG_RANGE_EVENT(name) \ +DEFINE_EVENT(zfs_free_long_range_class, name, \ + TP_PROTO(uint64_t long_free_dirty_all_txgs, \ + uint64_t chunk_len, uint64_t txg), \ + TP_ARGS(long_free_dirty_all_txgs, chunk_len, txg)) +/* END CSTYLED */ +DEFINE_FREE_LONG_RANGE_EVENT(zfs_free__long__range); + +#endif /* _TRACE_DMU_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_dmu +#include + +#else + +DEFINE_DTRACE_PROBE3(delay__mintime); +DEFINE_DTRACE_PROBE3(free__long__range); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_dnode.h b/include/os/linux/zfs/sys/trace_dnode.h new file mode 100644 index 000000000000..27ad6cba16d6 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_dnode.h @@ -0,0 +1,129 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_dnode + +#if !defined(_TRACE_DNODE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_DNODE_H + +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * dnode_t *, ..., + * int64_t, ..., + * uint32_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_dnode_move_class, + TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), + TP_ARGS(dn, refcount, dbufs), + TP_STRUCT__entry( + __field(uint64_t, dn_object) + __field(dmu_object_type_t, dn_type) + __field(uint16_t, dn_bonuslen) + __field(uint8_t, dn_bonustype) + __field(uint8_t, dn_nblkptr) + __field(uint8_t, dn_checksum) + __field(uint8_t, dn_compress) + __field(uint8_t, dn_nlevels) + __field(uint8_t, dn_indblkshift) + __field(uint8_t, dn_datablkshift) + __field(uint8_t, dn_moved) + __field(uint16_t, dn_datablkszsec) + __field(uint32_t, dn_datablksz) + __field(uint64_t, dn_maxblkid) + __field(int64_t, dn_tx_holds) + __field(int64_t, dn_holds) + __field(boolean_t, dn_have_spill) + + __field(int64_t, refcount) + __field(uint32_t, dbufs) + ), + TP_fast_assign( + __entry->dn_object = dn->dn_object; + __entry->dn_type = dn->dn_type; + __entry->dn_bonuslen = dn->dn_bonuslen; + __entry->dn_bonustype = dn->dn_bonustype; + __entry->dn_nblkptr = dn->dn_nblkptr; + __entry->dn_checksum = dn->dn_checksum; + __entry->dn_compress = dn->dn_compress; + __entry->dn_nlevels = dn->dn_nlevels; + __entry->dn_indblkshift = dn->dn_indblkshift; + __entry->dn_datablkshift = dn->dn_datablkshift; + __entry->dn_moved = dn->dn_moved; + __entry->dn_datablkszsec = dn->dn_datablkszsec; + __entry->dn_datablksz = dn->dn_datablksz; + __entry->dn_maxblkid = dn->dn_maxblkid; + __entry->dn_tx_holds = dn->dn_tx_holds.rc_count; + __entry->dn_holds = dn->dn_holds.rc_count; + __entry->dn_have_spill = dn->dn_have_spill; + + __entry->refcount = refcount; + __entry->dbufs = dbufs; + ), + TP_printk("dn { object %llu type %d bonuslen %u bonustype %u " + "nblkptr %u checksum %u compress %u nlevels %u indblkshift %u " + "datablkshift %u moved %u datablkszsec %u datablksz %u " + "maxblkid %llu tx_holds %lli holds %lli have_spill %d } " + "refcount %lli dbufs %u", + __entry->dn_object, __entry->dn_type, __entry->dn_bonuslen, + __entry->dn_bonustype, __entry->dn_nblkptr, __entry->dn_checksum, + __entry->dn_compress, __entry->dn_nlevels, __entry->dn_indblkshift, + __entry->dn_datablkshift, __entry->dn_moved, + __entry->dn_datablkszsec, __entry->dn_datablksz, + __entry->dn_maxblkid, __entry->dn_tx_holds, __entry->dn_holds, + __entry->dn_have_spill, __entry->refcount, __entry->dbufs) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_DNODE_MOVE_EVENT(name) \ +DEFINE_EVENT(zfs_dnode_move_class, name, \ + TP_PROTO(dnode_t *dn, int64_t refcount, uint32_t dbufs), \ + TP_ARGS(dn, refcount, dbufs)) +/* END CSTYLED */ +DEFINE_DNODE_MOVE_EVENT(zfs_dnode__move); + +#endif /* _TRACE_DNODE_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_dnode +#include + +#else + +DEFINE_DTRACE_PROBE3(dnode__move); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ 
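+
+/*
+ * Editorial sketch (not part of the original header): the dnode__move
+ * event above is fired through the generic DTRACE_PROBE3() wrapper
+ * documented in the comment at the top of this file, along the lines of:
+ *
+ *	DTRACE_PROBE3(dnode__move, dnode_t *, dn,
+ *	    int64_t, refcount, uint32_t, dbufs);
+ *
+ * When tracepoints are available this maps to trace_zfs_dnode__move();
+ * otherwise it resolves to the stub generated by DEFINE_DTRACE_PROBE3()
+ * (see include/os/linux/spl/sys/trace.h for the mapping).
+ */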
+#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_multilist.h b/include/os/linux/zfs/sys/trace_multilist.h new file mode 100644 index 000000000000..fe68d5296f73 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_multilist.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_multilist + +#if !defined(_TRACE_MULTILIST_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_MULTILIST_H + +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * multilist_t *, ..., + * unsigned int, ..., + * void *, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_multilist_insert_remove_class, + TP_PROTO(multilist_t *ml, unsigned sublist_idx, void *obj), + TP_ARGS(ml, sublist_idx, obj), + TP_STRUCT__entry( + __field(size_t, ml_offset) + __field(uint64_t, ml_num_sublists) + + __field(unsigned int, sublist_idx) + ), + TP_fast_assign( + __entry->ml_offset = ml->ml_offset; + __entry->ml_num_sublists = ml->ml_num_sublists; + + __entry->sublist_idx = sublist_idx; + ), + TP_printk("ml { offset %ld numsublists %llu sublistidx %u } ", + __entry->ml_offset, __entry->ml_num_sublists, __entry->sublist_idx) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_MULTILIST_INSERT_REMOVE_EVENT(name) \ +DEFINE_EVENT(zfs_multilist_insert_remove_class, name, \ + TP_PROTO(multilist_t *ml, unsigned int sublist_idx, void *obj), \ + TP_ARGS(ml, sublist_idx, obj)) +/* END CSTYLED */ +DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__insert); +DEFINE_MULTILIST_INSERT_REMOVE_EVENT(zfs_multilist__remove); + +#endif /* _TRACE_MULTILIST_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_multilist +#include + +#else + +DEFINE_DTRACE_PROBE3(multilist__insert); +DEFINE_DTRACE_PROBE3(multilist__remove); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_rrwlock.h b/include/os/linux/zfs/sys/trace_rrwlock.h new file mode 100644 index 000000000000..4c74d6257309 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_rrwlock.h @@ -0,0 +1,31 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#else + +DEFINE_DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +DEFINE_DTRACE_PROBE(zfs__rrwfastpath__exitmiss); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_txg.h b/include/os/linux/zfs/sys/trace_txg.h new file mode 100644 index 000000000000..23d5d358bc42 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_txg.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_txg + +#if !defined(_TRACE_TXG_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_TXG_H + +#include +#include + +/* + * Generic support for two argument tracepoints of the form: + * + * DTRACE_PROBE2(..., + * dsl_pool_t *, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_txg_class, + TP_PROTO(dsl_pool_t *dp, uint64_t txg), + TP_ARGS(dp, txg), + TP_STRUCT__entry( + __field(uint64_t, txg) + ), + TP_fast_assign( + __entry->txg = txg; + ), + TP_printk("txg %llu", __entry->txg) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_TXG_EVENT(name) \ +DEFINE_EVENT(zfs_txg_class, name, \ + TP_PROTO(dsl_pool_t *dp, uint64_t txg), \ + TP_ARGS(dp, txg)) +/* END CSTYLED */ +DEFINE_TXG_EVENT(zfs_dsl_pool_sync__done); +DEFINE_TXG_EVENT(zfs_txg__quiescing); +DEFINE_TXG_EVENT(zfs_txg__opened); +DEFINE_TXG_EVENT(zfs_txg__syncing); +DEFINE_TXG_EVENT(zfs_txg__synced); +DEFINE_TXG_EVENT(zfs_txg__quiesced); + +#endif /* _TRACE_TXG_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_txg +#include + +#else + +DEFINE_DTRACE_PROBE2(dsl_pool_sync__done); +DEFINE_DTRACE_PROBE2(txg__quiescing); +DEFINE_DTRACE_PROBE2(txg__opened); +DEFINE_DTRACE_PROBE2(txg__syncing); +DEFINE_DTRACE_PROBE2(txg__synced); +DEFINE_DTRACE_PROBE2(txg__quiesced); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_vdev.h b/include/os/linux/zfs/sys/trace_vdev.h new file mode 100644 index 000000000000..50711446ffa4 --- /dev/null +++ 
b/include/os/linux/zfs/sys/trace_vdev.h @@ -0,0 +1,140 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) +#if defined(HAVE_DECLARE_EVENT_CLASS) + +/* + * If tracepoints are available define dtrace_probe events for vdev + * related probes. Definitions in include/os/linux/spl/sys/trace.h + * will map DTRACE_PROBE* calls to tracepoints. + */ + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM zfs + +#undef TRACE_SYSTEM_VAR +#define TRACE_SYSTEM_VAR zfs_vdev + +#if !defined(_TRACE_VDEV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_VDEV_H + +#include +#include + +/* + * Generic support for three argument tracepoints of the form: + * + * DTRACE_PROBE3(..., + * spa_t *, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_removing_class_3, + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size), + TP_ARGS(spa, offset, size), + TP_STRUCT__entry( + __field(spa_t *, vdev_spa) + __field(uint64_t, vdev_offset) + __field(uint64_t, vdev_size) + ), + TP_fast_assign( + __entry->vdev_spa = spa; + __entry->vdev_offset = offset; + __entry->vdev_size = size; + ), + TP_printk("spa %p offset %llu size %llu", + __entry->vdev_spa, __entry->vdev_offset, + __entry->vdev_size) +); +/* END CSTYLED */ + +/* BEGIN CSTYLED */ +#define DEFINE_REMOVE_FREE_EVENT(name) \ +DEFINE_EVENT(zfs_removing_class_3, name, \ + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size), \ + TP_ARGS(spa, offset, size)) +/* END CSTYLED */ +DEFINE_REMOVE_FREE_EVENT(zfs_remove__free__synced); +DEFINE_REMOVE_FREE_EVENT(zfs_remove__free__unvisited); + +/* + * Generic support for four argument tracepoints of the form: + * + * DTRACE_PROBE4(..., + * spa_t *, ..., + * uint64_t, ..., + * uint64_t, ..., + * uint64_t, ...); + */ +/* BEGIN CSTYLED */ +DECLARE_EVENT_CLASS(zfs_removing_class_4, + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size, uint64_t txg), + TP_ARGS(spa, offset, size, txg), + TP_STRUCT__entry( + __field(spa_t *, vdev_spa) + __field(uint64_t, vdev_offset) + __field(uint64_t, vdev_size) + __field(uint64_t, vdev_txg) + ), + TP_fast_assign( + __entry->vdev_spa = spa; + __entry->vdev_offset = offset; + __entry->vdev_size = size; + __entry->vdev_txg = txg; + ), + TP_printk("spa %p offset %llu size %llu txg %llu", + __entry->vdev_spa, __entry->vdev_offset, + __entry->vdev_size, __entry->vdev_txg) +); + +/* BEGIN CSTYLED */ +#define DEFINE_REMOVE_FREE_EVENT_TXG(name) \ +DEFINE_EVENT(zfs_removing_class_4, name, \ + TP_PROTO(spa_t *spa, uint64_t offset, uint64_t size,uint64_t txg), \ + TP_ARGS(spa, offset, size, txg)) +/* END CSTYLED */ +DEFINE_REMOVE_FREE_EVENT_TXG(zfs_remove__free__inflight); + +#endif /* _TRACE_VDEV_H */ + +#undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE +#define 
TRACE_INCLUDE_PATH sys +#define TRACE_INCLUDE_FILE trace_vdev +#include + +#else + +/* + * When tracepoints are not available, a DEFINE_DTRACE_PROBE* macro is + * needed for each DTRACE_PROBE. These will be used to generate stub + * tracing functions and prototypes for those functions. See + * include/os/linux/spl/sys/trace.h. + */ + +DEFINE_DTRACE_PROBE3(remove__free__synced); +DEFINE_DTRACE_PROBE3(remove__free__unvisited); +DEFINE_DTRACE_PROBE4(remove__free__inflight); + +#endif /* HAVE_DECLARE_EVENT_CLASS */ +#endif /* _KERNEL */ diff --git a/include/os/linux/zfs/sys/trace_zfs.h b/include/os/linux/zfs/sys/trace_zfs.h new file mode 100644 index 000000000000..0e19f8d186d0 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_zfs.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _OS_LINUX_ZFS_TRACE_H +#define _OS_LINUX_ZFS_TRACE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h new file mode 100644 index 000000000000..526846e664b2 --- /dev/null +++ b/include/os/linux/zfs/sys/trace_zil.h @@ -0,0 +1,229 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+#endif
diff --git a/include/os/linux/zfs/sys/trace_zil.h b/include/os/linux/zfs/sys/trace_zil.h
new file mode 100644
index 000000000000..526846e664b2
--- /dev/null
+++ b/include/os/linux/zfs/sys/trace_zil.h
@@ -0,0 +1,229 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL)
+#if defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR zfs_zil
+
+#if !defined(_TRACE_ZIL_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ZIL_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+
+#define ZILOG_TP_STRUCT_ENTRY \
+    __field(uint64_t, zl_lr_seq) \
+    __field(uint64_t, zl_commit_lr_seq) \
+    __field(uint64_t, zl_destroy_txg) \
+    __field(uint64_t, zl_replaying_seq) \
+    __field(uint32_t, zl_suspend) \
+    __field(uint8_t, zl_suspending) \
+    __field(uint8_t, zl_keep_first) \
+    __field(uint8_t, zl_replay) \
+    __field(uint8_t, zl_stop_sync) \
+    __field(uint8_t, zl_logbias) \
+    __field(uint8_t, zl_sync) \
+    __field(int, zl_parse_error) \
+    __field(uint64_t, zl_parse_blk_seq) \
+    __field(uint64_t, zl_parse_lr_seq) \
+    __field(uint64_t, zl_parse_blk_count) \
+    __field(uint64_t, zl_parse_lr_count) \
+    __field(uint64_t, zl_cur_used) \
+    __field(clock_t, zl_replay_time) \
+    __field(uint64_t, zl_replay_blks)
+
+#define ZILOG_TP_FAST_ASSIGN \
+    __entry->zl_lr_seq = zilog->zl_lr_seq; \
+    __entry->zl_commit_lr_seq = zilog->zl_commit_lr_seq; \
+    __entry->zl_destroy_txg = zilog->zl_destroy_txg; \
+    __entry->zl_replaying_seq = zilog->zl_replaying_seq; \
+    __entry->zl_suspend = zilog->zl_suspend; \
+    __entry->zl_suspending = zilog->zl_suspending; \
+    __entry->zl_keep_first = zilog->zl_keep_first; \
+    __entry->zl_replay = zilog->zl_replay; \
+    __entry->zl_stop_sync = zilog->zl_stop_sync; \
+    __entry->zl_logbias = zilog->zl_logbias; \
+    __entry->zl_sync = zilog->zl_sync; \
+    __entry->zl_parse_error = zilog->zl_parse_error; \
+    __entry->zl_parse_blk_seq = zilog->zl_parse_blk_seq; \
+    __entry->zl_parse_lr_seq = zilog->zl_parse_lr_seq; \
+    __entry->zl_parse_blk_count = zilog->zl_parse_blk_count; \
+    __entry->zl_parse_lr_count = zilog->zl_parse_lr_count; \
+    __entry->zl_cur_used = zilog->zl_cur_used; \
+    __entry->zl_replay_time = zilog->zl_replay_time; \
+    __entry->zl_replay_blks = zilog->zl_replay_blks;
+
+#define ZILOG_TP_PRINTK_FMT \
+    "zl { lr_seq %llu commit_lr_seq %llu destroy_txg %llu " \
+    "replaying_seq %llu suspend %u suspending %u keep_first %u " \
+    "replay %u stop_sync %u logbias %u sync %u " \
+    "parse_error %u parse_blk_seq %llu parse_lr_seq %llu " \
+    "parse_blk_count %llu parse_lr_count %llu " \
+    "cur_used %llu replay_time %lu replay_blks %llu }"
+
+#define ZILOG_TP_PRINTK_ARGS \
+    __entry->zl_lr_seq, __entry->zl_commit_lr_seq, \
+    __entry->zl_destroy_txg, __entry->zl_replaying_seq, \
+    __entry->zl_suspend, __entry->zl_suspending, \
+    __entry->zl_keep_first, __entry->zl_replay, \
+    __entry->zl_stop_sync, __entry->zl_logbias, __entry->zl_sync, \
+    __entry->zl_parse_error, __entry->zl_parse_blk_seq, \
+    __entry->zl_parse_lr_seq, __entry->zl_parse_blk_count, \
+    __entry->zl_parse_lr_count, __entry->zl_cur_used, \
+    __entry->zl_replay_time, __entry->zl_replay_blks
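+
+/*
+ * The ZILOG_TP_* fragments above (and the ITX_TP_*/ZCW_TP_* fragments
+ * below) are spliced directly into the DECLARE_EVENT_CLASS() bodies
+ * later in this file, e.g.
+ *
+ *     TP_STRUCT__entry(ZILOG_TP_STRUCT_ENTRY ITX_TP_STRUCT_ENTRY)
+ *
+ * so every zilog-related event class records the same zilog_t snapshot
+ * without repeating the field list.
+ */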
+
+#define ITX_TP_STRUCT_ENTRY \
+    __field(itx_wr_state_t, itx_wr_state) \
+    __field(uint8_t, itx_sync) \
+    __field(zil_callback_t, itx_callback) \
+    __field(void *, itx_callback_data) \
+    __field(uint64_t, itx_oid) \
+    \
+    __field(uint64_t, lrc_txtype) \
+    __field(uint64_t, lrc_reclen) \
+    __field(uint64_t, lrc_txg) \
+    __field(uint64_t, lrc_seq)
+
+#define ITX_TP_FAST_ASSIGN \
+    __entry->itx_wr_state = itx->itx_wr_state; \
+    __entry->itx_sync = itx->itx_sync; \
+    __entry->itx_callback = itx->itx_callback; \
+    __entry->itx_callback_data = itx->itx_callback_data; \
+    __entry->itx_oid = itx->itx_oid; \
+    \
+    __entry->lrc_txtype = itx->itx_lr.lrc_txtype; \
+    __entry->lrc_reclen = itx->itx_lr.lrc_reclen; \
+    __entry->lrc_txg = itx->itx_lr.lrc_txg; \
+    __entry->lrc_seq = itx->itx_lr.lrc_seq;
+
+#define ITX_TP_PRINTK_FMT \
+    "itx { wr_state %u sync %u callback %p callback_data %p oid %llu" \
+    " { txtype %llu reclen %llu txg %llu seq %llu } }"
+
+#define ITX_TP_PRINTK_ARGS \
+    __entry->itx_wr_state, __entry->itx_sync, __entry->itx_callback, \
+    __entry->itx_callback_data, __entry->itx_oid, \
+    __entry->lrc_txtype, __entry->lrc_reclen, __entry->lrc_txg, \
+    __entry->lrc_seq
+
+#define ZCW_TP_STRUCT_ENTRY \
+    __field(lwb_t *, zcw_lwb) \
+    __field(boolean_t, zcw_done) \
+    __field(int, zcw_zio_error)
+
+#define ZCW_TP_FAST_ASSIGN \
+    __entry->zcw_lwb = zcw->zcw_lwb; \
+    __entry->zcw_done = zcw->zcw_done; \
+    __entry->zcw_zio_error = zcw->zcw_zio_error;
+
+#define ZCW_TP_PRINTK_FMT \
+    "zcw { lwb %p done %u error %u }"
+
+#define ZCW_TP_PRINTK_ARGS \
+    __entry->zcw_lwb, __entry->zcw_done, __entry->zcw_zio_error
+
+/*
+ * Generic support for two argument tracepoints of the form:
+ *
+ * DTRACE_PROBE2(...,
+ *     zilog_t *, ...,
+ *     itx_t *, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_zil_process_itx_class,
+    TP_PROTO(zilog_t *zilog, itx_t *itx),
+    TP_ARGS(zilog, itx),
+    TP_STRUCT__entry(
+        ZILOG_TP_STRUCT_ENTRY
+        ITX_TP_STRUCT_ENTRY
+    ),
+    TP_fast_assign(
+        ZILOG_TP_FAST_ASSIGN
+        ITX_TP_FAST_ASSIGN
+    ),
+    TP_printk(
+        ZILOG_TP_PRINTK_FMT " " ITX_TP_PRINTK_FMT,
+        ZILOG_TP_PRINTK_ARGS, ITX_TP_PRINTK_ARGS)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define DEFINE_ZIL_PROCESS_ITX_EVENT(name) \
+DEFINE_EVENT(zfs_zil_process_itx_class, name, \
+    TP_PROTO(zilog_t *zilog, itx_t *itx), \
+    TP_ARGS(zilog, itx))
+DEFINE_ZIL_PROCESS_ITX_EVENT(zfs_zil__process__commit__itx);
+DEFINE_ZIL_PROCESS_ITX_EVENT(zfs_zil__process__normal__itx);
+/* END CSTYLED */
+
+/*
+ * Generic support for two argument tracepoints of the form:
+ *
+ * DTRACE_PROBE2(...,
+ *     zilog_t *, ...,
+ *     zil_commit_waiter_t *, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_zil_commit_io_error_class,
+    TP_PROTO(zilog_t *zilog, zil_commit_waiter_t *zcw),
+    TP_ARGS(zilog, zcw),
+    TP_STRUCT__entry(
+        ZILOG_TP_STRUCT_ENTRY
+        ZCW_TP_STRUCT_ENTRY
+    ),
+    TP_fast_assign(
+        ZILOG_TP_FAST_ASSIGN
+        ZCW_TP_FAST_ASSIGN
+    ),
+    TP_printk(
+        ZILOG_TP_PRINTK_FMT " " ZCW_TP_PRINTK_FMT,
+        ZILOG_TP_PRINTK_ARGS, ZCW_TP_PRINTK_ARGS)
+);
+/* END CSTYLED */
+
+/* BEGIN CSTYLED */
+#define DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(name) \
+DEFINE_EVENT(zfs_zil_commit_io_error_class, name, \
+    TP_PROTO(zilog_t *zilog, zil_commit_waiter_t *zcw), \
+    TP_ARGS(zilog, zcw))
+DEFINE_ZIL_COMMIT_IO_ERROR_EVENT(zfs_zil__commit__io__error);
+/* END CSTYLED */
+
+#endif /* _TRACE_ZIL_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace_zil
+#include <trace/define_trace.h>
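+
+/*
+ * Usage sketch (assumes tracefs is mounted at the usual
+ * /sys/kernel/debug/tracing): with TRACE_SYSTEM set to "zfs", the
+ * events above surface as, e.g.,
+ *
+ *     events/zfs/zfs_zil__process__commit__itx/enable
+ *     events/zfs/zfs_zil__commit__io__error/enable
+ *
+ * and can be switched on by writing 1 to those files.
+ */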
+
+#else
+
+DEFINE_DTRACE_PROBE2(zil__process__commit__itx);
+DEFINE_DTRACE_PROBE2(zil__process__normal__itx);
+DEFINE_DTRACE_PROBE2(zil__commit__io__error);
+
+#endif /* HAVE_DECLARE_EVENT_CLASS */
+#endif /* _KERNEL */
diff --git a/include/os/linux/zfs/sys/trace_zio.h b/include/os/linux/zfs/sys/trace_zio.h
new file mode 100644
index 000000000000..8655e245c043
--- /dev/null
+++ b/include/os/linux/zfs/sys/trace_zio.h
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include
+
+#if defined(_KERNEL)
+#if defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR zfs_zio
+
+#if !defined(_TRACE_ZIO_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ZIO_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+#include <sys/trace_common.h> /* For ZIO macros */
+
+/* BEGIN CSTYLED */
+TRACE_EVENT(zfs_zio__delay__miss,
+    TP_PROTO(zio_t *zio, hrtime_t now),
+    TP_ARGS(zio, now),
+    TP_STRUCT__entry(
+        ZIO_TP_STRUCT_ENTRY
+        __field(hrtime_t, now)
+    ),
+    TP_fast_assign(
+        ZIO_TP_FAST_ASSIGN
+        __entry->now = now;
+    ),
+    TP_printk("now %llu " ZIO_TP_PRINTK_FMT, __entry->now,
+        ZIO_TP_PRINTK_ARGS)
+);
+
+TRACE_EVENT(zfs_zio__delay__hit,
+    TP_PROTO(zio_t *zio, hrtime_t now, hrtime_t diff),
+    TP_ARGS(zio, now, diff),
+    TP_STRUCT__entry(
+        ZIO_TP_STRUCT_ENTRY
+        __field(hrtime_t, now)
+        __field(hrtime_t, diff)
+    ),
+    TP_fast_assign(
+        ZIO_TP_FAST_ASSIGN
+        __entry->now = now;
+        __entry->diff = diff;
+    ),
+    TP_printk("now %llu diff %llu " ZIO_TP_PRINTK_FMT, __entry->now,
+        __entry->diff, ZIO_TP_PRINTK_ARGS)
+);
+
+TRACE_EVENT(zfs_zio__delay__skip,
+    TP_PROTO(zio_t *zio),
+    TP_ARGS(zio),
+    TP_STRUCT__entry(ZIO_TP_STRUCT_ENTRY),
+    TP_fast_assign(ZIO_TP_FAST_ASSIGN),
+    TP_printk(ZIO_TP_PRINTK_FMT, ZIO_TP_PRINTK_ARGS)
+);
+/* END CSTYLED */
+
+#endif /* _TRACE_ZIO_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace_zio
+#include <trace/define_trace.h>
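+
+/*
+ * Note: the ZIO_TP_STRUCT_ENTRY / ZIO_TP_FAST_ASSIGN / ZIO_TP_PRINTK_*
+ * macros used by all three events are shared ZIO fragments pulled in
+ * above (see the "For ZIO macros" include), so each event records a
+ * full zio_t snapshot plus its event-specific fields (now, diff).
+ */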
+
+#else
+
+DEFINE_DTRACE_PROBE2(zio__delay__miss);
+DEFINE_DTRACE_PROBE3(zio__delay__hit);
+DEFINE_DTRACE_PROBE1(zio__delay__skip);
+
+#endif /* HAVE_DECLARE_EVENT_CLASS */
+#endif /* _KERNEL */
diff --git a/include/os/linux/zfs/sys/trace_zrlock.h b/include/os/linux/zfs/sys/trace_zrlock.h
new file mode 100644
index 000000000000..23f9577ba15f
--- /dev/null
+++ b/include/os/linux/zfs/sys/trace_zrlock.h
@@ -0,0 +1,94 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#if defined(_KERNEL)
+#if defined(HAVE_DECLARE_EVENT_CLASS)
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM zfs
+
+#undef TRACE_SYSTEM_VAR
+#define TRACE_SYSTEM_VAR zfs_zrlock
+
+#if !defined(_TRACE_ZRLOCK_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_ZRLOCK_H
+
+#include <linux/tracepoint.h>
+#include <sys/types.h>
+
+/*
+ * Generic support for three argument tracepoints of the form:
+ *
+ * DTRACE_PROBE3(...,
+ *     zrlock_t *, ...,
+ *     kthread_t *, ...,
+ *     uint32_t, ...);
+ */
+/* BEGIN CSTYLED */
+DECLARE_EVENT_CLASS(zfs_zrlock_class,
+    TP_PROTO(zrlock_t *zrl, kthread_t *owner, uint32_t n),
+    TP_ARGS(zrl, owner, n),
+    TP_STRUCT__entry(
+        __field(int32_t, refcount)
+#ifdef ZFS_DEBUG
+        __field(pid_t, owner_pid)
+        __field(const char *, caller)
+#endif
+        __field(uint32_t, n)
+    ),
+    TP_fast_assign(
+        __entry->refcount = zrl->zr_refcount;
+#ifdef ZFS_DEBUG
+        __entry->owner_pid = owner ? owner->pid : 0;
+        __entry->caller = zrl->zr_caller ? zrl->zr_caller : "(null)";
+#endif
+        __entry->n = n;
+    ),
+#ifdef ZFS_DEBUG
+    TP_printk("zrl { refcount %d owner_pid %d caller %s } n %u",
+        __entry->refcount, __entry->owner_pid, __entry->caller,
+        __entry->n)
+#else
+    TP_printk("zrl { refcount %d } n %u",
+        __entry->refcount, __entry->n)
+#endif
+);
+/* END CSTYLED */
+
+#define DEFINE_ZRLOCK_EVENT(name) \
+DEFINE_EVENT(zfs_zrlock_class, name, \
+    TP_PROTO(zrlock_t *zrl, kthread_t *owner, uint32_t n), \
+    TP_ARGS(zrl, owner, n))
+DEFINE_ZRLOCK_EVENT(zfs_zrlock__reentry);
+
+#endif /* _TRACE_ZRLOCK_H */
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+#define TRACE_INCLUDE_PATH sys
+#define TRACE_INCLUDE_FILE trace_zrlock
+#include <trace/define_trace.h>
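+
+/*
+ * Note that the layout of this event differs between build types: the
+ * owner_pid and caller fields exist only when ZFS_DEBUG is defined, so
+ * debug and non-debug kernels print different record formats (see the
+ * two TP_printk() variants above).
+ */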
+
+#else
+
+DEFINE_DTRACE_PROBE3(zrlock__reentry);
+
+#endif /* HAVE_DECLARE_EVENT_CLASS */
+#endif /* _KERNEL */
diff --git a/include/os/linux/zfs/sys/zfs_context_os.h b/include/os/linux/zfs/sys/zfs_context_os.h
new file mode 100644
index 000000000000..9e5fdd79f019
--- /dev/null
+++ b/include/os/linux/zfs/sys/zfs_context_os.h
@@ -0,0 +1,30 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#ifndef ZFS_CONTEXT_OS_H
+#define ZFS_CONTEXT_OS_H
+
+#include
+#include
+#include
+
+#endif
diff --git a/include/os/linux/zfs/sys/zfs_ctldir.h b/include/os/linux/zfs/sys/zfs_ctldir.h
new file mode 100644
index 000000000000..51933bc4fe47
--- /dev/null
+++ b/include/os/linux/zfs/sys/zfs_ctldir.h
@@ -0,0 +1,103 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *     Rohan Puri
+ *     Brian Behlendorf
+ */
+
+#ifndef _ZFS_CTLDIR_H
+#define _ZFS_CTLDIR_H
+
+#include
+#include
+#include
+#include
+
+#define ZFS_CTLDIR_NAME ".zfs"
+#define ZFS_SNAPDIR_NAME "snapshot"
+#define ZFS_SHAREDIR_NAME "shares"
+
+#define zfs_has_ctldir(zdp) \
+    ((zdp)->z_id == ZTOZSB(zdp)->z_root && \
+    (ZTOZSB(zdp)->z_ctldir != NULL))
+#define zfs_show_ctldir(zdp) \
+    (zfs_has_ctldir(zdp) && \
+    (ZTOZSB(zdp)->z_show_ctldir))
+
+extern int zfs_expire_snapshot;
+
+/* zfsctl generic functions */
+extern int zfsctl_create(zfsvfs_t *);
+extern void zfsctl_destroy(zfsvfs_t *);
+extern struct inode *zfsctl_root(znode_t *);
+extern void zfsctl_init(void);
+extern void zfsctl_fini(void);
+extern boolean_t zfsctl_is_node(struct inode *ip);
+extern boolean_t zfsctl_is_snapdir(struct inode *ip);
+extern int zfsctl_fid(struct inode *ip, fid_t *fidp);
+
+/* zfsctl '.zfs' functions */
+extern int zfsctl_root_lookup(struct inode *dip, char *name,
+    struct inode **ipp, int flags, cred_t *cr, int *direntflags,
+    pathname_t *realpnp);
+
+/* zfsctl '.zfs/snapshot' functions */
+extern int zfsctl_snapdir_lookup(struct inode *dip, char *name,
+    struct inode **ipp, int flags, cred_t *cr, int *direntflags,
+    pathname_t *realpnp);
+extern int zfsctl_snapdir_rename(struct inode *sdip, char *sname,
+    struct inode *tdip, char *tname, cred_t *cr, int flags);
+extern int zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr,
+    int flags);
+extern int zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap,
+    struct inode **ipp, cred_t *cr, int flags);
+extern int zfsctl_snapshot_mount(struct path *path, int flags);
+extern int zfsctl_snapshot_unmount(char *snapname, int flags);
+extern int zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid,
+    int delay);
+extern int zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid,
+    int gen, struct inode **ipp);
+
+/* zfsctl '.zfs/shares' functions */
+extern int zfsctl_shares_lookup(struct inode *dip, char *name,
+    struct inode **ipp, int flags, cred_t *cr, int *direntflags,
+    pathname_t *realpnp);
+
+/*
+ * These inode numbers are reserved for the .zfs control directory.
+ * It is important that they be no larger than 48 bits because only
+ * 6 bytes are reserved in the NFS file handle for the object number.
+ * However, they should be as large as possible to avoid conflicts
+ * with the objects which are assigned monotonically by the dmu.
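+ *
+ * For example, ZFSCTL_INO_ROOT below is 0x0000FFFFFFFFFFFF, i.e.
+ * 2^48 - 1, the largest object number that fits in those 6 bytes; the
+ * other control-directory inodes simply count down from it.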
+ */ +#define ZFSCTL_INO_ROOT 0x0000FFFFFFFFFFFFULL +#define ZFSCTL_INO_SHARES 0x0000FFFFFFFFFFFEULL +#define ZFSCTL_INO_SNAPDIR 0x0000FFFFFFFFFFFDULL +#define ZFSCTL_INO_SNAPDIRS 0x0000FFFFFFFFFFFCULL + +#define ZFSCTL_EXPIRE_SNAPSHOT 300 + +#endif /* _ZFS_CTLDIR_H */ diff --git a/include/os/linux/zfs/sys/zfs_dir.h b/include/os/linux/zfs/sys/zfs_dir.h new file mode 100644 index 000000000000..0f15e43452b2 --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_dir.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_DIR_H +#define _SYS_FS_ZFS_DIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* zfs_dirent_lock() flags */ +#define ZNEW 0x0001 /* entry should not exist */ +#define ZEXISTS 0x0002 /* entry should exist */ +#define ZSHARED 0x0004 /* shared access (zfs_dirlook()) */ +#define ZXATTR 0x0008 /* we want the xattr dir */ +#define ZRENAMING 0x0010 /* znode is being renamed */ +#define ZCILOOK 0x0020 /* case-insensitive lookup requested */ +#define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ + +/* mknode flags */ +#define IS_ROOT_NODE 0x01 /* create a root node */ +#define IS_XATTR 0x02 /* create an extended attribute node */ +#define IS_TMPFILE 0x04 /* create a tmpfile */ + +extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, + int, int *, pathname_t *); +extern void zfs_dirent_unlock(zfs_dirlock_t *); +extern int zfs_link_create(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int); +extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, + boolean_t *); +extern int zfs_dirlook(znode_t *, char *, znode_t **, int, int *, + pathname_t *); +extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, + uint_t, znode_t **, zfs_acl_ids_t *); +extern void zfs_rmnode(znode_t *); +extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); +extern boolean_t zfs_dirempty(znode_t *); +extern void zfs_unlinked_add(znode_t *, dmu_tx_t *); +extern void zfs_unlinked_drain(zfsvfs_t *zfsvfs); +extern void zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs); +extern int zfs_sticky_remove_access(znode_t *, znode_t *, cred_t *cr); +extern int zfs_get_xattrdir(znode_t *, znode_t **, cred_t *, int); +extern int zfs_make_xattrdir(znode_t *, vattr_t *, znode_t **, cred_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_DIR_H */ diff --git a/include/os/linux/zfs/sys/zfs_vfsops.h b/include/os/linux/zfs/sys/zfs_vfsops.h new file mode 100644 index 000000000000..24a0a2e6a05f --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_vfsops.h @@ 
-0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2018 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_FS_ZFS_VFSOPS_H
+#define _SYS_FS_ZFS_VFSOPS_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct zfsvfs zfsvfs_t;
+struct znode;
+
+/*
+ * This structure emulates the vfs_t from other platforms. Its purpose
+ * is to facilitate the handling of mount options and minimize structural
+ * differences between the platforms.
+ */
+typedef struct vfs {
+    struct zfsvfs *vfs_data;
+    char *vfs_mntpoint;            /* Primary mount point */
+    uint64_t vfs_xattr;
+    boolean_t vfs_readonly;
+    boolean_t vfs_do_readonly;
+    boolean_t vfs_setuid;
+    boolean_t vfs_do_setuid;
+    boolean_t vfs_exec;
+    boolean_t vfs_do_exec;
+    boolean_t vfs_devices;
+    boolean_t vfs_do_devices;
+    boolean_t vfs_do_xattr;
+    boolean_t vfs_atime;
+    boolean_t vfs_do_atime;
+    boolean_t vfs_relatime;
+    boolean_t vfs_do_relatime;
+    boolean_t vfs_nbmand;
+    boolean_t vfs_do_nbmand;
+} vfs_t;
+
+typedef struct zfs_mnt {
+    const char *mnt_osname;        /* Objset name */
+    char *mnt_data;                /* Raw mount options */
+} zfs_mnt_t;
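+
+/*
+ * Convention sketch (illustrative, not normative): each vfs_do_* flag
+ * records that the matching option was given explicitly in mnt_data,
+ * so mount/remount code can apply only the options the user actually
+ * set, e.g. roughly:
+ *
+ *     if (vfsp->vfs_do_readonly)
+ *         readonly_changed_cb(zfsvfs, vfsp->vfs_readonly);
+ *
+ * Options left unspecified keep the dataset's property defaults.
+ */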
+
+struct zfsvfs {
+    vfs_t *z_vfs;                  /* generic fs struct */
+    struct super_block *z_sb;      /* generic super_block */
+    struct zfsvfs *z_parent;       /* parent fs */
+    objset_t *z_os;                /* objset reference */
+    uint64_t z_flags;              /* super_block flags */
+    uint64_t z_root;               /* id of root znode */
+    uint64_t z_unlinkedobj;        /* id of unlinked zapobj */
+    uint64_t z_max_blksz;          /* maximum block size for files */
+    uint64_t z_fuid_obj;           /* fuid table object number */
+    uint64_t z_fuid_size;          /* fuid table size */
+    avl_tree_t z_fuid_idx;         /* fuid tree keyed by index */
+    avl_tree_t z_fuid_domain;      /* fuid tree keyed by domain */
+    krwlock_t z_fuid_lock;         /* fuid lock */
+    boolean_t z_fuid_loaded;       /* fuid tables are loaded */
+    boolean_t z_fuid_dirty;        /* need to sync fuid table? */
+    struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */
+    zilog_t *z_log;                /* intent log pointer */
+    uint_t z_acl_mode;             /* acl chmod/mode behavior */
+    uint_t z_acl_inherit;          /* acl inheritance behavior */
+    uint_t z_acl_type;             /* type of ACL usable on this FS */
+    zfs_case_t z_case;             /* case-sense */
+    boolean_t z_utf8;              /* utf8-only */
+    int z_norm;                    /* normalization flags */
+    boolean_t z_relatime;          /* enable relatime mount option */
+    boolean_t z_unmounted;         /* unmounted */
+    rrmlock_t z_teardown_lock;
+    krwlock_t z_teardown_inactive_lock;
+    list_t z_all_znodes;           /* all znodes in the fs */
+    uint64_t z_nr_znodes;          /* number of znodes in the fs */
+    unsigned long z_rollback_time; /* last online rollback time */
+    unsigned long z_snap_defer_time; /* last snapshot unmount deferral */
+    kmutex_t z_znodes_lock;        /* lock for z_all_znodes */
+    arc_prune_t *z_arc_prune;      /* called by ARC to prune caches */
+    struct inode *z_ctldir;        /* .zfs directory inode */
+    boolean_t z_show_ctldir;       /* expose .zfs in the root dir */
+    boolean_t z_issnap;            /* true if this is a snapshot */
+    boolean_t z_vscan;             /* virus scan on/off */
+    boolean_t z_use_fuids;         /* version allows fuids */
+    boolean_t z_replay;            /* set during ZIL replay */
+    boolean_t z_use_sa;            /* version allows system attributes */
+    boolean_t z_xattr_sa;          /* allow xattrs to be stored as SA */
+    boolean_t z_draining;          /* is true when drain is active */
+    boolean_t z_drain_cancel;      /* signal the unlinked drain to stop */
+    uint64_t z_version;            /* ZPL version */
+    uint64_t z_shares_dir;         /* hidden shares dir */
+    dataset_kstats_t z_kstat;      /* fs kstats */
+    kmutex_t z_lock;
+    uint64_t z_userquota_obj;
+    uint64_t z_groupquota_obj;
+    uint64_t z_userobjquota_obj;
+    uint64_t z_groupobjquota_obj;
+    uint64_t z_projectquota_obj;
+    uint64_t z_projectobjquota_obj;
+    uint64_t z_replay_eof;         /* New end of file - replay only */
+    sa_attr_type_t *z_attr_table;  /* SA attr mapping->id */
+    uint64_t z_hold_size;          /* znode hold array size */
+    avl_tree_t *z_hold_trees;      /* znode hold trees */
+    kmutex_t *z_hold_locks;        /* znode hold locks */
+    taskqid_t z_drain_task;        /* task id for the unlink drain task */
+};
+
+#define ZSB_XATTR 0x0001           /* Enable user xattrs */
+
+/*
+ * Allow a maximum number of links. While ZFS does not internally limit
+ * this the inode->i_nlink member is defined as an unsigned int. To be
+ * safe we use 2^31-1 as the limit.
+ */
+#define ZFS_LINK_MAX ((1U << 31) - 1U)
+
+/*
+ * Normal filesystems (those not under .zfs/snapshot) have a total
+ * file ID size limited to 12 bytes (including the length field) due to
+ * NFSv2 protocol's limitation of 32 bytes for a filehandle. For historical
+ * reasons, this same limit is being imposed by the Solaris NFSv3 implementation
+ * (although the NFSv3 protocol actually permits a maximum of 64 bytes). It
+ * is not possible to expand beyond 12 bytes without abandoning support
+ * of NFSv2.
+ *
+ * For normal filesystems, we partition up the available space as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ *
+ * We reserve only 48 bits for the object number, as this is the limit
+ * currently defined and imposed by the DMU.
+ */
+typedef struct zfid_short {
+    uint16_t zf_len;
+    uint8_t zf_object[6];          /* obj[i] = obj >> (8 * i) */
+    uint8_t zf_gen[4];             /* gen[i] = gen >> (8 * i) */
+} zfid_short_t;
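+
+/*
+ * Decoding sketch (illustrative; it simply inverts the byte comments
+ * above):
+ *
+ *     uint64_t obj = 0, gen = 0;
+ *     for (int i = 0; i < 6; i++)
+ *         obj |= (uint64_t)zfid->zf_object[i] << (8 * i);
+ *     for (int i = 0; i < 4; i++)
+ *         gen |= (uint64_t)zfid->zf_gen[i] << (8 * i);
+ */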
+
+/*
+ * Filesystems under .zfs/snapshot have a total file ID size of 22 bytes
+ * (including the length field). This makes files under .zfs/snapshot
+ * accessible by NFSv3 and NFSv4, but not NFSv2.
+ *
+ * For files under .zfs/snapshot, we partition up the available space
+ * as follows:
+ * 2 bytes fid length (required)
+ * 6 bytes object number (48 bits)
+ * 4 bytes generation number (32 bits)
+ * 6 bytes objset id (48 bits)
+ * 4 bytes currently just zero (32 bits)
+ *
+ * We reserve only 48 bits for the object number and objset id, as these are
+ * the limits currently defined and imposed by the DMU.
+ */
+typedef struct zfid_long {
+    zfid_short_t z_fid;
+    uint8_t zf_setid[6];           /* obj[i] = obj >> (8 * i) */
+    uint8_t zf_setgen[4];          /* gen[i] = gen >> (8 * i) */
+} zfid_long_t;
+
+#define SHORT_FID_LEN (sizeof (zfid_short_t) - sizeof (uint16_t))
+#define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t))
+
+extern void zfs_init(void);
+extern void zfs_fini(void);
+
+extern int zfs_suspend_fs(zfsvfs_t *zfsvfs);
+extern int zfs_resume_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
+extern int zfs_end_fs(zfsvfs_t *zfsvfs, struct dsl_dataset *ds);
+extern void zfs_exit_fs(zfsvfs_t *zfsvfs);
+extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers);
+extern int zfsvfs_create(const char *name, boolean_t readonly, zfsvfs_t **zfvp);
+extern int zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os);
+extern void zfsvfs_free(zfsvfs_t *zfsvfs);
+extern int zfs_check_global_label(const char *dsname, const char *hexsl);
+
+extern boolean_t zfs_is_readonly(zfsvfs_t *zfsvfs);
+extern int zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent);
+extern void zfs_preumount(struct super_block *sb);
+extern int zfs_umount(struct super_block *sb);
+extern int zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm);
+extern int zfs_statvfs(struct inode *ip, struct kstatfs *statp);
+extern int zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp);
+extern int zfs_prune(struct super_block *sb, unsigned long nr_to_scan,
+    int *objects);
+extern int zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
+    uint64_t *val, char *setpoint);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FS_ZFS_VFSOPS_H */
diff --git a/include/os/linux/zfs/sys/zfs_vnops.h b/include/os/linux/zfs/sys/zfs_vnops.h
new file mode 100644
index 000000000000..24a2082d35d6
--- /dev/null
+++ b/include/os/linux/zfs/sys/zfs_vnops.h
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */ + +#ifndef _SYS_FS_ZFS_VNOPS_H +#define _SYS_FS_ZFS_VNOPS_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zfs_open(struct inode *ip, int mode, int flag, cred_t *cr); +extern int zfs_close(struct inode *ip, int flag, cred_t *cr); +extern int zfs_holey(struct inode *ip, int cmd, loff_t *off); +extern int zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); +extern int zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr); +extern int zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid); +extern int zfs_access(struct inode *ip, int mode, int flag, cred_t *cr); +extern int zfs_lookup(znode_t *dzp, char *nm, znode_t **zpp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp); +extern int zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_tmpfile(struct inode *dip, vattr_t *vapzfs, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp); +extern int zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags); +extern int zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, + znode_t **zpp, cred_t *cr, int flags, vsecattr_t *vsecp); +extern int zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, + cred_t *cr, int flags); +extern int zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr); +extern int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr); +extern int zfs_getattr_fast(struct inode *ip, struct kstat *sp); +extern int zfs_setattr(znode_t *zp, vattr_t *vap, int flag, cred_t *cr); +extern int zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, + char *tnm, cred_t *cr, int flags); +extern int zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, + char *link, znode_t **zpp, cred_t *cr, int flags); +extern int zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr); +extern int zfs_link(znode_t *tdzp, znode_t *szp, + char *name, cred_t *cr, int flags); +extern void zfs_inactive(struct inode *ip); +extern int zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr); +extern int zfs_fid(struct inode *ip, fid_t *fidp); +extern int zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, + cred_t *cr); +extern int zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages); +extern int zfs_putpage(struct inode *ip, struct page *pp, + struct writeback_control *wbc); +extern int zfs_dirty_inode(struct inode *ip, int flags); +extern int zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, + size_t len, unsigned long vm_flags); +extern void zfs_zrele_async(znode_t *zp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_VNOPS_H */ diff --git a/include/os/linux/zfs/sys/zfs_znode_impl.h b/include/os/linux/zfs/sys/zfs_znode_impl.h new file mode 100644 index 000000000000..39bbd135ac06 --- /dev/null +++ b/include/os/linux/zfs/sys/zfs_znode_impl.h @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SYS_ZFS_ZNODE_IMPL_H +#define _SYS_ZFS_ZNODE_IMPL_H + +#ifndef _KERNEL +#error "no user serviceable parts within" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZNODE_OS_FIELDS \ + struct inode z_inode; + + +/* + * Convert between znode pointers and inode pointers + */ +#define ZTOI(znode) (&((znode)->z_inode)) +#define ITOZ(inode) (container_of((inode), znode_t, z_inode)) +#define ZTOZSB(znode) ((zfsvfs_t *)(ZTOI(znode)->i_sb->s_fs_info)) +#define ITOZSB(inode) ((zfsvfs_t *)((inode)->i_sb->s_fs_info)) + +#define ZTOTYPE(zp) (ZTOI(zp)->i_mode) +#define ZTOGID(zp) (ZTOI(zp)->i_gid) +#define ZTOUID(zp) (ZTOI(zp)->i_uid) +#define ZTONLNK(zp) (ZTOI(zp)->i_nlink) + +#define Z_ISBLK(type) S_ISBLK(type) +#define Z_ISCHR(type) S_ISCHR(type) +#define Z_ISLNK(type) S_ISLNK(type) +#define Z_ISDEV(type) (S_ISCHR(type) || S_ISBLK(type) || S_ISFIFO(type)) + +#define zhold(zp) igrab(ZTOI((zp))) +#define zrele(zp) iput(ZTOI((zp))) + +/* Called on entry to each ZFS inode and vfs operation. */ +#define ZFS_ENTER_ERROR(zfsvfs, error) \ +do { \ + rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \ + if ((zfsvfs)->z_unmounted) { \ + ZFS_EXIT(zfsvfs); \ + return (error); \ + } \ +} while (0) +#define ZFS_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, EIO) +#define ZPL_ENTER(zfsvfs) ZFS_ENTER_ERROR(zfsvfs, -EIO) + +/* Must be called before exiting the operation. */ +#define ZFS_EXIT(zfsvfs) \ +do { \ + zfs_exit_fs(zfsvfs); \ + rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG); \ +} while (0) +#define ZPL_EXIT(zfsvfs) ZFS_EXIT(zfsvfs) + +/* Verifies the znode is valid. */ +#define ZFS_VERIFY_ZP_ERROR(zp, error) \ +do { \ + if ((zp)->z_sa_hdl == NULL) { \ + ZFS_EXIT(ZTOZSB(zp)); \ + return (error); \ + } \ +} while (0) +#define ZFS_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, EIO) +#define ZPL_VERIFY_ZP(zp) ZFS_VERIFY_ZP_ERROR(zp, -EIO) + +/* + * Macros for dealing with dmu_buf_hold + */ +#define ZFS_OBJ_MTX_SZ 64 +#define ZFS_OBJ_MTX_MAX (1024 * 1024) +#define ZFS_OBJ_HASH(zfsvfs, obj) ((obj) & ((zfsvfs->z_hold_size) - 1)) + +extern unsigned int zfs_object_mutex_size; + +/* + * Encode ZFS stored time values from a struct timespec / struct timespec64. + */ +#define ZFS_TIME_ENCODE(tp, stmp) \ +do { \ + (stmp)[0] = (uint64_t)(tp)->tv_sec; \ + (stmp)[1] = (uint64_t)(tp)->tv_nsec; \ +} while (0) + +#if defined(HAVE_INODE_TIMESPEC64_TIMES) +/* + * Decode ZFS stored time values to a struct timespec64 + * 4.18 and newer kernels. + */ +#define ZFS_TIME_DECODE(tp, stmp) \ +do { \ + (tp)->tv_sec = (time64_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} while (0) +#else +/* + * Decode ZFS stored time values to a struct timespec + * 4.17 and older kernels. 
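+ * (On 32-bit kernels of this vintage, time_t is typically a 32-bit
+ * long, so tv_sec values beyond 2038 would truncate in this path.)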
+ */ +#define ZFS_TIME_DECODE(tp, stmp) \ +do { \ + (tp)->tv_sec = (time_t)(stmp)[0]; \ + (tp)->tv_nsec = (long)(stmp)[1]; \ +} while (0) +#endif /* HAVE_INODE_TIMESPEC64_TIMES */ + +struct znode; + +extern int zfs_sync(struct super_block *, int, cred_t *); +extern int zfs_inode_alloc(struct super_block *, struct inode **ip); +extern void zfs_inode_destroy(struct inode *); +extern void zfs_inode_update(struct znode *); +extern void zfs_mark_inode_dirty(struct inode *); +extern boolean_t zfs_relatime_need_update(const struct inode *); + +#if defined(HAVE_UIO_RW) +extern caddr_t zfs_map_page(page_t *, enum seg_rw); +extern void zfs_unmap_page(page_t *, caddr_t); +#endif /* HAVE_UIO_RW */ + +extern zil_get_data_t zfs_get_data; +extern zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE]; +extern int zfsfstype; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_ZNODE_IMPL_H */ diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h new file mode 100644 index 000000000000..ef5a0b842d09 --- /dev/null +++ b/include/os/linux/zfs/sys/zpl.h @@ -0,0 +1,183 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. 
+ */ + +#ifndef _SYS_ZPL_H +#define _SYS_ZPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* zpl_inode.c */ +extern void zpl_vap_init(vattr_t *vap, struct inode *dir, + umode_t mode, cred_t *cr); + +extern const struct inode_operations zpl_inode_operations; +extern const struct inode_operations zpl_dir_inode_operations; +extern const struct inode_operations zpl_symlink_inode_operations; +extern const struct inode_operations zpl_special_inode_operations; +extern dentry_operations_t zpl_dentry_operations; + +/* zpl_file.c */ +extern ssize_t zpl_read_common(struct inode *ip, const char *buf, + size_t len, loff_t *ppos, uio_seg_t segment, int flags, + cred_t *cr); +extern ssize_t zpl_write_common(struct inode *ip, const char *buf, + size_t len, loff_t *ppos, uio_seg_t segment, int flags, + cred_t *cr); + +extern const struct address_space_operations zpl_address_space_operations; +extern const struct file_operations zpl_file_operations; +extern const struct file_operations zpl_dir_file_operations; + +/* zpl_super.c */ +extern void zpl_prune_sb(int64_t nr_to_scan, void *arg); + +extern const struct super_operations zpl_super_operations; +extern const struct export_operations zpl_export_operations; +extern struct file_system_type zpl_fs_type; + +/* zpl_xattr.c */ +extern ssize_t zpl_xattr_list(struct dentry *dentry, char *buf, size_t size); +extern int zpl_xattr_security_init(struct inode *ip, struct inode *dip, + const struct qstr *qstr); +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) +extern int zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type); +#endif /* HAVE_SET_ACL */ +extern struct posix_acl *zpl_get_acl(struct inode *ip, int type); +extern int zpl_init_acl(struct inode *ip, struct inode *dir); +extern int zpl_chmod_acl(struct inode *ip); +#else +static inline int +zpl_init_acl(struct inode *ip, struct inode *dir) +{ + return (0); +} + +static inline int +zpl_chmod_acl(struct inode *ip) +{ + return (0); +} +#endif /* CONFIG_FS_POSIX_ACL */ + +extern xattr_handler_t *zpl_xattr_handlers[]; + +/* zpl_ctldir.c */ +extern const struct file_operations zpl_fops_root; +extern const struct inode_operations zpl_ops_root; + +extern const struct file_operations zpl_fops_snapdir; +extern const struct inode_operations zpl_ops_snapdir; +extern const struct dentry_operations zpl_dops_snapdirs; + +extern const struct file_operations zpl_fops_shares; +extern const struct inode_operations zpl_ops_shares; + +#if defined(HAVE_VFS_ITERATE) || defined(HAVE_VFS_ITERATE_SHARED) + +#define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ + .actor = _actor, \ + .pos = _pos, \ +} + +typedef struct dir_context zpl_dir_context_t; + +#define zpl_dir_emit dir_emit +#define zpl_dir_emit_dot dir_emit_dot +#define zpl_dir_emit_dotdot dir_emit_dotdot +#define zpl_dir_emit_dots dir_emit_dots + +#else + +typedef struct zpl_dir_context { + void *dirent; + const filldir_t actor; + loff_t pos; +} zpl_dir_context_t; + +#define ZPL_DIR_CONTEXT_INIT(_dirent, _actor, _pos) { \ + .dirent = _dirent, \ + .actor = _actor, \ + .pos = _pos, \ +} + +static inline bool +zpl_dir_emit(zpl_dir_context_t *ctx, const char *name, int namelen, + uint64_t ino, unsigned type) +{ + return (!ctx->actor(ctx->dirent, name, namelen, ctx->pos, ino, type)); +} + +static inline bool +zpl_dir_emit_dot(struct file *file, zpl_dir_context_t *ctx) +{ + return (ctx->actor(ctx->dirent, ".", 1, ctx->pos, + file_inode(file)->i_ino, DT_DIR) == 0); +} + +static inline bool 
+zpl_dir_emit_dotdot(struct file *file, zpl_dir_context_t *ctx) +{ + return (ctx->actor(ctx->dirent, "..", 2, ctx->pos, + parent_ino(file_dentry(file)), DT_DIR) == 0); +} + +static inline bool +zpl_dir_emit_dots(struct file *file, zpl_dir_context_t *ctx) +{ + if (ctx->pos == 0) { + if (!zpl_dir_emit_dot(file, ctx)) + return (false); + ctx->pos = 1; + } + if (ctx->pos == 1) { + if (!zpl_dir_emit_dotdot(file, ctx)) + return (false); + ctx->pos = 2; + } + return (true); +} +#endif /* HAVE_VFS_ITERATE */ + +#if defined(HAVE_INODE_TIMESTAMP_TRUNCATE) +#define zpl_inode_timestamp_truncate(ts, ip) timestamp_truncate(ts, ip) +#elif defined(HAVE_INODE_TIMESPEC64_TIMES) +#define zpl_inode_timestamp_truncate(ts, ip) \ + timespec64_trunc(ts, (ip)->i_sb->s_time_gran) +#else +#define zpl_inode_timestamp_truncate(ts, ip) \ + timespec_trunc(ts, (ip)->i_sb->s_time_gran) +#endif + +#endif /* _SYS_ZPL_H */ diff --git a/include/sys/Makefile.am b/include/sys/Makefile.am new file mode 100644 index 000000000000..c2bc3be03c9f --- /dev/null +++ b/include/sys/Makefile.am @@ -0,0 +1,147 @@ +SUBDIRS = fm fs crypto lua sysevent zstd + +COMMON_H = \ + abd.h \ + abd_impl.h \ + aggsum.h \ + arc.h \ + arc_impl.h \ + avl.h \ + avl_impl.h \ + bitops.h \ + blkptr.h \ + bplist.h \ + bpobj.h \ + bptree.h \ + btree.h \ + bqueue.h \ + dataset_kstats.h \ + dbuf.h \ + ddt.h \ + dmu.h \ + dmu_impl.h \ + dmu_objset.h \ + dmu_recv.h \ + dmu_redact.h \ + dmu_send.h \ + dmu_traverse.h \ + dmu_tx.h \ + dmu_zfetch.h \ + dnode.h \ + dsl_bookmark.h \ + dsl_dataset.h \ + dsl_deadlist.h \ + dsl_deleg.h \ + dsl_destroy.h \ + dsl_dir.h \ + dsl_crypt.h \ + dsl_pool.h \ + dsl_prop.h \ + dsl_scan.h \ + dsl_synctask.h \ + dsl_userhold.h \ + edonr.h \ + efi_partition.h \ + frame.h \ + hkdf.h \ + metaslab.h \ + metaslab_impl.h \ + mmp.h \ + mntent.h \ + mod.h \ + multilist.h \ + note.h \ + nvpair.h \ + nvpair_impl.h \ + objlist.h \ + pathname.h \ + qat.h \ + range_tree.h \ + rrwlock.h \ + sa.h \ + sa_impl.h \ + skein.h \ + spa_boot.h \ + spa_checkpoint.h \ + spa_log_spacemap.h \ + space_map.h \ + space_reftree.h \ + spa.h \ + spa_impl.h \ + spa_checksum.h \ + sysevent.h \ + txg.h \ + txg_impl.h \ + u8_textprep_data.h \ + u8_textprep.h \ + uberblock.h \ + uberblock_impl.h \ + uio_impl.h \ + unique.h \ + uuid.h \ + vdev_disk.h \ + vdev_file.h \ + vdev.h \ + vdev_impl.h \ + vdev_indirect_births.h \ + vdev_indirect_mapping.h \ + vdev_initialize.h \ + vdev_raidz.h \ + vdev_raidz_impl.h \ + vdev_rebuild.h \ + vdev_removal.h \ + vdev_trim.h \ + xvattr.h \ + zap.h \ + zap_impl.h \ + zap_leaf.h \ + zcp.h \ + zcp_global.h \ + zcp_iter.h \ + zcp_prop.h \ + zcp_set.h \ + zfeature.h \ + zfs_acl.h \ + zfs_context.h \ + zfs_debug.h \ + zfs_delay.h \ + zfs_file.h \ + zfs_fuid.h \ + zfs_project.h \ + zfs_quota.h \ + zfs_ratelimit.h \ + zfs_refcount.h \ + zfs_rlock.h \ + zfs_sa.h \ + zfs_stat.h \ + zfs_sysfs.h \ + zfs_znode.h \ + zil.h \ + zil_impl.h \ + zio_checksum.h \ + zio_compress.h \ + zio_crypt.h \ + zio.h \ + zio_impl.h \ + zio_priority.h \ + zrlock.h \ + zthr.h + +KERNEL_H = \ + zfs_ioctl.h \ + zfs_ioctl_impl.h \ + zfs_onexit.h \ + zvol.h \ + zvol_impl.h + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys +libzfs_HEADERS = $(COMMON_H) +endif + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys +kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +endif +endif diff --git a/include/sys/abd.h b/include/sys/abd.h new file mode 100644 index 000000000000..735a13147598 --- /dev/null +++ b/include/sys/abd.h @@ -0,0 +1,162 @@ +/* + * CDDL 
HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_H +#define _ABD_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd; /* forward declaration */ +typedef struct abd abd_t; + +typedef int abd_iter_func_t(void *buf, size_t len, void *priv); +typedef int abd_iter_func2_t(void *bufa, void *bufb, size_t len, void *priv); + +extern int zfs_abd_scatter_enabled; + +/* + * Allocations and deallocations + */ + +abd_t *abd_alloc(size_t, boolean_t); +abd_t *abd_alloc_linear(size_t, boolean_t); +abd_t *abd_alloc_gang_abd(void); +abd_t *abd_alloc_for_io(size_t, boolean_t); +abd_t *abd_alloc_sametype(abd_t *, size_t); +void abd_gang_add(abd_t *, abd_t *, boolean_t); +void abd_free(abd_t *); +void abd_put(abd_t *); +abd_t *abd_get_offset(abd_t *, size_t); +abd_t *abd_get_offset_size(abd_t *, size_t, size_t); +abd_t *abd_get_zeros(size_t); +abd_t *abd_get_from_buf(void *, size_t); +void abd_cache_reap_now(void); + +/* + * Conversion to and from a normal buffer + */ + +void *abd_to_buf(abd_t *); +void *abd_borrow_buf(abd_t *, size_t); +void *abd_borrow_buf_copy(abd_t *, size_t); +void abd_return_buf(abd_t *, void *, size_t); +void abd_return_buf_copy(abd_t *, void *, size_t); +void abd_take_ownership_of_buf(abd_t *, boolean_t); +void abd_release_ownership_of_buf(abd_t *); + +/* + * ABD operations + */ + +int abd_iterate_func(abd_t *, size_t, size_t, abd_iter_func_t *, void *); +int abd_iterate_func2(abd_t *, abd_t *, size_t, size_t, size_t, + abd_iter_func2_t *, void *); +void abd_copy_off(abd_t *, abd_t *, size_t, size_t, size_t); +void abd_copy_from_buf_off(abd_t *, const void *, size_t, size_t); +void abd_copy_to_buf_off(void *, abd_t *, size_t, size_t); +int abd_cmp(abd_t *, abd_t *); +int abd_cmp_buf_off(abd_t *, const void *, size_t, size_t); +void abd_zero_off(abd_t *, size_t, size_t); +void abd_verify(abd_t *); +uint_t abd_get_size(abd_t *); + +void abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)); +void abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul); + +/* + * Wrappers for calls with offsets of 0 + */ + +static inline void +abd_copy(abd_t *dabd, abd_t *sabd, size_t size) +{ + abd_copy_off(dabd, sabd, 0, 0, size); +} + +static inline void +abd_copy_from_buf(abd_t *abd, const void *buf, size_t size) +{ + abd_copy_from_buf_off(abd, buf, 0, size); +} + +static inline void 
+abd_copy_to_buf(void* buf, abd_t *abd, size_t size) +{ + abd_copy_to_buf_off(buf, abd, 0, size); +} + +static inline int +abd_cmp_buf(abd_t *abd, const void *buf, size_t size) +{ + return (abd_cmp_buf_off(abd, buf, 0, size)); +} + +static inline void +abd_zero(abd_t *abd, size_t size) +{ + abd_zero_off(abd, 0, size); +} + +/* + * ABD type check functions + */ +boolean_t abd_is_linear(abd_t *); +boolean_t abd_is_gang(abd_t *); +boolean_t abd_is_linear_page(abd_t *); + +/* + * Module lifecycle + * Defined in each specific OS's abd_os.c + */ + +void abd_init(void); +void abd_fini(void); + +/* + * Linux ABD bio functions + */ +#if defined(__linux__) && defined(_KERNEL) +unsigned int abd_bio_map_off(struct bio *, abd_t *, unsigned int, size_t); +unsigned long abd_nr_pages_off(abd_t *, unsigned int, size_t); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_H */ diff --git a/include/sys/abd_impl.h b/include/sys/abd_impl.h new file mode 100644 index 000000000000..b1fa87b42a48 --- /dev/null +++ b/include/sys/abd_impl.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016, 2019 by Delphix. All rights reserved. + */ + +#ifndef _ABD_IMPL_H +#define _ABD_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum abd_flags { + ABD_FLAG_LINEAR = 1 << 0, /* is buffer linear (or scattered)? */ + ABD_FLAG_OWNER = 1 << 1, /* does it own its data buffers? */ + ABD_FLAG_META = 1 << 2, /* does this represent FS metadata? 
*/ + ABD_FLAG_MULTI_ZONE = 1 << 3, /* pages split over memory zones */ + ABD_FLAG_MULTI_CHUNK = 1 << 4, /* pages split over multiple chunks */ + ABD_FLAG_LINEAR_PAGE = 1 << 5, /* linear but allocd from page */ + ABD_FLAG_GANG = 1 << 6, /* mult ABDs chained together */ + ABD_FLAG_GANG_FREE = 1 << 7, /* gang ABD is responsible for mem */ + ABD_FLAG_ZEROS = 1 << 8, /* ABD for zero-filled buffer */ +} abd_flags_t; + +typedef enum abd_stats_op { + ABDSTAT_INCR, /* Increase abdstat values */ + ABDSTAT_DECR /* Decrease abdstat values */ +} abd_stats_op_t; + +struct abd { + abd_flags_t abd_flags; + uint_t abd_size; /* excludes scattered abd_offset */ + list_node_t abd_gang_link; + struct abd *abd_parent; + zfs_refcount_t abd_children; + kmutex_t abd_mtx; + union { + struct abd_scatter { + uint_t abd_offset; +#if defined(__FreeBSD__) && defined(_KERNEL) + uint_t abd_chunk_size; + void *abd_chunks[]; +#else + uint_t abd_nents; + struct scatterlist *abd_sgl; +#endif + } abd_scatter; + struct abd_linear { + void *abd_buf; + struct scatterlist *abd_sgl; /* for LINEAR_PAGE */ + } abd_linear; + struct abd_gang { + list_t abd_gang_chain; + } abd_gang; + } abd_u; +}; + +struct scatterlist; /* forward declaration */ + +struct abd_iter { + /* public interface */ + void *iter_mapaddr; /* addr corresponding to iter_pos */ + size_t iter_mapsize; /* length of data valid at mapaddr */ + + /* private */ + abd_t *iter_abd; /* ABD being iterated through */ + size_t iter_pos; + size_t iter_offset; /* offset in current sg/abd_buf, */ + /* abd_offset included */ + struct scatterlist *iter_sg; /* current sg */ +}; + +extern abd_t *abd_zero_scatter; + +abd_t *abd_gang_get_offset(abd_t *, size_t *); + +/* + * OS specific functions + */ + +abd_t *abd_alloc_struct(size_t); +abd_t *abd_get_offset_scatter(abd_t *, size_t); +void abd_free_struct(abd_t *); +void abd_alloc_chunks(abd_t *, size_t); +void abd_free_chunks(abd_t *); +boolean_t abd_size_alloc_linear(size_t); +void abd_update_scatter_stats(abd_t *, abd_stats_op_t); +void abd_update_linear_stats(abd_t *, abd_stats_op_t); +void abd_verify_scatter(abd_t *); +void abd_free_linear_page(abd_t *); +/* OS specific abd_iter functions */ +void abd_iter_init(struct abd_iter *, abd_t *); +boolean_t abd_iter_at_end(struct abd_iter *); +void abd_iter_advance(struct abd_iter *, size_t); +void abd_iter_map(struct abd_iter *); +void abd_iter_unmap(struct abd_iter *); + +/* + * Helper macros + */ +#define ABDSTAT(stat) (abd_stats.stat.value.ui64) +#define ABDSTAT_INCR(stat, val) \ + atomic_add_64(&abd_stats.stat.value.ui64, (val)) +#define ABDSTAT_BUMP(stat) ABDSTAT_INCR(stat, 1) +#define ABDSTAT_BUMPDOWN(stat) ABDSTAT_INCR(stat, -1) + +#define ABD_SCATTER(abd) (abd->abd_u.abd_scatter) +#define ABD_LINEAR_BUF(abd) (abd->abd_u.abd_linear.abd_buf) +#define ABD_GANG(abd) (abd->abd_u.abd_gang) + +#if defined(_KERNEL) +#if defined(__FreeBSD__) +#define abd_enter_critical(flags) critical_enter() +#define abd_exit_critical(flags) critical_exit() +#else +#define abd_enter_critical(flags) local_irq_save(flags) +#define abd_exit_critical(flags) local_irq_restore(flags) +#endif +#else /* !_KERNEL */ +#define abd_enter_critical(flags) ((void)0) +#define abd_exit_critical(flags) ((void)0) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _ABD_IMPL_H */ diff --git a/include/sys/aggsum.h b/include/sys/aggsum.h new file mode 100644 index 000000000000..cb43727f1df4 --- /dev/null +++ b/include/sys/aggsum.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied 
under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_AGGSUM_H +#define _SYS_AGGSUM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct aggsum_bucket aggsum_bucket_t; + +struct aggsum_bucket { + kmutex_t asc_lock; + int64_t asc_delta; + uint64_t asc_borrowed; +} ____cacheline_aligned; + +/* + * Fan out over FANOUT cpus. + */ +typedef struct aggsum { + kmutex_t as_lock; + int64_t as_lower_bound; + int64_t as_upper_bound; + uint_t as_numbuckets; + aggsum_bucket_t *as_buckets; +} aggsum_t; + +void aggsum_init(aggsum_t *, uint64_t); +void aggsum_fini(aggsum_t *); +int64_t aggsum_lower_bound(aggsum_t *); +int64_t aggsum_upper_bound(aggsum_t *); +int aggsum_compare(aggsum_t *, uint64_t); +uint64_t aggsum_value(aggsum_t *); +void aggsum_add(aggsum_t *, int64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_AGGSUM_H */ diff --git a/include/sys/arc.h b/include/sys/arc.h new file mode 100644 index 000000000000..a0852b4d5a70 --- /dev/null +++ b/include/sys/arc.h @@ -0,0 +1,335 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. + */ + +#ifndef _SYS_ARC_H +#define _SYS_ARC_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include + +/* + * Used by arc_flush() to inform arc_evict_state() that it should evict + * all available buffers from the arc state being passed in. 
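+ * (ARC_EVICT_ALL below is -1ULL, i.e. UINT64_MAX when treated as a
+ * byte count, a value no ARC state can reach.)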
+ */ +#define ARC_EVICT_ALL -1ULL + +#define HDR_SET_LSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED(x, 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_lsize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_SET_PSIZE(hdr, x) do { \ + ASSERT(IS_P2ALIGNED((x), 1U << SPA_MINBLOCKSHIFT)); \ + (hdr)->b_psize = ((x) >> SPA_MINBLOCKSHIFT); \ +_NOTE(CONSTCOND) } while (0) + +#define HDR_GET_LSIZE(hdr) ((hdr)->b_lsize << SPA_MINBLOCKSHIFT) +#define HDR_GET_PSIZE(hdr) ((hdr)->b_psize << SPA_MINBLOCKSHIFT) + +typedef struct arc_buf_hdr arc_buf_hdr_t; +typedef struct arc_buf arc_buf_t; +typedef struct arc_prune arc_prune_t; + +/* + * Because the ARC can store encrypted data, errors (not due to bugs) may arise + * while transforming data into its desired format - specifically, when + * decrypting, the key may not be present, or the HMAC may not be correct + * which signifies deliberate tampering with the on-disk state + * (assuming that the checksum was correct). If any error occurs, the "buf" + * parameter will be NULL. + */ +typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *bp, arc_buf_t *buf, void *priv); +typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); +typedef void arc_prune_func_t(int64_t bytes, void *priv); + +/* Shared module parameters */ +extern int zfs_arc_average_blocksize; + +/* generic arc_done_func_t's which you can use */ +arc_read_done_func_t arc_bcopy_func; +arc_read_done_func_t arc_getbuf_func; + +/* generic arc_prune_func_t wrapper for callbacks */ +struct arc_prune { + arc_prune_func_t *p_pfunc; + void *p_private; + uint64_t p_adjust; + list_node_t p_node; + zfs_refcount_t p_refcnt; +}; + +typedef enum arc_strategy { + ARC_STRATEGY_META_ONLY = 0, /* Evict only meta data buffers */ + ARC_STRATEGY_META_BALANCED = 1, /* Evict data buffers if needed */ +} arc_strategy_t; + +typedef enum arc_flags +{ + /* + * Public flags that can be passed into the ARC by external consumers. + */ + ARC_FLAG_WAIT = 1 << 0, /* perform sync I/O */ + ARC_FLAG_NOWAIT = 1 << 1, /* perform async I/O */ + ARC_FLAG_PREFETCH = 1 << 2, /* I/O is a prefetch */ + ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ + ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ + ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ + ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ + + /* + * Private ARC flags. These flags are private ARC only flags that + * will show up in b_flags in the arc_hdr_buf_t. These flags should + * only be set by ARC code. + */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ + /* Indicates that block was read with ASYNC priority. */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, + ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ + /* + * Encrypted or authenticated on disk (may be plaintext in memory). + * This header has b_crypt_hdr allocated. Does not include indirect + * blocks with checksums of MACs which will also have their X + * (encrypted) bit set in the bp. 
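+ * + * (Illustrative check, assuming access to the private header fields: + * code holding a header tests (hdr->b_flags & ARC_FLAG_PROTECTED) before + * expecting the b_crypt_hdr parameters in arc_impl.h to be valid.)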
+ */ + ARC_FLAG_PROTECTED = 1 << 15, + /* data has not been authenticated yet */ + ARC_FLAG_NOAUTH = 1 << 16, + /* indicates that the buffer contains metadata (otherwise, data) */ + ARC_FLAG_BUFC_METADATA = 1 << 17, + + /* Flags specifying whether optional hdr struct fields are defined */ + ARC_FLAG_HAS_L1HDR = 1 << 18, + ARC_FLAG_HAS_L2HDR = 1 << 19, + + /* + * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. + * This allows the l2arc to use the blkptr's checksum to verify + * the data without having to store the checksum in the hdr. + */ + ARC_FLAG_COMPRESSED_ARC = 1 << 20, + ARC_FLAG_SHARED_DATA = 1 << 21, + + /* + * Fail this arc_read() (with ENOENT) if the data is not already present + * in cache. + */ + ARC_FLAG_CACHED_ONLY = 1 << 22, + + /* + * The arc buffer's compression mode is stored in the top 7 bits of the + * flags field, so these dummy flags are included so that MDB can + * interpret the enum properly. + */ + ARC_FLAG_COMPRESS_0 = 1 << 24, + ARC_FLAG_COMPRESS_1 = 1 << 25, + ARC_FLAG_COMPRESS_2 = 1 << 26, + ARC_FLAG_COMPRESS_3 = 1 << 27, + ARC_FLAG_COMPRESS_4 = 1 << 28, + ARC_FLAG_COMPRESS_5 = 1 << 29, + ARC_FLAG_COMPRESS_6 = 1 << 30 + +} arc_flags_t; + +typedef enum arc_buf_flags { + ARC_BUF_FLAG_SHARED = 1 << 0, + ARC_BUF_FLAG_COMPRESSED = 1 << 1, + /* + * indicates whether this arc_buf_t is encrypted, regardless of + * state on-disk + */ + ARC_BUF_FLAG_ENCRYPTED = 1 << 2 +} arc_buf_flags_t; + +struct arc_buf { + arc_buf_hdr_t *b_hdr; + arc_buf_t *b_next; + kmutex_t b_evict_lock; + void *b_data; + arc_buf_flags_t b_flags; +}; + +typedef enum arc_buf_contents { + ARC_BUFC_INVALID, /* invalid type */ + ARC_BUFC_DATA, /* buffer contains data */ + ARC_BUFC_METADATA, /* buffer contains metadata */ + ARC_BUFC_NUMTYPES +} arc_buf_contents_t; + +/* + * The following breakdowns of arc_size exist for kstat only. 
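+ * Consumers charge and release a bucket via arc_space_consume() and + * arc_space_return(), declared below; e.g. arc_space_consume(size, + * ARC_SPACE_DBUF) from the dbuf code (an illustrative pairing).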
+ */ +typedef enum arc_space_type { + ARC_SPACE_DATA, + ARC_SPACE_META, + ARC_SPACE_HDRS, + ARC_SPACE_L2HDRS, + ARC_SPACE_DBUF, + ARC_SPACE_DNODE, + ARC_SPACE_BONUS, + ARC_SPACE_ABD_CHUNK_WASTE, + ARC_SPACE_NUMTYPES +} arc_space_type_t; + +typedef enum arc_state_type { + ARC_STATE_ANON, + ARC_STATE_MRU, + ARC_STATE_MRU_GHOST, + ARC_STATE_MFU, + ARC_STATE_MFU_GHOST, + ARC_STATE_L2C_ONLY, + ARC_STATE_NUMTYPES +} arc_state_type_t; + +typedef struct arc_buf_info { + arc_state_type_t abi_state_type; + arc_buf_contents_t abi_state_contents; + uint32_t abi_flags; + uint32_t abi_bufcnt; + uint64_t abi_size; + uint64_t abi_spa; + uint64_t abi_access; + uint32_t abi_mru_hits; + uint32_t abi_mru_ghost_hits; + uint32_t abi_mfu_hits; + uint32_t abi_mfu_ghost_hits; + uint32_t abi_l2arc_hits; + uint32_t abi_holds; + uint64_t abi_l2arc_dattr; + uint64_t abi_l2arc_asize; + enum zio_compress abi_l2arc_compress; +} arc_buf_info_t; + +void arc_space_consume(uint64_t space, arc_space_type_t type); +void arc_space_return(uint64_t space, arc_space_type_t type); +boolean_t arc_is_metadata(arc_buf_t *buf); +boolean_t arc_is_encrypted(arc_buf_t *buf); +boolean_t arc_is_unauthenticated(arc_buf_t *buf); +enum zio_compress arc_get_compression(arc_buf_t *buf); +void arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, + uint8_t *iv, uint8_t *mac); +int arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, + boolean_t in_place); +void arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, + dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac); +arc_buf_t *arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, + int32_t size); +arc_buf_t *arc_alloc_compressed_buf(spa_t *spa, void *tag, + uint64_t psize, uint64_t lsize, enum zio_compress compression_type, + uint8_t complevel); +arc_buf_t *arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, + boolean_t byteorder, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac, dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type, uint8_t complevel); +uint8_t arc_get_complevel(arc_buf_t *buf); +arc_buf_t *arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size); +arc_buf_t *arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type, uint8_t complevel); +arc_buf_t *arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type, uint8_t complevel); +void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); +void arc_buf_destroy(arc_buf_t *buf, void *tag); +void arc_buf_info(arc_buf_t *buf, arc_buf_info_t *abi, int state_index); +uint64_t arc_buf_size(arc_buf_t *buf); +uint64_t arc_buf_lsize(arc_buf_t *buf); +void arc_buf_access(arc_buf_t *buf); +void arc_release(arc_buf_t *buf, void *tag); +int arc_released(arc_buf_t *buf); +void arc_buf_sigsegv(int sig, siginfo_t *si, void *unused); +void arc_buf_freeze(arc_buf_t *buf); +void arc_buf_thaw(arc_buf_t *buf); +#ifdef ZFS_DEBUG +int arc_referenced(arc_buf_t *buf); +#endif + +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + arc_read_done_func_t *done, void *priv, zio_priority_t priority, + int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t 
l2arc, const zio_prop_t *zp, + arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, + arc_write_done_func_t *physdone, arc_write_done_func_t *done, + void *priv, zio_priority_t priority, int zio_flags, + const zbookmark_phys_t *zb); + +arc_prune_t *arc_add_prune_callback(arc_prune_func_t *func, void *priv); +void arc_remove_prune_callback(arc_prune_t *p); +void arc_freed(spa_t *spa, const blkptr_t *bp); + +void arc_flush(spa_t *spa, boolean_t retry); +void arc_tempreserve_clear(uint64_t reserve); +int arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg); + +uint64_t arc_all_memory(void); +uint64_t arc_default_max(uint64_t min, uint64_t allmem); +uint64_t arc_target_bytes(void); +void arc_init(void); +void arc_fini(void); + +/* + * Level 2 ARC + */ + +void l2arc_add_vdev(spa_t *spa, vdev_t *vd); +void l2arc_remove_vdev(vdev_t *vd); +boolean_t l2arc_vdev_present(vdev_t *vd); +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +boolean_t l2arc_range_check_overlap(uint64_t bottom, uint64_t top, + uint64_t check); +void l2arc_init(void); +void l2arc_fini(void); +void l2arc_start(void); +void l2arc_stop(void); +void l2arc_spa_rebuild_start(spa_t *spa); + +#ifndef _KERNEL +extern boolean_t arc_watch; +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_H */ diff --git a/include/sys/arc_impl.h b/include/sys/arc_impl.h new file mode 100644 index 000000000000..c5061695d944 --- /dev/null +++ b/include/sys/arc_impl.h @@ -0,0 +1,928 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, Delphix. All rights reserved. + * Copyright (c) 2013, Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2020, George Amanakis. All rights reserved. + */ + +#ifndef _SYS_ARC_IMPL_H +#define _SYS_ARC_IMPL_H + +#include <sys/arc.h> +#include <sys/zio_crypt.h> +#include <sys/zthr.h> +#include <sys/aggsum.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note that buffers can be in one of 6 states: + * ARC_anon - anonymous (discussed below) + * ARC_mru - recently used, currently cached + * ARC_mru_ghost - recently used, no longer in cache + * ARC_mfu - frequently used, currently cached + * ARC_mfu_ghost - frequently used, no longer in cache + * ARC_l2c_only - exists in L2ARC but not other states + * When there are no active references to the buffer, they are + * linked onto a list in one of these arc states. These are + * the only buffers that can be evicted or deleted. Within each + * state there are multiple lists, one for meta-data and one for + * non-meta-data. Meta-data (indirect blocks, blocks of dnodes, + * etc.)
is tracked separately so that it can be managed more + * explicitly: favored over data, limited explicitly. + * + * Anonymous buffers are buffers that are not associated with + * a DVA. These are buffers that hold dirty block copies + * before they are written to stable storage. By definition, + * they are "ref'd" and are considered part of arc_mru + * that cannot be freed. Generally, they will acquire a DVA + * as they are written and migrate onto the arc_mru list. + * + * The ARC_l2c_only state is for buffers that are in the second + * level ARC but no longer in any of the ARC_m* lists. The second + * level ARC itself may also contain buffers that are in any of + * the ARC_m* states - meaning that a buffer can exist in two + * places. The reason for the ARC_l2c_only state is to keep the + * buffer header in the hash table, so that reads that hit the + * second level ARC benefit from these fast lookups. + */ + +typedef struct arc_state { + /* + * list of evictable buffers + */ + multilist_t *arcs_list[ARC_BUFC_NUMTYPES]; + /* + * total amount of evictable data in this state + */ + zfs_refcount_t arcs_esize[ARC_BUFC_NUMTYPES]; + /* + * total amount of data in this state; this includes: evictable, + * non-evictable, ARC_BUFC_DATA, and ARC_BUFC_METADATA. + */ + zfs_refcount_t arcs_size; + /* + * supports the "dbufs" kstat + */ + arc_state_type_t arcs_state; +} arc_state_t; + +typedef struct arc_callback arc_callback_t; + +struct arc_callback { + void *acb_private; + arc_read_done_func_t *acb_done; + arc_buf_t *acb_buf; + boolean_t acb_encrypted; + boolean_t acb_compressed; + boolean_t acb_noauth; + zbookmark_phys_t acb_zb; + zio_t *acb_zio_dummy; + zio_t *acb_zio_head; + arc_callback_t *acb_next; +}; + +typedef struct arc_write_callback arc_write_callback_t; + +struct arc_write_callback { + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; +}; + +/* + * ARC buffers are separated into multiple structs as a memory saving measure: + * - Common fields struct, always defined, and embedded within it: + * - L2-only fields, always allocated but undefined when not in L2ARC + * - L1-only fields, only allocated when in L1ARC + * + * Buffer in L1 Buffer only in L2 + * +------------------------+ +------------------------+ + * | arc_buf_hdr_t | | arc_buf_hdr_t | + * | | | | + * | | | | + * | | | | + * +------------------------+ +------------------------+ + * | l2arc_buf_hdr_t | | l2arc_buf_hdr_t | + * | (undefined if L1-only) | | | + * +------------------------+ +------------------------+ + * | l1arc_buf_hdr_t | + * | | + * | | + * | | + * | | + * +------------------------+ + * + * Because it's possible for the L2ARC to become extremely large, we can wind + * up eating a lot of memory in L2ARC buffer headers, so the size of a header + * is minimized by only allocating the fields necessary for an L1-cached buffer + * when a header is actually in the L1 cache. The sub-headers (l1arc_buf_hdr and + * l2arc_buf_hdr) are embedded rather than allocated separately to save a couple + * words in pointers. arc_hdr_realloc() is used to switch a header between + * these two allocation states. 
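+ * + * Code can tell which layout a given header currently has by testing + * b_flags for ARC_FLAG_HAS_L1HDR / ARC_FLAG_HAS_L2HDR (defined in arc.h) + * before dereferencing b_l1hdr or b_l2hdr.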
+ */ +typedef struct l1arc_buf_hdr { + kmutex_t b_freeze_lock; + zio_cksum_t *b_freeze_cksum; + + arc_buf_t *b_buf; + uint32_t b_bufcnt; + /* for waiting on writes to complete */ + kcondvar_t b_cv; + uint8_t b_byteswap; + + + /* protected by arc state mutex */ + arc_state_t *b_state; + multilist_node_t b_arc_node; + + /* updated atomically */ + clock_t b_arc_access; + uint32_t b_mru_hits; + uint32_t b_mru_ghost_hits; + uint32_t b_mfu_hits; + uint32_t b_mfu_ghost_hits; + uint32_t b_l2_hits; + + /* self protecting */ + zfs_refcount_t b_refcnt; + + arc_callback_t *b_acb; + abd_t *b_pabd; +} l1arc_buf_hdr_t; + +typedef enum l2arc_dev_hdr_flags_t { + L2ARC_DEV_HDR_EVICT_FIRST = (1 << 0) /* mirror of l2ad_first */ +} l2arc_dev_hdr_flags_t; + +/* + * Pointer used in persistent L2ARC (for pointing to log blocks). + */ +typedef struct l2arc_log_blkptr { + /* + * Offset of log block within the device, in bytes + */ + uint64_t lbp_daddr; + /* + * Aligned payload size (in bytes) of the log block + */ + uint64_t lbp_payload_asize; + /* + * Offset in bytes of the first buffer in the payload + */ + uint64_t lbp_payload_start; + /* + * lbp_prop has the following format: + * * logical size (in bytes) + * * aligned (after compression) size (in bytes) + * * compression algorithm (we always LZ4-compress l2arc logs) + * * checksum algorithm (used for lbp_cksum) + */ + uint64_t lbp_prop; + zio_cksum_t lbp_cksum; /* checksum of log */ +} l2arc_log_blkptr_t; + +/* + * The persistent L2ARC device header. + * Byte order of magic determines whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_dev_hdr_phys { + uint64_t dh_magic; /* L2ARC_DEV_HDR_MAGIC */ + uint64_t dh_version; /* Persistent L2ARC version */ + + /* + * Global L2ARC device state and metadata. + */ + uint64_t dh_spa_guid; + uint64_t dh_vdev_guid; + uint64_t dh_log_entries; /* mirror of l2ad_log_entries */ + uint64_t dh_evict; /* evicted offset in bytes */ + uint64_t dh_flags; /* l2arc_dev_hdr_flags_t */ + /* + * Used in zdb.c for determining if a log block is valid, in the same + * way that l2arc_rebuild() does. + */ + uint64_t dh_start; /* mirror of l2ad_start */ + uint64_t dh_end; /* mirror of l2ad_end */ + /* + * Start of log block chain. [0] -> newest log, [1] -> one older (used + * for initiating prefetch). + */ + l2arc_log_blkptr_t dh_start_lbps[2]; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + uint64_t dh_lb_asize; /* mirror of l2ad_lb_asize */ + uint64_t dh_lb_count; /* mirror of l2ad_lb_count */ + /* + * Mirrors of vdev_trim_action_time and vdev_trim_state, used to + * display when the cache device was fully trimmed for the last + * time. + */ + uint64_t dh_trim_action_time; + uint64_t dh_trim_state; + const uint64_t dh_pad[30]; /* pad to 512 bytes */ + zio_eck_t dh_tail; +} l2arc_dev_hdr_phys_t; +CTASSERT_GLOBAL(sizeof (l2arc_dev_hdr_phys_t) == SPA_MINBLOCKSIZE); + +/* + * A single ARC buffer header entry in a l2arc_log_blk_phys_t. 
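+ * The packed le_prop word is read and written with the L2BLK_GET_*() and + * L2BLK_SET_*() macros defined further below.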
+ */ +typedef struct l2arc_log_ent_phys { + dva_t le_dva; /* dva of buffer */ + uint64_t le_birth; /* birth txg of buffer */ + /* + * le_prop has the following format: + * * logical size (in bytes) + * * physical (compressed) size (in bytes) + * * compression algorithm + * * object type (used to restore arc_buf_contents_t) + * * protected status (used for encryption) + * * prefetch status (used in l2arc_read_done()) + */ + uint64_t le_prop; + uint64_t le_daddr; /* buf location on l2dev */ + uint64_t le_complevel; + /* + * We pad the size of each entry to a power of 2 so that the size of + * l2arc_log_blk_phys_t is power-of-2 aligned with SPA_MINBLOCKSHIFT, + * because of the L2ARC_SET_*SIZE macros. + */ + const uint64_t le_pad[2]; /* pad to 64 bytes */ +} l2arc_log_ent_phys_t; + +#define L2ARC_LOG_BLK_MAX_ENTRIES (1022) + +/* + * A log block of up to 1022 ARC buffer log entries, chained into the + * persistent L2ARC metadata linked list. Byte order of magic determines + * whether 64-bit bswap of fields is necessary. + */ +typedef struct l2arc_log_blk_phys { + uint64_t lb_magic; /* L2ARC_LOG_BLK_MAGIC */ + /* + * There are 2 chains (headed by dh_start_lbps[2]), and this field + * points back to the previous block in this chain. We alternate + * which chain we append to, so they are time-wise and offset-wise + * interleaved, but that is an optimization rather than for + * correctness. + */ + l2arc_log_blkptr_t lb_prev_lbp; /* pointer to prev log block */ + /* + * Pad header section to 128 bytes + */ + uint64_t lb_pad[7]; + /* Payload */ + l2arc_log_ent_phys_t lb_entries[L2ARC_LOG_BLK_MAX_ENTRIES]; +} l2arc_log_blk_phys_t; /* 64K total */ + +/* + * The size of l2arc_log_blk_phys_t has to be power-of-2 aligned with + * SPA_MINBLOCKSHIFT because of L2BLK_SET_*SIZE macros. + */ +CTASSERT_GLOBAL(IS_P2ALIGNED(sizeof (l2arc_log_blk_phys_t), + 1ULL << SPA_MINBLOCKSHIFT)); +CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) >= SPA_MINBLOCKSIZE); +CTASSERT_GLOBAL(sizeof (l2arc_log_blk_phys_t) <= SPA_MAXBLOCKSIZE); + +/* + * These structures hold in-flight abd buffers for log blocks as they're being + * written to the L2ARC device. + */ +typedef struct l2arc_lb_abd_buf { + abd_t *abd; + list_node_t node; +} l2arc_lb_abd_buf_t; + +/* + * These structures hold pointers to log blocks present on the L2ARC device. 
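+ * They are chained per device on the l2ad_lbptr_list of l2arc_dev_t, + * defined below.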
+ */ +typedef struct l2arc_lb_ptr_buf { + l2arc_log_blkptr_t *lb_ptr; + list_node_t node; +} l2arc_lb_ptr_buf_t; + +/* Macros for setting fields in le_prop and lbp_prop */ +#define L2BLK_GET_LSIZE(field) \ + BF64_GET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_LSIZE(field, x) \ + BF64_SET_SB((field), 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_PSIZE(field) \ + BF64_GET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1) +#define L2BLK_SET_PSIZE(field, x) \ + BF64_SET_SB((field), 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x) +#define L2BLK_GET_COMPRESS(field) \ + BF64_GET((field), 32, SPA_COMPRESSBITS) +#define L2BLK_SET_COMPRESS(field, x) \ + BF64_SET((field), 32, SPA_COMPRESSBITS, x) +#define L2BLK_GET_PREFETCH(field) BF64_GET((field), 39, 1) +#define L2BLK_SET_PREFETCH(field, x) BF64_SET((field), 39, 1, x) +#define L2BLK_GET_CHECKSUM(field) BF64_GET((field), 40, 8) +#define L2BLK_SET_CHECKSUM(field, x) BF64_SET((field), 40, 8, x) +#define L2BLK_GET_TYPE(field) BF64_GET((field), 48, 8) +#define L2BLK_SET_TYPE(field, x) BF64_SET((field), 48, 8, x) +#define L2BLK_GET_PROTECTED(field) BF64_GET((field), 56, 1) +#define L2BLK_SET_PROTECTED(field, x) BF64_SET((field), 56, 1, x) + +#define PTR_SWAP(x, y) \ + do { \ + void *tmp = (x);\ + x = y; \ + y = tmp; \ + _NOTE(CONSTCOND)\ + } while (0) + +#define L2ARC_DEV_HDR_MAGIC 0x5a46534341434845LLU /* ASCII: "ZFSCACHE" */ +#define L2ARC_LOG_BLK_MAGIC 0x4c4f47424c4b4844LLU /* ASCII: "LOGBLKHD" */ + +/* + * L2ARC Internals + */ +typedef struct l2arc_dev { + vdev_t *l2ad_vdev; /* vdev */ + spa_t *l2ad_spa; /* spa */ + uint64_t l2ad_hand; /* next write location */ + uint64_t l2ad_start; /* first addr on device */ + uint64_t l2ad_end; /* last addr on device */ + boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ + kmutex_t l2ad_mtx; /* lock for buffer list */ + list_t l2ad_buflist; /* buffer list */ + list_node_t l2ad_node; /* device list node */ + zfs_refcount_t l2ad_alloc; /* allocated bytes */ + /* + * Persistence-related stuff + */ + l2arc_dev_hdr_phys_t *l2ad_dev_hdr; /* persistent device header */ + uint64_t l2ad_dev_hdr_asize; /* aligned hdr size */ + l2arc_log_blk_phys_t l2ad_log_blk; /* currently open log block */ + int l2ad_log_ent_idx; /* index into cur log blk */ + /* Number of bytes in current log block's payload */ + uint64_t l2ad_log_blk_payload_asize; + /* + * Offset (in bytes) of the first buffer in current log block's + * payload. + */ + uint64_t l2ad_log_blk_payload_start; + /* Flag indicating whether a rebuild is scheduled or is going on */ + boolean_t l2ad_rebuild; + boolean_t l2ad_rebuild_cancel; + boolean_t l2ad_rebuild_began; + uint64_t l2ad_log_entries; /* entries per log blk */ + uint64_t l2ad_evict; /* evicted offset in bytes */ + /* List of pointers to log blocks present in the L2ARC device */ + list_t l2ad_lbptr_list; + /* + * Aligned size of all log blocks as accounted by vdev_space_update(). + */ + zfs_refcount_t l2ad_lb_asize; + /* + * Number of log blocks present on the device. + */ + zfs_refcount_t l2ad_lb_count; + boolean_t l2ad_trim_all; /* TRIM whole device */ +} l2arc_dev_t; + +/* + * Encrypted blocks will need to be stored encrypted on the L2ARC + * disk as they appear in the main pool. In order for this to work we + * need to pass around the encryption parameters so they can be used + * to write data to the L2ARC. 
This struct is only defined in the + * arc_buf_hdr_t if the L1 header is defined and has the ARC_FLAG_PROTECTED + * flag set. + */ +typedef struct arc_buf_hdr_crypt { + abd_t *b_rabd; /* raw encrypted data */ + dmu_object_type_t b_ot; /* object type */ + uint32_t b_ebufcnt; /* count of encrypted buffers */ + + /* dsobj for looking up encryption key for l2arc encryption */ + uint64_t b_dsobj; + + /* encryption parameters */ + uint8_t b_salt[ZIO_DATA_SALT_LEN]; + uint8_t b_iv[ZIO_DATA_IV_LEN]; + + /* + * Technically this could be removed since we will always be able to + * get the mac from the bp when we need it. However, it is inconvenient + * for callers of arc code to have to pass a bp in all the time. This + * also allows us to assert that L2ARC data is properly encrypted to + * match the data in the main storage pool. + */ + uint8_t b_mac[ZIO_DATA_MAC_LEN]; +} arc_buf_hdr_crypt_t; + +typedef struct l2arc_buf_hdr { + /* protected by arc_buf_hdr mutex */ + l2arc_dev_t *b_dev; /* L2ARC device */ + uint64_t b_daddr; /* disk address, offset byte */ + uint32_t b_hits; + list_node_t b_l2node; +} l2arc_buf_hdr_t; + +typedef struct l2arc_write_callback { + l2arc_dev_t *l2wcb_dev; /* device info */ + arc_buf_hdr_t *l2wcb_head; /* head of write buflist */ + /* in-flight list of log blocks */ + list_t l2wcb_abd_list; +} l2arc_write_callback_t; + +struct arc_buf_hdr { + /* protected by hash lock */ + dva_t b_dva; + uint64_t b_birth; + + arc_buf_contents_t b_type; + uint8_t b_complevel; + uint8_t b_reserved1; /* used for 4 byte alignment */ + uint16_t b_reserved2; /* used for 4 byte alignment */ + arc_buf_hdr_t *b_hash_next; + arc_flags_t b_flags; + + /* + * This field stores the size of the data buffer after + * compression, and is set in the arc's zio completion handlers. + * It is in units of SPA_MINBLOCKSIZE (e.g. 1 == 512 bytes). + * + * While the block pointers can store up to 32MB in their psize + * field, we can only store up to 32MB minus 512B. This is due + * to the bp using a bias of 1, whereas we use a bias of 0 (i.e. + * a field of zeros represents 512B in the bp). We can't use a + * bias of 1 since we need to reserve a psize of zero, here, to + * represent holes and embedded blocks. + * + * This isn't a problem in practice, since the maximum size of a + * buffer is limited to 16MB, so we never need to store 32MB in + * this field. Even in the upstream illumos code base, the + * maximum size of a buffer is limited to 16MB. + */ + uint16_t b_psize; + + /* + * This field stores the size of the data buffer before + * compression, and cannot change once set. It is in units + * of SPA_MINBLOCKSIZE (e.g. 2 == 1024 bytes) + */ + uint16_t b_lsize; /* immutable */ + uint64_t b_spa; /* immutable */ + + /* L2ARC fields. Undefined when not in L2ARC. */ + l2arc_buf_hdr_t b_l2hdr; + /* L1ARC fields. Undefined when in l2arc_only state */ + l1arc_buf_hdr_t b_l1hdr; + /* + * Encryption parameters. Defined only when ARC_FLAG_PROTECTED + * is set and the L1 header exists.
+ */ + arc_buf_hdr_crypt_t b_crypt_hdr; +}; + +typedef struct arc_stats { + kstat_named_t arcstat_hits; + kstat_named_t arcstat_misses; + kstat_named_t arcstat_demand_data_hits; + kstat_named_t arcstat_demand_data_misses; + kstat_named_t arcstat_demand_metadata_hits; + kstat_named_t arcstat_demand_metadata_misses; + kstat_named_t arcstat_prefetch_data_hits; + kstat_named_t arcstat_prefetch_data_misses; + kstat_named_t arcstat_prefetch_metadata_hits; + kstat_named_t arcstat_prefetch_metadata_misses; + kstat_named_t arcstat_mru_hits; + kstat_named_t arcstat_mru_ghost_hits; + kstat_named_t arcstat_mfu_hits; + kstat_named_t arcstat_mfu_ghost_hits; + kstat_named_t arcstat_deleted; + /* + * Number of buffers that could not be evicted because the hash lock + * was held by another thread. The lock may not necessarily be held + * by something using the same buffer, since hash locks are shared + * by multiple buffers. + */ + kstat_named_t arcstat_mutex_miss; + /* + * Number of buffers skipped when updating the access state due to the + * header having already been released after acquiring the hash lock. + */ + kstat_named_t arcstat_access_skip; + /* + * Number of buffers skipped because they have I/O in progress, are + * indirect prefetch buffers that have not lived long enough, or are + * not from the spa we're trying to evict from. + */ + kstat_named_t arcstat_evict_skip; + /* + * Number of times arc_evict_state() was unable to evict enough + * buffers to reach its target amount. + */ + kstat_named_t arcstat_evict_not_enough; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; + kstat_named_t arcstat_evict_l2_skip; + kstat_named_t arcstat_hash_elements; + kstat_named_t arcstat_hash_elements_max; + kstat_named_t arcstat_hash_collisions; + kstat_named_t arcstat_hash_chains; + kstat_named_t arcstat_hash_chain_max; + kstat_named_t arcstat_p; + kstat_named_t arcstat_c; + kstat_named_t arcstat_c_min; + kstat_named_t arcstat_c_max; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_size; + /* + * Number of compressed bytes stored in the arc_buf_hdr_t's b_pabd. + * Note that the compressed bytes may match the uncompressed bytes + * if the block is either not compressed or compressed arc is disabled. + */ + kstat_named_t arcstat_compressed_size; + /* + * Uncompressed size of the data stored in b_pabd. If compressed + * arc is disabled then this value will be identical to the stat + * above. + */ + kstat_named_t arcstat_uncompressed_size; + /* + * Number of bytes stored in all the arc_buf_t's. This is classified + * as "overhead" since this data is typically short-lived and will + * be evicted from the arc when it becomes unreferenced unless the + * zfs_keep_uncompressed_metadata or zfs_keep_uncompressed_level + * values have been set (see comment in dbuf.c for more information). + */ + kstat_named_t arcstat_overhead_size; + /* + * Number of bytes consumed by internal ARC structures necessary + * for tracking purposes; these structures are not actually + * backed by ARC buffers. This includes arc_buf_hdr_t structures + * (allocated via arc_buf_hdr_t_full and arc_buf_hdr_t_l2only + * caches), and arc_buf_t structures (allocated via arc_buf_t + * cache). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_hdr_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_DATA. 
This is generally consumed by buffers backing + * on disk user data (e.g. plain file contents). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_data_size; + /* + * Number of bytes consumed by ARC buffers of type equal to + * ARC_BUFC_METADATA. This is generally consumed by buffers + * backing on disk data that is used for internal ZFS + * structures (e.g. ZAP, dnode, indirect blocks, etc). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_metadata_size; + /* + * Number of bytes consumed by dmu_buf_impl_t objects. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_dbuf_size; + /* + * Number of bytes consumed by dnode_t objects. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_dnode_size; + /* + * Number of bytes consumed by bonus buffers. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_bonus_size; +#if defined(COMPAT_FREEBSD11) + /* + * Sum of the previous three counters, provided for compatibility. + */ + kstat_named_t arcstat_other_size; +#endif + + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_anon state. This includes *all* buffers in the arc_anon + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_anon state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_anon_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mru state. This includes *all* buffers in the arc_mru + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_size; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_DATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_data; + /* + * Number of bytes consumed by ARC buffers that meet the + * following criteria: backing buffers of type ARC_BUFC_METADATA, + * residing in the arc_mru state, and are eligible for eviction + * (e.g. have no outstanding holds on the buffer). + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mru_ghost state. The key thing to note + * here, is the fact that this size doesn't actually indicate + * RAM consumption. 
The ghost lists only consist of headers and + * don't actually have ARC buffers linked off of these headers. + * Thus, *if* the headers had associated ARC buffers, these + * buffers *would have* consumed this number of bytes. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mru_ghost_evictable_metadata; + /* + * Total number of bytes consumed by ARC buffers residing in the + * arc_mfu state. This includes *all* buffers in the arc_mfu + * state; e.g. data, metadata, evictable, and unevictable buffers + * are all included in this value. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_size; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_DATA, and reside in the arc_mfu + * state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_data; + /* + * Number of bytes consumed by ARC buffers that are eligible for + * eviction, of type ARC_BUFC_METADATA, and reside in the + * arc_mfu state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_evictable_metadata; + /* + * Total number of bytes that *would have been* consumed by ARC + * buffers in the arc_mfu_ghost state. See the comment above + * arcstat_mru_ghost_size for more details. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_size; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_DATA, and linked off the arc_mfu_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_data; + /* + * Number of bytes that *would have been* consumed by ARC + * buffers that are eligible for eviction, of type + * ARC_BUFC_METADATA, and linked off the arc_mru_ghost state. + * Not updated directly; only synced in arc_kstat_update. + */ + kstat_named_t arcstat_mfu_ghost_evictable_metadata; + kstat_named_t arcstat_l2_hits; + kstat_named_t arcstat_l2_misses; + kstat_named_t arcstat_l2_feeds; + kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; + kstat_named_t arcstat_l2_writes_sent; + kstat_named_t arcstat_l2_writes_done; + kstat_named_t arcstat_l2_writes_error; + kstat_named_t arcstat_l2_writes_lock_retry; + kstat_named_t arcstat_l2_evict_lock_retry; + kstat_named_t arcstat_l2_evict_reading; + kstat_named_t arcstat_l2_evict_l1cached; + kstat_named_t arcstat_l2_free_on_write; + kstat_named_t arcstat_l2_abort_lowmem; + kstat_named_t arcstat_l2_cksum_bad; + kstat_named_t arcstat_l2_io_error; + kstat_named_t arcstat_l2_lsize; + kstat_named_t arcstat_l2_psize; + /* Not updated directly; only synced in arc_kstat_update. */ + kstat_named_t arcstat_l2_hdr_size; + /* + * Number of L2ARC log blocks written. 
These are used for restoring the + * L2ARC. Updated during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_log_blk_writes; + /* + * Moving average of the aligned size of the L2ARC log blocks, in + * bytes. Updated during L2ARC rebuild and during writing of L2ARC + * log blocks. + */ + kstat_named_t arcstat_l2_log_blk_avg_asize; + /* Aligned size of L2ARC log blocks on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_asize; + /* Number of L2ARC log blocks present on L2ARC devices. */ + kstat_named_t arcstat_l2_log_blk_count; + /* + * Moving average of the aligned size of L2ARC restored data, in bytes, + * to the aligned size of their metadata in L2ARC, in bytes. + * Updated during L2ARC rebuild and during writing of L2ARC log blocks. + */ + kstat_named_t arcstat_l2_data_to_meta_ratio; + /* + * Number of times the L2ARC rebuild was successful for an L2ARC device. + */ + kstat_named_t arcstat_l2_rebuild_success; + /* + * Number of times the L2ARC rebuild failed because the device header + * was in an unsupported format or corrupted. + */ + kstat_named_t arcstat_l2_rebuild_abort_unsupported; + /* + * Number of times the L2ARC rebuild failed because of IO errors + * while reading a log block. + */ + kstat_named_t arcstat_l2_rebuild_abort_io_errors; + /* + * Number of times the L2ARC rebuild failed because of IO errors when + * reading the device header. + */ + kstat_named_t arcstat_l2_rebuild_abort_dh_errors; + /* + * Number of L2ARC log blocks which failed to be restored due to + * checksum errors. + */ + kstat_named_t arcstat_l2_rebuild_abort_cksum_lb_errors; + /* + * Number of times the L2ARC rebuild was aborted due to low system + * memory. + */ + kstat_named_t arcstat_l2_rebuild_abort_lowmem; + /* Logical size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_size; + /* Aligned size of L2ARC restored data, in bytes. */ + kstat_named_t arcstat_l2_rebuild_asize; + /* + * Number of L2ARC log entries (buffers) that were successfully + * restored in ARC. + */ + kstat_named_t arcstat_l2_rebuild_bufs; + /* + * Number of L2ARC log entries (buffers) already cached in ARC. These + * were not restored again. + */ + kstat_named_t arcstat_l2_rebuild_bufs_precached; + /* + * Number of L2ARC log blocks that were restored successfully. Each + * log block may hold up to L2ARC_LOG_BLK_MAX_ENTRIES buffers. + */ + kstat_named_t arcstat_l2_rebuild_log_blks; + kstat_named_t arcstat_memory_throttle_count; + kstat_named_t arcstat_memory_direct_count; + kstat_named_t arcstat_memory_indirect_count; + kstat_named_t arcstat_memory_all_bytes; + kstat_named_t arcstat_memory_free_bytes; + kstat_named_t arcstat_memory_available_bytes; + kstat_named_t arcstat_no_grow; + kstat_named_t arcstat_tempreserve; + kstat_named_t arcstat_loaned_bytes; + kstat_named_t arcstat_prune; + /* Not updated directly; only synced in arc_kstat_update. 
*/ + kstat_named_t arcstat_meta_used; + kstat_named_t arcstat_meta_limit; + kstat_named_t arcstat_dnode_limit; + kstat_named_t arcstat_meta_max; + kstat_named_t arcstat_meta_min; + kstat_named_t arcstat_async_upgrade_sync; + kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; + kstat_named_t arcstat_need_free; + kstat_named_t arcstat_sys_free; + kstat_named_t arcstat_raw_size; + kstat_named_t arcstat_cached_only_in_progress; + kstat_named_t arcstat_abd_chunk_waste_size; +} arc_stats_t; + +typedef struct arc_evict_waiter { + list_node_t aew_node; + kcondvar_t aew_cv; + uint64_t aew_count; +} arc_evict_waiter_t; + +#define ARCSTAT(stat) (arc_stats.stat.value.ui64) + +#define ARCSTAT_INCR(stat, val) \ + atomic_add_64(&arc_stats.stat.value.ui64, (val)) + +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) + +#define arc_no_grow ARCSTAT(arcstat_no_grow) /* do not grow cache size */ +#define arc_p ARCSTAT(arcstat_p) /* target size of MRU */ +#define arc_c ARCSTAT(arcstat_c) /* target size of cache */ +#define arc_c_min ARCSTAT(arcstat_c_min) /* min target cache size */ +#define arc_c_max ARCSTAT(arcstat_c_max) /* max target cache size */ +#define arc_sys_free ARCSTAT(arcstat_sys_free) /* target system free bytes */ + +extern taskq_t *arc_prune_taskq; +extern arc_stats_t arc_stats; +extern hrtime_t arc_growtime; +extern boolean_t arc_warm; +extern int arc_grow_retry; +extern int arc_no_grow_shift; +extern int arc_shrink_shift; +extern kmutex_t arc_prune_mtx; +extern list_t arc_prune_list; +extern aggsum_t arc_size; +extern arc_state_t *arc_mfu; +extern arc_state_t *arc_mru; +extern uint_t zfs_arc_pc_percent; +extern int arc_lotsfree_percent; +extern unsigned long zfs_arc_min; +extern unsigned long zfs_arc_max; + +extern void arc_reduce_target_size(int64_t to_free); +extern boolean_t arc_reclaim_needed(void); +extern void arc_kmem_reap_soon(void); +extern boolean_t arc_is_overflowing(void); +extern void arc_wait_for_eviction(uint64_t); + +extern void arc_lowmem_init(void); +extern void arc_lowmem_fini(void); +extern void arc_prune_async(int64_t); +extern int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg); +extern uint64_t arc_free_memory(void); +extern int64_t arc_available_memory(void); +extern void arc_tuning_update(boolean_t); + +extern int param_set_arc_long(ZFS_MODULE_PARAM_ARGS); +extern int param_set_arc_int(ZFS_MODULE_PARAM_ARGS); + +/* used in zdb.c */ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); + +/* used in vdev_trim.c */ +void l2arc_dev_hdr_update(l2arc_dev_t *dev); +l2arc_dev_t *l2arc_vdev_get(vdev_t *vd); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ARC_IMPL_H */ diff --git a/include/sys/avl.h b/include/sys/avl.h new file mode 100644 index 000000000000..ed3c6f86a568 --- /dev/null +++ b/include/sys/avl.h @@ -0,0 +1,325 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _AVL_H +#define _AVL_H + +/* + * This is a private header file. Applications should not directly include + * this file. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/types.h> +#include <sys/stdint.h> + +/* + * This is a generic implementation of AVL trees for use in the Solaris kernel. + * The interfaces provide an efficient way of implementing an ordered set of + * data structures. + * + * AVL trees provide an alternative to using an ordered linked list. Using AVL + * trees will usually be faster, however they require more storage. An ordered + * linked list in general requires 2 pointers in each data structure. The + * AVL tree implementation uses 3 pointers. The following chart gives the + * approximate performance of operations with the different approaches: + * + * Operation Link List AVL tree + * --------- -------- -------- + * lookup O(n) O(log(n)) + * + * insert 1 node constant constant + * + * delete 1 node constant between constant and O(log(n)) + * + * delete all nodes O(n) O(n) + * + * visit the next + * or prev node constant between constant and O(log(n)) + * + * + * The data structure nodes are anchored at an "avl_tree_t" (the equivalent + * of a list header) and the individual nodes will have a field of + * type "avl_node_t" (corresponding to list pointers). + * + * The type "avl_index_t" is used to indicate a position in the list for + * certain calls. + * + * The usage scenario is generally: + * + * 1. Create the list/tree with: avl_create() + * + * followed by any mixture of: + * + * 2a. Insert nodes with: avl_add(), or avl_find() and avl_insert() + * + * 2b. Visit elements with: + * avl_first() - returns the lowest valued node + * avl_last() - returns the highest valued node + * AVL_NEXT() - given a node go to next higher one + * AVL_PREV() - given a node go to previous lower one + * + * 2c. Find the node with the closest value either less than or greater + * than a given value with avl_nearest(). + * + * 2d. Remove individual nodes from the list/tree with avl_remove(). + * + * and finally when the list is being destroyed + * + * 3. Use avl_destroy_nodes() to quickly process/free up any remaining nodes. + * Note that once you use avl_destroy_nodes(), you can no longer + * use any routine except avl_destroy_nodes() and avl_destroy(). + * + * 4. Use avl_destroy() to destroy the AVL tree itself. + * + * Any locking for multiple thread access is up to the user to provide, just + * as is needed for any linked list implementation. + */ + +/* + * AVL comparator helpers + */ +#define TREE_ISIGN(a) (((a) > 0) - ((a) < 0)) +#define TREE_CMP(a, b) (((a) > (b)) - ((a) < (b))) +#define TREE_PCMP(a, b) \ + (((uintptr_t)(a) > (uintptr_t)(b)) - ((uintptr_t)(a) < (uintptr_t)(b))) + +/* + * Type used for the root of the AVL tree. + */ +typedef struct avl_tree avl_tree_t; + +/* + * The data nodes in the AVL tree must have a field of this type. + */ +typedef struct avl_node avl_node_t; + +/* + * An opaque type used to locate a position in the tree where a node + * would be inserted. + */ +typedef uintptr_t avl_index_t; + + +/* + * Direction constants used for avl_nearest().
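+ * See the avl_nearest() EXAMPLE further below for the typical use of + * AVL_BEFORE after a failed avl_find().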
+ */ +#define AVL_BEFORE (0) +#define AVL_AFTER (1) + + +/* + * Prototypes + * + * Where not otherwise mentioned, "void *" arguments are a pointer to the + * user data structure which must contain a field of type avl_node_t. + * + * Also assume the user data structures looks like: + * struct my_type { + * ... + * avl_node_t my_link; + * ... + * }; + */ + +/* + * Initialize an AVL tree. Arguments are: + * + * tree - the tree to be initialized + * compar - function to compare two nodes, it must return exactly: -1, 0, or +1 + * -1 for <, 0 for ==, and +1 for > + * size - the value of sizeof(struct my_type) + * offset - the value of OFFSETOF(struct my_type, my_link) + */ +extern void avl_create(avl_tree_t *tree, + int (*compar) (const void *, const void *), size_t size, size_t offset); + + +/* + * Find a node with a matching value in the tree. Returns the matching node + * found. If not found, it returns NULL and then if "where" is not NULL it sets + * "where" for use with avl_insert() or avl_nearest(). + * + * node - node that has the value being looked for + * where - position for use with avl_nearest() or avl_insert(), may be NULL + */ +extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); + +/* + * Insert a node into the tree. + * + * node - the node to insert + * where - position as returned from avl_find() + */ +extern void avl_insert(avl_tree_t *tree, void *node, avl_index_t where); + +/* + * Insert "new_data" in "tree" in the given "direction" either after + * or before the data "here". + * + * This might be useful for avl clients caching recently accessed + * data to avoid doing avl_find() again for insertion. + * + * new_data - new data to insert + * here - existing node in "tree" + * direction - either AVL_AFTER or AVL_BEFORE the data "here". + */ +extern void avl_insert_here(avl_tree_t *tree, void *new_data, void *here, + int direction); + + +/* + * Return the first or last valued node in the tree. Will return NULL + * if the tree is empty. + * + */ +extern void *avl_first(avl_tree_t *tree); +extern void *avl_last(avl_tree_t *tree); + + +/* + * Return the next or previous valued node in the tree. + * AVL_NEXT() will return NULL if at the last node. + * AVL_PREV() will return NULL if at the first node. + * + * node - the node from which the next or previous node is found + */ +#define AVL_NEXT(tree, node) avl_walk(tree, node, AVL_AFTER) +#define AVL_PREV(tree, node) avl_walk(tree, node, AVL_BEFORE) + + +/* + * Find the node with the nearest value either greater or less than + * the value from a previous avl_find(). Returns the node or NULL if + * there isn't a matching one. + * + * where - position as returned from avl_find() + * direction - either AVL_BEFORE or AVL_AFTER + * + * EXAMPLE get the greatest node that is less than a given value: + * + * avl_tree_t *tree; + * struct my_data look_for_value = {....}; + * struct my_data *node; + * struct my_data *less; + * avl_index_t where; + * + * node = avl_find(tree, &look_for_value, &where); + * if (node != NULL) + * less = AVL_PREV(tree, node); + * else + * less = avl_nearest(tree, where, AVL_BEFORE); + */ +extern void *avl_nearest(avl_tree_t *tree, avl_index_t where, int direction); + + +/* + * Add a single node to the tree. + * The node must not be in the tree, and it must not + * compare equal to any other node already in the tree. + * + * node - the node to add + */ +extern void avl_add(avl_tree_t *tree, void *node); + + +/* + * Remove a single node from the tree. The node must be in the tree. 
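+ * (The node is typically located with avl_find() first.)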
+ * + * node - the node to remove + */ +extern void avl_remove(avl_tree_t *tree, void *node); + +/* + * Reinsert a node only if its order has changed relative to its nearest + * neighbors. To optimize performance avl_update_lt() checks only the previous + * node and avl_update_gt() checks only the next node. Use avl_update_lt() and + * avl_update_gt() only if you know the direction in which the order of the + * node may change. + */ +extern boolean_t avl_update(avl_tree_t *, void *); +extern boolean_t avl_update_lt(avl_tree_t *, void *); +extern boolean_t avl_update_gt(avl_tree_t *, void *); + +/* + * Swaps the contents of the two trees. + */ +extern void avl_swap(avl_tree_t *tree1, avl_tree_t *tree2); + +/* + * Return the number of nodes in the tree + */ +extern ulong_t avl_numnodes(avl_tree_t *tree); + +/* + * Return B_TRUE if there are zero nodes in the tree, B_FALSE otherwise. + */ +extern boolean_t avl_is_empty(avl_tree_t *tree); + +/* + * Used to destroy any remaining nodes in a tree. The cookie argument should + * be initialized to NULL before the first call. Returns a node that has been + * removed from the tree and may be free()'d. Returns NULL when the tree is + * empty. + * + * Once you call avl_destroy_nodes(), you can only continue calling it and + * finally avl_destroy(). No other AVL routines will be valid. + * + * cookie - a "void *" used to save state between calls to avl_destroy_nodes() + * + * EXAMPLE: + * avl_tree_t *tree; + * struct my_data *node; + * void *cookie; + * + * cookie = NULL; + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + */ +extern void *avl_destroy_nodes(avl_tree_t *tree, void **cookie); + + +/* + * Final destroy of an AVL tree. Arguments are: + * + * tree - the empty tree to destroy + */ +extern void avl_destroy(avl_tree_t *tree); + + + +#ifdef __cplusplus +} +#endif + +#endif /* _AVL_H */ diff --git a/include/sys/avl_impl.h b/include/sys/avl_impl.h new file mode 100644 index 000000000000..fddf76906dee --- /dev/null +++ b/include/sys/avl_impl.h @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AVL_IMPL_H +#define _AVL_IMPL_H + + + +/* + * This is a private header file. Applications should not directly include + * this file.
+ */ + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + + +/* + * generic AVL tree implementation for kernel use + * + * There are 5 pieces of information stored for each node in an AVL tree + * + * pointer to less than child + * pointer to greater than child + * a pointer to the parent of this node + * an indication [0/1] of which child I am of my parent + * a "balance" (-1, 0, +1) indicating which child tree is taller + * + * Since they only need 3 bits, the last two fields are packed into the + * bottom bits of the parent pointer on 64 bit machines to save on space. + */ + +#ifndef _LP64 + +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children */ + struct avl_node *avl_parent; /* this node's parent */ + unsigned short avl_child_index; /* my index in parent's avl_child[] */ + short avl_balance; /* balance value: -1, 0, +1 */ +}; + +#define AVL_XPARENT(n) ((n)->avl_parent) +#define AVL_SETPARENT(n, p) ((n)->avl_parent = (p)) + +#define AVL_XCHILD(n) ((n)->avl_child_index) +#define AVL_SETCHILD(n, c) ((n)->avl_child_index = (unsigned short)(c)) + +#define AVL_XBALANCE(n) ((n)->avl_balance) +#define AVL_SETBALANCE(n, b) ((n)->avl_balance = (short)(b)) + +#else /* _LP64 */ + +/* + * for 64 bit machines, avl_pcb contains parent pointer, balance and child_index + * values packed in the following manner: + * + * |63 3| 2 |1 0 | + * |-------------------------------------|-----------------|-------------| + * | avl_parent hi order bits | avl_child_index | avl_balance | + * | | | + 1 | + * |-------------------------------------|-----------------|-------------| + * + */ +struct avl_node { + struct avl_node *avl_child[2]; /* left/right children nodes */ + uintptr_t avl_pcb; /* parent, child_index, balance */ +}; + +/* + * macros to extract/set fields in avl_pcb + * + * pointer to the parent of the current node is the high order bits + */ +#define AVL_XPARENT(n) ((struct avl_node *)((n)->avl_pcb & ~7)) +#define AVL_SETPARENT(n, p) \ + ((n)->avl_pcb = (((n)->avl_pcb & 7) | (uintptr_t)(p))) + +/* + * index of this node in its parent's avl_child[]: bit #2 + */ +#define AVL_XCHILD(n) (((n)->avl_pcb >> 2) & 1) +#define AVL_SETCHILD(n, c) \ + ((n)->avl_pcb = (uintptr_t)(((n)->avl_pcb & ~4) | ((c) << 2))) + +/* + * balance indication for a node, lowest 2 bits. A valid balance is + * -1, 0, or +1, and is encoded by adding 1 to the value to get the + * unsigned values of 0, 1, 2. + */ +#define AVL_XBALANCE(n) ((int)(((n)->avl_pcb & 3) - 1)) +#define AVL_SETBALANCE(n, b) \ + ((n)->avl_pcb = (uintptr_t)((((n)->avl_pcb & ~3) | ((b) + 1)))) + +#endif /* _LP64 */ + + + +/* + * switch between a node and data pointer for a given tree + * the value of "o" is tree->avl_offset + */ +#define AVL_NODE2DATA(n, o) ((void *)((uintptr_t)(n) - (o))) +#define AVL_DATA2NODE(d, o) ((struct avl_node *)((uintptr_t)(d) + (o))) + + + +/* + * macros used to create/access an avl_index_t + */ +#define AVL_INDEX2NODE(x) ((avl_node_t *)((x) & ~1)) +#define AVL_INDEX2CHILD(x) ((x) & 1) +#define AVL_MKINDEX(n, c) ((avl_index_t)(n) | (c)) + + +/* + * The tree structure. The fields avl_root, avl_compar, and avl_offset come + * first since they are needed for avl_find(). We want them to fit into + * a single 64 byte cache line to make avl_find() as fast as possible.
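+ * (Illustrative arithmetic: on LP64 those first three fields occupy 24 + * bytes, and the whole struct below 40 bytes, so it fits in one such + * line.)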
+/*
+ * The tree structure. The fields avl_root, avl_compar, and avl_offset come
+ * first since they are needed for avl_find(). We want them to fit into
+ * a single 64 byte cache line to make avl_find() as fast as possible.
+ */
+struct avl_tree {
+	struct avl_node *avl_root;	/* root node in tree */
+	int (*avl_compar)(const void *, const void *);
+	size_t avl_offset;		/* offsetof(type, avl_link_t field) */
+	ulong_t avl_numnodes;		/* number of nodes in the tree */
+	size_t avl_size;		/* sizeof user type struct */
+};
+
+/*
+ * This will only be used via AVL_NEXT() or AVL_PREV()
+ */
+extern void *avl_walk(struct avl_tree *, void *, int);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _AVL_IMPL_H */
diff --git a/include/sys/bitops.h b/include/sys/bitops.h
new file mode 100644
index 000000000000..56d52073bcc8
--- /dev/null
+++ b/include/sys/bitops.h
@@ -0,0 +1,90 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2013 Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017 Datto Inc.
+ */
+
+#ifndef _SYS_BITOPS_H
+#define _SYS_BITOPS_H
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * General-purpose 32-bit and 64-bit bitfield encodings.
+ */
+#define	BF32_DECODE(x, low, len)	P2PHASE((x) >> (low), 1U << (len))
+#define	BF64_DECODE(x, low, len)	P2PHASE((x) >> (low), 1ULL << (len))
+#define	BF32_ENCODE(x, low, len)	(P2PHASE((x), 1U << (len)) << (low))
+#define	BF64_ENCODE(x, low, len)	(P2PHASE((x), 1ULL << (len)) << (low))
+
+#define	BF32_GET(x, low, len)		BF32_DECODE(x, low, len)
+#define	BF64_GET(x, low, len)		BF64_DECODE(x, low, len)
+
+#define	BF32_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1U << (len)); \
+	ASSERT3U(low + len, <=, 32); \
+	(x) ^= BF32_ENCODE((x >> low) ^ (val), low, len); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BF64_SET(x, low, len, val) do { \
+	ASSERT3U(val, <, 1ULL << (len)); \
+	ASSERT3U(low + len, <=, 64); \
+	((x) ^= BF64_ENCODE((x >> low) ^ (val), low, len)); \
+_NOTE(CONSTCOND) } while (0)
+
+#define	BF32_GET_SB(x, low, len, shift, bias)	\
+	((BF32_GET(x, low, len) + (bias)) << (shift))
+#define	BF64_GET_SB(x, low, len, shift, bias)	\
+	((BF64_GET(x, low, len) + (bias)) << (shift))
+
+/*
+ * We use ASSERT3U instead of ASSERT in these macros to prevent a lint error in
+ * the case where val is a constant. We can't fix ASSERT because it's used as
+ * an expression in several places in the kernel; as a result, changing it to
+ * the do{} while() syntax to allow us to _NOTE the CONSTCOND is not an option.
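+ *
+ * EXAMPLE (editor's hedged sketch, not in the original header): setting and
+ * reading back a 4-bit field at bit offset 8 of a 64-bit word.
+ *
+ *	uint64_t word = 0;
+ *	BF64_SET(word, 8, 4, 0xa);
+ *	ASSERT3U(BF64_GET(word, 8, 4), ==, 0xa);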
+ */ +#define BF32_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1U << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF32_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) +#define BF64_SET_SB(x, low, len, shift, bias, val) do { \ + ASSERT3U(IS_P2ALIGNED(val, 1ULL << shift), !=, B_FALSE); \ + ASSERT3S((val) >> (shift), >=, bias); \ + BF64_SET(x, low, len, ((val) >> (shift)) - (bias)); \ +_NOTE(CONSTCOND) } while (0) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITOPS_H */ diff --git a/include/sys/blkptr.h b/include/sys/blkptr.h new file mode 100644 index 000000000000..77b1b827ac37 --- /dev/null +++ b/include/sys/blkptr.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_BLKPTR_H +#define _SYS_BLKPTR_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void encode_embedded_bp_compressed(blkptr_t *, void *, + enum zio_compress, int, int); +void decode_embedded_bp_compressed(const blkptr_t *, void *); +int decode_embedded_bp(const blkptr_t *, void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BLKPTR_H */ diff --git a/include/sys/bplist.h b/include/sys/bplist.h new file mode 100644 index 000000000000..f8deaf8437e6 --- /dev/null +++ b/include/sys/bplist.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. 
+ */
+
+#ifndef _SYS_BPLIST_H
+#define _SYS_BPLIST_H
+
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bplist_entry {
+	blkptr_t	bpe_blk;
+	list_node_t	bpe_node;
+} bplist_entry_t;
+
+typedef struct bplist {
+	kmutex_t	bpl_lock;
+	list_t		bpl_list;
+} bplist_t;
+
+typedef int bplist_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx);
+
+void bplist_create(bplist_t *bpl);
+void bplist_destroy(bplist_t *bpl);
+void bplist_append(bplist_t *bpl, const blkptr_t *bp);
+void bplist_iterate(bplist_t *bpl, bplist_itor_t *func,
+    void *arg, dmu_tx_t *tx);
+void bplist_clear(bplist_t *bpl);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_BPLIST_H */
diff --git a/include/sys/bpobj.h b/include/sys/bpobj.h
new file mode 100644
index 000000000000..16e403526cff
--- /dev/null
+++ b/include/sys/bpobj.h
@@ -0,0 +1,106 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2015, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_BPOBJ_H
+#define _SYS_BPOBJ_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+typedef struct bpobj_phys {
+	/*
+	 * This is the bonus buffer for the dead lists. The object's
+	 * contents are an array of bpo_num_blkptrs blkptr_t's, representing
+	 * a total of bpo_bytes physical space.
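+	 *
+	 * (Editor's note, not in the original header: the BPOBJ_SIZE_V0/V1/V2
+	 * macros below give the bonus-buffer sizes of the successive on-disk
+	 * layouts; an older bpobj ends after the fields its version defines,
+	 * so readers check the bonus length before using the later fields.)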
+ */ + uint64_t bpo_num_blkptrs; + uint64_t bpo_bytes; + uint64_t bpo_comp; + uint64_t bpo_uncomp; + uint64_t bpo_subobjs; + uint64_t bpo_num_subobjs; + uint64_t bpo_num_freed; +} bpobj_phys_t; + +#define BPOBJ_SIZE_V0 (2 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V1 (4 * sizeof (uint64_t)) +#define BPOBJ_SIZE_V2 (6 * sizeof (uint64_t)) + +typedef struct bpobj { + kmutex_t bpo_lock; + objset_t *bpo_os; + uint64_t bpo_object; + int bpo_epb; + uint8_t bpo_havecomp; + uint8_t bpo_havesubobj; + uint8_t bpo_havefreed; + bpobj_phys_t *bpo_phys; + dmu_buf_t *bpo_dbuf; + dmu_buf_t *bpo_cached_dbuf; +} bpobj_t; + +typedef int bpobj_itor_t(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); + +uint64_t bpobj_alloc(objset_t *mos, int blocksize, dmu_tx_t *tx); +uint64_t bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx); +void bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +void bpobj_decr_empty(objset_t *os, dmu_tx_t *tx); + +int bpobj_open(bpobj_t *bpo, objset_t *mos, uint64_t object); +void bpobj_close(bpobj_t *bpo); +boolean_t bpobj_is_open(const bpobj_t *bpo); + +int bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx); +int bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *, uint64_t *); +int livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, + void *arg, int64_t start); + +void bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx); +void bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); + +int bpobj_space(bpobj_t *bpo, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +boolean_t bpobj_is_empty(bpobj_t *bpo); + +int bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPOBJ_H */ diff --git a/include/sys/bptree.h b/include/sys/bptree.h new file mode 100644 index 000000000000..327c128bf493 --- /dev/null +++ b/include/sys/bptree.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_BPTREE_H +#define _SYS_BPTREE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct bptree_phys { + uint64_t bt_begin; + uint64_t bt_end; + uint64_t bt_bytes; + uint64_t bt_comp; + uint64_t bt_uncomp; +} bptree_phys_t; + +typedef struct bptree_entry_phys { + blkptr_t be_bp; + uint64_t be_birth_txg; /* only delete blocks born after this txg */ + zbookmark_phys_t be_zb; /* holds traversal resume point if needed */ +} bptree_entry_phys_t; + +typedef int bptree_itor_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +uint64_t bptree_alloc(objset_t *os, dmu_tx_t *tx); +int bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx); +boolean_t bptree_is_empty(objset_t *os, uint64_t obj); + +void bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx); + +int bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, + bptree_itor_t func, void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BPTREE_H */ diff --git a/include/sys/bqueue.h b/include/sys/bqueue.h new file mode 100644 index 000000000000..797aecd791a3 --- /dev/null +++ b/include/sys/bqueue.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014, 2018 by Delphix. All rights reserved. + */ + +#ifndef _BQUEUE_H +#define _BQUEUE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct bqueue { + list_t bq_list; + kmutex_t bq_lock; + kcondvar_t bq_add_cv; + kcondvar_t bq_pop_cv; + uint64_t bq_size; + uint64_t bq_maxsize; + uint64_t bq_fill_fraction; + size_t bq_node_offset; +} bqueue_t; + +typedef struct bqueue_node { + list_node_t bqn_node; + uint64_t bqn_size; +} bqueue_node_t; + + +int bqueue_init(bqueue_t *, uint64_t, uint64_t, size_t); +void bqueue_destroy(bqueue_t *); +void bqueue_enqueue(bqueue_t *, void *, uint64_t); +void bqueue_enqueue_flush(bqueue_t *, void *, uint64_t); +void *bqueue_dequeue(bqueue_t *); +boolean_t bqueue_empty(bqueue_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _BQUEUE_H */ diff --git a/include/sys/btree.h b/include/sys/btree.h new file mode 100644 index 000000000000..3b53476c7c68 --- /dev/null +++ b/include/sys/btree.h @@ -0,0 +1,243 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +#ifndef _BTREE_H +#define _BTREE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * This file defines the interface for a B-Tree implementation for ZFS. The + * tree can be used to store arbitrary sortable data types with low overhead + * and good operation performance. 
In addition, the tree intelligently
+ * optimizes bulk in-order insertions to improve memory use and performance.
+ *
+ * Note that for all B-Tree functions, the values returned are pointers to the
+ * internal copies of the data in the tree. The internal data can only be
+ * safely mutated if the changes cannot change the ordering of the element
+ * with respect to any other elements in the tree.
+ *
+ * The major drawback of the B-Tree is that any returned elements or indexes
+ * are only valid until a side-effectful operation occurs, since these can
+ * result in reallocation or relocation of data. Side-effectful operations are
+ * defined as insertion, removal, and zfs_btree_destroy_nodes.
+ *
+ * The B-Tree has two types of nodes: core nodes, and leaf nodes. Core
+ * nodes have an array of children pointing to other nodes, and an array of
+ * elements that act as separators between the elements of the subtrees rooted
+ * at its children. Leaf nodes only contain data elements, and form the bottom
+ * layer of the tree. Unlike B+ Trees, in this B-Tree implementation the
+ * elements in the core nodes are not copies of or references to leaf node
+ * elements. Each element occurs only once in the tree, no matter what kind
+ * of node it is in.
+ *
+ * The tree's height is the same throughout, unlike many other forms of search
+ * tree. Each node (except for the root) must be between half minus one and
+ * completely full of elements (and children) at all times. Any operation that
+ * would put the node outside of that range results in a rebalancing operation
+ * (taking, merging, or splitting).
+ *
+ * This tree was implemented using descriptions from Wikipedia's articles on
+ * B-Trees and B+ Trees.
+ */
+
+/*
+ * Decreasing these values results in smaller memmove operations, but more of
+ * them, and increased memory overhead. Increasing these values results in
+ * higher variance in operation time, and reduces memory overhead.
+ */
+#define	BTREE_CORE_ELEMS	128
+#define	BTREE_LEAF_SIZE		4096
+
+extern kmem_cache_t *zfs_btree_leaf_cache;
+
+typedef struct zfs_btree_hdr {
+	struct zfs_btree_core	*bth_parent;
+	boolean_t		bth_core;
+	/*
+	 * For both leaf and core nodes, represents the number of elements in
+	 * the node. A core node has bth_count + 1 children.
+	 */
+	uint32_t		bth_count;
+} zfs_btree_hdr_t;
+
+typedef struct zfs_btree_core {
+	zfs_btree_hdr_t	btc_hdr;
+	zfs_btree_hdr_t	*btc_children[BTREE_CORE_ELEMS + 1];
+	uint8_t		btc_elems[];
+} zfs_btree_core_t;
+
+typedef struct zfs_btree_leaf {
+	zfs_btree_hdr_t	btl_hdr;
+	uint8_t		btl_elems[];
+} zfs_btree_leaf_t;
+
+typedef struct zfs_btree_index {
+	zfs_btree_hdr_t	*bti_node;
+	uint64_t	bti_offset;
+	/*
+	 * True if the location is before the listed offset, false if it's at
+	 * the listed offset.
+	 */
+	boolean_t	bti_before;
+} zfs_btree_index_t;
+
+typedef struct btree {
+	zfs_btree_hdr_t		*bt_root;
+	int64_t			bt_height;
+	size_t			bt_elem_size;
+	uint64_t		bt_num_elems;
+	uint64_t		bt_num_nodes;
+	zfs_btree_leaf_t	*bt_bulk;	/* non-null if bulk loading */
+	int (*bt_compar) (const void *, const void *);
+} zfs_btree_t;
+
+/*
+ * Allocate and deallocate caches for btree nodes.
+ */
+void zfs_btree_init(void);
+void zfs_btree_fini(void);
+
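+/*
+ * EXAMPLE (editor's hedged sketch, not part of the original header):
+ * creating a tree of a user-defined type and adding/finding an element.
+ * The type and comparator names are illustrative only.
+ *
+ *	typedef struct my_elem { int me_key; } my_elem_t;
+ *
+ *	static int
+ *	my_compar(const void *a, const void *b)
+ *	{
+ *		const my_elem_t *l = a, *r = b;
+ *		return (l->me_key < r->me_key ? -1 :
+ *		    l->me_key > r->me_key ? 1 : 0);
+ *	}
+ *
+ *	zfs_btree_t bt;
+ *	my_elem_t e = { .me_key = 7 };
+ *	zfs_btree_create(&bt, my_compar, sizeof (my_elem_t));
+ *	zfs_btree_add(&bt, &e);
+ *	my_elem_t *found = zfs_btree_find(&bt, &e, NULL);
+ */
+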
+/*
+ * Initialize a B-Tree. Arguments are:
+ *
+ * tree   - the tree to be initialized
+ * compar - function to compare two nodes; it must return exactly -1, 0, or
+ *          +1: -1 for <, 0 for ==, and +1 for >
+ * size   - the value of sizeof(struct my_type)
+ */
+void zfs_btree_create(zfs_btree_t *, int (*) (const void *, const void *),
+    size_t);
+
+/*
+ * Find a node with a matching value in the tree. Returns the matching node
+ * if one is found; otherwise returns NULL and, if "where" is not NULL, sets
+ * "where" for use with zfs_btree_add_idx() or zfs_btree_nearest().
+ *
+ * node   - node that has the value being looked for
+ * where  - position for use with zfs_btree_nearest() or zfs_btree_add_idx(),
+ *          may be NULL
+ */
+void *zfs_btree_find(zfs_btree_t *, const void *, zfs_btree_index_t *);
+
+/*
+ * Insert a node into the tree.
+ *
+ * node   - the node to insert
+ * where  - position as returned from zfs_btree_find()
+ */
+void zfs_btree_add_idx(zfs_btree_t *, const void *, const zfs_btree_index_t *);
+
+/*
+ * Return the first or last valued node in the tree. Will return NULL if the
+ * tree is empty. The index can be NULL if the location of the first or last
+ * element isn't required.
+ */
+void *zfs_btree_first(zfs_btree_t *, zfs_btree_index_t *);
+void *zfs_btree_last(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the next or previous valued node in the tree. The second index can
+ * safely be NULL, if the location of the next or previous value isn't
+ * required.
+ */
+void *zfs_btree_next(zfs_btree_t *, const zfs_btree_index_t *,
+    zfs_btree_index_t *);
+void *zfs_btree_prev(zfs_btree_t *, const zfs_btree_index_t *,
+    zfs_btree_index_t *);
+
+/*
+ * Get a value from a tree and an index.
+ */
+void *zfs_btree_get(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Add a single value to the tree. The value must not compare equal to any
+ * other node already in the tree. Note that the value will be copied out, not
+ * inserted directly. It is safe to free or destroy the value once this
+ * function returns.
+ */
+void zfs_btree_add(zfs_btree_t *, const void *);
+
+/*
+ * Remove a single value from the tree. The value must be in the tree. The
+ * pointer passed in may be a pointer into a tree-controlled buffer, but it
+ * need not be.
+ */
+void zfs_btree_remove(zfs_btree_t *, const void *);
+
+/*
+ * Remove the value at the given location from the tree.
+ */
+void zfs_btree_remove_idx(zfs_btree_t *, zfs_btree_index_t *);
+
+/*
+ * Return the number of nodes in the tree
+ */
+ulong_t zfs_btree_numnodes(zfs_btree_t *);
+
+/*
+ * Used to destroy any remaining nodes in a tree. The cookie argument should
+ * be initialized to NULL before the first call. Returns a node that has been
+ * removed from the tree and may be free()'d. Returns NULL when the tree is
+ * empty.
+ *
+ * Once you call zfs_btree_destroy_nodes(), you can only continue calling it
+ * and finally zfs_btree_destroy(). No other B-Tree routines will be valid.
+ *
+ * cookie - an index used to save state between calls to
+ *          zfs_btree_destroy_nodes()
+ *
+ * EXAMPLE:
+ *	zfs_btree_t *tree;
+ *	struct my_data *node;
+ *	zfs_btree_index_t *cookie;
+ *
+ *	cookie = NULL;
+ *	while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL)
+ *		data_destroy(node);
+ *	zfs_btree_destroy(tree);
+ */
+void *zfs_btree_destroy_nodes(zfs_btree_t *, zfs_btree_index_t **);
+
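+/*
+ * EXAMPLE (editor's hedged sketch): in-order traversal with
+ * zfs_btree_first()/zfs_btree_next(), reusing one index as the cursor;
+ * "process" is an illustrative placeholder.
+ *
+ *	zfs_btree_index_t where;
+ *	void *cur;
+ *
+ *	for (cur = zfs_btree_first(&bt, &where); cur != NULL;
+ *	    cur = zfs_btree_next(&bt, &where, &where))
+ *		process(cur);
+ */
+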
+/*
+ * Destroys all nodes in the tree quickly. This doesn't give the caller an
+ * opportunity to iterate over each node and do its own cleanup; for that, use
+ * zfs_btree_destroy_nodes().
+ */
+void zfs_btree_clear(zfs_btree_t *);
+
+/*
+ * Final destroy of a B-Tree. Arguments are:
+ *
+ * tree   - the empty tree to destroy
+ */
+void zfs_btree_destroy(zfs_btree_t *tree);
+
+/* Runs a variety of self-checks on the btree to verify integrity. */
+void zfs_btree_verify(zfs_btree_t *tree);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _BTREE_H */
diff --git a/include/sys/crypto/Makefile.am b/include/sys/crypto/Makefile.am
new file mode 100644
index 000000000000..eb31f6a45743
--- /dev/null
+++ b/include/sys/crypto/Makefile.am
@@ -0,0 +1,16 @@
+COMMON_H = \
+	api.h \
+	common.h \
+	icp.h
+
+if CONFIG_USER
+libzfsdir = $(includedir)/libzfs/sys/crypto
+libzfs_HEADERS = $(COMMON_H)
+endif
+
+if CONFIG_KERNEL
+if BUILD_LINUX
+kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/crypto
+kernel_HEADERS = $(COMMON_H)
+endif
+endif
diff --git a/include/sys/crypto/api.h b/include/sys/crypto/api.h
new file mode 100644
index 000000000000..7c3c465513de
--- /dev/null
+++ b/include/sys/crypto/api.h
@@ -0,0 +1,425 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_CRYPTO_API_H
+#define _SYS_CRYPTO_API_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+#include
+
+typedef long crypto_req_id_t;
+typedef void *crypto_bc_t;
+typedef void *crypto_context_t;
+typedef void *crypto_ctx_template_t;
+
+typedef uint32_t crypto_call_flag_t;
+
+/* crypto_call_flag's values */
+#define	CRYPTO_ALWAYS_QUEUE	0x00000001	/* ALWAYS queue the req. */
+#define	CRYPTO_NOTIFY_OPDONE	0x00000002	/* Notify intermediate steps */
+#define	CRYPTO_SKIP_REQID	0x00000004	/* Skip request ID generation */
+#define	CRYPTO_RESTRICTED	0x00000008	/* cannot use restricted prov */
+
+typedef struct {
+	crypto_call_flag_t	cr_flag;
+	void			(*cr_callback_func)(void *, int);
+	void			*cr_callback_arg;
+	crypto_req_id_t		cr_reqid;
+} crypto_call_req_t;
+
+/*
+ * Returns the mechanism type corresponding to a mechanism name.
+ */
+
+#define	CRYPTO_MECH_INVALID	((uint64_t)-1)
+extern crypto_mech_type_t crypto_mech2id(crypto_mech_name_t name);
+
+/*
+ * Create and destroy context templates.
+ */
+extern int crypto_create_ctx_template(crypto_mechanism_t *mech,
+    crypto_key_t *key, crypto_ctx_template_t *tmpl, int kmflag);
+extern void crypto_destroy_ctx_template(crypto_ctx_template_t tmpl);
+
+/*
+ * Single and multi-part digest operations.
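+ *
+ * EXAMPLE (editor's hedged sketch, not in the original header): a
+ * synchronous single-part digest of a raw buffer; passing a NULL
+ * crypto_call_req_t requests synchronous completion.
+ *
+ *	crypto_mechanism_t mech = { 0 };
+ *	crypto_data_t in = { 0 }, digest = { 0 };
+ *	mech.cm_type = crypto_mech2id(SUN_CKM_SHA256);
+ *	... set cd_format = CRYPTO_DATA_RAW and the cd_raw iovecs ...
+ *	if (crypto_digest(&mech, &in, &digest, NULL) != CRYPTO_SUCCESS)
+ *		... handle the error ...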
+ */ +extern int crypto_digest(crypto_mechanism_t *mech, crypto_data_t *data, + crypto_data_t *digest, crypto_call_req_t *cr); +extern int crypto_digest_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_data_t *, + crypto_call_req_t *); +extern int crypto_digest_init(crypto_mechanism_t *mech, crypto_context_t *ctxp, + crypto_call_req_t *cr); +extern int crypto_digest_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_context_t *, crypto_call_req_t *); +extern int crypto_digest_update(crypto_context_t ctx, crypto_data_t *data, + crypto_call_req_t *cr); +extern int crypto_digest_final(crypto_context_t ctx, crypto_data_t *digest, + crypto_call_req_t *cr); + +/* + * Single and multi-part MAC operations. + */ +extern int crypto_mac(crypto_mechanism_t *mech, crypto_data_t *data, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac, + crypto_call_req_t *cr); +extern int crypto_mac_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_key_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); +extern int crypto_mac_verify(crypto_mechanism_t *mech, crypto_data_t *data, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac, + crypto_call_req_t *cr); +extern int crypto_mac_verify_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_key_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); +extern int crypto_mac_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, crypto_call_req_t *cr); +extern int crypto_mac_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_ctx_template_t, + crypto_context_t *, crypto_call_req_t *); +extern int crypto_mac_update(crypto_context_t ctx, crypto_data_t *data, + crypto_call_req_t *cr); +extern int crypto_mac_final(crypto_context_t ctx, crypto_data_t *data, + crypto_call_req_t *cr); + +/* + * Single and multi-part sign with private key operations. 
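+ *
+ * (Editor's hedged sketch of the shared multi-part calling pattern, shown
+ * with the MAC entry points declared above; error handling omitted:
+ *
+ *	crypto_context_t ctx;
+ *	crypto_mac_init(&mech, &key, tmpl, &ctx, NULL);
+ *	crypto_mac_update(ctx, &chunk1, NULL);
+ *	crypto_mac_update(ctx, &chunk2, NULL);
+ *	crypto_mac_final(ctx, &mac, NULL);
+ *
+ * The sign routines below follow the same init/update/final shape.)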
+ */ +extern int crypto_sign(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_data_t *data, crypto_ctx_template_t tmpl, + crypto_data_t *signature, crypto_call_req_t *cr); +extern int crypto_sign_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); +extern int crypto_sign_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, crypto_call_req_t *cr); +extern int crypto_sign_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_ctx_template_t, + crypto_context_t *, crypto_call_req_t *); +extern int crypto_sign_update(crypto_context_t ctx, crypto_data_t *data, + crypto_call_req_t *cr); +extern int crypto_sign_final(crypto_context_t ctx, crypto_data_t *signature, + crypto_call_req_t *cr); +extern int crypto_sign_recover_init_prov(crypto_provider_t, + crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, + crypto_ctx_template_t tmpl, crypto_context_t *, crypto_call_req_t *); +extern int crypto_sign_recover(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_data_t *data, crypto_ctx_template_t tmpl, crypto_data_t *signature, + crypto_call_req_t *cr); +extern int crypto_sign_recover_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); + +/* + * Single and multi-part verify with public key operations. + */ +extern int crypto_verify(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_data_t *data, crypto_ctx_template_t tmpl, crypto_data_t *signature, + crypto_call_req_t *cr); +extern int crypto_verify_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); +extern int crypto_verify_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, crypto_call_req_t *cr); +extern int crypto_verify_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_ctx_template_t, + crypto_context_t *, crypto_call_req_t *); +extern int crypto_verify_update(crypto_context_t ctx, crypto_data_t *data, + crypto_call_req_t *cr); +extern int crypto_verify_final(crypto_context_t ctx, crypto_data_t *signature, + crypto_call_req_t *cr); +extern int crypto_verify_recover_init_prov(crypto_provider_t, + crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, + crypto_ctx_template_t tmpl, crypto_context_t *, crypto_call_req_t *); +extern int crypto_verify_recover(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_data_t *signature, crypto_ctx_template_t tmpl, crypto_data_t *data, + crypto_call_req_t *cr); +extern int crypto_verify_recover_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); + +/* + * Single and multi-part encryption operations. 
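+ *
+ * EXAMPLE (editor's hedged sketch): a synchronous single-part encryption.
+ *
+ *	if (crypto_encrypt(&mech, &plaintext, &key, tmpl, &ciphertext,
+ *	    NULL) != CRYPTO_SUCCESS)
+ *		... handle the error ...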
+ */ +extern int crypto_encrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *ciphertext, + crypto_call_req_t *cr); +extern int crypto_encrypt_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_key_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); +extern int crypto_encrypt_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, crypto_call_req_t *cr); +extern int crypto_encrypt_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_ctx_template_t, + crypto_context_t *, crypto_call_req_t *); +extern int crypto_encrypt_update(crypto_context_t ctx, + crypto_data_t *plaintext, crypto_data_t *ciphertext, + crypto_call_req_t *cr); +extern int crypto_encrypt_final(crypto_context_t ctx, + crypto_data_t *ciphertext, crypto_call_req_t *cr); + +/* + * Single and multi-part decryption operations. + */ +extern int crypto_decrypt(crypto_mechanism_t *mech, crypto_data_t *ciphertext, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *plaintext, + crypto_call_req_t *cr); +extern int crypto_decrypt_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_key_t *, + crypto_ctx_template_t, crypto_data_t *, crypto_call_req_t *); +extern int crypto_decrypt_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, + crypto_call_req_t *cr); +extern int crypto_decrypt_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_ctx_template_t, + crypto_context_t *, crypto_call_req_t *); +extern int crypto_decrypt_update(crypto_context_t ctx, + crypto_data_t *ciphertext, crypto_data_t *plaintext, + crypto_call_req_t *cr); +extern int crypto_decrypt_final(crypto_context_t ctx, crypto_data_t *plaintext, + crypto_call_req_t *cr); + +/* + * Single and multi-part encrypt/MAC dual operations. + */ +extern int crypto_encrypt_mac(crypto_mechanism_t *encr_mech, + crypto_mechanism_t *mac_mech, crypto_data_t *pt, + crypto_key_t *encr_key, crypto_key_t *mac_key, + crypto_ctx_template_t encr_tmpl, crypto_ctx_template_t mac_tmpl, + crypto_dual_data_t *ct, crypto_data_t *mac, crypto_call_req_t *cr); +extern int crypto_encrypt_mac_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_mechanism_t *, crypto_data_t *, + crypto_key_t *, crypto_key_t *, crypto_ctx_template_t, + crypto_ctx_template_t, crypto_dual_data_t *, crypto_data_t *, + crypto_call_req_t *); +extern int crypto_encrypt_mac_init(crypto_mechanism_t *encr_mech, + crypto_mechanism_t *mac_mech, crypto_key_t *encr_key, + crypto_key_t *mac_key, crypto_ctx_template_t encr_tmpl, + crypto_ctx_template_t mac_tmpl, crypto_context_t *ctxp, + crypto_call_req_t *cr); +extern int crypto_encrypt_mac_init_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_mechanism_t *, crypto_key_t *, crypto_key_t *, + crypto_ctx_template_t, crypto_ctx_template_t, crypto_context_t *, + crypto_call_req_t *); +extern int crypto_encrypt_mac_update(crypto_context_t ctx, + crypto_data_t *pt, crypto_dual_data_t *ct, crypto_call_req_t *cr); +extern int crypto_encrypt_mac_final(crypto_context_t ctx, + crypto_dual_data_t *ct, crypto_data_t *mac, crypto_call_req_t *cr); + +/* + * Single and multi-part MAC/decrypt dual operations. 
+ */ +extern int crypto_mac_decrypt(crypto_mechanism_t *mac_mech, + crypto_mechanism_t *decr_mech, crypto_dual_data_t *ct, + crypto_key_t *mac_key, crypto_key_t *decr_key, + crypto_ctx_template_t mac_tmpl, crypto_ctx_template_t decr_tmpl, + crypto_data_t *mac, crypto_data_t *pt, crypto_call_req_t *cr); +extern int crypto_mac_decrypt_prov(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *mac_mech, crypto_mechanism_t *decr_mech, + crypto_dual_data_t *ct, crypto_key_t *mac_key, crypto_key_t *decr_key, + crypto_ctx_template_t mac_tmpl, crypto_ctx_template_t decr_tmpl, + crypto_data_t *mac, crypto_data_t *pt, crypto_call_req_t *cr); +extern int crypto_mac_verify_decrypt(crypto_mechanism_t *mac_mech, + crypto_mechanism_t *decr_mech, crypto_dual_data_t *ct, + crypto_key_t *mac_key, crypto_key_t *decr_key, + crypto_ctx_template_t mac_tmpl, crypto_ctx_template_t decr_tmpl, + crypto_data_t *mac, crypto_data_t *pt, crypto_call_req_t *cr); +extern int crypto_mac_verify_decrypt_prov(crypto_provider_t, + crypto_session_id_t, crypto_mechanism_t *mac_mech, + crypto_mechanism_t *decr_mech, crypto_dual_data_t *ct, + crypto_key_t *mac_key, crypto_key_t *decr_key, + crypto_ctx_template_t mac_tmpl, crypto_ctx_template_t decr_tmpl, + crypto_data_t *mac, crypto_data_t *pt, crypto_call_req_t *cr); +extern int crypto_mac_decrypt_init(crypto_mechanism_t *mac_mech, + crypto_mechanism_t *decr_mech, crypto_key_t *mac_key, + crypto_key_t *decr_key, crypto_ctx_template_t mac_tmpl, + crypto_ctx_template_t decr_tmpl, crypto_context_t *ctxp, + crypto_call_req_t *cr); +extern int crypto_mac_decrypt_init_prov(crypto_provider_t, + crypto_session_id_t, crypto_mechanism_t *mac_mech, + crypto_mechanism_t *decr_mech, crypto_key_t *mac_key, + crypto_key_t *decr_key, crypto_ctx_template_t mac_tmpl, + crypto_ctx_template_t decr_tmpl, crypto_context_t *ctxp, + crypto_call_req_t *cr); +extern int crypto_mac_decrypt_update(crypto_context_t ctx, + crypto_dual_data_t *ct, crypto_data_t *pt, crypto_call_req_t *cr); +extern int crypto_mac_decrypt_final(crypto_context_t ctx, crypto_data_t *mac, + crypto_data_t *pt, crypto_call_req_t *cr); + +/* Session Management */ +extern int crypto_session_open(crypto_provider_t, crypto_session_id_t *, + crypto_call_req_t *); +extern int crypto_session_close(crypto_provider_t, crypto_session_id_t, + crypto_call_req_t *); +extern int crypto_session_login(crypto_provider_t, crypto_session_id_t, + crypto_user_type_t, char *, size_t, crypto_call_req_t *); +extern int crypto_session_logout(crypto_provider_t, crypto_session_id_t, + crypto_call_req_t *); + +/* Object Management */ +extern int crypto_object_copy(crypto_provider_t, crypto_session_id_t, + crypto_object_id_t, crypto_object_attribute_t *, uint_t, + crypto_object_id_t *, crypto_call_req_t *); +extern int crypto_object_create(crypto_provider_t, crypto_session_id_t, + crypto_object_attribute_t *, uint_t, crypto_object_id_t *, + crypto_call_req_t *); +extern int crypto_object_destroy(crypto_provider_t, crypto_session_id_t, + crypto_object_id_t, crypto_call_req_t *); +extern int crypto_object_get_attribute_value(crypto_provider_t, + crypto_session_id_t, crypto_object_id_t, crypto_object_attribute_t *, + uint_t, crypto_call_req_t *); +extern int crypto_object_get_size(crypto_provider_t, crypto_session_id_t, + crypto_object_id_t, size_t *, crypto_call_req_t *); +extern int crypto_object_find_final(crypto_provider_t, void *, + crypto_call_req_t *); +extern int crypto_object_find_init(crypto_provider_t, crypto_session_id_t, + 
crypto_object_attribute_t *, uint_t, void **, crypto_call_req_t *); +extern int crypto_object_find(crypto_provider_t, void *, crypto_object_id_t *, + uint_t *, uint_t, crypto_call_req_t *); +extern int crypto_object_set_attribute_value(crypto_provider_t, + crypto_session_id_t, crypto_object_id_t, crypto_object_attribute_t *, + uint_t, crypto_call_req_t *); + +/* Key Management */ +extern int crypto_key_derive(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *, + uint_t, crypto_object_id_t *, crypto_call_req_t *); +extern int crypto_key_generate(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_object_attribute_t *, uint_t, + crypto_object_id_t *, crypto_call_req_t *); +extern int crypto_key_generate_pair(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_object_attribute_t *, uint_t, + crypto_object_attribute_t *, uint_t, crypto_object_id_t *, + crypto_object_id_t *, crypto_call_req_t *); +extern int crypto_key_unwrap(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, uchar_t *, size_t *, + crypto_object_attribute_t *, uint_t, crypto_object_id_t *, + crypto_call_req_t *); +extern int crypto_key_wrap(crypto_provider_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_object_id_t *, uchar_t *, + size_t *, crypto_call_req_t *); +extern int crypto_key_check_prov(crypto_provider_t, crypto_mechanism_t *mech, + crypto_key_t *key); +extern int crypto_key_check(crypto_mechanism_t *mech, crypto_key_t *key); + + +/* + * Routines to cancel a single asynchronous request or all asynchronous + * requests associated with a particular context. + */ +extern void crypto_cancel_req(crypto_req_id_t req); +extern void crypto_cancel_ctx(crypto_context_t ctx); + +/* + * crypto_get_mech_list(9F) allocates and returns the list of currently + * supported cryptographic mechanisms. + */ +extern crypto_mech_name_t *crypto_get_mech_list(uint_t *count, int kmflag); +extern void crypto_free_mech_list(crypto_mech_name_t *mech_names, + uint_t count); + +extern crypto_provider_t crypto_get_provider(char *, char *, char *); +extern int crypto_get_provinfo(crypto_provider_t, crypto_provider_ext_info_t *); +extern void crypto_release_provider(crypto_provider_t); + +/* + * A kernel consumer can request to be notified when some particular event + * occurs. The valid events, callback function type, and functions to + * be called to register or unregister for notification are defined below. + */ + +#define CRYPTO_EVENT_MECHS_CHANGED 0x00000001 +#define CRYPTO_EVENT_PROVIDER_REGISTERED 0x00000002 +#define CRYPTO_EVENT_PROVIDER_UNREGISTERED 0x00000004 + +typedef enum { + CRYPTO_MECH_ADDED = 1, + CRYPTO_MECH_REMOVED +} crypto_event_change_t; + +/* The event_arg argument structure for CRYPTO_EVENT_PROVIDERS_CHANGE event */ +typedef struct crypto_notify_event_change { + crypto_mech_name_t ec_mech_name; + crypto_provider_type_t ec_provider_type; + crypto_event_change_t ec_change; +} crypto_notify_event_change_t; + +typedef void *crypto_notify_handle_t; +typedef void (*crypto_notify_callback_t)(uint32_t event_mask, void *event_arg); + +extern crypto_notify_handle_t crypto_notify_events( + crypto_notify_callback_t nf, uint32_t event_mask); +extern void crypto_unnotify_events(crypto_notify_handle_t); + +/* + * crypto_bufcall(9F) group of routines. 
+ */ +extern crypto_bc_t crypto_bufcall_alloc(void); +extern int crypto_bufcall_free(crypto_bc_t bc); +extern int crypto_bufcall(crypto_bc_t bc, void (*func)(void *arg), void *arg); +extern int crypto_unbufcall(crypto_bc_t bc); + +/* + * To obtain the list of key size ranges supported by a mechanism. + */ + +#define CRYPTO_MECH_USAGE_ENCRYPT 0x00000001 +#define CRYPTO_MECH_USAGE_DECRYPT 0x00000002 +#define CRYPTO_MECH_USAGE_MAC 0x00000004 + +typedef uint32_t crypto_mech_usage_t; + +typedef struct crypto_mechanism_info { + size_t mi_min_key_size; + size_t mi_max_key_size; + crypto_keysize_unit_t mi_keysize_unit; /* for mi_xxx_key_size */ + crypto_mech_usage_t mi_usage; +} crypto_mechanism_info_t; + +#ifdef _SYSCALL32 + +typedef struct crypto_mechanism_info32 { + size32_t mi_min_key_size; + size32_t mi_max_key_size; + crypto_keysize_unit_t mi_keysize_unit; /* for mi_xxx_key_size */ + crypto_mech_usage_t mi_usage; +} crypto_mechanism_info32_t; + +#endif /* _SYSCALL32 */ + +extern int crypto_get_all_mech_info(crypto_mech_type_t, + crypto_mechanism_info_t **, uint_t *, int); +extern void crypto_free_all_mech_info(crypto_mechanism_info_t *, uint_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_API_H */ diff --git a/include/sys/crypto/common.h b/include/sys/crypto/common.h new file mode 100644 index 000000000000..a4f9d9848c23 --- /dev/null +++ b/include/sys/crypto/common.h @@ -0,0 +1,583 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +#ifndef _SYS_CRYPTO_COMMON_H +#define _SYS_CRYPTO_COMMON_H + +/* + * Header file for the common data structures of the cryptographic framework + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* Cryptographic Mechanisms */ + +#define CRYPTO_MAX_MECH_NAME 32 +typedef char crypto_mech_name_t[CRYPTO_MAX_MECH_NAME]; + +typedef uint64_t crypto_mech_type_t; + +typedef struct crypto_mechanism { + crypto_mech_type_t cm_type; /* mechanism type */ + caddr_t cm_param; /* mech. parameter */ + size_t cm_param_len; /* mech. parameter len */ +} crypto_mechanism_t; + +#ifdef _SYSCALL32 + +typedef struct crypto_mechanism32 { + crypto_mech_type_t cm_type; /* mechanism type */ + caddr32_t cm_param; /* mech. parameter */ + size32_t cm_param_len; /* mech. 
parameter len */
+} crypto_mechanism32_t;
+
+#endif	/* _SYSCALL32 */
+
+/* CK_AES_CTR_PARAMS provides parameters to the CKM_AES_CTR mechanism */
+typedef struct CK_AES_CTR_PARAMS {
+	ulong_t	ulCounterBits;
+	uint8_t	cb[16];
+} CK_AES_CTR_PARAMS;
+
+/* CK_AES_CCM_PARAMS provides parameters to the CKM_AES_CCM mechanism */
+typedef struct CK_AES_CCM_PARAMS {
+	ulong_t	ulMACSize;
+	ulong_t	ulNonceSize;
+	ulong_t	ulAuthDataSize;
+	ulong_t	ulDataSize;	/* used for plaintext or ciphertext */
+	uchar_t	*nonce;
+	uchar_t	*authData;
+} CK_AES_CCM_PARAMS;
+
+/* CK_AES_GCM_PARAMS provides parameters to the CKM_AES_GCM mechanism */
+typedef struct CK_AES_GCM_PARAMS {
+	uchar_t	*pIv;
+	ulong_t	ulIvLen;
+	ulong_t	ulIvBits;
+	uchar_t	*pAAD;
+	ulong_t	ulAADLen;
+	ulong_t	ulTagBits;
+} CK_AES_GCM_PARAMS;
+
+/* CK_AES_GMAC_PARAMS provides parameters to the CKM_AES_GMAC mechanism */
+typedef struct CK_AES_GMAC_PARAMS {
+	uchar_t	*pIv;
+	uchar_t	*pAAD;
+	ulong_t	ulAADLen;
+} CK_AES_GMAC_PARAMS;
+
+/*
+ * CK_ECDH1_DERIVE_PARAMS provides the parameters to the
+ * CKM_ECDH1_KEY_DERIVE mechanism
+ */
+typedef struct CK_ECDH1_DERIVE_PARAMS {
+	ulong_t	kdf;
+	ulong_t	ulSharedDataLen;
+	uchar_t	*pSharedData;
+	ulong_t	ulPublicDataLen;
+	uchar_t	*pPublicData;
+} CK_ECDH1_DERIVE_PARAMS;
+
+#ifdef _SYSCALL32
+
+/* needed for 32-bit applications running on 64-bit kernels */
+typedef struct CK_AES_CTR_PARAMS32 {
+	uint32_t	ulCounterBits;
+	uint8_t		cb[16];
+} CK_AES_CTR_PARAMS32;
+
+/* needed for 32-bit applications running on 64-bit kernels */
+typedef struct CK_AES_CCM_PARAMS32 {
+	uint32_t	ulMACSize;
+	uint32_t	ulNonceSize;
+	uint32_t	ulAuthDataSize;
+	uint32_t	ulDataSize;
+	caddr32_t	nonce;
+	caddr32_t	authData;
+} CK_AES_CCM_PARAMS32;
+
+/* needed for 32-bit applications running on 64-bit kernels */
+typedef struct CK_AES_GCM_PARAMS32 {
+	caddr32_t	pIv;
+	uint32_t	ulIvLen;
+	uint32_t	ulIvBits;
+	caddr32_t	pAAD;
+	uint32_t	ulAADLen;
+	uint32_t	ulTagBits;
+} CK_AES_GCM_PARAMS32;
+
+/* needed for 32-bit applications running on 64-bit kernels */
+typedef struct CK_AES_GMAC_PARAMS32 {
+	caddr32_t	pIv;
+	caddr32_t	pAAD;
+	uint32_t	ulAADLen;
+} CK_AES_GMAC_PARAMS32;
+
+typedef struct CK_ECDH1_DERIVE_PARAMS32 {
+	uint32_t	kdf;
+	uint32_t	ulSharedDataLen;
+	caddr32_t	pSharedData;
+	uint32_t	ulPublicDataLen;
+	caddr32_t	pPublicData;
+} CK_ECDH1_DERIVE_PARAMS32;
+
+#endif	/* _SYSCALL32 */
+
+/*
+ * The measurement unit bit flag for a mechanism's minimum or maximum key
+ * size. The units are mechanism dependent; they can be in bits or in bytes.
+ */
+typedef uint32_t crypto_keysize_unit_t;
+
+/*
+ * The following bit flags are valid in the cm_mech_flags field in
+ * the crypto_mech_info_t structure of the SPI.
+ *
+ * Only the first two bit flags are valid in the mi_keysize_unit
+ * field in the crypto_mechanism_info_t structure of the API.
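+ *
+ * (Editor's illustrative note: a mechanism reporting mi_min_key_size = 16
+ * with CRYPTO_KEYSIZE_UNIT_IN_BYTES accepts keys of at least 16 bytes,
+ * i.e. 128 bits.)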
+ */ +#define CRYPTO_KEYSIZE_UNIT_IN_BITS 0x00000001 +#define CRYPTO_KEYSIZE_UNIT_IN_BYTES 0x00000002 +#define CRYPTO_CAN_SHARE_OPSTATE 0x00000004 /* supports sharing */ + + +/* Mechanisms supported out-of-the-box */ +#define SUN_CKM_MD4 "CKM_MD4" +#define SUN_CKM_MD5 "CKM_MD5" +#define SUN_CKM_MD5_HMAC "CKM_MD5_HMAC" +#define SUN_CKM_MD5_HMAC_GENERAL "CKM_MD5_HMAC_GENERAL" +#define SUN_CKM_SHA1 "CKM_SHA_1" +#define SUN_CKM_SHA1_HMAC "CKM_SHA_1_HMAC" +#define SUN_CKM_SHA1_HMAC_GENERAL "CKM_SHA_1_HMAC_GENERAL" +#define SUN_CKM_SHA256 "CKM_SHA256" +#define SUN_CKM_SHA256_HMAC "CKM_SHA256_HMAC" +#define SUN_CKM_SHA256_HMAC_GENERAL "CKM_SHA256_HMAC_GENERAL" +#define SUN_CKM_SHA384 "CKM_SHA384" +#define SUN_CKM_SHA384_HMAC "CKM_SHA384_HMAC" +#define SUN_CKM_SHA384_HMAC_GENERAL "CKM_SHA384_HMAC_GENERAL" +#define SUN_CKM_SHA512 "CKM_SHA512" +#define SUN_CKM_SHA512_HMAC "CKM_SHA512_HMAC" +#define SUN_CKM_SHA512_HMAC_GENERAL "CKM_SHA512_HMAC_GENERAL" +#define SUN_CKM_SHA512_224 "CKM_SHA512_224" +#define SUN_CKM_SHA512_256 "CKM_SHA512_256" +#define SUN_CKM_DES_CBC "CKM_DES_CBC" +#define SUN_CKM_DES3_CBC "CKM_DES3_CBC" +#define SUN_CKM_DES_ECB "CKM_DES_ECB" +#define SUN_CKM_DES3_ECB "CKM_DES3_ECB" +#define SUN_CKM_BLOWFISH_CBC "CKM_BLOWFISH_CBC" +#define SUN_CKM_BLOWFISH_ECB "CKM_BLOWFISH_ECB" +#define SUN_CKM_AES_CBC "CKM_AES_CBC" +#define SUN_CKM_AES_ECB "CKM_AES_ECB" +#define SUN_CKM_AES_CTR "CKM_AES_CTR" +#define SUN_CKM_AES_CCM "CKM_AES_CCM" +#define SUN_CKM_AES_GCM "CKM_AES_GCM" +#define SUN_CKM_AES_GMAC "CKM_AES_GMAC" +#define SUN_CKM_AES_CFB128 "CKM_AES_CFB128" +#define SUN_CKM_RC4 "CKM_RC4" +#define SUN_CKM_RSA_PKCS "CKM_RSA_PKCS" +#define SUN_CKM_RSA_X_509 "CKM_RSA_X_509" +#define SUN_CKM_MD5_RSA_PKCS "CKM_MD5_RSA_PKCS" +#define SUN_CKM_SHA1_RSA_PKCS "CKM_SHA1_RSA_PKCS" +#define SUN_CKM_SHA256_RSA_PKCS "CKM_SHA256_RSA_PKCS" +#define SUN_CKM_SHA384_RSA_PKCS "CKM_SHA384_RSA_PKCS" +#define SUN_CKM_SHA512_RSA_PKCS "CKM_SHA512_RSA_PKCS" +#define SUN_CKM_EC_KEY_PAIR_GEN "CKM_EC_KEY_PAIR_GEN" +#define SUN_CKM_ECDH1_DERIVE "CKM_ECDH1_DERIVE" +#define SUN_CKM_ECDSA_SHA1 "CKM_ECDSA_SHA1" +#define SUN_CKM_ECDSA "CKM_ECDSA" + +/* Shared operation context format for CKM_RC4 */ +typedef struct { +#if defined(__amd64) + uint32_t i, j; + uint32_t arr[256]; + uint32_t flag; +#else + uchar_t arr[256]; + uchar_t i, j; +#endif /* __amd64 */ + uint64_t pad; /* For 64-bit alignment */ +} arcfour_state_t; + +/* Data arguments of cryptographic operations */ + +typedef enum crypto_data_format { + CRYPTO_DATA_RAW = 1, + CRYPTO_DATA_UIO, +} crypto_data_format_t; + +typedef struct crypto_data { + crypto_data_format_t cd_format; /* Format identifier */ + off_t cd_offset; /* Offset from the beginning */ + size_t cd_length; /* # of bytes in use */ + caddr_t cd_miscdata; /* ancillary data */ + union { + /* Raw format */ + iovec_t cdu_raw; /* Pointer and length */ + + /* uio scatter-gather format */ + uio_t *cdu_uio; + + } cdu; /* Crypto Data Union */ +} crypto_data_t; + +#define cd_raw cdu.cdu_raw +#define cd_uio cdu.cdu_uio +#define cd_mp cdu.cdu_mp + +typedef struct crypto_dual_data { + crypto_data_t dd_data; /* The data */ + off_t dd_offset2; /* Used by dual operation */ + size_t dd_len2; /* # of bytes to take */ +} crypto_dual_data_t; + +#define dd_format dd_data.cd_format +#define dd_offset1 dd_data.cd_offset +#define dd_len1 dd_data.cd_length +#define dd_miscdata dd_data.cd_miscdata +#define dd_raw dd_data.cd_raw +#define dd_uio dd_data.cd_uio +#define dd_mp dd_data.cd_mp + +/* The keys, and their contents */ + 
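+/*
+ * EXAMPLE (editor's hedged sketch, not part of the original header):
+ * describing a raw 256-bit key with the ck_data/ck_length shorthands
+ * defined below; note that raw key lengths are expressed in bits.
+ *
+ *	uint8_t keybuf[32];
+ *	crypto_key_t key = { 0 };
+ *	key.ck_format = CRYPTO_KEY_RAW;
+ *	key.ck_data = keybuf;
+ *	key.ck_length = CRYPTO_BYTES2BITS(sizeof (keybuf));
+ */
+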
+typedef enum { + CRYPTO_KEY_RAW = 1, /* ck_data is a cleartext key */ + CRYPTO_KEY_REFERENCE, /* ck_obj_id is an opaque reference */ + CRYPTO_KEY_ATTR_LIST /* ck_attrs is a list of object attributes */ +} crypto_key_format_t; + +typedef uint64_t crypto_attr_type_t; + +/* Attribute types to use for passing a RSA public key or a private key. */ +#define SUN_CKA_MODULUS 0x00000120 +#define SUN_CKA_MODULUS_BITS 0x00000121 +#define SUN_CKA_PUBLIC_EXPONENT 0x00000122 +#define SUN_CKA_PRIVATE_EXPONENT 0x00000123 +#define SUN_CKA_PRIME_1 0x00000124 +#define SUN_CKA_PRIME_2 0x00000125 +#define SUN_CKA_EXPONENT_1 0x00000126 +#define SUN_CKA_EXPONENT_2 0x00000127 +#define SUN_CKA_COEFFICIENT 0x00000128 +#define SUN_CKA_PRIME 0x00000130 +#define SUN_CKA_SUBPRIME 0x00000131 +#define SUN_CKA_BASE 0x00000132 + +#define CKK_EC 0x00000003 +#define CKK_GENERIC_SECRET 0x00000010 +#define CKK_RC4 0x00000012 +#define CKK_AES 0x0000001F +#define CKK_DES 0x00000013 +#define CKK_DES2 0x00000014 +#define CKK_DES3 0x00000015 + +#define CKO_PUBLIC_KEY 0x00000002 +#define CKO_PRIVATE_KEY 0x00000003 +#define CKA_CLASS 0x00000000 +#define CKA_VALUE 0x00000011 +#define CKA_KEY_TYPE 0x00000100 +#define CKA_VALUE_LEN 0x00000161 +#define CKA_EC_PARAMS 0x00000180 +#define CKA_EC_POINT 0x00000181 + +typedef uint32_t crypto_object_id_t; + +typedef struct crypto_object_attribute { + crypto_attr_type_t oa_type; /* attribute type */ + caddr_t oa_value; /* attribute value */ + ssize_t oa_value_len; /* length of attribute value */ +} crypto_object_attribute_t; + +typedef struct crypto_key { + crypto_key_format_t ck_format; /* format identifier */ + union { + /* for CRYPTO_KEY_RAW ck_format */ + struct { + uint_t cku_v_length; /* # of bits in ck_data */ + void *cku_v_data; /* ptr to key value */ + } cku_key_value; + + /* for CRYPTO_KEY_REFERENCE ck_format */ + crypto_object_id_t cku_key_id; /* reference to object key */ + + /* for CRYPTO_KEY_ATTR_LIST ck_format */ + struct { + uint_t cku_a_count; /* number of attributes */ + crypto_object_attribute_t *cku_a_oattr; + } cku_key_attrs; + } cku_data; /* Crypto Key union */ +} crypto_key_t; + +#ifdef _SYSCALL32 + +typedef struct crypto_object_attribute32 { + uint64_t oa_type; /* attribute type */ + caddr32_t oa_value; /* attribute value */ + ssize32_t oa_value_len; /* length of attribute value */ +} crypto_object_attribute32_t; + +typedef struct crypto_key32 { + crypto_key_format_t ck_format; /* format identifier */ + union { + /* for CRYPTO_KEY_RAW ck_format */ + struct { + uint32_t cku_v_length; /* # of bytes in ck_data */ + caddr32_t cku_v_data; /* ptr to key value */ + } cku_key_value; + + /* for CRYPTO_KEY_REFERENCE ck_format */ + crypto_object_id_t cku_key_id; /* reference to object key */ + + /* for CRYPTO_KEY_ATTR_LIST ck_format */ + struct { + uint32_t cku_a_count; /* number of attributes */ + caddr32_t cku_a_oattr; + } cku_key_attrs; + } cku_data; /* Crypto Key union */ +} crypto_key32_t; + +#endif /* _SYSCALL32 */ + +#define ck_data cku_data.cku_key_value.cku_v_data +#define ck_length cku_data.cku_key_value.cku_v_length +#define ck_obj_id cku_data.cku_key_id +#define ck_count cku_data.cku_key_attrs.cku_a_count +#define ck_attrs cku_data.cku_key_attrs.cku_a_oattr + +/* + * Raw key lengths are expressed in number of bits. + * The following macro returns the minimum number of + * bytes that can contain the specified number of bits. + * Round up without overflowing the integer type. + */ +#define CRYPTO_BITS2BYTES(n) ((n) == 0 ? 
0 : (((n) - 1) >> 3) + 1) +#define CRYPTO_BYTES2BITS(n) ((n) << 3) + +/* Providers */ + +typedef enum { + CRYPTO_HW_PROVIDER = 0, + CRYPTO_SW_PROVIDER, + CRYPTO_LOGICAL_PROVIDER +} crypto_provider_type_t; + +typedef uint32_t crypto_provider_id_t; +#define KCF_PROVID_INVALID ((uint32_t)-1) + +typedef struct crypto_provider_entry { + crypto_provider_id_t pe_provider_id; + uint_t pe_mechanism_count; +} crypto_provider_entry_t; + +typedef struct crypto_dev_list_entry { + char le_dev_name[MAXNAMELEN]; + uint_t le_dev_instance; + uint_t le_mechanism_count; +} crypto_dev_list_entry_t; + +/* User type for authentication ioctls and SPI entry points */ + +typedef enum crypto_user_type { + CRYPTO_SO = 0, + CRYPTO_USER +} crypto_user_type_t; + +/* Version for provider management ioctls and SPI entry points */ + +typedef struct crypto_version { + uchar_t cv_major; + uchar_t cv_minor; +} crypto_version_t; + +/* session data structure opaque to the consumer */ +typedef void *crypto_session_t; + +/* provider data structure opaque to the consumer */ +typedef void *crypto_provider_t; + +/* Limits used by both consumers and providers */ +#define CRYPTO_EXT_SIZE_LABEL 32 +#define CRYPTO_EXT_SIZE_MANUF 32 +#define CRYPTO_EXT_SIZE_MODEL 16 +#define CRYPTO_EXT_SIZE_SERIAL 16 +#define CRYPTO_EXT_SIZE_TIME 16 + +typedef struct crypto_provider_ext_info { + uchar_t ei_label[CRYPTO_EXT_SIZE_LABEL]; + uchar_t ei_manufacturerID[CRYPTO_EXT_SIZE_MANUF]; + uchar_t ei_model[CRYPTO_EXT_SIZE_MODEL]; + uchar_t ei_serial_number[CRYPTO_EXT_SIZE_SERIAL]; + ulong_t ei_flags; + ulong_t ei_max_session_count; + ulong_t ei_max_pin_len; + ulong_t ei_min_pin_len; + ulong_t ei_total_public_memory; + ulong_t ei_free_public_memory; + ulong_t ei_total_private_memory; + ulong_t ei_free_private_memory; + crypto_version_t ei_hardware_version; + crypto_version_t ei_firmware_version; + uchar_t ei_time[CRYPTO_EXT_SIZE_TIME]; + int ei_hash_max_input_len; + int ei_hmac_max_input_len; +} crypto_provider_ext_info_t; + +typedef uint_t crypto_session_id_t; + +typedef enum cmd_type { + COPY_FROM_DATA, + COPY_TO_DATA, + COMPARE_TO_DATA, + MD5_DIGEST_DATA, + SHA1_DIGEST_DATA, + SHA2_DIGEST_DATA, + GHASH_DATA +} cmd_type_t; + +#define CRYPTO_DO_UPDATE 0x01 +#define CRYPTO_DO_FINAL 0x02 +#define CRYPTO_DO_MD5 0x04 +#define CRYPTO_DO_SHA1 0x08 +#define CRYPTO_DO_SIGN 0x10 +#define CRYPTO_DO_VERIFY 0x20 +#define CRYPTO_DO_SHA2 0x40 + +#define PROVIDER_OWNS_KEY_SCHEDULE 0x00000001 + +/* + * Common cryptographic status and error codes. 
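+ *
+ * (Editor's note: CRYPTO_SUCCESS is zero, so callers conventionally test
+ * for failure with 'if (rv != CRYPTO_SUCCESS)'.)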
+ */ +#define CRYPTO_SUCCESS 0x00000000 +#define CRYPTO_CANCEL 0x00000001 +#define CRYPTO_HOST_MEMORY 0x00000002 +#define CRYPTO_GENERAL_ERROR 0x00000003 +#define CRYPTO_FAILED 0x00000004 +#define CRYPTO_ARGUMENTS_BAD 0x00000005 +#define CRYPTO_ATTRIBUTE_READ_ONLY 0x00000006 +#define CRYPTO_ATTRIBUTE_SENSITIVE 0x00000007 +#define CRYPTO_ATTRIBUTE_TYPE_INVALID 0x00000008 +#define CRYPTO_ATTRIBUTE_VALUE_INVALID 0x00000009 +#define CRYPTO_CANCELED 0x0000000A +#define CRYPTO_DATA_INVALID 0x0000000B +#define CRYPTO_DATA_LEN_RANGE 0x0000000C +#define CRYPTO_DEVICE_ERROR 0x0000000D +#define CRYPTO_DEVICE_MEMORY 0x0000000E +#define CRYPTO_DEVICE_REMOVED 0x0000000F +#define CRYPTO_ENCRYPTED_DATA_INVALID 0x00000010 +#define CRYPTO_ENCRYPTED_DATA_LEN_RANGE 0x00000011 +#define CRYPTO_KEY_HANDLE_INVALID 0x00000012 +#define CRYPTO_KEY_SIZE_RANGE 0x00000013 +#define CRYPTO_KEY_TYPE_INCONSISTENT 0x00000014 +#define CRYPTO_KEY_NOT_NEEDED 0x00000015 +#define CRYPTO_KEY_CHANGED 0x00000016 +#define CRYPTO_KEY_NEEDED 0x00000017 +#define CRYPTO_KEY_INDIGESTIBLE 0x00000018 +#define CRYPTO_KEY_FUNCTION_NOT_PERMITTED 0x00000019 +#define CRYPTO_KEY_NOT_WRAPPABLE 0x0000001A +#define CRYPTO_KEY_UNEXTRACTABLE 0x0000001B +#define CRYPTO_MECHANISM_INVALID 0x0000001C +#define CRYPTO_MECHANISM_PARAM_INVALID 0x0000001D +#define CRYPTO_OBJECT_HANDLE_INVALID 0x0000001E +#define CRYPTO_OPERATION_IS_ACTIVE 0x0000001F +#define CRYPTO_OPERATION_NOT_INITIALIZED 0x00000020 +#define CRYPTO_PIN_INCORRECT 0x00000021 +#define CRYPTO_PIN_INVALID 0x00000022 +#define CRYPTO_PIN_LEN_RANGE 0x00000023 +#define CRYPTO_PIN_EXPIRED 0x00000024 +#define CRYPTO_PIN_LOCKED 0x00000025 +#define CRYPTO_SESSION_CLOSED 0x00000026 +#define CRYPTO_SESSION_COUNT 0x00000027 +#define CRYPTO_SESSION_HANDLE_INVALID 0x00000028 +#define CRYPTO_SESSION_READ_ONLY 0x00000029 +#define CRYPTO_SESSION_EXISTS 0x0000002A +#define CRYPTO_SESSION_READ_ONLY_EXISTS 0x0000002B +#define CRYPTO_SESSION_READ_WRITE_SO_EXISTS 0x0000002C +#define CRYPTO_SIGNATURE_INVALID 0x0000002D +#define CRYPTO_SIGNATURE_LEN_RANGE 0x0000002E +#define CRYPTO_TEMPLATE_INCOMPLETE 0x0000002F +#define CRYPTO_TEMPLATE_INCONSISTENT 0x00000030 +#define CRYPTO_UNWRAPPING_KEY_HANDLE_INVALID 0x00000031 +#define CRYPTO_UNWRAPPING_KEY_SIZE_RANGE 0x00000032 +#define CRYPTO_UNWRAPPING_KEY_TYPE_INCONSISTENT 0x00000033 +#define CRYPTO_USER_ALREADY_LOGGED_IN 0x00000034 +#define CRYPTO_USER_NOT_LOGGED_IN 0x00000035 +#define CRYPTO_USER_PIN_NOT_INITIALIZED 0x00000036 +#define CRYPTO_USER_TYPE_INVALID 0x00000037 +#define CRYPTO_USER_ANOTHER_ALREADY_LOGGED_IN 0x00000038 +#define CRYPTO_USER_TOO_MANY_TYPES 0x00000039 +#define CRYPTO_WRAPPED_KEY_INVALID 0x0000003A +#define CRYPTO_WRAPPED_KEY_LEN_RANGE 0x0000003B +#define CRYPTO_WRAPPING_KEY_HANDLE_INVALID 0x0000003C +#define CRYPTO_WRAPPING_KEY_SIZE_RANGE 0x0000003D +#define CRYPTO_WRAPPING_KEY_TYPE_INCONSISTENT 0x0000003E +#define CRYPTO_RANDOM_SEED_NOT_SUPPORTED 0x0000003F +#define CRYPTO_RANDOM_NO_RNG 0x00000040 +#define CRYPTO_DOMAIN_PARAMS_INVALID 0x00000041 +#define CRYPTO_BUFFER_TOO_SMALL 0x00000042 +#define CRYPTO_INFORMATION_SENSITIVE 0x00000043 +#define CRYPTO_NOT_SUPPORTED 0x00000044 + +#define CRYPTO_QUEUED 0x00000045 +#define CRYPTO_BUFFER_TOO_BIG 0x00000046 +#define CRYPTO_INVALID_CONTEXT 0x00000047 +#define CRYPTO_INVALID_MAC 0x00000048 +#define CRYPTO_MECH_NOT_SUPPORTED 0x00000049 +#define CRYPTO_INCONSISTENT_ATTRIBUTE 0x0000004A +#define CRYPTO_NO_PERMISSION 0x0000004B +#define CRYPTO_INVALID_PROVIDER_ID 0x0000004C +#define CRYPTO_VERSION_MISMATCH 
0x0000004D
+#define CRYPTO_BUSY 0x0000004E
+#define CRYPTO_UNKNOWN_PROVIDER 0x0000004F
+#define CRYPTO_MODVERIFICATION_FAILED 0x00000050
+#define CRYPTO_OLD_CTX_TEMPLATE 0x00000051
+#define CRYPTO_WEAK_KEY 0x00000052
+#define CRYPTO_FIPS140_ERROR 0x00000053
+/*
+ * Don't forget to update CRYPTO_LAST_ERROR and the error_number_table[]
+ * in kernelUtil.c when a new error code is added.
+ */
+#define CRYPTO_LAST_ERROR 0x00000053
+
+/*
+ * Special values that can be used to indicate that information is unavailable
+ * or that there is no practical limit.  These values can be used
+ * by fields of the SPI crypto_provider_ext_info(9S) structure.
+ * The value of CRYPTO_UNAVAILABLE_INFO should be the same as
+ * CK_UNAVAILABLE_INFO in the PKCS#11 spec.
+ */
+#define CRYPTO_UNAVAILABLE_INFO ((ulong_t)(-1))
+#define CRYPTO_EFFECTIVELY_INFINITE 0x0
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_CRYPTO_COMMON_H */
diff --git a/include/sys/crypto/icp.h b/include/sys/crypto/icp.h
new file mode 100644
index 000000000000..4609e3a1dae7
--- /dev/null
+++ b/include/sys/crypto/icp.h
@@ -0,0 +1,50 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2016, Datto, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_CRYPTO_ALGS_H
+#define _SYS_CRYPTO_ALGS_H
+
+int aes_mod_init(void);
+int aes_mod_fini(void);
+
+int edonr_mod_init(void);
+int edonr_mod_fini(void);
+
+int sha1_mod_init(void);
+int sha1_mod_fini(void);
+
+int sha2_mod_init(void);
+int sha2_mod_fini(void);
+
+int skein_mod_init(void);
+int skein_mod_fini(void);
+
+int icp_init(void);
+void icp_fini(void);
+
+int aes_impl_set(const char *);
+int gcm_impl_set(const char *);
+
+#endif /* _SYS_CRYPTO_ALGS_H */
diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h
new file mode 100644
index 000000000000..667d1b85fa2c
--- /dev/null
+++ b/include/sys/dataset_kstats.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2018 Datto Inc. + */ + +#ifndef _SYS_DATASET_KSTATS_H +#define _SYS_DATASET_KSTATS_H + +#include +#include +#include + +typedef struct dataset_aggsum_stats_t { + aggsum_t das_writes; + aggsum_t das_nwritten; + aggsum_t das_reads; + aggsum_t das_nread; + aggsum_t das_nunlinks; + aggsum_t das_nunlinked; +} dataset_aggsum_stats_t; + +typedef struct dataset_kstat_values { + kstat_named_t dkv_ds_name; + kstat_named_t dkv_writes; + kstat_named_t dkv_nwritten; + kstat_named_t dkv_reads; + kstat_named_t dkv_nread; + /* + * nunlinks is initialized to the unlinked set size on mount and + * is incremented whenever a new entry is added to the unlinked set + */ + kstat_named_t dkv_nunlinks; + /* + * nunlinked is initialized to zero on mount and is incremented when an + * entry is removed from the unlinked set + */ + kstat_named_t dkv_nunlinked; +} dataset_kstat_values_t; + +typedef struct dataset_kstats { + dataset_aggsum_stats_t dk_aggsums; + kstat_t *dk_kstats; +} dataset_kstats_t; + +void dataset_kstats_create(dataset_kstats_t *, objset_t *); +void dataset_kstats_destroy(dataset_kstats_t *); + +void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); +void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); + +void dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *, int64_t); +void dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *, int64_t); + +#endif /* _SYS_DATASET_KSTATS_H */ diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h new file mode 100644 index 000000000000..04338b2c491b --- /dev/null +++ b/include/sys/dbuf.h @@ -0,0 +1,472 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
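+ */
+
+/*
+ * Editor's sketch (not part of the original sources): typical use of the
+ * dataset_kstats interface declared in dataset_kstats.h above, assuming
+ * the int64_t argument of the update functions is the byte count of the
+ * completed operation:
+ *
+ *	dataset_kstats_t dk;
+ *	dataset_kstats_create(&dk, os);
+ *	...
+ *	dataset_kstats_update_write_kstats(&dk, nwritten);
+ *	dataset_kstats_update_read_kstats(&dk, nread);
+ *	...
+ *	dataset_kstats_destroy(&dk);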
+ */
+
+#ifndef _SYS_DBUF_H
+#define _SYS_DBUF_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define IN_DMU_SYNC 2
+
+/*
+ * define flags for dbuf_read
+ */
+
+#define DB_RF_MUST_SUCCEED (1 << 0)
+#define DB_RF_CANFAIL (1 << 1)
+#define DB_RF_HAVESTRUCT (1 << 2)
+#define DB_RF_NOPREFETCH (1 << 3)
+#define DB_RF_NEVERWAIT (1 << 4)
+#define DB_RF_CACHED (1 << 5)
+#define DB_RF_NO_DECRYPT (1 << 6)
+
+/*
+ * The simplified state transition diagram for dbufs looks like:
+ *
+ *              +----> READ ----+
+ *              |               |
+ *              |               V
+ *  (alloc)-->UNCACHED       CACHED-->EVICTING-->(free)
+ *              |               ^        ^
+ *              |               |        |
+ *              +----> FILL ----+        |
+ *              |                        |
+ *              |                        |
+ *              +--------> NOFILL -------+
+ *
+ * DB_SEARCH is an invalid state for a dbuf. It is used by dbuf_free_range
+ * to find all dbufs in a range of a dnode and must be less than any other
+ * dbuf_states_t (see comment on dn_dbufs in dnode.h).
+ */
+typedef enum dbuf_states {
+	DB_SEARCH = -1,
+	DB_UNCACHED,
+	DB_FILL,
+	DB_NOFILL,
+	DB_READ,
+	DB_CACHED,
+	DB_EVICTING
+} dbuf_states_t;
+
+typedef enum dbuf_cached_state {
+	DB_NO_CACHE = -1,
+	DB_DBUF_CACHE,
+	DB_DBUF_METADATA_CACHE,
+	DB_CACHE_MAX
+} dbuf_cached_state_t;
+
+struct dnode;
+struct dmu_tx;
+
+/*
+ * level = 0 means the user data
+ * level = 1 means the single indirect block
+ * etc.
+ */
+
+struct dmu_buf_impl;
+
+typedef enum override_states {
+	DR_NOT_OVERRIDDEN,
+	DR_IN_DMU_SYNC,
+	DR_OVERRIDDEN
+} override_states_t;
+
+typedef enum db_lock_type {
+	DLT_NONE,
+	DLT_PARENT,
+	DLT_OBJSET
+} db_lock_type_t;
+
+typedef struct dbuf_dirty_record {
+	/* link on our parent's dirty list */
+	list_node_t dr_dirty_node;
+
+	/* transaction group this data will sync in */
+	uint64_t dr_txg;
+
+	/* zio of outstanding write IO */
+	zio_t *dr_zio;
+
+	/* pointer back to our dbuf */
+	struct dmu_buf_impl *dr_dbuf;
+
+	/* list link for dbuf dirty records */
+	list_node_t dr_dbuf_node;
+
+	/* How much space was charged to dsl_pool_dirty_space() for this? */
+	unsigned int dr_accounted;
+
+	/* pointer to parent dirty record */
+	struct dbuf_dirty_record *dr_parent;
+
+	/* A copy of the bp that points to us */
+	blkptr_t dr_bp_copy;
+
+	union dirty_types {
+		struct dirty_indirect {
+
+			/* protect access to list */
+			kmutex_t dr_mtx;
+
+			/* Our list of dirty children */
+			list_t dr_children;
+		} di;
+		struct dirty_leaf {
+
+			/*
+			 * dr_data is set when we dirty the buffer
+			 * so that we can retain the pointer even if it
+			 * gets COW'd in a subsequent transaction group.
+			 */
+			arc_buf_t *dr_data;
+			blkptr_t dr_overridden_by;
+			override_states_t dr_override_state;
+			uint8_t dr_copies;
+			boolean_t dr_nopwrite;
+			boolean_t dr_has_raw_params;
+
+			/*
+			 * If dr_has_raw_params is set, the following crypt
+			 * params will be set on the BP that's written.
+			 */
+			boolean_t dr_byteorder;
+			uint8_t dr_salt[ZIO_DATA_SALT_LEN];
+			uint8_t dr_iv[ZIO_DATA_IV_LEN];
+			uint8_t dr_mac[ZIO_DATA_MAC_LEN];
+		} dl;
+	} dt;
+} dbuf_dirty_record_t;
+
+typedef struct dmu_buf_impl {
+	/*
+	 * The following members are immutable, with the exception of
+	 * db.db_data, which is protected by db_mtx.
+	 */
+
+	/* the publicly visible structure */
+	dmu_buf_t db;
+
+	/* the objset we belong to */
+	struct objset *db_objset;
+
+	/*
+	 * handle to safely access the dnode we belong to (NULL when evicted)
+	 */
+	struct dnode_handle *db_dnode_handle;
+
+	/*
+	 * our parent buffer; if the dnode points to us directly,
+	 * db_parent == db_dnode_handle->dnh_dnode->dn_dbuf
+	 * only accessed by sync thread ???
+	 * (NULL when evicted)
+	 * May change from NULL to non-NULL under the protection of db_mtx
+	 * (see dbuf_check_blkptr())
+	 */
+	struct dmu_buf_impl *db_parent;
+
+	/*
+	 * link for hash table of all dmu_buf_impl_t's
+	 */
+	struct dmu_buf_impl *db_hash_next;
+
+	/*
+	 * Our link on the owner dnode's dn_dbufs list.
+	 * Protected by its dn_dbufs_mtx.  Should be on the same cache line
+	 * as db_level and db_blkid for the best avl_add() performance.
+	 */
+	avl_node_t db_link;
+
+	/* our block number */
+	uint64_t db_blkid;
+
+	/*
+	 * Pointer to the blkptr_t which points to us. May be NULL if we
+	 * don't have one yet. (NULL when evicted)
+	 */
+	blkptr_t *db_blkptr;
+
+	/*
+	 * Our indirection level.  Data buffers have db_level==0.
+	 * Indirect buffers which point to data buffers have
+	 * db_level==1, etc.  Buffers which contain dnodes have
+	 * db_level==0, since the dnodes are stored in a file.
+	 */
+	uint8_t db_level;
+
+	/*
+	 * Protects db_buf's contents if they contain an indirect block or data
+	 * block of the meta-dnode. We use this lock to protect the structure of
+	 * the block tree. This means that when modifying this dbuf's data, we
+	 * grab its rwlock. When modifying its parent's data (including the
+	 * blkptr to this dbuf), we grab the parent's rwlock. The lock ordering
+	 * for this lock is:
+	 * 1) dn_struct_rwlock
+	 * 2) db_rwlock
+	 * We don't currently grab multiple dbufs' db_rwlocks at once.
+	 */
+	krwlock_t db_rwlock;
+
+	/* buffer holding our data */
+	arc_buf_t *db_buf;
+
+	/* db_mtx protects the members below */
+	kmutex_t db_mtx;
+
+	/*
+	 * Current state of the buffer
+	 */
+	dbuf_states_t db_state;
+
+	/*
+	 * Refcount accessed by dmu_buf_{hold,rele}.
+	 * If nonzero, the buffer can't be destroyed.
+	 * Protected by db_mtx.
+	 */
+	zfs_refcount_t db_holds;
+
+	kcondvar_t db_changed;
+	dbuf_dirty_record_t *db_data_pending;
+
+	/* List of dirty records for the buffer sorted newest to oldest. */
+	list_t db_dirty_records;
+
+	/* Link in dbuf_cache or dbuf_metadata_cache */
+	multilist_node_t db_cache_link;
+
+	/* Tells us which dbuf cache this dbuf is in, if any */
+	dbuf_cached_state_t db_caching_status;
+
+	/* Data which is unique to data (leaf) blocks: */
+
+	/* User callback information. */
+	dmu_buf_user_t *db_user;
+
+	/*
+	 * Evict user data as soon as the dirty and reference
+	 * counts are equal.
+	 */
+	uint8_t db_user_immediate_evict;
+
+	/*
+	 * This block was freed while a read or write was
+	 * active.
+	 */
+	uint8_t db_freed_in_flight;
+
+	/*
+	 * dnode_evict_dbufs() or dnode_evict_bonus() tried to
+	 * evict this dbuf, but couldn't due to outstanding
+	 * references.  Evict once the refcount drops to 0.
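+	 */
+
+	/*
+	 * Editor's sketch (not part of the original sources): the db_rwlock
+	 * ordering documented above, for code that modifies this dbuf's
+	 * data while also holding the dnode's structure lock:
+	 *
+	 *	rw_enter(&dn->dn_struct_rwlock, RW_READER);	<- lock 1
+	 *	rw_enter(&db->db_rwlock, RW_WRITER);		<- lock 2
+	 *	... modify the contents of db->db_buf ...
+	 *	rw_exit(&db->db_rwlock);
+	 *	rw_exit(&dn->dn_struct_rwlock);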
+ */ + uint8_t db_pending_evict; + + uint8_t db_dirtycnt; +} dmu_buf_impl_t; + +/* Note: the dbuf hash table is exposed only for the mdb module */ +#define DBUF_MUTEXES 8192 +#define DBUF_HASH_MUTEX(h, idx) (&(h)->hash_mutexes[(idx) & (DBUF_MUTEXES-1)]) +typedef struct dbuf_hash_table { + uint64_t hash_table_mask; + dmu_buf_impl_t **hash_table; + kmutex_t hash_mutexes[DBUF_MUTEXES]; +} dbuf_hash_table_t; + +uint64_t dbuf_whichblock(const struct dnode *di, const int64_t level, + const uint64_t offset); + +void dbuf_create_bonus(struct dnode *dn); +int dbuf_spill_set_blksz(dmu_buf_t *db, uint64_t blksz, dmu_tx_t *tx); + +void dbuf_rm_spill(struct dnode *dn, dmu_tx_t *tx); + +dmu_buf_impl_t *dbuf_hold(struct dnode *dn, uint64_t blkid, void *tag); +dmu_buf_impl_t *dbuf_hold_level(struct dnode *dn, int level, uint64_t blkid, + void *tag); +int dbuf_hold_impl(struct dnode *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp); + +void dbuf_prefetch(struct dnode *dn, int64_t level, uint64_t blkid, + zio_priority_t prio, arc_flags_t aflags); + +void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); +boolean_t dbuf_try_add_ref(dmu_buf_t *db, objset_t *os, uint64_t obj, + uint64_t blkid, void *tag); +uint64_t dbuf_refcount(dmu_buf_impl_t *db); + +void dbuf_rele(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting); + +dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, + uint64_t blkid); + +int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); +void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); +dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); +void dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, dmu_tx_t *tx); + +void dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx); +void dbuf_destroy(dmu_buf_impl_t *db); + +void dbuf_unoverride(dbuf_dirty_record_t *dr); +void dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx); +void dbuf_release_bp(dmu_buf_impl_t *db); +db_lock_type_t dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag); +void dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag); + +void dbuf_free_range(struct dnode *dn, uint64_t start, uint64_t end, + struct dmu_tx *); + +void dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx); + +void dbuf_stats_init(dbuf_hash_table_t *hash); +void dbuf_stats_destroy(void); + +int dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, + blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift); + +#define DB_DNODE(_db) ((_db)->db_dnode_handle->dnh_dnode) +#define DB_DNODE_LOCK(_db) ((_db)->db_dnode_handle->dnh_zrlock) +#define DB_DNODE_ENTER(_db) (zrl_add(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_EXIT(_db) (zrl_remove(&DB_DNODE_LOCK(_db))) +#define DB_DNODE_HELD(_db) (!zrl_is_zero(&DB_DNODE_LOCK(_db))) + +void dbuf_init(void); +void dbuf_fini(void); + +boolean_t dbuf_is_metadata(dmu_buf_impl_t *db); + +static inline dbuf_dirty_record_t * +dbuf_find_dirty_lte(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr; + + for (dr = list_head(&db->db_dirty_records); + dr != NULL && dr->dr_txg > txg; 
+ dr = list_next(&db->db_dirty_records, dr)) + continue; + return (dr); +} + +static inline dbuf_dirty_record_t * +dbuf_find_dirty_eq(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr; + + dr = dbuf_find_dirty_lte(db, txg); + if (dr && dr->dr_txg == txg) + return (dr); + return (NULL); +} + +#define DBUF_GET_BUFC_TYPE(_db) \ + (dbuf_is_metadata(_db) ? ARC_BUFC_METADATA : ARC_BUFC_DATA) + +#define DBUF_IS_CACHEABLE(_db) \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (dbuf_is_metadata(_db) && \ + ((_db)->db_objset->os_primary_cache == ZFS_CACHE_METADATA))) + +#define DBUF_IS_L2CACHEABLE(_db) \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (dbuf_is_metadata(_db) && \ + ((_db)->db_objset->os_secondary_cache == ZFS_CACHE_METADATA))) + +#define DNODE_LEVEL_IS_L2CACHEABLE(_dn, _level) \ + ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_ALL || \ + (((_level) > 0 || \ + DMU_OT_IS_METADATA((_dn)->dn_handle->dnh_dnode->dn_type)) && \ + ((_dn)->dn_objset->os_secondary_cache == ZFS_CACHE_METADATA))) + +#ifdef ZFS_DEBUG + +/* + * There should be a ## between the string literal and fmt, to make it + * clear that we're joining two strings together, but gcc does not + * support that preprocessor token. + */ +#define dprintf_dbuf(dbuf, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dbuf)->db.db_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj); \ + dprintf_ds((dbuf)->db_objset->os_dsl_dataset, \ + "obj=%s lvl=%u blkid=%lld " fmt, \ + __db_buf, (dbuf)->db_level, \ + (u_longlong_t)(dbuf)->db_blkid, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define dprintf_dbuf_bp(db, bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DBUF_VERIFY(db) dbuf_verify(db) + +#else + +#define dprintf_dbuf(db, fmt, ...) +#define dprintf_dbuf_bp(db, bp, fmt, ...) +#define DBUF_VERIFY(db) + +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DBUF_H */ diff --git a/include/sys/ddt.h b/include/sys/ddt.h new file mode 100644 index 000000000000..25be6f56dddc --- /dev/null +++ b/include/sys/ddt.h @@ -0,0 +1,257 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_DDT_H +#define _SYS_DDT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd; + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ +enum ddt_type { + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). + */ +enum ddt_class { + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). + */ +typedef struct ddt_key { + zio_cksum_t ddk_cksum; /* 256-bit block checksum */ + /* + * Encoded with logical & physical size, encryption, and compression, + * as follows: + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 |X| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ + uint64_t ddk_prop; +} ddt_key_t; + +#define DDK_GET_LSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_LSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_PSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_PSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 7) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 7, x) + +#define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) +#define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +#define DDE_GET_NDVAS(dde) (DDK_GET_CRYPT(&dde->dde_key) \ + ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP) + +typedef struct ddt_phys { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; +} ddt_phys_t; + +/* + * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, + * we maintain the ability to free existing dedup-ditto blocks. + */ +enum ddt_phys_type { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_TYPES +}; + +/* + * In-core ddt entry + */ +struct ddt_entry { + ddt_key_t dde_key; + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + struct abd *dde_repair_abd; + enum ddt_type dde_type; + enum ddt_class dde_class; + uint8_t dde_loading; + uint8_t dde_loaded; + kcondvar_t dde_cv; + avl_node_t dde_node; +}; + +/* + * In-core ddt + */ +struct ddt { + kmutex_t ddt_lock; + avl_tree_t ddt_tree; + avl_tree_t ddt_repair_tree; + enum zio_checksum ddt_checksum; + spa_t *ddt_spa; + objset_t *ddt_os; + uint64_t ddt_stat_object; + uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; + ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; + avl_node_t ddt_node; +}; + +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. 
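+ */
+
+/*
+ * Editor's note (illustrative, not part of the original header): assuming
+ * the usual shift/bias encoding of BF64_GET_SB()/BF64_SET_SB() and
+ * SPA_MINBLOCKSHIFT == 9, the DDK_*SIZE accessors above store a size in
+ * 16 bits as (size >> SPA_MINBLOCKSHIFT) - 1, for example:
+ *
+ *	DDK_SET_LSIZE(ddk, 131072);	stores 255 in bits 0..15
+ *	DDK_GET_LSIZE(ddk);		returns (255 + 1) << 9 == 131072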
+ */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + void (*ddt_op_prefetch)(objset_t *os, uint64_t object, + ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); +} ddt_ops_t; + +#define DDT_NAMELEN 107 + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde); +extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, uint64_t *count); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, dmu_object_info_t *); +extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz); + +extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, + uint64_t txg); +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_phys_t *ddp, blkptr_t *bp); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); +extern void ddt_phys_clear(ddt_phys_t *ddp); +extern void ddt_phys_addref(ddt_phys_t *ddp); +extern void ddt_phys_decref(ddt_phys_t *ddp); +extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, + uint64_t txg); +extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); +extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); +extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); +extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); + +extern uint64_t ddt_get_dedup_dspace(spa_t *spa); +extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); + +extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); +extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); + +extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern void ddt_enter(ddt_t *ddt); +extern void ddt_exit(ddt_t *ddt); +extern void ddt_init(void); +extern void ddt_fini(void); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); +extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); + +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + +extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); +extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); + +extern int ddt_entry_compare(const void *x1, const void *x2); + +extern void ddt_create(spa_t *spa); +extern int ddt_load(spa_t *spa); +extern void ddt_unload(spa_t 
*spa); +extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); +extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx); + +extern const ddt_ops_t ddt_zap_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/include/sys/dmu.h b/include/sys/dmu.h new file mode 100644 index 000000000000..d6efe2595be0 --- /dev/null +++ b/include/sys/dmu.h @@ -0,0 +1,1090 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_DMU_H +#define _SYS_DMU_H + +/* + * This file describes the interface that the DMU provides for its + * consumers. + * + * The DMU also interacts with the SPA. That interface is described in + * dmu_spa.h. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct page; +struct vnode; +struct spa; +struct zilog; +struct zio; +struct blkptr; +struct zap_cursor; +struct dsl_dataset; +struct dsl_pool; +struct dnode; +struct drr_begin; +struct drr_end; +struct zbookmark_phys; +struct spa; +struct nvlist; +struct arc_buf; +struct zio_prop; +struct sa_handle; +struct dsl_crypto_params; +struct locked_range; + +typedef struct objset objset_t; +typedef struct dmu_tx dmu_tx_t; +typedef struct dsl_dir dsl_dir_t; +typedef struct dnode dnode_t; + +typedef enum dmu_object_byteswap { + DMU_BSWAP_UINT8, + DMU_BSWAP_UINT16, + DMU_BSWAP_UINT32, + DMU_BSWAP_UINT64, + DMU_BSWAP_ZAP, + DMU_BSWAP_DNODE, + DMU_BSWAP_OBJSET, + DMU_BSWAP_ZNODE, + DMU_BSWAP_OLDACL, + DMU_BSWAP_ACL, + /* + * Allocating a new byteswap type number makes the on-disk format + * incompatible with any other format that uses the same number. + * + * Data can usually be structured to work with one of the + * DMU_BSWAP_UINT* or DMU_BSWAP_ZAP types. + */ + DMU_BSWAP_NUMFUNCS +} dmu_object_byteswap_t; + +#define DMU_OT_NEWTYPE 0x80 +#define DMU_OT_METADATA 0x40 +#define DMU_OT_ENCRYPTED 0x20 +#define DMU_OT_BYTESWAP_MASK 0x1f + +/* + * Defines a uint8_t object type. 
Object types specify if the data + * in the object is metadata (boolean) and how to byteswap the data + * (dmu_object_byteswap_t). All of the types created by this method + * are cached in the dbuf metadata cache. + */ +#define DMU_OT(byteswap, metadata, encrypted) \ + (DMU_OT_NEWTYPE | \ + ((metadata) ? DMU_OT_METADATA : 0) | \ + ((encrypted) ? DMU_OT_ENCRYPTED : 0) | \ + ((byteswap) & DMU_OT_BYTESWAP_MASK)) + +#define DMU_OT_IS_VALID(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS : \ + (ot) < DMU_OT_NUMTYPES) + +#define DMU_OT_IS_METADATA_CACHED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + B_TRUE : dmu_ot[(ot)].ot_dbuf_metadata_cache) + +/* + * MDB doesn't have dmu_ot; it defines these macros itself. + */ +#ifndef ZFS_MDB +#define DMU_OT_IS_METADATA_IMPL(ot) (dmu_ot[ot].ot_metadata) +#define DMU_OT_IS_ENCRYPTED_IMPL(ot) (dmu_ot[ot].ot_encrypt) +#define DMU_OT_BYTESWAP_IMPL(ot) (dmu_ot[ot].ot_byteswap) +#endif + +#define DMU_OT_IS_METADATA(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_METADATA) : \ + DMU_OT_IS_METADATA_IMPL(ot)) + +#define DMU_OT_IS_DDT(ot) \ + ((ot) == DMU_OT_DDT_ZAP) + +#define DMU_OT_IS_ZIL(ot) \ + ((ot) == DMU_OT_INTENT_LOG) + +/* Note: ztest uses DMU_OT_UINT64_OTHER as a proxy for file blocks */ +#define DMU_OT_IS_FILE(ot) \ + ((ot) == DMU_OT_PLAIN_FILE_CONTENTS || (ot) == DMU_OT_UINT64_OTHER) + +#define DMU_OT_IS_ENCRYPTED(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_ENCRYPTED) : \ + DMU_OT_IS_ENCRYPTED_IMPL(ot)) + +/* + * These object types use bp_fill != 1 for their L0 bp's. Therefore they can't + * have their data embedded (i.e. use a BP_IS_EMBEDDED() bp), because bp_fill + * is repurposed for embedded BPs. + */ +#define DMU_OT_HAS_FILL(ot) \ + ((ot) == DMU_OT_DNODE || (ot) == DMU_OT_OBJSET) + +#define DMU_OT_BYTESWAP(ot) (((ot) & DMU_OT_NEWTYPE) ? \ + ((ot) & DMU_OT_BYTESWAP_MASK) : \ + DMU_OT_BYTESWAP_IMPL(ot)) + +typedef enum dmu_object_type { + DMU_OT_NONE, + /* general: */ + DMU_OT_OBJECT_DIRECTORY, /* ZAP */ + DMU_OT_OBJECT_ARRAY, /* UINT64 */ + DMU_OT_PACKED_NVLIST, /* UINT8 (XDR by nvlist_pack/unpack) */ + DMU_OT_PACKED_NVLIST_SIZE, /* UINT64 */ + DMU_OT_BPOBJ, /* UINT64 */ + DMU_OT_BPOBJ_HDR, /* UINT64 */ + /* spa: */ + DMU_OT_SPACE_MAP_HEADER, /* UINT64 */ + DMU_OT_SPACE_MAP, /* UINT64 */ + /* zil: */ + DMU_OT_INTENT_LOG, /* UINT64 */ + /* dmu: */ + DMU_OT_DNODE, /* DNODE */ + DMU_OT_OBJSET, /* OBJSET */ + /* dsl: */ + DMU_OT_DSL_DIR, /* UINT64 */ + DMU_OT_DSL_DIR_CHILD_MAP, /* ZAP */ + DMU_OT_DSL_DS_SNAP_MAP, /* ZAP */ + DMU_OT_DSL_PROPS, /* ZAP */ + DMU_OT_DSL_DATASET, /* UINT64 */ + /* zpl: */ + DMU_OT_ZNODE, /* ZNODE */ + DMU_OT_OLDACL, /* Old ACL */ + DMU_OT_PLAIN_FILE_CONTENTS, /* UINT8 */ + DMU_OT_DIRECTORY_CONTENTS, /* ZAP */ + DMU_OT_MASTER_NODE, /* ZAP */ + DMU_OT_UNLINKED_SET, /* ZAP */ + /* zvol: */ + DMU_OT_ZVOL, /* UINT8 */ + DMU_OT_ZVOL_PROP, /* ZAP */ + /* other; for testing only! 
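+ */
+
+/*
+ * Editor's note (illustrative, not part of the original header): with the
+ * flag bits defined above and DMU_BSWAP_ZAP == 4, DMU_OT() composes, for
+ * example:
+ *
+ *	DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE)
+ *	    == 0x80 | 0x40 | 0x04 == 0xc4	(DMU_OTN_ZAP_METADATA)
+ *
+ * and DMU_OT_IS_METADATA(0xc4) is true because both DMU_OT_NEWTYPE and
+ * DMU_OT_METADATA are set.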
*/ + DMU_OT_PLAIN_OTHER, /* UINT8 */ + DMU_OT_UINT64_OTHER, /* UINT64 */ + DMU_OT_ZAP_OTHER, /* ZAP */ + /* new object types: */ + DMU_OT_ERROR_LOG, /* ZAP */ + DMU_OT_SPA_HISTORY, /* UINT8 */ + DMU_OT_SPA_HISTORY_OFFSETS, /* spa_his_phys_t */ + DMU_OT_POOL_PROPS, /* ZAP */ + DMU_OT_DSL_PERMS, /* ZAP */ + DMU_OT_ACL, /* ACL */ + DMU_OT_SYSACL, /* SYSACL */ + DMU_OT_FUID, /* FUID table (Packed NVLIST UINT8) */ + DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ + DMU_OT_NEXT_CLONES, /* ZAP */ + DMU_OT_SCAN_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ + DMU_OT_SA, /* System attr */ + DMU_OT_SA_MASTER_NODE, /* ZAP */ + DMU_OT_SA_ATTR_REGISTRATION, /* ZAP */ + DMU_OT_SA_ATTR_LAYOUTS, /* ZAP */ + DMU_OT_SCAN_XLATE, /* ZAP */ + DMU_OT_DEDUP, /* fake dedup BP from ddt_bp_create() */ + DMU_OT_DEADLIST, /* ZAP */ + DMU_OT_DEADLIST_HDR, /* UINT64 */ + DMU_OT_DSL_CLONES, /* ZAP */ + DMU_OT_BPOBJ_SUBOBJ, /* UINT64 */ + /* + * Do not allocate new object types here. Doing so makes the on-disk + * format incompatible with any other format that uses the same object + * type number. + * + * When creating an object which does not have one of the above types + * use the DMU_OTN_* type with the correct byteswap and metadata + * values. + * + * The DMU_OTN_* types do not have entries in the dmu_ot table, + * use the DMU_OT_IS_METADATA() and DMU_OT_BYTESWAP() macros instead + * of indexing into dmu_ot directly (this works for both DMU_OT_* types + * and DMU_OTN_* types). + */ + DMU_OT_NUMTYPES, + + /* + * Names for valid types declared with DMU_OT(). + */ + DMU_OTN_UINT8_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_FALSE), + DMU_OTN_UINT8_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_FALSE), + DMU_OTN_UINT16_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_FALSE), + DMU_OTN_UINT16_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_FALSE), + DMU_OTN_UINT32_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_FALSE), + DMU_OTN_UINT32_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_FALSE), + DMU_OTN_UINT64_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_FALSE), + DMU_OTN_UINT64_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_FALSE), + DMU_OTN_ZAP_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_FALSE), + DMU_OTN_ZAP_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_FALSE), + + DMU_OTN_UINT8_ENC_DATA = DMU_OT(DMU_BSWAP_UINT8, B_FALSE, B_TRUE), + DMU_OTN_UINT8_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT8, B_TRUE, B_TRUE), + DMU_OTN_UINT16_ENC_DATA = DMU_OT(DMU_BSWAP_UINT16, B_FALSE, B_TRUE), + DMU_OTN_UINT16_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT16, B_TRUE, B_TRUE), + DMU_OTN_UINT32_ENC_DATA = DMU_OT(DMU_BSWAP_UINT32, B_FALSE, B_TRUE), + DMU_OTN_UINT32_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT32, B_TRUE, B_TRUE), + DMU_OTN_UINT64_ENC_DATA = DMU_OT(DMU_BSWAP_UINT64, B_FALSE, B_TRUE), + DMU_OTN_UINT64_ENC_METADATA = DMU_OT(DMU_BSWAP_UINT64, B_TRUE, B_TRUE), + DMU_OTN_ZAP_ENC_DATA = DMU_OT(DMU_BSWAP_ZAP, B_FALSE, B_TRUE), + DMU_OTN_ZAP_ENC_METADATA = DMU_OT(DMU_BSWAP_ZAP, B_TRUE, B_TRUE), +} dmu_object_type_t; + +/* + * These flags are intended to be used to specify the "txg_how" + * parameter when calling the dmu_tx_assign() function. See the comment + * above dmu_tx_assign() for more details on the meaning of these flags. 
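+ */
+
+/*
+ * Editor's sketch (not part of the original sources): the common
+ * assign-and-commit pattern for these flags, using the dmu_tx_*()
+ * routines declared later in this file:
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, off, len);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	if (error != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (error);
+ *	}
+ *	dmu_write(os, object, off, len, buf, tx);
+ *	dmu_tx_commit(tx);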
+ */ +#define TXG_NOWAIT (0ULL) +#define TXG_WAIT (1ULL<<0) +#define TXG_NOTHROTTLE (1ULL<<1) + +void byteswap_uint64_array(void *buf, size_t size); +void byteswap_uint32_array(void *buf, size_t size); +void byteswap_uint16_array(void *buf, size_t size); +void byteswap_uint8_array(void *buf, size_t size); +void zap_byteswap(void *buf, size_t size); +void zfs_oldacl_byteswap(void *buf, size_t size); +void zfs_acl_byteswap(void *buf, size_t size); +void zfs_znode_byteswap(void *buf, size_t size); + +#define DS_FIND_SNAPSHOTS (1<<0) +#define DS_FIND_CHILDREN (1<<1) +#define DS_FIND_SERIALIZE (1<<2) + +/* + * The maximum number of bytes that can be accessed as part of one + * operation, including metadata. + */ +#define DMU_MAX_ACCESS (64 * 1024 * 1024) /* 64MB */ +#define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ + +#define DMU_USERUSED_OBJECT (-1ULL) +#define DMU_GROUPUSED_OBJECT (-2ULL) +#define DMU_PROJECTUSED_OBJECT (-3ULL) + +/* + * Zap prefix for object accounting in DMU_{USER,GROUP,PROJECT}USED_OBJECT. + */ +#define DMU_OBJACCT_PREFIX "obj-" +#define DMU_OBJACCT_PREFIX_LEN 4 + +/* + * artificial blkids for bonus buffer and spill blocks + */ +#define DMU_BONUS_BLKID (-1ULL) +#define DMU_SPILL_BLKID (-2ULL) + +/* + * Public routines to create, destroy, open, and close objsets. + */ +typedef void dmu_objset_create_sync_func_t(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, boolean_t key_required, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, boolean_t key_required, void *tag); +int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); + +void dmu_objset_evict_dbufs(objset_t *os); +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, + struct dsl_crypto_params *dcp, dmu_objset_create_sync_func_t func, + void *arg); +int dmu_objset_clone(const char *name, const char *origin); +int dsl_destroy_snapshots_nvl(struct nvlist *snaps, boolean_t defer, + struct nvlist *errlist); +int dmu_objset_snapshot_one(const char *fsname, const char *snapname); +int dmu_objset_snapshot_tmp(const char *, const char *, int); +int dmu_objset_find(const char *name, int func(const char *, void *), void *arg, + int flags); +void dmu_objset_byteswap(void *buf, size_t size); +int dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive); + +typedef struct dmu_buf { + uint64_t db_object; /* object that this buffer is part of */ + uint64_t db_offset; /* byte offset in this object */ + uint64_t db_size; /* size of buffer in bytes */ + void *db_data; /* data in buffer */ +} dmu_buf_t; + +/* + * The names of zap entries in the DIRECTORY_OBJECT of the MOS. 
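+ */
+
+/*
+ * Editor's sketch (not part of the original sources): holding and
+ * releasing an objset by name with the routines above.  FTAG is the
+ * conventional hold tag used throughout the ZFS kernel code:
+ *
+ *	objset_t *os;
+ *	error = dmu_objset_hold("pool/dataset", FTAG, &os);
+ *	if (error == 0) {
+ *		... read-only access to os ...
+ *		dmu_objset_rele(os, FTAG);
+ *	}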
+ */ +#define DMU_POOL_DIRECTORY_OBJECT 1 +#define DMU_POOL_CONFIG "config" +#define DMU_POOL_FEATURES_FOR_WRITE "features_for_write" +#define DMU_POOL_FEATURES_FOR_READ "features_for_read" +#define DMU_POOL_FEATURE_DESCRIPTIONS "feature_descriptions" +#define DMU_POOL_FEATURE_ENABLED_TXG "feature_enabled_txg" +#define DMU_POOL_ROOT_DATASET "root_dataset" +#define DMU_POOL_SYNC_BPOBJ "sync_bplist" +#define DMU_POOL_ERRLOG_SCRUB "errlog_scrub" +#define DMU_POOL_ERRLOG_LAST "errlog_last" +#define DMU_POOL_SPARES "spares" +#define DMU_POOL_DEFLATE "deflate" +#define DMU_POOL_HISTORY "history" +#define DMU_POOL_PROPS "pool_props" +#define DMU_POOL_L2CACHE "l2cache" +#define DMU_POOL_TMP_USERREFS "tmp_userrefs" +#define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_STATS "DDT-statistics" +#define DMU_POOL_CREATION_VERSION "creation_version" +#define DMU_POOL_SCAN "scan" +#define DMU_POOL_FREE_BPOBJ "free_bpobj" +#define DMU_POOL_BPTREE_OBJ "bptree_obj" +#define DMU_POOL_EMPTY_BPOBJ "empty_bpobj" +#define DMU_POOL_CHECKSUM_SALT "org.illumos:checksum_salt" +#define DMU_POOL_VDEV_ZAP_MAP "com.delphix:vdev_zap_map" +#define DMU_POOL_REMOVING "com.delphix:removing" +#define DMU_POOL_OBSOLETE_BPOBJ "com.delphix:obsolete_bpobj" +#define DMU_POOL_CONDENSING_INDIRECT "com.delphix:condensing_indirect" +#define DMU_POOL_ZPOOL_CHECKPOINT "com.delphix:zpool_checkpoint" +#define DMU_POOL_LOG_SPACEMAP_ZAP "com.delphix:log_spacemap_zap" +#define DMU_POOL_DELETED_CLONES "com.delphix:deleted_clones" + +/* + * Allocate an object from this objset. The range of object numbers + * available is (0, DN_MAX_OBJECT). Object 0 is the meta-dnode. + * + * The transaction must be assigned to a txg. The newly allocated + * object will be "held" in the transaction (ie. you can modify the + * newly allocated object in this transaction). + * + * dmu_object_alloc() chooses an object and returns it in *objectp. + * + * dmu_object_claim() allocates a specific object number. If that + * number is already allocated, it fails and returns EEXIST. + * + * Return 0 on success, or ENOSPC or EEXIST as specified above. 
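+ */
+
+/*
+ * Editor's sketch (not part of the original sources): allocating an
+ * object with the default block size and no bonus buffer, inside an
+ * assigned transaction (DMU_NEW_OBJECT is defined later in this file):
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+ *	error = dmu_tx_assign(tx, TXG_WAIT);
+ *	...
+ *	object = dmu_object_alloc(os, DMU_OTN_UINT64_DATA, 0,
+ *	    DMU_OT_NONE, 0, tx);
+ *	dmu_tx_commit(tx);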
+ */
+uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize,
+    int indirect_blockshift,
+    dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+    int dnodesize, dmu_tx_t *tx);
+uint64_t dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot,
+    int blocksize, int indirect_blockshift, dmu_object_type_t bonustype,
+    int bonuslen, int dnodesize, dnode_t **allocated_dnode, void *tag,
+    dmu_tx_t *tx);
+int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx);
+int dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonus_type, int bonus_len,
+    int dnodesize, dmu_tx_t *tx);
+int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot,
+    int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *txp);
+int dmu_object_reclaim_dnsize(objset_t *os, uint64_t object,
+    dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype,
+    int bonuslen, int dnodesize, boolean_t keep_spill, dmu_tx_t *tx);
+int dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Free an object from this objset.
+ *
+ * The object's data will be freed as well (ie. you don't need to call
+ * dmu_free(object, 0, -1, tx)).
+ *
+ * The object need not be held in the transaction.
+ *
+ * If there are any holds on this object's buffers (via dmu_buf_hold()),
+ * or tx holds on the object (via dmu_tx_hold_object()), you cannot
+ * free it; it fails and returns EBUSY.
+ *
+ * If the object is not allocated, it fails and returns ENOENT.
+ *
+ * Return 0 on success, or EBUSY or ENOENT as specified above.
+ */
+int dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx);
+
+/*
+ * Find the next allocated or free object.
+ *
+ * The objectp parameter is in-out. It will be updated to be the next
+ * object which is allocated. Ignore objects which have not been
+ * modified since txg.
+ *
+ * XXX Can only be called on an objset with no dirty data.
+ *
+ * Returns 0 on success, or ENOENT if there are no more objects.
+ */
+int dmu_object_next(objset_t *os, uint64_t *objectp,
+    boolean_t hole, uint64_t txg);
+
+/*
+ * Set the number of levels on a dnode. nlevels must be greater than the
+ * current number of levels or an EINVAL will be returned.
+ */
+int dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels,
+    dmu_tx_t *tx);
+
+/*
+ * Set the data blocksize for an object.
+ *
+ * The object cannot have any blocks allocated beyond the first. If
+ * the first block is allocated already, the new size must be greater
+ * than the current block size. If these conditions are not met,
+ * ENOTSUP will be returned.
+ *
+ * Returns 0 on success, or EBUSY if there are any holds on the object
+ * contents, or ENOTSUP as described above.
+ */
+int dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size,
+    int ibs, dmu_tx_t *tx);
+
+/*
+ * Manually set the maxblkid on a dnode. This will adjust nlevels accordingly
+ * to accommodate the change. When calling this function, the caller must
+ * ensure that the object's nlevels can sufficiently support the new maxblkid.
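+ */
+
+/*
+ * Editor's sketch (not part of the original sources): visiting every
+ * allocated object in an objset with dmu_object_next(), declared above:
+ *
+ *	uint64_t obj;
+ *	for (obj = 0; dmu_object_next(os, &obj, B_FALSE, 0) == 0; )
+ *		... obj is the next allocated object number ...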
+ */ +int dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, + dmu_tx_t *tx); + +/* + * Set the checksum property on a dnode. The new checksum algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx); + +/* + * Set the compress property on a dnode. The new compression algorithm will + * apply to all newly written blocks; existing blocks will not be affected. + */ +void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx); + +void dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx); +void dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); + +/* + * Decide how to write a block: checksum, compression, number of copies, etc. + */ +#define WP_NOFILL 0x1 +#define WP_DMU_SYNC 0x2 +#define WP_SPILL 0x4 + +void dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, + struct zio_prop *zp); + +/* + * The bonus data is accessed more or less like a regular buffer. + * You must dmu_bonus_hold() to get the buffer, which will give you a + * dmu_buf_t with db_offset==-1ULL, and db_size = the size of the bonus + * data. As with any normal buffer, you must call dmu_buf_will_dirty() + * before modifying it, and the + * object must be held in an assigned transaction before calling + * dmu_buf_will_dirty. You may use dmu_buf_set_user() on the bonus + * buffer as well. You must release what you hold with dmu_buf_rele(). + * + * Returns ENOENT, EIO, or 0. + */ +int dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp); +int dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp, + uint32_t flags); +int dmu_bonus_max(void); +int dmu_set_bonus(dmu_buf_t *, int, dmu_tx_t *); +int dmu_set_bonustype(dmu_buf_t *, dmu_object_type_t, dmu_tx_t *); +dmu_object_type_t dmu_get_bonustype(dmu_buf_t *); +int dmu_rm_spill(objset_t *, uint64_t, dmu_tx_t *); + +/* + * Special spill buffer support used by "SA" framework + */ + +int dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag, + dmu_buf_t **dbp); +int dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, + void *tag, dmu_buf_t **dbp); +int dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp); + +/* + * Obtain the DMU buffer from the specified object which contains the + * specified offset. dmu_buf_hold() puts a "hold" on the buffer, so + * that it will remain in memory. You must release the hold with + * dmu_buf_rele(). You must not access the dmu_buf_t after releasing + * what you hold. You must have a hold on any dmu_buf_t* you pass to the DMU. + * + * You must call dmu_buf_read, dmu_buf_will_dirty, or dmu_buf_will_fill + * on the returned buffer before reading or writing the buffer's + * db_data. The comments for those routines describe what particular + * operations are valid after calling them. + * + * The object number must be a valid, allocated object number. 
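+ */
+
+/*
+ * Editor's sketch (not part of the original sources): holding a buffer,
+ * reading its contents, and dropping the hold.  Per the rules above,
+ * dmu_buf_will_dirty() would be required before modifying db_data:
+ *
+ *	dmu_buf_t *db;
+ *	error = dmu_buf_hold(os, object, offset, FTAG, &db,
+ *	    DMU_READ_PREFETCH);
+ *	if (error == 0) {
+ *		... read db->db_data ...
+ *		dmu_buf_rele(db, FTAG);
+ *	}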
+ */ +int dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **, int flags); +int dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags); +int dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, + uint64_t length, boolean_t read, void *tag, int *numbufsp, + dmu_buf_t ***dbpp, uint32_t flags); +/* + * Add a reference to a dmu buffer that has already been held via + * dmu_buf_hold() in the current context. + */ +void dmu_buf_add_ref(dmu_buf_t *db, void* tag); + +/* + * Attempt to add a reference to a dmu buffer that is in an unknown state, + * using a pointer that may have been invalidated by eviction processing. + * The request will succeed if the passed in dbuf still represents the + * same os/object/blkid, is ineligible for eviction, and has at least + * one hold by a user other than the syncer. + */ +boolean_t dmu_buf_try_add_ref(dmu_buf_t *, objset_t *os, uint64_t object, + uint64_t blkid, void *tag); + +void dmu_buf_rele(dmu_buf_t *db, void *tag); +uint64_t dmu_buf_refcount(dmu_buf_t *db); +uint64_t dmu_buf_user_refcount(dmu_buf_t *db); + +/* + * dmu_buf_hold_array holds the DMU buffers which contain all bytes in a + * range of an object. A pointer to an array of dmu_buf_t*'s is + * returned (in *dbpp). + * + * dmu_buf_rele_array releases the hold on an array of dmu_buf_t*'s, and + * frees the array. The hold on the array of buffers MUST be released + * with dmu_buf_rele_array. You can NOT release the hold on each buffer + * individually with dmu_buf_rele. + */ +int dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, + uint64_t length, boolean_t read, void *tag, + int *numbufsp, dmu_buf_t ***dbpp); +void dmu_buf_rele_array(dmu_buf_t **, int numbufs, void *tag); + +typedef void dmu_buf_evict_func_t(void *user_ptr); + +/* + * A DMU buffer user object may be associated with a dbuf for the + * duration of its lifetime. This allows the user of a dbuf (client) + * to attach private data to a dbuf (e.g. in-core only data such as a + * dnode_children_t, zap_t, or zap_leaf_t) and be optionally notified + * when that dbuf has been evicted. Clients typically respond to the + * eviction notification by freeing their private data, thus ensuring + * the same lifetime for both dbuf and private data. + * + * The mapping from a dmu_buf_user_t to any client private data is the + * client's responsibility. All current consumers of the API with private + * data embed a dmu_buf_user_t as the first member of the structure for + * their private data. This allows conversions between the two types + * with a simple cast. Since the DMU buf user API never needs access + * to the private data, other strategies can be employed if necessary + * or convenient for the client (e.g. using container_of() to do the + * conversion for private data that cannot have the dmu_buf_user_t as + * its first member). + * + * Eviction callbacks are executed without the dbuf mutex held or any + * other type of mechanism to guarantee that the dbuf is still available. + * For this reason, users must assume the dbuf has already been freed + * and not reference the dbuf from the callback context. + * + * Users requesting "immediate eviction" are notified as soon as the dbuf + * is only referenced by dirty records (dirties == holds). Otherwise the + * notification occurs after eviction processing for the dbuf begins. + */ +typedef struct dmu_buf_user { + /* + * Asynchronous user eviction callback state. 
+ */ + taskq_ent_t dbu_tqent; + + /* + * This instance's eviction function pointers. + * + * dbu_evict_func_sync is called synchronously and then + * dbu_evict_func_async is executed asynchronously on a taskq. + */ + dmu_buf_evict_func_t *dbu_evict_func_sync; + dmu_buf_evict_func_t *dbu_evict_func_async; +#ifdef ZFS_DEBUG + /* + * Pointer to user's dbuf pointer. NULL for clients that do + * not associate a dbuf with their user data. + * + * The dbuf pointer is cleared upon eviction so as to catch + * use-after-evict bugs in clients. + */ + dmu_buf_t **dbu_clear_on_evict_dbufp; +#endif +} dmu_buf_user_t; + +/* + * Initialize the given dmu_buf_user_t instance with the eviction function + * evict_func, to be called when the user is evicted. + * + * NOTE: This function should only be called once on a given dmu_buf_user_t. + * To allow enforcement of this, dbu must already be zeroed on entry. + */ +/*ARGSUSED*/ +static inline void +dmu_buf_init_user(dmu_buf_user_t *dbu, dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, + dmu_buf_t **clear_on_evict_dbufp __maybe_unused) +{ + ASSERT(dbu->dbu_evict_func_sync == NULL); + ASSERT(dbu->dbu_evict_func_async == NULL); + + /* must have at least one evict func */ + IMPLY(evict_func_sync == NULL, evict_func_async != NULL); + dbu->dbu_evict_func_sync = evict_func_sync; + dbu->dbu_evict_func_async = evict_func_async; + taskq_init_ent(&dbu->dbu_tqent); +#ifdef ZFS_DEBUG + dbu->dbu_clear_on_evict_dbufp = clear_on_evict_dbufp; +#endif +} + +/* + * Attach user data to a dbuf and mark it for normal (when the dbuf's + * data is cleared or its reference count goes to zero) eviction processing. + * + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Attach user data to a dbuf and mark it for immediate (its dirty and + * reference counts are equal) eviction processing. + * + * Returns NULL on success, or the existing user if another user currently + * owns the buffer. + */ +void *dmu_buf_set_user_ie(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Replace the current user of a dbuf. + * + * If given the current user of a dbuf, replaces the dbuf's user with + * "new_user" and returns the user data pointer that was replaced. + * Otherwise returns the current, and unmodified, dbuf user pointer. + */ +void *dmu_buf_replace_user(dmu_buf_t *db, + dmu_buf_user_t *old_user, dmu_buf_user_t *new_user); + +/* + * Remove the specified user data for a DMU buffer. + * + * Returns the user that was removed on success, or the current user if + * another user currently owns the buffer. + */ +void *dmu_buf_remove_user(dmu_buf_t *db, dmu_buf_user_t *user); + +/* + * Returns the user data (dmu_buf_user_t *) associated with this dbuf. + */ +void *dmu_buf_get_user(dmu_buf_t *db); + +objset_t *dmu_buf_get_objset(dmu_buf_t *db); +dnode_t *dmu_buf_dnode_enter(dmu_buf_t *db); +void dmu_buf_dnode_exit(dmu_buf_t *db); + +/* Block until any in-progress dmu buf user evictions complete. */ +void dmu_buf_user_evict_wait(void); + +/* + * Returns the blkptr associated with this dbuf, or NULL if not set. + */ +struct blkptr *dmu_buf_get_blkptr(dmu_buf_t *db); + +/* + * Indicate that you are going to modify the buffer's data (db_data). + * + * The transaction (tx) must be assigned to a txg (ie. you've called + * dmu_tx_assign()). The buffer's object must be held in the tx + * (ie. you've called dmu_tx_hold_object(tx, db->db_object)). 
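+ */
+
+/*
+ * Editor's sketch (not part of the original sources): dirtying a held
+ * buffer inside an assigned transaction, per the rules above, before
+ * modifying its data:
+ *
+ *	dmu_buf_will_dirty(db, tx);
+ *	((uint64_t *)db->db_data)[idx] = value;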
+ */
+void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+boolean_t dmu_buf_is_dirty(dmu_buf_t *db, dmu_tx_t *tx);
+void dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder,
+    const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx);
+
+/*
+ * You must create a transaction, then hold the objects which you will
+ * (or might) modify as part of this transaction.  Then you must assign
+ * the transaction to a transaction group.  Once the transaction has
+ * been assigned, you can modify buffers which belong to held objects as
+ * part of this transaction.  You can't modify buffers before the
+ * transaction has been assigned; you can't modify buffers which don't
+ * belong to objects which this transaction holds; you can't hold
+ * objects once the transaction has been assigned.  You may hold an
+ * object which you are going to free (with dmu_object_free()), but you
+ * don't have to.
+ *
+ * You can abort the transaction before it has been assigned.
+ *
+ * Note that you may hold buffers (with dmu_buf_hold) at any time,
+ * regardless of transaction state.
+ */
+
+#define DMU_NEW_OBJECT (-1ULL)
+#define DMU_OBJECT_END (-1ULL)
+
+dmu_tx_t *dmu_tx_create(objset_t *os);
+void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len);
+void dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
+    int len);
+void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off,
+    uint64_t len);
+void dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
+    uint64_t len);
+void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name);
+void dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add,
+    const char *name);
+void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn);
+void dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object);
+void dmu_tx_hold_sa(dmu_tx_t *tx, struct sa_handle *hdl, boolean_t may_grow);
+void dmu_tx_hold_sa_create(dmu_tx_t *tx, int total_size);
+void dmu_tx_abort(dmu_tx_t *tx);
+int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how);
+void dmu_tx_wait(dmu_tx_t *tx);
+void dmu_tx_commit(dmu_tx_t *tx);
+void dmu_tx_mark_netfree(dmu_tx_t *tx);
+
+/*
+ * To register a commit callback, dmu_tx_callback_register() must be called.
+ *
+ * dcb_data is a pointer to caller private data that is passed on as a
+ * callback parameter.  The caller is responsible for properly allocating
+ * and freeing it.
+ *
+ * When registering a callback, the transaction must be already created, but
+ * it cannot be committed or aborted.  It can be assigned to a txg or not.
+ *
+ * The callback will be called after the transaction has been safely written
+ * to stable storage and will also be called if the dmu_tx is aborted.
+ * If there is any error which prevents the transaction from being committed to
+ * disk, the callback will be called with a value of error != 0.
+ *
+ * When multiple callbacks are registered to the transaction, the callbacks
+ * will be called in reverse order to let Lustre, currently the only user of
+ * commit callbacks, take the fast path of its commit callback handling.
+ */
+typedef void dmu_tx_callback_func_t(void *dcb_data, int error);
+
+void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func,
+    void *dcb_data);
+void dmu_tx_do_callbacks(list_t *cb_list, int error);
+
+/*
+ * Free up the data blocks for a defined range of a file.  If size is
+ * -1, the range from offset to end-of-file is freed.
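+ */
+
+/*
+ * Editor's sketch (not part of the original sources): registering a
+ * commit callback on a transaction; my_commit_cb is a hypothetical
+ * dmu_tx_callback_func_t:
+ *
+ *	static void
+ *	my_commit_cb(void *arg, int error)
+ *	{
+ *		-- error != 0 if the txg did not reach stable storage --
+ *	}
+ *
+ *	dmu_tx_callback_register(tx, my_commit_cb, arg);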
+ */ +int dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size, dmu_tx_t *tx); +int dmu_free_long_range(objset_t *os, uint64_t object, uint64_t offset, + uint64_t size); +int dmu_free_long_object(objset_t *os, uint64_t object); + +/* + * Convenience functions. + * + * Canfail routines will return 0 on success, or an errno if there is a + * nonrecoverable I/O error. + */ +#define DMU_READ_PREFETCH 0 /* prefetch */ +#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ +#define DMU_READ_NO_DECRYPT 2 /* don't decrypt */ +int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags); +int dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags); +void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +void dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx); +void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx); +#ifdef _KERNEL +int dmu_read_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size); +int dmu_read_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size); +int dmu_read_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size); +int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_uio_dbuf(dmu_buf_t *zdb, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +int dmu_write_uio_dnode(dnode_t *dn, struct uio *uio, uint64_t size, + dmu_tx_t *tx); +#endif +struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); +void dmu_return_arcbuf(struct arc_buf *buf); +int dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); +int dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, + struct arc_buf *buf, dmu_tx_t *tx); +#define dmu_assign_arcbuf dmu_assign_arcbuf_by_dbuf +#ifdef HAVE_UIO_ZEROCOPY +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +#endif /* HAVE_UIO_ZEROCOPY */ +void xuio_stat_wbuf_copied(void); +void xuio_stat_wbuf_nocopy(void); + +extern int zfs_prefetch_disable; +extern int zfs_max_recordsize; + +/* + * Asynchronously try to read in the data. + */ +void dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset, + uint64_t len, enum zio_priority pri); + +typedef struct dmu_object_info { + /* All sizes are in bytes unless otherwise indicated. 
*/ + uint32_t doi_data_block_size; + uint32_t doi_metadata_block_size; + dmu_object_type_t doi_type; + dmu_object_type_t doi_bonus_type; + uint64_t doi_bonus_size; + uint8_t doi_indirection; /* 2 = dnode->indirect->data */ + uint8_t doi_checksum; + uint8_t doi_compress; + uint8_t doi_nblkptr; + uint8_t doi_pad[4]; + uint64_t doi_dnodesize; + uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ + uint64_t doi_max_offset; + uint64_t doi_fill_count; /* number of non-empty blocks */ +} dmu_object_info_t; + +typedef void (*const arc_byteswap_func_t)(void *buf, size_t size); + +typedef struct dmu_object_type_info { + dmu_object_byteswap_t ot_byteswap; + boolean_t ot_metadata; + boolean_t ot_dbuf_metadata_cache; + boolean_t ot_encrypt; + char *ot_name; +} dmu_object_type_info_t; + +typedef const struct dmu_object_byteswap_info { + arc_byteswap_func_t ob_func; + char *ob_name; +} dmu_object_byteswap_info_t; + +extern const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES]; +extern const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS]; + +/* + * Get information on a DMU object. + * + * Return 0 on success or ENOENT if object is not allocated. + * + * If doi is NULL, just indicates whether the object exists. + */ +int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi); +void __dmu_object_info_from_dnode(struct dnode *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dnode in hand. */ +void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi); +/* Like dmu_object_info, but faster if you have a held dbuf in hand. */ +void dmu_object_info_from_db(dmu_buf_t *db, dmu_object_info_t *doi); +/* + * Like dmu_object_info_from_db, but faster still when you only care about + * the size. + */ +void dmu_object_size_from_db(dmu_buf_t *db, uint32_t *blksize, + u_longlong_t *nblk512); + +void dmu_object_dnsize_from_db(dmu_buf_t *db, int *dnsize); + +typedef struct dmu_objset_stats { + uint64_t dds_num_clones; /* number of clones of this */ + uint64_t dds_creation_txg; + uint64_t dds_guid; + dmu_objset_type_t dds_type; + uint8_t dds_is_snapshot; + uint8_t dds_inconsistent; + uint8_t dds_redacted; + char dds_origin[ZFS_MAX_DATASET_NAME_LEN]; +} dmu_objset_stats_t; + +/* + * Get stats on a dataset. + */ +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); + +/* + * Add entries to the nvlist for all the objset's properties. See + * zfs_prop_table[] and zfs(1m) for details on the properties. + */ +void dmu_objset_stats(objset_t *os, struct nvlist *nv); + +/* + * Get the space usage statistics for statvfs(). + * + * refdbytes is the amount of space "referenced" by this objset. + * availbytes is the amount of space available to this objset, taking + * into account quotas & reservations, assuming that no other objsets + * use the space first. These values correspond to the 'referenced' and + * 'available' properties, described in the zfs(1m) manpage. + * + * usedobjs and availobjs are the number of objects currently allocated, + * and available. + */ +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); + +/* + * The fsid_guid is a 56-bit ID that can change to avoid collisions. + * (Contrast with the ds_guid which is a 64-bit ID that will never + * change, so there is a small probability that it will collide.) 
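+ *
+ * An editor's illustrative sketch of the statvfs-style query described
+ * above ("statp" and "bshift" stand in for the caller's statvfs buffer
+ * and block size shift; error handling elided):
+ *
+ *	uint64_t refd, avail, usedobjs, availobjs;
+ *
+ *	dmu_objset_space(os, &refd, &avail, &usedobjs, &availobjs);
+ *	statp->f_blocks = (refd + avail) >> bshift;
+ *	statp->f_bfree = avail >> bshift;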
+ */ +uint64_t dmu_objset_fsid_guid(objset_t *os); + +/* + * Get the [cm]time for an objset's snapshot dir + */ +inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); + +int dmu_objset_is_snapshot(objset_t *os); + +extern struct spa *dmu_objset_spa(objset_t *os); +extern struct zilog *dmu_objset_zil(objset_t *os); +extern struct dsl_pool *dmu_objset_pool(objset_t *os); +extern struct dsl_dataset *dmu_objset_ds(objset_t *os); +extern void dmu_objset_name(objset_t *os, char *buf); +extern dmu_objset_type_t dmu_objset_type(objset_t *os); +extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_dnodesize(objset_t *os); +extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os); +extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os); +extern int dmu_objset_blksize(objset_t *os); +extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *id, uint64_t *offp, boolean_t *case_conflict); +extern int dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *val); +extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, + int maxlen, boolean_t *conflict); +extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp); + +typedef struct zfs_file_info { + uint64_t zfi_user; + uint64_t zfi_group; + uint64_t zfi_project; + uint64_t zfi_generation; +} zfs_file_info_t; + +typedef int file_info_cb_t(dmu_object_type_t bonustype, const void *data, + struct zfs_file_info *zoi); +extern void dmu_objset_register_type(dmu_objset_type_t ost, + file_info_cb_t *cb); +extern void dmu_objset_set_user(objset_t *os, void *user_ptr); +extern void *dmu_objset_get_user(objset_t *os); + +/* + * Return the txg number for the given assigned transaction. + */ +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); + +/* + * Synchronous write. + * If a parent zio is provided this function initiates a write on the + * provided buffer as a child of the parent zio. + * In the absence of a parent zio, the write is completed synchronously. + * At write completion, blk is filled with the bp of the written block. + * Note that while the data covered by this function will be on stable + * storage when the write completes this new data does not become a + * permanent part of the file until the associated transaction commits. + */ + +/* + * {zfs,zvol,ztest}_get_done() args + */ +typedef struct zgd { + struct lwb *zgd_lwb; + struct blkptr *zgd_bp; + dmu_buf_t *zgd_db; + struct zfs_locked_range *zgd_lr; + void *zgd_private; +} zgd_t; + +typedef void dmu_sync_cb_t(zgd_t *arg, int error); +int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); + +/* + * Find the next hole or data block in file starting at *off + * Return found offset in *off. Return ESRCH for end of file. + */ +int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, + uint64_t *off); + +/* + * Initial setup and final teardown. 
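+ *
+ * An editor's illustrative note on dmu_offset_next() above: a
+ * SEEK_HOLE-style probe can be sketched as
+ *
+ *	uint64_t off = start;
+ *	int error = dmu_offset_next(os, object, B_TRUE, &off);
+ *
+ * where error == 0 leaves the offset of the next hole in "off", and
+ * ESRCH means no hole was found before end-of-file.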
+ */ +extern void dmu_init(void); +extern void dmu_fini(void); + +typedef void (*dmu_traverse_cb_t)(objset_t *os, void *arg, struct blkptr *bp, + uint64_t object, uint64_t offset, int len); +void dmu_traverse_objset(objset_t *os, uint64_t txg_start, + dmu_traverse_cb_t cb, void *arg); + +int dmu_diff(const char *tosnap_name, const char *fromsnap_name, + zfs_file_t *fp, offset_t *offp); + +/* CRC64 table */ +#define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ +extern uint64_t zfs_crc64_table[256]; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_H */ diff --git a/include/sys/dmu_impl.h b/include/sys/dmu_impl.h new file mode 100644 index 000000000000..0c6273a3a727 --- /dev/null +++ b/include/sys/dmu_impl.h @@ -0,0 +1,264 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DMU_IMPL_H +#define _SYS_DMU_IMPL_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * This is the locking strategy for the DMU. Numbers in parenthesis are + * cases that use that lock order, referenced below: + * + * ARC is self-contained + * bplist is self-contained + * refcount is self-contained + * txg is self-contained (hopefully!) + * zst_lock + * zf_rwlock + * + * XXX try to improve evicting path? + * + * dp_config_rwlock > os_obj_lock > dn_struct_rwlock > + * dn_dbufs_mtx > hash_mutexes > db_mtx > dd_lock > leafs + * + * dp_config_rwlock + * must be held before: everything + * protects dd namespace changes + * protects property changes globally + * held from: + * dsl_dir_open/r: + * dsl_dir_create_sync/w: + * dsl_dir_sync_destroy/w: + * dsl_dir_rename_sync/w: + * dsl_prop_changed_notify/r: + * + * os_obj_lock + * must be held before: + * everything except dp_config_rwlock + * protects os_obj_next + * held from: + * dmu_object_alloc: dn_dbufs_mtx, db_mtx, hash_mutexes, dn_struct_rwlock + * + * dn_struct_rwlock + * must be held before: + * everything except dp_config_rwlock and os_obj_lock + * protects structure of dnode (eg. nlevels) + * db_blkptr can change when syncing out change to nlevels + * dn_maxblkid + * dn_nlevels + * dn_*blksz* + * phys nlevels, maxblkid, physical blkptr_t's (?) + * held from: + * callers of dbuf_read_impl, dbuf_hold[_impl], dbuf_prefetch + * dmu_object_info_from_dnode: dn_dirty_mtx (dn_datablksz) + * dbuf_read_impl: db_mtx, dmu_zfetch() + * dmu_zfetch: zf_rwlock/r, zst_lock, dbuf_prefetch() + * dbuf_new_size: db_mtx + * dbuf_dirty: db_mtx + * dbuf_findbp: (callers, phys? 
- the real need) + * dbuf_create: dn_dbufs_mtx, hash_mutexes, db_mtx (phys?) + * dbuf_prefetch: dn_dirty_mtx, hash_mutexes, db_mtx, dn_dbufs_mtx + * dbuf_hold_impl: hash_mutexes, db_mtx, dn_dbufs_mtx, dbuf_findbp() + * dnode_sync/w (increase_indirection): db_mtx (phys) + * dnode_set_blksz/w: dn_dbufs_mtx (dn_*blksz*) + * dnode_new_blkid/w: (dn_maxblkid) + * dnode_free_range/w: dn_dirty_mtx (dn_maxblkid) + * dnode_next_offset: (phys) + * + * dn_dbufs_mtx + * must be held before: + * db_mtx, hash_mutexes + * protects: + * dn_dbufs + * dn_evicted + * held from: + * dmu_evict_user: db_mtx (dn_dbufs) + * dbuf_free_range: db_mtx (dn_dbufs) + * dbuf_remove_ref: db_mtx, callees: + * dbuf_hash_remove: hash_mutexes, db_mtx + * dbuf_create: hash_mutexes, db_mtx (dn_dbufs) + * dnode_set_blksz: (dn_dbufs) + * + * hash_mutexes (global) + * must be held before: + * db_mtx + * protects dbuf_hash_table (global) and db_hash_next + * held from: + * dbuf_find: db_mtx + * dbuf_hash_insert: db_mtx + * dbuf_hash_remove: db_mtx + * + * db_mtx (meta-leaf) + * must be held before: + * dn_mtx, dn_dirty_mtx, dd_lock (leaf mutexes) + * protects: + * db_state + * db_holds + * db_buf + * db_changed + * db_data_pending + * db_dirtied + * db_link + * db_dirty_node (??) + * db_dirtycnt + * db_d.* + * db.* + * held from: + * dbuf_dirty: dn_mtx, dn_dirty_mtx + * dbuf_dirty->dsl_dir_willuse_space: dd_lock + * dbuf_dirty->dbuf_new_block->dsl_dataset_block_freeable: dd_lock + * dbuf_undirty: dn_dirty_mtx (db_d) + * dbuf_write_done: dn_dirty_mtx (db_state) + * dbuf_* + * dmu_buf_update_user: none (db_d) + * dmu_evict_user: none (db_d) (maybe can eliminate) + * dbuf_find: none (db_holds) + * dbuf_hash_insert: none (db_holds) + * dmu_buf_read_array_impl: none (db_state, db_changed) + * dmu_sync: none (db_dirty_node, db_d) + * dnode_reallocate: none (db) + * + * dn_mtx (leaf) + * protects: + * dn_dirty_dbufs + * dn_ranges + * phys accounting + * dn_allocated_txg + * dn_free_txg + * dn_assigned_txg + * dn_dirty_txg + * dd_assigned_tx + * dn_notxholds + * dn_nodnholds + * dn_dirtyctx + * dn_dirtyctx_firstset + * (dn_phys copy fields?) + * (dn_phys contents?) + * held from: + * dnode_* + * dbuf_dirty: none + * dbuf_sync: none (phys accounting) + * dbuf_undirty: none (dn_ranges, dn_dirty_dbufs) + * dbuf_write_done: none (phys accounting) + * dmu_object_info_from_dnode: none (accounting) + * dmu_tx_commit: none + * dmu_tx_hold_object_impl: none + * dmu_tx_try_assign: dn_notxholds(cv) + * dmu_tx_unassign: none + * + * dd_lock + * must be held before: + * ds_lock + * ancestors' dd_lock + * protects: + * dd_prop_cbs + * dd_sync_* + * dd_used_bytes + * dd_tempreserved + * dd_space_towrite + * dd_myname + * dd_phys accounting? 
+ * held from: + * dsl_dir_* + * dsl_prop_changed_notify: none (dd_prop_cbs) + * dsl_prop_register: none (dd_prop_cbs) + * dsl_prop_unregister: none (dd_prop_cbs) + * + * os_lock (leaf) + * protects: + * os_dirty_dnodes + * os_free_dnodes + * os_dnodes + * os_downgraded_dbufs + * dn_dirtyblksz + * dn_dirty_link + * held from: + * dnode_create: none (os_dnodes) + * dnode_destroy: none (os_dnodes) + * dnode_setdirty: none (dn_dirtyblksz, os_*_dnodes) + * dnode_free: none (dn_dirtyblksz, os_*_dnodes) + * + * ds_lock + * protects: + * ds_objset + * ds_open_refcount + * ds_snapname + * ds_phys accounting + * ds_phys userrefs zapobj + * ds_reserved + * held from: + * dsl_dataset_* + * + * dr_mtx (leaf) + * protects: + * dr_children + * held from: + * dbuf_dirty + * dbuf_undirty + * dbuf_sync_indirect + * dnode_new_blkid + */ + +struct objset; +struct dmu_pool; + +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct dmu_sendstatus { + list_node_t dss_link; + int dss_outfd; + proc_t *dss_proc; + offset_t *dss_off; + uint64_t dss_blocks; /* blocks visited during the sending process */ +} dmu_sendstatus_t; + +void dmu_object_zapify(objset_t *, uint64_t, dmu_object_type_t, dmu_tx_t *); +void dmu_object_free_zapified(objset_t *, uint64_t, dmu_tx_t *); +int dmu_buf_hold_noread(objset_t *, uint64_t, uint64_t, + void *, dmu_buf_t **); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_IMPL_H */ diff --git a/include/sys/dmu_objset.h b/include/sys/dmu_objset.h new file mode 100644 index 000000000000..1af69832c5d3 --- /dev/null +++ b/include/sys/dmu_objset.h @@ -0,0 +1,273 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_DMU_OBJSET_H +#define _SYS_DMU_OBJSET_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern krwlock_t os_lock; + +struct dsl_pool; +struct dsl_dataset; +struct dmu_tx; + +#define OBJSET_PHYS_SIZE_V1 1024 +#define OBJSET_PHYS_SIZE_V2 2048 +#define OBJSET_PHYS_SIZE_V3 4096 + +#define OBJSET_BUF_HAS_USERUSED(buf) \ + (arc_buf_size(buf) >= OBJSET_PHYS_SIZE_V2) +#define OBJSET_BUF_HAS_PROJECTUSED(buf) \ + (arc_buf_size(buf) >= OBJSET_PHYS_SIZE_V3) + +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL << 0) +#define OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE (1ULL << 1) +#define OBJSET_FLAG_PROJECTQUOTA_COMPLETE (1ULL << 2) + +/* + * This mask defines the set of flags which are "portable", meaning + * that they can be preserved when doing a raw encrypted zfs send. + * Flags included in this mask will be protected by os_portable_mac + * when the block of dnodes is encrypted. No portable flags currently + * exist. + */ +#define OBJSET_CRYPT_PORTABLE_FLAGS_MASK (0) + +typedef struct objset_phys { + dnode_phys_t os_meta_dnode; + zil_header_t os_zil_header; + uint64_t os_type; + uint64_t os_flags; + uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN]; + uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN]; + char os_pad0[OBJSET_PHYS_SIZE_V2 - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2 - + 2*ZIO_OBJSET_MAC_LEN]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; + dnode_phys_t os_projectused_dnode; + char os_pad1[OBJSET_PHYS_SIZE_V3 - OBJSET_PHYS_SIZE_V2 - + sizeof (dnode_phys_t)]; +} objset_phys_t; + +typedef int (*dmu_objset_upgrade_cb_t)(objset_t *); + +#define OBJSET_PROP_UNINITIALIZED ((uint64_t)-1) +struct objset { + /* Immutable: */ + struct dsl_dataset *os_dsl_dataset; + spa_t *os_spa; + arc_buf_t *os_phys_buf; + objset_phys_t *os_phys; + boolean_t os_encrypted; + + /* + * The following "special" dnodes have no parent, are exempt + * from dnode_move(), and are not recorded in os_dnodes, but they + * root their descendents in this objset using handles anyway, so + * that all access to dnodes from dbufs consistently uses handles. + */ + dnode_handle_t os_meta_dnode; + dnode_handle_t os_userused_dnode; + dnode_handle_t os_groupused_dnode; + dnode_handle_t os_projectused_dnode; + zilog_t *os_zil; + + list_node_t os_evicting_node; + + /* can change, under dsl_dir's locks: */ + uint64_t os_dnodesize; /* default dnode size for new objects */ + enum zio_checksum os_checksum; + enum zio_compress os_compress; + uint8_t os_complevel; + uint8_t os_copies; + enum zio_checksum os_dedup_checksum; + boolean_t os_dedup_verify; + zfs_logbias_op_t os_logbias; + zfs_cache_type_t os_primary_cache; + zfs_cache_type_t os_secondary_cache; + zfs_sync_type_t os_sync; + zfs_redundant_metadata_type_t os_redundant_metadata; + uint64_t os_recordsize; + /* + * The next four values are used as a cache of whatever's on disk, and + * are initialized the first time these properties are queried. Before + * being initialized with their real values, their values are + * OBJSET_PROP_UNINITIALIZED. + */ + uint64_t os_version; + uint64_t os_normalization; + uint64_t os_utf8only; + uint64_t os_casesensitivity; + /* + * The largest zpl file block allowed in special class. + * cached here instead of zfsvfs for easier access. 
+ */ + int os_zpl_special_smallblock; + + /* + * Pointer is constant; the blkptr it points to is protected by + * os_dsl_dataset->ds_bp_rwlock + */ + blkptr_t *os_rootbp; + + /* no lock needed: */ + struct dmu_tx *os_synctx; /* XXX sketchy */ + zil_header_t os_zil_header; + multilist_t *os_synced_dnodes; + uint64_t os_flags; + uint64_t os_freed_dnodes; + boolean_t os_rescan_dnodes; + boolean_t os_raw_receive; + + /* os_phys_buf should be written raw next txg */ + boolean_t os_next_write_raw[TXG_SIZE]; + + /* Protected by os_obj_lock */ + kmutex_t os_obj_lock; + uint64_t os_obj_next_chunk; + + /* Per-CPU next object to allocate, protected by atomic ops. */ + uint64_t *os_obj_next_percpu; + int os_obj_next_percpu_len; + + /* Protected by os_lock */ + kmutex_t os_lock; + multilist_t *os_dirty_dnodes[TXG_SIZE]; + list_t os_dnodes; + list_t os_downgraded_dbufs; + + /* Protects changes to DMU_{USER,GROUP,PROJECT}USED_OBJECT */ + kmutex_t os_userused_lock; + + /* stuff we store for the user */ + kmutex_t os_user_ptr_lock; + void *os_user_ptr; + sa_os_t *os_sa; + + /* kernel thread to upgrade this dataset */ + kmutex_t os_upgrade_lock; + taskqid_t os_upgrade_id; + dmu_objset_upgrade_cb_t os_upgrade_cb; + boolean_t os_upgrade_exit; + int os_upgrade_status; +}; + +#define DMU_META_OBJSET 0 +#define DMU_META_DNODE_OBJECT 0 +#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) +#define DMU_META_DNODE(os) ((os)->os_meta_dnode.dnh_dnode) +#define DMU_USERUSED_DNODE(os) ((os)->os_userused_dnode.dnh_dnode) +#define DMU_GROUPUSED_DNODE(os) ((os)->os_groupused_dnode.dnh_dnode) +#define DMU_PROJECTUSED_DNODE(os) ((os)->os_projectused_dnode.dnh_dnode) + +#define DMU_OS_IS_L2CACHEABLE(os) \ + ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ + (os)->os_secondary_cache == ZFS_CACHE_METADATA) + +/* called from zpl */ +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, + objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp); +int dmu_objset_own_obj(struct dsl_pool *dp, uint64_t obj, + dmu_objset_type_t type, boolean_t readonly, boolean_t decrypt, + void *tag, objset_t **osp); +void dmu_objset_refresh_ownership(struct dsl_dataset *ds, + struct dsl_dataset **newds, boolean_t decrypt, void *tag); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag); +void dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag); +int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); + +void dmu_objset_stats(objset_t *os, nvlist_t *nv); +void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); +void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dmu_objset_fsid_guid(objset_t *os); +int dmu_objset_find_dp(struct dsl_pool *dp, uint64_t ddobj, + int func(struct dsl_pool *, struct dsl_dataset *, void *), + void *arg, int flags); +void dmu_objset_evict_dbufs(objset_t *os); +inode_timespec_t dmu_objset_snap_cmtime(objset_t *os); + +/* called from dsl */ +void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); +boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +objset_t *dmu_objset_create_impl_dnstats(spa_t *spa, struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, int levels, int blksz, int ibs, + dmu_tx_t *tx); +objset_t *dmu_objset_create_impl(spa_t *spa, 
struct dsl_dataset *ds, + blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); +int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, + objset_t **osp); +void dmu_objset_evict(objset_t *os); +void dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx); +void dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_t *os); +int dmu_objset_userspace_upgrade(objset_t *os); +boolean_t dmu_objset_userspace_present(objset_t *os); +boolean_t dmu_objset_userobjused_enabled(objset_t *os); +boolean_t dmu_objset_userobjspace_upgradable(objset_t *os); +boolean_t dmu_objset_userobjspace_present(objset_t *os); +boolean_t dmu_objset_incompatible_encryption_version(objset_t *os); +boolean_t dmu_objset_projectquota_enabled(objset_t *os); +boolean_t dmu_objset_projectquota_present(objset_t *os); +boolean_t dmu_objset_projectquota_upgradable(objset_t *os); +void dmu_objset_id_quota_upgrade(objset_t *os); +int dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, + const void *data, zfs_file_info_t *zfi); + +int dmu_fsname(const char *snapname, char *buf); + +void dmu_objset_evict_done(objset_t *os); +void dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx); + +void dmu_objset_init(void); +void dmu_objset_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_OBJSET_H */ diff --git a/include/sys/dmu_recv.h b/include/sys/dmu_recv.h new file mode 100644 index 000000000000..dacc6b7829da --- /dev/null +++ b/include/sys/dmu_recv.h @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _DMU_RECV_H +#define _DMU_RECV_H + +#include +#include +#include +#include +#include +#include + +extern const char *recv_clone_name; + +typedef struct dmu_recv_cookie { + struct dsl_dataset *drc_ds; + struct dmu_replay_record *drc_drr_begin; + struct drr_begin *drc_drrb; + const char *drc_tofs; + const char *drc_tosnap; + boolean_t drc_newfs; + boolean_t drc_byteswap; + uint64_t drc_featureflags; + boolean_t drc_force; + boolean_t drc_resumable; + boolean_t drc_should_save; + boolean_t drc_raw; + boolean_t drc_clone; + boolean_t drc_spill; + nvlist_t *drc_keynvl; + uint64_t drc_fromsnapobj; + uint64_t drc_ivset_guid; + void *drc_owner; + cred_t *drc_cred; + proc_t *drc_proc; + nvlist_t *drc_begin_nvl; + + objset_t *drc_os; + zfs_file_t *drc_fp; /* The file to read the stream from */ + uint64_t drc_voff; /* The current offset in the stream */ + uint64_t drc_bytes_read; + /* + * A record that has had its payload read in, but hasn't yet been handed + * off to the worker thread. + */ + struct receive_record_arg *drc_rrd; + /* A record that has had its header read in, but not its payload. */ + struct receive_record_arg *drc_next_rrd; + zio_cksum_t drc_cksum; + zio_cksum_t drc_prev_cksum; + /* Sorted list of objects not to issue prefetches for. */ + objlist_t *drc_ignore_objlist; +} dmu_recv_cookie_t; + +int dmu_recv_begin(char *, char *, dmu_replay_record_t *, + boolean_t, boolean_t, nvlist_t *, nvlist_t *, char *, + dmu_recv_cookie_t *, zfs_file_t *, offset_t *); +int dmu_recv_stream(dmu_recv_cookie_t *, offset_t *); +int dmu_recv_end(dmu_recv_cookie_t *, void *); +boolean_t dmu_objset_is_receiving(objset_t *); + +#endif /* _DMU_RECV_H */ diff --git a/include/sys/dmu_redact.h b/include/sys/dmu_redact.h new file mode 100644 index 000000000000..207fdbb5cfda --- /dev/null +++ b/include/sys/dmu_redact.h @@ -0,0 +1,58 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
+ */ +#ifndef _DMU_REDACT_H_ +#define _DMU_REDACT_H_ + +#include +#include + +#define REDACT_BLOCK_MAX_COUNT (1ULL << 48) + +static inline uint64_t +redact_block_get_size(redact_block_phys_t *rbp) +{ + return (BF64_GET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, + 0)); +} + +static inline void +redact_block_set_size(redact_block_phys_t *rbp, uint64_t size) +{ + BF64_SET_SB((rbp)->rbp_size_count, 48, 16, SPA_MINBLOCKSHIFT, 0, size); +} + +static inline uint64_t +redact_block_get_count(redact_block_phys_t *rbp) +{ + return (BF64_GET_SB((rbp)->rbp_size_count, 0, 48, 0, 1)); +} + +static inline void +redact_block_set_count(redact_block_phys_t *rbp, uint64_t count) +{ + BF64_SET_SB((rbp)->rbp_size_count, 0, 48, 0, 1, count); +} + +int dmu_redact_snap(const char *, nvlist_t *, const char *); +#endif /* _DMU_REDACT_H_ */ diff --git a/include/sys/dmu_send.h b/include/sys/dmu_send.h new file mode 100644 index 000000000000..d6d050e01f97 --- /dev/null +++ b/include/sys/dmu_send.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _DMU_SEND_H +#define _DMU_SEND_H + +#include +#include +#include +#include +#include +#include +#include + +#define BEGINNV_REDACT_SNAPS "redact_snaps" +#define BEGINNV_REDACT_FROM_SNAPS "redact_from_snaps" +#define BEGINNV_RESUME_OBJECT "resume_object" +#define BEGINNV_RESUME_OFFSET "resume_offset" + +struct vnode; +struct dsl_dataset; +struct drr_begin; +struct avl_tree; +struct dmu_replay_record; +struct dmu_send_outparams; +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, + boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook, int outfd, offset_t *off, + struct dmu_send_outparams *dsop); +int dmu_send_estimate_fast(struct dsl_dataset *ds, struct dsl_dataset *fromds, + zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, + boolean_t saved, uint64_t *sizep); +int dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, + boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, + struct dmu_send_outparams *dso); + +typedef int (*dmu_send_outfunc_t)(objset_t *os, void *buf, int len, void *arg); +typedef struct dmu_send_outparams { + dmu_send_outfunc_t dso_outfunc; + void *dso_arg; + boolean_t dso_dryrun; +} dmu_send_outparams_t; + +#endif /* _DMU_SEND_H */ diff --git a/include/sys/dmu_traverse.h b/include/sys/dmu_traverse.h new file mode 100644 index 000000000000..d76bfe3c9af3 --- /dev/null +++ b/include/sys/dmu_traverse.h @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DMU_TRAVERSE_H +#define _SYS_DMU_TRAVERSE_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dnode_phys; +struct dsl_dataset; +struct zilog; +struct arc_buf; + +typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg); + +#define TRAVERSE_PRE (1<<0) +#define TRAVERSE_POST (1<<1) +#define TRAVERSE_PREFETCH_METADATA (1<<2) +#define TRAVERSE_PREFETCH_DATA (1<<3) +#define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) +#define TRAVERSE_HARD (1<<4) + +/* + * Encrypted dnode blocks have encrypted bonus buffers while the rest + * of the dnode is left unencrypted. Callers can specify the + * TRAVERSE_NO_DECRYPT flag to indicate to the traversal code that + * they wish to receive the raw encrypted dnodes instead of attempting + * to read the logical data. 
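+ *
+ * An editor's illustrative sketch of a traversal using the blkptr_cb_t
+ * signature above ("count_bps_cb" and its counter argument are
+ * hypothetical; error handling elided; traverse_dataset() is declared
+ * below):
+ *
+ *	static int
+ *	count_bps_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ *	    const zbookmark_phys_t *zb, const struct dnode_phys *dnp,
+ *	    void *arg)
+ *	{
+ *		uint64_t *count = arg;
+ *
+ *		if (bp != NULL && !BP_IS_HOLE(bp))
+ *			(*count)++;
+ *		return (0);
+ *	}
+ *
+ *	uint64_t count = 0;
+ *	int error = traverse_dataset(ds, 0, TRAVERSE_PRE, count_bps_cb,
+ *	    &count);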
+ */ +#define TRAVERSE_NO_DECRYPT (1<<5) + +/* Special traverse error return value to indicate skipping of children */ +#define TRAVERSE_VISIT_NO_CHILDREN -1 + +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_resume(struct dsl_dataset *ds, uint64_t txg_start, + zbookmark_phys_t *resume, int flags, blkptr_cb_t func, void *arg); +int traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); + +/* + * Note that this calculation cannot overflow with the current maximum indirect + * block size (128k). If that maximum is increased to 1M, however, this + * calculation can overflow, and handling would need to be added to ensure + * continued correctness. + */ +static inline uint64_t +bp_span_in_blocks(uint8_t indblkshift, uint64_t level) +{ + unsigned int shift = level * (indblkshift - SPA_BLKPTRSHIFT); + ASSERT3U(shift, <, 64); + return (1ULL << shift); +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TRAVERSE_H */ diff --git a/include/sys/dmu_tx.h b/include/sys/dmu_tx.h new file mode 100644 index 000000000000..60e9ed6e26f5 --- /dev/null +++ b/include/sys/dmu_tx.h @@ -0,0 +1,177 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DMU_TX_H +#define _SYS_DMU_TX_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf_impl; +struct dmu_tx_hold; +struct dnode_link; +struct dsl_pool; +struct dnode; +struct dsl_dir; + +struct dmu_tx { + /* + * No synchronization is needed because a tx can only be handled + * by one thread. + */ + list_t tx_holds; /* list of dmu_tx_hold_t */ + objset_t *tx_objset; + struct dsl_dir *tx_dir; + struct dsl_pool *tx_pool; + uint64_t tx_txg; + uint64_t tx_lastsnap_txg; + uint64_t tx_lasttried_txg; + txg_handle_t tx_txgh; + void *tx_tempreserve_cookie; + struct dmu_tx_hold *tx_needassign_txh; + + /* list of dmu_tx_callback_t on this dmu_tx */ + list_t tx_callbacks; + + /* placeholder for syncing context, doesn't need specific holds */ + boolean_t tx_anyobj; + + /* transaction is marked as being a "net free" of space */ + boolean_t tx_netfree; + + /* time this transaction was created */ + hrtime_t tx_start; + + /* need to wait for sufficient dirty space */ + boolean_t tx_wait_dirty; + + /* has this transaction already been delayed? 
*/ + boolean_t tx_dirty_delayed; + + int tx_err; +}; + +enum dmu_tx_hold_type { + THT_NEWOBJECT, + THT_WRITE, + THT_BONUS, + THT_FREE, + THT_ZAP, + THT_SPACE, + THT_SPILL, + THT_NUMTYPES +}; + +typedef struct dmu_tx_hold { + dmu_tx_t *txh_tx; + list_node_t txh_node; + struct dnode *txh_dnode; + zfs_refcount_t txh_space_towrite; + zfs_refcount_t txh_memory_tohold; + enum dmu_tx_hold_type txh_type; + uint64_t txh_arg1; + uint64_t txh_arg2; +} dmu_tx_hold_t; + +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; + +/* + * Used for dmu tx kstat. + */ +typedef struct dmu_tx_stats { + kstat_named_t dmu_tx_assigned; + kstat_named_t dmu_tx_delay; + kstat_named_t dmu_tx_error; + kstat_named_t dmu_tx_suspended; + kstat_named_t dmu_tx_group; + kstat_named_t dmu_tx_memory_reserve; + kstat_named_t dmu_tx_memory_reclaim; + kstat_named_t dmu_tx_dirty_throttle; + kstat_named_t dmu_tx_dirty_delay; + kstat_named_t dmu_tx_dirty_over_max; + kstat_named_t dmu_tx_dirty_frees_delay; + kstat_named_t dmu_tx_quota; +} dmu_tx_stats_t; + +extern dmu_tx_stats_t dmu_tx_stats; + +#define DMU_TX_STAT_INCR(stat, val) \ + atomic_add_64(&dmu_tx_stats.stat.value.ui64, (val)); +#define DMU_TX_STAT_BUMP(stat) \ + DMU_TX_STAT_INCR(stat, 1); + +/* + * These routines are defined in dmu.h, and are called by the user. + */ +dmu_tx_t *dmu_tx_create(objset_t *dd); +int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); +void dmu_tx_commit(dmu_tx_t *tx); +void dmu_tx_abort(dmu_tx_t *tx); +uint64_t dmu_tx_get_txg(dmu_tx_t *tx); +struct dsl_pool *dmu_tx_pool(dmu_tx_t *tx); +void dmu_tx_wait(dmu_tx_t *tx); + +/* + * These routines are defined in dmu_spa.h, and are called by the SPA. + */ +extern dmu_tx_t *dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg); + +/* + * These routines are only called by the DMU. + */ +dmu_tx_t *dmu_tx_create_dd(dsl_dir_t *dd); +int dmu_tx_is_syncing(dmu_tx_t *tx); +int dmu_tx_private_ok(dmu_tx_t *tx); +void dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn); +void dmu_tx_dirty_buf(dmu_tx_t *tx, struct dmu_buf_impl *db); +void dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space); + +#ifdef ZFS_DEBUG +#define DMU_TX_DIRTY_BUF(tx, db) dmu_tx_dirty_buf(tx, db) +#else +#define DMU_TX_DIRTY_BUF(tx, db) +#endif + +void dmu_tx_init(void); +void dmu_tx_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DMU_TX_H */ diff --git a/include/sys/dmu_zfetch.h b/include/sys/dmu_zfetch.h new file mode 100644 index 000000000000..4303ab314ced --- /dev/null +++ b/include/sys/dmu_zfetch.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + */ + +#ifndef _DMU_ZFETCH_H +#define _DMU_ZFETCH_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern unsigned long zfetch_array_rd_sz; + +struct dnode; /* so we can reference dnode */ + +typedef struct zstream { + uint64_t zs_blkid; /* expect next access at this blkid */ + uint64_t zs_pf_blkid; /* next block to prefetch */ + + /* + * We will next prefetch the L1 indirect block of this level-0 + * block id. + */ + uint64_t zs_ipf_blkid; + + kmutex_t zs_lock; /* protects stream */ + hrtime_t zs_atime; /* time last prefetch issued */ + list_node_t zs_node; /* link for zf_stream */ +} zstream_t; + +typedef struct zfetch { + kmutex_t zf_lock; /* protects zfetch structure */ + list_t zf_stream; /* list of zstream_t's */ + struct dnode *zf_dnode; /* dnode that owns this zfetch */ +} zfetch_t; + +void zfetch_init(void); +void zfetch_fini(void); + +void dmu_zfetch_init(zfetch_t *, struct dnode *); +void dmu_zfetch_fini(zfetch_t *); +void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t, + boolean_t); + + +#ifdef __cplusplus +} +#endif + +#endif /* _DMU_ZFETCH_H */ diff --git a/include/sys/dnode.h b/include/sys/dnode.h new file mode 100644 index 000000000000..3208b60f0e7b --- /dev/null +++ b/include/sys/dnode.h @@ -0,0 +1,627 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_DNODE_H +#define _SYS_DNODE_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * dnode_hold() flags. + */ +#define DNODE_MUST_BE_ALLOCATED 1 +#define DNODE_MUST_BE_FREE 2 +#define DNODE_DRY_RUN 4 + +/* + * dnode_next_offset() flags. + */ +#define DNODE_FIND_HOLE 1 +#define DNODE_FIND_BACKWARDS 2 +#define DNODE_FIND_HAVELOCK 4 + +/* + * Fixed constants. + */ +#define DNODE_SHIFT 9 /* 512 bytes */ +#define DN_MIN_INDBLKSHIFT 12 /* 4k */ +/* + * If we ever increase this value beyond 20, we need to revisit all logic that + * does x << level * ebps to handle overflow. 
With a 1M indirect block size, + * 4 levels of indirect blocks would not be able to guarantee addressing an + * entire object, so 5 levels will be used, but 5 * (20 - 7) = 65. + */ +#define DN_MAX_INDBLKSHIFT 17 /* 128k */ +#define DNODE_BLOCK_SHIFT 14 /* 16k */ +#define DNODE_CORE_SIZE 64 /* 64 bytes for dnode sans blkptrs */ +#define DN_MAX_OBJECT_SHIFT 48 /* 256 trillion (zfs_fid_t limit) */ +#define DN_MAX_OFFSET_SHIFT 64 /* 2^64 bytes in a dnode */ + +/* + * dnode id flags + * + * Note: a file will never ever have its ids moved from bonus->spill + */ +#define DN_ID_CHKED_BONUS 0x1 +#define DN_ID_CHKED_SPILL 0x2 +#define DN_ID_OLD_EXIST 0x4 +#define DN_ID_NEW_EXIST 0x8 + +/* + * Derived constants. + */ +#define DNODE_MIN_SIZE (1 << DNODE_SHIFT) +#define DNODE_MAX_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_BLOCK_SIZE (1 << DNODE_BLOCK_SHIFT) +#define DNODE_MIN_SLOTS (DNODE_MIN_SIZE >> DNODE_SHIFT) +#define DNODE_MAX_SLOTS (DNODE_MAX_SIZE >> DNODE_SHIFT) +#define DN_BONUS_SIZE(dnsize) ((dnsize) - DNODE_CORE_SIZE - \ + (1 << SPA_BLKPTRSHIFT)) +#define DN_SLOTS_TO_BONUSLEN(slots) DN_BONUS_SIZE((slots) << DNODE_SHIFT) +#define DN_OLD_MAX_BONUSLEN (DN_BONUS_SIZE(DNODE_MIN_SIZE)) +#define DN_MAX_NBLKPTR ((DNODE_MIN_SIZE - DNODE_CORE_SIZE) >> SPA_BLKPTRSHIFT) +#define DN_MAX_OBJECT (1ULL << DN_MAX_OBJECT_SHIFT) +#define DN_ZERO_BONUSLEN (DN_BONUS_SIZE(DNODE_MAX_SIZE) + 1) +#define DN_KILL_SPILLBLK (1) + +#define DN_SLOT_UNINIT ((void *)NULL) /* Uninitialized */ +#define DN_SLOT_FREE ((void *)1UL) /* Free slot */ +#define DN_SLOT_ALLOCATED ((void *)2UL) /* Allocated slot */ +#define DN_SLOT_INTERIOR ((void *)3UL) /* Interior allocated slot */ +#define DN_SLOT_IS_PTR(dn) ((void *)dn > DN_SLOT_INTERIOR) +#define DN_SLOT_IS_VALID(dn) ((void *)dn != NULL) + +#define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) +#define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) + +/* + * This is inaccurate if the indblkshift of the particular object is not the + * max. But it's only used by userland to calculate the zvol reservation. + */ +#define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) + +#define DN_MAX_LEVELS (DIV_ROUND_UP(DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT, \ + DN_MIN_INDBLKSHIFT - SPA_BLKPTRSHIFT) + 1) + +#define DN_BONUS(dnp) ((void*)((dnp)->dn_bonus + \ + (((dnp)->dn_nblkptr - 1) * sizeof (blkptr_t)))) +#define DN_MAX_BONUS_LEN(dnp) \ + ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) ? \ + (uint8_t *)DN_SPILL_BLKPTR(dnp) - (uint8_t *)DN_BONUS(dnp) : \ + (uint8_t *)(dnp + (dnp->dn_extra_slots + 1)) - (uint8_t *)DN_BONUS(dnp)) + +#define DN_USED_BYTES(dnp) (((dnp)->dn_flags & DNODE_FLAG_USED_BYTES) ? \ + (dnp)->dn_used : (dnp)->dn_used << SPA_MINBLOCKSHIFT) + +#define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) + +struct dmu_buf_impl; +struct objset; +struct zio; + +enum dnode_dirtycontext { + DN_UNDIRTIED, + DN_DIRTY_OPEN, + DN_DIRTY_SYNC +}; + +/* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ +#define DNODE_FLAG_USED_BYTES (1 << 0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1 << 1) + +/* Does dnode have a SA spill blkptr in bonus? */ +#define DNODE_FLAG_SPILL_BLKPTR (1 << 2) + +/* User/Group/Project dnode accounting */ +#define DNODE_FLAG_USEROBJUSED_ACCOUNTED (1 << 3) + +/* + * This mask defines the set of flags which are "portable", meaning + * that they can be preserved when doing a raw encrypted zfs send. 
+ * Flags included in this mask will be protected by AAD when the block
+ * of dnodes is encrypted.
+ */
+#define	DNODE_CRYPT_PORTABLE_FLAGS_MASK	(DNODE_FLAG_SPILL_BLKPTR)
+
+/*
+ * VARIABLE-LENGTH (LARGE) DNODES
+ *
+ * The motivation for variable-length dnodes is to eliminate the overhead
+ * associated with using spill blocks. Spill blocks are used to store
+ * system attribute data (i.e. file metadata) that does not fit in the
+ * dnode's bonus buffer. By allowing a larger bonus buffer area the use of
+ * a spill block can be avoided. Spill blocks potentially incur an
+ * additional read I/O for every dnode in a dnode block. As a worst case
+ * example, reading 32 dnodes from a 16k dnode block and all of the spill
+ * blocks could issue 33 separate reads. Now suppose those dnodes have size
+ * 1024 and therefore don't need spill blocks. Then the worst case number
+ * of blocks read is reduced from 33 to two--one per dnode block.
+ *
+ * ZFS-on-Linux systems that make heavy use of extended attributes benefit
+ * from this feature. In particular, ZFS-on-Linux supports the xattr=sa
+ * dataset property which allows file extended attribute data to be stored
+ * in the dnode bonus buffer as an alternative to the traditional
+ * directory-based format. Workloads such as SELinux and the Lustre
+ * distributed filesystem often store enough xattr data to force spill
+ * blocks when xattr=sa is in effect. Large dnodes may therefore provide a
+ * performance benefit to such systems. Other use cases that benefit from
+ * this feature include files with large ACLs and symbolic links with long
+ * target names.
+ *
+ * The size of a dnode may be a multiple of 512 bytes up to the size of a
+ * dnode block (currently 16384 bytes). The dn_extra_slots field of the
+ * on-disk dnode_phys_t structure describes the size of the physical dnode
+ * on disk. The field represents how many "extra" dnode_phys_t slots a
+ * dnode consumes in its dnode block. This convention results in a value of
+ * 0 for 512 byte dnodes which preserves on-disk format compatibility with
+ * older software which doesn't support large dnodes.
+ *
+ * Similarly, the in-memory dnode_t structure has a dn_num_slots field
+ * to represent the total number of dnode_phys_t slots consumed on disk.
+ * Thus dn->dn_num_slots is 1 greater than the corresponding
+ * dnp->dn_extra_slots. This difference in convention was adopted
+ * because, unlike on-disk structures, backward compatibility is not a
+ * concern for in-memory objects, so we used a more natural way to
+ * represent size for a dnode_t.
+ *
+ * The default size for newly created dnodes is determined by the value of
+ * the "dnodesize" dataset property. By default the property is set to
+ * "legacy" which is compatible with older software. Setting the property
+ * to "auto" will allow the filesystem to choose the most suitable dnode
+ * size. Currently this just sets the default dnode size to 1k, but future
+ * code improvements could dynamically choose a size based on observed
+ * workload patterns. Dnodes of varying sizes can coexist within the same
+ * dataset and even within the same dnode block.
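+ *
+ * As a concrete illustration (an editor's addition, derived from the
+ * DN_BONUS_SIZE() and DN_SLOTS_TO_BONUSLEN() macros above): a legacy
+ * 512 byte dnode offers DN_OLD_MAX_BONUSLEN == 512 - 64 - 128 == 320
+ * bytes of bonus space, while a two-slot (1024 byte) dnode offers
+ * DN_SLOTS_TO_BONUSLEN(2) == 1024 - 64 - 128 == 832 bytes, since only
+ * the 64 byte dnode core and one 128 byte block pointer are excluded:
+ *
+ *	ASSERT3U(DN_OLD_MAX_BONUSLEN, ==, 320);
+ *	ASSERT3U(DN_SLOTS_TO_BONUSLEN(2), ==, 832);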
+ */ + +typedef struct dnode_phys { + uint8_t dn_type; /* dmu_object_type_t */ + uint8_t dn_indblkshift; /* ln2(indirect block size) */ + uint8_t dn_nlevels; /* 1=dn_blkptr->data blocks */ + uint8_t dn_nblkptr; /* length of dn_blkptr */ + uint8_t dn_bonustype; /* type of data in bonus buffer */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_flags; /* DNODE_FLAG_* */ + uint16_t dn_datablkszsec; /* data block size in 512b sectors */ + uint16_t dn_bonuslen; /* length of dn_bonus */ + uint8_t dn_extra_slots; /* # of subsequent slots consumed */ + uint8_t dn_pad2[3]; + + /* accounting is protected by dn_dirty_mtx */ + uint64_t dn_maxblkid; /* largest allocated block ID */ + uint64_t dn_used; /* bytes (or sectors) of disk space */ + + /* + * Both dn_pad2 and dn_pad3 are protected by the block's MAC. This + * allows us to protect any fields that might be added here in the + * future. In either case, developers will want to check + * zio_crypt_init_uios_dnode() to ensure the new field is being + * protected properly. + */ + uint64_t dn_pad3[4]; + + /* + * The tail region is 448 bytes for a 512 byte dnode, and + * correspondingly larger for larger dnode sizes. The spill + * block pointer, when present, is always at the end of the tail + * region. There are three ways this space may be used, using + * a 512 byte dnode for this diagram: + * + * 0 64 128 192 256 320 384 448 (offset) + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_blkptr[1] | dn_blkptr[2] | / | + * +---------------+---------------+---------------+-------+ + * | dn_blkptr[0] | dn_bonus[0..319] | + * +---------------+-----------------------+---------------+ + * | dn_blkptr[0] | dn_bonus[0..191] | dn_spill | + * +---------------+-----------------------+---------------+ + */ + union { + blkptr_t dn_blkptr[1+DN_OLD_MAX_BONUSLEN/sizeof (blkptr_t)]; + struct { + blkptr_t __dn_ignore1; + uint8_t dn_bonus[DN_OLD_MAX_BONUSLEN]; + }; + struct { + blkptr_t __dn_ignore2; + uint8_t __dn_ignore3[DN_OLD_MAX_BONUSLEN - + sizeof (blkptr_t)]; + blkptr_t dn_spill; + }; + }; +} dnode_phys_t; + +#define DN_SPILL_BLKPTR(dnp) ((blkptr_t *)((char *)(dnp) + \ + (((dnp)->dn_extra_slots + 1) << DNODE_SHIFT) - (1 << SPA_BLKPTRSHIFT))) + +struct dnode { + /* + * Protects the structure of the dnode, including the number of levels + * of indirection (dn_nlevels), dn_maxblkid, and dn_next_* + */ + krwlock_t dn_struct_rwlock; + + /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ + list_node_t dn_link; + + /* immutable: */ + struct objset *dn_objset; + uint64_t dn_object; + struct dmu_buf_impl *dn_dbuf; + struct dnode_handle *dn_handle; + dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ + + /* + * Copies of stuff in dn_phys. They're valid in the open + * context (eg. even before the dnode is first synced). + * Where necessary, these are protected by dn_struct_rwlock. + */ + dmu_object_type_t dn_type; /* object type */ + uint16_t dn_bonuslen; /* bonus length */ + uint8_t dn_bonustype; /* bonus type */ + uint8_t dn_nblkptr; /* number of blkptrs (immutable) */ + uint8_t dn_checksum; /* ZIO_CHECKSUM type */ + uint8_t dn_compress; /* ZIO_COMPRESS type */ + uint8_t dn_nlevels; + uint8_t dn_indblkshift; + uint8_t dn_datablkshift; /* zero if blksz not power of 2! */ + uint8_t dn_moved; /* Has this dnode been moved? 
*/ + uint16_t dn_datablkszsec; /* in 512b sectors */ + uint32_t dn_datablksz; /* in bytes */ + uint64_t dn_maxblkid; + uint8_t dn_next_type[TXG_SIZE]; + uint8_t dn_num_slots; /* metadnode slots consumed on disk */ + uint8_t dn_next_nblkptr[TXG_SIZE]; + uint8_t dn_next_nlevels[TXG_SIZE]; + uint8_t dn_next_indblkshift[TXG_SIZE]; + uint8_t dn_next_bonustype[TXG_SIZE]; + uint8_t dn_rm_spillblk[TXG_SIZE]; /* for removing spill blk */ + uint16_t dn_next_bonuslen[TXG_SIZE]; + uint32_t dn_next_blksz[TXG_SIZE]; /* next block size in bytes */ + uint64_t dn_next_maxblkid[TXG_SIZE]; /* next maxblkid in bytes */ + + /* protected by dn_dbufs_mtx; declared here to fill 32-bit hole */ + uint32_t dn_dbufs_count; /* count of dn_dbufs */ + + /* protected by os_lock: */ + multilist_node_t dn_dirty_link[TXG_SIZE]; /* next on dataset's dirty */ + + /* protected by dn_mtx: */ + kmutex_t dn_mtx; + list_t dn_dirty_records[TXG_SIZE]; + struct range_tree *dn_free_ranges[TXG_SIZE]; + uint64_t dn_allocated_txg; + uint64_t dn_free_txg; + uint64_t dn_assigned_txg; + uint64_t dn_dirty_txg; /* txg dnode was last dirtied */ + kcondvar_t dn_notxholds; + kcondvar_t dn_nodnholds; + enum dnode_dirtycontext dn_dirtyctx; + void *dn_dirtyctx_firstset; /* dbg: contents meaningless */ + + /* protected by own devices */ + zfs_refcount_t dn_tx_holds; + zfs_refcount_t dn_holds; + + kmutex_t dn_dbufs_mtx; + /* + * Descendent dbufs, ordered by dbuf_compare. Note that dn_dbufs + * can contain multiple dbufs of the same (level, blkid) when a + * dbuf is marked DB_EVICTING without being removed from + * dn_dbufs. To maintain the avl invariant that there cannot be + * duplicate entries, we order the dbufs by an arbitrary value - + * their address in memory. This means that dn_dbufs cannot be used to + * directly look up a dbuf. Instead, callers must use avl_walk, have + * a reference to the dbuf, or look up a non-existent node with + * db_state = DB_SEARCH (see dbuf_free_range for an example). + */ + avl_tree_t dn_dbufs; + + /* protected by dn_struct_rwlock */ + struct dmu_buf_impl *dn_bonus; /* bonus buffer dbuf */ + + boolean_t dn_have_spill; /* have spill or are spilling */ + + /* parent IO for current sync write */ + zio_t *dn_zio; + + /* used in syncing context */ + uint64_t dn_oldused; /* old phys used bytes */ + uint64_t dn_oldflags; /* old phys dn_flags */ + uint64_t dn_olduid, dn_oldgid, dn_oldprojid; + uint64_t dn_newuid, dn_newgid, dn_newprojid; + int dn_id_flags; + + /* holds prefetch structure */ + struct zfetch dn_zfetch; +}; + +/* + * Since AVL already has embedded element counter, use dn_dbufs_count + * only for dbufs not counted there (bonus buffers) and just add them. + */ +#define DN_DBUFS_COUNT(dn) ((dn)->dn_dbufs_count + \ + avl_numnodes(&(dn)->dn_dbufs)) + +/* + * We use this (otherwise unused) bit to indicate if the value of + * dn_next_maxblkid[txgoff] is valid to use in dnode_sync(). + */ +#define DMU_NEXT_MAXBLKID_SET (1ULL << 63) + +/* + * Adds a level of indirection between the dbuf and the dnode to avoid + * iterating descendent dbufs in dnode_move(). Handles are not allocated + * individually, but as an array of child dnodes in dnode_hold_impl(). + */ +typedef struct dnode_handle { + /* Protects dnh_dnode from modification by dnode_move(). 
*/ + zrlock_t dnh_zrlock; + dnode_t *dnh_dnode; +} dnode_handle_t; + +typedef struct dnode_children { + dmu_buf_user_t dnc_dbu; /* User evict data */ + size_t dnc_count; /* number of children */ + dnode_handle_t dnc_children[]; /* sized dynamically */ +} dnode_children_t; + +typedef struct free_range { + avl_node_t fr_node; + uint64_t fr_blkid; + uint64_t fr_nblks; +} free_range_t; + +void dnode_special_open(struct objset *dd, dnode_phys_t *dnp, + uint64_t object, dnode_handle_t *dnh); +void dnode_special_close(dnode_handle_t *dnh); + +void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); +void dnode_setbonus_type(dnode_t *dn, dmu_object_type_t, dmu_tx_t *tx); +void dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx); + +int dnode_hold(struct objset *dd, uint64_t object, + void *ref, dnode_t **dnp); +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, int dn_slots, + void *ref, dnode_t **dnp); +boolean_t dnode_add_ref(dnode_t *dn, void *ref); +void dnode_rele(dnode_t *dn, void *ref); +void dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting); +int dnode_try_claim(objset_t *os, uint64_t object, int slots); +void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx); +void dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag); +void dnode_sync(dnode_t *dn, dmu_tx_t *tx); +void dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx); +void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, + boolean_t keep_spill, dmu_tx_t *tx); +void dnode_free(dnode_t *dn, dmu_tx_t *tx); +void dnode_byteswap(dnode_phys_t *dnp); +void dnode_buf_byteswap(void *buf, size_t size); +void dnode_verify(dnode_t *dn); +int dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx); +int dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx); +void dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx); +void dnode_diduse_space(dnode_t *dn, int64_t space); +void dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, + boolean_t have_read, boolean_t force); +uint64_t dnode_block_freed(dnode_t *dn, uint64_t blkid); +void dnode_init(void); +void dnode_fini(void); +int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off, + int minlvl, uint64_t blkfill, uint64_t txg); +void dnode_evict_dbufs(dnode_t *dn); +void dnode_evict_bonus(dnode_t *dn); +void dnode_free_interior_slots(dnode_t *dn); + +#define DNODE_IS_DIRTY(_dn) \ + ((_dn)->dn_dirty_txg >= spa_syncing_txg((_dn)->dn_objset->os_spa)) + +#define DNODE_IS_CACHEABLE(_dn) \ + ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (DMU_OT_IS_METADATA((_dn)->dn_type) && \ + (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)) + +#define DNODE_META_IS_CACHEABLE(_dn) \ + ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \ + (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA) + +/* + * Used for dnodestats kstat. + */ +typedef struct dnode_stats { + /* + * Number of failed attempts to hold a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_hold; + /* + * Number of failed attempts to read a meta dnode dbuf. + */ + kstat_named_t dnode_hold_dbuf_read; + /* + * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was able + * to hold the requested object number which was allocated. This is + * the common case when looking up any allocated object number. 
+	 */
+	kstat_named_t dnode_hold_alloc_hits;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+	 * able to hold the requested object number because it was not allocated.
+	 */
+	kstat_named_t dnode_hold_alloc_misses;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) was not
+	 * able to hold the requested object number because the object number
+	 * refers to an interior large dnode slot.
+	 */
+	kstat_named_t dnode_hold_alloc_interior;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) needed
+	 * to retry acquiring slot zrl locks due to contention.
+	 */
+	kstat_named_t dnode_hold_alloc_lock_retry;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) did not
+	 * need to create the dnode because another thread did so after
+	 * dropping the read lock but before acquiring the write lock.
+	 */
+	kstat_named_t dnode_hold_alloc_lock_misses;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_ALLOCATED) found
+	 * a free dnode instantiated by dnode_create() but not yet allocated
+	 * by dnode_allocate().
+	 */
+	kstat_named_t dnode_hold_alloc_type_none;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was able
+	 * to hold the requested range of free dnode slots.
+	 */
+	kstat_named_t dnode_hold_free_hits;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+	 * able to hold the requested range of free dnode slots because
+	 * at least one slot was allocated.
+	 */
+	kstat_named_t dnode_hold_free_misses;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) was not
+	 * able to hold the requested range of free dnode slots because
+	 * after acquiring the zrl lock at least one slot was allocated.
+	 */
+	kstat_named_t dnode_hold_free_lock_misses;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) needed
+	 * to retry acquiring slot zrl locks due to contention.
+	 */
+	kstat_named_t dnode_hold_free_lock_retry;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+	 * a range of dnode slots which were held by another thread.
+	 */
+	kstat_named_t dnode_hold_free_refcount;
+	/*
+	 * Number of times dnode_hold(..., DNODE_MUST_BE_FREE) requested
+	 * a range of dnode slots which would overflow the dnode_phys_t.
+	 */
+	kstat_named_t dnode_hold_free_overflow;
+	/*
+	 * Number of times dnode_free_interior_slots() needed to retry
+	 * acquiring a slot zrl lock due to contention.
+	 */
+	kstat_named_t dnode_free_interior_lock_retry;
+	/*
+	 * Number of new dnodes allocated by dnode_allocate().
+	 */
+	kstat_named_t dnode_allocate;
+	/*
+	 * Number of dnodes re-allocated by dnode_reallocate().
+	 */
+	kstat_named_t dnode_reallocate;
+	/*
+	 * Number of meta dnode dbufs evicted.
+	 */
+	kstat_named_t dnode_buf_evict;
+	/*
+	 * Number of times dmu_object_alloc*() reached the end of the existing
+	 * object ID chunk and advanced to a new one.
+	 */
+	kstat_named_t dnode_alloc_next_chunk;
+	/*
+	 * Number of times multiple threads attempted to allocate a dnode
+	 * from the same block of free dnodes.
+	 */
+	kstat_named_t dnode_alloc_race;
+	/*
+	 * Number of times dmu_object_alloc*() was forced to advance to the
+	 * next meta dnode dbuf due to an error from dmu_object_next().
+	 */
+	kstat_named_t dnode_alloc_next_block;
+	/*
+	 * Statistics for tracking dnodes which have been moved.
+ */ + kstat_named_t dnode_move_invalid; + kstat_named_t dnode_move_recheck1; + kstat_named_t dnode_move_recheck2; + kstat_named_t dnode_move_special; + kstat_named_t dnode_move_handle; + kstat_named_t dnode_move_rwlock; + kstat_named_t dnode_move_active; +} dnode_stats_t; + +extern dnode_stats_t dnode_stats; + +#define DNODE_STAT_INCR(stat, val) \ + atomic_add_64(&dnode_stats.stat.value.ui64, (val)); +#define DNODE_STAT_BUMP(stat) \ + DNODE_STAT_INCR(stat, 1); + +#ifdef ZFS_DEBUG + +#define dprintf_dnode(dn, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char __db_buf[32]; \ + uint64_t __db_obj = (dn)->dn_object; \ + if (__db_obj == DMU_META_DNODE_OBJECT) \ + (void) strcpy(__db_buf, "mdn"); \ + else \ + (void) snprintf(__db_buf, sizeof (__db_buf), "%lld", \ + (u_longlong_t)__db_obj);\ + dprintf_ds((dn)->dn_objset->os_dsl_dataset, "obj=%s " fmt, \ + __db_buf, __VA_ARGS__); \ + } \ +_NOTE(CONSTCOND) } while (0) + +#define DNODE_VERIFY(dn) dnode_verify(dn) +#define FREE_VERIFY(db, start, end, tx) free_verify(db, start, end, tx) + +#else + +#define dprintf_dnode(db, fmt, ...) +#define DNODE_VERIFY(dn) +#define FREE_VERIFY(db, start, end, tx) + +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DNODE_H */ diff --git a/include/sys/dsl_bookmark.h b/include/sys/dsl_bookmark.h new file mode 100644 index 000000000000..70f4813449aa --- /dev/null +++ b/include/sys/dsl_bookmark.h @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DSL_BOOKMARK_H +#define _SYS_DSL_BOOKMARK_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On disk zap object. 
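+ *
+ * Illustrative note (editorial addition, not upstream text): the entry's
+ * version is implied by its stored size rather than by an explicit field.
+ * A BOOKMARK_PHYS_SIZE_V1 entry (defined below) carries only the first
+ * three fields; a BOOKMARK_PHYS_SIZE_V2 entry carries all twelve words.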
+ */ +typedef struct zfs_bookmark_phys { + uint64_t zbm_guid; /* guid of bookmarked dataset */ + uint64_t zbm_creation_txg; /* birth transaction group */ + uint64_t zbm_creation_time; /* bookmark creation time */ + + /* fields used for redacted send / recv */ + uint64_t zbm_redaction_obj; /* redaction list object */ + uint64_t zbm_flags; /* ZBM_FLAG_* */ + + /* fields used for bookmark written size */ + uint64_t zbm_referenced_bytes_refd; + uint64_t zbm_compressed_bytes_refd; + uint64_t zbm_uncompressed_bytes_refd; + uint64_t zbm_referenced_freed_before_next_snap; + uint64_t zbm_compressed_freed_before_next_snap; + uint64_t zbm_uncompressed_freed_before_next_snap; + + /* fields used for raw sends */ + uint64_t zbm_ivset_guid; +} zfs_bookmark_phys_t; + + +#define BOOKMARK_PHYS_SIZE_V1 (3 * sizeof (uint64_t)) +#define BOOKMARK_PHYS_SIZE_V2 (12 * sizeof (uint64_t)) + +typedef enum zbm_flags { + ZBM_FLAG_HAS_FBN = (1 << 0), + ZBM_FLAG_SNAPSHOT_EXISTS = (1 << 1), +} zbm_flags_t; + +typedef struct redaction_list_phys { + uint64_t rlp_last_object; + uint64_t rlp_last_blkid; + uint64_t rlp_num_entries; + uint64_t rlp_num_snaps; + uint64_t rlp_snaps[]; /* variable length */ +} redaction_list_phys_t; + +typedef struct redaction_list { + dmu_buf_user_t rl_dbu; + redaction_list_phys_t *rl_phys; + dmu_buf_t *rl_dbuf; + uint64_t rl_object; + zfs_refcount_t rl_longholds; + objset_t *rl_mos; +} redaction_list_t; + +/* node in ds_bookmarks */ +typedef struct dsl_bookmark_node { + char *dbn_name; /* free with strfree() */ + kmutex_t dbn_lock; /* protects dirty/phys in block_killed */ + boolean_t dbn_dirty; /* in currently syncing txg */ + zfs_bookmark_phys_t dbn_phys; + avl_node_t dbn_node; +} dsl_bookmark_node_t; + +typedef struct redact_block_phys { + uint64_t rbp_object; + uint64_t rbp_blkid; + /* + * The top 16 bits of this field represent the block size in sectors of + * the blocks in question; the bottom 48 bits are used to store the + * number of consecutive blocks that are in the redaction list. They + * should be accessed using the inline functions below. 
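+ *
+ * As a hedged illustration (editorial addition, not upstream text), a
+ * direct decoding of the two fields would look like:
+ *
+ *	count = rbp_size_count & ((1ULL << 48) - 1);
+ *	size_in_sectors = rbp_size_count >> 48;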
+ */ + uint64_t rbp_size_count; + uint64_t rbp_padding; +} redact_block_phys_t; + +typedef int (*rl_traverse_callback_t)(redact_block_phys_t *, void *); + + +typedef struct dsl_bookmark_create_arg { + nvlist_t *dbca_bmarks; + nvlist_t *dbca_errors; +} dsl_bookmark_create_arg_t; + +typedef struct dsl_bookmark_create_redacted_arg { + const char *dbcra_bmark; + const char *dbcra_snap; + redaction_list_t **dbcra_rl; + uint64_t dbcra_numsnaps; + uint64_t *dbcra_snaps; + void *dbcra_tag; +} dsl_bookmark_create_redacted_arg_t; + +int dsl_bookmark_create(nvlist_t *, nvlist_t *); +int dsl_bookmark_create_nvl_validate(nvlist_t *); +int dsl_bookmark_create_check(void *arg, dmu_tx_t *tx); +void dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx); +int dsl_bookmark_create_redacted(const char *, const char *, uint64_t, + uint64_t *, void *, redaction_list_t **); +int dsl_get_bookmarks(const char *, nvlist_t *, nvlist_t *); +int dsl_get_bookmarks_impl(dsl_dataset_t *, nvlist_t *, nvlist_t *); +int dsl_get_bookmark_props(const char *, const char *, nvlist_t *); +int dsl_bookmark_destroy(nvlist_t *, nvlist_t *); +int dsl_bookmark_lookup(struct dsl_pool *, const char *, + struct dsl_dataset *, zfs_bookmark_phys_t *); +int dsl_bookmark_lookup_impl(dsl_dataset_t *, const char *, + zfs_bookmark_phys_t *); +int dsl_redaction_list_hold_obj(struct dsl_pool *, uint64_t, void *, + redaction_list_t **); +void dsl_redaction_list_rele(redaction_list_t *, void *); +void dsl_redaction_list_long_hold(struct dsl_pool *, redaction_list_t *, + void *); +void dsl_redaction_list_long_rele(redaction_list_t *, void *); +boolean_t dsl_redaction_list_long_held(redaction_list_t *); +int dsl_bookmark_init_ds(dsl_dataset_t *); +void dsl_bookmark_fini_ds(dsl_dataset_t *); +boolean_t dsl_bookmark_ds_destroyed(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_snapshotted(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_block_killed(dsl_dataset_t *, const blkptr_t *, dmu_tx_t *); +void dsl_bookmark_sync_done(dsl_dataset_t *, dmu_tx_t *); +void dsl_bookmark_node_add(dsl_dataset_t *, dsl_bookmark_node_t *, dmu_tx_t *); +uint64_t dsl_bookmark_latest_txg(dsl_dataset_t *); +int dsl_redaction_list_traverse(redaction_list_t *, zbookmark_phys_t *, + rl_traverse_callback_t, void *); +void dsl_bookmark_next_changed(dsl_dataset_t *, dsl_dataset_t *, dmu_tx_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_BOOKMARK_H */ diff --git a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h new file mode 100644 index 000000000000..835720c87872 --- /dev/null +++ b/include/sys/dsl_crypt.h @@ -0,0 +1,225 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#ifndef _SYS_DSL_CRYPT_H +#define _SYS_DSL_CRYPT_H + +#include +#include +#include +#include +#include + +/* + * ZAP entry keys for DSL Crypto Keys stored on disk. In addition, + * ZFS_PROP_KEYFORMAT, ZFS_PROP_PBKDF2_SALT, and ZFS_PROP_PBKDF2_ITERS are + * also maintained here using their respective property names. 
+ */ +#define DSL_CRYPTO_KEY_CRYPTO_SUITE "DSL_CRYPTO_SUITE" +#define DSL_CRYPTO_KEY_GUID "DSL_CRYPTO_GUID" +#define DSL_CRYPTO_KEY_IV "DSL_CRYPTO_IV" +#define DSL_CRYPTO_KEY_MAC "DSL_CRYPTO_MAC" +#define DSL_CRYPTO_KEY_MASTER_KEY "DSL_CRYPTO_MASTER_KEY_1" +#define DSL_CRYPTO_KEY_HMAC_KEY "DSL_CRYPTO_HMAC_KEY_1" +#define DSL_CRYPTO_KEY_ROOT_DDOBJ "DSL_CRYPTO_ROOT_DDOBJ" +#define DSL_CRYPTO_KEY_REFCOUNT "DSL_CRYPTO_REFCOUNT" +#define DSL_CRYPTO_KEY_VERSION "DSL_CRYPTO_VERSION" + +/* + * In-memory representation of a wrapping key. One of these structs will exist + * for each encryption root with its key loaded. + */ +typedef struct dsl_wrapping_key { + /* link on spa_keystore_t:sk_wkeys */ + avl_node_t wk_avl_link; + + /* keyformat property enum */ + zfs_keyformat_t wk_keyformat; + + /* the pbkdf2 salt, if the keyformat is of type passphrase */ + uint64_t wk_salt; + + /* the pbkdf2 iterations, if the keyformat is of type passphrase */ + uint64_t wk_iters; + + /* actual wrapping key */ + crypto_key_t wk_key; + + /* refcount of number of dsl_crypto_key_t's holding this struct */ + zfs_refcount_t wk_refcnt; + + /* dsl directory object that owns this wrapping key */ + uint64_t wk_ddobj; +} dsl_wrapping_key_t; + +/* enum of commands indicating special actions that should be run */ +typedef enum dcp_cmd { + /* key creation commands */ + DCP_CMD_NONE = 0, /* no specific command */ + DCP_CMD_RAW_RECV, /* raw receive */ + + /* key changing commands */ + DCP_CMD_NEW_KEY, /* rewrap key as an encryption root */ + DCP_CMD_INHERIT, /* rewrap key with parent's wrapping key */ + DCP_CMD_FORCE_NEW_KEY, /* change to encryption root without rewrap */ + DCP_CMD_FORCE_INHERIT, /* inherit parent's key without rewrap */ + + DCP_CMD_MAX +} dcp_cmd_t; + +/* + * This struct is a simple wrapper around all the parameters that are usually + * required to setup encryption. It exists so that all of the params can be + * passed around the kernel together for convenience. + */ +typedef struct dsl_crypto_params { + /* command indicating intended action */ + dcp_cmd_t cp_cmd; + + /* the encryption algorithm */ + enum zio_encrypt cp_crypt; + + /* keylocation property string */ + char *cp_keylocation; + + /* the wrapping key */ + dsl_wrapping_key_t *cp_wkey; +} dsl_crypto_params_t; + +/* + * In-memory representation of a DSL Crypto Key object. One of these structs + * (and corresponding on-disk ZAP object) will exist for each encrypted + * clone family that is mounted or otherwise reading protected data. + */ +typedef struct dsl_crypto_key { + /* link on spa_keystore_t:sk_dsl_keys */ + avl_node_t dck_avl_link; + + /* refcount of holders of this key */ + zfs_refcount_t dck_holds; + + /* master key used to derive encryption keys */ + zio_crypt_key_t dck_key; + + /* wrapping key for syncing this structure to disk */ + dsl_wrapping_key_t *dck_wkey; + + /* on-disk object id */ + uint64_t dck_obj; +} dsl_crypto_key_t; + +/* + * In-memory mapping of a dataset object id to a DSL Crypto Key. This is used + * to look up the corresponding dsl_crypto_key_t from the zio layer for + * performing data encryption and decryption. 
+ */ +typedef struct dsl_key_mapping { + /* link on spa_keystore_t:sk_key_mappings */ + avl_node_t km_avl_link; + + /* refcount of how many users are depending on this mapping */ + zfs_refcount_t km_refcnt; + + /* dataset this crypto key belongs to (index) */ + uint64_t km_dsobj; + + /* crypto key (value) of this record */ + dsl_crypto_key_t *km_key; +} dsl_key_mapping_t; + +/* in memory structure for holding all wrapping and dsl keys */ +typedef struct spa_keystore { + /* lock for protecting sk_dsl_keys */ + krwlock_t sk_dk_lock; + + /* tree of all dsl_crypto_key_t's */ + avl_tree_t sk_dsl_keys; + + /* lock for protecting sk_key_mappings */ + krwlock_t sk_km_lock; + + /* tree of all dsl_key_mapping_t's, indexed by dsobj */ + avl_tree_t sk_key_mappings; + + /* lock for protecting the wrapping keys tree */ + krwlock_t sk_wkeys_lock; + + /* tree of all dsl_wrapping_key_t's, indexed by ddobj */ + avl_tree_t sk_wkeys; +} spa_keystore_t; + +int dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, + nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out); +void dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload); +void dsl_dataset_crypt_stats(struct dsl_dataset *ds, nvlist_t *nv); +int dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation); +boolean_t dsl_dir_incompatible_encryption_version(dsl_dir_t *dd); + +void spa_keystore_init(spa_keystore_t *sk); +void spa_keystore_fini(spa_keystore_t *sk); + +void spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag); +int spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey); +int spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, + boolean_t noop); +int spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj); +int spa_keystore_unload_wkey(const char *dsname); + +int spa_keystore_create_mapping(spa_t *spa, struct dsl_dataset *ds, void *tag, + dsl_key_mapping_t **km_out); +int spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag); +void key_mapping_add_ref(dsl_key_mapping_t *km, void *tag); +void key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag); +int spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, + dsl_crypto_key_t **dck_out); + +int dsl_crypto_populate_key_nvlist(struct objset *os, + uint64_t from_ivset_guid, nvlist_t **nvl_out); +int dsl_crypto_recv_raw_key_check(struct dsl_dataset *ds, + nvlist_t *nvl, dmu_tx_t *tx); +void dsl_crypto_recv_raw_key_sync(struct dsl_dataset *ds, + nvlist_t *nvl, dmu_tx_t *tx); +int dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj, + dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key); + +int spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp); +int dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent); +int dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin); +void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, + dmu_tx_t *tx); +int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, + dsl_crypto_params_t *dcp, boolean_t *will_encrypt); +void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, + struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); +uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, + dmu_tx_t *tx); +uint64_t dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx); +void dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx); + +int spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt); +int 
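+/*
+ * Illustrative sketch (editorial addition, not upstream text): the usual
+ * consumer pattern for the keystore interfaces above is to look up a
+ * dataset's key by objset id and release it with the same tag. Error
+ * handling is elided and FTAG is assumed to be the customary per-function
+ * tag:
+ *
+ *	dsl_crypto_key_t *dck;
+ *	if (spa_keystore_lookup_key(spa, dsobj, FTAG, &dck) == 0) {
+ *		(use dck->dck_key for encryption or decryption)
+ *		spa_keystore_dsl_key_rele(spa, dck, FTAG);
+ *	}
+ */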
spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
+    abd_t *abd, uint_t datalen, uint8_t *mac);
+int spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj,
+    abd_t *abd, uint_t datalen, boolean_t byteswap);
+int spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb,
+    dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt,
+    uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
+    boolean_t *no_crypt);
+
+#endif
diff --git a/include/sys/dsl_dataset.h b/include/sys/dsl_dataset.h
new file mode 100644
index 000000000000..f5816a934c5f
--- /dev/null
+++ b/include/sys/dsl_dataset.h
@@ -0,0 +1,508 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DATASET_H
+#define _SYS_DSL_DATASET_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern int zfs_allow_redacted_dataset_mount;
+struct dsl_dataset;
+struct dsl_dir;
+struct dsl_pool;
+struct dsl_crypto_params;
+struct dsl_key_mapping;
+struct zfs_bookmark_phys;
+
+#define	DS_FLAG_INCONSISTENT	(1ULL<<0)
+#define	DS_IS_INCONSISTENT(ds)	\
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT)
+
+/*
+ * Do not allow this dataset to be promoted.
+ */
+#define	DS_FLAG_NOPROMOTE	(1ULL<<1)
+
+/*
+ * DS_FLAG_UNIQUE_ACCURATE is set if ds_unique_bytes has been correctly
+ * calculated for head datasets (starting with SPA_VERSION_UNIQUE_ACCURATE,
+ * refquota/refreservations).
+ */
+#define	DS_FLAG_UNIQUE_ACCURATE	(1ULL<<2)
+
+/*
+ * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called
+ * on a dataset. This allows the dataset to be destroyed using 'zfs release'.
+ */
+#define	DS_FLAG_DEFER_DESTROY	(1ULL<<3)
+#define	DS_IS_DEFER_DESTROY(ds)	\
+	(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_DEFER_DESTROY)
+
+/*
+ * DS_FIELD_* are strings that are used in the "extensified" dataset zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */
+
+/*
+ * This field's value is the object ID of a zap object which contains the
+ * bookmarks of this dataset. If it is present, then this dataset is counted
+ * in the refcount of the SPA_FEATURE_BOOKMARKS feature.
+ */
+#define	DS_FIELD_BOOKMARK_NAMES "com.delphix:bookmarks"
+
+/*
+ * This field is present (with value=0) if this dataset may contain large
+ * dnodes (>512B).
If it is present, then this dataset is counted in the + * refcount of the SPA_FEATURE_LARGE_DNODE feature. + */ +#define DS_FIELD_LARGE_DNODE "org.zfsonlinux:large_dnode" + +/* + * These fields are set on datasets that are in the middle of a resumable + * receive, and allow the sender to resume the send if it is interrupted. + */ +#define DS_FIELD_RESUME_FROMGUID "com.delphix:resume_fromguid" +#define DS_FIELD_RESUME_TONAME "com.delphix:resume_toname" +#define DS_FIELD_RESUME_TOGUID "com.delphix:resume_toguid" +#define DS_FIELD_RESUME_OBJECT "com.delphix:resume_object" +#define DS_FIELD_RESUME_OFFSET "com.delphix:resume_offset" +#define DS_FIELD_RESUME_BYTES "com.delphix:resume_bytes" +#define DS_FIELD_RESUME_LARGEBLOCK "com.delphix:resume_largeblockok" +#define DS_FIELD_RESUME_EMBEDOK "com.delphix:resume_embedok" +#define DS_FIELD_RESUME_COMPRESSOK "com.delphix:resume_compressok" +#define DS_FIELD_RESUME_RAWOK "com.datto:resume_rawok" + +/* + * This field is set to the object number of the remap deadlist if one exists. + */ +#define DS_FIELD_REMAP_DEADLIST "com.delphix:remap_deadlist" + +/* + * We were receiving an incremental from a redaction bookmark, and these are the + * guids of its snapshots. + */ +#define DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS \ + "com.delphix:resume_redact_book_snaps" + +/* + * This field is set to the ivset guid for encrypted snapshots. This is used + * for validating raw receives. + */ +#define DS_FIELD_IVSET_GUID "com.datto:ivset_guid" + +/* + * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose + * name lookups should be performed case-insensitively. + */ +#define DS_FLAG_CI_DATASET (1ULL<<16) + +#define DS_CREATE_FLAG_NODIRTY (1ULL<<24) + +typedef struct dsl_dataset_phys { + uint64_t ds_dir_obj; /* DMU_OT_DSL_DIR */ + uint64_t ds_prev_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_prev_snap_txg; + uint64_t ds_next_snap_obj; /* DMU_OT_DSL_DATASET */ + uint64_t ds_snapnames_zapobj; /* DMU_OT_DSL_DS_SNAP_MAP 0 for snaps */ + uint64_t ds_num_children; /* clone/snap children; ==0 for head */ + uint64_t ds_creation_time; /* seconds since 1970 */ + uint64_t ds_creation_txg; + uint64_t ds_deadlist_obj; /* DMU_OT_DEADLIST */ + /* + * ds_referenced_bytes, ds_compressed_bytes, and ds_uncompressed_bytes + * include all blocks referenced by this dataset, including those + * shared with any other datasets. + */ + uint64_t ds_referenced_bytes; + uint64_t ds_compressed_bytes; + uint64_t ds_uncompressed_bytes; + uint64_t ds_unique_bytes; /* only relevant to snapshots */ + /* + * The ds_fsid_guid is a 56-bit ID that can change to avoid + * collisions. The ds_guid is a 64-bit ID that will never + * change, so there is a small probability that it will collide. 
+	 */
+	uint64_t ds_fsid_guid;
+	uint64_t ds_guid;
+	uint64_t ds_flags;		/* DS_FLAG_* */
+	blkptr_t ds_bp;
+	uint64_t ds_next_clones_obj;	/* DMU_OT_DSL_CLONES */
+	uint64_t ds_props_obj;		/* DMU_OT_DSL_PROPS for snaps */
+	uint64_t ds_userrefs_obj;	/* DMU_OT_USERREFS */
+	uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */
+} dsl_dataset_phys_t;
+
+typedef struct dsl_dataset {
+	dmu_buf_user_t ds_dbu;
+	rrwlock_t ds_bp_rwlock; /* Protects ds_phys->ds_bp */
+
+	/* Immutable: */
+	struct dsl_dir *ds_dir;
+	dmu_buf_t *ds_dbuf;
+	uint64_t ds_object;
+	uint64_t ds_fsid_guid;
+	boolean_t ds_is_snapshot;
+	struct dsl_key_mapping *ds_key_mapping;
+
+	/* only used in syncing context, only valid for non-snapshots: */
+	struct dsl_dataset *ds_prev;
+	uint64_t ds_bookmarks_obj;	/* DMU_OTN_ZAP_METADATA */
+	avl_tree_t ds_bookmarks;	/* dsl_bookmark_node_t */
+
+	/* has internal locking: */
+	dsl_deadlist_t ds_deadlist;
+	bplist_t ds_pending_deadlist;
+
+	/*
+	 * The remap deadlist contains blocks (DVA's, really) that are
+	 * referenced by the previous snapshot and point to indirect vdevs,
+	 * but in this dataset they have been remapped to point to concrete
+	 * (or at least, less-indirect) vdevs. In other words, the
+	 * physical DVA is referenced by the previous snapshot but not by
+	 * this dataset. Logically, the DVA continues to be referenced,
+	 * but we are using a different (less indirect) physical DVA.
+	 * This deadlist is used to determine when physical DVAs that
+	 * point to indirect vdevs are no longer referenced anywhere,
+	 * and thus should be marked obsolete.
+	 *
+	 * This is only used if SPA_FEATURE_OBSOLETE_COUNTS is enabled.
+	 */
+	dsl_deadlist_t ds_remap_deadlist;
+	/* protects creation of the ds_remap_deadlist */
+	kmutex_t ds_remap_deadlist_lock;
+
+	/* protected by lock on pool's dp_dirty_datasets list */
+	txg_node_t ds_dirty_link;
+	list_node_t ds_synced_link;
+
+	/*
+	 * ds_phys->ds_<accounting> is also protected by ds_lock.
+	 * Protected by ds_lock:
+	 */
+	kmutex_t ds_lock;
+	objset_t *ds_objset;
+	uint64_t ds_userrefs;
+	void *ds_owner;
+
+	/*
+	 * Long holds prevent the ds from being destroyed; they allow the
+	 * ds to remain held even after dropping the dp_config_rwlock.
+	 * Owning counts as a long hold. See the comments above
+	 * dsl_pool_hold() for details.
+	 */
+	zfs_refcount_t ds_longholds;
+
+	/* no locking; only for making guesses */
+	uint64_t ds_trysnap_txg;
+
+	/* for objset_open() */
+	kmutex_t ds_opening_lock;
+
+	uint64_t ds_reserved;	/* cached refreservation */
+	uint64_t ds_quota;	/* cached refquota */
+
+	kmutex_t ds_sendstream_lock;
+	list_t ds_sendstreams;
+
+	/*
+	 * When in the middle of a resumable receive, tracks how much
+	 * progress we have made.
+	 */
+	uint64_t ds_resume_object[TXG_SIZE];
+	uint64_t ds_resume_offset[TXG_SIZE];
+	uint64_t ds_resume_bytes[TXG_SIZE];
+
+	/* Protected by our dsl_dir's dd_lock */
+	list_t ds_prop_cbs;
+
+	/*
+	 * For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
+	 * uses this feature.
+	 */
+	void *ds_feature[SPA_FEATURES];
+
+	/*
+	 * Set if we need to activate the feature on this dataset this txg
+	 * (used only in syncing context).
+ */ + void *ds_feature_activation[SPA_FEATURES]; + + /* Protected by ds_lock; keep at end of struct for better locality */ + char ds_snapname[ZFS_MAX_DATASET_NAME_LEN]; +} dsl_dataset_t; + +static inline dsl_dataset_phys_t * +dsl_dataset_phys(dsl_dataset_t *ds) +{ + return ((dsl_dataset_phys_t *)ds->ds_dbuf->db_data); +} + +typedef struct dsl_dataset_promote_arg { + const char *ddpa_clonename; + dsl_dataset_t *ddpa_clone; + list_t shared_snaps, origin_snaps, clone_snaps; + dsl_dataset_t *origin_origin; /* origin of the origin */ + uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; + nvlist_t *err_ds; + cred_t *cr; + proc_t *proc; +} dsl_dataset_promote_arg_t; + +typedef struct dsl_dataset_rollback_arg { + const char *ddra_fsname; + const char *ddra_tosnap; + void *ddra_owner; + nvlist_t *ddra_result; +} dsl_dataset_rollback_arg_t; + +typedef struct dsl_dataset_snapshot_arg { + nvlist_t *ddsa_snaps; + nvlist_t *ddsa_props; + nvlist_t *ddsa_errors; + cred_t *ddsa_cr; + proc_t *ddsa_proc; +} dsl_dataset_snapshot_arg_t; + +/* + * The max length of a temporary tag prefix is the number of hex digits + * required to express UINT64_MAX plus one for the hyphen. + */ +#define MAX_TAG_PREFIX_LEN 17 + +#define dsl_dataset_is_snapshot(ds) \ + (dsl_dataset_phys(ds)->ds_num_children != 0) + +#define DS_UNIQUE_IS_ACCURATE(ds) \ + ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_UNIQUE_ACCURATE) != 0) + +/* flags for holding the dataset */ +typedef enum ds_hold_flags { + DS_HOLD_FLAG_DECRYPT = 1 << 0 /* needs access to encrypted data */ +} ds_hold_flags_t; + +int dsl_dataset_hold(struct dsl_pool *dp, const char *name, void *tag, + dsl_dataset_t **dsp); +int dsl_dataset_hold_flags(struct dsl_pool *dp, const char *name, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +boolean_t dsl_dataset_try_add_ref(struct dsl_pool *dp, dsl_dataset_t *ds, + void *tag); +int dsl_dataset_create_key_mapping(dsl_dataset_t *ds); +int dsl_dataset_hold_obj_flags(struct dsl_pool *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **); +void dsl_dataset_remove_key_mapping(dsl_dataset_t *ds); +int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, + void *tag, dsl_dataset_t **); +void dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, + void *tag); +void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); +int dsl_dataset_own(struct dsl_pool *dp, const char *name, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_force(struct dsl_pool *dp, const char *name, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +int dsl_dataset_own_obj_force(struct dsl_pool *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp); +void dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag); +void dsl_dataset_name(dsl_dataset_t *ds, char *name); +boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override); +int dsl_dataset_namelen(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_owner(dsl_dataset_t *ds); +uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *, + struct dsl_crypto_params *, dmu_tx_t *); +uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + struct dsl_crypto_params *dcp, uint64_t flags, dmu_tx_t *tx); +void dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx); +int 
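+/*
+ * Illustrative sketch (editorial addition, not upstream text): holds are
+ * tag-matched, so a dataset held with one tag must be released with the
+ * same tag. A minimal lookup, assuming the pool is already held and FTAG
+ * is the customary per-function tag:
+ *
+ *	dsl_dataset_t *ds;
+ *	if (dsl_dataset_hold(dp, "pool/fs", FTAG, &ds) == 0) {
+ *		(inspect the dataset, e.g. via dsl_dataset_phys(ds))
+ *		dsl_dataset_rele(ds, FTAG);
+ *	}
+ */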
dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx); +int dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors); +void dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx); +int dsl_dataset_promote_check(void *arg, dmu_tx_t *tx); +int dsl_dataset_promote(const char *name, char *conflsnap); +int dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive); +int dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag); + +blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); + +spa_t *dsl_dataset_get_spa(dsl_dataset_t *ds); + +boolean_t dsl_dataset_modified_since_snap(dsl_dataset_t *ds, + dsl_dataset_t *snap); + +void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx); +void dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx); + +void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx); +int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx, boolean_t async); +void dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, + uint64_t offset, uint64_t size, uint64_t birth, dmu_tx_t *tx); +int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, + uint64_t *value); + +void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); + +int get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val); +char *get_receive_resume_stats_impl(dsl_dataset_t *ds); +char *get_child_receive_stats(dsl_dataset_t *ds); +uint64_t dsl_get_refratio(dsl_dataset_t *ds); +uint64_t dsl_get_logicalreferenced(dsl_dataset_t *ds); +uint64_t dsl_get_compressratio(dsl_dataset_t *ds); +uint64_t dsl_get_used(dsl_dataset_t *ds); +uint64_t dsl_get_creation(dsl_dataset_t *ds); +uint64_t dsl_get_creationtxg(dsl_dataset_t *ds); +uint64_t dsl_get_refquota(dsl_dataset_t *ds); +uint64_t dsl_get_refreservation(dsl_dataset_t *ds); +uint64_t dsl_get_guid(dsl_dataset_t *ds); +uint64_t dsl_get_unique(dsl_dataset_t *ds); +uint64_t dsl_get_objsetid(dsl_dataset_t *ds); +uint64_t dsl_get_userrefs(dsl_dataset_t *ds); +uint64_t dsl_get_defer_destroy(dsl_dataset_t *ds); +uint64_t dsl_get_referenced(dsl_dataset_t *ds); +uint64_t dsl_get_numclones(dsl_dataset_t *ds); +uint64_t dsl_get_inconsistent(dsl_dataset_t *ds); +uint64_t dsl_get_redacted(dsl_dataset_t *ds); +uint64_t dsl_get_available(dsl_dataset_t *ds); +int dsl_get_written(dsl_dataset_t *ds, uint64_t *written); +int dsl_get_prev_snap(dsl_dataset_t *ds, char *snap); +void dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval); +int dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value, + char *source); + +void get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv); +void dsl_dataset_stats(dsl_dataset_t *os, nvlist_t *nv); + +void dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat); +void dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp); +uint64_t dsl_dataset_fsid_guid(dsl_dataset_t *ds); +int dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *newds, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int dsl_dataset_space_written_bookmark(struct zfs_bookmark_phys *bmp, + dsl_dataset_t *newds, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +int dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, dsl_dataset_t *last, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); + +int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); + +int 
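+/*
+ * Illustrative sketch (editorial addition, not upstream text): the
+ * dsl_get_*() accessors above each return one value from a held dataset,
+ * while dsl_dataset_space() fills several counters in one call:
+ *
+ *	uint64_t refd, avail, usedobjs, availobjs;
+ *	dsl_dataset_space(ds, &refd, &avail, &usedobjs, &availobjs);
+ */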
dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, + uint64_t *ref_rsrv); +int dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t quota); +int dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t reservation); +int dsl_dataset_set_compression(const char *dsname, zprop_source_t source, + uint64_t compression); + +boolean_t dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, + uint64_t earlier_txg); +void dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag); +void dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag); +boolean_t dsl_dataset_long_held(dsl_dataset_t *ds); + +int dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx); +void dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx); +int dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc); +void dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx); + +void dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx); +void dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds); +int dsl_dataset_get_snapname(dsl_dataset_t *ds); +int dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, + uint64_t *value); +int dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt); +void dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx); +void dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_is_zapified(dsl_dataset_t *ds); +boolean_t dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds); + +int dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx); +void dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx); +int dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner, + nvlist_t *result); + +uint64_t dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds); +void dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); +boolean_t dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds); +void dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx); + +void dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, + dmu_tx_t *tx); +void dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, + dmu_tx_t *tx); +boolean_t dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f); +boolean_t dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, + spa_feature_t f, uint64_t *outlength, uint64_t **outp); + +void dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, + uint64_t num_redact_snaps, dmu_tx_t *tx); + +#ifdef ZFS_DEBUG +#define dprintf_ds(ds, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ + dsl_dataset_name(ds, __ds_name); \ + dprintf("ds=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_ds(dd, fmt, ...) 
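+/*
+ * Illustrative usage (editorial addition, not upstream text): as written,
+ * dprintf_ds() needs at least one argument after the format string, e.g.
+ *
+ *	dprintf_ds(ds, "txg=%llu", (u_longlong_t)tx->tx_txg);
+ *
+ * and it compiles to nothing unless ZFS_DEBUG is defined and
+ * ZFS_DEBUG_DPRINTF is set in zfs_flags.
+ */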
+#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DATASET_H */ diff --git a/include/sys/dsl_deadlist.h b/include/sys/dsl_deadlist.h new file mode 100644 index 000000000000..64358bb5fc0b --- /dev/null +++ b/include/sys/dsl_deadlist.h @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DSL_DEADLIST_H +#define _SYS_DSL_DEADLIST_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dmu_buf; +struct dsl_pool; +struct dsl_dataset; + +typedef struct dsl_deadlist_phys { + uint64_t dl_used; + uint64_t dl_comp; + uint64_t dl_uncomp; + uint64_t dl_pad[37]; /* pad out to 320b for future expansion */ +} dsl_deadlist_phys_t; + +typedef struct dsl_deadlist { + objset_t *dl_os; + uint64_t dl_object; + avl_tree_t dl_tree; /* contains dsl_deadlist_entry_t */ + avl_tree_t dl_cache; /* contains dsl_deadlist_cache_entry_t */ + boolean_t dl_havetree; + boolean_t dl_havecache; + struct dmu_buf *dl_dbuf; + dsl_deadlist_phys_t *dl_phys; + kmutex_t dl_lock; + + /* if it's the old on-disk format: */ + bpobj_t dl_bpobj; + boolean_t dl_oldfmt; +} dsl_deadlist_t; + +typedef struct dsl_deadlist_cache_entry { + avl_node_t dlce_node; + uint64_t dlce_mintxg; + uint64_t dlce_bpobj; + uint64_t dlce_bytes; + uint64_t dlce_comp; + uint64_t dlce_uncomp; +} dsl_deadlist_cache_entry_t; + +typedef struct dsl_deadlist_entry { + avl_node_t dle_node; + uint64_t dle_mintxg; + bpobj_t dle_bpobj; +} dsl_deadlist_entry_t; + +typedef struct livelist_condense_entry { + struct dsl_dataset *ds; + dsl_deadlist_entry_t *first; + dsl_deadlist_entry_t *next; + boolean_t syncing; + boolean_t cancelled; +} livelist_condense_entry_t; + +extern unsigned long zfs_livelist_max_entries; +extern int zfs_livelist_min_percent_shared; + +typedef int deadlist_iter_t(void *args, dsl_deadlist_entry_t *dle); + +void dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object); +void dsl_deadlist_close(dsl_deadlist_t *dl); +void dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *arg); +uint64_t dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx); +void dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx); +void dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, + boolean_t free, dmu_tx_t *tx); +int dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +int dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +void dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx); +void 
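+/*
+ * Illustrative sketch (editorial addition, not upstream text): deadlist
+ * iteration is callback-driven. A hypothetical counting callback matching
+ * the deadlist_iter_t signature above, assuming a zero return means
+ * "continue":
+ *
+ *	static int
+ *	count_entries_cb(void *arg, dsl_deadlist_entry_t *dle)
+ *	{
+ *		(*(uint64_t *)arg)++;
+ *		return (0);
+ *	}
+ *
+ *	uint64_t n = 0;
+ *	dsl_deadlist_iterate(dl, count_entries_cb, &n);
+ */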
dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, +dmu_tx_t *tx); +dsl_deadlist_entry_t *dsl_deadlist_first(dsl_deadlist_t *dl); +dsl_deadlist_entry_t *dsl_deadlist_last(dsl_deadlist_t *dl); +uint64_t dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg, + uint64_t mrs_obj, dmu_tx_t *tx); +void dsl_deadlist_space(dsl_deadlist_t *dl, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_space_range(dsl_deadlist_t *dl, + uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); +void dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx); +void dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg, + dmu_tx_t *tx); +boolean_t dsl_deadlist_is_open(dsl_deadlist_t *dl); +int dsl_process_sub_livelist(bpobj_t *bpobj, struct bplist *to_free, + zthr_t *t, uint64_t *size); +void dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx); +void dsl_deadlist_discard_tree(dsl_deadlist_t *dl); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DEADLIST_H */ diff --git a/include/sys/dsl_deleg.h b/include/sys/dsl_deleg.h new file mode 100644 index 000000000000..7f46233a889b --- /dev/null +++ b/include/sys/dsl_deleg.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_DSL_DELEG_H +#define _SYS_DSL_DELEG_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_PERM_NONE "" +#define ZFS_DELEG_PERM_CREATE "create" +#define ZFS_DELEG_PERM_DESTROY "destroy" +#define ZFS_DELEG_PERM_SNAPSHOT "snapshot" +#define ZFS_DELEG_PERM_ROLLBACK "rollback" +#define ZFS_DELEG_PERM_CLONE "clone" +#define ZFS_DELEG_PERM_PROMOTE "promote" +#define ZFS_DELEG_PERM_RENAME "rename" +#define ZFS_DELEG_PERM_MOUNT "mount" +#define ZFS_DELEG_PERM_SHARE "share" +#define ZFS_DELEG_PERM_SEND "send" +#define ZFS_DELEG_PERM_RECEIVE "receive" +#define ZFS_DELEG_PERM_ALLOW "allow" +#define ZFS_DELEG_PERM_USERPROP "userprop" +#define ZFS_DELEG_PERM_VSCAN "vscan" +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USEROBJQUOTA "userobjquota" +#define ZFS_DELEG_PERM_GROUPOBJQUOTA "groupobjquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_USEROBJUSED "userobjused" +#define ZFS_DELEG_PERM_GROUPOBJUSED "groupobjused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" +#define ZFS_DELEG_PERM_DIFF "diff" +#define ZFS_DELEG_PERM_BOOKMARK "bookmark" +#define ZFS_DELEG_PERM_LOAD_KEY "load-key" +#define ZFS_DELEG_PERM_CHANGE_KEY "change-key" +#define ZFS_DELEG_PERM_PROJECTUSED "projectused" +#define ZFS_DELEG_PERM_PROJECTQUOTA "projectquota" +#define ZFS_DELEG_PERM_PROJECTOBJUSED "projectobjused" +#define ZFS_DELEG_PERM_PROJECTOBJQUOTA "projectobjquota" + +/* + * Note: the names of properties that are marked delegatable are also + * valid delegated permissions + */ + +int dsl_deleg_get(const char *ddname, nvlist_t **nvp); +int dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset); +int dsl_deleg_access(const char *ddname, const char *perm, cred_t *cr); +int dsl_deleg_access_impl(struct dsl_dataset *ds, const char *perm, cred_t *cr); +void dsl_deleg_set_create_perms(dsl_dir_t *dd, dmu_tx_t *tx, cred_t *cr); +int dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr); +int dsl_deleg_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx); +boolean_t dsl_delegation_on(objset_t *os); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DELEG_H */ diff --git a/include/sys/dsl_destroy.h b/include/sys/dsl_destroy.h new file mode 100644 index 000000000000..208d75bacffa --- /dev/null +++ b/include/sys/dsl_destroy.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ * Copyright (c) 2012, Joyent, Inc. All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DESTROY_H
+#define _SYS_DSL_DESTROY_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct nvlist;
+struct dsl_dataset;
+struct dsl_pool;
+struct dmu_tx;
+
+int dsl_destroy_snapshots_nvl(struct nvlist *, boolean_t,
+    struct nvlist *);
+int dsl_destroy_snapshot(const char *, boolean_t);
+int dsl_destroy_head(const char *);
+int dsl_destroy_head_check_impl(struct dsl_dataset *, int);
+void dsl_destroy_head_sync_impl(struct dsl_dataset *, struct dmu_tx *);
+int dsl_destroy_inconsistent(const char *, void *);
+int dsl_destroy_snapshot_check_impl(struct dsl_dataset *, boolean_t);
+void dsl_destroy_snapshot_sync_impl(struct dsl_dataset *,
+    boolean_t, struct dmu_tx *);
+void dsl_dir_remove_clones_key(dsl_dir_t *, uint64_t, dmu_tx_t *);
+
+typedef struct dsl_destroy_snapshot_arg {
+	const char *ddsa_name;
+	boolean_t ddsa_defer;
+} dsl_destroy_snapshot_arg_t;
+
+int dsl_destroy_snapshot_check(void *, dmu_tx_t *);
+void dsl_destroy_snapshot_sync(void *, dmu_tx_t *);
+
+typedef struct dsl_destroy_head_arg {
+	const char *ddha_name;
+} dsl_destroy_head_arg_t;
+
+int dsl_destroy_head_check(void *, dmu_tx_t *);
+void dsl_destroy_head_sync(void *, dmu_tx_t *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_DSL_DESTROY_H */
diff --git a/include/sys/dsl_dir.h b/include/sys/dsl_dir.h
new file mode 100644
index 000000000000..7cf5093c2c30
--- /dev/null
+++ b/include/sys/dsl_dir.h
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#ifndef _SYS_DSL_DIR_H
+#define _SYS_DSL_DIR_H
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct dsl_dataset;
+struct zthr;
+/*
+ * DD_FIELD_* are strings that are used in the "extensified" dsl_dir zap object.
+ * They should be of the format <reverse-dns>:<field>.
+ */ + +#define DD_FIELD_FILESYSTEM_COUNT "com.joyent:filesystem_count" +#define DD_FIELD_SNAPSHOT_COUNT "com.joyent:snapshot_count" +#define DD_FIELD_CRYPTO_KEY_OBJ "com.datto:crypto_key_obj" +#define DD_FIELD_LIVELIST "com.delphix:livelist" + +typedef enum dd_used { + DD_USED_HEAD, + DD_USED_SNAP, + DD_USED_CHILD, + DD_USED_CHILD_RSRV, + DD_USED_REFRSRV, + DD_USED_NUM +} dd_used_t; + +#define DD_FLAG_USED_BREAKDOWN (1<<0) + +typedef struct dsl_dir_phys { + uint64_t dd_creation_time; /* not actually used */ + uint64_t dd_head_dataset_obj; + uint64_t dd_parent_obj; + uint64_t dd_origin_obj; + uint64_t dd_child_dir_zapobj; + /* + * how much space our children are accounting for; for leaf + * datasets, == physical space used by fs + snaps + */ + uint64_t dd_used_bytes; + uint64_t dd_compressed_bytes; + uint64_t dd_uncompressed_bytes; + /* Administrative quota setting */ + uint64_t dd_quota; + /* Administrative reservation setting */ + uint64_t dd_reserved; + uint64_t dd_props_zapobj; + uint64_t dd_deleg_zapobj; /* dataset delegation permissions */ + uint64_t dd_flags; + uint64_t dd_used_breakdown[DD_USED_NUM]; + uint64_t dd_clones; /* dsl_dir objects */ + uint64_t dd_pad[13]; /* pad out to 256 bytes for good measure */ +} dsl_dir_phys_t; + +struct dsl_dir { + dmu_buf_user_t dd_dbu; + + /* These are immutable; no lock needed: */ + uint64_t dd_object; + uint64_t dd_crypto_obj; + dsl_pool_t *dd_pool; + + /* Stable until user eviction; no lock needed: */ + dmu_buf_t *dd_dbuf; + + /* protected by lock on pool's dp_dirty_dirs list */ + txg_node_t dd_dirty_link; + + /* protected by dp_config_rwlock */ + dsl_dir_t *dd_parent; + + /* Protected by dd_lock */ + kmutex_t dd_lock; + list_t dd_props; /* list of dsl_prop_record_t's */ + inode_timespec_t dd_snap_cmtime; /* last snapshot namespace change */ + uint64_t dd_origin_txg; + + /* gross estimate of space used by in-flight tx's */ + uint64_t dd_tempreserved[TXG_SIZE]; + /* amount of space we expect to write; == amount of dirty data */ + int64_t dd_space_towrite[TXG_SIZE]; + + dsl_deadlist_t dd_livelist; + bplist_t dd_pending_frees; + bplist_t dd_pending_allocs; + + kmutex_t dd_activity_lock; + kcondvar_t dd_activity_cv; + boolean_t dd_activity_cancelled; + uint64_t dd_activity_waiters; + + /* protected by dd_lock; keep at end of struct for better locality */ + char dd_myname[ZFS_MAX_DATASET_NAME_LEN]; +}; + +static inline dsl_dir_phys_t * +dsl_dir_phys(dsl_dir_t *dd) +{ + return (dd->dd_dbuf->db_data); +} + +void dsl_dir_rele(dsl_dir_t *dd, void *tag); +void dsl_dir_async_rele(dsl_dir_t *dd, void *tag); +int dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dir_t **, const char **tail); +int dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **); +void dsl_dir_name(dsl_dir_t *dd, char *buf); +int dsl_dir_namelen(dsl_dir_t *dd); +uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, + const char *name, dmu_tx_t *tx); + +uint64_t dsl_dir_get_used(dsl_dir_t *dd); +uint64_t dsl_dir_get_compressed(dsl_dir_t *dd); +uint64_t dsl_dir_get_quota(dsl_dir_t *dd); +uint64_t dsl_dir_get_reservation(dsl_dir_t *dd); +uint64_t dsl_dir_get_compressratio(dsl_dir_t *dd); +uint64_t dsl_dir_get_logicalused(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedsnap(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedds(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedrefreserv(dsl_dir_t *dd); +uint64_t dsl_dir_get_usedchild(dsl_dir_t *dd); +void dsl_dir_get_origin(dsl_dir_t *dd, char *buf); +int dsl_dir_get_filesystem_count(dsl_dir_t *dd, 
uint64_t *count); +int dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count); + +void dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv); +uint64_t dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly); +void dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx); +void dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx); +int dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t mem, + uint64_t asize, boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx); +void dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx); +void dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx); +void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); +void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); +int dsl_dir_set_quota(const char *ddname, zprop_source_t source, + uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation); +int dsl_dir_activate_fs_ss_limit(const char *); +int dsl_fs_ss_limit_check(dsl_dir_t *, uint64_t, zfs_prop_t, dsl_dir_t *, + cred_t *, proc_t *); +void dsl_fs_ss_count_adjust(dsl_dir_t *, int64_t, const char *, dmu_tx_t *); +int dsl_dir_rename(const char *oldname, const char *newname); +int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, cred_t *, proc_t *); +boolean_t dsl_dir_is_clone(dsl_dir_t *dd); +void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, + uint64_t reservation, cred_t *cr, dmu_tx_t *tx); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +inode_timespec_t dsl_dir_snap_cmtime(dsl_dir_t *dd); +void dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, + dmu_tx_t *tx); +void dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx); +boolean_t dsl_dir_is_zapified(dsl_dir_t *dd); +void dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj); +void dsl_dir_livelist_close(dsl_dir_t *dd); +void dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total); +int dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, + boolean_t *waited); +void dsl_dir_cancel_waiters(dsl_dir_t *dd); + +/* internal reserved dir name */ +#define MOS_DIR_NAME "$MOS" +#define ORIGIN_DIR_NAME "$ORIGIN" +#define FREE_DIR_NAME "$FREE" +#define LEAK_DIR_NAME "$LEAK" + +#ifdef ZFS_DEBUG +#define dprintf_dd(dd, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__ds_name = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); \ + dsl_dir_name(dd, __ds_name); \ + dprintf("dd=%s " fmt, __ds_name, __VA_ARGS__); \ + kmem_free(__ds_name, ZFS_MAX_DATASET_NAME_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_dd(dd, fmt, ...) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_DIR_H */ diff --git a/include/sys/dsl_pool.h b/include/sys/dsl_pool.h new file mode 100644 index 000000000000..8249bb8fc633 --- /dev/null +++ b/include/sys/dsl_pool.h @@ -0,0 +1,199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _SYS_DSL_POOL_H +#define _SYS_DSL_POOL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int zfs_txg_synctime_ms; + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; +struct dsl_scan; +struct dsl_crypto_params; +struct dsl_deadlist; + +extern unsigned long zfs_dirty_data_max; +extern unsigned long zfs_dirty_data_max_max; +extern int zfs_dirty_data_sync_percent; +extern int zfs_dirty_data_max_percent; +extern int zfs_dirty_data_max_max_percent; +extern int zfs_delay_min_dirty_percent; +extern unsigned long zfs_delay_scale; + +/* These macros are for indexing into the zfs_all_blkstats_t. */ +#define DMU_OT_DEFERRED DMU_OT_NONE +#define DMU_OT_OTHER DMU_OT_NUMTYPES /* place holder for DMU_OT() types */ +#define DMU_OT_TOTAL (DMU_OT_NUMTYPES + 1) + +typedef struct zfs_blkstat { + uint64_t zb_count; + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_gangs; + uint64_t zb_ditto_2_of_2_samevdev; + uint64_t zb_ditto_2_of_3_samevdev; + uint64_t zb_ditto_3_of_3_samevdev; +} zfs_blkstat_t; + +typedef struct zfs_all_blkstats { + zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; + kmutex_t zab_lock; +} zfs_all_blkstats_t; + + +typedef struct dsl_pool { + /* Immutable */ + spa_t *dp_spa; + struct objset *dp_meta_objset; + struct dsl_dir *dp_root_dir; + struct dsl_dir *dp_mos_dir; + struct dsl_dir *dp_free_dir; + struct dsl_dir *dp_leak_dir; + struct dsl_dataset *dp_origin_snap; + uint64_t dp_root_dir_obj; + struct taskq *dp_zrele_taskq; + struct taskq *dp_unlinked_drain_taskq; + + /* No lock needed - sync context only */ + blkptr_t dp_meta_rootbp; + uint64_t dp_tmp_userrefs_obj; + bpobj_t dp_free_bpobj; + uint64_t dp_bptree_obj; + uint64_t dp_empty_bpobj; + bpobj_t dp_obsolete_bpobj; + + struct dsl_scan *dp_scan; + + /* Uses dp_lock */ + kmutex_t dp_lock; + kcondvar_t dp_spaceavail_cv; + uint64_t dp_dirty_pertxg[TXG_SIZE]; + uint64_t dp_dirty_total; + uint64_t dp_long_free_dirty_pertxg[TXG_SIZE]; + uint64_t dp_mos_used_delta; + uint64_t dp_mos_compressed_delta; + uint64_t dp_mos_uncompressed_delta; + + /* + * Time of most recently scheduled (furthest in the future) + * wakeup for delayed transactions. + */ + hrtime_t dp_last_wakeup; + + /* Has its own locking */ + tx_state_t dp_tx; + txg_list_t dp_dirty_datasets; + txg_list_t dp_dirty_zilogs; + txg_list_t dp_dirty_dirs; + txg_list_t dp_sync_tasks; + txg_list_t dp_early_sync_tasks; + taskq_t *dp_sync_taskq; + taskq_t *dp_zil_clean_taskq; + + /* + * Protects administrative changes (properties, namespace) + * + * It is only held for write in syncing context. Therefore + * syncing context does not need to ever have it for read, since + * nobody else could possibly have it for write. 
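+ *
+ * An illustrative usage sketch, assuming the FTAG caller-tag
+ * convention used throughout this codebase: open-context readers
+ * bracket namespace access with the dsl_pool_config_enter() /
+ * dsl_pool_config_exit() wrappers declared below:
+ *
+ *	dsl_pool_config_enter(dp, FTAG);
+ *	... look up or traverse datasets under dp ...
+ *	dsl_pool_config_exit(dp, FTAG);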
+ */ + rrwlock_t dp_config_rwlock; + + zfs_all_blkstats_t *dp_blkstats; +} dsl_pool_t; + +int dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); +int dsl_pool_open(dsl_pool_t *dp); +void dsl_pool_close(dsl_pool_t *dp); +dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, + struct dsl_crypto_params *dcp, uint64_t txg); +void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); +void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); +int dsl_pool_sync_context(dsl_pool_t *dp); +uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy); +uint64_t dsl_pool_unreserved_space(dsl_pool_t *dp, + zfs_space_check_t slop_policy); +void dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); +void dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg); +void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); +void dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, + const blkptr_t *bpp); +void dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); +void dsl_pool_ckpoint_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp); +boolean_t dsl_pool_need_dirty_delay(dsl_pool_t *dp); +void dsl_pool_config_enter(dsl_pool_t *dp, void *tag); +void dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag); +void dsl_pool_config_exit(dsl_pool_t *dp, void *tag); +boolean_t dsl_pool_config_held(dsl_pool_t *dp); +boolean_t dsl_pool_config_held_writer(dsl_pool_t *dp); + +taskq_t *dsl_pool_zrele_taskq(dsl_pool_t *dp); +taskq_t *dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp); + +int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t now, dmu_tx_t *tx); +int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, dmu_tx_t *tx); +void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); +int dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **); +int dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp); +void dsl_pool_rele(dsl_pool_t *dp, void *tag); + +void dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); +void dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_POOL_H */ diff --git a/include/sys/dsl_prop.h b/include/sys/dsl_prop.h new file mode 100644 index 000000000000..fba8f908dc9e --- /dev/null +++ b/include/sys/dsl_prop.h @@ -0,0 +1,126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +#ifndef _SYS_DSL_PROP_H +#define _SYS_DSL_PROP_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_dataset; +struct dsl_dir; + +/* The callback func may not call into the DMU or DSL! */ +typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval); + +typedef struct dsl_prop_record { + list_node_t pr_node; /* link on dd_props */ + const char *pr_propname; + list_t pr_cbs; +} dsl_prop_record_t; + +typedef struct dsl_prop_cb_record { + list_node_t cbr_pr_node; /* link on pr_cbs */ + list_node_t cbr_ds_node; /* link on ds_prop_cbs */ + dsl_prop_record_t *cbr_pr; + struct dsl_dataset *cbr_ds; + dsl_prop_changed_cb_t *cbr_func; + void *cbr_arg; +} dsl_prop_cb_record_t; + +typedef struct dsl_props_arg { + nvlist_t *pa_props; + zprop_source_t pa_source; +} dsl_props_arg_t; + +typedef struct dsl_props_set_arg { + const char *dpsa_dsname; + zprop_source_t dpsa_source; + nvlist_t *dpsa_props; +} dsl_props_set_arg_t; + +void dsl_prop_init(dsl_dir_t *dd); +void dsl_prop_fini(dsl_dir_t *dd); +int dsl_prop_register(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg); +void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg); +void dsl_prop_notify_all(struct dsl_dir *dd); +boolean_t dsl_prop_hascb(struct dsl_dataset *ds); + +int dsl_prop_get(const char *ddname, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(const char *dsname, nvlist_t **nvp); +int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint); +int dsl_prop_get_int_ds(struct dsl_dataset *ds, const char *propname, + uint64_t *valuep); +int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint, + boolean_t snapshot); + +int dsl_props_set_check(void *arg, dmu_tx_t *tx); +void dsl_props_set_sync(void *arg, dmu_tx_t *tx); +void dsl_props_set_sync_impl(struct dsl_dataset *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx); +void dsl_prop_set_sync_impl(struct dsl_dataset *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx); +int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); +int dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value); +int dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value); +int dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source); + +int dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp); + +/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ +boolean_t dsl_prop_get_hasrecvd(const char *dsname); +int dsl_prop_set_hasrecvd(const char *dsname); +void dsl_prop_unset_hasrecvd(const char *dsname); + +void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); +void dsl_prop_nvlist_add_string(nvlist_t *nv, + zfs_prop_t prop, const char *value); + +#ifdef __cplusplus +} +#endif + +#endif /* 
_SYS_DSL_PROP_H */ diff --git a/include/sys/dsl_scan.h b/include/sys/dsl_scan.h new file mode 100644 index 000000000000..8f929207d2d7 --- /dev/null +++ b/include/sys/dsl_scan.h @@ -0,0 +1,193 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. + */ + +#ifndef _SYS_DSL_SCAN_H +#define _SYS_DSL_SCAN_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct objset; +struct dsl_dir; +struct dsl_dataset; +struct dsl_pool; +struct dmu_tx; + +extern int zfs_scan_suspend_progress; + +/* + * All members of this structure must be uint64_t, for byteswap + * purposes. + */ +typedef struct dsl_scan_phys { + uint64_t scn_func; /* pool_scan_func_t */ + uint64_t scn_state; /* dsl_scan_state_t */ + uint64_t scn_queue_obj; + uint64_t scn_min_txg; + uint64_t scn_max_txg; + uint64_t scn_cur_min_txg; + uint64_t scn_cur_max_txg; + uint64_t scn_start_time; + uint64_t scn_end_time; + uint64_t scn_to_examine; /* total bytes to be scanned */ + uint64_t scn_examined; /* bytes scanned so far */ + uint64_t scn_to_process; + uint64_t scn_processed; + uint64_t scn_errors; /* scan I/O error count */ + uint64_t scn_ddt_class_max; + ddt_bookmark_t scn_ddt_bookmark; + zbookmark_phys_t scn_bookmark; + uint64_t scn_flags; /* dsl_scan_flags_t */ +} dsl_scan_phys_t; + +#define SCAN_PHYS_NUMINTS (sizeof (dsl_scan_phys_t) / sizeof (uint64_t)) + +typedef enum dsl_scan_flags { + DSF_VISIT_DS_AGAIN = 1<<0, + DSF_SCRUB_PAUSED = 1<<1, +} dsl_scan_flags_t; + +#define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) + +/* + * Every pool will have one dsl_scan_t and this structure will contain + * in-memory information about the scan and a pointer to the on-disk + * representation (i.e. dsl_scan_phys_t). Most of the state of the scan + * is contained on-disk to allow the scan to resume in the event of a reboot + * or panic. This structure maintains information about the behavior of a + * running scan, some caching information, and how it should traverse the pool. + * + * The following members of this structure direct the behavior of the scan: + * + * scn_suspending - a scan that cannot be completed in a single txg or + * has exceeded its allotted time will need to suspend. + * When this flag is set, the scanner will stop traversing + * the pool and write out the current state to disk. + * + * scn_restart_txg - directs the scanner to either restart or start + * a scan at the specified txg value. + * + * scn_done_txg - when a scan completes its traversal, it will set 
This is necessary + * to ensure that any blocks that were freed during + * the scan but have not yet been processed (i.e. deferred + * frees) are accounted for. + * + * This structure also maintains information about deferred frees, which are + * a special kind of traversal. Deferred frees can exist in either a bptree or + * a bpobj structure. The scn_is_bptree flag will indicate the type of + * deferred free that is in progress. If the deferred free is part of an + * asynchronous destroy, then the scn_async_destroying flag will be set. + */ +typedef struct dsl_scan { + struct dsl_pool *scn_dp; + uint64_t scn_restart_txg; + uint64_t scn_done_txg; + uint64_t scn_sync_start_time; + uint64_t scn_issued_before_pass; + + /* for freeing blocks */ + boolean_t scn_is_bptree; + boolean_t scn_async_destroying; + boolean_t scn_async_stalled; + uint64_t scn_async_block_min_time_ms; + + /* flags and stats for controlling scan state */ + boolean_t scn_is_sorted; /* doing sequential scan */ + boolean_t scn_clearing; /* scan is issuing sequential extents */ + boolean_t scn_checkpointing; /* scan is issuing all queued extents */ + boolean_t scn_suspending; /* scan is suspending until next txg */ + uint64_t scn_last_checkpoint; /* time of last checkpoint */ + + /* members for thread synchronization */ + zio_t *scn_zio_root; /* root zio for waiting on IO */ + taskq_t *scn_taskq; /* task queue for issuing extents */ + + /* for controlling scan prefetch, protected by spa_scrub_lock */ + boolean_t scn_prefetch_stop; /* prefetch should stop */ + zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */ + avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */ + uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */ + + /* per txg statistics */ + uint64_t scn_visited_this_txg; /* total bps visited this txg */ + uint64_t scn_dedup_frees_this_txg; /* dedup bps freed this txg */ + uint64_t scn_holes_this_txg; + uint64_t scn_lt_min_this_txg; + uint64_t scn_gt_max_this_txg; + uint64_t scn_ddt_contained_this_txg; + uint64_t scn_objsets_visited_this_txg; + uint64_t scn_avg_seg_size_this_txg; + uint64_t scn_segs_this_txg; + uint64_t scn_avg_zio_size_this_txg; + uint64_t scn_zios_this_txg; + + /* members needed for syncing scan status to disk */ + dsl_scan_phys_t scn_phys; /* on disk representation of scan */ + dsl_scan_phys_t scn_phys_cached; + avl_tree_t scn_queue; /* queue of datasets to scan */ + uint64_t scn_bytes_pending; /* outstanding data to issue */ +} dsl_scan_t; + +typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; + +void scan_init(void); +void scan_fini(void); +int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); +void dsl_scan_fini(struct dsl_pool *dp); +void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); +int dsl_scan_cancel(struct dsl_pool *); +int dsl_scan(struct dsl_pool *, pool_scan_func_t); +void dsl_scan_assess_vdev(struct dsl_pool *dp, vdev_t *vd); +boolean_t dsl_scan_scrubbing(const struct dsl_pool *dp); +int dsl_scrub_set_pause_resume(const struct dsl_pool *dp, pool_scrub_cmd_t cmd); +void dsl_scan_restart_resilver(struct dsl_pool *, uint64_t txg); +boolean_t dsl_scan_resilvering(struct dsl_pool *dp); +boolean_t dsl_scan_resilver_scheduled(struct dsl_pool *dp); +boolean_t dsl_dataset_unstable(struct dsl_dataset *ds); +void dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx); +void dsl_scan_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); +void dsl_scan_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); 
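+
+/*
+ * Illustrative sketch: a scrub can be initiated by handing dsl_scan()
+ * above the pool's dsl_pool_t together with a pool_scan_func_t value.
+ * POOL_SCAN_SCRUB and spa_get_dsl() are assumed from other headers in
+ * this import, e.g.:
+ *
+ *	int err = dsl_scan(spa_get_dsl(spa), POOL_SCAN_SCRUB);
+ */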
+void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, + struct dmu_tx *tx); +boolean_t dsl_scan_active(dsl_scan_t *scn); +boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); +void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); +void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); +void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SCAN_H */ diff --git a/include/sys/dsl_synctask.h b/include/sys/dsl_synctask.h new file mode 100644 index 000000000000..957963ffe553 --- /dev/null +++ b/include/sys/dsl_synctask.h @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_DSL_SYNCTASK_H +#define _SYS_DSL_SYNCTASK_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; + +typedef int (dsl_checkfunc_t)(void *, dmu_tx_t *); +typedef void (dsl_syncfunc_t)(void *, dmu_tx_t *); +typedef void (dsl_sigfunc_t)(void *, dmu_tx_t *); + +typedef enum zfs_space_check { + /* + * Normal space check: if there is less than 3.2% free space, + * the operation will fail. Operations which are logically + * creating things should use this (e.g. "zfs create", "zfs snapshot"). + * User writes (via the ZPL / ZVOL) also fail at this point. + */ + ZFS_SPACE_CHECK_NORMAL, + + /* + * Space check allows use of half the slop space. If there + * is less than 1.6% free space, the operation will fail. Most + * operations should use this (e.g. "zfs set", "zfs rename"), + * because we want them to succeed even after user writes are failing, + * so that they can be used as part of the space recovery process. + */ + ZFS_SPACE_CHECK_RESERVED, + + /* + * Space check allows use of three quarters of the slop space. + * If there is less than 0.8% free space, the operation will + * fail. + */ + ZFS_SPACE_CHECK_EXTRA_RESERVED, + + /* + * In all cases "zfs destroy" is expected to result in a net + * reduction of space, except one. When the pool has a + * checkpoint, space freed by "zfs destroy" will not actually + * free anything internally. Thus, it starts failing after + * three quarters of the slop space is exceeded. + */ + ZFS_SPACE_CHECK_DESTROY = ZFS_SPACE_CHECK_EXTRA_RESERVED, + + /* + * A channel program can run a "zfs destroy" as part of its + * script and therefore has the same space_check policy when + * being evaluated. + */ + ZFS_SPACE_CHECK_ZCP_EVAL = ZFS_SPACE_CHECK_DESTROY, + + /* + * No space check is performed. 
This level of space check should + * be used cautiously as operations that use it can even run when + * 0.8% capacity is left for use. In this scenario, if there is a + * checkpoint, async destroys are suspended and any kind of freeing + * can potentially add space instead of freeing it. + * + * See also the comments above spa_slop_shift. + */ + ZFS_SPACE_CHECK_NONE, + + ZFS_SPACE_CHECK_DISCARD_CHECKPOINT = ZFS_SPACE_CHECK_NONE, + +} zfs_space_check_t; + +typedef struct dsl_sync_task { + txg_node_t dst_node; + struct dsl_pool *dst_pool; + uint64_t dst_txg; + int dst_space; + zfs_space_check_t dst_space_check; + dsl_checkfunc_t *dst_checkfunc; + dsl_syncfunc_t *dst_syncfunc; + void *dst_arg; + int dst_error; + boolean_t dst_nowaiter; +} dsl_sync_task_t; + +void dsl_sync_task_sync(dsl_sync_task_t *, dmu_tx_t *); +int dsl_sync_task(const char *, dsl_checkfunc_t *, + dsl_syncfunc_t *, void *, int, zfs_space_check_t); +void dsl_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, + void *, int, zfs_space_check_t, dmu_tx_t *); +int dsl_early_sync_task(const char *, dsl_checkfunc_t *, + dsl_syncfunc_t *, void *, int, zfs_space_check_t); +void dsl_early_sync_task_nowait(struct dsl_pool *, dsl_syncfunc_t *, + void *, int, zfs_space_check_t, dmu_tx_t *); +int dsl_sync_task_sig(const char *, dsl_checkfunc_t *, dsl_syncfunc_t *, + dsl_sigfunc_t *, void *, int, zfs_space_check_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_SYNCTASK_H */ diff --git a/include/sys/dsl_userhold.h b/include/sys/dsl_userhold.h new file mode 100644 index 000000000000..071aeb86d1f1 --- /dev/null +++ b/include/sys/dsl_userhold.h @@ -0,0 +1,57 @@ + +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. 
+ */ + +#ifndef _SYS_DSL_USERHOLD_H +#define _SYS_DSL_USERHOLD_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; +struct dsl_dataset; +struct dmu_tx; + +int dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, + nvlist_t *errlist); +int dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist); +int dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl); +void dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds); +int dsl_dataset_user_hold_check_one(struct dsl_dataset *ds, const char *htag, + boolean_t temphold, struct dmu_tx *tx); +void dsl_dataset_user_hold_sync_one(struct dsl_dataset *ds, const char *htag, + minor_t minor, uint64_t now, struct dmu_tx *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DSL_USERHOLD_H */ diff --git a/include/sys/edonr.h b/include/sys/edonr.h new file mode 100644 index 000000000000..79b7cd8c75b8 --- /dev/null +++ b/include/sys/edonr.h @@ -0,0 +1,98 @@ +/* + * IDI,NTNU + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (C) 2009, 2010, Jorn Amundsen + * + * Tweaked Edon-R implementation for SUPERCOP, based on NIST API. + * + * $Id: edonr.h 517 2013-02-17 20:34:39Z joern $ + */ +/* + * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved + */ + +#ifndef _SYS_EDONR_H_ +#define _SYS_EDONR_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +#include +#else +#include /* uint32_t... */ +#include /* size_t ... */ +#endif + +/* + * EdonR allows EdonRUpdate() to be called consecutively only if the total + * length of stored unprocessed data and the newly supplied data is less than + * or equal to the BLOCK_SIZE on which the compression functions operate. + * Otherwise an assertion failure is triggered. 
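+ *
+ * An illustrative usage sketch, producing a 512-bit digest of one
+ * buffer; data and datalen are hypothetical caller-supplied values,
+ * and note that EdonRUpdate() takes its length in bits, not bytes:
+ *
+ *	EdonRState state;
+ *	uint8_t digest[EdonR512_DIGEST_SIZE];
+ *	EdonRInit(&state, 512);
+ *	EdonRUpdate(&state, data, (size_t)datalen * 8);
+ *	EdonRFinal(&state, digest);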
+ */ + +/* Specific algorithm definitions */ +#define EdonR224_DIGEST_SIZE 28 +#define EdonR224_BLOCK_SIZE 64 +#define EdonR256_DIGEST_SIZE 32 +#define EdonR256_BLOCK_SIZE 64 +#define EdonR384_DIGEST_SIZE 48 +#define EdonR384_BLOCK_SIZE 128 +#define EdonR512_DIGEST_SIZE 64 +#define EdonR512_BLOCK_SIZE 128 + +#define EdonR256_BLOCK_BITSIZE 512 +#define EdonR512_BLOCK_BITSIZE 1024 + +typedef struct { + uint32_t DoublePipe[16]; + uint8_t LastPart[EdonR256_BLOCK_SIZE * 2]; +} EdonRData256; +typedef struct { + uint64_t DoublePipe[16]; + uint8_t LastPart[EdonR512_BLOCK_SIZE * 2]; +} EdonRData512; + +typedef struct { + size_t hashbitlen; + + /* + algorithm specific parameters */ + int unprocessed_bits; + uint64_t bits_processed; + union { + EdonRData256 p256[1]; + EdonRData512 p512[1]; + } pipe[1]; +} EdonRState; + +void EdonRInit(EdonRState *state, size_t hashbitlen); +void EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen); +void EdonRFinal(EdonRState *state, uint8_t *hashval); +void EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen, + uint8_t *hashval); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EDONR_H_ */ diff --git a/include/sys/efi_partition.h b/include/sys/efi_partition.h new file mode 100644 index 000000000000..88bdfd2b1ca3 --- /dev/null +++ b/include/sys/efi_partition.h @@ -0,0 +1,381 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_EFI_PARTITION_H +#define _SYS_EFI_PARTITION_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * GUID Partition Table Header + */ + +#define EFI_MIN_LABEL_SIZE 92 +#define EFI_LABEL_SIZE 512 +#define LEN_EFI_PAD (EFI_LABEL_SIZE - \ + ((5 * sizeof (diskaddr_t)) + \ + (7 * sizeof (uint_t)) + \ + (8 * sizeof (char)) + \ + (1 * (sizeof (struct uuid))))) + +#define EFI_SIGNATURE 0x5452415020494645ULL + +/* EFI Guid Partition Table Header -- little endian on-disk format */ +typedef struct efi_gpt { + uint64_t efi_gpt_Signature; + uint_t efi_gpt_Revision; + uint_t efi_gpt_HeaderSize; + uint_t efi_gpt_HeaderCRC32; + uint_t efi_gpt_Reserved1; + diskaddr_t efi_gpt_MyLBA; + diskaddr_t efi_gpt_AlternateLBA; + diskaddr_t efi_gpt_FirstUsableLBA; + diskaddr_t efi_gpt_LastUsableLBA; + struct uuid efi_gpt_DiskGUID; + diskaddr_t efi_gpt_PartitionEntryLBA; + uint_t efi_gpt_NumberOfPartitionEntries; + uint_t efi_gpt_SizeOfPartitionEntry; + uint_t efi_gpt_PartitionEntryArrayCRC32; + char efi_gpt_Reserved2[LEN_EFI_PAD]; +} efi_gpt_t; + +/* EFI Guid Partition Entry Attributes -- little endian format */ +typedef struct efi_gpe_Attrs { + uint32_t PartitionAttrs :16, + Reserved2 :16; + uint32_t Reserved1 :31, + RequiredPartition :1; +} efi_gpe_Attrs_t; + +/* + * 6a96237f-1dd2-11b2-99a6-080020736631 V_UNASSIGNED (not used as such) + * 6a82cb45-1dd2-11b2-99a6-080020736631 V_BOOT + * 6a85cf4d-1dd2-11b2-99a6-080020736631 V_ROOT + * 6a87c46f-1dd2-11b2-99a6-080020736631 V_SWAP + * 6a898cc3-1dd2-11b2-99a6-080020736631 V_USR + * 6a8b642b-1dd2-11b2-99a6-080020736631 V_BACKUP + * 6a8d2ac7-1dd2-11b2-99a6-080020736631 V_STAND (not used) + * 6a8ef2e9-1dd2-11b2-99a6-080020736631 V_VAR + * 6a90ba39-1dd2-11b2-99a6-080020736631 V_HOME + * 6a9283a5-1dd2-11b2-99a6-080020736631 V_ALTSCTR + * 6a945a3b-1dd2-11b2-99a6-080020736631 V_CACHE + */ + +#define EFI_UNUSED { 0x00000000, 0x0000, 0x0000, 0x00, 0x00, \ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 } } +#define EFI_RESV1 { 0x6a96237f, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_BOOT { 0x6a82cb45, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_ROOT { 0x6a85cf4d, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_SWAP { 0x6a87c46f, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_USR { 0x6a898cc3, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_BACKUP { 0x6a8b642b, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_RESV2 { 0x6a8d2ac7, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_VAR { 0x6a8ef2e9, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_HOME { 0x6a90ba39, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_ALTSCTR { 0x6a9283a5, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_RESERVED { 0x6a945a3b, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_SYSTEM { 0xC12A7328, 0xF81F, 0x11d2, 0xBA, 0x4B, \ + { 0x00, 0xA0, 0xC9, 0x3E, 0xC9, 0x3B } } +#define EFI_LEGACY_MBR { 0x024DEE41, 0x33E7, 0x11d3, 0x9D, 0x69, \ + { 0x00, 0x08, 0xC7, 0x81, 0xF3, 0x9F } } +#define EFI_SYMC_PUB { 0x6a9630d1, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 0x73, 0x66, 0x31 } } +#define EFI_SYMC_CDS { 0x6a980767, 0x1dd2, 0x11b2, 0x99, 0xa6, \ + { 0x08, 0x00, 0x20, 
0x73, 0x66, 0x31 } } +#define EFI_MSFT_RESV { 0xE3C9E316, 0x0B5C, 0x4DB8, 0x81, 0x7D, \ + { 0xF9, 0x2D, 0xF0, 0x02, 0x15, 0xAE } } +#define EFI_DELL_BASIC { 0xebd0a0a2, 0xb9e5, 0x4433, 0x87, 0xc0, \ + { 0x68, 0xb6, 0xb7, 0x26, 0x99, 0xc7 } } +#define EFI_DELL_RAID { 0xa19d880f, 0x05fc, 0x4d3b, 0xa0, 0x06, \ + { 0x74, 0x3f, 0x0f, 0x84, 0x91, 0x1e } } +#define EFI_DELL_SWAP { 0x0657fd6d, 0xa4ab, 0x43c4, 0x84, 0xe5, \ + { 0x09, 0x33, 0xc8, 0x4b, 0x4f, 0x4f } } +#define EFI_DELL_LVM { 0xe6d6d379, 0xf507, 0x44c2, 0xa2, 0x3c, \ + { 0x23, 0x8f, 0x2a, 0x3d, 0xf9, 0x28 } } +#define EFI_DELL_RESV { 0x8da63339, 0x0007, 0x60c0, 0xc4, 0x36, \ + { 0x08, 0x3a, 0xc8, 0x23, 0x09, 0x08 } } +#define EFI_AAPL_HFS { 0x48465300, 0x0000, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_AAPL_UFS { 0x55465300, 0x0000, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_FREEBSD_BOOT { 0x83bd6b9d, 0x7f41, 0x11dc, 0xbe, 0x0b, \ + { 0x00, 0x15, 0x60, 0xb8, 0x4f, 0x0f } } +#define EFI_FREEBSD_SWAP { 0x516e7cb5, 0x6ecf, 0x11d6, 0x8f, 0xf8, \ + { 0x00, 0x02, 0x2d, 0x09, 0x71, 0x2b } } +#define EFI_FREEBSD_UFS { 0x516e7cb6, 0x6ecf, 0x11d6, 0x8f, 0xf8, \ + { 0x00, 0x02, 0x2d, 0x09, 0x71, 0x2b } } +#define EFI_FREEBSD_VINUM { 0x516e7cb8, 0x6ecf, 0x11d6, 0x8f, 0xf8, \ + { 0x00, 0x02, 0x2d, 0x09, 0x71, 0x2b } } +#define EFI_FREEBSD_ZFS { 0x516e7cba, 0x6ecf, 0x11d6, 0x8f, 0xf8, \ + { 0x00, 0x02, 0x2d, 0x09, 0x71, 0x2b } } + +/* From Wikipedia */ + +#define EFI_BIOS_BOOT { 0x21686148, 0x6449, 0x6e6f, 0x74, 0x4e, \ + { 0x65, 0x65, 0x64, 0x45, 0x46, 0x49 } } +#define EFI_INTC_RS { 0xd3bfe2de, 0x3daf, 0x11df, 0xba, 0x40, \ + { 0xe3, 0xa5, 0x56, 0xd8, 0x95, 0x93 } } +#define EFI_SNE_BOOT { 0xf4019732, 0x066e, 0x4e12, 0x82, 0x73, \ + { 0x34, 0x6c, 0x56, 0x41, 0x49, 0x4f } } +#define EFI_LENOVO_BOOT { 0xbfbfafe7, 0xa34f, 0x448a, 0x9a, 0x5b, \ + { 0x62, 0x13, 0xeb, 0x73, 0x6c, 0x22 } } +#define EFI_MSFT_LDMM { 0x5808c8aa, 0x7e8f, 0x42e0, 0x85, 0xd2, \ + { 0xe1, 0xe9, 0x04, 0x34, 0xcf, 0xb3 } } +#define EFI_MSFT_LDMD { 0xaf9b60a0, 0x1431, 0x4f62, 0xbc, 0x68, \ + { 0x33, 0x11, 0x71, 0x4a, 0x69, 0xad } } +#define EFI_MSFT_RE { 0xde94bba4, 0x06d1, 0x4d40, 0xa1, 0x6a, \ + { 0xbf, 0xd5, 0x01, 0x79, 0xd6, 0xac } } +#define EFI_IBM_GPFS { 0x37affc90, 0xef7d, 0x4e96, 0x91, 0xc3, \ + { 0x2d, 0x7a, 0xe0, 0x55, 0xb1, 0x74 } } +#define EFI_MSFT_STORAGESPACES { 0xe75caf8f, 0xf680, 0x4cee, 0xaf, 0xa3, \ + { 0xb0, 0x01, 0xe5, 0x6e, 0xfc, 0x2d } } +#define EFI_HPQ_DATA { 0x75894c1e, 0x3aeb, 0x11d3, 0xb7, 0xc1, \ + { 0x7b, 0x03, 0xa0, 0x00, 0x00, 0x00 } } +#define EFI_HPQ_SVC { 0xe2a1e728, 0x32e3, 0x11d6, 0xa6, 0x82, \ + { 0x7b, 0x03, 0xa0, 0x00, 0x00, 0x00 } } +#define EFI_RHT_DATA { 0x0fc63daf, 0x8483, 0x4772, 0x8e, 0x79, \ + { 0x3d, 0x69, 0xd8, 0x47, 0x7d, 0xe4 } } +#define EFI_RHT_HOME { 0x933ac7e1, 0x2eb4, 0x4f13, 0xb8, 0x44, \ + { 0x0e, 0x14, 0xe2, 0xae, 0xf9, 0x15 } } +#define EFI_RHT_SRV { 0x3b8f8425, 0x20e0, 0x4f3b, 0x90, 0x7f, \ + { 0x1a, 0x25, 0xa7, 0x6f, 0x98, 0xe8 } } +#define EFI_RHT_DMCRYPT { 0x7ffec5c9, 0x2d00, 0x49b7, 0x89, 0x41, \ + { 0x3e, 0xa1, 0x0a, 0x55, 0x86, 0xb7 } } +#define EFI_RHT_LUKS { 0xca7d7ccb, 0x63ed, 0x4c53, 0x86, 0x1c, \ + { 0x17, 0x42, 0x53, 0x60, 0x59, 0xcc } } +#define EFI_FREEBSD_DISKLABEL { 0x516e7cb4, 0x6ecf, 0x11d6, 0x8f, 0xf8, \ + { 0x00, 0x02, 0x2d, 0x09, 0x71, 0x2b } } +#define EFI_AAPL_RAID { 0x52414944, 0x0000, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_AAPL_RAIDOFFLINE { 0x52414944, 0x5f4f, 0x11aa, 0xaa, 0x11, \ + { 
0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_AAPL_BOOT { 0x426f6f74, 0x0000, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_AAPL_LABEL { 0x4c616265, 0x6c00, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_AAPL_TVRECOVERY { 0x5265636f, 0x7665, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_AAPL_CORESTORAGE { 0x53746f72, 0x6167, 0x11aa, 0xaa, 0x11, \ + { 0x00, 0x30, 0x65, 0x43, 0xec, 0xac } } +#define EFI_NETBSD_SWAP { 0x49f48d32, 0xb10e, 0x11dc, 0xb9, 0x9b, \ + { 0x00, 0x19, 0xd1, 0x87, 0x96, 0x48 } } +#define EFI_NETBSD_FFS { 0x49f48d5a, 0xb10e, 0x11dc, 0xb9, 0x9b, \ + { 0x00, 0x19, 0xd1, 0x87, 0x96, 0x48 } } +#define EFI_NETBSD_LFS { 0x49f48d82, 0xb10e, 0x11dc, 0xb9, 0x9b, \ + { 0x00, 0x19, 0xd1, 0x87, 0x96, 0x48 } } +#define EFI_NETBSD_RAID { 0x49f48daa, 0xb10e, 0x11dc, 0xb9, 0x9b, \ + { 0x00, 0x19, 0xd1, 0x87, 0x96, 0x48 } } +#define EFI_NETBSD_CAT { 0x2db519c4, 0xb10f, 0x11dc, 0xb9, 0x9b, \ + { 0x00, 0x19, 0xd1, 0x87, 0x96, 0x48 } } +#define EFI_NETBSD_CRYPT { 0x2db519ec, 0xb10f, 0x11dc, 0xb9, 0x9b, \ + { 0x00, 0x19, 0xd1, 0x87, 0x96, 0x48 } } +#define EFI_GOOG_KERN { 0xfe3a2a5d, 0x4f32, 0x41a7, 0xb7, 0x25, \ + { 0xac, 0xcc, 0x32, 0x85, 0xa3, 0x09 } } +#define EFI_GOOG_ROOT { 0x3cb8e202, 0x3b7e, 0x47dd, 0x8a, 0x3c, \ + { 0x7f, 0xf2, 0xa1, 0x3c, 0xfc, 0xec } } +#define EFI_GOOG_RESV { 0x2e0a753d, 0x9e48, 0x43b0, 0x83, 0x37, \ + { 0xb1, 0x51, 0x92, 0xcb, 0x1b, 0x5e } } +#define EFI_HAIKU_BFS { 0x42465331, 0x3ba3, 0x10f1, 0x80, 0x2a, \ + { 0x48, 0x61, 0x69, 0x6b, 0x75, 0x21 } } +#define EFI_MIDNIGHTBSD_BOOT { 0x85d5e45e, 0x237c, 0x11e1, 0xb4, 0xb3, \ + { 0xe8, 0x9a, 0x8f, 0x7f, 0xc3, 0xa7 } } +#define EFI_MIDNIGHTBSD_DATA { 0x85d5e45a, 0x237c, 0x11e1, 0xb4, 0xb3, \ + { 0xe8, 0x9a, 0x8f, 0x7f, 0xc3, 0xa7 } } +#define EFI_MIDNIGHTBSD_SWAP { 0x85d5e45b, 0x237c, 0x11e1, 0xb4, 0xb3, \ + { 0xe8, 0x9a, 0x8f, 0x7f, 0xc3, 0xa7 } } +#define EFI_MIDNIGHTBSD_UFS { 0x0394ef8b, 0x237e, 0x11e1, 0xb4, 0xb3, \ + { 0xe8, 0x9a, 0x8f, 0x7f, 0xc3, 0xa7 } } +#define EFI_MIDNIGHTBSD_VINUM { 0x85d5e45c, 0x237c, 0x11e1, 0xb4, 0xb3, \ + { 0xe8, 0x9a, 0x8f, 0x7f, 0xc3, 0xa7 } } +#define EFI_MIDNIGHTBSD_ZFS { 0x85d5e45d, 0x237c, 0x11e1, 0xb4, 0xb3, \ + { 0xe8, 0x9a, 0x8f, 0x7f, 0xc3, 0xa7 } } +#define EFI_CEPH_JOURNAL { 0x45b0969e, 0x9b03, 0x4f30, 0xb4, 0xc6, \ + { 0xb4, 0xb8, 0x0c, 0xef, 0xf1, 0x06 } } +#define EFI_CEPH_DMCRYPTJOURNAL { 0x45b0969e, 0x9b03, 0x4f30, 0xb4, 0xc6, \ + { 0x5e, 0xc0, 0x0c, 0xef, 0xf1, 0x06 } } +#define EFI_CEPH_OSD { 0x4fbd7e29, 0x9d25, 0x41b8, 0xaf, 0xd0, \ + { 0x06, 0x2c, 0x0c, 0xef, 0xf0, 0x5d } } +#define EFI_CEPH_DMCRYPTOSD { 0x4fbd7e29, 0x9d25, 0x41b8, 0xaf, 0xd0, \ + { 0x5e, 0xc0, 0x0c, 0xef, 0xf0, 0x5d } } +#define EFI_CEPH_CREATE { 0x89c57f98, 0x2fe5, 0x4dc0, 0x89, 0xc1, \ + { 0xf3, 0xad, 0x0c, 0xef, 0xf2, 0xbe } } +#define EFI_CEPH_DMCRYPTCREATE { 0x89c57f98, 0x2fe5, 0x4dc0, 0x89, 0xc1, \ + { 0x5e, 0xc0, 0x0c, 0xef, 0xf2, 0xbe } } +#define EFI_OPENBSD_DISKLABEL { 0x824cc7a0, 0x36a8, 0x11e3, 0x89, 0x0a, \ + { 0x95, 0x25, 0x19, 0xad, 0x3f, 0x61 } } +#define EFI_BBRY_QNX { 0xcef5a9ad, 0x73bc, 0x4601, 0x89, 0xf3, \ + { 0xcd, 0xee, 0xee, 0xe3, 0x21, 0xa1 } } +#define EFI_BELL_PLAN9 { 0xc91818f9, 0x8025, 0x47af, 0x89, 0xd2, \ + { 0xf0, 0x30, 0xd7, 0x00, 0x0c, 0x2c } } +#define EFI_VMW_KCORE { 0x9d275380, 0x40ad, 0x11db, 0xbf, 0x97, \ + { 0x00, 0x0c, 0x29, 0x11, 0xd1, 0xb8 } } +#define EFI_VMW_VMFS { 0xaa31e02a, 0x400f, 0x11db, 0x95, 0x90, \ + { 0x00, 0x0c, 0x29, 0x11, 0xd1, 0xb8 } } 
+#define EFI_VMW_RESV { 0x9198effc, 0x31c0, 0x11db, 0x8f, 0x78, \ + { 0x00, 0x0c, 0x29, 0x11, 0xd1, 0xb8 } } + +/* From GPT fdisk */ + +#define EFI_RHT_ROOTX86 { 0x44479540, 0xf297, 0x41b2, 0x9a, 0xf7, \ + { 0xd1, 0x31, 0xd5, 0xf0, 0x45, 0x8a } } +#define EFI_RHT_ROOTAMD64 { 0x4f68bce3, 0xe8cd, 0x4db1, 0x96, 0xe7, \ + { 0xfb, 0xca, 0xf9, 0x84, 0xb7, 0x09 } } +#define EFI_RHT_ROOTARM { 0x69dad710, 0x2ce4, 0x4e3c, 0xb1, 0x6c, \ + { 0x21, 0xa1, 0xd4, 0x9a, 0xbe, 0xd3 } } +#define EFI_RHT_ROOTARM64 { 0xb921b045, 0x1df0, 0x41c3, 0xaf, 0x44, \ + { 0x4c, 0x6f, 0x28, 0x0d, 0x3f, 0xae } } +#define EFI_ACRONIS_SECUREZONE { 0x0311fc50, 0x01ca, 0x4725, 0xad, 0x77, \ + { 0x9a, 0xdb, 0xb2, 0x0a, 0xce, 0x98 } } +#define EFI_ONIE_BOOT { 0x7412f7d5, 0xa156, 0x4b13, 0x81, 0xdc, \ + { 0x86, 0x71, 0x74, 0x92, 0x93, 0x25 } } +#define EFI_ONIE_CONFIG { 0xd4e6e2cd, 0x4469, 0x46f3, 0xb5, 0xcb, \ + { 0x1b, 0xff, 0x57, 0xaf, 0xc1, 0x49 } } +#define EFI_IBM_PPRPBOOT { 0x9e1a2d38, 0xc612, 0x4316, 0xaa, 0x26, \ + { 0x8b, 0x49, 0x52, 0x1e, 0x5a, 0x8b } } +#define EFI_FREEDESKTOP_BOOT { 0xbc13c2ff, 0x59e6, 0x4262, 0xa3, 0x52, \ + { 0xb2, 0x75, 0xfd, 0x6f, 0x71, 0x72 } } + +/* minimum # of bytes for partition table entries, per EFI spec */ +#define EFI_MIN_ARRAY_SIZE (16 * 1024) + +#define EFI_PART_NAME_LEN 36 + +/* size of the "reserved" partition, in blocks */ +#define EFI_MIN_RESV_SIZE (16 * 1024) + +/* EFI Guid Partition Entry */ +typedef struct efi_gpe { + struct uuid efi_gpe_PartitionTypeGUID; + struct uuid efi_gpe_UniquePartitionGUID; + diskaddr_t efi_gpe_StartingLBA; + diskaddr_t efi_gpe_EndingLBA; + efi_gpe_Attrs_t efi_gpe_Attributes; + ushort_t efi_gpe_PartitionName[EFI_PART_NAME_LEN]; +} efi_gpe_t; + +/* + * passed to the useful (we hope) routines (efi_alloc_and_read and + * efi_write) that take this VTOC-like struct. These routines handle + * converting this struct into the EFI struct, generate UUIDs and + * checksums, and perform any necessary byte-swapping to the on-disk + * format. 
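+ *
+ * An illustrative sketch of the read-modify-write cycle built on these
+ * routines, assuming an open file descriptor fd for the whole disk and
+ * that a non-negative return from efi_alloc_and_read() indicates
+ * success; error handling elided:
+ *
+ *	struct dk_gpt *vtoc;
+ *	if (efi_alloc_and_read(fd, &vtoc) >= 0) {
+ *		... examine or adjust vtoc->efi_parts[] ...
+ *		(void) efi_write(fd, vtoc);
+ *		efi_free(vtoc);
+ *	}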
+ */ +/* Solaris library abstraction for EFI partitions */ +typedef struct dk_part { + diskaddr_t p_start; /* starting LBA */ + diskaddr_t p_size; /* size in blocks */ + struct uuid p_guid; /* partition type GUID */ + ushort_t p_tag; /* converted to part'n type GUID */ + ushort_t p_flag; /* attributes */ + char p_name[EFI_PART_NAME_LEN]; /* partition name */ + struct uuid p_uguid; /* unique partition GUID */ + uint_t p_resv[8]; /* future use - set to zero */ +} dk_part_t; + +/* Solaris library abstraction for an EFI GPT */ +#define EFI_VERSION102 0x00010002 +#define EFI_VERSION100 0x00010000 +#define EFI_VERSION_CURRENT EFI_VERSION100 +typedef struct dk_gpt { + uint_t efi_version; /* set to EFI_VERSION_CURRENT */ + uint_t efi_nparts; /* number of partitions below */ + uint_t efi_part_size; /* size of each partition entry */ + /* efi_part_size is unused */ + uint_t efi_lbasize; /* size of block in bytes */ + diskaddr_t efi_last_lba; /* last block on the disk */ + diskaddr_t efi_first_u_lba; /* first block after labels */ + diskaddr_t efi_last_u_lba; /* last block before backup labels */ + struct uuid efi_disk_uguid; /* unique disk GUID */ + uint_t efi_flags; + uint_t efi_reserved1; /* future use - set to zero */ + diskaddr_t efi_altern_lba; /* lba of alternate GPT header */ + uint_t efi_reserved[12]; /* future use - set to zero */ + struct dk_part efi_parts[1]; /* array of partitions */ +} dk_gpt_t; + +/* possible values for "efi_flags" */ +#define EFI_GPT_PRIMARY_CORRUPT 0x1 /* primary label corrupt */ + +/* the private ioctl between libefi and the driver */ +typedef struct dk_efi { + diskaddr_t dki_lba; /* starting block */ + len_t dki_length; /* length in bytes */ + union { + efi_gpt_t *_dki_data; + uint64_t _dki_data_64; + } dki_un; +#define dki_data dki_un._dki_data +#define dki_data_64 dki_un._dki_data_64 +} dk_efi_t; + +struct partition64 { + struct uuid p_type; + uint_t p_partno; + uint_t p_resv1; + diskaddr_t p_start; + diskaddr_t p_size; +}; + +/* + * Number of EFI partitions + */ +#if defined(__linux__) +#define EFI_NUMPAR 128 /* Expected by parted-1.8.1 */ +#else +#define EFI_NUMPAR 9 +#endif + +#ifndef _KERNEL +extern int efi_alloc_and_init(int, uint32_t, struct dk_gpt **); +extern int efi_alloc_and_read(int, struct dk_gpt **); +extern int efi_write(int, struct dk_gpt *); +extern int efi_rescan(int); +extern void efi_free(struct dk_gpt *); +extern int efi_type(int); +extern void efi_err_check(struct dk_gpt *); +extern int efi_auto_sense(int fd, struct dk_gpt **); +extern int efi_use_whole_disk(int fd); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_EFI_PARTITION_H */ diff --git a/include/sys/fm/Makefile.am b/include/sys/fm/Makefile.am new file mode 100644 index 000000000000..7c6c3d49b6e9 --- /dev/null +++ b/include/sys/fm/Makefile.am @@ -0,0 +1,17 @@ +SUBDIRS = fs + +COMMON_H = \ + protocol.h \ + util.h + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys/fm +libzfs_HEADERS = $(COMMON_H) +endif + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/fm +kernel_HEADERS = $(COMMON_H) +endif +endif diff --git a/include/sys/fm/fs/Makefile.am b/include/sys/fm/fs/Makefile.am new file mode 100644 index 000000000000..a662753a9e97 --- /dev/null +++ b/include/sys/fm/fs/Makefile.am @@ -0,0 +1,14 @@ +COMMON_H = \ + zfs.h + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys/fm/fs +libzfs_HEADERS = $(COMMON_H) +endif + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/fm/fs +kernel_HEADERS = $(COMMON_H) 
+endif +endif diff --git a/include/sys/fm/fs/zfs.h b/include/sys/fm/fs/zfs.h new file mode 100644 index 000000000000..9bfb123c76fe --- /dev/null +++ b/include/sys/fm/fs/zfs.h @@ -0,0 +1,121 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FM_FS_ZFS_H +#define _SYS_FM_FS_ZFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_ERROR_CLASS "fs.zfs" + +#define FM_EREPORT_ZFS_CHECKSUM "checksum" +#define FM_EREPORT_ZFS_AUTHENTICATION "authentication" +#define FM_EREPORT_ZFS_IO "io" +#define FM_EREPORT_ZFS_DATA "data" +#define FM_EREPORT_ZFS_DELAY "delay" +#define FM_EREPORT_ZFS_DEADMAN "deadman" +#define FM_EREPORT_ZFS_POOL "zpool" +#define FM_EREPORT_ZFS_DEVICE_UNKNOWN "vdev.unknown" +#define FM_EREPORT_ZFS_DEVICE_OPEN_FAILED "vdev.open_failed" +#define FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA "vdev.corrupt_data" +#define FM_EREPORT_ZFS_DEVICE_NO_REPLICAS "vdev.no_replicas" +#define FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM "vdev.bad_guid_sum" +#define FM_EREPORT_ZFS_DEVICE_TOO_SMALL "vdev.too_small" +#define FM_EREPORT_ZFS_DEVICE_BAD_LABEL "vdev.bad_label" +#define FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT "vdev.bad_ashift" +#define FM_EREPORT_ZFS_IO_FAILURE "io_failure" +#define FM_EREPORT_ZFS_PROBE_FAILURE "probe_failure" +#define FM_EREPORT_ZFS_LOG_REPLAY "log_replay" +#define FM_EREPORT_ZFS_CONFIG_CACHE_WRITE "config_cache_write" + +#define FM_EREPORT_PAYLOAD_ZFS_POOL "pool" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE "pool_failmode" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_GUID "pool_guid" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT "pool_context" +#define FM_EREPORT_PAYLOAD_ZFS_POOL_STATE "pool_state" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID "vdev_guid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH "vdev_physpath" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE "vdev_state" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE "vdev_laststate" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT "vdev_ashift" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS "vdev_complete_ts" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS "vdev_delta_ts" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS "vdev_spare_paths" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS "vdev_spare_guids" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS "vdev_read_errors" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS "vdev_write_errors" +#define 
FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS "vdev_cksum_errors" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS "vdev_delays" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" +#define FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID "parent_devid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET "zio_objset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT "zio_object" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL "zio_level" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID "zio_blkid" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR "zio_err" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS "zio_flags" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE "zio_stage" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE "zio_pipeline" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY "zio_delay" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP "zio_timestamp" +#define FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA "zio_delta" +#define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" + +#define FM_EREPORT_FAILMODE_WAIT "wait" +#define FM_EREPORT_FAILMODE_CONTINUE "continue" +#define FM_EREPORT_FAILMODE_PANIC "panic" + +#define FM_RESOURCE_REMOVED "removed" +#define FM_RESOURCE_AUTOREPLACE "autoreplace" +#define FM_RESOURCE_STATECHANGE "statechange" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_FS_ZFS_H */ diff --git a/include/sys/fm/protocol.h b/include/sys/fm/protocol.h new file mode 100644 index 000000000000..78031f7c15ec --- /dev/null +++ b/include/sys/fm/protocol.h @@ -0,0 +1,369 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#ifndef _SYS_FM_PROTOCOL_H +#define _SYS_FM_PROTOCOL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL +#include +#else +#include +#include +#endif +#include + +/* FM common member names */ +#define FM_CLASS "class" +#define FM_VERSION "version" + +/* FM protocol category 1 class names */ +#define FM_EREPORT_CLASS "ereport" +#define FM_FAULT_CLASS "fault" +#define FM_DEFECT_CLASS "defect" +#define FM_RSRC_CLASS "resource" +#define FM_LIST_EVENT "list" +#define FM_IREPORT_CLASS "ireport" +#define FM_SYSEVENT_CLASS "sysevent" + +/* FM list.* event class values */ +#define FM_LIST_SUSPECT_CLASS FM_LIST_EVENT ".suspect" +#define FM_LIST_ISOLATED_CLASS FM_LIST_EVENT ".isolated" +#define FM_LIST_REPAIRED_CLASS FM_LIST_EVENT ".repaired" +#define FM_LIST_UPDATED_CLASS FM_LIST_EVENT ".updated" +#define FM_LIST_RESOLVED_CLASS FM_LIST_EVENT ".resolved" + +/* ereport class subcategory values */ +#define FM_ERROR_CPU "cpu" +#define FM_ERROR_IO "io" + +/* ereport version and payload member names */ +#define FM_EREPORT_VERS0 0 +#define FM_EREPORT_VERSION FM_EREPORT_VERS0 + +/* ereport payload member names */ +#define FM_EREPORT_DETECTOR "detector" +#define FM_EREPORT_ENA "ena" +#define FM_EREPORT_TIME "time" +#define FM_EREPORT_EID "eid" + +/* list.* event payload member names */ +#define FM_LIST_EVENT_SIZE "list-sz" + +/* ireport.* event payload member names */ +#define FM_IREPORT_DETECTOR "detector" +#define FM_IREPORT_UUID "uuid" +#define FM_IREPORT_PRIORITY "pri" +#define FM_IREPORT_ATTRIBUTES "attr" + +/* + * list.suspect, isolated, updated, repaired and resolved + * versions/payload member names. + */ +#define FM_SUSPECT_UUID "uuid" +#define FM_SUSPECT_DIAG_CODE "code" +#define FM_SUSPECT_DIAG_TIME "diag-time" +#define FM_SUSPECT_DE "de" +#define FM_SUSPECT_FAULT_LIST "fault-list" +#define FM_SUSPECT_FAULT_SZ "fault-list-sz" +#define FM_SUSPECT_FAULT_STATUS "fault-status" +#define FM_SUSPECT_INJECTED "__injected" +#define FM_SUSPECT_MESSAGE "message" +#define FM_SUSPECT_RETIRE "retire" +#define FM_SUSPECT_RESPONSE "response" +#define FM_SUSPECT_SEVERITY "severity" + +#define FM_SUSPECT_VERS0 0 +#define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 + +#define FM_SUSPECT_FAULTY 0x1 +#define FM_SUSPECT_UNUSABLE 0x2 +#define FM_SUSPECT_NOT_PRESENT 0x4 +#define FM_SUSPECT_DEGRADED 0x8 +#define FM_SUSPECT_REPAIRED 0x10 +#define FM_SUSPECT_REPLACED 0x20 +#define FM_SUSPECT_ACQUITTED 0x40 + +/* fault event versions and payload member names */ +#define FM_FAULT_VERS0 0 +#define FM_FAULT_VERSION FM_FAULT_VERS0 + +#define FM_FAULT_ASRU "asru" +#define FM_FAULT_FRU "fru" +#define FM_FAULT_FRU_LABEL "fru-label" +#define FM_FAULT_CERTAINTY "certainty" +#define FM_FAULT_RESOURCE "resource" +#define FM_FAULT_LOCATION "location" + +/* resource event versions and payload member names */ +#define FM_RSRC_VERS0 0 +#define FM_RSRC_VERSION FM_RSRC_VERS0 +#define FM_RSRC_RESOURCE "resource" + +/* resource.fm.asru.* payload member names */ +#define FM_RSRC_ASRU_UUID "uuid" +#define FM_RSRC_ASRU_CODE "code" +#define FM_RSRC_ASRU_FAULTY "faulty" +#define FM_RSRC_ASRU_REPAIRED "repaired" +#define FM_RSRC_ASRU_REPLACED "replaced" +#define FM_RSRC_ASRU_ACQUITTED "acquitted" +#define FM_RSRC_ASRU_RESOLVED "resolved" +#define FM_RSRC_ASRU_UNUSABLE "unusable" +#define FM_RSRC_ASRU_EVENT "event" + +/* resource.fm.xprt.* versions and payload member names */ +#define FM_RSRC_XPRT_VERS0 0 +#define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 +#define FM_RSRC_XPRT_UUID "uuid" +#define FM_RSRC_XPRT_SUBCLASS "subclass" +#define 
FM_RSRC_XPRT_FAULT_STATUS "fault-status" +#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" + +/* + * FM ENA Format Macros + */ +#define ENA_FORMAT_MASK 0x3 +#define ENA_FORMAT(ena) ((ena) & ENA_FORMAT_MASK) + +/* ENA format types */ +#define FM_ENA_FMT0 0 +#define FM_ENA_FMT1 1 +#define FM_ENA_FMT2 2 + +/* Format 1 */ +#define ENA_FMT1_GEN_MASK 0x00000000000003FCull +#define ENA_FMT1_ID_MASK 0xFFFFFFFFFFFFFC00ull +#define ENA_FMT1_CPUID_MASK 0x00000000000FFC00ull +#define ENA_FMT1_TIME_MASK 0xFFFFFFFFFFF00000ull +#define ENA_FMT1_GEN_SHFT 2 +#define ENA_FMT1_ID_SHFT 10 +#define ENA_FMT1_CPUID_SHFT ENA_FMT1_ID_SHFT +#define ENA_FMT1_TIME_SHFT 20 + +/* Format 2 */ +#define ENA_FMT2_GEN_MASK 0x00000000000003FCull +#define ENA_FMT2_ID_MASK 0xFFFFFFFFFFFFFC00ull +#define ENA_FMT2_TIME_MASK ENA_FMT2_ID_MASK +#define ENA_FMT2_GEN_SHFT 2 +#define ENA_FMT2_ID_SHFT 10 +#define ENA_FMT2_TIME_SHFT ENA_FMT2_ID_SHFT + +/* Common FMRI type names */ +#define FM_FMRI_AUTHORITY "authority" +#define FM_FMRI_SCHEME "scheme" +#define FM_FMRI_SVC_AUTHORITY "svc-authority" +#define FM_FMRI_FACILITY "facility" + +/* FMRI authority-type member names */ +#define FM_FMRI_AUTH_CHASSIS "chassis-id" +#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" +#define FM_FMRI_AUTH_PRODUCT "product-id" +#define FM_FMRI_AUTH_DOMAIN "domain-id" +#define FM_FMRI_AUTH_SERVER "server-id" +#define FM_FMRI_AUTH_HOST "host-id" + +#define FM_AUTH_VERS0 0 +#define FM_FMRI_AUTH_VERSION FM_AUTH_VERS0 + +/* scheme name values */ +#define FM_FMRI_SCHEME_FMD "fmd" +#define FM_FMRI_SCHEME_DEV "dev" +#define FM_FMRI_SCHEME_HC "hc" +#define FM_FMRI_SCHEME_SVC "svc" +#define FM_FMRI_SCHEME_CPU "cpu" +#define FM_FMRI_SCHEME_MEM "mem" +#define FM_FMRI_SCHEME_MOD "mod" +#define FM_FMRI_SCHEME_PKG "pkg" +#define FM_FMRI_SCHEME_LEGACY "legacy-hc" +#define FM_FMRI_SCHEME_ZFS "zfs" +#define FM_FMRI_SCHEME_SW "sw" + +/* Scheme versions */ +#define FMD_SCHEME_VERSION0 0 +#define FM_FMD_SCHEME_VERSION FMD_SCHEME_VERSION0 +#define DEV_SCHEME_VERSION0 0 +#define FM_DEV_SCHEME_VERSION DEV_SCHEME_VERSION0 +#define FM_HC_VERS0 0 +#define FM_HC_SCHEME_VERSION FM_HC_VERS0 +#define CPU_SCHEME_VERSION0 0 +#define CPU_SCHEME_VERSION1 1 +#define FM_CPU_SCHEME_VERSION CPU_SCHEME_VERSION1 +#define MEM_SCHEME_VERSION0 0 +#define FM_MEM_SCHEME_VERSION MEM_SCHEME_VERSION0 +#define MOD_SCHEME_VERSION0 0 +#define FM_MOD_SCHEME_VERSION MOD_SCHEME_VERSION0 +#define PKG_SCHEME_VERSION0 0 +#define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 +#define LEGACY_SCHEME_VERSION0 0 +#define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define SVC_SCHEME_VERSION0 0 +#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 +#define ZFS_SCHEME_VERSION0 0 +#define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 +#define SW_SCHEME_VERSION0 0 +#define FM_SW_SCHEME_VERSION SW_SCHEME_VERSION0 + +/* hc scheme member names */ +#define FM_FMRI_HC_SERIAL_ID "serial" +#define FM_FMRI_HC_PART "part" +#define FM_FMRI_HC_REVISION "revision" +#define FM_FMRI_HC_ROOT "hc-root" +#define FM_FMRI_HC_LIST_SZ "hc-list-sz" +#define FM_FMRI_HC_LIST "hc-list" +#define FM_FMRI_HC_SPECIFIC "hc-specific" + +/* facility member names */ +#define FM_FMRI_FACILITY_NAME "facility-name" +#define FM_FMRI_FACILITY_TYPE "facility-type" + +/* hc-list version and member names */ +#define FM_FMRI_HC_NAME "hc-name" +#define FM_FMRI_HC_ID "hc-id" + +#define HC_LIST_VERSION0 0 +#define FM_HC_LIST_VERSION HC_LIST_VERSION0 + +/* hc-specific member names */ +#define FM_FMRI_HC_SPECIFIC_OFFSET "offset" +#define FM_FMRI_HC_SPECIFIC_PHYSADDR 
"physaddr" + +/* fmd module scheme member names */ +#define FM_FMRI_FMD_NAME "mod-name" +#define FM_FMRI_FMD_VERSION "mod-version" + +/* dev scheme member names */ +#define FM_FMRI_DEV_ID "devid" +#define FM_FMRI_DEV_TGTPTLUN0 "target-port-l0id" +#define FM_FMRI_DEV_PATH "device-path" + +/* pkg scheme member names */ +#define FM_FMRI_PKG_BASEDIR "pkg-basedir" +#define FM_FMRI_PKG_INST "pkg-inst" +#define FM_FMRI_PKG_VERSION "pkg-version" + +/* svc scheme member names */ +#define FM_FMRI_SVC_NAME "svc-name" +#define FM_FMRI_SVC_INSTANCE "svc-instance" +#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id" + +/* svc-authority member names */ +#define FM_FMRI_SVC_AUTH_SCOPE "scope" +#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn" + +/* cpu scheme member names */ +#define FM_FMRI_CPU_ID "cpuid" +#define FM_FMRI_CPU_SERIAL_ID "serial" +#define FM_FMRI_CPU_MASK "cpumask" +#define FM_FMRI_CPU_VID "cpuvid" +#define FM_FMRI_CPU_CPUFRU "cpufru" +#define FM_FMRI_CPU_CACHE_INDEX "cacheindex" +#define FM_FMRI_CPU_CACHE_WAY "cacheway" +#define FM_FMRI_CPU_CACHE_BIT "cachebit" +#define FM_FMRI_CPU_CACHE_TYPE "cachetype" + +#define FM_FMRI_CPU_CACHE_TYPE_L2 0 +#define FM_FMRI_CPU_CACHE_TYPE_L3 1 + +/* legacy-hc scheme member names */ +#define FM_FMRI_LEGACY_HC "component" +#define FM_FMRI_LEGACY_HC_PREFIX FM_FMRI_SCHEME_HC":///" \ + FM_FMRI_LEGACY_HC"=" + +/* mem scheme member names */ +#define FM_FMRI_MEM_UNUM "unum" +#define FM_FMRI_MEM_SERIAL_ID "serial" +#define FM_FMRI_MEM_PHYSADDR "physaddr" +#define FM_FMRI_MEM_MEMCONFIG "memconfig" +#define FM_FMRI_MEM_OFFSET "offset" + +/* mod scheme member names */ +#define FM_FMRI_MOD_PKG "mod-pkg" +#define FM_FMRI_MOD_NAME "mod-name" +#define FM_FMRI_MOD_ID "mod-id" +#define FM_FMRI_MOD_DESC "mod-desc" + +/* zfs scheme member names */ +#define FM_FMRI_ZFS_POOL "pool" +#define FM_FMRI_ZFS_VDEV "vdev" + +/* sw scheme member names - extra indentation for members of an nvlist */ +#define FM_FMRI_SW_OBJ "object" +#define FM_FMRI_SW_OBJ_PATH "path" +#define FM_FMRI_SW_OBJ_ROOT "root" +#define FM_FMRI_SW_OBJ_PKG "pkg" +#define FM_FMRI_SW_SITE "site" +#define FM_FMRI_SW_SITE_TOKEN "token" +#define FM_FMRI_SW_SITE_MODULE "module" +#define FM_FMRI_SW_SITE_FILE "file" +#define FM_FMRI_SW_SITE_LINE "line" +#define FM_FMRI_SW_SITE_FUNC "func" +#define FM_FMRI_SW_CTXT "context" +#define FM_FMRI_SW_CTXT_ORIGIN "origin" +#define FM_FMRI_SW_CTXT_EXECNAME "execname" +#define FM_FMRI_SW_CTXT_PID "pid" +#define FM_FMRI_SW_CTXT_ZONE "zone" +#define FM_FMRI_SW_CTXT_CTID "ctid" +#define FM_FMRI_SW_CTXT_STACK "stack" +#define FM_NVA_FREE 0 /* free allocator on nvlist_destroy */ +#define FM_NVA_RETAIN 1 /* keep allocator on nvlist_destroy */ + +extern nv_alloc_t *fm_nva_xcreate(char *, size_t); +extern void fm_nva_xdestroy(nv_alloc_t *); +extern nvlist_t *fm_nvlist_create(nv_alloc_t *); +extern void fm_nvlist_destroy(nvlist_t *, int); +extern void fm_ereport_set(nvlist_t *, int, const char *, uint64_t, + const nvlist_t *, ...); +extern void fm_payload_set(nvlist_t *, ...); +extern int i_fm_payload_set(nvlist_t *, const char *, va_list); +extern void fm_fmri_hc_set(nvlist_t *, int, const nvlist_t *, nvlist_t *, + int, ...); +extern void fm_fmri_dev_set(nvlist_t *, int, const nvlist_t *, const char *, + const char *, const char *); +extern void fm_fmri_de_set(nvlist_t *, int, const nvlist_t *, const char *); +extern void fm_fmri_cpu_set(nvlist_t *, int, const nvlist_t *, uint32_t, + uint8_t *, const char *); +extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, + 
const char *, uint64_t);
+extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t);
+extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *,
+    nvlist_t *, int, ...);
+
+extern uint64_t fm_ena_increment(uint64_t);
+extern uint64_t fm_ena_generate(uint64_t, uchar_t);
+extern uint64_t fm_ena_generate_cpu(uint64_t, processorid_t, uchar_t);
+extern uint64_t fm_ena_generation_get(uint64_t);
+extern uchar_t fm_ena_format_get(uint64_t);
+extern uint64_t fm_ena_id_get(uint64_t);
+extern uint64_t fm_ena_time_get(uint64_t);
+extern void fm_erpt_dropped_increment(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FM_PROTOCOL_H */
diff --git a/include/sys/fm/util.h b/include/sys/fm/util.h
new file mode 100644
index 000000000000..ff54b05bb6af
--- /dev/null
+++ b/include/sys/fm/util.h
@@ -0,0 +1,118 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _SYS_FM_UTIL_H
+#define _SYS_FM_UTIL_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/nvpair.h>
+
+/*
+ * Shared user/kernel definitions for class length, error channel name,
+ * and kernel event publisher string.
+ */
+#define FM_MAX_CLASS 100
+#define FM_ERROR_CHAN "com.sun:fm:error"
+#define FM_PUB "fm"
+
+/*
+ * ereport dump device transport support
+ *
+ * Ereports are written out to the dump device at a prescribed offset from the
+ * end, similar to in-transit log messages. The ereports are represented as an
+ * erpt_dump_t header followed by ed_size bytes of packed native nvlist data.
+ *
+ * NOTE: All of these constants and the header must be defined so they have the
+ * same representation for *both* 32-bit and 64-bit producers and consumers
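+ *
+ * As an illustrative sketch only (read_dump() is a hypothetical helper, not
+ * part of this interface), a consumer could walk the saved records like
+ * this, stopping at a zeroed magic value or a checksum mismatch:
+ *
+ *	erpt_dump_t ed;
+ *	while (read_dump(&ed, sizeof (ed)) == 0 && ed.ed_magic == ERPT_MAGIC) {
+ *		if (read_dump(buf, ed.ed_size) != 0 ||
+ *		    checksum32(buf, ed.ed_size) != ed.ed_chksum)
+ *			break;
+ *	}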
+ */ +#define ERPT_MAGIC 0xf00d4eddU +#define ERPT_MAX_ERRS 16 +#define ERPT_DATA_SZ (6 * 1024) +#define ERPT_EVCH_MAX 256 +#define ERPT_HIWAT 64 + +typedef struct erpt_dump { + uint32_t ed_magic; /* ERPT_MAGIC or zero to indicate end */ + uint32_t ed_chksum; /* checksum32() of packed nvlist data */ + uint32_t ed_size; /* ereport (nvl) fixed buf size */ + uint32_t ed_pad; /* reserved for future use */ + hrtime_t ed_hrt_nsec; /* hrtime of this ereport */ + hrtime_t ed_hrt_base; /* hrtime sample corresponding to ed_tod_base */ + struct { + uint64_t sec; /* seconds since gettimeofday() Epoch */ + uint64_t nsec; /* nanoseconds past ed_tod_base.sec */ + } ed_tod_base; +} erpt_dump_t; + +#ifdef _KERNEL + +#define ZEVENT_SHUTDOWN 0x1 + +typedef void zevent_cb_t(nvlist_t *, nvlist_t *); + +typedef struct zevent_s { + nvlist_t *ev_nvl; /* protected by the zevent_lock */ + nvlist_t *ev_detector; /* " */ + list_t ev_ze_list; /* " */ + list_node_t ev_node; /* " */ + zevent_cb_t *ev_cb; /* " */ + uint64_t ev_eid; +} zevent_t; + +typedef struct zfs_zevent { + zevent_t *ze_zevent; /* protected by the zevent_lock */ + list_node_t ze_node; /* " */ + uint64_t ze_dropped; /* " */ +} zfs_zevent_t; + +extern void fm_init(void); +extern void fm_fini(void); +extern void fm_nvprint(nvlist_t *); +extern void zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector); +extern int zfs_zevent_post(nvlist_t *, nvlist_t *, zevent_cb_t *); +extern void zfs_zevent_drain_all(int *); +extern int zfs_zevent_fd_hold(int, minor_t *, zfs_zevent_t **); +extern void zfs_zevent_fd_rele(int); +extern int zfs_zevent_next(zfs_zevent_t *, nvlist_t **, uint64_t *, uint64_t *); +extern int zfs_zevent_wait(zfs_zevent_t *); +extern int zfs_zevent_seek(zfs_zevent_t *, uint64_t); +extern void zfs_zevent_init(zfs_zevent_t **); +extern void zfs_zevent_destroy(zfs_zevent_t *); + +#else + +static inline void fm_init(void) { } +static inline void fm_fini(void) { } + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FM_UTIL_H */ diff --git a/include/sys/frame.h b/include/sys/frame.h new file mode 100644 index 000000000000..2865dbb57dc3 --- /dev/null +++ b/include/sys/frame.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2017 by Lawrence Livermore National Security, LLC. 
+ */
+
+#ifndef _SYS_FRAME_H
+#define _SYS_FRAME_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__KERNEL__) && defined(HAVE_STACK_FRAME_NON_STANDARD)
+#include <linux/frame.h>
+#else
+#define STACK_FRAME_NON_STANDARD(func)
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_FRAME_H */
diff --git a/include/sys/fs/Makefile.am b/include/sys/fs/Makefile.am
new file mode 100644
index 000000000000..6a93053c8e2b
--- /dev/null
+++ b/include/sys/fs/Makefile.am
@@ -0,0 +1,14 @@
+COMMON_H = \
+	zfs.h
+
+if CONFIG_USER
+libzfsdir = $(includedir)/libzfs/sys/fs
+libzfs_HEADERS = $(COMMON_H)
+endif
+
+if CONFIG_KERNEL
+if BUILD_LINUX
+kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/fs
+kernel_HEADERS = $(COMMON_H)
+endif
+endif
diff --git a/include/sys/fs/zfs.h b/include/sys/fs/zfs.h
new file mode 100644
index 000000000000..d3acd674a6c6
--- /dev/null
+++ b/include/sys/fs/zfs.h
@@ -0,0 +1,1589 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2013, 2017 Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019 Datto Inc.
+ */
+
+/* Portions Copyright 2010 Robert Milkowski */
+
+#ifndef _SYS_FS_ZFS_H
+#define _SYS_FS_ZFS_H
+
+#include <sys/time.h>
+#include <sys/zio_priority.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Types and constants shared between userland and the kernel.
+ */
+
+/*
+ * Each dataset can be one of the following types. These constants can be
+ * combined into masks that can be passed to various functions.
+ */
+typedef enum {
+	ZFS_TYPE_FILESYSTEM = (1 << 0),
+	ZFS_TYPE_SNAPSHOT = (1 << 1),
+	ZFS_TYPE_VOLUME = (1 << 2),
+	ZFS_TYPE_POOL = (1 << 3),
+	ZFS_TYPE_BOOKMARK = (1 << 4)
+} zfs_type_t;
+
+/*
+ * NB: lzc_dataset_type should be updated whenever a new objset type is added,
+ * if it represents a real type of a dataset that can be created from userland.
+ */
+typedef enum dmu_objset_type {
+	DMU_OST_NONE,
+	DMU_OST_META,
+	DMU_OST_ZFS,
+	DMU_OST_ZVOL,
+	DMU_OST_OTHER,	/* For testing only! */
+	DMU_OST_ANY,	/* Be careful! */
+	DMU_OST_NUMTYPES
+} dmu_objset_type_t;
+
+#define ZFS_TYPE_DATASET \
+	(ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT)
+
+/*
+ * All of these include the terminating NUL byte.
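+ *
+ * For example (illustrative sketch only), a caller copying a dataset name
+ * can size its buffer with the constant below and let strlcpy() detect
+ * truncation:
+ *
+ *	char name[ZFS_MAX_DATASET_NAME_LEN];
+ *	if (strlcpy(name, src, sizeof (name)) >= sizeof (name))
+ *		return (ENAMETOOLONG);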
+ */ +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN (1024 * 8) +#define ZAP_OLDMAXVALUELEN 1024 +#define ZFS_MAX_DATASET_NAME_LEN 256 + +/* + * Dataset properties are identified by these constants and must be added to + * the end of this list to ensure that external consumers are not affected + * by the change. If you make any changes to this list, be sure to update + * the property table in module/zcommon/zfs_prop.c. + */ +typedef enum { + ZPROP_CONT = -2, + ZPROP_INVAL = -1, + ZFS_PROP_TYPE = 0, + ZFS_PROP_CREATION, + ZFS_PROP_USED, + ZFS_PROP_AVAILABLE, + ZFS_PROP_REFERENCED, + ZFS_PROP_COMPRESSRATIO, + ZFS_PROP_MOUNTED, + ZFS_PROP_ORIGIN, + ZFS_PROP_QUOTA, + ZFS_PROP_RESERVATION, + ZFS_PROP_VOLSIZE, + ZFS_PROP_VOLBLOCKSIZE, + ZFS_PROP_RECORDSIZE, + ZFS_PROP_MOUNTPOINT, + ZFS_PROP_SHARENFS, + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_ATIME, + ZFS_PROP_DEVICES, + ZFS_PROP_EXEC, + ZFS_PROP_SETUID, + ZFS_PROP_READONLY, + ZFS_PROP_ZONED, + ZFS_PROP_SNAPDIR, + ZFS_PROP_ACLMODE, + ZFS_PROP_ACLINHERIT, + ZFS_PROP_CREATETXG, + ZFS_PROP_NAME, /* not exposed to the user */ + ZFS_PROP_CANMOUNT, + ZFS_PROP_ISCSIOPTIONS, /* not exposed to the user */ + ZFS_PROP_XATTR, + ZFS_PROP_NUMCLONES, /* not exposed to the user */ + ZFS_PROP_COPIES, + ZFS_PROP_VERSION, + ZFS_PROP_UTF8ONLY, + ZFS_PROP_NORMALIZE, + ZFS_PROP_CASE, + ZFS_PROP_VSCAN, + ZFS_PROP_NBMAND, + ZFS_PROP_SHARESMB, + ZFS_PROP_REFQUOTA, + ZFS_PROP_REFRESERVATION, + ZFS_PROP_GUID, + ZFS_PROP_PRIMARYCACHE, + ZFS_PROP_SECONDARYCACHE, + ZFS_PROP_USEDSNAP, + ZFS_PROP_USEDDS, + ZFS_PROP_USEDCHILD, + ZFS_PROP_USEDREFRESERV, + ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ + ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ + ZFS_PROP_DEFER_DESTROY, + ZFS_PROP_USERREFS, + ZFS_PROP_LOGBIAS, + ZFS_PROP_UNIQUE, /* not exposed to the user */ + ZFS_PROP_OBJSETID, + ZFS_PROP_DEDUP, + ZFS_PROP_MLSLABEL, + ZFS_PROP_SYNC, + ZFS_PROP_DNODESIZE, + ZFS_PROP_REFRATIO, + ZFS_PROP_WRITTEN, + ZFS_PROP_CLONES, + ZFS_PROP_LOGICALUSED, + ZFS_PROP_LOGICALREFERENCED, + ZFS_PROP_INCONSISTENT, /* not exposed to the user */ + ZFS_PROP_VOLMODE, + ZFS_PROP_FILESYSTEM_LIMIT, + ZFS_PROP_SNAPSHOT_LIMIT, + ZFS_PROP_FILESYSTEM_COUNT, + ZFS_PROP_SNAPSHOT_COUNT, + ZFS_PROP_SNAPDEV, + ZFS_PROP_ACLTYPE, + ZFS_PROP_SELINUX_CONTEXT, + ZFS_PROP_SELINUX_FSCONTEXT, + ZFS_PROP_SELINUX_DEFCONTEXT, + ZFS_PROP_SELINUX_ROOTCONTEXT, + ZFS_PROP_RELATIME, + ZFS_PROP_REDUNDANT_METADATA, + ZFS_PROP_OVERLAY, + ZFS_PROP_PREV_SNAP, + ZFS_PROP_RECEIVE_RESUME_TOKEN, + ZFS_PROP_ENCRYPTION, + ZFS_PROP_KEYLOCATION, + ZFS_PROP_KEYFORMAT, + ZFS_PROP_PBKDF2_SALT, + ZFS_PROP_PBKDF2_ITERS, + ZFS_PROP_ENCRYPTION_ROOT, + ZFS_PROP_KEY_GUID, + ZFS_PROP_KEYSTATUS, + ZFS_PROP_REMAPTXG, /* obsolete - no longer used */ + ZFS_PROP_SPECIAL_SMALL_BLOCKS, + ZFS_PROP_IVSET_GUID, /* not exposed to the user */ + ZFS_PROP_REDACTED, + ZFS_PROP_REDACT_SNAPS, + ZFS_NUM_PROPS +} zfs_prop_t; + +typedef enum { + ZFS_PROP_USERUSED, + ZFS_PROP_USERQUOTA, + ZFS_PROP_GROUPUSED, + ZFS_PROP_GROUPQUOTA, + ZFS_PROP_USEROBJUSED, + ZFS_PROP_USEROBJQUOTA, + ZFS_PROP_GROUPOBJUSED, + ZFS_PROP_GROUPOBJQUOTA, + ZFS_PROP_PROJECTUSED, + ZFS_PROP_PROJECTQUOTA, + ZFS_PROP_PROJECTOBJUSED, + ZFS_PROP_PROJECTOBJQUOTA, + ZFS_NUM_USERQUOTA_PROPS +} zfs_userquota_prop_t; + +extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; + +/* + * Pool properties are identified by these constants and must be added to the + * end of this list to ensure that external consumers are not affected + * by the change. 
Properties must be registered in zfs_prop_init(). + */ +typedef enum { + ZPOOL_PROP_INVAL = -1, + ZPOOL_PROP_NAME, + ZPOOL_PROP_SIZE, + ZPOOL_PROP_CAPACITY, + ZPOOL_PROP_ALTROOT, + ZPOOL_PROP_HEALTH, + ZPOOL_PROP_GUID, + ZPOOL_PROP_VERSION, + ZPOOL_PROP_BOOTFS, + ZPOOL_PROP_DELEGATION, + ZPOOL_PROP_AUTOREPLACE, + ZPOOL_PROP_CACHEFILE, + ZPOOL_PROP_FAILUREMODE, + ZPOOL_PROP_LISTSNAPS, + ZPOOL_PROP_AUTOEXPAND, + ZPOOL_PROP_DEDUPDITTO, + ZPOOL_PROP_DEDUPRATIO, + ZPOOL_PROP_FREE, + ZPOOL_PROP_ALLOCATED, + ZPOOL_PROP_READONLY, + ZPOOL_PROP_ASHIFT, + ZPOOL_PROP_COMMENT, + ZPOOL_PROP_EXPANDSZ, + ZPOOL_PROP_FREEING, + ZPOOL_PROP_FRAGMENTATION, + ZPOOL_PROP_LEAKED, + ZPOOL_PROP_MAXBLOCKSIZE, + ZPOOL_PROP_TNAME, + ZPOOL_PROP_MAXDNODESIZE, + ZPOOL_PROP_MULTIHOST, + ZPOOL_PROP_CHECKPOINT, + ZPOOL_PROP_LOAD_GUID, + ZPOOL_PROP_AUTOTRIM, + ZPOOL_NUM_PROPS +} zpool_prop_t; + +/* Small enough to not hog a whole line of printout in zpool(1M). */ +#define ZPROP_MAX_COMMENT 32 + +#define ZPROP_VALUE "value" +#define ZPROP_SOURCE "source" + +typedef enum { + ZPROP_SRC_NONE = 0x1, + ZPROP_SRC_DEFAULT = 0x2, + ZPROP_SRC_TEMPORARY = 0x4, + ZPROP_SRC_LOCAL = 0x8, + ZPROP_SRC_INHERITED = 0x10, + ZPROP_SRC_RECEIVED = 0x20 +} zprop_source_t; + +#define ZPROP_SRC_ALL 0x3f + +#define ZPROP_SOURCE_VAL_RECVD "$recvd" +#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" + +/* + * Dataset flag implemented as a special entry in the props zap object + * indicating that the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties + * just as it did in earlier versions, and thereafter, local properties are + * preserved. + */ +#define ZPROP_HAS_RECVD "$hasrecvd" + +typedef enum { + ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ + ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ +} zprop_errflags_t; + +typedef int (*zprop_func)(int, void *); + +/* + * Properties to be set on the root file system of a new pool + * are stuffed into their own nvlist, which is then included in + * the properties nvlist with the pool properties. + */ +#define ZPOOL_ROOTFS_PROPS "root-props-nvl" + +/* + * Length of 'written@' and 'written#' + */ +#define ZFS_WRITTEN_PROP_PREFIX_LEN 8 + +/* + * Dataset property functions shared between libzfs and kernel. + */ +const char *zfs_prop_default_string(zfs_prop_t); +uint64_t zfs_prop_default_numeric(zfs_prop_t); +boolean_t zfs_prop_readonly(zfs_prop_t); +boolean_t zfs_prop_visible(zfs_prop_t prop); +boolean_t zfs_prop_inheritable(zfs_prop_t); +boolean_t zfs_prop_setonce(zfs_prop_t); +boolean_t zfs_prop_encryption_key_param(zfs_prop_t); +boolean_t zfs_prop_valid_keylocation(const char *, boolean_t); +const char *zfs_prop_to_name(zfs_prop_t); +zfs_prop_t zfs_name_to_prop(const char *); +boolean_t zfs_prop_user(const char *); +boolean_t zfs_prop_userquota(const char *); +boolean_t zfs_prop_written(const char *); +int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); +int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); +uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); +boolean_t zfs_prop_valid_for_type(int, zfs_type_t, boolean_t); + +/* + * Pool property functions shared between libzfs and kernel. 
+ */ +zpool_prop_t zpool_name_to_prop(const char *); +const char *zpool_prop_to_name(zpool_prop_t); +const char *zpool_prop_default_string(zpool_prop_t); +uint64_t zpool_prop_default_numeric(zpool_prop_t); +boolean_t zpool_prop_readonly(zpool_prop_t); +boolean_t zpool_prop_setonce(zpool_prop_t); +boolean_t zpool_prop_feature(const char *); +boolean_t zpool_prop_unsupported(const char *); +int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); +int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); +uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); + +/* + * Definitions for the Delegation. + */ +typedef enum { + ZFS_DELEG_WHO_UNKNOWN = 0, + ZFS_DELEG_USER = 'u', + ZFS_DELEG_USER_SETS = 'U', + ZFS_DELEG_GROUP = 'g', + ZFS_DELEG_GROUP_SETS = 'G', + ZFS_DELEG_EVERYONE = 'e', + ZFS_DELEG_EVERYONE_SETS = 'E', + ZFS_DELEG_CREATE = 'c', + ZFS_DELEG_CREATE_SETS = 'C', + ZFS_DELEG_NAMED_SET = 's', + ZFS_DELEG_NAMED_SET_SETS = 'S' +} zfs_deleg_who_type_t; + +typedef enum { + ZFS_DELEG_NONE = 0, + ZFS_DELEG_PERM_LOCAL = 1, + ZFS_DELEG_PERM_DESCENDENT = 2, + ZFS_DELEG_PERM_LOCALDESCENDENT = 3, + ZFS_DELEG_PERM_CREATE = 4 +} zfs_deleg_inherit_t; + +#define ZFS_DELEG_PERM_UID "uid" +#define ZFS_DELEG_PERM_GID "gid" +#define ZFS_DELEG_PERM_GROUPS "groups" + +#define ZFS_MLSLABEL_DEFAULT "none" + +#define ZFS_SMB_ACL_SRC "src" +#define ZFS_SMB_ACL_TARGET "target" + +typedef enum { + ZFS_CANMOUNT_OFF = 0, + ZFS_CANMOUNT_ON = 1, + ZFS_CANMOUNT_NOAUTO = 2 +} zfs_canmount_type_t; + +typedef enum { + ZFS_LOGBIAS_LATENCY = 0, + ZFS_LOGBIAS_THROUGHPUT = 1 +} zfs_logbias_op_t; + +typedef enum zfs_share_op { + ZFS_SHARE_NFS = 0, + ZFS_UNSHARE_NFS = 1, + ZFS_SHARE_SMB = 2, + ZFS_UNSHARE_SMB = 3 +} zfs_share_op_t; + +typedef enum zfs_smb_acl_op { + ZFS_SMB_ACL_ADD, + ZFS_SMB_ACL_REMOVE, + ZFS_SMB_ACL_RENAME, + ZFS_SMB_ACL_PURGE +} zfs_smb_acl_op_t; + +typedef enum zfs_cache_type { + ZFS_CACHE_NONE = 0, + ZFS_CACHE_METADATA = 1, + ZFS_CACHE_ALL = 2 +} zfs_cache_type_t; + +typedef enum { + ZFS_SYNC_STANDARD = 0, + ZFS_SYNC_ALWAYS = 1, + ZFS_SYNC_DISABLED = 2 +} zfs_sync_type_t; + +typedef enum { + ZFS_XATTR_OFF = 0, + ZFS_XATTR_DIR = 1, + ZFS_XATTR_SA = 2 +} zfs_xattr_type_t; + +typedef enum { + ZFS_DNSIZE_LEGACY = 0, + ZFS_DNSIZE_AUTO = 1, + ZFS_DNSIZE_1K = 1024, + ZFS_DNSIZE_2K = 2048, + ZFS_DNSIZE_4K = 4096, + ZFS_DNSIZE_8K = 8192, + ZFS_DNSIZE_16K = 16384 +} zfs_dnsize_type_t; + +typedef enum { + ZFS_REDUNDANT_METADATA_ALL, + ZFS_REDUNDANT_METADATA_MOST +} zfs_redundant_metadata_type_t; + +typedef enum { + ZFS_VOLMODE_DEFAULT = 0, + ZFS_VOLMODE_GEOM = 1, + ZFS_VOLMODE_DEV = 2, + ZFS_VOLMODE_NONE = 3 +} zfs_volmode_t; + +typedef enum zfs_keystatus { + ZFS_KEYSTATUS_NONE = 0, + ZFS_KEYSTATUS_UNAVAILABLE, + ZFS_KEYSTATUS_AVAILABLE, +} zfs_keystatus_t; + +typedef enum zfs_keyformat { + ZFS_KEYFORMAT_NONE = 0, + ZFS_KEYFORMAT_RAW, + ZFS_KEYFORMAT_HEX, + ZFS_KEYFORMAT_PASSPHRASE, + ZFS_KEYFORMAT_FORMATS +} zfs_keyformat_t; + +typedef enum zfs_key_location { + ZFS_KEYLOCATION_NONE = 0, + ZFS_KEYLOCATION_PROMPT, + ZFS_KEYLOCATION_URI, + ZFS_KEYLOCATION_LOCATIONS +} zfs_keylocation_t; + +#define DEFAULT_PBKDF2_ITERATIONS 350000 +#define MIN_PBKDF2_ITERATIONS 100000 + +/* + * On-disk version number. 
+ */ +#define SPA_VERSION_1 1ULL +#define SPA_VERSION_2 2ULL +#define SPA_VERSION_3 3ULL +#define SPA_VERSION_4 4ULL +#define SPA_VERSION_5 5ULL +#define SPA_VERSION_6 6ULL +#define SPA_VERSION_7 7ULL +#define SPA_VERSION_8 8ULL +#define SPA_VERSION_9 9ULL +#define SPA_VERSION_10 10ULL +#define SPA_VERSION_11 11ULL +#define SPA_VERSION_12 12ULL +#define SPA_VERSION_13 13ULL +#define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL +#define SPA_VERSION_16 16ULL +#define SPA_VERSION_17 17ULL +#define SPA_VERSION_18 18ULL +#define SPA_VERSION_19 19ULL +#define SPA_VERSION_20 20ULL +#define SPA_VERSION_21 21ULL +#define SPA_VERSION_22 22ULL +#define SPA_VERSION_23 23ULL +#define SPA_VERSION_24 24ULL +#define SPA_VERSION_25 25ULL +#define SPA_VERSION_26 26ULL +#define SPA_VERSION_27 27ULL +#define SPA_VERSION_28 28ULL +#define SPA_VERSION_5000 5000ULL + +/* + * The incrementing pool version number has been replaced by pool feature + * flags. For more details, see zfeature.c. + */ +#define SPA_VERSION SPA_VERSION_5000 +#define SPA_VERSION_STRING "5000" + +/* + * Symbolic names for the changes that caused a SPA_VERSION switch. + * Used in the code when checking for presence or absence of a feature. + * Feel free to define multiple symbolic names for each version if there + * were multiple changes to on-disk structures during that version. + * + * NOTE: When checking the current SPA_VERSION in your code, be sure + * to use spa_version() since it reports the version of the + * last synced uberblock. Checking the in-flight version can + * be dangerous in some cases. + */ +#define SPA_VERSION_INITIAL SPA_VERSION_1 +#define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 +#define SPA_VERSION_SPARES SPA_VERSION_3 +#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 +#define SPA_VERSION_BPOBJ_ACCOUNT SPA_VERSION_3 +#define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 +#define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 +#define SPA_VERSION_ZPOOL_HISTORY SPA_VERSION_4 +#define SPA_VERSION_GZIP_COMPRESSION SPA_VERSION_5 +#define SPA_VERSION_BOOTFS SPA_VERSION_6 +#define SPA_VERSION_SLOGS SPA_VERSION_7 +#define SPA_VERSION_DELEGATED_PERMS SPA_VERSION_8 +#define SPA_VERSION_FUID SPA_VERSION_9 +#define SPA_VERSION_REFRESERVATION SPA_VERSION_9 +#define SPA_VERSION_REFQUOTA SPA_VERSION_9 +#define SPA_VERSION_UNIQUE_ACCURATE SPA_VERSION_9 +#define SPA_VERSION_L2CACHE SPA_VERSION_10 +#define SPA_VERSION_NEXT_CLONES SPA_VERSION_11 +#define SPA_VERSION_ORIGIN SPA_VERSION_11 +#define SPA_VERSION_DSL_SCRUB SPA_VERSION_11 +#define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 +#define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 +#define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 +#define SPA_VERSION_STMF_PROP SPA_VERSION_16 +#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 +#define SPA_VERSION_USERREFS SPA_VERSION_18 +#define SPA_VERSION_HOLES SPA_VERSION_19 +#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 +#define SPA_VERSION_DEDUP SPA_VERSION_21 +#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 +#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 +#define SPA_VERSION_SA SPA_VERSION_24 +#define SPA_VERSION_SCAN SPA_VERSION_25 +#define SPA_VERSION_DIR_CLONES SPA_VERSION_26 +#define SPA_VERSION_DEADLISTS SPA_VERSION_26 +#define SPA_VERSION_FAST_SNAP SPA_VERSION_27 +#define SPA_VERSION_MULTI_REPLACE SPA_VERSION_28 +#define SPA_VERSION_BEFORE_FEATURES SPA_VERSION_28 +#define SPA_VERSION_FEATURES SPA_VERSION_5000 + +#define SPA_VERSION_IS_SUPPORTED(v) \ + (((v) >= SPA_VERSION_INITIAL && (v) <= SPA_VERSION_BEFORE_FEATURES) || \ + 
((v) >= SPA_VERSION_FEATURES && (v) <= SPA_VERSION))
+
+/*
+ * ZPL version - rev'd whenever an incompatible on-disk format change
+ * occurs. This is independent of SPA/DMU/ZAP versioning. You must
+ * also update the version_table[] and help message in zfs_prop.c.
+ */
+#define ZPL_VERSION_1 1ULL
+#define ZPL_VERSION_2 2ULL
+#define ZPL_VERSION_3 3ULL
+#define ZPL_VERSION_4 4ULL
+#define ZPL_VERSION_5 5ULL
+#define ZPL_VERSION ZPL_VERSION_5
+#define ZPL_VERSION_STRING "5"
+
+#define ZPL_VERSION_INITIAL ZPL_VERSION_1
+#define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2
+#define ZPL_VERSION_FUID ZPL_VERSION_3
+#define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3
+#define ZPL_VERSION_SYSATTR ZPL_VERSION_3
+#define ZPL_VERSION_USERSPACE ZPL_VERSION_4
+#define ZPL_VERSION_SA ZPL_VERSION_5
+
+/* Persistent L2ARC version */
+#define L2ARC_PERSISTENT_VERSION_1 1ULL
+#define L2ARC_PERSISTENT_VERSION L2ARC_PERSISTENT_VERSION_1
+#define L2ARC_PERSISTENT_VERSION_STRING "1"
+
+/* Rewind policy information */
+#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */
+#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */
+#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */
+#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */
+#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */
+#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */
+#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */
+
+typedef struct zpool_load_policy {
+	uint32_t zlp_rewind;	/* rewind policy requested */
+	uint64_t zlp_maxmeta;	/* max acceptable meta-data errors */
+	uint64_t zlp_maxdata;	/* max acceptable data errors */
+	uint64_t zlp_txg;	/* specific txg to load */
+} zpool_load_policy_t;
+
+/*
+ * The following are configuration names used in the nvlist describing a pool's
+ * configuration. New on-disk names should be prefixed with "<reversed-DNS>:"
+ * (e.g. "org.openzfs:") to avoid conflicting names being developed
+ * independently.
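+ *
+ * For example (illustrative sketch only), a consumer reading the pool name
+ * and GUID out of a config nvlist would use these names as keys:
+ *
+ *	char *name;
+ *	uint64_t guid;
+ *	VERIFY0(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name));
+ *	VERIFY0(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid));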
+ */ +#define ZPOOL_CONFIG_VERSION "version" +#define ZPOOL_CONFIG_POOL_NAME "name" +#define ZPOOL_CONFIG_POOL_STATE "state" +#define ZPOOL_CONFIG_POOL_TXG "txg" +#define ZPOOL_CONFIG_POOL_GUID "pool_guid" +#define ZPOOL_CONFIG_CREATE_TXG "create_txg" +#define ZPOOL_CONFIG_TOP_GUID "top_guid" +#define ZPOOL_CONFIG_VDEV_TREE "vdev_tree" +#define ZPOOL_CONFIG_TYPE "type" +#define ZPOOL_CONFIG_CHILDREN "children" +#define ZPOOL_CONFIG_ID "id" +#define ZPOOL_CONFIG_GUID "guid" +#define ZPOOL_CONFIG_INDIRECT_OBJECT "com.delphix:indirect_object" +#define ZPOOL_CONFIG_INDIRECT_BIRTHS "com.delphix:indirect_births" +#define ZPOOL_CONFIG_PREV_INDIRECT_VDEV "com.delphix:prev_indirect_vdev" +#define ZPOOL_CONFIG_PATH "path" +#define ZPOOL_CONFIG_DEVID "devid" +#define ZPOOL_CONFIG_METASLAB_ARRAY "metaslab_array" +#define ZPOOL_CONFIG_METASLAB_SHIFT "metaslab_shift" +#define ZPOOL_CONFIG_ASHIFT "ashift" +#define ZPOOL_CONFIG_ASIZE "asize" +#define ZPOOL_CONFIG_DTL "DTL" +#define ZPOOL_CONFIG_SCAN_STATS "scan_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_REMOVAL_STATS "removal_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_CHECKPOINT_STATS "checkpoint_stats" /* not on disk */ +#define ZPOOL_CONFIG_VDEV_STATS "vdev_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_INDIRECT_SIZE "indirect_size" /* not stored on disk */ + +/* container nvlist of extended stats */ +#define ZPOOL_CONFIG_VDEV_STATS_EX "vdev_stats_ex" + +/* Active queue read/write stats */ +#define ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE "vdev_sync_r_active_queue" +#define ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE "vdev_sync_w_active_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE "vdev_async_r_active_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE "vdev_async_w_active_queue" +#define ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE "vdev_async_scrub_active_queue" +#define ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE "vdev_async_trim_active_queue" + +/* Queue sizes */ +#define ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE "vdev_sync_r_pend_queue" +#define ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE "vdev_sync_w_pend_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE "vdev_async_r_pend_queue" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE "vdev_async_w_pend_queue" +#define ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE "vdev_async_scrub_pend_queue" +#define ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE "vdev_async_trim_pend_queue" + +/* Latency read/write histogram stats */ +#define ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO "vdev_tot_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO "vdev_tot_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO "vdev_disk_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO "vdev_disk_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO "vdev_sync_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO "vdev_sync_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO "vdev_async_r_lat_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO "vdev_async_w_lat_histo" +#define ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO "vdev_scrub_histo" +#define ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO "vdev_trim_histo" + +/* Request size histograms */ +#define ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO "vdev_sync_ind_r_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO "vdev_sync_ind_w_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO "vdev_async_ind_r_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO "vdev_async_ind_w_histo" +#define ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO "vdev_ind_scrub_histo" +#define ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO "vdev_ind_trim_histo" +#define 
ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO "vdev_sync_agg_r_histo" +#define ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO "vdev_sync_agg_w_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO "vdev_async_agg_r_histo" +#define ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO "vdev_async_agg_w_histo" +#define ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO "vdev_agg_scrub_histo" +#define ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO "vdev_agg_trim_histo" + +/* Number of slow IOs */ +#define ZPOOL_CONFIG_VDEV_SLOW_IOS "vdev_slow_ios" + +/* vdev enclosure sysfs path */ +#define ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH "vdev_enc_sysfs_path" + +#define ZPOOL_CONFIG_WHOLE_DISK "whole_disk" +#define ZPOOL_CONFIG_ERRCOUNT "error_count" +#define ZPOOL_CONFIG_NOT_PRESENT "not_present" +#define ZPOOL_CONFIG_SPARES "spares" +#define ZPOOL_CONFIG_IS_SPARE "is_spare" +#define ZPOOL_CONFIG_NPARITY "nparity" +#define ZPOOL_CONFIG_HOSTID "hostid" +#define ZPOOL_CONFIG_HOSTNAME "hostname" +#define ZPOOL_CONFIG_LOADED_TIME "initial_load_time" +#define ZPOOL_CONFIG_UNSPARE "unspare" +#define ZPOOL_CONFIG_PHYS_PATH "phys_path" +#define ZPOOL_CONFIG_IS_LOG "is_log" +#define ZPOOL_CONFIG_L2CACHE "l2cache" +#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" +#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" +#define ZPOOL_CONFIG_IS_HOLE "is_hole" +#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" +#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" +#define ZPOOL_CONFIG_DDT_STATS "ddt_stats" +#define ZPOOL_CONFIG_SPLIT "splitcfg" +#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" +#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" +#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" +#define ZPOOL_CONFIG_REMOVING "removing" +#define ZPOOL_CONFIG_RESILVER_TXG "resilver_txg" +#define ZPOOL_CONFIG_REBUILD_TXG "rebuild_txg" +#define ZPOOL_CONFIG_COMMENT "comment" +#define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ +#define ZPOOL_CONFIG_SUSPENDED_REASON "suspended_reason" /* not stored */ +#define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ +#define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ +#define ZPOOL_CONFIG_MISSING_DEVICES "missing_vdevs" /* not stored on disk */ +#define ZPOOL_CONFIG_LOAD_INFO "load_info" /* not stored on disk */ +#define ZPOOL_CONFIG_REWIND_INFO "rewind_info" /* not stored on disk */ +#define ZPOOL_CONFIG_UNSUP_FEAT "unsup_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_ENABLED_FEAT "enabled_feat" /* not stored on disk */ +#define ZPOOL_CONFIG_CAN_RDONLY "can_rdonly" /* not stored on disk */ +#define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_FEATURE_STATS "feature_stats" /* not stored on disk */ +#define ZPOOL_CONFIG_ERRATA "errata" /* not stored on disk */ +#define ZPOOL_CONFIG_VDEV_TOP_ZAP "com.delphix:vdev_zap_top" +#define ZPOOL_CONFIG_VDEV_LEAF_ZAP "com.delphix:vdev_zap_leaf" +#define ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS "com.delphix:has_per_vdev_zaps" +#define ZPOOL_CONFIG_RESILVER_DEFER "com.datto:resilver_defer" +#define ZPOOL_CONFIG_CACHEFILE "cachefile" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_STATE "mmp_state" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_TXG "mmp_txg" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_SEQ "mmp_seq" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_HOSTNAME "mmp_hostname" /* not stored on disk */ +#define ZPOOL_CONFIG_MMP_HOSTID "mmp_hostid" /* not stored on disk */ +#define ZPOOL_CONFIG_ALLOCATION_BIAS "alloc_bias" /* not stored on disk */ +#define ZPOOL_CONFIG_EXPANSION_TIME "expansion_time" /* not stored */ +#define ZPOOL_CONFIG_REBUILD_STATS 
"org.openzfs:rebuild_stats" + +/* + * The persistent vdev state is stored as separate values rather than a single + * 'vdev_state' entry. This is because a device can be in multiple states, such + * as offline and degraded. + */ +#define ZPOOL_CONFIG_OFFLINE "offline" +#define ZPOOL_CONFIG_FAULTED "faulted" +#define ZPOOL_CONFIG_DEGRADED "degraded" +#define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" +#define ZPOOL_CONFIG_AUX_STATE "aux_state" + +/* Pool load policy parameters */ +#define ZPOOL_LOAD_POLICY "load-policy" +#define ZPOOL_LOAD_REWIND_POLICY "load-rewind-policy" +#define ZPOOL_LOAD_REQUEST_TXG "load-request-txg" +#define ZPOOL_LOAD_META_THRESH "load-meta-thresh" +#define ZPOOL_LOAD_DATA_THRESH "load-data-thresh" + +/* Rewind data discovered */ +#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" +#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" +#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" + +#define VDEV_TYPE_ROOT "root" +#define VDEV_TYPE_MIRROR "mirror" +#define VDEV_TYPE_REPLACING "replacing" +#define VDEV_TYPE_RAIDZ "raidz" +#define VDEV_TYPE_DISK "disk" +#define VDEV_TYPE_FILE "file" +#define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_HOLE "hole" +#define VDEV_TYPE_SPARE "spare" +#define VDEV_TYPE_LOG "log" +#define VDEV_TYPE_L2CACHE "l2cache" +#define VDEV_TYPE_INDIRECT "indirect" + +/* VDEV_TOP_ZAP_* are used in top-level vdev ZAP objects. */ +#define VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM \ + "com.delphix:indirect_obsolete_sm" +#define VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE \ + "com.delphix:obsolete_counts_are_precise" +#define VDEV_TOP_ZAP_POOL_CHECKPOINT_SM \ + "com.delphix:pool_checkpoint_sm" +#define VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS \ + "com.delphix:ms_unflushed_phys_txgs" + +#define VDEV_TOP_ZAP_VDEV_REBUILD_PHYS \ + "org.openzfs:vdev_rebuild" + +#define VDEV_TOP_ZAP_ALLOCATION_BIAS \ + "org.zfsonlinux:allocation_bias" + +/* vdev metaslab allocation bias */ +#define VDEV_ALLOC_BIAS_LOG "log" +#define VDEV_ALLOC_BIAS_SPECIAL "special" +#define VDEV_ALLOC_BIAS_DEDUP "dedup" + +/* vdev initialize state */ +#define VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET \ + "com.delphix:next_offset_to_initialize" +#define VDEV_LEAF_ZAP_INITIALIZE_STATE \ + "com.delphix:vdev_initialize_state" +#define VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME \ + "com.delphix:vdev_initialize_action_time" + +/* vdev TRIM state */ +#define VDEV_LEAF_ZAP_TRIM_LAST_OFFSET \ + "org.zfsonlinux:next_offset_to_trim" +#define VDEV_LEAF_ZAP_TRIM_STATE \ + "org.zfsonlinux:vdev_trim_state" +#define VDEV_LEAF_ZAP_TRIM_ACTION_TIME \ + "org.zfsonlinux:vdev_trim_action_time" +#define VDEV_LEAF_ZAP_TRIM_RATE \ + "org.zfsonlinux:vdev_trim_rate" +#define VDEV_LEAF_ZAP_TRIM_PARTIAL \ + "org.zfsonlinux:vdev_trim_partial" +#define VDEV_LEAF_ZAP_TRIM_SECURE \ + "org.zfsonlinux:vdev_trim_secure" + +/* + * This is needed in userland to report the minimum necessary device size. + */ +#define SPA_MINDEVSIZE (64ULL << 20) + +/* + * Set if the fragmentation has not yet been calculated. This can happen + * because the space maps have not been upgraded or the histogram feature + * is not enabled. + */ +#define ZFS_FRAG_INVALID UINT64_MAX + +/* + * The location of the pool configuration repository, shared between kernel and + * userland. + */ +#define ZPOOL_CACHE_BOOT "/boot/zfs/zpool.cache" +#define ZPOOL_CACHE "/etc/zfs/zpool.cache" +/* + * vdev states are ordered from least to most healthy. + * A vdev that's CANT_OPEN or below is considered unusable. 
+ */ +typedef enum vdev_state { + VDEV_STATE_UNKNOWN = 0, /* Uninitialized vdev */ + VDEV_STATE_CLOSED, /* Not currently open */ + VDEV_STATE_OFFLINE, /* Not allowed to open */ + VDEV_STATE_REMOVED, /* Explicitly removed from system */ + VDEV_STATE_CANT_OPEN, /* Tried to open, but failed */ + VDEV_STATE_FAULTED, /* External request to fault device */ + VDEV_STATE_DEGRADED, /* Replicated vdev with unhealthy kids */ + VDEV_STATE_HEALTHY /* Presumed good */ +} vdev_state_t; + +#define VDEV_STATE_ONLINE VDEV_STATE_HEALTHY + +/* + * vdev aux states. When a vdev is in the CANT_OPEN state, the aux field + * of the vdev stats structure uses these constants to distinguish why. + */ +typedef enum vdev_aux { + VDEV_AUX_NONE, /* no error */ + VDEV_AUX_OPEN_FAILED, /* ldi_open_*() or vn_open() failed */ + VDEV_AUX_CORRUPT_DATA, /* bad label or disk contents */ + VDEV_AUX_NO_REPLICAS, /* insufficient number of replicas */ + VDEV_AUX_BAD_GUID_SUM, /* vdev guid sum doesn't match */ + VDEV_AUX_TOO_SMALL, /* vdev size is too small */ + VDEV_AUX_BAD_LABEL, /* the label is OK but invalid */ + VDEV_AUX_VERSION_NEWER, /* on-disk version is too new */ + VDEV_AUX_VERSION_OLDER, /* on-disk version is too old */ + VDEV_AUX_UNSUP_FEAT, /* unsupported features */ + VDEV_AUX_SPARED, /* hot spare used in another pool */ + VDEV_AUX_ERR_EXCEEDED, /* too many errors */ + VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ + VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ + VDEV_AUX_EXTERNAL, /* external diagnosis or forced fault */ + VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ + VDEV_AUX_BAD_ASHIFT, /* vdev ashift is invalid */ + VDEV_AUX_EXTERNAL_PERSIST, /* persistent forced fault */ + VDEV_AUX_ACTIVE, /* vdev active on a different host */ + VDEV_AUX_CHILDREN_OFFLINE, /* all children are offline */ + VDEV_AUX_ASHIFT_TOO_BIG, /* vdev's min block size is too large */ +} vdev_aux_t; + +/* + * pool state. The following states are written to disk as part of the normal + * SPA lifecycle: ACTIVE, EXPORTED, DESTROYED, SPARE, L2CACHE. The remaining + * states are software abstractions used at various levels to communicate + * pool state. + */ +typedef enum pool_state { + POOL_STATE_ACTIVE = 0, /* In active use */ + POOL_STATE_EXPORTED, /* Explicitly exported */ + POOL_STATE_DESTROYED, /* Explicitly destroyed */ + POOL_STATE_SPARE, /* Reserved for hot spare use */ + POOL_STATE_L2CACHE, /* Level 2 ARC device */ + POOL_STATE_UNINITIALIZED, /* Internal spa_t state */ + POOL_STATE_UNAVAIL, /* Internal libzfs state */ + POOL_STATE_POTENTIALLY_ACTIVE /* Internal libzfs state */ +} pool_state_t; + +/* + * mmp state. The following states provide additional detail describing + * why a pool couldn't be safely imported. + */ +typedef enum mmp_state { + MMP_STATE_ACTIVE = 0, /* In active use */ + MMP_STATE_INACTIVE, /* Inactive and safe to import */ + MMP_STATE_NO_HOSTID /* System hostid is not set */ +} mmp_state_t; + +/* + * Scan Functions. + */ +typedef enum pool_scan_func { + POOL_SCAN_NONE, + POOL_SCAN_SCRUB, + POOL_SCAN_RESILVER, + POOL_SCAN_FUNCS +} pool_scan_func_t; + +/* + * Used to control scrub pause and resume. 
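+ *
+ * For example, libzfs passes these values to zpool_scan() to pause and
+ * resume an in-progress scrub (illustrative sketch only):
+ *
+ *	(void) zpool_scan(zhp, POOL_SCAN_SCRUB, POOL_SCRUB_PAUSE);
+ *	(void) zpool_scan(zhp, POOL_SCAN_SCRUB, POOL_SCRUB_NORMAL);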
+ */ +typedef enum pool_scrub_cmd { + POOL_SCRUB_NORMAL = 0, + POOL_SCRUB_PAUSE, + POOL_SCRUB_FLAGS_END +} pool_scrub_cmd_t; + +typedef enum { + CS_NONE, + CS_CHECKPOINT_EXISTS, + CS_CHECKPOINT_DISCARDING, + CS_NUM_STATES +} checkpoint_state_t; + +typedef struct pool_checkpoint_stat { + uint64_t pcs_state; /* checkpoint_state_t */ + uint64_t pcs_start_time; /* time checkpoint/discard started */ + uint64_t pcs_space; /* checkpointed space */ +} pool_checkpoint_stat_t; + +/* + * ZIO types. Needed to interpret vdev statistics below. + */ +typedef enum zio_type { + ZIO_TYPE_NULL = 0, + ZIO_TYPE_READ, + ZIO_TYPE_WRITE, + ZIO_TYPE_FREE, + ZIO_TYPE_CLAIM, + ZIO_TYPE_IOCTL, + ZIO_TYPE_TRIM, + ZIO_TYPES +} zio_type_t; + +/* + * Pool statistics. Note: all fields should be 64-bit because this + * is passed between kernel and userland as an nvlist uint64 array. + */ +typedef struct pool_scan_stat { + /* values stored on disk */ + uint64_t pss_func; /* pool_scan_func_t */ + uint64_t pss_state; /* dsl_scan_state_t */ + uint64_t pss_start_time; /* scan start time */ + uint64_t pss_end_time; /* scan end time */ + uint64_t pss_to_examine; /* total bytes to scan */ + uint64_t pss_examined; /* total bytes located by scanner */ + uint64_t pss_to_process; /* total bytes to process */ + uint64_t pss_processed; /* total processed bytes */ + uint64_t pss_errors; /* scan errors */ + + /* values not stored on disk */ + uint64_t pss_pass_exam; /* examined bytes per scan pass */ + uint64_t pss_pass_start; /* start time of a scan pass */ + uint64_t pss_pass_scrub_pause; /* pause time of a scrub pass */ + /* cumulative time scrub spent paused, needed for rate calculation */ + uint64_t pss_pass_scrub_spent_paused; + uint64_t pss_pass_issued; /* issued bytes per scan pass */ + uint64_t pss_issued; /* total bytes checked by scanner */ +} pool_scan_stat_t; + +typedef struct pool_removal_stat { + uint64_t prs_state; /* dsl_scan_state_t */ + uint64_t prs_removing_vdev; + uint64_t prs_start_time; + uint64_t prs_end_time; + uint64_t prs_to_copy; /* bytes that need to be copied */ + uint64_t prs_copied; /* bytes copied so far */ + /* + * bytes of memory used for indirect mappings. + * This includes all removed vdevs. + */ + uint64_t prs_mapping_memory; +} pool_removal_stat_t; + +typedef enum dsl_scan_state { + DSS_NONE, + DSS_SCANNING, + DSS_FINISHED, + DSS_CANCELED, + DSS_NUM_STATES +} dsl_scan_state_t; + +typedef struct vdev_rebuild_stat { + uint64_t vrs_state; /* vdev_rebuild_state_t */ + uint64_t vrs_start_time; /* time_t */ + uint64_t vrs_end_time; /* time_t */ + uint64_t vrs_scan_time_ms; /* total run time (millisecs) */ + uint64_t vrs_bytes_scanned; /* allocated bytes scanned */ + uint64_t vrs_bytes_issued; /* read bytes issued */ + uint64_t vrs_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrs_bytes_est; /* total bytes to scan */ + uint64_t vrs_errors; /* scanning errors */ + uint64_t vrs_pass_time_ms; /* pass run time (millisecs) */ + uint64_t vrs_pass_bytes_scanned; /* bytes scanned since start/resume */ + uint64_t vrs_pass_bytes_issued; /* bytes rebuilt since start/resume */ +} vdev_rebuild_stat_t; + +/* + * Errata described by https://zfsonlinux.org/msg/ZFS-8000-ER. The ordering + * of this enum must be maintained to ensure the errata identifiers map to + * the correct documentation. New errata may only be appended to the list + * and must contain corresponding documentation at the above link. 
+ */ +typedef enum zpool_errata { + ZPOOL_ERRATA_NONE, + ZPOOL_ERRATA_ZOL_2094_SCRUB, + ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY, + ZPOOL_ERRATA_ZOL_6845_ENCRYPTION, + ZPOOL_ERRATA_ZOL_8308_ENCRYPTION, +} zpool_errata_t; + +/* + * Vdev statistics. Note: all fields should be 64-bit because this + * is passed between kernel and user land as an nvlist uint64 array. + * + * The vs_ops[] and vs_bytes[] arrays must always be an array size of 6 in + * order to keep subsequent members at their known fixed offsets. When + * adding a new field it must be added to the end the structure. + */ +#define VS_ZIO_TYPES 6 + +typedef struct vdev_stat { + hrtime_t vs_timestamp; /* time since vdev load */ + uint64_t vs_state; /* vdev state */ + uint64_t vs_aux; /* see vdev_aux_t */ + uint64_t vs_alloc; /* space allocated */ + uint64_t vs_space; /* total capacity */ + uint64_t vs_dspace; /* deflated capacity */ + uint64_t vs_rsize; /* replaceable dev size */ + uint64_t vs_esize; /* expandable dev size */ + uint64_t vs_ops[VS_ZIO_TYPES]; /* operation count */ + uint64_t vs_bytes[VS_ZIO_TYPES]; /* bytes read/written */ + uint64_t vs_read_errors; /* read errors */ + uint64_t vs_write_errors; /* write errors */ + uint64_t vs_checksum_errors; /* checksum errors */ + uint64_t vs_initialize_errors; /* initializing errors */ + uint64_t vs_self_healed; /* self-healed bytes */ + uint64_t vs_scan_removing; /* removing? */ + uint64_t vs_scan_processed; /* scan processed bytes */ + uint64_t vs_fragmentation; /* device fragmentation */ + uint64_t vs_initialize_bytes_done; /* bytes initialized */ + uint64_t vs_initialize_bytes_est; /* total bytes to initialize */ + uint64_t vs_initialize_state; /* vdev_initializing_state_t */ + uint64_t vs_initialize_action_time; /* time_t */ + uint64_t vs_checkpoint_space; /* checkpoint-consumed space */ + uint64_t vs_resilver_deferred; /* resilver deferred */ + uint64_t vs_slow_ios; /* slow IOs */ + uint64_t vs_trim_errors; /* trimming errors */ + uint64_t vs_trim_notsup; /* supported by device */ + uint64_t vs_trim_bytes_done; /* bytes trimmed */ + uint64_t vs_trim_bytes_est; /* total bytes to trim */ + uint64_t vs_trim_state; /* vdev_trim_state_t */ + uint64_t vs_trim_action_time; /* time_t */ + uint64_t vs_rebuild_processed; /* bytes rebuilt */ + uint64_t vs_configured_ashift; /* TLV vdev_ashift */ + uint64_t vs_logical_ashift; /* vdev_logical_ashift */ + uint64_t vs_physical_ashift; /* vdev_physical_ashift */ +} vdev_stat_t; + +/* BEGIN CSTYLED */ +#define VDEV_STAT_VALID(field, uint64_t_field_count) \ + ((uint64_t_field_count * sizeof (uint64_t)) >= \ + (offsetof(vdev_stat_t, field) + sizeof (((vdev_stat_t *)NULL)->field))) +/* END CSTYLED */ + +/* + * Extended stats + * + * These are stats which aren't included in the original iostat output. For + * convenience, they are grouped together in vdev_stat_ex, although each stat + * is individually exported as an nvlist. + */ +typedef struct vdev_stat_ex { + /* Number of ZIOs issued to disk and waiting to finish */ + uint64_t vsx_active_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; + + /* Number of ZIOs pending to be issued to disk */ + uint64_t vsx_pend_queue[ZIO_PRIORITY_NUM_QUEUEABLE]; + + /* + * Below are the histograms for various latencies. Buckets are in + * units of nanoseconds. + */ + + /* + * 2^37 nanoseconds = 134s. Timeouts will probably start kicking in + * before this. 
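+ *
+ * Buckets are indexed by the most significant bit of the value, using the
+ * HISTO() macros defined below. For example (illustrative sketch only), a
+ * 4096ns (2^12) latency increments bucket 12:
+ *
+ *	vsx->vsx_total_histo[ZIO_TYPE_READ][L_HISTO(4096)]++;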
+ */ +#define VDEV_L_HISTO_BUCKETS 37 /* Latency histo buckets */ +#define VDEV_RQ_HISTO_BUCKETS 25 /* Request size histo buckets */ + + /* Amount of time in ZIO queue (ns) */ + uint64_t vsx_queue_histo[ZIO_PRIORITY_NUM_QUEUEABLE] + [VDEV_L_HISTO_BUCKETS]; + + /* Total ZIO latency (ns). Includes queuing and disk access time */ + uint64_t vsx_total_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; + + /* Amount of time to read/write the disk (ns) */ + uint64_t vsx_disk_histo[ZIO_TYPES][VDEV_L_HISTO_BUCKETS]; + + /* "lookup the bucket for a value" histogram macros */ +#define HISTO(val, buckets) (val != 0 ? MIN(highbit64(val) - 1, \ + buckets - 1) : 0) +#define L_HISTO(a) HISTO(a, VDEV_L_HISTO_BUCKETS) +#define RQ_HISTO(a) HISTO(a, VDEV_RQ_HISTO_BUCKETS) + + /* Physical IO histogram */ + uint64_t vsx_ind_histo[ZIO_PRIORITY_NUM_QUEUEABLE] + [VDEV_RQ_HISTO_BUCKETS]; + + /* Delegated (aggregated) physical IO histogram */ + uint64_t vsx_agg_histo[ZIO_PRIORITY_NUM_QUEUEABLE] + [VDEV_RQ_HISTO_BUCKETS]; + +} vdev_stat_ex_t; + +/* + * Initialize functions. + */ +typedef enum pool_initialize_func { + POOL_INITIALIZE_START, + POOL_INITIALIZE_CANCEL, + POOL_INITIALIZE_SUSPEND, + POOL_INITIALIZE_FUNCS +} pool_initialize_func_t; + +/* + * TRIM functions. + */ +typedef enum pool_trim_func { + POOL_TRIM_START, + POOL_TRIM_CANCEL, + POOL_TRIM_SUSPEND, + POOL_TRIM_FUNCS +} pool_trim_func_t; + +/* + * DDT statistics. Note: all fields should be 64-bit because this + * is passed between kernel and userland as an nvlist uint64 array. + */ +typedef struct ddt_object { + uint64_t ddo_count; /* number of elements in ddt */ + uint64_t ddo_dspace; /* size of ddt on disk */ + uint64_t ddo_mspace; /* size of ddt in-core */ +} ddt_object_t; + +typedef struct ddt_stat { + uint64_t dds_blocks; /* blocks */ + uint64_t dds_lsize; /* logical size */ + uint64_t dds_psize; /* physical size */ + uint64_t dds_dsize; /* deflated allocated size */ + uint64_t dds_ref_blocks; /* referenced blocks */ + uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ + uint64_t dds_ref_psize; /* referenced psize * refcnt */ + uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ +} ddt_stat_t; + +typedef struct ddt_histogram { + ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ +} ddt_histogram_t; + +#define ZVOL_DRIVER "zvol" +#define ZFS_DRIVER "zfs" +#define ZFS_DEV "/dev/zfs" + +#define ZFS_SUPER_MAGIC 0x2fc12fc1 + +/* general zvol path */ +#define ZVOL_DIR "/dev/zvol/" + +#define ZVOL_MAJOR 230 +#define ZVOL_MINOR_BITS 4 +#define ZVOL_MINOR_MASK ((1U << ZVOL_MINOR_BITS) - 1) +#define ZVOL_MINORS (1 << 4) +#define ZVOL_DEV_NAME "zd" + +#define ZVOL_PROP_NAME "name" +#define ZVOL_DEFAULT_BLOCKSIZE 8192 + +typedef enum { + VDEV_INITIALIZE_NONE, + VDEV_INITIALIZE_ACTIVE, + VDEV_INITIALIZE_CANCELED, + VDEV_INITIALIZE_SUSPENDED, + VDEV_INITIALIZE_COMPLETE +} vdev_initializing_state_t; + +typedef enum { + VDEV_TRIM_NONE, + VDEV_TRIM_ACTIVE, + VDEV_TRIM_CANCELED, + VDEV_TRIM_SUSPENDED, + VDEV_TRIM_COMPLETE, +} vdev_trim_state_t; + +typedef enum { + VDEV_REBUILD_NONE, + VDEV_REBUILD_ACTIVE, + VDEV_REBUILD_CANCELED, + VDEV_REBUILD_COMPLETE, +} vdev_rebuild_state_t; + +/* + * nvlist name constants. Facilitate restricting snapshot iteration range for + * the "list next snapshot" ioctl + */ +#define SNAP_ITER_MIN_TXG "snap_iter_min_txg" +#define SNAP_ITER_MAX_TXG "snap_iter_max_txg" + +/* + * /dev/zfs ioctl numbers. + * + * These numbers cannot change over time. New ioctl numbers must be appended. 
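+ *
+ * On Linux the commands are numbered up from ('Z' << 8), so, for example,
+ * ZFS_IOC_OBJSET_STATS below is 0x5a12; on FreeBSD numbering starts at
+ * zero instead. A userland consumer issues them against /dev/zfs, e.g.
+ * (illustrative sketch only):
+ *
+ *	zfs_cmd_t zc = {"\0"};
+ *	(void) strlcpy(zc.zc_name, "tank/fs", sizeof (zc.zc_name));
+ *	error = ioctl(fd, ZFS_IOC_OBJSET_STATS, &zc);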
+ */ +typedef enum zfs_ioc { + /* + * Core features - 81/128 numbers reserved. + */ +#ifdef __FreeBSD__ + ZFS_IOC_FIRST = 0, +#else + ZFS_IOC_FIRST = ('Z' << 8), +#endif + ZFS_IOC = ZFS_IOC_FIRST, + ZFS_IOC_POOL_CREATE = ZFS_IOC_FIRST, /* 0x5a00 */ + ZFS_IOC_POOL_DESTROY, /* 0x5a01 */ + ZFS_IOC_POOL_IMPORT, /* 0x5a02 */ + ZFS_IOC_POOL_EXPORT, /* 0x5a03 */ + ZFS_IOC_POOL_CONFIGS, /* 0x5a04 */ + ZFS_IOC_POOL_STATS, /* 0x5a05 */ + ZFS_IOC_POOL_TRYIMPORT, /* 0x5a06 */ + ZFS_IOC_POOL_SCAN, /* 0x5a07 */ + ZFS_IOC_POOL_FREEZE, /* 0x5a08 */ + ZFS_IOC_POOL_UPGRADE, /* 0x5a09 */ + ZFS_IOC_POOL_GET_HISTORY, /* 0x5a0a */ + ZFS_IOC_VDEV_ADD, /* 0x5a0b */ + ZFS_IOC_VDEV_REMOVE, /* 0x5a0c */ + ZFS_IOC_VDEV_SET_STATE, /* 0x5a0d */ + ZFS_IOC_VDEV_ATTACH, /* 0x5a0e */ + ZFS_IOC_VDEV_DETACH, /* 0x5a0f */ + ZFS_IOC_VDEV_SETPATH, /* 0x5a10 */ + ZFS_IOC_VDEV_SETFRU, /* 0x5a11 */ + ZFS_IOC_OBJSET_STATS, /* 0x5a12 */ + ZFS_IOC_OBJSET_ZPLPROPS, /* 0x5a13 */ + ZFS_IOC_DATASET_LIST_NEXT, /* 0x5a14 */ + ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x5a15 */ + ZFS_IOC_SET_PROP, /* 0x5a16 */ + ZFS_IOC_CREATE, /* 0x5a17 */ + ZFS_IOC_DESTROY, /* 0x5a18 */ + ZFS_IOC_ROLLBACK, /* 0x5a19 */ + ZFS_IOC_RENAME, /* 0x5a1a */ + ZFS_IOC_RECV, /* 0x5a1b */ + ZFS_IOC_SEND, /* 0x5a1c */ + ZFS_IOC_INJECT_FAULT, /* 0x5a1d */ + ZFS_IOC_CLEAR_FAULT, /* 0x5a1e */ + ZFS_IOC_INJECT_LIST_NEXT, /* 0x5a1f */ + ZFS_IOC_ERROR_LOG, /* 0x5a20 */ + ZFS_IOC_CLEAR, /* 0x5a21 */ + ZFS_IOC_PROMOTE, /* 0x5a22 */ + ZFS_IOC_SNAPSHOT, /* 0x5a23 */ + ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x5a24 */ + ZFS_IOC_OBJ_TO_PATH, /* 0x5a25 */ + ZFS_IOC_POOL_SET_PROPS, /* 0x5a26 */ + ZFS_IOC_POOL_GET_PROPS, /* 0x5a27 */ + ZFS_IOC_SET_FSACL, /* 0x5a28 */ + ZFS_IOC_GET_FSACL, /* 0x5a29 */ + ZFS_IOC_SHARE, /* 0x5a2a */ + ZFS_IOC_INHERIT_PROP, /* 0x5a2b */ + ZFS_IOC_SMB_ACL, /* 0x5a2c */ + ZFS_IOC_USERSPACE_ONE, /* 0x5a2d */ + ZFS_IOC_USERSPACE_MANY, /* 0x5a2e */ + ZFS_IOC_USERSPACE_UPGRADE, /* 0x5a2f */ + ZFS_IOC_HOLD, /* 0x5a30 */ + ZFS_IOC_RELEASE, /* 0x5a31 */ + ZFS_IOC_GET_HOLDS, /* 0x5a32 */ + ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x5a33 */ + ZFS_IOC_VDEV_SPLIT, /* 0x5a34 */ + ZFS_IOC_NEXT_OBJ, /* 0x5a35 */ + ZFS_IOC_DIFF, /* 0x5a36 */ + ZFS_IOC_TMP_SNAPSHOT, /* 0x5a37 */ + ZFS_IOC_OBJ_TO_STATS, /* 0x5a38 */ + ZFS_IOC_SPACE_WRITTEN, /* 0x5a39 */ + ZFS_IOC_SPACE_SNAPS, /* 0x5a3a */ + ZFS_IOC_DESTROY_SNAPS, /* 0x5a3b */ + ZFS_IOC_POOL_REGUID, /* 0x5a3c */ + ZFS_IOC_POOL_REOPEN, /* 0x5a3d */ + ZFS_IOC_SEND_PROGRESS, /* 0x5a3e */ + ZFS_IOC_LOG_HISTORY, /* 0x5a3f */ + ZFS_IOC_SEND_NEW, /* 0x5a40 */ + ZFS_IOC_SEND_SPACE, /* 0x5a41 */ + ZFS_IOC_CLONE, /* 0x5a42 */ + ZFS_IOC_BOOKMARK, /* 0x5a43 */ + ZFS_IOC_GET_BOOKMARKS, /* 0x5a44 */ + ZFS_IOC_DESTROY_BOOKMARKS, /* 0x5a45 */ + ZFS_IOC_RECV_NEW, /* 0x5a46 */ + ZFS_IOC_POOL_SYNC, /* 0x5a47 */ + ZFS_IOC_CHANNEL_PROGRAM, /* 0x5a48 */ + ZFS_IOC_LOAD_KEY, /* 0x5a49 */ + ZFS_IOC_UNLOAD_KEY, /* 0x5a4a */ + ZFS_IOC_CHANGE_KEY, /* 0x5a4b */ + ZFS_IOC_REMAP, /* 0x5a4c */ + ZFS_IOC_POOL_CHECKPOINT, /* 0x5a4d */ + ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x5a4e */ + ZFS_IOC_POOL_INITIALIZE, /* 0x5a4f */ + ZFS_IOC_POOL_TRIM, /* 0x5a50 */ + ZFS_IOC_REDACT, /* 0x5a51 */ + ZFS_IOC_GET_BOOKMARK_PROPS, /* 0x5a52 */ + ZFS_IOC_WAIT, /* 0x5a53 */ + ZFS_IOC_WAIT_FS, /* 0x5a54 */ + + /* + * Per-platform (Optional) - 8/128 numbers reserved. 
+ */ + ZFS_IOC_PLATFORM = ZFS_IOC_FIRST + 0x80, + ZFS_IOC_EVENTS_NEXT, /* 0x81 (Linux) */ + ZFS_IOC_EVENTS_CLEAR, /* 0x82 (Linux) */ + ZFS_IOC_EVENTS_SEEK, /* 0x83 (Linux) */ + ZFS_IOC_NEXTBOOT, /* 0x84 (FreeBSD) */ + ZFS_IOC_JAIL, /* 0x85 (FreeBSD) */ + ZFS_IOC_UNJAIL, /* 0x86 (FreeBSD) */ + ZFS_IOC_SET_BOOTENV, /* 0x87 (Linux) */ + ZFS_IOC_GET_BOOTENV, /* 0x88 (Linux) */ + ZFS_IOC_LAST +} zfs_ioc_t; + +/* + * zvol ioctl to get dataset name + */ +#define BLKZNAME _IOR(0x12, 125, char[ZFS_MAX_DATASET_NAME_LEN]) + +/* + * ZFS-specific error codes used for returning descriptive errors + * to the userland through zfs ioctls. + * + * The enum implicitly includes all the error codes from errno.h. + * New code should use and extend this enum for errors that are + * not described precisely by generic errno codes. + * + * These numbers should not change over time. New entries should be appended. + * + * (Keep in sync with contrib/pyzfs/libzfs_core/_constants.py) + */ +typedef enum { + ZFS_ERR_CHECKPOINT_EXISTS = 1024, + ZFS_ERR_DISCARDING_CHECKPOINT, + ZFS_ERR_NO_CHECKPOINT, + ZFS_ERR_DEVRM_IN_PROGRESS, + ZFS_ERR_VDEV_TOO_BIG, + ZFS_ERR_IOC_CMD_UNAVAIL, + ZFS_ERR_IOC_ARG_UNAVAIL, + ZFS_ERR_IOC_ARG_REQUIRED, + ZFS_ERR_IOC_ARG_BADTYPE, + ZFS_ERR_WRONG_PARENT, + ZFS_ERR_FROM_IVSET_GUID_MISSING, + ZFS_ERR_FROM_IVSET_GUID_MISMATCH, + ZFS_ERR_SPILL_BLOCK_FLAG_MISSING, + ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE, + ZFS_ERR_EXPORT_IN_PROGRESS, + ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR, + ZFS_ERR_STREAM_TRUNCATED, + ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH, + ZFS_ERR_RESILVER_IN_PROGRESS, + ZFS_ERR_REBUILD_IN_PROGRESS, + ZFS_ERR_BADPROP, +} zfs_errno_t; + +/* + * Internal SPA load state. Used by FMA diagnosis engine. + */ +typedef enum { + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT, /* tryimport in progress */ + SPA_LOAD_RECOVER, /* recovery requested */ + SPA_LOAD_ERROR, /* load failed */ + SPA_LOAD_CREATE /* creation in progress */ +} spa_load_state_t; + +typedef enum { + ZPOOL_WAIT_CKPT_DISCARD, + ZPOOL_WAIT_FREE, + ZPOOL_WAIT_INITIALIZE, + ZPOOL_WAIT_REPLACE, + ZPOOL_WAIT_REMOVE, + ZPOOL_WAIT_RESILVER, + ZPOOL_WAIT_SCRUB, + ZPOOL_WAIT_TRIM, + ZPOOL_WAIT_NUM_ACTIVITIES +} zpool_wait_activity_t; + +typedef enum { + ZFS_WAIT_DELETEQ, + ZFS_WAIT_NUM_ACTIVITIES +} zfs_wait_activity_t; + +/* + * Bookmark name values. + */ +#define ZPOOL_ERR_LIST "error list" +#define ZPOOL_ERR_DATASET "dataset" +#define ZPOOL_ERR_OBJECT "object" + +#define HIS_MAX_RECORD_LEN (MAXPATHLEN + MAXPATHLEN + 1) + +/* + * The following are names used in the nvlist describing + * the pool's history log. + */ +#define ZPOOL_HIST_RECORD "history record" +#define ZPOOL_HIST_TIME "history time" +#define ZPOOL_HIST_CMD "history command" +#define ZPOOL_HIST_WHO "history who" +#define ZPOOL_HIST_ZONE "history zone" +#define ZPOOL_HIST_HOST "history hostname" +#define ZPOOL_HIST_TXG "history txg" +#define ZPOOL_HIST_INT_EVENT "history internal event" +#define ZPOOL_HIST_INT_STR "history internal str" +#define ZPOOL_HIST_INT_NAME "internal_name" +#define ZPOOL_HIST_IOCTL "ioctl" +#define ZPOOL_HIST_INPUT_NVL "in_nvl" +#define ZPOOL_HIST_OUTPUT_NVL "out_nvl" +#define ZPOOL_HIST_DSNAME "dsname" +#define ZPOOL_HIST_DSID "dsid" +#define ZPOOL_HIST_ERRNO "errno" + +/* + * Special nvlist name that will not have its args recorded in the pool's + * history log. 
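+ *
+ * For example (illustrative sketch only), sensitive arguments such as raw
+ * key material can be nested under this name so that they are elided from
+ * the recorded history:
+ *
+ *	fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, hidden_args);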
+ */ +#define ZPOOL_HIDDEN_ARGS "hidden_args" + +/* + * The following are names used when invoking ZFS_IOC_POOL_INITIALIZE. + */ +#define ZPOOL_INITIALIZE_COMMAND "initialize_command" +#define ZPOOL_INITIALIZE_VDEVS "initialize_vdevs" + +/* + * The following are names used when invoking ZFS_IOC_POOL_TRIM. + */ +#define ZPOOL_TRIM_COMMAND "trim_command" +#define ZPOOL_TRIM_VDEVS "trim_vdevs" +#define ZPOOL_TRIM_RATE "trim_rate" +#define ZPOOL_TRIM_SECURE "trim_secure" + +/* + * The following are names used when invoking ZFS_IOC_POOL_WAIT. + */ +#define ZPOOL_WAIT_ACTIVITY "wait_activity" +#define ZPOOL_WAIT_TAG "wait_tag" +#define ZPOOL_WAIT_WAITED "wait_waited" + +/* + * The following are names used when invoking ZFS_IOC_WAIT_FS. + */ +#define ZFS_WAIT_ACTIVITY "wait_activity" +#define ZFS_WAIT_WAITED "wait_waited" + +/* + * Flags for ZFS_IOC_VDEV_SET_STATE + */ +#define ZFS_ONLINE_CHECKREMOVE 0x1 +#define ZFS_ONLINE_UNSPARE 0x2 +#define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_ONLINE_EXPAND 0x8 +#define ZFS_OFFLINE_TEMPORARY 0x1 + +/* + * Flags for ZFS_IOC_POOL_IMPORT + */ +#define ZFS_IMPORT_NORMAL 0x0 +#define ZFS_IMPORT_VERBATIM 0x1 +#define ZFS_IMPORT_ANY_HOST 0x2 +#define ZFS_IMPORT_MISSING_LOG 0x4 +#define ZFS_IMPORT_ONLY 0x8 +#define ZFS_IMPORT_TEMP_NAME 0x10 +#define ZFS_IMPORT_SKIP_MMP 0x20 +#define ZFS_IMPORT_LOAD_KEYS 0x40 +#define ZFS_IMPORT_CHECKPOINT 0x80 + +/* + * Channel program argument/return nvlist keys and defaults. + */ +#define ZCP_ARG_PROGRAM "program" +#define ZCP_ARG_ARGLIST "arg" +#define ZCP_ARG_SYNC "sync" +#define ZCP_ARG_INSTRLIMIT "instrlimit" +#define ZCP_ARG_MEMLIMIT "memlimit" + +#define ZCP_ARG_CLIARGV "argv" + +#define ZCP_RET_ERROR "error" +#define ZCP_RET_RETURN "return" + +#define ZCP_DEFAULT_INSTRLIMIT (10 * 1000 * 1000) +#define ZCP_MAX_INSTRLIMIT (10 * ZCP_DEFAULT_INSTRLIMIT) +#define ZCP_DEFAULT_MEMLIMIT (10 * 1024 * 1024) +#define ZCP_MAX_MEMLIMIT (10 * ZCP_DEFAULT_MEMLIMIT) + +/* + * Sysevent payload members. ZFS will generate the following sysevents with the + * given payloads: + * + * ESC_ZFS_RESILVER_START + * ESC_ZFS_RESILVER_FINISH + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_RESILVER_TYPE DATA_TYPE_STRING + * + * ESC_ZFS_POOL_DESTROY + * ESC_ZFS_POOL_REGUID + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_VDEV_REMOVE + * ESC_ZFS_VDEV_CLEAR + * ESC_ZFS_VDEV_CHECK + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_VDEV_PATH DATA_TYPE_STRING (optional) + * ZFS_EV_VDEV_GUID DATA_TYPE_UINT64 + * + * ESC_ZFS_HISTORY_EVENT + * + * ZFS_EV_POOL_NAME DATA_TYPE_STRING + * ZFS_EV_POOL_GUID DATA_TYPE_UINT64 + * ZFS_EV_HIST_TIME DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_CMD DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_WHO DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_ZONE DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_HOST DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_TXG DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_INT_EVENT DATA_TYPE_UINT64 (optional) + * ZFS_EV_HIST_INT_STR DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_INT_NAME DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_IOCTL DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_DSNAME DATA_TYPE_STRING (optional) + * ZFS_EV_HIST_DSID DATA_TYPE_UINT64 (optional) + * + * The ZFS_EV_HIST_* members will correspond to the ZPOOL_HIST_* members in the + * history log nvlist. 
The keynames will be free of any spaces or other + * characters that could be potentially unexpected to consumers of the + * sysevents. + */ +#define ZFS_EV_POOL_NAME "pool_name" +#define ZFS_EV_POOL_GUID "pool_guid" +#define ZFS_EV_VDEV_PATH "vdev_path" +#define ZFS_EV_VDEV_GUID "vdev_guid" +#define ZFS_EV_HIST_TIME "history_time" +#define ZFS_EV_HIST_CMD "history_command" +#define ZFS_EV_HIST_WHO "history_who" +#define ZFS_EV_HIST_ZONE "history_zone" +#define ZFS_EV_HIST_HOST "history_hostname" +#define ZFS_EV_HIST_TXG "history_txg" +#define ZFS_EV_HIST_INT_EVENT "history_internal_event" +#define ZFS_EV_HIST_INT_STR "history_internal_str" +#define ZFS_EV_HIST_INT_NAME "history_internal_name" +#define ZFS_EV_HIST_IOCTL "history_ioctl" +#define ZFS_EV_HIST_DSNAME "history_dsname" +#define ZFS_EV_HIST_DSID "history_dsid" +#define ZFS_EV_RESILVER_TYPE "resilver_type" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_H */ diff --git a/include/sys/hkdf.h b/include/sys/hkdf.h new file mode 100644 index 000000000000..e0f7678c03d5 --- /dev/null +++ b/include/sys/hkdf.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#ifndef _SYS_HKDF_H_ +#define _SYS_HKDF_H_ + +#include + +int hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len); + +#endif /* _SYS_HKDF_H_ */ diff --git a/include/sys/lua/Makefile.am b/include/sys/lua/Makefile.am new file mode 100644 index 000000000000..8b4dafaa8cf7 --- /dev/null +++ b/include/sys/lua/Makefile.am @@ -0,0 +1,17 @@ +COMMON_H = \ + lua.h \ + luaconf.h \ + lualib.h \ + lauxlib.h + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys/lua +libzfs_HEADERS = $(COMMON_H) +endif + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/lua +kernel_HEADERS = $(COMMON_H) +endif +endif diff --git a/include/sys/lua/lauxlib.h b/include/sys/lua/lauxlib.h new file mode 100644 index 000000000000..503a99e9426d --- /dev/null +++ b/include/sys/lua/lauxlib.h @@ -0,0 +1,175 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lauxlib.h,v 1.120.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions for building Lua libraries +** See Copyright Notice in lua.h +*/ + + +#ifndef lauxlib_h +#define lauxlib_h + + +#include + + + +/* extra error code for `luaL_load' */ +#define LUA_ERRFILE (LUA_ERRERR+1) + + +typedef struct luaL_Reg { + const char *name; + lua_CFunction func; +} luaL_Reg; + + +LUALIB_API void (luaL_checkversion_) (lua_State *L, lua_Number ver); +#define luaL_checkversion(L) luaL_checkversion_(L, LUA_VERSION_NUM) + +LUALIB_API int (luaL_getmetafield) (lua_State *L, int obj, const char *e); +LUALIB_API int (luaL_callmeta) (lua_State *L, int obj, const char *e); +LUALIB_API const char *(luaL_tolstring) (lua_State *L, int idx, size_t *len); +LUALIB_API int (luaL_argerror) (lua_State *L, int numarg, const char *extramsg); +LUALIB_API const char *(luaL_checklstring) (lua_State *L, int numArg, + size_t *l); +LUALIB_API const char *(luaL_optlstring) 
(lua_State *L, int numArg, + const char *def, size_t *l); +LUALIB_API lua_Number (luaL_checknumber) (lua_State *L, int numArg); +LUALIB_API lua_Number (luaL_optnumber) (lua_State *L, int nArg, lua_Number def); + +LUALIB_API lua_Integer (luaL_checkinteger) (lua_State *L, int numArg); +LUALIB_API lua_Integer (luaL_optinteger) (lua_State *L, int nArg, + lua_Integer def); +LUALIB_API lua_Unsigned (luaL_checkunsigned) (lua_State *L, int numArg); +LUALIB_API lua_Unsigned (luaL_optunsigned) (lua_State *L, int numArg, + lua_Unsigned def); + +LUALIB_API void (luaL_checkstack) (lua_State *L, int sz, const char *msg); +LUALIB_API void (luaL_checktype) (lua_State *L, int narg, int t); +LUALIB_API void (luaL_checkany) (lua_State *L, int narg); + +LUALIB_API int (luaL_newmetatable) (lua_State *L, const char *tname); +LUALIB_API void (luaL_setmetatable) (lua_State *L, const char *tname); +LUALIB_API void *(luaL_testudata) (lua_State *L, int ud, const char *tname); +LUALIB_API void *(luaL_checkudata) (lua_State *L, int ud, const char *tname); + +LUALIB_API void (luaL_where) (lua_State *L, int lvl); +LUALIB_API int (luaL_error) (lua_State *L, const char *fmt, ...); + +LUALIB_API int (luaL_checkoption) (lua_State *L, int narg, const char *def, + const char *const lst[]); + +/* pre-defined references */ +#define LUA_NOREF (-2) +#define LUA_REFNIL (-1) + +LUALIB_API int (luaL_ref) (lua_State *L, int t); +LUALIB_API void (luaL_unref) (lua_State *L, int t, int ref); + +LUALIB_API int (luaL_loadbufferx) (lua_State *L, const char *buff, size_t sz, + const char *name, const char *mode); +LUALIB_API int (luaL_loadstring) (lua_State *L, const char *s); + +LUALIB_API int (luaL_len) (lua_State *L, int idx); + +LUALIB_API const char *(luaL_gsub) (lua_State *L, const char *s, const char *p, + const char *r); + +LUALIB_API void (luaL_setfuncs) (lua_State *L, const luaL_Reg *l, int nup); + +LUALIB_API int (luaL_getsubtable) (lua_State *L, int idx, const char *fname); + +LUALIB_API void (luaL_traceback) (lua_State *L, lua_State *L1, + const char *msg, int level); + +LUALIB_API void (luaL_requiref) (lua_State *L, const char *modname, + lua_CFunction openf, int glb); + +/* +** =============================================================== +** some useful macros +** =============================================================== +*/ + + +#define luaL_newlibtable(L,l) \ + lua_createtable(L, 0, sizeof(l)/sizeof((l)[0]) - 1) + +#define luaL_newlib(L,l) (luaL_newlibtable(L,l), luaL_setfuncs(L,l,0)) + +#define luaL_argcheck(L, cond,numarg,extramsg) \ + ((void)((cond) || luaL_argerror(L, (numarg), (extramsg)))) +#define luaL_checkstring(L,n) (luaL_checklstring(L, (n), NULL)) +#define luaL_optstring(L,n,d) (luaL_optlstring(L, (n), (d), NULL)) +#define luaL_checkint(L,n) ((int)luaL_checkinteger(L, (n))) +#define luaL_optint(L,n,d) ((int)luaL_optinteger(L, (n), (d))) +#define luaL_checklong(L,n) ((long)luaL_checkinteger(L, (n))) +#define luaL_optlong(L,n,d) ((long)luaL_optinteger(L, (n), (d))) + +#define luaL_typename(L,i) lua_typename(L, lua_type(L,(i))) + +#define luaL_dofile(L, fn) \ + (luaL_loadfile(L, fn) || lua_pcall(L, 0, LUA_MULTRET, 0)) + +#define luaL_dostring(L, s) \ + (luaL_loadstring(L, s) || lua_pcall(L, 0, LUA_MULTRET, 0)) + +#define luaL_getmetatable(L,n) (lua_getfield(L, LUA_REGISTRYINDEX, (n))) + +#define luaL_opt(L,f,n,d) (lua_isnoneornil(L,(n)) ? 
(d) : f(L,(n))) + +#define luaL_loadbuffer(L,s,sz,n) luaL_loadbufferx(L,s,sz,n,NULL) + + +/* +** {====================================================== +** Generic Buffer manipulation +** ======================================================= +*/ + +typedef struct luaL_Buffer { + char *b; /* buffer address */ + size_t size; /* buffer size */ + size_t n; /* number of characters in buffer */ + lua_State *L; + char initb[LUAL_BUFFERSIZE]; /* initial buffer */ +} luaL_Buffer; + + +#define luaL_addchar(B,c) \ + ((void)((B)->n < (B)->size || luaL_prepbuffsize((B), 1)), \ + ((B)->b[(B)->n++] = (c))) + +#define luaL_addsize(B,s) ((B)->n += (s)) + +LUALIB_API void (luaL_buffinit) (lua_State *L, luaL_Buffer *B); +LUALIB_API char *(luaL_prepbuffsize) (luaL_Buffer *B, size_t sz); +LUALIB_API void (luaL_addlstring) (luaL_Buffer *B, const char *s, size_t l); +LUALIB_API void (luaL_addstring) (luaL_Buffer *B, const char *s); +LUALIB_API void (luaL_addvalue) (luaL_Buffer *B); +LUALIB_API void (luaL_pushresult) (luaL_Buffer *B); +LUALIB_API void (luaL_pushresultsize) (luaL_Buffer *B, size_t sz); +LUALIB_API char *(luaL_buffinitsize) (lua_State *L, luaL_Buffer *B, size_t sz); + +#define luaL_prepbuffer(B) luaL_prepbuffsize(B, LUAL_BUFFERSIZE) + +/* }====================================================== */ + + +/* compatibility with old module system */ +#if defined(LUA_COMPAT_MODULE) + +LUALIB_API void (luaL_pushmodule) (lua_State *L, const char *modname, + int sizehint); +LUALIB_API void (luaL_openlib) (lua_State *L, const char *libname, + const luaL_Reg *l, int nup); + +#define luaL_register(L,n,l) (luaL_openlib(L,(n),(l),0)) + +#endif + + +#endif + +/* END CSTYLED */ diff --git a/include/sys/lua/lua.h b/include/sys/lua/lua.h new file mode 100644 index 000000000000..0b4eb045cf8f --- /dev/null +++ b/include/sys/lua/lua.h @@ -0,0 +1,445 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lua.h,v 1.285.1.4 2015/02/21 14:04:50 roberto Exp $ +** Lua - A Scripting Language +** Lua.org, PUC-Rio, Brazil (http://www.lua.org) +** See Copyright Notice at the end of this file +*/ + + +#ifndef lua_h +#define lua_h + +#include + +#include + + +#define LUA_VERSION_MAJOR "5" +#define LUA_VERSION_MINOR "2" +#define LUA_VERSION_NUM 502 +#define LUA_VERSION_RELEASE "4" + +#define LUA_VERSION "Lua " LUA_VERSION_MAJOR "." LUA_VERSION_MINOR +#define LUA_RELEASE LUA_VERSION "." LUA_VERSION_RELEASE +#define LUA_COPYRIGHT LUA_RELEASE " Copyright (C) 1994-2015 Lua.org, PUC-Rio" +#define LUA_AUTHORS "R. Ierusalimschy, L. H. de Figueiredo, W. 
Celes" + + +/* mark for precompiled code ('Lua') */ +#define LUA_SIGNATURE "\033Lua" + +/* option for multiple returns in 'lua_pcall' and 'lua_call' */ +#define LUA_MULTRET (-1) + + +/* +** pseudo-indices +*/ +#define LUA_REGISTRYINDEX LUAI_FIRSTPSEUDOIDX +#define lua_upvalueindex(i) (LUA_REGISTRYINDEX - (i)) + + +/* thread status */ +#define LUA_OK 0 +#define LUA_YIELD 1 +#define LUA_ERRRUN 2 +#define LUA_ERRSYNTAX 3 +#define LUA_ERRMEM 4 +#define LUA_ERRGCMM 5 +#define LUA_ERRERR 6 + + +typedef struct lua_State lua_State; + +typedef int (*lua_CFunction) (lua_State *L); + + +/* +** functions that read/write blocks when loading/dumping Lua chunks +*/ +typedef const char * (*lua_Reader) (lua_State *L, void *ud, size_t *sz); + +typedef int (*lua_Writer) (lua_State *L, const void* p, size_t sz, void* ud); + + +/* +** prototype for memory-allocation functions +*/ +typedef void * (*lua_Alloc) (void *ud, void *ptr, size_t osize, size_t nsize); + + +/* +** basic types +*/ +#define LUA_TNONE (-1) + +#define LUA_TNIL 0 +#define LUA_TBOOLEAN 1 +#define LUA_TLIGHTUSERDATA 2 +#define LUA_TNUMBER 3 +#define LUA_TSTRING 4 +#define LUA_TTABLE 5 +#define LUA_TFUNCTION 6 +#define LUA_TUSERDATA 7 +#define LUA_TTHREAD 8 + +#define LUA_NUMTAGS 9 + + + +/* minimum Lua stack available to a C function */ +#define LUA_MINSTACK 20 + + +/* predefined values in the registry */ +#define LUA_RIDX_MAINTHREAD 1 +#define LUA_RIDX_GLOBALS 2 +#define LUA_RIDX_LAST LUA_RIDX_GLOBALS + + +/* type of numbers in Lua */ +typedef LUA_NUMBER lua_Number; + + +/* type for integer functions */ +typedef LUA_INTEGER lua_Integer; + +/* unsigned integer type */ +typedef LUA_UNSIGNED lua_Unsigned; + + + + +/* +** generic extra include file +*/ +#if defined(LUA_USER_H) +#include LUA_USER_H +#endif + + +/* +** RCS ident string +*/ +extern const char lua_ident[]; + + +/* +** state manipulation +*/ +LUA_API lua_State *(lua_newstate) (lua_Alloc f, void *ud); +LUA_API void (lua_close) (lua_State *L); +LUA_API lua_State *(lua_newthread) (lua_State *L); + +LUA_API lua_CFunction (lua_atpanic) (lua_State *L, lua_CFunction panicf); + + +LUA_API const lua_Number *(lua_version) (lua_State *L); + + +/* +** basic stack manipulation +*/ +LUA_API int (lua_absindex) (lua_State *L, int idx); +LUA_API int (lua_gettop) (lua_State *L); +LUA_API void (lua_settop) (lua_State *L, int idx); +LUA_API void (lua_pushvalue) (lua_State *L, int idx); +LUA_API void (lua_remove) (lua_State *L, int idx); +LUA_API void (lua_insert) (lua_State *L, int idx); +LUA_API void (lua_replace) (lua_State *L, int idx); +LUA_API void (lua_copy) (lua_State *L, int fromidx, int toidx); +LUA_API int (lua_checkstack) (lua_State *L, int sz); + +LUA_API void (lua_xmove) (lua_State *from, lua_State *to, int n); + + +/* +** access functions (stack -> C) +*/ + +LUA_API int (lua_isnumber) (lua_State *L, int idx); +LUA_API int (lua_isstring) (lua_State *L, int idx); +LUA_API int (lua_iscfunction) (lua_State *L, int idx); +LUA_API int (lua_isuserdata) (lua_State *L, int idx); +LUA_API int (lua_type) (lua_State *L, int idx); +LUA_API const char *(lua_typename) (lua_State *L, int tp); + +LUA_API lua_Number (lua_tonumberx) (lua_State *L, int idx, int *isnum); +LUA_API lua_Integer (lua_tointegerx) (lua_State *L, int idx, int *isnum); +LUA_API lua_Unsigned (lua_tounsignedx) (lua_State *L, int idx, int *isnum); +LUA_API int (lua_toboolean) (lua_State *L, int idx); +LUA_API const char *(lua_tolstring) (lua_State *L, int idx, size_t *len); +LUA_API size_t (lua_rawlen) (lua_State *L, int idx); +LUA_API 
lua_CFunction (lua_tocfunction) (lua_State *L, int idx); +LUA_API void *(lua_touserdata) (lua_State *L, int idx); +LUA_API lua_State *(lua_tothread) (lua_State *L, int idx); +LUA_API const void *(lua_topointer) (lua_State *L, int idx); + + +/* +** Comparison and arithmetic functions +*/ + +#define LUA_OPADD 0 /* ORDER TM */ +#define LUA_OPSUB 1 +#define LUA_OPMUL 2 +#define LUA_OPDIV 3 +#define LUA_OPMOD 4 +#define LUA_OPPOW 5 +#define LUA_OPUNM 6 + +LUA_API void (lua_arith) (lua_State *L, int op); + +#define LUA_OPEQ 0 +#define LUA_OPLT 1 +#define LUA_OPLE 2 + +LUA_API int (lua_rawequal) (lua_State *L, int idx1, int idx2); +LUA_API int (lua_compare) (lua_State *L, int idx1, int idx2, int op); + + +/* +** push functions (C -> stack) +*/ +LUA_API void (lua_pushnil) (lua_State *L); +LUA_API void (lua_pushnumber) (lua_State *L, lua_Number n); +LUA_API void (lua_pushinteger) (lua_State *L, lua_Integer n); +LUA_API void (lua_pushunsigned) (lua_State *L, lua_Unsigned n); +LUA_API const char *(lua_pushlstring) (lua_State *L, const char *s, size_t l); +LUA_API const char *(lua_pushstring) (lua_State *L, const char *s); +LUA_API const char *(lua_pushvfstring) (lua_State *L, const char *fmt, + va_list argp); +LUA_API const char *(lua_pushfstring) (lua_State *L, const char *fmt, ...); +LUA_API void (lua_pushcclosure) (lua_State *L, lua_CFunction fn, int n); +LUA_API void (lua_pushboolean) (lua_State *L, int b); +LUA_API void (lua_pushlightuserdata) (lua_State *L, void *p); +LUA_API int (lua_pushthread) (lua_State *L); + + +/* +** get functions (Lua -> stack) +*/ +LUA_API void (lua_getglobal) (lua_State *L, const char *var); +LUA_API void (lua_gettable) (lua_State *L, int idx); +LUA_API void (lua_getfield) (lua_State *L, int idx, const char *k); +LUA_API void (lua_rawget) (lua_State *L, int idx); +LUA_API void (lua_rawgeti) (lua_State *L, int idx, int n); +LUA_API void (lua_rawgetp) (lua_State *L, int idx, const void *p); +LUA_API void (lua_createtable) (lua_State *L, int narr, int nrec); +LUA_API void *(lua_newuserdata) (lua_State *L, size_t sz); +LUA_API int (lua_getmetatable) (lua_State *L, int objindex); +LUA_API void (lua_getuservalue) (lua_State *L, int idx); + + +/* +** set functions (stack -> Lua) +*/ +LUA_API void (lua_setglobal) (lua_State *L, const char *var); +LUA_API void (lua_settable) (lua_State *L, int idx); +LUA_API void (lua_setfield) (lua_State *L, int idx, const char *k); +LUA_API void (lua_rawset) (lua_State *L, int idx); +LUA_API void (lua_rawseti) (lua_State *L, int idx, int n); +LUA_API void (lua_rawsetp) (lua_State *L, int idx, const void *p); +LUA_API int (lua_setmetatable) (lua_State *L, int objindex); +LUA_API void (lua_setuservalue) (lua_State *L, int idx); + + +/* +** 'load' and 'call' functions (load and run Lua code) +*/ +LUA_API void (lua_callk) (lua_State *L, int nargs, int nresults, int ctx, + lua_CFunction k); +#define lua_call(L,n,r) lua_callk(L, (n), (r), 0, NULL) + +LUA_API int (lua_getctx) (lua_State *L, int *ctx); + +LUA_API int (lua_pcallk) (lua_State *L, int nargs, int nresults, int errfunc, + int ctx, lua_CFunction k); +#define lua_pcall(L,n,r,f) lua_pcallk(L, (n), (r), (f), 0, NULL) + +LUA_API int (lua_load) (lua_State *L, lua_Reader reader, void *dt, + const char *chunkname, + const char *mode); + +LUA_API int (lua_dump) (lua_State *L, lua_Writer writer, void *data); + + +/* +** coroutine functions +*/ +LUA_API int (lua_yieldk) (lua_State *L, int nresults, int ctx, + lua_CFunction k); +#define lua_yield(L,n) lua_yieldk(L, (n), 0, NULL) +LUA_API int 
(lua_resume) (lua_State *L, lua_State *from, int narg); +LUA_API int (lua_status) (lua_State *L); + +/* +** garbage-collection function and options +*/ + +#define LUA_GCSTOP 0 +#define LUA_GCRESTART 1 +#define LUA_GCCOLLECT 2 +#define LUA_GCCOUNT 3 +#define LUA_GCCOUNTB 4 +#define LUA_GCSTEP 5 +#define LUA_GCSETPAUSE 6 +#define LUA_GCSETSTEPMUL 7 +#define LUA_GCSETMAJORINC 8 +#define LUA_GCISRUNNING 9 +#define LUA_GCGEN 10 +#define LUA_GCINC 11 + +LUA_API int (lua_gc) (lua_State *L, int what, int data); + + +/* +** miscellaneous functions +*/ + +LUA_API int (lua_error) (lua_State *L); + +LUA_API int (lua_next) (lua_State *L, int idx); + +LUA_API void (lua_concat) (lua_State *L, int n); +LUA_API void (lua_len) (lua_State *L, int idx); + +LUA_API lua_Alloc (lua_getallocf) (lua_State *L, void **ud); +LUA_API void (lua_setallocf) (lua_State *L, lua_Alloc f, void *ud); + + + +/* +** =============================================================== +** some useful macros +** =============================================================== +*/ + +#define lua_tonumber(L,i) lua_tonumberx(L,i,NULL) +#define lua_tointeger(L,i) lua_tointegerx(L,i,NULL) +#define lua_tounsigned(L,i) lua_tounsignedx(L,i,NULL) + +#define lua_pop(L,n) lua_settop(L, -(n)-1) + +#define lua_newtable(L) lua_createtable(L, 0, 0) + +#define lua_register(L,n,f) (lua_pushcfunction(L, (f)), lua_setglobal(L, (n))) + +#define lua_pushcfunction(L,f) lua_pushcclosure(L, (f), 0) + +#define lua_isfunction(L,n) (lua_type(L, (n)) == LUA_TFUNCTION) +#define lua_istable(L,n) (lua_type(L, (n)) == LUA_TTABLE) +#define lua_islightuserdata(L,n) (lua_type(L, (n)) == LUA_TLIGHTUSERDATA) +#define lua_isnil(L,n) (lua_type(L, (n)) == LUA_TNIL) +#define lua_isboolean(L,n) (lua_type(L, (n)) == LUA_TBOOLEAN) +#define lua_isthread(L,n) (lua_type(L, (n)) == LUA_TTHREAD) +#define lua_isnone(L,n) (lua_type(L, (n)) == LUA_TNONE) +#define lua_isnoneornil(L, n) (lua_type(L, (n)) <= 0) + +#define lua_pushliteral(L, s) \ + lua_pushlstring(L, "" s, (sizeof(s)/sizeof(char))-1) + +#define lua_pushglobaltable(L) \ + lua_rawgeti(L, LUA_REGISTRYINDEX, LUA_RIDX_GLOBALS) + +#define lua_tostring(L,i) lua_tolstring(L, (i), NULL) + + + +/* +** {====================================================================== +** Debug API +** ======================================================================= +*/ + + +/* +** Event codes +*/ +#define LUA_HOOKCALL 0 +#define LUA_HOOKRET 1 +#define LUA_HOOKLINE 2 +#define LUA_HOOKCOUNT 3 +#define LUA_HOOKTAILCALL 4 + + +/* +** Event masks +*/ +#define LUA_MASKCALL (1 << LUA_HOOKCALL) +#define LUA_MASKRET (1 << LUA_HOOKRET) +#define LUA_MASKLINE (1 << LUA_HOOKLINE) +#define LUA_MASKCOUNT (1 << LUA_HOOKCOUNT) + +typedef struct lua_Debug lua_Debug; /* activation record */ + + +/* Functions to be called by the debugger in specific events */ +typedef void (*lua_Hook) (lua_State *L, lua_Debug *ar); + + +LUA_API int (lua_getstack) (lua_State *L, int level, lua_Debug *ar); +LUA_API int (lua_getinfo) (lua_State *L, const char *what, lua_Debug *ar); +LUA_API const char *(lua_getlocal) (lua_State *L, const lua_Debug *ar, int n); +LUA_API const char *(lua_setlocal) (lua_State *L, const lua_Debug *ar, int n); +LUA_API const char *(lua_getupvalue) (lua_State *L, int funcindex, int n); +LUA_API const char *(lua_setupvalue) (lua_State *L, int funcindex, int n); + +LUA_API void *(lua_upvalueid) (lua_State *L, int fidx, int n); +LUA_API void (lua_upvaluejoin) (lua_State *L, int fidx1, int n1, + int fidx2, int n2); + +LUA_API int (lua_sethook) (lua_State 
*L, lua_Hook func, int mask, int count); +LUA_API lua_Hook (lua_gethook) (lua_State *L); +LUA_API int (lua_gethookmask) (lua_State *L); +LUA_API int (lua_gethookcount) (lua_State *L); + + +struct lua_Debug { + int event; + const char *name; /* (n) */ + const char *namewhat; /* (n) 'global', 'local', 'field', 'method' */ + const char *what; /* (S) 'Lua', 'C', 'main', 'tail' */ + const char *source; /* (S) */ + int currentline; /* (l) */ + int linedefined; /* (S) */ + int lastlinedefined; /* (S) */ + unsigned char nups; /* (u) number of upvalues */ + unsigned char nparams;/* (u) number of parameters */ + char isvararg; /* (u) */ + char istailcall; /* (t) */ + char short_src[LUA_IDSIZE]; /* (S) */ + /* private part */ + struct CallInfo *i_ci; /* active function */ +}; + +/* }====================================================================== */ + + +/****************************************************************************** +* Copyright (C) 1994-2015 Lua.org, PUC-Rio. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to deal in the Software without restriction, including +* without limitation the rights to use, copy, modify, merge, publish, +* distribute, sublicense, and/or sell copies of the Software, and to +* permit persons to whom the Software is furnished to do so, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be +* included in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. +******************************************************************************/ + + +#endif +/* END CSTYLED */ diff --git a/include/sys/lua/luaconf.h b/include/sys/lua/luaconf.h new file mode 100644 index 000000000000..ce99f339fe34 --- /dev/null +++ b/include/sys/lua/luaconf.h @@ -0,0 +1,562 @@ +/* BEGIN CSTYLED */ +/* +** $Id: luaconf.h,v 1.176.1.2 2013/11/21 17:26:16 roberto Exp $ +** Configuration file for Lua +** See Copyright Notice in lua.h +*/ + + +#ifndef lconfig_h +#define lconfig_h + +#include + + +extern ssize_t lcompat_sprintf(char *, size_t size, const char *, ...); +extern int64_t lcompat_strtoll(const char *, char **); +extern int64_t lcompat_pow(int64_t, int64_t); +extern int lcompat_hashnum(int64_t); + +/* +** ================================================================== +** Search for "@@" to find all configurable definitions. +** =================================================================== +*/ + + +/* +@@ LUA_ANSI controls the use of non-ansi features. +** CHANGE it (define it) if you want Lua to avoid the use of any +** non-ansi feature or library. 
+*/ +#if !defined(LUA_ANSI) && defined(__STRICT_ANSI__) +#define LUA_ANSI +#endif + + +#if !defined(LUA_ANSI) && defined(_WIN32) && !defined(_WIN32_WCE) +#define LUA_WIN /* enable goodies for regular Windows platforms */ +#endif + +#if defined(LUA_WIN) +#define LUA_DL_DLL +#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ +#endif + + + +#if defined(LUA_USE_LINUX) +#define LUA_USE_POSIX +#define LUA_USE_DLOPEN /* needs an extra library: -ldl */ +#define LUA_USE_READLINE /* needs some extra libraries */ +#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ +#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ +#define LUA_USE_LONGLONG /* assume support for long long */ +#endif + +#if defined(LUA_USE_MACOSX) +#define LUA_USE_POSIX +#define LUA_USE_DLOPEN /* does not need -ldl */ +#define LUA_USE_READLINE /* needs an extra library: -lreadline */ +#define LUA_USE_STRTODHEX /* assume 'strtod' handles hex formats */ +#define LUA_USE_AFORMAT /* assume 'printf' handles 'aA' specifiers */ +#define LUA_USE_LONGLONG /* assume support for long long */ +#endif + + + +/* +@@ LUA_USE_POSIX includes all functionality listed as X/Open System +@* Interfaces Extension (XSI). +** CHANGE it (define it) if your system is XSI compatible. +*/ +#if defined(LUA_USE_POSIX) +#define LUA_USE_MKSTEMP +#define LUA_USE_ISATTY +#define LUA_USE_POPEN +#define LUA_USE_ULONGJMP +#define LUA_USE_GMTIME_R +#endif + + + +/* +@@ LUA_PATH_DEFAULT is the default path that Lua uses to look for +@* Lua libraries. +@@ LUA_CPATH_DEFAULT is the default path that Lua uses to look for +@* C libraries. +** CHANGE them if your machine has a non-conventional directory +** hierarchy or if you want to install your libraries in +** non-conventional directories. +*/ +#if defined(_WIN32) /* { */ +/* +** In Windows, any exclamation mark ('!') in the path is replaced by the +** path of the directory of the executable file of the current process. +*/ +#define LUA_LDIR "!\\lua\\" +#define LUA_CDIR "!\\" +#define LUA_PATH_DEFAULT \ + LUA_LDIR"?.lua;" LUA_LDIR"?\\init.lua;" \ + LUA_CDIR"?.lua;" LUA_CDIR"?\\init.lua;" ".\\?.lua" +#define LUA_CPATH_DEFAULT \ + LUA_CDIR"?.dll;" LUA_CDIR"loadall.dll;" ".\\?.dll" + +#else /* }{ */ + +#define LUA_VDIR LUA_VERSION_MAJOR "." LUA_VERSION_MINOR "/" +#define LUA_ROOT "/usr/local/" +#define LUA_LDIR LUA_ROOT "share/lua/" LUA_VDIR +#define LUA_CDIR LUA_ROOT "lib/lua/" LUA_VDIR +#define LUA_PATH_DEFAULT \ + LUA_LDIR"?.lua;" LUA_LDIR"?/init.lua;" \ + LUA_CDIR"?.lua;" LUA_CDIR"?/init.lua;" "./?.lua" +#define LUA_CPATH_DEFAULT \ + LUA_CDIR"?.so;" LUA_CDIR"loadall.so;" "./?.so" +#endif /* } */ + + +/* +@@ LUA_DIRSEP is the directory separator (for submodules). +** CHANGE it if your machine does not use "/" as the directory separator +** and is not Windows. (On Windows Lua automatically uses "\".) +*/ +#if defined(_WIN32) +#define LUA_DIRSEP "\\" +#else +#define LUA_DIRSEP "/" +#endif + + +/* +@@ LUA_ENV is the name of the variable that holds the current +@@ environment, used to access global names. +** CHANGE it if you do not like this name. +*/ +#define LUA_ENV "_ENV" + + +/* +@@ LUA_API is a mark for all core API functions. +@@ LUALIB_API is a mark for all auxiliary library functions. +@@ LUAMOD_API is a mark for all standard library opening functions. +** CHANGE them if you need to define those functions in some special way. 
+** For instance, if you want to create one Windows DLL with the core and +** the libraries, you may want to use the following definition (define +** LUA_BUILD_AS_DLL to get it). +*/ +#if defined(LUA_BUILD_AS_DLL) /* { */ + +#if defined(LUA_CORE) || defined(LUA_LIB) /* { */ +#define LUA_API __declspec(dllexport) +#else /* }{ */ +#define LUA_API __declspec(dllimport) +#endif /* } */ + +#else /* }{ */ + +#define LUA_API extern + +#endif /* } */ + + +/* more often than not the libs go together with the core */ +#define LUALIB_API LUA_API +#define LUAMOD_API LUALIB_API + + +/* +@@ LUAI_FUNC is a mark for all extern functions that are not to be +@* exported to outside modules. +@@ LUAI_DDEF and LUAI_DDEC are marks for all extern (const) variables +@* that are not to be exported to outside modules (LUAI_DDEF for +@* definitions and LUAI_DDEC for declarations). +** CHANGE them if you need to mark them in some special way. Elf/gcc +** (versions 3.2 and later) mark them as "hidden" to optimize access +** when Lua is compiled as a shared library. Not all elf targets support +** this attribute. Unfortunately, gcc does not offer a way to check +** whether the target offers that support, and those without support +** give a warning about it. To avoid these warnings, change to the +** default definition. +*/ +#if defined(__GNUC__) && ((__GNUC__*100 + __GNUC_MINOR__) >= 302) && \ + defined(__ELF__) /* { */ +#define LUAI_FUNC __attribute__((visibility("hidden"))) extern +#define LUAI_DDEC LUAI_FUNC +#define LUAI_DDEF /* empty */ + +#else /* }{ */ +#define LUAI_FUNC extern +#define LUAI_DDEC extern +#define LUAI_DDEF /* empty */ +#endif /* } */ + + + +/* +@@ LUA_QL describes how error messages quote program elements. +** CHANGE it if you want a different appearance. +*/ +#define LUA_QL(x) "'" x "'" +#define LUA_QS LUA_QL("%s") + + +/* +@@ LUA_IDSIZE gives the maximum size for the description of the source +@* of a function in debug information. +** CHANGE it if you want a different size. +*/ +#define LUA_IDSIZE 60 + + +/* +@@ luai_writestringerror defines how to print error messages. +** (A format string with one argument is enough for Lua...) +*/ +#ifdef _KERNEL +#define luai_writestringerror(s,p) \ + (zfs_dbgmsg((s), (p))) +#else +#define luai_writestringerror(s,p) \ + (fprintf(stderr, (s), (p)), fflush(stderr)) +#endif + + +/* +@@ LUAI_MAXSHORTLEN is the maximum length for short strings, that is, +** strings that are internalized. (Cannot be smaller than reserved words +** or tags for metamethods, as these strings must be internalized; +** #("function") = 8, #("__newindex") = 10.) +*/ +#define LUAI_MAXSHORTLEN 40 + + + +/* +** {================================================================== +** Compatibility with previous versions +** =================================================================== +*/ + +/* +@@ LUA_COMPAT_ALL controls all compatibility options. +** You can define it to get all options, or change specific options +** to fit your specific needs. +*/ +#if defined(LUA_COMPAT_ALL) /* { */ + +/* +@@ LUA_COMPAT_UNPACK controls the presence of global 'unpack'. +** You can replace it with 'table.unpack'. +*/ +#define LUA_COMPAT_UNPACK + +/* +@@ LUA_COMPAT_LOADERS controls the presence of table 'package.loaders'. +** You can replace it with 'package.searchers'. +*/ +#define LUA_COMPAT_LOADERS + +/* +@@ macro 'lua_cpcall' emulates deprecated function lua_cpcall. +** You can call your C function directly (with light C functions). 
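+** For example, lua_cpcall(L, f, u) can be replaced by the direct
+** sequence:
+**
+**   lua_pushcfunction(L, f);
+**   lua_pushlightuserdata(L, u);
+**   lua_pcall(L, 1, 0, 0);
+**
+** which is exactly what the compatibility macro below expands to.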
+*/ +#define lua_cpcall(L,f,u) \ + (lua_pushcfunction(L, (f)), \ + lua_pushlightuserdata(L,(u)), \ + lua_pcall(L,1,0,0)) + + +/* +@@ LUA_COMPAT_LOG10 defines the function 'log10' in the math library. +** You can rewrite 'log10(x)' as 'log(x, 10)'. +*/ +#define LUA_COMPAT_LOG10 + +/* +@@ LUA_COMPAT_LOADSTRING defines the function 'loadstring' in the base +** library. You can rewrite 'loadstring(s)' as 'load(s)'. +*/ +#define LUA_COMPAT_LOADSTRING + +/* +@@ LUA_COMPAT_MAXN defines the function 'maxn' in the table library. +*/ +#define LUA_COMPAT_MAXN + +/* +@@ The following macros supply trivial compatibility for some +** changes in the API. The macros themselves document how to +** change your code to avoid using them. +*/ +#define lua_strlen(L,i) lua_rawlen(L, (i)) + +#define lua_objlen(L,i) lua_rawlen(L, (i)) + +#define lua_equal(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPEQ) +#define lua_lessthan(L,idx1,idx2) lua_compare(L,(idx1),(idx2),LUA_OPLT) + +/* +@@ LUA_COMPAT_MODULE controls compatibility with previous +** module functions 'module' (Lua) and 'luaL_register' (C). +*/ +#define LUA_COMPAT_MODULE + +#endif /* } */ + +/* }================================================================== */ + + +#undef INT_MAX +#define INT_MAX 2147483647L + +/* +@@ LUAI_BITSINT defines the number of bits in an int. +** CHANGE here if Lua cannot automatically detect the number of bits of +** your machine. Probably you do not need to change this. +*/ +/* avoid overflows in comparison */ +#if INT_MAX-20 < 32760 /* { */ +#define LUAI_BITSINT 16 +#elif INT_MAX > 2147483640L /* }{ */ +/* int has at least 32 bits */ +#define LUAI_BITSINT 32 +#else /* }{ */ +#error "you must define LUA_BITSINT with number of bits in an integer" +#endif /* } */ + + +/* +@@ LUA_INT32 is a signed integer with exactly 32 bits. +@@ LUAI_UMEM is an unsigned integer big enough to count the total +@* memory used by Lua. +@@ LUAI_MEM is a signed integer big enough to count the total memory +@* used by Lua. +** CHANGE here if for some weird reason the default definitions are not +** good enough for your machine. Probably you do not need to change +** this. +*/ +#if LUAI_BITSINT >= 32 /* { */ +#define LUA_INT32 int +#define LUAI_UMEM size_t +#define LUAI_MEM ptrdiff_t +#else /* }{ */ +/* 16-bit ints */ +#define LUA_INT32 long +#define LUAI_UMEM unsigned long +#define LUAI_MEM long +#endif /* } */ + + +/* +@@ LUAI_MAXSTACK limits the size of the Lua stack. +** CHANGE it if you need a different limit. This limit is arbitrary; +** its only purpose is to stop Lua from consuming unlimited stack +** space (and to reserve some numbers for pseudo-indices). +*/ +#if LUAI_BITSINT >= 32 +#define LUAI_MAXSTACK 1000000 +#else +#define LUAI_MAXSTACK 15000 +#endif + +/* reserve some space for error handling */ +#define LUAI_FIRSTPSEUDOIDX (-LUAI_MAXSTACK - 1000) + + +/* +@@ LUAL_BUFFERSIZE is the buffer size used by the lauxlib buffer system. +** CHANGE it if it uses too much C-stack space. +*/ +#ifdef __linux__ +#define LUAL_BUFFERSIZE 512 +#else +#define LUAL_BUFFERSIZE 1024 +#endif + + +/* +** {================================================================== +@@ LUA_NUMBER is the type of numbers in Lua. +** CHANGE the following definitions only if you want to build Lua +** with a number type different from double. You may also need to +** change lua_number2int & lua_number2integer. 
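+**
+** Note that this port defines LUA_NUMBER as int64_t (below), so all
+** Lua arithmetic in channel programs is 64-bit integer arithmetic;
+** e.g. the Lua expression 7 / 2 evaluates to 3, not 3.5.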
+** =================================================================== +*/ + +#define LUA_NUMBER int64_t + +/* +@@ LUAI_UACNUMBER is the result of an 'usual argument conversion' +@* over a number. +*/ +#define LUAI_UACNUMBER int64_t + + +/* +@@ LUA_NUMBER_SCAN is the format for reading numbers. +@@ LUA_NUMBER_FMT is the format for writing numbers. +@@ lua_number2str converts a number to a string. +@@ LUAI_MAXNUMBER2STR is maximum size of previous conversion. +*/ +#ifndef PRId64 +#define PRId64 "lld" +#endif + +#define LUAI_MAXNUMBER2STR 32 /* 16 digits, sign, point, and \0 */ +#define LUA_NUMBER_FMT "%" PRId64 +#define lua_number2str(s,n) \ + lcompat_sprintf((s), LUAI_MAXNUMBER2STR, LUA_NUMBER_FMT, (n)) + + +/* +@@ l_mathop allows the addition of an 'l' or 'f' to all math operations +*/ +#define l_mathop(x) (x ## l) + + +/* +@@ lua_str2number converts a decimal numeric string to a number. +@@ lua_strx2number converts an hexadecimal numeric string to a number. +** In C99, 'strtod' does both conversions. C89, however, has no function +** to convert floating hexadecimal strings to numbers. For these +** systems, you can leave 'lua_strx2number' undefined and Lua will +** provide its own implementation. +*/ +#define lua_str2number(s,p) lcompat_strtoll((s), (p)) + +#if defined(LUA_USE_STRTODHEX) +#define lua_strx2number(s,p) lcompat_strtoll((s), (p)) +#endif + + +/* +@@ The luai_num* macros define the primitive operations over numbers. +*/ + +/* the following operations need the math library */ +#if defined(lobject_c) || defined(lvm_c) +#define luai_nummod(L,a,b) ((a) % (b)) +#define luai_numpow(L,a,b) (lcompat_pow((a),(b))) +#endif + +/* these are quite standard operations */ +#if defined(LUA_CORE) +#define luai_numadd(L,a,b) ((a)+(b)) +#define luai_numsub(L,a,b) ((a)-(b)) +#define luai_nummul(L,a,b) ((a)*(b)) +#define luai_numdiv(L,a,b) ((a)/(b)) +#define luai_numunm(L,a) (-(a)) +#define luai_numeq(a,b) ((a)==(b)) +#define luai_numlt(L,a,b) ((a)<(b)) +#define luai_numle(L,a,b) ((a)<=(b)) +#define luai_numisnan(L,a) (!luai_numeq((a), (a))) +#endif + + + +/* +@@ LUA_INTEGER is the integral type used by lua_pushinteger/lua_tointeger. +** CHANGE that if ptrdiff_t is not adequate on your machine. (On most +** machines, ptrdiff_t gives a good choice between int or long.) +*/ +#define LUA_INTEGER ptrdiff_t + +/* +@@ LUA_UNSIGNED is the integral type used by lua_pushunsigned/lua_tounsigned. +** It must have at least 32 bits. +*/ +#define LUA_UNSIGNED uint64_t + + + +/* +** Some tricks with doubles +*/ + +#if defined(LUA_NUMBER_DOUBLE) && !defined(LUA_ANSI) /* { */ +/* +** The next definitions activate some tricks to speed up the +** conversion from doubles to integer types, mainly to LUA_UNSIGNED. +** +@@ LUA_MSASMTRICK uses Microsoft assembler to avoid clashes with a +** DirectX idiosyncrasy. +** +@@ LUA_IEEE754TRICK uses a trick that should work on any machine +** using IEEE754 with a 32-bit integer type. +** +@@ LUA_IEEELL extends the trick to LUA_INTEGER; should only be +** defined when LUA_INTEGER is a 32-bit integer. +** +@@ LUA_IEEEENDIAN is the endianness of doubles in your machine +** (0 for little endian, 1 for big endian); if not defined, Lua will +** check it dynamically for LUA_IEEE754TRICK (but not for LUA_NANTRICK). +** +@@ LUA_NANTRICK controls the use of a trick to pack all types into +** a single double value, using NaN values to represent non-number +** values. 
The trick only works on 32-bit machines (ints and pointers +** are 32-bit values) with numbers represented as IEEE 754-2008 doubles +** with conventional endianness (12345678 or 87654321), in CPUs that do +** not produce signaling NaN values (all NaNs are quiet). +*/ + +/* Microsoft compiler on a Pentium (32 bit) ? */ +#if defined(LUA_WIN) && defined(_MSC_VER) && defined(_M_IX86) /* { */ + +#define LUA_MSASMTRICK +#define LUA_IEEEENDIAN 0 +#define LUA_NANTRICK + + +/* pentium 32 bits? */ +#elif defined(__i386__) || defined(__i386) || defined(__X86__) /* }{ */ + +#define LUA_IEEE754TRICK +#define LUA_IEEELL +#define LUA_IEEEENDIAN 0 +#define LUA_NANTRICK + +/* pentium 64 bits? */ +#elif defined(__x86_64) /* }{ */ + +#define LUA_IEEE754TRICK +#define LUA_IEEEENDIAN 0 + +#elif defined(__POWERPC__) || defined(__ppc__) /* }{ */ + +#define LUA_IEEE754TRICK +#define LUA_IEEEENDIAN 1 + +#else /* }{ */ + +/* assume IEEE754 and a 32-bit integer type */ +#define LUA_IEEE754TRICK + +#endif /* } */ + +#endif /* } */ + +/* }================================================================== */ + + + + +/* =================================================================== */ + +/* +** Local configuration. You can use this space to add your redefinitions +** without modifying the main part of the file. +*/ + +#define getlocaledecpoint() ('.') + +#ifndef abs +#define abs(x) (((x) < 0) ? -(x) : (x)) +#endif + +#if !defined(UCHAR_MAX) +#define UCHAR_MAX (0xff) +#endif + +#endif +/* END CSTYLED */ diff --git a/include/sys/lua/lualib.h b/include/sys/lua/lualib.h new file mode 100644 index 000000000000..7b848787c8f0 --- /dev/null +++ b/include/sys/lua/lualib.h @@ -0,0 +1,57 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lualib.h,v 1.43.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua standard libraries +** See Copyright Notice in lua.h +*/ + + +#ifndef lualib_h +#define lualib_h + +#include + + + +LUAMOD_API int (luaopen_base) (lua_State *L); + +#define LUA_COLIBNAME "coroutine" +LUAMOD_API int (luaopen_coroutine) (lua_State *L); + +#define LUA_TABLIBNAME "table" +LUAMOD_API int (luaopen_table) (lua_State *L); + +#define LUA_IOLIBNAME "io" +LUAMOD_API int (luaopen_io) (lua_State *L); + +#define LUA_OSLIBNAME "os" +LUAMOD_API int (luaopen_os) (lua_State *L); + +#define LUA_STRLIBNAME "string" +LUAMOD_API int (luaopen_string) (lua_State *L); + +#define LUA_BITLIBNAME "bit32" +LUAMOD_API int (luaopen_bit32) (lua_State *L); + +#define LUA_MATHLIBNAME "math" +LUAMOD_API int (luaopen_math) (lua_State *L); + +#define LUA_DBLIBNAME "debug" +LUAMOD_API int (luaopen_debug) (lua_State *L); + +#define LUA_LOADLIBNAME "package" +LUAMOD_API int (luaopen_package) (lua_State *L); + + +/* open all previous libraries */ +LUALIB_API void (luaL_openlibs) (lua_State *L); + + + +#if !defined(lua_assert) +#define lua_assert(x) ((void)0) +#endif + + +#endif +/* END CSTYLED */ diff --git a/include/sys/metaslab.h b/include/sys/metaslab.h new file mode 100644 index 000000000000..b3b7f865536e --- /dev/null +++ b/include/sys/metaslab.h @@ -0,0 +1,147 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#ifndef _SYS_METASLAB_H
+#define	_SYS_METASLAB_H
+
+#include <sys/spa.h>
+#include <sys/space_map.h>
+#include <sys/txg.h>
+#include <sys/zio.h>
+#include <sys/avl.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+typedef struct metaslab_ops {
+	uint64_t (*msop_alloc)(metaslab_t *, uint64_t);
+} metaslab_ops_t;
+
+
+extern metaslab_ops_t *zfs_metaslab_ops;
+
+int metaslab_init(metaslab_group_t *, uint64_t, uint64_t, uint64_t,
+    metaslab_t **);
+void metaslab_fini(metaslab_t *);
+
+void metaslab_set_unflushed_txg(metaslab_t *, uint64_t, dmu_tx_t *);
+void metaslab_set_estimated_condensed_size(metaslab_t *, uint64_t, dmu_tx_t *);
+uint64_t metaslab_unflushed_txg(metaslab_t *);
+uint64_t metaslab_estimated_condensed_size(metaslab_t *);
+int metaslab_sort_by_flushed(const void *, const void *);
+uint64_t metaslab_unflushed_changes_memused(metaslab_t *);
+
+int metaslab_load(metaslab_t *);
+void metaslab_unload(metaslab_t *);
+boolean_t metaslab_flush(metaslab_t *, dmu_tx_t *);
+
+uint64_t metaslab_allocated_space(metaslab_t *);
+
+void metaslab_sync(metaslab_t *, uint64_t);
+void metaslab_sync_done(metaslab_t *, uint64_t);
+void metaslab_sync_reassess(metaslab_group_t *);
+uint64_t metaslab_largest_allocatable(metaslab_t *);
+
+/*
+ * metaslab alloc flags
+ */
+#define	METASLAB_HINTBP_FAVOR		0x0
+#define	METASLAB_HINTBP_AVOID		0x1
+#define	METASLAB_GANG_HEADER		0x2
+#define	METASLAB_GANG_CHILD		0x4
+#define	METASLAB_ASYNC_ALLOC		0x8
+#define	METASLAB_DONT_THROTTLE		0x10
+#define	METASLAB_MUST_RESERVE		0x20
+#define	METASLAB_FASTWRITE		0x40
+
+int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
+    blkptr_t *, int, uint64_t, blkptr_t *, int, zio_alloc_list_t *, zio_t *,
+    int);
+int metaslab_alloc_dva(spa_t *, metaslab_class_t *, uint64_t,
+    dva_t *, int, dva_t *, uint64_t, int, zio_alloc_list_t *, int);
+void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
+void metaslab_free_concrete(vdev_t *, uint64_t, uint64_t, boolean_t);
+void metaslab_free_dva(spa_t *, const dva_t *, boolean_t);
+void metaslab_free_impl_cb(uint64_t, vdev_t *, uint64_t, uint64_t, void *);
+void metaslab_unalloc_dva(spa_t *, const dva_t *, uint64_t);
+int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
+int metaslab_claim_impl(vdev_t *, uint64_t, uint64_t, uint64_t);
+void metaslab_check_free(spa_t *, const blkptr_t *);
+void metaslab_fastwrite_mark(spa_t *, const blkptr_t *);
+void metaslab_fastwrite_unmark(spa_t *, const blkptr_t *);
+
+void metaslab_stat_init(void);
+void metaslab_stat_fini(void);
+void metaslab_trace_init(zio_alloc_list_t *);
+void metaslab_trace_fini(zio_alloc_list_t *);
+
+metaslab_class_t *metaslab_class_create(spa_t *, metaslab_ops_t *);
+void metaslab_class_destroy(metaslab_class_t *);
+int metaslab_class_validate(metaslab_class_t *);
+void metaslab_class_histogram_verify(metaslab_class_t *);
+uint64_t metaslab_class_fragmentation(metaslab_class_t *);
+uint64_t metaslab_class_expandable_space(metaslab_class_t *);
+boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int, int,
+    zio_t *, int);
+void metaslab_class_throttle_unreserve(metaslab_class_t *, int, int, zio_t *);
+void metaslab_class_evict_old(metaslab_class_t *, uint64_t);
+uint64_t metaslab_class_get_alloc(metaslab_class_t *);
+uint64_t metaslab_class_get_space(metaslab_class_t *);
+uint64_t metaslab_class_get_dspace(metaslab_class_t *);
+uint64_t metaslab_class_get_deferred(metaslab_class_t *);
+
+void metaslab_space_update(vdev_t *, metaslab_class_t *,
+    int64_t, int64_t, int64_t);
+
+metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *, int);
+void metaslab_group_destroy(metaslab_group_t *);
+void metaslab_group_activate(metaslab_group_t *);
+void metaslab_group_passivate(metaslab_group_t *);
+boolean_t metaslab_group_initialized(metaslab_group_t *);
+uint64_t metaslab_group_get_space(metaslab_group_t *);
+void metaslab_group_histogram_verify(metaslab_group_t *);
+uint64_t metaslab_group_fragmentation(metaslab_group_t *);
+void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
+void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int, int,
+    boolean_t);
+void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *, int);
+void metaslab_recalculate_weight_and_sort(metaslab_t *);
+void metaslab_disable(metaslab_t *);
+void metaslab_enable(metaslab_t *, boolean_t, boolean_t);
+void metaslab_set_selected_txg(metaslab_t *, uint64_t);
+
+extern int metaslab_debug_load;
+
+range_seg_type_t metaslab_calculate_range_tree_type(vdev_t *vdev,
+    metaslab_t *msp, uint64_t *start, uint64_t *shift);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_H */
diff --git a/include/sys/metaslab_impl.h b/include/sys/metaslab_impl.h
new file mode 100644
index 000000000000..4a7475256a2b
--- /dev/null
+++ b/include/sys/metaslab_impl.h
@@ -0,0 +1,562 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ */
+
+#ifndef _SYS_METASLAB_IMPL_H
+#define	_SYS_METASLAB_IMPL_H
+
+#include <sys/metaslab.h>
+#include <sys/space_map.h>
+#include <sys/range_tree.h>
+#include <sys/vdev.h>
+#include <sys/dmu.h>
+#include <sys/zio.h>
+#include <sys/multilist.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Metaslab allocation tracing record.
+ */
+typedef struct metaslab_alloc_trace {
+	list_node_t		mat_list_node;
+	metaslab_group_t	*mat_mg;
+	metaslab_t		*mat_msp;
+	uint64_t		mat_size;
+	uint64_t		mat_weight;
+	uint32_t		mat_dva_id;
+	uint64_t		mat_offset;
+	int			mat_allocator;
+} metaslab_alloc_trace_t;
+
+/*
+ * Used by the metaslab allocation tracing facility to indicate
+ * error conditions. These errors are stored in the offset member
+ * of the metaslab_alloc_trace_t record and displayed by mdb.
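+ *
+ * For example, an attempt that was skipped because the metaslab was
+ * being condensed is recorded with mat_offset == TRACE_CONDENSING
+ * (-7ULL); since these sentinels occupy the very top of the 64-bit
+ * offset space, they cannot collide with a real device offset.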
+ */ +typedef enum trace_alloc_type { + TRACE_ALLOC_FAILURE = -1ULL, + TRACE_TOO_SMALL = -2ULL, + TRACE_FORCE_GANG = -3ULL, + TRACE_NOT_ALLOCATABLE = -4ULL, + TRACE_GROUP_FAILURE = -5ULL, + TRACE_ENOSPC = -6ULL, + TRACE_CONDENSING = -7ULL, + TRACE_VDEV_ERROR = -8ULL, + TRACE_DISABLED = -9ULL, +} trace_alloc_type_t; + +#define METASLAB_WEIGHT_PRIMARY (1ULL << 63) +#define METASLAB_WEIGHT_SECONDARY (1ULL << 62) +#define METASLAB_WEIGHT_CLAIM (1ULL << 61) +#define METASLAB_WEIGHT_TYPE (1ULL << 60) +#define METASLAB_ACTIVE_MASK \ + (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY | \ + METASLAB_WEIGHT_CLAIM) + +/* + * The metaslab weight is used to encode the amount of free space in a + * metaslab, such that the "best" metaslab appears first when sorting the + * metaslabs by weight. The weight (and therefore the "best" metaslab) can + * be determined in two different ways: by computing a weighted sum of all + * the free space in the metaslab (a space based weight) or by counting only + * the free segments of the largest size (a segment based weight). We prefer + * the segment based weight because it reflects how the free space is + * comprised, but we cannot always use it -- legacy pools do not have the + * space map histogram information necessary to determine the largest + * contiguous regions. Pools that have the space map histogram determine + * the segment weight by looking at each bucket in the histogram and + * determining the free space whose size in bytes is in the range: + * [2^i, 2^(i+1)) + * We then encode the largest index, i, that contains regions into the + * segment-weighted value. + * + * Space-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PSC1| weighted-free space | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio + * space - the fragmentation-weighted space + * + * Segment-based weight: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * |PSC0| idx| count of segments in region | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * PS - indicates primary and secondary activation + * C - indicates activation for claimed block zio + * idx - index for the highest bucket in the histogram + * count - number of segments in the specified bucket + */ +#define WEIGHT_GET_ACTIVE(weight) BF64_GET((weight), 61, 3) +#define WEIGHT_SET_ACTIVE(weight, x) BF64_SET((weight), 61, 3, x) + +#define WEIGHT_IS_SPACEBASED(weight) \ + ((weight) == 0 || BF64_GET((weight), 60, 1)) +#define WEIGHT_SET_SPACEBASED(weight) BF64_SET((weight), 60, 1, 1) + +/* + * These macros are only applicable to segment-based weighting. + */ +#define WEIGHT_GET_INDEX(weight) BF64_GET((weight), 54, 6) +#define WEIGHT_SET_INDEX(weight, x) BF64_SET((weight), 54, 6, x) +#define WEIGHT_GET_COUNT(weight) BF64_GET((weight), 0, 54) +#define WEIGHT_SET_COUNT(weight, x) BF64_SET((weight), 0, 54, x) + +/* + * A metaslab class encompasses a category of allocatable top-level vdevs. + * Each top-level vdev is associated with a metaslab group which defines + * the allocatable region for that vdev. Examples of these categories include + * "normal" for data block allocations (i.e. main pool allocations) or "log" + * for allocations designated for intent log devices (i.e. slog devices). 
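+ * For example, a pool with dedicated slog devices has (at least) a
+ * "normal" class backed by the main vdevs and a "log" class backed
+ * by the slogs; an intent-log allocation is attempted against the
+ * log class first and falls back to the normal class only if that
+ * attempt fails.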
+ * When a block allocation is requested from the SPA it is associated with a
+ * metaslab_class_t, and only top-level vdevs (i.e. metaslab groups) belonging
+ * to the class can be used to satisfy that request. Allocations are done
+ * by traversing the metaslab groups that are linked off of the mc_rotor field.
+ * This rotor points to the next metaslab group where allocations will be
+ * attempted. Allocating a block is a 3-step process -- select the metaslab
+ * group, select the metaslab, and then allocate the block. The metaslab
+ * class defines the low-level block allocator that will be used as the
+ * final step in allocation. These allocators are pluggable, allowing each
+ * class to use a block allocator that best suits that class.
+ */
+struct metaslab_class {
+	kmutex_t		mc_lock;
+	spa_t			*mc_spa;
+	metaslab_group_t	*mc_rotor;
+	metaslab_ops_t		*mc_ops;
+	uint64_t		mc_aliquot;
+
+	/*
+	 * Track the number of metaslab groups that have been initialized
+	 * and can accept allocations. An initialized metaslab group is
+	 * one that has been completely added to the config (i.e. we have
+	 * updated the MOS config and the space has been added to the pool).
+	 */
+	uint64_t		mc_groups;
+
+	/*
+	 * Toggle to enable/disable the allocation throttle.
+	 */
+	boolean_t		mc_alloc_throttle_enabled;
+
+	/*
+	 * The allocation throttle works on a reservation system. Whenever
+	 * an asynchronous zio wants to perform an allocation it must
+	 * first reserve the number of blocks that it wants to allocate.
+	 * If there aren't sufficient slots available for the pending zio
+	 * then that I/O is throttled until more slots free up. The current
+	 * number of reserved allocations is maintained by the mc_alloc_slots
+	 * refcount. The mc_alloc_max_slots value determines the maximum
+	 * number of allocations that the system allows. Gang blocks are
+	 * allowed to reserve slots even if we've reached the maximum
+	 * number of allocations allowed.
+	 */
+	uint64_t		*mc_alloc_max_slots;
+	zfs_refcount_t		*mc_alloc_slots;
+
+	uint64_t		mc_alloc_groups; /* # of allocatable groups */
+
+	uint64_t		mc_alloc;	/* total allocated space */
+	uint64_t		mc_deferred;	/* total deferred frees */
+	uint64_t		mc_space;	/* total space (alloc + free) */
+	uint64_t		mc_dspace;	/* total deflated space */
+	uint64_t		mc_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+	/*
+	 * List of all loaded metaslabs in the class, sorted in order of most
+	 * recent use.
+	 */
+	multilist_t		*mc_metaslab_txg_list;
+};
+
+/*
+ * Per-allocator data structure.
+ */
+typedef struct metaslab_group_allocator {
+	uint64_t	mga_cur_max_alloc_queue_depth;
+	zfs_refcount_t	mga_alloc_queue_depth;
+	metaslab_t	*mga_primary;
+	metaslab_t	*mga_secondary;
+} metaslab_group_allocator_t;
+
+/*
+ * Metaslab groups encapsulate all the allocatable regions (i.e. metaslabs)
+ * of a top-level vdev. They are linked together to form a circular linked
+ * list and can belong to only one metaslab class. Metaslab groups may become
+ * ineligible for allocations for a number of reasons such as limited free
+ * space, fragmentation, or going offline. When this happens the allocator will
+ * simply find the next metaslab group in the linked list and attempt
+ * to allocate from that group instead.
+ */
+struct metaslab_group {
+	kmutex_t		mg_lock;
+	avl_tree_t		mg_metaslab_tree;
+	uint64_t		mg_aliquot;
+	boolean_t		mg_allocatable;		/* can we allocate? */
+	uint64_t		mg_ms_ready;
+
+	/*
+	 * A metaslab group is considered to be initialized only after
+	 * we have updated the MOS config and added the space to the pool.
+	 * We only allow allocation attempts to a metaslab group if it
+	 * has been initialized.
+	 */
+	boolean_t		mg_initialized;
+
+	uint64_t		mg_free_capacity;	/* percentage free */
+	int64_t			mg_bias;
+	int64_t			mg_activation_count;
+	metaslab_class_t	*mg_class;
+	vdev_t			*mg_vd;
+	taskq_t			*mg_taskq;
+	metaslab_group_t	*mg_prev;
+	metaslab_group_t	*mg_next;
+
+	/*
+	 * In order for the allocation throttle to function properly, we cannot
+	 * have too many IOs going to each disk by default; the throttle
+	 * operates by allocating more work to disks that finish quickly, so
+	 * allocating larger chunks to each disk reduces its effectiveness.
+	 * However, if the number of IOs going to each allocator is too small,
+	 * we will not perform proper aggregation at the vdev_queue layer,
+	 * also resulting in decreased performance. Therefore, we will use a
+	 * ramp-up strategy.
+	 *
+	 * Each allocator in each metaslab group has a current queue depth
+	 * (mg_alloc_queue_depth[allocator]) and a current max queue depth
+	 * (mg_cur_max_alloc_queue_depth[allocator]), and each metaslab group
+	 * has an absolute max queue depth (mg_max_alloc_queue_depth). We
+	 * add IOs to an allocator until the mg_alloc_queue_depth for that
+	 * allocator hits the cur_max. Every time an IO completes for a given
+	 * allocator on a given metaslab group, we increment its cur_max until
+	 * it reaches mg_max_alloc_queue_depth. The cur_max resets every txg to
+	 * help protect against disks that decrease in performance over time.
+	 *
+	 * It's possible for an allocator to handle more allocations than
+	 * its max. This can occur when gang blocks are required or when other
+	 * groups are unable to handle their share of allocations.
+	 */
+	uint64_t		mg_max_alloc_queue_depth;
+	int			mg_allocators;
+	metaslab_group_allocator_t	*mg_allocator;	/* array */
+	/*
+	 * A metaslab group that can no longer allocate the minimum block
+	 * size will set mg_no_free_space. Once a metaslab group is out
+	 * of space then its share of work must be distributed to other
+	 * groups.
+	 */
+	boolean_t		mg_no_free_space;
+
+	uint64_t		mg_allocations;
+	uint64_t		mg_failed_allocations;
+	uint64_t		mg_fragmentation;
+	uint64_t		mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
+
+	int			mg_ms_disabled;
+	boolean_t		mg_disabled_updating;
+	kmutex_t		mg_ms_disabled_lock;
+	kcondvar_t		mg_ms_disabled_cv;
+};
+
+/*
+ * This value defines the number of elements in the ms_lbas array. The value
+ * of 64 was chosen as it covers all power of 2 buckets up to UINT64_MAX.
+ * This is the equivalent of highbit(UINT64_MAX).
+ */
+#define	MAX_LBAS	64
+
+/*
+ * Each metaslab maintains a set of in-core trees to track metaslab
+ * operations. The in-core free tree (ms_allocatable) contains the list of
+ * free segments which are eligible for allocation. As blocks are
+ * allocated, the allocated segments are removed from the ms_allocatable and
+ * added to a per txg allocation tree (ms_allocating). As blocks are
+ * freed, they are added to the free tree (ms_freeing). These trees
+ * allow us to process all allocations and frees in syncing context
+ * where it is safe to update the on-disk space maps. An additional set
+ * of in-core trees is maintained to track deferred frees
+ * (ms_defer). Once a block is freed it will move from the
+ * ms_freed to the ms_defer tree. A deferred free means that a block
+ * has been freed but cannot be used by the pool until TXG_DEFER_SIZE
+ * transaction groups later. For example, a block that is freed in txg
+ * 50 will not be available for reallocation until txg 52 (50 +
+ * TXG_DEFER_SIZE). This provides a safety net for uberblock rollback.
+ * A pool could be safely rolled back TXG_DEFER_SIZE transaction
+ * groups while ensuring that no block has been reallocated.
+ *
+ * The simplified transition diagram looks like this:
+ *
+ *
+ *      ALLOCATE
+ *         |
+ *         V
+ * free segment (ms_allocatable) -> ms_allocating[4] -> (write to space map)
+ *         ^
+ *         |                        ms_freeing <--- FREE
+ *         |                            |
+ *         |                            v
+ *         |                        ms_freed
+ *         |                            |
+ *         +-------- ms_defer[2] <-----+-------> (write to space map)
+ *
+ *
+ * Each metaslab's space is tracked in a single space map in the MOS,
+ * which is only updated in syncing context. Each time we sync a txg,
+ * we append the allocs and frees from that txg to the space map. The
+ * pool space is only updated once all metaslabs have finished syncing.
+ *
+ * To load the in-core free tree we read the space map from disk. This
+ * object contains a series of alloc and free records that are combined
+ * to make up the list of all free segments in this metaslab. These
+ * segments are represented in-core by the ms_allocatable and are stored
+ * in an AVL tree.
+ *
+ * As the space map grows (as a result of the appends) it will
+ * eventually become space-inefficient. When the metaslab's in-core
+ * free tree is zfs_condense_pct/100 times the size of the minimal
+ * on-disk representation, we rewrite it in its minimized form. If a
+ * metaslab needs to condense then we must set the ms_condensing flag to
+ * ensure that allocations are not performed on the metaslab that is
+ * being written.
+ */
+struct metaslab {
+	/*
+	 * This is the main lock of the metaslab and its purpose is to
+	 * coordinate our allocations and frees [e.g. metaslab_block_alloc(),
+	 * metaslab_free_concrete(), ..etc] with our various syncing
+	 * procedures [e.g. metaslab_sync(), metaslab_sync_done(), ..etc].
+	 *
+	 * The lock is also used during some miscellaneous operations like
+	 * using the metaslab's histogram for the metaslab group's histogram
+	 * aggregation, or marking the metaslab for initialization.
+	 */
+	kmutex_t		ms_lock;
+
+	/*
+	 * Acquired together with the ms_lock whenever we expect to
+	 * write to metaslab data on-disk (i.e. flushing entries to
+	 * the metaslab's space map). It helps coordinate readers of
+	 * the metaslab's space map [see spa_vdev_remove_thread()]
+	 * with writers [see metaslab_sync() or metaslab_flush()].
+	 *
+	 * Note that metaslab_load(), even though a reader, uses
+	 * a completely different mechanism to deal with the reading
+	 * of the metaslab's space map based on ms_synced_length. That
+	 * said, the function still uses the ms_sync_lock after it
+	 * has read the ms_sm [see relevant comment in metaslab_load()
+	 * as to why].
+	 */
+	kmutex_t		ms_sync_lock;
+
+	kcondvar_t		ms_load_cv;
+	space_map_t		*ms_sm;
+	uint64_t		ms_id;
+	uint64_t		ms_start;
+	uint64_t		ms_size;
+	uint64_t		ms_fragmentation;
+
+	range_tree_t		*ms_allocating[TXG_SIZE];
+	range_tree_t		*ms_allocatable;
+	uint64_t		ms_allocated_this_txg;
+	uint64_t		ms_allocating_total;
+
+	/*
+	 * The following range trees are accessed only from syncing context.
+	 * ms_free*tree only have entries while syncing, and are empty
+	 * between syncs.
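The deferred-free arithmetic earlier in this comment is simple but worth making explicit. A stand-alone sketch (TXG_DEFER_SIZE is 2 in OpenZFS; the tree names in the comments refer to the fields above):

#include <stdio.h>
#include <stdint.h>

#define	TXG_DEFER_SIZE	2	/* matches the OpenZFS definition */

int
main(void)
{
	uint64_t freed_txg = 50;

	/*
	 * A block freed in txg 50 cycles ms_freeing -> ms_freed ->
	 * ms_defer[50 % TXG_DEFER_SIZE] and only rejoins
	 * ms_allocatable once txg 52 has synced.
	 */
	printf("freed in txg %llu, reusable in txg %llu\n",
	    (unsigned long long)freed_txg,
	    (unsigned long long)(freed_txg + TXG_DEFER_SIZE));
	return (0);
}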
+ */ + range_tree_t *ms_freeing; /* to free this syncing txg */ + range_tree_t *ms_freed; /* already freed this syncing txg */ + range_tree_t *ms_defer[TXG_DEFER_SIZE]; + range_tree_t *ms_checkpointing; /* to add to the checkpoint */ + + /* + * The ms_trim tree is the set of allocatable segments which are + * eligible for trimming. (When the metaslab is loaded, it's a + * subset of ms_allocatable.) It's kept in-core as long as the + * autotrim property is set and is not vacated when the metaslab + * is unloaded. Its purpose is to aggregate freed ranges to + * facilitate efficient trimming. + */ + range_tree_t *ms_trim; + + boolean_t ms_condensing; /* condensing? */ + boolean_t ms_condense_wanted; + + /* + * The number of consumers which have disabled the metaslab. + */ + uint64_t ms_disabled; + + /* + * We must always hold the ms_lock when modifying ms_loaded + * and ms_loading. + */ + boolean_t ms_loaded; + boolean_t ms_loading; + kcondvar_t ms_flush_cv; + boolean_t ms_flushing; + + /* + * The following histograms count entries that are in the + * metaslab's space map (and its histogram) but are not in + * ms_allocatable yet, because they are in ms_freed, ms_freeing, + * or ms_defer[]. + * + * When the metaslab is not loaded, its ms_weight needs to + * reflect what is allocatable (i.e. what will be part of + * ms_allocatable if it is loaded). The weight is computed from + * the spacemap histogram, but that includes ranges that are + * not yet allocatable (because they are in ms_freed, + * ms_freeing, or ms_defer[]). Therefore, when calculating the + * weight, we need to remove those ranges. + * + * The ranges in the ms_freed and ms_defer[] range trees are all + * present in the spacemap. However, the spacemap may have + * multiple entries to represent a contiguous range, because it + * is written across multiple sync passes, but the changes of + * all sync passes are consolidated into the range trees. + * Adjacent ranges that are freed in different sync passes of + * one txg will be represented separately (as 2 or more entries) + * in the space map (and its histogram), but these adjacent + * ranges will be consolidated (represented as one entry) in the + * ms_freed/ms_defer[] range trees (and their histograms). + * + * When calculating the weight, we can not simply subtract the + * range trees' histograms from the spacemap's histogram, + * because the range trees' histograms may have entries in + * higher buckets than the spacemap, due to consolidation. + * Instead we must subtract the exact entries that were added to + * the spacemap's histogram. ms_synchist and ms_deferhist[] + * represent these exact entries, so we can subtract them from + * the spacemap's histogram when calculating ms_weight. + * + * ms_synchist represents the same ranges as ms_freeing + + * ms_freed, but without consolidation across sync passes. + * + * ms_deferhist[i] represents the same ranges as ms_defer[i], + * but without consolidation across sync passes. + */ + uint64_t ms_synchist[SPACE_MAP_HISTOGRAM_SIZE]; + uint64_t ms_deferhist[TXG_DEFER_SIZE][SPACE_MAP_HISTOGRAM_SIZE]; + + /* + * Tracks the exact amount of allocated space of this metaslab + * (and specifically the metaslab's space map) up to the most + * recently completed sync pass [see usage in metaslab_sync()]. + */ + uint64_t ms_allocated_space; + int64_t ms_deferspace; /* sum of ms_defermap[] space */ + uint64_t ms_weight; /* weight vs. 
others in group */
+	uint64_t	ms_activation_weight;	/* activation weight */
+
+	/*
+	 * Tracks whenever a metaslab is selected for loading or allocation.
+	 * We use this value to determine how long the metaslab should
+	 * stay cached.
+	 */
+	uint64_t	ms_selected_txg;
+	/*
+	 * ms_load/unload_time can be used for performance monitoring
+	 * (e.g. by dtrace or mdb).
+	 */
+	hrtime_t	ms_load_time;	/* time last loaded */
+	hrtime_t	ms_unload_time;	/* time last unloaded */
+	hrtime_t	ms_selected_time; /* time last allocated from */
+
+	uint64_t	ms_alloc_txg;	/* last successful alloc (debug only) */
+	uint64_t	ms_max_size;	/* maximum allocatable size */
+
+	/*
+	 * -1 if it's not active in an allocator, otherwise set to the allocator
+	 * this metaslab is active for.
+	 */
+	int		ms_allocator;
+	boolean_t	ms_primary;	/* Only valid if ms_allocator is not -1 */
+
+	/*
+	 * The metaslab block allocators can optionally use a size-ordered
+	 * range tree and/or an array of LBAs. Not all allocators use
+	 * this functionality. The ms_allocatable_by_size should always
+	 * contain the same number of segments as the ms_allocatable. The
+	 * only difference is that the ms_allocatable_by_size is ordered by
+	 * segment sizes.
+	 */
+	zfs_btree_t	ms_allocatable_by_size;
+	zfs_btree_t	ms_unflushed_frees_by_size;
+	uint64_t	ms_lbas[MAX_LBAS];
+
+	metaslab_group_t *ms_group;	/* metaslab group */
+	avl_node_t	ms_group_node;	/* node in metaslab group tree */
+	txg_node_t	ms_txg_node;	/* per-txg dirty metaslab links */
+	avl_node_t	ms_spa_txg_node; /* node in spa_metaslabs_by_txg */
+	/*
+	 * Node in metaslab class's selected txg list
+	 */
+	multilist_node_t	ms_class_txg_node;
+
+	/*
+	 * Allocs and frees that are committed to the vdev log spacemap but
+	 * not yet to this metaslab's spacemap.
+	 */
+	range_tree_t	*ms_unflushed_allocs;
+	range_tree_t	*ms_unflushed_frees;
+
+	/*
+	 * We have flushed entries up to but not including this TXG. In
+	 * other words, all changes from this TXG and onward should not
+	 * be in this metaslab's space map and must be read from the
+	 * log space maps.
+	 */
+	uint64_t	ms_unflushed_txg;
+
+	/* updated every time we are done syncing the metaslab's space map */
+	uint64_t	ms_synced_length;
+
+	boolean_t	ms_new;
+};
+
+typedef struct metaslab_unflushed_phys {
+	/* on-disk counterpart of ms_unflushed_txg */
+	uint64_t	msp_unflushed_txg;
+} metaslab_unflushed_phys_t;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_METASLAB_IMPL_H */
diff --git a/include/sys/mmp.h b/include/sys/mmp.h
new file mode 100644
index 000000000000..ce9c4496a04f
--- /dev/null
+++ b/include/sys/mmp.h
@@ -0,0 +1,75 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2017 by Lawrence Livermore National Security, LLC.
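The ms_unflushed_txg contract above reduces to a single predicate for readers deciding where a given txg's changes live. A hedged in-kernel sketch, with a hypothetical helper name:

/*
 * Sketch only: changes from 'txg' must be read from the vdev log
 * space maps, rather than this metaslab's own space map, once
 * txg >= ms_unflushed_txg.
 */
static boolean_t
metaslab_changes_in_log(const metaslab_t *ms, uint64_t txg)
{
	return (txg >= ms->ms_unflushed_txg);
}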
+ */ + +#ifndef _SYS_MMP_H +#define _SYS_MMP_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define MMP_MIN_INTERVAL 100 /* ms */ +#define MMP_DEFAULT_INTERVAL 1000 /* ms */ +#define MMP_DEFAULT_IMPORT_INTERVALS 20 +#define MMP_DEFAULT_FAIL_INTERVALS 10 +#define MMP_MIN_FAIL_INTERVALS 2 /* min if != 0 */ +#define MMP_IMPORT_SAFETY_FACTOR 200 /* pct */ +#define MMP_INTERVAL_OK(interval) MAX(interval, MMP_MIN_INTERVAL) +#define MMP_FAIL_INTVS_OK(fails) (fails == 0 ? 0 : MAX(fails, \ + MMP_MIN_FAIL_INTERVALS)) + +typedef struct mmp_thread { + kmutex_t mmp_thread_lock; /* protect thread mgmt fields */ + kcondvar_t mmp_thread_cv; + kthread_t *mmp_thread; + uint8_t mmp_thread_exiting; + kmutex_t mmp_io_lock; /* protect below */ + hrtime_t mmp_last_write; /* last successful MMP write */ + uint64_t mmp_delay; /* decaying avg ns between MMP writes */ + uberblock_t mmp_ub; /* last ub written by sync */ + zio_t *mmp_zio_root; /* root of mmp write zios */ + uint64_t mmp_kstat_id; /* unique id for next MMP write kstat */ + int mmp_skip_error; /* reason for last skipped write */ + vdev_t *mmp_last_leaf; /* last mmp write sent here */ + uint64_t mmp_leaf_last_gen; /* last mmp write sent here */ + uint32_t mmp_seq; /* intra-second update counter */ +} mmp_thread_t; + + +extern void mmp_init(struct spa *spa); +extern void mmp_fini(struct spa *spa); +extern void mmp_thread_start(struct spa *spa); +extern void mmp_thread_stop(struct spa *spa); +extern void mmp_update_uberblock(struct spa *spa, struct uberblock *ub); +extern void mmp_signal_all_threads(void); + +/* Global tuning */ +extern int param_set_multihost_interval(ZFS_MODULE_PARAM_ARGS); +extern ulong_t zfs_multihost_interval; +extern uint_t zfs_multihost_fail_intervals; +extern uint_t zfs_multihost_import_intervals; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MMP_H */ diff --git a/include/sys/mntent.h b/include/sys/mntent.h new file mode 100644 index 000000000000..8d578f67b8a7 --- /dev/null +++ b/include/sys/mntent.h @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
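The MMP clamping macros above normalize out-of-range tunables before they are used. A stand-alone sketch of their effect, with the macros restated locally so the snippet compiles by itself:

#include <stdio.h>

#define	MAX(a, b)			((a) > (b) ? (a) : (b))
#define	MMP_MIN_INTERVAL		100	/* ms */
#define	MMP_MIN_FAIL_INTERVALS		2	/* min if != 0 */
#define	MMP_INTERVAL_OK(interval)	MAX(interval, MMP_MIN_INTERVAL)
#define	MMP_FAIL_INTVS_OK(fails) \
	((fails) == 0 ? 0 : MAX(fails, MMP_MIN_FAIL_INTERVALS))

int
main(void)
{
	/* A 10ms write interval is clamped up to the 100ms floor. */
	printf("%d\n", MMP_INTERVAL_OK(10));	/* prints 100 */
	/* Zero disables failure handling entirely; 1 is raised to 2. */
	printf("%d %d\n", MMP_FAIL_INTVS_OK(0), MMP_FAIL_INTVS_OK(1));
	return (0);
}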
+ *
+ * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T
+ * All Rights Reserved
+ */
+
+#ifndef	_SYS_MNTENT_H
+#define	_SYS_MNTENT_H
+
+#define	MNTMAXSTR	128
+
+#define	MNTTYPE_ZFS	"zfs"		/* ZFS file system */
+
+#define	MOUNT_SUCCESS	0x00		/* Success */
+#define	MOUNT_USAGE	0x01		/* Invalid invocation or permissions */
+#define	MOUNT_SYSERR	0x02		/* System error (ENOMEM, etc) */
+#define	MOUNT_SOFTWARE	0x04		/* Internal mount bug */
+#define	MOUNT_USER	0x08		/* Interrupted by user (EINTR) */
+#define	MOUNT_FILEIO	0x10		/* Error updating/locking /etc/mtab */
+#define	MOUNT_FAIL	0x20		/* Mount failed */
+#define	MOUNT_SOMEOK	0x40		/* At least one mount succeeded */
+#define	MOUNT_BUSY	0x80		/* Mount failed due to EBUSY */
+
+#define	MNTOPT_ASYNC	"async"		/* all I/O is asynchronous */
+#define	MNTOPT_ATIME	"atime"		/* update atime for files */
+#define	MNTOPT_NOATIME	"noatime"	/* do not update atime for files */
+#define	MNTOPT_AUTO	"auto"		/* automount */
+#define	MNTOPT_NOAUTO	"noauto"	/* do not automount */
+#define	MNTOPT_CONTEXT	"context"	/* selinux context */
+#define	MNTOPT_FSCONTEXT	"fscontext"	/* selinux fscontext */
+#define	MNTOPT_DEFCONTEXT	"defcontext"	/* selinux defcontext */
+#define	MNTOPT_ROOTCONTEXT	"rootcontext"	/* selinux rootcontext */
+#define	MNTOPT_DEFAULTS	"defaults"	/* defaults */
+#define	MNTOPT_DEVICES	"dev"		/* device-special allowed */
+#define	MNTOPT_NODEVICES	"nodev"	/* device-special disallowed */
+#define	MNTOPT_DIRATIME	"diratime"	/* update atime for dirs */
+#define	MNTOPT_NODIRATIME	"nodiratime"	/* do not update atime for dirs */
+#define	MNTOPT_DIRSYNC	"dirsync"	/* do dir updates synchronously */
+#define	MNTOPT_EXEC	"exec"		/* enable executables */
+#define	MNTOPT_NOEXEC	"noexec"	/* disable executables */
+#define	MNTOPT_GROUP	"group"		/* allow group mount */
+#define	MNTOPT_NOGROUP	"nogroup"	/* do not allow group mount */
+#define	MNTOPT_IVERSION	"iversion"	/* update inode version */
+#define	MNTOPT_NOIVERSION	"noiversion"	/* do not update inode version */
+#define	MNTOPT_NBMAND	"mand"		/* allow non-blocking mandatory locks */
+#define	MNTOPT_NONBMAND	"nomand"	/* deny non-blocking mandatory locks */
+#define	MNTOPT_NETDEV	"_netdev"	/* network device */
+#define	MNTOPT_NOFAIL	"nofail"	/* no failure */
+#define	MNTOPT_RELATIME	"relatime"	/* allow relative time updates */
+#define	MNTOPT_NORELATIME	"norelatime"	/* do not allow relative time updates */
+#define	MNTOPT_STRICTATIME	"strictatime"	/* strict access time updates */
+#define	MNTOPT_NOSTRICTATIME	"nostrictatime"	/* No strict access time updates */
+#define	MNTOPT_LAZYTIME	"lazytime"	/* Defer access time writing */
+#ifdef __linux__
+#define	MNTOPT_SETUID	"suid"		/* Both setuid and devices allowed */
+#define	MNTOPT_NOSETUID	"nosuid"	/* Neither setuid nor devices allowed */
+#elif defined(__FreeBSD__)
+#define	MNTOPT_SETUID	"setuid"	/* Set uid allowed */
+#define	MNTOPT_NOSETUID	"nosetuid"	/* Set uid not allowed */
+#else
+#error "unknown OS"
+#endif
+#define	MNTOPT_OWNER	"owner"		/* allow owner mount */
+#define	MNTOPT_NOOWNER	"noowner"	/* do not allow owner mount */
+#define	MNTOPT_REMOUNT	"remount"	/* change mount options */
+#define	MNTOPT_RO	"ro"		/* read only */
+#define	MNTOPT_RW	"rw"		/* read/write */
+#define	MNTOPT_SYNC	"sync"		/* all I/O is synchronous */
+#define	MNTOPT_USER	"user"		/* allow user mount */
+#define	MNTOPT_NOUSER	"nouser"	/* do not allow user mount */
+#define	MNTOPT_USERS	"users"		/* allow user mount */
+#define	MNTOPT_NOUSERS	"nousers"	/* do not allow user mount */
+#define	MNTOPT_SUB	"sub"		/*
allow mounts on subdirs */ +#define MNTOPT_NOSUB "nosub" /* do not allow mounts on subdirs */ +#define MNTOPT_QUIET "quiet" /* quiet mount */ +#define MNTOPT_LOUD "loud" /* verbose mount */ +#define MNTOPT_BIND "bind" /* remount part of a tree */ +#define MNTOPT_RBIND "rbind" /* include subtrees */ +#define MNTOPT_DIRXATTR "dirxattr" /* enable directory xattrs */ +#define MNTOPT_SAXATTR "saxattr" /* enable system-attribute xattrs */ +#define MNTOPT_XATTR "xattr" /* enable extended attributes */ +#define MNTOPT_NOXATTR "noxattr" /* disable extended attributes */ +#define MNTOPT_COMMENT "comment" /* comment */ +#define MNTOPT_ZFSUTIL "zfsutil" /* called by zfs utility */ +#define MNTOPT_ACL "acl" /* passed by util-linux-2.24 mount */ +#define MNTOPT_NOACL "noacl" /* likewise */ +#define MNTOPT_POSIXACL "posixacl" /* likewise */ +#define MNTOPT_MNTPOINT "mntpoint" /* mount point hint */ + +#endif /* _SYS_MNTENT_H */ diff --git a/include/sys/mod.h b/include/sys/mod.h new file mode 100644 index 000000000000..0ad7704afe99 --- /dev/null +++ b/include/sys/mod.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ +#ifndef _SYS_MOD_H +#define _SYS_MOD_H + +#ifdef _KERNEL +#include +#else +/* + * Exported symbols + */ +#define EXPORT_SYMBOL(x) + +#define ZFS_MODULE_DESCRIPTION(s) +#define ZFS_MODULE_AUTHOR(s) +#define ZFS_MODULE_LICENSE(s) +#define ZFS_MODULE_VERSION(s) +#endif + +#endif /* SYS_MOD_H */ diff --git a/include/sys/multilist.h b/include/sys/multilist.h new file mode 100644 index 000000000000..0c7b4075d9a3 --- /dev/null +++ b/include/sys/multilist.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_MULTILIST_H +#define _SYS_MULTILIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef list_node_t multilist_node_t; +typedef struct multilist multilist_t; +typedef struct multilist_sublist multilist_sublist_t; +typedef unsigned int multilist_sublist_index_func_t(multilist_t *, void *); + +struct multilist_sublist { + /* + * The mutex used internally to implement thread safe insertions + * and removals to this individual sublist. 
It can also be locked + * by a consumer using multilist_sublist_{lock,unlock}, which is + * useful if a consumer needs to traverse the list in a thread + * safe manner. + */ + kmutex_t mls_lock; + /* + * The actual list object containing all objects in this sublist. + */ + list_t mls_list; + /* + * Pad to cache line, in an effort to try and prevent cache line + * contention. + */ +} ____cacheline_aligned; + +struct multilist { + /* + * This is used to get to the multilist_node_t structure given + * the void *object contained on the list. + */ + size_t ml_offset; + /* + * The number of sublists used internally by this multilist. + */ + uint64_t ml_num_sublists; + /* + * The array of pointers to the actual sublists. + */ + multilist_sublist_t *ml_sublists; + /* + * Pointer to function which determines the sublist to use + * when inserting and removing objects from this multilist. + * Please see the comment above multilist_create for details. + */ + multilist_sublist_index_func_t *ml_index_func; +}; + +void multilist_destroy(multilist_t *); +multilist_t *multilist_create(size_t, size_t, multilist_sublist_index_func_t *); + +void multilist_insert(multilist_t *, void *); +void multilist_remove(multilist_t *, void *); +int multilist_is_empty(multilist_t *); + +unsigned int multilist_get_num_sublists(multilist_t *); +unsigned int multilist_get_random_index(multilist_t *); + +multilist_sublist_t *multilist_sublist_lock(multilist_t *, unsigned int); +multilist_sublist_t *multilist_sublist_lock_obj(multilist_t *, void *); +void multilist_sublist_unlock(multilist_sublist_t *); + +void multilist_sublist_insert_head(multilist_sublist_t *, void *); +void multilist_sublist_insert_tail(multilist_sublist_t *, void *); +void multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj); +void multilist_sublist_remove(multilist_sublist_t *, void *); +int multilist_sublist_is_empty(multilist_sublist_t *); +int multilist_sublist_is_empty_idx(multilist_t *, unsigned int); + +void *multilist_sublist_head(multilist_sublist_t *); +void *multilist_sublist_tail(multilist_sublist_t *); +void *multilist_sublist_next(multilist_sublist_t *, void *); +void *multilist_sublist_prev(multilist_sublist_t *, void *); + +void multilist_link_init(multilist_node_t *); +int multilist_link_active(multilist_node_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MULTILIST_H */ diff --git a/include/sys/note.h b/include/sys/note.h new file mode 100644 index 000000000000..33b5476686ea --- /dev/null +++ b/include/sys/note.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1994 by Sun Microsystems, Inc. 
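A hypothetical consumer of the multilist API declared above might look like the sketch below. The object type, index function, and helper are all illustrative; the only real requirement, per the ml_index_func comment, is that the index function map a given object to a stable sublist so insert and remove agree on which sublist lock protects it.

#include <stddef.h>		/* offsetof */
#include <sys/multilist.h>

/* Hypothetical object type; mo_node provides the list linkage. */
typedef struct my_obj {
	multilist_node_t	mo_node;
	uint64_t		mo_key;
} my_obj_t;

/* Distribute objects across sublists by hashing the key. */
static unsigned int
my_obj_index(multilist_t *ml, void *obj)
{
	my_obj_t *mo = obj;

	return ((unsigned int)(mo->mo_key %
	    multilist_get_num_sublists(ml)));
}

static void
my_obj_track(my_obj_t *mo)
{
	multilist_t *ml = multilist_create(sizeof (my_obj_t),
	    offsetof(my_obj_t, mo_node), my_obj_index);

	multilist_insert(ml, mo);	/* locks mo's sublist internally */
	multilist_remove(ml, mo);
	multilist_destroy(ml);
}

Spreading contended list operations across per-sublist locks is the point of the structure; a consumer that needs to traverse a sublist safely takes the lock explicitly via multilist_sublist_lock().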
+ */ + +/* + * sys/note.h: interface for annotating source with info for tools + * + * This is the underlying interface; NOTE (/usr/include/note.h) is the + * preferred interface, but all exported header files should include this + * file directly and use _NOTE so as not to take "NOTE" from the user's + * namespace. For consistency, *all* kernel source should use _NOTE. + * + * By default, annotations expand to nothing. This file implements + * that. Tools using annotations will interpose a different version + * of this file that will expand annotations as needed. + */ + +#ifndef _SYS_NOTE_H +#define _SYS_NOTE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _NOTE +#define _NOTE(s) +#endif + +#define NOTE(s) _NOTE(s) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NOTE_H */ diff --git a/include/sys/nvpair.h b/include/sys/nvpair.h new file mode 100644 index 000000000000..e8567933d2bc --- /dev/null +++ b/include/sys/nvpair.h @@ -0,0 +1,358 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_NVPAIR_H +#define _SYS_NVPAIR_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + DATA_TYPE_DONTCARE = -1, + DATA_TYPE_UNKNOWN = 0, + DATA_TYPE_BOOLEAN, + DATA_TYPE_BYTE, + DATA_TYPE_INT16, + DATA_TYPE_UINT16, + DATA_TYPE_INT32, + DATA_TYPE_UINT32, + DATA_TYPE_INT64, + DATA_TYPE_UINT64, + DATA_TYPE_STRING, + DATA_TYPE_BYTE_ARRAY, + DATA_TYPE_INT16_ARRAY, + DATA_TYPE_UINT16_ARRAY, + DATA_TYPE_INT32_ARRAY, + DATA_TYPE_UINT32_ARRAY, + DATA_TYPE_INT64_ARRAY, + DATA_TYPE_UINT64_ARRAY, + DATA_TYPE_STRING_ARRAY, + DATA_TYPE_HRTIME, + DATA_TYPE_NVLIST, + DATA_TYPE_NVLIST_ARRAY, + DATA_TYPE_BOOLEAN_VALUE, + DATA_TYPE_INT8, + DATA_TYPE_UINT8, + DATA_TYPE_BOOLEAN_ARRAY, + DATA_TYPE_INT8_ARRAY, +#if !defined(_KERNEL) + DATA_TYPE_UINT8_ARRAY, + DATA_TYPE_DOUBLE +#else + DATA_TYPE_UINT8_ARRAY +#endif +} data_type_t; + +typedef struct nvpair { + int32_t nvp_size; /* size of this nvpair */ + int16_t nvp_name_sz; /* length of name string */ + int16_t nvp_reserve; /* not used */ + int32_t nvp_value_elem; /* number of elements for array types */ + data_type_t nvp_type; /* type of value */ + /* name string */ + /* aligned ptr array for string arrays */ + /* aligned array of data for value */ +} nvpair_t; + +/* nvlist header */ +typedef struct nvlist { + int32_t nvl_version; + uint32_t nvl_nvflag; /* persistent flags */ + uint64_t nvl_priv; /* ptr to private data if not packed */ + uint32_t nvl_flag; + int32_t nvl_pad; /* currently not used, for alignment */ +} nvlist_t; + +/* nvp implementation version */ +#define NV_VERSION 0 + +/* nvlist pack encoding */ +#define NV_ENCODE_NATIVE 0 +#define NV_ENCODE_XDR 1 + +/* nvlist persistent unique name flags, stored in nvl_nvflags */ +#define NV_UNIQUE_NAME 0x1 +#define NV_UNIQUE_NAME_TYPE 0x2 + +/* nvlist lookup pairs related flags */ +#define NV_FLAG_NOENTOK 0x1 + +/* convenience macros */ +#define NV_ALIGN(x) (((ulong_t)(x) + 7ul) & ~7ul) +#define NV_ALIGN4(x) (((x) + 3) & ~3) + +#define NVP_SIZE(nvp) ((nvp)->nvp_size) +#define NVP_NAME(nvp) ((char *)(nvp) + sizeof (nvpair_t)) +#define NVP_TYPE(nvp) ((nvp)->nvp_type) +#define NVP_NELEM(nvp) ((nvp)->nvp_value_elem) +#define NVP_VALUE(nvp) ((char *)(nvp) + NV_ALIGN(sizeof (nvpair_t) \ + + (nvp)->nvp_name_sz)) + +#define NVL_VERSION(nvl) ((nvl)->nvl_version) +#define NVL_SIZE(nvl) ((nvl)->nvl_size) +#define NVL_FLAG(nvl) ((nvl)->nvl_flag) + +/* NV allocator framework */ +typedef struct nv_alloc_ops nv_alloc_ops_t; + +typedef struct nv_alloc { + const nv_alloc_ops_t *nva_ops; + void *nva_arg; +} nv_alloc_t; + +struct nv_alloc_ops { + int (*nv_ao_init)(nv_alloc_t *, va_list); + void (*nv_ao_fini)(nv_alloc_t *); + void *(*nv_ao_alloc)(nv_alloc_t *, size_t); + void (*nv_ao_free)(nv_alloc_t *, void *, size_t); + void (*nv_ao_reset)(nv_alloc_t *); +}; + +extern const nv_alloc_ops_t *nv_fixed_ops; +extern nv_alloc_t *nv_alloc_nosleep; + +#if defined(_KERNEL) +extern nv_alloc_t *nv_alloc_sleep; +extern nv_alloc_t *nv_alloc_pushpage; +#endif + +int nv_alloc_init(nv_alloc_t *, const nv_alloc_ops_t *, /* args */ ...); +void nv_alloc_reset(nv_alloc_t *); +void nv_alloc_fini(nv_alloc_t *); + +/* list management */ +int nvlist_alloc(nvlist_t **, uint_t, int); +void nvlist_free(nvlist_t *); +int nvlist_size(nvlist_t *, size_t *, int); +int nvlist_pack(nvlist_t *, char **, size_t *, int, int); +int nvlist_unpack(char *, size_t, nvlist_t **, int); +int nvlist_dup(nvlist_t *, nvlist_t **, int); +int nvlist_merge(nvlist_t *, nvlist_t *, int); + +uint_t 
nvlist_nvflag(nvlist_t *); + +int nvlist_xalloc(nvlist_t **, uint_t, nv_alloc_t *); +int nvlist_xpack(nvlist_t *, char **, size_t *, int, nv_alloc_t *); +int nvlist_xunpack(char *, size_t, nvlist_t **, nv_alloc_t *); +int nvlist_xdup(nvlist_t *, nvlist_t **, nv_alloc_t *); +nv_alloc_t *nvlist_lookup_nv_alloc(nvlist_t *); + +int nvlist_add_nvpair(nvlist_t *, nvpair_t *); +int nvlist_add_boolean(nvlist_t *, const char *); +int nvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +int nvlist_add_byte(nvlist_t *, const char *, uchar_t); +int nvlist_add_int8(nvlist_t *, const char *, int8_t); +int nvlist_add_uint8(nvlist_t *, const char *, uint8_t); +int nvlist_add_int16(nvlist_t *, const char *, int16_t); +int nvlist_add_uint16(nvlist_t *, const char *, uint16_t); +int nvlist_add_int32(nvlist_t *, const char *, int32_t); +int nvlist_add_uint32(nvlist_t *, const char *, uint32_t); +int nvlist_add_int64(nvlist_t *, const char *, int64_t); +int nvlist_add_uint64(nvlist_t *, const char *, uint64_t); +int nvlist_add_string(nvlist_t *, const char *, const char *); +int nvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +int nvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +int nvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +int nvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +int nvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +int nvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +int nvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +int nvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +int nvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +int nvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +int nvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +int nvlist_add_string_array(nvlist_t *, const char *, char *const *, uint_t); +int nvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); +int nvlist_add_hrtime(nvlist_t *, const char *, hrtime_t); +#if !defined(_KERNEL) +int nvlist_add_double(nvlist_t *, const char *, double); +#endif + +int nvlist_remove(nvlist_t *, const char *, data_type_t); +int nvlist_remove_all(nvlist_t *, const char *); +int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); + +int nvlist_lookup_boolean(nvlist_t *, const char *); +int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); +int nvlist_lookup_byte(nvlist_t *, const char *, uchar_t *); +int nvlist_lookup_int8(nvlist_t *, const char *, int8_t *); +int nvlist_lookup_uint8(nvlist_t *, const char *, uint8_t *); +int nvlist_lookup_int16(nvlist_t *, const char *, int16_t *); +int nvlist_lookup_uint16(nvlist_t *, const char *, uint16_t *); +int nvlist_lookup_int32(nvlist_t *, const char *, int32_t *); +int nvlist_lookup_uint32(nvlist_t *, const char *, uint32_t *); +int nvlist_lookup_int64(nvlist_t *, const char *, int64_t *); +int nvlist_lookup_uint64(nvlist_t *, const char *, uint64_t *); +int nvlist_lookup_string(nvlist_t *, const char *, char **); +int nvlist_lookup_nvlist(nvlist_t *, const char *, nvlist_t **); +int nvlist_lookup_boolean_array(nvlist_t *, const char *, + boolean_t **, uint_t *); +int nvlist_lookup_byte_array(nvlist_t *, const char *, uchar_t **, uint_t *); +int nvlist_lookup_int8_array(nvlist_t *, const char *, int8_t **, uint_t *); +int nvlist_lookup_uint8_array(nvlist_t *, const char *, uint8_t **, uint_t *); +int nvlist_lookup_int16_array(nvlist_t *, 
const char *, int16_t **, uint_t *); +int nvlist_lookup_uint16_array(nvlist_t *, const char *, uint16_t **, uint_t *); +int nvlist_lookup_int32_array(nvlist_t *, const char *, int32_t **, uint_t *); +int nvlist_lookup_uint32_array(nvlist_t *, const char *, uint32_t **, uint_t *); +int nvlist_lookup_int64_array(nvlist_t *, const char *, int64_t **, uint_t *); +int nvlist_lookup_uint64_array(nvlist_t *, const char *, uint64_t **, uint_t *); +int nvlist_lookup_string_array(nvlist_t *, const char *, char ***, uint_t *); +int nvlist_lookup_nvlist_array(nvlist_t *, const char *, + nvlist_t ***, uint_t *); +int nvlist_lookup_hrtime(nvlist_t *, const char *, hrtime_t *); +int nvlist_lookup_pairs(nvlist_t *, int, ...); +#if !defined(_KERNEL) +int nvlist_lookup_double(nvlist_t *, const char *, double *); +#endif + +int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); +int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, + int *, char **); +boolean_t nvlist_exists(nvlist_t *, const char *); +boolean_t nvlist_empty(nvlist_t *); + +/* processing nvpair */ +nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); +nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); +char *nvpair_name(nvpair_t *); +data_type_t nvpair_type(nvpair_t *); +int nvpair_type_is_array(nvpair_t *); +int nvpair_value_boolean_value(nvpair_t *, boolean_t *); +int nvpair_value_byte(nvpair_t *, uchar_t *); +int nvpair_value_int8(nvpair_t *, int8_t *); +int nvpair_value_uint8(nvpair_t *, uint8_t *); +int nvpair_value_int16(nvpair_t *, int16_t *); +int nvpair_value_uint16(nvpair_t *, uint16_t *); +int nvpair_value_int32(nvpair_t *, int32_t *); +int nvpair_value_uint32(nvpair_t *, uint32_t *); +int nvpair_value_int64(nvpair_t *, int64_t *); +int nvpair_value_uint64(nvpair_t *, uint64_t *); +int nvpair_value_string(nvpair_t *, char **); +int nvpair_value_nvlist(nvpair_t *, nvlist_t **); +int nvpair_value_boolean_array(nvpair_t *, boolean_t **, uint_t *); +int nvpair_value_byte_array(nvpair_t *, uchar_t **, uint_t *); +int nvpair_value_int8_array(nvpair_t *, int8_t **, uint_t *); +int nvpair_value_uint8_array(nvpair_t *, uint8_t **, uint_t *); +int nvpair_value_int16_array(nvpair_t *, int16_t **, uint_t *); +int nvpair_value_uint16_array(nvpair_t *, uint16_t **, uint_t *); +int nvpair_value_int32_array(nvpair_t *, int32_t **, uint_t *); +int nvpair_value_uint32_array(nvpair_t *, uint32_t **, uint_t *); +int nvpair_value_int64_array(nvpair_t *, int64_t **, uint_t *); +int nvpair_value_uint64_array(nvpair_t *, uint64_t **, uint_t *); +int nvpair_value_string_array(nvpair_t *, char ***, uint_t *); +int nvpair_value_nvlist_array(nvpair_t *, nvlist_t ***, uint_t *); +int nvpair_value_hrtime(nvpair_t *, hrtime_t *); +#if !defined(_KERNEL) +int nvpair_value_double(nvpair_t *, double *); +#endif + +nvlist_t *fnvlist_alloc(void); +void fnvlist_free(nvlist_t *); +size_t fnvlist_size(nvlist_t *); +char *fnvlist_pack(nvlist_t *, size_t *); +void fnvlist_pack_free(char *, size_t); +nvlist_t *fnvlist_unpack(char *, size_t); +nvlist_t *fnvlist_dup(nvlist_t *); +void fnvlist_merge(nvlist_t *, nvlist_t *); +size_t fnvlist_num_pairs(nvlist_t *); + +void fnvlist_add_boolean(nvlist_t *, const char *); +void fnvlist_add_boolean_value(nvlist_t *, const char *, boolean_t); +void fnvlist_add_byte(nvlist_t *, const char *, uchar_t); +void fnvlist_add_int8(nvlist_t *, const char *, int8_t); +void fnvlist_add_uint8(nvlist_t *, const char *, uint8_t); +void fnvlist_add_int16(nvlist_t *, const char *, int16_t); +void 
fnvlist_add_uint16(nvlist_t *, const char *, uint16_t); +void fnvlist_add_int32(nvlist_t *, const char *, int32_t); +void fnvlist_add_uint32(nvlist_t *, const char *, uint32_t); +void fnvlist_add_int64(nvlist_t *, const char *, int64_t); +void fnvlist_add_uint64(nvlist_t *, const char *, uint64_t); +void fnvlist_add_string(nvlist_t *, const char *, const char *); +void fnvlist_add_nvlist(nvlist_t *, const char *, nvlist_t *); +void fnvlist_add_nvpair(nvlist_t *, nvpair_t *); +void fnvlist_add_boolean_array(nvlist_t *, const char *, boolean_t *, uint_t); +void fnvlist_add_byte_array(nvlist_t *, const char *, uchar_t *, uint_t); +void fnvlist_add_int8_array(nvlist_t *, const char *, int8_t *, uint_t); +void fnvlist_add_uint8_array(nvlist_t *, const char *, uint8_t *, uint_t); +void fnvlist_add_int16_array(nvlist_t *, const char *, int16_t *, uint_t); +void fnvlist_add_uint16_array(nvlist_t *, const char *, uint16_t *, uint_t); +void fnvlist_add_int32_array(nvlist_t *, const char *, int32_t *, uint_t); +void fnvlist_add_uint32_array(nvlist_t *, const char *, uint32_t *, uint_t); +void fnvlist_add_int64_array(nvlist_t *, const char *, int64_t *, uint_t); +void fnvlist_add_uint64_array(nvlist_t *, const char *, uint64_t *, uint_t); +void fnvlist_add_string_array(nvlist_t *, const char *, char * const *, uint_t); +void fnvlist_add_nvlist_array(nvlist_t *, const char *, nvlist_t **, uint_t); + +void fnvlist_remove(nvlist_t *, const char *); +void fnvlist_remove_nvpair(nvlist_t *, nvpair_t *); + +nvpair_t *fnvlist_lookup_nvpair(nvlist_t *, const char *); +boolean_t fnvlist_lookup_boolean(nvlist_t *, const char *); +boolean_t fnvlist_lookup_boolean_value(nvlist_t *, const char *); +uchar_t fnvlist_lookup_byte(nvlist_t *, const char *); +int8_t fnvlist_lookup_int8(nvlist_t *, const char *); +int16_t fnvlist_lookup_int16(nvlist_t *, const char *); +int32_t fnvlist_lookup_int32(nvlist_t *, const char *); +int64_t fnvlist_lookup_int64(nvlist_t *, const char *); +uint8_t fnvlist_lookup_uint8(nvlist_t *, const char *); +uint16_t fnvlist_lookup_uint16(nvlist_t *, const char *); +uint32_t fnvlist_lookup_uint32(nvlist_t *, const char *); +uint64_t fnvlist_lookup_uint64(nvlist_t *, const char *); +char *fnvlist_lookup_string(nvlist_t *, const char *); +nvlist_t *fnvlist_lookup_nvlist(nvlist_t *, const char *); +boolean_t *fnvlist_lookup_boolean_array(nvlist_t *, const char *, uint_t *); +uchar_t *fnvlist_lookup_byte_array(nvlist_t *, const char *, uint_t *); +int8_t *fnvlist_lookup_int8_array(nvlist_t *, const char *, uint_t *); +uint8_t *fnvlist_lookup_uint8_array(nvlist_t *, const char *, uint_t *); +int16_t *fnvlist_lookup_int16_array(nvlist_t *, const char *, uint_t *); +uint16_t *fnvlist_lookup_uint16_array(nvlist_t *, const char *, uint_t *); +int32_t *fnvlist_lookup_int32_array(nvlist_t *, const char *, uint_t *); +uint32_t *fnvlist_lookup_uint32_array(nvlist_t *, const char *, uint_t *); +int64_t *fnvlist_lookup_int64_array(nvlist_t *, const char *, uint_t *); +uint64_t *fnvlist_lookup_uint64_array(nvlist_t *, const char *, uint_t *); + +boolean_t fnvpair_value_boolean_value(nvpair_t *nvp); +uchar_t fnvpair_value_byte(nvpair_t *nvp); +int8_t fnvpair_value_int8(nvpair_t *nvp); +int16_t fnvpair_value_int16(nvpair_t *nvp); +int32_t fnvpair_value_int32(nvpair_t *nvp); +int64_t fnvpair_value_int64(nvpair_t *nvp); +uint8_t fnvpair_value_uint8(nvpair_t *nvp); +uint16_t fnvpair_value_uint16(nvpair_t *nvp); +uint32_t fnvpair_value_uint32(nvpair_t *nvp); +uint64_t fnvpair_value_uint64(nvpair_t *nvp); +char 
*fnvpair_value_string(nvpair_t *nvp); +nvlist_t *fnvpair_value_nvlist(nvpair_t *nvp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_NVPAIR_H */ diff --git a/include/sys/nvpair_impl.h b/include/sys/nvpair_impl.h new file mode 100644 index 000000000000..c9874b3e4db7 --- /dev/null +++ b/include/sys/nvpair_impl.h @@ -0,0 +1,90 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#ifndef _NVPAIR_IMPL_H +#define _NVPAIR_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +/* + * The structures here provided for information and debugging purposes only + * may be changed in the future. + */ + +/* + * implementation linked list for pre-packed data + */ +typedef struct i_nvp i_nvp_t; + +struct i_nvp { + union { + /* ensure alignment */ + uint64_t _nvi_align; + + struct { + /* pointer to next nvpair */ + i_nvp_t *_nvi_next; + + /* pointer to prev nvpair */ + i_nvp_t *_nvi_prev; + + /* next pair in table bucket */ + i_nvp_t *_nvi_hashtable_next; + } _nvi; + } _nvi_un; + + /* nvpair */ + nvpair_t nvi_nvp; +}; +#define nvi_next _nvi_un._nvi._nvi_next +#define nvi_prev _nvi_un._nvi._nvi_prev +#define nvi_hashtable_next _nvi_un._nvi._nvi_hashtable_next + +typedef struct { + i_nvp_t *nvp_list; /* linked list of nvpairs */ + i_nvp_t *nvp_last; /* last nvpair */ + i_nvp_t *nvp_curr; /* current walker nvpair */ + nv_alloc_t *nvp_nva; /* pluggable allocator */ + uint32_t nvp_stat; /* internal state */ + + i_nvp_t **nvp_hashtable; /* table of entries used for lookup */ + uint32_t nvp_nbuckets; /* # of buckets in hash table */ + uint32_t nvp_nentries; /* # of entries in hash table */ +} nvpriv_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _NVPAIR_IMPL_H */ diff --git a/include/sys/objlist.h b/include/sys/objlist.h new file mode 100644 index 000000000000..a124a61fdc95 --- /dev/null +++ b/include/sys/objlist.h @@ -0,0 +1,51 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
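To illustrate the nvlist API declared above, here is a small userland round trip through allocation, packing, and lookup. The fnvlist_* variants VERIFY success internally, so no error handling is shown; the function name is a hypothetical example.

#include <stdio.h>
#include <sys/nvpair.h>

static void
nvlist_round_trip(void)
{
	nvlist_t *nvl = fnvlist_alloc();	/* NV_UNIQUE_NAME semantics */
	char *packed;
	size_t size;

	fnvlist_add_string(nvl, "pool", "tank");
	fnvlist_add_uint64(nvl, "version", 5000);

	packed = fnvlist_pack(nvl, &size);	/* serialize */
	nvlist_t *copy = fnvlist_unpack(packed, size);

	(void) printf("pool=%s version=%llu\n",
	    fnvlist_lookup_string(copy, "pool"),
	    (unsigned long long)fnvlist_lookup_uint64(copy, "version"));

	fnvlist_pack_free(packed, size);
	fnvlist_free(copy);
	fnvlist_free(nvl);
}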
+ */ + +#ifndef _OBJLIST_H +#define _OBJLIST_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef struct objlist_node { + list_node_t on_node; + uint64_t on_object; +} objlist_node_t; + +typedef struct objlist { + list_t ol_list; /* List of struct objnode. */ + /* + * Last object looked up. Used to assert that objects are being looked + * up in ascending order. + */ + uint64_t ol_last_lookup; +} objlist_t; + +objlist_t *objlist_create(void); +void objlist_destroy(objlist_t *); +boolean_t objlist_exists(objlist_t *, uint64_t); +void objlist_insert(objlist_t *, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _OBJLIST_H */ diff --git a/include/sys/pathname.h b/include/sys/pathname.h new file mode 100644 index 000000000000..d79cc5c01afd --- /dev/null +++ b/include/sys/pathname.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1989, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef _SYS_PATHNAME_H +#define _SYS_PATHNAME_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Pathname structure. + * System calls that operate on path names gather the path name + * from the system call into this structure and reduce it by + * peeling off translated components. If a symbolic link is + * encountered the new path name to be translated is also + * assembled in this structure. + * + * By convention pn_buf is not changed once it's been set to point + * to the underlying storage; routines which manipulate the pathname + * do so by changing pn_path and pn_pathlen. pn_pathlen is redundant + * since the path name is null-terminated, but is provided to make + * some computations faster. + */ +typedef struct pathname { + char *pn_buf; /* underlying storage */ +#if 0 /* unused in ZoL */ + char *pn_path; /* remaining pathname */ + size_t pn_pathlen; /* remaining length */ +#endif + size_t pn_bufsize; /* total size of pn_buf */ +} pathname_t; + +extern void pn_alloc(struct pathname *); +extern void pn_alloc_sz(struct pathname *, size_t); +extern void pn_free(struct pathname *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PATHNAME_H */ diff --git a/include/sys/qat.h b/include/sys/qat.h new file mode 100644 index 000000000000..9ae8eb173572 --- /dev/null +++ b/include/sys/qat.h @@ -0,0 +1,199 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
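The ascending-order contract documented for objlist above can be exercised as in the sketch below. The B_TRUE/B_FALSE outcomes assume the usual consumer behavior in which entries below the queried object are consumed as the lookup advances; treat them as illustrative rather than a specification.

#include <sys/objlist.h>

static void
objlist_demo(void)
{
	objlist_t *ol = objlist_create();

	objlist_insert(ol, 5);
	objlist_insert(ol, 9);	/* each insert must be >= the last */

	/* Lookups must ascend as well; ol_last_lookup asserts this. */
	(void) objlist_exists(ol, 5);	/* B_TRUE */
	(void) objlist_exists(ol, 7);	/* B_FALSE */
	(void) objlist_exists(ol, 9);	/* B_TRUE */

	objlist_destroy(ol);
}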
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#ifndef	_SYS_QAT_H
+#define	_SYS_QAT_H
+
+typedef enum qat_compress_dir {
+	QAT_DECOMPRESS = 0,
+	QAT_COMPRESS = 1,
+} qat_compress_dir_t;
+
+typedef enum qat_encrypt_dir {
+	QAT_DECRYPT = 0,
+	QAT_ENCRYPT = 1,
+} qat_encrypt_dir_t;
+
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include
+#include
+#include "cpa.h"
+#include "dc/cpa_dc.h"
+#include "lac/cpa_cy_sym.h"
+
+/*
+ * Buffer sizes are not restricted by the QAT hardware, but with an
+ * input buffer size between 4KB and 128KB the hardware can provide
+ * the optimal performance.
+ */
+#define	QAT_MIN_BUF_SIZE	(4*1024)
+#define	QAT_MAX_BUF_SIZE	(128*1024)
+
+/*
+ * Used for QAT kstat.
+ */
+typedef struct qat_stats {
+	/*
+	 * Number of jobs submitted to QAT compression engine.
+	 */
+	kstat_named_t comp_requests;
+	/*
+	 * Total bytes sent to QAT compression engine.
+	 */
+	kstat_named_t comp_total_in_bytes;
+	/*
+	 * Total bytes output from QAT compression engine.
+	 */
+	kstat_named_t comp_total_out_bytes;
+	/*
+	 * Number of jobs submitted to QAT de-compression engine.
+	 */
+	kstat_named_t decomp_requests;
+	/*
+	 * Total bytes sent to QAT de-compression engine.
+	 */
+	kstat_named_t decomp_total_in_bytes;
+	/*
+	 * Total bytes output from QAT de-compression engine.
+	 */
+	kstat_named_t decomp_total_out_bytes;
+	/*
+	 * Number of fails in the QAT compression / decompression engine.
+	 * Note: when a QAT error happens, it doesn't necessarily indicate a
+	 * critical hardware issue. Sometimes it is because the output buffer
+	 * is not big enough. The compression job will be transferred to the
+	 * gzip software implementation so the functionality of ZFS is not
+	 * impacted.
+	 */
+	kstat_named_t dc_fails;
+
+	/*
+	 * Number of jobs submitted to QAT encryption engine.
+	 */
+	kstat_named_t encrypt_requests;
+	/*
+	 * Total bytes sent to QAT encryption engine.
+	 */
+	kstat_named_t encrypt_total_in_bytes;
+	/*
+	 * Total bytes output from QAT encryption engine.
+	 */
+	kstat_named_t encrypt_total_out_bytes;
+	/*
+	 * Number of jobs submitted to QAT decryption engine.
+	 */
+	kstat_named_t decrypt_requests;
+	/*
+	 * Total bytes sent to QAT decryption engine.
+	 */
+	kstat_named_t decrypt_total_in_bytes;
+	/*
+	 * Total bytes output from QAT decryption engine.
+	 */
+	kstat_named_t decrypt_total_out_bytes;
+	/*
+	 * Number of fails in the QAT encryption / decryption engine.
+	 * Note: when a QAT error happens, it doesn't necessarily indicate a
+	 * critical hardware issue. The encryption job will be transferred
+	 * to the software implementation so the functionality of ZFS is
+	 * not impacted.
+	 */
+	kstat_named_t crypt_fails;
+
+	/*
+	 * Number of jobs submitted to QAT checksum engine.
+	 */
+	kstat_named_t cksum_requests;
+	/*
+	 * Total bytes sent to QAT checksum engine.
+ */ + kstat_named_t cksum_total_in_bytes; + /* + * Number of fails in the QAT checksum engine. + * Note: when a QAT error happens, it doesn't necessarily indicate a + * critical hardware issue. The checksum job will be transferred to the + * software implementation so the functionality of ZFS is not impacted. + */ + kstat_named_t cksum_fails; +} qat_stats_t; + +#define QAT_STAT_INCR(stat, val) \ + atomic_add_64(&qat_stats.stat.value.ui64, (val)) +#define QAT_STAT_BUMP(stat) \ + QAT_STAT_INCR(stat, 1) + +extern qat_stats_t qat_stats; +extern int zfs_qat_compress_disable; +extern int zfs_qat_checksum_disable; +extern int zfs_qat_encrypt_disable; + +/* inlined for performance */ +static inline struct page * +qat_mem_to_page(void *addr) +{ + if (!is_vmalloc_addr(addr)) + return (virt_to_page(addr)); + + return (vmalloc_to_page(addr)); +} + +CpaStatus qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes); +void qat_mem_free_contig(void **pp_mem_addr); +#define QAT_PHYS_CONTIG_ALLOC(pp_mem_addr, size_bytes) \ + qat_mem_alloc_contig((void *)(pp_mem_addr), (size_bytes)) +#define QAT_PHYS_CONTIG_FREE(p_mem_addr) \ + qat_mem_free_contig((void *)&(p_mem_addr)) + +extern int qat_dc_init(void); +extern void qat_dc_fini(void); +extern int qat_cy_init(void); +extern void qat_cy_fini(void); +extern int qat_init(void); +extern void qat_fini(void); + +/* fake CpaStatus used to indicate data was not compressible */ +#define CPA_STATUS_INCOMPRESSIBLE (-127) + +extern boolean_t qat_dc_use_accel(size_t s_len); +extern boolean_t qat_crypt_use_accel(size_t s_len); +extern boolean_t qat_checksum_use_accel(size_t s_len); +extern int qat_compress(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, size_t *c_len); +extern int qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, + uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, + crypto_key_t *key, uint64_t crypt, uint32_t enc_len); +extern int qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, + zio_cksum_t *zcp); +#else +#define CPA_STATUS_SUCCESS 0 +#define CPA_STATUS_INCOMPRESSIBLE (-127) +#define qat_init() +#define qat_fini() +#define qat_dc_use_accel(s_len) 0 +#define qat_crypt_use_accel(s_len) 0 +#define qat_checksum_use_accel(s_len) 0 +#define qat_compress(dir, s, sl, d, dl, cl) 0 +#define qat_crypt(dir, s, d, a, al, i, db, k, c, el) 0 +#define qat_checksum(c, buf, s, z) 0 +#endif + +#endif /* _SYS_QAT_H */ diff --git a/include/sys/range_tree.h b/include/sys/range_tree.h new file mode 100644 index 000000000000..fef3d4d7bd21 --- /dev/null +++ b/include/sys/range_tree.h @@ -0,0 +1,330 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
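The expected call pattern for the QAT hooks above is: consult the *_use_accel() predicate, try the accelerator, and fall back to software on any failure (the kstat comments note that a QAT failure is not fatal). A hedged sketch; software_gzip_compress() is a hypothetical stand-in for the software path.

#include <sys/qat.h>

static size_t
compress_with_qat_fallback(char *src, int src_len, char *dst, int dst_len)
{
	size_t c_len = 0;

	if (qat_dc_use_accel(src_len) &&
	    qat_compress(QAT_COMPRESS, src, src_len, dst, dst_len,
	    &c_len) == CPA_STATUS_SUCCESS)
		return (c_len);		/* hardware path succeeded */

	/* Fall back to the software implementation (hypothetical). */
	return (software_gzip_compress(src, src_len, dst, dst_len));
}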
+ * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. + */ + +#ifndef _SYS_RANGE_TREE_H +#define _SYS_RANGE_TREE_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define RANGE_TREE_HISTOGRAM_SIZE 64 + +typedef struct range_tree_ops range_tree_ops_t; + +typedef enum range_seg_type { + RANGE_SEG32, + RANGE_SEG64, + RANGE_SEG_GAP, + RANGE_SEG_NUM_TYPES, +} range_seg_type_t; + +/* + * Note: the range_tree may not be accessed concurrently; consumers + * must provide external locking if required. + */ +typedef struct range_tree { + zfs_btree_t rt_root; /* offset-ordered segment b-tree */ + uint64_t rt_space; /* sum of all segments in the map */ + range_seg_type_t rt_type; /* type of range_seg_t in use */ + /* + * All data that is stored in the range tree must have a start higher + * than or equal to rt_start, and all sizes and offsets must be + * multiples of 1 << rt_shift. + */ + uint8_t rt_shift; + uint64_t rt_start; + range_tree_ops_t *rt_ops; + + /* rt_btree_compare should only be set if rt_arg is a b-tree */ + void *rt_arg; + int (*rt_btree_compare)(const void *, const void *); + + uint64_t rt_gap; /* allowable inter-segment gap */ + + /* + * The rt_histogram maintains a histogram of ranges. Each bucket, + * rt_histogram[i], contains the number of ranges whose size is: + * 2^i <= size of range in bytes < 2^(i+1) + */ + uint64_t rt_histogram[RANGE_TREE_HISTOGRAM_SIZE]; +} range_tree_t; + +typedef struct range_seg32 { + uint32_t rs_start; /* starting offset of this segment */ + uint32_t rs_end; /* ending offset (non-inclusive) */ +} range_seg32_t; + +/* + * Extremely large metaslabs, vdev-wide trees, and dnode-wide trees may + * require 64-bit integers for ranges. + */ +typedef struct range_seg64 { + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ +} range_seg64_t; + +typedef struct range_seg_gap { + uint64_t rs_start; /* starting offset of this segment */ + uint64_t rs_end; /* ending offset (non-inclusive) */ + uint64_t rs_fill; /* actual fill if gap mode is on */ +} range_seg_gap_t; + +/* + * This type needs to be the largest of the range segs, since it will be stack + * allocated and then cast the actual type to do tree operations. + */ +typedef range_seg_gap_t range_seg_max_t; + +/* + * This is just for clarity of code purposes, so we can make it clear that a + * pointer is to a range seg of some type; when we need to do the actual math, + * we'll figure out the real type. 
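+ *
+ * As an illustrative sketch (editor's addition, not vendored code): a
+ * consumer walks the b-tree in rt_root and reads segments only through
+ * the accessors defined below, never through the struct fields directly:
+ *
+ *	zfs_btree_index_t where;
+ *	range_seg_t *rs;
+ *	for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL;
+ *	    rs = zfs_btree_next(&rt->rt_root, &where, &where))
+ *		visit(rs_get_start(rs, rt), rs_get_end(rs, rt));
+ *
+ * (visit() is a hypothetical callback.)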
+ */ +typedef void range_seg_t; + +struct range_tree_ops { + void (*rtop_create)(range_tree_t *rt, void *arg); + void (*rtop_destroy)(range_tree_t *rt, void *arg); + void (*rtop_add)(range_tree_t *rt, void *rs, void *arg); + void (*rtop_remove)(range_tree_t *rt, void *rs, void *arg); + void (*rtop_vacate)(range_tree_t *rt, void *arg); +}; + +static inline uint64_t +rs_get_start_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((const range_seg32_t *)rs)->rs_start); + case RANGE_SEG64: + return (((const range_seg64_t *)rs)->rs_start); + case RANGE_SEG_GAP: + return (((const range_seg_gap_t *)rs)->rs_start); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_end_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + return (((const range_seg32_t *)rs)->rs_end); + case RANGE_SEG64: + return (((const range_seg64_t *)rs)->rs_end); + case RANGE_SEG_GAP: + return (((const range_seg_gap_t *)rs)->rs_end); + default: + VERIFY(0); + return (0); + } +} + +static inline uint64_t +rs_get_fill_raw(const range_seg_t *rs, const range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: { + const range_seg32_t *r32 = (const range_seg32_t *)rs; + return (r32->rs_end - r32->rs_start); + } + case RANGE_SEG64: { + const range_seg64_t *r64 = (const range_seg64_t *)rs; + return (r64->rs_end - r64->rs_start); + } + case RANGE_SEG_GAP: + return (((const range_seg_gap_t *)rs)->rs_fill); + default: + VERIFY(0); + return (0); + } + +} + +static inline uint64_t +rs_get_start(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_start_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_end(const range_seg_t *rs, const range_tree_t *rt) +{ + return ((rs_get_end_raw(rs, rt) << rt->rt_shift) + rt->rt_start); +} + +static inline uint64_t +rs_get_fill(const range_seg_t *rs, const range_tree_t *rt) +{ + return (rs_get_fill_raw(rs, rt) << rt->rt_shift); +} + +static inline void +rs_set_start_raw(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(start, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_start = (uint32_t)start; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_start = start; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_start = start; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_end_raw(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + ASSERT3U(end, <=, UINT32_MAX); + ((range_seg32_t *)rs)->rs_end = (uint32_t)end; + break; + case RANGE_SEG64: + ((range_seg64_t *)rs)->rs_end = end; + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_end = end; + break; + default: + VERIFY(0); + } +} + +static inline void +rs_set_fill_raw(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + switch (rt->rt_type) { + case RANGE_SEG32: + /* fall through */ + case RANGE_SEG64: + ASSERT3U(fill, ==, rs_get_end_raw(rs, rt) - rs_get_start_raw(rs, + rt)); + break; + case RANGE_SEG_GAP: + ((range_seg_gap_t *)rs)->rs_fill = fill; + break; + default: + VERIFY(0); + } +} + +static inline void 
+rs_set_start(range_seg_t *rs, range_tree_t *rt, uint64_t start) +{ + ASSERT3U(start, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(start, 1ULL << rt->rt_shift)); + rs_set_start_raw(rs, rt, (start - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_end(range_seg_t *rs, range_tree_t *rt, uint64_t end) +{ + ASSERT3U(end, >=, rt->rt_start); + ASSERT(IS_P2ALIGNED(end, 1ULL << rt->rt_shift)); + rs_set_end_raw(rs, rt, (end - rt->rt_start) >> rt->rt_shift); +} + +static inline void +rs_set_fill(range_seg_t *rs, range_tree_t *rt, uint64_t fill) +{ + ASSERT(IS_P2ALIGNED(fill, 1ULL << rt->rt_shift)); + rs_set_fill_raw(rs, rt, fill >> rt->rt_shift); +} + +typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); + +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, + range_seg_type_t type, void *arg, uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), uint64_t gap); +range_tree_t *range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift); +void range_tree_destroy(range_tree_t *rt); +boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); +boolean_t range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize); +void range_tree_verify_not_present(range_tree_t *rt, + uint64_t start, uint64_t size); +void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize); +uint64_t range_tree_space(range_tree_t *rt); +uint64_t range_tree_numsegs(range_tree_t *rt); +boolean_t range_tree_is_empty(range_tree_t *rt); +void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); +void range_tree_stat_verify(range_tree_t *rt); +uint64_t range_tree_min(range_tree_t *rt); +uint64_t range_tree_max(range_tree_t *rt); +uint64_t range_tree_span(range_tree_t *rt); + +void range_tree_add(void *arg, uint64_t start, uint64_t size); +void range_tree_remove(void *arg, uint64_t start, uint64_t size); +void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); +void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); + +void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); +void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); +range_seg_t *range_tree_first(range_tree_t *rt); + +void range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto); +void range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto); + +void rt_btree_create(range_tree_t *rt, void *arg); +void rt_btree_destroy(range_tree_t *rt, void *arg); +void rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_btree_vacate(range_tree_t *rt, void *arg); +extern range_tree_ops_t rt_btree_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RANGE_TREE_H */ diff --git a/include/sys/rrwlock.h b/include/sys/rrwlock.h new file mode 100644 index 000000000000..8d296ef28ffa --- /dev/null +++ b/include/sys/rrwlock.h @@ -0,0 +1,117 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#ifndef _SYS_RR_RW_LOCK_H +#define _SYS_RR_RW_LOCK_H + + + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +extern uint_t rrw_tsd_key; + +/* + * A reader-writer lock implementation that allows re-entrant reads, but + * still gives writers priority on "new" reads. + * + * See rrwlock.c for more details about the implementation. + * + * Fields of the rrwlock_t structure: + * - rr_lock: protects modification and reading of rrwlock_t fields + * - rr_cv: cv for waking up readers or waiting writers + * - rr_writer: thread id of the current writer + * - rr_anon_rount: number of active anonymous readers + * - rr_linked_rcount: total number of non-anonymous active readers + * - rr_writer_wanted: a writer wants the lock + */ +typedef struct rrwlock { + kmutex_t rr_lock; + kcondvar_t rr_cv; + kthread_t *rr_writer; + zfs_refcount_t rr_anon_rcount; + zfs_refcount_t rr_linked_rcount; + boolean_t rr_writer_wanted; + boolean_t rr_track_all; +} rrwlock_t; + +/* + * 'tag' is used in reference counting tracking. The + * 'tag' must be the same in a rrw_enter() as in its + * corresponding rrw_exit(). + */ +void rrw_init(rrwlock_t *rrl, boolean_t track_all); +void rrw_destroy(rrwlock_t *rrl); +void rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag); +void rrw_enter_read(rrwlock_t *rrl, void *tag); +void rrw_enter_read_prio(rrwlock_t *rrl, void *tag); +void rrw_enter_write(rrwlock_t *rrl); +void rrw_exit(rrwlock_t *rrl, void *tag); +boolean_t rrw_held(rrwlock_t *rrl, krw_t rw); +void rrw_tsd_destroy(void *arg); + +#define RRW_READ_HELD(x) rrw_held(x, RW_READER) +#define RRW_WRITE_HELD(x) rrw_held(x, RW_WRITER) +#define RRW_LOCK_HELD(x) \ + (rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER)) + +/* + * A reader-mostly lock implementation, tuning above reader-writer locks + * for hightly parallel read acquisitions, pessimizing write acquisitions. + * + * This should be a prime number. See comment in rrwlock.c near + * RRM_TD_LOCK() for details. 
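+ *
+ * Sketch of the scheme (editor's illustration; see rrwlock.c for the
+ * real code): a reader hashes its thread pointer to pick a single one
+ * of the RRM_NUM_LOCKS embedded rrwlocks, so concurrent readers mostly
+ * touch different locks, while a writer must sweep all of them:
+ *
+ *	rrw_enter_read(&rrm->locks[hash(curthread)], tag);   (reader)
+ *	for (i = 0; i < RRM_NUM_LOCKS; i++)                  (writer)
+ *		rrw_enter_write(&rrm->locks[i]);
+ *
+ * (hash(curthread) abbreviates the RRM_TD_LOCK() thread hash.)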
+ */ +#define RRM_NUM_LOCKS 17 +typedef struct rrmlock { + rrwlock_t locks[RRM_NUM_LOCKS]; +} rrmlock_t; + +void rrm_init(rrmlock_t *rrl, boolean_t track_all); +void rrm_destroy(rrmlock_t *rrl); +void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag); +void rrm_enter_read(rrmlock_t *rrl, void *tag); +void rrm_enter_write(rrmlock_t *rrl); +void rrm_exit(rrmlock_t *rrl, void *tag); +boolean_t rrm_held(rrmlock_t *rrl, krw_t rw); + +#define RRM_READ_HELD(x) rrm_held(x, RW_READER) +#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER) +#define RRM_LOCK_HELD(x) \ + (rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER)) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_RR_RW_LOCK_H */ diff --git a/include/sys/sa.h b/include/sys/sa.h new file mode 100644 index 000000000000..432e0bc415c9 --- /dev/null +++ b/include/sys/sa.h @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_SA_H +#define _SYS_SA_H + +#include + +/* + * Currently available byteswap functions. + * If it all possible new attributes should used + * one of the already defined byteswap functions. + * If a new byteswap function is added then the + * ZPL/Pool version will need to be bumped. + */ + +typedef enum sa_bswap_type { + SA_UINT64_ARRAY, + SA_UINT32_ARRAY, + SA_UINT16_ARRAY, + SA_UINT8_ARRAY, + SA_ACL, +} sa_bswap_type_t; + +typedef uint16_t sa_attr_type_t; + +/* + * Attribute to register support for. + */ +typedef struct sa_attr_reg { + char *sa_name; /* attribute name */ + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; /* bswap function enum */ + sa_attr_type_t sa_attr; /* filled in during registration */ +} sa_attr_reg_t; + + +typedef void (sa_data_locator_t)(void **, uint32_t *, uint32_t, + boolean_t, void *userptr); + +/* + * array of attributes to store. + * + * This array should be treated as opaque/private data. + * The SA_BULK_ADD_ATTR() macro should be used for manipulating + * the array. + * + * When sa_replace_all_by_template() is used the attributes + * will be stored in the order defined in the array, except that + * the attributes may be split between the bonus and the spill buffer + * + */ +typedef struct sa_bulk_attr { + void *sa_data; + sa_data_locator_t *sa_data_func; + uint16_t sa_length; + sa_attr_type_t sa_attr; + /* the following are private to the sa framework */ + void *sa_addr; + uint16_t sa_buftype; + uint16_t sa_size; +} sa_bulk_attr_t; + +/* + * The on-disk format of sa_hdr_phys_t limits SA lengths to 16-bit values. 
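+ * The SA_ADD_BULK_ATTR() macro below asserts this limit on each length
+ * it records. A typical bulk lookup (editor's illustration; the
+ * SA_ZPL_*() macros and the zfsvfs/hdl variables are assumed from the
+ * ZPL, not defined here) looks like:
+ *
+ *	sa_bulk_attr_t bulk[2];
+ *	uint64_t size, links;
+ *	int count = 0;
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, &size, 8);
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8);
+ *	error = sa_bulk_lookup(hdl, bulk, count);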
+ */ +#define SA_ATTR_MAX_LEN UINT16_MAX + +/* + * special macro for adding entries for bulk attr support + * bulk - sa_bulk_attr_t + * count - integer that will be incremented during each add + * attr - attribute to manipulate + * func - function for accessing data. + * data - pointer to data. + * len - length of data + */ + +#define SA_ADD_BULK_ATTR(b, idx, attr, func, data, len) \ +{ \ + ASSERT3U(len, <=, SA_ATTR_MAX_LEN); \ + b[idx].sa_attr = attr;\ + b[idx].sa_data_func = func; \ + b[idx].sa_data = data; \ + b[idx++].sa_length = len; \ +} + +typedef struct sa_os sa_os_t; + +typedef enum sa_handle_type { + SA_HDL_SHARED, + SA_HDL_PRIVATE +} sa_handle_type_t; + +struct sa_handle; +typedef void *sa_lookup_tab_t; +typedef struct sa_handle sa_handle_t; + +typedef void (sa_update_cb_t)(sa_handle_t *, dmu_tx_t *tx); + +int sa_handle_get(objset_t *, uint64_t, void *userp, + sa_handle_type_t, sa_handle_t **); +int sa_handle_get_from_db(objset_t *, dmu_buf_t *, void *userp, + sa_handle_type_t, sa_handle_t **); +void sa_handle_destroy(sa_handle_t *); +int sa_buf_hold(objset_t *, uint64_t, void *, dmu_buf_t **); +void sa_buf_rele(dmu_buf_t *, void *); +int sa_lookup(sa_handle_t *, sa_attr_type_t, void *buf, uint32_t buflen); +int sa_update(sa_handle_t *, sa_attr_type_t, void *buf, + uint32_t buflen, dmu_tx_t *); +int sa_remove(sa_handle_t *, sa_attr_type_t, dmu_tx_t *); +int sa_bulk_lookup(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_lookup_locked(sa_handle_t *, sa_bulk_attr_t *, int count); +int sa_bulk_update(sa_handle_t *, sa_bulk_attr_t *, int count, dmu_tx_t *); +int sa_size(sa_handle_t *, sa_attr_type_t, int *); +void sa_object_info(sa_handle_t *, dmu_object_info_t *); +void sa_object_size(sa_handle_t *, uint32_t *, u_longlong_t *); +void *sa_get_userdata(sa_handle_t *); +void sa_set_userp(sa_handle_t *, void *); +dmu_buf_t *sa_get_db(sa_handle_t *); +uint64_t sa_handle_object(sa_handle_t *); +boolean_t sa_attr_would_spill(sa_handle_t *, sa_attr_type_t, int size); +void sa_spill_rele(sa_handle_t *); +void sa_register_update_callback(objset_t *, sa_update_cb_t *); +int sa_setup(objset_t *, uint64_t, sa_attr_reg_t *, int, sa_attr_type_t **); +void sa_tear_down(objset_t *); +int sa_replace_all_by_template(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +int sa_replace_all_by_template_locked(sa_handle_t *, sa_bulk_attr_t *, + int, dmu_tx_t *); +boolean_t sa_enabled(objset_t *); +void sa_cache_init(void); +void sa_cache_fini(void); +int sa_set_sa_object(objset_t *, uint64_t); +int sa_hdrsize(void *); +void sa_handle_lock(sa_handle_t *); +void sa_handle_unlock(sa_handle_t *); + +#ifdef _KERNEL +int sa_lookup_uio(sa_handle_t *, sa_attr_type_t, uio_t *); +int sa_add_projid(sa_handle_t *, dmu_tx_t *, uint64_t); +#endif + +#ifdef __cplusplus +extern "C" { +#endif + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_H */ diff --git a/include/sys/sa_impl.h b/include/sys/sa_impl.h new file mode 100644 index 000000000000..fa10aff8a306 --- /dev/null +++ b/include/sys/sa_impl.h @@ -0,0 +1,289 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_SA_IMPL_H +#define _SYS_SA_IMPL_H + +#include +#include +#include + +/* + * Array of known attributes and their + * various characteristics. + */ +typedef struct sa_attr_table { + sa_attr_type_t sa_attr; + uint8_t sa_registered; + uint16_t sa_length; + sa_bswap_type_t sa_byteswap; + char *sa_name; +} sa_attr_table_t; + +/* + * Zap attribute format for attribute registration + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | unused | len | bswap | attr num | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Zap attribute format for layout information. + * + * layout information is stored as an array of attribute numbers + * The name of the attribute is the layout number (0, 1, 2, ...) + * + * 16 0 + * +---- ---+ + * | attr # | + * +--------+ + * | attr # | + * +--- ----+ + * ...... + * + */ + +#define ATTR_BSWAP(x) BF32_GET(x, 16, 8) +#define ATTR_LENGTH(x) BF32_GET(x, 24, 16) +#define ATTR_NUM(x) BF32_GET(x, 0, 16) +#define ATTR_ENCODE(x, attr, length, bswap) \ +{ \ + BF64_SET(x, 24, 16, length); \ + BF64_SET(x, 16, 8, bswap); \ + BF64_SET(x, 0, 16, attr); \ +} + +#define TOC_OFF(x) BF32_GET(x, 0, 23) +#define TOC_ATTR_PRESENT(x) BF32_GET(x, 31, 1) +#define TOC_LEN_IDX(x) BF32_GET(x, 24, 4) +#define TOC_ATTR_ENCODE(x, len_idx, offset) \ +{ \ + BF32_SET(x, 31, 1, 1); \ + BF32_SET(x, 24, 7, len_idx); \ + BF32_SET(x, 0, 24, offset); \ +} + +#define SA_LAYOUTS "LAYOUTS" +#define SA_REGISTRY "REGISTRY" + +/* + * Each unique layout will have their own table + * sa_lot (layout_table) + */ +typedef struct sa_lot { + avl_node_t lot_num_node; + avl_node_t lot_hash_node; + uint64_t lot_num; + uint64_t lot_hash; + sa_attr_type_t *lot_attrs; /* array of attr #'s */ + uint32_t lot_var_sizes; /* how many aren't fixed size */ + uint32_t lot_attr_count; /* total attr count */ + list_t lot_idx_tab; /* should be only a couple of entries */ + int lot_instance; /* used with lot_hash to identify entry */ +} sa_lot_t; + +/* index table of offsets */ +typedef struct sa_idx_tab { + list_node_t sa_next; + sa_lot_t *sa_layout; + uint16_t *sa_variable_lengths; + zfs_refcount_t sa_refcount; + uint32_t *sa_idx_tab; /* array of offsets */ +} sa_idx_tab_t; + +/* + * Since the offset/index information into the actual data + * will usually be identical we can share that information with + * all handles that have the exact same offsets. + * + * You would typically only have a large number of different table of + * contents if you had a several variable sized attributes. + * + * Two AVL trees are used to track the attribute layout numbers. + * one is keyed by number and will be consulted when a DMU_OT_SA + * object is first read. The second tree is keyed by the hash signature + * of the attributes and will be consulted when an attribute is added + * to determine if we already have an instance of that layout. 
Both + * of these tree's are interconnected. The only difference is that + * when an entry is found in the "hash" tree the list of attributes will + * need to be compared against the list of attributes you have in hand. + * The assumption is that typically attributes will just be updated and + * adding a completely new attribute is a very rare operation. + */ +struct sa_os { + kmutex_t sa_lock; + boolean_t sa_need_attr_registration; + boolean_t sa_force_spill; + uint64_t sa_master_obj; + uint64_t sa_reg_attr_obj; + uint64_t sa_layout_attr_obj; + int sa_num_attrs; + sa_attr_table_t *sa_attr_table; /* private attr table */ + sa_update_cb_t *sa_update_cb; + avl_tree_t sa_layout_num_tree; /* keyed by layout number */ + avl_tree_t sa_layout_hash_tree; /* keyed by layout hash value */ + int sa_user_table_sz; + sa_attr_type_t *sa_user_table; /* user name->attr mapping table */ +}; + +/* + * header for all bonus and spill buffers. + * + * The header has a fixed portion with a variable number + * of "lengths" depending on the number of variable sized + * attributes which are determined by the "layout number" + */ + +#define SA_MAGIC 0x2F505A /* ZFS SA */ +typedef struct sa_hdr_phys { + uint32_t sa_magic; + /* + * Encoded with hdrsize and layout number as follows: + * 16 10 0 + * +--------+-------+ + * | hdrsz |layout | + * +--------+-------+ + * + * Bits 0-10 are the layout number + * Bits 11-16 are the size of the header. + * The hdrsize is the number * 8 + * + * For example. + * hdrsz of 1 ==> 8 byte header + * 2 ==> 16 byte header + * + */ + uint16_t sa_layout_info; + uint16_t sa_lengths[1]; /* optional sizes for variable length attrs */ + /* ... Data follows the lengths. */ +} sa_hdr_phys_t; + +#define SA_HDR_LAYOUT_NUM(hdr) BF32_GET(hdr->sa_layout_info, 0, 10) +#define SA_HDR_SIZE(hdr) BF32_GET_SB(hdr->sa_layout_info, 10, 6, 3, 0) +#define SA_HDR_LAYOUT_INFO_ENCODE(x, num, size) \ +{ \ + BF32_SET_SB(x, 10, 6, 3, 0, size); \ + BF32_SET(x, 0, 10, num); \ +} + +typedef enum sa_buf_type { + SA_BONUS = 1, + SA_SPILL = 2 +} sa_buf_type_t; + +typedef enum sa_data_op { + SA_LOOKUP, + SA_UPDATE, + SA_ADD, + SA_REPLACE, + SA_REMOVE +} sa_data_op_t; + +/* + * Opaque handle used for most sa functions + * + * This needs to be kept as small as possible. + */ + +struct sa_handle { + dmu_buf_user_t sa_dbu; + kmutex_t sa_lock; + dmu_buf_t *sa_bonus; + dmu_buf_t *sa_spill; + objset_t *sa_os; + void *sa_userp; + sa_idx_tab_t *sa_bonus_tab; /* idx of bonus */ + sa_idx_tab_t *sa_spill_tab; /* only present if spill activated */ +}; + +#define SA_GET_DB(hdl, type) \ + (dmu_buf_impl_t *)((type == SA_BONUS) ? hdl->sa_bonus : hdl->sa_spill) + +#define SA_GET_HDR(hdl, type) \ + ((sa_hdr_phys_t *)((dmu_buf_impl_t *)(SA_GET_DB(hdl, \ + type))->db.db_data)) + +#define SA_IDX_TAB_GET(hdl, type) \ + (type == SA_BONUS ? hdl->sa_bonus_tab : hdl->sa_spill_tab) + +#define IS_SA_BONUSTYPE(a) \ + ((a == DMU_OT_SA) ? B_TRUE : B_FALSE) + +#define SA_BONUSTYPE_FROM_DB(db) \ + (dmu_get_bonustype((dmu_buf_t *)db)) + +#define SA_BLKPTR_SPACE (DN_OLD_MAX_BONUSLEN - sizeof (blkptr_t)) + +#define SA_LAYOUT_NUM(x, type) \ + ((!IS_SA_BONUSTYPE(type) ? 0 : (((IS_SA_BONUSTYPE(type)) && \ + ((SA_HDR_LAYOUT_NUM(x)) == 0)) ? 
1 : SA_HDR_LAYOUT_NUM(x)))) + + +#define SA_REGISTERED_LEN(sa, attr) sa->sa_attr_table[attr].sa_length + +#define SA_ATTR_LEN(sa, idx, attr, hdr) ((SA_REGISTERED_LEN(sa, attr) == 0) ?\ + hdr->sa_lengths[TOC_LEN_IDX(idx->sa_idx_tab[attr])] : \ + SA_REGISTERED_LEN(sa, attr)) + +#define SA_SET_HDR(hdr, num, size) \ + { \ + hdr->sa_magic = SA_MAGIC; \ + SA_HDR_LAYOUT_INFO_ENCODE(hdr->sa_layout_info, num, size); \ + } + +#define SA_ATTR_INFO(sa, idx, hdr, attr, bulk, type, hdl) \ + { \ + bulk.sa_size = SA_ATTR_LEN(sa, idx, attr, hdr); \ + bulk.sa_buftype = type; \ + bulk.sa_addr = \ + (void *)((uintptr_t)TOC_OFF(idx->sa_idx_tab[attr]) + \ + (uintptr_t)hdr); \ +} + +#define SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb) \ + (SA_HDR_SIZE(hdr) == (sizeof (sa_hdr_phys_t) + \ + (tb->lot_var_sizes > 1 ? P2ROUNDUP((tb->lot_var_sizes - 1) * \ + sizeof (uint16_t), 8) : 0))) + +int sa_add_impl(sa_handle_t *, sa_attr_type_t, + uint32_t, sa_data_locator_t, void *, dmu_tx_t *); + +void sa_register_update_callback_locked(objset_t *, sa_update_cb_t *); +int sa_size_locked(sa_handle_t *, sa_attr_type_t, int *); + +void sa_default_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +int sa_attr_size(sa_os_t *, sa_idx_tab_t *, sa_attr_type_t, + uint16_t *, sa_hdr_phys_t *); + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SA_IMPL_H */ diff --git a/include/sys/skein.h b/include/sys/skein.h new file mode 100644 index 000000000000..2f649d6b269a --- /dev/null +++ b/include/sys/skein.h @@ -0,0 +1,183 @@ +/* + * Interface declarations for Skein hashing. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. + * + * The following compile-time switches may be defined to control some + * tradeoffs between speed, code size, error checking, and security. + * + * The "default" note explains what happens when the switch is not defined. + * + * SKEIN_DEBUG -- make callouts from inside Skein code + * to examine/display intermediate values. + * [default: no callouts (no overhead)] + * + * SKEIN_ERR_CHECK -- how error checking is handled inside Skein + * code. If not defined, most error checking + * is disabled (for performance). Otherwise, + * the switch value is interpreted as: + * 0: use assert() to flag errors + * 1: return SKEIN_FAIL to flag errors + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. 
*/ +#ifndef _SYS_SKEIN_H_ +#define _SYS_SKEIN_H_ + +#ifdef _KERNEL +#include /* get size_t definition */ +#else +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +enum { + SKEIN_SUCCESS = 0, /* return codes from Skein calls */ + SKEIN_FAIL = 1, + SKEIN_BAD_HASHLEN = 2 +}; + +#define SKEIN_MODIFIER_WORDS (2) /* number of modifier (tweak) words */ + +#define SKEIN_256_STATE_WORDS (4) +#define SKEIN_512_STATE_WORDS (8) +#define SKEIN1024_STATE_WORDS (16) +#define SKEIN_MAX_STATE_WORDS (16) + +#define SKEIN_256_STATE_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BYTES (8 * SKEIN1024_STATE_WORDS) + +#define SKEIN_256_STATE_BITS (64 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_STATE_BITS (64 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_STATE_BITS (64 * SKEIN1024_STATE_WORDS) + +#define SKEIN_256_BLOCK_BYTES (8 * SKEIN_256_STATE_WORDS) +#define SKEIN_512_BLOCK_BYTES (8 * SKEIN_512_STATE_WORDS) +#define SKEIN1024_BLOCK_BYTES (8 * SKEIN1024_STATE_WORDS) + +typedef struct { + size_t hashBitLen; /* size of hash result, in bits */ + size_t bCnt; /* current byte count in buffer b[] */ + /* tweak words: T[0]=byte cnt, T[1]=flags */ + uint64_t T[SKEIN_MODIFIER_WORDS]; +} Skein_Ctxt_Hdr_t; + +typedef struct { /* 256-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + uint64_t X[SKEIN_256_STATE_WORDS]; /* chaining variables */ + /* partial block buffer (8-byte aligned) */ + uint8_t b[SKEIN_256_BLOCK_BYTES]; +} Skein_256_Ctxt_t; + +typedef struct { /* 512-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + uint64_t X[SKEIN_512_STATE_WORDS]; /* chaining variables */ + /* partial block buffer (8-byte aligned) */ + uint8_t b[SKEIN_512_BLOCK_BYTES]; +} Skein_512_Ctxt_t; + +typedef struct { /* 1024-bit Skein hash context structure */ + Skein_Ctxt_Hdr_t h; /* common header context variables */ + uint64_t X[SKEIN1024_STATE_WORDS]; /* chaining variables */ + /* partial block buffer (8-byte aligned) */ + uint8_t b[SKEIN1024_BLOCK_BYTES]; +} Skein1024_Ctxt_t; + +/* Skein APIs for (incremental) "straight hashing" */ +int Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen); +int Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen); +int Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen); + +int Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, + size_t msgByteCnt); +int Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, + size_t msgByteCnt); +int Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, + size_t msgByteCnt); + +int Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal); +int Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal); +int Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal); + +/* + * Skein APIs for "extended" initialization: MAC keys, tree hashing. + * After an InitExt() call, just use Update/Final calls as with Init(). + * + * Notes: Same parameters as _Init() calls, plus treeInfo/key/keyBytes. + * When keyBytes == 0 and treeInfo == SKEIN_SEQUENTIAL, + * the results of InitExt() are identical to calling Init(). + * The function Init() may be called once to "precompute" the IV for + * a given hashBitLen value, then by saving a copy of the context + * the IV computation may be avoided in later calls. 
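+ *	For illustration (editor's sketch):
+ *		static Skein_512_Ctxt_t iv_ctx;         (IV precomputed once)
+ *		Skein_512_Init(&iv_ctx, 256);
+ *		...
+ *		Skein_512_Ctxt_t ctx = iv_ctx;          (struct copy per message)
+ *		Skein_512_Update(&ctx, msg, msgBytes);
+ *		Skein_512_Final(&ctx, hashVal);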
+ * Similarly, the function InitExt() may be called once per MAC key + * to precompute the MAC IV, then a copy of the context saved and + * reused for each new MAC computation. + */ +int Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, + uint64_t treeInfo, const uint8_t *key, size_t keyBytes); +int Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, + uint64_t treeInfo, const uint8_t *key, size_t keyBytes); +int Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, + uint64_t treeInfo, const uint8_t *key, size_t keyBytes); + +/* + * Skein APIs for MAC and tree hash: + * Final_Pad: pad, do final block, but no OUTPUT type + * Output: do just the output stage + */ +int Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal); +int Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal); +int Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal); + +#ifndef SKEIN_TREE_HASH +#define SKEIN_TREE_HASH (1) +#endif +#if SKEIN_TREE_HASH +int Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal); +int Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal); +int Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal); +#endif + +/* + * When you initialize a Skein KCF hashing method you can pass this param + * structure in cm_param to fine-tune the algorithm's defaults. + */ +typedef struct skein_param { + size_t sp_digest_bitlen; /* length of digest in bits */ +} skein_param_t; + +/* Module definitions */ +#ifdef SKEIN_MODULE_IMPL +#define CKM_SKEIN_256 "CKM_SKEIN_256" +#define CKM_SKEIN_512 "CKM_SKEIN_512" +#define CKM_SKEIN1024 "CKM_SKEIN1024" +#define CKM_SKEIN_256_MAC "CKM_SKEIN_256_MAC" +#define CKM_SKEIN_512_MAC "CKM_SKEIN_512_MAC" +#define CKM_SKEIN1024_MAC "CKM_SKEIN1024_MAC" + +typedef enum skein_mech_type { + SKEIN_256_MECH_INFO_TYPE, + SKEIN_512_MECH_INFO_TYPE, + SKEIN1024_MECH_INFO_TYPE, + SKEIN_256_MAC_MECH_INFO_TYPE, + SKEIN_512_MAC_MECH_INFO_TYPE, + SKEIN1024_MAC_MECH_INFO_TYPE +} skein_mech_type_t; + +#define VALID_SKEIN_DIGEST_MECH(__mech) \ + ((int)(__mech) >= SKEIN_256_MECH_INFO_TYPE && \ + (__mech) <= SKEIN1024_MECH_INFO_TYPE) +#define VALID_SKEIN_MAC_MECH(__mech) \ + ((int)(__mech) >= SKEIN_256_MAC_MECH_INFO_TYPE && \ + (__mech) <= SKEIN1024_MAC_MECH_INFO_TYPE) +#endif /* SKEIN_MODULE_IMPL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SKEIN_H_ */ diff --git a/include/sys/spa.h b/include/sys/spa.h new file mode 100644 index 000000000000..e53d0d64c302 --- /dev/null +++ b/include/sys/spa.h @@ -0,0 +1,1224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. 
All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. + */ + +#ifndef _SYS_SPA_H +#define _SYS_SPA_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Forward references that lots of things need. + */ +typedef struct spa spa_t; +typedef struct vdev vdev_t; +typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; +typedef struct zilog zilog_t; +typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; +typedef struct zbookmark_phys zbookmark_phys_t; + +struct bpobj; +struct bplist; +struct dsl_pool; +struct dsl_dataset; +struct dsl_crypto_params; + +/* + * We currently support block sizes from 512 bytes to 16MB. + * The benefits of larger blocks, and thus larger IO, need to be weighed + * against the cost of COWing a giant block to modify one byte, and the + * large latency of reading or writing a large block. + * + * Note that although blocks up to 16MB are supported, the recordsize + * property can not be set larger than zfs_max_recordsize (default 1MB). + * See the comment near zfs_max_recordsize in dsl_dataset.c for details. + * + * Note that although the LSIZE field of the blkptr_t can store sizes up + * to 32MB, the dnode's dn_datablkszsec can only store sizes up to + * 32MB - 512 bytes. Therefore, we limit SPA_MAXBLOCKSIZE to 16MB. + */ +#define SPA_MINBLOCKSHIFT 9 +#define SPA_OLD_MAXBLOCKSHIFT 17 +#define SPA_MAXBLOCKSHIFT 24 +#define SPA_MINBLOCKSIZE (1ULL << SPA_MINBLOCKSHIFT) +#define SPA_OLD_MAXBLOCKSIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) +#define SPA_MAXBLOCKSIZE (1ULL << SPA_MAXBLOCKSHIFT) + +/* + * Alignment Shift (ashift) is an immutable, internal top-level vdev property + * which can only be set at vdev creation time. Physical writes are always done + * according to it, which makes 2^ashift the smallest possible IO on a vdev. + * + * We currently allow values ranging from 512 bytes (2^9 = 512) to 64 KiB + * (2^16 = 65,536). + */ +#define ASHIFT_MIN 9 +#define ASHIFT_MAX 16 + +/* + * Size of block to hold the configuration data (a packed nvlist) + */ +#define SPA_CONFIG_BLOCKSIZE (1ULL << 14) + +/* + * The DVA size encodings for LSIZE and PSIZE support blocks up to 32MB. + * The ASIZE encoding should be at least 64 times larger (6 more bits) + * to support up to 4-way RAID-Z mirror mode with worst-case gang block + * overhead, three DVAs per bp, plus one more bit in case we do anything + * else that expands the ASIZE. + */ +#define SPA_LSIZEBITS 16 /* LSIZE up to 32M (2^16 * 512) */ +#define SPA_PSIZEBITS 16 /* PSIZE up to 32M (2^16 * 512) */ +#define SPA_ASIZEBITS 24 /* ASIZE up to 64 times larger */ + +#define SPA_COMPRESSBITS 7 +#define SPA_VDEVBITS 24 +#define SPA_COMPRESSMASK ((1U << SPA_COMPRESSBITS) - 1) + +/* + * All SPA data is represented by 128-bit data virtual addresses (DVAs). + * The members of the dva_t should be considered opaque outside the SPA. 
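+ * Inside the SPA they are unpacked with the DVA_GET_*()/DVA_SET_*()
+ * accessors defined later in this file, e.g. (illustrative):
+ *
+ *	uint64_t vdev = DVA_GET_VDEV(dva);      (24-bit vdev id)
+ *	uint64_t offset = DVA_GET_OFFSET(dva);  (byte offset into the vdev)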
+ */ +typedef struct dva { + uint64_t dva_word[2]; +} dva_t; + + +/* + * Some checksums/hashes need a 256-bit initialization salt. This salt is kept + * secret and is suitable for use in MAC algorithms as the key. + */ +typedef struct zio_cksum_salt { + uint8_t zcs_bytes[32]; +} zio_cksum_salt_t; + +/* + * Each block is described by its DVAs, time of birth, checksum, etc. + * The word-by-word, bit-by-bit layout of the blkptr is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | pad | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | pad | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | pad | vdev3 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 |G| offset3 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | checksum[2] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | checksum[3] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * vdev virtual device ID + * offset offset into virtual device + * LSIZE logical size + * PSIZE physical size (after compression) + * ASIZE allocated size (including RAID-Z parity and gang block headers) + * GRID RAID-Z layout information (reserved for future use) + * cksum checksum function + * comp compression function + * G gang block indicator + * B byteorder (endianness) + * D dedup + * X encryption + * E blkptr_t contains embedded data (see below) + * lvl level of indirection + * type DMU object type + * phys birth txg when dva[0] was written; zero if same as logical birth txg + * note that typically all the dva's would be written in this + * txg, but they could be different if they were moved by + * device removal. + * log. birth transaction group in which the block was logically born + * fill count number of non-zero blocks under this bp + * checksum[4] 256-bit checksum of the data this bp describes + */ + +/* + * The blkptr_t's of encrypted blocks also need to store the encryption + * parameters so that the block can be decrypted. 
This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | vdev1 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 1 |G| offset1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 2 | vdev2 | GRID | ASIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 3 |G| offset2 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 4 | salt | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 5 | IV1 | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | cksum |E| comp| PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 8 | padding | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 9 | physical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | IV2 | fill count | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * c | checksum[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * d | checksum[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * e | MAC[0] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * f | MAC[1] | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * salt Salt for generating encryption keys + * IV1 First 64 bits of encryption IV + * X Block requires encryption handling (set to 1) + * E blkptr_t contains embedded data (set to 0, see below) + * fill count number of non-zero blocks under this bp (truncated to 32 bits) + * IV2 Last 32 bits of encryption IV + * checksum[2] 128-bit checksum of the data this bp describes + * MAC[2] 128-bit message authentication code for this data + * + * The X bit being set indicates that this block is one of 3 types. If this is + * a level 0 block with an encrypted object type, the block is encrypted + * (see BP_IS_ENCRYPTED()). If this is a level 0 block with an unencrypted + * object type, this block is authenticated with an HMAC (see + * BP_IS_AUTHENTICATED()). Otherwise (if level > 0), this bp will use the MAC + * words to store a checksum-of-MACs from the level below (see + * BP_HAS_INDIRECT_MAC_CKSUM()). For convenience in the code, BP_IS_PROTECTED() + * refers to both encrypted and authenticated blocks and BP_USES_CRYPT() + * refers to any of these 3 kinds of blocks. + * + * The additional encryption parameters are the salt, IV, and MAC which are + * explained in greater detail in the block comment at the top of zio_crypt.c. + * The MAC occupies half of the checksum space since it serves a very similar + * purpose: to prevent data corruption on disk. The only functional difference + * is that the checksum is used to detect on-disk corruption whether or not the + * encryption key is loaded and the MAC provides additional protection against + * malicious disk tampering. We use the 3rd DVA to store the salt and first + * 64 bits of the IV. As a result encrypted blocks can only have 2 copies + * maximum instead of the normal 3. The last 32 bits of the IV are stored in + * the upper bits of what is usually the fill count. 
Note that only blocks at + * level 0 or -2 are ever encrypted, which allows us to guarantee that these + * 32 bits are not trampled over by other code (see zio_crypt.c for details). + * The salt and IV are not used for authenticated bps or bps with an indirect + * MAC checksum, so these blocks can utilize all 3 DVAs and the full 64 bits + * for the fill count. + */ + +/* + * "Embedded" blkptr_t's don't actually point to a block, instead they + * have a data payload embedded in the blkptr_t itself. See the comment + * in blkptr.c for more details. + * + * The blkptr_t is laid out as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | payload | + * 1 | payload | + * 2 | payload | + * 3 | payload | + * 4 | payload | + * 5 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 6 |BDX|lvl| type | etype |E| comp| PSIZE| LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 7 | payload | + * 8 | payload | + * 9 | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * a | logical birth txg | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * b | payload | + * c | payload | + * d | payload | + * e | payload | + * f | payload | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * Legend: + * + * payload contains the embedded data + * B (byteorder) byteorder (endianness) + * D (dedup) padding (set to zero) + * X encryption (set to zero) + * E (embedded) set to one + * lvl indirection level + * type DMU object type + * etype how to interpret embedded data (BP_EMBEDDED_TYPE_*) + * comp compression function of payload + * PSIZE size of payload after compression, in bytes + * LSIZE logical size of payload, in bytes + * note that 25 bits is enough to store the largest + * "normal" BP's LSIZE (2^16 * 2^9) in bytes + * log. birth transaction group in which the block was logically born + * + * Note that LSIZE and PSIZE are stored in bytes, whereas for non-embedded + * bp's they are stored in units of SPA_MINBLOCKSHIFT. + * Generally, the generic BP_GET_*() macros can be used on embedded BP's. + * The B, D, X, lvl, type, and comp fields are stored the same as with normal + * BP's so the BP_SET_* macros can be used with them. etype, PSIZE, LSIZE must + * be set with the BPE_SET_* macros. BP_SET_EMBEDDED() should be called before + * other macros, as they assert that they are only used on BP's of the correct + * "embedded-ness". Encrypted blkptr_t's cannot be embedded because they use + * the payload space for encryption parameters (see the comment above on + * how encryption parameters are stored). 
+ */ + +#define BPE_GET_ETYPE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BPE_SET_ETYPE(bp, t) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, t); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_LSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 0, 25, 0, 1)) +#define BPE_SET_LSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 0, 25, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BPE_GET_PSIZE(bp) \ + (ASSERT(BP_IS_EMBEDDED(bp)), \ + BF64_GET_SB((bp)->blk_prop, 25, 7, 0, 1)) +#define BPE_SET_PSIZE(bp, x) do { \ + ASSERT(BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, 25, 7, 0, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +typedef enum bp_embedded_type { + BP_EMBEDDED_TYPE_DATA, + BP_EMBEDDED_TYPE_RESERVED, /* Reserved for Delphix byteswap feature. */ + BP_EMBEDDED_TYPE_REDACTED, + NUM_BP_EMBEDDED_TYPES +} bp_embedded_type_t; + +#define BPE_NUM_WORDS 14 +#define BPE_PAYLOAD_SIZE (BPE_NUM_WORDS * sizeof (uint64_t)) +#define BPE_IS_PAYLOADWORD(bp, wp) \ + ((wp) != &(bp)->blk_prop && (wp) != &(bp)->blk_birth) + +#define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ +#define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +#define SPA_SYNC_MIN_VDEVS 3 /* min vdevs to update during sync */ + +/* + * A block is a hole when it has either 1) never been written to, or + * 2) is zero-filled. In both cases, ZFS can return all zeroes for all reads + * without physically allocating disk space. Holes are represented in the + * blkptr_t structure by zeroed blk_dva. Correct checking for holes is + * done through the BP_IS_HOLE macro. For holes, the logical size, level, + * DMU object type, and birth times are all also stored for holes that + * were written to at some point (i.e. were punched after having been filled). + */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + +/* + * Macros to get and set fields in a bp or DVA. + */ +#define DVA_GET_ASIZE(dva) \ + BF64_GET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_ASIZE(dva, x) \ + BF64_SET_SB((dva)->dva_word[0], 0, SPA_ASIZEBITS, \ + SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GRID(dva) BF64_GET((dva)->dva_word[0], 24, 8) +#define DVA_SET_GRID(dva, x) BF64_SET((dva)->dva_word[0], 24, 8, x) + +#define DVA_GET_VDEV(dva) BF64_GET((dva)->dva_word[0], 32, SPA_VDEVBITS) +#define DVA_SET_VDEV(dva, x) \ + BF64_SET((dva)->dva_word[0], 32, SPA_VDEVBITS, x) + +#define DVA_GET_OFFSET(dva) \ + BF64_GET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_SET_OFFSET(dva, x) \ + BF64_SET_SB((dva)->dva_word[1], 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +#define DVA_GET_GANG(dva) BF64_GET((dva)->dva_word[1], 63, 1) +#define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) + +#define BP_GET_LSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? \ + (BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA ? 
BPE_GET_LSIZE(bp) : 0): \ + BF64_GET_SB((bp)->blk_prop, 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_LSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 0, SPA_LSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_PSIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + BF64_GET_SB((bp)->blk_prop, 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1)) +#define BP_SET_PSIZE(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET_SB((bp)->blk_prop, \ + 16, SPA_PSIZEBITS, SPA_MINBLOCKSHIFT, 1, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_COMPRESS(bp) \ + BF64_GET((bp)->blk_prop, 32, SPA_COMPRESSBITS) +#define BP_SET_COMPRESS(bp, x) \ + BF64_SET((bp)->blk_prop, 32, SPA_COMPRESSBITS, x) + +#define BP_IS_EMBEDDED(bp) BF64_GET((bp)->blk_prop, 39, 1) +#define BP_SET_EMBEDDED(bp, x) BF64_SET((bp)->blk_prop, 39, 1, x) + +#define BP_GET_CHECKSUM(bp) \ + (BP_IS_EMBEDDED(bp) ? ZIO_CHECKSUM_OFF : \ + BF64_GET((bp)->blk_prop, 40, 8)) +#define BP_SET_CHECKSUM(bp, x) do { \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + BF64_SET((bp)->blk_prop, 40, 8, x); \ +_NOTE(CONSTCOND) } while (0) + +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) + +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) + +/* encrypted, authenticated, and MAC cksum bps use the same bit */ +#define BP_USES_CRYPT(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_CRYPT(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) + +#define BP_IS_ENCRYPTED(bp) \ + (BP_USES_CRYPT(bp) && \ + BP_GET_LEVEL(bp) <= 0 && \ + DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) + +#define BP_IS_AUTHENTICATED(bp) \ + (BP_USES_CRYPT(bp) && \ + BP_GET_LEVEL(bp) <= 0 && \ + !DMU_OT_IS_ENCRYPTED(BP_GET_TYPE(bp))) + +#define BP_HAS_INDIRECT_MAC_CKSUM(bp) \ + (BP_USES_CRYPT(bp) && BP_GET_LEVEL(bp) > 0) + +#define BP_IS_PROTECTED(bp) \ + (BP_IS_ENCRYPTED(bp) || BP_IS_AUTHENTICATED(bp)) + +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) BF64_GET((bp)->blk_prop, 63, 1) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_GET_FREE(bp) BF64_GET((bp)->blk_fill, 0, 1) +#define BP_SET_FREE(bp, x) BF64_SET((bp)->blk_fill, 0, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + (bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + ASSERT(!BP_IS_EMBEDDED(bp)); \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 0 : (physical)); \ +} + +#define BP_GET_FILL(bp) \ + ((BP_IS_ENCRYPTED(bp)) ? BF64_GET((bp)->blk_fill, 0, 32) : \ + ((BP_IS_EMBEDDED(bp)) ? 1 : (bp)->blk_fill)) + +#define BP_SET_FILL(bp, fill) \ +{ \ + if (BP_IS_ENCRYPTED(bp)) \ + BF64_SET((bp)->blk_fill, 0, 32, fill); \ + else \ + (bp)->blk_fill = fill; \ +} + +#define BP_GET_IV2(bp) \ + (ASSERT(BP_IS_ENCRYPTED(bp)), \ + BF64_GET((bp)->blk_fill, 32, 32)) +#define BP_SET_IV2(bp, iv2) \ +{ \ + ASSERT(BP_IS_ENCRYPTED(bp)); \ + BF64_SET((bp)->blk_fill, 32, 32, iv2); \ +} + +#define BP_IS_METADATA(bp) \ + (BP_GET_LEVEL(bp) > 0 || DMU_OT_IS_METADATA(BP_GET_TYPE(bp))) + +#define BP_GET_ASIZE(bp) \ + (BP_IS_EMBEDDED(bp) ? 
0 : \ + DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + (DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) + +#define BP_GET_UCSIZE(bp) \ + (BP_IS_METADATA(bp) ? BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp)) + +#define BP_GET_NDVAS(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[0]) + \ + !!DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ + (!!DVA_GET_ASIZE(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp))) + +#define BP_COUNT_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? 0 : \ + (DVA_GET_GANG(&(bp)->blk_dva[0]) + \ + DVA_GET_GANG(&(bp)->blk_dva[1]) + \ + (DVA_GET_GANG(&(bp)->blk_dva[2]) * !BP_IS_ENCRYPTED(bp)))) + +#define DVA_EQUAL(dva1, dva2) \ + ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ + (dva1)->dva_word[0] == (dva2)->dva_word[0]) + +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + (bp1)->blk_birth == (bp2)->blk_birth && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + + +#define DVA_IS_VALID(dva) (DVA_GET_ASIZE(dva) != 0) + +#define BP_IDENTITY(bp) (ASSERT(!BP_IS_EMBEDDED(bp)), &(bp)->blk_dva[0]) +#define BP_IS_GANG(bp) \ + (BP_IS_EMBEDDED(bp) ? B_FALSE : DVA_GET_GANG(BP_IDENTITY(bp))) +#define DVA_IS_EMPTY(dva) ((dva)->dva_word[0] == 0ULL && \ + (dva)->dva_word[1] == 0ULL) +#define BP_IS_HOLE(bp) \ + (!BP_IS_EMBEDDED(bp) && DVA_IS_EMPTY(BP_IDENTITY(bp))) + +#define BP_SET_REDACTED(bp) \ +{ \ + BP_SET_EMBEDDED(bp, B_TRUE); \ + BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_REDACTED); \ +} +#define BP_IS_REDACTED(bp) \ + (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_REDACTED) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) + +#define BP_ZERO(bp) \ +{ \ + (bp)->blk_dva[0].dva_word[0] = 0; \ + (bp)->blk_dva[0].dva_word[1] = 0; \ + (bp)->blk_dva[1].dva_word[0] = 0; \ + (bp)->blk_dva[1].dva_word[1] = 0; \ + (bp)->blk_dva[2].dva_word[0] = 0; \ + (bp)->blk_dva[2].dva_word[1] = 0; \ + (bp)->blk_prop = 0; \ + (bp)->blk_pad[0] = 0; \ + (bp)->blk_pad[1] = 0; \ + (bp)->blk_phys_birth = 0; \ + (bp)->blk_birth = 0; \ + (bp)->blk_fill = 0; \ + ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ +} + +#ifdef _ZFS_BIG_ENDIAN +#define ZFS_HOST_BYTEORDER (0ULL) +#else +#define ZFS_HOST_BYTEORDER (1ULL) +#endif + +#define BP_SHOULD_BYTESWAP(bp) (BP_GET_BYTEORDER(bp) != ZFS_HOST_BYTEORDER) + +#define BP_SPRINTF_LEN 400 + +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. 
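+ *
+ * Typical use, modeled on snprintf_blkptr() in spa_misc.c (editor's
+ * illustration; the type/checksum/compress name strings are looked up in
+ * the dmu_ot, zio_checksum_table, and zio_compress_table tables elsewhere
+ * in the tree):
+ *
+ *	char blkbuf[BP_SPRINTF_LEN];
+ *	SNPRINTF_BLKPTR(snprintf, ' ', blkbuf, sizeof (blkbuf), bp,
+ *	    type, checksum, compress);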
+ */
+
+#define SNPRINTF_BLKPTR(func, ws, buf, size, bp, type, checksum, compress) \
+{ \
+ static const char *copyname[] = \
+ { "zero", "single", "double", "triple" }; \
+ int len = 0; \
+ int copies = 0; \
+ const char *crypt_type; \
+ if (bp != NULL) { \
+ if (BP_IS_ENCRYPTED(bp)) { \
+ crypt_type = "encrypted"; \
+ /* LINTED E_SUSPICIOUS_COMPARISON */ \
+ } else if (BP_IS_AUTHENTICATED(bp)) { \
+ crypt_type = "authenticated"; \
+ } else if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { \
+ crypt_type = "indirect-MAC"; \
+ } else { \
+ crypt_type = "unencrypted"; \
+ } \
+ } \
+ if (bp == NULL) { \
+ len += func(buf + len, size - len, "<NULL>"); \
+ } else if (BP_IS_HOLE(bp)) { \
+ len += func(buf + len, size - len, \
+ "HOLE [L%llu %s] " \
+ "size=%llxL birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
+ } else if (BP_IS_EMBEDDED(bp)) { \
+ len = func(buf + len, size - len, \
+ "EMBEDDED [L%llu %s] et=%u %s " \
+ "size=%llxL/%llxP birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (int)BPE_GET_ETYPE(bp), \
+ compress, \
+ (u_longlong_t)BPE_GET_LSIZE(bp), \
+ (u_longlong_t)BPE_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
+ } else if (BP_IS_REDACTED(bp)) { \
+ len += func(buf + len, size - len, \
+ "REDACTED [L%llu %s] size=%llxL birth=%lluL", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)bp->blk_birth); \
+ } else { \
+ for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \
+ const dva_t *dva = &bp->blk_dva[d]; \
+ if (DVA_IS_VALID(dva)) \
+ copies++; \
+ len += func(buf + len, size - len, \
+ "DVA[%d]=<%llu:%llx:%llx>%c", d, \
+ (u_longlong_t)DVA_GET_VDEV(dva), \
+ (u_longlong_t)DVA_GET_OFFSET(dva), \
+ (u_longlong_t)DVA_GET_ASIZE(dva), \
+ ws); \
+ } \
+ if (BP_IS_ENCRYPTED(bp)) { \
+ len += func(buf + len, size - len, \
+ "salt=%llx iv=%llx:%llx%c", \
+ (u_longlong_t)bp->blk_dva[2].dva_word[0], \
+ (u_longlong_t)bp->blk_dva[2].dva_word[1], \
+ (u_longlong_t)BP_GET_IV2(bp), \
+ ws); \
+ } \
+ if (BP_IS_GANG(bp) && \
+ DVA_GET_ASIZE(&bp->blk_dva[2]) <= \
+ DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \
+ copies--; \
+ len += func(buf + len, size - len, \
+ "[L%llu %s] %s %s %s %s %s %s %s%c" \
+ "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \
+ "cksum=%llx:%llx:%llx:%llx", \
+ (u_longlong_t)BP_GET_LEVEL(bp), \
+ type, \
+ checksum, \
+ compress, \
+ crypt_type, \
+ BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \
+ BP_IS_GANG(bp) ? "gang" : "contiguous", \
+ BP_GET_DEDUP(bp) ? "dedup" : "unique", \
+ copyname[copies], \
+ ws, \
+ (u_longlong_t)BP_GET_LSIZE(bp), \
+ (u_longlong_t)BP_GET_PSIZE(bp), \
+ (u_longlong_t)bp->blk_birth, \
+ (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \
+ (u_longlong_t)BP_GET_FILL(bp), \
+ ws, \
+ (u_longlong_t)bp->blk_cksum.zc_word[0], \
+ (u_longlong_t)bp->blk_cksum.zc_word[1], \
+ (u_longlong_t)bp->blk_cksum.zc_word[2], \
+ (u_longlong_t)bp->blk_cksum.zc_word[3]); \
+ } \
+ ASSERT(len < size); \
+}
+
+#define BP_GET_BUFC_TYPE(bp) \
+ (BP_IS_METADATA(bp) ? ARC_BUFC_METADATA : ARC_BUFC_DATA)
+
+typedef enum spa_import_type {
+ SPA_IMPORT_EXISTING,
+ SPA_IMPORT_ASSEMBLE
+} spa_import_type_t;
+
+typedef enum spa_mode {
+ SPA_MODE_UNINIT = 0,
+ SPA_MODE_READ = 1,
+ SPA_MODE_WRITE = 2,
+} spa_mode_t;
+
+/*
+ * Send TRIM commands in-line during normal pool operation while deleting.
+ * OFF: no
+ * ON: yes
+ * NB: IN_FREEBSD_BASE is defined within the FreeBSD sources.
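+ *
+ * Added usage sketch (not upstream text): callers gate in-line TRIM on
+ * the spa_get_autotrim() accessor declared further below, roughly:
+ *
+ *	if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON)
+ *		(queue an auto-TRIM I/O for the just-freed extent)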
+ */ +typedef enum { + SPA_AUTOTRIM_OFF = 0, /* default */ + SPA_AUTOTRIM_ON, +#ifdef IN_FREEBSD_BASE + SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_ON, +#else + SPA_AUTOTRIM_DEFAULT = SPA_AUTOTRIM_OFF, +#endif +} spa_autotrim_t; + +/* + * Reason TRIM command was issued, used internally for accounting purposes. + */ +typedef enum trim_type { + TRIM_TYPE_MANUAL = 0, + TRIM_TYPE_AUTO = 1, + TRIM_TYPE_SIMPLE = 2 +} trim_type_t; + +/* state manipulation functions */ +extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); +extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, + size_t buflen); +extern int spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + nvlist_t *zplprops, struct dsl_crypto_params *dcp); +extern int spa_import(char *pool, nvlist_t *config, nvlist_t *props, + uint64_t flags); +extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); +extern int spa_destroy(char *pool); +extern int spa_checkpoint(const char *pool); +extern int spa_checkpoint_discard(const char *pool); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce); +extern int spa_reset(char *pool); +extern void spa_async_request(spa_t *spa, int flag); +extern void spa_async_unrequest(spa_t *spa, int flag); +extern void spa_async_suspend(spa_t *spa); +extern void spa_async_resume(spa_t *spa); +extern int spa_async_tasks(spa_t *spa); +extern spa_t *spa_inject_addref(char *pool); +extern void spa_inject_delref(spa_t *spa); +extern void spa_scan_stat_init(spa_t *spa); +extern int spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps); +extern int bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); +extern int bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +#define SPA_ASYNC_CONFIG_UPDATE 0x01 +#define SPA_ASYNC_REMOVE 0x02 +#define SPA_ASYNC_PROBE 0x04 +#define SPA_ASYNC_RESILVER_DONE 0x08 +#define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 +#define SPA_ASYNC_REMOVE_DONE 0x40 +#define SPA_ASYNC_REMOVE_STOP 0x80 +#define SPA_ASYNC_INITIALIZE_RESTART 0x100 +#define SPA_ASYNC_TRIM_RESTART 0x200 +#define SPA_ASYNC_AUTOTRIM_RESTART 0x400 +#define SPA_ASYNC_L2CACHE_REBUILD 0x800 +#define SPA_ASYNC_L2CACHE_TRIM 0x1000 +#define SPA_ASYNC_REBUILD_DONE 0x2000 + +/* device manipulation */ +extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); +extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, + int replacing, int rebuild); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); +extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); +extern boolean_t spa_vdev_remove_active(spa_t *spa); +extern int spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, + nvlist_t *vdev_errlist); +extern int spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, + uint64_t rate, boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist); +extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); + +/* spare state (which is global across all pools) */ +extern void spa_spare_add(vdev_t *vd); +extern void spa_spare_remove(vdev_t *vd); +extern boolean_t spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt); +extern void 
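+/*
+ * Added usage sketch (not upstream text): spa_open() and spa_close()
+ * above are tag-based; the same tag must be passed to both calls,
+ * conventionally FTAG from zfs_context.h, and "tank" here is a
+ * hypothetical pool name:
+ *
+ *	spa_t *spa;
+ *	if (spa_open("tank", &spa, FTAG) == 0) {
+ *		(operate on the pool)
+ *		spa_close(spa, FTAG);
+ *	}
+ */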
spa_spare_activate(vdev_t *vd); + +/* L2ARC state (which is global across all pools) */ +extern void spa_l2cache_add(vdev_t *vd); +extern void spa_l2cache_remove(vdev_t *vd); +extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); +extern void spa_l2cache_activate(vdev_t *vd); +extern void spa_l2cache_drop(spa_t *spa); + +/* scanning */ +extern int spa_scan(spa_t *spa, pool_scan_func_t func); +extern int spa_scan_stop(spa_t *spa); +extern int spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t flag); + +/* spa syncing */ +extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ +extern void spa_sync_allpools(void); + +extern int zfs_sync_pass_deferred_free; + +/* spa namespace global mutex */ +extern kmutex_t spa_namespace_lock; + +/* + * SPA configuration functions in spa_config.c + */ + +#define SPA_CONFIG_UPDATE_POOL 0 +#define SPA_CONFIG_UPDATE_VDEVS 1 + +extern void spa_write_cachefile(spa_t *, boolean_t, boolean_t); +extern void spa_config_load(void); +extern nvlist_t *spa_all_configs(uint64_t *); +extern void spa_config_set(spa_t *spa, nvlist_t *config); +extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, + int getstats); +extern void spa_config_update(spa_t *spa, int what); +extern int spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, + vdev_t *parent, uint_t id, int atype); + + +/* + * Miscellaneous SPA routines in spa_misc.c + */ + +/* Namespace manipulation */ +extern spa_t *spa_lookup(const char *name); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); +extern void spa_remove(spa_t *spa); +extern spa_t *spa_next(spa_t *prev); + +/* Refcount functions */ +extern void spa_open_ref(spa_t *spa, void *tag); +extern void spa_close(spa_t *spa, void *tag); +extern void spa_async_close(spa_t *spa, void *tag); +extern boolean_t spa_refcount_zero(spa_t *spa); + +#define SCL_NONE 0x00 +#define SCL_CONFIG 0x01 +#define SCL_STATE 0x02 +#define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ +#define SCL_ALLOC 0x08 +#define SCL_ZIO 0x10 +#define SCL_FREE 0x20 +#define SCL_VDEV 0x40 +#define SCL_LOCKS 7 +#define SCL_ALL ((1 << SCL_LOCKS) - 1) +#define SCL_STATE_ALL (SCL_STATE | SCL_L2ARC | SCL_ZIO) + +/* Historical pool statistics */ +typedef struct spa_history_kstat { + kmutex_t lock; + uint64_t count; + uint64_t size; + kstat_t *kstat; + void *priv; + list_t list; +} spa_history_kstat_t; + +typedef struct spa_history_list { + uint64_t size; + procfs_list_t procfs_list; +} spa_history_list_t; + +typedef struct spa_stats { + spa_history_list_t read_history; + spa_history_list_t txg_history; + spa_history_kstat_t tx_assign_histogram; + spa_history_kstat_t io_history; + spa_history_list_t mmp_history; + spa_history_kstat_t state; /* pool state */ + spa_history_kstat_t iostats; +} spa_stats_t; + +typedef enum txg_state { + TXG_STATE_BIRTH = 0, + TXG_STATE_OPEN = 1, + TXG_STATE_QUIESCED = 2, + TXG_STATE_WAIT_FOR_SYNC = 3, + TXG_STATE_SYNCED = 4, + TXG_STATE_COMMITTED = 5, +} txg_state_t; + +typedef struct txg_stat { + vdev_stat_t vs1; + vdev_stat_t vs2; + uint64_t txg; + uint64_t ndirty; +} txg_stat_t; + +/* Assorted pool IO kstats */ +typedef struct spa_iostats { + kstat_named_t trim_extents_written; + kstat_named_t trim_bytes_written; + kstat_named_t trim_extents_skipped; + kstat_named_t trim_bytes_skipped; + kstat_named_t trim_extents_failed; + kstat_named_t trim_bytes_failed; + kstat_named_t autotrim_extents_written; + kstat_named_t autotrim_bytes_written; + kstat_named_t autotrim_extents_skipped; + 
kstat_named_t autotrim_bytes_skipped; + kstat_named_t autotrim_extents_failed; + kstat_named_t autotrim_bytes_failed; + kstat_named_t simple_trim_extents_written; + kstat_named_t simple_trim_bytes_written; + kstat_named_t simple_trim_extents_skipped; + kstat_named_t simple_trim_bytes_skipped; + kstat_named_t simple_trim_extents_failed; + kstat_named_t simple_trim_bytes_failed; +} spa_iostats_t; + +extern void spa_stats_init(spa_t *spa); +extern void spa_stats_destroy(spa_t *spa); +extern void spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, + uint32_t aflags); +extern void spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time); +extern int spa_txg_history_set(spa_t *spa, uint64_t txg, + txg_state_t completed_state, hrtime_t completed_time); +extern txg_stat_t *spa_txg_history_init_io(spa_t *, uint64_t, + struct dsl_pool *); +extern void spa_txg_history_fini_io(spa_t *, txg_stat_t *); +extern void spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs); +extern int spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_kstat_id); +extern int spa_mmp_history_set(spa_t *spa, uint64_t mmp_kstat_id, int io_error, + hrtime_t duration); +extern void spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_kstat_id, + int error); +extern void spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed); +extern void spa_import_progress_add(spa_t *spa); +extern void spa_import_progress_remove(uint64_t spa_guid); +extern int spa_import_progress_set_mmp_check(uint64_t pool_guid, + uint64_t mmp_sec_remaining); +extern int spa_import_progress_set_max_txg(uint64_t pool_guid, + uint64_t max_txg); +extern int spa_import_progress_set_state(uint64_t pool_guid, + spa_load_state_t spa_load_state); + +/* Pool configuration locks */ +extern int spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw); +extern void spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw); +extern void spa_config_exit(spa_t *spa, int locks, const void *tag); +extern int spa_config_held(spa_t *spa, int locks, krw_t rw); + +/* Pool vdev add/remove lock */ +extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_detach_enter(spa_t *spa, uint64_t guid); +extern uint64_t spa_vdev_config_enter(spa_t *spa); +extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); +extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); + +/* Pool vdev state change lock */ +extern void spa_vdev_state_enter(spa_t *spa, int oplock); +extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); + +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_reset_logs(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); +extern void spa_deadman(void *); + +/* Accessor functions */ +extern boolean_t spa_shutting_down(spa_t *spa); +extern struct dsl_pool *spa_get_dsl(spa_t *spa); +extern boolean_t spa_is_initializing(spa_t *spa); +extern boolean_t spa_indirect_vdevs_loaded(spa_t *spa); +extern blkptr_t 
*spa_get_rootblkptr(spa_t *spa); +extern void spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp); +extern void spa_altroot(spa_t *, char *, size_t); +extern int spa_sync_pass(spa_t *spa); +extern char *spa_name(spa_t *spa); +extern uint64_t spa_guid(spa_t *spa); +extern uint64_t spa_load_guid(spa_t *spa); +extern uint64_t spa_last_synced_txg(spa_t *spa); +extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); +extern uint64_t spa_final_dirty_txg(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); +extern uint64_t spa_freeze_txg(spa_t *spa); +extern uint64_t spa_get_worst_case_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern uint64_t spa_get_checkpoint_space(spa_t *spa); +extern uint64_t spa_get_slop_space(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); +extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); +extern metaslab_class_t *spa_special_class(spa_t *spa); +extern metaslab_class_t *spa_dedup_class(spa_t *spa); +extern metaslab_class_t *spa_preferred_class(spa_t *spa, uint64_t size, + dmu_object_type_t objtype, uint_t level, uint_t special_smallblk); + +extern void spa_evicting_os_register(spa_t *, objset_t *os); +extern void spa_evicting_os_deregister(spa_t *, objset_t *os); +extern void spa_evicting_os_wait(spa_t *spa); +extern int spa_max_replication(spa_t *spa); +extern int spa_prev_software_version(spa_t *spa); +extern uint64_t spa_get_failmode(spa_t *spa); +extern uint64_t spa_get_deadman_failmode(spa_t *spa); +extern void spa_set_deadman_failmode(spa_t *spa, const char *failmode); +extern boolean_t spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); +extern space_map_t *spa_syncing_log_sm(spa_t *spa); +extern uint64_t spa_deadman_synctime(spa_t *spa); +extern uint64_t spa_deadman_ziotime(spa_t *spa); +extern uint64_t spa_dirty_data(spa_t *spa); +extern spa_autotrim_t spa_get_autotrim(spa_t *spa); + +/* Miscellaneous support routines */ +extern void spa_load_failed(spa_t *spa, const char *fmt, ...); +extern void spa_load_note(spa_t *spa, const char *fmt, ...); +extern void spa_activate_mos_feature(spa_t *spa, const char *feature, + dmu_tx_t *tx); +extern void spa_deactivate_mos_feature(spa_t *spa, const char *feature); +extern spa_t *spa_by_guid(uint64_t pool_guid, uint64_t device_guid); +extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); +extern char *spa_strdup(const char *); +extern void spa_strfree(char *); +extern uint64_t spa_get_random(uint64_t range); +extern uint64_t spa_generate_guid(spa_t *spa); +extern void snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp); +extern void spa_freeze(spa_t *spa); +extern int spa_change_guid(spa_t *spa); +extern void spa_upgrade(spa_t *spa, uint64_t version); +extern void spa_evict_all(void); +extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, + boolean_t l2cache); +extern boolean_t spa_has_spare(spa_t *, uint64_t guid); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); +extern boolean_t spa_has_slogs(spa_t *spa); +extern 
boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern boolean_t spa_has_pending_synctask(spa_t *spa); +extern int spa_maxblocksize(spa_t *spa); +extern int spa_maxdnodesize(spa_t *spa); +extern boolean_t spa_has_checkpoint(spa_t *spa); +extern boolean_t spa_importing_readonly_checkpoint(spa_t *spa); +extern boolean_t spa_suspend_async_destroy(spa_t *spa); +extern uint64_t spa_min_claim_txg(spa_t *spa); +extern boolean_t zfs_dva_valid(spa_t *spa, const dva_t *dva, + const blkptr_t *bp); +typedef void (*spa_remap_cb_t)(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg); +extern boolean_t spa_remap_blkptr(spa_t *spa, blkptr_t *bp, + spa_remap_cb_t callback, void *arg); +extern uint64_t spa_get_last_removal_txg(spa_t *spa); +extern boolean_t spa_trust_config(spa_t *spa); +extern uint64_t spa_missing_tvds_allowed(spa_t *spa); +extern void spa_set_missing_tvds(spa_t *spa, uint64_t missing); +extern boolean_t spa_top_vdevs_spacemap_addressable(spa_t *spa); +extern uint64_t spa_total_metaslabs(spa_t *spa); +extern boolean_t spa_multihost(spa_t *spa); +extern uint32_t spa_get_hostid(spa_t *spa); +extern void spa_activate_allocation_classes(spa_t *, dmu_tx_t *); +extern boolean_t spa_livelist_delete_check(spa_t *spa); + +extern spa_mode_t spa_mode(spa_t *spa); +extern uint64_t zfs_strtonum(const char *str, char **nptr); + +extern char *spa_his_ievent_table[]; + +extern void spa_history_create_obj(spa_t *spa, dmu_tx_t *tx); +extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, + char *his_buf); +extern int spa_history_log(spa_t *spa, const char *his_buf); +extern int spa_history_log_nvl(spa_t *spa, nvlist_t *nvl); +extern void spa_history_log_version(spa_t *spa, const char *operation, + dmu_tx_t *tx); +extern void spa_history_log_internal(spa_t *spa, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); +extern void spa_history_log_internal_ds(struct dsl_dataset *ds, const char *op, + dmu_tx_t *tx, const char *fmt, ...) __printflike(4, 5); +extern void spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) 
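+/*
+ * Added usage sketch (not upstream text): the internal history loggers
+ * above take printf-style arguments, checked via __printflike, e.g.:
+ *
+ *	spa_history_log_internal(spa, "scrub setup", tx,
+ *	    "func=%u mintxg=%llu maxtxg=%llu", func, mintxg, maxtxg);
+ */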
__printflike(4, 5); + +extern const char *spa_state_to_name(spa_t *spa); + +/* error handling */ +struct zbookmark_phys; +extern void spa_log_error(spa_t *spa, const zbookmark_phys_t *zb); +extern int zfs_ereport_post(const char *clazz, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, + uint64_t length); +extern boolean_t zfs_ereport_is_valid(const char *clazz, spa_t *spa, vdev_t *vd, + zio_t *zio); +extern nvlist_t *zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, + const char *name, nvlist_t *aux); +extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate); +extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); +extern uint64_t spa_get_errlog_size(spa_t *spa); +extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); +extern void spa_errlog_rotate(spa_t *spa); +extern void spa_errlog_drain(spa_t *spa); +extern void spa_errlog_sync(spa_t *spa, uint64_t txg); +extern void spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub); + +/* vdev cache */ +extern void vdev_cache_stat_init(void); +extern void vdev_cache_stat_fini(void); + +/* vdev mirror */ +extern void vdev_mirror_stat_init(void); +extern void vdev_mirror_stat_fini(void); + +/* Initialization and termination */ +extern void spa_init(spa_mode_t mode); +extern void spa_fini(void); +extern void spa_boot_init(void); + +/* properties */ +extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); +extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); +extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); + +/* asynchronous event notification */ +extern void spa_event_notify(spa_t *spa, vdev_t *vdev, nvlist_t *hist_nvl, + const char *name); + +/* waiting for pool activities to complete */ +extern int spa_wait(const char *pool, zpool_wait_activity_t activity, + boolean_t *waited); +extern int spa_wait_tag(const char *name, zpool_wait_activity_t activity, + uint64_t tag, boolean_t *waited); +extern void spa_notify_waiters(spa_t *spa); +extern void spa_wake_waiters(spa_t *spa); + +/* module param call functions */ +int param_set_deadman_ziotime(ZFS_MODULE_PARAM_ARGS); +int param_set_deadman_synctime(ZFS_MODULE_PARAM_ARGS); +int param_set_slop_shift(ZFS_MODULE_PARAM_ARGS); +int param_set_deadman_failmode(ZFS_MODULE_PARAM_ARGS); + +#ifdef ZFS_DEBUG +#define dprintf_bp(bp, fmt, ...) do { \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ + char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ + snprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ + kmem_free(__blkbuf, BP_SPRINTF_LEN); \ + } \ +_NOTE(CONSTCOND) } while (0) +#else +#define dprintf_bp(bp, fmt, ...) +#endif + +extern spa_mode_t spa_mode_global; +extern int zfs_deadman_enabled; +extern unsigned long zfs_deadman_synctime_ms; +extern unsigned long zfs_deadman_ziotime_ms; +extern unsigned long zfs_deadman_checktime_ms; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_H */ diff --git a/include/sys/spa_boot.h b/include/sys/spa_boot.h new file mode 100644 index 000000000000..1d3622f5a108 --- /dev/null +++ b/include/sys/spa_boot.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SPA_BOOT_H +#define _SYS_SPA_BOOT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern char *spa_get_bootprop(char *prop); +extern void spa_free_bootprop(char *prop); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_BOOT_H */ diff --git a/include/sys/spa_checkpoint.h b/include/sys/spa_checkpoint.h new file mode 100644 index 000000000000..9be2b6eeab3c --- /dev/null +++ b/include/sys/spa_checkpoint.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_SPA_CHECKPOINT_H +#define _SYS_SPA_CHECKPOINT_H + +#include + +typedef struct spa_checkpoint_info { + uint64_t sci_timestamp; /* when checkpointed uberblock was synced */ + uint64_t sci_dspace; /* disk space used by checkpoint in bytes */ +} spa_checkpoint_info_t; + +int spa_checkpoint(const char *); +int spa_checkpoint_discard(const char *); + +boolean_t spa_checkpoint_discard_thread_check(void *, zthr_t *); +void spa_checkpoint_discard_thread(void *, zthr_t *); + +int spa_checkpoint_get_stats(spa_t *, pool_checkpoint_stat_t *); + +#endif /* _SYS_SPA_CHECKPOINT_H */ diff --git a/include/sys/spa_checksum.h b/include/sys/spa_checksum.h new file mode 100644 index 000000000000..b87990105a71 --- /dev/null +++ b/include/sys/spa_checksum.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SPA_CHECKSUM_H +#define _SPA_CHECKSUM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Each block has a 256-bit checksum -- strong enough for cryptographic hashes. + */ +typedef struct zio_cksum { + uint64_t zc_word[4]; +} zio_cksum_t; + +#define ZIO_SET_CHECKSUM(zcp, w0, w1, w2, w3) \ +{ \ + (zcp)->zc_word[0] = w0; \ + (zcp)->zc_word[1] = w1; \ + (zcp)->zc_word[2] = w2; \ + (zcp)->zc_word[3] = w3; \ +} + +#define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ + (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ + ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ + ((zc1).zc_word[2] - (zc2).zc_word[2]) | \ + ((zc1).zc_word[3] - (zc2).zc_word[3]))) + +#define ZIO_CHECKSUM_IS_ZERO(zc) \ + (0 == ((zc)->zc_word[0] | (zc)->zc_word[1] | \ + (zc)->zc_word[2] | (zc)->zc_word[3])) + +#define ZIO_CHECKSUM_BSWAP(zcp) \ +{ \ + (zcp)->zc_word[0] = BSWAP_64((zcp)->zc_word[0]); \ + (zcp)->zc_word[1] = BSWAP_64((zcp)->zc_word[1]); \ + (zcp)->zc_word[2] = BSWAP_64((zcp)->zc_word[2]); \ + (zcp)->zc_word[3] = BSWAP_64((zcp)->zc_word[3]); \ +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h new file mode 100644 index 000000000000..69de75fb6d3f --- /dev/null +++ b/include/sys/spa_impl.h @@ -0,0 +1,458 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. 
+ */ + +#ifndef _SYS_SPA_IMPL_H +#define _SYS_SPA_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_error_entry { + zbookmark_phys_t se_bookmark; + char *se_name; + avl_node_t se_avl; +} spa_error_entry_t; + +typedef struct spa_history_phys { + uint64_t sh_pool_create_len; /* ending offset of zpool create */ + uint64_t sh_phys_max_off; /* physical EOF */ + uint64_t sh_bof; /* logical BOF */ + uint64_t sh_eof; /* logical EOF */ + uint64_t sh_records_lost; /* num of records overwritten */ +} spa_history_phys_t; + +/* + * All members must be uint64_t, for byteswap purposes. + */ +typedef struct spa_removing_phys { + uint64_t sr_state; /* dsl_scan_state_t */ + + /* + * The vdev ID that we most recently attempted to remove, + * or -1 if no removal has been attempted. + */ + uint64_t sr_removing_vdev; + + /* + * The vdev ID that we most recently successfully removed, + * or -1 if no devices have been removed. + */ + uint64_t sr_prev_indirect_vdev; + + uint64_t sr_start_time; + uint64_t sr_end_time; + + /* + * Note that we can not use the space map's or indirect mapping's + * accounting as a substitute for these values, because we need to + * count frees of not-yet-copied data as though it did the copy. + * Otherwise, we could get into a situation where copied > to_copy, + * or we complete before copied == to_copy. + */ + uint64_t sr_to_copy; /* bytes that need to be copied */ + uint64_t sr_copied; /* bytes that have been copied or freed */ +} spa_removing_phys_t; + +/* + * This struct is stored as an entry in the DMU_POOL_DIRECTORY_OBJECT + * (with key DMU_POOL_CONDENSING_INDIRECT). It is present if a condense + * of an indirect vdev's mapping object is in progress. + */ +typedef struct spa_condensing_indirect_phys { + /* + * The vdev ID of the indirect vdev whose indirect mapping is + * being condensed. + */ + uint64_t scip_vdev; + + /* + * The vdev's old obsolete spacemap. This spacemap's contents are + * being integrated into the new mapping. + */ + uint64_t scip_prev_obsolete_sm_object; + + /* + * The new mapping object that is being created. + */ + uint64_t scip_next_mapping_object; +} spa_condensing_indirect_phys_t; + +struct spa_aux_vdev { + uint64_t sav_object; /* MOS object for device list */ + nvlist_t *sav_config; /* cached device config */ + vdev_t **sav_vdevs; /* devices */ + int sav_count; /* number devices */ + boolean_t sav_sync; /* sync the device list */ + nvlist_t **sav_pending; /* pending device additions */ + uint_t sav_npending; /* # pending devices */ +}; + +typedef struct spa_config_lock { + kmutex_t scl_lock; + kthread_t *scl_writer; + int scl_write_wanted; + kcondvar_t scl_cv; + zfs_refcount_t scl_count; +} spa_config_lock_t; + +typedef struct spa_config_dirent { + list_node_t scd_link; + char *scd_path; +} spa_config_dirent_t; + +typedef enum zio_taskq_type { + ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, + ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, + ZIO_TASKQ_TYPES +} zio_taskq_type_t; + +/* + * State machine for the zpool-poolname process. 
The state transitions
+ * are done as follows:
+ *
+ * From To Routine
+ * PROC_NONE -> PROC_CREATED spa_activate()
+ * PROC_CREATED -> PROC_ACTIVE spa_thread()
+ * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate()
+ * PROC_DEACTIVATE -> PROC_GONE spa_thread()
+ * PROC_GONE -> PROC_NONE spa_deactivate()
+ */
+typedef enum spa_proc_state {
+ SPA_PROC_NONE, /* spa_proc = &p0, no process created */
+ SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */
+ SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */
+ SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */
+ SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */
+} spa_proc_state_t;
+
+typedef struct spa_taskqs {
+ uint_t stqs_count;
+ taskq_t **stqs_taskq;
+} spa_taskqs_t;
+
+typedef enum spa_all_vdev_zap_action {
+ AVZ_ACTION_NONE = 0,
+ AVZ_ACTION_DESTROY, /* Destroy all per-vdev ZAPs and the AVZ. */
+ AVZ_ACTION_REBUILD, /* Populate the new AVZ, see spa_avz_rebuild */
+ AVZ_ACTION_INITIALIZE
+} spa_avz_action_t;
+
+typedef enum spa_config_source {
+ SPA_CONFIG_SRC_NONE = 0,
+ SPA_CONFIG_SRC_SCAN, /* scan of path (default: /dev/dsk) */
+ SPA_CONFIG_SRC_CACHEFILE, /* any cachefile */
+ SPA_CONFIG_SRC_TRYIMPORT, /* returned from call to tryimport */
+ SPA_CONFIG_SRC_SPLIT, /* new pool in a pool split */
+ SPA_CONFIG_SRC_MOS /* MOS, but not always from right txg */
+} spa_config_source_t;
+
+struct spa {
+ /*
+ * Fields protected by spa_namespace_lock.
+ */
+ char spa_name[ZFS_MAX_DATASET_NAME_LEN]; /* pool name */
+ char *spa_comment; /* comment */
+ avl_node_t spa_avl; /* node in spa_namespace_avl */
+ nvlist_t *spa_config; /* last synced config */
+ nvlist_t *spa_config_syncing; /* currently syncing config */
+ nvlist_t *spa_config_splitting; /* config for splitting */
+ nvlist_t *spa_load_info; /* info and errors from load */
+ uint64_t spa_config_txg; /* txg of last config change */
+ int spa_sync_pass; /* iterate-to-convergence */
+ pool_state_t spa_state; /* pool state */
+ int spa_inject_ref; /* injection references */
+ uint8_t spa_sync_on; /* sync threads are running */
+ spa_load_state_t spa_load_state; /* current load operation */
+ boolean_t spa_indirect_vdevs_loaded; /* mappings loaded? */
+ boolean_t spa_trust_config; /* do we trust vdev tree? */
+ boolean_t spa_is_splitting; /* in the middle of a split? */
+ spa_config_source_t spa_config_source; /* where config comes from? */
+ uint64_t spa_import_flags; /* import specific flags */
+ spa_taskqs_t spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES];
+ dsl_pool_t *spa_dsl_pool;
+ boolean_t spa_is_initializing; /* true while opening pool */
+ boolean_t spa_is_exporting; /* true while exporting pool */
+ metaslab_class_t *spa_normal_class; /* normal data class */
+ metaslab_class_t *spa_log_class; /* intent log data class */
+ metaslab_class_t *spa_special_class; /* special allocation class */
+ metaslab_class_t *spa_dedup_class; /* dedup allocation class */
+ uint64_t spa_first_txg; /* first txg after spa_open() */
+ uint64_t spa_final_txg; /* txg of export/destroy */
+ uint64_t spa_freeze_txg; /* freeze pool at this txg */
+ uint64_t spa_load_max_txg; /* best initial ub_txg */
+ uint64_t spa_claim_max_txg; /* highest claimed birth txg */
+ inode_timespec_t spa_loaded_ts; /* 1st successful open time */
+ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */
+ kmutex_t spa_evicting_os_lock; /* Evicting objset list lock */
+ list_t spa_evicting_os_list; /* Objsets being evicted.
*/ + kcondvar_t spa_evicting_os_cv; /* Objset Eviction Completion */ + txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ + vdev_t *spa_root_vdev; /* top-level vdev container */ + int spa_min_ashift; /* of vdevs in normal class */ + int spa_max_ashift; /* of vdevs in normal class */ + uint64_t spa_config_guid; /* config pool guid */ + uint64_t spa_load_guid; /* spa_load initialized guid */ + uint64_t spa_last_synced_guid; /* last synced guid */ + list_t spa_config_dirty_list; /* vdevs with dirty config */ + list_t spa_state_dirty_list; /* vdevs with dirty state */ + /* + * spa_alloc_locks and spa_alloc_trees are arrays, whose lengths are + * stored in spa_alloc_count. There is one tree and one lock for each + * allocator, to help improve allocation performance in write-heavy + * workloads. + */ + kmutex_t *spa_alloc_locks; + avl_tree_t *spa_alloc_trees; + int spa_alloc_count; + + spa_aux_vdev_t spa_spares; /* hot spares */ + spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + nvlist_t *spa_label_features; /* Features for reading MOS */ + uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ + uint64_t spa_syncing_txg; /* txg currently syncing */ + bpobj_t spa_deferred_bpobj; /* deferred-free bplist */ + bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ + zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ + /* checksum context templates */ + kmutex_t spa_cksum_tmpls_lock; + void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; + uberblock_t spa_ubsync; /* last synced uberblock */ + uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_extreme_rewind; /* rewind past deferred frees */ + kmutex_t spa_scrub_lock; /* resilver/scrub lock */ + uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ + + /* in-flight verification bytes */ + uint64_t spa_load_verify_bytes; + kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ + uint8_t spa_scrub_active; /* active or suspended? */ + uint8_t spa_scrub_type; /* type of scrub we're doing */ + uint8_t spa_scrub_finished; /* indicator to rotate logs */ + uint8_t spa_scrub_started; /* started since last boot */ + uint8_t spa_scrub_reopen; /* scrub doing vdev_reopen */ + uint64_t spa_scan_pass_start; /* start time per pass/reboot */ + uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ + uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ + uint64_t spa_scan_pass_exam; /* examined bytes per pass */ + uint64_t spa_scan_pass_issued; /* issued bytes per pass */ + + /* + * We are in the middle of a resilver, and another resilver + * is needed once this one completes. This is set iff any + * vdev_resilver_deferred is set. + */ + boolean_t spa_resilver_deferred; + kmutex_t spa_async_lock; /* protect async state */ + kthread_t *spa_async_thread; /* thread doing async task */ + int spa_async_suspended; /* async tasks suspended */ + kcondvar_t spa_async_cv; /* wait for thread_exit() */ + uint16_t spa_async_tasks; /* async task mask */ + uint64_t spa_missing_tvds; /* unopenable tvds on load */ + uint64_t spa_missing_tvds_allowed; /* allow loading spa? */ + + spa_removing_phys_t spa_removing_phys; + spa_vdev_removal_t *spa_vdev_removal; + + spa_condensing_indirect_phys_t spa_condensing_indirect_phys; + spa_condensing_indirect_t *spa_condensing_indirect; + zthr_t *spa_condense_zthr; /* zthr doing condense. 
*/ + + uint64_t spa_checkpoint_txg; /* the txg of the checkpoint */ + spa_checkpoint_info_t spa_checkpoint_info; /* checkpoint accounting */ + zthr_t *spa_checkpoint_discard_zthr; + + space_map_t *spa_syncing_log_sm; /* current log space map */ + avl_tree_t spa_sm_logs_by_txg; + kmutex_t spa_flushed_ms_lock; /* for metaslabs_by_flushed */ + avl_tree_t spa_metaslabs_by_flushed; + spa_unflushed_stats_t spa_unflushed_stats; + list_t spa_log_summary; + uint64_t spa_log_flushall_txg; + + zthr_t *spa_livelist_delete_zthr; /* deleting livelists */ + zthr_t *spa_livelist_condense_zthr; /* condensing livelists */ + uint64_t spa_livelists_to_delete; /* set of livelists to free */ + livelist_condense_entry_t spa_to_condense; /* next to condense */ + + char *spa_root; /* alternate root directory */ + uint64_t spa_ena; /* spa-wide ereport ENA */ + int spa_last_open_failed; /* error if last open failed */ + uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ + uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_txg; /* ub txg that loaded */ + uint64_t spa_load_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_meta_errors; /* verify metadata err count */ + uint64_t spa_load_data_errors; /* verify data err count */ + uint64_t spa_verify_min_txg; /* start txg of verify scrub */ + kmutex_t spa_errlog_lock; /* error log lock */ + uint64_t spa_errlog_last; /* last error log object */ + uint64_t spa_errlog_scrub; /* scrub error log object */ + kmutex_t spa_errlist_lock; /* error list/ereport lock */ + avl_tree_t spa_errlist_last; /* last error list */ + avl_tree_t spa_errlist_scrub; /* scrub error list */ + uint64_t spa_deflate; /* should we deflate? */ + uint64_t spa_history; /* history object */ + kmutex_t spa_history_lock; /* history lock */ + vdev_t *spa_pending_vdev; /* pending vdev additions */ + kmutex_t spa_props_lock; /* property lock */ + uint64_t spa_pool_props_object; /* object for properties */ + uint64_t spa_bootfs; /* default boot filesystem */ + uint64_t spa_failmode; /* failure mode for the pool */ + uint64_t spa_deadman_failmode; /* failure mode for deadman */ + uint64_t spa_delegation; /* delegation on/off */ + list_t spa_config_list; /* previous cache file(s) */ + /* per-CPU array of root of async I/O: */ + zio_t **spa_async_zio_root; + zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ + zio_t *spa_txg_zio[TXG_SIZE]; /* spa_sync() waits for this */ + kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ + kcondvar_t spa_suspend_cv; /* notification of resume */ + zio_suspend_reason_t spa_suspended; /* pool is suspended */ + uint8_t spa_claiming; /* pool is doing zil_claim() */ + boolean_t spa_is_root; /* pool is root */ + int spa_minref; /* num refs when first opened */ + spa_mode_t spa_mode; /* SPA_MODE_{READ|WRITE} */ + spa_log_state_t spa_log_state; /* log state */ + uint64_t spa_autoexpand; /* lun expansion on/off */ + ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ + uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_dspace; /* Cache get_dedup_dspace() */ + uint64_t spa_dedup_checksum; /* default dedup checksum */ + uint64_t spa_dspace; /* dspace in normal class */ + kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ + kmutex_t spa_proc_lock; /* protects spa_proc* */ + kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ + spa_proc_state_t spa_proc_state; /* see definition */ + proc_t *spa_proc; /* "zpool-poolname" process */ + uint64_t spa_did; /* if procp != p0, did of t1 */ + boolean_t 
spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ + uint64_t spa_creation_version; /* version at pool creation */ + uint64_t spa_prev_software_version; /* See ub_software_version */ + uint64_t spa_feat_for_write_obj; /* required to write to pool */ + uint64_t spa_feat_for_read_obj; /* required to read from pool */ + uint64_t spa_feat_desc_obj; /* Feature descriptions */ + uint64_t spa_feat_enabled_txg_obj; /* Feature enabled txg */ + kmutex_t spa_feat_stats_lock; /* protects spa_feat_stats */ + nvlist_t *spa_feat_stats; /* Cache of enabled features */ + /* cache feature refcounts */ + uint64_t spa_feat_refcount_cache[SPA_FEATURES]; + taskqid_t spa_deadman_tqid; /* Task id */ + uint64_t spa_deadman_calls; /* number of deadman calls */ + hrtime_t spa_sync_starttime; /* starting time of spa_sync */ + uint64_t spa_deadman_synctime; /* deadman sync expiration */ + uint64_t spa_deadman_ziotime; /* deadman zio expiration */ + uint64_t spa_all_vdev_zaps; /* ZAP of per-vd ZAP obj #s */ + spa_avz_action_t spa_avz_action; /* destroy/rebuild AVZ? */ + uint64_t spa_autotrim; /* automatic background trim? */ + uint64_t spa_errata; /* errata issues detected */ + spa_stats_t spa_stats; /* assorted spa statistics */ + spa_keystore_t spa_keystore; /* loaded crypto keys */ + + /* arc_memory_throttle() parameters during low memory condition */ + uint64_t spa_lowmem_page_load; /* memory load during txg */ + uint64_t spa_lowmem_last_txg; /* txg window start */ + + hrtime_t spa_ccw_fail_time; /* Conf cache write fail time */ + taskq_t *spa_zvol_taskq; /* Taskq for minor management */ + taskq_t *spa_prefetch_taskq; /* Taskq for prefetch threads */ + uint64_t spa_multihost; /* multihost aware (mmp) */ + mmp_thread_t spa_mmp; /* multihost mmp thread */ + list_t spa_leaf_list; /* list of leaf vdevs */ + uint64_t spa_leaf_list_gen; /* track leaf_list changes */ + uint32_t spa_hostid; /* cached system hostid */ + + /* synchronization for threads in spa_wait */ + kmutex_t spa_activities_lock; + kcondvar_t spa_activities_cv; + kcondvar_t spa_waiters_cv; + int spa_waiters; /* number of waiting threads */ + boolean_t spa_waiters_cancel; /* waiters should return */ + + /* + * spa_refcount & spa_config_lock must be the last elements + * because zfs_refcount_t changes size based on compilation options. + * In order for the MDB module to function correctly, the other + * fields must remain in the same location. 
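+ *
+ * Added illustration (not upstream text): a debugger module can
+ * compute, say, offsetof(spa_t, spa_mmp) and have it agree with the
+ * running kernel regardless of the compile options that change the
+ * size of zfs_refcount_t, precisely because every
+ * zfs_refcount_t-bearing field is kept at the tail of the structure.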
+ */ + spa_config_lock_t spa_config_lock[SCL_LOCKS]; /* config changes */ + zfs_refcount_t spa_refcount; /* number of opens */ + + taskq_t *spa_upgrade_taskq; /* taskq for upgrade jobs */ +}; + +extern char *spa_config_path; +extern char *zfs_deadman_failmode; +extern int spa_slop_shift; +extern void spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent); +extern void spa_taskq_dispatch_sync(spa_t *, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags); +extern void spa_load_spares(spa_t *spa); +extern void spa_load_l2cache(spa_t *spa); +extern sysevent_t *spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, + const char *name); +extern void spa_event_post(sysevent_t *ev); +extern int param_set_deadman_failmode_common(const char *val); +extern void spa_set_deadman_synctime(hrtime_t ns); +extern void spa_set_deadman_ziotime(hrtime_t ns); +extern const char *spa_history_zone(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPA_IMPL_H */ diff --git a/include/sys/spa_log_spacemap.h b/include/sys/spa_log_spacemap.h new file mode 100644 index 000000000000..b2ed77fac3e4 --- /dev/null +++ b/include/sys/spa_log_spacemap.h @@ -0,0 +1,79 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_SPA_LOG_SPACEMAP_H +#define _SYS_SPA_LOG_SPACEMAP_H + +#include + +typedef struct log_summary_entry { + uint64_t lse_start; /* start TXG */ + uint64_t lse_mscount; /* # of metaslabs needed to be flushed */ + uint64_t lse_blkcount; /* blocks held by this entry */ + list_node_t lse_node; +} log_summary_entry_t; + +typedef struct spa_unflushed_stats { + /* used for memory heuristic */ + uint64_t sus_memused; /* current memory used for unflushed trees */ + + /* used for block heuristic */ + uint64_t sus_blocklimit; /* max # of log blocks allowed */ + uint64_t sus_nblocks; /* # of blocks in log space maps currently */ +} spa_unflushed_stats_t; + +typedef struct spa_log_sm { + uint64_t sls_sm_obj; /* space map object ID */ + uint64_t sls_txg; /* txg logged on the space map */ + uint64_t sls_nblocks; /* number of blocks in this log */ + uint64_t sls_mscount; /* # of metaslabs flushed in the log's txg */ + avl_node_t sls_node; /* node in spa_sm_logs_by_txg */ +} spa_log_sm_t; + +int spa_ld_log_spacemaps(spa_t *); + +void spa_generate_syncing_log_sm(spa_t *, dmu_tx_t *); +void spa_flush_metaslabs(spa_t *, dmu_tx_t *); +void spa_sync_close_syncing_log_sm(spa_t *); + +void spa_cleanup_old_sm_logs(spa_t *, dmu_tx_t *); + +uint64_t spa_log_sm_blocklimit(spa_t *); +void spa_log_sm_set_blocklimit(spa_t *); +uint64_t spa_log_sm_nblocks(spa_t *); +uint64_t spa_log_sm_memused(spa_t *); + +void spa_log_sm_decrement_mscount(spa_t *, uint64_t); +void spa_log_sm_increment_current_mscount(spa_t *); + +void spa_log_summary_add_flushed_metaslab(spa_t *); +void spa_log_summary_decrement_mscount(spa_t *, uint64_t); +void spa_log_summary_decrement_blkcount(spa_t *, uint64_t); + +boolean_t spa_flush_all_logs_requested(spa_t *); + +extern int zfs_keep_log_spacemaps_at_export; + +#endif /* _SYS_SPA_LOG_SPACEMAP_H */ diff --git a/include/sys/space_map.h b/include/sys/space_map.h new file mode 100644 index 000000000000..cb81e710bd1e --- /dev/null +++ b/include/sys/space_map.h @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ + +#ifndef _SYS_SPACE_MAP_H +#define _SYS_SPACE_MAP_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The size of the space map object has increased to include a histogram. + * The SPACE_MAP_SIZE_V0 designates the original size and is used to + * maintain backward compatibility. + */ +#define SPACE_MAP_SIZE_V0 (3 * sizeof (uint64_t)) +#define SPACE_MAP_HISTOGRAM_SIZE 32 + +/* + * The space_map_phys is the on-disk representation of the space map. 
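+ * (Added note, not upstream text: SPACE_MAP_SIZE_V0 above works out to
+ * 24 bytes, which is exactly the three 8-byte fields smp_object,
+ * smp_length, and smp_alloc below; smp_pad and the histogram are the
+ * post-V0 extension.)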
+ * Consumers of space maps should never reference any of the members of this + * structure directly. These members may only be updated in syncing context. + * + * Note the smp_object is no longer used but remains in the structure + * for backward compatibility. + */ +typedef struct space_map_phys { + /* object number: not needed but kept for backwards compatibility */ + uint64_t smp_object; + + /* length of the object in bytes */ + uint64_t smp_length; + + /* space allocated from the map */ + int64_t smp_alloc; + + /* reserved */ + uint64_t smp_pad[5]; + + /* + * The smp_histogram maintains a histogram of free regions. Each + * bucket, smp_histogram[i], contains the number of free regions + * whose size is: + * 2^(i+sm_shift) <= size of free region in bytes < 2^(i+sm_shift+1) + * + * Note that, if log space map feature is enabled, histograms of + * space maps that belong to metaslabs will take into account any + * unflushed changes for their metaslabs, even though the actual + * space map doesn't have entries for these changes. + */ + uint64_t smp_histogram[SPACE_MAP_HISTOGRAM_SIZE]; +} space_map_phys_t; + +/* + * The space map object defines a region of space, its size, how much is + * allocated, and the on-disk object that stores this information. + * Consumers of space maps may only access the members of this structure. + * + * Note: the space_map may not be accessed concurrently; consumers + * must provide external locking if required. + */ +typedef struct space_map { + uint64_t sm_start; /* start of map */ + uint64_t sm_size; /* size of map */ + uint8_t sm_shift; /* unit shift */ + objset_t *sm_os; /* objset for this map */ + uint64_t sm_object; /* object id for this map */ + uint32_t sm_blksz; /* block size for space map */ + dmu_buf_t *sm_dbuf; /* space_map_phys_t dbuf */ + space_map_phys_t *sm_phys; /* on-disk space map */ +} space_map_t; + +/* + * debug entry + * + * 2 2 10 50 + * +-----+-----+------------+----------------------------------+ + * | 1 0 | act | syncpass | txg (lower bits) | + * +-----+-----+------------+----------------------------------+ + * 63 62 61 60 59 50 49 0 + * + * + * one-word entry + * + * 1 47 1 15 + * +-----------------------------------------------------------+ + * | 0 | offset (sm_shift units) | type | run | + * +-----------------------------------------------------------+ + * 63 62 16 15 14 0 + * + * + * two-word entry + * + * 2 2 36 24 + * +-----+-----+---------------------------+-------------------+ + * | 1 1 | pad | run | vdev | + * +-----+-----+---------------------------+-------------------+ + * 63 62 61 60 59 24 23 0 + * + * 1 63 + * +------+----------------------------------------------------+ + * | type | offset | + * +------+----------------------------------------------------+ + * 63 62 0 + * + * Note that a two-word entry will not straddle a block boundary. + * If necessary, the last word of a block will be padded with a + * debug entry (with act = syncpass = txg = 0). + */ + +typedef enum { + SM_ALLOC, + SM_FREE +} maptype_t; + +typedef struct space_map_entry { + maptype_t sme_type; + uint32_t sme_vdev; /* max is 2^24-1; SM_NO_VDEVID if not present */ + uint64_t sme_offset; /* max is 2^63-1; units of sm_shift */ + uint64_t sme_run; /* max is 2^36; units of sm_shift */ + + /* + * The following fields are not part of the actual space map entry + * on-disk and they are populated with the values from the debug + * entry most recently visited starting from the beginning to the + * end of the space map. 
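+ *
+ * As an added illustration (not upstream text): while
+ * space_map_iterate() walks a map, a debug entry recording txg 100 at
+ * sync pass 2 causes every subsequent entry to be reported with
+ * sme_txg == 100 and sme_sync_pass == 2, until the next debug entry
+ * is decoded.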
+ */ + uint64_t sme_txg; + uint64_t sme_sync_pass; +} space_map_entry_t; + +#define SM_NO_VDEVID (1 << SPA_VDEVBITS) + +/* one-word entry constants */ +#define SM_DEBUG_PREFIX 2 +#define SM_OFFSET_BITS 47 +#define SM_RUN_BITS 15 + +/* two-word entry constants */ +#define SM2_PREFIX 3 +#define SM2_OFFSET_BITS 63 +#define SM2_RUN_BITS 36 + +#define SM_PREFIX_DECODE(x) BF64_DECODE(x, 62, 2) +#define SM_PREFIX_ENCODE(x) BF64_ENCODE(x, 62, 2) + +#define SM_DEBUG_ACTION_DECODE(x) BF64_DECODE(x, 60, 2) +#define SM_DEBUG_ACTION_ENCODE(x) BF64_ENCODE(x, 60, 2) +#define SM_DEBUG_SYNCPASS_DECODE(x) BF64_DECODE(x, 50, 10) +#define SM_DEBUG_SYNCPASS_ENCODE(x) BF64_ENCODE(x, 50, 10) +#define SM_DEBUG_TXG_DECODE(x) BF64_DECODE(x, 0, 50) +#define SM_DEBUG_TXG_ENCODE(x) BF64_ENCODE(x, 0, 50) + +#define SM_OFFSET_DECODE(x) BF64_DECODE(x, 16, SM_OFFSET_BITS) +#define SM_OFFSET_ENCODE(x) BF64_ENCODE(x, 16, SM_OFFSET_BITS) +#define SM_TYPE_DECODE(x) BF64_DECODE(x, 15, 1) +#define SM_TYPE_ENCODE(x) BF64_ENCODE(x, 15, 1) +#define SM_RUN_DECODE(x) (BF64_DECODE(x, 0, SM_RUN_BITS) + 1) +#define SM_RUN_ENCODE(x) BF64_ENCODE((x) - 1, 0, SM_RUN_BITS) +#define SM_RUN_MAX SM_RUN_DECODE(~0ULL) +#define SM_OFFSET_MAX SM_OFFSET_DECODE(~0ULL) + +#define SM2_RUN_DECODE(x) (BF64_DECODE(x, SPA_VDEVBITS, SM2_RUN_BITS) + 1) +#define SM2_RUN_ENCODE(x) BF64_ENCODE((x) - 1, SPA_VDEVBITS, SM2_RUN_BITS) +#define SM2_VDEV_DECODE(x) BF64_DECODE(x, 0, SPA_VDEVBITS) +#define SM2_VDEV_ENCODE(x) BF64_ENCODE(x, 0, SPA_VDEVBITS) +#define SM2_TYPE_DECODE(x) BF64_DECODE(x, SM2_OFFSET_BITS, 1) +#define SM2_TYPE_ENCODE(x) BF64_ENCODE(x, SM2_OFFSET_BITS, 1) +#define SM2_OFFSET_DECODE(x) BF64_DECODE(x, 0, SM2_OFFSET_BITS) +#define SM2_OFFSET_ENCODE(x) BF64_ENCODE(x, 0, SM2_OFFSET_BITS) +#define SM2_RUN_MAX SM2_RUN_DECODE(~0ULL) +#define SM2_OFFSET_MAX SM2_OFFSET_DECODE(~0ULL) + +boolean_t sm_entry_is_debug(uint64_t e); +boolean_t sm_entry_is_single_word(uint64_t e); +boolean_t sm_entry_is_double_word(uint64_t e); + +typedef int (*sm_cb_t)(space_map_entry_t *sme, void *arg); + +int space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype); +int space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length); +int space_map_iterate(space_map_t *sm, uint64_t length, + sm_cb_t callback, void *arg); +int space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx); + +boolean_t space_map_histogram_verify(space_map_t *sm, range_tree_t *rt); +void space_map_histogram_clear(space_map_t *sm); +void space_map_histogram_add(space_map_t *sm, range_tree_t *rt, + dmu_tx_t *tx); + +uint64_t space_map_object(space_map_t *sm); +int64_t space_map_allocated(space_map_t *sm); +uint64_t space_map_length(space_map_t *sm); +uint64_t space_map_entries(space_map_t *sm, range_tree_t *rt); +uint64_t space_map_nblocks(space_map_t *sm); + +void space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx); +uint64_t space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id); +void space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx); +uint64_t space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx); +void space_map_free(space_map_t *sm, dmu_tx_t *tx); +void space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx); + +int space_map_open(space_map_t **smp, objset_t *os, uint64_t object, + uint64_t start, uint64_t size, uint8_t shift); +void space_map_close(space_map_t *sm); + +#ifdef __cplusplus +} +#endif + +#endif 
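+
+/*
+ * Added illustration (not part of the upstream header): the one-word
+ * entry encoding above can be sanity-checked with a small round-trip.
+ * A run is stored biased by one, so SM_RUN_ENCODE(16) stores 15, and a
+ * one-word entry leaves the two prefix bits clear:
+ *
+ *	uint64_t e = SM_OFFSET_ENCODE(1000) |
+ *	    SM_TYPE_ENCODE(SM_ALLOC) | SM_RUN_ENCODE(16);
+ *
+ *	ASSERT3U(SM_PREFIX_DECODE(e), !=, SM_DEBUG_PREFIX);
+ *	ASSERT3U(SM_OFFSET_DECODE(e), ==, 1000);
+ *	ASSERT3U(SM_TYPE_DECODE(e), ==, SM_ALLOC);
+ *	ASSERT3U(SM_RUN_DECODE(e), ==, 16);
+ *
+ * Similarly, with sm_shift = 9 the histogram bucket smp_histogram[0]
+ * counts free regions of 512..1023 bytes.
+ */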
/* _SYS_SPACE_MAP_H */ diff --git a/include/sys/space_reftree.h b/include/sys/space_reftree.h new file mode 100644 index 000000000000..ca9d41dc1388 --- /dev/null +++ b/include/sys/space_reftree.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_SPACE_REFTREE_H +#define _SYS_SPACE_REFTREE_H + +#include <sys/avl.h> +#include <sys/range_tree.h> +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct space_ref { + avl_node_t sr_node; /* AVL node */ + uint64_t sr_offset; /* range offset (start or end) */ + int64_t sr_refcnt; /* associated reference count */ +} space_ref_t; + +void space_reftree_create(avl_tree_t *t); +void space_reftree_destroy(avl_tree_t *t); +void space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt); +void space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt); +void space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, + int64_t minref); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SPACE_REFTREE_H */ diff --git a/include/sys/sysevent.h b/include/sys/sysevent.h new file mode 100644 index 000000000000..6510297d601f --- /dev/null +++ b/include/sys/sysevent.h @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms.
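The space_ref_t layout above encodes each segment as two points: +refcnt at its start offset and -refcnt at its end, so space_reftree_generate_map() can walk the tree in offset order and emit every range whose running count reaches minref. A user-space sketch of that sweep, with a sorted array standing in for the AVL tree (illustrative only, not the kernel code):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct {
	uint64_t offset;	/* start or end of a segment */
	int64_t delta;		/* +refcnt at start, -refcnt at end */
} ref_point_t;

static int
ref_point_cmp(const void *a, const void *b)
{
	const ref_point_t *x = a, *y = b;
	return ((x->offset > y->offset) - (x->offset < y->offset));
}

int
main(void)
{
	/* Two overlapping segments: [100, 300) and [200, 400). */
	ref_point_t pts[] = {
		{ 100, +1 }, { 300, -1 },
		{ 200, +1 }, { 400, -1 },
	};
	int n = sizeof (pts) / sizeof (pts[0]);
	int64_t minref = 2, refcnt = 0;
	uint64_t start = 0;

	qsort(pts, n, sizeof (pts[0]), ref_point_cmp);
	for (int i = 0; i < n; i++) {
		int64_t prev = refcnt;
		refcnt += pts[i].delta;
		if (prev < minref && refcnt >= minref)
			start = pts[i].offset;		/* range opens */
		else if (prev >= minref && refcnt < minref)
			printf("[%llu, %llu)\n",	/* range closes */
			    (unsigned long long)start,
			    (unsigned long long)pts[i].offset);
	}
	return (0);	/* prints [200, 300), the doubly-referenced overlap */
}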
+ */ + +#ifndef _SYS_SYSEVENT_H +#define _SYS_SYSEVENT_H + +#include <sys/nvpair.h> + +typedef struct sysevent { + nvlist_t *resource; +} sysevent_t; + +#endif diff --git a/include/sys/sysevent/Makefile.am b/include/sys/sysevent/Makefile.am new file mode 100644 index 000000000000..64e53763951a --- /dev/null +++ b/include/sys/sysevent/Makefile.am @@ -0,0 +1,15 @@ +COMMON_H = \ + eventdefs.h \ + dev.h + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys/sysevent +libzfs_HEADERS = $(COMMON_H) +endif + +if CONFIG_KERNEL +if BUILD_LINUX +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/sysevent +kernel_HEADERS = $(COMMON_H) +endif +endif diff --git a/include/sys/sysevent/dev.h b/include/sys/sysevent/dev.h new file mode 100644 index 000000000000..1117538d822d --- /dev/null +++ b/include/sys/sysevent/dev.h @@ -0,0 +1,261 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SYSEVENT_DEV_H +#define _SYS_SYSEVENT_DEV_H + +#include <sys/sysevent/eventdefs.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Event schema for EC_DEV_ADD/ESC_DISK + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_DISK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name to the raw device. + * The name does not include the slice number component. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_ADD/ESC_NETWORK + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_NETWORK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name associated with the device if it exists. + * /dev name associated with the driver for DLPI + * Style-2 only drivers. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix.
+ * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_ADD/ESC_PRINTER + * + * Event Class - EC_DEV_ADD + * Event Sub-Class - ESC_PRINTER + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev/printers name associated with the device + * if it exists. + * /dev name associated with the device if it exists + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * Attribute Name - DEV_PROP_PREFIX + * Attribute Type - data type of the devinfo_node_property + * Attribute Value - value of the devinfo_node_property + * + * + * Event schema for EC_DEV_REMOVE/ESC_DISK + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_DISK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name to the raw device. + * The name does not include the slice number component. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_REMOVE/ESC_NETWORK + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_NETWORK + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev name associated with the device if it exists. + * /dev name associated with the driver for DLPI + * Style-2 only drivers. + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_REMOVE/ESC_PRINTER + * + * Event Class - EC_DEV_REMOVE + * Event Sub-Class - ESC_PRINTER + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - /dev/printers name associated with the device + * if it exists.
+ * /dev name associated with the device if it exists + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path of the device without the "/devices" + * prefix. + * + * Attribute Name - DEV_DRIVER_NAME + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - driver name + * + * Attribute Name - DEV_INSTANCE + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - driver instance number + * + * + * Event schema for EC_DEV_BRANCH/ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE + * + * Event Class - EC_DEV_BRANCH + * Event Sub-Class - ESC_DEV_BRANCH_ADD or ESC_DEV_BRANCH_REMOVE + * + * Attribute Name - EV_VERSION + * Attribute Type - DATA_TYPE_INT32 + * Attribute Value - event version number + * + * Attribute Name - DEV_PHYS_PATH + * Attribute Type - DATA_TYPE_STRING + * Attribute Value - physical path to the root node of the device subtree + * without the "/devices" prefix. + */ + +#define EV_VERSION "version" +#define DEV_PHYS_PATH "phys_path" +#define DEV_NAME "dev_name" +#define DEV_DRIVER_NAME "driver_name" +#define DEV_INSTANCE "instance" +#define DEV_PROP_PREFIX "prop-" + +#ifdef __linux__ +#define DEV_IDENTIFIER "devid" +#define DEV_PATH "path" +#define DEV_IS_PART "is_slice" +#define DEV_SIZE "dev_size" +#endif /* __linux__ */ + +#define EV_V1 1 + +/* maximum number of devinfo node properties added to the event */ +#define MAX_PROP_COUNT 100 + +/* only properties with size less than PROP_LEN_LIMIT are added to the event */ +#define PROP_LEN_LIMIT 1024 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_DEV_H */ diff --git a/include/sys/sysevent/eventdefs.h b/include/sys/sysevent/eventdefs.h new file mode 100644 index 000000000000..2067b355afb4 --- /dev/null +++ b/include/sys/sysevent/eventdefs.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. + */ + +#ifndef _SYS_SYSEVENT_EVENTDEFS_H +#define _SYS_SYSEVENT_EVENTDEFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * eventdefs.h contains public definitions for sysevent types (classes + * and subclasses). All additions/removals/changes are subject + * to PSARC approval.
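As a concrete reading of the schema above, the sketch below assembles the attribute list an EC_DEV_ADD/ESC_DISK event would carry, using the standard libnvpair calls (the headers are installed for userland consumers per the Makefile.am above). The device name, physical path, and driver strings are invented examples, and error handling is abbreviated:

#include <libnvpair.h>
#include <sys/sysevent/dev.h>

/* Build the attribute list an EC_DEV_ADD/ESC_DISK event would carry. */
static nvlist_t *
make_disk_add_payload(void)
{
	nvlist_t *nvl = NULL;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0)
		return (NULL);
	(void) nvlist_add_int32(nvl, EV_VERSION, EV_V1);
	(void) nvlist_add_string(nvl, DEV_NAME, "/dev/sda");	/* example */
	(void) nvlist_add_string(nvl, DEV_PHYS_PATH,
	    "/pci@0,0/pci1000,3060@3/sd@1,0");			/* example */
	(void) nvlist_add_string(nvl, DEV_DRIVER_NAME, "sd");
	(void) nvlist_add_int32(nvl, DEV_INSTANCE, 1);
	return (nvl);	/* caller frees with nvlist_free() */
}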
+ */ + +/* Sysevent Class definitions */ +#define EC_NONE "EC_none" +#define EC_PRIV "EC_priv" +#define EC_PLATFORM "EC_platform" /* events private to platform */ +#define EC_DR "EC_dr" /* Dynamic reconfiguration event class */ +#define EC_ENV "EC_env" /* Environmental monitor event class */ +#define EC_DOMAIN "EC_domain" /* Domain event class */ +#define EC_AP_DRIVER "EC_ap_driver" /* Alternate Pathing event class */ +#define EC_IPMP "EC_ipmp" /* IP Multipathing event class */ +#define EC_DEV_ADD "EC_dev_add" /* device add event class */ +#define EC_DEV_REMOVE "EC_dev_remove" /* device remove event class */ +#define EC_DEV_BRANCH "EC_dev_branch" /* device tree branch event class */ +#define EC_DEV_STATUS "EC_dev_status" /* device status event class */ +#define EC_FM "EC_fm" /* FMA error report event */ +#define EC_ZFS "EC_zfs" /* ZFS event */ +#define EC_DATALINK "EC_datalink" /* datalink event */ +#define EC_VRRP "EC_vrrp" /* VRRP event */ + +/* + * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes + * (name/value pairs) are found in sys/sysevent/dev.h + */ +#define ESC_DISK "disk" /* disk device */ +#define ESC_NETWORK "network" /* network interface */ +#define ESC_PRINTER "printer" /* printer device */ +#define ESC_LOFI "lofi" /* lofi device */ + +/* + * EC_DEV_BRANCH subclass definitions - supporting attributes (name/value pairs) + * are found in sys/sysevent/dev.h + */ + +/* device tree branch added */ +#define ESC_DEV_BRANCH_ADD "dev_branch_add" + +/* device tree branch removed */ +#define ESC_DEV_BRANCH_REMOVE "dev_branch_remove" + +/* + * EC_DEV_STATUS subclass definitions + * + * device capacity dynamically changed + */ +#define ESC_DEV_DLE "dev_dle" + +/* LUN has received an eject request from the user */ +#define ESC_DEV_EJECT_REQUEST "dev_eject_request" + +/* FMA Fault and Error event protocol subclass */ +#define ESC_FM_ERROR "error" +#define ESC_FM_ERROR_REPLAY "error_replay" + +/* + * ZFS subclass definitions. Supporting attributes (name/value pairs) are found + * in sys/fs/zfs.h + */ +#define ESC_ZFS_RESILVER_START "resilver_start" +#define ESC_ZFS_RESILVER_FINISH "resilver_finish" +#define ESC_ZFS_VDEV_REMOVE "vdev_remove" +#define ESC_ZFS_VDEV_REMOVE_AUX "vdev_remove_aux" +#define ESC_ZFS_VDEV_REMOVE_DEV "vdev_remove_dev" +#define ESC_ZFS_POOL_CREATE "pool_create" +#define ESC_ZFS_POOL_DESTROY "pool_destroy" +#define ESC_ZFS_POOL_IMPORT "pool_import" +#define ESC_ZFS_POOL_EXPORT "pool_export" +#define ESC_ZFS_VDEV_ADD "vdev_add" +#define ESC_ZFS_VDEV_ATTACH "vdev_attach" +#define ESC_ZFS_VDEV_CLEAR "vdev_clear" +#define ESC_ZFS_VDEV_CHECK "vdev_check" +#define ESC_ZFS_VDEV_ONLINE "vdev_online" +#define ESC_ZFS_CONFIG_SYNC "config_sync" +#define ESC_ZFS_SCRUB_START "scrub_start" +#define ESC_ZFS_SCRUB_FINISH "scrub_finish" +#define ESC_ZFS_SCRUB_ABORT "scrub_abort" +#define ESC_ZFS_SCRUB_RESUME "scrub_resume" +#define ESC_ZFS_SCRUB_PAUSED "scrub_paused" +#define ESC_ZFS_VDEV_SPARE "vdev_spare" +#define ESC_ZFS_VDEV_AUTOEXPAND "vdev_autoexpand" +#define ESC_ZFS_BOOTFS_VDEV_ATTACH "bootfs_vdev_attach" +#define ESC_ZFS_POOL_REGUID "pool_reguid" +#define ESC_ZFS_HISTORY_EVENT "history_event" +#define ESC_ZFS_TRIM_START "trim_start" +#define ESC_ZFS_TRIM_FINISH "trim_finish" +#define ESC_ZFS_TRIM_CANCEL "trim_cancel" +#define ESC_ZFS_TRIM_RESUME "trim_resume" +#define ESC_ZFS_TRIM_SUSPEND "trim_suspend" + +/* + * datalink subclass definitions.
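Consumers key off the (class, subclass) string pair; for example, a minimal matcher for the scrub-completion event defined above could look like this (an illustrative sketch, not code taken from zed):

#include <stdio.h>
#include <string.h>
#include <sys/sysevent/eventdefs.h>

/* Return nonzero if an event is EC_zfs/scrub_finish. */
static int
is_scrub_finish(const char *class, const char *subclass)
{
	return (strcmp(class, EC_ZFS) == 0 &&
	    strcmp(subclass, ESC_ZFS_SCRUB_FINISH) == 0);
}

int
main(void)
{
	if (is_scrub_finish(EC_ZFS, ESC_ZFS_SCRUB_FINISH))
		printf("scrub finished\n");
	return (0);
}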
+ */ +#define ESC_DATALINK_PHYS_ADD "datalink_phys_add" /* new physical link */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SYSEVENT_EVENTDEFS_H */ diff --git a/include/sys/txg.h b/include/sys/txg.h new file mode 100644 index 000000000000..260a3b43cfe8 --- /dev/null +++ b/include/sys/txg.h @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_TXG_H +#define _SYS_TXG_H + +#include <sys/spa.h> +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define TXG_CONCURRENT_STATES 3 /* open, quiescing, syncing */ +#define TXG_SIZE 4 /* next power of 2 */ +#define TXG_MASK (TXG_SIZE - 1) /* mask for size */ +#define TXG_INITIAL TXG_SIZE /* initial txg */ +#define TXG_IDX (txg & TXG_MASK) + +/* Number of txgs worth of frees we defer adding to in-core spacemaps */ +#define TXG_DEFER_SIZE 2 + +typedef struct tx_cpu tx_cpu_t; + +typedef struct txg_handle { + tx_cpu_t *th_cpu; + uint64_t th_txg; +} txg_handle_t; + +typedef struct txg_node { + struct txg_node *tn_next[TXG_SIZE]; + uint8_t tn_member[TXG_SIZE]; +} txg_node_t; + +typedef struct txg_list { + kmutex_t tl_lock; + size_t tl_offset; + spa_t *tl_spa; + txg_node_t *tl_head[TXG_SIZE]; +} txg_list_t; + +struct dsl_pool; + +extern void txg_init(struct dsl_pool *dp, uint64_t txg); +extern void txg_fini(struct dsl_pool *dp); +extern void txg_sync_start(struct dsl_pool *dp); +extern void txg_sync_stop(struct dsl_pool *dp); +extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); +extern void txg_rele_to_quiesce(txg_handle_t *txghp); +extern void txg_rele_to_sync(txg_handle_t *txghp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); + +extern void txg_delay(struct dsl_pool *dp, uint64_t txg, hrtime_t delta, + hrtime_t resolution); +extern void txg_kick(struct dsl_pool *dp); + +/* + * Wait until the given transaction group has finished syncing. + * Try to make this happen as soon as possible (e.g. kick off any + * necessary syncs immediately). If txg==0, wait for the currently open + * txg to finish syncing. + */ +extern void txg_wait_synced(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait as above. Returns true if the thread was signaled while waiting. + */ +extern boolean_t txg_wait_synced_sig(struct dsl_pool *dp, uint64_t txg); + +/* + * Wait until the given transaction group, or one after it, is + * the open transaction group. Try to make this happen as soon + * as possible (e.g. kick off any necessary syncs immediately) when + * should_quiesce is set. If txg == 0, wait for the next open txg.
+ */ +extern void txg_wait_open(struct dsl_pool *dp, uint64_t txg, + boolean_t should_quiesce); + +/* + * Returns TRUE if we are "backed up" waiting for the syncing + * transaction to complete; otherwise returns FALSE. + */ +extern boolean_t txg_stalled(struct dsl_pool *dp); + +/* returns TRUE if someone is waiting for the next txg to sync */ +extern boolean_t txg_sync_waiting(struct dsl_pool *dp); + +extern void txg_verify(spa_t *spa, uint64_t txg); + +/* + * Wait for pending commit callbacks of already-synced transactions to finish + * processing. + */ +extern void txg_wait_callbacks(struct dsl_pool *dp); + +/* + * Per-txg object lists. + */ + +#define TXG_CLEAN(txg) ((txg) - 1) + +extern void txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset); +extern void txg_list_destroy(txg_list_t *tl); +extern boolean_t txg_list_empty(txg_list_t *tl, uint64_t txg); +extern boolean_t txg_all_lists_empty(txg_list_t *tl); +extern boolean_t txg_list_add(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_remove(txg_list_t *tl, uint64_t txg); +extern void *txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg); +extern boolean_t txg_list_member(txg_list_t *tl, void *p, uint64_t txg); +extern void *txg_list_head(txg_list_t *tl, uint64_t txg); +extern void *txg_list_next(txg_list_t *tl, void *p, uint64_t txg); + +/* Global tuning */ +extern int zfs_txg_timeout; + + +#ifdef ZFS_DEBUG +#define TXG_VERIFY(spa, txg) txg_verify(spa, txg) +#else +#define TXG_VERIFY(spa, txg) +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_H */ diff --git a/include/sys/txg_impl.h b/include/sys/txg_impl.h new file mode 100644 index 000000000000..047d51b94c66 --- /dev/null +++ b/include/sys/txg_impl.h @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_TXG_IMPL_H +#define _SYS_TXG_IMPL_H + +#include <sys/spa.h> +#include <sys/txg.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The tx_cpu structure is a per-cpu structure that is used to track + * the number of active transaction holds (tc_count). As transactions + * are assigned into a transaction group the appropriate tc_count is + * incremented to indicate that there are pending changes that have yet + * to quiesce. Consumers eventually call txg_rele_to_sync() to decrement + * the tc_count. A transaction group is not considered quiesced until all + * tx_cpu structures have reached a tc_count of zero. + * + * This structure is a per-cpu structure by design.
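Because TXG_SIZE is 4 while only TXG_CONCURRENT_STATES (3) txgs are ever in flight, indexing any per-txg array with (txg & TXG_MASK) gives the open, quiescing, and syncing txgs distinct slots. A stand-alone demo of that ring arithmetic:

#include <stdio.h>
#include <stdint.h>

#define TXG_SIZE 4
#define TXG_MASK (TXG_SIZE - 1)

int
main(void)
{
	/*
	 * Any three consecutive txgs (open, quiescing, syncing) land in
	 * three distinct slots of a TXG_SIZE-element per-txg array.
	 */
	for (uint64_t txg = 10; txg < 14; txg++)
		printf("txg %llu -> slot %llu\n",
		    (unsigned long long)txg,
		    (unsigned long long)(txg & TXG_MASK));
	return (0);
}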
Updates to this structure + * are frequent and concurrent. Having a single structure would result in + * heavy lock contention, so a per-cpu design was implemented. With the + * fanned-out mutex design, consumers only need to lock the mutex associated + * with the thread's cpu. + * + * The tx_cpu contains two locks, the tc_lock and tc_open_lock. + * The tc_lock is used to protect all members of the tx_cpu structure with + * the exception of the tc_open_lock. This lock should only be held for a + * short period of time, typically when updating the value of tc_count. + * + * The tc_open_lock protects the tx_open_txg member of the tx_state structure. + * This lock is used to ensure that transactions are only assigned into + * the current open transaction group. In order to move the current open + * transaction group to the quiesce phase, the txg_quiesce thread must + * grab all tc_open_locks, increment the tx_open_txg, and drop the locks. + * The tc_open_lock is held until the transaction is assigned into the + * transaction group. Typically, this is a short operation, but if throttling + * is occurring it may be held for longer periods of time. + */ +struct tx_cpu { + kmutex_t tc_open_lock; /* protects tx_open_txg */ + kmutex_t tc_lock; /* protects the rest of this struct */ + kcondvar_t tc_cv[TXG_SIZE]; + uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */ + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ + char tc_pad[8]; /* pad to fill 3 cache lines */ +}; + +/* + * The tx_state structure maintains the state information about the different + * stages of the pool's transaction groups. A per-pool tx_state structure + * is used to track this information. The tx_state structure also points to + * an array of tx_cpu structures (described above). Although the tx_sync_lock + * is used to protect the members of this structure, it is not used to + * protect the tx_open_txg. Instead, a special lock in the tx_cpu structure + * is used. Readers of tx_open_txg must grab the per-cpu tc_open_lock. + * Any thread wishing to update tx_open_txg must grab the tc_open_lock on + * every cpu (see txg_quiesce()).
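A sketch of the hand-off just described, written against the tx_state_t defined below (illustrative pseudo-kernel code, not the actual txg_quiesce(); ncpus stands in for however the real code iterates the tx_cpu array):

static void
txg_quiesce_sketch(tx_state_t *tx, int ncpus)
{
	int c;

	/* Block new hold attempts on the currently open txg. */
	for (c = 0; c < ncpus; c++)
		mutex_enter(&tx->tx_cpu[c].tc_open_lock);

	tx->tx_open_txg++;	/* the next txg is now the open one */

	for (c = 0; c < ncpus; c++)
		mutex_exit(&tx->tx_cpu[c].tc_open_lock);

	/*
	 * Existing holders drain on their own: each txg_rele_to_sync()
	 * drops a tc_count, and the txg is quiesced once every
	 * tc_count[txg & TXG_MASK] reaches zero.
	 */
}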
+ */ +typedef struct tx_state { + tx_cpu_t *tx_cpu; /* protects access to tx_open_txg */ + kmutex_t tx_sync_lock; /* protects the rest of this struct */ + + uint64_t tx_open_txg; /* currently open txg id */ + uint64_t tx_quiescing_txg; /* currently quiescing txg id */ + uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ + uint64_t tx_syncing_txg; /* currently syncing txg id */ + uint64_t tx_synced_txg; /* last synced txg id */ + + hrtime_t tx_open_time; /* start time of tx_open_txg */ + + uint64_t tx_sync_txg_waiting; /* txg we're waiting to sync */ + uint64_t tx_quiesce_txg_waiting; /* txg we're waiting to open */ + + kcondvar_t tx_sync_more_cv; + kcondvar_t tx_sync_done_cv; + kcondvar_t tx_quiesce_more_cv; + kcondvar_t tx_quiesce_done_cv; + kcondvar_t tx_timeout_cv; + kcondvar_t tx_exit_cv; /* wait for all threads to exit */ + + uint8_t tx_threads; /* number of threads */ + uint8_t tx_exiting; /* set when we're exiting */ + + kthread_t *tx_sync_thread; + kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ +} tx_state_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TXG_IMPL_H */ diff --git a/include/sys/u8_textprep.h b/include/sys/u8_textprep.h new file mode 100644 index 000000000000..f8b5bed6e420 --- /dev/null +++ b/include/sys/u8_textprep.h @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_U8_TEXTPREP_H +#define _SYS_U8_TEXTPREP_H + + + +#include <sys/isa_defs.h> +#include <sys/types.h> +#include <sys/errno.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Unicode encoding conversion functions and their macros. + */ +#define UCONV_IN_BIG_ENDIAN 0x0001 +#define UCONV_OUT_BIG_ENDIAN 0x0002 +#define UCONV_IN_SYSTEM_ENDIAN 0x0004 +#define UCONV_OUT_SYSTEM_ENDIAN 0x0008 +#define UCONV_IN_LITTLE_ENDIAN 0x0010 +#define UCONV_OUT_LITTLE_ENDIAN 0x0020 +#define UCONV_IGNORE_NULL 0x0040 +#define UCONV_IN_ACCEPT_BOM 0x0080 +#define UCONV_OUT_EMIT_BOM 0x0100 + +extern int uconv_u16tou32(const uint16_t *, size_t *, uint32_t *, size_t *, + int); +extern int uconv_u16tou8(const uint16_t *, size_t *, uchar_t *, size_t *, int); +extern int uconv_u32tou16(const uint32_t *, size_t *, uint16_t *, size_t *, + int); +extern int uconv_u32tou8(const uint32_t *, size_t *, uchar_t *, size_t *, int); +extern int uconv_u8tou16(const uchar_t *, size_t *, uint16_t *, size_t *, int); +extern int uconv_u8tou32(const uchar_t *, size_t *, uint32_t *, size_t *, int); + +/* + * UTF-8 text preparation functions and their macros.
+ * + * Among the macros defined, U8_CANON_DECOMP, U8_COMPAT_DECOMP, and + * U8_CANON_COMP are not public interfaces and must not be used directly + * as the flag input argument. + */ +#define U8_STRCMP_CS (0x00000001) +#define U8_STRCMP_CI_UPPER (0x00000002) +#define U8_STRCMP_CI_LOWER (0x00000004) + +#define U8_CANON_DECOMP (0x00000010) +#define U8_COMPAT_DECOMP (0x00000020) +#define U8_CANON_COMP (0x00000040) + +#define U8_STRCMP_NFD (U8_CANON_DECOMP) +#define U8_STRCMP_NFC (U8_CANON_DECOMP | U8_CANON_COMP) +#define U8_STRCMP_NFKD (U8_COMPAT_DECOMP) +#define U8_STRCMP_NFKC (U8_COMPAT_DECOMP | U8_CANON_COMP) + +#define U8_TEXTPREP_TOUPPER (U8_STRCMP_CI_UPPER) +#define U8_TEXTPREP_TOLOWER (U8_STRCMP_CI_LOWER) + +#define U8_TEXTPREP_NFD (U8_STRCMP_NFD) +#define U8_TEXTPREP_NFC (U8_STRCMP_NFC) +#define U8_TEXTPREP_NFKD (U8_STRCMP_NFKD) +#define U8_TEXTPREP_NFKC (U8_STRCMP_NFKC) + +#define U8_TEXTPREP_IGNORE_NULL (0x00010000) +#define U8_TEXTPREP_IGNORE_INVALID (0x00020000) +#define U8_TEXTPREP_NOWAIT (0x00040000) + +#define U8_UNICODE_320 (0) +#define U8_UNICODE_500 (1) +#define U8_UNICODE_LATEST (U8_UNICODE_500) + +#define U8_VALIDATE_ENTIRE (0x00100000) +#define U8_VALIDATE_CHECK_ADDITIONAL (0x00200000) +#define U8_VALIDATE_UCS2_RANGE (0x00400000) + +#define U8_ILLEGAL_CHAR (-1) +#define U8_OUT_OF_RANGE_CHAR (-2) + +extern int u8_validate(char *, size_t, char **, int, int *); +extern int u8_strcmp(const char *, const char *, size_t, int, size_t, int *); +extern size_t u8_textprep_str(char *, size_t *, char *, size_t *, int, size_t, + int *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_U8_TEXTPREP_H */ diff --git a/include/sys/u8_textprep_data.h b/include/sys/u8_textprep_data.h new file mode 100644 index 000000000000..03f71f26c9e1 --- /dev/null +++ b/include/sys/u8_textprep_data.h @@ -0,0 +1,35376 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * COPYRIGHT AND PERMISSION NOTICE + * + * Copyright (c) 1991-2006 Unicode, Inc. All rights reserved. Distributed under + * the Terms of Use in http://www.unicode.org/copyright.html.
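Using the prototypes above, a normalization-aware comparison looks like the following sketch. It assumes an environment that links an implementation of u8_strcmp() (the kernel, or libzpool in userland); a length argument of 0 compares the strings in their entirety:

#include <stdio.h>
#include <sys/u8_textprep.h>

int
main(void)
{
	/* "é" precomposed (U+00E9) vs. "e" + combining acute (U+0301). */
	const char *a = "\xc3\xa9";
	const char *b = "e\xcc\x81";
	int err = 0;

	/* Under NFD both strings normalize to the same byte sequence. */
	int r = u8_strcmp(a, b, 0, U8_STRCMP_NFD, U8_UNICODE_LATEST, &err);
	printf("NFD compare: %d (err=%d)\n", r, err);	/* expect 0, 0 */
	return (0);
}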
+ * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of the Unicode data files and any associated documentation (the + * "Data Files") or Unicode software and any associated documentation (the + * "Software") to deal in the Data Files or Software without restriction, + * including without limitation the rights to use, copy, modify, merge, + * publish, distribute, and/or sell copies of the Data Files or Software, and + * to permit persons to whom the Data Files or Software are furnished to do so, + * provided that (a) the above copyright notice(s) and this permission notice + * appear with all copies of the Data Files or Software, (b) both the above + * copyright notice(s) and this permission notice appear in associated + * documentation, and (c) there is clear notice in each modified Data File or + * in the Software as well as in the documentation associated with the Data + * File(s) or Software that the data or software has been modified. + * + * THE DATA FILES AND SOFTWARE ARE PROVIDED "AS IS", WITHOUT WARRANTY OF ANY + * KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF + * THIRD PARTY RIGHTS. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS + * INCLUDED IN THIS NOTICE BE LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR + * CONSEQUENTIAL DAMAGES, OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, + * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER + * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE + * OF THE DATA FILES OR SOFTWARE. + * + * Except as contained in this notice, the name of a copyright holder shall not + * be used in advertising or otherwise to promote the sale, use or other + * dealings in these Data Files or Software without prior written authorization + * of the copyright holder. + * + * Unicode and the Unicode logo are trademarks of Unicode, Inc., and may be + * registered in some jurisdictions. All other trademarks and registered + * trademarks mentioned herein are the property of their respective owners. + */ +/* + * This file has been modified by Sun Microsystems, Inc. + */ + +#ifndef _SYS_U8_TEXTPREP_DATA_H +#define _SYS_U8_TEXTPREP_DATA_H + + + +#include <sys/types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * To get to the combining class data, composition mappings, decomposition + * mappings, and case conversion mappings of Unicode, the data structures + * formulated and their meanings are as follows: + * + * Each UTF-8 character is seen as a 4-byte entity so that U+0061 (or 0x61 in + * UTF-8) would be seen as 0x00 0x00 0x00 0x61. Similarly, U+1D15E would be + * 0xF0 0x9D 0x85 0x9E in UTF-8. + * + * The first byte (MSB) value is an index to the b1_tbl, such as + * u8_common_b1_tbl and u8_composition_b1_tbl tables. A b1_tbl has + * indices to b2_tbl tables that have indices to b3_tbl. Each b3_tbl has + * either indices to b4_tbl, or indices to b4_tbl and base values for + * displacement calculations later by using the u8_displacement_t type + * below. Each b4_tbl then has indices to the final tables.
+ * + * As an example, if we have a character with the code value U+1D15E, which is + * 0xF0 0x9D 0x85 0x9E in UTF-8, the target decomposition character bytes + * that will be mapped by the mapping procedure would be the ones between + * the start_index and the end_index computed as follows: + * + * b2_tbl_id = u8_common_b1_tbl[0][0xF0]; + * b3_tbl_id = u8_decomp_b2_tbl[0][b2_tbl_id][0x9D]; + * b4_tbl_id = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].tbl_id; + * b4_base = u8_decomp_b3_tbl[0][b3_tbl_id][0x85].base; + * if (b4_tbl_id >= 0x8000) { + * b4_tbl_id -= 0x8000; + * start_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E]; + * end_index = u8_decomp_b4_16bit_tbl[0][b4_tbl_id][0x9E + 1]; + * } else { + * start_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E]; + * end_index = u8_decomp_b4_tbl[0][b4_tbl_id][0x9E + 1]; + * } + * + * The start_index and the end_index can be used to retrieve the bytes, + * possibly of multiple UTF-8 characters, from the final tables. + * + * The "[0]" above indicates this is for Unicode Version 3.2.0 data + * as of today. Correspondingly, "[1]" indicates another Unicode version's + * data, which is Unicode 5.0.0 as of today. + * + * The mapping procedures and the data structures are more or less alike + * among the different mappings. You might want to read u8_textprep.c + * for specific details. + * + * The tool programs created and used to generate the tables in this file are + * saved at PSARC/2007/149/materials/ as the tools.tar.gz file. + */ + +/* The following is a component type for the b4_tbl vectors. */ +typedef struct { + uint16_t tbl_id; + uint16_t base; +} u8_displacement_t; + +/* + * The U8_TBL_ELEMENT_NOT_DEF macro indicates a byte that is not defined or + * used. The U8_TBL_ELEMENT_FILLER indicates the end of a UTF-8 character at + * the final tables. + */ +#define U8_TBL_ELEMENT_NOT_DEF (0xff) +#define N_ U8_TBL_ELEMENT_NOT_DEF + +#define U8_TBL_ELEMENT_FILLER (0xf7) +#define FIL_ U8_TBL_ELEMENT_FILLER + +/* + * The common b1_tbl for combining class, decompositions, tolower, and + * toupper case conversion mappings.
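The 4-byte view of a character used by these tables is simply its UTF-8 encoding, front-padded with 0x00 for shorter sequences. The stand-alone helper below reproduces the U+1D15E -> 0xF0 0x9D 0x85 0x9E example from the comment above:

#include <stdio.h>
#include <stdint.h>

/*
 * Encode a Unicode scalar value as UTF-8 (at most 4 bytes), the form
 * used to index the b1/b2/b3/b4 tables in this file.
 */
static int
utf8_encode(uint32_t cp, uint8_t out[4])
{
	if (cp < 0x80) {
		out[0] = cp;
		return (1);
	}
	if (cp < 0x800) {
		out[0] = 0xC0 | (cp >> 6);
		out[1] = 0x80 | (cp & 0x3F);
		return (2);
	}
	if (cp < 0x10000) {
		out[0] = 0xE0 | (cp >> 12);
		out[1] = 0x80 | ((cp >> 6) & 0x3F);
		out[2] = 0x80 | (cp & 0x3F);
		return (3);
	}
	out[0] = 0xF0 | (cp >> 18);
	out[1] = 0x80 | ((cp >> 12) & 0x3F);
	out[2] = 0x80 | ((cp >> 6) & 0x3F);
	out[3] = 0x80 | (cp & 0x3F);
	return (4);
}

int
main(void)
{
	uint8_t b[4];
	int n = utf8_encode(0x1D15E, b);

	for (int i = 0; i < n; i++)
		printf("%02X ", b[i]);
	printf("\n");	/* prints: F0 9D 85 9E */
	return (0);
}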
+ */ +static const uchar_t u8_common_b1_tbl[2][256] = { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, +}; + +static const uchar_t u8_combining_class_b2_tbl[2][2][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 5, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 
N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 6, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, 5, N_, N_, N_, N_, 6, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 7, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 8, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const uchar_t u8_combining_class_b3_tbl[2][9][256] = { + { + { /* Third byte table 0. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 0, 1, N_, N_, + N_, N_, 2, N_, N_, N_, 3, 4, + N_, 5, N_, 6, 7, 8, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 1. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 9, 10, 11, 12, + 13, 14, 15, 16, 17, 18, N_, 19, + N_, 20, N_, 21, N_, 22, N_, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 2. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 32, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 33, N_, N_, 34, + N_, N_, 35, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 3. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, 36, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 4. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 37, N_, 38, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 5. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 39, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 40, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 6. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 41, 42, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 7. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 8. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + }, + { + { /* Third byte table 0. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 0, 1, N_, N_, + N_, N_, 2, N_, N_, N_, 3, 4, + 5, 6, N_, 7, 8, 9, N_, 10, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 1. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, N_, 21, + N_, 22, 23, 24, N_, 25, N_, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 2. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 35, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 36, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 37, N_, N_, 38, + N_, N_, 39, N_, 40, N_, N_, N_, + 41, N_, N_, N_, 42, 43, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 44, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 3. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, 45, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 4. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 46, N_, 47, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 5. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 48, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 6. 
*/ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 49, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 50, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 7. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 51, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { /* Third byte table 8. */ + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 52, 53, N_, + N_, 54, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + }, +}; + +/* + * Unlike other b4_tbl, the b4_tbl for combining class data has + * the combining class values not indices to the final tables. 
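 + *
 + * A minimal sketch of the intended lookup (assumptions: the walk is
 + * driven from u8_textprep.c, the companion b1/b2/b3 table names match
 + * the rest of this file, and N_ is the "not defined" marker used
 + * throughout; the helper below is illustrative only, not part of
 + * this file):
 + *
 + *	uchar_t
 + *	combining_class_sketch(size_t uv, uchar_t b1, uchar_t b2,
 + *	    uchar_t b3, uchar_t b4)
 + *	{
 + *		uint16_t i1, i2, i3;
 + *
 + *		i1 = u8_common_b1_tbl[uv][b1];
 + *		if (i1 == N_)
 + *			return (0);
 + *		i2 = u8_combining_class_b2_tbl[uv][i1][b2];
 + *		if (i2 == N_)
 + *			return (0);
 + *		i3 = u8_combining_class_b3_tbl[uv][i2][b3];
 + *		if (i3 == N_)
 + *			return (0);
 + *		return (u8_combining_class_b4_tbl[uv][i3][b4]);
 + *	}
 + *
 + * Here uv selects one of the two Unicode-version variants (the [2]
 + * dimension), b1..b4 are the bytes of the UTF-8 sequence, and the
 + * b4_tbl entry picked by the last byte is returned directly as the
 + * canonical combining class rather than as an index.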
+ */ +static const uchar_t u8_combining_class_b4_tbl[2][55][256] = { + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 232, 220, 220, + 220, 220, 232, 216, 220, 220, 220, 220, + 220, 202, 202, 220, 220, 220, 220, 202, + 202, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 1, 1, 1, 1, + 1, 220, 220, 220, 220, 230, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 240, 230, 220, + 220, 220, 230, 230, 230, 220, 220, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 234, 234, 233, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 230, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 230, 230, 230, 230, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 220, 230, 230, 230, 230, 220, 230, + 230, 230, 222, 220, 230, 230, 230, 230, + 230, 230, 0, 220, 220, 220, 220, 220, + 230, 230, 220, 230, 230, 222, 228, 230, + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 0, 20, 21, 22, 0, 23, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 24, 25, 0, 230, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 27, 28, 29, 30, 31, + 32, 33, 34, 230, 230, 220, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 35, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 230, 230, + 230, 230, 230, 230, 230, 0, 0, 230, + 230, 230, 230, 220, 230, 0, 0, 230, + 230, 0, 220, 230, 230, 220, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 36, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 220, 230, 230, 220, 230, 230, 220, + 220, 220, 230, 220, 220, 230, 220, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 220, 230, 220, 230, 220, 230, + 220, 230, 230, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 230, 220, 230, 230, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 17. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 84, 91, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 103, 103, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 107, 107, 107, 107, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 118, 118, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 122, 122, 122, 122, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 220, 220, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 220, 0, 220, + 0, 216, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 29. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 129, 130, 0, 132, 0, 0, 0, + 0, 0, 130, 130, 130, 130, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 130, 0, 230, 230, 9, 0, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 220, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 0, 9, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 228, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 1, 1, 230, 230, 230, 230, + 1, 1, 1, 230, 230, 0, 0, 0, + 0, 230, 0, 0, 0, 1, 1, 230, + 220, 230, 1, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 218, 228, 232, 222, 224, 224, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 26, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 41. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 216, 216, 1, + 1, 1, 0, 0, 0, 226, 216, 216, + 216, 216, 216, 0, 0, 0, 0, 0, + 0, 0, 0, 220, 220, 220, 220, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 42. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 220, 220, 220, 0, 0, 230, 230, 230, + 230, 230, 220, 220, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 230, 230, 230, 230, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 43. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 44. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 45. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 46. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 47. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 48. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 49. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 50. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 51. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 52. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 53. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 54. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 232, 220, 220, + 220, 220, 232, 216, 220, 220, 220, 220, + 220, 202, 202, 220, 220, 220, 220, 202, + 202, 220, 220, 220, 220, 220, 220, 220, + 220, 220, 220, 220, 1, 1, 1, 1, + 1, 220, 220, 220, 220, 230, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 240, 230, 220, + 220, 220, 230, 230, 230, 220, 220, 0, + 230, 230, 230, 220, 220, 220, 220, 230, + 232, 220, 220, 230, 233, 234, 234, 233, + 234, 234, 233, 230, 230, 230, 230, 230, + 230, 230, 230, 230, 230, 230, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 230, 230, 230, 230, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 220, 230, 230, 230, 230, 220, 230, + 230, 230, 222, 220, 230, 230, 230, 230, + 230, 230, 220, 220, 220, 220, 220, 220, + 230, 230, 220, 230, 230, 222, 228, 230, + 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 19, 20, 21, 22, 0, 23, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 24, 25, 0, 230, 220, 0, 18, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 230, 230, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 27, 28, 29, 30, 31, + 32, 33, 34, 230, 230, 220, 220, 230, + 230, 230, 230, 230, 220, 230, 230, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 35, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 230, 230, + 230, 230, 230, 230, 230, 0, 0, 230, + 230, 230, 230, 220, 230, 0, 0, 230, + 230, 0, 220, 230, 230, 220, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 36, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 220, 230, 230, 220, 230, 230, 220, + 220, 220, 230, 220, 220, 230, 220, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 220, 230, 220, 230, 220, 230, + 220, 230, 230, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 230, 230, 230, 230, 230, + 230, 230, 220, 230, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 230, 220, 230, 230, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 14. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 84, 91, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 9, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 26. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 103, 103, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 107, 107, 107, 107, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 118, 118, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 122, 122, 122, 122, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 220, 220, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 220, 0, 220, + 0, 216, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 129, 130, 0, 132, 0, 0, 0, + 0, 0, 130, 130, 130, 130, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 130, 0, 230, 230, 9, 0, 230, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 220, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 0, 9, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 37. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 38. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 230, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 228, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 222, 230, 220, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 41. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 230, + 220, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 42. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 43. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 230, 220, 230, 230, 230, + 230, 230, 230, 230, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 44. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 220, 230, 230, 230, 230, 230, + 230, 230, 220, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 230, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 45. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 1, 1, 230, 230, 230, 230, + 1, 1, 1, 230, 230, 0, 0, 0, + 0, 230, 0, 0, 0, 1, 1, 230, + 220, 230, 1, 1, 220, 220, 220, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 46. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 218, 228, 232, 222, 224, 224, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 47. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 48. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 49. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 26, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 50. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 230, 230, 230, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 51. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 220, 0, 230, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 230, 1, 220, 0, 0, 0, 0, 9, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 52. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 216, 216, 1, + 1, 1, 0, 0, 0, 226, 216, 216, + 216, 216, 216, 0, 0, 0, 0, 0, + 0, 0, 0, 220, 220, 220, 220, 220, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 53. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 220, 220, 220, 0, 0, 230, 230, 230, + 230, 230, 220, 220, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 230, 230, 230, 230, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + { /* Fourth byte table 54. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 230, 230, 230, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + }, + }, +}; + +static const uchar_t u8_composition_b1_tbl[2][256] = { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, +}; + +static const uchar_t u8_composition_b2_tbl[2][1][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 
N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const u8_displacement_t u8_composition_b3_tbl[2][5][256] = { + { + { /* Third byte table 0. 
*/ + { 0x8000, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 2470 }, + { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 }, + { 3, 3061 }, { 4, 3212 }, { 5, 3226 }, + { N_, 0 }, { 6, 3270 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x8002, 3277 }, + { 7, 3774 }, { 8, 3949 }, { 9, 4198 }, + { N_, 0 }, { 10, 4265 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 11, 4293 }, { 12, 4312 }, { N_, 0 }, + { 13, 4326 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 14, 4347 }, + { N_, 0 }, { N_, 0 }, { 15, 4374 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 16, 4391 }, + { 17, 4416 }, { 18, 4425 }, { N_, 0 }, + { 19, 4451 }, { 20, 4460 }, { 21, 4469 }, + { N_, 0 }, { 22, 4503 }, { N_, 0 }, + { 23, 4529 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 4563 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 25, 4572 }, { 26, 4588 }, + { 27, 4620 }, { 28, 4666 }, { 0x8003, 4682 }, + { 0x8004, 5254 }, { 29, 5616 }, { 30, 5646 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 31, 5684 }, + { 32, 5708 }, { 33, 5732 }, { 34, 5780 }, + { 35, 5900 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 36, 6012 }, { 37, 6241 }, { 38, 6358 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { 0x8000, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 2470 }, + { 0x8001, 2491 }, { 1, 2871 }, { 2, 2959 }, + { 3, 3061 }, { 4, 3212 }, { 5, 3226 }, + { N_, 0 }, { 6, 3270 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x8002, 3277 }, + { 7, 3774 }, { 8, 3949 }, { 9, 4198 }, + { N_, 0 }, { 10, 4265 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 11, 4293 }, { 12, 4312 }, { N_, 0 }, + { 13, 4326 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 14, 4347 }, + { N_, 0 }, { N_, 0 }, { 15, 4374 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 16, 4391 }, + { 17, 4416 }, { 18, 4425 }, { N_, 0 }, + { 19, 4451 }, { 20, 4460 }, { 21, 4469 }, + { N_, 0 }, { 22, 4503 }, { N_, 0 }, + { 23, 4529 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 4563 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 25, 4572 }, { 26, 4662 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 27, 4671 }, { 28, 4687 }, + { 29, 4719 }, { 30, 4765 }, { 0x8003, 4781 }, + { 0x8004, 5353 }, { 31, 5715 }, { 32, 5745 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 33, 5783 }, + { 34, 5807 }, { 35, 5831 }, { 36, 5879 }, + { 37, 5999 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 38, 6111 }, { 39, 6340 }, { 40, 6457 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_composition_b4_tbl[2][41][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 73, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 15, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 38, 46, 46, 46, 46, + 46, 54, 62, 62, 62, 62, 62, 62, + 62, 70, 78, 86, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 36, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 108, 144, 144, 144, 144, 144, 144, 144, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 14, 22, 30, 30, 30, 30, 30, 37, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 15, 15, 70, 70, + 70, 70, 112, 133, 154, 154, 154, 162, + 162, 162, 162, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 20, 20, 20, 27, 27, 46, 59, + 66, 91, 91, 98, 98, 98, 98, 105, + 105, 105, 105, 105, 130, 130, 130, 130, + 137, 137, 137, 137, 144, 144, 151, 151, + 151, 164, 164, 164, 171, 171, 190, 203, + 210, 235, 235, 242, 242, 242, 242, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 25, 25, 25, 25, + 32, 32, 32, 32, 39, 39, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 60, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 21, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 14, 14, 14, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 9, 9, 9, 9, 9, 9, 9, + 9, 18, 18, 18, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 25, + 25, 25, 25, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 25, 25, 25, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 16, + 16, 16, 16, 24, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 38, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 16, + 16, 16, 16, 16, 16, 16, 16, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 8, 8, + 8, 16, 16, 16, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 32, 32, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 16, 16, + 16, 24, 24, 24, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 40, 40, 48, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 64, 72, 72, 72, 80, + 88, 88, 88, 96, 104, 112, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 24, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 56, 56, 56, 56, 56, + 56, 64, 72, 72, 80, 80, 80, 80, + 80, 80, 80, 88, 96, 104, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 36. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 18, 18, 27, 27, + 36, 36, 45, 45, 54, 54, 63, 63, + 72, 72, 81, 81, 90, 90, 99, 99, + 108, 108, 117, 117, 117, 126, 126, 135, + 135, 144, 144, 144, 144, 144, 144, 144, + 161, 161, 161, 178, 178, 178, 195, 195, + 195, 212, 212, 212, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, + }, + { /* Fourth byte table 37. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 18, + 18, 18, 18, 18, 27, 27, 36, 36, + 45, 45, 54, 54, 63, 63, 72, 72, + 81, 81, 90, 90, 99, 99, 108, 108, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 9, 9, 18, 18, 27, + 27, 36, 36, 36, 36, 36, 36, 36, + 53, 53, 53, 70, 70, 70, 87, 87, + 87, 104, 104, 104, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 130, 139, 148, 157, 157, 157, 157, 157, + 157, 157, 157, 157, 157, 157, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, + }, + { /* Fourth byte table 39. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 58, 58, 58, 58, + 58, 58, 58, 58, 58, 58, 58, 58, + 58, 58, 58, 73, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 15, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 38, 46, 46, 46, 46, + 46, 54, 62, 62, 62, 62, 62, 62, + 62, 70, 78, 86, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 94, 94, 94, 94, 94, 94, 94, 94, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, 102, 102, 102, 102, 102, 102, 102, + 102, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 36, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 108, 144, 144, 144, 144, 144, 144, 144, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, 151, 151, 151, 151, 151, 151, 151, + 151, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 14, 22, 30, 30, 30, 30, 30, 37, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 15, 15, 15, 15, 70, 70, + 70, 70, 112, 133, 154, 154, 154, 162, + 162, 162, 162, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 20, 20, 20, 27, 27, 46, 59, + 66, 91, 91, 98, 98, 98, 98, 105, + 105, 105, 105, 105, 130, 130, 130, 130, + 137, 137, 137, 137, 144, 144, 151, 151, + 151, 164, 164, 164, 171, 171, 190, 203, + 210, 235, 235, 242, 242, 242, 242, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, 249, 249, 249, 249, 249, 249, 249, + 249, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 25, 25, 25, 25, + 32, 32, 32, 32, 39, 39, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 53, 53, 53, + 53, 53, 53, 53, 53, 60, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 21, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 14, 14, 14, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 9, 9, 9, 9, 9, 9, 9, + 9, 18, 18, 18, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 25, + 25, 25, 25, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 17, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 25, 25, 25, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 34, 34, + 34, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 18, 18, 27, 27, 36, 36, 45, 45, + 45, 45, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 63, 63, 72, 72, 81, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 16, + 16, 16, 16, 24, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 38, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 16, + 16, 16, 16, 16, 16, 16, 16, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 8, 8, 16, 16, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 8, 8, + 8, 16, 16, 16, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 32, 32, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 8, 8, 16, 16, + 16, 24, 24, 24, 24, 24, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 40, 40, 48, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 64, 72, 72, 72, 80, + 88, 88, 88, 96, 104, 112, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, 120, 120, 120, 120, 120, 120, 120, + 120, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 8, 16, 16, 16, 24, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 40, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 56, 56, 56, 56, 56, + 56, 64, 72, 72, 80, 80, 80, 80, + 80, 80, 80, 88, 96, 104, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 9, 9, 9, 9, 18, 18, 27, 27, + 36, 36, 45, 45, 54, 54, 63, 63, + 72, 72, 81, 81, 90, 90, 99, 99, + 108, 108, 117, 117, 117, 126, 126, 135, + 135, 144, 144, 144, 144, 144, 144, 144, + 161, 161, 161, 178, 178, 178, 195, 195, + 195, 212, 212, 212, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, 229, 229, 229, 229, 229, 229, 229, + 229, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 18, + 18, 18, 18, 18, 27, 27, 36, 36, + 45, 45, 54, 54, 63, 63, 72, 72, + 81, 81, 90, 90, 99, 99, 108, 108, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, 117, 117, 117, 117, 117, 117, 117, + 117, + }, + { /* Fourth byte table 40. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 9, 9, 9, 18, 18, 27, + 27, 36, 36, 36, 36, 36, 36, 36, + 53, 53, 53, 70, 70, 70, 87, 87, + 87, 104, 104, 104, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 121, 121, 121, 121, 121, 121, 121, 121, + 130, 139, 148, 157, 157, 157, 157, 157, + 157, 157, 157, 157, 157, 157, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, 166, 166, 166, 166, 166, 166, 166, + 166, + }, + }, +}; + +static const uint16_t u8_composition_b4_16bit_tbl[2][5][257] = { + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 24, + 24, 24, 124, 146, 177, 219, 327, 335, + 379, 427, 521, 528, 562, 602, 624, 683, + 782, 797, 797, 849, 894, 941, 1061, 1076, + 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233, + 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544, + 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899, + 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299, + 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 29, 36, 43, 56, + 64, 64, 64, 93, 93, 93, 93, 93, + 101, 101, 101, 101, 101, 130, 151, 158, + 158, 165, 165, 165, 165, 190, 190, 190, + 190, 190, 190, 219, 219, 226, 233, 246, + 254, 254, 254, 283, 283, 283, 283, 283, + 291, 291, 291, 291, 291, 320, 341, 348, + 348, 355, 355, 355, 355, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 49, 49, 49, 49, 77, 77, + 112, 112, 160, 160, 160, 160, 160, 160, + 188, 188, 196, 196, 196, 196, 237, 237, + 237, 237, 272, 272, 272, 280, 280, 288, + 288, 288, 344, 344, 344, 344, 372, 372, + 414, 414, 469, 469, 469, 469, 469, 469, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 29, 58, 66, 74, 82, 90, 98, + 106, 135, 164, 172, 180, 188, 196, 204, + 212, 227, 242, 242, 242, 242, 242, 242, + 242, 257, 272, 272, 272, 272, 272, 272, + 272, 301, 330, 338, 346, 354, 362, 370, + 378, 407, 436, 444, 452, 460, 468, 476, + 484, 506, 528, 528, 528, 528, 528, 528, + 528, 550, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 45, 60, 60, 60, 60, 60, 60, + 60, 82, 104, 104, 104, 104, 104, 104, + 104, 104, 126, 126, 126, 126, 126, 126, + 126, 155, 184, 192, 200, 208, 216, 224, + 232, 261, 290, 298, 306, 314, 322, 330, + 338, 346, 346, 346, 346, 354, 354, 354, + 354, 354, 354, 354, 354, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, + }, + }, + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 8, 16, 24, + 24, 24, 124, 146, 177, 219, 327, 335, + 379, 427, 521, 528, 562, 602, 624, 683, + 782, 797, 797, 849, 894, 941, 1061, 1076, + 1118, 1133, 1193, 1233, 1233, 1233, 1233, 1233, + 1233, 1233, 1333, 1355, 1386, 1428, 1536, 1544, + 1588, 1643, 1731, 1744, 1778, 1818, 1840, 1899, + 1998, 2013, 2013, 2065, 2110, 2164, 2284, 2299, + 2348, 2363, 2430, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, 2470, 2470, 2470, 2470, 2470, 2470, 2470, + 2470, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 29, 29, 36, 43, 56, + 64, 64, 64, 93, 93, 93, 93, 93, + 101, 101, 101, 101, 101, 130, 151, 158, + 158, 165, 165, 165, 165, 190, 190, 190, + 190, 190, 190, 219, 219, 226, 233, 246, + 254, 254, 254, 283, 283, 283, 283, 283, + 291, 291, 291, 291, 291, 320, 341, 348, + 348, 355, 355, 355, 355, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, 380, 380, 380, 380, 380, 380, 380, + 380, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 49, 49, 49, 49, 77, 77, + 112, 112, 160, 160, 160, 160, 160, 160, + 188, 188, 196, 196, 196, 196, 237, 237, + 237, 237, 272, 272, 272, 280, 280, 288, + 288, 288, 344, 344, 344, 344, 372, 372, + 414, 414, 469, 469, 469, 469, 469, 469, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, 497, 497, 497, 497, 497, 497, 497, + 497, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 29, 58, 66, 74, 82, 90, 98, + 106, 135, 164, 172, 180, 188, 196, 204, + 212, 227, 242, 242, 242, 242, 242, 242, + 242, 257, 272, 272, 272, 272, 272, 272, + 272, 301, 330, 338, 346, 354, 362, 370, + 378, 407, 436, 444, 452, 460, 468, 476, + 484, 506, 528, 528, 528, 528, 528, 528, + 528, 550, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, 572, 572, 572, 572, 572, 572, 572, + 572, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 30, 30, 30, 30, 30, 30, + 30, 45, 60, 60, 60, 60, 60, 60, + 60, 82, 104, 104, 104, 104, 104, 104, + 104, 104, 126, 126, 126, 126, 126, 126, + 126, 155, 184, 192, 200, 208, 216, 224, + 232, 261, 290, 298, 306, 314, 322, 330, + 338, 346, 346, 346, 346, 354, 354, 354, + 354, 354, 354, 354, 354, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, 362, 362, 362, 362, 362, 362, 362, + 362, + }, + }, +}; + +static const uchar_t u8_composition_final_tbl[2][6623] = { + { + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_, + 0x10, 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, + 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x8F, FIL_, + 0xC8, 0x80, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x82, + FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x81, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x83, FIL_, + 0xC3, 0x83, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, + 0xA0, FIL_, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, + FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x82, FIL_, 0xCC, + 0x84, FIL_, 0xC4, 0x80, FIL_, 0xCC, 0x88, FIL_, + 0xC3, 0x84, FIL_, 0xCC, 0x8A, FIL_, 0xC3, 0x85, + FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBA, 0xA2, FIL_, 0xCC, 0x8C, + FIL_, 0xC7, 0x8D, FIL_, 0x03, 0xCC, 0x87, FIL_, + 0xE1, 0xB8, 0x82, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB8, 0x86, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87, + FIL_, 0xCC, 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, + 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, 0x87, FIL_, + 0xC4, 0x8A, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x88, + FIL_, 0x06, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8E, + FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, + 0x87, FIL_, 0xE1, 0xB8, 0x8A, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB8, 0x8C, FIL_, 0x11, 0xCC, 0x80, FIL_, 0xC3, + 0x88, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x89, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0x8A, FIL_, 0xCC, 0x88, + FIL_, 0xC3, 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC8, + 0xA8, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x86, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x84, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, 0xB0, FIL_, + 0xE1, 0xB8, 0x9A, FIL_, 0xCC, 0xAD, FIL_, 0xE1, + 0xB8, 0x98, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, + 0xBC, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, + FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x92, FIL_, 0xCC, + 0x86, FIL_, 0xC4, 0x94, FIL_, 0xCC, 0x87, FIL_, + 0xC4, 0x96, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9A, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07, + 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x87, + FIL_, 0xC4, 0xA0, FIL_, 0xCC, 0x84, FIL_, 0xE1, + 0xB8, 0xA0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9C, + FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, + 0xA7, FIL_, 0xC4, 0xA2, FIL_, 0xCC, 0x86, FIL_, + 0xC4, 0x9E, FIL_, 0x07, 0xCC, 0xAE, FIL_, 
0xE1, + 0xB8, 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, + 0xA2, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xA4, FIL_, + 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0xA8, FIL_, 0xCC, + 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0xA4, FIL_, 0x0F, 0xCC, 0x84, FIL_, 0xC4, + 0xAA, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, + 0xCC, 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x83, + FIL_, 0xC4, 0xA8, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x8D, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, 0x86, + FIL_, 0xC4, 0xAC, FIL_, 0xCC, 0x91, FIL_, 0xC8, + 0x8A, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, + 0x87, FIL_, 0xC4, 0xB0, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0xB0, FIL_, 0xE1, + 0xB8, 0xAC, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, + FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_, + 0x05, 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB8, 0xB0, FIL_, 0xCC, 0xA7, FIL_, + 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB2, FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xC4, 0xBB, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xB8, 0xB6, FIL_, 0xCC, 0xAD, FIL_, + 0xE1, 0xB8, 0xBC, FIL_, 0xCC, 0x81, FIL_, 0xC4, + 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8, + 0xBE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x82, + FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, FIL_, + 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB8, FIL_, 0xCC, + 0xAD, FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x84, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0x83, FIL_, 0xC3, + 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, + FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, + 0xA7, FIL_, 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, + 0xC5, 0x87, FIL_, 0x10, 0xCC, 0xA8, FIL_, 0xC7, + 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x8E, FIL_, + 0xCC, 0x80, FIL_, 0xC3, 0x92, FIL_, 0xCC, 0x9B, + FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x8F, FIL_, 0xC8, + 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, + 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, 0xCC, 0x8C, + FIL_, 0xC7, 0x91, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0x8C, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, + FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, FIL_, 0xCC, + 0x83, FIL_, 0xC3, 0x95, FIL_, 0xCC, 0x86, FIL_, + 0xC5, 0x8E, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, + FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x8E, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0x91, + FIL_, 0xC8, 0x92, FIL_, 0xCC, 0xA7, FIL_, 0xC5, + 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x98, FIL_, + 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x9E, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0x9A, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x98, FIL_, 0xCC, 0x81, FIL_, + 0xC5, 0x94, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x90, + FIL_, 0x07, 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, + 0xCC, 0x82, FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, + FIL_, 0xC5, 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, + 0xA0, FIL_, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC, + 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0xAE, FIL_, 0xCC, 0xA6, FIL_, 0xC8, + 0x9A, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAA, FIL_, 0xCC, + 0xAD, FIL_, 0xE1, 0xB9, 0xB0, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xB9, 0xAC, FIL_, 0x13, 0xCC, 0xA8, + FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x83, FIL_, 
0xC5, + 0xA8, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAA, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0x9A, FIL_, 0xCC, 0x86, + FIL_, 0xC5, 0xAC, FIL_, 0xCC, 0x8A, FIL_, 0xC5, + 0xAE, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, FIL_, + 0xCC, 0x91, FIL_, 0xC8, 0x96, FIL_, 0xCC, 0x8B, + FIL_, 0xC5, 0xB0, FIL_, 0xCC, 0xA4, FIL_, 0xE1, + 0xB9, 0xB2, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, + 0xB4, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC, + 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0xCC, 0x82, FIL_, + 0xC3, 0x9B, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x9C, + FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x93, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0x02, 0xCC, 0x83, + FIL_, 0xE1, 0xB9, 0xBC, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xB9, 0xBE, FIL_, 0x06, 0xCC, 0x82, FIL_, + 0xC5, 0xB4, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xBA, + 0x84, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, 0x86, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x82, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBA, 0x80, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0xCC, 0x88, + FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0x09, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0xB4, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, + FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB2, FIL_, 0xCC, + 0x82, FIL_, 0xC5, 0xB6, FIL_, 0xCC, 0x88, FIL_, + 0xC5, 0xB8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, + 0xB2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xB8, + FIL_, 0x06, 0xCC, 0x87, FIL_, 0xC5, 0xBB, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xBA, 0x94, FIL_, 0xCC, 0x82, FIL_, 0xE1, + 0xBA, 0x90, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9, + FIL_, 0x10, 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x81, FIL_, 0xCC, 0xA8, + FIL_, 0xC4, 0x85, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBA, 0xA1, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, + FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA3, FIL_, + 0xCC, 0x84, FIL_, 0xC4, 0x81, FIL_, 0xCC, 0x91, + FIL_, 0xC8, 0x83, FIL_, 0xCC, 0x8A, FIL_, 0xC3, + 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xA4, FIL_, + 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, 0xCC, 0x82, + FIL_, 0xC3, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xC3, + 0xA1, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xA0, FIL_, + 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, 0xCC, 0xA5, + FIL_, 0xE1, 0xB8, 0x81, FIL_, 0x03, 0xCC, 0xB1, + FIL_, 0xE1, 0xB8, 0x87, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0x83, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4, + 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xC3, 0xA7, FIL_, + 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0x8D, FIL_, 0xCC, 0x81, FIL_, 0xC4, + 0x87, FIL_, 0x06, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, + 0x93, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x8B, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, + 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, 0xCC, + 0xA7, FIL_, 0xE1, 0xB8, 0x91, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0x8F, FIL_, 0x11, 0xCC, 0xA8, FIL_, + 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x9B, + FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC, + 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xB0, FIL_, 0xE1, + 0xB8, 0x9B, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0x93, + FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, FIL_, + 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xBD, FIL_, 0xCC, + 0x86, FIL_, 0xC4, 0x95, FIL_, 0xCC, 0xA7, FIL_, + 0xC8, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, + 0xBB, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0xA9, FIL_, 0xCC, 
0x91, + FIL_, 0xC8, 0x87, FIL_, 0xCC, 0x80, FIL_, 0xC3, + 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, + 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_, + 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC, + 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, FIL_, + 0xC7, 0xB5, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0x9D, + FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, + 0x8C, FIL_, 0xC7, 0xA7, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xB8, 0xA1, FIL_, 0x08, 0xCC, 0x8C, FIL_, + 0xC8, 0x9F, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA5, + FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xBA, 0x96, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0xA7, FIL_, + 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xAE, FIL_, 0xE1, + 0xB8, 0xAB, FIL_, 0x0E, 0xCC, 0x81, FIL_, 0xC3, + 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xAC, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, 0xCC, + 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBB, 0x89, FIL_, 0xCC, 0x91, FIL_, 0xC8, + 0x8B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x89, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0xB0, + FIL_, 0xE1, 0xB8, 0xAD, FIL_, 0xCC, 0xA8, FIL_, + 0xC4, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0xAD, + FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, + 0x83, FIL_, 0xC4, 0xA9, FIL_, 0xCC, 0x88, FIL_, + 0xC3, 0xAF, FIL_, 0x02, 0xCC, 0x82, FIL_, 0xC4, + 0xB5, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xB0, FIL_, + 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0xB3, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB1, FIL_, 0xCC, + 0xA7, FIL_, 0xC4, 0xB7, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, + 0xB5, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB7, FIL_, 0xCC, 0x81, FIL_, 0xC4, 0xBA, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x8C, + FIL_, 0xC4, 0xBE, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB8, 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, + 0xBD, FIL_, 0x03, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, + 0x83, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, + FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x81, FIL_, + 0x09, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, FIL_, + 0xCC, 0x83, FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0x84, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x86, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0x88, FIL_, 0xCC, 0x80, FIL_, + 0xC7, 0xB9, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xB3, + FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC, + 0x87, FIL_, 0xC8, 0xAF, FIL_, 0xCC, 0x8F, FIL_, + 0xC8, 0x8D, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, + 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0x92, FIL_, 0xCC, 0x86, + FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5, + 0x91, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_, + 0xCC, 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0xA8, + FIL_, 0xC7, 0xAB, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0xB1, + FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0x95, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9B, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0x99, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x93, FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, + FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x99, FIL_, + 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0xCC, 0x81, + FIL_, 0xC5, 0x9B, FIL_, 0xCC, 0x87, FIL_, 
0xE1, + 0xB9, 0xA1, FIL_, 0xCC, 0x82, FIL_, 0xC5, 0x9D, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, 0x08, 0xCC, + 0x88, FIL_, 0xE1, 0xBA, 0x97, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, + 0xE1, 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xAD, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0xAB, FIL_, 0xCC, 0xA6, + FIL_, 0xC8, 0x9B, FIL_, 0x13, 0xCC, 0x81, FIL_, + 0xC3, 0xBA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, + FIL_, 0xCC, 0x83, FIL_, 0xC5, 0xA9, FIL_, 0xCC, + 0x8F, FIL_, 0xC8, 0x95, FIL_, 0xCC, 0xA8, FIL_, + 0xC5, 0xB3, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB, + FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xA4, FIL_, 0xE1, + 0xB9, 0xB3, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, + 0xA7, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB5, + FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB7, FIL_, + 0xCC, 0x9B, FIL_, 0xC6, 0xB0, FIL_, 0xCC, 0x84, + FIL_, 0xC5, 0xAB, FIL_, 0xCC, 0x8B, FIL_, 0xC5, + 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, 0x8A, + FIL_, 0xC5, 0xAF, FIL_, 0x02, 0xCC, 0x83, FIL_, + 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x82, FIL_, 0xC5, + 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0x81, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, FIL_, + 0xCC, 0x88, FIL_, 0xE1, 0xBA, 0x85, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x87, FIL_, 0xCC, 0x8A, FIL_, + 0xE1, 0xBA, 0x98, FIL_, 0x02, 0xCC, 0x87, FIL_, + 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1, + 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1, + 0xBA, 0x8F, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, + 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, + FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB5, FIL_, 0xCC, + 0x82, FIL_, 0xC5, 0xB7, FIL_, 0xCC, 0x84, FIL_, + 0xC8, 0xB3, FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, + 0x99, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0xBF, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0xBD, FIL_, 0x06, 0xCC, + 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0xCC, 0x87, FIL_, + 0xC5, 0xBC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xBA, + 0x95, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x93, + FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xBA, FIL_, 0xCC, + 0x82, FIL_, 0xE1, 0xBA, 0x91, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, + 0xBA, 0xA8, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, + 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA4, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC, + 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, + 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, + 0xBA, 0xBE, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, + 0x80, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x84, + FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_, + 0x04, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x96, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0x94, FIL_, 0x03, 0xCC, 0x84, + FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xB9, 0x8C, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, + 0x8E, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, 
FIL_, + 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC, + 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97, + FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, 0x01, 0xCC, + 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x84, FIL_, + 0xC7, 0xA3, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0xBD, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89, + FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, 0x01, 0xCC, + 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0x95, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBB, 0x91, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, + 0xB9, 0x8D, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, + FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, 0x8F, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC, + 0x81, FIL_, 0xC7, 0x98, FIL_, 0xCC, 0x84, FIL_, + 0xC7, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x9A, + FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, FIL_, 0x04, + 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xB0, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, 0x83, + FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBA, 0xB2, FIL_, 0x04, 0xCC, 0x80, FIL_, + 0xE1, 0xBA, 0xB1, FIL_, 0xCC, 0x83, FIL_, 0xE1, + 0xBA, 0xB5, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, + 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3, + FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_, + 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xB9, 0x91, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x93, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBB, 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0xA2, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, + 0xA0, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x9E, + FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0x05, 0xCC, 0x83, + FIL_, 0xE1, 0xBB, 0xAE, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0xB0, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0xAC, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, + 0xA8, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAA, + FIL_, 0x05, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB1, + FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0xAB, FIL_, 0x01, 0xCC, 0x8C, + FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 
0xC7, + 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0, + FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC, + 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBC, 0x89, FIL_, 0xCC, 0x81, FIL_, 0xCE, + 0x86, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, + 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, FIL_, 0xCC, + 0x86, FIL_, 0xE1, 0xBE, 0xB8, FIL_, 0x04, 0xCC, + 0x81, FIL_, 0xCE, 0x88, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1, + 0xBC, 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0x88, FIL_, 0x05, 0xCC, 0x94, FIL_, 0xE1, 0xBC, + 0xA9, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x8C, FIL_, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0x07, 0xCC, 0x81, + FIL_, 0xCE, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xCE, + 0xAA, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98, + FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0x99, FIL_, + 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB8, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0xB9, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0x04, 0xCC, 0x94, + FIL_, 0xE1, 0xBD, 0x89, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBF, 0xB8, FIL_, 0xCC, 0x81, FIL_, 0xCE, + 0x8C, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x88, + FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC, + FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0x8E, FIL_, + 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBD, 0x99, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0xAA, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x88, FIL_, 0xCE, + 0xAB, FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0xBA, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8F, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, 0x93, + FIL_, 0xE1, 0xBD, 0xA8, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCC, 0x81, + FIL_, 0xCE, 0xAC, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0xB0, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, + 0x80, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBE, 0xB6, FIL_, + 0xCC, 0x86, FIL_, 0xE1, 0xBE, 0xB0, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, + FIL_, 0xE1, 0xBE, 0xB1, FIL_, 0x04, 0xCC, 0x81, + FIL_, 0xCE, 0xAD, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBC, 0x91, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xB2, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0x90, + FIL_, 0x06, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB4, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBC, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, + 0xBC, 0xA0, FIL_, 0x08, 0xCD, 0x82, FIL_, 0xE1, + 0xBF, 0x96, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, + 0x90, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x84, + FIL_, 0xE1, 0xBF, 0x91, FIL_, 0xCC, 0x88, FIL_, + 0xCF, 0x8A, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xB6, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xCF, 0x8C, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, FIL_, + 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0x80, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, 0x02, 0xCC, + 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94, + FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 
0x93, + FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x94, FIL_, + 0xE1, 0xBD, 0x91, FIL_, 0xCC, 0x86, FIL_, 0xE1, + 0xBF, 0xA0, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, + 0xA6, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, + 0xCC, 0x81, FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x88, + FIL_, 0xCF, 0x8B, FIL_, 0x06, 0xCC, 0x94, FIL_, + 0xE1, 0xBD, 0xA1, FIL_, 0xCD, 0x85, FIL_, 0xE1, + 0xBF, 0xB3, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xBC, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xB6, + FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, 0xA0, FIL_, + 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, 0x03, 0xCD, + 0x82, FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x90, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, + 0xBF, 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0xA7, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_, + 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC, + 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xD3, 0x90, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x92, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_, + 0x03, 0xCC, 0x86, FIL_, 0xD3, 0x96, FIL_, 0xCC, + 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x88, FIL_, + 0xD0, 0x81, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3, + 0x9C, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x81, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04, + 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, 0xCC, 0x88, + FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, FIL_, 0xD0, + 0x99, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC, + 0x86, FIL_, 0xD0, 0x8E, FIL_, 0xCC, 0x8B, FIL_, + 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB0, + FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_, + 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC, + 0x88, FIL_, 0xD1, 0x91, FIL_, 0xCC, 0x86, FIL_, + 0xD3, 0x97, FIL_, 0x02, 0xCC, 0x88, FIL_, 0xD3, + 0x9D, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x82, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04, + 0xCC, 0x88, FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x86, + FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x80, FIL_, 0xD1, + 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xA3, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC, + 0x84, FIL_, 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, + 0xD1, 0x9E, FIL_, 0xCC, 0x8B, FIL_, 0xD3, 0xB3, + FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_, + 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, + 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7, + FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94, + FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x93, FIL_, 0xD8, + 0xA2, FIL_, 0xD9, 0x95, FIL_, 0xD8, 0xA5, FIL_, + 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01, + 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9, + 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94, + FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_, + 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_, + 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 
0xBC, + FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4, + 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0, + 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0, + 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03, + 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_, + 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_, + 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_, + 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94, + FIL_, 0x02, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF, + 0x8A, FIL_, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF, + 0x8C, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, + 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_, + 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95, + FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3, + 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0xE0, 0xB3, + 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3, + 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0x01, 0xE0, + 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02, + 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_, + 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_, + 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B, + FIL_, 0x03, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7, + 0x9C, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7, + 0x9A, FIL_, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7, + 0x9E, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, + 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_, + 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB8, 0xB8, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB8, 0xB9, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB9, 0x9C, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xE1, 0xB9, 0x9D, FIL_, 0x01, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0xA8, FIL_, 0x01, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0xA9, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xE1, 0xBA, 0xB6, FIL_, 0xCC, 0x82, FIL_, 0xE1, + 0xBA, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xE1, + 0xBA, 0xB7, FIL_, 0xCC, 0x82, FIL_, 0xE1, 0xBA, + 0xAD, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x86, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x87, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x98, FIL_, 0x01, 0xCC, 0x82, FIL_, 0xE1, 0xBB, + 0x99, FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBC, + 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x84, + FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x80, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x86, FIL_, 0x04, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x87, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0x85, FIL_, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x81, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x83, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x84, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x85, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x86, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x87, FIL_, 0x04, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0x8A, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, + FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8D, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x8B, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0x8F, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x89, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8A, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8B, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8C, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8D, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8E, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x8F, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x92, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0x94, FIL_, 0x02, 0xCC, 0x80, + FIL_, 0xE1, 0xBC, 0x93, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBC, 0x95, FIL_, 0x02, 0xCC, 0x80, 
FIL_, + 0xE1, 0xBC, 0x9A, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBC, 0x9C, FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0x9D, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x90, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA4, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0x04, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0xA5, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBC, 0xA7, FIL_, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x91, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x92, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x93, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x94, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x95, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x96, FIL_, 0x01, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0x97, FIL_, 0x04, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0xAE, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBC, 0xAC, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0x98, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAA, + FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xAF, + FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x99, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAD, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0xAB, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9A, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9B, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9C, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9D, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9E, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0x9F, FIL_, 0x03, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0xB4, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBC, 0xB2, FIL_, 0x03, 0xCC, 0x81, FIL_, + 0xE1, 0xBC, 0xB5, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0xB7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, + 0xB3, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0xBC, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBA, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBE, FIL_, + 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xBB, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xBF, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0xBD, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0x82, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBD, 0x84, FIL_, 0x02, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0x83, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0x85, FIL_, 0x02, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0x8C, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0x8A, FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0x8D, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0x8B, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBD, + 0x94, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x96, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x92, FIL_, + 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x97, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0x93, FIL_, 0x03, 0xCC, + 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBD, 0x9F, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0x9B, FIL_, 0x04, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0xA4, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0xA2, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, + 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA0, + FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA7, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xA5, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA1, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0xA3, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA2, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA3, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA4, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA5, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA6, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA7, FIL_, 0x04, 0xCC, + 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, 0xCC, 
0x80, + FIL_, 0xE1, 0xBD, 0xAA, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBD, 0xAE, FIL_, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xA8, FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0xAD, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0xA9, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAF, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAB, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAA, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAB, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAC, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAD, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAE, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xAF, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB2, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x82, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB2, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xB7, FIL_, + 0x03, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8D, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBF, 0x8E, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x87, FIL_, 0x01, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0xB7, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0x9D, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBF, 0x9E, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x86, 0x9A, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x86, 0x9B, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x86, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x87, 0x8D, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x87, 0x8F, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x87, 0x8E, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0x8C, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0xA4, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x88, 0xA6, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x87, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB0, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB1, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB4, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB5, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB8, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x89, 0xB9, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x80, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x81, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA0, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA1, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x84, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x85, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x88, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0x89, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA2, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xA3, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAD, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAE, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8A, 0xAF, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAA, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAB, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAC, FIL_, 0x01, 0xCC, 0xB8, FIL_, + 0xE2, 0x8B, 0xAD, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0x94, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0x8C, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8E, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x90, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x92, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 
0x81, + 0x94, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0x96, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0x98, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0x9A, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0x9C, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9E, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA0, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA2, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0xA5, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0xA7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0xA9, FIL_, 0x02, 0xE3, 0x82, 0x9A, + FIL_, 0xE3, 0x81, 0xB1, FIL_, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0xB0, FIL_, 0x02, 0xE3, 0x82, + 0x9A, FIL_, 0xE3, 0x81, 0xB4, FIL_, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0xB3, FIL_, 0x02, 0xE3, + 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB7, FIL_, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB6, FIL_, 0x02, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB9, FIL_, + 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBA, FIL_, + 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xBC, + FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xBD, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, + 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x82, 0xAC, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0xAE, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x82, 0xB0, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB2, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB4, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB6, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, + 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x82, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x82, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0xBE, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0x80, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0x82, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x85, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x87, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0x89, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0x90, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, + 0x83, 0x91, FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0x93, FIL_, 0xE3, 0x82, 0x9A, FIL_, + 0xE3, 0x83, 0x94, FIL_, 0x02, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x83, 0x96, FIL_, 0xE3, 0x82, 0x9A, + FIL_, 0xE3, 0x83, 0x97, FIL_, 0x02, 0xE3, 0x82, + 0x9A, FIL_, 0xE3, 0x83, 0x9A, FIL_, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0x99, FIL_, 0x02, 0xE3, + 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9D, FIL_, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0x9C, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB7, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0xB8, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0xB9, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0xBA, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0xBE, FIL_, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, + }, + { + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAE, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA0, FIL_, + 0x01, 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAF, FIL_, + 0x10, 0xCC, 0xA5, FIL_, 0xE1, 0xB8, 0x80, FIL_, + 0xCC, 0x87, FIL_, 0xC8, 0xA6, FIL_, 0xCC, 0x83, + FIL_, 0xC3, 0x83, FIL_, 0xCC, 0x91, FIL_, 0xC8, + 0x82, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 
0x80, FIL_, + 0xCC, 0x8A, FIL_, 0xC3, 0x85, FIL_, 0xCC, 0x88, + FIL_, 0xC3, 0x84, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBA, 0xA2, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, + 0xA0, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8D, FIL_, + 0xCC, 0x80, FIL_, 0xC3, 0x80, FIL_, 0xCC, 0x81, + FIL_, 0xC3, 0x81, FIL_, 0xCC, 0x82, FIL_, 0xC3, + 0x82, FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x84, FIL_, + 0xCC, 0x86, FIL_, 0xC4, 0x82, FIL_, 0xCC, 0x84, + FIL_, 0xC4, 0x80, FIL_, 0x03, 0xCC, 0xB1, FIL_, + 0xE1, 0xB8, 0x86, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0x82, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0x84, FIL_, 0x05, 0xCC, 0xA7, FIL_, 0xC3, 0x87, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8C, FIL_, 0xCC, + 0x81, FIL_, 0xC4, 0x86, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0x88, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x8A, + FIL_, 0x06, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x90, + FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8E, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0x8E, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB8, 0x92, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xB8, 0x8C, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0x8A, FIL_, 0x11, 0xCC, 0x84, FIL_, 0xC4, + 0x92, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x94, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xB8, FIL_, 0xCC, + 0x91, FIL_, 0xC8, 0x86, FIL_, 0xCC, 0x82, FIL_, + 0xC3, 0x8A, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x84, + FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x98, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xBA, FIL_, 0xCC, + 0xA7, FIL_, 0xC8, 0xA8, FIL_, 0xCC, 0x8C, FIL_, + 0xC4, 0x9A, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x88, + FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x98, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBA, 0xBC, FIL_, 0xCC, 0x87, + FIL_, 0xC4, 0x96, FIL_, 0xCC, 0x81, FIL_, 0xC3, + 0x89, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8B, FIL_, + 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9A, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9E, FIL_, 0x07, + 0xCC, 0x8C, FIL_, 0xC7, 0xA6, FIL_, 0xCC, 0x86, + FIL_, 0xC4, 0x9E, FIL_, 0xCC, 0x82, FIL_, 0xC4, + 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC4, 0xA2, FIL_, + 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xA0, FIL_, 0xCC, + 0x81, FIL_, 0xC7, 0xB4, FIL_, 0xCC, 0x87, FIL_, + 0xC4, 0xA0, FIL_, 0x07, 0xCC, 0x87, FIL_, 0xE1, + 0xB8, 0xA2, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, + 0xA8, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xA4, FIL_, + 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA6, FIL_, 0xCC, + 0x8C, FIL_, 0xC8, 0x9E, FIL_, 0xCC, 0xAE, FIL_, + 0xE1, 0xB8, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB8, 0xA4, FIL_, 0x0F, 0xCC, 0xB0, FIL_, 0xE1, + 0xB8, 0xAC, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x8F, + FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x8C, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x88, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBB, 0x8A, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x8A, FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x8F, + FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x8E, FIL_, 0xCC, + 0x81, FIL_, 0xC3, 0x8D, FIL_, 0xCC, 0x83, FIL_, + 0xC4, 0xA8, FIL_, 0xCC, 0x87, FIL_, 0xC4, 0xB0, + FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x88, FIL_, 0xCC, + 0xA8, FIL_, 0xC4, 0xAE, FIL_, 0xCC, 0x86, FIL_, + 0xC4, 0xAC, FIL_, 0xCC, 0x84, FIL_, 0xC4, 0xAA, + FIL_, 0x01, 0xCC, 0x82, FIL_, 0xC4, 0xB4, FIL_, + 0x05, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xB0, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0xA8, FIL_, 0xCC, 0xB1, + FIL_, 0xE1, 0xB8, 0xB4, FIL_, 0xCC, 0xA7, FIL_, + 0xC4, 0xB6, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB2, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB6, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0xBD, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBC, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB8, 0xBA, FIL_, 0xCC, 0xA7, + FIL_, 0xC4, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC4, + 0xB9, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xB8, + 0xBE, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x80, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 
0x82, FIL_, + 0x09, 0xCC, 0x83, FIL_, 0xC3, 0x91, FIL_, 0xCC, + 0x81, FIL_, 0xC5, 0x83, FIL_, 0xCC, 0xA7, FIL_, + 0xC5, 0x85, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0x87, + FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x84, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x86, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB9, 0x88, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB9, 0x8A, FIL_, 0xCC, 0x80, FIL_, + 0xC7, 0xB8, FIL_, 0x10, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0x8E, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0x8C, + FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x94, FIL_, 0xCC, + 0x86, FIL_, 0xC5, 0x8E, FIL_, 0xCC, 0x83, FIL_, + 0xC3, 0x95, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0x90, + FIL_, 0xCC, 0x88, FIL_, 0xC3, 0x96, FIL_, 0xCC, + 0x9B, FIL_, 0xC6, 0xA0, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x8E, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x91, + FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x8C, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0x8C, FIL_, 0xCC, 0x80, + FIL_, 0xC3, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7, + 0xAA, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAE, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0x93, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x96, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x94, FIL_, 0x08, 0xCC, 0xA7, + FIL_, 0xC5, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC5, + 0x98, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x92, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x90, FIL_, 0xCC, 0x81, + FIL_, 0xC5, 0x94, FIL_, 0xCC, 0x87, FIL_, 0xE1, + 0xB9, 0x98, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, + 0x9E, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x9A, + FIL_, 0x07, 0xCC, 0xA6, FIL_, 0xC8, 0x98, FIL_, + 0xCC, 0x81, FIL_, 0xC5, 0x9A, FIL_, 0xCC, 0x82, + FIL_, 0xC5, 0x9C, FIL_, 0xCC, 0xA7, FIL_, 0xC5, + 0x9E, FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA0, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA0, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB9, 0xA2, FIL_, 0x07, 0xCC, + 0xA6, FIL_, 0xC8, 0x9A, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xB9, 0xAA, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xAC, FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, + 0xAE, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB0, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0xA2, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0xA4, FIL_, 0x13, 0xCC, 0x8A, + FIL_, 0xC5, 0xAE, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0x9C, FIL_, 0xCC, 0x8B, FIL_, 0xC5, 0xB0, FIL_, + 0xCC, 0xAD, FIL_, 0xE1, 0xB9, 0xB6, FIL_, 0xCC, + 0xA8, FIL_, 0xC5, 0xB2, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0x93, FIL_, 0xCC, 0x80, FIL_, 0xC3, 0x99, + FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x94, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0xA4, FIL_, 0xCC, 0xA4, + FIL_, 0xE1, 0xB9, 0xB2, FIL_, 0xCC, 0x81, FIL_, + 0xC3, 0x9A, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0x9B, + FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB9, 0xB4, FIL_, + 0xCC, 0x83, FIL_, 0xC5, 0xA8, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xA6, FIL_, 0xCC, 0x84, FIL_, + 0xC5, 0xAA, FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x96, + FIL_, 0xCC, 0x86, FIL_, 0xC5, 0xAC, FIL_, 0xCC, + 0x9B, FIL_, 0xC6, 0xAF, FIL_, 0x02, 0xCC, 0xA3, + FIL_, 0xE1, 0xB9, 0xBE, FIL_, 0xCC, 0x83, FIL_, + 0xE1, 0xB9, 0xBC, FIL_, 0x06, 0xCC, 0x88, FIL_, + 0xE1, 0xBA, 0x84, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBA, 0x82, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, + 0x80, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x88, + FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB4, FIL_, 0xCC, + 0x87, FIL_, 0xE1, 0xBA, 0x86, FIL_, 0x02, 0xCC, + 0x88, FIL_, 0xE1, 0xBA, 0x8C, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x8A, FIL_, 0x09, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xB6, FIL_, 0xCC, 0xA3, FIL_, + 0xE1, 0xBB, 0xB4, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBB, 0xB2, FIL_, 0xCC, 0x88, FIL_, 0xC5, 0xB8, + FIL_, 0xCC, 0x81, FIL_, 0xC3, 0x9D, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0xB8, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x8E, FIL_, 0xCC, 0x84, FIL_, + 0xC8, 0xB2, FIL_, 0xCC, 0x82, FIL_, 
0xC5, 0xB6, + FIL_, 0x06, 0xCC, 0x82, FIL_, 0xE1, 0xBA, 0x90, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0x92, FIL_, + 0xCC, 0xB1, FIL_, 0xE1, 0xBA, 0x94, FIL_, 0xCC, + 0x8C, FIL_, 0xC5, 0xBD, FIL_, 0xCC, 0x87, FIL_, + 0xC5, 0xBB, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0xB9, + FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, 0xBA, 0xA1, + FIL_, 0xCC, 0xA8, FIL_, 0xC4, 0x85, FIL_, 0xCC, + 0x81, FIL_, 0xC3, 0xA1, FIL_, 0xCC, 0x82, FIL_, + 0xC3, 0xA2, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, + 0xA3, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xA3, FIL_, + 0xCC, 0x8C, FIL_, 0xC7, 0x8E, FIL_, 0xCC, 0x8A, + FIL_, 0xC3, 0xA5, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0xA4, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xA7, FIL_, + 0xCC, 0x91, FIL_, 0xC8, 0x83, FIL_, 0xCC, 0xA5, + FIL_, 0xE1, 0xB8, 0x81, FIL_, 0xCC, 0x84, FIL_, + 0xC4, 0x81, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x81, + FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x83, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0xA0, FIL_, 0x03, 0xCC, 0xA3, + FIL_, 0xE1, 0xB8, 0x85, FIL_, 0xCC, 0x87, FIL_, + 0xE1, 0xB8, 0x83, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB8, 0x87, FIL_, 0x05, 0xCC, 0x87, FIL_, 0xC4, + 0x8B, FIL_, 0xCC, 0x8C, FIL_, 0xC4, 0x8D, FIL_, + 0xCC, 0x82, FIL_, 0xC4, 0x89, FIL_, 0xCC, 0x81, + FIL_, 0xC4, 0x87, FIL_, 0xCC, 0xA7, FIL_, 0xC3, + 0xA7, FIL_, 0x06, 0xCC, 0x87, FIL_, 0xE1, 0xB8, + 0x8B, FIL_, 0xCC, 0xA7, FIL_, 0xE1, 0xB8, 0x91, + FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0x8F, FIL_, + 0xCC, 0xA3, FIL_, 0xE1, 0xB8, 0x8D, FIL_, 0xCC, + 0x8C, FIL_, 0xC4, 0x8F, FIL_, 0xCC, 0xAD, FIL_, + 0xE1, 0xB8, 0x93, FIL_, 0x11, 0xCC, 0x80, FIL_, + 0xC3, 0xA8, FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xA9, + FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xAA, FIL_, 0xCC, + 0x88, FIL_, 0xC3, 0xAB, FIL_, 0xCC, 0x84, FIL_, + 0xC4, 0x93, FIL_, 0xCC, 0x86, FIL_, 0xC4, 0x95, + FIL_, 0xCC, 0x87, FIL_, 0xC4, 0x97, FIL_, 0xCC, + 0xA8, FIL_, 0xC4, 0x99, FIL_, 0xCC, 0x8C, FIL_, + 0xC4, 0x9B, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x85, + FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x87, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBA, 0xB9, FIL_, 0xCC, 0xA7, + FIL_, 0xC8, 0xA9, FIL_, 0xCC, 0x83, FIL_, 0xE1, + 0xBA, 0xBD, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, + 0xBB, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0x99, + FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0x9B, FIL_, + 0x01, 0xCC, 0x87, FIL_, 0xE1, 0xB8, 0x9F, FIL_, + 0x07, 0xCC, 0x86, FIL_, 0xC4, 0x9F, FIL_, 0xCC, + 0x87, FIL_, 0xC4, 0xA1, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0x9D, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xB8, + 0xA1, FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0xA7, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xA3, FIL_, 0xCC, 0x81, + FIL_, 0xC7, 0xB5, FIL_, 0x08, 0xCC, 0xA7, FIL_, + 0xE1, 0xB8, 0xA9, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xBA, 0x96, FIL_, 0xCC, 0x8C, FIL_, 0xC8, 0x9F, + FIL_, 0xCC, 0xAE, FIL_, 0xE1, 0xB8, 0xAB, FIL_, + 0xCC, 0x88, FIL_, 0xE1, 0xB8, 0xA7, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xB8, 0xA5, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB8, 0xA3, FIL_, 0xCC, 0x82, FIL_, + 0xC4, 0xA5, FIL_, 0x0E, 0xCC, 0x88, FIL_, 0xC3, + 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x89, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0x8B, FIL_, + 0xCC, 0x82, FIL_, 0xC3, 0xAE, FIL_, 0xCC, 0x81, + FIL_, 0xC3, 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xC3, + 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xC4, 0xA9, FIL_, + 0xCC, 0x84, FIL_, 0xC4, 0xAB, FIL_, 0xCC, 0x86, + FIL_, 0xC4, 0xAD, FIL_, 0xCC, 0xA8, FIL_, 0xC4, + 0xAF, FIL_, 0xCC, 0xB0, FIL_, 0xE1, 0xB8, 0xAD, + FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x90, FIL_, 0xCC, + 0x91, FIL_, 0xC8, 0x8B, FIL_, 0xCC, 0x8F, FIL_, + 0xC8, 0x89, FIL_, 0x02, 0xCC, 0x8C, FIL_, 0xC7, + 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC4, 0xB5, FIL_, + 0x05, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xB5, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xB7, FIL_, 
0xCC, 0x8C, + FIL_, 0xC7, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xB8, 0xB1, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB3, FIL_, 0x06, 0xCC, 0xA3, FIL_, 0xE1, 0xB8, + 0xB7, FIL_, 0xCC, 0xAD, FIL_, 0xE1, 0xB8, 0xBD, + FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB8, 0xBB, FIL_, + 0xCC, 0xA7, FIL_, 0xC4, 0xBC, FIL_, 0xCC, 0x81, + FIL_, 0xC4, 0xBA, FIL_, 0xCC, 0x8C, FIL_, 0xC4, + 0xBE, FIL_, 0x03, 0xCC, 0x87, FIL_, 0xE1, 0xB9, + 0x81, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x83, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xBF, FIL_, + 0x09, 0xCC, 0x80, FIL_, 0xC7, 0xB9, FIL_, 0xCC, + 0xAD, FIL_, 0xE1, 0xB9, 0x8B, FIL_, 0xCC, 0x83, + FIL_, 0xC3, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0x84, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0x87, + FIL_, 0xCC, 0xB1, FIL_, 0xE1, 0xB9, 0x89, FIL_, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0x85, FIL_, 0xCC, + 0xA7, FIL_, 0xC5, 0x86, FIL_, 0xCC, 0x8C, FIL_, + 0xC5, 0x88, FIL_, 0x10, 0xCC, 0xA3, FIL_, 0xE1, + 0xBB, 0x8D, FIL_, 0xCC, 0x87, FIL_, 0xC8, 0xAF, + FIL_, 0xCC, 0x80, FIL_, 0xC3, 0xB2, FIL_, 0xCC, + 0x91, FIL_, 0xC8, 0x8F, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBB, 0x8F, FIL_, 0xCC, 0x88, FIL_, 0xC3, + 0xB6, FIL_, 0xCC, 0x83, FIL_, 0xC3, 0xB5, FIL_, + 0xCC, 0x81, FIL_, 0xC3, 0xB3, FIL_, 0xCC, 0x8C, + FIL_, 0xC7, 0x92, FIL_, 0xCC, 0xA8, FIL_, 0xC7, + 0xAB, FIL_, 0xCC, 0x9B, FIL_, 0xC6, 0xA1, FIL_, + 0xCC, 0x84, FIL_, 0xC5, 0x8D, FIL_, 0xCC, 0x86, + FIL_, 0xC5, 0x8F, FIL_, 0xCC, 0x8B, FIL_, 0xC5, + 0x91, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xB4, FIL_, + 0xCC, 0x8F, FIL_, 0xC8, 0x8D, FIL_, 0x02, 0xCC, + 0x87, FIL_, 0xE1, 0xB9, 0x97, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0x95, FIL_, 0x08, 0xCC, 0x8C, + FIL_, 0xC5, 0x99, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0x9B, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x95, + FIL_, 0xCC, 0xA7, FIL_, 0xC5, 0x97, FIL_, 0xCC, + 0xB1, FIL_, 0xE1, 0xB9, 0x9F, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0x99, FIL_, 0xCC, 0x91, FIL_, + 0xC8, 0x93, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x91, + FIL_, 0x07, 0xCC, 0xA7, FIL_, 0xC5, 0x9F, FIL_, + 0xCC, 0x82, FIL_, 0xC5, 0x9D, FIL_, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA1, FIL_, 0xCC, 0xA6, FIL_, + 0xC8, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC5, 0x9B, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, 0xA3, FIL_, + 0xCC, 0x8C, FIL_, 0xC5, 0xA1, FIL_, 0x08, 0xCC, + 0xA6, FIL_, 0xC8, 0x9B, FIL_, 0xCC, 0xAD, FIL_, + 0xE1, 0xB9, 0xB1, FIL_, 0xCC, 0xB1, FIL_, 0xE1, + 0xB9, 0xAF, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xB9, + 0xAD, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xAB, + FIL_, 0xCC, 0x8C, FIL_, 0xC5, 0xA5, FIL_, 0xCC, + 0xA7, FIL_, 0xC5, 0xA3, FIL_, 0xCC, 0x88, FIL_, + 0xE1, 0xBA, 0x97, FIL_, 0x13, 0xCC, 0x8A, FIL_, + 0xC5, 0xAF, FIL_, 0xCC, 0x8F, FIL_, 0xC8, 0x95, + FIL_, 0xCC, 0x8C, FIL_, 0xC7, 0x94, FIL_, 0xCC, + 0x80, FIL_, 0xC3, 0xB9, FIL_, 0xCC, 0x9B, FIL_, + 0xC6, 0xB0, FIL_, 0xCC, 0x82, FIL_, 0xC3, 0xBB, + FIL_, 0xCC, 0x81, FIL_, 0xC3, 0xBA, FIL_, 0xCC, + 0x88, FIL_, 0xC3, 0xBC, FIL_, 0xCC, 0x83, FIL_, + 0xC5, 0xA9, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, + 0xA7, FIL_, 0xCC, 0x84, FIL_, 0xC5, 0xAB, FIL_, + 0xCC, 0x86, FIL_, 0xC5, 0xAD, FIL_, 0xCC, 0xAD, + FIL_, 0xE1, 0xB9, 0xB7, FIL_, 0xCC, 0x8B, FIL_, + 0xC5, 0xB1, FIL_, 0xCC, 0xA8, FIL_, 0xC5, 0xB3, + FIL_, 0xCC, 0x91, FIL_, 0xC8, 0x97, FIL_, 0xCC, + 0xA4, FIL_, 0xE1, 0xB9, 0xB3, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBB, 0xA5, FIL_, 0xCC, 0xB0, FIL_, + 0xE1, 0xB9, 0xB5, FIL_, 0x02, 0xCC, 0x83, FIL_, + 0xE1, 0xB9, 0xBD, FIL_, 0xCC, 0xA3, FIL_, 0xE1, + 0xB9, 0xBF, FIL_, 0x07, 0xCC, 0x8A, FIL_, 0xE1, + 0xBA, 0x98, FIL_, 0xCC, 0x87, FIL_, 0xE1, 0xBA, + 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0x83, + FIL_, 0xCC, 0x82, FIL_, 0xC5, 0xB5, 
FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBA, 0x81, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBA, 0x89, FIL_, 0xCC, 0x88, FIL_, + 0xE1, 0xBA, 0x85, FIL_, 0x02, 0xCC, 0x87, FIL_, + 0xE1, 0xBA, 0x8B, FIL_, 0xCC, 0x88, FIL_, 0xE1, + 0xBA, 0x8D, FIL_, 0x0A, 0xCC, 0x87, FIL_, 0xE1, + 0xBA, 0x8F, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, + 0xB5, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0xB7, + FIL_, 0xCC, 0x8A, FIL_, 0xE1, 0xBA, 0x99, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xB3, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBB, 0xB9, FIL_, 0xCC, 0x88, + FIL_, 0xC3, 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xC3, + 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xB3, FIL_, + 0xCC, 0x82, FIL_, 0xC5, 0xB7, FIL_, 0x06, 0xCC, + 0xB1, FIL_, 0xE1, 0xBA, 0x95, FIL_, 0xCC, 0xA3, + FIL_, 0xE1, 0xBA, 0x93, FIL_, 0xCC, 0x82, FIL_, + 0xE1, 0xBA, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xC5, + 0xBA, FIL_, 0xCC, 0x87, FIL_, 0xC5, 0xBC, FIL_, + 0xCC, 0x8C, FIL_, 0xC5, 0xBE, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0xAD, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x81, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x85, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1, + 0xBA, 0xAA, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, + 0xA4, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA8, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBA, 0xA6, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC7, 0x9E, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBA, FIL_, 0x02, 0xCC, + 0x84, FIL_, 0xC7, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xC7, 0xBC, FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, + 0xB8, 0x88, FIL_, 0x04, 0xCC, 0x83, FIL_, 0xE1, + 0xBB, 0x84, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBB, + 0x80, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x82, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xBE, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0xAE, FIL_, + 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x90, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x92, FIL_, 0xCC, + 0x89, FIL_, 0xE1, 0xBB, 0x94, FIL_, 0xCC, 0x83, + FIL_, 0xE1, 0xBB, 0x96, FIL_, 0x03, 0xCC, 0x84, + FIL_, 0xC8, 0xAC, FIL_, 0xCC, 0x88, FIL_, 0xE1, + 0xB9, 0x8E, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xB9, + 0x8C, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAA, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xC7, 0xBE, FIL_, + 0x04, 0xCC, 0x80, FIL_, 0xC7, 0x9B, FIL_, 0xCC, + 0x84, FIL_, 0xC7, 0x95, FIL_, 0xCC, 0x8C, FIL_, + 0xC7, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x97, + FIL_, 0x04, 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xA5, + FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBA, 0xAB, FIL_, + 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xA9, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBA, 0xA7, FIL_, 0x01, 0xCC, + 0x84, FIL_, 0xC7, 0x9F, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xC7, 0xBB, FIL_, 0x02, 0xCC, 0x81, FIL_, + 0xC7, 0xBD, FIL_, 0xCC, 0x84, FIL_, 0xC7, 0xA3, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x89, + FIL_, 0x04, 0xCC, 0x89, FIL_, 0xE1, 0xBB, 0x83, + FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0x85, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0x81, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBA, 0xBF, FIL_, 0x01, 0xCC, + 0x81, FIL_, 0xE1, 0xB8, 0xAF, FIL_, 0x04, 0xCC, + 0x80, FIL_, 0xE1, 0xBB, 0x93, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBB, 0x91, FIL_, 0xCC, 0x83, FIL_, + 0xE1, 0xBB, 0x97, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0x95, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, + 0xB9, 0x8D, FIL_, 0xCC, 0x88, FIL_, 0xE1, 0xB9, + 0x8F, FIL_, 0xCC, 0x84, FIL_, 0xC8, 0xAD, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xAB, FIL_, 0x01, + 0xCC, 0x81, FIL_, 0xC7, 0xBF, FIL_, 0x04, 0xCC, + 0x8C, FIL_, 0xC7, 0x9A, FIL_, 0xCC, 0x84, FIL_, + 0xC7, 0x96, FIL_, 0xCC, 0x80, FIL_, 0xC7, 0x9C, + FIL_, 0xCC, 0x81, FIL_, 0xC7, 0x98, FIL_, 0x04, + 0xCC, 0x81, FIL_, 0xE1, 0xBA, 0xAE, FIL_, 0xCC, + 0x83, FIL_, 0xE1, 0xBA, 0xB4, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBA, 0xB2, FIL_, 0xCC, 
0x80, FIL_, + 0xE1, 0xBA, 0xB0, FIL_, 0x04, 0xCC, 0x83, FIL_, + 0xE1, 0xBA, 0xB5, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBA, 0xB1, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBA, + 0xAF, FIL_, 0xCC, 0x89, FIL_, 0xE1, 0xBA, 0xB3, + FIL_, 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x96, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x94, FIL_, + 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xB8, 0x95, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xB8, 0x97, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xB9, 0x90, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xB9, 0x92, FIL_, 0x02, 0xCC, + 0x81, FIL_, 0xE1, 0xB9, 0x93, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xB9, 0x91, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA4, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA5, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA6, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xB9, 0xA7, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB8, FIL_, 0x01, 0xCC, 0x81, + FIL_, 0xE1, 0xB9, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBA, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xE1, 0xB9, 0xBB, FIL_, 0x01, 0xCC, 0x87, + FIL_, 0xE1, 0xBA, 0x9B, FIL_, 0x05, 0xCC, 0x80, + FIL_, 0xE1, 0xBB, 0x9C, FIL_, 0xCC, 0x89, FIL_, + 0xE1, 0xBB, 0x9E, FIL_, 0xCC, 0x83, FIL_, 0xE1, + 0xBB, 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, + 0x9A, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA2, + FIL_, 0x05, 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xA1, + FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xA3, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0x9B, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBB, 0x9D, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0x9F, FIL_, 0x05, 0xCC, 0x81, + FIL_, 0xE1, 0xBB, 0xA8, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBB, 0xAA, FIL_, 0xCC, 0x89, FIL_, 0xE1, + 0xBB, 0xAC, FIL_, 0xCC, 0x83, FIL_, 0xE1, 0xBB, + 0xAE, FIL_, 0xCC, 0xA3, FIL_, 0xE1, 0xBB, 0xB0, + FIL_, 0x05, 0xCC, 0x80, FIL_, 0xE1, 0xBB, 0xAB, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBB, 0xA9, FIL_, + 0xCC, 0x83, FIL_, 0xE1, 0xBB, 0xAF, FIL_, 0xCC, + 0xA3, FIL_, 0xE1, 0xBB, 0xB1, FIL_, 0xCC, 0x89, + FIL_, 0xE1, 0xBB, 0xAD, FIL_, 0x01, 0xCC, 0x8C, + FIL_, 0xC7, 0xAE, FIL_, 0x01, 0xCC, 0x84, FIL_, + 0xC7, 0xAC, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, + 0xAD, FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA0, + FIL_, 0x01, 0xCC, 0x84, FIL_, 0xC7, 0xA1, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9C, FIL_, + 0x01, 0xCC, 0x86, FIL_, 0xE1, 0xB8, 0x9D, FIL_, + 0x01, 0xCC, 0x84, FIL_, 0xC8, 0xB0, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xC8, 0xB1, FIL_, 0x01, 0xCC, + 0x8C, FIL_, 0xC7, 0xAF, FIL_, 0x07, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0x88, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x86, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBE, + 0xB8, FIL_, 0xCC, 0x84, FIL_, 0xE1, 0xBE, 0xB9, + FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x89, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xBC, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBE, 0xBA, FIL_, 0x04, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0x99, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x88, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x88, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBC, + 0x98, FIL_, 0x05, 0xCD, 0x85, FIL_, 0xE1, 0xBF, + 0x8C, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x89, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x8A, FIL_, 0xCC, + 0x93, FIL_, 0xE1, 0xBC, 0xA8, FIL_, 0xCC, 0x94, + FIL_, 0xE1, 0xBC, 0xA9, FIL_, 0x07, 0xCC, 0x80, + FIL_, 0xE1, 0xBF, 0x9A, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xBF, 0x99, FIL_, 0xCC, 0x93, FIL_, 0xE1, + 0xBC, 0xB8, FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBC, + 0xB9, FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0x98, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0x8A, FIL_, 0xCC, + 0x88, FIL_, 0xCE, 0xAA, FIL_, 0x04, 0xCC, 0x81, + FIL_, 0xCE, 0x8C, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBD, 0x89, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0x88, FIL_, 0xCC, 0x80, FIL_, 0xE1, 
0xBF, 0xB8, + FIL_, 0x01, 0xCC, 0x94, FIL_, 0xE1, 0xBF, 0xAC, + FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x99, + FIL_, 0xCC, 0x86, FIL_, 0xE1, 0xBF, 0xA8, FIL_, + 0xCC, 0x88, FIL_, 0xCE, 0xAB, FIL_, 0xCC, 0x84, + FIL_, 0xE1, 0xBF, 0xA9, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x8E, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0xAA, FIL_, 0x05, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0xA8, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xBC, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0xBA, FIL_, + 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0xA9, FIL_, 0xCC, + 0x81, FIL_, 0xCE, 0x8F, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0xB4, FIL_, 0x01, 0xCD, 0x85, + FIL_, 0xE1, 0xBF, 0x84, FIL_, 0x08, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0xB3, FIL_, 0xCC, 0x84, FIL_, + 0xE1, 0xBE, 0xB1, FIL_, 0xCC, 0x86, FIL_, 0xE1, + 0xBE, 0xB0, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, + 0xB0, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAC, FIL_, + 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0x81, FIL_, 0xCC, + 0x93, FIL_, 0xE1, 0xBC, 0x80, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBE, 0xB6, FIL_, 0x04, 0xCC, 0x93, + FIL_, 0xE1, 0xBC, 0x90, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0xB2, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBC, 0x91, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAD, + FIL_, 0x06, 0xCC, 0x94, FIL_, 0xE1, 0xBC, 0xA1, + FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAE, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBF, 0x83, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x86, FIL_, 0xCC, 0x93, FIL_, + 0xE1, 0xBC, 0xA0, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0xB4, FIL_, 0x08, 0xCC, 0x88, FIL_, 0xCF, + 0x8A, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xAF, FIL_, + 0xCC, 0x93, FIL_, 0xE1, 0xBC, 0xB0, FIL_, 0xCC, + 0x94, FIL_, 0xE1, 0xBC, 0xB1, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0xB6, FIL_, 0xCC, 0x86, FIL_, + 0xE1, 0xBF, 0x90, FIL_, 0xCC, 0x84, FIL_, 0xE1, + 0xBF, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, + 0x96, FIL_, 0x04, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0x80, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xB8, + FIL_, 0xCC, 0x94, FIL_, 0xE1, 0xBD, 0x81, FIL_, + 0xCC, 0x81, FIL_, 0xCF, 0x8C, FIL_, 0x02, 0xCC, + 0x93, FIL_, 0xE1, 0xBF, 0xA4, FIL_, 0xCC, 0x94, + FIL_, 0xE1, 0xBF, 0xA5, FIL_, 0x08, 0xCC, 0x81, + FIL_, 0xCF, 0x8D, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBD, 0x91, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, + 0xA6, FIL_, 0xCC, 0x88, FIL_, 0xCF, 0x8B, FIL_, + 0xCC, 0x84, FIL_, 0xE1, 0xBF, 0xA1, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0xBA, FIL_, 0xCC, 0x93, + FIL_, 0xE1, 0xBD, 0x90, FIL_, 0xCC, 0x86, FIL_, + 0xE1, 0xBF, 0xA0, FIL_, 0x06, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0xBC, FIL_, 0xCC, 0x94, FIL_, 0xE1, + 0xBD, 0xA1, FIL_, 0xCC, 0x93, FIL_, 0xE1, 0xBD, + 0xA0, FIL_, 0xCC, 0x81, FIL_, 0xCF, 0x8E, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB3, FIL_, 0xCD, + 0x82, FIL_, 0xE1, 0xBF, 0xB6, FIL_, 0x03, 0xCC, + 0x80, FIL_, 0xE1, 0xBF, 0x92, FIL_, 0xCD, 0x82, + FIL_, 0xE1, 0xBF, 0x97, FIL_, 0xCC, 0x81, FIL_, + 0xCE, 0x90, FIL_, 0x03, 0xCD, 0x82, FIL_, 0xE1, + 0xBF, 0xA7, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBF, + 0xA2, FIL_, 0xCC, 0x81, FIL_, 0xCE, 0xB0, FIL_, + 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB4, FIL_, + 0x02, 0xCC, 0x88, FIL_, 0xCF, 0x94, FIL_, 0xCC, + 0x81, FIL_, 0xCF, 0x93, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD0, 0x87, FIL_, 0x02, 0xCC, 0x88, FIL_, + 0xD3, 0x92, FIL_, 0xCC, 0x86, FIL_, 0xD3, 0x90, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x83, FIL_, + 0x03, 0xCC, 0x88, FIL_, 0xD0, 0x81, FIL_, 0xCC, + 0x80, FIL_, 0xD0, 0x80, FIL_, 0xCC, 0x86, FIL_, + 0xD3, 0x96, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3, + 0x81, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9C, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9E, FIL_, 0x04, + 0xCC, 0x84, FIL_, 0xD3, 0xA2, FIL_, 0xCC, 0x88, + FIL_, 0xD3, 0xA4, FIL_, 0xCC, 0x86, 
FIL_, 0xD0, + 0x99, FIL_, 0xCC, 0x80, FIL_, 0xD0, 0x8D, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD0, 0x8C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA6, FIL_, 0x04, 0xCC, + 0x8B, FIL_, 0xD3, 0xB2, FIL_, 0xCC, 0x88, FIL_, + 0xD3, 0xB0, FIL_, 0xCC, 0x86, FIL_, 0xD0, 0x8E, + FIL_, 0xCC, 0x84, FIL_, 0xD3, 0xAE, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB4, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB8, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAC, FIL_, 0x02, 0xCC, 0x86, FIL_, + 0xD3, 0x91, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x93, + FIL_, 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x93, FIL_, + 0x03, 0xCC, 0x80, FIL_, 0xD1, 0x90, FIL_, 0xCC, + 0x86, FIL_, 0xD3, 0x97, FIL_, 0xCC, 0x88, FIL_, + 0xD1, 0x91, FIL_, 0x02, 0xCC, 0x86, FIL_, 0xD3, + 0x82, FIL_, 0xCC, 0x88, FIL_, 0xD3, 0x9D, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9F, FIL_, 0x04, + 0xCC, 0x86, FIL_, 0xD0, 0xB9, FIL_, 0xCC, 0x88, + FIL_, 0xD3, 0xA5, FIL_, 0xCC, 0x84, FIL_, 0xD3, + 0xA3, FIL_, 0xCC, 0x80, FIL_, 0xD1, 0x9D, FIL_, + 0x01, 0xCC, 0x81, FIL_, 0xD1, 0x9C, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xA7, FIL_, 0x04, 0xCC, + 0x8B, FIL_, 0xD3, 0xB3, FIL_, 0xCC, 0x84, FIL_, + 0xD3, 0xAF, FIL_, 0xCC, 0x86, FIL_, 0xD1, 0x9E, + FIL_, 0xCC, 0x88, FIL_, 0xD3, 0xB1, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xB5, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xB9, FIL_, 0x01, 0xCC, 0x88, + FIL_, 0xD3, 0xAD, FIL_, 0x01, 0xCC, 0x88, FIL_, + 0xD1, 0x97, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, + 0xB6, FIL_, 0x01, 0xCC, 0x8F, FIL_, 0xD1, 0xB7, + FIL_, 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9A, FIL_, + 0x01, 0xCC, 0x88, FIL_, 0xD3, 0x9B, FIL_, 0x01, + 0xCC, 0x88, FIL_, 0xD3, 0xAA, FIL_, 0x01, 0xCC, + 0x88, FIL_, 0xD3, 0xAB, FIL_, 0x03, 0xD9, 0x94, + FIL_, 0xD8, 0xA3, FIL_, 0xD9, 0x95, FIL_, 0xD8, + 0xA5, FIL_, 0xD9, 0x93, FIL_, 0xD8, 0xA2, FIL_, + 0x01, 0xD9, 0x94, FIL_, 0xD8, 0xA4, FIL_, 0x01, + 0xD9, 0x94, FIL_, 0xD8, 0xA6, FIL_, 0x01, 0xD9, + 0x94, FIL_, 0xDB, 0x82, FIL_, 0x01, 0xD9, 0x94, + FIL_, 0xDB, 0x93, FIL_, 0x01, 0xD9, 0x94, FIL_, + 0xDB, 0x80, FIL_, 0x01, 0xE0, 0xA4, 0xBC, FIL_, + 0xE0, 0xA4, 0xA9, FIL_, 0x01, 0xE0, 0xA4, 0xBC, + FIL_, 0xE0, 0xA4, 0xB1, FIL_, 0x01, 0xE0, 0xA4, + 0xBC, FIL_, 0xE0, 0xA4, 0xB4, FIL_, 0x02, 0xE0, + 0xA6, 0xBE, FIL_, 0xE0, 0xA7, 0x8B, FIL_, 0xE0, + 0xA7, 0x97, FIL_, 0xE0, 0xA7, 0x8C, FIL_, 0x03, + 0xE0, 0xAD, 0x96, FIL_, 0xE0, 0xAD, 0x88, FIL_, + 0xE0, 0xAC, 0xBE, FIL_, 0xE0, 0xAD, 0x8B, FIL_, + 0xE0, 0xAD, 0x97, FIL_, 0xE0, 0xAD, 0x8C, FIL_, + 0x01, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAE, 0x94, + FIL_, 0x02, 0xE0, 0xAF, 0x97, FIL_, 0xE0, 0xAF, + 0x8C, FIL_, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, 0xAF, + 0x8A, FIL_, 0x01, 0xE0, 0xAE, 0xBE, FIL_, 0xE0, + 0xAF, 0x8B, FIL_, 0x01, 0xE0, 0xB1, 0x96, FIL_, + 0xE0, 0xB1, 0x88, FIL_, 0x01, 0xE0, 0xB3, 0x95, + FIL_, 0xE0, 0xB3, 0x80, FIL_, 0x03, 0xE0, 0xB3, + 0x82, FIL_, 0xE0, 0xB3, 0x8A, FIL_, 0xE0, 0xB3, + 0x96, FIL_, 0xE0, 0xB3, 0x88, FIL_, 0xE0, 0xB3, + 0x95, FIL_, 0xE0, 0xB3, 0x87, FIL_, 0x01, 0xE0, + 0xB3, 0x95, FIL_, 0xE0, 0xB3, 0x8B, FIL_, 0x02, + 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8A, FIL_, + 0xE0, 0xB5, 0x97, FIL_, 0xE0, 0xB5, 0x8C, FIL_, + 0x01, 0xE0, 0xB4, 0xBE, FIL_, 0xE0, 0xB5, 0x8B, + FIL_, 0x03, 0xE0, 0xB7, 0x9F, FIL_, 0xE0, 0xB7, + 0x9E, FIL_, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, 0xB7, + 0x9A, FIL_, 0xE0, 0xB7, 0x8F, FIL_, 0xE0, 0xB7, + 0x9C, FIL_, 0x01, 0xE0, 0xB7, 0x8A, FIL_, 0xE0, + 0xB7, 0x9D, FIL_, 0x01, 0xE1, 0x80, 0xAE, FIL_, + 0xE1, 0x80, 0xA6, FIL_, 0x01, 0xE1, 0xAC, 0xB5, + FIL_, 0xE1, 0xAC, 0x86, FIL_, 0x01, 0xE1, 0xAC, + 0xB5, FIL_, 0xE1, 0xAC, 0x88, FIL_, 0x01, 0xE1, + 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8A, 
FIL_, 0x01, + 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8C, FIL_, + 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, 0x8E, + FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, 0xAC, + 0x92, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, 0xE1, + 0xAC, 0xBB, FIL_, 0x01, 0xE1, 0xAC, 0xB5, FIL_, + 0xE1, 0xAC, 0xBD, FIL_, 0x01, 0xE1, 0xAC, 0xB5, + FIL_, 0xE1, 0xAD, 0x80, FIL_, 0x01, 0xE1, 0xAC, + 0xB5, FIL_, 0xE1, 0xAD, 0x81, FIL_, 0x01, 0xE1, + 0xAC, 0xB5, FIL_, 0xE1, 0xAD, 0x83, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB8, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB8, 0xB9, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9C, FIL_, 0x01, + 0xCC, 0x84, FIL_, 0xE1, 0xB9, 0x9D, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA8, FIL_, 0x01, + 0xCC, 0x87, FIL_, 0xE1, 0xB9, 0xA9, FIL_, 0x02, + 0xCC, 0x86, FIL_, 0xE1, 0xBA, 0xB6, FIL_, 0xCC, + 0x82, FIL_, 0xE1, 0xBA, 0xAC, FIL_, 0x02, 0xCC, + 0x82, FIL_, 0xE1, 0xBA, 0xAD, FIL_, 0xCC, 0x86, + FIL_, 0xE1, 0xBA, 0xB7, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x86, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x87, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x98, FIL_, 0x01, 0xCC, 0x82, + FIL_, 0xE1, 0xBB, 0x99, FIL_, 0x04, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0x80, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0x86, FIL_, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0x82, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0x84, FIL_, 0x04, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0x87, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x85, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x83, FIL_, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x81, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x82, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x83, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x84, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x85, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x86, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x87, FIL_, 0x04, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x8C, FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x8A, FIL_, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0x88, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0x8E, FIL_, 0x04, 0xCC, 0x80, FIL_, + 0xE1, 0xBC, 0x8B, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0x8F, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0x8D, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x89, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8A, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8B, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8C, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8D, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8E, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x8F, + FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x92, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x94, FIL_, + 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x93, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0x95, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0x9A, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBC, 0x9C, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0x9B, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0x9D, FIL_, 0x04, 0xCC, 0x80, + FIL_, 0xE1, 0xBC, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBC, 0xA4, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0xA6, FIL_, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0x90, FIL_, 0x04, 0xCD, 0x85, FIL_, 0xE1, 0xBE, + 0x91, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xA5, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xA7, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xA3, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x92, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x93, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x94, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x95, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x96, FIL_, 0x01, + 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x97, FIL_, 0x04, + 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xAC, 
FIL_, 0xCC, + 0x80, FIL_, 0xE1, 0xBC, 0xAA, FIL_, 0xCD, 0x85, + FIL_, 0xE1, 0xBE, 0x98, FIL_, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0xAE, FIL_, 0x04, 0xCD, 0x82, FIL_, + 0xE1, 0xBC, 0xAF, FIL_, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0x99, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, + 0xAD, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xAB, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9A, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9B, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9C, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9D, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9E, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0x9F, + FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xB4, + FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB2, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBC, 0xB6, FIL_, 0x03, + 0xCC, 0x80, FIL_, 0xE1, 0xBC, 0xB3, FIL_, 0xCD, + 0x82, FIL_, 0xE1, 0xBC, 0xB7, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0xB5, FIL_, 0x03, 0xCC, 0x81, + FIL_, 0xE1, 0xBC, 0xBC, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBC, 0xBA, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBC, 0xBE, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, + 0xBC, 0xBB, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBC, + 0xBF, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBC, 0xBD, + FIL_, 0x02, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x82, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x84, FIL_, + 0x02, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x85, FIL_, + 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x83, FIL_, 0x02, + 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x8A, FIL_, 0xCC, + 0x81, FIL_, 0xE1, 0xBD, 0x8C, FIL_, 0x02, 0xCC, + 0x80, FIL_, 0xE1, 0xBD, 0x8B, FIL_, 0xCC, 0x81, + FIL_, 0xE1, 0xBD, 0x8D, FIL_, 0x03, 0xCD, 0x82, + FIL_, 0xE1, 0xBD, 0x96, FIL_, 0xCC, 0x80, FIL_, + 0xE1, 0xBD, 0x92, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0x94, FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, + 0xBD, 0x93, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, + 0x97, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x95, + FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0x9B, + FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0x9F, FIL_, + 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0x9D, FIL_, 0x04, + 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xA6, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA0, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0xA2, FIL_, 0xCC, 0x81, FIL_, + 0xE1, 0xBD, 0xA4, FIL_, 0x04, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0xA1, FIL_, 0xCD, 0x82, FIL_, 0xE1, + 0xBD, 0xA7, FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, + 0xA5, FIL_, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xA3, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA2, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA3, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA4, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA5, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA6, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBE, 0xA7, + FIL_, 0x04, 0xCC, 0x80, FIL_, 0xE1, 0xBD, 0xAA, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBD, 0xAC, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBD, 0xAE, FIL_, 0xCD, + 0x85, FIL_, 0xE1, 0xBE, 0xA8, FIL_, 0x04, 0xCD, + 0x82, FIL_, 0xE1, 0xBD, 0xAF, FIL_, 0xCC, 0x80, + FIL_, 0xE1, 0xBD, 0xAB, FIL_, 0xCD, 0x85, FIL_, + 0xE1, 0xBE, 0xA9, FIL_, 0xCC, 0x81, FIL_, 0xE1, + 0xBD, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAA, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAB, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAC, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAD, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAE, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xAF, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBF, 0x82, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBF, 0xB2, FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, + 0xBE, 0xB7, FIL_, 0x03, 0xCC, 0x81, FIL_, 0xE1, + 0xBF, 0x8E, FIL_, 0xCC, 0x80, FIL_, 
0xE1, 0xBF, + 0x8D, FIL_, 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x8F, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0x87, + FIL_, 0x01, 0xCD, 0x85, FIL_, 0xE1, 0xBF, 0xB7, + FIL_, 0x03, 0xCC, 0x80, FIL_, 0xE1, 0xBF, 0x9D, + FIL_, 0xCC, 0x81, FIL_, 0xE1, 0xBF, 0x9E, FIL_, + 0xCD, 0x82, FIL_, 0xE1, 0xBF, 0x9F, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9A, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0x9B, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x86, 0xAE, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8D, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8F, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x87, 0x8E, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x84, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x89, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0x8C, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA4, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x88, 0xA6, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x81, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x84, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x87, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0x89, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xAD, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xA2, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB0, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB1, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB4, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB5, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB8, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x89, 0xB9, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x80, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x81, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA0, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA1, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x84, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x85, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x88, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0x89, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA2, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xA3, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAC, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAD, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAE, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8A, 0xAF, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAA, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAB, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAC, FIL_, 0x01, + 0xCC, 0xB8, FIL_, 0xE2, 0x8B, 0xAD, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0x94, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x8C, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0x8E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0x90, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0x92, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0x94, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0x96, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0x98, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9A, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0x9C, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0x9E, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0xA0, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0xA2, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x81, 0xA5, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x81, 0xA7, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x81, 0xA9, FIL_, 0x02, + 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB1, FIL_, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB0, FIL_, + 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, 0xB3, + FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x81, 0xB4, + FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x81, + 0xB6, FIL_, 0xE3, 0x82, 0x9A, FIL_, 
0xE3, 0x81, + 0xB7, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, + 0x81, 0xBA, FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x81, 0xB9, FIL_, 0x02, 0xE3, 0x82, 0x9A, FIL_, + 0xE3, 0x81, 0xBD, FIL_, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x81, 0xBC, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0x9E, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0xB4, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAC, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xAE, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xB0, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, + 0xB2, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x82, 0xB4, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x82, 0xB6, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x82, 0xB8, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x82, 0xBA, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBC, FIL_, 0x01, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x82, 0xBE, FIL_, + 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x80, + FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0x82, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0x85, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0x87, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x83, 0x89, FIL_, 0x02, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0x90, FIL_, 0xE3, 0x82, + 0x9A, FIL_, 0xE3, 0x83, 0x91, FIL_, 0x02, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0x93, FIL_, 0xE3, + 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x94, FIL_, 0x02, + 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x97, FIL_, + 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x96, FIL_, + 0x02, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, 0x9A, + FIL_, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, 0x99, + FIL_, 0x02, 0xE3, 0x82, 0x99, FIL_, 0xE3, 0x83, + 0x9C, FIL_, 0xE3, 0x82, 0x9A, FIL_, 0xE3, 0x83, + 0x9D, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, 0xE3, + 0x83, 0xB7, FIL_, 0x01, 0xE3, 0x82, 0x99, FIL_, + 0xE3, 0x83, 0xB8, FIL_, 0x01, 0xE3, 0x82, 0x99, + FIL_, 0xE3, 0x83, 0xB9, FIL_, 0x01, 0xE3, 0x82, + 0x99, FIL_, 0xE3, 0x83, 0xBA, FIL_, 0x01, 0xE3, + 0x82, 0x99, FIL_, 0xE3, 0x83, 0xBE, FIL_, + }, +}; + +static const uchar_t u8_decomp_b2_tbl[2][2][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 5, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, 
N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 6, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 7, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 1, 2, 3, 4, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 5, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, 6, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 7, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const u8_displacement_t u8_decomp_b3_tbl[2][8][256] = { + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 35 }, { 2, 247 }, { 3, 474 }, + { 4, 693 }, { 5, 709 }, { 6, 951 }, + { N_, 0 }, { 7, 1139 }, { 8, 1152 }, + { N_, 0 }, { 9, 1177 }, { 10, 1199 }, + { 11, 1295 }, { 12, 1360 }, { 13, 1405 }, + { N_, 0 }, { 14, 1450 }, { N_, 0 }, + { N_, 0 }, { 15, 1620 }, { N_, 0 }, + { 16, 1624 }, { 17, 1649 }, { N_, 0 }, + { 18, 1665 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 19, 1680 }, + { 20, 1701 }, { N_, 0 }, { 21, 1757 }, + { 22, 1792 }, { 23, 1806 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 1834 }, + { 25, 1869 }, { 26, 1876 }, { N_, 0 }, + { 27, 1897 }, { N_, 0 }, { 28, 1904 }, + { N_, 0 }, { 29, 1942 }, { N_, 0 }, + { 30, 1963 }, { 31, 1994 }, { N_, 0 }, + { 32, 2000 }, { 33, 2006 }, { 34, 2018 }, + { 35, 2021 }, { 36, 2109 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 37, 2158 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 0x8000, 2165 }, { 0x8001, 2445 }, + { 0x8002, 2741 }, { 0x8003, 3029 }, { 0x8004, 3337 }, + { 0x8005, 3725 }, { 0x8006, 4053 }, { 0x8007, 4536 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 38, 4895 }, + { 39, 4964 }, { 40, 4999 }, { N_, 0 }, + { 41, 5018 }, { 42, 5098 }, { 43, 5230 }, + { 44, 5248 }, { 45, 5266 }, { 46, 5326 }, + { 47, 5410 }, { 48, 5470 }, { 49, 5518 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 50, 5526 }, { 51, 5596 }, + { 52, 5767 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 53, 5810 }, { 54, 5822 }, { N_, 0 }, + { 55, 5830 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 56, 5836 }, { 57, 5839 }, { 58, 5842 }, + { 59, 6034 }, { 60, 6226 }, { 61, 6418 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 62, 6484 }, + { 63, 6497 }, { 64, 6672 }, { 65, 6770 }, + { 66, 6923 }, { 67, 6968 }, { 68, 7160 }, + { N_, 0 }, { 0x8008, 7247 }, { 69, 7597 }, + { 70, 7773 }, { 71, 7950 }, { 0x8009, 8142 }, + { 0x800A, 8919 }, { 72, 9351 }, { 73, 9522 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 5. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x800B, 9743 }, + { 0x800C, 9999 }, { 0x800D, 10255 }, { 0x800E, 10511 }, + { 74, 10767 }, { 75, 10967 }, { N_, 0 }, + { N_, 0 }, { 76, 11139 }, { 77, 11303 }, + { 78, 11468 }, { 79, 11576 }, { 0x800F, 11740 }, + { 0x8010, 12006 }, { 0x8011, 12280 }, { 0x8012, 12546 }, + { 80, 12812 }, { 0x8013, 13060 }, { 0x8014, 13348 }, + { 81, 13720 }, { 82, 13898 }, { 83, 13933 }, + { 84, 14045 }, { 85, 14197 }, { 86, 14347 }, + { 87, 14410 }, { 88, 14540 }, { 89, 14729 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 6. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 90, 14829 }, { 91, 14912 }, + { 92, 14969 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 93, 14982 }, { 94, 15046 }, { 95, 15109 }, + { 96, 15163 }, { 97, 15225 }, { 98, 15282 }, + { 99, 15341 }, { 100, 15405 }, { 101, 15469 }, + { 102, 15533 }, { 103, 15597 }, { 104, 15681 }, + { 105, 15812 }, { 106, 15942 }, { 107, 16072 }, + { 108, 16202 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 7. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 0x8015, 16273 }, { 0x8016, 16536 }, + { 0x8017, 16799 }, { 0x8018, 17064 }, { 0x8019, 17329 }, + { 0x801A, 17601 }, { 0x801B, 17878 }, { 0x801C, 18147 }, + { 109, 18419 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 35 }, { 2, 247 }, { 3, 474 }, + { 4, 693 }, { 5, 709 }, { 6, 951 }, + { N_, 0 }, { 7, 1139 }, { 8, 1152 }, + { N_, 0 }, { 9, 1177 }, { 10, 1199 }, + { 11, 1295 }, { 12, 1362 }, { 13, 1407 }, + { N_, 0 }, { 14, 1452 }, { N_, 0 }, + { N_, 0 }, { 15, 1622 }, { N_, 0 }, + { 16, 1626 }, { 17, 1651 }, { N_, 0 }, + { 18, 1667 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 19, 1682 }, + { 20, 1703 }, { N_, 0 }, { 21, 1759 }, + { 22, 1794 }, { 23, 1808 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 1836 }, + { 25, 1871 }, { 26, 1878 }, { N_, 0 }, + { 27, 1899 }, { N_, 0 }, { 28, 1906 }, + { N_, 0 }, { 29, 1944 }, { N_, 0 }, + { 30, 1965 }, { 31, 1996 }, { N_, 0 }, + { 32, 2002 }, { 33, 2008 }, { 34, 2020 }, + { 35, 2023 }, { 36, 2111 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 37, 2160 }, + { N_, 0 }, { N_, 0 }, { 38, 2167 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 39, 2170 }, { 40, 2226 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 41, 2247 }, { 42, 2268 }, { 43, 2340 }, + { N_, 0 }, { 0x8000, 2414 }, { 0x8001, 2694 }, + { 0x8002, 2990 }, { 0x8003, 3278 }, { 0x8004, 3586 }, + { 0x8005, 3974 }, { 0x8006, 4302 }, { 0x8007, 4785 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 44, 5144 }, + { 45, 5213 }, { 46, 5248 }, { N_, 0 }, + { 47, 5273 }, { 48, 5358 }, { 49, 5490 }, + { 50, 5508 }, { 51, 5526 }, { 52, 5586 }, + { 53, 5670 }, { 54, 5730 }, { 55, 5778 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 56, 5786 }, { 57, 5856 }, + { 58, 6027 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 59, 6070 }, { 60, 6082 }, { N_, 0 }, + { 61, 6090 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 62, 6096 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 63, 6099 }, { 64, 6102 }, { 65, 6105 }, + { 66, 6297 }, { 67, 6489 }, { 68, 6681 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 69, 6747 }, + { 70, 6760 }, { 71, 6935 }, { 72, 7033 }, + { 73, 7186 }, { 74, 7231 }, { 75, 7423 }, + { N_, 0 }, { 0x8008, 7510 }, { 76, 7891 }, + { 77, 8103 }, { 78, 8280 }, { 0x8009, 8482 }, + { 0x800A, 9259 }, { 79, 9701 }, { 80, 9872 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 5. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0x800B, 10106 }, + { 0x800C, 10362 }, { 0x800D, 10618 }, { 0x800E, 10874 }, + { 81, 11130 }, { 82, 11330 }, { 0x800F, 11566 }, + { 83, 11822 }, { 84, 11932 }, { 85, 12096 }, + { 86, 12261 }, { 87, 12369 }, { 0x8010, 12533 }, + { 0x8011, 12799 }, { 0x8012, 13073 }, { 0x8013, 13339 }, + { 88, 13605 }, { 0x8014, 13853 }, { 0x8015, 14141 }, + { 89, 14513 }, { 90, 14691 }, { 91, 14746 }, + { 92, 14860 }, { 93, 15012 }, { 94, 15162 }, + { 95, 15225 }, { 96, 15355 }, { 97, 15544 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 6. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 98, 15644 }, { 99, 15727 }, + { 100, 15784 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 101, 15797 }, { 102, 15861 }, { 103, 15924 }, + { 104, 15978 }, { 105, 16041 }, { 106, 16098 }, + { 107, 16157 }, { 108, 16221 }, { 109, 16285 }, + { 110, 16349 }, { 111, 16413 }, { 112, 16501 }, + { 113, 16632 }, { 114, 16762 }, { 115, 16892 }, + { 116, 17022 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + { /* Third byte table 7. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 0x8016, 17097 }, { 0x8017, 17360 }, + { 0x8018, 17623 }, { 0x8019, 17888 }, { 0x801A, 18153 }, + { 0x801B, 18425 }, { 0x801C, 18702 }, { 0x801D, 18971 }, + { 117, 19243 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_decomp_b4_tbl[2][118][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 4, 5, 5, 5, 5, 5, + 8, 8, 8, 9, 10, 13, 15, 15, + 15, 18, 19, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 24, + 28, 32, 36, 40, 44, 48, 52, 56, + 60, 60, 64, 68, 72, 76, 80, 84, + 84, 84, 88, 92, 96, 100, 104, 104, + 104, 108, 112, 116, 120, 124, 128, 128, + 132, 136, 140, 144, 148, 152, 156, 160, + 164, 164, 168, 172, 176, 180, 184, 188, + 188, 188, 192, 196, 200, 204, 208, 208, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 64, 64, 68, 72, 76, 80, 84, + 88, 92, 96, 100, 104, 108, 112, 116, + 120, 124, 128, 132, 136, 140, 144, 144, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 180, 182, 184, 188, 192, 196, + 200, 200, 204, 208, 212, 216, 220, 224, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 7, 11, 15, 19, + 23, 27, 30, 30, 30, 34, 38, 42, + 46, 50, 54, 54, 54, 58, 62, 66, + 70, 74, 78, 82, 86, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 126, 126, + 126, 130, 134, 138, 142, 146, 150, 154, + 158, 162, 166, 170, 174, 178, 182, 186, + 190, 194, 198, 202, 206, 210, 214, 218, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 8, 12, + 14, 16, 18, 20, 22, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 62, 68, + 74, 80, 86, 92, 98, 104, 104, 110, + 116, 122, 128, 133, 138, 138, 138, 142, + 146, 150, 154, 158, 162, 168, 174, 179, + 184, 188, 190, 192, 194, 198, 202, 202, + 202, 206, 210, 216, 222, 227, 232, 237, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 112, 112, 116, + 120, 120, 120, 120, 120, 120, 120, 124, + 128, 132, 136, 142, 148, 154, 160, 164, + 168, 174, 180, 184, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 3, 4, 5, 7, 9, 11, + 12, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 20, 21, 22, 23, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 6, 9, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 17, 17, 17, + 17, 17, 17, 20, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 14, 19, + 22, 27, 32, 37, 37, 42, 42, 47, + 52, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 64, 69, 74, 79, 84, + 89, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 27, 29, 31, 41, 51, 53, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 57, 59, 61, 61, 63, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, + 65, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 40, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 15, 20, 25, 30, 30, 30, 35, + 40, 40, 40, 45, 50, 55, 60, 65, + 70, 70, 70, 75, 80, 85, 90, 95, + 100, 100, 100, 105, 110, 115, 120, 125, + 130, 135, 140, 145, 150, 155, 160, 160, + 160, 165, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 14, 14, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 21, 28, 35, 42, 49, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 21, 28, 28, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 14, 21, 21, 21, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 28, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 14, 21, 21, 28, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 14, 24, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 14, 14, + 14, 14, 14, 21, 21, 21, 21, 21, + 28, 28, 28, 28, 28, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 49, 49, 56, 63, + 72, 79, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 36. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 21, 21, + 21, 21, 21, 28, 28, 28, 28, 28, + 35, 35, 35, 35, 35, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, + }, + { /* Fourth byte table 37. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 21, 21, 21, 21, + 21, 21, 24, 24, 24, 24, 24, 24, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 28, 30, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 40, 49, 49, 55, + 64, 64, 64, 64, 64, 66, 66, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, + }, + { /* Fourth byte table 39. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 2, 4, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 18, 18, 18, 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 20, 21, 21, 21, 22, 23, 24, + 25, 26, 27, 28, 31, 32, 33, 34, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 40. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 16, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 17, 17, 17, 17, 17, 17, 17, + 17, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, + }, + { /* Fourth byte table 41. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 7, 10, 10, 13, 16, + 18, 18, 21, 22, 23, 24, 25, 26, + 28, 29, 30, 31, 32, 32, 33, 35, + 35, 35, 36, 37, 38, 39, 40, 40, + 40, 42, 45, 47, 47, 48, 48, 51, + 51, 52, 52, 54, 58, 59, 60, 60, + 61, 62, 63, 63, 64, 65, 67, 69, + 71, 73, 74, 74, 74, 74, 76, 78, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, 80, 80, 80, 80, 80, 80, 80, + 80, + }, + { /* Fourth byte table 42. 
[Auto-generated data: "fourth byte" lookup tables 42 through 112 of the imported Unicode normalization offset arrays; several thousand added lines of numeric array initializers, reproduced in full by the vendor import.]
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 113. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 114. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 115. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 116. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 117. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 4, 4, 5, 5, 5, 5, 5, + 8, 8, 8, 9, 10, 13, 15, 15, + 15, 18, 19, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 24, + 28, 32, 36, 40, 44, 48, 52, 56, + 60, 60, 64, 68, 72, 76, 80, 84, + 84, 84, 88, 92, 96, 100, 104, 104, + 104, 108, 112, 116, 120, 124, 128, 128, + 132, 136, 140, 144, 148, 152, 156, 160, + 164, 164, 168, 172, 176, 180, 184, 188, + 188, 188, 192, 196, 200, 204, 208, 208, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 64, 64, 68, 72, 76, 80, 84, + 88, 92, 96, 100, 104, 108, 112, 116, + 120, 124, 128, 132, 136, 140, 144, 144, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 180, 182, 184, 188, 192, 196, + 200, 200, 204, 208, 212, 216, 220, 224, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, 227, 227, 227, 227, 227, 227, 227, + 227, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 7, 11, 15, 19, + 23, 27, 30, 30, 30, 34, 38, 42, + 46, 50, 54, 54, 54, 58, 62, 66, + 70, 74, 78, 82, 86, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 126, 126, + 126, 130, 134, 138, 142, 146, 150, 154, + 158, 162, 166, 170, 174, 178, 182, 186, + 190, 194, 198, 202, 206, 210, 214, 218, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, 219, 219, 219, 219, 219, 219, 219, + 219, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 4, 8, 12, + 14, 16, 18, 20, 22, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 62, 68, + 74, 80, 86, 92, 98, 104, 104, 110, + 116, 122, 128, 133, 138, 138, 138, 142, + 146, 150, 154, 158, 162, 168, 174, 179, + 184, 188, 190, 192, 194, 198, 202, 202, + 202, 206, 210, 216, 222, 227, 232, 237, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, 242, 242, 242, 242, 242, 242, 242, + 242, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 112, 112, 116, + 120, 120, 120, 120, 120, 120, 120, 124, + 128, 132, 136, 142, 148, 154, 160, 164, + 168, 174, 180, 184, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, 188, 188, 188, 188, 188, 188, 188, + 188, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 3, 4, 5, 7, 9, 11, + 12, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 20, 21, 22, 23, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 6, 9, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 17, 17, 17, + 17, 17, 17, 20, 20, 20, 20, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 14, 19, + 22, 27, 32, 37, 37, 42, 42, 47, + 52, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 64, 69, 74, 79, 84, + 89, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 27, 29, 31, 41, 51, 53, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 57, 59, 61, 61, 63, 65, 65, + 65, 65, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, 67, 67, 67, 67, 67, 67, 67, + 67, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 40, 40, 40, 40, 40, 40, + 40, 40, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 15, 15, 15, + 20, 20, 20, 20, 20, 25, 30, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 40, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 15, 20, 25, 30, 30, 30, 35, + 40, 40, 40, 45, 50, 55, 60, 65, + 70, 70, 70, 75, 80, 85, 90, 95, + 100, 100, 100, 105, 110, 115, 120, 125, + 130, 135, 140, 145, 150, 155, 160, 160, + 160, 165, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, 170, 170, 170, 170, 170, 170, 170, + 170, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, + 4, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 5, 10, 15, 20, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 4, 8, + 12, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 5, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 10, 10, 10, 10, + 10, 10, 10, 10, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 14, 14, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 21, 28, 35, 42, 49, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 21, 28, 28, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 14, 21, 21, 21, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 28, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 7, 7, 7, 7, 7, + 14, 21, 21, 28, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 14, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 7, 7, 14, 24, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, 31, 31, 31, 31, 31, 31, 31, + 31, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 14, 14, + 14, 14, 14, 21, 21, 21, 21, 21, + 28, 28, 28, 28, 28, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 49, 49, 56, 63, + 72, 79, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 21, 21, + 21, 21, 21, 28, 28, 28, 28, 28, + 35, 35, 35, 35, 35, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, 49, 49, 49, 49, 49, 49, 49, + 49, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, + 7, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 39. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 7, + 7, 14, 14, 21, 21, 28, 28, 35, + 35, 35, 35, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 49, 49, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 40. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 14, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 41. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 1, 3, 4, + 4, 5, 6, 8, 9, 10, 11, 12, + 13, 14, 15, 16, 16, 17, 19, 20, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 42. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 6, 8, 11, + 12, 13, 14, 16, 18, 20, 21, 21, + 22, 23, 25, 26, 28, 31, 34, 35, + 36, 37, 40, 42, 43, 46, 48, 50, + 52, 54, 56, 57, 58, 59, 60, 62, + 64, 66, 68, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, + }, + { /* Fourth byte table 43. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 3, 5, 7, + 9, 10, 12, 14, 16, 18, 20, 22, + 25, 27, 29, 32, 34, 36, 38, 40, + 42, 44, 46, 48, 50, 52, 54, 56, + 58, 61, 63, 65, 66, 68, 70, 72, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, + }, + { /* Fourth byte table 44. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 21, 21, 21, 21, + 21, 21, 24, 24, 24, 24, 24, 24, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 28, 30, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 34, 34, 34, 34, 40, 49, 49, 55, + 64, 64, 64, 64, 64, 66, 66, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, 69, 69, 69, 69, 69, 69, 69, + 69, + }, + { /* Fourth byte table 45. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 2, 4, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 18, 18, 18, 18, 18, 18, 18, 18, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 19, 19, + 19, 20, 21, 21, 21, 22, 23, 24, + 25, 26, 27, 28, 31, 32, 33, 34, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, 35, 35, 35, 35, 35, 35, 35, + 35, + }, + { /* Fourth byte table 46. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 14, 15, 16, 17, + 17, 18, 19, 20, 21, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, + 23, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, + 25, + }, + { /* Fourth byte table 47. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 7, 10, 10, 13, 16, + 18, 18, 21, 22, 23, 24, 25, 26, + 28, 29, 30, 31, 32, 32, 33, 35, + 35, 35, 36, 37, 38, 39, 40, 40, + 40, 42, 45, 47, 47, 48, 48, 51, + 51, 52, 52, 54, 58, 59, 60, 60, + 61, 62, 63, 63, 64, 65, 67, 69, + 71, 73, 74, 74, 77, 79, 81, 83, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, 85, 85, 85, 85, 85, 85, 85, + 85, + }, + { /* Fourth byte table 48. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 3, 3, 3, 4, 5, + 6, 7, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 13, 18, 23, 28, + 33, 38, 43, 48, 53, 58, 63, 68, + 72, 73, 75, 78, 80, 81, 83, 86, + 90, 92, 93, 95, 98, 99, 100, 101, + 102, 103, 105, 108, 110, 111, 113, 116, + 120, 122, 123, 125, 128, 129, 130, 131, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, 132, 132, 132, 132, 132, 132, 132, + 132, + }, + { /* Fourth byte table 49. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 6, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 50. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 6, 12, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 51. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 6, + 6, 6, 12, 12, 12, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 24, 24, 30, + 30, 30, 30, 30, 30, 36, 45, 45, + 51, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 52. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 6, 6, 6, 12, 12, 12, + 18, 18, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 28, 28, 34, 34, 34, 34, 34, + 34, 34, 34, 34, 34, 34, 40, 44, + 48, 54, 60, 60, 60, 66, 72, 72, + 72, 78, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 53. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 12, 12, 18, 24, 24, + 24, 30, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 36, 36, 36, + 36, 36, 36, 36, 36, 42, 48, 54, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 54. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 24, 24, 24, + 24, 24, 24, 30, 36, 42, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 55. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 4, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 56. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 11, 13, 15, 17, 19, 21, + 23, 25, 27, 29, 31, 34, 37, 40, + 43, 46, 49, 52, 55, 58, 62, 66, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, + }, + { /* Fourth byte table 57. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 53, 56, 59, 62, 65, 68, + 71, 74, 77, 80, 83, 86, 89, 92, + 95, 98, 101, 104, 107, 110, 113, 116, + 119, 122, 125, 128, 131, 134, 137, 140, + 143, 146, 149, 152, 155, 158, 161, 162, + 163, 164, 165, 166, 167, 168, 169, 170, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, + }, + { /* Fourth byte table 58. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, 43, 43, 43, 43, 43, 43, 43, + 43, + }, + { /* Fourth byte table 59. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 60. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 5, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 61. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, 6, 6, 6, 6, 6, 6, 6, + 6, + }, + { /* Fourth byte table 62. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 63. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 64. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 65. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 66. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 67. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 68. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 69. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 4, + 4, 7, 10, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 70. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 14, + 14, 21, 21, 28, 28, 35, 35, 42, + 42, 49, 49, 56, 56, 63, 63, 70, + 70, 77, 77, 84, 84, 84, 91, 91, + 98, 98, 105, 105, 105, 105, 105, 105, + 105, 112, 119, 119, 126, 133, 133, 140, + 147, 147, 154, 161, 161, 168, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, 175, 175, 175, 175, 175, 175, 175, + 175, + }, + { /* Fourth byte table 71. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 7, 7, 7, + 7, 7, 7, 7, 11, 15, 15, 22, + 28, 28, 28, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 35, 35, 42, + 42, 49, 49, 56, 56, 63, 63, 70, + 70, 77, 77, 84, 84, 91, 91, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, 98, 98, 98, 98, 98, 98, 98, + 98, + }, + { /* Fourth byte table 72. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 7, 14, 14, 14, 21, 21, + 28, 28, 35, 35, 35, 35, 35, 35, + 35, 42, 49, 49, 56, 63, 63, 70, + 77, 77, 84, 91, 91, 98, 105, 105, + 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 105, 105, 105, 105, 105, 105, + 105, 105, 105, 105, 105, 112, 112, 112, + 119, 126, 133, 140, 140, 140, 140, 147, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, 153, 153, 153, 153, 153, 153, 153, + 153, + }, + { /* Fourth byte table 73. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, 45, 45, 45, 45, 45, 45, 45, + 45, + }, + { /* Fourth byte table 74. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, 192, 192, 192, 192, 192, 192, 192, + 192, + }, + { /* Fourth byte table 75. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 45, 45, 45, 48, 51, 54, 57, 60, + 63, 66, 69, 72, 75, 78, 81, 84, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 76. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 23, 25, 27, 29, 31, 33, 35, + 37, 39, 41, 43, 45, 47, 49, 51, + 53, 56, 59, 62, 65, 68, 71, 74, + 77, 80, 83, 86, 89, 92, 95, 101, + 107, 113, 119, 125, 131, 137, 143, 149, + 155, 161, 167, 173, 179, 194, 206, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, 212, 212, 212, 212, 212, 212, 212, + 212, + }, + { /* Fourth byte table 77. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 149, 151, 153, 155, 157, 159, + 161, 163, 165, 167, 169, 171, 173, 175, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, 177, 177, 177, 177, 177, 177, 177, + 177, + }, + { /* Fourth byte table 78. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 46, 51, 53, 56, 58, + 61, 64, 67, 70, 73, 76, 79, 82, + 85, 88, 91, 94, 97, 100, 103, 106, + 109, 112, 115, 118, 121, 124, 127, 130, + 133, 136, 139, 142, 145, 148, 151, 154, + 157, 160, 163, 166, 169, 172, 175, 178, + 181, 184, 187, 190, 193, 196, 199, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, 202, 202, 202, 202, 202, 202, 202, + 202, + }, + { /* Fourth byte table 79. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 7, 9, 11, 13, 15, + 17, 20, 24, 26, 28, 31, 34, 36, + 38, 40, 43, 46, 49, 52, 55, 57, + 59, 61, 63, 65, 68, 70, 72, 74, + 77, 80, 82, 85, 88, 91, 93, 96, + 101, 107, 109, 112, 115, 118, 121, 128, + 136, 138, 140, 143, 145, 147, 149, 152, + 154, 156, 158, 160, 162, 165, 167, 169, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, 171, 171, 171, 171, 171, 171, 171, + 171, + }, + { /* Fourth byte table 80. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 10, 12, 14, 16, 22, + 25, 27, 29, 31, 33, 35, 37, 39, + 41, 43, 45, 48, 50, 52, 55, 58, + 60, 64, 67, 69, 71, 73, 75, 80, + 85, 89, 93, 97, 101, 105, 109, 113, + 117, 121, 126, 131, 136, 141, 146, 151, + 156, 161, 166, 171, 176, 181, 186, 191, + 196, 201, 206, 211, 216, 221, 226, 231, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, 234, 234, 234, 234, 234, 234, 234, + 234, + }, + { /* Fourth byte table 81. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 56, + 56, 60, 60, 64, 64, 64, 68, 72, + 76, 80, 84, 88, 92, 96, 100, 104, + 104, 108, 108, 112, 112, 112, 116, 120, + 120, 120, 120, 124, 128, 132, 136, 136, + 136, 140, 144, 148, 152, 156, 160, 164, + 168, 172, 176, 180, 184, 188, 192, 196, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, 200, 200, 200, 200, 200, 200, 200, + 200, + }, + { /* Fourth byte table 82. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 172, 172, 172, 172, + 172, 176, 180, 184, 188, 192, 196, 200, + 204, 208, 212, 216, 220, 224, 228, 232, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, 236, 236, 236, 236, 236, 236, 236, + 236, + }, + { /* Fourth byte table 83. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 65, 70, 75, 79, 83, 87, 92, 97, + 102, 106, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, 110, 110, 110, 110, 110, 110, 110, + 110, + }, + { /* Fourth byte table 84. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 9, 12, 14, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 20, 24, 28, 32, + 36, 36, 36, 36, 36, 36, 41, 41, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 64, 65, 70, 75, 82, 89, 94, + 99, 104, 109, 114, 119, 124, 129, 134, + 134, 139, 144, 149, 154, 159, 159, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, + }, + { /* Fourth byte table 85. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 10, 15, 20, 20, 25, + 30, 35, 40, 45, 50, 55, 60, 65, + 69, 71, 73, 75, 77, 79, 81, 83, + 85, 87, 89, 91, 93, 95, 97, 99, + 101, 103, 105, 107, 109, 111, 113, 115, + 117, 119, 121, 123, 125, 127, 129, 131, + 133, 135, 137, 139, 141, 143, 145, 147, + 149, 151, 153, 155, 157, 159, 161, 163, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, 165, 165, 165, 165, 165, 165, 165, + 165, + }, + { /* Fourth byte table 86. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 76, 80, 82, + 84, 86, 88, 90, 92, 94, 96, 98, + 100, 104, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, 108, 108, 108, 108, 108, 108, 108, + 108, + }, + { /* Fourth byte table 87. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 4, 6, 8, + 10, 12, 14, 16, 18, 20, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 54, 60, 66, 72, 78, + 84, 90, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 158, 160, 162, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, 164, 164, 164, 164, 164, 164, 164, + 164, + }, + { /* Fourth byte table 88. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, 248, 248, 248, 248, 248, 248, 248, + 248, + }, + { /* Fourth byte table 89. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 54, 60, 68, 76, 84, 92, 100, + 108, 116, 122, 155, 170, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, 178, 178, 178, 178, 178, 178, 178, + 178, + }, + { /* Fourth byte table 90. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 4, 7, 8, 9, 10, 11, + 14, 17, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 22, 25, 28, 29, 30, 31, 32, + 33, 34, 37, 40, 43, 46, 49, 52, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, 55, 55, 55, 55, 55, 55, 55, + 55, + }, + { /* Fourth byte table 91. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 16, 17, 20, 23, 26, 29, 30, 31, + 32, 33, 36, 37, 37, 38, 39, 40, + 41, 44, 45, 46, 47, 48, 51, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 62, 63, 64, 65, 66, 66, 66, 66, + 66, 69, 73, 76, 76, 79, 79, 82, + 86, 89, 93, 96, 100, 103, 107, 110, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, + }, + { /* Fourth byte table 92. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 6, 10, 14, 18, 22, 26, + 30, 34, 38, 42, 46, 50, 52, 54, + 56, 58, 60, 62, 64, 66, 68, 70, + 72, 74, 76, 78, 80, 82, 84, 86, + 88, 90, 92, 94, 96, 98, 100, 102, + 104, 106, 108, 110, 112, 114, 116, 118, + 120, 122, 124, 126, 128, 130, 132, 134, + 136, 138, 140, 142, 144, 146, 148, 150, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, + }, + { /* Fourth byte table 93. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 98, 100, 102, 104, 106, 112, 118, + 124, 130, 136, 142, 146, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, 150, 150, 150, 150, 150, 150, 150, + 150, + }, + { /* Fourth byte table 94. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 3, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 95. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 34, 37, 40, 43, 46, 49, 52, 55, + 58, 61, 64, 67, 70, 73, 76, 79, + 82, 85, 88, 91, 94, 97, 100, 103, + 106, 109, 112, 115, 118, 121, 124, 127, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 96. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 144, 147, 150, 153, 156, 159, 162, 165, + 168, 171, 174, 177, 180, 183, 186, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, 189, 189, 189, 189, 189, 189, 189, + 189, + }, + { /* Fourth byte table 97. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 3, 6, 9, 12, 15, + 18, 18, 18, 21, 24, 27, 30, 33, + 36, 36, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 57, 60, 63, 63, 63, + 63, 65, 67, 69, 72, 74, 76, 79, + 79, 82, 85, 88, 91, 94, 97, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, 100, 100, 100, 100, 100, 100, 100, + 100, + }, + { /* Fourth byte table 98. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 9, + 18, 31, 44, 57, 70, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 99. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 9, 18, 31, 44, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 100. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, 13, 13, 13, 13, 13, 13, 13, + 13, + }, + { /* Fourth byte table 101. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 102. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 103. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 29, 30, + 31, 31, 31, 32, 32, 32, 33, 34, + 34, 34, 35, 36, 37, 38, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 50, 51, 51, 52, 53, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 104. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 4, 5, 6, + 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, + 23, 24, 25, 26, 27, 28, 29, 30, + 31, 32, 33, 34, 35, 36, 37, 38, + 39, 40, 41, 42, 43, 44, 45, 46, + 47, 48, 49, 50, 51, 52, 53, 54, + 55, 56, 57, 58, 59, 60, 61, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 105. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 6, + 7, 8, 9, 10, 10, 10, 11, 12, + 13, 14, 15, 16, 17, 18, 18, 19, + 20, 21, 22, 23, 24, 25, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 53, 54, 55, 56, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 106. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 5, 6, + 6, 6, 6, 7, 8, 9, 10, 11, + 12, 13, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, 59, 59, 59, 59, 59, 59, 59, + 59, + }, + { /* Fourth byte table 107. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 108. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 109. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 110. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 37, 38, 39, + 40, 41, 42, 43, 44, 45, 46, 47, + 48, 49, 50, 51, 52, 53, 54, 55, + 56, 57, 58, 59, 60, 61, 62, 63, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 111. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31, + 32, 33, 34, 35, 36, 38, 40, 40, + 40, 42, 44, 46, 48, 50, 52, 54, + 56, 58, 60, 62, 64, 66, 68, 70, + 72, 74, 76, 78, 80, 82, 84, 86, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, 88, 88, 88, 88, 88, 88, 88, + 88, + }, + { /* Fourth byte table 112. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 5, 7, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 58, 60, 62, 64, + 66, 68, 70, 72, 74, 76, 78, 80, + 82, 84, 86, 88, 90, 92, 94, 96, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 123, 125, 127, 129, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, 131, 131, 131, 131, 131, 131, 131, + 131, + }, + { /* Fourth byte table 113. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, + 97, 99, 101, 103, 105, 107, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 114. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 85, 87, 89, 91, 93, 95, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 115. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 21, 23, 25, 27, 29, 31, + 33, 35, 37, 39, 41, 43, 45, 47, + 49, 51, 53, 55, 57, 59, 61, 63, + 65, 67, 69, 71, 73, 75, 77, 79, + 81, 83, 86, 88, 90, 92, 94, 96, + 98, 100, 102, 104, 106, 108, 110, 112, + 114, 116, 118, 120, 122, 124, 126, 128, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, 130, 130, 130, 130, 130, 130, 130, + 130, + }, + { /* Fourth byte table 116. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 9, 11, 13, 15, + 17, 19, 21, 23, 25, 25, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 36, 37, 38, 39, 40, 41, 42, + 43, 44, 45, 46, 47, 48, 49, 50, + 51, 52, 53, 54, 55, 56, 57, 58, + 59, 60, 61, 62, 63, 64, 65, 66, + 67, 68, 69, 70, 71, 72, 73, 74, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, 75, 75, 75, 75, 75, 75, 75, + 75, + }, + { /* Fourth byte table 117. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 13, 17, 21, 25, 29, + 33, 37, 42, 46, 50, 54, 58, 62, + 66, 71, 75, 80, 85, 90, 94, 98, + 102, 106, 110, 114, 118, 122, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, 127, 127, 127, 127, 127, 127, 127, + 127, + }, + }, +}; + +static const uint16_t u8_decomp_b4_16bit_tbl[2][30][257] = { + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 38, 44, 48, 52, 56, 60, 64, + 68, 72, 76, 80, 84, 90, 96, 102, + 108, 112, 116, 120, 124, 130, 136, 140, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 184, 188, 192, 196, 200, 206, + 212, 216, 220, 224, 228, 232, 236, 240, + 244, 250, 256, 260, 264, 268, 272, 276, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 54, 60, 66, + 72, 78, 84, 90, 96, 100, 104, 108, + 112, 116, 120, 124, 128, 134, 140, 144, + 148, 152, 156, 160, 164, 170, 176, 182, + 188, 194, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 262, 268, 274, 280, 284, 288, 292, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 107, 116, 116, 116, 116, + 116, 120, 124, 128, 132, 138, 144, 150, + 156, 162, 168, 174, 180, 186, 192, 198, + 204, 210, 216, 222, 228, 234, 240, 246, + 252, 256, 260, 264, 268, 272, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 52, 56, 60, 64, 68, 72, 76, + 80, 86, 92, 98, 104, 110, 116, 122, + 128, 134, 140, 146, 152, 158, 164, 170, + 176, 182, 188, 194, 200, 204, 208, 212, + 216, 222, 228, 234, 240, 246, 252, 258, + 264, 270, 276, 280, 284, 288, 292, 296, + 300, 304, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 45, + 52, 57, 62, 69, 76, 83, 90, 97, + 104, 109, 114, 121, 128, 135, 142, 142, + 142, 147, 152, 159, 166, 173, 180, 180, + 180, 185, 190, 197, 204, 211, 218, 225, + 232, 237, 242, 249, 256, 263, 270, 277, + 284, 289, 294, 301, 308, 315, 322, 329, + 336, 341, 346, 353, 360, 367, 374, 381, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, + }, + { /* Fourth byte 16-bit table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 38, + 38, 43, 48, 55, 62, 69, 76, 76, + 76, 81, 86, 93, 100, 107, 114, 121, + 128, 128, 133, 133, 140, 140, 147, 147, + 154, 159, 164, 171, 178, 185, 192, 199, + 206, 211, 216, 223, 230, 237, 244, 251, + 258, 263, 268, 273, 278, 283, 288, 293, + 298, 303, 308, 313, 318, 323, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, + }, + { /* Fourth byte 16-bit table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 23, 32, 41, 50, 59, + 68, 75, 82, 91, 100, 109, 118, 127, + 136, 143, 150, 159, 168, 177, 186, 195, + 204, 211, 218, 227, 236, 245, 254, 263, + 272, 279, 286, 295, 304, 313, 322, 331, + 340, 347, 354, 363, 372, 381, 390, 399, + 408, 413, 418, 425, 430, 437, 437, 442, + 449, 454, 459, 464, 469, 474, 477, 480, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, + }, + { /* Fourth byte 16-bit table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 14, 21, 26, 33, 33, 38, + 45, 50, 55, 60, 65, 70, 82, 94, + 106, 111, 116, 123, 130, 130, 130, 135, + 142, 147, 152, 157, 162, 162, 174, 186, + 198, 203, 208, 215, 222, 227, 232, 237, + 244, 249, 254, 259, 264, 269, 280, 291, + 293, 293, 293, 300, 305, 312, 312, 317, + 324, 329, 334, 339, 344, 349, 356, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, + }, + { /* Fourth byte 16-bit table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 25, 30, 35, + 40, 45, 50, 55, 60, 65, 70, 78, + 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 190, 190, + 190, 195, 200, 205, 210, 215, 220, 225, + 230, 235, 240, 245, 250, 255, 260, 265, + 270, 275, 280, 285, 290, 295, 300, 305, + 310, 315, 320, 325, 330, 335, 340, 345, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, 350, 350, 350, 350, 350, 350, 350, + 350, + }, + { /* Fourth byte 16-bit table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 27, 42, 51, 66, 75, 84, + 102, 114, 123, 132, 141, 153, 165, 177, + 189, 201, 213, 225, 243, 249, 267, 285, + 300, 312, 330, 348, 360, 369, 378, 390, + 402, 417, 432, 441, 450, 462, 471, 480, + 486, 492, 501, 510, 528, 540, 555, 573, + 585, 594, 603, 621, 633, 651, 660, 675, + 684, 696, 705, 717, 732, 744, 759, 771, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, + }, + { /* Fourth byte 16-bit table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 24, 33, 45, 54, 63, 72, + 87, 99, 105, 123, 132, 147, 159, 171, + 180, 189, 201, 207, 219, 234, 240, 258, + 267, 271, 275, 279, 283, 287, 291, 295, + 299, 303, 307, 312, 317, 322, 327, 332, + 337, 342, 347, 352, 357, 362, 367, 372, + 377, 382, 385, 387, 389, 392, 394, 396, + 396, 396, 396, 396, 402, 408, 414, 420, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, 432, 432, 432, 432, 432, 432, 432, + 432, + }, + { /* Fourth byte 16-bit table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 34, 38, + 42, 46, 50, 54, 58, 62, 66, 70, + 74, 78, 82, 86, 90, 94, 98, 102, + 106, 110, 114, 118, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 125, + 130, 135, 140, 145, 150, 156, 162, 168, + 174, 180, 186, 190, 194, 198, 202, 206, + 210, 214, 218, 222, 226, 230, 234, 238, + 242, 246, 250, 254, 258, 262, 266, 270, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, + }, + { /* Fourth byte 16-bit table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 98, 104, 110, 116, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 130, 136, 140, 144, 148, 152, 156, 160, + 164, 168, 172, 176, 180, 184, 188, 192, + 196, 200, 204, 210, 216, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 102, 108, 114, 120, 126, 132, 138, + 144, 150, 156, 162, 168, 174, 180, 186, + 192, 198, 204, 210, 216, 222, 228, 234, + 240, 246, 252, 258, 264, 270, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 96, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 162, 168, 174, + 180, 186, 192, 198, 204, 210, 216, 222, + 228, 234, 240, 246, 252, 258, 264, 270, + 276, 282, 288, 294, 300, 306, 312, 318, + 324, 330, 336, 342, 348, 354, 360, 366, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, + }, + { /* Fourth byte 16-bit table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 17, 21, 25, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 70, 74, 79, 83, 87, 91, 96, + 100, 104, 108, 112, 116, 121, 125, 129, + 133, 137, 141, 145, 149, 153, 157, 161, + 165, 169, 173, 177, 181, 185, 189, 193, + 197, 201, 205, 209, 213, 218, 222, 226, + 230, 235, 239, 243, 247, 251, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 105, 109, 113, 117, 121, 125, + 129, 134, 139, 143, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 200, 205, 209, 213, 217, 221, 225, + 229, 233, 237, 241, 246, 250, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 45, 49, 53, 57, 61, + 66, 70, 75, 80, 84, 88, 92, 96, + 101, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 159, 163, + 167, 171, 175, 179, 183, 187, 191, 195, + 199, 203, 207, 211, 215, 219, 223, 227, + 231, 236, 240, 244, 248, 252, 256, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 45, 49, 53, 57, 61, + 65, 69, 73, 77, 81, 85, 89, 93, + 97, 101, 105, 109, 113, 117, 122, 126, + 130, 134, 138, 142, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 201, 205, 209, 213, 217, 221, 225, + 230, 235, 240, 244, 249, 253, 257, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 71, 76, 80, 84, 88, 92, 96, + 100, 104, 108, 112, 117, 121, 126, 130, + 135, 139, 143, 147, 152, 156, 160, 165, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 227, 231, + 236, 240, 245, 249, 254, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + { /* Fourth byte 16-bit table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 14, 19, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 61, 65, + 69, 73, 77, 82, 86, 91, 96, 100, + 104, 108, 112, 116, 120, 125, 130, 135, + 139, 143, 148, 152, 156, 160, 165, 169, + 173, 177, 181, 185, 190, 194, 198, 202, + 206, 210, 214, 219, 224, 228, 233, 237, + 242, 246, 250, 254, 259, 264, 268, 273, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, + }, + { /* Fourth byte 16-bit table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 9, 13, 17, 21, 25, 29, + 34, 39, 44, 49, 53, 57, 61, 65, + 69, 73, 77, 81, 85, 89, 93, 97, + 102, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 160, 165, + 169, 173, 177, 181, 186, 190, 195, 199, + 203, 208, 213, 217, 221, 225, 229, 233, + 237, 241, 245, 249, 253, 257, 261, 265, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, + }, + { /* Fourth byte 16-bit table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 25, 29, + 33, 37, 41, 45, 50, 55, 59, 63, + 67, 71, 75, 79, 84, 88, 92, 96, + 100, 105, 110, 114, 118, 122, 127, 131, + 135, 140, 145, 149, 153, 157, 162, 166, + 170, 174, 178, 182, 186, 190, 195, 199, + 203, 207, 212, 216, 220, 224, 228, 233, + 238, 242, 246, 250, 255, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + { /* Fourth byte 16-bit table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte 16-bit table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 38, 44, 48, 52, 56, 60, 64, + 68, 72, 76, 80, 84, 90, 96, 102, + 108, 112, 116, 120, 124, 130, 136, 140, + 144, 148, 152, 156, 160, 164, 168, 172, + 176, 180, 184, 188, 192, 196, 200, 206, + 212, 216, 220, 224, 228, 232, 236, 240, + 244, 250, 256, 260, 264, 268, 272, 276, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, 280, 280, 280, 280, 280, 280, 280, + 280, + }, + { /* Fourth byte 16-bit table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 54, 60, 66, + 72, 78, 84, 90, 96, 100, 104, 108, + 112, 116, 120, 124, 128, 134, 140, 144, + 148, 152, 156, 160, 164, 170, 176, 182, + 188, 194, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 262, 268, 274, 280, 284, 288, 292, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, 296, 296, 296, 296, 296, 296, 296, + 296, + }, + { /* Fourth byte 16-bit table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 107, 116, 116, 116, 116, + 116, 120, 124, 128, 132, 138, 144, 150, + 156, 162, 168, 174, 180, 186, 192, 198, + 204, 210, 216, 222, 228, 234, 240, 246, + 252, 256, 260, 264, 268, 272, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 52, 56, 60, 64, 68, 72, 76, + 80, 86, 92, 98, 104, 110, 116, 122, + 128, 134, 140, 146, 152, 158, 164, 170, + 176, 182, 188, 194, 200, 204, 208, 212, + 216, 222, 228, 234, 240, 246, 252, 258, + 264, 270, 276, 280, 284, 288, 292, 296, + 300, 304, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, 308, 308, 308, 308, 308, 308, 308, + 308, + }, + { /* Fourth byte 16-bit table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 45, + 52, 57, 62, 69, 76, 83, 90, 97, + 104, 109, 114, 121, 128, 135, 142, 142, + 142, 147, 152, 159, 166, 173, 180, 180, + 180, 185, 190, 197, 204, 211, 218, 225, + 232, 237, 242, 249, 256, 263, 270, 277, + 284, 289, 294, 301, 308, 315, 322, 329, + 336, 341, 346, 353, 360, 367, 374, 381, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, 388, 388, 388, 388, 388, 388, 388, + 388, + }, + { /* Fourth byte 16-bit table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 17, 24, 31, 38, 38, + 38, 43, 48, 55, 62, 69, 76, 76, + 76, 81, 86, 93, 100, 107, 114, 121, + 128, 128, 133, 133, 140, 140, 147, 147, + 154, 159, 164, 171, 178, 185, 192, 199, + 206, 211, 216, 223, 230, 237, 244, 251, + 258, 263, 268, 273, 278, 283, 288, 293, + 298, 303, 308, 313, 318, 323, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, 328, 328, 328, 328, 328, 328, 328, + 328, + }, + { /* Fourth byte 16-bit table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 7, 14, 23, 32, 41, 50, 59, + 68, 75, 82, 91, 100, 109, 118, 127, + 136, 143, 150, 159, 168, 177, 186, 195, + 204, 211, 218, 227, 236, 245, 254, 263, + 272, 279, 286, 295, 304, 313, 322, 331, + 340, 347, 354, 363, 372, 381, 390, 399, + 408, 413, 418, 425, 430, 437, 437, 442, + 449, 454, 459, 464, 469, 474, 477, 480, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, 483, 483, 483, 483, 483, 483, 483, + 483, + }, + { /* Fourth byte 16-bit table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 14, 21, 26, 33, 33, 38, + 45, 50, 55, 60, 65, 70, 82, 94, + 106, 111, 116, 123, 130, 130, 130, 135, + 142, 147, 152, 157, 162, 162, 174, 186, + 198, 203, 208, 215, 222, 227, 232, 237, + 244, 249, 254, 259, 264, 269, 280, 291, + 293, 293, 293, 300, 305, 312, 312, 317, + 324, 329, 334, 339, 344, 349, 356, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, 359, 359, 359, 359, 359, 359, 359, + 359, + }, + { /* Fourth byte 16-bit table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 10, 15, 20, 25, 30, 35, + 40, 45, 50, 55, 60, 65, 70, 78, + 86, 94, 102, 110, 118, 126, 134, 142, + 150, 158, 166, 174, 182, 190, 207, 221, + 221, 226, 231, 236, 241, 246, 251, 256, + 261, 266, 271, 276, 281, 286, 291, 296, + 301, 306, 311, 316, 321, 326, 331, 336, + 341, 346, 351, 356, 361, 366, 371, 376, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, 381, 381, 381, 381, 381, 381, 381, + 381, + }, + { /* Fourth byte 16-bit table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 27, 42, 51, 66, 75, 84, + 102, 114, 123, 132, 141, 153, 165, 177, + 189, 201, 213, 225, 243, 249, 267, 285, + 300, 312, 330, 348, 360, 369, 378, 390, + 402, 417, 432, 441, 450, 462, 471, 480, + 486, 492, 501, 510, 528, 540, 555, 573, + 585, 594, 603, 621, 633, 651, 660, 675, + 684, 696, 705, 717, 732, 744, 759, 771, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, 777, 777, 777, 777, 777, 777, 777, + 777, + }, + { /* Fourth byte 16-bit table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 15, 24, 33, 45, 54, 63, 72, + 87, 99, 105, 123, 132, 147, 159, 171, + 180, 189, 201, 207, 219, 234, 240, 258, + 267, 271, 275, 279, 283, 287, 291, 295, + 299, 303, 307, 312, 317, 322, 327, 332, + 337, 342, 347, 352, 357, 362, 367, 372, + 377, 382, 385, 387, 389, 392, 394, 396, + 398, 401, 404, 406, 412, 418, 424, 430, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, 442, 442, 442, 442, 442, 442, 442, + 442, + }, + { /* Fourth byte 16-bit table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 164, 168, 172, 176, 180, 184, 188, + 192, 196, 200, 204, 208, 212, 216, 220, + 224, 228, 232, 236, 240, 244, 248, 252, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, 256, 256, 256, 256, 256, 256, 256, + 256, + }, + { /* Fourth byte 16-bit table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 34, 38, + 42, 46, 50, 54, 58, 62, 66, 70, + 74, 78, 82, 86, 90, 94, 98, 102, + 106, 110, 114, 118, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 125, + 130, 135, 140, 145, 150, 156, 162, 168, + 174, 180, 186, 190, 194, 198, 202, 206, + 210, 214, 218, 222, 226, 230, 234, 238, + 242, 246, 250, 254, 258, 262, 266, 270, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, 274, 274, 274, 274, 274, 274, 274, + 274, + }, + { /* Fourth byte 16-bit table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 98, 104, 110, 116, 122, 126, 130, 134, + 138, 142, 146, 150, 154, 158, 162, 166, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 130, 136, 140, 144, 148, 152, 156, 160, + 164, 168, 172, 176, 180, 184, 188, 192, + 196, 200, 204, 210, 216, 222, 226, 230, + 234, 238, 242, 246, 250, 254, 258, 262, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, 266, 266, 266, 266, 266, 266, 266, + 266, + }, + { /* Fourth byte 16-bit table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 102, 108, 114, 120, 126, 132, 138, + 144, 150, 156, 162, 168, 174, 180, 186, + 192, 198, 204, 210, 216, 222, 228, 234, + 240, 246, 252, 258, 264, 270, 276, 282, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, 288, 288, 288, 288, 288, 288, 288, + 288, + }, + { /* Fourth byte 16-bit table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 6, 12, 18, 24, 30, 36, 42, + 48, 54, 60, 66, 72, 78, 84, 90, + 96, 96, 96, 102, 108, 114, 120, 126, + 132, 138, 144, 150, 156, 162, 168, 174, + 180, 186, 192, 198, 204, 210, 216, 222, + 228, 234, 240, 246, 252, 258, 264, 270, + 276, 282, 288, 294, 300, 306, 312, 318, + 324, 330, 336, 342, 348, 354, 360, 366, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, 372, 372, 372, 372, 372, 372, 372, + 372, + }, + { /* Fourth byte 16-bit table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 17, 21, 25, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 70, 74, 79, 83, 87, 91, 96, + 100, 104, 108, 112, 116, 121, 125, 129, + 133, 137, 141, 145, 149, 153, 157, 161, + 165, 169, 173, 177, 181, 185, 189, 193, + 197, 201, 205, 209, 213, 218, 222, 226, + 230, 235, 239, 243, 247, 251, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 105, 109, 113, 117, 121, 125, + 129, 134, 139, 143, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 200, 205, 209, 213, 217, 221, 225, + 229, 233, 237, 241, 246, 250, 255, 259, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, 263, 263, 263, 263, 263, 263, 263, + 263, + }, + { /* Fourth byte 16-bit table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 41, 45, 49, 53, 57, 61, + 66, 70, 75, 80, 84, 88, 92, 96, + 101, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 159, 163, + 167, 171, 175, 179, 183, 187, 191, 195, + 199, 203, 207, 211, 215, 219, 223, 227, + 231, 236, 240, 244, 248, 252, 256, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 45, 49, 53, 57, 61, + 65, 69, 73, 77, 81, 85, 89, 93, + 97, 101, 105, 109, 113, 117, 122, 126, + 130, 134, 138, 142, 147, 151, 155, 159, + 163, 167, 171, 175, 179, 184, 188, 192, + 196, 201, 205, 209, 213, 217, 221, 225, + 230, 235, 240, 244, 249, 253, 257, 261, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, 265, 265, 265, 265, 265, 265, 265, + 265, + }, + { /* Fourth byte 16-bit table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 29, + 33, 37, 41, 45, 49, 53, 58, 62, + 66, 71, 76, 80, 84, 88, 92, 96, + 100, 104, 108, 112, 117, 121, 126, 130, + 135, 139, 143, 147, 152, 156, 160, 165, + 170, 174, 178, 182, 186, 190, 194, 198, + 202, 206, 210, 214, 218, 222, 227, 231, + 236, 240, 245, 249, 254, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + { /* Fourth byte 16-bit table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 9, 14, 19, 24, 28, 32, + 36, 40, 44, 48, 52, 56, 61, 65, + 69, 73, 77, 82, 86, 91, 96, 100, + 104, 108, 112, 116, 120, 125, 130, 135, + 139, 143, 148, 152, 156, 160, 165, 169, + 173, 177, 181, 185, 190, 194, 198, 202, + 206, 210, 214, 219, 224, 228, 233, 237, + 242, 246, 250, 254, 259, 264, 268, 273, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, 277, 277, 277, 277, 277, 277, 277, + 277, + }, + { /* Fourth byte 16-bit table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 5, 9, 13, 17, 21, 25, 29, + 34, 39, 44, 49, 53, 57, 61, 65, + 69, 73, 77, 81, 85, 89, 93, 97, + 102, 106, 110, 114, 118, 122, 126, 130, + 134, 138, 142, 146, 150, 155, 160, 165, + 169, 173, 177, 181, 186, 190, 195, 199, + 203, 208, 213, 217, 221, 225, 229, 233, + 237, 241, 245, 249, 253, 257, 261, 265, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, 269, 269, 269, 269, 269, 269, 269, + 269, + }, + { /* Fourth byte 16-bit table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 25, 29, + 33, 37, 41, 45, 50, 55, 59, 63, + 67, 71, 75, 79, 84, 88, 92, 96, + 100, 105, 110, 114, 118, 122, 127, 131, + 135, 140, 145, 149, 153, 157, 162, 166, + 170, 174, 178, 182, 186, 190, 195, 199, + 203, 207, 212, 216, 220, 224, 228, 233, + 238, 242, 246, 250, 255, 259, 264, 268, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, 272, 272, 272, 272, 272, 272, 272, + 272, + }, + }, +}; + +static const uchar_t u8_decomp_final_tbl[2][19370] = { + { + 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84, + 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20, + 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84, + 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2, + 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6, + 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6, + 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6, + 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6, + 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6, + 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6, + 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6, + 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6, + 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6, + 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6, + 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6, + 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6, + 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6, + 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6, + 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6, + 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6, + 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6, + 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6, + 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6, + 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 
0xF6, + 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6, + 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6, + 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6, + 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6, + 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6, + 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6, + 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6, + 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6, + 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6, + 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6, + 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6, + 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6, + 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6, + 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6, + 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6, + 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6, + 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6, + 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6, + 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6, + 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6, + 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6, + 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6, + 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6, + 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6, + 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6, + 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6, + 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6, + 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C, + 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC, + 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC, + 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC, + 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E, + 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84, + 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86, + 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B, + 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81, + 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7, + 0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C, + 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81, + 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82, + 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7, + 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C, + 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7, + 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C, + 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83, + 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84, + 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86, + 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A, + 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B, + 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8, + 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81, + 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87, + 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C, + 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC, + 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC, + 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC, + 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC, + 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E, + 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC, + 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC, + 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC, + 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 
0xCC, + 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC, + 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC, + 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6, + 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86, + 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6, + 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6, + 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6, + 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6, + 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C, + 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC, + 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6, + 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6, + 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC, + 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81, + 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98, + 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6, + 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6, + 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6, + 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6, + 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6, + 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6, + 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6, + 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6, + 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6, + 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6, + 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6, + 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6, + 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6, + 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6, + 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6, + 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6, + 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6, + 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, + 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6, + 0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6, + 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6, + 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72, + 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79, + 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC, + 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20, + 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA, + 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6, + 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20, + 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, + 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91, + 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95, + 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6, + 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, + 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE, + 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6, + 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, + 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC, + 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF, + 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81, + 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05, + 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81, + 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 
0xA5, + 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA, + 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5, + 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, 0xD0, 0x95, + 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, 0x81, 0xF6, + 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, 0x9A, 0xCC, + 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, 0xF6, 0xD0, + 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, 0xCC, 0x86, + 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, 0xD0, 0xB5, + 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, 0x88, 0xF6, + 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, 0x96, 0xCC, + 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, 0xF6, 0xD0, + 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, 0xCC, 0x86, + 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, 0xD1, 0xB5, + 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, 0x86, 0xF6, + 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, 0x90, 0xCC, + 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, 0xF6, 0xD0, + 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, 0xCC, 0x88, + 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, 0xD0, 0xB5, + 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, 0x88, 0xF6, + 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, 0x96, 0xCC, + 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, 0xF6, 0xD0, + 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, 0xCC, 0x88, + 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, 0xD0, 0xB8, + 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, 0x88, 0xF6, + 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, 0x9E, 0xCC, + 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, 0xF6, 0xD3, + 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, 0xCC, 0x88, + 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, 0xD1, 0x8D, + 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, 0x84, 0xF6, + 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, 0xA3, 0xCC, + 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, 0xF6, 0xD0, + 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, 0xCC, 0x8B, + 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, 0xD1, 0x87, + 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, 0x88, 0xF6, + 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, 0xD6, 0x82, + 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, 0xD8, 0xA7, + 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, 0x94, 0xF6, + 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, 0x88, 0xD9, + 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, 0x8A, 0xD9, + 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, 0xF6, 0xDB, + 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, 0xD9, 0x94, + 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, + 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, + 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x96, + 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x97, 0xE0, + 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, 0xE0, 0xA4, + 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, 0xA4, 0xBC, + 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, + 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA7, + 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, 0xA7, 0x87, + 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, 0xA1, 0xE0, + 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, 0xE0, 0xA6, + 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, 0xA6, 0xBC, + 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, 0xBC, 0xF6, + 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, + 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, + 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0x9C, + 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, 0xAB, 0xE0, + 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, + 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAC, 0xBE, + 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, 0x97, 0xF6, + 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0, + 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, 0xE0, 0xAE, + 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xAF, 0x86, + 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x87, 0xE0, + 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, 0xE0, 0xAF, + 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, 0xB1, 0x96, + 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, 0x95, 
0xF6, + 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, 0xF6, 0xE0, + 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, 0xE0, 0xB3, + 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, 0xB3, 0x86, + 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, 0xF6, 0xE0, + 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, + 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, 0xB5, 0x86, + 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, + 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, + 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, 0xB7, 0x8F, + 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, + 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, 0xB8, 0xB2, + 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, 0xE0, 0xBA, + 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, 0xAB, 0xE0, + 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, 0xE0, 0xBD, + 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x8C, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x91, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, 0xE0, 0xBE, + 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, 0xBE, 0xB7, + 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, 0xB5, 0xF6, + 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, 0xF6, 0xE0, + 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, 0xE0, 0xBE, + 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB2, 0xE0, + 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, + 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, 0xB3, 0xE0, + 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBD, + 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, 0xBE, 0x92, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0x9C, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, 0xE0, 0xBE, + 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, 0xBE, 0xB7, + 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, 0xB7, 0xF6, + 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, 0xF6, 0xE1, + 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xF6, 0x41, 0xCC, + 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, 0xCC, + 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, 0xCC, + 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, 0xCC, + 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, 0xCC, + 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, 0xCC, + 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, 0xCC, + 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, 0xCC, + 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, 0xCC, + 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, 0xCC, + 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, 0xCC, + 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, 0xF6, + 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, 0xCC, + 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, 0xCC, + 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, 0xCC, + 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, 0xCC, + 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, + 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, 0xCC, + 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, 0xCC, + 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, 0xCC, + 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, 0xCC, + 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, 0xCC, + 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, 0xCC, + 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, 0xCC, + 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, 0xCC, + 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, 0xCC, + 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, 0xCC, + 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, 0xCC, + 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, 0xCC, + 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, 0xCC, + 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, 0xCC, + 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, + 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C, 0xCC, + 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, + 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, 0xCC, + 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, 0xCC, + 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, 0xCC, + 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, 0xCC, + 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, 0xCC, + 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, 
0xCC, + 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, 0xCC, + 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, 0xCC, + 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, + 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, + 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, 0xCC, + 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, 0xCC, + 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, + 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, 0xCC, + 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, 0xCC, + 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, 0xCC, + 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, 0xCC, + 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, 0xCC, + 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, 0xCC, + 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, 0xCC, + 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, 0xCC, + 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, 0xCC, + 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, 0xF6, + 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, 0xCC, + 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, 0xCC, + 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, + 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, 0xCC, + 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, 0xCC, + 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, 0xCC, + 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, 0xCC, + 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, 0xCC, + 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, 0xCC, + 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55, 0xCC, + 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, 0xCC, + 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, 0xCC, + 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, 0xF6, + 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, 0xCC, + 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, 0xCC, + 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, 0xCC, + 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, 0xCC, + 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, 0xCC, + 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, 0xCC, + 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, 0xCC, + 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, 0xCC, + 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, 0xCC, + 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, 0xCC, + 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, + 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, 0xCC, + 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, 0xCC, + 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, 0xCC, + 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, 0xCC, + 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, 0xBE, + 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, 0xCC, + 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, 0xCC, + 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, 0xCC, + 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, 0xF6, + 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, 0xCC, + 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, 0xCC, + 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, 0xF6, + 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, 0xCC, + 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, 0xCC, + 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, + 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, 0xCC, + 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, 0xCC, + 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, 0xF6, + 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, 0xCC, + 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, 0xCC, + 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, 0xF6, + 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, 0xCC, + 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, 0xCC, + 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, 0xCC, + 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, 0xCC, + 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, 0xCC, + 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, 0xF6, + 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, 0xCC, + 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, 0xCC, + 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, 
0xF6, + 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, 0xCC, + 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, 0xCC, + 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, + 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, 0xCC, + 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, 0xCC, + 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC, + 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, 0xCC, + 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, 0xCC, + 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xCC, + 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, + 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, 0xCC, + 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, 0xCC, + 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, + 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, + 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, 0xCC, + 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, + 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, + 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, + 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, + 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, 0xCC, + 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, + 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, + 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, 0xCC, + 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, 0xCC, + 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, 0xCC, + 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, + 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, + 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, 0xCC, + 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, + 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, + 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, 0xCC, + 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, 0xCC, + 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, 0xCC, + 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, 0xCC, + 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, 0xCC, + 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, 0xCC, + 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, 0xCE, + 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, + 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0x91, + 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xF6, + 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x91, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, + 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, 0xCE, + 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, 0xCC, + 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x80, + 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, 0xF6, + 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, + 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xF6, + 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, + 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, 0xCE, + 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 
0xCE, + 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, + 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xF6, + 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, 0xCD, + 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, 0xCE, + 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCE, + 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xBF, + 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, 0xF6, + 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, + 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, + 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, 0xCC, + 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x93, + 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, 0x9F, + 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, + 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x93, + 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xCC, + 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, 0xCF, + 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, 0x93, + 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, 0xF6, + 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, + 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, + 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, + 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, 0xF6, + 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x80, + 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, 0xF6, + 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCF, + 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, 0x89, + 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, 0xA9, + 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, + 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, + 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, + 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x82, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, 0xF6, + 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, 0xCE, + 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, 0x80, + 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, + 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, 0xF6, + 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, 0xCC, + 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, 0xCF, + 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x80, + 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, 0xB1, + 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, + 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, 0x85, + 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, 0xF6, + 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, + 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 
0xCC, + 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, + 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x93, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, + 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, + 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x85, + 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, 0xF6, + 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, + 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, + 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x93, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, + 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCF, + 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, 0x85, + 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, 0x85, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, 0xF6, + 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, 0x85, + 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, + 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, 0xF6, + 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xF6, + 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, 0x84, + 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, 0x91, + 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, 0x20, + 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, 0x93, + 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, 0xCD, + 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, + 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xCD, + 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, 0xCE, + 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x95, + 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, 0xF6, + 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, + 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, 0x06, + 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, 0x93, + 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, 0xCC, + 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, 0x06, + 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, 0x93, + 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, 0xF6, + 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, 0xCC, + 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, 0xF6, + 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, + 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, 0x84, + 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, 0x99, + 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCC, + 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, 0x06, + 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, 0x94, + 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, 0xCD, + 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCF, + 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, 0x84, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, 
0xF6, + 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCF, + 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, 0x94, + 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, 0x85, + 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, 0xCC, + 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, 0xCE, + 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, + 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, 0xC2, + 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, 0x80, + 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, 0xCC, + 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, 0x89, + 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCD, + 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, 0x89, + 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, 0xCC, + 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, 0xCE, + 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, 0x81, + 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, 0xC2, + 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, 0xF5, + 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, 0xE2, + 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, 0x20, + 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, + 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, + 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, + 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, + 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0x21, + 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, 0x21, + 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, + 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, 0x30, + 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x2B, + 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, + 0x52, 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, + 0x43, 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, + 0x2F, 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, + 0x48, 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, + 0x4C, 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, + 0x52, 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, + 0x4D, 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, + 0xF6, 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, + 0x46, 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, + 0x92, 0xD7, 0x93, 0x69, 0xCE, 0xB3, 0xCE, 0x93, + 0xCE, 0xA0, 0xE2, 0x88, 0x91, 0x44, 0x64, 0x65, + 0x69, 0x6A, 0x31, 0xE2, 0x81, 0x84, 0x33, 0x32, + 0xE2, 0x81, 0x84, 0x33, 0x31, 0xE2, 0x81, 0x84, + 0x35, 0x32, 0xE2, 0x81, 0x84, 0x35, 0x33, 0xE2, + 0x81, 0x84, 0x35, 0x34, 0xE2, 0x81, 0x84, 0x35, + 0x31, 0xE2, 0x81, 0x84, 0x36, 0x35, 0xE2, 0x81, + 0x84, 0x36, 0x31, 0xE2, 0x81, 0x84, 0x38, 0x33, + 0xE2, 0x81, 0x84, 0x38, 0x35, 0xE2, 0x81, 0x84, + 0x38, 0x37, 0xE2, 0x81, 0x84, 0x38, 0x31, 0xE2, + 0x81, 0x84, 0x49, 0x49, 0x49, 0x49, 0x49, 0x49, + 0x49, 0x56, 0x56, 0x56, 0x49, 0x56, 0x49, 0x49, + 0x56, 0x49, 0x49, 0x49, 0x49, 0x58, 0x58, 0x58, + 0x49, 0x58, 0x49, 0x49, 0x4C, 0x43, 0x44, 0x4D, + 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x69, 0x76, + 0x76, 0x76, 0x69, 0x76, 0x69, 0x69, 0x76, 0x69, + 0x69, 0x69, 0x69, 0x78, 0x78, 0x78, 0x69, 0x78, + 0x69, 0x69, 0x6C, 0x63, 0x64, 0x6D, 0xF6, 0xE2, + 0x86, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x92, + 0xCC, 0xB8, 0xF6, 0xE2, 0x86, 0x94, 0xCC, 0xB8, + 0xF6, 0xE2, 0x87, 0x90, 0xCC, 0xB8, 0xF6, 0xE2, + 0x87, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x92, + 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x83, 0xCC, 0xB8, + 0xF6, 0xE2, 0x88, 0x88, 0xCC, 0xB8, 0xF6, 0xE2, + 0x88, 0x8B, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA3, + 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0xA5, 0xCC, 0xB8, + 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, + 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, + 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 
0xAE, + 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xF6, 0xE2, + 0x88, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x83, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x85, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0x88, 0xCC, 0xB8, 0xF6, 0x3D, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA1, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0x8D, 0xCC, 0xB8, 0xF6, 0x3C, + 0xCC, 0xB8, 0xF6, 0x3E, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xA4, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA5, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB2, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xB6, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB7, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBA, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xBB, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0x82, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x83, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x86, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0x87, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xA2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA8, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA9, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xAB, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xBC, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBD, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x91, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB3, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB4, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xB5, 0xCC, 0xB8, 0xF6, 0xE3, + 0x80, 0x88, 0xF6, 0xE3, 0x80, 0x89, 0x31, 0x32, + 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x31, + 0x30, 0x31, 0x31, 0x31, 0x32, 0x31, 0x33, 0x31, + 0x34, 0x31, 0x35, 0x31, 0x36, 0x31, 0x37, 0x31, + 0x38, 0x31, 0x39, 0x32, 0x30, 0x28, 0x31, 0x29, + 0x28, 0x32, 0x29, 0x28, 0x33, 0x29, 0x28, 0x34, + 0x29, 0x28, 0x35, 0x29, 0x28, 0x36, 0x29, 0x28, + 0x37, 0x29, 0x28, 0x38, 0x29, 0x28, 0x39, 0x29, + 0x28, 0x31, 0x30, 0x29, 0x28, 0x31, 0x31, 0x29, + 0x28, 0x31, 0x32, 0x29, 0x28, 0x31, 0x33, 0x29, + 0x28, 0x31, 0x34, 0x29, 0x28, 0x31, 0x35, 0x29, + 0x28, 0x31, 0x36, 0x29, 0x28, 0x31, 0x37, 0x29, + 0x28, 0x31, 0x38, 0x29, 0x28, 0x31, 0x39, 0x29, + 0x28, 0x32, 0x30, 0x29, 0x31, 0x2E, 0x32, 0x2E, + 0x33, 0x2E, 0x34, 0x2E, 0x35, 0x2E, 0x36, 0x2E, + 0x37, 0x2E, 0x38, 0x2E, 0x39, 0x2E, 0x31, 0x30, + 0x2E, 0x31, 0x31, 0x2E, 0x31, 0x32, 0x2E, 0x31, + 0x33, 0x2E, 0x31, 0x34, 0x2E, 0x31, 0x35, 0x2E, + 0x31, 0x36, 0x2E, 0x31, 0x37, 0x2E, 0x31, 0x38, + 0x2E, 0x31, 0x39, 0x2E, 0x32, 0x30, 0x2E, 0x28, + 0x61, 0x29, 0x28, 0x62, 0x29, 0x28, 0x63, 0x29, + 0x28, 0x64, 0x29, 0x28, 0x65, 0x29, 0x28, 0x66, + 0x29, 0x28, 0x67, 0x29, 0x28, 0x68, 0x29, 0x28, + 0x69, 0x29, 0x28, 0x6A, 0x29, 0x28, 0x6B, 0x29, + 0x28, 0x6C, 0x29, 0x28, 0x6D, 0x29, 0x28, 0x6E, + 0x29, 0x28, 0x6F, 0x29, 0x28, 0x70, 0x29, 0x28, + 0x71, 0x29, 0x28, 0x72, 0x29, 0x28, 0x73, 0x29, + 0x28, 0x74, 0x29, 0x28, 0x75, 0x29, 0x28, 0x76, + 0x29, 0x28, 0x77, 0x29, 0x28, 0x78, 0x29, 0x28, + 0x79, 0x29, 0x28, 0x7A, 0x29, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x30, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, + 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0x3A, 0x3A, + 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0x3D, 0xF6, 0xE2, + 0xAB, 0x9D, 0xCC, 0xB8, 0xE6, 0xAF, 0x8D, 0xE9, + 0xBE, 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, + 0xE4, 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, + 0x99, 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, + 0xBA, 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, + 0xE5, 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, + 0x82, 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 
0xE5, + 0x87, 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, + 0xE5, 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, + 0x95, 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, + 0x8D, 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, + 0xE5, 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, + 0x88, 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, + 0x9C, 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, + 0xE5, 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, + 0xA7, 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, + 0xAE, 0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, + 0xE5, 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, + 0xAE, 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, + 0xB7, 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, + 0xE5, 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, + 0xBF, 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, + 0xBC, 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, + 0xE5, 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, + 0x83, 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, + 0x89, 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, + 0xE6, 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, + 0xA4, 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, + 0x97, 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, + 0xE6, 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, + 0xA2, 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, + 0xAF, 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, + 0xE6, 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, + 0xB4, 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, + 0x88, 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, + 0xE7, 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, + 0x9B, 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, + 0x8E, 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, + 0xE7, 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, + 0xA8, 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, + 0x96, 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, + 0xE7, 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, + 0xAE, 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, + 0x9F, 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, + 0xE7, 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, + 0x8B, 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, + 0xB3, 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, + 0xE7, 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, + 0x81, 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, + 0x80, 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, + 0xE8, 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, + 0xB3, 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, + 0x88, 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, + 0xE8, 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, + 0x8D, 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, + 0xA1, 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, + 0xE8, 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, + 0x80, 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, + 0xB1, 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, + 0xE8, 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, + 0xB3, 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, + 0xBE, 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, + 0xE9, 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, + 0x86, 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, + 0x95, 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, + 0xE9, 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, + 0xA8, 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, + 0x9D, 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, + 0xE9, 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, + 0x81, 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, + 0xA3, 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, + 0xE9, 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, + 0x98, 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, + 0xAC, 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, + 0xE9, 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, + 0xB5, 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, + 0xBA, 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, + 0xE9, 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 
0xBB, + 0xBD, 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, + 0xBC, 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, + 0xE9, 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, + 0x9C, 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, + 0xE5, 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, + 0x85, 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0x93, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0xA4, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, + 0xF6, 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, + 0x81, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0xB5, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, + 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, + 0xF6, 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, + 0xE3, 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, + 0x88, 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, + 0xB9, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, + 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x9A, 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, + 0x9B, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, + 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, + 0xE3, 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, + 0x81, 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, + 0x86, 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, + 0xE1, 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, + 0xB0, 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, + 0x86, 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, + 0xE1, 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, + 0x87, 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, + 0x84, 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, + 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, + 0x8E, 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, + 0x84, 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, + 0xE1, 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 
0x85, + 0xA4, 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, + 0x85, 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, + 0xE1, 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, + 0xAC, 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, + 0x85, 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, + 0xE1, 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, + 0xB4, 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, + 0x84, 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, + 0xE1, 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, + 0x8E, 0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, + 0x87, 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, + 0xE1, 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, + 0x9E, 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, + 0x84, 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, + 0xE1, 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, + 0xAD, 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, + 0x84, 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, + 0xE1, 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, + 0xB1, 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, + 0x85, 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, + 0xE1, 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, + 0x91, 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, + 0x86, 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, + 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, + 0x9B, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, + 0xB8, 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, + 0xE4, 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, + 0xA9, 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, + 0xE1, 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, + 0x29, 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, + 0x84, 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, + 0x28, 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, + 0x89, 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, + 0xE1, 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, + 0x29, 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, + 0x84, 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, + 0x28, 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, + 0x80, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x82, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x83, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x85, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x86, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x87, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x89, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8B, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8C, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8E, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8F, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x90, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x91, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x92, 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, + 0x8C, 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8, + 0x80, 0x29, 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, + 0xE4, 0xB8, 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, + 0x29, 0x28, 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, + 0x85, 0xAD, 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, + 0x28, 0xE5, 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, + 0x9D, 0x29, 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, + 0xE6, 0x9C, 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, + 0x29, 0x28, 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, + 0x9C, 0xA8, 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, + 0x28, 0xE5, 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, + 0xA5, 0x29, 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, + 0xE6, 0x9C, 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, + 0x29, 0x28, 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, + 0x89, 0xB9, 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, + 0x28, 0xE7, 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, + 0xB4, 0x29, 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, + 0xE5, 0x91, 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, + 0x29, 0x28, 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, + 0xBC, 0x81, 0x29, 0x28, 0xE8, 0xB3, 0x87, 
0x29, + 0x28, 0xE5, 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, + 0xAD, 0x29, 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, + 0xE8, 0x87, 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, + 0x29, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, 0x32, + 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, 0x32, + 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, 0x33, + 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, 0xE1, + 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, 0x83, + 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1, 0x84, + 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, 0xE1, + 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, + 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, + 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x83, + 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, 0x85, + 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x89, + 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, 0x85, + 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8F, + 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, 0x85, + 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, 0xE1, + 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE4, 0xB8, 0x80, + 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, + 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, 0xAD, 0xE4, + 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, 0xB9, 0x9D, + 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, 0xE7, 0x81, + 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, 0xA8, 0xE9, + 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, 0x97, 0xA5, + 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, 0xE7, 0xA4, + 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, 0xB9, 0xE8, + 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, 0x8A, 0xB4, + 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, 0xE5, 0xA5, + 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, 0xAA, 0xE5, + 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, 0xA0, 0x85, + 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, 0xE6, 0xAD, + 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, + 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, 0x8F, 0xB3, + 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, 0xE5, 0xAD, + 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, 0x81, 0xE8, + 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, 0xA4, 0x9C, + 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, 0x33, 0x39, + 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, 0x34, 0x33, + 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, 0x34, 0x37, + 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, 0x31, 0xE6, + 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, 0x33, 0xE6, + 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, 0x35, 0xE6, + 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, 0x37, 0xE6, + 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, 0x39, 0xE6, + 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, 0x88, 0x31, + 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32, 0xE6, 0x9C, + 0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, + 0x82, 0xA6, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, + 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, + 0x82, 0xB5, 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, + 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, + 0xBF, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, + 0x83, 0x86, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, + 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, + 0x8D, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, + 0x83, 0x92, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, + 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, + 0x9F, 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, + 0x83, 0xA2, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, + 0xE3, 0x83, 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0xAA, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, + 0x83, 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB0, + 0xE3, 0x83, 0xB1, 0xE3, 0x83, 0xB2, 0xE3, 0x82, + 0xA2, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 
0xA2, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, + 0xA1, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, + 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x8B, 0xE3, + 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x81, 0xE3, 0x82, 0xA6, 0xE3, 0x82, 0xA9, 0xE3, + 0x83, 0xB3, 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xB9, + 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA8, 0xE3, + 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x83, 0xBC, + 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0xB9, 0xE3, 0x82, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0xA0, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0xA4, + 0xE3, 0x83, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, 0x83, + 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, + 0x82, 0xAB, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAA, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAB, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, + 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x8B, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, + 0xA5, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, + 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, + 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAF, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0xA0, 0xE3, 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, + 0x83, 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, + 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x83, + 0xAD, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3, + 0x83, 0x88, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, + 0x83, 0xA0, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, + 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, + 0xBB, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3, + 0x83, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x8D, 0xE3, 0x82, + 0xB1, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, 0xE3, + 0x82, 0xB3, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x8A, + 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB5, 0xE3, + 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, + 0xE3, 0x82, 0xB5, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x81, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, + 0x82, 0xB7, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xB3, + 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, + 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, + 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xBC, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x86, 0xE3, + 0x82, 0x99, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x8A, 0xE3, + 0x83, 0x8E, 0xE3, 0x83, 0x8E, 0xE3, 0x83, 0x83, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, + 0xA4, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, + 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBB, + 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0x84, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAC, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, + 0x82, 0xA2, 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x88, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x9A, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAB, 
0xE3, + 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xB3, + 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x95, 0xE3, 0x82, + 0xA3, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x83, + 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xA7, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x95, 0xE3, 0x83, 0xA9, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0xAF, + 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, + 0x82, 0xBD, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x9A, + 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0x92, 0xE3, 0x83, + 0x98, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x84, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3, + 0xE3, 0x82, 0xB9, 0xE3, 0x83, 0x98, 0xE3, 0x82, + 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB7, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0x98, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xBF, 0xE3, 0x83, + 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA4, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xB3, 0xE3, + 0x83, 0x9B, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0x9B, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x82, + 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0x9E, 0xE3, + 0x82, 0xA4, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9E, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x8F, 0xE3, 0x83, + 0x9E, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xAF, 0xE3, + 0x83, 0x9E, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB7, + 0xE3, 0x83, 0xA7, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x9F, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, + 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0xA1, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA1, 0xE3, 0x82, + 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xA6, 0xE3, 0x82, + 0xA2, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAA, 0xE3, + 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, + 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xA0, 0xE3, + 0x83, 0xAC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, + 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xB3, 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0x83, 0xE3, + 0x83, 0x88, 0x30, 0xE7, 0x82, 0xB9, 0x31, 0xE7, + 0x82, 0xB9, 0x32, 0xE7, 0x82, 0xB9, 0x33, 0xE7, + 0x82, 0xB9, 0x34, 0xE7, 0x82, 0xB9, 0x35, 0xE7, + 0x82, 0xB9, 0x36, 0xE7, 0x82, 0xB9, 0x37, 0xE7, + 0x82, 0xB9, 0x38, 0xE7, 0x82, 0xB9, 0x39, 0xE7, + 0x82, 0xB9, 0x31, 0x30, 0xE7, 0x82, 0xB9, 0x31, + 0x31, 0xE7, 0x82, 0xB9, 0x31, 0x32, 0xE7, 0x82, + 0xB9, 0x31, 0x33, 0xE7, 0x82, 0xB9, 0x31, 0x34, + 0xE7, 0x82, 0xB9, 0x31, 0x35, 0xE7, 0x82, 0xB9, + 0x31, 0x36, 0xE7, 0x82, 0xB9, 0x31, 0x37, 0xE7, + 0x82, 0xB9, 0x31, 0x38, 0xE7, 0x82, 0xB9, 0x31, + 0x39, 0xE7, 0x82, 0xB9, 0x32, 0x30, 0xE7, 0x82, + 0xB9, 0x32, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0x32, + 0xE7, 0x82, 0xB9, 0x32, 0x33, 0xE7, 0x82, 
0xB9, + 0x32, 0x34, 0xE7, 0x82, 0xB9, 0x68, 0x50, 0x61, + 0x64, 0x61, 0x41, 0x55, 0x62, 0x61, 0x72, 0x6F, + 0x56, 0x70, 0x63, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, + 0x90, 0xE6, 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, + 0xA4, 0xA7, 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, + 0xE6, 0xB2, 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, + 0x8F, 0xE4, 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, + 0x41, 0x6E, 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, + 0x6B, 0x41, 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, + 0x63, 0x61, 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, + 0x46, 0x6E, 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, + 0x67, 0x6D, 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, + 0x48, 0x7A, 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, + 0x54, 0x48, 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, + 0x64, 0x6C, 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, + 0xCE, 0xBC, 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, + 0x6D, 0x6D, 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, + 0x32, 0x6B, 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, + 0x6D, 0x33, 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, + 0xE2, 0x88, 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, + 0x73, 0x32, 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, + 0x50, 0x61, 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, + 0x72, 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, + 0x61, 0x64, 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, + 0x73, 0x6E, 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, + 0x70, 0x56, 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, + 0x56, 0x6B, 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, + 0x57, 0xCE, 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, + 0x4D, 0x57, 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, + 0x61, 0x2E, 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, + 0x63, 0x64, 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, + 0x43, 0x6F, 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, + 0x61, 0x48, 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, + 0x4D, 0x6B, 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, + 0x6F, 0x67, 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, + 0x6C, 0x6D, 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, + 0x6D, 0x2E, 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, + 0x72, 0x53, 0x76, 0x57, 0x62, 0x31, 0xE6, 0x97, + 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97, + 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97, + 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97, + 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97, + 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31, + 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5, + 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6, + 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31, + 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97, + 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39, + 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5, + 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6, + 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32, + 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97, + 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37, + 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5, + 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6, + 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0xF6, + 0xE8, 0xB1, 0x88, 0xF6, 0xE6, 0x9B, 0xB4, 0xF6, + 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, 0xB3, 0x88, 0xF6, + 0xE6, 0xBB, 0x91, 0xF6, 0xE4, 0xB8, 0xB2, 0xF6, + 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, + 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, 0xA5, 0x91, 0xF6, + 0xE9, 0x87, 0x91, 0xF6, 0xE5, 0x96, 0x87, 0xF6, + 0xE5, 0xA5, 0x88, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, + 0xE7, 0x99, 0xA9, 0xF6, 0xE7, 0xBE, 0x85, 0xF6, + 0xE8, 0x98, 0xBF, 0xF6, 0xE8, 0x9E, 0xBA, 0xF6, + 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, 0x82, 0x8F, 0xF6, + 0xE6, 0xA8, 0x82, 0xF6, 0xE6, 0xB4, 0x9B, 0xF6, + 0xE7, 0x83, 0x99, 0xF6, 0xE7, 0x8F, 0x9E, 0xF6, + 0xE8, 0x90, 0xBD, 0xF6, 0xE9, 0x85, 0xAA, 0xF6, + 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, 0xBA, 0x82, 0xF6, + 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, 0xAC, 0x84, 
0xF6, + 0xE7, 0x88, 0x9B, 0xF6, 0xE8, 0x98, 0xAD, 0xF6, + 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, 0xB5, 0x90, 0xF6, + 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, 0x97, 0x8D, 0xF6, + 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, 0x8B, 0x89, 0xF6, + 0xE8, 0x87, 0x98, 0xF6, 0xE8, 0xA0, 0x9F, 0xF6, + 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, + 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, 0x8B, 0xBC, 0xF6, + 0xE9, 0x83, 0x8E, 0xF6, 0xE4, 0xBE, 0x86, 0xF6, + 0xE5, 0x86, 0xB7, 0xF6, 0xE5, 0x8B, 0x9E, 0xF6, + 0xE6, 0x93, 0x84, 0xF6, 0xE6, 0xAB, 0x93, 0xF6, + 0xE7, 0x88, 0x90, 0xF6, 0xE7, 0x9B, 0xA7, 0xF6, + 0xE8, 0x80, 0x81, 0xF6, 0xE8, 0x98, 0x86, 0xF6, + 0xE8, 0x99, 0x9C, 0xF6, 0xE8, 0xB7, 0xAF, 0xF6, + 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, 0xAD, 0xAF, 0xF6, + 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, + 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, 0xB6, 0xA0, 0xF6, + 0xE8, 0x8F, 0x89, 0xF6, 0xE9, 0x8C, 0x84, 0xF6, + 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, 0xAB, 0x96, 0xF6, + 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, 0xBC, 0x84, 0xF6, + 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, 0x81, 0xBE, 0xF6, + 0xE7, 0x89, 0xA2, 0xF6, 0xE7, 0xA3, 0x8A, 0xF6, + 0xE8, 0xB3, 0x82, 0xF6, 0xE9, 0x9B, 0xB7, 0xF6, + 0xE5, 0xA3, 0x98, 0xF6, 0xE5, 0xB1, 0xA2, 0xF6, + 0xE6, 0xA8, 0x93, 0xF6, 0xE6, 0xB7, 0x9A, 0xF6, + 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, 0xB4, 0xAF, 0xF6, + 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, 0x99, 0x8B, 0xF6, + 0xE5, 0x8B, 0x92, 0xF6, 0xE8, 0x82, 0x8B, 0xF6, + 0xE5, 0x87, 0x9C, 0xF6, 0xE5, 0x87, 0x8C, 0xF6, + 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, 0xB6, 0xBE, 0xF6, + 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, 0x99, 0xB5, 0xF6, + 0xE8, 0xAE, 0x80, 0xF6, 0xE6, 0x8B, 0x8F, 0xF6, + 0xE6, 0xA8, 0x82, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, + 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, + 0xE6, 0x80, 0x92, 0xF6, 0xE7, 0x8E, 0x87, 0xF6, + 0xE7, 0x95, 0xB0, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, + 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, 0xBE, 0xBF, 0xF6, + 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, 0xB8, 0x8D, 0xF6, + 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, 0x95, 0xB8, 0xF6, + 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, 0x8F, 0x83, 0xF6, + 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, 0x9C, 0x81, 0xF6, + 0xE8, 0x91, 0x89, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6, + 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, 0xBE, 0xB0, 0xF6, + 0xE6, 0xB2, 0x88, 0xF6, 0xE6, 0x8B, 0xBE, 0xF6, + 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, 0x8E, 0xA0, 0xF6, + 0xE7, 0x95, 0xA5, 0xF6, 0xE4, 0xBA, 0xAE, 0xF6, + 0xE5, 0x85, 0xA9, 0xF6, 0xE5, 0x87, 0x89, 0xF6, + 0xE6, 0xA2, 0x81, 0xF6, 0xE7, 0xB3, 0xA7, 0xF6, + 0xE8, 0x89, 0xAF, 0xF6, 0xE8, 0xAB, 0x92, 0xF6, + 0xE9, 0x87, 0x8F, 0xF6, 0xE5, 0x8B, 0xB5, 0xF6, + 0xE5, 0x91, 0x82, 0xF6, 0xE5, 0xA5, 0xB3, 0xF6, + 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, 0x97, 0x85, 0xF6, + 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, 0xA4, 0xAA, 0xF6, + 0xE9, 0x96, 0xAD, 0xF6, 0xE9, 0xA9, 0xAA, 0xF6, + 0xE9, 0xBA, 0x97, 0xF6, 0xE9, 0xBB, 0x8E, 0xF6, + 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, 0x9B, 0x86, 0xF6, + 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, 0xBD, 0xA2, 0xF6, + 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, 0x86, 0x90, 0xF6, + 0xE6, 0x88, 0x80, 0xF6, 0xE6, 0x92, 0x9A, 0xF6, + 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, 0x85, 0x89, 0xF6, + 0xE7, 0x92, 0x89, 0xF6, 0xE7, 0xA7, 0x8A, 0xF6, + 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, 0x81, 0xAF, 0xF6, + 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, 0x93, 0xAE, 0xF6, + 0xE9, 0x80, 0xA3, 0xF6, 0xE9, 0x8D, 0x8A, 0xF6, + 0xE5, 0x88, 0x97, 0xF6, 0xE5, 0x8A, 0xA3, 0xF6, + 0xE5, 0x92, 0xBD, 0xF6, 0xE7, 0x83, 0x88, 0xF6, + 0xE8, 0xA3, 0x82, 0xF6, 0xE8, 0xAA, 0xAA, 0xF6, + 0xE5, 0xBB, 0x89, 0xF6, 0xE5, 0xBF, 0xB5, 0xF6, + 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, 0xAE, 0xAE, 0xF6, + 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, 0x8D, 0xB5, 0xF6, + 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, 0x9B, 0xB9, 0xF6, + 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xB6, 0xBA, 
0xF6, + 0xE6, 0x80, 0x9C, 0xF6, 0xE7, 0x8E, 0xB2, 0xF6, + 0xE7, 0x91, 0xA9, 0xF6, 0xE7, 0xBE, 0x9A, 0xF6, + 0xE8, 0x81, 0x86, 0xF6, 0xE9, 0x88, 0xB4, 0xF6, + 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, 0x9D, 0x88, 0xF6, + 0xE9, 0xA0, 0x98, 0xF6, 0xE4, 0xBE, 0x8B, 0xF6, + 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, 0x86, 0xB4, 0xF6, + 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, 0x83, 0xA1, 0xF6, + 0xE4, 0xBA, 0x86, 0xF6, 0xE5, 0x83, 0x9A, 0xF6, + 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, 0xB0, 0xBF, 0xF6, + 0xE6, 0x96, 0x99, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, + 0xE7, 0x87, 0x8E, 0xF6, 0xE7, 0x99, 0x82, 0xF6, + 0xE8, 0x93, 0xBC, 0xF6, 0xE9, 0x81, 0xBC, 0xF6, + 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, 0x9A, 0x88, 0xF6, + 0xE9, 0x98, 0xAE, 0xF6, 0xE5, 0x8A, 0x89, 0xF6, + 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, 0x9F, 0xB3, 0xF6, + 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xBA, 0x9C, 0xF6, + 0xE7, 0x90, 0x89, 0xF6, 0xE7, 0x95, 0x99, 0xF6, + 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, 0xB4, 0x90, 0xF6, + 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, 0x85, 0xAD, 0xF6, + 0xE6, 0x88, 0xAE, 0xF6, 0xE9, 0x99, 0xB8, 0xF6, + 0xE5, 0x80, 0xAB, 0xF6, 0xE5, 0xB4, 0x99, 0xF6, + 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, 0xBC, 0xAA, 0xF6, + 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, 0x85, 0x84, 0xF6, + 0xE6, 0xA0, 0x97, 0xF6, 0xE7, 0x8E, 0x87, 0xF6, + 0xE9, 0x9A, 0x86, 0xF6, 0xE5, 0x88, 0xA9, 0xF6, + 0xE5, 0x90, 0x8F, 0xF6, 0xE5, 0xB1, 0xA5, 0xF6, + 0xE6, 0x98, 0x93, 0xF6, 0xE6, 0x9D, 0x8E, 0xF6, + 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, 0xB3, 0xA5, 0xF6, + 0xE7, 0x90, 0x86, 0xF6, 0xE7, 0x97, 0xA2, 0xF6, + 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, 0xA3, 0x8F, 0xF6, + 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, 0x87, 0x8C, 0xF6, + 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, 0x8C, 0xBF, 0xF6, + 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, 0x90, 0x9D, 0xF6, + 0xE7, 0x87, 0x90, 0xF6, 0xE7, 0x92, 0x98, 0xF6, + 0xE8, 0x97, 0xBA, 0xF6, 0xE9, 0x9A, 0xA3, 0xF6, + 0xE9, 0xB1, 0x97, 0xF6, 0xE9, 0xBA, 0x9F, 0xF6, + 0xE6, 0x9E, 0x97, 0xF6, 0xE6, 0xB7, 0x8B, 0xF6, + 0xE8, 0x87, 0xA8, 0xF6, 0xE7, 0xAB, 0x8B, 0xF6, + 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, 0xB2, 0x92, 0xF6, + 0xE7, 0x8B, 0x80, 0xF6, 0xE7, 0x82, 0x99, 0xF6, + 0xE8, 0xAD, 0x98, 0xF6, 0xE4, 0xBB, 0x80, 0xF6, + 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, 0x88, 0xBA, 0xF6, + 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xBA, 0xA6, 0xF6, + 0xE6, 0x8B, 0x93, 0xF6, 0xE7, 0xB3, 0x96, 0xF6, + 0xE5, 0xAE, 0x85, 0xF6, 0xE6, 0xB4, 0x9E, 0xF6, + 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, 0xBC, 0xBB, 0xF6, + 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, 0x99, 0x8D, 0xF6, + 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, 0xBB, 0x93, 0xF6, + 0xE5, 0x85, 0x80, 0xF6, 0xE5, 0x97, 0x80, 0xF6, + 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, + 0xE5, 0x87, 0x9E, 0xF6, 0xE7, 0x8C, 0xAA, 0xF6, + 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, 0xA4, 0xBC, 0xF6, + 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, 0xA5, 0xA5, 0xF6, + 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, 0x9D, 0x96, 0xF6, + 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, 0xBE, 0xBD, 0xF6, + 0xE8, 0x98, 0x92, 0xF6, 0xE8, 0xAB, 0xB8, 0xF6, + 0xE9, 0x80, 0xB8, 0xF6, 0xE9, 0x83, 0xBD, 0xF6, + 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, 0xA3, 0xBC, 0xF6, + 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, 0xB6, 0xB4, 0xF6, + 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, + 0xE5, 0x85, 0x8D, 0xF6, 0xE5, 0x8B, 0x89, 0xF6, + 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, + 0xE5, 0x96, 0x9D, 0xF6, 0xE5, 0x98, 0x86, 0xF6, + 0xE5, 0x99, 0xA8, 0xF6, 0xE5, 0xA1, 0x80, 0xF6, + 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, 0xB1, 0xA4, 0xF6, + 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, 0x82, 0x94, 0xF6, + 0xE6, 0x85, 0xA8, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, + 0xE6, 0x87, 0xB2, 0xF6, 0xE6, 0x95, 0x8F, 0xF6, + 0xE6, 0x97, 0xA2, 0xF6, 0xE6, 0x9A, 0x91, 0xF6, + 0xE6, 0xA2, 0x85, 0xF6, 0xE6, 0xB5, 0xB7, 0xF6, + 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, 0xBC, 0xA2, 
0xF6, + 0xE7, 0x85, 0xAE, 0xF6, 0xE7, 0x88, 0xAB, 0xF6, + 0xE7, 0x90, 0xA2, 0xF6, 0xE7, 0xA2, 0x91, 0xF6, + 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, 0xA5, 0x89, 0xF6, + 0xE7, 0xA5, 0x88, 0xF6, 0xE7, 0xA5, 0x90, 0xF6, + 0xE7, 0xA5, 0x96, 0xF6, 0xE7, 0xA5, 0x9D, 0xF6, + 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, 0xA6, 0x8E, 0xF6, + 0xE7, 0xA9, 0x80, 0xF6, 0xE7, 0xAA, 0x81, 0xF6, + 0xE7, 0xAF, 0x80, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, + 0xE7, 0xB8, 0x89, 0xF6, 0xE7, 0xB9, 0x81, 0xF6, + 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, 0x80, 0x85, 0xF6, + 0xE8, 0x87, 0xAD, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, + 0xE8, 0x89, 0xB9, 0xF6, 0xE8, 0x91, 0x97, 0xF6, + 0xE8, 0xA4, 0x90, 0xF6, 0xE8, 0xA6, 0x96, 0xF6, + 0xE8, 0xAC, 0x81, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, + 0xE8, 0xB3, 0x93, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, + 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, + 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, 0x9F, 0xBF, 0xF6, + 0xE9, 0xA0, 0xBB, 0x66, 0x66, 0x66, 0x69, 0x66, + 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, 0x73, + 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, 0xD5, + 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, 0xD5, + 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, 0xF6, + 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, 0xD6, + 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, 0xD7, + 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, 0xD7, + 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, 0xD7, + 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, 0xD7, + 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, 0xA9, + 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, 0xD6, + 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, 0xD7, + 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, 0xBC, + 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, 0x93, + 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, 0xF6, + 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, 0xD6, + 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, 0xD7, + 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, 0xBC, + 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, 0x9C, + 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, 0xF6, + 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, 0xD6, + 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, 0xD7, + 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, 0xBC, + 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, 0xA8, + 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, 0xF6, + 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, 0xD6, + 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, 0xD7, + 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, 0xBF, + 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, 0xB1, + 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, + 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, + 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, + 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, + 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, + 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, + 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, + 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, + 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, + 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, + 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, + 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, + 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, 0x8C, + 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, 0x88, + 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, 0x91, + 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, + 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, + 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, + 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, + 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, 0xBB, + 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, 0x94, + 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, 0x81, + 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, 0xBE, + 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, 
0x92, + 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, 0x94, + 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, + 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, 0x86, + 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, 0xB4, + 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, 0x85, + 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, 0x90, + 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, + 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x95, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x87, + 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, + 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x88, + 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, + 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, + 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, 0x8C, + 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, 0x94, + 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8, + 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8, + 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, + 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, + 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, + 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, + 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, 0xAB, + 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, + 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, + 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, + 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3, + 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, + 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5, + 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, 0xB6, + 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, 0xB6, + 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, 0xB7, + 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB8, + 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9, + 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA, + 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81, + 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81, + 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, + 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82, + 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, + 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, + 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, 0x83, + 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83, + 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, 0x83, + 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x84, + 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x84, + 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, + 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86, + 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86, + 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, + 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, 0x87, + 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, 0x87, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, 0x8A, + 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, 0xB1, + 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, 0xD9, + 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, 
0x91, + 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, 0x8F, + 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, 0x20, + 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, 0x94, + 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xB2, + 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A, + 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, 0xB2, + 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x86, + 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, 0x8A, + 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, 0xB2, + 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x86, + 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x8A, + 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, 0xB2, + 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x86, + 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, 0x8A, + 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, 0x8A, + 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, 0x8A, + 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, 0x84, + 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, + 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, 0x85, + 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, 0x8A, + 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, 0x85, + 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, 0xB2, + 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x86, + 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x8A, + 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xB1, + 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x94, + 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, 0xA8, + 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, 0xA8, + 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, 0xAA, + 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, + 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, + 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD, + 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, + 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB3, + 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, + 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB5, + 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, 0xB5, + 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, + 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, + 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, 0xB8, + 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, 0xB9, + 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, 0xBA, + 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, 0x81, + 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x81, + 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, 0x82, + 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, 0x83, + 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, 0x83, + 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x84, + 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x84, + 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x84, + 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x86, + 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, 0x86, + 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x87, + 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x87, + 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x94, + 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, + 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, + 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 
0xAB, + 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xB3, + 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xB4, + 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, 0x83, + 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, 0x86, + 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x80, + 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x8F, + 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, 0x91, + 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A, + 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A, + 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A, + 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A, + 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1, + 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1, + 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, 0x8A, + 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, 0x8A, + 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x8A, + 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, 0x8A, + 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, 0x8A, + 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x8A, + 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, 0xB1, + 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, 0xB1, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x87, + 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAC, + 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, 0xAE, + 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, 0x85, + 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x8B, + 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, 0xAC, + 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, + 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAA, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAE, + 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAA, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xAD, + 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAD, + 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, 0xAD, + 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xB3, + 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, + 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, 0xAD, + 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, 0xAD, + 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB4, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, 0x85, + 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xB6, + 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAE, + 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, + 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 
0xB7, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, + 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85, + 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xBA, + 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, 0xAE, + 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x82, + 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, + 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, 0x84, + 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAC, + 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, + 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x84, + 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, + 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x85, + 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, 0x85, + 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xAC, + 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, + 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86, + 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAD, + 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, + 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86, + 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x89, + 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, 0xAE, + 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAA, + 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAE, + 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x8A, + 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xAC, + 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x89, + 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xB5, + 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x84, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, 0x8A, + 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, + 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, 0x85, + 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x8A, + 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, + 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, + 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85, + 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, 0x86, + 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x81, + 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAD, + 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x85, + 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, 0xB5, + 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, 0xAE, + 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x8A, + 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, 0x82, + 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, 0x84, + 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, 0x83, + 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, 0xAD, + 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, 0x84, + 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, 0xB3, + 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, 0x84, + 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, 0xB3, + 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, 
0x84, + 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, 0x89, + 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84, 0xD9, + 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, 0x8A, + 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, 0xD9, + 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, 0x20, + 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84, + 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, 0xA7, + 0xD9, 0x84, 0x2E, 0x2E, 0xE2, 0x80, 0x94, 0xE2, + 0x80, 0x93, 0x5F, 0x5F, 0x28, 0x29, 0x7B, 0x7D, + 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, 0xE3, 0x80, + 0x90, 0xE3, 0x80, 0x91, 0xE3, 0x80, 0x8A, 0xE3, + 0x80, 0x8B, 0xE3, 0x80, 0x88, 0xE3, 0x80, 0x89, + 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, 0x80, + 0x8E, 0xE3, 0x80, 0x8F, 0x20, 0xCC, 0x85, 0x20, + 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, + 0x5F, 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, + 0x3B, 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, + 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, + 0x95, 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, + 0x3D, 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, + 0xD9, 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, + 0xD9, 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, + 0x8E, 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, + 0x20, 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, + 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, + 0x92, 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, + 0xA7, 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, + 0xA7, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, + 0x88, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, + 0xA7, 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xA7, 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, + 0xA8, 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, + 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, + 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, + 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, + 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, + 0xAF, 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, + 0xB1, 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, + 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, + 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, + 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, + 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, + 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, + 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, + 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, + 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, + 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, + 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, + 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, + 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, + 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, + 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, + 0x88, 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, + 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, + 0x84, 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, + 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, + 0x94, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, + 0x84, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, + 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, + 0x84, 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, + 0x26, 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, + 0x2E, 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, + 0x3E, 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, + 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, + 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, + 0x56, 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 
0x5D, + 0x5E, 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, + 0x7E, 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, + 0x80, 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, + 0xE3, 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, + 0xB2, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, + 0x82, 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, + 0xE3, 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, + 0xA7, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, + 0xE3, 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, + 0xAB, 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, + 0x82, 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, + 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, + 0xBB, 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, + 0x83, 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, + 0x8B, 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, + 0x83, 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, + 0xE3, 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, + 0x9B, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, + 0x83, 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, + 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, + 0xA8, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0x99, 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, + 0x84, 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, + 0xE1, 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, + 0xAD, 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, + 0x84, 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, + 0xE1, 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, + 0xB4, 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, + 0x84, 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, + 0xE1, 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, + 0x8A, 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, + 0x84, 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, + 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, + 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, + 0x85, 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, + 0xE1, 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, + 0xA8, 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, + 0x85, 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, + 0xE1, 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, + 0xB0, 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, + 0x85, 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, + 0xC2, 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, + 0x84, 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, + 0xE2, 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, + 0x91, 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, + 0x96, 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, + 0x85, 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, + 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, + 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, + 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, + 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, + 0xAF, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, + 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, + 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, + 0x9D, 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, + 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, + 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, + 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, + 0x85, 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, + 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, + 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, + 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, + 0xB9, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, + 0xAF, 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 
0x9D, + 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, + 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, + 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, + 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, + 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, + 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, + 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, + 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, + 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, + 0x65, 0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, + 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, + 0x68, 0x69, 0x6A, 0x6B, 0x6D, 0x6E, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, + 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45, + 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F, + 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 
0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, + 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, + 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, + 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, + 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, + 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, + 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, + 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, + 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, + 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, + 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, + 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, + 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, + 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, + 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, + 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, + 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, + 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, + 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, + 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, + 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, + 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, + 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, + 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, + 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, + 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, + 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, + 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, + 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, + 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, + 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, + 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, + 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, + 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, + 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, + 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, + 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, + 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, + 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, + 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, + 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, + 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, + 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, + 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, + 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, + 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, + 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, + 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, + 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, + 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, + 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, + 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, + 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, + 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, + 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, + 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, + 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, + 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 
0xCE, + 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, + 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, + 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, + 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, + 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, + 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, + 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, + 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, + 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, + 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, + 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, + 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, + 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, + 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, + 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8, + 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0, + 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4, + 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5, + 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5, + 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5, + 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0, + 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, + 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6, + 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C, + 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7, + 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95, + 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86, + 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86, + 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9, + 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5, + 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5, + 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5, + 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3, + 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5, + 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, + 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5, + 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5, + 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5, + 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5, + 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5, + 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0, + 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6, + 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6, + 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB, + 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86, + 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8, + 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8, + 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6, + 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93, + 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84, + 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99, + 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3, + 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96, + 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97, + 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4, + 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE, + 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4, + 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B, + 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1, + 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93, + 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3, + 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4, + 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5, + 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0, + 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 
0xF6, + 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6, + 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6, + 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6, + 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6, + 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88, + 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98, + 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3, + 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF, + 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD, + 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E, + 0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1, + 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2, + 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5, + 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6, + 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6, + 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6, + 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6, + 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6, + 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6, + 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83, + 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0, + 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6, + 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E, + 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2, + 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6, + 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6, + 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87, + 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD, + 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE, + 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF, + 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82, + 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4, + 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2, + 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6, + 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, + 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6, + 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6, + 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6, + 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6, + 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6, + 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0, + 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6, + 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6, + 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6, + 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2, + 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9, + 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9, + 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D, + 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC, + 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC, + 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97, + 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99, + 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A, + 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB, + 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86, + 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A, + 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F, + 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C, + 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D, + 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3, + 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6, + 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6, + 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0, + 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6, + 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6, + 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6, + 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6, + 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 
0xA3, + 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98, + 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2, + 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1, + 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE, + 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE, + 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0, + 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA, + 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2, + 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3, + 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4, + 0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5, + 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5, + 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6, + 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6, + 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6, + 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6, + 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0, + 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6, + 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E, + 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF, + 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80, + 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6, + 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81, + 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82, + 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7, + 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6, + 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB, + 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5, + 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98, + 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A, + 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0, + 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6, + 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6, + 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6, + 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6, + 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, + 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6, + 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6, + 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE, + 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95, + 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7, + 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6, + 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81, + 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80, + 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5, + 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6, + 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84, + 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C, + 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D, + 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E, + 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82, + 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7, + 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7, + 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0, + 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, + 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B, + 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7, + 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9, + 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9, + 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0, + 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7, + 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82, + 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF, + 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88, + 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7, + 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7, + 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7, + 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6, + 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6, + 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6, + 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 
0xF6, + 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89, + 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6, + 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0, + 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6, + 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6, + 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94, + 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6, + 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0, + 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6, + 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6, + 0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6, + 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7, + 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3, + 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6, + 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6, + 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6, + 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6, + 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5, + 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3, + 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3, + 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7, + 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93, + 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C, + 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0, + 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0, + 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6, + 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6, + 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A, + 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC, + 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95, + 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0, + 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6, + 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, + 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6, + 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6, + 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6, + 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6, + 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6, + 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6, + 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6, + 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6, + 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97, + 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5, + 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB, + 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7, + 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4, + 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8, + 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8, + 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6, + 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6, + 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6, + 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0, + 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6, + 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0, + 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8, + 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6, + 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94, + 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91, + 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84, + 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B, + 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89, + 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90, + 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9, + 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9, + 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6, + 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6, + 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6, + 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 
0x88, + 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9, + 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9, + 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0, + 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, + 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6, + 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2, + 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9, + 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82, + 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E, + 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC, + 0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3, + 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3, + 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA, + 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0, + 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E, + 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA, + 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB, + 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC, + 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC, + 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA, + 0x98, 0x80, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, + }, + { + 0x20, 0x20, 0xCC, 0x88, 0x61, 0x20, 0xCC, 0x84, + 0x32, 0x33, 0x20, 0xCC, 0x81, 0xCE, 0xBC, 0x20, + 0xCC, 0xA7, 0x31, 0x6F, 0x31, 0xE2, 0x81, 0x84, + 0x34, 0x31, 0xE2, 0x81, 0x84, 0x32, 0x33, 0xE2, + 0x81, 0x84, 0x34, 0xF6, 0x41, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x82, 0xF6, + 0x41, 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x8A, 0xF6, 0x43, 0xCC, 0xA7, 0xF6, + 0x45, 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x81, 0xF6, + 0x45, 0xCC, 0x82, 0xF6, 0x45, 0xCC, 0x88, 0xF6, + 0x49, 0xCC, 0x80, 0xF6, 0x49, 0xCC, 0x81, 0xF6, + 0x49, 0xCC, 0x82, 0xF6, 0x49, 0xCC, 0x88, 0xF6, + 0x4E, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x80, 0xF6, + 0x4F, 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xF6, + 0x4F, 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x88, 0xF6, + 0x55, 0xCC, 0x80, 0xF6, 0x55, 0xCC, 0x81, 0xF6, + 0x55, 0xCC, 0x82, 0xF6, 0x55, 0xCC, 0x88, 0xF6, + 0x59, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x80, 0xF6, + 0x61, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x82, 0xF6, + 0x61, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x88, 0xF6, + 0x61, 0xCC, 0x8A, 0xF6, 0x63, 0xCC, 0xA7, 0xF6, + 0x65, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x81, 0xF6, + 0x65, 0xCC, 0x82, 0xF6, 0x65, 0xCC, 0x88, 0xF6, + 0x69, 0xCC, 0x80, 0xF6, 0x69, 0xCC, 0x81, 0xF6, + 0x69, 0xCC, 0x82, 0xF6, 0x69, 0xCC, 0x88, 0xF6, + 0x6E, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x80, 0xF6, + 0x6F, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, 0xF6, + 0x6F, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x88, 0xF6, + 0x75, 0xCC, 0x80, 0xF6, 0x75, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x82, 0xF6, 0x75, 0xCC, 0x88, 0xF6, + 0x79, 0xCC, 0x81, 0xF6, 0x79, 0xCC, 0x88, 0xF6, + 0x41, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x84, 0xF6, + 0x41, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0x86, 0xF6, + 0x41, 0xCC, 0xA8, 0xF6, 0x61, 0xCC, 0xA8, 0xF6, + 0x43, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0x81, 0xF6, + 0x43, 0xCC, 0x82, 0xF6, 0x63, 0xCC, 0x82, 0xF6, + 0x43, 0xCC, 0x87, 0xF6, 0x63, 0xCC, 0x87, 0xF6, + 0x43, 0xCC, 0x8C, 0xF6, 0x63, 0xCC, 0x8C, 0xF6, + 0x44, 0xCC, 0x8C, 0xF6, 0x64, 0xCC, 0x8C, 0xF6, + 0x45, 0xCC, 0x84, 0xF6, 0x65, 0xCC, 0x84, 0xF6, + 0x45, 0xCC, 0x86, 0xF6, 0x65, 0xCC, 0x86, 0xF6, + 0x45, 0xCC, 0x87, 0xF6, 0x65, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA8, 0xF6, 0x65, 0xCC, 0xA8, 0xF6, + 0x45, 0xCC, 0x8C, 0xF6, 0x65, 0xCC, 0x8C, 0xF6, + 0x47, 0xCC, 0x82, 0xF6, 0x67, 0xCC, 0x82, 0xF6, + 0x47, 0xCC, 0x86, 0xF6, 0x67, 0xCC, 0x86, 0xF6, + 0x47, 0xCC, 0x87, 0xF6, 0x67, 0xCC, 0x87, 0xF6, + 0x47, 0xCC, 0xA7, 0xF6, 0x67, 0xCC, 0xA7, 0xF6, + 0x48, 0xCC, 0x82, 0xF6, 0x68, 0xCC, 0x82, 0xF6, + 0x49, 0xCC, 0x83, 0xF6, 0x69, 0xCC, 0x83, 0xF6, + 0x49, 0xCC, 0x84, 0xF6, 0x69, 0xCC, 0x84, 0xF6, + 0x49, 0xCC, 0x86, 0xF6, 0x69, 0xCC, 0x86, 0xF6, + 0x49, 0xCC, 0xA8, 0xF6, 0x69, 0xCC, 0xA8, 0xF6, + 0x49, 0xCC, 0x87, 0x49, 0x4A, 0x69, 0x6A, 0xF6, + 0x4A, 0xCC, 0x82, 0xF6, 0x6A, 0xCC, 0x82, 0xF6, + 0x4B, 0xCC, 0xA7, 0xF6, 0x6B, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x81, 0xF6, 0x6C, 0xCC, 0x81, 0xF6, + 0x4C, 0xCC, 0xA7, 0xF6, 0x6C, 0xCC, 0xA7, 0xF6, + 0x4C, 0xCC, 0x8C, 0xF6, 0x6C, 0xCC, 0x8C, 0x4C, + 0xC2, 0xB7, 0x6C, 0xC2, 0xB7, 0xF6, 0x4E, 0xCC, + 0x81, 0xF6, 0x6E, 0xCC, 0x81, 0xF6, 0x4E, 0xCC, + 0xA7, 0xF6, 0x6E, 0xCC, 0xA7, 0xF6, 0x4E, 0xCC, + 0x8C, 0xF6, 0x6E, 0xCC, 0x8C, 0xCA, 0xBC, 0x6E, + 0xF6, 0x4F, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, 0x84, + 0xF6, 0x4F, 0xCC, 0x86, 0xF6, 0x6F, 0xCC, 0x86, + 0xF6, 0x4F, 0xCC, 0x8B, 0xF6, 0x6F, 0xCC, 0x8B, + 0xF6, 0x52, 0xCC, 0x81, 0xF6, 0x72, 0xCC, 0x81, + 0xF6, 0x52, 0xCC, 0xA7, 0xF6, 0x72, 0xCC, 0xA7, + 
0xF6, 0x52, 0xCC, 0x8C, 0xF6, 0x72, 0xCC, 0x8C, + 0xF6, 0x53, 0xCC, 0x81, 0xF6, 0x73, 0xCC, 0x81, + 0xF6, 0x53, 0xCC, 0x82, 0xF6, 0x73, 0xCC, 0x82, + 0xF6, 0x53, 0xCC, 0xA7, 0xF6, 0x73, 0xCC, 0xA7, + 0xF6, 0x53, 0xCC, 0x8C, 0xF6, 0x73, 0xCC, 0x8C, + 0xF6, 0x54, 0xCC, 0xA7, 0xF6, 0x74, 0xCC, 0xA7, + 0xF6, 0x54, 0xCC, 0x8C, 0xF6, 0x74, 0xCC, 0x8C, + 0xF6, 0x55, 0xCC, 0x83, 0xF6, 0x75, 0xCC, 0x83, + 0xF6, 0x55, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x84, + 0xF6, 0x55, 0xCC, 0x86, 0xF6, 0x75, 0xCC, 0x86, + 0xF6, 0x55, 0xCC, 0x8A, 0xF6, 0x75, 0xCC, 0x8A, + 0xF6, 0x55, 0xCC, 0x8B, 0xF6, 0x75, 0xCC, 0x8B, + 0xF6, 0x55, 0xCC, 0xA8, 0xF6, 0x75, 0xCC, 0xA8, + 0xF6, 0x57, 0xCC, 0x82, 0xF6, 0x77, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x82, 0xF6, 0x79, 0xCC, 0x82, + 0xF6, 0x59, 0xCC, 0x88, 0xF6, 0x5A, 0xCC, 0x81, + 0xF6, 0x7A, 0xCC, 0x81, 0xF6, 0x5A, 0xCC, 0x87, + 0xF6, 0x7A, 0xCC, 0x87, 0xF6, 0x5A, 0xCC, 0x8C, + 0xF6, 0x7A, 0xCC, 0x8C, 0x73, 0xF6, 0x4F, 0xCC, + 0x9B, 0xF6, 0x6F, 0xCC, 0x9B, 0xF6, 0x55, 0xCC, + 0x9B, 0xF6, 0x75, 0xCC, 0x9B, 0x44, 0x5A, 0xCC, + 0x8C, 0x44, 0x7A, 0xCC, 0x8C, 0x64, 0x7A, 0xCC, + 0x8C, 0x4C, 0x4A, 0x4C, 0x6A, 0x6C, 0x6A, 0x4E, + 0x4A, 0x4E, 0x6A, 0x6E, 0x6A, 0xF6, 0x41, 0xCC, + 0x8C, 0xF6, 0x61, 0xCC, 0x8C, 0xF6, 0x49, 0xCC, + 0x8C, 0xF6, 0x69, 0xCC, 0x8C, 0xF6, 0x4F, 0xCC, + 0x8C, 0xF6, 0x6F, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x8C, 0xF6, 0x75, 0xCC, 0x8C, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x75, 0xCC, 0x88, 0xCC, + 0x84, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x55, 0xCC, + 0x88, 0xCC, 0x8C, 0xF6, 0x75, 0xCC, 0x88, 0xCC, + 0x8C, 0xF6, 0x55, 0xCC, 0x88, 0xCC, 0x80, 0xF6, + 0x75, 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0x41, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x61, 0xCC, 0x88, 0xCC, + 0x84, 0xF6, 0x41, 0xCC, 0x87, 0xCC, 0x84, 0xF6, + 0x61, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0xC3, 0x86, + 0xCC, 0x84, 0xF6, 0xC3, 0xA6, 0xCC, 0x84, 0xF6, + 0x47, 0xCC, 0x8C, 0xF6, 0x67, 0xCC, 0x8C, 0xF6, + 0x4B, 0xCC, 0x8C, 0xF6, 0x6B, 0xCC, 0x8C, 0xF6, + 0x4F, 0xCC, 0xA8, 0xF6, 0x6F, 0xCC, 0xA8, 0xF6, + 0x4F, 0xCC, 0xA8, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0xA8, 0xCC, 0x84, 0xF6, 0xC6, 0xB7, 0xCC, 0x8C, + 0xF6, 0xCA, 0x92, 0xCC, 0x8C, 0xF6, 0x6A, 0xCC, + 0x8C, 0x44, 0x5A, 0x44, 0x7A, 0x64, 0x7A, 0xF6, + 0x47, 0xCC, 0x81, 0xF6, 0x67, 0xCC, 0x81, 0xF6, + 0x4E, 0xCC, 0x80, 0xF6, 0x6E, 0xCC, 0x80, 0xF6, + 0x41, 0xCC, 0x8A, 0xCC, 0x81, 0xF6, 0x61, 0xCC, + 0x8A, 0xCC, 0x81, 0xF6, 0xC3, 0x86, 0xCC, 0x81, + 0xF6, 0xC3, 0xA6, 0xCC, 0x81, 0xF6, 0xC3, 0x98, + 0xCC, 0x81, 0xF6, 0xC3, 0xB8, 0xCC, 0x81, 0xF6, + 0x41, 0xCC, 0x8F, 0xF6, 0x61, 0xCC, 0x8F, 0xF6, + 0x41, 0xCC, 0x91, 0xF6, 0x61, 0xCC, 0x91, 0xF6, + 0x45, 0xCC, 0x8F, 0xF6, 0x65, 0xCC, 0x8F, 0xF6, + 0x45, 0xCC, 0x91, 0xF6, 0x65, 0xCC, 0x91, 0xF6, + 0x49, 0xCC, 0x8F, 0xF6, 0x69, 0xCC, 0x8F, 0xF6, + 0x49, 0xCC, 0x91, 0xF6, 0x69, 0xCC, 0x91, 0xF6, + 0x4F, 0xCC, 0x8F, 0xF6, 0x6F, 0xCC, 0x8F, 0xF6, + 0x4F, 0xCC, 0x91, 0xF6, 0x6F, 0xCC, 0x91, 0xF6, + 0x52, 0xCC, 0x8F, 0xF6, 0x72, 0xCC, 0x8F, 0xF6, + 0x52, 0xCC, 0x91, 0xF6, 0x72, 0xCC, 0x91, 0xF6, + 0x55, 0xCC, 0x8F, 0xF6, 0x75, 0xCC, 0x8F, 0xF6, + 0x55, 0xCC, 0x91, 0xF6, 0x75, 0xCC, 0x91, 0xF6, + 0x53, 0xCC, 0xA6, 0xF6, 0x73, 0xCC, 0xA6, 0xF6, + 0x54, 0xCC, 0xA6, 0xF6, 0x74, 0xCC, 0xA6, 0xF6, + 0x48, 0xCC, 0x8C, 0xF6, 0x68, 0xCC, 0x8C, 0xF6, + 0x41, 0xCC, 0x87, 0xF6, 0x61, 0xCC, 0x87, 0xF6, + 0x45, 0xCC, 0xA7, 0xF6, 0x65, 0xCC, 0xA7, 0xF6, + 0x4F, 0xCC, 0x88, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x88, 0xCC, 0x84, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, + 0x84, 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x84, 0xF6, + 
0x4F, 0xCC, 0x87, 0xF6, 0x6F, 0xCC, 0x87, 0xF6, + 0x4F, 0xCC, 0x87, 0xCC, 0x84, 0xF6, 0x6F, 0xCC, + 0x87, 0xCC, 0x84, 0xF6, 0x59, 0xCC, 0x84, 0xF6, + 0x79, 0xCC, 0x84, 0x68, 0xC9, 0xA6, 0x6A, 0x72, + 0xC9, 0xB9, 0xC9, 0xBB, 0xCA, 0x81, 0x77, 0x79, + 0x20, 0xCC, 0x86, 0x20, 0xCC, 0x87, 0x20, 0xCC, + 0x8A, 0x20, 0xCC, 0xA8, 0x20, 0xCC, 0x83, 0x20, + 0xCC, 0x8B, 0xC9, 0xA3, 0x6C, 0x73, 0x78, 0xCA, + 0x95, 0xF6, 0xCC, 0x80, 0xF6, 0xCC, 0x81, 0xF6, + 0xCC, 0x93, 0xF6, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCA, 0xB9, 0x20, 0xCD, 0x85, 0xF6, 0x3B, 0x20, + 0xCC, 0x81, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, + 0x20, 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0x91, + 0xCC, 0x81, 0xF6, 0xC2, 0xB7, 0xF6, 0xCE, 0x95, + 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x81, 0xF6, + 0xCE, 0x99, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, + 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x81, 0xF6, 0xCE, + 0xA9, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x88, 0xF6, + 0xCE, 0xA5, 0xCC, 0x88, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, + 0xB7, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCE, 0xB9, 0xCC, 0x88, 0xF6, 0xCF, 0x85, 0xCC, + 0x88, 0xF6, 0xCE, 0xBF, 0xCC, 0x81, 0xF6, 0xCF, + 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, 0x81, + 0xCE, 0xB2, 0xCE, 0xB8, 0xCE, 0xA5, 0xF5, 0x05, + 0xCF, 0x92, 0xCC, 0x81, 0xCE, 0xA5, 0xCC, 0x81, + 0xF5, 0x05, 0xCF, 0x92, 0xCC, 0x88, 0xCE, 0xA5, + 0xCC, 0x88, 0xCF, 0x86, 0xCF, 0x80, 0xCE, 0xBA, + 0xCF, 0x81, 0xCF, 0x82, 0xCE, 0x98, 0xCE, 0xB5, + 0xCE, 0xA3, 0xF6, 0xD0, 0x95, 0xCC, 0x80, 0xF6, + 0xD0, 0x95, 0xCC, 0x88, 0xF6, 0xD0, 0x93, 0xCC, + 0x81, 0xF6, 0xD0, 0x86, 0xCC, 0x88, 0xF6, 0xD0, + 0x9A, 0xCC, 0x81, 0xF6, 0xD0, 0x98, 0xCC, 0x80, + 0xF6, 0xD0, 0xA3, 0xCC, 0x86, 0xF6, 0xD0, 0x98, + 0xCC, 0x86, 0xF6, 0xD0, 0xB8, 0xCC, 0x86, 0xF6, + 0xD0, 0xB5, 0xCC, 0x80, 0xF6, 0xD0, 0xB5, 0xCC, + 0x88, 0xF6, 0xD0, 0xB3, 0xCC, 0x81, 0xF6, 0xD1, + 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xBA, 0xCC, 0x81, + 0xF6, 0xD0, 0xB8, 0xCC, 0x80, 0xF6, 0xD1, 0x83, + 0xCC, 0x86, 0xF6, 0xD1, 0xB4, 0xCC, 0x8F, 0xF6, + 0xD1, 0xB5, 0xCC, 0x8F, 0xF6, 0xD0, 0x96, 0xCC, + 0x86, 0xF6, 0xD0, 0xB6, 0xCC, 0x86, 0xF6, 0xD0, + 0x90, 0xCC, 0x86, 0xF6, 0xD0, 0xB0, 0xCC, 0x86, + 0xF6, 0xD0, 0x90, 0xCC, 0x88, 0xF6, 0xD0, 0xB0, + 0xCC, 0x88, 0xF6, 0xD0, 0x95, 0xCC, 0x86, 0xF6, + 0xD0, 0xB5, 0xCC, 0x86, 0xF6, 0xD3, 0x98, 0xCC, + 0x88, 0xF6, 0xD3, 0x99, 0xCC, 0x88, 0xF6, 0xD0, + 0x96, 0xCC, 0x88, 0xF6, 0xD0, 0xB6, 0xCC, 0x88, + 0xF6, 0xD0, 0x97, 0xCC, 0x88, 0xF6, 0xD0, 0xB7, + 0xCC, 0x88, 0xF6, 0xD0, 0x98, 0xCC, 0x84, 0xF6, + 0xD0, 0xB8, 0xCC, 0x84, 0xF6, 0xD0, 0x98, 0xCC, + 0x88, 0xF6, 0xD0, 0xB8, 0xCC, 0x88, 0xF6, 0xD0, + 0x9E, 0xCC, 0x88, 0xF6, 0xD0, 0xBE, 0xCC, 0x88, + 0xF6, 0xD3, 0xA8, 0xCC, 0x88, 0xF6, 0xD3, 0xA9, + 0xCC, 0x88, 0xF6, 0xD0, 0xAD, 0xCC, 0x88, 0xF6, + 0xD1, 0x8D, 0xCC, 0x88, 0xF6, 0xD0, 0xA3, 0xCC, + 0x84, 0xF6, 0xD1, 0x83, 0xCC, 0x84, 0xF6, 0xD0, + 0xA3, 0xCC, 0x88, 0xF6, 0xD1, 0x83, 0xCC, 0x88, + 0xF6, 0xD0, 0xA3, 0xCC, 0x8B, 0xF6, 0xD1, 0x83, + 0xCC, 0x8B, 0xF6, 0xD0, 0xA7, 0xCC, 0x88, 0xF6, + 0xD1, 0x87, 0xCC, 0x88, 0xF6, 0xD0, 0xAB, 0xCC, + 0x88, 0xF6, 0xD1, 0x8B, 0xCC, 0x88, 0xD5, 0xA5, + 0xD6, 0x82, 0xF6, 0xD8, 0xA7, 0xD9, 0x93, 0xF6, + 0xD8, 0xA7, 0xD9, 0x94, 0xF6, 0xD9, 0x88, 0xD9, + 0x94, 0xF6, 0xD8, 0xA7, 0xD9, 0x95, 0xF6, 0xD9, + 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0xB4, 0xD9, + 0x88, 0xD9, 0xB4, 0xDB, 0x87, 0xD9, 0xB4, 0xD9, + 0x8A, 0xD9, 0xB4, 0xF6, 0xDB, 0x95, 0xD9, 0x94, + 0xF6, 0xDB, 0x81, 0xD9, 0x94, 0xF6, 0xDB, 0x92, + 
0xD9, 0x94, 0xF6, 0xE0, 0xA4, 0xA8, 0xE0, 0xA4, + 0xBC, 0xF6, 0xE0, 0xA4, 0xB0, 0xE0, 0xA4, 0xBC, + 0xF6, 0xE0, 0xA4, 0xB3, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA4, 0x95, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, + 0xA4, 0x96, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, + 0x97, 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0x9C, + 0xE0, 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA1, 0xE0, + 0xA4, 0xBC, 0xF6, 0xE0, 0xA4, 0xA2, 0xE0, 0xA4, + 0xBC, 0xF6, 0xE0, 0xA4, 0xAB, 0xE0, 0xA4, 0xBC, + 0xF6, 0xE0, 0xA4, 0xAF, 0xE0, 0xA4, 0xBC, 0xF6, + 0xE0, 0xA7, 0x87, 0xE0, 0xA6, 0xBE, 0xF6, 0xE0, + 0xA7, 0x87, 0xE0, 0xA7, 0x97, 0xF6, 0xE0, 0xA6, + 0xA1, 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xA2, + 0xE0, 0xA6, 0xBC, 0xF6, 0xE0, 0xA6, 0xAF, 0xE0, + 0xA6, 0xBC, 0xF6, 0xE0, 0xA8, 0xB2, 0xE0, 0xA8, + 0xBC, 0xF6, 0xE0, 0xA8, 0xB8, 0xE0, 0xA8, 0xBC, + 0xF6, 0xE0, 0xA8, 0x96, 0xE0, 0xA8, 0xBC, 0xF6, + 0xE0, 0xA8, 0x97, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, + 0xA8, 0x9C, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xA8, + 0xAB, 0xE0, 0xA8, 0xBC, 0xF6, 0xE0, 0xAD, 0x87, + 0xE0, 0xAD, 0x96, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, + 0xAC, 0xBE, 0xF6, 0xE0, 0xAD, 0x87, 0xE0, 0xAD, + 0x97, 0xF6, 0xE0, 0xAC, 0xA1, 0xE0, 0xAC, 0xBC, + 0xF6, 0xE0, 0xAC, 0xA2, 0xE0, 0xAC, 0xBC, 0xF6, + 0xE0, 0xAE, 0x92, 0xE0, 0xAF, 0x97, 0xF6, 0xE0, + 0xAF, 0x86, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, + 0x87, 0xE0, 0xAE, 0xBE, 0xF6, 0xE0, 0xAF, 0x86, + 0xE0, 0xAF, 0x97, 0xF6, 0xE0, 0xB1, 0x86, 0xE0, + 0xB1, 0x96, 0xF6, 0xE0, 0xB2, 0xBF, 0xE0, 0xB3, + 0x95, 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x95, + 0xF6, 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x96, 0xF6, + 0xE0, 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xF6, 0xE0, + 0xB3, 0x86, 0xE0, 0xB3, 0x82, 0xE0, 0xB3, 0x95, + 0xF6, 0xE0, 0xB5, 0x86, 0xE0, 0xB4, 0xBE, 0xF6, + 0xE0, 0xB5, 0x87, 0xE0, 0xB4, 0xBE, 0xF6, 0xE0, + 0xB5, 0x86, 0xE0, 0xB5, 0x97, 0xF6, 0xE0, 0xB7, + 0x99, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, 0x99, + 0xE0, 0xB7, 0x8F, 0xF6, 0xE0, 0xB7, 0x99, 0xE0, + 0xB7, 0x8F, 0xE0, 0xB7, 0x8A, 0xF6, 0xE0, 0xB7, + 0x99, 0xE0, 0xB7, 0x9F, 0xE0, 0xB9, 0x8D, 0xE0, + 0xB8, 0xB2, 0xE0, 0xBB, 0x8D, 0xE0, 0xBA, 0xB2, + 0xE0, 0xBA, 0xAB, 0xE0, 0xBA, 0x99, 0xE0, 0xBA, + 0xAB, 0xE0, 0xBA, 0xA1, 0xE0, 0xBC, 0x8B, 0xF6, + 0xE0, 0xBD, 0x82, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, + 0xBD, 0x8C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, + 0x91, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x96, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x9B, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBD, 0x80, 0xE0, 0xBE, + 0xB5, 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB2, + 0xF6, 0xE0, 0xBD, 0xB1, 0xE0, 0xBD, 0xB4, 0xF6, + 0xE0, 0xBE, 0xB2, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, + 0xB2, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, + 0xE0, 0xBE, 0xB3, 0xE0, 0xBE, 0x80, 0xE0, 0xBE, + 0xB3, 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, + 0xE0, 0xBD, 0xB1, 0xE0, 0xBE, 0x80, 0xF6, 0xE0, + 0xBE, 0x92, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, + 0x9C, 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA1, + 0xE0, 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xA6, 0xE0, + 0xBE, 0xB7, 0xF6, 0xE0, 0xBE, 0xAB, 0xE0, 0xBE, + 0xB7, 0xF6, 0xE0, 0xBE, 0x90, 0xE0, 0xBE, 0xB5, + 0xF6, 0xE1, 0x80, 0xA5, 0xE1, 0x80, 0xAE, 0xE1, + 0x83, 0x9C, 0xF6, 0xE1, 0xAC, 0x85, 0xE1, 0xAC, + 0xB5, 0xF6, 0xE1, 0xAC, 0x87, 0xE1, 0xAC, 0xB5, + 0xF6, 0xE1, 0xAC, 0x89, 0xE1, 0xAC, 0xB5, 0xF6, + 0xE1, 0xAC, 0x8B, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, + 0xAC, 0x8D, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, + 0x91, 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBA, + 0xE1, 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBC, 0xE1, + 0xAC, 0xB5, 0xF6, 0xE1, 0xAC, 0xBE, 0xE1, 0xAC, + 0xB5, 0xF6, 0xE1, 0xAC, 0xBF, 0xE1, 0xAC, 0xB5, + 0xF6, 0xE1, 0xAD, 0x82, 0xE1, 0xAC, 0xB5, 0x41, + 
0xC3, 0x86, 0x42, 0x44, 0x45, 0xC6, 0x8E, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0xC8, 0xA2, 0x50, 0x52, 0x54, 0x55, 0x57, 0x61, + 0xC9, 0x90, 0xC9, 0x91, 0xE1, 0xB4, 0x82, 0x62, + 0x64, 0x65, 0xC9, 0x99, 0xC9, 0x9B, 0xC9, 0x9C, + 0x67, 0x6B, 0x6D, 0xC5, 0x8B, 0x6F, 0xC9, 0x94, + 0xE1, 0xB4, 0x96, 0xE1, 0xB4, 0x97, 0x70, 0x74, + 0x75, 0xE1, 0xB4, 0x9D, 0xC9, 0xAF, 0x76, 0xE1, + 0xB4, 0xA5, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, + 0xCF, 0x86, 0xCF, 0x87, 0x69, 0x72, 0x75, 0x76, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCF, 0x81, 0xCF, 0x86, + 0xCF, 0x87, 0xD0, 0xBD, 0xC9, 0x92, 0x63, 0xC9, + 0x95, 0xC3, 0xB0, 0xC9, 0x9C, 0x66, 0xC9, 0x9F, + 0xC9, 0xA1, 0xC9, 0xA5, 0xC9, 0xA8, 0xC9, 0xA9, + 0xC9, 0xAA, 0xE1, 0xB5, 0xBB, 0xCA, 0x9D, 0xC9, + 0xAD, 0xE1, 0xB6, 0x85, 0xCA, 0x9F, 0xC9, 0xB1, + 0xC9, 0xB0, 0xC9, 0xB2, 0xC9, 0xB3, 0xC9, 0xB4, + 0xC9, 0xB5, 0xC9, 0xB8, 0xCA, 0x82, 0xCA, 0x83, + 0xC6, 0xAB, 0xCA, 0x89, 0xCA, 0x8A, 0xE1, 0xB4, + 0x9C, 0xCA, 0x8B, 0xCA, 0x8C, 0x7A, 0xCA, 0x90, + 0xCA, 0x91, 0xCA, 0x92, 0xCE, 0xB8, 0xF6, 0x41, + 0xCC, 0xA5, 0xF6, 0x61, 0xCC, 0xA5, 0xF6, 0x42, + 0xCC, 0x87, 0xF6, 0x62, 0xCC, 0x87, 0xF6, 0x42, + 0xCC, 0xA3, 0xF6, 0x62, 0xCC, 0xA3, 0xF6, 0x42, + 0xCC, 0xB1, 0xF6, 0x62, 0xCC, 0xB1, 0xF6, 0x43, + 0xCC, 0xA7, 0xCC, 0x81, 0xF6, 0x63, 0xCC, 0xA7, + 0xCC, 0x81, 0xF6, 0x44, 0xCC, 0x87, 0xF6, 0x64, + 0xCC, 0x87, 0xF6, 0x44, 0xCC, 0xA3, 0xF6, 0x64, + 0xCC, 0xA3, 0xF6, 0x44, 0xCC, 0xB1, 0xF6, 0x64, + 0xCC, 0xB1, 0xF6, 0x44, 0xCC, 0xA7, 0xF6, 0x64, + 0xCC, 0xA7, 0xF6, 0x44, 0xCC, 0xAD, 0xF6, 0x64, + 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0x84, 0xCC, 0x80, + 0xF6, 0x65, 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x45, + 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x65, 0xCC, 0x84, + 0xCC, 0x81, 0xF6, 0x45, 0xCC, 0xAD, 0xF6, 0x65, + 0xCC, 0xAD, 0xF6, 0x45, 0xCC, 0xB0, 0xF6, 0x65, + 0xCC, 0xB0, 0xF6, 0x45, 0xCC, 0xA7, 0xCC, 0x86, + 0xF6, 0x65, 0xCC, 0xA7, 0xCC, 0x86, 0xF6, 0x46, + 0xCC, 0x87, 0xF6, 0x66, 0xCC, 0x87, 0xF6, 0x47, + 0xCC, 0x84, 0xF6, 0x67, 0xCC, 0x84, 0xF6, 0x48, + 0xCC, 0x87, 0xF6, 0x68, 0xCC, 0x87, 0xF6, 0x48, + 0xCC, 0xA3, 0xF6, 0x68, 0xCC, 0xA3, 0xF6, 0x48, + 0xCC, 0x88, 0xF6, 0x68, 0xCC, 0x88, 0xF6, 0x48, + 0xCC, 0xA7, 0xF6, 0x68, 0xCC, 0xA7, 0xF6, 0x48, + 0xCC, 0xAE, 0xF6, 0x68, 0xCC, 0xAE, 0xF6, 0x49, + 0xCC, 0xB0, 0xF6, 0x69, 0xCC, 0xB0, 0xF6, 0x49, + 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x69, 0xCC, 0x88, + 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0x81, 0xF6, 0x6B, + 0xCC, 0x81, 0xF6, 0x4B, 0xCC, 0xA3, 0xF6, 0x6B, + 0xCC, 0xA3, 0xF6, 0x4B, 0xCC, 0xB1, 0xF6, 0x6B, + 0xCC, 0xB1, 0xF6, 0x4C, 0xCC, 0xA3, 0xF6, 0x6C, + 0xCC, 0xA3, 0xF6, 0x4C, 0xCC, 0xA3, 0xCC, 0x84, + 0xF6, 0x6C, 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x4C, + 0xCC, 0xB1, 0xF6, 0x6C, 0xCC, 0xB1, 0xF6, 0x4C, + 0xCC, 0xAD, 0xF6, 0x6C, 0xCC, 0xAD, 0xF6, 0x4D, + 0xCC, 0x81, 0xF6, 0x6D, 0xCC, 0x81, 0xF6, 0x4D, + 0xCC, 0x87, 0xF6, 0x6D, 0xCC, 0x87, 0xF6, 0x4D, + 0xCC, 0xA3, 0xF6, 0x6D, 0xCC, 0xA3, 0xF6, 0x4E, + 0xCC, 0x87, 0xF6, 0x6E, 0xCC, 0x87, 0xF6, 0x4E, + 0xCC, 0xA3, 0xF6, 0x6E, 0xCC, 0xA3, 0xF6, 0x4E, + 0xCC, 0xB1, 0xF6, 0x6E, 0xCC, 0xB1, 0xF6, 0x4E, + 0xCC, 0xAD, 0xF6, 0x6E, 0xCC, 0xAD, 0xF6, 0x4F, + 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x83, + 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x83, 0xCC, 0x88, + 0xF6, 0x6F, 0xCC, 0x83, 0xCC, 0x88, 0xF6, 0x4F, + 0xCC, 0x84, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x84, + 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x84, 0xCC, 0x81, + 0xF6, 0x6F, 0xCC, 0x84, 0xCC, 0x81, 0xF6, 0x50, + 0xCC, 0x81, 0xF6, 0x70, 0xCC, 0x81, 0xF6, 0x50, + 0xCC, 0x87, 0xF6, 0x70, 0xCC, 0x87, 0xF6, 0x52, + 0xCC, 0x87, 0xF6, 0x72, 0xCC, 0x87, 0xF6, 0x52, + 
0xCC, 0xA3, 0xF6, 0x72, 0xCC, 0xA3, 0xF6, 0x52, + 0xCC, 0xA3, 0xCC, 0x84, 0xF6, 0x72, 0xCC, 0xA3, + 0xCC, 0x84, 0xF6, 0x52, 0xCC, 0xB1, 0xF6, 0x72, + 0xCC, 0xB1, 0xF6, 0x53, 0xCC, 0x87, 0xF6, 0x73, + 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xF6, 0x73, + 0xCC, 0xA3, 0xF6, 0x53, 0xCC, 0x81, 0xCC, 0x87, + 0xF6, 0x73, 0xCC, 0x81, 0xCC, 0x87, 0xF6, 0x53, + 0xCC, 0x8C, 0xCC, 0x87, 0xF6, 0x73, 0xCC, 0x8C, + 0xCC, 0x87, 0xF6, 0x53, 0xCC, 0xA3, 0xCC, 0x87, + 0xF6, 0x73, 0xCC, 0xA3, 0xCC, 0x87, 0xF6, 0x54, + 0xCC, 0x87, 0xF6, 0x74, 0xCC, 0x87, 0xF6, 0x54, + 0xCC, 0xA3, 0xF6, 0x74, 0xCC, 0xA3, 0xF6, 0x54, + 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0xB1, 0xF6, 0x54, + 0xCC, 0xAD, 0xF6, 0x74, 0xCC, 0xAD, 0xF6, 0x55, + 0xCC, 0xA4, 0xF6, 0x75, 0xCC, 0xA4, 0xF6, 0x55, + 0xCC, 0xB0, 0xF6, 0x75, 0xCC, 0xB0, 0xF6, 0x55, + 0xCC, 0xAD, 0xF6, 0x75, 0xCC, 0xAD, 0xF6, 0x55, + 0xCC, 0x83, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x83, + 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x84, 0xCC, 0x88, + 0xF6, 0x75, 0xCC, 0x84, 0xCC, 0x88, 0xF6, 0x56, + 0xCC, 0x83, 0xF6, 0x76, 0xCC, 0x83, 0xF6, 0x56, + 0xCC, 0xA3, 0xF6, 0x76, 0xCC, 0xA3, 0xF6, 0x57, + 0xCC, 0x80, 0xF6, 0x77, 0xCC, 0x80, 0xF6, 0x57, + 0xCC, 0x81, 0xF6, 0x77, 0xCC, 0x81, 0xF6, 0x57, + 0xCC, 0x88, 0xF6, 0x77, 0xCC, 0x88, 0xF6, 0x57, + 0xCC, 0x87, 0xF6, 0x77, 0xCC, 0x87, 0xF6, 0x57, + 0xCC, 0xA3, 0xF6, 0x77, 0xCC, 0xA3, 0xF6, 0x58, + 0xCC, 0x87, 0xF6, 0x78, 0xCC, 0x87, 0xF6, 0x58, + 0xCC, 0x88, 0xF6, 0x78, 0xCC, 0x88, 0xF6, 0x59, + 0xCC, 0x87, 0xF6, 0x79, 0xCC, 0x87, 0xF6, 0x5A, + 0xCC, 0x82, 0xF6, 0x7A, 0xCC, 0x82, 0xF6, 0x5A, + 0xCC, 0xA3, 0xF6, 0x7A, 0xCC, 0xA3, 0xF6, 0x5A, + 0xCC, 0xB1, 0xF6, 0x7A, 0xCC, 0xB1, 0xF6, 0x68, + 0xCC, 0xB1, 0xF6, 0x74, 0xCC, 0x88, 0xF6, 0x77, + 0xCC, 0x8A, 0xF6, 0x79, 0xCC, 0x8A, 0x61, 0xCA, + 0xBE, 0xF5, 0x05, 0xC5, 0xBF, 0xCC, 0x87, 0x73, + 0xCC, 0x87, 0xF6, 0x41, 0xCC, 0xA3, 0xF6, 0x61, + 0xCC, 0xA3, 0xF6, 0x41, 0xCC, 0x89, 0xF6, 0x61, + 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x81, + 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x41, + 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x61, 0xCC, 0x82, + 0xCC, 0x80, 0xF6, 0x41, 0xCC, 0x82, 0xCC, 0x89, + 0xF6, 0x61, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x41, + 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x61, 0xCC, 0x82, + 0xCC, 0x83, 0xF6, 0x41, 0xCC, 0xA3, 0xCC, 0x82, + 0xF6, 0x61, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x41, + 0xCC, 0x86, 0xCC, 0x81, 0xF6, 0x61, 0xCC, 0x86, + 0xCC, 0x81, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x80, + 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x80, 0xF6, 0x41, + 0xCC, 0x86, 0xCC, 0x89, 0xF6, 0x61, 0xCC, 0x86, + 0xCC, 0x89, 0xF6, 0x41, 0xCC, 0x86, 0xCC, 0x83, + 0xF6, 0x61, 0xCC, 0x86, 0xCC, 0x83, 0xF6, 0x41, + 0xCC, 0xA3, 0xCC, 0x86, 0xF6, 0x61, 0xCC, 0xA3, + 0xCC, 0x86, 0xF6, 0x45, 0xCC, 0xA3, 0xF6, 0x65, + 0xCC, 0xA3, 0xF6, 0x45, 0xCC, 0x89, 0xF6, 0x65, + 0xCC, 0x89, 0xF6, 0x45, 0xCC, 0x83, 0xF6, 0x65, + 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x81, + 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x45, + 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x65, 0xCC, 0x82, + 0xCC, 0x80, 0xF6, 0x45, 0xCC, 0x82, 0xCC, 0x89, + 0xF6, 0x65, 0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x45, + 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x65, 0xCC, 0x82, + 0xCC, 0x83, 0xF6, 0x45, 0xCC, 0xA3, 0xCC, 0x82, + 0xF6, 0x65, 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x49, + 0xCC, 0x89, 0xF6, 0x69, 0xCC, 0x89, 0xF6, 0x49, + 0xCC, 0xA3, 0xF6, 0x69, 0xCC, 0xA3, 0xF6, 0x4F, + 0xCC, 0xA3, 0xF6, 0x6F, 0xCC, 0xA3, 0xF6, 0x4F, + 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x89, 0xF6, 0x4F, + 0xCC, 0x82, 0xCC, 0x81, 0xF6, 0x6F, 0xCC, 0x82, + 0xCC, 0x81, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x80, + 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x80, 0xF6, 0x4F, + 
0xCC, 0x82, 0xCC, 0x89, 0xF6, 0x6F, 0xCC, 0x82, + 0xCC, 0x89, 0xF6, 0x4F, 0xCC, 0x82, 0xCC, 0x83, + 0xF6, 0x6F, 0xCC, 0x82, 0xCC, 0x83, 0xF6, 0x4F, + 0xCC, 0xA3, 0xCC, 0x82, 0xF6, 0x6F, 0xCC, 0xA3, + 0xCC, 0x82, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x81, + 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x4F, + 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x6F, 0xCC, 0x9B, + 0xCC, 0x80, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0x89, + 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x4F, + 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x6F, 0xCC, 0x9B, + 0xCC, 0x83, 0xF6, 0x4F, 0xCC, 0x9B, 0xCC, 0xA3, + 0xF6, 0x6F, 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x55, + 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0xA3, 0xF6, 0x55, + 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x89, 0xF6, 0x55, + 0xCC, 0x9B, 0xCC, 0x81, 0xF6, 0x75, 0xCC, 0x9B, + 0xCC, 0x81, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x80, + 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x80, 0xF6, 0x55, + 0xCC, 0x9B, 0xCC, 0x89, 0xF6, 0x75, 0xCC, 0x9B, + 0xCC, 0x89, 0xF6, 0x55, 0xCC, 0x9B, 0xCC, 0x83, + 0xF6, 0x75, 0xCC, 0x9B, 0xCC, 0x83, 0xF6, 0x55, + 0xCC, 0x9B, 0xCC, 0xA3, 0xF6, 0x75, 0xCC, 0x9B, + 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x80, 0xF6, 0x79, + 0xCC, 0x80, 0xF6, 0x59, 0xCC, 0xA3, 0xF6, 0x79, + 0xCC, 0xA3, 0xF6, 0x59, 0xCC, 0x89, 0xF6, 0x79, + 0xCC, 0x89, 0xF6, 0x59, 0xCC, 0x83, 0xF6, 0x79, + 0xCC, 0x83, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xF6, + 0xCE, 0xB1, 0xCC, 0x94, 0xF6, 0xCE, 0xB1, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0x91, 0xCC, 0x93, 0xF6, 0xCE, 0x91, 0xCC, 0x94, + 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0x91, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x91, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0x91, 0xCC, 0x94, + 0xCD, 0x82, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xF6, + 0xCE, 0xB5, 0xCC, 0x94, 0xF6, 0xCE, 0xB5, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0xB5, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xF6, 0xCE, 0x95, + 0xCC, 0x94, 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, + 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x80, + 0xF6, 0xCE, 0x95, 0xCC, 0x93, 0xCC, 0x81, 0xF6, + 0xCE, 0x95, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, + 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, + 0xCD, 0x82, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xF6, + 0xCE, 0x97, 0xCC, 0x94, 0xF6, 0xCE, 0x97, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0xB9, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, + 0xF6, 0xCE, 0xB9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0xB9, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0xB9, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x94, + 0xCD, 0x82, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xF6, + 0xCE, 0x99, 0xCC, 0x94, 0xF6, 0xCE, 0x99, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCC, + 
0x81, 0xF6, 0xCE, 0x99, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCE, 0x99, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCE, 0x99, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0xBF, 0xCC, 0x93, 0xF6, 0xCE, 0xBF, 0xCC, 0x94, + 0xF6, 0xCE, 0xBF, 0xCC, 0x93, 0xCC, 0x80, 0xF6, + 0xCE, 0xBF, 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, + 0xBF, 0xCC, 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xBF, + 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, + 0x93, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, 0xF6, 0xCE, + 0x9F, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, + 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, + 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0x9F, 0xCC, 0x94, + 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xF6, + 0xCF, 0x85, 0xCC, 0x94, 0xF6, 0xCF, 0x85, 0xCC, + 0x93, 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCC, + 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x94, 0xCC, 0x81, + 0xF6, 0xCF, 0x85, 0xCC, 0x93, 0xCD, 0x82, 0xF6, + 0xCF, 0x85, 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, + 0xA5, 0xCC, 0x94, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, + 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCC, + 0x81, 0xF6, 0xCE, 0xA5, 0xCC, 0x94, 0xCD, 0x82, + 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, + 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x80, + 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xF6, + 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xF6, 0xCF, + 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xCD, 0x82, 0xF6, 0xCE, 0xA9, 0xCC, + 0x93, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xF6, 0xCE, + 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, + 0xCC, 0x94, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, + 0x93, 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, + 0xCC, 0x81, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, + 0x82, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x82, + 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xF6, 0xCE, 0xB1, + 0xCC, 0x81, 0xF6, 0xCE, 0xB5, 0xCC, 0x80, 0xF6, + 0xCE, 0xB5, 0xCC, 0x81, 0xF6, 0xCE, 0xB7, 0xCC, + 0x80, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, 0xF6, 0xCE, + 0xB9, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, 0x81, + 0xF6, 0xCE, 0xBF, 0xCC, 0x80, 0xF6, 0xCE, 0xBF, + 0xCC, 0x81, 0xF6, 0xCF, 0x85, 0xCC, 0x80, 0xF6, + 0xCF, 0x85, 0xCC, 0x81, 0xF6, 0xCF, 0x89, 0xCC, + 0x80, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xF6, 0xCE, + 0xB1, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, + 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB1, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCE, 0xB1, 0xCC, 0x93, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0xB1, 0xCC, 0x94, 0xCD, 0x82, + 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCD, 0x85, + 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, 0xCC, 0x80, + 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x93, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, 0x94, + 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x91, 0xCC, + 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x91, + 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, + 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, + 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCE, 0xB7, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCE, 0xB7, 0xCC, 0x93, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x94, 0xCD, 0x82, + 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCD, 0x85, + 
0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, 0xCC, 0x80, + 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x93, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, 0x94, + 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0x97, 0xCC, + 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x97, + 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCF, + 0x89, 0xCC, 0x93, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, + 0x93, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 0xCC, 0x94, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, + 0x89, 0xCC, 0x93, 0xCC, 0x81, 0xCD, 0x85, 0xF6, + 0xCF, 0x89, 0xCC, 0x94, 0xCC, 0x81, 0xCD, 0x85, + 0xF6, 0xCF, 0x89, 0xCC, 0x93, 0xCD, 0x82, 0xCD, + 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x94, 0xCD, 0x82, + 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCD, 0x85, + 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, 0x80, 0xCD, + 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, 0xCC, 0x80, + 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x93, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, 0x94, + 0xCC, 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, 0xCC, + 0x93, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0xA9, + 0xCC, 0x94, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0xB1, 0xCC, 0x86, 0xF6, 0xCE, 0xB1, 0xCC, 0x84, + 0xF6, 0xCE, 0xB1, 0xCC, 0x80, 0xCD, 0x85, 0xF6, + 0xCE, 0xB1, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCC, + 0x81, 0xCD, 0x85, 0xF6, 0xCE, 0xB1, 0xCD, 0x82, + 0xF6, 0xCE, 0xB1, 0xCD, 0x82, 0xCD, 0x85, 0xF6, + 0xCE, 0x91, 0xCC, 0x86, 0xF6, 0xCE, 0x91, 0xCC, + 0x84, 0xF6, 0xCE, 0x91, 0xCC, 0x80, 0xF6, 0xCE, + 0x91, 0xCC, 0x81, 0xF6, 0xCE, 0x91, 0xCD, 0x85, + 0x20, 0xCC, 0x93, 0xF6, 0xCE, 0xB9, 0x20, 0xCC, + 0x93, 0x20, 0xCD, 0x82, 0xF5, 0x05, 0xC2, 0xA8, + 0xCD, 0x82, 0x20, 0xCC, 0x88, 0xCD, 0x82, 0xF6, + 0xCE, 0xB7, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCE, + 0xB7, 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCC, 0x81, + 0xCD, 0x85, 0xF6, 0xCE, 0xB7, 0xCD, 0x82, 0xF6, + 0xCE, 0xB7, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, + 0x95, 0xCC, 0x80, 0xF6, 0xCE, 0x95, 0xCC, 0x81, + 0xF6, 0xCE, 0x97, 0xCC, 0x80, 0xF6, 0xCE, 0x97, + 0xCC, 0x81, 0xF6, 0xCE, 0x97, 0xCD, 0x85, 0xF5, + 0x06, 0xE1, 0xBE, 0xBF, 0xCC, 0x80, 0x20, 0xCC, + 0x93, 0xCC, 0x80, 0xF5, 0x06, 0xE1, 0xBE, 0xBF, + 0xCC, 0x81, 0x20, 0xCC, 0x93, 0xCC, 0x81, 0xF5, + 0x06, 0xE1, 0xBE, 0xBF, 0xCD, 0x82, 0x20, 0xCC, + 0x93, 0xCD, 0x82, 0xF6, 0xCE, 0xB9, 0xCC, 0x86, + 0xF6, 0xCE, 0xB9, 0xCC, 0x84, 0xF6, 0xCE, 0xB9, + 0xCC, 0x88, 0xCC, 0x80, 0xF6, 0xCE, 0xB9, 0xCC, + 0x88, 0xCC, 0x81, 0xF6, 0xCE, 0xB9, 0xCD, 0x82, + 0xF6, 0xCE, 0xB9, 0xCC, 0x88, 0xCD, 0x82, 0xF6, + 0xCE, 0x99, 0xCC, 0x86, 0xF6, 0xCE, 0x99, 0xCC, + 0x84, 0xF6, 0xCE, 0x99, 0xCC, 0x80, 0xF6, 0xCE, + 0x99, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, + 0xCC, 0x80, 0x20, 0xCC, 0x94, 0xCC, 0x80, 0xF5, + 0x06, 0xE1, 0xBF, 0xBE, 0xCC, 0x81, 0x20, 0xCC, + 0x94, 0xCC, 0x81, 0xF5, 0x06, 0xE1, 0xBF, 0xBE, + 0xCD, 0x82, 0x20, 0xCC, 0x94, 0xCD, 0x82, 0xF6, + 0xCF, 0x85, 0xCC, 0x86, 0xF6, 0xCF, 0x85, 0xCC, + 0x84, 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x80, + 0xF6, 0xCF, 0x85, 0xCC, 0x88, 0xCC, 0x81, 0xF6, + 0xCF, 0x81, 0xCC, 0x93, 0xF6, 0xCF, 0x81, 0xCC, + 0x94, 0xF6, 0xCF, 0x85, 0xCD, 0x82, 0xF6, 0xCF, + 0x85, 0xCC, 0x88, 0xCD, 0x82, 0xF6, 0xCE, 0xA5, + 0xCC, 0x86, 0xF6, 0xCE, 0xA5, 0xCC, 0x84, 0xF6, + 0xCE, 0xA5, 0xCC, 0x80, 0xF6, 0xCE, 0xA5, 0xCC, + 0x81, 0xF6, 0xCE, 0xA1, 0xCC, 0x94, 0xF5, 0x05, + 0xC2, 0xA8, 0xCC, 0x80, 0x20, 0xCC, 0x88, 0xCC, + 0x80, 0xF5, 0x05, 0xC2, 0xA8, 0xCC, 0x81, 0x20, + 0xCC, 0x88, 0xCC, 0x81, 0xF6, 0x60, 0xF6, 0xCF, + 0x89, 0xCC, 0x80, 0xCD, 0x85, 0xF6, 0xCF, 0x89, + 
0xCD, 0x85, 0xF6, 0xCF, 0x89, 0xCC, 0x81, 0xCD, + 0x85, 0xF6, 0xCF, 0x89, 0xCD, 0x82, 0xF6, 0xCF, + 0x89, 0xCD, 0x82, 0xCD, 0x85, 0xF6, 0xCE, 0x9F, + 0xCC, 0x80, 0xF6, 0xCE, 0x9F, 0xCC, 0x81, 0xF6, + 0xCE, 0xA9, 0xCC, 0x80, 0xF6, 0xCE, 0xA9, 0xCC, + 0x81, 0xF6, 0xCE, 0xA9, 0xCD, 0x85, 0xF5, 0x03, + 0xC2, 0xB4, 0x20, 0xCC, 0x81, 0x20, 0xCC, 0x94, + 0xF5, 0x04, 0xE2, 0x80, 0x82, 0x20, 0xF5, 0x04, + 0xE2, 0x80, 0x83, 0x20, 0x20, 0x20, 0x20, 0x20, + 0x20, 0x20, 0x20, 0x20, 0x20, 0xE2, 0x80, 0x90, + 0x20, 0xCC, 0xB3, 0x2E, 0x2E, 0x2E, 0x2E, 0x2E, + 0x2E, 0x20, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, + 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, + 0xB2, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, + 0x80, 0xB5, 0xE2, 0x80, 0xB5, 0xE2, 0x80, 0xB5, + 0x21, 0x21, 0x20, 0xCC, 0x85, 0x3F, 0x3F, 0x3F, + 0x21, 0x21, 0x3F, 0xE2, 0x80, 0xB2, 0xE2, 0x80, + 0xB2, 0xE2, 0x80, 0xB2, 0xE2, 0x80, 0xB2, 0x20, + 0x30, 0x69, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, + 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, 0x29, 0x6E, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x2B, 0xE2, 0x88, 0x92, 0x3D, 0x28, + 0x29, 0x61, 0x65, 0x6F, 0x78, 0xC9, 0x99, 0x52, + 0x73, 0x61, 0x2F, 0x63, 0x61, 0x2F, 0x73, 0x43, + 0xC2, 0xB0, 0x43, 0x63, 0x2F, 0x6F, 0x63, 0x2F, + 0x75, 0xC6, 0x90, 0xC2, 0xB0, 0x46, 0x67, 0x48, + 0x48, 0x48, 0x68, 0xC4, 0xA7, 0x49, 0x49, 0x4C, + 0x6C, 0x4E, 0x4E, 0x6F, 0x50, 0x51, 0x52, 0x52, + 0x52, 0x53, 0x4D, 0x54, 0x45, 0x4C, 0x54, 0x4D, + 0x5A, 0xF6, 0xCE, 0xA9, 0x5A, 0xF6, 0x4B, 0xF6, + 0x41, 0xCC, 0x8A, 0x42, 0x43, 0x65, 0x45, 0x46, + 0x4D, 0x6F, 0xD7, 0x90, 0xD7, 0x91, 0xD7, 0x92, + 0xD7, 0x93, 0x69, 0x46, 0x41, 0x58, 0xCF, 0x80, + 0xCE, 0xB3, 0xCE, 0x93, 0xCE, 0xA0, 0xE2, 0x88, + 0x91, 0x44, 0x64, 0x65, 0x69, 0x6A, 0x31, 0xE2, + 0x81, 0x84, 0x33, 0x32, 0xE2, 0x81, 0x84, 0x33, + 0x31, 0xE2, 0x81, 0x84, 0x35, 0x32, 0xE2, 0x81, + 0x84, 0x35, 0x33, 0xE2, 0x81, 0x84, 0x35, 0x34, + 0xE2, 0x81, 0x84, 0x35, 0x31, 0xE2, 0x81, 0x84, + 0x36, 0x35, 0xE2, 0x81, 0x84, 0x36, 0x31, 0xE2, + 0x81, 0x84, 0x38, 0x33, 0xE2, 0x81, 0x84, 0x38, + 0x35, 0xE2, 0x81, 0x84, 0x38, 0x37, 0xE2, 0x81, + 0x84, 0x38, 0x31, 0xE2, 0x81, 0x84, 0x49, 0x49, + 0x49, 0x49, 0x49, 0x49, 0x49, 0x56, 0x56, 0x56, + 0x49, 0x56, 0x49, 0x49, 0x56, 0x49, 0x49, 0x49, + 0x49, 0x58, 0x58, 0x58, 0x49, 0x58, 0x49, 0x49, + 0x4C, 0x43, 0x44, 0x4D, 0x69, 0x69, 0x69, 0x69, + 0x69, 0x69, 0x69, 0x76, 0x76, 0x76, 0x69, 0x76, + 0x69, 0x69, 0x76, 0x69, 0x69, 0x69, 0x69, 0x78, + 0x78, 0x78, 0x69, 0x78, 0x69, 0x69, 0x6C, 0x63, + 0x64, 0x6D, 0xF6, 0xE2, 0x86, 0x90, 0xCC, 0xB8, + 0xF6, 0xE2, 0x86, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, + 0x86, 0x94, 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x90, + 0xCC, 0xB8, 0xF6, 0xE2, 0x87, 0x94, 0xCC, 0xB8, + 0xF6, 0xE2, 0x87, 0x92, 0xCC, 0xB8, 0xF6, 0xE2, + 0x88, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x88, + 0xCC, 0xB8, 0xF6, 0xE2, 0x88, 0x8B, 0xCC, 0xB8, + 0xF6, 0xE2, 0x88, 0xA3, 0xCC, 0xB8, 0xF6, 0xE2, + 0x88, 0xA5, 0xCC, 0xB8, 0xE2, 0x88, 0xAB, 0xE2, + 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, + 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAE, 0xE2, 0x88, + 0xAE, 0xE2, 0x88, 0xAE, 0xE2, 0x88, 0xAE, 0xE2, + 0x88, 0xAE, 0xF6, 0xE2, 0x88, 0xBC, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0x85, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x88, + 0xCC, 0xB8, 0xF6, 0x3D, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xA1, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0x8D, + 0xCC, 0xB8, 0xF6, 0x3C, 0xCC, 0xB8, 0xF6, 0x3E, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xA4, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xA5, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xB2, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB3, + 
0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xB6, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xB7, 0xCC, 0xB8, 0xF6, 0xE2, + 0x89, 0xBA, 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBB, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x82, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0x83, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0x86, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x87, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xA2, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xA8, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xA9, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xAB, + 0xCC, 0xB8, 0xF6, 0xE2, 0x89, 0xBC, 0xCC, 0xB8, + 0xF6, 0xE2, 0x89, 0xBD, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0x91, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0x92, + 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB2, 0xCC, 0xB8, + 0xF6, 0xE2, 0x8A, 0xB3, 0xCC, 0xB8, 0xF6, 0xE2, + 0x8A, 0xB4, 0xCC, 0xB8, 0xF6, 0xE2, 0x8A, 0xB5, + 0xCC, 0xB8, 0xF6, 0xE3, 0x80, 0x88, 0xF6, 0xE3, + 0x80, 0x89, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x31, 0x30, 0x31, 0x31, 0x31, + 0x32, 0x31, 0x33, 0x31, 0x34, 0x31, 0x35, 0x31, + 0x36, 0x31, 0x37, 0x31, 0x38, 0x31, 0x39, 0x32, + 0x30, 0x28, 0x31, 0x29, 0x28, 0x32, 0x29, 0x28, + 0x33, 0x29, 0x28, 0x34, 0x29, 0x28, 0x35, 0x29, + 0x28, 0x36, 0x29, 0x28, 0x37, 0x29, 0x28, 0x38, + 0x29, 0x28, 0x39, 0x29, 0x28, 0x31, 0x30, 0x29, + 0x28, 0x31, 0x31, 0x29, 0x28, 0x31, 0x32, 0x29, + 0x28, 0x31, 0x33, 0x29, 0x28, 0x31, 0x34, 0x29, + 0x28, 0x31, 0x35, 0x29, 0x28, 0x31, 0x36, 0x29, + 0x28, 0x31, 0x37, 0x29, 0x28, 0x31, 0x38, 0x29, + 0x28, 0x31, 0x39, 0x29, 0x28, 0x32, 0x30, 0x29, + 0x31, 0x2E, 0x32, 0x2E, 0x33, 0x2E, 0x34, 0x2E, + 0x35, 0x2E, 0x36, 0x2E, 0x37, 0x2E, 0x38, 0x2E, + 0x39, 0x2E, 0x31, 0x30, 0x2E, 0x31, 0x31, 0x2E, + 0x31, 0x32, 0x2E, 0x31, 0x33, 0x2E, 0x31, 0x34, + 0x2E, 0x31, 0x35, 0x2E, 0x31, 0x36, 0x2E, 0x31, + 0x37, 0x2E, 0x31, 0x38, 0x2E, 0x31, 0x39, 0x2E, + 0x32, 0x30, 0x2E, 0x28, 0x61, 0x29, 0x28, 0x62, + 0x29, 0x28, 0x63, 0x29, 0x28, 0x64, 0x29, 0x28, + 0x65, 0x29, 0x28, 0x66, 0x29, 0x28, 0x67, 0x29, + 0x28, 0x68, 0x29, 0x28, 0x69, 0x29, 0x28, 0x6A, + 0x29, 0x28, 0x6B, 0x29, 0x28, 0x6C, 0x29, 0x28, + 0x6D, 0x29, 0x28, 0x6E, 0x29, 0x28, 0x6F, 0x29, + 0x28, 0x70, 0x29, 0x28, 0x71, 0x29, 0x28, 0x72, + 0x29, 0x28, 0x73, 0x29, 0x28, 0x74, 0x29, 0x28, + 0x75, 0x29, 0x28, 0x76, 0x29, 0x28, 0x77, 0x29, + 0x28, 0x78, 0x29, 0x28, 0x79, 0x29, 0x28, 0x7A, + 0x29, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x30, 0xE2, 0x88, + 0xAB, 0xE2, 0x88, 0xAB, 0xE2, 0x88, 0xAB, 0xE2, + 0x88, 0xAB, 0x3A, 0x3A, 0x3D, 0x3D, 0x3D, 0x3D, + 0x3D, 0x3D, 0xF6, 0xE2, 0xAB, 0x9D, 0xCC, 0xB8, + 0xE2, 0xB5, 0xA1, 0xE6, 0xAF, 0x8D, 0xE9, 0xBE, + 0x9F, 0xE4, 0xB8, 0x80, 0xE4, 0xB8, 0xA8, 0xE4, + 0xB8, 0xB6, 0xE4, 0xB8, 0xBF, 0xE4, 0xB9, 0x99, + 0xE4, 0xBA, 0x85, 0xE4, 0xBA, 0x8C, 0xE4, 0xBA, + 0xA0, 0xE4, 0xBA, 0xBA, 0xE5, 0x84, 0xBF, 0xE5, + 0x85, 0xA5, 0xE5, 0x85, 0xAB, 0xE5, 0x86, 0x82, + 0xE5, 0x86, 0x96, 0xE5, 0x86, 0xAB, 0xE5, 0x87, + 0xA0, 0xE5, 0x87, 0xB5, 0xE5, 0x88, 0x80, 0xE5, + 0x8A, 0x9B, 0xE5, 0x8B, 0xB9, 0xE5, 0x8C, 0x95, + 0xE5, 0x8C, 0x9A, 0xE5, 0x8C, 0xB8, 0xE5, 0x8D, + 0x81, 0xE5, 0x8D, 0x9C, 0xE5, 0x8D, 0xA9, 0xE5, + 0x8E, 0x82, 0xE5, 0x8E, 0xB6, 0xE5, 0x8F, 0x88, + 0xE5, 0x8F, 0xA3, 0xE5, 0x9B, 0x97, 0xE5, 0x9C, + 0x9F, 0xE5, 0xA3, 0xAB, 0xE5, 0xA4, 0x82, 0xE5, + 0xA4, 0x8A, 0xE5, 0xA4, 0x95, 0xE5, 0xA4, 0xA7, + 0xE5, 0xA5, 0xB3, 0xE5, 0xAD, 0x90, 0xE5, 0xAE, + 
0x80, 0xE5, 0xAF, 0xB8, 0xE5, 0xB0, 0x8F, 0xE5, + 0xB0, 0xA2, 0xE5, 0xB0, 0xB8, 0xE5, 0xB1, 0xAE, + 0xE5, 0xB1, 0xB1, 0xE5, 0xB7, 0x9B, 0xE5, 0xB7, + 0xA5, 0xE5, 0xB7, 0xB1, 0xE5, 0xB7, 0xBE, 0xE5, + 0xB9, 0xB2, 0xE5, 0xB9, 0xBA, 0xE5, 0xB9, 0xBF, + 0xE5, 0xBB, 0xB4, 0xE5, 0xBB, 0xBE, 0xE5, 0xBC, + 0x8B, 0xE5, 0xBC, 0x93, 0xE5, 0xBD, 0x90, 0xE5, + 0xBD, 0xA1, 0xE5, 0xBD, 0xB3, 0xE5, 0xBF, 0x83, + 0xE6, 0x88, 0x88, 0xE6, 0x88, 0xB6, 0xE6, 0x89, + 0x8B, 0xE6, 0x94, 0xAF, 0xE6, 0x94, 0xB4, 0xE6, + 0x96, 0x87, 0xE6, 0x96, 0x97, 0xE6, 0x96, 0xA4, + 0xE6, 0x96, 0xB9, 0xE6, 0x97, 0xA0, 0xE6, 0x97, + 0xA5, 0xE6, 0x9B, 0xB0, 0xE6, 0x9C, 0x88, 0xE6, + 0x9C, 0xA8, 0xE6, 0xAC, 0xA0, 0xE6, 0xAD, 0xA2, + 0xE6, 0xAD, 0xB9, 0xE6, 0xAE, 0xB3, 0xE6, 0xAF, + 0x8B, 0xE6, 0xAF, 0x94, 0xE6, 0xAF, 0x9B, 0xE6, + 0xB0, 0x8F, 0xE6, 0xB0, 0x94, 0xE6, 0xB0, 0xB4, + 0xE7, 0x81, 0xAB, 0xE7, 0x88, 0xAA, 0xE7, 0x88, + 0xB6, 0xE7, 0x88, 0xBB, 0xE7, 0x88, 0xBF, 0xE7, + 0x89, 0x87, 0xE7, 0x89, 0x99, 0xE7, 0x89, 0x9B, + 0xE7, 0x8A, 0xAC, 0xE7, 0x8E, 0x84, 0xE7, 0x8E, + 0x89, 0xE7, 0x93, 0x9C, 0xE7, 0x93, 0xA6, 0xE7, + 0x94, 0x98, 0xE7, 0x94, 0x9F, 0xE7, 0x94, 0xA8, + 0xE7, 0x94, 0xB0, 0xE7, 0x96, 0x8B, 0xE7, 0x96, + 0x92, 0xE7, 0x99, 0xB6, 0xE7, 0x99, 0xBD, 0xE7, + 0x9A, 0xAE, 0xE7, 0x9A, 0xBF, 0xE7, 0x9B, 0xAE, + 0xE7, 0x9F, 0x9B, 0xE7, 0x9F, 0xA2, 0xE7, 0x9F, + 0xB3, 0xE7, 0xA4, 0xBA, 0xE7, 0xA6, 0xB8, 0xE7, + 0xA6, 0xBE, 0xE7, 0xA9, 0xB4, 0xE7, 0xAB, 0x8B, + 0xE7, 0xAB, 0xB9, 0xE7, 0xB1, 0xB3, 0xE7, 0xB3, + 0xB8, 0xE7, 0xBC, 0xB6, 0xE7, 0xBD, 0x91, 0xE7, + 0xBE, 0x8A, 0xE7, 0xBE, 0xBD, 0xE8, 0x80, 0x81, + 0xE8, 0x80, 0x8C, 0xE8, 0x80, 0x92, 0xE8, 0x80, + 0xB3, 0xE8, 0x81, 0xBF, 0xE8, 0x82, 0x89, 0xE8, + 0x87, 0xA3, 0xE8, 0x87, 0xAA, 0xE8, 0x87, 0xB3, + 0xE8, 0x87, 0xBC, 0xE8, 0x88, 0x8C, 0xE8, 0x88, + 0x9B, 0xE8, 0x88, 0x9F, 0xE8, 0x89, 0xAE, 0xE8, + 0x89, 0xB2, 0xE8, 0x89, 0xB8, 0xE8, 0x99, 0x8D, + 0xE8, 0x99, 0xAB, 0xE8, 0xA1, 0x80, 0xE8, 0xA1, + 0x8C, 0xE8, 0xA1, 0xA3, 0xE8, 0xA5, 0xBE, 0xE8, + 0xA6, 0x8B, 0xE8, 0xA7, 0x92, 0xE8, 0xA8, 0x80, + 0xE8, 0xB0, 0xB7, 0xE8, 0xB1, 0x86, 0xE8, 0xB1, + 0x95, 0xE8, 0xB1, 0xB8, 0xE8, 0xB2, 0x9D, 0xE8, + 0xB5, 0xA4, 0xE8, 0xB5, 0xB0, 0xE8, 0xB6, 0xB3, + 0xE8, 0xBA, 0xAB, 0xE8, 0xBB, 0x8A, 0xE8, 0xBE, + 0x9B, 0xE8, 0xBE, 0xB0, 0xE8, 0xBE, 0xB5, 0xE9, + 0x82, 0x91, 0xE9, 0x85, 0x89, 0xE9, 0x87, 0x86, + 0xE9, 0x87, 0x8C, 0xE9, 0x87, 0x91, 0xE9, 0x95, + 0xB7, 0xE9, 0x96, 0x80, 0xE9, 0x98, 0x9C, 0xE9, + 0x9A, 0xB6, 0xE9, 0x9A, 0xB9, 0xE9, 0x9B, 0xA8, + 0xE9, 0x9D, 0x91, 0xE9, 0x9D, 0x9E, 0xE9, 0x9D, + 0xA2, 0xE9, 0x9D, 0xA9, 0xE9, 0x9F, 0x8B, 0xE9, + 0x9F, 0xAD, 0xE9, 0x9F, 0xB3, 0xE9, 0xA0, 0x81, + 0xE9, 0xA2, 0xA8, 0xE9, 0xA3, 0x9B, 0xE9, 0xA3, + 0x9F, 0xE9, 0xA6, 0x96, 0xE9, 0xA6, 0x99, 0xE9, + 0xA6, 0xAC, 0xE9, 0xAA, 0xA8, 0xE9, 0xAB, 0x98, + 0xE9, 0xAB, 0x9F, 0xE9, 0xAC, 0xA5, 0xE9, 0xAC, + 0xAF, 0xE9, 0xAC, 0xB2, 0xE9, 0xAC, 0xBC, 0xE9, + 0xAD, 0x9A, 0xE9, 0xB3, 0xA5, 0xE9, 0xB9, 0xB5, + 0xE9, 0xB9, 0xBF, 0xE9, 0xBA, 0xA5, 0xE9, 0xBA, + 0xBB, 0xE9, 0xBB, 0x83, 0xE9, 0xBB, 0x8D, 0xE9, + 0xBB, 0x91, 0xE9, 0xBB, 0xB9, 0xE9, 0xBB, 0xBD, + 0xE9, 0xBC, 0x8E, 0xE9, 0xBC, 0x93, 0xE9, 0xBC, + 0xA0, 0xE9, 0xBC, 0xBB, 0xE9, 0xBD, 0x8A, 0xE9, + 0xBD, 0x92, 0xE9, 0xBE, 0x8D, 0xE9, 0xBE, 0x9C, + 0xE9, 0xBE, 0xA0, 0x20, 0xE3, 0x80, 0x92, 0xE5, + 0x8D, 0x81, 0xE5, 0x8D, 0x84, 0xE5, 0x8D, 0x85, + 0xF6, 0xE3, 0x81, 0x8B, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x8D, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0x8F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0x91, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x93, + 
0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x95, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0x97, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0x99, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0x9B, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x81, 0x9D, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0x9F, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, + 0xA1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA4, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA6, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xA8, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0xAF, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x81, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x81, 0xB2, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, + 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB5, + 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x81, 0xB8, 0xE3, 0x82, + 0x9A, 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x81, 0xBB, 0xE3, 0x82, 0x9A, 0xF6, + 0xE3, 0x81, 0x86, 0xE3, 0x82, 0x99, 0x20, 0xE3, + 0x82, 0x99, 0x20, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, + 0x82, 0x9D, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0x88, + 0xE3, 0x82, 0x8A, 0xF6, 0xE3, 0x82, 0xAB, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x82, 0xB1, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x82, 0xB5, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, + 0xB7, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xB9, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBB, 0xE3, + 0x82, 0x99, 0xF6, 0xE3, 0x82, 0xBD, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x82, 0xBF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0x84, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0x86, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0x88, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x8F, 0xE3, + 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0x92, 0xE3, 0x82, 0x9A, + 0xF6, 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0x95, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0x9A, 0xF6, 0xE3, 0x83, 0x9B, + 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, 0x9B, 0xE3, + 0x82, 0x9A, 0xF6, 0xE3, 0x82, 0xA6, 0xE3, 0x82, + 0x99, 0xF6, 0xE3, 0x83, 0xAF, 0xE3, 0x82, 0x99, + 0xF6, 0xE3, 0x83, 0xB0, 0xE3, 0x82, 0x99, 0xF6, + 0xE3, 0x83, 0xB1, 0xE3, 0x82, 0x99, 0xF6, 0xE3, + 0x83, 0xB2, 0xE3, 0x82, 0x99, 0xF6, 0xE3, 0x83, + 0xBD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xB3, 0xE3, + 0x83, 0x88, 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x81, + 0xE1, 0x86, 0xAA, 0xE1, 0x84, 0x82, 0xE1, 0x86, + 0xAC, 0xE1, 0x86, 0xAD, 0xE1, 0x84, 0x83, 0xE1, + 0x84, 0x84, 0xE1, 0x84, 0x85, 0xE1, 0x86, 0xB0, + 0xE1, 0x86, 0xB1, 0xE1, 0x86, 0xB2, 0xE1, 0x86, + 0xB3, 0xE1, 0x86, 0xB4, 0xE1, 0x86, 0xB5, 0xE1, + 0x84, 0x9A, 0xE1, 0x84, 0x86, 0xE1, 0x84, 0x87, + 0xE1, 0x84, 0x88, 0xE1, 0x84, 0xA1, 0xE1, 0x84, + 0x89, 0xE1, 0x84, 0x8A, 0xE1, 0x84, 0x8B, 0xE1, + 0x84, 0x8C, 0xE1, 0x84, 0x8D, 0xE1, 0x84, 0x8E, + 0xE1, 0x84, 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, + 0x91, 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, + 0x85, 0xA2, 0xE1, 0x85, 0xA3, 0xE1, 0x85, 0xA4, + 0xE1, 0x85, 0xA5, 0xE1, 0x85, 0xA6, 0xE1, 0x85, + 0xA7, 0xE1, 0x85, 0xA8, 0xE1, 0x85, 0xA9, 0xE1, + 0x85, 0xAA, 0xE1, 0x85, 0xAB, 0xE1, 0x85, 0xAC, + 0xE1, 0x85, 0xAD, 0xE1, 0x85, 0xAE, 0xE1, 0x85, + 0xAF, 0xE1, 0x85, 0xB0, 0xE1, 0x85, 0xB1, 0xE1, + 0x85, 0xB2, 0xE1, 0x85, 0xB3, 0xE1, 0x85, 0xB4, + 0xE1, 0x85, 0xB5, 0xE1, 0x85, 0xA0, 0xE1, 0x84, + 0x94, 0xE1, 0x84, 0x95, 0xE1, 0x87, 0x87, 0xE1, + 0x87, 0x88, 0xE1, 0x87, 0x8C, 0xE1, 0x87, 0x8E, + 
0xE1, 0x87, 0x93, 0xE1, 0x87, 0x97, 0xE1, 0x87, + 0x99, 0xE1, 0x84, 0x9C, 0xE1, 0x87, 0x9D, 0xE1, + 0x87, 0x9F, 0xE1, 0x84, 0x9D, 0xE1, 0x84, 0x9E, + 0xE1, 0x84, 0xA0, 0xE1, 0x84, 0xA2, 0xE1, 0x84, + 0xA3, 0xE1, 0x84, 0xA7, 0xE1, 0x84, 0xA9, 0xE1, + 0x84, 0xAB, 0xE1, 0x84, 0xAC, 0xE1, 0x84, 0xAD, + 0xE1, 0x84, 0xAE, 0xE1, 0x84, 0xAF, 0xE1, 0x84, + 0xB2, 0xE1, 0x84, 0xB6, 0xE1, 0x85, 0x80, 0xE1, + 0x85, 0x87, 0xE1, 0x85, 0x8C, 0xE1, 0x87, 0xB1, + 0xE1, 0x87, 0xB2, 0xE1, 0x85, 0x97, 0xE1, 0x85, + 0x98, 0xE1, 0x85, 0x99, 0xE1, 0x86, 0x84, 0xE1, + 0x86, 0x85, 0xE1, 0x86, 0x88, 0xE1, 0x86, 0x91, + 0xE1, 0x86, 0x92, 0xE1, 0x86, 0x94, 0xE1, 0x86, + 0x9E, 0xE1, 0x86, 0xA1, 0xE4, 0xB8, 0x80, 0xE4, + 0xBA, 0x8C, 0xE4, 0xB8, 0x89, 0xE5, 0x9B, 0x9B, + 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, 0xAD, 0xE4, 0xB8, + 0x8B, 0xE7, 0x94, 0xB2, 0xE4, 0xB9, 0x99, 0xE4, + 0xB8, 0x99, 0xE4, 0xB8, 0x81, 0xE5, 0xA4, 0xA9, + 0xE5, 0x9C, 0xB0, 0xE4, 0xBA, 0xBA, 0x28, 0xE1, + 0x84, 0x80, 0x29, 0x28, 0xE1, 0x84, 0x82, 0x29, + 0x28, 0xE1, 0x84, 0x83, 0x29, 0x28, 0xE1, 0x84, + 0x85, 0x29, 0x28, 0xE1, 0x84, 0x86, 0x29, 0x28, + 0xE1, 0x84, 0x87, 0x29, 0x28, 0xE1, 0x84, 0x89, + 0x29, 0x28, 0xE1, 0x84, 0x8B, 0x29, 0x28, 0xE1, + 0x84, 0x8C, 0x29, 0x28, 0xE1, 0x84, 0x8E, 0x29, + 0x28, 0xE1, 0x84, 0x8F, 0x29, 0x28, 0xE1, 0x84, + 0x90, 0x29, 0x28, 0xE1, 0x84, 0x91, 0x29, 0x28, + 0xE1, 0x84, 0x92, 0x29, 0x28, 0xE1, 0x84, 0x80, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x82, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x83, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x85, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x86, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x87, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x89, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8B, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8E, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8F, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x90, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x91, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x92, + 0xE1, 0x85, 0xA1, 0x29, 0x28, 0xE1, 0x84, 0x8C, + 0xE1, 0x85, 0xAE, 0x29, 0x28, 0xE1, 0x84, 0x8B, + 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, 0xE1, 0x85, + 0xA5, 0xE1, 0x86, 0xAB, 0x29, 0x28, 0xE1, 0x84, + 0x8B, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x92, 0xE1, + 0x85, 0xAE, 0x29, 0x28, 0xE4, 0xB8, 0x80, 0x29, + 0x28, 0xE4, 0xBA, 0x8C, 0x29, 0x28, 0xE4, 0xB8, + 0x89, 0x29, 0x28, 0xE5, 0x9B, 0x9B, 0x29, 0x28, + 0xE4, 0xBA, 0x94, 0x29, 0x28, 0xE5, 0x85, 0xAD, + 0x29, 0x28, 0xE4, 0xB8, 0x83, 0x29, 0x28, 0xE5, + 0x85, 0xAB, 0x29, 0x28, 0xE4, 0xB9, 0x9D, 0x29, + 0x28, 0xE5, 0x8D, 0x81, 0x29, 0x28, 0xE6, 0x9C, + 0x88, 0x29, 0x28, 0xE7, 0x81, 0xAB, 0x29, 0x28, + 0xE6, 0xB0, 0xB4, 0x29, 0x28, 0xE6, 0x9C, 0xA8, + 0x29, 0x28, 0xE9, 0x87, 0x91, 0x29, 0x28, 0xE5, + 0x9C, 0x9F, 0x29, 0x28, 0xE6, 0x97, 0xA5, 0x29, + 0x28, 0xE6, 0xA0, 0xAA, 0x29, 0x28, 0xE6, 0x9C, + 0x89, 0x29, 0x28, 0xE7, 0xA4, 0xBE, 0x29, 0x28, + 0xE5, 0x90, 0x8D, 0x29, 0x28, 0xE7, 0x89, 0xB9, + 0x29, 0x28, 0xE8, 0xB2, 0xA1, 0x29, 0x28, 0xE7, + 0xA5, 0x9D, 0x29, 0x28, 0xE5, 0x8A, 0xB4, 0x29, + 0x28, 0xE4, 0xBB, 0xA3, 0x29, 0x28, 0xE5, 0x91, + 0xBC, 0x29, 0x28, 0xE5, 0xAD, 0xA6, 0x29, 0x28, + 0xE7, 0x9B, 0xA3, 0x29, 0x28, 0xE4, 0xBC, 0x81, + 0x29, 0x28, 0xE8, 0xB3, 0x87, 0x29, 0x28, 0xE5, + 0x8D, 0x94, 0x29, 0x28, 0xE7, 0xA5, 0xAD, 0x29, + 0x28, 0xE4, 0xBC, 0x91, 0x29, 0x28, 0xE8, 0x87, + 0xAA, 0x29, 0x28, 0xE8, 0x87, 0xB3, 0x29, 0x50, + 0x54, 0x45, 0x32, 0x31, 0x32, 0x32, 0x32, 0x33, + 0x32, 0x34, 0x32, 0x35, 0x32, 0x36, 0x32, 0x37, + 
0x32, 0x38, 0x32, 0x39, 0x33, 0x30, 0x33, 0x31, + 0x33, 0x32, 0x33, 0x33, 0x33, 0x34, 0x33, 0x35, + 0xE1, 0x84, 0x80, 0xE1, 0x84, 0x82, 0xE1, 0x84, + 0x83, 0xE1, 0x84, 0x85, 0xE1, 0x84, 0x86, 0xE1, + 0x84, 0x87, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8B, + 0xE1, 0x84, 0x8C, 0xE1, 0x84, 0x8E, 0xE1, 0x84, + 0x8F, 0xE1, 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, + 0x84, 0x92, 0xE1, 0x84, 0x80, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x82, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x83, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x85, 0xE1, + 0x85, 0xA1, 0xE1, 0x84, 0x86, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x87, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x89, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x8B, 0xE1, + 0x85, 0xA1, 0xE1, 0x84, 0x8C, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x8F, 0xE1, 0x85, 0xA1, 0xE1, 0x84, 0x90, 0xE1, + 0x85, 0xA1, 0xE1, 0x84, 0x91, 0xE1, 0x85, 0xA1, + 0xE1, 0x84, 0x92, 0xE1, 0x85, 0xA1, 0xE1, 0x84, + 0x8E, 0xE1, 0x85, 0xA1, 0xE1, 0x86, 0xB7, 0xE1, + 0x84, 0x80, 0xE1, 0x85, 0xA9, 0xE1, 0x84, 0x8C, + 0xE1, 0x85, 0xAE, 0xE1, 0x84, 0x8B, 0xE1, 0x85, + 0xB4, 0xE1, 0x84, 0x8B, 0xE1, 0x85, 0xAE, 0xE4, + 0xB8, 0x80, 0xE4, 0xBA, 0x8C, 0xE4, 0xB8, 0x89, + 0xE5, 0x9B, 0x9B, 0xE4, 0xBA, 0x94, 0xE5, 0x85, + 0xAD, 0xE4, 0xB8, 0x83, 0xE5, 0x85, 0xAB, 0xE4, + 0xB9, 0x9D, 0xE5, 0x8D, 0x81, 0xE6, 0x9C, 0x88, + 0xE7, 0x81, 0xAB, 0xE6, 0xB0, 0xB4, 0xE6, 0x9C, + 0xA8, 0xE9, 0x87, 0x91, 0xE5, 0x9C, 0x9F, 0xE6, + 0x97, 0xA5, 0xE6, 0xA0, 0xAA, 0xE6, 0x9C, 0x89, + 0xE7, 0xA4, 0xBE, 0xE5, 0x90, 0x8D, 0xE7, 0x89, + 0xB9, 0xE8, 0xB2, 0xA1, 0xE7, 0xA5, 0x9D, 0xE5, + 0x8A, 0xB4, 0xE7, 0xA7, 0x98, 0xE7, 0x94, 0xB7, + 0xE5, 0xA5, 0xB3, 0xE9, 0x81, 0xA9, 0xE5, 0x84, + 0xAA, 0xE5, 0x8D, 0xB0, 0xE6, 0xB3, 0xA8, 0xE9, + 0xA0, 0x85, 0xE4, 0xBC, 0x91, 0xE5, 0x86, 0x99, + 0xE6, 0xAD, 0xA3, 0xE4, 0xB8, 0x8A, 0xE4, 0xB8, + 0xAD, 0xE4, 0xB8, 0x8B, 0xE5, 0xB7, 0xA6, 0xE5, + 0x8F, 0xB3, 0xE5, 0x8C, 0xBB, 0xE5, 0xAE, 0x97, + 0xE5, 0xAD, 0xA6, 0xE7, 0x9B, 0xA3, 0xE4, 0xBC, + 0x81, 0xE8, 0xB3, 0x87, 0xE5, 0x8D, 0x94, 0xE5, + 0xA4, 0x9C, 0x33, 0x36, 0x33, 0x37, 0x33, 0x38, + 0x33, 0x39, 0x34, 0x30, 0x34, 0x31, 0x34, 0x32, + 0x34, 0x33, 0x34, 0x34, 0x34, 0x35, 0x34, 0x36, + 0x34, 0x37, 0x34, 0x38, 0x34, 0x39, 0x35, 0x30, + 0x31, 0xE6, 0x9C, 0x88, 0x32, 0xE6, 0x9C, 0x88, + 0x33, 0xE6, 0x9C, 0x88, 0x34, 0xE6, 0x9C, 0x88, + 0x35, 0xE6, 0x9C, 0x88, 0x36, 0xE6, 0x9C, 0x88, + 0x37, 0xE6, 0x9C, 0x88, 0x38, 0xE6, 0x9C, 0x88, + 0x39, 0xE6, 0x9C, 0x88, 0x31, 0x30, 0xE6, 0x9C, + 0x88, 0x31, 0x31, 0xE6, 0x9C, 0x88, 0x31, 0x32, + 0xE6, 0x9C, 0x88, 0x48, 0x67, 0x65, 0x72, 0x67, + 0x65, 0x56, 0x4C, 0x54, 0x44, 0xE3, 0x82, 0xA2, + 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, 0x82, + 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xB1, + 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, 0x82, + 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, 0xE3, + 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, 0x81, + 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, 0xE3, + 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, 0x8E, + 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, 0x83, + 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, 0xE3, + 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, 0xA0, + 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, 0x83, + 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xAB, + 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, 0x83, + 0xAF, 0xE3, 0x83, 0xB0, 0xE3, 0x83, 0xB1, 0xE3, + 0x83, 0xB2, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0x8F, + 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 
0x88, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0x95, 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA2, + 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x98, 0xE3, 0x82, + 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, 0xA2, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xA4, + 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xA4, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xA6, + 0xE3, 0x82, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x82, + 0xA8, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAF, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0xA8, 0xE3, 0x83, 0xBC, 0xE3, 0x82, + 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAA, 0xE3, + 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xAA, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xA0, 0xE3, 0x82, + 0xAB, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAA, 0xE3, + 0x82, 0xAB, 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0x83, + 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xAB, 0xE3, 0x83, + 0xAD, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAB, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3, + 0x82, 0xAD, 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0x99, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0x8B, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xAD, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xAA, + 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xBF, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xAD, + 0xE3, 0x83, 0xAD, 0xE3, 0x82, 0xAD, 0xE3, 0x83, + 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xAD, + 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xA1, 0xE3, 0x83, + 0xBC, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, + 0x82, 0xAD, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xAF, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, + 0xAF, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xA9, 0xE3, + 0x83, 0xA0, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xA0, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, + 0x83, 0xAB, 0xE3, 0x82, 0xBB, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xAD, 0xE3, 0x82, + 0xAF, 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0x8D, 0xE3, 0x82, 0xB1, 0xE3, 0x83, 0xBC, + 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xB3, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x8A, 0xE3, 0x82, 0xB3, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A, + 0xE3, 0x82, 0xB5, 0xE3, 0x82, 0xA4, 0xE3, 0x82, + 0xAF, 0xE3, 0x83, 0xAB, 0xE3, 0x82, 0xB5, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x81, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0xA0, 0xE3, 0x82, 0xB7, 0xE3, 0x83, + 0xAA, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xAF, 0xE3, + 0x82, 0x99, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x81, 0xE3, 0x82, 0xBB, 0xE3, 0x83, + 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xBF, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, 0x82, 0xB9, + 0xE3, 0x83, 0x86, 0xE3, 0x82, 0x99, 0xE3, 0x82, + 0xB7, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8E, 0xE3, 0x83, + 0x8E, 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0x8F, 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0x84, + 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, 0xE3, 0x83, + 0xBC, 0xE3, 0x82, 0xBB, 0xE3, 0x83, 0xB3, 0xE3, + 0x83, 0x88, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x9A, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x84, 0xE3, 0x83, + 0x8F, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, + 0x83, 0xAC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, + 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xA2, 0xE3, 0x82, + 0xB9, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0x92, 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xAF, + 
0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, 0xE3, 0x82, + 0x9A, 0xE3, 0x82, 0xB3, 0xE3, 0x83, 0x92, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, + 0xE3, 0x82, 0xA1, 0xE3, 0x83, 0xA9, 0xE3, 0x83, + 0x83, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0x95, 0xE3, 0x82, 0xA3, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x95, 0xE3, 0x82, + 0x99, 0xE3, 0x83, 0x83, 0xE3, 0x82, 0xB7, 0xE3, + 0x82, 0xA7, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x95, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0xAF, 0xE3, 0x82, 0xBF, 0xE3, + 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x98, + 0xE3, 0x82, 0x9A, 0xE3, 0x82, 0xBD, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0x8B, 0xE3, + 0x83, 0x92, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0xAB, + 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x98, 0xE3, 0x82, + 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0xB9, 0xE3, + 0x83, 0x98, 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, + 0xE3, 0x82, 0xB7, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0x98, 0xE3, 0x82, 0x99, 0xE3, 0x83, 0xBC, 0xE3, + 0x82, 0xBF, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x9A, + 0xE3, 0x82, 0xA4, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0x9B, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0x88, 0xE3, 0x83, 0x9B, + 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9B, 0xE3, 0x82, + 0x9A, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x88, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0x9B, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x9B, 0xE3, 0x83, + 0xBC, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0x9E, 0xE3, + 0x82, 0xA4, 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0xAD, + 0xE3, 0x83, 0x9E, 0xE3, 0x82, 0xA4, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x83, 0xE3, + 0x83, 0x8F, 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0xAB, + 0xE3, 0x82, 0xAF, 0xE3, 0x83, 0x9E, 0xE3, 0x83, + 0xB3, 0xE3, 0x82, 0xB7, 0xE3, 0x83, 0xA7, 0xE3, + 0x83, 0xB3, 0xE3, 0x83, 0x9F, 0xE3, 0x82, 0xAF, + 0xE3, 0x83, 0xAD, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0x9F, 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x9F, 0xE3, + 0x83, 0xAA, 0xE3, 0x83, 0x8F, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, 0x83, + 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, 0xE3, + 0x83, 0xA1, 0xE3, 0x82, 0xAB, 0xE3, 0x82, 0x99, + 0xE3, 0x83, 0x88, 0xE3, 0x83, 0xB3, 0xE3, 0x83, + 0xA1, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x88, 0xE3, + 0x83, 0xAB, 0xE3, 0x83, 0xA4, 0xE3, 0x83, 0xBC, + 0xE3, 0x83, 0x88, 0xE3, 0x82, 0x99, 0xE3, 0x83, + 0xA4, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0xAB, 0xE3, + 0x83, 0xA6, 0xE3, 0x82, 0xA2, 0xE3, 0x83, 0xB3, + 0xE3, 0x83, 0xAA, 0xE3, 0x83, 0x83, 0xE3, 0x83, + 0x88, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAA, 0xE3, + 0x83, 0xA9, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0x92, + 0xE3, 0x82, 0x9A, 0xE3, 0x83, 0xBC, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0xBC, 0xE3, 0x83, 0x95, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xAB, 0xE3, 0x83, 0xAC, + 0xE3, 0x83, 0xA0, 0xE3, 0x83, 0xAC, 0xE3, 0x83, + 0xB3, 0xE3, 0x83, 0x88, 0xE3, 0x82, 0xB1, 0xE3, + 0x82, 0x99, 0xE3, 0x83, 0xB3, 0xE3, 0x83, 0xAF, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0x88, 0x30, 0xE7, + 0x82, 0xB9, 0x31, 0xE7, 0x82, 0xB9, 0x32, 0xE7, + 0x82, 0xB9, 0x33, 0xE7, 0x82, 0xB9, 0x34, 0xE7, + 0x82, 0xB9, 0x35, 0xE7, 0x82, 0xB9, 0x36, 0xE7, + 0x82, 0xB9, 0x37, 0xE7, 0x82, 0xB9, 0x38, 0xE7, + 0x82, 0xB9, 0x39, 0xE7, 0x82, 0xB9, 0x31, 0x30, + 0xE7, 0x82, 0xB9, 0x31, 0x31, 0xE7, 0x82, 0xB9, + 0x31, 0x32, 0xE7, 0x82, 0xB9, 0x31, 0x33, 0xE7, + 0x82, 0xB9, 0x31, 0x34, 0xE7, 0x82, 0xB9, 0x31, + 0x35, 0xE7, 0x82, 0xB9, 0x31, 0x36, 0xE7, 0x82, + 0xB9, 0x31, 0x37, 0xE7, 0x82, 0xB9, 0x31, 0x38, + 0xE7, 0x82, 0xB9, 0x31, 0x39, 0xE7, 0x82, 0xB9, + 0x32, 0x30, 0xE7, 0x82, 0xB9, 0x32, 0x31, 0xE7, + 0x82, 0xB9, 0x32, 0x32, 0xE7, 0x82, 0xB9, 0x32, + 
0x33, 0xE7, 0x82, 0xB9, 0x32, 0x34, 0xE7, 0x82, + 0xB9, 0x68, 0x50, 0x61, 0x64, 0x61, 0x41, 0x55, + 0x62, 0x61, 0x72, 0x6F, 0x56, 0x70, 0x63, 0x64, + 0x6D, 0x64, 0x6D, 0x32, 0x64, 0x6D, 0x33, 0x49, + 0x55, 0xE5, 0xB9, 0xB3, 0xE6, 0x88, 0x90, 0xE6, + 0x98, 0xAD, 0xE5, 0x92, 0x8C, 0xE5, 0xA4, 0xA7, + 0xE6, 0xAD, 0xA3, 0xE6, 0x98, 0x8E, 0xE6, 0xB2, + 0xBB, 0xE6, 0xA0, 0xAA, 0xE5, 0xBC, 0x8F, 0xE4, + 0xBC, 0x9A, 0xE7, 0xA4, 0xBE, 0x70, 0x41, 0x6E, + 0x41, 0xCE, 0xBC, 0x41, 0x6D, 0x41, 0x6B, 0x41, + 0x4B, 0x42, 0x4D, 0x42, 0x47, 0x42, 0x63, 0x61, + 0x6C, 0x6B, 0x63, 0x61, 0x6C, 0x70, 0x46, 0x6E, + 0x46, 0xCE, 0xBC, 0x46, 0xCE, 0xBC, 0x67, 0x6D, + 0x67, 0x6B, 0x67, 0x48, 0x7A, 0x6B, 0x48, 0x7A, + 0x4D, 0x48, 0x7A, 0x47, 0x48, 0x7A, 0x54, 0x48, + 0x7A, 0xCE, 0xBC, 0x6C, 0x6D, 0x6C, 0x64, 0x6C, + 0x6B, 0x6C, 0x66, 0x6D, 0x6E, 0x6D, 0xCE, 0xBC, + 0x6D, 0x6D, 0x6D, 0x63, 0x6D, 0x6B, 0x6D, 0x6D, + 0x6D, 0x32, 0x63, 0x6D, 0x32, 0x6D, 0x32, 0x6B, + 0x6D, 0x32, 0x6D, 0x6D, 0x33, 0x63, 0x6D, 0x33, + 0x6D, 0x33, 0x6B, 0x6D, 0x33, 0x6D, 0xE2, 0x88, + 0x95, 0x73, 0x6D, 0xE2, 0x88, 0x95, 0x73, 0x32, + 0x50, 0x61, 0x6B, 0x50, 0x61, 0x4D, 0x50, 0x61, + 0x47, 0x50, 0x61, 0x72, 0x61, 0x64, 0x72, 0x61, + 0x64, 0xE2, 0x88, 0x95, 0x73, 0x72, 0x61, 0x64, + 0xE2, 0x88, 0x95, 0x73, 0x32, 0x70, 0x73, 0x6E, + 0x73, 0xCE, 0xBC, 0x73, 0x6D, 0x73, 0x70, 0x56, + 0x6E, 0x56, 0xCE, 0xBC, 0x56, 0x6D, 0x56, 0x6B, + 0x56, 0x4D, 0x56, 0x70, 0x57, 0x6E, 0x57, 0xCE, + 0xBC, 0x57, 0x6D, 0x57, 0x6B, 0x57, 0x4D, 0x57, + 0x6B, 0xCE, 0xA9, 0x4D, 0xCE, 0xA9, 0x61, 0x2E, + 0x6D, 0x2E, 0x42, 0x71, 0x63, 0x63, 0x63, 0x64, + 0x43, 0xE2, 0x88, 0x95, 0x6B, 0x67, 0x43, 0x6F, + 0x2E, 0x64, 0x42, 0x47, 0x79, 0x68, 0x61, 0x48, + 0x50, 0x69, 0x6E, 0x4B, 0x4B, 0x4B, 0x4D, 0x6B, + 0x74, 0x6C, 0x6D, 0x6C, 0x6E, 0x6C, 0x6F, 0x67, + 0x6C, 0x78, 0x6D, 0x62, 0x6D, 0x69, 0x6C, 0x6D, + 0x6F, 0x6C, 0x50, 0x48, 0x70, 0x2E, 0x6D, 0x2E, + 0x50, 0x50, 0x4D, 0x50, 0x52, 0x73, 0x72, 0x53, + 0x76, 0x57, 0x62, 0x56, 0xE2, 0x88, 0x95, 0x6D, + 0x41, 0xE2, 0x88, 0x95, 0x6D, 0x31, 0xE6, 0x97, + 0xA5, 0x32, 0xE6, 0x97, 0xA5, 0x33, 0xE6, 0x97, + 0xA5, 0x34, 0xE6, 0x97, 0xA5, 0x35, 0xE6, 0x97, + 0xA5, 0x36, 0xE6, 0x97, 0xA5, 0x37, 0xE6, 0x97, + 0xA5, 0x38, 0xE6, 0x97, 0xA5, 0x39, 0xE6, 0x97, + 0xA5, 0x31, 0x30, 0xE6, 0x97, 0xA5, 0x31, 0x31, + 0xE6, 0x97, 0xA5, 0x31, 0x32, 0xE6, 0x97, 0xA5, + 0x31, 0x33, 0xE6, 0x97, 0xA5, 0x31, 0x34, 0xE6, + 0x97, 0xA5, 0x31, 0x35, 0xE6, 0x97, 0xA5, 0x31, + 0x36, 0xE6, 0x97, 0xA5, 0x31, 0x37, 0xE6, 0x97, + 0xA5, 0x31, 0x38, 0xE6, 0x97, 0xA5, 0x31, 0x39, + 0xE6, 0x97, 0xA5, 0x32, 0x30, 0xE6, 0x97, 0xA5, + 0x32, 0x31, 0xE6, 0x97, 0xA5, 0x32, 0x32, 0xE6, + 0x97, 0xA5, 0x32, 0x33, 0xE6, 0x97, 0xA5, 0x32, + 0x34, 0xE6, 0x97, 0xA5, 0x32, 0x35, 0xE6, 0x97, + 0xA5, 0x32, 0x36, 0xE6, 0x97, 0xA5, 0x32, 0x37, + 0xE6, 0x97, 0xA5, 0x32, 0x38, 0xE6, 0x97, 0xA5, + 0x32, 0x39, 0xE6, 0x97, 0xA5, 0x33, 0x30, 0xE6, + 0x97, 0xA5, 0x33, 0x31, 0xE6, 0x97, 0xA5, 0x67, + 0x61, 0x6C, 0xF6, 0xE8, 0xB1, 0x88, 0xF6, 0xE6, + 0x9B, 0xB4, 0xF6, 0xE8, 0xBB, 0x8A, 0xF6, 0xE8, + 0xB3, 0x88, 0xF6, 0xE6, 0xBB, 0x91, 0xF6, 0xE4, + 0xB8, 0xB2, 0xF6, 0xE5, 0x8F, 0xA5, 0xF6, 0xE9, + 0xBE, 0x9C, 0xF6, 0xE9, 0xBE, 0x9C, 0xF6, 0xE5, + 0xA5, 0x91, 0xF6, 0xE9, 0x87, 0x91, 0xF6, 0xE5, + 0x96, 0x87, 0xF6, 0xE5, 0xA5, 0x88, 0xF6, 0xE6, + 0x87, 0xB6, 0xF6, 0xE7, 0x99, 0xA9, 0xF6, 0xE7, + 0xBE, 0x85, 0xF6, 0xE8, 0x98, 0xBF, 0xF6, 0xE8, + 0x9E, 0xBA, 0xF6, 0xE8, 0xA3, 0xB8, 0xF6, 0xE9, + 0x82, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE6, + 0xB4, 0x9B, 0xF6, 0xE7, 0x83, 0x99, 0xF6, 0xE7, + 
0x8F, 0x9E, 0xF6, 0xE8, 0x90, 0xBD, 0xF6, 0xE9, + 0x85, 0xAA, 0xF6, 0xE9, 0xA7, 0xB1, 0xF6, 0xE4, + 0xBA, 0x82, 0xF6, 0xE5, 0x8D, 0xB5, 0xF6, 0xE6, + 0xAC, 0x84, 0xF6, 0xE7, 0x88, 0x9B, 0xF6, 0xE8, + 0x98, 0xAD, 0xF6, 0xE9, 0xB8, 0x9E, 0xF6, 0xE5, + 0xB5, 0x90, 0xF6, 0xE6, 0xBF, 0xAB, 0xF6, 0xE8, + 0x97, 0x8D, 0xF6, 0xE8, 0xA5, 0xA4, 0xF6, 0xE6, + 0x8B, 0x89, 0xF6, 0xE8, 0x87, 0x98, 0xF6, 0xE8, + 0xA0, 0x9F, 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xE6, + 0x9C, 0x97, 0xF6, 0xE6, 0xB5, 0xAA, 0xF6, 0xE7, + 0x8B, 0xBC, 0xF6, 0xE9, 0x83, 0x8E, 0xF6, 0xE4, + 0xBE, 0x86, 0xF6, 0xE5, 0x86, 0xB7, 0xF6, 0xE5, + 0x8B, 0x9E, 0xF6, 0xE6, 0x93, 0x84, 0xF6, 0xE6, + 0xAB, 0x93, 0xF6, 0xE7, 0x88, 0x90, 0xF6, 0xE7, + 0x9B, 0xA7, 0xF6, 0xE8, 0x80, 0x81, 0xF6, 0xE8, + 0x98, 0x86, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, 0xE8, + 0xB7, 0xAF, 0xF6, 0xE9, 0x9C, 0xB2, 0xF6, 0xE9, + 0xAD, 0xAF, 0xF6, 0xE9, 0xB7, 0xBA, 0xF6, 0xE7, + 0xA2, 0x8C, 0xF6, 0xE7, 0xA5, 0xBF, 0xF6, 0xE7, + 0xB6, 0xA0, 0xF6, 0xE8, 0x8F, 0x89, 0xF6, 0xE9, + 0x8C, 0x84, 0xF6, 0xE9, 0xB9, 0xBF, 0xF6, 0xE8, + 0xAB, 0x96, 0xF6, 0xE5, 0xA3, 0x9F, 0xF6, 0xE5, + 0xBC, 0x84, 0xF6, 0xE7, 0xB1, 0xA0, 0xF6, 0xE8, + 0x81, 0xBE, 0xF6, 0xE7, 0x89, 0xA2, 0xF6, 0xE7, + 0xA3, 0x8A, 0xF6, 0xE8, 0xB3, 0x82, 0xF6, 0xE9, + 0x9B, 0xB7, 0xF6, 0xE5, 0xA3, 0x98, 0xF6, 0xE5, + 0xB1, 0xA2, 0xF6, 0xE6, 0xA8, 0x93, 0xF6, 0xE6, + 0xB7, 0x9A, 0xF6, 0xE6, 0xBC, 0x8F, 0xF6, 0xE7, + 0xB4, 0xAF, 0xF6, 0xE7, 0xB8, 0xB7, 0xF6, 0xE9, + 0x99, 0x8B, 0xF6, 0xE5, 0x8B, 0x92, 0xF6, 0xE8, + 0x82, 0x8B, 0xF6, 0xE5, 0x87, 0x9C, 0xF6, 0xE5, + 0x87, 0x8C, 0xF6, 0xE7, 0xA8, 0x9C, 0xF6, 0xE7, + 0xB6, 0xBE, 0xF6, 0xE8, 0x8F, 0xB1, 0xF6, 0xE9, + 0x99, 0xB5, 0xF6, 0xE8, 0xAE, 0x80, 0xF6, 0xE6, + 0x8B, 0x8F, 0xF6, 0xE6, 0xA8, 0x82, 0xF6, 0xE8, + 0xAB, 0xBE, 0xF6, 0xE4, 0xB8, 0xB9, 0xF6, 0xE5, + 0xAF, 0xA7, 0xF6, 0xE6, 0x80, 0x92, 0xF6, 0xE7, + 0x8E, 0x87, 0xF6, 0xE7, 0x95, 0xB0, 0xF6, 0xE5, + 0x8C, 0x97, 0xF6, 0xE7, 0xA3, 0xBB, 0xF6, 0xE4, + 0xBE, 0xBF, 0xF6, 0xE5, 0xBE, 0xA9, 0xF6, 0xE4, + 0xB8, 0x8D, 0xF6, 0xE6, 0xB3, 0x8C, 0xF6, 0xE6, + 0x95, 0xB8, 0xF6, 0xE7, 0xB4, 0xA2, 0xF6, 0xE5, + 0x8F, 0x83, 0xF6, 0xE5, 0xA1, 0x9E, 0xF6, 0xE7, + 0x9C, 0x81, 0xF6, 0xE8, 0x91, 0x89, 0xF6, 0xE8, + 0xAA, 0xAA, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE8, + 0xBE, 0xB0, 0xF6, 0xE6, 0xB2, 0x88, 0xF6, 0xE6, + 0x8B, 0xBE, 0xF6, 0xE8, 0x8B, 0xA5, 0xF6, 0xE6, + 0x8E, 0xA0, 0xF6, 0xE7, 0x95, 0xA5, 0xF6, 0xE4, + 0xBA, 0xAE, 0xF6, 0xE5, 0x85, 0xA9, 0xF6, 0xE5, + 0x87, 0x89, 0xF6, 0xE6, 0xA2, 0x81, 0xF6, 0xE7, + 0xB3, 0xA7, 0xF6, 0xE8, 0x89, 0xAF, 0xF6, 0xE8, + 0xAB, 0x92, 0xF6, 0xE9, 0x87, 0x8F, 0xF6, 0xE5, + 0x8B, 0xB5, 0xF6, 0xE5, 0x91, 0x82, 0xF6, 0xE5, + 0xA5, 0xB3, 0xF6, 0xE5, 0xBB, 0xAC, 0xF6, 0xE6, + 0x97, 0x85, 0xF6, 0xE6, 0xBF, 0xBE, 0xF6, 0xE7, + 0xA4, 0xAA, 0xF6, 0xE9, 0x96, 0xAD, 0xF6, 0xE9, + 0xA9, 0xAA, 0xF6, 0xE9, 0xBA, 0x97, 0xF6, 0xE9, + 0xBB, 0x8E, 0xF6, 0xE5, 0x8A, 0x9B, 0xF6, 0xE6, + 0x9B, 0x86, 0xF6, 0xE6, 0xAD, 0xB7, 0xF6, 0xE8, + 0xBD, 0xA2, 0xF6, 0xE5, 0xB9, 0xB4, 0xF6, 0xE6, + 0x86, 0x90, 0xF6, 0xE6, 0x88, 0x80, 0xF6, 0xE6, + 0x92, 0x9A, 0xF6, 0xE6, 0xBC, 0xA3, 0xF6, 0xE7, + 0x85, 0x89, 0xF6, 0xE7, 0x92, 0x89, 0xF6, 0xE7, + 0xA7, 0x8A, 0xF6, 0xE7, 0xB7, 0xB4, 0xF6, 0xE8, + 0x81, 0xAF, 0xF6, 0xE8, 0xBC, 0xA6, 0xF6, 0xE8, + 0x93, 0xAE, 0xF6, 0xE9, 0x80, 0xA3, 0xF6, 0xE9, + 0x8D, 0x8A, 0xF6, 0xE5, 0x88, 0x97, 0xF6, 0xE5, + 0x8A, 0xA3, 0xF6, 0xE5, 0x92, 0xBD, 0xF6, 0xE7, + 0x83, 0x88, 0xF6, 0xE8, 0xA3, 0x82, 0xF6, 0xE8, + 0xAA, 0xAA, 0xF6, 0xE5, 0xBB, 0x89, 0xF6, 0xE5, + 0xBF, 0xB5, 0xF6, 0xE6, 0x8D, 0xBB, 0xF6, 0xE6, + 
0xAE, 0xAE, 0xF6, 0xE7, 0xB0, 0xBE, 0xF6, 0xE7, + 0x8D, 0xB5, 0xF6, 0xE4, 0xBB, 0xA4, 0xF6, 0xE5, + 0x9B, 0xB9, 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, + 0xB6, 0xBA, 0xF6, 0xE6, 0x80, 0x9C, 0xF6, 0xE7, + 0x8E, 0xB2, 0xF6, 0xE7, 0x91, 0xA9, 0xF6, 0xE7, + 0xBE, 0x9A, 0xF6, 0xE8, 0x81, 0x86, 0xF6, 0xE9, + 0x88, 0xB4, 0xF6, 0xE9, 0x9B, 0xB6, 0xF6, 0xE9, + 0x9D, 0x88, 0xF6, 0xE9, 0xA0, 0x98, 0xF6, 0xE4, + 0xBE, 0x8B, 0xF6, 0xE7, 0xA6, 0xAE, 0xF6, 0xE9, + 0x86, 0xB4, 0xF6, 0xE9, 0x9A, 0xB8, 0xF6, 0xE6, + 0x83, 0xA1, 0xF6, 0xE4, 0xBA, 0x86, 0xF6, 0xE5, + 0x83, 0x9A, 0xF6, 0xE5, 0xAF, 0xAE, 0xF6, 0xE5, + 0xB0, 0xBF, 0xF6, 0xE6, 0x96, 0x99, 0xF6, 0xE6, + 0xA8, 0x82, 0xF6, 0xE7, 0x87, 0x8E, 0xF6, 0xE7, + 0x99, 0x82, 0xF6, 0xE8, 0x93, 0xBC, 0xF6, 0xE9, + 0x81, 0xBC, 0xF6, 0xE9, 0xBE, 0x8D, 0xF6, 0xE6, + 0x9A, 0x88, 0xF6, 0xE9, 0x98, 0xAE, 0xF6, 0xE5, + 0x8A, 0x89, 0xF6, 0xE6, 0x9D, 0xBB, 0xF6, 0xE6, + 0x9F, 0xB3, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, + 0xBA, 0x9C, 0xF6, 0xE7, 0x90, 0x89, 0xF6, 0xE7, + 0x95, 0x99, 0xF6, 0xE7, 0xA1, 0xAB, 0xF6, 0xE7, + 0xB4, 0x90, 0xF6, 0xE9, 0xA1, 0x9E, 0xF6, 0xE5, + 0x85, 0xAD, 0xF6, 0xE6, 0x88, 0xAE, 0xF6, 0xE9, + 0x99, 0xB8, 0xF6, 0xE5, 0x80, 0xAB, 0xF6, 0xE5, + 0xB4, 0x99, 0xF6, 0xE6, 0xB7, 0xAA, 0xF6, 0xE8, + 0xBC, 0xAA, 0xF6, 0xE5, 0xBE, 0x8B, 0xF6, 0xE6, + 0x85, 0x84, 0xF6, 0xE6, 0xA0, 0x97, 0xF6, 0xE7, + 0x8E, 0x87, 0xF6, 0xE9, 0x9A, 0x86, 0xF6, 0xE5, + 0x88, 0xA9, 0xF6, 0xE5, 0x90, 0x8F, 0xF6, 0xE5, + 0xB1, 0xA5, 0xF6, 0xE6, 0x98, 0x93, 0xF6, 0xE6, + 0x9D, 0x8E, 0xF6, 0xE6, 0xA2, 0xA8, 0xF6, 0xE6, + 0xB3, 0xA5, 0xF6, 0xE7, 0x90, 0x86, 0xF6, 0xE7, + 0x97, 0xA2, 0xF6, 0xE7, 0xBD, 0xB9, 0xF6, 0xE8, + 0xA3, 0x8F, 0xF6, 0xE8, 0xA3, 0xA1, 0xF6, 0xE9, + 0x87, 0x8C, 0xF6, 0xE9, 0x9B, 0xA2, 0xF6, 0xE5, + 0x8C, 0xBF, 0xF6, 0xE6, 0xBA, 0xBA, 0xF6, 0xE5, + 0x90, 0x9D, 0xF6, 0xE7, 0x87, 0x90, 0xF6, 0xE7, + 0x92, 0x98, 0xF6, 0xE8, 0x97, 0xBA, 0xF6, 0xE9, + 0x9A, 0xA3, 0xF6, 0xE9, 0xB1, 0x97, 0xF6, 0xE9, + 0xBA, 0x9F, 0xF6, 0xE6, 0x9E, 0x97, 0xF6, 0xE6, + 0xB7, 0x8B, 0xF6, 0xE8, 0x87, 0xA8, 0xF6, 0xE7, + 0xAB, 0x8B, 0xF6, 0xE7, 0xAC, 0xA0, 0xF6, 0xE7, + 0xB2, 0x92, 0xF6, 0xE7, 0x8B, 0x80, 0xF6, 0xE7, + 0x82, 0x99, 0xF6, 0xE8, 0xAD, 0x98, 0xF6, 0xE4, + 0xBB, 0x80, 0xF6, 0xE8, 0x8C, 0xB6, 0xF6, 0xE5, + 0x88, 0xBA, 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, + 0xBA, 0xA6, 0xF6, 0xE6, 0x8B, 0x93, 0xF6, 0xE7, + 0xB3, 0x96, 0xF6, 0xE5, 0xAE, 0x85, 0xF6, 0xE6, + 0xB4, 0x9E, 0xF6, 0xE6, 0x9A, 0xB4, 0xF6, 0xE8, + 0xBC, 0xBB, 0xF6, 0xE8, 0xA1, 0x8C, 0xF6, 0xE9, + 0x99, 0x8D, 0xF6, 0xE8, 0xA6, 0x8B, 0xF6, 0xE5, + 0xBB, 0x93, 0xF6, 0xE5, 0x85, 0x80, 0xF6, 0xE5, + 0x97, 0x80, 0xF6, 0xE5, 0xA1, 0x9A, 0xF6, 0xE6, + 0x99, 0xB4, 0xF6, 0xE5, 0x87, 0x9E, 0xF6, 0xE7, + 0x8C, 0xAA, 0xF6, 0xE7, 0x9B, 0x8A, 0xF6, 0xE7, + 0xA4, 0xBC, 0xF6, 0xE7, 0xA5, 0x9E, 0xF6, 0xE7, + 0xA5, 0xA5, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE9, + 0x9D, 0x96, 0xF6, 0xE7, 0xB2, 0xBE, 0xF6, 0xE7, + 0xBE, 0xBD, 0xF6, 0xE8, 0x98, 0x92, 0xF6, 0xE8, + 0xAB, 0xB8, 0xF6, 0xE9, 0x80, 0xB8, 0xF6, 0xE9, + 0x83, 0xBD, 0xF6, 0xE9, 0xA3, 0xAF, 0xF6, 0xE9, + 0xA3, 0xBC, 0xF6, 0xE9, 0xA4, 0xA8, 0xF6, 0xE9, + 0xB6, 0xB4, 0xF6, 0xE4, 0xBE, 0xAE, 0xF6, 0xE5, + 0x83, 0xA7, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, 0xE5, + 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, + 0x8D, 0x91, 0xF6, 0xE5, 0x96, 0x9D, 0xF6, 0xE5, + 0x98, 0x86, 0xF6, 0xE5, 0x99, 0xA8, 0xF6, 0xE5, + 0xA1, 0x80, 0xF6, 0xE5, 0xA2, 0xA8, 0xF6, 0xE5, + 0xB1, 0xA4, 0xF6, 0xE5, 0xB1, 0xAE, 0xF6, 0xE6, + 0x82, 0x94, 0xF6, 0xE6, 0x85, 0xA8, 0xF6, 0xE6, + 0x86, 0x8E, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6, + 
0x95, 0x8F, 0xF6, 0xE6, 0x97, 0xA2, 0xF6, 0xE6, + 0x9A, 0x91, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xE6, + 0xB5, 0xB7, 0xF6, 0xE6, 0xB8, 0x9A, 0xF6, 0xE6, + 0xBC, 0xA2, 0xF6, 0xE7, 0x85, 0xAE, 0xF6, 0xE7, + 0x88, 0xAB, 0xF6, 0xE7, 0x90, 0xA2, 0xF6, 0xE7, + 0xA2, 0x91, 0xF6, 0xE7, 0xA4, 0xBE, 0xF6, 0xE7, + 0xA5, 0x89, 0xF6, 0xE7, 0xA5, 0x88, 0xF6, 0xE7, + 0xA5, 0x90, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, 0xE7, + 0xA5, 0x9D, 0xF6, 0xE7, 0xA6, 0x8D, 0xF6, 0xE7, + 0xA6, 0x8E, 0xF6, 0xE7, 0xA9, 0x80, 0xF6, 0xE7, + 0xAA, 0x81, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7, + 0xB7, 0xB4, 0xF6, 0xE7, 0xB8, 0x89, 0xF6, 0xE7, + 0xB9, 0x81, 0xF6, 0xE7, 0xBD, 0xB2, 0xF6, 0xE8, + 0x80, 0x85, 0xF6, 0xE8, 0x87, 0xAD, 0xF6, 0xE8, + 0x89, 0xB9, 0xF6, 0xE8, 0x89, 0xB9, 0xF6, 0xE8, + 0x91, 0x97, 0xF6, 0xE8, 0xA4, 0x90, 0xF6, 0xE8, + 0xA6, 0x96, 0xF6, 0xE8, 0xAC, 0x81, 0xF6, 0xE8, + 0xAC, 0xB9, 0xF6, 0xE8, 0xB3, 0x93, 0xF6, 0xE8, + 0xB4, 0x88, 0xF6, 0xE8, 0xBE, 0xB6, 0xF6, 0xE9, + 0x80, 0xB8, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, + 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0xBB, 0xF6, 0xE4, + 0xB8, 0xA6, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xE5, + 0x85, 0xA8, 0xF6, 0xE4, 0xBE, 0x80, 0xF6, 0xE5, + 0x85, 0x85, 0xF6, 0xE5, 0x86, 0x80, 0xF6, 0xE5, + 0x8B, 0x87, 0xF6, 0xE5, 0x8B, 0xBA, 0xF6, 0xE5, + 0x96, 0x9D, 0xF6, 0xE5, 0x95, 0x95, 0xF6, 0xE5, + 0x96, 0x99, 0xF6, 0xE5, 0x97, 0xA2, 0xF6, 0xE5, + 0xA1, 0x9A, 0xF6, 0xE5, 0xA2, 0xB3, 0xF6, 0xE5, + 0xA5, 0x84, 0xF6, 0xE5, 0xA5, 0x94, 0xF6, 0xE5, + 0xA9, 0xA2, 0xF6, 0xE5, 0xAC, 0xA8, 0xF6, 0xE5, + 0xBB, 0x92, 0xF6, 0xE5, 0xBB, 0x99, 0xF6, 0xE5, + 0xBD, 0xA9, 0xF6, 0xE5, 0xBE, 0xAD, 0xF6, 0xE6, + 0x83, 0x98, 0xF6, 0xE6, 0x85, 0x8E, 0xF6, 0xE6, + 0x84, 0x88, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, + 0x85, 0xA0, 0xF6, 0xE6, 0x87, 0xB2, 0xF6, 0xE6, + 0x88, 0xB4, 0xF6, 0xE6, 0x8F, 0x84, 0xF6, 0xE6, + 0x90, 0x9C, 0xF6, 0xE6, 0x91, 0x92, 0xF6, 0xE6, + 0x95, 0x96, 0xF6, 0xE6, 0x99, 0xB4, 0xF6, 0xE6, + 0x9C, 0x97, 0xF6, 0xE6, 0x9C, 0x9B, 0xF6, 0xE6, + 0x9D, 0x96, 0xF6, 0xE6, 0xAD, 0xB9, 0xF6, 0xE6, + 0xAE, 0xBA, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, + 0xBB, 0x9B, 0xF6, 0xE6, 0xBB, 0x8B, 0xF6, 0xE6, + 0xBC, 0xA2, 0xF6, 0xE7, 0x80, 0x9E, 0xF6, 0xE7, + 0x85, 0xAE, 0xF6, 0xE7, 0x9E, 0xA7, 0xF6, 0xE7, + 0x88, 0xB5, 0xF6, 0xE7, 0x8A, 0xAF, 0xF6, 0xE7, + 0x8C, 0xAA, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, 0xE7, + 0x94, 0x86, 0xF6, 0xE7, 0x94, 0xBB, 0xF6, 0xE7, + 0x98, 0x9D, 0xF6, 0xE7, 0x98, 0x9F, 0xF6, 0xE7, + 0x9B, 0x8A, 0xF6, 0xE7, 0x9B, 0x9B, 0xF6, 0xE7, + 0x9B, 0xB4, 0xF6, 0xE7, 0x9D, 0x8A, 0xF6, 0xE7, + 0x9D, 0x80, 0xF6, 0xE7, 0xA3, 0x8C, 0xF6, 0xE7, + 0xAA, 0xB1, 0xF6, 0xE7, 0xAF, 0x80, 0xF6, 0xE7, + 0xB1, 0xBB, 0xF6, 0xE7, 0xB5, 0x9B, 0xF6, 0xE7, + 0xB7, 0xB4, 0xF6, 0xE7, 0xBC, 0xBE, 0xF6, 0xE8, + 0x80, 0x85, 0xF6, 0xE8, 0x8D, 0x92, 0xF6, 0xE8, + 0x8F, 0xAF, 0xF6, 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, + 0xA5, 0x81, 0xF6, 0xE8, 0xA6, 0x86, 0xF6, 0xE8, + 0xA6, 0x96, 0xF6, 0xE8, 0xAA, 0xBF, 0xF6, 0xE8, + 0xAB, 0xB8, 0xF6, 0xE8, 0xAB, 0x8B, 0xF6, 0xE8, + 0xAC, 0x81, 0xF6, 0xE8, 0xAB, 0xBE, 0xF6, 0xE8, + 0xAB, 0xAD, 0xF6, 0xE8, 0xAC, 0xB9, 0xF6, 0xE8, + 0xAE, 0x8A, 0xF6, 0xE8, 0xB4, 0x88, 0xF6, 0xE8, + 0xBC, 0xB8, 0xF6, 0xE9, 0x81, 0xB2, 0xF6, 0xE9, + 0x86, 0x99, 0xF6, 0xE9, 0x89, 0xB6, 0xF6, 0xE9, + 0x99, 0xBC, 0xF6, 0xE9, 0x9B, 0xA3, 0xF6, 0xE9, + 0x9D, 0x96, 0xF6, 0xE9, 0x9F, 0x9B, 0xF6, 0xE9, + 0x9F, 0xBF, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, + 0xA0, 0xBB, 0xF6, 0xE9, 0xAC, 0x92, 0xF6, 0xE9, + 0xBE, 0x9C, 0xF6, 0xF0, 0xA2, 0xA1, 0x8A, 0xF6, + 0xF0, 0xA2, 0xA1, 0x84, 0xF6, 0xF0, 0xA3, 0x8F, + 0x95, 0xF6, 0xE3, 0xAE, 0x9D, 0xF6, 0xE4, 0x80, + 
0x98, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xF0, 0xA5, + 0x89, 0x89, 0xF6, 0xF0, 0xA5, 0xB3, 0x90, 0xF6, + 0xF0, 0xA7, 0xBB, 0x93, 0xF6, 0xE9, 0xBD, 0x83, + 0xF6, 0xE9, 0xBE, 0x8E, 0x66, 0x66, 0x66, 0x69, + 0x66, 0x6C, 0x66, 0x66, 0x69, 0x66, 0x66, 0x6C, + 0x73, 0x74, 0x73, 0x74, 0xD5, 0xB4, 0xD5, 0xB6, + 0xD5, 0xB4, 0xD5, 0xA5, 0xD5, 0xB4, 0xD5, 0xAB, + 0xD5, 0xBE, 0xD5, 0xB6, 0xD5, 0xB4, 0xD5, 0xAD, + 0xF6, 0xD7, 0x99, 0xD6, 0xB4, 0xF6, 0xD7, 0xB2, + 0xD6, 0xB7, 0xD7, 0xA2, 0xD7, 0x90, 0xD7, 0x93, + 0xD7, 0x94, 0xD7, 0x9B, 0xD7, 0x9C, 0xD7, 0x9D, + 0xD7, 0xA8, 0xD7, 0xAA, 0x2B, 0xF6, 0xD7, 0xA9, + 0xD7, 0x81, 0xF6, 0xD7, 0xA9, 0xD7, 0x82, 0xF6, + 0xD7, 0xA9, 0xD6, 0xBC, 0xD7, 0x81, 0xF6, 0xD7, + 0xA9, 0xD6, 0xBC, 0xD7, 0x82, 0xF6, 0xD7, 0x90, + 0xD6, 0xB7, 0xF6, 0xD7, 0x90, 0xD6, 0xB8, 0xF6, + 0xD7, 0x90, 0xD6, 0xBC, 0xF6, 0xD7, 0x91, 0xD6, + 0xBC, 0xF6, 0xD7, 0x92, 0xD6, 0xBC, 0xF6, 0xD7, + 0x93, 0xD6, 0xBC, 0xF6, 0xD7, 0x94, 0xD6, 0xBC, + 0xF6, 0xD7, 0x95, 0xD6, 0xBC, 0xF6, 0xD7, 0x96, + 0xD6, 0xBC, 0xF6, 0xD7, 0x98, 0xD6, 0xBC, 0xF6, + 0xD7, 0x99, 0xD6, 0xBC, 0xF6, 0xD7, 0x9A, 0xD6, + 0xBC, 0xF6, 0xD7, 0x9B, 0xD6, 0xBC, 0xF6, 0xD7, + 0x9C, 0xD6, 0xBC, 0xF6, 0xD7, 0x9E, 0xD6, 0xBC, + 0xF6, 0xD7, 0xA0, 0xD6, 0xBC, 0xF6, 0xD7, 0xA1, + 0xD6, 0xBC, 0xF6, 0xD7, 0xA3, 0xD6, 0xBC, 0xF6, + 0xD7, 0xA4, 0xD6, 0xBC, 0xF6, 0xD7, 0xA6, 0xD6, + 0xBC, 0xF6, 0xD7, 0xA7, 0xD6, 0xBC, 0xF6, 0xD7, + 0xA8, 0xD6, 0xBC, 0xF6, 0xD7, 0xA9, 0xD6, 0xBC, + 0xF6, 0xD7, 0xAA, 0xD6, 0xBC, 0xF6, 0xD7, 0x95, + 0xD6, 0xB9, 0xF6, 0xD7, 0x91, 0xD6, 0xBF, 0xF6, + 0xD7, 0x9B, 0xD6, 0xBF, 0xF6, 0xD7, 0xA4, 0xD6, + 0xBF, 0xD7, 0x90, 0xD7, 0x9C, 0xD9, 0xB1, 0xD9, + 0xB1, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, 0xBB, 0xD9, + 0xBB, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, 0xBE, 0xD9, + 0xBE, 0xDA, 0x80, 0xDA, 0x80, 0xDA, 0x80, 0xDA, + 0x80, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, 0xBA, 0xD9, + 0xBA, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, 0xBF, 0xD9, + 0xBF, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, 0xB9, 0xD9, + 0xB9, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, 0xA4, 0xDA, + 0xA4, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, 0xA6, 0xDA, + 0xA6, 0xDA, 0x84, 0xDA, 0x84, 0xDA, 0x84, 0xDA, + 0x84, 0xDA, 0x83, 0xDA, 0x83, 0xDA, 0x83, 0xDA, + 0x83, 0xDA, 0x86, 0xDA, 0x86, 0xDA, 0x86, 0xDA, + 0x86, 0xDA, 0x87, 0xDA, 0x87, 0xDA, 0x87, 0xDA, + 0x87, 0xDA, 0x8D, 0xDA, 0x8D, 0xDA, 0x8C, 0xDA, + 0x8C, 0xDA, 0x8E, 0xDA, 0x8E, 0xDA, 0x88, 0xDA, + 0x88, 0xDA, 0x98, 0xDA, 0x98, 0xDA, 0x91, 0xDA, + 0x91, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, 0xA9, 0xDA, + 0xA9, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, 0xAF, 0xDA, + 0xAF, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, 0xB3, 0xDA, + 0xB3, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, 0xB1, 0xDA, + 0xB1, 0xDA, 0xBA, 0xDA, 0xBA, 0xDA, 0xBB, 0xDA, + 0xBB, 0xDA, 0xBB, 0xDA, 0xBB, 0xDB, 0x95, 0xD9, + 0x94, 0xDB, 0x95, 0xD9, 0x94, 0xDB, 0x81, 0xDB, + 0x81, 0xDB, 0x81, 0xDB, 0x81, 0xDA, 0xBE, 0xDA, + 0xBE, 0xDA, 0xBE, 0xDA, 0xBE, 0xDB, 0x92, 0xDB, + 0x92, 0xDB, 0x92, 0xD9, 0x94, 0xDB, 0x92, 0xD9, + 0x94, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, 0xAD, 0xDA, + 0xAD, 0xDB, 0x87, 0xDB, 0x87, 0xDB, 0x86, 0xDB, + 0x86, 0xDB, 0x88, 0xDB, 0x88, 0xDB, 0x87, 0xD9, + 0xB4, 0xDB, 0x8B, 0xDB, 0x8B, 0xDB, 0x85, 0xDB, + 0x85, 0xDB, 0x89, 0xDB, 0x89, 0xDB, 0x90, 0xDB, + 0x90, 0xDB, 0x90, 0xDB, 0x90, 0xD9, 0x89, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, + 0x8A, 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, + 0x95, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, + 0x87, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, + 
0x8A, 0xD9, 0x94, 0xDB, 0x86, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, + 0x88, 0xD9, 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, + 0x8A, 0xD9, 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, + 0x94, 0xDB, 0x90, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xDB, 0x8C, 0xDB, + 0x8C, 0xDB, 0x8C, 0xDB, 0x8C, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, + 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, + 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, + 0xA8, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD8, + 0xAA, 0xD8, 0xAD, 0xD8, 0xAA, 0xD8, 0xAE, 0xD8, + 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, + 0xAA, 0xD9, 0x8A, 0xD8, 0xAB, 0xD8, 0xAC, 0xD8, + 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, + 0xAB, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, + 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD8, + 0xAE, 0xD8, 0xAD, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, + 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, + 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD8, + 0xB6, 0xD8, 0xAC, 0xD8, 0xB6, 0xD8, 0xAD, 0xD8, + 0xB6, 0xD8, 0xAE, 0xD8, 0xB6, 0xD9, 0x85, 0xD8, + 0xB7, 0xD8, 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, + 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, + 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, + 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, + 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, + 0x81, 0xD9, 0x85, 0xD9, 0x81, 0xD9, 0x89, 0xD9, + 0x81, 0xD9, 0x8A, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, + 0x82, 0xD9, 0x85, 0xD9, 0x82, 0xD9, 0x89, 0xD9, + 0x82, 0xD9, 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, + 0x83, 0xD8, 0xAC, 0xD9, 0x83, 0xD8, 0xAD, 0xD9, + 0x83, 0xD8, 0xAE, 0xD9, 0x83, 0xD9, 0x84, 0xD9, + 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x89, 0xD9, + 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, + 0x84, 0xD8, 0xAD, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, + 0x84, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, + 0x84, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x89, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, + 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x89, 0xD9, + 0x86, 0xD9, 0x8A, 0xD9, 0x87, 0xD8, 0xAC, 0xD9, + 0x87, 0xD9, 0x85, 0xD9, 0x87, 0xD9, 0x89, 0xD9, + 0x87, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x89, 0xD9, + 0x8A, 0xD9, 0x8A, 0xD8, 0xB0, 0xD9, 0xB0, 0xD8, + 0xB1, 0xD9, 0xB0, 0xD9, 0x89, 0xD9, 0xB0, 0x20, + 0xD9, 0x8C, 0xD9, 0x91, 0x20, 0xD9, 0x8D, 0xD9, + 0x91, 0x20, 0xD9, 0x8E, 0xD9, 0x91, 0x20, 0xD9, + 0x8F, 0xD9, 0x91, 0x20, 0xD9, 0x90, 0xD9, 0x91, + 0x20, 0xD9, 0x91, 0xD9, 0xB0, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xB1, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xB2, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x89, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, + 0x8A, 0xD8, 0xA8, 0xD8, 0xB1, 0xD8, 0xA8, 0xD8, + 0xB2, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, + 0x86, 0xD8, 0xA8, 0xD9, 0x89, 0xD8, 0xA8, 0xD9, + 0x8A, 0xD8, 0xAA, 0xD8, 0xB1, 0xD8, 0xAA, 0xD8, + 0xB2, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, + 0x86, 0xD8, 0xAA, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, + 0x8A, 0xD8, 0xAB, 0xD8, 0xB1, 0xD8, 0xAB, 0xD8, + 
0xB2, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, 0xAB, 0xD9, + 0x86, 0xD8, 0xAB, 0xD9, 0x89, 0xD8, 0xAB, 0xD9, + 0x8A, 0xD9, 0x81, 0xD9, 0x89, 0xD9, 0x81, 0xD9, + 0x8A, 0xD9, 0x82, 0xD9, 0x89, 0xD9, 0x82, 0xD9, + 0x8A, 0xD9, 0x83, 0xD8, 0xA7, 0xD9, 0x83, 0xD9, + 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, 0x83, 0xD9, + 0x89, 0xD9, 0x83, 0xD9, 0x8A, 0xD9, 0x84, 0xD9, + 0x85, 0xD9, 0x84, 0xD9, 0x89, 0xD9, 0x84, 0xD9, + 0x8A, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, 0x85, 0xD9, + 0x85, 0xD9, 0x86, 0xD8, 0xB1, 0xD9, 0x86, 0xD8, + 0xB2, 0xD9, 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, + 0x86, 0xD9, 0x86, 0xD9, 0x89, 0xD9, 0x86, 0xD9, + 0x8A, 0xD9, 0x89, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, + 0xB1, 0xD9, 0x8A, 0xD8, 0xB2, 0xD9, 0x8A, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x8A, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, + 0x94, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD8, 0xAC, 0xD8, + 0xA8, 0xD8, 0xAD, 0xD8, 0xA8, 0xD8, 0xAE, 0xD8, + 0xA8, 0xD9, 0x85, 0xD8, 0xA8, 0xD9, 0x87, 0xD8, + 0xAA, 0xD8, 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, + 0xAA, 0xD8, 0xAE, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, + 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, + 0xAD, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, + 0xAE, 0xD8, 0xAC, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, + 0xB3, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, + 0xB3, 0xD8, 0xAE, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xB5, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAE, 0xD8, + 0xB5, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, 0xAC, 0xD8, + 0xB6, 0xD8, 0xAD, 0xD8, 0xB6, 0xD8, 0xAE, 0xD8, + 0xB6, 0xD9, 0x85, 0xD8, 0xB7, 0xD8, 0xAD, 0xD8, + 0xB8, 0xD9, 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD8, + 0xB9, 0xD9, 0x85, 0xD8, 0xBA, 0xD8, 0xAC, 0xD8, + 0xBA, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAC, 0xD9, + 0x81, 0xD8, 0xAD, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, + 0x81, 0xD9, 0x85, 0xD9, 0x82, 0xD8, 0xAD, 0xD9, + 0x82, 0xD9, 0x85, 0xD9, 0x83, 0xD8, 0xAC, 0xD9, + 0x83, 0xD8, 0xAD, 0xD9, 0x83, 0xD8, 0xAE, 0xD9, + 0x83, 0xD9, 0x84, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x84, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, + 0x84, 0xD8, 0xAE, 0xD9, 0x84, 0xD9, 0x85, 0xD9, + 0x84, 0xD9, 0x87, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAE, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x86, 0xD8, 0xAD, 0xD9, 0x86, 0xD8, 0xAE, 0xD9, + 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, + 0x87, 0xD8, 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, + 0x87, 0xD9, 0xB0, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, + 0x8A, 0xD9, 0x94, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, + 0x94, 0xD9, 0x87, 0xD8, 0xA8, 0xD9, 0x85, 0xD8, + 0xA8, 0xD9, 0x87, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, + 0xAA, 0xD9, 0x87, 0xD8, 0xAB, 0xD9, 0x85, 0xD8, + 0xAB, 0xD9, 0x87, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, + 0xB4, 0xD9, 0x87, 0xD9, 0x83, 0xD9, 0x84, 0xD9, + 0x83, 0xD9, 0x85, 0xD9, 0x84, 0xD9, 0x85, 0xD9, + 0x86, 0xD9, 0x85, 0xD9, 0x86, 0xD9, 0x87, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, + 0x80, 0xD9, 0x8E, 0xD9, 0x91, 0xD9, 0x80, 0xD9, + 0x8F, 0xD9, 0x91, 0xD9, 0x80, 0xD9, 0x90, 0xD9, + 0x91, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, + 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, + 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, + 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, + 
0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, + 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, + 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, + 0xB1, 0xD8, 0xB7, 0xD9, 0x89, 0xD8, 0xB7, 0xD9, + 0x8A, 0xD8, 0xB9, 0xD9, 0x89, 0xD8, 0xB9, 0xD9, + 0x8A, 0xD8, 0xBA, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, + 0x8A, 0xD8, 0xB3, 0xD9, 0x89, 0xD8, 0xB3, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD9, 0x89, 0xD8, 0xB4, 0xD9, + 0x8A, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, 0xAE, 0xD9, + 0x8A, 0xD8, 0xB5, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, + 0x8A, 0xD8, 0xB6, 0xD9, 0x89, 0xD8, 0xB6, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD8, 0xB4, 0xD8, 0xB1, 0xD8, 0xB3, 0xD8, + 0xB1, 0xD8, 0xB5, 0xD8, 0xB1, 0xD8, 0xB6, 0xD8, + 0xB1, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD8, 0xB3, 0xD9, 0x87, 0xD8, 0xB4, 0xD9, + 0x87, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, + 0xAC, 0xD8, 0xB3, 0xD8, 0xAD, 0xD8, 0xB3, 0xD8, + 0xAE, 0xD8, 0xB4, 0xD8, 0xAC, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD8, 0xB4, 0xD8, 0xAE, 0xD8, 0xB7, 0xD9, + 0x85, 0xD8, 0xB8, 0xD9, 0x85, 0xD8, 0xA7, 0xD9, + 0x8B, 0xD8, 0xA7, 0xD9, 0x8B, 0xD8, 0xAA, 0xD8, + 0xAC, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, + 0xAC, 0xD8, 0xAA, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, + 0xAA, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAA, 0xD8, + 0xAE, 0xD9, 0x85, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAA, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, + 0xAA, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, + 0xAD, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, + 0xAD, 0xD9, 0x85, 0xD9, 0x89, 0xD8, 0xB3, 0xD8, + 0xAD, 0xD8, 0xAC, 0xD8, 0xB3, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD8, 0xB3, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, + 0xB3, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, + 0x85, 0xD8, 0xAD, 0xD8, 0xB3, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xB3, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB5, 0xD8, + 0xAD, 0xD8, 0xAD, 0xD8, 0xB5, 0xD8, 0xAD, 0xD8, + 0xAD, 0xD8, 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xB4, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD9, 0x85, 0xD8, 0xB4, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, + 0xB4, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xB4, 0xD9, + 0x85, 0xD9, 0x85, 0xD8, 0xB4, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, 0x89, 0xD8, + 0xB6, 0xD8, 0xAE, 0xD9, 0x85, 0xD8, 0xB6, 0xD8, + 0xAE, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, + 0xAD, 0xD8, 0xB7, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, + 0xB7, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB7, 0xD9, + 0x85, 0xD9, 0x8A, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xB9, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, + 0x85, 0xD9, 0x89, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xBA, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, + 0xBA, 0xD9, 0x85, 0xD9, 0x89, 0xD9, 0x81, 0xD8, + 0xAE, 0xD9, 0x85, 0xD9, 0x81, 0xD8, 0xAE, 0xD9, + 0x85, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, + 0x82, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x84, 0xD8, + 0xAD, 0xD9, 0x85, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD9, 0x84, 0xD8, 0xAD, 0xD9, 0x89, 0xD9, + 0x84, 0xD8, 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, + 0xAC, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, + 
0x85, 0xD9, 0x84, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, + 0x84, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x84, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAD, 0xD8, + 0xAC, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xAD, 0xD9, 0x8A, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, + 0x85, 0xD9, 0x85, 0xD8, 0xAE, 0xD8, 0xAC, 0xD9, + 0x85, 0xD8, 0xAE, 0xD9, 0x85, 0xD9, 0x85, 0xD8, + 0xAC, 0xD8, 0xAE, 0xD9, 0x87, 0xD9, 0x85, 0xD8, + 0xAC, 0xD9, 0x87, 0xD9, 0x85, 0xD9, 0x85, 0xD9, + 0x86, 0xD8, 0xAD, 0xD9, 0x85, 0xD9, 0x86, 0xD8, + 0xAD, 0xD9, 0x89, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x85, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, + 0x86, 0xD8, 0xAC, 0xD9, 0x89, 0xD9, 0x86, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD9, 0x85, 0xD9, + 0x89, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xA8, 0xD8, + 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xAA, 0xD8, 0xAC, 0xD9, 0x89, 0xD8, + 0xAA, 0xD8, 0xAE, 0xD9, 0x8A, 0xD8, 0xAA, 0xD8, + 0xAE, 0xD9, 0x89, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, + 0x8A, 0xD8, 0xAA, 0xD9, 0x85, 0xD9, 0x89, 0xD8, + 0xAC, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD9, 0x89, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, + 0x89, 0xD8, 0xB3, 0xD8, 0xAE, 0xD9, 0x89, 0xD8, + 0xB5, 0xD8, 0xAD, 0xD9, 0x8A, 0xD8, 0xB4, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD8, 0xB6, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, + 0x84, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x8A, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD9, 0x8A, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, + 0x85, 0xD9, 0x85, 0xD9, 0x8A, 0xD9, 0x82, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAD, 0xD9, + 0x8A, 0xD9, 0x82, 0xD9, 0x85, 0xD8, 0xAD, 0xD9, + 0x84, 0xD8, 0xAD, 0xD9, 0x85, 0xD8, 0xB9, 0xD9, + 0x85, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD9, + 0x85, 0xD8, 0xAE, 0xD9, 0x8A, 0xD9, 0x84, 0xD8, + 0xAC, 0xD9, 0x85, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x85, 0xD9, 0x84, 0xD8, 0xAC, 0xD9, 0x85, 0xD9, + 0x86, 0xD8, 0xAC, 0xD8, 0xAD, 0xD8, 0xAC, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD8, 0xAD, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x8A, 0xD9, + 0x81, 0xD9, 0x85, 0xD9, 0x8A, 0xD8, 0xA8, 0xD8, + 0xAD, 0xD9, 0x8A, 0xD9, 0x83, 0xD9, 0x85, 0xD9, + 0x85, 0xD8, 0xB9, 0xD8, 0xAC, 0xD9, 0x85, 0xD8, + 0xB5, 0xD9, 0x85, 0xD9, 0x85, 0xD8, 0xB3, 0xD8, + 0xAE, 0xD9, 0x8A, 0xD9, 0x86, 0xD8, 0xAC, 0xD9, + 0x8A, 0xD8, 0xB5, 0xD9, 0x84, 0xDB, 0x92, 0xD9, + 0x82, 0xD9, 0x84, 0xDB, 0x92, 0xD8, 0xA7, 0xD9, + 0x84, 0xD9, 0x84, 0xD9, 0x87, 0xD8, 0xA7, 0xD9, + 0x83, 0xD8, 0xA8, 0xD8, 0xB1, 0xD9, 0x85, 0xD8, + 0xAD, 0xD9, 0x85, 0xD8, 0xAF, 0xD8, 0xB5, 0xD9, + 0x84, 0xD8, 0xB9, 0xD9, 0x85, 0xD8, 0xB1, 0xD8, + 0xB3, 0xD9, 0x88, 0xD9, 0x84, 0xD8, 0xB9, 0xD9, + 0x84, 0xD9, 0x8A, 0xD9, 0x87, 0xD9, 0x88, 0xD8, + 0xB3, 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xB5, 0xD9, + 0x84, 0xD9, 0x89, 0xD8, 0xB5, 0xD9, 0x84, 0xD9, + 0x89, 0x20, 0xD8, 0xA7, 0xD9, 0x84, 0xD9, 0x84, + 0xD9, 0x87, 0x20, 0xD8, 0xB9, 0xD9, 0x84, 0xD9, + 0x8A, 0xD9, 0x87, 0x20, 0xD9, 0x88, 0xD8, 0xB3, + 0xD9, 0x84, 0xD9, 0x85, 0xD8, 0xAC, 0xD9, 0x84, + 0x20, 0xD8, 0xAC, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, + 0x84, 0xD9, 0x87, 0xD8, 0xB1, 0xDB, 0x8C, 0xD8, + 0xA7, 0xD9, 0x84, 0x2C, 0xE3, 0x80, 0x81, 0xE3, + 0x80, 0x82, 0x3A, 0x3B, 0x21, 0x3F, 0xE3, 0x80, + 0x96, 0xE3, 0x80, 0x97, 0x2E, 0x2E, 0x2E, 0x2E, + 0x2E, 0xE2, 0x80, 0x94, 0xE2, 0x80, 0x93, 0x5F, + 0x5F, 0x28, 0x29, 0x7B, 0x7D, 0xE3, 0x80, 0x94, + 0xE3, 0x80, 0x95, 0xE3, 0x80, 0x90, 0xE3, 0x80, + 0x91, 0xE3, 0x80, 0x8A, 0xE3, 0x80, 0x8B, 0xE3, + 
0x80, 0x88, 0xE3, 0x80, 0x89, 0xE3, 0x80, 0x8C, + 0xE3, 0x80, 0x8D, 0xE3, 0x80, 0x8E, 0xE3, 0x80, + 0x8F, 0x5B, 0x5D, 0x20, 0xCC, 0x85, 0x20, 0xCC, + 0x85, 0x20, 0xCC, 0x85, 0x20, 0xCC, 0x85, 0x5F, + 0x5F, 0x5F, 0x2C, 0xE3, 0x80, 0x81, 0x2E, 0x3B, + 0x3A, 0x3F, 0x21, 0xE2, 0x80, 0x94, 0x28, 0x29, + 0x7B, 0x7D, 0xE3, 0x80, 0x94, 0xE3, 0x80, 0x95, + 0x23, 0x26, 0x2A, 0x2B, 0x2D, 0x3C, 0x3E, 0x3D, + 0x5C, 0x24, 0x25, 0x40, 0x20, 0xD9, 0x8B, 0xD9, + 0x80, 0xD9, 0x8B, 0x20, 0xD9, 0x8C, 0x20, 0xD9, + 0x8D, 0x20, 0xD9, 0x8E, 0xD9, 0x80, 0xD9, 0x8E, + 0x20, 0xD9, 0x8F, 0xD9, 0x80, 0xD9, 0x8F, 0x20, + 0xD9, 0x90, 0xD9, 0x80, 0xD9, 0x90, 0x20, 0xD9, + 0x91, 0xD9, 0x80, 0xD9, 0x91, 0x20, 0xD9, 0x92, + 0xD9, 0x80, 0xD9, 0x92, 0xD8, 0xA1, 0xD8, 0xA7, + 0xD9, 0x93, 0xD8, 0xA7, 0xD9, 0x93, 0xD8, 0xA7, + 0xD9, 0x94, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x88, + 0xD9, 0x94, 0xD9, 0x88, 0xD9, 0x94, 0xD8, 0xA7, + 0xD9, 0x95, 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD9, 0x8A, + 0xD9, 0x94, 0xD9, 0x8A, 0xD9, 0x94, 0xD8, 0xA7, + 0xD8, 0xA7, 0xD8, 0xA8, 0xD8, 0xA8, 0xD8, 0xA8, + 0xD8, 0xA8, 0xD8, 0xA9, 0xD8, 0xA9, 0xD8, 0xAA, + 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAA, 0xD8, 0xAB, + 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAB, 0xD8, 0xAC, + 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAC, 0xD8, 0xAD, + 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAD, 0xD8, 0xAE, + 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAE, 0xD8, 0xAF, + 0xD8, 0xAF, 0xD8, 0xB0, 0xD8, 0xB0, 0xD8, 0xB1, + 0xD8, 0xB1, 0xD8, 0xB2, 0xD8, 0xB2, 0xD8, 0xB3, + 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB3, 0xD8, 0xB4, + 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB4, 0xD8, 0xB5, + 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB5, 0xD8, 0xB6, + 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB6, 0xD8, 0xB7, + 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB7, 0xD8, 0xB8, + 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB8, 0xD8, 0xB9, + 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xB9, 0xD8, 0xBA, + 0xD8, 0xBA, 0xD8, 0xBA, 0xD8, 0xBA, 0xD9, 0x81, + 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x81, 0xD9, 0x82, + 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x82, 0xD9, 0x83, + 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x83, 0xD9, 0x84, + 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x84, 0xD9, 0x85, + 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x85, 0xD9, 0x86, + 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x86, 0xD9, 0x87, + 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x87, 0xD9, 0x88, + 0xD9, 0x88, 0xD9, 0x89, 0xD9, 0x89, 0xD9, 0x8A, + 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x8A, 0xD9, 0x84, + 0xD8, 0xA7, 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, + 0xD9, 0x93, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, + 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x94, 0xD9, 0x84, + 0xD8, 0xA7, 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, + 0xD9, 0x95, 0xD9, 0x84, 0xD8, 0xA7, 0xD9, 0x84, + 0xD8, 0xA7, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, + 0x27, 0x28, 0x29, 0x2A, 0x2B, 0x2C, 0x2D, 0x2E, + 0x2F, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x3A, 0x3B, 0x3C, 0x3D, 0x3E, + 0x3F, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, + 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, + 0x57, 0x58, 0x59, 0x5A, 0x5B, 0x5C, 0x5D, 0x5E, + 0x5F, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x7B, 0x7C, 0x7D, 0x7E, + 0xE2, 0xA6, 0x85, 0xE2, 0xA6, 0x86, 0xE3, 0x80, + 0x82, 0xE3, 0x80, 0x8C, 0xE3, 0x80, 0x8D, 0xE3, + 0x80, 0x81, 0xE3, 0x83, 0xBB, 0xE3, 0x83, 0xB2, + 0xE3, 0x82, 0xA1, 0xE3, 0x82, 0xA3, 0xE3, 0x82, + 0xA5, 0xE3, 0x82, 0xA7, 0xE3, 0x82, 0xA9, 0xE3, + 0x83, 0xA3, 0xE3, 0x83, 0xA5, 0xE3, 0x83, 0xA7, + 0xE3, 0x83, 0x83, 0xE3, 0x83, 0xBC, 0xE3, 0x82, + 
0xA2, 0xE3, 0x82, 0xA4, 0xE3, 0x82, 0xA6, 0xE3, + 0x82, 0xA8, 0xE3, 0x82, 0xAA, 0xE3, 0x82, 0xAB, + 0xE3, 0x82, 0xAD, 0xE3, 0x82, 0xAF, 0xE3, 0x82, + 0xB1, 0xE3, 0x82, 0xB3, 0xE3, 0x82, 0xB5, 0xE3, + 0x82, 0xB7, 0xE3, 0x82, 0xB9, 0xE3, 0x82, 0xBB, + 0xE3, 0x82, 0xBD, 0xE3, 0x82, 0xBF, 0xE3, 0x83, + 0x81, 0xE3, 0x83, 0x84, 0xE3, 0x83, 0x86, 0xE3, + 0x83, 0x88, 0xE3, 0x83, 0x8A, 0xE3, 0x83, 0x8B, + 0xE3, 0x83, 0x8C, 0xE3, 0x83, 0x8D, 0xE3, 0x83, + 0x8E, 0xE3, 0x83, 0x8F, 0xE3, 0x83, 0x92, 0xE3, + 0x83, 0x95, 0xE3, 0x83, 0x98, 0xE3, 0x83, 0x9B, + 0xE3, 0x83, 0x9E, 0xE3, 0x83, 0x9F, 0xE3, 0x83, + 0xA0, 0xE3, 0x83, 0xA1, 0xE3, 0x83, 0xA2, 0xE3, + 0x83, 0xA4, 0xE3, 0x83, 0xA6, 0xE3, 0x83, 0xA8, + 0xE3, 0x83, 0xA9, 0xE3, 0x83, 0xAA, 0xE3, 0x83, + 0xAB, 0xE3, 0x83, 0xAC, 0xE3, 0x83, 0xAD, 0xE3, + 0x83, 0xAF, 0xE3, 0x83, 0xB3, 0xE3, 0x82, 0x99, + 0xE3, 0x82, 0x9A, 0xE1, 0x85, 0xA0, 0xE1, 0x84, + 0x80, 0xE1, 0x84, 0x81, 0xE1, 0x86, 0xAA, 0xE1, + 0x84, 0x82, 0xE1, 0x86, 0xAC, 0xE1, 0x86, 0xAD, + 0xE1, 0x84, 0x83, 0xE1, 0x84, 0x84, 0xE1, 0x84, + 0x85, 0xE1, 0x86, 0xB0, 0xE1, 0x86, 0xB1, 0xE1, + 0x86, 0xB2, 0xE1, 0x86, 0xB3, 0xE1, 0x86, 0xB4, + 0xE1, 0x86, 0xB5, 0xE1, 0x84, 0x9A, 0xE1, 0x84, + 0x86, 0xE1, 0x84, 0x87, 0xE1, 0x84, 0x88, 0xE1, + 0x84, 0xA1, 0xE1, 0x84, 0x89, 0xE1, 0x84, 0x8A, + 0xE1, 0x84, 0x8B, 0xE1, 0x84, 0x8C, 0xE1, 0x84, + 0x8D, 0xE1, 0x84, 0x8E, 0xE1, 0x84, 0x8F, 0xE1, + 0x84, 0x90, 0xE1, 0x84, 0x91, 0xE1, 0x84, 0x92, + 0xE1, 0x85, 0xA1, 0xE1, 0x85, 0xA2, 0xE1, 0x85, + 0xA3, 0xE1, 0x85, 0xA4, 0xE1, 0x85, 0xA5, 0xE1, + 0x85, 0xA6, 0xE1, 0x85, 0xA7, 0xE1, 0x85, 0xA8, + 0xE1, 0x85, 0xA9, 0xE1, 0x85, 0xAA, 0xE1, 0x85, + 0xAB, 0xE1, 0x85, 0xAC, 0xE1, 0x85, 0xAD, 0xE1, + 0x85, 0xAE, 0xE1, 0x85, 0xAF, 0xE1, 0x85, 0xB0, + 0xE1, 0x85, 0xB1, 0xE1, 0x85, 0xB2, 0xE1, 0x85, + 0xB3, 0xE1, 0x85, 0xB4, 0xE1, 0x85, 0xB5, 0xC2, + 0xA2, 0xC2, 0xA3, 0xC2, 0xAC, 0x20, 0xCC, 0x84, + 0xC2, 0xA6, 0xC2, 0xA5, 0xE2, 0x82, 0xA9, 0xE2, + 0x94, 0x82, 0xE2, 0x86, 0x90, 0xE2, 0x86, 0x91, + 0xE2, 0x86, 0x92, 0xE2, 0x86, 0x93, 0xE2, 0x96, + 0xA0, 0xE2, 0x97, 0x8B, 0xF6, 0xF0, 0x9D, 0x85, + 0x97, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, 0x9D, + 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF6, 0xF0, + 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, + 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x85, 0x98, + 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, + 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, 0x9D, 0x85, + 0xA5, 0xF0, 0x9D, 0x85, 0xB0, 0xF6, 0xF0, 0x9D, + 0x85, 0x98, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, + 0x85, 0xB1, 0xF6, 0xF0, 0x9D, 0x85, 0x98, 0xF0, + 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xB2, 0xF6, + 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, 0x85, 0xA5, + 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, + 0xA5, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, 0xF0, 0x9D, + 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAE, 0xF6, 0xF0, + 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, 0xA5, 0xF0, + 0x9D, 0x85, 0xAE, 0xF6, 0xF0, 0x9D, 0x86, 0xB9, + 0xF0, 0x9D, 0x85, 0xA5, 0xF0, 0x9D, 0x85, 0xAF, + 0xF6, 0xF0, 0x9D, 0x86, 0xBA, 0xF0, 0x9D, 0x85, + 0xA5, 0xF0, 0x9D, 0x85, 0xAF, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 
0x66, 0x67, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, 0x44, + 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, 0x4C, + 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, 0x54, + 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, 0x62, + 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6A, + 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, 0x72, + 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7A, + 0x41, 0x43, 0x44, 0x47, 0x4A, 0x4B, 0x4E, 0x4F, + 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, + 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x66, 0x68, + 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, + 0x45, 0x46, 0x47, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, + 0x4F, 0x50, 0x51, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, + 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, + 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, + 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x44, 0x45, + 0x46, 0x47, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4F, + 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7A, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4A, 0x4B, 0x4C, 0x4D, 0x4E, 0x4F, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5A, 0x61, 0x62, 0x63, 0x64, 0x65, + 0x66, 0x67, 0x68, 0x69, 0x6A, 0x6B, 0x6C, 0x6D, + 0x6E, 0x6F, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, + 0x76, 0x77, 0x78, 0x79, 0x7A, 0x41, 0x42, 0x43, + 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4A, 0x4B, + 0x4C, 0x4D, 0x4E, 0x4F, 0x50, 0x51, 0x52, 0x53, + 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5A, 0x61, + 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, + 0x6A, 0x6B, 0x6C, 0x6D, 0x6E, 0x6F, 0x70, 0x71, + 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 
0x7A, 0xC4, 0xB1, 0xC8, 0xB7, 0xCE, 0x91, 0xCE, + 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, + 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, + 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, + 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, + 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, + 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, + 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, + 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, + 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, + 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, + 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, + 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, + 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, + 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, + 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, + 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, + 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, + 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, + 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, + 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, + 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, + 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, + 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, + 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, + 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, + 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, + 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, + 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, + 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, + 0x80, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, + 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, + 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, + 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, + 0xA0, 0xCE, 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, + 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, + 0xA8, 0xCE, 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, + 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, + 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, + 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, + 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, + 0xCF, 0x82, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, + 0xCF, 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, + 0xE2, 0x88, 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, + 0xBA, 0xCF, 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, + 0x91, 0xCE, 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, + 0x95, 0xCE, 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, + 0x99, 0xCE, 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, + 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, + 0xA1, 0xCE, 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, + 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, + 0xA9, 0xE2, 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, + 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, + 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, + 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, + 0xCE, 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, + 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, + 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, + 0x82, 0xCE, 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, + 0x86, 0xCF, 0x81, 0xCF, 0x80, 0xCE, 0x91, 0xCE, + 0x92, 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, + 0x96, 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 0xCE, + 0x9A, 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, + 0x9E, 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, + 0x98, 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, + 0xA6, 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xE2, + 0x88, 0x87, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, + 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, + 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, + 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, 0xBF, + 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x82, 0xCF, 0x83, + 
0xCF, 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, + 0xCF, 0x88, 0xCF, 0x89, 0xE2, 0x88, 0x82, 0xCE, + 0xB5, 0xCE, 0xB8, 0xCE, 0xBA, 0xCF, 0x86, 0xCF, + 0x81, 0xCF, 0x80, 0xCF, 0x9C, 0xCF, 0x9D, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, + 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, 0x33, 0x34, + 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, 0x31, 0x32, + 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x30, + 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0xF6, 0xE4, 0xB8, 0xBD, 0xF6, 0xE4, 0xB8, + 0xB8, 0xF6, 0xE4, 0xB9, 0x81, 0xF6, 0xF0, 0xA0, + 0x84, 0xA2, 0xF6, 0xE4, 0xBD, 0xA0, 0xF6, 0xE4, + 0xBE, 0xAE, 0xF6, 0xE4, 0xBE, 0xBB, 0xF6, 0xE5, + 0x80, 0x82, 0xF6, 0xE5, 0x81, 0xBA, 0xF6, 0xE5, + 0x82, 0x99, 0xF6, 0xE5, 0x83, 0xA7, 0xF6, 0xE5, + 0x83, 0x8F, 0xF6, 0xE3, 0x92, 0x9E, 0xF6, 0xF0, + 0xA0, 0x98, 0xBA, 0xF6, 0xE5, 0x85, 0x8D, 0xF6, + 0xE5, 0x85, 0x94, 0xF6, 0xE5, 0x85, 0xA4, 0xF6, + 0xE5, 0x85, 0xB7, 0xF6, 0xF0, 0xA0, 0x94, 0x9C, + 0xF6, 0xE3, 0x92, 0xB9, 0xF6, 0xE5, 0x85, 0xA7, + 0xF6, 0xE5, 0x86, 0x8D, 0xF6, 0xF0, 0xA0, 0x95, + 0x8B, 0xF6, 0xE5, 0x86, 0x97, 0xF6, 0xE5, 0x86, + 0xA4, 0xF6, 0xE4, 0xBB, 0x8C, 0xF6, 0xE5, 0x86, + 0xAC, 0xF6, 0xE5, 0x86, 0xB5, 0xF6, 0xF0, 0xA9, + 0x87, 0x9F, 0xF6, 0xE5, 0x87, 0xB5, 0xF6, 0xE5, + 0x88, 0x83, 0xF6, 0xE3, 0x93, 0x9F, 0xF6, 0xE5, + 0x88, 0xBB, 0xF6, 0xE5, 0x89, 0x86, 0xF6, 0xE5, + 0x89, 0xB2, 0xF6, 0xE5, 0x89, 0xB7, 0xF6, 0xE3, + 0x94, 0x95, 0xF6, 0xE5, 0x8B, 0x87, 0xF6, 0xE5, + 0x8B, 0x89, 0xF6, 0xE5, 0x8B, 0xA4, 0xF6, 0xE5, + 0x8B, 0xBA, 0xF6, 0xE5, 0x8C, 0x85, 0xF6, 0xE5, + 0x8C, 0x86, 0xF6, 0xE5, 0x8C, 0x97, 0xF6, 0xE5, + 0x8D, 0x89, 0xF6, 0xE5, 0x8D, 0x91, 0xF6, 0xE5, + 0x8D, 0x9A, 0xF6, 0xE5, 0x8D, 0xB3, 0xF6, 0xE5, + 0x8D, 0xBD, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xE5, + 0x8D, 0xBF, 0xF6, 0xE5, 0x8D, 0xBF, 0xF6, 0xF0, + 0xA0, 0xA8, 0xAC, 0xF6, 0xE7, 0x81, 0xB0, 0xF6, + 0xE5, 0x8F, 0x8A, 0xF6, 0xE5, 0x8F, 0x9F, 0xF6, + 0xF0, 0xA0, 0xAD, 0xA3, 0xF6, 0xE5, 0x8F, 0xAB, + 0xF6, 0xE5, 0x8F, 0xB1, 0xF6, 0xE5, 0x90, 0x86, + 0xF6, 0xE5, 0x92, 0x9E, 0xF6, 0xE5, 0x90, 0xB8, + 0xF6, 0xE5, 0x91, 0x88, 0xF6, 0xE5, 0x91, 0xA8, + 0xF6, 0xE5, 0x92, 0xA2, 0xF6, 0xE5, 0x93, 0xB6, + 0xF6, 0xE5, 0x94, 0x90, 0xF6, 0xE5, 0x95, 0x93, + 0xF6, 0xE5, 0x95, 0xA3, 0xF6, 0xE5, 0x96, 0x84, + 0xF6, 0xE5, 0x96, 0x84, 0xF6, 0xE5, 0x96, 0x99, + 0xF6, 0xE5, 0x96, 0xAB, 0xF6, 0xE5, 0x96, 0xB3, + 0xF6, 0xE5, 0x97, 0x82, 0xF6, 0xE5, 0x9C, 0x96, + 0xF6, 0xE5, 0x98, 0x86, 0xF6, 0xE5, 0x9C, 0x97, + 0xF6, 0xE5, 0x99, 0x91, 0xF6, 0xE5, 0x99, 0xB4, + 0xF6, 0xE5, 0x88, 0x87, 0xF6, 0xE5, 0xA3, 0xAE, + 0xF6, 0xE5, 0x9F, 0x8E, 0xF6, 0xE5, 0x9F, 0xB4, + 0xF6, 0xE5, 0xA0, 0x8D, 0xF6, 0xE5, 0x9E, 0x8B, + 0xF6, 0xE5, 0xA0, 0xB2, 0xF6, 0xE5, 0xA0, 0xB1, + 0xF6, 0xE5, 0xA2, 0xAC, 0xF6, 0xF0, 0xA1, 0x93, + 0xA4, 0xF6, 0xE5, 0xA3, 0xB2, 0xF6, 0xE5, 0xA3, + 0xB7, 0xF6, 0xE5, 0xA4, 0x86, 0xF6, 0xE5, 0xA4, + 0x9A, 0xF6, 0xE5, 0xA4, 0xA2, 0xF6, 0xE5, 0xA5, + 0xA2, 0xF6, 0xF0, 0xA1, 0x9A, 0xA8, 0xF6, 0xF0, + 0xA1, 0x9B, 0xAA, 0xF6, 0xE5, 0xA7, 0xAC, 0xF6, + 0xE5, 0xA8, 0x9B, 0xF6, 0xE5, 0xA8, 0xA7, 0xF6, + 0xE5, 0xA7, 0x98, 0xF6, 0xE5, 0xA9, 0xA6, 0xF6, + 0xE3, 0x9B, 0xAE, 0xF6, 0xE3, 0x9B, 0xBC, 0xF6, + 0xE5, 0xAC, 0x88, 0xF6, 0xE5, 0xAC, 0xBE, 0xF6, + 0xE5, 0xAC, 0xBE, 0xF6, 0xF0, 0xA1, 0xA7, 0x88, + 0xF6, 0xE5, 0xAF, 0x83, 0xF6, 0xE5, 0xAF, 0x98, + 0xF6, 0xE5, 0xAF, 0xA7, 0xF6, 0xE5, 0xAF, 0xB3, + 0xF6, 0xF0, 0xA1, 0xAC, 0x98, 0xF6, 0xE5, 0xAF, + 0xBF, 0xF6, 0xE5, 0xB0, 0x86, 0xF6, 0xE5, 0xBD, + 0x93, 0xF6, 0xE5, 0xB0, 0xA2, 0xF6, 0xE3, 0x9E, + 
0x81, 0xF6, 0xE5, 0xB1, 0xA0, 0xF6, 0xE5, 0xB1, + 0xAE, 0xF6, 0xE5, 0xB3, 0x80, 0xF6, 0xE5, 0xB2, + 0x8D, 0xF6, 0xF0, 0xA1, 0xB7, 0xA4, 0xF6, 0xE5, + 0xB5, 0x83, 0xF6, 0xF0, 0xA1, 0xB7, 0xA6, 0xF6, + 0xE5, 0xB5, 0xAE, 0xF6, 0xE5, 0xB5, 0xAB, 0xF6, + 0xE5, 0xB5, 0xBC, 0xF6, 0xE5, 0xB7, 0xA1, 0xF6, + 0xE5, 0xB7, 0xA2, 0xF6, 0xE3, 0xA0, 0xAF, 0xF6, + 0xE5, 0xB7, 0xBD, 0xF6, 0xE5, 0xB8, 0xA8, 0xF6, + 0xE5, 0xB8, 0xBD, 0xF6, 0xE5, 0xB9, 0xA9, 0xF6, + 0xE3, 0xA1, 0xA2, 0xF6, 0xF0, 0xA2, 0x86, 0x83, + 0xF6, 0xE3, 0xA1, 0xBC, 0xF6, 0xE5, 0xBA, 0xB0, + 0xF6, 0xE5, 0xBA, 0xB3, 0xF6, 0xE5, 0xBA, 0xB6, + 0xF6, 0xE5, 0xBB, 0x8A, 0xF6, 0xF0, 0xAA, 0x8E, + 0x92, 0xF6, 0xE5, 0xBB, 0xBE, 0xF6, 0xF0, 0xA2, + 0x8C, 0xB1, 0xF6, 0xF0, 0xA2, 0x8C, 0xB1, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE5, 0xBC, 0xA2, 0xF6, + 0xE5, 0xBC, 0xA2, 0xF6, 0xE3, 0xA3, 0x87, 0xF6, + 0xF0, 0xA3, 0x8A, 0xB8, 0xF6, 0xF0, 0xA6, 0x87, + 0x9A, 0xF6, 0xE5, 0xBD, 0xA2, 0xF6, 0xE5, 0xBD, + 0xAB, 0xF6, 0xE3, 0xA3, 0xA3, 0xF6, 0xE5, 0xBE, + 0x9A, 0xF6, 0xE5, 0xBF, 0x8D, 0xF6, 0xE5, 0xBF, + 0x97, 0xF6, 0xE5, 0xBF, 0xB9, 0xF6, 0xE6, 0x82, + 0x81, 0xF6, 0xE3, 0xA4, 0xBA, 0xF6, 0xE3, 0xA4, + 0x9C, 0xF6, 0xE6, 0x82, 0x94, 0xF6, 0xF0, 0xA2, + 0x9B, 0x94, 0xF6, 0xE6, 0x83, 0x87, 0xF6, 0xE6, + 0x85, 0x88, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0x8E, 0xF6, 0xE6, 0x85, 0x8C, 0xF6, 0xE6, + 0x85, 0xBA, 0xF6, 0xE6, 0x86, 0x8E, 0xF6, 0xE6, + 0x86, 0xB2, 0xF6, 0xE6, 0x86, 0xA4, 0xF6, 0xE6, + 0x86, 0xAF, 0xF6, 0xE6, 0x87, 0x9E, 0xF6, 0xE6, + 0x87, 0xB2, 0xF6, 0xE6, 0x87, 0xB6, 0xF6, 0xE6, + 0x88, 0x90, 0xF6, 0xE6, 0x88, 0x9B, 0xF6, 0xE6, + 0x89, 0x9D, 0xF6, 0xE6, 0x8A, 0xB1, 0xF6, 0xE6, + 0x8B, 0x94, 0xF6, 0xE6, 0x8D, 0x90, 0xF6, 0xF0, + 0xA2, 0xAC, 0x8C, 0xF6, 0xE6, 0x8C, 0xBD, 0xF6, + 0xE6, 0x8B, 0xBC, 0xF6, 0xE6, 0x8D, 0xA8, 0xF6, + 0xE6, 0x8E, 0x83, 0xF6, 0xE6, 0x8F, 0xA4, 0xF6, + 0xF0, 0xA2, 0xAF, 0xB1, 0xF6, 0xE6, 0x90, 0xA2, + 0xF6, 0xE6, 0x8F, 0x85, 0xF6, 0xE6, 0x8E, 0xA9, + 0xF6, 0xE3, 0xA8, 0xAE, 0xF6, 0xE6, 0x91, 0xA9, + 0xF6, 0xE6, 0x91, 0xBE, 0xF6, 0xE6, 0x92, 0x9D, + 0xF6, 0xE6, 0x91, 0xB7, 0xF6, 0xE3, 0xA9, 0xAC, + 0xF6, 0xE6, 0x95, 0x8F, 0xF6, 0xE6, 0x95, 0xAC, + 0xF6, 0xF0, 0xA3, 0x80, 0x8A, 0xF6, 0xE6, 0x97, + 0xA3, 0xF6, 0xE6, 0x9B, 0xB8, 0xF6, 0xE6, 0x99, + 0x89, 0xF6, 0xE3, 0xAC, 0x99, 0xF6, 0xE6, 0x9A, + 0x91, 0xF6, 0xE3, 0xAC, 0x88, 0xF6, 0xE3, 0xAB, + 0xA4, 0xF6, 0xE5, 0x86, 0x92, 0xF6, 0xE5, 0x86, + 0x95, 0xF6, 0xE6, 0x9C, 0x80, 0xF6, 0xE6, 0x9A, + 0x9C, 0xF6, 0xE8, 0x82, 0xAD, 0xF6, 0xE4, 0x8F, + 0x99, 0xF6, 0xE6, 0x9C, 0x97, 0xF6, 0xE6, 0x9C, + 0x9B, 0xF6, 0xE6, 0x9C, 0xA1, 0xF6, 0xE6, 0x9D, + 0x9E, 0xF6, 0xE6, 0x9D, 0x93, 0xF6, 0xF0, 0xA3, + 0x8F, 0x83, 0xF6, 0xE3, 0xAD, 0x89, 0xF6, 0xE6, + 0x9F, 0xBA, 0xF6, 0xE6, 0x9E, 0x85, 0xF6, 0xE6, + 0xA1, 0x92, 0xF6, 0xE6, 0xA2, 0x85, 0xF6, 0xF0, + 0xA3, 0x91, 0xAD, 0xF6, 0xE6, 0xA2, 0x8E, 0xF6, + 0xE6, 0xA0, 0x9F, 0xF6, 0xE6, 0xA4, 0x94, 0xF6, + 0xE3, 0xAE, 0x9D, 0xF6, 0xE6, 0xA5, 0x82, 0xF6, + 0xE6, 0xA6, 0xA3, 0xF6, 0xE6, 0xA7, 0xAA, 0xF6, + 0xE6, 0xAA, 0xA8, 0xF6, 0xF0, 0xA3, 0x9A, 0xA3, + 0xF6, 0xE6, 0xAB, 0x9B, 0xF6, 0xE3, 0xB0, 0x98, + 0xF6, 0xE6, 0xAC, 0xA1, 0xF6, 0xF0, 0xA3, 0xA2, + 0xA7, 0xF6, 0xE6, 0xAD, 0x94, 0xF6, 0xE3, 0xB1, + 0x8E, 0xF6, 0xE6, 0xAD, 0xB2, 0xF6, 0xE6, 0xAE, + 0x9F, 0xF6, 0xE6, 0xAE, 0xBA, 0xF6, 0xE6, 0xAE, + 0xBB, 0xF6, 0xF0, 0xA3, 0xAA, 0x8D, 0xF6, 0xF0, + 0xA1, 0xB4, 0x8B, 0xF6, 0xF0, 0xA3, 0xAB, 0xBA, + 0xF6, 0xE6, 0xB1, 0x8E, 0xF6, 0xF0, 0xA3, 0xB2, + 0xBC, 0xF6, 0xE6, 0xB2, 0xBF, 0xF6, 0xE6, 0xB3, + 0x8D, 0xF6, 0xE6, 0xB1, 0xA7, 0xF6, 0xE6, 0xB4, + 
0x96, 0xF6, 0xE6, 0xB4, 0xBE, 0xF6, 0xE6, 0xB5, + 0xB7, 0xF6, 0xE6, 0xB5, 0x81, 0xF6, 0xE6, 0xB5, + 0xA9, 0xF6, 0xE6, 0xB5, 0xB8, 0xF6, 0xE6, 0xB6, + 0x85, 0xF6, 0xF0, 0xA3, 0xB4, 0x9E, 0xF6, 0xE6, + 0xB4, 0xB4, 0xF6, 0xE6, 0xB8, 0xAF, 0xF6, 0xE6, + 0xB9, 0xAE, 0xF6, 0xE3, 0xB4, 0xB3, 0xF6, 0xE6, + 0xBB, 0x8B, 0xF6, 0xE6, 0xBB, 0x87, 0xF6, 0xF0, + 0xA3, 0xBB, 0x91, 0xF6, 0xE6, 0xB7, 0xB9, 0xF6, + 0xE6, 0xBD, 0xAE, 0xF6, 0xF0, 0xA3, 0xBD, 0x9E, + 0xF6, 0xF0, 0xA3, 0xBE, 0x8E, 0xF6, 0xE6, 0xBF, + 0x86, 0xF6, 0xE7, 0x80, 0xB9, 0xF6, 0xE7, 0x80, + 0x9E, 0xF6, 0xE7, 0x80, 0x9B, 0xF6, 0xE3, 0xB6, + 0x96, 0xF6, 0xE7, 0x81, 0x8A, 0xF6, 0xE7, 0x81, + 0xBD, 0xF6, 0xE7, 0x81, 0xB7, 0xF6, 0xE7, 0x82, + 0xAD, 0xF6, 0xF0, 0xA0, 0x94, 0xA5, 0xF6, 0xE7, + 0x85, 0x85, 0xF6, 0xF0, 0xA4, 0x89, 0xA3, 0xF6, + 0xE7, 0x86, 0x9C, 0xF6, 0xF0, 0xA4, 0x8E, 0xAB, + 0xF6, 0xE7, 0x88, 0xA8, 0xF6, 0xE7, 0x88, 0xB5, + 0xF6, 0xE7, 0x89, 0x90, 0xF6, 0xF0, 0xA4, 0x98, + 0x88, 0xF6, 0xE7, 0x8A, 0x80, 0xF6, 0xE7, 0x8A, + 0x95, 0xF6, 0xF0, 0xA4, 0x9C, 0xB5, 0xF6, 0xF0, + 0xA4, 0xA0, 0x94, 0xF6, 0xE7, 0x8D, 0xBA, 0xF6, + 0xE7, 0x8E, 0x8B, 0xF6, 0xE3, 0xBA, 0xAC, 0xF6, + 0xE7, 0x8E, 0xA5, 0xF6, 0xE3, 0xBA, 0xB8, 0xF6, + 0xE3, 0xBA, 0xB8, 0xF6, 0xE7, 0x91, 0x87, 0xF6, + 0xE7, 0x91, 0x9C, 0xF6, 0xE7, 0x91, 0xB1, 0xF6, + 0xE7, 0x92, 0x85, 0xF6, 0xE7, 0x93, 0x8A, 0xF6, + 0xE3, 0xBC, 0x9B, 0xF6, 0xE7, 0x94, 0xA4, 0xF6, + 0xF0, 0xA4, 0xB0, 0xB6, 0xF6, 0xE7, 0x94, 0xBE, + 0xF6, 0xF0, 0xA4, 0xB2, 0x92, 0xF6, 0xE7, 0x95, + 0xB0, 0xF6, 0xF0, 0xA2, 0x86, 0x9F, 0xF6, 0xE7, + 0x98, 0x90, 0xF6, 0xF0, 0xA4, 0xBE, 0xA1, 0xF6, + 0xF0, 0xA4, 0xBE, 0xB8, 0xF6, 0xF0, 0xA5, 0x81, + 0x84, 0xF6, 0xE3, 0xBF, 0xBC, 0xF6, 0xE4, 0x80, + 0x88, 0xF6, 0xE7, 0x9B, 0xB4, 0xF6, 0xF0, 0xA5, + 0x83, 0xB3, 0xF6, 0xF0, 0xA5, 0x83, 0xB2, 0xF6, + 0xF0, 0xA5, 0x84, 0x99, 0xF6, 0xF0, 0xA5, 0x84, + 0xB3, 0xF6, 0xE7, 0x9C, 0x9E, 0xF6, 0xE7, 0x9C, + 0x9F, 0xF6, 0xE7, 0x9C, 0x9F, 0xF6, 0xE7, 0x9D, + 0x8A, 0xF6, 0xE4, 0x80, 0xB9, 0xF6, 0xE7, 0x9E, + 0x8B, 0xF6, 0xE4, 0x81, 0x86, 0xF6, 0xE4, 0x82, + 0x96, 0xF6, 0xF0, 0xA5, 0x90, 0x9D, 0xF6, 0xE7, + 0xA1, 0x8E, 0xF6, 0xE7, 0xA2, 0x8C, 0xF6, 0xE7, + 0xA3, 0x8C, 0xF6, 0xE4, 0x83, 0xA3, 0xF6, 0xF0, + 0xA5, 0x98, 0xA6, 0xF6, 0xE7, 0xA5, 0x96, 0xF6, + 0xF0, 0xA5, 0x9A, 0x9A, 0xF6, 0xF0, 0xA5, 0x9B, + 0x85, 0xF6, 0xE7, 0xA6, 0x8F, 0xF6, 0xE7, 0xA7, + 0xAB, 0xF6, 0xE4, 0x84, 0xAF, 0xF6, 0xE7, 0xA9, + 0x80, 0xF6, 0xE7, 0xA9, 0x8A, 0xF6, 0xE7, 0xA9, + 0x8F, 0xF6, 0xF0, 0xA5, 0xA5, 0xBC, 0xF6, 0xF0, + 0xA5, 0xAA, 0xA7, 0xF6, 0xF0, 0xA5, 0xAA, 0xA7, + 0xF6, 0xE7, 0xAB, 0xAE, 0xF6, 0xE4, 0x88, 0x82, + 0xF6, 0xF0, 0xA5, 0xAE, 0xAB, 0xF6, 0xE7, 0xAF, + 0x86, 0xF6, 0xE7, 0xAF, 0x89, 0xF6, 0xE4, 0x88, + 0xA7, 0xF6, 0xF0, 0xA5, 0xB2, 0x80, 0xF6, 0xE7, + 0xB3, 0x92, 0xF6, 0xE4, 0x8A, 0xA0, 0xF6, 0xE7, + 0xB3, 0xA8, 0xF6, 0xE7, 0xB3, 0xA3, 0xF6, 0xE7, + 0xB4, 0x80, 0xF6, 0xF0, 0xA5, 0xBE, 0x86, 0xF6, + 0xE7, 0xB5, 0xA3, 0xF6, 0xE4, 0x8C, 0x81, 0xF6, + 0xE7, 0xB7, 0x87, 0xF6, 0xE7, 0xB8, 0x82, 0xF6, + 0xE7, 0xB9, 0x85, 0xF6, 0xE4, 0x8C, 0xB4, 0xF6, + 0xF0, 0xA6, 0x88, 0xA8, 0xF6, 0xF0, 0xA6, 0x89, + 0x87, 0xF6, 0xE4, 0x8D, 0x99, 0xF6, 0xF0, 0xA6, + 0x8B, 0x99, 0xF6, 0xE7, 0xBD, 0xBA, 0xF6, 0xF0, + 0xA6, 0x8C, 0xBE, 0xF6, 0xE7, 0xBE, 0x95, 0xF6, + 0xE7, 0xBF, 0xBA, 0xF6, 0xE8, 0x80, 0x85, 0xF6, + 0xF0, 0xA6, 0x93, 0x9A, 0xF6, 0xF0, 0xA6, 0x94, + 0xA3, 0xF6, 0xE8, 0x81, 0xA0, 0xF6, 0xF0, 0xA6, + 0x96, 0xA8, 0xF6, 0xE8, 0x81, 0xB0, 0xF6, 0xF0, + 0xA3, 0x8D, 0x9F, 0xF6, 0xE4, 0x8F, 0x95, 0xF6, + 0xE8, 0x82, 0xB2, 0xF6, 0xE8, 0x84, 0x83, 0xF6, + 
0xE4, 0x90, 0x8B, 0xF6, 0xE8, 0x84, 0xBE, 0xF6, + 0xE5, 0xAA, 0xB5, 0xF6, 0xF0, 0xA6, 0x9E, 0xA7, + 0xF6, 0xF0, 0xA6, 0x9E, 0xB5, 0xF6, 0xF0, 0xA3, + 0x8E, 0x93, 0xF6, 0xF0, 0xA3, 0x8E, 0x9C, 0xF6, + 0xE8, 0x88, 0x81, 0xF6, 0xE8, 0x88, 0x84, 0xF6, + 0xE8, 0xBE, 0x9E, 0xF6, 0xE4, 0x91, 0xAB, 0xF6, + 0xE8, 0x8A, 0x91, 0xF6, 0xE8, 0x8A, 0x8B, 0xF6, + 0xE8, 0x8A, 0x9D, 0xF6, 0xE5, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xB1, 0xF6, 0xE8, 0x8A, 0xB3, 0xF6, + 0xE8, 0x8A, 0xBD, 0xF6, 0xE8, 0x8B, 0xA6, 0xF6, + 0xF0, 0xA6, 0xAC, 0xBC, 0xF6, 0xE8, 0x8B, 0xA5, + 0xF6, 0xE8, 0x8C, 0x9D, 0xF6, 0xE8, 0x8D, 0xA3, + 0xF6, 0xE8, 0x8E, 0xAD, 0xF6, 0xE8, 0x8C, 0xA3, + 0xF6, 0xE8, 0x8E, 0xBD, 0xF6, 0xE8, 0x8F, 0xA7, + 0xF6, 0xE8, 0x91, 0x97, 0xF6, 0xE8, 0x8D, 0x93, + 0xF6, 0xE8, 0x8F, 0x8A, 0xF6, 0xE8, 0x8F, 0x8C, + 0xF6, 0xE8, 0x8F, 0x9C, 0xF6, 0xF0, 0xA6, 0xB0, + 0xB6, 0xF6, 0xF0, 0xA6, 0xB5, 0xAB, 0xF6, 0xF0, + 0xA6, 0xB3, 0x95, 0xF6, 0xE4, 0x94, 0xAB, 0xF6, + 0xE8, 0x93, 0xB1, 0xF6, 0xE8, 0x93, 0xB3, 0xF6, + 0xE8, 0x94, 0x96, 0xF6, 0xF0, 0xA7, 0x8F, 0x8A, + 0xF6, 0xE8, 0x95, 0xA4, 0xF6, 0xF0, 0xA6, 0xBC, + 0xAC, 0xF6, 0xE4, 0x95, 0x9D, 0xF6, 0xE4, 0x95, + 0xA1, 0xF6, 0xF0, 0xA6, 0xBE, 0xB1, 0xF6, 0xF0, + 0xA7, 0x83, 0x92, 0xF6, 0xE4, 0x95, 0xAB, 0xF6, + 0xE8, 0x99, 0x90, 0xF6, 0xE8, 0x99, 0x9C, 0xF6, + 0xE8, 0x99, 0xA7, 0xF6, 0xE8, 0x99, 0xA9, 0xF6, + 0xE8, 0x9A, 0xA9, 0xF6, 0xE8, 0x9A, 0x88, 0xF6, + 0xE8, 0x9C, 0x8E, 0xF6, 0xE8, 0x9B, 0xA2, 0xF6, + 0xE8, 0x9D, 0xB9, 0xF6, 0xE8, 0x9C, 0xA8, 0xF6, + 0xE8, 0x9D, 0xAB, 0xF6, 0xE8, 0x9E, 0x86, 0xF6, + 0xE4, 0x97, 0x97, 0xF6, 0xE8, 0x9F, 0xA1, 0xF6, + 0xE8, 0xA0, 0x81, 0xF6, 0xE4, 0x97, 0xB9, 0xF6, + 0xE8, 0xA1, 0xA0, 0xF6, 0xE8, 0xA1, 0xA3, 0xF6, + 0xF0, 0xA7, 0x99, 0xA7, 0xF6, 0xE8, 0xA3, 0x97, + 0xF6, 0xE8, 0xA3, 0x9E, 0xF6, 0xE4, 0x98, 0xB5, + 0xF6, 0xE8, 0xA3, 0xBA, 0xF6, 0xE3, 0x92, 0xBB, + 0xF6, 0xF0, 0xA7, 0xA2, 0xAE, 0xF6, 0xF0, 0xA7, + 0xA5, 0xA6, 0xF6, 0xE4, 0x9A, 0xBE, 0xF6, 0xE4, + 0x9B, 0x87, 0xF6, 0xE8, 0xAA, 0xA0, 0xF6, 0xE8, + 0xAB, 0xAD, 0xF6, 0xE8, 0xAE, 0x8A, 0xF6, 0xE8, + 0xB1, 0x95, 0xF6, 0xF0, 0xA7, 0xB2, 0xA8, 0xF6, + 0xE8, 0xB2, 0xAB, 0xF6, 0xE8, 0xB3, 0x81, 0xF6, + 0xE8, 0xB4, 0x9B, 0xF6, 0xE8, 0xB5, 0xB7, 0xF6, + 0xF0, 0xA7, 0xBC, 0xAF, 0xF6, 0xF0, 0xA0, 0xA0, + 0x84, 0xF6, 0xE8, 0xB7, 0x8B, 0xF6, 0xE8, 0xB6, + 0xBC, 0xF6, 0xE8, 0xB7, 0xB0, 0xF6, 0xF0, 0xA0, + 0xA3, 0x9E, 0xF6, 0xE8, 0xBB, 0x94, 0xF6, 0xE8, + 0xBC, 0xB8, 0xF6, 0xF0, 0xA8, 0x97, 0x92, 0xF6, + 0xF0, 0xA8, 0x97, 0xAD, 0xF6, 0xE9, 0x82, 0x94, + 0xF6, 0xE9, 0x83, 0xB1, 0xF6, 0xE9, 0x84, 0x91, + 0xF6, 0xF0, 0xA8, 0x9C, 0xAE, 0xF6, 0xE9, 0x84, + 0x9B, 0xF6, 0xE9, 0x88, 0xB8, 0xF6, 0xE9, 0x8B, + 0x97, 0xF6, 0xE9, 0x8B, 0x98, 0xF6, 0xE9, 0x89, + 0xBC, 0xF6, 0xE9, 0x8F, 0xB9, 0xF6, 0xE9, 0x90, + 0x95, 0xF6, 0xF0, 0xA8, 0xAF, 0xBA, 0xF6, 0xE9, + 0x96, 0x8B, 0xF6, 0xE4, 0xA6, 0x95, 0xF6, 0xE9, + 0x96, 0xB7, 0xF6, 0xF0, 0xA8, 0xB5, 0xB7, 0xF6, + 0xE4, 0xA7, 0xA6, 0xF6, 0xE9, 0x9B, 0x83, 0xF6, + 0xE5, 0xB6, 0xB2, 0xF6, 0xE9, 0x9C, 0xA3, 0xF6, + 0xF0, 0xA9, 0x85, 0x85, 0xF6, 0xF0, 0xA9, 0x88, + 0x9A, 0xF6, 0xE4, 0xA9, 0xAE, 0xF6, 0xE4, 0xA9, + 0xB6, 0xF6, 0xE9, 0x9F, 0xA0, 0xF6, 0xF0, 0xA9, + 0x90, 0x8A, 0xF6, 0xE4, 0xAA, 0xB2, 0xF6, 0xF0, + 0xA9, 0x92, 0x96, 0xF6, 0xE9, 0xA0, 0x8B, 0xF6, + 0xE9, 0xA0, 0x8B, 0xF6, 0xE9, 0xA0, 0xA9, 0xF6, + 0xF0, 0xA9, 0x96, 0xB6, 0xF6, 0xE9, 0xA3, 0xA2, + 0xF6, 0xE4, 0xAC, 0xB3, 0xF6, 0xE9, 0xA4, 0xA9, + 0xF6, 0xE9, 0xA6, 0xA7, 0xF6, 0xE9, 0xA7, 0x82, + 0xF6, 0xE9, 0xA7, 0xBE, 0xF6, 0xE4, 0xAF, 0x8E, + 0xF6, 0xF0, 0xA9, 0xAC, 0xB0, 0xF6, 0xE9, 0xAC, + 
0x92, 0xF6, 0xE9, 0xB1, 0x80, 0xF6, 0xE9, 0xB3, + 0xBD, 0xF6, 0xE4, 0xB3, 0x8E, 0xF6, 0xE4, 0xB3, + 0xAD, 0xF6, 0xE9, 0xB5, 0xA7, 0xF6, 0xF0, 0xAA, + 0x83, 0x8E, 0xF6, 0xE4, 0xB3, 0xB8, 0xF6, 0xF0, + 0xAA, 0x84, 0x85, 0xF6, 0xF0, 0xAA, 0x88, 0x8E, + 0xF6, 0xF0, 0xAA, 0x8A, 0x91, 0xF6, 0xE9, 0xBA, + 0xBB, 0xF6, 0xE4, 0xB5, 0x96, 0xF6, 0xE9, 0xBB, + 0xB9, 0xF6, 0xE9, 0xBB, 0xBE, 0xF6, 0xE9, 0xBC, + 0x85, 0xF6, 0xE9, 0xBC, 0x8F, 0xF6, 0xE9, 0xBC, + 0x96, 0xF6, 0xE9, 0xBC, 0xBB, 0xF6, 0xF0, 0xAA, + 0x98, 0x80, + }, +}; + +static const uchar_t u8_case_common_b2_tbl[2][2][256] = { + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, 1, 2, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 3, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 4, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + { + { + 0, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, 
N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, 1, 2, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, 3, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + { + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + 4, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + N_, N_, N_, N_, N_, N_, N_, N_, + }, + + }, + +}; + +static const u8_displacement_t u8_tolower_b3_tbl[2][5][256] = { + { + { /* Third byte table 0. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 
}, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 }, + { 5, 321 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 6, 373 }, { 7, 439 }, + { 8, 465 }, { 9, 561 }, { 10, 593 }, { 11, 649 }, + { 12, 703 }, { 13, 749 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 14, 795 }, { 15, 891 }, { 16, 987 }, { 17, 1068 }, + { 18, 1155 }, { 19, 1245 }, { 20, 1299 }, { 21, 1386 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 
0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 22, 1443 }, { 23, 1448 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 24, 1496 }, { 25, 1526 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 26, 1574 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 27, 1652 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 0, 0 }, + { 1, 60 }, { 2, 123 }, { 3, 185 }, { 4, 257 }, + { 5, 321 }, { 6, 383 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 7, 401 }, { 8, 467 }, + { 9, 505 }, { 10, 601 }, { 11, 633 }, { 12, 689 }, + { 13, 753 }, { 14, 803 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 15, 849 }, { 16, 945 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 17, 963 }, { 18, 1059 }, { 19, 1155 }, { 20, 1236 }, + { 21, 1323 }, { 22, 1413 }, { 23, 1467 }, { 24, 1554 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 25, 1611 }, { 26, 1619 }, { 27, 1667 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 28, 1670 }, { 29, 1700 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 30, 1748 }, { 31, 1889 }, { 32, 1911 }, { 33, 2007 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 34, 2061 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 35, 2139 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_tolower_b4_tbl[2][36][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 49, 49, 51, 51, 53, 53, 55, + 55, 55, 57, 57, 59, 59, 61, 61, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 8, 10, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 48, 48, 50, 50, 52, 52, 54, + 54, 56, 58, 58, 60, 60, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 4, 6, 6, 8, + 10, 10, 12, 14, 16, 16, 16, 18, + 20, 22, 24, 24, 26, 28, 28, 30, + 32, 34, 34, 34, 34, 36, 38, 38, + 40, 42, 42, 44, 44, 46, 46, 48, + 50, 50, 52, 52, 52, 54, 54, 56, + 58, 58, 60, 62, 64, 64, 66, 66, + 68, 70, 70, 70, 70, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 4, + 6, 8, 8, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 46, 48, 50, 50, 52, 52, 54, + 56, 58, 58, 60, 60, 62, 62, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 4, 6, 8, 8, 10, 10, 12, + 14, 14, 16, 18, 20, 22, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 24, 24, 24, 24, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 54, 54, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 48, 48, 50, 50, 52, 52, 52, + 52, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 33, 36, 36, 39, 39, 42, 42, 45, + 45, 48, 48, 51, 51, 54, 54, 57, + 57, 60, 60, 63, 63, 66, 66, 69, + 69, 72, 72, 75, 75, 78, 78, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 81, 84, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 18, 21, 24, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 30, 33, 36, 39, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 2, 2, 3, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, + 5, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 6, 9, 12, 15, 18, 21, 24, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, 152, 152, 152, 152, 152, 152, 152, + 152, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, 60, 60, 60, 60, 60, 60, 60, + 60, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 49, 49, 51, 51, 53, 53, 55, + 55, 55, 57, 57, 59, 59, 61, 61, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 8, 10, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 48, 48, 50, 50, 52, 52, 54, + 54, 56, 58, 58, 60, 60, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 4, 6, 6, 8, + 10, 10, 12, 14, 16, 16, 16, 18, + 20, 22, 24, 24, 26, 28, 28, 30, + 32, 34, 34, 34, 34, 36, 38, 38, + 40, 42, 42, 44, 44, 46, 46, 48, + 50, 50, 52, 52, 52, 54, 54, 56, + 58, 58, 60, 62, 64, 64, 66, 66, + 68, 70, 70, 70, 70, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 4, + 6, 8, 8, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 28, 30, + 30, 32, 32, 34, 34, 36, 36, 38, + 38, 40, 40, 42, 42, 44, 44, 46, + 46, 46, 48, 50, 50, 52, 52, 54, + 56, 58, 58, 60, 60, 62, 62, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 52, 52, 52, + 52, 52, 52, 55, 57, 57, 59, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 6, 8, 10, + 10, 12, 12, 14, 14, 16, 16, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 4, 6, 8, 8, 10, 10, 12, + 14, 14, 16, 18, 20, 22, 24, 26, + 28, 30, 32, 34, 36, 38, 40, 42, + 44, 46, 48, 48, 50, 52, 54, 56, + 58, 60, 62, 64, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 24, 24, 24, 24, 26, 26, 26, + 28, 28, 30, 32, 32, 32, 34, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 66, 68, 70, 72, 74, 76, 78, + 80, 82, 84, 86, 88, 90, 92, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 54, 54, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 4, 6, 6, 8, 8, + 10, 10, 12, 12, 14, 14, 16, 16, + 16, 18, 18, 20, 20, 22, 22, 24, + 24, 26, 26, 28, 28, 30, 30, 32, + 32, 34, 34, 36, 36, 38, 38, 40, + 40, 42, 42, 44, 44, 46, 46, 48, + 48, 50, 50, 52, 52, 54, 54, 56, + 56, 58, 58, 60, 60, 62, 62, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 12, 12, 14, 14, 16, + 16, 18, 18, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 22, 24, 26, 28, 30, 32, + 34, 36, 38, 40, 42, 44, 46, 48, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, 46, 46, 46, 46, 46, 46, 46, + 46, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 33, + 33, 33, 33, 33, 33, 33, 33, 33, + 33, 36, 36, 39, 39, 42, 42, 45, + 45, 48, 48, 51, 51, 54, 54, 57, + 57, 60, 60, 63, 63, 66, 66, 69, + 69, 72, 72, 75, 75, 78, 78, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 81, + 81, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 81, 84, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 15, 15, + 15, 15, 15, 15, 15, 15, 15, 15, + 15, 18, 21, 24, 27, 27, 27, 27, + 27, 27, 27, 27, 27, 27, 27, 27, + 27, 30, 33, 36, 39, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, 57, 57, 57, 57, 57, 57, 57, + 57, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 2, + 2, 2, 2, 3, 5, 5, 5, 5, + 5, 5, 5, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 6, 9, 12, 15, 18, 21, 24, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 117, + 120, 123, 126, 129, 132, 135, 138, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, 141, 141, 141, 141, 141, 141, 141, + 141, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 5, 8, 10, 10, 10, + 13, 13, 16, 16, 19, 19, 19, 19, + 19, 19, 19, 19, 19, 19, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, 22, 22, 22, 22, 22, 22, 22, + 22, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 57, 57, 60, + 60, 63, 63, 66, 66, 69, 69, 72, + 72, 75, 75, 78, 78, 81, 81, 84, + 84, 87, 87, 90, 90, 93, 93, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 3, 6, 6, 9, 9, 12, + 12, 15, 15, 18, 18, 21, 21, 24, + 24, 27, 27, 30, 30, 33, 33, 36, + 36, 39, 39, 42, 42, 45, 45, 48, + 48, 51, 51, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 100, 104, 108, 112, 116, 120, 124, + 128, 132, 136, 140, 144, 148, 152, 156, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, 160, 160, 160, 160, 160, 160, 160, + 160, + }, + }, +}; + +static const uchar_t u8_tolower_final_tbl[2][2299] = { + { + 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, + 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7, + 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB, + 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF, + 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3, + 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8, + 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC, + 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83, + 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B, + 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93, + 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B, + 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3, + 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB, + 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4, + 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4, + 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5, + 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5, + 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5, + 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5, + 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5, + 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5, + 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5, + 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5, + 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9, + 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6, + 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6, + 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9, + 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9, + 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 
0xA5, 0xCA, + 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA, + 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6, + 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6, + 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7, + 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7, + 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7, + 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7, + 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7, + 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7, + 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6, + 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7, + 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8, + 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8, + 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8, + 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8, + 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8, + 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8, + 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xCE, 0xAC, 0xCE, + 0xAD, 0xCE, 0xAE, 0xCE, 0xAF, 0xCF, 0x8C, 0xCF, + 0x8D, 0xCF, 0x8E, 0xCE, 0xB1, 0xCE, 0xB2, 0xCE, + 0xB3, 0xCE, 0xB4, 0xCE, 0xB5, 0xCE, 0xB6, 0xCE, + 0xB7, 0xCE, 0xB8, 0xCE, 0xB9, 0xCE, 0xBA, 0xCE, + 0xBB, 0xCE, 0xBC, 0xCE, 0xBD, 0xCE, 0xBE, 0xCE, + 0xBF, 0xCF, 0x80, 0xCF, 0x81, 0xCF, 0x83, 0xCF, + 0x84, 0xCF, 0x85, 0xCF, 0x86, 0xCF, 0x87, 0xCF, + 0x88, 0xCF, 0x89, 0xCF, 0x8A, 0xCF, 0x8B, 0xCF, + 0x99, 0xCF, 0x9B, 0xCF, 0x9D, 0xCF, 0x9F, 0xCF, + 0xA1, 0xCF, 0xA3, 0xCF, 0xA5, 0xCF, 0xA7, 0xCF, + 0xA9, 0xCF, 0xAB, 0xCF, 0xAD, 0xCF, 0xAF, 0xCE, + 0xB8, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1, + 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1, + 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, + 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1, + 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, + 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0, + 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, + 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, + 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1, + 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1, + 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1, + 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, + 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1, + 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1, + 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1, + 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1, + 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2, + 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2, + 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2, + 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2, + 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, + 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2, + 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2, + 0xBF, 0xD3, 0x82, 0xD3, 0x84, 0xD3, 0x86, 0xD3, + 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, 0x8E, 0xD3, + 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, 0x97, 0xD3, + 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, 0x9F, 0xD3, + 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, 0xA7, 0xD3, + 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, 0xAF, 0xD3, + 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, 0xB9, 0xD4, + 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, 0x87, 0xD4, + 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, 0x8F, 0xD5, + 0xA1, 0xD5, 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, + 0xA5, 0xD5, 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, + 0xA9, 0xD5, 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, + 0xAD, 0xD5, 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, + 0xB1, 0xD5, 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, + 0xB5, 0xD5, 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, + 0xB9, 0xD5, 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, + 0xBD, 0xD5, 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, + 0x81, 0xD6, 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, + 0x85, 0xD6, 0x86, 0xE1, 0xB8, 0x81, 
0xE1, 0xB8, + 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1, + 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D, + 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8, + 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1, + 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D, + 0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8, + 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1, + 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD, + 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8, + 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1, + 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD, + 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9, + 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1, + 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D, + 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9, + 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1, + 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D, + 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9, + 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1, + 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD, + 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9, + 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1, + 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD, + 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA, + 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1, + 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D, + 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA, + 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1, + 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7, + 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA, + 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1, + 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7, + 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA, + 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1, + 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87, + 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB, + 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1, + 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97, + 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB, + 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1, + 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7, + 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB, + 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1, + 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7, + 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC, + 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1, + 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86, + 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC, + 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1, + 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0, + 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC, + 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1, + 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0, + 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC, + 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1, + 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80, + 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD, + 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1, + 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95, + 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD, + 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1, + 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6, + 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE, + 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1, + 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86, + 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE, + 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1, + 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96, + 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE, + 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1, + 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 
0xBE, 0xA6, + 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE, + 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1, + 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3, + 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF, + 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1, + 0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0, + 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD, + 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1, + 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD, + 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5, + 0xE2, 0x85, 0xB0, 0xE2, 0x85, 0xB1, 0xE2, 0x85, + 0xB2, 0xE2, 0x85, 0xB3, 0xE2, 0x85, 0xB4, 0xE2, + 0x85, 0xB5, 0xE2, 0x85, 0xB6, 0xE2, 0x85, 0xB7, + 0xE2, 0x85, 0xB8, 0xE2, 0x85, 0xB9, 0xE2, 0x85, + 0xBA, 0xE2, 0x85, 0xBB, 0xE2, 0x85, 0xBC, 0xE2, + 0x85, 0xBD, 0xE2, 0x85, 0xBE, 0xE2, 0x85, 0xBF, + 0xE2, 0x93, 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, + 0x92, 0xE2, 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, + 0x93, 0x95, 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, + 0xE2, 0x93, 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, + 0x9A, 0xE2, 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, + 0x93, 0x9D, 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, + 0xE2, 0x93, 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, + 0xA2, 0xE2, 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, + 0x93, 0xA5, 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, + 0xE2, 0x93, 0xA8, 0xE2, 0x93, 0xA9, 0xEF, 0xBD, + 0x81, 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, + 0xBD, 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, + 0xEF, 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, + 0x89, 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, + 0xBD, 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, + 0xEF, 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, + 0x91, 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, + 0xBD, 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, + 0xEF, 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, + 0x99, 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, + 0xF0, 0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, + 0xF0, 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, + 0xF0, 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, + 0xF0, 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, + 0xF0, 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, + 0xF0, 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, + 0xF0, 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, + 0xF0, 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8, + 0xF0, 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, + 0xF0, 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, + 0xF0, 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, + 0xF0, 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, + 0xF0, 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, + 0xF0, 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, + 0xF0, 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, + 0xF0, 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, + 0xF0, 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, + 0xF0, 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, + 0xF0, 0x90, 0x91, 0x8D, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, + }, + { + 0xC3, 0xA0, 0xC3, 0xA1, 0xC3, 0xA2, 0xC3, 0xA3, + 0xC3, 0xA4, 0xC3, 0xA5, 0xC3, 0xA6, 0xC3, 0xA7, + 0xC3, 0xA8, 0xC3, 0xA9, 0xC3, 0xAA, 0xC3, 0xAB, + 0xC3, 0xAC, 0xC3, 0xAD, 0xC3, 0xAE, 0xC3, 0xAF, + 0xC3, 0xB0, 0xC3, 0xB1, 0xC3, 0xB2, 0xC3, 0xB3, + 0xC3, 0xB4, 0xC3, 0xB5, 0xC3, 0xB6, 0xC3, 0xB8, + 0xC3, 0xB9, 0xC3, 0xBA, 0xC3, 0xBB, 0xC3, 0xBC, + 0xC3, 0xBD, 0xC3, 0xBE, 0xC4, 0x81, 0xC4, 0x83, + 0xC4, 0x85, 0xC4, 0x87, 0xC4, 0x89, 0xC4, 0x8B, + 0xC4, 0x8D, 0xC4, 0x8F, 0xC4, 0x91, 0xC4, 0x93, + 0xC4, 0x95, 0xC4, 0x97, 0xC4, 0x99, 0xC4, 0x9B, + 0xC4, 0x9D, 0xC4, 0x9F, 0xC4, 0xA1, 0xC4, 0xA3, + 0xC4, 0xA5, 0xC4, 0xA7, 0xC4, 0xA9, 0xC4, 0xAB, + 0xC4, 0xAD, 0xC4, 0xAF, 0x69, 0xC4, 0xB3, 0xC4, + 0xB5, 0xC4, 0xB7, 0xC4, 0xBA, 0xC4, 0xBC, 0xC4, + 0xBE, 0xC5, 0x80, 0xC5, 0x82, 0xC5, 0x84, 0xC5, + 0x86, 0xC5, 0x88, 0xC5, 0x8B, 0xC5, 0x8D, 0xC5, + 0x8F, 0xC5, 0x91, 0xC5, 0x93, 0xC5, 0x95, 0xC5, + 0x97, 0xC5, 0x99, 0xC5, 0x9B, 0xC5, 0x9D, 0xC5, + 0x9F, 0xC5, 0xA1, 0xC5, 0xA3, 0xC5, 0xA5, 0xC5, + 0xA7, 0xC5, 0xA9, 0xC5, 0xAB, 0xC5, 0xAD, 0xC5, + 0xAF, 0xC5, 0xB1, 0xC5, 0xB3, 0xC5, 0xB5, 0xC5, + 0xB7, 0xC3, 0xBF, 0xC5, 0xBA, 0xC5, 0xBC, 0xC5, + 0xBE, 0xC9, 0x93, 0xC6, 0x83, 0xC6, 0x85, 0xC9, + 0x94, 0xC6, 0x88, 0xC9, 0x96, 0xC9, 0x97, 0xC6, + 0x8C, 0xC7, 0x9D, 0xC9, 0x99, 0xC9, 0x9B, 0xC6, + 0x92, 0xC9, 0xA0, 0xC9, 0xA3, 0xC9, 0xA9, 0xC9, + 0xA8, 0xC6, 0x99, 0xC9, 0xAF, 0xC9, 0xB2, 0xC9, + 0xB5, 0xC6, 0xA1, 0xC6, 0xA3, 0xC6, 0xA5, 0xCA, + 0x80, 0xC6, 0xA8, 0xCA, 0x83, 0xC6, 0xAD, 0xCA, + 0x88, 0xC6, 0xB0, 0xCA, 0x8A, 0xCA, 0x8B, 0xC6, + 0xB4, 0xC6, 0xB6, 0xCA, 0x92, 0xC6, 0xB9, 0xC6, + 0xBD, 0xC7, 0x86, 0xC7, 0x86, 0xC7, 0x89, 0xC7, + 0x89, 0xC7, 0x8C, 0xC7, 0x8C, 0xC7, 0x8E, 0xC7, + 0x90, 0xC7, 0x92, 0xC7, 0x94, 0xC7, 0x96, 0xC7, + 0x98, 0xC7, 0x9A, 0xC7, 0x9C, 0xC7, 0x9F, 0xC7, + 0xA1, 0xC7, 0xA3, 0xC7, 0xA5, 0xC7, 0xA7, 0xC7, + 0xA9, 0xC7, 0xAB, 0xC7, 0xAD, 0xC7, 0xAF, 0xC7, + 0xB3, 0xC7, 0xB3, 0xC7, 0xB5, 0xC6, 0x95, 0xC6, + 0xBF, 0xC7, 0xB9, 0xC7, 0xBB, 0xC7, 0xBD, 0xC7, + 0xBF, 0xC8, 0x81, 0xC8, 0x83, 0xC8, 0x85, 0xC8, + 0x87, 0xC8, 0x89, 0xC8, 0x8B, 0xC8, 0x8D, 0xC8, + 0x8F, 0xC8, 0x91, 0xC8, 0x93, 0xC8, 0x95, 0xC8, + 0x97, 0xC8, 0x99, 0xC8, 0x9B, 0xC8, 0x9D, 0xC8, + 0x9F, 0xC6, 0x9E, 0xC8, 0xA3, 0xC8, 0xA5, 0xC8, + 0xA7, 0xC8, 0xA9, 0xC8, 0xAB, 0xC8, 0xAD, 0xC8, + 0xAF, 0xC8, 0xB1, 0xC8, 0xB3, 0xE2, 0xB1, 0xA5, + 0xC8, 0xBC, 0xC6, 0x9A, 0xE2, 0xB1, 0xA6, 0xC9, + 0x82, 0xC6, 0x80, 0xCA, 0x89, 0xCA, 0x8C, 0xC9, + 0x87, 0xC9, 0x89, 0xC9, 0x8B, 0xC9, 0x8D, 0xC9, + 0x8F, 0xCE, 0xAC, 0xCE, 0xAD, 0xCE, 0xAE, 0xCE, + 0xAF, 0xCF, 0x8C, 0xCF, 0x8D, 0xCF, 0x8E, 0xCE, + 0xB1, 0xCE, 0xB2, 0xCE, 0xB3, 0xCE, 0xB4, 0xCE, + 0xB5, 0xCE, 0xB6, 0xCE, 0xB7, 0xCE, 0xB8, 0xCE, + 0xB9, 0xCE, 0xBA, 0xCE, 0xBB, 0xCE, 0xBC, 0xCE, + 
0xBD, 0xCE, 0xBE, 0xCE, 0xBF, 0xCF, 0x80, 0xCF, + 0x81, 0xCF, 0x83, 0xCF, 0x84, 0xCF, 0x85, 0xCF, + 0x86, 0xCF, 0x87, 0xCF, 0x88, 0xCF, 0x89, 0xCF, + 0x8A, 0xCF, 0x8B, 0xCF, 0x99, 0xCF, 0x9B, 0xCF, + 0x9D, 0xCF, 0x9F, 0xCF, 0xA1, 0xCF, 0xA3, 0xCF, + 0xA5, 0xCF, 0xA7, 0xCF, 0xA9, 0xCF, 0xAB, 0xCF, + 0xAD, 0xCF, 0xAF, 0xCE, 0xB8, 0xCF, 0xB8, 0xCF, + 0xB2, 0xCF, 0xBB, 0xCD, 0xBB, 0xCD, 0xBC, 0xCD, + 0xBD, 0xD1, 0x90, 0xD1, 0x91, 0xD1, 0x92, 0xD1, + 0x93, 0xD1, 0x94, 0xD1, 0x95, 0xD1, 0x96, 0xD1, + 0x97, 0xD1, 0x98, 0xD1, 0x99, 0xD1, 0x9A, 0xD1, + 0x9B, 0xD1, 0x9C, 0xD1, 0x9D, 0xD1, 0x9E, 0xD1, + 0x9F, 0xD0, 0xB0, 0xD0, 0xB1, 0xD0, 0xB2, 0xD0, + 0xB3, 0xD0, 0xB4, 0xD0, 0xB5, 0xD0, 0xB6, 0xD0, + 0xB7, 0xD0, 0xB8, 0xD0, 0xB9, 0xD0, 0xBA, 0xD0, + 0xBB, 0xD0, 0xBC, 0xD0, 0xBD, 0xD0, 0xBE, 0xD0, + 0xBF, 0xD1, 0x80, 0xD1, 0x81, 0xD1, 0x82, 0xD1, + 0x83, 0xD1, 0x84, 0xD1, 0x85, 0xD1, 0x86, 0xD1, + 0x87, 0xD1, 0x88, 0xD1, 0x89, 0xD1, 0x8A, 0xD1, + 0x8B, 0xD1, 0x8C, 0xD1, 0x8D, 0xD1, 0x8E, 0xD1, + 0x8F, 0xD1, 0xA1, 0xD1, 0xA3, 0xD1, 0xA5, 0xD1, + 0xA7, 0xD1, 0xA9, 0xD1, 0xAB, 0xD1, 0xAD, 0xD1, + 0xAF, 0xD1, 0xB1, 0xD1, 0xB3, 0xD1, 0xB5, 0xD1, + 0xB7, 0xD1, 0xB9, 0xD1, 0xBB, 0xD1, 0xBD, 0xD1, + 0xBF, 0xD2, 0x81, 0xD2, 0x8B, 0xD2, 0x8D, 0xD2, + 0x8F, 0xD2, 0x91, 0xD2, 0x93, 0xD2, 0x95, 0xD2, + 0x97, 0xD2, 0x99, 0xD2, 0x9B, 0xD2, 0x9D, 0xD2, + 0x9F, 0xD2, 0xA1, 0xD2, 0xA3, 0xD2, 0xA5, 0xD2, + 0xA7, 0xD2, 0xA9, 0xD2, 0xAB, 0xD2, 0xAD, 0xD2, + 0xAF, 0xD2, 0xB1, 0xD2, 0xB3, 0xD2, 0xB5, 0xD2, + 0xB7, 0xD2, 0xB9, 0xD2, 0xBB, 0xD2, 0xBD, 0xD2, + 0xBF, 0xD3, 0x8F, 0xD3, 0x82, 0xD3, 0x84, 0xD3, + 0x86, 0xD3, 0x88, 0xD3, 0x8A, 0xD3, 0x8C, 0xD3, + 0x8E, 0xD3, 0x91, 0xD3, 0x93, 0xD3, 0x95, 0xD3, + 0x97, 0xD3, 0x99, 0xD3, 0x9B, 0xD3, 0x9D, 0xD3, + 0x9F, 0xD3, 0xA1, 0xD3, 0xA3, 0xD3, 0xA5, 0xD3, + 0xA7, 0xD3, 0xA9, 0xD3, 0xAB, 0xD3, 0xAD, 0xD3, + 0xAF, 0xD3, 0xB1, 0xD3, 0xB3, 0xD3, 0xB5, 0xD3, + 0xB7, 0xD3, 0xB9, 0xD3, 0xBB, 0xD3, 0xBD, 0xD3, + 0xBF, 0xD4, 0x81, 0xD4, 0x83, 0xD4, 0x85, 0xD4, + 0x87, 0xD4, 0x89, 0xD4, 0x8B, 0xD4, 0x8D, 0xD4, + 0x8F, 0xD4, 0x91, 0xD4, 0x93, 0xD5, 0xA1, 0xD5, + 0xA2, 0xD5, 0xA3, 0xD5, 0xA4, 0xD5, 0xA5, 0xD5, + 0xA6, 0xD5, 0xA7, 0xD5, 0xA8, 0xD5, 0xA9, 0xD5, + 0xAA, 0xD5, 0xAB, 0xD5, 0xAC, 0xD5, 0xAD, 0xD5, + 0xAE, 0xD5, 0xAF, 0xD5, 0xB0, 0xD5, 0xB1, 0xD5, + 0xB2, 0xD5, 0xB3, 0xD5, 0xB4, 0xD5, 0xB5, 0xD5, + 0xB6, 0xD5, 0xB7, 0xD5, 0xB8, 0xD5, 0xB9, 0xD5, + 0xBA, 0xD5, 0xBB, 0xD5, 0xBC, 0xD5, 0xBD, 0xD5, + 0xBE, 0xD5, 0xBF, 0xD6, 0x80, 0xD6, 0x81, 0xD6, + 0x82, 0xD6, 0x83, 0xD6, 0x84, 0xD6, 0x85, 0xD6, + 0x86, 0xE2, 0xB4, 0x80, 0xE2, 0xB4, 0x81, 0xE2, + 0xB4, 0x82, 0xE2, 0xB4, 0x83, 0xE2, 0xB4, 0x84, + 0xE2, 0xB4, 0x85, 0xE2, 0xB4, 0x86, 0xE2, 0xB4, + 0x87, 0xE2, 0xB4, 0x88, 0xE2, 0xB4, 0x89, 0xE2, + 0xB4, 0x8A, 0xE2, 0xB4, 0x8B, 0xE2, 0xB4, 0x8C, + 0xE2, 0xB4, 0x8D, 0xE2, 0xB4, 0x8E, 0xE2, 0xB4, + 0x8F, 0xE2, 0xB4, 0x90, 0xE2, 0xB4, 0x91, 0xE2, + 0xB4, 0x92, 0xE2, 0xB4, 0x93, 0xE2, 0xB4, 0x94, + 0xE2, 0xB4, 0x95, 0xE2, 0xB4, 0x96, 0xE2, 0xB4, + 0x97, 0xE2, 0xB4, 0x98, 0xE2, 0xB4, 0x99, 0xE2, + 0xB4, 0x9A, 0xE2, 0xB4, 0x9B, 0xE2, 0xB4, 0x9C, + 0xE2, 0xB4, 0x9D, 0xE2, 0xB4, 0x9E, 0xE2, 0xB4, + 0x9F, 0xE2, 0xB4, 0xA0, 0xE2, 0xB4, 0xA1, 0xE2, + 0xB4, 0xA2, 0xE2, 0xB4, 0xA3, 0xE2, 0xB4, 0xA4, + 0xE2, 0xB4, 0xA5, 0xE1, 0xB8, 0x81, 0xE1, 0xB8, + 0x83, 0xE1, 0xB8, 0x85, 0xE1, 0xB8, 0x87, 0xE1, + 0xB8, 0x89, 0xE1, 0xB8, 0x8B, 0xE1, 0xB8, 0x8D, + 0xE1, 0xB8, 0x8F, 0xE1, 0xB8, 0x91, 0xE1, 0xB8, + 0x93, 0xE1, 0xB8, 0x95, 0xE1, 0xB8, 0x97, 0xE1, + 0xB8, 0x99, 0xE1, 0xB8, 0x9B, 0xE1, 0xB8, 0x9D, + 
0xE1, 0xB8, 0x9F, 0xE1, 0xB8, 0xA1, 0xE1, 0xB8, + 0xA3, 0xE1, 0xB8, 0xA5, 0xE1, 0xB8, 0xA7, 0xE1, + 0xB8, 0xA9, 0xE1, 0xB8, 0xAB, 0xE1, 0xB8, 0xAD, + 0xE1, 0xB8, 0xAF, 0xE1, 0xB8, 0xB1, 0xE1, 0xB8, + 0xB3, 0xE1, 0xB8, 0xB5, 0xE1, 0xB8, 0xB7, 0xE1, + 0xB8, 0xB9, 0xE1, 0xB8, 0xBB, 0xE1, 0xB8, 0xBD, + 0xE1, 0xB8, 0xBF, 0xE1, 0xB9, 0x81, 0xE1, 0xB9, + 0x83, 0xE1, 0xB9, 0x85, 0xE1, 0xB9, 0x87, 0xE1, + 0xB9, 0x89, 0xE1, 0xB9, 0x8B, 0xE1, 0xB9, 0x8D, + 0xE1, 0xB9, 0x8F, 0xE1, 0xB9, 0x91, 0xE1, 0xB9, + 0x93, 0xE1, 0xB9, 0x95, 0xE1, 0xB9, 0x97, 0xE1, + 0xB9, 0x99, 0xE1, 0xB9, 0x9B, 0xE1, 0xB9, 0x9D, + 0xE1, 0xB9, 0x9F, 0xE1, 0xB9, 0xA1, 0xE1, 0xB9, + 0xA3, 0xE1, 0xB9, 0xA5, 0xE1, 0xB9, 0xA7, 0xE1, + 0xB9, 0xA9, 0xE1, 0xB9, 0xAB, 0xE1, 0xB9, 0xAD, + 0xE1, 0xB9, 0xAF, 0xE1, 0xB9, 0xB1, 0xE1, 0xB9, + 0xB3, 0xE1, 0xB9, 0xB5, 0xE1, 0xB9, 0xB7, 0xE1, + 0xB9, 0xB9, 0xE1, 0xB9, 0xBB, 0xE1, 0xB9, 0xBD, + 0xE1, 0xB9, 0xBF, 0xE1, 0xBA, 0x81, 0xE1, 0xBA, + 0x83, 0xE1, 0xBA, 0x85, 0xE1, 0xBA, 0x87, 0xE1, + 0xBA, 0x89, 0xE1, 0xBA, 0x8B, 0xE1, 0xBA, 0x8D, + 0xE1, 0xBA, 0x8F, 0xE1, 0xBA, 0x91, 0xE1, 0xBA, + 0x93, 0xE1, 0xBA, 0x95, 0xE1, 0xBA, 0xA1, 0xE1, + 0xBA, 0xA3, 0xE1, 0xBA, 0xA5, 0xE1, 0xBA, 0xA7, + 0xE1, 0xBA, 0xA9, 0xE1, 0xBA, 0xAB, 0xE1, 0xBA, + 0xAD, 0xE1, 0xBA, 0xAF, 0xE1, 0xBA, 0xB1, 0xE1, + 0xBA, 0xB3, 0xE1, 0xBA, 0xB5, 0xE1, 0xBA, 0xB7, + 0xE1, 0xBA, 0xB9, 0xE1, 0xBA, 0xBB, 0xE1, 0xBA, + 0xBD, 0xE1, 0xBA, 0xBF, 0xE1, 0xBB, 0x81, 0xE1, + 0xBB, 0x83, 0xE1, 0xBB, 0x85, 0xE1, 0xBB, 0x87, + 0xE1, 0xBB, 0x89, 0xE1, 0xBB, 0x8B, 0xE1, 0xBB, + 0x8D, 0xE1, 0xBB, 0x8F, 0xE1, 0xBB, 0x91, 0xE1, + 0xBB, 0x93, 0xE1, 0xBB, 0x95, 0xE1, 0xBB, 0x97, + 0xE1, 0xBB, 0x99, 0xE1, 0xBB, 0x9B, 0xE1, 0xBB, + 0x9D, 0xE1, 0xBB, 0x9F, 0xE1, 0xBB, 0xA1, 0xE1, + 0xBB, 0xA3, 0xE1, 0xBB, 0xA5, 0xE1, 0xBB, 0xA7, + 0xE1, 0xBB, 0xA9, 0xE1, 0xBB, 0xAB, 0xE1, 0xBB, + 0xAD, 0xE1, 0xBB, 0xAF, 0xE1, 0xBB, 0xB1, 0xE1, + 0xBB, 0xB3, 0xE1, 0xBB, 0xB5, 0xE1, 0xBB, 0xB7, + 0xE1, 0xBB, 0xB9, 0xE1, 0xBC, 0x80, 0xE1, 0xBC, + 0x81, 0xE1, 0xBC, 0x82, 0xE1, 0xBC, 0x83, 0xE1, + 0xBC, 0x84, 0xE1, 0xBC, 0x85, 0xE1, 0xBC, 0x86, + 0xE1, 0xBC, 0x87, 0xE1, 0xBC, 0x90, 0xE1, 0xBC, + 0x91, 0xE1, 0xBC, 0x92, 0xE1, 0xBC, 0x93, 0xE1, + 0xBC, 0x94, 0xE1, 0xBC, 0x95, 0xE1, 0xBC, 0xA0, + 0xE1, 0xBC, 0xA1, 0xE1, 0xBC, 0xA2, 0xE1, 0xBC, + 0xA3, 0xE1, 0xBC, 0xA4, 0xE1, 0xBC, 0xA5, 0xE1, + 0xBC, 0xA6, 0xE1, 0xBC, 0xA7, 0xE1, 0xBC, 0xB0, + 0xE1, 0xBC, 0xB1, 0xE1, 0xBC, 0xB2, 0xE1, 0xBC, + 0xB3, 0xE1, 0xBC, 0xB4, 0xE1, 0xBC, 0xB5, 0xE1, + 0xBC, 0xB6, 0xE1, 0xBC, 0xB7, 0xE1, 0xBD, 0x80, + 0xE1, 0xBD, 0x81, 0xE1, 0xBD, 0x82, 0xE1, 0xBD, + 0x83, 0xE1, 0xBD, 0x84, 0xE1, 0xBD, 0x85, 0xE1, + 0xBD, 0x91, 0xE1, 0xBD, 0x93, 0xE1, 0xBD, 0x95, + 0xE1, 0xBD, 0x97, 0xE1, 0xBD, 0xA0, 0xE1, 0xBD, + 0xA1, 0xE1, 0xBD, 0xA2, 0xE1, 0xBD, 0xA3, 0xE1, + 0xBD, 0xA4, 0xE1, 0xBD, 0xA5, 0xE1, 0xBD, 0xA6, + 0xE1, 0xBD, 0xA7, 0xE1, 0xBE, 0x80, 0xE1, 0xBE, + 0x81, 0xE1, 0xBE, 0x82, 0xE1, 0xBE, 0x83, 0xE1, + 0xBE, 0x84, 0xE1, 0xBE, 0x85, 0xE1, 0xBE, 0x86, + 0xE1, 0xBE, 0x87, 0xE1, 0xBE, 0x90, 0xE1, 0xBE, + 0x91, 0xE1, 0xBE, 0x92, 0xE1, 0xBE, 0x93, 0xE1, + 0xBE, 0x94, 0xE1, 0xBE, 0x95, 0xE1, 0xBE, 0x96, + 0xE1, 0xBE, 0x97, 0xE1, 0xBE, 0xA0, 0xE1, 0xBE, + 0xA1, 0xE1, 0xBE, 0xA2, 0xE1, 0xBE, 0xA3, 0xE1, + 0xBE, 0xA4, 0xE1, 0xBE, 0xA5, 0xE1, 0xBE, 0xA6, + 0xE1, 0xBE, 0xA7, 0xE1, 0xBE, 0xB0, 0xE1, 0xBE, + 0xB1, 0xE1, 0xBD, 0xB0, 0xE1, 0xBD, 0xB1, 0xE1, + 0xBE, 0xB3, 0xE1, 0xBD, 0xB2, 0xE1, 0xBD, 0xB3, + 0xE1, 0xBD, 0xB4, 0xE1, 0xBD, 0xB5, 0xE1, 0xBF, + 0x83, 0xE1, 0xBF, 0x90, 0xE1, 0xBF, 0x91, 0xE1, + 
0xBD, 0xB6, 0xE1, 0xBD, 0xB7, 0xE1, 0xBF, 0xA0, + 0xE1, 0xBF, 0xA1, 0xE1, 0xBD, 0xBA, 0xE1, 0xBD, + 0xBB, 0xE1, 0xBF, 0xA5, 0xE1, 0xBD, 0xB8, 0xE1, + 0xBD, 0xB9, 0xE1, 0xBD, 0xBC, 0xE1, 0xBD, 0xBD, + 0xE1, 0xBF, 0xB3, 0xCF, 0x89, 0x6B, 0xC3, 0xA5, + 0xE2, 0x85, 0x8E, 0xE2, 0x85, 0xB0, 0xE2, 0x85, + 0xB1, 0xE2, 0x85, 0xB2, 0xE2, 0x85, 0xB3, 0xE2, + 0x85, 0xB4, 0xE2, 0x85, 0xB5, 0xE2, 0x85, 0xB6, + 0xE2, 0x85, 0xB7, 0xE2, 0x85, 0xB8, 0xE2, 0x85, + 0xB9, 0xE2, 0x85, 0xBA, 0xE2, 0x85, 0xBB, 0xE2, + 0x85, 0xBC, 0xE2, 0x85, 0xBD, 0xE2, 0x85, 0xBE, + 0xE2, 0x85, 0xBF, 0xE2, 0x86, 0x84, 0xE2, 0x93, + 0x90, 0xE2, 0x93, 0x91, 0xE2, 0x93, 0x92, 0xE2, + 0x93, 0x93, 0xE2, 0x93, 0x94, 0xE2, 0x93, 0x95, + 0xE2, 0x93, 0x96, 0xE2, 0x93, 0x97, 0xE2, 0x93, + 0x98, 0xE2, 0x93, 0x99, 0xE2, 0x93, 0x9A, 0xE2, + 0x93, 0x9B, 0xE2, 0x93, 0x9C, 0xE2, 0x93, 0x9D, + 0xE2, 0x93, 0x9E, 0xE2, 0x93, 0x9F, 0xE2, 0x93, + 0xA0, 0xE2, 0x93, 0xA1, 0xE2, 0x93, 0xA2, 0xE2, + 0x93, 0xA3, 0xE2, 0x93, 0xA4, 0xE2, 0x93, 0xA5, + 0xE2, 0x93, 0xA6, 0xE2, 0x93, 0xA7, 0xE2, 0x93, + 0xA8, 0xE2, 0x93, 0xA9, 0xE2, 0xB0, 0xB0, 0xE2, + 0xB0, 0xB1, 0xE2, 0xB0, 0xB2, 0xE2, 0xB0, 0xB3, + 0xE2, 0xB0, 0xB4, 0xE2, 0xB0, 0xB5, 0xE2, 0xB0, + 0xB6, 0xE2, 0xB0, 0xB7, 0xE2, 0xB0, 0xB8, 0xE2, + 0xB0, 0xB9, 0xE2, 0xB0, 0xBA, 0xE2, 0xB0, 0xBB, + 0xE2, 0xB0, 0xBC, 0xE2, 0xB0, 0xBD, 0xE2, 0xB0, + 0xBE, 0xE2, 0xB0, 0xBF, 0xE2, 0xB1, 0x80, 0xE2, + 0xB1, 0x81, 0xE2, 0xB1, 0x82, 0xE2, 0xB1, 0x83, + 0xE2, 0xB1, 0x84, 0xE2, 0xB1, 0x85, 0xE2, 0xB1, + 0x86, 0xE2, 0xB1, 0x87, 0xE2, 0xB1, 0x88, 0xE2, + 0xB1, 0x89, 0xE2, 0xB1, 0x8A, 0xE2, 0xB1, 0x8B, + 0xE2, 0xB1, 0x8C, 0xE2, 0xB1, 0x8D, 0xE2, 0xB1, + 0x8E, 0xE2, 0xB1, 0x8F, 0xE2, 0xB1, 0x90, 0xE2, + 0xB1, 0x91, 0xE2, 0xB1, 0x92, 0xE2, 0xB1, 0x93, + 0xE2, 0xB1, 0x94, 0xE2, 0xB1, 0x95, 0xE2, 0xB1, + 0x96, 0xE2, 0xB1, 0x97, 0xE2, 0xB1, 0x98, 0xE2, + 0xB1, 0x99, 0xE2, 0xB1, 0x9A, 0xE2, 0xB1, 0x9B, + 0xE2, 0xB1, 0x9C, 0xE2, 0xB1, 0x9D, 0xE2, 0xB1, + 0x9E, 0xE2, 0xB1, 0xA1, 0xC9, 0xAB, 0xE1, 0xB5, + 0xBD, 0xC9, 0xBD, 0xE2, 0xB1, 0xA8, 0xE2, 0xB1, + 0xAA, 0xE2, 0xB1, 0xAC, 0xE2, 0xB1, 0xB6, 0xE2, + 0xB2, 0x81, 0xE2, 0xB2, 0x83, 0xE2, 0xB2, 0x85, + 0xE2, 0xB2, 0x87, 0xE2, 0xB2, 0x89, 0xE2, 0xB2, + 0x8B, 0xE2, 0xB2, 0x8D, 0xE2, 0xB2, 0x8F, 0xE2, + 0xB2, 0x91, 0xE2, 0xB2, 0x93, 0xE2, 0xB2, 0x95, + 0xE2, 0xB2, 0x97, 0xE2, 0xB2, 0x99, 0xE2, 0xB2, + 0x9B, 0xE2, 0xB2, 0x9D, 0xE2, 0xB2, 0x9F, 0xE2, + 0xB2, 0xA1, 0xE2, 0xB2, 0xA3, 0xE2, 0xB2, 0xA5, + 0xE2, 0xB2, 0xA7, 0xE2, 0xB2, 0xA9, 0xE2, 0xB2, + 0xAB, 0xE2, 0xB2, 0xAD, 0xE2, 0xB2, 0xAF, 0xE2, + 0xB2, 0xB1, 0xE2, 0xB2, 0xB3, 0xE2, 0xB2, 0xB5, + 0xE2, 0xB2, 0xB7, 0xE2, 0xB2, 0xB9, 0xE2, 0xB2, + 0xBB, 0xE2, 0xB2, 0xBD, 0xE2, 0xB2, 0xBF, 0xE2, + 0xB3, 0x81, 0xE2, 0xB3, 0x83, 0xE2, 0xB3, 0x85, + 0xE2, 0xB3, 0x87, 0xE2, 0xB3, 0x89, 0xE2, 0xB3, + 0x8B, 0xE2, 0xB3, 0x8D, 0xE2, 0xB3, 0x8F, 0xE2, + 0xB3, 0x91, 0xE2, 0xB3, 0x93, 0xE2, 0xB3, 0x95, + 0xE2, 0xB3, 0x97, 0xE2, 0xB3, 0x99, 0xE2, 0xB3, + 0x9B, 0xE2, 0xB3, 0x9D, 0xE2, 0xB3, 0x9F, 0xE2, + 0xB3, 0xA1, 0xE2, 0xB3, 0xA3, 0xEF, 0xBD, 0x81, + 0xEF, 0xBD, 0x82, 0xEF, 0xBD, 0x83, 0xEF, 0xBD, + 0x84, 0xEF, 0xBD, 0x85, 0xEF, 0xBD, 0x86, 0xEF, + 0xBD, 0x87, 0xEF, 0xBD, 0x88, 0xEF, 0xBD, 0x89, + 0xEF, 0xBD, 0x8A, 0xEF, 0xBD, 0x8B, 0xEF, 0xBD, + 0x8C, 0xEF, 0xBD, 0x8D, 0xEF, 0xBD, 0x8E, 0xEF, + 0xBD, 0x8F, 0xEF, 0xBD, 0x90, 0xEF, 0xBD, 0x91, + 0xEF, 0xBD, 0x92, 0xEF, 0xBD, 0x93, 0xEF, 0xBD, + 0x94, 0xEF, 0xBD, 0x95, 0xEF, 0xBD, 0x96, 0xEF, + 0xBD, 0x97, 0xEF, 0xBD, 0x98, 0xEF, 0xBD, 0x99, + 0xEF, 0xBD, 0x9A, 0xF0, 0x90, 0x90, 0xA8, 0xF0, + 
0x90, 0x90, 0xA9, 0xF0, 0x90, 0x90, 0xAA, 0xF0, + 0x90, 0x90, 0xAB, 0xF0, 0x90, 0x90, 0xAC, 0xF0, + 0x90, 0x90, 0xAD, 0xF0, 0x90, 0x90, 0xAE, 0xF0, + 0x90, 0x90, 0xAF, 0xF0, 0x90, 0x90, 0xB0, 0xF0, + 0x90, 0x90, 0xB1, 0xF0, 0x90, 0x90, 0xB2, 0xF0, + 0x90, 0x90, 0xB3, 0xF0, 0x90, 0x90, 0xB4, 0xF0, + 0x90, 0x90, 0xB5, 0xF0, 0x90, 0x90, 0xB6, 0xF0, + 0x90, 0x90, 0xB7, 0xF0, 0x90, 0x90, 0xB8, 0xF0, + 0x90, 0x90, 0xB9, 0xF0, 0x90, 0x90, 0xBA, 0xF0, + 0x90, 0x90, 0xBB, 0xF0, 0x90, 0x90, 0xBC, 0xF0, + 0x90, 0x90, 0xBD, 0xF0, 0x90, 0x90, 0xBE, 0xF0, + 0x90, 0x90, 0xBF, 0xF0, 0x90, 0x91, 0x80, 0xF0, + 0x90, 0x91, 0x81, 0xF0, 0x90, 0x91, 0x82, 0xF0, + 0x90, 0x91, 0x83, 0xF0, 0x90, 0x91, 0x84, 0xF0, + 0x90, 0x91, 0x85, 0xF0, 0x90, 0x91, 0x86, 0xF0, + 0x90, 0x91, 0x87, 0xF0, 0x90, 0x91, 0x88, 0xF0, + 0x90, 0x91, 0x89, 0xF0, 0x90, 0x91, 0x8A, 0xF0, + 0x90, 0x91, 0x8B, 0xF0, 0x90, 0x91, 0x8C, 0xF0, + 0x90, 0x91, 0x8D, 0xF0, 0x90, 0x91, 0x8E, 0xF0, + 0x90, 0x91, 0x8F, + }, +}; + +static const u8_displacement_t u8_toupper_b3_tbl[2][5][256] = { + { + { /* Third byte table 0. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 }, + { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 226 }, + { 6, 288 }, { 7, 338 }, { 8, 364 }, { N_, 0 }, + { N_, 0 }, { 9, 376 }, { 10, 378 }, { 11, 416 }, + { 12, 486 }, { 13, 518 }, { 14, 614 }, { 15, 670 }, + { 16, 724 }, { 17, 
740 }, { 18, 802 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. */ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 19, 816 }, { 20, 912 }, { 21, 1008 }, { 22, 1092 }, + { 23, 1179 }, { 24, 1269 }, { 25, 1365 }, { 26, 1448 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 27, 1469 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 28, 1517 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 29, 1595 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 30, 1673 }, { 31, 1769 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, + { + { /* Third byte table 0. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { 0, 0 }, { 1, 2 }, + { 2, 64 }, { 3, 125 }, { 4, 188 }, { 5, 230 }, + { 6, 292 }, { 7, 344 }, { 8, 388 }, { N_, 0 }, + { N_, 0 }, { 9, 404 }, { 10, 412 }, { 11, 450 }, + { 12, 524 }, { 13, 556 }, { 14, 652 }, { 15, 708 }, + { 16, 772 }, { 17, 792 }, { 18, 854 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 1. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 19, 868 }, { N_, 0 }, { N_, 0 }, + { 20, 871 }, { 21, 967 }, { 22, 1063 }, { 23, 1147 }, + { 24, 1234 }, { 25, 1324 }, { 26, 1420 }, { 27, 1503 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 2. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 28, 1524 }, { 29, 1575 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { 30, 1578 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 31, 1656 }, { 32, 1704 }, { 33, 1816 }, { 34, 1912 }, + { 35, 1966 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 3. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { 36, 2080 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + { /* Third byte table 4. 
*/ + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { 37, 2158 }, { 38, 2254 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + { N_, 0 }, { N_, 0 }, { N_, 0 }, { N_, 0 }, + }, + }, +}; + +static const uchar_t u8_toupper_b4_tbl[2][39][257] = { + { + { /* Fourth byte table 0. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, + }, + { /* Fourth byte table 1. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 49, 49, 51, 51, 53, 53, + 55, 55, 55, 57, 57, 59, 59, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, + }, + { /* Fourth byte table 3. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 58, 58, 60, 60, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 4. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 2, 4, 4, + 4, 6, 6, 6, 6, 8, 8, 8, + 8, 8, 8, 10, 10, 10, 12, 12, + 12, 12, 14, 14, 14, 14, 14, 16, + 16, 16, 18, 18, 20, 20, 22, 22, + 22, 24, 24, 24, 24, 24, 26, 26, + 26, 28, 28, 28, 28, 30, 30, 32, + 32, 32, 34, 34, 34, 34, 36, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 4, + 4, 6, 8, 8, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 48, 50, 52, 52, 54, 54, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 6. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 32, 32, 34, 34, 36, 36, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 50, 50, 50, + 50, + }, + { /* Fourth byte table 7. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 2, 4, 4, 6, + 8, 8, 10, 10, 12, 12, 12, 12, + 12, 14, 14, 14, 16, 16, 16, 16, + 16, 18, 20, 20, 20, 20, 20, 20, + 22, 22, 22, 24, 24, 24, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, + 26, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 4, 4, 4, 4, + 4, 6, 6, 8, 10, 10, 10, 10, + 10, 10, 10, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, 12, 12, 12, 12, 12, 12, 12, + 12, + }, + { /* Fourth byte table 9. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, + }, + { /* Fourth byte table 10. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 6, + 8, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 30, 32, 34, 34, 34, 34, 36, 38, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 52, 52, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 64, 66, 68, 68, 68, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, 70, 70, 70, 70, 70, 70, 70, + 70, + }, + { /* Fourth byte table 12. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 13. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 64, 66, 66, 68, 68, 70, 70, + 72, 72, 74, 74, 76, 76, 78, 78, + 80, 80, 82, 82, 84, 84, 86, 86, + 88, 88, 90, 90, 92, 92, 94, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 15. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 2, 4, 4, 6, + 6, 8, 8, 10, 10, 12, 12, 14, + 14, 14, 16, 16, 18, 18, 20, 20, + 22, 22, 24, 24, 26, 26, 28, 28, + 30, 30, 32, 32, 34, 34, 36, 36, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 52, 52, + 52, 52, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 16. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 18. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 19. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 21. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 33, 33, 33, 33, 36, 36, 36, 36, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 22. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 24. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 57, 60, 63, 66, 69, 72, 75, + 78, 81, 84, 87, 90, 93, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 25. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 78, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 6, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 12, 15, 15, 15, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 27. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 28. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 30. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 31. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 33. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 34. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, + }, + }, + { + { /* Fourth byte table 0. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, + }, + { /* Fourth byte table 1. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 2. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 49, 49, 51, 51, 53, 53, + 55, 55, 55, 57, 57, 59, 59, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, 61, 61, 61, 61, 61, 61, 61, + 61, + }, + { /* Fourth byte table 3. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 4, 4, 6, 6, 8, + 8, 10, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 58, 58, 60, 60, 62, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, 63, 63, 63, 63, 63, 63, 63, + 63, + }, + { /* Fourth byte table 4. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 4, 4, 6, 6, + 6, 8, 8, 8, 8, 10, 10, 10, + 10, 10, 10, 12, 12, 12, 14, 14, + 14, 14, 16, 18, 18, 18, 18, 20, + 20, 20, 22, 22, 24, 24, 26, 26, + 26, 28, 28, 28, 28, 28, 30, 30, + 30, 32, 32, 32, 32, 34, 34, 36, + 36, 36, 38, 38, 38, 38, 40, 40, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, + }, + { /* Fourth byte table 5. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 4, + 4, 6, 8, 8, 10, 12, 12, 14, + 14, 16, 16, 18, 18, 20, 20, 22, + 22, 24, 24, 26, 26, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 48, 50, 52, 52, 54, 54, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 6. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 32, 32, 34, 34, 36, 36, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 50, 50, + 50, 50, 50, 50, 50, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, 52, 52, 52, 52, 52, 52, 52, + 52, + }, + { /* Fourth byte table 7. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 2, 2, 2, 2, + 4, 4, 6, 6, 8, 8, 10, 10, + 12, 12, 12, 12, 14, 16, 16, 18, + 20, 20, 22, 22, 24, 24, 24, 24, + 24, 26, 26, 26, 28, 28, 28, 28, + 28, 30, 32, 32, 35, 35, 35, 35, + 37, 37, 37, 39, 39, 39, 41, 41, + 41, 41, 41, 41, 41, 41, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, 44, 44, 44, 44, 44, 44, 44, + 44, + }, + { /* Fourth byte table 8. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 2, 2, 4, 4, 4, 4, + 4, 6, 8, 10, 12, 14, 14, 14, + 14, 14, 14, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, 16, 16, 16, 16, 16, 16, 16, + 16, + }, + { /* Fourth byte table 9. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 4, 6, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, 8, 8, 8, 8, 8, 8, 8, + 8, + }, + { /* Fourth byte table 10. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 2, 4, 6, + 8, 8, 10, 12, 14, 16, 18, 20, + 22, 24, 26, 28, 30, 32, 34, 36, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, 38, 38, 38, 38, 38, 38, 38, + 38, + }, + { /* Fourth byte table 11. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 30, 32, 34, 34, 34, 34, 36, 38, + 38, 38, 40, 40, 42, 42, 44, 44, + 46, 46, 48, 48, 50, 50, 52, 52, + 54, 54, 56, 56, 58, 58, 60, 60, + 62, 64, 66, 68, 68, 68, 70, 70, + 70, 72, 72, 72, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, 74, 74, 74, 74, 74, 74, 74, + 74, + }, + { /* Fourth byte table 12. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, 32, 32, 32, 32, 32, 32, 32, + 32, + }, + { /* Fourth byte table 13. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 16, 18, 20, 22, 24, 26, 28, 30, + 32, 34, 36, 38, 40, 42, 44, 46, + 48, 50, 52, 54, 56, 58, 60, 62, + 64, 64, 66, 66, 68, 68, 70, 70, + 72, 72, 74, 74, 76, 76, 78, 78, + 80, 80, 82, 82, 84, 84, 86, 86, + 88, 88, 90, 90, 92, 92, 94, 94, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 14. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, 56, 56, 56, 56, 56, 56, 56, + 56, + }, + { /* Fourth byte table 15. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 2, 2, 4, 4, 6, + 6, 8, 8, 10, 10, 12, 12, 14, + 16, 16, 18, 18, 20, 20, 22, 22, + 24, 24, 26, 26, 28, 28, 30, 30, + 32, 32, 34, 34, 36, 36, 38, 38, + 40, 40, 42, 42, 44, 44, 46, 46, + 48, 48, 50, 50, 52, 52, 54, 54, + 56, 56, 58, 58, 60, 60, 62, 62, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + { /* Fourth byte table 16. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 2, 4, 4, 6, 6, + 8, 8, 10, 10, 12, 12, 14, 14, + 16, 16, 18, 18, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, 20, 20, 20, 20, 20, 20, 20, + 20, + }, + { /* Fourth byte table 17. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 2, 4, 6, 8, 10, 12, + 14, 16, 18, 20, 22, 24, 26, 28, + 30, 32, 34, 36, 38, 40, 42, 44, + 46, 48, 50, 52, 54, 56, 58, 60, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, 62, 62, 62, 62, 62, 62, 62, + 62, + }, + { /* Fourth byte table 18. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 2, 4, 6, 8, 10, 12, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, 14, 14, 14, 14, 14, 14, 14, + 14, + }, + { /* Fourth byte table 19. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 20. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 21. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 22. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 33, 33, 33, 33, 36, 36, 36, 36, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, + 84, + }, + { /* Fourth byte table 23. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, 87, 87, 87, 87, 87, 87, 87, + 87, + }, + { /* Fourth byte table 24. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, + 42, 45, 48, 51, 54, 57, 60, 63, + 66, 66, 66, 66, 66, 66, 66, 66, + 66, 69, 72, 75, 78, 81, 84, 87, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, 90, 90, 90, 90, 90, 90, 90, + 90, + }, + { /* Fourth byte table 25. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 21, 21, 24, 24, 27, 27, + 30, 30, 30, 30, 30, 30, 30, 30, + 30, 33, 36, 39, 42, 45, 48, 51, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 57, 60, 63, 66, 69, 72, 75, + 78, 81, 84, 87, 90, 93, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 26. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 24, 24, 24, 24, 24, 24, 24, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 72, 72, 72, 72, 72, 72, 72, + 72, 75, 78, 78, 81, 81, 81, 81, + 81, 81, 81, 81, 81, 81, 81, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, 83, 83, 83, 83, 83, 83, 83, + 83, + }, + { /* Fourth byte table 27. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 6, 9, 9, 9, 9, 9, 9, + 9, 9, 9, 9, 9, 9, 9, 9, + 9, 12, 15, 15, 15, 15, 18, 18, + 18, 18, 18, 18, 18, 18, 18, 18, + 18, 18, 18, 18, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, 21, 21, 21, 21, 21, 21, 21, + 21, + }, + { /* Fourth byte table 28. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 6, 9, 12, 15, 18, 21, 24, + 27, 30, 33, 36, 39, 42, 45, 48, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, 51, 51, 51, 51, 51, 51, 51, + 51, + }, + { /* Fourth byte table 29. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, + 3, + }, + { /* Fourth byte table 30. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 31. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, 48, 48, 48, 48, 48, 48, 48, + 48, + }, + { /* Fourth byte table 32. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 93, 93, 96, 96, 96, 96, 98, 100, + 100, 103, 103, 106, 106, 109, 109, 109, + 109, 109, 109, 109, 109, 109, 109, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, 112, 112, 112, 112, 112, 112, 112, + 112, + }, + { /* Fourth byte table 33. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 57, 57, + 60, 60, 63, 63, 66, 66, 69, 69, + 72, 72, 75, 75, 78, 78, 81, 81, + 84, 84, 87, 87, 90, 90, 93, 93, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 34. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 3, 6, 6, 9, 9, + 12, 12, 15, 15, 18, 18, 21, 21, + 24, 24, 27, 27, 30, 30, 33, 33, + 36, 36, 39, 39, 42, 42, 45, 45, + 48, 48, 51, 51, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, 54, 54, 54, 54, 54, 54, 54, + 54, + }, + { /* Fourth byte table 35. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 3, 6, 9, 12, 15, 18, 21, + 24, 27, 30, 33, 36, 39, 42, 45, + 48, 51, 54, 57, 60, 63, 66, 69, + 72, 75, 78, 81, 84, 87, 90, 93, + 96, 99, 102, 105, 108, 111, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, 114, 114, 114, 114, 114, 114, 114, + 114, + }, + { /* Fourth byte table 36. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 3, 6, 9, 12, 15, 18, + 21, 24, 27, 30, 33, 36, 39, 42, + 45, 48, 51, 54, 57, 60, 63, 66, + 69, 72, 75, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, 78, 78, 78, 78, 78, 78, 78, + 78, + }, + { /* Fourth byte table 37. 
*/ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 68, 72, 76, 80, 84, 88, 92, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, 96, 96, 96, 96, 96, 96, 96, + 96, + }, + { /* Fourth byte table 38. */ + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 4, 8, 12, 16, 20, 24, 28, + 32, 36, 40, 44, 48, 52, 56, 60, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, 64, 64, 64, 64, 64, 64, 64, + 64, + }, + }, +}; + +static const uchar_t u8_toupper_final_tbl[2][2318] = { + { + 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82, + 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86, + 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, + 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, + 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, + 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96, + 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B, + 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8, + 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86, + 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E, + 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96, + 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E, + 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6, + 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE, + 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4, + 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5, + 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5, + 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5, + 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5, + 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5, + 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5, + 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5, + 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5, + 0xBB, 0xC5, 0xBD, 0x53, 0xC6, 0x82, 0xC6, 0x84, + 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, 0xC7, 0xB6, + 0xC6, 0x98, 0xC8, 0xA0, 0xC6, 0xA0, 0xC6, 0xA2, + 0xC6, 0xA4, 0xC6, 0xA7, 0xC6, 0xAC, 0xC6, 0xAF, + 0xC6, 0xB3, 0xC6, 0xB5, 0xC6, 0xB8, 0xC6, 0xBC, + 0xC7, 0xB7, 0xC7, 0x84, 0xC7, 0x84, 0xC7, 0x87, + 0xC7, 0x87, 0xC7, 0x8A, 0xC7, 0x8A, 0xC7, 0x8D, + 0xC7, 0x8F, 0xC7, 0x91, 0xC7, 0x93, 0xC7, 0x95, + 
0xC7, 0x97, 0xC7, 0x99, 0xC7, 0x9B, 0xC6, 0x8E, + 0xC7, 0x9E, 0xC7, 0xA0, 0xC7, 0xA2, 0xC7, 0xA4, + 0xC7, 0xA6, 0xC7, 0xA8, 0xC7, 0xAA, 0xC7, 0xAC, + 0xC7, 0xAE, 0xC7, 0xB1, 0xC7, 0xB1, 0xC7, 0xB4, + 0xC7, 0xB8, 0xC7, 0xBA, 0xC7, 0xBC, 0xC7, 0xBE, + 0xC8, 0x80, 0xC8, 0x82, 0xC8, 0x84, 0xC8, 0x86, + 0xC8, 0x88, 0xC8, 0x8A, 0xC8, 0x8C, 0xC8, 0x8E, + 0xC8, 0x90, 0xC8, 0x92, 0xC8, 0x94, 0xC8, 0x96, + 0xC8, 0x98, 0xC8, 0x9A, 0xC8, 0x9C, 0xC8, 0x9E, + 0xC8, 0xA2, 0xC8, 0xA4, 0xC8, 0xA6, 0xC8, 0xA8, + 0xC8, 0xAA, 0xC8, 0xAC, 0xC8, 0xAE, 0xC8, 0xB0, + 0xC8, 0xB2, 0xC6, 0x81, 0xC6, 0x86, 0xC6, 0x89, + 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, 0xC6, 0x93, + 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, 0xC6, 0x9C, + 0xC6, 0x9D, 0xC6, 0x9F, 0xC6, 0xA6, 0xC6, 0xA9, + 0xC6, 0xAE, 0xC6, 0xB1, 0xC6, 0xB2, 0xC6, 0xB7, + 0xCE, 0x99, 0xCE, 0x86, 0xCE, 0x88, 0xCE, 0x89, + 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92, 0xCE, 0x93, + 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, 0xCE, 0x97, + 0xCE, 0x98, 0xCE, 0x99, 0xCE, 0x9A, 0xCE, 0x9B, + 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, 0xCE, 0x9F, + 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0xA3, + 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, 0xCE, 0xA7, + 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, 0xCE, 0xAB, + 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, 0xCE, 0x92, + 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, 0xCF, 0x98, + 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, 0xCF, 0xA0, + 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, 0xCF, 0xA8, + 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, 0xCE, 0x9A, + 0xCE, 0xA1, 0xCE, 0xA3, 0xCE, 0x95, 0xD0, 0x90, + 0xD0, 0x91, 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, + 0xD0, 0x95, 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, + 0xD0, 0x99, 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, + 0xD0, 0x9D, 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, + 0xD0, 0xA1, 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, + 0xD0, 0xA5, 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, + 0xD0, 0xA9, 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, + 0xD0, 0xAD, 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, + 0xD0, 0x81, 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, + 0xD0, 0x85, 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, + 0xD0, 0x89, 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, + 0xD0, 0x8D, 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, + 0xD1, 0xA2, 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, + 0xD1, 0xAA, 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, + 0xD1, 0xB2, 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, + 0xD1, 0xBA, 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, + 0xD2, 0x8A, 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, + 0xD2, 0x92, 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, + 0xD2, 0x9A, 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, + 0xD2, 0xA2, 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, + 0xD2, 0xAA, 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, + 0xD2, 0xB2, 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, + 0xD2, 0xBA, 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, + 0xD3, 0x83, 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, + 0xD3, 0x8B, 0xD3, 0x8D, 0xD3, 0x90, 0xD3, 0x92, + 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A, + 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2, + 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA, + 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2, + 0xD3, 0xB4, 0xD3, 0xB8, 0xD4, 0x80, 0xD4, 0x82, + 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A, + 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0xB1, 0xD4, 0xB2, + 0xD4, 0xB3, 0xD4, 0xB4, 0xD4, 0xB5, 0xD4, 0xB6, + 0xD4, 0xB7, 0xD4, 0xB8, 0xD4, 0xB9, 0xD4, 0xBA, + 0xD4, 0xBB, 0xD4, 0xBC, 0xD4, 0xBD, 0xD4, 0xBE, + 0xD4, 0xBF, 0xD5, 0x80, 0xD5, 0x81, 0xD5, 0x82, + 0xD5, 0x83, 0xD5, 0x84, 0xD5, 0x85, 0xD5, 0x86, + 0xD5, 0x87, 0xD5, 0x88, 0xD5, 0x89, 0xD5, 0x8A, + 0xD5, 0x8B, 0xD5, 0x8C, 0xD5, 0x8D, 0xD5, 0x8E, + 0xD5, 0x8F, 0xD5, 0x90, 0xD5, 0x91, 0xD5, 0x92, + 0xD5, 0x93, 0xD5, 0x94, 0xD5, 0x95, 0xD5, 0x96, + 
0xE1, 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, + 0x84, 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, + 0xB8, 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, + 0xE1, 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, + 0x94, 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, + 0xB8, 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, + 0xE1, 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, + 0xA4, 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, + 0xB8, 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, + 0xE1, 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, + 0xB4, 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, + 0xB8, 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, + 0xE1, 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, + 0x84, 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, + 0xB9, 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, + 0xE1, 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, + 0x94, 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 0xE1, + 0xB9, 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, + 0xE1, 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, + 0xA4, 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, + 0xB9, 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, + 0xE1, 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, + 0xB4, 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, + 0xB9, 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, + 0xE1, 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, + 0x84, 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, + 0xBA, 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, + 0xE1, 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, + 0x94, 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, + 0xBA, 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, + 0xE1, 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, + 0xAC, 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, + 0xBA, 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, + 0xE1, 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, + 0xBC, 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, + 0xBB, 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, + 0xE1, 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, + 0x8C, 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, + 0xBB, 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, + 0xE1, 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, + 0x9C, 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, + 0xBB, 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, + 0xE1, 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, + 0xAC, 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, + 0xBB, 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, + 0xE1, 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, + 0x89, 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, + 0xBC, 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, + 0xE1, 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, + 0x99, 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, + 0xBC, 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, + 0xE1, 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, + 0xAB, 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, + 0xBC, 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, + 0xE1, 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, + 0xBB, 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, + 0xBC, 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, + 0xE1, 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, + 0x8B, 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, + 0xBD, 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, + 0xE1, 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, + 0xA9, 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, + 0xBD, 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, + 0xE1, 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, + 0xBB, 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, + 0xBF, 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, + 0xE1, 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, + 0xB9, 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, + 0xBF, 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, + 0xE1, 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, + 0x8B, 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, + 
0xBE, 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, + 0xE1, 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, + 0x9B, 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, + 0xBE, 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, + 0xE1, 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, + 0xAB, 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, + 0xBE, 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, + 0xE1, 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, + 0xE1, 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, + 0x99, 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, + 0xBF, 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x85, 0xA0, + 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, 0xE2, 0x85, + 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, 0xA5, 0xE2, + 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, 0x85, 0xA8, + 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, 0xE2, 0x85, + 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, 0xAD, 0xE2, + 0x85, 0xAE, 0xE2, 0x85, 0xAF, 0xE2, 0x92, 0xB6, + 0xE2, 0x92, 0xB7, 0xE2, 0x92, 0xB8, 0xE2, 0x92, + 0xB9, 0xE2, 0x92, 0xBA, 0xE2, 0x92, 0xBB, 0xE2, + 0x92, 0xBC, 0xE2, 0x92, 0xBD, 0xE2, 0x92, 0xBE, + 0xE2, 0x92, 0xBF, 0xE2, 0x93, 0x80, 0xE2, 0x93, + 0x81, 0xE2, 0x93, 0x82, 0xE2, 0x93, 0x83, 0xE2, + 0x93, 0x84, 0xE2, 0x93, 0x85, 0xE2, 0x93, 0x86, + 0xE2, 0x93, 0x87, 0xE2, 0x93, 0x88, 0xE2, 0x93, + 0x89, 0xE2, 0x93, 0x8A, 0xE2, 0x93, 0x8B, 0xE2, + 0x93, 0x8C, 0xE2, 0x93, 0x8D, 0xE2, 0x93, 0x8E, + 0xE2, 0x93, 0x8F, 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, + 0xA2, 0xEF, 0xBC, 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, + 0xBC, 0xA5, 0xEF, 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, + 0xEF, 0xBC, 0xA8, 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, + 0xAA, 0xEF, 0xBC, 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, + 0xBC, 0xAD, 0xEF, 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, + 0xEF, 0xBC, 0xB0, 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, + 0xB2, 0xEF, 0xBC, 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, + 0xBC, 0xB5, 0xEF, 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, + 0xEF, 0xBC, 0xB8, 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, + 0xBA, 0xF0, 0x90, 0x90, 0x80, 0xF0, 0x90, 0x90, + 0x81, 0xF0, 0x90, 0x90, 0x82, 0xF0, 0x90, 0x90, + 0x83, 0xF0, 0x90, 0x90, 0x84, 0xF0, 0x90, 0x90, + 0x85, 0xF0, 0x90, 0x90, 0x86, 0xF0, 0x90, 0x90, + 0x87, 0xF0, 0x90, 0x90, 0x88, 0xF0, 0x90, 0x90, + 0x89, 0xF0, 0x90, 0x90, 0x8A, 0xF0, 0x90, 0x90, + 0x8B, 0xF0, 0x90, 0x90, 0x8C, 0xF0, 0x90, 0x90, + 0x8D, 0xF0, 0x90, 0x90, 0x8E, 0xF0, 0x90, 0x90, + 0x8F, 0xF0, 0x90, 0x90, 0x90, 0xF0, 0x90, 0x90, + 0x91, 0xF0, 0x90, 0x90, 0x92, 0xF0, 0x90, 0x90, + 0x93, 0xF0, 0x90, 0x90, 0x94, 0xF0, 0x90, 0x90, + 0x95, 0xF0, 0x90, 0x90, 0x96, 0xF0, 0x90, 0x90, + 0x97, 0xF0, 0x90, 0x90, 0x98, 0xF0, 0x90, 0x90, + 0x99, 0xF0, 0x90, 0x90, 0x9A, 0xF0, 0x90, 0x90, + 0x9B, 0xF0, 0x90, 0x90, 0x9C, 0xF0, 0x90, 0x90, + 0x9D, 0xF0, 0x90, 0x90, 0x9E, 0xF0, 0x90, 0x90, + 0x9F, 0xF0, 0x90, 0x90, 0xA0, 0xF0, 0x90, 0x90, + 0xA1, 0xF0, 0x90, 0x90, 0xA2, 0xF0, 0x90, 0x90, + 0xA3, 0xF0, 0x90, 0x90, 0xA4, 0xF0, 0x90, 0x90, + 0xA5, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 
0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, + }, + { + 0xCE, 0x9C, 0xC3, 0x80, 0xC3, 0x81, 0xC3, 0x82, + 0xC3, 0x83, 0xC3, 0x84, 0xC3, 0x85, 0xC3, 0x86, + 0xC3, 0x87, 0xC3, 0x88, 0xC3, 0x89, 0xC3, 0x8A, + 0xC3, 0x8B, 0xC3, 0x8C, 0xC3, 0x8D, 0xC3, 0x8E, + 0xC3, 0x8F, 0xC3, 0x90, 0xC3, 0x91, 0xC3, 0x92, + 0xC3, 0x93, 0xC3, 0x94, 0xC3, 0x95, 0xC3, 0x96, + 0xC3, 0x98, 0xC3, 0x99, 0xC3, 0x9A, 0xC3, 0x9B, + 0xC3, 0x9C, 0xC3, 0x9D, 0xC3, 0x9E, 0xC5, 0xB8, + 0xC4, 0x80, 0xC4, 0x82, 0xC4, 0x84, 0xC4, 0x86, + 0xC4, 0x88, 0xC4, 0x8A, 0xC4, 0x8C, 0xC4, 0x8E, + 0xC4, 0x90, 0xC4, 0x92, 0xC4, 0x94, 0xC4, 0x96, + 0xC4, 0x98, 0xC4, 0x9A, 0xC4, 0x9C, 0xC4, 0x9E, + 0xC4, 0xA0, 0xC4, 0xA2, 0xC4, 0xA4, 0xC4, 0xA6, + 0xC4, 0xA8, 0xC4, 0xAA, 0xC4, 0xAC, 0xC4, 0xAE, + 0x49, 0xC4, 0xB2, 0xC4, 0xB4, 0xC4, 0xB6, 0xC4, + 0xB9, 0xC4, 0xBB, 0xC4, 0xBD, 0xC4, 0xBF, 0xC5, + 0x81, 0xC5, 0x83, 0xC5, 0x85, 0xC5, 0x87, 0xC5, + 0x8A, 0xC5, 0x8C, 0xC5, 0x8E, 0xC5, 0x90, 0xC5, + 0x92, 0xC5, 0x94, 0xC5, 0x96, 0xC5, 0x98, 0xC5, + 0x9A, 0xC5, 0x9C, 0xC5, 0x9E, 0xC5, 0xA0, 0xC5, + 0xA2, 0xC5, 0xA4, 0xC5, 0xA6, 0xC5, 0xA8, 0xC5, + 0xAA, 0xC5, 0xAC, 0xC5, 0xAE, 0xC5, 0xB0, 0xC5, + 0xB2, 0xC5, 0xB4, 0xC5, 0xB6, 0xC5, 0xB9, 0xC5, + 0xBB, 0xC5, 0xBD, 0x53, 0xC9, 0x83, 0xC6, 0x82, + 0xC6, 0x84, 0xC6, 0x87, 0xC6, 0x8B, 0xC6, 0x91, + 0xC7, 0xB6, 0xC6, 0x98, 0xC8, 0xBD, 0xC8, 0xA0, + 0xC6, 0xA0, 0xC6, 0xA2, 0xC6, 0xA4, 0xC6, 0xA7, + 0xC6, 0xAC, 0xC6, 0xAF, 0xC6, 0xB3, 0xC6, 0xB5, + 0xC6, 0xB8, 0xC6, 0xBC, 0xC7, 0xB7, 0xC7, 0x84, + 0xC7, 0x84, 0xC7, 0x87, 0xC7, 0x87, 0xC7, 0x8A, + 0xC7, 0x8A, 0xC7, 0x8D, 0xC7, 0x8F, 0xC7, 0x91, + 0xC7, 0x93, 0xC7, 0x95, 0xC7, 0x97, 0xC7, 0x99, + 0xC7, 0x9B, 0xC6, 0x8E, 0xC7, 0x9E, 0xC7, 0xA0, + 0xC7, 0xA2, 0xC7, 0xA4, 0xC7, 0xA6, 0xC7, 0xA8, + 0xC7, 0xAA, 0xC7, 0xAC, 0xC7, 0xAE, 0xC7, 0xB1, + 0xC7, 0xB1, 0xC7, 0xB4, 0xC7, 0xB8, 0xC7, 0xBA, + 0xC7, 0xBC, 0xC7, 0xBE, 0xC8, 0x80, 0xC8, 0x82, + 0xC8, 0x84, 0xC8, 0x86, 0xC8, 0x88, 0xC8, 0x8A, + 0xC8, 0x8C, 0xC8, 0x8E, 0xC8, 0x90, 0xC8, 0x92, + 0xC8, 0x94, 0xC8, 0x96, 0xC8, 0x98, 0xC8, 0x9A, + 0xC8, 0x9C, 0xC8, 0x9E, 0xC8, 0xA2, 0xC8, 0xA4, + 0xC8, 0xA6, 0xC8, 0xA8, 0xC8, 0xAA, 0xC8, 0xAC, + 0xC8, 0xAE, 0xC8, 0xB0, 0xC8, 0xB2, 0xC8, 0xBB, + 0xC9, 0x81, 0xC9, 0x86, 0xC9, 0x88, 0xC9, 0x8A, + 0xC9, 0x8C, 0xC9, 0x8E, 0xC6, 0x81, 0xC6, 0x86, + 0xC6, 0x89, 0xC6, 0x8A, 0xC6, 0x8F, 0xC6, 0x90, + 0xC6, 0x93, 0xC6, 0x94, 0xC6, 0x97, 0xC6, 0x96, + 0xE2, 0xB1, 0xA2, 0xC6, 0x9C, 0xC6, 0x9D, 0xC6, + 0x9F, 0xE2, 0xB1, 0xA4, 0xC6, 0xA6, 0xC6, 0xA9, + 0xC6, 0xAE, 0xC9, 0x84, 0xC6, 0xB1, 0xC6, 0xB2, + 0xC9, 0x85, 0xC6, 0xB7, 0xCE, 0x99, 0xCF, 0xBD, + 0xCF, 0xBE, 0xCF, 0xBF, 0xCE, 0x86, 0xCE, 0x88, + 0xCE, 0x89, 0xCE, 0x8A, 0xCE, 0x91, 0xCE, 0x92, + 0xCE, 0x93, 0xCE, 0x94, 0xCE, 0x95, 0xCE, 0x96, + 0xCE, 0x97, 0xCE, 0x98, 0xCE, 0x99, 
0xCE, 0x9A, + 0xCE, 0x9B, 0xCE, 0x9C, 0xCE, 0x9D, 0xCE, 0x9E, + 0xCE, 0x9F, 0xCE, 0xA0, 0xCE, 0xA1, 0xCE, 0xA3, + 0xCE, 0xA3, 0xCE, 0xA4, 0xCE, 0xA5, 0xCE, 0xA6, + 0xCE, 0xA7, 0xCE, 0xA8, 0xCE, 0xA9, 0xCE, 0xAA, + 0xCE, 0xAB, 0xCE, 0x8C, 0xCE, 0x8E, 0xCE, 0x8F, + 0xCE, 0x92, 0xCE, 0x98, 0xCE, 0xA6, 0xCE, 0xA0, + 0xCF, 0x98, 0xCF, 0x9A, 0xCF, 0x9C, 0xCF, 0x9E, + 0xCF, 0xA0, 0xCF, 0xA2, 0xCF, 0xA4, 0xCF, 0xA6, + 0xCF, 0xA8, 0xCF, 0xAA, 0xCF, 0xAC, 0xCF, 0xAE, + 0xCE, 0x9A, 0xCE, 0xA1, 0xCF, 0xB9, 0xCE, 0x95, + 0xCF, 0xB7, 0xCF, 0xBA, 0xD0, 0x90, 0xD0, 0x91, + 0xD0, 0x92, 0xD0, 0x93, 0xD0, 0x94, 0xD0, 0x95, + 0xD0, 0x96, 0xD0, 0x97, 0xD0, 0x98, 0xD0, 0x99, + 0xD0, 0x9A, 0xD0, 0x9B, 0xD0, 0x9C, 0xD0, 0x9D, + 0xD0, 0x9E, 0xD0, 0x9F, 0xD0, 0xA0, 0xD0, 0xA1, + 0xD0, 0xA2, 0xD0, 0xA3, 0xD0, 0xA4, 0xD0, 0xA5, + 0xD0, 0xA6, 0xD0, 0xA7, 0xD0, 0xA8, 0xD0, 0xA9, + 0xD0, 0xAA, 0xD0, 0xAB, 0xD0, 0xAC, 0xD0, 0xAD, + 0xD0, 0xAE, 0xD0, 0xAF, 0xD0, 0x80, 0xD0, 0x81, + 0xD0, 0x82, 0xD0, 0x83, 0xD0, 0x84, 0xD0, 0x85, + 0xD0, 0x86, 0xD0, 0x87, 0xD0, 0x88, 0xD0, 0x89, + 0xD0, 0x8A, 0xD0, 0x8B, 0xD0, 0x8C, 0xD0, 0x8D, + 0xD0, 0x8E, 0xD0, 0x8F, 0xD1, 0xA0, 0xD1, 0xA2, + 0xD1, 0xA4, 0xD1, 0xA6, 0xD1, 0xA8, 0xD1, 0xAA, + 0xD1, 0xAC, 0xD1, 0xAE, 0xD1, 0xB0, 0xD1, 0xB2, + 0xD1, 0xB4, 0xD1, 0xB6, 0xD1, 0xB8, 0xD1, 0xBA, + 0xD1, 0xBC, 0xD1, 0xBE, 0xD2, 0x80, 0xD2, 0x8A, + 0xD2, 0x8C, 0xD2, 0x8E, 0xD2, 0x90, 0xD2, 0x92, + 0xD2, 0x94, 0xD2, 0x96, 0xD2, 0x98, 0xD2, 0x9A, + 0xD2, 0x9C, 0xD2, 0x9E, 0xD2, 0xA0, 0xD2, 0xA2, + 0xD2, 0xA4, 0xD2, 0xA6, 0xD2, 0xA8, 0xD2, 0xAA, + 0xD2, 0xAC, 0xD2, 0xAE, 0xD2, 0xB0, 0xD2, 0xB2, + 0xD2, 0xB4, 0xD2, 0xB6, 0xD2, 0xB8, 0xD2, 0xBA, + 0xD2, 0xBC, 0xD2, 0xBE, 0xD3, 0x81, 0xD3, 0x83, + 0xD3, 0x85, 0xD3, 0x87, 0xD3, 0x89, 0xD3, 0x8B, + 0xD3, 0x8D, 0xD3, 0x80, 0xD3, 0x90, 0xD3, 0x92, + 0xD3, 0x94, 0xD3, 0x96, 0xD3, 0x98, 0xD3, 0x9A, + 0xD3, 0x9C, 0xD3, 0x9E, 0xD3, 0xA0, 0xD3, 0xA2, + 0xD3, 0xA4, 0xD3, 0xA6, 0xD3, 0xA8, 0xD3, 0xAA, + 0xD3, 0xAC, 0xD3, 0xAE, 0xD3, 0xB0, 0xD3, 0xB2, + 0xD3, 0xB4, 0xD3, 0xB6, 0xD3, 0xB8, 0xD3, 0xBA, + 0xD3, 0xBC, 0xD3, 0xBE, 0xD4, 0x80, 0xD4, 0x82, + 0xD4, 0x84, 0xD4, 0x86, 0xD4, 0x88, 0xD4, 0x8A, + 0xD4, 0x8C, 0xD4, 0x8E, 0xD4, 0x90, 0xD4, 0x92, + 0xD4, 0xB1, 0xD4, 0xB2, 0xD4, 0xB3, 0xD4, 0xB4, + 0xD4, 0xB5, 0xD4, 0xB6, 0xD4, 0xB7, 0xD4, 0xB8, + 0xD4, 0xB9, 0xD4, 0xBA, 0xD4, 0xBB, 0xD4, 0xBC, + 0xD4, 0xBD, 0xD4, 0xBE, 0xD4, 0xBF, 0xD5, 0x80, + 0xD5, 0x81, 0xD5, 0x82, 0xD5, 0x83, 0xD5, 0x84, + 0xD5, 0x85, 0xD5, 0x86, 0xD5, 0x87, 0xD5, 0x88, + 0xD5, 0x89, 0xD5, 0x8A, 0xD5, 0x8B, 0xD5, 0x8C, + 0xD5, 0x8D, 0xD5, 0x8E, 0xD5, 0x8F, 0xD5, 0x90, + 0xD5, 0x91, 0xD5, 0x92, 0xD5, 0x93, 0xD5, 0x94, + 0xD5, 0x95, 0xD5, 0x96, 0xE2, 0xB1, 0xA3, 0xE1, + 0xB8, 0x80, 0xE1, 0xB8, 0x82, 0xE1, 0xB8, 0x84, + 0xE1, 0xB8, 0x86, 0xE1, 0xB8, 0x88, 0xE1, 0xB8, + 0x8A, 0xE1, 0xB8, 0x8C, 0xE1, 0xB8, 0x8E, 0xE1, + 0xB8, 0x90, 0xE1, 0xB8, 0x92, 0xE1, 0xB8, 0x94, + 0xE1, 0xB8, 0x96, 0xE1, 0xB8, 0x98, 0xE1, 0xB8, + 0x9A, 0xE1, 0xB8, 0x9C, 0xE1, 0xB8, 0x9E, 0xE1, + 0xB8, 0xA0, 0xE1, 0xB8, 0xA2, 0xE1, 0xB8, 0xA4, + 0xE1, 0xB8, 0xA6, 0xE1, 0xB8, 0xA8, 0xE1, 0xB8, + 0xAA, 0xE1, 0xB8, 0xAC, 0xE1, 0xB8, 0xAE, 0xE1, + 0xB8, 0xB0, 0xE1, 0xB8, 0xB2, 0xE1, 0xB8, 0xB4, + 0xE1, 0xB8, 0xB6, 0xE1, 0xB8, 0xB8, 0xE1, 0xB8, + 0xBA, 0xE1, 0xB8, 0xBC, 0xE1, 0xB8, 0xBE, 0xE1, + 0xB9, 0x80, 0xE1, 0xB9, 0x82, 0xE1, 0xB9, 0x84, + 0xE1, 0xB9, 0x86, 0xE1, 0xB9, 0x88, 0xE1, 0xB9, + 0x8A, 0xE1, 0xB9, 0x8C, 0xE1, 0xB9, 0x8E, 0xE1, + 0xB9, 0x90, 0xE1, 0xB9, 0x92, 0xE1, 0xB9, 0x94, + 0xE1, 0xB9, 0x96, 0xE1, 0xB9, 0x98, 
0xE1, 0xB9, + 0x9A, 0xE1, 0xB9, 0x9C, 0xE1, 0xB9, 0x9E, 0xE1, + 0xB9, 0xA0, 0xE1, 0xB9, 0xA2, 0xE1, 0xB9, 0xA4, + 0xE1, 0xB9, 0xA6, 0xE1, 0xB9, 0xA8, 0xE1, 0xB9, + 0xAA, 0xE1, 0xB9, 0xAC, 0xE1, 0xB9, 0xAE, 0xE1, + 0xB9, 0xB0, 0xE1, 0xB9, 0xB2, 0xE1, 0xB9, 0xB4, + 0xE1, 0xB9, 0xB6, 0xE1, 0xB9, 0xB8, 0xE1, 0xB9, + 0xBA, 0xE1, 0xB9, 0xBC, 0xE1, 0xB9, 0xBE, 0xE1, + 0xBA, 0x80, 0xE1, 0xBA, 0x82, 0xE1, 0xBA, 0x84, + 0xE1, 0xBA, 0x86, 0xE1, 0xBA, 0x88, 0xE1, 0xBA, + 0x8A, 0xE1, 0xBA, 0x8C, 0xE1, 0xBA, 0x8E, 0xE1, + 0xBA, 0x90, 0xE1, 0xBA, 0x92, 0xE1, 0xBA, 0x94, + 0xE1, 0xB9, 0xA0, 0xE1, 0xBA, 0xA0, 0xE1, 0xBA, + 0xA2, 0xE1, 0xBA, 0xA4, 0xE1, 0xBA, 0xA6, 0xE1, + 0xBA, 0xA8, 0xE1, 0xBA, 0xAA, 0xE1, 0xBA, 0xAC, + 0xE1, 0xBA, 0xAE, 0xE1, 0xBA, 0xB0, 0xE1, 0xBA, + 0xB2, 0xE1, 0xBA, 0xB4, 0xE1, 0xBA, 0xB6, 0xE1, + 0xBA, 0xB8, 0xE1, 0xBA, 0xBA, 0xE1, 0xBA, 0xBC, + 0xE1, 0xBA, 0xBE, 0xE1, 0xBB, 0x80, 0xE1, 0xBB, + 0x82, 0xE1, 0xBB, 0x84, 0xE1, 0xBB, 0x86, 0xE1, + 0xBB, 0x88, 0xE1, 0xBB, 0x8A, 0xE1, 0xBB, 0x8C, + 0xE1, 0xBB, 0x8E, 0xE1, 0xBB, 0x90, 0xE1, 0xBB, + 0x92, 0xE1, 0xBB, 0x94, 0xE1, 0xBB, 0x96, 0xE1, + 0xBB, 0x98, 0xE1, 0xBB, 0x9A, 0xE1, 0xBB, 0x9C, + 0xE1, 0xBB, 0x9E, 0xE1, 0xBB, 0xA0, 0xE1, 0xBB, + 0xA2, 0xE1, 0xBB, 0xA4, 0xE1, 0xBB, 0xA6, 0xE1, + 0xBB, 0xA8, 0xE1, 0xBB, 0xAA, 0xE1, 0xBB, 0xAC, + 0xE1, 0xBB, 0xAE, 0xE1, 0xBB, 0xB0, 0xE1, 0xBB, + 0xB2, 0xE1, 0xBB, 0xB4, 0xE1, 0xBB, 0xB6, 0xE1, + 0xBB, 0xB8, 0xE1, 0xBC, 0x88, 0xE1, 0xBC, 0x89, + 0xE1, 0xBC, 0x8A, 0xE1, 0xBC, 0x8B, 0xE1, 0xBC, + 0x8C, 0xE1, 0xBC, 0x8D, 0xE1, 0xBC, 0x8E, 0xE1, + 0xBC, 0x8F, 0xE1, 0xBC, 0x98, 0xE1, 0xBC, 0x99, + 0xE1, 0xBC, 0x9A, 0xE1, 0xBC, 0x9B, 0xE1, 0xBC, + 0x9C, 0xE1, 0xBC, 0x9D, 0xE1, 0xBC, 0xA8, 0xE1, + 0xBC, 0xA9, 0xE1, 0xBC, 0xAA, 0xE1, 0xBC, 0xAB, + 0xE1, 0xBC, 0xAC, 0xE1, 0xBC, 0xAD, 0xE1, 0xBC, + 0xAE, 0xE1, 0xBC, 0xAF, 0xE1, 0xBC, 0xB8, 0xE1, + 0xBC, 0xB9, 0xE1, 0xBC, 0xBA, 0xE1, 0xBC, 0xBB, + 0xE1, 0xBC, 0xBC, 0xE1, 0xBC, 0xBD, 0xE1, 0xBC, + 0xBE, 0xE1, 0xBC, 0xBF, 0xE1, 0xBD, 0x88, 0xE1, + 0xBD, 0x89, 0xE1, 0xBD, 0x8A, 0xE1, 0xBD, 0x8B, + 0xE1, 0xBD, 0x8C, 0xE1, 0xBD, 0x8D, 0xE1, 0xBD, + 0x99, 0xE1, 0xBD, 0x9B, 0xE1, 0xBD, 0x9D, 0xE1, + 0xBD, 0x9F, 0xE1, 0xBD, 0xA8, 0xE1, 0xBD, 0xA9, + 0xE1, 0xBD, 0xAA, 0xE1, 0xBD, 0xAB, 0xE1, 0xBD, + 0xAC, 0xE1, 0xBD, 0xAD, 0xE1, 0xBD, 0xAE, 0xE1, + 0xBD, 0xAF, 0xE1, 0xBE, 0xBA, 0xE1, 0xBE, 0xBB, + 0xE1, 0xBF, 0x88, 0xE1, 0xBF, 0x89, 0xE1, 0xBF, + 0x8A, 0xE1, 0xBF, 0x8B, 0xE1, 0xBF, 0x9A, 0xE1, + 0xBF, 0x9B, 0xE1, 0xBF, 0xB8, 0xE1, 0xBF, 0xB9, + 0xE1, 0xBF, 0xAA, 0xE1, 0xBF, 0xAB, 0xE1, 0xBF, + 0xBA, 0xE1, 0xBF, 0xBB, 0xE1, 0xBE, 0x88, 0xE1, + 0xBE, 0x89, 0xE1, 0xBE, 0x8A, 0xE1, 0xBE, 0x8B, + 0xE1, 0xBE, 0x8C, 0xE1, 0xBE, 0x8D, 0xE1, 0xBE, + 0x8E, 0xE1, 0xBE, 0x8F, 0xE1, 0xBE, 0x98, 0xE1, + 0xBE, 0x99, 0xE1, 0xBE, 0x9A, 0xE1, 0xBE, 0x9B, + 0xE1, 0xBE, 0x9C, 0xE1, 0xBE, 0x9D, 0xE1, 0xBE, + 0x9E, 0xE1, 0xBE, 0x9F, 0xE1, 0xBE, 0xA8, 0xE1, + 0xBE, 0xA9, 0xE1, 0xBE, 0xAA, 0xE1, 0xBE, 0xAB, + 0xE1, 0xBE, 0xAC, 0xE1, 0xBE, 0xAD, 0xE1, 0xBE, + 0xAE, 0xE1, 0xBE, 0xAF, 0xE1, 0xBE, 0xB8, 0xE1, + 0xBE, 0xB9, 0xE1, 0xBE, 0xBC, 0xCE, 0x99, 0xE1, + 0xBF, 0x8C, 0xE1, 0xBF, 0x98, 0xE1, 0xBF, 0x99, + 0xE1, 0xBF, 0xA8, 0xE1, 0xBF, 0xA9, 0xE1, 0xBF, + 0xAC, 0xE1, 0xBF, 0xBC, 0xE2, 0x84, 0xB2, 0xE2, + 0x85, 0xA0, 0xE2, 0x85, 0xA1, 0xE2, 0x85, 0xA2, + 0xE2, 0x85, 0xA3, 0xE2, 0x85, 0xA4, 0xE2, 0x85, + 0xA5, 0xE2, 0x85, 0xA6, 0xE2, 0x85, 0xA7, 0xE2, + 0x85, 0xA8, 0xE2, 0x85, 0xA9, 0xE2, 0x85, 0xAA, + 0xE2, 0x85, 0xAB, 0xE2, 0x85, 0xAC, 0xE2, 0x85, + 0xAD, 0xE2, 0x85, 0xAE, 0xE2, 0x85, 
0xAF, 0xE2, + 0x86, 0x83, 0xE2, 0x92, 0xB6, 0xE2, 0x92, 0xB7, + 0xE2, 0x92, 0xB8, 0xE2, 0x92, 0xB9, 0xE2, 0x92, + 0xBA, 0xE2, 0x92, 0xBB, 0xE2, 0x92, 0xBC, 0xE2, + 0x92, 0xBD, 0xE2, 0x92, 0xBE, 0xE2, 0x92, 0xBF, + 0xE2, 0x93, 0x80, 0xE2, 0x93, 0x81, 0xE2, 0x93, + 0x82, 0xE2, 0x93, 0x83, 0xE2, 0x93, 0x84, 0xE2, + 0x93, 0x85, 0xE2, 0x93, 0x86, 0xE2, 0x93, 0x87, + 0xE2, 0x93, 0x88, 0xE2, 0x93, 0x89, 0xE2, 0x93, + 0x8A, 0xE2, 0x93, 0x8B, 0xE2, 0x93, 0x8C, 0xE2, + 0x93, 0x8D, 0xE2, 0x93, 0x8E, 0xE2, 0x93, 0x8F, + 0xE2, 0xB0, 0x80, 0xE2, 0xB0, 0x81, 0xE2, 0xB0, + 0x82, 0xE2, 0xB0, 0x83, 0xE2, 0xB0, 0x84, 0xE2, + 0xB0, 0x85, 0xE2, 0xB0, 0x86, 0xE2, 0xB0, 0x87, + 0xE2, 0xB0, 0x88, 0xE2, 0xB0, 0x89, 0xE2, 0xB0, + 0x8A, 0xE2, 0xB0, 0x8B, 0xE2, 0xB0, 0x8C, 0xE2, + 0xB0, 0x8D, 0xE2, 0xB0, 0x8E, 0xE2, 0xB0, 0x8F, + 0xE2, 0xB0, 0x90, 0xE2, 0xB0, 0x91, 0xE2, 0xB0, + 0x92, 0xE2, 0xB0, 0x93, 0xE2, 0xB0, 0x94, 0xE2, + 0xB0, 0x95, 0xE2, 0xB0, 0x96, 0xE2, 0xB0, 0x97, + 0xE2, 0xB0, 0x98, 0xE2, 0xB0, 0x99, 0xE2, 0xB0, + 0x9A, 0xE2, 0xB0, 0x9B, 0xE2, 0xB0, 0x9C, 0xE2, + 0xB0, 0x9D, 0xE2, 0xB0, 0x9E, 0xE2, 0xB0, 0x9F, + 0xE2, 0xB0, 0xA0, 0xE2, 0xB0, 0xA1, 0xE2, 0xB0, + 0xA2, 0xE2, 0xB0, 0xA3, 0xE2, 0xB0, 0xA4, 0xE2, + 0xB0, 0xA5, 0xE2, 0xB0, 0xA6, 0xE2, 0xB0, 0xA7, + 0xE2, 0xB0, 0xA8, 0xE2, 0xB0, 0xA9, 0xE2, 0xB0, + 0xAA, 0xE2, 0xB0, 0xAB, 0xE2, 0xB0, 0xAC, 0xE2, + 0xB0, 0xAD, 0xE2, 0xB0, 0xAE, 0xE2, 0xB1, 0xA0, + 0xC8, 0xBA, 0xC8, 0xBE, 0xE2, 0xB1, 0xA7, 0xE2, + 0xB1, 0xA9, 0xE2, 0xB1, 0xAB, 0xE2, 0xB1, 0xB5, + 0xE2, 0xB2, 0x80, 0xE2, 0xB2, 0x82, 0xE2, 0xB2, + 0x84, 0xE2, 0xB2, 0x86, 0xE2, 0xB2, 0x88, 0xE2, + 0xB2, 0x8A, 0xE2, 0xB2, 0x8C, 0xE2, 0xB2, 0x8E, + 0xE2, 0xB2, 0x90, 0xE2, 0xB2, 0x92, 0xE2, 0xB2, + 0x94, 0xE2, 0xB2, 0x96, 0xE2, 0xB2, 0x98, 0xE2, + 0xB2, 0x9A, 0xE2, 0xB2, 0x9C, 0xE2, 0xB2, 0x9E, + 0xE2, 0xB2, 0xA0, 0xE2, 0xB2, 0xA2, 0xE2, 0xB2, + 0xA4, 0xE2, 0xB2, 0xA6, 0xE2, 0xB2, 0xA8, 0xE2, + 0xB2, 0xAA, 0xE2, 0xB2, 0xAC, 0xE2, 0xB2, 0xAE, + 0xE2, 0xB2, 0xB0, 0xE2, 0xB2, 0xB2, 0xE2, 0xB2, + 0xB4, 0xE2, 0xB2, 0xB6, 0xE2, 0xB2, 0xB8, 0xE2, + 0xB2, 0xBA, 0xE2, 0xB2, 0xBC, 0xE2, 0xB2, 0xBE, + 0xE2, 0xB3, 0x80, 0xE2, 0xB3, 0x82, 0xE2, 0xB3, + 0x84, 0xE2, 0xB3, 0x86, 0xE2, 0xB3, 0x88, 0xE2, + 0xB3, 0x8A, 0xE2, 0xB3, 0x8C, 0xE2, 0xB3, 0x8E, + 0xE2, 0xB3, 0x90, 0xE2, 0xB3, 0x92, 0xE2, 0xB3, + 0x94, 0xE2, 0xB3, 0x96, 0xE2, 0xB3, 0x98, 0xE2, + 0xB3, 0x9A, 0xE2, 0xB3, 0x9C, 0xE2, 0xB3, 0x9E, + 0xE2, 0xB3, 0xA0, 0xE2, 0xB3, 0xA2, 0xE1, 0x82, + 0xA0, 0xE1, 0x82, 0xA1, 0xE1, 0x82, 0xA2, 0xE1, + 0x82, 0xA3, 0xE1, 0x82, 0xA4, 0xE1, 0x82, 0xA5, + 0xE1, 0x82, 0xA6, 0xE1, 0x82, 0xA7, 0xE1, 0x82, + 0xA8, 0xE1, 0x82, 0xA9, 0xE1, 0x82, 0xAA, 0xE1, + 0x82, 0xAB, 0xE1, 0x82, 0xAC, 0xE1, 0x82, 0xAD, + 0xE1, 0x82, 0xAE, 0xE1, 0x82, 0xAF, 0xE1, 0x82, + 0xB0, 0xE1, 0x82, 0xB1, 0xE1, 0x82, 0xB2, 0xE1, + 0x82, 0xB3, 0xE1, 0x82, 0xB4, 0xE1, 0x82, 0xB5, + 0xE1, 0x82, 0xB6, 0xE1, 0x82, 0xB7, 0xE1, 0x82, + 0xB8, 0xE1, 0x82, 0xB9, 0xE1, 0x82, 0xBA, 0xE1, + 0x82, 0xBB, 0xE1, 0x82, 0xBC, 0xE1, 0x82, 0xBD, + 0xE1, 0x82, 0xBE, 0xE1, 0x82, 0xBF, 0xE1, 0x83, + 0x80, 0xE1, 0x83, 0x81, 0xE1, 0x83, 0x82, 0xE1, + 0x83, 0x83, 0xE1, 0x83, 0x84, 0xE1, 0x83, 0x85, + 0xEF, 0xBC, 0xA1, 0xEF, 0xBC, 0xA2, 0xEF, 0xBC, + 0xA3, 0xEF, 0xBC, 0xA4, 0xEF, 0xBC, 0xA5, 0xEF, + 0xBC, 0xA6, 0xEF, 0xBC, 0xA7, 0xEF, 0xBC, 0xA8, + 0xEF, 0xBC, 0xA9, 0xEF, 0xBC, 0xAA, 0xEF, 0xBC, + 0xAB, 0xEF, 0xBC, 0xAC, 0xEF, 0xBC, 0xAD, 0xEF, + 0xBC, 0xAE, 0xEF, 0xBC, 0xAF, 0xEF, 0xBC, 0xB0, + 0xEF, 0xBC, 0xB1, 0xEF, 0xBC, 0xB2, 0xEF, 0xBC, + 0xB3, 0xEF, 0xBC, 0xB4, 0xEF, 0xBC, 
0xB5, 0xEF, + 0xBC, 0xB6, 0xEF, 0xBC, 0xB7, 0xEF, 0xBC, 0xB8, + 0xEF, 0xBC, 0xB9, 0xEF, 0xBC, 0xBA, 0xF0, 0x90, + 0x90, 0x80, 0xF0, 0x90, 0x90, 0x81, 0xF0, 0x90, + 0x90, 0x82, 0xF0, 0x90, 0x90, 0x83, 0xF0, 0x90, + 0x90, 0x84, 0xF0, 0x90, 0x90, 0x85, 0xF0, 0x90, + 0x90, 0x86, 0xF0, 0x90, 0x90, 0x87, 0xF0, 0x90, + 0x90, 0x88, 0xF0, 0x90, 0x90, 0x89, 0xF0, 0x90, + 0x90, 0x8A, 0xF0, 0x90, 0x90, 0x8B, 0xF0, 0x90, + 0x90, 0x8C, 0xF0, 0x90, 0x90, 0x8D, 0xF0, 0x90, + 0x90, 0x8E, 0xF0, 0x90, 0x90, 0x8F, 0xF0, 0x90, + 0x90, 0x90, 0xF0, 0x90, 0x90, 0x91, 0xF0, 0x90, + 0x90, 0x92, 0xF0, 0x90, 0x90, 0x93, 0xF0, 0x90, + 0x90, 0x94, 0xF0, 0x90, 0x90, 0x95, 0xF0, 0x90, + 0x90, 0x96, 0xF0, 0x90, 0x90, 0x97, 0xF0, 0x90, + 0x90, 0x98, 0xF0, 0x90, 0x90, 0x99, 0xF0, 0x90, + 0x90, 0x9A, 0xF0, 0x90, 0x90, 0x9B, 0xF0, 0x90, + 0x90, 0x9C, 0xF0, 0x90, 0x90, 0x9D, 0xF0, 0x90, + 0x90, 0x9E, 0xF0, 0x90, 0x90, 0x9F, 0xF0, 0x90, + 0x90, 0xA0, 0xF0, 0x90, 0x90, 0xA1, 0xF0, 0x90, + 0x90, 0xA2, 0xF0, 0x90, 0x90, 0xA3, 0xF0, 0x90, + 0x90, 0xA4, 0xF0, 0x90, 0x90, 0xA5, 0xF0, 0x90, + 0x90, 0xA6, 0xF0, 0x90, 0x90, 0xA7, + }, +}; + +#undef N_ +#undef FIL_ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_U8_TEXTPREP_DATA_H */ diff --git a/include/sys/uberblock.h b/include/sys/uberblock.h new file mode 100644 index 000000000000..044e438387c0 --- /dev/null +++ b/include/sys/uberblock.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#ifndef _SYS_UBERBLOCK_H +#define _SYS_UBERBLOCK_H + +#include <sys/spa.h> +#include <sys/vdev.h> +#include <sys/zio.h> + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct uberblock uberblock_t; + +extern int uberblock_verify(uberblock_t *); +extern boolean_t uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, + uint64_t mmp_delay); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_H */ diff --git a/include/sys/uberblock_impl.h b/include/sys/uberblock_impl.h new file mode 100644 index 000000000000..91699e65131a --- /dev/null +++ b/include/sys/uberblock_impl.h @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. + */ + +#ifndef _SYS_UBERBLOCK_IMPL_H +#define _SYS_UBERBLOCK_IMPL_H + +#include <sys/uberblock.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The uberblock version is incremented whenever an incompatible on-disk + * format change is made to the SPA, DMU, or ZAP. + * + * Note: the first two fields should never be moved. When a storage pool + * is opened, the uberblock must be read off the disk before the version + * can be checked. If the ub_version field is moved, we may not detect + * version mismatch. If the ub_magic field is moved, applications that + * expect the magic number in the first word won't work. + */ +#define UBERBLOCK_MAGIC 0x00bab10c /* oo-ba-bloc! */ +#define UBERBLOCK_SHIFT 10 /* up to 1K */ +#define MMP_MAGIC 0xa11cea11 /* all-see-all */ + +#define MMP_INTERVAL_VALID_BIT 0x01 +#define MMP_SEQ_VALID_BIT 0x02 +#define MMP_FAIL_INT_VALID_BIT 0x04 + +#define MMP_VALID(ubp) (ubp->ub_magic == UBERBLOCK_MAGIC && \ + ubp->ub_mmp_magic == MMP_MAGIC) +#define MMP_INTERVAL_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_INTERVAL_VALID_BIT)) +#define MMP_SEQ_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_SEQ_VALID_BIT)) +#define MMP_FAIL_INT_VALID(ubp) (MMP_VALID(ubp) && (ubp->ub_mmp_config & \ + MMP_FAIL_INT_VALID_BIT)) + +#define MMP_INTERVAL(ubp) ((ubp->ub_mmp_config & 0x00000000FFFFFF00) \ + >> 8) +#define MMP_SEQ(ubp) ((ubp->ub_mmp_config & 0x0000FFFF00000000) \ + >> 32) +#define MMP_FAIL_INT(ubp) ((ubp->ub_mmp_config & 0xFFFF000000000000) \ + >> 48) + +#define MMP_INTERVAL_SET(write) \ + (((uint64_t)(write & 0xFFFFFF) << 8) | MMP_INTERVAL_VALID_BIT) + +#define MMP_SEQ_SET(seq) \ + (((uint64_t)(seq & 0xFFFF) << 32) | MMP_SEQ_VALID_BIT) + +#define MMP_FAIL_INT_SET(fail) \ + (((uint64_t)(fail & 0xFFFF) << 48) | MMP_FAIL_INT_VALID_BIT) + +struct uberblock { + uint64_t ub_magic; /* UBERBLOCK_MAGIC */ + uint64_t ub_version; /* SPA_VERSION */ + uint64_t ub_txg; /* txg of last sync */ + uint64_t ub_guid_sum; /* sum of all vdev guids */ + uint64_t ub_timestamp; /* UTC time of last sync */ + blkptr_t ub_rootbp; /* MOS objset_phys_t */ + + /* highest SPA_VERSION supported by software that wrote this txg */ + uint64_t ub_software_version; + + /* Maybe missing in uberblocks we read, but always written */ + uint64_t ub_mmp_magic; /* MMP_MAGIC */ + /* + * If ub_mmp_delay == 0 and ub_mmp_magic is valid, MMP is off. + * Otherwise, nanosec since last MMP write. + */ + uint64_t ub_mmp_delay; + + /* + * The ub_mmp_config contains the multihost write interval, multihost + * fail intervals, sequence number for sub-second granularity, and + * valid bit mask.
This layout is as follows: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * 0 | Fail Intervals| Seq | Write Interval (ms) | VALID | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * This allows a write_interval of (2^24/1000)s, over 4.5 hours + * + * VALID Bits: + * - 0x01 - Write Interval (ms) + * - 0x02 - Sequence number exists + * - 0x04 - Fail Intervals + * - 0xf8 - Reserved + */ + uint64_t ub_mmp_config; + + /* + * ub_checkpoint_txg indicates two things about the current uberblock: + * + * 1] If it is not zero then this uberblock is a checkpoint. If it is + * zero, then this uberblock is not a checkpoint. + * + * 2] On checkpointed uberblocks, the value of ub_checkpoint_txg is + * the ub_txg that the uberblock had at the time we moved it to + * the MOS config. + * + * The field is set when we checkpoint the uberblock and continues to + * hold that value even after we've rewound (unlike the ub_txg that + * is reset to a higher value). + * + * Besides checks used to determine whether we are reopening the + * pool from a checkpointed uberblock [see spa_ld_select_uberblock()], + * the value of the field is used to determine which ZIL blocks have + * been allocated according to the ms_sm when we are rewinding to a + * checkpoint. Specifically, if blk_birth > ub_checkpoint_txg, then + * the ZIL block is not allocated [see uses of spa_min_claim_txg()]. + */ + uint64_t ub_checkpoint_txg; +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UBERBLOCK_IMPL_H */ diff --git a/include/sys/uio_impl.h b/include/sys/uio_impl.h new file mode 100644 index 000000000000..cfef0b95dbb9 --- /dev/null +++ b/include/sys/uio_impl.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
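+ */

The ub_mmp_config packing documented in sys/uberblock_impl.h above is easiest to see in action. The following is a minimal userland sketch -- illustrative only, not part of this import -- which hard-codes the same shifts and masks used by the MMP_*_SET setters and the MMP_INTERVAL/MMP_SEQ/MMP_FAIL_INT accessors, to show the round trip:

#include <stdio.h>
#include <stdint.h>
#include <assert.h>

int
main(void)
{
	uint64_t config = 0;

	/* Pack: interval (ms) in bits 8-31, seq in 32-47, fail ints in 48-63. */
	config |= ((uint64_t)(1000 & 0xFFFFFF) << 8) | 0x01;	/* MMP_INTERVAL_SET */
	config |= ((uint64_t)(7 & 0xFFFF) << 32) | 0x02;	/* MMP_SEQ_SET */
	config |= ((uint64_t)(10 & 0xFFFF) << 48) | 0x04;	/* MMP_FAIL_INT_SET */

	/* Unpack with the matching masks from the header. */
	assert(((config & 0x00000000FFFFFF00ULL) >> 8) == 1000);
	assert(((config & 0x0000FFFF00000000ULL) >> 32) == 7);
	assert(((config & 0xFFFF000000000000ULL) >> 48) == 10);
	assert((config & 0x07) == 0x07);	/* all three valid bits set */

	printf("ub_mmp_config = 0x%016llx\n", (unsigned long long)config);
	return (0);
}
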
+ +#ifndef _SYS_UIO_IMPL_H +#define _SYS_UIO_IMPL_H + +#include <sys/uio.h> + +extern int uiomove(void *, size_t, enum uio_rw, uio_t *); +extern int uio_prefaultpages(ssize_t, uio_t *); +extern int uiocopy(void *, size_t, enum uio_rw, uio_t *, size_t *); +extern void uioskip(uio_t *, size_t); + +#endif /* _SYS_UIO_IMPL_H */ diff --git a/include/sys/unique.h b/include/sys/unique.h new file mode 100644 index 000000000000..d4ba32e5c642 --- /dev/null +++ b/include/sys/unique.h @@ -0,0 +1,57 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UNIQUE_H +#define _SYS_UNIQUE_H + +#include <sys/zfs_context.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* The number of significant bits in each unique value. */ +#define UNIQUE_BITS 56 + +void unique_init(void); +void unique_fini(void); + +/* + * Return a new unique value (which will not be uniquified against until + * it is unique_insert()-ed). + */ +uint64_t unique_create(void); + +/* Return a unique value, which equals the one passed in if possible. */ +uint64_t unique_insert(uint64_t value); + +/* Indicate that this value no longer needs to be uniquified against. */ +void unique_remove(uint64_t value); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UNIQUE_H */ diff --git a/include/sys/uuid.h b/include/sys/uuid.h new file mode 100644 index 000000000000..eab4622a6d9a --- /dev/null +++ b/include/sys/uuid.h @@ -0,0 +1,94 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_UUID_H +#define _SYS_UUID_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The copyright in this file is taken from the original Leach + * & Salz UUID specification, from which this implementation + * is derived. + */
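
An aside on the unique.h interface above: in the kernel, unique_insert() is backed by an AVL tree of live values, and its contract -- return the requested 56-bit value when it is free, otherwise the nearest free one -- can be modeled with a toy table. A minimal sketch, assuming a small linear table and treating 0 as reserved (both are assumptions of this illustration, not details of the real implementation):

#include <stdio.h>
#include <stdint.h>

#define	UNIQUE_BITS	56			/* as in sys/unique.h */
#define	UNIQUE_MASK	((1ULL << UNIQUE_BITS) - 1)

static uint64_t held[16];			/* toy stand-in for the AVL tree */
static int nheld;

static int
is_held(uint64_t v)
{
	for (int i = 0; i < nheld; i++)
		if (held[i] == v)
			return (1);
	return (0);
}

/* Return value if free, else bump to the next free 56-bit value. */
static uint64_t
toy_unique_insert(uint64_t value)
{
	while (value == 0 || is_held(value))
		value = (value + 1) & UNIQUE_MASK;
	held[nheld++] = value;
	return (value);
}

int
main(void)
{
	uint64_t a = toy_unique_insert(42);
	uint64_t b = toy_unique_insert(42);	/* collides, gets nudged to 43 */

	printf("%llu %llu\n", (unsigned long long)a, (unsigned long long)b);
	return (0);
}
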
+ */ + +/* + * Copyright (c) 1990- 1993, 1996 Open Software Foundation, Inc. + * Copyright (c) 1989 by Hewlett-Packard Company, Palo Alto, Ca. & + * Digital Equipment Corporation, Maynard, Mass. Copyright (c) 1998 + * Microsoft. To anyone who acknowledges that this file is provided + * "AS IS" without any express or implied warranty: permission to use, + * copy, modify, and distribute this file for any purpose is hereby + * granted without fee, provided that the above copyright notices and + * this notice appears in all source code copies, and that none of the + * names of Open Software Foundation, Inc., Hewlett-Packard Company, + * or Digital Equipment Corporation be used in advertising or + * publicity pertaining to distribution of the software without + * specific, written prior permission. Neither Open Software + * Foundation, Inc., Hewlett-Packard Company, Microsoft, nor Digital + * Equipment Corporation makes any representations about the + * suitability of this software for any purpose. + */ + +#include +#include + +typedef struct { + uint8_t nodeID[6]; +} uuid_node_t; + +/* + * The uuid type used throughout when referencing uuids themselves + */ +struct uuid { + uint32_t time_low; + uint16_t time_mid; + uint16_t time_hi_and_version; + uint8_t clock_seq_hi_and_reserved; + uint8_t clock_seq_low; + uint8_t node_addr[6]; +}; + +#define UUID_PRINTABLE_STRING_LENGTH 37 + +/* + * Convert a uuid to/from little-endian format + */ +#define UUID_LE_CONVERT(dest, src) \ +{ \ + (dest) = (src); \ + (dest).time_low = LE_32((dest).time_low); \ + (dest).time_mid = LE_16((dest).time_mid); \ + (dest).time_hi_and_version = LE_16((dest).time_hi_and_version); \ +} + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_UUID_H */ diff --git a/include/sys/vdev.h b/include/sys/vdev.h new file mode 100644 index 000000000000..797065fdd0a5 --- /dev/null +++ b/include/sys/vdev.h @@ -0,0 +1,201 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Datto Inc. All rights reserved. 
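To make the struct uuid layout above concrete, a userland sketch that swaps the three multi-byte time fields with UUID_LE_CONVERT and formats the result; UUID_PRINTABLE_STRING_LENGTH is 36 characters plus the terminating NUL. The snprintf usage is illustrative (needs <stdio.h>), not something this header provides.

	struct uuid u;	/* assume filled in elsewhere, e.g. read from a label */
	char buf[UUID_PRINTABLE_STRING_LENGTH];	/* 36 chars + NUL = 37 */

	UUID_LE_CONVERT(u, u);	/* swap time_low/time_mid/time_hi in place */
	(void) snprintf(buf, sizeof (buf),
	    "%08x-%04x-%04x-%02x%02x-%02x%02x%02x%02x%02x%02x",
	    u.time_low, u.time_mid, u.time_hi_and_version,
	    u.clock_seq_hi_and_reserved, u.clock_seq_low,
	    u.node_addr[0], u.node_addr[1], u.node_addr[2],
	    u.node_addr[3], u.node_addr[4], u.node_addr[5]);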
+ */ + +#ifndef _SYS_VDEV_H +#define _SYS_VDEV_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum vdev_dtl_type { + DTL_MISSING, /* 0% replication: no copies of the data */ + DTL_PARTIAL, /* less than 100% replication: some copies missing */ + DTL_SCRUB, /* unable to fully repair during scrub/resilver */ + DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ + DTL_TYPES +} vdev_dtl_type_t; + +extern int zfs_nocacheflush; + +extern void vdev_dbgmsg(vdev_t *vd, const char *fmt, ...); +extern void vdev_dbgmsg_print_tree(vdev_t *, int); +extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *); +extern int vdev_validate(vdev_t *); +extern int vdev_copy_path_strict(vdev_t *, vdev_t *); +extern void vdev_copy_path_relaxed(vdev_t *, vdev_t *); +extern void vdev_close(vdev_t *); +extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); +extern void vdev_reopen(vdev_t *); +extern int vdev_validate_aux(vdev_t *vd); +extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); +extern boolean_t vdev_is_concrete(vdev_t *vd); +extern boolean_t vdev_is_bootable(vdev_t *vd); +extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); +extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); +extern int vdev_count_leaves(spa_t *spa); +extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); +extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, + boolean_t scrub_done, boolean_t rebuild_done); +extern boolean_t vdev_dtl_required(vdev_t *vd); +extern boolean_t vdev_resilver_needed(vdev_t *vd, + uint64_t *minp, uint64_t *maxp); +extern void vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, + dmu_tx_t *tx); +extern uint64_t vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx); +extern void vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, + uint64_t size); +extern void spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev, + uint64_t offset, uint64_t size, dmu_tx_t *tx); +extern boolean_t vdev_replace_in_progress(vdev_t *vdev); + +extern void vdev_hold(vdev_t *); +extern void vdev_rele(vdev_t *); + +extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); +extern void vdev_metaslab_fini(vdev_t *vd); +extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_ashift_optimize(vdev_t *); +extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); +extern void vdev_deadman(vdev_t *vd, char *tag); +extern void vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs); + +extern void vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx); +extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); +extern void vdev_clear_stats(vdev_t *vd); +extern void vdev_stat_update(zio_t *zio, uint64_t psize); +extern void vdev_scan_stat_init(vdev_t *vd); +extern void vdev_propagate_state(vdev_t *vd); +extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, + vdev_aux_t aux); +extern boolean_t vdev_children_are_offline(vdev_t *vd); + +extern void vdev_space_update(vdev_t *vd, + int64_t alloc_delta, int64_t 
defer_delta, int64_t space_delta);
+
+extern int64_t vdev_deflated_space(vdev_t *vd, int64_t space);
+
+extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize);
+
+extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux);
+extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags,
+    vdev_state_t *);
+extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags);
+extern void vdev_clear(spa_t *spa, vdev_t *vd);
+
+extern boolean_t vdev_is_dead(vdev_t *vd);
+extern boolean_t vdev_readable(vdev_t *vd);
+extern boolean_t vdev_writeable(vdev_t *vd);
+extern boolean_t vdev_allocatable(vdev_t *vd);
+extern boolean_t vdev_accessible(vdev_t *vd, zio_t *zio);
+extern boolean_t vdev_is_spacemap_addressable(vdev_t *vd);
+
+extern void vdev_cache_init(vdev_t *vd);
+extern void vdev_cache_fini(vdev_t *vd);
+extern boolean_t vdev_cache_read(zio_t *zio);
+extern void vdev_cache_write(zio_t *zio);
+extern void vdev_cache_purge(vdev_t *vd);
+
+extern void vdev_queue_init(vdev_t *vd);
+extern void vdev_queue_fini(vdev_t *vd);
+extern zio_t *vdev_queue_io(zio_t *zio);
+extern void vdev_queue_io_done(zio_t *zio);
+extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority);
+
+extern int vdev_queue_length(vdev_t *vd);
+extern uint64_t vdev_queue_last_offset(vdev_t *vd);
+
+extern void vdev_config_dirty(vdev_t *vd);
+extern void vdev_config_clean(vdev_t *vd);
+extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg);
+
+extern void vdev_state_dirty(vdev_t *vd);
+extern void vdev_state_clean(vdev_t *vd);
+
+extern void vdev_defer_resilver(vdev_t *vd);
+extern boolean_t vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx);
+
+typedef enum vdev_config_flag {
+	VDEV_CONFIG_SPARE = 1 << 0,
+	VDEV_CONFIG_L2CACHE = 1 << 1,
+	VDEV_CONFIG_REMOVING = 1 << 2,
+	VDEV_CONFIG_MOS = 1 << 3,
+	VDEV_CONFIG_MISSING = 1 << 4
+} vdev_config_flag_t;
+
+extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config);
+extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd,
+    boolean_t getstats, vdev_config_flag_t flags);
+
+/*
+ * Label routines
+ */
+struct uberblock;
+extern uint64_t vdev_label_offset(uint64_t psize, int l, uint64_t offset);
+extern int vdev_label_number(uint64_t psize, uint64_t offset);
+extern nvlist_t *vdev_label_read_config(vdev_t *vd, uint64_t txg);
+extern void vdev_uberblock_load(vdev_t *, struct uberblock *, nvlist_t **);
+extern void vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv);
+extern void vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t
+    offset, uint64_t size, zio_done_func_t *done, void *priv, int flags);
+extern int vdev_label_read_bootenv(vdev_t *, nvlist_t *);
+extern int vdev_label_write_bootenv(vdev_t *, char *);
+
+typedef enum {
+	VDEV_LABEL_CREATE,	/* create/add a new device */
+	VDEV_LABEL_REPLACE,	/* replace an existing device */
+	VDEV_LABEL_SPARE,	/* add a new hot spare */
+	VDEV_LABEL_REMOVE,	/* remove an existing device */
+	VDEV_LABEL_L2CACHE,	/* add an L2ARC cache device */
+	VDEV_LABEL_SPLIT	/* generating new label for split-off dev */
+} vdev_labeltype_t;
+
+extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_H */
diff --git a/include/sys/vdev_disk.h b/include/sys/vdev_disk.h
new file mode 100644
index 000000000000..a7e19fbf0c4b
--- /dev/null
+++ b/include/sys/vdev_disk.h
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents
of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * LLNL-CODE-403049. + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_DISK_H +#define _SYS_VDEV_DISK_H + +/* + * Don't start the slice at the default block of 34; many storage + * devices will use a stripe width of 128k, other vendors prefer a 1m + * alignment. It is best to play it safe and ensure a 1m alignment + * given 512B blocks. When the block size is larger by a power of 2 + * we will still be 1m aligned. Some devices are sensitive to the + * partition ending alignment as well. + */ +#define NEW_START_BLOCK 2048 +#define PARTITION_END_ALIGNMENT 2048 + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ +#endif /* _SYS_VDEV_DISK_H */ diff --git a/include/sys/vdev_file.h b/include/sys/vdev_file.h new file mode 100644 index 000000000000..1514a44fcabb --- /dev/null +++ b/include/sys/vdev_file.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VDEV_FILE_H +#define _SYS_VDEV_FILE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_file { + zfs_file_t *vf_file; +} vdev_file_t; + +extern void vdev_file_init(void); +extern void vdev_file_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_FILE_H */ diff --git a/include/sys/vdev_impl.h b/include/sys/vdev_impl.h new file mode 100644 index 000000000000..90d607746013 --- /dev/null +++ b/include/sys/vdev_impl.h @@ -0,0 +1,620 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
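The 2048-sector constants above equal exactly 1 MiB with 512-byte sectors (2048 × 512 = 1,048,576), which is what keeps larger power-of-two block sizes aligned as well. A compile-time restatement, using C11 _Static_assert here purely for illustration (the codebase itself uses CTASSERT-style macros):

	_Static_assert(NEW_START_BLOCK * 512 == 1 << 20,
	    "slice starts on a 1 MiB boundary");
	_Static_assert(PARTITION_END_ALIGNMENT * 512 == 1 << 20,
	    "slice ends on a 1 MiB boundary");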
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + */ + +#ifndef _SYS_VDEV_IMPL_H +#define _SYS_VDEV_IMPL_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Virtual device descriptors. + * + * All storage pool operations go through the virtual device framework, + * which provides data replication and I/O scheduling. + */ + +/* + * Forward declarations that lots of things need. + */ +typedef struct vdev_queue vdev_queue_t; +typedef struct vdev_cache vdev_cache_t; +typedef struct vdev_cache_entry vdev_cache_entry_t; +struct abd; + +extern int zfs_vdev_queue_depth_pct; +extern int zfs_vdev_def_queue_depth; +extern uint32_t zfs_vdev_async_write_max_active; + +/* + * Virtual device operations + */ +typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, + uint64_t *ashift, uint64_t *pshift); +typedef void vdev_close_func_t(vdev_t *vd); +typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); +typedef void vdev_io_start_func_t(zio_t *zio); +typedef void vdev_io_done_func_t(zio_t *zio); +typedef void vdev_state_change_func_t(vdev_t *vd, int, int); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); +typedef void vdev_hold_func_t(vdev_t *vd); +typedef void vdev_rele_func_t(vdev_t *vd); + +typedef void vdev_remap_cb_t(uint64_t inner_offset, vdev_t *vd, + uint64_t offset, uint64_t size, void *arg); +typedef void vdev_remap_func_t(vdev_t *vd, uint64_t offset, uint64_t size, + vdev_remap_cb_t callback, void *arg); +/* + * Given a target vdev, translates the logical range "in" to the physical + * range "res" + */ +typedef void vdev_xlation_func_t(vdev_t *cvd, const range_seg64_t *in, + range_seg64_t *res); + +typedef const struct vdev_ops { + vdev_open_func_t *vdev_op_open; + vdev_close_func_t *vdev_op_close; + vdev_asize_func_t *vdev_op_asize; + vdev_io_start_func_t *vdev_op_io_start; + vdev_io_done_func_t *vdev_op_io_done; + vdev_state_change_func_t *vdev_op_state_change; + vdev_need_resilver_func_t *vdev_op_need_resilver; + vdev_hold_func_t *vdev_op_hold; + vdev_rele_func_t *vdev_op_rele; + vdev_remap_func_t *vdev_op_remap; + /* + * For translating ranges from non-leaf vdevs (e.g. raidz) to leaves. + * Used when initializing vdevs. Isn't used by leaf ops. 
+ */ + vdev_xlation_func_t *vdev_op_xlate; + char vdev_op_type[16]; + boolean_t vdev_op_leaf; +} vdev_ops_t; + +/* + * Virtual device properties + */ +struct vdev_cache_entry { + struct abd *ve_abd; + uint64_t ve_offset; + clock_t ve_lastused; + avl_node_t ve_offset_node; + avl_node_t ve_lastused_node; + uint32_t ve_hits; + uint16_t ve_missed_update; + zio_t *ve_fill_io; +}; + +struct vdev_cache { + avl_tree_t vc_offset_tree; + avl_tree_t vc_lastused_tree; + kmutex_t vc_lock; +}; + +typedef struct vdev_queue_class { + uint32_t vqc_active; + + /* + * Sorted by offset or timestamp, depending on if the queue is + * LBA-ordered vs FIFO. + */ + avl_tree_t vqc_queued_tree; +} vdev_queue_class_t; + +struct vdev_queue { + vdev_t *vq_vdev; + vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE]; + avl_tree_t vq_active_tree; + avl_tree_t vq_read_offset_tree; + avl_tree_t vq_write_offset_tree; + avl_tree_t vq_trim_offset_tree; + uint64_t vq_last_offset; + hrtime_t vq_io_complete_ts; /* time last i/o completed */ + hrtime_t vq_io_delta_ts; + zio_t vq_io_search; /* used as local for stack reduction */ + kmutex_t vq_lock; +}; + +typedef enum vdev_alloc_bias { + VDEV_BIAS_NONE, + VDEV_BIAS_LOG, /* dedicated to ZIL data (SLOG) */ + VDEV_BIAS_SPECIAL, /* dedicated to ddt, metadata, and small blks */ + VDEV_BIAS_DEDUP /* dedicated to dedup metadata */ +} vdev_alloc_bias_t; + + +/* + * On-disk indirect vdev state. + * + * An indirect vdev is described exclusively in the MOS config of a pool. + * The config for an indirect vdev includes several fields, which are + * accessed in memory by a vdev_indirect_config_t. + */ +typedef struct vdev_indirect_config { + /* + * Object (in MOS) which contains the indirect mapping. This object + * contains an array of vdev_indirect_mapping_entry_phys_t ordered by + * vimep_src. The bonus buffer for this object is a + * vdev_indirect_mapping_phys_t. This object is allocated when a vdev + * removal is initiated. + * + * Note that this object can be empty if none of the data on the vdev + * has been copied yet. + */ + uint64_t vic_mapping_object; + + /* + * Object (in MOS) which contains the birth times for the mapping + * entries. This object contains an array of + * vdev_indirect_birth_entry_phys_t sorted by vibe_offset. The bonus + * buffer for this object is a vdev_indirect_birth_phys_t. This object + * is allocated when a vdev removal is initiated. + * + * Note that this object can be empty if none of the vdev has yet been + * copied. + */ + uint64_t vic_births_object; + + /* + * This is the vdev ID which was removed previous to this vdev, or + * UINT64_MAX if there are no previously removed vdevs. + */ + uint64_t vic_prev_indirect_vdev; +} vdev_indirect_config_t; + +/* + * Virtual device descriptor + */ +struct vdev { + /* + * Common to all vdev types. + */ + uint64_t vdev_id; /* child number in vdev parent */ + uint64_t vdev_guid; /* unique ID for this vdev */ + uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_orig_guid; /* orig. guid prior to remove */ + uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_min_asize; /* min acceptable asize */ + uint64_t vdev_max_asize; /* max acceptable asize */ + uint64_t vdev_ashift; /* block alignment shift */ + + /* + * Logical block alignment shift + * + * The smallest sized/aligned I/O supported by the device. 
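To make the two ashift fields described above concrete (they are declared just below): a common 512e drive advertises 512-byte logical sectors but 4 KiB physical sectors, so

	/* Example values for a 512e drive; both fields are stored as shifts. */
	uint64_t lsize = 1ULL << vd->vdev_logical_ashift;	/* 1 << 9  = 512 B */
	uint64_t psize = 1ULL << vd->vdev_physical_ashift;	/* 1 << 12 = 4 KiB */
	/* I/O smaller than psize may cost a device-level read-modify-write. */

Stepping back to the vdev_ops_t table completed earlier in this hunk, a hypothetical leaf backend would populate it roughly as follows. Every "example" name is invented for this sketch; the real tables (vdev_disk_ops and friends) are declared near the end of this header and defined in the per-backend sources.

	static vdev_ops_t vdev_example_ops = {
		.vdev_op_open = vdev_example_open,		/* probe + size device */
		.vdev_op_close = vdev_example_close,
		.vdev_op_asize = vdev_default_asize,		/* common helper */
		.vdev_op_io_start = vdev_example_io_start,	/* issue the zio */
		.vdev_op_io_done = vdev_example_io_done,
		.vdev_op_state_change = NULL,
		.vdev_op_need_resilver = NULL,
		.vdev_op_hold = NULL,
		.vdev_op_rele = NULL,
		.vdev_op_remap = NULL,
		.vdev_op_xlate = NULL,				/* not used by leaf ops */
		.vdev_op_type = "example",			/* config "type" string */
		.vdev_op_leaf = B_TRUE				/* terminal vdev */
	};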
+ */ + uint64_t vdev_logical_ashift; + /* + * Physical block alignment shift + * + * The device supports logical I/Os with vdev_logical_ashift + * size/alignment, but optimum performance will be achieved by + * aligning/sizing requests to vdev_physical_ashift. Smaller + * requests may be inflated or incur device level read-modify-write + * operations. + * + * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). + */ + uint64_t vdev_physical_ashift; + uint64_t vdev_state; /* see VDEV_STATE_* #defines */ + uint64_t vdev_prevstate; /* used when reopening a vdev */ + vdev_ops_t *vdev_ops; /* vdev operations */ + spa_t *vdev_spa; /* spa for this vdev */ + void *vdev_tsd; /* type-specific data */ + vdev_t *vdev_top; /* top-level vdev */ + vdev_t *vdev_parent; /* parent vdev */ + vdev_t **vdev_child; /* array of children */ + uint64_t vdev_children; /* number of children */ + vdev_stat_t vdev_stat; /* virtual device statistics */ + vdev_stat_ex_t vdev_stat_ex; /* extended statistics */ + boolean_t vdev_expanding; /* expand the vdev? */ + boolean_t vdev_reopening; /* reopen in progress? */ + boolean_t vdev_nonrot; /* true if solid state */ + int vdev_open_error; /* error on last open */ + kthread_t *vdev_open_thread; /* thread opening children */ + uint64_t vdev_crtxg; /* txg when top-level was added */ + + /* + * Top-level vdev state. + */ + uint64_t vdev_ms_array; /* metaslab array object */ + uint64_t vdev_ms_shift; /* metaslab size shift */ + uint64_t vdev_ms_count; /* number of metaslabs */ + metaslab_group_t *vdev_mg; /* metaslab group */ + metaslab_t **vdev_ms; /* metaslab array */ + uint64_t vdev_pending_fastwrite; /* allocated fastwrites */ + txg_list_t vdev_ms_list; /* per-txg dirty metaslab lists */ + txg_list_t vdev_dtl_list; /* per-txg dirty DTL lists */ + txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ + boolean_t vdev_remove_wanted; /* async remove wanted? */ + boolean_t vdev_probe_wanted; /* async probe wanted? */ + list_node_t vdev_config_dirty_node; /* config dirty list */ + list_node_t vdev_state_dirty_node; /* state dirty list */ + uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ + uint64_t vdev_islog; /* is an intent log device */ + uint64_t vdev_removing; /* device is being removed? */ + boolean_t vdev_ishole; /* is a hole in the namespace */ + uint64_t vdev_top_zap; + vdev_alloc_bias_t vdev_alloc_bias; /* metaslab allocation bias */ + + /* pool checkpoint related */ + space_map_t *vdev_checkpoint_sm; /* contains reserved blocks */ + + /* Initialize related */ + boolean_t vdev_initialize_exit_wanted; + vdev_initializing_state_t vdev_initialize_state; + list_node_t vdev_initialize_node; + kthread_t *vdev_initialize_thread; + /* Protects vdev_initialize_thread and vdev_initialize_state. */ + kmutex_t vdev_initialize_lock; + kcondvar_t vdev_initialize_cv; + uint64_t vdev_initialize_offset[TXG_SIZE]; + uint64_t vdev_initialize_last_offset; + range_tree_t *vdev_initialize_tree; /* valid while initializing */ + uint64_t vdev_initialize_bytes_est; + uint64_t vdev_initialize_bytes_done; + uint64_t vdev_initialize_action_time; /* start and end time */ + + /* TRIM related */ + boolean_t vdev_trim_exit_wanted; + boolean_t vdev_autotrim_exit_wanted; + vdev_trim_state_t vdev_trim_state; + list_node_t vdev_trim_node; + kmutex_t vdev_autotrim_lock; + kcondvar_t vdev_autotrim_cv; + kthread_t *vdev_autotrim_thread; + /* Protects vdev_trim_thread and vdev_trim_state. 
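The lock/condvar/flag trio noted here (declared just below) follows the usual kernel-thread shutdown pattern. A simplified sketch of how such a trio is typically used; this is not the actual vdev_trim.c code:

	/* Requester: ask the worker to stop, then wait for it to exit. */
	mutex_enter(&vd->vdev_trim_lock);
	vd->vdev_trim_exit_wanted = B_TRUE;
	while (vd->vdev_trim_thread != NULL)
		cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock);
	mutex_exit(&vd->vdev_trim_lock);

	/* Worker: on exit, clear the thread pointer and wake waiters. */
	mutex_enter(&vd->vdev_trim_lock);
	vd->vdev_trim_thread = NULL;
	cv_broadcast(&vd->vdev_trim_cv);
	mutex_exit(&vd->vdev_trim_lock);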
*/ + kmutex_t vdev_trim_lock; + kcondvar_t vdev_trim_cv; + kthread_t *vdev_trim_thread; + uint64_t vdev_trim_offset[TXG_SIZE]; + uint64_t vdev_trim_last_offset; + uint64_t vdev_trim_bytes_est; + uint64_t vdev_trim_bytes_done; + uint64_t vdev_trim_rate; /* requested rate (bytes/sec) */ + uint64_t vdev_trim_partial; /* requested partial TRIM */ + uint64_t vdev_trim_secure; /* requested secure TRIM */ + uint64_t vdev_trim_action_time; /* start and end time */ + + /* Rebuild related */ + boolean_t vdev_rebuilding; + boolean_t vdev_rebuild_exit_wanted; + boolean_t vdev_rebuild_cancel_wanted; + boolean_t vdev_rebuild_reset_wanted; + kmutex_t vdev_rebuild_lock; + kcondvar_t vdev_rebuild_cv; + kthread_t *vdev_rebuild_thread; + vdev_rebuild_t vdev_rebuild_config; + + /* For limiting outstanding I/Os (initialize, TRIM, rebuild) */ + kmutex_t vdev_initialize_io_lock; + kcondvar_t vdev_initialize_io_cv; + uint64_t vdev_initialize_inflight; + kmutex_t vdev_trim_io_lock; + kcondvar_t vdev_trim_io_cv; + uint64_t vdev_trim_inflight[3]; + kmutex_t vdev_rebuild_io_lock; + kcondvar_t vdev_rebuild_io_cv; + uint64_t vdev_rebuild_inflight; + + /* + * Values stored in the config for an indirect or removing vdev. + */ + vdev_indirect_config_t vdev_indirect_config; + + /* + * The vdev_indirect_rwlock protects the vdev_indirect_mapping + * pointer from changing on indirect vdevs (when it is condensed). + * Note that removing (not yet indirect) vdevs have different + * access patterns (the mapping is not accessed from open context, + * e.g. from zio_read) and locking strategy (e.g. svr_lock). + */ + krwlock_t vdev_indirect_rwlock; + vdev_indirect_mapping_t *vdev_indirect_mapping; + vdev_indirect_births_t *vdev_indirect_births; + + /* + * In memory data structures used to manage the obsolete sm, for + * indirect or removing vdevs. + * + * The vdev_obsolete_segments is the in-core record of the segments + * that are no longer referenced anywhere in the pool (due to + * being freed or remapped and not referenced by any snapshots). + * During a sync, segments are added to vdev_obsolete_segments + * via vdev_indirect_mark_obsolete(); at the end of each sync + * pass, this is appended to vdev_obsolete_sm via + * vdev_indirect_sync_obsolete(). The vdev_obsolete_lock + * protects against concurrent modifications of vdev_obsolete_segments + * from multiple zio threads. + */ + kmutex_t vdev_obsolete_lock; + range_tree_t *vdev_obsolete_segments; + space_map_t *vdev_obsolete_sm; + + /* + * Protects the vdev_scan_io_queue field itself as well as the + * structure's contents (when present). + */ + kmutex_t vdev_scan_io_queue_lock; + struct dsl_scan_io_queue *vdev_scan_io_queue; + + /* + * Leaf vdev state. 
+ */ + range_tree_t *vdev_dtl[DTL_TYPES]; /* dirty time logs */ + space_map_t *vdev_dtl_sm; /* dirty time log space map */ + txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ + uint64_t vdev_dtl_object; /* DTL object */ + uint64_t vdev_psize; /* physical device capacity */ + uint64_t vdev_wholedisk; /* true if this is a whole disk */ + uint64_t vdev_offline; /* persistent offline state */ + uint64_t vdev_faulted; /* persistent faulted state */ + uint64_t vdev_degraded; /* persistent degraded state */ + uint64_t vdev_removed; /* persistent removed state */ + uint64_t vdev_resilver_txg; /* persistent resilvering state */ + uint64_t vdev_rebuild_txg; /* persistent rebuilding state */ + uint64_t vdev_nparity; /* number of parity devices for raidz */ + char *vdev_path; /* vdev path (if any) */ + char *vdev_devid; /* vdev devid (if any) */ + char *vdev_physpath; /* vdev device path (if any) */ + char *vdev_enc_sysfs_path; /* enclosure sysfs path */ + char *vdev_fru; /* physical FRU location */ + uint64_t vdev_not_present; /* not present during import */ + uint64_t vdev_unspare; /* unspare when resilvering done */ + boolean_t vdev_nowritecache; /* true if flushwritecache failed */ + boolean_t vdev_has_trim; /* TRIM is supported */ + boolean_t vdev_has_securetrim; /* secure TRIM is supported */ + boolean_t vdev_checkremove; /* temporary online test */ + boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_splitting; /* split or repair in progress */ + boolean_t vdev_delayed_close; /* delayed device close? */ + boolean_t vdev_tmpoffline; /* device taken offline temporarily? */ + boolean_t vdev_detached; /* device detached? */ + boolean_t vdev_cant_read; /* vdev is failing all reads */ + boolean_t vdev_cant_write; /* vdev is failing all writes */ + boolean_t vdev_isspare; /* was a hot spare */ + boolean_t vdev_isl2cache; /* was a l2cache device */ + boolean_t vdev_copy_uberblocks; /* post expand copy uberblocks */ + boolean_t vdev_resilver_deferred; /* resilver deferred */ + vdev_queue_t vdev_queue; /* I/O deadline schedule queue */ + vdev_cache_t vdev_cache; /* physical block cache */ + spa_aux_vdev_t *vdev_aux; /* for l2cache and spares vdevs */ + zio_t *vdev_probe_zio; /* root of current probe */ + vdev_aux_t vdev_label_aux; /* on-disk aux state */ + uint64_t vdev_leaf_zap; + hrtime_t vdev_mmp_pending; /* 0 if write finished */ + uint64_t vdev_mmp_kstat_id; /* to find kstat entry */ + uint64_t vdev_expansion_time; /* vdev's last expansion time */ + list_node_t vdev_leaf_node; /* leaf vdev list */ + + /* + * For DTrace to work in userland (libzpool) context, these fields must + * remain at the end of the structure. DTrace will use the kernel's + * CTF definition for 'struct vdev', and since the size of a kmutex_t is + * larger in userland, the offsets for the rest of the fields would be + * incorrect. + */ + kmutex_t vdev_dtl_lock; /* vdev_dtl_{map,resilver} */ + kmutex_t vdev_stat_lock; /* vdev_stat */ + kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ + + /* + * We rate limit ZIO delay and ZIO checksum events, since they + * can flood ZED with tons of events when a drive is acting up. 
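A sketch of how a consumer would consult one of these rate limiters before emitting an event. That zfs_ratelimit() returns zero to mean "suppressed" is an assumption about the ratelimit helper, and post_delay_event() is a made-up stand-in for the event-posting code:

	if (zfs_ratelimit(&vd->vdev_delay_rl) == 0)
		return;			/* over budget: drop this delay event */
	post_delay_event(vd);		/* hypothetical event-posting helper */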
+ */
+	zfs_ratelimit_t vdev_delay_rl;
+	zfs_ratelimit_t vdev_checksum_rl;
+};
+
+#define	VDEV_RAIDZ_MAXPARITY	3
+
+#define	VDEV_PAD_SIZE		(8 << 10)
+/* 2 padding areas (vl_pad1 and vl_be) to skip */
+#define	VDEV_SKIP_SIZE		(VDEV_PAD_SIZE * 2)
+#define	VDEV_PHYS_SIZE		(112 << 10)
+#define	VDEV_UBERBLOCK_RING	(128 << 10)
+
+/*
+ * MMP blocks occupy the last MMP_BLOCKS_PER_LABEL slots in the uberblock
+ * ring when MMP is enabled.
+ */
+#define	MMP_BLOCKS_PER_LABEL	1
+
+/* The largest uberblock we support is 8k. */
+#define	MAX_UBERBLOCK_SHIFT	(13)
+#define	VDEV_UBERBLOCK_SHIFT(vd)	\
+	MIN(MAX((vd)->vdev_top->vdev_ashift, UBERBLOCK_SHIFT),	\
+	    MAX_UBERBLOCK_SHIFT)
+#define	VDEV_UBERBLOCK_COUNT(vd)	\
+	(VDEV_UBERBLOCK_RING >> VDEV_UBERBLOCK_SHIFT(vd))
+#define	VDEV_UBERBLOCK_OFFSET(vd, n)	\
+	offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)])
+#define	VDEV_UBERBLOCK_SIZE(vd)		(1ULL << VDEV_UBERBLOCK_SHIFT(vd))
+
+typedef struct vdev_phys {
+	char		vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)];
+	zio_eck_t	vp_zbt;
+} vdev_phys_t;
+
+typedef enum vbe_vers {
+	/* The bootenv file is stored as ASCII text in the envblock */
+	VB_RAW = 0,
+
+	/*
+	 * The bootenv file is converted to an nvlist and then packed into the
+	 * envblock.
+	 */
+	VB_NVLIST = 1
+} vbe_vers_t;
+
+typedef struct vdev_boot_envblock {
+	uint64_t	vbe_version;
+	char		vbe_bootenv[VDEV_PAD_SIZE - sizeof (uint64_t) -
+			sizeof (zio_eck_t)];
+	zio_eck_t	vbe_zbt;
+} vdev_boot_envblock_t;
+
+CTASSERT_GLOBAL(sizeof (vdev_boot_envblock_t) == VDEV_PAD_SIZE);
+
+typedef struct vdev_label {
+	char		vl_pad1[VDEV_PAD_SIZE];			/*  8K */
+	vdev_boot_envblock_t	vl_be;				/*  8K */
+	vdev_phys_t	vl_vdev_phys;				/* 112K */
+	char		vl_uberblock[VDEV_UBERBLOCK_RING];	/* 128K */
+} vdev_label_t;							/* 256K total */
+
+/*
+ * vdev_dirty() flags
+ */
+#define	VDD_METASLAB	0x01
+#define	VDD_DTL		0x02
+
+/* Offset of embedded boot loader region on each label */
+#define	VDEV_BOOT_OFFSET	(2 * sizeof (vdev_label_t))
+/*
+ * Size of embedded boot loader region on each label.
+ * The total size of the first two labels plus the boot area is 4MB.
+ */
+#define	VDEV_BOOT_SIZE		(7ULL << 19)			/* 3.5M */
+
+/*
+ * Size of label regions at the start and end of each leaf device.
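Working the VDEV_UBERBLOCK_* macros above for a typical top-level vdev with ashift 12, and assuming UBERBLOCK_SHIFT is 10 (its value in uberblock_impl.h):

	/*
	 * VDEV_UBERBLOCK_SHIFT = MIN(MAX(12, 10), 13)  = 12
	 * VDEV_UBERBLOCK_SIZE  = 1ULL << 12            = 4 KiB per slot
	 * VDEV_UBERBLOCK_COUNT = (128 << 10) >> 12     = 32 slots in the ring
	 *
	 * With MMP enabled, the last MMP_BLOCKS_PER_LABEL (1) slot is used
	 * for MMP writes, leaving 31 slots for regular uberblocks.
	 */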
+ */ +#define VDEV_LABEL_START_SIZE (2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE) +#define VDEV_LABEL_END_SIZE (2 * sizeof (vdev_label_t)) +#define VDEV_LABELS 4 +#define VDEV_BEST_LABEL VDEV_LABELS + +#define VDEV_ALLOC_LOAD 0 +#define VDEV_ALLOC_ADD 1 +#define VDEV_ALLOC_SPARE 2 +#define VDEV_ALLOC_L2CACHE 3 +#define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 +#define VDEV_ALLOC_ATTACH 6 + +/* + * Allocate or free a vdev + */ +extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, + vdev_ops_t *ops); +extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, + vdev_t *parent, uint_t id, int alloctype); +extern void vdev_free(vdev_t *vd); + +/* + * Add or remove children and parents + */ +extern void vdev_add_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_remove_child(vdev_t *pvd, vdev_t *cvd); +extern void vdev_compact_children(vdev_t *pvd); +extern vdev_t *vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops); +extern void vdev_remove_parent(vdev_t *cvd); + +/* + * vdev sync load and sync + */ +extern boolean_t vdev_log_state_valid(vdev_t *vd); +extern int vdev_load(vdev_t *vd); +extern int vdev_dtl_load(vdev_t *vd); +extern void vdev_sync(vdev_t *vd, uint64_t txg); +extern void vdev_sync_done(vdev_t *vd, uint64_t txg); +extern void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg); +extern void vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg); + +/* + * Available vdev types. + */ +extern vdev_ops_t vdev_root_ops; +extern vdev_ops_t vdev_mirror_ops; +extern vdev_ops_t vdev_replacing_ops; +extern vdev_ops_t vdev_raidz_ops; +extern vdev_ops_t vdev_disk_ops; +extern vdev_ops_t vdev_file_ops; +extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_hole_ops; +extern vdev_ops_t vdev_spare_ops; +extern vdev_ops_t vdev_indirect_ops; + +/* + * Common size functions + */ +extern void vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, + range_seg64_t *out); +extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); +extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern void vdev_set_min_asize(vdev_t *vd); + +/* + * Global variables + */ +extern int zfs_vdev_standard_sm_blksz; +/* zdb uses this tunable, so it must be declared here to make lint happy. */ +extern int zfs_vdev_cache_size; + +/* + * Functions from vdev_indirect.c + */ +extern void vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx); +extern boolean_t vdev_indirect_should_condense(vdev_t *vd); +extern void spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx); +extern int vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj); +extern int vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise); + +/* + * Other miscellaneous functions + */ +int vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj); + +/* + * Vdev ashift optimization tunables + */ +extern uint64_t zfs_vdev_min_auto_ashift; +extern uint64_t zfs_vdev_max_auto_ashift; +int param_set_min_auto_ashift(ZFS_MODULE_PARAM_ARGS); +int param_set_max_auto_ashift(ZFS_MODULE_PARAM_ARGS); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_IMPL_H */ diff --git a/include/sys/vdev_indirect_births.h b/include/sys/vdev_indirect_births.h new file mode 100644 index 000000000000..987b14485d2b --- /dev/null +++ b/include/sys/vdev_indirect_births.h @@ -0,0 +1,80 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
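Restating the VDEV_LABEL_* macros defined earlier in this hunk as byte math (this is arithmetic only, not new policy):

	/*
	 * VDEV_LABEL_START_SIZE = 2 * sizeof (vdev_label_t) + VDEV_BOOT_SIZE
	 *                       = 2 * 256 KiB + 3.5 MiB = 4 MiB   (L0, L1, boot)
	 * VDEV_LABEL_END_SIZE   = 2 * sizeof (vdev_label_t)
	 *                       = 512 KiB                         (L2, L3)
	 */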
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_BIRTHS_H +#define _SYS_VDEV_INDIRECT_BIRTHS_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_birth_entry_phys { + uint64_t vibe_offset; + uint64_t vibe_phys_birth_txg; +} vdev_indirect_birth_entry_phys_t; + +typedef struct vdev_indirect_birth_phys { + uint64_t vib_count; /* count of v_i_b_entry_phys_t's */ +} vdev_indirect_birth_phys_t; + +typedef struct vdev_indirect_births { + uint64_t vib_object; + + /* + * Each entry indicates that everything up to but not including + * vibe_offset was copied in vibe_phys_birth_txg. Entries are sorted + * by increasing phys_birth, and also by increasing offset. See + * vdev_indirect_births_physbirth for usage. + */ + vdev_indirect_birth_entry_phys_t *vib_entries; + + objset_t *vib_objset; + + dmu_buf_t *vib_dbuf; + vdev_indirect_birth_phys_t *vib_phys; +} vdev_indirect_births_t; + +extern vdev_indirect_births_t *vdev_indirect_births_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_births_close(vdev_indirect_births_t *vib); +extern boolean_t vdev_indirect_births_is_open(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_births_free(objset_t *os, uint64_t object, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_count(vdev_indirect_births_t *vib); +extern uint64_t vdev_indirect_births_object(vdev_indirect_births_t *vib); + +extern void vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t txg, dmu_tx_t *tx); + +extern uint64_t vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, + uint64_t offset, uint64_t asize); + +extern uint64_t vdev_indirect_births_last_entry_txg( + vdev_indirect_births_t *vib); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_BIRTHS_H */ diff --git a/include/sys/vdev_indirect_mapping.h b/include/sys/vdev_indirect_mapping.h new file mode 100644 index 000000000000..7e42c1019504 --- /dev/null +++ b/include/sys/vdev_indirect_mapping.h @@ -0,0 +1,141 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INDIRECT_MAPPING_H +#define _SYS_VDEV_INDIRECT_MAPPING_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct vdev_indirect_mapping_entry_phys { + /* + * Decode with DVA_MAPPING_* macros. + * Contains: + * the source offset (low 63 bits) + * the one-bit "mark", used for garbage collection (by zdb) + */ + uint64_t vimep_src; + + /* + * Note: the DVA's asize is 24 bits, and can thus store ranges + * up to 8GB. 
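The 8GB figure follows directly from the field widths, since asize is stored in 512-byte (SPA_MINBLOCKSHIFT) units:

	/* (1ULL << 24) sectors * (1ULL << 9) bytes/sector = 1ULL << 33 = 8 GiB */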
+ */ + dva_t vimep_dst; +} vdev_indirect_mapping_entry_phys_t; + +#define DVA_MAPPING_GET_SRC_OFFSET(vimep) \ + BF64_GET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0) +#define DVA_MAPPING_SET_SRC_OFFSET(vimep, x) \ + BF64_SET_SB((vimep)->vimep_src, 0, 63, SPA_MINBLOCKSHIFT, 0, x) + +typedef struct vdev_indirect_mapping_entry { + vdev_indirect_mapping_entry_phys_t vime_mapping; + uint32_t vime_obsolete_count; + list_node_t vime_node; +} vdev_indirect_mapping_entry_t; + +/* + * This is stored in the bonus buffer of the mapping object, see comment of + * vdev_indirect_config for more details. + */ +typedef struct vdev_indirect_mapping_phys { + uint64_t vimp_max_offset; + uint64_t vimp_bytes_mapped; + uint64_t vimp_num_entries; /* number of v_i_m_entry_phys_t's */ + + /* + * For each entry in the mapping object, this object contains an + * entry representing the number of bytes of that mapping entry + * that were no longer in use by the pool at the time this indirect + * vdev was last condensed. + */ + uint64_t vimp_counts_object; +} vdev_indirect_mapping_phys_t; + +#define VDEV_INDIRECT_MAPPING_SIZE_V0 (3 * sizeof (uint64_t)) + +typedef struct vdev_indirect_mapping { + uint64_t vim_object; + boolean_t vim_havecounts; + + /* + * An ordered array of all mapping entries, sorted by source offset. + * Note that vim_entries is needed during a removal (and contains + * mappings that have been synced to disk so far) to handle frees + * from the removing device. + */ + vdev_indirect_mapping_entry_phys_t *vim_entries; + + objset_t *vim_objset; + + dmu_buf_t *vim_dbuf; + vdev_indirect_mapping_phys_t *vim_phys; +} vdev_indirect_mapping_t; + +extern vdev_indirect_mapping_t *vdev_indirect_mapping_open(objset_t *os, + uint64_t object); +extern void vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx); +extern void vdev_indirect_mapping_free(objset_t *os, uint64_t obj, + dmu_tx_t *tx); + +extern uint64_t vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_bytes_mapped( + vdev_indirect_mapping_t *vim); +extern uint64_t vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim); + +/* + * Writes the given list of vdev_indirect_mapping_entry_t to the mapping + * then updates internal state. 
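Since vim_entries is sorted by source offset, the entry_for_offset() lookups declared just below amount to a binary search over source ranges. A sketch of the containment test involved; entry_covers is a hypothetical helper, and DVA_GET_ASIZE comes from the DVA macros in spa.h:

	static boolean_t
	entry_covers(vdev_indirect_mapping_entry_phys_t *vimep, uint64_t offset)
	{
		uint64_t src = DVA_MAPPING_GET_SRC_OFFSET(vimep);
		uint64_t asize = DVA_GET_ASIZE(&vimep->vimep_dst);

		return (offset >= src && offset < src + asize);
	}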
+ */ +extern void vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *vime_list, dmu_tx_t *tx); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern vdev_indirect_mapping_entry_phys_t * + vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset); + +extern uint32_t *vdev_indirect_mapping_load_obsolete_counts( + vdev_indirect_mapping_t *vim); +extern void vdev_indirect_mapping_load_obsolete_spacemap( + vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm); +extern void vdev_indirect_mapping_increment_obsolete_count( + vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t asize, uint32_t *counts); +extern void vdev_indirect_mapping_free_obsolete_counts( + vdev_indirect_mapping_t *vim, uint32_t *counts); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INDIRECT_MAPPING_H */ diff --git a/include/sys/vdev_initialize.h b/include/sys/vdev_initialize.h new file mode 100644 index 000000000000..81d39ebebcb2 --- /dev/null +++ b/include/sys/vdev_initialize.h @@ -0,0 +1,47 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_INITIALIZE_H +#define _SYS_VDEV_INITIALIZE_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern void vdev_initialize(vdev_t *vd); +extern void vdev_initialize_stop(vdev_t *vd, + vdev_initializing_state_t tgt_state, list_t *vd_list); +extern void vdev_initialize_stop_all(vdev_t *vd, + vdev_initializing_state_t tgt_state); +extern void vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list); +extern void vdev_initialize_restart(vdev_t *vd); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_INITIALIZE_H */ diff --git a/include/sys/vdev_raidz.h b/include/sys/vdev_raidz.h new file mode 100644 index 000000000000..0ce2b5ea1d67 --- /dev/null +++ b/include/sys/vdev_raidz.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Neskovic .
+ */
+
+#ifndef _SYS_VDEV_RAIDZ_H
+#define	_SYS_VDEV_RAIDZ_H
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct zio;
+struct raidz_map;
+#if !defined(_KERNEL)
+struct kernel_param {};
+#endif
+
+/*
+ * vdev_raidz interface
+ */
+struct raidz_map *vdev_raidz_map_alloc(struct zio *, uint64_t, uint64_t,
+    uint64_t);
+void vdev_raidz_map_free(struct raidz_map *);
+void vdev_raidz_generate_parity(struct raidz_map *);
+int vdev_raidz_reconstruct(struct raidz_map *, const int *, int);
+
+/*
+ * vdev_raidz_math interface
+ */
+void vdev_raidz_math_init(void);
+void vdev_raidz_math_fini(void);
+const struct raidz_impl_ops *vdev_raidz_math_get_ops(void);
+int vdev_raidz_math_generate(struct raidz_map *);
+int vdev_raidz_math_reconstruct(struct raidz_map *, const int *, const int *,
+    const int);
+int vdev_raidz_impl_set(const char *);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_VDEV_RAIDZ_H */
diff --git a/include/sys/vdev_raidz_impl.h b/include/sys/vdev_raidz_impl.h
new file mode 100644
index 000000000000..8492daedb6f8
--- /dev/null
+++ b/include/sys/vdev_raidz_impl.h
@@ -0,0 +1,375 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#ifndef _VDEV_RAIDZ_H
+#define	_VDEV_RAIDZ_H
+
+#include
+#include
+#include
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	CODE_P		(0U)
+#define	CODE_Q		(1U)
+#define	CODE_R		(2U)
+
+#define	PARITY_P	(1U)
+#define	PARITY_PQ	(2U)
+#define	PARITY_PQR	(3U)
+
+#define	TARGET_X	(0U)
+#define	TARGET_Y	(1U)
+#define	TARGET_Z	(2U)
+
+/*
+ * Parity generation methods indexes
+ */
+enum raidz_math_gen_op {
+	RAIDZ_GEN_P = 0,
+	RAIDZ_GEN_PQ,
+	RAIDZ_GEN_PQR,
+	RAIDZ_GEN_NUM = 3
+};
+/*
+ * Data reconstruction methods indexes
+ */
+enum raidz_rec_op {
+	RAIDZ_REC_P = 0,
+	RAIDZ_REC_Q,
+	RAIDZ_REC_R,
+	RAIDZ_REC_PQ,
+	RAIDZ_REC_PR,
+	RAIDZ_REC_QR,
+	RAIDZ_REC_PQR,
+	RAIDZ_REC_NUM = 7
+};
+
+extern const char *raidz_gen_name[RAIDZ_GEN_NUM];
+extern const char *raidz_rec_name[RAIDZ_REC_NUM];
+
+/*
+ * Methods used to define raidz implementation
+ *
+ * @raidz_gen_f	Parity generation function
+ *	@par1	pointer to raidz_map
+ * @raidz_rec_f	Data reconstruction function
+ *	@par1	pointer to raidz_map
+ *	@par2	array of reconstruction targets
+ * @will_work_f	Function returns TRUE if impl.
is supported on the system + * @init_impl_f Function is called once on init + * @fini_impl_f Function is called once on fini + */ +typedef void (*raidz_gen_f)(void *); +typedef int (*raidz_rec_f)(void *, const int *); +typedef boolean_t (*will_work_f)(void); +typedef void (*init_impl_f)(void); +typedef void (*fini_impl_f)(void); + +#define RAIDZ_IMPL_NAME_MAX (20) + +typedef struct raidz_impl_ops { + init_impl_f init; + fini_impl_f fini; + raidz_gen_f gen[RAIDZ_GEN_NUM]; /* Parity generate functions */ + raidz_rec_f rec[RAIDZ_REC_NUM]; /* Data reconstruction functions */ + will_work_f is_supported; /* Support check function */ + char name[RAIDZ_IMPL_NAME_MAX]; /* Name of the implementation */ +} raidz_impl_ops_t; + +typedef struct raidz_col { + uint64_t rc_devidx; /* child device index for I/O */ + uint64_t rc_offset; /* device offset */ + uint64_t rc_size; /* I/O size */ + abd_t *rc_abd; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ + int rc_error; /* I/O error for this device */ + uint8_t rc_tried; /* Did we attempt this I/O column? */ + uint8_t rc_skipped; /* Did we skip this I/O column? */ +} raidz_col_t; + +typedef struct raidz_map { + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ + uint64_t rm_bigcols; /* Number of oversized columns */ + uint64_t rm_asize; /* Actual total I/O size */ + uint64_t rm_missingdata; /* Count of missing data devices */ + uint64_t rm_missingparity; /* Count of missing parity devices */ + uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + abd_t *rm_abd_copy; /* rm_asize-buffer of copied data */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ + const raidz_impl_ops_t *rm_ops; /* RAIDZ math operations */ + raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ +} raidz_map_t; + +#define RAIDZ_ORIGINAL_IMPL (INT_MAX) + +extern const raidz_impl_ops_t vdev_raidz_scalar_impl; +extern boolean_t raidz_will_scalar_work(void); + +#if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_sse2_impl; +#endif +#if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_ssse3_impl; +#endif +#if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_avx2_impl; +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_avx512f_impl; +#endif +#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ +extern const raidz_impl_ops_t vdev_raidz_avx512bw_impl; +#endif +#if defined(__aarch64__) +extern const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl; +extern const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl; +#endif +#if defined(__powerpc__) +extern const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl; +#endif + +/* + * Commonly used raidz_map helpers + * + * raidz_parity Returns parity of the RAIDZ block + * raidz_ncols Returns number of columns the block spans + * raidz_nbigcols Returns number of big columns + * raidz_col_p Returns pointer to a column + * raidz_col_size Returns size of a column + * raidz_big_size Returns size of big columns + * raidz_short_size Returns size of short 
columns
+ */
+#define	raidz_parity(rm)	((rm)->rm_firstdatacol)
+#define	raidz_ncols(rm)		((rm)->rm_cols)
+#define	raidz_nbigcols(rm)	((rm)->rm_bigcols)
+#define	raidz_col_p(rm, c)	((rm)->rm_col + (c))
+#define	raidz_col_size(rm, c)	((rm)->rm_col[c].rc_size)
+#define	raidz_big_size(rm)	(raidz_col_size(rm, CODE_P))
+#define	raidz_short_size(rm)	(raidz_col_size(rm, raidz_ncols(rm)-1))
+
+/*
+ * Macro defines a RAIDZ parity generation method
+ *
+ * @code	parity the function produces
+ * @impl	name of the implementation
+ */
+#define	_RAIDZ_GEN_WRAP(code, impl)					\
+static void								\
+impl ## _gen_ ## code(void *rmp)					\
+{									\
+	raidz_map_t *rm = (raidz_map_t *)rmp;				\
+	raidz_generate_## code ## _impl(rm);				\
+}
+
+/*
+ * Macro defines a RAIDZ data reconstruction method
+ *
+ * @code	parity the function produces
+ * @impl	name of the implementation
+ */
+#define	_RAIDZ_REC_WRAP(code, impl)					\
+static int								\
+impl ## _rec_ ## code(void *rmp, const int *tgtidx)			\
+{									\
+	raidz_map_t *rm = (raidz_map_t *)rmp;				\
+	return (raidz_reconstruct_## code ## _impl(rm, tgtidx));	\
+}
+
+/*
+ * Define all gen methods for an implementation
+ *
+ * @impl	name of the implementation
+ */
+#define	DEFINE_GEN_METHODS(impl)					\
+	_RAIDZ_GEN_WRAP(p, impl);					\
+	_RAIDZ_GEN_WRAP(pq, impl);					\
+	_RAIDZ_GEN_WRAP(pqr, impl)
+
+/*
+ * Define all rec functions for an implementation
+ *
+ * @impl	name of the implementation
+ */
+#define	DEFINE_REC_METHODS(impl)					\
+	_RAIDZ_REC_WRAP(p, impl);					\
+	_RAIDZ_REC_WRAP(q, impl);					\
+	_RAIDZ_REC_WRAP(r, impl);					\
+	_RAIDZ_REC_WRAP(pq, impl);					\
+	_RAIDZ_REC_WRAP(pr, impl);					\
+	_RAIDZ_REC_WRAP(qr, impl);					\
+	_RAIDZ_REC_WRAP(pqr, impl)
+
+#define	RAIDZ_GEN_METHODS(impl)						\
+{									\
+	[RAIDZ_GEN_P] = & impl ## _gen_p,				\
+	[RAIDZ_GEN_PQ] = & impl ## _gen_pq,				\
+	[RAIDZ_GEN_PQR] = & impl ## _gen_pqr				\
+}
+
+#define	RAIDZ_REC_METHODS(impl)						\
+{									\
+	[RAIDZ_REC_P] = & impl ## _rec_p,				\
+	[RAIDZ_REC_Q] = & impl ## _rec_q,				\
+	[RAIDZ_REC_R] = & impl ## _rec_r,				\
+	[RAIDZ_REC_PQ] = & impl ## _rec_pq,				\
+	[RAIDZ_REC_PR] = & impl ## _rec_pr,				\
+	[RAIDZ_REC_QR] = & impl ## _rec_qr,				\
+	[RAIDZ_REC_PQR] = & impl ## _rec_pqr				\
+}
+
+
+typedef struct raidz_impl_kstat {
+	uint64_t gen[RAIDZ_GEN_NUM];	/* gen method speed B/s */
+	uint64_t rec[RAIDZ_REC_NUM];	/* rec method speed B/s */
+} raidz_impl_kstat_t;
+
+/*
+ * Enumerate various multiplication constants
+ * used in reconstruction methods
+ */
+typedef enum raidz_mul_info {
+	/* Reconstruct Q */
+	MUL_Q_X		= 0,
+	/* Reconstruct R */
+	MUL_R_X		= 0,
+	/* Reconstruct PQ */
+	MUL_PQ_X	= 0,
+	MUL_PQ_Y	= 1,
+	/* Reconstruct PR */
+	MUL_PR_X	= 0,
+	MUL_PR_Y	= 1,
+	/* Reconstruct QR */
+	MUL_QR_XQ	= 0,
+	MUL_QR_X	= 1,
+	MUL_QR_YQ	= 2,
+	MUL_QR_Y	= 3,
+	/* Reconstruct PQR */
+	MUL_PQR_XP	= 0,
+	MUL_PQR_XQ	= 1,
+	MUL_PQR_XR	= 2,
+	MUL_PQR_YU	= 3,
+	MUL_PQR_YP	= 4,
+	MUL_PQR_YQ	= 5,
+
+	MUL_CNT		= 6
+} raidz_mul_info_t;
+
+/*
+ * Powers of 2 in the Galois field.
+ */
+extern const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256)));
+/* Logs of 2 in the Galois field defined above. */
+extern const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256)));
+
+/*
+ * Multiply a given number by 2 raised to the given power.
+ */
+static inline uint8_t
+vdev_raidz_exp2(const uint8_t a, const unsigned exp)
+{
+	if (a == 0)
+		return (0);
+
+	return (vdev_raidz_pow2[(exp + (unsigned)vdev_raidz_log2[a]) % 255]);
+}
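To show how the wrapper and dispatch-table macros above compose, here is how a hypothetical implementation "myimpl" would be registered, modeled on the pattern the real implementations follow. Every myimpl_* symbol is assumed to exist; in particular, raidz_generate_*_impl() and raidz_reconstruct_*_impl() must be defined before the DEFINE_* lines expand.

	DEFINE_GEN_METHODS(myimpl);	/* expands to myimpl_gen_p/pq/pqr */
	DEFINE_REC_METHODS(myimpl);	/* expands to myimpl_rec_p/q/r/pq/pr/qr/pqr */

	const raidz_impl_ops_t vdev_raidz_myimpl_impl = {
		.init = NULL,
		.fini = NULL,
		.gen = RAIDZ_GEN_METHODS(myimpl),
		.rec = RAIDZ_REC_METHODS(myimpl),
		.is_supported = &myimpl_will_work,	/* runtime capability check */
		.name = "myimpl"
	};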
+
+/*
+ * Galois Field operations.
+ *
+ * gf_exp2 - computes 2 raised to the given power
+ * gf_exp4 - computes 4 raised to the given power
+ * gf_mul - multiplication
+ * gf_div - division
+ * gf_inv - multiplicative inverse
+ */
+typedef unsigned gf_t;
+typedef unsigned gf_log_t;
+
+static inline gf_t
+gf_mul(const gf_t a, const gf_t b)
+{
+	gf_log_t logsum;
+
+	if (a == 0 || b == 0)
+		return (0);
+
+	logsum = (gf_log_t)vdev_raidz_log2[a] + (gf_log_t)vdev_raidz_log2[b];
+
+	return ((gf_t)vdev_raidz_pow2[logsum % 255]);
+}
+
+static inline gf_t
+gf_div(const gf_t a, const gf_t b)
+{
+	gf_log_t logsum;
+
+	ASSERT3U(b, >, 0);
+	if (a == 0)
+		return (0);
+
+	logsum = (gf_log_t)255 + (gf_log_t)vdev_raidz_log2[a] -
+	    (gf_log_t)vdev_raidz_log2[b];
+
+	return ((gf_t)vdev_raidz_pow2[logsum % 255]);
+}
+
+static inline gf_t
+gf_inv(const gf_t a)
+{
+	gf_log_t logsum;
+
+	ASSERT3U(a, >, 0);
+
+	logsum = (gf_log_t)255 - (gf_log_t)vdev_raidz_log2[a];
+
+	return ((gf_t)vdev_raidz_pow2[logsum]);
+}
+
+static inline gf_t
+gf_exp2(gf_log_t exp)
+{
+	return (vdev_raidz_pow2[exp % 255]);
+}
+
+static inline gf_t
+gf_exp4(gf_log_t exp)
+{
+	ASSERT3U(exp, <=, 255);
+	return ((gf_t)vdev_raidz_pow2[(2 * exp) % 255]);
+}
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _VDEV_RAIDZ_H */
diff --git a/include/sys/vdev_rebuild.h b/include/sys/vdev_rebuild.h
new file mode 100644
index 000000000000..3d4b8cc46836
--- /dev/null
+++ b/include/sys/vdev_rebuild.h
@@ -0,0 +1,97 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2018, Intel Corporation.
+ * Copyright (c) 2020 by Lawrence Livermore National Security, LLC.
+ */
+
+#ifndef _SYS_VDEV_REBUILD_H
+#define	_SYS_VDEV_REBUILD_H
+
+#include
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * Number of entries in the physical vdev_rebuild_phys structure. This
+ * state is stored per top-level as VDEV_ZAP_TOP_VDEV_REBUILD_PHYS.
+ */
+#define	REBUILD_PHYS_ENTRIES	12
+
+/*
+ * On-disk rebuild configuration and state. When adding new fields they
+ * must be added to the end of the structure.
+typedef struct vdev_rebuild_phys { + uint64_t vrp_rebuild_state; /* vdev_rebuild_state_t */ + uint64_t vrp_last_offset; /* last rebuilt offset */ + uint64_t vrp_min_txg; /* minimum missing txg */ + uint64_t vrp_max_txg; /* maximum missing txg */ + uint64_t vrp_start_time; /* start time */ + uint64_t vrp_end_time; /* end time */ + uint64_t vrp_scan_time_ms; /* total run time in ms */ + uint64_t vrp_bytes_scanned; /* alloc bytes scanned */ + uint64_t vrp_bytes_issued; /* read bytes rebuilt */ + uint64_t vrp_bytes_rebuilt; /* rebuilt bytes */ + uint64_t vrp_bytes_est; /* total bytes to scan */ + uint64_t vrp_errors; /* errors during rebuild */ +} vdev_rebuild_phys_t; + +/* + * The vdev_rebuild_t describes the current state and how a top-level vdev + * should be rebuilt. The core elements are the top-vdev, the metaslab being + * rebuilt, range tree containing the allocated extents and the on-disk state. + */ +typedef struct vdev_rebuild { + vdev_t *vr_top_vdev; /* top-level vdev to rebuild */ + metaslab_t *vr_scan_msp; /* scanning disabled metaslab */ + range_tree_t *vr_scan_tree; /* scan ranges (in metaslab) */ + + /* In-core state and progress */ + uint64_t vr_scan_offset[TXG_SIZE]; + uint64_t vr_prev_scan_time_ms; /* any previous scan time */ + + /* Per-rebuild pass statistics for calculating bandwidth */ + uint64_t vr_pass_start_time; + uint64_t vr_pass_bytes_scanned; + uint64_t vr_pass_bytes_issued; + + /* On-disk state updated by vdev_rebuild_zap_update_sync() */ + vdev_rebuild_phys_t vr_rebuild_phys; +} vdev_rebuild_t; + +boolean_t vdev_rebuild_active(vdev_t *); + +int vdev_rebuild_load(vdev_t *); +void vdev_rebuild(vdev_t *); +void vdev_rebuild_stop_wait(vdev_t *); +void vdev_rebuild_stop_all(spa_t *); +void vdev_rebuild_restart(spa_t *); +void vdev_rebuild_clear_sync(void *, dmu_tx_t *); +int vdev_rebuild_get_stats(vdev_t *, vdev_rebuild_stat_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REBUILD_H */ diff --git a/include/sys/vdev_removal.h b/include/sys/vdev_removal.h new file mode 100644 index 000000000000..e3bab0658d62 --- /dev/null +++ b/include/sys/vdev_removal.h @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + */ + +#ifndef _SYS_VDEV_REMOVAL_H +#define _SYS_VDEV_REMOVAL_H + +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct spa_vdev_removal { + uint64_t svr_vdev_id; + uint64_t svr_max_offset_to_sync[TXG_SIZE]; + /* Thread performing a vdev removal. */ + kthread_t *svr_thread; + /* Segments left to copy from the current metaslab. */ + range_tree_t *svr_allocd_segs; + kmutex_t svr_lock; + kcondvar_t svr_cv; + boolean_t svr_thread_exit; + + /* + * New mappings to write out each txg. + */ + list_t svr_new_segments[TXG_SIZE]; + + /* + * Ranges that were freed while a mapping was in flight. This is + * a subset of the ranges covered by vdev_im_new_segments. + */ + range_tree_t *svr_frees[TXG_SIZE]; + + /* + * Number of bytes which we have finished our work for + * in each txg.
This could be data copied (which will be part of + * the mappings in vdev_im_new_segments), or data freed before + * we got around to copying it. + */ + uint64_t svr_bytes_done[TXG_SIZE]; + + /* List of leaf zap objects to be unlinked */ + nvlist_t *svr_zaplist; +} spa_vdev_removal_t; + +typedef struct spa_condensing_indirect { + /* + * New mappings to write out each txg. + */ + list_t sci_new_mapping_entries[TXG_SIZE]; + + vdev_indirect_mapping_t *sci_new_mapping; +} spa_condensing_indirect_t; + +extern int spa_remove_init(spa_t *); +extern void spa_restart_removal(spa_t *); +extern int spa_condense_init(spa_t *); +extern void spa_condense_fini(spa_t *); +extern void spa_start_indirect_condensing_thread(spa_t *); +extern void spa_vdev_condense_suspend(spa_t *); +extern int spa_vdev_remove(spa_t *, uint64_t, boolean_t); +extern void free_from_removing_vdev(vdev_t *, uint64_t, uint64_t); +extern int spa_removal_get_stats(spa_t *, pool_removal_stat_t *); +extern void svr_sync(spa_t *, dmu_tx_t *); +extern void spa_vdev_remove_suspend(spa_t *); +extern int spa_vdev_remove_cancel(spa_t *); +extern void spa_vdev_removal_destroy(spa_vdev_removal_t *); +extern uint64_t spa_remove_max_segment(spa_t *); + +extern int vdev_removal_max_span; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_REMOVAL_H */ diff --git a/include/sys/vdev_trim.h b/include/sys/vdev_trim.h new file mode 100644 index 000000000000..16f4be2a41f8 --- /dev/null +++ b/include/sys/vdev_trim.h @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2019 Lawrence Livermore National Security, LLC. 
+ */ + +#ifndef _SYS_VDEV_TRIM_H +#define _SYS_VDEV_TRIM_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern unsigned int zfs_trim_metaslab_skip; + +extern void vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, + boolean_t secure); +extern void vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt, list_t *vd_list); +extern void vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state); +extern void vdev_trim_stop_wait(spa_t *spa, list_t *vd_list); +extern void vdev_trim_restart(vdev_t *vd); +extern void vdev_autotrim(spa_t *spa); +extern void vdev_autotrim_stop_all(spa_t *spa); +extern void vdev_autotrim_stop_wait(vdev_t *vd); +extern void vdev_autotrim_restart(spa_t *spa); +extern int vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size); +extern void vdev_trim_l2arc(spa_t *spa); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VDEV_TRIM_H */ diff --git a/include/sys/xvattr.h b/include/sys/xvattr.h new file mode 100644 index 000000000000..0463bdfbc9d8 --- /dev/null +++ b/include/sys/xvattr.h @@ -0,0 +1,338 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1988, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +#ifndef _SYS_XVATTR_H +#define _SYS_XVATTR_H + +#include +#include + +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +/* + * Structure of all optional attributes. + */ +typedef struct xoptattr { + inode_timespec_t xoa_createtime; /* Create time of file */ + uint8_t xoa_archive; + uint8_t xoa_system; + uint8_t xoa_readonly; + uint8_t xoa_hidden; + uint8_t xoa_nounlink; + uint8_t xoa_immutable; + uint8_t xoa_appendonly; + uint8_t xoa_nodump; + uint8_t xoa_opaque; + uint8_t xoa_av_quarantined; + uint8_t xoa_av_modified; + uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; + uint8_t xoa_reparse; + uint64_t xoa_generation; + uint8_t xoa_offline; + uint8_t xoa_sparse; + uint8_t xoa_projinherit; + uint64_t xoa_projid; +} xoptattr_t; + +/* + * The xvattr structure is really a variable length structure that + * is made up of: + * - The classic vattr_t (xva_vattr) + * - a 32 bit quantity (xva_mapsize) that specifies the size of the + * attribute bitmaps in 32 bit words. + * - A pointer to the returned attribute bitmap (needed because the + * previous element, the requested attribute bitmap, is variable length).
+ * - The requested attribute bitmap, which is an array of 32 bit words. + * Callers use the XVA_SET_REQ() macro to set the bits corresponding to + * the attributes that are being requested. + * - The returned attribute bitmap, which is an array of 32 bit words. + * File systems that support optional attributes use the XVA_SET_RTN() + * macro to set the bits corresponding to the attributes that are being + * returned. + * - The xoptattr_t structure which contains the attribute values + * + * xva_mapsize determines how many words are in the attribute bitmaps. + * Immediately following the attribute bitmaps is the xoptattr_t. + * xva_getxoptattr() is used to get the pointer to the xoptattr_t + * section. + */ + +#define XVA_MAPSIZE 3 /* Size of attr bitmaps */ +#define XVA_MAGIC 0x78766174 /* Magic # for verification */ + +/* + * The xvattr structure is an extensible structure which permits optional + * attributes to be requested/returned. File systems may or may not support + * optional attributes. They do so at their own discretion but if they do + * support optional attributes, they must register the VFSFT_XVATTR feature + * so that the optional attributes can be set/retrieved. + * + * The fields of the xvattr structure are: + * + * xva_vattr - The first element of an xvattr is a legacy vattr structure + * which includes the common attributes. If AT_XVATTR is set in the va_mask + * then the entire structure is treated as an xvattr. If AT_XVATTR is not + * set, then only the xva_vattr structure can be used. + * + * xva_magic - 0x78766174 (hex for "xvat"). Magic number for verification. + * + * xva_mapsize - Size of requested and returned attribute bitmaps. + * + * xva_rtnattrmapp - Pointer to xva_rtnattrmap[]. We need this since the + * size of the array before it, xva_reqattrmap[], could change which means + * the location of xva_rtnattrmap[] could change. This will allow unbundled + * file systems to find the location of xva_rtnattrmap[] when the sizes change. + * + * xva_reqattrmap[] - Array of requested attributes. Attributes are + * represented by a specific bit in a specific element of the attribute + * map array. Callers set the bits corresponding to the attributes + * that the caller wants to get/set. + * + * xva_rtnattrmap[] - Array of attributes that the file system was able to + * process. Not all file systems support all optional attributes. This map + * informs the caller which attributes the underlying file system was able + * to set/get. (Same structure as the requested attributes array in terms + * of each attribute corresponding to specific bits and array elements.) + * + * xva_xoptattrs - Structure containing values of optional attributes. + * These values are only valid if the corresponding bits in xva_reqattrmap + * are set and the underlying file system supports those attributes. + */ +typedef struct xvattr { + vattr_t xva_vattr; /* Embedded vattr structure */ + uint32_t xva_magic; /* Magic Number */ + uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ + uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ + uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ + uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ + xoptattr_t xva_xoptattrs; /* Optional attributes */ +} xvattr_t;
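Putting the structure to work (a sketch): request one optional attribute with the XAT_* names and XVA_* macros defined below, call a getattr entry point, and read the value back only if the file system flagged it in the returned bitmap. zfs_getattr() here is a hypothetical stand-in for whatever VOP the caller actually has:

	static int
	get_readonly_flag(struct vnode *vp, boolean_t *ro)
	{
		xvattr_t xva;
		xoptattr_t *xoap;
		int err;

		xva_init(&xva);				/* zeroes, sets magic + mask */
		XVA_SET_REQ(&xva, XAT_READONLY);	/* request one optional attr */

		if ((err = zfs_getattr(vp, &xva.xva_vattr, 0, CRED())) != 0)
			return (err);

		/* Only trust the value if it shows up in the returned bitmap. */
		if (XVA_ISSET_RTN(&xva, XAT_READONLY) &&
		    (xoap = xva_getxoptattr(&xva)) != NULL)
			*ro = (xoap->xoa_readonly != 0);
		return (0);
	}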
+ +/* + * Attribute bits used in the extensible attribute's (xva's) attribute + * bitmaps. Note that the bitmaps are made up of a variable length number + * of 32-bit words. The convention is to use XAT{n}_{attrname} where "n" + * is the element in the bitmap (starting at 0). This convention is for + * the convenience of the maintainer to keep track of which element each + * attribute belongs to. + * + * NOTE THAT CONSUMERS MUST *NOT* USE THE XATn_* DEFINES DIRECTLY. CONSUMERS + * MUST USE THE XAT_* DEFINES. + */ +#define XAT0_INDEX 0LL /* Index into bitmap for XAT0 attrs */ +#define XAT0_CREATETIME 0x00000001 /* Create time of file */ +#define XAT0_ARCHIVE 0x00000002 /* Archive */ +#define XAT0_SYSTEM 0x00000004 /* System */ +#define XAT0_READONLY 0x00000008 /* Readonly */ +#define XAT0_HIDDEN 0x00000010 /* Hidden */ +#define XAT0_NOUNLINK 0x00000020 /* Nounlink */ +#define XAT0_IMMUTABLE 0x00000040 /* immutable */ +#define XAT0_APPENDONLY 0x00000080 /* appendonly */ +#define XAT0_NODUMP 0x00000100 /* nodump */ +#define XAT0_OPAQUE 0x00000200 /* opaque */ +#define XAT0_AV_QUARANTINED 0x00000400 /* anti-virus quarantine */ +#define XAT0_AV_MODIFIED 0x00000800 /* anti-virus modified */ +#define XAT0_AV_SCANSTAMP 0x00001000 /* anti-virus scanstamp */ +#define XAT0_REPARSE 0x00002000 /* FS reparse point */ +#define XAT0_GEN 0x00004000 /* object generation number */ +#define XAT0_OFFLINE 0x00008000 /* offline */ +#define XAT0_SPARSE 0x00010000 /* sparse */ +#define XAT0_PROJINHERIT 0x00020000 /* Create with parent projid */ +#define XAT0_PROJID 0x00040000 /* Project ID */ + +#define XAT0_ALL_ATTRS (XAT0_CREATETIME|XAT0_ARCHIVE|XAT0_SYSTEM| \ + XAT0_READONLY|XAT0_HIDDEN|XAT0_NOUNLINK|XAT0_IMMUTABLE|XAT0_APPENDONLY| \ + XAT0_NODUMP|XAT0_OPAQUE|XAT0_AV_QUARANTINED|XAT0_AV_MODIFIED| \ + XAT0_AV_SCANSTAMP|XAT0_REPARSE|XAT0_GEN|XAT0_OFFLINE|XAT0_SPARSE| \ + XAT0_PROJINHERIT|XAT0_PROJID) + +/* Support for XAT_* optional attributes */ +#define XVA_MASK 0xffffffff /* Used to mask off 32 bits */ +#define XVA_SHFT 32 /* Used to shift index */ + +/* + * Used to pry out the index and attribute bits from the XAT_* attributes + * defined below. Note that we're masking things down to 32 bits then + * casting to uint32_t. + */ +#define XVA_INDEX(attr) ((uint32_t)(((attr) >> XVA_SHFT) & XVA_MASK)) +#define XVA_ATTRBIT(attr) ((uint32_t)((attr) & XVA_MASK)) + +/* + * The following defines present a "flat namespace" so that consumers don't + * need to keep track of which element belongs to which bitmap entry.
+ * + * NOTE THAT THESE MUST NEVER BE OR-ed TOGETHER + */ +#define XAT_CREATETIME ((XAT0_INDEX << XVA_SHFT) | XAT0_CREATETIME) +#define XAT_ARCHIVE ((XAT0_INDEX << XVA_SHFT) | XAT0_ARCHIVE) +#define XAT_SYSTEM ((XAT0_INDEX << XVA_SHFT) | XAT0_SYSTEM) +#define XAT_READONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_READONLY) +#define XAT_HIDDEN ((XAT0_INDEX << XVA_SHFT) | XAT0_HIDDEN) +#define XAT_NOUNLINK ((XAT0_INDEX << XVA_SHFT) | XAT0_NOUNLINK) +#define XAT_IMMUTABLE ((XAT0_INDEX << XVA_SHFT) | XAT0_IMMUTABLE) +#define XAT_APPENDONLY ((XAT0_INDEX << XVA_SHFT) | XAT0_APPENDONLY) +#define XAT_NODUMP ((XAT0_INDEX << XVA_SHFT) | XAT0_NODUMP) +#define XAT_OPAQUE ((XAT0_INDEX << XVA_SHFT) | XAT0_OPAQUE) +#define XAT_AV_QUARANTINED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_QUARANTINED) +#define XAT_AV_MODIFIED ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_MODIFIED) +#define XAT_AV_SCANSTAMP ((XAT0_INDEX << XVA_SHFT) | XAT0_AV_SCANSTAMP) +#define XAT_REPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_REPARSE) +#define XAT_GEN ((XAT0_INDEX << XVA_SHFT) | XAT0_GEN) +#define XAT_OFFLINE ((XAT0_INDEX << XVA_SHFT) | XAT0_OFFLINE) +#define XAT_SPARSE ((XAT0_INDEX << XVA_SHFT) | XAT0_SPARSE) +#define XAT_PROJINHERIT ((XAT0_INDEX << XVA_SHFT) | XAT0_PROJINHERIT) +#define XAT_PROJID ((XAT0_INDEX << XVA_SHFT) | XAT0_PROJID) + +/* + * The returned attribute map array (xva_rtnattrmap[]) is located past the + * requested attribute map array (xva_reqattrmap[]). Its location changes + * when the array sizes change. We use a separate pointer in a known location + * (xva_rtnattrmapp) to hold the location of xva_rtnattrmap[]. This is + * set in xva_init(). + */ +#define XVA_RTNATTRMAP(xvap) ((xvap)->xva_rtnattrmapp) + +/* + * XVA_SET_REQ() sets an attribute bit in the proper element in the bitmap + * of requested attributes (xva_reqattrmap[]). + */ +#define XVA_SET_REQ(xvap, attr) \ + ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \ + ASSERT((xvap)->xva_magic == XVA_MAGIC); \ + (xvap)->xva_reqattrmap[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) +/* + * XVA_CLR_REQ() clears an attribute bit in the proper element in the bitmap + * of requested attributes (xva_reqattrmap[]). + */ +#define XVA_CLR_REQ(xvap, attr) \ + ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \ + ASSERT((xvap)->xva_magic == XVA_MAGIC); \ + (xvap)->xva_reqattrmap[XVA_INDEX(attr)] &= ~XVA_ATTRBIT(attr) + +/* + * XVA_SET_RTN() sets an attribute bit in the proper element in the bitmap + * of returned attributes (xva_rtnattrmap[]). + */ +#define XVA_SET_RTN(xvap, attr) \ + ASSERT((xvap)->xva_vattr.va_mask & AT_XVATTR); \ + ASSERT((xvap)->xva_magic == XVA_MAGIC); \ + (XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] |= XVA_ATTRBIT(attr) + +/* + * XVA_ISSET_REQ() checks the requested attribute bitmap (xva_reqattrmap[]) + * to see if the corresponding attribute bit is set. If so, returns non-zero. + */ +#define XVA_ISSET_REQ(xvap, attr) \ + ((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \ + ((xvap)->xva_magic == XVA_MAGIC) && \ + ((xvap)->xva_mapsize > XVA_INDEX(attr))) ? \ + ((xvap)->xva_reqattrmap[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) + +/* + * XVA_ISSET_RTN() checks the returned attribute bitmap (xva_rtnattrmap[]) + * to see if the corresponding attribute bit is set. If so, returns non-zero. + */ +#define XVA_ISSET_RTN(xvap, attr) \ + ((((xvap)->xva_vattr.va_mask & AT_XVATTR) && \ + ((xvap)->xva_magic == XVA_MAGIC) && \ + ((xvap)->xva_mapsize > XVA_INDEX(attr))) ?
\ + ((XVA_RTNATTRMAP(xvap))[XVA_INDEX(attr)] & XVA_ATTRBIT(attr)) : 0) + +/* + * Zero out the structure, set the size of the requested/returned bitmaps, + * set AT_XVATTR in the embedded vattr_t's va_mask, and set up the pointer + * to the returned attributes array. + */ +static inline void +xva_init(xvattr_t *xvap) +{ + bzero(xvap, sizeof (xvattr_t)); + xvap->xva_mapsize = XVA_MAPSIZE; + xvap->xva_magic = XVA_MAGIC; + xvap->xva_vattr.va_mask = ATTR_XVATTR; + xvap->xva_rtnattrmapp = &(xvap->xva_rtnattrmap)[0]; +} + +/* + * If AT_XVATTR is set, returns a pointer to the embedded xoptattr_t + * structure. Otherwise, returns NULL. + */ +static inline xoptattr_t * +xva_getxoptattr(xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + if (xvap->xva_vattr.va_mask & AT_XVATTR) + xoap = &xvap->xva_xoptattrs; + return (xoap); +} + +#define MODEMASK 07777 /* mode bits plus permission bits */ +#define PERMMASK 00777 /* permission bits */ + +/* + * VOP_ACCESS flags + */ +#define V_ACE_MASK 0x1 /* mask represents NFSv4 ACE permissions */ +#define V_APPEND 0x2 /* want to do append only check */ + +/* + * Structure used on VOP_GETSECATTR and VOP_SETSECATTR operations + */ + +typedef struct vsecattr { + uint_t vsa_mask; /* See below */ + int vsa_aclcnt; /* ACL entry count */ + void *vsa_aclentp; /* pointer to ACL entries */ + int vsa_dfaclcnt; /* default ACL entry count */ + void *vsa_dfaclentp; /* pointer to default ACL entries */ + size_t vsa_aclentsz; /* ACE size in bytes of vsa_aclentp */ + uint_t vsa_aclflags; /* ACE ACL flags */ +} vsecattr_t; + +/* vsa_mask values */ +#define VSA_ACL 0x0001 +#define VSA_ACLCNT 0x0002 +#define VSA_DFACL 0x0004 +#define VSA_DFACLCNT 0x0008 +#define VSA_ACE 0x0010 +#define VSA_ACECNT 0x0020 +#define VSA_ACE_ALLTYPES 0x0040 +#define VSA_ACE_ACLFLAGS 0x0080 /* get/set ACE ACL flags */ + +#endif /* _SYS_XVATTR_H */ diff --git a/include/sys/zap.h b/include/sys/zap.h new file mode 100644 index 000000000000..b19b4643879c --- /dev/null +++ b/include/sys/zap.h @@ -0,0 +1,510 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#ifndef _SYS_ZAP_H +#define _SYS_ZAP_H + +/* + * ZAP - ZFS Attribute Processor + * + * The ZAP is a module which sits on top of the DMU (Data Management + * Unit) and implements a higher-level storage primitive using DMU + * objects. Its primary consumer is the ZPL (ZFS Posix Layer). + * + * A "zapobj" is a DMU object which the ZAP uses to store attributes. + * Users should use only zap routines to access a zapobj - they should + * not access the DMU object directly using DMU routines.
+ * + * The attributes stored in a zapobj are name-value pairs. The name is + * a zero-terminated string of up to ZAP_MAXNAMELEN bytes (including + * terminating NULL). The value is an array of integers, which may be + * 1, 2, 4, or 8 bytes long. The total space used by the array (number + * of integers * integer length) can be up to ZAP_MAXVALUELEN bytes. + * Note that an 8-byte integer value can be used to store the location + * (object number) of another dmu object (which may be itself a zapobj). + * Note that you can use a zero-length attribute to store a single bit + * of information - the attribute is present or not. + * + * The ZAP routines are thread-safe. However, you must observe the + * DMU's restriction that a transaction may not be operated on + * concurrently. + * + * Any of the routines that return an int may return an I/O error (EIO + * or ECHECKSUM). + * + * + * Implementation / Performance Notes: + * + * The ZAP is intended to operate most efficiently on attributes with + * short (49 bytes or less) names and single 8-byte values, for which + * the microzap will be used. The ZAP should be efficient enough so + * that the user does not need to cache these attributes. + * + * The ZAP's locking scheme makes its routines thread-safe. Operations + * on different zapobjs will be processed concurrently. Operations on + * the same zapobj which only read data will be processed concurrently. + * Operations on the same zapobj which modify data will be processed + * concurrently when there are many attributes in the zapobj (because + * the ZAP uses per-block locking - more than 128 * (number of cpus) + * small attributes will suffice). + */ + +/* + * We're using zero-terminated byte strings (ie. ASCII or UTF-8 C + * strings) for the names of attributes, rather than a byte string + * bounded by an explicit length. If some day we want to support names + * in character sets which have embedded zeros (eg. UTF-16, UTF-32), + * we'll have to add routines for using length-bounded strings. + */ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Specifies matching criteria for ZAP lookups. + * MT_NORMALIZE Use ZAP normalization flags, which can include both + * unicode normalization and case-insensitivity. + * MT_MATCH_CASE Do case-sensitive lookups even if MT_NORMALIZE is + * specified and ZAP normalization flags include + * U8_TEXTPREP_TOUPPER. + */ +typedef enum matchtype { + MT_NORMALIZE = 1 << 0, + MT_MATCH_CASE = 1 << 1, +} matchtype_t; + +typedef enum zap_flags { + /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ + ZAP_FLAG_HASH64 = 1 << 0, + /* Key is binary, not string (zap_add_uint64() can be used) */ + ZAP_FLAG_UINT64_KEY = 1 << 1, + /* + * First word of key (which must be an array of uint64) is + * already randomly distributed. + */ + ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, +} zap_flags_t; + +/* + * Create a new zapobj with no attributes and return its object number. 
+ */ +uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_dnsize(objset_t *ds, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); +uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_norm_dnsize(objset_t *ds, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx); +uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags_dnsize(objset_t *os, int normflags, + zap_flags_t flags, dmu_object_type_t ot, int leaf_blockshift, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx); +uint64_t zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx); + +uint64_t zap_create_link(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, dmu_tx_t *tx); +uint64_t zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, + uint64_t parent_obj, const char *name, int dnodesize, dmu_tx_t *tx); + +/* + * Initialize an already-allocated object. + */ +void mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, + dmu_tx_t *tx); + +/* + * Create a new zapobj with no attributes from the given (unallocated) + * object number. + */ +int zap_create_claim(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_dnsize(objset_t *ds, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); +int zap_create_claim_norm(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +int zap_create_claim_norm_dnsize(objset_t *ds, uint64_t obj, + int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx); + +/* + * The zapobj passed in must be a valid ZAP object for all of the + * following routines. + */ + +/* + * Destroy this zapobj and all its attributes. + * + * Frees the object number using dmu_object_free. + */ +int zap_destroy(objset_t *ds, uint64_t zapobj, dmu_tx_t *tx); + +/* + * Manipulate attributes. + * + * 'integer_size' is in bytes, and must be 1, 2, 4, or 8. + */ + +/* + * Retrieve the contents of the attribute with the given name. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + * + * If 'integer_size' is smaller than the attribute's integer size, the + * call will fail and return EINVAL. + * + * If 'integer_size' is equal to or larger than the attribute's integer + * size, the call will succeed and return 0. + * + * When converting to a larger integer size, the integers will be treated as + * unsigned (ie. no sign-extension will be performed). + * + * 'num_integers' is the length (in integers) of 'buf'. + * + * If the attribute is longer than the buffer, as many integers as will + * fit will be transferred to 'buf'. If the entire attribute was not + * transferred, the call will return EOVERFLOW. 
+ */ +int zap_lookup(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); + +/* + * If rn_len is nonzero, realname will be set to the name of the found + * entry (which may be different from the requested name if matchtype is + * not MT_EXACT). + * + * If normalization_conflictp is not NULL, it will be set if there is + * another name with the same case/unicode normalized form. + */ +int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *normalization_conflictp); +int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); +int zap_prefetch(objset_t *os, uint64_t zapobj, const char *name); +int zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints); + +int zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp); + +int zap_count_write_by_dnode(dnode_t *dn, const char *name, + int add, zfs_refcount_t *towrite, zfs_refcount_t *tooverwrite); + +/* + * Create an attribute with the given name and value. + * + * If an attribute with the given name already exists, the call will + * fail and return EEXIST. + */ +int zap_add(objset_t *ds, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); +int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); + +/* + * Set the attribute with the given name to the given value. If an + * attribute with the given name does not exist, it will be created. If + * an attribute with the given name already exists, the previous value + * will be overwritten. The integer_size may be different from the + * existing attribute's integer size, in which case the attribute's + * integer size will be updated to the new value. + */ +int zap_update(objset_t *ds, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); + +/* + * Get the length (in integers) and the integer size of the specified + * attribute. + * + * If the requested attribute does not exist, the call will fail and + * return ENOENT. + */ +int zap_length(objset_t *ds, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers); +int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers); + +/* + * Remove the specified attribute. + * + * If the specified attribute does not exist, the call will fail and + * return ENOENT. 
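A round trip over the interfaces above, exercising the integer-size rules from the zap_lookup() comment (a sketch: DMU_OT_ZAP_OTHER is one of the generic ZAP object types, the VERIFY macros are the usual ZFS assertions, and 'os'/'tx' come from standard objset/transaction plumbing):

	static void
	zap_round_trip(objset_t *os, dmu_tx_t *tx)
	{
		uint64_t obj, val = 42, wide[2];
		uint32_t narrow;

		obj = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx);
		VERIFY0(zap_add(os, obj, "answer", sizeof (uint64_t), 1,
		    &val, tx));

		/* Exact integer size: succeeds. */
		VERIFY0(zap_lookup(os, obj, "answer", sizeof (uint64_t), 1,
		    &val));

		/* integer_size smaller than the stored 8 bytes: EINVAL. */
		VERIFY3S(zap_lookup(os, obj, "answer", sizeof (uint32_t), 1,
		    &narrow), ==, EINVAL);

		/* Oversized buffer is fine; a too-short one is EOVERFLOW. */
		VERIFY0(zap_lookup(os, obj, "answer", sizeof (uint64_t), 2,
		    wide));

		/* Adding a duplicate name fails with EEXIST. */
		VERIFY3S(zap_add(os, obj, "answer", sizeof (uint64_t), 1,
		    &val, tx), ==, EEXIST);
	}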
+ */ +int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); +int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx); +int zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx); +int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx); + +/* + * Returns (in *count) the number of attributes in the specified zap + * object. + */ +int zap_count(objset_t *ds, uint64_t zapobj, uint64_t *count); + +/* + * Returns (in name) the name of the entry whose (value & mask) + * (za_first_integer) is value, or ENOENT if not found. The string + * pointed to by name must be at least 256 bytes long. If mask==0, the + * match must be exact (ie, same as mask=-1ULL). + */ +int zap_value_search(objset_t *os, uint64_t zapobj, + uint64_t value, uint64_t mask, char *name); + +/* + * Transfer all the entries from fromobj into intoobj. Only works on + * int_size=8 num_integers=1 values. Fails if there are any duplicated + * entries. + */ +int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); + +/* Same as zap_join, but set the values to 'value'. */ +int zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx); + +/* Same as zap_join, but add together any duplicated entries. */ +int zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx); + +/* + * Manipulate entries where the name + value are the "same" (the name is + * a stringified version of the value). + */ +int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); + +/* Here the key is an int and the value is a different int. */ +int zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx); +int zap_lookup_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t *valuep); + +int zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx); + +struct zap; +struct zap_leaf; +typedef struct zap_cursor { + /* This structure is opaque! */ + objset_t *zc_objset; + struct zap *zc_zap; + struct zap_leaf *zc_leaf; + uint64_t zc_zapobj; + uint64_t zc_serialized; + uint64_t zc_hash; + uint32_t zc_cd; + boolean_t zc_prefetch; +} zap_cursor_t; + +typedef struct { + int za_integer_length; + /* + * za_normalization_conflict will be set if there are additional + * entries with this normalized form (eg, "foo" and "Foo"). + */ + boolean_t za_normalization_conflict; + uint64_t za_num_integers; + uint64_t za_first_integer; /* no sign extension for <8byte ints */ + char za_name[ZAP_MAXNAMELEN]; +} zap_attribute_t; + +/* + * The interface for listing all the attributes of a zapobj can be + * thought of as cursor moving down a list of the attributes one by + * one. The cookie returned by the zap_cursor_serialize routine is + * persistent across system calls (and across reboot, even). + */ + +/* + * Initialize a zap cursor, pointing to the "first" attribute of the + * zapobj. You must _fini the cursor when you are done with it. 
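The canonical loop for the cursor declarations that follow (a sketch; the printf() stands in for real per-entry work and assumes a userland consumer such as libzpool):

	static void
	zap_dump(objset_t *os, uint64_t zapobj)
	{
		zap_cursor_t zc;
		zap_attribute_t za;

		for (zap_cursor_init(&zc, os, zapobj);
		    zap_cursor_retrieve(&zc, &za) == 0;
		    zap_cursor_advance(&zc)) {
			printf("%s = %llu\n", za.za_name,
			    (u_longlong_t)za.za_first_integer);
		}
		zap_cursor_fini(&zc);	/* always _fini, even on early exit */
	}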
+ */ +void zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj); +void zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, + uint64_t zapobj); +void zap_cursor_fini(zap_cursor_t *zc); + +/* + * Get the attribute currently pointed to by the cursor. Returns + * ENOENT if at the end of the attributes. + */ +int zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za); + +/* + * Advance the cursor to the next attribute. + */ +void zap_cursor_advance(zap_cursor_t *zc); + +/* + * Get a persistent cookie pointing to the current position of the zap + * cursor. The low 4 bits in the cookie are always zero, and thus can + * be used to differentiate a serialized cookie from a different type + * of value. The cookie will be less than 2^32 as long as there are + * fewer than 2^22 (4.2 million) entries in the zap object. + */ +uint64_t zap_cursor_serialize(zap_cursor_t *zc); + +/* + * Initialize a zap cursor pointing to the position recorded by + * zap_cursor_serialize (in the "serialized" argument). You can also + * use a "serialized" argument of 0 to start at the beginning of the + * zapobj (ie. zap_cursor_init_serialized(..., 0) is equivalent to + * zap_cursor_init(...).) + */ +void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *ds, + uint64_t zapobj, uint64_t serialized); + + +#define ZAP_HISTOGRAM_SIZE 10 + +typedef struct zap_stats { + /* + * Size of the pointer table (in number of entries). + * This is always a power of 2, or zero if it's a microzap. + * In general, it should be considerably greater than zs_num_leafs. + */ + uint64_t zs_ptrtbl_len; + + uint64_t zs_blocksize; /* size of zap blocks */ + + /* + * The number of blocks used. Note that some blocks may be + * wasted because old ptrtbl's and large name/value blocks are + * not reused. (Although their space is reclaimed, we don't + * reuse those offsets in the object.) + */ + uint64_t zs_num_blocks; + + /* + * Pointer table values from zap_ptrtbl in the zap_phys_t + */ + uint64_t zs_ptrtbl_nextblk; /* next (larger) copy start block */ + uint64_t zs_ptrtbl_blks_copied; /* number source blocks copied */ + uint64_t zs_ptrtbl_zt_blk; /* starting block number */ + uint64_t zs_ptrtbl_zt_numblks; /* number of blocks */ + uint64_t zs_ptrtbl_zt_shift; /* bits to index it */ + + /* + * Values of the other members of the zap_phys_t + */ + uint64_t zs_block_type; /* ZBT_HEADER */ + uint64_t zs_magic; /* ZAP_MAGIC */ + uint64_t zs_num_leafs; /* The number of leaf blocks */ + uint64_t zs_num_entries; /* The number of zap entries */ + uint64_t zs_salt; /* salt to stir into hash function */ + + /* + * Histograms. For all histograms, the last index + * (ZAP_HISTOGRAM_SIZE-1) includes any values which are greater + * than what can be represented. For example + * zs_leafs_with_n5_entries[ZAP_HISTOGRAM_SIZE-1] is the number + * of leafs with more than 45 entries. + */ + + /* + * zs_leafs_with_n_pointers[n] is the number of leafs with + * 2^n pointers to it. + */ + uint64_t zs_leafs_with_2n_pointers[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_with_n_entries[n] is the number of leafs with + * [n*5, (n+1)*5) entries. In the current implementation, there + * can be at most 55 entries in any block, but there may be + * fewer if the name or value is large, or the block is not + * completely full. + */ + uint64_t zs_blocks_with_n5_entries[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_leafs_n_tenths_full[n] is the number of leafs whose + * fullness is in the range [n/10, (n+1)/10).
+ */ + uint64_t zs_blocks_n_tenths_full[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_entries_using_n_chunks[n] is the number of entries which + * consume n 24-byte chunks. (Note, large names/values only use + * one chunk, but contribute to zs_num_blocks_large.) + */ + uint64_t zs_entries_using_n_chunks[ZAP_HISTOGRAM_SIZE]; + + /* + * zs_buckets_with_n_entries[n] is the number of buckets (each + * leaf has 64 buckets) with n entries. + * zs_buckets_with_n_entries[1] should be very close to + * zs_num_entries. + */ + uint64_t zs_buckets_with_n_entries[ZAP_HISTOGRAM_SIZE]; +} zap_stats_t; + +/* + * Get statistics about a ZAP object. Note: you need to be aware of the + * internal implementation of the ZAP to correctly interpret some of the + * statistics. This interface shouldn't be relied on unless you really + * know what you're doing. + */ +int zap_get_stats(objset_t *ds, uint64_t zapobj, zap_stats_t *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_H */ diff --git a/include/sys/zap_impl.h b/include/sys/zap_impl.h new file mode 100644 index 000000000000..250dde3ce235 --- /dev/null +++ b/include/sys/zap_impl.h @@ -0,0 +1,240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#ifndef _SYS_ZAP_IMPL_H +#define _SYS_ZAP_IMPL_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern int fzap_default_block_shift; + +#define ZAP_MAGIC 0x2F52AB2ABULL + +#define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) + +#define MZAP_ENT_LEN 64 +#define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) +#define MZAP_MAX_BLKSZ SPA_OLD_MAXBLOCKSIZE + +#define ZAP_NEED_CD (-1U) + +typedef struct mzap_ent_phys { + uint64_t mze_value; + uint32_t mze_cd; + uint16_t mze_pad; /* in case we want to chain them someday */ + char mze_name[MZAP_NAME_LEN]; +} mzap_ent_phys_t; + +typedef struct mzap_phys { + uint64_t mz_block_type; /* ZBT_MICRO */ + uint64_t mz_salt; + uint64_t mz_normflags; + uint64_t mz_pad[5]; + mzap_ent_phys_t mz_chunk[1]; + /* actually variable size depending on block size */ +} mzap_phys_t; + +typedef struct mzap_ent { + avl_node_t mze_node; + int mze_chunkid; + uint64_t mze_hash; + uint32_t mze_cd; /* copy from mze_phys->mze_cd */ +} mzap_ent_t; + +#define MZE_PHYS(zap, mze) \ + (&zap_m_phys(zap)->mz_chunk[(mze)->mze_chunkid]) + +/* + * The (fat) zap is stored in one object. It is an array of + * 1<<FZAP_BLOCK_SHIFT byte blocks. The layout looks like one of: + * + * ptrtbl fits in first block: + * [zap_phys_t zap_ptrtbl_shift < 6] [zap_leaf_t] ... + * + * ptrtbl too big for first block: + * [zap_phys_t zap_ptrtbl_shift >= 6] [zap_leaf_t] [ptrtbl] ... + * + */
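Quick arithmetic on the microzap constants above: names are limited to MZAP_NAME_LEN (50) bytes including the terminating NUL, values are a single uint64, and a block holds blocksize/64 - 1 entries because the first 64 bytes hold the mzap_phys_t header. A standalone check of that math, with the constants restated locally so it compiles on its own:

	#include <stdio.h>

	#define MZAP_ENT_LEN	64
	#define MZAP_NAME_LEN	(MZAP_ENT_LEN - 8 - 4 - 2)	/* 50 */

	int
	main(void)
	{
		int blksz = 128 * 1024;		/* SPA_OLD_MAXBLOCKSIZE */

		printf("name limit %d bytes, %d entries per %d-byte block\n",
		    MZAP_NAME_LEN, blksz / MZAP_ENT_LEN - 1, blksz);
		return (0);
	}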
+ +struct dmu_buf; +struct zap_leaf; + +#define ZBT_LEAF ((1ULL << 63) + 0) +#define ZBT_HEADER ((1ULL << 63) + 1) +#define ZBT_MICRO ((1ULL << 63) + 3) +/* any other values are ptrtbl blocks */ + +/* + * the embedded pointer table takes up half a block: + * block size / entry size (2^3) / 2 + */ +#define ZAP_EMBEDDED_PTRTBL_SHIFT(zap) (FZAP_BLOCK_SHIFT(zap) - 3 - 1) + +/* + * The embedded pointer table starts half-way through the block. Since + * the pointer table itself is half the block, it starts at (64-bit) + * word number (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)). + */ +#define ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) \ + ((uint64_t *)zap_f_phys(zap)) \ + [(idx) + (1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap))] + +/* + * TAKE NOTE: + * If zap_phys_t is modified, zap_byteswap() must be modified. + */ +typedef struct zap_table_phys { + uint64_t zt_blk; /* starting block number */ + uint64_t zt_numblks; /* number of blocks */ + uint64_t zt_shift; /* bits to index it */ + uint64_t zt_nextblk; /* next (larger) copy start block */ + uint64_t zt_blks_copied; /* number source blocks copied */ +} zap_table_phys_t; + +typedef struct zap_phys { + uint64_t zap_block_type; /* ZBT_HEADER */ + uint64_t zap_magic; /* ZAP_MAGIC */ + + zap_table_phys_t zap_ptrtbl; + + uint64_t zap_freeblk; /* the next free block */ + uint64_t zap_num_leafs; /* number of leafs */ + uint64_t zap_num_entries; /* number of entries */ + uint64_t zap_salt; /* salt to stir into hash function */ + uint64_t zap_normflags; /* flags for u8_textprep_str() */ + uint64_t zap_flags; /* zap_flags_t */ + /* + * This structure is followed by padding, and then the embedded + * pointer table. The embedded pointer table takes up the second + * half of the block. It is accessed using the + * ZAP_EMBEDDED_PTRTBL_ENT() macro. + */ +} zap_phys_t; + +typedef struct zap { + dmu_buf_user_t zap_dbu; + objset_t *zap_objset; + uint64_t zap_object; + struct dmu_buf *zap_dbuf; + krwlock_t zap_rwlock; + boolean_t zap_ismicro; + int zap_normflags; + uint64_t zap_salt; + union { + struct { + uint64_t zap_block_shift; /* block size shift */ + } zap_fat; + struct { + int16_t zap_num_entries; + int16_t zap_num_chunks; + int16_t zap_alloc_next; + avl_tree_t zap_avl; + } zap_micro; + } zap_u; +} zap_t; + +static inline zap_phys_t * +zap_f_phys(zap_t *zap) +{ + return (zap->zap_dbuf->db_data); +} + +static inline mzap_phys_t * +zap_m_phys(zap_t *zap) +{ + return (zap->zap_dbuf->db_data); +} + +typedef struct zap_name { + zap_t *zn_zap; + int zn_key_intlen; + const void *zn_key_orig; + int zn_key_orig_numints; + const void *zn_key_norm; + int zn_key_norm_numints; + uint64_t zn_hash; + matchtype_t zn_matchtype; + int zn_normflags; + char zn_normbuf[ZAP_MAXNAMELEN]; +} zap_name_t; + +#define zap_f zap_u.zap_fat +#define zap_m zap_u.zap_micro + +boolean_t zap_match(zap_name_t *zn, const char *matchname); +int zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp); +void zap_unlockdir(zap_t *zap, void *tag); +void zap_evict_sync(void *dbu); +zap_name_t *zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt); +void zap_name_free(zap_name_t *zn); +int zap_hashbits(zap_t *zap); +uint32_t zap_maxcd(zap_t *zap); +uint64_t zap_getflags(zap_t *zap); + +#define ZAP_HASH_IDX(hash, n) (((n) == 0) ? 0 : ((hash) >> (64 - (n)))) + +void fzap_byteswap(void *buf, size_t size); +int fzap_count(zap_t *zap, uint64_t *count); +int fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *normalization_conflictp); +void fzap_prefetch(zap_name_t *zn); +int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, + const void *val, void *tag, dmu_tx_t *tx); +int fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx); +int fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers); +int fzap_remove(zap_name_t *zn, dmu_tx_t *tx); +int fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za); +void fzap_get_stats(zap_t *zap, zap_stats_t *zs); +void zap_put_leaf(struct zap_leaf *l); + +int fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_IMPL_H */ diff --git a/include/sys/zap_leaf.h b/include/sys/zap_leaf.h new file mode 100644 index 000000000000..a3da1036a5ee --- /dev/null +++ b/include/sys/zap_leaf.h @@ -0,0 +1,255 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#ifndef _SYS_ZAP_LEAF_H +#define _SYS_ZAP_LEAF_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct zap; +struct zap_name; +struct zap_stats; + +#define ZAP_LEAF_MAGIC 0x2AB1EAF + +/* chunk size = 24 bytes */ +#define ZAP_LEAF_CHUNKSIZE 24 + +/* + * The amount of space available for chunks is: + * block size (1<<l->l_bs) - hash entry size (2) * number of hash + * entries - header space (2*chunksize) + */ +#define ZAP_LEAF_NUMCHUNKS_BS(bs) \ + (((1<<(bs)) - 2*ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \ + ZAP_LEAF_CHUNKSIZE - 2) + +#define ZAP_LEAF_NUMCHUNKS(l) (ZAP_LEAF_NUMCHUNKS_BS(((l)->l_bs))) + +#define ZAP_LEAF_NUMCHUNKS_DEF \ + (ZAP_LEAF_NUMCHUNKS_BS(fzap_default_block_shift)) + +/* + * The amount of space within the chunk available for the array is: + * chunk size - space for type (1) - space for next pointer (2) + */ +#define ZAP_LEAF_ARRAY_BYTES (ZAP_LEAF_CHUNKSIZE - 3) + +#define ZAP_LEAF_ARRAY_NCHUNKS(bytes) \ + (((bytes)+ZAP_LEAF_ARRAY_BYTES-1)/ZAP_LEAF_ARRAY_BYTES) + +/* + * Low water mark: when there are only this many chunks free, start + * growing the ptrtbl. Ideally, this should be larger than a + * "reasonably-sized" entry. 20 chunks is more than enough for the + * largest directory entry (MAXNAMELEN (256) byte name, 8-byte value), + * while still being only around 3% for 16k blocks. + */ +#define ZAP_LEAF_LOW_WATER (20) + +/* + * The leaf hash table has block size / 2^5 (32) number of entries, + * which should be more than enough for the maximum number of entries, + * which is less than block size / CHUNKSIZE (24) / minimum number of + * chunks per entry (3). + */ +#define ZAP_LEAF_HASH_SHIFT_BS(bs) ((bs) - 5) +#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs) (1 << ZAP_LEAF_HASH_SHIFT_BS(bs)) +#define ZAP_LEAF_HASH_SHIFT(l) (ZAP_LEAF_HASH_SHIFT_BS(((l)->l_bs))) +#define ZAP_LEAF_HASH_NUMENTRIES(l) (ZAP_LEAF_HASH_NUMENTRIES_BS(((l)->l_bs))) + +/* + * The chunks start immediately after the hash table. The end of the + * hash table is at l_hash + HASH_NUMENTRIES, which we simply cast to a + * chunk_t. + */ +#define ZAP_LEAF_CHUNK(l, idx) \ + ((zap_leaf_chunk_t *) \ + (zap_leaf_phys(l)->l_hash + ZAP_LEAF_HASH_NUMENTRIES(l)))[idx] +#define ZAP_LEAF_ENTRY(l, idx) (&ZAP_LEAF_CHUNK(l, idx).l_entry) + +typedef enum zap_chunk_type { + ZAP_CHUNK_FREE = 253, + ZAP_CHUNK_ENTRY = 252, + ZAP_CHUNK_ARRAY = 251, + ZAP_CHUNK_TYPE_MAX = 250 +} zap_chunk_type_t; + +#define ZLF_ENTRIES_CDSORTED (1<<0)
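The geometry macros above are plain arithmetic and can be sanity-checked in isolation. For the default fat-zap block (fzap_default_block_shift is 14 upstream, treated here as an assumption): a 16384-byte leaf has 512 two-byte hash entries, leaving (16384 - 1024) / 24 - 2 = 638 chunks. A standalone recomputation with the macros restated locally:

	#include <stdio.h>

	#define ZAP_LEAF_CHUNKSIZE		24
	#define ZAP_LEAF_HASH_SHIFT_BS(bs)	((bs) - 5)
	#define ZAP_LEAF_HASH_NUMENTRIES_BS(bs)	(1 << ZAP_LEAF_HASH_SHIFT_BS(bs))
	#define ZAP_LEAF_NUMCHUNKS_BS(bs) \
		(((1 << (bs)) - 2 * ZAP_LEAF_HASH_NUMENTRIES_BS(bs)) / \
		ZAP_LEAF_CHUNKSIZE - 2)

	int
	main(void)
	{
		int bs = 14;	/* fzap_default_block_shift, assumed */

		printf("%d-byte leaf: %d hash entries, %d chunks\n",
		    1 << bs, ZAP_LEAF_HASH_NUMENTRIES_BS(bs),
		    ZAP_LEAF_NUMCHUNKS_BS(bs));	/* 16384, 512, 638 */
		return (0);
	}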
+ +/* + * TAKE NOTE: + * If zap_leaf_phys_t is modified, zap_leaf_byteswap() must be modified. + */ +typedef struct zap_leaf_phys { + struct zap_leaf_header { + /* Public to ZAP */ + uint64_t lh_block_type; /* ZBT_LEAF */ + uint64_t lh_pad1; + uint64_t lh_prefix; /* hash prefix of this leaf */ + uint32_t lh_magic; /* ZAP_LEAF_MAGIC */ + uint16_t lh_nfree; /* number free chunks */ + uint16_t lh_nentries; /* number of entries */ + uint16_t lh_prefix_len; /* num bits used to id this */ + + /* Private to zap_leaf */ + uint16_t lh_freelist; /* chunk head of free list */ + uint8_t lh_flags; /* ZLF_* flags */ + uint8_t lh_pad2[11]; + } l_hdr; /* 2 24-byte chunks */ + + /* + * The header is followed by a hash table with + * ZAP_LEAF_HASH_NUMENTRIES(zap) entries. The hash table is + * followed by an array of ZAP_LEAF_NUMCHUNKS(zap) + * zap_leaf_chunk structures. These structures are accessed + * with the ZAP_LEAF_CHUNK() macro. + */ + + uint16_t l_hash[1]; +} zap_leaf_phys_t; + +typedef union zap_leaf_chunk { + struct zap_leaf_entry { + uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ + uint8_t le_value_intlen; /* size of value's ints */ + uint16_t le_next; /* next entry in hash chain */ + uint16_t le_name_chunk; /* first chunk of the name */ + uint16_t le_name_numints; /* ints in name (incl null) */ + uint16_t le_value_chunk; /* first chunk of the value */ + uint16_t le_value_numints; /* value length in ints */ + uint32_t le_cd; /* collision differentiator */ + uint64_t le_hash; /* hash value of the name */ + } l_entry; + struct zap_leaf_array { + uint8_t la_type; /* always ZAP_CHUNK_ARRAY */ + uint8_t la_array[ZAP_LEAF_ARRAY_BYTES]; + uint16_t la_next; /* next blk or CHAIN_END */ + } l_array; + struct zap_leaf_free { + uint8_t lf_type; /* always ZAP_CHUNK_FREE */ + uint8_t lf_pad[ZAP_LEAF_ARRAY_BYTES]; + uint16_t lf_next; /* next in free list, or CHAIN_END */ + } l_free; +} zap_leaf_chunk_t; + +typedef struct zap_leaf { + dmu_buf_user_t l_dbu; + krwlock_t l_rwlock; + uint64_t l_blkid; /* 1<<ZAP_BLOCK_SHIFT byte block off */ + int l_bs; /* block size shift */ + dmu_buf_t *l_dbuf; +} zap_leaf_t; + +static inline zap_leaf_phys_t * +zap_leaf_phys(zap_leaf_t *l) +{ + return (l->l_dbuf->db_data); +} + +typedef struct zap_entry_handle { + /* Set by zap_leaf and public to ZAP */ + uint64_t zeh_num_integers; + uint64_t zeh_hash; + uint32_t zeh_cd; + uint8_t zeh_integer_size; + + /* Private to zap_leaf */ + uint16_t zeh_fakechunk; + uint16_t *zeh_chunkp; + zap_leaf_t *zeh_leaf; +} zap_entry_handle_t; + +/* + * Return a handle to the named entry, or ENOENT if not found. The hash + * value must equal zap_hash(name). + */ +extern int zap_leaf_lookup(zap_leaf_t *l, + struct zap_name *zn, zap_entry_handle_t *zeh); + +/* + * Return a handle to the entry with this hash+cd, or the entry with the + * next closest hash+cd. + */ +extern int zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t hash, uint32_t cd, zap_entry_handle_t *zeh); + +/* + * Read the first num_integers in the attribute. Integer size + * conversion will be done without sign extension. Return EINVAL if + * integer_size is too small. Return EOVERFLOW if there are more than + * num_integers in the attribute. + */ +extern int zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf); + +extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); + +/* + * Replace the value of an existing entry. + * + * May fail if it runs out of space (ENOSPC). + */ +extern int zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf); + +/* + * Remove an entry. + */ +extern void zap_entry_remove(zap_entry_handle_t *zeh); + +/* + * Create an entry.
An equal entry must not exist, and this entry must + * belong in this leaf (according to its hash value). Fills in the + * entry handle on success. Returns 0 on success or ENOSPC on failure. + */ +extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); + +/* Determine whether there is another entry with the same normalized form. */ +extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, + struct zap_name *zn, const char *name, struct zap *zap); + +/* + * Other stuff. + */ + +extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); +extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); +extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); +extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, + struct zap_stats *zs); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZAP_LEAF_H */ diff --git a/include/sys/zcp.h b/include/sys/zcp.h new file mode 100644 index 000000000000..d7b1dfaa2ea5 --- /dev/null +++ b/include/sys/zcp.h @@ -0,0 +1,194 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZCP_H +#define _SYS_ZCP_H + +#include +#include + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZCP_RUN_INFO_KEY "runinfo" + +extern unsigned long zfs_lua_max_instrlimit; +extern unsigned long zfs_lua_max_memlimit; + +int zcp_argerror(lua_State *, int, const char *, ...); + +int zcp_eval(const char *, const char *, boolean_t, uint64_t, uint64_t, + nvpair_t *, nvlist_t *); + +int zcp_load_list_lib(lua_State *); + +int zcp_load_synctask_lib(lua_State *, boolean_t); + +typedef void (zcp_cleanup_t)(void *); +typedef struct zcp_cleanup_handler { + zcp_cleanup_t *zch_cleanup_func; + void *zch_cleanup_arg; + list_node_t zch_node; +} zcp_cleanup_handler_t; + +typedef struct zcp_alloc_arg { + boolean_t aa_must_succeed; + int64_t aa_alloc_remaining; + int64_t aa_alloc_limit; +} zcp_alloc_arg_t; + +typedef struct zcp_run_info { + dsl_pool_t *zri_pool; + + /* + * An estimate of the total amount of space consumed by all + * synctasks we have successfully performed so far in this + * channel program. Used to generate ENOSPC errors for syncfuncs. + */ + int zri_space_used; + + /* + * The credentials of the thread which originally invoked the channel + * program. Since channel programs are always invoked from the synctask + * thread they should always do permissions checks against this cred + * rather than the 'current' thread's. + */ + cred_t *zri_cred; + proc_t *zri_proc; + + /* + * The tx in which this channel program is running. + */ + dmu_tx_t *zri_tx; + + /* + * The maximum number of Lua instructions the channel program is allowed + * to execute. If it takes longer than this it will time out. A value + * of 0 indicates no instruction limit. + */ + uint64_t zri_maxinstrs; + + /* + * The number of Lua instructions the channel program has executed. 
+ */ + uint64_t zri_curinstrs; + + /* + * Boolean indicating whether or not the channel program exited + * because it timed out. + */ + boolean_t zri_timed_out; + + /* + * Channel program was canceled by user + */ + boolean_t zri_canceled; + + /* + * Boolean indicating whether or not we are running in syncing + * context. + */ + boolean_t zri_sync; + + /* + * List of currently registered cleanup handlers, which will be + * triggered in the event of a fatal error. + */ + list_t zri_cleanup_handlers; + + /* + * The Lua state context of our channel program. + */ + lua_State *zri_state; + + /* + * Lua memory allocator arguments. + */ + zcp_alloc_arg_t *zri_allocargs; + + /* + * Contains output values from zcp script or error string. + */ + nvlist_t *zri_outnvl; + + /* + * The keys of this nvlist are datasets which may be zvols and may need + * to have device minor nodes created. This information is passed from + * syncing context (where the zvol is created) to open context (where we + * create the minor nodes). + */ + nvlist_t *zri_new_zvols; + + /* + * The errno number returned to caller of zcp_eval(). + */ + int zri_result; +} zcp_run_info_t; + +zcp_run_info_t *zcp_run_info(lua_State *); +zcp_cleanup_handler_t *zcp_register_cleanup(lua_State *, zcp_cleanup_t, void *); +void zcp_deregister_cleanup(lua_State *, zcp_cleanup_handler_t *); +void zcp_cleanup(lua_State *); + +/* + * Argument parsing routines for channel program callback functions. + */ +typedef struct zcp_arg { + /* + * The name of this argument. For keyword arguments this is the name + * functions will use to set the argument. For positional arguments + * the name has no programmatic meaning, but will appear in error + * messages and help output. + */ + const char *za_name; + + /* + * The Lua type this argument should have (e.g. LUA_TSTRING, + * LUA_TBOOLEAN) see the lua_type() function documentation for a + * complete list. Calling a function with an argument that does + * not match the expected type will result in the program terminating. + */ + const int za_lua_type; +} zcp_arg_t; + +void zcp_parse_args(lua_State *, const char *, const zcp_arg_t *, + const zcp_arg_t *); +int zcp_nvlist_to_lua(lua_State *, nvlist_t *, char *, int); +int zcp_dataset_hold_error(lua_State *, dsl_pool_t *, const char *, int); +struct dsl_dataset *zcp_dataset_hold(lua_State *, dsl_pool_t *, + const char *, void *); + +typedef int (zcp_lib_func_t)(lua_State *); +typedef struct zcp_lib_info { + const char *name; + zcp_lib_func_t *func; + const zcp_arg_t pargs[4]; + const zcp_arg_t kwargs[2]; +} zcp_lib_info_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_H */ diff --git a/include/sys/zcp_global.h b/include/sys/zcp_global.h new file mode 100644 index 000000000000..0af5708dd3cb --- /dev/null +++ b/include/sys/zcp_global.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZCP_GLOBALS_H +#define _SYS_ZCP_GLOBALS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void zcp_load_globals(lua_State *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_GLOBALS_H */ diff --git a/include/sys/zcp_iter.h b/include/sys/zcp_iter.h new file mode 100644 index 000000000000..1d92d0c6d10c --- /dev/null +++ b/include/sys/zcp_iter.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZCP_LIST_H +#define _SYS_ZCP_LIST_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +void zcp_load_list_funcs(lua_State *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_LIST_H */ diff --git a/include/sys/zcp_prop.h b/include/sys/zcp_prop.h new file mode 100644 index 000000000000..97b17619565c --- /dev/null +++ b/include/sys/zcp_prop.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZCP_PROP_H +#define _SYS_ZCP_PROP_H + +#ifdef __cplusplus +extern "C" { +#endif + +int zcp_load_get_lib(lua_State *state); +boolean_t prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_PROP_H */ diff --git a/include/sys/zcp_set.h b/include/sys/zcp_set.h new file mode 100644 index 000000000000..b7428d6fc09e --- /dev/null +++ b/include/sys/zcp_set.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright 2019 Joyent, Inc. 
+ */ + +#ifndef _SYS_ZCP_SET_H +#define _SYS_ZCP_SET_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zcp_set_prop_arg { + lua_State *state; + const char *dsname; + const char *prop; + const char *val; +} zcp_set_prop_arg_t; + +int zcp_set_prop_check(void *arg, dmu_tx_t *tx); +void zcp_set_prop_sync(void *arg, dmu_tx_t *tx); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZCP_SET_H */ diff --git a/include/sys/zfeature.h b/include/sys/zfeature.h new file mode 100644 index 000000000000..5abde149a615 --- /dev/null +++ b/include/sys/zfeature.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZFEATURE_H +#define _SYS_ZFEATURE_H + +#include +#include +#include "zfeature_common.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define VALID_FEATURE_FID(fid) ((fid) >= 0 && (fid) < SPA_FEATURES) +#define VALID_FEATURE_OR_NONE(fid) ((fid) == SPA_FEATURE_NONE || \ + VALID_FEATURE_FID(fid)) + +struct spa; +struct dmu_tx; +struct objset; + +extern void spa_feature_create_zap_objects(struct spa *, struct dmu_tx *); +extern void spa_feature_enable(struct spa *, spa_feature_t, + struct dmu_tx *); +extern void spa_feature_incr(struct spa *, spa_feature_t, struct dmu_tx *); +extern void spa_feature_decr(struct spa *, spa_feature_t, struct dmu_tx *); +extern boolean_t spa_feature_is_enabled(struct spa *, spa_feature_t); +extern boolean_t spa_feature_is_active(struct spa *, spa_feature_t); +extern boolean_t spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, + uint64_t *txg); +extern uint64_t spa_feature_refcount(spa_t *, spa_feature_t, uint64_t); +extern boolean_t spa_features_check(spa_t *, boolean_t, nvlist_t *, nvlist_t *); + +/* + * These functions are only exported for zhack and zdb; normal callers should + * use the above interfaces. + */ +extern int feature_get_refcount(struct spa *, zfeature_info_t *, uint64_t *); +extern int feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res); +extern void feature_enable_sync(struct spa *, zfeature_info_t *, + struct dmu_tx *); +extern void feature_sync(struct spa *, zfeature_info_t *, uint64_t, + struct dmu_tx *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFEATURE_H */ diff --git a/include/sys/zfs_acl.h b/include/sys/zfs_acl.h new file mode 100644 index 000000000000..010686a9121b --- /dev/null +++ b/include/sys/zfs_acl.h @@ -0,0 +1,248 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
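/*
 * Editorial note: a brief sketch, not part of this import, of the
 * usual consumer pattern for the zfeature interfaces declared above,
 * run from syncing context. example_activate_feature is hypothetical;
 * SPA_FEATURE_LARGE_BLOCKS is one of the real spa_feature_t values
 * from zfeature_common.h.
 */
static void
example_activate_feature(spa_t *spa, dmu_tx_t *tx)
{
	/* Never write an on-disk format the pool has not enabled. */
	if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS))
		return;
	/* Taking a reference marks the feature active on disk. */
	if (!spa_feature_is_active(spa, SPA_FEATURE_LARGE_BLOCKS))
		spa_feature_incr(spa, SPA_FEATURE_LARGE_BLOCKS, tx);
}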
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_ACL_H +#define _SYS_FS_ZFS_ACL_H + +#ifdef _KERNEL +#include +#include +#include +#endif +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct znode_phys; + +#define ACE_SLOT_CNT 6 +#define ZFS_ACL_VERSION_INITIAL 0ULL +#define ZFS_ACL_VERSION_FUID 1ULL +#define ZFS_ACL_VERSION ZFS_ACL_VERSION_FUID + +/* + * ZFS ACLs (Access Control Lists) are stored in various forms. + * + * Files created with ACL version ZFS_ACL_VERSION_INITIAL + * will all be created with fixed length ACEs of type + * zfs_oldace_t. + * + * Files with ACL version ZFS_ACL_VERSION_FUID will be created + * with various sized ACEs. The abstraction entries will utilize + * zfs_ace_hdr_t, normal user/group entries will use zfs_ace_t + * and some specialized CIFS ACEs will use zfs_object_ace_t. + */ + +/* + * All ACEs have a common hdr. For + * owner@, group@, and everyone@ this is all + * that's needed. + */ +typedef struct zfs_ace_hdr { + uint16_t z_type; + uint16_t z_flags; + uint32_t z_access_mask; +} zfs_ace_hdr_t; + +typedef zfs_ace_hdr_t zfs_ace_abstract_t; + +/* + * Standard ACE + */ +typedef struct zfs_ace { + zfs_ace_hdr_t z_hdr; + uint64_t z_fuid; +} zfs_ace_t; + +/* + * The following type only applies to ACE_ACCESS_ALLOWED|DENIED_OBJECT_ACE_TYPE + * and will only be set/retrieved in a CIFS context. + */ + +typedef struct zfs_object_ace { + zfs_ace_t z_ace; + uint8_t z_object_type[16]; /* object type */ + uint8_t z_inherit_type[16]; /* inherited object type */ +} zfs_object_ace_t; + +typedef struct zfs_oldace { + uint32_t z_fuid; /* "who" */ + uint32_t z_access_mask; /* access mask */ + uint16_t z_flags; /* flags, i.e inheritance */ + uint16_t z_type; /* type of entry allow/deny */ +} zfs_oldace_t; + +typedef struct zfs_acl_phys_v0 { + uint64_t z_acl_extern_obj; /* ext acl pieces */ + uint32_t z_acl_count; /* Number of ACEs */ + uint16_t z_acl_version; /* acl version */ + uint16_t z_acl_pad; /* pad */ + zfs_oldace_t z_ace_data[ACE_SLOT_CNT]; /* 6 standard ACEs */ +} zfs_acl_phys_v0_t; + +#define ZFS_ACE_SPACE (sizeof (zfs_oldace_t) * ACE_SLOT_CNT) + +/* + * Size of ACL count is always 2 bytes. 
+ * Necessary for dealing with both V0 ACL and V1 ACL layouts
+ */
+#define ZFS_ACL_COUNT_SIZE (sizeof (uint16_t))
+
+typedef struct zfs_acl_phys {
+ uint64_t z_acl_extern_obj; /* ext acl pieces */
+ uint32_t z_acl_size; /* Number of bytes in ACL */
+ uint16_t z_acl_version; /* acl version */
+ uint16_t z_acl_count; /* ace count */
+ uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */
+} zfs_acl_phys_t;
+
+typedef struct acl_ops {
+ uint32_t (*ace_mask_get) (void *acep); /* get access mask */
+ void (*ace_mask_set) (void *acep,
+ uint32_t mask); /* set access mask */
+ uint16_t (*ace_flags_get) (void *acep); /* get flags */
+ void (*ace_flags_set) (void *acep,
+ uint16_t flags); /* set flags */
+ uint16_t (*ace_type_get)(void *acep); /* get type */
+ void (*ace_type_set)(void *acep,
+ uint16_t type); /* set type */
+ uint64_t (*ace_who_get)(void *acep); /* get who/fuid */
+ void (*ace_who_set)(void *acep,
+ uint64_t who); /* set who/fuid */
+ size_t (*ace_size)(void *acep); /* how big is this ace */
+ size_t (*ace_abstract_size)(void); /* sizeof abstract entry */
+ int (*ace_mask_off)(void); /* off of access mask in ace */
+ /* ptr to data if any */
+ int (*ace_data)(void *acep, void **datap);
+} acl_ops_t;
+
+/*
+ * A zfs_acl_t structure is composed of a list of zfs_acl_node_t's.
+ * Each node will have one or more ACEs associated with it. You will
+ * only have multiple nodes during a chmod operation. Normally only
+ * one node is required.
+ */
+typedef struct zfs_acl_node {
+ list_node_t z_next; /* Next chunk of ACEs */
+ void *z_acldata; /* pointer into actual ACE(s) */
+ void *z_allocdata; /* pointer to kmem allocated memory */
+ size_t z_allocsize; /* Size of blob in bytes */
+ size_t z_size; /* length of ACL data */
+ uint64_t z_ace_count; /* number of ACEs in this acl node */
+ int z_ace_idx; /* ace iterator positioned on */
+} zfs_acl_node_t;
+
+typedef struct zfs_acl {
+ uint64_t z_acl_count; /* Number of ACEs */
+ size_t z_acl_bytes; /* Number of bytes in ACL */
+ uint_t z_version; /* version of ACL */
+ void *z_next_ace; /* pointer to next ACE */
+ uint64_t z_hints; /* ACL hints (ZFS_INHERIT_ACE ...) */
+ zfs_acl_node_t *z_curr_node; /* current node iterator is handling */
+ list_t z_acl; /* chunks of ACE data */
+ acl_ops_t *z_ops; /* ACL operations */
+} zfs_acl_t;
+
+typedef struct acl_locator_cb {
+ zfs_acl_t *cb_aclp;
+ zfs_acl_node_t *cb_acl_node;
+} zfs_acl_locator_cb_t;
+
+#define ACL_DATA_ALLOCED 0x1
+#define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt))
+
+struct zfs_fuid_info;
+
+typedef struct zfs_acl_ids {
+ uint64_t z_fuid; /* file owner fuid */
+ uint64_t z_fgid; /* file group owner fuid */
+ uint64_t z_mode; /* mode to set on create */
+ zfs_acl_t *z_aclp; /* ACL to create with file */
+ struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */
+} zfs_acl_ids_t;
+
+/*
+ * Property values for acl_mode and acl_inherit.
+ *
+ * acl_mode can take discard, noallow, groupmask, and passthrough,
+ * whereas acl_inherit has secure instead of groupmask.
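/*
 * Editorial note: a hedged sketch, not part of this import, of what
 * consuming the acl_ops_t vtable above looks like when walking the
 * ACEs of one zfs_acl_node_t. example_count_denies and the
 * byte-stepping loop are hypothetical (the kernel has its own
 * iterator), but the per-ACE accessors and ace_size() stepping are
 * the ones declared above; ACE_ACCESS_DENIED_ACE_TYPE comes from
 * sys/acl.h.
 */
static uint64_t
example_count_denies(zfs_acl_t *aclp, zfs_acl_node_t *node)
{
	uint64_t denies = 0;
	void *acep = node->z_acldata;

	for (uint64_t i = 0; i < node->z_ace_count; i++) {
		if (aclp->z_ops->ace_type_get(acep) ==
		    ACE_ACCESS_DENIED_ACE_TYPE)
			denies++;
		/* Advance by this ACE's (possibly variable) size. */
		acep = (char *)acep + aclp->z_ops->ace_size(acep);
	}
	return (denies);
}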
+ */ + +#define ZFS_ACL_DISCARD 0 +#define ZFS_ACL_NOALLOW 1 +#define ZFS_ACL_GROUPMASK 2 +#define ZFS_ACL_PASSTHROUGH 3 +#define ZFS_ACL_RESTRICTED 4 +#define ZFS_ACL_PASSTHROUGH_X 5 + +struct znode; +struct zfsvfs; + +#ifdef _KERNEL +int zfs_acl_ids_create(struct znode *, int, vattr_t *, + cred_t *, vsecattr_t *, zfs_acl_ids_t *); +void zfs_acl_ids_free(zfs_acl_ids_t *); +boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *, uint64_t); +int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); +void zfs_acl_rele(void *); +void zfs_oldace_byteswap(ace_t *, int); +void zfs_ace_byteswap(void *, size_t, boolean_t); +extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); +extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +int zfs_fastaccesschk_execute(struct znode *, cred_t *); +extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); +extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); +extern int zfs_acl_access(struct znode *, int, cred_t *); +int zfs_acl_chmod_setattr(struct znode *, zfs_acl_t **, uint64_t); +int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); +int zfs_zaccess_rename(struct znode *, struct znode *, + struct znode *, struct znode *, cred_t *cr); +void zfs_acl_free(zfs_acl_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, umode_t, vsecattr_t *, cred_t *, + struct zfs_fuid_info **, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); +uint64_t zfs_external_acl(struct znode *); +int zfs_znode_acl_version(struct znode *); +int zfs_acl_size(struct znode *, int *); +zfs_acl_t *zfs_acl_alloc(int); +zfs_acl_node_t *zfs_acl_node_alloc(size_t); +void zfs_acl_xform(struct znode *, zfs_acl_t *, cred_t *); +void zfs_acl_data_locator(void **, uint32_t *, uint32_t, boolean_t, void *); +uint64_t zfs_mode_compute(uint64_t, zfs_acl_t *, + uint64_t *, uint64_t, uint64_t); +int zfs_acl_node_read(struct znode *, boolean_t, zfs_acl_t **, boolean_t); +int zfs_acl_chown_setattr(struct znode *); + +#endif + +#ifdef __cplusplus +} +#endif +#endif /* _SYS_FS_ZFS_ACL_H */ diff --git a/include/sys/zfs_context.h b/include/sys/zfs_context.h new file mode 100644 index 000000000000..16df302c8f31 --- /dev/null +++ b/include/sys/zfs_context.h @@ -0,0 +1,763 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ */ + +#ifndef _SYS_ZFS_CONTEXT_H +#define _SYS_ZFS_CONTEXT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef __KERNEL__ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#else /* _KERNEL */ + +#define _SYS_MUTEX_H +#define _SYS_RWLOCK_H +#define _SYS_CONDVAR_H +#define _SYS_VNODE_H +#define _SYS_VFS_H +#define _SYS_SUNDDI_H +#define _SYS_CALLB_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * Stack + */ + +#define noinline __attribute__((noinline)) +#define likely(x) __builtin_expect((x), 1) +#define unlikely(x) __builtin_expect((x), 0) + +/* + * Debugging + */ + +/* + * Note that we are not using the debugging levels. + */ + +#define CE_CONT 0 /* continuation */ +#define CE_NOTE 1 /* notice */ +#define CE_WARN 2 /* warning */ +#define CE_PANIC 3 /* panic */ +#define CE_IGNORE 4 /* print nothing */ + +/* + * ZFS debugging + */ + +extern void dprintf_setup(int *argc, char **argv); + +extern void cmn_err(int, const char *, ...); +extern void vcmn_err(int, const char *, va_list); +extern void panic(const char *, ...) __NORETURN; +extern void vpanic(const char *, va_list) __NORETURN; + +#define fm_panic panic + +extern int aok; + +/* + * DTrace SDT probes have different signatures in userland than they do in + * the kernel. If they're being used in kernel code, re-define them out of + * existence for their counterparts in libzpool. + * + * Here's an example of how to use the set-error probes in userland: + * zfs$target:::set-error /arg0 == EBUSY/ {stack();} + * + * Here's an example of how to use DTRACE_PROBE probes in userland: + * If there is a probe declared as follows: + * DTRACE_PROBE2(zfs__probe_name, uint64_t, blkid, dnode_t *, dn); + * Then you can use it as follows: + * zfs$target:::probe2 /copyinstr(arg0) == "zfs__probe_name"/ + * {printf("%u %p\n", arg1, arg2);} + */ + +#ifdef DTRACE_PROBE +#undef DTRACE_PROBE +#endif /* DTRACE_PROBE */ +#define DTRACE_PROBE(a) + +#ifdef DTRACE_PROBE1 +#undef DTRACE_PROBE1 +#endif /* DTRACE_PROBE1 */ +#define DTRACE_PROBE1(a, b, c) + +#ifdef DTRACE_PROBE2 +#undef DTRACE_PROBE2 +#endif /* DTRACE_PROBE2 */ +#define DTRACE_PROBE2(a, b, c, d, e) + +#ifdef DTRACE_PROBE3 +#undef DTRACE_PROBE3 +#endif /* DTRACE_PROBE3 */ +#define DTRACE_PROBE3(a, b, c, d, e, f, g) + +#ifdef DTRACE_PROBE4 +#undef DTRACE_PROBE4 +#endif /* DTRACE_PROBE4 */ +#define DTRACE_PROBE4(a, b, c, d, e, f, g, h, i) + +/* + * Tunables. + */ +typedef struct zfs_kernel_param { + const char *name; /* unused stub */ +} zfs_kernel_param_t; + +#define ZFS_MODULE_PARAM(scope_prefix, name_prefix, name, type, perm, desc) +#define ZFS_MODULE_PARAM_ARGS void +#define ZFS_MODULE_PARAM_CALL(scope_prefix, name_prefix, name, setfunc, \ + getfunc, perm, desc) + +/* + * Threads. 
+ */ +typedef pthread_t kthread_t; + +#define TS_RUN 0x00000002 +#define TS_JOINABLE 0x00000004 + +#define curthread ((void *)(uintptr_t)pthread_self()) +#define kpreempt(x) yield() +#define getcomm() "unknown" + +#define thread_create_named(name, stk, stksize, func, arg, len, \ + pp, state, pri) \ + zk_thread_create(func, arg, stksize, state) +#define thread_create(stk, stksize, func, arg, len, pp, state, pri) \ + zk_thread_create(func, arg, stksize, state) +#define thread_exit() pthread_exit(NULL) +#define thread_join(t) pthread_join((pthread_t)(t), NULL) + +#define newproc(f, a, cid, pri, ctp, pid) (ENOSYS) + +/* in libzpool, p0 exists only to have its address taken */ +typedef struct proc { + uintptr_t this_is_never_used_dont_dereference_it; +} proc_t; + +extern struct proc p0; +#define curproc (&p0) + +#define PS_NONE -1 + +extern kthread_t *zk_thread_create(void (*func)(void *), void *arg, + size_t stksize, int state); + +#define issig(why) (FALSE) +#define ISSIG(thr, why) (FALSE) + +#define kpreempt_disable() ((void)0) +#define kpreempt_enable() ((void)0) +#define cond_resched() sched_yield() + +/* + * Mutexes + */ +typedef struct kmutex { + pthread_mutex_t m_lock; + pthread_t m_owner; +} kmutex_t; + +#define MUTEX_DEFAULT 0 +#define MUTEX_NOLOCKDEP MUTEX_DEFAULT +#define MUTEX_HELD(mp) pthread_equal((mp)->m_owner, pthread_self()) +#define MUTEX_NOT_HELD(mp) !MUTEX_HELD(mp) + +extern void mutex_init(kmutex_t *mp, char *name, int type, void *cookie); +extern void mutex_destroy(kmutex_t *mp); +extern void mutex_enter(kmutex_t *mp); +extern void mutex_exit(kmutex_t *mp); +extern int mutex_tryenter(kmutex_t *mp); + +#define NESTED_SINGLE 1 +#define mutex_enter_nested(mp, class) mutex_enter(mp) +/* + * RW locks + */ +typedef struct krwlock { + pthread_rwlock_t rw_lock; + pthread_t rw_owner; + uint_t rw_readers; +} krwlock_t; + +typedef int krw_t; + +#define RW_READER 0 +#define RW_WRITER 1 +#define RW_DEFAULT RW_READER +#define RW_NOLOCKDEP RW_READER + +#define RW_READ_HELD(rw) ((rw)->rw_readers > 0) +#define RW_WRITE_HELD(rw) pthread_equal((rw)->rw_owner, pthread_self()) +#define RW_LOCK_HELD(rw) (RW_READ_HELD(rw) || RW_WRITE_HELD(rw)) + +extern void rw_init(krwlock_t *rwlp, char *name, int type, void *arg); +extern void rw_destroy(krwlock_t *rwlp); +extern void rw_enter(krwlock_t *rwlp, krw_t rw); +extern int rw_tryenter(krwlock_t *rwlp, krw_t rw); +extern int rw_tryupgrade(krwlock_t *rwlp); +extern void rw_exit(krwlock_t *rwlp); +#define rw_downgrade(rwlp) do { } while (0) + +/* + * Credentials + */ +extern uid_t crgetuid(cred_t *cr); +extern uid_t crgetruid(cred_t *cr); +extern gid_t crgetgid(cred_t *cr); +extern int crgetngroups(cred_t *cr); +extern gid_t *crgetgroups(cred_t *cr); + +/* + * Condition variables + */ +typedef pthread_cond_t kcondvar_t; + +#define CV_DEFAULT 0 +#define CALLOUT_FLAG_ABSOLUTE 0x2 + +extern void cv_init(kcondvar_t *cv, char *name, int type, void *arg); +extern void cv_destroy(kcondvar_t *cv); +extern void cv_wait(kcondvar_t *cv, kmutex_t *mp); +extern int cv_wait_sig(kcondvar_t *cv, kmutex_t *mp); +extern int cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime); +extern int cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag); +extern void cv_signal(kcondvar_t *cv); +extern void cv_broadcast(kcondvar_t *cv); + +#define cv_timedwait_io(cv, mp, at) cv_timedwait(cv, mp, at) +#define cv_timedwait_sig(cv, mp, at) cv_timedwait(cv, mp, at) +#define cv_wait_io(cv, mp) cv_wait(cv, mp) +#define cv_wait_io_sig(cv, mp) 
cv_wait_sig(cv, mp) +#define cv_timedwait_sig_hires(cv, mp, t, r, f) \ + cv_timedwait_hires(cv, mp, t, r, f) + +/* + * Thread-specific data + */ +#define tsd_get(k) pthread_getspecific(k) +#define tsd_set(k, v) pthread_setspecific(k, v) +#define tsd_create(kp, d) pthread_key_create((pthread_key_t *)kp, d) +#define tsd_destroy(kp) /* nothing */ +#ifdef __FreeBSD__ +typedef off_t loff_t; +#endif + +/* + * kstat creation, installation and deletion + */ +extern kstat_t *kstat_create(const char *, int, + const char *, const char *, uchar_t, ulong_t, uchar_t); +extern void kstat_install(kstat_t *); +extern void kstat_delete(kstat_t *); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); +extern void kstat_waitq_to_runq(kstat_io_t *); +extern void kstat_runq_back_to_waitq(kstat_io_t *); +extern void kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)); + +/* + * procfs list manipulation + */ + +typedef struct procfs_list { + void *pl_private; + kmutex_t pl_lock; + list_t pl_list; + uint64_t pl_next_id; + size_t pl_node_offset; +} procfs_list_t; + +#ifndef __cplusplus +struct seq_file { }; +void seq_printf(struct seq_file *m, const char *fmt, ...); + +typedef struct procfs_list_node { + list_node_t pln_link; + uint64_t pln_id; +} procfs_list_node_t; + +void procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off); +void procfs_list_uninstall(procfs_list_t *procfs_list); +void procfs_list_destroy(procfs_list_t *procfs_list); +void procfs_list_add(procfs_list_t *procfs_list, void *p); +#endif + +/* + * Kernel memory + */ +#define KM_SLEEP UMEM_NOFAIL +#define KM_PUSHPAGE KM_SLEEP +#define KM_NOSLEEP UMEM_DEFAULT +#define KM_NORMALPRI 0 /* not needed with UMEM_DEFAULT */ +#define KMC_NODEBUG UMC_NODEBUG +#define KMC_KVMEM 0x0 +#define kmem_alloc(_s, _f) umem_alloc(_s, _f) +#define kmem_zalloc(_s, _f) umem_zalloc(_s, _f) +#define kmem_free(_b, _s) umem_free(_b, _s) +#define vmem_alloc(_s, _f) kmem_alloc(_s, _f) +#define vmem_zalloc(_s, _f) kmem_zalloc(_s, _f) +#define vmem_free(_b, _s) kmem_free(_b, _s) +#define kmem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) \ + umem_cache_create(_a, _b, _c, _d, _e, _f, _g, _h, _i) +#define kmem_cache_destroy(_c) umem_cache_destroy(_c) +#define kmem_cache_alloc(_c, _f) umem_cache_alloc(_c, _f) +#define kmem_cache_free(_c, _b) umem_cache_free(_c, _b) +#define kmem_debugging() 0 +#define kmem_cache_reap_now(_c) umem_cache_reap_now(_c); +#define kmem_cache_set_move(_c, _cb) /* nothing */ +#define POINTER_INVALIDATE(_pp) /* nothing */ +#define POINTER_IS_VALID(_p) 0 + +typedef umem_cache_t kmem_cache_t; + +typedef enum kmem_cbrc { + KMEM_CBRC_YES, + KMEM_CBRC_NO, + KMEM_CBRC_LATER, + KMEM_CBRC_DONT_NEED, + KMEM_CBRC_DONT_KNOW +} kmem_cbrc_t; + +/* + * Task queues + */ + +#define TASKQ_NAMELEN 31 + +typedef uintptr_t taskqid_t; +typedef void (task_func_t)(void *); + +typedef struct taskq_ent { + struct taskq_ent *tqent_next; + struct taskq_ent *tqent_prev; + task_func_t *tqent_func; + void *tqent_arg; + uintptr_t tqent_flags; +} taskq_ent_t; + +typedef struct taskq { + char tq_name[TASKQ_NAMELEN + 1]; + kmutex_t tq_lock; 
+ krwlock_t tq_threadlock; + kcondvar_t tq_dispatch_cv; + kcondvar_t tq_wait_cv; + kthread_t **tq_threadlist; + int tq_flags; + int tq_active; + int tq_nthreads; + int tq_nalloc; + int tq_minalloc; + int tq_maxalloc; + kcondvar_t tq_maxalloc_cv; + int tq_maxalloc_wait; + taskq_ent_t *tq_freelist; + taskq_ent_t tq_task; +} taskq_t; + +#define TQENT_FLAG_PREALLOC 0x1 /* taskq_dispatch_ent used */ + +#define TASKQ_PREPOPULATE 0x0001 +#define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ +#define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* Scale # threads by # cpus */ +#define TASKQ_DC_BATCH 0x0010 /* Mark threads as batch */ + +#define TQ_SLEEP KM_SLEEP /* Can block for memory */ +#define TQ_NOSLEEP KM_NOSLEEP /* cannot block for memory; may fail */ +#define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ +#define TQ_FRONT 0x08 /* Queue in front */ + +#define TASKQID_INVALID ((taskqid_t)0) + +extern taskq_t *system_taskq; +extern taskq_t *system_delay_taskq; + +extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); +#define taskq_create_proc(a, b, c, d, e, p, f) \ + (taskq_create(a, b, c, d, e, f)) +#define taskq_create_sysdc(a, b, d, e, p, dc, f) \ + (taskq_create(a, b, maxclsyspri, d, e, f)) +extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); +extern taskqid_t taskq_dispatch_delay(taskq_t *, task_func_t, void *, uint_t, + clock_t); +extern void taskq_dispatch_ent(taskq_t *, task_func_t, void *, uint_t, + taskq_ent_t *); +extern int taskq_empty_ent(taskq_ent_t *); +extern void taskq_init_ent(taskq_ent_t *); +extern void taskq_destroy(taskq_t *); +extern void taskq_wait(taskq_t *); +extern void taskq_wait_id(taskq_t *, taskqid_t); +extern void taskq_wait_outstanding(taskq_t *, taskqid_t); +extern int taskq_member(taskq_t *, kthread_t *); +extern taskq_t *taskq_of_curthread(void); +extern int taskq_cancel_id(taskq_t *, taskqid_t); +extern void system_taskq_init(void); +extern void system_taskq_fini(void); + +#define XVA_MAPSIZE 3 +#define XVA_MAGIC 0x78766174 + +extern char *vn_dumpdir; +#define AV_SCANSTAMP_SZ 32 /* length of anti-virus scanstamp */ + +typedef struct xoptattr { + inode_timespec_t xoa_createtime; /* Create time of file */ + uint8_t xoa_archive; + uint8_t xoa_system; + uint8_t xoa_readonly; + uint8_t xoa_hidden; + uint8_t xoa_nounlink; + uint8_t xoa_immutable; + uint8_t xoa_appendonly; + uint8_t xoa_nodump; + uint8_t xoa_settable; + uint8_t xoa_opaque; + uint8_t xoa_av_quarantined; + uint8_t xoa_av_modified; + uint8_t xoa_av_scanstamp[AV_SCANSTAMP_SZ]; + uint8_t xoa_reparse; + uint8_t xoa_offline; + uint8_t xoa_sparse; +} xoptattr_t; + +typedef struct vattr { + uint_t va_mask; /* bit-mask of attributes */ + u_offset_t va_size; /* file size in bytes */ +} vattr_t; + + +typedef struct xvattr { + vattr_t xva_vattr; /* Embedded vattr structure */ + uint32_t xva_magic; /* Magic Number */ + uint32_t xva_mapsize; /* Size of attr bitmap (32-bit words) */ + uint32_t *xva_rtnattrmapp; /* Ptr to xva_rtnattrmap[] */ + uint32_t xva_reqattrmap[XVA_MAPSIZE]; /* Requested attrs */ + uint32_t xva_rtnattrmap[XVA_MAPSIZE]; /* Returned attrs */ + xoptattr_t xva_xoptattrs; /* Optional attributes */ +} xvattr_t; + +typedef struct vsecattr { + uint_t vsa_mask; /* See below */ + int vsa_aclcnt; /* ACL entry count */ + void *vsa_aclentp; /* pointer to ACL entries */ + int vsa_dfaclcnt; /* default ACL entry count */ + void *vsa_dfaclentp; /* pointer to default ACL entries */ + size_t vsa_aclentsz; 
/* ACE size in bytes of vsa_aclentp */ +} vsecattr_t; + +#define AT_MODE 0x00002 +#define AT_UID 0x00004 +#define AT_GID 0x00008 +#define AT_FSID 0x00010 +#define AT_NODEID 0x00020 +#define AT_NLINK 0x00040 +#define AT_SIZE 0x00080 +#define AT_ATIME 0x00100 +#define AT_MTIME 0x00200 +#define AT_CTIME 0x00400 +#define AT_RDEV 0x00800 +#define AT_BLKSIZE 0x01000 +#define AT_NBLOCKS 0x02000 +#define AT_SEQ 0x08000 +#define AT_XVATTR 0x10000 + +#define CRCREAT 0 + +#define F_FREESP 11 +#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ + +/* + * Random stuff + */ +#define ddi_get_lbolt() (gethrtime() >> 23) +#define ddi_get_lbolt64() (gethrtime() >> 23) +#define hz 119 /* frequency when using gethrtime() >> 23 for lbolt */ + +#define ddi_time_before(a, b) (a < b) +#define ddi_time_after(a, b) ddi_time_before(b, a) +#define ddi_time_before_eq(a, b) (!ddi_time_after(a, b)) +#define ddi_time_after_eq(a, b) ddi_time_before_eq(b, a) + +#define ddi_time_before64(a, b) (a < b) +#define ddi_time_after64(a, b) ddi_time_before64(b, a) +#define ddi_time_before_eq64(a, b) (!ddi_time_after64(a, b)) +#define ddi_time_after_eq64(a, b) ddi_time_before_eq64(b, a) + +extern void delay(clock_t ticks); + +#define SEC_TO_TICK(sec) ((sec) * hz) +#define MSEC_TO_TICK(msec) ((msec) / (MILLISEC / hz)) +#define USEC_TO_TICK(usec) ((usec) / (MICROSEC / hz)) +#define NSEC_TO_TICK(usec) ((usec) / (NANOSEC / hz)) + +#define max_ncpus 64 +#define boot_ncpus (sysconf(_SC_NPROCESSORS_ONLN)) + +/* + * Process priorities as defined by setpriority(2) and getpriority(2). + */ +#define minclsyspri 19 +#define maxclsyspri -20 +#define defclsyspri 0 + +#define CPU_SEQID ((uintptr_t)pthread_self() & (max_ncpus - 1)) + +#define kcred NULL +#define CRED() NULL + +#define ptob(x) ((x) * PAGESIZE) + +#define NN_DIVISOR_1000 (1U << 0) +#define NN_NUMBUF_SZ (6) + +extern uint64_t physmem; +extern char *random_path; +extern char *urandom_path; + +extern int highbit64(uint64_t i); +extern int lowbit64(uint64_t i); +extern int random_get_bytes(uint8_t *ptr, size_t len); +extern int random_get_pseudo_bytes(uint8_t *ptr, size_t len); + +extern void kernel_init(int mode); +extern void kernel_fini(void); +extern void random_init(void); +extern void random_fini(void); + +struct spa; +extern void show_pool_stats(struct spa *); +extern int set_global_var(char *arg); + +typedef struct callb_cpr { + kmutex_t *cc_lockp; +} callb_cpr_t; + +#define CALLB_CPR_INIT(cp, lockp, func, name) { \ + (cp)->cc_lockp = lockp; \ +} + +#define CALLB_CPR_SAFE_BEGIN(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ +} + +#define CALLB_CPR_SAFE_END(cp, lockp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ +} + +#define CALLB_CPR_EXIT(cp) { \ + ASSERT(MUTEX_HELD((cp)->cc_lockp)); \ + mutex_exit((cp)->cc_lockp); \ +} + +#define zone_dataset_visible(x, y) (1) +#define INGLOBALZONE(z) (1) +extern uint32_t zone_get_hostid(void *zonep); + +extern char *kmem_vasprintf(const char *fmt, va_list adx); +extern char *kmem_asprintf(const char *fmt, ...); +#define kmem_strfree(str) kmem_free((str), strlen(str) + 1) +#define kmem_strdup(s) strdup(s) + +/* + * Hostname information + */ +extern char hw_serial[]; /* for userland-emulated hostid access */ +extern int ddi_strtoul(const char *str, char **nptr, int base, + unsigned long *result); + +extern int ddi_strtoull(const char *str, char **nptr, int base, + u_longlong_t *result); + +typedef struct utsname utsname_t; +extern utsname_t *utsname(void); + +/* ZFS Boot Related stuff. 
*/ + +struct _buf { + intptr_t _fd; +}; + +struct bootstat { + uint64_t st_size; +}; + +typedef struct ace_object { + uid_t a_who; + uint32_t a_access_mask; + uint16_t a_flags; + uint16_t a_type; + uint8_t a_obj_type[16]; + uint8_t a_inherit_obj_type[16]; +} ace_object_t; + + +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 + +extern int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr); +extern int zfs_secpolicy_rename_perms(const char *from, const char *to, + cred_t *cr); +extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); +extern int secpolicy_zfs(const cred_t *cr); +extern int secpolicy_zfs_proc(const cred_t *cr, proc_t *proc); +extern zoneid_t getzoneid(void); + +/* SID stuff */ +typedef struct ksiddomain { + uint_t kd_ref; + uint_t kd_len; + char *kd_name; +} ksiddomain_t; + +ksiddomain_t *ksid_lookupdomain(const char *); +void ksiddomain_rele(ksiddomain_t *); + +#define DDI_SLEEP KM_SLEEP +#define ddi_log_sysevent(_a, _b, _c, _d, _e, _f, _g) \ + sysevent_post_event(_c, _d, _b, "libzpool", _e, _f) + +#define zfs_sleep_until(wakeup) \ + do { \ + hrtime_t delta = wakeup - gethrtime(); \ + struct timespec ts; \ + ts.tv_sec = delta / NANOSEC; \ + ts.tv_nsec = delta % NANOSEC; \ + (void) nanosleep(&ts, NULL); \ + } while (0) + +typedef int fstrans_cookie_t; + +extern fstrans_cookie_t spl_fstrans_mark(void); +extern void spl_fstrans_unmark(fstrans_cookie_t); +extern int __spl_pf_fstrans_check(void); +extern int kmem_cache_reap_active(void); + +#define ____cacheline_aligned + +/* + * Kernel modules + */ +#define __init +#define __exit + +#endif /* _KERNEL */ + +#ifdef __cplusplus +}; +#endif + +#endif /* _SYS_ZFS_CONTEXT_H */ diff --git a/include/sys/zfs_debug.h b/include/sys/zfs_debug.h new file mode 100644 index 000000000000..89587876f947 --- /dev/null +++ b/include/sys/zfs_debug.h @@ -0,0 +1,111 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZFS_DEBUG_H +#define _SYS_ZFS_DEBUG_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef TRUE +#define TRUE 1 +#endif + +#ifndef FALSE +#define FALSE 0 +#endif + +extern int zfs_flags; +extern int zfs_recover; +extern int zfs_free_leak_on_eio; +extern int zfs_dbgmsg_enable; + +#define ZFS_DEBUG_DPRINTF (1 << 0) +#define ZFS_DEBUG_DBUF_VERIFY (1 << 1) +#define ZFS_DEBUG_DNODE_VERIFY (1 << 2) +#define ZFS_DEBUG_SNAPNAMES (1 << 3) +#define ZFS_DEBUG_MODIFY (1 << 4) +/* 1<<5 was previously used, try not to reuse */ +#define ZFS_DEBUG_ZIO_FREE (1 << 6) +#define ZFS_DEBUG_HISTOGRAM_VERIFY (1 << 7) +#define ZFS_DEBUG_METASLAB_VERIFY (1 << 8) +#define ZFS_DEBUG_SET_ERROR (1 << 9) +#define ZFS_DEBUG_INDIRECT_REMAP (1 << 10) +#define ZFS_DEBUG_TRIM (1 << 11) +#define ZFS_DEBUG_LOG_SPACEMAP (1 << 12) + +extern void __set_error(const char *file, const char *func, int line, int err); +extern void __zfs_dbgmsg(char *buf); +extern void __dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...); + +/* + * Some general principles for using zfs_dbgmsg(): + * 1. We don't want to pollute the log with typically-irrelevant messages, + * so don't print too many messages in the "normal" code path - O(1) + * per txg. + * 2. We want to know for sure what happened, so make the message specific + * (e.g. *which* thing am I operating on). + * 3. Do print a message when something unusual or unexpected happens + * (e.g. error cases). + * 4. Print a message when making user-initiated on-disk changes. + * + * Note that besides principle 1, another reason that we don't want to + * use zfs_dbgmsg in high-frequency routines is the potential impact + * that it can have on performance. + */ +#define zfs_dbgmsg(...) \ + if (zfs_dbgmsg_enable) \ + __dprintf(B_FALSE, __FILE__, __func__, __LINE__, __VA_ARGS__) + +#ifdef ZFS_DEBUG +/* + * To enable this: + * + * $ echo 1 >/sys/module/zfs/parameters/zfs_flags + */ +#define dprintf(...) \ + if (zfs_flags & ZFS_DEBUG_DPRINTF) \ + __dprintf(B_TRUE, __FILE__, __func__, __LINE__, __VA_ARGS__) +#else +#define dprintf(...) ((void)0) +#endif /* ZFS_DEBUG */ + +extern void zfs_panic_recover(const char *fmt, ...); + +extern void zfs_dbgmsg_init(void); +extern void zfs_dbgmsg_fini(void); + +#ifndef _KERNEL +extern int dprintf_find_string(const char *string); +extern void zfs_dbgmsg_print(const char *tag); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_DEBUG_H */ diff --git a/include/sys/zfs_delay.h b/include/sys/zfs_delay.h new file mode 100644 index 000000000000..40e617dba961 --- /dev/null +++ b/include/sys/zfs_delay.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
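/*
 * Editorial note: a one-function sketch, not part of this import,
 * showing zfs_dbgmsg() usage that follows the principles listed
 * above: specific about what was operated on, emitted only for an
 * unusual event, and O(1) per txg. example_log_recovery and its
 * arguments are hypothetical.
 */
static void
example_log_recovery(spa_t *spa, uint64_t bytes, uint64_t txg)
{
	zfs_dbgmsg("spa=%s recovered leaked space: %llu bytes in txg %llu",
	    spa_name(spa), (u_longlong_t)bytes, (u_longlong_t)txg);
}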
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_FS_ZFS_DELAY_H +#define _SYS_FS_ZFS_DELAY_H + +#include + +/* + * Generic wrapper to sleep until a given time. + */ +#define zfs_sleep_until(wakeup) \ + do { \ + hrtime_t delta = wakeup - gethrtime(); \ + \ + if (delta > 0) { \ + unsigned long delta_us; \ + delta_us = delta / (NANOSEC / MICROSEC); \ + usleep_range(delta_us, delta_us + 100); \ + } \ + } while (0) + +#endif /* _SYS_FS_ZFS_DELAY_H */ diff --git a/include/sys/zfs_file.h b/include/sys/zfs_file.h new file mode 100644 index 000000000000..d117933a6e4c --- /dev/null +++ b/include/sys/zfs_file.h @@ -0,0 +1,62 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZFS_FILE_H +#define _SYS_ZFS_FILE_H + +#ifndef _KERNEL +typedef struct zfs_file { + int f_fd; + int f_dump_fd; +} zfs_file_t; +#elif defined(__linux__) || defined(__FreeBSD__) +typedef struct file zfs_file_t; +#else +#error "unknown OS" +#endif + +typedef struct zfs_file_attr { + uint64_t zfa_size; /* file size */ + mode_t zfa_mode; /* file type */ +} zfs_file_attr_t; + +int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fp); +void zfs_file_close(zfs_file_t *fp); + +int zfs_file_write(zfs_file_t *fp, const void *buf, size_t len, ssize_t *resid); +int zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t len, loff_t off, + ssize_t *resid); +int zfs_file_read(zfs_file_t *fp, void *buf, size_t len, ssize_t *resid); +int zfs_file_pread(zfs_file_t *fp, void *buf, size_t len, loff_t off, + ssize_t *resid); + +int zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence); +int zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr); +int zfs_file_fsync(zfs_file_t *fp, int flags); +int zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len); +loff_t zfs_file_off(zfs_file_t *fp); +int zfs_file_unlink(const char *); + +int zfs_file_get(int fd, zfs_file_t **fp); +void zfs_file_put(int fd); +void *zfs_file_private(zfs_file_t *fp); + +#endif /* _SYS_ZFS_FILE_H */ diff --git a/include/sys/zfs_fuid.h b/include/sys/zfs_fuid.h new file mode 100644 index 000000000000..b5b37db29468 --- /dev/null +++ b/include/sys/zfs_fuid.h @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
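/*
 * Editorial note: a minimal sketch, not part of this import, of
 * reading through the zfs_file_t abstraction declared in zfs_file.h
 * above, which hides the kernel/userland split behind one API.
 * example_read_header and HDR_SIZE are hypothetical; the calls are
 * the ones declared in the header.
 */
#define HDR_SIZE 512

static int
example_read_header(const char *path, void *buf)
{
	zfs_file_t *fp;
	ssize_t resid;
	int err;

	err = zfs_file_open(path, O_RDONLY, 0, &fp);
	if (err != 0)
		return (err);
	/* Positional read; resid reports bytes NOT transferred. */
	err = zfs_file_pread(fp, buf, HDR_SIZE, 0, &resid);
	if (err == 0 && resid != 0)
		err = EIO;	/* short read */
	zfs_file_close(fp);
	return (err);
}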
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_FS_ZFS_FUID_H +#define _SYS_FS_ZFS_FUID_H + +#ifdef _KERNEL +#include +#include +#include +#endif +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + ZFS_OWNER, + ZFS_GROUP, + ZFS_ACE_USER, + ZFS_ACE_GROUP +} zfs_fuid_type_t; + +/* + * Estimate space needed for one more fuid table entry. + * for now assume its current size + 1K + */ +#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) + +#define FUID_INDEX(x) ((x) >> 32) +#define FUID_RID(x) ((x) & 0xffffffff) +#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) +/* + * FUIDs cause problems for the intent log + * we need to replay the creation of the FUID, + * but we can't count on the idmapper to be around + * and during replay the FUID index may be different than + * before. Also, if an ACL has 100 ACEs and 12 different + * domains we don't want to log 100 domain strings, but rather + * just the unique 12. + */ + +/* + * The FUIDs in the log will index into + * domain string table and the bottom half will be the rid. + * Used for mapping ephemeral uid/gid during ACL setting to FUIDs + */ +typedef struct zfs_fuid { + list_node_t z_next; + uint64_t z_id; /* uid/gid being converted to fuid */ + uint64_t z_domidx; /* index in AVL domain table */ + uint64_t z_logfuid; /* index for domain in log */ +} zfs_fuid_t; + +/* list of unique domains */ +typedef struct zfs_fuid_domain { + list_node_t z_next; + uint64_t z_domidx; /* AVL tree idx */ + const char *z_domain; /* domain string */ +} zfs_fuid_domain_t; + +/* + * FUID information necessary for logging create, setattr, and setacl. 
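/*
 * Editorial note: a small sketch, not part of this import, of how the
 * FUID packing macros above compose and decompose a 64-bit FUID: the
 * domain-table index lives in the top 32 bits, the rid in the bottom
 * 32. The values are hypothetical.
 */
static void
example_fuid_roundtrip(void)
{
	uint64_t idx = 3;	/* index into the domain table */
	uint64_t rid = 1234;	/* rid within that domain */
	uint64_t fuid = FUID_ENCODE(idx, rid);

	ASSERT3U(FUID_INDEX(fuid), ==, idx);	/* top 32 bits */
	ASSERT3U(FUID_RID(fuid), ==, rid);	/* bottom 32 bits */
}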
+ */ +typedef struct zfs_fuid_info { + list_t z_fuids; + list_t z_domains; + uint64_t z_fuid_owner; + uint64_t z_fuid_group; + char **z_domain_table; /* Used during replay */ + uint32_t z_fuid_cnt; /* How many fuids in z_fuids */ + uint32_t z_domain_cnt; /* How many domains */ + size_t z_domain_str_sz; /* len of domain strings z_domain list */ +} zfs_fuid_info_t; + +#ifdef _KERNEL +struct znode; +extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, + uint64_t, uint64_t, zfs_fuid_type_t); +extern void zfs_fuid_destroy(zfsvfs_t *); +extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, + cred_t *, zfs_fuid_info_t **); +extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, + zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, + uid_t *uid, uid_t *gid); +extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); +extern void zfs_fuid_info_free(zfs_fuid_info_t *); +extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); +extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, + char **retdomain, boolean_t addok); +extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); +extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, size_t len, boolean_t addok); +#endif + +char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); +uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); +void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_FUID_H */ diff --git a/include/sys/zfs_ioctl.h b/include/sys/zfs_ioctl.h new file mode 100644 index 000000000000..136075a1fc18 --- /dev/null +++ b/include/sys/zfs_ioctl.h @@ -0,0 +1,581 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright 2016 RackTop Systems. + * Copyright (c) 2017, Intel Corporation. + */ + +#ifndef _SYS_ZFS_IOCTL_H +#define _SYS_ZFS_IOCTL_H + +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif /* _KERNEL */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * The structures in this file are passed between userland and the + * kernel. Userland may be running a 32-bit process, while the kernel + * is 64-bit. Therefore, these structures need to compile the same in + * 32-bit and 64-bit. 
This means not using type "long", and adding + * explicit padding so that the 32-bit structure will not be packed more + * tightly than the 64-bit structure (which requires 64-bit alignment). + */ + +/* + * Property values for snapdir + */ +#define ZFS_SNAPDIR_HIDDEN 0 +#define ZFS_SNAPDIR_VISIBLE 1 + +/* + * Property values for snapdev + */ +#define ZFS_SNAPDEV_HIDDEN 0 +#define ZFS_SNAPDEV_VISIBLE 1 +/* + * Property values for acltype + */ +#define ZFS_ACLTYPE_OFF 0 +#define ZFS_ACLTYPE_POSIXACL 1 + +/* + * Field manipulation macros for the drr_versioninfo field of the + * send stream header. + */ + +/* + * Header types for zfs send streams. + */ +typedef enum drr_headertype { + DMU_SUBSTREAM = 0x1, + DMU_COMPOUNDSTREAM = 0x2 +} drr_headertype_t; + +#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) +#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) + +#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) +#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) + +/* + * Feature flags for zfs send streams (flags in drr_versioninfo) + */ + +#define DMU_BACKUP_FEATURE_DEDUP (1 << 0) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (1 << 1) +#define DMU_BACKUP_FEATURE_SA_SPILL (1 << 2) +/* flags #3 - #15 are reserved for incompatible closed-source implementations */ +#define DMU_BACKUP_FEATURE_EMBED_DATA (1 << 16) +#define DMU_BACKUP_FEATURE_LZ4 (1 << 17) +/* flag #18 is reserved for a Delphix feature */ +#define DMU_BACKUP_FEATURE_LARGE_BLOCKS (1 << 19) +#define DMU_BACKUP_FEATURE_RESUMING (1 << 20) +#define DMU_BACKUP_FEATURE_REDACTED (1 << 21) +#define DMU_BACKUP_FEATURE_COMPRESSED (1 << 22) +#define DMU_BACKUP_FEATURE_LARGE_DNODE (1 << 23) +#define DMU_BACKUP_FEATURE_RAW (1 << 24) +#define DMU_BACKUP_FEATURE_ZSTD (1 << 25) +#define DMU_BACKUP_FEATURE_HOLDS (1 << 26) +/* + * The SWITCH_TO_LARGE_BLOCKS feature indicates that we can receive + * incremental LARGE_BLOCKS streams (those with WRITE records of >128KB) even + * if the previous send did not use LARGE_BLOCKS, and thus its large blocks + * were split into multiple 128KB WRITE records. (See + * flush_write_batch_impl() and receive_object()). Older software that does + * not support this flag may encounter a bug when switching to large blocks, + * which causes files to incorrectly be zeroed. + * + * This flag is currently not set on any send streams. In the future, we + * intend for incremental send streams of snapshots that have large blocks to + * use LARGE_BLOCKS by default, and these streams will also have the + * SWITCH_TO_LARGE_BLOCKS feature set. This ensures that streams from the + * default use of "zfs send" won't encounter the bug mentioned above. + */ +#define DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS (1 << 27) + +/* + * Mask of all supported backup features + */ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_SA_SPILL | \ + DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_LZ4 | \ + DMU_BACKUP_FEATURE_RESUMING | DMU_BACKUP_FEATURE_LARGE_BLOCKS | \ + DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_LARGE_DNODE | \ + DMU_BACKUP_FEATURE_RAW | DMU_BACKUP_FEATURE_HOLDS | \ + DMU_BACKUP_FEATURE_REDACTED | DMU_BACKUP_FEATURE_SWITCH_TO_LARGE_BLOCKS | \ + DMU_BACKUP_FEATURE_ZSTD) + +/* Are all features in the given flag word currently supported? 
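/*
 * Editorial note: a hedged sketch, not part of this import, of
 * validating the drr_versioninfo word of a send-stream BEGIN record
 * with the macros and feature mask defined above. example_check_stream
 * is hypothetical.
 */
static int
example_check_stream(uint64_t versioninfo)
{
	uint64_t hdrtype = DMU_GET_STREAM_HDRTYPE(versioninfo);
	uint64_t features = DMU_GET_FEATUREFLAGS(versioninfo);

	if (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)
		return (EINVAL);
	/* Reject streams using feature flags we do not implement. */
	if ((features & ~DMU_BACKUP_FEATURE_MASK) != 0)
		return (ENOTSUP);
	return (0);
}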
*/ +#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) + +typedef enum dmu_send_resume_token_version { + ZFS_SEND_RESUME_TOKEN_VERSION = 1 +} dmu_send_resume_token_version_t; + +/* + * The drr_versioninfo field of the dmu_replay_record has the + * following layout: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | reserved | feature-flags |C|S| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * The low order two bits indicate the header type: SUBSTREAM (0x1) + * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: + * this field used to be a version number, where the two version types + * were 1 and 2. Using two bits for this allows earlier versions of + * the code to be able to recognize send streams that don't use any + * of the features indicated by feature flags. + */ + +#define DMU_BACKUP_MAGIC 0x2F5bacbacULL + +/* + * Send stream flags. Bits 24-31 are reserved for vendor-specific + * implementations and should not be used. + */ +#define DRR_FLAG_CLONE (1<<0) +#define DRR_FLAG_CI_DATA (1<<1) +/* + * This send stream, if it is a full send, includes the FREE and FREEOBJECT + * records that are created by the sending process. This means that the send + * stream can be received as a clone, even though it is not an incremental. + * This is not implemented as a feature flag, because the receiving side does + * not need to have implemented it to receive this stream; it is fully backwards + * compatible. We need a flag, though, because full send streams without it + * cannot necessarily be received as a clone correctly. + */ +#define DRR_FLAG_FREERECORDS (1<<2) +/* + * When DRR_FLAG_SPILL_BLOCK is set it indicates the DRR_OBJECT_SPILL + * and DRR_SPILL_UNMODIFIED flags are meaningful in the send stream. + * + * When DRR_FLAG_SPILL_BLOCK is set, DRR_OBJECT records will have + * DRR_OBJECT_SPILL set if and only if they should have a spill block + * (either an existing one, or a new one in the send stream). When clear + * the object does not have a spill block and any existing spill block + * should be freed. + * + * Similarly, when DRR_FLAG_SPILL_BLOCK is set, DRR_SPILL records will + * have DRR_SPILL_UNMODIFIED set if and only if they were included for + * backward compatibility purposes, and can be safely ignored by new versions + * of zfs receive. Previous versions of ZFS which do not understand the + * DRR_FLAG_SPILL_BLOCK will process this record and recreate any missing + * spill blocks. + */ +#define DRR_FLAG_SPILL_BLOCK (1<<3) + +/* + * flags in the drr_flags field in the DRR_WRITE, DRR_SPILL, DRR_OBJECT, + * DRR_WRITE_BYREF, and DRR_OBJECT_RANGE blocks + */ +#define DRR_CHECKSUM_DEDUP (1<<0) /* not used for SPILL records */ +#define DRR_RAW_BYTESWAP (1<<1) +#define DRR_OBJECT_SPILL (1<<2) /* OBJECT record has a spill block */ +#define DRR_SPILL_UNMODIFIED (1<<2) /* SPILL record for unmodified block */ + +#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) +#define DRR_IS_RAW_BYTESWAPPED(flags) ((flags) & DRR_RAW_BYTESWAP) +#define DRR_OBJECT_HAS_SPILL(flags) ((flags) & DRR_OBJECT_SPILL) +#define DRR_SPILL_IS_UNMODIFIED(flags) ((flags) & DRR_SPILL_UNMODIFIED) + +/* deal with compressed drr_write replay records */ +#define DRR_WRITE_COMPRESSED(drrw) ((drrw)->drr_compressiontype != 0) +#define DRR_WRITE_PAYLOAD_SIZE(drrw) \ + (DRR_WRITE_COMPRESSED(drrw) ? 
(drrw)->drr_compressed_size : \ + (drrw)->drr_logical_size) +#define DRR_SPILL_PAYLOAD_SIZE(drrs) \ + ((drrs)->drr_compressed_size ? \ + (drrs)->drr_compressed_size : (drrs)->drr_length) +#define DRR_OBJECT_PAYLOAD_SIZE(drro) \ + ((drro)->drr_raw_bonuslen != 0 ? \ + (drro)->drr_raw_bonuslen : P2ROUNDUP((drro)->drr_bonuslen, 8)) + +/* + * zfs ioctl command structure + */ + +/* Header is used in C++ so can't forward declare untagged struct */ +struct drr_begin { + uint64_t drr_magic; + uint64_t drr_versioninfo; /* was drr_version */ + uint64_t drr_creation_time; + dmu_objset_type_t drr_type; + uint32_t drr_flags; + uint64_t drr_toguid; + uint64_t drr_fromguid; + char drr_toname[MAXNAMELEN]; +}; + +typedef struct dmu_replay_record { + enum { + DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, + DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, + DRR_SPILL, DRR_WRITE_EMBEDDED, DRR_OBJECT_RANGE, DRR_REDACT, + DRR_NUMTYPES + } drr_type; + uint32_t drr_payloadlen; + union { + struct drr_begin drr_begin; + struct drr_end { + zio_cksum_t drr_checksum; + uint64_t drr_toguid; + } drr_end; + struct drr_object { + uint64_t drr_object; + dmu_object_type_t drr_type; + dmu_object_type_t drr_bonustype; + uint32_t drr_blksz; + uint32_t drr_bonuslen; + uint8_t drr_checksumtype; + uint8_t drr_compress; + uint8_t drr_dn_slots; + uint8_t drr_flags; + uint32_t drr_raw_bonuslen; + uint64_t drr_toguid; + /* only (possibly) nonzero for raw streams */ + uint8_t drr_indblkshift; + uint8_t drr_nlevels; + uint8_t drr_nblkptr; + uint8_t drr_pad[5]; + uint64_t drr_maxblkid; + /* bonus content follows */ + } drr_object; + struct drr_freeobjects { + uint64_t drr_firstobj; + uint64_t drr_numobjs; + uint64_t drr_toguid; + } drr_freeobjects; + struct drr_write { + uint64_t drr_object; + dmu_object_type_t drr_type; + uint32_t drr_pad; + uint64_t drr_offset; + uint64_t drr_logical_size; + uint64_t drr_toguid; + uint8_t drr_checksumtype; + uint8_t drr_flags; + uint8_t drr_compressiontype; + uint8_t drr_pad2[5]; + /* deduplication key */ + ddt_key_t drr_key; + /* only nonzero if drr_compressiontype is not 0 */ + uint64_t drr_compressed_size; + /* only nonzero for raw streams */ + uint8_t drr_salt[ZIO_DATA_SALT_LEN]; + uint8_t drr_iv[ZIO_DATA_IV_LEN]; + uint8_t drr_mac[ZIO_DATA_MAC_LEN]; + /* content follows */ + } drr_write; + struct drr_free { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + } drr_free; + struct drr_write_byref { + /* where to put the data */ + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + /* where to find the prior copy of the data */ + uint64_t drr_refguid; + uint64_t drr_refobject; + uint64_t drr_refoffset; + /* properties of the data */ + uint8_t drr_checksumtype; + uint8_t drr_flags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ + } drr_write_byref; + struct drr_spill { + uint64_t drr_object; + uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_flags; + uint8_t drr_compressiontype; + uint8_t drr_pad[6]; + /* only nonzero for raw streams */ + uint64_t drr_compressed_size; + uint8_t drr_salt[ZIO_DATA_SALT_LEN]; + uint8_t drr_iv[ZIO_DATA_IV_LEN]; + uint8_t drr_mac[ZIO_DATA_MAC_LEN]; + dmu_object_type_t drr_type; + /* spill data follows */ + } drr_spill; + struct drr_write_embedded { + uint64_t drr_object; + uint64_t drr_offset; + /* logical length, should equal blocksize */ + uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_compression; + uint8_t drr_etype; + uint8_t drr_pad[6]; + uint32_t 
drr_lsize; /* uncompressed size of payload */ + uint32_t drr_psize; /* compr. (real) size of payload */ + /* (possibly compressed) content follows */ + } drr_write_embedded; + struct drr_object_range { + uint64_t drr_firstobj; + uint64_t drr_numslots; + uint64_t drr_toguid; + uint8_t drr_salt[ZIO_DATA_SALT_LEN]; + uint8_t drr_iv[ZIO_DATA_IV_LEN]; + uint8_t drr_mac[ZIO_DATA_MAC_LEN]; + uint8_t drr_flags; + uint8_t drr_pad[3]; + } drr_object_range; + struct drr_redact { + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + } drr_redact; + + /* + * Note: drr_checksum is overlaid with all record types + * except DRR_BEGIN. Therefore its (non-pad) members + * must not overlap with members from the other structs. + * We accomplish this by putting its members at the very + * end of the struct. + */ + struct drr_checksum { + uint64_t drr_pad[34]; + /* + * fletcher-4 checksum of everything preceding the + * checksum. + */ + zio_cksum_t drr_checksum; + } drr_checksum; + } drr_u; +} dmu_replay_record_t; + +/* diff record range types */ +typedef enum diff_type { + DDR_NONE = 0x1, + DDR_INUSE = 0x2, + DDR_FREE = 0x4 +} diff_type_t; + +/* + * The diff reports back ranges of free or in-use objects. + */ +typedef struct dmu_diff_record { + uint64_t ddr_type; + uint64_t ddr_first; + uint64_t ddr_last; +} dmu_diff_record_t; + +typedef struct zinject_record { + uint64_t zi_objset; + uint64_t zi_object; + uint64_t zi_start; + uint64_t zi_end; + uint64_t zi_guid; + uint32_t zi_level; + uint32_t zi_error; + uint64_t zi_type; + uint32_t zi_freq; + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; + uint64_t zi_nlanes; + uint32_t zi_cmd; + uint32_t zi_dvas; +} zinject_record_t; + +#define ZINJECT_NULL 0x1 +#define ZINJECT_FLUSH_ARC 0x2 +#define ZINJECT_UNLOAD_SPA 0x4 +#define ZINJECT_CALC_RANGE 0x8 + +#define ZEVENT_NONE 0x0 +#define ZEVENT_NONBLOCK 0x1 +#define ZEVENT_SIZE 1024 + +#define ZEVENT_SEEK_START 0 +#define ZEVENT_SEEK_END UINT64_MAX + +/* scaled frequency ranges */ +#define ZI_PERCENTAGE_MIN 4294UL +#define ZI_PERCENTAGE_MAX UINT32_MAX + +#define ZI_NO_DVA (-1) + +typedef enum zinject_type { + ZINJECT_UNINITIALIZED, + ZINJECT_DATA_FAULT, + ZINJECT_DEVICE_FAULT, + ZINJECT_LABEL_FAULT, + ZINJECT_IGNORED_WRITES, + ZINJECT_PANIC, + ZINJECT_DELAY_IO, + ZINJECT_DECRYPT_FAULT, +} zinject_type_t; + +typedef struct zfs_share { + uint64_t z_exportdata; + uint64_t z_sharedata; + uint64_t z_sharetype; /* 0 = share, 1 = unshare */ + uint64_t z_sharemax; /* max length of share string */ +} zfs_share_t; + +/* + * ZFS file systems may behave the usual, POSIX-compliant way, where + * name lookups are case-sensitive. They may also be set up so that + * all the name lookups are case-insensitive, or so that only some + * lookups, the ones that set an FIGNORECASE flag, are case-insensitive. + */ +typedef enum zfs_case { + ZFS_CASE_SENSITIVE, + ZFS_CASE_INSENSITIVE, + ZFS_CASE_MIXED +} zfs_case_t; + +/* + * Note: this struct must have the same layout in 32-bit and 64-bit, so + * that 32-bit processes (like /sbin/zfs) can pass it to the 64-bit + * kernel. Therefore, we add padding to it so that no "hidden" padding + * is automatically added on 64-bit (but not on 32-bit). 
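
The layout note above is why the zfs_cmd_t that follows carries user pointers as uint64_t fields. A minimal userland sketch of that convention (my_cmd_t and fill_cmd are hypothetical stand-ins, not part of this patch):

#include <stdint.h>
#include <stddef.h>

/* Fixed-layout command struct: identical on 32- and 64-bit callers. */
typedef struct my_cmd {
	uint64_t cmd_buf;	/* really (char *) */
	uint64_t cmd_buflen;
} my_cmd_t;

static void
fill_cmd(my_cmd_t *zc, char *buf, size_t buflen)
{
	/*
	 * Casting through uintptr_t zero-extends a 32-bit pointer, so
	 * the kernel sees the same struct layout from every caller.
	 */
	zc->cmd_buf = (uint64_t)(uintptr_t)buf;
	zc->cmd_buflen = (uint64_t)buflen;
}

int
main(void)
{
	char payload[64];
	my_cmd_t zc;

	fill_cmd(&zc, payload, sizeof (payload));
	return (zc.cmd_buf == 0);
}
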
+ */ +typedef struct zfs_cmd { + char zc_name[MAXPATHLEN]; /* name of pool or dataset */ + uint64_t zc_nvlist_src; /* really (char *) */ + uint64_t zc_nvlist_src_size; + uint64_t zc_nvlist_dst; /* really (char *) */ + uint64_t zc_nvlist_dst_size; + boolean_t zc_nvlist_dst_filled; /* put an nvlist in dst? */ + int zc_pad2; + + /* + * The following members are for legacy ioctls which haven't been + * converted to the new method. + */ + uint64_t zc_history; /* really (char *) */ + char zc_value[MAXPATHLEN * 2]; + char zc_string[MAXNAMELEN]; + uint64_t zc_guid; + uint64_t zc_nvlist_conf; /* really (char *) */ + uint64_t zc_nvlist_conf_size; + uint64_t zc_cookie; + uint64_t zc_objset_type; + uint64_t zc_perm_action; + uint64_t zc_history_len; + uint64_t zc_history_offset; + uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ + zfs_share_t zc_share; + dmu_objset_stats_t zc_objset_stats; + struct drr_begin zc_begin_record; + zinject_record_t zc_inject_record; + uint32_t zc_defer_destroy; + uint32_t zc_flags; + uint64_t zc_action_handle; + int zc_cleanup_fd; + uint8_t zc_simple; + uint8_t zc_pad[3]; /* alignment */ + uint64_t zc_sendobj; + uint64_t zc_fromobj; + uint64_t zc_createtxg; + zfs_stat_t zc_stat; + uint64_t zc_zoneid; +} zfs_cmd_t; + +typedef struct zfs_useracct { + char zu_domain[256]; + uid_t zu_rid; + uint32_t zu_pad; + uint64_t zu_space; +} zfs_useracct_t; + +#define ZFSDEV_MAX_MINOR (1 << 16) +#define ZFS_MIN_MINOR (ZFSDEV_MAX_MINOR + 1) + +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 + +#ifdef _KERNEL +struct objset; +struct zfsvfs; + +typedef struct zfs_creat { + nvlist_t *zct_zplprops; + nvlist_t *zct_props; +} zfs_creat_t; + +extern int zfs_secpolicy_snapshot_perms(const char *, cred_t *); +extern int zfs_secpolicy_rename_perms(const char *, const char *, cred_t *); +extern int zfs_secpolicy_destroy_perms(const char *, cred_t *); +extern void zfs_unmount_snap(const char *); +extern void zfs_destroy_unmount_origin(const char *); +extern int getzfsvfs_impl(struct objset *, struct zfsvfs **); +extern int getzfsvfs(const char *, struct zfsvfs **); + +enum zfsdev_state_type { + ZST_ONEXIT, + ZST_ZEVENT, + ZST_ALL, +}; + +/* + * The zfsdev_state_t structure is managed as a singly-linked list + * from which items are never deleted. This allows for lock-free + * reading of the list so long as assignments to the zs_next and + * reads from zs_minor are performed atomically. Empty items are + * indicated by storing -1 into zs_minor. + */ +typedef struct zfsdev_state { + struct zfsdev_state *zs_next; /* next zfsdev_state_t link */ + minor_t zs_minor; /* made up minor number */ + void *zs_onexit; /* onexit data */ + void *zs_zevent; /* zevent data */ +} zfsdev_state_t; + +extern void *zfsdev_get_state(minor_t minor, enum zfsdev_state_type which); +extern int zfsdev_getminor(int fd, minor_t *minorp); +extern minor_t zfsdev_minor_alloc(void); + +extern uint_t zfs_fsyncer_key; +extern uint_t zfs_allow_log_key; + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_IOCTL_H */ diff --git a/include/sys/zfs_ioctl_impl.h b/include/sys/zfs_ioctl_impl.h new file mode 100644 index 000000000000..787475cf396c --- /dev/null +++ b/include/sys/zfs_ioctl_impl.h @@ -0,0 +1,97 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#ifndef _ZFS_IOCTL_IMPL_H_ +#define _ZFS_IOCTL_IMPL_H_ + +extern kmutex_t zfsdev_state_lock; +extern zfsdev_state_t *zfsdev_state_list; +extern unsigned long zfs_max_nvlist_src_size; + +typedef int zfs_ioc_legacy_func_t(zfs_cmd_t *); +typedef int zfs_ioc_func_t(const char *, nvlist_t *, nvlist_t *); +typedef int zfs_secpolicy_func_t(zfs_cmd_t *, nvlist_t *, cred_t *); + +typedef enum { + POOL_CHECK_NONE = 1 << 0, + POOL_CHECK_SUSPENDED = 1 << 1, + POOL_CHECK_READONLY = 1 << 2, +} zfs_ioc_poolcheck_t; + +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME, + ENTITY_NAME +} zfs_ioc_namecheck_t; + +/* + * IOC Keys are used to document and validate user->kernel interface inputs. + * See zfs_keys_recv_new for an example declaration. Any key name that is not + * listed will be rejected as input. + * + * The keyname 'optional' is always allowed, and must be an nvlist if present. + * Arguments which older kernels can safely ignore can be placed under the + * "optional" key. + * + * When adding new keys to an existing ioc for new functionality, consider: + * - adding an entry into zfs_sysfs.c zfs_features[] list + * - updating the libzfs_input_check.c test utility + * + * Note: in the ZK_WILDCARDLIST case, the name serves as documentation + * for the expected name (bookmark, snapshot, property, etc) but there + * is no validation in the preflight zfs_check_input_nvpairs() check. + */ +typedef enum { + ZK_OPTIONAL = 1 << 0, /* pair is optional */ + ZK_WILDCARDLIST = 1 << 1, /* one or more unspecified key names */ +} ioc_key_flag_t; + +typedef struct zfs_ioc_key { + const char *zkey_name; + data_type_t zkey_type; + ioc_key_flag_t zkey_flags; +} zfs_ioc_key_t; + +int zfs_secpolicy_config(zfs_cmd_t *, nvlist_t *, cred_t *); + +void zfs_ioctl_register_dataset_nolog(zfs_ioc_t, zfs_ioc_legacy_func_t *, + zfs_secpolicy_func_t *, zfs_ioc_poolcheck_t); + +void zfs_ioctl_register(const char *, zfs_ioc_t, zfs_ioc_func_t *, + zfs_secpolicy_func_t *, zfs_ioc_namecheck_t, zfs_ioc_poolcheck_t, + boolean_t, boolean_t, const zfs_ioc_key_t *, size_t); + +uint64_t zfs_max_nvlist_src_size_os(void); +void zfs_ioctl_init_os(void); + +boolean_t zfs_vfs_held(zfsvfs_t *); +int zfs_vfs_ref(zfsvfs_t **); +void zfs_vfs_rele(zfsvfs_t *); + +long zfsdev_ioctl_common(uint_t, zfs_cmd_t *, int); +int zfsdev_attach(void); +void zfsdev_detach(void); +int zfs_kmod_init(void); +void zfs_kmod_fini(void); + +#endif diff --git a/include/sys/zfs_onexit.h b/include/sys/zfs_onexit.h new file mode 100644 index 000000000000..0fab23ff849b --- /dev/null +++ b/include/sys/zfs_onexit.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZFS_ONEXIT_H +#define _SYS_ZFS_ONEXIT_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +typedef struct zfs_onexit { + kmutex_t zo_lock; + list_t zo_actions; +} zfs_onexit_t; + +typedef struct zfs_onexit_action_node { + list_node_t za_link; + void (*za_func)(void *); + void *za_data; +} zfs_onexit_action_node_t; + +extern void zfs_onexit_init(zfs_onexit_t **zo); +extern void zfs_onexit_destroy(zfs_onexit_t *zo); + +#endif + +extern int zfs_onexit_fd_hold(int fd, minor_t *minorp); +extern void zfs_onexit_fd_rele(int fd); +extern int zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_ONEXIT_H */ diff --git a/include/sys/zfs_project.h b/include/sys/zfs_project.h new file mode 100644 index 000000000000..81a238905225 --- /dev/null +++ b/include/sys/zfs_project.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, Intel Corporation. All rights reserved. 
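
The zfs_onexit interface above amounts to a LIFO list of (callback, argument) pairs fired when the control device is finally closed. A self-contained userland analog of that pattern, with hypothetical names and no locking (the kernel version holds zo_lock):

#include <stdio.h>
#include <stdlib.h>

/* Userland analog of the zfs_onexit action list (hypothetical names). */
typedef struct action_node {
	struct action_node *an_next;
	void (*an_func)(void *);
	void *an_data;
} action_node_t;

static action_node_t *actions;

static void
onexit_add(void (*func)(void *), void *data)
{
	action_node_t *an = malloc(sizeof (*an));

	an->an_func = func;
	an->an_data = data;
	an->an_next = actions;	/* push: callbacks run LIFO */
	actions = an;
}

static void
onexit_fire(void)
{
	action_node_t *an;

	while ((an = actions) != NULL) {
		actions = an->an_next;
		an->an_func(an->an_data);	/* run cleanup action */
		free(an);
	}
}

static void
say(void *msg)
{
	puts(msg);
}

int
main(void)
{
	onexit_add(say, "second");	/* registered first, runs last */
	onexit_add(say, "first");
	onexit_fire();
	return (0);
}
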
+ */
+
+#ifndef _SYS_ZFS_PROJECT_H
+#define _SYS_ZFS_PROJECT_H
+
+#ifndef _KERNEL
+#ifndef _SYS_MOUNT_H
+/* XXX: a hack to avoid including sys/mount.h */
+#define _SYS_MOUNT_H
+#endif
+#endif
+
+#include
+
+#ifdef FS_PROJINHERIT_FL
+#define ZFS_PROJINHERIT_FL FS_PROJINHERIT_FL
+#else
+#define ZFS_PROJINHERIT_FL 0x20000000
+#endif
+
+#ifdef FS_IOC_FSGETXATTR
+typedef struct fsxattr zfsxattr_t;
+
+#define ZFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR
+#define ZFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR
+#else
+struct zfsxattr {
+	uint32_t fsx_xflags;	/* xflags field value (get/set) */
+	uint32_t fsx_extsize;	/* extsize field value (get/set) */
+	uint32_t fsx_nextents;	/* nextents field value (get) */
+	uint32_t fsx_projid;	/* project identifier (get/set) */
+	uint32_t fsx_cowextsize;
+	unsigned char fsx_pad[8];
+};
+typedef struct zfsxattr zfsxattr_t;
+
+#define ZFS_IOC_FSGETXATTR _IOR('X', 31, zfsxattr_t)
+#define ZFS_IOC_FSSETXATTR _IOW('X', 32, zfsxattr_t)
+#endif
+
+#define ZFS_DEFAULT_PROJID (0ULL)
+/*
+ * This is NOT an on-disk project ID value. It just means either that the
+ * object has no project ID or that the operation does not touch the
+ * project ID attribute.
+ */
+#define ZFS_INVALID_PROJID (-1ULL)
+
+static inline boolean_t
+zpl_is_valid_projid(uint32_t projid)
+{
+	/*
+	 * zfsxattr::fsx_projid is 32 bits; when converted to uint64_t,
+	 * the upper 32 bits are zero, so it cannot be compared directly
+	 * with ZFS_INVALID_PROJID (-1ULL).
+	 */
+	if ((uint32_t)ZFS_INVALID_PROJID == projid)
+		return (B_FALSE);
+	return (B_TRUE);
+}
+
+#endif /* _SYS_ZFS_PROJECT_H */
diff --git a/include/sys/zfs_quota.h b/include/sys/zfs_quota.h
new file mode 100644
index 000000000000..b215b8dd0013
--- /dev/null
+++ b/include/sys/zfs_quota.h
@@ -0,0 +1,45 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZFS_QUOTA_H +#define _SYS_ZFS_QUOTA_H + +#include +#include + +struct zfsvfs; +struct zfs_file_info_t; + +extern int zpl_get_file_info(dmu_object_type_t, + const void *, struct zfs_file_info *); + +extern int zfs_userspace_one(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t *); +extern int zfs_userspace_many(struct zfsvfs *, zfs_userquota_prop_t, + uint64_t *, void *, uint64_t *); +extern int zfs_set_userquota(struct zfsvfs *, zfs_userquota_prop_t, + const char *, uint64_t, uint64_t); + +extern boolean_t zfs_id_overobjquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overblockquota(struct zfsvfs *, uint64_t, uint64_t); +extern boolean_t zfs_id_overquota(struct zfsvfs *, uint64_t, uint64_t); + +#endif diff --git a/include/sys/zfs_ratelimit.h b/include/sys/zfs_ratelimit.h new file mode 100644 index 000000000000..012825fadb22 --- /dev/null +++ b/include/sys/zfs_ratelimit.h @@ -0,0 +1,44 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, Lawrence Livermore National Security, LLC. + */ + +#ifndef _SYS_ZFS_RATELIMIT_H +#define _SYS_ZFS_RATELIMIT_H + +#include + +typedef struct { + hrtime_t start; + unsigned int count; + + /* + * Pointer to number of events per interval. We do this to + * allow the burst to be a (changeable) module parameter. + */ + unsigned int *burst; + + unsigned int interval; /* Interval length in seconds */ + kmutex_t lock; +} zfs_ratelimit_t; + +int zfs_ratelimit(zfs_ratelimit_t *rl); +void zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int *burst, + unsigned int interval); +void zfs_ratelimit_fini(zfs_ratelimit_t *rl); + +#endif /* _SYS_ZFS_RATELIMIT_H */ diff --git a/include/sys/zfs_refcount.h b/include/sys/zfs_refcount.h new file mode 100644 index 000000000000..fc0cbea1cf7c --- /dev/null +++ b/include/sys/zfs_refcount.h @@ -0,0 +1,126 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
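
The zfs_ratelimit_t above implements a simple events-per-interval throttle whose burst count lives behind a pointer so a module parameter can change it at runtime. A rough userland rendering of the same policy (hypothetical names; the kernel version uses hrtime_t and takes a mutex):

#include <stdio.h>
#include <time.h>

typedef struct {
	time_t rl_start;		/* start of current interval */
	unsigned int rl_count;		/* events seen this interval */
	unsigned int *rl_burst;		/* tunable, hence a pointer */
	unsigned int rl_interval;	/* interval length in seconds */
} ratelimit_t;

static int
ratelimit_allow(ratelimit_t *rl)
{
	time_t now = time(NULL);

	if (now >= rl->rl_start + rl->rl_interval) {
		rl->rl_start = now;	/* new interval: reset counter */
		rl->rl_count = 0;
	}
	if (rl->rl_count >= *rl->rl_burst)
		return (0);		/* over budget: suppress event */
	rl->rl_count++;
	return (1);
}

int
main(void)
{
	unsigned int burst = 3;
	ratelimit_t rl = { .rl_burst = &burst, .rl_interval = 1 };

	for (int i = 0; i < 5; i++)
		printf("event %d: %s\n", i,
		    ratelimit_allow(&rl) ? "logged" : "suppressed");
	return (0);
}
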
+ */ + +#ifndef _SYS_ZFS_REFCOUNT_H +#define _SYS_ZFS_REFCOUNT_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * If the reference is held only by the calling function and not any + * particular object, use FTAG (which is a string) for the holder_tag. + * Otherwise, use the object that holds the reference. + */ +#define FTAG ((char *)(uintptr_t)__func__) + +#ifdef ZFS_DEBUG +typedef struct reference { + list_node_t ref_link; + const void *ref_holder; + uint64_t ref_number; + uint8_t *ref_removed; +} reference_t; + +typedef struct refcount { + kmutex_t rc_mtx; + boolean_t rc_tracked; + list_t rc_list; + list_t rc_removed; + uint64_t rc_count; + uint64_t rc_removed_count; +} zfs_refcount_t; + +/* + * Note: zfs_refcount_t must be initialized with + * refcount_create[_untracked]() + */ + +void zfs_refcount_create(zfs_refcount_t *); +void zfs_refcount_create_untracked(zfs_refcount_t *); +void zfs_refcount_create_tracked(zfs_refcount_t *); +void zfs_refcount_destroy(zfs_refcount_t *); +void zfs_refcount_destroy_many(zfs_refcount_t *, uint64_t); +int zfs_refcount_is_zero(zfs_refcount_t *); +int64_t zfs_refcount_count(zfs_refcount_t *); +int64_t zfs_refcount_add(zfs_refcount_t *, const void *); +int64_t zfs_refcount_remove(zfs_refcount_t *, const void *); +int64_t zfs_refcount_add_many(zfs_refcount_t *, uint64_t, const void *); +int64_t zfs_refcount_remove_many(zfs_refcount_t *, uint64_t, const void *); +void zfs_refcount_transfer(zfs_refcount_t *, zfs_refcount_t *); +void zfs_refcount_transfer_ownership(zfs_refcount_t *, const void *, + const void *); +void zfs_refcount_transfer_ownership_many(zfs_refcount_t *, uint64_t, + const void *, const void *); +boolean_t zfs_refcount_held(zfs_refcount_t *, const void *); +boolean_t zfs_refcount_not_held(zfs_refcount_t *, const void *); + +void zfs_refcount_init(void); +void zfs_refcount_fini(void); + +#else /* ZFS_DEBUG */ + +typedef struct refcount { + uint64_t rc_count; +} zfs_refcount_t; + +#define zfs_refcount_create(rc) ((rc)->rc_count = 0) +#define zfs_refcount_create_untracked(rc) ((rc)->rc_count = 0) +#define zfs_refcount_create_tracked(rc) ((rc)->rc_count = 0) +#define zfs_refcount_destroy(rc) ((rc)->rc_count = 0) +#define zfs_refcount_destroy_many(rc, number) ((rc)->rc_count = 0) +#define zfs_refcount_is_zero(rc) ((rc)->rc_count == 0) +#define zfs_refcount_count(rc) ((rc)->rc_count) +#define zfs_refcount_add(rc, holder) atomic_inc_64_nv(&(rc)->rc_count) +#define zfs_refcount_remove(rc, holder) atomic_dec_64_nv(&(rc)->rc_count) +#define zfs_refcount_add_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, number) +#define zfs_refcount_remove_many(rc, number, holder) \ + atomic_add_64_nv(&(rc)->rc_count, -number) +#define zfs_refcount_transfer(dst, src) { \ + uint64_t __tmp = (src)->rc_count; \ + atomic_add_64(&(src)->rc_count, -__tmp); \ + atomic_add_64(&(dst)->rc_count, __tmp); \ +} +#define zfs_refcount_transfer_ownership(rc, ch, nh) ((void)0) +#define zfs_refcount_transfer_ownership_many(rc, nr, ch, nh) ((void)0) +#define zfs_refcount_held(rc, holder) ((rc)->rc_count > 0) +#define zfs_refcount_not_held(rc, holder) (B_TRUE) + +#define zfs_refcount_init() +#define zfs_refcount_fini() + +#endif /* ZFS_DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_REFCOUNT_H */ diff --git a/include/sys/zfs_rlock.h b/include/sys/zfs_rlock.h new file mode 100644 index 000000000000..2302abb37337 --- /dev/null +++ b/include/sys/zfs_rlock.h @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of 
this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_RLOCK_H +#define _SYS_FS_ZFS_RLOCK_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} zfs_rangelock_type_t; + +struct zfs_locked_range; + +typedef void (zfs_rangelock_cb_t)(struct zfs_locked_range *, void *); + +typedef struct zfs_rangelock { + avl_tree_t rl_tree; /* contains locked_range_t */ + kmutex_t rl_lock; + zfs_rangelock_cb_t *rl_cb; + void *rl_arg; +} zfs_rangelock_t; + +typedef struct zfs_locked_range { + zfs_rangelock_t *lr_rangelock; /* rangelock that this lock applies to */ + avl_node_t lr_node; /* avl node link */ + uint64_t lr_offset; /* file range offset */ + uint64_t lr_length; /* file range length */ + uint_t lr_count; /* range reference count in tree */ + zfs_rangelock_type_t lr_type; /* range type */ + kcondvar_t lr_write_cv; /* cv for waiting writers */ + kcondvar_t lr_read_cv; /* cv for waiting readers */ + uint8_t lr_proxy; /* acting for original range */ + uint8_t lr_write_wanted; /* writer wants to lock this range */ + uint8_t lr_read_wanted; /* reader wants to lock this range */ +} zfs_locked_range_t; + +void zfs_rangelock_init(zfs_rangelock_t *, zfs_rangelock_cb_t *, void *); +void zfs_rangelock_fini(zfs_rangelock_t *); + +zfs_locked_range_t *zfs_rangelock_enter(zfs_rangelock_t *, + uint64_t, uint64_t, zfs_rangelock_type_t); +zfs_locked_range_t *zfs_rangelock_tryenter(zfs_rangelock_t *, + uint64_t, uint64_t, zfs_rangelock_type_t); +void zfs_rangelock_exit(zfs_locked_range_t *); +void zfs_rangelock_reduce(zfs_locked_range_t *, uint64_t, uint64_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_RLOCK_H */ diff --git a/include/sys/zfs_sa.h b/include/sys/zfs_sa.h new file mode 100644 index 000000000000..4e6d28638ef6 --- /dev/null +++ b/include/sys/zfs_sa.h @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
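
The declarations above define the whole consumer contract for range locks: enter an (offset, length, type) range, do the I/O, exit. An illustrative kernel-context sketch of that pattern (not standalone-compilable; my_read_locked is a hypothetical caller, and it assumes the znode_t with a z_rangelock field declared later in this patch):

static int
my_read_locked(znode_t *zp, uint64_t off, uint64_t len)
{
	zfs_locked_range_t *lr;

	/* Blocks until no conflicting writer overlaps [off, off + len). */
	lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_READER);
	/* ... perform the read while the range cannot change ... */
	zfs_rangelock_exit(lr);
	return (0);
}
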
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ZFS_SA_H
+#define _SYS_ZFS_SA_H
+
+#ifdef _KERNEL
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+#endif
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*
+ * This is the list of attributes known to the ZPL. The numeric values
+ * of the actual attributes are not defined by the order of the enums;
+ * they are controlled by the attribute registration mechanism. Two
+ * different file systems could have different numeric values for the
+ * same attributes. This list is only used for dereferencing into the
+ * table that holds the actual numeric values.
+ */
+typedef enum zpl_attr {
+	ZPL_ATIME,
+	ZPL_MTIME,
+	ZPL_CTIME,
+	ZPL_CRTIME,
+	ZPL_GEN,
+	ZPL_MODE,
+	ZPL_SIZE,
+	ZPL_PARENT,
+	ZPL_LINKS,
+	ZPL_XATTR,
+	ZPL_RDEV,
+	ZPL_FLAGS,
+	ZPL_UID,
+	ZPL_GID,
+	ZPL_PAD,
+	ZPL_ZNODE_ACL,
+	ZPL_DACL_COUNT,
+	ZPL_SYMLINK,
+	ZPL_SCANSTAMP,
+	ZPL_DACL_ACES,
+	ZPL_DXATTR,
+	ZPL_PROJID,
+	ZPL_END
+} zpl_attr_t;
+
+#define ZFS_OLD_ZNODE_PHYS_SIZE 0x108
+#define ZFS_SA_BASE_ATTR_SIZE (ZFS_OLD_ZNODE_PHYS_SIZE - \
+    sizeof (zfs_acl_phys_t))
+
+#define SA_MODE_OFFSET 0
+#define SA_SIZE_OFFSET 8
+#define SA_GEN_OFFSET 16
+#define SA_UID_OFFSET 24
+#define SA_GID_OFFSET 32
+#define SA_PARENT_OFFSET 40
+#define SA_FLAGS_OFFSET 48
+#define SA_PROJID_OFFSET 128
+
+extern sa_attr_reg_t zfs_attr_table[ZPL_END + 1];
+extern sa_attr_reg_t zfs_legacy_attr_table[ZPL_END + 1];
+
+/*
+ * This is a deprecated data structure that only exists for
+ * dealing with file systems created prior to ZPL version 5.
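
Since the numeric attribute values come from the registration mechanism rather than the enum order, consumers index a per-filesystem table with the ZPL_* constants instead of relying on a fixed layout like the deprecated structure below. An illustrative kernel-context sketch (my_get_mode is hypothetical; it assumes the usual zfsvfs_t bookkeeping, and error handling is elided):

static uint64_t
my_get_mode(znode_t *zp, zfsvfs_t *zfsvfs)
{
	uint64_t mode = 0;

	/*
	 * ZPL_MODE is only an index; the registration table supplies
	 * this filesystem's actual attribute number for sa_lookup().
	 */
	(void) sa_lookup(zp->z_sa_hdl,
	    zfsvfs->z_attr_table[ZPL_MODE], &mode, sizeof (mode));
	return (mode);
}
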
+ */ +typedef struct znode_phys { + uint64_t zp_atime[2]; /* 0 - last file access time */ + uint64_t zp_mtime[2]; /* 16 - last file modification time */ + uint64_t zp_ctime[2]; /* 32 - last file change time */ + uint64_t zp_crtime[2]; /* 48 - creation time */ + uint64_t zp_gen; /* 64 - generation (txg of creation) */ + uint64_t zp_mode; /* 72 - file mode bits */ + uint64_t zp_size; /* 80 - size of file */ + uint64_t zp_parent; /* 88 - directory parent (`..') */ + uint64_t zp_links; /* 96 - number of links to file */ + uint64_t zp_xattr; /* 104 - DMU object for xattrs */ + uint64_t zp_rdev; /* 112 - dev_t for VBLK & VCHR files */ + uint64_t zp_flags; /* 120 - persistent flags */ + uint64_t zp_uid; /* 128 - file owner */ + uint64_t zp_gid; /* 136 - owning group */ + uint64_t zp_zap; /* 144 - extra attributes */ + uint64_t zp_pad[3]; /* 152 - future */ + zfs_acl_phys_t zp_acl; /* 176 - 263 ACL */ + /* + * Data may pad out any remaining bytes in the znode buffer, eg: + * + * |<---------------------- dnode_phys (512) ------------------------>| + * |<-- dnode (192) --->|<----------- "bonus" buffer (320) ---------->| + * |<---- znode (264) ---->|<---- data (56) ---->| + * + * At present, we use this space for the following: + * - symbolic links + * - 32-byte anti-virus scanstamp (regular files only) + */ +} znode_phys_t; + +#ifdef _KERNEL + +#define DXATTR_MAX_ENTRY_SIZE (32768) +#define DXATTR_MAX_SA_SIZE (SPA_OLD_MAXBLOCKSIZE >> 1) + +int zfs_sa_readlink(struct znode *, uio_t *); +void zfs_sa_symlink(struct znode *, char *link, int len, dmu_tx_t *); +void zfs_sa_get_scanstamp(struct znode *, xvattr_t *); +void zfs_sa_set_scanstamp(struct znode *, xvattr_t *, dmu_tx_t *); +int zfs_sa_get_xattr(struct znode *); +int zfs_sa_set_xattr(struct znode *); +void zfs_sa_upgrade(struct sa_handle *, dmu_tx_t *); +void zfs_sa_upgrade_txholds(dmu_tx_t *, struct znode *); +void zfs_sa_init(void); +void zfs_sa_fini(void); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZFS_SA_H */ diff --git a/include/sys/zfs_stat.h b/include/sys/zfs_stat.h new file mode 100644 index 000000000000..465aefaa2063 --- /dev/null +++ b/include/sys/zfs_stat.h @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _SYS_FS_ZFS_STAT_H +#define _SYS_FS_ZFS_STAT_H + +#ifdef _KERNEL +#include +#include +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * A limited number of zpl level stats are retrievable + * with an ioctl. zfs diff is the current consumer. 
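
The byte offsets documented in the znode_phys_t comments above can be checked mechanically. A hedged standalone replica for that purpose only (zfs_acl_phys_t is stood in by a byte array of its on-disk size, 88 bytes at offsets 176..263):

#include <stddef.h>
#include <stdint.h>

typedef struct znode_phys_check {
	uint64_t zp_atime[2];
	uint64_t zp_mtime[2];
	uint64_t zp_ctime[2];
	uint64_t zp_crtime[2];
	uint64_t zp_gen;
	uint64_t zp_mode;
	uint64_t zp_size;
	uint64_t zp_parent;
	uint64_t zp_links;
	uint64_t zp_xattr;
	uint64_t zp_rdev;
	uint64_t zp_flags;
	uint64_t zp_uid;
	uint64_t zp_gid;
	uint64_t zp_zap;
	uint64_t zp_pad[3];
	uint8_t zp_acl[88];	/* stand-in for zfs_acl_phys_t */
} znode_phys_check_t;

/* The offsets in the field comments, verified at compile time. */
_Static_assert(offsetof(znode_phys_check_t, zp_mode) == 72, "zp_mode");
_Static_assert(offsetof(znode_phys_check_t, zp_acl) == 176, "zp_acl");
_Static_assert(sizeof (znode_phys_check_t) == 0x108, "264 bytes total");

int
main(void)
{
	return (0);
}
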
+ */ +typedef struct zfs_stat { + uint64_t zs_gen; + uint64_t zs_mode; + uint64_t zs_links; + uint64_t zs_ctime[2]; +} zfs_stat_t; + +extern int zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_STAT_H */ diff --git a/include/sys/zfs_sysfs.h b/include/sys/zfs_sysfs.h new file mode 100644 index 000000000000..925d7ad542f7 --- /dev/null +++ b/include/sys/zfs_sysfs.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZFS_SYSFS_H +#define _SYS_ZFS_SYSFS_H + +#ifdef _KERNEL + +void zfs_sysfs_init(void); +void zfs_sysfs_fini(void); + +#else + +#define zfs_sysfs_init() +#define zfs_sysfs_fini() + +boolean_t zfs_mod_supported(const char *, const char *); +#endif + +#define ZFS_SYSFS_POOL_PROPERTIES "properties.pool" +#define ZFS_SYSFS_DATASET_PROPERTIES "properties.dataset" +#define ZFS_SYSFS_KERNEL_FEATURES "features.kernel" +#define ZFS_SYSFS_POOL_FEATURES "features.pool" + +#define ZFS_SYSFS_DIR "/sys/module/zfs" +/* Alternate location when ZFS is built as part of the kernel (rare) */ +#define ZFS_SYSFS_ALT_DIR "/sys/fs/zfs" + +#endif /* _SYS_ZFS_SYSFS_H */ diff --git a/include/sys/zfs_znode.h b/include/sys/zfs_znode.h new file mode 100644 index 000000000000..d7823221eb77 --- /dev/null +++ b/include/sys/zfs_znode.h @@ -0,0 +1,295 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
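
The ZFS_SYSFS_* defines above describe where the module advertises its capabilities; zfs_mod_supported() presumably keys off the presence of an entry there. A userland sketch of such a probe (the feature name is only an example, and checking for the entry's existence is an approximation of the real helper):

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>

static int
kernel_feature_supported(const char *feature)
{
	char path[256];
	struct stat st;

	(void) snprintf(path, sizeof (path), "%s/%s/%s",
	    "/sys/module/zfs", "features.kernel", feature);
	if (stat(path, &st) == 0)
		return (1);
	/* Fall back to the alternate location for built-in kernels. */
	(void) snprintf(path, sizeof (path), "%s/%s/%s",
	    "/sys/fs/zfs", "features.kernel", feature);
	return (stat(path, &st) == 0);
}

int
main(void)
{
	printf("large_dnode: %s\n",
	    kernel_feature_supported("org.zfsonlinux:large_dnode") ?
	    "supported" : "not supported");
	return (0);
}
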
+ */ + +#ifndef _SYS_FS_ZFS_ZNODE_H +#define _SYS_FS_ZFS_ZNODE_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Additional file level attributes, that are stored + * in the upper half of zp_flags + */ +#define ZFS_READONLY 0x0000000100000000ull +#define ZFS_HIDDEN 0x0000000200000000ull +#define ZFS_SYSTEM 0x0000000400000000ull +#define ZFS_ARCHIVE 0x0000000800000000ull +#define ZFS_IMMUTABLE 0x0000001000000000ull +#define ZFS_NOUNLINK 0x0000002000000000ull +#define ZFS_APPENDONLY 0x0000004000000000ull +#define ZFS_NODUMP 0x0000008000000000ull +#define ZFS_OPAQUE 0x0000010000000000ull +#define ZFS_AV_QUARANTINED 0x0000020000000000ull +#define ZFS_AV_MODIFIED 0x0000040000000000ull +#define ZFS_REPARSE 0x0000080000000000ull +#define ZFS_OFFLINE 0x0000100000000000ull +#define ZFS_SPARSE 0x0000200000000000ull + +/* + * PROJINHERIT attribute is used to indicate that the child object under the + * directory which has the PROJINHERIT attribute needs to inherit its parent + * project ID that is used by project quota. + */ +#define ZFS_PROJINHERIT 0x0000400000000000ull + +/* + * PROJID attr is used internally to indicate that the object has project ID. + */ +#define ZFS_PROJID 0x0000800000000000ull + +#define ZFS_ATTR_SET(zp, attr, value, pflags, tx) \ +{ \ + if (value) \ + pflags |= attr; \ + else \ + pflags &= ~attr; \ + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(ZTOZSB(zp)), \ + &pflags, sizeof (pflags), tx)); \ +} + +/* + * Define special zfs pflags + */ +#define ZFS_XATTR 0x1 /* is an extended attribute */ +#define ZFS_INHERIT_ACE 0x2 /* ace has inheritable ACEs */ +#define ZFS_ACL_TRIVIAL 0x4 /* files ACL is trivial */ +#define ZFS_ACL_OBJ_ACE 0x8 /* ACL has CMPLX Object ACE */ +#define ZFS_ACL_PROTECTED 0x10 /* ACL protected */ +#define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ +#define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ +#define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ + +#define SA_ZPL_ATIME(z) z->z_attr_table[ZPL_ATIME] +#define SA_ZPL_MTIME(z) z->z_attr_table[ZPL_MTIME] +#define SA_ZPL_CTIME(z) z->z_attr_table[ZPL_CTIME] +#define SA_ZPL_CRTIME(z) z->z_attr_table[ZPL_CRTIME] +#define SA_ZPL_GEN(z) z->z_attr_table[ZPL_GEN] +#define SA_ZPL_DACL_ACES(z) z->z_attr_table[ZPL_DACL_ACES] +#define SA_ZPL_XATTR(z) z->z_attr_table[ZPL_XATTR] +#define SA_ZPL_SYMLINK(z) z->z_attr_table[ZPL_SYMLINK] +#define SA_ZPL_RDEV(z) z->z_attr_table[ZPL_RDEV] +#define SA_ZPL_SCANSTAMP(z) z->z_attr_table[ZPL_SCANSTAMP] +#define SA_ZPL_UID(z) z->z_attr_table[ZPL_UID] +#define SA_ZPL_GID(z) z->z_attr_table[ZPL_GID] +#define SA_ZPL_PARENT(z) z->z_attr_table[ZPL_PARENT] +#define SA_ZPL_LINKS(z) z->z_attr_table[ZPL_LINKS] +#define SA_ZPL_MODE(z) z->z_attr_table[ZPL_MODE] +#define SA_ZPL_DACL_COUNT(z) z->z_attr_table[ZPL_DACL_COUNT] +#define SA_ZPL_FLAGS(z) z->z_attr_table[ZPL_FLAGS] +#define SA_ZPL_SIZE(z) z->z_attr_table[ZPL_SIZE] +#define SA_ZPL_ZNODE_ACL(z) z->z_attr_table[ZPL_ZNODE_ACL] +#define SA_ZPL_DXATTR(z) z->z_attr_table[ZPL_DXATTR] +#define SA_ZPL_PAD(z) z->z_attr_table[ZPL_PAD] +#define SA_ZPL_PROJID(z) z->z_attr_table[ZPL_PROJID] + +/* + * Is ID ephemeral? + */ +#define IS_EPHEMERAL(x) (x > MAXUID) + +/* + * Should we use FUIDs? 
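
The upper-half attributes above share zp_flags with the low pflags bits, so testing or combining them is plain 64-bit masking. A minimal standalone sketch (deny_write() is a hypothetical policy helper, not code from this patch):

#include <stdint.h>
#include <stdio.h>

/* Two of the file-level attributes from the list above. */
#define ZFS_IMMUTABLE	0x0000001000000000ull
#define ZFS_APPENDONLY	0x0000004000000000ull

static int
deny_write(uint64_t pflags)
{
	/* Either attribute forbids an ordinary overwrite. */
	return ((pflags & (ZFS_IMMUTABLE | ZFS_APPENDONLY)) != 0);
}

int
main(void)
{
	uint64_t pflags = 0;

	pflags |= ZFS_IMMUTABLE;	/* the effect of ZFS_ATTR_SET */
	printf("write denied: %d\n", deny_write(pflags));
	return (0);
}
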
+ */ +#define USE_FUIDS(version, os) (version >= ZPL_VERSION_FUID && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) +#define USE_SA(version, os) (version >= ZPL_VERSION_SA && \ + spa_version(dmu_objset_spa(os)) >= SPA_VERSION_SA) + +#define MASTER_NODE_OBJ 1 + +/* + * Special attributes for master node. + * "userquota@", "groupquota@" and "projectquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). + */ +#define ZFS_FSID "FSID" +#define ZFS_UNLINKED_SET "DELETE_QUEUE" +#define ZFS_ROOT_OBJ "ROOT" +#define ZPL_VERSION_STR "VERSION" +#define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" +#define ZFS_SA_ATTRS "SA_ATTRS" + +/* + * Convert mode bits (zp_mode) to BSD-style DT_* values for storing in + * the directory entries. On Linux systems this value is already + * defined correctly as part of the /usr/include/dirent.h header file. + */ +#ifndef IFTODT +#define IFTODT(mode) (((mode) & S_IFMT) >> 12) +#endif + +/* + * The directory entry has the type (currently unused on Solaris) in the + * top 4 bits, and the object number in the low 48 bits. The "middle" + * 12 bits are unused. + */ +#define ZFS_DIRENT_TYPE(de) BF64_GET(de, 60, 4) +#define ZFS_DIRENT_OBJ(de) BF64_GET(de, 0, 48) + +extern int zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len); + +#ifdef _KERNEL +#include + +/* + * Directory entry locks control access to directory entries. + * They are used to protect creates, deletes, and renames. + * Each directory znode has a mutex and a list of locked names. + */ +typedef struct zfs_dirlock { + char *dl_name; /* directory entry being locked */ + uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ + uint16_t dl_namesize; /* set if dl_name was allocated */ + kcondvar_t dl_cv; /* wait for entry to be unlocked */ + struct znode *dl_dzp; /* directory znode */ + struct zfs_dirlock *dl_next; /* next in z_dirlocks list */ +} zfs_dirlock_t; + +typedef struct znode { + uint64_t z_id; /* object ID for this znode */ + kmutex_t z_lock; /* znode modification lock */ + krwlock_t z_parent_lock; /* parent lock for directories */ + krwlock_t z_name_lock; /* "master" lock for dirent locks */ + zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ + zfs_rangelock_t z_rangelock; /* file range locks */ + boolean_t z_unlinked; /* file has been unlinked */ + boolean_t z_atime_dirty; /* atime needs to be synced */ + boolean_t z_zn_prefetch; /* Prefetch znodes? */ + boolean_t z_moved; /* Has this znode been moved? */ + boolean_t z_is_sa; /* are we native sa? */ + boolean_t z_is_mapped; /* are we mmap'ed */ + boolean_t z_is_ctldir; /* are we .zfs entry */ + boolean_t z_is_stale; /* are we stale due to rollback? */ + boolean_t z_suspended; /* extra ref from a suspend? 
*/ + uint_t z_blksz; /* block size in bytes */ + uint_t z_seq; /* modification sequence number */ + uint64_t z_mapcnt; /* number of pages mapped to file */ + uint64_t z_dnodesize; /* dnode size */ + uint64_t z_size; /* file size (cached) */ + uint64_t z_pflags; /* pflags (cached) */ + uint32_t z_sync_cnt; /* synchronous open count */ + mode_t z_mode; /* mode (cached) */ + kmutex_t z_acl_lock; /* acl data lock */ + zfs_acl_t *z_acl_cached; /* cached acl */ + krwlock_t z_xattr_lock; /* xattr data lock */ + nvlist_t *z_xattr_cached; /* cached xattrs */ + uint64_t z_xattr_parent; /* parent obj for this xattr */ + uint64_t z_projid; /* project ID */ + list_node_t z_link_node; /* all znodes in fs link */ + sa_handle_t *z_sa_hdl; /* handle to sa data */ + + /* + * Platform specific field, defined by each platform and only + * accessible from platform specific code. + */ + ZNODE_OS_FIELDS; +} znode_t; + +typedef struct znode_hold { + uint64_t zh_obj; /* object id */ + kmutex_t zh_lock; /* lock serializing object access */ + avl_node_t zh_node; /* avl tree linkage */ + zfs_refcount_t zh_refcount; /* active consumer reference count */ +} znode_hold_t; + +static inline uint64_t +zfs_inherit_projid(znode_t *dzp) +{ + return ((dzp->z_pflags & ZFS_PROJINHERIT) ? dzp->z_projid : + ZFS_DEFAULT_PROJID); +} + +/* + * Timestamp defines + */ +#define ACCESSED (ATTR_ATIME) +#define STATE_CHANGED (ATTR_CTIME) +#define CONTENT_MODIFIED (ATTR_MTIME | ATTR_CTIME) + +extern int zfs_init_fs(zfsvfs_t *, znode_t **); +extern void zfs_set_dataprop(objset_t *); +extern void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *, + dmu_tx_t *tx); +extern void zfs_tstamp_update_setup(znode_t *, uint_t, uint64_t [2], + uint64_t [2]); +extern void zfs_grow_blocksize(znode_t *, uint64_t, dmu_tx_t *); +extern int zfs_freesp(znode_t *, uint64_t, uint64_t, int, boolean_t); +extern void zfs_znode_init(void); +extern void zfs_znode_fini(void); +extern int zfs_znode_hold_compare(const void *, const void *); +extern int zfs_zget(zfsvfs_t *, uint64_t, znode_t **); +extern int zfs_rezget(znode_t *); +extern void zfs_zinactive(znode_t *); +extern void zfs_znode_delete(znode_t *, dmu_tx_t *); +extern void zfs_remove_op_tables(void); +extern int zfs_create_op_tables(void); +extern dev_t zfs_cmpldev(uint64_t); +extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); +extern int zfs_get_stats(objset_t *os, nvlist_t *nv); +extern boolean_t zfs_get_vfs_flag_unmounted(objset_t *os); +extern void zfs_znode_dmu_fini(znode_t *); + +extern void zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *, zfs_fuid_info_t *, + vattr_t *vap); +extern int zfs_log_create_txtype(zil_create_t, vsecattr_t *vsecp, + vattr_t *vap); +extern void zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name, uint64_t foid, boolean_t unlinked); +#define ZFS_NO_OBJECT 0 /* no object id */ +extern void zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name); +extern void zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link); +extern void zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp); +extern void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t len, int ioflag, + zil_callback_t callback, void *callback_data); +extern void 
zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len); +extern void zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp); +extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); +extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx); +extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); + +#endif +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_FS_ZFS_ZNODE_H */ diff --git a/include/sys/zil.h b/include/sys/zil.h new file mode 100644 index 000000000000..7e61a13301c0 --- /dev/null +++ b/include/sys/zil.h @@ -0,0 +1,529 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_ZIL_H +#define _SYS_ZIL_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct dsl_pool; +struct dsl_dataset; +struct lwb; + +/* + * Intent log format: + * + * Each objset has its own intent log. The log header (zil_header_t) + * for objset N's intent log is kept in the Nth object of the SPA's + * intent_log objset. The log header points to a chain of log blocks, + * each of which contains log records (i.e., transactions) followed by + * a log block trailer (zil_trailer_t). The format of a log record + * depends on the record (or transaction) type, but all records begin + * with a common structure that defines the type, length, and txg. + */ + +/* + * Intent log header - this on disk structure holds fields to manage + * the log. All fields are 64 bit to easily handle cross architectures. + */ +typedef struct zil_header { + uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ + uint64_t zh_replay_seq; /* highest replayed sequence number */ + blkptr_t zh_log; /* log chain */ + uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ + uint64_t zh_flags; /* header flags */ + uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ + uint64_t zh_pad[3]; +} zil_header_t; + +/* + * zh_flags bit settings + */ +#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ + +/* + * Log block chaining. + * + * Log blocks are chained together. Originally they were chained at the + * end of the block. For performance reasons the chain was moved to the + * beginning of the block which allows writes for only the data being used. 
+ * The older position is supported for backwards compatibility.
+ *
+ * The zio_eck_t contains a zec_cksum which for the intent log is
+ * the sequence number of this log block. A seq of 0 is invalid.
+ * The zec_cksum is checked by the SPA against the sequence
+ * number passed in the blk_cksum field of the blkptr_t.
+ */
+typedef struct zil_chain {
+	uint64_t zc_pad;
+	blkptr_t zc_next_blk;	/* next block in chain */
+	uint64_t zc_nused;	/* bytes in log block used */
+	zio_eck_t zc_eck;	/* block trailer */
+} zil_chain_t;
+
+#define ZIL_MIN_BLKSZ 4096ULL
+
+/*
+ * ziltest is by and large an ugly hack, but very useful in
+ * checking replay without tedious work.
+ * When running ziltest we want to keep all itx's and so maintain
+ * a single list in the zl_itxg[] that uses a high txg: ZILTEST_TXG
+ * We subtract TXG_CONCURRENT_STATES to allow for common code.
+ */
+#define ZILTEST_TXG (UINT64_MAX - TXG_CONCURRENT_STATES)
+
+/*
+ * The words of a log block checksum.
+ */
+#define ZIL_ZC_GUID_0	0
+#define ZIL_ZC_GUID_1	1
+#define ZIL_ZC_OBJSET	2
+#define ZIL_ZC_SEQ	3
+
+typedef enum zil_create {
+	Z_FILE,
+	Z_DIR,
+	Z_XATTRDIR,
+} zil_create_t;
+
+/*
+ * Size of the xvattr log section. It's composed of an lr_attr_t, the
+ * xvattr bitmap, two 64-bit timestamps for creation time, a single
+ * 64-bit integer for all of the attributes, and four 64-bit integers
+ * (32 bytes) for the scanstamp.
+ */
+#define ZIL_XVAT_SIZE(mapsize) \
+	sizeof (lr_attr_t) + (sizeof (uint32_t) * (mapsize - 1)) + \
+	(sizeof (uint64_t) * 7)
+
+/*
+ * Size of ACL in log. The ACE data is padded out to properly align
+ * on 8 byte boundary.
+ */
+#define ZIL_ACE_LENGTH(x)	(roundup(x, sizeof (uint64_t)))
+
+/*
+ * Intent log transaction types and record structures
+ */
+#define TX_COMMIT		0	/* Commit marker (no on-disk state) */
+#define TX_CREATE		1	/* Create file */
+#define TX_MKDIR		2	/* Make directory */
+#define TX_MKXATTR		3	/* Make XATTR directory */
+#define TX_SYMLINK		4	/* Create symbolic link to a file */
+#define TX_REMOVE		5	/* Remove file */
+#define TX_RMDIR		6	/* Remove directory */
+#define TX_LINK			7	/* Create hard link to a file */
+#define TX_RENAME		8	/* Rename a file */
+#define TX_WRITE		9	/* File write */
+#define TX_TRUNCATE		10	/* Truncate a file */
+#define TX_SETATTR		11	/* Set file attributes */
+#define TX_ACL_V0		12	/* Set old formatted ACL */
+#define TX_ACL			13	/* Set ACL */
+#define TX_CREATE_ACL		14	/* create with ACL */
+#define TX_CREATE_ATTR		15	/* create + attrs */
+#define TX_CREATE_ACL_ATTR	16	/* create with ACL + attrs */
+#define TX_MKDIR_ACL		17	/* mkdir with ACL */
+#define TX_MKDIR_ATTR		18	/* mkdir with attr */
+#define TX_MKDIR_ACL_ATTR	19	/* mkdir with ACL + attrs */
+#define TX_WRITE2		20	/* dmu_sync EALREADY write */
+#define TX_MAX_TYPE		21	/* Max transaction type */
+
+/*
+ * The transactions for mkdir, symlink, remove, rmdir, link, and rename
+ * may have the following bit set, indicating the original request
+ * specified case-insensitive handling of names.
+ */
+#define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */
+
+/*
+ * Transactions for write, truncate, setattr, acl_v0, and acl can be logged
+ * out of order. For convenience in the code, all such records must have
+ * lr_foid at the same offset.
+ */
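
Because TX_CI rides in the top bit of lrc_txtype, replay code has to strip it before dispatching on the transaction type. A small standalone illustration (the record value is made up):

#include <stdint.h>
#include <stdio.h>

#define TX_CI		((uint64_t)0x1 << 63)	/* as defined above */
#define TX_RENAME	8

int
main(void)
{
	uint64_t lrc_txtype = TX_RENAME | TX_CI;	/* example record */
	uint64_t txtype = lrc_txtype & ~TX_CI;		/* strip the flag */

	printf("txtype %llu, case-insensitive: %d\n",
	    (unsigned long long)txtype, (lrc_txtype & TX_CI) != 0);
	return (0);
}
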
+#define TX_OOO(txtype) \
+	((txtype) == TX_WRITE || \
+	(txtype) == TX_TRUNCATE || \
+	(txtype) == TX_SETATTR || \
+	(txtype) == TX_ACL_V0 || \
+	(txtype) == TX_ACL || \
+	(txtype) == TX_WRITE2)
+
+/*
+ * The number of dnode slots consumed by the object is stored in the 8
+ * unused upper bits of the object ID. We subtract 1 from the value
+ * stored on disk for compatibility with implementations that don't
+ * support large dnodes. The slot count for a single-slot dnode will
+ * contain 0 for those bits to preserve the log record format for
+ * "small" dnodes.
+ */
+#define LR_FOID_GET_SLOTS(oid) (BF64_GET((oid), 56, 8) + 1)
+#define LR_FOID_SET_SLOTS(oid, x) BF64_SET((oid), 56, 8, (x) - 1)
+#define LR_FOID_GET_OBJ(oid) BF64_GET((oid), 0, DN_MAX_OBJECT_SHIFT)
+#define LR_FOID_SET_OBJ(oid, x) BF64_SET((oid), 0, DN_MAX_OBJECT_SHIFT, (x))
+
+/*
+ * Format of log records.
+ * The fields are carefully defined to allow them to be aligned
+ * and sized the same on sparc & intel architectures.
+ * Each log record has a common structure at the beginning.
+ *
+ * The log record on disk (lrc_seq) holds the sequence number of all log
+ * records, which is used to ensure we don't replay the same record.
+ */
+typedef struct {			/* common log record header */
+	uint64_t	lrc_txtype;	/* intent log transaction type */
+	uint64_t	lrc_reclen;	/* transaction record length */
+	uint64_t	lrc_txg;	/* dmu transaction group number */
+	uint64_t	lrc_seq;	/* see comment above */
+} lr_t;
+
+/*
+ * Common start of all out-of-order record types (TX_OOO() above).
+ */
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_foid;	/* object id */
+} lr_ooo_t;
+
+/*
+ * Handle optional extended vattr attributes.
+ *
+ * Whenever new attributes are added, the version number
+ * will need to be updated, as will the code in
+ * zfs_log.c and zfs_replay.c.
+ */
+typedef struct {
+	uint32_t	lr_attr_masksize; /* number of elements in array */
+	uint32_t	lr_attr_bitmap;	/* First entry of array */
+	/* remainder of array and any additional fields */
+} lr_attr_t;
+
+/*
+ * Log record for creates without optional ACL.
+ * This log record does support optional xvattr_t attributes.
+ */
+typedef struct {
+	lr_t		lr_common;	/* common portion of log record */
+	uint64_t	lr_doid;	/* object id of directory */
+	uint64_t	lr_foid;	/* object id of created file object */
+	uint64_t	lr_mode;	/* mode of object */
+	uint64_t	lr_uid;		/* uid of object */
+	uint64_t	lr_gid;		/* gid of object */
+	uint64_t	lr_gen;		/* generation (txg of creation) */
+	uint64_t	lr_crtime[2];	/* creation time */
+	uint64_t	lr_rdev;	/* rdev of object to create */
+	/* name of object to create follows this */
+	/* for symlinks, link content follows name */
+	/* for creates with xvattr data, the name follows the xvattr info */
+} lr_create_t;
+
+/*
+ * FUID ACL record will be an array of ACEs from the original ACL.
+ * If this array includes ephemeral IDs, the record will also include
+ * an array of log-specific FUIDs to replace the ephemeral IDs.
+ * Only one copy of each unique domain will be present, so the log-specific
+ * FUIDs will use an index into a compressed domain table. On replay this
+ * information will be used to construct real FUIDs (and bypass idmap,
+ * since it may not be available).
+ */
+
+/*
+ * Log record for creates with optional ACL
+ * This log record is also used for recording any FUID
+ * information needed for replaying the create. If the
+ * file doesn't have any actual ACEs then the lr_aclcnt
+ * would be zero.
+ * + * After lr_acl_flags, there are a lr_acl_bytes number of variable sized ace's. + * If create is also setting xvattr's, then acl data follows xvattr. + * If ACE FUIDs are needed then they will follow the xvattr_t. Following + * the FUIDs will be the domain table information. The FUIDs for the owner + * and group will be in lr_create. Name follows ACL data. + */ +typedef struct { + lr_create_t lr_create; /* common create portion */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ +} lr_acl_create_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + /* name of object to remove follows this */ +} lr_remove_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_doid; /* obj id of directory */ + uint64_t lr_link_obj; /* obj id of link */ + /* name of object to link follows this */ +} lr_link_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_sdoid; /* obj id of source directory */ + uint64_t lr_tdoid; /* obj id of target directory */ + /* 2 strings: names of source and destination follow this */ +} lr_rename_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to write */ + uint64_t lr_offset; /* offset to write to */ + uint64_t lr_length; /* user data length to write */ + uint64_t lr_blkoff; /* no longer used */ + blkptr_t lr_blkptr; /* spa block pointer for replay */ + /* write data will follow for small writes */ +} lr_write_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id of file to truncate */ + uint64_t lr_offset; /* offset to truncate from */ + uint64_t lr_length; /* length to truncate */ +} lr_truncate_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* file object to change attributes */ + uint64_t lr_mask; /* mask of attributes to set */ + uint64_t lr_mode; /* mode to set */ + uint64_t lr_uid; /* uid to set */ + uint64_t lr_gid; /* gid to set */ + uint64_t lr_size; /* size to set */ + uint64_t lr_atime[2]; /* access time */ + uint64_t lr_mtime[2]; /* modification time */ + /* optional attribute lr_attr_t may be here */ +} lr_setattr_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of acl entries */ + /* lr_aclcnt number of ace_t entries follow this */ +} lr_acl_v0_t; + +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* obj id of file */ + uint64_t lr_aclcnt; /* number of ACEs in ACL */ + uint64_t lr_domcnt; /* number of unique domains */ + uint64_t lr_fuidcnt; /* number of real fuids */ + uint64_t lr_acl_bytes; /* number of bytes in ACL */ + uint64_t lr_acl_flags; /* ACL flags */ + /* lr_acl_bytes number of variable sized ace's follows */ +} lr_acl_t; + +/* + * ZIL structure definitions, interface function prototype and globals. + */ + +/* + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. 
+ * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. + * There are a few requirements for this to occur: + * - write is greater than zfs/zvol_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (O_SYNC or O_DSYNC), then we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. + */ +typedef enum { + WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ + /* and put blkptr in log, rather than actual data) */ + WR_COPIED, /* immediate - data is copied into lr_write_t */ + WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ + WR_NUM_STATES /* number of states */ +} itx_wr_state_t; + +typedef void (*zil_callback_t)(void *data); + +typedef struct itx { + list_node_t itx_node; /* linkage on zl_itx_list */ + void *itx_private; /* type-specific opaque data */ + itx_wr_state_t itx_wr_state; /* write state */ + uint8_t itx_sync; /* synchronous transaction */ + zil_callback_t itx_callback; /* Called when the itx is persistent */ + void *itx_callback_data; /* User data for the callback */ + size_t itx_size; /* allocated itx structure size */ + uint64_t itx_oid; /* object id */ + lr_t itx_lr; /* common part of log record */ + /* followed by type-specific part of lr_xx_t and its immediate data */ +} itx_t; + +/* + * Used for zil kstat. + */ +typedef struct zil_stats { + /* + * Number of times a ZIL commit (e.g. fsync) has been requested. + */ + kstat_named_t zil_commit_count; + + /* + * Number of times the ZIL has been flushed to stable storage. + * This is less than zil_commit_count when commits are "merged" + * (see the documentation above zil_commit()). + */ + kstat_named_t zil_commit_writer_count; + + /* + * Number of transactions (reads, writes, renames, etc.) + * that have been committed. + */ + kstat_named_t zil_itx_count; + + /* + * See the documentation for itx_wr_state_t above. + * Note that "bytes" accumulates the length of the transactions + * (i.e. data), not the actual log record sizes. + */ + kstat_named_t zil_itx_indirect_count; + kstat_named_t zil_itx_indirect_bytes; + kstat_named_t zil_itx_copied_count; + kstat_named_t zil_itx_copied_bytes; + kstat_named_t zil_itx_needcopy_count; + kstat_named_t zil_itx_needcopy_bytes; + + /* + * Transactions which have been allocated to the "normal" + * (i.e. not slog) storage pool. Note that "bytes" accumulate + * the actual log record sizes - which do not include the actual + * data in case of indirect writes. + */ + kstat_named_t zil_itx_metaslab_normal_count; + kstat_named_t zil_itx_metaslab_normal_bytes; + + /* + * Transactions which have been allocated to the "slog" storage pool. + * If there are no separate log devices, this is the same as the + * "normal" pool. 
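+ *
+ * On Linux, all of these counters are exported through the "zil" kstat
+ * (typically /proc/spl/kstat/zfs/zil) and are maintained with the
+ * ZIL_STAT_BUMP()/ZIL_STAT_INCR() helpers defined below; e.g. each
+ * commit request does, roughly:
+ *
+ * ZIL_STAT_BUMP(zil_commit_count);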
+ */ + kstat_named_t zil_itx_metaslab_slog_count; + kstat_named_t zil_itx_metaslab_slog_bytes; +} zil_stats_t; + +extern zil_stats_t zil_stats; + +#define ZIL_STAT_INCR(stat, val) \ + atomic_add_64(&zil_stats.stat.value.ui64, (val)); +#define ZIL_STAT_BUMP(stat) \ + ZIL_STAT_INCR(stat, 1); + +typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, + uint64_t txg); +typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, + uint64_t txg); +typedef int zil_replay_func_t(void *arg1, void *arg2, boolean_t byteswap); +typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, + struct lwb *lwb, zio_t *zio); + +extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, + boolean_t decrypt); + +extern void zil_init(void); +extern void zil_fini(void); + +extern zilog_t *zil_alloc(objset_t *os, zil_header_t *zh_phys); +extern void zil_free(zilog_t *zilog); + +extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); +extern void zil_close(zilog_t *zilog); + +extern void zil_replay(objset_t *os, void *arg, + zil_replay_func_t *replay_func[TX_MAX_TYPE]); +extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); +extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); +extern void zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx); + +extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); +extern void zil_itx_destroy(itx_t *itx); +extern void zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); + +extern void zil_async_to_sync(zilog_t *zilog, uint64_t oid); +extern void zil_commit(zilog_t *zilog, uint64_t oid); +extern void zil_commit_impl(zilog_t *zilog, uint64_t oid); +extern void zil_remove_async(zilog_t *zilog, uint64_t oid); + +extern int zil_reset(const char *osname, void *txarg); +extern int zil_claim(struct dsl_pool *dp, + struct dsl_dataset *ds, void *txarg); +extern int zil_check_log_chain(struct dsl_pool *dp, + struct dsl_dataset *ds, void *tx); +extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); +extern void zil_clean(zilog_t *zilog, uint64_t synced_txg); + +extern int zil_suspend(const char *osname, void **cookiep); +extern void zil_resume(void *cookie); + +extern void zil_lwb_add_block(struct lwb *lwb, const blkptr_t *bp); +extern void zil_lwb_add_txg(struct lwb *lwb, uint64_t txg); +extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); + +extern void zil_set_sync(zilog_t *zilog, uint64_t syncval); + +extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); + +extern uint64_t zil_max_copied_data(zilog_t *zilog); +extern uint64_t zil_max_log_data(zilog_t *zilog); + +extern int zil_replay_disable; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_H */ diff --git a/include/sys/zil_impl.h b/include/sys/zil_impl.h new file mode 100644 index 000000000000..d2f4018653a6 --- /dev/null +++ b/include/sys/zil_impl.h @@ -0,0 +1,230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#ifndef _SYS_ZIL_IMPL_H +#define _SYS_ZIL_IMPL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Possible states for a given lwb structure. + * + * An lwb will start out in the "closed" state, and then transition to + * the "opened" state via a call to zil_lwb_write_open(). When + * transitioning from "closed" to "opened" the zilog's "zl_issuer_lock" + * must be held. + * + * After the lwb is "opened", it can transition into the "issued" state + * via zil_lwb_write_issue(). Again, the zilog's "zl_issuer_lock" must + * be held when making this transition. + * + * After the lwb's write zio completes, it transitions into the "write + * done" state via zil_lwb_write_done(); and then into the "flush done" + * state via zil_lwb_flush_vdevs_done(). When transitioning from + * "issued" to "write done", and then from "write done" to "flush done", + * the zilog's "zl_lock" must be held, *not* the "zl_issuer_lock". + * + * The zilog's "zl_issuer_lock" can become heavily contended in certain + * workloads, so we specifically avoid acquiring that lock when + * transitioning an lwb from "issued" to "done". This allows us to avoid + * having to acquire the "zl_issuer_lock" for each lwb ZIO completion, + * which would have added more lock contention on an already heavily + * contended lock. + * + * Additionally, correctness when reading an lwb's state is often + * achieved by exploiting the fact that these state transitions occur in + * this specific order; i.e. "closed" to "opened" to "issued" to "done". + * + * Thus, if an lwb is in the "closed" or "opened" state, holding the + * "zl_issuer_lock" will prevent a concurrent thread from transitioning + * that lwb to the "issued" state. Likewise, if an lwb is already in the + * "issued" state, holding the "zl_lock" will prevent a concurrent + * thread from transitioning that lwb to the "write done" state. + */ +typedef enum { + LWB_STATE_CLOSED, + LWB_STATE_OPENED, + LWB_STATE_ISSUED, + LWB_STATE_WRITE_DONE, + LWB_STATE_FLUSH_DONE, + LWB_NUM_STATES +} lwb_state_t; + +/* + * Log write block (lwb) + * + * Prior to an lwb being issued to disk via zil_lwb_write_issue(), it + * will be protected by the zilog's "zl_issuer_lock". Basically, prior + * to it being issued, it will only be accessed by the thread that's + * holding the "zl_issuer_lock". After the lwb is issued, the zilog's + * "zl_lock" is used to protect the lwb against concurrent access. + */ +typedef struct lwb { + zilog_t *lwb_zilog; /* back pointer to log struct */ + blkptr_t lwb_blk; /* on disk address of this log blk */ + boolean_t lwb_fastwrite; /* is blk marked for fastwrite? 
 */
+ boolean_t lwb_slog; /* lwb_blk is on SLOG device */
+ int lwb_nused; /* # used bytes in buffer */
+ int lwb_sz; /* size of block and buffer */
+ lwb_state_t lwb_state; /* the state of this lwb */
+ char *lwb_buf; /* log write buffer */
+ zio_t *lwb_write_zio; /* zio for the lwb buffer */
+ zio_t *lwb_root_zio; /* root zio for lwb write and flushes */
+ dmu_tx_t *lwb_tx; /* tx for log block allocation */
+ uint64_t lwb_max_txg; /* highest txg in this lwb */
+ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */
+ list_t lwb_itxs; /* list of itx's */
+ list_t lwb_waiters; /* list of zil_commit_waiter's */
+ avl_tree_t lwb_vdev_tree; /* vdevs to flush after lwb write */
+ kmutex_t lwb_vdev_lock; /* protects lwb_vdev_tree */
+ hrtime_t lwb_issued_timestamp; /* when was the lwb issued? */
+} lwb_t;
+
+/*
+ * ZIL commit waiter.
+ *
+ * This structure is allocated each time zil_commit() is called, and is
+ * used by zil_commit() to communicate with other parts of the ZIL, such
+ * that zil_commit() can know when it is safe for it to return. For more
+ * details, see the comment above zil_commit().
+ *
+ * The "zcw_lock" field is used to protect the commit waiter against
+ * concurrent access. This lock is often acquired while already holding
+ * the zilog's "zl_issuer_lock" or "zl_lock"; see the functions
+ * zil_process_commit_list() and zil_lwb_flush_vdevs_done() as examples
+ * of this. Thus, one must be careful not to acquire the
+ * "zl_issuer_lock" or "zl_lock" when already holding the "zcw_lock";
+ * e.g. see the zil_commit_waiter_timeout() function.
+ */
+typedef struct zil_commit_waiter {
+ kcondvar_t zcw_cv; /* signalled when "done" */
+ kmutex_t zcw_lock; /* protects fields of this struct */
+ list_node_t zcw_node; /* linkage in lwb_t:lwb_waiter list */
+ lwb_t *zcw_lwb; /* back pointer to lwb when linked */
+ boolean_t zcw_done; /* B_TRUE when "done", else B_FALSE */
+ int zcw_zio_error; /* contains the zio io_error value */
+} zil_commit_waiter_t;
+
+/*
+ * Intent log transaction lists
+ */
+typedef struct itxs {
+ list_t i_sync_list; /* list of synchronous itxs */
+ avl_tree_t i_async_tree; /* tree of foids for async itxs */
+} itxs_t;
+
+typedef struct itxg {
+ kmutex_t itxg_lock; /* lock for this structure */
+ uint64_t itxg_txg; /* txg for this chain */
+ itxs_t *itxg_itxs; /* sync and async itxs */
+} itxg_t;
+
+/* for async nodes we build up an AVL tree of lists of async itxs per file */
+typedef struct itx_async_node {
+ uint64_t ia_foid; /* file object id */
+ list_t ia_list; /* list of async itxs for this foid */
+ avl_node_t ia_node; /* AVL tree linkage */
+} itx_async_node_t;
+
+/*
+ * Vdev flushing: during a zil_commit(), we build up an AVL tree of the vdevs
+ * we've touched so we know which ones need a write cache flush at the end.
+ */
+typedef struct zil_vdev_node {
+ uint64_t zv_vdev; /* vdev to be flushed */
+ avl_node_t zv_node; /* AVL tree linkage */
+} zil_vdev_node_t;
+
+#define ZIL_PREV_BLKS 16
+
+/*
+ * Stable storage intent log management structure. One per dataset.
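+ *
+ * Locking (a summary; see the lwb_state_t comment above for details):
+ * zl_issuer_lock admits a single issuer at a time and covers lwb
+ * transitions from "closed" to "opened" to "issued", while zl_lock
+ * protects most of the remaining fields and the later lwb transitions.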
+ */ +struct zilog { + kmutex_t zl_lock; /* protects most zilog_t fields */ + struct dsl_pool *zl_dmu_pool; /* DSL pool */ + spa_t *zl_spa; /* handle for read/write log */ + const zil_header_t *zl_header; /* log header buffer */ + objset_t *zl_os; /* object set we're logging */ + zil_get_data_t *zl_get_data; /* callback to get object content */ + lwb_t *zl_last_lwb_opened; /* most recent lwb opened */ + hrtime_t zl_last_lwb_latency; /* zio latency of last lwb done */ + uint64_t zl_lr_seq; /* on-disk log record sequence number */ + uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ + uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ + uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ + uint64_t zl_replaying_seq; /* current replay seq number */ + uint32_t zl_suspend; /* log suspend count */ + kcondvar_t zl_cv_suspend; /* log suspend completion */ + uint8_t zl_suspending; /* log is currently suspending */ + uint8_t zl_keep_first; /* keep first log block in destroy */ + uint8_t zl_replay; /* replaying records while set */ + uint8_t zl_stop_sync; /* for debugging */ + kmutex_t zl_issuer_lock; /* single writer, per ZIL, at a time */ + uint8_t zl_logbias; /* latency or throughput */ + uint8_t zl_sync; /* synchronous or asynchronous */ + int zl_parse_error; /* last zil_parse() error */ + uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ + uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ + uint64_t zl_parse_blk_count; /* number of blocks parsed */ + uint64_t zl_parse_lr_count; /* number of log records parsed */ + itxg_t zl_itxg[TXG_SIZE]; /* intent log txg chains */ + list_t zl_itx_commit_list; /* itx list to be committed */ + uint64_t zl_cur_used; /* current commit log size used */ + list_t zl_lwb_list; /* in-flight log write list */ + avl_tree_t zl_bp_tree; /* track bps during log parse */ + clock_t zl_replay_time; /* lbolt of when replay started */ + uint64_t zl_replay_blks; /* number of log blocks replayed */ + zil_header_t zl_old_header; /* debugging aid */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_rotor; /* rotor for zl_prev[] */ + txg_node_t zl_dirty_link; /* protected by dp_dirty_zilogs list */ + uint64_t zl_dirty_max_txg; /* highest txg used to dirty zilog */ + /* + * Max block size for this ZIL. Note that this can not be changed + * while the ZIL is in use because consumers (ZPL/zvol) need to take + * this into account when deciding between WR_COPIED and WR_NEED_COPY + * (see zil_max_copied_data()). + */ + uint64_t zl_max_block_size; +}; + +typedef struct zil_bp_node { + dva_t zn_dva; + avl_node_t zn_node; +} zil_bp_node_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIL_IMPL_H */ diff --git a/include/sys/zio.h b/include/sys/zio.h new file mode 100644 index 000000000000..f3b5a120793f --- /dev/null +++ b/include/sys/zio.h @@ -0,0 +1,710 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright 2016 Toomas Soome + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019-2020, Michael Niewöhner + */ + +#ifndef _ZIO_H +#define _ZIO_H + +#include +#include +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Embedded checksum + */ +#define ZEC_MAGIC 0x210da7ab10c7a11ULL + +typedef struct zio_eck { + uint64_t zec_magic; /* for validation, endianness */ + zio_cksum_t zec_cksum; /* 256-bit checksum */ +} zio_eck_t; + +/* + * Gang block headers are self-checksumming and contain an array + * of block pointers. + */ +#define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE +#define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_eck_t)) / sizeof (blkptr_t)) +#define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ + sizeof (zio_eck_t) - \ + (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ + sizeof (uint64_t)) + +typedef struct zio_gbh { + blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; + uint64_t zg_filler[SPA_GBH_FILLER]; + zio_eck_t zg_tail; +} zio_gbh_phys_t; + +enum zio_checksum { + ZIO_CHECKSUM_INHERIT = 0, + ZIO_CHECKSUM_ON, + ZIO_CHECKSUM_OFF, + ZIO_CHECKSUM_LABEL, + ZIO_CHECKSUM_GANG_HEADER, + ZIO_CHECKSUM_ZILOG, + ZIO_CHECKSUM_FLETCHER_2, + ZIO_CHECKSUM_FLETCHER_4, + ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, + ZIO_CHECKSUM_NOPARITY, + ZIO_CHECKSUM_SHA512, + ZIO_CHECKSUM_SKEIN, +#if !defined(__FreeBSD__) + ZIO_CHECKSUM_EDONR, +#endif + ZIO_CHECKSUM_FUNCTIONS +}; + +/* + * The number of "legacy" compression functions which can be set on individual + * objects. + */ +#define ZIO_CHECKSUM_LEGACY_FUNCTIONS ZIO_CHECKSUM_ZILOG2 + +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 +#define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON + +#define ZIO_CHECKSUM_MASK 0xffULL +#define ZIO_CHECKSUM_VERIFY (1 << 8) + +#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 + +/* supported encryption algorithms */ +enum zio_encrypt { + ZIO_CRYPT_INHERIT = 0, + ZIO_CRYPT_ON, + ZIO_CRYPT_OFF, + ZIO_CRYPT_AES_128_CCM, + ZIO_CRYPT_AES_192_CCM, + ZIO_CRYPT_AES_256_CCM, + ZIO_CRYPT_AES_128_GCM, + ZIO_CRYPT_AES_192_GCM, + ZIO_CRYPT_AES_256_GCM, + ZIO_CRYPT_FUNCTIONS +}; + +#define ZIO_CRYPT_ON_VALUE ZIO_CRYPT_AES_256_GCM +#define ZIO_CRYPT_DEFAULT ZIO_CRYPT_OFF + +/* macros defining encryption lengths */ +#define ZIO_OBJSET_MAC_LEN 32 +#define ZIO_DATA_IV_LEN 12 +#define ZIO_DATA_SALT_LEN 8 +#define ZIO_DATA_MAC_LEN 16 + +/* + * The number of "legacy" compression functions which can be set on individual + * objects. + */ +#define ZIO_COMPRESS_LEGACY_FUNCTIONS ZIO_COMPRESS_LZ4 + +/* + * The meaning of "compress = on" selected by the compression features enabled + * on a given pool. 
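+ * When the pool's lz4_compress feature is active, "on" resolves to the
+ * lz4 value below; otherwise it falls back to the legacy lzjb value.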
+ */ +#define ZIO_COMPRESS_LEGACY_ON_VALUE ZIO_COMPRESS_LZJB +#define ZIO_COMPRESS_LZ4_ON_VALUE ZIO_COMPRESS_LZ4 + +#define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF + +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + (compress) == ZIO_COMPRESS_LZ4 || \ + (compress) == ZIO_COMPRESS_GZIP_1 || \ + (compress) == ZIO_COMPRESS_GZIP_2 || \ + (compress) == ZIO_COMPRESS_GZIP_3 || \ + (compress) == ZIO_COMPRESS_GZIP_4 || \ + (compress) == ZIO_COMPRESS_GZIP_5 || \ + (compress) == ZIO_COMPRESS_GZIP_6 || \ + (compress) == ZIO_COMPRESS_GZIP_7 || \ + (compress) == ZIO_COMPRESS_GZIP_8 || \ + (compress) == ZIO_COMPRESS_GZIP_9 || \ + (compress) == ZIO_COMPRESS_ZLE || \ + (compress) == ZIO_COMPRESS_ZSTD || \ + (compress) == ZIO_COMPRESS_ON || \ + (compress) == ZIO_COMPRESS_OFF) + + +#define ZIO_COMPRESS_ALGO(x) (x & SPA_COMPRESSMASK) +#define ZIO_COMPRESS_LEVEL(x) ((x & ~SPA_COMPRESSMASK) >> SPA_COMPRESSBITS) +#define ZIO_COMPRESS_RAW(type, level) (type | ((level) << SPA_COMPRESSBITS)) + +#define ZIO_COMPLEVEL_ZSTD(level) \ + ZIO_COMPRESS_RAW(ZIO_COMPRESS_ZSTD, level) + +#define ZIO_FAILURE_MODE_WAIT 0 +#define ZIO_FAILURE_MODE_CONTINUE 1 +#define ZIO_FAILURE_MODE_PANIC 2 + +typedef enum zio_suspend_reason { + ZIO_SUSPEND_NONE = 0, + ZIO_SUSPEND_IOERR, + ZIO_SUSPEND_MMP, +} zio_suspend_reason_t; + +enum zio_flag { + /* + * Flags inherited by gang, ddt, and vdev children, + * and that must be equal for two zios to aggregate + */ + ZIO_FLAG_DONT_AGGREGATE = 1 << 0, + ZIO_FLAG_IO_REPAIR = 1 << 1, + ZIO_FLAG_SELF_HEAL = 1 << 2, + ZIO_FLAG_RESILVER = 1 << 3, + ZIO_FLAG_SCRUB = 1 << 4, + ZIO_FLAG_SCAN_THREAD = 1 << 5, + ZIO_FLAG_PHYSICAL = 1 << 6, + +#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) + + /* + * Flags inherited by ddt, gang, and vdev children. + */ + ZIO_FLAG_CANFAIL = 1 << 7, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 8, + ZIO_FLAG_CONFIG_WRITER = 1 << 9, + ZIO_FLAG_DONT_RETRY = 1 << 10, + ZIO_FLAG_DONT_CACHE = 1 << 11, + ZIO_FLAG_NODATA = 1 << 12, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 13, + ZIO_FLAG_IO_ALLOCATING = 1 << 14, + +#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) +#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) + + /* + * Flags inherited by vdev children. + */ + ZIO_FLAG_IO_RETRY = 1 << 15, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 16, + ZIO_FLAG_TRYHARD = 1 << 17, + ZIO_FLAG_OPTIONAL = 1 << 18, + +#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) + + /* + * Flags not inherited by any children. 
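+ *
+ * As with the groups above, the first flag of this group doubles as the
+ * boundary of the inheritance masks: because every flag is a single bit,
+ * e.g. ZIO_FLAG_VDEV_INHERIT = ZIO_FLAG_DONT_QUEUE - 1 is a mask of all
+ * the flags defined before ZIO_FLAG_DONT_QUEUE.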
+ */
+ ZIO_FLAG_DONT_QUEUE = 1 << 19, /* must be first for INHERIT */
+ ZIO_FLAG_DONT_PROPAGATE = 1 << 20,
+ ZIO_FLAG_IO_BYPASS = 1 << 21,
+ ZIO_FLAG_IO_REWRITE = 1 << 22,
+ ZIO_FLAG_RAW_COMPRESS = 1 << 23,
+ ZIO_FLAG_RAW_ENCRYPT = 1 << 24,
+ ZIO_FLAG_GANG_CHILD = 1 << 25,
+ ZIO_FLAG_DDT_CHILD = 1 << 26,
+ ZIO_FLAG_GODFATHER = 1 << 27,
+ ZIO_FLAG_NOPWRITE = 1 << 28,
+ ZIO_FLAG_REEXECUTED = 1 << 29,
+ ZIO_FLAG_DELEGATED = 1 << 30,
+ ZIO_FLAG_FASTWRITE = 1 << 31,
+};
+
+#define ZIO_FLAG_MUSTSUCCEED 0
+#define ZIO_FLAG_RAW (ZIO_FLAG_RAW_COMPRESS | ZIO_FLAG_RAW_ENCRYPT)
+
+#define ZIO_DDT_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \
+ ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL)
+
+#define ZIO_GANG_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \
+ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL)
+
+#define ZIO_VDEV_CHILD_FLAGS(zio) \
+ (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \
+ ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_CANFAIL)
+
+#define ZIO_CHILD_BIT(x) (1 << (x))
+#define ZIO_CHILD_BIT_IS_SET(val, x) ((val) & (1 << (x)))
+
+enum zio_child {
+ ZIO_CHILD_VDEV = 0,
+ ZIO_CHILD_GANG,
+ ZIO_CHILD_DDT,
+ ZIO_CHILD_LOGICAL,
+ ZIO_CHILD_TYPES
+};
+
+#define ZIO_CHILD_VDEV_BIT ZIO_CHILD_BIT(ZIO_CHILD_VDEV)
+#define ZIO_CHILD_GANG_BIT ZIO_CHILD_BIT(ZIO_CHILD_GANG)
+#define ZIO_CHILD_DDT_BIT ZIO_CHILD_BIT(ZIO_CHILD_DDT)
+#define ZIO_CHILD_LOGICAL_BIT ZIO_CHILD_BIT(ZIO_CHILD_LOGICAL)
+#define ZIO_CHILD_ALL_BITS \
+ (ZIO_CHILD_VDEV_BIT | ZIO_CHILD_GANG_BIT | \
+ ZIO_CHILD_DDT_BIT | ZIO_CHILD_LOGICAL_BIT)
+
+enum zio_wait_type {
+ ZIO_WAIT_READY = 0,
+ ZIO_WAIT_DONE,
+ ZIO_WAIT_TYPES
+};
+
+typedef void zio_done_func_t(zio_t *zio);
+
+extern int zio_exclude_metadata;
+extern int zio_dva_throttle_enabled;
+extern const char *zio_type_name[ZIO_TYPES];
+
+/*
+ * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely
+ * identifies any block in the pool. By convention, the meta-objset (MOS)
+ * is objset 0, and the meta-dnode is object 0. This covers all blocks
+ * except root blocks and ZIL blocks, which are defined as follows:
+ *
+ * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>.
+ * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>.
+ * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, 0, blkid>.
+ * dnode visit bookmarks are <objset, object, -3, slot id>.
+ *
+ * Note: this structure is called a bookmark because its original purpose
+ * was to remember where to resume a pool-wide traverse.
+ *
+ * Note: this structure is passed between userland and the kernel, and is
+ * stored on disk (by virtue of being incorporated into other on-disk
+ * structures, e.g. dsl_scan_phys_t).
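+ *
+ * For example (illustrative), the bookmark of a ZIL block in objset 5
+ * would be filled in with the SET_BOOKMARK() macro below as:
+ *
+ * zbookmark_phys_t zb;
+ * SET_BOOKMARK(&zb, 5, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, seq);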
+ */
+struct zbookmark_phys {
+ uint64_t zb_objset;
+ uint64_t zb_object;
+ int64_t zb_level;
+ uint64_t zb_blkid;
+};
+
+#define SET_BOOKMARK(zb, objset, object, level, blkid) \
+{ \
+ (zb)->zb_objset = objset; \
+ (zb)->zb_object = object; \
+ (zb)->zb_level = level; \
+ (zb)->zb_blkid = blkid; \
+}
+
+#define ZB_DESTROYED_OBJSET (-1ULL)
+
+#define ZB_ROOT_OBJECT (0ULL)
+#define ZB_ROOT_LEVEL (-1LL)
+#define ZB_ROOT_BLKID (0ULL)
+
+#define ZB_ZIL_OBJECT (0ULL)
+#define ZB_ZIL_LEVEL (-2LL)
+
+#define ZB_DNODE_LEVEL (-3LL)
+#define ZB_DNODE_BLKID (0ULL)
+
+#define ZB_IS_ZERO(zb) \
+ ((zb)->zb_objset == 0 && (zb)->zb_object == 0 && \
+ (zb)->zb_level == 0 && (zb)->zb_blkid == 0)
+#define ZB_IS_ROOT(zb) \
+ ((zb)->zb_object == ZB_ROOT_OBJECT && \
+ (zb)->zb_level == ZB_ROOT_LEVEL && \
+ (zb)->zb_blkid == ZB_ROOT_BLKID)
+
+typedef struct zio_prop {
+ enum zio_checksum zp_checksum;
+ enum zio_compress zp_compress;
+ uint8_t zp_complevel;
+ dmu_object_type_t zp_type;
+ uint8_t zp_level;
+ uint8_t zp_copies;
+ boolean_t zp_dedup;
+ boolean_t zp_dedup_verify;
+ boolean_t zp_nopwrite;
+ boolean_t zp_encrypt;
+ boolean_t zp_byteorder;
+ uint8_t zp_salt[ZIO_DATA_SALT_LEN];
+ uint8_t zp_iv[ZIO_DATA_IV_LEN];
+ uint8_t zp_mac[ZIO_DATA_MAC_LEN];
+ uint32_t zp_zpl_smallblk;
+} zio_prop_t;
+
+typedef struct zio_cksum_report zio_cksum_report_t;
+
+typedef void zio_cksum_finish_f(zio_cksum_report_t *rep,
+ const abd_t *good_data);
+typedef void zio_cksum_free_f(void *cbdata, size_t size);
+
+struct zio_bad_cksum; /* defined in zio_checksum.h */
+struct dnode_phys;
+struct abd;
+
+struct zio_cksum_report {
+ struct zio_cksum_report *zcr_next;
+ nvlist_t *zcr_ereport;
+ nvlist_t *zcr_detector;
+ void *zcr_cbdata;
+ size_t zcr_cbinfo; /* passed to zcr_free() */
+ uint64_t zcr_align;
+ uint64_t zcr_length;
+ zio_cksum_finish_f *zcr_finish;
+ zio_cksum_free_f *zcr_free;
+
+ /* internal use only */
+ struct zio_bad_cksum *zcr_ckinfo; /* information from failure */
+};
+
+typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr,
+ void *arg);
+
+zio_vsd_cksum_report_f zio_vsd_default_cksum_report;
+
+typedef struct zio_vsd_ops {
+ zio_done_func_t *vsd_free;
+ zio_vsd_cksum_report_f *vsd_cksum_report;
+} zio_vsd_ops_t;
+
+typedef struct zio_gang_node {
+ zio_gbh_phys_t *gn_gbh;
+ struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS];
+} zio_gang_node_t;
+
+typedef zio_t *zio_gang_issue_func_t(zio_t *zio, blkptr_t *bp,
+ zio_gang_node_t *gn, struct abd *data, uint64_t offset);
+
+typedef void zio_transform_func_t(zio_t *zio, struct abd *data, uint64_t size);
+
+typedef struct zio_transform {
+ struct abd *zt_orig_abd;
+ uint64_t zt_orig_size;
+ uint64_t zt_bufsize;
+ zio_transform_func_t *zt_transform;
+ struct zio_transform *zt_next;
+} zio_transform_t;
+
+typedef zio_t *zio_pipe_stage_t(zio_t *zio);
+
+/*
+ * The io_reexecute flags are distinct from io_flags because the child must
+ * be able to propagate them to the parent. The normal io_flags are local
+ * to the zio, not protected by any lock, and not modifiable by children;
+ * the reexecute flags are protected by io_lock, modifiable by children,
+ * and always propagated -- even when ZIO_FLAG_DONT_PROPAGATE is set.
+ */
+#define ZIO_REEXECUTE_NOW 0x01
+#define ZIO_REEXECUTE_SUSPEND 0x02
+
+/*
+ * The io_trim flags are used to specify the type of TRIM to perform. They
+ * apply only to ZIO_TYPE_TRIM zios and are distinct from io_flags.
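+ * For example, a trim zio issued with ZIO_TRIM_SECURE below requests a
+ * secure erase of the range (cf. "zpool trim -d").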
+ */ +enum trim_flag { + ZIO_TRIM_SECURE = 1 << 0, +}; + +typedef struct zio_alloc_list { + list_t zal_list; + uint64_t zal_size; +} zio_alloc_list_t; + +typedef struct zio_link { + zio_t *zl_parent; + zio_t *zl_child; + list_node_t zl_parent_node; + list_node_t zl_child_node; +} zio_link_t; + +struct zio { + /* Core information about this I/O */ + zbookmark_phys_t io_bookmark; + zio_prop_t io_prop; + zio_type_t io_type; + enum zio_child io_child_type; + enum trim_flag io_trim_flags; + int io_cmd; + zio_priority_t io_priority; + uint8_t io_reexecute; + uint8_t io_state[ZIO_WAIT_TYPES]; + uint64_t io_txg; + spa_t *io_spa; + blkptr_t *io_bp; + blkptr_t *io_bp_override; + blkptr_t io_bp_copy; + list_t io_parent_list; + list_t io_child_list; + zio_t *io_logical; + zio_transform_t *io_transform_stack; + + /* Callback info */ + zio_done_func_t *io_ready; + zio_done_func_t *io_children_ready; + zio_done_func_t *io_physdone; + zio_done_func_t *io_done; + void *io_private; + int64_t io_prev_space_delta; /* DMU private */ + blkptr_t io_bp_orig; + /* io_lsize != io_orig_size iff this is a raw write */ + uint64_t io_lsize; + + /* Data represented by this I/O */ + struct abd *io_abd; + struct abd *io_orig_abd; + uint64_t io_size; + uint64_t io_orig_size; + + /* Stuff for the vdev stack */ + vdev_t *io_vd; + void *io_vsd; + const zio_vsd_ops_t *io_vsd_ops; + metaslab_class_t *io_metaslab_class; /* dva throttle class */ + + uint64_t io_offset; + hrtime_t io_timestamp; /* submitted at */ + hrtime_t io_queued_timestamp; + hrtime_t io_target_timestamp; + hrtime_t io_delta; /* vdev queue service delta */ + hrtime_t io_delay; /* Device access time (disk or */ + /* file). */ + avl_node_t io_queue_node; + avl_node_t io_offset_node; + avl_node_t io_alloc_node; + zio_alloc_list_t io_alloc_list; + + /* Internal pipeline state */ + enum zio_flag io_flags; + enum zio_stage io_stage; + enum zio_stage io_pipeline; + enum zio_flag io_orig_flags; + enum zio_stage io_orig_stage; + enum zio_stage io_orig_pipeline; + enum zio_stage io_pipeline_trace; + int io_error; + int io_child_error[ZIO_CHILD_TYPES]; + uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_phys_children; + uint64_t io_parent_count; + uint64_t *io_stall; + zio_t *io_gang_leader; + zio_gang_node_t *io_gang_tree; + void *io_executor; + void *io_waiter; + void *io_bio; + kmutex_t io_lock; + kcondvar_t io_cv; + int io_allocator; + + /* FMA state */ + zio_cksum_report_t *io_cksum_report; + uint64_t io_ena; + + /* Taskq dispatching state */ + taskq_ent_t io_tqent; +}; + +enum blk_verify_flag { + BLK_VERIFY_ONLY, + BLK_VERIFY_LOG, + BLK_VERIFY_HALT +}; + +extern int zio_bookmark_compare(const void *, const void *); + +extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_root(spa_t *spa, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + struct abd *data, uint64_t lsize, zio_done_func_t *done, void *priv, + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb); + +extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + struct abd *data, uint64_t size, uint64_t psize, const zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *children_ready, + zio_done_func_t *physdone, zio_done_func_t *done, + void *priv, zio_priority_t priority, enum zio_flag flags, + const zbookmark_phys_t *zb); + +extern zio_t 
*zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + struct abd *data, uint64_t size, zio_done_func_t *done, void *priv, + zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb); + +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies, + boolean_t nopwrite); + +extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); + +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *priv, enum zio_flag flags); + +extern zio_t *zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + zio_done_func_t *done, void *priv, zio_priority_t priority, + enum zio_flag flags, enum trim_flag trim_flags); + +extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, struct abd *data, int checksum, + zio_done_func_t *done, void *priv, zio_priority_t priority, + enum zio_flag flags, boolean_t labels); + +extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, + uint64_t size, struct abd *data, int checksum, + zio_done_func_t *done, void *priv, zio_priority_t priority, + enum zio_flag flags, boolean_t labels); + +extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, enum zio_flag flags); + +extern int zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, + blkptr_t *new_bp, uint64_t size, boolean_t *slog); +extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_shrink(zio_t *zio, uint64_t size); + +extern int zio_wait(zio_t *zio); +extern void zio_nowait(zio_t *zio); +extern void zio_execute(zio_t *zio); +extern void zio_interrupt(zio_t *zio); +extern void zio_delay_init(zio_t *zio); +extern void zio_delay_interrupt(zio_t *zio); +extern void zio_deadman(zio_t *zio, char *tag); + +extern zio_t *zio_walk_parents(zio_t *cio, zio_link_t **); +extern zio_t *zio_walk_children(zio_t *pio, zio_link_t **); +extern zio_t *zio_unique_parent(zio_t *cio); +extern void zio_add_child(zio_t *pio, zio_t *cio); + +extern void *zio_buf_alloc(size_t size); +extern void zio_buf_free(void *buf, size_t size); +extern void *zio_data_buf_alloc(size_t size); +extern void zio_data_buf_free(void *buf, size_t size); + +extern void zio_push_transform(zio_t *zio, struct abd *abd, uint64_t size, + uint64_t bufsize, zio_transform_func_t *transform); +extern void zio_pop_transforms(zio_t *zio); + +extern void zio_resubmit_stage_async(void *); + +extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, + uint64_t offset, struct abd *data, uint64_t size, int type, + zio_priority_t priority, enum zio_flag flags, + zio_done_func_t *done, void *priv); + +extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, + struct abd *data, uint64_t size, zio_type_t type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *priv); + +extern void zio_vdev_io_bypass(zio_t *zio); +extern void zio_vdev_io_reissue(zio_t *zio); +extern void zio_vdev_io_redone(zio_t *zio); + +extern void zio_change_priority(zio_t *pio, zio_priority_t priority); + +extern void zio_checksum_verified(zio_t *zio); +extern int zio_worst_error(int e1, int e2); + +extern enum zio_checksum zio_checksum_select(enum zio_checksum child, + enum zio_checksum parent); +extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, + enum zio_checksum child, enum zio_checksum parent); +extern enum zio_compress 
zio_compress_select(spa_t *spa, + enum zio_compress child, enum zio_compress parent); +extern uint8_t zio_complevel_select(spa_t *spa, enum zio_compress compress, + uint8_t child, uint8_t parent); + +extern void zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t); +extern int zio_resume(spa_t *spa); +extern void zio_resume_wait(spa_t *spa); + +extern boolean_t zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, + boolean_t config_held, enum blk_verify_flag blk_verify); + +/* + * Initial setup and teardown. + */ +extern void zio_init(void); +extern void zio_fini(void); + +/* + * Fault injection + */ +struct zinject_record; +extern uint32_t zio_injection_enabled; +extern int zio_inject_fault(char *name, int flags, int *id, + struct zinject_record *record); +extern int zio_inject_list_next(int *id, char *name, size_t buflen, + struct zinject_record *record); +extern int zio_clear_fault(int id); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); +extern int zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, + uint64_t type, int error); +extern int zio_handle_fault_injection(zio_t *zio, int error); +extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); +extern int zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, + int err2); +extern int zio_handle_label_injection(zio_t *zio, int error); +extern void zio_handle_ignored_writes(zio_t *zio); +extern hrtime_t zio_handle_io_delay(zio_t *zio); + +/* + * Checksum ereport functions + */ +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, + uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const abd_t *good_data, const abd_t *bad_data, boolean_t drop_if_identical); + +extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); + +/* If we have the good data in hand, this function can be used */ +extern int zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, struct zio *zio, uint64_t offset, + uint64_t length, const abd_t *good_data, const abd_t *bad_data, + struct zio_bad_cksum *info); + +/* Called from spa_sync(), but primarily an injection handler */ +extern void spa_handle_ignored_writes(spa_t *spa); + +/* zbookmark_phys functions */ +boolean_t zbookmark_subtree_completed(const struct dnode_phys *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block); +int zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, + uint8_t ibs2, const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_H */ diff --git a/include/sys/zio_checksum.h b/include/sys/zio_checksum.h new file mode 100644 index 000000000000..45abd3bd313b --- /dev/null +++ b/include/sys/zio_checksum.h @@ -0,0 +1,144 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + * Copyright Saso Kiselkov 2013, All rights reserved. + */ + +#ifndef _SYS_ZIO_CHECKSUM_H +#define _SYS_ZIO_CHECKSUM_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct abd; + +/* + * Signature for checksum functions. + */ +typedef void zio_checksum_t(struct abd *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp); +typedef void *zio_checksum_tmpl_init_t(const zio_cksum_salt_t *salt); +typedef void zio_checksum_tmpl_free_t(void *ctx_template); + +typedef enum zio_checksum_flags { + /* Strong enough for metadata? */ + ZCHECKSUM_FLAG_METADATA = (1 << 1), + /* ZIO embedded checksum */ + ZCHECKSUM_FLAG_EMBEDDED = (1 << 2), + /* Strong enough for dedup (without verification)? */ + ZCHECKSUM_FLAG_DEDUP = (1 << 3), + /* Uses salt value */ + ZCHECKSUM_FLAG_SALTED = (1 << 4), + /* Strong enough for nopwrite? */ + ZCHECKSUM_FLAG_NOPWRITE = (1 << 5) +} zio_checksum_flags_t; + +typedef enum { + ZIO_CHECKSUM_NATIVE, + ZIO_CHECKSUM_BYTESWAP +} zio_byteorder_t; + +typedef struct zio_abd_checksum_data { + zio_byteorder_t acd_byteorder; + fletcher_4_ctx_t *acd_ctx; + zio_cksum_t *acd_zcp; + void *acd_private; +} zio_abd_checksum_data_t; + +typedef void zio_abd_checksum_init_t(zio_abd_checksum_data_t *); +typedef void zio_abd_checksum_fini_t(zio_abd_checksum_data_t *); +typedef int zio_abd_checksum_iter_t(void *, size_t, void *); + +typedef const struct zio_abd_checksum_func { + zio_abd_checksum_init_t *acf_init; + zio_abd_checksum_fini_t *acf_fini; + zio_abd_checksum_iter_t *acf_iter; +} zio_abd_checksum_func_t; + +/* + * Information about each checksum function. + */ +typedef const struct zio_checksum_info { + /* checksum function for each byteorder */ + zio_checksum_t *ci_func[2]; + zio_checksum_tmpl_init_t *ci_tmpl_init; + zio_checksum_tmpl_free_t *ci_tmpl_free; + zio_checksum_flags_t ci_flags; + char *ci_name; /* descriptive name */ +} zio_checksum_info_t; + +typedef struct zio_bad_cksum { + zio_cksum_t zbc_expected; + zio_cksum_t zbc_actual; + const char *zbc_checksum_name; + uint8_t zbc_byteswapped; + uint8_t zbc_injected; + uint8_t zbc_has_cksum; /* expected/actual valid */ +} zio_bad_cksum_t; + +extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; + +/* + * Checksum routines. 
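+ *
+ * Each zio_checksum_table entry above carries a native and a byteswapped
+ * implementation, indexed by byteorder; an illustrative direct use of an
+ * entry would be:
+ *
+ * zio_checksum_info_t *ci = &zio_checksum_table[ZIO_CHECKSUM_SHA256];
+ * ci->ci_func[0](abd, size, NULL, &cksum);
+ *
+ * where index 0 selects the native byteorder and the NULL context
+ * template suffices for unsalted checksums.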
+ */ +extern zio_checksum_t abd_checksum_SHA256; +extern zio_checksum_t abd_checksum_SHA512_native; +extern zio_checksum_t abd_checksum_SHA512_byteswap; + +/* Skein */ +extern zio_checksum_t abd_checksum_skein_native; +extern zio_checksum_t abd_checksum_skein_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_skein_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_skein_tmpl_free; + +/* Edon-R */ +extern zio_checksum_t abd_checksum_edonr_native; +extern zio_checksum_t abd_checksum_edonr_byteswap; +extern zio_checksum_tmpl_init_t abd_checksum_edonr_tmpl_init; +extern zio_checksum_tmpl_free_t abd_checksum_edonr_tmpl_free; + +extern zio_abd_checksum_func_t fletcher_4_abd_ops; +extern zio_checksum_t abd_fletcher_4_native; +extern zio_checksum_t abd_fletcher_4_byteswap; + +extern int zio_checksum_equal(spa_t *, blkptr_t *, enum zio_checksum, + void *, uint64_t, uint64_t, zio_bad_cksum_t *); +extern void zio_checksum_compute(zio_t *, enum zio_checksum, + struct abd *, uint64_t); +extern int zio_checksum_error_impl(spa_t *, const blkptr_t *, enum zio_checksum, + struct abd *, uint64_t, uint64_t, zio_bad_cksum_t *); +extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); +extern void zio_checksum_templates_free(spa_t *spa); +extern spa_feature_t zio_checksum_to_feature(enum zio_checksum cksum); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_CHECKSUM_H */ diff --git a/include/sys/zio_compress.h b/include/sys/zio_compress.h new file mode 100644 index 000000000000..4a22ad2a2742 --- /dev/null +++ b/include/sys/zio_compress.h @@ -0,0 +1,198 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Copyright (c) 2019, Allan Jude + * Copyright (c) 2019, Klara Inc. + * Use is subject to license terms. + * Copyright (c) 2015, 2016 by Delphix. All rights reserved. 
+ */ + +#ifndef _SYS_ZIO_COMPRESS_H +#define _SYS_ZIO_COMPRESS_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +enum zio_compress { + ZIO_COMPRESS_INHERIT = 0, + ZIO_COMPRESS_ON, + ZIO_COMPRESS_OFF, + ZIO_COMPRESS_LZJB, + ZIO_COMPRESS_EMPTY, + ZIO_COMPRESS_GZIP_1, + ZIO_COMPRESS_GZIP_2, + ZIO_COMPRESS_GZIP_3, + ZIO_COMPRESS_GZIP_4, + ZIO_COMPRESS_GZIP_5, + ZIO_COMPRESS_GZIP_6, + ZIO_COMPRESS_GZIP_7, + ZIO_COMPRESS_GZIP_8, + ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, + ZIO_COMPRESS_LZ4, + ZIO_COMPRESS_ZSTD, + ZIO_COMPRESS_FUNCTIONS +}; + +/* Compression algorithms that have levels */ +#define ZIO_COMPRESS_HASLEVEL(compress) ((compress == ZIO_COMPRESS_ZSTD || \ + (compress >= ZIO_COMPRESS_GZIP_1 && \ + compress <= ZIO_COMPRESS_GZIP_9))) + +#define ZIO_COMPLEVEL_INHERIT 0 +#define ZIO_COMPLEVEL_DEFAULT 255 + +enum zio_zstd_levels { + ZIO_ZSTD_LEVEL_INHERIT = 0, + ZIO_ZSTD_LEVEL_1, +#define ZIO_ZSTD_LEVEL_MIN ZIO_ZSTD_LEVEL_1 + ZIO_ZSTD_LEVEL_2, + ZIO_ZSTD_LEVEL_3, +#define ZIO_ZSTD_LEVEL_DEFAULT ZIO_ZSTD_LEVEL_3 + ZIO_ZSTD_LEVEL_4, + ZIO_ZSTD_LEVEL_5, + ZIO_ZSTD_LEVEL_6, + ZIO_ZSTD_LEVEL_7, + ZIO_ZSTD_LEVEL_8, + ZIO_ZSTD_LEVEL_9, + ZIO_ZSTD_LEVEL_10, + ZIO_ZSTD_LEVEL_11, + ZIO_ZSTD_LEVEL_12, + ZIO_ZSTD_LEVEL_13, + ZIO_ZSTD_LEVEL_14, + ZIO_ZSTD_LEVEL_15, + ZIO_ZSTD_LEVEL_16, + ZIO_ZSTD_LEVEL_17, + ZIO_ZSTD_LEVEL_18, + ZIO_ZSTD_LEVEL_19, +#define ZIO_ZSTD_LEVEL_MAX ZIO_ZSTD_LEVEL_19 + ZIO_ZSTD_LEVEL_RESERVE = 101, /* Leave room for new positive levels */ + ZIO_ZSTD_LEVEL_FAST, /* Fast levels are negative */ + ZIO_ZSTD_LEVEL_FAST_1, +#define ZIO_ZSTD_LEVEL_FAST_DEFAULT ZIO_ZSTD_LEVEL_FAST_1 + ZIO_ZSTD_LEVEL_FAST_2, + ZIO_ZSTD_LEVEL_FAST_3, + ZIO_ZSTD_LEVEL_FAST_4, + ZIO_ZSTD_LEVEL_FAST_5, + ZIO_ZSTD_LEVEL_FAST_6, + ZIO_ZSTD_LEVEL_FAST_7, + ZIO_ZSTD_LEVEL_FAST_8, + ZIO_ZSTD_LEVEL_FAST_9, + ZIO_ZSTD_LEVEL_FAST_10, + ZIO_ZSTD_LEVEL_FAST_20, + ZIO_ZSTD_LEVEL_FAST_30, + ZIO_ZSTD_LEVEL_FAST_40, + ZIO_ZSTD_LEVEL_FAST_50, + ZIO_ZSTD_LEVEL_FAST_60, + ZIO_ZSTD_LEVEL_FAST_70, + ZIO_ZSTD_LEVEL_FAST_80, + ZIO_ZSTD_LEVEL_FAST_90, + ZIO_ZSTD_LEVEL_FAST_100, + ZIO_ZSTD_LEVEL_FAST_500, + ZIO_ZSTD_LEVEL_FAST_1000, +#define ZIO_ZSTD_LEVEL_FAST_MAX ZIO_ZSTD_LEVEL_FAST_1000 + ZIO_ZSTD_LEVEL_AUTO = 251, /* Reserved for future use */ + ZIO_ZSTD_LEVEL_LEVELS +}; + +/* Forward Declaration to avoid visibility problems */ +struct zio_prop; + +/* Common signature for all zio compress functions. */ +typedef size_t zio_compress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +/* Common signature for all zio decompress functions. */ +typedef int zio_decompress_func_t(void *src, void *dst, + size_t s_len, size_t d_len, int); +/* Common signature for all zio decompress and get level functions. */ +typedef int zio_decompresslevel_func_t(void *src, void *dst, + size_t s_len, size_t d_len, uint8_t *level); +/* Common signature for all zio get-compression-level functions. */ +typedef int zio_getlevel_func_t(void *src, size_t s_len, uint8_t *level); + + +/* + * Common signature for all zio decompress functions using an ABD as input. + * This is helpful if you have both compressed ARC and scatter ABDs enabled, + * but is not a requirement for all compression algorithms. + */ +typedef int zio_decompress_abd_func_t(abd_t *src, void *dst, + size_t s_len, size_t d_len, int); +/* + * Information about each compression function. 
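+ *
+ * The zio_compress_table below is indexed by enum zio_compress; most
+ * callers go through the zio_compress_data()/zio_decompress_data()
+ * wrappers declared further down, but an illustrative direct use of an
+ * entry would be:
+ *
+ * zio_compress_info_t *ci = &zio_compress_table[ZIO_COMPRESS_LZ4];
+ * c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level);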
+ */ +typedef const struct zio_compress_info { + char *ci_name; + int ci_level; + zio_compress_func_t *ci_compress; + zio_decompress_func_t *ci_decompress; + zio_decompresslevel_func_t *ci_decompress_level; +} zio_compress_info_t; + +extern zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS]; + +/* + * lz4 compression init & free + */ +extern void lz4_init(void); +extern void lz4_fini(void); + +/* + * Compression routines. + */ +extern size_t lzjb_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lzjb_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern size_t lz4_compress_zfs(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int lz4_decompress_zfs(void *src, void *dst, size_t s_len, size_t d_len, + int level); + +/* + * Compress and decompress data if necessary. + */ +extern size_t zio_compress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, uint8_t level); +extern int zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len, uint8_t *level); +extern int zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len, uint8_t *level); +extern int zio_compress_to_feature(enum zio_compress comp); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZIO_COMPRESS_H */ diff --git a/include/sys/zio_crypt.h b/include/sys/zio_crypt.h new file mode 100644 index 000000000000..d7a63d69582d --- /dev/null +++ b/include/sys/zio_crypt.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#ifndef _SYS_ZIO_CRYPT_H +#define _SYS_ZIO_CRYPT_H + +#include +#include +#if defined(__FreeBSD__) && defined(_KERNEL) +#include +#else +#include +#endif /* __FreeBSD__ */ +#include +#include +#include + +/* forward declarations */ +struct zbookmark_phys; + +#define WRAPPING_KEY_LEN 32 +#define WRAPPING_IV_LEN ZIO_DATA_IV_LEN +#define WRAPPING_MAC_LEN ZIO_DATA_MAC_LEN +#define MASTER_KEY_MAX_LEN 32 +#define SHA512_HMAC_KEYLEN 64 + +#define ZIO_CRYPT_KEY_CURRENT_VERSION 1ULL + +typedef enum zio_crypt_type { + ZC_TYPE_NONE = 0, + ZC_TYPE_CCM, + ZC_TYPE_GCM +} zio_crypt_type_t; + +/* table of supported crypto algorithms, modes and keylengths. */ +typedef struct zio_crypt_info { + /* mechanism name, needed by ICP */ +#if defined(__FreeBSD__) && defined(_KERNEL) + /* + * I've deliberately used a different name here, to catch + * ICP-using code. 
+ */ + const char *ci_algname; +#else + crypto_mech_name_t ci_mechname; +#endif + /* cipher mode type (GCM, CCM) */ + zio_crypt_type_t ci_crypt_type; + + /* length of the encryption key */ + size_t ci_keylen; + + /* human-readable name of the encryption algorithm */ + char *ci_name; +} zio_crypt_info_t; + +extern zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS]; + +/* in memory representation of an unwrapped key that is loaded into memory */ +typedef struct zio_crypt_key { + /* encryption algorithm */ + uint64_t zk_crypt; + + /* on-disk format version */ + uint64_t zk_version; + + /* GUID for uniquely identifying this key. Not encrypted on disk. */ + uint64_t zk_guid; + + /* buffer for master key */ + uint8_t zk_master_keydata[MASTER_KEY_MAX_LEN]; + + /* buffer for hmac key */ + uint8_t zk_hmac_keydata[SHA512_HMAC_KEYLEN]; + + /* buffer for current encryption key derived from master key */ + uint8_t zk_current_keydata[MASTER_KEY_MAX_LEN]; + + /* current 64 bit salt for deriving an encryption key */ + uint8_t zk_salt[ZIO_DATA_SALT_LEN]; + + /* count of how many times the current salt has been used */ + uint64_t zk_salt_count; + + /* illumos crypto api current encryption key */ + crypto_key_t zk_current_key; + +#if defined(__FreeBSD__) && defined(_KERNEL) + /* Session for current encryption key. Must always be set */ + freebsd_crypt_session_t zk_session; +#else + /* template of current encryption key for illumos crypto api */ + crypto_ctx_template_t zk_current_tmpl; +#endif + + /* illumos crypto api current hmac key */ + crypto_key_t zk_hmac_key; + + /* template of hmac key for illumos crypto api */ + crypto_ctx_template_t zk_hmac_tmpl; + + /* lock for changing the salt and dependent values */ + krwlock_t zk_salt_lock; +} zio_crypt_key_t; + +void zio_crypt_key_destroy(zio_crypt_key_t *key); +int zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key); +int zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt_out); + +int zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out); +int zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key); +int zio_crypt_generate_iv(uint8_t *ivbuf); +int zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt); + +void zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv); +void zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv); +void zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac); +void zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac); +void zio_crypt_encode_mac_zil(void *data, uint8_t *mac); +void zio_crypt_decode_mac_zil(const void *data, uint8_t *mac); +void zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen); + +int zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum); +int zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum); +int zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen); +int zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t byteswap, uint8_t *portable_mac, uint8_t *local_mac); +int zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + 
dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf,
+ boolean_t *no_crypt);
+int zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key,
+ dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv,
+ uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd,
+ boolean_t *no_crypt);
+
+#endif
diff --git a/include/sys/zio_impl.h b/include/sys/zio_impl.h
new file mode 100644
index 000000000000..4c998571653a
--- /dev/null
+++ b/include/sys/zio_impl.h
@@ -0,0 +1,270 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2015 by Delphix. All rights reserved.
+ */
+
+#ifndef _ZIO_IMPL_H
+#define _ZIO_IMPL_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+/*
+ * XXX -- Describe ZFS I/O pipeline here. Fill in as needed.
+ *
+ * The ZFS I/O pipeline is composed of various stages which are defined
+ * in the zio_stage enum below. The individual stages are used to construct
+ * these basic I/O operations: Read, Write, Free, Claim, and Ioctl.
+ *
+ * I/O operations: (XXX - provide detail for each of the operations)
+ *
+ * Read:
+ * Write:
+ * Free:
+ * Claim:
+ * Ioctl:
+ *
+ * Although the most common pipelines are those used by the basic I/O
+ * operations above, there are some helper pipelines (one could consider
+ * them sub-pipelines) which are used internally by the ZIO module and are
+ * explained below:
+ *
+ * Interlock Pipeline:
+ * The interlock pipeline is the most basic pipeline and is used by all
+ * of the I/O operations. The interlock pipeline does not perform any I/O
+ * and is used to coordinate the dependencies between I/Os that are being
+ * issued (i.e. the parent/child relationship).
+ *
+ * Vdev child Pipeline:
+ * The vdev child pipeline is responsible for performing the physical I/O.
+ * It is in this pipeline that I/Os are queued and possibly cached.
+ *
+ * In addition to performing I/O, the pipeline is also responsible for
+ * data transformations. The transformations performed are based on the
+ * specific properties that the user may have selected, and they modify
+ * the behavior of the pipeline. Examples of supported transformations are
+ * compression, dedup, and nop writes. Transformations will either modify
+ * the data or the pipeline. The list below further describes each of
+ * the supported transformations:
+ *
+ * Compression:
+ * ZFS supports five different flavors of compression -- gzip, lzjb, lz4, zle,
+ * and zstd. Compression occurs as part of the write pipeline and is
+ * performed in the ZIO_STAGE_WRITE_COMPRESS stage.
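+ * As an example, a write to a dataset with compress=zstd-3 is expected
+ * to carry io_prop.zp_compress = ZIO_COMPRESS_ZSTD with zp_complevel 3
+ * (see enum zio_zstd_levels in zio_compress.h).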
+ *
+ * Dedup:
+ * Dedup reads are handled by the ZIO_STAGE_DDT_READ_START and
+ * ZIO_STAGE_DDT_READ_DONE stages. These stages are added to an existing
+ * read pipeline if the dedup bit is set on the block pointer.
+ * Writing a dedup block is performed by the ZIO_STAGE_DDT_WRITE stage,
+ * which is added to a write pipeline if the user has enabled dedup on
+ * that particular dataset.
+ *
+ * NOP Write:
+ * The NOP write feature is performed by the ZIO_STAGE_NOP_WRITE stage
+ * and is added to an existing write pipeline if a cryptographically
+ * secure checksum (e.g. SHA256) is enabled and compression is turned on.
+ * The NOP write stage will compare the checksum of the current data
+ * on disk (level-0 blocks only) with that of the data currently being
+ * written. If the checksum values are identical, the pipeline is
+ * converted to an interlock pipeline, skipping block allocation and
+ * bypassing the physical I/O. The NOP write feature can handle writes
+ * in either syncing or open context (i.e. ZIL writes) and as a result
+ * is mutually exclusive with dedup.
+ *
+ * Encryption:
+ * Encryption and authentication are handled by the ZIO_STAGE_ENCRYPT
+ * stage. This stage determines how the encryption metadata is stored in
+ * the bp. Decryption and MAC verification are performed during
+ * zio_decrypt() as a transform callback. Encryption is mutually
+ * exclusive with nopwrite, because blocks with the same plaintext will
+ * be encrypted with different salts and IVs (if dedup is off), and
+ * therefore have different ciphertexts. For dedup blocks we
+ * deterministically generate the IV and salt by performing an HMAC
+ * of the plaintext, which is computationally expensive, but allows us
+ * to keep support for encrypted dedup. See the block comment in
+ * zio_crypt.c for details.
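+ *
+ * As a rough sketch of the dedup case (an illustration of what
+ * zio_crypt_generate_iv_salt_dedup() does, not a verbatim excerpt):
+ * the key's HMAC key is applied to the plaintext and the digest is
+ * split into the salt and the IV, so identical plaintext always maps
+ * to identical ciphertext:
+ *
+ *	uint8_t digest[SHA512_DIGEST_LENGTH];
+ *	hmac_sha512(key->zk_hmac_keydata, data, datalen, digest);
+ *	memcpy(salt, digest, ZIO_DATA_SALT_LEN);
+ *	memcpy(iv, digest + ZIO_DATA_SALT_LEN, ZIO_DATA_IV_LEN);
+ *
+ * where hmac_sha512() is a stand-in name for the platform crypto
+ * provider call.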
+ */ + +/* + * zio pipeline stage definitions + */ +enum zio_stage { + ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ + + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 2, /* -W--- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 3, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 4, /* RWF-- */ + ZIO_STAGE_WRITE_COMPRESS = 1 << 5, /* -W--- */ + + ZIO_STAGE_ENCRYPT = 1 << 6, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 7, /* -W--- */ + + ZIO_STAGE_NOP_WRITE = 1 << 8, /* -W--- */ + + ZIO_STAGE_DDT_READ_START = 1 << 9, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 10, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 11, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 12, /* --F-- */ + + ZIO_STAGE_GANG_ASSEMBLE = 1 << 13, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 14, /* RWFC- */ + + ZIO_STAGE_DVA_THROTTLE = 1 << 15, /* -W--- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 16, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 17, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 18, /* ---C- */ + + ZIO_STAGE_READY = 1 << 19, /* RWFCI */ + + ZIO_STAGE_VDEV_IO_START = 1 << 20, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 21, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 22, /* RW--I */ + + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 23, /* R---- */ + + ZIO_STAGE_DONE = 1 << 24 /* RWFCI */ +}; + +#define ZIO_INTERLOCK_STAGES \ + (ZIO_STAGE_READY | \ + ZIO_STAGE_DONE) + +#define ZIO_INTERLOCK_PIPELINE \ + ZIO_INTERLOCK_STAGES + +#define ZIO_VDEV_IO_STAGES \ + (ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_DONE | \ + ZIO_STAGE_VDEV_IO_ASSESS) + +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DONE) + +#define ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_CHECKSUM_VERIFY) + +#define ZIO_READ_PHYS_PIPELINE \ + ZIO_READ_COMMON_STAGES + +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + ZIO_STAGE_READ_BP_INIT) + +#define ZIO_DDT_CHILD_READ_PIPELINE \ + ZIO_READ_COMMON_STAGES + +#define ZIO_DDT_READ_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_READ_BP_INIT | \ + ZIO_STAGE_DDT_READ_START | \ + ZIO_STAGE_DDT_READ_DONE) + +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_CHECKSUM_GENERATE) + +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES + +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_ENCRYPT | \ + ZIO_STAGE_WRITE_BP_INIT) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_ENCRYPT | \ + ZIO_STAGE_DVA_THROTTLE | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_CHILD_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_THROTTLE | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_COMPRESS | \ + ZIO_STAGE_ENCRYPT | \ + ZIO_STAGE_CHECKSUM_GENERATE | \ + ZIO_STAGE_DDT_WRITE) + +#define ZIO_GANG_STAGES \ + (ZIO_STAGE_GANG_ASSEMBLE | \ + ZIO_STAGE_GANG_ISSUE) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_DVA_FREE) + +#define ZIO_DDT_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_DDT_FREE) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_DVA_CLAIM) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) + 
+#define ZIO_TRIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_VDEV_IO_STAGES) + +#define ZIO_BLOCKING_STAGES \ + (ZIO_STAGE_DVA_ALLOCATE | \ + ZIO_STAGE_DVA_CLAIM | \ + ZIO_STAGE_VDEV_IO_START) + +extern void zio_inject_init(void); +extern void zio_inject_fini(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_IMPL_H */ diff --git a/include/sys/zio_priority.h b/include/sys/zio_priority.h new file mode 100644 index 000000000000..2d8e7fc36bae --- /dev/null +++ b/include/sys/zio_priority.h @@ -0,0 +1,43 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014, 2016 by Delphix. All rights reserved. + */ +#ifndef _ZIO_PRIORITY_H +#define _ZIO_PRIORITY_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum zio_priority { + ZIO_PRIORITY_SYNC_READ, + ZIO_PRIORITY_SYNC_WRITE, /* ZIL */ + ZIO_PRIORITY_ASYNC_READ, /* prefetch */ + ZIO_PRIORITY_ASYNC_WRITE, /* spa_sync() */ + ZIO_PRIORITY_SCRUB, /* asynchronous scrub/resilver reads */ + ZIO_PRIORITY_REMOVAL, /* reads/writes for vdev removal */ + ZIO_PRIORITY_INITIALIZING, /* initializing I/O */ + ZIO_PRIORITY_TRIM, /* trim I/O (discard) */ + ZIO_PRIORITY_REBUILD, /* reads/writes for vdev rebuild */ + ZIO_PRIORITY_NUM_QUEUEABLE, + ZIO_PRIORITY_NOW, /* non-queued i/os (e.g. free) */ +} zio_priority_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZIO_PRIORITY_H */ diff --git a/include/sys/zrlock.h b/include/sys/zrlock.h new file mode 100644 index 000000000000..b6eba1a18ff4 --- /dev/null +++ b/include/sys/zrlock.h @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. 
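+ */
+
+/*
+ * A zrlock is a reference-counted lock: zrl_add() and zrl_remove() take
+ * and drop cheap references, while zrl_tryenter() succeeds only when the
+ * reference count is zero, giving the caller exclusive access until
+ * zrl_exit(). A minimal usage sketch (an illustration distilled from the
+ * declarations below, not code from this file):
+ *
+ *	zrlock_t zrl;
+ *	zrl_init(&zrl);
+ *	zrl_add(&zrl);			-- cheap reference; waits if locked
+ *	...
+ *	zrl_remove(&zrl);
+ *	if (zrl_tryenter(&zrl)) {	-- exclusive iff refcount is zero
+ *		...
+ *		zrl_exit(&zrl);
+ *	}
+ *	zrl_destroy(&zrl);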
+ */ + +#ifndef _SYS_ZRLOCK_H +#define _SYS_ZRLOCK_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct zrlock { + kmutex_t zr_mtx; + volatile int32_t zr_refcount; + kcondvar_t zr_cv; + uint16_t zr_pad; +#ifdef ZFS_DEBUG + kthread_t *zr_owner; + const char *zr_caller; +#endif +} zrlock_t; + +extern void zrl_init(zrlock_t *); +extern void zrl_destroy(zrlock_t *); +#define zrl_add(_z) zrl_add_impl((_z), __func__) +extern void zrl_add_impl(zrlock_t *, const char *); +extern void zrl_remove(zrlock_t *); +extern int zrl_tryenter(zrlock_t *); +extern void zrl_exit(zrlock_t *); +extern int zrl_is_zero(zrlock_t *); +extern int zrl_is_locked(zrlock_t *); +#ifdef ZFS_DEBUG +extern kthread_t *zrl_owner(zrlock_t *); +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ZRLOCK_H */ diff --git a/include/sys/zstd/Makefile.am b/include/sys/zstd/Makefile.am new file mode 100644 index 000000000000..16666fe63355 --- /dev/null +++ b/include/sys/zstd/Makefile.am @@ -0,0 +1,18 @@ +COMMON_H = \ + $(top_srcdir)/include/sys/zstd/zstd.h + +KERNEL_H = + +USER_H = + +EXTRA_DIST = $(COMMON_H) $(KERNEL_H) $(USER_H) + +if CONFIG_USER +libzfsdir = $(includedir)/libzfs/sys/zstd +libzfs_HEADERS = $(COMMON_H) $(USER_H) +endif + +if CONFIG_KERNEL +kerneldir = @prefix@/src/zfs-$(VERSION)/include/sys/zstd +kernel_HEADERS = $(COMMON_H) $(KERNEL_H) +endif diff --git a/include/sys/zstd/zstd.h b/include/sys/zstd/zstd.h new file mode 100644 index 000000000000..f965df31988c --- /dev/null +++ b/include/sys/zstd/zstd.h @@ -0,0 +1,98 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2018, Klara Inc. + * Copyright (c) 2016-2018, Allan Jude + * Copyright (c) 2018-2020, Sebastian Gottschall + * Copyright (c) 2019-2020, Michael Niewöhner + * Copyright (c) 2020, The FreeBSD Foundation [1] + * + * [1] Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. 
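+ */
+
+/*
+ * Worked example (an illustration, not a normative definition): zstd
+ * 1.4.5 at compression level 3 would be stored as version = 10405
+ * (major * 10000 + minor * 100 + patch) with level = 3, both packed
+ * into the raw_version_level word of the header below; like c_len,
+ * that word is kept big-endian on disk.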
+ */ + +#ifndef _ZFS_ZSTD_H +#define _ZFS_ZSTD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * ZSTD block header + * NOTE: all fields in this header are in big endian order. + */ +typedef struct zfs_zstd_header { + /* Compressed size of data */ + uint32_t c_len; + + /* + * Version and compression level + * We use a union to be able to big endian encode a single 32 bit + * unsigned integer, but still access the individual bitmasked + * components easily. + */ + union { + uint32_t raw_version_level; + struct { + uint32_t version : 24; + uint8_t level; + }; + }; + + char data[]; +} zfs_zstdhdr_t; + +/* + * kstat helper macros + */ +#define ZSTDSTAT(stat) (zstd_stats.stat.value.ui64) +#define ZSTDSTAT_INCR(stat, val) \ + atomic_add_64(&zstd_stats.stat.value.ui64, (val)) +#define ZSTDSTAT_BUMP(stat) ZSTDSTAT_INCR(stat, 1) + +/* (de)init for user space / kernel emulation */ +int zstd_init(void); +void zstd_fini(void); + +size_t zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int level); +int zfs_zstd_get_level(void *s_start, size_t s_len, uint8_t *level); +int zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len, + size_t d_len, uint8_t *level); +int zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, + size_t d_len, int n); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_ZSTD_H */ diff --git a/include/sys/zthr.h b/include/sys/zthr.h new file mode 100644 index 000000000000..ae8c57e9eea2 --- /dev/null +++ b/include/sys/zthr.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, 2018 by Delphix. All rights reserved. + */ + +#ifndef _SYS_ZTHR_H +#define _SYS_ZTHR_H + +typedef struct zthr zthr_t; +typedef void (zthr_func_t)(void *, zthr_t *); +typedef boolean_t (zthr_checkfunc_t)(void *, zthr_t *); + +extern zthr_t *zthr_create(const char *zthr_name, + zthr_checkfunc_t checkfunc, zthr_func_t *func, void *arg); +extern zthr_t *zthr_create_timer(const char *zthr_name, + zthr_checkfunc_t *checkfunc, zthr_func_t *func, void *arg, + hrtime_t nano_wait); +extern void zthr_destroy(zthr_t *t); + +extern void zthr_wakeup(zthr_t *t); +extern void zthr_cancel(zthr_t *t); +extern void zthr_resume(zthr_t *t); +extern void zthr_wait_cycle_done(zthr_t *t); + +extern boolean_t zthr_iscancelled(zthr_t *t); +extern boolean_t zthr_has_waiters(zthr_t *t); + +#endif /* _SYS_ZTHR_H */ diff --git a/include/sys/zvol.h b/include/sys/zvol.h new file mode 100644 index 000000000000..8efb7f5e6041 --- /dev/null +++ b/include/sys/zvol.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + */ + +#ifndef _SYS_ZVOL_H +#define _SYS_ZVOL_H + +#include + +#define ZVOL_OBJ 1ULL +#define ZVOL_ZAP_OBJ 2ULL + +#define SPEC_MAXOFFSET_T ((1LL << ((NBBY * sizeof (daddr32_t)) + \ + DEV_BSHIFT - 1)) - 1) + +extern void zvol_create_minor(const char *); +extern void zvol_create_minors_recursive(const char *); +extern void zvol_remove_minors(spa_t *, const char *, boolean_t); +extern void zvol_rename_minors(spa_t *, const char *, const char *, boolean_t); + +#ifdef _KERNEL +struct zvol_state; +typedef struct zvol_state zvol_state_handle_t; + +extern int zvol_check_volsize(uint64_t, uint64_t); +extern int zvol_check_volblocksize(const char *, uint64_t); +extern int zvol_get_stats(objset_t *, nvlist_t *); +extern boolean_t zvol_is_zvol(const char *); +extern void zvol_create_cb(objset_t *, void *, cred_t *, dmu_tx_t *); +extern int zvol_set_volsize(const char *, uint64_t); +extern int zvol_set_volblocksize(const char *, uint64_t); +extern int zvol_set_snapdev(const char *, zprop_source_t, uint64_t); +extern int zvol_set_volmode(const char *, zprop_source_t, uint64_t); +extern zvol_state_handle_t *zvol_suspend(const char *); +extern int zvol_resume(zvol_state_handle_t *); +extern void *zvol_tag(zvol_state_handle_t *); + +extern int zvol_init(void); +extern void zvol_fini(void); +extern int zvol_busy(void); + +#endif /* _KERNEL */ +#endif /* _SYS_ZVOL_H */ diff --git a/include/sys/zvol_impl.h b/include/sys/zvol_impl.h new file mode 100644 index 000000000000..36199c311d07 --- /dev/null +++ b/include/sys/zvol_impl.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _SYS_ZVOL_IMPL_H +#define _SYS_ZVOL_IMPL_H + +#include + +#define ZVOL_RDONLY 0x1 +/* + * Whether the zvol has been written to (as opposed to ZVOL_RDONLY, which + * specifies whether or not the zvol _can_ be written to) + */ +#define ZVOL_WRITTEN_TO 0x2 + +#define ZVOL_DUMPIFIED 0x4 + +#define ZVOL_EXCL 0x8 + +/* + * The in-core state of each volume. 
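+ * As a rough summary (the field comments below are authoritative), the
+ * structure is protected by zv_state_lock, while zv_suspend_lock and
+ * zv_suspend_ref coordinate in-flight I/O with zvol_suspend() and
+ * zvol_resume().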
+ */ +typedef struct zvol_state { + char zv_name[MAXNAMELEN]; /* name */ + uint64_t zv_volsize; /* advertised space */ + uint64_t zv_volblocksize; /* volume block size */ + objset_t *zv_objset; /* objset handle */ + uint32_t zv_flags; /* ZVOL_* flags */ + uint32_t zv_open_count; /* open counts */ + uint32_t zv_changed; /* disk changed */ + zilog_t *zv_zilog; /* ZIL handle */ + zfs_rangelock_t zv_rangelock; /* for range locking */ + dnode_t *zv_dn; /* dnode hold */ + dataset_kstats_t zv_kstat; /* zvol kstats */ + list_node_t zv_next; /* next zvol_state_t linkage */ + uint64_t zv_hash; /* name hash */ + struct hlist_node zv_hlink; /* hash link */ + kmutex_t zv_state_lock; /* protects zvol_state_t */ + atomic_t zv_suspend_ref; /* refcount for suspend */ + krwlock_t zv_suspend_lock; /* suspend lock */ + struct zvol_state_os *zv_zso; /* private platform state */ +} zvol_state_t; + + +extern list_t zvol_state_list; +extern krwlock_t zvol_state_lock; +#define ZVOL_HT_SIZE 1024 +extern struct hlist_head *zvol_htable; +#define ZVOL_HT_HEAD(hash) (&zvol_htable[(hash) & (ZVOL_HT_SIZE-1)]) +extern zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE]; + +extern unsigned int zvol_volmode; +extern unsigned int zvol_inhibit_dev; + +/* + * platform independent functions exported to platform code + */ +zvol_state_t *zvol_find_by_name_hash(const char *name, + uint64_t hash, int mode); +int zvol_first_open(zvol_state_t *zv, boolean_t readonly); +uint64_t zvol_name_hash(const char *name); +void zvol_remove_minors_impl(const char *name); +void zvol_last_close(zvol_state_t *zv); +void zvol_insert(zvol_state_t *zv); +void zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, + uint64_t len, boolean_t sync); +void zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, + uint64_t size, int sync); +int zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, + zio_t *zio); +int zvol_init_impl(void); +void zvol_fini_impl(void); + +/* + * platform dependent functions exported to platform independent code + */ +typedef struct zvol_platform_ops { + void (*zv_free)(zvol_state_t *); + void (*zv_rename_minor)(zvol_state_t *, const char *); + int (*zv_create_minor)(const char *); + int (*zv_update_volsize)(zvol_state_t *, uint64_t); + boolean_t (*zv_is_zvol)(const char *); + void (*zv_clear_private)(zvol_state_t *); + void (*zv_set_disk_ro)(zvol_state_t *, int flags); + void (*zv_set_capacity)(zvol_state_t *, uint64_t capacity); +} zvol_platform_ops_t; + +void zvol_register_ops(const zvol_platform_ops_t *ops); + +#endif diff --git a/include/thread_pool.h b/include/thread_pool.h new file mode 100644 index 000000000000..57266f11c5a6 --- /dev/null +++ b/include/thread_pool.h @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _THREAD_POOL_H_ +#define _THREAD_POOL_H_ + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef struct tpool tpool_t; /* opaque thread pool descriptor */ + +#if defined(__STDC__) + +extern tpool_t *tpool_create(uint_t min_threads, uint_t max_threads, + uint_t linger, pthread_attr_t *attr); +extern int tpool_dispatch(tpool_t *tpool, + void (*func)(void *), void *arg); +extern void tpool_destroy(tpool_t *tpool); +extern void tpool_abandon(tpool_t *tpool); +extern void tpool_wait(tpool_t *tpool); +extern void tpool_suspend(tpool_t *tpool); +extern int tpool_suspended(tpool_t *tpool); +extern void tpool_resume(tpool_t *tpool); +extern int tpool_member(tpool_t *tpool); + +#else /* Non ANSI */ + +extern tpool_t *tpool_create(); +extern int tpool_dispatch(); +extern void tpool_destroy(); +extern void tpool_abandon(); +extern void tpool_wait(); +extern void tpool_suspend(); +extern int tpool_suspended(); +extern void tpool_resume(); +extern int tpool_member(); + +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _THREAD_POOL_H_ */ diff --git a/include/zfeature_common.h b/include/zfeature_common.h new file mode 100644 index 000000000000..db0138ae8e39 --- /dev/null +++ b/include/zfeature_common.h @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2017, Intel Corporation. 
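+ */
+
+/*
+ * Each feature's fi_depends array is terminated by SPA_FEATURE_NONE, so
+ * a dependency list can be walked with a simple loop; a minimal sketch,
+ * where enable_feature() is a placeholder name:
+ *
+ *	for (const spa_feature_t *dep = feature->fi_depends;
+ *	    *dep != SPA_FEATURE_NONE; dep++)
+ *		enable_feature(*dep);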
+ */ + +#ifndef _ZFEATURE_COMMON_H +#define _ZFEATURE_COMMON_H + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +struct zfeature_info; + +typedef enum spa_feature { + SPA_FEATURE_NONE = -1, + SPA_FEATURE_ASYNC_DESTROY, + SPA_FEATURE_EMPTY_BPOBJ, + SPA_FEATURE_LZ4_COMPRESS, + SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, + SPA_FEATURE_SPACEMAP_HISTOGRAM, + SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_HOLE_BIRTH, + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_EMBEDDED_DATA, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_FS_SS_LIMIT, + SPA_FEATURE_LARGE_BLOCKS, + SPA_FEATURE_LARGE_DNODE, + SPA_FEATURE_SHA512, + SPA_FEATURE_SKEIN, +#if !defined(__FreeBSD__) + SPA_FEATURE_EDONR, +#endif + SPA_FEATURE_USEROBJ_ACCOUNTING, + SPA_FEATURE_ENCRYPTION, + SPA_FEATURE_PROJECT_QUOTA, + SPA_FEATURE_DEVICE_REMOVAL, + SPA_FEATURE_OBSOLETE_COUNTS, + SPA_FEATURE_POOL_CHECKPOINT, + SPA_FEATURE_SPACEMAP_V2, + SPA_FEATURE_ALLOCATION_CLASSES, + SPA_FEATURE_RESILVER_DEFER, + SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_REDACTION_BOOKMARKS, + SPA_FEATURE_REDACTED_DATASETS, + SPA_FEATURE_BOOKMARK_WRITTEN, + SPA_FEATURE_LOG_SPACEMAP, + SPA_FEATURE_LIVELIST, + SPA_FEATURE_DEVICE_REBUILD, + SPA_FEATURE_ZSTD_COMPRESS, + SPA_FEATURES +} spa_feature_t; + +#define SPA_FEATURE_DISABLED (-1ULL) + +typedef enum zfeature_flags { + /* Can open pool readonly even if this feature is not supported. */ + ZFEATURE_FLAG_READONLY_COMPAT = (1 << 0), + /* + * Is this feature necessary to load the pool? i.e. do we need this + * feature to read the full feature list out of the MOS? + */ + ZFEATURE_FLAG_MOS = (1 << 1), + /* Activate this feature at the same time it is enabled. */ + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE = (1 << 2), + /* Each dataset has a field set if it has ever used this feature. */ + ZFEATURE_FLAG_PER_DATASET = (1 << 3) +} zfeature_flags_t; + +typedef enum zfeature_type { + ZFEATURE_TYPE_BOOLEAN, + ZFEATURE_TYPE_UINT64_ARRAY, + ZFEATURE_NUM_TYPES +} zfeature_type_t; + +typedef struct zfeature_info { + spa_feature_t fi_feature; + const char *fi_uname; /* User-facing feature name */ + const char *fi_guid; /* On-disk feature identifier */ + const char *fi_desc; /* Feature description */ + zfeature_flags_t fi_flags; + boolean_t fi_zfs_mod_supported; /* supported by running zfs module */ + zfeature_type_t fi_type; /* Only relevant for PER_DATASET features */ + /* array of dependencies, terminated by SPA_FEATURE_NONE */ + const spa_feature_t *fi_depends; +} zfeature_info_t; + +typedef int (zfeature_func_t)(zfeature_info_t *, void *); + +#define ZFS_FEATURE_DEBUG + +extern zfeature_info_t spa_feature_table[SPA_FEATURES]; + +extern boolean_t zfeature_is_valid_guid(const char *); + +extern boolean_t zfeature_is_supported(const char *); +extern int zfeature_lookup_guid(const char *, spa_feature_t *); +extern int zfeature_lookup_name(const char *, spa_feature_t *); +extern boolean_t zfeature_depends_on(spa_feature_t, spa_feature_t); + +extern void zpool_feature_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFEATURE_COMMON_H */ diff --git a/include/zfs_comutil.h b/include/zfs_comutil.h new file mode 100644 index 000000000000..17b07d9e4b0c --- /dev/null +++ b/include/zfs_comutil.h @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#ifndef _ZFS_COMUTIL_H +#define _ZFS_COMUTIL_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +extern boolean_t zfs_allocatable_devs(nvlist_t *); +extern boolean_t zfs_special_devs(nvlist_t *, char *); +extern void zpool_get_load_policy(nvlist_t *, zpool_load_policy_t *); + +extern int zfs_zpl_version_map(int spa_version); +extern int zfs_spa_version_map(int zpl_version); + +extern boolean_t zfs_dataset_name_hidden(const char *); + +#define ZFS_NUM_LEGACY_HISTORY_EVENTS 41 +extern const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS]; + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_COMUTIL_H */ diff --git a/include/zfs_deleg.h b/include/zfs_deleg.h new file mode 100644 index 000000000000..5738b2a73dcd --- /dev/null +++ b/include/zfs_deleg.h @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. 
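+ */
+
+/*
+ * Delegation permissions are stored as ZAP keys that zfs_deleg_whokey()
+ * assembles from a who-type character, a scope character (ZFS_DELEG_LOCAL
+ * or ZFS_DELEG_DESCENDENT), the ZFS_DELEG_FIELD_SEP_CHR separator, and
+ * the subject id; see zfs_deleg.c for the exact key layout.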
+ */ + +#ifndef _ZFS_DELEG_H +#define _ZFS_DELEG_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define ZFS_DELEG_SET_NAME_CHR '@' /* set name lead char */ +#define ZFS_DELEG_FIELD_SEP_CHR '$' /* field separator */ + +/* + * Max name length for a delegation attribute + */ +#define ZFS_MAX_DELEG_NAME 128 + +#define ZFS_DELEG_LOCAL 'l' +#define ZFS_DELEG_DESCENDENT 'd' +#define ZFS_DELEG_NA '-' + +typedef enum { + ZFS_DELEG_NOTE_CREATE, + ZFS_DELEG_NOTE_DESTROY, + ZFS_DELEG_NOTE_SNAPSHOT, + ZFS_DELEG_NOTE_ROLLBACK, + ZFS_DELEG_NOTE_CLONE, + ZFS_DELEG_NOTE_PROMOTE, + ZFS_DELEG_NOTE_RENAME, + ZFS_DELEG_NOTE_SEND, + ZFS_DELEG_NOTE_RECEIVE, + ZFS_DELEG_NOTE_ALLOW, + ZFS_DELEG_NOTE_USERPROP, + ZFS_DELEG_NOTE_MOUNT, + ZFS_DELEG_NOTE_SHARE, + ZFS_DELEG_NOTE_USERQUOTA, + ZFS_DELEG_NOTE_GROUPQUOTA, + ZFS_DELEG_NOTE_USERUSED, + ZFS_DELEG_NOTE_GROUPUSED, + ZFS_DELEG_NOTE_USEROBJQUOTA, + ZFS_DELEG_NOTE_GROUPOBJQUOTA, + ZFS_DELEG_NOTE_USEROBJUSED, + ZFS_DELEG_NOTE_GROUPOBJUSED, + ZFS_DELEG_NOTE_HOLD, + ZFS_DELEG_NOTE_RELEASE, + ZFS_DELEG_NOTE_DIFF, + ZFS_DELEG_NOTE_BOOKMARK, + ZFS_DELEG_NOTE_LOAD_KEY, + ZFS_DELEG_NOTE_CHANGE_KEY, + ZFS_DELEG_NOTE_PROJECTUSED, + ZFS_DELEG_NOTE_PROJECTQUOTA, + ZFS_DELEG_NOTE_PROJECTOBJUSED, + ZFS_DELEG_NOTE_PROJECTOBJQUOTA, + ZFS_DELEG_NOTE_NONE +} zfs_deleg_note_t; + +typedef struct zfs_deleg_perm_tab { + char *z_perm; + zfs_deleg_note_t z_note; +} zfs_deleg_perm_tab_t; + +extern zfs_deleg_perm_tab_t zfs_deleg_perm_tab[]; + +int zfs_deleg_verify_nvlist(nvlist_t *nvlist); +void zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, + char checkflag, void *data); +const char *zfs_deleg_canonicalize_perm(const char *perm); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_DELEG_H */ diff --git a/include/zfs_fletcher.h b/include/zfs_fletcher.h new file mode 100644 index 000000000000..9e8b2cf7c729 --- /dev/null +++ b/include/zfs_fletcher.h @@ -0,0 +1,158 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +#ifndef _ZFS_FLETCHER_H +#define _ZFS_FLETCHER_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * fletcher checksum functions + * + * Note: Fletcher checksum methods expect buffer size to be 4B aligned. This + * limitation stems from the algorithm design. Performing incremental checksum + * without said alignment would yield different results. Therefore, the code + * includes assertions for the size alignment. + * For compatibility, it is required that some code paths calculate checksum of + * non-aligned buffer sizes. 
For this purpose, `fletcher_4_native_varsize()` + * checksum method is added. This method will ignore last (size % 4) bytes of + * the data buffer. + */ +void fletcher_init(zio_cksum_t *); +void fletcher_2_native(const void *, uint64_t, const void *, zio_cksum_t *); +void fletcher_2_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); +void fletcher_4_native(const void *, uint64_t, const void *, zio_cksum_t *); +int fletcher_2_incremental_native(void *, size_t, void *); +int fletcher_2_incremental_byteswap(void *, size_t, void *); +void fletcher_4_native_varsize(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_byteswap(const void *, uint64_t, const void *, zio_cksum_t *); +int fletcher_4_incremental_native(void *, size_t, void *); +int fletcher_4_incremental_byteswap(void *, size_t, void *); +int fletcher_4_impl_set(const char *selector); +void fletcher_4_init(void); +void fletcher_4_fini(void); + + + +/* Internal fletcher ctx */ + +typedef struct zfs_fletcher_superscalar { + uint64_t v[4]; +} zfs_fletcher_superscalar_t; + +typedef struct zfs_fletcher_sse { + uint64_t v[2] __attribute__((aligned(16))); +} zfs_fletcher_sse_t; + +typedef struct zfs_fletcher_avx { + uint64_t v[4] __attribute__((aligned(32))); +} zfs_fletcher_avx_t; + +typedef struct zfs_fletcher_avx512 { + uint64_t v[8] __attribute__((aligned(64))); +} zfs_fletcher_avx512_t; + +typedef struct zfs_fletcher_aarch64_neon { + uint64_t v[2] __attribute__((aligned(16))); +} zfs_fletcher_aarch64_neon_t; + + +typedef union fletcher_4_ctx { + zio_cksum_t scalar; + zfs_fletcher_superscalar_t superscalar[4]; + +#if defined(HAVE_SSE2) || (defined(HAVE_SSE2) && defined(HAVE_SSSE3)) + zfs_fletcher_sse_t sse[4]; +#endif +#if defined(HAVE_AVX) && defined(HAVE_AVX2) + zfs_fletcher_avx_t avx[4]; +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) + zfs_fletcher_avx512_t avx512[4]; +#endif +#if defined(__aarch64__) + zfs_fletcher_aarch64_neon_t aarch64_neon[4]; +#endif +} fletcher_4_ctx_t; + +/* + * fletcher checksum struct + */ +typedef void (*fletcher_4_init_f)(fletcher_4_ctx_t *); +typedef void (*fletcher_4_fini_f)(fletcher_4_ctx_t *, zio_cksum_t *); +typedef void (*fletcher_4_compute_f)(fletcher_4_ctx_t *, + const void *, uint64_t); + +typedef struct fletcher_4_func { + fletcher_4_init_f init_native; + fletcher_4_fini_f fini_native; + fletcher_4_compute_f compute_native; + fletcher_4_init_f init_byteswap; + fletcher_4_fini_f fini_byteswap; + fletcher_4_compute_f compute_byteswap; + boolean_t (*valid)(void); + const char *name; +} fletcher_4_ops_t; + +extern const fletcher_4_ops_t fletcher_4_superscalar_ops; +extern const fletcher_4_ops_t fletcher_4_superscalar4_ops; + +#if defined(HAVE_SSE2) +extern const fletcher_4_ops_t fletcher_4_sse2_ops; +#endif + +#if defined(HAVE_SSE2) && defined(HAVE_SSSE3) +extern const fletcher_4_ops_t fletcher_4_ssse3_ops; +#endif + +#if defined(HAVE_AVX) && defined(HAVE_AVX2) +extern const fletcher_4_ops_t fletcher_4_avx2_ops; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512F) +extern const fletcher_4_ops_t fletcher_4_avx512f_ops; +#endif + +#if defined(__x86_64) && defined(HAVE_AVX512BW) +extern const fletcher_4_ops_t fletcher_4_avx512bw_ops; +#endif + +#if defined(__aarch64__) +extern const fletcher_4_ops_t fletcher_4_aarch64_neon_ops; +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_FLETCHER_H */ diff --git a/include/zfs_namecheck.h b/include/zfs_namecheck.h new file mode 100644 index 000000000000..197c40b56e8c --- /dev/null +++ b/include/zfs_namecheck.h @@ -0,0 +1,71 @@ 
+/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + */ + +#ifndef _ZFS_NAMECHECK_H +#define _ZFS_NAMECHECK_H + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + NAME_ERR_LEADING_SLASH, /* name begins with leading slash */ + NAME_ERR_EMPTY_COMPONENT, /* name contains an empty component */ + NAME_ERR_TRAILING_SLASH, /* name ends with a slash */ + NAME_ERR_INVALCHAR, /* invalid character found */ + NAME_ERR_MULTIPLE_DELIMITERS, /* multiple '@'/'#' delimiters found */ + NAME_ERR_NOLETTER, /* pool doesn't begin with a letter */ + NAME_ERR_RESERVED, /* entire name is reserved */ + NAME_ERR_DISKLIKE, /* reserved disk name (c[0-9].*) */ + NAME_ERR_TOOLONG, /* name is too long */ + NAME_ERR_SELF_REF, /* reserved self path name ('.') */ + NAME_ERR_PARENT_REF, /* reserved parent path name ('..') */ + NAME_ERR_NO_AT, /* permission set is missing '@' */ + NAME_ERR_NO_POUND, /* permission set is missing '#' */ +} namecheck_err_t; + +#define ZFS_PERMSET_MAXLEN 64 + +extern int zfs_max_dataset_nesting; + +int get_dataset_depth(const char *); +int pool_namecheck(const char *, namecheck_err_t *, char *); +int entity_namecheck(const char *, namecheck_err_t *, char *); +int dataset_namecheck(const char *, namecheck_err_t *, char *); +int snapshot_namecheck(const char *, namecheck_err_t *, char *); +int bookmark_namecheck(const char *, namecheck_err_t *, char *); +int dataset_nestcheck(const char *); +int mountpoint_namecheck(const char *, namecheck_err_t *); +int zfs_component_namecheck(const char *, namecheck_err_t *, char *); +int permset_namecheck(const char *, namecheck_err_t *, char *); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_NAMECHECK_H */ diff --git a/include/zfs_prop.h b/include/zfs_prop.h new file mode 100644 index 000000000000..89b6a20243fb --- /dev/null +++ b/include/zfs_prop.h @@ -0,0 +1,133 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _ZFS_PROP_H +#define _ZFS_PROP_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * For index types (e.g. compression and checksum), we want the numeric value + * in the kernel, but the string value in userland. + */ +typedef enum { + PROP_TYPE_NUMBER, /* numeric value */ + PROP_TYPE_STRING, /* string value */ + PROP_TYPE_INDEX /* numeric value indexed by string */ +} zprop_type_t; + +typedef enum { + PROP_DEFAULT, + PROP_READONLY, + PROP_INHERIT, + /* + * ONETIME properties are a sort of conglomeration of READONLY + * and INHERIT. They can be set only during object creation, + * after that they are READONLY. If not explicitly set during + * creation, they can be inherited. ONETIME_DEFAULT properties + * work the same way, but they will default instead of + * inheriting a value. + */ + PROP_ONETIME, + PROP_ONETIME_DEFAULT +} zprop_attr_t; + +typedef struct zfs_index { + const char *pi_name; + uint64_t pi_value; +} zprop_index_t; + +typedef struct { + const char *pd_name; /* human-readable property name */ + int pd_propnum; /* property number */ + zprop_type_t pd_proptype; /* string, boolean, index, number */ + const char *pd_strdefault; /* default for strings */ + uint64_t pd_numdefault; /* for boolean / index / number */ + zprop_attr_t pd_attr; /* default, readonly, inherit */ + int pd_types; /* bitfield of valid dataset types */ + /* fs | vol | snap; or pool */ + const char *pd_values; /* string telling acceptable values */ + const char *pd_colname; /* column header for "zfs list" */ + boolean_t pd_rightalign; /* column alignment for "zfs list" */ + boolean_t pd_visible; /* do we list this property with the */ + /* "zfs get" help message */ + boolean_t pd_zfs_mod_supported; /* supported by running zfs module */ + const zprop_index_t *pd_table; /* for index properties, a table */ + /* defining the possible values */ + size_t pd_table_size; /* number of entries in pd_table[] */ +} zprop_desc_t; + +/* + * zfs dataset property functions + */ +void zfs_prop_init(void); +zprop_type_t zfs_prop_get_type(zfs_prop_t); +boolean_t zfs_prop_delegatable(zfs_prop_t prop); +zprop_desc_t *zfs_prop_get_table(void); + +/* + * zpool property functions + */ +void zpool_prop_init(void); +zprop_type_t zpool_prop_get_type(zpool_prop_t); +zprop_desc_t *zpool_prop_get_table(void); + +/* + * Common routines to initialize property tables + */ +void zprop_register_impl(int, const char *, zprop_type_t, uint64_t, + const char *, zprop_attr_t, int, const char *, const char *, + boolean_t, boolean_t, const zprop_index_t *); +void zprop_register_string(int, const char *, const char *, + zprop_attr_t attr, int, const char *, const char *); +void zprop_register_number(int, const char *, uint64_t, zprop_attr_t, int, + const char *, const char *); +void zprop_register_index(int, const char *, uint64_t, zprop_attr_t, int, + const char *, const char *, const zprop_index_t *); +void zprop_register_hidden(int, const char *, zprop_type_t, zprop_attr_t, + int, const char *); + +/* + * Common routines for zfs and zpool property management + */ +int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); +int zprop_name_to_prop(const char 
*, zfs_type_t); +int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); +int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); +uint64_t zprop_random_value(int, uint64_t, zfs_type_t); +const char *zprop_values(int, zfs_type_t); +size_t zprop_width(int, boolean_t *, zfs_type_t); +boolean_t zprop_valid_for_type(int, zfs_type_t, boolean_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_PROP_H */ diff --git a/lib/Makefile.am b/lib/Makefile.am new file mode 100644 index 000000000000..dda87e41c0aa --- /dev/null +++ b/lib/Makefile.am @@ -0,0 +1,14 @@ +# NB: GNU Automake Manual, Chapter 8.3.5: Libtool Convenience Libraries +# These nine libraries are intermediary build components. +SUBDIRS = libavl libicp libshare libspl libtpool libzstd + +if BUILD_LINUX +SUBDIRS += libefi +endif + +# libzutil depends on libefi if present +SUBDIRS += libzutil libunicode + +# These five libraries, which are installed as the final build product, +# incorporate the eight convenience libraries given above. +SUBDIRS += libuutil libnvpair libzfs_core libzfs libzpool diff --git a/lib/libavl/Makefile.am b/lib/libavl/Makefile.am new file mode 100644 index 000000000000..6087b1d2f4a0 --- /dev/null +++ b/lib/libavl/Makefile.am @@ -0,0 +1,14 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = $(top_srcdir)/module/avl/ + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +noinst_LTLIBRARIES = libavl.la + +KERNEL_C = \ + avl.c + +nodist_libavl_la_SOURCES = \ + $(KERNEL_C) diff --git a/lib/libefi/Makefile.am b/lib/libefi/Makefile.am new file mode 100644 index 000000000000..fab6c8d477a6 --- /dev/null +++ b/lib/libefi/Makefile.am @@ -0,0 +1,12 @@ +include $(top_srcdir)/config/Rules.am + +AM_CFLAGS += $(LIBUUID_CFLAGS) $(ZLIB_CFLAGS) + +noinst_LTLIBRARIES = libefi.la + +USER_C = \ + rdwr_efi.c + +libefi_la_SOURCES = $(USER_C) + +libefi_la_LIBADD = $(LIBUUID_LIBS) $(ZLIB_LIBS) diff --git a/lib/libefi/rdwr_efi.c b/lib/libefi/rdwr_efi.c new file mode 100644 index 000000000000..2cb093f9601d --- /dev/null +++ b/lib/libefi/rdwr_efi.c @@ -0,0 +1,1655 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2012 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. 
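+ */
+
+/*
+ * The conversion_array below maps Solaris V_* partition tags (used as
+ * the array index) to the corresponding EFI GPT partition-type GUIDs;
+ * entries marked EFI_UNUSED have no GPT equivalent.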
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static struct uuid_to_ptag { + struct uuid uuid; +} conversion_array[] = { + { EFI_UNUSED }, + { EFI_BOOT }, + { EFI_ROOT }, + { EFI_SWAP }, + { EFI_USR }, + { EFI_BACKUP }, + { EFI_UNUSED }, /* STAND is never used */ + { EFI_VAR }, + { EFI_HOME }, + { EFI_ALTSCTR }, + { EFI_UNUSED }, /* CACHE (cachefs) is never used */ + { EFI_RESERVED }, + { EFI_SYSTEM }, + { EFI_LEGACY_MBR }, + { EFI_SYMC_PUB }, + { EFI_SYMC_CDS }, + { EFI_MSFT_RESV }, + { EFI_DELL_BASIC }, + { EFI_DELL_RAID }, + { EFI_DELL_SWAP }, + { EFI_DELL_LVM }, + { EFI_DELL_RESV }, + { EFI_AAPL_HFS }, + { EFI_AAPL_UFS }, + { EFI_FREEBSD_BOOT }, + { EFI_FREEBSD_SWAP }, + { EFI_FREEBSD_UFS }, + { EFI_FREEBSD_VINUM }, + { EFI_FREEBSD_ZFS }, + { EFI_BIOS_BOOT }, + { EFI_INTC_RS }, + { EFI_SNE_BOOT }, + { EFI_LENOVO_BOOT }, + { EFI_MSFT_LDMM }, + { EFI_MSFT_LDMD }, + { EFI_MSFT_RE }, + { EFI_IBM_GPFS }, + { EFI_MSFT_STORAGESPACES }, + { EFI_HPQ_DATA }, + { EFI_HPQ_SVC }, + { EFI_RHT_DATA }, + { EFI_RHT_HOME }, + { EFI_RHT_SRV }, + { EFI_RHT_DMCRYPT }, + { EFI_RHT_LUKS }, + { EFI_FREEBSD_DISKLABEL }, + { EFI_AAPL_RAID }, + { EFI_AAPL_RAIDOFFLINE }, + { EFI_AAPL_BOOT }, + { EFI_AAPL_LABEL }, + { EFI_AAPL_TVRECOVERY }, + { EFI_AAPL_CORESTORAGE }, + { EFI_NETBSD_SWAP }, + { EFI_NETBSD_FFS }, + { EFI_NETBSD_LFS }, + { EFI_NETBSD_RAID }, + { EFI_NETBSD_CAT }, + { EFI_NETBSD_CRYPT }, + { EFI_GOOG_KERN }, + { EFI_GOOG_ROOT }, + { EFI_GOOG_RESV }, + { EFI_HAIKU_BFS }, + { EFI_MIDNIGHTBSD_BOOT }, + { EFI_MIDNIGHTBSD_DATA }, + { EFI_MIDNIGHTBSD_SWAP }, + { EFI_MIDNIGHTBSD_UFS }, + { EFI_MIDNIGHTBSD_VINUM }, + { EFI_MIDNIGHTBSD_ZFS }, + { EFI_CEPH_JOURNAL }, + { EFI_CEPH_DMCRYPTJOURNAL }, + { EFI_CEPH_OSD }, + { EFI_CEPH_DMCRYPTOSD }, + { EFI_CEPH_CREATE }, + { EFI_CEPH_DMCRYPTCREATE }, + { EFI_OPENBSD_DISKLABEL }, + { EFI_BBRY_QNX }, + { EFI_BELL_PLAN9 }, + { EFI_VMW_KCORE }, + { EFI_VMW_VMFS }, + { EFI_VMW_RESV }, + { EFI_RHT_ROOTX86 }, + { EFI_RHT_ROOTAMD64 }, + { EFI_RHT_ROOTARM }, + { EFI_RHT_ROOTARM64 }, + { EFI_ACRONIS_SECUREZONE }, + { EFI_ONIE_BOOT }, + { EFI_ONIE_CONFIG }, + { EFI_IBM_PPRPBOOT }, + { EFI_FREEDESKTOP_BOOT } +}; + +/* + * Default vtoc information for non-SVr4 partitions + */ +struct dk_map2 default_vtoc_map[NDKMAP] = { + { V_ROOT, 0 }, /* a - 0 */ + { V_SWAP, V_UNMNT }, /* b - 1 */ + { V_BACKUP, V_UNMNT }, /* c - 2 */ + { V_UNASSIGNED, 0 }, /* d - 3 */ + { V_UNASSIGNED, 0 }, /* e - 4 */ + { V_UNASSIGNED, 0 }, /* f - 5 */ + { V_USR, 0 }, /* g - 6 */ + { V_UNASSIGNED, 0 }, /* h - 7 */ + +#if defined(_SUNOS_VTOC_16) + +#if defined(i386) || defined(__amd64) || defined(__arm) || \ + defined(__powerpc) || defined(__sparc) || defined(__s390__) || \ + defined(__mips__) || defined(__rv64g__) + { V_BOOT, V_UNMNT }, /* i - 8 */ + { V_ALTSCTR, 0 }, /* j - 9 */ + +#else +#error No VTOC format defined. +#endif /* defined(i386) */ + + { V_UNASSIGNED, 0 }, /* k - 10 */ + { V_UNASSIGNED, 0 }, /* l - 11 */ + { V_UNASSIGNED, 0 }, /* m - 12 */ + { V_UNASSIGNED, 0 }, /* n - 13 */ + { V_UNASSIGNED, 0 }, /* o - 14 */ + { V_UNASSIGNED, 0 }, /* p - 15 */ +#endif /* defined(_SUNOS_VTOC_16) */ +}; + +int efi_debug = 0; + +static int efi_read(int, struct dk_gpt *); + +/* + * Return a 32-bit CRC of the contents of the buffer. Pre-and-post + * one's conditioning will be handled by crc32() internally. 
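+ * (crc32() here is zlib's implementation; seeding with
+ * crc32(0, Z_NULL, 0) yields the standard pre/post-conditioned CRC-32
+ * that the GPT on-disk format expects.)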
+ */
+static uint32_t
+efi_crc32(const unsigned char *buf, unsigned int size)
+{
+	uint32_t crc = crc32(0, Z_NULL, 0);
+
+	crc = crc32(crc, buf, size);
+
+	return (crc);
+}
+
+static int
+read_disk_info(int fd, diskaddr_t *capacity, uint_t *lbsize)
+{
+	int sector_size;
+	unsigned long long capacity_size;
+
+	if (ioctl(fd, BLKSSZGET, &sector_size) < 0)
+		return (-1);
+
+	if (ioctl(fd, BLKGETSIZE64, &capacity_size) < 0)
+		return (-1);
+
+	*lbsize = (uint_t)sector_size;
+	*capacity = (diskaddr_t)(capacity_size / sector_size);
+
+	return (0);
+}
+
+static int
+efi_get_info(int fd, struct dk_cinfo *dki_info)
+{
+	char *path;
+	char *dev_path;
+	int rval = 0;
+
+	memset(dki_info, 0, sizeof (*dki_info));
+
+	path = calloc(1, PATH_MAX);
+	if (path == NULL)
+		goto error;
+
+	/*
+	 * The simplest way to get the partition number under Linux is
+	 * to parse it out of the /dev/<name><partition> block device name.
+	 * The kernel creates this using the partition number when it
+	 * populates /dev/ so it may be trusted. The tricky bit here is
+	 * that the naming convention is based on the block device type.
+	 * So we need to take this into account when parsing out the
+	 * partition information. Another issue is that the libefi API
+	 * only provides the open fd and not the file path. To handle
+	 * this, realpath(3) is used to resolve the block device name from
+	 * /proc/self/fd/<fd>. Aside from the partition number we collect
+	 * some additional device info.
+	 */
+	(void) sprintf(path, "/proc/self/fd/%d", fd);
+	dev_path = realpath(path, NULL);
+	free(path);
+
+	if (dev_path == NULL)
+		goto error;
+
+	if ((strncmp(dev_path, "/dev/sd", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "sd");
+		dki_info->dki_ctype = DKC_SCSI_CCS;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
+		    dki_info->dki_dname,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/hd", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "hd");
+		dki_info->dki_ctype = DKC_DIRECT;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
+		    dki_info->dki_dname,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/md", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_MD;
+		strcpy(dki_info->dki_dname, "md");
+		rval = sscanf(dev_path, "/dev/md%[0-9]p%hu",
+		    dki_info->dki_dname + 2,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/vd", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "vd");
+		dki_info->dki_ctype = DKC_MD;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
+		    dki_info->dki_dname,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/xvd", 8) == 0)) {
+		strcpy(dki_info->dki_cname, "xvd");
+		dki_info->dki_ctype = DKC_MD;
+		rval = sscanf(dev_path, "/dev/%[a-zA-Z]%hu",
+		    dki_info->dki_dname,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/zd", 7) == 0)) {
+		strcpy(dki_info->dki_cname, "zd");
+		dki_info->dki_ctype = DKC_MD;
+		strcpy(dki_info->dki_dname, "zd");
+		rval = sscanf(dev_path, "/dev/zd%[0-9]p%hu",
+		    dki_info->dki_dname + 2,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/dm-", 8) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_VBD;
+		strcpy(dki_info->dki_dname, "dm-");
+		rval = sscanf(dev_path, "/dev/dm-%[0-9]p%hu",
+		    dki_info->dki_dname + 3,
+		    &dki_info->dki_partition);
+	} else if ((strncmp(dev_path, "/dev/ram", 8) == 0)) {
+		strcpy(dki_info->dki_cname, "pseudo");
+		dki_info->dki_ctype = DKC_PCMCIA_MEM;
+		strcpy(dki_info->dki_dname, "ram");
+		rval = sscanf(dev_path, "/dev/ram%[0-9]p%hu",
+		    dki_info->dki_dname + 3,
+
&dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/loop", 9) == 0)) { + strcpy(dki_info->dki_cname, "pseudo"); + dki_info->dki_ctype = DKC_VBD; + strcpy(dki_info->dki_dname, "loop"); + rval = sscanf(dev_path, "/dev/loop%[0-9]p%hu", + dki_info->dki_dname + 4, + &dki_info->dki_partition); + } else if ((strncmp(dev_path, "/dev/nvme", 9) == 0)) { + strcpy(dki_info->dki_cname, "nvme"); + dki_info->dki_ctype = DKC_SCSI_CCS; + strcpy(dki_info->dki_dname, "nvme"); + (void) sscanf(dev_path, "/dev/nvme%[0-9]", + dki_info->dki_dname + 4); + size_t controller_length = strlen( + dki_info->dki_dname); + strcpy(dki_info->dki_dname + controller_length, + "n"); + rval = sscanf(dev_path, + "/dev/nvme%*[0-9]n%[0-9]p%hu", + dki_info->dki_dname + controller_length + 1, + &dki_info->dki_partition); + } else { + strcpy(dki_info->dki_dname, "unknown"); + strcpy(dki_info->dki_cname, "unknown"); + dki_info->dki_ctype = DKC_UNKNOWN; + } + + switch (rval) { + case 0: + errno = EINVAL; + goto error; + case 1: + dki_info->dki_partition = 0; + } + + free(dev_path); + + return (0); +error: + if (efi_debug) + (void) fprintf(stderr, "DKIOCINFO errno 0x%x\n", errno); + + switch (errno) { + case EIO: + return (VT_EIO); + case EINVAL: + return (VT_EINVAL); + default: + return (VT_ERROR); + } +} + +/* + * the number of blocks the EFI label takes up (round up to nearest + * block) + */ +#define NBLOCKS(p, l) (1 + ((((p) * (int)sizeof (efi_gpe_t)) + \ + ((l) - 1)) / (l))) +/* number of partitions -- limited by what we can malloc */ +#define MAX_PARTS ((4294967295UL - sizeof (struct dk_gpt)) / \ + sizeof (struct dk_part)) + +int +efi_alloc_and_init(int fd, uint32_t nparts, struct dk_gpt **vtoc) +{ + diskaddr_t capacity = 0; + uint_t lbsize = 0; + uint_t nblocks; + size_t length; + struct dk_gpt *vptr; + struct uuid uuid; + struct dk_cinfo dki_info; + + if (read_disk_info(fd, &capacity, &lbsize) != 0) + return (-1); + + if (efi_get_info(fd, &dki_info) != 0) + return (-1); + + if (dki_info.dki_partition != 0) + return (-1); + + if ((dki_info.dki_ctype == DKC_PCMCIA_MEM) || + (dki_info.dki_ctype == DKC_VBD) || + (dki_info.dki_ctype == DKC_UNKNOWN)) + return (-1); + + nblocks = NBLOCKS(nparts, lbsize); + if ((nblocks * lbsize) < EFI_MIN_ARRAY_SIZE + lbsize) { + /* 16K plus one block for the GPT */ + nblocks = EFI_MIN_ARRAY_SIZE / lbsize + 1; + } + + if (nparts > MAX_PARTS) { + if (efi_debug) { + (void) fprintf(stderr, + "the maximum number of partitions supported is %lu\n", + MAX_PARTS); + } + return (-1); + } + + length = sizeof (struct dk_gpt) + + sizeof (struct dk_part) * (nparts - 1); + + vptr = calloc(1, length); + if (vptr == NULL) + return (-1); + + *vtoc = vptr; + + vptr->efi_version = EFI_VERSION_CURRENT; + vptr->efi_lbasize = lbsize; + vptr->efi_nparts = nparts; + /* + * add one block here for the PMBR; on disks with a 512 byte + * block size and 128 or fewer partitions, efi_first_u_lba + * should work out to "34" + */ + vptr->efi_first_u_lba = nblocks + 1; + vptr->efi_last_lba = capacity - 1; + vptr->efi_altern_lba = capacity -1; + vptr->efi_last_u_lba = vptr->efi_last_lba - nblocks; + + (void) uuid_generate((uchar_t *)&uuid); + UUID_LE_CONVERT(vptr->efi_disk_uguid, uuid); + return (0); +} + +/* + * Read EFI - return partition number upon success. 
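+ *
+ * The entry array is initially sized for as many partitions as fit in
+ * the EFI-minimum 16K table; if efi_read() returns VT_EINVAL because the
+ * on-disk GPT advertises a larger efi_nparts, the buffer is grown and
+ * the read retried once.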
+ */ +int +efi_alloc_and_read(int fd, struct dk_gpt **vtoc) +{ + int rval; + uint32_t nparts; + int length; + struct dk_gpt *vptr; + + /* figure out the number of entries that would fit into 16K */ + nparts = EFI_MIN_ARRAY_SIZE / sizeof (efi_gpe_t); + length = (int) sizeof (struct dk_gpt) + + (int) sizeof (struct dk_part) * (nparts - 1); + vptr = calloc(1, length); + + if (vptr == NULL) + return (VT_ERROR); + + vptr->efi_nparts = nparts; + rval = efi_read(fd, vptr); + + if ((rval == VT_EINVAL) && vptr->efi_nparts > nparts) { + void *tmp; + length = (int) sizeof (struct dk_gpt) + + (int) sizeof (struct dk_part) * (vptr->efi_nparts - 1); + nparts = vptr->efi_nparts; + if ((tmp = realloc(vptr, length)) == NULL) { + free(vptr); + *vtoc = NULL; + return (VT_ERROR); + } else { + vptr = tmp; + rval = efi_read(fd, vptr); + } + } + + if (rval < 0) { + if (efi_debug) { + (void) fprintf(stderr, + "read of EFI table failed, rval=%d\n", rval); + } + free(vptr); + *vtoc = NULL; + } else { + *vtoc = vptr; + } + + return (rval); +} + +static int +efi_ioctl(int fd, int cmd, dk_efi_t *dk_ioc) +{ + void *data = dk_ioc->dki_data; + int error; + diskaddr_t capacity; + uint_t lbsize; + + /* + * When the IO is not being performed in kernel as an ioctl we need + * to know the sector size so we can seek to the proper byte offset. + */ + if (read_disk_info(fd, &capacity, &lbsize) == -1) { + if (efi_debug) + fprintf(stderr, "unable to read disk info: %d", errno); + + errno = EIO; + return (-1); + } + + switch (cmd) { + case DKIOCGETEFI: + if (lbsize == 0) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI assuming " + "LBA %d bytes\n", DEV_BSIZE); + + lbsize = DEV_BSIZE; + } + + error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI lseek " + "error: %d\n", errno); + return (error); + } + + error = read(fd, data, dk_ioc->dki_length); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI read " + "error: %d\n", errno); + return (error); + } + + if (error != dk_ioc->dki_length) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCGETEFI short " + "read of %d bytes\n", error); + errno = EIO; + return (-1); + } + error = 0; + break; + + case DKIOCSETEFI: + if (lbsize == 0) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI unknown " + "LBA size\n"); + errno = EIO; + return (-1); + } + + error = lseek(fd, dk_ioc->dki_lba * lbsize, SEEK_SET); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI lseek " + "error: %d\n", errno); + return (error); + } + + error = write(fd, data, dk_ioc->dki_length); + if (error == -1) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI write " + "error: %d\n", errno); + return (error); + } + + if (error != dk_ioc->dki_length) { + if (efi_debug) + (void) fprintf(stderr, "DKIOCSETEFI short " + "write of %d bytes\n", error); + errno = EIO; + return (-1); + } + + /* Sync the new EFI table to disk */ + error = fsync(fd); + if (error == -1) + return (error); + + /* Ensure any local disk cache is also flushed */ + if (ioctl(fd, BLKFLSBUF, 0) == -1) + return (error); + + error = 0; + break; + + default: + if (efi_debug) + (void) fprintf(stderr, "unsupported ioctl()\n"); + + errno = EIO; + return (-1); + } + + return (error); +} + +int +efi_rescan(int fd) +{ + int retry = 10; + int error; + + /* Notify the kernel a devices partition table has been updated */ + while ((error = ioctl(fd, BLKRRPART)) != 0) { + if ((--retry == 0) || (errno != EBUSY)) { + (void) 
fprintf(stderr, "the kernel failed to rescan " + "the partition table: %d\n", errno); + return (-1); + } + usleep(50000); + } + + return (0); +} + +static int +check_label(int fd, dk_efi_t *dk_ioc) +{ + efi_gpt_t *efi; + uint_t crc; + + if (efi_ioctl(fd, DKIOCGETEFI, dk_ioc) == -1) { + switch (errno) { + case EIO: + return (VT_EIO); + default: + return (VT_ERROR); + } + } + efi = dk_ioc->dki_data; + if (efi->efi_gpt_Signature != LE_64(EFI_SIGNATURE)) { + if (efi_debug) + (void) fprintf(stderr, + "Bad EFI signature: 0x%llx != 0x%llx\n", + (long long)efi->efi_gpt_Signature, + (long long)LE_64(EFI_SIGNATURE)); + return (VT_EINVAL); + } + + /* + * check CRC of the header; the size of the header should + * never be larger than one block + */ + crc = efi->efi_gpt_HeaderCRC32; + efi->efi_gpt_HeaderCRC32 = 0; + len_t headerSize = (len_t)LE_32(efi->efi_gpt_HeaderSize); + + if (headerSize < EFI_MIN_LABEL_SIZE || headerSize > EFI_LABEL_SIZE) { + if (efi_debug) + (void) fprintf(stderr, + "Invalid EFI HeaderSize %llu. Assuming %d.\n", + headerSize, EFI_MIN_LABEL_SIZE); + } + + if ((headerSize > dk_ioc->dki_length) || + crc != LE_32(efi_crc32((unsigned char *)efi, headerSize))) { + if (efi_debug) + (void) fprintf(stderr, + "Bad EFI CRC: 0x%x != 0x%x\n", + crc, LE_32(efi_crc32((unsigned char *)efi, + headerSize))); + return (VT_EINVAL); + } + + return (0); +} + +static int +efi_read(int fd, struct dk_gpt *vtoc) +{ + int i, j; + int label_len; + int rval = 0; + int md_flag = 0; + int vdc_flag = 0; + diskaddr_t capacity = 0; + uint_t lbsize = 0; + struct dk_minfo disk_info; + dk_efi_t dk_ioc; + efi_gpt_t *efi; + efi_gpe_t *efi_parts; + struct dk_cinfo dki_info; + uint32_t user_length; + boolean_t legacy_label = B_FALSE; + + /* + * get the partition number for this file descriptor. + */ + if ((rval = efi_get_info(fd, &dki_info)) != 0) + return (rval); + + if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && + (strncmp(dki_info.dki_dname, "md", 3) == 0)) { + md_flag++; + } else if ((strncmp(dki_info.dki_cname, "vdc", 4) == 0) && + (strncmp(dki_info.dki_dname, "vdc", 4) == 0)) { + /* + * The controller and drive name "vdc" (virtual disk client) + * indicates a LDoms virtual disk. + */ + vdc_flag++; + } + + /* get the LBA size */ + if (read_disk_info(fd, &capacity, &lbsize) == -1) { + if (efi_debug) { + (void) fprintf(stderr, + "unable to read disk info: %d", + errno); + } + return (VT_EINVAL); + } + + disk_info.dki_lbsize = lbsize; + disk_info.dki_capacity = capacity; + + if (disk_info.dki_lbsize == 0) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_read: assuming LBA 512 bytes\n"); + } + disk_info.dki_lbsize = DEV_BSIZE; + } + /* + * Read the EFI GPT to figure out how many partitions we need + * to deal with. 
+ */ + dk_ioc.dki_lba = 1; + if (NBLOCKS(vtoc->efi_nparts, disk_info.dki_lbsize) < 34) { + label_len = EFI_MIN_ARRAY_SIZE + disk_info.dki_lbsize; + } else { + label_len = vtoc->efi_nparts * (int) sizeof (efi_gpe_t) + + disk_info.dki_lbsize; + if (label_len % disk_info.dki_lbsize) { + /* pad to physical sector size */ + label_len += disk_info.dki_lbsize; + label_len &= ~(disk_info.dki_lbsize - 1); + } + } + + if (posix_memalign((void **)&dk_ioc.dki_data, + disk_info.dki_lbsize, label_len)) + return (VT_ERROR); + + memset(dk_ioc.dki_data, 0, label_len); + dk_ioc.dki_length = disk_info.dki_lbsize; + user_length = vtoc->efi_nparts; + efi = dk_ioc.dki_data; + if (md_flag) { + dk_ioc.dki_length = label_len; + if (efi_ioctl(fd, DKIOCGETEFI, &dk_ioc) == -1) { + switch (errno) { + case EIO: + return (VT_EIO); + default: + return (VT_ERROR); + } + } + } else if ((rval = check_label(fd, &dk_ioc)) == VT_EINVAL) { + /* + * No valid label here; try the alternate. Note that here + * we just read GPT header and save it into dk_ioc.data, + * Later, we will read GUID partition entry array if we + * can get valid GPT header. + */ + + /* + * This is a workaround for legacy systems. In the past, the + * last sector of SCSI disk was invisible on x86 platform. At + * that time, backup label was saved on the next to the last + * sector. It is possible for users to move a disk from previous + * solaris system to present system. Here, we attempt to search + * legacy backup EFI label first. + */ + dk_ioc.dki_lba = disk_info.dki_capacity - 2; + dk_ioc.dki_length = disk_info.dki_lbsize; + rval = check_label(fd, &dk_ioc); + if (rval == VT_EINVAL) { + /* + * we didn't find legacy backup EFI label, try to + * search backup EFI label in the last block. + */ + dk_ioc.dki_lba = disk_info.dki_capacity - 1; + dk_ioc.dki_length = disk_info.dki_lbsize; + rval = check_label(fd, &dk_ioc); + if (rval == 0) { + legacy_label = B_TRUE; + if (efi_debug) + (void) fprintf(stderr, + "efi_read: primary label corrupt; " + "using EFI backup label located on" + " the last block\n"); + } + } else { + if ((efi_debug) && (rval == 0)) + (void) fprintf(stderr, "efi_read: primary label" + " corrupt; using legacy EFI backup label " + " located on the next to last block\n"); + } + + if (rval == 0) { + dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA); + vtoc->efi_flags |= EFI_GPT_PRIMARY_CORRUPT; + vtoc->efi_nparts = + LE_32(efi->efi_gpt_NumberOfPartitionEntries); + /* + * Partition tables are between backup GPT header + * table and ParitionEntryLBA (the starting LBA of + * the GUID partition entries array). Now that we + * already got valid GPT header and saved it in + * dk_ioc.dki_data, we try to get GUID partition + * entry array here. 
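+ *
+ * Layout sketch for the two cases handled above (N = dki_capacity):
+ *
+ *	header found at N-1: entry array spans PartitionEntryLBA .. N-2
+ *	header found at N-2: entry array spans PartitionEntryLBA .. N-3
+ *
+ * which is exactly what the two dki_length computations below encode.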
+ */ + /* LINTED */ + dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data + + disk_info.dki_lbsize); + if (legacy_label) + dk_ioc.dki_length = disk_info.dki_capacity - 1 - + dk_ioc.dki_lba; + else + dk_ioc.dki_length = disk_info.dki_capacity - 2 - + dk_ioc.dki_lba; + dk_ioc.dki_length *= disk_info.dki_lbsize; + if (dk_ioc.dki_length > + ((len_t)label_len - sizeof (*dk_ioc.dki_data))) { + rval = VT_EINVAL; + } else { + /* + * read GUID partition entry array + */ + rval = efi_ioctl(fd, DKIOCGETEFI, &dk_ioc); + } + } + + } else if (rval == 0) { + + dk_ioc.dki_lba = LE_64(efi->efi_gpt_PartitionEntryLBA); + /* LINTED */ + dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data + + disk_info.dki_lbsize); + dk_ioc.dki_length = label_len - disk_info.dki_lbsize; + rval = efi_ioctl(fd, DKIOCGETEFI, &dk_ioc); + + } else if (vdc_flag && rval == VT_ERROR && errno == EINVAL) { + /* + * When the device is a LDoms virtual disk, the DKIOCGETEFI + * ioctl can fail with EINVAL if the virtual disk backend + * is a ZFS volume serviced by a domain running an old version + * of Solaris. This is because the DKIOCGETEFI ioctl was + * initially incorrectly implemented for a ZFS volume and it + * expected the GPT and GPE to be retrieved with a single ioctl. + * So we try to read the GPT and the GPE using that old style + * ioctl. + */ + dk_ioc.dki_lba = 1; + dk_ioc.dki_length = label_len; + rval = check_label(fd, &dk_ioc); + } + + if (rval < 0) { + free(efi); + return (rval); + } + + /* LINTED -- always longlong aligned */ + efi_parts = (efi_gpe_t *)(((char *)efi) + disk_info.dki_lbsize); + + /* + * Assemble this into a "dk_gpt" struct for easier + * digestibility by applications. + */ + vtoc->efi_version = LE_32(efi->efi_gpt_Revision); + vtoc->efi_nparts = LE_32(efi->efi_gpt_NumberOfPartitionEntries); + vtoc->efi_part_size = LE_32(efi->efi_gpt_SizeOfPartitionEntry); + vtoc->efi_lbasize = disk_info.dki_lbsize; + vtoc->efi_last_lba = disk_info.dki_capacity - 1; + vtoc->efi_first_u_lba = LE_64(efi->efi_gpt_FirstUsableLBA); + vtoc->efi_last_u_lba = LE_64(efi->efi_gpt_LastUsableLBA); + vtoc->efi_altern_lba = LE_64(efi->efi_gpt_AlternateLBA); + UUID_LE_CONVERT(vtoc->efi_disk_uguid, efi->efi_gpt_DiskGUID); + + /* + * If the array the user passed in is too small, set the length + * to what it needs to be and return + */ + if (user_length < vtoc->efi_nparts) { + return (VT_EINVAL); + } + + for (i = 0; i < vtoc->efi_nparts; i++) { + + UUID_LE_CONVERT(vtoc->efi_parts[i].p_guid, + efi_parts[i].efi_gpe_PartitionTypeGUID); + + for (j = 0; + j < sizeof (conversion_array) + / sizeof (struct uuid_to_ptag); j++) { + + if (bcmp(&vtoc->efi_parts[i].p_guid, + &conversion_array[j].uuid, + sizeof (struct uuid)) == 0) { + vtoc->efi_parts[i].p_tag = j; + break; + } + } + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) + continue; + vtoc->efi_parts[i].p_flag = + LE_16(efi_parts[i].efi_gpe_Attributes.PartitionAttrs); + vtoc->efi_parts[i].p_start = + LE_64(efi_parts[i].efi_gpe_StartingLBA); + vtoc->efi_parts[i].p_size = + LE_64(efi_parts[i].efi_gpe_EndingLBA) - + vtoc->efi_parts[i].p_start + 1; + for (j = 0; j < EFI_PART_NAME_LEN; j++) { + vtoc->efi_parts[i].p_name[j] = + (uchar_t)LE_16( + efi_parts[i].efi_gpe_PartitionName[j]); + } + + UUID_LE_CONVERT(vtoc->efi_parts[i].p_uguid, + efi_parts[i].efi_gpe_UniquePartitionGUID); + } + free(efi); + + return (dki_info.dki_partition); +} + +/* writes a "protective" MBR */ +static int +write_pmbr(int fd, struct dk_gpt *vtoc) +{ + dk_efi_t dk_ioc; + struct mboot mb; + uchar_t *cp; + diskaddr_t 
size_in_lba; + uchar_t *buf; + int len; + + len = (vtoc->efi_lbasize == 0) ? sizeof (mb) : vtoc->efi_lbasize; + if (posix_memalign((void **)&buf, len, len)) + return (VT_ERROR); + + /* + * Preserve any boot code and disk signature if the first block is + * already an MBR. + */ + memset(buf, 0, len); + dk_ioc.dki_lba = 0; + dk_ioc.dki_length = len; + /* LINTED -- always longlong aligned */ + dk_ioc.dki_data = (efi_gpt_t *)buf; + if (efi_ioctl(fd, DKIOCGETEFI, &dk_ioc) == -1) { + (void) memcpy(&mb, buf, sizeof (mb)); + bzero(&mb, sizeof (mb)); + mb.signature = LE_16(MBB_MAGIC); + } else { + (void) memcpy(&mb, buf, sizeof (mb)); + if (mb.signature != LE_16(MBB_MAGIC)) { + bzero(&mb, sizeof (mb)); + mb.signature = LE_16(MBB_MAGIC); + } + } + + bzero(&mb.parts, sizeof (mb.parts)); + cp = (uchar_t *)&mb.parts[0]; + /* bootable or not */ + *cp++ = 0; + /* beginning CHS; 0xffffff if not representable */ + *cp++ = 0xff; + *cp++ = 0xff; + *cp++ = 0xff; + /* OS type */ + *cp++ = EFI_PMBR; + /* ending CHS; 0xffffff if not representable */ + *cp++ = 0xff; + *cp++ = 0xff; + *cp++ = 0xff; + /* starting LBA: 1 (little endian format) by EFI definition */ + *cp++ = 0x01; + *cp++ = 0x00; + *cp++ = 0x00; + *cp++ = 0x00; + /* ending LBA: last block on the disk (little endian format) */ + size_in_lba = vtoc->efi_last_lba; + if (size_in_lba < 0xffffffff) { + *cp++ = (size_in_lba & 0x000000ff); + *cp++ = (size_in_lba & 0x0000ff00) >> 8; + *cp++ = (size_in_lba & 0x00ff0000) >> 16; + *cp++ = (size_in_lba & 0xff000000) >> 24; + } else { + *cp++ = 0xff; + *cp++ = 0xff; + *cp++ = 0xff; + *cp++ = 0xff; + } + + (void) memcpy(buf, &mb, sizeof (mb)); + /* LINTED -- always longlong aligned */ + dk_ioc.dki_data = (efi_gpt_t *)buf; + dk_ioc.dki_lba = 0; + dk_ioc.dki_length = len; + if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { + free(buf); + switch (errno) { + case EIO: + return (VT_EIO); + case EINVAL: + return (VT_EINVAL); + default: + return (VT_ERROR); + } + } + free(buf); + return (0); +} + +/* make sure the user specified something reasonable */ +static int +check_input(struct dk_gpt *vtoc) +{ + int resv_part = -1; + int i, j; + diskaddr_t istart, jstart, isize, jsize, endsect; + + /* + * Sanity-check the input (make sure no partitions overlap) + */ + for (i = 0; i < vtoc->efi_nparts; i++) { + /* It can't be unassigned and have an actual size */ + if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) && + (vtoc->efi_parts[i].p_size != 0)) { + if (efi_debug) { + (void) fprintf(stderr, "partition %d is " + "\"unassigned\" but has a size of %llu", + i, vtoc->efi_parts[i].p_size); + } + return (VT_EINVAL); + } + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) { + if (uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_guid)) + continue; + /* we have encountered an unknown uuid */ + vtoc->efi_parts[i].p_tag = 0xff; + } + if (vtoc->efi_parts[i].p_tag == V_RESERVED) { + if (resv_part != -1) { + if (efi_debug) { + (void) fprintf(stderr, "found " + "duplicate reserved partition " + "at %d\n", i); + } + return (VT_EINVAL); + } + resv_part = i; + } + if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) || + (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) { + if (efi_debug) { + (void) fprintf(stderr, + "Partition %d starts at %llu. 
", + i, + vtoc->efi_parts[i].p_start); + (void) fprintf(stderr, + "It must be between %llu and %llu.\n", + vtoc->efi_first_u_lba, + vtoc->efi_last_u_lba); + } + return (VT_EINVAL); + } + if ((vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size < + vtoc->efi_first_u_lba) || + (vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size > + vtoc->efi_last_u_lba + 1)) { + if (efi_debug) { + (void) fprintf(stderr, + "Partition %d ends at %llu. ", + i, + vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size); + (void) fprintf(stderr, + "It must be between %llu and %llu.\n", + vtoc->efi_first_u_lba, + vtoc->efi_last_u_lba); + } + return (VT_EINVAL); + } + + for (j = 0; j < vtoc->efi_nparts; j++) { + isize = vtoc->efi_parts[i].p_size; + jsize = vtoc->efi_parts[j].p_size; + istart = vtoc->efi_parts[i].p_start; + jstart = vtoc->efi_parts[j].p_start; + if ((i != j) && (isize != 0) && (jsize != 0)) { + endsect = jstart + jsize -1; + if ((jstart <= istart) && + (istart <= endsect)) { + if (efi_debug) { + (void) fprintf(stderr, + "Partition %d overlaps " + "partition %d.", i, j); + } + return (VT_EINVAL); + } + } + } + } + /* just a warning for now */ + if ((resv_part == -1) && efi_debug) { + (void) fprintf(stderr, + "no reserved partition found\n"); + } + return (0); +} + +/* + * add all the unallocated space to the current label + */ +int +efi_use_whole_disk(int fd) +{ + struct dk_gpt *efi_label = NULL; + int rval; + int i; + uint_t resv_index = 0, data_index = 0; + diskaddr_t resv_start = 0, data_start = 0; + diskaddr_t data_size, limit, difference; + boolean_t sync_needed = B_FALSE; + uint_t nblocks; + + rval = efi_alloc_and_read(fd, &efi_label); + if (rval < 0) { + if (efi_label != NULL) + efi_free(efi_label); + return (rval); + } + + /* + * Find the last physically non-zero partition. + * This should be the reserved partition. + */ + for (i = 0; i < efi_label->efi_nparts; i ++) { + if (resv_start < efi_label->efi_parts[i].p_start) { + resv_start = efi_label->efi_parts[i].p_start; + resv_index = i; + } + } + + /* + * Find the last physically non-zero partition before that. + * This is the data partition. + */ + for (i = 0; i < resv_index; i ++) { + if (data_start < efi_label->efi_parts[i].p_start) { + data_start = efi_label->efi_parts[i].p_start; + data_index = i; + } + } + data_size = efi_label->efi_parts[data_index].p_size; + + /* + * See the "efi_alloc_and_init" function for more information + * about where this "nblocks" value comes from. + */ + nblocks = efi_label->efi_first_u_lba - 1; + + /* + * Determine if the EFI label is out of sync. We check that: + * + * 1. the data partition ends at the limit we set, and + * 2. the reserved partition starts at the limit we set. + * + * If either of these conditions is not met, then we need to + * resync the EFI label. + * + * The limit is the last usable LBA, determined by the last LBA + * and the first usable LBA fields on the EFI label of the disk + * (see the lines directly above). Additionally, we factor in + * EFI_MIN_RESV_SIZE (per its use in "zpool_label_disk") and + * P2ALIGN it to ensure the partition boundaries are aligned + * (for performance reasons). The alignment should match the + * alignment used by the "zpool_label_disk" function. 
+ */ + limit = P2ALIGN(efi_label->efi_last_lba - nblocks - EFI_MIN_RESV_SIZE, + PARTITION_END_ALIGNMENT); + if (data_start + data_size != limit || resv_start != limit) + sync_needed = B_TRUE; + + if (efi_debug && sync_needed) + (void) fprintf(stderr, "efi_use_whole_disk: sync needed\n"); + + /* + * If alter_lba is 1, we are using the backup label. + * Since we can locate the backup label by disk capacity, + * there must be no unallocated space. + */ + if ((efi_label->efi_altern_lba == 1) || (efi_label->efi_altern_lba + >= efi_label->efi_last_lba && !sync_needed)) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: requested space not found\n"); + } + efi_free(efi_label); + return (VT_ENOSPC); + } + + /* + * Verify that we've found the reserved partition by checking + * that it looks the way it did when we created it in zpool_label_disk. + * If we've found the incorrect partition, then we know that this + * device was reformatted and no longer is solely used by ZFS. + */ + if ((efi_label->efi_parts[resv_index].p_size != EFI_MIN_RESV_SIZE) || + (efi_label->efi_parts[resv_index].p_tag != V_RESERVED) || + (resv_index != 8)) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: wholedisk not available\n"); + } + efi_free(efi_label); + return (VT_ENOSPC); + } + + if (data_start + data_size != resv_start) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: " + "data_start (%lli) + " + "data_size (%lli) != " + "resv_start (%lli)\n", + data_start, data_size, resv_start); + } + + return (VT_EINVAL); + } + + if (limit < resv_start) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk: " + "limit (%lli) < resv_start (%lli)\n", + limit, resv_start); + } + + return (VT_EINVAL); + } + + difference = limit - resv_start; + + if (efi_debug) + (void) fprintf(stderr, + "efi_use_whole_disk: difference is %lli\n", difference); + + /* + * Move the reserved partition. There is currently no data in + * here except fabricated devids (which get generated via + * efi_write()). So there is no need to copy data. 
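+ *
+ * Before/after sketch of the adjustment below:
+ *
+ *	before:	| data | unallocated (difference) | resv | label |
+ *	after:	| data + difference              | resv | label |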
+ */ + efi_label->efi_parts[data_index].p_size += difference; + efi_label->efi_parts[resv_index].p_start += difference; + efi_label->efi_last_u_lba = efi_label->efi_last_lba - nblocks; + + rval = efi_write(fd, efi_label); + if (rval < 0) { + if (efi_debug) { + (void) fprintf(stderr, + "efi_use_whole_disk:fail to write label, rval=%d\n", + rval); + } + efi_free(efi_label); + return (rval); + } + + efi_free(efi_label); + return (0); +} + +/* + * write EFI label and backup label + */ +int +efi_write(int fd, struct dk_gpt *vtoc) +{ + dk_efi_t dk_ioc; + efi_gpt_t *efi; + efi_gpe_t *efi_parts; + int i, j; + struct dk_cinfo dki_info; + int rval; + int md_flag = 0; + int nblocks; + diskaddr_t lba_backup_gpt_hdr; + + if ((rval = efi_get_info(fd, &dki_info)) != 0) + return (rval); + + /* check if we are dealing with a metadevice */ + if ((strncmp(dki_info.dki_cname, "pseudo", 7) == 0) && + (strncmp(dki_info.dki_dname, "md", 3) == 0)) { + md_flag = 1; + } + + if (check_input(vtoc)) { + /* + * not valid; if it's a metadevice just pass it down + * because SVM will do its own checking + */ + if (md_flag == 0) { + return (VT_EINVAL); + } + } + + dk_ioc.dki_lba = 1; + if (NBLOCKS(vtoc->efi_nparts, vtoc->efi_lbasize) < 34) { + dk_ioc.dki_length = EFI_MIN_ARRAY_SIZE + vtoc->efi_lbasize; + } else { + dk_ioc.dki_length = NBLOCKS(vtoc->efi_nparts, + vtoc->efi_lbasize) * + vtoc->efi_lbasize; + } + + /* + * the number of blocks occupied by GUID partition entry array + */ + nblocks = dk_ioc.dki_length / vtoc->efi_lbasize - 1; + + /* + * Backup GPT header is located on the block after GUID + * partition entry array. Here, we calculate the address + * for backup GPT header. + */ + lba_backup_gpt_hdr = vtoc->efi_last_u_lba + 1 + nblocks; + if (posix_memalign((void **)&dk_ioc.dki_data, + vtoc->efi_lbasize, dk_ioc.dki_length)) + return (VT_ERROR); + + memset(dk_ioc.dki_data, 0, dk_ioc.dki_length); + efi = dk_ioc.dki_data; + + /* stuff user's input into EFI struct */ + efi->efi_gpt_Signature = LE_64(EFI_SIGNATURE); + efi->efi_gpt_Revision = LE_32(vtoc->efi_version); /* 0x02000100 */ + efi->efi_gpt_HeaderSize = LE_32(sizeof (struct efi_gpt) - LEN_EFI_PAD); + efi->efi_gpt_Reserved1 = 0; + efi->efi_gpt_MyLBA = LE_64(1ULL); + efi->efi_gpt_AlternateLBA = LE_64(lba_backup_gpt_hdr); + efi->efi_gpt_FirstUsableLBA = LE_64(vtoc->efi_first_u_lba); + efi->efi_gpt_LastUsableLBA = LE_64(vtoc->efi_last_u_lba); + efi->efi_gpt_PartitionEntryLBA = LE_64(2ULL); + efi->efi_gpt_NumberOfPartitionEntries = LE_32(vtoc->efi_nparts); + efi->efi_gpt_SizeOfPartitionEntry = LE_32(sizeof (struct efi_gpe)); + UUID_LE_CONVERT(efi->efi_gpt_DiskGUID, vtoc->efi_disk_uguid); + + /* LINTED -- always longlong aligned */ + efi_parts = (efi_gpe_t *)((char *)dk_ioc.dki_data + vtoc->efi_lbasize); + + for (i = 0; i < vtoc->efi_nparts; i++) { + for (j = 0; + j < sizeof (conversion_array) / + sizeof (struct uuid_to_ptag); j++) { + + if (vtoc->efi_parts[i].p_tag == j) { + UUID_LE_CONVERT( + efi_parts[i].efi_gpe_PartitionTypeGUID, + conversion_array[j].uuid); + break; + } + } + + if (j == sizeof (conversion_array) / + sizeof (struct uuid_to_ptag)) { + /* + * If we didn't have a matching uuid match, bail here. + * Don't write a label with unknown uuid. 
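+ *
+ * (p_tag is the index of the matching conversion_array entry, as
+ * assigned by efi_read(), so any known tag round-trips back to its
+ * GUID above; everything else lands here.)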
+ */ + if (efi_debug) { + (void) fprintf(stderr, + "Unknown uuid for p_tag %d\n", + vtoc->efi_parts[i].p_tag); + } + return (VT_EINVAL); + } + + /* Zero's should be written for empty partitions */ + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) + continue; + + efi_parts[i].efi_gpe_StartingLBA = + LE_64(vtoc->efi_parts[i].p_start); + efi_parts[i].efi_gpe_EndingLBA = + LE_64(vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size - 1); + efi_parts[i].efi_gpe_Attributes.PartitionAttrs = + LE_16(vtoc->efi_parts[i].p_flag); + for (j = 0; j < EFI_PART_NAME_LEN; j++) { + efi_parts[i].efi_gpe_PartitionName[j] = + LE_16((ushort_t)vtoc->efi_parts[i].p_name[j]); + } + if ((vtoc->efi_parts[i].p_tag != V_UNASSIGNED) && + uuid_is_null((uchar_t *)&vtoc->efi_parts[i].p_uguid)) { + (void) uuid_generate((uchar_t *) + &vtoc->efi_parts[i].p_uguid); + } + bcopy(&vtoc->efi_parts[i].p_uguid, + &efi_parts[i].efi_gpe_UniquePartitionGUID, + sizeof (uuid_t)); + } + efi->efi_gpt_PartitionEntryArrayCRC32 = + LE_32(efi_crc32((unsigned char *)efi_parts, + vtoc->efi_nparts * (int)sizeof (struct efi_gpe))); + efi->efi_gpt_HeaderCRC32 = + LE_32(efi_crc32((unsigned char *)efi, + LE_32(efi->efi_gpt_HeaderSize))); + + if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { + free(dk_ioc.dki_data); + switch (errno) { + case EIO: + return (VT_EIO); + case EINVAL: + return (VT_EINVAL); + default: + return (VT_ERROR); + } + } + /* if it's a metadevice we're done */ + if (md_flag) { + free(dk_ioc.dki_data); + return (0); + } + + /* write backup partition array */ + dk_ioc.dki_lba = vtoc->efi_last_u_lba + 1; + dk_ioc.dki_length -= vtoc->efi_lbasize; + /* LINTED */ + dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data + + vtoc->efi_lbasize); + + if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { + /* + * we wrote the primary label okay, so don't fail + */ + if (efi_debug) { + (void) fprintf(stderr, + "write of backup partitions to block %llu " + "failed, errno %d\n", + vtoc->efi_last_u_lba + 1, + errno); + } + } + /* + * now swap MyLBA and AlternateLBA fields and write backup + * partition table header + */ + dk_ioc.dki_lba = lba_backup_gpt_hdr; + dk_ioc.dki_length = vtoc->efi_lbasize; + /* LINTED */ + dk_ioc.dki_data = (efi_gpt_t *)((char *)dk_ioc.dki_data - + vtoc->efi_lbasize); + efi->efi_gpt_AlternateLBA = LE_64(1ULL); + efi->efi_gpt_MyLBA = LE_64(lba_backup_gpt_hdr); + efi->efi_gpt_PartitionEntryLBA = LE_64(vtoc->efi_last_u_lba + 1); + efi->efi_gpt_HeaderCRC32 = 0; + efi->efi_gpt_HeaderCRC32 = + LE_32(efi_crc32((unsigned char *)dk_ioc.dki_data, + LE_32(efi->efi_gpt_HeaderSize))); + + if (efi_ioctl(fd, DKIOCSETEFI, &dk_ioc) == -1) { + if (efi_debug) { + (void) fprintf(stderr, + "write of backup header to block %llu failed, " + "errno %d\n", + lba_backup_gpt_hdr, + errno); + } + } + /* write the PMBR */ + (void) write_pmbr(fd, vtoc); + free(dk_ioc.dki_data); + + return (0); +} + +void +efi_free(struct dk_gpt *ptr) +{ + free(ptr); +} + +/* + * Input: File descriptor + * Output: 1 if disk has an EFI label, or > 2TB with no VTOC or legacy MBR. + * Otherwise 0. 
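+ *
+ * Note: the DKIOCGEXTVTOC/DKIOCGVTOC probe below is compiled out
+ * with #if 0 on this platform, so the function currently just
+ * returns ENOSYS.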
+ */ +int +efi_type(int fd) +{ +#if 0 + struct vtoc vtoc; + struct extvtoc extvtoc; + + if (ioctl(fd, DKIOCGEXTVTOC, &extvtoc) == -1) { + if (errno == ENOTSUP) + return (1); + else if (errno == ENOTTY) { + if (ioctl(fd, DKIOCGVTOC, &vtoc) == -1) + if (errno == ENOTSUP) + return (1); + } + } + return (0); +#else + return (ENOSYS); +#endif +} + +void +efi_err_check(struct dk_gpt *vtoc) +{ + int resv_part = -1; + int i, j; + diskaddr_t istart, jstart, isize, jsize, endsect; + int overlap = 0; + + /* + * make sure no partitions overlap + */ + for (i = 0; i < vtoc->efi_nparts; i++) { + /* It can't be unassigned and have an actual size */ + if ((vtoc->efi_parts[i].p_tag == V_UNASSIGNED) && + (vtoc->efi_parts[i].p_size != 0)) { + (void) fprintf(stderr, + "partition %d is \"unassigned\" but has a size " + "of %llu\n", i, vtoc->efi_parts[i].p_size); + } + if (vtoc->efi_parts[i].p_tag == V_UNASSIGNED) { + continue; + } + if (vtoc->efi_parts[i].p_tag == V_RESERVED) { + if (resv_part != -1) { + (void) fprintf(stderr, + "found duplicate reserved partition at " + "%d\n", i); + } + resv_part = i; + if (vtoc->efi_parts[i].p_size != EFI_MIN_RESV_SIZE) + (void) fprintf(stderr, + "Warning: reserved partition size must " + "be %d sectors\n", EFI_MIN_RESV_SIZE); + } + if ((vtoc->efi_parts[i].p_start < vtoc->efi_first_u_lba) || + (vtoc->efi_parts[i].p_start > vtoc->efi_last_u_lba)) { + (void) fprintf(stderr, + "Partition %d starts at %llu\n", + i, + vtoc->efi_parts[i].p_start); + (void) fprintf(stderr, + "It must be between %llu and %llu.\n", + vtoc->efi_first_u_lba, + vtoc->efi_last_u_lba); + } + if ((vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size < + vtoc->efi_first_u_lba) || + (vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size > + vtoc->efi_last_u_lba + 1)) { + (void) fprintf(stderr, + "Partition %d ends at %llu\n", + i, + vtoc->efi_parts[i].p_start + + vtoc->efi_parts[i].p_size); + (void) fprintf(stderr, + "It must be between %llu and %llu.\n", + vtoc->efi_first_u_lba, + vtoc->efi_last_u_lba); + } + + for (j = 0; j < vtoc->efi_nparts; j++) { + isize = vtoc->efi_parts[i].p_size; + jsize = vtoc->efi_parts[j].p_size; + istart = vtoc->efi_parts[i].p_start; + jstart = vtoc->efi_parts[j].p_start; + if ((i != j) && (isize != 0) && (jsize != 0)) { + endsect = jstart + jsize -1; + if ((jstart <= istart) && + (istart <= endsect)) { + if (!overlap) { + (void) fprintf(stderr, + "label error: EFI Labels do not " + "support overlapping partitions\n"); + } + (void) fprintf(stderr, + "Partition %d overlaps partition " + "%d.\n", i, j); + overlap = 1; + } + } + } + } + /* make sure there is a reserved partition */ + if (resv_part == -1) { + (void) fprintf(stderr, + "no reserved partition found\n"); + } +} + +/* + * We need to get information necessary to construct a *new* efi + * label type + */ +int +efi_auto_sense(int fd, struct dk_gpt **vtoc) +{ + + int i; + + /* + * Now build the default partition table + */ + if (efi_alloc_and_init(fd, EFI_NUMPAR, vtoc) != 0) { + if (efi_debug) { + (void) fprintf(stderr, "efi_alloc_and_init failed.\n"); + } + return (-1); + } + + for (i = 0; i < MIN((*vtoc)->efi_nparts, V_NUMPAR); i++) { + (*vtoc)->efi_parts[i].p_tag = default_vtoc_map[i].p_tag; + (*vtoc)->efi_parts[i].p_flag = default_vtoc_map[i].p_flag; + (*vtoc)->efi_parts[i].p_start = 0; + (*vtoc)->efi_parts[i].p_size = 0; + } + /* + * Make constants first + * and variable partitions later + */ + + /* root partition - s0 128 MB */ + (*vtoc)->efi_parts[0].p_start = 34; + (*vtoc)->efi_parts[0].p_size = 262144; + + /* 
partition - s1 128 MB */ + (*vtoc)->efi_parts[1].p_start = 262178; + (*vtoc)->efi_parts[1].p_size = 262144; + + /* partition -s2 is NOT the Backup disk */ + (*vtoc)->efi_parts[2].p_tag = V_UNASSIGNED; + + /* partition -s6 /usr partition - HOG */ + (*vtoc)->efi_parts[6].p_start = 524322; + (*vtoc)->efi_parts[6].p_size = (*vtoc)->efi_last_u_lba - 524322 + - (1024 * 16); + + /* efi reserved partition - s9 16K */ + (*vtoc)->efi_parts[8].p_start = (*vtoc)->efi_last_u_lba - (1024 * 16); + (*vtoc)->efi_parts[8].p_size = (1024 * 16); + (*vtoc)->efi_parts[8].p_tag = V_RESERVED; + return (0); +} diff --git a/lib/libicp/Makefile.am b/lib/libicp/Makefile.am new file mode 100644 index 000000000000..6d3c65ea324f --- /dev/null +++ b/lib/libicp/Makefile.am @@ -0,0 +1,73 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = \ + $(top_srcdir)/module/icp \ + $(top_srcdir)/lib/libicp + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +noinst_LTLIBRARIES = libicp.la + +if TARGET_CPU_X86_64 +ASM_SOURCES_C = asm-x86_64/aes/aeskey.c +ASM_SOURCES_AS = \ + asm-x86_64/aes/aes_amd64.S \ + asm-x86_64/aes/aes_aesni.S \ + asm-x86_64/modes/gcm_pclmulqdq.S \ + asm-x86_64/modes/aesni-gcm-x86_64.S \ + asm-x86_64/modes/ghash-x86_64.S \ + asm-x86_64/sha1/sha1-x86_64.S \ + asm-x86_64/sha2/sha256_impl.S \ + asm-x86_64/sha2/sha512_impl.S +else +ASM_SOURCES_C = +ASM_SOURCES_AS = +endif + +KERNEL_C = \ + spi/kcf_spi.c \ + api/kcf_ctxops.c \ + api/kcf_digest.c \ + api/kcf_cipher.c \ + api/kcf_miscapi.c \ + api/kcf_mac.c \ + algs/aes/aes_impl_aesni.c \ + algs/aes/aes_impl_generic.c \ + algs/aes/aes_impl_x86-64.c \ + algs/aes/aes_impl.c \ + algs/aes/aes_modes.c \ + algs/edonr/edonr.c \ + algs/modes/modes.c \ + algs/modes/cbc.c \ + algs/modes/gcm_generic.c \ + algs/modes/gcm_pclmulqdq.c \ + algs/modes/gcm.c \ + algs/modes/ctr.c \ + algs/modes/ccm.c \ + algs/modes/ecb.c \ + algs/sha1/sha1.c \ + algs/sha2/sha2.c \ + algs/skein/skein.c \ + algs/skein/skein_block.c \ + algs/skein/skein_iv.c \ + illumos-crypto.c \ + io/aes.c \ + io/edonr_mod.c \ + io/sha1_mod.c \ + io/sha2_mod.c \ + io/skein_mod.c \ + os/modhash.c \ + os/modconf.c \ + core/kcf_sched.c \ + core/kcf_prov_lib.c \ + core/kcf_callprov.c \ + core/kcf_mech_tabs.c \ + core/kcf_prov_tabs.c \ + $(ASM_SOURCES_C) + +KERNEL_ASM = $(ASM_SOURCES_AS) + +nodist_libicp_la_SOURCES = \ + $(KERNEL_C) \ + $(KERNEL_ASM) diff --git a/lib/libnvpair/Makefile.am b/lib/libnvpair/Makefile.am new file mode 100644 index 000000000000..ec16c5d526c5 --- /dev/null +++ b/lib/libnvpair/Makefile.am @@ -0,0 +1,44 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = \ + $(top_srcdir)/module/nvpair \ + $(top_srcdir)/lib/libnvpair + +# Includes kernel code, generate warnings for large stack frames +# and required CFLAGS for libtirpc +AM_CFLAGS += $(FRAME_LARGER_THAN) $(LIBTIRPC_CFLAGS) + +lib_LTLIBRARIES = libnvpair.la + +USER_C = \ + libnvpair.c \ + libnvpair_json.c \ + nvpair_alloc_system.c + +KERNEL_C = \ + nvpair_alloc_fixed.c \ + nvpair.c \ + fnvpair.c + +dist_libnvpair_la_SOURCES = \ + $(USER_C) + +nodist_libnvpair_la_SOURCES = \ + $(KERNEL_C) + +libnvpair_la_LIBADD = \ + $(abs_top_builddir)/lib/libspl/libspl_assert.la + +libnvpair_la_LIBADD += $(LIBTIRPC_LIBS) $(LTLIBINTL) + +libnvpair_la_LDFLAGS = + +if !ASAN_ENABLED +libnvpair_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libnvpair_la_LDFLAGS += -version-info 3:0:0 +else +libnvpair_la_LDFLAGS += -version-info 1:1:0 +endif diff --git a/lib/libnvpair/libnvpair.c b/lib/libnvpair/libnvpair.c new 
file mode 100644 index 000000000000..2e9ea1c174e9 --- /dev/null +++ b/lib/libnvpair/libnvpair.c @@ -0,0 +1,1277 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include <unistd.h> +#include <strings.h> +#include <libintl.h> +#include <sys/types.h> +#include <sys/inttypes.h> +#include <stdarg.h> +#include <note.h> +#include "libnvpair.h" + +/* + * libnvpair - A tools library for manipulating <name, value> pairs. + * + * This library provides routines for packing and unpacking nv pairs + * for transporting data across process boundaries, transporting + * between kernel and userland, and possibly saving onto disk files. + */ + +/* + * Print control structure. + */ + +#define DEFINEOP(opname, vtype) \ + struct { \ + int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ + const char *, vtype); \ + void *arg; \ + } opname + +#define DEFINEARROP(opname, vtype) \ + struct { \ + int (*op)(struct nvlist_prtctl *, void *, nvlist_t *, \ + const char *, vtype, uint_t); \ + void *arg; \ + } opname + +struct nvlist_printops { + DEFINEOP(print_boolean, int); + DEFINEOP(print_boolean_value, boolean_t); + DEFINEOP(print_byte, uchar_t); + DEFINEOP(print_int8, int8_t); + DEFINEOP(print_uint8, uint8_t); + DEFINEOP(print_int16, int16_t); + DEFINEOP(print_uint16, uint16_t); + DEFINEOP(print_int32, int32_t); + DEFINEOP(print_uint32, uint32_t); + DEFINEOP(print_int64, int64_t); + DEFINEOP(print_uint64, uint64_t); + DEFINEOP(print_double, double); + DEFINEOP(print_string, char *); + DEFINEOP(print_hrtime, hrtime_t); + DEFINEOP(print_nvlist, nvlist_t *); + DEFINEARROP(print_boolean_array, boolean_t *); + DEFINEARROP(print_byte_array, uchar_t *); + DEFINEARROP(print_int8_array, int8_t *); + DEFINEARROP(print_uint8_array, uint8_t *); + DEFINEARROP(print_int16_array, int16_t *); + DEFINEARROP(print_uint16_array, uint16_t *); + DEFINEARROP(print_int32_array, int32_t *); + DEFINEARROP(print_uint32_array, uint32_t *); + DEFINEARROP(print_int64_array, int64_t *); + DEFINEARROP(print_uint64_array, uint64_t *); + DEFINEARROP(print_string_array, char **); + DEFINEARROP(print_nvlist_array, nvlist_t **); +}; + +struct nvlist_prtctl { + FILE *nvprt_fp; /* output destination */ + enum nvlist_indent_mode nvprt_indent_mode; /* see above */ + int nvprt_indent; /* absolute indent, or tab depth */ + int nvprt_indentinc; /* indent or tab increment */ + const char *nvprt_nmfmt; /* member name format, max one %s */ + const char *nvprt_eomfmt; /* after member format, e.g. "\n" */ + const char *nvprt_btwnarrfmt; /* between array members */ + int nvprt_btwnarrfmt_nl; /* nvprt_btwnarrfmt includes newline? 
*/ + struct nvlist_printops *nvprt_dfltops; + struct nvlist_printops *nvprt_custops; +}; + +#define DFLTPRTOP(pctl, type) \ + ((pctl)->nvprt_dfltops->print_##type.op) + +#define DFLTPRTOPARG(pctl, type) \ + ((pctl)->nvprt_dfltops->print_##type.arg) + +#define CUSTPRTOP(pctl, type) \ + ((pctl)->nvprt_custops->print_##type.op) + +#define CUSTPRTOPARG(pctl, type) \ + ((pctl)->nvprt_custops->print_##type.arg) + +#define RENDER(pctl, type, nvl, name, val) \ + { \ + int done = 0; \ + if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ + done = CUSTPRTOP(pctl, type)(pctl, \ + CUSTPRTOPARG(pctl, type), nvl, name, val); \ + } \ + if (!done) { \ + (void) DFLTPRTOP(pctl, type)(pctl, \ + DFLTPRTOPARG(pctl, type), nvl, name, val); \ + } \ + (void) fprintf(pctl->nvprt_fp, "%s", pctl->nvprt_eomfmt); \ + } + +#define ARENDER(pctl, type, nvl, name, arrp, count) \ + { \ + int done = 0; \ + if ((pctl)->nvprt_custops && CUSTPRTOP(pctl, type)) { \ + done = CUSTPRTOP(pctl, type)(pctl, \ + CUSTPRTOPARG(pctl, type), nvl, name, arrp, count); \ + } \ + if (!done) { \ + (void) DFLTPRTOP(pctl, type)(pctl, \ + DFLTPRTOPARG(pctl, type), nvl, name, arrp, count); \ + } \ + (void) fprintf(pctl->nvprt_fp, "%s", pctl->nvprt_eomfmt); \ + } + +static void nvlist_print_with_indent(nvlist_t *, nvlist_prtctl_t); + +/* + * ====================================================================== + * | | + * | Indentation | + * | | + * ====================================================================== + */ + +static void +indent(nvlist_prtctl_t pctl, int onemore) +{ + int depth; + + switch (pctl->nvprt_indent_mode) { + case NVLIST_INDENT_ABS: + (void) fprintf(pctl->nvprt_fp, "%*s", + pctl->nvprt_indent + onemore * pctl->nvprt_indentinc, ""); + break; + + case NVLIST_INDENT_TABBED: + depth = pctl->nvprt_indent + onemore; + while (depth-- > 0) + (void) fprintf(pctl->nvprt_fp, "\t"); + } +} + +/* + * ====================================================================== + * | | + * | Default nvlist member rendering functions. | + * | | + * ====================================================================== + */ + +/* + * Generate functions to print single-valued nvlist members. 
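+ * For instance, NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d") expands
+ * to the default renderer
+ *
+ *	static int nvprint_int32(nvlist_prtctl_t, void *, nvlist_t *,
+ *	    const char *, int32_t);
+ *
+ * which is wired into defprtops and can be overridden through
+ * nvlist_prtctlop_int32().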
+ * + * type_and_variant - suffix to form function name + * vtype - C type for the member value + * ptype - C type to cast value to for printing + * vfmt - format string for pair value, e.g "%d" or "0x%llx" + */ + +#define NVLIST_PRTFUNC(type_and_variant, vtype, ptype, vfmt) \ +static int \ +nvprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ + nvlist_t *nvl, const char *name, vtype value) \ +{ \ + FILE *fp = pctl->nvprt_fp; \ + NOTE(ARGUNUSED(private)) \ + NOTE(ARGUNUSED(nvl)) \ + indent(pctl, 1); \ + (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ + (void) fprintf(fp, vfmt, (ptype)value); \ + return (1); \ +} + +NVLIST_PRTFUNC(boolean, int, int, "%d") +NVLIST_PRTFUNC(boolean_value, boolean_t, int, "%d") +NVLIST_PRTFUNC(byte, uchar_t, uchar_t, "0x%2.2x") +NVLIST_PRTFUNC(int8, int8_t, int, "%d") +NVLIST_PRTFUNC(uint8, uint8_t, uint8_t, "0x%x") +NVLIST_PRTFUNC(int16, int16_t, int16_t, "%d") +NVLIST_PRTFUNC(uint16, uint16_t, uint16_t, "0x%x") +NVLIST_PRTFUNC(int32, int32_t, int32_t, "%d") +NVLIST_PRTFUNC(uint32, uint32_t, uint32_t, "0x%x") +NVLIST_PRTFUNC(int64, int64_t, longlong_t, "%lld") +NVLIST_PRTFUNC(uint64, uint64_t, u_longlong_t, "0x%llx") +NVLIST_PRTFUNC(double, double, double, "0x%f") +NVLIST_PRTFUNC(string, char *, char *, "%s") +NVLIST_PRTFUNC(hrtime, hrtime_t, hrtime_t, "0x%llx") + +/* + * Generate functions to print array-valued nvlist members. + */ + +#define NVLIST_ARRPRTFUNC(type_and_variant, vtype, ptype, vfmt) \ +static int \ +nvaprint_##type_and_variant(nvlist_prtctl_t pctl, void *private, \ + nvlist_t *nvl, const char *name, vtype *valuep, uint_t count) \ +{ \ + FILE *fp = pctl->nvprt_fp; \ + uint_t i; \ + NOTE(ARGUNUSED(private)) \ + NOTE(ARGUNUSED(nvl)) \ + for (i = 0; i < count; i++) { \ + if (i == 0 || pctl->nvprt_btwnarrfmt_nl) { \ + indent(pctl, 1); \ + (void) fprintf(fp, pctl->nvprt_nmfmt, name); \ + if (pctl->nvprt_btwnarrfmt_nl) \ + (void) fprintf(fp, "[%d]: ", i); \ + } \ + if (i != 0) \ + (void) fprintf(fp, "%s", pctl->nvprt_btwnarrfmt); \ + (void) fprintf(fp, vfmt, (ptype)valuep[i]); \ + } \ + return (1); \ +} + +NVLIST_ARRPRTFUNC(boolean_array, boolean_t, boolean_t, "%d") +NVLIST_ARRPRTFUNC(byte_array, uchar_t, uchar_t, "0x%2.2x") +NVLIST_ARRPRTFUNC(int8_array, int8_t, int8_t, "%d") +NVLIST_ARRPRTFUNC(uint8_array, uint8_t, uint8_t, "0x%x") +NVLIST_ARRPRTFUNC(int16_array, int16_t, int16_t, "%d") +NVLIST_ARRPRTFUNC(uint16_array, uint16_t, uint16_t, "0x%x") +NVLIST_ARRPRTFUNC(int32_array, int32_t, int32_t, "%d") +NVLIST_ARRPRTFUNC(uint32_array, uint32_t, uint32_t, "0x%x") +NVLIST_ARRPRTFUNC(int64_array, int64_t, longlong_t, "%lld") +NVLIST_ARRPRTFUNC(uint64_array, uint64_t, u_longlong_t, "0x%llx") +NVLIST_ARRPRTFUNC(string_array, char *, char *, "%s") + +/*ARGSUSED*/ +static int +nvprint_nvlist(nvlist_prtctl_t pctl, void *private, + nvlist_t *nvl, const char *name, nvlist_t *value) +{ + FILE *fp = pctl->nvprt_fp; + + indent(pctl, 1); + (void) fprintf(fp, "%s = (embedded nvlist)\n", name); + + pctl->nvprt_indent += pctl->nvprt_indentinc; + nvlist_print_with_indent(value, pctl); + pctl->nvprt_indent -= pctl->nvprt_indentinc; + + indent(pctl, 1); + (void) fprintf(fp, "(end %s)\n", name); + + return (1); +} + +/*ARGSUSED*/ +static int +nvaprint_nvlist_array(nvlist_prtctl_t pctl, void *private, + nvlist_t *nvl, const char *name, nvlist_t **valuep, uint_t count) +{ + FILE *fp = pctl->nvprt_fp; + uint_t i; + + indent(pctl, 1); + (void) fprintf(fp, "%s = (array of embedded nvlists)\n", name); + + for (i = 0; i < count; i++) { + indent(pctl, 1); + (void) 
fprintf(fp, "(start %s[%d])\n", name, i); + + pctl->nvprt_indent += pctl->nvprt_indentinc; + nvlist_print_with_indent(valuep[i], pctl); + pctl->nvprt_indent -= pctl->nvprt_indentinc; + + indent(pctl, 1); + (void) fprintf(fp, "(end %s[%d])\n", name, i); + } + + return (1); +} + +/* + * ====================================================================== + * | | + * | Interfaces that allow control over formatting. | + * | | + * ====================================================================== + */ + +void +nvlist_prtctl_setdest(nvlist_prtctl_t pctl, FILE *fp) +{ + pctl->nvprt_fp = fp; +} + +FILE * +nvlist_prtctl_getdest(nvlist_prtctl_t pctl) +{ + return (pctl->nvprt_fp); +} + + +void +nvlist_prtctl_setindent(nvlist_prtctl_t pctl, enum nvlist_indent_mode mode, + int start, int inc) +{ + if (mode < NVLIST_INDENT_ABS || mode > NVLIST_INDENT_TABBED) + mode = NVLIST_INDENT_TABBED; + + if (start < 0) + start = 0; + + if (inc < 0) + inc = 1; + + pctl->nvprt_indent_mode = mode; + pctl->nvprt_indent = start; + pctl->nvprt_indentinc = inc; +} + +void +nvlist_prtctl_doindent(nvlist_prtctl_t pctl, int onemore) +{ + indent(pctl, onemore); +} + + +void +nvlist_prtctl_setfmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, + const char *fmt) +{ + switch (which) { + case NVLIST_FMT_MEMBER_NAME: + if (fmt == NULL) + fmt = "%s = "; + pctl->nvprt_nmfmt = fmt; + break; + + case NVLIST_FMT_MEMBER_POSTAMBLE: + if (fmt == NULL) + fmt = "\n"; + pctl->nvprt_eomfmt = fmt; + break; + + case NVLIST_FMT_BTWN_ARRAY: + if (fmt == NULL) { + pctl->nvprt_btwnarrfmt = " "; + pctl->nvprt_btwnarrfmt_nl = 0; + } else { + pctl->nvprt_btwnarrfmt = fmt; + pctl->nvprt_btwnarrfmt_nl = (strstr(fmt, "\n") != NULL); + } + break; + + default: + break; + } +} + + +void +nvlist_prtctl_dofmt(nvlist_prtctl_t pctl, enum nvlist_prtctl_fmt which, ...) 
+{ + FILE *fp = pctl->nvprt_fp; + va_list ap; + char *name; + + va_start(ap, which); + + switch (which) { + case NVLIST_FMT_MEMBER_NAME: + name = va_arg(ap, char *); + (void) fprintf(fp, pctl->nvprt_nmfmt, name); + break; + + case NVLIST_FMT_MEMBER_POSTAMBLE: + (void) fprintf(fp, "%s", pctl->nvprt_eomfmt); + break; + + case NVLIST_FMT_BTWN_ARRAY: + (void) fprintf(fp, "%s", pctl->nvprt_btwnarrfmt); + break; + + default: + break; + } + + va_end(ap); +} + +/* + * ====================================================================== + * | | + * | Interfaces to allow appointment of replacement rendering functions.| + * | | + * ====================================================================== + */ + +#define NVLIST_PRINTCTL_REPLACE(type, vtype) \ +void \ +nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ + int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype), \ + void *private) \ +{ \ + CUSTPRTOP(pctl, type) = func; \ + CUSTPRTOPARG(pctl, type) = private; \ +} + +NVLIST_PRINTCTL_REPLACE(boolean, int) +NVLIST_PRINTCTL_REPLACE(boolean_value, boolean_t) +NVLIST_PRINTCTL_REPLACE(byte, uchar_t) +NVLIST_PRINTCTL_REPLACE(int8, int8_t) +NVLIST_PRINTCTL_REPLACE(uint8, uint8_t) +NVLIST_PRINTCTL_REPLACE(int16, int16_t) +NVLIST_PRINTCTL_REPLACE(uint16, uint16_t) +NVLIST_PRINTCTL_REPLACE(int32, int32_t) +NVLIST_PRINTCTL_REPLACE(uint32, uint32_t) +NVLIST_PRINTCTL_REPLACE(int64, int64_t) +NVLIST_PRINTCTL_REPLACE(uint64, uint64_t) +NVLIST_PRINTCTL_REPLACE(double, double) +NVLIST_PRINTCTL_REPLACE(string, char *) +NVLIST_PRINTCTL_REPLACE(hrtime, hrtime_t) +NVLIST_PRINTCTL_REPLACE(nvlist, nvlist_t *) + +#define NVLIST_PRINTCTL_AREPLACE(type, vtype) \ +void \ +nvlist_prtctlop_##type(nvlist_prtctl_t pctl, \ + int (*func)(nvlist_prtctl_t, void *, nvlist_t *, const char *, vtype, \ + uint_t), void *private) \ +{ \ + CUSTPRTOP(pctl, type) = func; \ + CUSTPRTOPARG(pctl, type) = private; \ +} + +NVLIST_PRINTCTL_AREPLACE(boolean_array, boolean_t *) +NVLIST_PRINTCTL_AREPLACE(byte_array, uchar_t *) +NVLIST_PRINTCTL_AREPLACE(int8_array, int8_t *) +NVLIST_PRINTCTL_AREPLACE(uint8_array, uint8_t *) +NVLIST_PRINTCTL_AREPLACE(int16_array, int16_t *) +NVLIST_PRINTCTL_AREPLACE(uint16_array, uint16_t *) +NVLIST_PRINTCTL_AREPLACE(int32_array, int32_t *) +NVLIST_PRINTCTL_AREPLACE(uint32_array, uint32_t *) +NVLIST_PRINTCTL_AREPLACE(int64_array, int64_t *) +NVLIST_PRINTCTL_AREPLACE(uint64_array, uint64_t *) +NVLIST_PRINTCTL_AREPLACE(string_array, char **) +NVLIST_PRINTCTL_AREPLACE(nvlist_array, nvlist_t **) + +/* + * ====================================================================== + * | | + * | Interfaces to manage nvlist_prtctl_t cookies. 
| + * | | + * ====================================================================== + */ + + +static const struct nvlist_printops defprtops = +{ + { nvprint_boolean, NULL }, + { nvprint_boolean_value, NULL }, + { nvprint_byte, NULL }, + { nvprint_int8, NULL }, + { nvprint_uint8, NULL }, + { nvprint_int16, NULL }, + { nvprint_uint16, NULL }, + { nvprint_int32, NULL }, + { nvprint_uint32, NULL }, + { nvprint_int64, NULL }, + { nvprint_uint64, NULL }, + { nvprint_double, NULL }, + { nvprint_string, NULL }, + { nvprint_hrtime, NULL }, + { nvprint_nvlist, NULL }, + { nvaprint_boolean_array, NULL }, + { nvaprint_byte_array, NULL }, + { nvaprint_int8_array, NULL }, + { nvaprint_uint8_array, NULL }, + { nvaprint_int16_array, NULL }, + { nvaprint_uint16_array, NULL }, + { nvaprint_int32_array, NULL }, + { nvaprint_uint32_array, NULL }, + { nvaprint_int64_array, NULL }, + { nvaprint_uint64_array, NULL }, + { nvaprint_string_array, NULL }, + { nvaprint_nvlist_array, NULL }, +}; + +static void +prtctl_defaults(FILE *fp, struct nvlist_prtctl *pctl, + struct nvlist_printops *ops) +{ + pctl->nvprt_fp = fp; + pctl->nvprt_indent_mode = NVLIST_INDENT_TABBED; + pctl->nvprt_indent = 0; + pctl->nvprt_indentinc = 1; + pctl->nvprt_nmfmt = "%s = "; + pctl->nvprt_eomfmt = "\n"; + pctl->nvprt_btwnarrfmt = " "; + pctl->nvprt_btwnarrfmt_nl = 0; + + pctl->nvprt_dfltops = (struct nvlist_printops *)&defprtops; + pctl->nvprt_custops = ops; +} + +nvlist_prtctl_t +nvlist_prtctl_alloc(void) +{ + struct nvlist_prtctl *pctl; + struct nvlist_printops *ops; + + if ((pctl = malloc(sizeof (*pctl))) == NULL) + return (NULL); + + if ((ops = calloc(1, sizeof (*ops))) == NULL) { + free(pctl); + return (NULL); + } + + prtctl_defaults(stdout, pctl, ops); + + return (pctl); +} + +void +nvlist_prtctl_free(nvlist_prtctl_t pctl) +{ + if (pctl != NULL) { + free(pctl->nvprt_custops); + free(pctl); + } +} + +/* + * ====================================================================== + * | | + * | Top-level print request interfaces. 
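+ *
+ * A minimal usage sketch (hypothetical caller, error checks omitted;
+ * nvl is any populated nvlist_t *):
+ *
+ *	nvlist_prtctl_t pctl = nvlist_prtctl_alloc();
+ *	nvlist_prtctl_setdest(pctl, stderr);
+ *	nvlist_prtctl_setfmt(pctl, NVLIST_FMT_MEMBER_NAME, "%s: ");
+ *	nvlist_prt(nvl, pctl);
+ *	nvlist_prtctl_free(pctl);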
| + * | | + * ====================================================================== + */ + +/* + * nvlist_print - Prints elements in an event buffer + */ +static void +nvlist_print_with_indent(nvlist_t *nvl, nvlist_prtctl_t pctl) +{ + FILE *fp = pctl->nvprt_fp; + char *name; + uint_t nelem; + nvpair_t *nvp; + + if (nvl == NULL) + return; + + indent(pctl, 0); + (void) fprintf(fp, "nvlist version: %d\n", NVL_VERSION(nvl)); + + nvp = nvlist_next_nvpair(nvl, NULL); + + while (nvp) { + data_type_t type = nvpair_type(nvp); + + name = nvpair_name(nvp); + nelem = 0; + + switch (type) { + case DATA_TYPE_BOOLEAN: { + RENDER(pctl, boolean, nvl, name, 1); + break; + } + case DATA_TYPE_BOOLEAN_VALUE: { + boolean_t val; + (void) nvpair_value_boolean_value(nvp, &val); + RENDER(pctl, boolean_value, nvl, name, val); + break; + } + case DATA_TYPE_BYTE: { + uchar_t val; + (void) nvpair_value_byte(nvp, &val); + RENDER(pctl, byte, nvl, name, val); + break; + } + case DATA_TYPE_INT8: { + int8_t val; + (void) nvpair_value_int8(nvp, &val); + RENDER(pctl, int8, nvl, name, val); + break; + } + case DATA_TYPE_UINT8: { + uint8_t val; + (void) nvpair_value_uint8(nvp, &val); + RENDER(pctl, uint8, nvl, name, val); + break; + } + case DATA_TYPE_INT16: { + int16_t val; + (void) nvpair_value_int16(nvp, &val); + RENDER(pctl, int16, nvl, name, val); + break; + } + case DATA_TYPE_UINT16: { + uint16_t val; + (void) nvpair_value_uint16(nvp, &val); + RENDER(pctl, uint16, nvl, name, val); + break; + } + case DATA_TYPE_INT32: { + int32_t val; + (void) nvpair_value_int32(nvp, &val); + RENDER(pctl, int32, nvl, name, val); + break; + } + case DATA_TYPE_UINT32: { + uint32_t val; + (void) nvpair_value_uint32(nvp, &val); + RENDER(pctl, uint32, nvl, name, val); + break; + } + case DATA_TYPE_INT64: { + int64_t val; + (void) nvpair_value_int64(nvp, &val); + RENDER(pctl, int64, nvl, name, val); + break; + } + case DATA_TYPE_UINT64: { + uint64_t val; + (void) nvpair_value_uint64(nvp, &val); + RENDER(pctl, uint64, nvl, name, val); + break; + } + case DATA_TYPE_DOUBLE: { + double val; + (void) nvpair_value_double(nvp, &val); + RENDER(pctl, double, nvl, name, val); + break; + } + case DATA_TYPE_STRING: { + char *val; + (void) nvpair_value_string(nvp, &val); + RENDER(pctl, string, nvl, name, val); + break; + } + case DATA_TYPE_BOOLEAN_ARRAY: { + boolean_t *val; + (void) nvpair_value_boolean_array(nvp, &val, &nelem); + ARENDER(pctl, boolean_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_BYTE_ARRAY: { + uchar_t *val; + (void) nvpair_value_byte_array(nvp, &val, &nelem); + ARENDER(pctl, byte_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + (void) nvpair_value_int8_array(nvp, &val, &nelem); + ARENDER(pctl, int8_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + ARENDER(pctl, uint8_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + (void) nvpair_value_int16_array(nvp, &val, &nelem); + ARENDER(pctl, int16_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + ARENDER(pctl, uint16_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + (void) nvpair_value_int32_array(nvp, &val, &nelem); + ARENDER(pctl, int32_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + (void) 
nvpair_value_uint32_array(nvp, &val, &nelem); + ARENDER(pctl, uint32_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + (void) nvpair_value_int64_array(nvp, &val, &nelem); + ARENDER(pctl, int64_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + ARENDER(pctl, uint64_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_STRING_ARRAY: { + char **val; + (void) nvpair_value_string_array(nvp, &val, &nelem); + ARENDER(pctl, string_array, nvl, name, val, nelem); + break; + } + case DATA_TYPE_HRTIME: { + hrtime_t val; + (void) nvpair_value_hrtime(nvp, &val); + RENDER(pctl, hrtime, nvl, name, val); + break; + } + case DATA_TYPE_NVLIST: { + nvlist_t *val; + (void) nvpair_value_nvlist(nvp, &val); + RENDER(pctl, nvlist, nvl, name, val); + break; + } + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + ARENDER(pctl, nvlist_array, nvl, name, val, nelem); + break; + } + default: + (void) fprintf(fp, " unknown data type (%d)", type); + break; + } + nvp = nvlist_next_nvpair(nvl, nvp); + } +} + +void +nvlist_print(FILE *fp, nvlist_t *nvl) +{ + struct nvlist_prtctl pc; + + prtctl_defaults(fp, &pc, NULL); + nvlist_print_with_indent(nvl, &pc); +} + +void +nvlist_prt(nvlist_t *nvl, nvlist_prtctl_t pctl) +{ + nvlist_print_with_indent(nvl, pctl); +} + +#define NVP(elem, type, vtype, ptype, format) { \ + vtype value; \ +\ + (void) nvpair_value_##type(elem, &value); \ + (void) printf("%*s%s: " format "\n", indent, "", \ + nvpair_name(elem), (ptype)value); \ +} + +#define NVPA(elem, type, vtype, ptype, format) { \ + uint_t i, count; \ + vtype *value; \ +\ + (void) nvpair_value_##type(elem, &value, &count); \ + for (i = 0; i < count; i++) { \ + (void) printf("%*s%s[%d]: " format "\n", indent, "", \ + nvpair_name(elem), i, (ptype)value[i]); \ + } \ +} + +/* + * Similar to nvlist_print() but handles arrays slightly differently. + */ +void +dump_nvlist(nvlist_t *list, int indent) +{ + nvpair_t *elem = NULL; + boolean_t bool_value; + nvlist_t *nvlist_value; + nvlist_t **nvlist_array_value; + uint_t i, count; + + if (list == NULL) { + return; + } + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + (void) printf("%*s%s\n", indent, "", nvpair_name(elem)); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(elem, &bool_value); + (void) printf("%*s%s: %s\n", indent, "", + nvpair_name(elem), bool_value ? 
"true" : "false"); + break; + + case DATA_TYPE_BYTE: + NVP(elem, byte, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8: + NVP(elem, int8, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8: + NVP(elem, uint8, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16: + NVP(elem, int16, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16: + NVP(elem, uint16, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32: + NVP(elem, int32, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32: + NVP(elem, uint32, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64: + NVP(elem, int64, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64: + NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); + break; + + case DATA_TYPE_STRING: + NVP(elem, string, char *, char *, "'%s'"); + break; + + case DATA_TYPE_BYTE_ARRAY: + NVPA(elem, byte_array, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8_ARRAY: + NVPA(elem, int8_array, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8_ARRAY: + NVPA(elem, uint8_array, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16_ARRAY: + NVPA(elem, int16_array, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16_ARRAY: + NVPA(elem, uint16_array, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32_ARRAY: + NVPA(elem, int32_array, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32_ARRAY: + NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64_ARRAY: + NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64_ARRAY: + NVPA(elem, uint64_array, uint64_t, u_longlong_t, + "%llu"); + break; + + case DATA_TYPE_STRING_ARRAY: + NVPA(elem, string_array, char *, char *, "'%s'"); + break; + + case DATA_TYPE_NVLIST: + (void) nvpair_value_nvlist(elem, &nvlist_value); + (void) printf("%*s%s:\n", indent, "", + nvpair_name(elem)); + dump_nvlist(nvlist_value, indent + 4); + break; + + case DATA_TYPE_NVLIST_ARRAY: + (void) nvpair_value_nvlist_array(elem, + &nvlist_array_value, &count); + for (i = 0; i < count; i++) { + (void) printf("%*s%s[%u]:\n", indent, "", + nvpair_name(elem), i); + dump_nvlist(nvlist_array_value[i], indent + 4); + } + break; + + default: + (void) printf(dgettext(TEXT_DOMAIN, "bad config type " + "%d for %s\n"), nvpair_type(elem), + nvpair_name(elem)); + } + } +} + +/* + * ====================================================================== + * | | + * | Misc private interface. | + * | | + * ====================================================================== + */ + +/* + * Determine if string 'value' matches 'nvp' value. The 'value' string is + * converted, depending on the type of 'nvp', prior to match. For numeric + * types, a radix independent sscanf conversion of 'value' is used. If 'nvp' + * is an array type, 'ai' is the index into the array against which we are + * checking for match. If nvp is of DATA_TYPE_STRING*, the caller can pass + * in a regex_t compilation of value in 'value_regex' to trigger regular + * expression string match instead of simple strcmp(). + * + * Return 1 on match, 0 on no-match, and -1 on error. If the error is + * related to value syntax error and 'ep' is non-NULL, *ep will point into + * the 'value' string at the location where the error exists. + * + * NOTE: It may be possible to move the non-regex_t version of this into + * common code used by library/kernel/boot. 
+ */ +int +nvpair_value_match_regex(nvpair_t *nvp, int ai, + char *value, regex_t *value_regex, char **ep) +{ + char *evalue; + uint_t a_len; + int sr; + + if (ep) + *ep = NULL; + + if ((nvp == NULL) || (value == NULL)) + return (-1); /* error fail match - invalid args */ + + /* make sure array and index combination make sense */ + if ((nvpair_type_is_array(nvp) && (ai < 0)) || + (!nvpair_type_is_array(nvp) && (ai >= 0))) + return (-1); /* error fail match - bad index */ + + /* non-string values should be single 'chunk' */ + if ((nvpair_type(nvp) != DATA_TYPE_STRING) && + (nvpair_type(nvp) != DATA_TYPE_STRING_ARRAY)) { + value += strspn(value, " \t"); + evalue = value + strcspn(value, " \t"); + if (*evalue) { + if (ep) + *ep = evalue; + return (-1); /* error fail match - syntax */ + } + } + + sr = EOF; + switch (nvpair_type(nvp)) { + case DATA_TYPE_STRING: { + char *val; + + /* check string value for match */ + if (nvpair_value_string(nvp, &val) == 0) { + if (value_regex) { + if (regexec(value_regex, val, + (size_t)0, NULL, 0) == 0) + return (1); /* match */ + } else { + if (strcmp(value, val) == 0) + return (1); /* match */ + } + } + break; + } + case DATA_TYPE_STRING_ARRAY: { + char **val_array; + + /* check indexed string value of array for match */ + if ((nvpair_value_string_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len)) { + if (value_regex) { + if (regexec(value_regex, val_array[ai], + (size_t)0, NULL, 0) == 0) + return (1); + } else { + if (strcmp(value, val_array[ai]) == 0) + return (1); + } + } + break; + } + case DATA_TYPE_BYTE: { + uchar_t val, val_arg; + + /* scanf uchar_t from value and check for match */ + sr = sscanf(value, "%c", &val_arg); + if ((sr == 1) && (nvpair_value_byte(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_BYTE_ARRAY: { + uchar_t *val_array, val_arg; + + + /* check indexed value of array for match */ + sr = sscanf(value, "%c", &val_arg); + if ((sr == 1) && + (nvpair_value_byte_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT8: { + int8_t val, val_arg; + + /* scanf int8_t from value and check for match */ + sr = sscanf(value, "%"SCNi8, &val_arg); + if ((sr == 1) && + (nvpair_value_int8(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT8_ARRAY: { + int8_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi8, &val_arg); + if ((sr == 1) && + (nvpair_value_int8_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT8: { + uint8_t val, val_arg; + + /* scanf uint8_t from value and check for match */ + sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint8(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi8, (int8_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint8_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT16: { + int16_t val, val_arg; + + /* scanf int16_t from value and check for match */ + sr = sscanf(value, "%"SCNi16, &val_arg); + if ((sr == 1) && + (nvpair_value_int16(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT16_ARRAY: { + int16_t 
*val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi16, &val_arg); + if ((sr == 1) && + (nvpair_value_int16_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT16: { + uint16_t val, val_arg; + + /* scanf uint16_t from value and check for match */ + sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint16(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi16, (int16_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint16_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT32: { + int32_t val, val_arg; + + /* scanf int32_t from value and check for match */ + sr = sscanf(value, "%"SCNi32, &val_arg); + if ((sr == 1) && + (nvpair_value_int32(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT32_ARRAY: { + int32_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi32, &val_arg); + if ((sr == 1) && + (nvpair_value_int32_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT32: { + uint32_t val, val_arg; + + /* scanf uint32_t from value and check for match */ + sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint32(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint32_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT64: { + int64_t val, val_arg; + + /* scanf int64_t from value and check for match */ + sr = sscanf(value, "%"SCNi64, &val_arg); + if ((sr == 1) && + (nvpair_value_int64(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_INT64_ARRAY: { + int64_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi64, &val_arg); + if ((sr == 1) && + (nvpair_value_int64_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT64: { + uint64_t val_arg, val; + + /* scanf uint64_t from value and check for match */ + sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint64(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val_array, val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi64, (int64_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_uint64_array(nvp, &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_BOOLEAN_VALUE: { + int32_t val_arg; + boolean_t val; + + /* scanf boolean_t from value and check for match */ + sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_boolean_value(nvp, &val) == 0) && + (val == val_arg)) + return (1); + break; + } + case DATA_TYPE_BOOLEAN_ARRAY: { + boolean_t *val_array; + 
int32_t val_arg; + + /* check indexed value of array for match */ + sr = sscanf(value, "%"SCNi32, (int32_t *)&val_arg); + if ((sr == 1) && + (nvpair_value_boolean_array(nvp, + &val_array, &a_len) == 0) && + (ai < a_len) && + (val_array[ai] == val_arg)) + return (1); + break; + } + case DATA_TYPE_HRTIME: + case DATA_TYPE_NVLIST: + case DATA_TYPE_NVLIST_ARRAY: + case DATA_TYPE_BOOLEAN: + case DATA_TYPE_DOUBLE: + case DATA_TYPE_UNKNOWN: + default: + /* + * unknown/unsupported data type + */ + return (-1); /* error fail match */ + } + + /* + * check to see if sscanf failed conversion, return approximate + * pointer to problem + */ + if (sr != 1) { + if (ep) + *ep = value; + return (-1); /* error fail match - syntax */ + } + + return (0); /* fail match */ +} + +int +nvpair_value_match(nvpair_t *nvp, int ai, char *value, char **ep) +{ + return (nvpair_value_match_regex(nvp, ai, value, NULL, ep)); +} diff --git a/lib/libnvpair/libnvpair_json.c b/lib/libnvpair/libnvpair_json.c new file mode 100644 index 000000000000..37a392391fb0 --- /dev/null +++ b/lib/libnvpair/libnvpair_json.c @@ -0,0 +1,406 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ +/* + * Copyright (c) 2014, Joyent, Inc. + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include "libnvpair.h" + +#define FPRINTF(fp, ...) \ + do { \ + if (fprintf(fp, __VA_ARGS__) < 0) \ + return (-1); \ + } while (0) + +/* + * When formatting a string for JSON output we must escape certain characters, + * as described in RFC4627. This applies to both member names and + * DATA_TYPE_STRING values. + * + * This function will only operate correctly if the following conditions are + * met: + * + * 1. The input String is encoded in the current locale. + * + * 2. The current locale includes the Basic Multilingual Plane (plane 0) + * as defined in the Unicode standard. + * + * The output will be entirely 7-bit ASCII (as a subset of UTF-8) with all + * representable Unicode characters included in their escaped numeric form. + */ +static int +nvlist_print_json_string(FILE *fp, const char *input) +{ + mbstate_t mbr; + wchar_t c; + size_t sz; + + bzero(&mbr, sizeof (mbr)); + + FPRINTF(fp, "\""); + while ((sz = mbrtowc(&c, input, MB_CUR_MAX, &mbr)) > 0) { + switch (c) { + case '"': + FPRINTF(fp, "\\\""); + break; + case '\n': + FPRINTF(fp, "\\n"); + break; + case '\r': + FPRINTF(fp, "\\r"); + break; + case '\\': + FPRINTF(fp, "\\\\"); + break; + case '\f': + FPRINTF(fp, "\\f"); + break; + case '\t': + FPRINTF(fp, "\\t"); + break; + case '\b': + FPRINTF(fp, "\\b"); + break; + default: + if ((c >= 0x00 && c <= 0x1f) || + (c > 0x7f && c <= 0xffff)) { + /* + * Render both Control Characters and Unicode + * characters in the Basic Multilingual Plane + * as JSON-escaped multibyte characters. + */ + FPRINTF(fp, "\\u%04x", (int)(0xffff & c)); + } else if (c >= 0x20 && c <= 0x7f) { + /* + * Render other 7-bit ASCII characters directly + * and drop other, unrepresentable characters. 
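+ * (Characters beyond the Basic Multilingual Plane match neither + * range check and are silently dropped, consistent with the plane-0 + * assumption stated at the top of this function.)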
+ */ + FPRINTF(fp, "%c", (int)(0xff & c)); + } + break; + } + input += sz; + } + + if (sz == (size_t)-1 || sz == (size_t)-2) { + /* + * We last read an invalid multibyte character sequence, + * so return an error. + */ + return (-1); + } + + FPRINTF(fp, "\""); + return (0); +} + +/* + * Dump a JSON-formatted representation of an nvlist to the provided FILE *. + * This routine does not output any new-lines or additional whitespace other + * than that contained in strings, nor does it call fflush(3C). + */ +int +nvlist_print_json(FILE *fp, nvlist_t *nvl) +{ + nvpair_t *curr; + boolean_t first = B_TRUE; + + FPRINTF(fp, "{"); + + for (curr = nvlist_next_nvpair(nvl, NULL); curr; + curr = nvlist_next_nvpair(nvl, curr)) { + data_type_t type = nvpair_type(curr); + + if (!first) + FPRINTF(fp, ","); + else + first = B_FALSE; + + if (nvlist_print_json_string(fp, nvpair_name(curr)) == -1) + return (-1); + FPRINTF(fp, ":"); + + switch (type) { + case DATA_TYPE_STRING: { + char *string = fnvpair_value_string(curr); + if (nvlist_print_json_string(fp, string) == -1) + return (-1); + break; + } + + case DATA_TYPE_BOOLEAN: { + FPRINTF(fp, "true"); + break; + } + + case DATA_TYPE_BOOLEAN_VALUE: { + FPRINTF(fp, "%s", fnvpair_value_boolean_value(curr) == + B_TRUE ? "true" : "false"); + break; + } + + case DATA_TYPE_BYTE: { + FPRINTF(fp, "%hhu", fnvpair_value_byte(curr)); + break; + } + + case DATA_TYPE_INT8: { + FPRINTF(fp, "%hhd", fnvpair_value_int8(curr)); + break; + } + + case DATA_TYPE_UINT8: { + FPRINTF(fp, "%hhu", fnvpair_value_uint8(curr)); + break; + } + + case DATA_TYPE_INT16: { + FPRINTF(fp, "%hd", fnvpair_value_int16(curr)); + break; + } + + case DATA_TYPE_UINT16: { + FPRINTF(fp, "%hu", fnvpair_value_uint16(curr)); + break; + } + + case DATA_TYPE_INT32: { + FPRINTF(fp, "%d", fnvpair_value_int32(curr)); + break; + } + + case DATA_TYPE_UINT32: { + FPRINTF(fp, "%u", fnvpair_value_uint32(curr)); + break; + } + + case DATA_TYPE_INT64: { + FPRINTF(fp, "%lld", + (long long)fnvpair_value_int64(curr)); + break; + } + + case DATA_TYPE_UINT64: { + FPRINTF(fp, "%llu", + (unsigned long long)fnvpair_value_uint64(curr)); + break; + } + + case DATA_TYPE_HRTIME: { + hrtime_t val; + VERIFY0(nvpair_value_hrtime(curr, &val)); + FPRINTF(fp, "%llu", (unsigned long long)val); + break; + } + + case DATA_TYPE_DOUBLE: { + double val; + VERIFY0(nvpair_value_double(curr, &val)); + FPRINTF(fp, "%f", val); + break; + } + + case DATA_TYPE_NVLIST: { + if (nvlist_print_json(fp, + fnvpair_value_nvlist(curr)) == -1) + return (-1); + break; + } + + case DATA_TYPE_STRING_ARRAY: { + char **val; + uint_t valsz, i; + VERIFY0(nvpair_value_string_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + if (nvlist_print_json_string(fp, val[i]) == -1) + return (-1); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t valsz, i; + VERIFY0(nvpair_value_nvlist_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + if (nvlist_print_json(fp, val[i]) == -1) + return (-1); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_BOOLEAN_ARRAY: { + boolean_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_boolean_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, val[i] == B_TRUE ? 
+ "true" : "false"); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_BYTE_ARRAY: { + uchar_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_byte_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hhu", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint8_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hhu", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int8_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hhd", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint16_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hu", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int16_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%hd", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint32_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%u", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int32_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%d", val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_uint64_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%llu", + (unsigned long long)val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + uint_t valsz, i; + VERIFY0(nvpair_value_int64_array(curr, &val, &valsz)); + FPRINTF(fp, "["); + for (i = 0; i < valsz; i++) { + if (i > 0) + FPRINTF(fp, ","); + FPRINTF(fp, "%lld", (long long)val[i]); + } + FPRINTF(fp, "]"); + break; + } + + case DATA_TYPE_UNKNOWN: + case DATA_TYPE_DONTCARE: + return (-1); + } + + } + + FPRINTF(fp, "}"); + return (0); +} diff --git a/lib/libnvpair/nvpair_alloc_system.c b/lib/libnvpair/nvpair_alloc_system.c new file mode 100644 index 000000000000..54dde1ee644f --- /dev/null +++ b/lib/libnvpair/nvpair_alloc_system.c @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +#include +#include +#include + +static void * +nv_alloc_sys(nv_alloc_t *nva, size_t size) +{ + return (kmem_alloc(size, (int)(uintptr_t)nva->nva_arg)); +} + +/*ARGSUSED*/ +static void +nv_free_sys(nv_alloc_t *nva, void *buf, size_t size) +{ + kmem_free(buf, size); +} + +static const nv_alloc_ops_t system_ops = { + NULL, /* nv_ao_init() */ + NULL, /* nv_ao_fini() */ + nv_alloc_sys, /* nv_ao_alloc() */ + nv_free_sys, /* nv_ao_free() */ + NULL /* nv_ao_reset() */ +}; + +nv_alloc_t nv_alloc_sleep_def = { + &system_ops, + (void *)KM_SLEEP +}; + +nv_alloc_t nv_alloc_nosleep_def = { + &system_ops, + (void *)KM_NOSLEEP +}; + +nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; +nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/lib/libshare/Makefile.am b/lib/libshare/Makefile.am new file mode 100644 index 000000000000..e730ee3bea1b --- /dev/null +++ b/lib/libshare/Makefile.am @@ -0,0 +1,25 @@ +include $(top_srcdir)/config/Rules.am + +DEFAULT_INCLUDES += -I$(srcdir) + +noinst_LTLIBRARIES = libshare.la + +USER_C = \ + libshare_impl.h \ + libshare.c \ + nfs.h \ + smb.h + +if BUILD_LINUX +USER_C += \ + os/linux/nfs.c \ + os/linux/smb.c +endif + +if BUILD_FREEBSD +USER_C += \ + os/freebsd/nfs.c \ + os/freebsd/smb.c +endif + +libshare_la_SOURCES = $(USER_C) diff --git a/lib/libshare/libshare.c b/lib/libshare/libshare.c new file mode 100644 index 000000000000..d32a282a36e3 --- /dev/null +++ b/lib/libshare/libshare.c @@ -0,0 +1,366 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2018, 2020 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libzfs_impl.h" +#include "libshare_impl.h" +#include "nfs.h" +#include "smb.h" + +static sa_share_impl_t alloc_share(const char *zfsname, const char *path); +static void free_share(sa_share_impl_t share); + +static int fstypes_count; +static sa_fstype_t *fstypes; + +sa_fstype_t * +register_fstype(const char *name, const sa_share_ops_t *ops) +{ + sa_fstype_t *fstype; + + fstype = calloc(1, sizeof (sa_fstype_t)); + + if (fstype == NULL) + return (NULL); + + fstype->name = name; + fstype->ops = ops; + fstype->fsinfo_index = fstypes_count; + + fstypes_count++; + + fstype->next = fstypes; + fstypes = fstype; + + return (fstype); +} + +__attribute__((constructor)) static void +libshare_init(void) +{ + libshare_nfs_init(); + libshare_smb_init(); +} + +int +sa_enable_share(const char *zfsname, const char *mountpoint, + const char *shareopts, char *protocol) +{ + int rc, ret = SA_OK; + boolean_t found_protocol = B_FALSE; + sa_fstype_t *fstype; + + sa_share_impl_t impl_share = alloc_share(zfsname, mountpoint); + if (impl_share == NULL) + return (SA_NO_MEMORY); + + fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, protocol) == 0) { + + rc = fstype->ops->update_shareopts(impl_share, + shareopts); + if (rc != SA_OK) + break; + + rc = fstype->ops->enable_share(impl_share); + if (rc != SA_OK) + ret = rc; + + found_protocol = B_TRUE; + } + + fstype = fstype->next; + } + free_share(impl_share); + + return (found_protocol ? ret : SA_INVALID_PROTOCOL); +} + +int +sa_disable_share(const char *mountpoint, char *protocol) +{ + int rc, ret = SA_OK; + boolean_t found_protocol = B_FALSE; + sa_fstype_t *fstype; + + sa_share_impl_t impl_share = alloc_share(NULL, mountpoint); + if (impl_share == NULL) + return (SA_NO_MEMORY); + + fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, protocol) == 0) { + + rc = fstype->ops->disable_share(impl_share); + if (rc != SA_OK) + ret = rc; + + found_protocol = B_TRUE; + } + + fstype = fstype->next; + } + free_share(impl_share); + + return (found_protocol ? 
ret : SA_INVALID_PROTOCOL); +} + +boolean_t +sa_is_shared(const char *mountpoint, char *protocol) +{ + sa_fstype_t *fstype; + boolean_t ret = B_FALSE; + + /* guid value is not used */ + sa_share_impl_t impl_share = alloc_share(NULL, mountpoint); + if (impl_share == NULL) + return (B_FALSE); + + fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, protocol) == 0) { + ret = fstype->ops->is_shared(impl_share); + } + fstype = fstype->next; + } + free_share(impl_share); + return (ret); +} + +void +sa_commit_shares(const char *protocol) +{ + sa_fstype_t *fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, protocol) == 0) + fstype->ops->commit_shares(); + fstype = fstype->next; + } +} + +/* + * sa_errorstr(err) + * + * convert an error value to an error string + */ +char * +sa_errorstr(int err) +{ + static char errstr[32]; + char *ret = NULL; + + switch (err) { + case SA_OK: + ret = dgettext(TEXT_DOMAIN, "ok"); + break; + case SA_NO_SUCH_PATH: + ret = dgettext(TEXT_DOMAIN, "path doesn't exist"); + break; + case SA_NO_MEMORY: + ret = dgettext(TEXT_DOMAIN, "no memory"); + break; + case SA_DUPLICATE_NAME: + ret = dgettext(TEXT_DOMAIN, "name in use"); + break; + case SA_BAD_PATH: + ret = dgettext(TEXT_DOMAIN, "bad path"); + break; + case SA_NO_SUCH_GROUP: + ret = dgettext(TEXT_DOMAIN, "no such group"); + break; + case SA_CONFIG_ERR: + ret = dgettext(TEXT_DOMAIN, "configuration error"); + break; + case SA_SYSTEM_ERR: + ret = dgettext(TEXT_DOMAIN, "system error"); + break; + case SA_SYNTAX_ERR: + ret = dgettext(TEXT_DOMAIN, "syntax error"); + break; + case SA_NO_PERMISSION: + ret = dgettext(TEXT_DOMAIN, "no permission"); + break; + case SA_BUSY: + ret = dgettext(TEXT_DOMAIN, "busy"); + break; + case SA_NO_SUCH_PROP: + ret = dgettext(TEXT_DOMAIN, "no such property"); + break; + case SA_INVALID_NAME: + ret = dgettext(TEXT_DOMAIN, "invalid name"); + break; + case SA_INVALID_PROTOCOL: + ret = dgettext(TEXT_DOMAIN, "invalid protocol"); + break; + case SA_NOT_ALLOWED: + ret = dgettext(TEXT_DOMAIN, "operation not allowed"); + break; + case SA_BAD_VALUE: + ret = dgettext(TEXT_DOMAIN, "bad property value"); + break; + case SA_INVALID_SECURITY: + ret = dgettext(TEXT_DOMAIN, "invalid security type"); + break; + case SA_NO_SUCH_SECURITY: + ret = dgettext(TEXT_DOMAIN, "security type not found"); + break; + case SA_VALUE_CONFLICT: + ret = dgettext(TEXT_DOMAIN, "property value conflict"); + break; + case SA_NOT_IMPLEMENTED: + ret = dgettext(TEXT_DOMAIN, "not implemented"); + break; + case SA_INVALID_PATH: + ret = dgettext(TEXT_DOMAIN, "invalid path"); + break; + case SA_NOT_SUPPORTED: + ret = dgettext(TEXT_DOMAIN, "operation not supported"); + break; + case SA_PROP_SHARE_ONLY: + ret = dgettext(TEXT_DOMAIN, "property not valid for group"); + break; + case SA_NOT_SHARED: + ret = dgettext(TEXT_DOMAIN, "not shared"); + break; + case SA_NO_SUCH_RESOURCE: + ret = dgettext(TEXT_DOMAIN, "no such resource"); + break; + case SA_RESOURCE_REQUIRED: + ret = dgettext(TEXT_DOMAIN, "resource name required"); + break; + case SA_MULTIPLE_ERROR: + ret = dgettext(TEXT_DOMAIN, "errors from multiple protocols"); + break; + case SA_PATH_IS_SUBDIR: + ret = dgettext(TEXT_DOMAIN, "path is a subpath of share"); + break; + case SA_PATH_IS_PARENTDIR: + ret = dgettext(TEXT_DOMAIN, "path is parent of a share"); + break; + case SA_NO_SECTION: + ret = dgettext(TEXT_DOMAIN, "protocol requires a section"); + break; + case SA_NO_PROPERTIES: + ret = dgettext(TEXT_DOMAIN, "properties not found"); + break; + case 
SA_NO_SUCH_SECTION: + ret = dgettext(TEXT_DOMAIN, "section not found"); + break; + case SA_PASSWORD_ENC: + ret = dgettext(TEXT_DOMAIN, "passwords must be encrypted"); + break; + case SA_SHARE_EXISTS: + ret = dgettext(TEXT_DOMAIN, "path or file is already shared"); + break; + default: + (void) snprintf(errstr, sizeof (errstr), + dgettext(TEXT_DOMAIN, "unknown %d"), err); + ret = errstr; + } + return (ret); +} + +int +sa_validate_shareopts(char *options, char *proto) +{ + sa_fstype_t *fstype; + + fstype = fstypes; + while (fstype != NULL) { + if (strcmp(fstype->name, proto) != 0) { + fstype = fstype->next; + continue; + } + + return (fstype->ops->validate_shareopts(options)); + } + + return (SA_INVALID_PROTOCOL); +} + +static sa_share_impl_t +alloc_share(const char *zfsname, const char *mountpoint) +{ + sa_share_impl_t impl_share; + + impl_share = calloc(1, sizeof (struct sa_share_impl)); + + if (impl_share == NULL) + return (NULL); + + if (mountpoint != NULL && + ((impl_share->sa_mountpoint = strdup(mountpoint)) == NULL)) { + free(impl_share); + return (NULL); + } + + if (zfsname != NULL && + ((impl_share->sa_zfsname = strdup(zfsname)) == NULL)) { + free(impl_share->sa_mountpoint); + free(impl_share); + return (NULL); + } + + impl_share->sa_fsinfo = calloc(fstypes_count, + sizeof (sa_share_fsinfo_t)); + if (impl_share->sa_fsinfo == NULL) { + free(impl_share->sa_mountpoint); + free(impl_share->sa_zfsname); + free(impl_share); + return (NULL); + } + + return (impl_share); +} + +static void +free_share(sa_share_impl_t impl_share) +{ + sa_fstype_t *fstype; + + fstype = fstypes; + while (fstype != NULL) { + fstype->ops->clear_shareopts(impl_share); + fstype = fstype->next; + } + + free(impl_share->sa_mountpoint); + free(impl_share->sa_zfsname); + free(impl_share->sa_fsinfo); + free(impl_share); +} diff --git a/lib/libshare/libshare_impl.h b/lib/libshare/libshare_impl.h new file mode 100644 index 000000000000..cc5ef40085b1 --- /dev/null +++ b/lib/libshare/libshare_impl.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. 
+ */ + +typedef struct sa_share_fsinfo { + char *shareopts; +} sa_share_fsinfo_t; + +typedef struct sa_share_impl { + char *sa_mountpoint; + char *sa_zfsname; + + sa_share_fsinfo_t *sa_fsinfo; /* per-fstype information */ +} *sa_share_impl_t; + +#define FSINFO(impl_share, fstype) \ + (&(impl_share->sa_fsinfo[fstype->fsinfo_index])) + +typedef struct sa_share_ops { + int (*enable_share)(sa_share_impl_t share); + int (*disable_share)(sa_share_impl_t share); + boolean_t (*is_shared)(sa_share_impl_t share); + int (*validate_shareopts)(const char *shareopts); + int (*update_shareopts)(sa_share_impl_t impl_share, + const char *shareopts); + void (*clear_shareopts)(sa_share_impl_t impl_share); + int (*commit_shares)(void); +} sa_share_ops_t; + +typedef struct sa_fstype { + struct sa_fstype *next; + + const char *name; + const sa_share_ops_t *ops; + int fsinfo_index; +} sa_fstype_t; + +sa_fstype_t *register_fstype(const char *name, const sa_share_ops_t *ops); diff --git a/lib/libshare/nfs.h b/lib/libshare/nfs.h new file mode 100644 index 000000000000..b9ea6ee2f8cf --- /dev/null +++ b/lib/libshare/nfs.h @@ -0,0 +1,27 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Gunnar Beutner + */ + +void libshare_nfs_init(void); diff --git a/lib/libshare/os/freebsd/nfs.c b/lib/libshare/os/freebsd/nfs.c new file mode 100644 index 000000000000..65f3b11bf9b3 --- /dev/null +++ b/lib/libshare/os/freebsd/nfs.c @@ -0,0 +1,448 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" +#include "libshare_impl.h" +#include "nfs.h" + +#define _PATH_MOUNTDPID "/var/run/mountd.pid" +#define FILE_HEADER "# !!! DO NOT EDIT THIS FILE MANUALLY !!!\n\n" +#define OPTSSIZE 1024 +#define MAXLINESIZE (PATH_MAX + OPTSSIZE) +#define ZFS_EXPORTS_FILE "/etc/zfs/exports" +#define ZFS_EXPORTS_LOCK ZFS_EXPORTS_FILE".lock" + +static sa_fstype_t *nfs_fstype; + +static int nfs_lock_fd = -1; + +/* + * The nfs_exports_[lock|unlock] functions are used to guard against concurrent + * updates to the exports file. Each protocol is responsible for + * providing the necessary locking to ensure consistency. + */ +static int +nfs_exports_lock(void) +{ + nfs_lock_fd = open(ZFS_EXPORTS_LOCK, + O_RDWR | O_CREAT, 0600); + if (nfs_lock_fd == -1) { + fprintf(stderr, "failed to lock %s: %s\n", + ZFS_EXPORTS_LOCK, strerror(errno)); + return (errno); + } + if (flock(nfs_lock_fd, LOCK_EX) != 0) { + fprintf(stderr, "failed to lock %s: %s\n", + ZFS_EXPORTS_LOCK, strerror(errno)); + return (errno); + } + return (0); +} + +static void +nfs_exports_unlock(void) +{ + verify(nfs_lock_fd > 0); + + if (flock(nfs_lock_fd, LOCK_UN) != 0) { + fprintf(stderr, "failed to unlock %s: %s\n", + ZFS_EXPORTS_LOCK, strerror(errno)); + } + close(nfs_lock_fd); + nfs_lock_fd = -1; +} + +/* + * Read one line from a file. Skip comments, empty lines and a line with a + * mountpoint specified in the 'skip' argument. + * + * NOTE: This function returns a static buffer and thus is not thread-safe. + */ +static char * +zgetline(FILE *fd, const char *skip) +{ + static char line[MAXLINESIZE]; + size_t len, skiplen = 0; + char *s, last; + + if (skip != NULL) + skiplen = strlen(skip); + for (;;) { + s = fgets(line, sizeof (line), fd); + if (s == NULL) + return (NULL); + /* Skip empty lines and comments. */ + if (line[0] == '\n' || line[0] == '#') + continue; + len = strlen(line); + if (line[len - 1] == '\n') + line[len - 1] = '\0'; + last = line[skiplen]; + /* Skip the given mountpoint. */ + if (skip != NULL && strncmp(skip, line, skiplen) == 0 && + (last == '\t' || last == ' ' || last == '\0')) { + continue; + } + break; + } + return (line); +} + +/* + * This function translates options to a format accepted by exports(5), e.g.
+ * + * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 \ + * zfs.freebsd.org 69.147.83.54 + * + * Accepted input formats: + * + * ro,network=192.168.0.0,mask=255.255.255.0,maproot=0,zfs.freebsd.org + * ro network=192.168.0.0 mask=255.255.255.0 maproot=0 zfs.freebsd.org + * -ro,-network=192.168.0.0,-mask=255.255.255.0,-maproot=0,zfs.freebsd.org + * -ro -network=192.168.0.0 -mask=255.255.255.0 -maproot=0 \ + * zfs.freebsd.org + * + * Recognized keywords: + * + * ro, maproot, mapall, mask, network, sec, alldirs, public, webnfs, + * index, quiet + * + * NOTE: This function returns a static buffer and thus is not thread-safe. + */ +static char * +translate_opts(const char *shareopts) +{ + static const char *known_opts[] = { "ro", "maproot", "mapall", "mask", + "network", "sec", "alldirs", "public", "webnfs", "index", "quiet", + NULL }; + static char newopts[OPTSSIZE]; + char oldopts[OPTSSIZE]; + char *o, *s = NULL; + unsigned int i; + size_t len; + + strlcpy(oldopts, shareopts, sizeof (oldopts)); + newopts[0] = '\0'; + s = oldopts; + while ((o = strsep(&s, "-, ")) != NULL) { + if (o[0] == '\0') + continue; + for (i = 0; known_opts[i] != NULL; i++) { + len = strlen(known_opts[i]); + if (strncmp(known_opts[i], o, len) == 0 && + (o[len] == '\0' || o[len] == '=')) { + strlcat(newopts, "-", sizeof (newopts)); + break; + } + } + strlcat(newopts, o, sizeof (newopts)); + strlcat(newopts, " ", sizeof (newopts)); + } + return (newopts); +} + +static char * +nfs_init_tmpfile(void) +{ + char *tmpfile = NULL; + + if (asprintf(&tmpfile, "%s%s", ZFS_EXPORTS_FILE, ".XXXXXXXX") == -1) { + fprintf(stderr, "Unable to allocate buffer for temporary " + "file name\n"); + return (NULL); + } + + int fd = mkstemp(tmpfile); + if (fd == -1) { + fprintf(stderr, "Unable to create temporary file: %s", + strerror(errno)); + free(tmpfile); + return (NULL); + } + close(fd); + return (tmpfile); +} + +static int +nfs_fini_tmpfile(char *tmpfile) +{ + if (rename(tmpfile, ZFS_EXPORTS_FILE) == -1) { + fprintf(stderr, "Unable to rename %s: %s\n", tmpfile, + strerror(errno)); + unlink(tmpfile); + free(tmpfile); + return (SA_SYSTEM_ERR); + } + free(tmpfile); + return (SA_OK); +} + +/* + * This function copies all entries from the exports file to "filename", + * omitting any entries for the specified mountpoint. + */ +static int +nfs_copy_entries(char *filename, const char *mountpoint) +{ + int error = SA_OK; + char *line; + + /* + * If the file doesn't exist then there is nothing more + * we need to do. + */ + FILE *oldfp = fopen(ZFS_EXPORTS_FILE, "r"); + if (oldfp == NULL) + return (SA_OK); + + FILE *newfp = fopen(filename, "w+"); + fputs(FILE_HEADER, newfp); + while ((line = zgetline(oldfp, mountpoint)) != NULL) + fprintf(newfp, "%s\n", line); + if (ferror(oldfp) != 0) { + error = ferror(oldfp); + } + if (error == 0 && ferror(newfp) != 0) { + error = ferror(newfp); + } + + if (fclose(newfp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + error = error != 0 ? 
error : SA_SYSTEM_ERR; + } + fclose(oldfp); + + return (error); +} + +static int +nfs_enable_share(sa_share_impl_t impl_share) +{ + char *filename = NULL; + int error; + + if ((filename = nfs_init_tmpfile()) == NULL) + return (SA_SYSTEM_ERR); + + error = nfs_exports_lock(); + if (error != 0) { + unlink(filename); + free(filename); + return (error); + } + + error = nfs_copy_entries(filename, impl_share->sa_mountpoint); + if (error != SA_OK) { + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (error); + } + + FILE *fp = fopen(filename, "a+"); + if (fp == NULL) { + fprintf(stderr, "failed to open %s file: %s", filename, + strerror(errno)); + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (SA_SYSTEM_ERR); + } + char *shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; + if (strcmp(shareopts, "on") == 0) + shareopts = ""; + + if (fprintf(fp, "%s\t%s\n", impl_share->sa_mountpoint, + translate_opts(shareopts)) < 0) { + fprintf(stderr, "failed to write to %s\n", filename); + fclose(fp); + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (SA_SYSTEM_ERR); + } + + if (fclose(fp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (SA_SYSTEM_ERR); + } + error = nfs_fini_tmpfile(filename); + nfs_exports_unlock(); + return (error); +} + +static int +nfs_disable_share(sa_share_impl_t impl_share) +{ + int error; + char *filename = NULL; + + if ((filename = nfs_init_tmpfile()) == NULL) + return (SA_SYSTEM_ERR); + + error = nfs_exports_lock(); + if (error != 0) { + unlink(filename); + free(filename); + return (error); + } + + error = nfs_copy_entries(filename, impl_share->sa_mountpoint); + if (error != SA_OK) { + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (error); + } + + error = nfs_fini_tmpfile(filename); + nfs_exports_unlock(); + return (error); +} + +/* + * NOTE: This function returns a static buffer and thus is not thread-safe. + */ +static boolean_t +nfs_is_shared(sa_share_impl_t impl_share) +{ + static char line[MAXLINESIZE]; + char *s, last; + size_t len; + char *mntpoint = impl_share->sa_mountpoint; + size_t mntlen = strlen(mntpoint); + + FILE *fp = fopen(ZFS_EXPORTS_FILE, "r"); + if (fp == NULL) + return (B_FALSE); + + for (;;) { + s = fgets(line, sizeof (line), fp); + if (s == NULL) + break; + /* Skip empty lines and comments. */ + if (line[0] == '\n' || line[0] == '#') + continue; + len = strlen(line); + if (line[len - 1] == '\n') + line[len - 1] = '\0'; + last = line[mntlen]; + /* Skip the given mountpoint. */ + if (strncmp(mntpoint, line, mntlen) == 0 && + (last == '\t' || last == ' ' || last == '\0')) { + fclose(fp); + return (B_TRUE); + } + } + fclose(fp); + return (B_FALSE); +} + +static int +nfs_validate_shareopts(const char *shareopts) +{ + return (SA_OK); +} + +static int +nfs_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = (char *)shareopts; + return (SA_OK); +} + +static void +nfs_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = NULL; +} + +/* + * Commit the shares by signaling mountd(8) to reload the exports file. + */ +static int +nfs_commit_shares(void) +{ + struct pidfh *pfh; + pid_t mountdpid; + + pfh = pidfile_open(_PATH_MOUNTDPID, 0600, &mountdpid); + if (pfh != NULL) { + /* Mountd is not running.
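+ * pidfile_open(3) succeeds only when no other process holds + * the pidfile, so there is no mountd(8) to signal; remove the + * stale pidfile and report success.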
*/ + pidfile_remove(pfh); + return (SA_OK); + } + if (errno != EEXIST) { + /* Cannot open pidfile for some reason. */ + return (SA_SYSTEM_ERR); + } + /* We have mountd(8) PID in mountdpid variable. */ + kill(mountdpid, SIGHUP); + return (SA_OK); +} + +static const sa_share_ops_t nfs_shareops = { + .enable_share = nfs_enable_share, + .disable_share = nfs_disable_share, + .is_shared = nfs_is_shared, + + .validate_shareopts = nfs_validate_shareopts, + .update_shareopts = nfs_update_shareopts, + .clear_shareopts = nfs_clear_shareopts, + .commit_shares = nfs_commit_shares, +}; + +/* + * Initializes the NFS functionality of libshare. + */ +void +libshare_nfs_init(void) +{ + nfs_fstype = register_fstype("nfs", &nfs_shareops); +} diff --git a/lib/libshare/os/freebsd/smb.c b/lib/libshare/os/freebsd/smb.c new file mode 100644 index 000000000000..5b606ab96955 --- /dev/null +++ b/lib/libshare/os/freebsd/smb.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libshare_impl.h" +#include "smb.h" + +static sa_fstype_t *smb_fstype; + +/* + * Enables SMB sharing for the specified share. + */ +static int +smb_enable_share(sa_share_impl_t impl_share) +{ + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (SA_NOT_SUPPORTED); +} +/* + * Disables SMB sharing for the specified share. + */ +static int +smb_disable_share(sa_share_impl_t impl_share) +{ + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (SA_NOT_SUPPORTED); +} + +/* + * Checks whether the specified SMB share options are syntactically correct. + */ +static int +smb_validate_shareopts(const char *shareopts) +{ + fprintf(stderr, "No SMB support in FreeBSD yet.\n"); + return (SA_NOT_SUPPORTED); +} + +/* + * Checks whether a share is currently active. + */ +static boolean_t +smb_is_share_active(sa_share_impl_t impl_share) +{ + return (B_FALSE); +} + +/* + * Called to update a share's options. A share's options might be out of + * date if the share was loaded from disk and the "sharesmb" dataset + * property has changed in the meantime. This function also takes care + * of re-enabling the share if necessary. + */ +static int +smb_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + return (SA_OK); +} + +static int +smb_update_shares(void) +{ + /* Not implemented */ + return (0); +} +/* + * Clears a share's SMB options. Used by libshare to + * clean up shares that are about to be free()'d. 
+ */ +static void +smb_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, smb_fstype)->shareopts = NULL; +} + +static const sa_share_ops_t smb_shareops = { + .enable_share = smb_enable_share, + .disable_share = smb_disable_share, + .is_shared = smb_is_share_active, + + .validate_shareopts = smb_validate_shareopts, + .update_shareopts = smb_update_shareopts, + .clear_shareopts = smb_clear_shareopts, + .commit_shares = smb_update_shares, +}; + +/* + * Initializes the SMB functionality of libshare. + */ +void +libshare_smb_init(void) +{ + smb_fstype = register_fstype("smb", &smb_shareops); +} diff --git a/lib/libshare/os/linux/nfs.c b/lib/libshare/os/linux/nfs.c new file mode 100644 index 000000000000..bc949213f69c --- /dev/null +++ b/lib/libshare/os/linux/nfs.c @@ -0,0 +1,712 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libshare_impl.h" +#include "nfs.h" + +#define FILE_HEADER "# !!! DO NOT EDIT THIS FILE MANUALLY !!!\n\n" +#define ZFS_EXPORTS_DIR "/etc/exports.d" +#define ZFS_EXPORTS_FILE ZFS_EXPORTS_DIR"/zfs.exports" +#define ZFS_EXPORTS_LOCK ZFS_EXPORTS_FILE".lock" + +static sa_fstype_t *nfs_fstype; + +typedef int (*nfs_shareopt_callback_t)(const char *opt, const char *value, + void *cookie); + +typedef int (*nfs_host_callback_t)(const char *sharepath, const char *filename, + const char *host, const char *security, const char *access, void *cookie); + +static int nfs_lock_fd = -1; + +/* + * The nfs_exports_[lock|unlock] functions are used to guard against concurrent + * updates to the exports file. Each protocol is responsible for + * providing the necessary locking to ensure consistency.
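+ * + * A sketch of the update sequence used by nfs_enable_share() and + * nfs_disable_share() below: + * + * nfs_init_tmpfile(); (create temporary exports file) + * nfs_exports_lock(); + * nfs_copy_entries(); (copy entries, minus this mountpoint) + * ... append any new entries ... + * nfs_fini_tmpfile(); (rename into place) + * nfs_exports_unlock();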
+ */ +static int +nfs_exports_lock(void) +{ + nfs_lock_fd = open(ZFS_EXPORTS_LOCK, + O_RDWR | O_CREAT, 0600); + if (nfs_lock_fd == -1) { + fprintf(stderr, "failed to lock %s: %s\n", + ZFS_EXPORTS_LOCK, strerror(errno)); + return (errno); + } + if (flock(nfs_lock_fd, LOCK_EX) != 0) { + fprintf(stderr, "failed to lock %s: %s\n", + ZFS_EXPORTS_LOCK, strerror(errno)); + return (errno); + } + return (0); +} + +static void +nfs_exports_unlock(void) +{ + verify(nfs_lock_fd > 0); + + if (flock(nfs_lock_fd, LOCK_UN) != 0) { + fprintf(stderr, "failed to unlock %s: %s\n", + ZFS_EXPORTS_LOCK, strerror(errno)); + } + close(nfs_lock_fd); + nfs_lock_fd = -1; +} + +/* + * Invokes the specified callback function for each Solaris share option + * listed in the specified string. + */ +static int +foreach_nfs_shareopt(const char *shareopts, + nfs_shareopt_callback_t callback, void *cookie) +{ + char *shareopts_dup, *opt, *cur, *value; + int was_nul, error; + + if (shareopts == NULL) + return (SA_OK); + + if (strcmp(shareopts, "on") == 0) + shareopts = "rw,crossmnt"; + + shareopts_dup = strdup(shareopts); + + + if (shareopts_dup == NULL) + return (SA_NO_MEMORY); + + opt = shareopts_dup; + was_nul = 0; + + while (1) { + cur = opt; + + while (*cur != ',' && *cur != '\0') + cur++; + + if (*cur == '\0') + was_nul = 1; + + *cur = '\0'; + + if (cur > opt) { + value = strchr(opt, '='); + + if (value != NULL) { + *value = '\0'; + value++; + } + + error = callback(opt, value, cookie); + + if (error != SA_OK) { + free(shareopts_dup); + return (error); + } + } + + opt = cur + 1; + + if (was_nul) + break; + } + + free(shareopts_dup); + + return (SA_OK); +} + +typedef struct nfs_host_cookie_s { + nfs_host_callback_t callback; + const char *sharepath; + void *cookie; + const char *filename; + const char *security; +} nfs_host_cookie_t; + +/* + * Helper function for foreach_nfs_host. This function checks whether the + * current share option is a host specification and invokes a callback + * function with information about the host. + */ +static int +foreach_nfs_host_cb(const char *opt, const char *value, void *pcookie) +{ + int error; + const char *access; + char *host_dup, *host, *next; + nfs_host_cookie_t *udata = (nfs_host_cookie_t *)pcookie; + +#ifdef DEBUG + fprintf(stderr, "foreach_nfs_host_cb: key=%s, value=%s\n", opt, value); +#endif + + if (strcmp(opt, "sec") == 0) + udata->security = value; + + if (strcmp(opt, "rw") == 0 || strcmp(opt, "ro") == 0) { + if (value == NULL) + value = "*"; + + access = opt; + + host_dup = strdup(value); + + if (host_dup == NULL) + return (SA_NO_MEMORY); + + host = host_dup; + + do { + next = strchr(host, ':'); + if (next != NULL) { + *next = '\0'; + next++; + } + + error = udata->callback(udata->filename, + udata->sharepath, host, udata->security, + access, udata->cookie); + + if (error != SA_OK) { + free(host_dup); + + return (error); + } + + host = next; + } while (host != NULL); + + free(host_dup); + } + + return (SA_OK); +} + +/* + * Invokes a callback function for all NFS hosts that are set for a share. 
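+ * + * For example (illustrative), shareopts of "sec=sys,rw=host1:host2,ro" + * would invoke the callback three times: for (host1, sys, rw), + * (host2, sys, rw) and (*, sys, ro).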
+ */ +static int +foreach_nfs_host(sa_share_impl_t impl_share, char *filename, + nfs_host_callback_t callback, void *cookie) +{ + nfs_host_cookie_t udata; + char *shareopts; + + udata.callback = callback; + udata.sharepath = impl_share->sa_mountpoint; + udata.cookie = cookie; + udata.filename = filename; + udata.security = "sys"; + + shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; + + return (foreach_nfs_shareopt(shareopts, foreach_nfs_host_cb, + &udata)); +} + +/* + * Converts a Solaris NFS host specification to its Linux equivalent. + */ +static int +get_linux_hostspec(const char *solaris_hostspec, char **plinux_hostspec) +{ + /* + * For now we just support CIDR masks (e.g. @192.168.0.0/16) and host + * wildcards (e.g. *.example.org). + */ + if (solaris_hostspec[0] == '@') { + /* + * Solaris host specifier, e.g. @192.168.0.0/16; we just need + * to skip the @ in this case + */ + *plinux_hostspec = strdup(solaris_hostspec + 1); + } else { + *plinux_hostspec = strdup(solaris_hostspec); + } + + if (*plinux_hostspec == NULL) { + return (SA_NO_MEMORY); + } + + return (SA_OK); +} + +/* + * Adds a Linux share option to an array of NFS options. + */ +static int +add_linux_shareopt(char **plinux_opts, const char *key, const char *value) +{ + size_t len = 0; + char *new_linux_opts; + + if (*plinux_opts != NULL) + len = strlen(*plinux_opts); + + new_linux_opts = realloc(*plinux_opts, len + 1 + strlen(key) + + (value ? 1 + strlen(value) : 0) + 1); + + if (new_linux_opts == NULL) + return (SA_NO_MEMORY); + + new_linux_opts[len] = '\0'; + + if (len > 0) + strcat(new_linux_opts, ","); + + strcat(new_linux_opts, key); + + if (value != NULL) { + strcat(new_linux_opts, "="); + strcat(new_linux_opts, value); + } + + *plinux_opts = new_linux_opts; + + return (SA_OK); +} + +/* + * Validates and converts a single Solaris share option to its Linux + * equivalent. + */ +static int +get_linux_shareopts_cb(const char *key, const char *value, void *cookie) +{ + char **plinux_opts = (char **)cookie; + + /* host-specific options, these are taken care of elsewhere */ + if (strcmp(key, "ro") == 0 || strcmp(key, "rw") == 0 || + strcmp(key, "sec") == 0) + return (SA_OK); + + if (strcmp(key, "anon") == 0) + key = "anonuid"; + + if (strcmp(key, "root_mapping") == 0) { + (void) add_linux_shareopt(plinux_opts, "root_squash", NULL); + key = "anonuid"; + } + + if (strcmp(key, "nosub") == 0) + key = "subtree_check"; + + if (strcmp(key, "insecure") != 0 && strcmp(key, "secure") != 0 && + strcmp(key, "async") != 0 && strcmp(key, "sync") != 0 && + strcmp(key, "no_wdelay") != 0 && strcmp(key, "wdelay") != 0 && + strcmp(key, "nohide") != 0 && strcmp(key, "hide") != 0 && + strcmp(key, "crossmnt") != 0 && + strcmp(key, "no_subtree_check") != 0 && + strcmp(key, "subtree_check") != 0 && + strcmp(key, "insecure_locks") != 0 && + strcmp(key, "secure_locks") != 0 && + strcmp(key, "no_auth_nlm") != 0 && strcmp(key, "auth_nlm") != 0 && + strcmp(key, "no_acl") != 0 && strcmp(key, "mountpoint") != 0 && + strcmp(key, "mp") != 0 && strcmp(key, "fsuid") != 0 && + strcmp(key, "refer") != 0 && strcmp(key, "replicas") != 0 && + strcmp(key, "root_squash") != 0 && + strcmp(key, "no_root_squash") != 0 && + strcmp(key, "all_squash") != 0 && + strcmp(key, "no_all_squash") != 0 && strcmp(key, "fsid") != 0 && + strcmp(key, "anonuid") != 0 && strcmp(key, "anongid") != 0) { + return (SA_SYNTAX_ERR); + } + + (void) add_linux_shareopt(plinux_opts, key, value); + + return (SA_OK); +} + +/* + * Takes a string containing Solaris share options (e.g. 
"sync,no_acl") and + * converts them to a NULL-terminated array of Linux NFS options. + */ +static int +get_linux_shareopts(const char *shareopts, char **plinux_opts) +{ + int error; + + assert(plinux_opts != NULL); + + *plinux_opts = NULL; + + /* no_subtree_check - Default as of nfs-utils v1.1.0 */ + (void) add_linux_shareopt(plinux_opts, "no_subtree_check", NULL); + + /* mountpoint - Restrict exports to ZFS mountpoints */ + (void) add_linux_shareopt(plinux_opts, "mountpoint", NULL); + + error = foreach_nfs_shareopt(shareopts, get_linux_shareopts_cb, + plinux_opts); + + if (error != SA_OK) { + free(*plinux_opts); + *plinux_opts = NULL; + } + + return (error); +} + +static char * +nfs_init_tmpfile(void) +{ + char *tmpfile = NULL; + + if (asprintf(&tmpfile, "%s%s", ZFS_EXPORTS_FILE, ".XXXXXXXX") == -1) { + fprintf(stderr, "Unable to allocate temporary file\n"); + return (NULL); + } + + int fd = mkstemp(tmpfile); + if (fd == -1) { + fprintf(stderr, "Unable to create temporary file: %s", + strerror(errno)); + free(tmpfile); + return (NULL); + } + close(fd); + return (tmpfile); +} + +static int +nfs_fini_tmpfile(char *tmpfile) +{ + if (rename(tmpfile, ZFS_EXPORTS_FILE) == -1) { + fprintf(stderr, "Unable to rename %s: %s\n", tmpfile, + strerror(errno)); + unlink(tmpfile); + free(tmpfile); + return (SA_SYSTEM_ERR); + } + free(tmpfile); + return (SA_OK); +} + +/* + * This function populates an entry into /etc/exports.d/zfs.exports. + * This file is consumed by the linux nfs server so that zfs shares are + * automatically exported upon boot or whenever the nfs server restarts. + */ +static int +nfs_add_entry(const char *filename, const char *sharepath, + const char *host, const char *security, const char *access_opts, + void *pcookie) +{ + int error; + char *linuxhost; + const char *linux_opts = (const char *)pcookie; + + error = get_linux_hostspec(host, &linuxhost); + if (error != SA_OK) + return (error); + + if (linux_opts == NULL) + linux_opts = ""; + + FILE *fp = fopen(filename, "a+"); + if (fp == NULL) { + fprintf(stderr, "failed to open %s file: %s", filename, + strerror(errno)); + free(linuxhost); + return (SA_SYSTEM_ERR); + } + + if (fprintf(fp, "%s %s(sec=%s,%s,%s)\n", sharepath, linuxhost, + security, access_opts, linux_opts) < 0) { + fprintf(stderr, "failed to write to %s\n", filename); + free(linuxhost); + fclose(fp); + return (SA_SYSTEM_ERR); + } + + free(linuxhost); + if (fclose(fp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + return (SA_SYSTEM_ERR); + } + return (SA_OK); +} + +/* + * This function copies all entries from the exports file to "filename", + * omitting any entries for the specified mountpoint. + */ +static int +nfs_copy_entries(char *filename, const char *mountpoint) +{ + char *buf = NULL; + size_t buflen = 0; + int error = SA_OK; + + /* + * If the file doesn't exist then there is nothing more + * we need to do. 
+ */ + FILE *oldfp = fopen(ZFS_EXPORTS_FILE, "r"); + if (oldfp == NULL) + return (SA_OK); + + FILE *newfp = fopen(filename, "w+"); + fputs(FILE_HEADER, newfp); + while ((getline(&buf, &buflen, oldfp)) != -1) { + char *space = NULL; + + if (buf[0] == '\n' || buf[0] == '#') + continue; + + if ((space = strchr(buf, ' ')) != NULL) { + int mountpoint_len = strlen(mountpoint); + + if (space - buf == mountpoint_len && + strncmp(mountpoint, buf, mountpoint_len) == 0) { + continue; + } + } + fputs(buf, newfp); + } + + if (oldfp != NULL && ferror(oldfp) != 0) { + error = ferror(oldfp); + } + if (error == 0 && ferror(newfp) != 0) { + error = ferror(newfp); + } + + free(buf); + if (fclose(newfp) != 0) { + fprintf(stderr, "Unable to close file %s: %s\n", + filename, strerror(errno)); + error = error != 0 ? error : SA_SYSTEM_ERR; + } + fclose(oldfp); + + return (error); +} + +/* + * Enables NFS sharing for the specified share. + */ +static int +nfs_enable_share(sa_share_impl_t impl_share) +{ + char *shareopts, *linux_opts; + char *filename = NULL; + int error; + + if ((filename = nfs_init_tmpfile()) == NULL) + return (SA_SYSTEM_ERR); + + error = nfs_exports_lock(); + if (error != 0) { + unlink(filename); + free(filename); + return (error); + } + + error = nfs_copy_entries(filename, impl_share->sa_mountpoint); + if (error != SA_OK) { + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (error); + } + + shareopts = FSINFO(impl_share, nfs_fstype)->shareopts; + error = get_linux_shareopts(shareopts, &linux_opts); + if (error != SA_OK) { + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (error); + } + + error = foreach_nfs_host(impl_share, filename, nfs_add_entry, + linux_opts); + free(linux_opts); + if (error == 0) { + error = nfs_fini_tmpfile(filename); + } else { + unlink(filename); + free(filename); + } + nfs_exports_unlock(); + return (error); +} + +/* + * Disables NFS sharing for the specified share. + */ +static int +nfs_disable_share(sa_share_impl_t impl_share) +{ + int error; + char *filename = NULL; + + if ((filename = nfs_init_tmpfile()) == NULL) + return (SA_SYSTEM_ERR); + + error = nfs_exports_lock(); + if (error != 0) { + unlink(filename); + free(filename); + return (error); + } + + error = nfs_copy_entries(filename, impl_share->sa_mountpoint); + if (error != SA_OK) { + unlink(filename); + free(filename); + nfs_exports_unlock(); + return (error); + } + error = nfs_fini_tmpfile(filename); + nfs_exports_unlock(); + return (error); +} + +static boolean_t +nfs_is_shared(sa_share_impl_t impl_share) +{ + size_t buflen = 0; + char *buf = NULL; + + FILE *fp = fopen(ZFS_EXPORTS_FILE, "r"); + if (fp == NULL) { + return (B_FALSE); + } + while ((getline(&buf, &buflen, fp)) != -1) { + char *space = NULL; + + if ((space = strchr(buf, ' ')) != NULL) { + int mountpoint_len = strlen(impl_share->sa_mountpoint); + + if (space - buf == mountpoint_len && + strncmp(impl_share->sa_mountpoint, buf, + mountpoint_len) == 0) { + fclose(fp); + free(buf); + return (B_TRUE); + } + } + } + free(buf); + fclose(fp); + return (B_FALSE); +} + +/* + * Checks whether the specified NFS share options are syntactically correct. 
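+ *
+ * Validation just runs the conversion and discards the result.  For
+ * example, "sync,no_acl,rw=@192.168.0.0/16" is accepted (the ro/rw/sec
+ * host lists are handled separately by foreach_nfs_host()), "anon=65534"
+ * is rewritten to "anonuid=65534", while an unknown key such as
+ * "bogus=1" makes get_linux_shareopts_cb() fail with SA_SYNTAX_ERR.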
+ */ +static int +nfs_validate_shareopts(const char *shareopts) +{ + char *linux_opts; + int error; + + error = get_linux_shareopts(shareopts, &linux_opts); + + if (error != SA_OK) + return (error); + + free(linux_opts); + return (SA_OK); +} + +static int +nfs_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = (char *)shareopts; + return (SA_OK); +} + +/* + * Clears a share's NFS options. Used by libshare to + * clean up shares that are about to be free()'d. + */ +static void +nfs_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, nfs_fstype)->shareopts = NULL; +} + +static int +nfs_commit_shares(void) +{ + char *argv[] = { + "/usr/sbin/exportfs", + "-ra", + NULL + }; + + return (libzfs_run_process(argv[0], argv, 0)); +} + +static const sa_share_ops_t nfs_shareops = { + .enable_share = nfs_enable_share, + .disable_share = nfs_disable_share, + .is_shared = nfs_is_shared, + + .validate_shareopts = nfs_validate_shareopts, + .update_shareopts = nfs_update_shareopts, + .clear_shareopts = nfs_clear_shareopts, + .commit_shares = nfs_commit_shares, +}; + +/* + * Initializes the NFS functionality of libshare. + */ +void +libshare_nfs_init(void) +{ + struct stat sb; + + nfs_fstype = register_fstype("nfs", &nfs_shareops); + + if (stat(ZFS_EXPORTS_DIR, &sb) < 0 && + mkdir(ZFS_EXPORTS_DIR, 0755) < 0) { + fprintf(stderr, "failed to create %s: %s\n", + ZFS_EXPORTS_DIR, strerror(errno)); + } +} diff --git a/lib/libshare/os/linux/smb.c b/lib/libshare/os/linux/smb.c new file mode 100644 index 000000000000..3dcf666eb695 --- /dev/null +++ b/lib/libshare/os/linux/smb.c @@ -0,0 +1,453 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2002, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011,2012 Turbo Fredriksson , based on nfs.c + * by Gunnar Beutner + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. + * + * This is an addition to the zfs device driver to add, modify and remove SMB + * shares using the 'net share' command that comes with Samba. + * + * TESTING + * Make sure that samba listens to 'localhost' (127.0.0.1) and that the options + * 'usershare max shares' and 'usershare owner only' have been reviewed/set + * accordingly (see zfs(8) for information). 
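+ *
+ * As a rough illustration (option names from smb.conf(5); the values
+ * are site-specific), the [global] section would contain something
+ * like:
+ *
+ *   usershare path = /var/lib/samba/usershares
+ *   usershare max shares = 100
+ *   usershare owner only = false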
+ * + * Once configuration in samba have been done, test that this + * works with the following three commands (in this case, my ZFS + * filesystem is called 'share/Test1'): + * + * (root)# net -U root -S 127.0.0.1 usershare add Test1 /share/Test1 \ + * "Comment: /share/Test1" "Everyone:F" + * (root)# net usershare list | grep -i test + * (root)# net -U root -S 127.0.0.1 usershare delete Test1 + * + * The first command will create a user share that gives everyone full access. + * To limit the access below that, use normal UNIX commands (chmod, chown etc). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libshare_impl.h" +#include "smb.h" + +static boolean_t smb_available(void); + +static sa_fstype_t *smb_fstype; + +smb_share_t *smb_shares; +static int smb_disable_share(sa_share_impl_t impl_share); +static boolean_t smb_is_share_active(sa_share_impl_t impl_share); + +/* + * Retrieve the list of SMB shares. + */ +static int +smb_retrieve_shares(void) +{ + int rc = SA_OK; + char file_path[PATH_MAX], line[512], *token, *key, *value; + char *dup_value = NULL, *path = NULL, *comment = NULL, *name = NULL; + char *guest_ok = NULL; + DIR *shares_dir; + FILE *share_file_fp = NULL; + struct dirent *directory; + struct stat eStat; + smb_share_t *shares, *new_shares = NULL; + + /* opendir(), stat() */ + shares_dir = opendir(SHARE_DIR); + if (shares_dir == NULL) + return (SA_SYSTEM_ERR); + + /* Go through the directory, looking for shares */ + while ((directory = readdir(shares_dir))) { + if (directory->d_name[0] == '.') + continue; + + snprintf(file_path, sizeof (file_path), + "%s/%s", SHARE_DIR, directory->d_name); + + if (stat(file_path, &eStat) == -1) { + rc = SA_SYSTEM_ERR; + goto out; + } + + if (!S_ISREG(eStat.st_mode)) + continue; + + if ((share_file_fp = fopen(file_path, "r")) == NULL) { + rc = SA_SYSTEM_ERR; + goto out; + } + + name = strdup(directory->d_name); + if (name == NULL) { + rc = SA_NO_MEMORY; + goto out; + } + + while (fgets(line, sizeof (line), share_file_fp)) { + if (line[0] == '#') + continue; + + /* Trim trailing new-line character(s). 
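+ *
+ * (For reference, each usershare file parsed here is a plain
+ * key=value list, e.g.:
+ *
+ *   path=/share/Test1
+ *   comment=Comment: /share/Test1
+ *   usershare_acl=S-1-1-0:F
+ *   guest_ok=y
+ *
+ * and only the path, comment and guest_ok keys are consumed below.)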
*/ + while (line[strlen(line) - 1] == '\r' || + line[strlen(line) - 1] == '\n') + line[strlen(line) - 1] = '\0'; + + /* Split the line in two, separated by '=' */ + token = strchr(line, '='); + if (token == NULL) + continue; + + key = line; + value = token + 1; + *token = '\0'; + + dup_value = strdup(value); + if (dup_value == NULL) { + rc = SA_NO_MEMORY; + goto out; + } + + if (strcmp(key, "path") == 0) { + free(path); + path = dup_value; + } else if (strcmp(key, "comment") == 0) { + free(comment); + comment = dup_value; + } else if (strcmp(key, "guest_ok") == 0) { + free(guest_ok); + guest_ok = dup_value; + } else + free(dup_value); + + dup_value = NULL; + + if (path == NULL || comment == NULL || guest_ok == NULL) + continue; /* Incomplete share definition */ + else { + shares = (smb_share_t *) + malloc(sizeof (smb_share_t)); + if (shares == NULL) { + rc = SA_NO_MEMORY; + goto out; + } + + (void) strlcpy(shares->name, name, + sizeof (shares->name)); + + (void) strlcpy(shares->path, path, + sizeof (shares->path)); + + (void) strlcpy(shares->comment, comment, + sizeof (shares->comment)); + + shares->guest_ok = atoi(guest_ok); + + shares->next = new_shares; + new_shares = shares; + + free(path); + free(comment); + free(guest_ok); + + path = NULL; + comment = NULL; + guest_ok = NULL; + } + } + +out: + if (share_file_fp != NULL) { + fclose(share_file_fp); + share_file_fp = NULL; + } + + free(name); + free(path); + free(comment); + free(guest_ok); + + name = NULL; + path = NULL; + comment = NULL; + guest_ok = NULL; + } + closedir(shares_dir); + + smb_shares = new_shares; + + return (rc); +} + +/* + * Used internally by smb_enable_share to enable sharing for a single host. + */ +static int +smb_enable_share_one(const char *sharename, const char *sharepath) +{ + char *argv[10], *pos; + char name[SMB_NAME_MAX], comment[SMB_COMMENT_MAX]; + int rc; + + /* Support ZFS share name regexp '[[:alnum:]_-.: ]' */ + strlcpy(name, sharename, sizeof (name)); + name [sizeof (name)-1] = '\0'; + + pos = name; + while (*pos != '\0') { + switch (*pos) { + case '/': + case '-': + case ':': + case ' ': + *pos = '_'; + } + + ++pos; + } + + /* + * CMD: net -S NET_CMD_ARG_HOST usershare add Test1 /share/Test1 \ + * "Comment" "Everyone:F" + */ + snprintf(comment, sizeof (comment), "Comment: %s", sharepath); + + argv[0] = NET_CMD_PATH; + argv[1] = (char *)"-S"; + argv[2] = NET_CMD_ARG_HOST; + argv[3] = (char *)"usershare"; + argv[4] = (char *)"add"; + argv[5] = (char *)name; + argv[6] = (char *)sharepath; + argv[7] = (char *)comment; + argv[8] = "Everyone:F"; + argv[9] = NULL; + + rc = libzfs_run_process(argv[0], argv, 0); + if (rc < 0) + return (SA_SYSTEM_ERR); + + /* Reload the share file */ + (void) smb_retrieve_shares(); + + return (SA_OK); +} + +/* + * Enables SMB sharing for the specified share. + */ +static int +smb_enable_share(sa_share_impl_t impl_share) +{ + char *shareopts; + + if (!smb_available()) + return (SA_SYSTEM_ERR); + + if (smb_is_share_active(impl_share)) + smb_disable_share(impl_share); + + shareopts = FSINFO(impl_share, smb_fstype)->shareopts; + if (shareopts == NULL) /* on/off */ + return (SA_SYSTEM_ERR); + + if (strcmp(shareopts, "off") == 0) + return (SA_OK); + + /* Magic: Enable (i.e., 'create new') share */ + return (smb_enable_share_one(impl_share->sa_zfsname, + impl_share->sa_mountpoint)); +} + +/* + * Used internally by smb_disable_share to disable sharing for a single host. 
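+ *
+ * The name passed in is the sanitized one produced by
+ * smb_enable_share_one() ('/', '-', ':' and ' ' all become '_', so
+ * dataset "tank/my-data" is shared as "tank_my_data"), as read back
+ * from the usershare directory by smb_retrieve_shares().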
+ */ +static int +smb_disable_share_one(const char *sharename) +{ + int rc; + char *argv[7]; + + /* CMD: net -S NET_CMD_ARG_HOST usershare delete Test1 */ + argv[0] = NET_CMD_PATH; + argv[1] = (char *)"-S"; + argv[2] = NET_CMD_ARG_HOST; + argv[3] = (char *)"usershare"; + argv[4] = (char *)"delete"; + argv[5] = strdup(sharename); + argv[6] = NULL; + + rc = libzfs_run_process(argv[0], argv, 0); + if (rc < 0) + return (SA_SYSTEM_ERR); + else + return (SA_OK); +} + +/* + * Disables SMB sharing for the specified share. + */ +static int +smb_disable_share(sa_share_impl_t impl_share) +{ + smb_share_t *shares = smb_shares; + + if (!smb_available()) { + /* + * The share can't possibly be active, so nothing + * needs to be done to disable it. + */ + return (SA_OK); + } + + while (shares != NULL) { + if (strcmp(impl_share->sa_mountpoint, shares->path) == 0) + return (smb_disable_share_one(shares->name)); + + shares = shares->next; + } + + return (SA_OK); +} + +/* + * Checks whether the specified SMB share options are syntactically correct. + */ +static int +smb_validate_shareopts(const char *shareopts) +{ + /* TODO: Accept 'name' and sec/acl (?) */ + if ((strcmp(shareopts, "off") == 0) || (strcmp(shareopts, "on") == 0)) + return (SA_OK); + + return (SA_SYNTAX_ERR); +} + +/* + * Checks whether a share is currently active. + */ +static boolean_t +smb_is_share_active(sa_share_impl_t impl_share) +{ + smb_share_t *iter = smb_shares; + + if (!smb_available()) + return (B_FALSE); + + /* Retrieve the list of (possible) active shares */ + smb_retrieve_shares(); + + while (iter != NULL) { + if (strcmp(impl_share->sa_mountpoint, iter->path) == 0) + return (B_TRUE); + + iter = iter->next; + } + + return (B_FALSE); +} + +/* + * Called to update a share's options. A share's options might be out of + * date if the share was loaded from disk and the "sharesmb" dataset + * property has changed in the meantime. This function also takes care + * of re-enabling the share if necessary. + */ +static int +smb_update_shareopts(sa_share_impl_t impl_share, const char *shareopts) +{ + if (!impl_share) + return (SA_SYSTEM_ERR); + + FSINFO(impl_share, smb_fstype)->shareopts = (char *)shareopts; + return (SA_OK); +} + +static int +smb_update_shares(void) +{ + /* Not implemented */ + return (0); +} + +/* + * Clears a share's SMB options. Used by libshare to + * clean up shares that are about to be free()'d. + */ +static void +smb_clear_shareopts(sa_share_impl_t impl_share) +{ + FSINFO(impl_share, smb_fstype)->shareopts = NULL; +} + +static const sa_share_ops_t smb_shareops = { + .enable_share = smb_enable_share, + .disable_share = smb_disable_share, + .is_shared = smb_is_share_active, + + .validate_shareopts = smb_validate_shareopts, + .update_shareopts = smb_update_shareopts, + .clear_shareopts = smb_clear_shareopts, + .commit_shares = smb_update_shares, +}; + +/* + * Provides a convenient wrapper for determining SMB availability + */ +static boolean_t +smb_available(void) +{ + struct stat statbuf; + + if (lstat(SHARE_DIR, &statbuf) != 0 || + !S_ISDIR(statbuf.st_mode)) + return (B_FALSE); + + if (access(NET_CMD_PATH, F_OK) != 0) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Initializes the SMB functionality of libshare. 
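+ *
+ * Unlike libshare_nfs_init(), no directories are created here;
+ * smb_available() simply refuses to do anything until SHARE_DIR
+ * exists and the Samba "net" utility is present at NET_CMD_PATH.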
+ */ +void +libshare_smb_init(void) +{ + smb_fstype = register_fstype("smb", &smb_shareops); +} diff --git a/lib/libshare/smb.h b/lib/libshare/smb.h new file mode 100644 index 000000000000..8ea44677f9ae --- /dev/null +++ b/lib/libshare/smb.h @@ -0,0 +1,49 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011 Turbo Fredriksson . + */ + +/* + * The maximum SMB share name seems to be 254 characters, though good + * references are hard to find. + */ + +#define SMB_NAME_MAX 255 +#define SMB_COMMENT_MAX 255 + +#define SHARE_DIR "/var/lib/samba/usershares" +#define NET_CMD_PATH "/usr/bin/net" +#define NET_CMD_ARG_HOST "127.0.0.1" + +typedef struct smb_share_s { + char name[SMB_NAME_MAX]; /* Share name */ + char path[PATH_MAX]; /* Share path */ + char comment[SMB_COMMENT_MAX]; /* Share's comment */ + boolean_t guest_ok; /* 'y' or 'n' */ + + struct smb_share_s *next; +} smb_share_t; + +extern smb_share_t *smb_shares; + +void libshare_smb_init(void); diff --git a/lib/libspl/Makefile.am b/lib/libspl/Makefile.am new file mode 100644 index 000000000000..f576d69248fa --- /dev/null +++ b/lib/libspl/Makefile.am @@ -0,0 +1,56 @@ +include $(top_srcdir)/config/Rules.am + +if TARGET_CPU_I386 +TARGET_CPU_ATOMIC_SOURCE = asm-i386/atomic.S +else +if TARGET_CPU_X86_64 +TARGET_CPU_ATOMIC_SOURCE = asm-x86_64/atomic.S +else +TARGET_CPU_ATOMIC_SOURCE = asm-generic/atomic.c +endif +endif + +SUBDIRS = include + +AM_CCASFLAGS = \ + $(CFLAGS) + +noinst_LTLIBRARIES = libspl_assert.la libspl.la + +libspl_assert_la_SOURCES = \ + assert.c + +USER_C = \ + list.c \ + mkdirp.c \ + page.c \ + strlcat.c \ + strlcpy.c \ + timestamp.c \ + zone.c \ + include/sys/list.h \ + include/sys/list_impl.h + +if BUILD_LINUX +USER_C += \ + os/linux/getexecname.c \ + os/linux/gethostid.c \ + os/linux/getmntany.c +endif + +if BUILD_FREEBSD +USER_C += \ + os/freebsd/getexecname.c \ + os/freebsd/gethostid.c \ + os/freebsd/getmntany.c \ + os/freebsd/mnttab.c +endif + +libspl_la_SOURCES = \ + $(USER_C) \ + $(TARGET_CPU_ATOMIC_SOURCE) + +libspl_la_LIBADD = \ + libspl_assert.la + +libspl_la_LIBADD += $(LIBCLOCK_GETTIME) diff --git a/lib/libspl/asm-generic/.gitignore b/lib/libspl/asm-generic/.gitignore new file mode 100644 index 000000000000..2792cf7b4551 --- /dev/null +++ b/lib/libspl/asm-generic/.gitignore @@ -0,0 +1 @@ +/atomic.S diff --git a/lib/libspl/asm-generic/atomic.c b/lib/libspl/asm-generic/atomic.c new file mode 100644 index 000000000000..35535ea49c79 --- /dev/null +++ b/lib/libspl/asm-generic/atomic.c @@ -0,0 +1,450 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). 
You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009 by Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include + +/* + * All operations are implemented by serializing them through a global + * pthread mutex. This provides a correct generic implementation. + * However all supported architectures are encouraged to provide a + * native implementation is assembly for performance reasons. + */ +pthread_mutex_t atomic_lock = PTHREAD_MUTEX_INITIALIZER; + +/* + * These are the void returning variants + */ +/* BEGIN CSTYLED */ +#define ATOMIC_INC(name, type) \ + void atomic_inc_##name(volatile type *target) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + (*target)++; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_INC(8, uint8_t) +ATOMIC_INC(uchar, uchar_t) +ATOMIC_INC(16, uint16_t) +ATOMIC_INC(ushort, ushort_t) +ATOMIC_INC(32, uint32_t) +ATOMIC_INC(uint, uint_t) +ATOMIC_INC(ulong, ulong_t) +ATOMIC_INC(64, uint64_t) + + +#define ATOMIC_DEC(name, type) \ + void atomic_dec_##name(volatile type *target) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + (*target)--; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_DEC(8, uint8_t) +ATOMIC_DEC(uchar, uchar_t) +ATOMIC_DEC(16, uint16_t) +ATOMIC_DEC(ushort, ushort_t) +ATOMIC_DEC(32, uint32_t) +ATOMIC_DEC(uint, uint_t) +ATOMIC_DEC(ulong, ulong_t) +ATOMIC_DEC(64, uint64_t) + + +#define ATOMIC_ADD(name, type1, type2) \ + void atomic_add_##name(volatile type1 *target, type2 bits) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + *target += bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_ADD(8, uint8_t, int8_t) +ATOMIC_ADD(char, uchar_t, signed char) +ATOMIC_ADD(16, uint16_t, int16_t) +ATOMIC_ADD(short, ushort_t, short) +ATOMIC_ADD(32, uint32_t, int32_t) +ATOMIC_ADD(int, uint_t, int) +ATOMIC_ADD(long, ulong_t, long) +ATOMIC_ADD(64, uint64_t, int64_t) + +void +atomic_add_ptr(volatile void *target, ssize_t bits) +{ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + *(caddr_t *)target += bits; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); +} + + +#define ATOMIC_SUB(name, type1, type2) \ + void atomic_sub_##name(volatile type1 *target, type2 bits) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + *target -= bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_SUB(8, uint8_t, int8_t) +ATOMIC_SUB(char, uchar_t, signed char) +ATOMIC_SUB(16, uint16_t, int16_t) +ATOMIC_SUB(short, ushort_t, short) +ATOMIC_SUB(32, uint32_t, int32_t) +ATOMIC_SUB(int, uint_t, int) +ATOMIC_SUB(long, ulong_t, long) +ATOMIC_SUB(64, uint64_t, int64_t) + +void +atomic_sub_ptr(volatile void *target, ssize_t bits) +{ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + *(caddr_t *)target -= bits; + 
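+	/* The caddr_t cast makes this plain byte-offset arithmetic. */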
VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); +} + + +#define ATOMIC_OR(name, type) \ + void atomic_or_##name(volatile type *target, type bits) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + *target |= bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_OR(8, uint8_t) +ATOMIC_OR(uchar, uchar_t) +ATOMIC_OR(16, uint16_t) +ATOMIC_OR(ushort, ushort_t) +ATOMIC_OR(32, uint32_t) +ATOMIC_OR(uint, uint_t) +ATOMIC_OR(ulong, ulong_t) +ATOMIC_OR(64, uint64_t) + + +#define ATOMIC_AND(name, type) \ + void atomic_and_##name(volatile type *target, type bits) \ + { \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + *target &= bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + } + +ATOMIC_AND(8, uint8_t) +ATOMIC_AND(uchar, uchar_t) +ATOMIC_AND(16, uint16_t) +ATOMIC_AND(ushort, ushort_t) +ATOMIC_AND(32, uint32_t) +ATOMIC_AND(uint, uint_t) +ATOMIC_AND(ulong, ulong_t) +ATOMIC_AND(64, uint64_t) + + +/* + * New value returning variants + */ + +#define ATOMIC_INC_NV(name, type) \ + type atomic_inc_##name##_nv(volatile type *target) \ + { \ + type rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (++(*target)); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (rc); \ + } + +ATOMIC_INC_NV(8, uint8_t) +ATOMIC_INC_NV(uchar, uchar_t) +ATOMIC_INC_NV(16, uint16_t) +ATOMIC_INC_NV(ushort, ushort_t) +ATOMIC_INC_NV(32, uint32_t) +ATOMIC_INC_NV(uint, uint_t) +ATOMIC_INC_NV(ulong, ulong_t) +ATOMIC_INC_NV(64, uint64_t) + + +#define ATOMIC_DEC_NV(name, type) \ + type atomic_dec_##name##_nv(volatile type *target) \ + { \ + type rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (--(*target)); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (rc); \ + } + +ATOMIC_DEC_NV(8, uint8_t) +ATOMIC_DEC_NV(uchar, uchar_t) +ATOMIC_DEC_NV(16, uint16_t) +ATOMIC_DEC_NV(ushort, ushort_t) +ATOMIC_DEC_NV(32, uint32_t) +ATOMIC_DEC_NV(uint, uint_t) +ATOMIC_DEC_NV(ulong, ulong_t) +ATOMIC_DEC_NV(64, uint64_t) + + +#define ATOMIC_ADD_NV(name, type1, type2) \ + type1 atomic_add_##name##_nv(volatile type1 *target, type2 bits)\ + { \ + type1 rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (*target += bits); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (rc); \ + } + +ATOMIC_ADD_NV(8, uint8_t, int8_t) +ATOMIC_ADD_NV(char, uchar_t, signed char) +ATOMIC_ADD_NV(16, uint16_t, int16_t) +ATOMIC_ADD_NV(short, ushort_t, short) +ATOMIC_ADD_NV(32, uint32_t, int32_t) +ATOMIC_ADD_NV(int, uint_t, int) +ATOMIC_ADD_NV(long, ulong_t, long) +ATOMIC_ADD_NV(64, uint64_t, int64_t) + +void * +atomic_add_ptr_nv(volatile void *target, ssize_t bits) +{ + void *ptr; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + ptr = (*(caddr_t *)target += bits); + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return (ptr); +} + + +#define ATOMIC_SUB_NV(name, type1, type2) \ + type1 atomic_sub_##name##_nv(volatile type1 *target, type2 bits)\ + { \ + type1 rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (*target -= bits); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (rc); \ + } + +ATOMIC_SUB_NV(8, uint8_t, int8_t) +ATOMIC_SUB_NV(char, uchar_t, signed char) +ATOMIC_SUB_NV(16, uint16_t, int16_t) +ATOMIC_SUB_NV(short, ushort_t, short) +ATOMIC_SUB_NV(32, uint32_t, int32_t) +ATOMIC_SUB_NV(int, uint_t, int) +ATOMIC_SUB_NV(long, ulong_t, long) +ATOMIC_SUB_NV(64, uint64_t, int64_t) + +void * +atomic_sub_ptr_nv(volatile void *target, ssize_t bits) +{ + 
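+	/* Same as atomic_sub_ptr(), but returns the updated pointer. */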
void *ptr; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + ptr = (*(caddr_t *)target -= bits); + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return (ptr); +} + + +#define ATOMIC_OR_NV(name, type) \ + type atomic_or_##name##_nv(volatile type *target, type bits) \ + { \ + type rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (*target |= bits); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (rc); \ + } + +ATOMIC_OR_NV(8, uint8_t) +ATOMIC_OR_NV(uchar, uchar_t) +ATOMIC_OR_NV(16, uint16_t) +ATOMIC_OR_NV(ushort, ushort_t) +ATOMIC_OR_NV(32, uint32_t) +ATOMIC_OR_NV(uint, uint_t) +ATOMIC_OR_NV(ulong, ulong_t) +ATOMIC_OR_NV(64, uint64_t) + + +#define ATOMIC_AND_NV(name, type) \ + type atomic_and_##name##_nv(volatile type *target, type bits) \ + { \ + type rc; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + rc = (*target &= bits); \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (rc); \ + } + +ATOMIC_AND_NV(8, uint8_t) +ATOMIC_AND_NV(uchar, uchar_t) +ATOMIC_AND_NV(16, uint16_t) +ATOMIC_AND_NV(ushort, ushort_t) +ATOMIC_AND_NV(32, uint32_t) +ATOMIC_AND_NV(uint, uint_t) +ATOMIC_AND_NV(ulong, ulong_t) +ATOMIC_AND_NV(64, uint64_t) + + +/* + * If *arg1 == arg2, set *arg1 = arg3; return old value + */ + +#define ATOMIC_CAS(name, type) \ + type atomic_cas_##name(volatile type *target, type arg1, type arg2) \ + { \ + type old; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + old = *target; \ + if (old == arg1) \ + *target = arg2; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (old); \ + } + +ATOMIC_CAS(8, uint8_t) +ATOMIC_CAS(uchar, uchar_t) +ATOMIC_CAS(16, uint16_t) +ATOMIC_CAS(ushort, ushort_t) +ATOMIC_CAS(32, uint32_t) +ATOMIC_CAS(uint, uint_t) +ATOMIC_CAS(ulong, ulong_t) +ATOMIC_CAS(64, uint64_t) + +void * +atomic_cas_ptr(volatile void *target, void *arg1, void *arg2) +{ + void *old; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + old = *(void **)target; + if (old == arg1) + *(void **)target = arg2; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return (old); +} + + +/* + * Swap target and return old value + */ + +#define ATOMIC_SWAP(name, type) \ + type atomic_swap_##name(volatile type *target, type bits) \ + { \ + type old; \ + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); \ + old = *target; \ + *target = bits; \ + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); \ + return (old); \ + } + +ATOMIC_SWAP(8, uint8_t) +ATOMIC_SWAP(uchar, uchar_t) +ATOMIC_SWAP(16, uint16_t) +ATOMIC_SWAP(ushort, ushort_t) +ATOMIC_SWAP(32, uint32_t) +ATOMIC_SWAP(uint, uint_t) +ATOMIC_SWAP(ulong, ulong_t) +ATOMIC_SWAP(64, uint64_t) +/* END CSTYLED */ + +void * +atomic_swap_ptr(volatile void *target, void *bits) +{ + void *old; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + old = *(void **)target; + *(void **)target = bits; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return (old); +} + + +int +atomic_set_long_excl(volatile ulong_t *target, uint_t value) +{ + ulong_t bit; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + bit = (1UL << value); + if ((*target & bit) != 0) { + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + return (-1); + } + *target |= bit; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return (0); +} + +int +atomic_clear_long_excl(volatile ulong_t *target, uint_t value) +{ + ulong_t bit; + + VERIFY3S(pthread_mutex_lock(&atomic_lock), ==, 0); + bit = (1UL << value); + if ((*target & bit) == 0) { + 
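+		/* Bit was already clear: fail without modifying the word. */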
VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + return (-1); + } + *target &= ~bit; + VERIFY3S(pthread_mutex_unlock(&atomic_lock), ==, 0); + + return (0); +} + +void +membar_enter(void) +{ + /* XXX - Implement me */ +} + +void +membar_exit(void) +{ + /* XXX - Implement me */ +} + +void +membar_producer(void) +{ + /* XXX - Implement me */ +} + +void +membar_consumer(void) +{ + /* XXX - Implement me */ +} diff --git a/lib/libspl/asm-i386/atomic.S b/lib/libspl/asm-i386/atomic.S new file mode 100644 index 000000000000..7a574b0d1729 --- /dev/null +++ b/lib/libspl/asm-i386/atomic.S @@ -0,0 +1,840 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + .ident "%Z%%M% %I% %E% SMI" + + .file "%M%" + +#define _ASM +#ifdef __linux__ +#include +#elif __FreeBSD__ +#include +#define SET_SIZE(x) +#endif + ENTRY(atomic_inc_8) + ALTENTRY(atomic_inc_uchar) + movl 4(%esp), %eax + lock + incb (%eax) + ret + SET_SIZE(atomic_inc_uchar) + SET_SIZE(atomic_inc_8) + + ENTRY(atomic_inc_16) + ALTENTRY(atomic_inc_ushort) + movl 4(%esp), %eax + lock + incw (%eax) + ret + SET_SIZE(atomic_inc_ushort) + SET_SIZE(atomic_inc_16) + + ENTRY(atomic_inc_32) + ALTENTRY(atomic_inc_uint) + ALTENTRY(atomic_inc_ulong) + movl 4(%esp), %eax + lock + incl (%eax) + ret + SET_SIZE(atomic_inc_ulong) + SET_SIZE(atomic_inc_uint) + SET_SIZE(atomic_inc_32) + + ENTRY(atomic_inc_8_nv) + ALTENTRY(atomic_inc_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + leal 1(%eax), %ecx + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_inc_uchar_nv) + SET_SIZE(atomic_inc_8_nv) + + ENTRY(atomic_inc_16_nv) + ALTENTRY(atomic_inc_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + leal 1(%eax), %ecx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_inc_ushort_nv) + SET_SIZE(atomic_inc_16_nv) + + ENTRY(atomic_inc_32_nv) + ALTENTRY(atomic_inc_uint_nv) + ALTENTRY(atomic_inc_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + leal 1(%eax), %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_inc_ulong_nv) + SET_SIZE(atomic_inc_uint_nv) + SET_SIZE(atomic_inc_32_nv) + + /* + * NOTE: If atomic_inc_64 and atomic_inc_64_nv are ever + * separated, you need to also edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_inc_64_nv. 
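+ *
+ * i386 has no 64-bit "lock inc", so the routine below loops on
+ * cmpxchg8b: the old value is loaded into %edx:%eax, old + 1 is
+ * built in %ecx:%ebx, and the compare-exchange is retried until
+ * it succeeds.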
+ */ + ENTRY(atomic_inc_64) + ALTENTRY(atomic_inc_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + xorl %ebx, %ebx + xorl %ecx, %ecx + incl %ebx + addl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_inc_64_nv) + SET_SIZE(atomic_inc_64) + + ENTRY(atomic_dec_8) + ALTENTRY(atomic_dec_uchar) + movl 4(%esp), %eax + lock + decb (%eax) + ret + SET_SIZE(atomic_dec_uchar) + SET_SIZE(atomic_dec_8) + + ENTRY(atomic_dec_16) + ALTENTRY(atomic_dec_ushort) + movl 4(%esp), %eax + lock + decw (%eax) + ret + SET_SIZE(atomic_dec_ushort) + SET_SIZE(atomic_dec_16) + + ENTRY(atomic_dec_32) + ALTENTRY(atomic_dec_uint) + ALTENTRY(atomic_dec_ulong) + movl 4(%esp), %eax + lock + decl (%eax) + ret + SET_SIZE(atomic_dec_ulong) + SET_SIZE(atomic_dec_uint) + SET_SIZE(atomic_dec_32) + + ENTRY(atomic_dec_8_nv) + ALTENTRY(atomic_dec_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + leal -1(%eax), %ecx + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_dec_uchar_nv) + SET_SIZE(atomic_dec_8_nv) + + ENTRY(atomic_dec_16_nv) + ALTENTRY(atomic_dec_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + leal -1(%eax), %ecx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_dec_ushort_nv) + SET_SIZE(atomic_dec_16_nv) + + ENTRY(atomic_dec_32_nv) + ALTENTRY(atomic_dec_uint_nv) + ALTENTRY(atomic_dec_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + leal -1(%eax), %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_dec_ulong_nv) + SET_SIZE(atomic_dec_uint_nv) + SET_SIZE(atomic_dec_32_nv) + + /* + * NOTE: If atomic_dec_64 and atomic_dec_64_nv are ever + * separated, it is important to edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_dec_64_nv. 
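+ *
+ * The decrement reuses the same cmpxchg8b retry loop, building the
+ * constant -1 in %ecx:%ebx with xor/not and adding it with add/adc
+ * rather than subtracting.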
+ */ + ENTRY(atomic_dec_64) + ALTENTRY(atomic_dec_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + xorl %ebx, %ebx + xorl %ecx, %ecx + not %ecx + not %ebx + addl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_dec_64_nv) + SET_SIZE(atomic_dec_64) + + ENTRY(atomic_add_8) + ALTENTRY(atomic_add_char) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + addb %cl, (%eax) + ret + SET_SIZE(atomic_add_char) + SET_SIZE(atomic_add_8) + + ENTRY(atomic_add_16) + ALTENTRY(atomic_add_short) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + addw %cx, (%eax) + ret + SET_SIZE(atomic_add_short) + SET_SIZE(atomic_add_16) + + ENTRY(atomic_add_32) + ALTENTRY(atomic_add_int) + ALTENTRY(atomic_add_ptr) + ALTENTRY(atomic_add_long) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + addl %ecx, (%eax) + ret + SET_SIZE(atomic_add_long) + SET_SIZE(atomic_add_ptr) + SET_SIZE(atomic_add_int) + SET_SIZE(atomic_add_32) + + ENTRY(atomic_sub_8) + ALTENTRY(atomic_sub_char) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subb %cl, (%eax) + ret + SET_SIZE(atomic_sub_char) + SET_SIZE(atomic_sub_8) + + ENTRY(atomic_sub_16) + ALTENTRY(atomic_sub_short) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subw %cx, (%eax) + ret + SET_SIZE(atomic_sub_short) + SET_SIZE(atomic_sub_16) + + ENTRY(atomic_sub_32) + ALTENTRY(atomic_sub_int) + ALTENTRY(atomic_sub_ptr) + ALTENTRY(atomic_sub_long) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + subl %ecx, (%eax) + ret + SET_SIZE(atomic_sub_long) + SET_SIZE(atomic_sub_ptr) + SET_SIZE(atomic_sub_int) + SET_SIZE(atomic_sub_32) + + ENTRY(atomic_or_8) + ALTENTRY(atomic_or_uchar) + movl 4(%esp), %eax + movb 8(%esp), %cl + lock + orb %cl, (%eax) + ret + SET_SIZE(atomic_or_uchar) + SET_SIZE(atomic_or_8) + + ENTRY(atomic_or_16) + ALTENTRY(atomic_or_ushort) + movl 4(%esp), %eax + movw 8(%esp), %cx + lock + orw %cx, (%eax) + ret + SET_SIZE(atomic_or_ushort) + SET_SIZE(atomic_or_16) + + ENTRY(atomic_or_32) + ALTENTRY(atomic_or_uint) + ALTENTRY(atomic_or_ulong) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + orl %ecx, (%eax) + ret + SET_SIZE(atomic_or_ulong) + SET_SIZE(atomic_or_uint) + SET_SIZE(atomic_or_32) + + ENTRY(atomic_and_8) + ALTENTRY(atomic_and_uchar) + movl 4(%esp), %eax + movb 8(%esp), %cl + lock + andb %cl, (%eax) + ret + SET_SIZE(atomic_and_uchar) + SET_SIZE(atomic_and_8) + + ENTRY(atomic_and_16) + ALTENTRY(atomic_and_ushort) + movl 4(%esp), %eax + movw 8(%esp), %cx + lock + andw %cx, (%eax) + ret + SET_SIZE(atomic_and_ushort) + SET_SIZE(atomic_and_16) + + ENTRY(atomic_and_32) + ALTENTRY(atomic_and_uint) + ALTENTRY(atomic_and_ulong) + movl 4(%esp), %eax + movl 8(%esp), %ecx + lock + andl %ecx, (%eax) + ret + SET_SIZE(atomic_and_ulong) + SET_SIZE(atomic_and_uint) + SET_SIZE(atomic_and_32) + + ENTRY(atomic_add_8_nv) + ALTENTRY(atomic_add_char_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + addb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_add_char_nv) + SET_SIZE(atomic_add_8_nv) + + ENTRY(atomic_add_16_nv) + ALTENTRY(atomic_add_short_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + addw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_add_short_nv) + SET_SIZE(atomic_add_16_nv) + + ENTRY(atomic_add_32_nv) + ALTENTRY(atomic_add_int_nv) + ALTENTRY(atomic_add_ptr_nv) + ALTENTRY(atomic_add_long_nv) + movl 
4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + addl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_add_long_nv) + SET_SIZE(atomic_add_ptr_nv) + SET_SIZE(atomic_add_int_nv) + SET_SIZE(atomic_add_32_nv) + + ENTRY(atomic_sub_8_nv) + ALTENTRY(atomic_sub_char_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + subb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_sub_char_nv) + SET_SIZE(atomic_sub_8_nv) + + ENTRY(atomic_sub_16_nv) + ALTENTRY(atomic_sub_short_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + subw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_sub_short_nv) + SET_SIZE(atomic_sub_16_nv) + + ENTRY(atomic_sub_32_nv) + ALTENTRY(atomic_sub_int_nv) + ALTENTRY(atomic_sub_ptr_nv) + ALTENTRY(atomic_sub_long_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + subl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_sub_long_nv) + SET_SIZE(atomic_sub_ptr_nv) + SET_SIZE(atomic_sub_int_nv) + SET_SIZE(atomic_sub_32_nv) + + /* + * NOTE: If atomic_add_64 and atomic_add_64_nv are ever + * separated, it is important to edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_add_64_nv. + */ + ENTRY(atomic_add_64) + ALTENTRY(atomic_add_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + addl %eax, %ebx + adcl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_add_64_nv) + SET_SIZE(atomic_add_64) + + ENTRY(atomic_sub_64) + ALTENTRY(atomic_sub_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + subl %eax, %ebx + sbbl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_sub_64_nv) + SET_SIZE(atomic_sub_64) + + ENTRY(atomic_or_8_nv) + ALTENTRY(atomic_or_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + orb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_or_uchar_nv) + SET_SIZE(atomic_or_8_nv) + + ENTRY(atomic_or_16_nv) + ALTENTRY(atomic_or_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + orw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_or_ushort_nv) + SET_SIZE(atomic_or_16_nv) + + ENTRY(atomic_or_32_nv) + ALTENTRY(atomic_or_uint_nv) + ALTENTRY(atomic_or_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + orl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_or_ulong_nv) + SET_SIZE(atomic_or_uint_nv) + SET_SIZE(atomic_or_32_nv) + + /* + * NOTE: If atomic_or_64 and atomic_or_64_nv are ever + * separated, it is important to edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_or_64_nv. 
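+ *
+ * As in the other 64-bit routines here, the new value is computed in
+ * %ecx:%ebx and copied to %edx:%eax before returning, since i386
+ * returns 64-bit results in that register pair.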
+ */ + ENTRY(atomic_or_64) + ALTENTRY(atomic_or_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + orl %eax, %ebx + orl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_or_64_nv) + SET_SIZE(atomic_or_64) + + ENTRY(atomic_and_8_nv) + ALTENTRY(atomic_and_uchar_nv) + movl 4(%esp), %edx + movb (%edx), %al +1: + movl 8(%esp), %ecx + andb %al, %cl + lock + cmpxchgb %cl, (%edx) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_and_uchar_nv) + SET_SIZE(atomic_and_8_nv) + + ENTRY(atomic_and_16_nv) + ALTENTRY(atomic_and_ushort_nv) + movl 4(%esp), %edx + movw (%edx), %ax +1: + movl 8(%esp), %ecx + andw %ax, %cx + lock + cmpxchgw %cx, (%edx) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_and_ushort_nv) + SET_SIZE(atomic_and_16_nv) + + ENTRY(atomic_and_32_nv) + ALTENTRY(atomic_and_uint_nv) + ALTENTRY(atomic_and_ulong_nv) + movl 4(%esp), %edx + movl (%edx), %eax +1: + movl 8(%esp), %ecx + andl %eax, %ecx + lock + cmpxchgl %ecx, (%edx) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_and_ulong_nv) + SET_SIZE(atomic_and_uint_nv) + SET_SIZE(atomic_and_32_nv) + + /* + * NOTE: If atomic_and_64 and atomic_and_64_nv are ever + * separated, it is important to edit the libc i386 platform + * specific mapfile and remove the NODYNSORT attribute + * from atomic_and_64_nv. + */ + ENTRY(atomic_and_64) + ALTENTRY(atomic_and_64_nv) + pushl %edi + pushl %ebx + movl 12(%esp), %edi + movl (%edi), %eax + movl 4(%edi), %edx +1: + movl 16(%esp), %ebx + movl 20(%esp), %ecx + andl %eax, %ebx + andl %edx, %ecx + lock + cmpxchg8b (%edi) + jne 1b + movl %ebx, %eax + movl %ecx, %edx + popl %ebx + popl %edi + ret + SET_SIZE(atomic_and_64_nv) + SET_SIZE(atomic_and_64) + + ENTRY(atomic_cas_8) + ALTENTRY(atomic_cas_uchar) + movl 4(%esp), %edx + movzbl 8(%esp), %eax + movb 12(%esp), %cl + lock + cmpxchgb %cl, (%edx) + ret + SET_SIZE(atomic_cas_uchar) + SET_SIZE(atomic_cas_8) + + ENTRY(atomic_cas_16) + ALTENTRY(atomic_cas_ushort) + movl 4(%esp), %edx + movzwl 8(%esp), %eax + movw 12(%esp), %cx + lock + cmpxchgw %cx, (%edx) + ret + SET_SIZE(atomic_cas_ushort) + SET_SIZE(atomic_cas_16) + + ENTRY(atomic_cas_32) + ALTENTRY(atomic_cas_uint) + ALTENTRY(atomic_cas_ulong) + ALTENTRY(atomic_cas_ptr) + movl 4(%esp), %edx + movl 8(%esp), %eax + movl 12(%esp), %ecx + lock + cmpxchgl %ecx, (%edx) + ret + SET_SIZE(atomic_cas_ptr) + SET_SIZE(atomic_cas_ulong) + SET_SIZE(atomic_cas_uint) + SET_SIZE(atomic_cas_32) + + ENTRY(atomic_cas_64) + pushl %ebx + pushl %esi + movl 12(%esp), %esi + movl 16(%esp), %eax + movl 20(%esp), %edx + movl 24(%esp), %ebx + movl 28(%esp), %ecx + lock + cmpxchg8b (%esi) + popl %esi + popl %ebx + ret + SET_SIZE(atomic_cas_64) + + ENTRY(atomic_swap_8) + ALTENTRY(atomic_swap_uchar) + movl 4(%esp), %edx + movzbl 8(%esp), %eax + lock + xchgb %al, (%edx) + ret + SET_SIZE(atomic_swap_uchar) + SET_SIZE(atomic_swap_8) + + ENTRY(atomic_swap_16) + ALTENTRY(atomic_swap_ushort) + movl 4(%esp), %edx + movzwl 8(%esp), %eax + lock + xchgw %ax, (%edx) + ret + SET_SIZE(atomic_swap_ushort) + SET_SIZE(atomic_swap_16) + + ENTRY(atomic_swap_32) + ALTENTRY(atomic_swap_uint) + ALTENTRY(atomic_swap_ptr) + ALTENTRY(atomic_swap_ulong) + movl 4(%esp), %edx + movl 8(%esp), %eax + lock + xchgl %eax, (%edx) + ret + SET_SIZE(atomic_swap_ulong) + SET_SIZE(atomic_swap_ptr) + SET_SIZE(atomic_swap_uint) + SET_SIZE(atomic_swap_32) + + ENTRY(atomic_swap_64) + pushl %esi + pushl 
%ebx + movl 12(%esp), %esi + movl 16(%esp), %ebx + movl 20(%esp), %ecx + movl (%esi), %eax + movl 4(%esi), %edx +1: + lock + cmpxchg8b (%esi) + jne 1b + popl %ebx + popl %esi + ret + SET_SIZE(atomic_swap_64) + + ENTRY(atomic_set_long_excl) + movl 4(%esp), %edx + movl 8(%esp), %ecx + xorl %eax, %eax + lock + btsl %ecx, (%edx) + jnc 1f + decl %eax +1: + ret + SET_SIZE(atomic_set_long_excl) + + ENTRY(atomic_clear_long_excl) + movl 4(%esp), %edx + movl 8(%esp), %ecx + xorl %eax, %eax + lock + btrl %ecx, (%edx) + jc 1f + decl %eax +1: + ret + SET_SIZE(atomic_clear_long_excl) + + /* + * NOTE: membar_enter, membar_exit, membar_producer, and + * membar_consumer are all identical routines. We define them + * separately, instead of using ALTENTRY definitions to alias them + * together, so that DTrace and debuggers will see a unique address + * for them, allowing more accurate tracing. + */ + + + ENTRY(membar_enter) + lock + xorl $0, (%esp) + ret + SET_SIZE(membar_enter) + + ENTRY(membar_exit) + lock + xorl $0, (%esp) + ret + SET_SIZE(membar_exit) + + ENTRY(membar_producer) + lock + xorl $0, (%esp) + ret + SET_SIZE(membar_producer) + + ENTRY(membar_consumer) + lock + xorl $0, (%esp) + ret + SET_SIZE(membar_consumer) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/lib/libspl/asm-x86_64/atomic.S b/lib/libspl/asm-x86_64/atomic.S new file mode 100644 index 000000000000..50edfe878cc9 --- /dev/null +++ b/lib/libspl/asm-x86_64/atomic.S @@ -0,0 +1,691 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + .ident "%Z%%M% %I% %E% SMI" + + .file "%M%" + +#define _ASM +#ifdef __linux__ +#include +#elif __FreeBSD__ +#include +#define SET_SIZE(x) +#endif + ENTRY(atomic_inc_8) + ALTENTRY(atomic_inc_uchar) + lock + incb (%rdi) + ret + SET_SIZE(atomic_inc_uchar) + SET_SIZE(atomic_inc_8) + + ENTRY(atomic_inc_16) + ALTENTRY(atomic_inc_ushort) + lock + incw (%rdi) + ret + SET_SIZE(atomic_inc_ushort) + SET_SIZE(atomic_inc_16) + + ENTRY(atomic_inc_32) + ALTENTRY(atomic_inc_uint) + lock + incl (%rdi) + ret + SET_SIZE(atomic_inc_uint) + SET_SIZE(atomic_inc_32) + + ENTRY(atomic_inc_64) + ALTENTRY(atomic_inc_ulong) + lock + incq (%rdi) + ret + SET_SIZE(atomic_inc_ulong) + SET_SIZE(atomic_inc_64) + + ENTRY(atomic_inc_8_nv) + ALTENTRY(atomic_inc_uchar_nv) + movb (%rdi), %al +1: + leaq 1(%rax), %rcx + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_inc_uchar_nv) + SET_SIZE(atomic_inc_8_nv) + + ENTRY(atomic_inc_16_nv) + ALTENTRY(atomic_inc_ushort_nv) + movw (%rdi), %ax +1: + leaq 1(%rax), %rcx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_inc_ushort_nv) + SET_SIZE(atomic_inc_16_nv) + + ENTRY(atomic_inc_32_nv) + ALTENTRY(atomic_inc_uint_nv) + movl (%rdi), %eax +1: + leaq 1(%rax), %rcx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_inc_uint_nv) + SET_SIZE(atomic_inc_32_nv) + + ENTRY(atomic_inc_64_nv) + ALTENTRY(atomic_inc_ulong_nv) + movq (%rdi), %rax +1: + leaq 1(%rax), %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_inc_ulong_nv) + SET_SIZE(atomic_inc_64_nv) + + ENTRY(atomic_dec_8) + ALTENTRY(atomic_dec_uchar) + lock + decb (%rdi) + ret + SET_SIZE(atomic_dec_uchar) + SET_SIZE(atomic_dec_8) + + ENTRY(atomic_dec_16) + ALTENTRY(atomic_dec_ushort) + lock + decw (%rdi) + ret + SET_SIZE(atomic_dec_ushort) + SET_SIZE(atomic_dec_16) + + ENTRY(atomic_dec_32) + ALTENTRY(atomic_dec_uint) + lock + decl (%rdi) + ret + SET_SIZE(atomic_dec_uint) + SET_SIZE(atomic_dec_32) + + ENTRY(atomic_dec_64) + ALTENTRY(atomic_dec_ulong) + lock + decq (%rdi) + ret + SET_SIZE(atomic_dec_ulong) + SET_SIZE(atomic_dec_64) + + ENTRY(atomic_dec_8_nv) + ALTENTRY(atomic_dec_uchar_nv) + movb (%rdi), %al +1: + leaq -1(%rax), %rcx + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_dec_uchar_nv) + SET_SIZE(atomic_dec_8_nv) + + ENTRY(atomic_dec_16_nv) + ALTENTRY(atomic_dec_ushort_nv) + movw (%rdi), %ax +1: + leaq -1(%rax), %rcx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_dec_ushort_nv) + SET_SIZE(atomic_dec_16_nv) + + ENTRY(atomic_dec_32_nv) + ALTENTRY(atomic_dec_uint_nv) + movl (%rdi), %eax +1: + leaq -1(%rax), %rcx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_dec_uint_nv) + SET_SIZE(atomic_dec_32_nv) + + ENTRY(atomic_dec_64_nv) + ALTENTRY(atomic_dec_ulong_nv) + movq (%rdi), %rax +1: + leaq -1(%rax), %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_dec_ulong_nv) + SET_SIZE(atomic_dec_64_nv) + + ENTRY(atomic_add_8) + ALTENTRY(atomic_add_char) + lock + addb %sil, (%rdi) + ret + SET_SIZE(atomic_add_char) + SET_SIZE(atomic_add_8) + + ENTRY(atomic_add_16) + ALTENTRY(atomic_add_short) + lock + addw %si, (%rdi) + ret + SET_SIZE(atomic_add_short) + SET_SIZE(atomic_add_16) + + ENTRY(atomic_add_32) + ALTENTRY(atomic_add_int) + lock + addl %esi, (%rdi) + ret + SET_SIZE(atomic_add_int) + SET_SIZE(atomic_add_32) + + ENTRY(atomic_add_64) + ALTENTRY(atomic_add_ptr) + 
ALTENTRY(atomic_add_long) + lock + addq %rsi, (%rdi) + ret + SET_SIZE(atomic_add_long) + SET_SIZE(atomic_add_ptr) + SET_SIZE(atomic_add_64) + + ENTRY(atomic_sub_8) + ALTENTRY(atomic_sub_char) + lock + subb %sil, (%rdi) + ret + SET_SIZE(atomic_sub_char) + SET_SIZE(atomic_sub_8) + + ENTRY(atomic_sub_16) + ALTENTRY(atomic_sub_short) + lock + subw %si, (%rdi) + ret + SET_SIZE(atomic_sub_short) + SET_SIZE(atomic_sub_16) + + ENTRY(atomic_sub_32) + ALTENTRY(atomic_sub_int) + lock + subl %esi, (%rdi) + ret + SET_SIZE(atomic_sub_int) + SET_SIZE(atomic_sub_32) + + ENTRY(atomic_sub_64) + ALTENTRY(atomic_sub_ptr) + ALTENTRY(atomic_sub_long) + lock + subq %rsi, (%rdi) + ret + SET_SIZE(atomic_sub_long) + SET_SIZE(atomic_sub_ptr) + SET_SIZE(atomic_sub_64) + + ENTRY(atomic_or_8) + ALTENTRY(atomic_or_uchar) + lock + orb %sil, (%rdi) + ret + SET_SIZE(atomic_or_uchar) + SET_SIZE(atomic_or_8) + + ENTRY(atomic_or_16) + ALTENTRY(atomic_or_ushort) + lock + orw %si, (%rdi) + ret + SET_SIZE(atomic_or_ushort) + SET_SIZE(atomic_or_16) + + ENTRY(atomic_or_32) + ALTENTRY(atomic_or_uint) + lock + orl %esi, (%rdi) + ret + SET_SIZE(atomic_or_uint) + SET_SIZE(atomic_or_32) + + ENTRY(atomic_or_64) + ALTENTRY(atomic_or_ulong) + lock + orq %rsi, (%rdi) + ret + SET_SIZE(atomic_or_ulong) + SET_SIZE(atomic_or_64) + + ENTRY(atomic_and_8) + ALTENTRY(atomic_and_uchar) + lock + andb %sil, (%rdi) + ret + SET_SIZE(atomic_and_uchar) + SET_SIZE(atomic_and_8) + + ENTRY(atomic_and_16) + ALTENTRY(atomic_and_ushort) + lock + andw %si, (%rdi) + ret + SET_SIZE(atomic_and_ushort) + SET_SIZE(atomic_and_16) + + ENTRY(atomic_and_32) + ALTENTRY(atomic_and_uint) + lock + andl %esi, (%rdi) + ret + SET_SIZE(atomic_and_uint) + SET_SIZE(atomic_and_32) + + ENTRY(atomic_and_64) + ALTENTRY(atomic_and_ulong) + lock + andq %rsi, (%rdi) + ret + SET_SIZE(atomic_and_ulong) + SET_SIZE(atomic_and_64) + + ENTRY(atomic_add_8_nv) + ALTENTRY(atomic_add_char_nv) + movb (%rdi), %al +1: + movb %sil, %cl + addb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_add_char_nv) + SET_SIZE(atomic_add_8_nv) + + ENTRY(atomic_add_16_nv) + ALTENTRY(atomic_add_short_nv) + movw (%rdi), %ax +1: + movw %si, %cx + addw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_add_short_nv) + SET_SIZE(atomic_add_16_nv) + + ENTRY(atomic_add_32_nv) + ALTENTRY(atomic_add_int_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + addl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_add_int_nv) + SET_SIZE(atomic_add_32_nv) + + ENTRY(atomic_add_64_nv) + ALTENTRY(atomic_add_ptr_nv) + ALTENTRY(atomic_add_long_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + addq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_add_long_nv) + SET_SIZE(atomic_add_ptr_nv) + SET_SIZE(atomic_add_64_nv) + + ENTRY(atomic_sub_8_nv) + ALTENTRY(atomic_sub_char_nv) + movb (%rdi), %al +1: + movb %sil, %cl + subb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_sub_char_nv) + SET_SIZE(atomic_sub_8_nv) + + ENTRY(atomic_sub_16_nv) + ALTENTRY(atomic_sub_short_nv) + movw (%rdi), %ax +1: + movw %si, %cx + subw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_sub_short_nv) + SET_SIZE(atomic_sub_16_nv) + + ENTRY(atomic_sub_32_nv) + ALTENTRY(atomic_sub_int_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + subl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + 
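+	/*
+	 * Note: the SysV AMD64 ABI passes the target in %rdi and the
+	 * delta in %rsi, which is why these loops need none of the
+	 * stack loads seen in the i386 versions.
+	 */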
SET_SIZE(atomic_sub_int_nv) + SET_SIZE(atomic_sub_32_nv) + + ENTRY(atomic_sub_64_nv) + ALTENTRY(atomic_sub_ptr_nv) + ALTENTRY(atomic_sub_long_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + subq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_sub_long_nv) + SET_SIZE(atomic_sub_ptr_nv) + SET_SIZE(atomic_sub_64_nv) + + ENTRY(atomic_and_8_nv) + ALTENTRY(atomic_and_uchar_nv) + movb (%rdi), %al +1: + movb %sil, %cl + andb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_and_uchar_nv) + SET_SIZE(atomic_and_8_nv) + + ENTRY(atomic_and_16_nv) + ALTENTRY(atomic_and_ushort_nv) + movw (%rdi), %ax +1: + movw %si, %cx + andw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_and_ushort_nv) + SET_SIZE(atomic_and_16_nv) + + ENTRY(atomic_and_32_nv) + ALTENTRY(atomic_and_uint_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + andl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_and_uint_nv) + SET_SIZE(atomic_and_32_nv) + + ENTRY(atomic_and_64_nv) + ALTENTRY(atomic_and_ulong_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + andq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_and_ulong_nv) + SET_SIZE(atomic_and_64_nv) + + ENTRY(atomic_or_8_nv) + ALTENTRY(atomic_or_uchar_nv) + movb (%rdi), %al +1: + movb %sil, %cl + orb %al, %cl + lock + cmpxchgb %cl, (%rdi) + jne 1b + movzbl %cl, %eax + ret + SET_SIZE(atomic_and_uchar_nv) + SET_SIZE(atomic_and_8_nv) + + ENTRY(atomic_or_16_nv) + ALTENTRY(atomic_or_ushort_nv) + movw (%rdi), %ax +1: + movw %si, %cx + orw %ax, %cx + lock + cmpxchgw %cx, (%rdi) + jne 1b + movzwl %cx, %eax + ret + SET_SIZE(atomic_or_ushort_nv) + SET_SIZE(atomic_or_16_nv) + + ENTRY(atomic_or_32_nv) + ALTENTRY(atomic_or_uint_nv) + movl (%rdi), %eax +1: + movl %esi, %ecx + orl %eax, %ecx + lock + cmpxchgl %ecx, (%rdi) + jne 1b + movl %ecx, %eax + ret + SET_SIZE(atomic_or_uint_nv) + SET_SIZE(atomic_or_32_nv) + + ENTRY(atomic_or_64_nv) + ALTENTRY(atomic_or_ulong_nv) + movq (%rdi), %rax +1: + movq %rsi, %rcx + orq %rax, %rcx + lock + cmpxchgq %rcx, (%rdi) + jne 1b + movq %rcx, %rax + ret + SET_SIZE(atomic_or_ulong_nv) + SET_SIZE(atomic_or_64_nv) + + ENTRY(atomic_cas_8) + ALTENTRY(atomic_cas_uchar) + movzbl %sil, %eax + lock + cmpxchgb %dl, (%rdi) + ret + SET_SIZE(atomic_cas_uchar) + SET_SIZE(atomic_cas_8) + + ENTRY(atomic_cas_16) + ALTENTRY(atomic_cas_ushort) + movzwl %si, %eax + lock + cmpxchgw %dx, (%rdi) + ret + SET_SIZE(atomic_cas_ushort) + SET_SIZE(atomic_cas_16) + + ENTRY(atomic_cas_32) + ALTENTRY(atomic_cas_uint) + movl %esi, %eax + lock + cmpxchgl %edx, (%rdi) + ret + SET_SIZE(atomic_cas_uint) + SET_SIZE(atomic_cas_32) + + ENTRY(atomic_cas_64) + ALTENTRY(atomic_cas_ulong) + ALTENTRY(atomic_cas_ptr) + movq %rsi, %rax + lock + cmpxchgq %rdx, (%rdi) + ret + SET_SIZE(atomic_cas_ptr) + SET_SIZE(atomic_cas_ulong) + SET_SIZE(atomic_cas_64) + + ENTRY(atomic_swap_8) + ALTENTRY(atomic_swap_uchar) + movzbl %sil, %eax + lock + xchgb %al, (%rdi) + ret + SET_SIZE(atomic_swap_uchar) + SET_SIZE(atomic_swap_8) + + ENTRY(atomic_swap_16) + ALTENTRY(atomic_swap_ushort) + movzwl %si, %eax + lock + xchgw %ax, (%rdi) + ret + SET_SIZE(atomic_swap_ushort) + SET_SIZE(atomic_swap_16) + + ENTRY(atomic_swap_32) + ALTENTRY(atomic_swap_uint) + movl %esi, %eax + lock + xchgl %eax, (%rdi) + ret + SET_SIZE(atomic_swap_uint) + SET_SIZE(atomic_swap_32) + + ENTRY(atomic_swap_64) + ALTENTRY(atomic_swap_ulong) + 
+	ALTENTRY(atomic_swap_ptr)
+	movq	%rsi, %rax
+	lock
+	xchgq	%rax, (%rdi)
+	ret
+	SET_SIZE(atomic_swap_ptr)
+	SET_SIZE(atomic_swap_ulong)
+	SET_SIZE(atomic_swap_64)
+
+	ENTRY(atomic_set_long_excl)
+	xorl	%eax, %eax
+	lock
+	btsq	%rsi, (%rdi)
+	jnc	1f
+	decl	%eax
+1:
+	ret
+	SET_SIZE(atomic_set_long_excl)
+
+	ENTRY(atomic_clear_long_excl)
+	xorl	%eax, %eax
+	lock
+	btrq	%rsi, (%rdi)
+	jc	1f
+	decl	%eax
+1:
+	ret
+	SET_SIZE(atomic_clear_long_excl)
+
+	/*
+	 * NOTE: membar_enter and membar_exit are identical routines.
+	 * We define them separately, instead of using ALTENTRY
+	 * definitions to alias them together, so that DTrace and
+	 * debuggers will see a unique address for them, allowing
+	 * more accurate tracing.
+	 */
+
+	ENTRY(membar_enter)
+	mfence
+	ret
+	SET_SIZE(membar_enter)
+
+	ENTRY(membar_exit)
+	mfence
+	ret
+	SET_SIZE(membar_exit)
+
+	ENTRY(membar_producer)
+	sfence
+	ret
+	SET_SIZE(membar_producer)
+
+	ENTRY(membar_consumer)
+	lfence
+	ret
+	SET_SIZE(membar_consumer)
+
+#ifdef __ELF__
+.section .note.GNU-stack,"",%progbits
+#endif
diff --git a/lib/libspl/assert.c b/lib/libspl/assert.c
new file mode 100644
index 000000000000..94290ae13e09
--- /dev/null
+++ b/lib/libspl/assert.c
@@ -0,0 +1,46 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <assert.h>
+
+int aok = 0;
+
+/* printf version of libspl_assert */
+void
+libspl_assertf(const char *file, const char *func, int line,
+    const char *format, ...)
+{
+	va_list args;
+
+	va_start(args, format);
+	vfprintf(stderr, format, args);
+	fprintf(stderr, "\n");
+	fprintf(stderr, "ASSERT at %s:%d:%s()", file, line, func);
+	va_end(args);
+	if (aok) {
+		return;
+	}
+	abort();
+}
diff --git a/lib/libspl/include/Makefile.am b/lib/libspl/include/Makefile.am
new file mode 100644
index 000000000000..9ca08b2bc0f7
--- /dev/null
+++ b/lib/libspl/include/Makefile.am
@@ -0,0 +1,22 @@
+SUBDIRS = ia32 rpc sys util os
+
+libspldir = $(includedir)/libspl
+libspl_HEADERS = \
+	assert.h \
+	atomic.h \
+	libdevinfo.h \
+	libgen.h \
+	libshare.h \
+	limits.h \
+	locale.h \
+	statcommon.h \
+	stdio.h \
+	stdlib.h \
+	string.h \
+	stropts.h \
+	thread.h \
+	tzfile.h \
+	ucred.h \
+	umem.h \
+	unistd.h \
+	zone.h
diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h
new file mode 100644
index 000000000000..0503ce492181
--- /dev/null
+++ b/lib/libspl/include/assert.h
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include_next <assert.h>
+
+#ifndef _LIBSPL_ASSERT_H
+#define _LIBSPL_ASSERT_H
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+
+/* Set to non-zero to avoid abort()ing on an assertion failure */
+extern int aok;
+
+/* printf version of libspl_assert */
+extern void libspl_assertf(const char *file, const char *func, int line,
+    const char *format, ...);
+
+static inline int
+libspl_assert(const char *buf, const char *file, const char *func, int line)
+{
+	libspl_assertf(file, func, line, "%s", buf);
+	return (0);
+}
+
+#ifdef verify
+#undef verify
+#endif
+
+#define	VERIFY(cond)							\
+	(void) ((!(cond)) &&						\
+	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
+#define	verify(cond)							\
+	(void) ((!(cond)) &&						\
+	    libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__))
+
+#define	VERIFY3B(LEFT, OP, RIGHT)					\
+do {									\
+	const boolean_t __left = (boolean_t)(LEFT);			\
+	const boolean_t __right = (boolean_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx)", #LEFT, #OP, #RIGHT,	\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right);	\
+} while (0)
+
+#define	VERIFY3S(LEFT, OP, RIGHT)					\
+do {									\
+	const int64_t __left = (int64_t)(LEFT);				\
+	const int64_t __right = (int64_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx)", #LEFT, #OP, #RIGHT,	\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right);	\
+} while (0)
+
+#define	VERIFY3U(LEFT, OP, RIGHT)					\
+do {									\
+	const uint64_t __left = (uint64_t)(LEFT);			\
+	const uint64_t __right = (uint64_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx)", #LEFT, #OP, #RIGHT,	\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right);	\
+} while (0)
+
+#define	VERIFY3P(LEFT, OP, RIGHT)					\
+do {									\
+	const uintptr_t __left = (uintptr_t)(LEFT);			\
+	const uintptr_t __right = (uintptr_t)(RIGHT);			\
+	if (!(__left OP __right))					\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s %s %s (0x%llx %s 0x%llx)", #LEFT, #OP, #RIGHT,	\
+		    (u_longlong_t)__left, #OP, (u_longlong_t)__right);	\
+} while (0)
+
+#define	VERIFY0(LEFT)							\
+do {									\
+	const uint64_t __left = (uint64_t)(LEFT);			\
+	if (!(__left == 0))						\
+		libspl_assertf(__FILE__, __FUNCTION__, __LINE__,	\
+		    "%s == 0 (0x%llx == 0)", #LEFT,			\
+		    (u_longlong_t)__left);				\
+} while (0)
+
+#ifdef assert
+#undef assert
+#endif
+
+/* Compile time assert */
+#define	CTASSERT_GLOBAL(x)	_CTASSERT(x, __LINE__)
+#define	CTASSERT(x)		{ _CTASSERT(x, __LINE__); }
+#define	_CTASSERT(x, y)		__CTASSERT(x, y)
+#define	__CTASSERT(x, y)					\
+	typedef char __attribute__((unused))			\
+	__compile_time_assertion__ ## y[(x) ? 1 : -1]
+
+#ifdef NDEBUG
+#define	ASSERT3B(x, y, z)	((void)0)
+#define	ASSERT3S(x, y, z)	((void)0)
+#define	ASSERT3U(x, y, z)	((void)0)
+#define	ASSERT3P(x, y, z)	((void)0)
+#define	ASSERT0(x)		((void)0)
+#define	ASSERT(x)		((void)0)
+#define	assert(x)		((void)0)
+#define	IMPLY(A, B)		((void)0)
+#define	EQUIV(A, B)		((void)0)
+#else
+#define	ASSERT3B	VERIFY3B
+#define	ASSERT3S	VERIFY3S
+#define	ASSERT3U	VERIFY3U
+#define	ASSERT3P	VERIFY3P
+#define	ASSERT0		VERIFY0
+#define	ASSERT		VERIFY
+#define	assert		VERIFY
+#define	IMPLY(A, B)						\
+	((void)(((!(A)) || (B)) ||				\
+	    libspl_assert("(" #A ") implies (" #B ")",		\
+	    __FILE__, __FUNCTION__, __LINE__)))
+#define	EQUIV(A, B)						\
+	((void)((!!(A) == !!(B)) ||				\
+	    libspl_assert("(" #A ") is equivalent to (" #B ")",	\
+	    __FILE__, __FUNCTION__, __LINE__)))
+
+#endif /* NDEBUG */
+
+#endif /* _LIBSPL_ASSERT_H */
diff --git a/lib/libspl/include/atomic.h b/lib/libspl/include/atomic.h
new file mode 100644
index 000000000000..f8c257f9696b
--- /dev/null
+++ b/lib/libspl/include/atomic.h
@@ -0,0 +1,296 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_ATOMIC_H
+#define _SYS_ATOMIC_H
+
+#include <sys/types.h>
+#include <inttypes.h>
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#if defined(__STDC__)
+/*
+ * Increment target.
+ */ +extern void atomic_inc_8(volatile uint8_t *); +extern void atomic_inc_uchar(volatile uchar_t *); +extern void atomic_inc_16(volatile uint16_t *); +extern void atomic_inc_ushort(volatile ushort_t *); +extern void atomic_inc_32(volatile uint32_t *); +extern void atomic_inc_uint(volatile uint_t *); +extern void atomic_inc_ulong(volatile ulong_t *); +#if defined(_INT64_TYPE) +extern void atomic_inc_64(volatile uint64_t *); +#endif + +/* + * Decrement target + */ +extern void atomic_dec_8(volatile uint8_t *); +extern void atomic_dec_uchar(volatile uchar_t *); +extern void atomic_dec_16(volatile uint16_t *); +extern void atomic_dec_ushort(volatile ushort_t *); +extern void atomic_dec_32(volatile uint32_t *); +extern void atomic_dec_uint(volatile uint_t *); +extern void atomic_dec_ulong(volatile ulong_t *); +#if defined(_INT64_TYPE) +extern void atomic_dec_64(volatile uint64_t *); +#endif + +/* + * Add delta to target + */ +extern void atomic_add_8(volatile uint8_t *, int8_t); +extern void atomic_add_char(volatile uchar_t *, signed char); +extern void atomic_add_16(volatile uint16_t *, int16_t); +extern void atomic_add_short(volatile ushort_t *, short); +extern void atomic_add_32(volatile uint32_t *, int32_t); +extern void atomic_add_int(volatile uint_t *, int); +extern void atomic_add_ptr(volatile void *, ssize_t); +extern void atomic_add_long(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern void atomic_add_64(volatile uint64_t *, int64_t); +#endif + +/* + * Subtract delta from target + */ +extern void atomic_sub_8(volatile uint8_t *, int8_t); +extern void atomic_sub_char(volatile uchar_t *, signed char); +extern void atomic_sub_16(volatile uint16_t *, int16_t); +extern void atomic_sub_short(volatile ushort_t *, short); +extern void atomic_sub_32(volatile uint32_t *, int32_t); +extern void atomic_sub_int(volatile uint_t *, int); +extern void atomic_sub_ptr(volatile void *, ssize_t); +extern void atomic_sub_long(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern void atomic_sub_64(volatile uint64_t *, int64_t); +#endif + +/* + * logical OR bits with target + */ +extern void atomic_or_8(volatile uint8_t *, uint8_t); +extern void atomic_or_uchar(volatile uchar_t *, uchar_t); +extern void atomic_or_16(volatile uint16_t *, uint16_t); +extern void atomic_or_ushort(volatile ushort_t *, ushort_t); +extern void atomic_or_32(volatile uint32_t *, uint32_t); +extern void atomic_or_uint(volatile uint_t *, uint_t); +extern void atomic_or_ulong(volatile ulong_t *, ulong_t); +#if defined(_INT64_TYPE) +extern void atomic_or_64(volatile uint64_t *, uint64_t); +#endif + +/* + * logical AND bits with target + */ +extern void atomic_and_8(volatile uint8_t *, uint8_t); +extern void atomic_and_uchar(volatile uchar_t *, uchar_t); +extern void atomic_and_16(volatile uint16_t *, uint16_t); +extern void atomic_and_ushort(volatile ushort_t *, ushort_t); +extern void atomic_and_32(volatile uint32_t *, uint32_t); +extern void atomic_and_uint(volatile uint_t *, uint_t); +extern void atomic_and_ulong(volatile ulong_t *, ulong_t); +#if defined(_INT64_TYPE) +extern void atomic_and_64(volatile uint64_t *, uint64_t); +#endif + +/* + * As above, but return the new value. Note that these _nv() variants are + * substantially more expensive on some platforms than the no-return-value + * versions above, so don't use them unless you really need to know the + * new value *atomically* (e.g. when decrementing a reference count and + * checking whether it went to zero). 
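+ *
+ * For illustration only (not part of the import), a minimal sketch of the
+ * reference-count case mentioned above, where the returned value decides
+ * who frees the object ("obj", "ref_count" and "obj_free" are hypothetical
+ * names):
+ *
+ *	if (atomic_dec_64_nv(&obj->ref_count) == 0)
+ *		obj_free(obj);
+ *
+ * A counter that is updated but never read back at the update site should
+ * prefer the cheaper atomic_dec_64(&obj->ref_count) instead.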
+ */ + +/* + * Increment target and return new value. + */ +extern uint8_t atomic_inc_8_nv(volatile uint8_t *); +extern uchar_t atomic_inc_uchar_nv(volatile uchar_t *); +extern uint16_t atomic_inc_16_nv(volatile uint16_t *); +extern ushort_t atomic_inc_ushort_nv(volatile ushort_t *); +extern uint32_t atomic_inc_32_nv(volatile uint32_t *); +extern uint_t atomic_inc_uint_nv(volatile uint_t *); +extern ulong_t atomic_inc_ulong_nv(volatile ulong_t *); +#if defined(_INT64_TYPE) +extern uint64_t atomic_inc_64_nv(volatile uint64_t *); +#endif + +/* + * Decrement target and return new value. + */ +extern uint8_t atomic_dec_8_nv(volatile uint8_t *); +extern uchar_t atomic_dec_uchar_nv(volatile uchar_t *); +extern uint16_t atomic_dec_16_nv(volatile uint16_t *); +extern ushort_t atomic_dec_ushort_nv(volatile ushort_t *); +extern uint32_t atomic_dec_32_nv(volatile uint32_t *); +extern uint_t atomic_dec_uint_nv(volatile uint_t *); +extern ulong_t atomic_dec_ulong_nv(volatile ulong_t *); +#if defined(_INT64_TYPE) +extern uint64_t atomic_dec_64_nv(volatile uint64_t *); +#endif + +/* + * Add delta to target + */ +extern uint8_t atomic_add_8_nv(volatile uint8_t *, int8_t); +extern uchar_t atomic_add_char_nv(volatile uchar_t *, signed char); +extern uint16_t atomic_add_16_nv(volatile uint16_t *, int16_t); +extern ushort_t atomic_add_short_nv(volatile ushort_t *, short); +extern uint32_t atomic_add_32_nv(volatile uint32_t *, int32_t); +extern uint_t atomic_add_int_nv(volatile uint_t *, int); +extern void *atomic_add_ptr_nv(volatile void *, ssize_t); +extern ulong_t atomic_add_long_nv(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern uint64_t atomic_add_64_nv(volatile uint64_t *, int64_t); +#endif + +/* + * Subtract delta from target + */ +extern uint8_t atomic_sub_8_nv(volatile uint8_t *, int8_t); +extern uchar_t atomic_sub_char_nv(volatile uchar_t *, signed char); +extern uint16_t atomic_sub_16_nv(volatile uint16_t *, int16_t); +extern ushort_t atomic_sub_short_nv(volatile ushort_t *, short); +extern uint32_t atomic_sub_32_nv(volatile uint32_t *, int32_t); +extern uint_t atomic_sub_int_nv(volatile uint_t *, int); +extern void *atomic_sub_ptr_nv(volatile void *, ssize_t); +extern ulong_t atomic_sub_long_nv(volatile ulong_t *, long); +#if defined(_INT64_TYPE) +extern uint64_t atomic_sub_64_nv(volatile uint64_t *, int64_t); +#endif + +/* + * logical OR bits with target and return new value. + */ +extern uint8_t atomic_or_8_nv(volatile uint8_t *, uint8_t); +extern uchar_t atomic_or_uchar_nv(volatile uchar_t *, uchar_t); +extern uint16_t atomic_or_16_nv(volatile uint16_t *, uint16_t); +extern ushort_t atomic_or_ushort_nv(volatile ushort_t *, ushort_t); +extern uint32_t atomic_or_32_nv(volatile uint32_t *, uint32_t); +extern uint_t atomic_or_uint_nv(volatile uint_t *, uint_t); +extern ulong_t atomic_or_ulong_nv(volatile ulong_t *, ulong_t); +#if defined(_INT64_TYPE) +extern uint64_t atomic_or_64_nv(volatile uint64_t *, uint64_t); +#endif + +/* + * logical AND bits with target and return new value. 
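+ *
+ * For illustration only (not part of the import): because the _nv form
+ * returns the post-update value, atomic_and_32_nv() can clear one flag and
+ * atomically observe whether any flags remain set. A minimal sketch, with
+ * "flags", FLAG_BUSY and wakeup_waiters() as hypothetical names:
+ *
+ *	if (atomic_and_32_nv(&flags, ~FLAG_BUSY) == 0)
+ *		wakeup_waiters();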
+ */ +extern uint8_t atomic_and_8_nv(volatile uint8_t *, uint8_t); +extern uchar_t atomic_and_uchar_nv(volatile uchar_t *, uchar_t); +extern uint16_t atomic_and_16_nv(volatile uint16_t *, uint16_t); +extern ushort_t atomic_and_ushort_nv(volatile ushort_t *, ushort_t); +extern uint32_t atomic_and_32_nv(volatile uint32_t *, uint32_t); +extern uint_t atomic_and_uint_nv(volatile uint_t *, uint_t); +extern ulong_t atomic_and_ulong_nv(volatile ulong_t *, ulong_t); +#if defined(_INT64_TYPE) +extern uint64_t atomic_and_64_nv(volatile uint64_t *, uint64_t); +#endif + +/* + * If *arg1 == arg2, set *arg1 = arg3; return old value + */ +extern uint8_t atomic_cas_8(volatile uint8_t *, uint8_t, uint8_t); +extern uchar_t atomic_cas_uchar(volatile uchar_t *, uchar_t, uchar_t); +extern uint16_t atomic_cas_16(volatile uint16_t *, uint16_t, uint16_t); +extern ushort_t atomic_cas_ushort(volatile ushort_t *, ushort_t, ushort_t); +extern uint32_t atomic_cas_32(volatile uint32_t *, uint32_t, uint32_t); +extern uint_t atomic_cas_uint(volatile uint_t *, uint_t, uint_t); +extern void *atomic_cas_ptr(volatile void *, void *, void *); +extern ulong_t atomic_cas_ulong(volatile ulong_t *, ulong_t, ulong_t); +#if defined(_INT64_TYPE) +extern uint64_t atomic_cas_64(volatile uint64_t *, uint64_t, uint64_t); +#endif + +/* + * Swap target and return old value + */ +extern uint8_t atomic_swap_8(volatile uint8_t *, uint8_t); +extern uchar_t atomic_swap_uchar(volatile uchar_t *, uchar_t); +extern uint16_t atomic_swap_16(volatile uint16_t *, uint16_t); +extern ushort_t atomic_swap_ushort(volatile ushort_t *, ushort_t); +extern uint32_t atomic_swap_32(volatile uint32_t *, uint32_t); +extern uint_t atomic_swap_uint(volatile uint_t *, uint_t); +extern void *atomic_swap_ptr(volatile void *, void *); +extern ulong_t atomic_swap_ulong(volatile ulong_t *, ulong_t); +#if defined(_INT64_TYPE) +extern uint64_t atomic_swap_64(volatile uint64_t *, uint64_t); +#endif + +/* + * Perform an exclusive atomic bit set/clear on a target. + * Returns 0 if bit was successfully set/cleared, or -1 + * if the bit was already set/cleared. + */ +extern int atomic_set_long_excl(volatile ulong_t *, uint_t); +extern int atomic_clear_long_excl(volatile ulong_t *, uint_t); + +/* + * Generic memory barrier used during lock entry, placed after the + * memory operation that acquires the lock to guarantee that the lock + * protects its data. No stores from after the memory barrier will + * reach visibility, and no loads from after the barrier will be + * resolved, before the lock acquisition reaches global visibility. + */ +extern void membar_enter(void); + +/* + * Generic memory barrier used during lock exit, placed before the + * memory operation that releases the lock to guarantee that the lock + * protects its data. All loads and stores issued before the barrier + * will be resolved before the subsequent lock update reaches visibility. + */ +extern void membar_exit(void); + +/* + * Arrange that all stores issued before this point in the code reach + * global visibility before any stores that follow; useful in producer + * modules that update a data item, then set a flag that it is available. + * The memory barrier guarantees that the available flag is not visible + * earlier than the updated data, i.e. it imposes store ordering. 
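+ *
+ * For illustration only (not part of the import), the producer side of the
+ * hand-off described above might look like this, with "shared", "data" and
+ * "ready" as hypothetical names:
+ *
+ *	shared->data = value;	(update the data item)
+ *	membar_producer();	(order the store ahead of the flag)
+ *	shared->ready = 1;	(publish)
+ *
+ * The consumer pairs this with membar_consumer() between its read of the
+ * "ready" flag and its read of the data.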
+ */ +extern void membar_producer(void); + +/* + * Arrange that all loads issued before this point in the code are + * completed before any subsequent loads; useful in consumer modules + * that check to see if data is available and read the data. + * The memory barrier guarantees that the data is not sampled until + * after the available flag has been seen, i.e. it imposes load ordering. + */ +extern void membar_consumer(void); +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ATOMIC_H */ diff --git a/lib/libspl/include/ia32/Makefile.am b/lib/libspl/include/ia32/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/lib/libspl/include/ia32/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/lib/libspl/include/ia32/sys/Makefile.am b/lib/libspl/include/ia32/sys/Makefile.am new file mode 100644 index 000000000000..683288460cdf --- /dev/null +++ b/lib/libspl/include/ia32/sys/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl/ia32/sys +libspl_HEADERS = \ + asm_linkage.h diff --git a/lib/libspl/include/ia32/sys/asm_linkage.h b/lib/libspl/include/ia32/sys/asm_linkage.h new file mode 100644 index 000000000000..61c4d1a26977 --- /dev/null +++ b/lib/libspl/include/ia32/sys/asm_linkage.h @@ -0,0 +1,302 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_ASM_LINKAGE_H +#define _IA32_SYS_ASM_LINKAGE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _ASM /* The remainder of this file is only for assembly files */ + +/* + * make annoying differences in assembler syntax go away + */ + +/* + * D16 and A16 are used to insert instructions prefixes; the + * macros help the assembler code be slightly more portable. + */ +#if !defined(__GNUC_AS__) +/* + * /usr/ccs/bin/as prefixes are parsed as separate instructions + */ +#define D16 data16; +#define A16 addr16; + +/* + * (There are some weird constructs in constant expressions) + */ +#define _CONST(const) [const] +#define _BITNOT(const) -1!_CONST(const) +#define _MUL(a, b) _CONST(a \* b) + +#else +/* + * Why not use the 'data16' and 'addr16' prefixes .. well, the + * assembler doesn't quite believe in real mode, and thus argues with + * us about what we're trying to do. + */ +#define D16 .byte 0x66; +#define A16 .byte 0x67; + +#define _CONST(const) (const) +#define _BITNOT(const) ~_CONST(const) +#define _MUL(a, b) _CONST(a * b) + +#endif + +/* + * C pointers are different sizes between i386 and amd64. + * These constants can be used to compute offsets into pointer arrays. 
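+ *
+ * For illustration only (not part of the import): an amd64 routine indexing
+ * a pointer array can scale by these constants instead of hard-coding 8.
+ * A hypothetical fragment, with the table base in %rdi and the index in
+ * %rsi:
+ *
+ *	shlq	$CPTRSHIFT, %rsi	# index -> byte offset
+ *	movq	(%rdi, %rsi), %rax	# %rax = table[index]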
+ */ +#if defined(__amd64) +#define CLONGSHIFT 3 +#define CLONGSIZE 8 +#define CLONGMASK 7 +#elif defined(__i386) +#define CLONGSHIFT 2 +#define CLONGSIZE 4 +#define CLONGMASK 3 +#endif + +/* + * Since we know we're either ILP32 or LP64 .. + */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 16 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. These are similar to the ANSI-C + * #pragma weak name = _name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ + .weak sym; \ + .type sym, @stype; \ +/* CSTYLED */ \ +sym = _/**/sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ + .type sym1, @stype; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ + .type x, @function; \ + .type y, @function; \ +/* CSTYLED */ \ +x: ; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ + .type x, @function; \ + .type y, @function; \ +/* CSTYLED */ \ +x: ; \ +y: + + +/* + * ALTENTRY provides for additional entry points. 
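+ *
+ * For illustration only (not part of the import), the usual pattern pairs
+ * ENTRY/ALTENTRY with matching SET_SIZE directives, as in the atomic
+ * routines earlier in this patch:
+ *
+ *	ENTRY(atomic_add_64)
+ *	ALTENTRY(atomic_add_ptr)
+ *	lock
+ *	addq	%rsi, (%rdi)
+ *	ret
+ *	SET_SIZE(atomic_add_ptr)
+ *	SET_SIZE(atomic_add_64)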
+ */ +#define ALTENTRY(x) \ + .globl x; \ + .type x, @function; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. + */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ + .type name, @object; \ + .size name, sz; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ + .type name, @object; \ + .size name, sz; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) \ + .size x, [.-x] + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/lib/libspl/include/libdevinfo.h b/lib/libspl/include/libdevinfo.h new file mode 100644 index 000000000000..be1d291f4051 --- /dev/null +++ b/lib/libspl/include/libdevinfo.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_LIBDEVINFO_H +#define _LIBSPL_LIBDEVINFO_H + +#endif /* _LIBSPL_LIBDEVINFO_H */ diff --git a/lib/libspl/include/libgen.h b/lib/libspl/include/libgen.h new file mode 100644 index 000000000000..c46d7454e49f --- /dev/null +++ b/lib/libspl/include/libgen.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2003 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LIBSPL_LIBGEN_H +#define _LIBSPL_LIBGEN_H + +#include +#include_next + +extern int mkdirp(const char *, mode_t); + +#endif /* _LIBSPL_LIBGEN_H */ diff --git a/lib/libspl/include/libshare.h b/lib/libspl/include/libshare.h new file mode 100644 index 000000000000..ea53f8c15089 --- /dev/null +++ b/lib/libspl/include/libshare.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright (c) 2019, 2020 by Delphix. All rights reserved. + */ +#ifndef _LIBSPL_LIBSHARE_H +#define _LIBSPL_LIBSHARE_H + +/* API Initialization */ +#define SA_INIT_SHARE_API 0x0001 /* init share specific interface */ +#define SA_INIT_CONTROL_API 0x0002 /* init control specific interface */ + +/* + * defined error values + */ + +#define SA_OK 0 +#define SA_NO_SUCH_PATH 1 /* provided path doesn't exist */ +#define SA_NO_MEMORY 2 /* no memory for data structures */ +#define SA_DUPLICATE_NAME 3 /* object name is already in use */ +#define SA_BAD_PATH 4 /* not a full path */ +#define SA_NO_SUCH_GROUP 5 /* group is not defined */ +#define SA_CONFIG_ERR 6 /* system configuration error */ +#define SA_SYSTEM_ERR 7 /* system error, use errno */ +#define SA_SYNTAX_ERR 8 /* syntax error on command line */ +#define SA_NO_PERMISSION 9 /* no permission for operation */ +#define SA_BUSY 10 /* resource is busy */ +#define SA_NO_SUCH_PROP 11 /* property doesn't exist */ +#define SA_INVALID_NAME 12 /* name of object is invalid */ +#define SA_INVALID_PROTOCOL 13 /* specified protocol not valid */ +#define SA_NOT_ALLOWED 14 /* operation not allowed */ +#define SA_BAD_VALUE 15 /* bad value for property */ +#define SA_INVALID_SECURITY 16 /* invalid security type */ +#define SA_NO_SUCH_SECURITY 17 /* security set not found */ +#define SA_VALUE_CONFLICT 18 /* property value conflict */ +#define SA_NOT_IMPLEMENTED 19 /* plugin interface not implemented */ +#define SA_INVALID_PATH 20 /* path is sub-dir of existing share */ +#define SA_NOT_SUPPORTED 21 /* operation not supported for proto */ +#define SA_PROP_SHARE_ONLY 22 /* property valid on share only */ +#define SA_NOT_SHARED 23 /* path is not shared */ +#define SA_NO_SUCH_RESOURCE 24 /* resource not found */ +#define SA_RESOURCE_REQUIRED 25 /* resource name is required */ +#define SA_MULTIPLE_ERROR 26 /* multiple protocols reported error */ +#define SA_PATH_IS_SUBDIR 27 /* check_path found path is subdir */ +#define SA_PATH_IS_PARENTDIR 28 /* check_path found path is parent */ +#define SA_NO_SECTION 29 /* protocol requires section info */ +#define SA_NO_SUCH_SECTION 30 /* no section found */ +#define SA_NO_PROPERTIES 31 /* no properties found */ +#define SA_PASSWORD_ENC 32 /* 
passwords must be encrypted */ +#define SA_SHARE_EXISTS 33 /* path or file is already shared */ + +/* initialization */ +extern char *sa_errorstr(int); + +/* share control */ +extern int sa_enable_share(const char *, const char *, const char *, + char *); +extern int sa_disable_share(const char *, char *); +extern boolean_t sa_is_shared(const char *, char *); +extern void sa_commit_shares(const char *); + +/* protocol specific interfaces */ +extern int sa_validate_shareopts(char *, char *); + +#endif /* _LIBSPL_LIBSHARE_H */ diff --git a/lib/libspl/include/limits.h b/lib/libspl/include/limits.h new file mode 100644 index 000000000000..1a42cfec469c --- /dev/null +++ b/lib/libspl/include/limits.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include_next + +#ifndef _LIBSPL_LIMITS_H +#define _LIBSPL_LIMITS_H + +#define DBL_DIG 15 +#define DBL_MAX 1.7976931348623157081452E+308 +#define DBL_MIN 2.2250738585072013830903E-308 + +#define FLT_DIG 6 +#define FLT_MAX 3.4028234663852885981170E+38F +#define FLT_MIN 1.1754943508222875079688E-38F + +#endif /* _LIBSPL_LIMITS_H */ diff --git a/lib/libspl/include/locale.h b/lib/libspl/include/locale.h new file mode 100644 index 000000000000..6c74df72072e --- /dev/null +++ b/lib/libspl/include/locale.h @@ -0,0 +1,35 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include_next + +#ifndef _LIBSPL_LOCALE_H +#define _LIBSPL_LOCALE_H + +#include +#include + +#endif diff --git a/lib/libspl/include/os/Makefile.am b/lib/libspl/include/os/Makefile.am new file mode 100644 index 000000000000..7b362e02ad59 --- /dev/null +++ b/lib/libspl/include/os/Makefile.am @@ -0,0 +1,7 @@ +if BUILD_FREEBSD +SUBDIRS = freebsd +endif + +if BUILD_LINUX +SUBDIRS = linux +endif diff --git a/lib/libspl/include/os/freebsd/Makefile.am b/lib/libspl/include/os/freebsd/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/lib/libspl/include/os/freebsd/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/lib/libspl/include/os/freebsd/sys/Makefile.am b/lib/libspl/include/os/freebsd/sys/Makefile.am new file mode 100644 index 000000000000..6775522f5d72 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/Makefile.am @@ -0,0 +1,11 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + byteorder.h \ + file.h \ + mnttab.h \ + mount.h \ + param.h \ + stat.h \ + sysmacros.h \ + vfs.h \ + zfs_context_os.h diff --git a/lib/libspl/include/os/freebsd/sys/byteorder.h b/lib/libspl/include/os/freebsd/sys/byteorder.h new file mode 100644 index 000000000000..cd692d3616e0 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/byteorder.h @@ -0,0 +1,192 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +#include +#include +#include +#include + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _ZFS_BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#ifdef _ZFS_BIG_ENDIAN +static __inline__ uint64_t +htonll(uint64_t n) +{ + return (n); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return (n); +} +#else +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#endif + +/* + * Macros to read unaligned values from a specific byte order to + * native byte order + */ + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#define BE_IN64(xa) \ + (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) + +#define LE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define LE_IN16(xa) \ + (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) + +#define LE_IN32(xa) \ + (((uint32_t)LE_IN16((uint8_t *)(xa) + 2) << 16) | LE_IN16(xa)) + +#define LE_IN64(xa) \ + (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) + +/* + * Macros to write unaligned values from native byte order to a specific byte + * order. 
+ */ + +#define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define BE_OUT16(xa, yv) \ + BE_OUT8((uint8_t *)(xa) + 1, yv); \ + BE_OUT8((uint8_t *)(xa), (yv) >> 8); + +#define BE_OUT32(xa, yv) \ + BE_OUT16((uint8_t *)(xa) + 2, yv); \ + BE_OUT16((uint8_t *)(xa), (yv) >> 16); + +#define BE_OUT64(xa, yv) \ + BE_OUT32((uint8_t *)(xa) + 4, yv); \ + BE_OUT32((uint8_t *)(xa), (yv) >> 32); + +#define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define LE_OUT16(xa, yv) \ + LE_OUT8((uint8_t *)(xa), yv); \ + LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); + +#define LE_OUT32(xa, yv) \ + LE_OUT16((uint8_t *)(xa), yv); \ + LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); + +#define LE_OUT64(xa, yv) \ + LE_OUT32((uint8_t *)(xa), yv); \ + LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/os/freebsd/sys/file.h b/lib/libspl/include/os/freebsd/sys/file.h new file mode 100644 index 000000000000..27fd2888f326 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/file.h @@ -0,0 +1,42 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_FILE_H +#define _LIBSPL_SYS_FILE_H + +#include_next + +#define FCREAT O_CREAT +#define FTRUNC O_TRUNC +#define FSYNC O_SYNC +#define FDSYNC O_DSYNC +#define FEXCL O_EXCL + +#define FNODSYNC 0x10000 /* fsync pseudo flag */ +#define FNOFOLLOW 0x20000 /* don't follow symlinks */ +#define FIGNORECASE 0x80000 /* request case-insensitive lookups */ + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/mnttab.h b/lib/libspl/include/os/freebsd/sys/mnttab.h new file mode 100644 index 000000000000..c08349bdf9bd --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/mnttab.h @@ -0,0 +1,85 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2006 Ricardo Correia */ + +#ifndef _SYS_MNTTAB_H +#define _SYS_MNTTAB_H + +#include +#include + +#ifdef MNTTAB +#undef MNTTAB +#endif /* MNTTAB */ + +#include +#include +#define MNTTAB _PATH_DEVZERO +#define MS_NOMNTTAB 0x0 +#define MS_RDONLY 0x1 +#define umount2(p, f) unmount(p, f) +#define MNT_LINE_MAX 4108 + +#define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ +#define MNT_TOOMANY 2 /* too many fields in line */ +#define MNT_TOOFEW 3 /* too few fields in line */ + +struct mnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; +}; + +/* + * NOTE: fields in extmnttab should match struct mnttab till new fields + * are encountered, this allows hasmntopt to work properly when its arg is + * a pointer to an extmnttab struct cast to a mnttab struct pointer. + */ + +struct extmnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; + uint_t mnt_major; + uint_t mnt_minor; +}; + +struct stat64; +struct statfs; + +extern int getmntany(FILE *fp, struct mnttab *mp, struct mnttab *mpref); +extern int _sol_getmntent(FILE *fp, struct mnttab *mp); +extern int getextmntent(const char *path, struct extmnttab *entry, + struct stat64 *statbuf); +extern void statfs2mnttab(struct statfs *sfs, struct mnttab *mp); +char *hasmntopt(struct mnttab *mnt, char *opt); +int getmntent(FILE *fp, struct mnttab *mp); + +#endif diff --git a/lib/libspl/include/os/freebsd/sys/mount.h b/lib/libspl/include/os/freebsd/sys/mount.h new file mode 100644 index 000000000000..b4023910005b --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/mount.h @@ -0,0 +1,108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#ifndef _LIBSPL_SYS_MOUNT_H +#define _LIBSPL_SYS_MOUNT_H + +#undef _SYS_MOUNT_H_ +#include_next + +#include +#include +#include + +/* + * Some old glibc headers don't define BLKGETSIZE64 + * and we don't want to require the kernel headers + */ +#if !defined(BLKGETSIZE64) +#define BLKGETSIZE64 _IOR(0x12, 114, size_t) +#endif + +/* + * Some old glibc headers don't correctly define MS_DIRSYNC and + * instead use the enum name S_WRITE. When using these older + * headers define MS_DIRSYNC to be S_WRITE. 
+ */ +#if !defined(MS_DIRSYNC) +#define MS_DIRSYNC S_WRITE +#endif + +/* + * Some old glibc headers don't correctly define MS_POSIXACL and + * instead leave it undefined. When using these older headers define + * MS_POSIXACL to the reserved value of (1<<16). + */ +#if !defined(MS_POSIXACL) +#define MS_POSIXACL (1<<16) +#endif + +#define MS_NOSUID MNT_NOSUID +#define MS_NOEXEC MNT_NOEXEC +#define MS_NODEV 0 +#define S_WRITE 0 +#define MS_BIND 0 +#define MS_REMOUNT 0 +#define MS_SYNCHRONOUS MNT_SYNCHRONOUS + +#define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) +#define MS_OWNER (MS_NOSUID|MS_NODEV) +#define MS_GROUP (MS_NOSUID|MS_NODEV) +#define MS_COMMENT 0 + +/* + * Older glibc headers did not define all the available + * umount2(2) flags. Both MNT_FORCE and MNT_DETACH are supported in the + * kernel back to 2.4.11 so we define them correctly if they are missing. + */ +#ifdef MNT_FORCE +#define MS_FORCE MNT_FORCE +#else +#define MS_FORCE 0x00000001 +#endif /* MNT_FORCE */ + +#ifdef MNT_DETACH +#define MS_DETACH MNT_DETACH +#else +#define MS_DETACH 0x00000002 +#endif /* MNT_DETACH */ + +/* + * Overlay mount is default in Linux, but for solaris/zfs + * compatibility, MS_OVERLAY is defined to explicitly have the user + * provide a flag (-O) to mount over a non empty directory. + */ +#define MS_OVERLAY 0x00000004 + +/* + * MS_CRYPT indicates that encryption keys should be loaded if they are not + * already available. This is not defined in glibc, but it is never seen by + * the kernel so it will not cause any problems. + */ +#define MS_CRYPT 0x00000008 + +#endif /* _LIBSPL_SYS_MOUNT_H */ diff --git a/lib/libspl/include/os/freebsd/sys/param.h b/lib/libspl/include/os/freebsd/sys/param.h new file mode 100644 index 000000000000..7c23b670591a --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/param.h @@ -0,0 +1,66 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_PARAM_H +#define _LIBSPL_SYS_PARAM_H + +#include_next +#include + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, + * with smaller units (fragments) only in the last direct block. + * MAXBSIZE primarily determines the size of buffers in the buffer + * pool. It may be made larger without any effect on existing + * file systems; however making it smaller may make some file + * systems unmountable. + * + * Note that the blocked devices are assumed to have DEV_BSIZE + * "sectors" and that fragments must be some multiple of this size. 
+ */
+#define	MAXNAMELEN	256
+
+#ifndef IN_BASE
+#define	UID_NOBODY	60001	/* user ID no body */
+#define	GID_NOBODY	UID_NOBODY
+#endif
+#define	UID_NOACCESS	60002	/* user ID no access */
+
+#define	MAXUID		UINT32_MAX	/* max user id */
+#define	MAXPROJID	MAXUID		/* max project id */
+
+#ifdef PAGESIZE
+#undef PAGESIZE
+#endif /* PAGESIZE */
+
+extern size_t spl_pagesize(void);
+#define	PAGESIZE	(spl_pagesize())
+
+extern int execvpe(const char *name, char * const argv[], char * const envp[]);
+
+#endif
diff --git a/lib/libspl/include/os/freebsd/sys/stat.h b/lib/libspl/include/os/freebsd/sys/stat.h
new file mode 100644
index 000000000000..82c86262fff3
--- /dev/null
+++ b/lib/libspl/include/os/freebsd/sys/stat.h
@@ -0,0 +1,71 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License").  You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#ifndef _LIBSPL_SYS_STAT_H
+#define _LIBSPL_SYS_STAT_H
+
+#include_next <sys/stat.h>
+
+#include <sys/mount.h>	/* for BLKGETSIZE64 */
+
+#define	stat64	stat
+
+#define	MAXOFFSET_T	OFF_MAX
+
+#ifndef _KERNEL
+#include <sys/disk.h>
+
+static __inline int
+fstat64(int fd, struct stat *sb)
+{
+	int ret;
+
+	ret = fstat(fd, sb);
+	if (ret == 0) {
+		if (S_ISCHR(sb->st_mode))
+			(void) ioctl(fd, DIOCGMEDIASIZE, &sb->st_size);
+	}
+	return (ret);
+}
+#endif
+
+/*
+ * Emulate Solaris' behavior of returning the block device size in fstat64().
+ */
+static inline int
+fstat64_blk(int fd, struct stat64 *st)
+{
+	if (fstat64(fd, st) == -1)
+		return (-1);
+
+	/* In Linux we need to use an ioctl to get the size of a block device */
+	if (S_ISBLK(st->st_mode)) {
+		if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0)
+			return (-1);
+	}
+
+	return (0);
+}
+#endif /* _LIBSPL_SYS_STAT_H */
diff --git a/lib/libspl/include/os/freebsd/sys/sysmacros.h b/lib/libspl/include/os/freebsd/sys/sysmacros.h
new file mode 100644
index 000000000000..d9639d27b60e
--- /dev/null
+++ b/lib/libspl/include/os/freebsd/sys/sysmacros.h
@@ -0,0 +1 @@
+/* keep me */
diff --git a/lib/libspl/include/os/freebsd/sys/vfs.h b/lib/libspl/include/os/freebsd/sys/vfs.h
new file mode 100644
index 000000000000..55eb3c23b22e
--- /dev/null
+++ b/lib/libspl/include/os/freebsd/sys/vfs.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef ZFS_SYS_VFS_H_ +#define ZFS_SYS_VFS_H_ + +#include_next + +int fsshare(const char *, const char *, const char *); +int fsunshare(const char *, const char *); + +#endif /* !ZFS_SYS_VFS_H_ */ diff --git a/lib/libspl/include/os/freebsd/sys/zfs_context_os.h b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h new file mode 100644 index 000000000000..f5a136d22125 --- /dev/null +++ b/lib/libspl/include/os/freebsd/sys/zfs_context_os.h @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef ZFS_CONTEXT_OS_H_ +#define ZFS_CONTEXT_OS_H_ + +#define ZFS_EXPORTS_PATH "/etc/zfs/exports" + +#endif diff --git a/lib/libspl/include/os/linux/Makefile.am b/lib/libspl/include/os/linux/Makefile.am new file mode 100644 index 000000000000..081839c48c8f --- /dev/null +++ b/lib/libspl/include/os/linux/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = sys diff --git a/lib/libspl/include/os/linux/sys/Makefile.am b/lib/libspl/include/os/linux/sys/Makefile.am new file mode 100644 index 000000000000..1ec07a76d354 --- /dev/null +++ b/lib/libspl/include/os/linux/sys/Makefile.am @@ -0,0 +1,10 @@ +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + byteorder.h \ + errno.h \ + mnttab.h \ + mount.h \ + param.h \ + stat.h \ + sysmacros.h \ + zfs_context_os.h diff --git a/lib/libspl/include/os/linux/sys/byteorder.h b/lib/libspl/include/os/linux/sys/byteorder.h new file mode 100644 index 000000000000..d5ee3e26f5a5 --- /dev/null +++ b/lib/libspl/include/os/linux/sys/byteorder.h @@ -0,0 +1,223 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _SYS_BYTEORDER_H +#define _SYS_BYTEORDER_H + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include <asm/byteorder.h> +#endif + +#include <sys/isa_defs.h> +#include <sys/int_types.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * macros for conversion between host and (internet) network byte order + */ + +#if defined(_ZFS_BIG_ENDIAN) && !defined(ntohl) && !defined(__lint) +/* big-endian */ +#define ntohl(x) (x) +#define ntohs(x) (x) +#define htonl(x) (x) +#define htons(x) (x) + +#elif !defined(ntohl) /* little-endian */ + +#ifndef _IN_PORT_T +#define _IN_PORT_T +typedef uint16_t in_port_t; +#endif + +#ifndef _IN_ADDR_T +#define _IN_ADDR_T +typedef uint32_t in_addr_t; +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) +extern uint32_t htonl(uint32_t); +extern uint16_t htons(uint16_t); +extern uint32_t ntohl(uint32_t); +extern uint16_t ntohs(uint16_t); +#else +extern in_addr_t htonl(in_addr_t); +extern in_port_t htons(in_port_t); +extern in_addr_t ntohl(in_addr_t); +extern in_port_t ntohs(in_port_t); +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) || defined(_XPG5) */ +#endif + +#if !defined(_XPG4_2) || defined(__EXTENSIONS__) + +/* + * Macros to reverse byte order + */ +#define BSWAP_8(x) ((x) & 0xff) +#define BSWAP_16(x) ((BSWAP_8(x) << 8) | BSWAP_8((x) >> 8)) +#define BSWAP_32(x) ((BSWAP_16(x) << 16) | BSWAP_16((x) >> 16)) +#define BSWAP_64(x) ((BSWAP_32(x) << 32) | BSWAP_32((x) >> 32)) + +#define BMASK_8(x) ((x) & 0xff) +#define BMASK_16(x) ((x) & 0xffff) +#define BMASK_32(x) ((x) & 0xffffffff) +#define BMASK_64(x) (x) + +/* + * Macros to convert from a specific byte order to/from native byte order + */ +#ifdef _ZFS_BIG_ENDIAN +#define BE_8(x) BMASK_8(x) +#define BE_16(x) BMASK_16(x) +#define BE_32(x) BMASK_32(x) +#define BE_64(x) BMASK_64(x) +#define LE_8(x) BSWAP_8(x) +#define LE_16(x) BSWAP_16(x) +#define LE_32(x) BSWAP_32(x) +#define LE_64(x) BSWAP_64(x) +#else +#define LE_8(x) BMASK_8(x) +#define LE_16(x) BMASK_16(x) +#define LE_32(x) BMASK_32(x) +#define LE_64(x) BMASK_64(x) +#define BE_8(x) BSWAP_8(x) +#define BE_16(x) BSWAP_16(x) +#define BE_32(x) BSWAP_32(x) +#define BE_64(x) BSWAP_64(x) +#endif + +#ifdef _ZFS_BIG_ENDIAN +static __inline__ uint64_t +htonll(uint64_t n) +{ + return (n); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return (n); +} +#else +static __inline__ uint64_t +htonll(uint64_t n) +{ + return ((((uint64_t)htonl(n)) << 32) + htonl(n >> 32)); +} + +static __inline__ uint64_t +ntohll(uint64_t n) +{ + return ((((uint64_t)ntohl(n)) << 32) + ntohl(n >> 32)); +} +#endif + +/* + * Macros to read unaligned values from a specific byte order to + * native byte order + */ + +#define BE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define BE_IN16(xa) \ + (((uint16_t)BE_IN8(xa) << 8) | BE_IN8((uint8_t *)(xa)+1)) + +#define BE_IN32(xa) \ + (((uint32_t)BE_IN16(xa) << 16) | BE_IN16((uint8_t *)(xa)+2)) + +#define BE_IN64(xa) \ + (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa)+4)) + +#define LE_IN8(xa) \ + *((uint8_t *)(xa)) + +#define LE_IN16(xa) \ + (((uint16_t)LE_IN8((uint8_t *)(xa) + 1) << 8) | LE_IN8(xa)) + +#define LE_IN32(xa) \ + (((uint32_t)LE_IN16((uint8_t *)(xa) + 2) << 16) | LE_IN16(xa)) + +#define LE_IN64(xa) \ + (((uint64_t)LE_IN32((uint8_t *)(xa) + 4) << 32) | LE_IN32(xa)) + +/* + * Macros to write unaligned values from native byte order to a specific byte + * order.
+ */ + +#define BE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define BE_OUT16(xa, yv) \ + BE_OUT8((uint8_t *)(xa) + 1, yv); \ + BE_OUT8((uint8_t *)(xa), (yv) >> 8); + +#define BE_OUT32(xa, yv) \ + BE_OUT16((uint8_t *)(xa) + 2, yv); \ + BE_OUT16((uint8_t *)(xa), (yv) >> 16); + +#define BE_OUT64(xa, yv) \ + BE_OUT32((uint8_t *)(xa) + 4, yv); \ + BE_OUT32((uint8_t *)(xa), (yv) >> 32); + +#define LE_OUT8(xa, yv) *((uint8_t *)(xa)) = (uint8_t)(yv); + +#define LE_OUT16(xa, yv) \ + LE_OUT8((uint8_t *)(xa), yv); \ + LE_OUT8((uint8_t *)(xa) + 1, (yv) >> 8); + +#define LE_OUT32(xa, yv) \ + LE_OUT16((uint8_t *)(xa), yv); \ + LE_OUT16((uint8_t *)(xa) + 2, (yv) >> 16); + +#define LE_OUT64(xa, yv) \ + LE_OUT32((uint8_t *)(xa), yv); \ + LE_OUT32((uint8_t *)(xa) + 4, (yv) >> 32); + +#endif /* !defined(_XPG4_2) || defined(__EXTENSIONS__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BYTEORDER_H */ diff --git a/lib/libspl/include/os/linux/sys/errno.h b/lib/libspl/include/os/linux/sys/errno.h new file mode 100644 index 000000000000..30d20ab895c5 --- /dev/null +++ b/lib/libspl/include/os/linux/sys/errno.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2017 Zettabyte Software, LLC. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Compiling against musl correctly points out that including sys/errno.h is + * disallowed by the Single UNIX Specification when building in userspace, so + * we implement a dummy header to redirect the include to the proper header. + */ +#ifndef _LIBSPL_SYS_ERRNO_H +#define _LIBSPL_SYS_ERRNO_H + +#include <errno.h> +/* + * We'll take the unused errnos, 'EBADE' and 'EBADR' (from the Convergent + * graveyard) to indicate checksum errors and fragmentation. + */ +#define ECKSUM EBADE +#define EFRAGS EBADR + +/* Similar for ENOTACTIVE */ +#define ENOTACTIVE ENOANO + +#endif /* _LIBSPL_SYS_ERRNO_H */ diff --git a/lib/libspl/include/os/linux/sys/mnttab.h b/lib/libspl/include/os/linux/sys/mnttab.h new file mode 100644 index 000000000000..1957293d5c69 --- /dev/null +++ b/lib/libspl/include/os/linux/sys/mnttab.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License.
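To see how the unaligned big-endian accessors above compose, here is a small hypothetical round-trip using only the macros defined in this header (a sketch for illustration, not part of the import):

#include <stdio.h>
#include <stdint.h>
#include <sys/byteorder.h>	/* the header above */

int
main(void)
{
	uint8_t buf[4];
	uint32_t v = 0x11223344, r;

	BE_OUT32(buf, v);	/* stores 11 22 33 44, most significant first */
	r = BE_IN32(buf);	/* reads it back into host byte order */

	printf("%s\n", (r == v) ? "round-trip ok" : "mismatch");
	return (0);
}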
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright 2006 Ricardo Correia */ + +#ifndef _SYS_MNTTAB_H +#define _SYS_MNTTAB_H + +#include <stdio.h> +#include <mntent.h> +#include <sys/types.h> +#include <sys/stat.h> + +#ifdef MNTTAB +#undef MNTTAB +#endif /* MNTTAB */ + +#define MNTTAB "/proc/self/mounts" +#define MNT_LINE_MAX 4108 + +#define MNT_TOOLONG 1 /* entry exceeds MNT_LINE_MAX */ +#define MNT_TOOMANY 2 /* too many fields in line */ +#define MNT_TOOFEW 3 /* too few fields in line */ + +struct mnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; +}; + +/* + * NOTE: fields in extmnttab should match struct mnttab till new fields + * are encountered, this allows hasmntopt to work properly when its arg is + * a pointer to an extmnttab struct cast to a mnttab struct pointer. + */ + +struct extmnttab { + char *mnt_special; + char *mnt_mountp; + char *mnt_fstype; + char *mnt_mntopts; + uint_t mnt_major; + uint_t mnt_minor; +}; + +struct statfs; + +extern int getmntany(FILE *fp, struct mnttab *mp, struct mnttab *mpref); +extern int _sol_getmntent(FILE *fp, struct mnttab *mp); +extern int getextmntent(const char *path, struct extmnttab *mp, + struct stat64 *statbuf); +static inline char *_sol_hasmntopt(struct mnttab *mnt, char *opt) +{ + struct mntent mnt_new; + + mnt_new.mnt_opts = mnt->mnt_mntopts; + + return (hasmntopt(&mnt_new, opt)); +} + +#define hasmntopt _sol_hasmntopt +#define getmntent _sol_getmntent + +#endif diff --git a/lib/libspl/include/os/linux/sys/mount.h b/lib/libspl/include/os/linux/sys/mount.h new file mode 100644 index 000000000000..d7c6f750e23d --- /dev/null +++ b/lib/libspl/include/os/linux/sys/mount.h @@ -0,0 +1,98 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms.
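A minimal sketch of how the Solaris-style mnttab interface above is typically consumed, assuming the _sol_getmntent() wrapper follows the Solaris getmntent() convention of returning 0 on success (hypothetical usage, not part of the import):

#include <stdio.h>
#include <sys/mnttab.h>	/* the compatibility header above */

int
main(void)
{
	struct mnttab mt;
	FILE *fp = fopen(MNTTAB, "r");	/* /proc/self/mounts */

	if (fp == NULL)
		return (1);
	while (getmntent(fp, &mt) == 0) {	/* really _sol_getmntent() */
		printf("%s on %s (%s)\n", mt.mnt_special,
		    mt.mnt_mountp, mt.mnt_fstype);
	}
	(void) fclose(fp);
	return (0);
}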
+ */ + +#include_next <sys/mount.h> + +#ifndef _LIBSPL_SYS_MOUNT_H +#define _LIBSPL_SYS_MOUNT_H + +#include <assert.h> +#include <string.h> +#include <stdlib.h> + +/* + * Some old glibc headers don't define BLKGETSIZE64 + * and we don't want to require the kernel headers + */ +#if !defined(BLKGETSIZE64) +#define BLKGETSIZE64 _IOR(0x12, 114, size_t) +#endif + +/* + * Some old glibc headers don't correctly define MS_DIRSYNC and + * instead use the enum name S_WRITE. When using these older + * headers define MS_DIRSYNC to be S_WRITE. + */ +#if !defined(MS_DIRSYNC) +#define MS_DIRSYNC S_WRITE +#endif + +/* + * Some old glibc headers don't correctly define MS_POSIXACL and + * instead leave it undefined. When using these older headers define + * MS_POSIXACL to the reserved value of (1<<16). + */ +#if !defined(MS_POSIXACL) +#define MS_POSIXACL (1<<16) +#endif + +#define MS_USERS (MS_NOEXEC|MS_NOSUID|MS_NODEV) +#define MS_OWNER (MS_NOSUID|MS_NODEV) +#define MS_GROUP (MS_NOSUID|MS_NODEV) +#define MS_COMMENT 0 + +/* + * Older glibc headers did not define all the available + * umount2(2) flags. Both MNT_FORCE and MNT_DETACH are supported in the + * kernel back to 2.4.11 so we define them correctly if they are missing. + */ +#ifdef MNT_FORCE +#define MS_FORCE MNT_FORCE +#else +#define MS_FORCE 0x00000001 +#endif /* MNT_FORCE */ + +#ifdef MNT_DETACH +#define MS_DETACH MNT_DETACH +#else +#define MS_DETACH 0x00000002 +#endif /* MNT_DETACH */ + +/* + * Overlay mount is default in Linux, but for solaris/zfs + * compatibility, MS_OVERLAY is defined to explicitly have the user + * provide a flag (-O) to mount over a non empty directory. + */ +#define MS_OVERLAY 0x00000004 + +/* + * MS_CRYPT indicates that encryption keys should be loaded if they are not + * already available. This is not defined in glibc, but it is never seen by + * the kernel so it will not cause any problems. + */ +#define MS_CRYPT 0x00000008 + +#endif /* _LIBSPL_SYS_MOUNT_H */ diff --git a/lib/libspl/include/os/linux/sys/param.h b/lib/libspl/include/os/linux/sys/param.h new file mode 100644 index 000000000000..26335187fdca --- /dev/null +++ b/lib/libspl/include/os/linux/sys/param.h @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_PARAM_H +#define _LIBSPL_SYS_PARAM_H + +#include_next <sys/param.h> +#include <unistd.h> + +/* + * File system parameters and macros. + * + * The file system is made out of blocks of at most MAXBSIZE units, + * with smaller units (fragments) only in the last direct block. + * MAXBSIZE primarily determines the size of buffers in the buffer + * pool.
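As a sketch of how the umount2(2) flag aliases above are meant to be used, here is a hypothetical helper (not part of the import) that requests a lazy detach via MS_DETACH:

#include <stdio.h>
#include <sys/mount.h>	/* the wrapper above, defining MS_DETACH */

int
main(int argc, char **argv)
{
	if (argc != 2) {
		fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]);
		return (1);
	}
	/* MS_DETACH maps to MNT_DETACH where glibc already defines it. */
	if (umount2(argv[1], MS_DETACH) != 0) {
		perror("umount2");
		return (1);
	}
	return (0);
}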
It may be made larger without any effect on existing + * file systems; however making it smaller may make some file + * systems unmountable. + * + * Note that the blocked devices are assumed to have DEV_BSIZE + * "sectors" and that fragments must be some multiple of this size. + */ +#define MAXBSIZE 8192 +#define DEV_BSIZE 512 +#define DEV_BSHIFT 9 /* log2(DEV_BSIZE) */ + +#define MAXNAMELEN 256 +#define MAXOFFSET_T LLONG_MAX + +#define UID_NOBODY 60001 /* user ID no body */ +#define GID_NOBODY UID_NOBODY +#define UID_NOACCESS 60002 /* user ID no access */ + +#define MAXUID UINT32_MAX /* max user id */ +#define MAXPROJID MAXUID /* max project id */ + +#ifdef PAGESIZE +#undef PAGESIZE +#endif /* PAGESIZE */ + +extern size_t spl_pagesize(void); +#define PAGESIZE (spl_pagesize()) + +#endif diff --git a/lib/libspl/include/os/linux/sys/stat.h b/lib/libspl/include/os/linux/sys/stat.h new file mode 100644 index 000000000000..3e8d27e4c19a --- /dev/null +++ b/lib/libspl/include/os/linux/sys/stat.h @@ -0,0 +1,50 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _LIBSPL_SYS_STAT_H +#define _LIBSPL_SYS_STAT_H + +#include_next <sys/stat.h> + +#include <sys/mount.h> /* for BLKGETSIZE64 */ + +/* + * Emulate Solaris' behavior of returning the block device size in fstat64(). + */ +static inline int +fstat64_blk(int fd, struct stat64 *st) +{ + if (fstat64(fd, st) == -1) + return (-1); + + /* In Linux we need to use an ioctl to get the size of a block device */ + if (S_ISBLK(st->st_mode)) { + if (ioctl(fd, BLKGETSIZE64, &st->st_size) != 0) + return (-1); + } + + return (0); +} +#endif /* _LIBSPL_SYS_STAT_H */ diff --git a/lib/libspl/include/os/linux/sys/sysmacros.h b/lib/libspl/include/os/linux/sys/sysmacros.h new file mode 100644 index 000000000000..22fcb04b94e0 --- /dev/null +++ b/lib/libspl/include/os/linux/sys/sysmacros.h @@ -0,0 +1,103 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_SYSMACROS_H +#define _LIBSPL_SYS_SYSMACROS_H + +#include_next <sys/sysmacros.h> + +/* common macros */ +#ifndef MIN +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +#endif +#ifndef MAX +#define MAX(a, b) ((a) < (b) ? (b) : (a)) +#endif +#ifndef ABS +#define ABS(a) ((a) < 0 ? -(a) : (a)) +#endif +#ifndef ARRAY_SIZE +#define ARRAY_SIZE(a) (sizeof (a) / sizeof (a[0])) +#endif +#ifndef DIV_ROUND_UP +#define DIV_ROUND_UP(n, d) (((n) + (d) - 1) / (d)) +#endif + +#define makedevice(maj, min) makedev(maj, min) +#define _sysconf(a) sysconf(a) + +/* + * Compatibility macros/typedefs needed for Solaris -> Linux port + */ +#define P2ALIGN(x, align) ((x) & -(align)) +#define P2CROSS(x, y, align) (((x) ^ (y)) > (align) - 1) +#define P2ROUNDUP(x, align) ((((x) - 1) | ((align) - 1)) + 1) +#define P2BOUNDARY(off, len, align) \ + (((off) ^ ((off) + (len) - 1)) > (align) - 1) +#define P2PHASE(x, align) ((x) & ((align) - 1)) +#define P2NPHASE(x, align) (-(x) & ((align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define ISP2(x) (((x) & ((x) - 1)) == 0) +#define IS_P2ALIGNED(v, a) ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) + +/* + * Typed version of the P2* macros. These macros should be used to ensure + * that the result is correctly calculated based on the data type of (x), + * which is passed in as the last argument, regardless of the data + * type of the alignment. For example, if (x) is of type uint64_t, + * and we want to round it up to a page boundary using "PAGESIZE" as + * the alignment, we can do either + * P2ROUNDUP(x, (uint64_t)PAGESIZE) + * or + * P2ROUNDUP_TYPED(x, PAGESIZE, uint64_t) + */ +#define P2ALIGN_TYPED(x, align, type) \ + ((type)(x) & -(type)(align)) +#define P2PHASE_TYPED(x, align, type) \ + ((type)(x) & ((type)(align) - 1)) +#define P2NPHASE_TYPED(x, align, type) \ + (-(type)(x) & ((type)(align) - 1)) +#define P2ROUNDUP_TYPED(x, align, type) \ + ((((type)(x) - 1) | ((type)(align) - 1)) + 1) +#define P2END_TYPED(x, align, type) \ + (-(~(type)(x) & -(type)(align))) +#define P2PHASEUP_TYPED(x, align, phase, type) \ + ((type)(phase) - (((type)(phase) - (type)(x)) & -(type)(align))) +#define P2CROSS_TYPED(x, y, align, type) \ + (((type)(x) ^ (type)(y)) > (type)(align) - 1) +#define P2SAMEHIGHBIT_TYPED(x, y, type) \ + (((type)(x) ^ (type)(y)) < ((type)(x) & (type)(y))) + + +/* avoid any possibility of clashing with version */ +#if defined(_KERNEL) && !defined(_KMEMUSER) && !defined(offsetof) +#define offsetof(s, m) ((size_t)(&(((s *)0)->m))) +#endif + +#define _NOTE(x) + +#endif /* _LIBSPL_SYS_SYSMACROS_H */ diff --git a/lib/libspl/include/os/linux/sys/zfs_context_os.h b/lib/libspl/include/os/linux/sys/zfs_context_os.h new file mode 100644 index 000000000000..008e57df4eae --- /dev/null +++ b/lib/libspl/include/os/linux/sys/zfs_context_os.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License.
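A quick, hypothetical demonstration of the typed power-of-two helpers above, using an assumed 4096-byte alignment (illustration only, not part of the import):

#include <stdio.h>
#include <stdint.h>
#include <sys/sysmacros.h>	/* the header above */

int
main(void)
{
	uint64_t off = 3000;

	/* Round up, round down, and take the remainder within 4 KiB. */
	printf("up:    %llu\n", (unsigned long long)
	    P2ROUNDUP_TYPED(off, 4096, uint64_t));	/* 4096 */
	printf("down:  %llu\n", (unsigned long long)
	    P2ALIGN_TYPED(off, 4096, uint64_t));	/* 0 */
	printf("phase: %llu\n", (unsigned long long)
	    P2PHASE_TYPED(off, 4096, uint64_t));	/* 3000 */
	printf("pow2:  %d\n", ISP2(4096));		/* 1 */
	return (0);
}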
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef ZFS_CONTEXT_OS_H +#define ZFS_CONTEXT_OS_H +#endif diff --git a/lib/libspl/include/rpc/Makefile.am b/lib/libspl/include/rpc/Makefile.am new file mode 100644 index 000000000000..7fe1d7fea4d7 --- /dev/null +++ b/lib/libspl/include/rpc/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl/rpc +libspl_HEADERS = \ + xdr.h diff --git a/lib/libspl/include/rpc/xdr.h b/lib/libspl/include/rpc/xdr.h new file mode 100644 index 000000000000..51d71f693bbf --- /dev/null +++ b/lib/libspl/include/rpc/xdr.h @@ -0,0 +1,68 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + * Portions of this source code were derived from Berkeley 4.3 BSD + * under license from the Regents of the University of California. + */ + +#ifndef LIBSPL_RPC_XDR_H +#define LIBSPL_RPC_XDR_H + +#include_next <rpc/xdr.h> + +#ifdef xdr_control /* if e.g. using tirpc */ +#undef xdr_control +#endif + +#define XDR_GET_BYTES_AVAIL 1 + +#ifndef HAVE_XDR_BYTESREC +struct xdr_bytesrec { + bool_t xc_is_last_record; + size_t xc_num_avail; +}; +#endif +typedef struct xdr_bytesrec xdr_bytesrec_t; + +/* + * This functionality is not required and is disabled in user space. + */ +static inline bool_t +xdr_control(XDR *xdrs, int request, void *info) +{ + xdr_bytesrec_t *xptr; + + ASSERT3U(request, ==, XDR_GET_BYTES_AVAIL); + + xptr = (xdr_bytesrec_t *)info; + xptr->xc_is_last_record = TRUE; + xptr->xc_num_avail = xdrs->x_handy; + + return (TRUE); +} + +#endif /* LIBSPL_RPC_XDR_H */ diff --git a/lib/libspl/include/statcommon.h b/lib/libspl/include/statcommon.h new file mode 100644 index 000000000000..1f376f5c7c24 --- /dev/null +++ b/lib/libspl/include/statcommon.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
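To make the stub's contract concrete, a hypothetical caller of xdr_control() might look like the following; it assumes a libspl build environment where the ASSERT3U macro used above is in scope:

#include <stdio.h>
#include <rpc/xdr.h>	/* the wrapper above */

int
main(void)
{
	char buf[64];
	XDR xdrs;
	xdr_bytesrec_t rec;

	xdrmem_create(&xdrs, buf, sizeof (buf), XDR_ENCODE);

	/* The stub reports x_handy as the number of bytes available. */
	if (xdr_control(&xdrs, XDR_GET_BYTES_AVAIL, &rec))
		printf("available: %zu, last record: %d\n",
		    rec.xc_num_avail, (int)rec.xc_is_last_record);

	xdr_destroy(&xdrs);
	return (0);
}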
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Common routines for acquiring snapshots of kstats for + * iostat, mpstat, and vmstat. + */ + +#ifndef _STATCOMMON_H +#define _STATCOMMON_H + +#include <sys/types.h> + +#define NODATE 0 /* Default: No time stamp */ +#define DDATE 1 /* Standard date format */ +#define UDATE 2 /* Internal representation of Unix time */ + +/* Print a timestamp in either Unix or standard format. */ +void print_timestamp(uint_t); + +#endif /* _STATCOMMON_H */ diff --git a/lib/libspl/include/stdio.h b/lib/libspl/include/stdio.h new file mode 100644 index 000000000000..6152b09f1a97 --- /dev/null +++ b/lib/libspl/include/stdio.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include_next <stdio.h> + +#ifndef _LIBSPL_STDIO_H +#define _LIBSPL_STDIO_H + +#define enable_extended_FILE_stdio(fd, sig) ((void) 0) + +#endif diff --git a/lib/libspl/include/stdlib.h b/lib/libspl/include/stdlib.h new file mode 100644 index 000000000000..a4ce4f781fc5 --- /dev/null +++ b/lib/libspl/include/stdlib.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc.
All rights reserved. + * Use is subject to license terms. + */ + +#include_next <stdlib.h> + +#ifndef _LIBSPL_STDLIB_H +#define _LIBSPL_STDLIB_H + +extern const char *getexecname(void); + +#endif diff --git a/lib/libspl/include/string.h b/lib/libspl/include/string.h new file mode 100644 index 000000000000..a7d40fa61943 --- /dev/null +++ b/lib/libspl/include/string.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_STRING_H +#define _LIBSPL_STRING_H + +#include_next <string.h> + +#ifndef HAVE_STRLCAT +extern size_t strlcat(char *dst, const char *src, size_t dstsize); +#endif + +#ifndef HAVE_STRLCPY +extern size_t strlcpy(char *dst, const char *src, size_t len); +#endif + +#endif diff --git a/lib/libspl/include/stropts.h b/lib/libspl/include/stropts.h new file mode 100644 index 000000000000..37acd4052b0b --- /dev/null +++ b/lib/libspl/include/stropts.h @@ -0,0 +1,25 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _LIBSPL_STROPTS_H +#define _LIBSPL_STROPTS_H + +#endif /* _LIBSPL_STROPTS_H */ diff --git a/lib/libspl/include/sys/Makefile.am b/lib/libspl/include/sys/Makefile.am new file mode 100644 index 000000000000..53a36f6bfbca --- /dev/null +++ b/lib/libspl/include/sys/Makefile.am @@ -0,0 +1,47 @@ +SUBDIRS = dktp + +libspldir = $(includedir)/libspl/sys +libspl_HEADERS = \ + acl.h \ + acl_impl.h \ + callb.h \ + cmn_err.h \ + cred.h \ + debug.h \ + dkio.h \ + dklabel.h \ + feature_tests.h \ + int_limits.h \ + int_types.h \ + inttypes.h \ + isa_defs.h \ + kmem.h \ + kstat.h \ + list.h \ + list_impl.h \ + mhd.h \ + mkdev.h \ + policy.h \ + poll.h \ + priv.h \ + processor.h \ + sha2.h \ + simd.h \ + stack.h \ + stdtypes.h \ + strings.h \ + stropts.h \ + sunddi.h \ + systeminfo.h \ + time.h \ + trace_spl.h \ + trace_zfs.h \ + types32.h \ + types.h \ + tzfile.h \ + uio.h \ + va_list.h \ + varargs.h \ + vnode.h \ + vtoc.h \ + zone.h diff --git a/lib/libspl/include/sys/acl.h b/lib/libspl/include/sys/acl.h new file mode 100644 index 000000000000..e6df864f850f --- /dev/null +++ b/lib/libspl/include/sys/acl.h @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ACL_H +#define _SYS_ACL_H + +#include <sys/types.h> +#include <sys/acl_impl.h> + +#ifdef __cplusplus +extern "C" { +#endif + +#define MAX_ACL_ENTRIES (1024) /* max entries of each type */ +typedef struct acl { + int a_type; /* the type of ACL entry */ + uid_t a_id; /* the entry in -uid or gid */ + o_mode_t a_perm; /* the permission field */ +} aclent_t; + +typedef struct ace { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... */ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ +} ace_t; + +typedef struct acl_info acl_t; + +/* + * The following are Defined types for an aclent_t.
+ */ +#define USER_OBJ (0x01) /* object owner */ +#define USER (0x02) /* additional users */ +#define GROUP_OBJ (0x04) /* owning group of the object */ +#define GROUP (0x08) /* additional groups */ +#define CLASS_OBJ (0x10) /* file group class and mask entry */ +#define OTHER_OBJ (0x20) /* other entry for the object */ +#define ACL_DEFAULT (0x1000) /* default flag */ +/* default object owner */ +#define DEF_USER_OBJ (ACL_DEFAULT | USER_OBJ) +/* default additional users */ +#define DEF_USER (ACL_DEFAULT | USER) +/* default owning group */ +#define DEF_GROUP_OBJ (ACL_DEFAULT | GROUP_OBJ) +/* default additional groups */ +#define DEF_GROUP (ACL_DEFAULT | GROUP) +/* default mask entry */ +#define DEF_CLASS_OBJ (ACL_DEFAULT | CLASS_OBJ) +/* default other entry */ +#define DEF_OTHER_OBJ (ACL_DEFAULT | OTHER_OBJ) + +/* + * The following are defined for ace_t. + */ +#define ACE_READ_DATA 0x00000001 +#define ACE_LIST_DIRECTORY 0x00000001 +#define ACE_WRITE_DATA 0x00000002 +#define ACE_ADD_FILE 0x00000002 +#define ACE_APPEND_DATA 0x00000004 +#define ACE_ADD_SUBDIRECTORY 0x00000004 +#define ACE_READ_NAMED_ATTRS 0x00000008 +#define ACE_WRITE_NAMED_ATTRS 0x00000010 +#define ACE_EXECUTE 0x00000020 +#define ACE_DELETE_CHILD 0x00000040 +#define ACE_READ_ATTRIBUTES 0x00000080 +#define ACE_WRITE_ATTRIBUTES 0x00000100 +#define ACE_DELETE 0x00010000 +#define ACE_READ_ACL 0x00020000 +#define ACE_WRITE_ACL 0x00040000 +#define ACE_WRITE_OWNER 0x00080000 +#define ACE_SYNCHRONIZE 0x00100000 + +#define ACE_FILE_INHERIT_ACE 0x0001 +#define ACE_DIRECTORY_INHERIT_ACE 0x0002 +#define ACE_NO_PROPAGATE_INHERIT_ACE 0x0004 +#define ACE_INHERIT_ONLY_ACE 0x0008 +#define ACE_SUCCESSFUL_ACCESS_ACE_FLAG 0x0010 +#define ACE_FAILED_ACCESS_ACE_FLAG 0x0020 +#define ACE_IDENTIFIER_GROUP 0x0040 +#define ACE_INHERITED_ACE 0x0080 +#define ACE_OWNER 0x1000 +#define ACE_GROUP 0x2000 +#define ACE_EVERYONE 0x4000 + +#define ACE_ACCESS_ALLOWED_ACE_TYPE 0x0000 +#define ACE_ACCESS_DENIED_ACE_TYPE 0x0001 +#define ACE_SYSTEM_AUDIT_ACE_TYPE 0x0002 +#define ACE_SYSTEM_ALARM_ACE_TYPE 0x0003 + +#define ACL_AUTO_INHERIT 0x0001 +#define ACL_PROTECTED 0x0002 +#define ACL_DEFAULTED 0x0004 +#define ACL_FLAGS_ALL (ACL_AUTO_INHERIT|ACL_PROTECTED| \ + ACL_DEFAULTED) + +#ifdef _KERNEL + +/* + * These are only applicable in a CIFS context. + */ +#define ACE_ACCESS_ALLOWED_COMPOUND_ACE_TYPE 0x04 +#define ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE 0x05 +#define ACE_ACCESS_DENIED_OBJECT_ACE_TYPE 0x06 +#define ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE 0x07 +#define ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE 0x08 +#define ACE_ACCESS_ALLOWED_CALLBACK_ACE_TYPE 0x09 +#define ACE_ACCESS_DENIED_CALLBACK_ACE_TYPE 0x0A +#define ACE_ACCESS_ALLOWED_CALLBACK_OBJECT_ACE_TYPE 0x0B +#define ACE_ACCESS_DENIED_CALLBACK_OBJECT_ACE_TYPE 0x0C +#define ACE_SYSTEM_AUDIT_CALLBACK_ACE_TYPE 0x0D +#define ACE_SYSTEM_ALARM_CALLBACK_ACE_TYPE 0x0E +#define ACE_SYSTEM_AUDIT_CALLBACK_OBJECT_ACE_TYPE 0x0F +#define ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE 0x10 + +#define ACE_ALL_TYPES 0x001F + +typedef struct ace_object { + uid_t a_who; /* uid or gid */ + uint32_t a_access_mask; /* read,write,... 
*/ + uint16_t a_flags; /* see below */ + uint16_t a_type; /* allow or deny */ + uint8_t a_obj_type[16]; /* obj type */ + uint8_t a_inherit_obj_type[16]; /* inherit obj */ +} ace_object_t; + +#endif + +#define ACE_ALL_PERMS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_READ_NAMED_ATTRS| \ + ACE_WRITE_NAMED_ATTRS|ACE_EXECUTE|ACE_DELETE_CHILD|ACE_READ_ATTRIBUTES| \ + ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_SYNCHRONIZE) + +/* + * The following flags are supported by both NFSv4 ACLs and ace_t. + */ +#define ACE_NFSV4_SUP_FLAGS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | \ + ACE_INHERIT_ONLY_ACE | \ + ACE_IDENTIFIER_GROUP) + +#define ACE_TYPE_FLAGS (ACE_OWNER|ACE_GROUP|ACE_EVERYONE| \ + ACE_IDENTIFIER_GROUP) +#define ACE_INHERIT_FLAGS (ACE_FILE_INHERIT_ACE| \ + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE) + +/* cmd args to acl(2) for aclent_t */ +#define GETACL 1 +#define SETACL 2 +#define GETACLCNT 3 + +/* cmd's to manipulate ace acls. */ +#define ACE_GETACL 4 +#define ACE_SETACL 5 +#define ACE_GETACLCNT 6 + +/* minimal acl entries from GETACLCNT */ +#define MIN_ACL_ENTRIES 4 + +#if !defined(_KERNEL) + +/* acl check errors */ +#define GRP_ERROR 1 +#define USER_ERROR 2 +#define OTHER_ERROR 3 +#define CLASS_ERROR 4 +#define DUPLICATE_ERROR 5 +#define MISS_ERROR 6 +#define MEM_ERROR 7 +#define ENTRY_ERROR 8 + + +/* + * similar to ufs_acl.h: changed to char type for user commands (tar, cpio) + * Attribute types + */ +#define UFSD_FREE ('0') /* Free entry */ +#define UFSD_ACL ('1') /* Access Control Lists */ +#define UFSD_DFACL ('2') /* reserved for future use */ +#define ACE_ACL ('3') /* ace_t style acls */ + +/* + * flag to [f]acl_get() + * controls whether a trivial acl should be returned. 
+ */ +#define ACL_NO_TRIVIAL 0x2 + + +/* + * Flags to control acl_totext() + */ + +#define ACL_APPEND_ID 0x1 /* append uid/gid to user/group entries */ +#define ACL_COMPACT_FMT 0x2 /* build ACL in ls -V format */ +#define ACL_NORESOLVE 0x4 /* don't do name service lookups */ + +/* + * Legacy aclcheck errors for aclent_t ACLs + */ +#define EACL_GRP_ERROR GRP_ERROR +#define EACL_USER_ERROR USER_ERROR +#define EACL_OTHER_ERROR OTHER_ERROR +#define EACL_CLASS_ERROR CLASS_ERROR +#define EACL_DUPLICATE_ERROR DUPLICATE_ERROR +#define EACL_MISS_ERROR MISS_ERROR +#define EACL_MEM_ERROR MEM_ERROR +#define EACL_ENTRY_ERROR ENTRY_ERROR + +#define EACL_INHERIT_ERROR 9 /* invalid inherit flags */ +#define EACL_FLAGS_ERROR 10 /* unknown flag value */ +#define EACL_PERM_MASK_ERROR 11 /* unknown permission */ +#define EACL_COUNT_ERROR 12 /* invalid acl count */ + +#define EACL_INVALID_SLOT 13 /* invalid acl slot */ +#define EACL_NO_ACL_ENTRY 14 /* Entry doesn't exist */ +#define EACL_DIFF_TYPE 15 /* acls aren't same type */ + +#define EACL_INVALID_USER_GROUP 16 /* need user/group name */ +#define EACL_INVALID_STR 17 /* invalid acl string */ +#define EACL_FIELD_NOT_BLANK 18 /* can't have blank field */ +#define EACL_INVALID_ACCESS_TYPE 19 /* invalid access type */ +#define EACL_UNKNOWN_DATA 20 /* Unrecognized data in ACL */ +#define EACL_MISSING_FIELDS 21 /* missing fields in acl */ + +#define EACL_INHERIT_NOTDIR 22 /* Need dir for inheritance */ + +extern int aclcheck(aclent_t *, int, int *); +extern int acltomode(aclent_t *, int, mode_t *); +extern int aclfrommode(aclent_t *, int, mode_t *); +extern int aclsort(int, int, aclent_t *); +extern char *acltotext(aclent_t *, int); +extern aclent_t *aclfromtext(char *, int *); +extern void acl_free(acl_t *); +extern int acl_get(const char *, int, acl_t **); +extern int facl_get(int, int, acl_t **); +extern int acl_set(const char *, acl_t *acl); +extern int facl_set(int, acl_t *acl); +extern int acl_strip(const char *, uid_t, gid_t, mode_t); +extern int acl_trivial(const char *); +extern char *acl_totext(acl_t *, int); +extern int acl_fromtext(const char *, acl_t **); +extern int acl_check(acl_t *, int); + +#else /* !defined(_KERNEL) */ + +extern void ksort(caddr_t, int, int, int (*)(void *, void *)); +extern int cmp2acls(void *, void *); + +#endif /* !defined(_KERNEL) */ + +#if defined(__STDC__) +extern int acl(const char *path, int cmd, int cnt, void *buf); +extern int facl(int fd, int cmd, int cnt, void *buf); +#else /* !__STDC__ */ +extern int acl(); +extern int facl(); +#endif /* defined(__STDC__) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_H */ diff --git a/lib/libspl/include/sys/acl_impl.h b/lib/libspl/include/sys/acl_impl.h new file mode 100644 index 000000000000..717334906415 --- /dev/null +++ b/lib/libspl/include/sys/acl_impl.h @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
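For orientation, here is a hypothetical example of filling in one ace_t with the mask, flag, and type constants defined above (an illustration only; real consumers normally go through acl_totext() and friends):

#include <stdio.h>
#include <sys/acl.h>	/* the header above */

int
main(void)
{
	/*
	 * An NFSv4-style entry allowing the file's owner to read and
	 * write.  ACE_OWNER selects the owner implicitly, so a_who is
	 * unused here.
	 */
	ace_t entry = {
		.a_who = (uid_t)-1,
		.a_access_mask = ACE_READ_DATA | ACE_WRITE_DATA |
		    ACE_READ_ATTRIBUTES | ACE_READ_ACL,
		.a_flags = ACE_OWNER,
		.a_type = ACE_ACCESS_ALLOWED_ACE_TYPE,
	};

	printf("mask=0x%x flags=0x%x type=%u\n", entry.a_access_mask,
	    entry.a_flags, (unsigned)entry.a_type);
	return (0);
}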
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ACL_IMPL_H +#define _SYS_ACL_IMPL_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * acl flags + * + * ACL_AUTO_INHERIT, ACL_PROTECTED and ACL_DEFAULTED + * flags can also be stored in this field. + */ +#define ACL_IS_TRIVIAL 0x10000 +#define ACL_IS_DIR 0x20000 + +typedef enum acl_type { + ACLENT_T = 0, + ACE_T = 1 +} acl_type_t; + +struct acl_info { + acl_type_t acl_type; /* style of acl */ + int acl_cnt; /* number of acl entries */ + int acl_entry_size; /* sizeof acl entry */ + int acl_flags; /* special flags about acl */ + void *acl_aclp; /* the acl */ +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ACL_IMPL_H */ diff --git a/lib/libspl/include/sys/callb.h b/lib/libspl/include/sys/callb.h new file mode 100644 index 000000000000..8ffd18788865 --- /dev/null +++ b/lib/libspl/include/sys/callb.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CALLB_H +#define _SYS_CALLB_H + +#endif diff --git a/lib/libspl/include/sys/cmn_err.h b/lib/libspl/include/sys/cmn_err.h new file mode 100644 index 000000000000..63ff4eb29bc8 --- /dev/null +++ b/lib/libspl/include/sys/cmn_err.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _LIBSPL_SYS_CMN_ERR_H +#define _LIBSPL_SYS_CMN_ERR_H + +#endif diff --git a/lib/libspl/include/sys/cred.h b/lib/libspl/include/sys/cred.h new file mode 100644 index 000000000000..463b3abfc977 --- /dev/null +++ b/lib/libspl/include/sys/cred.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_CRED_H +#define _LIBSPL_SYS_CRED_H + +typedef struct cred cred_t; + +#endif diff --git a/lib/libspl/include/sys/debug.h b/lib/libspl/include/sys/debug.h new file mode 100644 index 000000000000..af18da94804c --- /dev/null +++ b/lib/libspl/include/sys/debug.h @@ -0,0 +1,40 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_DEBUG_H +#define _LIBSPL_SYS_DEBUG_H + +#include <assert.h> + +#ifndef __printflike +#define __printflike(x, y) __attribute__((__format__(__printf__, x, y))) +#endif + +#ifndef __maybe_unused +#define __maybe_unused __attribute__((unused)) +#endif + +#endif diff --git a/lib/libspl/include/sys/dkio.h b/lib/libspl/include/sys/dkio.h new file mode 100644 index 000000000000..f3c641f669b7 --- /dev/null +++ b/lib/libspl/include/sys/dkio.h @@ -0,0 +1,483 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DKIO_H +#define _SYS_DKIO_H + + + +#include <sys/dklabel.h> /* Needed for NDKMAP define */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Structures and definitions for disk io control commands + */ + +/* + * Structures used as data by ioctl calls. + */ + +#define DK_DEVLEN 16 /* device name max length, including */ + /* unit # & NULL (ie - "xyc1") */ + +/* + * Used for controller info + */ +struct dk_cinfo { + char dki_cname[DK_DEVLEN]; /* controller name (no unit #) */ + ushort_t dki_ctype; /* controller type */ + ushort_t dki_flags; /* flags */ + ushort_t dki_cnum; /* controller number */ + uint_t dki_addr; /* controller address */ + uint_t dki_space; /* controller bus type */ + uint_t dki_prio; /* interrupt priority */ + uint_t dki_vec; /* interrupt vector */ + char dki_dname[DK_DEVLEN]; /* drive name (no unit #) */ + uint_t dki_unit; /* unit number */ + ushort_t dki_partition; /* partition number */ + ushort_t dki_maxtransfer; /* max. transfer size in DEV_BSIZE */ +}; + +/* + * Controller types + */ +#define DKC_UNKNOWN 0 +#define DKC_CDROM 1 /* CD-ROM, SCSI or otherwise */ +#define DKC_WDC2880 2 +#define DKC_XXX_0 3 /* unassigned */ +#define DKC_XXX_1 4 /* unassigned */ +#define DKC_DSD5215 5 +#define DKC_ACB4000 7 +#define DKC_MD21 8 +#define DKC_XXX_2 9 /* unassigned */ +#define DKC_NCRFLOPPY 10 +#define DKC_SMSFLOPPY 12 +#define DKC_SCSI_CCS 13 /* SCSI CCS compatible */ +#define DKC_INTEL82072 14 /* native floppy chip */ +#define DKC_MD 16 /* meta-disk (virtual-disk) driver */ +#define DKC_INTEL82077 19 /* 82077 floppy disk controller */ +#define DKC_DIRECT 20 /* Intel direct attached device i.e.
IDE */ +#define DKC_PCMCIA_MEM 21 /* PCMCIA memory disk-like type */ +#define DKC_PCMCIA_ATA 22 /* PCMCIA AT Attached type */ +#define DKC_VBD 23 /* virtual block device */ + +/* + * Sun reserves up through 1023 + */ + +#define DKC_CUSTOMER_BASE 1024 + +/* + * Flags + */ +#define DKI_BAD144 0x01 /* use DEC std 144 bad sector fwding */ +#define DKI_MAPTRK 0x02 /* controller does track mapping */ +#define DKI_FMTTRK 0x04 /* formats only full track at a time */ +#define DKI_FMTVOL 0x08 /* formats only full volume at a time */ +#define DKI_FMTCYL 0x10 /* formats only full cylinders at a time */ +#define DKI_HEXUNIT 0x20 /* unit number is printed as 3 hex digits */ +#define DKI_PCMCIA_PFD 0x40 /* PCMCIA pseudo-floppy memory card */ + +/* + * Used for all partitions + */ +struct dk_allmap { + struct dk_map dka_map[NDKMAP]; +}; + +#if defined(_SYSCALL32) +struct dk_allmap32 { + struct dk_map32 dka_map[NDKMAP]; +}; +#endif /* _SYSCALL32 */ + +/* + * Definition of a disk's geometry + */ +struct dk_geom { + unsigned short dkg_ncyl; /* # of data cylinders */ + unsigned short dkg_acyl; /* # of alternate cylinders */ + unsigned short dkg_bcyl; /* cyl offset (for fixed head area) */ + unsigned short dkg_nhead; /* # of heads */ + unsigned short dkg_obs1; /* obsolete */ + unsigned short dkg_nsect; /* # of data sectors per track */ + unsigned short dkg_intrlv; /* interleave factor */ + unsigned short dkg_obs2; /* obsolete */ + unsigned short dkg_obs3; /* obsolete */ + unsigned short dkg_apc; /* alternates per cyl (SCSI only) */ + unsigned short dkg_rpm; /* revolutions per minute */ + unsigned short dkg_pcyl; /* # of physical cylinders */ + unsigned short dkg_write_reinstruct; /* # sectors to skip, writes */ + unsigned short dkg_read_reinstruct; /* # sectors to skip, reads */ + unsigned short dkg_extra[7]; /* for compatible expansion */ +}; + +/* + * These defines are for historic compatibility with old drivers. + */ +#define dkg_bhead dkg_obs1 /* used to be head offset */ +#define dkg_gap1 dkg_obs2 /* used to be gap1 */ +#define dkg_gap2 dkg_obs3 /* used to be gap2 */ + +/* + * Disk io control commands + * Warning: some other ioctls with the DIOC prefix exist elsewhere. + * The Generic DKIOC numbers are from 0 - 50. + * The Floppy Driver uses 51 - 100. + * The Hard Disk (except SCSI) 101 - 106. (these are obsolete) + * The CDROM Driver 151 - 200. + * The USCSI ioctl 201 - 250. + */ +#define DKIOC (0x04 << 8) + +/* + * The following ioctls are generic in nature and need to be + * supported as appropriate by all disk drivers + */ +#define DKIOCGGEOM (DKIOC|1) /* Get geometry */ +#define DKIOCINFO (DKIOC|3) /* Get info */ +#define DKIOCEJECT (DKIOC|6) /* Generic 'eject' */ +#define DKIOCGVTOC (DKIOC|11) /* Get VTOC */ +#define DKIOCSVTOC (DKIOC|12) /* Set VTOC & Write to Disk */ + +/* + * Disk Cache Controls. These ioctls should be supported by + * all disk drivers. + * + * DKIOCFLUSHWRITECACHE when used from user-mode ignores the ioctl + * argument, but it should be passed as NULL to allow for future + * reinterpretation. From user-mode, this ioctl request is synchronous. + * + * When invoked from within the kernel, the arg can be NULL to indicate + * a synchronous request or can be the address of a struct dk_callback + * to request an asynchronous callback when the flush request is complete. + * In this case, the flag to the ioctl must include FKIOCTL and the + * dkc_callback field of the pointed to struct must be non-null or the + * request is made synchronously. 
+ * + * In the callback case: if the ioctl returns 0, a callback WILL be performed. + * If the ioctl returns non-zero, a callback will NOT be performed. + * NOTE: In some cases, the callback may be done BEFORE the ioctl call + * returns. The caller's locking strategy should be prepared for this case. + */ +#define DKIOCFLUSHWRITECACHE (DKIOC|34) /* flush cache to phys medium */ + +struct dk_callback { + void (*dkc_callback)(void *dkc_cookie, int error); + void *dkc_cookie; + int dkc_flag; +}; + +/* bit flag definitions for dkc_flag */ +#define FLUSH_VOLATILE 0x1 /* Bit 0: if set, only flush */ + /* volatile cache; otherwise, flush */ + /* volatile and non-volatile cache */ + +#define DKIOCGETWCE (DKIOC|36) /* Get current write cache */ + /* enablement status */ +#define DKIOCSETWCE (DKIOC|37) /* Enable/Disable write cache */ + +/* + * The following ioctls are used by Sun drivers to communicate + * with their associated format routines. Support of these ioctls + * is not required of foreign drivers + */ +#define DKIOCSGEOM (DKIOC|2) /* Set geometry */ +#define DKIOCSAPART (DKIOC|4) /* Set all partitions */ +#define DKIOCGAPART (DKIOC|5) /* Get all partitions */ +#define DKIOCG_PHYGEOM (DKIOC|32) /* get physical geometry */ +#define DKIOCG_VIRTGEOM (DKIOC|33) /* get virtual geometry */ + +/* + * The following ioctl's are removable media support + */ +#define DKIOCLOCK (DKIOC|7) /* Generic 'lock' */ +#define DKIOCUNLOCK (DKIOC|8) /* Generic 'unlock' */ +#define DKIOCSTATE (DKIOC|13) /* Inquire insert/eject state */ +#define DKIOCREMOVABLE (DKIOC|16) /* is media removable */ + + +/* + * ioctl for hotpluggable devices + */ +#define DKIOCHOTPLUGGABLE (DKIOC|35) /* is hotpluggable */ + +/* + * Ioctl to force driver to re-read the alternate partition and rebuild + * the internal defect map. + */ +#define DKIOCADDBAD (DKIOC|20) /* Re-read the alternate map (IDE) */ +#define DKIOCGETDEF (DKIOC|21) /* read defect list (IDE) */ + +/* + * Used by applications to get disk defect information from IDE + * drives. + */ +#ifdef _SYSCALL32 +struct defect_header32 { + int head; + caddr32_t buffer; +}; +#endif /* _SYSCALL32 */ + +struct defect_header { + int head; + caddr_t buffer; +}; + +#define DKIOCPARTINFO (DKIOC|22) /* Get partition or slice parameters */ + +/* + * Used by applications to get partition or slice information + */ +#ifdef _SYSCALL32 +struct part_info32 { + uint32_t p_start; + int p_length; +}; +#endif /* _SYSCALL32 */ + +struct part_info { + uint64_t p_start; + int p_length; +}; + +/* The following ioctls are for Optical Memory Device */ +#define DKIOC_EBP_ENABLE (DKIOC|40) /* enable by pass erase on write */ +#define DKIOC_EBP_DISABLE (DKIOC|41) /* disable by pass erase on write */ + +/* + * This state enum is the argument passed to the DKIOCSTATE ioctl. + */ +enum dkio_state { DKIO_NONE, DKIO_EJECTED, DKIO_INSERTED, DKIO_DEV_GONE }; + +#define DKIOCGMEDIAINFO (DKIOC|42) /* get information about the media */ + +/* + * ioctls to read/write mboot info. + */ +#define DKIOCGMBOOT (DKIOC|43) /* get mboot info */ +#define DKIOCSMBOOT (DKIOC|44) /* set mboot info */ + +/* + * ioctl to get the device temperature. + */ +#define DKIOCGTEMPERATURE (DKIOC|45) /* get temperature */ + +/* + * Used for providing the temperature. 
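Tying the flush semantics above together, a user-mode caller issues DKIOCFLUSHWRITECACHE synchronously with a NULL argument, exactly as the comment advises. A hypothetical sketch (only meaningful on platforms whose disk driver implements this Solaris-derived ioctl):

#include <stdio.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <sys/dkio.h>	/* the header above */

int
main(int argc, char **argv)
{
	int fd;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <raw-disk-device>\n", argv[0]);
		return (1);
	}
	if ((fd = open(argv[1], O_RDWR)) == -1) {
		perror("open");
		return (1);
	}
	/* From user-mode the argument is ignored; pass NULL as advised. */
	if (ioctl(fd, DKIOCFLUSHWRITECACHE, NULL) != 0)
		perror("DKIOCFLUSHWRITECACHE");
	(void) close(fd);
	return (0);
}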
+ */ + +struct dk_temperature { + uint_t dkt_flags; /* Flags */ + short dkt_cur_temp; /* Current disk temperature */ + short dkt_ref_temp; /* reference disk temperature */ +}; + +#define DKT_BYPASS_PM 0x1 +#define DKT_INVALID_TEMP 0xFFFF + + +/* + * Used for Media info or the current profile info + */ +struct dk_minfo { + uint_t dki_media_type; /* Media type or profile info */ + uint_t dki_lbsize; /* Logical blocksize of media */ + diskaddr_t dki_capacity; /* Capacity as # of dki_lbsize blks */ +}; + +/* + * Media types or profiles known + */ +#define DK_UNKNOWN 0x00 /* Media inserted - type unknown */ + + +/* + * SFF 8090 Specification Version 3, media types 0x01 - 0xfffe are retained to + * maintain compatibility with SFF8090. The following define the + * optical media type. + */ +#define DK_REMOVABLE_DISK 0x02 /* Removable Disk */ +#define DK_MO_ERASABLE 0x03 /* MO Erasable */ +#define DK_MO_WRITEONCE 0x04 /* MO Write once */ +#define DK_AS_MO 0x05 /* AS MO */ +#define DK_CDROM 0x08 /* CDROM */ +#define DK_CDR 0x09 /* CD-R */ +#define DK_CDRW 0x0A /* CD-RW */ +#define DK_DVDROM 0x10 /* DVD-ROM */ +#define DK_DVDR 0x11 /* DVD-R */ +#define DK_DVDRAM 0x12 /* DVD_RAM or DVD-RW */ + +/* + * Media types for other rewritable magnetic media + */ +#define DK_FIXED_DISK 0x10001 /* Fixed disk SCSI or otherwise */ +#define DK_FLOPPY 0x10002 /* Floppy media */ +#define DK_ZIP 0x10003 /* IOMEGA ZIP media */ +#define DK_JAZ 0x10004 /* IOMEGA JAZ media */ + +#define DKIOCSETEFI (DKIOC|17) /* Set EFI info */ +#define DKIOCGETEFI (DKIOC|18) /* Get EFI info */ + +#define DKIOCPARTITION (DKIOC|9) /* Get partition info */ + +/* + * Ioctls to get/set volume capabilities related to Logical Volume Managers. + * They include the ability to get/set capabilities and to issue a read to a + * specific underlying device of a replicated device. + */ + +#define DKIOCGETVOLCAP (DKIOC | 25) /* Get volume capabilities */ +#define DKIOCSETVOLCAP (DKIOC | 26) /* Set volume capabilities */ +#define DKIOCDMR (DKIOC | 27) /* Issue a directed read */ + +typedef uint_t volcapinfo_t; + +typedef uint_t volcapset_t; + +#define DKV_ABR_CAP 0x00000001 /* Support Appl.Based Recovery */ +#define DKV_DMR_CAP 0x00000002 /* Support Directed Mirror Read */ + +typedef struct volcap { + volcapinfo_t vc_info; /* Capabilities available */ + volcapset_t vc_set; /* Capabilities set */ +} volcap_t; + +#define VOL_SIDENAME 256 + +typedef struct vol_directed_rd { + int vdr_flags; + offset_t vdr_offset; + size_t vdr_nbytes; + size_t vdr_bytesread; + void *vdr_data; + int vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd_t; + +#define DKV_SIDE_INIT (-1) +#define DKV_DMR_NEXT_SIDE 0x00000001 +#define DKV_DMR_DONE 0x00000002 +#define DKV_DMR_ERROR 0x00000004 +#define DKV_DMR_SUCCESS 0x00000008 +#define DKV_DMR_SHORT 0x00000010 + +#ifdef _MULTI_DATAMODEL +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif +typedef struct vol_directed_rd32 { + int32_t vdr_flags; + offset_t vdr_offset; /* 64-bit element on 32-bit alignment */ + size32_t vdr_nbytes; + size32_t vdr_bytesread; + caddr32_t vdr_data; + int32_t vdr_side; + char vdr_side_name[VOL_SIDENAME]; +} vol_directed_rd32_t; +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif +#endif /* _MULTI_DATAMODEL */ + +/* + * The ioctl is used to fetch disk's device type, vendor ID, + * model number/product ID, firmware revision and serial number together. 
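+ *
+ * A sketch of its use ("fd" being an open descriptor for the disk device):
+ *
+ *	dk_disk_id_t id;
+ *
+ *	if (ioctl(fd, DKIOC_GETDISKID, &id) == 0 &&
+ *	    id.dkd_dtype == DKD_SCSI_TYPE)
+ *		(void) printf("%.*s\n", DKD_SCSI_PRODUCT,
+ *		    id.disk_id.scsi_disk_id.dkd_sproduct);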
+ * + * Currently there are two device types - DKD_ATA_TYPE which means the + * disk is driven by cmdk/ata or dad/uata driver, and DKD_SCSI_TYPE + * which means the disk is driven by sd/scsi hba driver. + */ +#define DKIOC_GETDISKID (DKIOC|46) + +/* These two labels are for dkd_dtype of dk_disk_id_t */ +#define DKD_ATA_TYPE 0x01 /* ATA disk or legacy mode SATA disk */ +#define DKD_SCSI_TYPE 0x02 /* SCSI disk or native mode SATA disk */ + +#define DKD_ATA_MODEL 40 /* model number length */ +#define DKD_ATA_FWVER 8 /* firmware revision length */ +#define DKD_ATA_SERIAL 20 /* serial number length */ + +#define DKD_SCSI_VENDOR 8 /* vendor ID length */ +#define DKD_SCSI_PRODUCT 16 /* product ID length */ +#define DKD_SCSI_REVLEVEL 4 /* revision level length */ +#define DKD_SCSI_SERIAL 12 /* serial number length */ + +/* + * The argument type for DKIOC_GETDISKID ioctl. + */ +typedef struct dk_disk_id { + uint_t dkd_dtype; + union { + struct { + char dkd_amodel[DKD_ATA_MODEL]; /* 40 bytes */ + char dkd_afwver[DKD_ATA_FWVER]; /* 8 bytes */ + char dkd_aserial[DKD_ATA_SERIAL]; /* 20 bytes */ + } ata_disk_id; + struct { + char dkd_svendor[DKD_SCSI_VENDOR]; /* 8 bytes */ + char dkd_sproduct[DKD_SCSI_PRODUCT]; /* 16 bytes */ + char dkd_sfwver[DKD_SCSI_REVLEVEL]; /* 4 bytes */ + char dkd_sserial[DKD_SCSI_SERIAL]; /* 12 bytes */ + } scsi_disk_id; + } disk_id; +} dk_disk_id_t; + +/* + * The ioctl is used to update the firmware of device. + */ +#define DKIOC_UPDATEFW (DKIOC|47) + +/* The argument type for DKIOC_UPDATEFW ioctl */ +typedef struct dk_updatefw { + caddr_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_t; + +#ifdef _SYSCALL32 +typedef struct dk_updatefw_32 { + caddr32_t dku_ptrbuf; /* pointer to firmware buf */ + uint_t dku_size; /* firmware buf length */ + uint8_t dku_type; /* firmware update type */ +} dk_updatefw_32_t; +#endif /* _SYSCALL32 */ + +/* + * firmware update type - temporary or permanent use + */ +#define FW_TYPE_TEMP 0x0 /* temporary use */ +#define FW_TYPE_PERM 0x1 /* permanent use */ + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DKIO_H */ diff --git a/lib/libspl/include/sys/dklabel.h b/lib/libspl/include/sys/dklabel.h new file mode 100644 index 000000000000..95faf2bb4ab3 --- /dev/null +++ b/lib/libspl/include/sys/dklabel.h @@ -0,0 +1,268 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 1990-2002 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef _SYS_DKLABEL_H
+#define	_SYS_DKLABEL_H
+
+
+#include <sys/isa_defs.h>
+#include <sys/types32.h>
+#include <sys/dkio.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+/*
+ * Miscellaneous defines
+ */
+#define	DKL_MAGIC	0xDABE		/* magic number */
+#define	FKL_MAGIC	0xff		/* magic number for DOS floppies */
+
+#if defined(_SUNOS_VTOC_16)
+#define	NDKMAP		16		/* # of logical partitions */
+#define	DK_LABEL_LOC	1		/* location of disk label */
+#elif defined(_SUNOS_VTOC_8)
+#define	NDKMAP		8		/* # of logical partitions */
+#define	DK_LABEL_LOC	0		/* location of disk label */
+#else
+#error "No VTOC format defined."
+#endif
+
+#define	LEN_DKL_ASCII	128		/* length of dkl_asciilabel */
+#define	LEN_DKL_VVOL	8		/* length of v_volume */
+#define	DK_LABEL_SIZE	512		/* size of disk label */
+#define	DK_MAX_BLOCKS	0x7fffffff	/* max # of blocks handled */
+
+/*
+ * Reserve two cylinders on SCSI disks.
+ * One is for the backup disk label and the other is for the deviceid.
+ *
+ * IPI disks only reserve one cylinder, but they will go away soon.
+ * CDROMs do not reserve any cylinders.
+ */
+#define	DK_ACYL		2
+
+/*
+ * Format of a Sun disk label.
+ * Resides in cylinder 0, head 0, sector 0.
+ *
+ * sizeof (struct dk_label) should be 512 (the current sector size),
+ * but should the sector size increase, this structure should remain
+ * at the beginning of the sector.
+ */
+
+/*
+ * partition headers: section 1
+ * Returned in struct dk_allmap by ioctl DKIOC[SG]APART (dkio(7I))
+ */
+struct dk_map {
+	uint64_t	dkl_cylno;	/* starting cylinder */
+	uint64_t	dkl_nblk;	/* number of blocks; if == 0, */
+					/* partition is undefined */
+};
+
+/*
+ * partition headers: section 1
+ * Fixed size for on-disk dk_label
+ */
+struct dk_map32 {
+	daddr32_t	dkl_cylno;	/* starting cylinder */
+	daddr32_t	dkl_nblk;	/* number of blocks; if == 0, */
+					/* partition is undefined */
+};
+
+/*
+ * partition headers: section 2,
+ * brought over from AT&T SVr4 vtoc structure.
+ */
+struct dk_map2 {
+	uint16_t	p_tag;		/* ID tag of partition */
+	uint16_t	p_flag;		/* permission flag */
+};
+
+struct dkl_partition {
+	uint16_t	p_tag;		/* ID tag of partition */
+	uint16_t	p_flag;		/* permission flags */
+	daddr32_t	p_start;	/* start sector no of partition */
+	int32_t		p_size;		/* # of blocks in partition */
+};
+
+
+/*
+ * VTOC inclusions from AT&T SVr4
+ * Fixed sized types for on-disk VTOC
+ */
+
+struct dk_vtoc {
+#if defined(_SUNOS_VTOC_16)
+	uint32_t v_bootinfo[3];		/* info for mboot (unsupported) */
+	uint32_t v_sanity;		/* to verify vtoc sanity */
+	uint32_t v_version;		/* layout version */
+	char	v_volume[LEN_DKL_VVOL];	/* volume name */
+	uint16_t	v_sectorsz;	/* sector size in bytes */
+	uint16_t	v_nparts;	/* number of partitions */
+	uint32_t	v_reserved[10];	/* free space */
+	struct dkl_partition v_part[NDKMAP];	/* partition headers */
+	time32_t	timestamp[NDKMAP];	/* partition timestamp (unsupported) */
+	char	v_asciilabel[LEN_DKL_ASCII];	/* for compatibility */
+#elif defined(_SUNOS_VTOC_8)
+	uint32_t	v_version;		/* layout version */
+	char	v_volume[LEN_DKL_VVOL];		/* volume name */
+	uint16_t	v_nparts;		/* number of partitions */
+	struct dk_map2	v_part[NDKMAP];		/* partition hdrs, sec 2 */
+	uint32_t	v_bootinfo[3];		/* info needed by mboot */
+	uint32_t	v_sanity;		/* to verify vtoc sanity */
+	uint32_t	v_reserved[10];		/* free space */
+	time32_t	v_timestamp[NDKMAP];	/* partition timestamp */
+#else
+#error "No VTOC format defined."
+#endif
+};
+
+/*
+ * define the amount of disk label padding needed to make
+ * the entire structure occupy 512 bytes.
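+ * A consumer can check that invariant at compile time with an assertion
+ * along the lines of (a sketch; CTASSERT here stands for whatever
+ * compile-time assertion facility is available, e.g. C11 _Static_assert):
+ *
+ *	CTASSERT(sizeof (struct dk_label) == DK_LABEL_SIZE);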
+ */
+#if defined(_SUNOS_VTOC_16)
+#define	LEN_DKL_PAD	(DK_LABEL_SIZE - \
+			((sizeof (struct dk_vtoc) + \
+			(4 * sizeof (uint32_t)) + \
+			(12 * sizeof (uint16_t)) + \
+			(2 * (sizeof (uint16_t))))))
+#elif defined(_SUNOS_VTOC_8)
+#define	LEN_DKL_PAD	(DK_LABEL_SIZE \
+			    - ((LEN_DKL_ASCII) + \
+			    (sizeof (struct dk_vtoc)) + \
+			    (sizeof (struct dk_map32) * NDKMAP) + \
+			    (14 * (sizeof (uint16_t))) + \
+			    (2 * (sizeof (uint16_t)))))
+#else
+#error "No VTOC format defined."
+#endif
+
+
+struct dk_label {
+#if defined(_SUNOS_VTOC_16)
+	struct dk_vtoc	dkl_vtoc;	/* vtoc inclusions from AT&T SVr4 */
+	uint32_t	dkl_pcyl;	/* # of physical cylinders */
+	uint32_t	dkl_ncyl;	/* # of data cylinders */
+	uint16_t	dkl_acyl;	/* # of alternate cylinders */
+	uint16_t	dkl_bcyl;	/* cyl offset (for fixed head area) */
+	uint32_t	dkl_nhead;	/* # of heads */
+	uint32_t	dkl_nsect;	/* # of data sectors per track */
+	uint16_t	dkl_intrlv;	/* interleave factor */
+	uint16_t	dkl_skew;	/* skew factor */
+	uint16_t	dkl_apc;	/* alternates per cyl (SCSI only) */
+	uint16_t	dkl_rpm;	/* revolutions per minute */
+	uint16_t	dkl_write_reinstruct;	/* # sectors to skip, writes */
+	uint16_t	dkl_read_reinstruct;	/* # sectors to skip, reads */
+	uint16_t	dkl_extra[4];	/* for compatible expansion */
+	char	dkl_pad[LEN_DKL_PAD];	/* unused part of 512 bytes */
+#elif defined(_SUNOS_VTOC_8)
+	char		dkl_asciilabel[LEN_DKL_ASCII];	/* for compatibility */
+	struct dk_vtoc	dkl_vtoc;	/* vtoc inclusions from AT&T SVr4 */
+	uint16_t	dkl_write_reinstruct;	/* # sectors to skip, writes */
+	uint16_t	dkl_read_reinstruct;	/* # sectors to skip, reads */
+	char		dkl_pad[LEN_DKL_PAD];	/* unused part of 512 bytes */
+	uint16_t	dkl_rpm;	/* rotations per minute */
+	uint16_t	dkl_pcyl;	/* # physical cylinders */
+	uint16_t	dkl_apc;	/* alternates per cylinder */
+	uint16_t	dkl_obs1;	/* obsolete */
+	uint16_t	dkl_obs2;	/* obsolete */
+	uint16_t	dkl_intrlv;	/* interleave factor */
+	uint16_t	dkl_ncyl;	/* # of data cylinders */
+	uint16_t	dkl_acyl;	/* # of alternate cylinders */
+	uint16_t	dkl_nhead;	/* # of heads in this partition */
+	uint16_t	dkl_nsect;	/* # of 512 byte sectors per track */
+	uint16_t	dkl_obs3;	/* obsolete */
+	uint16_t	dkl_obs4;	/* obsolete */
+	struct dk_map32	dkl_map[NDKMAP];	/* logical partition headers */
+#else
+#error "No VTOC format defined."
+#endif
+	uint16_t	dkl_magic;	/* identifies this label format */
+	uint16_t	dkl_cksum;	/* xor checksum of sector */
+};
+
+#if defined(_SUNOS_VTOC_16)
+#define	dkl_asciilabel	dkl_vtoc.v_asciilabel
+#define	v_timestamp	timestamp
+
+#elif defined(_SUNOS_VTOC_8)
+
+/*
+ * These defines are for historic compatibility with old drivers.
+ */
+#define	dkl_gap1	dkl_obs1	/* used to be gap1 */
+#define	dkl_gap2	dkl_obs2	/* used to be gap2 */
+#define	dkl_bhead	dkl_obs3	/* used to be label head offset */
+#define	dkl_ppart	dkl_obs4	/* used to be physical partition */
+#else
+#error "No VTOC format defined."
+#endif + +struct fk_label { /* DOS floppy label */ + uchar_t fkl_type; + uchar_t fkl_magich; + uchar_t fkl_magicl; + uchar_t filler; +}; + +/* + * Layout of stored fabricated device id (on-disk) + */ +#define DK_DEVID_BLKSIZE (512) +#define DK_DEVID_SIZE (DK_DEVID_BLKSIZE - ((sizeof (uchar_t) * 7))) +#define DK_DEVID_REV_MSB (0) +#define DK_DEVID_REV_LSB (1) + +struct dk_devid { + uchar_t dkd_rev_hi; /* revision (MSB) */ + uchar_t dkd_rev_lo; /* revision (LSB) */ + uchar_t dkd_flags; /* flags (not used yet) */ + uchar_t dkd_devid[DK_DEVID_SIZE]; /* devid stored here */ + uchar_t dkd_checksum3; /* checksum (MSB) */ + uchar_t dkd_checksum2; + uchar_t dkd_checksum1; + uchar_t dkd_checksum0; /* checksum (LSB) */ +}; + +#define DKD_GETCHKSUM(dkd) ((dkd)->dkd_checksum3 << 24) + \ + ((dkd)->dkd_checksum2 << 16) + \ + ((dkd)->dkd_checksum1 << 8) + \ + ((dkd)->dkd_checksum0) + +#define DKD_FORMCHKSUM(c, dkd) (dkd)->dkd_checksum3 = hibyte(hiword((c))); \ + (dkd)->dkd_checksum2 = lobyte(hiword((c))); \ + (dkd)->dkd_checksum1 = hibyte(loword((c))); \ + (dkd)->dkd_checksum0 = lobyte(loword((c))); +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DKLABEL_H */ diff --git a/lib/libspl/include/sys/dktp/Makefile.am b/lib/libspl/include/sys/dktp/Makefile.am new file mode 100644 index 000000000000..4ad3695d8abc --- /dev/null +++ b/lib/libspl/include/sys/dktp/Makefile.am @@ -0,0 +1,4 @@ +libspldir = $(includedir)/libspl/sys/dktp +libspl_HEADERS = \ + fdisk.h + diff --git a/lib/libspl/include/sys/dktp/fdisk.h b/lib/libspl/include/sys/dktp/fdisk.h new file mode 100644 index 000000000000..e90135f362e6 --- /dev/null +++ b/lib/libspl/include/sys/dktp/fdisk.h @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* Copyright (c) 1984, 1986, 1987, 1988 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_DKTP_FDISK_H +#define _SYS_DKTP_FDISK_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * fdisk.h + * This file defines the structure of physical disk sector 0 for use on + * AT386 systems. The format of this sector is constrained by the ROM + * BIOS and MS-DOS conventions. + * Note that this block does not define the partitions used by the unix + * driver. The unix partitions are obtained from the VTOC. + */ + +/* + * the MAX values are the maximum usable values for BIOS chs values + * The MAX_CYL value of 1022 is the maximum usable value + * the value of 1023 is a fence value, + * indicating no CHS geometry exists for the corresponding LBA value. + * HEAD range [ 0 .. MAX_HEAD ], so number of heads is (MAX_HEAD + 1) + * SECT range [ 1 .. 
MAX_SECT ], so number of sectors is (MAX_SECT)
+ */
+#define	MAX_SECT	(63)
+#define	MAX_CYL		(1022)
+#define	MAX_HEAD	(254)
+
+/*
+ * BOOTSZ was reduced from 446 to 440 bytes to NOT overwrite the Windows
+ * Vista DISKID. Otherwise Vista won't boot from Solaris GRUB in a dual-boot
+ * setup.
+ * The actual size of mboot code is 425 bytes while that of GRUB stage1 is
+ * 423 bytes. So this change does not harm them.
+ */
+#define	BOOTSZ		440	/* size of boot code in master boot block */
+#define	FD_NUMPART	4	/* number of 'partitions' in fdisk table */
+#define	MBB_MAGIC	0xAA55	/* magic number for mboot.signature */
+#define	DEFAULT_INTLV	4	/* default interleave for testing tracks */
+#define	MINPSIZE	4	/* minimum number of cylinders in a partition */
+#define	TSTPAT		0xE5	/* test pattern for verifying disk */
+
+/*
+ * structure to hold the fdisk partition table
+ */
+struct ipart {
+	unsigned char bootid;	/* bootable or not */
+	unsigned char beghead;	/* beginning head, sector, cylinder */
+	unsigned char begsect;	/* begcyl is a 10-bit number. High 2 bits */
+	unsigned char begcyl;	/* are in begsect. */
+	unsigned char systid;	/* OS type */
+	unsigned char endhead;	/* ending head, sector, cylinder */
+	unsigned char endsect;	/* endcyl is a 10-bit number. High 2 bits */
+	unsigned char endcyl;	/* are in endsect. */
+	uint32_t relsect;	/* first sector relative to start of disk */
+	uint32_t numsect;	/* number of sectors in partition */
+};
+/*
+ * Values for bootid.
+ */
+#define	NOTACTIVE	0
+#define	ACTIVE		128
+/*
+ * Values for systid.
+ */
+#define	UNUSED		0	/* Empty Partition */
+#define	DOSOS12		1	/* DOS partition, 12-bit FAT */
+#define	PCIXOS		2	/* PC/IX partition */
+#define	DOSOS16		4	/* DOS partition, 16-bit FAT */
+#define	EXTDOS		5	/* EXT-DOS partition */
+#define	DOSHUGE		6	/* Huge DOS partition > 32MB */
+#define	FDISK_IFS	7	/* Installable File System (IFS): HPFS & NTFS */
+#define	FDISK_AIXBOOT	8	/* AIX Boot */
+#define	FDISK_AIXDATA	9	/* AIX Data */
+#define	FDISK_OS2BOOT	10	/* OS/2 Boot Manager */
+#define	FDISK_WINDOWS	11	/* Windows 95 FAT32 (up to 2047GB) */
+#define	FDISK_EXT_WIN	12	/* Windows 95 FAT32 (extended-INT13) */
+#define	FDISK_FAT95	14	/* DOS 16-bit FAT, LBA-mapped */
+#define	FDISK_EXTLBA	15	/* Extended partition, LBA-mapped */
+#define	DIAGPART	18	/* Diagnostic boot partition (OS independent) */
+#define	FDISK_LINUX	65	/* Linux */
+#define	FDISK_LINUXDSWAP	66	/* Linux swap (sharing disk w/ DRDOS) */
+#define	FDISK_LINUXDNAT	67	/* Linux native (sharing disk with DRDOS) */
+#define	FDISK_CPM	82	/* CP/M */
+#define	DOSDATA		86	/* DOS data partition */
+#define	OTHEROS		98	/* part. type for appl. (DB?) needs */
+				/* raw partition.
ID was 0 but conflicted */ + /* with DOS 3.3 fdisk */ +#define UNIXOS 99 /* UNIX V.x partition */ +#define FDISK_NOVELL2 100 /* Novell Netware 286 */ +#define FDISK_NOVELL3 101 /* Novell Netware 3.x and later */ +#define FDISK_QNX4 119 /* QNX 4.x */ +#define FDISK_QNX42 120 /* QNX 4.x 2nd part */ +#define FDISK_QNX43 121 /* QNX 4.x 3rd part */ +#define SUNIXOS 130 /* Solaris UNIX partition */ +#define FDISK_LINUXNAT 131 /* Linux native */ +#define FDISK_NTFSVOL1 134 /* NTFS volume set 1 */ +#define FDISK_NTFSVOL2 135 /* NTFS volume set 2 */ +#define FDISK_BSD 165 /* BSD/386, 386BSD, NetBSD, FreeBSD, OpenBSD */ +#define FDISK_NEXTSTEP 167 /* NeXTSTEP */ +#define FDISK_BSDIFS 183 /* BSDI file system */ +#define FDISK_BSDISWAP 184 /* BSDI swap */ +#define X86BOOT 190 /* x86 Solaris boot partition */ +#define SUNIXOS2 191 /* Solaris UNIX partition */ +#define EFI_PMBR 238 /* EFI PMBR */ +#define EFI_FS 239 /* EFI File System (System Partition) */ +#define MAXDOS 65535L /* max size (sectors) for DOS partition */ + +/* + * structure to hold master boot block in physical sector 0 of the disk. + * Note that partitions stuff can't be directly included in the structure + * because of lameo '386 compiler alignment design. + * Alignment issues also force us to have 2 16bit entities for a single + * 32bit win_volserno. It is not used anywhere anyway. + */ + +struct mboot { /* master boot block */ + char bootinst[BOOTSZ]; + uint16_t win_volserno_lo; + uint16_t win_volserno_hi; + uint16_t reserved; + char parts[FD_NUMPART * sizeof (struct ipart)]; + ushort_t signature; +}; + +#if defined(__i386) || defined(__amd64) + +/* Byte offset of the start of the partition table within the sector */ +#define FDISK_PART_TABLE_START 446 + +/* Maximum number of valid partitions assumed as 32 */ +#define MAX_EXT_PARTS 32 + +#else + +#define MAX_EXT_PARTS 0 + +#endif /* if defined(__i386) || defined(__amd64) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DKTP_FDISK_H */ diff --git a/lib/libspl/include/sys/feature_tests.h b/lib/libspl/include/sys/feature_tests.h new file mode 100644 index 000000000000..1a68b75f0cdc --- /dev/null +++ b/lib/libspl/include/sys/feature_tests.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _SYS_FEATURE_TESTS_H +#define _SYS_FEATURE_TESTS_H + +#define __NORETURN __attribute__((__noreturn__)) + +#endif diff --git a/lib/libspl/include/sys/int_limits.h b/lib/libspl/include/sys/int_limits.h new file mode 100644 index 000000000000..7af68cdb2998 --- /dev/null +++ b/lib/libspl/include/sys/int_limits.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_INT_LIMITS_H +#define _LIBSPL_SYS_INT_LIMITS_H + +#endif diff --git a/lib/libspl/include/sys/int_types.h b/lib/libspl/include/sys/int_types.h new file mode 100644 index 000000000000..51e9e0285490 --- /dev/null +++ b/lib/libspl/include/sys/int_types.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SOL_SYS_INT_TYPES_H +#define _SOL_SYS_INT_TYPES_H + +#include + +#endif diff --git a/lib/libspl/include/sys/inttypes.h b/lib/libspl/include/sys/inttypes.h new file mode 100644 index 000000000000..d7d063985316 --- /dev/null +++ b/lib/libspl/include/sys/inttypes.h @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SOL_SYS_INTTYPES_H +#define _SOL_SYS_INTTYPES_H + +#include + +#define _INT64_TYPE + +#endif diff --git a/lib/libspl/include/sys/isa_defs.h b/lib/libspl/include/sys/isa_defs.h new file mode 100644 index 000000000000..8c0932f57654 --- /dev/null +++ b/lib/libspl/include/sys/isa_defs.h @@ -0,0 +1,264 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ISA_DEFS_H +#define _SYS_ISA_DEFS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* x86_64 arch specific defines */ +#if defined(__x86_64) || defined(__x86_64__) + +#if !defined(__x86_64) +#define __x86_64 +#endif + +#if !defined(__amd64) +#define __amd64 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if defined(_ILP32) +/* x32-specific defines; careful to *not* define _LP64 here */ +#else +#if !defined(_LP64) +#define _LP64 +#endif +#endif + +#if !defined(_ZFS_LITTLE_ENDIAN) +#define _ZFS_LITTLE_ENDIAN +#endif + +#define _SUNOS_VTOC_16 +#define HAVE_EFFICIENT_UNALIGNED_ACCESS + +/* i386 arch specific defines */ +#elif defined(__i386) || defined(__i386__) + +#if !defined(__i386) +#define __i386 +#endif + +#if !defined(__x86) +#define __x86 +#endif + +#if !defined(_ILP32) +#define _ILP32 +#endif + +#if !defined(_ZFS_LITTLE_ENDIAN) +#define _ZFS_LITTLE_ENDIAN +#endif + +#define _SUNOS_VTOC_16 +#define HAVE_EFFICIENT_UNALIGNED_ACCESS + +/* powerpc arch specific defines */ +#elif defined(__powerpc) || defined(__powerpc__) || defined(__powerpc64__) + +#if !defined(__powerpc) +#define __powerpc +#endif + +#if !defined(__powerpc__) +#define __powerpc__ +#endif + +#if defined(__powerpc64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#define _SUNOS_VTOC_16 +#define HAVE_EFFICIENT_UNALIGNED_ACCESS + +#if defined(__BYTE_ORDER) +#if defined(__BIG_ENDIAN) && __BYTE_ORDER == __BIG_ENDIAN +#define _ZFS_BIG_ENDIAN +#elif defined(__LITTLE_ENDIAN) && __BYTE_ORDER == __LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN +#endif +#elif defined(_BYTE_ORDER) +#if defined(_BIG_ENDIAN) && _BYTE_ORDER == _BIG_ENDIAN +#define _ZFS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) && _BYTE_ORDER == _LITTLE_ENDIAN +#define _ZFS_LITTLE_ENDIAN +#endif +#elif defined(_BIG_ENDIAN) && !defined(_LITTLE_ENDIAN) +#define _ZFS_BIG_ENDIAN +#elif defined(_LITTLE_ENDIAN) 
&& !defined(_BIG_ENDIAN) +#define _ZFS_LITTLE_ENDIAN +#endif + +/* arm arch specific defines */ +#elif defined(__arm) || defined(__arm__) || defined(__aarch64__) + +#if !defined(__arm) +#define __arm +#endif + +#if !defined(__arm__) +#define __arm__ +#endif + +#if defined(__aarch64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#if defined(__ARMEL__) || defined(__AARCH64EL__) +#define _ZFS_LITTLE_ENDIAN +#else +#define _ZFS_BIG_ENDIAN +#endif + +#define _SUNOS_VTOC_16 + +#if defined(__ARM_FEATURE_UNALIGNED) +#define HAVE_EFFICIENT_UNALIGNED_ACCESS +#endif + +/* sparc arch specific defines */ +#elif defined(__sparc) || defined(__sparc__) + +#if !defined(__sparc) +#define __sparc +#endif + +#if !defined(__sparc__) +#define __sparc__ +#endif + +#define _ZFS_BIG_ENDIAN +#define _SUNOS_VTOC_16 + +#if defined(__arch64__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +/* s390 arch specific defines */ +#elif defined(__s390__) +#if defined(__s390x__) +#if !defined(_LP64) +#define _LP64 +#endif +#else +#if !defined(_ILP32) +#define _ILP32 +#endif +#endif + +#define _ZFS_BIG_ENDIAN +#define _SUNOS_VTOC_16 + +/* MIPS arch specific defines */ +#elif defined(__mips__) + +#if defined(__MIPSEB__) +#define _ZFS_BIG_ENDIAN +#elif defined(__MIPSEL__) +#define _ZFS_LITTLE_ENDIAN +#else +#error MIPS no endian specified +#endif + +#if !defined(_LP64) && !defined(_ILP32) +#define _ILP32 +#endif + +#define _SUNOS_VTOC_16 + +/* + * RISC-V arch specific defines + * only RV64G (including atomic) LP64 is supported yet + */ +#elif defined(__riscv) && defined(_LP64) && _LP64 && \ + defined(__riscv_atomic) && __riscv_atomic + +#ifndef __riscv__ +#define __riscv__ +#endif + +#ifndef __rv64g__ +#define __rv64g__ +#endif + +#define _ZFS_LITTLE_ENDIAN + +#define _SUNOS_VTOC_16 + +#else +/* + * Currently supported: + * x86_64, x32, i386, arm, powerpc, s390, sparc, mips, and RV64G + */ +#error "Unsupported ISA type" +#endif + +#if defined(_ILP32) && defined(_LP64) +#error "Both _ILP32 and _LP64 are defined" +#endif + +#if !defined(_ILP32) && !defined(_LP64) +#error "Neither _ILP32 or _LP64 are defined" +#endif + +#if defined(_ZFS_LITTLE_ENDIAN) && defined(_ZFS_BIG_ENDIAN) +#error "Both _ZFS_LITTLE_ENDIAN and _ZFS_BIG_ENDIAN are defined" +#endif + +#if !defined(_ZFS_LITTLE_ENDIAN) && !defined(_ZFS_BIG_ENDIAN) +#error "Neither _ZFS_LITTLE_ENDIAN nor _ZFS_BIG_ENDIAN are defined" +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_ISA_DEFS_H */ diff --git a/lib/libspl/include/sys/kmem.h b/lib/libspl/include/sys/kmem.h new file mode 100644 index 000000000000..83d47565aeaf --- /dev/null +++ b/lib/libspl/include/sys/kmem.h @@ -0,0 +1,45 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_KMEM_H
+#define	_SYS_KMEM_H
+
+#include <stdlib.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#define	KM_SLEEP	0x00000000	/* same as KM_SLEEP */
+#define	KM_NOSLEEP	0x00000001	/* same as KM_NOSLEEP */
+
+#define	kmem_alloc(size, flags)		malloc(size)
+#define	kmem_free(ptr, size)		free(ptr)
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif /* _SYS_KMEM_H */
diff --git a/lib/libspl/include/sys/kstat.h b/lib/libspl/include/sys/kstat.h
new file mode 100644
index 000000000000..69fb6d401fc7
--- /dev/null
+++ b/lib/libspl/include/sys/kstat.h
@@ -0,0 +1,822 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_KSTAT_H
+#define	_SYS_KSTAT_H
+
+
+/*
+ * Definition of general kernel statistics structures and /dev/kstat ioctls
+ */
+
+#include <sys/types.h>
+#include <sys/time.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef int	kid_t;		/* unique kstat id */
+
+/*
+ * Kernel statistics driver (/dev/kstat) ioctls
+ */
+
+#define	KSTAT_IOC_BASE		('K' << 8)
+
+#define	KSTAT_IOC_CHAIN_ID	KSTAT_IOC_BASE | 0x01
+#define	KSTAT_IOC_READ		KSTAT_IOC_BASE | 0x02
+#define	KSTAT_IOC_WRITE		KSTAT_IOC_BASE | 0x03
+
+/*
+ * /dev/kstat ioctl usage (kd denotes /dev/kstat descriptor):
+ *
+ *	kcid = ioctl(kd, KSTAT_IOC_CHAIN_ID, NULL);
+ *	kcid = ioctl(kd, KSTAT_IOC_READ, kstat_t *);
+ *	kcid = ioctl(kd, KSTAT_IOC_WRITE, kstat_t *);
+ */
+
+#define	KSTAT_STRLEN	255	/* 254 chars + NULL; must be 16 * n - 1 */
+
+/*
+ * The generic kstat header
+ */
+
+typedef struct kstat {
+	/*
+	 * Fields relevant to both kernel and user
+	 */
+	hrtime_t	ks_crtime;	/* creation time (from gethrtime()) */
+	struct kstat	*ks_next;	/* kstat chain linkage */
+	kid_t		ks_kid;		/* unique kstat ID */
+	char		ks_module[KSTAT_STRLEN]; /* provider module name */
+	uchar_t		ks_resv;	/* reserved, currently just padding */
+	int		ks_instance;	/* provider module's instance */
+	char		ks_name[KSTAT_STRLEN];	/* kstat name */
+	uchar_t		ks_type;	/* kstat data type */
+	char		ks_class[KSTAT_STRLEN];	/* kstat class */
+	uchar_t		ks_flags;	/* kstat flags */
+	void		*ks_data;	/* kstat type-specific data */
+	uint_t		ks_ndata;	/* # of type-specific data records */
+	size_t		ks_data_size;	/* total size of kstat data section */
+	hrtime_t	ks_snaptime;	/* time of last data snapshot */
+	/*
+	 * Fields relevant to kernel only
+	 */
+	int		(*ks_update)(struct kstat *, int); /* dynamic update */
+	void		*ks_private;	/* arbitrary provider-private data */
+	int		(*ks_snapshot)(struct kstat *, void *, int);
+	void		*ks_lock;	/* protects this kstat's data */
+} kstat_t;
+
+#ifdef _SYSCALL32
+
+typedef int32_t kid32_t;
+
+typedef struct kstat32 {
+	/*
+	 * Fields relevant to both kernel and user
+	 */
+	hrtime_t	ks_crtime;
+	caddr32_t	ks_next;	/* struct kstat pointer */
+	kid32_t		ks_kid;
+	char		ks_module[KSTAT_STRLEN];
+	uint8_t		ks_resv;
+	int32_t		ks_instance;
+	char		ks_name[KSTAT_STRLEN];
+	uint8_t		ks_type;
+	char		ks_class[KSTAT_STRLEN];
+	uint8_t		ks_flags;
+	caddr32_t	ks_data;	/* type-specific data */
+	uint32_t	ks_ndata;
+	size32_t	ks_data_size;
+	hrtime_t	ks_snaptime;
+	/*
+	 * Fields relevant to kernel only (only needed here for padding)
+	 */
+	int32_t		_ks_update;
+	caddr32_t	_ks_private;
+	int32_t		_ks_snapshot;
+	caddr32_t	_ks_lock;
+} kstat32_t;
+
+#endif	/* _SYSCALL32 */
+
+/*
+ * kstat structure and locking strategy
+ *
+ * Each kstat consists of a header section (a kstat_t) and a data section.
+ * The system maintains a set of kstats, protected by kstat_chain_lock.
+ * kstat_chain_lock protects all additions to/deletions from this set,
+ * as well as all changes to kstat headers. kstat data sections are
+ * *optionally* protected by the per-kstat ks_lock. If ks_lock is non-NULL,
+ * kstat clients (e.g. /dev/kstat) will acquire this lock for all of their
+ * operations on that kstat. It is up to the kstat provider to decide whether
+ * guaranteeing consistent data to kstat clients is sufficiently important
+ * to justify the locking cost. Note, however, that most statistic updates
+ * already occur under one of the provider's mutexes, so if the provider sets
+ * ks_lock to point to that mutex, then kstat data locking is free.
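+ *
+ * For example (a sketch; the xx_* names are illustrative only), a driver
+ * whose statistics already live under its own softstate mutex can export
+ * that mutex when it creates the kstat:
+ *
+ *	ksp = kstat_create("xx", instance, "counters", "misc",
+ *	    KSTAT_TYPE_NAMED, XX_NSTATS, KSTAT_FLAG_VIRTUAL);
+ *	if (ksp != NULL) {
+ *		ksp->ks_data = &xsp->xx_stats;	/* guarded by xx_mutex */
+ *		ksp->ks_lock = &xsp->xx_mutex;
+ *		kstat_install(ksp);
+ *	}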
+ * + * NOTE: variable-size kstats MUST employ kstat data locking, to prevent + * data-size races with kstat clients. + * + * NOTE: ks_lock is really of type (kmutex_t *); it is declared as (void *) + * in the kstat header so that users don't have to be exposed to all of the + * kernel's lock-related data structures. + */ + +#if defined(_KERNEL) + +#define KSTAT_ENTER(k) \ + { kmutex_t *lp = (k)->ks_lock; if (lp) mutex_enter(lp); } + +#define KSTAT_EXIT(k) \ + { kmutex_t *lp = (k)->ks_lock; if (lp) mutex_exit(lp); } + +#define KSTAT_UPDATE(k, rw) (*(k)->ks_update)((k), (rw)) + +#define KSTAT_SNAPSHOT(k, buf, rw) (*(k)->ks_snapshot)((k), (buf), (rw)) + +#endif /* defined(_KERNEL) */ + +/* + * kstat time + * + * All times associated with kstats (e.g. creation time, snapshot time, + * kstat_timer_t and kstat_io_t timestamps, etc.) are 64-bit nanosecond values, + * as returned by gethrtime(). The accuracy of these timestamps is machine + * dependent, but the precision (units) is the same across all platforms. + */ + +/* + * kstat identity (KID) + * + * Each kstat is assigned a unique KID (kstat ID) when it is added to the + * global kstat chain. The KID is used as a cookie by /dev/kstat to + * request information about the corresponding kstat. There is also + * an identity associated with the entire kstat chain, kstat_chain_id, + * which is bumped each time a kstat is added or deleted. /dev/kstat uses + * the chain ID to detect changes in the kstat chain (e.g., a new disk + * coming online) between ioctl()s. + */ + +/* + * kstat module, kstat instance + * + * ks_module and ks_instance contain the name and instance of the module + * that created the kstat. In cases where there can only be one instance, + * ks_instance is 0. The kernel proper (/kernel/unix) uses "unix" as its + * module name. + */ + +/* + * kstat name + * + * ks_name gives a meaningful name to a kstat. The full kstat namespace + * is module.instance.name, so the name only need be unique within a + * module. kstat_create() will fail if you try to create a kstat with + * an already-used (ks_module, ks_instance, ks_name) triplet. Spaces are + * allowed in kstat names, but strongly discouraged, since they hinder + * awk-style processing at user level. + */ + +/* + * kstat type + * + * The kstat mechanism provides several flavors of kstat data, defined + * below. The "raw" kstat type is just treated as an array of bytes; you + * can use this to export any kind of data you want. + * + * Some kstat types allow multiple data structures per kstat, e.g. + * KSTAT_TYPE_NAMED; others do not. This is part of the spec for each + * kstat data type. + * + * User-level tools should *not* rely on the #define KSTAT_NUM_TYPES. To + * get this information, read out the standard system kstat "kstat_types". + */ + +#define KSTAT_TYPE_RAW 0 /* can be anything */ + /* ks_ndata >= 1 */ +#define KSTAT_TYPE_NAMED 1 /* name/value pair */ + /* ks_ndata >= 1 */ +#define KSTAT_TYPE_INTR 2 /* interrupt statistics */ + /* ks_ndata == 1 */ +#define KSTAT_TYPE_IO 3 /* I/O statistics */ + /* ks_ndata == 1 */ +#define KSTAT_TYPE_TIMER 4 /* event timer */ + /* ks_ndata >= 1 */ + +#define KSTAT_NUM_TYPES 5 + +/* + * kstat class + * + * Each kstat can be characterized as belonging to some broad class + * of statistics, e.g. disk, tape, net, vm, streams, etc. This field + * can be used as a filter to extract related kstats. The following + * values are currently in use: disk, tape, net, controller, vm, kvm, + * hat, streams, kstat, and misc. 
(The kstat class encompasses things + * like kstat_types.) + */ + +/* + * kstat flags + * + * Any of the following flags may be passed to kstat_create(). They are + * all zero by default. + * + * KSTAT_FLAG_VIRTUAL: + * + * Tells kstat_create() not to allocate memory for the + * kstat data section; instead, you will set the ks_data + * field to point to the data you wish to export. This + * provides a convenient way to export existing data + * structures. + * + * KSTAT_FLAG_VAR_SIZE: + * + * The size of the kstat you are creating will vary over time. + * For example, you may want to use the kstat mechanism to + * export a linked list. NOTE: The kstat framework does not + * manage the data section, so all variable-size kstats must be + * virtual kstats. Moreover, variable-size kstats MUST employ + * kstat data locking to prevent data-size races with kstat + * clients. See the section on "kstat snapshot" for details. + * + * KSTAT_FLAG_WRITABLE: + * + * Makes the kstat's data section writable by root. + * The ks_snapshot routine (see below) does not need to check for + * this; permission checking is handled in the kstat driver. + * + * KSTAT_FLAG_PERSISTENT: + * + * Indicates that this kstat is to be persistent over time. + * For persistent kstats, kstat_delete() simply marks the + * kstat as dormant; a subsequent kstat_create() reactivates + * the kstat. This feature is provided so that statistics + * are not lost across driver close/open (e.g., raw disk I/O + * on a disk with no mounted partitions.) + * NOTE: Persistent kstats cannot be virtual, since ks_data + * points to garbage as soon as the driver goes away. + * + * The following flags are maintained by the kstat framework: + * + * KSTAT_FLAG_DORMANT: + * + * For persistent kstats, indicates that the kstat is in the + * dormant state (e.g., the corresponding device is closed). + * + * KSTAT_FLAG_INVALID: + * + * This flag is set when a kstat is in a transitional state, + * e.g. between kstat_create() and kstat_install(). + * kstat clients must not attempt to access the kstat's data + * if this flag is set. + */ + +#define KSTAT_FLAG_VIRTUAL 0x01 +#define KSTAT_FLAG_VAR_SIZE 0x02 +#define KSTAT_FLAG_WRITABLE 0x04 +#define KSTAT_FLAG_PERSISTENT 0x08 +#define KSTAT_FLAG_DORMANT 0x10 +#define KSTAT_FLAG_INVALID 0x20 +#define KSTAT_FLAG_LONGSTRINGS 0x40 +#define KSTAT_FLAG_NO_HEADERS 0x80 + +/* + * Dynamic update support + * + * The kstat mechanism allows for an optional ks_update function to update + * kstat data. This is useful for drivers where the underlying device + * keeps cheap hardware stats, but extraction is expensive. Instead of + * constantly keeping the kstat data section up to date, you can supply a + * ks_update function which updates the kstat's data section on demand. + * To take advantage of this feature, simply set the ks_update field before + * calling kstat_install(). + * + * The ks_update function, if supplied, must have the following structure: + * + * int + * foo_kstat_update(kstat_t *ksp, int rw) + * { + * if (rw == KSTAT_WRITE) { + * ... update the native stats from ksp->ks_data; + * return EACCES if you don't support this + * } else { + * ... update ksp->ks_data from the native stats + * } + * } + * + * The ks_update return codes are: 0 for success, EACCES if you don't allow + * KSTAT_WRITE, and EIO for any other type of error. + * + * In general, the ks_update function may need to refer to provider-private + * data; for example, it may need a pointer to the provider's raw statistics. 
+ * The ks_private field is available for this purpose. Its use is entirely + * at the provider's discretion. + * + * All variable-size kstats MUST supply a ks_update routine, which computes + * and sets ks_data_size (and ks_ndata if that is meaningful), since these + * are needed to perform kstat snapshots (see below). + * + * No kstat locking should be done inside the ks_update routine. The caller + * will already be holding the kstat's ks_lock (to ensure consistent data). + */ + +#define KSTAT_READ 0 +#define KSTAT_WRITE 1 + +/* + * Kstat snapshot + * + * In order to get a consistent view of a kstat's data, clients must obey + * the kstat's locking strategy. However, these clients may need to perform + * operations on the data which could cause a fault (e.g. copyout()), or + * operations which are simply expensive. Doing so could cause deadlock + * (e.g. if you're holding a disk's kstat lock which is ultimately required + * to resolve a copyout() fault), performance degradation (since the providers' + * activity is serialized at the kstat lock), device timing problems, etc. + * + * To avoid these problems, kstat data is provided via snapshots. Taking + * a snapshot is a simple process: allocate a wired-down kernel buffer, + * acquire the kstat's data lock, copy the data into the buffer ("take the + * snapshot"), and release the lock. This ensures that the kstat's data lock + * will be held as briefly as possible, and that no faults will occur while + * the lock is held. + * + * Normally, the snapshot is taken by default_kstat_snapshot(), which + * timestamps the data (sets ks_snaptime), copies it, and does a little + * massaging to deal with incomplete transactions on i/o kstats. However, + * this routine only works for kstats with contiguous data (the typical case). + * If you create a kstat whose data is, say, a linked list, you must provide + * your own ks_snapshot routine. The routine you supply must have the + * following prototype (replace "foo" with something appropriate): + * + * int foo_kstat_snapshot(kstat_t *ksp, void *buf, int rw); + * + * The minimal snapshot routine -- one which copies contiguous data that + * doesn't need any massaging -- would be this: + * + * ksp->ks_snaptime = gethrtime(); + * if (rw == KSTAT_WRITE) + * bcopy(buf, ksp->ks_data, ksp->ks_data_size); + * else + * bcopy(ksp->ks_data, buf, ksp->ks_data_size); + * return (0); + * + * A more illuminating example is taking a snapshot of a linked list: + * + * ksp->ks_snaptime = gethrtime(); + * if (rw == KSTAT_WRITE) + * return (EACCES); ... See below ... + * for (foo = first_foo; foo; foo = foo->next) { + * bcopy((char *) foo, (char *) buf, sizeof (struct foo)); + * buf = ((struct foo *) buf) + 1; + * } + * return (0); + * + * In the example above, we have decided that we don't want to allow + * KSTAT_WRITE access, so we return EACCES if this is attempted. + * + * The key points are: + * + * (1) ks_snaptime must be set (via gethrtime()) to timestamp the data. + * (2) Data gets copied from the kstat to the buffer on KSTAT_READ, + * and from the buffer to the kstat on KSTAT_WRITE. + * (3) ks_snapshot return values are: 0 for success, EACCES if you + * don't allow KSTAT_WRITE, and EIO for any other type of error. + * + * Named kstats (see section on "Named statistics" below) containing long + * strings (KSTAT_DATA_STRING) need special handling. 
The kstat driver
+ * assumes that all strings are copied into the buffer after the array of
+ * named kstats, and the pointers (KSTAT_NAMED_STR_PTR()) are updated to point
+ * into the copy within the buffer. The default snapshot routine does this,
+ * but overriding routines should contain at least the following:
+ *
+ *	if (rw == KSTAT_READ) {
+ *		kstat_named_t *knp = buf;
+ *		char *end = (char *)(knp + ksp->ks_ndata);
+ *		uint_t i;
+ *
+ *		... Do the regular copy ...
+ *		bcopy(ksp->ks_data, buf, sizeof (kstat_named_t) * ksp->ks_ndata);
+ *
+ *		for (i = 0; i < ksp->ks_ndata; i++, knp++) {
+ *			if (knp->data_type == KSTAT_DATA_STRING &&
+ *			    KSTAT_NAMED_STR_PTR(knp) != NULL) {
+ *				bcopy(KSTAT_NAMED_STR_PTR(knp), end,
+ *				    KSTAT_NAMED_STR_BUFLEN(knp));
+ *				KSTAT_NAMED_STR_PTR(knp) = end;
+ *				end += KSTAT_NAMED_STR_BUFLEN(knp);
+ *			}
+ *		}
+ *	}
+ */
+
+/*
+ * Named statistics.
+ *
+ * List of arbitrary name=value statistics.
+ */
+
+typedef struct kstat_named {
+	char	name[KSTAT_STRLEN];	/* name of counter */
+	uchar_t	data_type;		/* data type */
+	union {
+		char		c[16];	/* enough for 128-bit ints */
+		int32_t		i32;
+		uint32_t	ui32;
+		struct {
+			union {
+				char		*ptr;	/* NULL-term string */
+#if defined(_KERNEL) && defined(_MULTI_DATAMODEL)
+				caddr32_t	ptr32;
+#endif
+				char		__pad[8]; /* 64-bit padding */
+			} addr;
+			uint32_t	len;	/* # bytes for strlen + '\0' */
+		} str;
+/*
+ * The int64_t and uint64_t types are not valid for a maximally conformant
+ * 32-bit compilation environment (cc -Xc) using compilers prior to the
+ * introduction of a C99 conforming compiler (reference ISO/IEC 9899:1990).
+ * In these cases, the visibility of i64 and ui64 is only permitted for
+ * 64-bit compilation environments or 32-bit non-maximally conformant
+ * C89 or C90 ANSI C compilation environments (cc -Xt and cc -Xa). In the
+ * C99 ANSI C compilation environment, the long long type is supported.
+ * The _INT64_TYPE is defined by the implementation (see sys/int_types.h).
+ */
+#if defined(_INT64_TYPE)
+		int64_t		i64;
+		uint64_t	ui64;
+#endif
+		long		l;
+		ulong_t		ul;
+
+		/* These structure members are obsolete */
+
+		longlong_t	ll;
+		u_longlong_t	ull;
+		float		f;
+		double		d;
+	} value;			/* value of counter */
+} kstat_named_t;
+
+#define	KSTAT_DATA_CHAR		0
+#define	KSTAT_DATA_INT32	1
+#define	KSTAT_DATA_UINT32	2
+#define	KSTAT_DATA_INT64	3
+#define	KSTAT_DATA_UINT64	4
+
+#if !defined(_LP64)
+#define	KSTAT_DATA_LONG		KSTAT_DATA_INT32
+#define	KSTAT_DATA_ULONG	KSTAT_DATA_UINT32
+#else
+#if !defined(_KERNEL)
+#define	KSTAT_DATA_LONG		KSTAT_DATA_INT64
+#define	KSTAT_DATA_ULONG	KSTAT_DATA_UINT64
+#else
+#define	KSTAT_DATA_LONG		7	/* only visible to the kernel */
+#define	KSTAT_DATA_ULONG	8	/* only visible to the kernel */
+#endif	/* !_KERNEL */
+#endif	/* !_LP64 */
+
+/*
+ * Statistics exporting named kstats with long strings (KSTAT_DATA_STRING)
+ * may not make the assumption that ks_data_size is equal to (ks_ndata * sizeof
+ * (kstat_named_t)). ks_data_size in these cases is equal to the sum of the
+ * amount of space required to store the strings (i.e., the sum of
+ * KSTAT_NAMED_STR_BUFLEN() for all KSTAT_DATA_STRING statistics) plus the
+ * space required to store the kstat_named_t's.
+ *
+ * The default update routine will update ks_data_size automatically for
+ * variable-length kstats containing long strings (using the default update
+ * routine only makes sense if the string is the only thing that is changing
+ * in size, and ks_ndata is constant).
Fixed-length kstats containing long + * strings must explicitly change ks_data_size (after creation but before + * initialization) to reflect the correct amount of space required for the + * long strings and the kstat_named_t's. + */ +#define KSTAT_DATA_STRING 9 + +/* These types are obsolete */ + +#define KSTAT_DATA_LONGLONG KSTAT_DATA_INT64 +#define KSTAT_DATA_ULONGLONG KSTAT_DATA_UINT64 +#define KSTAT_DATA_FLOAT 5 +#define KSTAT_DATA_DOUBLE 6 + +#define KSTAT_NAMED_PTR(kptr) ((kstat_named_t *)(kptr)->ks_data) + +/* + * Retrieve the pointer of the string contained in the given named kstat. + */ +#define KSTAT_NAMED_STR_PTR(knptr) ((knptr)->value.str.addr.ptr) + +/* + * Retrieve the length of the buffer required to store the string in the given + * named kstat. + */ +#define KSTAT_NAMED_STR_BUFLEN(knptr) ((knptr)->value.str.len) + +/* + * Interrupt statistics. + * + * An interrupt is a hard interrupt (sourced from the hardware device + * itself), a soft interrupt (induced by the system via the use of + * some system interrupt source), a watchdog interrupt (induced by + * a periodic timer call), spurious (an interrupt entry point was + * entered but there was no interrupt condition to service), + * or multiple service (an interrupt condition was detected and + * serviced just prior to returning from any of the other types). + * + * Measurement of the spurious class of interrupts is useful for + * autovectored devices in order to pinpoint any interrupt latency + * problems in a particular system configuration. + * + * Devices that have more than one interrupt of the same + * type should use multiple structures. + */ + +#define KSTAT_INTR_HARD 0 +#define KSTAT_INTR_SOFT 1 +#define KSTAT_INTR_WATCHDOG 2 +#define KSTAT_INTR_SPURIOUS 3 +#define KSTAT_INTR_MULTSVC 4 + +#define KSTAT_NUM_INTRS 5 + +typedef struct kstat_intr { + uint_t intrs[KSTAT_NUM_INTRS]; /* interrupt counters */ +} kstat_intr_t; + +#define KSTAT_INTR_PTR(kptr) ((kstat_intr_t *)(kptr)->ks_data) + +/* + * I/O statistics. + */ + +typedef struct kstat_io { + + /* + * Basic counters. + * + * The counters should be updated at the end of service + * (e.g., just prior to calling biodone()). + */ + + u_longlong_t nread; /* number of bytes read */ + u_longlong_t nwritten; /* number of bytes written */ + uint_t reads; /* number of read operations */ + uint_t writes; /* number of write operations */ + + /* + * Accumulated time and queue length statistics. + * + * Accumulated time statistics are kept as a running sum + * of "active" time. Queue length statistics are kept as a + * running sum of the product of queue length and elapsed time + * at that length -- i.e., a Riemann sum for queue length + * integrated against time. (You can also think of the active time + * as a Riemann sum, for the boolean function (queue_length > 0) + * integrated against time, or you can think of it as the + * Lebesgue measure of the set on which queue_length > 0.) + * + * ^ + * | _________ + * 8 | i4 | + * | | | + * Queue 6 | | + * Length | _________ | | + * 4 | i2 |_______| | + * | | i3 | + * 2_______| | + * | i1 | + * |_______________________________| + * Time-> t1 t2 t3 t4 + * + * At each change of state (entry or exit from the queue), + * we add the elapsed time (since the previous state change) + * to the active time if the queue length was non-zero during + * that interval; and we add the product of the elapsed time + * times the queue length to the running length*time sum. 
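+ *
+ * Consequently, over a sampling interval an observer can derive the usual
+ * iostat-style metrics from two snapshots of these sums (a sketch):
+ *
+ *	fraction busy    = delta(active time)      / delta(elapsed time)
+ *	avg queue length = delta(length*time sum)  / delta(elapsed time)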
+ * + * This method is generalizable to measuring residency + * in any defined system: instead of queue lengths, think + * of "outstanding RPC calls to server X". + * + * A large number of I/O subsystems have at least two basic + * "lists" of transactions they manage: one for transactions + * that have been accepted for processing but for which processing + * has yet to begin, and one for transactions which are actively + * being processed (but not done). For this reason, two cumulative + * time statistics are defined here: wait (pre-service) time, + * and run (service) time. + * + * All times are 64-bit nanoseconds (hrtime_t), as returned by + * gethrtime(). + * + * The units of cumulative busy time are accumulated nanoseconds. + * The units of cumulative length*time products are elapsed time + * times queue length. + * + * Updates to the fields below are performed implicitly by calls to + * these five functions: + * + * kstat_waitq_enter() + * kstat_waitq_exit() + * kstat_runq_enter() + * kstat_runq_exit() + * + * kstat_waitq_to_runq() (see below) + * kstat_runq_back_to_waitq() (see below) + * + * Since kstat_waitq_exit() is typically followed immediately + * by kstat_runq_enter(), there is a single kstat_waitq_to_runq() + * function which performs both operations. This is a performance + * win since only one timestamp is required. + * + * In some instances, it may be necessary to move a request from + * the run queue back to the wait queue, e.g. for write throttling. + * For these situations, call kstat_runq_back_to_waitq(). + * + * These fields should never be updated by any other means. + */ + + hrtime_t wtime; /* cumulative wait (pre-service) time */ + hrtime_t wlentime; /* cumulative wait length*time product */ + hrtime_t wlastupdate; /* last time wait queue changed */ + hrtime_t rtime; /* cumulative run (service) time */ + hrtime_t rlentime; /* cumulative run length*time product */ + hrtime_t rlastupdate; /* last time run queue changed */ + + uint_t wcnt; /* count of elements in wait state */ + uint_t rcnt; /* count of elements in run state */ + +} kstat_io_t; + +#define KSTAT_IO_PTR(kptr) ((kstat_io_t *)(kptr)->ks_data) + +/* + * Event timer statistics - cumulative elapsed time and number of events. + * + * Updates to these fields are performed implicitly by calls to + * kstat_timer_start() and kstat_timer_stop(). + */ + +typedef struct kstat_timer { + char name[KSTAT_STRLEN]; /* event name */ + uchar_t resv; /* reserved */ + u_longlong_t num_events; /* number of events */ + hrtime_t elapsed_time; /* cumulative elapsed time */ + hrtime_t min_time; /* shortest event duration */ + hrtime_t max_time; /* longest event duration */ + hrtime_t start_time; /* previous event start time */ + hrtime_t stop_time; /* previous event stop time */ +} kstat_timer_t; + +#define KSTAT_TIMER_PTR(kptr) ((kstat_timer_t *)(kptr)->ks_data) + +#if defined(_KERNEL) + +#include + +extern kid_t kstat_chain_id; /* bumped at each state change */ +extern void kstat_init(void); /* initialize kstat framework */ + +/* + * Adding and deleting kstats. + * + * The typical sequence to add a kstat is: + * + * ksp = kstat_create(module, instance, name, class, type, ndata, flags); + * if (ksp) { + * ... provider initialization, if necessary + * kstat_install(ksp); + * } + * + * There are three logically distinct steps here: + * + * Step 1: System Initialization (kstat_create) + * + * kstat_create() performs system initialization. 
kstat_create() + * allocates memory for the entire kstat (header plus data), initializes + * all header fields, initializes the data section to all zeroes, assigns + * a unique KID, and puts the kstat onto the system's kstat chain. + * The returned kstat is marked invalid (KSTAT_FLAG_INVALID is set), + * because the provider (caller) has not yet had a chance to initialize + * the data section. + * + * By default, kstats are exported to all zones on the system. A kstat may be + * created via kstat_create_zone() to specify a zone to which the statistics + * should be exported. kstat_zone_add() may be used to specify additional + * zones to which the statistics are to be exported. + * + * Step 2: Provider Initialization + * + * The provider performs any necessary initialization of the data section, + * e.g. setting the name fields in a KSTAT_TYPE_NAMED. Virtual kstats set + * the ks_data field at this time. The provider may also set the ks_update, + * ks_snapshot, ks_private, and ks_lock fields if necessary. + * + * Step 3: Installation (kstat_install) + * + * Once the kstat is completely initialized, kstat_install() clears the + * INVALID flag, thus making the kstat accessible to the outside world. + * kstat_install() also clears the DORMANT flag for persistent kstats. + * + * Removing a kstat from the system + * + * kstat_delete(ksp) removes ksp from the kstat chain and frees all + * associated system resources. NOTE: When you call kstat_delete(), + * you must NOT be holding that kstat's ks_lock. Otherwise, you may + * deadlock with a kstat reader. + * + * Persistent kstats + * + * From the provider's point of view, persistence is transparent. The only + * difference between ephemeral (normal) kstats and persistent kstats + * is that you pass KSTAT_FLAG_PERSISTENT to kstat_create(). Magically, + * this has the effect of making your data visible even when you're + * not home. Persistence is important to tools like iostat, which want + * to get a meaningful picture of disk activity. Without persistence, + * raw disk i/o statistics could never accumulate: they would come and + * go with each open/close of the raw device. + * + * The magic of persistence works by slightly altering the behavior of + * kstat_create() and kstat_delete(). The first call to kstat_create() + * creates a new kstat, as usual. However, kstat_delete() does not + * actually delete the kstat: it performs one final update of the data + * (i.e., calls the ks_update routine), marks the kstat as dormant, and + * sets the ks_lock, ks_update, ks_private, and ks_snapshot fields back + * to their default values (since they might otherwise point to garbage, + * e.g. if the provider is going away). kstat clients can still access + * the dormant kstat just like a live kstat; they just continue to see + * the final data values as long as the kstat remains dormant. + * All subsequent kstat_create() calls simply find the already-existing, + * dormant kstat and return a pointer to it, without altering any fields. + * The provider then performs its usual initialization sequence, and + * calls kstat_install(). kstat_install() uses the old data values to + * initialize the native data (i.e., ks_update is called with KSTAT_WRITE), + * thus making it seem like you were never gone. 
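To make the three steps concrete, a provider might register a two-entry named kstat as sketched here. The module, statistic, and my_* names are hypothetical; KSTAT_TYPE_NAMED and KSTAT_FLAG_VIRTUAL are assumed to be defined earlier in this header, and kstat_named_init() is declared below:

	static kstat_t *my_ksp;
	static kstat_named_t my_data[2];

	static void
	my_kstat_init(void)
	{
		/* Step 1: allocate the kstat and put it on the chain */
		my_ksp = kstat_create("mymod", 0, "mystats", "misc",
		    KSTAT_TYPE_NAMED, 2, KSTAT_FLAG_VIRTUAL);
		if (my_ksp == NULL)
			return;

		/* Step 2: provider initialization of the data section */
		my_ksp->ks_data = my_data;
		kstat_named_init(&my_data[0], "reads", KSTAT_DATA_UINT64);
		kstat_named_init(&my_data[1], "writes", KSTAT_DATA_UINT64);

		/* Step 3: clear KSTAT_FLAG_INVALID, making it visible */
		kstat_install(my_ksp);
	}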
+ */ + +extern kstat_t *kstat_create(const char *, int, const char *, const char *, + uchar_t, uint_t, uchar_t); +extern kstat_t *kstat_create_zone(const char *, int, const char *, + const char *, uchar_t, uint_t, uchar_t, zoneid_t); +extern void kstat_install(kstat_t *); +extern void kstat_delete(kstat_t *); +extern void kstat_named_setstr(kstat_named_t *knp, const char *src); +extern void kstat_set_string(char *, const char *); +extern void kstat_delete_byname(const char *, int, const char *); +extern void kstat_delete_byname_zone(const char *, int, const char *, zoneid_t); +extern void kstat_named_init(kstat_named_t *, const char *, uchar_t); +extern void kstat_timer_init(kstat_timer_t *, const char *); +extern void kstat_waitq_enter(kstat_io_t *); +extern void kstat_waitq_exit(kstat_io_t *); +extern void kstat_runq_enter(kstat_io_t *); +extern void kstat_runq_exit(kstat_io_t *); +extern void kstat_waitq_to_runq(kstat_io_t *); +extern void kstat_runq_back_to_waitq(kstat_io_t *); +extern void kstat_timer_start(kstat_timer_t *); +extern void kstat_timer_stop(kstat_timer_t *); + +extern void kstat_zone_add(kstat_t *, zoneid_t); +extern void kstat_zone_remove(kstat_t *, zoneid_t); +extern int kstat_zone_find(kstat_t *, zoneid_t); + +extern kstat_t *kstat_hold_bykid(kid_t kid, zoneid_t); +extern kstat_t *kstat_hold_byname(const char *, int, const char *, zoneid_t); +extern void kstat_rele(kstat_t *); + +#endif /* defined(_KERNEL) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_KSTAT_H */ diff --git a/lib/libspl/include/sys/list.h b/lib/libspl/include/sys/list.h new file mode 100644 index 000000000000..6db92ed42955 --- /dev/null +++ b/lib/libspl/include/sys/list.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef	_SYS_LIST_H
+#define	_SYS_LIST_H
+
+#include <sys/list_impl.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+typedef struct list_node list_node_t;
+typedef struct list list_t;
+
+void list_create(list_t *, size_t, size_t);
+void list_destroy(list_t *);
+
+void list_insert_after(list_t *, void *, void *);
+void list_insert_before(list_t *, void *, void *);
+void list_insert_head(list_t *, void *);
+void list_insert_tail(list_t *, void *);
+void list_remove(list_t *, void *);
+void *list_remove_head(list_t *);
+void *list_remove_tail(list_t *);
+void list_move_tail(list_t *, list_t *);
+
+void *list_head(list_t *);
+void *list_tail(list_t *);
+void *list_next(list_t *, void *);
+void *list_prev(list_t *, void *);
+int list_is_empty(list_t *);
+
+void list_link_init(list_node_t *);
+void list_link_replace(list_node_t *, list_node_t *);
+
+int list_link_active(list_node_t *);
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LIST_H */
diff --git a/lib/libspl/include/sys/list_impl.h b/lib/libspl/include/sys/list_impl.h
new file mode 100644
index 000000000000..b5655b972c11
--- /dev/null
+++ b/lib/libspl/include/sys/list_impl.h
@@ -0,0 +1,51 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2003 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef	_SYS_LIST_IMPL_H
+#define	_SYS_LIST_IMPL_H
+
+#include <sys/types.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+struct list_node {
+	struct list_node *next;
+	struct list_node *prev;
+};
+
+struct list {
+	size_t	list_size;
+	size_t	list_offset;
+	struct list_node list_head;
+};
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_LIST_IMPL_H */
diff --git a/lib/libspl/include/sys/mhd.h b/lib/libspl/include/sys/mhd.h
new file mode 100644
index 000000000000..fcc062d51c84
--- /dev/null
+++ b/lib/libspl/include/sys/mhd.h
@@ -0,0 +1,159 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
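A short usage sketch for the list interface defined above (the my_* names are illustrative, not part of the header): list_create() receives the containing structure's size and the byte offset of its embedded list_node_t, which is how the void * element pointers map back to their nodes.

	#include <stddef.h>	/* offsetof */

	typedef struct my_item {
		int		mi_value;
		list_node_t	mi_link;	/* embedded linkage */
	} my_item_t;

	static void
	example(void)
	{
		list_t l;
		my_item_t item = { .mi_value = 42 };

		list_create(&l, sizeof (my_item_t),
		    offsetof(my_item_t, mi_link));
		list_insert_tail(&l, &item);

		for (my_item_t *p = list_head(&l); p != NULL;
		    p = list_next(&l, p)) {
			/* visit p->mi_value */
		}

		list_remove(&l, &item);
		list_destroy(&l);
	}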
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_MHD_H +#define _SYS_MHD_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for multi-host device I/O control commands + */ +#define MHIOC ('M'<<8) +#define MHIOCENFAILFAST (MHIOC|1) +#define MHIOCTKOWN (MHIOC|2) +#define MHIOCRELEASE (MHIOC|3) +#define MHIOCSTATUS (MHIOC|4) +#define MHIOCGRP_INKEYS (MHIOC|5) +#define MHIOCGRP_INRESV (MHIOC|6) +#define MHIOCGRP_REGISTER (MHIOC|7) +#define MHIOCGRP_RESERVE (MHIOC|8) +#define MHIOCGRP_PREEMPTANDABORT (MHIOC|9) +#define MHIOCGRP_PREEMPT (MHIOC|10) +#define MHIOCGRP_CLEAR (MHIOC|11) +#define MHIOCGRP_REGISTERANDIGNOREKEY (MHIOC|14) +#define MHIOCQRESERVE (MHIOC|12) +#define MHIOCREREGISTERDEVID (MHIOC|13) + +/* + * Following is the structure to specify the delay parameters in + * milliseconds, via the MHIOCTKOWN ioctl. + */ +struct mhioctkown { + int reinstate_resv_delay; + int min_ownership_delay; + int max_ownership_delay; +}; + +#define MHIOC_RESV_KEY_SIZE 8 +typedef struct mhioc_resv_key { + uchar_t key[MHIOC_RESV_KEY_SIZE]; +} mhioc_resv_key_t; + +typedef struct mhioc_key_list { + uint32_t listsize; + uint32_t listlen; + mhioc_resv_key_t *list; +} mhioc_key_list_t; + +typedef struct mhioc_inkeys { + uint32_t generation; + mhioc_key_list_t *li; +} mhioc_inkeys_t; + +#if defined(_SYSCALL32) +struct mhioc_key_list32 { + uint32_t listsize; + uint32_t listlen; + caddr32_t list; +} mhioc_key_list32_t; + +struct mhioc_inkeys32 { + uint32_t generation; + caddr32_t li; +} mhioc_inkeys32_t; +#endif + +typedef struct mhioc_resv_desc { + mhioc_resv_key_t key; + uint8_t type; + uint8_t scope; + uint32_t scope_specific_addr; +} mhioc_resv_desc_t; + +typedef struct mhioc_resv_desc_list { + uint32_t listsize; + uint32_t listlen; + mhioc_resv_desc_t *list; +} mhioc_resv_desc_list_t; + +typedef struct mhioc_inresvs { + uint32_t generation; + mhioc_resv_desc_list_t *li; +} mhioc_inresvs_t; + +#if defined(_SYSCALL32) +struct mhioc_resv_desc_list32 { + uint32_t listsize; + uint32_t listlen; + caddr32_t list; +} mhioc_resv_desc_list32_t; + +typedef struct mhioc_inresvs32 { + uint32_t generation; + caddr32_t li; +} mhioc_inresvs32_t; +#endif + +typedef struct mhioc_register { + mhioc_resv_key_t oldkey; + mhioc_resv_key_t newkey; + boolean_t aptpl; /* True if persistent across power failures */ +} mhioc_register_t; + +typedef struct mhioc_preemptandabort { + mhioc_resv_desc_t resvdesc; + mhioc_resv_key_t victim_key; +} mhioc_preemptandabort_t; + +typedef struct mhioc_registerandignorekey { + mhioc_resv_key_t newkey; + boolean_t aptpl; /* True if persistent across power failures */ +} mhioc_registerandignorekey_t; + +/* + * SCSI-3 PGR Reservation Type Codes. Codes with the _OBSOLETE suffix + * have been removed from the SCSI3 PGR standard. 
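For orientation, a caller registers a persistent-reservation key with the structures and ioctls above roughly as follows; this is a hedged sketch only (the fd is assumed to be an open multi-host device, the key bytes are arbitrary, and the header location of ioctl() varies by platform):

	#include <string.h>
	#include <sys/ioctl.h>

	static int
	example_register_key(int fd)
	{
		mhioc_register_t reg;

		/* all-zero oldkey: we are not yet registered */
		(void) memset(&reg, 0, sizeof (reg));
		(void) memcpy(reg.newkey.key, "EXAMPLE1",
		    MHIOC_RESV_KEY_SIZE);
		reg.aptpl = B_TRUE;	/* persist across power failures */

		return (ioctl(fd, MHIOCGRP_REGISTER, &reg));
	}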
+ */ +#define SCSI3_RESV_READSHARED_OBSOLETE 0 +#define SCSI3_RESV_WRITEEXCLUSIVE 1 +#define SCSI3_RESV_READEXCLUSIVE_OBSOLETE 2 +#define SCSI3_RESV_EXCLUSIVEACCESS 3 +#define SCSI3_RESV_SHAREDACCESS_OBSOLETE 4 +#define SCSI3_RESV_WRITEEXCLUSIVEREGISTRANTSONLY 5 +#define SCSI3_RESV_EXCLUSIVEACCESSREGISTRANTSONLY 6 + +#define SCSI3_SCOPE_LOGICALUNIT 0 +#define SCSI3_SCOPE_EXTENT_OBSOLETE 1 +#define SCSI3_SCOPE_ELEMENT 2 + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_MHD_H */ diff --git a/lib/libspl/include/sys/mkdev.h b/lib/libspl/include/sys/mkdev.h new file mode 100644 index 000000000000..5978de65de50 --- /dev/null +++ b/lib/libspl/include/sys/mkdev.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_MKDEV_H +#define _LIBSPL_SYS_MKDEV_H + +#endif diff --git a/lib/libspl/include/sys/policy.h b/lib/libspl/include/sys/policy.h new file mode 100644 index 000000000000..2f695b35cd0b --- /dev/null +++ b/lib/libspl/include/sys/policy.h @@ -0,0 +1,26 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#ifndef _LIBSYS_SYS_POLICY_H +#define _LIBSYS_SYS_POLICY_H + +#endif /* _LIBSYS_SYS_POLICY_H */ diff --git a/lib/libspl/include/sys/poll.h b/lib/libspl/include/sys/poll.h new file mode 100644 index 000000000000..6ab0bddc0be5 --- /dev/null +++ b/lib/libspl/include/sys/poll.h @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2017 Zettabyte Software, LLC. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Compiling against musl correctly points out that including sys/poll.h is
+ * disallowed by the Single UNIX Specification when building in userspace. We
+ * implement a dummy header to redirect the include to the proper header.
+ * However, glibc, klibc and uclibc break this shim by including sys/poll.h
+ * from poll.h, so we add explicit exceptions for them.
+ */
+#ifndef _LIBSPL_SYS_POLL_H
+#define	_LIBSPL_SYS_POLL_H
+#if defined(__GLIBC__) || defined(__KLIBC__) || defined(__UCLIBC__)
+#include_next <sys/poll.h>
+#else
+#include <poll.h>
+#endif
+#endif /* _LIBSPL_SYS_POLL_H */
diff --git a/lib/libspl/include/sys/priv.h b/lib/libspl/include/sys/priv.h
new file mode 100644
index 000000000000..76c76d1830c2
--- /dev/null
+++ b/lib/libspl/include/sys/priv.h
@@ -0,0 +1,30 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBSPL_SYS_PRIV_H
+#define	_LIBSPL_SYS_PRIV_H
+
+#endif
diff --git a/lib/libspl/include/sys/processor.h b/lib/libspl/include/sys/processor.h
new file mode 100644
index 000000000000..78e95d01f1b5
--- /dev/null
+++ b/lib/libspl/include/sys/processor.h
@@ -0,0 +1,34 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBSPL_SYS_PROCESSOR_H
+#define	_LIBSPL_SYS_PROCESSOR_H
+
+#define	getcpuid() (-1)
+
+typedef int processorid_t;
+
+#endif
diff --git a/lib/libspl/include/sys/sha2.h b/lib/libspl/include/sys/sha2.h
new file mode 100644
index 000000000000..e2f66d225e25
--- /dev/null
+++ b/lib/libspl/include/sys/sha2.h
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/* Copyright 2013 Saso Kiselkov. All rights reserved. */
+
+#ifndef _SYS_SHA2_H
+#define	_SYS_SHA2_H
+
+#include <sys/types.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define	SHA2_HMAC_MIN_KEY_LEN	1	/* SHA2-HMAC min key length in bytes */
+#define	SHA2_HMAC_MAX_KEY_LEN	INT_MAX	/* SHA2-HMAC max key length in bytes */
+
+#define	SHA256_DIGEST_LENGTH	32	/* SHA256 digest length in bytes */
+#define	SHA384_DIGEST_LENGTH	48	/* SHA384 digest length in bytes */
+#define	SHA512_DIGEST_LENGTH	64	/* SHA512 digest length in bytes */
+
+/* Truncated versions of SHA-512 according to FIPS-180-4, section 5.3.6 */
+#define	SHA512_224_DIGEST_LENGTH	28	/* SHA512/224 digest length */
+#define	SHA512_256_DIGEST_LENGTH	32	/* SHA512/256 digest length */
+
+#define	SHA256_HMAC_BLOCK_SIZE	64	/* SHA256-HMAC block size */
+#define	SHA512_HMAC_BLOCK_SIZE	128	/* SHA512-HMAC block size */
+
+#define	SHA256			0
+#define	SHA256_HMAC		1
+#define	SHA256_HMAC_GEN		2
+#define	SHA384			3
+#define	SHA384_HMAC		4
+#define	SHA384_HMAC_GEN		5
+#define	SHA512			6
+#define	SHA512_HMAC		7
+#define	SHA512_HMAC_GEN		8
+#define	SHA512_224		9
+#define	SHA512_256		10
+
+/*
+ * SHA2 context.
+ * The contents of this structure are a private interface between the
+ * Init/Update/Final calls of the functions defined below.
+ * Callers must never attempt to read or write any of the fields
+ * in this structure directly.
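For reference, a one-shot digest computation with the context type and functions declared just below looks like this (a minimal sketch; the function name is illustrative):

	static void
	example_sha256(const void *buf, size_t len,
	    uint8_t digest[SHA256_DIGEST_LENGTH])
	{
		SHA256_CTX ctx;

		SHA256Init(&ctx);
		SHA256Update(&ctx, buf, len);	/* may be called repeatedly */
		SHA256Final(digest, &ctx);
	}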
+ */
+typedef struct {
+	uint32_t algotype;		/* Algorithm Type */
+
+	/* state (ABCDEFGH) */
+	union {
+		uint32_t s32[8];	/* for SHA256 */
+		uint64_t s64[8];	/* for SHA384/512 */
+	} state;
+	/* number of bits */
+	union {
+		uint32_t c32[2];	/* for SHA256, modulo 2^64 */
+		uint64_t c64[2];	/* for SHA384/512, modulo 2^128 */
+	} count;
+	union {
+		uint8_t buf8[128];	/* undigested input */
+		uint32_t buf32[32];	/* realigned input */
+		uint64_t buf64[16];	/* realigned input */
+	} buf_un;
+} SHA2_CTX;
+
+typedef SHA2_CTX SHA256_CTX;
+typedef SHA2_CTX SHA384_CTX;
+typedef SHA2_CTX SHA512_CTX;
+
+extern void SHA256Init(SHA256_CTX *);
+
+extern void SHA256Update(SHA256_CTX *, const void *, size_t);
+
+extern void SHA256Final(void *, SHA256_CTX *);
+
+extern void SHA384Init(SHA384_CTX *);
+
+extern void SHA384Update(SHA384_CTX *, const void *, size_t);
+
+extern void SHA384Final(void *, SHA384_CTX *);
+
+extern void SHA512Init(SHA512_CTX *);
+
+extern void SHA512Update(SHA512_CTX *, const void *, size_t);
+
+extern void SHA512Final(void *, SHA512_CTX *);
+
+extern void SHA2Init(uint64_t mech, SHA2_CTX *);
+
+extern void SHA2Update(SHA2_CTX *, const void *, size_t);
+
+extern void SHA2Final(void *, SHA2_CTX *);
+
+#ifdef _SHA2_IMPL
+/*
+ * The following types/functions are all private to the implementation
+ * of the SHA2 functions and must not be used by consumers of the interface.
+ */
+
+/*
+ * List of supported mechanisms in this module.
+ *
+ * It is important to note that in the module, division or modulus calculations
+ * are used on the enumerated type to determine which mechanism is being used;
+ * therefore, changing the order or adding mechanisms should be done
+ * carefully.
+ */
+typedef enum sha2_mech_type {
+	SHA256_MECH_INFO_TYPE,		/* SUN_CKM_SHA256 */
+	SHA256_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA256_HMAC */
+	SHA256_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA256_HMAC_GENERAL */
+	SHA384_MECH_INFO_TYPE,		/* SUN_CKM_SHA384 */
+	SHA384_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA384_HMAC */
+	SHA384_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA384_HMAC_GENERAL */
+	SHA512_MECH_INFO_TYPE,		/* SUN_CKM_SHA512 */
+	SHA512_HMAC_MECH_INFO_TYPE,	/* SUN_CKM_SHA512_HMAC */
+	SHA512_HMAC_GEN_MECH_INFO_TYPE,	/* SUN_CKM_SHA512_HMAC_GENERAL */
+	SHA512_224_MECH_INFO_TYPE,	/* SUN_CKM_SHA512_224 */
+	SHA512_256_MECH_INFO_TYPE	/* SUN_CKM_SHA512_256 */
+} sha2_mech_type_t;
+
+#endif /* _SHA2_IMPL */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_SHA2_H */
diff --git a/lib/libspl/include/sys/simd.h b/lib/libspl/include/sys/simd.h
new file mode 100644
index 000000000000..dceedb698fe0
--- /dev/null
+++ b/lib/libspl/include/sys/simd.h
@@ -0,0 +1,502 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBSPL_SYS_SIMD_H
+#define	_LIBSPL_SYS_SIMD_H
+
+#include <sys/isa_defs.h>
+#include <sys/types.h>
+
+#if defined(__x86)
+#include <cpuid.h>
+
+#define	kfpu_allowed()		1
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+#define	kfpu_init()		0
+#define	kfpu_fini()		((void) 0)
+
+/*
+ * CPUID feature tests for user-space.
+ *
+ * x86 registers used implicitly by CPUID
+ */
+typedef enum cpuid_regs {
+	EAX = 0,
+	EBX,
+	ECX,
+	EDX,
+	CPUID_REG_CNT = 4
+} cpuid_regs_t;
+
+/*
+ * List of instruction sets identified by CPUID
+ */
+typedef enum cpuid_inst_sets {
+	SSE = 0,
+	SSE2,
+	SSE3,
+	SSSE3,
+	SSE4_1,
+	SSE4_2,
+	OSXSAVE,
+	AVX,
+	AVX2,
+	BMI1,
+	BMI2,
+	AVX512F,
+	AVX512CD,
+	AVX512DQ,
+	AVX512BW,
+	AVX512IFMA,
+	AVX512VBMI,
+	AVX512PF,
+	AVX512ER,
+	AVX512VL,
+	AES,
+	PCLMULQDQ,
+	MOVBE
+} cpuid_inst_sets_t;
+
+/*
+ * Instruction set descriptor.
+ */
+typedef struct cpuid_feature_desc {
+	uint32_t leaf;		/* CPUID leaf */
+	uint32_t subleaf;	/* CPUID sub-leaf */
+	uint32_t flag;		/* bit mask of the feature */
+	cpuid_regs_t reg;	/* which CPUID return register to test */
+} cpuid_feature_desc_t;
+
+#define	_AVX512F_BIT		(1U << 16)
+#define	_AVX512CD_BIT		(_AVX512F_BIT | (1U << 28))
+#define	_AVX512DQ_BIT		(_AVX512F_BIT | (1U << 17))
+#define	_AVX512BW_BIT		(_AVX512F_BIT | (1U << 30))
+#define	_AVX512IFMA_BIT		(_AVX512F_BIT | (1U << 21))
+#define	_AVX512VBMI_BIT		(1U << 1)	/* AVX512F_BIT is on another leaf */
+#define	_AVX512PF_BIT		(_AVX512F_BIT | (1U << 26))
+#define	_AVX512ER_BIT		(_AVX512F_BIT | (1U << 27))
+#define	_AVX512VL_BIT		(1U << 31)	/* if used also check other levels */
+#define	_AES_BIT		(1U << 25)
+#define	_PCLMULQDQ_BIT		(1U << 1)
+#define	_MOVBE_BIT		(1U << 22)
+
+/*
+ * Descriptions of supported instruction sets
+ */
+static const cpuid_feature_desc_t cpuid_features[] = {
+	[SSE]		= {1U, 0U,	1U << 25,	EDX	},
+	[SSE2]		= {1U, 0U,	1U << 26,	EDX	},
+	[SSE3]		= {1U, 0U,	1U << 0,	ECX	},
+	[SSSE3]		= {1U, 0U,	1U << 9,	ECX	},
+	[SSE4_1]	= {1U, 0U,	1U << 19,	ECX	},
+	[SSE4_2]	= {1U, 0U,	1U << 20,	ECX	},
+	[OSXSAVE]	= {1U, 0U,	1U << 27,	ECX	},
+	[AVX]		= {1U, 0U,	1U << 28,	ECX	},
+	[AVX2]		= {7U, 0U,	1U << 5,	EBX	},
+	[BMI1]		= {7U, 0U,	1U << 3,	EBX	},
+	[BMI2]		= {7U, 0U,	1U << 8,	EBX	},
+	[AVX512F]	= {7U, 0U,	_AVX512F_BIT,	EBX	},
+	[AVX512CD]	= {7U, 0U,	_AVX512CD_BIT,	EBX	},
+	[AVX512DQ]	= {7U, 0U,	_AVX512DQ_BIT,	EBX	},
+	[AVX512BW]	= {7U, 0U,	_AVX512BW_BIT,	EBX	},
+	[AVX512IFMA]	= {7U, 0U,	_AVX512IFMA_BIT, EBX	},
+	[AVX512VBMI]	= {7U, 0U,	_AVX512VBMI_BIT, ECX	},
+	[AVX512PF]	= {7U, 0U,	_AVX512PF_BIT,	EBX	},
+	[AVX512ER]	= {7U, 0U,	_AVX512ER_BIT,	EBX	},
+	[AVX512VL]	= {7U, 0U,	_AVX512ER_BIT,	EBX	},
+	[AES]		= {1U, 0U,	_AES_BIT,	ECX	},
+	[PCLMULQDQ]	= {1U, 0U,	_PCLMULQDQ_BIT,	ECX	},
+	[MOVBE]		= {1U, 0U,	_MOVBE_BIT,	ECX	},
+};
+
+/*
+ * Check if OS supports AVX and AVX2 by checking XCR0
+ * Only call this function if CPUID indicates that AVX feature is
+ * supported by the CPU, otherwise it might be an illegal instruction.
+ */ +static inline uint64_t +xgetbv(uint32_t index) +{ + uint32_t eax, edx; + /* xgetbv - instruction byte code */ + __asm__ __volatile__(".byte 0x0f; .byte 0x01; .byte 0xd0" + : "=a" (eax), "=d" (edx) + : "c" (index)); + + return ((((uint64_t)edx)<<32) | (uint64_t)eax); +} + +/* + * Check if CPU supports a feature + */ +static inline boolean_t +__cpuid_check_feature(const cpuid_feature_desc_t *desc) +{ + uint32_t r[CPUID_REG_CNT]; + + if (__get_cpuid_max(0, NULL) >= desc->leaf) { + /* + * __cpuid_count is needed to properly check + * for AVX2. It is a macro, so return parameters + * are passed by value. + */ + __cpuid_count(desc->leaf, desc->subleaf, + r[EAX], r[EBX], r[ECX], r[EDX]); + return ((r[desc->reg] & desc->flag) == desc->flag); + } + return (B_FALSE); +} + +#define CPUID_FEATURE_CHECK(name, id) \ +static inline boolean_t \ +__cpuid_has_ ## name(void) \ +{ \ + return (__cpuid_check_feature(&cpuid_features[id])); \ +} + +/* + * Define functions for user-space CPUID features testing + */ +CPUID_FEATURE_CHECK(sse, SSE); +CPUID_FEATURE_CHECK(sse2, SSE2); +CPUID_FEATURE_CHECK(sse3, SSE3); +CPUID_FEATURE_CHECK(ssse3, SSSE3); +CPUID_FEATURE_CHECK(sse4_1, SSE4_1); +CPUID_FEATURE_CHECK(sse4_2, SSE4_2); +CPUID_FEATURE_CHECK(avx, AVX); +CPUID_FEATURE_CHECK(avx2, AVX2); +CPUID_FEATURE_CHECK(osxsave, OSXSAVE); +CPUID_FEATURE_CHECK(bmi1, BMI1); +CPUID_FEATURE_CHECK(bmi2, BMI2); +CPUID_FEATURE_CHECK(avx512f, AVX512F); +CPUID_FEATURE_CHECK(avx512cd, AVX512CD); +CPUID_FEATURE_CHECK(avx512dq, AVX512DQ); +CPUID_FEATURE_CHECK(avx512bw, AVX512BW); +CPUID_FEATURE_CHECK(avx512ifma, AVX512IFMA); +CPUID_FEATURE_CHECK(avx512vbmi, AVX512VBMI); +CPUID_FEATURE_CHECK(avx512pf, AVX512PF); +CPUID_FEATURE_CHECK(avx512er, AVX512ER); +CPUID_FEATURE_CHECK(avx512vl, AVX512VL); +CPUID_FEATURE_CHECK(aes, AES); +CPUID_FEATURE_CHECK(pclmulqdq, PCLMULQDQ); +CPUID_FEATURE_CHECK(movbe, MOVBE); + +/* + * Detect register set support + */ +static inline boolean_t +__simd_state_enabled(const uint64_t state) +{ + boolean_t has_osxsave; + uint64_t xcr0; + + has_osxsave = __cpuid_has_osxsave(); + if (!has_osxsave) + return (B_FALSE); + + xcr0 = xgetbv(0); + return ((xcr0 & state) == state); +} + +#define _XSTATE_SSE_AVX (0x2 | 0x4) +#define _XSTATE_AVX512 (0xE0 | _XSTATE_SSE_AVX) + +#define __ymm_enabled() __simd_state_enabled(_XSTATE_SSE_AVX) +#define __zmm_enabled() __simd_state_enabled(_XSTATE_AVX512) + +/* + * Check if SSE instruction set is available + */ +static inline boolean_t +zfs_sse_available(void) +{ + return (__cpuid_has_sse()); +} + +/* + * Check if SSE2 instruction set is available + */ +static inline boolean_t +zfs_sse2_available(void) +{ + return (__cpuid_has_sse2()); +} + +/* + * Check if SSE3 instruction set is available + */ +static inline boolean_t +zfs_sse3_available(void) +{ + return (__cpuid_has_sse3()); +} + +/* + * Check if SSSE3 instruction set is available + */ +static inline boolean_t +zfs_ssse3_available(void) +{ + return (__cpuid_has_ssse3()); +} + +/* + * Check if SSE4.1 instruction set is available + */ +static inline boolean_t +zfs_sse4_1_available(void) +{ + return (__cpuid_has_sse4_1()); +} + +/* + * Check if SSE4.2 instruction set is available + */ +static inline boolean_t +zfs_sse4_2_available(void) +{ + return (__cpuid_has_sse4_2()); +} + +/* + * Check if AVX instruction set is available + */ +static inline boolean_t +zfs_avx_available(void) +{ + return (__cpuid_has_avx() && __ymm_enabled()); +} + +/* + * Check if AVX2 instruction set is available + */ +static inline boolean_t 
+zfs_avx2_available(void)
+{
+	return (__cpuid_has_avx2() && __ymm_enabled());
+}
+
+/*
+ * Check if BMI1 instruction set is available
+ */
+static inline boolean_t
+zfs_bmi1_available(void)
+{
+	return (__cpuid_has_bmi1());
+}
+
+/*
+ * Check if BMI2 instruction set is available
+ */
+static inline boolean_t
+zfs_bmi2_available(void)
+{
+	return (__cpuid_has_bmi2());
+}
+
+/*
+ * Check if AES instruction set is available
+ */
+static inline boolean_t
+zfs_aes_available(void)
+{
+	return (__cpuid_has_aes());
+}
+
+/*
+ * Check if PCLMULQDQ instruction set is available
+ */
+static inline boolean_t
+zfs_pclmulqdq_available(void)
+{
+	return (__cpuid_has_pclmulqdq());
+}
+
+/*
+ * Check if MOVBE instruction is available
+ */
+static inline boolean_t
+zfs_movbe_available(void)
+{
+	return (__cpuid_has_movbe());
+}
+
+/*
+ * AVX-512 family of instruction sets:
+ *
+ * AVX512F	Foundation
+ * AVX512CD	Conflict Detection Instructions
+ * AVX512ER	Exponential and Reciprocal Instructions
+ * AVX512PF	Prefetch Instructions
+ *
+ * AVX512BW	Byte and Word Instructions
+ * AVX512DQ	Double-word and Quadword Instructions
+ * AVX512VL	Vector Length Extensions
+ *
+ * AVX512IFMA	Integer Fused Multiply Add (Not supported by kernel 4.4)
+ * AVX512VBMI	Vector Byte Manipulation Instructions
+ */
+
+/*
+ * Check if AVX512F instruction set is available
+ */
+static inline boolean_t
+zfs_avx512f_available(void)
+{
+	return (__cpuid_has_avx512f() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512CD instruction set is available
+ */
+static inline boolean_t
+zfs_avx512cd_available(void)
+{
+	return (__cpuid_has_avx512cd() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512ER instruction set is available
+ */
+static inline boolean_t
+zfs_avx512er_available(void)
+{
+	return (__cpuid_has_avx512er() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512PF instruction set is available
+ */
+static inline boolean_t
+zfs_avx512pf_available(void)
+{
+	return (__cpuid_has_avx512pf() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512BW instruction set is available
+ */
+static inline boolean_t
+zfs_avx512bw_available(void)
+{
+	return (__cpuid_has_avx512bw() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512DQ instruction set is available
+ */
+static inline boolean_t
+zfs_avx512dq_available(void)
+{
+	return (__cpuid_has_avx512dq() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512VL instruction set is available
+ */
+static inline boolean_t
+zfs_avx512vl_available(void)
+{
+	return (__cpuid_has_avx512vl() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512IFMA instruction set is available
+ */
+static inline boolean_t
+zfs_avx512ifma_available(void)
+{
+	return (__cpuid_has_avx512ifma() && __zmm_enabled());
+}
+
+/*
+ * Check if AVX512VBMI instruction set is available
+ */
+static inline boolean_t
+zfs_avx512vbmi_available(void)
+{
+	return (__cpuid_has_avx512f() && __cpuid_has_avx512vbmi() &&
+	    __zmm_enabled());
+}
+
+#elif defined(__aarch64__)
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+#elif defined(__powerpc__)
+
+#define	kfpu_allowed()		1
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+/*
+ * Check if AltiVec instruction set is available
+ * No easy way beyond 'altivec works' :-(
+ */
+#include <setjmp.h>
+#include <signal.h>
+
+#if defined(__ALTIVEC__) && !defined(__FreeBSD__)
+static jmp_buf env;
+static void sigillhandler(int x)
+{
+	longjmp(env, 1);
+}
+#endif
+
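The zfs_*_available() predicates above exist so that callers can pick an implementation at run time rather than at compile time. A minimal sketch of that dispatch pattern, with hypothetical function names that are not part of this header:

	typedef void (*checksum_impl_t)(const void *, size_t);

	/* hypothetical scalar and vectorized variants */
	extern void checksum_scalar(const void *, size_t);
	extern void checksum_sse2(const void *, size_t);
	extern void checksum_avx2(const void *, size_t);

	static checksum_impl_t
	checksum_select(void)
	{
		/* prefer the widest unit the CPU and OS both support */
		if (zfs_avx2_available())
			return (checksum_avx2);
		if (zfs_sse2_available())
			return (checksum_sse2);
		return (checksum_scalar);
	}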
+static inline boolean_t
+zfs_altivec_available(void)
+{
+	boolean_t has_altivec = B_FALSE;
+#if defined(__ALTIVEC__) && !defined(__FreeBSD__)
+	sighandler_t savesig;
+	savesig = signal(SIGILL, sigillhandler);
+	if (setjmp(env)) {
+		signal(SIGILL, savesig);
+		has_altivec = B_FALSE;
+	} else {
+		__asm__ __volatile__("vor 0,0,0\n" : : : "v0");
+		signal(SIGILL, savesig);
+		has_altivec = B_TRUE;
+	}
+#endif
+	return (has_altivec);
+}
+#else
+
+#define	kfpu_allowed()		0
+#define	kfpu_initialize(tsk)	do {} while (0)
+#define	kfpu_begin()		do {} while (0)
+#define	kfpu_end()		do {} while (0)
+
+#endif
+
+#endif /* _LIBSPL_SYS_SIMD_H */
diff --git a/lib/libspl/include/sys/stack.h b/lib/libspl/include/sys/stack.h
new file mode 100644
index 000000000000..59807e97b6a0
--- /dev/null
+++ b/lib/libspl/include/sys/stack.h
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * This header file distributed under the terms of the CDDL.
+ * Portions Copyright 2008 Sun Microsystems, Inc. All Rights reserved.
+ */
+#ifndef _SYS_STACK_H
+#define	_SYS_STACK_H
+
+#include <pthread.h>
+
+#define	STACK_BIAS	0
+
+#ifdef __USE_GNU
+
+static inline int
+stack_getbounds(stack_t *sp)
+{
+	pthread_attr_t attr;
+	int rc;
+
+	rc = pthread_getattr_np(pthread_self(), &attr);
+	if (rc)
+		return (rc);
+
+	rc = pthread_attr_getstack(&attr, &sp->ss_sp, &sp->ss_size);
+	if (rc == 0)
+		sp->ss_flags = 0;
+
+	pthread_attr_destroy(&attr);
+
+	return (rc);
+}
+
+static inline int
+thr_stksegment(stack_t *sp)
+{
+	int rc;
+
+	rc = stack_getbounds(sp);
+	if (rc)
+		return (rc);
+
+	/*
+	 * thr_stksegment() is expected to set sp.ss_sp to the high stack
+	 * address, but the stack_getbounds() interface is expected to
+	 * set sp.ss_sp to the low address. Adjust accordingly.
+	 */
+	sp->ss_sp = (void *)(((uintptr_t)sp->ss_sp) + sp->ss_size);
+	sp->ss_flags = 0;
+
+	return (rc);
+}
+
+#endif /* __USE_GNU */
+#endif /* _SYS_STACK_H */
diff --git a/lib/libspl/include/sys/stdtypes.h b/lib/libspl/include/sys/stdtypes.h
new file mode 100644
index 000000000000..c26e2dc96c4e
--- /dev/null
+++ b/lib/libspl/include/sys/stdtypes.h
@@ -0,0 +1,54 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#ifndef __SYS_STDTYPES_H
+#define	__SYS_STDTYPES_H
+
+typedef enum {
+	B_FALSE = 0,
+	B_TRUE = 1
+} boolean_t;
+
+typedef unsigned char		uchar_t;
+typedef unsigned short		ushort_t;
+typedef unsigned int		uint_t;
+typedef unsigned long		ulong_t;
+typedef unsigned long long	u_longlong_t;
+typedef long long		longlong_t;
+
+typedef longlong_t		offset_t;
+typedef u_longlong_t		u_offset_t;
+typedef u_longlong_t		len_t;
+typedef longlong_t		diskaddr_t;
+
+typedef ulong_t			pgcnt_t;	/* number of pages */
+typedef long			spgcnt_t;	/* signed number of pages */
+
+typedef short			pri_t;
+typedef ushort_t		o_mode_t;	/* old file attribute type */
+
+typedef int			major_t;
+typedef int			minor_t;
+
+typedef short			index_t;
+
+#endif /* __SYS_STDTYPES_H */
diff --git a/lib/libspl/include/sys/strings.h b/lib/libspl/include/sys/strings.h
new file mode 100644
index 000000000000..c142047dcdb8
--- /dev/null
+++ b/lib/libspl/include/sys/strings.h
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBSPL_SYS_STRINGS_H
+#define	_LIBSPL_SYS_STRINGS_H
+
+#include <string.h>
+#include <strings.h>
+
+#endif
diff --git a/lib/libspl/include/sys/stropts.h b/lib/libspl/include/sys/stropts.h
new file mode 100644
index 000000000000..08c2e79bc53c
--- /dev/null
+++ b/lib/libspl/include/sys/stropts.h
@@ -0,0 +1,29 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */ + +#ifndef _LIBSPL_SYS_STROPTS_H +#define _LIBSPL_SYS_STROPTS_H + +#endif /* _LIBSPL_SYS_STROPTS_H */ diff --git a/lib/libspl/include/sys/sunddi.h b/lib/libspl/include/sys/sunddi.h new file mode 100644 index 000000000000..ccd2b29b9b09 --- /dev/null +++ b/lib/libspl/include/sys/sunddi.h @@ -0,0 +1,29 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008 by Sun Microsystems, Inc. + */ + +#ifndef _SYS_SUNDDI_H +#define _SYS_SUNDDI_H + +#endif /* _SYS_SUNDDI_H */ diff --git a/lib/libspl/include/sys/systeminfo.h b/lib/libspl/include/sys/systeminfo.h new file mode 100644 index 000000000000..cc6c1793c00e --- /dev/null +++ b/lib/libspl/include/sys/systeminfo.h @@ -0,0 +1,38 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_SYSTEMINFO_H +#define _LIBSPL_SYS_SYSTEMINFO_H + +#define HOSTID_MASK 0xFFFFFFFF +#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ + +unsigned long get_system_hostid(void); + +#endif diff --git a/lib/libspl/include/sys/time.h b/lib/libspl/include/sys/time.h new file mode 100644 index 000000000000..c9f6165047d2 --- /dev/null +++ b/lib/libspl/include/sys/time.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _LIBSPL_SYS_TIME_H
+#define	_LIBSPL_SYS_TIME_H
+
+#include <time.h>
+#include <sys/types.h>
+#include_next <sys/time.h>
+
+#ifndef SEC
+#define	SEC	1
+#endif
+
+#ifndef MILLISEC
+#define	MILLISEC	1000
+#endif
+
+#ifndef MICROSEC
+#define	MICROSEC	1000000
+#endif
+
+#ifndef NANOSEC
+#define	NANOSEC	1000000000
+#endif
+
+#ifndef NSEC_PER_USEC
+#define	NSEC_PER_USEC	1000L
+#endif
+
+#ifndef MSEC2NSEC
+#define	MSEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MILLISEC))
+#endif
+
+#ifndef NSEC2MSEC
+#define	NSEC2MSEC(n)	((n) / (NANOSEC / MILLISEC))
+#endif
+
+#ifndef USEC2NSEC
+#define	USEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / MICROSEC))
+#endif
+
+#ifndef NSEC2USEC
+#define	NSEC2USEC(n)	((n) / (NANOSEC / MICROSEC))
+#endif
+
+#ifndef NSEC2SEC
+#define	NSEC2SEC(n)	((n) / (NANOSEC / SEC))
+#endif
+
+#ifndef SEC2NSEC
+#define	SEC2NSEC(m)	((hrtime_t)(m) * (NANOSEC / SEC))
+#endif
+
+typedef long long		hrtime_t;
+typedef struct timespec		timespec_t;
+typedef struct timespec		inode_timespec_t;
+
+static inline void
+gethrestime(inode_timespec_t *ts)
+{
+	struct timeval tv;
+	(void) gettimeofday(&tv, NULL);
+	ts->tv_sec = tv.tv_sec;
+	ts->tv_nsec = tv.tv_usec * NSEC_PER_USEC;
+}
+
+static inline uint64_t
+gethrestime_sec(void)
+{
+	struct timeval tv;
+	(void) gettimeofday(&tv, NULL);
+	return (tv.tv_sec);
+}
+
+static inline hrtime_t
+gethrtime(void)
+{
+	struct timespec ts;
+	(void) clock_gettime(CLOCK_MONOTONIC, &ts);
+	return ((((u_int64_t)ts.tv_sec) * NANOSEC) + ts.tv_nsec);
+}
+
+#endif /* _LIBSPL_SYS_TIME_H */
diff --git a/lib/libspl/include/sys/trace_spl.h b/lib/libspl/include/sys/trace_spl.h
new file mode 100644
index 000000000000..b80d288f7332
--- /dev/null
+++ b/lib/libspl/include/sys/trace_spl.h
@@ -0,0 +1,24 @@
+/* Here to keep the libspl build happy */
+
+#ifndef _LIBSPL_SPL_TRACE_H
+#define	_LIBSPL_SPL_TRACE_H
+
+/*
+ * The set-error SDT probe is extra static, in that we declare its fake
+ * function literally, rather than with the DTRACE_PROBE1() macro. This is
+ * necessary so that SET_ERROR() can evaluate to a value, which wouldn't
+ * be possible if it required multiple statements (to declare the function
+ * and then call it).
+ *
+ * SET_ERROR() uses the comma operator so that it can be used without much
+ * additional code. For example, "return (EINVAL);" becomes
+ * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated
+ * twice, so it should not have side effects (e.g. something like:
+ * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice).
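Concretely, with the macro defined just below, a call site such as

	return (SET_ERROR(EINVAL));

expands to

	return ((__set_error(__FILE__, __func__, __LINE__, EINVAL), EINVAL));

so the probe hook fires first, and the comma expression as a whole still evaluates to EINVAL.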
+ */ +#undef SET_ERROR +#define SET_ERROR(err) \ + (__set_error(__FILE__, __func__, __LINE__, err), err) + + +#endif diff --git a/lib/libspl/include/sys/trace_zfs.h b/lib/libspl/include/sys/trace_zfs.h new file mode 100644 index 000000000000..87ed5ad3c3be --- /dev/null +++ b/lib/libspl/include/sys/trace_zfs.h @@ -0,0 +1,24 @@ +/* Here to keep the libspl build happy */ + +#ifndef _LIBSPL_ZFS_TRACE_H +#define _LIBSPL_ZFS_TRACE_H + +/* + * The set-error SDT probe is extra static, in that we declare its fake + * function literally, rather than with the DTRACE_PROBE1() macro. This is + * necessary so that SET_ERROR() can evaluate to a value, which wouldn't + * be possible if it required multiple statements (to declare the function + * and then call it). + * + * SET_ERROR() uses the comma operator so that it can be used without much + * additional code. For example, "return (EINVAL);" becomes + * "return (SET_ERROR(EINVAL));". Note that the argument will be evaluated + * twice, so it should not have side effects (e.g. something like: + * "return (SET_ERROR(log_error(EINVAL, info)));" would log the error twice). + */ +#undef SET_ERROR +#define SET_ERROR(err) \ + (__set_error(__FILE__, __func__, __LINE__, err), err) + + +#endif diff --git a/lib/libspl/include/sys/types.h b/lib/libspl/include/sys/types.h new file mode 100644 index 000000000000..ea02ffac93ac --- /dev/null +++ b/lib/libspl/include/sys/types.h @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_TYPES_H +#define _LIBSPL_SYS_TYPES_H + +#if defined(HAVE_MAKEDEV_IN_SYSMACROS) +#include +#elif defined(HAVE_MAKEDEV_IN_MKDEV) +#include +#endif + +#include +#include +#include_next +#include +#include +#include + +#ifndef HAVE_INTTYPES +#include +#endif /* HAVE_INTTYPES */ + +typedef int zoneid_t; +typedef int projid_t; + +/* + * Definitions remaining from previous partial support for 64-bit file + * offsets. This partial support for devices greater than 2gb requires + * compiler support for long long. 
+ */ +#ifdef _LONG_LONG_LTOH +typedef union { + offset_t _f; /* Full 64 bit offset value */ + struct { + int32_t _l; /* lower 32 bits of offset value */ + int32_t _u; /* upper 32 bits of offset value */ + } _p; +} lloff_t; +#endif + +#ifdef _LONG_LONG_HTOL +typedef union { + offset_t _f; /* Full 64 bit offset value */ + struct { + int32_t _u; /* upper 32 bits of offset value */ + int32_t _l; /* lower 32 bits of offset value */ + } _p; +} lloff_t; +#endif + +#include /* for NBBY */ + +#endif diff --git a/lib/libspl/include/sys/types32.h b/lib/libspl/include/sys/types32.h new file mode 100644 index 000000000000..bb41aa34bee0 --- /dev/null +++ b/lib/libspl/include/sys/types32.h @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_TYPES32_H +#define _SYS_TYPES32_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Interoperability types for programs. Used for: + * + * Crossing between 32-bit and 64-bit domains. + * + * On disk data formats such as filesystem meta data + * and disk label. + * + * Note: Applications should never include this + * header file. + */ +typedef uint32_t caddr32_t; +typedef int32_t daddr32_t; +typedef int32_t off32_t; +typedef uint32_t ino32_t; +typedef int32_t blkcnt32_t; +typedef uint32_t fsblkcnt32_t; +typedef uint32_t fsfilcnt32_t; +typedef int32_t id32_t; +typedef uint32_t major32_t; +typedef uint32_t minor32_t; +typedef int32_t key32_t; +typedef uint32_t mode32_t; +typedef uint32_t uid32_t; +typedef uint32_t gid32_t; +typedef uint32_t nlink32_t; +typedef uint32_t dev32_t; +typedef int32_t pid32_t; +typedef uint32_t size32_t; +typedef int32_t ssize32_t; +typedef int32_t time32_t; +typedef int32_t clock32_t; + +struct timeval32 { + time32_t tv_sec; /* seconds */ + int32_t tv_usec; /* and microseconds */ +}; + +typedef struct timespec32 { + time32_t tv_sec; /* seconds */ + int32_t tv_nsec; /* and nanoseconds */ +} timespec32_t; + +typedef struct timespec32 timestruc32_t; + +typedef struct itimerspec32 { + struct timespec32 it_interval; + struct timespec32 it_value; +} itimerspec32_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_TYPES32_H */ diff --git a/lib/libspl/include/sys/tzfile.h b/lib/libspl/include/sys/tzfile.h new file mode 100644 index 000000000000..e30e7566366e --- /dev/null +++ b/lib/libspl/include/sys/tzfile.h @@ -0,0 +1,164 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * from Arthur Olson's 6.1 + */ + +#ifndef _LIBSPL_SYS_TZFILE_H +#define _LIBSPL_SYS_TZFILE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Information about time zone files. + */ + +#define TZDIR "/usr/share/lib/zoneinfo" /* Time zone object file directory */ + +#define TZDEFAULT (getenv("TZ")) + +#define TZDEFRULES "posixrules" + +/* + * Each file begins with. . . + */ + +struct tzhead { + char tzh_reserved[24]; /* reserved for future use */ + char tzh_ttisstdcnt[4]; /* coded number of trans. time flags */ + char tzh_leapcnt[4]; /* coded number of leap seconds */ + char tzh_timecnt[4]; /* coded number of transition times */ + char tzh_typecnt[4]; /* coded number of local time types */ + char tzh_charcnt[4]; /* coded number of abbr. chars */ +}; + +/* + * . . .followed by. . . + * + * tzh_timecnt (char [4])s coded transition times a la time(2) + * tzh_timecnt (unsigned char)s types of local time starting at above + * tzh_typecnt repetitions of + * one (char [4]) coded GMT offset in seconds + * one (unsigned char) used to set tm_isdst + * one (unsigned char) that's an abbreviation list index + * tzh_charcnt (char)s '\0'-terminated zone abbreviations + * tzh_leapcnt repetitions of + * one (char [4]) coded leap second transition times + * one (char [4]) total correction after above + * tzh_ttisstdcnt (char)s indexed by type; if TRUE, transition + * time is standard time, if FALSE, + * transition time is wall clock time + * if absent, transition times are + * assumed to be wall clock time + */ + +/* + * In the current implementation, "tzset()" refuses to deal with files that + * exceed any of the limits below. + */ + +/* + * The TZ_MAX_TIMES value below is enough to handle a bit more than a + * year's worth of solar time (corrected daily to the nearest second) or + * 138 years of Pacific Presidential Election time + * (where there are three time zone transitions every fourth year). 
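The coded (char [4]) quantities described in the layout above are stored high-order byte first; a reader decodes them along these lines (a sketch for the non-negative counts only; the helper name is hypothetical and not part of this header):

	static long
	tz_decode(const char *cp)
	{
		long v = 0;
		int i;

		for (i = 0; i < 4; i++)
			v = (v << 8) | (unsigned char)cp[i];
		return (v);
	}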
+ */ +#define TZ_MAX_TIMES 370 + +#define TZ_MAX_TYPES 256 /* Limited by what (unsigned char)'s can hold */ + +#define TZ_MAX_CHARS 50 /* Maximum number of abbreviation characters */ + +#define TZ_MAX_LEAPS 50 /* Maximum number of leap second corrections */ + +#define SECSPERMIN 60 +#define MINSPERHOUR 60 +#define HOURSPERDAY 24 +#define DAYSPERWEEK 7 +#define DAYSPERNYEAR 365 +#define DAYSPERLYEAR 366 +#define SECSPERHOUR (SECSPERMIN * MINSPERHOUR) +#define SECSPERDAY ((long)SECSPERHOUR * HOURSPERDAY) +#define MONSPERYEAR 12 + +#define TM_SUNDAY 0 +#define TM_MONDAY 1 +#define TM_TUESDAY 2 +#define TM_WEDNESDAY 3 +#define TM_THURSDAY 4 +#define TM_FRIDAY 5 +#define TM_SATURDAY 6 + +#define TM_JANUARY 0 +#define TM_FEBRUARY 1 +#define TM_MARCH 2 +#define TM_APRIL 3 +#define TM_MAY 4 +#define TM_JUNE 5 +#define TM_JULY 6 +#define TM_AUGUST 7 +#define TM_SEPTEMBER 8 +#define TM_OCTOBER 9 +#define TM_NOVEMBER 10 +#define TM_DECEMBER 11 + +#define TM_YEAR_BASE 1900 + +#define EPOCH_YEAR 1970 +#define EPOCH_WDAY TM_THURSDAY + +/* + * Accurate only for the past couple of centuries; + * that will probably do. + */ + +#define isleap(y) (((y) % 4) == 0 && ((y) % 100) != 0 || ((y) % 400) == 0) + +/* + * Use of the underscored variants may cause problems if you move your code to + * certain System-V-based systems; for maximum portability, use the + * underscore-free variants. The underscored variants are provided for + * backward compatibility only; they may disappear from future versions of + * this file. + */ + +#define SECS_PER_MIN SECSPERMIN +#define MINS_PER_HOUR MINSPERHOUR +#define HOURS_PER_DAY HOURSPERDAY +#define DAYS_PER_WEEK DAYSPERWEEK +#define DAYS_PER_NYEAR DAYSPERNYEAR +#define DAYS_PER_LYEAR DAYSPERLYEAR +#define SECS_PER_HOUR SECSPERHOUR +#define SECS_PER_DAY SECSPERDAY +#define MONS_PER_YEAR MONSPERYEAR + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBSPL_SYS_TZFILE_H */ diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h new file mode 100644 index 000000000000..3a834b996add --- /dev/null +++ b/lib/libspl/include/sys/uio.h @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. 
+ */ + +#ifndef _LIBSPL_SYS_UIO_H +#define _LIBSPL_SYS_UIO_H + +#include <sys/types.h> +#include_next <sys/uio.h> + +#ifdef __APPLE__ +#include +#endif + +#include <stdint.h> +typedef struct iovec iovec_t; + +#if defined(__linux__) || defined(__APPLE__) +typedef enum uio_rw { + UIO_READ = 0, + UIO_WRITE = 1, +} uio_rw_t; + +typedef enum uio_seg { + UIO_USERSPACE = 0, + UIO_SYSSPACE = 1, + UIO_USERISPACE = 2, +} uio_seg_t; + +#elif defined(__FreeBSD__) +typedef enum uio_seg uio_seg_t; +#endif + +typedef struct uio { + struct iovec *uio_iov; /* pointer to array of iovecs */ + int uio_iovcnt; /* number of iovecs */ + offset_t uio_loffset; /* file offset */ + uio_seg_t uio_segflg; /* address space (kernel or user) */ + uint16_t uio_fmode; /* file mode flags */ + uint16_t uio_extflg; /* extended flags */ + offset_t uio_limit; /* u-limit (maximum byte offset) */ + ssize_t uio_resid; /* residual count */ +} uio_t; + +typedef enum xuio_type { + UIOTYPE_ASYNCIO, + UIOTYPE_ZEROCOPY, +} xuio_type_t; + +#define UIOA_IOV_MAX 16 + +typedef struct uioa_page_s { /* locked uio_iov state */ + int uioa_pfncnt; /* count of pfn_t(s) in *uioa_ppp */ + void **uioa_ppp; /* page_t or pfn_t array */ + caddr_t uioa_base; /* address base */ + size_t uioa_len; /* span length */ +} uioa_page_t; + +typedef struct xuio { + uio_t xu_uio; /* embedded UIO structure */ + + /* Extended uio fields */ + enum xuio_type xu_type; /* uio type */ + union { + struct { + uint32_t xu_a_state; /* state of async i/o */ + ssize_t xu_a_mbytes; /* bytes moved */ + uioa_page_t *xu_a_lcur; /* uioa_locked[] pointer */ + void **xu_a_lppp; /* lcur->uioa_pppp[] pointer */ + void *xu_a_hwst[4]; /* opaque hardware state */ + uioa_page_t xu_a_locked[UIOA_IOV_MAX]; + } xu_aio; + + struct { + int xu_zc_rw; /* read or write buffer */ + void *xu_zc_priv; /* fs specific */ + } xu_zc; + } xu_ext; +} xuio_t; + +#define XUIO_XUZC_PRIV(xuio) xuio->xu_ext.xu_zc.xu_zc_priv +#define XUIO_XUZC_RW(xuio) xuio->xu_ext.xu_zc.xu_zc_rw + +#define uio_segflg(uio) (uio)->uio_segflg +#define uio_offset(uio) (uio)->uio_loffset +#define uio_resid(uio) (uio)->uio_resid +#define uio_iovcnt(uio) (uio)->uio_iovcnt +#define uio_iovlen(uio, idx) (uio)->uio_iov[(idx)].iov_len +#define uio_iovbase(uio, idx) (uio)->uio_iov[(idx)].iov_base + +static inline void +uio_iov_at_index(uio_t *uio, uint_t idx, void **base, uint64_t *len) +{ + *base = uio_iovbase(uio, idx); + *len = uio_iovlen(uio, idx); +} + +static inline void +uio_advance(uio_t *uio, size_t size) +{ + uio->uio_resid -= size; + uio->uio_loffset += size; +} + +static inline offset_t +uio_index_at_offset(uio_t *uio, offset_t off, uint_t *vec_idx) +{ + *vec_idx = 0; + while (*vec_idx < (uint_t)uio_iovcnt(uio) && + off >= (offset_t)uio_iovlen(uio, *vec_idx)) { + off -= uio_iovlen(uio, *vec_idx); + (*vec_idx)++; + } + + return (off); +} + +#endif /* _LIBSPL_SYS_UIO_H */ diff --git a/lib/libspl/include/sys/va_list.h b/lib/libspl/include/sys/va_list.h new file mode 100644 index 000000000000..a36f5c77daa9 --- /dev/null +++ b/lib/libspl/include/sys/va_list.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_VA_LIST_H +#define _SYS_VA_LIST_H + +#include <stdarg.h> + +#endif diff --git a/lib/libspl/include/sys/varargs.h b/lib/libspl/include/sys/varargs.h new file mode 100644 index 000000000000..3d00a3361d87 --- /dev/null +++ b/lib/libspl/include/sys/varargs.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_VARARGS_H +#define _LIBSPL_SYS_VARARGS_H + +#endif diff --git a/lib/libspl/include/sys/vnode.h b/lib/libspl/include/sys/vnode.h new file mode 100644 index 000000000000..efcdd2c5a5ae --- /dev/null +++ b/lib/libspl/include/sys/vnode.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_VNODE_H +#define _LIBSPL_SYS_VNODE_H + +#endif /* _LIBSPL_SYS_VNODE_H */ diff --git a/lib/libspl/include/sys/vtoc.h b/lib/libspl/include/sys/vtoc.h new file mode 100644 index 000000000000..5d8448b628dc --- /dev/null +++ b/lib/libspl/include/sys/vtoc.h @@ -0,0 +1,350 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_VTOC_H +#define _SYS_VTOC_H + +#include <sys/dklabel.h> + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Note: the VTOC is not implemented fully, nor in the manner + * that AT&T implements it. AT&T puts the vtoc structure + * into a sector, usually the second sector (pdsector is first). + * + * Sun incorporates the tag, flag, version, and volume vtoc fields into + * its Disk Label, which already has some vtoc-equivalent fields. + * Upon reading the vtoc with read_vtoc(), the following exceptions + * occur: + * v_bootinfo [all] returned as zero + * v_sanity returned as VTOC_SANE + * if Disk Label was sane + * v_sectorsz returned as 512 + * v_reserved [all] returned as zero + * timestamp [all] returned as zero + * + * See dklabel.h, read_vtoc(), and write_vtoc(). + */ + +#define V_NUMPAR NDKMAP /* The number of partitions */ + /* (from dkio.h) */ + +#define VTOC_SANE 0x600DDEEE /* Indicates a sane VTOC */ +#define V_VERSION 0x01 /* layout version number */ +#define V_EXTVERSION V_VERSION /* extvtoc layout version number */ + +/* + * Partition identification tags + */ +#define V_UNASSIGNED 0x00 /* unassigned partition */ +#define V_BOOT 0x01 /* Boot partition */ +#define V_ROOT 0x02 /* Root filesystem */ +#define V_SWAP 0x03 /* Swap filesystem */ +#define V_USR 0x04 /* Usr filesystem */ +#define V_BACKUP 0x05 /* full disk */ +#define V_STAND 0x06 /* Stand partition */ +#define V_VAR 0x07 /* Var partition */ +#define V_HOME 0x08 /* Home partition */ +#define V_ALTSCTR 0x09 /* Alternate sector partition */ +#define V_CACHE 0x0a /* Cache (cachefs) partition */ +#define V_RESERVED 0x0b /* SMI reserved data */ + +/* + * Partition permission flags + */ +#define V_UNMNT 0x01 /* Unmountable partition */ +#define V_RONLY 0x10 /* Read only */ + +/* + * error codes for reading & writing vtoc + */ +#define VT_ERROR (-2) /* errno supplies specific error */ +#define VT_EIO (-3) /* I/O error accessing vtoc */ +#define VT_EINVAL (-4) /* illegal value in vtoc or request */ +#define VT_ENOTSUP (-5) /* VTOC op. not supported */ +#define VT_ENOSPC (-6) /* requested space not found */ +#define VT_EOVERFLOW (-7) /* VTOC op. 
data struct limited */ + +struct partition { + ushort_t p_tag; /* ID tag of partition */ + ushort_t p_flag; /* permission flags */ + uint64_t p_start; /* start sector no of partition */ + long p_size; /* # of blocks in partition */ +}; + +struct vtoc { + unsigned long v_bootinfo[3]; /* info needed by mboot (unsupported) */ + unsigned long v_sanity; /* to verify vtoc sanity */ + unsigned long v_version; /* layout version */ + char v_volume[LEN_DKL_VVOL]; /* volume name */ + ushort_t v_sectorsz; /* sector size in bytes */ + ushort_t v_nparts; /* number of partitions */ + unsigned long v_reserved[10]; /* free space */ + struct partition v_part[V_NUMPAR]; /* partition headers */ + time_t timestamp[V_NUMPAR]; /* partition timestamp (unsupported) */ + char v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */ +}; + +struct extpartition { + ushort_t p_tag; /* ID tag of partition */ + ushort_t p_flag; /* permission flags */ + ushort_t p_pad[2]; + diskaddr_t p_start; /* start sector no of partition */ + diskaddr_t p_size; /* # of blocks in partition */ +}; + + +struct extvtoc { + uint64_t v_bootinfo[3]; /* info needed by mboot (unsupported) */ + uint64_t v_sanity; /* to verify vtoc sanity */ + uint64_t v_version; /* layout version */ + char v_volume[LEN_DKL_VVOL]; /* volume name */ + ushort_t v_sectorsz; /* sector size in bytes */ + ushort_t v_nparts; /* number of partitions */ + ushort_t pad[2]; + uint64_t v_reserved[10]; + struct extpartition v_part[V_NUMPAR]; /* partition headers */ + uint64_t timestamp[V_NUMPAR]; /* partition timestamp (unsupported) */ + char v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */ +}; + +#ifdef _KERNEL +#define extvtoctovtoc(extv, v) \ + { \ + int i; \ + v.v_bootinfo[0] = (unsigned long)extv.v_bootinfo[0]; \ + v.v_bootinfo[1] = (unsigned long)extv.v_bootinfo[1]; \ + v.v_bootinfo[2] = (unsigned long)extv.v_bootinfo[2]; \ + v.v_sanity = (unsigned long)extv.v_sanity; \ + v.v_version = (unsigned long)extv.v_version; \ + bcopy(extv.v_volume, v.v_volume, LEN_DKL_VVOL); \ + v.v_sectorsz = extv.v_sectorsz; \ + v.v_nparts = extv.v_nparts; \ + for (i = 0; i < 10; i++) \ + v.v_reserved[i] = (unsigned long)extv.v_reserved[i]; \ + for (i = 0; i < V_NUMPAR; i++) { \ + v.v_part[i].p_tag = extv.v_part[i].p_tag; \ + v.v_part[i].p_flag = extv.v_part[i].p_flag; \ + v.v_part[i].p_start = (uint64_t)extv.v_part[i].p_start; \ + v.v_part[i].p_size = (long)extv.v_part[i].p_size; \ + v.timestamp[i] = (time_t)extv.timestamp[i]; \ + } \ + bcopy(extv.v_asciilabel, v.v_asciilabel, LEN_DKL_ASCII); \ + } + +#define vtoctoextvtoc(v, extv) \ + { \ + int i; \ + extv.v_bootinfo[0] = (uint64_t)v.v_bootinfo[0]; \ + extv.v_bootinfo[1] = (uint64_t)v.v_bootinfo[1]; \ + extv.v_bootinfo[2] = (uint64_t)v.v_bootinfo[2]; \ + extv.v_sanity = (uint64_t)v.v_sanity; \ + extv.v_version = (uint64_t)v.v_version; \ + bcopy(v.v_volume, extv.v_volume, LEN_DKL_VVOL); \ + extv.v_sectorsz = v.v_sectorsz; \ + extv.v_nparts = v.v_nparts; \ + for (i = 0; i < 10; i++) \ + extv.v_reserved[i] = (uint64_t)v.v_reserved[i]; \ + for (i = 0; i < V_NUMPAR; i++) { \ + extv.v_part[i].p_tag = v.v_part[i].p_tag; \ + extv.v_part[i].p_flag = v.v_part[i].p_flag; \ + extv.v_part[i].p_start = \ + (diskaddr_t)(unsigned long)v.v_part[i].p_start; \ + extv.v_part[i].p_size = \ + (diskaddr_t)(unsigned long)v.v_part[i].p_size; \ + extv.timestamp[i] = (uint64_t)v.timestamp[i]; \ + } \ + bcopy(v.v_asciilabel, extv.v_asciilabel, LEN_DKL_ASCII); \ + } +#endif /* _KERNEL */ + +#if defined(_SYSCALL32) +struct partition32 { + uint16_t p_tag; /* ID tag of 
partition */ + uint16_t p_flag; /* permission flags */ + daddr32_t p_start; /* start sector no of partition */ + int32_t p_size; /* # of blocks in partition */ +}; + +struct vtoc32 { + uint32_t v_bootinfo[3]; /* info needed by mboot (unsupported) */ + uint32_t v_sanity; /* to verify vtoc sanity */ + uint32_t v_version; /* layout version */ + char v_volume[LEN_DKL_VVOL]; /* volume name */ + uint16_t v_sectorsz; /* sector size in bytes */ + uint16_t v_nparts; /* number of partitions */ + uint32_t v_reserved[10]; /* free space */ + struct partition32 v_part[V_NUMPAR]; /* partition headers */ + time32_t timestamp[V_NUMPAR]; /* partition timestamp (unsupported) */ + char v_asciilabel[LEN_DKL_ASCII]; /* for compatibility */ +}; + +#define vtoc32tovtoc(v32, v) \ + { \ + int i; \ + v.v_bootinfo[0] = v32.v_bootinfo[0]; \ + v.v_bootinfo[1] = v32.v_bootinfo[1]; \ + v.v_bootinfo[2] = v32.v_bootinfo[2]; \ + v.v_sanity = v32.v_sanity; \ + v.v_version = v32.v_version; \ + bcopy(v32.v_volume, v.v_volume, LEN_DKL_VVOL); \ + v.v_sectorsz = v32.v_sectorsz; \ + v.v_nparts = v32.v_nparts; \ + v.v_version = v32.v_version; \ + for (i = 0; i < 10; i++) \ + v.v_reserved[i] = v32.v_reserved[i]; \ + for (i = 0; i < V_NUMPAR; i++) { \ + v.v_part[i].p_tag = (ushort_t)v32.v_part[i].p_tag; \ + v.v_part[i].p_flag = (ushort_t)v32.v_part[i].p_flag; \ + v.v_part[i].p_start = (unsigned)v32.v_part[i].p_start; \ + v.v_part[i].p_size = (unsigned)v32.v_part[i].p_size; \ + } \ + for (i = 0; i < V_NUMPAR; i++) \ + v.timestamp[i] = (time_t)v32.timestamp[i]; \ + bcopy(v32.v_asciilabel, v.v_asciilabel, LEN_DKL_ASCII); \ + } + +#define vtoc32toextvtoc(v32, extv) \ + { \ + int i; \ + extv.v_bootinfo[0] = v32.v_bootinfo[0]; \ + extv.v_bootinfo[1] = v32.v_bootinfo[1]; \ + extv.v_bootinfo[2] = v32.v_bootinfo[2]; \ + extv.v_sanity = v32.v_sanity; \ + extv.v_version = v32.v_version; \ + bcopy(v32.v_volume, extv.v_volume, LEN_DKL_VVOL); \ + extv.v_sectorsz = v32.v_sectorsz; \ + extv.v_nparts = v32.v_nparts; \ + extv.v_version = v32.v_version; \ + for (i = 0; i < 10; i++) \ + extv.v_reserved[i] = v32.v_reserved[i]; \ + for (i = 0; i < V_NUMPAR; i++) { \ + extv.v_part[i].p_tag = (ushort_t)v32.v_part[i].p_tag; \ + extv.v_part[i].p_flag = (ushort_t)v32.v_part[i].p_flag; \ + extv.v_part[i].p_start = (diskaddr_t)v32.v_part[i].p_start; \ + extv.v_part[i].p_size = (diskaddr_t)v32.v_part[i].p_size; \ + extv.timestamp[i] = (time_t)v32.timestamp[i]; \ + } \ + bcopy(v32.v_asciilabel, extv.v_asciilabel, LEN_DKL_ASCII); \ + } + + +#define vtoctovtoc32(v, v32) \ + { \ + int i; \ + v32.v_bootinfo[0] = v.v_bootinfo[0]; \ + v32.v_bootinfo[1] = v.v_bootinfo[1]; \ + v32.v_bootinfo[2] = v.v_bootinfo[2]; \ + v32.v_sanity = v.v_sanity; \ + v32.v_version = v.v_version; \ + bcopy(v.v_volume, v32.v_volume, LEN_DKL_VVOL); \ + v32.v_sectorsz = v.v_sectorsz; \ + v32.v_nparts = v.v_nparts; \ + v32.v_version = v.v_version; \ + for (i = 0; i < 10; i++) \ + v32.v_reserved[i] = v.v_reserved[i]; \ + for (i = 0; i < V_NUMPAR; i++) { \ + v32.v_part[i].p_tag = (ushort_t)v.v_part[i].p_tag; \ + v32.v_part[i].p_flag = (ushort_t)v.v_part[i].p_flag; \ + v32.v_part[i].p_start = (unsigned)v.v_part[i].p_start; \ + v32.v_part[i].p_size = (unsigned)v.v_part[i].p_size; \ + } \ + for (i = 0; i < V_NUMPAR; i++) { \ + if (v.timestamp[i] > TIME32_MAX) \ + v32.timestamp[i] = TIME32_MAX; \ + else \ + v32.timestamp[i] = (time32_t)v.timestamp[i]; \ + } \ + bcopy(v.v_asciilabel, v32.v_asciilabel, LEN_DKL_ASCII); \ + } + +#define extvtoctovtoc32(extv, v32) \ + { \ + int i; \ + v32.v_bootinfo[0] = 
extv.v_bootinfo[0]; \ + v32.v_bootinfo[1] = extv.v_bootinfo[1]; \ + v32.v_bootinfo[2] = extv.v_bootinfo[2]; \ + v32.v_sanity = extv.v_sanity; \ + v32.v_version = extv.v_version; \ + bcopy(extv.v_volume, v32.v_volume, LEN_DKL_VVOL); \ + v32.v_sectorsz = extv.v_sectorsz; \ + v32.v_nparts = extv.v_nparts; \ + v32.v_version = extv.v_version; \ + for (i = 0; i < 10; i++) \ + v32.v_reserved[i] = extv.v_reserved[i]; \ + for (i = 0; i < V_NUMPAR; i++) { \ + v32.v_part[i].p_tag = (ushort_t)extv.v_part[i].p_tag; \ + v32.v_part[i].p_flag = (ushort_t)extv.v_part[i].p_flag; \ + v32.v_part[i].p_start = (unsigned)extv.v_part[i].p_start; \ + v32.v_part[i].p_size = (unsigned)extv.v_part[i].p_size; \ + } \ + for (i = 0; i < V_NUMPAR; i++) { \ + if (extv.timestamp[i] > TIME32_MAX) \ + v32.timestamp[i] = TIME32_MAX; \ + else \ + v32.timestamp[i] = (time32_t)extv.timestamp[i]; \ + } \ + bcopy(extv.v_asciilabel, v32.v_asciilabel, LEN_DKL_ASCII); \ + } + + +#endif /* _SYSCALL32 */ + +/* + * These defines are the mode parameter for the checksum routines. + */ +#define CK_CHECKSUM 0 /* check checksum */ +#define CK_MAKESUM 1 /* generate checksum */ + +#if defined(__STDC__) + +extern int read_vtoc(int, struct vtoc *); +extern int write_vtoc(int, struct vtoc *); +extern int read_extvtoc(int, struct extvtoc *); +extern int write_extvtoc(int, struct extvtoc *); + +#else + +extern int read_vtoc(); +extern int write_vtoc(); +extern int read_extvtoc(); +extern int write_extvtoc(); + +#endif /* __STDC__ */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_VTOC_H */ diff --git a/lib/libspl/include/sys/zone.h b/lib/libspl/include/sys/zone.h new file mode 100644 index 000000000000..bbb964dcef22 --- /dev/null +++ b/lib/libspl/include/sys/zone.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_SYS_ZONE_H +#define _LIBSPL_SYS_ZONE_H + +#endif diff --git a/lib/libspl/include/thread.h b/lib/libspl/include/thread.h new file mode 100644 index 000000000000..74694e23eed5 --- /dev/null +++ b/lib/libspl/include/thread.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_THREAD_H +#define _LIBSPL_THREAD_H + +#endif /* _LIBSPL_THREAD_H */ diff --git a/lib/libspl/include/tzfile.h b/lib/libspl/include/tzfile.h new file mode 100644 index 000000000000..7bd4087cd5d1 --- /dev/null +++ b/lib/libspl/include/tzfile.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_TZFILE_H +#define _LIBSPL_TZFILE_H + +#include <sys/tzfile.h> + +#endif /* _LIBSPL_TZFILE_H */ diff --git a/lib/libspl/include/ucred.h b/lib/libspl/include/ucred.h new file mode 100644 index 000000000000..8178fdec4c74 --- /dev/null +++ b/lib/libspl/include/ucred.h @@ -0,0 +1,32 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_UCRED_H +#define _LIBSPL_UCRED_H + +typedef int ucred_t; + +#endif diff --git a/lib/libspl/include/umem.h b/lib/libspl/include/umem.h new file mode 100644 index 000000000000..65f12595e64f --- /dev/null +++ b/lib/libspl/include/umem.h @@ -0,0 +1,208 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_UMEM_H +#define _LIBSPL_UMEM_H + +/* + * XXX: We should use the real portable umem library if it is detected + * at configure time. However, if the library is not available, we can + * use a trivial malloc based implementation. This obviously impacts + * performance, but unless you are using a full userspace build of zpool for + * something other than ztest, you are likely not going to notice or care. + * + * https://labs.omniti.com/trac/portableumem + */ +#include + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef void vmem_t; + +/* + * Flags for umem_alloc/umem_free + */ +#define UMEM_DEFAULT 0x0000 /* normal -- may fail */ +#define UMEM_NOFAIL 0x0100 /* Never fails */ + +/* + * Flags for umem_cache_create() + */ +#define UMC_NODEBUG 0x00020000 + +#define UMEM_CACHE_NAMELEN 31 + +typedef int umem_nofail_callback_t(void); +typedef int umem_constructor_t(void *, void *, int); +typedef void umem_destructor_t(void *, void *); +typedef void umem_reclaim_t(void *); + +typedef struct umem_cache { + char cache_name[UMEM_CACHE_NAMELEN + 1]; + size_t cache_bufsize; + size_t cache_align; + umem_constructor_t *cache_constructor; + umem_destructor_t *cache_destructor; + umem_reclaim_t *cache_reclaim; + void *cache_private; + void *cache_arena; + int cache_cflags; +} umem_cache_t; + +/* Prototypes for functions to provide defaults for umem envvars */ +const char *_umem_debug_init(void); +const char *_umem_options_init(void); +const char *_umem_logging_init(void); + +static inline void * +umem_alloc(size_t size, int flags) +{ + void *ptr = NULL; + + do { + ptr = malloc(size); + } while (ptr == NULL && (flags & UMEM_NOFAIL)); + + return (ptr); +} + +static inline void * +umem_alloc_aligned(size_t size, size_t align, int flags) +{ + void *ptr = NULL; + int rc = EINVAL; + + do { + rc = posix_memalign(&ptr, align, size); + } while (rc == ENOMEM && (flags & UMEM_NOFAIL)); + + if (rc == EINVAL) { + fprintf(stderr, "%s: invalid memory alignment (%zd)\n", + __func__, align); + if (flags & UMEM_NOFAIL) + abort(); + return (NULL); + } + + return (ptr); +} + +static inline void * +umem_zalloc(size_t size, int flags) +{ + void *ptr = NULL; + + ptr = umem_alloc(size, flags); + if (ptr) + memset(ptr, 0, size); + + return (ptr); +} + +static inline void +umem_free(void *ptr, size_t size __maybe_unused) +{ + free(ptr); +} + +static inline void +umem_nofail_callback(umem_nofail_callback_t *cb __maybe_unused) +{} + +static inline umem_cache_t * +umem_cache_create( + char *name, size_t bufsize, size_t align, + umem_constructor_t *constructor, + umem_destructor_t *destructor, + umem_reclaim_t *reclaim, + void *priv, void *vmp, int cflags) +{ + umem_cache_t *cp; + + cp = (umem_cache_t *)umem_alloc(sizeof (umem_cache_t), UMEM_DEFAULT); + if (cp) { + 
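+ /* + * Note: this malloc-backed stand-in only records the cache + * parameters; each buffer is carved out individually by + * umem_cache_alloc(), so no slab or arena state is set up here. + */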
strlcpy(cp->cache_name, name, UMEM_CACHE_NAMELEN); + cp->cache_bufsize = bufsize; + cp->cache_align = align; + cp->cache_constructor = constructor; + cp->cache_destructor = destructor; + cp->cache_reclaim = reclaim; + cp->cache_private = priv; + cp->cache_arena = vmp; + cp->cache_cflags = cflags; + } + + return (cp); +} + +static inline void +umem_cache_destroy(umem_cache_t *cp) +{ + umem_free(cp, sizeof (umem_cache_t)); +} + +static inline void * +umem_cache_alloc(umem_cache_t *cp, int flags) +{ + void *ptr = NULL; + + if (cp->cache_align != 0) + ptr = umem_alloc_aligned( + cp->cache_bufsize, cp->cache_align, flags); + else + ptr = umem_alloc(cp->cache_bufsize, flags); + + if (ptr && cp->cache_constructor) + cp->cache_constructor(ptr, cp->cache_private, UMEM_DEFAULT); + + return (ptr); +} + +static inline void +umem_cache_free(umem_cache_t *cp, void *ptr) +{ + if (cp->cache_destructor) + cp->cache_destructor(ptr, cp->cache_private); + + umem_free(ptr, cp->cache_bufsize); +} + +static inline void +umem_cache_reap_now(umem_cache_t *cp __maybe_unused) +{ +} + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/lib/libspl/include/unistd.h b/lib/libspl/include/unistd.h new file mode 100644 index 000000000000..0246991b4b61 --- /dev/null +++ b/lib/libspl/include/unistd.h @@ -0,0 +1,39 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include_next <unistd.h> + +#ifndef _LIBSPL_UNISTD_H +#define _LIBSPL_UNISTD_H + +#include + +#if !defined(HAVE_ISSETUGID) +#include +#define issetugid() (geteuid() == 0 || getegid() == 0) +#endif + +#endif /* _LIBSPL_UNISTD_H */ diff --git a/lib/libspl/include/util/Makefile.am b/lib/libspl/include/util/Makefile.am new file mode 100644 index 000000000000..ab553bc80313 --- /dev/null +++ b/lib/libspl/include/util/Makefile.am @@ -0,0 +1,3 @@ +libspldir = $(includedir)/libspl +libspl_HEADERS = \ + sscanf.h diff --git a/lib/libspl/include/util/sscanf.h b/lib/libspl/include/util/sscanf.h new file mode 100644 index 000000000000..ead36acaba3e --- /dev/null +++ b/lib/libspl/include/util/sscanf.h @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_UTIL_SSCANF_H +#define _LIBSPL_UTIL_SSCANF_H + +#endif diff --git a/lib/libspl/include/zone.h b/lib/libspl/include/zone.h new file mode 100644 index 000000000000..b4a6deb40ce4 --- /dev/null +++ b/lib/libspl/include/zone.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _LIBSPL_ZONE_H +#define _LIBSPL_ZONE_H + + + +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define GLOBAL_ZONEID 0 +#define GLOBAL_ZONEID_NAME "global" + +/* + * Functions for mapping between id and name for active zones. + */ +extern zoneid_t getzoneid(void); +extern zoneid_t getzoneidbyname(const char *); +extern ssize_t getzonenamebyid(zoneid_t, char *, size_t); + +#ifdef __cplusplus +} +#endif + +#endif /* _LIBSPL_ZONE_H */ diff --git a/lib/libspl/list.c b/lib/libspl/list.c new file mode 100644 index 000000000000..0f2f3731b235 --- /dev/null +++ b/lib/libspl/list.c @@ -0,0 +1,243 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include +#include + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->prev = (node); \ + lnew->next = (node)->next; \ + (node)->next->prev = lnew; \ + (node)->next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->next = (node); \ + lnew->prev = (node)->prev; \ + (node)->prev->next = lnew; \ + (node)->prev = lnew; \ +} + +#define list_remove_node(node) \ + (node)->prev->next = (node)->next; \ + (node)->next->prev = (node)->prev; \ + (node)->next = (node)->prev = NULL + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.next = list->list_head.prev = &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.next == node); + ASSERT(list->list_head.prev == node); + + node->next = node->prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->next != NULL); + list_remove_node(lold); +} + +void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.next; + if (head == &list->list_head) + return (NULL); + list_remove_node(head); + return (list_object(list, head)); +} + +void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.prev; + if (tail == &list->list_head) + return (NULL); + list_remove_node(tail); + return (list_object(list, tail)); +} + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->next != &list->list_head) + return (list_object(list, node->next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->prev != &list->list_head) + return (list_object(list, node->prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. 
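+ * The splice is O(1): only the four boundary pointers around the two + * list heads are rewritten, without visiting the individual nodes.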
+ */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->prev->next = srcnode->next; + srcnode->next->prev = dstnode->prev; + dstnode->prev = srcnode->prev; + srcnode->prev->next = dstnode; + + /* empty src list */ + srcnode->next = srcnode->prev = srcnode; +} + +void +list_link_replace(list_node_t *lold, list_node_t *lnew) +{ + ASSERT(list_link_active(lold)); + ASSERT(!list_link_active(lnew)); + + lnew->next = lold->next; + lnew->prev = lold->prev; + lold->prev->next = lnew; + lold->next->prev = lnew; + lold->next = lold->prev = NULL; +} + +void +list_link_init(list_node_t *ln) +{ + ln->next = NULL; + ln->prev = NULL; +} + +int +list_link_active(list_node_t *ln) +{ + EQUIV(ln->next == NULL, ln->prev == NULL); + return (ln->next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/lib/libspl/mkdirp.c b/lib/libspl/mkdirp.c new file mode 100644 index 000000000000..fce2c1c82eb7 --- /dev/null +++ b/lib/libspl/mkdirp.c @@ -0,0 +1,212 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +/* + * Creates directory and its parents if the parents do not + * exist yet. + * + * Returns -1 if it fails for reasons other than non-existing + * parents. + * Does NOT simplify pathnames with . or .. in them. + */ + +#include +#include +#include +#include +#include +#include +#include + +static char *simplify(const char *str); + +int +mkdirp(const char *d, mode_t mode) +{ + char *endptr, *ptr, *slash, *str; + + str = simplify(d); + + /* If space couldn't be allocated for the simplified names, return. */ + + if (str == NULL) + return (-1); + + /* Try to make the directory */ + + if (mkdir(str, mode) == 0) { + free(str); + return (0); + } + if (errno != ENOENT) { + free(str); + return (-1); + } + endptr = strrchr(str, '\0'); + slash = strrchr(str, '/'); + + /* Search upward for the non-existing parent */ + + while (slash != NULL) { + + ptr = slash; + *ptr = '\0'; + + /* If reached an existing parent, break */ + + if (access(str, F_OK) == 0) + break; + + /* If non-existing parent */ + + else { + slash = strrchr(str, '/'); + + /* If under / or current directory, make it. 
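+ * Reaching the first component without finding an existing + * ancestor, the code creates it here and lets the loop below + * rebuild the rest of the path from left to right.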
*/ + + if (slash == NULL || slash == str) { + if (mkdir(str, mode) != 0 && errno != EEXIST) { + free(str); + return (-1); + } + break; + } + } + } + + /* Create directories starting from uppermost non-existing parent */ + + while ((ptr = strchr(str, '\0')) != endptr) { + *ptr = '/'; + if (mkdir(str, mode) != 0 && errno != EEXIST) { + /* + * If the mkdir fails because str already + * exists (EEXIST), then str has the form + * "existing-dir/..", and this is really + * ok. (Remember, this loop is creating the + * portion of the path that didn't exist) + */ + free(str); + return (-1); + } + } + free(str); + return (0); +} + +/* + * simplify - given a pathname, simplify that path by removing + * duplicate contiguous slashes. + * + * A simplified copy of the argument is returned to the + * caller, or NULL is returned on error. + * + * The caller should handle error reporting based upon the + * returned value, and should free the returned value, + * when appropriate. + */ + +static char * +simplify(const char *str) +{ + int i; + size_t mbPathlen; /* length of multi-byte path */ + size_t wcPathlen; /* length of wide-character path */ + wchar_t *wptr; /* scratch pointer */ + wchar_t *wcPath; /* wide-character version of the path */ + char *mbPath; /* The copy of the path to be returned */ + + /* + * bail out if there is nothing there. + */ + + if (!str) { + errno = ENOENT; + return (NULL); + } + + /* + * Get a copy of the argument. + */ + + if ((mbPath = strdup(str)) == NULL) { + return (NULL); + } + + /* + * convert the multi-byte version of the path to a + * wide-character rendering, for doing our figuring. + */ + + mbPathlen = strlen(mbPath); + + if ((wcPath = calloc(mbPathlen+1, sizeof (wchar_t))) == NULL) { + free(mbPath); + return (NULL); + } + + if ((wcPathlen = mbstowcs(wcPath, mbPath, mbPathlen)) == (size_t)-1) { + free(mbPath); + free(wcPath); + return (NULL); + } + + /* + * remove duplicate slashes first ("//../" -> "/") + */ + + for (wptr = wcPath, i = 0; i < wcPathlen; i++) { + *wptr++ = wcPath[i]; + + if (wcPath[i] == '/') { + i++; + + while (wcPath[i] == '/') { + i++; + } + + i--; + } + } + + *wptr = '\0'; + + /* + * now convert back to the multi-byte format. + */ + + if (wcstombs(mbPath, wcPath, mbPathlen) == (size_t)-1) { + free(mbPath); + free(wcPath); + return (NULL); + } + + free(wcPath); + return (mbPath); +} diff --git a/lib/libspl/os/freebsd/getexecname.c b/lib/libspl/os/freebsd/getexecname.c new file mode 100644 index 000000000000..2b057cc73016 --- /dev/null +++ b/lib/libspl/os/freebsd/getexecname.c @@ -0,0 +1,71 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char * +getexecname(void) +{ + static char execname[PATH_MAX + 1] = ""; + static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; + char *ptr = NULL; + ssize_t rc; + + (void) pthread_mutex_lock(&mtx); + + if (strlen(execname) == 0) { + int error, name[4]; + size_t len; + + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = KERN_PROC_PATHNAME; + name[3] = -1; + len = PATH_MAX; + error = sysctl(name, nitems(name), execname, &len, NULL, 0); + if (error != 0) { + rc = -1; + } else { + rc = len; + } + if (rc == -1) { + execname[0] = '\0'; + } else { + execname[rc] = '\0'; + ptr = execname; + } + } else { + ptr = execname; + } + + (void) pthread_mutex_unlock(&mtx); + return (ptr); +} diff --git a/lib/libspl/os/freebsd/gethostid.c b/lib/libspl/os/freebsd/gethostid.c new file mode 100644 index 000000000000..7bd567fe61b5 --- /dev/null +++ b/lib/libspl/os/freebsd/gethostid.c @@ -0,0 +1,36 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include + +unsigned long +get_system_hostid(void) +{ + return (gethostid()); +} diff --git a/lib/libspl/os/freebsd/getmntany.c b/lib/libspl/os/freebsd/getmntany.c new file mode 100644 index 000000000000..b41e763cee43 --- /dev/null +++ b/lib/libspl/os/freebsd/getmntany.c @@ -0,0 +1,67 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Ricardo Correia. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUFSIZE (MNT_LINE_MAX + 2) + +__thread char buf[BUFSIZE]; + +int +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) +{ + struct statfs sfs; + + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); + return (-1); + } + + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } + + if (statfs(path, &sfs) != 0) { + (void) fprintf(stderr, "%s: %s\n", path, + strerror(errno)); + return (-1); + } + statfs2mnttab(&sfs, (struct mnttab *)entry); + return (0); +} diff --git a/lib/libspl/os/freebsd/mnttab.c b/lib/libspl/os/freebsd/mnttab.c new file mode 100644 index 000000000000..5b9e6429d9e3 --- /dev/null +++ b/lib/libspl/os/freebsd/mnttab.c @@ -0,0 +1,216 @@ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file implements Solaris compatible getmntany() and hasmntopt() + * functions. 
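+ * The getmntany() and getmntent() lookups work on getfsstat(2) snapshots, + * translated entry by entry into struct mnttab by statfs2mnttab().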
+ */ + +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +static char * +mntopt(char **p) +{ + char *cp = *p; + char *retstr; + + while (*cp && isspace(*cp)) + cp++; + + retstr = cp; + while (*cp && *cp != ',') + cp++; + + if (*cp) { + *cp = '\0'; + cp++; + } + + *p = cp; + return (retstr); +} + +char * +hasmntopt(struct mnttab *mnt, char *opt) +{ + char tmpopts[MNT_LINE_MAX]; + char *f, *opts = tmpopts; + + if (mnt->mnt_mntopts == NULL) + return (NULL); + (void) strcpy(opts, mnt->mnt_mntopts); + f = mntopt(&opts); + for (; *f; f = mntopt(&opts)) { + if (strncmp(opt, f, strlen(opt)) == 0) + return (f - tmpopts + mnt->mnt_mntopts); + } + return (NULL); +} + +static void +optadd(char *mntopts, size_t size, const char *opt) +{ + + if (mntopts[0] != '\0') + strlcat(mntopts, ",", size); + strlcat(mntopts, opt, size); +} + +void +statfs2mnttab(struct statfs *sfs, struct mnttab *mp) +{ + static char mntopts[MNTMAXSTR]; + long flags; + + mntopts[0] = '\0'; + + flags = sfs->f_flags; +#define OPTADD(opt) optadd(mntopts, sizeof (mntopts), (opt)) + if (flags & MNT_RDONLY) + OPTADD(MNTOPT_RO); + else + OPTADD(MNTOPT_RW); + if (flags & MNT_NOSUID) + OPTADD(MNTOPT_NOSETUID); + else + OPTADD(MNTOPT_SETUID); + if (flags & MNT_UPDATE) + OPTADD(MNTOPT_REMOUNT); + if (flags & MNT_NOATIME) + OPTADD(MNTOPT_NOATIME); + else + OPTADD(MNTOPT_ATIME); + OPTADD(MNTOPT_NOXATTR); + if (flags & MNT_NOEXEC) + OPTADD(MNTOPT_NOEXEC); + else + OPTADD(MNTOPT_EXEC); +#undef OPTADD + mp->mnt_special = strdup(sfs->f_mntfromname); + mp->mnt_mountp = strdup(sfs->f_mntonname); + mp->mnt_fstype = strdup(sfs->f_fstypename); + mp->mnt_mntopts = strdup(mntopts); +} + +static struct statfs *gsfs = NULL; +static int allfs = 0; + +static int +statfs_init(void) +{ + struct statfs *sfs; + int error; + + if (gsfs != NULL) { + free(gsfs); + gsfs = NULL; + } + allfs = getfsstat(NULL, 0, MNT_WAIT); + if (allfs == -1) + goto fail; + gsfs = malloc(sizeof (gsfs[0]) * allfs * 2); + if (gsfs == NULL) + goto fail; + allfs = getfsstat(gsfs, (long)(sizeof (gsfs[0]) * allfs * 2), + MNT_WAIT); + if (allfs == -1) + goto fail; + sfs = realloc(gsfs, allfs * sizeof (gsfs[0])); + if (sfs != NULL) + gsfs = sfs; + return (0); +fail: + error = errno; + if (gsfs != NULL) + free(gsfs); + gsfs = NULL; + allfs = 0; + return (error); +} + +int +getmntany(FILE *fd __unused, struct mnttab *mgetp, struct mnttab *mrefp) +{ + // struct statfs *sfs; + int i, error; + + error = statfs_init(); + if (error != 0) + return (error); + + for (i = 0; i < allfs; i++) { + if (mrefp->mnt_special != NULL && + strcmp(mrefp->mnt_special, gsfs[i].f_mntfromname) != 0) { + continue; + } + if (mrefp->mnt_mountp != NULL && + strcmp(mrefp->mnt_mountp, gsfs[i].f_mntonname) != 0) { + continue; + } + if (mrefp->mnt_fstype != NULL && + strcmp(mrefp->mnt_fstype, gsfs[i].f_fstypename) != 0) { + continue; + } + statfs2mnttab(&gsfs[i], mgetp); + return (0); + } + return (-1); +} + +int +getmntent(FILE *fp, struct mnttab *mp) +{ + // struct statfs *sfs; + int error, nfs; + + nfs = (int)lseek(fileno(fp), 0, SEEK_CUR); + if (nfs == -1) + return (errno); + /* If nfs is 0, we want to refresh our cache. 
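+ * The stream's file offset doubles as the index of the next cached + * statfs entry to hand out; it is advanced by one further down.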
*/ + if (nfs == 0 || gsfs == NULL) { + error = statfs_init(); + if (error != 0) + return (error); + } + if (nfs >= allfs) + return (-1); + statfs2mnttab(&gsfs[nfs], mp); + if (lseek(fileno(fp), 1, SEEK_CUR) == -1) + return (errno); + return (0); +} diff --git a/lib/libspl/os/linux/getexecname.c b/lib/libspl/os/linux/getexecname.c new file mode 100644 index 000000000000..6352a1a34015 --- /dev/null +++ b/lib/libspl/os/linux/getexecname.c @@ -0,0 +1,59 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + +#include +#include +#include +#include +#include + +const char * +getexecname(void) +{ + static char execname[PATH_MAX + 1] = ""; + static pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER; + char *ptr = NULL; + ssize_t rc; + + (void) pthread_mutex_lock(&mtx); + + if (strlen(execname) == 0) { + rc = readlink("/proc/self/exe", + execname, sizeof (execname) - 1); + if (rc == -1) { + execname[0] = '\0'; + } else { + execname[rc] = '\0'; + ptr = execname; + } + } else { + ptr = execname; + } + + (void) pthread_mutex_unlock(&mtx); + return (ptr); +} diff --git a/lib/libspl/os/linux/gethostid.c b/lib/libspl/os/linux/gethostid.c new file mode 100644 index 000000000000..1eb93f441142 --- /dev/null +++ b/lib/libspl/os/linux/gethostid.c @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include + +static unsigned long +get_spl_hostid(void) +{ + FILE *f; + unsigned long hostid; + char *env; + + /* + * Allow the hostid to be subverted for testing. 
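The ZFS_HOSTID override makes hostid-dependent code paths testable without touching /etc/hostid or the spl_hostid module parameter. A minimal sketch of how a test might use it (the prototype is assumed to match libspl's get_system_hostid() above):

    #include <assert.h>
    #include <stdlib.h>

    extern unsigned long get_system_hostid(void);   /* libspl */

    /* Sketch: pin the hostid for the duration of a test. */
    static void
    test_with_fixed_hostid(void)
    {
            (void) setenv("ZFS_HOSTID", "0xdeadbeef", 1);
            assert(get_system_hostid() == 0xdeadbeefUL);
            (void) unsetenv("ZFS_HOSTID");
    }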
+ */ + env = getenv("ZFS_HOSTID"); + if (env) { + hostid = strtoull(env, NULL, 0); + return (hostid & HOSTID_MASK); + } + + f = fopen("/sys/module/spl/parameters/spl_hostid", "r"); + if (!f) + return (0); + + if (fscanf(f, "%lu", &hostid) != 1) + hostid = 0; + + fclose(f); + + return (hostid & HOSTID_MASK); +} + +unsigned long +get_system_hostid(void) +{ + unsigned long system_hostid = get_spl_hostid(); + /* + * We do not use the library call gethostid() because + * it generates a hostid value that the kernel is + * unaware of, if the spl_hostid module parameter has not + * been set and there is no system hostid file (e.g. + * /etc/hostid). The kernel and userspace must agree. + * See comments above hostid_read() in the SPL. + */ + if (system_hostid == 0) { + int fd, rc; + unsigned long hostid; + int hostid_size = 4; /* 4 bytes regardless of arch */ + + fd = open("/etc/hostid", O_RDONLY); + if (fd >= 0) { + rc = read(fd, &hostid, hostid_size); + if (rc > 0) + system_hostid = (hostid & HOSTID_MASK); + close(fd); + } + } + return (system_hostid); +} diff --git a/lib/libspl/os/linux/getmntany.c b/lib/libspl/os/linux/getmntany.c new file mode 100644 index 000000000000..f42fcc047893 --- /dev/null +++ b/lib/libspl/os/linux/getmntany.c @@ -0,0 +1,165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2006 Ricardo Correia. All rights reserved. + * Use is subject to license terms. 
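Since the fallback above reads /etc/hostid as four raw bytes, a compatible file can be produced the same way. The helper below is hypothetical, and it writes the value in the host's native byte order; that ordering is an assumption of this sketch, mirroring only the raw read() shown above:

    #include <fcntl.h>
    #include <stdint.h>
    #include <unistd.h>

    /* Hypothetical helper: emit a 4-byte hostid file, matching the
     * raw 4-byte read() in get_system_hostid() above. */
    static int
    write_hostid(const char *path, uint32_t hostid)
    {
            int fd = open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644);

            if (fd < 0)
                    return (-1);
            if (write(fd, &hostid, sizeof (hostid)) != sizeof (hostid)) {
                    (void) close(fd);
                    return (-1);
            }
            return (close(fd));
    }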
+ */ + +/* Copyright (c) 1988 AT&T */ +/* All Rights Reserved */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define BUFSIZE (MNT_LINE_MAX + 2) + +__thread char buf[BUFSIZE]; + +#define DIFF(xx) ( \ + (mrefp->xx != NULL) && \ + (mgetp->xx == NULL || strcmp(mrefp->xx, mgetp->xx) != 0)) + +int +getmntany(FILE *fp, struct mnttab *mgetp, struct mnttab *mrefp) +{ + int ret; + + while ( + ((ret = _sol_getmntent(fp, mgetp)) == 0) && ( + DIFF(mnt_special) || DIFF(mnt_mountp) || + DIFF(mnt_fstype) || DIFF(mnt_mntopts))) { } + + return (ret); +} + +int +_sol_getmntent(FILE *fp, struct mnttab *mgetp) +{ + struct mntent mntbuf; + struct mntent *ret; + + ret = getmntent_r(fp, &mntbuf, buf, BUFSIZE); + + if (ret != NULL) { + mgetp->mnt_special = mntbuf.mnt_fsname; + mgetp->mnt_mountp = mntbuf.mnt_dir; + mgetp->mnt_fstype = mntbuf.mnt_type; + mgetp->mnt_mntopts = mntbuf.mnt_opts; + return (0); + } + + if (feof(fp)) + return (-1); + + return (MNT_TOOLONG); +} + +static int +getextmntent_impl(FILE *fp, struct extmnttab *mp, int len) +{ + int ret; + struct stat64 st; + + ret = _sol_getmntent(fp, (struct mnttab *)mp); + if (ret == 0) { + if (stat64(mp->mnt_mountp, &st) != 0) { + mp->mnt_major = 0; + mp->mnt_minor = 0; + return (ret); + } + mp->mnt_major = major(st.st_dev); + mp->mnt_minor = minor(st.st_dev); + } + + return (ret); +} + +int +getextmntent(const char *path, struct extmnttab *entry, struct stat64 *statbuf) +{ + struct stat64 st; + FILE *fp; + int match; + + if (strlen(path) >= MAXPATHLEN) { + (void) fprintf(stderr, "invalid object; pathname too long\n"); + return (-1); + } + + /* + * Search for the path in /proc/self/mounts. Rather than looking for the + * specific path, which can be fooled by non-standard paths (i.e. ".." + * or "//"), we stat() the path and search for the corresponding + * (major,minor) device pair. + */ + if (stat64(path, statbuf) != 0) { + (void) fprintf(stderr, "cannot open '%s': %s\n", + path, strerror(errno)); + return (-1); + } + + +#ifdef HAVE_SETMNTENT + if ((fp = setmntent(MNTTAB, "r")) == NULL) { +#else + if ((fp = fopen(MNTTAB, "r")) == NULL) { +#endif + (void) fprintf(stderr, "cannot open %s\n", MNTTAB); + return (-1); + } + + /* + * Search for the given (major,minor) pair in the mount table. + */ + + match = 0; + while (getextmntent_impl(fp, entry, sizeof (*entry)) == 0) { + if (makedev(entry->mnt_major, entry->mnt_minor) == + statbuf->st_dev) { + match = 1; + break; + } + } + + if (!match) { + (void) fprintf(stderr, "cannot find mountpoint for '%s'\n", + path); + return (-1); + } + + if (stat64(entry->mnt_mountp, &st) != 0) { + entry->mnt_major = 0; + entry->mnt_minor = 0; + return (-1); + } + + return (0); +} diff --git a/lib/libspl/page.c b/lib/libspl/page.c new file mode 100644 index 000000000000..06d9fcfa05cc --- /dev/null +++ b/lib/libspl/page.c @@ -0,0 +1,34 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
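The device-pair matching used by getextmntent() above is the robust way to compare filesystem identity in general: path strings can alias ("foo/.." or "foo//"), but st_dev cannot. A standalone illustration of the same technique:

    #include <sys/stat.h>

    /* Sketch: 1 if both paths are on the same filesystem, 0 if not,
     * -1 on error. "a/.." and "a//" style aliases still agree here. */
    static int
    same_filesystem(const char *a, const char *b)
    {
            struct stat sa, sb;

            if (stat(a, &sa) != 0 || stat(b, &sb) != 0)
                    return (-1);
            return (sa.st_dev == sb.st_dev);
    }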
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include + +size_t pagesize = 0; + +size_t +spl_pagesize(void) +{ + if (pagesize == 0) + pagesize = sysconf(_SC_PAGESIZE); + + return (pagesize); +} diff --git a/lib/libspl/strlcat.c b/lib/libspl/strlcat.c new file mode 100644 index 000000000000..4528d875ed5e --- /dev/null +++ b/lib/libspl/strlcat.c @@ -0,0 +1,60 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +#ifndef HAVE_STRLCAT + +#include +#include + +/* + * Appends src to the dstsize buffer at dst. The append will never + * overflow the destination buffer and the buffer will always be null + * terminated. Never reference beyond &dst[dstsize-1] when computing + * the length of the pre-existing string. + */ + +size_t +strlcat(char *dst, const char *src, size_t dstsize) +{ + char *df = dst; + size_t left = dstsize; + size_t l1; + size_t l2 = strlen(src); + size_t copied; + + while (left-- != 0 && *df != '\0') + df++; + l1 = df - dst; + if (dstsize == l1) + return (l1 + l2); + + copied = l1 + l2 >= dstsize ? dstsize - l1 - 1 : l2; + (void) memcpy(dst + l1, src, copied); + dst[l1+copied] = '\0'; + + return (l1 + l2); +} + +#endif diff --git a/lib/libspl/strlcpy.c b/lib/libspl/strlcpy.c new file mode 100644 index 000000000000..d483b91f6121 --- /dev/null +++ b/lib/libspl/strlcpy.c @@ -0,0 +1,56 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef HAVE_STRLCPY + +#include +#include + +/* + * Copies src to the dstsize buffer at dst. 
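A point worth calling out: like the BSD originals, this fallback returns the length the concatenated string would have had, so callers detect truncation by comparing the return value against the buffer size. For example (illustrative):

    #include <stdio.h>
    #include <string.h>

    /* Sketch: detect truncation from strlcat()'s return value. */
    static void
    join_opts(char *buf, size_t sz)
    {
            buf[0] = '\0';
            if (strlcat(buf, "rw", sz) >= sz ||
                strlcat(buf, ",noatime", sz) >= sz)
                    (void) fprintf(stderr, "mount options truncated\n");
    }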
The copy will never + * overflow the destination buffer and the buffer will always be null + * terminated. + */ + +size_t +strlcpy(char *dst, const char *src, size_t len) +{ + size_t slen = strlen(src); + size_t copied; + + if (len == 0) + return (slen); + + if (slen >= len) + copied = len - 1; + else + copied = slen; + (void) memcpy(dst, src, copied); + dst[copied] = '\0'; + return (slen); +} + +#endif /* HAVE_STRLCPY */ diff --git a/lib/libspl/timestamp.c b/lib/libspl/timestamp.c new file mode 100644 index 000000000000..22ecb3940739 --- /dev/null +++ b/lib/libspl/timestamp.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include "statcommon.h" + +#ifndef _DATE_FMT +#ifdef D_T_FMT +#define _DATE_FMT D_T_FMT +#else /* D_T_FMT */ +#define _DATE_FMT "%+" +#endif /* !D_T_FMT */ +#endif /* _DATE_FMT */ + +/* + * Print timestamp as decimal reprentation of time_t value (-T u was specified) + * or in date(1) format (-T d was specified). + */ +void +print_timestamp(uint_t timestamp_fmt) +{ + time_t t = time(NULL); + static char *fmt = NULL; + + /* We only need to retrieve this once per invocation */ + if (fmt == NULL) + fmt = nl_langinfo(_DATE_FMT); + + if (timestamp_fmt == UDATE) { + (void) printf("%lld\n", (longlong_t)t); + } else if (timestamp_fmt == DDATE) { + char dstr[64]; + int len; + + len = strftime(dstr, sizeof (dstr), fmt, localtime(&t)); + if (len > 0) + (void) printf("%s\n", dstr); + } +} diff --git a/lib/libspl/zone.c b/lib/libspl/zone.c new file mode 100644 index 000000000000..5ca93b224d9e --- /dev/null +++ b/lib/libspl/zone.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2006 Ricardo Correia. All rights reserved. + * Use is subject to license terms. 
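print_timestamp() in timestamp.c leans on nl_langinfo(3) so that "-T d" output follows the current locale. The same technique in a self-contained program (UDATE and DDATE themselves live in the private statcommon.h header, so this sketch uses the standard D_T_FMT directly):

    #include <langinfo.h>
    #include <locale.h>
    #include <stdio.h>
    #include <time.h>

    /* Sketch: locale-aware date stamp, as used for "-T d" output. */
    int
    main(void)
    {
            char dstr[64];
            time_t t = time(NULL);

            (void) setlocale(LC_ALL, "");
            if (strftime(dstr, sizeof (dstr), nl_langinfo(D_T_FMT),
                localtime(&t)) > 0)
                    (void) printf("%s\n", dstr);
            return (0);
    }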
+ */ + +#include +#include +#include + +zoneid_t +getzoneid() +{ + return (GLOBAL_ZONEID); +} + +zoneid_t +getzoneidbyname(const char *name) +{ + if (name == NULL) + return (GLOBAL_ZONEID); + + if (strcmp(name, GLOBAL_ZONEID_NAME) == 0) + return (GLOBAL_ZONEID); + + return (EINVAL); +} + +ssize_t +getzonenamebyid(zoneid_t id, char *buf, size_t buflen) +{ + if (id != GLOBAL_ZONEID) + return (EINVAL); + + ssize_t ret = strlen(GLOBAL_ZONEID_NAME) + 1; + + if (buf == NULL || buflen == 0) + return (ret); + + strncpy(buf, GLOBAL_ZONEID_NAME, buflen); + buf[buflen - 1] = '\0'; + + return (ret); +} diff --git a/lib/libtpool/Makefile.am b/lib/libtpool/Makefile.am new file mode 100644 index 000000000000..22bfa4b23a8f --- /dev/null +++ b/lib/libtpool/Makefile.am @@ -0,0 +1,9 @@ +include $(top_srcdir)/config/Rules.am + +noinst_LTLIBRARIES = libtpool.la + +USER_C = \ + thread_pool.c \ + thread_pool_impl.h + +libtpool_la_SOURCES = $(USER_C) diff --git a/lib/libtpool/thread_pool.c b/lib/libtpool/thread_pool.c new file mode 100644 index 000000000000..892beeffa5ae --- /dev/null +++ b/lib/libtpool/thread_pool.c @@ -0,0 +1,599 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include "thread_pool_impl.h" + +static pthread_mutex_t thread_pool_lock = PTHREAD_MUTEX_INITIALIZER; +static tpool_t *thread_pools = NULL; + +static void +delete_pool(tpool_t *tpool) +{ + tpool_job_t *job; + + ASSERT(tpool->tp_current == 0 && tpool->tp_active == NULL); + + /* + * Unlink the pool from the global list of all pools. + */ + (void) pthread_mutex_lock(&thread_pool_lock); + if (thread_pools == tpool) + thread_pools = tpool->tp_forw; + if (thread_pools == tpool) + thread_pools = NULL; + else { + tpool->tp_back->tp_forw = tpool->tp_forw; + tpool->tp_forw->tp_back = tpool->tp_back; + } + pthread_mutex_unlock(&thread_pool_lock); + + /* + * There should be no pending jobs, but just in case... + */ + for (job = tpool->tp_head; job != NULL; job = tpool->tp_head) { + tpool->tp_head = job->tpj_next; + free(job); + } + (void) pthread_attr_destroy(&tpool->tp_attr); + free(tpool); +} + +/* + * Worker thread is terminating. 
+ */ +static void +worker_cleanup(void *arg) +{ + tpool_t *tpool = (tpool_t *)arg; + + if (--tpool->tp_current == 0 && + (tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) { + if (tpool->tp_flags & TP_ABANDON) { + pthread_mutex_unlock(&tpool->tp_mutex); + delete_pool(tpool); + return; + } + if (tpool->tp_flags & TP_DESTROY) + (void) pthread_cond_broadcast(&tpool->tp_busycv); + } + pthread_mutex_unlock(&tpool->tp_mutex); +} + +static void +notify_waiters(tpool_t *tpool) +{ + if (tpool->tp_head == NULL && tpool->tp_active == NULL) { + tpool->tp_flags &= ~TP_WAIT; + (void) pthread_cond_broadcast(&tpool->tp_waitcv); + } +} + +/* + * Called by a worker thread on return from a tpool_dispatch()d job. + */ +static void +job_cleanup(void *arg) +{ + tpool_t *tpool = (tpool_t *)arg; + + pthread_t my_tid = pthread_self(); + tpool_active_t *activep; + tpool_active_t **activepp; + + pthread_mutex_lock(&tpool->tp_mutex); + /* CSTYLED */ + for (activepp = &tpool->tp_active;; activepp = &activep->tpa_next) { + activep = *activepp; + if (activep->tpa_tid == my_tid) { + *activepp = activep->tpa_next; + break; + } + } + if (tpool->tp_flags & TP_WAIT) + notify_waiters(tpool); +} + +static void * +tpool_worker(void *arg) +{ + tpool_t *tpool = (tpool_t *)arg; + int elapsed; + tpool_job_t *job; + void (*func)(void *); + tpool_active_t active; + + pthread_mutex_lock(&tpool->tp_mutex); + pthread_cleanup_push(worker_cleanup, tpool); + + /* + * This is the worker's main loop. + * It will only be left if a timeout or an error has occurred. + */ + active.tpa_tid = pthread_self(); + for (;;) { + elapsed = 0; + tpool->tp_idle++; + if (tpool->tp_flags & TP_WAIT) + notify_waiters(tpool); + while ((tpool->tp_head == NULL || + (tpool->tp_flags & TP_SUSPEND)) && + !(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))) { + if (tpool->tp_current <= tpool->tp_minimum || + tpool->tp_linger == 0) { + (void) pthread_cond_wait(&tpool->tp_workcv, + &tpool->tp_mutex); + } else { + struct timespec ts; + + clock_gettime(CLOCK_REALTIME, &ts); + ts.tv_sec += tpool->tp_linger; + + if (pthread_cond_timedwait(&tpool->tp_workcv, + &tpool->tp_mutex, &ts) != 0) { + elapsed = 1; + break; + } + } + } + tpool->tp_idle--; + if (tpool->tp_flags & TP_DESTROY) + break; + if (tpool->tp_flags & TP_ABANDON) { + /* can't abandon a suspended pool */ + if (tpool->tp_flags & TP_SUSPEND) { + tpool->tp_flags &= ~TP_SUSPEND; + (void) pthread_cond_broadcast( + &tpool->tp_workcv); + } + if (tpool->tp_head == NULL) + break; + } + if ((job = tpool->tp_head) != NULL && + !(tpool->tp_flags & TP_SUSPEND)) { + elapsed = 0; + func = job->tpj_func; + arg = job->tpj_arg; + tpool->tp_head = job->tpj_next; + if (job == tpool->tp_tail) + tpool->tp_tail = NULL; + tpool->tp_njobs--; + active.tpa_next = tpool->tp_active; + tpool->tp_active = &active; + pthread_mutex_unlock(&tpool->tp_mutex); + pthread_cleanup_push(job_cleanup, tpool); + free(job); + + sigset_t maskset; + (void) pthread_sigmask(SIG_SETMASK, NULL, &maskset); + + /* + * Call the specified function. + */ + func(arg); + /* + * We don't know what this thread has been doing, + * so we reset its signal mask and cancellation + * state back to the values prior to calling func(). 
+ */ + (void) pthread_sigmask(SIG_SETMASK, &maskset, NULL); + (void) pthread_setcanceltype(PTHREAD_CANCEL_DEFERRED, + NULL); + (void) pthread_setcancelstate(PTHREAD_CANCEL_ENABLE, + NULL); + pthread_cleanup_pop(1); + } + if (elapsed && tpool->tp_current > tpool->tp_minimum) { + /* + * We timed out and there is no work to be done + * and the number of workers exceeds the minimum. + * Exit now to reduce the size of the pool. + */ + break; + } + } + pthread_cleanup_pop(1); + return (arg); +} + +/* + * Create a worker thread, with default signals blocked. + */ +static int +create_worker(tpool_t *tpool) +{ + pthread_t thread; + sigset_t oset; + int error; + + (void) pthread_sigmask(SIG_SETMASK, NULL, &oset); + error = pthread_create(&thread, &tpool->tp_attr, tpool_worker, tpool); + (void) pthread_sigmask(SIG_SETMASK, &oset, NULL); + return (error); +} + + +/* + * pthread_attr_clone: make a copy of a pthread_attr_t. When old_attr + * is NULL initialize the cloned attr using default values. + */ +static int +pthread_attr_clone(pthread_attr_t *attr, const pthread_attr_t *old_attr) +{ + int error; + + error = pthread_attr_init(attr); + if (error || (old_attr == NULL)) + return (error); + +#ifdef __GLIBC__ + cpu_set_t cpuset; + size_t cpusetsize = sizeof (cpuset); + error = pthread_attr_getaffinity_np(old_attr, cpusetsize, &cpuset); + if (error == 0) + error = pthread_attr_setaffinity_np(attr, cpusetsize, &cpuset); + if (error) + goto error; +#endif /* __GLIBC__ */ + + int detachstate; + error = pthread_attr_getdetachstate(old_attr, &detachstate); + if (error == 0) + error = pthread_attr_setdetachstate(attr, detachstate); + if (error) + goto error; + + size_t guardsize; + error = pthread_attr_getguardsize(old_attr, &guardsize); + if (error == 0) + error = pthread_attr_setguardsize(attr, guardsize); + if (error) + goto error; + + int inheritsched; + error = pthread_attr_getinheritsched(old_attr, &inheritsched); + if (error == 0) + error = pthread_attr_setinheritsched(attr, inheritsched); + if (error) + goto error; + + struct sched_param param; + error = pthread_attr_getschedparam(old_attr, ¶m); + if (error == 0) + error = pthread_attr_setschedparam(attr, ¶m); + if (error) + goto error; + + int policy; + error = pthread_attr_getschedpolicy(old_attr, &policy); + if (error == 0) + error = pthread_attr_setschedpolicy(attr, policy); + if (error) + goto error; + + int scope; + error = pthread_attr_getscope(old_attr, &scope); + if (error == 0) + error = pthread_attr_setscope(attr, scope); + if (error) + goto error; + + void *stackaddr; + size_t stacksize; + error = pthread_attr_getstack(old_attr, &stackaddr, &stacksize); + if (error == 0) + error = pthread_attr_setstack(attr, stackaddr, stacksize); + if (error) + goto error; + + return (0); +error: + pthread_attr_destroy(attr); + return (error); +} + +tpool_t * +tpool_create(uint_t min_threads, uint_t max_threads, uint_t linger, + pthread_attr_t *attr) +{ + tpool_t *tpool; + void *stackaddr; + size_t stacksize; + size_t minstack; + int error; + + if (min_threads > max_threads || max_threads < 1) { + errno = EINVAL; + return (NULL); + } + if (attr != NULL) { + if (pthread_attr_getstack(attr, &stackaddr, &stacksize) != 0) { + errno = EINVAL; + return (NULL); + } + /* + * Allow only one thread in the pool with a specified stack. + * Require threads to have at least the minimum stack size. 
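Because tpool_create() clones the caller's attributes (subject to the stack checks above), the caller may destroy its pthread_attr_t as soon as the pool exists. A sketch of requesting a larger worker stack, assuming <thread_pool.h> is the library's public header:

    #include <pthread.h>
    #include <thread_pool.h>

    /* Sketch: pool whose workers get a 512 KiB stack. */
    static tpool_t *
    create_big_stack_pool(void)
    {
            pthread_attr_t attr;
            tpool_t *tp;

            (void) pthread_attr_init(&attr);
            (void) pthread_attr_setstacksize(&attr, 512 * 1024);
            tp = tpool_create(1, 4, 10, &attr);
            (void) pthread_attr_destroy(&attr);     /* pool keeps a clone */
            return (tp);
    }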
+ */ + minstack = PTHREAD_STACK_MIN; + if (stackaddr != NULL) { + if (stacksize < minstack || max_threads != 1) { + errno = EINVAL; + return (NULL); + } + } else if (stacksize != 0 && stacksize < minstack) { + errno = EINVAL; + return (NULL); + } + } + + tpool = calloc(1, sizeof (*tpool)); + if (tpool == NULL) { + errno = ENOMEM; + return (NULL); + } + (void) pthread_mutex_init(&tpool->tp_mutex, NULL); + (void) pthread_cond_init(&tpool->tp_busycv, NULL); + (void) pthread_cond_init(&tpool->tp_workcv, NULL); + (void) pthread_cond_init(&tpool->tp_waitcv, NULL); + tpool->tp_minimum = min_threads; + tpool->tp_maximum = max_threads; + tpool->tp_linger = linger; + + /* + * We cannot just copy the attribute pointer. + * We need to initialize a new pthread_attr_t structure + * with the values from the user-supplied pthread_attr_t. + * If the attribute pointer is NULL, we need to initialize + * the new pthread_attr_t structure with default values. + */ + error = pthread_attr_clone(&tpool->tp_attr, attr); + if (error) { + free(tpool); + errno = error; + return (NULL); + } + + /* make all pool threads be detached daemon threads */ + (void) pthread_attr_setdetachstate(&tpool->tp_attr, + PTHREAD_CREATE_DETACHED); + + /* insert into the global list of all thread pools */ + pthread_mutex_lock(&thread_pool_lock); + if (thread_pools == NULL) { + tpool->tp_forw = tpool; + tpool->tp_back = tpool; + thread_pools = tpool; + } else { + thread_pools->tp_back->tp_forw = tpool; + tpool->tp_forw = thread_pools; + tpool->tp_back = thread_pools->tp_back; + thread_pools->tp_back = tpool; + } + pthread_mutex_unlock(&thread_pool_lock); + + return (tpool); +} + +/* + * Dispatch a work request to the thread pool. + * If there are idle workers, awaken one. + * Else, if the maximum number of workers has + * not been reached, spawn a new worker thread. + * Else just return with the job added to the queue. + */ +int +tpool_dispatch(tpool_t *tpool, void (*func)(void *), void *arg) +{ + tpool_job_t *job; + + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + if ((job = calloc(1, sizeof (*job))) == NULL) + return (-1); + job->tpj_next = NULL; + job->tpj_func = func; + job->tpj_arg = arg; + + pthread_mutex_lock(&tpool->tp_mutex); + + if (tpool->tp_head == NULL) + tpool->tp_head = job; + else + tpool->tp_tail->tpj_next = job; + tpool->tp_tail = job; + tpool->tp_njobs++; + + if (!(tpool->tp_flags & TP_SUSPEND)) { + if (tpool->tp_idle > 0) + (void) pthread_cond_signal(&tpool->tp_workcv); + else if (tpool->tp_current < tpool->tp_maximum && + create_worker(tpool) == 0) + tpool->tp_current++; + } + + pthread_mutex_unlock(&tpool->tp_mutex); + return (0); +} + +static void +tpool_cleanup(void *arg) +{ + tpool_t *tpool = (tpool_t *)arg; + + pthread_mutex_unlock(&tpool->tp_mutex); +} + +/* + * Assumes: by the time tpool_destroy() is called no one will use this + * thread pool in any way and no one will try to dispatch entries to it. + * Calling tpool_destroy() from a job in the pool will cause deadlock. 
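Taken together, the create/dispatch machinery supports the usual producer pattern. A minimal end-to-end consumer, again assuming <thread_pool.h> declares the public API implemented in this file:

    #include <stdint.h>
    #include <stdio.h>
    #include <thread_pool.h>

    static void
    work(void *arg)
    {
            (void) printf("job %ld\n", (long)(uintptr_t)arg);
    }

    int
    main(void)
    {
            /* 1-4 workers; idle workers linger for 10 seconds */
            tpool_t *tp = tpool_create(1, 4, 10, NULL);

            if (tp == NULL)
                    return (1);
            for (long i = 0; i < 8; i++)
                    (void) tpool_dispatch(tp, work, (void *)(uintptr_t)i);
            tpool_wait(tp);         /* block until the queue drains */
            tpool_destroy(tp);
            return (0);
    }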
+ */ +void +tpool_destroy(tpool_t *tpool) +{ + tpool_active_t *activep; + + ASSERT(!tpool_member(tpool)); + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + pthread_cleanup_push(tpool_cleanup, tpool); + + /* mark the pool as being destroyed; wakeup idle workers */ + tpool->tp_flags |= TP_DESTROY; + tpool->tp_flags &= ~TP_SUSPEND; + (void) pthread_cond_broadcast(&tpool->tp_workcv); + + /* cancel all active workers */ + for (activep = tpool->tp_active; activep; activep = activep->tpa_next) + (void) pthread_cancel(activep->tpa_tid); + + /* wait for all active workers to finish */ + while (tpool->tp_active != NULL) { + tpool->tp_flags |= TP_WAIT; + (void) pthread_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex); + } + + /* the last worker to terminate will wake us up */ + while (tpool->tp_current != 0) + (void) pthread_cond_wait(&tpool->tp_busycv, &tpool->tp_mutex); + + pthread_cleanup_pop(1); /* pthread_mutex_unlock(&tpool->tp_mutex); */ + delete_pool(tpool); +} + +/* + * Like tpool_destroy(), but don't cancel workers or wait for them to finish. + * The last worker to terminate will delete the pool. + */ +void +tpool_abandon(tpool_t *tpool) +{ + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + if (tpool->tp_current == 0) { + /* no workers, just delete the pool */ + pthread_mutex_unlock(&tpool->tp_mutex); + delete_pool(tpool); + } else { + /* wake up all workers, last one will delete the pool */ + tpool->tp_flags |= TP_ABANDON; + tpool->tp_flags &= ~TP_SUSPEND; + (void) pthread_cond_broadcast(&tpool->tp_workcv); + pthread_mutex_unlock(&tpool->tp_mutex); + } +} + +/* + * Wait for all jobs to complete. + * Calling tpool_wait() from a job in the pool will cause deadlock. 
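The contrast between the two teardown paths is easiest to see side by side: tpool_destroy() cancels active jobs and blocks until every worker exits, while tpool_abandon() lets queued work drain and makes the last worker free the pool itself. A sketch of the latter (fini and arg are whatever the caller last queued):

    /* Sketch: fire-and-forget shutdown. Pending jobs still run; the
     * last worker to exit deletes the pool, so tp must not be used
     * after tpool_abandon() returns. */
    static void
    shutdown_async(tpool_t *tp, void (*fini)(void *), void *arg)
    {
            (void) tpool_dispatch(tp, fini, arg);
            tpool_abandon(tp);
    }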
+ */ +void +tpool_wait(tpool_t *tpool) +{ + ASSERT(!tpool_member(tpool)); + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + pthread_cleanup_push(tpool_cleanup, tpool); + while (tpool->tp_head != NULL || tpool->tp_active != NULL) { + tpool->tp_flags |= TP_WAIT; + (void) pthread_cond_wait(&tpool->tp_waitcv, &tpool->tp_mutex); + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + } + pthread_cleanup_pop(1); /* pthread_mutex_unlock(&tpool->tp_mutex); */ +} + +void +tpool_suspend(tpool_t *tpool) +{ + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + tpool->tp_flags |= TP_SUSPEND; + pthread_mutex_unlock(&tpool->tp_mutex); +} + +int +tpool_suspended(tpool_t *tpool) +{ + int suspended; + + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + suspended = (tpool->tp_flags & TP_SUSPEND) != 0; + pthread_mutex_unlock(&tpool->tp_mutex); + + return (suspended); +} + +void +tpool_resume(tpool_t *tpool) +{ + int excess; + + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + if (!(tpool->tp_flags & TP_SUSPEND)) { + pthread_mutex_unlock(&tpool->tp_mutex); + return; + } + tpool->tp_flags &= ~TP_SUSPEND; + (void) pthread_cond_broadcast(&tpool->tp_workcv); + excess = tpool->tp_njobs - tpool->tp_idle; + while (excess-- > 0 && tpool->tp_current < tpool->tp_maximum) { + if (create_worker(tpool) != 0) + break; /* pthread_create() failed */ + tpool->tp_current++; + } + pthread_mutex_unlock(&tpool->tp_mutex); +} + +int +tpool_member(tpool_t *tpool) +{ + pthread_t my_tid = pthread_self(); + tpool_active_t *activep; + + ASSERT(!(tpool->tp_flags & (TP_DESTROY | TP_ABANDON))); + + pthread_mutex_lock(&tpool->tp_mutex); + for (activep = tpool->tp_active; activep; activep = activep->tpa_next) { + if (activep->tpa_tid == my_tid) { + pthread_mutex_unlock(&tpool->tp_mutex); + return (1); + } + } + pthread_mutex_unlock(&tpool->tp_mutex); + return (0); +} diff --git a/lib/libtpool/thread_pool_impl.h b/lib/libtpool/thread_pool_impl.h new file mode 100644 index 000000000000..5349c2ade8c3 --- /dev/null +++ b/lib/libtpool/thread_pool_impl.h @@ -0,0 +1,93 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _THREAD_POOL_IMPL_H +#define _THREAD_POOL_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Thread pool implementation definitions. + * See for interface declarations. 
+ */ + +/* + * FIFO queued job + */ +typedef struct tpool_job tpool_job_t; +struct tpool_job { + tpool_job_t *tpj_next; /* list of jobs */ + void (*tpj_func)(void *); /* function to call */ + void *tpj_arg; /* its argument */ +}; + +/* + * List of active threads, linked through their stacks. + */ +typedef struct tpool_active tpool_active_t; +struct tpool_active { + tpool_active_t *tpa_next; /* list of active threads */ + pthread_t tpa_tid; /* active thread id */ +}; + +/* + * The thread pool. + */ +struct tpool { + tpool_t *tp_forw; /* circular list of all thread pools */ + tpool_t *tp_back; + pthread_mutex_t tp_mutex; /* protects the pool data */ + pthread_cond_t tp_busycv; /* synchronization in tpool_dispatch */ + pthread_cond_t tp_workcv; /* synchronization with workers */ + pthread_cond_t tp_waitcv; /* synchronization in tpool_wait() */ + tpool_active_t *tp_active; /* threads performing work */ + tpool_job_t *tp_head; /* FIFO job queue */ + tpool_job_t *tp_tail; + pthread_attr_t tp_attr; /* attributes of the workers */ + int tp_flags; /* see below */ + uint_t tp_linger; /* seconds before idle workers exit */ + int tp_njobs; /* number of jobs in job queue */ + int tp_minimum; /* minimum number of worker threads */ + int tp_maximum; /* maximum number of worker threads */ + int tp_current; /* current number of worker threads */ + int tp_idle; /* number of idle workers */ +}; + +/* tp_flags */ +#define TP_WAIT 0x01 /* waiting in tpool_wait() */ +#define TP_SUSPEND 0x02 /* pool is being suspended */ +#define TP_DESTROY 0x04 /* pool is being destroyed */ +#define TP_ABANDON 0x08 /* pool is abandoned (auto-destroy) */ + +#ifdef __cplusplus +} +#endif + +#endif /* _THREAD_POOL_IMPL_H */ diff --git a/lib/libunicode/Makefile.am b/lib/libunicode/Makefile.am new file mode 100644 index 000000000000..fa9dd359d2ba --- /dev/null +++ b/lib/libunicode/Makefile.am @@ -0,0 +1,15 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = $(top_srcdir)/module/unicode + +# Includes kernel code, generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +noinst_LTLIBRARIES = libunicode.la + +KERNEL_C = \ + u8_textprep.c \ + uconv.c + +nodist_libunicode_la_SOURCES = \ + $(KERNEL_C) diff --git a/lib/libuutil/Makefile.am b/lib/libuutil/Makefile.am new file mode 100644 index 000000000000..8d9b32e44802 --- /dev/null +++ b/lib/libuutil/Makefile.am @@ -0,0 +1,34 @@ +include $(top_srcdir)/config/Rules.am + +lib_LTLIBRARIES = libuutil.la + +USER_C = \ + uu_alloc.c \ + uu_avl.c \ + uu_dprintf.c \ + uu_ident.c \ + uu_list.c \ + uu_misc.c \ + uu_open.c \ + uu_pname.c \ + uu_string.c + +libuutil_la_SOURCES = $(USER_C) + +libuutil_la_LIBADD = \ + $(abs_top_builddir)/lib/libavl/libavl.la \ + $(abs_top_builddir)/lib/libspl/libspl.la + +libuutil_la_LIBADD += $(LTLIBINTL) + +libuutil_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libuutil_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libuutil_la_LDFLAGS += -version-info 3:0:0 +else +libuutil_la_LDFLAGS += -version-info 1:1:0 +endif diff --git a/lib/libuutil/uu_alloc.c b/lib/libuutil/uu_alloc.c new file mode 100644 index 000000000000..2bef759d525e --- /dev/null +++ b/lib/libuutil/uu_alloc.c @@ -0,0 +1,135 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "libuutil_common.h" + +#include +#include +#include +#include + +void * +uu_zalloc(size_t n) +{ + void *p = malloc(n); + + if (p == NULL) { + uu_set_error(UU_ERROR_SYSTEM); + return (NULL); + } + + (void) memset(p, 0, n); + + return (p); +} + +void +uu_free(void *p) +{ + free(p); +} + +char * +uu_strdup(const char *str) +{ + char *buf = NULL; + + if (str != NULL) { + size_t sz; + + sz = strlen(str) + 1; + buf = uu_zalloc(sz); + if (buf != NULL) + (void) memcpy(buf, str, sz); + } + return (buf); +} + +/* + * Duplicate up to n bytes of a string. Kind of sort of like + * strdup(strlcpy(s, n)). + */ +char * +uu_strndup(const char *s, size_t n) +{ + size_t len; + char *p; + + len = strnlen(s, n); + p = uu_zalloc(len + 1); + if (p == NULL) + return (NULL); + + if (len > 0) + (void) memcpy(p, s, len); + p[len] = '\0'; + + return (p); +} + +/* + * Duplicate a block of memory. Combines malloc with memcpy, much as + * strdup combines malloc, strlen, and strcpy. + */ +void * +uu_memdup(const void *buf, size_t sz) +{ + void *p; + + p = uu_zalloc(sz); + if (p == NULL) + return (NULL); + (void) memcpy(p, buf, sz); + return (p); +} + +char * +uu_msprintf(const char *format, ...) +{ + va_list args; + char attic[1]; + uint_t M, m; + char *b; + + va_start(args, format); + M = vsnprintf(attic, 1, format, args); + va_end(args); + + for (;;) { + m = M; + if ((b = uu_zalloc(m + 1)) == NULL) + return (NULL); + + va_start(args, format); + M = vsnprintf(b, m + 1, format, args); + va_end(args); + + if (M == m) + break; /* sizes match */ + + uu_free(b); + } + + return (b); +} diff --git a/lib/libuutil/uu_avl.c b/lib/libuutil/uu_avl.c new file mode 100644 index 000000000000..040008883aec --- /dev/null +++ b/lib/libuutil/uu_avl.c @@ -0,0 +1,569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
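Of these allocators, uu_msprintf() is the one that benefits from an example: it loops until the vsnprintf() result fits, so the returned buffer is sized exactly, and it is released with uu_free(). A hypothetical caller:

    #include <stdio.h>
    #include <libuutil.h>

    /* Sketch: formatted message with an exact-fit allocation. */
    static void
    report(const char *pool, int errs)
    {
            char *msg = uu_msprintf("pool '%s': %d errors\n", pool, errs);

            if (msg != NULL) {
                    (void) fputs(msg, stderr);
                    uu_free(msg);
            }
    }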
+ */ + + + +#include "libuutil_common.h" + +#include +#include +#include +#include + +static uu_avl_pool_t uu_null_apool = { &uu_null_apool, &uu_null_apool }; +static pthread_mutex_t uu_apool_list_lock = PTHREAD_MUTEX_INITIALIZER; + +/* + * The index mark change on every insert and delete, to catch stale + * references. + * + * We leave the low bit alone, since the avl code uses it. + */ +#define INDEX_MAX (sizeof (uintptr_t) - 2) +#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 2 : ((m) + 2) & INDEX_MAX) + +#define INDEX_DECODE(i) ((i) & ~INDEX_MAX) +#define INDEX_ENCODE(p, n) (((n) & ~INDEX_MAX) | (p)->ua_index) +#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ua_index) +#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0) + +/* + * When an element is inactive (not in a tree), we keep a marked pointer to + * its containing pool in its first word, and a NULL pointer in its second. + * + * On insert, we use these to verify that it comes from the correct pool. + */ +#define NODE_ARRAY(p, n) ((uintptr_t *)((uintptr_t)(n) + \ + (pp)->uap_nodeoffset)) + +#define POOL_TO_MARKER(pp) (((uintptr_t)(pp) | 1)) + +#define DEAD_MARKER 0xc4 + +uu_avl_pool_t * +uu_avl_pool_create(const char *name, size_t objsize, size_t nodeoffset, + uu_compare_fn_t *compare_func, uint32_t flags) +{ + uu_avl_pool_t *pp, *next, *prev; + + if (name == NULL || + uu_check_name(name, UU_NAME_DOMAIN) == -1 || + nodeoffset + sizeof (uu_avl_node_t) > objsize || + compare_func == NULL) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (NULL); + } + + if (flags & ~UU_AVL_POOL_DEBUG) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + pp = uu_zalloc(sizeof (uu_avl_pool_t)); + if (pp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + (void) strlcpy(pp->uap_name, name, sizeof (pp->uap_name)); + pp->uap_nodeoffset = nodeoffset; + pp->uap_objsize = objsize; + pp->uap_cmp = compare_func; + if (flags & UU_AVL_POOL_DEBUG) + pp->uap_debug = 1; + pp->uap_last_index = 0; + + (void) pthread_mutex_init(&pp->uap_lock, NULL); + + pp->uap_null_avl.ua_next_enc = UU_PTR_ENCODE(&pp->uap_null_avl); + pp->uap_null_avl.ua_prev_enc = UU_PTR_ENCODE(&pp->uap_null_avl); + + (void) pthread_mutex_lock(&uu_apool_list_lock); + pp->uap_next = next = &uu_null_apool; + pp->uap_prev = prev = next->uap_prev; + next->uap_prev = pp; + prev->uap_next = pp; + (void) pthread_mutex_unlock(&uu_apool_list_lock); + + return (pp); +} + +void +uu_avl_pool_destroy(uu_avl_pool_t *pp) +{ + if (pp->uap_debug) { + if (pp->uap_null_avl.ua_next_enc != + UU_PTR_ENCODE(&pp->uap_null_avl) || + pp->uap_null_avl.ua_prev_enc != + UU_PTR_ENCODE(&pp->uap_null_avl)) { + uu_panic("uu_avl_pool_destroy: Pool \"%.*s\" (%p) has " + "outstanding avls, or is corrupt.\n", + (int)sizeof (pp->uap_name), pp->uap_name, + (void *)pp); + } + } + (void) pthread_mutex_lock(&uu_apool_list_lock); + pp->uap_next->uap_prev = pp->uap_prev; + pp->uap_prev->uap_next = pp->uap_next; + (void) pthread_mutex_unlock(&uu_apool_list_lock); + pp->uap_prev = NULL; + pp->uap_next = NULL; + uu_free(pp); +} + +void +uu_avl_node_init(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp) +{ + uintptr_t *na = (uintptr_t *)np; + + if (pp->uap_debug) { + uintptr_t offset = (uintptr_t)np - (uintptr_t)base; + if (offset + sizeof (*np) > pp->uap_objsize) { + uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't fit in object (size %ld)\n", + base, (void *)np, (void *)pp, pp->uap_name, + (long)offset, (long)pp->uap_objsize); + } + if (offset != pp->uap_nodeoffset) { + 
uu_panic("uu_avl_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't match pool's offset (%ld)\n", + base, (void *)np, (void *)pp, pp->uap_name, + (long)offset, (long)pp->uap_objsize); + } + } + + na[0] = POOL_TO_MARKER(pp); + na[1] = 0; +} + +void +uu_avl_node_fini(void *base, uu_avl_node_t *np, uu_avl_pool_t *pp) +{ + uintptr_t *na = (uintptr_t *)np; + + if (pp->uap_debug) { + if (na[0] == DEAD_MARKER && na[1] == DEAD_MARKER) { + uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): " + "node already finied\n", + base, (void *)np, (void *)pp, pp->uap_name); + } + if (na[0] != POOL_TO_MARKER(pp) || na[1] != 0) { + uu_panic("uu_avl_node_fini(%p, %p, %p (\"%s\")): " + "node corrupt, in tree, or in different pool\n", + base, (void *)np, (void *)pp, pp->uap_name); + } + } + + na[0] = DEAD_MARKER; + na[1] = DEAD_MARKER; + na[2] = DEAD_MARKER; +} + +struct uu_avl_node_compare_info { + uu_compare_fn_t *ac_compare; + void *ac_private; + void *ac_right; + void *ac_found; +}; + +static int +uu_avl_node_compare(const void *l, const void *r) +{ + struct uu_avl_node_compare_info *info = + (struct uu_avl_node_compare_info *)l; + + int res = info->ac_compare(r, info->ac_right, info->ac_private); + + if (res == 0) { + if (info->ac_found == NULL) + info->ac_found = (void *)r; + return (-1); + } + if (res < 0) + return (1); + return (-1); +} + +uu_avl_t * +uu_avl_create(uu_avl_pool_t *pp, void *parent, uint32_t flags) +{ + uu_avl_t *ap, *next, *prev; + + if (flags & ~UU_AVL_DEBUG) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + ap = uu_zalloc(sizeof (*ap)); + if (ap == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + ap->ua_pool = pp; + ap->ua_parent_enc = UU_PTR_ENCODE(parent); + ap->ua_debug = pp->uap_debug || (flags & UU_AVL_DEBUG); + ap->ua_index = (pp->uap_last_index = INDEX_NEXT(pp->uap_last_index)); + + avl_create(&ap->ua_tree, &uu_avl_node_compare, pp->uap_objsize, + pp->uap_nodeoffset); + + ap->ua_null_walk.uaw_next = &ap->ua_null_walk; + ap->ua_null_walk.uaw_prev = &ap->ua_null_walk; + + (void) pthread_mutex_lock(&pp->uap_lock); + next = &pp->uap_null_avl; + prev = UU_PTR_DECODE(next->ua_prev_enc); + ap->ua_next_enc = UU_PTR_ENCODE(next); + ap->ua_prev_enc = UU_PTR_ENCODE(prev); + next->ua_prev_enc = UU_PTR_ENCODE(ap); + prev->ua_next_enc = UU_PTR_ENCODE(ap); + (void) pthread_mutex_unlock(&pp->uap_lock); + + return (ap); +} + +void +uu_avl_destroy(uu_avl_t *ap) +{ + uu_avl_pool_t *pp = ap->ua_pool; + + if (ap->ua_debug) { + if (avl_numnodes(&ap->ua_tree) != 0) { + uu_panic("uu_avl_destroy(%p): tree not empty\n", + (void *)ap); + } + if (ap->ua_null_walk.uaw_next != &ap->ua_null_walk || + ap->ua_null_walk.uaw_prev != &ap->ua_null_walk) { + uu_panic("uu_avl_destroy(%p): outstanding walkers\n", + (void *)ap); + } + } + (void) pthread_mutex_lock(&pp->uap_lock); + UU_AVL_PTR(ap->ua_next_enc)->ua_prev_enc = ap->ua_prev_enc; + UU_AVL_PTR(ap->ua_prev_enc)->ua_next_enc = ap->ua_next_enc; + (void) pthread_mutex_unlock(&pp->uap_lock); + ap->ua_prev_enc = UU_PTR_ENCODE(NULL); + ap->ua_next_enc = UU_PTR_ENCODE(NULL); + + ap->ua_pool = NULL; + avl_destroy(&ap->ua_tree); + + uu_free(ap); +} + +size_t +uu_avl_numnodes(uu_avl_t *ap) +{ + return (avl_numnodes(&ap->ua_tree)); +} + +void * +uu_avl_first(uu_avl_t *ap) +{ + return (avl_first(&ap->ua_tree)); +} + +void * +uu_avl_last(uu_avl_t *ap) +{ + return (avl_last(&ap->ua_tree)); +} + +void * +uu_avl_next(uu_avl_t *ap, void *node) +{ + return (AVL_NEXT(&ap->ua_tree, node)); +} + +void * +uu_avl_prev(uu_avl_t *ap, void *node) +{ 
+ return (AVL_PREV(&ap->ua_tree, node)); +} + +static void +_avl_walk_init(uu_avl_walk_t *wp, uu_avl_t *ap, uint32_t flags) +{ + uu_avl_walk_t *next, *prev; + + int robust = (flags & UU_WALK_ROBUST); + int direction = (flags & UU_WALK_REVERSE)? -1 : 1; + + (void) memset(wp, 0, sizeof (*wp)); + wp->uaw_avl = ap; + wp->uaw_robust = robust; + wp->uaw_dir = direction; + + if (direction > 0) + wp->uaw_next_result = avl_first(&ap->ua_tree); + else + wp->uaw_next_result = avl_last(&ap->ua_tree); + + if (ap->ua_debug || robust) { + wp->uaw_next = next = &ap->ua_null_walk; + wp->uaw_prev = prev = next->uaw_prev; + next->uaw_prev = wp; + prev->uaw_next = wp; + } +} + +static void * +_avl_walk_advance(uu_avl_walk_t *wp, uu_avl_t *ap) +{ + void *np = wp->uaw_next_result; + + avl_tree_t *t = &ap->ua_tree; + + if (np == NULL) + return (NULL); + + wp->uaw_next_result = (wp->uaw_dir > 0)? AVL_NEXT(t, np) : + AVL_PREV(t, np); + + return (np); +} + +static void +_avl_walk_fini(uu_avl_walk_t *wp) +{ + if (wp->uaw_next != NULL) { + wp->uaw_next->uaw_prev = wp->uaw_prev; + wp->uaw_prev->uaw_next = wp->uaw_next; + wp->uaw_next = NULL; + wp->uaw_prev = NULL; + } + wp->uaw_avl = NULL; + wp->uaw_next_result = NULL; +} + +uu_avl_walk_t * +uu_avl_walk_start(uu_avl_t *ap, uint32_t flags) +{ + uu_avl_walk_t *wp; + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + wp = uu_zalloc(sizeof (*wp)); + if (wp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + _avl_walk_init(wp, ap, flags); + return (wp); +} + +void * +uu_avl_walk_next(uu_avl_walk_t *wp) +{ + return (_avl_walk_advance(wp, wp->uaw_avl)); +} + +void +uu_avl_walk_end(uu_avl_walk_t *wp) +{ + _avl_walk_fini(wp); + uu_free(wp); +} + +int +uu_avl_walk(uu_avl_t *ap, uu_walk_fn_t *func, void *private, uint32_t flags) +{ + void *e; + uu_avl_walk_t my_walk; + + int status = UU_WALK_NEXT; + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (-1); + } + + _avl_walk_init(&my_walk, ap, flags); + while (status == UU_WALK_NEXT && + (e = _avl_walk_advance(&my_walk, ap)) != NULL) + status = (*func)(e, private); + _avl_walk_fini(&my_walk); + + if (status >= 0) + return (0); + uu_set_error(UU_ERROR_CALLBACK_FAILED); + return (-1); +} + +void +uu_avl_remove(uu_avl_t *ap, void *elem) +{ + uu_avl_walk_t *wp; + uu_avl_pool_t *pp = ap->ua_pool; + uintptr_t *na = NODE_ARRAY(pp, elem); + + if (ap->ua_debug) { + /* + * invalidate outstanding uu_avl_index_ts. + */ + ap->ua_index = INDEX_NEXT(ap->ua_index); + } + + /* + * Robust walkers most be advanced, if we are removing the node + * they are currently using. In debug mode, non-robust walkers + * are also on the walker list. 
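This is the property UU_WALK_ROBUST buys: the walker is advanced past a node before that node is removed, so a callback may delete the element it was just handed. Usage sketch (avl and the is_stale() predicate are hypothetical):

    #include <libuutil.h>

    /* Sketch: delete-safe iteration over a uu_avl_t. */
    static void
    prune_stale(uu_avl_t *avl)
    {
            uu_avl_walk_t *wp;
            void *elem;

            if ((wp = uu_avl_walk_start(avl, UU_WALK_ROBUST)) == NULL)
                    return;
            while ((elem = uu_avl_walk_next(wp)) != NULL) {
                    if (is_stale(elem))     /* hypothetical predicate */
                            uu_avl_remove(avl, elem);
            }
            uu_avl_walk_end(wp);
    }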
+ */ + for (wp = ap->ua_null_walk.uaw_next; wp != &ap->ua_null_walk; + wp = wp->uaw_next) { + if (wp->uaw_robust) { + if (elem == wp->uaw_next_result) + (void) _avl_walk_advance(wp, ap); + } else if (wp->uaw_next_result != NULL) { + uu_panic("uu_avl_remove(%p, %p): active non-robust " + "walker\n", (void *)ap, elem); + } + } + + avl_remove(&ap->ua_tree, elem); + + na[0] = POOL_TO_MARKER(pp); + na[1] = 0; +} + +void * +uu_avl_teardown(uu_avl_t *ap, void **cookie) +{ + void *elem = avl_destroy_nodes(&ap->ua_tree, cookie); + + if (elem != NULL) { + uu_avl_pool_t *pp = ap->ua_pool; + uintptr_t *na = NODE_ARRAY(pp, elem); + + na[0] = POOL_TO_MARKER(pp); + na[1] = 0; + } + return (elem); +} + +void * +uu_avl_find(uu_avl_t *ap, void *elem, void *private, uu_avl_index_t *out) +{ + struct uu_avl_node_compare_info info; + void *result; + + info.ac_compare = ap->ua_pool->uap_cmp; + info.ac_private = private; + info.ac_right = elem; + info.ac_found = NULL; + + result = avl_find(&ap->ua_tree, &info, out); + if (out != NULL) + *out = INDEX_ENCODE(ap, *out); + + if (ap->ua_debug && result != NULL) + uu_panic("uu_avl_find: internal error: avl_find succeeded\n"); + + return (info.ac_found); +} + +void +uu_avl_insert(uu_avl_t *ap, void *elem, uu_avl_index_t idx) +{ + if (ap->ua_debug) { + uu_avl_pool_t *pp = ap->ua_pool; + uintptr_t *na = NODE_ARRAY(pp, elem); + + if (na[1] != 0) + uu_panic("uu_avl_insert(%p, %p, %p): node already " + "in tree, or corrupt\n", + (void *)ap, elem, (void *)idx); + if (na[0] == 0) + uu_panic("uu_avl_insert(%p, %p, %p): node not " + "initialized\n", + (void *)ap, elem, (void *)idx); + if (na[0] != POOL_TO_MARKER(pp)) + uu_panic("uu_avl_insert(%p, %p, %p): node from " + "other pool, or corrupt\n", + (void *)ap, elem, (void *)idx); + + if (!INDEX_VALID(ap, idx)) + uu_panic("uu_avl_insert(%p, %p, %p): %s\n", + (void *)ap, elem, (void *)idx, + INDEX_CHECK(idx)? "outdated index" : + "invalid index"); + + /* + * invalidate outstanding uu_avl_index_ts. + */ + ap->ua_index = INDEX_NEXT(ap->ua_index); + } + avl_insert(&ap->ua_tree, elem, INDEX_DECODE(idx)); +} + +void * +uu_avl_nearest_next(uu_avl_t *ap, uu_avl_index_t idx) +{ + if (ap->ua_debug && !INDEX_VALID(ap, idx)) + uu_panic("uu_avl_nearest_next(%p, %p): %s\n", + (void *)ap, (void *)idx, INDEX_CHECK(idx)? + "outdated index" : "invalid index"); + return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_AFTER)); +} + +void * +uu_avl_nearest_prev(uu_avl_t *ap, uu_avl_index_t idx) +{ + if (ap->ua_debug && !INDEX_VALID(ap, idx)) + uu_panic("uu_avl_nearest_prev(%p, %p): %s\n", + (void *)ap, (void *)idx, INDEX_CHECK(idx)? + "outdated index" : "invalid index"); + return (avl_nearest(&ap->ua_tree, INDEX_DECODE(idx), AVL_BEFORE)); +} + +/* + * called from uu_lockup() and uu_release(), as part of our fork1()-safety. 
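uu_avl_find() and uu_avl_insert() are designed to pair up as a find-or-insert: a failed find yields an encoded index naming the insertion point. Sketch (avl and new_elem are hypothetical; NULL is the compare function's private argument):

    #include <libuutil.h>

    /* Sketch: insert new_elem only if no equal element exists. */
    static void
    add_unique(uu_avl_t *avl, void *new_elem)
    {
            uu_avl_index_t idx;

            if (uu_avl_find(avl, new_elem, NULL, &idx) == NULL)
                    uu_avl_insert(avl, new_elem, idx);
            else
                    uu_free(new_elem);      /* duplicate: discard */
    }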
+ */ +void +uu_avl_lockup(void) +{ + uu_avl_pool_t *pp; + + (void) pthread_mutex_lock(&uu_apool_list_lock); + for (pp = uu_null_apool.uap_next; pp != &uu_null_apool; + pp = pp->uap_next) + (void) pthread_mutex_lock(&pp->uap_lock); +} + +void +uu_avl_release(void) +{ + uu_avl_pool_t *pp; + + for (pp = uu_null_apool.uap_next; pp != &uu_null_apool; + pp = pp->uap_next) + (void) pthread_mutex_unlock(&pp->uap_lock); + (void) pthread_mutex_unlock(&uu_apool_list_lock); +} diff --git a/lib/libuutil/uu_dprintf.c b/lib/libuutil/uu_dprintf.c new file mode 100644 index 000000000000..6958057b29c4 --- /dev/null +++ b/lib/libuutil/uu_dprintf.c @@ -0,0 +1,130 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +#include "libuutil_common.h" + +#include +#include +#include +#include +#include +#include +#include + +#define FACILITY_FMT "%s (%s): " + +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +static const char * +strseverity(uu_dprintf_severity_t severity) +{ + switch (severity) { + case UU_DPRINTF_SILENT: + return (dgettext(TEXT_DOMAIN, "silent")); + case UU_DPRINTF_FATAL: + return (dgettext(TEXT_DOMAIN, "FATAL")); + case UU_DPRINTF_WARNING: + return (dgettext(TEXT_DOMAIN, "WARNING")); + case UU_DPRINTF_NOTICE: + return (dgettext(TEXT_DOMAIN, "note")); + case UU_DPRINTF_INFO: + return (dgettext(TEXT_DOMAIN, "info")); + case UU_DPRINTF_DEBUG: + return (dgettext(TEXT_DOMAIN, "debug")); + default: + return (dgettext(TEXT_DOMAIN, "unspecified")); + } +} + +uu_dprintf_t * +uu_dprintf_create(const char *name, uu_dprintf_severity_t severity, + uint_t flags) +{ + uu_dprintf_t *D; + + if (name != NULL && + uu_check_name(name, UU_NAME_DOMAIN) == -1) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (NULL); + } + + if ((D = uu_zalloc(sizeof (uu_dprintf_t))) == NULL) + return (NULL); + + if (name != NULL) { + D->uud_name = strdup(name); + if (D->uud_name == NULL) { + uu_free(D); + return (NULL); + } + } else { + D->uud_name = NULL; + } + + D->uud_severity = severity; + D->uud_flags = flags; + + return (D); +} + +/*PRINTFLIKE3*/ +void +uu_dprintf(uu_dprintf_t *D, uu_dprintf_severity_t severity, + const char *format, ...) +{ + va_list alist; + + /* XXX Assert that severity is not UU_DPRINTF_SILENT. 
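Messages are filtered by comparing their severity against the handle's threshold, with the enum ordered from SILENT up to DEBUG, so anything noisier than the threshold is dropped. An illustrative consumer (dev is a hypothetical device name):

    #include <libuutil.h>

    /* Sketch: a handle thresholded at WARNING. */
    static void
    log_example(const char *dev)
    {
            uu_dprintf_t *log =
                uu_dprintf_create("zpool", UU_DPRINTF_WARNING, 0);

            if (log == NULL)
                    return;
            /* printed: WARNING is within the threshold */
            uu_dprintf(log, UU_DPRINTF_WARNING, "vdev %s degraded\n", dev);
            /* suppressed: DEBUG exceeds the threshold */
            uu_dprintf(log, UU_DPRINTF_DEBUG, "probe ok\n");
            uu_dprintf_destroy(log);
    }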
*/ + + if (severity > D->uud_severity) + return; + + (void) fprintf(stderr, FACILITY_FMT, D->uud_name, + strseverity(severity)); + + va_start(alist, format); + (void) vfprintf(stderr, format, alist); + va_end(alist); +} + +void +uu_dprintf_destroy(uu_dprintf_t *D) +{ + if (D->uud_name) + free(D->uud_name); + + uu_free(D); +} + +const char * +uu_dprintf_getname(uu_dprintf_t *D) +{ + return (D->uud_name); +} diff --git a/lib/libuutil/uu_ident.c b/lib/libuutil/uu_ident.c new file mode 100644 index 000000000000..382139316ebc --- /dev/null +++ b/lib/libuutil/uu_ident.c @@ -0,0 +1,122 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +#include "libuutil_common.h" + +#include + +/* + * We require names of the form: + * [provider,]identifier[/[provider,]identifier]... + * + * Where provider is either a stock symbol (SUNW) or a java-style reversed + * domain name (com.sun). + * + * Both providers and identifiers must start with a letter, and may + * only contain alphanumerics, dashes, and underlines. Providers + * may also contain periods. + * + * Note that we do _not_ use the macros in , since they are affected + * by the current locale settings. 
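Concretely, here is how the grammar above classifies a few inputs; uu_check_name() returns 0 for a valid name and -1 otherwise (a sketch, with made-up example names):

    #include <libuutil.h>

    static void
    name_examples(void)
    {
            /* valid and invalid names under the rules above */
            (void) uu_check_name("SUNW,ldap-client", UU_NAME_DOMAIN); /* ok */
            (void) uu_check_name("com.sun,dtrace", UU_NAME_DOMAIN);   /* ok */
            (void) uu_check_name("network/ssh", UU_NAME_PATH);        /* ok */
            (void) uu_check_name("9lives", 0);      /* bad: leading digit */
    }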
+ */ + +#define IS_ALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +#define IS_DIGIT(c) \ + ((c) >= '0' && (c) <= '9') + +static int +is_valid_ident(const char *s, const char *e, int allowdot) +{ + char c; + + if (s >= e) + return (0); /* name is empty */ + + c = *s++; + if (!IS_ALPHA(c)) + return (0); /* does not start with letter */ + + while (s < e && (c = *s++) != 0) { + if (IS_ALPHA(c) || IS_DIGIT(c) || c == '-' || c == '_' || + (allowdot && c == '.')) + continue; + return (0); /* invalid character */ + } + return (1); +} + +static int +is_valid_component(const char *b, const char *e, uint_t flags) +{ + char *sp; + + if (flags & UU_NAME_DOMAIN) { + sp = strchr(b, ','); + if (sp != NULL && sp < e) { + if (!is_valid_ident(b, sp, 1)) + return (0); + b = sp + 1; + } + } + + return (is_valid_ident(b, e, 0)); +} + +int +uu_check_name(const char *name, uint_t flags) +{ + const char *end = name + strlen(name); + const char *p; + + if (flags & ~(UU_NAME_DOMAIN | UU_NAME_PATH)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (-1); + } + + if (!(flags & UU_NAME_PATH)) { + if (!is_valid_component(name, end, flags)) + goto bad; + return (0); + } + + while ((p = strchr(name, '/')) != NULL) { + if (!is_valid_component(name, p - 1, flags)) + goto bad; + name = p + 1; + } + if (!is_valid_component(name, end, flags)) + goto bad; + + return (0); + +bad: + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (-1); +} diff --git a/lib/libuutil/uu_list.c b/lib/libuutil/uu_list.c new file mode 100644 index 000000000000..c3a447d01de3 --- /dev/null +++ b/lib/libuutil/uu_list.c @@ -0,0 +1,718 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +#include "libuutil_common.h" + +#include +#include +#include +#include + +#define ELEM_TO_NODE(lp, e) \ + ((uu_list_node_impl_t *)((uintptr_t)(e) + (lp)->ul_offset)) + +#define NODE_TO_ELEM(lp, n) \ + ((void *)((uintptr_t)(n) - (lp)->ul_offset)) + +/* + * uu_list_index_ts define a location for insertion. They are simply a + * pointer to the object after the insertion point. We store a mark + * in the low-bits of the index, to help prevent mistakes. + * + * When debugging, the index mark changes on every insert and delete, to + * catch stale references. + */ +#define INDEX_MAX (sizeof (uintptr_t) - 1) +#define INDEX_NEXT(m) (((m) == INDEX_MAX)? 
1 : ((m) + 1) & INDEX_MAX) + +#define INDEX_TO_NODE(i) ((uu_list_node_impl_t *)((i) & ~INDEX_MAX)) +#define NODE_TO_INDEX(p, n) (((uintptr_t)(n) & ~INDEX_MAX) | (p)->ul_index) +#define INDEX_VALID(p, i) (((i) & INDEX_MAX) == (p)->ul_index) +#define INDEX_CHECK(i) (((i) & INDEX_MAX) != 0) + +#define POOL_TO_MARKER(pp) ((void *)((uintptr_t)(pp) | 1)) + +static uu_list_pool_t uu_null_lpool = { &uu_null_lpool, &uu_null_lpool }; +static pthread_mutex_t uu_lpool_list_lock = PTHREAD_MUTEX_INITIALIZER; + +uu_list_pool_t * +uu_list_pool_create(const char *name, size_t objsize, + size_t nodeoffset, uu_compare_fn_t *compare_func, uint32_t flags) +{ + uu_list_pool_t *pp, *next, *prev; + + if (name == NULL || + uu_check_name(name, UU_NAME_DOMAIN) == -1 || + nodeoffset + sizeof (uu_list_node_t) > objsize) { + uu_set_error(UU_ERROR_INVALID_ARGUMENT); + return (NULL); + } + + if (flags & ~UU_LIST_POOL_DEBUG) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + pp = uu_zalloc(sizeof (uu_list_pool_t)); + if (pp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + (void) strlcpy(pp->ulp_name, name, sizeof (pp->ulp_name)); + pp->ulp_nodeoffset = nodeoffset; + pp->ulp_objsize = objsize; + pp->ulp_cmp = compare_func; + if (flags & UU_LIST_POOL_DEBUG) + pp->ulp_debug = 1; + pp->ulp_last_index = 0; + + (void) pthread_mutex_init(&pp->ulp_lock, NULL); + + pp->ulp_null_list.ul_next_enc = UU_PTR_ENCODE(&pp->ulp_null_list); + pp->ulp_null_list.ul_prev_enc = UU_PTR_ENCODE(&pp->ulp_null_list); + + (void) pthread_mutex_lock(&uu_lpool_list_lock); + pp->ulp_next = next = &uu_null_lpool; + pp->ulp_prev = prev = next->ulp_prev; + next->ulp_prev = pp; + prev->ulp_next = pp; + (void) pthread_mutex_unlock(&uu_lpool_list_lock); + + return (pp); +} + +void +uu_list_pool_destroy(uu_list_pool_t *pp) +{ + if (pp->ulp_debug) { + if (pp->ulp_null_list.ul_next_enc != + UU_PTR_ENCODE(&pp->ulp_null_list) || + pp->ulp_null_list.ul_prev_enc != + UU_PTR_ENCODE(&pp->ulp_null_list)) { + uu_panic("uu_list_pool_destroy: Pool \"%.*s\" (%p) has " + "outstanding lists, or is corrupt.\n", + (int)sizeof (pp->ulp_name), pp->ulp_name, + (void *)pp); + } + } + (void) pthread_mutex_lock(&uu_lpool_list_lock); + pp->ulp_next->ulp_prev = pp->ulp_prev; + pp->ulp_prev->ulp_next = pp->ulp_next; + (void) pthread_mutex_unlock(&uu_lpool_list_lock); + pp->ulp_prev = NULL; + pp->ulp_next = NULL; + uu_free(pp); +} + +void +uu_list_node_init(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp) +{ + uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg; + + if (pp->ulp_debug) { + uintptr_t offset = (uintptr_t)np - (uintptr_t)base; + if (offset + sizeof (*np) > pp->ulp_objsize) { + uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't fit in object (size %ld)\n", + base, (void *)np, (void *)pp, pp->ulp_name, + (long)offset, (long)pp->ulp_objsize); + } + if (offset != pp->ulp_nodeoffset) { + uu_panic("uu_list_node_init(%p, %p, %p (\"%s\")): " + "offset %ld doesn't match pool's offset (%ld)\n", + base, (void *)np, (void *)pp, pp->ulp_name, + (long)offset, (long)pp->ulp_objsize); + } + } + np->uln_next = POOL_TO_MARKER(pp); + np->uln_prev = NULL; +} + +void +uu_list_node_fini(void *base, uu_list_node_t *np_arg, uu_list_pool_t *pp) +{ + uu_list_node_impl_t *np = (uu_list_node_impl_t *)np_arg; + + if (pp->ulp_debug) { + if (np->uln_next == NULL && + np->uln_prev == NULL) { + uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): " + "node already finied\n", + base, (void *)np_arg, (void *)pp, pp->ulp_name); + } + if 
(np->uln_next != POOL_TO_MARKER(pp) || + np->uln_prev != NULL) { + uu_panic("uu_list_node_fini(%p, %p, %p (\"%s\")): " + "node corrupt or on list\n", + base, (void *)np_arg, (void *)pp, pp->ulp_name); + } + } + np->uln_next = NULL; + np->uln_prev = NULL; +} + +uu_list_t * +uu_list_create(uu_list_pool_t *pp, void *parent, uint32_t flags) +{ + uu_list_t *lp, *next, *prev; + + if (flags & ~(UU_LIST_DEBUG | UU_LIST_SORTED)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + if ((flags & UU_LIST_SORTED) && pp->ulp_cmp == NULL) { + if (pp->ulp_debug) + uu_panic("uu_list_create(%p, ...): requested " + "UU_LIST_SORTED, but pool has no comparison func\n", + (void *)pp); + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (NULL); + } + + lp = uu_zalloc(sizeof (*lp)); + if (lp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + lp->ul_pool = pp; + lp->ul_parent_enc = UU_PTR_ENCODE(parent); + lp->ul_offset = pp->ulp_nodeoffset; + lp->ul_debug = pp->ulp_debug || (flags & UU_LIST_DEBUG); + lp->ul_sorted = (flags & UU_LIST_SORTED); + lp->ul_numnodes = 0; + lp->ul_index = (pp->ulp_last_index = INDEX_NEXT(pp->ulp_last_index)); + + lp->ul_null_node.uln_next = &lp->ul_null_node; + lp->ul_null_node.uln_prev = &lp->ul_null_node; + + lp->ul_null_walk.ulw_next = &lp->ul_null_walk; + lp->ul_null_walk.ulw_prev = &lp->ul_null_walk; + + (void) pthread_mutex_lock(&pp->ulp_lock); + next = &pp->ulp_null_list; + prev = UU_PTR_DECODE(next->ul_prev_enc); + lp->ul_next_enc = UU_PTR_ENCODE(next); + lp->ul_prev_enc = UU_PTR_ENCODE(prev); + next->ul_prev_enc = UU_PTR_ENCODE(lp); + prev->ul_next_enc = UU_PTR_ENCODE(lp); + (void) pthread_mutex_unlock(&pp->ulp_lock); + + return (lp); +} + +void +uu_list_destroy(uu_list_t *lp) +{ + uu_list_pool_t *pp = lp->ul_pool; + + if (lp->ul_debug) { + if (lp->ul_null_node.uln_next != &lp->ul_null_node || + lp->ul_null_node.uln_prev != &lp->ul_null_node) { + uu_panic("uu_list_destroy(%p): list not empty\n", + (void *)lp); + } + if (lp->ul_numnodes != 0) { + uu_panic("uu_list_destroy(%p): numnodes is nonzero, " + "but list is empty\n", (void *)lp); + } + if (lp->ul_null_walk.ulw_next != &lp->ul_null_walk || + lp->ul_null_walk.ulw_prev != &lp->ul_null_walk) { + uu_panic("uu_list_destroy(%p): outstanding walkers\n", + (void *)lp); + } + } + + (void) pthread_mutex_lock(&pp->ulp_lock); + UU_LIST_PTR(lp->ul_next_enc)->ul_prev_enc = lp->ul_prev_enc; + UU_LIST_PTR(lp->ul_prev_enc)->ul_next_enc = lp->ul_next_enc; + (void) pthread_mutex_unlock(&pp->ulp_lock); + lp->ul_prev_enc = UU_PTR_ENCODE(NULL); + lp->ul_next_enc = UU_PTR_ENCODE(NULL); + lp->ul_pool = NULL; + uu_free(lp); +} + +static void +list_insert(uu_list_t *lp, uu_list_node_impl_t *np, uu_list_node_impl_t *prev, + uu_list_node_impl_t *next) +{ + if (lp->ul_debug) { + if (next->uln_prev != prev || prev->uln_next != next) + uu_panic("insert(%p): internal error: %p and %p not " + "neighbors\n", (void *)lp, (void *)next, + (void *)prev); + + if (np->uln_next != POOL_TO_MARKER(lp->ul_pool) || + np->uln_prev != NULL) { + uu_panic("insert(%p): elem %p node %p corrupt, " + "not initialized, or already in a list.\n", + (void *)lp, NODE_TO_ELEM(lp, np), (void *)np); + } + /* + * invalidate outstanding uu_list_index_ts. 
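+		 * Bumping ul_index gives the list a fresh index mark, so
+		 * any uu_list_index_t handed out before this insert now
+		 * fails the INDEX_VALID() check instead of being silently
+		 * misused.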
+ */ + lp->ul_index = INDEX_NEXT(lp->ul_index); + } + np->uln_next = next; + np->uln_prev = prev; + next->uln_prev = np; + prev->uln_next = np; + + lp->ul_numnodes++; +} + +void +uu_list_insert(uu_list_t *lp, void *elem, uu_list_index_t idx) +{ + uu_list_node_impl_t *np; + + np = INDEX_TO_NODE(idx); + if (np == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (!INDEX_VALID(lp, idx)) + uu_panic("uu_list_insert(%p, %p, %p): %s\n", + (void *)lp, elem, (void *)idx, + INDEX_CHECK(idx)? "outdated index" : + "invalid index"); + if (np->uln_prev == NULL) + uu_panic("uu_list_insert(%p, %p, %p): out-of-date " + "index\n", (void *)lp, elem, (void *)idx); + } + + list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np); +} + +void * +uu_list_find(uu_list_t *lp, void *elem, void *private, uu_list_index_t *out) +{ + int sorted = lp->ul_sorted; + uu_compare_fn_t *func = lp->ul_pool->ulp_cmp; + uu_list_node_impl_t *np; + + if (func == NULL) { + if (out != NULL) + *out = 0; + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (NULL); + } + for (np = lp->ul_null_node.uln_next; np != &lp->ul_null_node; + np = np->uln_next) { + void *ep = NODE_TO_ELEM(lp, np); + int cmp = func(ep, elem, private); + if (cmp == 0) { + if (out != NULL) + *out = NODE_TO_INDEX(lp, np); + return (ep); + } + if (sorted && cmp > 0) { + if (out != NULL) + *out = NODE_TO_INDEX(lp, np); + return (NULL); + } + } + if (out != NULL) + *out = NODE_TO_INDEX(lp, 0); + return (NULL); +} + +void * +uu_list_nearest_next(uu_list_t *lp, uu_list_index_t idx) +{ + uu_list_node_impl_t *np = INDEX_TO_NODE(idx); + + if (np == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (!INDEX_VALID(lp, idx)) + uu_panic("uu_list_nearest_next(%p, %p): %s\n", + (void *)lp, (void *)idx, + INDEX_CHECK(idx)? "outdated index" : + "invalid index"); + if (np->uln_prev == NULL) + uu_panic("uu_list_nearest_next(%p, %p): out-of-date " + "index\n", (void *)lp, (void *)idx); + } + + if (np == &lp->ul_null_node) + return (NULL); + else + return (NODE_TO_ELEM(lp, np)); +} + +void * +uu_list_nearest_prev(uu_list_t *lp, uu_list_index_t idx) +{ + uu_list_node_impl_t *np = INDEX_TO_NODE(idx); + + if (np == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (!INDEX_VALID(lp, idx)) + uu_panic("uu_list_nearest_prev(%p, %p): %s\n", + (void *)lp, (void *)idx, INDEX_CHECK(idx)? + "outdated index" : "invalid index"); + if (np->uln_prev == NULL) + uu_panic("uu_list_nearest_prev(%p, %p): out-of-date " + "index\n", (void *)lp, (void *)idx); + } + + if ((np = np->uln_prev) == &lp->ul_null_node) + return (NULL); + else + return (NODE_TO_ELEM(lp, np)); +} + +static void +list_walk_init(uu_list_walk_t *wp, uu_list_t *lp, uint32_t flags) +{ + uu_list_walk_t *next, *prev; + + int robust = (flags & UU_WALK_ROBUST); + int direction = (flags & UU_WALK_REVERSE)? -1 : 1; + + (void) memset(wp, 0, sizeof (*wp)); + wp->ulw_list = lp; + wp->ulw_robust = robust; + wp->ulw_dir = direction; + if (direction > 0) + wp->ulw_next_result = lp->ul_null_node.uln_next; + else + wp->ulw_next_result = lp->ul_null_node.uln_prev; + + if (lp->ul_debug || robust) { + /* + * Add this walker to the list's list of walkers so + * uu_list_remove() can advance us if somebody tries to + * remove ulw_next_result. 
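+		 * Non-robust walkers are registered here only in debug
+		 * mode, where uu_list_remove() treats any such active
+		 * walker as an error.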
+ */ + wp->ulw_next = next = &lp->ul_null_walk; + wp->ulw_prev = prev = next->ulw_prev; + next->ulw_prev = wp; + prev->ulw_next = wp; + } +} + +static uu_list_node_impl_t * +list_walk_advance(uu_list_walk_t *wp, uu_list_t *lp) +{ + uu_list_node_impl_t *np = wp->ulw_next_result; + uu_list_node_impl_t *next; + + if (np == &lp->ul_null_node) + return (NULL); + + next = (wp->ulw_dir > 0)? np->uln_next : np->uln_prev; + + wp->ulw_next_result = next; + return (np); +} + +static void +list_walk_fini(uu_list_walk_t *wp) +{ + /* GLXXX debugging? */ + if (wp->ulw_next != NULL) { + wp->ulw_next->ulw_prev = wp->ulw_prev; + wp->ulw_prev->ulw_next = wp->ulw_next; + wp->ulw_next = NULL; + wp->ulw_prev = NULL; + } + wp->ulw_list = NULL; + wp->ulw_next_result = NULL; +} + +uu_list_walk_t * +uu_list_walk_start(uu_list_t *lp, uint32_t flags) +{ + uu_list_walk_t *wp; + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (NULL); + } + + wp = uu_zalloc(sizeof (*wp)); + if (wp == NULL) { + uu_set_error(UU_ERROR_NO_MEMORY); + return (NULL); + } + + list_walk_init(wp, lp, flags); + return (wp); +} + +void * +uu_list_walk_next(uu_list_walk_t *wp) +{ + uu_list_t *lp = wp->ulw_list; + uu_list_node_impl_t *np = list_walk_advance(wp, lp); + + if (np == NULL) + return (NULL); + + return (NODE_TO_ELEM(lp, np)); +} + +void +uu_list_walk_end(uu_list_walk_t *wp) +{ + list_walk_fini(wp); + uu_free(wp); +} + +int +uu_list_walk(uu_list_t *lp, uu_walk_fn_t *func, void *private, uint32_t flags) +{ + uu_list_node_impl_t *np; + + int status = UU_WALK_NEXT; + + int robust = (flags & UU_WALK_ROBUST); + int reverse = (flags & UU_WALK_REVERSE); + + if (flags & ~(UU_WALK_ROBUST | UU_WALK_REVERSE)) { + uu_set_error(UU_ERROR_UNKNOWN_FLAG); + return (-1); + } + + if (lp->ul_debug || robust) { + uu_list_walk_t my_walk; + void *e; + + list_walk_init(&my_walk, lp, flags); + while (status == UU_WALK_NEXT && + (e = uu_list_walk_next(&my_walk)) != NULL) + status = (*func)(e, private); + list_walk_fini(&my_walk); + } else { + if (!reverse) { + for (np = lp->ul_null_node.uln_next; + status == UU_WALK_NEXT && np != &lp->ul_null_node; + np = np->uln_next) { + status = (*func)(NODE_TO_ELEM(lp, np), private); + } + } else { + for (np = lp->ul_null_node.uln_prev; + status == UU_WALK_NEXT && np != &lp->ul_null_node; + np = np->uln_prev) { + status = (*func)(NODE_TO_ELEM(lp, np), private); + } + } + } + if (status >= 0) + return (0); + uu_set_error(UU_ERROR_CALLBACK_FAILED); + return (-1); +} + +void +uu_list_remove(uu_list_t *lp, void *elem) +{ + uu_list_node_impl_t *np = ELEM_TO_NODE(lp, elem); + uu_list_walk_t *wp; + + if (lp->ul_debug) { + if (np->uln_prev == NULL) + uu_panic("uu_list_remove(%p, %p): elem not on list\n", + (void *)lp, elem); + /* + * invalidate outstanding uu_list_index_ts. + */ + lp->ul_index = INDEX_NEXT(lp->ul_index); + } + + /* + * robust walkers must be advanced. In debug mode, non-robust + * walkers are also on the list. If there are any, it's an error. 
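+	 *
+	 * A robust walk is the supported way to remove elements while
+	 * iterating; a minimal sketch (error handling omitted):
+	 *
+	 *	uu_list_walk_t *wp = uu_list_walk_start(lp, UU_WALK_ROBUST);
+	 *	void *e;
+	 *
+	 *	while ((e = uu_list_walk_next(wp)) != NULL)
+	 *		uu_list_remove(lp, e);
+	 *	uu_list_walk_end(wp);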
+ */ + for (wp = lp->ul_null_walk.ulw_next; wp != &lp->ul_null_walk; + wp = wp->ulw_next) { + if (wp->ulw_robust) { + if (np == wp->ulw_next_result) + (void) list_walk_advance(wp, lp); + } else if (wp->ulw_next_result != NULL) { + uu_panic("uu_list_remove(%p, %p): active non-robust " + "walker\n", (void *)lp, elem); + } + } + + np->uln_next->uln_prev = np->uln_prev; + np->uln_prev->uln_next = np->uln_next; + + lp->ul_numnodes--; + + np->uln_next = POOL_TO_MARKER(lp->ul_pool); + np->uln_prev = NULL; +} + +void * +uu_list_teardown(uu_list_t *lp, void **cookie) +{ + void *ep; + + /* + * XXX: disable list modification until list is empty + */ + if (lp->ul_debug && *cookie != NULL) + uu_panic("uu_list_teardown(%p, %p): unexpected cookie\n", + (void *)lp, (void *)cookie); + + ep = uu_list_first(lp); + if (ep) + uu_list_remove(lp, ep); + return (ep); +} + +int +uu_list_insert_before(uu_list_t *lp, void *target, void *elem) +{ + uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target); + + if (target == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (np->uln_prev == NULL) + uu_panic("uu_list_insert_before(%p, %p, %p): %p is " + "not currently on a list\n", + (void *)lp, target, elem, target); + } + if (lp->ul_sorted) { + if (lp->ul_debug) + uu_panic("uu_list_insert_before(%p, ...): list is " + "UU_LIST_SORTED\n", (void *)lp); + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (-1); + } + + list_insert(lp, ELEM_TO_NODE(lp, elem), np->uln_prev, np); + return (0); +} + +int +uu_list_insert_after(uu_list_t *lp, void *target, void *elem) +{ + uu_list_node_impl_t *np = ELEM_TO_NODE(lp, target); + + if (target == NULL) + np = &lp->ul_null_node; + + if (lp->ul_debug) { + if (np->uln_prev == NULL) + uu_panic("uu_list_insert_after(%p, %p, %p): %p is " + "not currently on a list\n", + (void *)lp, target, elem, target); + } + if (lp->ul_sorted) { + if (lp->ul_debug) + uu_panic("uu_list_insert_after(%p, ...): list is " + "UU_LIST_SORTED\n", (void *)lp); + uu_set_error(UU_ERROR_NOT_SUPPORTED); + return (-1); + } + + list_insert(lp, ELEM_TO_NODE(lp, elem), np, np->uln_next); + return (0); +} + +size_t +uu_list_numnodes(uu_list_t *lp) +{ + return (lp->ul_numnodes); +} + +void * +uu_list_first(uu_list_t *lp) +{ + uu_list_node_impl_t *n = lp->ul_null_node.uln_next; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +void * +uu_list_last(uu_list_t *lp) +{ + uu_list_node_impl_t *n = lp->ul_null_node.uln_prev; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +void * +uu_list_next(uu_list_t *lp, void *elem) +{ + uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem); + + n = n->uln_next; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +void * +uu_list_prev(uu_list_t *lp, void *elem) +{ + uu_list_node_impl_t *n = ELEM_TO_NODE(lp, elem); + + n = n->uln_prev; + if (n == &lp->ul_null_node) + return (NULL); + return (NODE_TO_ELEM(lp, n)); +} + +/* + * called from uu_lockup() and uu_release(), as part of our fork1()-safety. 
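+ * uu_list_lockup() takes the global pool-list lock and then every pool's
+ * lock before fork(); uu_list_release() drops the pool locks and finally
+ * the pool-list lock in both the parent and the child, so no mutex is
+ * left held in the child process.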
+ */ +void +uu_list_lockup(void) +{ + uu_list_pool_t *pp; + + (void) pthread_mutex_lock(&uu_lpool_list_lock); + for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool; + pp = pp->ulp_next) + (void) pthread_mutex_lock(&pp->ulp_lock); +} + +void +uu_list_release(void) +{ + uu_list_pool_t *pp; + + for (pp = uu_null_lpool.ulp_next; pp != &uu_null_lpool; + pp = pp->ulp_next) + (void) pthread_mutex_unlock(&pp->ulp_lock); + (void) pthread_mutex_unlock(&uu_lpool_list_lock); +} diff --git a/lib/libuutil/uu_misc.c b/lib/libuutil/uu_misc.c new file mode 100644 index 000000000000..b10afd8eadf1 --- /dev/null +++ b/lib/libuutil/uu_misc.c @@ -0,0 +1,281 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include "libuutil_common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + +/* + * All of the old code under !defined(PTHREAD_ONCE_KEY_NP) + * is here to enable the building of a native version of + * libuutil.so when the build machine has not yet been upgraded + * to a version of libc that provides pthread_key_create_once_np(). + * It should all be deleted when solaris_nevada ships. + * The code is not MT-safe in a relaxed memory model. 
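+ * (The double-checked test of uu_error_key_setup in uu_set_error() below
+ * is the part that is unsafe under relaxed memory ordering.)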
+ */ + +#if defined(PTHREAD_ONCE_KEY_NP) +static pthread_key_t uu_error_key = PTHREAD_ONCE_KEY_NP; +#else /* PTHREAD_ONCE_KEY_NP */ +static pthread_key_t uu_error_key = 0; +static pthread_mutex_t uu_key_lock = PTHREAD_MUTEX_INITIALIZER; +#endif /* PTHREAD_ONCE_KEY_NP */ + +static int uu_error_key_setup = 0; + +static pthread_mutex_t uu_panic_lock = PTHREAD_MUTEX_INITIALIZER; +/* LINTED static unused */ +static const char *uu_panic_format; +/* LINTED static unused */ +static va_list uu_panic_args; +static pthread_t uu_panic_thread; + +static uint32_t _uu_main_error; +static __thread int _uu_main_thread = 0; + +void +uu_set_error(uint_t code) +{ + if (_uu_main_thread) { + _uu_main_error = code; + return; + } +#if defined(PTHREAD_ONCE_KEY_NP) + if (pthread_key_create_once_np(&uu_error_key, NULL) != 0) + uu_error_key_setup = -1; + else + uu_error_key_setup = 1; +#else /* PTHREAD_ONCE_KEY_NP */ + if (uu_error_key_setup == 0) { + (void) pthread_mutex_lock(&uu_key_lock); + if (uu_error_key_setup == 0) { + if (pthread_key_create(&uu_error_key, NULL) != 0) + uu_error_key_setup = -1; + else + uu_error_key_setup = 1; + } + (void) pthread_mutex_unlock(&uu_key_lock); + } +#endif /* PTHREAD_ONCE_KEY_NP */ + if (uu_error_key_setup > 0) + (void) pthread_setspecific(uu_error_key, + (void *)(uintptr_t)code); +} + +uint32_t +uu_error(void) +{ + if (_uu_main_thread) + return (_uu_main_error); + + if (uu_error_key_setup < 0) /* can't happen? */ + return (UU_ERROR_UNKNOWN); + + /* + * Because UU_ERROR_NONE == 0, if uu_set_error() was + * never called, then this will return UU_ERROR_NONE: + */ + return ((uint32_t)(uintptr_t)pthread_getspecific(uu_error_key)); +} + +const char * +uu_strerror(uint32_t code) +{ + const char *str; + + switch (code) { + case UU_ERROR_NONE: + str = dgettext(TEXT_DOMAIN, "No error"); + break; + + case UU_ERROR_INVALID_ARGUMENT: + str = dgettext(TEXT_DOMAIN, "Invalid argument"); + break; + + case UU_ERROR_UNKNOWN_FLAG: + str = dgettext(TEXT_DOMAIN, "Unknown flag passed"); + break; + + case UU_ERROR_NO_MEMORY: + str = dgettext(TEXT_DOMAIN, "Out of memory"); + break; + + case UU_ERROR_CALLBACK_FAILED: + str = dgettext(TEXT_DOMAIN, "Callback-initiated failure"); + break; + + case UU_ERROR_NOT_SUPPORTED: + str = dgettext(TEXT_DOMAIN, "Operation not supported"); + break; + + case UU_ERROR_EMPTY: + str = dgettext(TEXT_DOMAIN, "No value provided"); + break; + + case UU_ERROR_UNDERFLOW: + str = dgettext(TEXT_DOMAIN, "Value too small"); + break; + + case UU_ERROR_OVERFLOW: + str = dgettext(TEXT_DOMAIN, "Value too large"); + break; + + case UU_ERROR_INVALID_CHAR: + str = dgettext(TEXT_DOMAIN, + "Value contains unexpected character"); + break; + + case UU_ERROR_INVALID_DIGIT: + str = dgettext(TEXT_DOMAIN, + "Value contains digit not in base"); + break; + + case UU_ERROR_SYSTEM: + str = dgettext(TEXT_DOMAIN, "Underlying system error"); + break; + + case UU_ERROR_UNKNOWN: + str = dgettext(TEXT_DOMAIN, "Error status not known"); + break; + + default: + errno = ESRCH; + str = NULL; + break; + } + return (str); +} + +void +uu_panic(const char *format, ...) 
+{ + va_list args; + + va_start(args, format); + + (void) pthread_mutex_lock(&uu_panic_lock); + if (uu_panic_thread == 0) { + uu_panic_thread = pthread_self(); + uu_panic_format = format; + va_copy(uu_panic_args, args); + } + (void) pthread_mutex_unlock(&uu_panic_lock); + + (void) vfprintf(stderr, format, args); + + va_end(args); + + if (uu_panic_thread == pthread_self()) + abort(); + else + for (;;) + (void) pause(); +} + +static void +uu_lockup(void) +{ + (void) pthread_mutex_lock(&uu_panic_lock); +#if !defined(PTHREAD_ONCE_KEY_NP) + (void) pthread_mutex_lock(&uu_key_lock); +#endif + uu_avl_lockup(); + uu_list_lockup(); +} + +static void +uu_release(void) +{ + (void) pthread_mutex_unlock(&uu_panic_lock); +#if !defined(PTHREAD_ONCE_KEY_NP) + (void) pthread_mutex_unlock(&uu_key_lock); +#endif + uu_avl_release(); + uu_list_release(); +} + +static void +uu_release_child(void) +{ + uu_panic_format = NULL; + uu_panic_thread = 0; + + uu_release(); +} + +#ifdef __GNUC__ +static void +uu_init(void) __attribute__((constructor)); +#else +#pragma init(uu_init) +#endif + +static void +uu_init(void) +{ + _uu_main_thread = 1; + (void) pthread_atfork(uu_lockup, uu_release, uu_release_child); +} + +/* + * Dump a block of memory in hex+ascii, for debugging + */ +void +uu_dump(FILE *out, const char *prefix, const void *buf, size_t len) +{ + const unsigned char *p = buf; + int i; + + for (i = 0; i < len; i += 16) { + int j; + + (void) fprintf(out, "%s", prefix); + for (j = 0; j < 16 && i + j < len; j++) { + (void) fprintf(out, "%2.2x ", p[i + j]); + } + for (; j < 16; j++) { + (void) fprintf(out, " "); + } + for (j = 0; j < 16 && i + j < len; j++) { + (void) fprintf(out, "%c", + isprint(p[i + j]) ? p[i + j] : '.'); + } + (void) fprintf(out, "\n"); + } +} diff --git a/lib/libuutil/uu_open.c b/lib/libuutil/uu_open.c new file mode 100644 index 000000000000..cf5c5450b820 --- /dev/null +++ b/lib/libuutil/uu_open.c @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + + + +#include "libuutil_common.h" + +#include + +#include +#include +#include +#include +#include + +#ifdef _LP64 +#define TMPPATHFMT "%s/uu%ld" +#else /* _LP64 */ +#define TMPPATHFMT "%s/uu%lld" +#endif /* _LP64 */ + +/*ARGSUSED*/ +int +uu_open_tmp(const char *dir, uint_t uflags) +{ + int f; + char *fname = uu_zalloc(PATH_MAX); + + if (fname == NULL) + return (-1); + + for (;;) { + (void) snprintf(fname, PATH_MAX, "%s/uu%lld", dir, gethrtime()); + + f = open(fname, O_CREAT | O_EXCL | O_RDWR, 0600); + + if (f >= 0 || errno != EEXIST) + break; + } + + if (f >= 0) + (void) unlink(fname); + + uu_free(fname); + + return (f); +} diff --git a/lib/libuutil/uu_pname.c b/lib/libuutil/uu_pname.c new file mode 100644 index 000000000000..a6a0f22661e5 --- /dev/null +++ b/lib/libuutil/uu_pname.c @@ -0,0 +1,207 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +#include "libuutil_common.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const char PNAME_FMT[] = "%s: "; +static const char ERRNO_FMT[] = ": %s\n"; + +static const char *pname; + +static void +uu_die_internal(int status, const char *format, va_list alist) __NORETURN; + +int uu_exit_ok_value = EXIT_SUCCESS; +int uu_exit_fatal_value = EXIT_FAILURE; +int uu_exit_usage_value = 2; + +int * +uu_exit_ok(void) +{ + return (&uu_exit_ok_value); +} + +int * +uu_exit_fatal(void) +{ + return (&uu_exit_fatal_value); +} + +int * +uu_exit_usage(void) +{ + return (&uu_exit_usage_value); +} + +void +uu_alt_exit(int profile) +{ + switch (profile) { + case UU_PROFILE_DEFAULT: + uu_exit_ok_value = EXIT_SUCCESS; + uu_exit_fatal_value = EXIT_FAILURE; + uu_exit_usage_value = 2; + break; + case UU_PROFILE_LAUNCHER: + uu_exit_ok_value = EXIT_SUCCESS; + uu_exit_fatal_value = 124; + uu_exit_usage_value = 125; + break; + } +} + +static void +uu_warn_internal(int err, const char *format, va_list alist) +{ + if (pname != NULL) + (void) fprintf(stderr, PNAME_FMT, pname); + + (void) vfprintf(stderr, format, alist); + + if (strrchr(format, '\n') == NULL) + (void) fprintf(stderr, ERRNO_FMT, strerror(err)); +} + +void +uu_vwarn(const char *format, va_list alist) +{ + uu_warn_internal(errno, format, alist); +} + +/*PRINTFLIKE1*/ +void +uu_warn(const char *format, ...) 
+{ + va_list alist; + va_start(alist, format); + uu_warn_internal(errno, format, alist); + va_end(alist); +} + +static void +uu_die_internal(int status, const char *format, va_list alist) +{ + uu_warn_internal(errno, format, alist); +#ifdef DEBUG + { + char *cp; + + if (!issetugid()) { + cp = getenv("UU_DIE_ABORTS"); + if (cp != NULL && *cp != '\0') + abort(); + } + } +#endif + exit(status); +} + +void +uu_vdie(const char *format, va_list alist) +{ + uu_die_internal(UU_EXIT_FATAL, format, alist); +} + +/*PRINTFLIKE1*/ +void +uu_die(const char *format, ...) +{ + va_list alist; + va_start(alist, format); + uu_die_internal(UU_EXIT_FATAL, format, alist); + va_end(alist); +} + +void +uu_vxdie(int status, const char *format, va_list alist) +{ + uu_die_internal(status, format, alist); +} + +/*PRINTFLIKE2*/ +void +uu_xdie(int status, const char *format, ...) +{ + va_list alist; + va_start(alist, format); + uu_die_internal(status, format, alist); + va_end(alist); +} + +const char * +uu_setpname(char *arg0) +{ + /* + * Having a NULL argv[0], while uncommon, is possible. It + * makes more sense to handle this event in uu_setpname rather + * than in each of its consumers. + */ + if (arg0 == NULL) { + pname = getexecname(); + if (pname == NULL) + pname = "unknown_command"; + return (pname); + } + + /* + * Guard against '/' at end of command invocation. + */ + for (;;) { + char *p = strrchr(arg0, '/'); + if (p == NULL) { + pname = arg0; + break; + } else { + if (*(p + 1) == '\0') { + *p = '\0'; + continue; + } + + pname = p + 1; + break; + } + } + + return (pname); +} + +const char * +uu_getpname(void) +{ + return (pname); +} diff --git a/lib/libuutil/uu_string.c b/lib/libuutil/uu_string.c new file mode 100644 index 000000000000..67024c3b50b4 --- /dev/null +++ b/lib/libuutil/uu_string.c @@ -0,0 +1,54 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +/* + * String helper functions + */ + +#include +#include +#include +#include "libuutil.h" + +/* Return true if strings are equal */ +boolean_t +uu_streq(const char *a, const char *b) +{ + return (strcmp(a, b) == 0); +} + +/* Return true if strings are equal, case-insensitively */ +boolean_t +uu_strcaseeq(const char *a, const char *b) +{ + return (strcasecmp(a, b) == 0); +} + +/* Return true if string a Begins With string b */ +boolean_t +uu_strbw(const char *a, const char *b) +{ + return (strncmp(a, b, strlen(b)) == 0); +} diff --git a/lib/libzfs/.gitignore b/lib/libzfs/.gitignore new file mode 100644 index 000000000000..9336a5c00b46 --- /dev/null +++ b/lib/libzfs/.gitignore @@ -0,0 +1 @@ +/libzfs.pc diff --git a/lib/libzfs/Makefile.am b/lib/libzfs/Makefile.am new file mode 100644 index 000000000000..f88fb828d53d --- /dev/null +++ b/lib/libzfs/Makefile.am @@ -0,0 +1,93 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = \ + $(top_srcdir)/module/icp \ + $(top_srcdir)/module/zcommon \ + $(top_srcdir)/lib/libzfs + +# Suppress unused but set variable warnings often due to ASSERTs +AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) +AM_CFLAGS += $(LIBCRYPTO_CFLAGS) $(ZLIB_CFLAGS) + +pkgconfig_DATA = libzfs.pc + +lib_LTLIBRARIES = libzfs.la + +USER_C = \ + libzfs_changelist.c \ + libzfs_config.c \ + libzfs_crypto.c \ + libzfs_dataset.c \ + libzfs_diff.c \ + libzfs_import.c \ + libzfs_iter.c \ + libzfs_mount.c \ + libzfs_pool.c \ + libzfs_sendrecv.c \ + libzfs_status.c \ + libzfs_util.c + + +if BUILD_FREEBSD +USER_C += \ + os/freebsd/libzfs_compat.c \ + os/freebsd/libzfs_ioctl_compat.c \ + os/freebsd/libzfs_zmount.c +endif + +if BUILD_LINUX +USER_C += \ + os/linux/libzfs_mount_os.c \ + os/linux/libzfs_pool_os.c \ + os/linux/libzfs_sendrecv_os.c \ + os/linux/libzfs_util_os.c +endif + +KERNEL_C = \ + algs/sha2/sha2.c \ + cityhash.c \ + zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_aarch64_neon.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zfs_uio.c \ + zpool_prop.c \ + zprop_common.c + +dist_libzfs_la_SOURCES = \ + $(USER_C) + +nodist_libzfs_la_SOURCES = \ + $(KERNEL_C) + +libzfs_la_LIBADD = \ + $(abs_top_builddir)/lib/libshare/libshare.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libuutil/libuutil.la + +libzfs_la_LIBADD += -lm $(LIBCRYPTO_LIBS) $(ZLIB_LIBS) $(LTLIBINTL) + +libzfs_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libzfs_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libzfs_la_LIBADD += -lutil -lgeom +libzfs_la_LDFLAGS += -version-info 4:0:0 +else +libzfs_la_LDFLAGS += -version-info 2:0:0 +endif + +# Licensing data +EXTRA_DIST = THIRDPARTYLICENSE.openssl THIRDPARTYLICENSE.openssl.descrip diff --git a/lib/libzfs/THIRDPARTYLICENSE.openssl b/lib/libzfs/THIRDPARTYLICENSE.openssl new file mode 100644 index 000000000000..92c9e196a318 --- /dev/null +++ b/lib/libzfs/THIRDPARTYLICENSE.openssl @@ -0,0 +1,127 @@ + + LICENSE ISSUES + ============== + + The OpenSSL toolkit stays under a dual license, i.e. both the conditions of + the OpenSSL License and the original SSLeay license apply to the toolkit. + See below for the actual license texts. Actually both licenses are BSD-style + Open Source licenses. In case of any license issues related to OpenSSL + please contact openssl-core@openssl.org. 
+ + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. 
+ * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the routines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + diff --git a/lib/libzfs/THIRDPARTYLICENSE.openssl.descrip b/lib/libzfs/THIRDPARTYLICENSE.openssl.descrip new file mode 100644 index 000000000000..a9125ea21122 --- /dev/null +++ b/lib/libzfs/THIRDPARTYLICENSE.openssl.descrip @@ -0,0 +1 @@ +PBKDF2 IMPLEMENTATION diff --git a/lib/libzfs/libzfs.pc.in b/lib/libzfs/libzfs.pc.in new file mode 100644 index 000000000000..6caf49d221f1 --- /dev/null +++ b/lib/libzfs/libzfs.pc.in @@ -0,0 +1,14 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libzfs +Description: LibZFS library +Version: @VERSION@ +URL: https://zfsonlinux.org +Requires: libzfs_core +Requires.private: libcrypto zlib +Cflags: -I${includedir}/libzfs -I${includedir}/libspl +Libs: -L${libdir} -lzfs -lnvpair +Libs.private: -luutil -lm -pthread diff --git a/lib/libzfs/libzfs_changelist.c b/lib/libzfs/libzfs_changelist.c new file mode 100644 index 000000000000..fec2fd5f2648 --- /dev/null +++ b/lib/libzfs/libzfs_changelist.c @@ -0,0 +1,781 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Portions Copyright 2007 Ramprakash Jelari + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018 Datto Inc. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libzfs_impl.h" + +/* + * Structure to keep track of dataset state. Before changing the 'sharenfs' or + * 'mountpoint' property, we record whether the filesystem was previously + * mounted/shared. This prior state dictates whether we remount/reshare the + * dataset after the property has been changed. + * + * The interface consists of the following sequence of functions: + * + * changelist_gather() + * changelist_prefix() + * < change property > + * changelist_postfix() + * changelist_free() + * + * Other interfaces: + * + * changelist_remove() - remove a node from a gathered list + * changelist_rename() - renames all datasets appropriately when doing a rename + * changelist_unshare() - unshares all the nodes in a given changelist + * changelist_haszonedchild() - check if there is any child exported to + * a local zone + */ +typedef struct prop_changenode { + zfs_handle_t *cn_handle; + int cn_shared; + int cn_mounted; + int cn_zoned; + boolean_t cn_needpost; /* is postfix() needed? */ + uu_avl_node_t cn_treenode; +} prop_changenode_t; + +struct prop_changelist { + zfs_prop_t cl_prop; + zfs_prop_t cl_realprop; + zfs_prop_t cl_shareprop; /* used with sharenfs/sharesmb */ + uu_avl_pool_t *cl_pool; + uu_avl_t *cl_tree; + boolean_t cl_waslegacy; + boolean_t cl_allchildren; + boolean_t cl_alldependents; + int cl_mflags; /* Mount flags */ + int cl_gflags; /* Gather request flags */ + boolean_t cl_haszonedchild; +}; + +/* + * If the property is 'mountpoint', go through and unmount filesystems as + * necessary. We don't do the same for 'sharenfs', because we can just re-share + * with different options without interrupting service. We do handle 'sharesmb' + * since there may be old resource names that need to be removed. + */ +int +changelist_prefix(prop_changelist_t *clp) +{ + prop_changenode_t *cn; + uu_avl_walk_t *walk; + int ret = 0; + boolean_t commit_smb_shares = B_FALSE; + + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && + clp->cl_prop != ZFS_PROP_SHARESMB) + return (0); + + if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) + return (-1); + + while ((cn = uu_avl_walk_next(walk)) != NULL) { + + /* if a previous loop failed, set the remaining to false */ + if (ret == -1) { + cn->cn_needpost = B_FALSE; + continue; + } + + /* + * If we are in the global zone, but this dataset is exported + * to a local zone, do nothing. 
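+		 * Datasets with the 'zoned' property set are administered
+		 * from inside the local zone, so the global zone leaves
+		 * their mounts alone.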
+ */ + if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) + continue; + + if (!ZFS_IS_VOLUME(cn->cn_handle)) { + /* + * Do the property specific processing. + */ + switch (clp->cl_prop) { + case ZFS_PROP_MOUNTPOINT: + if (zfs_unmount(cn->cn_handle, NULL, + clp->cl_mflags) != 0) { + ret = -1; + cn->cn_needpost = B_FALSE; + } + break; + case ZFS_PROP_SHARESMB: + (void) zfs_unshare_smb(cn->cn_handle, NULL); + commit_smb_shares = B_TRUE; + break; + + default: + break; + } + } + } + + if (commit_smb_shares) + zfs_commit_smb_shares(); + uu_avl_walk_end(walk); + + if (ret == -1) + (void) changelist_postfix(clp); + + return (ret); +} + +/* + * If the property is 'mountpoint' or 'sharenfs', go through and remount and/or + * reshare the filesystems as necessary. In changelist_gather() we recorded + * whether the filesystem was previously shared or mounted. The action we take + * depends on the previous state, and whether the value was previously 'legacy'. + * For non-legacy properties, we only remount/reshare the filesystem if it was + * previously mounted/shared. Otherwise, we always remount/reshare the + * filesystem. + */ +int +changelist_postfix(prop_changelist_t *clp) +{ + prop_changenode_t *cn; + uu_avl_walk_t *walk; + char shareopts[ZFS_MAXPROPLEN]; + int errors = 0; + boolean_t commit_smb_shares = B_FALSE; + boolean_t commit_nfs_shares = B_FALSE; + + /* + * If we're changing the mountpoint, attempt to destroy the underlying + * mountpoint. All other datasets will have inherited from this dataset + * (in which case their mountpoints exist in the filesystem in the new + * location), or have explicit mountpoints set (in which case they won't + * be in the changelist). + */ + if ((cn = uu_avl_last(clp->cl_tree)) == NULL) + return (0); + + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT) + remove_mountpoint(cn->cn_handle); + + /* + * We walk the datasets in reverse, because we want to mount any parent + * datasets before mounting the children. We walk all datasets even if + * there are errors. + */ + if ((walk = uu_avl_walk_start(clp->cl_tree, + UU_WALK_REVERSE | UU_WALK_ROBUST)) == NULL) + return (-1); + + while ((cn = uu_avl_walk_next(walk)) != NULL) { + + boolean_t sharenfs; + boolean_t sharesmb; + boolean_t mounted; + boolean_t needs_key; + + /* + * If we are in the global zone, but this dataset is exported + * to a local zone, do nothing. + */ + if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) + continue; + + /* Only do post-processing if it's required */ + if (!cn->cn_needpost) + continue; + cn->cn_needpost = B_FALSE; + + zfs_refresh_properties(cn->cn_handle); + + if (ZFS_IS_VOLUME(cn->cn_handle)) + continue; + + /* + * Remount if previously mounted or mountpoint was legacy, + * or sharenfs or sharesmb property is set. 
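+		 * A dataset whose encryption key is not loaded (needs_key)
+		 * is never remounted here, since the mount would fail
+		 * anyway.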
+ */ + sharenfs = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARENFS, + shareopts, sizeof (shareopts), NULL, NULL, 0, + B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); + + sharesmb = ((zfs_prop_get(cn->cn_handle, ZFS_PROP_SHARESMB, + shareopts, sizeof (shareopts), NULL, NULL, 0, + B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); + + needs_key = (zfs_prop_get_int(cn->cn_handle, + ZFS_PROP_KEYSTATUS) == ZFS_KEYSTATUS_UNAVAILABLE); + + mounted = zfs_is_mounted(cn->cn_handle, NULL); + + if (!mounted && !needs_key && (cn->cn_mounted || + ((sharenfs || sharesmb || clp->cl_waslegacy) && + (zfs_prop_get_int(cn->cn_handle, + ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { + + if (zfs_mount(cn->cn_handle, NULL, 0) != 0) + errors++; + else + mounted = TRUE; + } + + /* + * If the file system is mounted we always re-share even + * if the filesystem is currently shared, so that we can + * adopt any new options. + */ + if (sharenfs && mounted) { + errors += zfs_share_nfs(cn->cn_handle); + commit_nfs_shares = B_TRUE; + } else if (cn->cn_shared || clp->cl_waslegacy) { + errors += zfs_unshare_nfs(cn->cn_handle, NULL); + commit_nfs_shares = B_TRUE; + } + if (sharesmb && mounted) { + errors += zfs_share_smb(cn->cn_handle); + commit_smb_shares = B_TRUE; + } else if (cn->cn_shared || clp->cl_waslegacy) { + errors += zfs_unshare_smb(cn->cn_handle, NULL); + commit_smb_shares = B_TRUE; + } + } + if (commit_nfs_shares) + zfs_commit_nfs_shares(); + if (commit_smb_shares) + zfs_commit_smb_shares(); + uu_avl_walk_end(walk); + + return (errors ? -1 : 0); +} + +/* + * Is this "dataset" a child of "parent"? + */ +boolean_t +isa_child_of(const char *dataset, const char *parent) +{ + int len; + + len = strlen(parent); + + if (strncmp(dataset, parent, len) == 0 && + (dataset[len] == '@' || dataset[len] == '/' || + dataset[len] == '\0')) + return (B_TRUE); + else + return (B_FALSE); + +} + +/* + * If we rename a filesystem, child filesystem handles are no longer valid + * since we identify each dataset by its name in the ZFS namespace. As a + * result, we have to go through and fix up all the names appropriately. We + * could do this automatically if libzfs kept track of all open handles, but + * this is a lot less work. + */ +void +changelist_rename(prop_changelist_t *clp, const char *src, const char *dst) +{ + prop_changenode_t *cn; + uu_avl_walk_t *walk; + char newname[ZFS_MAX_DATASET_NAME_LEN]; + + if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) + return; + + while ((cn = uu_avl_walk_next(walk)) != NULL) { + /* + * Do not rename a clone that's not in the source hierarchy. + */ + if (!isa_child_of(cn->cn_handle->zfs_name, src)) + continue; + + /* + * Destroy the previous mountpoint if needed. + */ + remove_mountpoint(cn->cn_handle); + + (void) strlcpy(newname, dst, sizeof (newname)); + (void) strlcat(newname, cn->cn_handle->zfs_name + strlen(src), + sizeof (newname)); + + (void) strlcpy(cn->cn_handle->zfs_name, newname, + sizeof (cn->cn_handle->zfs_name)); + } + + uu_avl_walk_end(walk); +} + +/* + * Given a gathered changelist for the 'sharenfs' or 'sharesmb' property, + * unshare all the datasets in the list. 
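+ * Each dataset is unshared for the protocols listed in 'proto', and
+ * zfs_commit_proto() then publishes all of the share changes in a
+ * single batch.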
+ */ +int +changelist_unshare(prop_changelist_t *clp, zfs_share_proto_t *proto) +{ + prop_changenode_t *cn; + uu_avl_walk_t *walk; + int ret = 0; + + if (clp->cl_prop != ZFS_PROP_SHARENFS && + clp->cl_prop != ZFS_PROP_SHARESMB) + return (0); + + if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) + return (-1); + + while ((cn = uu_avl_walk_next(walk)) != NULL) { + if (zfs_unshare_proto(cn->cn_handle, NULL, proto) != 0) + ret = -1; + } + + zfs_commit_proto(proto); + uu_avl_walk_end(walk); + + return (ret); +} + +/* + * Check if there is any child exported to a local zone in a given changelist. + * This information has already been recorded while gathering the changelist + * via changelist_gather(). + */ +int +changelist_haszonedchild(prop_changelist_t *clp) +{ + return (clp->cl_haszonedchild); +} + +/* + * Remove a node from a gathered list. + */ +void +changelist_remove(prop_changelist_t *clp, const char *name) +{ + prop_changenode_t *cn; + uu_avl_walk_t *walk; + + if ((walk = uu_avl_walk_start(clp->cl_tree, UU_WALK_ROBUST)) == NULL) + return; + + while ((cn = uu_avl_walk_next(walk)) != NULL) { + if (strcmp(cn->cn_handle->zfs_name, name) == 0) { + uu_avl_remove(clp->cl_tree, cn); + zfs_close(cn->cn_handle); + free(cn); + uu_avl_walk_end(walk); + return; + } + } + + uu_avl_walk_end(walk); +} + +/* + * Release any memory associated with a changelist. + */ +void +changelist_free(prop_changelist_t *clp) +{ + prop_changenode_t *cn; + + if (clp->cl_tree) { + uu_avl_walk_t *walk; + + if ((walk = uu_avl_walk_start(clp->cl_tree, + UU_WALK_ROBUST)) == NULL) + return; + + while ((cn = uu_avl_walk_next(walk)) != NULL) { + uu_avl_remove(clp->cl_tree, cn); + zfs_close(cn->cn_handle); + free(cn); + } + + uu_avl_walk_end(walk); + uu_avl_destroy(clp->cl_tree); + } + if (clp->cl_pool) + uu_avl_pool_destroy(clp->cl_pool); + + free(clp); +} + +/* + * Add one dataset to changelist + */ +static int +changelist_add_mounted(zfs_handle_t *zhp, void *data) +{ + prop_changelist_t *clp = data; + prop_changenode_t *cn; + uu_avl_index_t idx; + + ASSERT3U(clp->cl_prop, ==, ZFS_PROP_MOUNTPOINT); + + if ((cn = zfs_alloc(zfs_get_handle(zhp), + sizeof (prop_changenode_t))) == NULL) { + zfs_close(zhp); + return (ENOMEM); + } + + cn->cn_handle = zhp; + cn->cn_mounted = zfs_is_mounted(zhp, NULL); + ASSERT3U(cn->cn_mounted, ==, B_TRUE); + cn->cn_shared = zfs_is_shared(zhp); + cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + cn->cn_needpost = B_TRUE; + + /* Indicate if any child is exported to a local zone. */ + if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) + clp->cl_haszonedchild = B_TRUE; + + uu_avl_node_init(cn, &cn->cn_treenode, clp->cl_pool); + + if (uu_avl_find(clp->cl_tree, cn, NULL, &idx) == NULL) { + uu_avl_insert(clp->cl_tree, cn, idx); + } else { + free(cn); + zfs_close(zhp); + } + + return (0); +} + +static int +change_one(zfs_handle_t *zhp, void *data) +{ + prop_changelist_t *clp = data; + char property[ZFS_MAXPROPLEN]; + char where[64]; + prop_changenode_t *cn = NULL; + zprop_source_t sourcetype = ZPROP_SRC_NONE; + zprop_source_t share_sourcetype = ZPROP_SRC_NONE; + int ret = 0; + + /* + * We only want to unmount/unshare those filesystems that may inherit + * from the target filesystem. If we find any filesystem with a + * locally set mountpoint, we ignore any children since changing the + * property will not affect them. If this is a rename, we iterate + * over all children regardless, since we need them unmounted in + * order to do the rename. 
Also, if this is a volume and we're doing + * a rename, then always add it to the changelist. + */ + + if (!(ZFS_IS_VOLUME(zhp) && clp->cl_realprop == ZFS_PROP_NAME) && + zfs_prop_get(zhp, clp->cl_prop, property, + sizeof (property), &sourcetype, where, sizeof (where), + B_FALSE) != 0) { + goto out; + } + + /* + * If we are "watching" sharenfs or sharesmb + * then check out the companion property which is tracked + * in cl_shareprop + */ + if (clp->cl_shareprop != ZPROP_INVAL && + zfs_prop_get(zhp, clp->cl_shareprop, property, + sizeof (property), &share_sourcetype, where, sizeof (where), + B_FALSE) != 0) { + goto out; + } + + if (clp->cl_alldependents || clp->cl_allchildren || + sourcetype == ZPROP_SRC_DEFAULT || + sourcetype == ZPROP_SRC_INHERITED || + (clp->cl_shareprop != ZPROP_INVAL && + (share_sourcetype == ZPROP_SRC_DEFAULT || + share_sourcetype == ZPROP_SRC_INHERITED))) { + if ((cn = zfs_alloc(zfs_get_handle(zhp), + sizeof (prop_changenode_t))) == NULL) { + ret = -1; + goto out; + } + + cn->cn_handle = zhp; + cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || + zfs_is_mounted(zhp, NULL); + cn->cn_shared = zfs_is_shared(zhp); + cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + cn->cn_needpost = B_TRUE; + + /* Indicate if any child is exported to a local zone. */ + if (getzoneid() == GLOBAL_ZONEID && cn->cn_zoned) + clp->cl_haszonedchild = B_TRUE; + + uu_avl_node_init(cn, &cn->cn_treenode, clp->cl_pool); + + uu_avl_index_t idx; + + if (uu_avl_find(clp->cl_tree, cn, NULL, &idx) == NULL) { + uu_avl_insert(clp->cl_tree, cn, idx); + } else { + free(cn); + cn = NULL; + } + + if (!clp->cl_alldependents) + ret = zfs_iter_children(zhp, change_one, data); + + /* + * If we added the handle to the changelist, we will re-use it + * later so return without closing it. + */ + if (cn != NULL) + return (ret); + } + +out: + zfs_close(zhp); + return (ret); +} + +static int +compare_props(const void *a, const void *b, zfs_prop_t prop) +{ + const prop_changenode_t *ca = a; + const prop_changenode_t *cb = b; + + char propa[MAXPATHLEN]; + char propb[MAXPATHLEN]; + + boolean_t haspropa, haspropb; + + haspropa = (zfs_prop_get(ca->cn_handle, prop, propa, sizeof (propa), + NULL, NULL, 0, B_FALSE) == 0); + haspropb = (zfs_prop_get(cb->cn_handle, prop, propb, sizeof (propb), + NULL, NULL, 0, B_FALSE) == 0); + + if (!haspropa && haspropb) + return (-1); + else if (haspropa && !haspropb) + return (1); + else if (!haspropa && !haspropb) + return (0); + else + return (strcmp(propb, propa)); +} + +/*ARGSUSED*/ +static int +compare_mountpoints(const void *a, const void *b, void *unused) +{ + /* + * When unsharing or unmounting filesystems, we need to do it in + * mountpoint order. This allows the user to have a mountpoint + * hierarchy that is different from the dataset hierarchy, and still + * allow it to be changed. + */ + return (compare_props(a, b, ZFS_PROP_MOUNTPOINT)); +} + +/*ARGSUSED*/ +static int +compare_dataset_names(const void *a, const void *b, void *unused) +{ + return (compare_props(a, b, ZFS_PROP_NAME)); +} + +/* + * Given a ZFS handle and a property, construct a complete list of datasets + * that need to be modified as part of this process. For anything but the + * 'mountpoint' and 'sharenfs' properties, this just returns an empty list. + * Otherwise, we iterate over all children and look for any datasets that + * inherit the property. For each such dataset, we add it to the list and + * mark whether it was shared beforehand. 
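+ *
+ * A sketch of the usual calling sequence (the property-set and rename paths
+ * follow this shape; changelist_prefix(), defined earlier in this file, is
+ * the companion that unmounts/unshares before the change is made):
+ *
+ *	prop_changelist_t *clp;
+ *
+ *	if ((clp = changelist_gather(zhp, prop, 0, 0)) == NULL)
+ *		return (-1);
+ *	if (changelist_prefix(clp) == 0) {
+ *		... apply the property change or rename ...
+ *		(void) changelist_postfix(clp);
+ *	}
+ *	changelist_free(clp);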
+ */ +prop_changelist_t * +changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, + int mnt_flags) +{ + prop_changelist_t *clp; + prop_changenode_t *cn; + zfs_handle_t *temp; + char property[ZFS_MAXPROPLEN]; + boolean_t legacy = B_FALSE; + + if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) + return (NULL); + + /* + * For mountpoint-related tasks, we want to sort everything by + * mountpoint, so that we mount and unmount them in the appropriate + * order, regardless of their position in the hierarchy. + */ + if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || + prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || + prop == ZFS_PROP_SHARESMB) { + + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + property, sizeof (property), + NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + legacy = B_TRUE; + } + } + + clp->cl_pool = uu_avl_pool_create("changelist_pool", + sizeof (prop_changenode_t), + offsetof(prop_changenode_t, cn_treenode), + legacy ? compare_dataset_names : compare_mountpoints, 0); + if (clp->cl_pool == NULL) { + assert(uu_error() == UU_ERROR_NO_MEMORY); + (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); + changelist_free(clp); + return (NULL); + } + + clp->cl_tree = uu_avl_create(clp->cl_pool, NULL, UU_DEFAULT); + clp->cl_gflags = gather_flags; + clp->cl_mflags = mnt_flags; + + if (clp->cl_tree == NULL) { + assert(uu_error() == UU_ERROR_NO_MEMORY); + (void) zfs_error(zhp->zfs_hdl, EZFS_NOMEM, "internal error"); + changelist_free(clp); + return (NULL); + } + + /* + * If this is a rename or the 'zoned' property, we pretend we're + * changing the mountpoint and flag it so we can catch all children in + * change_one(). + * + * Flag cl_alldependents to catch all children plus the dependents + * (clones) that are not in the hierarchy. + */ + if (prop == ZFS_PROP_NAME) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + clp->cl_alldependents = B_TRUE; + } else if (prop == ZFS_PROP_ZONED) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + clp->cl_allchildren = B_TRUE; + } else if (prop == ZFS_PROP_CANMOUNT) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + } else if (prop == ZFS_PROP_VOLSIZE) { + clp->cl_prop = ZFS_PROP_MOUNTPOINT; + } else { + clp->cl_prop = prop; + } + clp->cl_realprop = prop; + + if (clp->cl_prop != ZFS_PROP_MOUNTPOINT && + clp->cl_prop != ZFS_PROP_SHARENFS && + clp->cl_prop != ZFS_PROP_SHARESMB) + return (clp); + + /* + * If watching SHARENFS or SHARESMB then + * also watch its companion property. + */ + if (clp->cl_prop == ZFS_PROP_SHARENFS) + clp->cl_shareprop = ZFS_PROP_SHARESMB; + else if (clp->cl_prop == ZFS_PROP_SHARESMB) + clp->cl_shareprop = ZFS_PROP_SHARENFS; + + if (clp->cl_prop == ZFS_PROP_MOUNTPOINT && + (clp->cl_gflags & CL_GATHER_ITER_MOUNTED)) { + /* + * Instead of iterating through all of the dataset children we + * gather mounted dataset children from MNTTAB + */ + if (zfs_iter_mounted(zhp, changelist_add_mounted, clp) != 0) { + changelist_free(clp); + return (NULL); + } + } else if (clp->cl_alldependents) { + if (zfs_iter_dependents(zhp, B_TRUE, change_one, clp) != 0) { + changelist_free(clp); + return (NULL); + } + } else if (zfs_iter_children(zhp, change_one, clp) != 0) { + changelist_free(clp); + return (NULL); + } + + /* + * We have to re-open ourselves because we auto-close all the handles + * and can't tell the difference. 
+ */ + if ((temp = zfs_open(zhp->zfs_hdl, zfs_get_name(zhp), + ZFS_TYPE_DATASET)) == NULL) { + changelist_free(clp); + return (NULL); + } + + /* + * Always add ourself to the list. We add ourselves to the end so that + * we're the last to be unmounted. + */ + if ((cn = zfs_alloc(zhp->zfs_hdl, + sizeof (prop_changenode_t))) == NULL) { + zfs_close(temp); + changelist_free(clp); + return (NULL); + } + + cn->cn_handle = temp; + cn->cn_mounted = (clp->cl_gflags & CL_GATHER_MOUNT_ALWAYS) || + zfs_is_mounted(temp, NULL); + cn->cn_shared = zfs_is_shared(temp); + cn->cn_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + cn->cn_needpost = B_TRUE; + + uu_avl_node_init(cn, &cn->cn_treenode, clp->cl_pool); + uu_avl_index_t idx; + if (uu_avl_find(clp->cl_tree, cn, NULL, &idx) == NULL) { + uu_avl_insert(clp->cl_tree, cn, idx); + } else { + free(cn); + zfs_close(temp); + } + + /* + * If the mountpoint property was previously 'legacy', or 'none', + * record it as the behavior of changelist_postfix() will be different. + */ + if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { + /* + * do not automatically mount ex-legacy datasets if + * we specifically set canmount to noauto + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) != + ZFS_CANMOUNT_NOAUTO) + clp->cl_waslegacy = B_TRUE; + } + + return (clp); +} diff --git a/lib/libzfs/libzfs_config.c b/lib/libzfs/libzfs_config.c new file mode 100644 index 000000000000..a3ecc4a327dc --- /dev/null +++ b/lib/libzfs/libzfs_config.c @@ -0,0 +1,457 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + * Copyright (c) 2015 by Syneto S.R.L. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. + */ + +/* + * The pool configuration repository is stored in /etc/zfs/zpool.cache as a + * single packed nvlist. While it would be nice to just read in this + * file from userland, this wouldn't work from a local zone. So we have to have + * a zpool ioctl to return the complete configuration for all pools. In the + * global zone, this will be identical to reading the file and unpacking it in + * userland. 
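+ *
+ * For illustration only, the userland equivalent of unpacking the cache file
+ * would look roughly like this ('buf'/'buflen' are assumed to hold the raw
+ * contents of /etc/zfs/zpool.cache; error handling omitted):
+ *
+ *	nvlist_t *config;
+ *
+ *	if (nvlist_unpack(buf, buflen, &config, 0) != 0)
+ *		return (-1);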
+ */
+
+#include <errno.h>
+#include <libintl.h>
+#include <libuutil.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/stat.h>
+
+#include "libzfs_impl.h"
+
+typedef struct config_node {
+	char *cn_name;
+	nvlist_t *cn_config;
+	uu_avl_node_t cn_avl;
+} config_node_t;
+
+/* ARGSUSED */
+static int
+config_node_compare(const void *a, const void *b, void *unused)
+{
+	int ret;
+
+	const config_node_t *ca = (config_node_t *)a;
+	const config_node_t *cb = (config_node_t *)b;
+
+	ret = strcmp(ca->cn_name, cb->cn_name);
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+void
+namespace_clear(libzfs_handle_t *hdl)
+{
+	if (hdl->libzfs_ns_avl) {
+		config_node_t *cn;
+		void *cookie = NULL;
+
+		while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl,
+		    &cookie)) != NULL) {
+			nvlist_free(cn->cn_config);
+			free(cn->cn_name);
+			free(cn);
+		}
+
+		uu_avl_destroy(hdl->libzfs_ns_avl);
+		hdl->libzfs_ns_avl = NULL;
+	}
+
+	if (hdl->libzfs_ns_avlpool) {
+		uu_avl_pool_destroy(hdl->libzfs_ns_avlpool);
+		hdl->libzfs_ns_avlpool = NULL;
+	}
+}
+
+/*
+ * Loads the pool namespace, or re-loads it if the cache has changed.
+ */
+static int
+namespace_reload(libzfs_handle_t *hdl)
+{
+	nvlist_t *config;
+	config_node_t *cn;
+	nvpair_t *elem;
+	zfs_cmd_t zc = {"\0"};
+	void *cookie;
+
+	if (hdl->libzfs_ns_gen == 0) {
+		/*
+		 * This is the first time we've accessed the configuration
+		 * cache.  Initialize the AVL tree and then fall through to
+		 * the common code.
+		 */
+		if ((hdl->libzfs_ns_avlpool = uu_avl_pool_create("config_pool",
+		    sizeof (config_node_t),
+		    offsetof(config_node_t, cn_avl),
+		    config_node_compare, UU_DEFAULT)) == NULL)
+			return (no_memory(hdl));
+
+		if ((hdl->libzfs_ns_avl = uu_avl_create(hdl->libzfs_ns_avlpool,
+		    NULL, UU_DEFAULT)) == NULL)
+			return (no_memory(hdl));
+	}
+
+	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)
+		return (-1);
+
+	for (;;) {
+		zc.zc_cookie = hdl->libzfs_ns_gen;
+		if (zfs_ioctl(hdl, ZFS_IOC_POOL_CONFIGS, &zc) != 0) {
+			switch (errno) {
+			case EEXIST:
+				/*
+				 * The namespace hasn't changed.
+				 */
+				zcmd_free_nvlists(&zc);
+				return (0);
+
+			case ENOMEM:
+				if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+					zcmd_free_nvlists(&zc);
+					return (-1);
+				}
+				break;
+
+			default:
+				zcmd_free_nvlists(&zc);
+				return (zfs_standard_error(hdl, errno,
+				    dgettext(TEXT_DOMAIN, "failed to read "
+				    "pool configuration")));
+			}
+		} else {
+			hdl->libzfs_ns_gen = zc.zc_cookie;
+			break;
+		}
+	}
+
+	if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) {
+		zcmd_free_nvlists(&zc);
+		return (-1);
+	}
+
+	zcmd_free_nvlists(&zc);
+
+	/*
+	 * Clear out any existing configuration information.
+	 */
+	cookie = NULL;
+	while ((cn = uu_avl_teardown(hdl->libzfs_ns_avl, &cookie)) != NULL) {
+		nvlist_free(cn->cn_config);
+		free(cn->cn_name);
+		free(cn);
+	}
+
+	elem = NULL;
+	while ((elem = nvlist_next_nvpair(config, elem)) != NULL) {
+		nvlist_t *child;
+		uu_avl_index_t where;
+
+		if ((cn = zfs_alloc(hdl, sizeof (config_node_t))) == NULL) {
+			nvlist_free(config);
+			return (-1);
+		}
+
+		if ((cn->cn_name = zfs_strdup(hdl,
+		    nvpair_name(elem))) == NULL) {
+			free(cn);
+			nvlist_free(config);
+			return (-1);
+		}
+
+		verify(nvpair_value_nvlist(elem, &child) == 0);
+		if (nvlist_dup(child, &cn->cn_config, 0) != 0) {
+			free(cn->cn_name);
+			free(cn);
+			nvlist_free(config);
+			return (no_memory(hdl));
+		}
+		verify(uu_avl_find(hdl->libzfs_ns_avl, cn, NULL, &where)
+		    == NULL);
+
+		uu_avl_insert(hdl->libzfs_ns_avl, cn, where);
+	}
+
+	nvlist_free(config);
+	return (0);
+}
+
+/*
+ * Retrieve the configuration for the given pool.
The configuration is an nvlist + * describing the vdevs, as well as the statistics associated with each one. + */ +nvlist_t * +zpool_get_config(zpool_handle_t *zhp, nvlist_t **oldconfig) +{ + if (oldconfig) + *oldconfig = zhp->zpool_old_config; + return (zhp->zpool_config); +} + +/* + * Retrieves a list of enabled features and their refcounts and caches it in + * the pool handle. + */ +nvlist_t * +zpool_get_features(zpool_handle_t *zhp) +{ + nvlist_t *config, *features; + + config = zpool_get_config(zhp, NULL); + + if (config == NULL || !nvlist_exists(config, + ZPOOL_CONFIG_FEATURE_STATS)) { + int error; + boolean_t missing = B_FALSE; + + error = zpool_refresh_stats(zhp, &missing); + + if (error != 0 || missing) + return (NULL); + + config = zpool_get_config(zhp, NULL); + } + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + &features) != 0) + return (NULL); + + return (features); +} + +/* + * Refresh the vdev statistics associated with the given pool. This is used in + * iostat to show configuration changes and determine the delta from the last + * time the function was called. This function can fail, in case the pool has + * been destroyed. + */ +int +zpool_refresh_stats(zpool_handle_t *zhp, boolean_t *missing) +{ + zfs_cmd_t zc = {"\0"}; + int error; + nvlist_t *config; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + *missing = B_FALSE; + (void) strcpy(zc.zc_name, zhp->zpool_name); + + if (zhp->zpool_config_size == 0) + zhp->zpool_config_size = 1 << 16; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size) != 0) + return (-1); + + for (;;) { + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_STATS, + &zc) == 0) { + /* + * The real error is returned in the zc_cookie field. + */ + error = zc.zc_cookie; + break; + } + + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } else { + zcmd_free_nvlists(&zc); + if (errno == ENOENT || errno == EINVAL) + *missing = B_TRUE; + zhp->zpool_state = POOL_STATE_UNAVAIL; + return (0); + } + } + + if (zcmd_read_dst_nvlist(hdl, &zc, &config) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + + zcmd_free_nvlists(&zc); + + zhp->zpool_config_size = zc.zc_nvlist_dst_size; + + if (zhp->zpool_config != NULL) { + nvlist_free(zhp->zpool_old_config); + + zhp->zpool_old_config = zhp->zpool_config; + } + + zhp->zpool_config = config; + if (error) + zhp->zpool_state = POOL_STATE_UNAVAIL; + else + zhp->zpool_state = POOL_STATE_ACTIVE; + + return (0); +} + +/* + * The following environment variables are undocumented + * and should be used for testing purposes only: + * + * __ZFS_POOL_EXCLUDE - don't iterate over the pools it lists + * __ZFS_POOL_RESTRICT - iterate only over the pools it lists + * + * This function returns B_TRUE if the pool should be skipped + * during iteration. + */ +boolean_t +zpool_skip_pool(const char *poolname) +{ + static boolean_t initialized = B_FALSE; + static const char *exclude = NULL; + static const char *restricted = NULL; + + const char *cur, *end; + int len; + int namelen = strlen(poolname); + + if (!initialized) { + initialized = B_TRUE; + exclude = getenv("__ZFS_POOL_EXCLUDE"); + restricted = getenv("__ZFS_POOL_RESTRICT"); + } + + if (exclude != NULL) { + cur = exclude; + do { + end = strchr(cur, ' '); + len = (NULL == end) ? 
+			    strlen(cur) : (end - cur);
+			if (len == namelen && 0 == strncmp(cur, poolname, len))
+				return (B_TRUE);
+			cur += (len + 1);
+		} while (NULL != end);
+	}
+
+	if (NULL == restricted)
+		return (B_FALSE);
+
+	cur = restricted;
+	do {
+		end = strchr(cur, ' ');
+		len = (NULL == end) ? strlen(cur) : (end - cur);
+
+		if (len == namelen && 0 == strncmp(cur, poolname, len)) {
+			return (B_FALSE);
+		}
+
+		cur += (len + 1);
+	} while (NULL != end);
+
+	return (B_TRUE);
+}
+
+/*
+ * Iterate over all pools in the system.
+ */
+int
+zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data)
+{
+	config_node_t *cn;
+	zpool_handle_t *zhp;
+	int ret;
+
+	/*
+	 * If someone makes a recursive call to zpool_iter(), we want to avoid
+	 * refreshing the namespace because that will invalidate the parent
+	 * context.  We allow recursive calls, but simply re-use the same
+	 * namespace AVL tree.
+	 */
+	if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0)
+		return (-1);
+
+	hdl->libzfs_pool_iter++;
+	for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
+	    cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
+
+		if (zpool_skip_pool(cn->cn_name))
+			continue;
+
+		if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) {
+			hdl->libzfs_pool_iter--;
+			return (-1);
+		}
+
+		if (zhp == NULL)
+			continue;
+
+		if ((ret = func(zhp, data)) != 0) {
+			hdl->libzfs_pool_iter--;
+			return (ret);
+		}
+	}
+	hdl->libzfs_pool_iter--;
+
+	return (0);
+}
+
+/*
+ * Iterate over root datasets, calling the given function for each.  The zfs
+ * handle passed each time must be explicitly closed by the callback.
+ */
+int
+zfs_iter_root(libzfs_handle_t *hdl, zfs_iter_f func, void *data)
+{
+	config_node_t *cn;
+	zfs_handle_t *zhp;
+	int ret;
+
+	if (namespace_reload(hdl) != 0)
+		return (-1);
+
+	for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL;
+	    cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) {
+
+		if (zpool_skip_pool(cn->cn_name))
+			continue;
+
+		if ((zhp = make_dataset_handle(hdl, cn->cn_name)) == NULL)
+			continue;
+
+		if ((ret = func(zhp, data)) != 0)
+			return (ret);
+	}
+
+	return (0);
+}
diff --git a/lib/libzfs/libzfs_crypto.c b/lib/libzfs/libzfs_crypto.c
new file mode 100644
index 000000000000..1a2ee638aef3
--- /dev/null
+++ b/lib/libzfs/libzfs_crypto.c
@@ -0,0 +1,1621 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/dsl_crypt.h>
+#include <libintl.h>
+#include <termios.h>
+#include <signal.h>
+#include <errno.h>
+#include <openssl/evp.h>
+#include <libzfs.h>
+#include "libzfs_impl.h"
+#include "zfeature_common.h"
+
+/*
+ * User keys are used to decrypt the master encryption keys of a dataset. This
+ * indirection allows a user to change his / her access key without having to
+ * re-encrypt the entire dataset. User keys can be provided in one of several
+ * ways. Raw keys are simply given to the kernel as is. Similarly, hex keys
+ * are converted to binary and passed into the kernel. Password based keys are
+ * a bit more complicated. Passwords alone do not provide suitable entropy for
+ * encryption and may be too short or too long to be used.
In order to derive + * a more appropriate key we use a PBKDF2 function. This function is designed + * to take a (relatively) long time to calculate in order to discourage + * attackers from guessing from a list of common passwords. PBKDF2 requires + * 2 additional parameters. The first is the number of iterations to run, which + * will ultimately determine how long it takes to derive the resulting key from + * the password. The second parameter is a salt that is randomly generated for + * each dataset. The salt is used to "tweak" PBKDF2 such that a group of + * attackers cannot reasonably generate a table of commonly known passwords to + * their output keys and expect it work for all past and future PBKDF2 users. + * We store the salt as a hidden property of the dataset (although it is + * technically ok if the salt is known to the attacker). + */ + +typedef enum key_locator { + KEY_LOCATOR_NONE, + KEY_LOCATOR_PROMPT, + KEY_LOCATOR_URI +} key_locator_t; + +#define MIN_PASSPHRASE_LEN 8 +#define MAX_PASSPHRASE_LEN 512 +#define MAX_KEY_PROMPT_ATTEMPTS 3 + +static int caught_interrupt; + +static int get_key_material_file(libzfs_handle_t *, const char *, const char *, + zfs_keyformat_t, boolean_t, uint8_t **, size_t *); + +static zfs_uri_handler_t uri_handlers[] = { + { "file", get_key_material_file }, + { NULL, NULL } +}; + +static int +pkcs11_get_urandom(uint8_t *buf, size_t bytes) +{ + int rand; + ssize_t bytes_read = 0; + + rand = open("/dev/urandom", O_RDONLY); + + if (rand < 0) + return (rand); + + while (bytes_read < bytes) { + ssize_t rc = read(rand, buf + bytes_read, bytes - bytes_read); + if (rc < 0) + break; + bytes_read += rc; + } + + (void) close(rand); + + return (bytes_read); +} + +static int +zfs_prop_parse_keylocation(libzfs_handle_t *restrict hdl, const char *str, + zfs_keylocation_t *restrict locp, char **restrict schemep) +{ + *locp = ZFS_KEYLOCATION_NONE; + *schemep = NULL; + + if (strcmp("prompt", str) == 0) { + *locp = ZFS_KEYLOCATION_PROMPT; + return (0); + } + + regmatch_t pmatch[2]; + + if (regexec(&hdl->libzfs_urire, str, ARRAY_SIZE(pmatch), + pmatch, 0) == 0) { + size_t scheme_len; + + if (pmatch[1].rm_so == -1) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid URI")); + return (EINVAL); + } + + scheme_len = pmatch[1].rm_eo - pmatch[1].rm_so; + + *schemep = calloc(1, scheme_len + 1); + if (*schemep == NULL) { + int ret = errno; + + errno = 0; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid URI")); + return (ret); + } + + (void) memcpy(*schemep, str + pmatch[1].rm_so, scheme_len); + *locp = ZFS_KEYLOCATION_URI; + return (0); + } + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Invalid keylocation")); + return (EINVAL); +} + +static int +hex_key_to_raw(char *hex, int hexlen, uint8_t *out) +{ + int ret, i; + unsigned int c; + + for (i = 0; i < hexlen; i += 2) { + if (!isxdigit(hex[i]) || !isxdigit(hex[i + 1])) { + ret = EINVAL; + goto error; + } + + ret = sscanf(&hex[i], "%02x", &c); + if (ret != 1) { + ret = EINVAL; + goto error; + } + + out[i / 2] = c; + } + + return (0); + +error: + return (ret); +} + + +static void +catch_signal(int sig) +{ + caught_interrupt = sig; +} + +static const char * +get_format_prompt_string(zfs_keyformat_t format) +{ + switch (format) { + case ZFS_KEYFORMAT_RAW: + return ("raw key"); + case ZFS_KEYFORMAT_HEX: + return ("hex key"); + case ZFS_KEYFORMAT_PASSPHRASE: + return ("passphrase"); + default: + /* shouldn't happen */ + return (NULL); + } +} + +/* do basic validation of the key material */ +static int 
+validate_key(libzfs_handle_t *hdl, zfs_keyformat_t keyformat, + const char *key, size_t keylen) +{ + switch (keyformat) { + case ZFS_KEYFORMAT_RAW: + /* verify the key length is correct */ + if (keylen < WRAPPING_KEY_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Raw key too short (expected %u)."), + WRAPPING_KEY_LEN); + return (EINVAL); + } + + if (keylen > WRAPPING_KEY_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Raw key too long (expected %u)."), + WRAPPING_KEY_LEN); + return (EINVAL); + } + break; + case ZFS_KEYFORMAT_HEX: + /* verify the key length is correct */ + if (keylen < WRAPPING_KEY_LEN * 2) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Hex key too short (expected %u)."), + WRAPPING_KEY_LEN * 2); + return (EINVAL); + } + + if (keylen > WRAPPING_KEY_LEN * 2) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Hex key too long (expected %u)."), + WRAPPING_KEY_LEN * 2); + return (EINVAL); + } + + /* check for invalid hex digits */ + for (size_t i = 0; i < WRAPPING_KEY_LEN * 2; i++) { + if (!isxdigit(key[i])) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid hex character detected.")); + return (EINVAL); + } + } + break; + case ZFS_KEYFORMAT_PASSPHRASE: + /* verify the length is within bounds */ + if (keylen > MAX_PASSPHRASE_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Passphrase too long (max %u)."), + MAX_PASSPHRASE_LEN); + return (EINVAL); + } + + if (keylen < MIN_PASSPHRASE_LEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Passphrase too short (min %u)."), + MIN_PASSPHRASE_LEN); + return (EINVAL); + } + break; + default: + /* can't happen, checked above */ + break; + } + + return (0); +} + +static int +libzfs_getpassphrase(zfs_keyformat_t keyformat, boolean_t is_reenter, + boolean_t new_key, const char *fsname, + char **restrict res, size_t *restrict reslen) +{ + FILE *f = stdin; + size_t buflen = 0; + ssize_t bytes; + int ret = 0; + struct termios old_term, new_term; + struct sigaction act, osigint, osigtstp; + + *res = NULL; + *reslen = 0; + + /* + * handle SIGINT and ignore SIGSTP. This is necessary to + * restore the state of the terminal. + */ + caught_interrupt = 0; + act.sa_flags = 0; + (void) sigemptyset(&act.sa_mask); + act.sa_handler = catch_signal; + + (void) sigaction(SIGINT, &act, &osigint); + act.sa_handler = SIG_IGN; + (void) sigaction(SIGTSTP, &act, &osigtstp); + + (void) printf("%s %s%s", + is_reenter ? "Re-enter" : "Enter", + new_key ? 
"new " : "", + get_format_prompt_string(keyformat)); + if (fsname != NULL) + (void) printf(" for '%s'", fsname); + (void) fputc(':', stdout); + (void) fflush(stdout); + + /* disable the terminal echo for key input */ + (void) tcgetattr(fileno(f), &old_term); + + new_term = old_term; + new_term.c_lflag &= ~(ECHO | ECHOE | ECHOK | ECHONL); + + ret = tcsetattr(fileno(f), TCSAFLUSH, &new_term); + if (ret != 0) { + ret = errno; + errno = 0; + goto out; + } + + bytes = getline(res, &buflen, f); + if (bytes < 0) { + ret = errno; + errno = 0; + goto out; + } + + /* trim the ending newline if it exists */ + if (bytes > 0 && (*res)[bytes - 1] == '\n') { + (*res)[bytes - 1] = '\0'; + bytes--; + } + + *reslen = bytes; + +out: + /* reset the terminal */ + (void) tcsetattr(fileno(f), TCSAFLUSH, &old_term); + (void) sigaction(SIGINT, &osigint, NULL); + (void) sigaction(SIGTSTP, &osigtstp, NULL); + + /* if we caught a signal, re-throw it now */ + if (caught_interrupt != 0) + (void) kill(getpid(), caught_interrupt); + + /* print the newline that was not echo'd */ + (void) printf("\n"); + + return (ret); +} + +static int +get_key_interactive(libzfs_handle_t *restrict hdl, const char *fsname, + zfs_keyformat_t keyformat, boolean_t confirm_key, boolean_t newkey, + uint8_t **restrict outbuf, size_t *restrict len_out) +{ + char *buf = NULL, *buf2 = NULL; + size_t buflen = 0, buf2len = 0; + int ret = 0; + + ASSERT(isatty(fileno(stdin))); + + /* raw keys cannot be entered on the terminal */ + if (keyformat == ZFS_KEYFORMAT_RAW) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Cannot enter raw keys on the terminal")); + goto out; + } + + /* prompt for the key */ + if ((ret = libzfs_getpassphrase(keyformat, B_FALSE, newkey, fsname, + &buf, &buflen)) != 0) { + free(buf); + buf = NULL; + buflen = 0; + goto out; + } + + if (!confirm_key) + goto out; + + if ((ret = validate_key(hdl, keyformat, buf, buflen)) != 0) { + free(buf); + return (ret); + } + + ret = libzfs_getpassphrase(keyformat, B_TRUE, newkey, fsname, &buf2, + &buf2len); + if (ret != 0) { + free(buf); + free(buf2); + buf = buf2 = NULL; + buflen = buf2len = 0; + goto out; + } + + if (buflen != buf2len || strcmp(buf, buf2) != 0) { + free(buf); + buf = NULL; + buflen = 0; + + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Provided keys do not match.")); + } + + free(buf2); + +out: + *outbuf = (uint8_t *)buf; + *len_out = buflen; + return (ret); +} + +static int +get_key_material_raw(FILE *fd, zfs_keyformat_t keyformat, + uint8_t **buf, size_t *len_out) +{ + int ret = 0; + size_t buflen = 0; + + *len_out = 0; + + /* read the key material */ + if (keyformat != ZFS_KEYFORMAT_RAW) { + ssize_t bytes; + + bytes = getline((char **)buf, &buflen, fd); + if (bytes < 0) { + ret = errno; + errno = 0; + goto out; + } + + /* trim the ending newline if it exists */ + if (bytes > 0 && (*buf)[bytes - 1] == '\n') { + (*buf)[bytes - 1] = '\0'; + bytes--; + } + + *len_out = bytes; + } else { + size_t n; + + /* + * Raw keys may have newline characters in them and so can't + * use getline(). Here we attempt to read 33 bytes so that we + * can properly check the key length (the file should only have + * 32 bytes). 
+ */ + *buf = malloc((WRAPPING_KEY_LEN + 1) * sizeof (uint8_t)); + if (*buf == NULL) { + ret = ENOMEM; + goto out; + } + + n = fread(*buf, 1, WRAPPING_KEY_LEN + 1, fd); + if (n == 0 || ferror(fd)) { + /* size errors are handled by the calling function */ + free(*buf); + *buf = NULL; + ret = errno; + errno = 0; + goto out; + } + + *len_out = n; + } +out: + return (ret); +} + +static int +get_key_material_file(libzfs_handle_t *hdl, const char *uri, + const char *fsname, zfs_keyformat_t keyformat, boolean_t newkey, + uint8_t **restrict buf, size_t *restrict len_out) +{ + FILE *f = NULL; + int ret = 0; + + if (strlen(uri) < 7) + return (EINVAL); + + if ((f = fopen(uri + 7, "r")) == NULL) { + ret = errno; + errno = 0; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to open key material file")); + return (ret); + } + + ret = get_key_material_raw(f, keyformat, buf, len_out); + + (void) fclose(f); + + return (ret); +} + +/* + * Attempts to fetch key material, no matter where it might live. The key + * material is allocated and returned in km_out. *can_retry_out will be set + * to B_TRUE if the user is providing the key material interactively, allowing + * for re-entry attempts. + */ +static int +get_key_material(libzfs_handle_t *hdl, boolean_t do_verify, boolean_t newkey, + zfs_keyformat_t keyformat, char *keylocation, const char *fsname, + uint8_t **km_out, size_t *kmlen_out, boolean_t *can_retry_out) +{ + int ret; + zfs_keylocation_t keyloc = ZFS_KEYLOCATION_NONE; + uint8_t *km = NULL; + size_t kmlen = 0; + char *uri_scheme = NULL; + zfs_uri_handler_t *handler = NULL; + boolean_t can_retry = B_FALSE; + + /* verify and parse the keylocation */ + ret = zfs_prop_parse_keylocation(hdl, keylocation, &keyloc, + &uri_scheme); + if (ret != 0) + goto error; + + /* open the appropriate file descriptor */ + switch (keyloc) { + case ZFS_KEYLOCATION_PROMPT: + if (isatty(fileno(stdin))) { + can_retry = B_TRUE; + ret = get_key_interactive(hdl, fsname, keyformat, + do_verify, newkey, &km, &kmlen); + } else { + /* fetch the key material into the buffer */ + ret = get_key_material_raw(stdin, keyformat, &km, + &kmlen); + } + + if (ret != 0) + goto error; + + break; + case ZFS_KEYLOCATION_URI: + for (handler = uri_handlers; handler->zuh_scheme != NULL; + handler++) { + if (strcmp(handler->zuh_scheme, uri_scheme) != 0) + continue; + + if ((ret = handler->zuh_handler(hdl, keylocation, + fsname, keyformat, newkey, &km, &kmlen)) != 0) + goto error; + + break; + } + + ret = ENOTSUP; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "URI scheme is not supported")); + + break; + default: + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid keylocation.")); + goto error; + } + + if ((ret = validate_key(hdl, keyformat, (const char *)km, kmlen)) != 0) + goto error; + + *km_out = km; + *kmlen_out = kmlen; + if (can_retry_out != NULL) + *can_retry_out = can_retry; + + free(uri_scheme); + return (0); + +error: + free(km); + + *km_out = NULL; + *kmlen_out = 0; + + if (can_retry_out != NULL) + *can_retry_out = can_retry; + + free(uri_scheme); + return (ret); +} + +static int +derive_key(libzfs_handle_t *hdl, zfs_keyformat_t format, uint64_t iters, + uint8_t *key_material, size_t key_material_len, uint64_t salt, + uint8_t **key_out) +{ + int ret; + uint8_t *key; + + *key_out = NULL; + + key = zfs_alloc(hdl, WRAPPING_KEY_LEN); + if (!key) + return (ENOMEM); + + switch (format) { + case ZFS_KEYFORMAT_RAW: + bcopy(key_material, key, WRAPPING_KEY_LEN); + break; + case ZFS_KEYFORMAT_HEX: + ret = hex_key_to_raw((char 
*)key_material, + WRAPPING_KEY_LEN * 2, key); + if (ret != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Invalid hex key provided.")); + goto error; + } + break; + case ZFS_KEYFORMAT_PASSPHRASE: + salt = LE_64(salt); + + ret = PKCS5_PBKDF2_HMAC_SHA1((char *)key_material, + strlen((char *)key_material), ((uint8_t *)&salt), + sizeof (uint64_t), iters, WRAPPING_KEY_LEN, key); + if (ret != 1) { + ret = EIO; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to generate key from passphrase.")); + goto error; + } + break; + default: + ret = EINVAL; + goto error; + } + + *key_out = key; + return (0); + +error: + free(key); + + *key_out = NULL; + return (ret); +} + +static boolean_t +encryption_feature_is_enabled(zpool_handle_t *zph) +{ + nvlist_t *features; + uint64_t feat_refcount; + + /* check that features can be enabled */ + if (zpool_get_prop_int(zph, ZPOOL_PROP_VERSION, NULL) + < SPA_VERSION_FEATURES) + return (B_FALSE); + + /* check for crypto feature */ + features = zpool_get_features(zph); + if (!features || nvlist_lookup_uint64(features, + spa_feature_table[SPA_FEATURE_ENCRYPTION].fi_guid, + &feat_refcount) != 0) + return (B_FALSE); + + return (B_TRUE); +} + +static int +populate_create_encryption_params_nvlists(libzfs_handle_t *hdl, + zfs_handle_t *zhp, boolean_t newkey, zfs_keyformat_t keyformat, + char *keylocation, nvlist_t *props, uint8_t **wkeydata, uint_t *wkeylen) +{ + int ret; + uint64_t iters = 0, salt = 0; + uint8_t *key_material = NULL; + size_t key_material_len = 0; + uint8_t *key_data = NULL; + const char *fsname = (zhp) ? zfs_get_name(zhp) : NULL; + + /* get key material from keyformat and keylocation */ + ret = get_key_material(hdl, B_TRUE, newkey, keyformat, keylocation, + fsname, &key_material, &key_material_len, NULL); + if (ret != 0) + goto error; + + /* passphrase formats require a salt and pbkdf2 iters property */ + if (keyformat == ZFS_KEYFORMAT_PASSPHRASE) { + /* always generate a new salt */ + ret = pkcs11_get_urandom((uint8_t *)&salt, sizeof (uint64_t)); + if (ret != sizeof (uint64_t)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to generate salt.")); + goto error; + } + + ret = nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt); + if (ret != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to add salt to properties.")); + goto error; + } + + /* + * If not otherwise specified, use the default number of + * pbkdf2 iterations. If specified, we have already checked + * that the given value is greater than MIN_PBKDF2_ITERATIONS + * during zfs_valid_proplist(). 
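+		 * The salt and iteration count recorded here must match what
+		 * derive_key() later feeds to PBKDF2; conceptually:
+		 *
+		 *	wrapping key = PBKDF2-HMAC-SHA1(passphrase,
+		 *	    LE_64(salt), iters, WRAPPING_KEY_LEN)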
+ */ + ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); + if (ret == ENOENT) { + iters = DEFAULT_PBKDF2_ITERATIONS; + ret = nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters); + if (ret != 0) + goto error; + } else if (ret != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to get pbkdf2 iterations.")); + goto error; + } + } else { + /* check that pbkdf2iters was not specified by the user */ + ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); + if (ret == 0) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Cannot specify pbkdf2iters with a non-passphrase " + "keyformat.")); + goto error; + } + } + + /* derive a key from the key material */ + ret = derive_key(hdl, keyformat, iters, key_material, key_material_len, + salt, &key_data); + if (ret != 0) + goto error; + + free(key_material); + + *wkeydata = key_data; + *wkeylen = WRAPPING_KEY_LEN; + return (0); + +error: + if (key_material != NULL) + free(key_material); + if (key_data != NULL) + free(key_data); + + *wkeydata = NULL; + *wkeylen = 0; + return (ret); +} + +static boolean_t +proplist_has_encryption_props(nvlist_t *props) +{ + int ret; + uint64_t intval; + char *strval; + + ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &intval); + if (ret == 0 && intval != ZIO_CRYPT_OFF) + return (B_TRUE); + + ret = nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &strval); + if (ret == 0 && strcmp(strval, "none") != 0) + return (B_TRUE); + + ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &intval); + if (ret == 0) + return (B_TRUE); + + ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &intval); + if (ret == 0) + return (B_TRUE); + + return (B_FALSE); +} + +int +zfs_crypto_get_encryption_root(zfs_handle_t *zhp, boolean_t *is_encroot, + char *buf) +{ + int ret; + char prop_encroot[MAXNAMELEN]; + + /* if the dataset isn't encrypted, just return */ + if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) == ZIO_CRYPT_OFF) { + *is_encroot = B_FALSE; + if (buf != NULL) + buf[0] = '\0'; + return (0); + } + + ret = zfs_prop_get(zhp, ZFS_PROP_ENCRYPTION_ROOT, prop_encroot, + sizeof (prop_encroot), NULL, NULL, 0, B_TRUE); + if (ret != 0) { + *is_encroot = B_FALSE; + if (buf != NULL) + buf[0] = '\0'; + return (ret); + } + + *is_encroot = strcmp(prop_encroot, zfs_get_name(zhp)) == 0; + if (buf != NULL) + strcpy(buf, prop_encroot); + + return (0); +} + +int +zfs_crypto_create(libzfs_handle_t *hdl, char *parent_name, nvlist_t *props, + nvlist_t *pool_props, boolean_t stdin_available, uint8_t **wkeydata_out, + uint_t *wkeylen_out) +{ + int ret; + char errbuf[1024]; + uint64_t crypt = ZIO_CRYPT_INHERIT, pcrypt = ZIO_CRYPT_INHERIT; + uint64_t keyformat = ZFS_KEYFORMAT_NONE; + char *keylocation = NULL; + zfs_handle_t *pzhp = NULL; + uint8_t *wkeydata = NULL; + uint_t wkeylen = 0; + boolean_t local_crypt = B_TRUE; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Encryption create error")); + + /* lookup crypt from props */ + ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt); + if (ret != 0) + local_crypt = B_FALSE; + + /* lookup key location and format from props */ + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); + (void) nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); + + if (parent_name != NULL) { + /* get a reference to 
parent dataset */ + pzhp = make_dataset_handle(hdl, parent_name); + if (pzhp == NULL) { + ret = ENOENT; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Failed to lookup parent.")); + goto out; + } + + /* Lookup parent's crypt */ + pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION); + + /* Params require the encryption feature */ + if (!encryption_feature_is_enabled(pzhp->zpool_hdl)) { + if (proplist_has_encryption_props(props)) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Encryption feature not enabled.")); + goto out; + } + + ret = 0; + goto out; + } + } else { + /* + * special case for root dataset where encryption feature + * feature won't be on disk yet + */ + if (!nvlist_exists(pool_props, "feature@encryption")) { + if (proplist_has_encryption_props(props)) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Encryption feature not enabled.")); + goto out; + } + + ret = 0; + goto out; + } + + pcrypt = ZIO_CRYPT_OFF; + } + + /* Get the inherited encryption property if we don't have it locally */ + if (!local_crypt) + crypt = pcrypt; + + /* + * At this point crypt should be the actual encryption value. If + * encryption is off just verify that no encryption properties have + * been specified and return. + */ + if (crypt == ZIO_CRYPT_OFF) { + if (proplist_has_encryption_props(props)) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Encryption must be turned on to set encryption " + "properties.")); + goto out; + } + + ret = 0; + goto out; + } + + /* + * If we have a parent crypt it is valid to specify encryption alone. + * This will result in a child that is encrypted with the chosen + * encryption suite that will also inherit the parent's key. If + * the parent is not encrypted we need an encryption suite provided. + */ + if (pcrypt == ZIO_CRYPT_OFF && keylocation == NULL && + keyformat == ZFS_KEYFORMAT_NONE) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Keyformat required for new encryption root.")); + goto out; + } + + /* + * Specifying a keylocation implies this will be a new encryption root. + * Check that a keyformat is also specified. + */ + if (keylocation != NULL && keyformat == ZFS_KEYFORMAT_NONE) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Keyformat required for new encryption root.")); + goto out; + } + + /* default to prompt if no keylocation is specified */ + if (keyformat != ZFS_KEYFORMAT_NONE && keylocation == NULL) { + keylocation = "prompt"; + ret = nvlist_add_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), keylocation); + if (ret != 0) + goto out; + } + + /* + * If a local key is provided, this dataset will be a new + * encryption root. Populate the encryption params. + */ + if (keylocation != NULL) { + /* + * 'zfs recv -o keylocation=prompt' won't work because stdin + * is being used by the send stream, so we disallow it. 
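+		 * A file-based location, e.g. keylocation=file:///path/to/key
+		 * (an illustrative path), still works in that situation.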
+ */ + if (!stdin_available && strcmp(keylocation, "prompt") == 0) { + ret = EINVAL; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Cannot use " + "'prompt' keylocation because stdin is in use.")); + goto out; + } + + ret = populate_create_encryption_params_nvlists(hdl, NULL, + B_FALSE, keyformat, keylocation, props, &wkeydata, + &wkeylen); + if (ret != 0) + goto out; + } + + if (pzhp != NULL) + zfs_close(pzhp); + + *wkeydata_out = wkeydata; + *wkeylen_out = wkeylen; + return (0); + +out: + if (pzhp != NULL) + zfs_close(pzhp); + if (wkeydata != NULL) + free(wkeydata); + + *wkeydata_out = NULL; + *wkeylen_out = 0; + return (ret); +} + +int +zfs_crypto_clone_check(libzfs_handle_t *hdl, zfs_handle_t *origin_zhp, + char *parent_name, nvlist_t *props) +{ + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Encryption clone error")); + + /* + * No encryption properties should be specified. They will all be + * inherited from the origin dataset. + */ + if (nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT)) || + nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_KEYLOCATION)) || + nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)) || + nvlist_exists(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS))) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Encryption properties must inherit from origin dataset.")); + return (EINVAL); + } + + return (0); +} + +typedef struct loadkeys_cbdata { + uint64_t cb_numfailed; + uint64_t cb_numattempted; +} loadkey_cbdata_t; + +static int +load_keys_cb(zfs_handle_t *zhp, void *arg) +{ + int ret; + boolean_t is_encroot; + loadkey_cbdata_t *cb = arg; + uint64_t keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + + /* only attempt to load keys for encryption roots */ + ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); + if (ret != 0 || !is_encroot) + goto out; + + /* don't attempt to load already loaded keys */ + if (keystatus == ZFS_KEYSTATUS_AVAILABLE) + goto out; + + /* Attempt to load the key. Record status in cb. */ + cb->cb_numattempted++; + + ret = zfs_crypto_load_key(zhp, B_FALSE, NULL); + if (ret) + cb->cb_numfailed++; + +out: + (void) zfs_iter_filesystems(zhp, load_keys_cb, cb); + zfs_close(zhp); + + /* always return 0, since this function is best effort */ + return (0); +} + +/* + * This function is best effort. It attempts to load all the keys for the given + * filesystem and all of its children. 
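+ * A caller such as 'zpool import -l' can use it to load every key under an
+ * imported pool in one pass.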
+ */ +int +zfs_crypto_attempt_load_keys(libzfs_handle_t *hdl, char *fsname) +{ + int ret; + zfs_handle_t *zhp = NULL; + loadkey_cbdata_t cb = { 0 }; + + zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ret = ENOENT; + goto error; + } + + ret = load_keys_cb(zfs_handle_dup(zhp), &cb); + if (ret) + goto error; + + (void) printf(gettext("%llu / %llu keys successfully loaded\n"), + (u_longlong_t)(cb.cb_numattempted - cb.cb_numfailed), + (u_longlong_t)cb.cb_numattempted); + + if (cb.cb_numfailed != 0) { + ret = -1; + goto error; + } + + zfs_close(zhp); + return (0); + +error: + if (zhp != NULL) + zfs_close(zhp); + return (ret); +} + +int +zfs_crypto_load_key(zfs_handle_t *zhp, boolean_t noop, char *alt_keylocation) +{ + int ret, attempts = 0; + char errbuf[1024]; + uint64_t keystatus, iters = 0, salt = 0; + uint64_t keyformat = ZFS_KEYFORMAT_NONE; + char prop_keylocation[MAXNAMELEN]; + char prop_encroot[MAXNAMELEN]; + char *keylocation = NULL; + uint8_t *key_material = NULL, *key_data = NULL; + size_t key_material_len; + boolean_t is_encroot, can_retry = B_FALSE, correctible = B_FALSE; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Key load error")); + + /* check that encryption is enabled for the pool */ + if (!encryption_feature_is_enabled(zhp->zpool_hdl)) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Encryption feature not enabled.")); + ret = EINVAL; + goto error; + } + + /* Fetch the keyformat. Check that the dataset is encrypted. */ + keyformat = zfs_prop_get_int(zhp, ZFS_PROP_KEYFORMAT); + if (keyformat == ZFS_KEYFORMAT_NONE) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "'%s' is not encrypted."), zfs_get_name(zhp)); + ret = EINVAL; + goto error; + } + + /* + * Fetch the key location. Check that we are working with an + * encryption root. + */ + ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, prop_encroot); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Failed to get encryption root for '%s'."), + zfs_get_name(zhp)); + goto error; + } else if (!is_encroot) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Keys must be loaded for encryption root of '%s' (%s)."), + zfs_get_name(zhp), prop_encroot); + ret = EINVAL; + goto error; + } + + /* + * if the caller has elected to override the keylocation property + * use that instead + */ + if (alt_keylocation != NULL) { + keylocation = alt_keylocation; + } else { + ret = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, prop_keylocation, + sizeof (prop_keylocation), NULL, NULL, 0, B_TRUE); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Failed to get keylocation for '%s'."), + zfs_get_name(zhp)); + goto error; + } + + keylocation = prop_keylocation; + } + + /* check that the key is unloaded unless this is a noop */ + if (!noop) { + keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + if (keystatus == ZFS_KEYSTATUS_AVAILABLE) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key already loaded for '%s'."), zfs_get_name(zhp)); + ret = EEXIST; + goto error; + } + } + + /* passphrase formats require a salt and pbkdf2_iters property */ + if (keyformat == ZFS_KEYFORMAT_PASSPHRASE) { + salt = zfs_prop_get_int(zhp, ZFS_PROP_PBKDF2_SALT); + iters = zfs_prop_get_int(zhp, ZFS_PROP_PBKDF2_ITERS); + } + +try_again: + /* fetching and deriving the key are correctable errors. 
set the flag */ + correctible = B_TRUE; + + /* get key material from key format and location */ + ret = get_key_material(zhp->zfs_hdl, B_FALSE, B_FALSE, keyformat, + keylocation, zfs_get_name(zhp), &key_material, &key_material_len, + &can_retry); + if (ret != 0) + goto error; + + /* derive a key from the key material */ + ret = derive_key(zhp->zfs_hdl, keyformat, iters, key_material, + key_material_len, salt, &key_data); + if (ret != 0) + goto error; + + correctible = B_FALSE; + + /* pass the wrapping key and noop flag to the ioctl */ + ret = lzc_load_key(zhp->zfs_name, noop, key_data, WRAPPING_KEY_LEN); + if (ret != 0) { + switch (ret) { + case EPERM: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Permission denied.")); + break; + case EINVAL: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Invalid parameters provided for dataset %s."), + zfs_get_name(zhp)); + break; + case EEXIST: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key already loaded for '%s'."), zfs_get_name(zhp)); + break; + case EBUSY: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "'%s' is busy."), zfs_get_name(zhp)); + break; + case EACCES: + correctible = B_TRUE; + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Incorrect key provided for '%s'."), + zfs_get_name(zhp)); + break; + } + goto error; + } + + free(key_material); + free(key_data); + + return (0); + +error: + zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); + if (key_material != NULL) { + free(key_material); + key_material = NULL; + } + if (key_data != NULL) { + free(key_data); + key_data = NULL; + } + + /* + * Here we decide if it is ok to allow the user to retry entering their + * key. The can_retry flag will be set if the user is entering their + * key from an interactive prompt. The correctable flag will only be + * set if an error that occurred could be corrected by retrying. Both + * flags are needed to allow the user to attempt key entry again + */ + attempts++; + if (can_retry && correctible && attempts < MAX_KEY_PROMPT_ATTEMPTS) + goto try_again; + + return (ret); +} + +int +zfs_crypto_unload_key(zfs_handle_t *zhp) +{ + int ret; + char errbuf[1024]; + char prop_encroot[MAXNAMELEN]; + uint64_t keystatus, keyformat; + boolean_t is_encroot; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Key unload error")); + + /* check that encryption is enabled for the pool */ + if (!encryption_feature_is_enabled(zhp->zpool_hdl)) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Encryption feature not enabled.")); + ret = EINVAL; + goto error; + } + + /* Fetch the keyformat. Check that the dataset is encrypted. */ + keyformat = zfs_prop_get_int(zhp, ZFS_PROP_KEYFORMAT); + if (keyformat == ZFS_KEYFORMAT_NONE) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "'%s' is not encrypted."), zfs_get_name(zhp)); + ret = EINVAL; + goto error; + } + + /* + * Fetch the key location. Check that we are working with an + * encryption root. 
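+	 * As with loading, unloading is only meaningful at the encryption
+	 * root; requesting it on a descendant that inherits its key is
+	 * reported as an error below rather than unloading the shared key.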
+ */ + ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, prop_encroot); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Failed to get encryption root for '%s'."), + zfs_get_name(zhp)); + goto error; + } else if (!is_encroot) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Keys must be unloaded for encryption root of '%s' (%s)."), + zfs_get_name(zhp), prop_encroot); + ret = EINVAL; + goto error; + } + + /* check that the key is loaded */ + keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key already unloaded for '%s'."), zfs_get_name(zhp)); + ret = EACCES; + goto error; + } + + /* call the ioctl */ + ret = lzc_unload_key(zhp->zfs_name); + + if (ret != 0) { + switch (ret) { + case EPERM: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Permission denied.")); + break; + case EACCES: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key already unloaded for '%s'."), + zfs_get_name(zhp)); + break; + case EBUSY: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "'%s' is busy."), zfs_get_name(zhp)); + break; + } + zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); + } + + return (ret); + +error: + zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); + return (ret); +} + +static int +zfs_crypto_verify_rewrap_nvlist(zfs_handle_t *zhp, nvlist_t *props, + nvlist_t **props_out, char *errbuf) +{ + int ret; + nvpair_t *elem = NULL; + zfs_prop_t prop; + nvlist_t *new_props = NULL; + + new_props = fnvlist_alloc(); + + /* + * loop through all provided properties, we should only have + * keyformat, keylocation and pbkdf2iters. The actual validation of + * values is done by zfs_valid_proplist(). + */ + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + const char *propname = nvpair_name(elem); + prop = zfs_name_to_prop(propname); + + switch (prop) { + case ZFS_PROP_PBKDF2_ITERS: + case ZFS_PROP_KEYFORMAT: + case ZFS_PROP_KEYLOCATION: + break; + default: + ret = EINVAL; + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Only keyformat, keylocation and pbkdf2iters may " + "be set with this command.")); + goto error; + } + } + + new_props = zfs_valid_proplist(zhp->zfs_hdl, zhp->zfs_type, props, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), NULL, zhp->zpool_hdl, + B_TRUE, errbuf); + if (new_props == NULL) { + ret = EINVAL; + goto error; + } + + *props_out = new_props; + return (0); + +error: + nvlist_free(new_props); + *props_out = NULL; + return (ret); +} + +int +zfs_crypto_rewrap(zfs_handle_t *zhp, nvlist_t *raw_props, boolean_t inheritkey) +{ + int ret; + char errbuf[1024]; + boolean_t is_encroot; + nvlist_t *props = NULL; + uint8_t *wkeydata = NULL; + uint_t wkeylen = 0; + dcp_cmd_t cmd = (inheritkey) ? 
DCP_CMD_INHERIT : DCP_CMD_NEW_KEY; + uint64_t crypt, pcrypt, keystatus, pkeystatus; + uint64_t keyformat = ZFS_KEYFORMAT_NONE; + zfs_handle_t *pzhp = NULL; + char *keylocation = NULL; + char origin_name[MAXNAMELEN]; + char prop_keylocation[MAXNAMELEN]; + char parent_name[ZFS_MAX_DATASET_NAME_LEN]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Key change error")); + + /* check that encryption is enabled for the pool */ + if (!encryption_feature_is_enabled(zhp->zpool_hdl)) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Encryption feature not enabled.")); + ret = EINVAL; + goto error; + } + + /* get crypt from dataset */ + crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); + if (crypt == ZIO_CRYPT_OFF) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Dataset not encrypted.")); + ret = EINVAL; + goto error; + } + + /* get the encryption root of the dataset */ + ret = zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Failed to get encryption root for '%s'."), + zfs_get_name(zhp)); + goto error; + } + + /* Clones use their origin's key and cannot rewrap it */ + ret = zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin_name, + sizeof (origin_name), NULL, NULL, 0, B_TRUE); + if (ret == 0 && strcmp(origin_name, "") != 0) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Keys cannot be changed on clones.")); + ret = EINVAL; + goto error; + } + + /* + * If the user wants to use the inheritkey variant of this function + * we don't need to collect any crypto arguments. + */ + if (!inheritkey) { + /* validate the provided properties */ + ret = zfs_crypto_verify_rewrap_nvlist(zhp, raw_props, &props, + errbuf); + if (ret != 0) + goto error; + + /* + * Load keyformat and keylocation from the nvlist. Fetch from + * the dataset properties if not specified. + */ + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); + (void) nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); + + if (is_encroot) { + /* + * If this is already an encryption root, just keep + * any properties not set by the user. 
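+			 * For example, a key change that supplies only
+			 * a new keyformat keeps the dataset's current
+			 * keylocation.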
+ */ + if (keyformat == ZFS_KEYFORMAT_NONE) { + keyformat = zfs_prop_get_int(zhp, + ZFS_PROP_KEYFORMAT); + ret = nvlist_add_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + keyformat); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, "Failed to " + "get existing keyformat " + "property.")); + goto error; + } + } + + if (keylocation == NULL) { + ret = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, + prop_keylocation, sizeof (prop_keylocation), + NULL, NULL, 0, B_TRUE); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, "Failed to " + "get existing keylocation " + "property.")); + goto error; + } + + keylocation = prop_keylocation; + } + } else { + /* need a new key for non-encryption roots */ + if (keyformat == ZFS_KEYFORMAT_NONE) { + ret = EINVAL; + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, "Keyformat required " + "for new encryption root.")); + goto error; + } + + /* default to prompt if no keylocation is specified */ + if (keylocation == NULL) { + keylocation = "prompt"; + ret = nvlist_add_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + keylocation); + if (ret != 0) + goto error; + } + } + + /* fetch the new wrapping key and associated properties */ + ret = populate_create_encryption_params_nvlists(zhp->zfs_hdl, + zhp, B_TRUE, keyformat, keylocation, props, &wkeydata, + &wkeylen); + if (ret != 0) + goto error; + } else { + /* check that zhp is an encryption root */ + if (!is_encroot) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key inheritting can only be performed on " + "encryption roots.")); + ret = EINVAL; + goto error; + } + + /* get the parent's name */ + ret = zfs_parent_name(zhp, parent_name, sizeof (parent_name)); + if (ret != 0) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Root dataset cannot inherit key.")); + ret = EINVAL; + goto error; + } + + /* get a handle to the parent */ + pzhp = make_dataset_handle(zhp->zfs_hdl, parent_name); + if (pzhp == NULL) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Failed to lookup parent.")); + ret = ENOENT; + goto error; + } + + /* parent must be encrypted */ + pcrypt = zfs_prop_get_int(pzhp, ZFS_PROP_ENCRYPTION); + if (pcrypt == ZIO_CRYPT_OFF) { + zfs_error_aux(pzhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Parent must be encrypted.")); + ret = EINVAL; + goto error; + } + + /* check that the parent's key is loaded */ + pkeystatus = zfs_prop_get_int(pzhp, ZFS_PROP_KEYSTATUS); + if (pkeystatus == ZFS_KEYSTATUS_UNAVAILABLE) { + zfs_error_aux(pzhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Parent key must be loaded.")); + ret = EACCES; + goto error; + } + } + + /* check that the key is loaded */ + keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key must be loaded.")); + ret = EACCES; + goto error; + } + + /* call the ioctl */ + ret = lzc_change_key(zhp->zfs_name, cmd, props, wkeydata, wkeylen); + if (ret != 0) { + switch (ret) { + case EPERM: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Permission denied.")); + break; + case EINVAL: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Invalid properties for key change.")); + break; + case EACCES: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "Key is not currently loaded.")); + break; + } + zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); + } + + if (pzhp != NULL) + zfs_close(pzhp); + if (props != NULL) + nvlist_free(props); + if (wkeydata != NULL) + free(wkeydata); + + return 
(ret); + +error: + if (pzhp != NULL) + zfs_close(pzhp); + if (props != NULL) + nvlist_free(props); + if (wkeydata != NULL) + free(wkeydata); + + zfs_error(zhp->zfs_hdl, EZFS_CRYPTOFAILED, errbuf); + return (ret); +} diff --git a/lib/libzfs/libzfs_dataset.c b/lib/libzfs/libzfs_dataset.c new file mode 100644 index 000000000000..d548cc0ecabc --- /dev/null +++ b/lib/libzfs/libzfs_dataset.c @@ -0,0 +1,5492 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2019 Joyent, Inc. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012 DEY Storage Systems, Inc. All rights reserved. + * Copyright (c) 2012 Pawel Jakub Dawidek . + * Copyright (c) 2013 Martin Matuska. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright 2016 Igor Kozhukhov + * Copyright 2017-2018 RackTop Systems. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_IDMAP +#include +#include +#include +#endif /* HAVE_IDMAP */ + +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "libzfs.h" +#include "zfs_deleg.h" + +static int userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); + +/* + * Given a single type (not a mask of types), return the type in a human + * readable form. + */ +const char * +zfs_type_to_name(zfs_type_t type) +{ + switch (type) { + case ZFS_TYPE_FILESYSTEM: + return (dgettext(TEXT_DOMAIN, "filesystem")); + case ZFS_TYPE_SNAPSHOT: + return (dgettext(TEXT_DOMAIN, "snapshot")); + case ZFS_TYPE_VOLUME: + return (dgettext(TEXT_DOMAIN, "volume")); + case ZFS_TYPE_POOL: + return (dgettext(TEXT_DOMAIN, "pool")); + case ZFS_TYPE_BOOKMARK: + return (dgettext(TEXT_DOMAIN, "bookmark")); + default: + assert(!"unhandled zfs_type_t"); + } + + return (NULL); +} + +/* + * Validate a ZFS path. This is used even before trying to open the dataset, to + * provide a more meaningful error message. We call zfs_error_aux() to + * explain exactly why the name was not valid. 
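+ *
+ * A minimal calling sketch, mirroring what zfs_open() below does
+ * (the dataset name is hypothetical):
+ *
+ *	if (!zfs_validate_name(hdl, "tank/data@today",
+ *	    ZFS_TYPE_SNAPSHOT, B_FALSE))
+ *		return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf));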
+ */ +int +zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, + boolean_t modifying) +{ + namecheck_err_t why; + char what; + + if (!(type & ZFS_TYPE_SNAPSHOT) && strchr(path, '@') != NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshot delimiter '@' is not expected here")); + return (0); + } + + if (type == ZFS_TYPE_SNAPSHOT && strchr(path, '@') == NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing '@' delimiter in snapshot name")); + return (0); + } + + if (!(type & ZFS_TYPE_BOOKMARK) && strchr(path, '#') != NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bookmark delimiter '#' is not expected here")); + return (0); + } + + if (type == ZFS_TYPE_BOOKMARK && strchr(path, '#') == NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing '#' delimiter in bookmark name")); + return (0); + } + + if (modifying && strchr(path, '%') != NULL) { + if (hdl != NULL) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid character %c in name"), '%'); + return (0); + } + + if (entity_namecheck(path, &why, &what) != 0) { + if (hdl != NULL) { + switch (why) { + case NAME_ERR_TOOLONG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name is too long")); + break; + + case NAME_ERR_LEADING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "leading slash in name")); + break; + + case NAME_ERR_EMPTY_COMPONENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "empty component or misplaced '@'" + " or '#' delimiter in name")); + break; + + case NAME_ERR_TRAILING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "trailing slash in name")); + break; + + case NAME_ERR_INVALCHAR: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "invalid character " + "'%c' in name"), what); + break; + + case NAME_ERR_MULTIPLE_DELIMITERS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple '@' and/or '#' delimiters in " + "name")); + break; + + case NAME_ERR_NOLETTER: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool doesn't begin with a letter")); + break; + + case NAME_ERR_RESERVED: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name is reserved")); + break; + + case NAME_ERR_DISKLIKE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "reserved disk name")); + break; + + case NAME_ERR_SELF_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "self reference, '.' is found in name")); + break; + + case NAME_ERR_PARENT_REF: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent reference, '..' is found in name")); + break; + + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "(%d) not defined"), why); + break; + } + } + + return (0); + } + + return (-1); +} + +int +zfs_name_valid(const char *name, zfs_type_t type) +{ + if (type == ZFS_TYPE_POOL) + return (zpool_name_valid(NULL, B_FALSE, name)); + return (zfs_validate_name(NULL, name, type, B_FALSE)); +} + +/* + * This function takes the raw DSL properties, and filters out the user-defined + * properties into a separate nvlist. 
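+ *
+ * User properties are the ones whose names contain a ':' (what
+ * zfs_prop_user() matches). E.g. from raw props holding both
+ * "atime" and a hypothetical "com.example:backup", only the
+ * latter ends up in the returned nvlist.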
+ */ +static nvlist_t * +process_user_props(zfs_handle_t *zhp, nvlist_t *props) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvpair_t *elem; + nvlist_t *propval; + nvlist_t *nvl; + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + if (!zfs_prop_user(nvpair_name(elem))) + continue; + + verify(nvpair_value_nvlist(elem, &propval) == 0); + if (nvlist_add_nvlist(nvl, nvpair_name(elem), propval) != 0) { + nvlist_free(nvl); + (void) no_memory(hdl); + return (NULL); + } + } + + return (nvl); +} + +static zpool_handle_t * +zpool_add_handle(zfs_handle_t *zhp, const char *pool_name) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zpool_handle_t *zph; + + if ((zph = zpool_open_canfail(hdl, pool_name)) != NULL) { + if (hdl->libzfs_pool_handles != NULL) + zph->zpool_next = hdl->libzfs_pool_handles; + hdl->libzfs_pool_handles = zph; + } + return (zph); +} + +static zpool_handle_t * +zpool_find_handle(zfs_handle_t *zhp, const char *pool_name, int len) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zpool_handle_t *zph = hdl->libzfs_pool_handles; + + while ((zph != NULL) && + (strncmp(pool_name, zpool_get_name(zph), len) != 0)) + zph = zph->zpool_next; + return (zph); +} + +/* + * Returns a handle to the pool that contains the provided dataset. + * If a handle to that pool already exists then that handle is returned. + * Otherwise, a new handle is created and added to the list of handles. + */ +static zpool_handle_t * +zpool_handle(zfs_handle_t *zhp) +{ + char *pool_name; + int len; + zpool_handle_t *zph; + + len = strcspn(zhp->zfs_name, "/@#") + 1; + pool_name = zfs_alloc(zhp->zfs_hdl, len); + (void) strlcpy(pool_name, zhp->zfs_name, len); + + zph = zpool_find_handle(zhp, pool_name, len); + if (zph == NULL) + zph = zpool_add_handle(zhp, pool_name); + + free(pool_name); + return (zph); +} + +void +zpool_free_handles(libzfs_handle_t *hdl) +{ + zpool_handle_t *next, *zph = hdl->libzfs_pool_handles; + + while (zph != NULL) { + next = zph->zpool_next; + zpool_close(zph); + zph = next; + } + hdl->libzfs_pool_handles = NULL; +} + +/* + * Utility function to gather stats (objset and zpl) for the given object. + */ +static int +get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + + while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, zc) != 0) { + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { + return (-1); + } + } else { + return (-1); + } + } + return (0); +} + +/* + * Utility function to get the received properties of the given object. 
+ */ +static int +get_recvd_props_ioctl(zfs_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *recvdprops; + zfs_cmd_t zc = {"\0"}; + int err; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) + return (-1); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + while (zfs_ioctl(hdl, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + return (-1); + } + } else { + zcmd_free_nvlists(&zc); + return (-1); + } + } + + err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); + zcmd_free_nvlists(&zc); + if (err != 0) + return (-1); + + nvlist_free(zhp->zfs_recvd_props); + zhp->zfs_recvd_props = recvdprops; + + return (0); +} + +static int +put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) +{ + nvlist_t *allprops, *userprops; + + zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ + + if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { + return (-1); + } + + /* + * XXX Why do we store the user props separately, in addition to + * storing them in zfs_props? + */ + if ((userprops = process_user_props(zhp, allprops)) == NULL) { + nvlist_free(allprops); + return (-1); + } + + nvlist_free(zhp->zfs_props); + nvlist_free(zhp->zfs_user_props); + + zhp->zfs_props = allprops; + zhp->zfs_user_props = userprops; + + return (0); +} + +static int +get_stats(zfs_handle_t *zhp) +{ + int rc = 0; + zfs_cmd_t zc = {"\0"}; + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + if (get_stats_ioctl(zhp, &zc) != 0) + rc = -1; + else if (put_stats_zhdl(zhp, &zc) != 0) + rc = -1; + zcmd_free_nvlists(&zc); + return (rc); +} + +/* + * Refresh the properties currently stored in the handle. + */ +void +zfs_refresh_properties(zfs_handle_t *zhp) +{ + (void) get_stats(zhp); +} + +/* + * Makes a handle from the given dataset name. Used by zfs_open() and + * zfs_iter_* to create child handles on the fly. + */ +static int +make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) +{ + if (put_stats_zhdl(zhp, zc) != 0) + return (-1); + + /* + * We've managed to open the dataset and gather statistics. Determine + * the high-level type. 
+ */ + if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) + zhp->zfs_head_type = ZFS_TYPE_VOLUME; + else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) + zhp->zfs_head_type = ZFS_TYPE_FILESYSTEM; + else if (zhp->zfs_dmustats.dds_type == DMU_OST_OTHER) + return (-1); + else + abort(); + + if (zhp->zfs_dmustats.dds_is_snapshot) + zhp->zfs_type = ZFS_TYPE_SNAPSHOT; + else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) + zhp->zfs_type = ZFS_TYPE_VOLUME; + else if (zhp->zfs_dmustats.dds_type == DMU_OST_ZFS) + zhp->zfs_type = ZFS_TYPE_FILESYSTEM; + else + abort(); /* we should never see any other types */ + + if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) + return (-1); + + return (0); +} + +zfs_handle_t * +make_dataset_handle(libzfs_handle_t *hdl, const char *path) +{ + zfs_cmd_t zc = {"\0"}; + + zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = hdl; + (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { + free(zhp); + return (NULL); + } + if (get_stats_ioctl(zhp, &zc) == -1) { + zcmd_free_nvlists(&zc); + free(zhp); + return (NULL); + } + if (make_dataset_handle_common(zhp, &zc) == -1) { + free(zhp); + zhp = NULL; + } + zcmd_free_nvlists(&zc); + return (zhp); +} + +zfs_handle_t * +make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) +{ + zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = hdl; + (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); + if (make_dataset_handle_common(zhp, zc) == -1) { + free(zhp); + return (NULL); + } + return (zhp); +} + +zfs_handle_t * +make_dataset_simple_handle_zc(zfs_handle_t *pzhp, zfs_cmd_t *zc) +{ + zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = pzhp->zfs_hdl; + (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); + zhp->zfs_head_type = pzhp->zfs_type; + zhp->zfs_type = ZFS_TYPE_SNAPSHOT; + zhp->zpool_hdl = zpool_handle(zhp); + + return (zhp); +} + +zfs_handle_t * +zfs_handle_dup(zfs_handle_t *zhp_orig) +{ + zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = zhp_orig->zfs_hdl; + zhp->zpool_hdl = zhp_orig->zpool_hdl; + (void) strlcpy(zhp->zfs_name, zhp_orig->zfs_name, + sizeof (zhp->zfs_name)); + zhp->zfs_type = zhp_orig->zfs_type; + zhp->zfs_head_type = zhp_orig->zfs_head_type; + zhp->zfs_dmustats = zhp_orig->zfs_dmustats; + if (zhp_orig->zfs_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_props, &zhp->zfs_props, 0) != 0) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + if (zhp_orig->zfs_user_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_user_props, + &zhp->zfs_user_props, 0) != 0) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + if (zhp_orig->zfs_recvd_props != NULL) { + if (nvlist_dup(zhp_orig->zfs_recvd_props, + &zhp->zfs_recvd_props, 0)) { + (void) no_memory(zhp->zfs_hdl); + zfs_close(zhp); + return (NULL); + } + } + zhp->zfs_mntcheck = zhp_orig->zfs_mntcheck; + if (zhp_orig->zfs_mntopts != NULL) { + zhp->zfs_mntopts = zfs_strdup(zhp_orig->zfs_hdl, + zhp_orig->zfs_mntopts); + } + zhp->zfs_props_table = zhp_orig->zfs_props_table; + return (zhp); +} + +boolean_t +zfs_bookmark_exists(const char *path) +{ + nvlist_t *bmarks; + nvlist_t *props; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *bmark_name; + char *pound; + int err; + boolean_t rv; + + (void) 
strlcpy(fsname, path, sizeof (fsname)); + pound = strchr(fsname, '#'); + if (pound == NULL) + return (B_FALSE); + + *pound = '\0'; + bmark_name = pound + 1; + props = fnvlist_alloc(); + err = lzc_get_bookmarks(fsname, props, &bmarks); + nvlist_free(props); + if (err != 0) { + nvlist_free(bmarks); + return (B_FALSE); + } + + rv = nvlist_exists(bmarks, bmark_name); + nvlist_free(bmarks); + return (rv); +} + +zfs_handle_t * +make_bookmark_handle(zfs_handle_t *parent, const char *path, + nvlist_t *bmark_props) +{ + zfs_handle_t *zhp = calloc(1, sizeof (zfs_handle_t)); + + if (zhp == NULL) + return (NULL); + + /* Fill in the name. */ + zhp->zfs_hdl = parent->zfs_hdl; + (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); + + /* Set the property lists. */ + if (nvlist_dup(bmark_props, &zhp->zfs_props, 0) != 0) { + free(zhp); + return (NULL); + } + + /* Set the types. */ + zhp->zfs_head_type = parent->zfs_head_type; + zhp->zfs_type = ZFS_TYPE_BOOKMARK; + + if ((zhp->zpool_hdl = zpool_handle(zhp)) == NULL) { + nvlist_free(zhp->zfs_props); + free(zhp); + return (NULL); + } + + return (zhp); +} + +struct zfs_open_bookmarks_cb_data { + const char *path; + zfs_handle_t *zhp; +}; + +static int +zfs_open_bookmarks_cb(zfs_handle_t *zhp, void *data) +{ + struct zfs_open_bookmarks_cb_data *dp = data; + + /* + * Is it the one we are looking for? + */ + if (strcmp(dp->path, zfs_get_name(zhp)) == 0) { + /* + * We found it. Save it and let the caller know we are done. + */ + dp->zhp = zhp; + return (EEXIST); + } + + /* + * Not found. Close the handle and ask for another one. + */ + zfs_close(zhp); + return (0); +} + +/* + * Opens the given snapshot, bookmark, filesystem, or volume. The 'types' + * argument is a mask of acceptable types. The function will print an + * appropriate error message and return NULL if it can't be opened. + */ +zfs_handle_t * +zfs_open(libzfs_handle_t *hdl, const char *path, int types) +{ + zfs_handle_t *zhp; + char errbuf[1024]; + char *bookp; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); + + /* + * Validate the name before we even try to open it. + */ + if (!zfs_validate_name(hdl, path, types, B_FALSE)) { + (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + return (NULL); + } + + /* + * Bookmarks need to be handled separately. + */ + bookp = strchr(path, '#'); + if (bookp == NULL) { + /* + * Try to get stats for the dataset, which will tell us if it + * exists. + */ + errno = 0; + if ((zhp = make_dataset_handle(hdl, path)) == NULL) { + (void) zfs_standard_error(hdl, errno, errbuf); + return (NULL); + } + } else { + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + zfs_handle_t *pzhp; + struct zfs_open_bookmarks_cb_data cb_data = {path, NULL}; + + /* + * We need to cut out '#' and everything after '#' + * to get the parent dataset name only. + */ + assert(bookp - path < sizeof (dsname)); + (void) strncpy(dsname, path, bookp - path); + dsname[bookp - path] = '\0'; + + /* + * Create handle for the parent dataset. + */ + errno = 0; + if ((pzhp = make_dataset_handle(hdl, dsname)) == NULL) { + (void) zfs_standard_error(hdl, errno, errbuf); + return (NULL); + } + + /* + * Iterate bookmarks to find the right one.
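+ * The callback below signals a match by returning EEXIST, which
+ * stops the iteration with the handle saved in cb_data; e.g. a
+ * caller doing zfs_open(hdl, "tank/fs#monday", ZFS_TYPE_BOOKMARK)
+ * (a hypothetical name) gets its handle from exactly this loop.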
+ */ + errno = 0; + if ((zfs_iter_bookmarks(pzhp, zfs_open_bookmarks_cb, + &cb_data) == 0) && (cb_data.zhp == NULL)) { + (void) zfs_error(hdl, EZFS_NOENT, errbuf); + zfs_close(pzhp); + return (NULL); + } + if (cb_data.zhp == NULL) { + (void) zfs_standard_error(hdl, errno, errbuf); + zfs_close(pzhp); + return (NULL); + } + zhp = cb_data.zhp; + + /* + * Cleanup. + */ + zfs_close(pzhp); + } + + if (!(types & zhp->zfs_type)) { + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + zfs_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Release a ZFS handle. Nothing to do but free the associated memory. + */ +void +zfs_close(zfs_handle_t *zhp) +{ + if (zhp->zfs_mntopts) + free(zhp->zfs_mntopts); + nvlist_free(zhp->zfs_props); + nvlist_free(zhp->zfs_user_props); + nvlist_free(zhp->zfs_recvd_props); + free(zhp); +} + +typedef struct mnttab_node { + struct mnttab mtn_mt; + avl_node_t mtn_node; +} mnttab_node_t; + +static int +libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) +{ + const mnttab_node_t *mtn1 = (const mnttab_node_t *)arg1; + const mnttab_node_t *mtn2 = (const mnttab_node_t *)arg2; + int rv; + + rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); + + return (TREE_ISIGN(rv)); +} + +void +libzfs_mnttab_init(libzfs_handle_t *hdl) +{ + pthread_mutex_init(&hdl->libzfs_mnttab_cache_lock, NULL); + assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); + avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, + sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); +} + +static int +libzfs_mnttab_update(libzfs_handle_t *hdl) +{ + struct mnttab entry; + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + return (ENOENT); + + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + mnttab_node_t *mtn; + avl_index_t where; + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); + + /* Exclude duplicate mounts */ + if (avl_find(&hdl->libzfs_mnttab_cache, mtn, &where) != NULL) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); + free(mtn); + continue; + } + + avl_add(&hdl->libzfs_mnttab_cache, mtn); + } + + return (0); +} + +void +libzfs_mnttab_fini(libzfs_handle_t *hdl) +{ + void *cookie = NULL; + mnttab_node_t *mtn; + + while ((mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) + != NULL) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); + free(mtn); + } + avl_destroy(&hdl->libzfs_mnttab_cache); + (void) pthread_mutex_destroy(&hdl->libzfs_mnttab_cache_lock); +} + +void +libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) +{ + hdl->libzfs_mnttab_enable = enable; +} + +int +libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, + struct mnttab *entry) +{ + mnttab_node_t find; + mnttab_node_t *mtn; + int ret = ENOENT; + + if (!hdl->libzfs_mnttab_enable) { + struct mnttab srch = { 0 }; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache)) + libzfs_mnttab_fini(hdl); + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + return 
(ENOENT); + + srch.mnt_special = (char *)fsname; + srch.mnt_fstype = MNTTYPE_ZFS; + if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) + return (0); + else + return (ENOENT); + } + + pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) { + int error; + + if ((error = libzfs_mnttab_update(hdl)) != 0) { + pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); + return (error); + } + } + + find.mtn_mt.mnt_special = (char *)fsname; + mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); + if (mtn) { + *entry = mtn->mtn_mt; + ret = 0; + } + pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); + return (ret); +} + +void +libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, + const char *mountp, const char *mntopts) +{ + mnttab_node_t *mtn; + + pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); + if (avl_numnodes(&hdl->libzfs_mnttab_cache) != 0) { + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); + /* + * Another thread may have already added this entry + * via libzfs_mnttab_update. If so we should skip it. + */ + if (avl_find(&hdl->libzfs_mnttab_cache, mtn, NULL) != NULL) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); + free(mtn); + } else { + avl_add(&hdl->libzfs_mnttab_cache, mtn); + } + } + pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); +} + +void +libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) +{ + mnttab_node_t find; + mnttab_node_t *ret; + + pthread_mutex_lock(&hdl->libzfs_mnttab_cache_lock); + find.mtn_mt.mnt_special = (char *)fsname; + if ((ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) + != NULL) { + avl_remove(&hdl->libzfs_mnttab_cache, ret); + free(ret->mtn_mt.mnt_special); + free(ret->mtn_mt.mnt_mountp); + free(ret->mtn_mt.mnt_fstype); + free(ret->mtn_mt.mnt_mntopts); + free(ret); + } + pthread_mutex_unlock(&hdl->libzfs_mnttab_cache_lock); +} + +int +zfs_spa_version(zfs_handle_t *zhp, int *spa_version) +{ + zpool_handle_t *zpool_handle = zhp->zpool_hdl; + + if (zpool_handle == NULL) + return (-1); + + *spa_version = zpool_get_prop_int(zpool_handle, + ZPOOL_PROP_VERSION, NULL); + return (0); +} + +/* + * The choice of reservation property depends on the SPA version. + */ +static int +zfs_which_resv_prop(zfs_handle_t *zhp, zfs_prop_t *resv_prop) +{ + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + if (spa_version >= SPA_VERSION_REFRESERVATION) + *resv_prop = ZFS_PROP_REFRESERVATION; + else + *resv_prop = ZFS_PROP_RESERVATION; + + return (0); +} + +/* + * Given an nvlist of properties to set, validates that they are correct, and + * parses any numeric properties (index, boolean, etc) if they are specified as + * strings. + */ +nvlist_t * +zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, + uint64_t zoned, zfs_handle_t *zhp, zpool_handle_t *zpool_hdl, + boolean_t key_params_ok, const char *errbuf) +{ + nvpair_t *elem; + uint64_t intval; + char *strval; + zfs_prop_t prop; + nvlist_t *ret; + int chosen_normal = -1; + int chosen_utf = -1; + + if (nvlist_alloc(&ret, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + + /* + * Make sure this property is valid and applies to this type. 
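+ *
+ * A minimal calling sketch (names and values hypothetical):
+ *
+ *	nvlist_t *nvl = fnvlist_alloc();
+ *	fnvlist_add_string(nvl, "compression", "lz4");
+ *	fnvlist_add_string(nvl, "com.example:tag", "nightly");
+ *	nvlist_t *parsed = zfs_valid_proplist(hdl,
+ *	    ZFS_TYPE_FILESYSTEM, nvl, 0, NULL, NULL, B_FALSE, errbuf);
+ *
+ * On success 'parsed' carries the validated values, with index
+ * strings such as "lz4" parsed to their numeric form; on failure
+ * it is NULL and errbuf describes the problem.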
+ */ + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + const char *propname = nvpair_name(elem); + + prop = zfs_name_to_prop(propname); + if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { + /* + * This is a user property: make sure it's a + * string, and that it's less than ZAP_MAXNAMELEN. + */ + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property name '%s' is too long"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + if (nvlist_add_string(ret, propname, strval) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + /* + * Currently, only user properties can be modified on + * snapshots. + */ + if (type == ZFS_TYPE_SNAPSHOT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "this property can not be modified for snapshots")); + (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); + goto error; + } + + if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { + zfs_userquota_prop_t uqtype; + char *newpropname = NULL; + char domain[128]; + uint64_t rid; + uint64_t valary[3]; + int rc; + + if (userquota_propname_decode(propname, zoned, + &uqtype, domain, sizeof (domain), &rid) != 0) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' has an invalid user/group name"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (uqtype != ZFS_PROP_USERQUOTA && + uqtype != ZFS_PROP_GROUPQUOTA && + uqtype != ZFS_PROP_USEROBJQUOTA && + uqtype != ZFS_PROP_GROUPOBJQUOTA && + uqtype != ZFS_PROP_PROJECTQUOTA && + uqtype != ZFS_PROP_PROJECTOBJQUOTA) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, + errbuf); + goto error; + } + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, "none") == 0) { + intval = 0; + } else if (zfs_nicestrtonum(hdl, + strval, &intval) != 0) { + (void) zfs_error(hdl, + EZFS_BADPROP, errbuf); + goto error; + } + } else if (nvpair_type(elem) == + DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(elem, &intval); + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable " + "{user|group|project}quota")); + goto error; + } + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a number"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + /* + * Encode the prop name as + * userquota@-domain, to make it easy + * for the kernel to decode. 
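+ * E.g. a hypothetical "userquota@1000" decodes to rid 1000 with an
+ * empty domain and is re-encoded as "userquota@3e8-" per the
+ * "%s%llx-%s" format below: prefix, rid in hex, '-', then the
+ * domain string.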
+ */ + rc = asprintf(&newpropname, "%s%llx-%s", + zfs_userquota_prop_prefixes[uqtype], + (longlong_t)rid, domain); + if (rc == -1 || newpropname == NULL) { + (void) no_memory(hdl); + goto error; + } + + valary[0] = uqtype; + valary[1] = rid; + valary[2] = intval; + if (nvlist_add_uint64_array(ret, newpropname, + valary, 3) != 0) { + free(newpropname); + (void) no_memory(hdl); + goto error; + } + free(newpropname); + continue; + } else if (prop == ZPROP_INVAL && zfs_prop_written(propname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; + } + + if (prop == ZPROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (!zfs_prop_valid_for_type(prop, type, B_FALSE)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' does not " + "apply to datasets of this type"), propname); + (void) zfs_error(hdl, EZFS_PROPTYPE, errbuf); + goto error; + } + + if (zfs_prop_readonly(prop) && + !(zfs_prop_setonce(prop) && zhp == NULL) && + !(zfs_prop_encryption_key_param(prop) && key_params_ok)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; + } + + if (zprop_parse_value(hdl, elem, prop, type, ret, + &strval, &intval, errbuf) != 0) + goto error; + + /* + * Perform some additional checks for specific properties. + */ + switch (prop) { + case ZFS_PROP_VERSION: + { + int version; + + if (zhp == NULL) + break; + version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (intval < version) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Can not downgrade; already at version %u"), + version); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + } + + case ZFS_PROP_VOLBLOCKSIZE: + case ZFS_PROP_RECORDSIZE: + { + int maxbs = SPA_MAXBLOCKSIZE; + char buf[64]; + + if (zpool_hdl != NULL) { + maxbs = zpool_get_prop_int(zpool_hdl, + ZPOOL_PROP_MAXBLOCKSIZE, NULL); + } + /* + * The value must be a power of two between + * SPA_MINBLOCKSIZE and maxbs. + */ + if (intval < SPA_MINBLOCKSIZE || + intval > maxbs || !ISP2(intval)) { + zfs_nicebytes(maxbs, buf, sizeof (buf)); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be power of 2 from 512B " + "to %s"), propname, buf); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + } + + case ZFS_PROP_SPECIAL_SMALL_BLOCKS: + { + int maxbs = SPA_OLD_MAXBLOCKSIZE; + char buf[64]; + + if (zpool_hdl != NULL) { + char state[64] = ""; + + maxbs = zpool_get_prop_int(zpool_hdl, + ZPOOL_PROP_MAXBLOCKSIZE, NULL); + + /* + * Issue a warning but do not fail so that + * tests for settable properties succeed. 
+ */ + if (zpool_prop_get_feature(zpool_hdl, + "feature@allocation_classes", state, + sizeof (state)) != 0 || + strcmp(state, ZFS_FEATURE_ACTIVE) != 0) { + (void) fprintf(stderr, gettext( + "%s: property requires a special " + "device in the pool\n"), propname); + } + } + if (intval != 0 && + (intval < SPA_MINBLOCKSIZE || + intval > maxbs || !ISP2(intval))) { + zfs_nicebytes(maxbs, buf, sizeof (buf)); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid '%s=%d' property: must be zero or " + "a power of 2 from 512B to %s"), propname, + intval, buf); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + } + + case ZFS_PROP_MLSLABEL: + { +#ifdef HAVE_MLSLABEL + /* + * Verify the mlslabel string and convert to + * internal hex label string. + */ + + m_label_t *new_sl; + char *hex = NULL; /* internal label string */ + + /* Default value is already OK. */ + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + break; + + /* Verify the label can be converted to binary form */ + if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || + (str_to_label(strval, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1)) { + goto badlabel; + } + + /* Now translate to hex internal label string */ + if (label_to_str(new_sl, &hex, M_INTERNAL, + DEF_NAMES) != 0) { + if (hex) + free(hex); + goto badlabel; + } + m_label_free(new_sl); + + /* If string is already in internal form, we're done. */ + if (strcmp(strval, hex) == 0) { + free(hex); + break; + } + + /* Replace the label string with the internal form. */ + (void) nvlist_remove(ret, zfs_prop_to_name(prop), + DATA_TYPE_STRING); + verify(nvlist_add_string(ret, zfs_prop_to_name(prop), + hex) == 0); + free(hex); + + break; + +badlabel: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid mlslabel '%s'"), strval); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + m_label_free(new_sl); /* OK if null */ + goto error; +#else + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "mlslabels are unsupported")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; +#endif /* HAVE_MLSLABEL */ + } + + case ZFS_PROP_MOUNTPOINT: + { + namecheck_err_t why; + + if (strcmp(strval, ZFS_MOUNTPOINT_NONE) == 0 || + strcmp(strval, ZFS_MOUNTPOINT_LEGACY) == 0) + break; + + if (mountpoint_namecheck(strval, &why)) { + switch (why) { + case NAME_ERR_LEADING_SLASH: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' must be an absolute path, " + "'none', or 'legacy'"), propname); + break; + case NAME_ERR_TOOLONG: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "component of '%s' is too long"), + propname); + break; + + default: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "(%d) not defined"), + why); + break; + } + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + } + + /*FALLTHRU*/ + + case ZFS_PROP_SHARESMB: + case ZFS_PROP_SHARENFS: + /* + * For the mountpoint and sharenfs or sharesmb + * properties, check if it can be set in a + * global/non-global zone based on + * the zoned property value: + * + * global zone non-global zone + * -------------------------------------------------- + * zoned=on mountpoint (no) mountpoint (yes) + * sharenfs (no) sharenfs (no) + * sharesmb (no) sharesmb (no) + * + * zoned=off mountpoint (yes) N/A + * sharenfs (yes) + * sharesmb (yes) + */ + if (zoned) { + if (getzoneid() == GLOBAL_ZONEID) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set on " + "dataset in a non-global zone"), + propname); + (void) zfs_error(hdl, EZFS_ZONED, + errbuf); + goto error; + } else if (prop == ZFS_PROP_SHARENFS || + prop == 
ZFS_PROP_SHARESMB) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set in " + "a non-global zone"), propname); + (void) zfs_error(hdl, EZFS_ZONED, + errbuf); + goto error; + } + } else if (getzoneid() != GLOBAL_ZONEID) { + /* + * If zoned property is 'off', this must be in + * a global zone. If not, something is wrong. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set while dataset " + "'zoned' property is set"), propname); + (void) zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + /* + * At this point, it is legitimate to set the + * property. Now we want to make sure that the + * property value is valid if it is sharenfs. + */ + if ((prop == ZFS_PROP_SHARENFS || + prop == ZFS_PROP_SHARESMB) && + strcmp(strval, "on") != 0 && + strcmp(strval, "off") != 0) { + zfs_share_proto_t proto; + + if (prop == ZFS_PROP_SHARESMB) + proto = PROTO_SMB; + else + proto = PROTO_NFS; + + if (zfs_parse_options(strval, proto) != SA_OK) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be set to invalid " + "options"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + } + + break; + + case ZFS_PROP_KEYLOCATION: + if (!zfs_prop_valid_keylocation(strval, B_FALSE)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid keylocation")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (zhp != NULL) { + uint64_t crypt = + zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); + + if (crypt == ZIO_CRYPT_OFF && + strcmp(strval, "none") != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "keylocation must be 'none' " + "for unencrypted datasets")); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } else if (crypt != ZIO_CRYPT_OFF && + strcmp(strval, "none") == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "keylocation must not be 'none' " + "for encrypted datasets")); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + } + break; + + case ZFS_PROP_PBKDF2_ITERS: + if (intval < MIN_PBKDF2_ITERATIONS) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "minimum pbkdf2 iterations is %u"), + MIN_PBKDF2_ITERATIONS); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + + case ZFS_PROP_UTF8ONLY: + chosen_utf = (int)intval; + break; + + case ZFS_PROP_NORMALIZE: + chosen_normal = (int)intval; + break; + + default: + break; + } + + /* + * For changes to existing volumes, we have some additional + * checks to enforce. 
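+ * E.g. with volblocksize=8192, a volsize of 1073741824 (exactly
+ * 131072 blocks) is accepted, while 1000000000 is rejected by the
+ * multiple-of-blocksize check below.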
+ */ + if (type == ZFS_TYPE_VOLUME && zhp != NULL) { + uint64_t blocksize = zfs_prop_get_int(zhp, + ZFS_PROP_VOLBLOCKSIZE); + char buf[64]; + + switch (prop) { + case ZFS_PROP_VOLSIZE: + if (intval % blocksize != 0) { + zfs_nicebytes(blocksize, buf, + sizeof (buf)); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a multiple of " + "volume block size (%s)"), + propname, buf); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' cannot be zero"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + break; + + default: + break; + } + } + + /* check encryption properties */ + if (zhp != NULL) { + int64_t crypt = zfs_prop_get_int(zhp, + ZFS_PROP_ENCRYPTION); + + switch (prop) { + case ZFS_PROP_COPIES: + if (crypt != ZIO_CRYPT_OFF && intval > 2) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "encrypted datasets cannot have " + "3 copies")); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + break; + default: + break; + } + } + } + + /* + * If normalization was chosen, but no UTF8 choice was made, + * enforce rejection of non-UTF8 names. + * + * If normalization was chosen, but rejecting non-UTF8 names + * was explicitly not chosen, it is an error. + */ + if (chosen_normal > 0 && chosen_utf < 0) { + if (nvlist_add_uint64(ret, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), 1) != 0) { + (void) no_memory(hdl); + goto error; + } + } else if (chosen_normal > 0 && chosen_utf == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be set 'on' if normalization chosen"), + zfs_prop_to_name(ZFS_PROP_UTF8ONLY)); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + return (ret); + +error: + nvlist_free(ret); + return (NULL); +} + +static int +zfs_add_synthetic_resv(zfs_handle_t *zhp, nvlist_t *nvl) +{ + uint64_t old_volsize; + uint64_t new_volsize; + uint64_t old_reservation; + uint64_t new_reservation; + zfs_prop_t resv_prop; + nvlist_t *props; + zpool_handle_t *zph = zpool_handle(zhp); + + /* + * If this is an existing volume, and someone is setting the volsize, + * make sure that it matches the reservation, or add it if necessary. + */ + old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + if (zfs_which_resv_prop(zhp, &resv_prop) < 0) + return (-1); + old_reservation = zfs_prop_get_int(zhp, resv_prop); + + props = fnvlist_alloc(); + fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); + + if ((zvol_volsize_to_reservation(zph, old_volsize, props) != + old_reservation) || nvlist_exists(nvl, + zfs_prop_to_name(resv_prop))) { + fnvlist_free(props); + return (0); + } + if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &new_volsize) != 0) { + fnvlist_free(props); + return (-1); + } + new_reservation = zvol_volsize_to_reservation(zph, new_volsize, props); + fnvlist_free(props); + + if (nvlist_add_uint64(nvl, zfs_prop_to_name(resv_prop), + new_reservation) != 0) { + (void) no_memory(zhp->zfs_hdl); + return (-1); + } + return (1); +} + +/* + * Helper for 'zfs {set|clone} refreservation=auto'. Must be called after + * zfs_valid_proplist(), as it is what sets the UINT64_MAX sentinel value. + * Return codes must match zfs_add_synthetic_resv(). 
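+ *
+ * E.g. 'zfs set refreservation=auto pool/vol' (a placeholder name)
+ * reaches this helper with the property holding the UINT64_MAX
+ * sentinel, which is swapped for the value computed by
+ * zvol_volsize_to_reservation() for the current volsize and
+ * volblocksize.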
+ */ +static int +zfs_fix_auto_resv(zfs_handle_t *zhp, nvlist_t *nvl) +{ + uint64_t volsize; + uint64_t resvsize; + zfs_prop_t prop; + nvlist_t *props; + + if (!ZFS_IS_VOLUME(zhp)) { + return (0); + } + + if (zfs_which_resv_prop(zhp, &prop) != 0) { + return (-1); + } + + if (prop != ZFS_PROP_REFRESERVATION) { + return (0); + } + + if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(prop), &resvsize) != 0) { + /* No value being set, so it can't be "auto" */ + return (0); + } + if (resvsize != UINT64_MAX) { + /* Being set to a value other than "auto" */ + return (0); + } + + props = fnvlist_alloc(); + + fnvlist_add_uint64(props, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + zfs_prop_get_int(zhp, ZFS_PROP_VOLBLOCKSIZE)); + + if (nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_VOLSIZE), + &volsize) != 0) { + volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + } + + resvsize = zvol_volsize_to_reservation(zpool_handle(zhp), volsize, + props); + fnvlist_free(props); + + (void) nvlist_remove_all(nvl, zfs_prop_to_name(prop)); + if (nvlist_add_uint64(nvl, zfs_prop_to_name(prop), resvsize) != 0) { + (void) no_memory(zhp->zfs_hdl); + return (-1); + } + return (1); +} + +static boolean_t +zfs_is_namespace_prop(zfs_prop_t prop) +{ + switch (prop) { + + case ZFS_PROP_ATIME: + case ZFS_PROP_RELATIME: + case ZFS_PROP_DEVICES: + case ZFS_PROP_EXEC: + case ZFS_PROP_SETUID: + case ZFS_PROP_READONLY: + case ZFS_PROP_XATTR: + case ZFS_PROP_NBMAND: + return (B_TRUE); + + default: + return (B_FALSE); + } +} + +/* + * Given a property name and value, set the property for the given dataset. + */ +int +zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) +{ + int ret = -1; + char errbuf[1024]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *nvl = NULL; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), + zhp->zfs_name); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_string(nvl, propname, propval) != 0) { + (void) no_memory(hdl); + goto error; + } + + ret = zfs_prop_set_list(zhp, nvl); + +error: + nvlist_free(nvl); + return (ret); +} + + + +/* + * Given an nvlist of property names and values, set the properties for the + * given dataset. + */ +int +zfs_prop_set_list(zfs_handle_t *zhp, nvlist_t *props) +{ + zfs_cmd_t zc = {"\0"}; + int ret = -1; + prop_changelist_t **cls = NULL; + int cl_idx; + char errbuf[1024]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *nvl; + int nvl_len = 0; + int added_resv = 0; + zfs_prop_t prop = 0; + nvpair_t *elem; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), + zhp->zfs_name); + + if ((nvl = zfs_valid_proplist(hdl, zhp->zfs_type, props, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), zhp, zhp->zpool_hdl, + B_FALSE, errbuf)) == NULL) + goto error; + + /* + * We have to check for any extra properties which need to be added + * before computing the length of the nvlist. + */ + for (elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) { + if (zfs_name_to_prop(nvpair_name(elem)) == ZFS_PROP_VOLSIZE && + (added_resv = zfs_add_synthetic_resv(zhp, nvl)) == -1) { + goto error; + } + } + + if (added_resv != 1 && + (added_resv = zfs_fix_auto_resv(zhp, nvl)) == -1) { + goto error; + } + + /* + * Check how many properties we're setting and allocate an array to + * store changelist pointers for postfix(). 
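+ * (A typical caller batches properties like so; names and values
+ * are hypothetical:
+ *
+ *	nvlist_t *props = fnvlist_alloc();
+ *	fnvlist_add_string(props, "atime", "off");
+ *	fnvlist_add_string(props, "readonly", "on");
+ *	ret = zfs_prop_set_list(zhp, props);
+ *	fnvlist_free(props);
+ *
+ * Each property then gets its own changelist slot below.)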
+ */ + for (elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) + nvl_len++; + if ((cls = calloc(nvl_len, sizeof (prop_changelist_t *))) == NULL) + goto error; + + cl_idx = 0; + for (elem = nvlist_next_nvpair(nvl, NULL); + elem != NULL; + elem = nvlist_next_nvpair(nvl, elem)) { + + prop = zfs_name_to_prop(nvpair_name(elem)); + + assert(cl_idx < nvl_len); + /* + * We don't want to unmount & remount the dataset when changing + * its canmount property to 'on' or 'noauto'. We only use + * the changelist logic to unmount when setting canmount=off. + */ + if (prop != ZFS_PROP_CANMOUNT || + (fnvpair_value_uint64(elem) == ZFS_CANMOUNT_OFF && + zfs_is_mounted(zhp, NULL))) { + cls[cl_idx] = changelist_gather(zhp, prop, 0, 0); + if (cls[cl_idx] == NULL) + goto error; + } + + if (prop == ZFS_PROP_MOUNTPOINT && + changelist_haszonedchild(cls[cl_idx])) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + ret = zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + if (cls[cl_idx] != NULL && + (ret = changelist_prefix(cls[cl_idx])) != 0) + goto error; + + cl_idx++; + } + assert(cl_idx == nvl_len); + + /* + * Execute the corresponding ioctl() to set this list of properties. + */ + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if ((ret = zcmd_write_src_nvlist(hdl, &zc, nvl)) != 0 || + (ret = zcmd_alloc_dst_nvlist(hdl, &zc, 0)) != 0) + goto error; + + ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + + if (ret != 0) { + if (zc.zc_nvlist_dst_filled == B_FALSE) { + (void) zfs_standard_error(hdl, errno, errbuf); + goto error; + } + + /* Get the list of unset properties back and report them. */ + nvlist_t *errorprops = NULL; + if (zcmd_read_dst_nvlist(hdl, &zc, &errorprops) != 0) + goto error; + for (nvpair_t *elem = nvlist_next_nvpair(errorprops, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errorprops, elem)) { + prop = zfs_name_to_prop(nvpair_name(elem)); + zfs_setprop_error(hdl, prop, errno, errbuf); + } + nvlist_free(errorprops); + + if (added_resv && errno == ENOSPC) { + /* clean up the volsize property we tried to set */ + uint64_t old_volsize = zfs_prop_get_int(zhp, + ZFS_PROP_VOLSIZE); + nvlist_free(nvl); + nvl = NULL; + zcmd_free_nvlists(&zc); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + goto error; + if (nvlist_add_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), + old_volsize) != 0) + goto error; + if (zcmd_write_src_nvlist(hdl, &zc, nvl) != 0) + goto error; + (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + } + } else { + for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { + if (cls[cl_idx] != NULL) { + int clp_err = changelist_postfix(cls[cl_idx]); + if (clp_err != 0) + ret = clp_err; + } + } + + if (ret == 0) { + /* + * Refresh the statistics so the new property + * value is reflected. + */ + (void) get_stats(zhp); + + /* + * Remount the filesystem to propagate the change + * if one of the options handled by the generic + * Linux namespace layer has been modified. + */ + if (zfs_is_namespace_prop(prop) && + zfs_is_mounted(zhp, NULL)) + ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); + } + } + +error: + nvlist_free(nvl); + zcmd_free_nvlists(&zc); + if (cls != NULL) { + for (cl_idx = 0; cl_idx < nvl_len; cl_idx++) { + if (cls[cl_idx] != NULL) + changelist_free(cls[cl_idx]); + } + free(cls); + } + return (ret); +} + +/* + * Given a property, inherit the value from the parent dataset, or if received + * is TRUE, revert to the received value, if any. 
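+ *
+ * E.g. zfs_prop_inherit(zhp, "compression", B_FALSE) backs
+ * 'zfs inherit compression <dataset>'; with received B_TRUE it
+ * backs 'zfs inherit -S', reverting to the received value.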
+ */ +int +zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) +{ + zfs_cmd_t zc = {"\0"}; + int ret; + prop_changelist_t *cl; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + zfs_prop_t prop; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot inherit %s for '%s'"), propname, zhp->zfs_name); + + zc.zc_cookie = received; + if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { + /* + * For user properties, the amount of work we have to do is very + * small, so just do it here. + */ + if (!zfs_prop_user(propname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); + + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc) != 0) + return (zfs_standard_error(hdl, errno, errbuf)); + + return (0); + } + + /* + * Verify that this property is inheritable. + */ + if (zfs_prop_readonly(prop)) + return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); + + if (!zfs_prop_inheritable(prop) && !received) + return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); + + /* + * Check to see if the value applies to this type + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) + return (zfs_error(hdl, EZFS_PROPTYPE, errbuf)); + + /* + * Normalize the name, to get rid of shorthand abbreviations. + */ + propname = zfs_prop_to_name(prop); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, propname, sizeof (zc.zc_value)); + + if (prop == ZFS_PROP_MOUNTPOINT && getzoneid() == GLOBAL_ZONEID && + zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset is used in a non-global zone")); + return (zfs_error(hdl, EZFS_ZONED, errbuf)); + } + + /* + * Determine datasets which will be affected by this change, if any. + */ + if ((cl = changelist_gather(zhp, prop, 0, 0)) == NULL) + return (-1); + + if (prop == ZFS_PROP_MOUNTPOINT && changelist_haszonedchild(cl)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + ret = zfs_error(hdl, EZFS_ZONED, errbuf); + goto error; + } + + if ((ret = changelist_prefix(cl)) != 0) + goto error; + + if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_INHERIT_PROP, &zc)) != 0) { + return (zfs_standard_error(hdl, errno, errbuf)); + } else { + + if ((ret = changelist_postfix(cl)) != 0) + goto error; + + /* + * Refresh the statistics so the new property is reflected. + */ + (void) get_stats(zhp); + + /* + * Remount the filesystem to propagate the change + * if one of the options handled by the generic + * Linux namespace layer has been modified. + */ + if (zfs_is_namespace_prop(prop) && + zfs_is_mounted(zhp, NULL)) + ret = zfs_mount(zhp, MNTOPT_REMOUNT, 0); + } + +error: + changelist_free(cl); + return (ret); +} + +/* + * True DSL properties are stored in an nvlist. The following two functions + * extract them appropriately. 
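+ *
+ * The stored shape is roughly (value and source hypothetical):
+ *
+ *	"compression" -> { ZPROP_VALUE: 15, ZPROP_SOURCE: "tank/fs" }
+ *
+ * A property missing from zfs_props falls back to its default and
+ * the source is reported as the empty string.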
+ */ +uint64_t +getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) +{ + nvlist_t *nv; + uint64_t value; + + *source = NULL; + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); + (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); + } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); + value = zfs_prop_default_numeric(prop); + *source = ""; + } + + return (value); +} + +static const char * +getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) +{ + nvlist_t *nv; + const char *value; + + *source = NULL; + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(prop), &nv) == 0) { + value = fnvlist_lookup_string(nv, ZPROP_VALUE); + (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); + } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); + value = zfs_prop_default_string(prop); + *source = ""; + } + + return (value); +} + +static boolean_t +zfs_is_recvd_props_mode(zfs_handle_t *zhp) +{ + return (zhp->zfs_props == zhp->zfs_recvd_props); +} + +static void +zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; + zhp->zfs_props = zhp->zfs_recvd_props; +} + +static void +zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; + *cookie = 0; +} + +/* + * Internal function for getting a numeric property. Both zfs_prop_get() and + * zfs_prop_get_int() are built using this interface. + * + * Certain properties can be overridden using 'mount -o'. In this case, scan + * the contents of the /proc/self/mounts entry, searching for the + * appropriate options. If they differ from the on-disk values, report the + * current values and mark the source "temporary". + */ +static int +get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, + char **source, uint64_t *val) +{ + zfs_cmd_t zc = {"\0"}; + nvlist_t *zplprops = NULL; + struct mnttab mnt; + char *mntopt_on = NULL; + char *mntopt_off = NULL; + boolean_t received = zfs_is_recvd_props_mode(zhp); + + *source = NULL; + + /* + * If the property is being fetched for a snapshot, check whether + * the property is valid for the snapshot's head dataset type. + */ + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT && + !zfs_prop_valid_for_type(prop, zhp->zfs_head_type, B_TRUE)) { + *val = zfs_prop_default_numeric(prop); + return (-1); + } + + switch (prop) { + case ZFS_PROP_ATIME: + mntopt_on = MNTOPT_ATIME; + mntopt_off = MNTOPT_NOATIME; + break; + + case ZFS_PROP_RELATIME: + mntopt_on = MNTOPT_RELATIME; + mntopt_off = MNTOPT_NORELATIME; + break; + + case ZFS_PROP_DEVICES: + mntopt_on = MNTOPT_DEVICES; + mntopt_off = MNTOPT_NODEVICES; + break; + + case ZFS_PROP_EXEC: + mntopt_on = MNTOPT_EXEC; + mntopt_off = MNTOPT_NOEXEC; + break; + + case ZFS_PROP_READONLY: + mntopt_on = MNTOPT_RO; + mntopt_off = MNTOPT_RW; + break; + + case ZFS_PROP_SETUID: + mntopt_on = MNTOPT_SETUID; + mntopt_off = MNTOPT_NOSETUID; + break; + + case ZFS_PROP_XATTR: + mntopt_on = MNTOPT_XATTR; + mntopt_off = MNTOPT_NOXATTR; + break; + + case ZFS_PROP_NBMAND: + mntopt_on = MNTOPT_NBMAND; + mntopt_off = MNTOPT_NONBMAND; + break; + + default: + break; + } + + /* + * Because looking up the mount options is potentially expensive + * (iterating over all of /proc/self/mounts), we defer its + * calculation until we're looking up a property which requires + * its presence. 
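+ *
+ * E.g. a dataset with atime=on that is currently mounted with a
+ * temporary 'noatime' option reports atime as off, and the source
+ * is flagged ZPROP_SRC_TEMPORARY by the hasmntopt() checks below.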
+ */ + if (!zhp->zfs_mntcheck && + (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; + + if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { + zhp->zfs_mntopts = zfs_strdup(hdl, + entry.mnt_mntopts); + if (zhp->zfs_mntopts == NULL) + return (-1); + } + + zhp->zfs_mntcheck = B_TRUE; + } + + if (zhp->zfs_mntopts == NULL) + mnt.mnt_mntopts = ""; + else + mnt.mnt_mntopts = zhp->zfs_mntopts; + + switch (prop) { + case ZFS_PROP_ATIME: + case ZFS_PROP_RELATIME: + case ZFS_PROP_DEVICES: + case ZFS_PROP_EXEC: + case ZFS_PROP_READONLY: + case ZFS_PROP_SETUID: +#ifndef __FreeBSD__ + case ZFS_PROP_XATTR: +#endif + case ZFS_PROP_NBMAND: + *val = getprop_uint64(zhp, prop, source); + + if (received) + break; + + if (hasmntopt(&mnt, mntopt_on) && !*val) { + *val = B_TRUE; + if (src) + *src = ZPROP_SRC_TEMPORARY; + } else if (hasmntopt(&mnt, mntopt_off) && *val) { + *val = B_FALSE; + if (src) + *src = ZPROP_SRC_TEMPORARY; + } + break; + + case ZFS_PROP_CANMOUNT: + case ZFS_PROP_VOLSIZE: + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: + *val = getprop_uint64(zhp, prop, source); + + if (*source == NULL) { + /* not default, must be local */ + *source = zhp->zfs_name; + } + break; + + case ZFS_PROP_MOUNTED: + *val = (zhp->zfs_mntopts != NULL); + break; + + case ZFS_PROP_NUMCLONES: + *val = zhp->zfs_dmustats.dds_num_clones; + break; + + case ZFS_PROP_VERSION: + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + case ZFS_PROP_CASE: + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { + zcmd_free_nvlists(&zc); + if (prop == ZFS_PROP_VERSION && + zhp->zfs_type == ZFS_TYPE_VOLUME) + *val = zfs_prop_default_numeric(prop); + return (-1); + } + if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || + nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), + val) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + nvlist_free(zplprops); + zcmd_free_nvlists(&zc); + break; + + case ZFS_PROP_INCONSISTENT: + *val = zhp->zfs_dmustats.dds_inconsistent; + break; + + case ZFS_PROP_REDACTED: + *val = zhp->zfs_dmustats.dds_redacted; + break; + + default: + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + *val = getprop_uint64(zhp, prop, source); + /* + * If we tried to use a default value for a + * readonly property, it means that it was not + * present. Note this only applies to "truly" + * readonly properties, not set-once properties + * like volblocksize. + */ + if (zfs_prop_readonly(prop) && + !zfs_prop_setonce(prop) && + *source != NULL && (*source)[0] == '\0') { + *source = NULL; + return (-1); + } + break; + + case PROP_TYPE_STRING: + default: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "cannot get non-numeric property")); + return (zfs_error(zhp->zfs_hdl, EZFS_BADPROP, + dgettext(TEXT_DOMAIN, "internal error"))); + } + } + + return (0); +} + +/* + * Calculate the source type, given the raw source string. 
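+ *
+ * The mapping implemented below, in order:
+ *
+ *	source == NULL                  -> ZPROP_SRC_NONE
+ *	source == ""                    -> ZPROP_SRC_DEFAULT
+ *	contains ZPROP_SOURCE_VAL_RECVD -> ZPROP_SRC_RECEIVED
+ *	source == dataset name          -> ZPROP_SRC_LOCAL
+ *	anything else                   -> ZPROP_SRC_INHERITED, with
+ *	                                   the ancestor name copied
+ *	                                   into statbuf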
+ */ +static void +get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, + char *statbuf, size_t statlen) +{ + if (statbuf == NULL || + srctype == NULL || *srctype == ZPROP_SRC_TEMPORARY) { + return; + } + + if (source == NULL) { + *srctype = ZPROP_SRC_NONE; + } else if (source[0] == '\0') { + *srctype = ZPROP_SRC_DEFAULT; + } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { + *srctype = ZPROP_SRC_RECEIVED; + } else { + if (strcmp(source, zhp->zfs_name) == 0) { + *srctype = ZPROP_SRC_LOCAL; + } else { + (void) strlcpy(statbuf, source, statlen); + *srctype = ZPROP_SRC_INHERITED; + } + } + +} + +int +zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, + size_t proplen, boolean_t literal) +{ + zfs_prop_t prop; + int err = 0; + + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (-1); + + prop = zfs_name_to_prop(propname); + + if (prop != ZPROP_INVAL) { + uint64_t cookie; + if (!nvlist_exists(zhp->zfs_recvd_props, propname)) + return (-1); + zfs_set_recvd_props_mode(zhp, &cookie); + err = zfs_prop_get(zhp, prop, propbuf, proplen, + NULL, NULL, 0, literal); + zfs_unset_recvd_props_mode(zhp, &cookie); + } else { + nvlist_t *propval; + char *recvdval; + if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, + propname, &propval) != 0) + return (-1); + verify(nvlist_lookup_string(propval, ZPROP_VALUE, + &recvdval) == 0); + (void) strlcpy(propbuf, recvdval, proplen); + } + + return (err == 0 ? 0 : -1); +} + +static int +get_clones_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) +{ + nvlist_t *value; + nvpair_t *pair; + + value = zfs_get_clones_nvl(zhp); + if (value == NULL) + return (-1); + + propbuf[0] = '\0'; + for (pair = nvlist_next_nvpair(value, NULL); pair != NULL; + pair = nvlist_next_nvpair(value, pair)) { + if (propbuf[0] != '\0') + (void) strlcat(propbuf, ",", proplen); + (void) strlcat(propbuf, nvpair_name(pair), proplen); + } + + return (0); +} + +struct get_clones_arg { + uint64_t numclones; + nvlist_t *value; + const char *origin; + char buf[ZFS_MAX_DATASET_NAME_LEN]; +}; + +static int +get_clones_cb(zfs_handle_t *zhp, void *arg) +{ + struct get_clones_arg *gca = arg; + + if (gca->numclones == 0) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, gca->buf, sizeof (gca->buf), + NULL, NULL, 0, B_TRUE) != 0) + goto out; + if (strcmp(gca->buf, gca->origin) == 0) { + fnvlist_add_boolean(gca->value, zfs_get_name(zhp)); + gca->numclones--; + } + +out: + (void) zfs_iter_children(zhp, get_clones_cb, gca); + zfs_close(zhp); + return (0); +} + +nvlist_t * +zfs_get_clones_nvl(zfs_handle_t *zhp) +{ + nvlist_t *nv, *value; + + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), &nv) != 0) { + struct get_clones_arg gca; + + /* + * if this is a snapshot, then the kernel wasn't able + * to get the clones. Do it by slowly iterating. 
+ */ + if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) + return (NULL); + if (nvlist_alloc(&nv, NV_UNIQUE_NAME, 0) != 0) + return (NULL); + if (nvlist_alloc(&value, NV_UNIQUE_NAME, 0) != 0) { + nvlist_free(nv); + return (NULL); + } + + gca.numclones = zfs_prop_get_int(zhp, ZFS_PROP_NUMCLONES); + gca.value = value; + gca.origin = zhp->zfs_name; + + if (gca.numclones != 0) { + zfs_handle_t *root; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + char *cp = pool; + + /* get the pool name */ + (void) strlcpy(pool, zhp->zfs_name, sizeof (pool)); + (void) strsep(&cp, "/@"); + root = zfs_open(zhp->zfs_hdl, pool, + ZFS_TYPE_FILESYSTEM); + if (root == NULL) { + nvlist_free(nv); + nvlist_free(value); + return (NULL); + } + + (void) get_clones_cb(root, &gca); + } + + if (gca.numclones != 0 || + nvlist_add_nvlist(nv, ZPROP_VALUE, value) != 0 || + nvlist_add_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), nv) != 0) { + nvlist_free(nv); + nvlist_free(value); + return (NULL); + } + nvlist_free(nv); + nvlist_free(value); + verify(0 == nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_CLONES), &nv)); + } + + verify(nvlist_lookup_nvlist(nv, ZPROP_VALUE, &value) == 0); + + return (value); +} + +static int +get_rsnaps_string(zfs_handle_t *zhp, char *propbuf, size_t proplen) +{ + nvlist_t *value; + uint64_t *snaps; + uint_t nsnaps; + + if (nvlist_lookup_nvlist(zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &value) != 0) + return (-1); + if (nvlist_lookup_uint64_array(value, ZPROP_VALUE, &snaps, + &nsnaps) != 0) + return (-1); + if (nsnaps == 0) { + /* There are no redaction snapshots; pass a special value back */ + (void) snprintf(propbuf, proplen, "none"); + return (0); + } + propbuf[0] = '\0'; + for (int i = 0; i < nsnaps; i++) { + char buf[128]; + if (propbuf[0] != '\0') + (void) strlcat(propbuf, ",", proplen); + (void) snprintf(buf, sizeof (buf), "%llu", + (u_longlong_t)snaps[i]); + (void) strlcat(propbuf, buf, proplen); + } + + return (0); +} + +/* + * Accepts a property and value and checks that the value + * matches the one found by the channel program. If they are + * not equal, print both of them.
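+ * + * This is a debugging cross-check and only runs when the handle's + * libzfs_prop_debug flag is set (presumably wired up during library + * initialization); normal property retrieval never reaches the channel + * program.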
+ */ +static void +zcp_check(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t intval, + const char *strval) +{ + if (!zhp->zfs_hdl->libzfs_prop_debug) + return; + int error; + char *poolname = zhp->zpool_hdl->zpool_name; + const char *prop_name = zfs_prop_to_name(prop); + const char *program = + "args = ...\n" + "ds = args['dataset']\n" + "prop = args['property']\n" + "value, setpoint = zfs.get_prop(ds, prop)\n" + "return {value=value, setpoint=setpoint}\n"; + nvlist_t *outnvl; + nvlist_t *retnvl; + nvlist_t *argnvl = fnvlist_alloc(); + + fnvlist_add_string(argnvl, "dataset", zhp->zfs_name); + fnvlist_add_string(argnvl, "property", zfs_prop_to_name(prop)); + + error = lzc_channel_program_nosync(poolname, program, + 10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl); + + if (error == 0) { + retnvl = fnvlist_lookup_nvlist(outnvl, "return"); + if (zfs_prop_get_type(prop) == PROP_TYPE_NUMBER) { + int64_t ans; + error = nvlist_lookup_int64(retnvl, "value", &ans); + if (error != 0) { + (void) fprintf(stderr, "%s: zcp check error: " + "%u\n", prop_name, error); + return; + } + if (ans != intval) { + (void) fprintf(stderr, "%s: zfs found %llu, " + "but zcp found %llu\n", prop_name, + (u_longlong_t)intval, (u_longlong_t)ans); + } + } else { + char *str_ans; + error = nvlist_lookup_string(retnvl, "value", &str_ans); + if (error != 0) { + (void) fprintf(stderr, "%s: zcp check error: " + "%u\n", prop_name, error); + return; + } + if (strcmp(strval, str_ans) != 0) { + (void) fprintf(stderr, + "%s: zfs found '%s', but zcp found '%s'\n", + prop_name, strval, str_ans); + } + } + } else { + (void) fprintf(stderr, "%s: zcp check failed, channel program " + "error: %u\n", prop_name, error); + } + nvlist_free(argnvl); + nvlist_free(outnvl); +} + +/* + * Retrieve a property from the given object. If 'literal' is specified, then + * numbers are left as exact values. Otherwise, numbers are converted to a + * human-readable form. + * + * Returns 0 on success, or -1 on error. + */ +int +zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, + zprop_source_t *src, char *statbuf, size_t statlen, boolean_t literal) +{ + char *source = NULL; + uint64_t val; + const char *str; + const char *strval; + boolean_t received = zfs_is_recvd_props_mode(zhp); + + /* + * Check to see if this property applies to our object + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) + return (-1); + + if (received && zfs_prop_readonly(prop)) + return (-1); + + if (src) + *src = ZPROP_SRC_NONE; + + switch (prop) { + case ZFS_PROP_CREATION: + /* + * 'creation' is a time_t stored in the statistics. We convert + * this into a string unless 'literal' is specified. + */ + { + val = getprop_uint64(zhp, prop, &source); + time_t time = (time_t)val; + struct tm t; + + if (literal || + localtime_r(&time, &t) == NULL || + strftime(propbuf, proplen, "%a %b %e %k:%M %Y", + &t) == 0) + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } + zcp_check(zhp, prop, val, NULL); + break; + + case ZFS_PROP_MOUNTPOINT: + /* + * Getting the precise mountpoint can be tricky. + * + * - for 'none' or 'legacy', return those values. + * - for inherited mountpoints, we want to take everything + * after our ancestor and append it to the inherited value. + * + * If the pool has an alternate root, we want to prepend that + * root to any values we return. 
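+ * + * As a worked example (names are hypothetical): with altroot=/mnt, if + * tank/home sets mountpoint=/export/home and tank/home/user inherits it, + * then source is "tank/home", relpath becomes "user", and the result is + * "/mnt" + "/export/home" + "/" + "user" = /mnt/export/home/user.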
+ */ + + str = getprop_string(zhp, prop, &source); + + if (str[0] == '/') { + char buf[MAXPATHLEN]; + char *root = buf; + const char *relpath; + + /* + * If we inherit the mountpoint, even from a dataset + * with a received value, the source will be the path of + * the dataset we inherit from. If source is + * ZPROP_SOURCE_VAL_RECVD, the received value is not + * inherited. + */ + if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { + relpath = ""; + } else { + relpath = zhp->zfs_name + strlen(source); + if (relpath[0] == '/') + relpath++; + } + + if ((zpool_get_prop(zhp->zpool_hdl, + ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL, + B_FALSE)) || (strcmp(root, "-") == 0)) + root[0] = '\0'; + /* + * Special case an alternate root of '/'. This will + * avoid having multiple leading slashes in the + * mountpoint path. + */ + if (strcmp(root, "/") == 0) + root++; + + /* + * If the mountpoint is '/' then skip over this + * if we are obtaining either an alternate root or + * an inherited mountpoint. + */ + if (str[1] == '\0' && (root[0] != '\0' || + relpath[0] != '\0')) + str++; + + if (relpath[0] == '\0') + (void) snprintf(propbuf, proplen, "%s%s", + root, str); + else + (void) snprintf(propbuf, proplen, "%s%s%s%s", + root, str, relpath[0] == '@' ? "" : "/", + relpath); + } else { + /* 'legacy' or 'none' */ + (void) strlcpy(propbuf, str, proplen); + } + zcp_check(zhp, prop, 0, propbuf); + break; + + case ZFS_PROP_ORIGIN: + str = getprop_string(zhp, prop, &source); + if (str == NULL) + return (-1); + (void) strlcpy(propbuf, str, proplen); + zcp_check(zhp, prop, 0, str); + break; + + case ZFS_PROP_REDACT_SNAPS: + if (get_rsnaps_string(zhp, propbuf, proplen) != 0) + return (-1); + break; + + case ZFS_PROP_CLONES: + if (get_clones_string(zhp, propbuf, proplen) != 0) + return (-1); + break; + + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + /* + * If quota or reservation is 0, we translate this into 'none' + * (unless literal is set), and indicate that it's the default + * value. Otherwise, we print the number nicely and indicate + * that it's set locally. + */ + if (val == 0) { + if (literal) + (void) strlcpy(propbuf, "0", proplen); + else + (void) strlcpy(propbuf, "none", proplen); + } else { + if (literal) + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + else + zfs_nicebytes(val, propbuf, proplen); + } + zcp_check(zhp, prop, val, NULL); + break; + + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + case ZFS_PROP_FILESYSTEM_COUNT: + case ZFS_PROP_SNAPSHOT_COUNT: + + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + + /* + * If limit is UINT64_MAX, we translate this into 'none' (unless + * literal is set), and indicate that it's the default value. + * Otherwise, we print the number nicely and indicate that it's + * set locally.
+ */ + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } else if (val == UINT64_MAX) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(val, propbuf, proplen); + } + + zcp_check(zhp, prop, val, NULL); + break; + + case ZFS_PROP_REFRATIO: + case ZFS_PROP_COMPRESSRATIO: + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + if (literal) + (void) snprintf(propbuf, proplen, "%llu.%02llu", + (u_longlong_t)(val / 100), + (u_longlong_t)(val % 100)); + else + (void) snprintf(propbuf, proplen, "%llu.%02llux", + (u_longlong_t)(val / 100), + (u_longlong_t)(val % 100)); + zcp_check(zhp, prop, val, NULL); + break; + + case ZFS_PROP_TYPE: + switch (zhp->zfs_type) { + case ZFS_TYPE_FILESYSTEM: + str = "filesystem"; + break; + case ZFS_TYPE_VOLUME: + str = "volume"; + break; + case ZFS_TYPE_SNAPSHOT: + str = "snapshot"; + break; + case ZFS_TYPE_BOOKMARK: + str = "bookmark"; + break; + default: + abort(); + } + (void) snprintf(propbuf, proplen, "%s", str); + zcp_check(zhp, prop, 0, propbuf); + break; + + case ZFS_PROP_MOUNTED: + /* + * The 'mounted' property is a pseudo-property that describes + * whether the filesystem is currently mounted. Even though + * it's a boolean value, the typical values of "on" and "off" + * don't make sense, so we translate to "yes" and "no". + */ + if (get_numeric_property(zhp, ZFS_PROP_MOUNTED, + src, &source, &val) != 0) + return (-1); + if (val) + (void) strlcpy(propbuf, "yes", proplen); + else + (void) strlcpy(propbuf, "no", proplen); + break; + + case ZFS_PROP_NAME: + /* + * The 'name' property is a pseudo-property derived from the + * dataset name. It is presented as a real property to simplify + * consumers. + */ + (void) strlcpy(propbuf, zhp->zfs_name, proplen); + zcp_check(zhp, prop, 0, propbuf); + break; + + case ZFS_PROP_MLSLABEL: + { +#ifdef HAVE_MLSLABEL + m_label_t *new_sl = NULL; + char *ascii = NULL; /* human readable label */ + + (void) strlcpy(propbuf, + getprop_string(zhp, prop, &source), proplen); + + if (literal || (strcasecmp(propbuf, + ZFS_MLSLABEL_DEFAULT) == 0)) + break; + + /* + * Try to translate the internal hex string to + * human-readable output. If there are any + * problems just use the hex string. + */ + + if (str_to_label(propbuf, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1) { + m_label_free(new_sl); + break; + } + + if (label_to_str(new_sl, &ascii, M_LABEL, + DEF_NAMES) != 0) { + if (ascii) + free(ascii); + m_label_free(new_sl); + break; + } + m_label_free(new_sl); + + (void) strlcpy(propbuf, ascii, proplen); + free(ascii); +#else + (void) strlcpy(propbuf, + getprop_string(zhp, prop, &source), proplen); +#endif /* HAVE_MLSLABEL */ + } + break; + + case ZFS_PROP_GUID: + case ZFS_PROP_CREATETXG: + case ZFS_PROP_OBJSETID: + /* + * These properties are stored as numbers, but they are + * identifiers. + * We don't want them to be pretty printed, because pretty + * printing mangles the ID into a truncated and useless value.
+ */ + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + (void) snprintf(propbuf, proplen, "%llu", (u_longlong_t)val); + zcp_check(zhp, prop, val, NULL); + break; + + case ZFS_PROP_REFERENCED: + case ZFS_PROP_AVAILABLE: + case ZFS_PROP_USED: + case ZFS_PROP_USEDSNAP: + case ZFS_PROP_USEDDS: + case ZFS_PROP_USEDREFRESERV: + case ZFS_PROP_USEDCHILD: + if (get_numeric_property(zhp, prop, src, &source, &val) != 0) + return (-1); + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } else { + zfs_nicebytes(val, propbuf, proplen); + } + zcp_check(zhp, prop, val, NULL); + break; + + default: + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + if (get_numeric_property(zhp, prop, src, + &source, &val) != 0) { + return (-1); + } + + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)val); + } else { + zfs_nicenum(val, propbuf, proplen); + } + zcp_check(zhp, prop, val, NULL); + break; + + case PROP_TYPE_STRING: + str = getprop_string(zhp, prop, &source); + if (str == NULL) + return (-1); + + (void) strlcpy(propbuf, str, proplen); + zcp_check(zhp, prop, 0, str); + break; + + case PROP_TYPE_INDEX: + if (get_numeric_property(zhp, prop, src, + &source, &val) != 0) + return (-1); + if (zfs_prop_index_to_string(prop, val, &strval) != 0) + return (-1); + + (void) strlcpy(propbuf, strval, proplen); + zcp_check(zhp, prop, 0, strval); + break; + + default: + abort(); + } + } + + get_source(zhp, src, source, statbuf, statlen); + + return (0); +} + +/* + * Utility function to get the given numeric property. Does no validation that + * the given property is the appropriate type; should only be used with + * hard-coded property types. + */ +uint64_t +zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) +{ + char *source; + uint64_t val = 0; + + (void) get_numeric_property(zhp, prop, NULL, &source, &val); + + return (val); +} + +static int +zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) +{ + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); + return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); +} + +/* + * Similar to zfs_prop_get(), but returns the value as an integer. 
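+ * + * Unlike zfs_prop_get_int() above, which ignores errors and returns + * whatever ended up in its zero-initialized 'val', this variant checks + * that the property applies to the dataset type, reports failures, and + * fills in the source information via get_source().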
+ */ +int +zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, + zprop_source_t *src, char *statbuf, size_t statlen) +{ + char *source; + + /* + * Check to see if this property applies to our object + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type, B_FALSE)) { + return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, + dgettext(TEXT_DOMAIN, "cannot get property '%s'"), + zfs_prop_to_name(prop))); + } + + if (src) + *src = ZPROP_SRC_NONE; + + if (get_numeric_property(zhp, prop, src, &source, value) != 0) + return (-1); + + get_source(zhp, src, source, statbuf, statlen); + + return (0); +} + +#ifdef HAVE_IDMAP +static int +idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, + char **domainp, idmap_rid_t *ridp) +{ + idmap_get_handle_t *get_hdl = NULL; + idmap_stat status; + int err = EINVAL; + + if (idmap_get_create(&get_hdl) != IDMAP_SUCCESS) + goto out; + + if (isuser) { + err = idmap_get_sidbyuid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } else { + err = idmap_get_sidbygid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } + if (err == IDMAP_SUCCESS && + idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && + status == IDMAP_SUCCESS) + err = 0; + else + err = EINVAL; +out: + if (get_hdl) + idmap_get_destroy(get_hdl); + return (err); +} +#endif /* HAVE_IDMAP */ + +/* + * convert the propname into parameters needed by kernel + * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 + * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 + * Eg: groupquota@staff -> ZFS_PROP_GROUPQUOTA, "", 1234 + * Eg: groupused@staff -> ZFS_PROP_GROUPUSED, "", 1234 + * Eg: projectquota@123 -> ZFS_PROP_PROJECTQUOTA, "", 123 + * Eg: projectused@789 -> ZFS_PROP_PROJECTUSED, "", 789 + */ +static int +userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) +{ + zfs_userquota_prop_t type; + char *cp; + boolean_t isuser; + boolean_t isgroup; + boolean_t isproject; + struct passwd *pw; + struct group *gr; + + domain[0] = '\0'; + + /* Figure out the property type ({user|group|project}{quota|space}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(propname, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 0) + break; + } + if (type == ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + *typep = type; + + isuser = (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_USERUSED || + type == ZFS_PROP_USEROBJQUOTA || + type == ZFS_PROP_USEROBJUSED); + isgroup = (type == ZFS_PROP_GROUPQUOTA || type == ZFS_PROP_GROUPUSED || + type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_GROUPOBJUSED); + isproject = (type == ZFS_PROP_PROJECTQUOTA || + type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED); + + cp = strchr(propname, '@') + 1; + + if (isuser && (pw = getpwnam(cp)) != NULL) { + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + *ridp = pw->pw_uid; + } else if (isgroup && (gr = getgrnam(cp)) != NULL) { + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + *ridp = gr->gr_gid; + } else if (!isproject && strchr(cp, '@')) { +#ifdef HAVE_IDMAP + /* + * It's a SID name (eg "user@domain") that needs to be + * turned into S-1-domainID-RID. 
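+ * As a sketch with made-up values: for "userquota@joe@sales.example.com" + * the directory service yields a numeric SID such as "S-1-5-21-...-1234"; + * the code below splits it at the last '-', storing the rid (1234) in + * *ridp and the remainder in 'domain'.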
+ */ + directory_error_t e; + char *numericsid = NULL; + char *end; + + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + e = directory_sid_from_user_name(NULL, + cp, &numericsid); + } else { + e = directory_sid_from_group_name(NULL, + cp, &numericsid); + } + if (e != NULL) { + directory_error_free(e); + return (ENOENT); + } + if (numericsid == NULL) + return (ENOENT); + cp = numericsid; + (void) strlcpy(domain, cp, domainlen); + cp = strrchr(domain, '-'); + *cp = '\0'; + cp++; + + errno = 0; + *ridp = strtoull(cp, &end, 10); + free(numericsid); + + if (errno != 0 || *end != '\0') + return (EINVAL); +#else + return (ENOSYS); +#endif /* HAVE_IDMAP */ + } else { + /* It's a user/group/project ID (eg "12345"). */ + uid_t id; + char *end; + id = strtoul(cp, &end, 10); + if (*end != '\0') + return (EINVAL); + if (id > MAXUID && !isproject) { +#ifdef HAVE_IDMAP + /* It's an ephemeral ID. */ + idmap_rid_t rid; + char *mapdomain; + + if (idmap_id_to_numeric_domain_rid(id, isuser, + &mapdomain, &rid) != 0) + return (ENOENT); + (void) strlcpy(domain, mapdomain, domainlen); + *ridp = rid; +#else + return (ENOSYS); +#endif /* HAVE_IDMAP */ + } else { + *ridp = id; + } + } + + return (0); +} + +static int +zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue, zfs_userquota_prop_t *typep) +{ + int err; + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + err = userquota_propname_decode(propname, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), + typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); + zc.zc_objset_type = *typep; + if (err) + return (err); + + err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_USERSPACE_ONE, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); +} + +int +zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) +{ + zfs_userquota_prop_t type; + + return (zfs_prop_get_userquota_common(zhp, propname, propvalue, + &type)); +} + +int +zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) +{ + int err; + uint64_t propvalue; + zfs_userquota_prop_t type; + + err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, + &type); + + if (err) + return (err); + + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)propvalue); + } else if (propvalue == 0 && + (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || + type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTQUOTA || + type == ZFS_PROP_PROJECTOBJQUOTA)) { + (void) strlcpy(propbuf, "none", proplen); + } else if (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA || + type == ZFS_PROP_USERUSED || type == ZFS_PROP_GROUPUSED || + type == ZFS_PROP_PROJECTUSED || type == ZFS_PROP_PROJECTQUOTA) { + zfs_nicebytes(propvalue, propbuf, proplen); + } else { + zfs_nicenum(propvalue, propbuf, proplen); + } + return (0); +} + +/* + * propname must start with "written@" or "written#". 
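+ * For example, on dataset tank/fs, "written@monday" expands below to the + * full name "tank/fs@monday", while a propname whose suffix already + * contains '@' or '#' (e.g. "written@tank/fs@monday") is used as given.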
+ */ +int +zfs_prop_get_written_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) +{ + int err; + zfs_cmd_t zc = {"\0"}; + const char *snapname; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + assert(zfs_prop_written(propname)); + snapname = propname + strlen("written@"); + if (strchr(snapname, '@') != NULL || strchr(snapname, '#') != NULL) { + /* full snapshot or bookmark name specified */ + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + } else { + /* snapname is the short name, append it to zhp's fsname */ + char *cp; + + (void) strlcpy(zc.zc_value, zhp->zfs_name, + sizeof (zc.zc_value)); + cp = strchr(zc.zc_value, '@'); + if (cp != NULL) + *cp = '\0'; + (void) strlcat(zc.zc_value, snapname - 1, sizeof (zc.zc_value)); + } + + err = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SPACE_WRITTEN, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); +} + +int +zfs_prop_get_written(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) +{ + int err; + uint64_t propvalue; + + err = zfs_prop_get_written_int(zhp, propname, &propvalue); + + if (err) + return (err); + + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", + (u_longlong_t)propvalue); + } else { + zfs_nicebytes(propvalue, propbuf, proplen); + } + + return (0); +} + +/* + * Returns the name of the given zfs handle. + */ +const char * +zfs_get_name(const zfs_handle_t *zhp) +{ + return (zhp->zfs_name); +} + +/* + * Returns the name of the parent pool for the given zfs handle. + */ +const char * +zfs_get_pool_name(const zfs_handle_t *zhp) +{ + return (zhp->zpool_hdl->zpool_name); +} + +/* + * Returns the type of the given zfs handle. + */ +zfs_type_t +zfs_get_type(const zfs_handle_t *zhp) +{ + return (zhp->zfs_type); +} + +/* + * Is one dataset name a child dataset of another? + * + * Needs to handle these cases: + * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" + * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" + * Descendant? No. No. No. Yes. + */ +static boolean_t +is_descendant(const char *ds1, const char *ds2) +{ + size_t d1len = strlen(ds1); + + /* ds2 can't be a descendant if it's smaller */ + if (strlen(ds2) < d1len) + return (B_FALSE); + + /* otherwise, compare strings and verify that there's a '/' char */ + return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); +} + +/* + * Given a complete name, return just the portion that refers to the parent. + * Will return -1 if there is no parent (path is just the name of the + * pool). + */ +static int +parent_name(const char *path, char *buf, size_t buflen) +{ + char *slashp; + + (void) strlcpy(buf, path, buflen); + + if ((slashp = strrchr(buf, '/')) == NULL) + return (-1); + *slashp = '\0'; + + return (0); +} + +int +zfs_parent_name(zfs_handle_t *zhp, char *buf, size_t buflen) +{ + return (parent_name(zfs_get_name(zhp), buf, buflen)); +} + +/* + * If accept_ancestor is false, then check to make sure that the given path has + * a parent, and that it exists. If accept_ancestor is true, then find the + * closest existing ancestor for the given path. In prefixlen return the + * length of already existing prefix of the given path. We also fetch the + * 'zoned' property, which is used to validate property settings when creating + * new datasets. 
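+ * + * For example, with accept_ancestor set, checking "tank/a/b/c" when only + * "tank/a" exists walks up until "tank/a" is found and sets *prefixlen to + * strlen("tank/a").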
+ */ +static int +check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, + boolean_t accept_ancestor, int *prefixlen) +{ + zfs_cmd_t zc = {"\0"}; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + char *slash; + zfs_handle_t *zhp; + char errbuf[1024]; + uint64_t is_zoned; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); + + /* get parent, and check to see if this is just a pool */ + if (parent_name(path, parent, sizeof (parent)) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing dataset name")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + /* check to see if the pool exists */ + if ((slash = strchr(parent, '/')) == NULL) + slash = parent + strlen(parent); + (void) strncpy(zc.zc_name, parent, slash - parent); + zc.zc_name[slash - parent] = '\0'; + if (zfs_ioctl(hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0 && + errno == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no such pool '%s'"), zc.zc_name); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + + /* check to see if the parent dataset exists */ + while ((zhp = make_dataset_handle(hdl, parent)) == NULL) { + if (errno == ENOENT && accept_ancestor) { + /* + * Go deeper to find an ancestor, give up on top level. + */ + if (parent_name(parent, parent, sizeof (parent)) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no such pool '%s'"), zc.zc_name); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + } else if (errno == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent does not exist")); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } else + return (zfs_standard_error(hdl, errno, errbuf)); + } + + is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + if (zoned != NULL) + *zoned = is_zoned; + + /* we are in a non-global zone, but parent is in the global zone */ + if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { + (void) zfs_standard_error(hdl, EPERM, errbuf); + zfs_close(zhp); + return (-1); + } + + /* make sure parent is a filesystem */ + if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "parent is not a filesystem")); + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + zfs_close(zhp); + return (-1); + } + + zfs_close(zhp); + if (prefixlen != NULL) + *prefixlen = strlen(parent); + return (0); +} + +/* + * Finds whether the dataset of the given type(s) exists. + */ +boolean_t +zfs_dataset_exists(libzfs_handle_t *hdl, const char *path, zfs_type_t types) +{ + zfs_handle_t *zhp; + + if (!zfs_validate_name(hdl, path, types, B_FALSE)) + return (B_FALSE); + + /* + * Try to get stats for the dataset, which will tell us if it exists. + */ + if ((zhp = make_dataset_handle(hdl, path)) != NULL) { + int ds_type = zhp->zfs_type; + + zfs_close(zhp); + if (types & ds_type) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Given a path to 'target', create all the ancestors between + * the prefixlen portion of the path, and the target itself. + * Fail if the initial prefixlen-ancestor does not already exist. 
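+ * + * Continuing the example above: create_parents(hdl, "tank/a/b/c", + * strlen("tank/a")) verifies "tank/a" exists, then creates, mounts, and + * shares "tank/a/b"; the final component "tank/a/b/c" itself is left for + * the caller (e.g. zfs_create()) to create.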
+ */ +int +create_parents(libzfs_handle_t *hdl, char *target, int prefixlen) +{ + zfs_handle_t *h; + char *cp; + const char *opname; + + /* make sure prefix exists */ + cp = target + prefixlen; + if (*cp != '/') { + assert(strchr(cp, '/') == NULL); + h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); + } else { + *cp = '\0'; + h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); + *cp = '/'; + } + if (h == NULL) + return (-1); + zfs_close(h); + + /* + * Attempt to create, mount, and share any ancestor filesystems, + * up to the prefixlen-long one. + */ + for (cp = target + prefixlen + 1; + (cp = strchr(cp, '/')) != NULL; *cp = '/', cp++) { + + *cp = '\0'; + + h = make_dataset_handle(hdl, target); + if (h) { + /* it already exists, nothing to do here */ + zfs_close(h); + continue; + } + + if (zfs_create(hdl, target, ZFS_TYPE_FILESYSTEM, + NULL) != 0) { + opname = dgettext(TEXT_DOMAIN, "create"); + goto ancestorerr; + } + + h = zfs_open(hdl, target, ZFS_TYPE_FILESYSTEM); + if (h == NULL) { + opname = dgettext(TEXT_DOMAIN, "open"); + goto ancestorerr; + } + + if (zfs_mount(h, NULL, 0) != 0) { + opname = dgettext(TEXT_DOMAIN, "mount"); + goto ancestorerr; + } + + if (zfs_share(h) != 0) { + opname = dgettext(TEXT_DOMAIN, "share"); + goto ancestorerr; + } + + zfs_close(h); + } + zfs_commit_all_shares(); + + return (0); + +ancestorerr: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "failed to %s ancestor '%s'"), opname, target); + return (-1); +} + +/* + * Creates non-existing ancestors of the given path. + */ +int +zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) +{ + int prefix; + char *path_copy; + char errbuf[1024]; + int rc = 0; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), path); + + /* + * Check that we are not passing the nesting limit + * before we start creating any ancestors. + */ + if (dataset_nestcheck(path) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "maximum name nesting depth exceeded")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) + return (-1); + + if ((path_copy = strdup(path)) != NULL) { + rc = create_parents(hdl, path_copy, prefix); + free(path_copy); + } + if (path_copy == NULL || rc != 0) + return (-1); + + return (0); +} + +/* + * Create a new filesystem or volume. + */ +int +zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, + nvlist_t *props) +{ + int ret; + uint64_t size = 0; + uint64_t blocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + uint64_t zoned; + enum lzc_dataset_type ost; + zpool_handle_t *zpool_handle; + uint8_t *wkeydata = NULL; + uint_t wkeylen = 0; + char errbuf[1024]; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), path); + + /* validate the path, taking care to note the extended error message */ + if (!zfs_validate_name(hdl, path, type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + if (dataset_nestcheck(path) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "maximum name nesting depth exceeded")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + + /* validate parents exist */ + if (check_parents(hdl, path, &zoned, B_FALSE, NULL) != 0) + return (-1); + + /* + * The failure modes when creating a dataset of a different type over + * one that already exists is a little strange. 
In particular, if you + * try to create a dataset on top of an existing dataset, the ioctl() + * will return ENOENT, not EEXIST. To prevent this from happening, we + * first try to see if the dataset exists. + */ + if (zfs_dataset_exists(hdl, path, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset already exists")); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + } + + if (type == ZFS_TYPE_VOLUME) + ost = LZC_DATSET_TYPE_ZVOL; + else + ost = LZC_DATSET_TYPE_ZFS; + + /* open zpool handle for prop validation */ + char pool_path[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(pool_path, path, sizeof (pool_path)); + + /* truncate pool_path at first slash */ + char *p = strchr(pool_path, '/'); + if (p != NULL) + *p = '\0'; + + if ((zpool_handle = zpool_open(hdl, pool_path)) == NULL) + return (-1); + + if (props && (props = zfs_valid_proplist(hdl, type, props, + zoned, NULL, zpool_handle, B_TRUE, errbuf)) == 0) { + zpool_close(zpool_handle); + return (-1); + } + zpool_close(zpool_handle); + + if (type == ZFS_TYPE_VOLUME) { + /* + * If we are creating a volume, the size and block size must + * satisfy a few restraints. First, the blocksize must be a + * valid block size between SPA_{MIN,MAX}BLOCKSIZE. Second, the + * volsize must be a multiple of the block size, and cannot be + * zero. + */ + if (props == NULL || nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &size) != 0) { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing volume size")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + if ((ret = nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &blocksize)) != 0) { + if (ret == ENOENT) { + blocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + } else { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "missing volume block size")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + } + + if (size == 0) { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volume size cannot be zero")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + if (size % blocksize != 0) { + nvlist_free(props); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volume size must be a multiple of volume block " + "size")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + } + + (void) parent_name(path, parent, sizeof (parent)); + if (zfs_crypto_create(hdl, parent, props, NULL, B_TRUE, + &wkeydata, &wkeylen) != 0) { + nvlist_free(props); + return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + } + + /* create the dataset */ + ret = lzc_create(path, ost, props, wkeydata, wkeylen); + nvlist_free(props); + if (wkeydata != NULL) + free(wkeydata); + + /* check for failure */ + if (ret != 0) { + switch (errno) { + case ENOENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no such parent '%s'"), parent); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to set this " + "property or value")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + + case EACCES: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "encryption root's key is not loaded " + "or provided")); + return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + + case ERANGE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property value(s) specified")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); +#ifdef _ILP32 + case EOVERFLOW: + /* + * This platform can't address a volume this big. 
+ */ + if (type == ZFS_TYPE_VOLUME) + return (zfs_error(hdl, EZFS_VOLTOOBIG, + errbuf)); +#endif + /* FALLTHROUGH */ + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + return (0); +} + +/* + * Destroys the given dataset. The caller must make sure that the filesystem + * isn't mounted, and that there are no active dependents. If the file system + * does not exist this function does nothing. + */ +int +zfs_destroy(zfs_handle_t *zhp, boolean_t defer) +{ + int error; + + if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT && defer) + return (EINVAL); + + if (zhp->zfs_type == ZFS_TYPE_BOOKMARK) { + nvlist_t *nv = fnvlist_alloc(); + fnvlist_add_boolean(nv, zhp->zfs_name); + error = lzc_destroy_bookmarks(nv, NULL); + fnvlist_free(nv); + if (error != 0) { + return (zfs_standard_error_fmt(zhp->zfs_hdl, error, + dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), + zhp->zfs_name)); + } + return (0); + } + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + nvlist_t *nv = fnvlist_alloc(); + fnvlist_add_boolean(nv, zhp->zfs_name); + error = lzc_destroy_snaps(nv, defer, NULL); + fnvlist_free(nv); + } else { + error = lzc_destroy(zhp->zfs_name); + } + + if (error != 0 && error != ENOENT) { + return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), + zhp->zfs_name)); + } + + remove_mountpoint(zhp); + + return (0); +} + +struct destroydata { + nvlist_t *nvl; + const char *snapname; +}; + +static int +zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) +{ + struct destroydata *dd = arg; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + + if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, + dd->snapname) >= sizeof (name)) + return (EINVAL); + + if (lzc_exists(name)) + verify(nvlist_add_boolean(dd->nvl, name) == 0); + + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, dd); + zfs_close(zhp); + return (rv); +} + +/* + * Destroys all snapshots with the given name in zhp & descendants. + */ +int +zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) +{ + int ret; + struct destroydata dd = { 0 }; + + dd.snapname = snapname; + verify(nvlist_alloc(&dd.nvl, NV_UNIQUE_NAME, 0) == 0); + (void) zfs_check_snap_cb(zfs_handle_dup(zhp), &dd); + + if (nvlist_empty(dd.nvl)) { + ret = zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, + dgettext(TEXT_DOMAIN, "cannot destroy '%s@%s'"), + zhp->zfs_name, snapname); + } else { + ret = zfs_destroy_snaps_nvl(zhp->zfs_hdl, dd.nvl, defer); + } + nvlist_free(dd.nvl); + return (ret); +} + +/* + * Destroys all the snapshots named in the nvlist. 
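+ * Each key in 'snaps' is a full snapshot name, e.g. added with + * fnvlist_add_boolean(nv, "tank/fs@snap"); per-snapshot failures come + * back in an errlist keyed the same way, handled below.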
+ */ +int +zfs_destroy_snaps_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, boolean_t defer) +{ + int ret; + nvlist_t *errlist = NULL; + nvpair_t *pair; + + ret = lzc_destroy_snaps(snaps, defer, &errlist); + + if (ret == 0) { + nvlist_free(errlist); + return (0); + } + + if (nvlist_empty(errlist)) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot destroy snapshots")); + + ret = zfs_standard_error(hdl, ret, errbuf); + } + for (pair = nvlist_next_nvpair(errlist, NULL); + pair != NULL; pair = nvlist_next_nvpair(errlist, pair)) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot destroy snapshot %s"), + nvpair_name(pair)); + + switch (fnvpair_value_int32(pair)) { + case EEXIST: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "snapshot is cloned")); + ret = zfs_error(hdl, EZFS_EXISTS, errbuf); + break; + default: + ret = zfs_standard_error(hdl, errno, errbuf); + break; + } + } + + nvlist_free(errlist); + return (ret); +} + +/* + * Clones the given dataset. The target must be of the same type as the source. + */ +int +zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) +{ + char parent[ZFS_MAX_DATASET_NAME_LEN]; + int ret; + char errbuf[1024]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + uint64_t zoned; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), target); + + /* validate the target/clone name */ + if (!zfs_validate_name(hdl, target, ZFS_TYPE_FILESYSTEM, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + /* validate parents exist */ + if (check_parents(hdl, target, &zoned, B_FALSE, NULL) != 0) + return (-1); + + (void) parent_name(target, parent, sizeof (parent)); + + /* do the clone */ + + if (props) { + zfs_type_t type; + + if (ZFS_IS_VOLUME(zhp)) { + type = ZFS_TYPE_VOLUME; + } else { + type = ZFS_TYPE_FILESYSTEM; + } + if ((props = zfs_valid_proplist(hdl, type, props, zoned, + zhp, zhp->zpool_hdl, B_TRUE, errbuf)) == NULL) + return (-1); + if (zfs_fix_auto_resv(zhp, props) == -1) { + nvlist_free(props); + return (-1); + } + } + + if (zfs_crypto_clone_check(hdl, zhp, parent, props) != 0) { + nvlist_free(props); + return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + } + + ret = lzc_clone(target, zhp->zfs_name, props); + nvlist_free(props); + + if (ret != 0) { + switch (errno) { + + case ENOENT: + /* + * The parent doesn't exist. We should have caught this + * above, but there may be a race condition that has + * since destroyed the parent. + * + * At this point, we don't know whether it's the source + * that doesn't exist anymore, or whether the target + * dataset doesn't exist. + */ + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "no such parent '%s'"), parent); + return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); + + case EXDEV: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "source and target pools differ")); + return (zfs_error(zhp->zfs_hdl, EZFS_CROSSTARGET, + errbuf)); + + default: + return (zfs_standard_error(zhp->zfs_hdl, errno, + errbuf)); + } + } + + return (ret); +} + +/* + * Promotes the given clone fs to be the clone parent.
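+ * + * After a successful promote, the clone/origin relationship is inverted: + * the snapshots up to the clone's origin migrate to the promoted dataset, + * and the former parent becomes a clone of it.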
+ */ +int +zfs_promote(zfs_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + int ret; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot promote '%s'"), zhp->zfs_name); + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be promoted")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + + if (zhp->zfs_dmustats.dds_origin[0] == '\0') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not a cloned filesystem")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + + if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + ret = lzc_promote(zhp->zfs_name, snapname, sizeof (snapname)); + + if (ret != 0) { + switch (ret) { + case EACCES: + /* + * Promoting encrypted dataset outside its + * encryption root. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot promote dataset outside its " + "encryption root")); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + + case EEXIST: + /* There is a conflicting snapshot name. */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "conflicting snapshot '%s' from parent '%s'"), + snapname, zhp->zfs_dmustats.dds_origin); + return (zfs_error(hdl, EZFS_EXISTS, errbuf)); + + default: + return (zfs_standard_error(hdl, ret, errbuf)); + } + } + return (ret); +} + +typedef struct snapdata { + nvlist_t *sd_nvl; + const char *sd_snapname; +} snapdata_t; + +static int +zfs_snapshot_cb(zfs_handle_t *zhp, void *arg) +{ + snapdata_t *sd = arg; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) == 0) { + if (snprintf(name, sizeof (name), "%s@%s", zfs_get_name(zhp), + sd->sd_snapname) >= sizeof (name)) + return (EINVAL); + + fnvlist_add_boolean(sd->sd_nvl, name); + + rv = zfs_iter_filesystems(zhp, zfs_snapshot_cb, sd); + } + zfs_close(zhp); + + return (rv); +} + +/* + * Creates snapshots. The keys in the snaps nvlist are the snapshots to be + * created. + */ +int +zfs_snapshot_nvl(libzfs_handle_t *hdl, nvlist_t *snaps, nvlist_t *props) +{ + int ret; + char errbuf[1024]; + nvpair_t *elem; + nvlist_t *errors; + zpool_handle_t *zpool_hdl; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot create snapshots ")); + + elem = NULL; + while ((elem = nvlist_next_nvpair(snaps, elem)) != NULL) { + const char *snapname = nvpair_name(elem); + + /* validate the target name */ + if (!zfs_validate_name(hdl, snapname, ZFS_TYPE_SNAPSHOT, + B_TRUE)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s'"), snapname); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + } + + /* + * get pool handle for prop validation. assumes all snaps are in the + * same pool, as does lzc_snapshot (below). 
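+ * E.g. for a snapshot name "tank/fs@today", strcspn(pool, "/@") stops at + * the first '/' (or at '@' for a pool-level snapshot), truncating 'pool' + * to "tank" before zpool_open().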
+ */ + elem = nvlist_next_nvpair(snaps, NULL); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + zpool_hdl = zpool_open(hdl, pool); + if (zpool_hdl == NULL) + return (-1); + + if (props != NULL && + (props = zfs_valid_proplist(hdl, ZFS_TYPE_SNAPSHOT, + props, B_FALSE, NULL, zpool_hdl, B_FALSE, errbuf)) == NULL) { + zpool_close(zpool_hdl); + return (-1); + } + zpool_close(zpool_hdl); + + ret = lzc_snapshot(snaps, props, &errors); + + if (ret != 0) { + boolean_t printed = B_FALSE; + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot create snapshot '%s'"), nvpair_name(elem)); + (void) zfs_standard_error(hdl, + fnvpair_value_int32(elem), errbuf); + printed = B_TRUE; + } + if (!printed) { + switch (ret) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple snapshots of same " + "fs not allowed")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); + + break; + default: + (void) zfs_standard_error(hdl, ret, errbuf); + } + } + } + + nvlist_free(props); + nvlist_free(errors); + return (ret); +} + +int +zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, + nvlist_t *props) +{ + int ret; + snapdata_t sd = { 0 }; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *cp; + zfs_handle_t *zhp; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot snapshot %s"), path); + + if (!zfs_validate_name(hdl, path, ZFS_TYPE_SNAPSHOT, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + (void) strlcpy(fsname, path, sizeof (fsname)); + cp = strchr(fsname, '@'); + *cp = '\0'; + sd.sd_snapname = cp + 1; + + if ((zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME)) == NULL) { + return (-1); + } + + verify(nvlist_alloc(&sd.sd_nvl, NV_UNIQUE_NAME, 0) == 0); + if (recursive) { + (void) zfs_snapshot_cb(zfs_handle_dup(zhp), &sd); + } else { + fnvlist_add_boolean(sd.sd_nvl, path); + } + + ret = zfs_snapshot_nvl(hdl, sd.sd_nvl, props); + nvlist_free(sd.sd_nvl); + zfs_close(zhp); + return (ret); +} + +/* + * Destroy any more recent snapshots: rollback_destroy() below is invoked + * for every snapshot and bookmark whose creation txg is newer than the + * rollback target's ('cb_create'). Each one's dependent clones are + * destroyed first via rollback_destroy_dependent(), which unmounts the + * clone (forcibly if 'cb_force' is set) before destroying it. + */ +typedef struct rollback_data { + const char *cb_target; /* the snapshot */ + uint64_t cb_create; /* creation time reference */ + boolean_t cb_error; + boolean_t cb_force; +} rollback_data_t; + +static int +rollback_destroy_dependent(zfs_handle_t *zhp, void *data) +{ + rollback_data_t *cbp = data; + prop_changelist_t *clp; + + /* We must destroy this clone; first unmount it */ + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + cbp->cb_force ?
MS_FORCE: 0); + if (clp == NULL || changelist_prefix(clp) != 0) { + cbp->cb_error = B_TRUE; + zfs_close(zhp); + return (0); + } + if (zfs_destroy(zhp, B_FALSE) != 0) + cbp->cb_error = B_TRUE; + else + changelist_remove(clp, zhp->zfs_name); + (void) changelist_postfix(clp); + changelist_free(clp); + + zfs_close(zhp); + return (0); +} + +static int +rollback_destroy(zfs_handle_t *zhp, void *data) +{ + rollback_data_t *cbp = data; + + if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > cbp->cb_create) { + cbp->cb_error |= zfs_iter_dependents(zhp, B_FALSE, + rollback_destroy_dependent, cbp); + + cbp->cb_error |= zfs_destroy(zhp, B_FALSE); + } + + zfs_close(zhp); + return (0); +} + +/* + * Given a dataset, rollback to a specific snapshot, discarding any + * data changes since then and making it the active dataset. + * + * Any snapshots and bookmarks more recent than the target are + * destroyed, along with their dependents (i.e. clones). + */ +int +zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) +{ + rollback_data_t cb = { 0 }; + int err; + boolean_t restore_resv = 0; + uint64_t old_volsize = 0, new_volsize; + zfs_prop_t resv_prop = { 0 }; + uint64_t min_txg = 0; + + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM || + zhp->zfs_type == ZFS_TYPE_VOLUME); + + /* + * Destroy all recent snapshots and their dependents. + */ + cb.cb_force = force; + cb.cb_target = snap->zfs_name; + cb.cb_create = zfs_prop_get_int(snap, ZFS_PROP_CREATETXG); + + if (cb.cb_create > 0) + min_txg = cb.cb_create; + + (void) zfs_iter_snapshots(zhp, B_FALSE, rollback_destroy, &cb, + min_txg, 0); + + (void) zfs_iter_bookmarks(zhp, rollback_destroy, &cb); + + if (cb.cb_error) + return (-1); + + /* + * Now that we have verified that the snapshot is the latest, + * rollback to the given snapshot. + */ + + if (zhp->zfs_type == ZFS_TYPE_VOLUME) { + if (zfs_which_resv_prop(zhp, &resv_prop) < 0) + return (-1); + old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + restore_resv = + (old_volsize == zfs_prop_get_int(zhp, resv_prop)); + } + + /* + * Pass both the filesystem and the wanted snapshot names, + * we would get an error back if the snapshot is destroyed or + * a new snapshot is created before this request is processed. + */ + err = lzc_rollback_to(zhp->zfs_name, snap->zfs_name); + if (err != 0) { + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot rollback '%s'"), + zhp->zfs_name); + switch (err) { + case EEXIST: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "there is a snapshot or bookmark more recent " + "than '%s'"), snap->zfs_name); + (void) zfs_error(zhp->zfs_hdl, EZFS_EXISTS, errbuf); + break; + case ESRCH: + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "'%s' is not found among snapshots of '%s'"), + snap->zfs_name, zhp->zfs_name); + (void) zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf); + break; + case EINVAL: + (void) zfs_error(zhp->zfs_hdl, EZFS_BADTYPE, errbuf); + break; + default: + (void) zfs_standard_error(zhp->zfs_hdl, err, errbuf); + } + return (err); + } + + /* + * For volumes, if the pre-rollback volsize matched the pre- + * rollback reservation and the volsize has changed then set + * the reservation property to the post-rollback volsize. + * Make a new handle since the rollback closed the dataset. 
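+ * + * E.g. a volume with volsize=refreservation=10G rolled back to a snapshot + * taken when volsize was 5G: since the reservation matched the old size, + * it is reset to 5G so the volume stays fully reserved after the + * rollback.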
+ */ + if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && + (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { + if (restore_resv) { + new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); + if (old_volsize != new_volsize) + err = zfs_prop_set_int(zhp, resv_prop, + new_volsize); + } + zfs_close(zhp); + } + return (err); +} + +/* + * Renames the given dataset. + */ +int +zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive, + boolean_t force_unmount) +{ + int ret = 0; + zfs_cmd_t zc = {"\0"}; + char *delim; + prop_changelist_t *cl = NULL; + char parent[ZFS_MAX_DATASET_NAME_LEN]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + + /* if we have the same exact name, just return success */ + if (strcmp(zhp->zfs_name, target) == 0) + return (0); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rename to '%s'"), target); + + /* make sure source name is valid */ + if (!zfs_validate_name(hdl, zhp->zfs_name, zhp->zfs_type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + /* + * Make sure the target name is valid + */ + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + if ((strchr(target, '@') == NULL) || + *target == '@') { + /* + * Snapshot target name is abbreviated, + * reconstruct full dataset name + */ + (void) strlcpy(parent, zhp->zfs_name, + sizeof (parent)); + delim = strchr(parent, '@'); + if (strchr(target, '@') == NULL) + *(++delim) = '\0'; + else + *delim = '\0'; + (void) strlcat(parent, target, sizeof (parent)); + target = parent; + } else { + /* + * Make sure we're renaming within the same dataset. + */ + delim = strchr(target, '@'); + if (strncmp(zhp->zfs_name, target, delim - target) + != 0 || zhp->zfs_name[delim - target] != '@') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots must be part of same " + "dataset")); + return (zfs_error(hdl, EZFS_CROSSTARGET, + errbuf)); + } + } + + if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } else { + if (recursive) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "recursive rename must be a snapshot")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + + if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + + /* validate parents */ + if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) + return (-1); + + /* make sure we're in the same pool */ + verify((delim = strchr(target, '/')) != NULL); + if (strncmp(zhp->zfs_name, target, delim - target) != 0 || + zhp->zfs_name[delim - target] != '/') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "datasets must be within same pool")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + } + + /* new name cannot be a child of the current dataset name */ + if (is_descendant(zhp->zfs_name, target)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "New dataset name cannot be a descendant of " + "current dataset name")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + } + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot rename '%s'"), zhp->zfs_name); + + if (getzoneid() == GLOBAL_ZONEID && + zfs_prop_get_int(zhp, ZFS_PROP_ZONED)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset is used in a non-global zone")); + return (zfs_error(hdl, EZFS_ZONED, errbuf)); + } + + if (recursive) { + zfs_handle_t *zhrp; + char *parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); + if (parentname == NULL) { + ret = -1; + goto error; + } + delim = strchr(parentname, '@'); + 
*delim = '\0'; + zhrp = zfs_open(zhp->zfs_hdl, parentname, ZFS_TYPE_DATASET); + free(parentname); + if (zhrp == NULL) { + ret = -1; + goto error; + } + zfs_close(zhrp); + } else if (zhp->zfs_type != ZFS_TYPE_SNAPSHOT) { + if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, + CL_GATHER_ITER_MOUNTED, + force_unmount ? MS_FORCE : 0)) == NULL) + return (-1); + + if (changelist_haszonedchild(cl)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "child dataset with inherited mountpoint is used " + "in a non-global zone")); + (void) zfs_error(hdl, EZFS_ZONED, errbuf); + ret = -1; + goto error; + } + + if ((ret = changelist_prefix(cl)) != 0) + goto error; + } + + if (ZFS_IS_VOLUME(zhp)) + zc.zc_objset_type = DMU_OST_ZVOL; + else + zc.zc_objset_type = DMU_OST_ZFS; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); + + zc.zc_cookie = recursive; + + if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { + /* + * if it was recursive, the one that actually failed will + * be in zc.zc_name + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rename '%s'"), zc.zc_name); + + if (recursive && errno == EEXIST) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a child dataset already has a snapshot " + "with the new name")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); + } else if (errno == EACCES) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot move encrypted child outside of " + "its encryption root")); + (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); + } else { + (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); + } + + /* + * On failure, we still want to remount any filesystems that + * were previously mounted, so we don't alter the system state. + */ + if (cl != NULL) + (void) changelist_postfix(cl); + } else { + if (cl != NULL) { + changelist_rename(cl, zfs_get_name(zhp), target); + ret = changelist_postfix(cl); + } + } + +error: + if (cl != NULL) { + changelist_free(cl); + } + return (ret); +} + +nvlist_t * +zfs_get_all_props(zfs_handle_t *zhp) +{ + return (zhp->zfs_props); +} + +nvlist_t * +zfs_get_recvd_props(zfs_handle_t *zhp) +{ + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (NULL); + return (zhp->zfs_recvd_props); +} + +nvlist_t * +zfs_get_user_props(zfs_handle_t *zhp) +{ + return (zhp->zfs_user_props); +} + +/* + * This function is used by 'zfs list' to determine the exact set of columns to + * display, and their maximum widths. This does two main things: + * + * - If this is a list of all properties, then expand the list to include + * all native properties, and set a flag so that for each dataset we look + * for new unique user properties and add them to the list. + * + * - For non fixed-width properties, keep track of the maximum width seen + * so that we can size the column appropriately. If the user has + * requested received property values, we also need to compute the width + * of the RECEIVED column. 
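+ * + * E.g. for 'zfs list -o name,com.example:tag' (a hypothetical user + * property), the user property gets a zprop_list_t entry with pl_prop == + * ZPROP_INVAL, and its column width grows to the longest value observed + * across the datasets listed.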
+ */ +int +zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received, + boolean_t literal) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zprop_list_t *entry; + zprop_list_t **last, **start; + nvlist_t *userprops, *propval; + nvpair_t *elem; + char *strval; + char buf[ZFS_MAXPROPLEN]; + + if (zprop_expand_list(hdl, plp, ZFS_TYPE_DATASET) != 0) + return (-1); + + userprops = zfs_get_user_props(zhp); + + entry = *plp; + if (entry->pl_all && nvlist_next_nvpair(userprops, NULL) != NULL) { + /* + * Go through and add any user properties as necessary. We + * start by incrementing our list pointer to the first + * non-native property. + */ + start = plp; + while (*start != NULL) { + if ((*start)->pl_prop == ZPROP_INVAL) + break; + start = &(*start)->pl_next; + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(userprops, elem)) != NULL) { + /* + * See if we've already found this property in our list. + */ + for (last = start; *last != NULL; + last = &(*last)->pl_next) { + if (strcmp((*last)->pl_user_prop, + nvpair_name(elem)) == 0) + break; + } + + if (*last == NULL) { + if ((entry = zfs_alloc(hdl, + sizeof (zprop_list_t))) == NULL || + ((entry->pl_user_prop = zfs_strdup(hdl, + nvpair_name(elem)))) == NULL) { + free(entry); + return (-1); + } + + entry->pl_prop = ZPROP_INVAL; + entry->pl_width = strlen(nvpair_name(elem)); + entry->pl_all = B_TRUE; + *last = entry; + } + } + } + + /* + * Now go through and check the width of any non-fixed columns + */ + for (entry = *plp; entry != NULL; entry = entry->pl_next) { + if (entry->pl_fixed && !literal) + continue; + + if (entry->pl_prop != ZPROP_INVAL) { + if (zfs_prop_get(zhp, entry->pl_prop, + buf, sizeof (buf), NULL, NULL, 0, literal) == 0) { + if (strlen(buf) > entry->pl_width) + entry->pl_width = strlen(buf); + } + if (received && zfs_prop_get_recvd(zhp, + zfs_prop_to_name(entry->pl_prop), + buf, sizeof (buf), literal) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); + } else { + if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, + &propval) == 0) { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + if (strlen(strval) > entry->pl_width) + entry->pl_width = strlen(strval); + } + if (received && zfs_prop_get_recvd(zhp, + entry->pl_user_prop, + buf, sizeof (buf), literal) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); + } + } + + return (0); +} + +void +zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) +{ + nvpair_t *curr; + nvpair_t *next; + + /* + * Keep a reference to the props-table against which we prune the + * properties. + */ + zhp->zfs_props_table = props; + + curr = nvlist_next_nvpair(zhp->zfs_props, NULL); + + while (curr) { + zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); + next = nvlist_next_nvpair(zhp->zfs_props, curr); + + /* + * User properties will result in ZPROP_INVAL, and since we + * only know how to prune standard ZFS properties, we always + * leave these in the list. This can also happen if we + * encounter an unknown DSL property (when running older + * software, for example). 
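+ * + * ('props' is a table indexed by zfs_prop_t: callers mark the properties + * they still need with B_TRUE, and every native property marked B_FALSE + * is removed from zhp->zfs_props below.)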
+ */ + if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) + (void) nvlist_remove(zhp->zfs_props, + nvpair_name(curr), nvpair_type(curr)); + curr = next; + } +} + +static int +zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, + zfs_smb_acl_op_t cmd, char *resource1, char *resource2) +{ + zfs_cmd_t zc = {"\0"}; + nvlist_t *nvlist = NULL; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + zc.zc_cookie = (uint64_t)cmd; + + if (cmd == ZFS_SMB_ACL_RENAME) { + if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (0); + } + } + + switch (cmd) { + case ZFS_SMB_ACL_ADD: + case ZFS_SMB_ACL_REMOVE: + (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); + break; + case ZFS_SMB_ACL_RENAME: + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, + resource1) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, + resource2) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { + nvlist_free(nvlist); + return (-1); + } + break; + case ZFS_SMB_ACL_PURGE: + break; + default: + return (-1); + } + error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); + nvlist_free(nvlist); + return (error); +} + +int +zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, + resource, NULL)); +} + +int +zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, + resource, NULL)); +} + +int +zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, + NULL, NULL)); +} + +int +zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, + char *oldname, char *newname) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, + oldname, newname)); +} + +int +zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg) +{ + zfs_cmd_t zc = {"\0"}; + zfs_useracct_t buf[100]; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int ret; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + zc.zc_objset_type = type; + zc.zc_nvlist_dst = (uintptr_t)buf; + + for (;;) { + zfs_useracct_t *zua = buf; + + zc.zc_nvlist_dst_size = sizeof (buf); + if (zfs_ioctl(hdl, ZFS_IOC_USERSPACE_MANY, &zc) != 0) { + char errbuf[1024]; + + if ((errno == ENOTSUP && + (type == ZFS_PROP_USEROBJUSED || + type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_USEROBJQUOTA || + type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTQUOTA))) + break; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot get used/quota for %s"), zc.zc_name); + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + if (zc.zc_nvlist_dst_size == 0) + break; + + while (zc.zc_nvlist_dst_size > 0) { + if ((ret = func(arg, zua->zu_domain, zua->zu_rid, + zua->zu_space)) != 0) + return (ret); + zua++; + zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); + } + } + + return (0); +} + +struct holdarg { + nvlist_t *nvl; + const char *snapname; + const char *tag; + boolean_t recursive; + int error; +}; + +static int +zfs_hold_one(zfs_handle_t *zhp, void *arg) +{ + struct holdarg *ha = arg; 
+ char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + + if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, + ha->snapname) >= sizeof (name)) + return (EINVAL); + + if (lzc_exists(name)) + fnvlist_add_string(ha->nvl, name, ha->tag); + + if (ha->recursive) + rv = zfs_iter_filesystems(zhp, zfs_hold_one, ha); + zfs_close(zhp); + return (rv); +} + +int +zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive, int cleanup_fd) +{ + int ret; + struct holdarg ha; + + ha.nvl = fnvlist_alloc(); + ha.snapname = snapname; + ha.tag = tag; + ha.recursive = recursive; + (void) zfs_hold_one(zfs_handle_dup(zhp), &ha); + + if (nvlist_empty(ha.nvl)) { + char errbuf[1024]; + + fnvlist_free(ha.nvl); + ret = ENOENT; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot hold snapshot '%s@%s'"), + zhp->zfs_name, snapname); + (void) zfs_standard_error(zhp->zfs_hdl, ret, errbuf); + return (ret); + } + + ret = zfs_hold_nvl(zhp, cleanup_fd, ha.nvl); + fnvlist_free(ha.nvl); + + return (ret); +} + +int +zfs_hold_nvl(zfs_handle_t *zhp, int cleanup_fd, nvlist_t *holds) +{ + int ret; + nvlist_t *errors; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + nvpair_t *elem; + + errors = NULL; + ret = lzc_hold(holds, cleanup_fd, &errors); + + if (ret == 0) { + /* There may be errors even in the success case. */ + fnvlist_free(errors); + return (0); + } + + if (nvlist_empty(errors)) { + /* no hold-specific errors */ + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot hold")); + switch (ret) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + default: + (void) zfs_standard_error(hdl, ret, errbuf); + } + } + + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot hold snapshot '%s'"), nvpair_name(elem)); + switch (fnvpair_value_int32(elem)) { + case E2BIG: + /* + * Temporary tags wind up having the ds object id + * prepended. So even if we passed the length check + * above, it's still possible for the tag to wind + * up being slightly too long. 
+ */ + (void) zfs_error(hdl, EZFS_TAGTOOLONG, errbuf); + break; + case EINVAL: + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case EEXIST: + (void) zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf); + break; + default: + (void) zfs_standard_error(hdl, + fnvpair_value_int32(elem), errbuf); + } + } + + fnvlist_free(errors); + return (ret); +} + +static int +zfs_release_one(zfs_handle_t *zhp, void *arg) +{ + struct holdarg *ha = arg; + char name[ZFS_MAX_DATASET_NAME_LEN]; + int rv = 0; + nvlist_t *existing_holds; + + if (snprintf(name, sizeof (name), "%s@%s", zhp->zfs_name, + ha->snapname) >= sizeof (name)) { + ha->error = EINVAL; + rv = EINVAL; + } + + if (lzc_get_holds(name, &existing_holds) != 0) { + ha->error = ENOENT; + } else if (!nvlist_exists(existing_holds, ha->tag)) { + ha->error = ESRCH; + } else { + nvlist_t *torelease = fnvlist_alloc(); + fnvlist_add_boolean(torelease, ha->tag); + fnvlist_add_nvlist(ha->nvl, name, torelease); + fnvlist_free(torelease); + } + + if (ha->recursive) + rv = zfs_iter_filesystems(zhp, zfs_release_one, ha); + zfs_close(zhp); + return (rv); +} + +int +zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + int ret; + struct holdarg ha; + nvlist_t *errors = NULL; + nvpair_t *elem; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char errbuf[1024]; + + ha.nvl = fnvlist_alloc(); + ha.snapname = snapname; + ha.tag = tag; + ha.recursive = recursive; + ha.error = 0; + (void) zfs_release_one(zfs_handle_dup(zhp), &ha); + + if (nvlist_empty(ha.nvl)) { + fnvlist_free(ha.nvl); + ret = ha.error; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot release hold from snapshot '%s@%s'"), + zhp->zfs_name, snapname); + if (ret == ESRCH) { + (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); + } else { + (void) zfs_standard_error(hdl, ret, errbuf); + } + return (ret); + } + + ret = lzc_release(ha.nvl, &errors); + fnvlist_free(ha.nvl); + + if (ret == 0) { + /* There may be errors even in the success case. 
*/ + fnvlist_free(errors); + return (0); + } + + if (nvlist_empty(errors)) { + /* no hold-specific errors */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot release")); + switch (errno) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + default: + (void) zfs_standard_error_fmt(hdl, errno, errbuf); + } + } + + for (elem = nvlist_next_nvpair(errors, NULL); + elem != NULL; + elem = nvlist_next_nvpair(errors, elem)) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot release hold from snapshot '%s'"), + nvpair_name(elem)); + switch (fnvpair_value_int32(elem)) { + case ESRCH: + (void) zfs_error(hdl, EZFS_REFTAG_RELE, errbuf); + break; + case EINVAL: + (void) zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + default: + (void) zfs_standard_error_fmt(hdl, + fnvpair_value_int32(elem), errbuf); + } + } + + fnvlist_free(errors); + return (ret); +} + +int +zfs_get_fsacl(zfs_handle_t *zhp, nvlist_t **nvl) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zfs_hdl; + int nvsz = 2048; + void *nvbuf; + int err = 0; + char errbuf[1024]; + + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + +tryagain: + + nvbuf = malloc(nvsz); + if (nvbuf == NULL) { + err = (zfs_error(hdl, EZFS_NOMEM, strerror(errno))); + goto out; + } + + zc.zc_nvlist_dst_size = nvsz; + zc.zc_nvlist_dst = (uintptr_t)nvbuf; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_GET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOMEM: + free(nvbuf); + nvsz = zc.zc_nvlist_dst_size; + goto tryagain; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } else { + /* success */ + int rc = nvlist_unpack(nvbuf, zc.zc_nvlist_dst_size, nvl, 0); + if (rc) { + (void) snprintf(errbuf, sizeof (errbuf), dgettext( + TEXT_DOMAIN, "cannot get permissions on '%s'"), + zc.zc_name); + err = zfs_standard_error_fmt(hdl, rc, errbuf); + } + } + + free(nvbuf); +out: + return (err); +} + +int +zfs_set_fsacl(zfs_handle_t *zhp, boolean_t un, nvlist_t *nvl) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char *nvbuf; + char errbuf[1024]; + size_t nvsz; + int err; + + assert(zhp->zfs_type == ZFS_TYPE_VOLUME || + zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + err = nvlist_size(nvl, &nvsz, NV_ENCODE_NATIVE); + assert(err == 0); + + nvbuf = malloc(nvsz); + + err = nvlist_pack(nvl, &nvbuf, &nvsz, NV_ENCODE_NATIVE, 0); + assert(err == 0); + + zc.zc_nvlist_src_size = nvsz; + zc.zc_nvlist_src = (uintptr_t)nvbuf; + zc.zc_perm_action = un; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_SET_FSACL, &zc) != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set permissions on '%s'"), + zc.zc_name); + switch (errno) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; 
+ case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } + + free(nvbuf); + + return (err); +} + +int +zfs_get_holds(zfs_handle_t *zhp, nvlist_t **nvl) +{ + int err; + char errbuf[1024]; + + err = lzc_get_holds(zhp->zfs_name, nvl); + + if (err != 0) { + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot get holds for '%s'"), + zhp->zfs_name); + switch (err) { + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + err = zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EINVAL: + err = zfs_error(hdl, EZFS_BADTYPE, errbuf); + break; + case ENOENT: + err = zfs_error(hdl, EZFS_NOENT, errbuf); + break; + default: + err = zfs_standard_error_fmt(hdl, errno, errbuf); + break; + } + } + + return (err); +} + +/* + * The theory of raidz space accounting + * + * The "referenced" property of RAIDZ vdevs is scaled such that a 128KB block + * will "reference" 128KB, even though it allocates more than that, to store the + * parity information (and perhaps skip sectors). This concept of the + * "referenced" (and other DMU space accounting) being lower than the allocated + * space by a constant factor is called "raidz deflation." + * + * As mentioned above, the constant factor for raidz deflation assumes a 128KB + * block size. However, zvols typically have a much smaller block size (default + * 8KB). These smaller blocks may require proportionally much more parity + * information (and perhaps skip sectors). In this case, the change to the + * "referenced" property may be much more than the logical block size. + * + * Suppose a raidz vdev has 5 disks with ashift=12. A 128k block may be written + * as follows. + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D8 | D16 | D24 | + * | P1 | D1 | D9 | D17 | D25 | + * | P2 | D2 | D10 | D18 | D26 | + * | P3 | D3 | D11 | D19 | D27 | + * | P4 | D4 | D12 | D20 | D28 | + * | P5 | D5 | D13 | D21 | D29 | + * | P6 | D6 | D14 | D22 | D30 | + * | P7 | D7 | D15 | D23 | D31 | + * +-------+-------+-------+-------+-------+ + * + * Above, notice that 160k was allocated: 8 x 4k parity sectors + 32 x 4k data + * sectors. The dataset's referenced will increase by 128k and the pool's + * allocated and free properties will be adjusted by 160k. + * + * A 4k block written to the same raidz vdev will require two 4k sectors. The + * blank cells represent unallocated space. + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | | | | + * +-------+-------+-------+-------+-------+ + * + * Above, notice that the 4k block required one sector for parity and another + * for data. vdev_raidz_asize() will return 8k and as such the pool's allocated + * and free properties will be adjusted by 8k. The dataset will not be charged + * 8k. Rather, it will be charged a value that is scaled according to the + * overhead of the 128k block on the same vdev. This 8k allocation will be + * charged 8k * 128k / 160k. 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as + * calculated in the 128k block example above. + * + * Every raidz allocation is sized to be a multiple of nparity+1 sectors. 
That + * is, every raidz1 allocation will be a multiple of 2 sectors, raidz2 + * allocations are a multiple of 3 sectors, and raidz3 allocations are a + * multiple of 4 sectors. When a block does not fill the required number of + * sectors, skip blocks (sectors) are used. + * + * An 8k block being written to a raidz vdev may be written as follows: + * + * +-------+-------+-------+-------+-------+ + * | disk1 | disk2 | disk3 | disk4 | disk5 | + * +-------+-------+-------+-------+-------+ + * | P0 | D0 | D1 | S0 | | + * +-------+-------+-------+-------+-------+ + * + * In order to maintain the nparity+1 allocation size, a skip block (S0) was + * added. For this 8k block, the pool's allocated and free properties are + * adjusted by 16k and the dataset's referenced is increased by 16k * 128k / + * 160k. Again, 128k is from SPA_OLD_MAXBLOCKSIZE and 160k is as calculated in + * the 128k block example above. + * + * Compression may lead to a variety of block sizes being written for the same + * volume or file. There is no clear way to reserve just the amount of space + * that will be required, so the worst case (no compression) is assumed. + * Note that metadata blocks will typically be compressed, so the reservation + * size returned by zvol_volsize_to_reservation() will generally be slightly + * larger than the maximum that the volume can reference. + */ + +/* + * Derived from the function of the same name in module/zfs/vdev_raidz.c. + * Returns the amount of space (in bytes) that will be allocated for the + * specified block size. Note that the "referenced" space accounted will be + * less than this, but not necessarily equal to "blksize", due to RAIDZ + * deflation. + */ +static uint64_t +vdev_raidz_asize(uint64_t ndisks, uint64_t nparity, uint64_t ashift, + uint64_t blksize) +{ + uint64_t asize, ndata; + + ASSERT3U(ndisks, >, nparity); + ndata = ndisks - nparity; + asize = ((blksize - 1) >> ashift) + 1; + asize += nparity * ((asize + ndata - 1) / ndata); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +/* + * Determine how much space will be allocated if it lands on the most space- + * inefficient top-level vdev. Returns the size in bytes required to store one + * copy of the volume data. See theory comment above. + */ +static uint64_t +volsize_from_vdevs(zpool_handle_t *zhp, uint64_t nblocks, uint64_t blksize) +{ + nvlist_t *config, *tree, **vdevs; + uint_t nvdevs, v; + uint64_t ret = 0; + + config = zpool_get_config(zhp, NULL); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) != 0 || + nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, + &vdevs, &nvdevs) != 0) { + return (nblocks * blksize); + } + + for (v = 0; v < nvdevs; v++) { + char *type; + uint64_t nparity, ashift, asize, tsize; + nvlist_t **disks; + uint_t ndisks; + uint64_t volsize; + + if (nvlist_lookup_string(vdevs[v], ZPOOL_CONFIG_TYPE, + &type) != 0 || strcmp(type, VDEV_TYPE_RAIDZ) != 0 || + nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_NPARITY, + &nparity) != 0 || + nvlist_lookup_uint64(vdevs[v], ZPOOL_CONFIG_ASHIFT, + &ashift) != 0 || + nvlist_lookup_nvlist_array(vdevs[v], ZPOOL_CONFIG_CHILDREN, + &disks, &ndisks) != 0) { + continue; + } + + /* allocation size for the "typical" 128k block */ + tsize = vdev_raidz_asize(ndisks, nparity, ashift, + SPA_OLD_MAXBLOCKSIZE); + /* allocation size for the blksize block */ + asize = vdev_raidz_asize(ndisks, nparity, ashift, blksize); + + /* + * Scale this size down as a ratio of 128k / tsize. See theory + * statement above.
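+ *
+ * For illustration, with the 5-wide raidz1, ashift=12 layout from the
+ * theory comment above: an 8k block gives asize = 16k (P0, D0, D1 plus
+ * the skip sector S0) while tsize = 160k for the 128k block, so each
+ * 8k block is charged 16k * 128k / 160k = 12.8k toward the volume size
+ * computed here.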
+ */ + volsize = nblocks * asize * SPA_OLD_MAXBLOCKSIZE / tsize; + if (volsize > ret) { + ret = volsize; + } + } + + if (ret == 0) { + ret = nblocks * blksize; + } + + return (ret); +} + +/* + * Convert the zvol's volume size to an appropriate reservation. See theory + * comment above. + * + * Note: If this routine is updated, it is necessary to update the ZFS test + * suite's shell version in reservation.shlib. + */ +uint64_t +zvol_volsize_to_reservation(zpool_handle_t *zph, uint64_t volsize, + nvlist_t *props) +{ + uint64_t numdb; + uint64_t nblocks, volblocksize; + int ncopies; + char *strval; + + if (nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) + ncopies = atoi(strval); + else + ncopies = 1; + if (nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize) != 0) + volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + + nblocks = volsize / volblocksize; + /* + * Metadata defaults to using 128k blocks, not volblocksize blocks. For + * this reason, only the data blocks are scaled based on vdev config. + */ + volsize = volsize_from_vdevs(zph, nblocks, volblocksize); + + /* start with metadnode L0-L6 */ + numdb = 7; + /* calculate number of indirects */ + while (nblocks > 1) { + nblocks += DNODES_PER_LEVEL - 1; + nblocks /= DNODES_PER_LEVEL; + numdb += nblocks; + } + numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); + volsize *= ncopies; + /* + * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't + * compressed, but in practice they compress down to about + * 1100 bytes + */ + numdb *= 1ULL << DN_MAX_INDBLKSHIFT; + volsize += numdb; + return (volsize); +} + +/* + * Wait for the given activity and return the status of the wait (whether or not + * any waiting was done) in the 'waited' parameter. Non-existent fses are + * reported via the 'missing' parameter, rather than by printing an error + * message. This is convenient when this function is called in a loop over a + * long period of time (as it is, for example, by zfs's wait cmd). In that + * scenario, a fs being exported or destroyed should be considered a normal + * event, so we don't want to print an error when we find that the fs doesn't + * exist. + */ +int +zfs_wait_status(zfs_handle_t *zhp, zfs_wait_activity_t activity, + boolean_t *missing, boolean_t *waited) +{ + int error = lzc_wait_fs(zhp->zfs_name, activity, waited); + *missing = (error == ENOENT); + if (*missing) + return (0); + + if (error != 0) { + (void) zfs_standard_error_fmt(zhp->zfs_hdl, error, + dgettext(TEXT_DOMAIN, "error waiting in fs '%s'"), + zhp->zfs_name); + } + + return (error); +} diff --git a/lib/libzfs/libzfs_diff.c b/lib/libzfs/libzfs_diff.c new file mode 100644 index 000000000000..7941a5883074 --- /dev/null +++ b/lib/libzfs/libzfs_diff.c @@ -0,0 +1,797 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015, 2018 by Delphix. All rights reserved. + * Copyright 2016 Joyent, Inc. + * Copyright 2016 Igor Kozhukhov + */ + +/* + * zfs diff support + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "libzfs_impl.h" + +#define ZDIFF_SNAPDIR "/.zfs/snapshot/" +#define ZDIFF_PREFIX "zfs-diff-%d" + +#define ZDIFF_ADDED '+' +#define ZDIFF_MODIFIED 'M' +#define ZDIFF_REMOVED '-' +#define ZDIFF_RENAMED 'R' + + +/* + * Given a {dsname, object id}, get the object path + */ +static int +get_stats_for_obj(differ_info_t *di, const char *dsname, uint64_t obj, + char *pn, int maxlen, zfs_stat_t *sb) +{ + zfs_cmd_t zc = {"\0"}; + int error; + + (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); + zc.zc_obj = obj; + + errno = 0; + error = zfs_ioctl(di->zhp->zfs_hdl, ZFS_IOC_OBJ_TO_STATS, &zc); + di->zerr = errno; + + /* we can get stats even if we failed to get a path */ + (void) memcpy(sb, &zc.zc_stat, sizeof (zfs_stat_t)); + if (error == 0) { + ASSERT(di->zerr == 0); + (void) strlcpy(pn, zc.zc_value, maxlen); + return (0); + } + + if (di->zerr == ESTALE) { + (void) snprintf(pn, maxlen, "(on_delete_queue)"); + return (0); + } else if (di->zerr == EPERM) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "The sys_config privilege or diff delegated permission " + "is needed\nto discover path names")); + return (-1); + } else if (di->zerr == EACCES) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Key must be loaded to discover path names")); + return (-1); + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Unable to determine path or stats for " + "object %lld in %s"), (longlong_t)obj, dsname); + return (-1); + } +} + +/* + * stream_bytes + * + * Prints a file name out a character at a time. If the character is + * not in the range of what we consider "printable" ASCII, display it + * as an escaped 4-digit octal value. ASCII values less than a space + * are all control characters and we declare the upper end as the + * DELete character. This also is the last 7-bit ASCII character. + * We choose to treat all 8-bit ASCII as not printable for this + * application. 
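+ *
+ * For example (with a hypothetical name): a file called "new line" is
+ * printed as new\0040line, since the embedded space (octal 040) is not
+ * treated as printable here; a backslash is printed as \0134 and a
+ * newline as \0012.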
+ */ +static void +stream_bytes(FILE *fp, const char *string) +{ + char c; + + while ((c = *string++) != '\0') { + if (c > ' ' && c != '\\' && c < '\177') { + (void) fprintf(fp, "%c", c); + } else { + (void) fprintf(fp, "\\%04o", (uint8_t)c); + } + } +} + +static void +print_what(FILE *fp, mode_t what) +{ + char symbol; + + switch (what & S_IFMT) { + case S_IFBLK: + symbol = 'B'; + break; + case S_IFCHR: + symbol = 'C'; + break; + case S_IFDIR: + symbol = '/'; + break; +#ifdef S_IFDOOR + case S_IFDOOR: + symbol = '>'; + break; +#endif + case S_IFIFO: + symbol = '|'; + break; + case S_IFLNK: + symbol = '@'; + break; +#ifdef S_IFPORT + case S_IFPORT: + symbol = 'P'; + break; +#endif + case S_IFSOCK: + symbol = '='; + break; + case S_IFREG: + symbol = 'F'; + break; + default: + symbol = '?'; + break; + } + (void) fprintf(fp, "%c", symbol); +} + +static void +print_cmn(FILE *fp, differ_info_t *di, const char *file) +{ + stream_bytes(fp, di->dsmnt); + stream_bytes(fp, file); +} + +static void +print_rename(FILE *fp, differ_info_t *di, const char *old, const char *new, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", ZDIFF_RENAMED); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, old); + if (di->scripted) + (void) fprintf(fp, "\t"); + else + (void) fprintf(fp, " -> "); + print_cmn(fp, di, new); + (void) fprintf(fp, "\n"); +} + +static void +print_link_change(FILE *fp, differ_info_t *di, int delta, const char *file, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", ZDIFF_MODIFIED); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, file); + (void) fprintf(fp, "\t(%+d)", delta); + (void) fprintf(fp, "\n"); +} + +static void +print_file(FILE *fp, differ_info_t *di, char type, const char *file, + zfs_stat_t *isb) +{ + if (di->timestamped) + (void) fprintf(fp, "%10lld.%09lld\t", + (longlong_t)isb->zs_ctime[0], + (longlong_t)isb->zs_ctime[1]); + (void) fprintf(fp, "%c\t", type); + if (di->classify) { + print_what(fp, isb->zs_mode); + (void) fprintf(fp, "\t"); + } + print_cmn(fp, di, file); + (void) fprintf(fp, "\n"); +} + +static int +write_inuse_diffs_one(FILE *fp, differ_info_t *di, uint64_t dobj) +{ + struct zfs_stat fsb, tsb; + mode_t fmode, tmode; + char fobjname[MAXPATHLEN], tobjname[MAXPATHLEN]; + int fobjerr, tobjerr; + int change; + + if (dobj == di->shares) + return (0); + + /* + * Check the from and to snapshots for info on the object. If + * we get ENOENT, then the object just didn't exist in that + * snapshot. If we get ENOTSUP, then we tried to get + * info on a non-ZPL object, which we don't care about anyway. 
+ */ + fobjerr = get_stats_for_obj(di, di->fromsnap, dobj, fobjname, + MAXPATHLEN, &fsb); + if (fobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) + return (-1); + + tobjerr = get_stats_for_obj(di, di->tosnap, dobj, tobjname, + MAXPATHLEN, &tsb); + if (tobjerr && di->zerr != ENOENT && di->zerr != ENOTSUP) + return (-1); + + /* + * Unallocated object sharing the same meta dnode block + */ + if (fobjerr && tobjerr) { + ASSERT(di->zerr == ENOENT || di->zerr == ENOTSUP); + di->zerr = 0; + return (0); + } + + di->zerr = 0; /* negate get_stats_for_obj() from side that failed */ + fmode = fsb.zs_mode & S_IFMT; + tmode = tsb.zs_mode & S_IFMT; + if (fmode == S_IFDIR || tmode == S_IFDIR || fsb.zs_links == 0 || + tsb.zs_links == 0) + change = 0; + else + change = tsb.zs_links - fsb.zs_links; + + if (fobjerr) { + if (change) { + print_link_change(fp, di, change, tobjname, &tsb); + return (0); + } + print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); + return (0); + } else if (tobjerr) { + if (change) { + print_link_change(fp, di, change, fobjname, &fsb); + return (0); + } + print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); + return (0); + } + + if (fmode != tmode && fsb.zs_gen == tsb.zs_gen) + tsb.zs_gen++; /* Force a generational difference */ + + /* Simple modification or no change */ + if (fsb.zs_gen == tsb.zs_gen) { + /* No apparent changes. Could we assert !this? */ + if (fsb.zs_ctime[0] == tsb.zs_ctime[0] && + fsb.zs_ctime[1] == tsb.zs_ctime[1]) + return (0); + if (change) { + print_link_change(fp, di, change, + change > 0 ? fobjname : tobjname, &tsb); + } else if (strcmp(fobjname, tobjname) == 0) { + print_file(fp, di, ZDIFF_MODIFIED, fobjname, &tsb); + } else { + print_rename(fp, di, fobjname, tobjname, &tsb); + } + return (0); + } else { + /* file re-created or object re-used */ + print_file(fp, di, ZDIFF_REMOVED, fobjname, &fsb); + print_file(fp, di, ZDIFF_ADDED, tobjname, &tsb); + return (0); + } +} + +static int +write_inuse_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) +{ + uint64_t o; + int err; + + for (o = dr->ddr_first; o <= dr->ddr_last; o++) { + if ((err = write_inuse_diffs_one(fp, di, o)) != 0) + return (err); + } + return (0); +} + +static int +describe_free(FILE *fp, differ_info_t *di, uint64_t object, char *namebuf, + int maxlen) +{ + struct zfs_stat sb; + + if (get_stats_for_obj(di, di->fromsnap, object, namebuf, + maxlen, &sb) != 0) { + return (-1); + } + /* Don't print if in the delete queue on from side */ + if (di->zerr == ESTALE) { + di->zerr = 0; + return (0); + } + + print_file(fp, di, ZDIFF_REMOVED, namebuf, &sb); + return (0); +} + +static int +write_free_diffs(FILE *fp, differ_info_t *di, dmu_diff_record_t *dr) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *lhdl = di->zhp->zfs_hdl; + char fobjname[MAXPATHLEN]; + + (void) strlcpy(zc.zc_name, di->fromsnap, sizeof (zc.zc_name)); + zc.zc_obj = dr->ddr_first - 1; + + ASSERT(di->zerr == 0); + + while (zc.zc_obj < dr->ddr_last) { + int err; + + err = zfs_ioctl(lhdl, ZFS_IOC_NEXT_OBJ, &zc); + if (err == 0) { + if (zc.zc_obj == di->shares) { + zc.zc_obj++; + continue; + } + if (zc.zc_obj > dr->ddr_last) { + break; + } + err = describe_free(fp, di, zc.zc_obj, fobjname, + MAXPATHLEN); + if (err) + break; + } else if (errno == ESRCH) { + break; + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "next allocated object (> %lld) find failure"), + (longlong_t)zc.zc_obj); + di->zerr = errno; + break; + } + } + if (di->zerr) + return (-1); + return (0); +} + +static void * +differ(void 
*arg) +{ + differ_info_t *di = arg; + dmu_diff_record_t dr; + FILE *ofp; + int err = 0; + + if ((ofp = fdopen(di->outputfd, "w")) == NULL) { + di->zerr = errno; + strlcpy(di->errbuf, strerror(errno), sizeof (di->errbuf)); + (void) close(di->datafd); + return ((void *)-1); + } + + for (;;) { + char *cp = (char *)&dr; + int len = sizeof (dr); + int rv; + + do { + rv = read(di->datafd, cp, len); + cp += rv; + len -= rv; + } while (len > 0 && rv > 0); + + if (rv < 0 || (rv == 0 && len != sizeof (dr))) { + di->zerr = EPIPE; + break; + } else if (rv == 0) { + /* end of file at a natural breaking point */ + break; + } + + switch (dr.ddr_type) { + case DDR_FREE: + err = write_free_diffs(ofp, di, &dr); + break; + case DDR_INUSE: + err = write_inuse_diffs(ofp, di, &dr); + break; + default: + di->zerr = EPIPE; + break; + } + + if (err || di->zerr) + break; + } + + (void) fclose(ofp); + (void) close(di->datafd); + if (err) + return ((void *)-1); + if (di->zerr) { + ASSERT(di->zerr == EPIPE); + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Internal error: bad data from diff IOCTL")); + return ((void *)-1); + } + return ((void *)0); +} + +static int +make_temp_snapshot(differ_info_t *di) +{ + libzfs_handle_t *hdl = di->zhp->zfs_hdl; + zfs_cmd_t zc = {"\0"}; + + (void) snprintf(zc.zc_value, sizeof (zc.zc_value), + ZDIFF_PREFIX, getpid()); + (void) strlcpy(zc.zc_name, di->ds, sizeof (zc.zc_name)); + zc.zc_cleanup_fd = di->cleanupfd; + + if (zfs_ioctl(hdl, ZFS_IOC_TMP_SNAPSHOT, &zc) != 0) { + int err = errno; + if (err == EPERM) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "The diff delegated " + "permission is needed in order\nto create a " + "just-in-time snapshot for diffing\n")); + return (zfs_error(hdl, EZFS_DIFF, di->errbuf)); + } else { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot create just-in-time " + "snapshot of '%s'"), zc.zc_name); + return (zfs_standard_error(hdl, err, di->errbuf)); + } + } + + di->tmpsnap = zfs_strdup(hdl, zc.zc_value); + di->tosnap = zfs_asprintf(hdl, "%s@%s", di->ds, di->tmpsnap); + return (0); +} + +static void +teardown_differ_info(differ_info_t *di) +{ + free(di->ds); + free(di->dsmnt); + free(di->fromsnap); + free(di->frommnt); + free(di->tosnap); + free(di->tmpsnap); + free(di->tomnt); + (void) close(di->cleanupfd); +} + +static int +get_snapshot_names(differ_info_t *di, const char *fromsnap, + const char *tosnap) +{ + libzfs_handle_t *hdl = di->zhp->zfs_hdl; + char *atptrf = NULL; + char *atptrt = NULL; + int fdslen, fsnlen; + int tdslen, tsnlen; + + /* + * Can accept + * fdslen fsnlen tdslen tsnlen + * dataset@snap1 + * 0. dataset@snap1 dataset@snap2 >0 >1 >0 >1 + * 1. dataset@snap1 @snap2 >0 >1 ==0 >1 + * 2. dataset@snap1 dataset >0 >1 >0 ==0 + * 3. @snap1 dataset@snap2 ==0 >1 >0 >1 + * 4. 
@snap1 dataset ==0 >1 >0 ==0 + */ + if (tosnap == NULL) { + /* only a from snapshot given, must be valid */ + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Badly formed snapshot name %s"), fromsnap); + + if (!zfs_validate_name(hdl, fromsnap, ZFS_TYPE_SNAPSHOT, + B_FALSE)) { + return (zfs_error(hdl, EZFS_INVALIDNAME, + di->errbuf)); + } + + atptrf = strchr(fromsnap, '@'); + ASSERT(atptrf != NULL); + fdslen = atptrf - fromsnap; + + di->fromsnap = zfs_strdup(hdl, fromsnap); + di->ds = zfs_strdup(hdl, fromsnap); + di->ds[fdslen] = '\0'; + + /* the to snap will be a just-in-time snap of the head */ + return (make_temp_snapshot(di)); + } + + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Unable to determine which snapshots to compare")); + + atptrf = strchr(fromsnap, '@'); + atptrt = strchr(tosnap, '@'); + fdslen = atptrf ? atptrf - fromsnap : strlen(fromsnap); + tdslen = atptrt ? atptrt - tosnap : strlen(tosnap); + fsnlen = strlen(fromsnap) - fdslen; /* includes @ sign */ + tsnlen = strlen(tosnap) - tdslen; /* includes @ sign */ + + if (fsnlen <= 1 || tsnlen == 1 || (fdslen == 0 && tdslen == 0)) { + return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); + } else if ((fdslen > 0 && tdslen > 0) && + ((tdslen != fdslen || strncmp(fromsnap, tosnap, fdslen) != 0))) { + /* + * not the same dataset name, might be okay if + * tosnap is a clone of a fromsnap descendant. + */ + char origin[ZFS_MAX_DATASET_NAME_LEN]; + zprop_source_t src; + zfs_handle_t *zhp; + + di->ds = zfs_alloc(di->zhp->zfs_hdl, tdslen + 1); + (void) strncpy(di->ds, tosnap, tdslen); + di->ds[tdslen] = '\0'; + + zhp = zfs_open(hdl, di->ds, ZFS_TYPE_FILESYSTEM); + while (zhp != NULL) { + if (zfs_prop_get(zhp, ZFS_PROP_ORIGIN, origin, + sizeof (origin), &src, NULL, 0, B_FALSE) != 0) { + (void) zfs_close(zhp); + zhp = NULL; + break; + } + if (strncmp(origin, fromsnap, fsnlen) == 0) + break; + + (void) zfs_close(zhp); + zhp = zfs_open(hdl, origin, ZFS_TYPE_FILESYSTEM); + } + + if (zhp == NULL) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_INVALIDNAME, di->errbuf)); + } else { + (void) zfs_close(zhp); + } + + di->isclone = B_TRUE; + di->fromsnap = zfs_strdup(hdl, fromsnap); + if (tsnlen) { + di->tosnap = zfs_strdup(hdl, tosnap); + } else { + return (make_temp_snapshot(di)); + } + } else { + int dslen = fdslen ? fdslen : tdslen; + + di->ds = zfs_alloc(hdl, dslen + 1); + (void) strncpy(di->ds, fdslen ? 
fromsnap : tosnap, dslen); + di->ds[dslen] = '\0'; + + di->fromsnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrf); + if (tsnlen) { + di->tosnap = zfs_asprintf(hdl, "%s%s", di->ds, atptrt); + } else { + return (make_temp_snapshot(di)); + } + } + return (0); +} + +static int +get_mountpoint(differ_info_t *di, char *dsnm, char **mntpt) +{ + boolean_t mounted; + + mounted = is_mounted(di->zhp->zfs_hdl, dsnm, mntpt); + if (mounted == B_FALSE) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, + "Cannot diff an unmounted snapshot")); + return (zfs_error(di->zhp->zfs_hdl, EZFS_BADTYPE, di->errbuf)); + } + + /* Avoid a double slash at the beginning of root-mounted datasets */ + if (**mntpt == '/' && *(*mntpt + 1) == '\0') + **mntpt = '\0'; + return (0); +} + +static int +get_mountpoints(differ_info_t *di) +{ + char *strptr; + char *frommntpt; + + /* + * first get the mountpoint for the parent dataset + */ + if (get_mountpoint(di, di->ds, &di->dsmnt) != 0) + return (-1); + + strptr = strchr(di->tosnap, '@'); + ASSERT3P(strptr, !=, NULL); + di->tomnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", di->dsmnt, + ZDIFF_SNAPDIR, ++strptr); + + strptr = strchr(di->fromsnap, '@'); + ASSERT3P(strptr, !=, NULL); + + frommntpt = di->dsmnt; + if (di->isclone) { + char *mntpt; + int err; + + *strptr = '\0'; + err = get_mountpoint(di, di->fromsnap, &mntpt); + *strptr = '@'; + if (err != 0) + return (-1); + frommntpt = mntpt; + } + + di->frommnt = zfs_asprintf(di->zhp->zfs_hdl, "%s%s%s", frommntpt, + ZDIFF_SNAPDIR, ++strptr); + + if (di->isclone) + free(frommntpt); + + return (0); +} + +static int +setup_differ_info(zfs_handle_t *zhp, const char *fromsnap, + const char *tosnap, differ_info_t *di) +{ + di->zhp = zhp; + + di->cleanupfd = open(ZFS_DEV, O_RDWR); + VERIFY(di->cleanupfd >= 0); + + if (get_snapshot_names(di, fromsnap, tosnap) != 0) + return (-1); + + if (get_mountpoints(di) != 0) + return (-1); + + if (find_shares_object(di) != 0) + return (-1); + + return (0); +} + +int +zfs_show_diffs(zfs_handle_t *zhp, int outfd, const char *fromsnap, + const char *tosnap, int flags) +{ + zfs_cmd_t zc = {"\0"}; + char errbuf[1024]; + differ_info_t di = { 0 }; + pthread_t tid; + int pipefd[2]; + int iocerr; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "zfs diff failed")); + + if (setup_differ_info(zhp, fromsnap, tosnap, &di)) { + teardown_differ_info(&di); + return (-1); + } + + if (pipe(pipefd)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + teardown_differ_info(&di); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, errbuf)); + } + + di.scripted = (flags & ZFS_DIFF_PARSEABLE); + di.classify = (flags & ZFS_DIFF_CLASSIFY); + di.timestamped = (flags & ZFS_DIFF_TIMESTAMP); + + di.outputfd = outfd; + di.datafd = pipefd[0]; + + if (pthread_create(&tid, NULL, differ, &di)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + (void) close(pipefd[0]); + (void) close(pipefd[1]); + teardown_differ_info(&di); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + + /* do the ioctl() */ + (void) strlcpy(zc.zc_value, di.fromsnap, strlen(di.fromsnap) + 1); + (void) strlcpy(zc.zc_name, di.tosnap, strlen(di.tosnap) + 1); + zc.zc_cookie = pipefd[1]; + + iocerr = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DIFF, &zc); + if (iocerr != 0) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "Unable to obtain diffs")); + if (errno == EPERM) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "\n The sys_mount privilege or diff delegated " + "permission is 
needed\n to execute the " + "diff ioctl")); + } else if (errno == EXDEV) { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "\n Not an earlier snapshot from the same fs")); + } else if (errno != EPIPE || di.zerr == 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + } + (void) close(pipefd[1]); + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + teardown_differ_info(&di); + if (di.zerr != 0 && di.zerr != EPIPE) { + zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); + } else { + return (zfs_error(zhp->zfs_hdl, EZFS_DIFFDATA, errbuf)); + } + } + + (void) close(pipefd[1]); + (void) pthread_join(tid, NULL); + + if (di.zerr != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(di.zerr)); + return (zfs_error(zhp->zfs_hdl, EZFS_DIFF, di.errbuf)); + } + teardown_differ_info(&di); + return (0); +} diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c new file mode 100644 index 000000000000..6c5f61836978 --- /dev/null +++ b/lib/libzfs/libzfs_import.c @@ -0,0 +1,472 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Returns true if the named pool matches the given GUID. 
+ */ +static int +pool_active(libzfs_handle_t *hdl, const char *name, uint64_t guid, + boolean_t *isactive) +{ + zpool_handle_t *zhp; + uint64_t theguid; + + if (zpool_open_silent(hdl, name, &zhp) != 0) + return (-1); + + if (zhp == NULL) { + *isactive = B_FALSE; + return (0); + } + + verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_POOL_GUID, + &theguid) == 0); + + zpool_close(zhp); + + *isactive = (theguid == guid); + return (0); +} + +static nvlist_t * +refresh_config(libzfs_handle_t *hdl, nvlist_t *config) +{ + nvlist_t *nvl; + zfs_cmd_t zc = {"\0"}; + int err, dstbuf_size; + + if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) + return (NULL); + + dstbuf_size = MAX(CONFIG_BUF_MINSIZE, zc.zc_nvlist_conf_size * 4); + + if (zcmd_alloc_dst_nvlist(hdl, &zc, dstbuf_size) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); + } + + while ((err = zfs_ioctl(hdl, ZFS_IOC_POOL_TRYIMPORT, + &zc)) != 0 && errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); + } + } + + if (err) { + zcmd_free_nvlists(&zc); + return (NULL); + } + + if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) != 0) { + zcmd_free_nvlists(&zc); + return (NULL); + } + + zcmd_free_nvlists(&zc); + return (nvl); +} + +static nvlist_t * +refresh_config_libzfs(void *handle, nvlist_t *tryconfig) +{ + return (refresh_config((libzfs_handle_t *)handle, tryconfig)); +} + + +static int +pool_active_libzfs(void *handle, const char *name, uint64_t guid, + boolean_t *isactive) +{ + return (pool_active((libzfs_handle_t *)handle, name, guid, isactive)); +} + +const pool_config_ops_t libzfs_config_ops = { + .pco_refresh_config = refresh_config_libzfs, + .pco_pool_active = pool_active_libzfs, +}; + +/* + * Return the offset of the given label. + */ +static uint64_t +label_offset(uint64_t size, int l) +{ + ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Given a file descriptor, clear (zero) the label information. This function + * is used in the appliance stack as part of the ZFS sysevent module and + * to implement the "zpool labelclear" command. + */ +int +zpool_clear_label(int fd) +{ + struct stat64 statbuf; + int l; + vdev_label_t *label; + l2arc_dev_hdr_phys_t *l2dhdr; + uint64_t size; + int labels_cleared = 0, header_cleared = 0; + boolean_t clear_l2arc_header = B_FALSE; + + if (fstat64_blk(fd, &statbuf) == -1) + return (0); + + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); + + if ((label = calloc(1, sizeof (vdev_label_t))) == NULL) + return (-1); + + if ((l2dhdr = calloc(1, sizeof (l2arc_dev_hdr_phys_t))) == NULL) { + free(label); + return (-1); + } + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, l2cache; + nvlist_t *config; + + if (pread64(fd, label, sizeof (vdev_label_t), + label_offset(size, l)) != sizeof (vdev_label_t)) { + continue; + } + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), &config, 0) != 0) { + continue; + } + + /* Skip labels which do not have a valid guid. */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + nvlist_free(config); + continue; + } + + /* Skip labels which are not in a known valid state. */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(config); + continue; + } + + /* If the device is a cache device clear the header. 
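+ * The header in question is the l2arc_dev_hdr_phys_t that an L2ARC
+ * device keeps at VDEV_LABEL_START_SIZE, just past the two front
+ * labels and the boot block; it is zeroed after the label loop below.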
*/ + if (!clear_l2arc_header) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &l2cache) == 0 && + l2cache == POOL_STATE_L2CACHE) { + clear_l2arc_header = B_TRUE; + } + } + + nvlist_free(config); + + /* + * A valid label was found, overwrite this label's nvlist + * and uberblocks with zeros on disk. This is done to prevent + * system utilities, like blkid, from incorrectly detecting a + * partial label. The leading pad space is left untouched. + */ + memset(label, 0, sizeof (vdev_label_t)); + size_t label_size = sizeof (vdev_label_t) - (2 * VDEV_PAD_SIZE); + + if (pwrite64(fd, label, label_size, label_offset(size, l) + + (2 * VDEV_PAD_SIZE)) == label_size) { + labels_cleared++; + } + } + + /* Clear the L2ARC header. */ + if (clear_l2arc_header) { + memset(l2dhdr, 0, sizeof (l2arc_dev_hdr_phys_t)); + if (pwrite64(fd, l2dhdr, sizeof (l2arc_dev_hdr_phys_t), + VDEV_LABEL_START_SIZE) == sizeof (l2arc_dev_hdr_phys_t)) { + header_cleared++; + } + } + + free(label); + free(l2dhdr); + + if (labels_cleared == 0) + return (-1); + + return (0); +} + +static boolean_t +find_guid(nvlist_t *nv, uint64_t guid) +{ + uint64_t tmp; + nvlist_t **child; + uint_t c, children; + + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &tmp) == 0); + if (tmp == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_guid(child[c], guid)) + return (B_TRUE); + } + + return (B_FALSE); +} + +typedef struct aux_cbdata { + const char *cb_type; + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} aux_cbdata_t; + +static int +find_aux(zpool_handle_t *zhp, void *data) +{ + aux_cbdata_t *cbp = data; + nvlist_t **list; + uint_t i, count; + uint64_t guid; + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (nvlist_lookup_nvlist_array(nvroot, cbp->cb_type, + &list, &count) == 0) { + for (i = 0; i < count; i++) { + verify(nvlist_lookup_uint64(list[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + if (guid == cbp->cb_guid) { + cbp->cb_zhp = zhp; + return (1); + } + } + } + + zpool_close(zhp); + return (0); +} + +/* + * Determines if the pool is in use. If so, it returns true and the state of + * the pool as well as the name of the pool. Name string is allocated and + * must be freed by the caller. + */ +int +zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, + boolean_t *inuse) +{ + nvlist_t *config; + char *name; + boolean_t ret; + uint64_t guid, vdev_guid; + zpool_handle_t *zhp; + nvlist_t *pool_config; + uint64_t stateval, isspare; + aux_cbdata_t cb = { 0 }; + boolean_t isactive; + + *inuse = B_FALSE; + + if (zpool_read_label(fd, &config, NULL) != 0) { + (void) no_memory(hdl); + return (-1); + } + + if (config == NULL) + return (0); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &stateval) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, + &vdev_guid) == 0); + + if (stateval != POOL_STATE_SPARE && stateval != POOL_STATE_L2CACHE) { + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + } + + switch (stateval) { + case POOL_STATE_EXPORTED: + /* + * A pool with an exported state may in fact be imported + * read-only, so check the in-core state to see if it's + * active and imported read-only. If it is, set + * its state to active. 
+ */ + if (pool_active(hdl, name, guid, &isactive) == 0 && isactive && + (zhp = zpool_open_canfail(hdl, name)) != NULL) { + if (zpool_get_prop_int(zhp, ZPOOL_PROP_READONLY, NULL)) + stateval = POOL_STATE_ACTIVE; + + /* + * All we needed the zpool handle for is the + * readonly prop check. + */ + zpool_close(zhp); + } + + ret = B_TRUE; + break; + + case POOL_STATE_ACTIVE: + /* + * For an active pool, we have to determine if it's really part + * of a currently active pool (in which case the pool will exist + * and the guid will be the same), or whether it's part of an + * active pool that was disconnected without being explicitly + * exported. + */ + if (pool_active(hdl, name, guid, &isactive) != 0) { + nvlist_free(config); + return (-1); + } + + if (isactive) { + /* + * Because the device may have been removed while + * offlined, we only report it as active if the vdev is + * still present in the config. Otherwise, pretend like + * it's not in use. + */ + if ((zhp = zpool_open_canfail(hdl, name)) != NULL && + (pool_config = zpool_get_config(zhp, NULL)) + != NULL) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(pool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + ret = find_guid(nvroot, vdev_guid); + } else { + ret = B_FALSE; + } + + /* + * If this is an active spare within another pool, we + * treat it like an unused hot spare. This allows the + * user to create a pool with a hot spare that is + * currently in use within another pool. Since we return + * B_TRUE, libdiskmgt will continue to prevent generic + * consumers from using the device. + */ + if (ret && nvlist_lookup_uint64(config, + ZPOOL_CONFIG_IS_SPARE, &isspare) == 0 && isspare) + stateval = POOL_STATE_SPARE; + + if (zhp != NULL) + zpool_close(zhp); + } else { + stateval = POOL_STATE_POTENTIALLY_ACTIVE; + ret = B_TRUE; + } + break; + + case POOL_STATE_SPARE: + /* + * For a hot spare, it can be either definitively in use, or + * potentially active. To determine if it's in use, we iterate + * over all pools in the system and search for one with a spare + * with a matching guid. + * + * Due to the shared nature of spares, we don't actually report + * the potentially active case as in use. This means the user + * can freely create pools on the hot spares of exported pools, + * but to do otherwise makes the resulting code complicated, and + * we end up having to deal with this case anyway. + */ + cb.cb_zhp = NULL; + cb.cb_guid = vdev_guid; + cb.cb_type = ZPOOL_CONFIG_SPARES; + if (zpool_iter(hdl, find_aux, &cb) == 1) { + name = (char *)zpool_get_name(cb.cb_zhp); + ret = B_TRUE; + } else { + ret = B_FALSE; + } + break; + + case POOL_STATE_L2CACHE: + + /* + * Check if any pool is currently using this l2cache device.
+ */ + cb.cb_zhp = NULL; + cb.cb_guid = vdev_guid; + cb.cb_type = ZPOOL_CONFIG_L2CACHE; + if (zpool_iter(hdl, find_aux, &cb) == 1) { + name = (char *)zpool_get_name(cb.cb_zhp); + ret = B_TRUE; + } else { + ret = B_FALSE; + } + break; + + default: + ret = B_FALSE; + } + + + if (ret) { + if ((*namestr = zfs_strdup(hdl, name)) == NULL) { + if (cb.cb_zhp) + zpool_close(cb.cb_zhp); + nvlist_free(config); + return (-1); + } + *state = (pool_state_t)stateval; + } + + if (cb.cb_zhp) + zpool_close(cb.cb_zhp); + + nvlist_free(config); + *inuse = ret; + return (0); +} diff --git a/lib/libzfs/libzfs_iter.c b/lib/libzfs/libzfs_iter.c new file mode 100644 index 000000000000..7ee326bc6905 --- /dev/null +++ b/lib/libzfs/libzfs_iter.c @@ -0,0 +1,600 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2019 Datto Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" + +static int +zfs_iter_clones(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + nvlist_t *nvl = zfs_get_clones_nvl(zhp); + nvpair_t *pair; + + if (nvl == NULL) + return (0); + + for (pair = nvlist_next_nvpair(nvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) { + zfs_handle_t *clone = zfs_open(zhp->zfs_hdl, nvpair_name(pair), + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (clone != NULL) { + int err = func(clone, data); + if (err != 0) + return (err); + } + } + return (0); +} + +static int +zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) +{ + int rc; + uint64_t orig_cookie; + + orig_cookie = zc->zc_cookie; +top: + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + rc = zfs_ioctl(zhp->zfs_hdl, arg, zc); + + if (rc == -1) { + switch (errno) { + case ENOMEM: + /* expand nvlist memory and try again */ + if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { + zcmd_free_nvlists(zc); + return (-1); + } + zc->zc_cookie = orig_cookie; + goto top; + /* + * An errno value of ESRCH indicates normal completion. + * If ENOENT is returned, then the underlying dataset + * has been removed since we obtained the handle. 
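+ * Either way the list ioctl wrapper returns nonzero (rc = 1 below),
+ * which the calling iterators treat as a clean end of listing rather
+ * than an error.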
+ */ + case ESRCH: + case ENOENT: + rc = 1; + break; + default: + rc = zfs_standard_error(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot iterate filesystems")); + break; + } + } + return (rc); +} + +/* + * Iterate over all child filesystems + */ +int +zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_cmd_t zc = {"\0"}; + zfs_handle_t *nzhp; + int ret; + + if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) + return (0); + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, + &zc)) == 0) { + /* + * Silently ignore errors, as the only plausible explanation is + * that the pool has since been removed. + */ + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { + continue; + } + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + return (ret); + } + } + zcmd_free_nvlists(&zc); + return ((ret < 0) ? ret : 0); +} + +/* + * Iterate over all snapshots + */ +int +zfs_iter_snapshots(zfs_handle_t *zhp, boolean_t simple, zfs_iter_f func, + void *data, uint64_t min_txg, uint64_t max_txg) +{ + zfs_cmd_t zc = {"\0"}; + zfs_handle_t *nzhp; + int ret; + nvlist_t *range_nvl = NULL; + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT || + zhp->zfs_type == ZFS_TYPE_BOOKMARK) + return (0); + + zc.zc_simple = simple; + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + + if (min_txg != 0) { + range_nvl = fnvlist_alloc(); + fnvlist_add_uint64(range_nvl, SNAP_ITER_MIN_TXG, min_txg); + } + if (max_txg != 0) { + if (range_nvl == NULL) + range_nvl = fnvlist_alloc(); + fnvlist_add_uint64(range_nvl, SNAP_ITER_MAX_TXG, max_txg); + } + + if (range_nvl != NULL && + zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, range_nvl) != 0) { + zcmd_free_nvlists(&zc); + fnvlist_free(range_nvl); + return (-1); + } + + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc)) == 0) { + + if (simple) + nzhp = make_dataset_simple_handle_zc(zhp, &zc); + else + nzhp = make_dataset_handle_zc(zhp->zfs_hdl, &zc); + if (nzhp == NULL) + continue; + + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); + fnvlist_free(range_nvl); + return (ret); + } + } + zcmd_free_nvlists(&zc); + fnvlist_free(range_nvl); + return ((ret < 0) ? ret : 0); +} + +/* + * Iterate over all bookmarks + */ +int +zfs_iter_bookmarks(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + zfs_handle_t *nzhp; + nvlist_t *props = NULL; + nvlist_t *bmarks = NULL; + int err; + nvpair_t *pair; + + if ((zfs_get_type(zhp) & (ZFS_TYPE_SNAPSHOT | ZFS_TYPE_BOOKMARK)) != 0) + return (0); + + /* Setup the requested properties nvlist. 
*/ + props = fnvlist_alloc(); + for (zfs_prop_t p = 0; p < ZFS_NUM_PROPS; p++) { + if (zfs_prop_valid_for_type(p, ZFS_TYPE_BOOKMARK, B_FALSE)) { + fnvlist_add_boolean(props, zfs_prop_to_name(p)); + } + } + fnvlist_add_boolean(props, "redact_complete"); + + if ((err = lzc_get_bookmarks(zhp->zfs_name, props, &bmarks)) != 0) + goto out; + + for (pair = nvlist_next_nvpair(bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + char *bmark_name; + nvlist_t *bmark_props; + + bmark_name = nvpair_name(pair); + bmark_props = fnvpair_value_nvlist(pair); + + if (snprintf(name, sizeof (name), "%s#%s", zhp->zfs_name, + bmark_name) >= sizeof (name)) { + err = EINVAL; + goto out; + } + + nzhp = make_bookmark_handle(zhp, name, bmark_props); + if (nzhp == NULL) + continue; + + if ((err = func(nzhp, data)) != 0) + goto out; + } + +out: + fnvlist_free(props); + fnvlist_free(bmarks); + + return (err); +} + +/* + * Routines for dealing with the sorted snapshot functionality + */ +typedef struct zfs_node { + zfs_handle_t *zn_handle; + avl_node_t zn_avlnode; +} zfs_node_t; + +static int +zfs_sort_snaps(zfs_handle_t *zhp, void *data) +{ + avl_tree_t *avl = data; + zfs_node_t *node; + zfs_node_t search; + + search.zn_handle = zhp; + node = avl_find(avl, &search, NULL); + if (node) { + /* + * If this snapshot was renamed while we were creating the + * AVL tree, it's possible that we already inserted it under + * its old name. Remove the old handle before adding the new + * one. + */ + zfs_close(node->zn_handle); + avl_remove(avl, node); + free(node); + } + + node = zfs_alloc(zhp->zfs_hdl, sizeof (zfs_node_t)); + node->zn_handle = zhp; + avl_add(avl, node); + + return (0); +} + +static int +zfs_snapshot_compare(const void *larg, const void *rarg) +{ + zfs_handle_t *l = ((zfs_node_t *)larg)->zn_handle; + zfs_handle_t *r = ((zfs_node_t *)rarg)->zn_handle; + uint64_t lcreate, rcreate; + + /* + * Sort them according to creation time. We use the hidden + * CREATETXG property to get an absolute ordering of snapshots. 
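+ * For example, a snapshot created at txg 100 sorts before one created
+ * at txg 250, regardless of how the two snapshots are named.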
+ */
+	lcreate = zfs_prop_get_int(l, ZFS_PROP_CREATETXG);
+	rcreate = zfs_prop_get_int(r, ZFS_PROP_CREATETXG);
+
+	return (TREE_CMP(lcreate, rcreate));
+}
+
+int
+zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data,
+    uint64_t min_txg, uint64_t max_txg)
+{
+	int ret = 0;
+	zfs_node_t *node;
+	avl_tree_t avl;
+	void *cookie = NULL;
+
+	avl_create(&avl, zfs_snapshot_compare,
+	    sizeof (zfs_node_t), offsetof(zfs_node_t, zn_avlnode));
+
+	ret = zfs_iter_snapshots(zhp, B_FALSE, zfs_sort_snaps, &avl, min_txg,
+	    max_txg);
+
+	for (node = avl_first(&avl); node != NULL; node = AVL_NEXT(&avl, node))
+		ret |= callback(node->zn_handle, data);
+
+	while ((node = avl_destroy_nodes(&avl, &cookie)) != NULL)
+		free(node);
+
+	avl_destroy(&avl);
+
+	return (ret);
+}
+
+typedef struct {
+	char *ssa_first;
+	char *ssa_last;
+	boolean_t ssa_seenfirst;
+	boolean_t ssa_seenlast;
+	zfs_iter_f ssa_func;
+	void *ssa_arg;
+} snapspec_arg_t;
+
+static int
+snapspec_cb(zfs_handle_t *zhp, void *arg)
+{
+	snapspec_arg_t *ssa = arg;
+	const char *shortsnapname;
+	int err = 0;
+
+	if (ssa->ssa_seenlast)
+		return (0);
+
+	shortsnapname = strchr(zfs_get_name(zhp), '@') + 1;
+	if (!ssa->ssa_seenfirst && strcmp(shortsnapname, ssa->ssa_first) == 0)
+		ssa->ssa_seenfirst = B_TRUE;
+	if (strcmp(shortsnapname, ssa->ssa_last) == 0)
+		ssa->ssa_seenlast = B_TRUE;
+
+	if (ssa->ssa_seenfirst) {
+		err = ssa->ssa_func(zhp, ssa->ssa_arg);
+	} else {
+		zfs_close(zhp);
+	}
+
+	return (err);
+}
+
+/*
+ * spec is a string like "A,B%C,D"
+ *
+ * <snaps>, where <snaps> can be:
+ *      <snap>          (single snapshot)
+ *      <snap>%<snap>   (range of snapshots, inclusive)
+ *      %<snap>         (range of snapshots, starting with earliest)
+ *      <snap>%         (range of snapshots, ending with last)
+ *      %               (all snapshots)
+ *      <snaps>[,...]   (comma separated list of the above)
+ *
+ * If a snapshot cannot be opened, continue trying to open the others, but
+ * return ENOENT at the end.
+ */
+int
+zfs_iter_snapspec(zfs_handle_t *fs_zhp, const char *spec_orig,
+    zfs_iter_f func, void *arg)
+{
+	char *buf, *comma_separated, *cp;
+	int err = 0;
+	int ret = 0;
+
+	buf = zfs_strdup(fs_zhp->zfs_hdl, spec_orig);
+	cp = buf;
+
+	while ((comma_separated = strsep(&cp, ",")) != NULL) {
+		char *pct = strchr(comma_separated, '%');
+		if (pct != NULL) {
+			snapspec_arg_t ssa = { 0 };
+			ssa.ssa_func = func;
+			ssa.ssa_arg = arg;
+
+			if (pct == comma_separated)
+				ssa.ssa_seenfirst = B_TRUE;
+			else
+				ssa.ssa_first = comma_separated;
+			*pct = '\0';
+			ssa.ssa_last = pct + 1;
+
+			/*
+			 * If there is a lastname specified, make sure it
+			 * exists.
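+			 * For example, for the component "snapA%snapC" on a
+			 * hypothetical dataset "tank/fs", this verifies that
+			 * "tank/fs@snapC" exists before iterating the range.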
+ */ + if (ssa.ssa_last[0] != '\0') { + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + (void) snprintf(snapname, sizeof (snapname), + "%s@%s", zfs_get_name(fs_zhp), + ssa.ssa_last); + if (!zfs_dataset_exists(fs_zhp->zfs_hdl, + snapname, ZFS_TYPE_SNAPSHOT)) { + ret = ENOENT; + continue; + } + } + + err = zfs_iter_snapshots_sorted(fs_zhp, + snapspec_cb, &ssa, 0, 0); + if (ret == 0) + ret = err; + if (ret == 0 && (!ssa.ssa_seenfirst || + (ssa.ssa_last[0] != '\0' && !ssa.ssa_seenlast))) { + ret = ENOENT; + } + } else { + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfs_handle_t *snap_zhp; + (void) snprintf(snapname, sizeof (snapname), "%s@%s", + zfs_get_name(fs_zhp), comma_separated); + snap_zhp = make_dataset_handle(fs_zhp->zfs_hdl, + snapname); + if (snap_zhp == NULL) { + ret = ENOENT; + continue; + } + err = func(snap_zhp, arg); + if (ret == 0) + ret = err; + } + } + + free(buf); + return (ret); +} + +/* + * Iterate over all children, snapshots and filesystems + * Process snapshots before filesystems because they are nearer the input + * handle: this is extremely important when used with zfs_iter_f functions + * looking for data, following the logic that we would like to find it as soon + * and as close as possible. + */ +int +zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + int ret; + + if ((ret = zfs_iter_snapshots(zhp, B_FALSE, func, data, 0, 0)) != 0) + return (ret); + + return (zfs_iter_filesystems(zhp, func, data)); +} + + +typedef struct iter_stack_frame { + struct iter_stack_frame *next; + zfs_handle_t *zhp; +} iter_stack_frame_t; + +typedef struct iter_dependents_arg { + boolean_t first; + boolean_t allowrecursion; + iter_stack_frame_t *stack; + zfs_iter_f func; + void *data; +} iter_dependents_arg_t; + +static int +iter_dependents_cb(zfs_handle_t *zhp, void *arg) +{ + iter_dependents_arg_t *ida = arg; + int err = 0; + boolean_t first = ida->first; + ida->first = B_FALSE; + + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) { + err = zfs_iter_clones(zhp, iter_dependents_cb, ida); + } else if (zhp->zfs_type != ZFS_TYPE_BOOKMARK) { + iter_stack_frame_t isf; + iter_stack_frame_t *f; + + /* + * check if there is a cycle by seeing if this fs is already + * on the stack. 
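+ * A cycle can only be introduced through clones, since the walk
+ * recurses from each snapshot into its clones and from each filesystem
+ * into its children, which can lead back to a dataset that is already
+ * being visited.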
+ */ + for (f = ida->stack; f != NULL; f = f->next) { + if (f->zhp->zfs_dmustats.dds_guid == + zhp->zfs_dmustats.dds_guid) { + if (ida->allowrecursion) { + zfs_close(zhp); + return (0); + } else { + zfs_error_aux(zhp->zfs_hdl, + dgettext(TEXT_DOMAIN, + "recursive dependency at '%s'"), + zfs_get_name(zhp)); + err = zfs_error(zhp->zfs_hdl, + EZFS_RECURSIVE, + dgettext(TEXT_DOMAIN, + "cannot determine dependent " + "datasets")); + zfs_close(zhp); + return (err); + } + } + } + + isf.zhp = zhp; + isf.next = ida->stack; + ida->stack = &isf; + err = zfs_iter_filesystems(zhp, iter_dependents_cb, ida); + if (err == 0) + err = zfs_iter_snapshots(zhp, B_FALSE, + iter_dependents_cb, ida, 0, 0); + ida->stack = isf.next; + } + + if (!first && err == 0) + err = ida->func(zhp, ida->data); + else + zfs_close(zhp); + + return (err); +} + +int +zfs_iter_dependents(zfs_handle_t *zhp, boolean_t allowrecursion, + zfs_iter_f func, void *data) +{ + iter_dependents_arg_t ida; + ida.allowrecursion = allowrecursion; + ida.stack = NULL; + ida.func = func; + ida.data = data; + ida.first = B_TRUE; + return (iter_dependents_cb(zfs_handle_dup(zhp), &ida)); +} + +/* + * Iterate over mounted children of the specified dataset + */ +int +zfs_iter_mounted(zfs_handle_t *zhp, zfs_iter_f func, void *data) +{ + char mnt_prop[ZFS_MAXPROPLEN]; + struct mnttab entry; + zfs_handle_t *mtab_zhp; + size_t namelen = strlen(zhp->zfs_name); + FILE *mnttab; + int err = 0; + + if ((mnttab = fopen(MNTTAB, "r")) == NULL) + return (ENOENT); + + while (err == 0 && getmntent(mnttab, &entry) == 0) { + /* Ignore non-ZFS entries */ + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + /* Ignore datasets not within the provided dataset */ + if (strncmp(entry.mnt_special, zhp->zfs_name, namelen) != 0 || + (entry.mnt_special[namelen] != '/' && + entry.mnt_special[namelen] != '@')) + continue; + + if ((mtab_zhp = zfs_open(zhp->zfs_hdl, entry.mnt_special, + ZFS_TYPE_FILESYSTEM)) == NULL) + continue; + + /* Ignore legacy mounts as they are user managed */ + verify(zfs_prop_get(mtab_zhp, ZFS_PROP_MOUNTPOINT, mnt_prop, + sizeof (mnt_prop), NULL, NULL, 0, B_FALSE) == 0); + if (strcmp(mnt_prop, "legacy") == 0) { + zfs_close(mtab_zhp); + continue; + } + + err = func(mtab_zhp, data); + } + + fclose(mnttab); + + return (err); +} diff --git a/lib/libzfs/libzfs_mount.c b/lib/libzfs/libzfs_mount.c new file mode 100644 index 000000000000..6d11016bca4e --- /dev/null +++ b/lib/libzfs/libzfs_mount.c @@ -0,0 +1,1614 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. 
+ * Copyright 2016 Igor Kozhukhov + * Copyright 2017 RackTop Systems. + * Copyright (c) 2018 Datto Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + */ + +/* + * Routines to manage ZFS mounts. We separate all the nasty routines that have + * to deal with the OS. The following functions are the main entry points -- + * they are used by mount and unmount and when changing a filesystem's + * mountpoint. + * + * zfs_is_mounted() + * zfs_mount() + * zfs_mount_at() + * zfs_unmount() + * zfs_unmountall() + * + * This file also contains the functions used to manage sharing filesystems via + * NFS and iSCSI: + * + * zfs_is_shared() + * zfs_share() + * zfs_unshare() + * + * zfs_is_shared_nfs() + * zfs_is_shared_smb() + * zfs_share_proto() + * zfs_shareall(); + * zfs_unshare_nfs() + * zfs_unshare_smb() + * zfs_unshareall_nfs() + * zfs_unshareall_smb() + * zfs_unshareall() + * zfs_unshareall_bypath() + * + * The following functions are available for pool consumers, and will + * mount/unmount and share/unshare all datasets within pool: + * + * zpool_enable_datasets() + * zpool_disable_datasets() + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "libzfs_impl.h" +#include + +#include +#include +#define MAXISALEN 257 /* based on sysinfo(2) man page */ + +static int mount_tp_nthr = 512; /* tpool threads for multi-threaded mounting */ + +static void zfs_mount_task(void *); +zfs_share_type_t zfs_is_shared_proto(zfs_handle_t *, char **, + zfs_share_proto_t); + +/* + * The share protocols table must be in the same order as the zfs_share_proto_t + * enum in libzfs_impl.h + */ +proto_table_t proto_table[PROTO_END] = { + {ZFS_PROP_SHARENFS, "nfs", EZFS_SHARENFSFAILED, EZFS_UNSHARENFSFAILED}, + {ZFS_PROP_SHARESMB, "smb", EZFS_SHARESMBFAILED, EZFS_UNSHARESMBFAILED}, +}; + +zfs_share_proto_t nfs_only[] = { + PROTO_NFS, + PROTO_END +}; + +zfs_share_proto_t smb_only[] = { + PROTO_SMB, + PROTO_END +}; +zfs_share_proto_t share_all_proto[] = { + PROTO_NFS, + PROTO_SMB, + PROTO_END +}; + + + +static boolean_t +dir_is_empty_stat(const char *dirname) +{ + struct stat st; + + /* + * We only want to return false if the given path is a non empty + * directory, all other errors are handled elsewhere. + */ + if (stat(dirname, &st) < 0 || !S_ISDIR(st.st_mode)) { + return (B_TRUE); + } + + /* + * An empty directory will still have two entries in it, one + * entry for each of "." and "..". + */ + if (st.st_size > 2) { + return (B_FALSE); + } + + return (B_TRUE); +} + +static boolean_t +dir_is_empty_readdir(const char *dirname) +{ + DIR *dirp; + struct dirent64 *dp; + int dirfd; + + if ((dirfd = openat(AT_FDCWD, dirname, + O_RDONLY | O_NDELAY | O_LARGEFILE | O_CLOEXEC, 0)) < 0) { + return (B_TRUE); + } + + if ((dirp = fdopendir(dirfd)) == NULL) { + (void) close(dirfd); + return (B_TRUE); + } + + while ((dp = readdir64(dirp)) != NULL) { + + if (strcmp(dp->d_name, ".") == 0 || + strcmp(dp->d_name, "..") == 0) + continue; + + (void) closedir(dirp); + return (B_FALSE); + } + + (void) closedir(dirp); + return (B_TRUE); +} + +/* + * Returns true if the specified directory is empty. If we can't open the + * directory at all, return true so that the mount can fail with a more + * informative error message. 
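+ *
+ * Two strategies are used: a cheap stat(2) of the directory (on ZFS,
+ * st_size reflects the number of entries, so a size of two means only
+ * "." and ".." are present) and a readdir(3) fallback for directories
+ * that do not reside on a ZFS filesystem.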
+ */
+static boolean_t
+dir_is_empty(const char *dirname)
+{
+	struct statfs64 st;
+
+	/*
+	 * If the statfs64() call fails or the filesystem is not a ZFS
+	 * filesystem, fall back to the slow path which uses readdir.
+	 */
+	if ((statfs64(dirname, &st) != 0) ||
+	    (st.f_type != ZFS_SUPER_MAGIC)) {
+		return (dir_is_empty_readdir(dirname));
+	}
+
+	/*
+	 * At this point, we know the provided path is on a ZFS
+	 * filesystem, so we can use stat instead of readdir to
+	 * determine if the directory is empty or not. We try to avoid
+	 * using readdir because that requires opening "dirname"; this
+	 * open file descriptor can potentially end up in a child
+	 * process if there's a concurrent fork, thus preventing the
+	 * zfs_mount() from otherwise succeeding (the open file
+	 * descriptor inherited by the child process will cause the
+	 * parent's mount to fail with EBUSY). The performance
+	 * implications of replacing the open, read, and close with a
+	 * single stat are nice, but that is not the main motivation
+	 * for the added complexity.
+	 */
+	return (dir_is_empty_stat(dirname));
+}
+
+/*
+ * Checks to see if the mount is active. If the filesystem is mounted, we
+ * fill in 'where' with the current mountpoint and return B_TRUE. Otherwise,
+ * we return B_FALSE.
+ */
+boolean_t
+is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where)
+{
+	struct mnttab entry;
+
+	if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0)
+		return (B_FALSE);
+
+	if (where != NULL)
+		*where = zfs_strdup(zfs_hdl, entry.mnt_mountp);
+
+	return (B_TRUE);
+}
+
+boolean_t
+zfs_is_mounted(zfs_handle_t *zhp, char **where)
+{
+	return (is_mounted(zhp->zfs_hdl, zfs_get_name(zhp), where));
+}
+
+/*
+ * Returns true if the given dataset passes any higher-order checks on
+ * whether it can be mounted, false otherwise. zfs_is_mountable_internal
+ * specifically assumes that the caller has verified the sanity of mounting
+ * the dataset at mountpoint to the extent the caller wants.
+ */
+static boolean_t
+zfs_is_mountable_internal(zfs_handle_t *zhp, const char *mountpoint)
+{
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED) &&
+	    getzoneid() == GLOBAL_ZONEID)
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+/*
+ * Returns true if the given dataset is mountable, false otherwise. Returns the
+ * mountpoint in 'buf'.
+ */
+boolean_t
+zfs_is_mountable(zfs_handle_t *zhp, char *buf, size_t buflen,
+    zprop_source_t *source, int flags)
+{
+	char sourceloc[MAXNAMELEN];
+	zprop_source_t sourcetype;
+
+	if (!zfs_prop_valid_for_type(ZFS_PROP_MOUNTPOINT, zhp->zfs_type,
+	    B_FALSE))
+		return (B_FALSE);
+
+	verify(zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, buf, buflen,
+	    &sourcetype, sourceloc, sizeof (sourceloc), B_FALSE) == 0);
+
+	if (strcmp(buf, ZFS_MOUNTPOINT_NONE) == 0 ||
+	    strcmp(buf, ZFS_MOUNTPOINT_LEGACY) == 0)
+		return (B_FALSE);
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_OFF)
+		return (B_FALSE);
+
+	if (!zfs_is_mountable_internal(zhp, buf))
+		return (B_FALSE);
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_REDACTED) && !(flags & MS_FORCE))
+		return (B_FALSE);
+
+	if (source)
+		*source = sourcetype;
+
+	return (B_TRUE);
+}
+
+/*
+ * The filesystem is mounted by invoking the system mount utility rather
+ * than by the system call mount(2). This ensures that the /etc/mtab
+ * file is correctly locked for the update. Performing our own locking
+ * and /etc/mtab update requires making an unsafe assumption about how
+ * the mount utility performs its locking. Unfortunately, this also means
+ * in the case of a mount failure we do not have the exact errno. We must
+ * make do with the return value from the mount process.
+ *
+ * In the long term a shared library called libmount is under development
+ * which provides a common API to address the locking and errno issues.
+ * Once the standard mount utility has been updated to use this library
+ * we can add an autoconf check to conditionally use it.
+ *
+ * http://www.kernel.org/pub/linux/utils/util-linux/libmount-docs/index.html
+ */
+
+static int
+zfs_add_option(zfs_handle_t *zhp, char *options, int len,
+    zfs_prop_t prop, char *on, char *off)
+{
+	char *source;
+	uint64_t value;
+
+	/* Skip adding duplicate default options */
+	if ((strstr(options, on) != NULL) || (strstr(options, off) != NULL))
+		return (0);
+
+	/*
+	 * zfs_prop_get_int() is not used to ensure our mount options
+	 * are not influenced by the current /proc/self/mounts contents.
+	 */
+	value = getprop_uint64(zhp, prop, &source);
+
+	(void) strlcat(options, ",", len);
+	(void) strlcat(options, value ? on : off, len);
+
+	return (0);
+}
+
+static int
+zfs_add_options(zfs_handle_t *zhp, char *options, int len)
+{
+	int error = 0;
+
+	error = zfs_add_option(zhp, options, len,
+	    ZFS_PROP_ATIME, MNTOPT_ATIME, MNTOPT_NOATIME);
+	/*
+	 * don't add relatime/strictatime when atime=off, otherwise strictatime
+	 * will force atime=on
+	 */
+	if (strstr(options, MNTOPT_NOATIME) == NULL) {
+		error = zfs_add_option(zhp, options, len,
+		    ZFS_PROP_RELATIME, MNTOPT_RELATIME, MNTOPT_STRICTATIME);
+	}
+	error = error ? error : zfs_add_option(zhp, options, len,
+	    ZFS_PROP_DEVICES, MNTOPT_DEVICES, MNTOPT_NODEVICES);
+	error = error ? error : zfs_add_option(zhp, options, len,
+	    ZFS_PROP_EXEC, MNTOPT_EXEC, MNTOPT_NOEXEC);
+	error = error ? error : zfs_add_option(zhp, options, len,
+	    ZFS_PROP_READONLY, MNTOPT_RO, MNTOPT_RW);
+	error = error ? error : zfs_add_option(zhp, options, len,
+	    ZFS_PROP_SETUID, MNTOPT_SETUID, MNTOPT_NOSETUID);
+	error = error ? error : zfs_add_option(zhp, options, len,
+	    ZFS_PROP_NBMAND, MNTOPT_NBMAND, MNTOPT_NONBMAND);
+
+	return (error);
+}
+
+int
+zfs_mount(zfs_handle_t *zhp, const char *options, int flags)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+
+	if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL,
+	    flags))
+		return (0);
+
+	return (zfs_mount_at(zhp, options, flags, mountpoint));
+}
+
+/*
+ * Mount the given filesystem.
+ */
+int
+zfs_mount_at(zfs_handle_t *zhp, const char *options, int flags,
+    const char *mountpoint)
+{
+	struct stat buf;
+	char mntopts[MNT_LINE_MAX];
+	char overlay[ZFS_MAXPROPLEN];
+	libzfs_handle_t *hdl = zhp->zfs_hdl;
+	uint64_t keystatus;
+	int remount = 0, rc;
+
+	if (options == NULL) {
+		(void) strlcpy(mntopts, MNTOPT_DEFAULTS, sizeof (mntopts));
+	} else {
+		(void) strlcpy(mntopts, options, sizeof (mntopts));
+	}
+
+	if (strstr(mntopts, MNTOPT_REMOUNT) != NULL)
+		remount = 1;
+
+	/* Potentially duplicates some checks if invoked by zfs_mount(). */
+	if (!zfs_is_mountable_internal(zhp, mountpoint))
+		return (0);
+
+	/*
+	 * If the pool is imported read-only then all mounts must be read-only
+	 */
+	if (zpool_get_prop_int(zhp->zpool_hdl, ZPOOL_PROP_READONLY, NULL))
+		(void) strlcat(mntopts, "," MNTOPT_RO, sizeof (mntopts));
+
+	/*
+	 * Append default mount options which apply to the mount point.
+	 * This is done because under Linux (unlike Solaris) multiple mount
+	 * points may reference a single super block. This means that just
+	 * given a super block there is no back reference to update the per
+	 * mount point options.
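+	 *
+	 * For example, with atime=off on the dataset, zfs_add_options()
+	 * appends ",noatime" below and skips the relatime/strictatime
+	 * pair entirely, so neither of those can switch atime back on.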
+ */ + rc = zfs_add_options(zhp, mntopts, sizeof (mntopts)); + if (rc) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "default options unavailable")); + return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, + dgettext(TEXT_DOMAIN, "cannot mount '%s'"), + mountpoint)); + } + + /* + * If the filesystem is encrypted the key must be loaded in order to + * mount. If the key isn't loaded, the MS_CRYPT flag decides whether + * or not we attempt to load the keys. Note: we must call + * zfs_refresh_properties() here since some callers of this function + * (most notably zpool_enable_datasets()) may implicitly load our key + * by loading the parent's key first. + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { + zfs_refresh_properties(zhp); + keystatus = zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS); + + /* + * If the key is unavailable and MS_CRYPT is set give the + * user a chance to enter the key. Otherwise just fail + * immediately. + */ + if (keystatus == ZFS_KEYSTATUS_UNAVAILABLE) { + if (flags & MS_CRYPT) { + rc = zfs_crypto_load_key(zhp, B_FALSE, NULL); + if (rc) + return (rc); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "encryption key not loaded")); + return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, + dgettext(TEXT_DOMAIN, "cannot mount '%s'"), + mountpoint)); + } + } + + } + + /* + * Append zfsutil option so the mount helper allow the mount + */ + strlcat(mntopts, "," MNTOPT_ZFSUTIL, sizeof (mntopts)); + + /* Create the directory if it doesn't already exist */ + if (lstat(mountpoint, &buf) != 0) { + if (mkdirp(mountpoint, 0755) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "failed to create mountpoint: %s"), + strerror(errno)); + return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, + dgettext(TEXT_DOMAIN, "cannot mount '%s'"), + mountpoint)); + } + } + + /* + * Overlay mounts are enabled by default but may be disabled + * via the 'overlay' property. The -O flag remains for compatibility. + */ + if (!(flags & MS_OVERLAY)) { + if (zfs_prop_get(zhp, ZFS_PROP_OVERLAY, overlay, + sizeof (overlay), NULL, NULL, 0, B_FALSE) == 0) { + if (strcmp(overlay, "on") == 0) { + flags |= MS_OVERLAY; + } + } + } + + /* + * Determine if the mountpoint is empty. If so, refuse to perform the + * mount. We don't perform this check if 'remount' is + * specified or if overlay option (-O) is given + */ + if ((flags & MS_OVERLAY) == 0 && !remount && + !dir_is_empty(mountpoint)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "directory is not empty")); + return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, + dgettext(TEXT_DOMAIN, "cannot mount '%s'"), mountpoint)); + } + + /* perform the mount */ + rc = do_mount(zhp, mountpoint, mntopts, flags); + if (rc) { + /* + * Generic errors are nasty, but there are just way too many + * from mount(), and they're well-understood. We pick a few + * common ones to improve upon. + */ + if (rc == EBUSY) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "mountpoint or dataset is busy")); + } else if (rc == EPERM) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Insufficient privileges")); + } else if (rc == ENOTSUP) { + char buf[256]; + int spa_version; + + VERIFY(zfs_spa_version(zhp, &spa_version) == 0); + (void) snprintf(buf, sizeof (buf), + dgettext(TEXT_DOMAIN, "Can't mount a version %lld " + "file system on a version %d pool. 
Pool must be" + " upgraded to mount this file system."), + (u_longlong_t)zfs_prop_get_int(zhp, + ZFS_PROP_VERSION), spa_version); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, buf)); + } else { + zfs_error_aux(hdl, strerror(rc)); + } + return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, + dgettext(TEXT_DOMAIN, "cannot mount '%s'"), + zhp->zfs_name)); + } + + /* remove the mounted entry before re-adding on remount */ + if (remount) + libzfs_mnttab_remove(hdl, zhp->zfs_name); + + /* add the mounted entry into our cache */ + libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint, mntopts); + return (0); +} + +/* + * Unmount a single filesystem. + */ +static int +unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) +{ + int error; + + error = do_unmount(mountpoint, flags); + if (error != 0) { + return (zfs_error_fmt(hdl, EZFS_UMOUNTFAILED, + dgettext(TEXT_DOMAIN, "cannot unmount '%s'"), + mountpoint)); + } + + return (0); +} + +/* + * Unmount the given filesystem. + */ +int +zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; + char *mntpt = NULL; + boolean_t encroot, unmounted = B_FALSE; + + /* check to see if we need to unmount the filesystem */ + if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && + libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) { + /* + * mountpoint may have come from a call to + * getmnt/getmntany if it isn't NULL. If it is NULL, + * we know it comes from libzfs_mnttab_find which can + * then get freed later. We strdup it to play it safe. + */ + if (mountpoint == NULL) + mntpt = zfs_strdup(hdl, entry.mnt_mountp); + else + mntpt = zfs_strdup(hdl, mountpoint); + + /* + * Unshare and unmount the filesystem + */ + if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) { + free(mntpt); + return (-1); + } + zfs_commit_all_shares(); + + if (unmount_one(hdl, mntpt, flags) != 0) { + free(mntpt); + (void) zfs_shareall(zhp); + zfs_commit_all_shares(); + return (-1); + } + + libzfs_mnttab_remove(hdl, zhp->zfs_name); + free(mntpt); + unmounted = B_TRUE; + } + + /* + * If the MS_CRYPT flag is provided we must ensure we attempt to + * unload the dataset's key regardless of whether we did any work + * to unmount it. We only do this for encryption roots. + */ + if ((flags & MS_CRYPT) != 0 && + zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { + zfs_refresh_properties(zhp); + + if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0 && + unmounted) { + (void) zfs_mount(zhp, NULL, 0); + return (-1); + } + + if (encroot && zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_AVAILABLE && + zfs_crypto_unload_key(zhp) != 0) { + (void) zfs_mount(zhp, NULL, 0); + return (-1); + } + } + + return (0); +} + +/* + * Unmount this filesystem and any children inheriting the mountpoint property. + * To do this, just act like we're changing the mountpoint property, but don't + * remount the filesystems afterwards. 
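+ *
+ * changelist_gather() collects a handle for every mounted dataset that
+ * would be affected by a mountpoint change, and changelist_prefix()
+ * then unmounts them in the proper order.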
+ */
+int
+zfs_unmountall(zfs_handle_t *zhp, int flags)
+{
+	prop_changelist_t *clp;
+	int ret;
+
+	clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT,
+	    CL_GATHER_ITER_MOUNTED, flags);
+	if (clp == NULL)
+		return (-1);
+
+	ret = changelist_prefix(clp);
+	changelist_free(clp);
+
+	return (ret);
+}
+
+boolean_t
+zfs_is_shared(zfs_handle_t *zhp)
+{
+	zfs_share_type_t rc = 0;
+	zfs_share_proto_t *curr_proto;
+
+	if (ZFS_IS_VOLUME(zhp))
+		return (B_FALSE);
+
+	for (curr_proto = share_all_proto; *curr_proto != PROTO_END;
+	    curr_proto++)
+		rc |= zfs_is_shared_proto(zhp, NULL, *curr_proto);
+
+	return (rc ? B_TRUE : B_FALSE);
+}
+
+/*
+ * Unshare a filesystem by mountpoint.
+ */
+int
+unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint,
+    zfs_share_proto_t proto)
+{
+	int err;
+
+	err = sa_disable_share(mountpoint, proto_table[proto].p_name);
+	if (err != SA_OK) {
+		return (zfs_error_fmt(hdl, proto_table[proto].p_unshare_err,
+		    dgettext(TEXT_DOMAIN, "cannot unshare '%s': %s"),
+		    name, sa_errorstr(err)));
+	}
+	return (0);
+}
+
+/*
+ * Query libshare for the given mountpoint and protocol, returning
+ * a zfs_share_type_t value.
+ */
+zfs_share_type_t
+is_shared(const char *mountpoint, zfs_share_proto_t proto)
+{
+	if (sa_is_shared(mountpoint, proto_table[proto].p_name)) {
+		switch (proto) {
+		case PROTO_NFS:
+			return (SHARED_NFS);
+		case PROTO_SMB:
+			return (SHARED_SMB);
+		default:
+			return (SHARED_NOT_SHARED);
+		}
+	}
+	return (SHARED_NOT_SHARED);
+}
+
+/*
+ * Share the given filesystem according to the options in the specified
+ * protocol specific properties (sharenfs, sharesmb). We rely
+ * on "libshare" to do the dirty work for us.
+ */
+int
+zfs_share_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto)
+{
+	char mountpoint[ZFS_MAXPROPLEN];
+	char shareopts[ZFS_MAXPROPLEN];
+	char sourcestr[ZFS_MAXPROPLEN];
+	zfs_share_proto_t *curr_proto;
+	zprop_source_t sourcetype;
+	int err = 0;
+
+	if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), NULL, 0))
+		return (0);
+
+	for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) {
+		/*
+		 * Return success if there are no share options.
+		 */
+		if (zfs_prop_get(zhp, proto_table[*curr_proto].p_prop,
+		    shareopts, sizeof (shareopts), &sourcetype, sourcestr,
+		    ZFS_MAXPROPLEN, B_FALSE) != 0 ||
+		    strcmp(shareopts, "off") == 0)
+			continue;
+
+		/*
+		 * If the 'zoned' property is set, then zfs_is_mountable()
+		 * will have already bailed out if we are in the global zone.
+		 * But local zones cannot be NFS servers, so we ignore it for
+		 * local zones as well.
+		 */
+		if (zfs_prop_get_int(zhp, ZFS_PROP_ZONED))
+			continue;
+
+		err = sa_enable_share(zfs_get_name(zhp), mountpoint, shareopts,
+		    proto_table[*curr_proto].p_name);
+		if (err != SA_OK) {
+			return (zfs_error_fmt(zhp->zfs_hdl,
+			    proto_table[*curr_proto].p_share_err,
+			    dgettext(TEXT_DOMAIN, "cannot share '%s': %s"),
+			    zfs_get_name(zhp), sa_errorstr(err)));
+		}
+
+	}
+	return (0);
+}
+
+int
+zfs_share(zfs_handle_t *zhp)
+{
+	assert(!ZFS_IS_VOLUME(zhp));
+	return (zfs_share_proto(zhp, share_all_proto));
+}
+
+int
+zfs_unshare(zfs_handle_t *zhp)
+{
+	assert(!ZFS_IS_VOLUME(zhp));
+	return (zfs_unshareall(zhp));
+}
+
+/*
+ * Check to see if the filesystem is currently shared.
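+ * Returns the matching share type (SHARED_NFS or SHARED_SMB) for the
+ * requested protocol, or SHARED_NOT_SHARED, and optionally returns the
+ * mountpoint it is shared at via 'where'.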
+ */ +zfs_share_type_t +zfs_is_shared_proto(zfs_handle_t *zhp, char **where, zfs_share_proto_t proto) +{ + char *mountpoint; + zfs_share_type_t rc; + + if (!zfs_is_mounted(zhp, &mountpoint)) + return (SHARED_NOT_SHARED); + + if ((rc = is_shared(mountpoint, proto)) + != SHARED_NOT_SHARED) { + if (where != NULL) + *where = mountpoint; + else + free(mountpoint); + return (rc); + } else { + free(mountpoint); + return (SHARED_NOT_SHARED); + } +} + +boolean_t +zfs_is_shared_nfs(zfs_handle_t *zhp, char **where) +{ + return (zfs_is_shared_proto(zhp, where, + PROTO_NFS) != SHARED_NOT_SHARED); +} + +boolean_t +zfs_is_shared_smb(zfs_handle_t *zhp, char **where) +{ + return (zfs_is_shared_proto(zhp, where, + PROTO_SMB) != SHARED_NOT_SHARED); +} + +/* + * zfs_parse_options(options, proto) + * + * Call the legacy parse interface to get the protocol specific + * options using the NULL arg to indicate that this is a "parse" only. + */ +int +zfs_parse_options(char *options, zfs_share_proto_t proto) +{ + return (sa_validate_shareopts(options, proto_table[proto].p_name)); +} + +void +zfs_commit_proto(zfs_share_proto_t *proto) +{ + zfs_share_proto_t *curr_proto; + for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { + sa_commit_shares(proto_table[*curr_proto].p_name); + } +} + +void +zfs_commit_nfs_shares(void) +{ + zfs_commit_proto(nfs_only); +} + +void +zfs_commit_smb_shares(void) +{ + zfs_commit_proto(smb_only); +} + +void +zfs_commit_all_shares(void) +{ + zfs_commit_proto(share_all_proto); +} + +void +zfs_commit_shares(const char *proto) +{ + if (proto == NULL) + zfs_commit_proto(share_all_proto); + else if (strcmp(proto, "nfs") == 0) + zfs_commit_proto(nfs_only); + else if (strcmp(proto, "smb") == 0) + zfs_commit_proto(smb_only); +} + +int +zfs_share_nfs(zfs_handle_t *zhp) +{ + return (zfs_share_proto(zhp, nfs_only)); +} + +int +zfs_share_smb(zfs_handle_t *zhp) +{ + return (zfs_share_proto(zhp, smb_only)); +} + +int +zfs_shareall(zfs_handle_t *zhp) +{ + return (zfs_share_proto(zhp, share_all_proto)); +} + +/* + * Unshare the given filesystem. + */ +int +zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, + zfs_share_proto_t *proto) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; + char *mntpt = NULL; + + /* check to see if need to unmount the filesystem */ + if (mountpoint != NULL) + mntpt = zfs_strdup(hdl, mountpoint); + + if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && + libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) { + zfs_share_proto_t *curr_proto; + + if (mountpoint == NULL) + mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); + + for (curr_proto = proto; *curr_proto != PROTO_END; + curr_proto++) { + + if (is_shared(mntpt, *curr_proto)) { + if (unshare_one(hdl, zhp->zfs_name, + mntpt, *curr_proto) != 0) { + if (mntpt != NULL) + free(mntpt); + return (-1); + } + } + } + } + if (mntpt != NULL) + free(mntpt); + + return (0); +} + +int +zfs_unshare_nfs(zfs_handle_t *zhp, const char *mountpoint) +{ + return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); +} + +int +zfs_unshare_smb(zfs_handle_t *zhp, const char *mountpoint) +{ + return (zfs_unshare_proto(zhp, mountpoint, smb_only)); +} + +/* + * Same as zfs_unmountall(), but for NFS and SMB unshares. 
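+ * The changelist is gathered on ZFS_PROP_SHARENFS, and
+ * changelist_unshare() then removes the shares for every protocol
+ * listed in 'proto'.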
+ */ +static int +zfs_unshareall_proto(zfs_handle_t *zhp, zfs_share_proto_t *proto) +{ + prop_changelist_t *clp; + int ret; + + clp = changelist_gather(zhp, ZFS_PROP_SHARENFS, 0, 0); + if (clp == NULL) + return (-1); + + ret = changelist_unshare(clp, proto); + changelist_free(clp); + + return (ret); +} + +int +zfs_unshareall_nfs(zfs_handle_t *zhp) +{ + return (zfs_unshareall_proto(zhp, nfs_only)); +} + +int +zfs_unshareall_smb(zfs_handle_t *zhp) +{ + return (zfs_unshareall_proto(zhp, smb_only)); +} + +int +zfs_unshareall(zfs_handle_t *zhp) +{ + return (zfs_unshareall_proto(zhp, share_all_proto)); +} + +int +zfs_unshareall_bypath(zfs_handle_t *zhp, const char *mountpoint) +{ + return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); +} + +int +zfs_unshareall_bytype(zfs_handle_t *zhp, const char *mountpoint, + const char *proto) +{ + if (proto == NULL) + return (zfs_unshare_proto(zhp, mountpoint, share_all_proto)); + if (strcmp(proto, "nfs") == 0) + return (zfs_unshare_proto(zhp, mountpoint, nfs_only)); + else if (strcmp(proto, "smb") == 0) + return (zfs_unshare_proto(zhp, mountpoint, smb_only)); + else + return (1); +} + +/* + * Remove the mountpoint associated with the current dataset, if necessary. + * We only remove the underlying directory if: + * + * - The mountpoint is not 'none' or 'legacy' + * - The mountpoint is non-empty + * - The mountpoint is the default or inherited + * - The 'zoned' property is set, or we're in a local zone + * + * Any other directories we leave alone. + */ +void +remove_mountpoint(zfs_handle_t *zhp) +{ + char mountpoint[ZFS_MAXPROPLEN]; + zprop_source_t source; + + if (!zfs_is_mountable(zhp, mountpoint, sizeof (mountpoint), + &source, 0)) + return; + + if (source == ZPROP_SRC_DEFAULT || + source == ZPROP_SRC_INHERITED) { + /* + * Try to remove the directory, silently ignoring any errors. + * The filesystem may have since been removed or moved around, + * and this error isn't really useful to the administrator in + * any way. + */ + (void) rmdir(mountpoint); + } +} + +/* + * Add the given zfs handle to the cb_handles array, dynamically reallocating + * the array if it is out of space. + */ +void +libzfs_add_handle(get_all_cb_t *cbp, zfs_handle_t *zhp) +{ + if (cbp->cb_alloc == cbp->cb_used) { + size_t newsz; + zfs_handle_t **newhandles; + + newsz = cbp->cb_alloc != 0 ? cbp->cb_alloc * 2 : 64; + newhandles = zfs_realloc(zhp->zfs_hdl, + cbp->cb_handles, cbp->cb_alloc * sizeof (zfs_handle_t *), + newsz * sizeof (zfs_handle_t *)); + cbp->cb_handles = newhandles; + cbp->cb_alloc = newsz; + } + cbp->cb_handles[cbp->cb_used++] = zhp; +} + +/* + * Recursive helper function used during file system enumeration + */ +static int +zfs_iter_cb(zfs_handle_t *zhp, void *data) +{ + get_all_cb_t *cbp = data; + + if (!(zfs_get_type(zhp) & ZFS_TYPE_FILESYSTEM)) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_NOAUTO) { + zfs_close(zhp); + return (0); + } + + if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_UNAVAILABLE) { + zfs_close(zhp); + return (0); + } + + /* + * If this filesystem is inconsistent and has a receive resume + * token, we can not mount it. 
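+ * Such a dataset is a partially received send stream; it only becomes
+ * mountable once the interrupted 'zfs receive' is resumed and
+ * completes.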
+ */ + if (zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT) && + zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + NULL, 0, NULL, NULL, 0, B_TRUE) == 0) { + zfs_close(zhp); + return (0); + } + + libzfs_add_handle(cbp, zhp); + if (zfs_iter_filesystems(zhp, zfs_iter_cb, cbp) != 0) { + zfs_close(zhp); + return (-1); + } + return (0); +} + +/* + * Sort comparator that compares two mountpoint paths. We sort these paths so + * that subdirectories immediately follow their parents. This means that we + * effectively treat the '/' character as the lowest value non-nul char. + * Since filesystems from non-global zones can have the same mountpoint + * as other filesystems, the comparator sorts global zone filesystems to + * the top of the list. This means that the global zone will traverse the + * filesystem list in the correct order and can stop when it sees the + * first zoned filesystem. In a non-global zone, only the delegated + * filesystems are seen. + * + * An example sorted list using this comparator would look like: + * + * /foo + * /foo/bar + * /foo/bar/baz + * /foo/baz + * /foo.bar + * /foo (NGZ1) + * /foo (NGZ2) + * + * The mounting code depends on this ordering to deterministically iterate + * over filesystems in order to spawn parallel mount tasks. + */ +static int +mountpoint_cmp(const void *arga, const void *argb) +{ + zfs_handle_t *const *zap = arga; + zfs_handle_t *za = *zap; + zfs_handle_t *const *zbp = argb; + zfs_handle_t *zb = *zbp; + char mounta[MAXPATHLEN]; + char mountb[MAXPATHLEN]; + const char *a = mounta; + const char *b = mountb; + boolean_t gota, gotb; + uint64_t zoneda, zonedb; + + zoneda = zfs_prop_get_int(za, ZFS_PROP_ZONED); + zonedb = zfs_prop_get_int(zb, ZFS_PROP_ZONED); + if (zoneda && !zonedb) + return (1); + if (!zoneda && zonedb) + return (-1); + + gota = (zfs_get_type(za) == ZFS_TYPE_FILESYSTEM); + if (gota) { + verify(zfs_prop_get(za, ZFS_PROP_MOUNTPOINT, mounta, + sizeof (mounta), NULL, NULL, 0, B_FALSE) == 0); + } + gotb = (zfs_get_type(zb) == ZFS_TYPE_FILESYSTEM); + if (gotb) { + verify(zfs_prop_get(zb, ZFS_PROP_MOUNTPOINT, mountb, + sizeof (mountb), NULL, NULL, 0, B_FALSE) == 0); + } + + if (gota && gotb) { + while (*a != '\0' && (*a == *b)) { + a++; + b++; + } + if (*a == *b) + return (0); + if (*a == '\0') + return (-1); + if (*b == '\0') + return (1); + if (*a == '/') + return (-1); + if (*b == '/') + return (1); + return (*a < *b ? -1 : *a > *b); + } + + if (gota) + return (-1); + if (gotb) + return (1); + + /* + * If neither filesystem has a mountpoint, revert to sorting by + * dataset name. + */ + return (strcmp(zfs_get_name(za), zfs_get_name(zb))); +} + +/* + * Return true if path2 is a child of path1 or path2 equals path1 or + * path1 is "/" (path2 is always a child of "/"). + */ +static boolean_t +libzfs_path_contains(const char *path1, const char *path2) +{ + return (strcmp(path1, path2) == 0 || strcmp(path1, "/") == 0 || + (strstr(path2, path1) == path2 && path2[strlen(path1)] == '/')); +} + +/* + * Given a mountpoint specified by idx in the handles array, find the first + * non-descendent of that mountpoint and return its index. Descendant paths + * start with the parent's path. This function relies on the ordering + * enforced by mountpoint_cmp(). 
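+ *
+ * For example, given the sorted mountpoints { /a, /a/b, /a/b/c, /d },
+ * calling this with idx pointing at /a returns the index of /d.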
+ */ +static int +non_descendant_idx(zfs_handle_t **handles, size_t num_handles, int idx) +{ + char parent[ZFS_MAXPROPLEN]; + char child[ZFS_MAXPROPLEN]; + int i; + + verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, parent, + sizeof (parent), NULL, NULL, 0, B_FALSE) == 0); + + for (i = idx + 1; i < num_handles; i++) { + verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, child, + sizeof (child), NULL, NULL, 0, B_FALSE) == 0); + if (!libzfs_path_contains(parent, child)) + break; + } + return (i); +} + +typedef struct mnt_param { + libzfs_handle_t *mnt_hdl; + tpool_t *mnt_tp; + zfs_handle_t **mnt_zhps; /* filesystems to mount */ + size_t mnt_num_handles; + int mnt_idx; /* Index of selected entry to mount */ + zfs_iter_f mnt_func; + void *mnt_data; +} mnt_param_t; + +/* + * Allocate and populate the parameter struct for mount function, and + * schedule mounting of the entry selected by idx. + */ +static void +zfs_dispatch_mount(libzfs_handle_t *hdl, zfs_handle_t **handles, + size_t num_handles, int idx, zfs_iter_f func, void *data, tpool_t *tp) +{ + mnt_param_t *mnt_param = zfs_alloc(hdl, sizeof (mnt_param_t)); + + mnt_param->mnt_hdl = hdl; + mnt_param->mnt_tp = tp; + mnt_param->mnt_zhps = handles; + mnt_param->mnt_num_handles = num_handles; + mnt_param->mnt_idx = idx; + mnt_param->mnt_func = func; + mnt_param->mnt_data = data; + + (void) tpool_dispatch(tp, zfs_mount_task, (void*)mnt_param); +} + +/* + * This is the structure used to keep state of mounting or sharing operations + * during a call to zpool_enable_datasets(). + */ +typedef struct mount_state { + /* + * ms_mntstatus is set to -1 if any mount fails. While multiple threads + * could update this variable concurrently, no synchronization is + * needed as it's only ever set to -1. + */ + int ms_mntstatus; + int ms_mntflags; + const char *ms_mntopts; +} mount_state_t; + +static int +zfs_mount_one(zfs_handle_t *zhp, void *arg) +{ + mount_state_t *ms = arg; + int ret = 0; + + /* + * don't attempt to mount encrypted datasets with + * unloaded keys + */ + if (zfs_prop_get_int(zhp, ZFS_PROP_KEYSTATUS) == + ZFS_KEYSTATUS_UNAVAILABLE) + return (0); + + if (zfs_mount(zhp, ms->ms_mntopts, ms->ms_mntflags) != 0) + ret = ms->ms_mntstatus = -1; + return (ret); +} + +static int +zfs_share_one(zfs_handle_t *zhp, void *arg) +{ + mount_state_t *ms = arg; + int ret = 0; + + if (zfs_share(zhp) != 0) + ret = ms->ms_mntstatus = -1; + return (ret); +} + +/* + * Thread pool function to mount one file system. On completion, it finds and + * schedules its children to be mounted. This depends on the sorting done in + * zfs_foreach_mountpoint(). Note that the degenerate case (chain of entries + * each descending from the previous) will have no parallelism since we always + * have to wait for the parent to finish mounting before we can schedule + * its children. + */ +static void +zfs_mount_task(void *arg) +{ + mnt_param_t *mp = arg; + int idx = mp->mnt_idx; + zfs_handle_t **handles = mp->mnt_zhps; + size_t num_handles = mp->mnt_num_handles; + char mountpoint[ZFS_MAXPROPLEN]; + + verify(zfs_prop_get(handles[idx], ZFS_PROP_MOUNTPOINT, mountpoint, + sizeof (mountpoint), NULL, NULL, 0, B_FALSE) == 0); + + if (mp->mnt_func(handles[idx], mp->mnt_data) != 0) + return; + + /* + * We dispatch tasks to mount filesystems with mountpoints underneath + * this one. We do this by dispatching the next filesystem with a + * descendant mountpoint of the one we just mounted, then skip all of + * its descendants, dispatch the next descendant mountpoint, and so on. 
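+ * For example, after mounting /a this dispatches /a/b (whose task will
+ * in turn handle /a/b/c) and then /a/d, but never /a/b/c directly.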
+ * The non_descendant_idx() function skips over filesystems that are + * descendants of the filesystem we just dispatched. + */ + for (int i = idx + 1; i < num_handles; + i = non_descendant_idx(handles, num_handles, i)) { + char child[ZFS_MAXPROPLEN]; + verify(zfs_prop_get(handles[i], ZFS_PROP_MOUNTPOINT, + child, sizeof (child), NULL, NULL, 0, B_FALSE) == 0); + + if (!libzfs_path_contains(mountpoint, child)) + break; /* not a descendant, return */ + zfs_dispatch_mount(mp->mnt_hdl, handles, num_handles, i, + mp->mnt_func, mp->mnt_data, mp->mnt_tp); + } + free(mp); +} + +/* + * Issue the func callback for each ZFS handle contained in the handles + * array. This function is used to mount all datasets, and so this function + * guarantees that filesystems for parent mountpoints are called before their + * children. As such, before issuing any callbacks, we first sort the array + * of handles by mountpoint. + * + * Callbacks are issued in one of two ways: + * + * 1. Sequentially: If the parallel argument is B_FALSE or the ZFS_SERIAL_MOUNT + * environment variable is set, then we issue callbacks sequentially. + * + * 2. In parallel: If the parallel argument is B_TRUE and the ZFS_SERIAL_MOUNT + * environment variable is not set, then we use a tpool to dispatch threads + * to mount filesystems in parallel. This function dispatches tasks to mount + * the filesystems at the top-level mountpoints, and these tasks in turn + * are responsible for recursively mounting filesystems in their children + * mountpoints. + */ +void +zfs_foreach_mountpoint(libzfs_handle_t *hdl, zfs_handle_t **handles, + size_t num_handles, zfs_iter_f func, void *data, boolean_t parallel) +{ + zoneid_t zoneid = getzoneid(); + + /* + * The ZFS_SERIAL_MOUNT environment variable is an undocumented + * variable that can be used as a convenience to do a/b comparison + * of serial vs. parallel mounting. + */ + boolean_t serial_mount = !parallel || + (getenv("ZFS_SERIAL_MOUNT") != NULL); + + /* + * Sort the datasets by mountpoint. See mountpoint_cmp for details + * of how these are sorted. + */ + qsort(handles, num_handles, sizeof (zfs_handle_t *), mountpoint_cmp); + + if (serial_mount) { + for (int i = 0; i < num_handles; i++) { + func(handles[i], data); + } + return; + } + + /* + * Issue the callback function for each dataset using a parallel + * algorithm that uses a thread pool to manage threads. + */ + tpool_t *tp = tpool_create(1, mount_tp_nthr, 0, NULL); + + /* + * There may be multiple "top level" mountpoints outside of the pool's + * root mountpoint, e.g.: /foo /bar. Dispatch a mount task for each of + * these. + */ + for (int i = 0; i < num_handles; + i = non_descendant_idx(handles, num_handles, i)) { + /* + * Since the mountpoints have been sorted so that the zoned + * filesystems are at the end, a zoned filesystem seen from + * the global zone means that we're done. + */ + if (zoneid == GLOBAL_ZONEID && + zfs_prop_get_int(handles[i], ZFS_PROP_ZONED)) + break; + zfs_dispatch_mount(hdl, handles, num_handles, i, func, data, + tp); + } + + tpool_wait(tp); /* wait for all scheduled mounts to complete */ + tpool_destroy(tp); +} + +/* + * Mount and share all datasets within the given pool. This assumes that no + * datasets within the pool are currently mounted. 
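+ *
+ * Mounting is performed in parallel through a thread pool, while the
+ * subsequent sharing pass runs serially because libshare is not
+ * thread-safe.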
+ */ +#pragma weak zpool_mount_datasets = zpool_enable_datasets +int +zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) +{ + get_all_cb_t cb = { 0 }; + mount_state_t ms = { 0 }; + zfs_handle_t *zfsp; + int ret = 0; + + if ((zfsp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, + ZFS_TYPE_DATASET)) == NULL) + goto out; + + /* + * Gather all non-snapshot datasets within the pool. Start by adding + * the root filesystem for this pool to the list, and then iterate + * over all child filesystems. + */ + libzfs_add_handle(&cb, zfsp); + if (zfs_iter_filesystems(zfsp, zfs_iter_cb, &cb) != 0) + goto out; + + /* + * Mount all filesystems + */ + ms.ms_mntopts = mntopts; + ms.ms_mntflags = flags; + zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, + zfs_mount_one, &ms, B_TRUE); + if (ms.ms_mntstatus != 0) + ret = ms.ms_mntstatus; + + /* + * Share all filesystems that need to be shared. This needs to be + * a separate pass because libshare is not mt-safe, and so we need + * to share serially. + */ + ms.ms_mntstatus = 0; + zfs_foreach_mountpoint(zhp->zpool_hdl, cb.cb_handles, cb.cb_used, + zfs_share_one, &ms, B_FALSE); + if (ms.ms_mntstatus != 0) + ret = ms.ms_mntstatus; + else + zfs_commit_all_shares(); + +out: + for (int i = 0; i < cb.cb_used; i++) + zfs_close(cb.cb_handles[i]); + free(cb.cb_handles); + + return (ret); +} + +static int +mountpoint_compare(const void *a, const void *b) +{ + const char *mounta = *((char **)a); + const char *mountb = *((char **)b); + + return (strcmp(mountb, mounta)); +} + +/* alias for 2002/240 */ +#pragma weak zpool_unmount_datasets = zpool_disable_datasets +/* + * Unshare and unmount all datasets within the given pool. We don't want to + * rely on traversing the DSL to discover the filesystems within the pool, + * because this may be expensive (if not all of them are mounted), and can fail + * arbitrarily (on I/O error, for example). Instead, we walk /proc/self/mounts + * and gather all the filesystems that are currently mounted. + */ +int +zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) +{ + int used, alloc; + struct mnttab entry; + size_t namelen; + char **mountpoints = NULL; + zfs_handle_t **datasets = NULL; + libzfs_handle_t *hdl = zhp->zpool_hdl; + int i; + int ret = -1; + int flags = (force ? MS_FORCE : 0); + + namelen = strlen(zhp->zpool_name); + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + return (ENOENT); + + used = alloc = 0; + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + /* + * Ignore non-ZFS entries. + */ + if (entry.mnt_fstype == NULL || + strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + + /* + * Ignore filesystems not within this pool. + */ + if (entry.mnt_mountp == NULL || + strncmp(entry.mnt_special, zhp->zpool_name, namelen) != 0 || + (entry.mnt_special[namelen] != '/' && + entry.mnt_special[namelen] != '\0')) + continue; + + /* + * At this point we've found a filesystem within our pool. Add + * it to our growing list. 
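+ * The mountpoints and datasets arrays below grow in lockstep, doubling
+ * from an initial capacity of eight entries whenever they fill up.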
+ */ + if (used == alloc) { + if (alloc == 0) { + if ((mountpoints = zfs_alloc(hdl, + 8 * sizeof (void *))) == NULL) + goto out; + + if ((datasets = zfs_alloc(hdl, + 8 * sizeof (void *))) == NULL) + goto out; + + alloc = 8; + } else { + void *ptr; + + if ((ptr = zfs_realloc(hdl, mountpoints, + alloc * sizeof (void *), + alloc * 2 * sizeof (void *))) == NULL) + goto out; + mountpoints = ptr; + + if ((ptr = zfs_realloc(hdl, datasets, + alloc * sizeof (void *), + alloc * 2 * sizeof (void *))) == NULL) + goto out; + datasets = ptr; + + alloc *= 2; + } + } + + if ((mountpoints[used] = zfs_strdup(hdl, + entry.mnt_mountp)) == NULL) + goto out; + + /* + * This is allowed to fail, in case there is some I/O error. It + * is only used to determine if we need to remove the underlying + * mountpoint, so failure is not fatal. + */ + datasets[used] = make_dataset_handle(hdl, entry.mnt_special); + + used++; + } + + /* + * At this point, we have the entire list of filesystems, so sort it by + * mountpoint. + */ + qsort(mountpoints, used, sizeof (char *), mountpoint_compare); + + /* + * Walk through and first unshare everything. + */ + for (i = 0; i < used; i++) { + zfs_share_proto_t *curr_proto; + for (curr_proto = share_all_proto; *curr_proto != PROTO_END; + curr_proto++) { + if (is_shared(mountpoints[i], *curr_proto) && + unshare_one(hdl, mountpoints[i], + mountpoints[i], *curr_proto) != 0) + goto out; + } + } + zfs_commit_all_shares(); + + /* + * Now unmount everything, removing the underlying directories as + * appropriate. + */ + for (i = 0; i < used; i++) { + if (unmount_one(hdl, mountpoints[i], flags) != 0) + goto out; + } + + for (i = 0; i < used; i++) { + if (datasets[i]) + remove_mountpoint(datasets[i]); + } + + ret = 0; +out: + for (i = 0; i < used; i++) { + if (datasets[i]) + zfs_close(datasets[i]); + free(mountpoints[i]); + } + free(datasets); + free(mountpoints); + + return (ret); +} diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c new file mode 100644 index 000000000000..2501965e42ad --- /dev/null +++ b/lib/libzfs/libzfs_pool.c @@ -0,0 +1,4531 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation. 
+ * Copyright (c) 2018, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +static boolean_t zpool_vdev_is_interior(const char *name); + +typedef struct prop_flags { + int create:1; /* Validate property on creation */ + int import:1; /* Validate property on import */ +} prop_flags_t; + +/* + * ==================================================================== + * zpool property functions + * ==================================================================== + */ + +static int +zpool_get_all_props(zpool_handle_t *zhp) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) + return (-1); + + while (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_PROPS, &zc) != 0) { + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } else { + zcmd_free_nvlists(&zc); + return (-1); + } + } + + if (zcmd_read_dst_nvlist(hdl, &zc, &zhp->zpool_props) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + + zcmd_free_nvlists(&zc); + + return (0); +} + +int +zpool_props_refresh(zpool_handle_t *zhp) +{ + nvlist_t *old_props; + + old_props = zhp->zpool_props; + + if (zpool_get_all_props(zhp) != 0) + return (-1); + + nvlist_free(old_props); + return (0); +} + +static const char * +zpool_get_prop_string(zpool_handle_t *zhp, zpool_prop_t prop, + zprop_source_t *src) +{ + nvlist_t *nv, *nvl; + uint64_t ival; + char *value; + zprop_source_t source; + + nvl = zhp->zpool_props; + if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &ival) == 0); + source = ival; + verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); + } else { + source = ZPROP_SRC_DEFAULT; + if ((value = (char *)zpool_prop_default_string(prop)) == NULL) + value = "-"; + } + + if (src) + *src = source; + + return (value); +} + +uint64_t +zpool_get_prop_int(zpool_handle_t *zhp, zpool_prop_t prop, zprop_source_t *src) +{ + nvlist_t *nv, *nvl; + uint64_t value; + zprop_source_t source; + + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) { + /* + * zpool_get_all_props() has most likely failed because + * the pool is faulted, but if all we need is the top level + * vdev's guid then get it from the zhp config nvlist. + */ + if ((prop == ZPOOL_PROP_GUID) && + (nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nv) == 0) && + (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value) + == 0)) { + return (value); + } + return (zpool_prop_default_numeric(prop)); + } + + nvl = zhp->zpool_props; + if (nvlist_lookup_nvlist(nvl, zpool_prop_to_name(prop), &nv) == 0) { + verify(nvlist_lookup_uint64(nv, ZPROP_SOURCE, &value) == 0); + source = value; + verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); + } else { + source = ZPROP_SRC_DEFAULT; + value = zpool_prop_default_numeric(prop); + } + + if (src) + *src = source; + + return (value); +} + +/* + * Map VDEV STATE to printed strings. 
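+ * For example, VDEV_STATE_CANT_OPEN is reported as FAULTED, SPLIT, or
+ * UNAVAIL depending on the auxiliary reason.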
+ */ +const char * +zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) +{ + switch (state) { + case VDEV_STATE_CLOSED: + case VDEV_STATE_OFFLINE: + return (gettext("OFFLINE")); + case VDEV_STATE_REMOVED: + return (gettext("REMOVED")); + case VDEV_STATE_CANT_OPEN: + if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) + return (gettext("FAULTED")); + else if (aux == VDEV_AUX_SPLIT_POOL) + return (gettext("SPLIT")); + else + return (gettext("UNAVAIL")); + case VDEV_STATE_FAULTED: + return (gettext("FAULTED")); + case VDEV_STATE_DEGRADED: + return (gettext("DEGRADED")); + case VDEV_STATE_HEALTHY: + return (gettext("ONLINE")); + + default: + break; + } + + return (gettext("UNKNOWN")); +} + +/* + * Map POOL STATE to printed strings. + */ +const char * +zpool_pool_state_to_name(pool_state_t state) +{ + switch (state) { + default: + break; + case POOL_STATE_ACTIVE: + return (gettext("ACTIVE")); + case POOL_STATE_EXPORTED: + return (gettext("EXPORTED")); + case POOL_STATE_DESTROYED: + return (gettext("DESTROYED")); + case POOL_STATE_SPARE: + return (gettext("SPARE")); + case POOL_STATE_L2CACHE: + return (gettext("L2CACHE")); + case POOL_STATE_UNINITIALIZED: + return (gettext("UNINITIALIZED")); + case POOL_STATE_UNAVAIL: + return (gettext("UNAVAIL")); + case POOL_STATE_POTENTIALLY_ACTIVE: + return (gettext("POTENTIALLY_ACTIVE")); + } + + return (gettext("UNKNOWN")); +} + +/* + * Given a pool handle, return the pool health string ("ONLINE", "DEGRADED", + * "SUSPENDED", etc). + */ +const char * +zpool_get_state_str(zpool_handle_t *zhp) +{ + zpool_errata_t errata; + zpool_status_t status; + nvlist_t *nvroot; + vdev_stat_t *vs; + uint_t vsc; + const char *str; + + status = zpool_get_status(zhp, NULL, &errata); + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + str = gettext("FAULTED"); + } else if (status == ZPOOL_STATUS_IO_FAILURE_WAIT || + status == ZPOOL_STATUS_IO_FAILURE_MMP) { + str = gettext("SUSPENDED"); + } else { + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); + str = zpool_state_to_name(vs->vs_state, vs->vs_aux); + } + return (str); +} + +/* + * Get a zpool property value for 'prop' and return the value in + * a pre-allocated buffer. 
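+ *
+ * When 'literal' is set, numeric properties are returned as raw
+ * integers; otherwise they are humanized, e.g. zfs_nicenum() renders a
+ * size of 1073741824 as "1G".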
+ */ +int +zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, + size_t len, zprop_source_t *srctype, boolean_t literal) +{ + uint64_t intval; + const char *strval; + zprop_source_t src = ZPROP_SRC_NONE; + + if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { + switch (prop) { + case ZPOOL_PROP_NAME: + (void) strlcpy(buf, zpool_get_name(zhp), len); + break; + + case ZPOOL_PROP_HEALTH: + (void) strlcpy(buf, zpool_get_state_str(zhp), len); + break; + + case ZPOOL_PROP_GUID: + intval = zpool_get_prop_int(zhp, prop, &src); + (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); + break; + + case ZPOOL_PROP_ALTROOT: + case ZPOOL_PROP_CACHEFILE: + case ZPOOL_PROP_COMMENT: + if (zhp->zpool_props != NULL || + zpool_get_all_props(zhp) == 0) { + (void) strlcpy(buf, + zpool_get_prop_string(zhp, prop, &src), + len); + break; + } + /* FALLTHROUGH */ + default: + (void) strlcpy(buf, "-", len); + break; + } + + if (srctype != NULL) + *srctype = src; + return (0); + } + + if (zhp->zpool_props == NULL && zpool_get_all_props(zhp) && + prop != ZPOOL_PROP_NAME) + return (-1); + + switch (zpool_prop_get_type(prop)) { + case PROP_TYPE_STRING: + (void) strlcpy(buf, zpool_get_prop_string(zhp, prop, &src), + len); + break; + + case PROP_TYPE_NUMBER: + intval = zpool_get_prop_int(zhp, prop, &src); + + switch (prop) { + case ZPOOL_PROP_SIZE: + case ZPOOL_PROP_ALLOCATED: + case ZPOOL_PROP_FREE: + case ZPOOL_PROP_FREEING: + case ZPOOL_PROP_LEAKED: + case ZPOOL_PROP_ASHIFT: + if (literal) + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + else + (void) zfs_nicenum(intval, buf, len); + break; + + case ZPOOL_PROP_EXPANDSZ: + case ZPOOL_PROP_CHECKPOINT: + if (intval == 0) { + (void) strlcpy(buf, "-", len); + } else if (literal) { + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + } else { + (void) zfs_nicebytes(intval, buf, len); + } + break; + + case ZPOOL_PROP_CAPACITY: + if (literal) { + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + } else { + (void) snprintf(buf, len, "%llu%%", + (u_longlong_t)intval); + } + break; + + case ZPOOL_PROP_FRAGMENTATION: + if (intval == UINT64_MAX) { + (void) strlcpy(buf, "-", len); + } else if (literal) { + (void) snprintf(buf, len, "%llu", + (u_longlong_t)intval); + } else { + (void) snprintf(buf, len, "%llu%%", + (u_longlong_t)intval); + } + break; + + case ZPOOL_PROP_DEDUPRATIO: + if (literal) + (void) snprintf(buf, len, "%llu.%02llu", + (u_longlong_t)(intval / 100), + (u_longlong_t)(intval % 100)); + else + (void) snprintf(buf, len, "%llu.%02llux", + (u_longlong_t)(intval / 100), + (u_longlong_t)(intval % 100)); + break; + + case ZPOOL_PROP_HEALTH: + (void) strlcpy(buf, zpool_get_state_str(zhp), len); + break; + case ZPOOL_PROP_VERSION: + if (intval >= SPA_VERSION_FEATURES) { + (void) snprintf(buf, len, "-"); + break; + } + /* FALLTHROUGH */ + default: + (void) snprintf(buf, len, "%llu", (u_longlong_t)intval); + } + break; + + case PROP_TYPE_INDEX: + intval = zpool_get_prop_int(zhp, prop, &src); + if (zpool_prop_index_to_string(prop, intval, &strval) + != 0) + return (-1); + (void) strlcpy(buf, strval, len); + break; + + default: + abort(); + } + + if (srctype) + *srctype = src; + + return (0); +} + +/* + * Check if the bootfs name has the same pool name as it is set to. + * Assuming bootfs is a valid dataset name. 
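The 'literal' flag in zpool_get_prop() above selects between raw integers and the humanized renderings (zfs_nicenum(), percentages, "x" ratios). A sketch of the difference, sizing the buffer with ZPOOL_MAXPROPLEN from libzfs.h:

        /* Sketch: the same property rendered both ways. */
        char buf[ZPOOL_MAXPROPLEN];

        if (zpool_get_prop(zhp, ZPOOL_PROP_CAPACITY, buf, sizeof (buf),
            NULL, B_FALSE) == 0)
                (void) printf("capacity: %s\n", buf);   /* e.g. "42%" */

        if (zpool_get_prop(zhp, ZPOOL_PROP_CAPACITY, buf, sizeof (buf),
            NULL, B_TRUE) == 0)
                (void) printf("capacity: %s\n", buf);   /* e.g. "42" */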
+ */ +static boolean_t +bootfs_name_valid(const char *pool, const char *bootfs) +{ + int len = strlen(pool); + if (bootfs[0] == '\0') + return (B_TRUE); + + if (!zfs_name_valid(bootfs, ZFS_TYPE_FILESYSTEM|ZFS_TYPE_SNAPSHOT)) + return (B_FALSE); + + if (strncmp(pool, bootfs, len) == 0 && + (bootfs[len] == '/' || bootfs[len] == '\0')) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Given an nvlist of zpool properties to be set, validate that they are + * correct, and parse any numeric properties (index, boolean, etc) if they are + * specified as strings. + */ +static nvlist_t * +zpool_valid_proplist(libzfs_handle_t *hdl, const char *poolname, + nvlist_t *props, uint64_t version, prop_flags_t flags, char *errbuf) +{ + nvpair_t *elem; + nvlist_t *retprops; + zpool_prop_t prop; + char *strval; + uint64_t intval; + char *slash, *check; + struct stat64 statbuf; + zpool_handle_t *zhp; + + if (nvlist_alloc(&retprops, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + const char *propname = nvpair_name(elem); + + prop = zpool_name_to_prop(propname); + if (prop == ZPOOL_PROP_INVAL && zpool_prop_feature(propname)) { + int err; + char *fname = strchr(propname, '@') + 1; + + err = zfeature_lookup_name(fname, NULL); + if (err != 0) { + ASSERT3U(err, ==, ENOENT); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid feature '%s'"), fname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvpair_type(elem) != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, ZFS_FEATURE_ENABLED) != 0 && + strcmp(strval, ZFS_FEATURE_DISABLED) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set to " + "'enabled' or 'disabled'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (!flags.create && + strcmp(strval, ZFS_FEATURE_DISABLED) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set to " + "'disabled' at creation time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (nvlist_add_uint64(retprops, propname, 0) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + /* + * Make sure this property is valid and applies to this type. + */ + if (prop == ZPOOL_PROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (zpool_prop_readonly(prop)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " + "is readonly"), propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, errbuf); + goto error; + } + + if (!flags.create && zpool_prop_setonce(prop)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set at " + "creation time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (zprop_parse_value(hdl, elem, prop, ZFS_TYPE_POOL, retprops, + &strval, &intval, errbuf) != 0) + goto error; + + /* + * Perform additional checking for specific properties. 
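For orientation, this is the shape of a caller-built props nvlist that passes the feature@ validation above: values must be the strings "enabled" or "disabled", and "disabled" is only accepted at create time. A sketch ("async_destroy" is just an illustrative feature name):

        /* Sketch: props accepted by zpool_valid_proplist(). */
        nvlist_t *props = fnvlist_alloc();

        fnvlist_add_string(props, "feature@async_destroy", "enabled");
        fnvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_COMMENT),
            "lab machine");
        /* ... hand to the zpool_create()/zpool_set_prop() paths ... */
        fnvlist_free(props);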
+ */ + switch (prop) { + case ZPOOL_PROP_VERSION: + if (intval < version || + !SPA_VERSION_IS_SUPPORTED(intval)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' number %llu is invalid."), + propname, (u_longlong_t)intval); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + goto error; + } + break; + + case ZPOOL_PROP_ASHIFT: + if (intval != 0 && + (intval < ASHIFT_MIN || intval > ASHIFT_MAX)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' number %llu is invalid, only " + "values between %" PRId32 " and " + "%" PRId32 " are allowed."), + propname, (u_longlong_t)intval, ASHIFT_MIN, ASHIFT_MAX); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + + case ZPOOL_PROP_BOOTFS: + if (flags.create || flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' cannot be set at creation " + "or import time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (version < SPA_VERSION_BOOTFS) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to support " + "'%s' property"), propname); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + goto error; + } + + /* + * The bootfs property value must be a dataset name, and + * that dataset must reside in the pool it is being set on. + */ + if (!bootfs_name_valid(poolname, strval)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' " + "is an invalid name"), strval); + (void) zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto error; + } + + if ((zhp = zpool_open_canfail(hdl, poolname)) == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "could not open pool '%s'"), poolname); + (void) zfs_error(hdl, EZFS_OPENFAILED, errbuf); + goto error; + } + zpool_close(zhp); + break; + + case ZPOOL_PROP_ALTROOT: + if (!flags.create && !flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set during pool " + "creation or import"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (strval[0] != '/') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bad alternate root '%s'"), strval); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + break; + + case ZPOOL_PROP_CACHEFILE: + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' must be empty, an " + "absolute path, or 'none'"), propname); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + + slash = strrchr(strval, '/'); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is not a valid file"), strval); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + + *slash = '\0'; + + if (strval[0] != '\0' && + (stat64(strval, &statbuf) != 0 || + !S_ISDIR(statbuf.st_mode))) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is not a valid directory"), + strval); + (void) zfs_error(hdl, EZFS_BADPATH, errbuf); + goto error; + } + + *slash = '/'; + break; + + case ZPOOL_PROP_COMMENT: + for (check = strval; *check != '\0'; check++) { + if (!isprint(*check)) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "comment may only have printable " + "characters")); + (void) zfs_error(hdl, EZFS_BADPROP, + errbuf); + goto error; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "comment must not exceed %d characters"), + ZPROP_MAX_COMMENT); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + 
case ZPOOL_PROP_READONLY: + if (!flags.import) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property '%s' can only be set at " + "import time"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + case ZPOOL_PROP_MULTIHOST: + if (get_system_hostid() == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "requires a non-zero system hostid")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + break; + case ZPOOL_PROP_DEDUPDITTO: + printf("Note: property '%s' no longer has " + "any effect\n", propname); + break; + + default: + break; + } + } + + return (retprops); +error: + nvlist_free(retprops); + return (NULL); +} + +/* + * Set zpool property : propname=propval. + */ +int +zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) +{ + zfs_cmd_t zc = {"\0"}; + int ret = -1; + char errbuf[1024]; + nvlist_t *nvl = NULL; + nvlist_t *realprops; + uint64_t version; + prop_flags_t flags = { 0 }; + + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), + zhp->zpool_name); + + if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) + return (no_memory(zhp->zpool_hdl)); + + if (nvlist_add_string(nvl, propname, propval) != 0) { + nvlist_free(nvl); + return (no_memory(zhp->zpool_hdl)); + } + + version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if ((realprops = zpool_valid_proplist(zhp->zpool_hdl, + zhp->zpool_name, nvl, version, flags, errbuf)) == NULL) { + nvlist_free(nvl); + return (-1); + } + + nvlist_free(nvl); + nvl = realprops; + + /* + * Execute the corresponding ioctl() to set this property. + */ + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, nvl) != 0) { + nvlist_free(nvl); + return (-1); + } + + ret = zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_SET_PROPS, &zc); + + zcmd_free_nvlists(&zc); + nvlist_free(nvl); + + if (ret) + (void) zpool_standard_error(zhp->zpool_hdl, errno, errbuf); + else + (void) zpool_props_refresh(zhp); + + return (ret); +} + +int +zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + zprop_list_t *entry; + char buf[ZFS_MAXPROPLEN]; + nvlist_t *features = NULL; + nvpair_t *nvp; + zprop_list_t **last; + boolean_t firstexpand = (NULL == *plp); + int i; + + if (zprop_expand_list(hdl, plp, ZFS_TYPE_POOL) != 0) + return (-1); + + last = plp; + while (*last != NULL) + last = &(*last)->pl_next; + + if ((*plp)->pl_all) + features = zpool_get_features(zhp); + + if ((*plp)->pl_all && firstexpand) { + for (i = 0; i < SPA_FEATURES; i++) { + zprop_list_t *entry = zfs_alloc(hdl, + sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = zfs_asprintf(hdl, "feature@%s", + spa_feature_table[i].fi_uname); + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + } + + /* add any unsupported features */ + for (nvp = nvlist_next_nvpair(features, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(features, nvp)) { + char *propname; + boolean_t found; + zprop_list_t *entry; + + if (zfeature_is_supported(nvpair_name(nvp))) + continue; + + propname = zfs_asprintf(hdl, "unsupported@%s", + nvpair_name(nvp)); + + /* + * Before adding the property to the list make sure that no + * other pool already added the same property. 
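zpool_set_prop() above funnels a single name/value pair through the same validator, so callers see the same diagnostics as at create or import time. A minimal sketch (hdl is assumed to be the libzfs_handle_t the pool was opened with):

        /* Sketch: set one mutable property and report the recorded error text. */
        if (zpool_set_prop(zhp, "comment", "rack 12, bay 3") != 0)
                (void) fprintf(stderr, "%s\n", libzfs_error_description(hdl));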
+ */ + found = B_FALSE; + entry = *plp; + while (entry != NULL) { + if (entry->pl_user_prop != NULL && + strcmp(propname, entry->pl_user_prop) == 0) { + found = B_TRUE; + break; + } + entry = entry->pl_next; + } + if (found) { + free(propname); + continue; + } + + entry = zfs_alloc(hdl, sizeof (zprop_list_t)); + entry->pl_prop = ZPROP_INVAL; + entry->pl_user_prop = propname; + entry->pl_width = strlen(entry->pl_user_prop); + entry->pl_all = B_TRUE; + + *last = entry; + last = &entry->pl_next; + } + + for (entry = *plp; entry != NULL; entry = entry->pl_next) { + + if (entry->pl_fixed) + continue; + + if (entry->pl_prop != ZPROP_INVAL && + zpool_get_prop(zhp, entry->pl_prop, buf, sizeof (buf), + NULL, B_FALSE) == 0) { + if (strlen(buf) > entry->pl_width) + entry->pl_width = strlen(buf); + } + } + + return (0); +} + +/* + * Get the state for the given feature on the given ZFS pool. + */ +int +zpool_prop_get_feature(zpool_handle_t *zhp, const char *propname, char *buf, + size_t len) +{ + uint64_t refcount; + boolean_t found = B_FALSE; + nvlist_t *features = zpool_get_features(zhp); + boolean_t supported; + const char *feature = strchr(propname, '@') + 1; + + supported = zpool_prop_feature(propname); + ASSERT(supported || zpool_prop_unsupported(propname)); + + /* + * Convert from feature name to feature guid. This conversion is + * unnecessary for unsupported@... properties because they already + * use guids. + */ + if (supported) { + int ret; + spa_feature_t fid; + + ret = zfeature_lookup_name(feature, &fid); + if (ret != 0) { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + feature = spa_feature_table[fid].fi_guid; + } + + if (nvlist_lookup_uint64(features, feature, &refcount) == 0) + found = B_TRUE; + + if (supported) { + if (!found) { + (void) strlcpy(buf, ZFS_FEATURE_DISABLED, len); + } else { + if (refcount == 0) + (void) strlcpy(buf, ZFS_FEATURE_ENABLED, len); + else + (void) strlcpy(buf, ZFS_FEATURE_ACTIVE, len); + } + } else { + if (found) { + if (refcount == 0) { + (void) strcpy(buf, ZFS_UNSUPPORTED_INACTIVE); + } else { + (void) strcpy(buf, ZFS_UNSUPPORTED_READONLY); + } + } else { + (void) strlcpy(buf, "-", len); + return (ENOTSUP); + } + } + + return (0); +} + +/* + * Validate the given pool name, optionally putting an extended error message in + * 'buf'. + */ +boolean_t +zpool_name_valid(libzfs_handle_t *hdl, boolean_t isopen, const char *pool) +{ + namecheck_err_t why; + char what; + int ret; + + ret = pool_namecheck(pool, &why, &what); + + /* + * The rules for reserved pool names were extended at a later point. + * But we need to support users with existing pools that may now be + * invalid. So we only check for this expanded set of names during a + * create (or import), and only in userland. 
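zpool_prop_get_feature() above maps a feature refcount onto the three user-visible states. A query sketch ("async_destroy" is an illustrative feature name):

        /* Sketch: state lands in buf as "disabled", "enabled", or "active"
         * for a supported feature, per the refcount logic above. */
        char state[ZPOOL_MAXPROPLEN];

        if (zpool_prop_get_feature(zhp, "feature@async_destroy", state,
            sizeof (state)) == 0)
                (void) printf("async_destroy: %s\n", state);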
+ */ + if (ret == 0 && !isopen && + (strncmp(pool, "mirror", 6) == 0 || + strncmp(pool, "raidz", 5) == 0 || + strncmp(pool, "spare", 5) == 0 || + strcmp(pool, "log") == 0)) { + if (hdl != NULL) + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "name is reserved")); + return (B_FALSE); + } + + + if (ret != 0) { + if (hdl != NULL) { + switch (why) { + case NAME_ERR_TOOLONG: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "name is too long")); + break; + + case NAME_ERR_INVALCHAR: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "invalid character " + "'%c' in pool name"), what); + break; + + case NAME_ERR_NOLETTER: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name must begin with a letter")); + break; + + case NAME_ERR_RESERVED: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "name is reserved")); + break; + + case NAME_ERR_DISKLIKE: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool name is reserved")); + break; + + case NAME_ERR_LEADING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "leading slash in name")); + break; + + case NAME_ERR_EMPTY_COMPONENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "empty component in name")); + break; + + case NAME_ERR_TRAILING_SLASH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "trailing slash in name")); + break; + + case NAME_ERR_MULTIPLE_DELIMITERS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "multiple '@' and/or '#' delimiters in " + "name")); + break; + + case NAME_ERR_NO_AT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "permission set is missing '@'")); + break; + + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "(%d) not defined"), why); + break; + } + } + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Open a handle to the given pool, even if the pool is currently in the FAULTED + * state. + */ +zpool_handle_t * +zpool_open_canfail(libzfs_handle_t *hdl, const char *pool) +{ + zpool_handle_t *zhp; + boolean_t missing; + + /* + * Make sure the pool name is valid. + */ + if (!zpool_name_valid(hdl, B_TRUE, pool)) { + (void) zfs_error_fmt(hdl, EZFS_INVALIDNAME, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), + pool); + return (NULL); + } + + if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) + return (NULL); + + zhp->zpool_hdl = hdl; + (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); + + if (zpool_refresh_stats(zhp, &missing) != 0) { + zpool_close(zhp); + return (NULL); + } + + if (missing) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "no such pool")); + (void) zfs_error_fmt(hdl, EZFS_NOENT, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), pool); + zpool_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Like the above, but silent on error. Used when iterating over pools (because + * the configuration cache may be out of date). + */ +int +zpool_open_silent(libzfs_handle_t *hdl, const char *pool, zpool_handle_t **ret) +{ + zpool_handle_t *zhp; + boolean_t missing; + + if ((zhp = zfs_alloc(hdl, sizeof (zpool_handle_t))) == NULL) + return (-1); + + zhp->zpool_hdl = hdl; + (void) strlcpy(zhp->zpool_name, pool, sizeof (zhp->zpool_name)); + + if (zpool_refresh_stats(zhp, &missing) != 0) { + zpool_close(zhp); + return (-1); + } + + if (missing) { + zpool_close(zhp); + *ret = NULL; + return (0); + } + + *ret = zhp; + return (0); +} + +/* + * Similar to zpool_open_canfail(), but refuses to open pools in the faulted + * state. 
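Because the reserved-name check only runs when isopen is B_FALSE, a pool created before the rules were extended can still be opened even though the same name would be rejected today. A pre-flight sketch using the validator above:

        /* Sketch: "mirror2" trips the reserved-prefix check at create time;
         * the aux reason is recorded in hdl by zfs_error_aux(). */
        if (!zpool_name_valid(hdl, B_FALSE, "mirror2"))
                (void) fprintf(stderr, "invalid pool name\n");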
+ */ +zpool_handle_t * +zpool_open(libzfs_handle_t *hdl, const char *pool) +{ + zpool_handle_t *zhp; + + if ((zhp = zpool_open_canfail(hdl, pool)) == NULL) + return (NULL); + + if (zhp->zpool_state == POOL_STATE_UNAVAIL) { + (void) zfs_error_fmt(hdl, EZFS_POOLUNAVAIL, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), zhp->zpool_name); + zpool_close(zhp); + return (NULL); + } + + return (zhp); +} + +/* + * Close the handle. Simply frees the memory associated with the handle. + */ +void +zpool_close(zpool_handle_t *zhp) +{ + nvlist_free(zhp->zpool_config); + nvlist_free(zhp->zpool_old_config); + nvlist_free(zhp->zpool_props); + free(zhp); +} + +/* + * Return the name of the pool. + */ +const char * +zpool_get_name(zpool_handle_t *zhp) +{ + return (zhp->zpool_name); +} + + +/* + * Return the state of the pool (ACTIVE or UNAVAILABLE) + */ +int +zpool_get_state(zpool_handle_t *zhp) +{ + return (zhp->zpool_state); +} + +/* + * Check if vdev list contains a special vdev + */ +static boolean_t +zpool_has_special_vdev(nvlist_t *nvroot) +{ + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (uint_t c = 0; c < children; c++) { + char *bias; + + if (nvlist_lookup_string(child[c], + ZPOOL_CONFIG_ALLOCATION_BIAS, &bias) == 0 && + strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0) { + return (B_TRUE); + } + } + } + return (B_FALSE); +} + +/* + * Create the named pool, using the provided vdev list. It is assumed + * that the consumer has already validated the contents of the nvlist, so we + * don't have to worry about error semantics. + */ +int +zpool_create(libzfs_handle_t *hdl, const char *pool, nvlist_t *nvroot, + nvlist_t *props, nvlist_t *fsprops) +{ + zfs_cmd_t zc = {"\0"}; + nvlist_t *zc_fsprops = NULL; + nvlist_t *zc_props = NULL; + nvlist_t *hidden_args = NULL; + uint8_t *wkeydata = NULL; + uint_t wkeylen = 0; + char msg[1024]; + int ret = -1; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot create '%s'"), pool); + + if (!zpool_name_valid(hdl, B_FALSE, pool)) + return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); + + if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) + return (-1); + + if (props) { + prop_flags_t flags = { .create = B_TRUE, .import = B_FALSE }; + + if ((zc_props = zpool_valid_proplist(hdl, pool, props, + SPA_VERSION_1, flags, msg)) == NULL) { + goto create_failed; + } + } + + if (fsprops) { + uint64_t zoned; + char *zonestr; + + zoned = ((nvlist_lookup_string(fsprops, + zfs_prop_to_name(ZFS_PROP_ZONED), &zonestr) == 0) && + strcmp(zonestr, "on") == 0); + + if ((zc_fsprops = zfs_valid_proplist(hdl, ZFS_TYPE_FILESYSTEM, + fsprops, zoned, NULL, NULL, B_TRUE, msg)) == NULL) { + goto create_failed; + } + + if (nvlist_exists(zc_fsprops, + zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)) && + !zpool_has_special_vdev(nvroot)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "%s property requires a special vdev"), + zfs_prop_to_name(ZFS_PROP_SPECIAL_SMALL_BLOCKS)); + (void) zfs_error(hdl, EZFS_BADPROP, msg); + goto create_failed; + } + + if (!zc_props && + (nvlist_alloc(&zc_props, NV_UNIQUE_NAME, 0) != 0)) { + goto create_failed; + } + if (zfs_crypto_create(hdl, NULL, zc_fsprops, props, B_TRUE, + &wkeydata, &wkeylen) != 0) { + zfs_error(hdl, EZFS_CRYPTOFAILED, msg); + goto create_failed; + } + if (nvlist_add_nvlist(zc_props, + ZPOOL_ROOTFS_PROPS, zc_fsprops) != 0) { + goto create_failed; + } + if (wkeydata != NULL) { + if (nvlist_alloc(&hidden_args, NV_UNIQUE_NAME, 0) != 0) + goto create_failed; + + 
if (nvlist_add_uint8_array(hidden_args, "wkeydata", + wkeydata, wkeylen) != 0) + goto create_failed; + + if (nvlist_add_nvlist(zc_props, ZPOOL_HIDDEN_ARGS, + hidden_args) != 0) + goto create_failed; + } + } + + if (zc_props && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) + goto create_failed; + + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + + if ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_CREATE, &zc)) != 0) { + + zcmd_free_nvlists(&zc); + nvlist_free(zc_props); + nvlist_free(zc_fsprops); + nvlist_free(hidden_args); + if (wkeydata != NULL) + free(wkeydata); + + switch (errno) { + case EBUSY: + /* + * This can happen if the user has specified the same + * device multiple times. We can't reliably detect this + * until we try to add it and see we already have a + * label. This can also happen under if the device is + * part of an active md or lvm device. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more vdevs refer to the same device, or " + "one of\nthe devices is part of an active md or " + "lvm device")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + case ERANGE: + /* + * This happens if the record size is smaller or larger + * than the allowed size range, or not a power of 2. + * + * NOTE: although zfs_valid_proplist is called earlier, + * this case may have slipped through since the + * pool does not exist yet and it is therefore + * impossible to read properties e.g. max blocksize + * from the pool. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "record size invalid")); + return (zfs_error(hdl, EZFS_BADPROP, msg)); + + case EOVERFLOW: + /* + * This occurs when one of the devices is below + * SPA_MINDEVSIZE. Unfortunately, we can't detect which + * device was the problem device since there's no + * reliable way to determine device size from userland. + */ + { + char buf[64]; + + zfs_nicebytes(SPA_MINDEVSIZE, buf, + sizeof (buf)); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is less than the " + "minimum size (%s)"), buf); + } + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + case ENOSPC: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is out of space")); + return (zfs_error(hdl, EZFS_BADDEV, msg)); + + default: + return (zpool_standard_error(hdl, errno, msg)); + } + } + +create_failed: + zcmd_free_nvlists(&zc); + nvlist_free(zc_props); + nvlist_free(zc_fsprops); + nvlist_free(hidden_args); + if (wkeydata != NULL) + free(wkeydata); + return (ret); +} + +/* + * Destroy the given pool. It is up to the caller to ensure that there are no + * datasets left in the pool. 
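A create-time sketch tying the pieces together; building nvroot (the vdev tree nvlist) is out of scope here, and in practice the zpool command constructs it from its command line before calling in:

        /* Sketch: create a pool from a prebuilt vdev tree 'nvroot'. */
        nvlist_t *fsprops = fnvlist_alloc();
        fnvlist_add_string(fsprops, "compression", "on");

        if (zpool_create(hdl, "tank", nvroot, NULL, fsprops) != 0)
                (void) fprintf(stderr, "%s\n", libzfs_error_description(hdl));
        fnvlist_free(fsprops);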
+ */ +int +zpool_destroy(zpool_handle_t *zhp, const char *log_str) +{ + zfs_cmd_t zc = {"\0"}; + zfs_handle_t *zfp = NULL; + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + + if (zhp->zpool_state == POOL_STATE_ACTIVE && + (zfp = zfs_open(hdl, zhp->zpool_name, ZFS_TYPE_FILESYSTEM)) == NULL) + return (-1); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_history = (uint64_t)(uintptr_t)log_str; + + if (zfs_ioctl(hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot destroy '%s'"), zhp->zpool_name); + + if (errno == EROFS) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is read only")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + } else { + (void) zpool_standard_error(hdl, errno, msg); + } + + if (zfp) + zfs_close(zfp); + return (-1); + } + + if (zfp) { + remove_mountpoint(zfp); + zfs_close(zfp); + } + + return (0); +} + +/* + * Create a checkpoint in the given pool. + */ +int +zpool_checkpoint(zpool_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + int error; + + error = lzc_pool_checkpoint(zhp->zpool_name); + if (error != 0) { + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot checkpoint '%s'"), zhp->zpool_name); + (void) zpool_standard_error(hdl, error, msg); + return (-1); + } + + return (0); +} + +/* + * Discard the checkpoint from the given pool. + */ +int +zpool_discard_checkpoint(zpool_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + int error; + + error = lzc_pool_checkpoint_discard(zhp->zpool_name); + if (error != 0) { + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot discard checkpoint in '%s'"), zhp->zpool_name); + (void) zpool_standard_error(hdl, error, msg); + return (-1); + } + + return (0); +} + +/* + * Add the given vdevs to the pool. The caller must have already performed the + * necessary verification to ensure that the vdev specification is well-formed. + */ +int +zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) +{ + zfs_cmd_t zc = {"\0"}; + int ret; + libzfs_handle_t *hdl = zhp->zpool_hdl; + char msg[1024]; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot add to '%s'"), zhp->zpool_name); + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < + SPA_VERSION_SPARES && + nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " + "upgraded to add hot spares")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + + if (zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL) < + SPA_VERSION_L2CACHE && + nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool must be " + "upgraded to add cache devices")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + + if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) + return (-1); + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_ADD, &zc) != 0) { + switch (errno) { + case EBUSY: + /* + * This can happen if the user has specified the same + * device multiple times. We can't reliably detect this + * until we try to add it and see we already have a + * label. 
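The checkpoint pair above is a thin wrapper over lzc_pool_checkpoint() and lzc_pool_checkpoint_discard(); a typical bracketing pattern (sketch):

        /* Sketch: checkpoint around a risky configuration change. Both calls
         * return 0 on success and report via zpool_standard_error() otherwise. */
        if (zpool_checkpoint(zhp) == 0) {
                /* ... apply and verify the change ... */
                (void) zpool_discard_checkpoint(zhp);
        }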
+ */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more vdevs refer to the same device")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; a pool with removing/removed " + "vdevs does not support adding raidz vdevs")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case EOVERFLOW: + /* + * This occurs when one of the devices is below + * SPA_MINDEVSIZE. Unfortunately, we can't detect which + * device was the problem device since there's no + * reliable way to determine device size from userland. + */ + { + char buf[64]; + + zfs_nicebytes(SPA_MINDEVSIZE, buf, + sizeof (buf)); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "device is less than the minimum " + "size (%s)"), buf); + } + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to add these vdevs")); + (void) zfs_error(hdl, EZFS_BADVERSION, msg); + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + + ret = -1; + } else { + ret = 0; + } + + zcmd_free_nvlists(&zc); + + return (ret); +} + +/* + * Exports the pool from the system. The caller must ensure that there are no + * mounted datasets in the pool. + */ +static int +zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce, + const char *log_str) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot export '%s'"), zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = force; + zc.zc_guid = hardforce; + zc.zc_history = (uint64_t)(uintptr_t)log_str; + + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { + switch (errno) { + case EXDEV: + zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "use '-f' to override the following errors:\n" + "'%s' has an active shared spare which could be" + " used by other pools once '%s' is exported."), + zhp->zpool_name, zhp->zpool_name); + return (zfs_error(zhp->zpool_hdl, EZFS_ACTIVE_SPARE, + msg)); + default: + return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, + msg)); + } + } + + return (0); +} + +int +zpool_export(zpool_handle_t *zhp, boolean_t force, const char *log_str) +{ + return (zpool_export_common(zhp, force, B_FALSE, log_str)); +} + +int +zpool_export_force(zpool_handle_t *zhp, const char *log_str) +{ + return (zpool_export_common(zhp, B_TRUE, B_TRUE, log_str)); +} + +static void +zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, + nvlist_t *config) +{ + nvlist_t *nv = NULL; + uint64_t rewindto; + int64_t loss = -1; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr || config == NULL) + return; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0) { + return; + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + return; + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, "%c", &t) != 0) { + if (dryrun) { + (void) printf(dgettext(TEXT_DOMAIN, + "Would be able to return %s " + "to its state as of %s.\n"), + name, timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "Pool %s returned to its state as of %s.\n"), + name, timestr); + } + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? 
"Would discard" : "Discarded", + ((longlong_t)loss + 30) / 60); + (void) printf(dgettext(TEXT_DOMAIN, + "minutes of transactions.\n")); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", + (longlong_t)loss); + (void) printf(dgettext(TEXT_DOMAIN, + "seconds of transactions.\n")); + } + } +} + +void +zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, + nvlist_t *config) +{ + nvlist_t *nv = NULL; + int64_t loss = -1; + uint64_t edata = UINT64_MAX; + uint64_t rewindto; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr) + return; + + if (reason >= 0) + (void) printf(dgettext(TEXT_DOMAIN, "action: ")); + else + (void) printf(dgettext(TEXT_DOMAIN, "\t")); + + /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nv) != 0 || + nvlist_lookup_nvlist(nv, ZPOOL_CONFIG_REWIND_INFO, &nv) != 0 || + nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + goto no_info; + + (void) nvlist_lookup_int64(nv, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + &edata); + + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery is possible, but will result in some data loss.\n")); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, "%c", &t) != 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReturning the pool to its state as of %s\n" + "\tshould correct the problem. "), + timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReverting the pool to an earlier state " + "should correct the problem.\n\t")); + } + + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld minutes of data\n" + "\tmust be discarded, irreversibly. "), + ((longlong_t)loss + 30) / 60); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld seconds of data\n" + "\tmust be discarded, irreversibly. "), + (longlong_t)loss); + } + if (edata != 0 && edata != UINT64_MAX) { + if (edata == 1) { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, at least\n" + "\tone persistent user-data error will remain. ")); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, several\n" + "\tpersistent user-data errors will remain. ")); + } + } + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), + reason >= 0 ? "clear" : "import", name); + + (void) printf(dgettext(TEXT_DOMAIN, + "A scrub of the pool\n" + "\tis strongly recommended after recovery.\n")); + return; + +no_info: + (void) printf(dgettext(TEXT_DOMAIN, + "Destroy and re-create the pool from\n\ta backup source.\n")); +} + +/* + * zpool_import() is a contracted interface. Should be kept the same + * if possible. + * + * Applications should use zpool_import_props() to import a pool with + * new properties value to be set. 
+ */ +int +zpool_import(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, + char *altroot) +{ + nvlist_t *props = NULL; + int ret; + + if (altroot != NULL) { + if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0) { + return (zfs_error_fmt(hdl, EZFS_NOMEM, + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + newname)); + } + + if (nvlist_add_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), altroot) != 0 || + nvlist_add_string(props, + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), "none") != 0) { + nvlist_free(props); + return (zfs_error_fmt(hdl, EZFS_NOMEM, + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + newname)); + } + } + + ret = zpool_import_props(hdl, config, newname, props, + ZFS_IMPORT_NORMAL); + nvlist_free(props); + return (ret); +} + +static void +print_vdev_tree(libzfs_handle_t *hdl, const char *name, nvlist_t *nv, + int indent) +{ + nvlist_t **child; + uint_t c, children; + char *vname; + uint64_t is_log = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, + &is_log); + + if (name != NULL) + (void) printf("\t%*s%s%s\n", indent, "", name, + is_log ? " [log]" : ""); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + vname = zpool_vdev_name(hdl, NULL, child[c], VDEV_NAME_TYPE_ID); + print_vdev_tree(hdl, vname, child[c], indent + 2); + free(vname); + } +} + +void +zpool_print_unsup_feat(nvlist_t *config) +{ + nvlist_t *nvinfo, *unsup_feat; + nvpair_t *nvp; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == + 0); + verify(nvlist_lookup_nvlist(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT, + &unsup_feat) == 0); + + for (nvp = nvlist_next_nvpair(unsup_feat, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(unsup_feat, nvp)) { + char *desc; + + verify(nvpair_type(nvp) == DATA_TYPE_STRING); + verify(nvpair_value_string(nvp, &desc) == 0); + + if (strlen(desc) > 0) + (void) printf("\t%s (%s)\n", nvpair_name(nvp), desc); + else + (void) printf("\t%s\n", nvpair_name(nvp)); + } +} + +/* + * Import the given pool using the known configuration and a list of + * properties to be set. The configuration should have come from + * zpool_find_import(). The 'newname' parameters control whether the pool + * is imported with a different name. 
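A sketch of the property-carrying variant described above; readonly is accepted here because zpool_valid_proplist() runs with flags.import set on this path:

        /* Sketch: import under a new name, read-only. */
        nvlist_t *props = fnvlist_alloc();
        fnvlist_add_string(props, zpool_prop_to_name(ZPOOL_PROP_READONLY), "on");

        int err = zpool_import_props(hdl, config, "tank_ro", props,
            ZFS_IMPORT_NORMAL);
        fnvlist_free(props);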
+ */ +int +zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, + nvlist_t *props, int flags) +{ + zfs_cmd_t zc = {"\0"}; + zpool_load_policy_t policy; + nvlist_t *nv = NULL; + nvlist_t *nvinfo = NULL; + nvlist_t *missing = NULL; + char *thename; + char *origname; + int ret; + int error = 0; + char errbuf[1024]; + + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &origname) == 0); + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot import pool '%s'"), origname); + + if (newname != NULL) { + if (!zpool_name_valid(hdl, B_FALSE, newname)) + return (zfs_error_fmt(hdl, EZFS_INVALIDNAME, + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + newname)); + thename = (char *)newname; + } else { + thename = origname; + } + + if (props != NULL) { + uint64_t version; + prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + + if ((props = zpool_valid_proplist(hdl, origname, + props, version, flags, errbuf)) == NULL) + return (-1); + if (zcmd_write_src_nvlist(hdl, &zc, props) != 0) { + nvlist_free(props); + return (-1); + } + nvlist_free(props); + } + + (void) strlcpy(zc.zc_name, thename, sizeof (zc.zc_name)); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &zc.zc_guid) == 0); + + if (zcmd_write_conf_nvlist(hdl, &zc, config) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + if (zcmd_alloc_dst_nvlist(hdl, &zc, zc.zc_nvlist_conf_size * 2) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + + zc.zc_cookie = flags; + while ((ret = zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + if (ret != 0) + error = errno; + + (void) zcmd_read_dst_nvlist(hdl, &zc, &nv); + + zcmd_free_nvlists(&zc); + + zpool_get_load_policy(config, &policy); + + if (error) { + char desc[1024]; + char aux[256]; + + /* + * Dry-run failed, but we print out what success + * looks like if we found a best txg + */ + if (policy.zlp_rewind & ZPOOL_TRY_REWIND) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + B_TRUE, nv); + nvlist_free(nv); + return (-1); + } + + if (newname == NULL) + (void) snprintf(desc, sizeof (desc), + dgettext(TEXT_DOMAIN, "cannot import '%s'"), + thename); + else + (void) snprintf(desc, sizeof (desc), + dgettext(TEXT_DOMAIN, "cannot import '%s' as '%s'"), + origname, thename); + + switch (error) { + case ENOTSUP: + if (nv != NULL && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_exists(nvinfo, ZPOOL_CONFIG_UNSUP_FEAT)) { + (void) printf(dgettext(TEXT_DOMAIN, "This " + "pool uses the following feature(s) not " + "supported by this system:\n")); + zpool_print_unsup_feat(nv); + if (nvlist_exists(nvinfo, + ZPOOL_CONFIG_CAN_RDONLY)) { + (void) printf(dgettext(TEXT_DOMAIN, + "All unsupported features are only " + "required for writing to the pool." + "\nThe pool can be imported using " + "'-o readonly=on'.\n")); + } + } + /* + * Unsupported version. 
+ */ + (void) zfs_error(hdl, EZFS_BADVERSION, desc); + break; + + case EREMOTEIO: + if (nv != NULL && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0) { + char *hostname = ""; + uint64_t hostid = 0; + mmp_state_t mmp_state; + + mmp_state = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_STATE); + + if (nvlist_exists(nvinfo, + ZPOOL_CONFIG_MMP_HOSTNAME)) + hostname = fnvlist_lookup_string(nvinfo, + ZPOOL_CONFIG_MMP_HOSTNAME); + + if (nvlist_exists(nvinfo, + ZPOOL_CONFIG_MMP_HOSTID)) + hostid = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_HOSTID); + + if (mmp_state == MMP_STATE_ACTIVE) { + (void) snprintf(aux, sizeof (aux), + dgettext(TEXT_DOMAIN, "pool is imp" + "orted on host '%s' (hostid=%lx).\n" + "Export the pool on the other " + "system, then run 'zpool import'."), + hostname, (unsigned long) hostid); + } else if (mmp_state == MMP_STATE_NO_HOSTID) { + (void) snprintf(aux, sizeof (aux), + dgettext(TEXT_DOMAIN, "pool has " + "the multihost property on and " + "the\nsystem's hostid is not set. " + "Set a unique system hostid with " + "the zgenhostid(8) command.\n")); + } + + (void) zfs_error_aux(hdl, aux); + } + (void) zfs_error(hdl, EZFS_ACTIVE_POOL, desc); + break; + + case EINVAL: + (void) zfs_error(hdl, EZFS_INVALCONFIG, desc); + break; + + case EROFS: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is read only")); + (void) zfs_error(hdl, EZFS_BADDEV, desc); + break; + + case ENXIO: + if (nv && nvlist_lookup_nvlist(nv, + ZPOOL_CONFIG_LOAD_INFO, &nvinfo) == 0 && + nvlist_lookup_nvlist(nvinfo, + ZPOOL_CONFIG_MISSING_DEVICES, &missing) == 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "The devices below are missing or " + "corrupted, use '-m' to import the pool " + "anyway:\n")); + print_vdev_tree(hdl, NULL, missing, 2); + (void) printf("\n"); + } + (void) zpool_standard_error(hdl, error, desc); + break; + + case EEXIST: + (void) zpool_standard_error(hdl, error, desc); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices are already in use\n")); + (void) zfs_error(hdl, EZFS_BADDEV, desc); + break; + case ENAMETOOLONG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new name of at least one dataset is longer than " + "the maximum allowable length")); + (void) zfs_error(hdl, EZFS_NAMETOOLONG, desc); + break; + default: + (void) zpool_standard_error(hdl, error, desc); + zpool_explain_recover(hdl, + newname ? origname : thename, -error, nv); + break; + } + + nvlist_free(nv); + ret = -1; + } else { + zpool_handle_t *zhp; + + /* + * This should never fail, but play it safe anyway. + */ + if (zpool_open_silent(hdl, thename, &zhp) != 0) + ret = -1; + else if (zhp != NULL) + zpool_close(zhp); + if (policy.zlp_rewind & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), nv); + } + nvlist_free(nv); + return (0); + } + + return (ret); +} + +/* + * Translate vdev names to guids. If a vdev_path is determined to be + * unsuitable then a vd_errlist is allocated and the vdev path and errno + * are added to it. 
+ */ +static int +zpool_translate_vdev_guids(zpool_handle_t *zhp, nvlist_t *vds, + nvlist_t *vdev_guids, nvlist_t *guids_to_paths, nvlist_t **vd_errlist) +{ + nvlist_t *errlist = NULL; + int error = 0; + + for (nvpair_t *elem = nvlist_next_nvpair(vds, NULL); elem != NULL; + elem = nvlist_next_nvpair(vds, elem)) { + boolean_t spare, cache; + + char *vd_path = nvpair_name(elem); + nvlist_t *tgt = zpool_find_vdev(zhp, vd_path, &spare, &cache, + NULL); + + if ((tgt == NULL) || cache || spare) { + if (errlist == NULL) { + errlist = fnvlist_alloc(); + error = EINVAL; + } + + uint64_t err = (tgt == NULL) ? EZFS_NODEVICE : + (spare ? EZFS_ISSPARE : EZFS_ISL2CACHE); + fnvlist_add_int64(errlist, vd_path, err); + continue; + } + + uint64_t guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + fnvlist_add_uint64(vdev_guids, vd_path, guid); + + char msg[MAXNAMELEN]; + (void) snprintf(msg, sizeof (msg), "%llu", (u_longlong_t)guid); + fnvlist_add_string(guids_to_paths, msg, vd_path); + } + + if (error != 0) { + verify(errlist != NULL); + if (vd_errlist != NULL) + *vd_errlist = errlist; + else + fnvlist_free(errlist); + } + + return (error); +} + +static int +xlate_init_err(int err) +{ + switch (err) { + case ENODEV: + return (EZFS_NODEVICE); + case EINVAL: + case EROFS: + return (EZFS_BADDEV); + case EBUSY: + return (EZFS_INITIALIZING); + case ESRCH: + return (EZFS_NO_INITIALIZE); + } + return (err); +} + +/* + * Begin, suspend, or cancel the initialization (initializing of all free + * blocks) for the given vdevs in the given pool. + */ +static int +zpool_initialize_impl(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds, boolean_t wait) +{ + int err; + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *guids_to_paths = fnvlist_alloc(); + nvlist_t *vd_errlist = NULL; + nvlist_t *errlist; + nvpair_t *elem; + + err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, + guids_to_paths, &vd_errlist); + + if (err != 0) { + verify(vd_errlist != NULL); + goto list_errors; + } + + err = lzc_initialize(zhp->zpool_name, cmd_type, + vdev_guids, &errlist); + + if (err != 0) { + if (errlist != NULL) { + vd_errlist = fnvlist_lookup_nvlist(errlist, + ZPOOL_INITIALIZE_VDEVS); + goto list_errors; + } + (void) zpool_standard_error(zhp->zpool_hdl, err, + dgettext(TEXT_DOMAIN, "operation failed")); + goto out; + } + + if (wait) { + for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; + elem = nvlist_next_nvpair(vdev_guids, elem)) { + + uint64_t guid = fnvpair_value_uint64(elem); + + err = lzc_wait_tag(zhp->zpool_name, + ZPOOL_WAIT_INITIALIZE, guid, NULL); + if (err != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, + err, dgettext(TEXT_DOMAIN, "error " + "waiting for '%s' to initialize"), + nvpair_name(elem)); + + goto out; + } + } + } + goto out; + +list_errors: + for (elem = nvlist_next_nvpair(vd_errlist, NULL); elem != NULL; + elem = nvlist_next_nvpair(vd_errlist, elem)) { + int64_t vd_error = xlate_init_err(fnvpair_value_int64(elem)); + char *path; + + if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), + &path) != 0) + path = nvpair_name(elem); + + (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, + "cannot initialize '%s'", path); + } + +out: + fnvlist_free(vdev_guids); + fnvlist_free(guids_to_paths); + + if (vd_errlist != NULL) + fnvlist_free(vd_errlist); + + return (err == 0 ? 
0 : -1); +} + +int +zpool_initialize(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds) +{ + return (zpool_initialize_impl(zhp, cmd_type, vds, B_FALSE)); +} + +int +zpool_initialize_wait(zpool_handle_t *zhp, pool_initialize_func_t cmd_type, + nvlist_t *vds) +{ + return (zpool_initialize_impl(zhp, cmd_type, vds, B_TRUE)); +} + +static int +xlate_trim_err(int err) +{ + switch (err) { + case ENODEV: + return (EZFS_NODEVICE); + case EINVAL: + case EROFS: + return (EZFS_BADDEV); + case EBUSY: + return (EZFS_TRIMMING); + case ESRCH: + return (EZFS_NO_TRIM); + case EOPNOTSUPP: + return (EZFS_TRIM_NOTSUP); + } + return (err); +} + +static int +zpool_trim_wait(zpool_handle_t *zhp, nvlist_t *vdev_guids) +{ + int err; + nvpair_t *elem; + + for (elem = nvlist_next_nvpair(vdev_guids, NULL); elem != NULL; + elem = nvlist_next_nvpair(vdev_guids, elem)) { + + uint64_t guid = fnvpair_value_uint64(elem); + + err = lzc_wait_tag(zhp->zpool_name, + ZPOOL_WAIT_TRIM, guid, NULL); + if (err != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, + err, dgettext(TEXT_DOMAIN, "error " + "waiting to trim '%s'"), nvpair_name(elem)); + + return (err); + } + } + return (0); +} + +/* + * Check errlist and report any errors, omitting ones which should be + * suppressed. Returns B_TRUE if any errors were reported. + */ +static boolean_t +check_trim_errs(zpool_handle_t *zhp, trimflags_t *trim_flags, + nvlist_t *guids_to_paths, nvlist_t *vds, nvlist_t *errlist) +{ + nvpair_t *elem; + boolean_t reported_errs = B_FALSE; + int num_vds = 0; + int num_suppressed_errs = 0; + + for (elem = nvlist_next_nvpair(vds, NULL); + elem != NULL; elem = nvlist_next_nvpair(vds, elem)) { + num_vds++; + } + + for (elem = nvlist_next_nvpair(errlist, NULL); + elem != NULL; elem = nvlist_next_nvpair(errlist, elem)) { + int64_t vd_error = xlate_trim_err(fnvpair_value_int64(elem)); + char *path; + + /* + * If only the pool was specified, and it was not a secure + * trim then suppress warnings for individual vdevs which + * do not support trimming. + */ + if (vd_error == EZFS_TRIM_NOTSUP && + trim_flags->fullpool && + !trim_flags->secure) { + num_suppressed_errs++; + continue; + } + + reported_errs = B_TRUE; + if (nvlist_lookup_string(guids_to_paths, nvpair_name(elem), + &path) != 0) + path = nvpair_name(elem); + + (void) zfs_error_fmt(zhp->zpool_hdl, vd_error, + "cannot trim '%s'", path); + } + + if (num_suppressed_errs == num_vds) { + (void) zfs_error_aux(zhp->zpool_hdl, dgettext(TEXT_DOMAIN, + "no devices in pool support trim operations")); + (void) (zfs_error(zhp->zpool_hdl, EZFS_TRIM_NOTSUP, + dgettext(TEXT_DOMAIN, "cannot trim"))); + reported_errs = B_TRUE; + } + + return (reported_errs); +} + +/* + * Begin, suspend, or cancel the TRIM (discarding of all free blocks) for + * the given vdevs in the given pool. 
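A caller-side sketch for the TRIM entry point defined next. The vds nvlist carries one boolean-marker entry per leaf vdev name; the zpool command fills in every leaf itself when only a pool is named and sets fullpool, which is what lets check_trim_errs() above quietly skip devices that cannot trim ("/dev/sda1" is a hypothetical leaf path):

        /* Sketch: start a manual, non-secure TRIM on the named leaves. */
        trimflags_t flags = { .fullpool = B_TRUE, .secure = B_FALSE,
            .rate = 0, .wait = B_FALSE };
        nvlist_t *vds = fnvlist_alloc();

        fnvlist_add_boolean(vds, "/dev/sda1");  /* one entry per leaf vdev */
        if (zpool_trim(zhp, POOL_TRIM_START, vds, &flags) != 0)
                (void) fprintf(stderr, "%s\n", libzfs_error_description(hdl));
        fnvlist_free(vds);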
+ */ +int +zpool_trim(zpool_handle_t *zhp, pool_trim_func_t cmd_type, nvlist_t *vds, + trimflags_t *trim_flags) +{ + int err; + int retval = 0; + + nvlist_t *vdev_guids = fnvlist_alloc(); + nvlist_t *guids_to_paths = fnvlist_alloc(); + nvlist_t *errlist = NULL; + + err = zpool_translate_vdev_guids(zhp, vds, vdev_guids, + guids_to_paths, &errlist); + if (err != 0) { + check_trim_errs(zhp, trim_flags, guids_to_paths, vds, errlist); + retval = -1; + goto out; + } + + err = lzc_trim(zhp->zpool_name, cmd_type, trim_flags->rate, + trim_flags->secure, vdev_guids, &errlist); + if (err != 0) { + nvlist_t *vd_errlist; + if (errlist != NULL && nvlist_lookup_nvlist(errlist, + ZPOOL_TRIM_VDEVS, &vd_errlist) == 0) { + if (check_trim_errs(zhp, trim_flags, guids_to_paths, + vds, vd_errlist)) { + retval = -1; + goto out; + } + } else { + char msg[1024]; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "operation failed")); + zpool_standard_error(zhp->zpool_hdl, err, msg); + retval = -1; + goto out; + } + } + + + if (trim_flags->wait) + retval = zpool_trim_wait(zhp, vdev_guids); + +out: + if (errlist != NULL) + fnvlist_free(errlist); + fnvlist_free(vdev_guids); + fnvlist_free(guids_to_paths); + return (retval); +} + +/* + * Scan the pool. + */ +int +zpool_scan(zpool_handle_t *zhp, pool_scan_func_t func, pool_scrub_cmd_t cmd) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + int err; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = func; + zc.zc_flags = cmd; + + if (zfs_ioctl(hdl, ZFS_IOC_POOL_SCAN, &zc) == 0) + return (0); + + err = errno; + + /* ECANCELED on a scrub means we resumed a paused scrub */ + if (err == ECANCELED && func == POOL_SCAN_SCRUB && + cmd == POOL_SCRUB_NORMAL) + return (0); + + if (err == ENOENT && func != POOL_SCAN_NONE && cmd == POOL_SCRUB_NORMAL) + return (0); + + if (func == POOL_SCAN_SCRUB) { + if (cmd == POOL_SCRUB_PAUSE) { + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot pause scrubbing %s"), zc.zc_name); + } else { + assert(cmd == POOL_SCRUB_NORMAL); + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot scrub %s"), zc.zc_name); + } + } else if (func == POOL_SCAN_RESILVER) { + assert(cmd == POOL_SCRUB_NORMAL); + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot restart resilver on %s"), zc.zc_name); + } else if (func == POOL_SCAN_NONE) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel scrubbing %s"), + zc.zc_name); + } else { + assert(!"unexpected result"); + } + + if (err == EBUSY) { + nvlist_t *nvroot; + pool_scan_stat_t *ps = NULL; + uint_t psc; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + (void) nvlist_lookup_uint64_array(nvroot, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); + if (ps && ps->pss_func == POOL_SCAN_SCRUB && + ps->pss_state == DSS_SCANNING) { + if (cmd == POOL_SCRUB_PAUSE) + return (zfs_error(hdl, EZFS_SCRUB_PAUSED, msg)); + else + return (zfs_error(hdl, EZFS_SCRUBBING, msg)); + } else { + return (zfs_error(hdl, EZFS_RESILVERING, msg)); + } + } else if (err == ENOENT) { + return (zfs_error(hdl, EZFS_NO_SCRUB, msg)); + } else if (err == ENOTSUP && func == POOL_SCAN_RESILVER) { + return (zfs_error(hdl, EZFS_NO_RESILVER_DEFER, msg)); + } else { + return (zpool_standard_error(hdl, err, msg)); + } +} + +/* + * Find a vdev that matches the search criteria specified. We use the + * the nvpair name to determine how we should look for the device. 
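zpool_scan() above folds two benign cases into success: ECANCELED when a normal scrub request resumes a paused scrub, and ENOENT when a requested restart has nothing to do. Simple callers therefore need no special-casing (sketch):

        /* Sketch: start (or resume) a scrub. */
        if (zpool_scan(zhp, POOL_SCAN_SCRUB, POOL_SCRUB_NORMAL) != 0)
                (void) fprintf(stderr, "%s\n", libzfs_error_description(hdl));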
+ * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL + * spare; but FALSE if its an INUSE spare. + */ +static nvlist_t * +vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, + boolean_t *l2cache, boolean_t *log) +{ + uint_t c, children; + nvlist_t **child; + nvlist_t *ret; + uint64_t is_log; + char *srchkey; + nvpair_t *pair = nvlist_next_nvpair(search, NULL); + + /* Nothing to look for */ + if (search == NULL || pair == NULL) + return (NULL); + + /* Obtain the key we will use to search */ + srchkey = nvpair_name(pair); + + switch (nvpair_type(pair)) { + case DATA_TYPE_UINT64: + if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { + uint64_t srchval, theguid; + + verify(nvpair_value_uint64(pair, &srchval) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &theguid) == 0); + if (theguid == srchval) + return (nv); + } + break; + + case DATA_TYPE_STRING: { + char *srchval, *val; + + verify(nvpair_value_string(pair, &srchval) == 0); + if (nvlist_lookup_string(nv, srchkey, &val) != 0) + break; + + /* + * Search for the requested value. Special cases: + * + * - ZPOOL_CONFIG_PATH for whole disk entries. These end in + * "-part1", or "p1". The suffix is hidden from the user, + * but included in the string, so this matches around it. + * - ZPOOL_CONFIG_PATH for short names zfs_strcmp_shortname() + * is used to check all possible expanded paths. + * - looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). + * + * Otherwise, all other searches are simple string compares. + */ + if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0) { + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (zfs_strcmp_pathname(srchval, val, wholedisk) == 0) + return (nv); + + } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { + char *type, *idx, *end, *p; + uint64_t id, vdev_id; + + /* + * Determine our vdev type, keeping in mind + * that the srchval is composed of a type and + * vdev id pair (i.e. mirror-4). + */ + if ((type = strdup(srchval)) == NULL) + return (NULL); + + if ((p = strrchr(type, '-')) == NULL) { + free(type); + break; + } + idx = p + 1; + *p = '\0'; + + /* + * If the types don't match then keep looking. + */ + if (strncmp(val, type, strlen(val)) != 0) { + free(type); + break; + } + + verify(zpool_vdev_is_interior(type)); + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &id) == 0); + + errno = 0; + vdev_id = strtoull(idx, &end, 10); + + free(type); + if (errno != 0) + return (NULL); + + /* + * Now verify that we have the correct vdev id. + */ + if (vdev_id == id) + return (nv); + } + + /* + * Common case + */ + if (strcmp(srchval, val) == 0) + return (nv); + break; + } + + default: + break; + } + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + return (NULL); + + for (c = 0; c < children; c++) { + if ((ret = vdev_to_nvlist_iter(child[c], search, + avail_spare, l2cache, NULL)) != NULL) { + /* + * The 'is_log' value is only set for the toplevel + * vdev, not the leaf vdevs. So we always lookup the + * log device from the root of the vdev tree (where + * 'log' is non-NULL). 
+ */
+ if (log != NULL &&
+ nvlist_lookup_uint64(child[c],
+ ZPOOL_CONFIG_IS_LOG, &is_log) == 0 &&
+ is_log) {
+ *log = B_TRUE;
+ }
+ return (ret);
+ }
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if ((ret = vdev_to_nvlist_iter(child[c], search,
+ avail_spare, l2cache, NULL)) != NULL) {
+ *avail_spare = B_TRUE;
+ return (ret);
+ }
+ }
+ }
+
+ if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE,
+ &child, &children) == 0) {
+ for (c = 0; c < children; c++) {
+ if ((ret = vdev_to_nvlist_iter(child[c], search,
+ avail_spare, l2cache, NULL)) != NULL) {
+ *l2cache = B_TRUE;
+ return (ret);
+ }
+ }
+ }
+
+ return (NULL);
+}
+
+/*
+ * Given a physical path or guid, find the associated vdev.
+ */
+nvlist_t *
+zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath,
+ boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log)
+{
+ nvlist_t *search, *nvroot, *ret;
+ uint64_t guid;
+ char *end;
+
+ verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ guid = strtoull(ppath, &end, 0);
+ if (guid != 0 && *end == '\0') {
+ verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+ } else {
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH,
+ ppath) == 0);
+ }
+
+ verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ *avail_spare = B_FALSE;
+ *l2cache = B_FALSE;
+ if (log != NULL)
+ *log = B_FALSE;
+ ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+ nvlist_free(search);
+
+ return (ret);
+}
+
+/*
+ * Determine if we have an "interior" top-level vdev (i.e. mirror/raidz).
+ */
+static boolean_t
+zpool_vdev_is_interior(const char *name)
+{
+ if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 ||
+ strncmp(name, VDEV_TYPE_SPARE, strlen(VDEV_TYPE_SPARE)) == 0 ||
+ strncmp(name,
+ VDEV_TYPE_REPLACING, strlen(VDEV_TYPE_REPLACING)) == 0 ||
+ strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0)
+ return (B_TRUE);
+ return (B_FALSE);
+}
+
+nvlist_t *
+zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare,
+ boolean_t *l2cache, boolean_t *log)
+{
+ char *end;
+ nvlist_t *nvroot, *search, *ret;
+ uint64_t guid;
+
+ verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ guid = strtoull(path, &end, 0);
+ if (guid != 0 && *end == '\0') {
+ verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0);
+ } else if (zpool_vdev_is_interior(path)) {
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0);
+ } else {
+ verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0);
+ }
+
+ verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE,
+ &nvroot) == 0);
+
+ *avail_spare = B_FALSE;
+ *l2cache = B_FALSE;
+ if (log != NULL)
+ *log = B_FALSE;
+ ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log);
+ nvlist_free(search);
+
+ return (ret);
+}
+
+static int
+vdev_is_online(nvlist_t *nv)
+{
+ uint64_t ival;
+
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 ||
+ nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 ||
+ nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0)
+ return (0);
+
+ return (1);
+}
+
+/*
+ * Helper function for zpool_get_physpaths().
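+ * Appends the vdev's ZPOOL_CONFIG_PHYS_PATH to 'physpath' (space-separated
+ * when more than one path accumulates) and advances *bytes_written;
+ * returns EZFS_NOSPC when the buffer cannot hold the new entry.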
+ */ +static int +vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, + size_t *bytes_written) +{ + size_t bytes_left, pos, rsz; + char *tmppath; + const char *format; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, + &tmppath) != 0) + return (EZFS_NODEVICE); + + pos = *bytes_written; + bytes_left = physpath_size - pos; + format = (pos == 0) ? "%s" : " %s"; + + rsz = snprintf(physpath + pos, bytes_left, format, tmppath); + *bytes_written += rsz; + + if (rsz >= bytes_left) { + /* if physpath was not copied properly, clear it */ + if (bytes_left != 0) { + physpath[pos] = 0; + } + return (EZFS_NOSPC); + } + return (0); +} + +static int +vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, + size_t *rsz, boolean_t is_spare) +{ + char *type; + int ret; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (EZFS_INVALCONFIG); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + /* + * An active spare device has ZPOOL_CONFIG_IS_SPARE set. + * For a spare vdev, we only want to boot from the active + * spare device. + */ + if (is_spare) { + uint64_t spare = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare); + if (!spare) + return (EZFS_INVALCONFIG); + } + + if (vdev_is_online(nv)) { + if ((ret = vdev_get_one_physpath(nv, physpath, + phypath_size, rsz)) != 0) + return (ret); + } + } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || + strcmp(type, VDEV_TYPE_RAIDZ) == 0 || + strcmp(type, VDEV_TYPE_REPLACING) == 0 || + (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { + nvlist_t **child; + uint_t count; + int i, ret; + + if (nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) + return (EZFS_INVALCONFIG); + + for (i = 0; i < count; i++) { + ret = vdev_get_physpaths(child[i], physpath, + phypath_size, rsz, is_spare); + if (ret == EZFS_NOSPC) + return (ret); + } + } + + return (EZFS_POOL_INVALARG); +} + +/* + * Get phys_path for a root pool config. + * Return 0 on success; non-zero on failure. + */ +static int +zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) +{ + size_t rsz; + nvlist_t *vdev_root; + nvlist_t **child; + uint_t count; + char *type; + + rsz = 0; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &vdev_root) != 0) + return (EZFS_INVALCONFIG); + + if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || + nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, + &child, &count) != 0) + return (EZFS_INVALCONFIG); + + /* + * root pool can only have a single top-level vdev. + */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1) + return (EZFS_POOL_INVALARG); + + (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, + B_FALSE); + + /* No online devices */ + if (rsz == 0) + return (EZFS_NODEVICE); + + return (0); +} + +/* + * Get phys_path for a root pool + * Return 0 on success; non-zero on failure. + */ +int +zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) +{ + return (zpool_get_config_physpath(zhp->zpool_config, physpath, + phypath_size)); +} + +/* + * Convert a vdev path to a GUID. Returns GUID or 0 on error. + * + * If is_spare, is_l2cache, or is_log is non-NULL, then store within it + * if the VDEV is a spare, l2cache, or log device. If they're NULL then + * ignore them. 
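+ *
+ * Illustrative use via the zpool_vdev_path_to_guid() wrapper below, with
+ * a hypothetical device name:
+ *
+ *    uint64_t guid = zpool_vdev_path_to_guid(zhp, "sda");
+ *
+ * A return value of 0 means no matching vdev was found; since the lookup
+ * goes through zpool_find_vdev(), a full path or a GUID string works too.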
+ */ +static uint64_t +zpool_vdev_path_to_guid_impl(zpool_handle_t *zhp, const char *path, + boolean_t *is_spare, boolean_t *is_l2cache, boolean_t *is_log) +{ + uint64_t guid; + boolean_t spare = B_FALSE, l2cache = B_FALSE, log = B_FALSE; + nvlist_t *tgt; + + if ((tgt = zpool_find_vdev(zhp, path, &spare, &l2cache, + &log)) == NULL) + return (0); + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &guid) == 0); + if (is_spare != NULL) + *is_spare = spare; + if (is_l2cache != NULL) + *is_l2cache = l2cache; + if (is_log != NULL) + *is_log = log; + + return (guid); +} + +/* Convert a vdev path to a GUID. Returns GUID or 0 on error. */ +uint64_t +zpool_vdev_path_to_guid(zpool_handle_t *zhp, const char *path) +{ + return (zpool_vdev_path_to_guid_impl(zhp, path, NULL, NULL, NULL)); +} + +/* + * Bring the specified vdev online. The 'flags' parameter is a set of the + * ZFS_ONLINE_* flags. + */ +int +zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, + vdev_state_t *newstate) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + char *pathname; + nvlist_t *tgt; + boolean_t avail_spare, l2cache, islog; + libzfs_handle_t *hdl = zhp->zpool_hdl; + int error; + + if (flags & ZFS_ONLINE_EXPAND) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot expand %s"), path); + } else { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot online %s"), path); + } + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + &islog)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); + + if (avail_spare) + return (zfs_error(hdl, EZFS_ISSPARE, msg)); + + if ((flags & ZFS_ONLINE_EXPAND || + zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) && + nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, &pathname) == 0) { + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + + /* + * XXX - L2ARC 1.0 devices can't support expansion. + */ + if (l2cache) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot expand cache devices")); + return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); + } + + if (wholedisk) { + const char *fullpath = path; + char buf[MAXPATHLEN]; + + if (path[0] != '/') { + error = zfs_resolve_shortname(path, buf, + sizeof (buf)); + if (error != 0) + return (zfs_error(hdl, EZFS_NODEVICE, + msg)); + + fullpath = buf; + } + + error = zpool_relabel_disk(hdl, fullpath, msg); + if (error != 0) + return (error); + } + } + + zc.zc_cookie = VDEV_STATE_ONLINE; + zc.zc_obj = flags; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { + if (errno == EINVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " + "from this pool into a new one. 
Use '%s' " + "instead"), "zpool detach"); + return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); + } + return (zpool_standard_error(hdl, errno, msg)); + } + + *newstate = zc.zc_cookie; + return (0); +} + +/* + * Take the specified vdev offline + */ +int +zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot offline %s"), path); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + NULL)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); + + if (avail_spare) + return (zfs_error(hdl, EZFS_ISSPARE, msg)); + + zc.zc_cookie = VDEV_STATE_OFFLINE; + zc.zc_obj = istmp ? ZFS_OFFLINE_TEMPORARY : 0; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + switch (errno) { + case EBUSY: + + /* + * There are no other replicas of this device. + */ + return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); + + case EEXIST: + /* + * The log device has unplayed logs + */ + return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); + + default: + return (zpool_standard_error(hdl, errno, msg)); + } +} + +/* + * Mark the given vdev faulted. + */ +int +zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot fault %llu"), (u_longlong_t)guid); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_guid = guid; + zc.zc_cookie = VDEV_STATE_FAULTED; + zc.zc_obj = aux; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + switch (errno) { + case EBUSY: + + /* + * There are no other replicas of this device. + */ + return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); + + default: + return (zpool_standard_error(hdl, errno, msg)); + } + +} + +/* + * Mark the given vdev degraded. + */ +int +zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_guid = guid; + zc.zc_cookie = VDEV_STATE_DEGRADED; + zc.zc_obj = aux; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +/* + * Returns TRUE if the given nvlist is a vdev that was originally swapped in as + * a hot spare. + */ +static boolean_t +is_replacing_spare(nvlist_t *search, nvlist_t *tgt, int which) +{ + nvlist_t **child; + uint_t c, children; + char *type; + + if (nvlist_lookup_nvlist_array(search, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + verify(nvlist_lookup_string(search, ZPOOL_CONFIG_TYPE, + &type) == 0); + + if (strcmp(type, VDEV_TYPE_SPARE) == 0 && + children == 2 && child[which] == tgt) + return (B_TRUE); + + for (c = 0; c < children; c++) + if (is_replacing_spare(child[c], tgt, which)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Attach new_disk (fully described by nvroot) to old_disk. + * If 'replacing' is specified, the new disk will replace the old one. 
+ */ +int +zpool_vdev_attach(zpool_handle_t *zhp, const char *old_disk, + const char *new_disk, nvlist_t *nvroot, int replacing, boolean_t rebuild) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + int ret; + nvlist_t *tgt; + boolean_t avail_spare, l2cache, islog; + uint64_t val; + char *newname; + nvlist_t **child; + uint_t children; + nvlist_t *config_root; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + if (replacing) + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot replace %s with %s"), old_disk, new_disk); + else + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot attach %s to %s"), new_disk, old_disk); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if ((tgt = zpool_find_vdev(zhp, old_disk, &avail_spare, &l2cache, + &islog)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + if (avail_spare) + return (zfs_error(hdl, EZFS_ISSPARE, msg)); + + if (l2cache) + return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); + zc.zc_cookie = replacing; + zc.zc_simple = rebuild; + + if (rebuild && + zfeature_lookup_guid("org.openzfs:device_rebuild", NULL) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "the loaded zfs module doesn't support device rebuilds")); + return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); + } + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0 || children != 1) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new device must be a single disk")); + return (zfs_error(hdl, EZFS_INVALCONFIG, msg)); + } + + verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), + ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); + + if ((newname = zpool_vdev_name(NULL, NULL, child[0], 0)) == NULL) + return (-1); + + /* + * If the target is a hot spare that has been swapped in, we can only + * replace it with another hot spare. + */ + if (replacing && + nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_IS_SPARE, &val) == 0 && + (zpool_find_vdev(zhp, newname, &avail_spare, &l2cache, + NULL) == NULL || !avail_spare) && + is_replacing_spare(config_root, tgt, 1)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "can only be replaced by another hot spare")); + free(newname); + return (zfs_error(hdl, EZFS_BADTARGET, msg)); + } + + free(newname); + + if (zcmd_write_conf_nvlist(hdl, &zc, nvroot) != 0) + return (-1); + + ret = zfs_ioctl(hdl, ZFS_IOC_VDEV_ATTACH, &zc); + + zcmd_free_nvlists(&zc); + + if (ret == 0) + return (0); + + switch (errno) { + case ENOTSUP: + /* + * Can't attach to or replace this type of vdev. + */ + if (replacing) { + uint64_t version = zpool_get_prop_int(zhp, + ZPOOL_PROP_VERSION, NULL); + + if (islog) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot replace a log with a spare")); + } else if (rebuild) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "only mirror vdevs support sequential " + "reconstruction")); + } else if (version >= SPA_VERSION_MULTI_REPLACE) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "already in replacing/spare config; wait " + "for completion or use 'zpool detach'")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot replace a replacing device")); + } + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "can only attach to mirrors and top-level " + "disks")); + } + (void) zfs_error(hdl, EZFS_BADTARGET, msg); + break; + + case EINVAL: + /* + * The new device must be a single disk. 
+ */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new device must be a single disk")); + (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "%s is busy, " + "or device removal is in progress"), + new_disk); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case EOVERFLOW: + /* + * The new device is too small. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "device is too small")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case EDOM: + /* + * The new device has a different optimal sector size. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "new device has a different optimal sector size; use the " + "option '-o ashift=N' to override the optimal size")); + (void) zfs_error(hdl, EZFS_BADDEV, msg); + break; + + case ENAMETOOLONG: + /* + * The resulting top-level vdev spec won't fit in the label. + */ + (void) zfs_error(hdl, EZFS_DEVOVERFLOW, msg); + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + + return (-1); +} + +/* + * Detach the specified device. + */ +int +zpool_vdev_detach(zpool_handle_t *zhp, const char *path) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot detach %s"), path); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + NULL)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + if (avail_spare) + return (zfs_error(hdl, EZFS_ISSPARE, msg)); + + if (l2cache) + return (zfs_error(hdl, EZFS_ISL2CACHE, msg)); + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_DETACH, &zc) == 0) + return (0); + + switch (errno) { + + case ENOTSUP: + /* + * Can't detach from this type of vdev. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "only " + "applicable to mirror and replacing vdevs")); + (void) zfs_error(hdl, EZFS_BADTARGET, msg); + break; + + case EBUSY: + /* + * There are no other replicas of this device. + */ + (void) zfs_error(hdl, EZFS_NOREPLICAS, msg); + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + + return (-1); +} + +/* + * Find a mirror vdev in the source nvlist. + * + * The mchild array contains a list of disks in one of the top-level mirrors + * of the source pool. The schild array contains a list of disks that the + * user specified on the command line. We loop over the mchild array to + * see if any entry in the schild array matches. + * + * If a disk in the mchild array is found in the schild array, we return + * the index of that entry. Otherwise we return -1. + */ +static int +find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, + nvlist_t **schild, uint_t schildren) +{ + uint_t mc; + + for (mc = 0; mc < mchildren; mc++) { + uint_t sc; + char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, + mchild[mc], 0); + + for (sc = 0; sc < schildren; sc++) { + char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, + schild[sc], 0); + boolean_t result = (strcmp(mpath, spath) == 0); + + free(spath); + if (result) { + free(mpath); + return (mc); + } + } + + free(mpath); + } + + return (-1); +} + +/* + * Split a mirror pool. If newroot points to null, then a new nvlist + * is generated and it is the responsibility of the caller to free it. 
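+ *
+ * Minimal sketch of a dry-run split that lets this function generate the
+ * new configuration ("newpool" is a hypothetical name):
+ *
+ *    nvlist_t *newroot = NULL;
+ *    splitflags_t flags = { 0 };
+ *
+ *    flags.dryrun = 1;
+ *    if (zpool_vdev_split(zhp, "newpool", &newroot, NULL, flags) == 0)
+ *        nvlist_free(newroot);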
+ */ +int +zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, + nvlist_t *props, splitflags_t flags) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; + nvlist_t **varray = NULL, *zc_props = NULL; + uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; + libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t vers, readonly = B_FALSE; + boolean_t freelist = B_FALSE, memory_err = B_TRUE; + int retval = 0; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); + + if (!zpool_name_valid(hdl, B_FALSE, newname)) + return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("Internal error: unable to " + "retrieve pool configuration\n")); + return (-1); + } + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) + == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); + + if (props) { + prop_flags_t flags = { .create = B_FALSE, .import = B_TRUE }; + if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, + props, vers, flags, msg)) == NULL) + return (-1); + (void) nvlist_lookup_uint64(zc_props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property %s can only be set at import time"), + zpool_prop_to_name(ZPOOL_PROP_READONLY)); + return (-1); + } + } + + if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool is missing vdev tree")); + nvlist_free(zc_props); + return (-1); + } + + varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); + vcount = 0; + + if (*newroot == NULL || + nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, + &newchild, &newchildren) != 0) + newchildren = 0; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE, is_hole = B_FALSE; + char *type; + nvlist_t **mchild, *vdev; + uint_t mchildren; + int entry; + + /* + * Unlike cache & spares, slogs are stored in the + * ZPOOL_CONFIG_CHILDREN array. We filter them out here. + */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + if (is_log || is_hole) { + /* + * Create a hole vdev and put it in the config. + */ + if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) + goto out; + if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0) + goto out; + if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, + 1) != 0) + goto out; + if (lastlog == 0) + lastlog = vcount; + varray[vcount++] = vdev; + continue; + } + lastlog = 0; + verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) + == 0); + + if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { + vdev = child[c]; + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + continue; + } else if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool must be composed only of mirrors\n")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + verify(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + + /* find or add an entry for this top-level vdev */ + if (newchildren > 0 && + (entry = find_vdev_entry(zhp, mchild, mchildren, + newchild, newchildren)) >= 0) { + /* We found a disk that the user specified. 
*/ + vdev = mchild[entry]; + ++found; + } else { + /* User didn't specify a disk for this vdev. */ + vdev = mchild[mchildren - 1]; + } + + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + } + + /* did we find every disk the user specified? */ + if (found != newchildren) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " + "include at most one disk from each mirror")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + /* Prepare the nvlist for populating. */ + if (*newroot == NULL) { + if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) + goto out; + freelist = B_TRUE; + if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0) + goto out; + } else { + verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); + } + + /* Add all the children we found */ + if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, + lastlog == 0 ? vcount : lastlog) != 0) + goto out; + + /* + * If we're just doing a dry run, exit now with success. + */ + if (flags.dryrun) { + memory_err = B_FALSE; + freelist = B_FALSE; + goto out; + } + + /* now build up the config list & call the ioctl */ + if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) + goto out; + + if (nvlist_add_nvlist(newconfig, + ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || + nvlist_add_string(newconfig, + ZPOOL_CONFIG_POOL_NAME, newname) != 0 || + nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) + goto out; + + /* + * The new pool is automatically part of the namespace unless we + * explicitly export it. + */ + if (!flags.import) + zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); + if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) + goto out; + if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) + goto out; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { + retval = zpool_standard_error(hdl, errno, msg); + goto out; + } + + freelist = B_FALSE; + memory_err = B_FALSE; + +out: + if (varray != NULL) { + int v; + + for (v = 0; v < vcount; v++) + nvlist_free(varray[v]); + free(varray); + } + zcmd_free_nvlists(&zc); + nvlist_free(zc_props); + nvlist_free(newconfig); + if (freelist) { + nvlist_free(*newroot); + *newroot = NULL; + } + + if (retval != 0) + return (retval); + + if (memory_err) + return (no_memory(hdl)); + + return (0); +} + +/* + * Remove the given device. 
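+ *
+ * The device is looked up with zpool_find_vdev(), so 'path' may be a short
+ * name, a full path, a vdev GUID, or an interior vdev name such as the
+ * hypothetical "mirror-1":
+ *
+ *    if (zpool_vdev_remove(zhp, "mirror-1") != 0)
+ *        return (-1);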
+ */ +int +zpool_vdev_remove(zpool_handle_t *zhp, const char *path) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache, islog; + libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t version; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot remove %s"), path); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + &islog)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if (islog && version < SPA_VERSION_HOLES) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to support log removal")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + + zc.zc_guid = fnvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID); + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) + return (0); + + switch (errno) { + + case EINVAL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid config; all top-level vdevs must " + "have the same sector size and not be raidz.")); + (void) zfs_error(hdl, EZFS_INVALCONFIG, msg); + break; + + case EBUSY: + if (islog) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Mount encrypted datasets to replay logs.")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Pool busy; removal may already be in progress")); + } + (void) zfs_error(hdl, EZFS_BUSY, msg); + break; + + case EACCES: + if (islog) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Mount encrypted datasets to replay logs.")); + (void) zfs_error(hdl, EZFS_BUSY, msg); + } else { + (void) zpool_standard_error(hdl, errno, msg); + } + break; + + default: + (void) zpool_standard_error(hdl, errno, msg); + } + return (-1); +} + +int +zpool_vdev_remove_cancel(zpool_handle_t *zhp) +{ + zfs_cmd_t zc; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot cancel removal")); + + bzero(&zc, sizeof (zc)); + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_cookie = 1; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +int +zpool_vdev_indirect_size(zpool_handle_t *zhp, const char *path, + uint64_t *sizep) +{ + char msg[1024]; + nvlist_t *tgt; + boolean_t avail_spare, l2cache, islog; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot determine indirect size of %s"), + path); + + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, + &islog)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + if (avail_spare || l2cache || islog) { + *sizep = 0; + return (0); + } + + if (nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_INDIRECT_SIZE, sizep) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "indirect size not available")); + return (zfs_error(hdl, EINVAL, msg)); + } + return (0); +} + +/* + * Clear the errors for the pool, or the particular device if specified. 
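+ *
+ * A minimal whole-pool sketch, using an empty nvlist to select the default
+ * (no-rewind) load policy:
+ *
+ *    nvlist_t *policy = fnvlist_alloc();
+ *    int err = zpool_clear(zhp, NULL, policy);
+ *    nvlist_free(policy);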
+ */ +int +zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + nvlist_t *tgt; + zpool_load_policy_t policy; + boolean_t avail_spare, l2cache; + libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nvi = NULL; + int error; + + if (path) + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), + path); + else + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot clear errors for %s"), + zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (path) { + if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, + &l2cache, NULL)) == NULL) + return (zfs_error(hdl, EZFS_NODEVICE, msg)); + + /* + * Don't allow error clearing for hot spares. Do allow + * error clearing for l2cache devices. + */ + if (avail_spare) + return (zfs_error(hdl, EZFS_ISSPARE, msg)); + + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, + &zc.zc_guid) == 0); + } + + zpool_get_load_policy(rewindnvl, &policy); + zc.zc_cookie = policy.zlp_rewind; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, zhp->zpool_config_size * 2) != 0) + return (-1); + + if (zcmd_write_src_nvlist(hdl, &zc, rewindnvl) != 0) + return (-1); + + while ((error = zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc)) != 0 && + errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { + zcmd_free_nvlists(&zc); + return (-1); + } + } + + if (!error || ((policy.zlp_rewind & ZPOOL_TRY_REWIND) && + errno != EPERM && errno != EACCES)) { + if (policy.zlp_rewind & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_rewind_exclaim(hdl, zc.zc_name, + ((policy.zlp_rewind & ZPOOL_TRY_REWIND) != 0), + nvi); + nvlist_free(nvi); + } + zcmd_free_nvlists(&zc); + return (0); + } + + zcmd_free_nvlists(&zc); + return (zpool_standard_error(hdl, errno, msg)); +} + +/* + * Similar to zpool_clear(), but takes a GUID (used by fmd). + */ +int +zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot clear errors for %llx"), + (u_longlong_t)guid); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_guid = guid; + zc.zc_cookie = ZPOOL_NO_REWIND; + + if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +/* + * Change the GUID for a pool. + */ +int +zpool_reguid(zpool_handle_t *zhp) +{ + char msg[1024]; + libzfs_handle_t *hdl = zhp->zpool_hdl; + zfs_cmd_t zc = {"\0"}; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot reguid '%s'"), zhp->zpool_name); + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + if (zfs_ioctl(hdl, ZFS_IOC_POOL_REGUID, &zc) == 0) + return (0); + + return (zpool_standard_error(hdl, errno, msg)); +} + +/* + * Reopen the pool. 
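+ *
+ * The signature matches the zpool_iter() callback, so every imported pool
+ * can be reopened in one pass, for example:
+ *
+ *    boolean_t scrub_restart = B_FALSE;
+ *    (void) zpool_iter(hdl, zpool_reopen_one, &scrub_restart);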
+ */ +int +zpool_reopen_one(zpool_handle_t *zhp, void *data) +{ + libzfs_handle_t *hdl = zpool_get_handle(zhp); + const char *pool_name = zpool_get_name(zhp); + boolean_t *scrub_restart = data; + int error; + + error = lzc_reopen(pool_name, *scrub_restart); + if (error) { + return (zpool_standard_error_fmt(hdl, error, + dgettext(TEXT_DOMAIN, "cannot reopen '%s'"), pool_name)); + } + + return (0); +} + +/* call into libzfs_core to execute the sync IOCTL per pool */ +int +zpool_sync_one(zpool_handle_t *zhp, void *data) +{ + int ret; + libzfs_handle_t *hdl = zpool_get_handle(zhp); + const char *pool_name = zpool_get_name(zhp); + boolean_t *force = data; + nvlist_t *innvl = fnvlist_alloc(); + + fnvlist_add_boolean_value(innvl, "force", *force); + if ((ret = lzc_sync(pool_name, innvl, NULL)) != 0) { + nvlist_free(innvl); + return (zpool_standard_error_fmt(hdl, ret, + dgettext(TEXT_DOMAIN, "sync '%s' failed"), pool_name)); + } + nvlist_free(innvl); + + return (0); +} + +#define PATH_BUF_LEN 64 + +/* + * Given a vdev, return the name to display in iostat. If the vdev has a path, + * we use that, stripping off any leading "/dev/dsk/"; if not, we use the type. + * We also check if this is a whole disk, in which case we strip off the + * trailing 's0' slice name. + * + * This routine is also responsible for identifying when disks have been + * reconfigured in a new location. The kernel will have opened the device by + * devid, but the path will still refer to the old location. To catch this, we + * first do a path -> devid translation (which is fast for the common case). If + * the devid matches, we're done. If not, we do a reverse devid -> path + * translation and issue the appropriate ioctl() to update the path of the vdev. + * If 'zhp' is NULL, then this is an exported pool, and we don't need to do any + * of these checks. + */ +char * +zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, + int name_flags) +{ + char *path, *type, *env; + uint64_t value; + char buf[PATH_BUF_LEN]; + char tmpbuf[PATH_BUF_LEN]; + + /* + * vdev_name will be "root"/"root-0" for the root vdev, but it is the + * zpool name that will be displayed to the user. + */ + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); + if (zhp != NULL && strcmp(type, "root") == 0) + return (zfs_strdup(hdl, zpool_get_name(zhp))); + + env = getenv("ZPOOL_VDEV_NAME_PATH"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) + name_flags |= VDEV_NAME_PATH; + + env = getenv("ZPOOL_VDEV_NAME_GUID"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) + name_flags |= VDEV_NAME_GUID; + + env = getenv("ZPOOL_VDEV_NAME_FOLLOW_LINKS"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) + name_flags |= VDEV_NAME_FOLLOW_LINKS; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &value) == 0 || + name_flags & VDEV_NAME_GUID) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &value); + (void) snprintf(buf, sizeof (buf), "%llu", (u_longlong_t)value); + path = buf; + } else if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { + if (name_flags & VDEV_NAME_FOLLOW_LINKS) { + char *rp = realpath(path, NULL); + if (rp) { + strlcpy(buf, rp, sizeof (buf)); + path = buf; + free(rp); + } + } + + /* + * For a block device only use the name. 
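+ * (for example, a hypothetical "/dev/sda" is shown as just "sda").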
+ */
+ if ((strcmp(type, VDEV_TYPE_DISK) == 0) &&
+ !(name_flags & VDEV_NAME_PATH)) {
+ path = zfs_strip_path(path);
+ }
+
+ /*
+ * Remove the partition from the path if this is a whole disk.
+ */
+ if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &value)
+ == 0 && value && !(name_flags & VDEV_NAME_PATH)) {
+ return (zfs_strip_partition(path));
+ }
+ } else {
+ path = type;
+
+ /*
+ * If it's a raidz device, we need to stick in the parity level.
+ */
+ if (strcmp(path, VDEV_TYPE_RAIDZ) == 0) {
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY,
+ &value) == 0);
+ (void) snprintf(buf, sizeof (buf), "%s%llu", path,
+ (u_longlong_t)value);
+ path = buf;
+ }
+
+ /*
+ * We identify each top-level vdev by using a
+ * naming convention.
+ */
+ if (name_flags & VDEV_NAME_TYPE_ID) {
+ uint64_t id;
+ verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID,
+ &id) == 0);
+ (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s-%llu",
+ path, (u_longlong_t)id);
+ path = tmpbuf;
+ }
+ }
+
+ return (zfs_strdup(hdl, path));
+}
+
+static int
+zbookmark_mem_compare(const void *a, const void *b)
+{
+ return (memcmp(a, b, sizeof (zbookmark_phys_t)));
+}
+
+/*
+ * Retrieve the persistent error log, uniquify the members, and return to the
+ * caller.
+ */
+int
+zpool_get_errlog(zpool_handle_t *zhp, nvlist_t **nverrlistp)
+{
+ zfs_cmd_t zc = {"\0"};
+ libzfs_handle_t *hdl = zhp->zpool_hdl;
+ uint64_t count;
+ zbookmark_phys_t *zb = NULL;
+ int i;
+
+ /*
+ * Retrieve the raw error list from the kernel. If the number of errors
+ * has increased, allocate more space and continue until we get the
+ * entire list.
+ */
+ verify(nvlist_lookup_uint64(zhp->zpool_config, ZPOOL_CONFIG_ERRCOUNT,
+ &count) == 0);
+ if (count == 0)
+ return (0);
+ zc.zc_nvlist_dst = (uintptr_t)zfs_alloc(zhp->zpool_hdl,
+ count * sizeof (zbookmark_phys_t));
+ zc.zc_nvlist_dst_size = count;
+ (void) strcpy(zc.zc_name, zhp->zpool_name);
+ for (;;) {
+ if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_ERROR_LOG,
+ &zc) != 0) {
+ free((void *)(uintptr_t)zc.zc_nvlist_dst);
+ if (errno == ENOMEM) {
+ void *dst;
+
+ count = zc.zc_nvlist_dst_size;
+ dst = zfs_alloc(zhp->zpool_hdl, count *
+ sizeof (zbookmark_phys_t));
+ zc.zc_nvlist_dst = (uintptr_t)dst;
+ } else {
+ return (zpool_standard_error_fmt(hdl, errno,
+ dgettext(TEXT_DOMAIN, "errors: List of "
+ "errors unavailable")));
+ }
+ } else {
+ break;
+ }
+ }
+
+ /*
+ * Sort the resulting bookmarks. This is a little confusing due to the
+ * implementation of ZFS_IOC_ERROR_LOG. The bookmarks are copied last
+ * to first, and 'zc_nvlist_dst_size' indicates the number of bookmarks
+ * _not_ copied as part of the process. So we point the start of our
+ * array appropriately and decrement the total number of elements.
+ */
+ zb = ((zbookmark_phys_t *)(uintptr_t)zc.zc_nvlist_dst) +
+ zc.zc_nvlist_dst_size;
+ count -= zc.zc_nvlist_dst_size;
+
+ qsort(zb, count, sizeof (zbookmark_phys_t), zbookmark_mem_compare);
+
+ verify(nvlist_alloc(nverrlistp, 0, KM_SLEEP) == 0);
+
+ /*
+ * Fill in the nverrlistp with nvlists of dataset and object numbers.
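+ * Entries that duplicate the previous dataset/object pair (they were
+ * sorted together above) are collapsed into one.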
+ */ + for (i = 0; i < count; i++) { + nvlist_t *nv; + + /* ignoring zb_blkid and zb_level for now */ + if (i > 0 && zb[i-1].zb_objset == zb[i].zb_objset && + zb[i-1].zb_object == zb[i].zb_object) + continue; + + if (nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) != 0) + goto nomem; + if (nvlist_add_uint64(nv, ZPOOL_ERR_DATASET, + zb[i].zb_objset) != 0) { + nvlist_free(nv); + goto nomem; + } + if (nvlist_add_uint64(nv, ZPOOL_ERR_OBJECT, + zb[i].zb_object) != 0) { + nvlist_free(nv); + goto nomem; + } + if (nvlist_add_nvlist(*nverrlistp, "ejk", nv) != 0) { + nvlist_free(nv); + goto nomem; + } + nvlist_free(nv); + } + + free((void *)(uintptr_t)zc.zc_nvlist_dst); + return (0); + +nomem: + free((void *)(uintptr_t)zc.zc_nvlist_dst); + return (no_memory(zhp->zpool_hdl)); +} + +/* + * Upgrade a ZFS pool to the latest on-disk version. + */ +int +zpool_upgrade(zpool_handle_t *zhp, uint64_t new_version) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strcpy(zc.zc_name, zhp->zpool_name); + zc.zc_cookie = new_version; + + if (zfs_ioctl(hdl, ZFS_IOC_POOL_UPGRADE, &zc) != 0) + return (zpool_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot upgrade '%s'"), + zhp->zpool_name)); + return (0); +} + +void +zfs_save_arguments(int argc, char **argv, char *string, int len) +{ + int i; + + (void) strlcpy(string, basename(argv[0]), len); + for (i = 1; i < argc; i++) { + (void) strlcat(string, " ", len); + (void) strlcat(string, argv[i], len); + } +} + +int +zpool_log_history(libzfs_handle_t *hdl, const char *message) +{ + zfs_cmd_t zc = {"\0"}; + nvlist_t *args; + int err; + + args = fnvlist_alloc(); + fnvlist_add_string(args, "message", message); + err = zcmd_write_src_nvlist(hdl, &zc, args); + if (err == 0) + err = zfs_ioctl(hdl, ZFS_IOC_LOG_HISTORY, &zc); + nvlist_free(args); + zcmd_free_nvlists(&zc); + return (err); +} + +/* + * Perform ioctl to get some command history of a pool. + * + * 'buf' is the buffer to fill up to 'len' bytes. 'off' is the + * logical offset of the history buffer to start reading from. + * + * Upon return, 'off' is the next logical offset to read from and + * 'len' is the actual amount of bytes read into 'buf'. + */ +static int +get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zpool_hdl; + + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + + zc.zc_history = (uint64_t)(uintptr_t)buf; + zc.zc_history_len = *len; + zc.zc_history_offset = *off; + + if (zfs_ioctl(hdl, ZFS_IOC_POOL_GET_HISTORY, &zc) != 0) { + switch (errno) { + case EPERM: + return (zfs_error_fmt(hdl, EZFS_PERM, + dgettext(TEXT_DOMAIN, + "cannot show history for pool '%s'"), + zhp->zpool_name)); + case ENOENT: + return (zfs_error_fmt(hdl, EZFS_NOHISTORY, + dgettext(TEXT_DOMAIN, "cannot get history for pool " + "'%s'"), zhp->zpool_name)); + case ENOTSUP: + return (zfs_error_fmt(hdl, EZFS_BADVERSION, + dgettext(TEXT_DOMAIN, "cannot get history for pool " + "'%s', pool must be upgraded"), zhp->zpool_name)); + default: + return (zpool_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot get history for '%s'"), zhp->zpool_name)); + } + } + + *len = zc.zc_history_len; + *off = zc.zc_history_offset; + + return (0); +} + +/* + * Retrieve the command history of a pool. 
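+ *
+ * A minimal consumer reads chunks from logical offset 0 until EOF:
+ *
+ *    nvlist_t *nvhis;
+ *    uint64_t off = 0;
+ *    boolean_t eof = B_FALSE;
+ *
+ *    while (!eof) {
+ *        if (zpool_get_history(zhp, &nvhis, &off, &eof) != 0)
+ *            break;
+ *        ... consume the ZPOOL_HIST_RECORD array ...
+ *        nvlist_free(nvhis);
+ *    }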
+ */
+int
+zpool_get_history(zpool_handle_t *zhp, nvlist_t **nvhisp, uint64_t *off,
+ boolean_t *eof)
+{
+ char *buf;
+ int buflen = 128 * 1024;
+ nvlist_t **records = NULL;
+ uint_t numrecords = 0;
+ int err, i;
+ uint64_t start = *off;
+
+ buf = malloc(buflen);
+ if (buf == NULL)
+ return (ENOMEM);
+ /* process about 1MB at a time */
+ while (*off - start < 1024 * 1024) {
+ uint64_t bytes_read = buflen;
+ uint64_t leftover;
+
+ if ((err = get_history(zhp, buf, off, &bytes_read)) != 0)
+ break;
+
+ /* if nothing else was read in, we're at EOF, just return */
+ if (!bytes_read) {
+ *eof = B_TRUE;
+ break;
+ }
+
+ if ((err = zpool_history_unpack(buf, bytes_read,
+ &leftover, &records, &numrecords)) != 0)
+ break;
+ *off -= leftover;
+ if (leftover == bytes_read) {
+ /*
+ * no progress made, because buffer is not big enough
+ * to hold this record; resize and retry.
+ */
+ buflen *= 2;
+ free(buf);
+ buf = malloc(buflen);
+ if (buf == NULL)
+ return (ENOMEM);
+ }
+ }
+
+ free(buf);
+
+ if (!err) {
+ verify(nvlist_alloc(nvhisp, NV_UNIQUE_NAME, 0) == 0);
+ verify(nvlist_add_nvlist_array(*nvhisp, ZPOOL_HIST_RECORD,
+ records, numrecords) == 0);
+ }
+ for (i = 0; i < numrecords; i++)
+ nvlist_free(records[i]);
+ free(records);
+
+ return (err);
+}
+
+/*
+ * Retrieve the next event given the passed 'zevent_fd' file descriptor.
+ * If there is a new event available 'nvp' will contain a newly allocated
+ * nvlist and 'dropped' will be set to the number of missed events since
+ * the last call to this function. When 'nvp' is set to NULL it indicates
+ * no new events are available. In either case the function returns 0 and
+ * it is up to the caller to free 'nvp'. In the case of a fatal error the
+ * function will return a non-zero value. When the function is called in
+ * blocking mode (the default, unless the ZEVENT_NONBLOCK flag is passed),
+ * it will not return until a new event is available.
+ */
+int
+zpool_events_next(libzfs_handle_t *hdl, nvlist_t **nvp,
+ int *dropped, unsigned flags, int zevent_fd)
+{
+ zfs_cmd_t zc = {"\0"};
+ int error = 0;
+
+ *nvp = NULL;
+ *dropped = 0;
+ zc.zc_cleanup_fd = zevent_fd;
+
+ if (flags & ZEVENT_NONBLOCK)
+ zc.zc_guid = ZEVENT_NONBLOCK;
+
+ if (zcmd_alloc_dst_nvlist(hdl, &zc, ZEVENT_SIZE) != 0)
+ return (-1);
+
+retry:
+ if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_NEXT, &zc) != 0) {
+ switch (errno) {
+ case ESHUTDOWN:
+ error = zfs_error_fmt(hdl, EZFS_POOLUNAVAIL,
+ dgettext(TEXT_DOMAIN, "zfs shutdown"));
+ goto out;
+ case ENOENT:
+ /* Blocking error case should not occur */
+ if (!(flags & ZEVENT_NONBLOCK))
+ error = zpool_standard_error_fmt(hdl, errno,
+ dgettext(TEXT_DOMAIN, "cannot get event"));
+
+ goto out;
+ case ENOMEM:
+ if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) {
+ error = zfs_error_fmt(hdl, EZFS_NOMEM,
+ dgettext(TEXT_DOMAIN, "cannot get event"));
+ goto out;
+ } else {
+ goto retry;
+ }
+ default:
+ error = zpool_standard_error_fmt(hdl, errno,
+ dgettext(TEXT_DOMAIN, "cannot get event"));
+ goto out;
+ }
+ }
+
+ error = zcmd_read_dst_nvlist(hdl, &zc, nvp);
+ if (error != 0)
+ goto out;
+
+ *dropped = (int)zc.zc_cookie;
+out:
+ zcmd_free_nvlists(&zc);
+
+ return (error);
+}
+
+/*
+ * Clear all events.
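+ *
+ * On success the number of discarded events is optionally returned:
+ *
+ *    int count;
+ *    if (zpool_events_clear(hdl, &count) == 0)
+ *        (void) printf("%d events cleared\n", count);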
+ */ +int +zpool_events_clear(libzfs_handle_t *hdl, int *count) +{ + zfs_cmd_t zc = {"\0"}; + char msg[1024]; + + (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, + "cannot clear events")); + + if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_CLEAR, &zc) != 0) + return (zpool_standard_error_fmt(hdl, errno, msg)); + + if (count != NULL) + *count = (int)zc.zc_cookie; /* # of events cleared */ + + return (0); +} + +/* + * Seek to a specific EID, ZEVENT_SEEK_START, or ZEVENT_SEEK_END for + * the passed zevent_fd file handle. On success zero is returned, + * otherwise -1 is returned and hdl->libzfs_error is set to the errno. + */ +int +zpool_events_seek(libzfs_handle_t *hdl, uint64_t eid, int zevent_fd) +{ + zfs_cmd_t zc = {"\0"}; + int error = 0; + + zc.zc_guid = eid; + zc.zc_cleanup_fd = zevent_fd; + + if (zfs_ioctl(hdl, ZFS_IOC_EVENTS_SEEK, &zc) != 0) { + switch (errno) { + case ENOENT: + error = zfs_error_fmt(hdl, EZFS_NOENT, + dgettext(TEXT_DOMAIN, "cannot get event")); + break; + + case ENOMEM: + error = zfs_error_fmt(hdl, EZFS_NOMEM, + dgettext(TEXT_DOMAIN, "cannot get event")); + break; + + default: + error = zpool_standard_error_fmt(hdl, errno, + dgettext(TEXT_DOMAIN, "cannot get event")); + break; + } + } + + return (error); +} + +static void +zpool_obj_to_path_impl(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, + char *pathname, size_t len, boolean_t always_unmounted) +{ + zfs_cmd_t zc = {"\0"}; + boolean_t mounted = B_FALSE; + char *mntpnt = NULL; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + + if (dsobj == 0) { + /* special case for the MOS */ + (void) snprintf(pathname, len, ":<0x%llx>", + (longlong_t)obj); + return; + } + + /* get the dataset's name */ + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + zc.zc_obj = dsobj; + if (zfs_ioctl(zhp->zpool_hdl, + ZFS_IOC_DSOBJ_TO_DSNAME, &zc) != 0) { + /* just write out a path of two object numbers */ + (void) snprintf(pathname, len, "<0x%llx>:<0x%llx>", + (longlong_t)dsobj, (longlong_t)obj); + return; + } + (void) strlcpy(dsname, zc.zc_value, sizeof (dsname)); + + /* find out if the dataset is mounted */ + mounted = !always_unmounted && is_mounted(zhp->zpool_hdl, dsname, + &mntpnt); + + /* get the corrupted object's path */ + (void) strlcpy(zc.zc_name, dsname, sizeof (zc.zc_name)); + zc.zc_obj = obj; + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_OBJ_TO_PATH, + &zc) == 0) { + if (mounted) { + (void) snprintf(pathname, len, "%s%s", mntpnt, + zc.zc_value); + } else { + (void) snprintf(pathname, len, "%s:%s", + dsname, zc.zc_value); + } + } else { + (void) snprintf(pathname, len, "%s:<0x%llx>", dsname, + (longlong_t)obj); + } + free(mntpnt); +} + +void +zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, + char *pathname, size_t len) +{ + zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_FALSE); +} + +void +zpool_obj_to_path_ds(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, + char *pathname, size_t len) +{ + zpool_obj_to_path_impl(zhp, dsobj, obj, pathname, len, B_TRUE); +} +/* + * Wait while the specified activity is in progress in the pool. 
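+ *
+ * For example, to block until an in-progress scrub finishes:
+ *
+ *    int err = zpool_wait(zhp, ZPOOL_WAIT_SCRUB);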
+ */ +int +zpool_wait(zpool_handle_t *zhp, zpool_wait_activity_t activity) +{ + boolean_t missing; + + int error = zpool_wait_status(zhp, activity, &missing, NULL); + + if (missing) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, ENOENT, + dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), + zhp->zpool_name); + return (ENOENT); + } else { + return (error); + } +} + +/* + * Wait for the given activity and return the status of the wait (whether or not + * any waiting was done) in the 'waited' parameter. Non-existent pools are + * reported via the 'missing' parameter, rather than by printing an error + * message. This is convenient when this function is called in a loop over a + * long period of time (as it is, for example, by zpool's wait cmd). In that + * scenario, a pool being exported or destroyed should be considered a normal + * event, so we don't want to print an error when we find that the pool doesn't + * exist. + */ +int +zpool_wait_status(zpool_handle_t *zhp, zpool_wait_activity_t activity, + boolean_t *missing, boolean_t *waited) +{ + int error = lzc_wait(zhp->zpool_name, activity, waited); + *missing = (error == ENOENT); + if (*missing) + return (0); + + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, "error waiting in pool '%s'"), + zhp->zpool_name); + } + + return (error); +} + +int +zpool_set_bootenv(zpool_handle_t *zhp, const char *envmap) +{ + int error = lzc_set_bootenv(zhp->zpool_name, envmap); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error setting bootenv in pool '%s'"), zhp->zpool_name); + } + + return (error); +} + +int +zpool_get_bootenv(zpool_handle_t *zhp, char *outbuf, size_t size, off_t offset) +{ + nvlist_t *nvl = NULL; + int error = lzc_get_bootenv(zhp->zpool_name, &nvl); + if (error != 0) { + (void) zpool_standard_error_fmt(zhp->zpool_hdl, error, + dgettext(TEXT_DOMAIN, + "error getting bootenv in pool '%s'"), zhp->zpool_name); + return (-1); + } + char *envmap = fnvlist_lookup_string(nvl, "envmap"); + if (offset >= strlen(envmap)) { + fnvlist_free(nvl); + return (0); + } + + strncpy(outbuf, envmap + offset, size); + int bytes = MIN(strlen(envmap + offset), size); + fnvlist_free(nvl); + return (bytes); +} diff --git a/lib/libzfs/libzfs_sendrecv.c b/lib/libzfs/libzfs_sendrecv.c new file mode 100644 index 000000000000..f1524c384feb --- /dev/null +++ b/lib/libzfs/libzfs_sendrecv.c @@ -0,0 +1,5170 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2012, Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2012 Pawel Jakub Dawidek . + * All rights reserved + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright (c) 2019 Datto Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_fletcher.h" +#include "libzfs_impl.h" +#include +#include +#include +#include +#include +#include +#include + +static int zfs_receive_impl(libzfs_handle_t *, const char *, const char *, + recvflags_t *, int, const char *, nvlist_t *, avl_tree_t *, char **, + const char *, nvlist_t *); +static int guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent, + uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids, + uint64_t num_redact_snaps, char *name); +static int guid_to_name(libzfs_handle_t *, const char *, + uint64_t, boolean_t, char *); + +typedef struct progress_arg { + zfs_handle_t *pa_zhp; + int pa_fd; + boolean_t pa_parsable; + boolean_t pa_estimate; + int pa_verbosity; +} progress_arg_t; + +static int +dump_record(dmu_replay_record_t *drr, void *payload, int payload_len, + zio_cksum_t *zc, int outfd) +{ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + fletcher_4_incremental_native(drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), zc); + if (drr->drr_type != DRR_BEGIN) { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&drr->drr_u. + drr_checksum.drr_checksum)); + drr->drr_u.drr_checksum.drr_checksum = *zc; + } + fletcher_4_incremental_native(&drr->drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), zc); + if (write(outfd, drr, sizeof (*drr)) == -1) + return (errno); + if (payload_len != 0) { + fletcher_4_incremental_native(payload, payload_len, zc); + if (write(outfd, payload, payload_len) == -1) + return (errno); + } + return (0); +} + +/* + * Routines for dealing with the AVL tree of fs-nvlists + */ +typedef struct fsavl_node { + avl_node_t fn_node; + nvlist_t *fn_nvfs; + char *fn_snapname; + uint64_t fn_guid; +} fsavl_node_t; + +static int +fsavl_compare(const void *arg1, const void *arg2) +{ + const fsavl_node_t *fn1 = (const fsavl_node_t *)arg1; + const fsavl_node_t *fn2 = (const fsavl_node_t *)arg2; + + return (TREE_CMP(fn1->fn_guid, fn2->fn_guid)); +} + +/* + * Given the GUID of a snapshot, find its containing filesystem and + * (optionally) name. 
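+ *
+ * With a tree built by fsavl_create() below, a lookup is simply:
+ *
+ *    char *snapname;
+ *    nvlist_t *nvfs = fsavl_find(avl, guid, &snapname);
+ *
+ * which returns the fs-nvlist containing that snapshot GUID, or NULL.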
+ */ +static nvlist_t * +fsavl_find(avl_tree_t *avl, uint64_t snapguid, char **snapname) +{ + fsavl_node_t fn_find; + fsavl_node_t *fn; + + fn_find.fn_guid = snapguid; + + fn = avl_find(avl, &fn_find, NULL); + if (fn) { + if (snapname) + *snapname = fn->fn_snapname; + return (fn->fn_nvfs); + } + return (NULL); +} + +static void +fsavl_destroy(avl_tree_t *avl) +{ + fsavl_node_t *fn; + void *cookie; + + if (avl == NULL) + return; + + cookie = NULL; + while ((fn = avl_destroy_nodes(avl, &cookie)) != NULL) + free(fn); + avl_destroy(avl); + free(avl); +} + +/* + * Given an nvlist, produce an avl tree of snapshots, ordered by guid + */ +static avl_tree_t * +fsavl_create(nvlist_t *fss) +{ + avl_tree_t *fsavl; + nvpair_t *fselem = NULL; + + if ((fsavl = malloc(sizeof (avl_tree_t))) == NULL) + return (NULL); + + avl_create(fsavl, fsavl_compare, sizeof (fsavl_node_t), + offsetof(fsavl_node_t, fn_node)); + + while ((fselem = nvlist_next_nvpair(fss, fselem)) != NULL) { + nvlist_t *nvfs, *snaps; + nvpair_t *snapelem = NULL; + + VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); + VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); + + while ((snapelem = + nvlist_next_nvpair(snaps, snapelem)) != NULL) { + fsavl_node_t *fn; + uint64_t guid; + + VERIFY(0 == nvpair_value_uint64(snapelem, &guid)); + if ((fn = malloc(sizeof (fsavl_node_t))) == NULL) { + fsavl_destroy(fsavl); + return (NULL); + } + fn->fn_nvfs = nvfs; + fn->fn_snapname = nvpair_name(snapelem); + fn->fn_guid = guid; + + /* + * Note: if there are multiple snaps with the + * same GUID, we ignore all but one. + */ + if (avl_find(fsavl, fn, NULL) == NULL) + avl_add(fsavl, fn); + else + free(fn); + } + } + + return (fsavl); +} + +/* + * Routines for dealing with the giant nvlist of fs-nvlists, etc. + */ +typedef struct send_data { + /* + * assigned inside every recursive call, + * restored from *_save on return: + * + * guid of fromsnap snapshot in parent dataset + * txg of fromsnap snapshot in current dataset + * txg of tosnap snapshot in current dataset + */ + + uint64_t parent_fromsnap_guid; + uint64_t fromsnap_txg; + uint64_t tosnap_txg; + + /* the nvlists get accumulated during depth-first traversal */ + nvlist_t *parent_snaps; + nvlist_t *fss; + nvlist_t *snapprops; + nvlist_t *snapholds; /* user holds */ + + /* send-receive configuration, does not change during traversal */ + const char *fsname; + const char *fromsnap; + const char *tosnap; + boolean_t recursive; + boolean_t raw; + boolean_t doall; + boolean_t replicate; + boolean_t verbose; + boolean_t backup; + boolean_t seenfrom; + boolean_t seento; + boolean_t holds; /* were holds requested with send -h */ + boolean_t props; + + /* + * The header nvlist is of the following format: + * { + * "tosnap" -> string + * "fromsnap" -> string (if incremental) + * "fss" -> { + * id -> { + * + * "name" -> string (full name; for debugging) + * "parentfromsnap" -> number (guid of fromsnap in parent) + * + * "props" -> { name -> value (only if set here) } + * "snaps" -> { name (lastname) -> number (guid) } + * "snapprops" -> { name (lastname) -> { name -> value } } + * "snapholds" -> { name (lastname) -> { holdname -> crtime } } + * + * "origin" -> number (guid) (if clone) + * "is_encroot" -> boolean + * "sent" -> boolean (not on-disk) + * } + * } + * } + * + */ +} send_data_t; + +static void +send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv); + +static int +send_iterate_snap(zfs_handle_t *zhp, void *arg) +{ + send_data_t *sd = arg; + uint64_t guid = 
zhp->zfs_dmustats.dds_guid; + uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; + char *snapname; + nvlist_t *nv; + boolean_t isfromsnap, istosnap, istosnapwithnofrom; + + snapname = strrchr(zhp->zfs_name, '@')+1; + isfromsnap = (sd->fromsnap != NULL && + strcmp(sd->fromsnap, snapname) == 0); + istosnap = (sd->tosnap != NULL && (strcmp(sd->tosnap, snapname) == 0)); + istosnapwithnofrom = (istosnap && sd->fromsnap == NULL); + + if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { + if (sd->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "skipping snapshot %s because it was created " + "after the destination snapshot (%s)\n"), + zhp->zfs_name, sd->tosnap); + } + zfs_close(zhp); + return (0); + } + + VERIFY(0 == nvlist_add_uint64(sd->parent_snaps, snapname, guid)); + /* + * NB: if there is no fromsnap here (it's a newly created fs in + * an incremental replication), we will substitute the tosnap. + */ + if (isfromsnap || (sd->parent_fromsnap_guid == 0 && istosnap)) { + sd->parent_fromsnap_guid = guid; + } + + if (!sd->recursive) { + if (!sd->seenfrom && isfromsnap) { + sd->seenfrom = B_TRUE; + zfs_close(zhp); + return (0); + } + + if ((sd->seento || !sd->seenfrom) && !istosnapwithnofrom) { + zfs_close(zhp); + return (0); + } + + if (istosnap) + sd->seento = B_TRUE; + } + + VERIFY(0 == nvlist_alloc(&nv, NV_UNIQUE_NAME, 0)); + send_iterate_prop(zhp, sd->backup, nv); + VERIFY(0 == nvlist_add_nvlist(sd->snapprops, snapname, nv)); + nvlist_free(nv); + if (sd->holds) { + nvlist_t *holds = fnvlist_alloc(); + int err = lzc_get_holds(zhp->zfs_name, &holds); + if (err == 0) { + VERIFY(0 == nvlist_add_nvlist(sd->snapholds, + snapname, holds)); + } + fnvlist_free(holds); + } + + zfs_close(zhp); + return (0); +} + +static void +send_iterate_prop(zfs_handle_t *zhp, boolean_t received_only, nvlist_t *nv) +{ + nvlist_t *props = NULL; + nvpair_t *elem = NULL; + + if (received_only) + props = zfs_get_recvd_props(zhp); + else + props = zhp->zfs_props; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + char *propname = nvpair_name(elem); + zfs_prop_t prop = zfs_name_to_prop(propname); + nvlist_t *propnv; + + if (!zfs_prop_user(propname)) { + /* + * Realistically, this should never happen. However, + * we want the ability to add DSL properties without + * needing to make incompatible version changes. We + * need to ignore unknown properties to allow older + * software to still send datasets containing these + * properties, with the unknown properties elided. + */ + if (prop == ZPROP_INVAL) + continue; + + if (zfs_prop_readonly(prop)) + continue; + } + + verify(nvpair_value_nvlist(elem, &propnv) == 0); + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { + char *source; + uint64_t value; + verify(nvlist_lookup_uint64(propnv, + ZPROP_VALUE, &value) == 0); + if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) + continue; + /* + * May have no source before SPA_VERSION_RECVD_PROPS, + * but is still modifiable. 
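+			 * If the source lookup below fails for that reason,
+			 * the value is included anyway; otherwise only
+			 * locally-set or received values are sent.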
+ */ + if (nvlist_lookup_string(propnv, + ZPROP_SOURCE, &source) == 0) { + if ((strcmp(source, zhp->zfs_name) != 0) && + (strcmp(source, + ZPROP_SOURCE_VAL_RECVD) != 0)) + continue; + } + } else { + char *source; + if (nvlist_lookup_string(propnv, + ZPROP_SOURCE, &source) != 0) + continue; + if ((strcmp(source, zhp->zfs_name) != 0) && + (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) + continue; + } + + if (zfs_prop_user(propname) || + zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + char *value; + verify(nvlist_lookup_string(propnv, + ZPROP_VALUE, &value) == 0); + VERIFY(0 == nvlist_add_string(nv, propname, value)); + } else { + uint64_t value; + verify(nvlist_lookup_uint64(propnv, + ZPROP_VALUE, &value) == 0); + VERIFY(0 == nvlist_add_uint64(nv, propname, value)); + } + } +} + +/* + * returns snapshot creation txg + * and returns 0 if the snapshot does not exist + */ +static uint64_t +get_snap_txg(libzfs_handle_t *hdl, const char *fs, const char *snap) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t txg = 0; + + if (fs == NULL || fs[0] == '\0' || snap == NULL || snap[0] == '\0') + return (txg); + + (void) snprintf(name, sizeof (name), "%s@%s", fs, snap); + if (zfs_dataset_exists(hdl, name, ZFS_TYPE_SNAPSHOT)) { + zfs_handle_t *zhp = zfs_open(hdl, name, ZFS_TYPE_SNAPSHOT); + if (zhp != NULL) { + txg = zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG); + zfs_close(zhp); + } + } + + return (txg); +} + +/* + * recursively generate nvlists describing datasets. See comment + * for the data structure send_data_t above for description of contents + * of the nvlist. + */ +static int +send_iterate_fs(zfs_handle_t *zhp, void *arg) +{ + send_data_t *sd = arg; + nvlist_t *nvfs = NULL, *nv = NULL; + int rv = 0; + uint64_t min_txg = 0, max_txg = 0; + uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; + uint64_t fromsnap_txg_save = sd->fromsnap_txg; + uint64_t tosnap_txg_save = sd->tosnap_txg; + uint64_t txg = zhp->zfs_dmustats.dds_creation_txg; + uint64_t guid = zhp->zfs_dmustats.dds_guid; + uint64_t fromsnap_txg, tosnap_txg; + char guidstring[64]; + + fromsnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->fromsnap); + if (fromsnap_txg != 0) + sd->fromsnap_txg = fromsnap_txg; + + tosnap_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, sd->tosnap); + if (tosnap_txg != 0) + sd->tosnap_txg = tosnap_txg; + + /* + * on the send side, if the current dataset does not have tosnap, + * perform two additional checks: + * + * - skip sending the current dataset if it was created later than + * the parent tosnap + * - return error if the current dataset was created earlier than + * the parent tosnap + */ + if (sd->tosnap != NULL && tosnap_txg == 0) { + if (sd->tosnap_txg != 0 && txg > sd->tosnap_txg) { + if (sd->verbose) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "skipping dataset %s: snapshot %s does " + "not exist\n"), zhp->zfs_name, sd->tosnap); + } + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "cannot send %s@%s%s: snapshot %s@%s does not " + "exist\n"), sd->fsname, sd->tosnap, sd->recursive ? 
+ dgettext(TEXT_DOMAIN, " recursively") : "", + zhp->zfs_name, sd->tosnap); + rv = EZFS_NOENT; + } + goto out; + } + + nvfs = fnvlist_alloc(); + fnvlist_add_string(nvfs, "name", zhp->zfs_name); + fnvlist_add_uint64(nvfs, "parentfromsnap", + sd->parent_fromsnap_guid); + + if (zhp->zfs_dmustats.dds_origin[0]) { + zfs_handle_t *origin = zfs_open(zhp->zfs_hdl, + zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); + if (origin == NULL) { + rv = -1; + goto out; + } + fnvlist_add_uint64(nvfs, "origin", + origin->zfs_dmustats.dds_guid); + + zfs_close(origin); + } + + /* iterate over props */ + if (sd->props || sd->backup || sd->recursive) { + nv = fnvlist_alloc(); + send_iterate_prop(zhp, sd->backup, nv); + } + if (zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) != ZIO_CRYPT_OFF) { + boolean_t encroot; + + /* determine if this dataset is an encryption root */ + if (zfs_crypto_get_encryption_root(zhp, &encroot, NULL) != 0) { + rv = -1; + goto out; + } + + if (encroot) + fnvlist_add_boolean(nvfs, "is_encroot"); + + /* + * Encrypted datasets can only be sent with properties if + * the raw flag is specified because the receive side doesn't + * currently have a mechanism for recursively asking the user + * for new encryption parameters. + */ + if (!sd->raw) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "cannot send %s@%s: encrypted dataset %s may not " + "be sent with properties without the raw flag\n"), + sd->fsname, sd->tosnap, zhp->zfs_name); + rv = -1; + goto out; + } + + } + + if (nv != NULL) + fnvlist_add_nvlist(nvfs, "props", nv); + + /* iterate over snaps, and set sd->parent_fromsnap_guid */ + sd->parent_fromsnap_guid = 0; + sd->parent_snaps = fnvlist_alloc(); + sd->snapprops = fnvlist_alloc(); + if (sd->holds) + VERIFY(0 == nvlist_alloc(&sd->snapholds, NV_UNIQUE_NAME, 0)); + + + /* + * If this is a "doall" send, a replicate send or we're just trying + * to gather a list of previous snapshots, iterate through all the + * snaps in the txg range. Otherwise just look at the one we're + * interested in. 
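+	 * For a non-replicate send we can additionally clamp the iteration
+	 * to the creation-txg window bounded by the fromsnap and tosnap
+	 * txgs, since nothing outside that window can appear in the stream.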
+ */ + if (sd->doall || sd->replicate || sd->tosnap == NULL) { + if (!sd->replicate && fromsnap_txg != 0) + min_txg = fromsnap_txg; + if (!sd->replicate && tosnap_txg != 0) + max_txg = tosnap_txg; + (void) zfs_iter_snapshots_sorted(zhp, send_iterate_snap, sd, + min_txg, max_txg); + } else { + char snapname[MAXPATHLEN] = { 0 }; + zfs_handle_t *snap; + + (void) snprintf(snapname, sizeof (snapname), "%s@%s", + zhp->zfs_name, sd->tosnap); + if (sd->fromsnap != NULL) + sd->seenfrom = B_TRUE; + snap = zfs_open(zhp->zfs_hdl, snapname, + ZFS_TYPE_SNAPSHOT); + if (snap != NULL) + (void) send_iterate_snap(snap, sd); + } + + fnvlist_add_nvlist(nvfs, "snaps", sd->parent_snaps); + fnvlist_add_nvlist(nvfs, "snapprops", sd->snapprops); + if (sd->holds) + fnvlist_add_nvlist(nvfs, "snapholds", sd->snapholds); + fnvlist_free(sd->parent_snaps); + fnvlist_free(sd->snapprops); + fnvlist_free(sd->snapholds); + + /* add this fs to nvlist */ + (void) snprintf(guidstring, sizeof (guidstring), + "0x%llx", (longlong_t)guid); + fnvlist_add_nvlist(sd->fss, guidstring, nvfs); + + /* iterate over children */ + if (sd->recursive) + rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); + +out: + sd->parent_fromsnap_guid = parent_fromsnap_guid_save; + sd->fromsnap_txg = fromsnap_txg_save; + sd->tosnap_txg = tosnap_txg_save; + fnvlist_free(nv); + fnvlist_free(nvfs); + + zfs_close(zhp); + return (rv); +} + +static int +gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, + const char *tosnap, boolean_t recursive, boolean_t raw, boolean_t doall, + boolean_t replicate, boolean_t verbose, boolean_t backup, boolean_t holds, + boolean_t props, nvlist_t **nvlp, avl_tree_t **avlp) +{ + zfs_handle_t *zhp; + send_data_t sd = { 0 }; + int error; + + zhp = zfs_open(hdl, fsname, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return (EZFS_BADTYPE); + + VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); + sd.fsname = fsname; + sd.fromsnap = fromsnap; + sd.tosnap = tosnap; + sd.recursive = recursive; + sd.raw = raw; + sd.doall = doall; + sd.replicate = replicate; + sd.verbose = verbose; + sd.backup = backup; + sd.holds = holds; + sd.props = props; + + if ((error = send_iterate_fs(zhp, &sd)) != 0) { + nvlist_free(sd.fss); + if (avlp != NULL) + *avlp = NULL; + *nvlp = NULL; + return (error); + } + + if (avlp != NULL && (*avlp = fsavl_create(sd.fss)) == NULL) { + nvlist_free(sd.fss); + *nvlp = NULL; + return (EZFS_NOMEM); + } + + *nvlp = sd.fss; + return (0); +} + +/* + * Routines specific to "zfs send" + */ +typedef struct send_dump_data { + /* these are all just the short snapname (the part after the @) */ + const char *fromsnap; + const char *tosnap; + char prevsnap[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t prevsnap_obj; + boolean_t seenfrom, seento, replicate, doall, fromorigin; + boolean_t dryrun, parsable, progress, embed_data, std_out; + boolean_t large_block, compress, raw, holds; + int outfd; + boolean_t err; + nvlist_t *fss; + nvlist_t *snapholds; + avl_tree_t *fsavl; + snapfilter_cb_t *filter_cb; + void *filter_cb_arg; + nvlist_t *debugnv; + char holdtag[ZFS_MAX_DATASET_NAME_LEN]; + int cleanup_fd; + int verbosity; + uint64_t size; +} send_dump_data_t; + +static int +zfs_send_space(zfs_handle_t *zhp, const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + int error; + + assert(snapname != NULL); + error = lzc_send_space(snapname, from, flags, spacep); + + if (error != 0) { + char errbuf[1024]; + (void) snprintf(errbuf, 
sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot estimate space for '%s'"), snapname); + + switch (error) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case ENOENT: + if (zfs_dataset_exists(hdl, snapname, + ZFS_TYPE_SNAPSHOT)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (%s) does not exist"), + snapname); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + case EINVAL: + zfs_error_aux(hdl, strerror(error)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, error, errbuf)); + } + } + + return (0); +} + +/* + * Dumps a backup of the given snapshot (incremental from fromsnap if it's not + * NULL) to the file descriptor specified by outfd. + */ +static int +dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, uint64_t fromsnap_obj, + boolean_t fromorigin, int outfd, enum lzc_send_flags flags, + nvlist_t *debugnv) +{ + zfs_cmd_t zc = {"\0"}; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *thisdbg; + + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + assert(fromsnap_obj == 0 || !fromorigin); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_cookie = outfd; + zc.zc_obj = fromorigin; + zc.zc_sendobj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zc.zc_fromobj = fromsnap_obj; + zc.zc_flags = flags; + + VERIFY(0 == nvlist_alloc(&thisdbg, NV_UNIQUE_NAME, 0)); + if (fromsnap && fromsnap[0] != '\0') { + VERIFY(0 == nvlist_add_string(thisdbg, + "fromsnap", fromsnap)); + } + + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND, &zc) != 0) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + VERIFY(0 == nvlist_add_uint64(thisdbg, "error", errno)); + if (debugnv) { + VERIFY(0 == nvlist_add_nvlist(debugnv, + zhp->zfs_name, thisdbg)); + } + nvlist_free(thisdbg); + + switch (errno) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case EACCES: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "source key must be loaded")); + return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + + case ENOENT: + if (zfs_dataset_exists(hdl, zc.zc_name, + ZFS_TYPE_SNAPSHOT)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (@%s) does not exist"), + zc.zc_value); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + + if (debugnv) + VERIFY(0 == nvlist_add_nvlist(debugnv, zhp->zfs_name, thisdbg)); + nvlist_free(thisdbg); + + return (0); +} + +static void +gather_holds(zfs_handle_t *zhp, send_dump_data_t *sdd) +{ + assert(zhp->zfs_type == ZFS_TYPE_SNAPSHOT); + + /* + * zfs_send() only sets snapholds for sends that need them, + * e.g. replication and doall. 
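+	 * Each snapshot visited is recorded here under sdd->holdtag (a
+	 * name of the form ".send-<pid>-<seq>"); the accumulated nvlist
+	 * is applied with a single zfs_hold_nvl() call before any data is
+	 * generated, and the holds are associated with sdd->cleanup_fd so
+	 * they should be released automatically even if the sending
+	 * process exits abnormally.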
+ */ + if (sdd->snapholds == NULL) + return; + + fnvlist_add_string(sdd->snapholds, zhp->zfs_name, sdd->holdtag); +} + +int +zfs_send_progress(zfs_handle_t *zhp, int fd, uint64_t *bytes_written, + uint64_t *blocks_visited) +{ + zfs_cmd_t zc = {"\0"}; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_cookie = fd; + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SEND_PROGRESS, &zc) != 0) + return (errno); + if (bytes_written != NULL) + *bytes_written = zc.zc_cookie; + if (blocks_visited != NULL) + *blocks_visited = zc.zc_objset_type; + return (0); +} + +static void * +send_progress_thread(void *arg) +{ + progress_arg_t *pa = arg; + zfs_handle_t *zhp = pa->pa_zhp; + uint64_t bytes; + uint64_t blocks; + char buf[16]; + time_t t; + struct tm *tm; + boolean_t firstloop = B_TRUE; + + /* + * Print the progress from ZFS_IOC_SEND_PROGRESS every second. + */ + for (;;) { + int err; + (void) sleep(1); + if ((err = zfs_send_progress(zhp, pa->pa_fd, &bytes, + &blocks)) != 0) { + if (err == EINTR || err == ENOENT) + return ((void *)0); + return ((void *)(uintptr_t)err); + } + + if (firstloop && !pa->pa_parsable) { + (void) fprintf(stderr, + "TIME %s %sSNAPSHOT %s\n", + pa->pa_estimate ? "BYTES" : " SENT", + pa->pa_verbosity >= 2 ? " BLOCKS " : "", + zhp->zfs_name); + firstloop = B_FALSE; + } + + (void) time(&t); + tm = localtime(&t); + + if (pa->pa_verbosity >= 2 && pa->pa_parsable) { + (void) fprintf(stderr, + "%02d:%02d:%02d\t%llu\t%llu\t%s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + (u_longlong_t)bytes, (u_longlong_t)blocks, + zhp->zfs_name); + } else if (pa->pa_verbosity >= 2) { + zfs_nicenum(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, + "%02d:%02d:%02d %5s %8llu %s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + buf, (u_longlong_t)blocks, zhp->zfs_name); + } else if (pa->pa_parsable) { + (void) fprintf(stderr, "%02d:%02d:%02d\t%llu\t%s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + (u_longlong_t)bytes, zhp->zfs_name); + } else { + zfs_nicebytes(bytes, buf, sizeof (buf)); + (void) fprintf(stderr, "%02d:%02d:%02d %5s %s\n", + tm->tm_hour, tm->tm_min, tm->tm_sec, + buf, zhp->zfs_name); + } + } +} + +static void +send_print_verbose(FILE *fout, const char *tosnap, const char *fromsnap, + uint64_t size, boolean_t parsable) +{ + if (parsable) { + if (fromsnap != NULL) { + (void) fprintf(fout, "incremental\t%s\t%s", + fromsnap, tosnap); + } else { + (void) fprintf(fout, "full\t%s", + tosnap); + } + } else { + if (fromsnap != NULL) { + if (strchr(fromsnap, '@') == NULL && + strchr(fromsnap, '#') == NULL) { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from @%s to %s"), + fromsnap, tosnap); + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "send from %s to %s"), + fromsnap, tosnap); + } + } else { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "full send of %s"), + tosnap); + } + } + + if (parsable) { + (void) fprintf(fout, "\t%llu", + (longlong_t)size); + } else if (size != 0) { + char buf[16]; + zfs_nicebytes(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + " estimated size is %s"), buf); + } + (void) fprintf(fout, "\n"); +} + +static int +dump_snapshot(zfs_handle_t *zhp, void *arg) +{ + send_dump_data_t *sdd = arg; + progress_arg_t pa = { 0 }; + pthread_t tid; + char *thissnap; + enum lzc_send_flags flags = 0; + int err; + boolean_t isfromsnap, istosnap, fromorigin; + boolean_t exclude = B_FALSE; + FILE *fout = sdd->std_out ? 
stdout : stderr; + + err = 0; + thissnap = strchr(zhp->zfs_name, '@') + 1; + isfromsnap = (sdd->fromsnap != NULL && + strcmp(sdd->fromsnap, thissnap) == 0); + + if (!sdd->seenfrom && isfromsnap) { + gather_holds(zhp, sdd); + sdd->seenfrom = B_TRUE; + (void) strlcpy(sdd->prevsnap, thissnap, + sizeof (sdd->prevsnap)); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zfs_close(zhp); + return (0); + } + + if (sdd->seento || !sdd->seenfrom) { + zfs_close(zhp); + return (0); + } + + istosnap = (strcmp(sdd->tosnap, thissnap) == 0); + if (istosnap) + sdd->seento = B_TRUE; + + if (sdd->large_block) + flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (sdd->embed_data) + flags |= LZC_SEND_FLAG_EMBED_DATA; + if (sdd->compress) + flags |= LZC_SEND_FLAG_COMPRESS; + if (sdd->raw) + flags |= LZC_SEND_FLAG_RAW; + + if (!sdd->doall && !isfromsnap && !istosnap) { + if (sdd->replicate) { + char *snapname; + nvlist_t *snapprops; + /* + * Filter out all intermediate snapshots except origin + * snapshots needed to replicate clones. + */ + nvlist_t *nvfs = fsavl_find(sdd->fsavl, + zhp->zfs_dmustats.dds_guid, &snapname); + + VERIFY(0 == nvlist_lookup_nvlist(nvfs, + "snapprops", &snapprops)); + VERIFY(0 == nvlist_lookup_nvlist(snapprops, + thissnap, &snapprops)); + exclude = !nvlist_exists(snapprops, "is_clone_origin"); + } else { + exclude = B_TRUE; + } + } + + /* + * If a filter function exists, call it to determine whether + * this snapshot will be sent. + */ + if (exclude || (sdd->filter_cb != NULL && + sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE)) { + /* + * This snapshot is filtered out. Don't send it, and don't + * set prevsnap_obj, so it will be as if this snapshot didn't + * exist, and the next accepted snapshot will be sent as + * an incremental from the last accepted one, or as the + * first (and full) snapshot in the case of a replication, + * non-incremental send. + */ + zfs_close(zhp); + return (0); + } + + gather_holds(zhp, sdd); + fromorigin = sdd->prevsnap[0] == '\0' && + (sdd->fromorigin || sdd->replicate); + + if (sdd->verbosity != 0) { + uint64_t size = 0; + char fromds[ZFS_MAX_DATASET_NAME_LEN]; + + if (sdd->prevsnap[0] != '\0') { + (void) strlcpy(fromds, zhp->zfs_name, sizeof (fromds)); + *(strchr(fromds, '@') + 1) = '\0'; + (void) strlcat(fromds, sdd->prevsnap, sizeof (fromds)); + } + if (zfs_send_space(zhp, zhp->zfs_name, + sdd->prevsnap[0] ? fromds : NULL, flags, &size) != 0) { + size = 0; /* cannot estimate send space */ + } else { + send_print_verbose(fout, zhp->zfs_name, + sdd->prevsnap[0] ? sdd->prevsnap : NULL, + size, sdd->parsable); + } + sdd->size += size; + } + + if (!sdd->dryrun) { + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. 
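+	 * The thread is cancelled and joined once dump_ioctl() returns;
+	 * any nonzero status other than PTHREAD_CANCELED is treated as
+	 * an error from the progress thread itself.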
+ */ + if (sdd->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = sdd->outfd; + pa.pa_parsable = sdd->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = sdd->verbosity; + + if ((err = pthread_create(&tid, NULL, + send_progress_thread, &pa)) != 0) { + zfs_close(zhp); + return (err); + } + } + + err = dump_ioctl(zhp, sdd->prevsnap, sdd->prevsnap_obj, + fromorigin, sdd->outfd, flags, sdd->debugnv); + + if (sdd->progress) { + void *status = NULL; + (void) pthread_cancel(tid); + (void) pthread_join(tid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero")); + return (zfs_standard_error(zhp->zfs_hdl, error, + errbuf)); + } + } + } + + (void) strcpy(sdd->prevsnap, thissnap); + sdd->prevsnap_obj = zfs_prop_get_int(zhp, ZFS_PROP_OBJSETID); + zfs_close(zhp); + return (err); +} + +static int +dump_filesystem(zfs_handle_t *zhp, void *arg) +{ + int rv = 0; + send_dump_data_t *sdd = arg; + boolean_t missingfrom = B_FALSE; + zfs_cmd_t zc = {"\0"}; + uint64_t min_txg = 0, max_txg = 0; + + (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", + zhp->zfs_name, sdd->tosnap); + if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_STATS, &zc) != 0) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: could not send %s@%s: does not exist\n"), + zhp->zfs_name, sdd->tosnap); + sdd->err = B_TRUE; + return (0); + } + + if (sdd->replicate && sdd->fromsnap) { + /* + * If this fs does not have fromsnap, and we're doing + * recursive, we need to send a full stream from the + * beginning (or an incremental from the origin if this + * is a clone). If we're doing non-recursive, then let + * them get the error. + */ + (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", + zhp->zfs_name, sdd->fromsnap); + if (zfs_ioctl(zhp->zfs_hdl, + ZFS_IOC_OBJSET_STATS, &zc) != 0) { + missingfrom = B_TRUE; + } + } + + sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; + sdd->prevsnap_obj = 0; + if (sdd->fromsnap == NULL || missingfrom) + sdd->seenfrom = B_TRUE; + + + + /* + * Iterate through all snapshots and process the ones we will be + * sending. If we only have a "from" and "to" snapshot to deal + * with, we can avoid iterating through all the other snapshots. 
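+	 * In the two-snapshot case the "from" and "to" snapshots are
+	 * simply opened by name and passed to dump_snapshot() directly.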
+ */ + if (sdd->doall || sdd->replicate || sdd->tosnap == NULL) { + if (!sdd->replicate && sdd->fromsnap != NULL) + min_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, + sdd->fromsnap); + if (!sdd->replicate && sdd->tosnap != NULL) + max_txg = get_snap_txg(zhp->zfs_hdl, zhp->zfs_name, + sdd->tosnap); + rv = zfs_iter_snapshots_sorted(zhp, dump_snapshot, arg, + min_txg, max_txg); + } else { + char snapname[MAXPATHLEN] = { 0 }; + zfs_handle_t *snap; + + if (!sdd->seenfrom) { + (void) snprintf(snapname, sizeof (snapname), + "%s@%s", zhp->zfs_name, sdd->fromsnap); + snap = zfs_open(zhp->zfs_hdl, snapname, + ZFS_TYPE_SNAPSHOT); + if (snap != NULL) + rv = dump_snapshot(snap, sdd); + else + rv = -1; + } + + if (rv == 0) { + (void) snprintf(snapname, sizeof (snapname), + "%s@%s", zhp->zfs_name, sdd->tosnap); + snap = zfs_open(zhp->zfs_hdl, snapname, + ZFS_TYPE_SNAPSHOT); + if (snap != NULL) + rv = dump_snapshot(snap, sdd); + else + rv = -1; + } + } + + if (!sdd->seenfrom) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) does not exist\n"), + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + sdd->err = B_TRUE; + } else if (!sdd->seento) { + if (sdd->fromsnap) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) " + "is not earlier than it\n"), + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + } else { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "WARNING: " + "could not send %s@%s: does not exist\n"), + zhp->zfs_name, sdd->tosnap); + } + sdd->err = B_TRUE; + } + + return (rv); +} + +static int +dump_filesystems(zfs_handle_t *rzhp, void *arg) +{ + send_dump_data_t *sdd = arg; + nvpair_t *fspair; + boolean_t needagain, progress; + + if (!sdd->replicate) + return (dump_filesystem(rzhp, sdd)); + + /* Mark the clone origin snapshots. 
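+	 * The "is_clone_origin" tag keeps these snapshots from being
+	 * filtered out of a replication stream (see dump_snapshot()),
+	 * and the pass loop below uses the per-fs "sent" markers to
+	 * ensure no clone is dumped before its origin.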
*/
+	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+		nvlist_t *nvfs;
+		uint64_t origin_guid = 0;
+
+		VERIFY(0 == nvpair_value_nvlist(fspair, &nvfs));
+		(void) nvlist_lookup_uint64(nvfs, "origin", &origin_guid);
+		if (origin_guid != 0) {
+			char *snapname;
+			nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+			    origin_guid, &snapname);
+			if (origin_nv != NULL) {
+				nvlist_t *snapprops;
+				VERIFY(0 == nvlist_lookup_nvlist(origin_nv,
+				    "snapprops", &snapprops));
+				VERIFY(0 == nvlist_lookup_nvlist(snapprops,
+				    snapname, &snapprops));
+				VERIFY(0 == nvlist_add_boolean(
+				    snapprops, "is_clone_origin"));
+			}
+		}
+	}
+again:
+	needagain = progress = B_FALSE;
+	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+		nvlist_t *fslist, *parent_nv;
+		char *fsname;
+		zfs_handle_t *zhp;
+		int err;
+		uint64_t origin_guid = 0;
+		uint64_t parent_guid = 0;
+
+		VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
+		if (nvlist_lookup_boolean(fslist, "sent") == 0)
+			continue;
+
+		VERIFY(nvlist_lookup_string(fslist, "name", &fsname) == 0);
+		(void) nvlist_lookup_uint64(fslist, "origin", &origin_guid);
+		(void) nvlist_lookup_uint64(fslist, "parentfromsnap",
+		    &parent_guid);
+
+		if (parent_guid != 0) {
+			parent_nv = fsavl_find(sdd->fsavl, parent_guid, NULL);
+			if (!nvlist_exists(parent_nv, "sent")) {
+				/* parent has not been sent; skip this one */
+				needagain = B_TRUE;
+				continue;
+			}
+		}
+
+		if (origin_guid != 0) {
+			nvlist_t *origin_nv = fsavl_find(sdd->fsavl,
+			    origin_guid, NULL);
+			if (origin_nv != NULL &&
+			    !nvlist_exists(origin_nv, "sent")) {
+				/*
+				 * origin has not been sent yet;
+				 * skip this clone.
+				 */
+				needagain = B_TRUE;
+				continue;
+			}
+		}
+
+		zhp = zfs_open(rzhp->zfs_hdl, fsname, ZFS_TYPE_DATASET);
+		if (zhp == NULL)
+			return (-1);
+		err = dump_filesystem(zhp, sdd);
+		VERIFY(nvlist_add_boolean(fslist, "sent") == 0);
+		progress = B_TRUE;
+		zfs_close(zhp);
+		if (err)
+			return (err);
+	}
+	if (needagain) {
+		assert(progress);
+		goto again;
+	}
+
+	/* clean out the sent flags in case we reuse this fss */
+	for (fspair = nvlist_next_nvpair(sdd->fss, NULL); fspair;
+	    fspair = nvlist_next_nvpair(sdd->fss, fspair)) {
+		nvlist_t *fslist;
+
+		VERIFY(nvpair_value_nvlist(fspair, &fslist) == 0);
+		(void) nvlist_remove_all(fslist, "sent");
+	}
+
+	return (0);
+}
+
+nvlist_t *
+zfs_send_resume_token_to_nvlist(libzfs_handle_t *hdl, const char *token)
+{
+	unsigned int version;
+	int nread, i;
+	unsigned long long checksum, packed_len;
+
+	/*
+	 * Decode token header, which is:
+	 *   <token version>-<checksum of payload>-<uncompressed payload length>
+	 * Note that the only supported token version is 1.
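+	 * A well-formed token therefore begins with something like
+	 * "1-e4b6ab9c3-f0-", followed by the zlib-compressed, packed
+	 * nvlist payload in hex; that example prefix is illustrative only.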
+ */ + nread = sscanf(token, "%u-%llx-%llx-", + &version, &checksum, &packed_len); + if (nread != 3) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid format)")); + return (NULL); + } + + if (version != ZFS_SEND_RESUME_TOKEN_VERSION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (invalid version %u)"), + version); + return (NULL); + } + + /* convert hexadecimal representation to binary */ + token = strrchr(token, '-') + 1; + int len = strlen(token) / 2; + unsigned char *compressed = zfs_alloc(hdl, len); + for (i = 0; i < len; i++) { + nread = sscanf(token + i * 2, "%2hhx", compressed + i); + if (nread != 1) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt " + "(payload is not hex-encoded)")); + return (NULL); + } + } + + /* verify checksum */ + zio_cksum_t cksum; + fletcher_4_native_varsize(compressed, len, &cksum); + if (cksum.zc_word[0] != checksum) { + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (incorrect checksum)")); + return (NULL); + } + + /* uncompress */ + void *packed = zfs_alloc(hdl, packed_len); + uLongf packed_len_long = packed_len; + if (uncompress(packed, &packed_len_long, compressed, len) != Z_OK || + packed_len_long != packed_len) { + free(packed); + free(compressed); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (decompression failed)")); + return (NULL); + } + + /* unpack nvlist */ + nvlist_t *nv; + int error = nvlist_unpack(packed, packed_len, &nv, KM_SLEEP); + free(packed); + free(compressed); + if (error != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt (nvlist_unpack failed)")); + return (NULL); + } + return (nv); +} +static enum lzc_send_flags +lzc_flags_from_sendflags(const sendflags_t *flags) +{ + enum lzc_send_flags lzc_flags = 0; + if (flags->largeblock) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (flags->embed_data) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags->compress) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; + if (flags->raw) + lzc_flags |= LZC_SEND_FLAG_RAW; + if (flags->saved) + lzc_flags |= LZC_SEND_FLAG_SAVED; + return (lzc_flags); +} + +static int +estimate_size(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, + uint64_t resumeobj, uint64_t resumeoff, uint64_t bytes, + const char *redactbook, char *errbuf) +{ + uint64_t size; + FILE *fout = flags->dryrun ? 
stdout : stderr; + progress_arg_t pa = { 0 }; + int err = 0; + pthread_t ptid; + + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = fd; + pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_TRUE; + pa.pa_verbosity = flags->verbosity; + + err = pthread_create(&ptid, NULL, + send_progress_thread, &pa); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + } + + err = lzc_send_space_resume_redacted(zhp->zfs_name, from, + lzc_flags_from_sendflags(flags), resumeobj, resumeoff, bytes, + redactbook, fd, &size); + + if (flags->progress) { + void *status = NULL; + (void) pthread_cancel(ptid); + (void) pthread_join(ptid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "progress thread exited " + "nonzero")); + return (zfs_standard_error(zhp->zfs_hdl, error, + errbuf)); + } + } + + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + send_print_verbose(fout, zhp->zfs_name, from, size, + flags->parsable); + + if (flags->parsable) { + (void) fprintf(fout, "size\t%llu\n", (longlong_t)size); + } else { + char buf[16]; + zfs_nicenum(size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + return (0); +} + +static boolean_t +redact_snaps_contains(const uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +redact_snaps_equal(const uint64_t *snaps1, uint64_t num_snaps1, + const uint64_t *snaps2, uint64_t num_snaps2) +{ + if (num_snaps1 != num_snaps2) + return (B_FALSE); + for (int i = 0; i < num_snaps1; i++) { + if (!redact_snaps_contains(snaps2, num_snaps2, snaps1[i])) + return (B_FALSE); + } + return (B_TRUE); +} + +/* + * Check that the list of redaction snapshots in the bookmark matches the send + * we're resuming, and return whether or not it's complete. + * + * Note that the caller needs to free the contents of *bookname with free() if + * this function returns successfully. 
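+ * A bookmark matches only if its list of redaction snapshot GUIDs is
+ * set-equal to the GUIDs recorded in the resume token, and it must
+ * also have "redact_complete" set for the resume to proceed.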
+ */ +static int +find_redact_book(libzfs_handle_t *hdl, const char *path, + const uint64_t *redact_snap_guids, int num_redact_snaps, + char **bookname) +{ + char errbuf[1024]; + int error = 0; + nvlist_t *props = fnvlist_alloc(); + nvlist_t *bmarks; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + fnvlist_add_boolean(props, "redact_complete"); + fnvlist_add_boolean(props, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + error = lzc_get_bookmarks(path, props, &bmarks); + nvlist_free(props); + if (error != 0) { + if (error == ESRCH) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "nonexistent redaction bookmark provided")); + } else if (error == ENOENT) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset to be sent no longer exists")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unknown error: %s"), strerror(error)); + } + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + nvpair_t *pair; + for (pair = nvlist_next_nvpair(bmarks, NULL); pair; + pair = nvlist_next_nvpair(bmarks, pair)) { + + nvlist_t *bmark = fnvpair_value_nvlist(pair); + nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS)); + uint_t len = 0; + uint64_t *bmarksnaps = fnvlist_lookup_uint64_array(vallist, + ZPROP_VALUE, &len); + if (redact_snaps_equal(redact_snap_guids, + num_redact_snaps, bmarksnaps, len)) { + break; + } + } + if (pair == NULL) { + fnvlist_free(bmarks); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no appropriate redaction bookmark exists")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + char *name = nvpair_name(pair); + nvlist_t *bmark = fnvpair_value_nvlist(pair); + nvlist_t *vallist = fnvlist_lookup_nvlist(bmark, "redact_complete"); + boolean_t complete = fnvlist_lookup_boolean_value(vallist, + ZPROP_VALUE); + if (!complete) { + fnvlist_free(bmarks); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incomplete redaction bookmark provided")); + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + *bookname = strndup(name, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3P(*bookname, !=, NULL); + fnvlist_free(bmarks); + return (0); +} + +static int +zfs_send_resume_impl(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + nvlist_t *resume_nvl) +{ + char errbuf[1024]; + char *toname; + char *fromname = NULL; + uint64_t resumeobj, resumeoff, toguid, fromguid, bytes; + zfs_handle_t *zhp; + int error = 0; + char name[ZFS_MAX_DATASET_NAME_LEN]; + enum lzc_send_flags lzc_flags = 0; + FILE *fout = (flags->verbosity > 0 && flags->dryrun) ? 
stdout : stderr; + uint64_t *redact_snap_guids = NULL; + int num_redact_snaps = 0; + char *redact_book = NULL; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + if (flags->verbosity != 0) { + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "resume token contents:\n")); + nvlist_print(fout, resume_nvl); + } + + if (nvlist_lookup_string(resume_nvl, "toname", &toname) != 0 || + nvlist_lookup_uint64(resume_nvl, "object", &resumeobj) != 0 || + nvlist_lookup_uint64(resume_nvl, "offset", &resumeoff) != 0 || + nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || + nvlist_lookup_uint64(resume_nvl, "toguid", &toguid) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "resume token is corrupt")); + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + fromguid = 0; + (void) nvlist_lookup_uint64(resume_nvl, "fromguid", &fromguid); + + if (flags->largeblock || nvlist_exists(resume_nvl, "largeblockok")) + lzc_flags |= LZC_SEND_FLAG_LARGE_BLOCK; + if (flags->embed_data || nvlist_exists(resume_nvl, "embedok")) + lzc_flags |= LZC_SEND_FLAG_EMBED_DATA; + if (flags->compress || nvlist_exists(resume_nvl, "compressok")) + lzc_flags |= LZC_SEND_FLAG_COMPRESS; + if (flags->raw || nvlist_exists(resume_nvl, "rawok")) + lzc_flags |= LZC_SEND_FLAG_RAW; + if (flags->saved || nvlist_exists(resume_nvl, "savedok")) + lzc_flags |= LZC_SEND_FLAG_SAVED; + + if (flags->saved) { + (void) strcpy(name, toname); + } else { + error = guid_to_name(hdl, toname, toguid, B_FALSE, name); + if (error != 0) { + if (zfs_dataset_exists(hdl, toname, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is no longer the same snapshot " + "used in the initial send"), toname); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' used in the initial send no " + "longer exists"), toname); + } + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + } + + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "unable to access '%s'"), name); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + + if (nvlist_lookup_uint64_array(resume_nvl, "book_redact_snaps", + &redact_snap_guids, (uint_t *)&num_redact_snaps) != 0) { + num_redact_snaps = -1; + } + + if (fromguid != 0) { + if (guid_to_name_redact_snaps(hdl, toname, fromguid, B_TRUE, + redact_snap_guids, num_redact_snaps, name) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source %#llx no longer exists"), + (longlong_t)fromguid); + return (zfs_error(hdl, EZFS_BADPATH, errbuf)); + } + fromname = name; + } + + redact_snap_guids = NULL; + + if (nvlist_lookup_uint64_array(resume_nvl, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), &redact_snap_guids, + (uint_t *)&num_redact_snaps) == 0) { + char path[ZFS_MAX_DATASET_NAME_LEN]; + + (void) strlcpy(path, toname, sizeof (path)); + char *at = strchr(path, '@'); + ASSERT3P(at, !=, NULL); + + *at = '\0'; + + if ((error = find_redact_book(hdl, path, redact_snap_guids, + num_redact_snaps, &redact_book)) != 0) { + return (error); + } + } + + if (flags->verbosity != 0) { + /* + * Some of these may have come from the resume token, set them + * here for size estimate purposes. 
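+		 * They are applied to a scratch copy of *flags (large
+		 * blocks, compression, embedded data) so that the caller's
+		 * flag structure is left untouched by the estimate.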
+ */ + sendflags_t tmpflags = *flags; + if (lzc_flags & LZC_SEND_FLAG_LARGE_BLOCK) + tmpflags.largeblock = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_COMPRESS) + tmpflags.compress = B_TRUE; + if (lzc_flags & LZC_SEND_FLAG_EMBED_DATA) + tmpflags.embed_data = B_TRUE; + error = estimate_size(zhp, fromname, outfd, &tmpflags, + resumeobj, resumeoff, bytes, redact_book, errbuf); + } + + if (!flags->dryrun) { + progress_arg_t pa = { 0 }; + pthread_t tid; + /* + * If progress reporting is requested, spawn a new thread to + * poll ZFS_IOC_SEND_PROGRESS at a regular interval. + */ + if (flags->progress) { + pa.pa_zhp = zhp; + pa.pa_fd = outfd; + pa.pa_parsable = flags->parsable; + pa.pa_estimate = B_FALSE; + pa.pa_verbosity = flags->verbosity; + + error = pthread_create(&tid, NULL, + send_progress_thread, &pa); + if (error != 0) { + if (redact_book != NULL) + free(redact_book); + zfs_close(zhp); + return (error); + } + } + + error = lzc_send_resume_redacted(zhp->zfs_name, fromname, outfd, + lzc_flags, resumeobj, resumeoff, redact_book); + if (redact_book != NULL) + free(redact_book); + + if (flags->progress) { + void *status = NULL; + (void) pthread_cancel(tid); + (void) pthread_join(tid, &status); + int error = (int)(uintptr_t)status; + if (error != 0 && status != PTHREAD_CANCELED) { + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "progress thread exited nonzero")); + return (zfs_standard_error(hdl, error, errbuf)); + } + } + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + + zfs_close(zhp); + + switch (error) { + case 0: + return (0); + case EACCES: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "source key must be loaded")); + return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + case ESRCH: + if (lzc_exists(zhp->zfs_name)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source could not be found")); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EXDEV: + case ENOENT: + case EDQUOT: + case EFBIG: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EFAULT: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } else { + if (redact_book != NULL) + free(redact_book); + } + + zfs_close(zhp); + + return (error); +} + +int +zfs_send_resume(libzfs_handle_t *hdl, sendflags_t *flags, int outfd, + const char *resume_token) +{ + int ret; + char errbuf[1024]; + nvlist_t *resume_nvl; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot resume send")); + + resume_nvl = zfs_send_resume_token_to_nvlist(hdl, resume_token); + if (resume_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist() + */ + return (zfs_error(hdl, EZFS_FAULT, errbuf)); + } + + ret = zfs_send_resume_impl(hdl, flags, outfd, resume_nvl); + nvlist_free(resume_nvl); + + return (ret); +} + +int +zfs_send_saved(zfs_handle_t *zhp, sendflags_t *flags, int outfd, + const char *resume_token) +{ + int ret; + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *saved_nvl = NULL, *resume_nvl = NULL; + uint64_t saved_guid = 0, resume_guid = 0; + uint64_t obj = 0, off = 0, bytes = 0; + char token_buf[ZFS_MAXPROPLEN]; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "saved send failed")); + + ret = zfs_prop_get(zhp, 
ZFS_PROP_RECEIVE_RESUME_TOKEN, + token_buf, sizeof (token_buf), NULL, NULL, 0, B_TRUE); + if (ret != 0) + goto out; + + saved_nvl = zfs_send_resume_token_to_nvlist(hdl, token_buf); + if (saved_nvl == NULL) { + /* + * zfs_error_aux has already been set by + * zfs_send_resume_token_to_nvlist() + */ + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + /* + * If a resume token is provided we use the object and offset + * from that instead of the default, which starts from the + * beginning. + */ + if (resume_token != NULL) { + resume_nvl = zfs_send_resume_token_to_nvlist(hdl, + resume_token); + if (resume_nvl == NULL) { + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + if (nvlist_lookup_uint64(resume_nvl, "object", &obj) != 0 || + nvlist_lookup_uint64(resume_nvl, "offset", &off) != 0 || + nvlist_lookup_uint64(resume_nvl, "bytes", &bytes) != 0 || + nvlist_lookup_uint64(resume_nvl, "toguid", + &resume_guid) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "provided resume token is corrupt")); + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + if (nvlist_lookup_uint64(saved_nvl, "toguid", + &saved_guid)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset's resume token is corrupt")); + ret = zfs_error(hdl, EZFS_FAULT, errbuf); + goto out; + } + + if (resume_guid != saved_guid) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "provided resume token does not match dataset")); + ret = zfs_error(hdl, EZFS_BADBACKUP, errbuf); + goto out; + } + } + + (void) nvlist_remove_all(saved_nvl, "object"); + fnvlist_add_uint64(saved_nvl, "object", obj); + + (void) nvlist_remove_all(saved_nvl, "offset"); + fnvlist_add_uint64(saved_nvl, "offset", off); + + (void) nvlist_remove_all(saved_nvl, "bytes"); + fnvlist_add_uint64(saved_nvl, "bytes", bytes); + + (void) nvlist_remove_all(saved_nvl, "toname"); + fnvlist_add_string(saved_nvl, "toname", zhp->zfs_name); + + ret = zfs_send_resume_impl(hdl, flags, outfd, saved_nvl); + +out: + nvlist_free(saved_nvl); + nvlist_free(resume_nvl); + return (ret); +} + +/* + * This function informs the target system that the recursive send is complete. + * The record is also expected in the case of a send -p. + */ +static int +send_conclusion_record(int fd, zio_cksum_t *zc) +{ + dmu_replay_record_t drr = { 0 }; + drr.drr_type = DRR_END; + if (zc != NULL) + drr.drr_u.drr_end.drr_checksum = *zc; + if (write(fd, &drr, sizeof (drr)) == -1) { + return (errno); + } + return (0); +} + +/* + * This function is responsible for sending the records that contain the + * necessary information for the target system's libzfs to be able to set the + * properties of the filesystem being received, or to be able to prepare for + * a recursive receive. + * + * The "zhp" argument is the handle of the snapshot we are sending + * (the "tosnap"). The "from" argument is the short snapshot name (the part + * after the @) of the incremental source. 
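+ * When property gathering is requested, that information is packed
+ * into an XDR-encoded nvlist and (unless this is a dry run) written
+ * out as the payload of a DRR_BEGIN record with a DMU_COMPOUNDSTREAM
+ * header type, terminated by a DRR_END record carrying the running
+ * checksum.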
+ */ +static int +send_prelim_records(zfs_handle_t *zhp, const char *from, int fd, + boolean_t gather_props, boolean_t recursive, boolean_t verbose, + boolean_t dryrun, boolean_t raw, boolean_t replicate, boolean_t backup, + boolean_t holds, boolean_t props, boolean_t doall, + nvlist_t **fssp, avl_tree_t **fsavlp) +{ + int err = 0; + char *packbuf = NULL; + size_t buflen = 0; + zio_cksum_t zc = { {0} }; + int featureflags = 0; + /* name of filesystem/volume that contains snapshot we are sending */ + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + /* short name of snap we are sending */ + char *tosnap = ""; + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), zhp->zfs_name); + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM && zfs_prop_get_int(zhp, + ZFS_PROP_VERSION) >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + + if (holds) + featureflags |= DMU_BACKUP_FEATURE_HOLDS; + + (void) strlcpy(tofs, zhp->zfs_name, ZFS_MAX_DATASET_NAME_LEN); + char *at = strchr(tofs, '@'); + if (at != NULL) { + *at = '\0'; + tosnap = at + 1; + } + + if (gather_props) { + nvlist_t *hdrnv = fnvlist_alloc(); + nvlist_t *fss = NULL; + + if (from != NULL) + fnvlist_add_string(hdrnv, "fromsnap", from); + fnvlist_add_string(hdrnv, "tosnap", tosnap); + if (!recursive) + fnvlist_add_boolean(hdrnv, "not_recursive"); + + if (raw) { + VERIFY0(nvlist_add_boolean(hdrnv, "raw")); + } + + if ((err = gather_nvlist(zhp->zfs_hdl, tofs, + from, tosnap, recursive, raw, doall, replicate, verbose, + backup, holds, props, &fss, fsavlp)) != 0) { + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + fnvlist_add_nvlist(hdrnv, "fss", fss); + VERIFY0(nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, + 0)); + if (fssp != NULL) { + *fssp = fss; + } else { + nvlist_free(fss); + } + nvlist_free(hdrnv); + } + + if (!dryrun) { + dmu_replay_record_t drr = { 0 }; + /* write first begin record */ + drr.drr_type = DRR_BEGIN; + drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin. + drr_versioninfo, DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin. + drr_versioninfo, featureflags); + if (snprintf(drr.drr_u.drr_begin.drr_toname, + sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", tofs, + tosnap) >= sizeof (drr.drr_u.drr_begin.drr_toname)) { + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + drr.drr_payloadlen = buflen; + + err = dump_record(&drr, packbuf, buflen, &zc, fd); + free(packbuf); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + err = send_conclusion_record(fd, &zc); + if (err != 0) { + zfs_error_aux(zhp->zfs_hdl, strerror(err)); + return (zfs_error(zhp->zfs_hdl, EZFS_BADBACKUP, + errbuf)); + } + } + return (0); +} + +/* + * Generate a send stream. The "zhp" argument is the filesystem/volume + * that contains the snapshot to send. The "fromsnap" argument is the + * short name (the part after the '@') of the snapshot that is the + * incremental source to send from (if non-NULL). The "tosnap" argument + * is the short name of the snapshot to send. + * + * The content of the send stream is the snapshot identified by + * 'tosnap'. Incremental streams are requested in two ways: + * - from the snapshot identified by "fromsnap" (if non-null) or + * - from the origin of the dataset identified by zhp, which must + * be a clone. In this case, "fromsnap" is null and "fromorigin" + * is TRUE. 
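+ * (The origin form is what allows a clone to be transmitted as an
+ * incremental against a snapshot in a different filesystem, e.g.
+ * when replicating a cloned hierarchy.)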
+ * + * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and + * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) + * if "replicate" is set. If "doall" is set, dump all the intermediate + * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" + * case too. If "props" is set, send properties. + */ +int +zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + sendflags_t *flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg, nvlist_t **debugnvp) +{ + char errbuf[1024]; + send_dump_data_t sdd = { 0 }; + int err = 0; + nvlist_t *fss = NULL; + avl_tree_t *fsavl = NULL; + static uint64_t holdseq; + int spa_version; + pthread_t tid = 0; + int pipefd[2]; + int featureflags = 0; + FILE *fout; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot send '%s'"), zhp->zfs_name); + + if (fromsnap && fromsnap[0] == '\0') { + zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, + "zero-length incremental source")); + return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); + } + + if (zhp->zfs_type == ZFS_TYPE_FILESYSTEM) { + uint64_t version; + version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); + if (version >= ZPL_VERSION_SA) { + featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } + } + + if (flags->holds) + featureflags |= DMU_BACKUP_FEATURE_HOLDS; + + if (flags->replicate || flags->doall || flags->props || + flags->holds || flags->backup) { + char full_tosnap_name[ZFS_MAX_DATASET_NAME_LEN]; + if (snprintf(full_tosnap_name, sizeof (full_tosnap_name), + "%s@%s", zhp->zfs_name, tosnap) >= + sizeof (full_tosnap_name)) { + err = EINVAL; + goto stderr_out; + } + zfs_handle_t *tosnap = zfs_open(zhp->zfs_hdl, + full_tosnap_name, ZFS_TYPE_SNAPSHOT); + if (tosnap == NULL) { + err = -1; + goto err_out; + } + err = send_prelim_records(tosnap, fromsnap, outfd, + flags->replicate || flags->props || flags->holds, + flags->replicate, flags->verbosity > 0, flags->dryrun, + flags->raw, flags->replicate, flags->backup, flags->holds, + flags->props, flags->doall, &fss, &fsavl); + zfs_close(tosnap); + if (err != 0) + goto err_out; + } + + /* dump each stream */ + sdd.fromsnap = fromsnap; + sdd.tosnap = tosnap; + if (tid != 0) + sdd.outfd = pipefd[0]; + else + sdd.outfd = outfd; + sdd.replicate = flags->replicate; + sdd.doall = flags->doall; + sdd.fromorigin = flags->fromorigin; + sdd.fss = fss; + sdd.fsavl = fsavl; + sdd.verbosity = flags->verbosity; + sdd.parsable = flags->parsable; + sdd.progress = flags->progress; + sdd.dryrun = flags->dryrun; + sdd.large_block = flags->largeblock; + sdd.embed_data = flags->embed_data; + sdd.compress = flags->compress; + sdd.raw = flags->raw; + sdd.holds = flags->holds; + sdd.filter_cb = filter_func; + sdd.filter_cb_arg = cb_arg; + if (debugnvp) + sdd.debugnv = *debugnvp; + if (sdd.verbosity != 0 && sdd.dryrun) + sdd.std_out = B_TRUE; + fout = sdd.std_out ? stdout : stderr; + + /* + * Some flags require that we place user holds on the datasets that are + * being sent so they don't get destroyed during the send. We can skip + * this step if the pool is imported read-only since the datasets cannot + * be destroyed. 
+ */ + if (!flags->dryrun && !zpool_get_prop_int(zfs_get_pool_handle(zhp), + ZPOOL_PROP_READONLY, NULL) && + zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS && + (flags->doall || flags->replicate)) { + ++holdseq; + (void) snprintf(sdd.holdtag, sizeof (sdd.holdtag), + ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); + sdd.cleanup_fd = open(ZFS_DEV, O_RDWR); + if (sdd.cleanup_fd < 0) { + err = errno; + goto stderr_out; + } + sdd.snapholds = fnvlist_alloc(); + } else { + sdd.cleanup_fd = -1; + sdd.snapholds = NULL; + } + + if (flags->verbosity != 0 || sdd.snapholds != NULL) { + /* + * Do a verbose no-op dry run to get all the verbose output + * or to gather snapshot hold's before generating any data, + * then do a non-verbose real run to generate the streams. + */ + sdd.dryrun = B_TRUE; + err = dump_filesystems(zhp, &sdd); + + if (err != 0) + goto stderr_out; + + if (flags->verbosity != 0) { + if (flags->parsable) { + (void) fprintf(fout, "size\t%llu\n", + (longlong_t)sdd.size); + } else { + char buf[16]; + zfs_nicebytes(sdd.size, buf, sizeof (buf)); + (void) fprintf(fout, dgettext(TEXT_DOMAIN, + "total estimated size is %s\n"), buf); + } + } + + /* Ensure no snaps found is treated as an error. */ + if (!sdd.seento) { + err = ENOENT; + goto err_out; + } + + /* Skip the second run if dryrun was requested. */ + if (flags->dryrun) + goto err_out; + + if (sdd.snapholds != NULL) { + err = zfs_hold_nvl(zhp, sdd.cleanup_fd, sdd.snapholds); + if (err != 0) + goto stderr_out; + + fnvlist_free(sdd.snapholds); + sdd.snapholds = NULL; + } + + sdd.dryrun = B_FALSE; + sdd.verbosity = 0; + } + + err = dump_filesystems(zhp, &sdd); + fsavl_destroy(fsavl); + nvlist_free(fss); + + /* Ensure no snaps found is treated as an error. */ + if (err == 0 && !sdd.seento) + err = ENOENT; + + if (tid != 0) { + if (err != 0) + (void) pthread_cancel(tid); + (void) close(pipefd[0]); + (void) pthread_join(tid, NULL); + } + + if (sdd.cleanup_fd != -1) { + VERIFY(0 == close(sdd.cleanup_fd)); + sdd.cleanup_fd = -1; + } + + if (!flags->dryrun && (flags->replicate || flags->doall || + flags->props || flags->backup || flags->holds)) { + /* + * write final end record. NB: want to do this even if + * there was some error, because it might not be totally + * failed. + */ + err = send_conclusion_record(outfd, NULL); + if (err != 0) + return (zfs_standard_error(zhp->zfs_hdl, err, errbuf)); + } + + return (err || sdd.err); + +stderr_out: + err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); +err_out: + fsavl_destroy(fsavl); + nvlist_free(fss); + fnvlist_free(sdd.snapholds); + + if (sdd.cleanup_fd != -1) + VERIFY(0 == close(sdd.cleanup_fd)); + if (tid != 0) { + (void) pthread_cancel(tid); + (void) close(pipefd[0]); + (void) pthread_join(tid, NULL); + } + return (err); +} + +static zfs_handle_t * +name_to_dir_handle(libzfs_handle_t *hdl, const char *snapname) +{ + char dirname[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(dirname, snapname, ZFS_MAX_DATASET_NAME_LEN); + char *c = strchr(dirname, '@'); + if (c != NULL) + *c = '\0'; + return (zfs_open(hdl, dirname, ZFS_TYPE_DATASET)); +} + +/* + * Returns B_TRUE if earlier is an earlier snapshot in later's timeline; either + * an earlier snapshot in the same filesystem, or a snapshot before later's + * origin, or it's origin's origin, etc. + */ +static boolean_t +snapshot_is_before(zfs_handle_t *earlier, zfs_handle_t *later) +{ + boolean_t ret; + uint64_t later_txg = + (later->zfs_type == ZFS_TYPE_FILESYSTEM || + later->zfs_type == ZFS_TYPE_VOLUME ? 
+ UINT64_MAX : zfs_prop_get_int(later, ZFS_PROP_CREATETXG)); + uint64_t earlier_txg = zfs_prop_get_int(earlier, ZFS_PROP_CREATETXG); + + if (earlier_txg >= later_txg) + return (B_FALSE); + + zfs_handle_t *earlier_dir = name_to_dir_handle(earlier->zfs_hdl, + earlier->zfs_name); + zfs_handle_t *later_dir = name_to_dir_handle(later->zfs_hdl, + later->zfs_name); + + if (strcmp(earlier_dir->zfs_name, later_dir->zfs_name) == 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + return (B_TRUE); + } + + char clonename[ZFS_MAX_DATASET_NAME_LEN]; + if (zfs_prop_get(later_dir, ZFS_PROP_ORIGIN, clonename, + ZFS_MAX_DATASET_NAME_LEN, NULL, NULL, 0, B_TRUE) != 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + return (B_FALSE); + } + + zfs_handle_t *origin = zfs_open(earlier->zfs_hdl, clonename, + ZFS_TYPE_DATASET); + uint64_t origin_txg = zfs_prop_get_int(origin, ZFS_PROP_CREATETXG); + + /* + * If "earlier" is exactly the origin, then + * snapshot_is_before(earlier, origin) will return false (because + * they're the same). + */ + if (origin_txg == earlier_txg && + strcmp(origin->zfs_name, earlier->zfs_name) == 0) { + zfs_close(earlier_dir); + zfs_close(later_dir); + zfs_close(origin); + return (B_TRUE); + } + zfs_close(earlier_dir); + zfs_close(later_dir); + + ret = snapshot_is_before(earlier, origin); + zfs_close(origin); + return (ret); +} + +/* + * The "zhp" argument is the handle of the dataset to send (typically a + * snapshot). The "from" argument is the full name of the snapshot or + * bookmark that is the incremental source. + */ +int +zfs_send_one(zfs_handle_t *zhp, const char *from, int fd, sendflags_t *flags, + const char *redactbook) +{ + int err; + libzfs_handle_t *hdl = zhp->zfs_hdl; + char *name = zhp->zfs_name; + int orig_fd = fd; + pthread_t ptid; + progress_arg_t pa = { 0 }; + + char errbuf[1024]; + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "warning: cannot send '%s'"), name); + + if (from != NULL && strchr(from, '@')) { + zfs_handle_t *from_zhp = zfs_open(hdl, from, + ZFS_TYPE_DATASET); + if (from_zhp == NULL) + return (-1); + if (!snapshot_is_before(from_zhp, zhp)) { + zfs_close(from_zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + } + zfs_close(from_zhp); + } + + if (redactbook != NULL) { + char bookname[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *redact_snaps; + zfs_handle_t *book_zhp; + char *at, *pound; + int dsnamelen; + + pound = strchr(redactbook, '#'); + if (pound != NULL) + redactbook = pound + 1; + at = strchr(name, '@'); + if (at == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot do a redacted send to a filesystem")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + dsnamelen = at - name; + if (snprintf(bookname, sizeof (bookname), "%.*s#%s", + dsnamelen, name, redactbook) + >= sizeof (bookname)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid bookmark name")); + return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); + } + book_zhp = zfs_open(hdl, bookname, ZFS_TYPE_BOOKMARK); + if (book_zhp == NULL) + return (-1); + if (nvlist_lookup_nvlist(book_zhp->zfs_props, + zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), + &redact_snaps) != 0 || redact_snaps == NULL) { + zfs_close(book_zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not a redaction bookmark")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + } + zfs_close(book_zhp); + } + + /* + * Send fs properties + */ + if (flags->props || flags->holds || flags->backup) { + /* 
+		 * Note: the header generated by send_prelim_records()
+		 * assumes that the incremental source is in the same
+		 * filesystem/volume as the target (which is a requirement
+		 * when doing "zfs send -R"). But that isn't always the
+		 * case here (e.g. send from snap in origin, or send from
+		 * bookmark). We pass from=NULL, which will omit this
+		 * information from the prelim records; it isn't used
+		 * when receiving this type of stream.
+		 */
+		err = send_prelim_records(zhp, NULL, fd, B_TRUE, B_FALSE,
+		    flags->verbosity > 0, flags->dryrun, flags->raw,
+		    flags->replicate, flags->backup, flags->holds,
+		    flags->props, flags->doall, NULL, NULL);
+		if (err != 0)
+			return (err);
+	}
+
+	/*
+	 * Perform size estimate if verbose was specified.
+	 */
+	if (flags->verbosity != 0) {
+		err = estimate_size(zhp, from, fd, flags, 0, 0, 0, redactbook,
+		    errbuf);
+		if (err != 0)
+			return (err);
+	}
+
+	if (flags->dryrun)
+		return (0);
+
+	/*
+	 * If progress reporting is requested, spawn a new thread to poll
+	 * ZFS_IOC_SEND_PROGRESS at a regular interval.
+	 */
+	if (flags->progress) {
+		pa.pa_zhp = zhp;
+		pa.pa_fd = fd;
+		pa.pa_parsable = flags->parsable;
+		pa.pa_estimate = B_FALSE;
+		pa.pa_verbosity = flags->verbosity;
+
+		err = pthread_create(&ptid, NULL,
+		    send_progress_thread, &pa);
+		if (err != 0) {
+			/* pthread_create() returns its error, not errno */
+			zfs_error_aux(zhp->zfs_hdl, strerror(err));
+			return (zfs_error(zhp->zfs_hdl,
+			    EZFS_THREADCREATEFAILED, errbuf));
+		}
+	}
+
+	err = lzc_send_redacted(name, from, fd,
+	    lzc_flags_from_sendflags(flags), redactbook);
+
+	if (flags->progress) {
+		void *status = NULL;
+		if (err != 0)
+			(void) pthread_cancel(ptid);
+		(void) pthread_join(ptid, &status);
+		int error = (int)(uintptr_t)status;
+		if (error != 0 && status != PTHREAD_CANCELED) {
+			char errbuf[1024];
+			(void) snprintf(errbuf, sizeof (errbuf),
+			    dgettext(TEXT_DOMAIN, "progress thread exited "
+			    "nonzero"));
+			return (zfs_standard_error(hdl, error, errbuf));
+		}
+	}
+
+	if (flags->props || flags->holds || flags->backup) {
+		/* Write the final end record.
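+		 * This is the DRR_END record that closes the compound
+		 * stream opened by send_prelim_records() above; the
+		 * receiver uses it to know that no further substreams
+		 * follow.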
*/ + err = send_conclusion_record(orig_fd, NULL); + if (err != 0) + return (zfs_standard_error(hdl, err, errbuf)); + } + if (err != 0) { + switch (errno) { + case EXDEV: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "not an earlier snapshot from the same fs")); + return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); + + case ENOENT: + case ESRCH: + if (lzc_exists(name)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental source (%s) does not exist"), + from); + } + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + + case EACCES: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset key must be loaded")); + return (zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf)); + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "target is busy; if a filesystem, " + "it must not be mounted")); + return (zfs_error(hdl, EZFS_BUSY, errbuf)); + + case EDQUOT: + case EFAULT: + case EFBIG: + case EINVAL: + case EIO: + case ENOLINK: + case ENOSPC: + case ENOSTR: + case ENXIO: + case EPIPE: + case ERANGE: + case EROFS: + zfs_error_aux(hdl, strerror(errno)); + return (zfs_error(hdl, EZFS_BADBACKUP, errbuf)); + + default: + return (zfs_standard_error(hdl, errno, errbuf)); + } + } + return (err != 0); +} + +/* + * Routines specific to "zfs recv" + */ + +static int +recv_read(libzfs_handle_t *hdl, int fd, void *buf, int ilen, + boolean_t byteswap, zio_cksum_t *zc) +{ + char *cp = buf; + int rv; + int len = ilen; + + assert(ilen <= SPA_MAXBLOCKSIZE); + + do { + rv = read(fd, cp, len); + cp += rv; + len -= rv; + } while (rv > 0); + + if (rv < 0 || len != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "failed to read from stream")); + return (zfs_error(hdl, EZFS_BADSTREAM, dgettext(TEXT_DOMAIN, + "cannot receive"))); + } + + if (zc) { + if (byteswap) + fletcher_4_incremental_byteswap(buf, ilen, zc); + else + fletcher_4_incremental_native(buf, ilen, zc); + } + return (0); +} + +static int +recv_read_nvlist(libzfs_handle_t *hdl, int fd, int len, nvlist_t **nvp, + boolean_t byteswap, zio_cksum_t *zc) +{ + char *buf; + int err; + + buf = zfs_alloc(hdl, len); + if (buf == NULL) + return (ENOMEM); + + err = recv_read(hdl, fd, buf, len, byteswap, zc); + if (err != 0) { + free(buf); + return (err); + } + + err = nvlist_unpack(buf, len, nvp, 0); + free(buf); + if (err != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "stream (malformed nvlist)")); + return (EINVAL); + } + return (0); +} + +/* + * Returns the grand origin (origin of origin of origin...) of a given handle. + * If this dataset is not a clone, it simply returns a copy of the original + * handle. + */ +static zfs_handle_t * +recv_open_grand_origin(zfs_handle_t *zhp) +{ + char origin[ZFS_MAX_DATASET_NAME_LEN]; + zprop_source_t src; + zfs_handle_t *ozhp = zfs_handle_dup(zhp); + + while (ozhp != NULL) { + if (zfs_prop_get(ozhp, ZFS_PROP_ORIGIN, origin, + sizeof (origin), &src, NULL, 0, B_FALSE) != 0) + break; + + (void) zfs_close(ozhp); + ozhp = zfs_open(zhp->zfs_hdl, origin, ZFS_TYPE_FILESYSTEM); + } + + return (ozhp); +} + +static int +recv_rename_impl(zfs_handle_t *zhp, const char *name, const char *newname) +{ + int err; + zfs_handle_t *ozhp = NULL; + + /* + * Attempt to rename the dataset. If it fails with EACCES we have + * attempted to rename the dataset outside of its encryption root. + * Force the dataset to become an encryption root and try again. 
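+	 *
+	 * For example (hypothetical names): renaming "tank/enc/fs" to a
+	 * location outside its encryption root "tank/enc" fails with
+	 * EACCES; after lzc_change_key(..., DCP_CMD_FORCE_NEW_KEY, ...)
+	 * turns the dataset's grand origin into an encryption root of its
+	 * own, the rename can succeed.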
+ */ + err = lzc_rename(name, newname); + if (err == EACCES) { + ozhp = recv_open_grand_origin(zhp); + if (ozhp == NULL) { + err = ENOENT; + goto out; + } + + err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, + NULL, NULL, 0); + if (err != 0) + goto out; + + err = lzc_rename(name, newname); + } + +out: + if (ozhp != NULL) + zfs_close(ozhp); + return (err); +} + +static int +recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, + int baselen, char *newname, recvflags_t *flags) +{ + static int seq; + int err; + prop_changelist_t *clp = NULL; + zfs_handle_t *zhp = NULL; + + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) { + err = -1; + goto out; + } + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + flags->force ? MS_FORCE : 0); + if (clp == NULL) { + err = -1; + goto out; + } + err = changelist_prefix(clp); + if (err) + goto out; + + if (tryname) { + (void) strcpy(newname, tryname); + if (flags->verbose) { + (void) printf("attempting rename %s to %s\n", + name, newname); + } + err = recv_rename_impl(zhp, name, newname); + if (err == 0) + changelist_rename(clp, name, tryname); + } else { + err = ENOENT; + } + + if (err != 0 && strncmp(name + baselen, "recv-", 5) != 0) { + seq++; + + (void) snprintf(newname, ZFS_MAX_DATASET_NAME_LEN, + "%.*srecv-%u-%u", baselen, name, getpid(), seq); + + if (flags->verbose) { + (void) printf("failed - trying rename %s to %s\n", + name, newname); + } + err = recv_rename_impl(zhp, name, newname); + if (err == 0) + changelist_rename(clp, name, newname); + if (err && flags->verbose) { + (void) printf("failed (%u) - " + "will try again on next pass\n", errno); + } + err = EAGAIN; + } else if (flags->verbose) { + if (err == 0) + (void) printf("success\n"); + else + (void) printf("failed (%u)\n", errno); + } + + (void) changelist_postfix(clp); + +out: + if (clp != NULL) + changelist_free(clp); + if (zhp != NULL) + zfs_close(zhp); + + return (err); +} + +static int +recv_promote(libzfs_handle_t *hdl, const char *fsname, + const char *origin_fsname, recvflags_t *flags) +{ + int err; + zfs_cmd_t zc = {"\0"}; + zfs_handle_t *zhp = NULL, *ozhp = NULL; + + if (flags->verbose) + (void) printf("promoting %s\n", fsname); + + (void) strlcpy(zc.zc_value, origin_fsname, sizeof (zc.zc_value)); + (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); + + /* + * Attempt to promote the dataset. If it fails with EACCES the + * promotion would cause this dataset to leave its encryption root. + * Force the origin to become an encryption root and try again. + */ + err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); + if (err == EACCES) { + zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); + if (zhp == NULL) { + err = -1; + goto out; + } + + ozhp = recv_open_grand_origin(zhp); + if (ozhp == NULL) { + err = -1; + goto out; + } + + err = lzc_change_key(ozhp->zfs_name, DCP_CMD_FORCE_NEW_KEY, + NULL, NULL, 0); + if (err != 0) + goto out; + + err = zfs_ioctl(hdl, ZFS_IOC_PROMOTE, &zc); + } + +out: + if (zhp != NULL) + zfs_close(zhp); + if (ozhp != NULL) + zfs_close(ozhp); + + return (err); +} + +static int +recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, + char *newname, recvflags_t *flags) +{ + int err = 0; + prop_changelist_t *clp; + zfs_handle_t *zhp; + boolean_t defer = B_FALSE; + int spa_version; + + zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); + if (zhp == NULL) + return (-1); + clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, + flags->force ? 
    MS_FORCE : 0);
+	/* Save the type: zhp is closed before it is needed again below */
+	zfs_type_t type = zfs_get_type(zhp);
+	if (type == ZFS_TYPE_SNAPSHOT &&
+	    zfs_spa_version(zhp, &spa_version) == 0 &&
+	    spa_version >= SPA_VERSION_USERREFS)
+		defer = B_TRUE;
+	zfs_close(zhp);
+	if (clp == NULL)
+		return (-1);
+	err = changelist_prefix(clp);
+	if (err)
+		return (err);
+
+	if (flags->verbose)
+		(void) printf("attempting destroy %s\n", name);
+	if (type == ZFS_TYPE_SNAPSHOT) {
+		nvlist_t *nv = fnvlist_alloc();
+		fnvlist_add_boolean(nv, name);
+		err = lzc_destroy_snaps(nv, defer, NULL);
+		fnvlist_free(nv);
+	} else {
+		err = lzc_destroy(name);
+	}
+	if (err == 0) {
+		if (flags->verbose)
+			(void) printf("success\n");
+		changelist_remove(clp, name);
+	}
+
+	(void) changelist_postfix(clp);
+	changelist_free(clp);
+
+	/*
+	 * Deferred destroy might destroy the snapshot or only mark it to be
+	 * destroyed later, and it returns success in either case.
+	 */
+	if (err != 0 || (defer && zfs_dataset_exists(hdl, name,
+	    ZFS_TYPE_SNAPSHOT))) {
+		err = recv_rename(hdl, name, NULL, baselen, newname, flags);
+	}
+
+	return (err);
+}
+
+typedef struct guid_to_name_data {
+	uint64_t guid;
+	boolean_t bookmark_ok;
+	char *name;
+	char *skip;
+	uint64_t *redact_snap_guids;
+	uint64_t num_redact_snaps;
+} guid_to_name_data_t;
+
+static boolean_t
+redact_snaps_match(zfs_handle_t *zhp, guid_to_name_data_t *gtnd)
+{
+	uint64_t *bmark_snaps;
+	uint_t bmark_num_snaps;
+	nvlist_t *nvl;
+	if (zhp->zfs_type != ZFS_TYPE_BOOKMARK)
+		return (B_FALSE);
+
+	nvl = fnvlist_lookup_nvlist(zhp->zfs_props,
+	    zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS));
+	bmark_snaps = fnvlist_lookup_uint64_array(nvl, ZPROP_VALUE,
+	    &bmark_num_snaps);
+	if (bmark_num_snaps != gtnd->num_redact_snaps)
+		return (B_FALSE);
+	int i = 0;
+	for (; i < bmark_num_snaps; i++) {
+		int j = 0;
+		for (; j < bmark_num_snaps; j++) {
+			if (bmark_snaps[i] == gtnd->redact_snap_guids[j])
+				break;
+		}
+		if (j == bmark_num_snaps)
+			break;
+	}
+	return (i == bmark_num_snaps);
+}
+
+static int
+guid_to_name_cb(zfs_handle_t *zhp, void *arg)
+{
+	guid_to_name_data_t *gtnd = arg;
+	const char *slash;
+	int err;
+
+	if (gtnd->skip != NULL &&
+	    (slash = strrchr(zhp->zfs_name, '/')) != NULL &&
+	    strcmp(slash + 1, gtnd->skip) == 0) {
+		zfs_close(zhp);
+		return (0);
+	}
+
+	if (zfs_prop_get_int(zhp, ZFS_PROP_GUID) == gtnd->guid &&
+	    (gtnd->num_redact_snaps == -1 || redact_snaps_match(zhp, gtnd))) {
+		(void) strcpy(gtnd->name, zhp->zfs_name);
+		zfs_close(zhp);
+		return (EEXIST);
+	}
+
+	err = zfs_iter_children(zhp, guid_to_name_cb, gtnd);
+	if (err != EEXIST && gtnd->bookmark_ok)
+		err = zfs_iter_bookmarks(zhp, guid_to_name_cb, gtnd);
+	zfs_close(zhp);
+	return (err);
+}
+
+/*
+ * Attempt to find the local dataset associated with this guid. In the case of
+ * multiple matches, we attempt to find the "best" match by searching
+ * progressively larger portions of the hierarchy. This allows one to send a
+ * tree of datasets individually and guarantee that we will find the source
+ * guid within that hierarchy, even if there are multiple matches elsewhere.
+ *
+ * If num_redact_snaps is not -1, we attempt to find a redaction bookmark with
+ * the specified number of redaction snapshots. If num_redact_snaps isn't 0 or
+ * -1, then redact_snap_guids will be an array of the guids of the snapshots
+ * the redaction bookmark was created with. If num_redact_snaps is -1, then we
+ * will attempt to find a snapshot or bookmark (if bookmark_ok is passed) with
+ * the given guid. Note that a redaction bookmark can be returned if
+ * num_redact_snaps == -1.
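+ *
+ * For example (hypothetical names): with parent == "tank/a/b@snap", the
+ * search visits "tank/a/b" first, then "tank/a" (skipping the
+ * already-searched child "b"), then "tank" (skipping "a"), returning as
+ * soon as one level of the hierarchy yields a matching guid.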
+ */
+static int
+guid_to_name_redact_snaps(libzfs_handle_t *hdl, const char *parent,
+    uint64_t guid, boolean_t bookmark_ok, uint64_t *redact_snap_guids,
+    uint64_t num_redact_snaps, char *name)
+{
+	char pname[ZFS_MAX_DATASET_NAME_LEN];
+	guid_to_name_data_t gtnd;
+
+	gtnd.guid = guid;
+	gtnd.bookmark_ok = bookmark_ok;
+	gtnd.name = name;
+	gtnd.skip = NULL;
+	gtnd.redact_snap_guids = redact_snap_guids;
+	gtnd.num_redact_snaps = num_redact_snaps;
+
+	/*
+	 * Search progressively larger portions of the hierarchy, starting
+	 * with the filesystem specified by 'parent'. This will
+	 * select the "most local" version of the origin snapshot in the case
+	 * that there are multiple matching snapshots in the system.
+	 */
+	(void) strlcpy(pname, parent, sizeof (pname));
+	char *cp = strrchr(pname, '@');
+	if (cp == NULL)
+		cp = strchr(pname, '\0');
+	for (; cp != NULL; cp = strrchr(pname, '/')) {
+		/* Chop off the last component and open the parent */
+		*cp = '\0';
+		zfs_handle_t *zhp = make_dataset_handle(hdl, pname);
+
+		if (zhp == NULL)
+			continue;
+		int err = guid_to_name_cb(zfs_handle_dup(zhp), &gtnd);
+		if (err != EEXIST)
+			err = zfs_iter_children(zhp, guid_to_name_cb, &gtnd);
+		if (err != EEXIST && bookmark_ok)
+			err = zfs_iter_bookmarks(zhp, guid_to_name_cb, &gtnd);
+		zfs_close(zhp);
+		if (err == EEXIST)
+			return (0);
+
+		/*
+		 * Remember the last portion of the dataset so we skip it next
+		 * time through (as we've already searched that portion of the
+		 * hierarchy).
+		 */
+		gtnd.skip = strrchr(pname, '/') + 1;
+	}
+
+	return (ENOENT);
+}
+
+static int
+guid_to_name(libzfs_handle_t *hdl, const char *parent, uint64_t guid,
+    boolean_t bookmark_ok, char *name)
+{
+	return (guid_to_name_redact_snaps(hdl, parent, guid, bookmark_ok, NULL,
+	    -1, name));
+}
+
+/*
+ * Return -1 if guid1 was created before guid2, 0 if they were created in
+ * the same txg, and +1 if guid1 was created after guid2.
+ */
+static int
+created_before(libzfs_handle_t *hdl, avl_tree_t *avl,
+    uint64_t guid1, uint64_t guid2)
+{
+	nvlist_t *nvfs;
+	char *fsname = NULL, *snapname = NULL;
+	char buf[ZFS_MAX_DATASET_NAME_LEN];
+	int rv;
+	zfs_handle_t *guid1hdl, *guid2hdl;
+	uint64_t create1, create2;
+
+	if (guid2 == 0)
+		return (0);
+	if (guid1 == 0)
+		return (1);
+
+	nvfs = fsavl_find(avl, guid1, &snapname);
+	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
+	(void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
+	guid1hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
+	if (guid1hdl == NULL)
+		return (-1);
+
+	nvfs = fsavl_find(avl, guid2, &snapname);
+	VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname));
+	(void) snprintf(buf, sizeof (buf), "%s@%s", fsname, snapname);
+	guid2hdl = zfs_open(hdl, buf, ZFS_TYPE_SNAPSHOT);
+	if (guid2hdl == NULL) {
+		zfs_close(guid1hdl);
+		return (-1);
+	}
+
+	create1 = zfs_prop_get_int(guid1hdl, ZFS_PROP_CREATETXG);
+	create2 = zfs_prop_get_int(guid2hdl, ZFS_PROP_CREATETXG);
+
+	if (create1 < create2)
+		rv = -1;
+	else if (create1 > create2)
+		rv = +1;
+	else
+		rv = 0;
+
+	zfs_close(guid1hdl);
+	zfs_close(guid2hdl);
+
+	return (rv);
+}
+
+/*
+ * This function reestablishes the hierarchy of encryption roots after a
+ * recursive incremental receive has completed. This must be done after the
+ * second call to recv_incremental_replication() has renamed and promoted all
+ * sent datasets to their final locations in the dataset hierarchy.
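+ *
+ * For each filesystem in the stream's "fss" nvlist we locate the received
+ * dataset by snapshot guid, then either force it to become its own
+ * encryption root (if it was one on the sending side) or force it to
+ * inherit from its parent (if it was not), fixing up keylocation to match
+ * the sent value where needed.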
+ */ +static int +recv_fix_encryption_hierarchy(libzfs_handle_t *hdl, const char *top_zfs, + nvlist_t *stream_nv, avl_tree_t *stream_avl) +{ + int err; + nvpair_t *fselem = NULL; + nvlist_t *stream_fss; + + VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", &stream_fss)); + + while ((fselem = nvlist_next_nvpair(stream_fss, fselem)) != NULL) { + zfs_handle_t *zhp = NULL; + uint64_t crypt; + nvlist_t *snaps, *props, *stream_nvfs = NULL; + nvpair_t *snapel = NULL; + boolean_t is_encroot, is_clone, stream_encroot; + char *cp; + char *stream_keylocation = NULL; + char keylocation[MAXNAMELEN]; + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + + keylocation[0] = '\0'; + VERIFY(0 == nvpair_value_nvlist(fselem, &stream_nvfs)); + VERIFY(0 == nvlist_lookup_nvlist(stream_nvfs, "snaps", &snaps)); + VERIFY(0 == nvlist_lookup_nvlist(stream_nvfs, "props", &props)); + stream_encroot = nvlist_exists(stream_nvfs, "is_encroot"); + + /* find a snapshot from the stream that exists locally */ + err = ENOENT; + while ((snapel = nvlist_next_nvpair(snaps, snapel)) != NULL) { + uint64_t guid; + + VERIFY(0 == nvpair_value_uint64(snapel, &guid)); + err = guid_to_name(hdl, top_zfs, guid, B_FALSE, + fsname); + if (err == 0) + break; + } + + if (err != 0) + continue; + + cp = strchr(fsname, '@'); + if (cp != NULL) + *cp = '\0'; + + zhp = zfs_open(hdl, fsname, ZFS_TYPE_DATASET); + if (zhp == NULL) { + err = ENOENT; + goto error; + } + + crypt = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION); + is_clone = zhp->zfs_dmustats.dds_origin[0] != '\0'; + (void) zfs_crypto_get_encryption_root(zhp, &is_encroot, NULL); + + /* we don't need to do anything for unencrypted datasets */ + if (crypt == ZIO_CRYPT_OFF) { + zfs_close(zhp); + continue; + } + + /* + * If the dataset is flagged as an encryption root, was not + * received as a clone and is not currently an encryption root, + * force it to become one. Fixup the keylocation if necessary. + */ + if (stream_encroot) { + if (!is_clone && !is_encroot) { + err = lzc_change_key(fsname, + DCP_CMD_FORCE_NEW_KEY, NULL, NULL, 0); + if (err != 0) { + zfs_close(zhp); + goto error; + } + } + + VERIFY(0 == nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + &stream_keylocation)); + + /* + * Refresh the properties in case the call to + * lzc_change_key() changed the value. + */ + zfs_refresh_properties(zhp); + err = zfs_prop_get(zhp, ZFS_PROP_KEYLOCATION, + keylocation, sizeof (keylocation), NULL, NULL, + 0, B_TRUE); + if (err != 0) { + zfs_close(zhp); + goto error; + } + + if (strcmp(keylocation, stream_keylocation) != 0) { + err = zfs_prop_set(zhp, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + stream_keylocation); + if (err != 0) { + zfs_close(zhp); + goto error; + } + } + } + + /* + * If the dataset is not flagged as an encryption root and is + * currently an encryption root, force it to inherit from its + * parent. The root of a raw send should never be + * force-inherited. 
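+		 * (The strcmp() against top_zfs below is what protects the
+		 * root of a raw send: the receive target itself is never
+		 * forced to inherit from its local parent.)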
+ */ + if (!stream_encroot && is_encroot && + strcmp(top_zfs, fsname) != 0) { + err = lzc_change_key(fsname, DCP_CMD_FORCE_INHERIT, + NULL, NULL, 0); + if (err != 0) { + zfs_close(zhp); + goto error; + } + } + + zfs_close(zhp); + } + + return (0); + +error: + return (err); +} + +static int +recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, + recvflags_t *flags, nvlist_t *stream_nv, avl_tree_t *stream_avl, + nvlist_t *renamed) +{ + nvlist_t *local_nv, *deleted = NULL; + avl_tree_t *local_avl; + nvpair_t *fselem, *nextfselem; + char *fromsnap; + char newname[ZFS_MAX_DATASET_NAME_LEN]; + char guidname[32]; + int error; + boolean_t needagain, progress, recursive; + char *s1, *s2; + + VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); + + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + + if (flags->dryrun) + return (0); + +again: + needagain = progress = B_FALSE; + + VERIFY(0 == nvlist_alloc(&deleted, NV_UNIQUE_NAME, 0)); + + if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, + recursive, B_TRUE, B_FALSE, recursive, B_FALSE, B_FALSE, + B_FALSE, B_TRUE, &local_nv, &local_avl)) != 0) + return (error); + + /* + * Process deletes and renames + */ + for (fselem = nvlist_next_nvpair(local_nv, NULL); + fselem; fselem = nextfselem) { + nvlist_t *nvfs, *snaps; + nvlist_t *stream_nvfs = NULL; + nvpair_t *snapelem, *nextsnapelem; + uint64_t fromguid = 0; + uint64_t originguid = 0; + uint64_t stream_originguid = 0; + uint64_t parent_fromsnap_guid, stream_parent_fromsnap_guid; + char *fsname, *stream_fsname; + + nextfselem = nvlist_next_nvpair(local_nv, fselem); + + VERIFY(0 == nvpair_value_nvlist(fselem, &nvfs)); + VERIFY(0 == nvlist_lookup_nvlist(nvfs, "snaps", &snaps)); + VERIFY(0 == nvlist_lookup_string(nvfs, "name", &fsname)); + VERIFY(0 == nvlist_lookup_uint64(nvfs, "parentfromsnap", + &parent_fromsnap_guid)); + (void) nvlist_lookup_uint64(nvfs, "origin", &originguid); + + /* + * First find the stream's fs, so we can check for + * a different origin (due to "zfs promote") + */ + for (snapelem = nvlist_next_nvpair(snaps, NULL); + snapelem; snapelem = nvlist_next_nvpair(snaps, snapelem)) { + uint64_t thisguid; + + VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid)); + stream_nvfs = fsavl_find(stream_avl, thisguid, NULL); + + if (stream_nvfs != NULL) + break; + } + + /* check for promote */ + (void) nvlist_lookup_uint64(stream_nvfs, "origin", + &stream_originguid); + if (stream_nvfs && originguid != stream_originguid) { + switch (created_before(hdl, local_avl, + stream_originguid, originguid)) { + case 1: { + /* promote it! */ + nvlist_t *origin_nvfs; + char *origin_fsname; + + origin_nvfs = fsavl_find(local_avl, originguid, + NULL); + VERIFY(0 == nvlist_lookup_string(origin_nvfs, + "name", &origin_fsname)); + error = recv_promote(hdl, fsname, origin_fsname, + flags); + if (error == 0) + progress = B_TRUE; + break; + } + default: + break; + case -1: + fsavl_destroy(local_avl); + nvlist_free(local_nv); + return (-1); + } + /* + * We had/have the wrong origin, therefore our + * list of snapshots is wrong. Need to handle + * them on the next pass. 
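+			 * For example (hypothetical names): if the sender ran
+			 * "zfs promote" so that the stream fs lists
+			 * "pool/clone@s" as its origin while the local copy
+			 * still descends from "pool/fs@s", promoting locally
+			 * moves the snapshots between the two filesystems,
+			 * and this fs must be revisited.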
+			 */
+			needagain = B_TRUE;
+			continue;
+		}
+
+		for (snapelem = nvlist_next_nvpair(snaps, NULL);
+		    snapelem; snapelem = nextsnapelem) {
+			uint64_t thisguid;
+			char *stream_snapname;
+			nvlist_t *found, *props;
+
+			nextsnapelem = nvlist_next_nvpair(snaps, snapelem);
+
+			VERIFY(0 == nvpair_value_uint64(snapelem, &thisguid));
+			found = fsavl_find(stream_avl, thisguid,
+			    &stream_snapname);
+
+			/* check for delete */
+			if (found == NULL) {
+				char name[ZFS_MAX_DATASET_NAME_LEN];
+
+				if (!flags->force)
+					continue;
+
+				(void) snprintf(name, sizeof (name), "%s@%s",
+				    fsname, nvpair_name(snapelem));
+
+				error = recv_destroy(hdl, name,
+				    strlen(fsname)+1, newname, flags);
+				if (error)
+					needagain = B_TRUE;
+				else
+					progress = B_TRUE;
+				(void) snprintf(guidname, sizeof (guidname),
+				    "%llu", (u_longlong_t)thisguid);
+				nvlist_add_boolean(deleted, guidname);
+				continue;
+			}
+
+			stream_nvfs = found;
+
+			if (0 == nvlist_lookup_nvlist(stream_nvfs, "snapprops",
+			    &props) && 0 == nvlist_lookup_nvlist(props,
+			    stream_snapname, &props)) {
+				zfs_cmd_t zc = {"\0"};
+
+				zc.zc_cookie = B_TRUE; /* received */
+				(void) snprintf(zc.zc_name,
+				    sizeof (zc.zc_name),
+				    "%s@%s", fsname, nvpair_name(snapelem));
+				if (zcmd_write_src_nvlist(hdl, &zc,
+				    props) == 0) {
+					(void) zfs_ioctl(hdl,
+					    ZFS_IOC_SET_PROP, &zc);
+					zcmd_free_nvlists(&zc);
+				}
+			}
+
+			/* check for different snapname */
+			if (strcmp(nvpair_name(snapelem),
+			    stream_snapname) != 0) {
+				char name[ZFS_MAX_DATASET_NAME_LEN];
+				char tryname[ZFS_MAX_DATASET_NAME_LEN];
+
+				(void) snprintf(name, sizeof (name), "%s@%s",
+				    fsname, nvpair_name(snapelem));
+				(void) snprintf(tryname, sizeof (tryname),
+				    "%s@%s", fsname, stream_snapname);
+
+				error = recv_rename(hdl, name, tryname,
+				    strlen(fsname)+1, newname, flags);
+				if (error)
+					needagain = B_TRUE;
+				else
+					progress = B_TRUE;
+			}
+
+			if (strcmp(stream_snapname, fromsnap) == 0)
+				fromguid = thisguid;
+		}
+
+		/* check for delete */
+		if (stream_nvfs == NULL) {
+			if (!flags->force)
+				continue;
+
+			error = recv_destroy(hdl, fsname, strlen(tofs)+1,
+			    newname, flags);
+			if (error)
+				needagain = B_TRUE;
+			else
+				progress = B_TRUE;
+			(void) snprintf(guidname, sizeof (guidname), "%llu",
+			    (u_longlong_t)parent_fromsnap_guid);
+			nvlist_add_boolean(deleted, guidname);
+			continue;
+		}
+
+		if (fromguid == 0) {
+			if (flags->verbose) {
+				(void) printf("local fs %s does not have "
+				    "fromsnap (%s in stream); must have "
+				    "been deleted locally; ignoring\n",
+				    fsname, fromsnap);
+			}
+			continue;
+		}
+
+		VERIFY(0 == nvlist_lookup_string(stream_nvfs,
+		    "name", &stream_fsname));
+		VERIFY(0 == nvlist_lookup_uint64(stream_nvfs,
+		    "parentfromsnap", &stream_parent_fromsnap_guid));
+
+		s1 = strrchr(fsname, '/');
+		s2 = strrchr(stream_fsname, '/');
+
+		/*
+		 * Check if we're going to rename based on parent guid change
+		 * and the current parent guid was also deleted. If it was then
+		 * rename will fail and is likely unneeded, so avoid this and
+		 * force an early retry to determine the new
+		 * parent_fromsnap_guid.
+		 */
+		if (stream_parent_fromsnap_guid != 0 &&
+		    parent_fromsnap_guid != 0 &&
+		    stream_parent_fromsnap_guid != parent_fromsnap_guid) {
+			(void) snprintf(guidname, sizeof (guidname), "%llu",
+			    (u_longlong_t)parent_fromsnap_guid);
+			if (nvlist_exists(deleted, guidname)) {
+				progress = B_TRUE;
+				needagain = B_TRUE;
+				goto doagain;
+			}
+		}
+
+		/*
+		 * Check for rename. If the exact receive path is specified, it
+		 * does not count as a rename, but we still need to check the
+		 * datasets beneath it.
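+		 * For example (hypothetical names): if the sender renamed
+		 * "pool/src/a" to "pool/src/b" between snapshots, the stream
+		 * fs and the local fs share snapshot guids but differ in
+		 * their tail name (compared via s1/s2 below) or in their
+		 * parent-fromsnap guid, and we try to rename the local
+		 * dataset to match the stream before receiving into it.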
+ */ + if ((stream_parent_fromsnap_guid != 0 && + parent_fromsnap_guid != 0 && + stream_parent_fromsnap_guid != parent_fromsnap_guid) || + ((flags->isprefix || strcmp(tofs, fsname) != 0) && + (s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { + nvlist_t *parent; + char tryname[ZFS_MAX_DATASET_NAME_LEN]; + + parent = fsavl_find(local_avl, + stream_parent_fromsnap_guid, NULL); + /* + * NB: parent might not be found if we used the + * tosnap for stream_parent_fromsnap_guid, + * because the parent is a newly-created fs; + * we'll be able to rename it after we recv the + * new fs. + */ + if (parent != NULL) { + char *pname; + + VERIFY(0 == nvlist_lookup_string(parent, "name", + &pname)); + (void) snprintf(tryname, sizeof (tryname), + "%s%s", pname, strrchr(stream_fsname, '/')); + } else { + tryname[0] = '\0'; + if (flags->verbose) { + (void) printf("local fs %s new parent " + "not found\n", fsname); + } + } + + newname[0] = '\0'; + + error = recv_rename(hdl, fsname, tryname, + strlen(tofs)+1, newname, flags); + + if (renamed != NULL && newname[0] != '\0') { + VERIFY(0 == nvlist_add_boolean(renamed, + newname)); + } + + if (error) + needagain = B_TRUE; + else + progress = B_TRUE; + } + } + +doagain: + fsavl_destroy(local_avl); + nvlist_free(local_nv); + nvlist_free(deleted); + + if (needagain && progress) { + /* do another pass to fix up temporary names */ + if (flags->verbose) + (void) printf("another pass:\n"); + goto again; + } + + return (needagain || error != 0); +} + +static int +zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, + recvflags_t *flags, dmu_replay_record_t *drr, zio_cksum_t *zc, + char **top_zfs, nvlist_t *cmdprops) +{ + nvlist_t *stream_nv = NULL; + avl_tree_t *stream_avl = NULL; + char *fromsnap = NULL; + char *sendsnap = NULL; + char *cp; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + char sendfs[ZFS_MAX_DATASET_NAME_LEN]; + char errbuf[1024]; + dmu_replay_record_t drre; + int error; + boolean_t anyerr = B_FALSE; + boolean_t softerr = B_FALSE; + boolean_t recursive, raw; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive")); + + assert(drr->drr_type == DRR_BEGIN); + assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); + assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == + DMU_COMPOUNDSTREAM); + + /* + * Read in the nvlist from the stream. + */ + if (drr->drr_payloadlen != 0) { + error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, + &stream_nv, flags->byteswap, zc); + if (error) { + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + } + + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + raw = (nvlist_lookup_boolean(stream_nv, "raw") == 0); + + if (recursive && strchr(destname, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot specify snapshot name for multi-snapshot stream")); + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + + /* + * Read in the end record and verify checksum. 
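+	 * The checksum carried in this DRR_END record covers everything
+	 * read so far: the caller accumulated the DRR_BEGIN record into
+	 * *zc, and recv_read_nvlist() above folded the nvlist payload
+	 * into it as well.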
+ */ + if (0 != (error = recv_read(hdl, fd, &drre, sizeof (drre), + flags->byteswap, NULL))) + goto out; + if (flags->byteswap) { + drre.drr_type = BSWAP_32(drre.drr_type); + drre.drr_u.drr_end.drr_checksum.zc_word[0] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[0]); + drre.drr_u.drr_end.drr_checksum.zc_word[1] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[1]); + drre.drr_u.drr_end.drr_checksum.zc_word[2] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[2]); + drre.drr_u.drr_end.drr_checksum.zc_word[3] = + BSWAP_64(drre.drr_u.drr_end.drr_checksum.zc_word[3]); + } + if (drre.drr_type != DRR_END) { + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + if (!ZIO_CHECKSUM_EQUAL(drre.drr_u.drr_end.drr_checksum, *zc)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incorrect header checksum")); + error = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + + (void) nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap); + + if (drr->drr_payloadlen != 0) { + nvlist_t *stream_fss; + + VERIFY(0 == nvlist_lookup_nvlist(stream_nv, "fss", + &stream_fss)); + if ((stream_avl = fsavl_create(stream_fss)) == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "couldn't allocate avl tree")); + error = zfs_error(hdl, EZFS_NOMEM, errbuf); + goto out; + } + + if (fromsnap != NULL && recursive) { + nvlist_t *renamed = NULL; + nvpair_t *pair = NULL; + + (void) strlcpy(tofs, destname, sizeof (tofs)); + if (flags->isprefix) { + struct drr_begin *drrb = &drr->drr_u.drr_begin; + int i; + + if (flags->istail) { + cp = strrchr(drrb->drr_toname, '/'); + if (cp == NULL) { + (void) strlcat(tofs, "/", + sizeof (tofs)); + i = 0; + } else { + i = (cp - drrb->drr_toname); + } + } else { + i = strcspn(drrb->drr_toname, "/@"); + } + /* zfs_receive_one() will create_parents() */ + (void) strlcat(tofs, &drrb->drr_toname[i], + sizeof (tofs)); + *strchr(tofs, '@') = '\0'; + } + + if (!flags->dryrun && !flags->nomount) { + VERIFY(0 == nvlist_alloc(&renamed, + NV_UNIQUE_NAME, 0)); + } + + softerr = recv_incremental_replication(hdl, tofs, flags, + stream_nv, stream_avl, renamed); + + /* Unmount renamed filesystems before receiving. */ + while ((pair = nvlist_next_nvpair(renamed, + pair)) != NULL) { + zfs_handle_t *zhp; + prop_changelist_t *clp = NULL; + + zhp = zfs_open(hdl, nvpair_name(pair), + ZFS_TYPE_FILESYSTEM); + if (zhp != NULL) { + clp = changelist_gather(zhp, + ZFS_PROP_MOUNTPOINT, 0, + flags->forceunmount ? MS_FORCE : 0); + zfs_close(zhp); + if (clp != NULL) { + softerr |= + changelist_prefix(clp); + changelist_free(clp); + } + } + } + + nvlist_free(renamed); + } + } + + /* + * Get the fs specified by the first path in the stream (the top level + * specified by 'zfs send') and pass it to each invocation of + * zfs_receive_one(). + */ + (void) strlcpy(sendfs, drr->drr_u.drr_begin.drr_toname, + sizeof (sendfs)); + if ((cp = strchr(sendfs, '@')) != NULL) { + *cp = '\0'; + /* + * Find the "sendsnap", the final snapshot in a replication + * stream. zfs_receive_one() handles certain errors + * differently, depending on if the contained stream is the + * last one or not. + */ + sendsnap = (cp + 1); + } + + /* Finally, receive each contained stream */ + do { + /* + * we should figure out if it has a recoverable + * error, in which case do a recv_skip() and drive on. + * Note, if we fail due to already having this guid, + * zfs_receive_one() will take care of it (ie, + * recv_skip() and return 0). 
+ */ + error = zfs_receive_impl(hdl, destname, NULL, flags, fd, + sendfs, stream_nv, stream_avl, top_zfs, sendsnap, cmdprops); + if (error == ENODATA) { + error = 0; + break; + } + anyerr |= error; + } while (error == 0); + + if (drr->drr_payloadlen != 0 && recursive && fromsnap != NULL) { + /* + * Now that we have the fs's they sent us, try the + * renames again. + */ + softerr = recv_incremental_replication(hdl, tofs, flags, + stream_nv, stream_avl, NULL); + } + + if (raw && softerr == 0 && *top_zfs != NULL) { + softerr = recv_fix_encryption_hierarchy(hdl, *top_zfs, + stream_nv, stream_avl); + } + +out: + fsavl_destroy(stream_avl); + nvlist_free(stream_nv); + if (softerr) + error = -2; + if (anyerr) + error = -1; + return (error); +} + +static void +trunc_prop_errs(int truncated) +{ + ASSERT(truncated != 0); + + if (truncated == 1) + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "1 more property could not be set\n")); + else + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%d more properties could not be set\n"), truncated); +} + +static int +recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) +{ + dmu_replay_record_t *drr; + void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); + uint64_t payload_size; + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive")); + + /* XXX would be great to use lseek if possible... */ + drr = buf; + + while (recv_read(hdl, fd, drr, sizeof (dmu_replay_record_t), + byteswap, NULL) == 0) { + if (byteswap) + drr->drr_type = BSWAP_32(drr->drr_type); + + switch (drr->drr_type) { + case DRR_BEGIN: + if (drr->drr_payloadlen != 0) { + (void) recv_read(hdl, fd, buf, + drr->drr_payloadlen, B_FALSE, NULL); + } + break; + + case DRR_END: + free(buf); + return (0); + + case DRR_OBJECT: + if (byteswap) { + drr->drr_u.drr_object.drr_bonuslen = + BSWAP_32(drr->drr_u.drr_object. + drr_bonuslen); + drr->drr_u.drr_object.drr_raw_bonuslen = + BSWAP_32(drr->drr_u.drr_object. + drr_raw_bonuslen); + } + + payload_size = + DRR_OBJECT_PAYLOAD_SIZE(&drr->drr_u.drr_object); + (void) recv_read(hdl, fd, buf, payload_size, + B_FALSE, NULL); + break; + + case DRR_WRITE: + if (byteswap) { + drr->drr_u.drr_write.drr_logical_size = + BSWAP_64( + drr->drr_u.drr_write.drr_logical_size); + drr->drr_u.drr_write.drr_compressed_size = + BSWAP_64( + drr->drr_u.drr_write.drr_compressed_size); + } + payload_size = + DRR_WRITE_PAYLOAD_SIZE(&drr->drr_u.drr_write); + (void) recv_read(hdl, fd, buf, + payload_size, B_FALSE, NULL); + break; + case DRR_SPILL: + if (byteswap) { + drr->drr_u.drr_spill.drr_length = + BSWAP_64(drr->drr_u.drr_spill.drr_length); + drr->drr_u.drr_spill.drr_compressed_size = + BSWAP_64(drr->drr_u.drr_spill. + drr_compressed_size); + } + + payload_size = + DRR_SPILL_PAYLOAD_SIZE(&drr->drr_u.drr_spill); + (void) recv_read(hdl, fd, buf, payload_size, + B_FALSE, NULL); + break; + case DRR_WRITE_EMBEDDED: + if (byteswap) { + drr->drr_u.drr_write_embedded.drr_psize = + BSWAP_32(drr->drr_u.drr_write_embedded. 
+ drr_psize); + } + (void) recv_read(hdl, fd, buf, + P2ROUNDUP(drr->drr_u.drr_write_embedded.drr_psize, + 8), B_FALSE, NULL); + break; + case DRR_OBJECT_RANGE: + case DRR_WRITE_BYREF: + case DRR_FREEOBJECTS: + case DRR_FREE: + break; + + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid record type")); + free(buf); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + } + + free(buf); + return (-1); +} + +static void +recv_ecksum_set_aux(libzfs_handle_t *hdl, const char *target_snap, + boolean_t resumable, boolean_t checksum) +{ + char target_fs[ZFS_MAX_DATASET_NAME_LEN]; + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, (checksum ? + "checksum mismatch" : "incomplete stream"))); + + if (!resumable) + return; + (void) strlcpy(target_fs, target_snap, sizeof (target_fs)); + *strchr(target_fs, '@') = '\0'; + zfs_handle_t *zhp = zfs_open(hdl, target_fs, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) + return; + + char token_buf[ZFS_MAXPROPLEN]; + int error = zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, + token_buf, sizeof (token_buf), + NULL, NULL, 0, B_TRUE); + if (error == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "checksum mismatch or incomplete stream.\n" + "Partially received snapshot is saved.\n" + "A resuming stream can be generated on the sending " + "system by running:\n" + " zfs send -t %s"), + token_buf); + } + zfs_close(zhp); +} + +/* + * Prepare a new nvlist of properties that are to override (-o) or be excluded + * (-x) from the received dataset + * recvprops: received properties from the send stream + * cmdprops: raw input properties from command line + * origprops: properties, both locally-set and received, currently set on the + * target dataset if it exists, NULL otherwise. + * oxprops: valid output override (-o) and excluded (-x) properties + */ +static int +zfs_setup_cmdline_props(libzfs_handle_t *hdl, zfs_type_t type, + char *fsname, boolean_t zoned, boolean_t recursive, boolean_t newfs, + boolean_t raw, boolean_t toplevel, nvlist_t *recvprops, nvlist_t *cmdprops, + nvlist_t *origprops, nvlist_t **oxprops, uint8_t **wkeydata_out, + uint_t *wkeylen_out, const char *errbuf) +{ + nvpair_t *nvp; + nvlist_t *oprops, *voprops; + zfs_handle_t *zhp = NULL; + zpool_handle_t *zpool_hdl = NULL; + char *cp; + int ret = 0; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + if (nvlist_empty(cmdprops)) + return (0); /* No properties to override or exclude */ + + *oxprops = fnvlist_alloc(); + oprops = fnvlist_alloc(); + + strlcpy(namebuf, fsname, ZFS_MAX_DATASET_NAME_LEN); + + /* + * Get our dataset handle. The target dataset may not exist yet. 
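+	 * (e.g. a full stream received into a filesystem that is about to
+	 * be created); in that case zhp stays NULL and the property
+	 * validation below runs against the zpool handle alone.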
+	 */
+	if (zfs_dataset_exists(hdl, namebuf, ZFS_TYPE_DATASET)) {
+		zhp = zfs_open(hdl, namebuf, ZFS_TYPE_DATASET);
+		if (zhp == NULL) {
+			ret = -1;
+			goto error;
+		}
+	}
+
+	/* open the zpool handle */
+	cp = strchr(namebuf, '/');
+	if (cp != NULL)
+		*cp = '\0';
+	zpool_hdl = zpool_open(hdl, namebuf);
+	if (zpool_hdl == NULL) {
+		ret = -1;
+		goto error;
+	}
+
+	/* restore namebuf to match fsname for later use */
+	if (cp != NULL)
+		*cp = '/';
+
+	/*
+	 * first iteration: process excluded (-x) properties now and gather
+	 * added (-o) properties to be later processed by zfs_valid_proplist()
+	 */
+	nvp = NULL;
+	while ((nvp = nvlist_next_nvpair(cmdprops, nvp)) != NULL) {
+		const char *name = nvpair_name(nvp);
+		zfs_prop_t prop = zfs_name_to_prop(name);
+
+		/* "origin" is processed separately, don't handle it here */
+		if (prop == ZFS_PROP_ORIGIN)
+			continue;
+
+		/*
+		 * we're trying to override or exclude a property that does not
+		 * make sense for this type of dataset, but we don't want to
+		 * fail if the receive is recursive: this comes in handy when
+		 * the send stream contains, for instance, a child ZVOL and
+		 * we're trying to receive it with "-o atime=on"
+		 */
+		if (!zfs_prop_valid_for_type(prop, type, B_FALSE) &&
+		    !zfs_prop_user(name)) {
+			if (recursive)
+				continue;
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "property '%s' does not apply to datasets of this "
+			    "type"), name);
+			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
+			goto error;
+		}
+
+		/* raw streams can't override encryption properties */
+		if ((zfs_prop_encryption_key_param(prop) ||
+		    prop == ZFS_PROP_ENCRYPTION) && raw) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "encryption property '%s' cannot "
+			    "be set or excluded for raw streams."), name);
+			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
+			goto error;
+		}
+
+		/* incremental streams can only exclude encryption properties */
+		if ((zfs_prop_encryption_key_param(prop) ||
+		    prop == ZFS_PROP_ENCRYPTION) && !newfs &&
+		    nvpair_type(nvp) != DATA_TYPE_BOOLEAN) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "encryption property '%s' cannot "
+			    "be set for incremental streams."), name);
+			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
+			goto error;
+		}
+
+		switch (nvpair_type(nvp)) {
+		case DATA_TYPE_BOOLEAN: /* -x property */
+			/*
+			 * DATA_TYPE_BOOLEAN is the way we're asked to "exclude"
+			 * a property: this is done by forcing an explicit
+			 * inherit on the destination so the effective value is
+			 * not the one we received from the send stream.
+			 * We do this only if the property is not already
+			 * locally-set, in which case its value will take
+			 * priority over the received one anyway.
+			 */
+			if (nvlist_exists(origprops, name)) {
+				nvlist_t *attrs;
+				char *source = NULL;
+
+				attrs = fnvlist_lookup_nvlist(origprops, name);
+				if (nvlist_lookup_string(attrs,
+				    ZPROP_SOURCE, &source) == 0 &&
+				    strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)
+					continue;
+			}
+			/*
+			 * We can't force an explicit inherit on
+			 * non-inheritable properties: if we're asked to
+			 * exclude such values we remove them from the
+			 * "recvprops" input nvlist.
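+			 * For example, "zfs receive -x quota" cannot be
+			 * honored by inheriting (quota is not inheritable),
+			 * so the received value is dropped and the property
+			 * falls back to its default.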
+			 */
+			if (!zfs_prop_inheritable(prop) &&
+			    !zfs_prop_user(name) && /* can be inherited too */
+			    nvlist_exists(recvprops, name))
+				fnvlist_remove(recvprops, name);
+			else
+				fnvlist_add_nvpair(*oxprops, nvp);
+			break;
+		case DATA_TYPE_STRING: /* -o property=value */
+			fnvlist_add_nvpair(oprops, nvp);
+			break;
+		default:
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "property '%s' must be a string or boolean"), name);
+			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
+			goto error;
+		}
+	}
+
+	if (toplevel) {
+		/* convert override string properties to native */
+		if ((voprops = zfs_valid_proplist(hdl, ZFS_TYPE_DATASET,
+		    oprops, zoned, zhp, zpool_hdl, B_FALSE, errbuf)) == NULL) {
+			ret = zfs_error(hdl, EZFS_BADPROP, errbuf);
+			goto error;
+		}
+
+		/*
+		 * zfs_crypto_create() requires the parent name. Get it
+		 * by truncating the fsname copy stored in namebuf.
+		 */
+		cp = strrchr(namebuf, '/');
+		if (cp != NULL)
+			*cp = '\0';
+
+		if (!raw && zfs_crypto_create(hdl, namebuf, voprops, NULL,
+		    B_FALSE, wkeydata_out, wkeylen_out) != 0) {
+			fnvlist_free(voprops);
+			ret = zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf);
+			goto error;
+		}
+
+		/* second pass: process "-o" properties */
+		fnvlist_merge(*oxprops, voprops);
+		fnvlist_free(voprops);
+	} else {
+		/* override props on child datasets are inherited */
+		nvp = NULL;
+		while ((nvp = nvlist_next_nvpair(oprops, nvp)) != NULL) {
+			const char *name = nvpair_name(nvp);
+			fnvlist_add_boolean(*oxprops, name);
+		}
+	}
+
+error:
+	if (zhp != NULL)
+		zfs_close(zhp);
+	if (zpool_hdl != NULL)
+		zpool_close(zpool_hdl);
+	fnvlist_free(oprops);
+	return (ret);
+}
+
+/*
+ * Restores a backup of tosnap from the file descriptor specified by infd.
+ */
+static int
+zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap,
+    const char *originsnap, recvflags_t *flags, dmu_replay_record_t *drr,
+    dmu_replay_record_t *drr_noswap, const char *sendfs, nvlist_t *stream_nv,
+    avl_tree_t *stream_avl, char **top_zfs,
+    const char *finalsnap, nvlist_t *cmdprops)
+{
+	time_t begin_time;
+	int ioctl_err, ioctl_errno, err;
+	char *cp;
+	struct drr_begin *drrb = &drr->drr_u.drr_begin;
+	char errbuf[1024];
+	const char *chopprefix;
+	boolean_t newfs = B_FALSE;
+	boolean_t stream_wantsnewfs;
+	boolean_t newprops = B_FALSE;
+	uint64_t read_bytes = 0;
+	uint64_t errflags = 0;
+	uint64_t parent_snapguid = 0;
+	prop_changelist_t *clp = NULL;
+	nvlist_t *snapprops_nvlist = NULL;
+	nvlist_t *snapholds_nvlist = NULL;
+	zprop_errflags_t prop_errflags;
+	nvlist_t *prop_errors = NULL;
+	boolean_t recursive;
+	char *snapname = NULL;
+	char destsnap[MAXPATHLEN * 2];
+	char origin[MAXNAMELEN];
+	char name[MAXPATHLEN];
+	char tmp_keylocation[MAXNAMELEN];
+	nvlist_t *rcvprops = NULL; /* props received from the send stream */
+	nvlist_t *oxprops = NULL; /* override (-o) and exclude (-x) props */
+	nvlist_t *origprops = NULL; /* original props (if destination exists) */
+	zfs_type_t type;
+	boolean_t toplevel = B_FALSE;
+	boolean_t zoned = B_FALSE;
+	boolean_t hastoken = B_FALSE;
+	boolean_t redacted;
+	uint8_t *wkeydata = NULL;
+	uint_t wkeylen = 0;
+
+	begin_time = time(NULL);
+	bzero(origin, MAXNAMELEN);
+	bzero(tmp_keylocation, MAXNAMELEN);
+
+	(void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN,
+	    "cannot receive"));
+
+	recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") ==
+	    ENOENT);
+
+	/* Did the user request holds be skipped via zfs recv -k?
*/ + boolean_t holds = flags->holds && !flags->skipholds; + + if (stream_avl != NULL) { + char *keylocation = NULL; + nvlist_t *lookup = NULL; + nvlist_t *fs = fsavl_find(stream_avl, drrb->drr_toguid, + &snapname); + + (void) nvlist_lookup_uint64(fs, "parentfromsnap", + &parent_snapguid); + err = nvlist_lookup_nvlist(fs, "props", &rcvprops); + if (err) { + VERIFY(0 == nvlist_alloc(&rcvprops, NV_UNIQUE_NAME, 0)); + newprops = B_TRUE; + } + + /* + * The keylocation property may only be set on encryption roots, + * but this dataset might not become an encryption root until + * recv_fix_encryption_hierarchy() is called. That function + * will fixup the keylocation anyway, so we temporarily unset + * the keylocation for now to avoid any errors from the receive + * ioctl. + */ + err = nvlist_lookup_string(rcvprops, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); + if (err == 0) { + strcpy(tmp_keylocation, keylocation); + (void) nvlist_remove_all(rcvprops, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION)); + } + + if (flags->canmountoff) { + VERIFY(0 == nvlist_add_uint64(rcvprops, + zfs_prop_to_name(ZFS_PROP_CANMOUNT), 0)); + } else if (newprops) { /* nothing in rcvprops, eliminate it */ + nvlist_free(rcvprops); + rcvprops = NULL; + newprops = B_FALSE; + } + if (0 == nvlist_lookup_nvlist(fs, "snapprops", &lookup)) { + VERIFY(0 == nvlist_lookup_nvlist(lookup, + snapname, &snapprops_nvlist)); + } + if (holds) { + if (0 == nvlist_lookup_nvlist(fs, "snapholds", + &lookup)) { + VERIFY(0 == nvlist_lookup_nvlist(lookup, + snapname, &snapholds_nvlist)); + } + } + } + + cp = NULL; + + /* + * Determine how much of the snapshot name stored in the stream + * we are going to tack on to the name they specified on the + * command line, and how much we are going to chop off. + * + * If they specified a snapshot, chop the entire name stored in + * the stream. + */ + if (flags->istail) { + /* + * A filesystem was specified with -e. We want to tack on only + * the tail of the sent snapshot path. + */ + if (strchr(tosnap, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "argument - snapshot not allowed with -e")); + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } + + chopprefix = strrchr(sendfs, '/'); + + if (chopprefix == NULL) { + /* + * The tail is the poolname, so we need to + * prepend a path separator. + */ + int len = strlen(drrb->drr_toname); + cp = malloc(len + 2); + cp[0] = '/'; + (void) strcpy(&cp[1], drrb->drr_toname); + chopprefix = cp; + } else { + chopprefix = drrb->drr_toname + (chopprefix - sendfs); + } + } else if (flags->isprefix) { + /* + * A filesystem was specified with -d. We want to tack on + * everything but the first element of the sent snapshot path + * (all but the pool name). + */ + if (strchr(tosnap, '@')) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "argument - snapshot not allowed with -d")); + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } + + chopprefix = strchr(drrb->drr_toname, '/'); + if (chopprefix == NULL) + chopprefix = strchr(drrb->drr_toname, '@'); + } else if (strchr(tosnap, '@') == NULL) { + /* + * If a filesystem was specified without -d or -e, we want to + * tack on everything after the fs specified by 'zfs send'. + */ + chopprefix = drrb->drr_toname + strlen(sendfs); + } else { + /* A snapshot was specified as an exact path (no -d or -e). 
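+		 * For example (hypothetical names), for a stream of
+		 * "pool/fs/a@s": -d would append everything but the pool
+		 * name (giving "tank/fs/a@s"), -e only the tail
+		 * ("tank/a@s"), while an exact snapshot path uses the name
+		 * as given and chops the entire stream name.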
*/ + if (recursive) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot specify snapshot name for multi-snapshot " + "stream")); + err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + chopprefix = drrb->drr_toname + strlen(drrb->drr_toname); + } + + ASSERT(strstr(drrb->drr_toname, sendfs) == drrb->drr_toname); + ASSERT(chopprefix > drrb->drr_toname || strchr(sendfs, '/') == NULL); + ASSERT(chopprefix <= drrb->drr_toname + strlen(drrb->drr_toname) || + strchr(sendfs, '/') == NULL); + ASSERT(chopprefix[0] == '/' || chopprefix[0] == '@' || + chopprefix[0] == '\0'); + + /* + * Determine name of destination snapshot. + */ + (void) strlcpy(destsnap, tosnap, sizeof (destsnap)); + (void) strlcat(destsnap, chopprefix, sizeof (destsnap)); + free(cp); + if (!zfs_name_valid(destsnap, ZFS_TYPE_SNAPSHOT)) { + err = zfs_error(hdl, EZFS_INVALIDNAME, errbuf); + goto out; + } + + /* + * Determine the name of the origin snapshot. + */ + if (originsnap) { + (void) strlcpy(origin, originsnap, sizeof (origin)); + if (flags->verbose) + (void) printf("using provided clone origin %s\n", + origin); + } else if (drrb->drr_flags & DRR_FLAG_CLONE) { + if (guid_to_name(hdl, destsnap, + drrb->drr_fromguid, B_FALSE, origin) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "local origin for clone %s does not exist"), + destsnap); + err = zfs_error(hdl, EZFS_NOENT, errbuf); + goto out; + } + if (flags->verbose) + (void) printf("found clone origin %s\n", origin); + } + + if ((DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_DEDUP)) { + (void) fprintf(stderr, + gettext("ERROR: \"zfs receive\" no longer supports " + "deduplicated send streams. Use\n" + "the \"zstream redup\" command to convert this stream " + "to a regular,\n" + "non-deduplicated stream.\n")); + err = zfs_error(hdl, EZFS_NOTSUP, errbuf); + goto out; + } + + boolean_t resuming = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RESUMING; + boolean_t raw = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RAW; + boolean_t embedded = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_EMBED_DATA; + stream_wantsnewfs = (drrb->drr_fromguid == 0 || + (drrb->drr_flags & DRR_FLAG_CLONE) || originsnap) && !resuming; + + if (stream_wantsnewfs) { + /* + * if the parent fs does not exist, look for it based on + * the parent snap GUID + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive new filesystem stream")); + + (void) strcpy(name, destsnap); + cp = strrchr(name, '/'); + if (cp) + *cp = '\0'; + if (cp && + !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { + char suffix[ZFS_MAX_DATASET_NAME_LEN]; + (void) strcpy(suffix, strrchr(destsnap, '/')); + if (guid_to_name(hdl, name, parent_snapguid, + B_FALSE, destsnap) == 0) { + *strchr(destsnap, '@') = '\0'; + (void) strcat(destsnap, suffix); + } + } + } else { + /* + * If the fs does not exist, look for it based on the + * fromsnap GUID. + */ + if (resuming) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot receive resume stream")); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, + "cannot receive incremental stream")); + } + + (void) strcpy(name, destsnap); + *strchr(name, '@') = '\0'; + + /* + * If the exact receive path was specified and this is the + * topmost path in the stream, then if the fs does not exist we + * should look no further. 
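+	 * Otherwise, guid_to_name() below may relocate destsnap to
+	 * wherever the fromsnap guid is found locally, e.g. when the
+	 * target filesystem was renamed after the previous receive.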
+ */ + if ((flags->isprefix || (*(chopprefix = drrb->drr_toname + + strlen(sendfs)) != '\0' && *chopprefix != '@')) && + !zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { + char snap[ZFS_MAX_DATASET_NAME_LEN]; + (void) strcpy(snap, strchr(destsnap, '@')); + if (guid_to_name(hdl, name, drrb->drr_fromguid, + B_FALSE, destsnap) == 0) { + *strchr(destsnap, '@') = '\0'; + (void) strcat(destsnap, snap); + } + } + } + + (void) strcpy(name, destsnap); + *strchr(name, '@') = '\0'; + + redacted = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_REDACTED; + + if (zfs_dataset_exists(hdl, name, ZFS_TYPE_DATASET)) { + zfs_cmd_t zc = {"\0"}; + zfs_handle_t *zhp; + boolean_t encrypted; + + (void) strcpy(zc.zc_name, name); + + /* + * Destination fs exists. It must be one of these cases: + * - an incremental send stream + * - the stream specifies a new fs (full stream or clone) + * and they want us to blow away the existing fs (and + * have therefore specified -F and removed any snapshots) + * - we are resuming a failed receive. + */ + if (stream_wantsnewfs) { + boolean_t is_volume = drrb->drr_type == DMU_OST_ZVOL; + if (!flags->force) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination '%s' exists\n" + "must specify -F to overwrite it"), name); + err = zfs_error(hdl, EZFS_EXISTS, errbuf); + goto out; + } + if (zfs_ioctl(hdl, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination has snapshots (eg. %s)\n" + "must destroy them to overwrite it"), + zc.zc_name); + err = zfs_error(hdl, EZFS_EXISTS, errbuf); + goto out; + } + if (is_volume && strrchr(name, '/') == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s is the root dataset\n" + "cannot overwrite with a ZVOL"), + name); + err = zfs_error(hdl, EZFS_EXISTS, errbuf); + goto out; + } + if (is_volume && + zfs_ioctl(hdl, ZFS_IOC_DATASET_LIST_NEXT, + &zc) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination has children (eg. %s)\n" + "cannot overwrite with a ZVOL"), + zc.zc_name); + err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf); + goto out; + } + } + + if ((zhp = zfs_open(hdl, name, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME)) == NULL) { + err = -1; + goto out; + } + + if (stream_wantsnewfs && + zhp->zfs_dmustats.dds_origin[0]) { + zfs_close(zhp); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination '%s' is a clone\n" + "must destroy it to overwrite it"), name); + err = zfs_error(hdl, EZFS_EXISTS, errbuf); + goto out; + } + + /* + * Raw sends can not be performed as an incremental on top + * of existing unencrypted datasets. zfs recv -F can't be + * used to blow away an existing encrypted filesystem. This + * is because it would require the dsl dir to point to the + * new key (or lack of a key) and the old key at the same + * time. The -F flag may still be used for deleting + * intermediate snapshots that would otherwise prevent the + * receive from working. 
+		 */
+		encrypted = zfs_prop_get_int(zhp, ZFS_PROP_ENCRYPTION) !=
+		    ZIO_CRYPT_OFF;
+		if (!stream_wantsnewfs && !encrypted && raw) {
+			zfs_close(zhp);
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "cannot perform raw receive on top of "
+			    "existing unencrypted dataset"));
+			err = zfs_error(hdl, EZFS_BADRESTORE, errbuf);
+			goto out;
+		}
+
+		if (stream_wantsnewfs && flags->force &&
+		    ((raw && !encrypted) || encrypted)) {
+			zfs_close(zhp);
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "zfs receive -F cannot be used to destroy an "
+			    "encrypted filesystem or overwrite an "
+			    "unencrypted one with an encrypted one"));
+			err = zfs_error(hdl, EZFS_BADRESTORE, errbuf);
+			goto out;
+		}
+
+		if (!flags->dryrun && zhp->zfs_type == ZFS_TYPE_FILESYSTEM &&
+		    stream_wantsnewfs) {
+			/* We can't do online recv in this case */
+			clp = changelist_gather(zhp, ZFS_PROP_NAME, 0,
+			    flags->forceunmount ? MS_FORCE : 0);
+			if (clp == NULL) {
+				zfs_close(zhp);
+				err = -1;
+				goto out;
+			}
+			if (changelist_prefix(clp) != 0) {
+				changelist_free(clp);
+				zfs_close(zhp);
+				err = -1;
+				goto out;
+			}
+		}
+
+		/*
+		 * If we are resuming a newfs, set newfs here so that we will
+		 * mount it if the recv succeeds this time. We can tell
+		 * that it was a newfs on the first recv because the fs
+		 * itself will be inconsistent (if the fs existed when we
+		 * did the first recv, we would have received it into
+		 * .../%recv).
+		 */
+		if (resuming && zfs_prop_get_int(zhp, ZFS_PROP_INCONSISTENT))
+			newfs = B_TRUE;
+
+		/* we want to know if we're zoned when validating -o|-x props */
+		zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED);
+
+		/* may need this info later, get it now that we have zhp around */
+		if (zfs_prop_get(zhp, ZFS_PROP_RECEIVE_RESUME_TOKEN, NULL, 0,
+		    NULL, NULL, 0, B_TRUE) == 0)
+			hastoken = B_TRUE;
+
+		/* gather existing properties on destination */
+		origprops = fnvlist_alloc();
+		fnvlist_merge(origprops, zhp->zfs_props);
+		fnvlist_merge(origprops, zhp->zfs_user_props);
+
+		zfs_close(zhp);
+	} else {
+		zfs_handle_t *zhp;
+
+		/*
+		 * Destination filesystem does not exist. Therefore we better
+		 * be creating a new filesystem (either from a full backup, or
+		 * a clone). It would therefore be invalid if the user
+		 * specified only the pool name (i.e. if the destination name
+		 * contained no slash character).
+		 */
+		cp = strrchr(name, '/');
+
+		if (!stream_wantsnewfs || cp == NULL) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "destination '%s' does not exist"), name);
+			err = zfs_error(hdl, EZFS_NOENT, errbuf);
+			goto out;
+		}
+
+		/*
+		 * Trim off the final dataset component so we perform the
+		 * recvbackup ioctl to the filesystem's parent.
+		 */
+		*cp = '\0';
+
+		if (flags->isprefix && !flags->istail && !flags->dryrun &&
+		    create_parents(hdl, destsnap, strlen(tosnap)) != 0) {
+			err = zfs_error(hdl, EZFS_BADRESTORE, errbuf);
+			goto out;
+		}
+
+		/* validate parent */
+		zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET);
+		if (zhp == NULL) {
+			err = zfs_error(hdl, EZFS_BADRESTORE, errbuf);
+			goto out;
+		}
+		if (zfs_get_type(zhp) != ZFS_TYPE_FILESYSTEM) {
+			zfs_error_aux(hdl, dgettext(TEXT_DOMAIN,
+			    "parent '%s' is not a filesystem"), name);
+			err = zfs_error(hdl, EZFS_WRONG_PARENT, errbuf);
+			zfs_close(zhp);
+			goto out;
+		}
+
+		zfs_close(zhp);
+
+		newfs = B_TRUE;
+		*cp = '/';
+	}
+
+	if (flags->verbose) {
+		(void) printf("%s %s stream of %s into %s\n",
+		    flags->dryrun ? "would receive" : "receiving",
+		    drrb->drr_fromguid ?
"incremental" : "full", + drrb->drr_toname, destsnap); + (void) fflush(stdout); + } + + if (flags->dryrun) { + void *buf = zfs_alloc(hdl, SPA_MAXBLOCKSIZE); + + /* + * We have read the DRR_BEGIN record, but we have + * not yet read the payload. For non-dryrun sends + * this will be done by the kernel, so we must + * emulate that here, before attempting to read + * more records. + */ + err = recv_read(hdl, infd, buf, drr->drr_payloadlen, + flags->byteswap, NULL); + free(buf); + if (err != 0) + goto out; + + err = recv_skip(hdl, infd, flags->byteswap); + goto out; + } + + /* + * If this is the top-level dataset, record it so we can use it + * for recursive operations later. + */ + if (top_zfs != NULL && + (*top_zfs == NULL || strcmp(*top_zfs, name) == 0)) { + toplevel = B_TRUE; + if (*top_zfs == NULL) + *top_zfs = zfs_strdup(hdl, name); + } + + if (drrb->drr_type == DMU_OST_ZVOL) { + type = ZFS_TYPE_VOLUME; + } else if (drrb->drr_type == DMU_OST_ZFS) { + type = ZFS_TYPE_FILESYSTEM; + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid record type: 0x%d"), drrb->drr_type); + err = zfs_error(hdl, EZFS_BADSTREAM, errbuf); + goto out; + } + if ((err = zfs_setup_cmdline_props(hdl, type, name, zoned, recursive, + stream_wantsnewfs, raw, toplevel, rcvprops, cmdprops, origprops, + &oxprops, &wkeydata, &wkeylen, errbuf)) != 0) + goto out; + + /* + * When sending with properties (zfs send -p), the encryption property + * is not included because it is a SETONCE property and therefore + * treated as read only. However, we are always able to determine its + * value because raw sends will include it in the DRR_BDEGIN payload + * and non-raw sends with properties are not allowed for encrypted + * datasets. Therefore, if this is a non-raw properties stream, we can + * infer that the value should be ZIO_CRYPT_OFF and manually add that + * to the received properties. + */ + if (stream_wantsnewfs && !raw && rcvprops != NULL && + !nvlist_exists(cmdprops, zfs_prop_to_name(ZFS_PROP_ENCRYPTION))) { + if (oxprops == NULL) + oxprops = fnvlist_alloc(); + fnvlist_add_uint64(oxprops, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), ZIO_CRYPT_OFF); + } + + err = ioctl_err = lzc_receive_with_cmdprops(destsnap, rcvprops, + oxprops, wkeydata, wkeylen, origin, flags->force, flags->resumable, + raw, infd, drr_noswap, -1, &read_bytes, &errflags, + NULL, &prop_errors); + ioctl_errno = ioctl_err; + prop_errflags = errflags; + + if (err == 0) { + nvpair_t *prop_err = NULL; + + while ((prop_err = nvlist_next_nvpair(prop_errors, + prop_err)) != NULL) { + char tbuf[1024]; + zfs_prop_t prop; + int intval; + + prop = zfs_name_to_prop(nvpair_name(prop_err)); + (void) nvpair_value_int32(prop_err, &intval); + if (strcmp(nvpair_name(prop_err), + ZPROP_N_MORE_ERRORS) == 0) { + trunc_prop_errs(intval); + break; + } else if (snapname == NULL || finalsnap == NULL || + strcmp(finalsnap, snapname) == 0 || + strcmp(nvpair_name(prop_err), + zfs_prop_to_name(ZFS_PROP_REFQUOTA)) != 0) { + /* + * Skip the special case of, for example, + * "refquota", errors on intermediate + * snapshots leading up to a final one. + * That's why we have all of the checks above. + * + * See zfs_ioctl.c's extract_delay_props() for + * a list of props which can fail on + * intermediate snapshots, but shouldn't + * affect the overall receive. 
+ */ + (void) snprintf(tbuf, sizeof (tbuf), + dgettext(TEXT_DOMAIN, + "cannot receive %s property on %s"), + nvpair_name(prop_err), name); + zfs_setprop_error(hdl, prop, intval, tbuf); + } + } + } + + if (err == 0 && snapprops_nvlist) { + zfs_cmd_t zc = {"\0"}; + + (void) strcpy(zc.zc_name, destsnap); + zc.zc_cookie = B_TRUE; /* received */ + if (zcmd_write_src_nvlist(hdl, &zc, snapprops_nvlist) == 0) { + (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); + zcmd_free_nvlists(&zc); + } + } + if (err == 0 && snapholds_nvlist) { + nvpair_t *pair; + nvlist_t *holds, *errors = NULL; + int cleanup_fd = -1; + + VERIFY(0 == nvlist_alloc(&holds, 0, KM_SLEEP)); + for (pair = nvlist_next_nvpair(snapholds_nvlist, NULL); + pair != NULL; + pair = nvlist_next_nvpair(snapholds_nvlist, pair)) { + VERIFY(0 == nvlist_add_string(holds, destsnap, + nvpair_name(pair))); + } + (void) lzc_hold(holds, cleanup_fd, &errors); + nvlist_free(snapholds_nvlist); + nvlist_free(holds); + } + + if (err && (ioctl_errno == ENOENT || ioctl_errno == EEXIST)) { + /* + * It may be that this snapshot already exists, + * in which case we want to consume & ignore it + * rather than failing. + */ + avl_tree_t *local_avl; + nvlist_t *local_nv, *fs; + cp = strchr(destsnap, '@'); + + /* + * XXX Do this faster by just iterating over snaps in + * this fs. Also if zc_value does not exist, we will + * get a strange "does not exist" error message. + */ + *cp = '\0'; + if (gather_nvlist(hdl, destsnap, NULL, NULL, B_FALSE, B_TRUE, + B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_FALSE, B_TRUE, + &local_nv, &local_avl) == 0) { + *cp = '@'; + fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); + fsavl_destroy(local_avl); + nvlist_free(local_nv); + + if (fs != NULL) { + if (flags->verbose) { + (void) printf("snap %s already exists; " + "ignoring\n", destsnap); + } + err = ioctl_err = recv_skip(hdl, infd, + flags->byteswap); + } + } + *cp = '@'; + } + + if (ioctl_err != 0) { + switch (ioctl_errno) { + case ENODEV: + cp = strchr(destsnap, '@'); + *cp = '\0'; + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "most recent snapshot of %s does not\n" + "match incremental source"), destsnap); + (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); + *cp = '@'; + break; + case ETXTBSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s has been modified\n" + "since most recent snapshot"), name); + (void) zfs_error(hdl, EZFS_BADRESTORE, errbuf); + break; + case EACCES: + if (raw && stream_wantsnewfs) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "failed to create encryption key")); + } else if (raw && !stream_wantsnewfs) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "encryption key does not match " + "existing key")); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "inherited key must be loaded")); + } + (void) zfs_error(hdl, EZFS_CRYPTOFAILED, errbuf); + break; + case EEXIST: + cp = strchr(destsnap, '@'); + if (newfs) { + /* it's the containing fs that exists */ + *cp = '\0'; + } + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination already exists")); + (void) zfs_error_fmt(hdl, EZFS_EXISTS, + dgettext(TEXT_DOMAIN, "cannot restore to %s"), + destsnap); + *cp = '@'; + break; + case EINVAL: + if (flags->resumable) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "kernel modules must be upgraded to " + "receive this stream.")); + } else if (embedded && !raw) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incompatible embedded data stream " + "feature with encrypted receive.")); + } + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ECKSUM: + 
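		/* a bad checksum and a truncated stream share the same handling */
+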
case ZFS_ERR_STREAM_TRUNCATED: + recv_ecksum_set_aux(hdl, destsnap, flags->resumable, + ioctl_err == ECKSUM); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "incremental send stream requires -L " + "(--large-block), to match previous receive.")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded to receive this stream.")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + case EDQUOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s space quota exceeded."), name); + (void) zfs_error(hdl, EZFS_NOSPC, errbuf); + break; + case ZFS_ERR_FROM_IVSET_GUID_MISSING: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "IV set guid missing. See errata %u at " + "https://zfsonlinux.org/msg/ZFS-8000-ER."), + ZPOOL_ERRATA_ZOL_8308_ENCRYPTION); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ZFS_ERR_FROM_IVSET_GUID_MISMATCH: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "IV set guid mismatch. See the 'zfs receive' " + "man page section\n discussing the limitations " + "of raw encrypted send streams.")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case ZFS_ERR_SPILL_BLOCK_FLAG_MISSING: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Spill block flag missing for raw send.\n" + "The zfs software on the sending system must " + "be updated.")); + (void) zfs_error(hdl, EZFS_BADSTREAM, errbuf); + break; + case EBUSY: + if (hastoken) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "destination %s contains " + "partially-complete state from " + "\"zfs receive -s\"."), name); + (void) zfs_error(hdl, EZFS_BUSY, errbuf); + break; + } + /* fallthru */ + default: + (void) zfs_standard_error(hdl, ioctl_errno, errbuf); + } + } + + /* + * Mount the target filesystem (if created). Also mount any + * children of the target filesystem if we did a replication + * receive (indicated by stream_avl being non-NULL). 
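+	 * The new filesystem is not mounted here; we only set
+	 * flags->domount below, and zfs_receive() performs the actual
+	 * changelist mount/share pass once the last substream has been
+	 * received.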
+ */ + if (clp) { + if (!flags->nomount) + err |= changelist_postfix(clp); + changelist_free(clp); + } + + if ((newfs || stream_avl) && type == ZFS_TYPE_FILESYSTEM && !redacted) + flags->domount = B_TRUE; + + if (prop_errflags & ZPROP_ERR_NOCLEAR) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to clear unreceived properties on %s"), name); + (void) fprintf(stderr, "\n"); + } + if (prop_errflags & ZPROP_ERR_NORESTORE) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to restore original properties on %s"), name); + (void) fprintf(stderr, "\n"); + } + + if (err || ioctl_err) { + err = -1; + goto out; + } + + if (flags->verbose) { + char buf1[64]; + char buf2[64]; + uint64_t bytes = read_bytes; + time_t delta = time(NULL) - begin_time; + if (delta == 0) + delta = 1; + zfs_nicebytes(bytes, buf1, sizeof (buf1)); + zfs_nicebytes(bytes/delta, buf2, sizeof (buf1)); + + (void) printf("received %s stream in %lld seconds (%s/sec)\n", + buf1, (longlong_t)delta, buf2); + } + + err = 0; +out: + if (prop_errors != NULL) + nvlist_free(prop_errors); + + if (tmp_keylocation[0] != '\0') { + VERIFY(0 == nvlist_add_string(rcvprops, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), tmp_keylocation)); + } + + if (newprops) + nvlist_free(rcvprops); + + nvlist_free(oxprops); + nvlist_free(origprops); + + return (err); +} + +/* + * Check properties we were asked to override (both -o|-x) + */ +static boolean_t +zfs_receive_checkprops(libzfs_handle_t *hdl, nvlist_t *props, + const char *errbuf) +{ + nvpair_t *nvp; + zfs_prop_t prop; + const char *name; + + nvp = NULL; + while ((nvp = nvlist_next_nvpair(props, nvp)) != NULL) { + name = nvpair_name(nvp); + prop = zfs_name_to_prop(name); + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(name)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), name); + return (B_FALSE); + } + continue; + } + /* + * "origin" is readonly but is used to receive datasets as + * clones so we don't raise an error here + */ + if (prop == ZFS_PROP_ORIGIN) + continue; + + /* encryption params have their own verification later */ + if (prop == ZFS_PROP_ENCRYPTION || + zfs_prop_encryption_key_param(prop)) + continue; + + /* + * cannot override readonly, set-once and other specific + * settable properties + */ + if (zfs_prop_readonly(prop) || prop == ZFS_PROP_VERSION || + prop == ZFS_PROP_VOLSIZE) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), name); + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static int +zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, + const char *originsnap, recvflags_t *flags, int infd, const char *sendfs, + nvlist_t *stream_nv, avl_tree_t *stream_avl, char **top_zfs, + const char *finalsnap, nvlist_t *cmdprops) +{ + int err; + dmu_replay_record_t drr, drr_noswap; + struct drr_begin *drrb = &drr.drr_u.drr_begin; + char errbuf[1024]; + zio_cksum_t zcksum = { { 0 } }; + uint64_t featureflags; + int hdrtype; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive")); + + /* check cmdline props, raise an error if they cannot be received */ + if (!zfs_receive_checkprops(hdl, cmdprops, errbuf)) { + return (zfs_error(hdl, EZFS_BADPROP, errbuf)); + } + + if (flags->isprefix && + !zfs_dataset_exists(hdl, tosnap, ZFS_TYPE_DATASET)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified fs " + "(%s) does not exist"), tosnap); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + if (originsnap && + !zfs_dataset_exists(hdl, originsnap, ZFS_TYPE_DATASET)) 
{ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "specified origin fs " + "(%s) does not exist"), originsnap); + return (zfs_error(hdl, EZFS_NOENT, errbuf)); + } + + /* read in the BEGIN record */ + if (0 != (err = recv_read(hdl, infd, &drr, sizeof (drr), B_FALSE, + &zcksum))) + return (err); + + if (drr.drr_type == DRR_END || drr.drr_type == BSWAP_32(DRR_END)) { + /* It's the double end record at the end of a package */ + return (ENODATA); + } + + /* the kernel needs the non-byteswapped begin record */ + drr_noswap = drr; + + flags->byteswap = B_FALSE; + if (drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + /* + * We computed the checksum in the wrong byteorder in + * recv_read() above; do it again correctly. + */ + bzero(&zcksum, sizeof (zio_cksum_t)); + fletcher_4_incremental_byteswap(&drr, sizeof (drr), &zcksum); + flags->byteswap = B_TRUE; + + drr.drr_type = BSWAP_32(drr.drr_type); + drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); + drrb->drr_magic = BSWAP_64(drrb->drr_magic); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); + drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); + drrb->drr_type = BSWAP_32(drrb->drr_type); + drrb->drr_flags = BSWAP_32(drrb->drr_flags); + drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); + drrb->drr_fromguid = BSWAP_64(drrb->drr_fromguid); + } + + if (drrb->drr_magic != DMU_BACKUP_MAGIC || drr.drr_type != DRR_BEGIN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "stream (bad magic number)")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + + featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); + + if (!DMU_STREAM_SUPPORTED(featureflags) || + (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream has unsupported feature, feature flags = %lx"), + featureflags); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + + /* Holds feature is set once in the compound stream header. */ + if (featureflags & DMU_BACKUP_FEATURE_HOLDS) + flags->holds = B_TRUE; + + if (strchr(drrb->drr_toname, '@') == NULL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " + "stream (bad snapshot name)")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { + char nonpackage_sendfs[ZFS_MAX_DATASET_NAME_LEN]; + if (sendfs == NULL) { + /* + * We were not called from zfs_receive_package(). Get + * the fs specified by 'zfs send'. + */ + char *cp; + (void) strlcpy(nonpackage_sendfs, + drr.drr_u.drr_begin.drr_toname, + sizeof (nonpackage_sendfs)); + if ((cp = strchr(nonpackage_sendfs, '@')) != NULL) + *cp = '\0'; + sendfs = nonpackage_sendfs; + VERIFY(finalsnap == NULL); + } + return (zfs_receive_one(hdl, infd, tosnap, originsnap, flags, + &drr, &drr_noswap, sendfs, stream_nv, stream_avl, top_zfs, + finalsnap, cmdprops)); + } else { + assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM); + return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, + &zcksum, top_zfs, cmdprops)); + } +} + +/* + * Restores a backup of tosnap from the file descriptor specified by infd. + * Return 0 on total success, -2 if some things couldn't be + * destroyed/renamed/promoted, -1 if some things couldn't be received. + * (-1 will override -2, if -1 and the resumable flag was specified the + * transfer can be resumed if the sending side supports it). 
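+ * A minimal caller sketch (hypothetical error handling):
+ *
+ *	err = zfs_receive(hdl, "tank/dest", NULL, &flags, fd, NULL);
+ *	if (err != 0 && flags.resumable)
+ *		(resume later using the receive_resume_token, "zfs recv -s")
+ *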
+ */
+int
+zfs_receive(libzfs_handle_t *hdl, const char *tosnap, nvlist_t *props,
+    recvflags_t *flags, int infd, avl_tree_t *stream_avl)
+{
+	char *top_zfs = NULL;
+	int err;
+	struct stat sb;
+	char *originsnap = NULL;
+
+	/*
+	 * The only way fstat can fail is if we do not have a valid file
+	 * descriptor.
+	 */
+	if (fstat(infd, &sb) == -1) {
+		perror("fstat");
+		return (-2);
+	}
+
+	/*
+	 * It is not uncommon for gigabytes to be processed in zfs receive.
+	 * Speculatively increase the buffer size if supported by the platform.
+	 */
+	if (S_ISFIFO(sb.st_mode))
+		libzfs_set_pipe_max(infd);
+
+	if (props) {
+		err = nvlist_lookup_string(props, "origin", &originsnap);
+		if (err && err != ENOENT)
+			return (err);
+	}
+
+	err = zfs_receive_impl(hdl, tosnap, originsnap, flags, infd, NULL, NULL,
+	    stream_avl, &top_zfs, NULL, props);
+
+	if (err == 0 && !flags->nomount && flags->domount && top_zfs) {
+		zfs_handle_t *zhp = NULL;
+		prop_changelist_t *clp = NULL;
+
+		zhp = zfs_open(hdl, top_zfs,
+		    ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME);
+		if (zhp == NULL) {
+			err = -1;
+			goto out;
+		} else {
+			if (zhp->zfs_type == ZFS_TYPE_VOLUME) {
+				zfs_close(zhp);
+				goto out;
+			}
+
+			clp = changelist_gather(zhp, ZFS_PROP_MOUNTPOINT,
+			    CL_GATHER_MOUNT_ALWAYS,
+			    flags->forceunmount ? MS_FORCE : 0);
+			zfs_close(zhp);
+			if (clp == NULL) {
+				err = -1;
+				goto out;
+			}
+
+			/* mount and share received datasets */
+			err = changelist_postfix(clp);
+			changelist_free(clp);
+			if (err != 0)
+				err = -1;
+		}
+	}
+
+out:
+	if (top_zfs)
+		free(top_zfs);
+
+	return (err);
+}
diff --git a/lib/libzfs/libzfs_status.c b/lib/libzfs/libzfs_status.c
new file mode 100644
index 000000000000..435937041608
--- /dev/null
+++ b/lib/libzfs/libzfs_status.c
@@ -0,0 +1,508 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ * Copyright (c) 2013 Steven Hartland. All rights reserved.
+ */
+
+/*
+ * This file contains the functions which analyze the status of a pool.  This
+ * includes both the status of an active pool, as well as the status of
+ * exported pools.  Returns one of the ZPOOL_STATUS_* defines describing the
+ * status of the pool.  This status is independent (to a certain degree) from
+ * the state of the pool.  A pool's state describes only whether or not it is
+ * capable of providing the necessary fault tolerance for data.  The status
+ * describes the overall status of devices.  A pool that is online can still
+ * have a device that is experiencing errors.
+ *
+ * Only a subset of the possible faults can be detected using 'zpool status',
+ * and not all possible errors correspond to a FMA message ID.
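+ * Codes that do have one map to entries in zfs_msgid_table[] below.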
The explanation
+ * is left up to the caller, depending on whether it is a live pool or an
+ * import.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "libzfs_impl.h"
+#include "zfeature_common.h"
+
+/*
+ * Message ID table.  This must be kept in sync with the ZPOOL_STATUS_* defines
+ * in include/libzfs.h.  Note that there are some status results which go past
+ * the end of this table, and hence have no associated message ID.
+ */
+static char *zfs_msgid_table[] = {
+	"ZFS-8000-14",	/* ZPOOL_STATUS_CORRUPT_CACHE */
+	"ZFS-8000-2Q",	/* ZPOOL_STATUS_MISSING_DEV_R */
+	"ZFS-8000-3C",	/* ZPOOL_STATUS_MISSING_DEV_NR */
+	"ZFS-8000-4J",	/* ZPOOL_STATUS_CORRUPT_LABEL_R */
+	"ZFS-8000-5E",	/* ZPOOL_STATUS_CORRUPT_LABEL_NR */
+	"ZFS-8000-6X",	/* ZPOOL_STATUS_BAD_GUID_SUM */
+	"ZFS-8000-72",	/* ZPOOL_STATUS_CORRUPT_POOL */
+	"ZFS-8000-8A",	/* ZPOOL_STATUS_CORRUPT_DATA */
+	"ZFS-8000-9P",	/* ZPOOL_STATUS_FAILING_DEV */
+	"ZFS-8000-A5",	/* ZPOOL_STATUS_VERSION_NEWER */
+	"ZFS-8000-EY",	/* ZPOOL_STATUS_HOSTID_MISMATCH */
+	"ZFS-8000-EY",	/* ZPOOL_STATUS_HOSTID_ACTIVE */
+	"ZFS-8000-EY",	/* ZPOOL_STATUS_HOSTID_REQUIRED */
+	"ZFS-8000-HC",	/* ZPOOL_STATUS_IO_FAILURE_WAIT */
+	"ZFS-8000-JQ",	/* ZPOOL_STATUS_IO_FAILURE_CONTINUE */
+	"ZFS-8000-MM",	/* ZPOOL_STATUS_IO_FAILURE_MMP */
+	"ZFS-8000-K4",	/* ZPOOL_STATUS_BAD_LOG */
+	"ZFS-8000-ER",	/* ZPOOL_STATUS_ERRATA */
+	/*
+	 * The following results have no message ID.
+	 *	ZPOOL_STATUS_UNSUP_FEAT_READ
+	 *	ZPOOL_STATUS_UNSUP_FEAT_WRITE
+	 *	ZPOOL_STATUS_FAULTED_DEV_R
+	 *	ZPOOL_STATUS_FAULTED_DEV_NR
+	 *	ZPOOL_STATUS_VERSION_OLDER
+	 *	ZPOOL_STATUS_FEAT_DISABLED
+	 *	ZPOOL_STATUS_RESILVERING
+	 *	ZPOOL_STATUS_OFFLINE_DEV
+	 *	ZPOOL_STATUS_REMOVED_DEV
+	 *	ZPOOL_STATUS_REBUILDING
+	 *	ZPOOL_STATUS_REBUILD_SCRUB
+	 *	ZPOOL_STATUS_OK
+	 */
+};
+
+#define	NMSGID	(sizeof (zfs_msgid_table) / sizeof (zfs_msgid_table[0]))
+
+/* ARGSUSED */
+static int
+vdev_missing(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_CANT_OPEN &&
+	    vs->vs_aux == VDEV_AUX_OPEN_FAILED);
+}
+
+/* ARGSUSED */
+static int
+vdev_faulted(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_FAULTED);
+}
+
+/* ARGSUSED */
+static int
+vdev_errors(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_DEGRADED ||
+	    vs->vs_read_errors != 0 || vs->vs_write_errors != 0 ||
+	    vs->vs_checksum_errors != 0);
+}
+
+/* ARGSUSED */
+static int
+vdev_broken(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_CANT_OPEN);
+}
+
+/* ARGSUSED */
+static int
+vdev_offlined(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_OFFLINE);
+}
+
+/* ARGSUSED */
+static int
+vdev_removed(vdev_stat_t *vs, uint_t vsc)
+{
+	return (vs->vs_state == VDEV_STATE_REMOVED);
+}
+
+static int
+vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc)
+{
+	if (getenv("ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE") != NULL)
+		return (0);
+
+	return (VDEV_STAT_VALID(vs_physical_ashift, vsc) &&
+	    vs->vs_configured_ashift < vs->vs_physical_ashift);
+}
+
+/*
+ * Detect whether any leaf devices have seen errors or could not be opened.
+ */
+static boolean_t
+find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t),
+    boolean_t ignore_replacing)
+{
+	nvlist_t **child;
+	vdev_stat_t *vs;
+	uint_t c, vsc, children;
+
+	/*
+	 * Ignore problems within a 'replacing' vdev, since we're presumably in
+	 * the process of repairing any such errors, and don't want to call
+	 * them out again.  We'll pick up the fact that a resilver is
+	 * happening later.
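+	 * func is one of the vdev_*() predicates above; e.g.
+	 * find_vdev_problem(nvroot, vdev_missing, B_TRUE) walks the tree
+	 * and returns B_TRUE if any leaf device could not be opened.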
+ */ + if (ignore_replacing == B_TRUE) { + char *type; + + verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, + &type) == 0); + if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + return (B_FALSE); + } + + if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev_problem(child[c], func, ignore_replacing)) + return (B_TRUE); + } else { + verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + + if (func(vs, vsc) != 0) + return (B_TRUE); + } + + /* + * Check any L2 cache devs + */ + if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, + &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev_problem(child[c], func, ignore_replacing)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Active pool health status. + * + * To determine the status for a pool, we make several passes over the config, + * picking the most egregious error we find. In order of importance, we do the + * following: + * + * - Check for a complete and valid configuration + * - Look for any faulted or missing devices in a non-replicated config + * - Check for any data errors + * - Check for any faulted or missing devices in a replicated config + * - Look for any devices showing errors + * - Check for any resilvering or rebuilding devices + * + * There can obviously be multiple errors within a single pool, so this routine + * only picks the most damaging of all the current errors to report. + */ +static zpool_status_t +check_status(nvlist_t *config, boolean_t isimport, zpool_errata_t *erratap) +{ + nvlist_t *nvroot; + vdev_stat_t *vs; + pool_scan_stat_t *ps = NULL; + uint_t vsc, psc; + uint64_t nerr; + uint64_t version; + uint64_t stateval; + uint64_t suspended; + uint64_t hostid = 0; + uint64_t errata = 0; + unsigned long system_hostid = get_system_hostid(); + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &version) == 0); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + verify(nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &stateval) == 0); + + /* + * Currently resilvering a vdev + */ + (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, + (uint64_t **)&ps, &psc); + if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER && + ps->pss_state == DSS_SCANNING) + return (ZPOOL_STATUS_RESILVERING); + + /* + * Currently rebuilding a vdev, check top-level vdevs. + */ + vdev_rebuild_stat_t *vrs = NULL; + nvlist_t **child; + uint_t c, i, children; + uint64_t rebuild_end_time = 0; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + if ((nvlist_lookup_uint64_array(child[c], + ZPOOL_CONFIG_REBUILD_STATS, + (uint64_t **)&vrs, &i) == 0) && (vrs != NULL)) { + uint64_t state = vrs->vrs_state; + + if (state == VDEV_REBUILD_ACTIVE) { + return (ZPOOL_STATUS_REBUILDING); + } else if (state == VDEV_REBUILD_COMPLETE && + vrs->vrs_end_time > rebuild_end_time) { + rebuild_end_time = vrs->vrs_end_time; + } + } + } + + /* + * If we can determine when the last scrub was run, and it + * was before the last rebuild completed, then recommend + * that the pool be scrubbed to verify all checksums. When + * ps is NULL we can infer the pool has never been scrubbed. 
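+		 * For example, if a sequential resilver completed after the
+		 * most recent scrub finished (or no scrub has ever run), we
+		 * return ZPOOL_STATUS_REBUILD_SCRUB so the caller can
+		 * suggest scrubbing to verify the rebuilt data.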
+ */ + if (rebuild_end_time > 0) { + if (ps != NULL) { + if ((ps->pss_state == DSS_FINISHED && + ps->pss_func == POOL_SCAN_SCRUB && + rebuild_end_time > ps->pss_end_time) || + ps->pss_state == DSS_NONE) + return (ZPOOL_STATUS_REBUILD_SCRUB); + } else { + return (ZPOOL_STATUS_REBUILD_SCRUB); + } + } + } + + /* + * The multihost property is set and the pool may be active. + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_ACTIVE) { + mmp_state_t mmp_state; + nvlist_t *nvinfo; + + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + mmp_state = fnvlist_lookup_uint64(nvinfo, + ZPOOL_CONFIG_MMP_STATE); + + if (mmp_state == MMP_STATE_ACTIVE) + return (ZPOOL_STATUS_HOSTID_ACTIVE); + else if (mmp_state == MMP_STATE_NO_HOSTID) + return (ZPOOL_STATUS_HOSTID_REQUIRED); + else + return (ZPOOL_STATUS_HOSTID_MISMATCH); + } + + /* + * Pool last accessed by another system. + */ + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_HOSTID, &hostid); + if (hostid != 0 && (unsigned long)hostid != system_hostid && + stateval == POOL_STATE_ACTIVE) + return (ZPOOL_STATUS_HOSTID_MISMATCH); + + /* + * Newer on-disk version. + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_VERSION_NEWER) + return (ZPOOL_STATUS_VERSION_NEWER); + + /* + * Unsupported feature(s). + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_UNSUP_FEAT) { + nvlist_t *nvinfo; + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + &nvinfo) == 0); + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_CAN_RDONLY)) + return (ZPOOL_STATUS_UNSUP_FEAT_WRITE); + return (ZPOOL_STATUS_UNSUP_FEAT_READ); + } + + /* + * Check that the config is complete. + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_BAD_GUID_SUM) + return (ZPOOL_STATUS_BAD_GUID_SUM); + + /* + * Check whether the pool has suspended. + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED, + &suspended) == 0) { + uint64_t reason; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_SUSPENDED_REASON, + &reason) == 0 && reason == ZIO_SUSPEND_MMP) + return (ZPOOL_STATUS_IO_FAILURE_MMP); + + if (suspended == ZIO_FAILURE_MODE_CONTINUE) + return (ZPOOL_STATUS_IO_FAILURE_CONTINUE); + return (ZPOOL_STATUS_IO_FAILURE_WAIT); + } + + /* + * Could not read a log. + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_BAD_LOG) { + return (ZPOOL_STATUS_BAD_LOG); + } + + /* + * Bad devices in non-replicated config. + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) + return (ZPOOL_STATUS_FAULTED_DEV_NR); + + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + find_vdev_problem(nvroot, vdev_missing, B_TRUE)) + return (ZPOOL_STATUS_MISSING_DEV_NR); + + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + find_vdev_problem(nvroot, vdev_broken, B_TRUE)) + return (ZPOOL_STATUS_CORRUPT_LABEL_NR); + + /* + * Corrupted pool metadata + */ + if (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_CORRUPT_DATA) + return (ZPOOL_STATUS_CORRUPT_POOL); + + /* + * Persistent data errors. + */ + if (!isimport) { + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRCOUNT, + &nerr) == 0 && nerr != 0) + return (ZPOOL_STATUS_CORRUPT_DATA); + } + + /* + * Missing devices in a replicated config. 
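+	 * Unlike the _NR cases above, the pool itself opened, so any
+	 * fault here is being masked by redundancy (hence the _R variants).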
+ */ + if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) + return (ZPOOL_STATUS_FAULTED_DEV_R); + if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) + return (ZPOOL_STATUS_MISSING_DEV_R); + if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) + return (ZPOOL_STATUS_CORRUPT_LABEL_R); + + /* + * Devices with errors + */ + if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) + return (ZPOOL_STATUS_FAILING_DEV); + + /* + * Offlined devices + */ + if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) + return (ZPOOL_STATUS_OFFLINE_DEV); + + /* + * Removed device + */ + if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) + return (ZPOOL_STATUS_REMOVED_DEV); + + /* + * Suboptimal, but usable, ashift configuration. + */ + if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) + return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); + + /* + * Informational errata available. + */ + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_ERRATA, &errata); + if (errata) { + *erratap = errata; + return (ZPOOL_STATUS_ERRATA); + } + + /* + * Outdated, but usable, version + */ + if (SPA_VERSION_IS_SUPPORTED(version) && version != SPA_VERSION) + return (ZPOOL_STATUS_VERSION_OLDER); + + /* + * Usable pool with disabled features + */ + if (version >= SPA_VERSION_FEATURES) { + int i; + nvlist_t *feat; + + if (isimport) { + feat = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO); + if (nvlist_exists(feat, ZPOOL_CONFIG_ENABLED_FEAT)) + feat = fnvlist_lookup_nvlist(feat, + ZPOOL_CONFIG_ENABLED_FEAT); + } else { + feat = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_FEATURE_STATS); + } + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *fi = &spa_feature_table[i]; + if (!nvlist_exists(feat, fi->fi_guid)) + return (ZPOOL_STATUS_FEAT_DISABLED); + } + } + + return (ZPOOL_STATUS_OK); +} + +zpool_status_t +zpool_get_status(zpool_handle_t *zhp, char **msgid, zpool_errata_t *errata) +{ + zpool_status_t ret = check_status(zhp->zpool_config, B_FALSE, errata); + if (msgid != NULL) { + if (ret >= NMSGID) + *msgid = NULL; + else + *msgid = zfs_msgid_table[ret]; + } + return (ret); +} + +zpool_status_t +zpool_import_status(nvlist_t *config, char **msgid, zpool_errata_t *errata) +{ + zpool_status_t ret = check_status(config, B_TRUE, errata); + + if (ret >= NMSGID) + *msgid = NULL; + else + *msgid = zfs_msgid_table[ret]; + + return (ret); +} diff --git a/lib/libzfs/libzfs_util.c b/lib/libzfs/libzfs_util.c new file mode 100644 index 000000000000..c43c6bb1fe9b --- /dev/null +++ b/lib/libzfs/libzfs_util.c @@ -0,0 +1,2092 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2020 Joyent, Inc. All rights reserved. 
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2020 The FreeBSD Foundation + * + * Portions of this software were developed by Allan Jude + * under sponsorship from the FreeBSD Foundation. + */ + +/* + * Internal utility routines for the ZFS library. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libzfs_impl.h" +#include "zfs_prop.h" +#include "zfeature_common.h" +#include +#include + +/* + * We only care about the scheme in order to match the scheme + * with the handler. Each handler should validate the full URI + * as necessary. + */ +#define URI_REGEX "^\\([A-Za-z][A-Za-z0-9+.\\-]*\\):" + +int +libzfs_errno(libzfs_handle_t *hdl) +{ + return (hdl->libzfs_error); +} + +const char * +libzfs_error_action(libzfs_handle_t *hdl) +{ + return (hdl->libzfs_action); +} + +const char * +libzfs_error_description(libzfs_handle_t *hdl) +{ + if (hdl->libzfs_desc[0] != '\0') + return (hdl->libzfs_desc); + + switch (hdl->libzfs_error) { + case EZFS_NOMEM: + return (dgettext(TEXT_DOMAIN, "out of memory")); + case EZFS_BADPROP: + return (dgettext(TEXT_DOMAIN, "invalid property value")); + case EZFS_PROPREADONLY: + return (dgettext(TEXT_DOMAIN, "read-only property")); + case EZFS_PROPTYPE: + return (dgettext(TEXT_DOMAIN, "property doesn't apply to " + "datasets of this type")); + case EZFS_PROPNONINHERIT: + return (dgettext(TEXT_DOMAIN, "property cannot be inherited")); + case EZFS_PROPSPACE: + return (dgettext(TEXT_DOMAIN, "invalid quota or reservation")); + case EZFS_BADTYPE: + return (dgettext(TEXT_DOMAIN, "operation not applicable to " + "datasets of this type")); + case EZFS_BUSY: + return (dgettext(TEXT_DOMAIN, "pool or dataset is busy")); + case EZFS_EXISTS: + return (dgettext(TEXT_DOMAIN, "pool or dataset exists")); + case EZFS_NOENT: + return (dgettext(TEXT_DOMAIN, "no such pool or dataset")); + case EZFS_BADSTREAM: + return (dgettext(TEXT_DOMAIN, "invalid backup stream")); + case EZFS_DSREADONLY: + return (dgettext(TEXT_DOMAIN, "dataset is read-only")); + case EZFS_VOLTOOBIG: + return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " + "this system")); + case EZFS_INVALIDNAME: + return (dgettext(TEXT_DOMAIN, "invalid name")); + case EZFS_BADRESTORE: + return (dgettext(TEXT_DOMAIN, "unable to restore to " + "destination")); + case EZFS_BADBACKUP: + return (dgettext(TEXT_DOMAIN, "backup failed")); + case EZFS_BADTARGET: + return (dgettext(TEXT_DOMAIN, "invalid target vdev")); + case EZFS_NODEVICE: + return (dgettext(TEXT_DOMAIN, "no such device in pool")); + case EZFS_BADDEV: + return (dgettext(TEXT_DOMAIN, "invalid device")); + case EZFS_NOREPLICAS: + return (dgettext(TEXT_DOMAIN, "no valid replicas")); + case EZFS_RESILVERING: + return (dgettext(TEXT_DOMAIN, "currently resilvering")); + case EZFS_BADVERSION: + return (dgettext(TEXT_DOMAIN, "unsupported version or " + "feature")); + case EZFS_POOLUNAVAIL: + return (dgettext(TEXT_DOMAIN, "pool is unavailable")); + case EZFS_DEVOVERFLOW: + return (dgettext(TEXT_DOMAIN, "too many devices in one vdev")); + case EZFS_BADPATH: + return (dgettext(TEXT_DOMAIN, "must be an absolute path")); + case EZFS_CROSSTARGET: + return (dgettext(TEXT_DOMAIN, "operation crosses datasets or " + "pools")); + case EZFS_ZONED: + return (dgettext(TEXT_DOMAIN, "dataset in use by local zone")); + case EZFS_MOUNTFAILED: + return 
(dgettext(TEXT_DOMAIN, "mount failed")); + case EZFS_UMOUNTFAILED: + return (dgettext(TEXT_DOMAIN, "umount failed")); + case EZFS_UNSHARENFSFAILED: + return (dgettext(TEXT_DOMAIN, "unshare(1M) failed")); + case EZFS_SHARENFSFAILED: + return (dgettext(TEXT_DOMAIN, "share(1M) failed")); + case EZFS_UNSHARESMBFAILED: + return (dgettext(TEXT_DOMAIN, "smb remove share failed")); + case EZFS_SHARESMBFAILED: + return (dgettext(TEXT_DOMAIN, "smb add share failed")); + case EZFS_PERM: + return (dgettext(TEXT_DOMAIN, "permission denied")); + case EZFS_NOSPC: + return (dgettext(TEXT_DOMAIN, "out of space")); + case EZFS_FAULT: + return (dgettext(TEXT_DOMAIN, "bad address")); + case EZFS_IO: + return (dgettext(TEXT_DOMAIN, "I/O error")); + case EZFS_INTR: + return (dgettext(TEXT_DOMAIN, "signal received")); + case EZFS_ISSPARE: + return (dgettext(TEXT_DOMAIN, "device is reserved as a hot " + "spare")); + case EZFS_INVALCONFIG: + return (dgettext(TEXT_DOMAIN, "invalid vdev configuration")); + case EZFS_RECURSIVE: + return (dgettext(TEXT_DOMAIN, "recursive dataset dependency")); + case EZFS_NOHISTORY: + return (dgettext(TEXT_DOMAIN, "no history available")); + case EZFS_POOLPROPS: + return (dgettext(TEXT_DOMAIN, "failed to retrieve " + "pool properties")); + case EZFS_POOL_NOTSUP: + return (dgettext(TEXT_DOMAIN, "operation not supported " + "on this type of pool")); + case EZFS_POOL_INVALARG: + return (dgettext(TEXT_DOMAIN, "invalid argument for " + "this pool operation")); + case EZFS_NAMETOOLONG: + return (dgettext(TEXT_DOMAIN, "dataset name is too long")); + case EZFS_OPENFAILED: + return (dgettext(TEXT_DOMAIN, "open failed")); + case EZFS_NOCAP: + return (dgettext(TEXT_DOMAIN, + "disk capacity information could not be retrieved")); + case EZFS_LABELFAILED: + return (dgettext(TEXT_DOMAIN, "write of label failed")); + case EZFS_BADWHO: + return (dgettext(TEXT_DOMAIN, "invalid user/group")); + case EZFS_BADPERM: + return (dgettext(TEXT_DOMAIN, "invalid permission")); + case EZFS_BADPERMSET: + return (dgettext(TEXT_DOMAIN, "invalid permission set name")); + case EZFS_NODELEGATION: + return (dgettext(TEXT_DOMAIN, "delegated administration is " + "disabled on pool")); + case EZFS_BADCACHE: + return (dgettext(TEXT_DOMAIN, "invalid or missing cache file")); + case EZFS_ISL2CACHE: + return (dgettext(TEXT_DOMAIN, "device is in use as a cache")); + case EZFS_VDEVNOTSUP: + return (dgettext(TEXT_DOMAIN, "vdev specification is not " + "supported")); + case EZFS_NOTSUP: + return (dgettext(TEXT_DOMAIN, "operation not supported " + "on this dataset")); + case EZFS_IOC_NOTSUPPORTED: + return (dgettext(TEXT_DOMAIN, "operation not supported by " + "zfs kernel module")); + case EZFS_ACTIVE_SPARE: + return (dgettext(TEXT_DOMAIN, "pool has active shared spare " + "device")); + case EZFS_UNPLAYED_LOGS: + return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " + "logs")); + case EZFS_REFTAG_RELE: + return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); + case EZFS_REFTAG_HOLD: + return (dgettext(TEXT_DOMAIN, "tag already exists on this " + "dataset")); + case EZFS_TAGTOOLONG: + return (dgettext(TEXT_DOMAIN, "tag too long")); + case EZFS_PIPEFAILED: + return (dgettext(TEXT_DOMAIN, "pipe create failed")); + case EZFS_THREADCREATEFAILED: + return (dgettext(TEXT_DOMAIN, "thread create failed")); + case EZFS_POSTSPLIT_ONLINE: + return (dgettext(TEXT_DOMAIN, "disk was split from this pool " + "into a new one")); + case EZFS_SCRUB_PAUSED: + return (dgettext(TEXT_DOMAIN, "scrub is paused; " + "use 'zpool scrub' to 
resume")); + case EZFS_SCRUBBING: + return (dgettext(TEXT_DOMAIN, "currently scrubbing; " + "use 'zpool scrub -s' to cancel current scrub")); + case EZFS_NO_SCRUB: + return (dgettext(TEXT_DOMAIN, "there is no active scrub")); + case EZFS_DIFF: + return (dgettext(TEXT_DOMAIN, "unable to generate diffs")); + case EZFS_DIFFDATA: + return (dgettext(TEXT_DOMAIN, "invalid diff data")); + case EZFS_POOLREADONLY: + return (dgettext(TEXT_DOMAIN, "pool is read-only")); + case EZFS_NO_PENDING: + return (dgettext(TEXT_DOMAIN, "operation is not " + "in progress")); + case EZFS_CHECKPOINT_EXISTS: + return (dgettext(TEXT_DOMAIN, "checkpoint exists")); + case EZFS_DISCARDING_CHECKPOINT: + return (dgettext(TEXT_DOMAIN, "currently discarding " + "checkpoint")); + case EZFS_NO_CHECKPOINT: + return (dgettext(TEXT_DOMAIN, "checkpoint does not exist")); + case EZFS_DEVRM_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "device removal in progress")); + case EZFS_VDEV_TOO_BIG: + return (dgettext(TEXT_DOMAIN, "device exceeds supported size")); + case EZFS_ACTIVE_POOL: + return (dgettext(TEXT_DOMAIN, "pool is imported on a " + "different host")); + case EZFS_CRYPTOFAILED: + return (dgettext(TEXT_DOMAIN, "encryption failure")); + case EZFS_TOOMANY: + return (dgettext(TEXT_DOMAIN, "argument list too long")); + case EZFS_INITIALIZING: + return (dgettext(TEXT_DOMAIN, "currently initializing")); + case EZFS_NO_INITIALIZE: + return (dgettext(TEXT_DOMAIN, "there is no active " + "initialization")); + case EZFS_WRONG_PARENT: + return (dgettext(TEXT_DOMAIN, "invalid parent dataset")); + case EZFS_TRIMMING: + return (dgettext(TEXT_DOMAIN, "currently trimming")); + case EZFS_NO_TRIM: + return (dgettext(TEXT_DOMAIN, "there is no active trim")); + case EZFS_TRIM_NOTSUP: + return (dgettext(TEXT_DOMAIN, "trim operations are not " + "supported by this device")); + case EZFS_NO_RESILVER_DEFER: + return (dgettext(TEXT_DOMAIN, "this action requires the " + "resilver_defer feature")); + case EZFS_EXPORT_IN_PROGRESS: + return (dgettext(TEXT_DOMAIN, "pool export in progress")); + case EZFS_REBUILDING: + return (dgettext(TEXT_DOMAIN, "currently sequentially " + "resilvering")); + case EZFS_UNKNOWN: + return (dgettext(TEXT_DOMAIN, "unknown error")); + default: + assert(hdl->libzfs_error == 0); + return (dgettext(TEXT_DOMAIN, "no error")); + } +} + +/*PRINTFLIKE2*/ +void +zfs_error_aux(libzfs_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + (void) vsnprintf(hdl->libzfs_desc, sizeof (hdl->libzfs_desc), + fmt, ap); + hdl->libzfs_desc_active = 1; + + va_end(ap); +} + +static void +zfs_verror(libzfs_handle_t *hdl, int error, const char *fmt, va_list ap) +{ + (void) vsnprintf(hdl->libzfs_action, sizeof (hdl->libzfs_action), + fmt, ap); + hdl->libzfs_error = error; + + if (hdl->libzfs_desc_active) + hdl->libzfs_desc_active = 0; + else + hdl->libzfs_desc[0] = '\0'; + + if (hdl->libzfs_printerr) { + if (error == EZFS_UNKNOWN) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "internal " + "error: %s: %s\n"), hdl->libzfs_action, + libzfs_error_description(hdl)); + abort(); + } + + (void) fprintf(stderr, "%s: %s\n", hdl->libzfs_action, + libzfs_error_description(hdl)); + if (error == EZFS_NOMEM) + exit(1); + } +} + +int +zfs_error(libzfs_handle_t *hdl, int error, const char *msg) +{ + return (zfs_error_fmt(hdl, error, "%s", msg)); +} + +/*PRINTFLIKE3*/ +int +zfs_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + + zfs_verror(hdl, error, fmt, ap); + + va_end(ap); + + return (-1); +} + +static int +zfs_common_error(libzfs_handle_t *hdl, int error, const char *fmt, + va_list ap) +{ + switch (error) { + case EPERM: + case EACCES: + zfs_verror(hdl, EZFS_PERM, fmt, ap); + return (-1); + + case ECANCELED: + zfs_verror(hdl, EZFS_NODELEGATION, fmt, ap); + return (-1); + + case EIO: + zfs_verror(hdl, EZFS_IO, fmt, ap); + return (-1); + + case EFAULT: + zfs_verror(hdl, EZFS_FAULT, fmt, ap); + return (-1); + + case EINTR: + zfs_verror(hdl, EZFS_INTR, fmt, ap); + return (-1); + } + + return (0); +} + +int +zfs_standard_error(libzfs_handle_t *hdl, int error, const char *msg) +{ + return (zfs_standard_error_fmt(hdl, error, "%s", msg)); +} + +/*PRINTFLIKE3*/ +int +zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + if (zfs_common_error(hdl, error, fmt, ap) != 0) { + va_end(ap); + return (-1); + } + + switch (error) { + case ENXIO: + case ENODEV: + case EPIPE: + zfs_verror(hdl, EZFS_IO, fmt, ap); + break; + + case ENOENT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset does not exist")); + zfs_verror(hdl, EZFS_NOENT, fmt, ap); + break; + + case ENOSPC: + case EDQUOT: + zfs_verror(hdl, EZFS_NOSPC, fmt, ap); + break; + + case EEXIST: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset already exists")); + zfs_verror(hdl, EZFS_EXISTS, fmt, ap); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "dataset is busy")); + zfs_verror(hdl, EZFS_BUSY, fmt, ap); + break; + case EROFS: + zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); + break; + case ENAMETOOLONG: + zfs_verror(hdl, EZFS_NAMETOOLONG, fmt, ap); + break; + case ENOTSUP: + zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); + break; + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; + case EREMOTEIO: + zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); + break; + case ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE: + case ZFS_ERR_IOC_CMD_UNAVAIL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " + "module does not support this operation. A reboot may " + "be required to enable this operation.")); + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; + case ZFS_ERR_IOC_ARG_UNAVAIL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " + "module does not support an option for this operation. " + "A reboot may be required to enable this option.")); + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; + case ZFS_ERR_IOC_ARG_REQUIRED: + case ZFS_ERR_IOC_ARG_BADTYPE: + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; + case ZFS_ERR_WRONG_PARENT: + zfs_verror(hdl, EZFS_WRONG_PARENT, fmt, ap); + break; + case ZFS_ERR_BADPROP: + zfs_verror(hdl, EZFS_BADPROP, fmt, ap); + break; + default: + zfs_error_aux(hdl, strerror(error)); + zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); + break; + } + + va_end(ap); + return (-1); +} + +void +zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, + char *errbuf) +{ + switch (err) { + + case ENOSPC: + /* + * For quotas and reservations, ENOSPC indicates + * something different; setting a quota or reservation + * doesn't use any disk space. 
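+		 * For example, setting quota=1G on a dataset whose
+		 * descendants already use 2G fails with ENOSPC and is
+		 * reported below as "size is less than current used or
+		 * reserved space", not as an ordinary out-of-space error.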
+ */ + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is less than current used or " + "reserved space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; + + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is greater than available space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; + + default: + (void) zfs_standard_error(hdl, err, errbuf); + break; + } + break; + + case EBUSY: + (void) zfs_standard_error(hdl, EBUSY, errbuf); + break; + + case EROFS: + (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); + break; + + case E2BIG: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property value too long")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + break; + + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool and or dataset must be upgraded to set this " + "property or value")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; + + case ERANGE: + if (prop == ZFS_PROP_COMPRESSION || + prop == ZFS_PROP_DNODESIZE || + prop == ZFS_PROP_RECORDSIZE) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "bootable datasets")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else if (prop == ZFS_PROP_CHECKSUM || + prop == ZFS_PROP_DEDUP) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "root pools")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EINVAL: + if (prop == ZPROP_INVAL) { + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case ZFS_ERR_BADPROP: + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + break; + + case EACCES: + if (prop == ZFS_PROP_KEYLOCATION) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "keylocation may only be set on encryption roots")); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + } else { + (void) zfs_standard_error(hdl, err, errbuf); + } + break; + + case EOVERFLOW: + /* + * This platform can't address a volume this big. + */ +#ifdef _ILP32 + if (prop == ZFS_PROP_VOLSIZE) { + (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); + break; + } +#endif + /* FALLTHROUGH */ + default: + (void) zfs_standard_error(hdl, err, errbuf); + } +} + +int +zpool_standard_error(libzfs_handle_t *hdl, int error, const char *msg) +{ + return (zpool_standard_error_fmt(hdl, error, "%s", msg)); +} + +/*PRINTFLIKE3*/ +int +zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
+{ + va_list ap; + + va_start(ap, fmt); + + if (zfs_common_error(hdl, error, fmt, ap) != 0) { + va_end(ap); + return (-1); + } + + switch (error) { + case ENODEV: + zfs_verror(hdl, EZFS_NODEVICE, fmt, ap); + break; + + case ENOENT: + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "no such pool or dataset")); + zfs_verror(hdl, EZFS_NOENT, fmt, ap); + break; + + case EEXIST: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool already exists")); + zfs_verror(hdl, EZFS_EXISTS, fmt, ap); + break; + + case EBUSY: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "pool is busy")); + zfs_verror(hdl, EZFS_BUSY, fmt, ap); + break; + + /* There is no pending operation to cancel */ + case ENOTACTIVE: + zfs_verror(hdl, EZFS_NO_PENDING, fmt, ap); + break; + + case ENXIO: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "one or more devices is currently unavailable")); + zfs_verror(hdl, EZFS_BADDEV, fmt, ap); + break; + + case ENAMETOOLONG: + zfs_verror(hdl, EZFS_DEVOVERFLOW, fmt, ap); + break; + + case ENOTSUP: + zfs_verror(hdl, EZFS_POOL_NOTSUP, fmt, ap); + break; + + case EINVAL: + zfs_verror(hdl, EZFS_POOL_INVALARG, fmt, ap); + break; + + case ENOSPC: + case EDQUOT: + zfs_verror(hdl, EZFS_NOSPC, fmt, ap); + return (-1); + + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; + + case EROFS: + zfs_verror(hdl, EZFS_POOLREADONLY, fmt, ap); + break; + case EDOM: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "block size out of range or does not match")); + zfs_verror(hdl, EZFS_BADPROP, fmt, ap); + break; + case EREMOTEIO: + zfs_verror(hdl, EZFS_ACTIVE_POOL, fmt, ap); + break; + case ZFS_ERR_CHECKPOINT_EXISTS: + zfs_verror(hdl, EZFS_CHECKPOINT_EXISTS, fmt, ap); + break; + case ZFS_ERR_DISCARDING_CHECKPOINT: + zfs_verror(hdl, EZFS_DISCARDING_CHECKPOINT, fmt, ap); + break; + case ZFS_ERR_NO_CHECKPOINT: + zfs_verror(hdl, EZFS_NO_CHECKPOINT, fmt, ap); + break; + case ZFS_ERR_DEVRM_IN_PROGRESS: + zfs_verror(hdl, EZFS_DEVRM_IN_PROGRESS, fmt, ap); + break; + case ZFS_ERR_VDEV_TOO_BIG: + zfs_verror(hdl, EZFS_VDEV_TOO_BIG, fmt, ap); + break; + case ZFS_ERR_EXPORT_IN_PROGRESS: + zfs_verror(hdl, EZFS_EXPORT_IN_PROGRESS, fmt, ap); + break; + case ZFS_ERR_RESILVER_IN_PROGRESS: + zfs_verror(hdl, EZFS_RESILVERING, fmt, ap); + break; + case ZFS_ERR_REBUILD_IN_PROGRESS: + zfs_verror(hdl, EZFS_REBUILDING, fmt, ap); + break; + case ZFS_ERR_BADPROP: + zfs_verror(hdl, EZFS_BADPROP, fmt, ap); + break; + case ZFS_ERR_IOC_CMD_UNAVAIL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " + "module does not support this operation. A reboot may " + "be required to enable this operation.")); + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; + case ZFS_ERR_IOC_ARG_UNAVAIL: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "the loaded zfs " + "module does not support an option for this operation. " + "A reboot may be required to enable this option.")); + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; + case ZFS_ERR_IOC_ARG_REQUIRED: + case ZFS_ERR_IOC_ARG_BADTYPE: + zfs_verror(hdl, EZFS_IOC_NOTSUPPORTED, fmt, ap); + break; + default: + zfs_error_aux(hdl, strerror(error)); + zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); + } + + va_end(ap); + return (-1); +} + +/* + * Display an out of memory error message and abort the current program. + */ +int +no_memory(libzfs_handle_t *hdl) +{ + return (zfs_error(hdl, EZFS_NOMEM, "internal error")); +} + +/* + * A safe form of malloc() which will die if the allocation fails. 
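+ * The memory is also zeroed (it comes from calloc()), so callers need
+ * not clear it themselves, e.g.:
+ *
+ *	zfs_cmd_t *zc = zfs_alloc(hdl, sizeof (zfs_cmd_t));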
+ */ +void * +zfs_alloc(libzfs_handle_t *hdl, size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + (void) no_memory(hdl); + + return (data); +} + +/* + * A safe form of asprintf() which will die if the allocation fails. + */ +/*PRINTFLIKE2*/ +char * +zfs_asprintf(libzfs_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + char *ret; + int err; + + va_start(ap, fmt); + + err = vasprintf(&ret, fmt, ap); + + va_end(ap); + + if (err < 0) + (void) no_memory(hdl); + + return (ret); +} + +/* + * A safe form of realloc(), which also zeroes newly allocated space. + */ +void * +zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) +{ + void *ret; + + if ((ret = realloc(ptr, newsize)) == NULL) { + (void) no_memory(hdl); + return (NULL); + } + + bzero((char *)ret + oldsize, (newsize - oldsize)); + return (ret); +} + +/* + * A safe form of strdup() which will die if the allocation fails. + */ +char * +zfs_strdup(libzfs_handle_t *hdl, const char *str) +{ + char *ret; + + if ((ret = strdup(str)) == NULL) + (void) no_memory(hdl); + + return (ret); +} + +void +libzfs_print_on_error(libzfs_handle_t *hdl, boolean_t printerr) +{ + hdl->libzfs_printerr = printerr; +} + +/* + * Read lines from an open file descriptor and store them in an array of + * strings until EOF. lines[] will be allocated and populated with all the + * lines read. All newlines are replaced with NULL terminators for + * convenience. lines[] must be freed after use with libzfs_free_str_array(). + * + * Returns the number of lines read. + */ +static int +libzfs_read_stdout_from_fd(int fd, char **lines[]) +{ + + FILE *fp; + int lines_cnt = 0; + size_t len = 0; + char *line = NULL; + char **tmp_lines = NULL, **tmp; + char *nl = NULL; + int rc; + + fp = fdopen(fd, "r"); + if (fp == NULL) + return (0); + while (1) { + rc = getline(&line, &len, fp); + if (rc == -1) + break; + + tmp = realloc(tmp_lines, sizeof (*tmp_lines) * (lines_cnt + 1)); + if (tmp == NULL) { + /* Return the lines we were able to process */ + break; + } + tmp_lines = tmp; + + /* Terminate newlines */ + if ((nl = strchr(line, '\n')) != NULL) + *nl = '\0'; + tmp_lines[lines_cnt] = line; + lines_cnt++; + line = NULL; + } + fclose(fp); + *lines = tmp_lines; + return (lines_cnt); +} + +static int +libzfs_run_process_impl(const char *path, char *argv[], char *env[], int flags, + char **lines[], int *lines_cnt) +{ + pid_t pid; + int error, devnull_fd; + int link[2]; + + /* + * Setup a pipe between our child and parent process if we're + * reading stdout. 
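+	 * The child's stdout is redirected into the pipe's write end, and
+	 * the parent drains the read end with libzfs_read_stdout_from_fd()
+	 * once the child has exited.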
+ */ + if ((lines != NULL) && pipe(link) == -1) + return (-EPIPE); + + pid = vfork(); + if (pid == 0) { + /* Child process */ + devnull_fd = open("/dev/null", O_WRONLY); + + if (devnull_fd < 0) + _exit(-1); + + if (!(flags & STDOUT_VERBOSE) && (lines == NULL)) + (void) dup2(devnull_fd, STDOUT_FILENO); + else if (lines != NULL) { + /* Save the output to lines[] */ + dup2(link[1], STDOUT_FILENO); + close(link[0]); + close(link[1]); + } + + if (!(flags & STDERR_VERBOSE)) + (void) dup2(devnull_fd, STDERR_FILENO); + + close(devnull_fd); + + if (flags & NO_DEFAULT_PATH) { + if (env == NULL) + execv(path, argv); + else + execve(path, argv, env); + } else { + if (env == NULL) + execvp(path, argv); + else + execvpe(path, argv, env); + } + + _exit(-1); + } else if (pid > 0) { + /* Parent process */ + int status; + + while ((error = waitpid(pid, &status, 0)) == -1 && + errno == EINTR) { } + if (error < 0 || !WIFEXITED(status)) + return (-1); + + if (lines != NULL) { + close(link[1]); + *lines_cnt = libzfs_read_stdout_from_fd(link[0], lines); + } + return (WEXITSTATUS(status)); + } + + return (-1); +} + +int +libzfs_run_process(const char *path, char *argv[], int flags) +{ + return (libzfs_run_process_impl(path, argv, NULL, flags, NULL, NULL)); +} + +/* + * Run a command and store its stdout lines in an array of strings (lines[]). + * lines[] is allocated and populated for you, and the number of lines is set in + * lines_cnt. lines[] must be freed after use with libzfs_free_str_array(). + * All newlines (\n) in lines[] are terminated for convenience. + */ +int +libzfs_run_process_get_stdout(const char *path, char *argv[], char *env[], + char **lines[], int *lines_cnt) +{ + return (libzfs_run_process_impl(path, argv, env, 0, lines, lines_cnt)); +} + +/* + * Same as libzfs_run_process_get_stdout(), but run without $PATH set. This + * means that *path needs to be the full path to the executable. + */ +int +libzfs_run_process_get_stdout_nopath(const char *path, char *argv[], + char *env[], char **lines[], int *lines_cnt) +{ + return (libzfs_run_process_impl(path, argv, env, NO_DEFAULT_PATH, + lines, lines_cnt)); +} + +/* + * Free an array of strings. Free both the strings contained in the array and + * the array itself. + */ +void +libzfs_free_str_array(char **strs, int count) +{ + while (--count >= 0) + free(strs[count]); + + free(strs); +} + +/* + * Returns 1 if environment variable is set to "YES", "yes", "ON", "on", or + * a non-zero number. + * + * Returns 0 otherwise. 
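+ * For example, "ON", "yes" and "2" all yield 1, while "off", "0",
+ * "yessir" or an unset variable yield 0.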
+ */ +int +libzfs_envvar_is_set(char *envvar) +{ + char *env = getenv(envvar); + if (env && (strtoul(env, NULL, 0) > 0 || + (!strncasecmp(env, "YES", 3) && strnlen(env, 4) == 3) || + (!strncasecmp(env, "ON", 2) && strnlen(env, 3) == 2))) + return (1); + + return (0); +} + +libzfs_handle_t * +libzfs_init(void) +{ + libzfs_handle_t *hdl; + int error; + + error = libzfs_load_module(); + if (error) { + errno = error; + return (NULL); + } + + if ((hdl = calloc(1, sizeof (libzfs_handle_t))) == NULL) { + return (NULL); + } + + if (regcomp(&hdl->libzfs_urire, URI_REGEX, 0) != 0) { + free(hdl); + return (NULL); + } + + if ((hdl->libzfs_fd = open(ZFS_DEV, O_RDWR|O_EXCL)) < 0) { + free(hdl); + return (NULL); + } + +#ifdef HAVE_SETMNTENT + if ((hdl->libzfs_mnttab = setmntent(MNTTAB, "r")) == NULL) { +#else + if ((hdl->libzfs_mnttab = fopen(MNTTAB, "r")) == NULL) { +#endif + (void) close(hdl->libzfs_fd); + free(hdl); + return (NULL); + } + + if (libzfs_core_init() != 0) { + (void) close(hdl->libzfs_fd); + (void) fclose(hdl->libzfs_mnttab); + free(hdl); + return (NULL); + } + + zfs_prop_init(); + zpool_prop_init(); + zpool_feature_init(); + libzfs_mnttab_init(hdl); + fletcher_4_init(); + + if (getenv("ZFS_PROP_DEBUG") != NULL) { + hdl->libzfs_prop_debug = B_TRUE; + } + + /* + * For testing, remove some settable properties and features + */ + if (libzfs_envvar_is_set("ZFS_SYSFS_PROP_SUPPORT_TEST")) { + zprop_desc_t *proptbl; + + proptbl = zpool_prop_get_table(); + proptbl[ZPOOL_PROP_COMMENT].pd_zfs_mod_supported = B_FALSE; + + proptbl = zfs_prop_get_table(); + proptbl[ZFS_PROP_DNODESIZE].pd_zfs_mod_supported = B_FALSE; + + zfeature_info_t *ftbl = spa_feature_table; + ftbl[SPA_FEATURE_LARGE_BLOCKS].fi_zfs_mod_supported = B_FALSE; + } + + return (hdl); +} + +void +libzfs_fini(libzfs_handle_t *hdl) +{ + (void) close(hdl->libzfs_fd); + if (hdl->libzfs_mnttab) +#ifdef HAVE_SETMNTENT + (void) endmntent(hdl->libzfs_mnttab); +#else + (void) fclose(hdl->libzfs_mnttab); +#endif + zpool_free_handles(hdl); + namespace_clear(hdl); + libzfs_mnttab_fini(hdl); + libzfs_core_fini(); + regfree(&hdl->libzfs_urire); + fletcher_4_fini(); + free(hdl); +} + +libzfs_handle_t * +zpool_get_handle(zpool_handle_t *zhp) +{ + return (zhp->zpool_hdl); +} + +libzfs_handle_t * +zfs_get_handle(zfs_handle_t *zhp) +{ + return (zhp->zfs_hdl); +} + +zpool_handle_t * +zfs_get_pool_handle(const zfs_handle_t *zhp) +{ + return (zhp->zpool_hdl); +} + +/* + * Given a name, determine whether or not it's a valid path + * (starts with '/' or "./"). If so, walk the mnttab trying + * to match the device number. If not, treat the path as an + * fs/vol/snap/bkmark name. + */ +zfs_handle_t * +zfs_path_to_zhandle(libzfs_handle_t *hdl, const char *path, zfs_type_t argtype) +{ + struct stat64 statbuf; + struct extmnttab entry; + + if (path[0] != '/' && strncmp(path, "./", strlen("./")) != 0) { + /* + * It's not a valid path, assume it's a name of type 'argtype'. + */ + return (zfs_open(hdl, path, argtype)); + } + + /* Reopen MNTTAB to prevent reading stale data from open file */ + if (freopen(MNTTAB, "r", hdl->libzfs_mnttab) == NULL) + return (NULL); + + if (getextmntent(path, &entry, &statbuf) != 0) + return (NULL); + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) { + (void) fprintf(stderr, gettext("'%s': not a ZFS filesystem\n"), + path); + return (NULL); + } + + return (zfs_open(hdl, entry.mnt_special, ZFS_TYPE_FILESYSTEM)); +} + +/* + * Initialize the zc_nvlist_dst member to prepare for receiving an nvlist from + * an ioctl(). 
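
For orientation, a minimal client lifecycle around the libzfs_init()/libzfs_fini() pair defined above (an illustrative sketch, not part of the import):

	#include <libzfs.h>

	int
	main(void)
	{
		libzfs_handle_t *hdl;

		if ((hdl = libzfs_init()) == NULL)
			return (1);	/* errno describes the failure */
		libzfs_print_on_error(hdl, B_TRUE);
		/* ... zfs_open()/zpool_open() calls go here ... */
		libzfs_fini(hdl);
		return (0);
	}
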
+ */ +int +zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) +{ + if (len == 0) + len = 256 * 1024; + zc->zc_nvlist_dst_size = len; + zc->zc_nvlist_dst = + (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); + if (zc->zc_nvlist_dst == 0) + return (-1); + + return (0); +} + +/* + * Called when an ioctl() which returns an nvlist fails with ENOMEM. This will + * expand the nvlist to the size specified in 'zc_nvlist_dst_size', which was + * filled in by the kernel to indicate the actual required size. + */ +int +zcmd_expand_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc) +{ + free((void *)(uintptr_t)zc->zc_nvlist_dst); + zc->zc_nvlist_dst = + (uint64_t)(uintptr_t)zfs_alloc(hdl, zc->zc_nvlist_dst_size); + if (zc->zc_nvlist_dst == 0) + return (-1); + + return (0); +} + +/* + * Called to free the src and dst nvlists stored in the command structure. + */ +void +zcmd_free_nvlists(zfs_cmd_t *zc) +{ + free((void *)(uintptr_t)zc->zc_nvlist_conf); + free((void *)(uintptr_t)zc->zc_nvlist_src); + free((void *)(uintptr_t)zc->zc_nvlist_dst); + zc->zc_nvlist_conf = 0; + zc->zc_nvlist_src = 0; + zc->zc_nvlist_dst = 0; +} + +static int +zcmd_write_nvlist_com(libzfs_handle_t *hdl, uint64_t *outnv, uint64_t *outlen, + nvlist_t *nvl) +{ + char *packed; + size_t len; + + verify(nvlist_size(nvl, &len, NV_ENCODE_NATIVE) == 0); + + if ((packed = zfs_alloc(hdl, len)) == NULL) + return (-1); + + verify(nvlist_pack(nvl, &packed, &len, NV_ENCODE_NATIVE, 0) == 0); + + *outnv = (uint64_t)(uintptr_t)packed; + *outlen = len; + + return (0); +} + +int +zcmd_write_conf_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) +{ + return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_conf, + &zc->zc_nvlist_conf_size, nvl)); +} + +int +zcmd_write_src_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t *nvl) +{ + return (zcmd_write_nvlist_com(hdl, &zc->zc_nvlist_src, + &zc->zc_nvlist_src_size, nvl)); +} + +/* + * Unpacks an nvlist from the ZFS ioctl command structure. + */ +int +zcmd_read_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, nvlist_t **nvlp) +{ + if (nvlist_unpack((void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, nvlp, 0) != 0) + return (no_memory(hdl)); + + return (0); +} + +/* + * ================================================================ + * API shared by zfs and zpool property management + * ================================================================ + */ + +static void +zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) +{ + zprop_list_t *pl = cbp->cb_proplist; + int i; + char *title; + size_t len; + + cbp->cb_first = B_FALSE; + if (cbp->cb_scripted) + return; + + /* + * Start with the length of the column headers. + */ + cbp->cb_colwidths[GET_COL_NAME] = strlen(dgettext(TEXT_DOMAIN, "NAME")); + cbp->cb_colwidths[GET_COL_PROPERTY] = strlen(dgettext(TEXT_DOMAIN, + "PROPERTY")); + cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, + "VALUE")); + cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, + "RECEIVED")); + cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, + "SOURCE")); + + /* first property is always NAME */ + assert(cbp->cb_proplist->pl_prop == + ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); + + /* + * Go through and calculate the widths for each column. For the + * 'source' column, we kludge it up by taking the worst-case scenario of + * inheriting from the longest name. 
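
Returning to the zcmd_*() helpers above: they exist to support the usual retry idiom around nvlist-returning ioctls. A sketch of a hypothetical caller, assuming an open libzfs_handle_t *hdl and abbreviated error handling:

	zfs_cmd_t zc = {"\0"};
	nvlist_t *nvl = NULL;

	(void) strlcpy(zc.zc_name, "tank", sizeof (zc.zc_name));
	if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0)	/* 0 -> 256K default */
		return (-1);
	while (zfs_ioctl(hdl, ZFS_IOC_POOL_STATS, &zc) != 0) {
		if (errno != ENOMEM)
			break;
		/* kernel stored the required size in zc_nvlist_dst_size */
		if (zcmd_expand_dst_nvlist(hdl, &zc) != 0)
			break;
	}
	if (zcmd_read_dst_nvlist(hdl, &zc, &nvl) == 0)
		nvlist_free(nvl);
	zcmd_free_nvlists(&zc);
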
This is acceptable because in the + * majority of cases 'SOURCE' is the last column displayed, and we don't + * use the width anyway. Note that the 'VALUE' column can be oversized, + * if the name of the property is much longer than any values we find. + */ + for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { + /* + * 'PROPERTY' column + */ + if (pl->pl_prop != ZPROP_INVAL) { + const char *propname = (type == ZFS_TYPE_POOL) ? + zpool_prop_to_name(pl->pl_prop) : + zfs_prop_to_name(pl->pl_prop); + + len = strlen(propname); + if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) + cbp->cb_colwidths[GET_COL_PROPERTY] = len; + } else { + len = strlen(pl->pl_user_prop); + if (len > cbp->cb_colwidths[GET_COL_PROPERTY]) + cbp->cb_colwidths[GET_COL_PROPERTY] = len; + } + + /* + * 'VALUE' column. The first property is always the 'name' + * property that was tacked on either by /sbin/zfs's + * zfs_do_get() or when calling zprop_expand_list(), so we + * ignore its width. If the user specified the name property + * to display, then it will be later in the list in any case. + */ + if (pl != cbp->cb_proplist && + pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) + cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; + + /* 'RECEIVED' column. */ + if (pl != cbp->cb_proplist && + pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) + cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; + + /* + * 'NAME' and 'SOURCE' columns + */ + if (pl->pl_prop == (type == ZFS_TYPE_POOL ? ZPOOL_PROP_NAME : + ZFS_PROP_NAME) && + pl->pl_width > cbp->cb_colwidths[GET_COL_NAME]) { + cbp->cb_colwidths[GET_COL_NAME] = pl->pl_width; + cbp->cb_colwidths[GET_COL_SOURCE] = pl->pl_width + + strlen(dgettext(TEXT_DOMAIN, "inherited from")); + } + } + + /* + * Now go through and print the headers. + */ + for (i = 0; i < ZFS_GET_NCOLS; i++) { + switch (cbp->cb_columns[i]) { + case GET_COL_NAME: + title = dgettext(TEXT_DOMAIN, "NAME"); + break; + case GET_COL_PROPERTY: + title = dgettext(TEXT_DOMAIN, "PROPERTY"); + break; + case GET_COL_VALUE: + title = dgettext(TEXT_DOMAIN, "VALUE"); + break; + case GET_COL_RECVD: + title = dgettext(TEXT_DOMAIN, "RECEIVED"); + break; + case GET_COL_SOURCE: + title = dgettext(TEXT_DOMAIN, "SOURCE"); + break; + default: + title = NULL; + } + + if (title != NULL) { + if (i == (ZFS_GET_NCOLS - 1) || + cbp->cb_columns[i + 1] == GET_COL_NONE) + (void) printf("%s", title); + else + (void) printf("%-*s ", + cbp->cb_colwidths[cbp->cb_columns[i]], + title); + } + } + (void) printf("\n"); +} + +/* + * Display a single line of output, according to the settings in the callback + * structure. + */ +void +zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, + const char *propname, const char *value, zprop_source_t sourcetype, + const char *source, const char *recvd_value) +{ + int i; + const char *str = NULL; + char buf[128]; + + /* + * Ignore those source types that the user has chosen to ignore. 
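
To illustrate the callback structure driving these printers (a sketch with made-up values; a real caller first populates cb_proplist via zprop_get_list() or zprop_expand_list() before headers can be printed):

	zprop_get_cbdata_t cb = { 0 };

	cb.cb_columns[0] = GET_COL_NAME;
	cb.cb_columns[1] = GET_COL_PROPERTY;
	cb.cb_columns[2] = GET_COL_VALUE;
	cb.cb_columns[3] = GET_COL_SOURCE;
	cb.cb_sources = ZPROP_SRC_ALL;
	cb.cb_type = ZFS_TYPE_FILESYSTEM;
	cb.cb_first = B_FALSE;	/* headers need cb_proplist set first */

	/* Prints one row: tank/home  compression  lz4  local */
	zprop_print_one_property("tank/home", &cb, "compression", "lz4",
	    ZPROP_SRC_LOCAL, NULL, NULL);
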
+ */ + if ((sourcetype & cbp->cb_sources) == 0) + return; + + if (cbp->cb_first) + zprop_print_headers(cbp, cbp->cb_type); + + for (i = 0; i < ZFS_GET_NCOLS; i++) { + switch (cbp->cb_columns[i]) { + case GET_COL_NAME: + str = name; + break; + + case GET_COL_PROPERTY: + str = propname; + break; + + case GET_COL_VALUE: + str = value; + break; + + case GET_COL_SOURCE: + switch (sourcetype) { + case ZPROP_SRC_NONE: + str = "-"; + break; + + case ZPROP_SRC_DEFAULT: + str = "default"; + break; + + case ZPROP_SRC_LOCAL: + str = "local"; + break; + + case ZPROP_SRC_TEMPORARY: + str = "temporary"; + break; + + case ZPROP_SRC_INHERITED: + (void) snprintf(buf, sizeof (buf), + "inherited from %s", source); + str = buf; + break; + case ZPROP_SRC_RECEIVED: + str = "received"; + break; + + default: + str = NULL; + assert(!"unhandled zprop_source_t"); + } + break; + + case GET_COL_RECVD: + str = (recvd_value == NULL ? "-" : recvd_value); + break; + + default: + continue; + } + + if (i == (ZFS_GET_NCOLS - 1) || + cbp->cb_columns[i + 1] == GET_COL_NONE) + (void) printf("%s", str); + else if (cbp->cb_scripted) + (void) printf("%s\t", str); + else + (void) printf("%-*s ", + cbp->cb_colwidths[cbp->cb_columns[i]], + str); + } + + (void) printf("\n"); +} + +/* + * Given a numeric suffix, convert the value into a number of bits that the + * resulting value must be shifted. + */ +static int +str2shift(libzfs_handle_t *hdl, const char *buf) +{ + const char *ends = "BKMGTPEZ"; + int i; + + if (buf[0] == '\0') + return (0); + for (i = 0; i < strlen(ends); i++) { + if (toupper(buf[0]) == ends[i]) + break; + } + if (i == strlen(ends)) { + if (hdl) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid numeric suffix '%s'"), buf); + return (-1); + } + + /* + * Allow 'G' = 'GB' = 'GiB', case-insensitively. + * However, 'BB' and 'BiB' are disallowed. + */ + if (buf[1] == '\0' || + (toupper(buf[0]) != 'B' && + ((toupper(buf[1]) == 'B' && buf[2] == '\0') || + (toupper(buf[1]) == 'I' && toupper(buf[2]) == 'B' && + buf[3] == '\0')))) + return (10 * i); + + if (hdl) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid numeric suffix '%s'"), buf); + return (-1); +} + +/* + * Convert a string of the form '100G' into a real number. Used when setting + * properties or creating a volume. 'buf' is used to place an extended error + * message for the caller to use. + */ +int +zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) +{ + char *end; + int shift; + + *num = 0; + + /* Check to see if this looks like a number. */ + if ((value[0] < '0' || value[0] > '9') && value[0] != '.') { + if (hdl) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bad numeric value '%s'"), value); + return (-1); + } + + /* Rely on strtoull() to process the numeric portion. */ + errno = 0; + *num = strtoull(value, &end, 10); + + /* + * Check for ERANGE, which indicates that the value is too large to fit + * in a 64-bit value. + */ + if (errno == ERANGE) { + if (hdl) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "numeric value is too large")); + return (-1); + } + + /* + * If we have a decimal value, then do the computation with floating + * point arithmetic. Otherwise, use standard arithmetic. + */ + if (*end == '.') { + double fval = strtod(value, &end); + + if ((shift = str2shift(hdl, end)) == -1) + return (-1); + + fval *= pow(2, shift); + + /* + * UINT64_MAX is not exactly representable as a double. + * The closest representation is UINT64_MAX + 1, so we + * use a >= comparison instead of > for the bounds check. 
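
For reference, the suffix parser str2shift() above behaves like this (an illustrative sketch assuming <assert.h>; str2shift() is file-local, so these lines would only compile inside this file):

	assert(str2shift(NULL, "") == 0);	/* no suffix */
	assert(str2shift(NULL, "K") == 10);
	assert(str2shift(NULL, "G") == 30);
	assert(str2shift(NULL, "GB") == 30);	/* 'G' == 'GB' == 'GiB' */
	assert(str2shift(NULL, "GiB") == 30);
	assert(str2shift(NULL, "BB") == -1);	/* explicitly disallowed */
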
+ */ + if (fval >= (double)UINT64_MAX) { + if (hdl) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "numeric value is too large")); + return (-1); + } + + *num = (uint64_t)fval; + } else { + if ((shift = str2shift(hdl, end)) == -1) + return (-1); + + /* Check for overflow */ + if (shift >= 64 || (*num << shift) >> shift != *num) { + if (hdl) + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "numeric value is too large")); + return (-1); + } + + *num <<= shift; + } + + return (0); +} + +/* + * Given a propname=value nvpair to set, parse any numeric properties + * (index, boolean, etc) if they are specified as strings and add the + * resulting nvpair to the returned nvlist. + * + * At the DSL layer, all properties are either 64-bit numbers or strings. + * We want the user to be able to ignore this fact and specify properties + * as native values (numbers, for example) or as strings (to simplify + * command line utilities). This also handles converting index types + * (compression, checksum, etc) from strings to their on-disk index. + */ +int +zprop_parse_value(libzfs_handle_t *hdl, nvpair_t *elem, int prop, + zfs_type_t type, nvlist_t *ret, char **svalp, uint64_t *ivalp, + const char *errbuf) +{ + data_type_t datatype = nvpair_type(elem); + zprop_type_t proptype; + const char *propname; + char *value; + boolean_t isnone = B_FALSE; + boolean_t isauto = B_FALSE; + int err = 0; + + if (type == ZFS_TYPE_POOL) { + proptype = zpool_prop_get_type(prop); + propname = zpool_prop_to_name(prop); + } else { + proptype = zfs_prop_get_type(prop); + propname = zfs_prop_to_name(prop); + } + + /* + * Convert any properties to the internal DSL value types. + */ + *svalp = NULL; + *ivalp = 0; + + switch (proptype) { + case PROP_TYPE_STRING: + if (datatype != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), nvpair_name(elem)); + goto error; + } + err = nvpair_value_string(elem, svalp); + if (err != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is invalid"), nvpair_name(elem)); + goto error; + } + if (strlen(*svalp) >= ZFS_MAXPROPLEN) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' is too long"), nvpair_name(elem)); + goto error; + } + break; + + case PROP_TYPE_NUMBER: + if (datatype == DATA_TYPE_STRING) { + (void) nvpair_value_string(elem, &value); + if (strcmp(value, "none") == 0) { + isnone = B_TRUE; + } else if (strcmp(value, "auto") == 0) { + isauto = B_TRUE; + } else if (zfs_nicestrtonum(hdl, value, ivalp) != 0) { + goto error; + } + } else if (datatype == DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(elem, ivalp); + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a number"), nvpair_name(elem)); + goto error; + } + + /* + * Quota special: force 'none' and don't allow 0. + */ + if ((type & ZFS_TYPE_DATASET) && *ivalp == 0 && !isnone && + (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_REFQUOTA)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable quota/refquota")); + goto error; + } + + /* + * Special handling for "*_limit=none". In this case it's not + * 0 but UINT64_MAX. + */ + if ((type & ZFS_TYPE_DATASET) && isnone && + (prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT)) { + *ivalp = UINT64_MAX; + } + + /* + * Special handling for setting 'refreservation' to 'auto'. Use + * UINT64_MAX to tell the caller to use zfs_fix_auto_resv(). + * 'auto' is only allowed on volumes. 
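
Stepping back to zfs_nicestrtonum() defined above, a sketch of representative conversions (illustrative asserts, not part of the import; a NULL handle suppresses error reporting):

	uint64_t n;

	assert(zfs_nicestrtonum(NULL, "512", &n) == 0 && n == 512);
	assert(zfs_nicestrtonum(NULL, "100G", &n) == 0 && n == (100ULL << 30));
	assert(zfs_nicestrtonum(NULL, "1.5K", &n) == 0 && n == 1536);
	assert(zfs_nicestrtonum(NULL, "16EB", &n) == -1);	/* > 64 bits */
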
+ */ + if (isauto) { + switch (prop) { + case ZFS_PROP_REFRESERVATION: + if ((type & ZFS_TYPE_VOLUME) == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s=auto' only allowed on " + "volumes"), nvpair_name(elem)); + goto error; + } + *ivalp = UINT64_MAX; + break; + default: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'auto' is invalid value for '%s'"), + nvpair_name(elem)); + goto error; + } + } + + break; + + case PROP_TYPE_INDEX: + if (datatype != DATA_TYPE_STRING) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a string"), nvpair_name(elem)); + goto error; + } + + (void) nvpair_value_string(elem, &value); + + if (zprop_string_to_index(prop, value, ivalp, type) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be one of '%s'"), propname, + zprop_values(prop, type)); + goto error; + } + break; + + default: + abort(); + } + + /* + * Add the result to our return set of properties. + */ + if (*svalp != NULL) { + if (nvlist_add_string(ret, propname, *svalp) != 0) { + (void) no_memory(hdl); + return (-1); + } + } else { + if (nvlist_add_uint64(ret, propname, *ivalp) != 0) { + (void) no_memory(hdl); + return (-1); + } + } + + return (0); +error: + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + return (-1); +} + +static int +addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, + zfs_type_t type) +{ + int prop; + zprop_list_t *entry; + + prop = zprop_name_to_prop(propname, type); + + if (prop != ZPROP_INVAL && !zprop_valid_for_type(prop, type, B_FALSE)) + prop = ZPROP_INVAL; + + /* + * When no property table entry can be found, return failure if + * this is a pool property or if this isn't a user-defined + * dataset property, + */ + if (prop == ZPROP_INVAL && ((type == ZFS_TYPE_POOL && + !zpool_prop_feature(propname) && + !zpool_prop_unsupported(propname)) || + (type == ZFS_TYPE_DATASET && !zfs_prop_user(propname) && + !zfs_prop_userquota(propname) && !zfs_prop_written(propname)))) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + return (zfs_error(hdl, EZFS_BADPROP, + dgettext(TEXT_DOMAIN, "bad property list"))); + } + + if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) + return (-1); + + entry->pl_prop = prop; + if (prop == ZPROP_INVAL) { + if ((entry->pl_user_prop = zfs_strdup(hdl, propname)) == + NULL) { + free(entry); + return (-1); + } + entry->pl_width = strlen(propname); + } else { + entry->pl_width = zprop_width(prop, &entry->pl_fixed, + type); + } + + *listp = entry; + + return (0); +} + +/* + * Given a comma-separated list of properties, construct a property list + * containing both user-defined and native properties. This function will + * return a NULL list if 'all' is specified, which can later be expanded + * by zprop_expand_list(). + */ +int +zprop_get_list(libzfs_handle_t *hdl, char *props, zprop_list_t **listp, + zfs_type_t type) +{ + *listp = NULL; + + /* + * If 'all' is specified, return a NULL list. + */ + if (strcmp(props, "all") == 0) + return (0); + + /* + * If no props were specified, return an error. + */ + if (props[0] == '\0') { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "no properties specified")); + return (zfs_error(hdl, EZFS_BADPROP, dgettext(TEXT_DOMAIN, + "bad property list"))); + } + + /* + * It would be nice to use getsubopt() here, but the inclusion of column + * aliases makes this more effort than it's worth. 
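
A hypothetical caller's view of zprop_get_list(), for illustration (the property string is made up; note it is parsed destructively, so it must be writable, and an open libzfs_handle_t *hdl is assumed):

	zprop_list_t *pl = NULL;
	char props[] = "name,used,available";	/* must not be a literal */

	if (zprop_get_list(hdl, props, &pl, ZFS_TYPE_FILESYSTEM) == 0) {
		zprop_list_t *p;

		for (p = pl; p != NULL; p = p->pl_next) {
			/* p->pl_prop (or p->pl_user_prop) names one column */
		}
		zprop_free_list(pl);
	}
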
+ */ + while (*props != '\0') { + size_t len; + char *p; + char c; + + if ((p = strchr(props, ',')) == NULL) { + len = strlen(props); + p = props + len; + } else { + len = p - props; + } + + /* + * Check for empty options. + */ + if (len == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "empty property name")); + return (zfs_error(hdl, EZFS_BADPROP, + dgettext(TEXT_DOMAIN, "bad property list"))); + } + + /* + * Check all regular property names. + */ + c = props[len]; + props[len] = '\0'; + + if (strcmp(props, "space") == 0) { + static char *spaceprops[] = { + "name", "avail", "used", "usedbysnapshots", + "usedbydataset", "usedbyrefreservation", + "usedbychildren", NULL + }; + int i; + + for (i = 0; spaceprops[i]; i++) { + if (addlist(hdl, spaceprops[i], listp, type)) + return (-1); + listp = &(*listp)->pl_next; + } + } else { + if (addlist(hdl, props, listp, type)) + return (-1); + listp = &(*listp)->pl_next; + } + + props = p; + if (c == ',') + props++; + } + + return (0); +} + +void +zprop_free_list(zprop_list_t *pl) +{ + zprop_list_t *next; + + while (pl != NULL) { + next = pl->pl_next; + free(pl->pl_user_prop); + free(pl); + pl = next; + } +} + +typedef struct expand_data { + zprop_list_t **last; + libzfs_handle_t *hdl; + zfs_type_t type; +} expand_data_t; + +static int +zprop_expand_list_cb(int prop, void *cb) +{ + zprop_list_t *entry; + expand_data_t *edp = cb; + + if ((entry = zfs_alloc(edp->hdl, sizeof (zprop_list_t))) == NULL) + return (ZPROP_INVAL); + + entry->pl_prop = prop; + entry->pl_width = zprop_width(prop, &entry->pl_fixed, edp->type); + entry->pl_all = B_TRUE; + + *(edp->last) = entry; + edp->last = &entry->pl_next; + + return (ZPROP_CONT); +} + +int +zprop_expand_list(libzfs_handle_t *hdl, zprop_list_t **plp, zfs_type_t type) +{ + zprop_list_t *entry; + zprop_list_t **last; + expand_data_t exp; + + if (*plp == NULL) { + /* + * If this is the very first time we've been called for an 'all' + * specification, expand the list to include all native + * properties. + */ + last = plp; + + exp.last = last; + exp.hdl = hdl; + exp.type = type; + + if (zprop_iter_common(zprop_expand_list_cb, &exp, B_FALSE, + B_FALSE, type) == ZPROP_INVAL) + return (-1); + + /* + * Add 'name' to the beginning of the list, which is handled + * specially. + */ + if ((entry = zfs_alloc(hdl, sizeof (zprop_list_t))) == NULL) + return (-1); + + entry->pl_prop = (type == ZFS_TYPE_POOL) ? 
ZPOOL_PROP_NAME :
+		    ZFS_PROP_NAME;
+		entry->pl_width = zprop_width(entry->pl_prop,
+		    &entry->pl_fixed, type);
+		entry->pl_all = B_TRUE;
+		entry->pl_next = *plp;
+		*plp = entry;
+	}
+	return (0);
+}
+
+int
+zprop_iter(zprop_func func, void *cb, boolean_t show_all, boolean_t ordered,
+    zfs_type_t type)
+{
+	return (zprop_iter_common(func, cb, show_all, ordered, type));
+}
+
+/*
+ * Fill the given version buffer with the zfs userland version.
+ */
+void
+zfs_version_userland(char *version, int len)
+{
+	(void) strlcpy(version, ZFS_META_ALIAS, len);
+}
+
+/*
+ * Prints both zfs userland and kernel versions.
+ * Returns 0 on success, and -1 on error (with errno set).
+ */
+int
+zfs_version_print(void)
+{
+	char zver_userland[128];
+	char zver_kernel[128];
+
+	zfs_version_userland(zver_userland, sizeof (zver_userland));
+
+	(void) printf("%s\n", zver_userland);
+
+	if (zfs_version_kernel(zver_kernel, sizeof (zver_kernel)) == -1) {
+		fprintf(stderr, "zfs_version_kernel() failed: %s\n",
+		    strerror(errno));
+		return (-1);
+	}
+
+	(void) printf("zfs-kmod-%s\n", zver_kernel);
+
+	return (0);
+}
+
+/*
+ * Return 1 if the user requested ANSI color output, and our terminal supports
+ * it. Return 0 for no color.
+ */
+static int
+use_color(void)
+{
+	static int use_color = -1;
+	char *term;
+
+	/*
+	 * Optimization:
+	 *
+	 * For each zpool invocation, we do a single check to see if we should
+	 * be using color or not, and cache that value for the lifetime of the
+	 * zpool command. That makes it cheap to call use_color() when we're
+	 * printing with color. We assume that the settings are not going to
+	 * change during the invocation of a zpool command (the user isn't
+	 * going to change the ZFS_COLOR value while zpool is running, for
+	 * example).
+	 */
+	if (use_color != -1) {
+		/*
+		 * We've already figured out if we should be using color or
+		 * not. Return the cached value.
+		 */
+		return (use_color);
+	}
+
+	term = getenv("TERM");
+	/*
+	 * The user sets the ZFS_COLOR env var to enable zpool ANSI color
+	 * output. However, if NO_COLOR is set (https://no-color.org/) then
+	 * don't use it. Also, don't use color if the terminal doesn't
+	 * support it.
+	 */
+	if (libzfs_envvar_is_set("ZFS_COLOR") &&
+	    !libzfs_envvar_is_set("NO_COLOR") &&
+	    isatty(STDOUT_FILENO) && term && strcmp("dumb", term) != 0 &&
+	    strcmp("unknown", term) != 0) {
+		/* Color supported */
+		use_color = 1;
+	} else {
+		use_color = 0;
+	}
+
+	return (use_color);
+}
+
+/*
+ * color_start() and color_end() are used for when you want to colorize a block
+ * of text. For example:
+ *
+ * color_start(ANSI_RED_FG)
+ * printf("hello");
+ * printf("world");
+ * color_end();
+ */
+void
+color_start(char *color)
+{
+	if (use_color())
+		printf("%s", color);
+}
+
+void
+color_end(void)
+{
+	if (use_color())
+		printf(ANSI_RESET);
+}
+
+/* printf() with a color. If color is NULL, then do a normal printf. */
+int
+printf_color(char *color, char *format, ...)
+{
+	va_list aptr;
+	int rc;
+
+	if (color)
+		color_start(color);
+
+	va_start(aptr, format);
+	rc = vprintf(format, aptr);
+	va_end(aptr);
+
+	if (color)
+		color_end();
+
+	return (rc);
+}
diff --git a/lib/libzfs/os/freebsd/libzfs_compat.c b/lib/libzfs/os/freebsd/libzfs_compat.c
new file mode 100644
index 000000000000..037ba56efe1c
--- /dev/null
+++ b/lib/libzfs/os/freebsd/libzfs_compat.c
@@ -0,0 +1,321 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013 Martin Matuska . All rights reserved. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef IN_BASE +#define ZFS_KMOD "zfs" +#else +#define ZFS_KMOD "openzfs" +#endif + +void +libzfs_set_pipe_max(int infd) +{ + /* FreeBSD automatically resizes */ +} + +static int +execvPe(const char *name, const char *path, char * const *argv, + char * const *envp) +{ + const char **memp; + size_t cnt, lp, ln; + int eacces, save_errno; + char *cur, buf[MAXPATHLEN]; + const char *p, *bp; + struct stat sb; + + eacces = 0; + + /* If it's an absolute or relative path name, it's easy. */ + if (strchr(name, '/')) { + bp = name; + cur = NULL; + goto retry; + } + bp = buf; + + /* If it's an empty path name, fail in the usual POSIX way. */ + if (*name == '\0') { + errno = ENOENT; + return (-1); + } + + cur = alloca(strlen(path) + 1); + if (cur == NULL) { + errno = ENOMEM; + return (-1); + } + strcpy(cur, path); + while ((p = strsep(&cur, ":")) != NULL) { + /* + * It's a SHELL path -- double, leading and trailing colons + * mean the current directory. + */ + if (*p == '\0') { + p = "."; + lp = 1; + } else + lp = strlen(p); + ln = strlen(name); + + /* + * If the path is too long complain. This is a possible + * security issue; given a way to make the path too long + * the user may execute the wrong program. + */ + if (lp + ln + 2 > sizeof (buf)) { + (void) write(STDERR_FILENO, "execvP: ", 8); + (void) write(STDERR_FILENO, p, lp); + (void) write(STDERR_FILENO, ": path too long\n", + 16); + continue; + } + bcopy(p, buf, lp); + buf[lp] = '/'; + bcopy(name, buf + lp + 1, ln); + buf[lp + ln + 1] = '\0'; + +retry: (void) execve(bp, argv, envp); + switch (errno) { + case E2BIG: + goto done; + case ELOOP: + case ENAMETOOLONG: + case ENOENT: + break; + case ENOEXEC: + for (cnt = 0; argv[cnt]; ++cnt) + ; + memp = alloca((cnt + 2) * sizeof (char *)); + if (memp == NULL) { + /* errno = ENOMEM; XXX override ENOEXEC? */ + goto done; + } + memp[0] = "sh"; + memp[1] = bp; + bcopy(argv + 1, memp + 2, cnt * sizeof (char *)); + execve(_PATH_BSHELL, __DECONST(char **, memp), envp); + goto done; + case ENOMEM: + goto done; + case ENOTDIR: + break; + case ETXTBSY: + /* + * We used to retry here, but sh(1) doesn't. + */ + goto done; + default: + /* + * EACCES may be for an inaccessible directory or + * a non-executable file. Call stat() to decide + * which. This also handles ambiguities for EFAULT + * and EIO, and undocumented errors like ESTALE. + * We hope that the race for a stat() is unimportant. 
+ */ + save_errno = errno; + if (stat(bp, &sb) != 0) + break; + if (save_errno == EACCES) { + eacces = 1; + continue; + } + errno = save_errno; + goto done; + } + } + if (eacces) + errno = EACCES; + else + errno = ENOENT; +done: + return (-1); +} + +int +execvpe(const char *name, char * const argv[], char * const envp[]) +{ + const char *path; + + /* Get the path we're searching. */ + if ((path = getenv("PATH")) == NULL) + path = _PATH_DEFPATH; + + return (execvPe(name, path, argv, envp)); +} + +const char * +libzfs_error_init(int error) +{ + + return (strerror(error)); +} + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + return (zfs_ioctl_fd(hdl->libzfs_fd, request, zc)); +} + +/* + * Verify the required ZFS_DEV device is available and optionally attempt + * to load the ZFS modules. Under normal circumstances the modules + * should already have been loaded by some external mechanism. + * + * Environment variables: + * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. + * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV + */ +int +libzfs_load_module(void) +{ + /* + * XXX: kldfind(ZFS_KMOD) would be nice here, but we retain + * modfind("zfs") so out-of-base openzfs userland works with the + * in-base module. + */ + if (modfind("zfs") < 0) { + /* Not present in kernel, try loading it. */ + if (kldload(ZFS_KMOD) < 0 && errno != EEXIST) { + return (errno); + } + } + return (0); +} + +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + return (0); +} + +int +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) +{ + return (0); +} + +int +find_shares_object(differ_info_t *di) +{ + return (0); +} + +/* + * Attach/detach the given filesystem to/from the given jail. + */ +int +zfs_jail(zfs_handle_t *zhp, int jailid, int attach) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + zfs_cmd_t zc = {"\0"}; + char errbuf[1024]; + unsigned long cmd; + int ret; + + if (attach) { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot jail '%s'"), zhp->zfs_name); + } else { + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot unjail '%s'"), zhp->zfs_name); + } + + switch (zhp->zfs_type) { + case ZFS_TYPE_VOLUME: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "volumes can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_SNAPSHOT: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "snapshots can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_BOOKMARK: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "bookmarks can not be jailed")); + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case ZFS_TYPE_POOL: + case ZFS_TYPE_FILESYSTEM: + /* OK */ + ; + } + assert(zhp->zfs_type == ZFS_TYPE_FILESYSTEM); + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_zoneid = jailid; + + cmd = attach ? ZFS_IOC_JAIL : ZFS_IOC_UNJAIL; + if ((ret = zfs_ioctl(hdl, cmd, &zc)) != 0) + zfs_standard_error(hdl, errno, errbuf); + + return (ret); +} + +/* + * Set loader options for next boot. 
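
An aside on zfs_jail() above: this sketch shows roughly what the `zfs jail` subcommand does once it has resolved a jail ID (illustrative; the dataset name and the jid variable are hypothetical):

	zfs_handle_t *zhp;

	if ((zhp = zfs_open(hdl, "tank/jails/j1",
	    ZFS_TYPE_FILESYSTEM)) != NULL) {
		if (zfs_jail(zhp, jid, 1) != 0)	/* 1 = attach, 0 = detach */
			/* error already reported on the handle */;
		zfs_close(zhp);
	}
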
+ */ +int +zpool_nextboot(libzfs_handle_t *hdl, uint64_t pool_guid, uint64_t dev_guid, + const char *command) +{ + zfs_cmd_t zc = {"\0"}; + nvlist_t *args; + int error; + + args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_CONFIG_POOL_GUID, pool_guid); + fnvlist_add_uint64(args, ZPOOL_CONFIG_GUID, dev_guid); + fnvlist_add_string(args, "command", command); + error = zcmd_write_src_nvlist(hdl, &zc, args); + if (error == 0) + error = zfs_ioctl(hdl, ZFS_IOC_NEXTBOOT, &zc); + zcmd_free_nvlists(&zc); + nvlist_free(args); + return (error); +} + +/* + * Fill given version buffer with zfs kernel version. + * Returns 0 on success, and -1 on error (with errno set) + */ +int +zfs_version_kernel(char *version, int len) +{ + size_t l = len; + + return (sysctlbyname("vfs.zfs.version.module", + version, &l, NULL, 0)); +} diff --git a/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c b/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c new file mode 100644 index 000000000000..18b93fe27969 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_ioctl_compat.c @@ -0,0 +1,432 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Xin Li . All rights reserved. + * Copyright 2013 Martin Matuska . All rights reserved. + * Portions Copyright 2005, 2010, Oracle and/or its affiliates. + * All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" +#include + +/* + * FreeBSD zfs_cmd compatibility with older binaries + * appropriately remap/extend the zfs_cmd_t structure + */ +void +zfs_cmd_compat_get(zfs_cmd_t *zc, caddr_t addr, const int cflag) +{ + +} +#if 0 +static int +zfs_ioctl_compat_get_nvlist(uint64_t nvl, size_t size, int iflag, + nvlist_t **nvp) +{ + char *packed; + int error; + nvlist_t *list = NULL; + + /* + * Read in and unpack the user-supplied nvlist. 
+ */ + if (size == 0) + return (EINVAL); + +#ifdef _KERNEL + packed = kmem_alloc(size, KM_SLEEP); + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { + kmem_free(packed, size); + return (error); + } +#else + packed = (void *)(uintptr_t)nvl; +#endif + + error = nvlist_unpack(packed, size, &list, 0); + +#ifdef _KERNEL + kmem_free(packed, size); +#endif + + if (error != 0) + return (error); + + *nvp = list; + return (0); +} + +static int +zfs_ioctl_compat_put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + int error = 0; + size_t size; + + VERIFY(nvlist_size(nvl, &size, NV_ENCODE_NATIVE) == 0); + +#ifdef _KERNEL + packed = kmem_alloc(size, KM_SLEEP); + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + KM_SLEEP) == 0); + + if (ddi_copyout(packed, + (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags) != 0) + error = EFAULT; + kmem_free(packed, size); +#else + packed = (void *)(uintptr_t)zc->zc_nvlist_dst; + VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, + 0) == 0); +#endif + + zc->zc_nvlist_dst_size = size; + return (error); +} + +static void +zfs_ioctl_compat_fix_stats_nvlist(nvlist_t *nvl) +{ + nvlist_t **child; + nvlist_t *nvroot = NULL; + vdev_stat_t *vs; + uint_t c, children, nelem; + + if (nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + zfs_ioctl_compat_fix_stats_nvlist(child[c]); + } + } + + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0) + zfs_ioctl_compat_fix_stats_nvlist(nvroot); + if ((nvlist_lookup_uint64_array(nvl, "stats", + (uint64_t **)&vs, &nelem) == 0)) { + nvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_VDEV_STATS, + (uint64_t *)vs, nelem); + nvlist_remove(nvl, "stats", + DATA_TYPE_UINT64_ARRAY); + } +} + + +static int +zfs_ioctl_compat_fix_stats(zfs_cmd_t *zc, const int nc) +{ + nvlist_t *nv, *nvp = NULL; + nvpair_t *elem; + int error; + + if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) + return (error); + + if (nc == 5) { /* ZFS_IOC_POOL_STATS */ + elem = NULL; + while ((elem = nvlist_next_nvpair(nv, elem)) != NULL) { + if (nvpair_value_nvlist(elem, &nvp) == 0) + zfs_ioctl_compat_fix_stats_nvlist(nvp); + } + elem = NULL; + } else + zfs_ioctl_compat_fix_stats_nvlist(nv); + + error = zfs_ioctl_compat_put_nvlist(zc, nv); + + nvlist_free(nv); + + return (error); +} + +static int +zfs_ioctl_compat_pool_get_props(zfs_cmd_t *zc) +{ + nvlist_t *nv, *nva = NULL; + int error; + + if ((error = zfs_ioctl_compat_get_nvlist(zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size, zc->zc_iflags, &nv)) != 0) + return (error); + + if (nvlist_lookup_nvlist(nv, "used", &nva) == 0) { + nvlist_add_nvlist(nv, "allocated", nva); + nvlist_remove(nv, "used", DATA_TYPE_NVLIST); + } + + if (nvlist_lookup_nvlist(nv, "available", &nva) == 0) { + nvlist_add_nvlist(nv, "free", nva); + nvlist_remove(nv, "available", DATA_TYPE_NVLIST); + } + + error = zfs_ioctl_compat_put_nvlist(zc, nv); + + nvlist_free(nv); + + return (error); +} +#endif + +#ifdef _KERNEL +int +zfs_ioctl_compat_pre(zfs_cmd_t *zc, int *vec, const int cflag) +{ + int error = 0; + + /* are we creating a clone? 
*/ + if (*vec == ZFS_IOC_CREATE && zc->zc_value[0] != '\0') + *vec = ZFS_IOC_CLONE; + + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (*vec) { + + case 7: /* ZFS_IOC_POOL_SCRUB (v15) */ + zc->zc_cookie = POOL_SCAN_SCRUB; + break; + } + } + + return (error); +} + +void +zfs_ioctl_compat_post(zfs_cmd_t *zc, int vec, const int cflag) +{ + if (cflag == ZFS_CMD_COMPAT_V15) { + switch (vec) { + case ZFS_IOC_POOL_CONFIGS: + case ZFS_IOC_POOL_STATS: + case ZFS_IOC_POOL_TRYIMPORT: + zfs_ioctl_compat_fix_stats(zc, vec); + break; + case 41: /* ZFS_IOC_POOL_GET_PROPS (v15) */ + zfs_ioctl_compat_pool_get_props(zc); + break; + } + } +} + +nvlist_t * +zfs_ioctl_compat_innvl(zfs_cmd_t *zc, nvlist_t *innvl, const int vec, + const int cflag) +{ + nvlist_t *nvl, *tmpnvl, *hnvl; + nvpair_t *elem; + char *poolname, *snapname; + int err; + + if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) + goto out; + + switch (vec) { + case ZFS_IOC_CREATE: + nvl = fnvlist_alloc(); + fnvlist_add_int32(nvl, "type", zc->zc_objset_type); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_CLONE: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "origin", zc->zc_value); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "props", innvl); + nvlist_free(innvl); + } + return (nvl); + break; + case ZFS_IOC_SNAPSHOT: + if (innvl == NULL) + goto out; + nvl = fnvlist_alloc(); + fnvlist_add_nvlist(nvl, "props", innvl); + tmpnvl = fnvlist_alloc(); + snapname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + fnvlist_add_boolean(tmpnvl, snapname); + kmem_free(snapname, strlen(snapname + 1)); + /* check if we are doing a recursive snapshot */ + if (zc->zc_cookie) + dmu_get_recursive_snaps_nvl(zc->zc_name, zc->zc_value, + tmpnvl); + fnvlist_add_nvlist(nvl, "snaps", tmpnvl); + fnvlist_free(tmpnvl); + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_SPACE_SNAPS: + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, "firstsnap", zc->zc_value); + if (innvl != NULL) + nvlist_free(innvl); + return (nvl); + break; + case ZFS_IOC_DESTROY_SNAPS: + if (innvl == NULL && cflag == ZFS_CMD_COMPAT_DEADMAN) + goto out; + nvl = fnvlist_alloc(); + if (innvl != NULL) { + fnvlist_add_nvlist(nvl, "snaps", innvl); + } else { + /* + * We are probably called by even older binaries, + * allocate and populate nvlist with recursive + * snapshots + */ + if (zfs_component_namecheck(zc->zc_value, NULL, + NULL) == 0) { + tmpnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, tmpnvl) == 0) + fnvlist_add_nvlist(nvl, "snaps", + tmpnvl); + nvlist_free(tmpnvl); + } + } + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_HOLD: + nvl = fnvlist_alloc(); + tmpnvl = fnvlist_alloc(); + if (zc->zc_cleanup_fd != -1) + fnvlist_add_int32(nvl, "cleanup_fd", + (int32_t)zc->zc_cleanup_fd); + if (zc->zc_cookie) { + hnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, hnvl) == 0) { + elem = NULL; + while ((elem = nvlist_next_nvpair(hnvl, + elem)) != NULL) { + nvlist_add_string(tmpnvl, + nvpair_name(elem), zc->zc_string); + } + } + nvlist_free(hnvl); + } else { + snapname = 
kmem_asprintf("%s@%s", zc->zc_name, + zc->zc_value); + nvlist_add_string(tmpnvl, snapname, zc->zc_string); + kmem_free(snapname, strlen(snapname + 1)); + } + fnvlist_add_nvlist(nvl, "holds", tmpnvl); + nvlist_free(tmpnvl); + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + case ZFS_IOC_RELEASE: + nvl = fnvlist_alloc(); + tmpnvl = fnvlist_alloc(); + if (zc->zc_cookie) { + hnvl = fnvlist_alloc(); + if (dmu_get_recursive_snaps_nvl(zc->zc_name, + zc->zc_value, hnvl) == 0) { + elem = NULL; + while ((elem = nvlist_next_nvpair(hnvl, + elem)) != NULL) { + fnvlist_add_boolean(tmpnvl, + zc->zc_string); + fnvlist_add_nvlist(nvl, + nvpair_name(elem), tmpnvl); + } + } + nvlist_free(hnvl); + } else { + snapname = kmem_asprintf("%s@%s", zc->zc_name, + zc->zc_value); + fnvlist_add_boolean(tmpnvl, zc->zc_string); + fnvlist_add_nvlist(nvl, snapname, tmpnvl); + kmem_free(snapname, strlen(snapname + 1)); + } + nvlist_free(tmpnvl); + if (innvl != NULL) + nvlist_free(innvl); + /* strip dataset part from zc->zc_name */ + zc->zc_name[strcspn(zc->zc_name, "/@")] = '\0'; + return (nvl); + break; + } +out: + return (innvl); +} + +nvlist_t * +zfs_ioctl_compat_outnvl(zfs_cmd_t *zc, nvlist_t *outnvl, const int vec, + const int cflag) +{ + nvlist_t *tmpnvl; + + if (cflag == ZFS_CMD_COMPAT_NONE || cflag == ZFS_CMD_COMPAT_LZC || + cflag == ZFS_CMD_COMPAT_ZCMD || cflag == ZFS_CMD_COMPAT_EDBP || + cflag == ZFS_CMD_COMPAT_RESUME || cflag == ZFS_CMD_COMPAT_INLANES) + return (outnvl); + + switch (vec) { + case ZFS_IOC_SPACE_SNAPS: + (void) nvlist_lookup_uint64(outnvl, "used", &zc->zc_cookie); + (void) nvlist_lookup_uint64(outnvl, "compressed", + &zc->zc_objset_type); + (void) nvlist_lookup_uint64(outnvl, "uncompressed", + &zc->zc_perm_action); + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + case ZFS_IOC_CREATE: + case ZFS_IOC_CLONE: + case ZFS_IOC_HOLD: + case ZFS_IOC_RELEASE: + nvlist_free(outnvl); + /* return empty outnvl */ + tmpnvl = fnvlist_alloc(); + return (tmpnvl); + break; + } + + return (outnvl); +} +#endif /* KERNEL */ diff --git a/lib/libzfs/os/freebsd/libzfs_zmount.c b/lib/libzfs/os/freebsd/libzfs_zmount.c new file mode 100644 index 000000000000..2207fffc5573 --- /dev/null +++ b/lib/libzfs/os/freebsd/libzfs_zmount.c @@ -0,0 +1,139 @@ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +/* + * This file implements Solaris compatible zmount() function. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" + +static void +build_iovec(struct iovec **iov, int *iovlen, const char *name, void *val, + size_t len) +{ + int i; + + if (*iovlen < 0) + return; + i = *iovlen; + *iov = realloc(*iov, sizeof (**iov) * (i + 2)); + if (*iov == NULL) { + *iovlen = -1; + return; + } + (*iov)[i].iov_base = strdup(name); + (*iov)[i].iov_len = strlen(name) + 1; + i++; + (*iov)[i].iov_base = val; + if (len == (size_t)-1) { + if (val != NULL) + len = strlen(val) + 1; + else + len = 0; + } + (*iov)[i].iov_len = (int)len; + *iovlen = ++i; +} + +static int +do_mount_(const char *spec, const char *dir, int mflag, char *fstype, + char *dataptr, int datalen, char *optptr, int optlen) +{ + struct iovec *iov; + char *optstr, *p, *tofree; + int iovlen, rv; + + assert(spec != NULL); + assert(dir != NULL); + assert(fstype != NULL); + assert(strcmp(fstype, MNTTYPE_ZFS) == 0); + assert(dataptr == NULL); + assert(datalen == 0); + assert(optptr != NULL); + assert(optlen > 0); + + tofree = optstr = strdup(optptr); + assert(optstr != NULL); + + iov = NULL; + iovlen = 0; + if (strstr(optstr, MNTOPT_REMOUNT) != NULL) + build_iovec(&iov, &iovlen, "update", NULL, 0); + if (strstr(optstr, MNTOPT_NOXATTR) == NULL && + strstr(optstr, MNTOPT_XATTR) == NULL && + strstr(optstr, MNTOPT_SAXATTR) == NULL && + strstr(optstr, MNTOPT_DIRXATTR) == NULL) + build_iovec(&iov, &iovlen, "xattr", NULL, 0); + if (mflag & MS_RDONLY) + build_iovec(&iov, &iovlen, "ro", NULL, 0); + build_iovec(&iov, &iovlen, "fstype", fstype, (size_t)-1); + build_iovec(&iov, &iovlen, "fspath", __DECONST(char *, dir), + (size_t)-1); + build_iovec(&iov, &iovlen, "from", __DECONST(char *, spec), (size_t)-1); + while ((p = strsep(&optstr, ",/")) != NULL) + build_iovec(&iov, &iovlen, p, NULL, (size_t)-1); + rv = nmount(iov, iovlen, 0); + free(tofree); + if (rv < 0) + return (errno); + return (rv); +} + +int +do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) +{ + + return (do_mount_(zfs_get_name(zhp), mntpt, flags, MNTTYPE_ZFS, NULL, 0, + opts, sizeof (mntpt))); +} + +int +do_unmount(const char *mntpt, int flags) +{ + + return (unmount(mntpt, flags)); +} + +int +zfs_mount_delegation_check(void) +{ + return (0); +} diff --git a/lib/libzfs/os/linux/libzfs_mount_os.c b/lib/libzfs/os/linux/libzfs_mount_os.c new file mode 100644 index 000000000000..92052f28ffe7 --- /dev/null +++ b/lib/libzfs/os/linux/libzfs_mount_os.c @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2014, 2020 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright 2017 RackTop Systems. + * Copyright (c) 2018 Datto Inc. + * Copyright 2018 OmniOS Community Edition (OmniOSce) Association. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "libzfs_impl.h" +#include + +#define ZS_COMMENT 0x00000000 /* comment */ +#define ZS_ZFSUTIL 0x00000001 /* caller is zfs(8) */ + +typedef struct option_map { + const char *name; + unsigned long mntmask; + unsigned long zfsmask; +} option_map_t; + +static const option_map_t option_map[] = { + /* Canonicalized filesystem independent options from mount(8) */ + { MNTOPT_NOAUTO, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_DEFAULTS, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_NODEVICES, MS_NODEV, ZS_COMMENT }, + { MNTOPT_DEVICES, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_DIRSYNC, MS_DIRSYNC, ZS_COMMENT }, + { MNTOPT_NOEXEC, MS_NOEXEC, ZS_COMMENT }, + { MNTOPT_EXEC, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_GROUP, MS_GROUP, ZS_COMMENT }, + { MNTOPT_NETDEV, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_NOFAIL, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_NOSUID, MS_NOSUID, ZS_COMMENT }, + { MNTOPT_SUID, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_OWNER, MS_OWNER, ZS_COMMENT }, + { MNTOPT_REMOUNT, MS_REMOUNT, ZS_COMMENT }, + { MNTOPT_RO, MS_RDONLY, ZS_COMMENT }, + { MNTOPT_RW, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_SYNC, MS_SYNCHRONOUS, ZS_COMMENT }, + { MNTOPT_USER, MS_USERS, ZS_COMMENT }, + { MNTOPT_USERS, MS_USERS, ZS_COMMENT }, + /* acl flags passed with util-linux-2.24 mount command */ + { MNTOPT_ACL, MS_POSIXACL, ZS_COMMENT }, + { MNTOPT_NOACL, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_POSIXACL, MS_POSIXACL, ZS_COMMENT }, +#ifdef MS_NOATIME + { MNTOPT_NOATIME, MS_NOATIME, ZS_COMMENT }, + { MNTOPT_ATIME, MS_COMMENT, ZS_COMMENT }, +#endif +#ifdef MS_NODIRATIME + { MNTOPT_NODIRATIME, MS_NODIRATIME, ZS_COMMENT }, + { MNTOPT_DIRATIME, MS_COMMENT, ZS_COMMENT }, +#endif +#ifdef MS_RELATIME + { MNTOPT_RELATIME, MS_RELATIME, ZS_COMMENT }, + { MNTOPT_NORELATIME, MS_COMMENT, ZS_COMMENT }, +#endif +#ifdef MS_STRICTATIME + { MNTOPT_STRICTATIME, MS_STRICTATIME, ZS_COMMENT }, + { MNTOPT_NOSTRICTATIME, MS_COMMENT, ZS_COMMENT }, +#endif +#ifdef MS_LAZYTIME + { MNTOPT_LAZYTIME, MS_LAZYTIME, ZS_COMMENT }, +#endif + { MNTOPT_CONTEXT, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_FSCONTEXT, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_DEFCONTEXT, MS_COMMENT, ZS_COMMENT }, + { MNTOPT_ROOTCONTEXT, MS_COMMENT, ZS_COMMENT }, +#ifdef MS_I_VERSION + { MNTOPT_IVERSION, MS_I_VERSION, ZS_COMMENT }, +#endif +#ifdef MS_MANDLOCK + { MNTOPT_NBMAND, MS_MANDLOCK, ZS_COMMENT }, + { MNTOPT_NONBMAND, MS_COMMENT, ZS_COMMENT }, +#endif + /* Valid options not found in mount(8) */ + { MNTOPT_BIND, MS_BIND, ZS_COMMENT }, +#ifdef MS_REC + { MNTOPT_RBIND, 
MS_BIND|MS_REC,	ZS_COMMENT },
+#endif
+	{ MNTOPT_COMMENT,	MS_COMMENT,	ZS_COMMENT },
+#ifdef MS_NOSUB
+	{ MNTOPT_NOSUB,		MS_NOSUB,	ZS_COMMENT },
+#endif
+#ifdef MS_SILENT
+	{ MNTOPT_QUIET,		MS_SILENT,	ZS_COMMENT },
+#endif
+	/* Custom zfs options */
+	{ MNTOPT_XATTR,		MS_COMMENT,	ZS_COMMENT },
+	{ MNTOPT_NOXATTR,	MS_COMMENT,	ZS_COMMENT },
+	{ MNTOPT_ZFSUTIL,	MS_COMMENT,	ZS_ZFSUTIL },
+	{ NULL,			0,		0 } };
+
+/*
+ * Break the mount option into a name/value pair. The name is
+ * validated against the option map, and mount flags are set accordingly.
+ */
+static int
+parse_option(char *mntopt, unsigned long *mntflags,
+    unsigned long *zfsflags, int sloppy)
+{
+	const option_map_t *opt;
+	char *ptr, *name, *value = NULL;
+	int error = 0;
+
+	name = strdup(mntopt);
+	if (name == NULL)
+		return (ENOMEM);
+
+	for (ptr = name; ptr && *ptr; ptr++) {
+		if (*ptr == '=') {
+			*ptr = '\0';
+			value = ptr+1;
+			VERIFY3P(value, !=, NULL);
+			break;
+		}
+	}
+
+	for (opt = option_map; opt->name != NULL; opt++) {
+		if (strncmp(name, opt->name, strlen(name)) == 0) {
+			*mntflags |= opt->mntmask;
+			*zfsflags |= opt->zfsmask;
+			error = 0;
+			goto out;
+		}
+	}
+
+	if (!sloppy)
+		error = ENOENT;
+out:
+	/* If required, further processing of the value may be done here */
+	free(name);
+	return (error);
+}
+
+/*
+ * Translate the mount option string into MS_* mount flags for the
+ * kernel vfs. When sloppy is non-zero, unknown options are ignored;
+ * otherwise they are considered fatal and are copied into badopt.
+ */
+int
+zfs_parse_mount_options(char *mntopts, unsigned long *mntflags,
+    unsigned long *zfsflags, int sloppy, char *badopt, char *mtabopt)
+{
+	int error = 0, quote = 0, flag = 0, count = 0;
+	char *ptr, *opt, *opts;
+
+	opts = strdup(mntopts);
+	if (opts == NULL)
+		return (ENOMEM);
+
+	*mntflags = 0;
+	opt = NULL;
+
+	/*
+	 * Scan through all mount options, which must be comma delimited.
+	 * We must be careful to notice regions which are double quoted
+	 * and skip commas in these regions. Each option is then checked
+	 * to determine if it is a known option.
+	 */
+	for (ptr = opts; ptr && !flag; ptr++) {
+		if (opt == NULL)
+			opt = ptr;
+
+		if (*ptr == '"')
+			quote = !quote;
+
+		if (quote)
+			continue;
+
+		if (*ptr == '\0')
+			flag = 1;
+
+		if ((*ptr == ',') || (*ptr == '\0')) {
+			*ptr = '\0';
+
+			error = parse_option(opt, mntflags, zfsflags, sloppy);
+			if (error) {
+				strcpy(badopt, opt);
+				goto out;
+
+			}
+
+			if (!(*mntflags & MS_REMOUNT) &&
+			    !(*zfsflags & ZS_ZFSUTIL) &&
+			    mtabopt != NULL) {
+				if (count > 0)
+					strlcat(mtabopt, ",", MNT_LINE_MAX);
+
+				strlcat(mtabopt, opt, MNT_LINE_MAX);
+				count++;
+			}
+
+			opt = NULL;
+		}
+	}
+
+out:
+	free(opts);
+	return (error);
+}
+
+static void
+append_mntopt(const char *name, const char *val, char *mntopts,
+    char *mtabopt, boolean_t quote)
+{
+	char tmp[MNT_LINE_MAX];
+
+	snprintf(tmp, MNT_LINE_MAX, quote ? ",%s=\"%s\"" : ",%s=%s", name, val);
+
+	if (mntopts)
+		strlcat(mntopts, tmp, MNT_LINE_MAX);
+
+	if (mtabopt)
+		strlcat(mtabopt, tmp, MNT_LINE_MAX);
+}
+
+static void
+zfs_selinux_setcontext(zfs_handle_t *zhp, zfs_prop_t zpt, const char *name,
+    char *mntopts, char *mtabopt)
+{
+	char context[ZFS_MAXPROPLEN];
+
+	if (zfs_prop_get(zhp, zpt, context, sizeof (context),
+	    NULL, NULL, 0, B_FALSE) == 0) {
+		if (strcmp(context, "none") != 0)
+			append_mntopt(name, context, mntopts, mtabopt, B_TRUE);
+	}
+}
+
+void
+zfs_adjust_mount_options(zfs_handle_t *zhp, const char *mntpoint,
+    char *mntopts, char *mtabopt)
+{
+	char prop[ZFS_MAXPROPLEN];
+
+	/*
+	 * Check whether ZFS_PROP_SELINUX_CONTEXT is set. If it is set to
+	 * the default ("none"), allow the individual context properties
+	 * (fscontext, defcontext, rootcontext) to be applied; this matters
+	 * because a non-default 'context' property overrides the others.
+	 * If it is not the default, pass it along as the 'context' mount
+	 * option.
+	 */
+	if (zfs_prop_get(zhp, ZFS_PROP_SELINUX_CONTEXT, prop, sizeof (prop),
+	    NULL, NULL, 0, B_FALSE) == 0) {
+		if (strcmp(prop, "none") == 0) {
+			zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_FSCONTEXT,
+			    MNTOPT_FSCONTEXT, mntopts, mtabopt);
+			zfs_selinux_setcontext(zhp, ZFS_PROP_SELINUX_DEFCONTEXT,
+			    MNTOPT_DEFCONTEXT, mntopts, mtabopt);
+			zfs_selinux_setcontext(zhp,
+			    ZFS_PROP_SELINUX_ROOTCONTEXT, MNTOPT_ROOTCONTEXT,
+			    mntopts, mtabopt);
+		} else {
+			append_mntopt(MNTOPT_CONTEXT, prop,
+			    mntopts, mtabopt, B_TRUE);
+		}
+	}
+
+	/* A hint used to determine an auto-mounted snapshot mount point */
+	append_mntopt(MNTOPT_MNTPOINT, mntpoint, mntopts, NULL, B_FALSE);
+}
+
+/*
+ * By default the filesystem is mounted by preparing the mount options (i.e.
+ * parsing some flags from the "opts" parameter into the "flags" parameter)
+ * and then directly calling the system call mount(2). We don't need the
+ * mount utility or to update /etc/mtab, because this is a symlink on all
+ * modern systems.
+ *
+ * If the environment variable ZFS_MOUNT_HELPER is set, we fall back to the
+ * previous behavior:
+ * The filesystem is mounted by invoking the system mount utility rather
+ * than by the system call mount(2). This ensures that the /etc/mtab
+ * file is correctly locked for the update. Performing our own locking
+ * and /etc/mtab update requires making an unsafe assumption about how
+ * the mount utility performs its locking. Unfortunately, this also means
+ * in the case of a mount failure we do not have the exact errno. We must
+ * make do with the return value from the mount process.
+ */ +int +do_mount(zfs_handle_t *zhp, const char *mntpt, char *opts, int flags) +{ + const char *src = zfs_get_name(zhp); + int error = 0; + + if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + char badopt[MNT_LINE_MAX] = {0}; + unsigned long mntflags = flags, zfsflags = 0; + char myopts[MNT_LINE_MAX] = {0}; + + if (zfs_parse_mount_options(opts, &mntflags, + &zfsflags, 0, badopt, NULL)) { + return (EINVAL); + } + strlcat(myopts, opts, MNT_LINE_MAX); + zfs_adjust_mount_options(zhp, mntpt, myopts, NULL); + if (mount(src, mntpt, MNTTYPE_ZFS, mntflags, myopts)) { + return (errno); + } + } else { + char *argv[9] = { + "/bin/mount", + "--no-canonicalize", + "-t", MNTTYPE_ZFS, + "-o", opts, + (char *)src, + (char *)mntpt, + (char *)NULL }; + + /* Return only the most critical mount error */ + error = libzfs_run_process(argv[0], argv, + STDOUT_VERBOSE|STDERR_VERBOSE); + if (error) { + if (error & MOUNT_FILEIO) { + error = EIO; + } else if (error & MOUNT_USER) { + error = EINTR; + } else if (error & MOUNT_SOFTWARE) { + error = EPIPE; + } else if (error & MOUNT_BUSY) { + error = EBUSY; + } else if (error & MOUNT_SYSERR) { + error = EAGAIN; + } else if (error & MOUNT_USAGE) { + error = EINVAL; + } else + error = ENXIO; /* Generic error */ + } + } + + return (error); +} + +int +do_unmount(const char *mntpt, int flags) +{ + if (!libzfs_envvar_is_set("ZFS_MOUNT_HELPER")) { + return (umount2(mntpt, flags)); + } + + char force_opt[] = "-f"; + char lazy_opt[] = "-l"; + char *argv[7] = { + "/bin/umount", + "-t", MNTTYPE_ZFS, + NULL, NULL, NULL, NULL }; + int rc, count = 3; + + if (flags & MS_FORCE) { + argv[count] = force_opt; + count++; + } + + if (flags & MS_DETACH) { + argv[count] = lazy_opt; + count++; + } + + argv[count] = (char *)mntpt; + rc = libzfs_run_process(argv[0], argv, STDOUT_VERBOSE|STDERR_VERBOSE); + + return (rc ? EINVAL : 0); +} + +int +zfs_mount_delegation_check(void) +{ + return ((geteuid() != 0) ? EACCES : 0); +} diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c new file mode 100644 index 000000000000..5c6da5338dbc --- /dev/null +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -0,0 +1,345 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + * Copyright (c) 2018 Datto Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2018, loli10K + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "libzfs_impl.h" +#include "zfs_comutil.h" +#include "zfeature_common.h" + +/* + * If the device has been dynamically expanded then we need to relabel + * the disk to use the new unallocated space. + */ +int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *path, const char *msg) +{ + int fd, error; + + if ((fd = open(path, O_RDWR|O_DIRECT)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, msg)); + } + + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + * + * Also, we don't call efi_rescan() - that would just return EBUSY. + * The module will do it for us in vdev_disk_open(). + */ + error = efi_use_whole_disk(fd); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); + (void) ioctl(fd, BLKFLSBUF); + + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), path); + return (zfs_error(hdl, EZFS_NOCAP, msg)); + } + return (0); +} + +/* + * Read the EFI label from the config; if a label does not exist, then + * pass back the error to the caller. If the caller has passed a non-NULL + * diskaddr argument then we set it to the starting address of the EFI + * partition. + */ +static int +read_efi_label(nvlist_t *config, diskaddr_t *sb) +{ + char *path; + int fd; + char diskname[MAXPATHLEN]; + int err = -1; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PATH, &path) != 0) + return (err); + + (void) snprintf(diskname, sizeof (diskname), "%s%s", DISK_ROOT, + strrchr(path, '/')); + if ((fd = open(diskname, O_RDONLY|O_DIRECT)) >= 0) { + struct dk_gpt *vtoc; + + if ((err = efi_alloc_and_read(fd, &vtoc)) >= 0) { + if (sb != NULL) + *sb = vtoc->efi_parts[0].p_start; + efi_free(vtoc); + } + (void) close(fd); + } + return (err); +} + +/* + * determine where a partition starts on a disk in the current + * configuration + */ +static diskaddr_t +find_start_block(nvlist_t *config) +{ + nvlist_t **child; + uint_t c, children; + diskaddr_t sb = MAXOFFSET_T; + uint64_t wholedisk; + + if (nvlist_lookup_nvlist_array(config, + ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) { + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk) != 0 || !wholedisk) { + return (MAXOFFSET_T); + } + if (read_efi_label(config, &sb) < 0) + sb = MAXOFFSET_T; + return (sb); + } + + for (c = 0; c < children; c++) { + sb = find_start_block(child[c]); + if (sb != MAXOFFSET_T) { + return (sb); + } + } + return (MAXOFFSET_T); +} + +static int +zpool_label_disk_check(char *path) +{ + struct dk_gpt *vtoc; + int fd, err; + + if ((fd = open(path, O_RDONLY|O_DIRECT)) < 0) + return (errno); + + if ((err = efi_alloc_and_read(fd, &vtoc)) != 0) { + (void) close(fd); + return (err); + } + + if (vtoc->efi_flags & EFI_GPT_PRIMARY_CORRUPT) { + efi_free(vtoc); + (void) close(fd); + return (EIDRM); + } + + efi_free(vtoc); + (void) close(fd); + return (0); +} + +/* + * Generate a unique partition name for the ZFS member.
Partitions must + * have unique names to ensure udev will be able to create symlinks under + * /dev/disk/by-partlabel/ for all pool members. The partition names are + * of the form -. + */ +static void +zpool_label_name(char *label_name, int label_size) +{ + uint64_t id = 0; + int fd; + + fd = open("/dev/urandom", O_RDONLY); + if (fd >= 0) { + if (read(fd, &id, sizeof (id)) != sizeof (id)) + id = 0; + + close(fd); + } + + if (id == 0) + id = (((uint64_t)rand()) << 32) | (uint64_t)rand(); + + snprintf(label_name, label_size, "zfs-%016llx", (u_longlong_t)id); +} + +/* + * Label an individual disk. The name provided is the short name, + * stripped of any leading /dev path. + */ +int +zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) +{ + char path[MAXPATHLEN]; + struct dk_gpt *vtoc; + int rval, fd; + size_t resv = EFI_MIN_RESV_SIZE; + uint64_t slice_size; + diskaddr_t start_block; + char errbuf[1024]; + + /* prepare an error message just in case */ + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot label '%s'"), name); + + if (zhp) { + nvlist_t *nvroot; + + verify(nvlist_lookup_nvlist(zhp->zpool_config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + + if (zhp->zpool_start_block == 0) + start_block = find_start_block(nvroot); + else + start_block = zhp->zpool_start_block; + zhp->zpool_start_block = start_block; + } else { + /* new pool */ + start_block = NEW_START_BLOCK; + } + + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + + if ((fd = open(path, O_RDWR|O_DIRECT|O_EXCL)) < 0) { + /* + * This shouldn't happen. We've long since verified that this + * is a valid device. + */ + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': unable to open device: %d"), path, errno); + return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); + } + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &vtoc) != 0) { + /* + * The only way this can fail is if we run out of memory, or we + * were unable to read the disk's capacity + */ + if (errno == ENOMEM) + (void) no_memory(hdl); + + (void) close(fd); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': unable to read disk capacity"), path); + + return (zfs_error(hdl, EZFS_NOCAP, errbuf)); + } + + slice_size = vtoc->efi_last_u_lba + 1; + slice_size -= EFI_MIN_RESV_SIZE; + if (start_block == MAXOFFSET_T) + start_block = NEW_START_BLOCK; + slice_size -= start_block; + slice_size = P2ALIGN(slice_size, PARTITION_END_ALIGNMENT); + + vtoc->efi_parts[0].p_start = start_block; + vtoc->efi_parts[0].p_size = slice_size; + + /* + * Why we use V_USR: V_BACKUP confuses users, and is considered + * disposable by some EFI utilities (since EFI doesn't have a backup + * slice). V_UNASSIGNED is supposed to be used only for zero size + * partitions, and efi_write() will fail if we use it. V_ROOT, V_BOOT, + * etc. were all pretty specific. V_USR is as close to reality as we + * can get, in the absence of V_OTHER. + */ + vtoc->efi_parts[0].p_tag = V_USR; + zpool_label_name(vtoc->efi_parts[0].p_name, EFI_PART_NAME_LEN); + + vtoc->efi_parts[8].p_start = slice_size + start_block; + vtoc->efi_parts[8].p_size = resv; + vtoc->efi_parts[8].p_tag = V_RESERVED; + + rval = efi_write(fd, vtoc); + + /* Flush the buffers to disk and invalidate the page cache. */ + (void) fsync(fd); + (void) ioctl(fd, BLKFLSBUF); + + if (rval == 0) + rval = efi_rescan(fd); + + /* + * Some block drivers (like pcata) may not support EFI GPT labels. 
+ * Print out a helpful error message directing the user to manually + * label the disk and give a specific slice. + */ + if (rval != 0) { + (void) close(fd); + efi_free(vtoc); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "try using " + "parted(8) and then provide a specific slice: %d"), rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + (void) close(fd); + efi_free(vtoc); + + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + (void) zfs_append_partition(path, MAXPATHLEN); + + /* Wait for udev to signal that the device has settled. */ + rval = zpool_label_disk_wait(path, DISK_LABEL_WAIT); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "failed to " + "detect device partitions on '%s': %d"), path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + + /* We can't be too paranoid. Read the label back and verify it. */ + (void) snprintf(path, sizeof (path), "%s/%s", DISK_ROOT, name); + rval = zpool_label_disk_check(path); + if (rval) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "freshly written " + "EFI label on '%s' is damaged. Ensure\nthis device " + "is not in use, and is functioning properly: %d"), + path, rval); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + return (0); +} diff --git a/lib/libzfs/os/linux/libzfs_sendrecv_os.c b/lib/libzfs/os/linux/libzfs_sendrecv_os.c new file mode 100644 index 000000000000..eeb1f07f2dea --- /dev/null +++ b/lib/libzfs/os/linux/libzfs_sendrecv_os.c @@ -0,0 +1,52 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include + +#include "libzfs_impl.h" + +#ifndef F_SETPIPE_SZ +#define F_SETPIPE_SZ (F_SETLEASE + 7) +#endif /* F_SETPIPE_SZ */ + +#ifndef F_GETPIPE_SZ +#define F_GETPIPE_SZ (F_GETLEASE + 7) +#endif /* F_GETPIPE_SZ */ + +void +libzfs_set_pipe_max(int infd) +{ + FILE *procf = fopen("/proc/sys/fs/pipe-max-size", "r"); + + if (procf != NULL) { + unsigned long max_psize; + long cur_psize; + if (fscanf(procf, "%lu", &max_psize) > 0) { + cur_psize = fcntl(infd, F_GETPIPE_SZ); + if (cur_psize > 0 && + max_psize > (unsigned long) cur_psize) + fcntl(infd, F_SETPIPE_SZ, + max_psize); + } + fclose(procf); + } +} diff --git a/lib/libzfs/os/linux/libzfs_util_os.c b/lib/libzfs/os/linux/libzfs_util_os.c new file mode 100644 index 000000000000..918a43f7d03d --- /dev/null +++ b/lib/libzfs/os/linux/libzfs_util_os.c @@ -0,0 +1,215 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "libzfs_impl.h" +#include "zfs_prop.h" +#include +#include + +#define ZDIFF_SHARESDIR "/.zfs/shares/" + +int +zfs_ioctl(libzfs_handle_t *hdl, int request, zfs_cmd_t *zc) +{ + return (ioctl(hdl->libzfs_fd, request, zc)); +} + +const char * +libzfs_error_init(int error) +{ + switch (error) { + case ENXIO: + return (dgettext(TEXT_DOMAIN, "The ZFS modules are not " + "loaded.\nTry running '/sbin/modprobe zfs' as root " + "to load them.")); + case ENOENT: + return (dgettext(TEXT_DOMAIN, "/dev/zfs and /proc/self/mounts " + "are required.\nTry running 'udevadm trigger' and 'mount " + "-t proc proc /proc' as root.")); + case ENOEXEC: + return (dgettext(TEXT_DOMAIN, "The ZFS modules cannot be " + "auto-loaded.\nTry running '/sbin/modprobe zfs' as " + "root to manually load them.")); + case EACCES: + return (dgettext(TEXT_DOMAIN, "Permission denied. The " + "ZFS utilities must be run as root.")); + default: + return (dgettext(TEXT_DOMAIN, "Failed to initialize the " + "libzfs library.")); + } +} + +static int +libzfs_module_loaded(const char *module) +{ + const char path_prefix[] = "/sys/module/"; + char path[256]; + + memcpy(path, path_prefix, sizeof (path_prefix) - 1); + strcpy(path + sizeof (path_prefix) - 1, module); + + return (access(path, F_OK) == 0); +} + +/* + * Verify the required ZFS_DEV device is available and optionally attempt + * to load the ZFS modules. Under normal circumstances the modules + * should already have been loaded by some external mechanism. + * + * Environment variables: + * - ZFS_MODULE_LOADING="YES|yes|ON|on" - Attempt to load modules. + * - ZFS_MODULE_TIMEOUT="" - Seconds to wait for ZFS_DEV + */ +static int +libzfs_load_module_impl(const char *module) +{ + char *argv[4] = {"/sbin/modprobe", "-q", (char *)module, (char *)0}; + char *load_str, *timeout_str; + long timeout = 10; /* seconds */ + long busy_timeout = 10; /* milliseconds */ + int load = 0, fd; + hrtime_t start; + + /* Optionally request module loading */ + if (!libzfs_module_loaded(module)) { + load_str = getenv("ZFS_MODULE_LOADING"); + if (load_str) { + if (!strncasecmp(load_str, "YES", strlen("YES")) || + !strncasecmp(load_str, "ON", strlen("ON"))) + load = 1; + else + load = 0; + } + + if (load) { + if (libzfs_run_process("/sbin/modprobe", argv, 0)) + return (ENOEXEC); + } + + if (!libzfs_module_loaded(module)) + return (ENXIO); + } + + /* + * Device creation by udev is asynchronous and waiting may be + * required. Busy wait for 10ms and then fall back to polling every + * 10ms for the allowed timeout (default 10s, max 10m). This is + * done to optimize for the common case where the device is + * immediately available and to avoid penalizing the possible + * case where udev is slow or unable to create the device.
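+ * + * For example (an illustrative sketch using only the environment + * variables documented above), a consumer could opt in to module + * loading and wait up to 60 seconds for ZFS_DEV to appear: + * + * ZFS_MODULE_LOADING=yes ZFS_MODULE_TIMEOUT=60 zpool list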
+ */ + timeout_str = getenv("ZFS_MODULE_TIMEOUT"); + if (timeout_str) { + timeout = strtol(timeout_str, NULL, 0); + timeout = MAX(MIN(timeout, (10 * 60)), 0); /* 0 <= N <= 600 */ + } + + start = gethrtime(); + do { + fd = open(ZFS_DEV, O_RDWR); + if (fd >= 0) { + (void) close(fd); + return (0); + } else if (errno != ENOENT) { + return (errno); + } else if (NSEC2MSEC(gethrtime() - start) < busy_timeout) { + sched_yield(); + } else { + usleep(10 * MILLISEC); + } + } while (NSEC2MSEC(gethrtime() - start) < (timeout * MILLISEC)); + + return (ENOENT); +} + +int +libzfs_load_module(void) +{ + return (libzfs_load_module_impl(ZFS_DRIVER)); +} + +int +find_shares_object(differ_info_t *di) +{ + char fullpath[MAXPATHLEN]; + struct stat64 sb = { 0 }; + + (void) strlcpy(fullpath, di->dsmnt, MAXPATHLEN); + (void) strlcat(fullpath, ZDIFF_SHARESDIR, MAXPATHLEN); + + if (stat64(fullpath, &sb) != 0) { + (void) snprintf(di->errbuf, sizeof (di->errbuf), + dgettext(TEXT_DOMAIN, "Cannot stat %s"), fullpath); + return (zfs_error(di->zhp->zfs_hdl, EZFS_DIFF, di->errbuf)); + } + + di->shares = (uint64_t)sb.st_ino; + return (0); +} + +/* + * Fill given version buffer with zfs kernel version read from ZFS_SYSFS_DIR + * Returns 0 on success, and -1 on error (with errno set) + */ +int +zfs_version_kernel(char *version, int len) +{ + int _errno; + int fd; + int rlen; + + if ((fd = open(ZFS_SYSFS_DIR "/version", O_RDONLY)) == -1) + return (-1); + + if ((rlen = read(fd, version, len)) == -1) { + version[0] = '\0'; + _errno = errno; + (void) close(fd); + errno = _errno; + return (-1); + } + + version[rlen-1] = '\0'; /* discard '\n' */ + + if (close(fd) == -1) + return (-1); + + return (0); +} diff --git a/lib/libzfs_core/.gitignore b/lib/libzfs_core/.gitignore new file mode 100644 index 000000000000..c428d6369033 --- /dev/null +++ b/lib/libzfs_core/.gitignore @@ -0,0 +1 @@ +/libzfs_core.pc diff --git a/lib/libzfs_core/Makefile.am b/lib/libzfs_core/Makefile.am new file mode 100644 index 000000000000..e94ba85d275c --- /dev/null +++ b/lib/libzfs_core/Makefile.am @@ -0,0 +1,29 @@ +include $(top_srcdir)/config/Rules.am + +pkgconfig_DATA = libzfs_core.pc + +lib_LTLIBRARIES = libzfs_core.la + +USER_C = \ + libzfs_core.c + +libzfs_core_la_SOURCES = $(USER_C) + +libzfs_core_la_LIBADD = \ + $(abs_top_builddir)/lib/libzutil/libzutil.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la + +libzfs_core_la_LIBADD += $(LTLIBINTL) + +libzfs_core_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libzfs_core_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libzfs_core_la_LIBADD += -lutil -lgeom +libzfs_core_la_LDFLAGS += -version-info 3:0:0 +else +libzfs_core_la_LDFLAGS += -version-info 1:0:0 +endif diff --git a/lib/libzfs_core/libzfs_core.c b/lib/libzfs_core/libzfs_core.c new file mode 100644 index 000000000000..50b1b9dec260 --- /dev/null +++ b/lib/libzfs_core/libzfs_core.c @@ -0,0 +1,1644 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright 2017 RackTop Systems. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + */ + +/* + * LibZFS_Core (lzc) is intended to replace most functionality in libzfs. + * It has the following characteristics: + * + * - Thread Safe. libzfs_core is accessible concurrently from multiple + * threads. This is accomplished primarily by avoiding global data + * (e.g. caching). Since it's thread-safe, there is no reason for a + * process to have multiple libzfs "instances". Therefore, we store + * our few pieces of data (e.g. the file descriptor) in global + * variables. The fd is reference-counted so that the libzfs_core + * library can be "initialized" multiple times (e.g. by different + * consumers within the same process). + * + * - Committed Interface. The libzfs_core interface will be committed, + * therefore consumers can compile against it and be confident that + * their code will continue to work on future releases of this code. + * Currently, the interface is Evolving (not Committed), but we intend + * to commit to it once it is more complete and we determine that it + * meets the needs of all consumers. + * + * - Programmatic Error Handling. libzfs_core communicates errors with + * defined error numbers, and doesn't print anything to stdout/stderr. + * + * - Thin Layer. libzfs_core is a thin layer, marshaling arguments + * to/from the kernel ioctls. There is generally a 1:1 correspondence + * between libzfs_core functions and ioctls to ZFS_DEV. + * + * - Clear Atomicity. Because libzfs_core functions are generally 1:1 + * with kernel ioctls, and kernel ioctls are generally atomic, each + * libzfs_core function is atomic. For example, creating multiple + * snapshots with a single call to lzc_snapshot() is atomic -- it + * can't fail with only some of the requested snapshots created, even + * in the event of power loss or system crash. + * + * - Continued libzfs Support. Some higher-level operations (e.g. + * support for "zfs send -R") are too complicated to fit the scope of + * libzfs_core. This functionality will continue to live in libzfs. + * Where appropriate, libzfs will use the underlying atomic operations + * of libzfs_core. For example, libzfs may implement "zfs send -R | + * zfs receive" by using individual "send one snapshot", rename, + * destroy, and "receive one snapshot" operations in libzfs_core. + * /sbin/zfs and /sbin/zpool will link with both libzfs and + * libzfs_core. Other consumers should aim to use only libzfs_core, + * since that will be the supported, stable interface going forwards.
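+ * + * A typical consumer is therefore just (a minimal sketch; error + * handling elided): + * + * if (libzfs_core_init() == 0) { + * ... call lzc_*() functions ... + * libzfs_core_fini(); + * }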
+ */ + +#include +#include +#include +#include +#include +#ifdef ZFS_DEBUG +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int g_fd = -1; +static pthread_mutex_t g_lock = PTHREAD_MUTEX_INITIALIZER; +static int g_refcount; + +#ifdef ZFS_DEBUG +static zfs_ioc_t fail_ioc_cmd = ZFS_IOC_LAST; +static zfs_errno_t fail_ioc_err; + +static void +libzfs_core_debug_ioc(void) +{ + /* + * To test running newer user space binaries with kernels + * that don't yet support an ioctl or a new ioctl arg, we + * provide an override to intentionally fail an ioctl. + * + * USAGE: + * The override variable, ZFS_IOC_TEST, is of the form "cmd:err" + * + * For example, to fail a ZFS_IOC_POOL_CHECKPOINT with a + * ZFS_ERR_IOC_CMD_UNAVAIL, the string would be "0x5a4d:1029" + * + * $ sudo sh -c "ZFS_IOC_TEST=0x5a4d:1029 zpool checkpoint tank" + * cannot checkpoint 'tank': the loaded zfs module does not support + * this operation. A reboot may be required to enable this operation. + */ + if (fail_ioc_cmd == ZFS_IOC_LAST) { + char *ioc_test = getenv("ZFS_IOC_TEST"); + unsigned int ioc_num = 0, ioc_err = 0; + + if (ioc_test != NULL && + sscanf(ioc_test, "%i:%i", &ioc_num, &ioc_err) == 2 && + ioc_num < ZFS_IOC_LAST) { + fail_ioc_cmd = ioc_num; + fail_ioc_err = ioc_err; + } + } +} +#endif + +int +libzfs_core_init(void) +{ + (void) pthread_mutex_lock(&g_lock); + if (g_refcount == 0) { + g_fd = open(ZFS_DEV, O_RDWR); + if (g_fd < 0) { + (void) pthread_mutex_unlock(&g_lock); + return (errno); + } + } + g_refcount++; + +#ifdef ZFS_DEBUG + libzfs_core_debug_ioc(); +#endif + (void) pthread_mutex_unlock(&g_lock); + return (0); +} + +void +libzfs_core_fini(void) +{ + (void) pthread_mutex_lock(&g_lock); + ASSERT3S(g_refcount, >, 0); + + if (g_refcount > 0) + g_refcount--; + + if (g_refcount == 0 && g_fd != -1) { + (void) close(g_fd); + g_fd = -1; + } + (void) pthread_mutex_unlock(&g_lock); +} + +static int +lzc_ioctl(zfs_ioc_t ioc, const char *name, + nvlist_t *source, nvlist_t **resultp) +{ + zfs_cmd_t zc = {"\0"}; + int error = 0; + char *packed = NULL; + size_t size = 0; + + ASSERT3S(g_refcount, >, 0); + VERIFY3S(g_fd, !=, -1); + +#ifdef ZFS_DEBUG + if (ioc == fail_ioc_cmd) + return (fail_ioc_err); +#endif + + if (name != NULL) + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + + if (source != NULL) { + packed = fnvlist_pack(source, &size); + zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + } + + if (resultp != NULL) { + *resultp = NULL; + if (ioc == ZFS_IOC_CHANNEL_PROGRAM) { + zc.zc_nvlist_dst_size = fnvlist_lookup_uint64(source, + ZCP_ARG_MEMLIMIT); + } else { + zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); + } + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); + if (zc.zc_nvlist_dst == (uint64_t)0) { + error = ENOMEM; + goto out; + } + } + + while (zfs_ioctl_fd(g_fd, ioc, &zc) != 0) { + /* + * If ioctl exited with ENOMEM, we retry the ioctl after + * increasing the size of the destination nvlist. + * + * Channel programs that exit with ENOMEM overran the + * Lua memory sandbox; they should not be retried.
+ */ + if (errno == ENOMEM && resultp != NULL && + ioc != ZFS_IOC_CHANNEL_PROGRAM) { + free((void *)(uintptr_t)zc.zc_nvlist_dst); + zc.zc_nvlist_dst_size *= 2; + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); + if (zc.zc_nvlist_dst == (uint64_t)0) { + error = ENOMEM; + goto out; + } + } else { + error = errno; + break; + } + } + if (zc.zc_nvlist_dst_filled) { + *resultp = fnvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size); + } + +out: + if (packed != NULL) + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zc.zc_nvlist_dst); + return (error); +} + +int +lzc_create(const char *fsname, enum lzc_dataset_type type, nvlist_t *props, + uint8_t *wkeydata, uint_t wkeylen) +{ + int error; + nvlist_t *hidden_args = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_int32(args, "type", (dmu_objset_type_t)type); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + + if (wkeydata != NULL) { + hidden_args = fnvlist_alloc(); + fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, + wkeylen); + fnvlist_add_nvlist(args, ZPOOL_HIDDEN_ARGS, hidden_args); + } + + error = lzc_ioctl(ZFS_IOC_CREATE, fsname, args, NULL); + nvlist_free(hidden_args); + nvlist_free(args); + return (error); +} + +int +lzc_clone(const char *fsname, const char *origin, nvlist_t *props) +{ + int error; + nvlist_t *hidden_args = NULL; + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_string(args, "origin", origin); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + error = lzc_ioctl(ZFS_IOC_CLONE, fsname, args, NULL); + nvlist_free(hidden_args); + nvlist_free(args); + return (error); +} + +int +lzc_promote(const char *fsname, char *snapnamebuf, int snapnamelen) +{ + /* + * The promote ioctl is still legacy, so we need to construct our + * own zfs_cmd_t rather than using lzc_ioctl(). + */ + zfs_cmd_t zc = {"\0"}; + + ASSERT3S(g_refcount, >, 0); + VERIFY3S(g_fd, !=, -1); + + (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); + if (zfs_ioctl_fd(g_fd, ZFS_IOC_PROMOTE, &zc) != 0) { + int error = errno; + if (error == EEXIST && snapnamebuf != NULL) + (void) strlcpy(snapnamebuf, zc.zc_string, snapnamelen); + return (error); + } + return (0); +} + +int +lzc_rename(const char *source, const char *target) +{ + zfs_cmd_t zc = {"\0"}; + int error; + + ASSERT3S(g_refcount, >, 0); + VERIFY3S(g_fd, !=, -1); + (void) strlcpy(zc.zc_name, source, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); + error = zfs_ioctl_fd(g_fd, ZFS_IOC_RENAME, &zc); + if (error != 0) + error = errno; + return (error); +} +int +lzc_destroy(const char *fsname) +{ + int error; + nvlist_t *args = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_DESTROY, fsname, args, NULL); + nvlist_free(args); + return (error); +} + +/* + * Creates snapshots. + * + * The keys in the snaps nvlist are the snapshots to be created. + * They must all be in the same pool. + * + * The props nvlist contains the properties to set. Currently only user + * properties are supported. { user:prop_name -> string value } + * + * The returned results nvlist will have an entry for each snapshot that failed. + * The value will be the (int32) error code. + * + * The return value will be 0 if all snapshots were created, otherwise it will + * be the errno of an (unspecified) snapshot that failed.
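+ * + * Example (a minimal sketch; the snapshot name is illustrative): + * + * nvlist_t *snaps = fnvlist_alloc(); + * fnvlist_add_boolean(snaps, "pool/fs@backup"); + * nvlist_t *errlist = NULL; + * int err = lzc_snapshot(snaps, NULL, &errlist); + * nvlist_free(snaps); + * nvlist_free(errlist);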
+ */ +int +lzc_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t **errlist) +{ + nvpair_t *elem; + nvlist_t *args; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + *errlist = NULL; + + /* determine the pool name */ + elem = nvlist_next_nvpair(snaps, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "snaps", snaps); + if (props != NULL) + fnvlist_add_nvlist(args, "props", props); + + error = lzc_ioctl(ZFS_IOC_SNAPSHOT, pool, args, errlist); + nvlist_free(args); + + return (error); +} + +/* + * Destroys snapshots. + * + * The keys in the snaps nvlist are the snapshots to be destroyed. + * They must all be in the same pool. + * + * Snapshots that do not exist will be silently ignored. + * + * If 'defer' is not set, and a snapshot has user holds or clones, the + * destroy operation will fail and none of the snapshots will be + * destroyed. + * + * If 'defer' is set, and a snapshot has user holds or clones, it will be + * marked for deferred destruction, and will be destroyed when the last hold + * or clone is removed/destroyed. + * + * The return value will be 0 if all snapshots were destroyed (or marked for + * later destruction if 'defer' is set) or didn't exist to begin with. + * + * Otherwise the return value will be the errno of an (unspecified) snapshot + * that failed, no snapshots will be destroyed, and the errlist will have an + * entry for each snapshot that failed. The value in the errlist will be + * the (int32) error code. + */ +int +lzc_destroy_snaps(nvlist_t *snaps, boolean_t defer, nvlist_t **errlist) +{ + nvpair_t *elem; + nvlist_t *args; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + /* determine the pool name */ + elem = nvlist_next_nvpair(snaps, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "snaps", snaps); + if (defer) + fnvlist_add_boolean(args, "defer"); + + error = lzc_ioctl(ZFS_IOC_DESTROY_SNAPS, pool, args, errlist); + nvlist_free(args); + + return (error); +} + +int +lzc_snaprange_space(const char *firstsnap, const char *lastsnap, + uint64_t *usedp) +{ + nvlist_t *args; + nvlist_t *result; + int err; + char fs[ZFS_MAX_DATASET_NAME_LEN]; + char *atp; + + /* determine the fs name */ + (void) strlcpy(fs, firstsnap, sizeof (fs)); + atp = strchr(fs, '@'); + if (atp == NULL) + return (EINVAL); + *atp = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_string(args, "firstsnap", firstsnap); + + err = lzc_ioctl(ZFS_IOC_SPACE_SNAPS, lastsnap, args, &result); + nvlist_free(args); + if (err == 0) + *usedp = fnvlist_lookup_uint64(result, "used"); + fnvlist_free(result); + + return (err); +} + +boolean_t +lzc_exists(const char *dataset) +{ + /* + * The objset_stats ioctl is still legacy, so we need to construct our + * own zfs_cmd_t rather than using lzc_ioctl(). + */ + zfs_cmd_t zc = {"\0"}; + + ASSERT3S(g_refcount, >, 0); + VERIFY3S(g_fd, !=, -1); + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + return (zfs_ioctl_fd(g_fd, ZFS_IOC_OBJSET_STATS, &zc) == 0); +} + +/* + * outnvl is unused. + * It was added to preserve the function signature in case it is + * needed in the future.
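+ * + * Example (a sketch; the "force" key is an assumption about the + * kernel-side handler, and the pool name is illustrative): + * + * nvlist_t *innvl = fnvlist_alloc(); + * fnvlist_add_boolean_value(innvl, "force", B_TRUE); + * (void) lzc_sync("tank", innvl, NULL); + * nvlist_free(innvl);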
+ */ +/*ARGSUSED*/ +int +lzc_sync(const char *pool_name, nvlist_t *innvl, nvlist_t **outnvl) +{ + return (lzc_ioctl(ZFS_IOC_POOL_SYNC, pool_name, innvl, NULL)); +} + +/* + * Create "user holds" on snapshots. If there is a hold on a snapshot, + * the snapshot cannot be destroyed. (However, it can be marked for deletion + * by lzc_destroy_snaps(defer=B_TRUE).) + * + * The keys in the nvlist are snapshot names. + * The snapshots must all be in the same pool. + * The value is the name of the hold (string type). + * + * If cleanup_fd is not -1, it must be the result of open(ZFS_DEV, O_EXCL). + * In this case, when the cleanup_fd is closed (including on process + * termination), the holds will be released. If the system is shut down + * uncleanly, the holds will be released when the pool is next opened + * or imported. + * + * Holds for snapshots which don't exist will be skipped and have an entry + * added to errlist, but will not cause an overall failure. + * + * The return value will be 0 if all holds, for snapshots that existed, + * were successfully created. + * + * Otherwise the return value will be the errno of an (unspecified) hold that + * failed and no holds will be created. + * + * In all cases the errlist will have an entry for each hold that failed + * (name = snapshot), with its value being the error code (int32). + */ +int +lzc_hold(nvlist_t *holds, int cleanup_fd, nvlist_t **errlist) +{ + char pool[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *args; + nvpair_t *elem; + int error; + + /* determine the pool name */ + elem = nvlist_next_nvpair(holds, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + args = fnvlist_alloc(); + fnvlist_add_nvlist(args, "holds", holds); + if (cleanup_fd != -1) + fnvlist_add_int32(args, "cleanup_fd", cleanup_fd); + + error = lzc_ioctl(ZFS_IOC_HOLD, pool, args, errlist); + nvlist_free(args); + return (error); +} + +/* + * Release "user holds" on snapshots. If the snapshot has been marked for + * deferred destroy (by lzc_destroy_snaps(defer=B_TRUE)), it does not have + * any clones, and all the user holds are removed, then the snapshot will be + * destroyed. + * + * The keys in the nvlist are snapshot names. + * The snapshots must all be in the same pool. + * The value is an nvlist whose keys are the holds to remove. + * + * Holds which failed to release because they didn't exist will have an entry + * added to errlist, but will not cause an overall failure. + * + * The return value will be 0 if the holds nvlist was empty, or if all holds + * that existed were successfully removed. + * + * Otherwise the return value will be the errno of an (unspecified) hold that + * failed to release and no holds will be released. + * + * In all cases the errlist will have an entry for each hold that failed + * to release. + */ +int +lzc_release(nvlist_t *holds, nvlist_t **errlist) +{ + char pool[ZFS_MAX_DATASET_NAME_LEN]; + nvpair_t *elem; + + /* determine the pool name */ + elem = nvlist_next_nvpair(holds, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/@")] = '\0'; + + return (lzc_ioctl(ZFS_IOC_RELEASE, pool, holds, errlist)); +} + +/* + * Retrieve list of user holds on the specified snapshot. + * + * On success, *holdsp will be set to an nvlist which the caller must free. + * The keys are the names of the holds, and the value is the creation time + * of the hold (uint64) in seconds since the epoch.
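+ * + * Example (a sketch; the snapshot name is illustrative): print the name + * of each hold on a snapshot: + * + * nvlist_t *holds = NULL; + * if (lzc_get_holds("pool/fs@snap", &holds) == 0) { + * nvpair_t *p; + * for (p = nvlist_next_nvpair(holds, NULL); p != NULL; + * p = nvlist_next_nvpair(holds, p)) + * (void) printf("%s\n", nvpair_name(p)); + * nvlist_free(holds); + * }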
+ */ +int +lzc_get_holds(const char *snapname, nvlist_t **holdsp) +{ + return (lzc_ioctl(ZFS_IOC_GET_HOLDS, snapname, NULL, holdsp)); +} + +/* + * Generate a zfs send stream for the specified snapshot and write it to + * the specified file descriptor. + * + * "snapname" is the full name of the snapshot to send (e.g. "pool/fs@snap") + * + * If "from" is NULL, a full (non-incremental) stream will be sent. + * If "from" is non-NULL, it must be the full name of a snapshot or + * bookmark to send an incremental from (e.g. "pool/fs@earlier_snap" or + * "pool/fs#earlier_bmark"). If non-NULL, the specified snapshot or + * bookmark must represent an earlier point in the history of "snapname". + * It can be an earlier snapshot in the same filesystem or zvol as "snapname", + * or it can be the origin of "snapname"'s filesystem, or an earlier + * snapshot in the origin, etc. + * + * "fd" is the file descriptor to write the send stream to. + * + * If "flags" contains LZC_SEND_FLAG_LARGE_BLOCK, the stream is permitted + * to contain DRR_WRITE records with drr_length > 128K, and DRR_OBJECT + * records with drr_blksz > 128K. + * + * If "flags" contains LZC_SEND_FLAG_EMBED_DATA, the stream is permitted + * to contain DRR_WRITE_EMBEDDED records with drr_etype==BP_EMBEDDED_TYPE_DATA, + * which the receiving system must support (as indicated by support + * for the "embedded_data" feature). + * + * If "flags" contains LZC_SEND_FLAG_COMPRESS, the stream is generated by using + * compressed WRITE records for blocks which are compressed on disk and in + * memory. If the lz4_compress feature is active on the sending system, then + * the receiving system must have that feature enabled as well. + * + * If "flags" contains LZC_SEND_FLAG_RAW, the stream is generated, for encrypted + * datasets, by sending data exactly as it exists on disk. This allows backups + * to be taken even if encryption keys are not currently loaded. + */ +int +lzc_send(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, + NULL)); +} + +int +lzc_send_redacted(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, const char *redactbook) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, 0, 0, + redactbook)); +} + +int +lzc_send_resume(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff) +{ + return (lzc_send_resume_redacted(snapname, from, fd, flags, resumeobj, + resumeoff, NULL)); +} + +/* + * snapname: The name of the "tosnap", or the snapshot whose contents we are + * sending. + * from: The name of the "fromsnap", or the incremental source. + * fd: File descriptor to write the stream to. + * flags: flags that determine features to be used by the stream. + * resumeobj: Object to resume from, for resuming send. + * resumeoff: Offset to resume from, for resuming send. + * redactnv: nvlist of string -> boolean(ignored) containing the names of all + * the snapshots that we should redact with respect to. + * redactbook: Name of the redaction bookmark to create.
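+ * + * For example (a sketch; the snapshot name is illustrative), the plain + * lzc_send() defined above forwards a full send to stdout as: + * + * lzc_send_resume_redacted("pool/fs@snap", NULL, STDOUT_FILENO, 0, + * 0, 0, NULL);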
+ */ +int +lzc_send_resume_redacted(const char *snapname, const char *from, int fd, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook) +{ + nvlist_t *args; + int err; + + args = fnvlist_alloc(); + fnvlist_add_int32(args, "fd", fd); + if (from != NULL) + fnvlist_add_string(args, "fromsnap", from); + if (flags & LZC_SEND_FLAG_LARGE_BLOCK) + fnvlist_add_boolean(args, "largeblockok"); + if (flags & LZC_SEND_FLAG_EMBED_DATA) + fnvlist_add_boolean(args, "embedok"); + if (flags & LZC_SEND_FLAG_COMPRESS) + fnvlist_add_boolean(args, "compressok"); + if (flags & LZC_SEND_FLAG_RAW) + fnvlist_add_boolean(args, "rawok"); + if (flags & LZC_SEND_FLAG_SAVED) + fnvlist_add_boolean(args, "savedok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + } + if (redactbook != NULL) + fnvlist_add_string(args, "redactbook", redactbook); + + err = lzc_ioctl(ZFS_IOC_SEND_NEW, snapname, args, NULL); + nvlist_free(args); + return (err); +} + +/* + * "from" can be NULL, a snapshot, or a bookmark. + * + * If from is NULL, a full (non-incremental) stream will be estimated. This + * is calculated very efficiently. + * + * If from is a snapshot, lzc_send_space uses the deadlists attached to + * each snapshot to efficiently estimate the stream size. + * + * If from is a bookmark, the indirect blocks in the destination snapshot + * are traversed, looking for blocks with a birth time after the creation TXG of + * the snapshot this bookmark was created from. This will result in + * significantly more I/O and be less efficient than a send space estimation on + * an equivalent snapshot. This process is also used if redact_snaps is + * non-null.
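+ * + * Example (a sketch; the snapshot names are illustrative): estimate the + * size of the incremental stream from @old to @new using the + * lzc_send_space() wrapper below: + * + * uint64_t space; + * int err = lzc_send_space("pool/fs@new", "pool/fs@old", 0, &space);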
+ */ +int +lzc_send_space_resume_redacted(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t resumeobj, uint64_t resumeoff, + uint64_t resume_bytes, const char *redactbook, int fd, uint64_t *spacep) +{ + nvlist_t *args; + nvlist_t *result; + int err; + + args = fnvlist_alloc(); + if (from != NULL) + fnvlist_add_string(args, "from", from); + if (flags & LZC_SEND_FLAG_LARGE_BLOCK) + fnvlist_add_boolean(args, "largeblockok"); + if (flags & LZC_SEND_FLAG_EMBED_DATA) + fnvlist_add_boolean(args, "embedok"); + if (flags & LZC_SEND_FLAG_COMPRESS) + fnvlist_add_boolean(args, "compressok"); + if (flags & LZC_SEND_FLAG_RAW) + fnvlist_add_boolean(args, "rawok"); + if (resumeobj != 0 || resumeoff != 0) { + fnvlist_add_uint64(args, "resume_object", resumeobj); + fnvlist_add_uint64(args, "resume_offset", resumeoff); + fnvlist_add_uint64(args, "bytes", resume_bytes); + } + if (redactbook != NULL) + fnvlist_add_string(args, "redactbook", redactbook); + if (fd != -1) + fnvlist_add_int32(args, "fd", fd); + + err = lzc_ioctl(ZFS_IOC_SEND_SPACE, snapname, args, &result); + nvlist_free(args); + if (err == 0) + *spacep = fnvlist_lookup_uint64(result, "space"); + nvlist_free(result); + return (err); +} + +int +lzc_send_space(const char *snapname, const char *from, + enum lzc_send_flags flags, uint64_t *spacep) +{ + return (lzc_send_space_resume_redacted(snapname, from, flags, 0, 0, 0, + NULL, -1, spacep)); +} + +static int +recv_read(int fd, void *buf, int ilen) +{ + char *cp = buf; + int rv; + int len = ilen; + + do { + rv = read(fd, cp, len); + cp += rv; + len -= rv; + } while (rv > 0); + + if (rv < 0 || len != 0) + return (EIO); + + return (0); +} + +/* + * Linux adds ZFS_IOC_RECV_NEW for resumable and raw streams and preserves the + * legacy ZFS_IOC_RECV user/kernel interface. The new interface supports all + * stream options but is currently only used for resumable streams. This way + * updated user space utilities will interoperate with older kernel modules. + * + * Non-Linux OpenZFS platforms have opted to modify the legacy interface. + */ +static int +recv_impl(const char *snapname, nvlist_t *recvdprops, nvlist_t *localprops, + uint8_t *wkeydata, uint_t wkeylen, const char *origin, boolean_t force, + boolean_t resumable, boolean_t raw, int input_fd, + const dmu_replay_record_t *begin_record, uint64_t *read_bytes, + uint64_t *errflags, nvlist_t **errors) +{ + dmu_replay_record_t drr; + char fsname[MAXPATHLEN]; + char *atp; + int error; + boolean_t payload = B_FALSE; + + ASSERT3S(g_refcount, >, 0); + VERIFY3S(g_fd, !=, -1); + + /* Set 'fsname' to the name of the containing filesystem */ + (void) strlcpy(fsname, snapname, sizeof (fsname)); + atp = strchr(fsname, '@'); + if (atp == NULL) + return (EINVAL); + *atp = '\0'; + + /* If the fs does not exist, try its parent. */ + if (!lzc_exists(fsname)) { + char *slashp = strrchr(fsname, '/'); + if (slashp == NULL) + return (ENOENT); + *slashp = '\0'; + } + + /* + * The begin_record is normally a non-byteswapped BEGIN record. + * For resumable streams it may be set to any non-byteswapped + * dmu_replay_record_t. + */ + if (begin_record == NULL) { + error = recv_read(input_fd, &drr, sizeof (drr)); + if (error != 0) + return (error); + } else { + drr = *begin_record; + payload = (begin_record->drr_payloadlen != 0); + } + + /* + * All receives with a payload should use the new interface.
+ */ + if (resumable || raw || wkeydata != NULL || payload) { + nvlist_t *outnvl = NULL; + nvlist_t *innvl = fnvlist_alloc(); + + fnvlist_add_string(innvl, "snapname", snapname); + + if (recvdprops != NULL) + fnvlist_add_nvlist(innvl, "props", recvdprops); + + if (localprops != NULL) + fnvlist_add_nvlist(innvl, "localprops", localprops); + + if (wkeydata != NULL) { + /* + * wkeydata must be placed in the special + * ZPOOL_HIDDEN_ARGS nvlist so that it + * will not be printed to the zpool history. + */ + nvlist_t *hidden_args = fnvlist_alloc(); + fnvlist_add_uint8_array(hidden_args, "wkeydata", + wkeydata, wkeylen); + fnvlist_add_nvlist(innvl, ZPOOL_HIDDEN_ARGS, + hidden_args); + nvlist_free(hidden_args); + } + + if (origin != NULL && strlen(origin)) + fnvlist_add_string(innvl, "origin", origin); + + fnvlist_add_byte_array(innvl, "begin_record", + (uchar_t *)&drr, sizeof (drr)); + + fnvlist_add_int32(innvl, "input_fd", input_fd); + + if (force) + fnvlist_add_boolean(innvl, "force"); + + if (resumable) + fnvlist_add_boolean(innvl, "resumable"); + + + error = lzc_ioctl(ZFS_IOC_RECV_NEW, fsname, innvl, &outnvl); + + if (error == 0 && read_bytes != NULL) + error = nvlist_lookup_uint64(outnvl, "read_bytes", + read_bytes); + + if (error == 0 && errflags != NULL) + error = nvlist_lookup_uint64(outnvl, "error_flags", + errflags); + + if (error == 0 && errors != NULL) { + nvlist_t *nvl; + error = nvlist_lookup_nvlist(outnvl, "errors", &nvl); + if (error == 0) + *errors = fnvlist_dup(nvl); + } + + fnvlist_free(innvl); + fnvlist_free(outnvl); + } else { + zfs_cmd_t zc = {"\0"}; + char *packed = NULL; + size_t size; + + ASSERT3S(g_refcount, >, 0); + + (void) strlcpy(zc.zc_name, fsname, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + + if (recvdprops != NULL) { + packed = fnvlist_pack(recvdprops, &size); + zc.zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + } + + if (localprops != NULL) { + packed = fnvlist_pack(localprops, &size); + zc.zc_nvlist_conf = (uint64_t)(uintptr_t)packed; + zc.zc_nvlist_conf_size = size; + } + + if (origin != NULL) + (void) strlcpy(zc.zc_string, origin, + sizeof (zc.zc_string)); + + ASSERT3S(drr.drr_type, ==, DRR_BEGIN); + zc.zc_begin_record = drr.drr_u.drr_begin; + zc.zc_guid = force; + zc.zc_cookie = input_fd; + zc.zc_cleanup_fd = -1; + zc.zc_action_handle = 0; + + zc.zc_nvlist_dst_size = 128 * 1024; + zc.zc_nvlist_dst = (uint64_t)(uintptr_t) + malloc(zc.zc_nvlist_dst_size); + + error = zfs_ioctl_fd(g_fd, ZFS_IOC_RECV, &zc); + if (error != 0) { + error = errno; + } else { + if (read_bytes != NULL) + *read_bytes = zc.zc_cookie; + + if (errflags != NULL) + *errflags = zc.zc_obj; + + if (errors != NULL) + VERIFY0(nvlist_unpack( + (void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size, errors, KM_SLEEP)); + } + + if (packed != NULL) + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zc.zc_nvlist_dst); + } + + return (error); +} + +/* + * The simplest receive case: receive from the specified fd, creating the + * specified snapshot. Apply the specified properties as "received" properties + * (which can be overridden by locally-set properties). If the stream is a + * clone, its origin snapshot must be specified by 'origin'. The 'force' + * flag will cause the target filesystem to be rolled back or destroyed if + * necessary to receive. + * + * Return 0 on success or an errno on failure. + * + * Note: this interface does not work on dedup'd streams + * (those with DMU_BACKUP_FEATURE_DEDUP). 
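+ * + * Example (a sketch; the snapshot name is illustrative): receive a + * non-raw, non-forced stream from stdin: + * + * int err = lzc_receive("pool/fs@snap", NULL, NULL, B_FALSE, + * B_FALSE, STDIN_FILENO);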
+ */ +int +lzc_receive(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, boolean_t raw, int fd) +{ + return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, + B_FALSE, raw, fd, NULL, NULL, NULL, NULL)); +} + +/* + * Like lzc_receive, but if the receive fails due to premature stream + * termination, the intermediate state will be preserved on disk. In this + * case, ECKSUM will be returned. The receive may subsequently be resumed + * with a resuming send stream generated by lzc_send_resume(). + */ +int +lzc_receive_resumable(const char *snapname, nvlist_t *props, const char *origin, + boolean_t force, boolean_t raw, int fd) +{ + return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, + B_TRUE, raw, fd, NULL, NULL, NULL, NULL)); +} + +/* + * Like lzc_receive, but allows the caller to read the begin record and then + * pass it in. That could be useful if the caller wants to derive, for example, + * the snapname or the origin parameters based on the information contained in + * the begin record. + * The begin record must be in its original form as read from the stream, + * in other words, it should not be byteswapped. + * + * The 'resumable' parameter allows the caller to obtain the same behavior as + * with lzc_receive_resumable. + */ +int +lzc_receive_with_header(const char *snapname, nvlist_t *props, + const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, + int fd, const dmu_replay_record_t *begin_record) +{ + if (begin_record == NULL) + return (EINVAL); + + return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, + resumable, raw, fd, begin_record, NULL, NULL, NULL)); +} + +/* + * Like lzc_receive, but allows the caller to pass all supported arguments + * and retrieve all values returned. The only additional input parameter + * is 'cleanup_fd' which is used to set a cleanup-on-exit file descriptor. + * + * The following parameters all provide return values. Several may be set + * in the failure case and will contain additional information. + * + * The 'read_bytes' value will be set to the total number of bytes read. + * + * The 'errflags' value will contain zprop_errflags_t flags which are + * used to describe any failures. + * + * The 'action_handle' and 'cleanup_fd' are no longer used, and are ignored. + * + * The 'errors' nvlist contains an entry for each unapplied received + * property. Callers are responsible for freeing this nvlist. + */ +int lzc_receive_one(const char *snapname, nvlist_t *props, + const char *origin, boolean_t force, boolean_t resumable, boolean_t raw, + int input_fd, const dmu_replay_record_t *begin_record, int cleanup_fd, + uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, + nvlist_t **errors) +{ + return (recv_impl(snapname, props, NULL, NULL, 0, origin, force, + resumable, raw, input_fd, begin_record, + read_bytes, errflags, errors)); +} + +/* + * Like lzc_receive_one, but allows the caller to pass an additional 'cmdprops' + * argument. + * + * The 'cmdprops' nvlist contains both override ('zfs receive -o') and + * exclude ('zfs receive -x') properties.
Callers are responsible for freeing + * this nvlist. + */ +int lzc_receive_with_cmdprops(const char *snapname, nvlist_t *props, + nvlist_t *cmdprops, uint8_t *wkeydata, uint_t wkeylen, const char *origin, + boolean_t force, boolean_t resumable, boolean_t raw, int input_fd, + const dmu_replay_record_t *begin_record, int cleanup_fd, + uint64_t *read_bytes, uint64_t *errflags, uint64_t *action_handle, + nvlist_t **errors) +{ + return (recv_impl(snapname, props, cmdprops, wkeydata, wkeylen, origin, + force, resumable, raw, input_fd, begin_record, + read_bytes, errflags, errors)); +} + +/* + * Roll back this filesystem or volume to its most recent snapshot. + * If snapnamebuf is not NULL, it will be filled in with the name + * of the most recent snapshot. + * Note that the latest snapshot may change if a new one is concurrently + * created or the current one is destroyed. lzc_rollback_to can be used + * to roll back to a specific latest snapshot. + * + * Return 0 on success or an errno on failure. + */ +int +lzc_rollback(const char *fsname, char *snapnamebuf, int snapnamelen) +{ + nvlist_t *args; + nvlist_t *result; + int err; + + args = fnvlist_alloc(); + err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); + nvlist_free(args); + if (err == 0 && snapnamebuf != NULL) { + const char *snapname = fnvlist_lookup_string(result, "target"); + (void) strlcpy(snapnamebuf, snapname, snapnamelen); + } + nvlist_free(result); + + return (err); +} + +/* + * Roll back this filesystem or volume to the specified snapshot, + * if possible. + * + * Return 0 on success or an errno on failure. + */ +int +lzc_rollback_to(const char *fsname, const char *snapname) +{ + nvlist_t *args; + nvlist_t *result; + int err; + + args = fnvlist_alloc(); + fnvlist_add_string(args, "target", snapname); + err = lzc_ioctl(ZFS_IOC_ROLLBACK, fsname, args, &result); + nvlist_free(args); + nvlist_free(result); + return (err); +} + +/* + * Creates new bookmarks from an existing snapshot or bookmark. + * + * The bookmarks nvlist maps from the full name of the new bookmark to + * the full name of the source snapshot or bookmark. + * All the bookmarks and snapshots must be in the same pool. + * The new bookmark names must be unique. + * => see function dsl_bookmark_create_nvl_validate + * + * The returned results nvlist will have an entry for each bookmark that failed. + * The value will be the (int32) error code. + * + * The return value will be 0 if all bookmarks were created, otherwise it will + * be the errno of an (undetermined) bookmark that failed. + */ +int +lzc_bookmark(nvlist_t *bookmarks, nvlist_t **errlist) +{ + nvpair_t *elem; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + /* determine pool name from first bookmark */ + elem = nvlist_next_nvpair(bookmarks, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/#")] = '\0'; + + error = lzc_ioctl(ZFS_IOC_BOOKMARK, pool, bookmarks, errlist); + + return (error); +} + +/* + * Retrieve bookmarks. + * + * Retrieve the list of bookmarks for the given file system. The props + * parameter is an nvlist of property names (with no values) that will be + * returned for each bookmark.
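+ * + * For example (a sketch), a caller interested in two properties per + * bookmark could pass: + * + * nvlist_t *props = fnvlist_alloc(); + * fnvlist_add_boolean(props, "guid"); + * fnvlist_add_boolean(props, "createtxg");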
+ * + * The following are valid properties on bookmarks, most of which are numbers + * (represented as uint64 in the nvlist), except redact_snaps, which is a + * uint64 array, and redact_complete, which is a boolean. + * + * "guid" - globally unique identifier of the snapshot it refers to + * "createtxg" - txg when the snapshot it refers to was created + * "creation" - timestamp when the snapshot it refers to was created + * "ivsetguid" - IVset guid for identifying encrypted snapshots + * "redact_snaps" - list of guids of the redaction snapshots for the specified + * bookmark. If the bookmark is not a redaction bookmark, the nvlist will + * not contain an entry for this value. If it is redacted with respect to + * no snapshots, it will contain value -> NULL uint64 array + * "redact_complete" - boolean value; true if the redaction bookmark is + * complete, false otherwise. + * + * The format of the returned nvlist is as follows: + * -> { + * -> { + * "value" -> uint64 + * } + * ... + * "redact_snaps" -> { + * "value" -> uint64 array + * } + * "redact_complete" -> { + * "value" -> boolean value + * } + * } + */ +int +lzc_get_bookmarks(const char *fsname, nvlist_t *props, nvlist_t **bmarks) +{ + return (lzc_ioctl(ZFS_IOC_GET_BOOKMARKS, fsname, props, bmarks)); +} + +/* + * Get bookmark properties. + * + * Given a bookmark's full name, retrieve all properties for the bookmark. + * + * The format of the returned property list is as follows: + * { + * -> { + * "value" -> uint64 + * } + * ... + * "redact_snaps" -> { + * "value" -> uint64 array + * } + */ +int +lzc_get_bookmark_props(const char *bookmark, nvlist_t **props) +{ + int error; + + nvlist_t *innvl = fnvlist_alloc(); + error = lzc_ioctl(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, innvl, props); + fnvlist_free(innvl); + + return (error); +} + +/* + * Destroys bookmarks. + * + * The keys in the bmarks nvlist are the bookmarks to be destroyed. + * They must all be in the same pool. Bookmarks are specified as + * #. + * + * Bookmarks that do not exist will be silently ignored. + * + * The return value will be 0 if all bookmarks that existed were destroyed. + * + * Otherwise the return value will be the errno of an (undetermined) bookmark + * that failed, no bookmarks will be destroyed, and the errlist will have an + * entry for each bookmark that failed. The value in the errlist will be + * the (int32) error code. + */ +int +lzc_destroy_bookmarks(nvlist_t *bmarks, nvlist_t **errlist) +{ + nvpair_t *elem; + int error; + char pool[ZFS_MAX_DATASET_NAME_LEN]; + + /* determine the pool name */ + elem = nvlist_next_nvpair(bmarks, NULL); + if (elem == NULL) + return (0); + (void) strlcpy(pool, nvpair_name(elem), sizeof (pool)); + pool[strcspn(pool, "/#")] = '\0'; + + error = lzc_ioctl(ZFS_IOC_DESTROY_BOOKMARKS, pool, bmarks, errlist); + + return (error); +} + +static int +lzc_channel_program_impl(const char *pool, const char *program, boolean_t sync, + uint64_t instrlimit, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) +{ + int error; + nvlist_t *args; + + args = fnvlist_alloc(); + fnvlist_add_string(args, ZCP_ARG_PROGRAM, program); + fnvlist_add_nvlist(args, ZCP_ARG_ARGLIST, argnvl); + fnvlist_add_boolean_value(args, ZCP_ARG_SYNC, sync); + fnvlist_add_uint64(args, ZCP_ARG_INSTRLIMIT, instrlimit); + fnvlist_add_uint64(args, ZCP_ARG_MEMLIMIT, memlimit); + error = lzc_ioctl(ZFS_IOC_CHANNEL_PROGRAM, pool, args, outnvl); + fnvlist_free(args); + + return (error); +} + +/* + * Executes a channel program.
+ * + * If this function returns 0 the channel program was successfully loaded and + * ran without failing. Note that individual commands the channel program ran + * may have failed and the channel program is responsible for reporting such + * errors through outnvl if they are important. + * + * This method may also return: + * + * EINVAL The program contains syntax errors, or an invalid memory or time + * limit was given. No part of the channel program was executed. + * If caused by syntax errors, 'outnvl' contains information about the + * errors. + * + * ECHRNG The program was executed, but encountered a runtime error, such as + * calling a function with incorrect arguments, invoking the error() + * function directly, failing an assert() command, etc. Some portion + * of the channel program may have executed and committed changes. + * Information about the failure can be found in 'outnvl'. + * + * ENOMEM The program fully executed, but the output buffer was not large + * enough to store the returned value. No output is returned through + * 'outnvl'. + * + * ENOSPC The program was terminated because it exceeded its memory usage + * limit. Some portion of the channel program may have executed and + * committed changes to disk. No output is returned through 'outnvl'. + * + * ETIME The program was terminated because it exceeded its Lua instruction + * limit. Some portion of the channel program may have executed and + * committed changes to disk. No output is returned through 'outnvl'. + */ +int +lzc_channel_program(const char *pool, const char *program, uint64_t instrlimit, + uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) +{ + return (lzc_channel_program_impl(pool, program, B_TRUE, instrlimit, + memlimit, argnvl, outnvl)); +} + +/* + * Creates a checkpoint for the specified pool. + * + * If this function returns 0 the pool was successfully checkpointed. + * + * This method may also return: + * + * ZFS_ERR_CHECKPOINT_EXISTS + * The pool already has a checkpoint. A pool can have at most one + * checkpoint at any given time. + * + * ZFS_ERR_DISCARDING_CHECKPOINT + * ZFS is in the middle of discarding a checkpoint for this pool. + * The pool can be checkpointed again once the discard is done. + * + * ZFS_ERR_DEVRM_IN_PROGRESS + * A vdev is currently being removed. The pool cannot be + * checkpointed until the device removal is done. + * + * ZFS_ERR_VDEV_TOO_BIG + * One or more top-level vdevs exceed the maximum vdev size + * supported for this feature. + */ +int +lzc_pool_checkpoint(const char *pool) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + error = lzc_ioctl(ZFS_IOC_POOL_CHECKPOINT, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +/* + * Discard the checkpoint from the specified pool. + * + * If this function returns 0 the checkpoint was successfully discarded. + * + * This method may also return: + * + * ZFS_ERR_NO_CHECKPOINT + * The pool does not have a checkpoint. + * + * ZFS_ERR_DISCARDING_CHECKPOINT + * ZFS is already in the middle of discarding the checkpoint. + */ +int +lzc_pool_checkpoint_discard(const char *pool) +{ + int error; + + nvlist_t *result = NULL; + nvlist_t *args = fnvlist_alloc(); + + error = lzc_ioctl(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, args, &result); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +/* + * Executes a read-only channel program.
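(Editorial usage sketch, not part of the vendored source: running a trivial Lua program with lzc_channel_program() above. The instruction and memory limits are illustrative values only.)

    #include <libzfs_core.h>
    #include <libnvpair.h>

    static int
    example_channel_program(const char *pool)
    {
        nvlist_t *argnvl = fnvlist_alloc();
        nvlist_t *outnvl = NULL;
        int err;

        /* 10M Lua instructions and 10MB of memory, as example limits */
        err = lzc_channel_program(pool, "return {}",
            10 * 1000 * 1000, 10 * 1024 * 1024, argnvl, &outnvl);

        fnvlist_free(argnvl);
        nvlist_free(outnvl);
        return (err);
    }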
+ * + * A read-only channel program works programmatically the same way as a + * normal channel program executed with lzc_channel_program(). The only + * difference is it runs exclusively in open-context and therefore can + * return faster. The downside to that is that the program cannot change + * on-disk state by calling functions from the zfs.sync submodule. + * + * The return values of this function (and their meaning) are exactly the + * same as the ones described in lzc_channel_program(). + */ +int +lzc_channel_program_nosync(const char *pool, const char *program, + uint64_t timeout, uint64_t memlimit, nvlist_t *argnvl, nvlist_t **outnvl) +{ + return (lzc_channel_program_impl(pool, program, B_FALSE, timeout, + memlimit, argnvl, outnvl)); +} + +/* + * Performs key management functions. + * + * crypto_cmd should be a value from dcp_cmd_t. If the command specifies to + * load or change a wrapping key, the key should be specified in the + * hidden_args nvlist so that it is not logged. + */ +int +lzc_load_key(const char *fsname, boolean_t noop, uint8_t *wkeydata, + uint_t wkeylen) +{ + int error; + nvlist_t *ioc_args; + nvlist_t *hidden_args; + + if (wkeydata == NULL) + return (EINVAL); + + ioc_args = fnvlist_alloc(); + hidden_args = fnvlist_alloc(); + fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, wkeylen); + fnvlist_add_nvlist(ioc_args, ZPOOL_HIDDEN_ARGS, hidden_args); + if (noop) + fnvlist_add_boolean(ioc_args, "noop"); + error = lzc_ioctl(ZFS_IOC_LOAD_KEY, fsname, ioc_args, NULL); + nvlist_free(hidden_args); + nvlist_free(ioc_args); + + return (error); +} + +int +lzc_unload_key(const char *fsname) +{ + return (lzc_ioctl(ZFS_IOC_UNLOAD_KEY, fsname, NULL, NULL)); +} + +int +lzc_change_key(const char *fsname, uint64_t crypt_cmd, nvlist_t *props, + uint8_t *wkeydata, uint_t wkeylen) +{ + int error; + nvlist_t *ioc_args = fnvlist_alloc(); + nvlist_t *hidden_args = NULL; + + fnvlist_add_uint64(ioc_args, "crypt_cmd", crypt_cmd); + + if (wkeydata != NULL) { + hidden_args = fnvlist_alloc(); + fnvlist_add_uint8_array(hidden_args, "wkeydata", wkeydata, + wkeylen); + fnvlist_add_nvlist(ioc_args, ZPOOL_HIDDEN_ARGS, hidden_args); + } + + if (props != NULL) + fnvlist_add_nvlist(ioc_args, "props", props); + + error = lzc_ioctl(ZFS_IOC_CHANGE_KEY, fsname, ioc_args, NULL); + nvlist_free(hidden_args); + nvlist_free(ioc_args); + + return (error); +} + +int +lzc_reopen(const char *pool_name, boolean_t scrub_restart) +{ + nvlist_t *args = fnvlist_alloc(); + int error; + + fnvlist_add_boolean_value(args, "scrub_restart", scrub_restart); + + error = lzc_ioctl(ZFS_IOC_POOL_REOPEN, pool_name, args, NULL); + nvlist_free(args); + return (error); +} + +/* + * Changes initializing state. + * + * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID. + * The key is ignored. + * + * If there are errors related to vdev arguments, per-vdev errors are returned + * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where + * guid is stringified with PRIu64, and errno is one of the following as + * an int64_t: + * - ENODEV if the device was not found + * - EINVAL if the device is not a leaf or is not concrete (e.g.
missing) + * - EROFS if the device is not writeable + * - EBUSY start requested but the device is already being either + * initialized or trimmed + * - ESRCH cancel/suspend requested but device is not being initialized + * + * If the errlist is empty, then the return value will be: + * - EINVAL if one or more arguments were invalid + * - Other spa_open failures + * - 0 if the operation succeeded + */ +int +lzc_initialize(const char *poolname, pool_initialize_func_t cmd_type, + nvlist_t *vdevs, nvlist_t **errlist) +{ + int error; + + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_INITIALIZE_COMMAND, (uint64_t)cmd_type); + fnvlist_add_nvlist(args, ZPOOL_INITIALIZE_VDEVS, vdevs); + + error = lzc_ioctl(ZFS_IOC_POOL_INITIALIZE, poolname, args, errlist); + + fnvlist_free(args); + + return (error); +} + +/* + * Changes TRIM state. + * + * vdevs should be a list of (<key>, guid) where guid is a uint64 vdev GUID. + * The key is ignored. + * + * If there are errors related to vdev arguments, per-vdev errors are returned + * in an nvlist with the key "vdevs". Each error is a (guid, errno) pair where + * guid is stringified with PRIu64, and errno is one of the following as + * an int64_t: + * - ENODEV if the device was not found + * - EINVAL if the device is not a leaf or is not concrete (e.g. missing) + * - EROFS if the device is not writeable + * - EBUSY start requested but the device is already being either trimmed + * or initialized + * - ESRCH cancel/suspend requested but device is not being trimmed + * - EOPNOTSUPP if the device does not support TRIM (or secure TRIM) + * + * If the errlist is empty, then the return value will be: + * - EINVAL if one or more arguments were invalid + * - Other spa_open failures + * - 0 if the operation succeeded + */ +int +lzc_trim(const char *poolname, pool_trim_func_t cmd_type, uint64_t rate, + boolean_t secure, nvlist_t *vdevs, nvlist_t **errlist) +{ + int error; + + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_uint64(args, ZPOOL_TRIM_COMMAND, (uint64_t)cmd_type); + fnvlist_add_nvlist(args, ZPOOL_TRIM_VDEVS, vdevs); + fnvlist_add_uint64(args, ZPOOL_TRIM_RATE, rate); + fnvlist_add_boolean_value(args, ZPOOL_TRIM_SECURE, secure); + + error = lzc_ioctl(ZFS_IOC_POOL_TRIM, poolname, args, errlist); + + fnvlist_free(args); + + return (error); +} + +/* + * Create a redaction bookmark named bookname by redacting the snapshot with + * respect to all the snapshots in snapnv.
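(Editorial usage sketch, not part of the vendored source: starting a TRIM on a single leaf vdev with lzc_trim() above. POOL_TRIM_START is assumed to come from the pool_trim_func_t enum in sys/fs/zfs.h; the nvlist key is arbitrary since only the GUID value is used.)

    #include <libzfs_core.h>
    #include <libnvpair.h>
    #include <sys/fs/zfs.h>

    static int
    example_trim_start(const char *pool, uint64_t vdev_guid)
    {
        nvlist_t *vdevs = fnvlist_alloc();
        nvlist_t *errlist = NULL;
        int err;

        fnvlist_add_uint64(vdevs, "vdev", vdev_guid); /* key is ignored */

        /* rate 0 = unthrottled, B_FALSE = not a secure TRIM */
        err = lzc_trim(pool, POOL_TRIM_START, 0, B_FALSE, vdevs, &errlist);

        fnvlist_free(vdevs);
        nvlist_free(errlist);
        return (err);
    }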
+ */ +int +lzc_redact(const char *snapshot, const char *bookname, nvlist_t *snapnv) +{ + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "bookname", bookname); + fnvlist_add_nvlist(args, "snapnv", snapnv); + int error = lzc_ioctl(ZFS_IOC_REDACT, snapshot, args, NULL); + fnvlist_free(args); + return (error); +} + +static int +wait_common(const char *pool, zpool_wait_activity_t activity, boolean_t use_tag, + uint64_t tag, boolean_t *waited) +{ + nvlist_t *args = fnvlist_alloc(); + nvlist_t *result = NULL; + + fnvlist_add_int32(args, ZPOOL_WAIT_ACTIVITY, activity); + if (use_tag) + fnvlist_add_uint64(args, ZPOOL_WAIT_TAG, tag); + + int error = lzc_ioctl(ZFS_IOC_WAIT, pool, args, &result); + + if (error == 0 && waited != NULL) + *waited = fnvlist_lookup_boolean_value(result, + ZPOOL_WAIT_WAITED); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +int +lzc_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) +{ + return (wait_common(pool, activity, B_FALSE, 0, waited)); +} + +int +lzc_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, + boolean_t *waited) +{ + return (wait_common(pool, activity, B_TRUE, tag, waited)); +} + +int +lzc_wait_fs(const char *fs, zfs_wait_activity_t activity, boolean_t *waited) +{ + nvlist_t *args = fnvlist_alloc(); + nvlist_t *result = NULL; + + fnvlist_add_int32(args, ZFS_WAIT_ACTIVITY, activity); + + int error = lzc_ioctl(ZFS_IOC_WAIT_FS, fs, args, &result); + + if (error == 0 && waited != NULL) + *waited = fnvlist_lookup_boolean_value(result, + ZFS_WAIT_WAITED); + + fnvlist_free(args); + fnvlist_free(result); + + return (error); +} + +/* + * Set the bootenv contents for the given pool. + */ +int +lzc_set_bootenv(const char *pool, const char *env) +{ + nvlist_t *args = fnvlist_alloc(); + fnvlist_add_string(args, "envmap", env); + int error = lzc_ioctl(ZFS_IOC_SET_BOOTENV, pool, args, NULL); + fnvlist_free(args); + return (error); +} + +/* + * Get the contents of the bootenv of the given pool. 
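(Editorial usage sketch, not part of the vendored source: blocking until a scrub finishes with lzc_wait() above; ZPOOL_WAIT_SCRUB is one of the zpool_wait_activity_t values.)

    #include <libzfs_core.h>
    #include <sys/fs/zfs.h>

    static int
    example_wait_for_scrub(const char *pool)
    {
        boolean_t waited = B_FALSE;
        int err = lzc_wait(pool, ZPOOL_WAIT_SCRUB, &waited);

        /* waited == B_TRUE means a scrub was running and has completed */
        return (err);
    }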
+ */ +int +lzc_get_bootenv(const char *pool, nvlist_t **outnvl) +{ + return (lzc_ioctl(ZFS_IOC_GET_BOOTENV, pool, NULL, outnvl)); +} diff --git a/lib/libzfs_core/libzfs_core.pc.in b/lib/libzfs_core/libzfs_core.pc.in new file mode 100644 index 000000000000..e14d42d11a5d --- /dev/null +++ b/lib/libzfs_core/libzfs_core.pc.in @@ -0,0 +1,13 @@ +prefix=@prefix@ +exec_prefix=@exec_prefix@ +libdir=@libdir@ +includedir=@includedir@ + +Name: libzfs_core +Description: LibZFS core library +Version: @VERSION@ +URL: https://zfsonlinux.org +Requires.private: blkid uuid libtirpc zlib +Cflags: -I${includedir}/libzfs -I${includedir}/libspl +Libs: -L${libdir} -lzfs_core -lnvpair +Libs.private: @LIBCLOCK_GETTIME@ @LIBUDEV_LIBS@ -lm -pthread diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am new file mode 100644 index 000000000000..9c1b81bf5373 --- /dev/null +++ b/lib/libzpool/Makefile.am @@ -0,0 +1,230 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = \ + $(top_srcdir)/module/zfs \ + $(top_srcdir)/module/zcommon \ + $(top_srcdir)/module/lua \ + $(top_srcdir)/module/os/linux/zfs \ + $(top_srcdir)/lib/libzpool + +# Unconditionally enable debugging for libzpool +AM_CPPFLAGS += -DDEBUG -UNDEBUG -DZFS_DEBUG + +# Suppress unused but set variable warnings often due to ASSERTs +AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) + +# Includes kernel code generate warnings for large stack frames +AM_CFLAGS += $(FRAME_LARGER_THAN) + +AM_CFLAGS += $(ZLIB_CFLAGS) + +AM_CFLAGS += -DLIB_ZPOOL_BUILD + +lib_LTLIBRARIES = libzpool.la + +USER_C = \ + kernel.c \ + taskq.c \ + util.c + +KERNEL_C = \ + zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_aarch64_neon.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zfs_uio.c \ + zpool_prop.c \ + zprop_common.c \ + abd.c \ + abd_os.c \ + aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + bptree.c \ + btree.c \ + bqueue.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_bookmark.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_destroy.c \ + dsl_userhold.c \ + edonr_zfs.c \ + hkdf.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + trace.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_file.c \ + vdev_indirect_births.c \ + vdev_indirect.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math_aarch64_neon.c \ + vdev_raidz_math_aarch64_neonx2.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + vdev_raidz_math_powerpc_altivec.c \ + vdev_rebuild.c \ + vdev_removal.c \ + 
vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_sa.c \ + zfs_znode.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_crypt.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c + +LUA_C = \ + lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +dist_libzpool_la_SOURCES = \ + $(USER_C) + +nodist_libzpool_la_SOURCES = \ + $(KERNEL_C) \ + $(LUA_C) + +libzpool_la_LIBADD = \ + $(abs_top_builddir)/lib/libicp/libicp.la \ + $(abs_top_builddir)/lib/libunicode/libunicode.la \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la \ + $(abs_top_builddir)/lib/libzstd/libzstd.la + +libzpool_la_LIBADD += $(LIBCLOCK_GETTIME) $(ZLIB_LIBS) -ldl + +libzpool_la_LDFLAGS = -pthread + +if !ASAN_ENABLED +libzpool_la_LDFLAGS += -Wl,-z,defs +endif + +if BUILD_FREEBSD +libzpool_la_LIBADD += -lgeom +libzpool_la_LDFLAGS += -version-info 4:0:0 +else +libzpool_la_LDFLAGS += -version-info 2:0:0 +endif + +if TARGET_CPU_POWERPC +vdev_raidz_math_powerpc_altivec.$(OBJEXT): CFLAGS += -maltivec +vdev_raidz_math_powerpc_altivec.l$(OBJEXT): CFLAGS += -maltivec +endif diff --git a/lib/libzpool/kernel.c b/lib/libzpool/kernel.c new file mode 100644 index 000000000000..cba1d242b843 --- /dev/null +++ b/lib/libzpool/kernel.c @@ -0,0 +1,1410 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Emulation of kernel services in userland. + */ + +uint64_t physmem; +char hw_serial[HW_HOSTID_LEN]; +struct utsname hw_utsname; + +/* If set, all blocks read will be copied to the specified directory. 
*/ +char *vn_dumpdir = NULL; + +/* this only exists to have its address taken */ +struct proc p0; + +/* + * ========================================================================= + * threads + * ========================================================================= + * + * TS_STACK_MIN is dictated by the minimum allowed pthread stack size. While + * TS_STACK_MAX is somewhat arbitrary, it was selected to be large enough for + * the expected stack depth while small enough to avoid exhausting address + * space with high thread counts. + */ +#define TS_STACK_MIN MAX(PTHREAD_STACK_MIN, 32768) +#define TS_STACK_MAX (256 * 1024) + +/*ARGSUSED*/ +kthread_t * +zk_thread_create(void (*func)(void *), void *arg, size_t stksize, int state) +{ + pthread_attr_t attr; + pthread_t tid; + char *stkstr; + int detachstate = PTHREAD_CREATE_DETACHED; + + VERIFY0(pthread_attr_init(&attr)); + + if (state & TS_JOINABLE) + detachstate = PTHREAD_CREATE_JOINABLE; + + VERIFY0(pthread_attr_setdetachstate(&attr, detachstate)); + + /* + * We allow the default stack size in user space to be specified by + * setting the ZFS_STACK_SIZE environment variable. This allows us + * the convenience of observing and debugging stack overruns in + * user space. Explicitly specified stack sizes will be honored. + * The usage of ZFS_STACK_SIZE is discussed further in the + * ENVIRONMENT VARIABLES sections of the ztest(1) man page. + */ + if (stksize == 0) { + stkstr = getenv("ZFS_STACK_SIZE"); + + if (stkstr == NULL) + stksize = TS_STACK_MAX; + else + stksize = MAX(atoi(stkstr), TS_STACK_MIN); + } + + VERIFY3S(stksize, >, 0); + stksize = P2ROUNDUP(MAX(stksize, TS_STACK_MIN), PAGESIZE); + + /* + * If this ever fails, it may be because the stack size is not a + * multiple of system page size. 
+ */ + VERIFY0(pthread_attr_setstacksize(&attr, stksize)); + VERIFY0(pthread_attr_setguardsize(&attr, PAGESIZE)); + + VERIFY0(pthread_create(&tid, &attr, (void *(*)(void *))func, arg)); + VERIFY0(pthread_attr_destroy(&attr)); + + return ((void *)(uintptr_t)tid); +} + +/* + * ========================================================================= + * kstats + * ========================================================================= + */ +/*ARGSUSED*/ +kstat_t * +kstat_create(const char *module, int instance, const char *name, + const char *class, uchar_t type, ulong_t ndata, uchar_t ks_flag) +{ + return (NULL); +} + +/*ARGSUSED*/ +void +kstat_install(kstat_t *ksp) +{} + +/*ARGSUSED*/ +void +kstat_delete(kstat_t *ksp) +{} + +/*ARGSUSED*/ +void +kstat_waitq_enter(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_waitq_exit(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_runq_enter(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_runq_exit(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_waitq_to_runq(kstat_io_t *kiop) +{} + +/*ARGSUSED*/ +void +kstat_runq_back_to_waitq(kstat_io_t *kiop) +{} + +void +kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{} + +/* + * ========================================================================= + * mutexes + * ========================================================================= + */ + +void +mutex_init(kmutex_t *mp, char *name, int type, void *cookie) +{ + VERIFY0(pthread_mutex_init(&mp->m_lock, NULL)); + memset(&mp->m_owner, 0, sizeof (pthread_t)); +} + +void +mutex_destroy(kmutex_t *mp) +{ + VERIFY0(pthread_mutex_destroy(&mp->m_lock)); +} + +void +mutex_enter(kmutex_t *mp) +{ + VERIFY0(pthread_mutex_lock(&mp->m_lock)); + mp->m_owner = pthread_self(); +} + +int +mutex_tryenter(kmutex_t *mp) +{ + int error; + + error = pthread_mutex_trylock(&mp->m_lock); + if (error == 0) { + mp->m_owner = pthread_self(); + return (1); + } else { + VERIFY3S(error, ==, EBUSY); + return (0); + } +} + +void +mutex_exit(kmutex_t *mp) +{ + memset(&mp->m_owner, 0, sizeof (pthread_t)); + VERIFY0(pthread_mutex_unlock(&mp->m_lock)); +} + +/* + * ========================================================================= + * rwlocks + * ========================================================================= + */ + +void +rw_init(krwlock_t *rwlp, char *name, int type, void *arg) +{ + VERIFY0(pthread_rwlock_init(&rwlp->rw_lock, NULL)); + rwlp->rw_readers = 0; + rwlp->rw_owner = 0; +} + +void +rw_destroy(krwlock_t *rwlp) +{ + VERIFY0(pthread_rwlock_destroy(&rwlp->rw_lock)); +} + +void +rw_enter(krwlock_t *rwlp, krw_t rw) +{ + if (rw == RW_READER) { + VERIFY0(pthread_rwlock_rdlock(&rwlp->rw_lock)); + atomic_inc_uint(&rwlp->rw_readers); + } else { + VERIFY0(pthread_rwlock_wrlock(&rwlp->rw_lock)); + rwlp->rw_owner = pthread_self(); + } +} + +void +rw_exit(krwlock_t *rwlp) +{ + if (RW_READ_HELD(rwlp)) + atomic_dec_uint(&rwlp->rw_readers); + else + rwlp->rw_owner = 0; + + VERIFY0(pthread_rwlock_unlock(&rwlp->rw_lock)); +} + +int +rw_tryenter(krwlock_t *rwlp, krw_t rw) +{ + int error; + + if (rw == RW_READER) + error = pthread_rwlock_tryrdlock(&rwlp->rw_lock); + else + error = pthread_rwlock_trywrlock(&rwlp->rw_lock); + + if (error == 0) { + if (rw == RW_READER) + atomic_inc_uint(&rwlp->rw_readers); + else + rwlp->rw_owner = pthread_self(); + + return (1); + } + + VERIFY3S(error, ==, EBUSY); + + return (0); +} + +/* ARGSUSED */ +uint32_t 
+zone_get_hostid(void *zonep) +{ + /* + * We're emulating the system's hostid in userland. + */ + return (strtoul(hw_serial, NULL, 10)); +} + +int +rw_tryupgrade(krwlock_t *rwlp) +{ + return (0); +} + +/* + * ========================================================================= + * condition variables + * ========================================================================= + */ + +void +cv_init(kcondvar_t *cv, char *name, int type, void *arg) +{ + VERIFY0(pthread_cond_init(cv, NULL)); +} + +void +cv_destroy(kcondvar_t *cv) +{ + VERIFY0(pthread_cond_destroy(cv)); +} + +void +cv_wait(kcondvar_t *cv, kmutex_t *mp) +{ + memset(&mp->m_owner, 0, sizeof (pthread_t)); + VERIFY0(pthread_cond_wait(cv, &mp->m_lock)); + mp->m_owner = pthread_self(); +} + +int +cv_wait_sig(kcondvar_t *cv, kmutex_t *mp) +{ + cv_wait(cv, mp); + return (1); +} + +int +cv_timedwait(kcondvar_t *cv, kmutex_t *mp, clock_t abstime) +{ + int error; + struct timeval tv; + struct timespec ts; + clock_t delta; + + delta = abstime - ddi_get_lbolt(); + if (delta <= 0) + return (-1); + + VERIFY(gettimeofday(&tv, NULL) == 0); + + ts.tv_sec = tv.tv_sec + delta / hz; + ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % hz) * (NANOSEC / hz); + if (ts.tv_nsec >= NANOSEC) { + ts.tv_sec++; + ts.tv_nsec -= NANOSEC; + } + + memset(&mp->m_owner, 0, sizeof (pthread_t)); + error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); + mp->m_owner = pthread_self(); + + if (error == ETIMEDOUT) + return (-1); + + VERIFY0(error); + + return (1); +} + +/*ARGSUSED*/ +int +cv_timedwait_hires(kcondvar_t *cv, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + int error; + struct timeval tv; + struct timespec ts; + hrtime_t delta; + + ASSERT(flag == 0 || flag == CALLOUT_FLAG_ABSOLUTE); + + delta = tim; + if (flag & CALLOUT_FLAG_ABSOLUTE) + delta -= gethrtime(); + + if (delta <= 0) + return (-1); + + VERIFY0(gettimeofday(&tv, NULL)); + + ts.tv_sec = tv.tv_sec + delta / NANOSEC; + ts.tv_nsec = tv.tv_usec * NSEC_PER_USEC + (delta % NANOSEC); + if (ts.tv_nsec >= NANOSEC) { + ts.tv_sec++; + ts.tv_nsec -= NANOSEC; + } + + memset(&mp->m_owner, 0, sizeof (pthread_t)); + error = pthread_cond_timedwait(cv, &mp->m_lock, &ts); + mp->m_owner = pthread_self(); + + if (error == ETIMEDOUT) + return (-1); + + VERIFY0(error); + + return (1); +} + +void +cv_signal(kcondvar_t *cv) +{ + VERIFY0(pthread_cond_signal(cv)); +} + +void +cv_broadcast(kcondvar_t *cv) +{ + VERIFY0(pthread_cond_broadcast(cv)); +} + +/* + * ========================================================================= + * procfs list + * ========================================================================= + */ + +void +seq_printf(struct seq_file *m, const char *fmt, ...) 
+{} + +void +procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} + +/* + * ========================================================================= + * vnode operations + * ========================================================================= + */ + +/* + * ========================================================================= + * Figure out which debugging statements to print + * ========================================================================= + */ + +static char *dprintf_string; +static int dprintf_print_all; + +int +dprintf_find_string(const char *string) +{ + char *tmp_str = dprintf_string; + int len = strlen(string); + + /* + * Find out if this is a string we want to print. + * String format: file1.c,function_name1,file2.c,file3.c + */ + + while (tmp_str != NULL) { + if (strncmp(tmp_str, string, len) == 0 && + (tmp_str[len] == ',' || tmp_str[len] == '\0')) + return (1); + tmp_str = strchr(tmp_str, ','); + if (tmp_str != NULL) + tmp_str++; /* Get rid of , */ + } + return (0); +} + +void +dprintf_setup(int *argc, char **argv) +{ + int i, j; + + /* + * Debugging can be specified two ways: by setting the + * environment variable ZFS_DEBUG, or by including a + * "debug=..." argument on the command line. The command + * line setting overrides the environment variable. + */ + + for (i = 1; i < *argc; i++) { + int len = strlen("debug="); + /* First look for a command line argument */ + if (strncmp("debug=", argv[i], len) == 0) { + dprintf_string = argv[i] + len; + /* Remove from args */ + for (j = i; j < *argc; j++) + argv[j] = argv[j+1]; + argv[j] = NULL; + (*argc)--; + } + } + + if (dprintf_string == NULL) { + /* Look for ZFS_DEBUG environment variable */ + dprintf_string = getenv("ZFS_DEBUG"); + } + + /* + * Are we just turning on all debugging? + */ + if (dprintf_find_string("on")) + dprintf_print_all = 1; + + if (dprintf_string != NULL) + zfs_flags |= ZFS_DEBUG_DPRINTF; +} + +/* + * ========================================================================= + * debug printfs + * ========================================================================= + */ +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + + /* + * Get rid of annoying "../common/" prefix to filename. 
+ */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + if (dprint) { + /* dprintf messages are printed immediately */ + + if (!dprintf_print_all && + !dprintf_find_string(newfile) && + !dprintf_find_string(func)) + return; + + /* Print out just the function name if requested */ + flockfile(stdout); + if (dprintf_find_string("pid")) + (void) printf("%d ", getpid()); + if (dprintf_find_string("tid")) + (void) printf("%ju ", + (uintmax_t)(uintptr_t)pthread_self()); + if (dprintf_find_string("cpu")) + (void) printf("%u ", getcpuid()); + if (dprintf_find_string("time")) + (void) printf("%llu ", gethrtime()); + if (dprintf_find_string("long")) + (void) printf("%s, line %d: ", newfile, line); + (void) printf("dprintf: %s: ", func); + va_start(adx, fmt); + (void) vprintf(fmt, adx); + va_end(adx); + funlockfile(stdout); + } else { + /* zfs_dbgmsg is logged for dumping later */ + size_t size; + char *buf; + int i; + + size = 1024; + buf = umem_alloc(size, UMEM_NOFAIL); + i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); + + if (i < size) { + va_start(adx, fmt); + (void) vsnprintf(buf + i, size - i, fmt, adx); + va_end(adx); + } + + __zfs_dbgmsg(buf); + + umem_free(buf, size); + } +} + +/* + * ========================================================================= + * cmn_err() and panic() + * ========================================================================= + */ +static char ce_prefix[CE_IGNORE][10] = { "", "NOTICE: ", "WARNING: ", "" }; +static char ce_suffix[CE_IGNORE][2] = { "", "\n", "\n", "" }; + +void +vpanic(const char *fmt, va_list adx) +{ + (void) fprintf(stderr, "error: "); + (void) vfprintf(stderr, fmt, adx); + (void) fprintf(stderr, "\n"); + + abort(); /* think of it as a "user-level crash dump" */ +} + +void +panic(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vpanic(fmt, adx); + va_end(adx); +} + +void +vcmn_err(int ce, const char *fmt, va_list adx) +{ + if (ce == CE_PANIC) + vpanic(fmt, adx); + if (ce != CE_NOTE) { /* suppress noise in userland stress testing */ + (void) fprintf(stderr, "%s", ce_prefix[ce]); + (void) vfprintf(stderr, fmt, adx); + (void) fprintf(stderr, "%s", ce_suffix[ce]); + } +} + +/*PRINTFLIKE2*/ +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(ce, fmt, adx); + va_end(adx); +} + +/* + * ========================================================================= + * misc routines + * ========================================================================= + */ + +void +delay(clock_t ticks) +{ + (void) poll(0, 0, ticks * (1000 / hz)); +} + +/* + * Find highest one bit set. + * Returns bit number + 1 of highest bit that is set, otherwise returns 0. + * The __builtin_clzll() function is supported by both GCC and Clang. + */ +int +highbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (NBBY * sizeof (uint64_t) - __builtin_clzll(i)); +} + +/* + * Find lowest one bit set. + * Returns bit number + 1 of lowest bit that is set, otherwise returns 0. + * The __builtin_ffsll() function is supported by both GCC and Clang. 
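(Editorial note, not part of the vendored source: a few worked values for highbit64() above and lowbit64() defined just below, using the ASSERT macros available in this codebase.)

    static void
    example_bit_helpers(void)
    {
        ASSERT3S(highbit64(0x50), ==, 7); /* 0b1010000: highest set bit is 6, so 6+1 */
        ASSERT3S(lowbit64(0x50), ==, 5);  /* lowest set bit is 4, so 4+1 */
        ASSERT0(highbit64(0));            /* no bits set */
        ASSERT0(lowbit64(0));
    }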
+ */ +int +lowbit64(uint64_t i) +{ + if (i == 0) + return (0); + + return (__builtin_ffsll(i)); +} + +char *random_path = "/dev/random"; +char *urandom_path = "/dev/urandom"; +static int random_fd = -1, urandom_fd = -1; + +void +random_init(void) +{ + VERIFY((random_fd = open(random_path, O_RDONLY)) != -1); + VERIFY((urandom_fd = open(urandom_path, O_RDONLY)) != -1); +} + +void +random_fini(void) +{ + close(random_fd); + close(urandom_fd); + + random_fd = -1; + urandom_fd = -1; +} + +static int +random_get_bytes_common(uint8_t *ptr, size_t len, int fd) +{ + size_t resid = len; + ssize_t bytes; + + ASSERT(fd != -1); + + while (resid != 0) { + bytes = read(fd, ptr, resid); + ASSERT3S(bytes, >=, 0); + ptr += bytes; + resid -= bytes; + } + + return (0); +} + +int +random_get_bytes(uint8_t *ptr, size_t len) +{ + return (random_get_bytes_common(ptr, len, random_fd)); +} + +int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + return (random_get_bytes_common(ptr, len, urandom_fd)); +} + +int +ddi_strtoul(const char *hw_serial, char **nptr, int base, unsigned long *result) +{ + char *end; + + *result = strtoul(hw_serial, &end, base); + if (*result == 0) + return (errno); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, u_longlong_t *result) +{ + char *end; + + *result = strtoull(str, &end, base); + if (*result == 0) + return (errno); + return (0); +} + +utsname_t * +utsname(void) +{ + return (&hw_utsname); +} + +/* + * ========================================================================= + * kernel emulation setup & teardown + * ========================================================================= + */ +static int +umem_out_of_memory(void) +{ + char errmsg[] = "out of memory -- generating core dump\n"; + + (void) fprintf(stderr, "%s", errmsg); + abort(); + return (0); +} + +void +kernel_init(int mode) +{ + extern uint_t rrw_tsd_key; + + umem_nofail_callback(umem_out_of_memory); + + physmem = sysconf(_SC_PHYS_PAGES); + + dprintf("physmem = %llu pages (%.2f GB)\n", physmem, + (double)physmem * sysconf(_SC_PAGE_SIZE) / (1ULL << 30)); + + (void) snprintf(hw_serial, sizeof (hw_serial), "%ld", + (mode & SPA_MODE_WRITE) ? 
get_system_hostid() : 0); + + random_init(); + + VERIFY0(uname(&hw_utsname)); + + system_taskq_init(); + icp_init(); + + zstd_init(); + + spa_init((spa_mode_t)mode); + + fletcher_4_init(); + + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); +} + +void +kernel_fini(void) +{ + fletcher_4_fini(); + spa_fini(); + + zstd_fini(); + + icp_fini(); + system_taskq_fini(); + + random_fini(); +} + +uid_t +crgetuid(cred_t *cr) +{ + return (0); +} + +uid_t +crgetruid(cred_t *cr) +{ + return (0); +} + +gid_t +crgetgid(cred_t *cr) +{ + return (0); +} + +int +crgetngroups(cred_t *cr) +{ + return (0); +} + +gid_t * +crgetgroups(cred_t *cr) +{ + return (NULL); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + return (0); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + return (0); +} + +int +secpolicy_zfs(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) +{ + return (0); +} + +ksiddomain_t * +ksid_lookupdomain(const char *dom) +{ + ksiddomain_t *kd; + + kd = umem_zalloc(sizeof (ksiddomain_t), UMEM_NOFAIL); + kd->kd_name = spa_strdup(dom); + return (kd); +} + +void +ksiddomain_rele(ksiddomain_t *ksid) +{ + spa_strfree(ksid->kd_name); + umem_free(ksid, sizeof (ksiddomain_t)); +} + +char * +kmem_vasprintf(const char *fmt, va_list adx) +{ + char *buf = NULL; + va_list adx_copy; + + va_copy(adx_copy, adx); + VERIFY(vasprintf(&buf, fmt, adx_copy) != -1); + va_end(adx_copy); + + return (buf); +} + +char * +kmem_asprintf(const char *fmt, ...) +{ + char *buf = NULL; + va_list adx; + + va_start(adx, fmt); + VERIFY(vasprintf(&buf, fmt, adx) != -1); + va_end(adx); + + return (buf); +} + +/* ARGSUSED */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + *minorp = 0; + return (0); +} + +/* ARGSUSED */ +void +zfs_onexit_fd_rele(int fd) +{ +} + +/* ARGSUSED */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + return (0); +} + +fstrans_cookie_t +spl_fstrans_mark(void) +{ + return ((fstrans_cookie_t)0); +} + +void +spl_fstrans_unmark(fstrans_cookie_t cookie) +{ +} + +int +__spl_pf_fstrans_check(void) +{ + return (0); +} + +int +kmem_cache_reap_active(void) +{ + return (0); +} + +void *zvol_tag = "zvol_tag"; + +void +zvol_create_minor(const char *name) +{ +} + +void +zvol_create_minors_recursive(const char *name) +{ +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ +} + +void +zvol_rename_minors(spa_t *spa, const char *oldname, const char *newname, + boolean_t async) +{ +} + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. 
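(Editorial usage sketch, not part of the vendored source: a minimal round trip through the zfs_file_open()/zfs_file_write()/zfs_file_close() emulation defined below; the path is hypothetical.)

    #include <fcntl.h>
    #include <sys/zfs_file.h>

    static int
    example_file_write(const char *path)
    {
        zfs_file_t *fp = NULL;
        char buf[] = "hello";
        ssize_t resid = 0;
        int err;

        err = zfs_file_open(path, O_RDWR | O_CREAT, 0644, &fp);
        if (err != 0)
            return (err);

        err = zfs_file_write(fp, buf, sizeof (buf), &resid);
        zfs_file_close(fp);
        return (err);
    }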
+ */ +int +zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) +{ + int fd = -1; + int dump_fd = -1; + int err; + int old_umask = 0; + zfs_file_t *fp; + struct stat64 st; + + if (!(flags & O_CREAT) && stat64(path, &st) == -1) + return (errno); + + if (!(flags & O_CREAT) && S_ISBLK(st.st_mode)) + flags |= O_DIRECT; + + if (flags & O_CREAT) + old_umask = umask(0); + + fd = open64(path, flags, mode); + if (fd == -1) + return (errno); + + if (flags & O_CREAT) + (void) umask(old_umask); + + if (vn_dumpdir != NULL) { + char *dumppath = umem_zalloc(MAXPATHLEN, UMEM_NOFAIL); + char *inpath = basename((char *)(uintptr_t)path); + + (void) snprintf(dumppath, MAXPATHLEN, + "%s/%s", vn_dumpdir, inpath); + dump_fd = open64(dumppath, O_CREAT | O_WRONLY, 0666); + umem_free(dumppath, MAXPATHLEN); + if (dump_fd == -1) { + err = errno; + close(fd); + return (err); + } + } else { + dump_fd = -1; + } + + (void) fcntl(fd, F_SETFD, FD_CLOEXEC); + + fp = umem_zalloc(sizeof (zfs_file_t), UMEM_NOFAIL); + fp->f_fd = fd; + fp->f_dump_fd = dump_fd; + *fpp = fp; + + return (0); +} + +void +zfs_file_close(zfs_file_t *fp) +{ + close(fp->f_fd); + if (fp->f_dump_fd != -1) + close(fp->f_dump_fd); + + umem_free(fp, sizeof (zfs_file_t)); +} + +/* + * Stateful write - use os internal file pointer to determine where to + * write and update on successful completion. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * resid - pointer to count of unwritten bytes (if short write) + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid) +{ + ssize_t rc; + + rc = write(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless write - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to write to + * buf - buffer to write + * count - # of bytes to write + * off - file offset to write to (only valid for seekable types) + * resid - pointer to count of unwritten bytes + * + * Returns 0 on success errno on failure. + */ +int +zfs_file_pwrite(zfs_file_t *fp, const void *buf, + size_t count, loff_t pos, ssize_t *resid) +{ + ssize_t rc, split, done; + int sectors; + + /* + * To simulate partial disk writes, we split writes into two + * system calls so that the process can be killed in between. + * This is used by ztest to simulate realistic failure modes. + */ + sectors = count >> SPA_MINBLOCKSHIFT; + split = (sectors > 0 ? rand() % sectors : 0) << SPA_MINBLOCKSHIFT; + rc = pwrite64(fp->f_fd, buf, split, pos); + if (rc != -1) { + done = rc; + rc = pwrite64(fp->f_fd, (char *)buf + split, + count - split, pos + split); + } +#ifdef __linux__ + if (rc == -1 && errno == EINVAL) { + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order + * to catch the offender. + */ + abort(); + } +#endif + + if (rc < 0) + return (errno); + + done += rc; + + if (resid) { + *resid = count - done; + } else if (done != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateful read - use os internal file pointer to determine where to + * read and update on successful completion. 
+ * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to read into + * count - # of bytes to read + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success, errno on failure. + */ +int +zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid) +{ + int rc; + + rc = read(fp->f_fd, buf, count); + if (rc < 0) + return (errno); + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * Stateless read - os internal file pointer is not updated. + * + * fp - pointer to file (pipe, socket, etc) to read from + * buf - buffer to read into + * count - # of bytes to read + * off - file offset to read from (only valid for seekable types) + * resid - pointer to count of unread bytes (if short read) + * + * Returns 0 on success, errno on failure. + */ +int +zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off, + ssize_t *resid) +{ + ssize_t rc; + + rc = pread64(fp->f_fd, buf, count, off); + if (rc < 0) { +#ifdef __linux__ + /* + * Under Linux, this most likely means an alignment issue + * (memory or disk) due to O_DIRECT, so we abort() in order to + * catch the offender. + */ + if (errno == EINVAL) + abort(); +#endif + return (errno); + } + + if (fp->f_dump_fd != -1) { + int status; + + status = pwrite64(fp->f_dump_fd, buf, rc, off); + ASSERT(status != -1); + } + + if (resid) { + *resid = count - rc; + } else if (rc != count) { + return (EIO); + } + + return (0); +} + +/* + * lseek - set / get file pointer + * + * fp - pointer to file (pipe, socket, etc) to read from + * offp - value to seek to, returns current value plus passed offset + * whence - see man pages for standard lseek whence values + * + * Returns 0 on success, errno on failure (ESPIPE for non-seekable types) + */ +int +zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence) +{ + loff_t rc; + + rc = lseek(fp->f_fd, *offp, whence); + if (rc < 0) + return (errno); + + *offp = rc; + + return (0); +} + +/* + * Get file attributes + * + * filp - file pointer + * zfattr - pointer to file attr structure + * + * Currently only used for fetching size and file mode + * + * Returns 0 on success or error code of underlying getattr call on failure. + */ +int +zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr) +{ + struct stat64 st; + + if (fstat64_blk(fp->f_fd, &st) == -1) + return (errno); + + zfattr->zfa_size = st.st_size; + zfattr->zfa_mode = st.st_mode; + + return (0); +} + +/* + * Sync file to disk + * + * filp - file pointer + * flags - O_SYNC and/or O_DSYNC + * + * Returns 0 on success or error code of underlying sync call on failure. + */ +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + int rc; + + rc = fsync(fp->f_fd); + if (rc < 0) + return (errno); + + return (0); +} + +/* + * fallocate - allocate or free space on disk + * + * fp - file pointer + * mode - (non-standard options for hole punching etc) + * offset - offset to start allocating or freeing from + * len - length to free / allocate + * + * OPTIONAL + */ +int +zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len) +{ +#ifdef __linux__ + return (fallocate(fp->f_fd, mode, offset, len)); +#else + return (EOPNOTSUPP); +#endif +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + /* lseek(2) takes (fd, offset, whence); seek by 0 from SEEK_CUR */ + return (lseek(fp->f_fd, 0, SEEK_CUR)); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success.
+ * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (remove(path)); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * fpp - pointer to file pointer + * + * Returns 0 on success EBADF on failure. + * Unsupported in user space. + */ +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + abort(); + + return (EOPNOTSUPP); +} + +/* + * Drop reference to file pointer + * + * fd - input file descriptor + * + * Unsupported in user space. + */ +void +zfs_file_put(int fd) +{ + abort(); +} diff --git a/lib/libzpool/taskq.c b/lib/libzpool/taskq.c new file mode 100644 index 000000000000..456080f7f247 --- /dev/null +++ b/lib/libzpool/taskq.c @@ -0,0 +1,380 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright 2012 Garrett D'Amore . All rights reserved. + * Copyright (c) 2014 by Delphix. All rights reserved. + */ + +#include + +int taskq_now; +taskq_t *system_taskq; +taskq_t *system_delay_taskq; + +static pthread_key_t taskq_tsd; + +#define TASKQ_ACTIVE 0x00010000 + +static taskq_ent_t * +task_alloc(taskq_t *tq, int tqflags) +{ + taskq_ent_t *t; + int rv; + +again: if ((t = tq->tq_freelist) != NULL && tq->tq_nalloc >= tq->tq_minalloc) { + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + tq->tq_freelist = t->tqent_next; + } else { + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (!(tqflags & KM_SLEEP)) + return (NULL); + + /* + * We don't want to exceed tq_maxalloc, but we can't + * wait for other tasks to complete (and thus free up + * task structures) without risking deadlock with + * the caller. So, we just delay for one second + * to throttle the allocation rate. If we have tasks + * complete before one second timeout expires then + * taskq_ent_free will signal us and we will + * immediately retry the allocation. 
+ */ + tq->tq_maxalloc_wait++; + rv = cv_timedwait(&tq->tq_maxalloc_cv, + &tq->tq_lock, ddi_get_lbolt() + hz); + tq->tq_maxalloc_wait--; + if (rv > 0) + goto again; /* signaled */ + } + mutex_exit(&tq->tq_lock); + + t = kmem_alloc(sizeof (taskq_ent_t), tqflags); + + mutex_enter(&tq->tq_lock); + if (t != NULL) { + /* Make sure we start without any flags */ + t->tqent_flags = 0; + tq->tq_nalloc++; + } + } + return (t); +} + +static void +task_free(taskq_t *tq, taskq_ent_t *t) +{ + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->tqent_next = tq->tq_freelist; + tq->tq_freelist = t; + } else { + tq->tq_nalloc--; + mutex_exit(&tq->tq_lock); + kmem_free(t, sizeof (taskq_ent_t)); + mutex_enter(&tq->tq_lock); + } + + if (tq->tq_maxalloc_wait) + cv_signal(&tq->tq_maxalloc_cv); +} + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) +{ + taskq_ent_t *t; + + if (taskq_now) { + func(arg); + return (1); + } + + mutex_enter(&tq->tq_lock); + ASSERT(tq->tq_flags & TASKQ_ACTIVE); + if ((t = task_alloc(tq, tqflags)) == NULL) { + mutex_exit(&tq->tq_lock); + return (0); + } + if (tqflags & TQ_FRONT) { + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; + } else { + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; + } + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_flags = 0; + cv_signal(&tq->tq_dispatch_cv); + mutex_exit(&tq->tq_lock); + return (1); +} + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags, + clock_t expire_time) +{ + return (0); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (t->tqent_next == NULL); +} + +void +taskq_init_ent(taskq_ent_t *t) +{ + t->tqent_next = NULL; + t->tqent_prev = NULL; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + ASSERT(func != NULL); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + /* + * Enqueue the task to the underlying queue. 
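(Editorial usage sketch, not part of the vendored source: dispatching one task through this userland taskq emulation. taskq_create() and taskq_destroy() are defined later in this file; TQ_SLEEP and maxclsyspri are assumed to come from the libzpool headers.)

    static void
    example_task(void *arg)
    {
        (*(int *)arg)++; /* runs on a taskq worker thread */
    }

    static void
    example_taskq_use(void)
    {
        int n = 0;
        taskq_t *tq = taskq_create("example", 2, maxclsyspri,
            2, 8, TASKQ_PREPOPULATE);

        (void) taskq_dispatch(tq, example_task, &n, TQ_SLEEP);
        taskq_wait(tq); /* n == 1 once the queue drains */
        taskq_destroy(tq);
    }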
+ */ + mutex_enter(&tq->tq_lock); + + if (flags & TQ_FRONT) { + t->tqent_next = tq->tq_task.tqent_next; + t->tqent_prev = &tq->tq_task; + } else { + t->tqent_next = &tq->tq_task; + t->tqent_prev = tq->tq_task.tqent_prev; + } + t->tqent_next->tqent_prev = t; + t->tqent_prev->tqent_next = t; + t->tqent_func = func; + t->tqent_arg = arg; + cv_signal(&tq->tq_dispatch_cv); + mutex_exit(&tq->tq_lock); +} + +void +taskq_wait(taskq_t *tq) +{ + mutex_enter(&tq->tq_lock); + while (tq->tq_task.tqent_next != &tq->tq_task || tq->tq_active != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + mutex_exit(&tq->tq_lock); +} + +void +taskq_wait_id(taskq_t *tq, taskqid_t id) +{ + taskq_wait(tq); +} + +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id) +{ + taskq_wait(tq); +} + +static void +taskq_thread(void *arg) +{ + taskq_t *tq = arg; + taskq_ent_t *t; + boolean_t prealloc; + + VERIFY0(pthread_setspecific(taskq_tsd, tq)); + + mutex_enter(&tq->tq_lock); + while (tq->tq_flags & TASKQ_ACTIVE) { + if ((t = tq->tq_task.tqent_next) == &tq->tq_task) { + if (--tq->tq_active == 0) + cv_broadcast(&tq->tq_wait_cv); + cv_wait(&tq->tq_dispatch_cv, &tq->tq_lock); + tq->tq_active++; + continue; + } + t->tqent_prev->tqent_next = t->tqent_next; + t->tqent_next->tqent_prev = t->tqent_prev; + t->tqent_next = NULL; + t->tqent_prev = NULL; + prealloc = t->tqent_flags & TQENT_FLAG_PREALLOC; + mutex_exit(&tq->tq_lock); + + rw_enter(&tq->tq_threadlock, RW_READER); + t->tqent_func(t->tqent_arg); + rw_exit(&tq->tq_threadlock); + + mutex_enter(&tq->tq_lock); + if (!prealloc) + task_free(tq, t); + } + tq->tq_nthreads--; + cv_broadcast(&tq->tq_wait_cv); + mutex_exit(&tq->tq_lock); + thread_exit(); +} + +/*ARGSUSED*/ +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); + int t; + + if (flags & TASKQ_THREADS_CPU_PCT) { + int pct; + ASSERT3S(nthreads, >=, 0); + ASSERT3S(nthreads, <=, 100); + pct = MIN(nthreads, 100); + pct = MAX(pct, 0); + + nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; + nthreads = MAX(nthreads, 1); /* need at least 1 thread */ + } else { + ASSERT3S(nthreads, >=, 1); + } + + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); + mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_wait_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tq->tq_maxalloc_cv, NULL, CV_DEFAULT, NULL); + (void) strncpy(tq->tq_name, name, TASKQ_NAMELEN); + tq->tq_flags = flags | TASKQ_ACTIVE; + tq->tq_active = nthreads; + tq->tq_nthreads = nthreads; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_task.tqent_next = &tq->tq_task; + tq->tq_task.tqent_prev = &tq->tq_task; + tq->tq_threadlist = kmem_alloc(nthreads * sizeof (kthread_t *), + KM_SLEEP); + + if (flags & TASKQ_PREPOPULATE) { + mutex_enter(&tq->tq_lock); + while (minalloc-- > 0) + task_free(tq, task_alloc(tq, KM_SLEEP)); + mutex_exit(&tq->tq_lock); + } + + for (t = 0; t < nthreads; t++) + VERIFY((tq->tq_threadlist[t] = thread_create(NULL, 0, + taskq_thread, tq, 0, &p0, TS_RUN, pri)) != NULL); + + return (tq); +} + +void +taskq_destroy(taskq_t *tq) +{ + int nthreads = tq->tq_nthreads; + + taskq_wait(tq); + + mutex_enter(&tq->tq_lock); + + tq->tq_flags &= ~TASKQ_ACTIVE; + cv_broadcast(&tq->tq_dispatch_cv); + + while (tq->tq_nthreads != 0) + cv_wait(&tq->tq_wait_cv, &tq->tq_lock); + + tq->tq_minalloc = 0; + while (tq->tq_nalloc != 0) { + ASSERT(tq->tq_freelist != NULL); + 
task_free(tq, task_alloc(tq, KM_SLEEP)); + } + + mutex_exit(&tq->tq_lock); + + kmem_free(tq->tq_threadlist, nthreads * sizeof (kthread_t *)); + + rw_destroy(&tq->tq_threadlock); + mutex_destroy(&tq->tq_lock); + cv_destroy(&tq->tq_dispatch_cv); + cv_destroy(&tq->tq_wait_cv); + cv_destroy(&tq->tq_maxalloc_cv); + + kmem_free(tq, sizeof (taskq_t)); +} + +int +taskq_member(taskq_t *tq, kthread_t *t) +{ + int i; + + if (taskq_now) + return (1); + + for (i = 0; i < tq->tq_nthreads; i++) + if (tq->tq_threadlist[i] == t) + return (1); + + return (0); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (pthread_getspecific(taskq_tsd)); +} + +int +taskq_cancel_id(taskq_t *tq, taskqid_t id) +{ + return (ENOENT); +} + +void +system_taskq_init(void) +{ + VERIFY0(pthread_key_create(&taskq_tsd, NULL)); + system_taskq = taskq_create("system_taskq", 64, maxclsyspri, 4, 512, + TASKQ_DYNAMIC | TASKQ_PREPOPULATE); + system_delay_taskq = taskq_create("delay_taskq", 4, maxclsyspri, 4, + 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); +} + +void +system_taskq_fini(void) +{ + taskq_destroy(system_taskq); + system_taskq = NULL; /* defensive */ + taskq_destroy(system_delay_taskq); + system_delay_taskq = NULL; + VERIFY0(pthread_key_delete(taskq_tsd)); +} diff --git a/lib/libzpool/util.c b/lib/libzpool/util.c new file mode 100644 index 000000000000..ebfaa9b41a2a --- /dev/null +++ b/lib/libzpool/util.c @@ -0,0 +1,257 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright 2017 Jason King + * Copyright (c) 2017, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Routines needed by more than one client of libzpool. 
+ */ + +static void +show_vdev_stats(const char *desc, const char *ctype, nvlist_t *nv, int indent) +{ + vdev_stat_t *vs; + vdev_stat_t *v0 = NULL; + uint64_t sec; + uint64_t is_log = 0; + nvlist_t **child; + uint_t c, children; + char used[6], avail[6]; + char rops[6], wops[6], rbytes[6], wbytes[6], rerr[6], werr[6], cerr[6]; + + v0 = umem_zalloc(sizeof (*v0), UMEM_NOFAIL); + + if (indent == 0 && desc != NULL) { + (void) printf(" " + " capacity operations bandwidth ---- errors ----\n"); + (void) printf("description " + "used avail read write read write read write cksum\n"); + } + + if (desc != NULL) { + char *suffix = "", *bias = NULL; + char bias_suffix[32]; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &is_log); + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias); + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &c) != 0) + vs = v0; + + if (bias != NULL) { + (void) snprintf(bias_suffix, sizeof (bias_suffix), + " (%s)", bias); + suffix = bias_suffix; + } else if (is_log) { + suffix = " (log)"; + } + + sec = MAX(1, vs->vs_timestamp / NANOSEC); + + nicenum(vs->vs_alloc, used, sizeof (used)); + nicenum(vs->vs_space - vs->vs_alloc, avail, sizeof (avail)); + nicenum(vs->vs_ops[ZIO_TYPE_READ] / sec, rops, sizeof (rops)); + nicenum(vs->vs_ops[ZIO_TYPE_WRITE] / sec, wops, sizeof (wops)); + nicenum(vs->vs_bytes[ZIO_TYPE_READ] / sec, rbytes, + sizeof (rbytes)); + nicenum(vs->vs_bytes[ZIO_TYPE_WRITE] / sec, wbytes, + sizeof (wbytes)); + nicenum(vs->vs_read_errors, rerr, sizeof (rerr)); + nicenum(vs->vs_write_errors, werr, sizeof (werr)); + nicenum(vs->vs_checksum_errors, cerr, sizeof (cerr)); + + (void) printf("%*s%s%*s%*s%*s %5s %5s %5s %5s %5s %5s %5s\n", + indent, "", + desc, + (int)(indent+strlen(desc)-25-(vs->vs_space ? 0 : 12)), + suffix, + vs->vs_space ? 6 : 0, vs->vs_space ? used : "", + vs->vs_space ? 6 : 0, vs->vs_space ? avail : "", + rops, wops, rbytes, wbytes, rerr, werr, cerr); + } + umem_free(v0, sizeof (*v0)); + + if (nvlist_lookup_nvlist_array(nv, ctype, &child, &children) != 0) + return; + + for (c = 0; c < children; c++) { + nvlist_t *cnv = child[c]; + char *cname = NULL, *tname; + uint64_t np; + int len; + if (nvlist_lookup_string(cnv, ZPOOL_CONFIG_PATH, &cname) && + nvlist_lookup_string(cnv, ZPOOL_CONFIG_TYPE, &cname)) + cname = ""; + len = strlen(cname) + 2; + tname = umem_zalloc(len, UMEM_NOFAIL); + (void) strlcpy(tname, cname, len); + if (nvlist_lookup_uint64(cnv, ZPOOL_CONFIG_NPARITY, &np) == 0) + tname[strlen(tname)] = '0' + np; + show_vdev_stats(tname, ctype, cnv, indent + 2); + umem_free(tname, len); + } +} + +void +show_pool_stats(spa_t *spa) +{ + nvlist_t *config, *nvroot; + char *name; + + VERIFY(spa_get_stats(spa_name(spa), &config, NULL, 0) == 0); + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + + show_vdev_stats(name, ZPOOL_CONFIG_CHILDREN, nvroot, 0); + show_vdev_stats(NULL, ZPOOL_CONFIG_L2CACHE, nvroot, 0); + show_vdev_stats(NULL, ZPOOL_CONFIG_SPARES, nvroot, 0); + + nvlist_free(config); +} + +/* + * Sets the given global variable in libzpool to the given unsigned 32-bit + * value. + * arg: "<variable>=<value>" + */ +int +set_global_var(char *arg) +{ + void *zpoolhdl; + char *varname = arg, *varval; + u_longlong_t val; + +#ifndef _LITTLE_ENDIAN + /* + * On big endian systems changing a 64-bit variable would set the high + * 32 bits instead of the low 32 bits, which could cause unexpected + * results.
+ */ + fprintf(stderr, "Setting global variables is only supported on " + "little-endian systems\n"); + return (ENOTSUP); +#endif + if (arg != NULL && (varval = strchr(arg, '=')) != NULL) { + *varval = '\0'; + varval++; + val = strtoull(varval, NULL, 0); + if (val > UINT32_MAX) { + fprintf(stderr, "Value for global variable '%s' must " + "be a 32-bit unsigned integer\n", varname); + return (EOVERFLOW); + } + } else { + return (EINVAL); + } + + zpoolhdl = dlopen("libzpool.so", RTLD_LAZY); + if (zpoolhdl != NULL) { + uint32_t *var; + var = dlsym(zpoolhdl, varname); + if (var == NULL) { + fprintf(stderr, "Global variable '%s' does not exist " + "in libzpool.so\n", varname); + return (EINVAL); + } + *var = (uint32_t)val; + + dlclose(zpoolhdl); + } else { + fprintf(stderr, "Failed to open libzpool.so to set global " + "variable\n"); + return (EIO); + } + + return (0); +} + +static nvlist_t * +refresh_config(void *unused, nvlist_t *tryconfig) +{ + return (spa_tryimport(tryconfig)); +} + +static int +pool_active(void *unused, const char *name, uint64_t guid, + boolean_t *isactive) +{ + zfs_cmd_t *zcp; + nvlist_t *innvl; + char *packed = NULL; + size_t size = 0; + int fd, ret; + + /* + * Use ZFS_IOC_POOL_SYNC to confirm if a pool is active + */ + + fd = open(ZFS_DEV, O_RDWR); + if (fd < 0) + return (-1); + + zcp = umem_zalloc(sizeof (zfs_cmd_t), UMEM_NOFAIL); + + innvl = fnvlist_alloc(); + fnvlist_add_boolean_value(innvl, "force", B_FALSE); + + (void) strlcpy(zcp->zc_name, name, sizeof (zcp->zc_name)); + packed = fnvlist_pack(innvl, &size); + zcp->zc_nvlist_src = (uint64_t)(uintptr_t)packed; + zcp->zc_nvlist_src_size = size; + + ret = zfs_ioctl_fd(fd, ZFS_IOC_POOL_SYNC, zcp); + + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zcp->zc_nvlist_dst); + nvlist_free(innvl); + umem_free(zcp, sizeof (zfs_cmd_t)); + + (void) close(fd); + + *isactive = (ret == 0); + + return (0); +} + +const pool_config_ops_t libzpool_config_ops = { + .pco_refresh_config = refresh_config, + .pco_pool_active = pool_active, +}; diff --git a/lib/libzstd/Makefile.am b/lib/libzstd/Makefile.am new file mode 100644 index 000000000000..df31d8b23d16 --- /dev/null +++ b/lib/libzstd/Makefile.am @@ -0,0 +1,21 @@ +include $(top_srcdir)/config/Rules.am + +VPATH = $(top_srcdir)/module/zstd + +# -fno-tree-vectorize is set for gcc in zstd/common/compiler.h +# Set it for other compilers, too. 
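+# The per-object CFLAGS below also force-include zstd_compat_wrapper.h, +# which renames the bundled zstd symbols (presumably to avoid clashing +# with a system libzstd loaded into the same process).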
+AM_CFLAGS += -fno-tree-vectorize + +noinst_LTLIBRARIES = libzstd.la + +KERNEL_C = \ + lib/zstd.c \ + zfs_zstd.c + +nodist_libzstd_la_SOURCES = $(KERNEL_C) + +lib/zstd.$(OBJEXT): CFLAGS += -fno-tree-vectorize -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h -Wp,-w +lib/zstd.l$(OBJEXT): CFLAGS += -fno-tree-vectorize -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h -Wp,-w + +zfs_zstd.$(OBJEXT): CFLAGS += -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h +zfs_zstd.l$(OBJEXT): CFLAGS += -include $(top_srcdir)/module/zstd/include/zstd_compat_wrapper.h diff --git a/lib/libzutil/Makefile.am b/lib/libzutil/Makefile.am new file mode 100644 index 000000000000..cd03ba19d508 --- /dev/null +++ b/lib/libzutil/Makefile.am @@ -0,0 +1,50 @@ +include $(top_srcdir)/config/Rules.am + +# Suppress unused but set variable warnings often due to ASSERTs +AM_CFLAGS += $(NO_UNUSED_BUT_SET_VARIABLE) +AM_CFLAGS += $(LIBBLKID_CFLAGS) $(LIBUDEV_CFLAGS) + +DEFAULT_INCLUDES += -I$(srcdir) + +noinst_LTLIBRARIES = libzutil.la + +USER_C = \ + zutil_device_path.c \ + zutil_import.c \ + zutil_import.h \ + zutil_nicenum.c \ + zutil_pool.c + +if BUILD_LINUX +USER_C += \ + os/linux/zutil_device_path_os.c \ + os/linux/zutil_import_os.c \ + os/linux/zutil_compat.c +endif + +if BUILD_FREEBSD +DEFAULT_INCLUDES += -I$(top_srcdir)/include/os/freebsd/zfs + +USER_C += \ + os/freebsd/zutil_device_path_os.c \ + os/freebsd/zutil_import_os.c \ + os/freebsd/zutil_compat.c + +VPATH += $(top_srcdir)/module/os/freebsd/zfs + +nodist_libzutil_la_SOURCES = zfs_ioctl_compat.c +endif + +libzutil_la_SOURCES = $(USER_C) + +libzutil_la_LIBADD = \ + $(abs_top_builddir)/lib/libavl/libavl.la \ + $(abs_top_builddir)/lib/libtpool/libtpool.la \ + $(abs_top_builddir)/lib/libspl/libspl.la + +if BUILD_LINUX +libzutil_la_LIBADD += \ + $(abs_top_builddir)/lib/libefi/libefi.la +endif + +libzutil_la_LIBADD += -lm $(LIBBLKID_LIBS) $(LIBUDEV_LIBS) diff --git a/lib/libzutil/os/freebsd/zutil_compat.c b/lib/libzutil/os/freebsd/zutil_compat.c new file mode 100644 index 000000000000..3e70fef1ec3d --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_compat.c @@ -0,0 +1,124 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include +#include +#include +#include + +#include + +int zfs_ioctl_version = ZFS_IOCVER_UNDEF; + +/* + * Get zfs_ioctl_version + */ +static int +get_zfs_ioctl_version(void) +{ + size_t ver_size; + int ver = ZFS_IOCVER_NONE; + + ver_size = sizeof (ver); + sysctlbyname("vfs.zfs.version.ioctl", &ver, &ver_size, NULL, 0); + + return (ver); +} + +static int +zcmd_ioctl_compat(int fd, int request, zfs_cmd_t *zc, const int cflag) +{ + int newrequest, ret; + void *zc_c = NULL; + unsigned long ncmd; + zfs_iocparm_t zp; + + switch (cflag) { + case ZFS_CMD_COMPAT_NONE: + ncmd = _IOWR('Z', request, zfs_iocparm_t); + zp.zfs_cmd = (uint64_t)(uintptr_t)zc; + zp.zfs_cmd_size = sizeof (zfs_cmd_t); + zp.zfs_ioctl_version = ZFS_IOCVER_OZFS; + break; + case ZFS_CMD_COMPAT_LEGACY: + newrequest = zfs_ioctl_ozfs_to_legacy(request); + ncmd = _IOWR('Z', newrequest, zfs_iocparm_t); + zc_c = malloc(sizeof (zfs_cmd_legacy_t)); + zfs_cmd_ozfs_to_legacy(zc, zc_c); + zp.zfs_cmd = (uint64_t)(uintptr_t)zc_c; + zp.zfs_cmd_size = sizeof (zfs_cmd_legacy_t); + zp.zfs_ioctl_version = ZFS_IOCVER_LEGACY; + break; + default: + abort(); + return (EINVAL); + } + + ret = ioctl(fd, ncmd, &zp); + if (ret) { + if (zc_c) + free(zc_c); + return (ret); + } + if (zc_c) { + zfs_cmd_legacy_to_ozfs(zc_c, zc); + free(zc_c); + } + return (ret); +} + +/* + * This is the FreeBSD version of ioctl(). Solaris' ioctl() updates + * zc_nvlist_dst_size even if an error is returned; on FreeBSD, if an + * error is returned, zc_nvlist_dst_size is not updated. + */ +int +zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +{ + size_t oldsize; + int ret, cflag = ZFS_CMD_COMPAT_NONE; + + if (zfs_ioctl_version == ZFS_IOCVER_UNDEF) + zfs_ioctl_version = get_zfs_ioctl_version(); + + switch (zfs_ioctl_version) { + case ZFS_IOCVER_LEGACY: + cflag = ZFS_CMD_COMPAT_LEGACY; + break; + case ZFS_IOCVER_OZFS: + cflag = ZFS_CMD_COMPAT_NONE; + break; + default: + errx(1, "unrecognized zfs ioctl version %d", + zfs_ioctl_version); + } + + oldsize = zc->zc_nvlist_dst_size; + ret = zcmd_ioctl_compat(fd, request, zc, cflag); + + if (ret == 0 && oldsize < zc->zc_nvlist_dst_size) { + ret = -1; + errno = ENOMEM; + } + + return (ret); +} diff --git a/lib/libzutil/os/freebsd/zutil_device_path_os.c b/lib/libzutil/os/freebsd/zutil_device_path_os.c new file mode 100644 index 000000000000..71c936005242 --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_device_path_os.c @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include + +#include + +#include + +#include + +/* + * We don't strip/append partitions on FreeBSD. + */ + +/* + * Note: The caller must free the returned string. + */ +char * +zfs_strip_partition(char *dev) +{ + return (strdup(dev)); +} + +int +zfs_append_partition(char *path, size_t max_len) +{ + return (strnlen(path, max_len)); +} + +/* + * Strip the path from a device name. + * On FreeBSD we only want to remove "/dev/" from the beginning of + * paths if present. + */ +char * +zfs_strip_path(char *path) +{ + if (strncmp(path, _PATH_DEV, sizeof (_PATH_DEV) - 1) == 0) + return (path + sizeof (_PATH_DEV) - 1); + else + return (path); +} + +char * +zfs_get_underlying_path(const char *dev_name) +{ + + if (dev_name == NULL) + return (NULL); + + return (realpath(dev_name, NULL)); +} + +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + int fd; + + fd = g_open(dev_name, 0); + if (fd >= 0) { + g_close(fd); + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Wait up to timeout_ms for the device node to become available. The device + * is considered ready when a stat() of the path succeeds and the node has + * been allowed to settle. At this point the device can be accessed reliably. + * Depending on the system this process could take several seconds. + */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +} + +/* ARGSUSED */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c new file mode 100644 index 000000000000..6cb001eac3de --- /dev/null +++ b/lib/libzutil/os/freebsd/zutil_import_os.c @@ -0,0 +1,239 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ * Copyright 2015 RackTop Systems. + * Copyright 2016 Nexenta Systems, Inc. + */ + +/* + * Pool import support functions. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include + +#include "zutil_import.h" + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * On FreeBSD we currently just strip devid and phys_path to avoid confusion. + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); +} + +/* + * Do not even look at these devices. + */ +static const char * const excluded_devs[] = { + "nfslock", + "sequencer", + "zfs", +}; +#define EXCLUDED_DIR "/dev/" +#define EXCLUDED_DIR_LEN 5 + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + size_t i; + int num_labels; + int fd; + off_t mediasize = 0; + + /* + * Do not even look at excluded devices. + */ + if (strncmp(rn->rn_name, EXCLUDED_DIR, EXCLUDED_DIR_LEN) == 0) { + char *name = rn->rn_name + EXCLUDED_DIR_LEN; + for (i = 0; i < nitems(excluded_devs); ++i) { + const char *excluded_name = excluded_devs[i]; + size_t len = strlen(excluded_name); + if (strncmp(name, excluded_name, len) == 0) { + return; + } + } + } + + /* + * O_NONBLOCK so we don't hang trying to open things like serial ports. + */ + if ((fd = open(rn->rn_name, O_RDONLY|O_NONBLOCK)) < 0) + return; + + /* + * Ignore failed stats. + */ + if (fstat64(fd, &statbuf) != 0) + goto out; + /* + * We only want regular files, character devs and block devs. + */ + if (S_ISREG(statbuf.st_mode)) { + /* Check if this file is too small to hold a zpool. */ + if (statbuf.st_size < SPA_MINDEVSIZE) { + goto out; + } + } else if (S_ISCHR(statbuf.st_mode) || S_ISBLK(statbuf.st_mode)) { + /* Check if this device is too small to hold a zpool. */ + if (ioctl(fd, DIOCGMEDIASIZE, &mediasize) != 0 || + mediasize < SPA_MINDEVSIZE) { + goto out; + } + } else { + goto out; + } + + if (zpool_read_label(fd, &config, &num_labels) != 0) + goto out; + if (num_labels == 0) { + nvlist_free(config); + goto out; + } + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* TODO: Reuse labelpaths logic from Linux? 
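+ * (On Linux, zpool_open_func() also queues the path and devid strings + * recorded in the label as additional candidate nodes to probe.)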
*/ +out: + (void) close(fd); +} + +static const char * +zpool_default_import_path[] = { + "/dev" +}; + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = nitems(zpool_default_import_path); + return (zpool_default_import_path); +} + +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + char *end, path[MAXPATHLEN]; + rdsk_node_t *slice; + struct gmesh mesh; + struct gclass *mp; + struct ggeom *gp; + struct gprovider *pp; + avl_index_t where; + size_t pathleft; + int error; + + end = stpcpy(path, "/dev/"); + pathleft = &path[sizeof (path)] - end; + + error = geom_gettree(&mesh); + if (error != 0) + return (error); + + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + + LIST_FOREACH(mp, &mesh.lg_class, lg_class) { + LIST_FOREACH(gp, &mp->lg_geom, lg_geom) { + LIST_FOREACH(pp, &gp->lg_provider, lg_provider) { + strlcpy(end, pp->lg_name, pathleft); + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_FALSE; + slice->rn_order = IMPORT_ORDER_DEFAULT; + + pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + } + } + + geom_deletetree(&mesh); + + return (0); +} + +int +zfs_dev_flush(int fd __unused) +{ + return (0); +} diff --git a/lib/libzutil/os/linux/zutil_compat.c b/lib/libzutil/os/linux/zutil_compat.c new file mode 100644 index 000000000000..173ae9cb619c --- /dev/null +++ b/lib/libzutil/os/linux/zutil_compat.c @@ -0,0 +1,30 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +#include +#include +#include +#include + +int +zfs_ioctl_fd(int fd, unsigned long request, zfs_cmd_t *zc) +{ + return (ioctl(fd, request, zc)); +} diff --git a/lib/libzutil/os/linux/zutil_device_path_os.c b/lib/libzutil/os/linux/zutil_device_path_os.c new file mode 100644 index 000000000000..36331fd72bf7 --- /dev/null +++ b/lib/libzutil/os/linux/zutil_device_path_os.c @@ -0,0 +1,506 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_LIBUDEV +#include +#endif + +#include + +/* + * Append partition suffix to an otherwise fully qualified device path. + * This is used to generate the full path as it is stored in + * ZPOOL_CONFIG_PATH for whole disk devices. On success the new length + * of 'path' is returned; on error a negative value is returned. + */ +int +zfs_append_partition(char *path, size_t max_len) +{ + int len = strlen(path); + + if ((strncmp(path, UDISK_ROOT, strlen(UDISK_ROOT)) == 0) || + (strncmp(path, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0)) { + if (len + 6 >= max_len) + return (-1); + + (void) strcat(path, "-part1"); + len += 6; + } else { + if (len + 2 >= max_len) + return (-1); + + if (isdigit(path[len-1])) { + (void) strcat(path, "p1"); + len += 2; + } else { + (void) strcat(path, "1"); + len += 1; + } + } + + return (len); +} + +/* + * Remove partition suffix from a vdev path. Partition suffixes may take three + * forms: "-partX", "pX", or "X", where X is a string of digits. The second + * case only occurs when the suffix is preceded by a digit, i.e. "md0p0". The + * third case only occurs when preceded by a string matching the regular + * expression "^([hsv]|xv)d[a-z]+", i.e. a scsi, ide, virtio or xen disk. + * + * The caller must free the returned string. + */ +char * +zfs_strip_partition(char *path) +{ + char *tmp = strdup(path); + char *part = NULL, *d = NULL; + if (!tmp) + return (NULL); + + if ((part = strstr(tmp, "-part")) && part != tmp) { + d = part + 5; + } else if ((part = strrchr(tmp, 'p')) && + part > tmp + 1 && isdigit(*(part-1))) { + d = part + 1; + } else if ((tmp[0] == 'h' || tmp[0] == 's' || tmp[0] == 'v') && + tmp[1] == 'd') { + for (d = &tmp[2]; isalpha(*d); part = ++d) { } + } else if (strncmp("xvd", tmp, 3) == 0) { + for (d = &tmp[3]; isalpha(*d); part = ++d) { } + } + if (part && d && *d != '\0') { + for (; isdigit(*d); d++) { } + if (*d == '\0') + *part = '\0'; + } + + return (tmp); +} + +/* + * Same as zfs_strip_partition, but allows "/dev/" to be in the pathname + * + * path: /dev/sda1 + * returns: /dev/sda + * + * Returned string must be freed. + */ +static char * +zfs_strip_partition_path(char *path) +{ + char *newpath = strdup(path); + char *sd_offset; + char *new_sd; + + if (!newpath) + return (NULL); + + /* Point to "sda1" part of "/dev/sda1" */ + sd_offset = strrchr(newpath, '/') + 1; + + /* Get our new name "sda" */ + new_sd = zfs_strip_partition(sd_offset); + if (!new_sd) { + free(newpath); + return (NULL); + } + + /* Paste the "sda" where "sda1" was */ + strlcpy(sd_offset, new_sd, strlen(sd_offset) + 1); + + /* Free temporary "sda" */ + free(new_sd); + + return (newpath); +} + +/* + * Strip the unwanted portion of a device path. + */ +char * +zfs_strip_path(char *path) +{ + return (strrchr(path, '/') + 1); +} + +/* + * Allocate and return the underlying device name for a device mapper device. + * If a device mapper device maps to multiple devices, return the first device.
+ * + * For example, dm_name = "/dev/dm-0" could return "/dev/sda". Symlinks to a + * DM device (like /dev/disk/by-vdev/A0) are also allowed. + * + * Returns device name, or NULL on error or no match. If dm_name is not a DM + * device then return NULL. + * + * NOTE: The returned name string must be *freed*. + */ +static char * +dm_get_underlying_path(const char *dm_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char *realp; + char *tmp = NULL; + char *path = NULL; + char *dev_str; + int size; + + if (dm_name == NULL) + return (NULL); + + /* dm name may be a symlink (like /dev/disk/by-vdev/A0) */ + realp = realpath(dm_name, NULL); + if (realp == NULL) + return (NULL); + + /* + * If they preface 'dev' with a path (like "/dev") then strip it off. + * We just want the 'dm-N' part. + */ + tmp = strrchr(realp, '/'); + if (tmp != NULL) + dev_str = tmp + 1; /* +1 since we want the chr after '/' */ + else + dev_str = tmp; + + size = asprintf(&tmp, "/sys/block/%s/slaves/", dev_str); + if (size == -1 || !tmp) + goto end; + + dp = opendir(tmp); + if (dp == NULL) + goto end; + + /* + * Return first entry (that isn't itself a directory) in the + * directory containing device-mapper dependent (underlying) + * devices. + */ + while ((ep = readdir(dp))) { + if (ep->d_type != DT_DIR) { /* skip "." and ".." dirs */ + size = asprintf(&path, "/dev/%s", ep->d_name); + break; + } + } + +end: + if (dp != NULL) + closedir(dp); + free(tmp); + free(realp); + return (path); +} + +/* + * Return B_TRUE if device is a device mapper or multipath device. + * Return B_FALSE if not. + */ +boolean_t +zfs_dev_is_dm(const char *dev_name) +{ + + char *tmp; + tmp = dm_get_underlying_path(dev_name); + if (tmp == NULL) + return (B_FALSE); + + free(tmp); + return (B_TRUE); +} + +/* + * By "whole disk" we mean an entire physical disk (something we can + * label, toggle the write cache on, etc.) as opposed to the full + * capacity of a pseudo-device such as lofi or did. We act as if we + * are labeling the disk, which should be a pretty good test of whether + * it's a viable device or not. Returns B_TRUE if it is and B_FALSE if + * it isn't. + */ +boolean_t +zfs_dev_is_whole_disk(const char *dev_name) +{ + struct dk_gpt *label; + int fd; + + if ((fd = open(dev_name, O_RDONLY | O_DIRECT)) < 0) + return (B_FALSE); + + if (efi_alloc_and_init(fd, EFI_NUMPAR, &label) != 0) { + (void) close(fd); + return (B_FALSE); + } + + efi_free(label); + (void) close(fd); + + return (B_TRUE); +} + +/* + * Lookup the underlying device for a device name + * + * Often you'll have a symlink to a device, a partition device, + * or a multipath device, and want to look up the underlying device. + * This function returns the underlying device name. If the device + * name is already the underlying device, then just return the same + * name. If the device is a DM device with multiple underlying devices + * then return the first one. + * + * For example: + * + * 1. /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 -> ../../sda + * dev_name: /dev/disk/by-id/ata-QEMU_HARDDISK_QM00001 + * returns: /dev/sda + * + * 2. /dev/mapper/mpatha (made up of /dev/sda and /dev/sdb) + * dev_name: /dev/mapper/mpatha + * returns: /dev/sda (first device) + * + * 3. /dev/sda (already the underlying device) + * dev_name: /dev/sda + * returns: /dev/sda + * + * 4. /dev/dm-3 (mapped to /dev/sda) + * dev_name: /dev/dm-3 + * returns: /dev/sda + * + * 5. 
/dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 -> ../../sdb9 + * dev_name: /dev/disk/by-id/scsi-0QEMU_drive-scsi0-0-0-0-part9 + * returns: /dev/sdb + * + * 6. /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a -> ../dev/sda2 + * dev_name: /dev/disk/by-uuid/5df030cf-3cd9-46e4-8e99-3ccb462a4e9a + * returns: /dev/sda + * + * Returns underlying device name, or NULL on error or no match. + * + * NOTE: The returned name string must be *freed*. + */ +char * +zfs_get_underlying_path(const char *dev_name) +{ + char *name = NULL; + char *tmp; + + if (dev_name == NULL) + return (NULL); + + tmp = dm_get_underlying_path(dev_name); + + /* dev_name not a DM device, so just un-symlinkize it */ + if (tmp == NULL) + tmp = realpath(dev_name, NULL); + + if (tmp != NULL) { + name = zfs_strip_partition_path(tmp); + free(tmp); + } + + return (name); +} + +/* + * Given a dev name like "sda", return the full enclosure sysfs path to + * the disk. You can also pass in the name with "/dev" prepended + * to it (like /dev/sda). + * + * For example, disk "sda" in enclosure slot 1: + * dev: "sda" + * returns: "/sys/class/enclosure/1:0:3:0/Slot 1" + * + * 'dev' must be a non-devicemapper device. + * + * Returned string must be freed. + */ +char * +zfs_get_enclosure_sysfs_path(const char *dev_name) +{ + DIR *dp = NULL; + struct dirent *ep; + char buf[MAXPATHLEN]; + char *tmp1 = NULL; + char *tmp2 = NULL; + char *tmp3 = NULL; + char *path = NULL; + size_t size; + int tmpsize; + + if (dev_name == NULL) + return (NULL); + + /* If they preface 'dev' with a path (like "/dev") then strip it off */ + tmp1 = strrchr(dev_name, '/'); + if (tmp1 != NULL) + dev_name = tmp1 + 1; /* +1 since we want the chr after '/' */ + + tmpsize = asprintf(&tmp1, "/sys/block/%s/device", dev_name); + if (tmpsize == -1 || tmp1 == NULL) { + tmp1 = NULL; + goto end; + } + + dp = opendir(tmp1); + if (dp == NULL) + goto end; + + /* + * Look through all sysfs entries in /sys/block/<dev>/device for + * the enclosure symlink. + */ + while ((ep = readdir(dp))) { + /* Ignore everything that's not our enclosure_device link */ + if (strstr(ep->d_name, "enclosure_device") == NULL) + continue; + + if (asprintf(&tmp2, "%s/%s", tmp1, ep->d_name) == -1 || + tmp2 == NULL) + break; + + size = readlink(tmp2, buf, sizeof (buf)); + + /* Did readlink fail or crop the link name? */ + if (size == -1 || size >= sizeof (buf)) { + free(tmp2); + tmp2 = NULL; /* To make free() at the end a NOP */ + break; + } + + /* + * We got a valid link. readlink() doesn't terminate strings + * so we have to do it.
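+ * (readlink(2) returns the number of bytes placed in the buffer and + * never appends a NUL; a count equal to sizeof (buf) may also mean the + * target was truncated, which is why the length check above rejects it.)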
+ */ + buf[size] = '\0'; + + /* + * Our link will look like: + * + * "../../../../port-11:1:2/..STUFF../enclosure/1:0:3:0/SLOT 1" + * + * We want to grab the "enclosure/1:0:3:0/SLOT 1" part + */ + tmp3 = strstr(buf, "enclosure"); + if (tmp3 == NULL) + break; + + if (asprintf(&path, "/sys/class/%s", tmp3) == -1) { + /* If asprintf() fails, 'path' is undefined */ + path = NULL; + break; + } + + if (path == NULL) + break; + } + +end: + free(tmp2); + free(tmp1); + + if (dp != NULL) + closedir(dp); + + return (path); +} + +#ifdef HAVE_LIBUDEV + +/* + * A disk is considered a multipath whole disk when: + * DEVNAME key value has "dm-" + * DM_NAME key value has "mpath" prefix + * DM_UUID key exists + * ID_PART_TABLE_TYPE key does not exist or is not gpt + */ +static boolean_t +udev_mpath_whole_disk(struct udev_device *dev) +{ + const char *devname, *type, *uuid; + + devname = udev_device_get_property_value(dev, "DEVNAME"); + type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + + if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && + ((type == NULL) || (strcmp(type, "gpt") != 0)) && + (uuid != NULL)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a disk is effectively a multipath whole disk + */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname; + boolean_t wholedisk = B_FALSE; + + if (realpath(path, nodepath) == NULL) + return (B_FALSE); + sysname = strrchr(nodepath, '/') + 1; + if (strncmp(sysname, "dm-", 3) != 0) + return (B_FALSE); + if ((udev = udev_new()) == NULL) + return (B_FALSE); + if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname)) == NULL) { + udev_device_unref(dev); + return (B_FALSE); + } + + wholedisk = udev_mpath_whole_disk(dev); + + udev_device_unref(dev); + return (wholedisk); +} + +#else /* HAVE_LIBUDEV */ + +/* ARGSUSED */ +boolean_t +is_mpath_whole_disk(const char *path) +{ + return (B_FALSE); +} + +#endif /* HAVE_LIBUDEV */ diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c new file mode 100644 index 000000000000..a4bf01749da8 --- /dev/null +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -0,0 +1,871 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * Pool import support functions. 
+ * + * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since + * these commands are expected to run in the global zone, we can assume + * that the devices are all readable when called. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zutil_import.h" + +#ifdef HAVE_LIBUDEV +#include +#include +#endif +#include + +#define DEFAULT_IMPORT_PATH_SIZE 9 +#define DEV_BYID_PATH "/dev/disk/by-id/" + +static boolean_t +is_watchdog_dev(char *dev) +{ + /* For 'watchdog' dev */ + if (strcmp(dev, "watchdog") == 0) + return (B_TRUE); + + /* For 'watchdog<digit>' dev */ + if (strstr(dev, "watchdog") == dev && isdigit(dev[8])) + return (B_TRUE); + + return (B_FALSE); +} + +int +zfs_dev_flush(int fd) +{ + return (ioctl(fd, BLKFLSBUF)); +} + +void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + libpc_handle_t *hdl = rn->rn_hdl; + struct stat64 statbuf; + nvlist_t *config; + char *bname, *dupname; + uint64_t vdev_guid = 0; + int error; + int num_labels = 0; + int fd; + + /* + * Skip devices with well-known prefixes; there can be side effects + * when opening these devices which need to be avoided. + * + * hpet - High Precision Event Timer + * watchdog - Watchdog must be closed in a special way. + */ + dupname = zutil_strdup(hdl, rn->rn_name); + bname = basename(dupname); + error = ((strcmp(bname, "hpet") == 0) || is_watchdog_dev(bname)); + free(dupname); + if (error) + return; + + /* + * Ignore failed stats. We only want regular files and block devices. + */ + if (stat64(rn->rn_name, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && !S_ISBLK(statbuf.st_mode))) + return; + + /* + * Preferentially open using O_DIRECT to bypass the block device + * cache which may be stale for multipath devices. An EINVAL errno + * indicates O_DIRECT is unsupported so fall back to just O_RDONLY. + */ + fd = open(rn->rn_name, O_RDONLY | O_DIRECT); + if ((fd < 0) && (errno == EINVAL)) + fd = open(rn->rn_name, O_RDONLY); + if ((fd < 0) && (errno == EACCES)) + hdl->lpc_open_access_error = B_TRUE; + if (fd < 0) + return; + + /* + * This file is too small to hold a zpool + */ + if (S_ISREG(statbuf.st_mode) && statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } + + error = zpool_read_label(fd, &config, &num_labels); + if (error != 0) { + (void) close(fd); + return; + } + + if (num_labels == 0) { + (void) close(fd); + nvlist_free(config); + return; + } + + /* + * Check that the vdev is for the expected guid. Additional entries + * are speculatively added based on the paths stored in the labels. + * Entries with valid paths but incorrect guids must be removed.
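+ * (For example, a stale /dev/disk/by-id link queued from an old label + * may still open successfully yet reference a repurposed disk.)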
+ */ + error = nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + if (error || (rn->rn_vdev_guid && rn->rn_vdev_guid != vdev_guid)) { + (void) close(fd); + nvlist_free(config); + return; + } + + (void) close(fd); + + rn->rn_config = config; + rn->rn_num_labels = num_labels; + + /* + * Add additional entries for paths described by this label. + */ + if (rn->rn_labelpaths) { + char *path = NULL; + char *devid = NULL; + char *env = NULL; + rdsk_node_t *slice; + avl_index_t where; + int timeout; + int error; + + if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) + return; + + env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); + if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || + timeout < 0) { + timeout = DISK_LABEL_WAIT; + } + + /* + * Allow devlinks to stabilize so all paths are available. + */ + zpool_label_disk_wait(rn->rn_name, timeout); + + if (path != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, path); + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_1; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + + if (devid != NULL) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + error = asprintf(&slice->rn_name, "%s%s", + DEV_BYID_PATH, devid); + if (error == -1) { + free(slice); + return; + } + + slice->rn_vdev_guid = vdev_guid; + slice->rn_avl = rn->rn_avl; + slice->rn_hdl = hdl; + slice->rn_order = IMPORT_ORDER_PREFERRED_2; + slice->rn_labelpaths = B_FALSE; + pthread_mutex_lock(rn->rn_lock); + if (avl_find(rn->rn_avl, slice, &where)) { + pthread_mutex_unlock(rn->rn_lock); + free(slice->rn_name); + free(slice); + } else { + avl_insert(rn->rn_avl, slice, where); + pthread_mutex_unlock(rn->rn_lock); + zpool_open_func(slice); + } + } + } +} + +static char * +zpool_default_import_path[DEFAULT_IMPORT_PATH_SIZE] = { + "/dev/disk/by-vdev", /* Custom rules, use first if they exist */ + "/dev/mapper", /* Use multipath devices before components */ + "/dev/disk/by-partlabel", /* Single unique entry set by user */ + "/dev/disk/by-partuuid", /* Generated partition uuid */ + "/dev/disk/by-label", /* Custom persistent labels */ + "/dev/disk/by-uuid", /* Single unique entry and persistent */ + "/dev/disk/by-id", /* May be multiple entries and persistent */ + "/dev/disk/by-path", /* Encodes physical location and persistent */ + "/dev" /* UNSAFE device names will change */ +}; + +const char * const * +zpool_default_search_paths(size_t *count) +{ + *count = DEFAULT_IMPORT_PATH_SIZE; + return ((const char * const *)zpool_default_import_path); +} + +/* + * Given a full path to a device determine if that device appears in the + * import search path. If it does return the first match and store the + * index in the passed 'order' variable, otherwise return an error. 
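+ * For example, with the default table above a device under + * "/dev/mapper" resolves to order 1 while a bare "/dev" node resolves + * to order 8, so multipath devices sort ahead of their components.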
+ */ +static int +zfs_path_order(char *name, int *order) +{ + int i = 0, error = ENOENT; + char *dir, *env, *envdup; + + env = getenv("ZPOOL_IMPORT_PATH"); + if (env) { + envdup = strdup(env); + dir = strtok(envdup, ":"); + while (dir) { + if (strncmp(name, dir, strlen(dir)) == 0) { + *order = i; + error = 0; + break; + } + dir = strtok(NULL, ":"); + i++; + } + free(envdup); + } else { + for (i = 0; i < DEFAULT_IMPORT_PATH_SIZE; i++) { + if (strncmp(name, zpool_default_import_path[i], + strlen(zpool_default_import_path[i])) == 0) { + *order = i; + error = 0; + break; + } + } + } + + return (error); +} + +/* + * Use libblkid to quickly enumerate all known zfs devices. + */ +int +zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache) +{ + rdsk_node_t *slice; + blkid_cache cache; + blkid_dev_iterate iter; + blkid_dev dev; + avl_index_t where; + int error; + + *slice_cache = NULL; + + error = blkid_get_cache(&cache, NULL); + if (error != 0) + return (error); + + error = blkid_probe_all_new(cache); + if (error != 0) { + blkid_put_cache(cache); + return (error); + } + + iter = blkid_dev_iterate_begin(cache); + if (iter == NULL) { + blkid_put_cache(cache); + return (EINVAL); + } + + error = blkid_dev_set_search(iter, "TYPE", "zfs_member"); + if (error != 0) { + blkid_dev_iterate_end(iter); + blkid_put_cache(cache); + return (error); + } + + *slice_cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(*slice_cache, slice_cache_compare, sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + + while (blkid_dev_next(iter, &dev) == 0) { + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zutil_strdup(hdl, blkid_dev_devname(dev)); + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = *slice_cache; + slice->rn_hdl = hdl; + slice->rn_labelpaths = B_TRUE; + + error = zfs_path_order(slice->rn_name, &slice->rn_order); + if (error == 0) + slice->rn_order += IMPORT_ORDER_SCAN_OFFSET; + else + slice->rn_order = IMPORT_ORDER_DEFAULT; + + pthread_mutex_lock(lock); + if (avl_find(*slice_cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(*slice_cache, slice, where); + } + pthread_mutex_unlock(lock); + } + + blkid_dev_iterate_end(iter); + blkid_put_cache(cache); + + return (0); +} + +/* + * Linux persistent device strings for vdev labels + * + * based on libudev for consistency with libudev disk add/remove events + */ + +typedef struct vdev_dev_strs { + char vds_devid[128]; + char vds_devphys[128]; +} vdev_dev_strs_t; + +#ifdef HAVE_LIBUDEV + +/* + * Obtain the persistent device id string (describes what) + * + * used by ZED vdev matching for auto-{online,expand,replace} + */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + struct udev_list_entry *entry; + const char *bus; + char devbyid[MAXPATHLEN]; + + /* The bus based by-id path is preferred */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + + if (bus == NULL) { + const char *dm_uuid; + + /* + * For multipath nodes use the persistent uuid based identifier + * + * Example: /dev/disk/by-id/dm-uuid-mpath-35000c5006304de3f + */ + dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (dm_uuid != NULL) { + (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); + return (0); + } + + /* + * For volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + 
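+ /* a devlink under /dev/zvol is itself used as the persistent id */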
if (strncmp(name, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + /* + * NVME 'by-id' symlinks are similar to bus case + */ + struct udev_device *parent; + + parent = udev_device_get_parent_with_subsystem_devtype(dev, + "nvme", NULL); + if (parent != NULL) + bus = "nvme"; /* continue with bus symlink search */ + else + return (ENODATA); + } + + /* + * locate the bus specific by-id link + */ + (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, devbyid, strlen(devbyid)) == 0) { + name += strlen(DEV_BYID_PATH); + (void) strlcpy(bufptr, name, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * Obtain the persistent physical location string (describes where) + * + * used by ZED vdev matching for auto-{online,expand,replace} + */ +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + const char *physpath = NULL; + struct udev_list_entry *entry; + + /* + * Normal disks use ID_PATH for their physical path. + */ + physpath = udev_device_get_property_value(dev, "ID_PATH"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * Device mapper devices are virtual and don't have a physical + * path. For them we use ID_VDEV instead, which is setup via the + * /etc/vdev_id.conf file. ID_VDEV provides a persistent path + * to a virtual device. If you don't have vdev_id.conf setup, + * you cannot use multipath autoreplace with device mapper. + */ + physpath = udev_device_get_property_value(dev, "ID_VDEV"); + if (physpath != NULL && strlen(physpath) > 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + + /* + * For ZFS volumes use the persistent /dev/zvol/dataset identifier + */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + physpath = udev_list_entry_get_name(entry); + if (strncmp(physpath, ZVOL_ROOT, strlen(ZVOL_ROOT)) == 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + /* + * For all other devices fallback to using the by-uuid name. 
+ */ + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + physpath = udev_list_entry_get_name(entry); + if (strncmp(physpath, "/dev/disk/by-uuid", 17) == 0) { + (void) strlcpy(bufptr, physpath, buflen); + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * A disk is considered a multipath whole disk when: + * DEVNAME key value has "dm-" + * DM_NAME key value has "mpath" prefix + * DM_UUID key exists + * ID_PART_TABLE_TYPE key does not exist or is not gpt + */ +static boolean_t +udev_mpath_whole_disk(struct udev_device *dev) +{ + const char *devname, *type, *uuid; + + devname = udev_device_get_property_value(dev, "DEVNAME"); + type = udev_device_get_property_value(dev, "ID_PART_TABLE_TYPE"); + uuid = udev_device_get_property_value(dev, "DM_UUID"); + + if ((devname != NULL && strncmp(devname, "/dev/dm-", 8) == 0) && + ((type == NULL) || (strcmp(type, "gpt") != 0)) && + (uuid != NULL)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +static int +udev_device_is_ready(struct udev_device *dev) +{ +#ifdef HAVE_LIBUDEV_UDEV_DEVICE_GET_IS_INITIALIZED + return (udev_device_get_is_initialized(dev)); +#else + /* wait for DEVLINKS property to be initialized */ + return (udev_device_get_property_value(dev, "DEVLINKS") != NULL); +#endif +} + +#else + +/* ARGSUSED */ +int +zfs_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +/* ARGSUSED */ +int +zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen) +{ + return (ENODATA); +} + +#endif /* HAVE_LIBUDEV */ + +/* + * Wait up to timeout_ms for udev to set up the device node. The device is + * considered ready when libudev determines it has been initialized, all of + * the device links have been verified to exist, and it has been allowed to + * settle. At this point the device can be accessed reliably. + * Depending on the complexity of the udev rules this process could take + * several seconds.
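+ * The wait loop below therefore requires the node to remain visible + * for a short settle window (settle_ms) before reporting it ready.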
+ */ +int +zpool_label_disk_wait(const char *path, int timeout_ms) +{ +#ifdef HAVE_LIBUDEV + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname = NULL; + int ret = ENODEV; + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + start = gethrtime(); + settle = 0; + + do { + if (sysname == NULL) { + if (realpath(path, nodepath) != NULL) { + sysname = strrchr(nodepath, '/') + 1; + } else { + (void) usleep(sleep_ms * MILLISEC); + continue; + } + } + + dev = udev_device_new_from_subsystem_sysname(udev, + "block", sysname); + if ((dev != NULL) && udev_device_is_ready(dev)) { + struct udev_list_entry *links, *link = NULL; + + ret = 0; + links = udev_device_get_devlinks_list_entry(dev); + + udev_list_entry_foreach(link, links) { + struct stat64 statbuf; + const char *name; + + name = udev_list_entry_get_name(link); + errno = 0; + if (stat64(name, &statbuf) == 0 && errno == 0) + continue; + + settle = 0; + ret = ENODEV; + break; + } + + if (ret == 0) { + if (settle == 0) { + settle = gethrtime(); + } else if (NSEC2MSEC(gethrtime() - settle) >= + settle_ms) { + udev_device_unref(dev); + break; + } + } + } + + udev_device_unref(dev); + (void) usleep(sleep_ms * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + udev_unref(udev); + + return (ret); +#else + int settle_ms = 50; + long sleep_ms = 10; + hrtime_t start, settle; + struct stat64 statbuf; + + start = gethrtime(); + settle = 0; + + do { + errno = 0; + if ((stat64(path, &statbuf) == 0) && (errno == 0)) { + if (settle == 0) + settle = gethrtime(); + else if (NSEC2MSEC(gethrtime() - settle) >= settle_ms) + return (0); + } else if (errno != ENOENT) { + return (errno); + } + + usleep(sleep_ms * MILLISEC); + } while (NSEC2MSEC(gethrtime() - start) < timeout_ms); + + return (ENODEV); +#endif /* HAVE_LIBUDEV */ +} + +/* + * Encode the persistent devices strings + * used for the vdev disk label + */ +static int +encode_device_strings(const char *path, vdev_dev_strs_t *ds, + boolean_t wholedisk) +{ +#ifdef HAVE_LIBUDEV + struct udev *udev; + struct udev_device *dev = NULL; + char nodepath[MAXPATHLEN]; + char *sysname; + int ret = ENODEV; + hrtime_t start; + + if ((udev = udev_new()) == NULL) + return (ENXIO); + + /* resolve path to a runtime device node instance */ + if (realpath(path, nodepath) == NULL) + goto no_dev; + + sysname = strrchr(nodepath, '/') + 1; + + /* + * Wait up to 3 seconds for udev to set up the device node context + */ + start = gethrtime(); + do { + dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname); + if (dev == NULL) + goto no_dev; + if (udev_device_is_ready(dev)) + break; /* udev ready */ + + udev_device_unref(dev); + dev = NULL; + + if (NSEC2MSEC(gethrtime() - start) < 10) + (void) sched_yield(); /* yield/busy wait up to 10ms */ + else + (void) usleep(10 * MILLISEC); + + } while (NSEC2MSEC(gethrtime() - start) < (3 * MILLISEC)); + + if (dev == NULL) + goto no_dev; + + /* + * Only whole disks require extra device strings + */ + if (!wholedisk && !udev_mpath_whole_disk(dev)) + goto no_dev; + + ret = zfs_device_get_devid(dev, ds->vds_devid, sizeof (ds->vds_devid)); + if (ret != 0) + goto no_dev_ref; + + /* physical location string (optional) */ + if (zfs_device_get_physical(dev, ds->vds_devphys, + sizeof (ds->vds_devphys)) != 0) { + ds->vds_devphys[0] = '\0'; /* empty string --> not available */ + } + +no_dev_ref: + udev_device_unref(dev); +no_dev: + udev_unref(udev); + + 
return (ret); +#else + return (ENOENT); +#endif +} + +/* + * Update a leaf vdev's persistent device strings + * + * - only applies for a dedicated leaf vdev (aka whole disk) + * - updated during pool create|add|attach|import + * - used for matching device matching during auto-{online,expand,replace} + * - stored in a leaf disk config label (i.e. alongside 'path' NVP) + * - these strings are currently not used in kernel (i.e. for vdev_disk_open) + * + * single device node example: + * devid: 'scsi-MG03SCA300_350000494a8cb3d67-part1' + * phys_path: 'pci-0000:04:00.0-sas-0x50000394a8cb3d67-lun-0' + * + * multipath device node example: + * devid: 'dm-uuid-mpath-35000c5006304de3f' + * + * We also store the enclosure sysfs path for turning on enclosure LEDs + * (if applicable): + * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + */ +void +update_vdev_config_dev_strs(nvlist_t *nv) +{ + vdev_dev_strs_t vds; + char *env, *type, *path; + uint64_t wholedisk = 0; + char *upath, *spath; + + /* + * For the benefit of legacy ZFS implementations, allow + * for opting out of devid strings in the vdev label. + * + * example use: + * env ZFS_VDEV_DEVID_OPT_OUT=YES zpool import dozer + * + * explanation: + * Older ZFS on Linux implementations had issues when attempting to + * display pool config VDEV names if a "devid" NVP value is present + * in the pool's config. + * + * For example, a pool that originated on illumos platform would + * have a devid value in the config and "zpool status" would fail + * when listing the config. + * + * A pool can be stripped of any "devid" values on import or + * prevented from adding them on zpool create|add by setting + * ZFS_VDEV_DEVID_OPT_OUT. + */ + env = getenv("ZFS_VDEV_DEVID_OPT_OUT"); + if (env && (strtoul(env, NULL, 0) > 0 || + !strncasecmp(env, "YES", 3) || !strncasecmp(env, "ON", 2))) { + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + return; + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0 || + strcmp(type, VDEV_TYPE_DISK) != 0) { + return; + } + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) + return; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); + + /* + * Update device string values in the config nvlist. + */ + if (encode_device_strings(path, &vds, (boolean_t)wholedisk) == 0) { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vds.vds_devid); + if (vds.vds_devphys[0] != '\0') { + (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vds.vds_devphys); + } + + /* Add enclosure sysfs path (if disk is in an enclosure). */ + upath = zfs_get_underlying_path(path); + spath = zfs_get_enclosure_sysfs_path(upath); + if (spath) + nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + spath); + else + nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + + free(upath); + free(spath); + } else { + /* Clear out any stale entries. */ + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_PHYS_PATH); + (void) nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + } +} diff --git a/lib/libzutil/zutil_device_path.c b/lib/libzutil/zutil_device_path.c new file mode 100644 index 000000000000..27ca80e5040c --- /dev/null +++ b/lib/libzutil/zutil_device_path.c @@ -0,0 +1,174 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include + +/* + * Given a shorthand device name check if a file by that name exists in any + * of the 'zpool_default_import_path' or ZPOOL_IMPORT_PATH directories. If + * one is found, store its fully qualified path in the 'path' buffer passed + * by the caller and return 0, otherwise return an error. + */ +int +zfs_resolve_shortname(const char *name, char *path, size_t len) +{ + int i, error = -1; + char *dir, *env, *envdup; + + env = getenv("ZPOOL_IMPORT_PATH"); + errno = ENOENT; + + if (env) { + envdup = strdup(env); + dir = strtok(envdup, ":"); + while (dir && error) { + (void) snprintf(path, len, "%s/%s", dir, name); + error = access(path, F_OK); + dir = strtok(NULL, ":"); + } + free(envdup); + } else { + const char * const *zpool_default_import_path; + size_t count; + + zpool_default_import_path = zpool_default_search_paths(&count); + + for (i = 0; i < count && error < 0; i++) { + (void) snprintf(path, len, "%s/%s", + zpool_default_import_path[i], name); + error = access(path, F_OK); + } + } + + return (error ? ENOENT : 0); +} + +/* + * Given a shorthand device name look for a match against 'cmp_name'. This + * is done by checking all prefix expansions using either the default + * 'zpool_default_import_paths' or the ZPOOL_IMPORT_PATH environment + * variable. Proper partition suffixes will be appended if this is a + * whole disk. When a match is found 0 is returned otherwise ENOENT. + */ +static int +zfs_strcmp_shortname(const char *name, const char *cmp_name, int wholedisk) +{ + int path_len, cmp_len, i = 0, error = ENOENT; + char *dir, *env, *envdup = NULL; + char path_name[MAXPATHLEN]; + const char * const *zpool_default_import_path; + size_t count; + + zpool_default_import_path = zpool_default_search_paths(&count); + + cmp_len = strlen(cmp_name); + env = getenv("ZPOOL_IMPORT_PATH"); + + if (env) { + envdup = strdup(env); + dir = strtok(envdup, ":"); + } else { + dir = (char *)zpool_default_import_path[i]; + } + + while (dir) { + /* Trim trailing directory slashes from ZPOOL_IMPORT_PATH */ + if (env) { + while (dir[strlen(dir)-1] == '/') + dir[strlen(dir)-1] = '\0'; + } + + path_len = snprintf(path_name, MAXPATHLEN, "%s/%s", dir, name); + if (wholedisk) + path_len = zfs_append_partition(path_name, MAXPATHLEN); + + if ((path_len == cmp_len) && strcmp(path_name, cmp_name) == 0) { + error = 0; + break; + } + + if (env) { + dir = strtok(NULL, ":"); + } else if (++i < count) { + dir = (char *)zpool_default_import_path[i]; + } else { + dir = NULL; + } + } + + if (env) + free(envdup); + + return (error); +} + +/* + * Given either a shorthand or fully qualified path name look for a match + * against 'cmp'. 
The passed name will be expanded as needed for comparison + * purposes and redundant slashes stripped to ensure an accurate match. + */ +int +zfs_strcmp_pathname(const char *name, const char *cmp, int wholedisk) +{ + int path_len, cmp_len; + char path_name[MAXPATHLEN]; + char cmp_name[MAXPATHLEN]; + char *dir, *dup; + + /* Strip redundant slashes if one exists due to ZPOOL_IMPORT_PATH */ + memset(cmp_name, 0, MAXPATHLEN); + dup = strdup(cmp); + dir = strtok(dup, "/"); + while (dir) { + strlcat(cmp_name, "/", sizeof (cmp_name)); + strlcat(cmp_name, dir, sizeof (cmp_name)); + dir = strtok(NULL, "/"); + } + free(dup); + + if (name[0] != '/') + return (zfs_strcmp_shortname(name, cmp_name, wholedisk)); + + (void) strlcpy(path_name, name, MAXPATHLEN); + path_len = strlen(path_name); + cmp_len = strlen(cmp_name); + + if (wholedisk) { + path_len = zfs_append_partition(path_name, MAXPATHLEN); + if (path_len == -1) + return (ENOMEM); + } + + if ((path_len != cmp_len) || strcmp(path_name, cmp_name)) + return (ENOENT); + + return (0); +} diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c new file mode 100644 index 000000000000..b8cdc118b263 --- /dev/null +++ b/lib/libzutil/zutil_import.c @@ -0,0 +1,1587 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. + */ + +/* + * Pool import support functions. + * + * Used by zpool, ztest, zdb, and zhack to locate importable configs. Since + * these commands are expected to run in the global zone, we can assume + * that the devices are all readable when called. + * + * To import a pool, we rely on reading the configuration information from the + * ZFS label of each device. If we successfully read the label, then we + * organize the configuration information in the following hierarchy: + * + * pool guid -> toplevel vdev guid -> label txg + * + * Duplicate entries matching this same tuple will be discarded. Once we have + * examined every device, we pick the best label txg config for each toplevel + * vdev. We then arrange these toplevel vdevs into a complete pool config, and + * update any paths that have changed. Finally, we attempt to import the pool + * using our derived config, and record the results. 
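+ *
+ * For example (the GUIDs and txgs here are purely illustrative), two
+ * labels read for the same toplevel vdev might be grouped as:
+ *
+ *	pool guid 0xC0FFEE -> top vdev guid 0xBEEF -> { txg 100, txg 104 }
+ *
+ * in which case only the txg 104 config would be kept for that toplevel
+ * vdev when the final pool config is assembled.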
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zutil_import.h" + +/*PRINTFLIKE2*/ +static void +zutil_error_aux(libpc_handle_t *hdl, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + (void) vsnprintf(hdl->lpc_desc, sizeof (hdl->lpc_desc), fmt, ap); + hdl->lpc_desc_active = B_TRUE; + + va_end(ap); +} + +static void +zutil_verror(libpc_handle_t *hdl, const char *error, const char *fmt, + va_list ap) +{ + char action[1024]; + + (void) vsnprintf(action, sizeof (action), fmt, ap); + + if (hdl->lpc_desc_active) + hdl->lpc_desc_active = B_FALSE; + else + hdl->lpc_desc[0] = '\0'; + + if (hdl->lpc_printerr) { + if (hdl->lpc_desc[0] != '\0') + error = hdl->lpc_desc; + + (void) fprintf(stderr, "%s: %s\n", action, error); + } +} + +/*PRINTFLIKE3*/ +static int +zutil_error_fmt(libpc_handle_t *hdl, const char *error, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + + zutil_verror(hdl, error, fmt, ap); + + va_end(ap); + + return (-1); +} + +static int +zutil_error(libpc_handle_t *hdl, const char *error, const char *msg) +{ + return (zutil_error_fmt(hdl, error, "%s", msg)); +} + +static int +zutil_no_memory(libpc_handle_t *hdl) +{ + zutil_error(hdl, EZFS_NOMEM, "internal error"); + exit(1); +} + +void * +zutil_alloc(libpc_handle_t *hdl, size_t size) +{ + void *data; + + if ((data = calloc(1, size)) == NULL) + (void) zutil_no_memory(hdl); + + return (data); +} + +char * +zutil_strdup(libpc_handle_t *hdl, const char *str) +{ + char *ret; + + if ((ret = strdup(str)) == NULL) + (void) zutil_no_memory(hdl); + + return (ret); +} + +/* + * Intermediate structures used to gather configuration information. + */ +typedef struct config_entry { + uint64_t ce_txg; + nvlist_t *ce_config; + struct config_entry *ce_next; +} config_entry_t; + +typedef struct vdev_entry { + uint64_t ve_guid; + config_entry_t *ve_configs; + struct vdev_entry *ve_next; +} vdev_entry_t; + +typedef struct pool_entry { + uint64_t pe_guid; + vdev_entry_t *pe_vdevs; + struct pool_entry *pe_next; +} pool_entry_t; + +typedef struct name_entry { + char *ne_name; + uint64_t ne_guid; + uint64_t ne_order; + uint64_t ne_num_labels; + struct name_entry *ne_next; +} name_entry_t; + +typedef struct pool_list { + pool_entry_t *pools; + name_entry_t *names; +} pool_list_t; + +/* + * Go through and fix up any path and/or devid information for the given vdev + * configuration. + */ +static int +fix_paths(libpc_handle_t *hdl, nvlist_t *nv, name_entry_t *names) +{ + nvlist_t **child; + uint_t c, children; + uint64_t guid; + name_entry_t *ne, *best; + char *path; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (fix_paths(hdl, child[c], names) != 0) + return (-1); + return (0); + } + + /* + * This is a leaf (file or disk) vdev. In either case, go through + * the name list and see if we find a matching guid. If so, replace + * the path and see if we can calculate a new devid. + * + * There may be multiple names associated with a particular guid, in + * which case we have overlapping partitions or multiple paths to the + * same disk. In this case we prefer to use the path name which + * matches the ZPOOL_CONFIG_PATH. If no matching entry is found we + * use the lowest order device which corresponds to the first match + * while traversing the ZPOOL_IMPORT_PATH search path. 
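+ *
+ * As an illustration (the device names are hypothetical), if the label's
+ * ZPOOL_CONFIG_PATH is '/dev/sdb1' and the name list for this guid holds
+ * both '/dev/sdb1' and '/dev/mapper/mpatha', the exact match '/dev/sdb1'
+ * wins; absent an exact match, the entry with more valid labels is
+ * preferred, and ties are broken by the lower search-path order.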
+ */
+	verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0);
+	if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0)
+		path = NULL;
+
+	best = NULL;
+	for (ne = names; ne != NULL; ne = ne->ne_next) {
+		if (ne->ne_guid == guid) {
+			if (path == NULL) {
+				best = ne;
+				break;
+			}
+
+			if ((strlen(path) == strlen(ne->ne_name)) &&
+			    strncmp(path, ne->ne_name, strlen(path)) == 0) {
+				best = ne;
+				break;
+			}
+
+			if (best == NULL) {
+				best = ne;
+				continue;
+			}
+
+			/* Prefer paths with more vdev labels. */
+			if (ne->ne_num_labels > best->ne_num_labels) {
+				best = ne;
+				continue;
+			}
+
+			/* Prefer paths earlier in the search order. */
+			if (ne->ne_num_labels == best->ne_num_labels &&
+			    ne->ne_order < best->ne_order) {
+				best = ne;
+				continue;
+			}
+		}
+	}
+
+	if (best == NULL)
+		return (0);
+
+	if (nvlist_add_string(nv, ZPOOL_CONFIG_PATH, best->ne_name) != 0)
+		return (-1);
+
+	update_vdev_config_dev_strs(nv);
+
+	return (0);
+}
+
+/*
+ * Add the given configuration to the list of known devices.
+ */
+static int
+add_config(libpc_handle_t *hdl, pool_list_t *pl, const char *path,
+    int order, int num_labels, nvlist_t *config)
+{
+	uint64_t pool_guid, vdev_guid, top_guid, txg, state;
+	pool_entry_t *pe;
+	vdev_entry_t *ve;
+	config_entry_t *ce;
+	name_entry_t *ne;
+
+	/*
+	 * If this is a hot spare not currently in use or level 2 cache
+	 * device, add it to the list of names to translate, but don't do
+	 * anything else.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE,
+	    &state) == 0 &&
+	    (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE) &&
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid) == 0) {
+		if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL)
+			return (-1);
+
+		if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) {
+			free(ne);
+			return (-1);
+		}
+		ne->ne_guid = vdev_guid;
+		ne->ne_order = order;
+		ne->ne_num_labels = num_labels;
+		ne->ne_next = pl->names;
+		pl->names = ne;
+
+		return (0);
+	}
+
+	/*
+	 * If we have a valid config but cannot read any of these fields, then
+	 * it means we have a half-initialized label.  In vdev_label_init()
+	 * we write a label with txg == 0 so that we can identify the device
+	 * in case the user refers to the same disk later on.  If we fail to
+	 * create the pool, we'll be left with a label in this state
+	 * which should not be considered part of a valid pool.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID,
+	    &pool_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID,
+	    &vdev_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID,
+	    &top_guid) != 0 ||
+	    nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG,
+	    &txg) != 0 || txg == 0) {
+		return (0);
+	}
+
+	/*
+	 * First, see if we know about this pool.  If not, then add it to the
+	 * list of known pools.
+	 */
+	for (pe = pl->pools; pe != NULL; pe = pe->pe_next) {
+		if (pe->pe_guid == pool_guid)
+			break;
+	}
+
+	if (pe == NULL) {
+		if ((pe = zutil_alloc(hdl, sizeof (pool_entry_t))) == NULL) {
+			return (-1);
+		}
+		pe->pe_guid = pool_guid;
+		pe->pe_next = pl->pools;
+		pl->pools = pe;
+	}
+
+	/*
+	 * Second, see if we know about this toplevel vdev.  Add it if it's
+	 * missing.
+ */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + if (ve->ve_guid == top_guid) + break; + } + + if (ve == NULL) { + if ((ve = zutil_alloc(hdl, sizeof (vdev_entry_t))) == NULL) { + return (-1); + } + ve->ve_guid = top_guid; + ve->ve_next = pe->pe_vdevs; + pe->pe_vdevs = ve; + } + + /* + * Third, see if we have a config with a matching transaction group. If + * so, then we do nothing. Otherwise, add it to the list of known + * configs. + */ + for (ce = ve->ve_configs; ce != NULL; ce = ce->ce_next) { + if (ce->ce_txg == txg) + break; + } + + if (ce == NULL) { + if ((ce = zutil_alloc(hdl, sizeof (config_entry_t))) == NULL) { + return (-1); + } + ce->ce_txg = txg; + ce->ce_config = fnvlist_dup(config); + ce->ce_next = ve->ve_configs; + ve->ve_configs = ce; + } + + /* + * At this point we've successfully added our config to the list of + * known configs. The last thing to do is add the vdev guid -> path + * mappings so that we can fix up the configuration as necessary before + * doing the import. + */ + if ((ne = zutil_alloc(hdl, sizeof (name_entry_t))) == NULL) + return (-1); + + if ((ne->ne_name = zutil_strdup(hdl, path)) == NULL) { + free(ne); + return (-1); + } + + ne->ne_guid = vdev_guid; + ne->ne_order = order; + ne->ne_num_labels = num_labels; + ne->ne_next = pl->names; + pl->names = ne; + + return (0); +} + +static int +zutil_pool_active(libpc_handle_t *hdl, const char *name, uint64_t guid, + boolean_t *isactive) +{ + ASSERT(hdl->lpc_ops->pco_pool_active != NULL); + + int error = hdl->lpc_ops->pco_pool_active(hdl->lpc_lib_handle, name, + guid, isactive); + + return (error); +} + +static nvlist_t * +zutil_refresh_config(libpc_handle_t *hdl, nvlist_t *tryconfig) +{ + ASSERT(hdl->lpc_ops->pco_refresh_config != NULL); + + return (hdl->lpc_ops->pco_refresh_config(hdl->lpc_lib_handle, + tryconfig)); +} + +/* + * Determine if the vdev id is a hole in the namespace. + */ +static boolean_t +vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) +{ + int c; + + for (c = 0; c < holes; c++) { + + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Convert our list of pools into the definitive set of configurations. We + * start by picking the best config for each toplevel vdev. Once that's done, + * we assemble the toplevel vdevs into a full config for the pool. We make a + * pass to fix up any incorrect paths, and then add it to the main list to + * return to the user. + */ +static nvlist_t * +get_configs(libpc_handle_t *hdl, pool_list_t *pl, boolean_t active_ok, + nvlist_t *policy) +{ + pool_entry_t *pe; + vdev_entry_t *ve; + config_entry_t *ce; + nvlist_t *ret = NULL, *config = NULL, *tmp = NULL, *nvtop, *nvroot; + nvlist_t **spares, **l2cache; + uint_t i, nspares, nl2cache; + boolean_t config_seen; + uint64_t best_txg; + char *name, *hostname = NULL; + uint64_t guid; + uint_t children = 0; + nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; + uint_t c; + boolean_t isactive; + uint64_t hostid; + nvlist_t *nvl; + boolean_t valid_top_config = B_FALSE; + + if (nvlist_alloc(&ret, 0, 0) != 0) + goto nomem; + + for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { + uint64_t id, max_txg = 0; + + if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + config_seen = B_FALSE; + + /* + * Iterate over all toplevel vdevs. Grab the pool configuration + * from the first one we find, and then go through the rest and + * add them as necessary to the 'vdevs' member of the config. 
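+ *
+ * For instance (the txg numbers are illustrative), if a toplevel vdev has
+ * cached configs from txgs 90, 95, and 97, the txg 97 config is taken as
+ * definitive for that vdev before the full pool config is assembled.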
+ */ + for (ve = pe->pe_vdevs; ve != NULL; ve = ve->ve_next) { + + /* + * Determine the best configuration for this vdev by + * selecting the config with the latest transaction + * group. + */ + best_txg = 0; + for (ce = ve->ve_configs; ce != NULL; + ce = ce->ce_next) { + + if (ce->ce_txg > best_txg) { + tmp = ce->ce_config; + best_txg = ce->ce_txg; + } + } + + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. + */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + + if (!config_seen) { + /* + * Copy the relevant pieces of data to the pool + * configuration: + * + * version + * pool guid + * name + * comment (if available) + * pool state + * hostid (if available) + * hostname (if available) + */ + uint64_t state, version; + char *comment = NULL; + + version = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VERSION); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_VERSION, version); + guid = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_GUID); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_GUID, guid); + name = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_POOL_NAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_POOL_NAME, name); + + if (nvlist_lookup_string(tmp, + ZPOOL_CONFIG_COMMENT, &comment) == 0) + fnvlist_add_string(config, + ZPOOL_CONFIG_COMMENT, comment); + + state = fnvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_POOL_STATE); + fnvlist_add_uint64(config, + ZPOOL_CONFIG_POOL_STATE, state); + + hostid = 0; + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_HOSTID, &hostid) == 0) { + fnvlist_add_uint64(config, + ZPOOL_CONFIG_HOSTID, hostid); + hostname = fnvlist_lookup_string(tmp, + ZPOOL_CONFIG_HOSTNAME); + fnvlist_add_string(config, + ZPOOL_CONFIG_HOSTNAME, hostname); + } + + config_seen = B_TRUE; + } + + /* + * Add this top-level vdev to the child array. + */ + verify(nvlist_lookup_nvlist(tmp, + ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); + verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, + &id) == 0); + + if (id >= children) { + nvlist_t **newchild; + + newchild = zutil_alloc(hdl, (id + 1) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = id + 1; + } + if (nvlist_dup(nvtop, &child[id], 0) != 0) + goto nomem; + + } + + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. 
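+ *
+ * For example (the counts are illustrative), if the best config reports
+ * ZPOOL_CONFIG_VDEV_CHILDREN == 4 but only IDs 0-2 were discovered, a
+ * fourth slot is appended and later populated with a "hole" or "missing"
+ * placeholder vdev; conversely, entries at or beyond the reported count
+ * are discarded as stale.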
+ */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = zutil_alloc(hdl, (max_id) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = max_id; + } + } + + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !vdev_is_hole(hole_array, holes, c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. + */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(holey); + goto nomem; + } + child[c] = holey; + } + } + + /* + * Look for any missing top-level vdevs. If this is the case, + * create a faked up 'missing' vdev as a placeholder. We cannot + * simply compress the child array, because the kernel performs + * certain checks to make sure the vdev IDs match their location + * in the configuration. + */ + for (c = 0; c < children; c++) { + if (child[c] == NULL) { + nvlist_t *missing; + if (nvlist_alloc(&missing, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + if (nvlist_add_string(missing, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(missing, + ZPOOL_CONFIG_GUID, 0ULL) != 0) { + nvlist_free(missing); + goto nomem; + } + child[c] = missing; + } + } + + /* + * Put all of this pool's top-level vdevs into a root vdev. + */ + if (nvlist_alloc(&nvroot, NV_UNIQUE_NAME, 0) != 0) + goto nomem; + if (nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) != 0 || + nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, guid) != 0 || + nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + child, children) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + children = 0; + child = NULL; + + /* + * Go through and fix up any paths and/or devids based on our + * known list of vdev GUID -> path mappings. + */ + if (fix_paths(hdl, nvroot, pl->names) != 0) { + nvlist_free(nvroot); + goto nomem; + } + + /* + * Add the root vdev to this pool's configuration. + */ + if (nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + nvroot) != 0) { + nvlist_free(nvroot); + goto nomem; + } + nvlist_free(nvroot); + + /* + * zdb uses this path to report on active pools that were + * imported or created using -R. + */ + if (active_ok) + goto add_pool; + + /* + * Determine if this pool is currently active, in which case we + * can't actually import it. 
+ */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &guid) == 0); + + if (zutil_pool_active(hdl, name, guid, &isactive) != 0) + goto error; + + if (isactive) { + nvlist_free(config); + config = NULL; + continue; + } + + if (policy != NULL) { + if (nvlist_add_nvlist(config, ZPOOL_LOAD_POLICY, + policy) != 0) + goto nomem; + } + + if ((nvl = zutil_refresh_config(hdl, config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } + + nvlist_free(config); + config = nvl; + + /* + * Go through and update the paths for spares, now that we have + * them. + */ + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + for (i = 0; i < nspares; i++) { + if (fix_paths(hdl, spares[i], pl->names) != 0) + goto nomem; + } + } + + /* + * Update the paths for l2cache devices. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + for (i = 0; i < nl2cache; i++) { + if (fix_paths(hdl, l2cache[i], pl->names) != 0) + goto nomem; + } + } + + /* + * Restore the original information read from the actual label. + */ + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTID, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, ZPOOL_CONFIG_HOSTNAME, + DATA_TYPE_STRING); + if (hostid != 0) { + verify(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, + hostid) == 0); + verify(nvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, + hostname) == 0); + } + +add_pool: + /* + * Add this pool to the list of configs. + */ + verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &name) == 0); + + if (nvlist_add_nvlist(ret, name, config) != 0) + goto nomem; + + nvlist_free(config); + config = NULL; + } + + return (ret); + +nomem: + (void) zutil_no_memory(hdl); +error: + nvlist_free(config); + nvlist_free(ret); + for (c = 0; c < children; c++) + nvlist_free(child[c]); + free(child); + + return (NULL); +} + +/* + * Return the offset of the given label. + */ +static uint64_t +label_offset(uint64_t size, int l) +{ + ASSERT(P2PHASE_TYPED(size, sizeof (vdev_label_t), uint64_t) == 0); + return (l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ? + 0 : size - VDEV_LABELS * sizeof (vdev_label_t))); +} + +/* + * Given a file descriptor, read the label information and return an nvlist + * describing the configuration, if there is one. The number of valid + * labels found will be returned in num_labels when non-NULL. 
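+ *
+ * A minimal usage sketch (error handling elided, device name hypothetical):
+ *
+ *	nvlist_t *config;
+ *	int num_labels;
+ *	int fd = open("/dev/sda1", O_RDONLY);
+ *	if (fd >= 0 && zpool_read_label(fd, &config, &num_labels) == 0 &&
+ *	    config != NULL) {
+ *		... use config, then nvlist_free(config) ...
+ *	}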
+ */ +int +zpool_read_label(int fd, nvlist_t **config, int *num_labels) +{ + struct stat64 statbuf; + int l, count = 0; + vdev_label_t *label; + nvlist_t *expected_config = NULL; + uint64_t expected_guid = 0, size; + int error; + + *config = NULL; + + if (fstat64_blk(fd, &statbuf) == -1) + return (0); + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); + + error = posix_memalign((void **)&label, PAGESIZE, sizeof (*label)); + if (error) + return (-1); + + for (l = 0; l < VDEV_LABELS; l++) { + uint64_t state, guid, txg; + + if (pread64(fd, label, sizeof (vdev_label_t), + label_offset(size, l)) != sizeof (vdev_label_t)) + continue; + + if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, + sizeof (label->vl_vdev_phys.vp_nvlist), config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_GUID, + &guid) != 0 || guid == 0) { + nvlist_free(*config); + continue; + } + + if (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(*config); + continue; + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(*config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(*config); + continue; + } + + if (expected_guid) { + if (expected_guid == guid) + count++; + + nvlist_free(*config); + } else { + expected_config = *config; + expected_guid = guid; + count++; + } + } + + if (num_labels != NULL) + *num_labels = count; + + free(label); + *config = expected_config; + + return (0); +} + +/* + * Sorted by full path and then vdev guid to allow for multiple entries with + * the same full path name. This is required because it's possible to + * have multiple block devices with labels that refer to the same + * ZPOOL_CONFIG_PATH yet have different vdev guids. In this case both + * entries need to be added to the cache. Scenarios where this can occur + * include overwritten pool labels, devices which are visible from multiple + * hosts and multipath devices. + */ +int +slice_cache_compare(const void *arg1, const void *arg2) +{ + const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; + const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; + uint64_t guid1 = ((rdsk_node_t *)arg1)->rn_vdev_guid; + uint64_t guid2 = ((rdsk_node_t *)arg2)->rn_vdev_guid; + int rv; + + rv = TREE_ISIGN(strcmp(nm1, nm2)); + if (rv) + return (rv); + + return (TREE_CMP(guid1, guid2)); +} + +static int +label_paths_impl(libpc_handle_t *hdl, nvlist_t *nvroot, uint64_t pool_guid, + uint64_t vdev_guid, char **path, char **devid) +{ + nvlist_t **child; + uint_t c, children; + uint64_t guid; + char *val; + int error; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) { + error = label_paths_impl(hdl, child[c], + pool_guid, vdev_guid, path, devid); + if (error) + return (error); + } + return (0); + } + + if (nvroot == NULL) + return (0); + + error = nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_GUID, &guid); + if ((error != 0) || (guid != vdev_guid)) + return (0); + + error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_PATH, &val); + if (error == 0) + *path = val; + + error = nvlist_lookup_string(nvroot, ZPOOL_CONFIG_DEVID, &val); + if (error == 0) + *devid = val; + + return (0); +} + +/* + * Given a disk label fetch the ZPOOL_CONFIG_PATH and ZPOOL_CONFIG_DEVID + * and store these strings as config_path and devid_path respectively. + * The returned pointers are only valid as long as label remains valid. 
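+ *
+ * In other words, no new memory is allocated here; the caller must copy
+ * the strings (e.g. with strdup()) if they are needed after the label
+ * nvlist is freed.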
+ */ +int +label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, char **devid) +{ + nvlist_t *nvroot; + uint64_t pool_guid; + uint64_t vdev_guid; + + *path = NULL; + *devid = NULL; + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid)) + return (ENOENT); + + return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path, + devid)); +} + +static void +zpool_find_import_scan_add_slice(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t *cache, const char *path, const char *name, int order) +{ + avl_index_t where; + rdsk_node_t *slice; + + slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); + if (asprintf(&slice->rn_name, "%s/%s", path, name) == -1) { + free(slice); + return; + } + slice->rn_vdev_guid = 0; + slice->rn_lock = lock; + slice->rn_avl = cache; + slice->rn_hdl = hdl; + slice->rn_order = order + IMPORT_ORDER_SCAN_OFFSET; + slice->rn_labelpaths = B_FALSE; + + pthread_mutex_lock(lock); + if (avl_find(cache, slice, &where)) { + free(slice->rn_name); + free(slice); + } else { + avl_insert(cache, slice, where); + } + pthread_mutex_unlock(lock); +} + +static int +zpool_find_import_scan_dir(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t *cache, const char *dir, int order) +{ + int error; + char path[MAXPATHLEN]; + struct dirent64 *dp; + DIR *dirp; + + if (realpath(dir, path) == NULL) { + error = errno; + if (error == ENOENT) + return (0); + + zutil_error_aux(hdl, strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( + TEXT_DOMAIN, "cannot resolve path '%s'"), dir); + return (error); + } + + dirp = opendir(path); + if (dirp == NULL) { + error = errno; + zutil_error_aux(hdl, strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, + dgettext(TEXT_DOMAIN, "cannot open '%s'"), path); + return (error); + } + + while ((dp = readdir64(dirp)) != NULL) { + const char *name = dp->d_name; + if (name[0] == '.' && + (name[1] == 0 || (name[1] == '.' && name[2] == 0))) + continue; + + zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, + order); + } + + (void) closedir(dirp); + return (0); +} + +static int +zpool_find_import_scan_path(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t *cache, const char *dir, int order) +{ + int error = 0; + char path[MAXPATHLEN]; + char *d, *b; + char *dpath, *name; + + /* + * Separate the directory part and last part of the + * path. We do this so that we can get the realpath of + * the directory. We don't get the realpath on the + * whole path because if it's a symlink, we want the + * path of the symlink not where it points to. + */ + d = zutil_strdup(hdl, dir); + b = zutil_strdup(hdl, dir); + dpath = dirname(d); + name = basename(b); + + if (realpath(dpath, path) == NULL) { + error = errno; + if (error == ENOENT) { + error = 0; + goto out; + } + + zutil_error_aux(hdl, strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( + TEXT_DOMAIN, "cannot resolve path '%s'"), dir); + goto out; + } + + zpool_find_import_scan_add_slice(hdl, lock, cache, path, name, order); + +out: + free(b); + free(d); + return (error); +} + +/* + * Scan a list of directories for zfs devices. 
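+ *
+ * Entries found in dir[i] are recorded with order i +
+ * IMPORT_ORDER_SCAN_OFFSET, so devices from directories listed earlier
+ * are preferred when fix_paths() must choose among duplicate names.
+ */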
+ */ +static int +zpool_find_import_scan(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache, const char * const *dir, size_t dirs) +{ + avl_tree_t *cache; + rdsk_node_t *slice; + void *cookie; + int i, error; + + *slice_cache = NULL; + cache = zutil_alloc(hdl, sizeof (avl_tree_t)); + avl_create(cache, slice_cache_compare, sizeof (rdsk_node_t), + offsetof(rdsk_node_t, rn_node)); + + for (i = 0; i < dirs; i++) { + struct stat sbuf; + + if (stat(dir[i], &sbuf) != 0) { + error = errno; + if (error == ENOENT) + continue; + + zutil_error_aux(hdl, strerror(error)); + (void) zutil_error_fmt(hdl, EZFS_BADPATH, dgettext( + TEXT_DOMAIN, "cannot resolve path '%s'"), dir[i]); + goto error; + } + + /* + * If dir[i] is a directory, we walk through it and add all + * the entries to the cache. If it's not a directory, we just + * add it to the cache. + */ + if (S_ISDIR(sbuf.st_mode)) { + if ((error = zpool_find_import_scan_dir(hdl, lock, + cache, dir[i], i)) != 0) + goto error; + } else { + if ((error = zpool_find_import_scan_path(hdl, lock, + cache, dir[i], i)) != 0) + goto error; + } + } + + *slice_cache = cache; + return (0); + +error: + cookie = NULL; + while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { + free(slice->rn_name); + free(slice); + } + free(cache); + + return (error); +} + +/* + * Given a list of directories to search, find all pools stored on disk. This + * includes partial pools which are not available to import. If no args are + * given (argc is 0), then the default directory (/dev/dsk) is searched. + * poolname or guid (but not both) are provided by the caller when trying + * to import a specific pool. + */ +static nvlist_t * +zpool_find_import_impl(libpc_handle_t *hdl, importargs_t *iarg) +{ + nvlist_t *ret = NULL; + pool_list_t pools = { 0 }; + pool_entry_t *pe, *penext; + vdev_entry_t *ve, *venext; + config_entry_t *ce, *cenext; + name_entry_t *ne, *nenext; + pthread_mutex_t lock; + avl_tree_t *cache; + rdsk_node_t *slice; + void *cookie; + tpool_t *t; + + verify(iarg->poolname == NULL || iarg->guid == 0); + pthread_mutex_init(&lock, NULL); + + /* + * Locate pool member vdevs by blkid or by directory scanning. + * On success a newly allocated AVL tree which is populated with an + * entry for each discovered vdev will be returned in the cache. + * It's the caller's responsibility to consume and destroy this tree. + */ + if (iarg->scan || iarg->paths != 0) { + size_t dirs = iarg->paths; + const char * const *dir = (const char * const *)iarg->path; + + if (dirs == 0) + dir = zpool_default_search_paths(&dirs); + + if (zpool_find_import_scan(hdl, &lock, &cache, dir, dirs) != 0) + return (NULL); + } else { + if (zpool_find_import_blkid(hdl, &lock, &cache) != 0) + return (NULL); + } + + /* + * Create a thread pool to parallelize the process of reading and + * validating labels, a large number of threads can be used due to + * minimal contention. + */ + t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), 0, NULL); + for (slice = avl_first(cache); slice; + (slice = avl_walk(cache, slice, AVL_AFTER))) + (void) tpool_dispatch(t, zpool_open_func, slice); + + tpool_wait(t); + tpool_destroy(t); + + /* + * Process the cache, filtering out any entries which are not + * for the specified pool then adding matching label configs. 
+ */ + cookie = NULL; + while ((slice = avl_destroy_nodes(cache, &cookie)) != NULL) { + if (slice->rn_config != NULL) { + nvlist_t *config = slice->rn_config; + boolean_t matched = B_TRUE; + boolean_t aux = B_FALSE; + int fd; + + /* + * Check if it's a spare or l2cache device. If it is, + * we need to skip the name and guid check since they + * don't exist on aux device label. + */ + if (iarg->poolname != NULL || iarg->guid != 0) { + uint64_t state; + aux = nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_STATE, &state) == 0 && + (state == POOL_STATE_SPARE || + state == POOL_STATE_L2CACHE); + } + + if (iarg->poolname != NULL && !aux) { + char *pname; + + matched = nvlist_lookup_string(config, + ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && + strcmp(iarg->poolname, pname) == 0; + } else if (iarg->guid != 0 && !aux) { + uint64_t this_guid; + + matched = nvlist_lookup_uint64(config, + ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 && + iarg->guid == this_guid; + } + if (matched) { + /* + * Verify all remaining entries can be opened + * exclusively. This will prune all underlying + * multipath devices which otherwise could + * result in the vdev appearing as UNAVAIL. + * + * Under zdb, this step isn't required and + * would prevent a zdb -e of active pools with + * no cachefile. + */ + fd = open(slice->rn_name, O_RDONLY | O_EXCL); + if (fd >= 0 || iarg->can_be_active) { + if (fd >= 0) + close(fd); + add_config(hdl, &pools, + slice->rn_name, slice->rn_order, + slice->rn_num_labels, config); + } + } + nvlist_free(config); + } + free(slice->rn_name); + free(slice); + } + avl_destroy(cache); + free(cache); + pthread_mutex_destroy(&lock); + + ret = get_configs(hdl, &pools, iarg->can_be_active, iarg->policy); + + for (pe = pools.pools; pe != NULL; pe = penext) { + penext = pe->pe_next; + for (ve = pe->pe_vdevs; ve != NULL; ve = venext) { + venext = ve->ve_next; + for (ce = ve->ve_configs; ce != NULL; ce = cenext) { + cenext = ce->ce_next; + nvlist_free(ce->ce_config); + free(ce); + } + free(ve); + } + free(pe); + } + + for (ne = pools.names; ne != NULL; ne = nenext) { + nenext = ne->ne_next; + free(ne->ne_name); + free(ne); + } + + return (ret); +} + +/* + * Given a cache file, return the contents as a list of importable pools. + * poolname or guid (but not both) are provided by the caller when trying + * to import a specific pool. 
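+ *
+ * This is the path taken by, e.g., "zpool import -c /etc/zfs/zpool.cache",
+ * where the packed nvlist of known configs is read and each entry is
+ * refreshed against the current device state rather than rediscovered by
+ * scanning.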
+ */ +static nvlist_t * +zpool_find_import_cached(libpc_handle_t *hdl, const char *cachefile, + const char *poolname, uint64_t guid) +{ + char *buf; + int fd; + struct stat64 statbuf; + nvlist_t *raw, *src, *dst; + nvlist_t *pools; + nvpair_t *elem; + char *name; + uint64_t this_guid; + boolean_t active; + + verify(poolname == NULL || guid == 0); + + if ((fd = open(cachefile, O_RDONLY)) < 0) { + zutil_error_aux(hdl, "%s", strerror(errno)); + (void) zutil_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "failed to open cache file")); + return (NULL); + } + + if (fstat64(fd, &statbuf) != 0) { + zutil_error_aux(hdl, "%s", strerror(errno)); + (void) close(fd); + (void) zutil_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, "failed to get size of cache file")); + return (NULL); + } + + if ((buf = zutil_alloc(hdl, statbuf.st_size)) == NULL) { + (void) close(fd); + return (NULL); + } + + if (read(fd, buf, statbuf.st_size) != statbuf.st_size) { + (void) close(fd); + free(buf); + (void) zutil_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, + "failed to read cache file contents")); + return (NULL); + } + + (void) close(fd); + + if (nvlist_unpack(buf, statbuf.st_size, &raw, 0) != 0) { + free(buf); + (void) zutil_error(hdl, EZFS_BADCACHE, + dgettext(TEXT_DOMAIN, + "invalid or corrupt cache file contents")); + return (NULL); + } + + free(buf); + + /* + * Go through and get the current state of the pools and refresh their + * state. + */ + if (nvlist_alloc(&pools, 0, 0) != 0) { + (void) zutil_no_memory(hdl); + nvlist_free(raw); + return (NULL); + } + + elem = NULL; + while ((elem = nvlist_next_nvpair(raw, elem)) != NULL) { + src = fnvpair_value_nvlist(elem); + + name = fnvlist_lookup_string(src, ZPOOL_CONFIG_POOL_NAME); + if (poolname != NULL && strcmp(poolname, name) != 0) + continue; + + this_guid = fnvlist_lookup_uint64(src, ZPOOL_CONFIG_POOL_GUID); + if (guid != 0 && guid != this_guid) + continue; + + if (zutil_pool_active(hdl, name, this_guid, &active) != 0) { + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + + if (active) + continue; + + if (nvlist_add_string(src, ZPOOL_CONFIG_CACHEFILE, + cachefile) != 0) { + (void) zutil_no_memory(hdl); + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + + if ((dst = zutil_refresh_config(hdl, src)) == NULL) { + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + + if (nvlist_add_nvlist(pools, nvpair_name(elem), dst) != 0) { + (void) zutil_no_memory(hdl); + nvlist_free(dst); + nvlist_free(raw); + nvlist_free(pools); + return (NULL); + } + nvlist_free(dst); + } + + nvlist_free(raw); + return (pools); +} + +nvlist_t * +zpool_search_import(void *hdl, importargs_t *import, + const pool_config_ops_t *pco) +{ + libpc_handle_t handle = { 0 }; + nvlist_t *pools = NULL; + + handle.lpc_lib_handle = hdl; + handle.lpc_ops = pco; + handle.lpc_printerr = B_TRUE; + + verify(import->poolname == NULL || import->guid == 0); + + if (import->cachefile != NULL) + pools = zpool_find_import_cached(&handle, import->cachefile, + import->poolname, import->guid); + else + pools = zpool_find_import_impl(&handle, import); + + if ((pools == NULL || nvlist_empty(pools)) && + handle.lpc_open_access_error && geteuid() != 0) { + (void) zutil_error(&handle, EZFS_EACESS, dgettext(TEXT_DOMAIN, + "no pools found")); + } + + return (pools); +} + +static boolean_t +pool_match(nvlist_t *cfg, char *tgt) +{ + uint64_t v, guid = strtoull(tgt, NULL, 0); + char *s; + + if (guid != 0) { + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) + return (v == 
guid); + } else { + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) + return (strcmp(s, tgt) == 0); + } + return (B_FALSE); +} + +int +zpool_find_config(void *hdl, const char *target, nvlist_t **configp, + importargs_t *args, const pool_config_ops_t *pco) +{ + nvlist_t *pools; + nvlist_t *match = NULL; + nvlist_t *config = NULL; + char *name = NULL, *sepp = NULL; + char sep = '\0'; + int count = 0; + char *targetdup = strdup(target); + + *configp = NULL; + + if ((sepp = strpbrk(targetdup, "/@")) != NULL) { + sep = *sepp; + *sepp = '\0'; + } + + pools = zpool_search_import(hdl, args, pco); + + if (pools != NULL) { + nvpair_t *elem = NULL; + while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { + VERIFY0(nvpair_value_nvlist(elem, &config)); + if (pool_match(config, targetdup)) { + count++; + if (match != NULL) { + /* multiple matches found */ + continue; + } else { + match = config; + name = nvpair_name(elem); + } + } + } + } + + if (count == 0) { + free(targetdup); + return (ENOENT); + } + + if (count > 1) { + free(targetdup); + return (EINVAL); + } + + *configp = match; + free(targetdup); + + return (0); +} diff --git a/lib/libzutil/zutil_import.h b/lib/libzutil/zutil_import.h new file mode 100644 index 000000000000..0108eb45c5cf --- /dev/null +++ b/lib/libzutil/zutil_import.h @@ -0,0 +1,76 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright 2015 RackTop Systems. + * Copyright (c) 2016, Intel Corporation. 
+ */ +#ifndef _LIBZUTIL_ZUTIL_IMPORT_H_ +#define _LIBZUTIL_ZUTIL_IMPORT_H_ + +#define EZFS_BADCACHE "invalid or missing cache file" +#define EZFS_BADPATH "must be an absolute path" +#define EZFS_NOMEM "out of memory" +#define EZFS_EACESS "some devices require root privileges" + +#define IMPORT_ORDER_PREFERRED_1 1 +#define IMPORT_ORDER_PREFERRED_2 2 +#define IMPORT_ORDER_SCAN_OFFSET 10 +#define IMPORT_ORDER_DEFAULT 100 + +typedef struct libpc_handle { + boolean_t lpc_printerr; + boolean_t lpc_open_access_error; + boolean_t lpc_desc_active; + char lpc_desc[1024]; + const pool_config_ops_t *lpc_ops; + void *lpc_lib_handle; +} libpc_handle_t; + + +int label_paths(libpc_handle_t *hdl, nvlist_t *label, char **path, + char **devid); +int zpool_find_import_blkid(libpc_handle_t *hdl, pthread_mutex_t *lock, + avl_tree_t **slice_cache); + +void * zutil_alloc(libpc_handle_t *hdl, size_t size); +char *zutil_strdup(libpc_handle_t *hdl, const char *str); + +typedef struct rdsk_node { + char *rn_name; /* Full path to device */ + int rn_order; /* Preferred order (low to high) */ + int rn_num_labels; /* Number of valid labels */ + uint64_t rn_vdev_guid; /* Expected vdev guid when set */ + libpc_handle_t *rn_hdl; + nvlist_t *rn_config; /* Label config */ + avl_tree_t *rn_avl; + avl_node_t rn_node; + pthread_mutex_t *rn_lock; + boolean_t rn_labelpaths; +} rdsk_node_t; + +int slice_cache_compare(const void *, const void *); + +void zpool_open_func(void *); + +#endif /* _LIBZUTIL_ZUTIL_IMPORT_H_ */ diff --git a/lib/libzutil/zutil_nicenum.c b/lib/libzutil/zutil_nicenum.c new file mode 100644 index 000000000000..306cec3caddf --- /dev/null +++ b/lib/libzutil/zutil_nicenum.c @@ -0,0 +1,172 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include + +/* + * Return B_TRUE if "str" is a number string, B_FALSE otherwise. + * Works for integer and floating point numbers. + */ +boolean_t +zfs_isnumber(const char *str) +{ + for (; *str; str++) + if (!(isdigit(*str) || (*str == '.'))) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Convert a number to an appropriately human-readable output. 
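+ *
+ * Worked examples (traced from the logic below): with ZFS_NICENUM_1024,
+ * 1536 is not an even multiple of 1024 and formats as "1.50K"; with
+ * ZFS_NICENUM_TIME, 1536 (nanoseconds) formats as "1us", since time
+ * values are floored rather than rounded.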
+ */ +void +zfs_nicenum_format(uint64_t num, char *buf, size_t buflen, + enum zfs_nicenum_format format) +{ + uint64_t n = num; + int index = 0; + const char *u; + const char *units[3][7] = { + [ZFS_NICENUM_1024] = {"", "K", "M", "G", "T", "P", "E"}, + [ZFS_NICENUM_BYTES] = {"B", "K", "M", "G", "T", "P", "E"}, + [ZFS_NICENUM_TIME] = {"ns", "us", "ms", "s", "?", "?", "?"} + }; + + const int units_len[] = {[ZFS_NICENUM_1024] = 6, + [ZFS_NICENUM_BYTES] = 6, + [ZFS_NICENUM_TIME] = 4}; + + const int k_unit[] = { [ZFS_NICENUM_1024] = 1024, + [ZFS_NICENUM_BYTES] = 1024, + [ZFS_NICENUM_TIME] = 1000}; + + double val; + + if (format == ZFS_NICENUM_RAW) { + snprintf(buf, buflen, "%llu", (u_longlong_t)num); + return; + } else if (format == ZFS_NICENUM_RAWTIME && num > 0) { + snprintf(buf, buflen, "%llu", (u_longlong_t)num); + return; + } else if (format == ZFS_NICENUM_RAWTIME && num == 0) { + snprintf(buf, buflen, "%s", "-"); + return; + } + + while (n >= k_unit[format] && index < units_len[format]) { + n /= k_unit[format]; + index++; + } + + u = units[format][index]; + + /* Don't print zero latencies since they're invalid */ + if ((format == ZFS_NICENUM_TIME) && (num == 0)) { + (void) snprintf(buf, buflen, "-"); + } else if ((index == 0) || ((num % + (uint64_t)powl(k_unit[format], index)) == 0)) { + /* + * If this is an even multiple of the base, always display + * without any decimal precision. + */ + (void) snprintf(buf, buflen, "%llu%s", (u_longlong_t)n, u); + + } else { + /* + * We want to choose a precision that reflects the best choice + * for fitting in 5 characters. This can get rather tricky when + * we have numbers that are very close to an order of magnitude. + * For example, when displaying 10239 (which is really 9.999K), + * we want only a single place of precision for 10.0K. We could + * develop some complex heuristics for this, but it's much + * easier just to try each combination in turn. + */ + int i; + for (i = 2; i >= 0; i--) { + val = (double)num / + (uint64_t)powl(k_unit[format], index); + + /* + * Don't print floating point values for time. Note, + * we use floor() instead of round() here, since + * round can result in undesirable results. For + * example, if "num" is in the range of + * 999500-999999, it will print out "1000us". This + * doesn't happen if we use floor(). + */ + if (format == ZFS_NICENUM_TIME) { + if (snprintf(buf, buflen, "%d%s", + (unsigned int) floor(val), u) <= 5) + break; + + } else { + if (snprintf(buf, buflen, "%.*f%s", i, + val, u) <= 5) + break; + } + } + } +} + +/* + * Convert a number to an appropriately human-readable output. + */ +void +zfs_nicenum(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_1024); +} + +/* + * Convert a time to an appropriately human-readable output. + * @num: Time in nanoseconds + */ +void +zfs_nicetime(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_TIME); +} + +/* + * Print out a raw number with correct column spacing + */ +void +zfs_niceraw(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_RAW); +} + +/* + * Convert a number of bytes to an appropriately human-readable output. 
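+ *
+ * For example, zfs_nicebytes(10239, buf, sizeof (buf)) yields "10.0K":
+ * the 5-character budget forces a single decimal place, as described in
+ * zfs_nicenum_format() above.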
+ */ +void +zfs_nicebytes(uint64_t num, char *buf, size_t buflen) +{ + zfs_nicenum_format(num, buf, buflen, ZFS_NICENUM_BYTES); +} diff --git a/lib/libzutil/zutil_pool.c b/lib/libzutil/zutil_pool.c new file mode 100644 index 000000000000..734650f3cffc --- /dev/null +++ b/lib/libzutil/zutil_pool.c @@ -0,0 +1,145 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#include + +static void +dump_ddt_stat(const ddt_stat_t *dds, int h) +{ + char refcnt[6]; + char blocks[6], lsize[6], psize[6], dsize[6]; + char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6]; + + if (dds == NULL || dds->dds_blocks == 0) + return; + + if (h == -1) + (void) strcpy(refcnt, "Total"); + else + zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt)); + + zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks)); + zfs_nicebytes(dds->dds_lsize, lsize, sizeof (lsize)); + zfs_nicebytes(dds->dds_psize, psize, sizeof (psize)); + zfs_nicebytes(dds->dds_dsize, dsize, sizeof (dsize)); + zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks)); + zfs_nicebytes(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize)); + zfs_nicebytes(dds->dds_ref_psize, ref_psize, sizeof (ref_psize)); + zfs_nicebytes(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize)); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + refcnt, + blocks, lsize, psize, dsize, + ref_blocks, ref_lsize, ref_psize, ref_dsize); +} + +/* + * Print the DDT histogram and the column totals. + */ +void +zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) +{ + int h; + + (void) printf("\n"); + + (void) printf("bucket " + " allocated " + " referenced \n"); + (void) printf("______ " + "______________________________ " + "______________________________\n"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "refcnt", + "blocks", "LSIZE", "PSIZE", "DSIZE", + "blocks", "LSIZE", "PSIZE", "DSIZE"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "------", + "------", "-----", "-----", "-----", + "------", "-----", "-----", "-----"); + + for (h = 0; h < 64; h++) + dump_ddt_stat(&ddh->ddh_stat[h], h); + + dump_ddt_stat(dds_total, -1); + + (void) printf("\n"); +} + +/* + * Process the buffer of nvlists, unpacking and storing each nvlist record + * into 'records'. 'leftover' is set to the number of bytes that weren't + * processed as there wasn't a complete record. 
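+ *
+ * Each record in 'buf' is framed as an 8-byte little-endian length
+ * followed by that many bytes of packed nvlist:
+ *
+ *	[len0][nvlist0][len1][nvlist1]...[lenN][partial]
+ *
+ * A trailing partial record (or bare length) is left unconsumed and
+ * reported via *leftover so the caller can retry once more bytes arrive.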
+ */
+int
+zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover,
+    nvlist_t ***records, uint_t *numrecords)
+{
+	uint64_t reclen;
+	nvlist_t *nv;
+	int i;
+	void *tmp;
+
+	while (bytes_read > sizeof (reclen)) {
+
+		/* get length of packed record (stored as little endian) */
+		for (i = 0, reclen = 0; i < sizeof (reclen); i++)
+			reclen += (uint64_t)(((uchar_t *)buf)[i]) << (8*i);
+
+		if (bytes_read < sizeof (reclen) + reclen)
+			break;
+
+		/* unpack record */
+		if (nvlist_unpack(buf + sizeof (reclen), reclen, &nv, 0) != 0)
+			return (ENOMEM);
+		bytes_read -= sizeof (reclen) + reclen;
+		buf += sizeof (reclen) + reclen;
+
+		/* add record to nvlist array */
+		(*numrecords)++;
+		if (ISP2(*numrecords + 1)) {
+			tmp = realloc(*records,
+			    *numrecords * 2 * sizeof (nvlist_t *));
+			if (tmp == NULL) {
+				nvlist_free(nv);
+				(*numrecords)--;
+				return (ENOMEM);
+			}
+			*records = tmp;
+		}
+		(*records)[*numrecords - 1] = nv;
+	}
+
+	*leftover = bytes_read;
+	return (0);
+}
diff --git a/man/Makefile.am b/man/Makefile.am
new file mode 100644
index 000000000000..841cb9c4e6a0
--- /dev/null
+++ b/man/Makefile.am
@@ -0,0 +1 @@
+SUBDIRS = man1 man5 man8
diff --git a/man/man1/Makefile.am b/man/man1/Makefile.am
new file mode 100644
index 000000000000..8d7457a3e258
--- /dev/null
+++ b/man/man1/Makefile.am
@@ -0,0 +1,12 @@
+dist_man_MANS = zhack.1 ztest.1 raidz_test.1 zvol_wait.1 arcstat.1
+EXTRA_DIST = cstyle.1
+
+if BUILD_LINUX
+# The man pager in most Linux distros defaults to BSD instead of Linux
+# when .Os is blank, but leaving it blank makes things a lot easier on
+# FreeBSD when OpenZFS is vendored in the base system.
+install-data-hook:
+	cd $(DESTDIR)$(mandir)/man1; \
+	$(SED) ${ac_inplace} -e 's/^\.Os$$/.Os Linux/' \
+	$(dist_man_MANS)
+endif
diff --git a/man/man1/arcstat.1 b/man/man1/arcstat.1
new file mode 100644
index 000000000000..9113b76af46c
--- /dev/null
+++ b/man/man1/arcstat.1
@@ -0,0 +1,502 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source.  A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2014 Adam Stevko. All rights reserved.
+.\" Copyright (c) 2015 by Delphix. All rights reserved.
+.\" Copyright (c) 2020 by AJ Jordan. All rights reserved.
+.\"
+.TH ARCSTAT 1 "May 7, 2020"
+.SH NAME
+arcstat \- report ZFS ARC and L2ARC statistics
+.SH SYNOPSIS
+.LP
+.nf
+\fBarcstat\fR [\fB-hvx\fR] [\fB-f field[,field]...\fR] [\fB-o file\fR] [\fB-s string\fR] [\fBinterval\fR [\fBcount\fR]]
+.fi
+
+.SH DESCRIPTION
+.LP
+The \fBarcstat\fR utility prints various ZFS ARC and L2ARC statistics in
+vmstat-like fashion.
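+.sp
+.LP
+For example, \fBarcstat -f time,read,hit% 1 10\fR prints the three named
+fields once per second for ten samples (this particular field list is
+illustrative; any of the statistics below may be selected).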
+.sp + +.sp +.LP +The \fBarcstat\fR command reports the following information: +.sp +.ne 2 + +.\" +.sp +.ne 1 +.na +\fBc \fR +.ad +.RS 14n +ARC target size +.RE + +.sp +.ne 2 +.na +\fBdh% \fR +.ad +.RS 14n +Demand data hit percentage +.RE + +.sp +.ne 2 +.na +\fBdm% \fR +.ad +.RS 14n +Demand data miss percentage +.RE + +.sp +.ne 2 +.na +\fBmfu \fR +.ad +.RS 14n +MFU list hits per second +.RE + +.sp +.ne 2 +.na +\fBmh% \fR +.ad +.RS 14n +Metadata hit percentage +.RE + +.sp +.ne 2 +.na +\fBmm% \fR +.ad +.RS 14n +Metadata miss percentage +.RE + +.sp +.ne 2 +.na +\fBmru \fR +.ad +.RS 14n +MRU list hits per second +.RE + +.sp +.ne 2 +.na +\fBph% \fR +.ad +.RS 14n +Prefetch hits percentage +.RE + +.sp +.ne 2 +.na +\fBpm% \fR +.ad +.RS 14n +Prefetch miss percentage +.RE + +.sp +.ne 2 +.na +\fBdhit \fR +.ad +.RS 14n +Demand data hits per second +.RE + +.sp +.ne 2 +.na +\fBdmis \fR +.ad +.RS 14n +Demand data misses per second +.RE + +.sp +.ne 2 +.na +\fBhit% \fR +.ad +.RS 14n +ARC hit percentage +.RE + +.sp +.ne 2 +.na +\fBhits \fR +.ad +.RS 14n +ARC reads per second +.RE + +.sp +.ne 2 +.na +\fBmfug \fR +.ad +.RS 14n +MFU ghost list hits per second +.RE + +.sp +.ne 2 +.na +\fBmhit \fR +.ad +.RS 14n +Metadata hits per second +.RE + +.sp +.ne 2 +.na +\fBmiss \fR +.ad +.RS 14n +ARC misses per second +.RE + +.sp +.ne 2 +.na +\fBmmis \fR +.ad +.RS 14n +Metadata misses per second +.RE + +.sp +.ne 2 +.na +\fBmrug \fR +.ad +.RS 14n +MRU ghost list hits per second +.RE + +.sp +.ne 2 +.na +\fBphit \fR +.ad +.RS 14n +Prefetch hits per second +.RE + +.sp +.ne 2 +.na +\fBpmis \fR +.ad +.RS 14n +Prefetch misses per second +.RE + +.sp +.ne 2 +.na +\fBread \fR +.ad +.RS 14n +Total ARC accesses per second +.RE + +.sp +.ne 2 +.na +\fBtime \fR +.ad +.RS 14n +Time +.RE + +.sp +.ne 2 +.na +\fBsize \fR +.ad +.RS 14n +ARC size +.RE + +.sp +.ne 2 +.na +\fBarcsz \fR +.ad +.RS 14n +Alias for \fBsize\fR +.RE + +.sp +.ne 2 +.na +\fBdread \fR +.ad +.RS 14n +Demand data accesses per second +.RE + +.sp +.ne 2 +.na +\fBeskip \fR +.ad +.RS 14n +evict_skip per second +.RE + +.sp +.ne 2 +.na +\fBmiss% \fR +.ad +.RS 14n +ARC miss percentage +.RE + +.sp +.ne 2 +.na +\fBmread \fR +.ad +.RS 14n +Metadata accesses per second +.RE + +.sp +.ne 2 +.na +\fBpread \fR +.ad +.RS 14n +Prefetch accesses per second +.RE + +.sp +.ne 2 +.na +\fBl2hit% \fR +.ad +.RS 14n +L2ARC access hit percentage +.RE + +.sp +.ne 2 +.na +\fBl2hits \fR +.ad +.RS 14n +L2ARC hits per second +.RE + +.sp +.ne 2 +.na +\fBl2miss \fR +.ad +.RS 14n +L2ARC misses per second +.RE + +.sp +.ne 2 +.na +\fBl2read \fR +.ad +.RS 14n +Total L2ARC accesses per second +.RE + +.sp +.ne 2 +.na +\fBl2size \fR +.ad +.RS 14n +Size of the L2ARC +.RE + +.sp +.ne 2 +.na +\fBmtxmis \fR +.ad +.RS 14n +mutex_miss per second +.RE + +.sp +.ne 2 +.na +\fBl2bytes \fR +.ad +.RS 14n +Bytes read per second from the L2ARC +.RE + +.sp +.ne 2 +.na +\fBl2miss% \fR +.ad +.RS 14n +L2ARC access miss percentage +.RE + +.sp +.ne 2 +.na +\fBl2asize \fR +.ad +.RS 14n +Actual (compressed) size of the L2ARC +.RE + +.sp +.ne 2 +.na +\fBgrow \fR +.ad +.RS 14n +ARC grow disabled +.RE + +.sp +.ne 2 +.na +\fBneed \fR +.ad +.RS 14n +ARC reclaim needed +.RE + +.sp +.ne 2 +.na +\fBfree \fR +.ad +.RS 14n +The ARC's idea of how much free memory there is, which includes evictable memory in the page cache. +Since the ARC tries to keep \fBavail\fR above zero, \fBavail\fR is usually more instructive to observe than \fBfree\fR. 
+.RE + +.sp +.ne 2 +.na +\fBavail \fR +.ad +.RS 14n +The ARC's idea of how much free memory is available to it, which is a bit less than \fBfree\fR. +May temporarily be negative, in which case the ARC will reduce the target size \fBc\fR. +.RE +.\" + +.SH OPTIONS +.LP +The following options are supported: + +.sp +.ne 2 +.na +\fB\fB-f\fR\fR +.ad +.RS 12n +Display only specific fields. See \fBDESCRIPTION\fR for supported statistics. +.RE + +.sp +.ne 2 +.na +\fB\fB-h\fR\fR +.ad +.RS 12n +Display help message. +.RE + +.sp +.ne 2 +.na +\fB\fB-o\fR\fR +.ad +.RS 12n +Report statistics to a file instead of the standard output. +.RE + +.sp +.ne 2 +.na +\fB\fB-s\fR\fR +.ad +.RS 12n +Display data with a specified separator (default: 2 spaces). +.RE + +.sp +.ne 2 +.na +\fB\fB-x\fR\fR +.ad +.RS 12n +Print extended stats (same as -f time,mfu,mru,mfug,mrug,eskip,mtxmis,dread,pread,read). +.RE + +.sp +.ne 2 +.na +\fB\fB-v\fR\fR +.ad +.RS 12n +Show field headers and definitions +.RE + +.SH OPERANDS +.LP +The following operands are supported: +.sp +.ne 2 +.na +\fB\fIcount\fR\fR +.ad +.RS 12n +Display only \fIcount\fR reports. +.RE + +.sp +.ne 2 +.na +\fB\fIinterval\fR\fR +.ad +.RS 12n +Specify the sampling interval in seconds. +.RE + +.SH AUTHORS +.LP +arcstat was originally written in Perl by Neelakanth Nadgir and supported only ZFS ARC statistics. +Mike Harsch updated it to support L2ARC statistics. +John Hixson ported it to Python for FreeNAS over some beer, after which many individuals from the OpenZFS community continued to maintain and improve it. diff --git a/man/man1/cstyle.1 b/man/man1/cstyle.1 new file mode 100644 index 000000000000..f77d534507a4 --- /dev/null +++ b/man/man1/cstyle.1 @@ -0,0 +1,167 @@ +.\" Copyright 2009 Sun Microsystems, Inc. All rights reserved. +.\" Use is subject to license terms. +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.TH cstyle 1 "28 March 2005" +.SH NAME +.I cstyle +\- check for some common stylistic errors in C source files +.SH SYNOPSIS +\fBcstyle [-chpvCP] [-o constructs] [file...]\fP +.LP +.SH DESCRIPTION +.IX "OS-Net build tools" "cstyle" "" "\fBcstyle\fP" +.LP +.I cstyle +inspects C source files (*.c and *.h) for common stylistic errors. It +attempts to check for the cstyle documented in +\fIhttp://www.cis.upenn.edu/~lee/06cse480/data/cstyle.ms.pdf\fP. +Note that there is much in that document that +.I cannot +be checked for; just because your code is \fBcstyle(1)\fP clean does not +mean that you've followed Sun's C style. \fICaveat emptor\fP. +.LP +.SH OPTIONS +.LP +The following options are supported: +.TP 4 +.B \-c +Check continuation line indentation inside of functions. 
Sun's C style +states that all statements must be indented to an appropriate tab stop, +and any continuation lines after them must be indented \fIexactly\fP four +spaces from the start line. This option enables a series of checks +designed to find continuation line problems within functions only. The +checks have some limitations; see CONTINUATION CHECKING, below. +.LP +.TP 4 +.B \-h +Performs heuristic checks that are sometimes wrong. Not generally used. +.LP +.TP 4 +.B \-p +Performs some of the more picky checks. Includes ANSI #else and #endif +rules, and tries to detect spaces after casts. Used as part of the +putback checks. +.LP +.TP 4 +.B \-v +Verbose output; includes the text of the line of error, and, for +\fB-c\fP, the first statement in the current continuation block. +.LP +.TP 4 +.B \-C +Ignore errors in header comments (i.e. block comments starting in the +first column). Not generally used. +.LP +.TP 4 +.B \-P +Check for use of non-POSIX types. Historically, types like "u_int" and +"u_long" were used, but they are now deprecated in favor of the POSIX +types uint_t, ulong_t, etc. This detects any use of the deprecated +types. Used as part of the putback checks. +.LP +.TP 4 +.B \-o \fIconstructs\fP +Allow a comma-separated list of additional constructs. Available +constructs include: +.LP +.TP 10 +.B doxygen +Allow doxygen-style block comments (\fB/**\fP and \fB/*!\fP) +.LP +.TP 10 +.B splint +Allow splint-style lint comments (\fB/*@...@*/\fP) +.LP +.SH NOTES +.LP +The cstyle rule for the OS/Net consolidation is that all new files must +be \fB-pP\fP clean. For existing files, the following invocations are +run against both the old and new files: +.LP +.TP 4 +\fBcstyle file\fB +.LP +.TP 4 +\fBcstyle -p file\fB +.LP +.TP 4 +\fBcstyle -pP file\fB +.LP +If the old file gave no errors for one of the invocations, the new file +must also give no errors. This way, files can only become more clean. +.LP +.SH CONTINUATION CHECKING +.LP +The continuation checker is a reasonably simple state machine that knows +something about how C is laid out, and can match parenthesis, etc. over +multiple lines. It does have some limitations: +.LP +.TP 4 +.B 1. +Preprocessor macros which cause unmatched parenthesis will confuse the +checker for that line. To fix this, you'll need to make sure that each +branch of the #if statement has balanced parenthesis. +.LP +.TP 4 +.B 2. +Some \fBcpp\fP macros do not require ;s after them. Any such macros +*must* be ALL_CAPS; any lower case letters will cause bad output. +.LP +The bad output will generally be corrected after the next \fB;\fP, +\fB{\fP, or \fB}\fP. +.LP +Some continuation error messages deserve some additional explanation +.LP +.TP 4 +.B +multiple statements continued over multiple lines +A multi-line statement which is not broken at statement +boundaries. For example: +.RS 4 +.HP 4 +if (this_is_a_long_variable == another_variable) a = +.br +b + c; +.LP +Will trigger this error. Instead, do: +.HP 8 +if (this_is_a_long_variable == another_variable) +.br +a = b + c; +.RE +.LP +.TP 4 +.B +empty if/for/while body not on its own line +For visibility, empty bodies for if, for, and while statements should be +on their own line. For example: +.RS 4 +.HP 4 +while (do_something(&x) == 0); +.LP +Will trigger this error. 
Instead, do:
+.HP 8
+while (do_something(&x) == 0)
+.br
+;
+.RE
+
diff --git a/man/man1/raidz_test.1 b/man/man1/raidz_test.1
new file mode 100644
index 000000000000..423177a1b839
--- /dev/null
+++ b/man/man1/raidz_test.1
@@ -0,0 +1,97 @@
+'\" t
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2016 Gvozden Nešković. All rights reserved.
+.\"
+.TH raidz_test 1 "2016" "ZFS on Linux" "User Commands"
+
+.SH NAME
+\fBraidz_test\fR \- raidz implementation verification and benchmarking tool
+.SH SYNOPSIS
+.LP
+.BI "raidz_test "
+.SH DESCRIPTION
+.LP
+This manual page documents briefly the \fBraidz_test\fR command.
+.LP
+The purpose of this tool is to run all supported raidz implementations and
+verify the results of all methods. The tool also contains a parameter sweep
+option where all parameters affecting a RAIDZ block are verified (like ashift
+size, data offset, data size, etc...).
+The tool also supports a benchmarking mode using the -B option.
+.SH OPTIONS
+.HP
+.BI "\-h" ""
+.IP
+Print a help summary.
+.HP
+.BI "\-a" " ashift (default: 9)"
+.IP
+Ashift value.
+.HP
+.BI "\-o" " zio_off_shift" " (default: 0)"
+.IP
+Zio offset for the raidz block. The offset value is 1 << (zio_off_shift)
+.HP
+.BI "\-d" " raidz_data_disks" " (default: 8)"
+.IP
+Number of raidz data disks to use. Additional disks for parity will be used
+during testing.
+.HP
+.BI "\-s" " zio_size_shift" " (default: 19)"
+.IP
+Size of data for the raidz block. The size is 1 << (zio_size_shift).
+.HP
+.BI "\-S(weep)"
+.IP
+Sweep parameter space while verifying the raidz implementations. This option
+will exhaust almost all valid values for the -a -o -d -s options. Runtime
+using this option will be long.
+.HP
+.BI "\-t(imeout)"
+.IP
+Wall time for the sweep test in seconds. The actual runtime could be longer.
+.HP
+.BI "\-B(enchmark)"
+.IP
+This option starts the benchmark mode. All implementations are benchmarked
+using increasing per disk data size. Results are given as throughput per disk,
+measured in MiB/s.
+.HP
+.BI "\-v(erbose)"
+.IP
+Increase verbosity.
+.HP
+.BI "\-T(est the test)"
+.IP
+Debugging option. When this option is specified the tool is supposed to fail
+all tests, to check that the tests would properly verify bit-exactness.
+.HP
+.BI "\-D(ebug)"
+.IP
+Debugging option. Specify to attach gdb when SIGSEGV or SIGABRT are received.
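Since the -a, -o and -s options above are all expressed as shifts, it can help to see the resulting byte values spelled out. A small illustrative sketch using the defaults from the option list (the variable names are ours, not raidz_test's):

    #include <stdio.h>
    #include <inttypes.h>

    int
    main(void)
    {
            /* Defaults from the option list above; names are illustrative. */
            uint64_t ashift = 9;            /* -a */
            uint64_t zio_off_shift = 0;     /* -o */
            uint64_t zio_size_shift = 19;   /* -s */

            printf("sector size: %" PRIu64 " bytes\n", (uint64_t)1 << ashift);
            printf("zio offset:  %" PRIu64 "\n", (uint64_t)1 << zio_off_shift);
            printf("zio size:    %" PRIu64 " bytes\n", (uint64_t)1 << zio_size_shift);
            return (0);
    }

With the defaults this works out to a 512-byte sector and a 512 KiB data block, which is why the -S sweep over these shifts multiplies quickly into a long runtime.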
+.HP
+
+.SH "SEE ALSO"
+.BR "ztest (1)"
+.SH "AUTHORS"
+vdev_raidz, created for ZFS on Linux by Gvozden Nešković
diff --git a/man/man1/zhack.1 b/man/man1/zhack.1
new file mode 100644
index 000000000000..11d300b70014
--- /dev/null
+++ b/man/man1/zhack.1
@@ -0,0 +1,98 @@
+'\" t
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright 2013 Darik Horn . All rights reserved.
+.\"
+.TH zhack 1 "2013 MAR 16" "ZFS on Linux" "User Commands"
+
+.SH NAME
+zhack \- libzpool debugging tool
+.SH DESCRIPTION
+This utility pokes configuration changes directly into a ZFS pool,
+which is dangerous and can cause data corruption.
+.SH SYNOPSIS
+.LP
+.BI "zhack [\-c " "cachefile" "] [\-d " "dir" "] <" "subcommand" "> [" "arguments" "]"
+.SH OPTIONS
+.HP
+.BI "\-c" " cachefile"
+.IP
+Read the \fIpool\fR configuration from the \fIcachefile\fR, which is
+/etc/zfs/zpool.cache by default.
+.HP
+.BI "\-d" " dir"
+.IP
+Search for \fIpool\fR members in the \fIdir\fR path. Can be specified
+more than once.
+.SH SUBCOMMANDS
+.LP
+.BI "feature stat " "pool"
+.IP
+List feature flags.
+.LP
+.BI "feature enable [\-d " "description" "] [\-r] " "pool guid"
+.IP
+Add a new feature to \fIpool\fR that is uniquely identified by
+\fIguid\fR, which is specified in the same form as a zfs(8) user
+property.
+.IP
+The \fIdescription\fR is a short human readable explanation of the new
+feature.
+.IP
+The \fB\-r\fR switch indicates that \fIpool\fR can be safely opened
+in read-only mode by a system that does not have the \fIguid\fR
+feature.
+.LP
+.BI "feature ref [\-d|\-m] " "pool guid"
+.IP
+Increment the reference count of the \fIguid\fR feature in \fIpool\fR.
+.IP
+The \fB\-d\fR switch decrements the reference count of the \fIguid\fR
+feature in \fIpool\fR.
+.IP
+The \fB\-m\fR switch indicates that the \fIguid\fR feature is now
+required to read the pool MOS.
+.SH EXAMPLES
+.LP
+.nf
+# zhack feature stat tank
+
+for_read_obj:
+	org.illumos:lz4_compress = 0
+for_write_obj:
+	com.delphix:async_destroy = 0
+	com.delphix:empty_bpobj = 0
+descriptions_obj:
+	com.delphix:async_destroy = Destroy filesystems asynchronously.
+	com.delphix:empty_bpobj = Snapshots use less space.
+	org.illumos:lz4_compress = LZ4 compression algorithm support.
+.LP
+# zhack feature enable -d 'Predict future disk failures.' \\
+    tank com.example:clairvoyance
+.LP
+# zhack feature ref tank com.example:clairvoyance
+.SH AUTHORS
+This man page was written by Darik Horn .
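The guid arguments accepted by the feature subcommands above use the same "namespace:name" form as zfs(8) user properties (e.g. com.example:clairvoyance). A rough illustrative check of that shape, not the actual validation zhack performs:

    #include <stdio.h>
    #include <string.h>

    /* Accepts "namespace:name" with both halves non-empty; illustrative only. */
    static int
    feature_guid_ok(const char *guid)
    {
            const char *colon = strchr(guid, ':');

            return (colon != NULL && colon != guid && colon[1] != '\0');
    }

    int
    main(void)
    {
            printf("%d\n", feature_guid_ok("com.example:clairvoyance")); /* 1 */
            printf("%d\n", feature_guid_ok("clairvoyance"));             /* 0 */
            return (0);
    }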
+.SH SEE ALSO
+.BR zfs (8),
+.BR zpool-features (5),
+.BR ztest (1)
diff --git a/man/man1/ztest.1 b/man/man1/ztest.1
new file mode 100644
index 000000000000..84e56c822d13
--- /dev/null
+++ b/man/man1/ztest.1
@@ -0,0 +1,179 @@
+'\" t
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2009 Oracle and/or its affiliates. All rights reserved.
+.\" Copyright (c) 2009 Michael Gebetsroither . All rights
+.\" reserved.
+.\"
+.TH ztest 1 "2009 NOV 01" "ZFS on Linux" "User Commands"
+
+.SH NAME
+\fBztest\fR \- ZFS unit test tool
+.SH SYNOPSIS
+.LP
+.BI "ztest "
+.SH DESCRIPTION
+.LP
+This manual page documents briefly the \fBztest\fR command.
+.LP
+\fBztest\fR was written by the ZFS Developers as a ZFS unit test. The
+tool was developed in tandem with the ZFS functionality and was
+executed nightly as one of the many regression tests against the daily
+build. As features were added to ZFS, unit tests were also added to
+\fBztest\fR. In addition, a separate test development team wrote and
+executed more functional and stress tests.
+.LP
+By default \fBztest\fR runs for five minutes and uses block files
+(stored in /tmp) to create pools rather than using physical disks.
+Block files afford \fBztest\fR its flexibility to play around with
+zpool components without requiring large hardware configurations.
+However, storing the block files in /tmp may not work for you if you
+have a small tmp directory.
+.LP
+By default \fBztest\fR is non-verbose, which is why entering the command
+above will result in \fBztest\fR quietly executing for five minutes. The
+-V option can be used to increase the verbosity of the tool. Adding
+multiple -V options is allowed; the more you add, the more chatty
+\fBztest\fR becomes.
+.LP
+After the \fBztest\fR run completes, you should notice many ztest.*
+files lying around; these can be safely removed once the run completes.
+Note that you shouldn't remove these files during a run. You
+can re-use these files in your next \fBztest\fR run by using the -E
+option.
+.SH OPTIONS
+.HP
+.BI "\-?" ""
+.IP
+Print a help summary.
+.HP
+.BI "\-v" " vdevs" " (default: 5)"
+.IP
+Number of vdevs.
+.HP
+.BI "\-s" " size_of_each_vdev" " (default: 64M)"
+.IP
+Size of each vdev.
+.HP
+.BI "\-a" " alignment_shift" " (default: 9) (use 0 for random)"
+.IP
+Alignment to use in the test.
+.HP
+.BI "\-m" " mirror_copies" " (default: 2)"
+.IP
+Number of mirror copies.
+.HP
+.BI "\-r" " raidz_disks" " (default: 4)"
+.IP
+Number of raidz disks.
+.HP
+.BI "\-R" " raidz_parity" " (default: 1)"
+.IP
+Raidz parity.
+.HP
+.BI "\-d" " datasets" " (default: 7)"
+.IP
+Number of datasets.
+.HP
+.BI "\-t" " threads" " (default: 23)"
+.IP
+Number of threads.
+.HP +.BI "\-g" " gang_block_threshold" " (default: 32K)" +.IP +Gang block threshold. +.HP +.BI "\-i" " initialize_pool_i_times" " (default: 1)" +.IP +Number of pool initialisations. +.HP +.BI "\-k" " kill_percentage" " (default: 70%)" +.IP +Kill percentage. +.HP +.BI "\-p" " pool_name" " (default: ztest)" +.IP +Pool name. +.HP +.BI "\-V(erbose)" +.IP +Verbose (use multiple times for ever more blather). +.HP +.BI "\-E(xisting)" +.IP +Use existing pool (use existing pool instead of creating new one). +.HP +.BI "\-T" " time" " (default: 300 sec)" +.IP +Total test run time. +.HP +.BI "\-z" " zil_failure_rate" " (default: fail every 2^5 allocs) +.IP +Injected failure rate. +.HP +.BI "\-G" +.IP +Dump zfs_dbgmsg buffer before exiting. +.SH "EXAMPLES" +.LP +To override /tmp as your location for block files, you can use the -f +option: +.IP +ztest -f / +.LP +To get an idea of what ztest is actually testing try this: +.IP +ztest -f / -VVV +.LP +Maybe you'd like to run ztest for longer? To do so simply use the -T +option and specify the runlength in seconds like so: +.IP +ztest -f / -V -T 120 + +.SH "ENVIRONMENT VARIABLES" +.TP +.B "ZFS_HOSTID=id" +Use \fBid\fR instead of the SPL hostid to identify this host. Intended for use +with ztest, but this environment variable will affect any utility which uses +libzpool, including \fBzpool(8)\fR. Since the kernel is unaware of this setting +results with utilities other than ztest are undefined. +.TP +.B "ZFS_STACK_SIZE=stacksize" +Limit the default stack size to \fBstacksize\fR bytes for the purpose of +detecting and debugging kernel stack overflows. This value defaults to +\fB32K\fR which is double the default \fB16K\fR Linux kernel stack size. + +In practice, setting the stack size slightly higher is needed because +differences in stack usage between kernel and user space can lead to spurious +stack overflows (especially when debugging is enabled). The specified value +will be rounded up to a floor of PTHREAD_STACK_MIN which is the minimum stack +required for a NULL procedure in user space. + +By default the stack size is limited to 256K. +.SH "SEE ALSO" +.BR "spl-module-parameters (5)" "," +.BR "zpool (1)" "," +.BR "zfs (1)" "," +.BR "zdb (1)" "," +.SH "AUTHOR" +This manual page was transferred to asciidoc by Michael Gebetsroither + from http://opensolaris.org/os/community/zfs/ztest/ diff --git a/man/man1/zvol_wait.1 b/man/man1/zvol_wait.1 new file mode 100644 index 000000000000..839e1f144752 --- /dev/null +++ b/man/man1/zvol_wait.1 @@ -0,0 +1,21 @@ +.Dd July 5, 2019 +.Dt ZVOL_WAIT 1 SMM +.Os +.Sh NAME +.Nm zvol_wait +.Nd Wait for ZFS volume links in +.Em /dev +to be created. +.Sh SYNOPSIS +.Nm +.Sh DESCRIPTION +When a ZFS pool is imported, ZFS will register each ZFS volume +(zvol) as a disk device with the system. As the disks are registered, +.Xr \fBudev 7\fR +will asynchronously create symlinks under +.Em /dev/zvol +using the zvol's name. +.Nm +will wait for all those symlinks to be created before returning. +.Sh SEE ALSO +.Xr \fBudev 7\fR diff --git a/man/man5/Makefile.am b/man/man5/Makefile.am new file mode 100644 index 000000000000..9cbb2c08f378 --- /dev/null +++ b/man/man5/Makefile.am @@ -0,0 +1,16 @@ +dist_man_MANS = \ + vdev_id.conf.5 \ + zpool-features.5 \ + spl-module-parameters.5 \ + zfs-module-parameters.5 \ + zfs-events.5 + +if BUILD_LINUX +# The man pager in most Linux distros defaults to BSD instead of Linux +# when .Os is blank, but leaving it blank makes things a lot easier on +# FreeBSD when OpenZFS is vendored in the base system. 
+install-data-hook:
+	cd $(DESTDIR)$(mandir)/man5; \
+	$(SED) ${ac_inplace} -e 's/^\.Os$$/.Os Linux/' \
+	$(dist_man_MANS)
+endif
diff --git a/man/man5/spl-module-parameters.5 b/man/man5/spl-module-parameters.5
new file mode 100644
index 000000000000..2dce5b2963d6
--- /dev/null
+++ b/man/man5/spl-module-parameters.5
@@ -0,0 +1,326 @@
+'\" te
+.\"
+.\" Copyright 2013 Turbo Fredriksson . All rights reserved.
+.\"
+.TH SPL-MODULE-PARAMETERS 5 "Oct 28, 2017"
+.SH NAME
+spl\-module\-parameters \- SPL module parameters
+.SH DESCRIPTION
+.sp
+.LP
+Description of the different parameters to the SPL module.
+
+.SS "Module parameters"
+.sp
+.LP
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_expire\fR (uint)
+.ad
+.RS 12n
+Cache expiration is part of default Illumos cache behavior. The idea is
+that objects in magazines which have not been recently accessed should be
+returned to the slabs periodically. This is known as cache aging and
+when enabled objects will typically be returned after 15 seconds.
+.sp
+On the other hand Linux slabs are designed to never move objects back to
+the slabs unless there is memory pressure. This is possible because under
+Linux the cache will be notified when memory is low and objects can be
+released.
+.sp
+By default only the Linux method is enabled. It has been shown to improve
+responsiveness on low memory systems and not negatively impact the performance
+of systems with more memory. This policy may be changed by setting the
+\fBspl_kmem_cache_expire\fR bit mask as follows; both policies may be enabled
+concurrently.
+.sp
+0x01 - Aging (Illumos), 0x02 - Low memory (Linux)
+.sp
+Default value: \fB0x02\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_kmem_threads\fR (uint)
+.ad
+.RS 12n
+The number of threads created for the spl_kmem_cache task queue. This task
+queue is responsible for allocating new slabs for use by the kmem caches.
+For the majority of systems and workloads only a small number of threads are
+required.
+.sp
+Default value: \fB4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_reclaim\fR (uint)
+.ad
+.RS 12n
+When this is set it prevents Linux from being able to rapidly reclaim all the
+memory held by the kmem caches. This may be useful in circumstances where
+it's preferable that Linux reclaim memory from some other subsystem first.
+Setting this will increase the likelihood of out of memory events on a memory
+constrained system.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab\fR (uint)
+.ad
+.RS 12n
+The preferred number of objects per slab in the cache. In general, a larger
+value will increase the cache's memory footprint while decreasing the time
+required to perform an allocation. Conversely, a smaller value will minimize
+the footprint and improve cache reclaim time but individual allocations may
+take longer.
+.sp
+Default value: \fB8\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_obj_per_slab_min\fR (uint)
+.ad
+.RS 12n
+The minimum number of objects allowed per slab. Normally slabs will contain
+\fBspl_kmem_cache_obj_per_slab\fR objects but for caches that contain very
+large objects it's desirable to only have a few, or even just one, object per
+slab.
+.sp
+Default value: \fB1\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_max_size\fR (uint)
+.ad
+.RS 12n
+The maximum size of a kmem cache slab in MiB. This effectively limits
+the maximum cache object size to \fBspl_kmem_cache_max_size\fR /
+\fBspl_kmem_cache_obj_per_slab\fR. Caches may not be created with
+objects sized larger than this limit.
+.sp
+Default value: \fB32 (64-bit) or 4 (32-bit)\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_slab_limit\fR (uint)
+.ad
+.RS 12n
+For small objects the Linux slab allocator should be used to make the most
+efficient use of the memory. However, large objects are not supported by
+the Linux slab and therefore the SPL implementation is preferred. This
+value is used to determine the cutoff between a small and large object.
+.sp
+Objects of \fBspl_kmem_cache_slab_limit\fR or smaller will be allocated
+using the Linux slab allocator, large objects use the SPL allocator. A
+cutoff of 16K was determined to be optimal for architectures using 4K pages.
+.sp
+Default value: \fB16,384\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_warn\fR (uint)
+.ad
+.RS 12n
+As a general rule kmem_alloc() allocations should be small, preferably
+just a few pages since they must be physically contiguous. Therefore, a
+rate limited warning will be printed to the console for any kmem_alloc()
+which exceeds a reasonable threshold.
+.sp
+The default warning threshold is set to eight pages but capped at 32K to
+accommodate systems using large pages. This value was selected to be small
+enough to ensure the largest allocations are quickly noticed and fixed.
+But large enough to avoid logging any warnings when an allocation size is
+larger than optimal but not a serious concern. Since this value is tunable,
+developers are encouraged to set it lower when testing so any new largish
+allocations are quickly caught. These warnings may be disabled by setting
+the threshold to zero.
+.sp
+Default value: \fB32,768\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_alloc_max\fR (uint)
+.ad
+.RS 12n
+Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE.
+Allocations which are marginally smaller than this limit may succeed but
+should still be avoided due to the expense of locating a contiguous range
+of free pages. Therefore, a maximum kmem size with a reasonable safety
+margin of 4x is set. Kmem_alloc() allocations larger than this maximum
+will quickly fail. Vmem_alloc() allocations less than or equal to this
+value will use kmalloc(), but shift to vmalloc() when exceeding this value.
+.sp
+Default value: \fBKMALLOC_MAX_SIZE/4\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_kmem_cache_magazine_size\fR (uint)
+.ad
+.RS 12n
+Cache magazines are an optimization designed to minimize the cost of
+allocating memory. They do this by keeping a per-cpu cache of recently
+freed objects, which can then be reallocated without taking a lock. This
+can improve performance on highly contended caches. However, because
+objects in magazines will prevent otherwise empty slabs from being
+immediately released this may not be ideal for low memory machines.
+.sp
+For this reason \fBspl_kmem_cache_magazine_size\fR can be used to set a
+maximum magazine size. When this value is set to 0 the magazine size will
+be automatically determined based on the object size. Otherwise magazines
+will be limited to 2-256 objects per magazine (i.e. per cpu). Magazines
+may never be entirely disabled in this implementation.
+.sp
+Default value: \fB0\fR
+.RE
+
+.sp
+.ne 2
+.na
+\fBspl_hostid\fR (ulong)
+.ad
+.RS 12n
+The system hostid; when set, this can be used to uniquely identify a system.
+By default this value is set to zero which indicates the hostid is disabled.
+It can be explicitly enabled by placing a unique non-zero value in
+\fB/etc/hostid\fR.
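A sketch of creating such a file by hand follows; it assumes the SPL reads the first four bytes of /etc/hostid as a 32-bit value, and the value used here is illustrative. The zgenhostid script shipped with this tree is the supported way to do this:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
            uint32_t hostid = 0x00c0ffee;   /* illustrative non-zero value */
            FILE *fp = fopen("/etc/hostid", "wb");

            if (fp == NULL) {
                    perror("fopen");
                    return (1);
            }
            /* Only the first 4 bytes of the file are consulted. */
            if (fwrite(&hostid, sizeof (hostid), 1, fp) != 1) {
                    (void) fclose(fp);
                    return (1);
            }
            return (fclose(fp) != 0);
    }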
+.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_hostid_path\fR (charp) +.ad +.RS 12n +The expected path to locate the system hostid when specified. This value +may be overridden for non-standard configurations. +.sp +Default value: \fB/etc/hostid\fR +.RE + +.sp +.ne 2 +.na +\fBspl_panic_halt\fR (uint) +.ad +.RS 12n +Cause a kernel panic on assertion failures. When not enabled, the thread is +halted to facilitate further debugging. +.sp +Set to a non-zero value to enable. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_kick\fR (uint) +.ad +.RS 12n +Kick stuck taskq to spawn threads. When writing a non-zero value to it, it will +scan all the taskqs. If any of them have a pending task more than 5 seconds old, +it will kick it to spawn more threads. This can be used if you find a rare +deadlock occurs because one or more taskqs didn't spawn a thread when it should. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_bind\fR (int) +.ad +.RS 12n +Bind taskq threads to specific CPUs. When enabled all taskq threads will +be distributed evenly over the available CPUs. By default, this behavior +is disabled to allow the Linux scheduler the maximum flexibility to determine +where a thread should run. +.sp +Default value: \fB0\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_dynamic\fR (int) +.ad +.RS 12n +Allow dynamic taskqs. When enabled taskqs which set the TASKQ_DYNAMIC flag +will by default create only a single thread. New threads will be created on +demand up to a maximum allowed number to facilitate the completion of +outstanding tasks. Threads which are no longer needed will be promptly +destroyed. By default this behavior is enabled but it can be disabled to +aid performance analysis or troubleshooting. +.sp +Default value: \fB1\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_priority\fR (int) +.ad +.RS 12n +Allow newly created taskq threads to set a non-default scheduler priority. +When enabled the priority specified when a taskq is created will be applied +to all threads created by that taskq. When disabled all threads will use +the default Linux kernel thread priority. By default, this behavior is +enabled. +.sp +Default value: \fB1\fR +.RE + +.sp +.ne 2 +.na +\fBspl_taskq_thread_sequential\fR (int) +.ad +.RS 12n +The number of items a taskq worker thread must handle without interruption +before requesting a new worker thread be spawned. This is used to control +how quickly taskqs ramp up the number of threads processing the queue. +Because Linux thread creation and destruction are relatively inexpensive a +small default value has been selected. This means that normally threads will +be created aggressively which is desirable. Increasing this value will +result in a slower thread creation rate which may be preferable for some +configurations. +.sp +Default value: \fB4\fR +.RE + +.sp +.ne 2 +.na +\fBspl_max_show_tasks\fR (uint) +.ad +.RS 12n +The maximum number of tasks per pending list in each taskq shown in +/proc/spl/{taskq,taskq-all}. Write 0 to turn off the limit. The proc file will +walk the lists with lock held, reading it could cause a lock up if the list +grow too large without limiting the output. "(truncated)" will be shown if the +list is larger than the limit. 
+.sp +Default value: \fB512\fR +.RE diff --git a/man/man5/vdev_id.conf.5 b/man/man5/vdev_id.conf.5 new file mode 100644 index 000000000000..89c5ee961094 --- /dev/null +++ b/man/man5/vdev_id.conf.5 @@ -0,0 +1,222 @@ +.TH vdev_id.conf 5 +.SH NAME +vdev_id.conf \- Configuration file for vdev_id +.SH DESCRIPTION +.I vdev_id.conf +is the configuration file for +.BR vdev_id (8). +It controls the default behavior of +.BR vdev_id (8) +while it is mapping a disk device name to an alias. +.PP +The +.I vdev_id.conf +file uses a simple format consisting of a keyword followed by one or +more values on a single line. Any line not beginning with a recognized +keyword is ignored. Comments may optionally begin with a hash +character. + +The following keywords and values are used. +.TP +\fIalias\fR +Maps a device link in the /dev directory hierarchy to a new device +name. The udev rule defining the device link must have run prior to +.BR vdev_id (8). +A defined alias takes precedence over a topology-derived name, but the +two naming methods can otherwise coexist. For example, one might name +drives in a JBOD with the sas_direct topology while naming an internal +L2ARC device with an alias. + +\fIname\fR - the name of the link to the device that will by created in +/dev/disk/by-vdev. + +\fIdevlink\fR - the name of the device link that has already been +defined by udev. This may be an absolute path or the base filename. + +.TP +\fIchannel\fR [pci_slot] +Maps a physical path to a channel name (typically representing a single +disk enclosure). + +.TP +\fIenclosure_symlinks\fR +Additionally create /dev/by-enclosure symlinks to the disk enclosure +sg devices using the naming scheme from vdev_id.conf. +\fIenclosure_symlinks\fR is only allowed for sas_direct mode. +.TP +\fIenclosure_symlinks_prefix\fR +Specify the prefix for the enclosure symlinks in the form of: + +/dev/by-enclosure/- + +Defaults to "enc" if not specified. +.TP +\fIpci_slot\fR - specifies the PCI SLOT of the HBA +hosting the disk enclosure being mapped, as found in the output of +.BR lspci (8). +This argument is not used in sas_switch mode. + +\fIport\fR - specifies the numeric identifier of the HBA or SAS switch port +connected to the disk enclosure being mapped. + +\fIname\fR - specifies the name of the channel. + +.TP +\fIslot\fR [channel] +Maps a disk slot number as reported by the operating system to an +alternative slot number. If the \fIchannel\fR parameter is specified +then the mapping is only applied to slots in the named channel, +otherwise the mapping is applied to all channels. The first-specified +\fIslot\fR rule that can match a slot takes precedence. Therefore a +channel-specific mapping for a given slot should generally appear before +a generic mapping for the same slot. In this way a custom mapping may +be applied to a particular channel and a default mapping applied to the +others. + +.TP +\fImultipath\fR +Specifies whether +.BR vdev_id (8) +will handle only dm-multipath devices. If set to "yes" then +.BR vdev_id (8) +will examine the first running component disk of a dm-multipath +device as listed by the +.BR multipath (8) +command to determine the physical path. +.TP +\fItopology\fR +Identifies a physical topology that governs how physical paths are +mapped to channels. 
+ +\fIsas_direct\fR - in this mode a channel is uniquely identified by +a PCI slot and a HBA port number + +\fIsas_switch\fR - in this mode a channel is uniquely identified by +a SAS switch port number + +.TP +\fIphys_per_port\fR +Specifies the number of PHY devices associated with a SAS HBA port or SAS +switch port. +.BR vdev_id (8) +internally uses this value to determine which HBA or switch port a +device is connected to. The default is 4. + +.TP +\fIslot\fR +Specifies from which element of a SAS identifier the slot number is +taken. The default is bay. + +\fIbay\fR - read the slot number from the bay identifier. + +\fIphy\fR - read the slot number from the phy identifier. + +\fIport\fR - use the SAS port as the slot number. + +\fIid\fR - use the scsi id as the slot number. + +\fIlun\fR - use the scsi lun as the slot number. + +\fIses\fR - use the SCSI Enclosure Services (SES) enclosure device slot number, +as reported by +.BR sg_ses (8). +This is intended for use only on systems where \fIbay\fR is unsupported, +noting that \fIport\fR and \fIid\fR may be unstable across disk replacement. +.SH EXAMPLES +A non-multipath configuration with direct-attached SAS enclosures and an +arbitrary slot re-mapping. +.P +.nf + multipath no + topology sas_direct + phys_per_port 4 + slot bay + + # PCI_SLOT HBA PORT CHANNEL NAME + channel 85:00.0 1 A + channel 85:00.0 0 B + channel 86:00.0 1 C + channel 86:00.0 0 D + + # Custom mapping for Channel A + + # Linux Mapped + # Slot Slot Channel + slot 1 7 A + slot 2 10 A + slot 3 3 A + slot 4 6 A + + # Default mapping for B, C, and D + + slot 1 4 + slot 2 2 + slot 3 1 + slot 4 3 +.fi +.P +A SAS-switch topology. Note that the +.I channel +keyword takes only two arguments in this example. +.P +.nf + topology sas_switch + + # SWITCH PORT CHANNEL NAME + channel 1 A + channel 2 B + channel 3 C + channel 4 D +.fi +.P +A multipath configuration. Note that channel names have multiple +definitions - one per physical path. +.P +.nf + multipath yes + + # PCI_SLOT HBA PORT CHANNEL NAME + channel 85:00.0 1 A + channel 85:00.0 0 B + channel 86:00.0 1 A + channel 86:00.0 0 B +.fi +.P +A configuration with enclosure_symlinks enabled. +.P +.nf + multipath yes + enclosure_symlinks yes + + # PCI_ID HBA PORT CHANNEL NAME + channel 05:00.0 1 U + channel 05:00.0 0 L + channel 06:00.0 1 U + channel 06:00.0 0 L +.fi +In addition to the disks symlinks, this configuration will create: +.P +.nf + /dev/by-enclosure/enc-L0 + /dev/by-enclosure/enc-L1 + /dev/by-enclosure/enc-U0 + /dev/by-enclosure/enc-U1 +.fi +.P +A configuration using device link aliases. +.P +.nf + # by-vdev + # name fully qualified or base name of device link + alias d1 /dev/disk/by-id/wwn-0x5000c5002de3b9ca + alias d2 wwn-0x5000c5002def789e +.fi +.P + +.SH FILES +.TP +.I /etc/zfs/vdev_id.conf +The configuration file for +.BR vdev_id (8). +.SH SEE ALSO +.BR vdev_id (8) diff --git a/man/man5/zfs-events.5 b/man/man5/zfs-events.5 new file mode 100644 index 000000000000..4a28be71e685 --- /dev/null +++ b/man/man5/zfs-events.5 @@ -0,0 +1,965 @@ +'\" te +.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. +.\" Portions Copyright 2018 by Richard Elling +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. 
+.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.TH ZFS-EVENTS 5 "Oct 24, 2018" +.SH NAME +zfs\-events \- Events created by the ZFS filesystem. +.SH DESCRIPTION +.sp +.LP +Description of the different events generated by the ZFS stack. +.sp +Most of these don't have any description. The events generated by ZFS +have never been publicly documented. What is here is intended as a +starting point to provide documentation for all possible events. +.sp +To view all events created since the loading of the ZFS infrastructure +(i.e, "the module"), run +.P +.nf +\fBzpool events\fR +.fi +.P +to get a short list, and +.P +.nf +\fBzpool events -v\fR +.fi +.P +to get a full detail of the events and what information +is available about it. +.sp +This man page lists the different subclasses that are issued +in the case of an event. The full event name would be +\fIereport.fs.zfs.SUBCLASS\fR, but we only list the last +part here. + +.SS "EVENTS (SUBCLASS)" +.sp +.LP + +.sp +.ne 2 +.na +\fBchecksum\fR +.ad +.RS 12n +Issued when a checksum error has been detected. +.RE + +.sp +.ne 2 +.na +\fBio\fR +.ad +.RS 12n +Issued when there is an I/O error in a vdev in the pool. +.RE + +.sp +.ne 2 +.na +\fBdata\fR +.ad +.RS 12n +Issued when there have been data errors in the pool. +.RE + +.sp +.ne 2 +.na +\fBdeadman\fR +.ad +.RS 12n +Issued when an I/O is determined to be "hung", this can be caused by lost +completion events due to flaky hardware or drivers. See the +\fBzfs_deadman_failmode\fR module option description for additional +information regarding "hung" I/O detection and configuration. +.RE + +.sp +.ne 2 +.na +\fBdelay\fR +.ad +.RS 12n +Issued when a completed I/O exceeds the maximum allowed time specified +by the \fBzio_slow_io_ms\fR module option. This can be an indicator of +problems with the underlying storage device. The number of delay events is +ratelimited by the \fBzfs_slow_io_events_per_second\fR module parameter. +.RE + +.sp +.ne 2 +.na +\fBconfig.sync\fR +.ad +.RS 12n +Issued every time a vdev change have been done to the pool. +.RE + +.sp +.ne 2 +.na +\fBzpool\fR +.ad +.RS 12n +Issued when a pool cannot be imported. +.RE + +.sp +.ne 2 +.na +\fBzpool.destroy\fR +.ad +.RS 12n +Issued when a pool is destroyed. +.RE + +.sp +.ne 2 +.na +\fBzpool.export\fR +.ad +.RS 12n +Issued when a pool is exported. +.RE + +.sp +.ne 2 +.na +\fBzpool.import\fR +.ad +.RS 12n +Issued when a pool is imported. +.RE + +.sp +.ne 2 +.na +\fBzpool.reguid\fR +.ad +.RS 12n +Issued when a REGUID (new unique identifier for the pool have been regenerated) have been detected. +.RE + +.sp +.ne 2 +.na +\fBvdev.unknown\fR +.ad +.RS 12n +Issued when the vdev is unknown. Such as trying to clear device +errors on a vdev that have failed/been kicked from the system/pool +and is no longer available. +.RE + +.sp +.ne 2 +.na +\fBvdev.open_failed\fR +.ad +.RS 12n +Issued when a vdev could not be opened (because it didn't exist for example). +.RE + +.sp +.ne 2 +.na +\fBvdev.corrupt_data\fR +.ad +.RS 12n +Issued when corrupt data have been detected on a vdev. 
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.no_replicas\fR
+.ad
+.RS 12n
+Issued when there are no more replicas to sustain the pool.
+This would lead to the pool being \fIDEGRADED\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.bad_guid_sum\fR
+.ad
+.RS 12n
+Issued when a missing device in the pool has been detected.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.too_small\fR
+.ad
+.RS 12n
+Issued when the system (kernel) has removed a device, and ZFS
+notices that the device isn't there any more. This is usually
+followed by a \fBprobe_failure\fR event.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.bad_label\fR
+.ad
+.RS 12n
+Issued when the label is OK but invalid.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.bad_ashift\fR
+.ad
+.RS 12n
+Issued when the ashift alignment requirement has increased.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.remove\fR
+.ad
+.RS 12n
+Issued when a vdev is detached from a mirror (or a spare detached from a
+vdev where it has been used to replace a failed drive - this only works if
+the original drive has been re-added).
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.clear\fR
+.ad
+.RS 12n
+Issued when clearing device errors in a pool, such as running \fBzpool clear\fR
+on a device in the pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.check\fR
+.ad
+.RS 12n
+Issued when a check to see if a given vdev could be opened is started.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.spare\fR
+.ad
+.RS 12n
+Issued when a spare has kicked in to replace a failed device.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev.autoexpand\fR
+.ad
+.RS 12n
+Issued when a vdev can be automatically expanded.
+.RE
+
+.sp
+.ne 2
+.na
+\fBio_failure\fR
+.ad
+.RS 12n
+Issued when there is an I/O failure in a vdev in the pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBprobe_failure\fR
+.ad
+.RS 12n
+Issued when a probe fails on a vdev. This would occur if a vdev
+has been kicked from the system outside of ZFS (such as when the kernel
+has removed the device).
+.RE
+
+.sp
+.ne 2
+.na
+\fBlog_replay\fR
+.ad
+.RS 12n
+Issued when the intent log cannot be replayed. This can occur in the case
+of a missing or damaged log device.
+.RE
+
+.sp
+.ne 2
+.na
+\fBresilver.start\fR
+.ad
+.RS 12n
+Issued when a resilver is started.
+.RE
+
+.sp
+.ne 2
+.na
+\fBresilver.finish\fR
+.ad
+.RS 12n
+Issued when the running resilver has finished.
+.RE
+
+.sp
+.ne 2
+.na
+\fBscrub.start\fR
+.ad
+.RS 12n
+Issued when a scrub is started on a pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBscrub.finish\fR
+.ad
+.RS 12n
+Issued when a pool has finished scrubbing.
+.RE
+
+.sp
+.ne 2
+.na
+\fBscrub.abort\fR
+.ad
+.RS 12n
+Issued when a scrub is aborted on a pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBscrub.resume\fR
+.ad
+.RS 12n
+Issued when a scrub is resumed on a pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBscrub.paused\fR
+.ad
+.RS 12n
+Issued when a scrub is paused on a pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBbootfs.vdev.attach\fR
+.ad
+.RS 12n
+.RE
+
+.SS "PAYLOADS"
+.sp
+.LP
+This is the payload (data, information) that accompanies an
+event.
+.sp
+For
+.BR zed (8),
+these are set to uppercase and prefixed with \fBZEVENT_\fR.
+
+.sp
+.ne 2
+.na
+\fBpool\fR
+.ad
+.RS 12n
+Pool name.
+.RE
+
+.sp
+.ne 2
+.na
+\fBpool_failmode\fR
+.ad
+.RS 12n
+Failmode - \fBwait\fR, \fBcontinue\fR or \fBpanic\fR.
+See
+.BR zpool (8)
+(\fIfailmode\fR property) for more information.
+.RE
+
+.sp
+.ne 2
+.na
+\fBpool_guid\fR
+.ad
+.RS 12n
+The GUID of the pool.
+.RE
+
+.sp
+.ne 2
+.na
+\fBpool_context\fR
+.ad
+.RS 12n
+The load state for the pool (0=none, 1=open, 2=import, 3=tryimport, 4=recover,
+5=error).
+.RE + +.sp +.ne 2 +.na +\fBvdev_guid\fR +.ad +.RS 12n +The GUID of the vdev in question (the vdev failing or operated upon with +\fBzpool clear\fR etc). +.RE + +.sp +.ne 2 +.na +\fBvdev_type\fR +.ad +.RS 12n +Type of vdev - \fBdisk\fR, \fBfile\fR, \fBmirror\fR etc. See +.BR zpool (8) +under \fBVirtual Devices\fR for more information on possible values. +.RE + +.sp +.ne 2 +.na +\fBvdev_path\fR +.ad +.RS 12n +Full path of the vdev, including any \fI-partX\fR. +.RE + +.sp +.ne 2 +.na +\fBvdev_devid\fR +.ad +.RS 12n +ID of vdev (if any). +.RE + +.sp +.ne 2 +.na +\fBvdev_fru\fR +.ad +.RS 12n +Physical FRU location. +.RE + +.sp +.ne 2 +.na +\fBvdev_state\fR +.ad +.RS 12n +State of vdev (0=uninitialized, 1=closed, 2=offline, 3=removed, 4=failed to open, 5=faulted, 6=degraded, 7=healthy). +.RE + +.sp +.ne 2 +.na +\fBvdev_ashift\fR +.ad +.RS 12n +The ashift value of the vdev. +.RE + +.sp +.ne 2 +.na +\fBvdev_complete_ts\fR +.ad +.RS 12n +The time the last I/O completed for the specified vdev. +.RE + +.sp +.ne 2 +.na +\fBvdev_delta_ts\fR +.ad +.RS 12n +The time since the last I/O completed for the specified vdev. +.RE + +.sp +.ne 2 +.na +\fBvdev_spare_paths\fR +.ad +.RS 12n +List of spares, including full path and any \fI-partX\fR. +.RE + +.sp +.ne 2 +.na +\fBvdev_spare_guids\fR +.ad +.RS 12n +GUID(s) of spares. +.RE + +.sp +.ne 2 +.na +\fBvdev_read_errors\fR +.ad +.RS 12n +How many read errors that have been detected on the vdev. +.RE + +.sp +.ne 2 +.na +\fBvdev_write_errors\fR +.ad +.RS 12n +How many write errors that have been detected on the vdev. +.RE + +.sp +.ne 2 +.na +\fBvdev_cksum_errors\fR +.ad +.RS 12n +How many checksum errors that have been detected on the vdev. +.RE + +.sp +.ne 2 +.na +\fBparent_guid\fR +.ad +.RS 12n +GUID of the vdev parent. +.RE + +.sp +.ne 2 +.na +\fBparent_type\fR +.ad +.RS 12n +Type of parent. See \fBvdev_type\fR. +.RE + +.sp +.ne 2 +.na +\fBparent_path\fR +.ad +.RS 12n +Path of the vdev parent (if any). +.RE + +.sp +.ne 2 +.na +\fBparent_devid\fR +.ad +.RS 12n +ID of the vdev parent (if any). +.RE + +.sp +.ne 2 +.na +\fBzio_objset\fR +.ad +.RS 12n +The object set number for a given I/O. +.RE + +.sp +.ne 2 +.na +\fBzio_object\fR +.ad +.RS 12n +The object number for a given I/O. +.RE + +.sp +.ne 2 +.na +\fBzio_level\fR +.ad +.RS 12n +The indirect level for the block. Level 0 is the lowest level and includes +data blocks. Values > 0 indicate metadata blocks at the appropriate level. +.RE + +.sp +.ne 2 +.na +\fBzio_blkid\fR +.ad +.RS 12n +The block ID for a given I/O. +.RE + +.sp +.ne 2 +.na +\fBzio_err\fR +.ad +.RS 12n +The errno for a failure when handling a given I/O. The errno is compatible +with \fBerrno\fR(3) with the value for EBADE (0x34) used to indicate ZFS +checksum error. +.RE + +.sp +.ne 2 +.na +\fBzio_offset\fR +.ad +.RS 12n +The offset in bytes of where to write the I/O for the specified vdev. +.RE + +.sp +.ne 2 +.na +\fBzio_size\fR +.ad +.RS 12n +The size in bytes of the I/O. +.RE + +.sp +.ne 2 +.na +\fBzio_flags\fR +.ad +.RS 12n +The current flags describing how the I/O should be handled. See the +\fBI/O FLAGS\fR section for the full list of I/O flags. +.RE + +.sp +.ne 2 +.na +\fBzio_stage\fR +.ad +.RS 12n +The current stage of the I/O in the pipeline. See the \fBI/O STAGES\fR +section for a full list of all the I/O stages. +.RE + +.sp +.ne 2 +.na +\fBzio_pipeline\fR +.ad +.RS 12n +The valid pipeline stages for the I/O. See the \fBI/O STAGES\fR section for a +full list of all the I/O stages. 
+.RE + +.sp +.ne 2 +.na +\fBzio_delay\fR +.ad +.RS 12n +The time elapsed (in nanoseconds) waiting for the block layer to complete the +I/O. Unlike \fBzio_delta\fR this does not include any vdev queuing time and is +therefore solely a measure of the block layer performance. +.RE + +.sp +.ne 2 +.na +\fBzio_timestamp\fR +.ad +.RS 12n +The time when a given I/O was submitted. +.RE + +.sp +.ne 2 +.na +\fBzio_delta\fR +.ad +.RS 12n +The time required to service a given I/O. +.RE + +.sp +.ne 2 +.na +\fBprev_state\fR +.ad +.RS 12n +The previous state of the vdev. +.RE + +.sp +.ne 2 +.na +\fBcksum_expected\fR +.ad +.RS 12n +The expected checksum value for the block. +.RE + +.sp +.ne 2 +.na +\fBcksum_actual\fR +.ad +.RS 12n +The actual checksum value for an errant block. +.RE + +.sp +.ne 2 +.na +\fBcksum_algorithm\fR +.ad +.RS 12n +Checksum algorithm used. See \fBzfs\fR(8) for more information on checksum +algorithms available. +.RE + +.sp +.ne 2 +.na +\fBcksum_byteswap\fR +.ad +.RS 12n +Whether or not the data is byteswapped. +.RE + +.sp +.ne 2 +.na +\fBbad_ranges\fR +.ad +.RS 12n +[start, end) pairs of corruption offsets. Offsets are always aligned on a +64-bit boundary, and can include some gaps of non-corruption. +(See \fBbad_ranges_min_gap\fR) +.RE + +.sp +.ne 2 +.na +\fBbad_ranges_min_gap\fR +.ad +.RS 12n +In order to bound the size of the \fBbad_ranges\fR array, gaps of non-corruption +less than or equal to \fBbad_ranges_min_gap\fR bytes have been merged with +adjacent corruption. Always at least 8 bytes, since corruption is detected +on a 64-bit word basis. +.RE + +.sp +.ne 2 +.na +\fBbad_range_sets\fR +.ad +.RS 12n +This array has one element per range in \fBbad_ranges\fR. Each element contains +the count of bits in that range which were clear in the good data and set +in the bad data. +.RE + +.sp +.ne 2 +.na +\fBbad_range_clears\fR +.ad +.RS 12n +This array has one element per range in \fBbad_ranges\fR. Each element contains +the count of bits for that range which were set in the good data and clear in +the bad data. +.RE + +.sp +.ne 2 +.na +\fBbad_set_bits\fR +.ad +.RS 12n +If this field exists, it is an array of: (bad data & ~(good data)); that is, +the bits set in the bad data which are cleared in the good data. Each element +corresponds a byte whose offset is in a range in \fBbad_ranges\fR, and the +array is ordered by offset. Thus, the first element is the first byte in the +first \fBbad_ranges\fR range, and the last element is the last byte in the last +\fBbad_ranges\fR range. +.RE + +.sp +.ne 2 +.na +\fBbad_cleared_bits\fR +.ad +.RS 12n +Like \fBbad_set_bits\fR, but contains: (good data & ~(bad data)); that is, +the bits set in the good data which are cleared in the bad data. +.RE + +.sp +.ne 2 +.na +\fBbad_set_histogram\fR +.ad +.RS 12n +If this field exists, it is an array of counters. Each entry counts bits set +in a particular bit of a big-endian uint64 type. The first entry counts bits +set in the high-order bit of the first byte, the 9th byte, etc, and the last +entry counts bits set of the low-order bit of the 8th byte, the 16th byte, etc. +This information is useful for observing a stuck bit in a parallel data path, +such as IDE or parallel SCSI. +.RE + +.sp +.ne 2 +.na +\fBbad_cleared_histogram\fR +.ad +.RS 12n +If this field exists, it is an array of counters. Each entry counts bit clears +in a particular bit of a big-endian uint64 type. 
The first entry counts bits +clears of the high-order bit of the first byte, the 9th byte, etc, and the +last entry counts clears of the low-order bit of the 8th byte, the 16th byte, +etc. This information is useful for observing a stuck bit in a parallel data +path, such as IDE or parallel SCSI. +.RE + +.SS "I/O STAGES" +.sp +.LP +The ZFS I/O pipeline is comprised of various stages which are defined +below. The individual stages are used to construct these basic I/O +operations: Read, Write, Free, Claim, and Ioctl. These stages may be +set on an event to describe the life cycle of a given I/O. + +.TS +tab(:); +l l l . +Stage:Bit Mask:Operations +_:_:_ +ZIO_STAGE_OPEN:0x00000001:RWFCI + +ZIO_STAGE_READ_BP_INIT:0x00000002:R---- +ZIO_STAGE_WRITE_BP_INIT:0x00000004:-W--- +ZIO_STAGE_FREE_BP_INIT:0x00000008:--F-- +ZIO_STAGE_ISSUE_ASYNC:0x00000010:RWF-- +ZIO_STAGE_WRITE_COMPRESS:0x00000020:-W--- + +ZIO_STAGE_ENCRYPT:0x00000040:-W--- +ZIO_STAGE_CHECKSUM_GENERATE:0x00000080:-W--- + +ZIO_STAGE_NOP_WRITE:0x00000100:-W--- + +ZIO_STAGE_DDT_READ_START:0x00000200:R---- +ZIO_STAGE_DDT_READ_DONE:0x00000400:R---- +ZIO_STAGE_DDT_WRITE:0x00000800:-W--- +ZIO_STAGE_DDT_FREE:0x00001000:--F-- + +ZIO_STAGE_GANG_ASSEMBLE:0x00002000:RWFC- +ZIO_STAGE_GANG_ISSUE:0x00004000:RWFC- + +ZIO_STAGE_DVA_THROTTLE:0x00008000:-W--- +ZIO_STAGE_DVA_ALLOCATE:0x00010000:-W--- +ZIO_STAGE_DVA_FREE:0x00020000:--F-- +ZIO_STAGE_DVA_CLAIM:0x00040000:---C- + +ZIO_STAGE_READY:0x00080000:RWFCI + +ZIO_STAGE_VDEV_IO_START:0x00100000:RW--I +ZIO_STAGE_VDEV_IO_DONE:0x00200000:RW--I +ZIO_STAGE_VDEV_IO_ASSESS:0x00400000:RW--I + +ZIO_STAGE_CHECKSUM_VERIFY:0x00800000:R---- + +ZIO_STAGE_DONE:0x01000000:RWFCI +.TE + +.SS "I/O FLAGS" +.sp +.LP +Every I/O in the pipeline contains a set of flags which describe its +function and are used to govern its behavior. These flags will be set +in an event as an \fBzio_flags\fR payload entry. + +.TS +tab(:); +l l . +Flag:Bit Mask +_:_ +ZIO_FLAG_DONT_AGGREGATE:0x00000001 +ZIO_FLAG_IO_REPAIR:0x00000002 +ZIO_FLAG_SELF_HEAL:0x00000004 +ZIO_FLAG_RESILVER:0x00000008 +ZIO_FLAG_SCRUB:0x00000010 +ZIO_FLAG_SCAN_THREAD:0x00000020 +ZIO_FLAG_PHYSICAL:0x00000040 + +ZIO_FLAG_CANFAIL:0x00000080 +ZIO_FLAG_SPECULATIVE:0x00000100 +ZIO_FLAG_CONFIG_WRITER:0x00000200 +ZIO_FLAG_DONT_RETRY:0x00000400 +ZIO_FLAG_DONT_CACHE:0x00000800 +ZIO_FLAG_NODATA:0x00001000 +ZIO_FLAG_INDUCE_DAMAGE:0x00002000 + +ZIO_FLAG_IO_ALLOCATING:0x00004000 +ZIO_FLAG_IO_RETRY:0x00008000 +ZIO_FLAG_PROBE:0x00010000 +ZIO_FLAG_TRYHARD:0x00020000 +ZIO_FLAG_OPTIONAL:0x00040000 + +ZIO_FLAG_DONT_QUEUE:0x00080000 +ZIO_FLAG_DONT_PROPAGATE:0x00100000 +ZIO_FLAG_IO_BYPASS:0x00200000 +ZIO_FLAG_IO_REWRITE:0x00400000 +ZIO_FLAG_RAW_COMPRESS:0x00800000 +ZIO_FLAG_RAW_ENCRYPT:0x01000000 + +ZIO_FLAG_GANG_CHILD:0x02000000 +ZIO_FLAG_DDT_CHILD:0x04000000 +ZIO_FLAG_GODFATHER:0x08000000 +ZIO_FLAG_NOPWRITE:0x10000000 +ZIO_FLAG_REEXECUTED:0x20000000 +ZIO_FLAG_DELEGATED:0x40000000 +ZIO_FLAG_FASTWRITE:0x80000000 +.TE diff --git a/man/man5/zfs-module-parameters.5 b/man/man5/zfs-module-parameters.5 new file mode 100644 index 000000000000..853e8fc9442c --- /dev/null +++ b/man/man5/zfs-module-parameters.5 @@ -0,0 +1,4084 @@ +'\" te +.\" Copyright (c) 2013 by Turbo Fredriksson . All rights reserved. +.\" Copyright (c) 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2019 Datto Inc. +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. 
You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.TH ZFS-MODULE-PARAMETERS 5 "Feb 15, 2019" +.SH NAME +zfs\-module\-parameters \- ZFS module parameters +.SH DESCRIPTION +.sp +.LP +Description of the different parameters to the ZFS module. + +.SS "Module parameters" +.sp +.LP + +.sp +.ne 2 +.na +\fBdbuf_cache_max_bytes\fR (ulong) +.ad +.RS 12n +Maximum size in bytes of the dbuf cache. The target size is determined by the +MIN versus \fB1/2^dbuf_cache_shift\fR (1/32) of the target ARC size. The +behavior of the dbuf cache and its associated settings can be observed via the +\fB/proc/spl/kstat/zfs/dbufstats\fR kstat. +.sp +Default value: \fBULONG_MAX\fR. +.RE + +.sp +.ne 2 +.na +\fBdbuf_metadata_cache_max_bytes\fR (ulong) +.ad +.RS 12n +Maximum size in bytes of the metadata dbuf cache. The target size is +determined by the MIN versus \fB1/2^dbuf_metadata_cache_shift\fR (1/64) of the +target ARC size. The behavior of the metadata dbuf cache and its associated +settings can be observed via the \fB/proc/spl/kstat/zfs/dbufstats\fR kstat. +.sp +Default value: \fBULONG_MAX\fR. +.RE + +.sp +.ne 2 +.na +\fBdbuf_cache_hiwater_pct\fR (uint) +.ad +.RS 12n +The percentage over \fBdbuf_cache_max_bytes\fR when dbufs must be evicted +directly. +.sp +Default value: \fB10\fR%. +.RE + +.sp +.ne 2 +.na +\fBdbuf_cache_lowater_pct\fR (uint) +.ad +.RS 12n +The percentage below \fBdbuf_cache_max_bytes\fR when the evict thread stops +evicting dbufs. +.sp +Default value: \fB10\fR%. +.RE + +.sp +.ne 2 +.na +\fBdbuf_cache_shift\fR (int) +.ad +.RS 12n +Set the size of the dbuf cache, \fBdbuf_cache_max_bytes\fR, to a log2 fraction +of the target ARC size. +.sp +Default value: \fB5\fR. +.RE + +.sp +.ne 2 +.na +\fBdbuf_metadata_cache_shift\fR (int) +.ad +.RS 12n +Set the size of the dbuf metadata cache, \fBdbuf_metadata_cache_max_bytes\fR, +to a log2 fraction of the target ARC size. +.sp +Default value: \fB6\fR. +.RE + +.sp +.ne 2 +.na +\fBdmu_object_alloc_chunk_shift\fR (int) +.ad +.RS 12n +dnode slots allocated in a single operation as a power of 2. The default value +minimizes lock contention for the bulk operation performed. +.sp +Default value: \fB7\fR (128). +.RE + +.sp +.ne 2 +.na +\fBdmu_prefetch_max\fR (int) +.ad +.RS 12n +Limit the amount we can prefetch with one call to this amount (in bytes). +This helps to limit the amount of memory that can be used by prefetching. +.sp +Default value: \fB134,217,728\fR (128MB). +.RE + +.sp +.ne 2 +.na +\fBignore_hole_birth\fR (int) +.ad +.RS 12n +This is an alias for \fBsend_holes_without_birth_time\fR. +.RE + +.sp +.ne 2 +.na +\fBl2arc_feed_again\fR (int) +.ad +.RS 12n +Turbo L2ARC warm-up. When the L2ARC is cold the fill interval will be set as +fast as possible. +.sp +Use \fB1\fR for yes (default) and \fB0\fR to disable. +.RE + +.sp +.ne 2 +.na +\fBl2arc_feed_min_ms\fR (ulong) +.ad +.RS 12n +Min feed interval in milliseconds. Requires \fBl2arc_feed_again=1\fR and only +applicable in related situations. +.sp +Default value: \fB200\fR. 
+
+.sp
+.ne 2
+.na
+\fBl2arc_feed_secs\fR (ulong)
+.ad
+.RS 12n
+Seconds between L2ARC writes.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_headroom\fR (ulong)
+.ad
+.RS 12n
+How far through the ARC lists to search for L2ARC cacheable content, expressed
+as a multiplier of \fBl2arc_write_max\fR.
+ARC persistence across reboots can be achieved with persistent L2ARC by setting
+this parameter to \fB0\fR, allowing the full length of ARC lists to be searched
+for cacheable content.
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_headroom_boost\fR (ulong)
+.ad
+.RS 12n
+Scales \fBl2arc_headroom\fR by this percentage when L2ARC contents are being
+successfully compressed before writing. A value of \fB100\fR disables this
+feature.
+.sp
+Default value: \fB200\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_trim_ahead\fR (ulong)
+.ad
+.RS 12n
+Trims ahead of the current write size (\fBl2arc_write_max\fR) on L2ARC devices
+by this percentage of write size if we have filled the device. If set to
+\fB100\fR we TRIM twice the space required to accommodate upcoming writes. A
+minimum of 64MB will be trimmed. It also enables TRIM of the whole L2ARC device
+upon creation or addition to an existing pool or if the header of the device is
+invalid upon importing a pool or onlining a cache device. A value of \fB0\fR
+disables TRIM on L2ARC altogether and is the default as it can put significant
+stress on the underlying storage devices. This will vary depending on how well
+the specific device handles these commands.
+.sp
+Default value: \fB0\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_noprefetch\fR (int)
+.ad
+.RS 12n
+Do not write buffers to L2ARC if they were prefetched but not used by
+applications.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR to disable.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_norw\fR (int)
+.ad
+.RS 12n
+No reads during writes.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_write_boost\fR (ulong)
+.ad
+.RS 12n
+Cold L2ARC devices will have \fBl2arc_write_max\fR increased by this amount
+while they remain cold.
+.sp
+Default value: \fB8,388,608\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_write_max\fR (ulong)
+.ad
+.RS 12n
+Max write bytes per interval.
+.sp
+Default value: \fB8,388,608\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_rebuild_enabled\fR (int)
+.ad
+.RS 12n
+Rebuild the L2ARC when importing a pool (persistent L2ARC). This can be
+disabled if there are problems importing a pool or attaching an L2ARC device
+(e.g. the L2ARC device is slow in reading stored log metadata, or the metadata
+has become somehow fragmented/unusable).
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBl2arc_rebuild_blocks_min_l2size\fR (ulong)
+.ad
+.RS 12n
+Min size (in bytes) of an L2ARC device required in order to write log blocks
+in it. The log blocks are used upon importing the pool to rebuild
+the L2ARC (persistent L2ARC). Rationale: for L2ARC devices less than 1GB, the
+amount of data l2arc_evict() evicts is significant compared to the amount of
+restored L2ARC data. In this case do not write log blocks in L2ARC in order not
+to waste space.
+.sp
+Default value: \fB1,073,741,824\fR (1GB).
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_aliquot\fR (ulong)
+.ad
+.RS 12n
+Metaslab granularity, in bytes. This is roughly similar to what would be
+referred to as the "stripe size" in traditional RAID arrays. In normal
+operation, ZFS will try to write this amount of data to a top-level vdev
+before moving on to the next one.
+.sp
+Default value: \fB524,288\fR.
+.RE
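+.sp
+.LP
+To make the preceding L2ARC write tunables concrete, a simplified sketch of
+a single feed pass budget follows (the helper is hypothetical; the actual
+logic lives in \fBarc.c\fR):
+.sp
+.nf
+#include <stdint.h>
+
+extern uint64_t l2arc_write_max, l2arc_write_boost;
+extern uint64_t l2arc_headroom, l2arc_headroom_boost;
+
+/* Bytes to write this pass; also derives the ARC scan headroom. */
+static uint64_t
+l2arc_pass_budget(int dev_is_cold, int compressing, uint64_t *headroom)
+{
+	/* Cold devices get l2arc_write_max plus the warm-up boost. */
+	uint64_t budget = l2arc_write_max +
+	    (dev_is_cold ? l2arc_write_boost : 0);
+
+	/* Scan depth is a multiple of the budget (0 means scan fully). */
+	*headroom = budget * l2arc_headroom;
+	if (compressing)
+		*headroom = *headroom * l2arc_headroom_boost / 100;
+	return (budget);
+}
+.fi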
+
+.sp
+.ne 2
+.na
+\fBmetaslab_bias_enabled\fR (int)
+.ad
+.RS 12n
+Enable metaslab group biasing based on its vdev's over- or under-utilization
+relative to the pool.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_force_ganging\fR (ulong)
+.ad
+.RS 12n
+Make some blocks above a certain size be gang blocks. This option is used
+by the test suite to facilitate testing.
+.sp
+Default value: \fB16,777,217\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_keep_log_spacemaps_at_export\fR (int)
+.ad
+.RS 12n
+Prevent log spacemaps from being destroyed during pool exports and destroys.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_segment_weight_enabled\fR (int)
+.ad
+.RS 12n
+Enable/disable segment-based metaslab selection.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_metaslab_switch_threshold\fR (int)
+.ad
+.RS 12n
+When using segment-based metaslab selection, continue allocating
+from the active metaslab until \fBzfs_metaslab_switch_threshold\fR
+worth of buckets have been exhausted.
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_debug_load\fR (int)
+.ad
+.RS 12n
+Load all metaslabs during pool import.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_debug_unload\fR (int)
+.ad
+.RS 12n
+Prevent metaslabs from being unloaded.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_fragmentation_factor_enabled\fR (int)
+.ad
+.RS 12n
+Enable use of the fragmentation metric in computing metaslab weights.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_df_max_search\fR (int)
+.ad
+.RS 12n
+Maximum distance to search forward from the last offset. Without this limit,
+fragmented pools can see >100,000 iterations and metaslab_block_picker()
+becomes the performance limiting factor on high-performance storage.
+
+With the default setting of 16MB, we typically see less than 500 iterations,
+even with very fragmented, ashift=9 pools. The maximum number of iterations
+possible is: \fBmetaslab_df_max_search / (2 * (1<<ashift))\fR.
+.sp
+Default value: \fB16,777,216\fR (16MB).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_max_auto_ashift\fR (ulong)
+.ad
+.RS 12n
+Maximum ashift used when optimizing for logical -> physical sector size on new
+top-level vdevs.
+.sp
+Default value: \fBASHIFT_MAX\fR (16).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_min_auto_ashift\fR (ulong)
+.ad
+.RS 12n
+Minimum ashift used when creating new top-level vdevs.
+.sp
+Default value: \fBASHIFT_MIN\fR (9).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_min_ms_count\fR (int)
+.ad
+.RS 12n
+Minimum number of metaslabs to create in a top-level vdev.
+.sp
+Default value: \fB16\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev_validate_skip\fR (int)
+.ad
+.RS 12n
+Skip label validation steps during pool import. Changing is not recommended
+unless you know what you are doing and are recovering a damaged label.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_ms_count_limit\fR (int)
+.ad
+.RS 12n
+Practical upper limit of total metaslabs per top-level vdev.
+.sp
+Default value: \fB131,072\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_preload_enabled\fR (int)
+.ad
+.RS 12n
+Enable metaslab group preloading.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmetaslab_lba_weighting_enabled\fR (int)
+.ad
+.RS 12n
+Give more weight to metaslabs with lower LBAs, assuming they have
+greater bandwidth as is typically the case on a modern constant
+angular velocity disk drive.
+.sp +Use \fB1\fR for yes (default) and \fB0\fR for no. +.RE + +.sp +.ne 2 +.na +\fBmetaslab_unload_delay\fR (int) +.ad +.RS 12n +After a metaslab is used, we keep it loaded for this many txgs, to attempt to +reduce unnecessary reloading. Note that both this many txgs and +\fBmetaslab_unload_delay_ms\fR milliseconds must pass before unloading will +occur. +.sp +Default value: \fB32\fR. +.RE + +.sp +.ne 2 +.na +\fBmetaslab_unload_delay_ms\fR (int) +.ad +.RS 12n +After a metaslab is used, we keep it loaded for this many milliseconds, to +attempt to reduce unnecessary reloading. Note that both this many +milliseconds and \fBmetaslab_unload_delay\fR txgs must pass before unloading +will occur. +.sp +Default value: \fB600000\fR (ten minutes). +.RE + +.sp +.ne 2 +.na +\fBsend_holes_without_birth_time\fR (int) +.ad +.RS 12n +When set, the hole_birth optimization will not be used, and all holes will +always be sent on zfs send. This is useful if you suspect your datasets are +affected by a bug in hole_birth. +.sp +Use \fB1\fR for on (default) and \fB0\fR for off. +.RE + +.sp +.ne 2 +.na +\fBspa_config_path\fR (charp) +.ad +.RS 12n +SPA config file +.sp +Default value: \fB/etc/zfs/zpool.cache\fR. +.RE + +.sp +.ne 2 +.na +\fBspa_asize_inflation\fR (int) +.ad +.RS 12n +Multiplication factor used to estimate actual disk consumption from the +size of data being written. The default value is a worst case estimate, +but lower values may be valid for a given pool depending on its +configuration. Pool administrators who understand the factors involved +may wish to specify a more realistic inflation factor, particularly if +they operate close to quota or capacity limits. +.sp +Default value: \fB24\fR. +.RE + +.sp +.ne 2 +.na +\fBspa_load_print_vdev_tree\fR (int) +.ad +.RS 12n +Whether to print the vdev tree in the debugging message buffer during pool import. +Use 0 to disable and 1 to enable. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBspa_load_verify_data\fR (int) +.ad +.RS 12n +Whether to traverse data blocks during an "extreme rewind" (\fB-X\fR) +import. Use 0 to disable and 1 to enable. + +An extreme rewind import normally performs a full traversal of all +blocks in the pool for verification. If this parameter is set to 0, +the traversal skips non-metadata blocks. It can be toggled once the +import has started to stop or start the traversal of non-metadata blocks. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBspa_load_verify_metadata\fR (int) +.ad +.RS 12n +Whether to traverse blocks during an "extreme rewind" (\fB-X\fR) +pool import. Use 0 to disable and 1 to enable. + +An extreme rewind import normally performs a full traversal of all +blocks in the pool for verification. If this parameter is set to 0, +the traversal is not performed. It can be toggled once the import has +started to stop or start the traversal. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBspa_load_verify_shift\fR (int) +.ad +.RS 12n +Sets the maximum number of bytes to consume during pool import to the log2 +fraction of the target ARC size. +.sp +Default value: \fB4\fR. +.RE + +.sp +.ne 2 +.na +\fBspa_slop_shift\fR (int) +.ad +.RS 12n +Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space +in the pool to be consumed. This ensures that we don't run the pool +completely out of space, due to unaccounted changes (e.g. to the MOS). +It also limits the worst-case time to allocate space. If we have +less than this amount of free space, most ZPL operations (e.g. 
write,
+create) will return ENOSPC.
+.sp
+Default value: \fB5\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBvdev_removal_max_span\fR (int)
+.ad
+.RS 12n
+During top-level vdev removal, chunks of data are copied from the vdev
+which may include free space in order to trade bandwidth for IOPS.
+This parameter determines the maximum span of free space (in bytes)
+which will be included as "unnecessary" data in a chunk of copied data.
+
+The default value here was chosen to align with
+\fBzfs_vdev_read_gap_limit\fR, which is a similar concept when doing
+regular reads (but there's no reason it has to be the same).
+.sp
+Default value: \fB32,768\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzap_iterate_prefetch\fR (int)
+.ad
+.RS 12n
+If this is set, when we start iterating over a ZAP object, zfs will prefetch
+the entire object (all leaf blocks). However, this is limited by
+\fBdmu_prefetch_max\fR.
+.sp
+Use \fB1\fR for on (default) and \fB0\fR for off.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_array_rd_sz\fR (ulong)
+.ad
+.RS 12n
+If prefetching is enabled, disable prefetching for reads larger than this size.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_max_distance\fR (uint)
+.ad
+.RS 12n
+Max bytes to prefetch per stream (default 8MB).
+.sp
+Default value: \fB8,388,608\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_max_streams\fR (uint)
+.ad
+.RS 12n
+Max number of streams per zfetch (prefetch streams per file).
+.sp
+Default value: \fB8\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfetch_min_sec_reap\fR (uint)
+.ad
+.RS 12n
+Min time before an active prefetch stream can be reclaimed.
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_abd_scatter_enabled\fR (int)
+.ad
+.RS 12n
+Controls whether ARC buffers may be backed by scatter/gather lists. When
+disabled, all allocations are forced to be linear in kernel memory.
+Disabling can improve performance in some code paths at the expense of
+fragmented kernel memory.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_abd_scatter_max_order\fR (uint)
+.ad
+.RS 12n
+Maximum number of consecutive memory pages allocated in a single block for
+scatter/gather lists. Default value is specified by the kernel itself.
+.sp
+Default value: \fB10\fR at the time of this writing.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_abd_scatter_min_size\fR (uint)
+.ad
+.RS 12n
+This is the minimum allocation size that will use scatter (page-based)
+ABD's. Smaller allocations will use linear ABD's.
+.sp
+Default value: \fB1536\fR (512B and 1KB allocations will be linear).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_dnode_limit\fR (ulong)
+.ad
+.RS 12n
+When the number of bytes consumed by dnodes in the ARC exceeds this number of
+bytes, try to unpin some of it in response to demand for non-metadata. This
+value acts as a ceiling to the amount of dnode metadata. It defaults to 0,
+which indicates that the fraction of ARC meta buffers given by
+\fBzfs_arc_dnode_limit_percent\fR may be used for dnodes.
+
+See also \fBzfs_arc_meta_prune\fR which serves a similar purpose but is used
+when the amount of metadata in the ARC exceeds \fBzfs_arc_meta_limit\fR rather
+than in response to overall demand for non-metadata.
+
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_dnode_limit_percent\fR (ulong)
+.ad
+.RS 12n
+Percentage of ARC meta buffers that can be consumed by dnodes.
+.sp
+See also \fBzfs_arc_dnode_limit\fR which serves a similar purpose but has a
+higher priority if set to a non-zero value.
+.sp
+Default value: \fB10\fR%.
+.RE
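+.sp
+.LP
+The precedence between the two dnode tunables above can be summarized with a
+small sketch (hypothetical helper; the percentage is taken of the ARC meta
+limit as described under \fBzfs_arc_dnode_limit\fR):
+.sp
+.nf
+#include <stdint.h>
+
+extern uint64_t zfs_arc_dnode_limit, zfs_arc_dnode_limit_percent;
+
+/* Effective ceiling on dnode bytes in the ARC. */
+static uint64_t
+arc_dnode_ceiling(uint64_t arc_meta_limit)
+{
+	/* An explicit byte limit takes priority over the percentage. */
+	if (zfs_arc_dnode_limit != 0)
+		return (zfs_arc_dnode_limit);
+	return (arc_meta_limit * zfs_arc_dnode_limit_percent / 100);
+}
+.fi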
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_dnode_reduce_percent\fR (ulong)
+.ad
+.RS 12n
+Percentage of ARC dnodes to try to scan in response to demand for non-metadata
+when the number of bytes consumed by dnodes exceeds \fBzfs_arc_dnode_limit\fR.
+
+.sp
+Default value: \fB10\fR% of the number of dnodes in the ARC.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_average_blocksize\fR (int)
+.ad
+.RS 12n
+The ARC's buffer hash table is sized based on the assumption of an average
+block size of \fBzfs_arc_average_blocksize\fR (default 8K). This works out
+to roughly 1MB of hash table per 1GB of physical memory with 8-byte pointers.
+For configurations with a known larger average block size this value can be
+increased to reduce the memory footprint.
+
+.sp
+Default value: \fB8192\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_eviction_pct\fR (int)
+.ad
+.RS 12n
+When \fBarc_is_overflowing()\fR, \fBarc_get_data_impl()\fR waits for this
+percent of the requested amount of data to be evicted. For example, by
+default for every 2KB that's evicted, 1KB of it may be "reused" by a new
+allocation. Since this is above 100%, it ensures that progress is made
+towards getting \fBarc_size\fR under \fBarc_c\fR. Since this is finite, it
+ensures that allocations can still happen, even during the potentially long
+time that \fBarc_size\fR is more than \fBarc_c\fR.
+.sp
+Default value: \fB200\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_evict_batch_limit\fR (int)
+.ad
+.RS 12n
+Number of ARC headers to evict per sub-list before proceeding to another
+sub-list. This batch-style operation prevents entire sub-lists from being
+evicted at once but comes at a cost of additional unlocking and locking.
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_grow_retry\fR (int)
+.ad
+.RS 12n
+If set to a non-zero value, it will replace the arc_grow_retry value with this
+value. The arc_grow_retry value (default 5) is the number of seconds the ARC
+will wait before trying to resume growth after a memory pressure event.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_lotsfree_percent\fR (int)
+.ad
+.RS 12n
+Throttle I/O when free system memory drops below this percentage of total
+system memory. Setting this value to 0 will disable the throttle.
+.sp
+Default value: \fB10\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_max\fR (ulong)
+.ad
+.RS 12n
+Max size of ARC in bytes. If set to 0 then the max size of ARC is determined
+by the amount of system memory installed. For Linux, 1/2 of system memory will
+be used as the limit. For FreeBSD, the larger of all system memory - 1GB or
+5/8 of system memory will be used as the limit. This value must be at least
+67108864 (64 megabytes).
+.sp
+This value can be changed dynamically with some caveats. It cannot be set back
+to 0 while running and reducing it below the current ARC size will not cause
+the ARC to shrink without memory pressure to induce shrinking.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_adjust_restarts\fR (ulong)
+.ad
+.RS 12n
+The number of restart passes to make while scanning the ARC attempting
+to free buffers in order to stay below the \fBzfs_arc_meta_limit\fR.
+This value should not need to be tuned but is available to facilitate
+performance analysis.
+.sp
+Default value: \fB4096\fR.
+.RE
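+.sp
+.LP
+The platform-specific default for \fBzfs_arc_max\fR described above reduces
+to a few lines (sketch only; the 64MB floor and edge cases are omitted):
+.sp
+.nf
+#include <stdint.h>
+
+/* Default ARC maximum when zfs_arc_max is left at 0. */
+static uint64_t
+default_arc_max(uint64_t physmem, int is_linux)
+{
+	if (is_linux)
+		return (physmem / 2);	/* Linux: 1/2 of system memory */
+	/* FreeBSD: the larger of physmem - 1GB and 5/8 of physmem. */
+	uint64_t a = physmem - (1ULL << 30);
+	uint64_t b = physmem * 5 / 8;
+	return (a > b ? a : b);
+}
+.fi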
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_limit\fR (ulong)
+.ad
+.RS 12n
+The maximum size in bytes that meta data buffers are allowed to
+consume in the ARC. When this limit is reached meta data buffers will
+be reclaimed even if the overall arc_c_max has not been reached. This
+value defaults to 0, which indicates that the fraction of the ARC given by
+\fBzfs_arc_meta_limit_percent\fR may be used for meta data.
+.sp
+This value may be changed dynamically, except that it cannot be set back to 0
+for a specific percent of the ARC; it must be set to an explicit value.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_limit_percent\fR (ulong)
+.ad
+.RS 12n
+Percentage of ARC buffers that can be used for meta data.
+
+See also \fBzfs_arc_meta_limit\fR which serves a similar purpose but has a
+higher priority if set to a non-zero value.
+
+.sp
+Default value: \fB75\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_min\fR (ulong)
+.ad
+.RS 12n
+The minimum allowed size in bytes that meta data buffers may consume in
+the ARC. This value defaults to 0 which disables a floor on the amount
+of the ARC devoted to meta data.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_prune\fR (int)
+.ad
+.RS 12n
+The number of dentries and inodes to be scanned looking for entries
+which can be dropped. This may be required when the ARC reaches the
+\fBzfs_arc_meta_limit\fR because dentries and inodes can pin buffers
+in the ARC. Increasing this value will cause the dentry and inode caches
+to be pruned more aggressively. Setting this value to 0 will disable
+pruning the inode and dentry caches.
+.sp
+Default value: \fB10,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_meta_strategy\fR (int)
+.ad
+.RS 12n
+Define the strategy for ARC meta data buffer eviction (meta reclaim strategy).
+A value of 0 (META_ONLY) will evict only the ARC meta data buffers.
+A value of 1 (BALANCED) indicates that additional data buffers may be evicted
+if that is required in order to evict the required number of meta data buffers.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min\fR (ulong)
+.ad
+.RS 12n
+Min size of ARC in bytes. If set to 0 then arc_c_min will default to
+consuming the larger of 32M or 1/32 of total system memory.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min_prefetch_ms\fR (int)
+.ad
+.RS 12n
+Minimum time prefetched blocks are locked in the ARC, specified in ms.
+A value of \fB0\fR will default to 1000 ms.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_min_prescient_prefetch_ms\fR (int)
+.ad
+.RS 12n
+Minimum time "prescient prefetched" blocks are locked in the ARC, specified
+in ms. These blocks are meant to be prefetched fairly aggressively ahead of
+the code that may use them. A value of \fB0\fR will default to 6000 ms.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_max_missing_tvds\fR (int)
+.ad
+.RS 12n
+Number of missing top-level vdevs which will be allowed during
+pool import (only in read-only mode).
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_max_nvlist_src_size\fR (ulong)
+.ad
+.RS 12n
+Maximum size in bytes allowed to be passed as zc_nvlist_src_size for ioctls on
+/dev/zfs. This prevents a user from causing the kernel to allocate an excessive
+amount of memory. When the limit is exceeded, the ioctl fails with EINVAL and a
+description of the error is sent to the zfs-dbgmsg log. This parameter should
+not need to be touched under normal circumstances. On FreeBSD, the default is
+based on the system limit on user wired memory. On Linux, the default is
+\fBKMALLOC_MAX_SIZE\fR.
+.sp
+Default value: \fB0\fR (kernel decides).
+.RE
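+.sp
+.LP
+The \fBzfs_max_nvlist_src_size\fR guard above amounts to a simple bounds
+check (sketch only; error reporting to the zfs-dbgmsg log is omitted):
+.sp
+.nf
+#include <errno.h>
+#include <stdint.h>
+
+/* Reject oversized ioctl nvlists with EINVAL, as described above. */
+static int
+check_nvlist_src_size(uint64_t zc_nvlist_src_size, uint64_t limit)
+{
+	if (zc_nvlist_src_size > limit)
+		return (EINVAL);
+	return (0);
+}
+.fi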
+
+.sp
+.ne 2
+.na
+\fBzfs_multilist_num_sublists\fR (int)
+.ad
+.RS 12n
+To allow more fine-grained locking, each ARC state contains a series
+of lists for both data and meta data objects. Locking is performed at
+the level of these "sub-lists". This parameter controls the number of
+sub-lists per ARC state, and also applies to other uses of the
+multilist data structure.
+.sp
+Default value: \fB4\fR or the number of online CPUs, whichever is greater.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_overflow_shift\fR (int)
+.ad
+.RS 12n
+The ARC size is considered to be overflowing if it exceeds the current
+ARC target size (arc_c) by a threshold determined by this parameter.
+The threshold is calculated as a fraction of arc_c using the formula
+"arc_c >> \fBzfs_arc_overflow_shift\fR".
+
+The default value of 8 causes the ARC to be considered to be overflowing
+if it exceeds the target size by 1/256th (0.3%) of the target size.
+
+When the ARC is overflowing, new buffer allocations are stalled until
+the reclaim thread catches up and the overflow condition no longer exists.
+.sp
+Default value: \fB8\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_p_min_shift\fR (int)
+.ad
+.RS 12n
+If set to a non-zero value, this will update arc_p_min_shift (default 4)
+with the new value.
+arc_p_min_shift is used as a shift of arc_c when calculating both the minimum
+and maximum values of arc_p.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_p_dampener_disable\fR (int)
+.ad
+.RS 12n
+Disable the arc_p adapt dampener.
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR to disable.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_shrink_shift\fR (int)
+.ad
+.RS 12n
+If set to a non-zero value, this will update arc_shrink_shift (default 7)
+with the new value.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_pc_percent\fR (uint)
+.ad
+.RS 12n
+Percent of pagecache to reclaim the ARC to.
+
+This tunable allows the ZFS ARC to play more nicely with the kernel's LRU
+pagecache. It can guarantee that the ARC size won't collapse under scanning
+pressure on the pagecache, yet still allows the ARC to be reclaimed down to
+zfs_arc_min if necessary. This value is specified as percent of pagecache
+size (as measured by NR_FILE_PAGES) where that percent may exceed 100. This
+only operates during memory pressure/reclaim.
+.sp
+Default value: \fB0\fR% (disabled).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_shrinker_limit\fR (int)
+.ad
+.RS 12n
+This is a limit on how many pages the ARC shrinker makes available for
+eviction in response to one page allocation attempt. Note that in
+practice, the kernel's shrinker can ask us to evict up to about 4x this
+for one allocation attempt.
+.sp
+The default limit of 10,000 (in practice, 160MB per allocation attempt with
+4K pages) limits the amount of time spent attempting to reclaim ARC memory to
+less than 100ms per allocation attempt, even with a small average compressed
+block size of ~8KB.
+.sp
+The parameter can be set to 0 (zero) to disable the limit.
+.sp
+This parameter only applies on Linux.
+.sp
+Default value: \fB10,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_arc_sys_free\fR (ulong)
+.ad
+.RS 12n
+The target number of bytes the ARC should leave as free memory on the system.
+Defaults to the larger of 1/64 of physical memory or 512K. Setting this
+option to a non-zero value will override the default.
+.sp
+Default value: \fB0\fR.
+.RE
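+.sp
+.LP
+The overflow test described under \fBzfs_arc_overflow_shift\fR above can be
+written directly from its formula (sketch only):
+.sp
+.nf
+#include <stdint.h>
+
+/* Overflowing when arc_size exceeds arc_c by arc_c >> shift. */
+static int
+arc_is_overflowing_sketch(uint64_t arc_size, uint64_t arc_c, int shift)
+{
+	/* With the default shift of 8 the threshold is arc_c / 256. */
+	return (arc_size > arc_c + (arc_c >> shift));
+}
+.fi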
+
+.sp
+.ne 2
+.na
+\fBzfs_autoimport_disable\fR (int)
+.ad
+.RS 12n
+Disable pool import at module load by ignoring the cache file (typically
+\fB/etc/zfs/zpool.cache\fR).
+.sp
+Use \fB1\fR for yes (default) and \fB0\fR for no.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_checksum_events_per_second\fR (uint)
+.ad
+.RS 12n
+Rate limit checksum events to this many per second. Note that this should
+not be set below the zed thresholds (currently 10 checksums over 10 sec)
+or else zed may not trigger any action.
+.sp
+Default value: \fB20\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_commit_timeout_pct\fR (int)
+.ad
+.RS 12n
+This controls the amount of time that a ZIL block (lwb) will remain "open"
+when it isn't "full", and it has a thread waiting for it to be committed to
+stable storage. The timeout is scaled based on a percentage of the last lwb
+latency to avoid significantly impacting the latency of each individual
+transaction record (itx).
+.sp
+Default value: \fB5\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_condense_indirect_commit_entry_delay_ms\fR (int)
+.ad
+.RS 12n
+Vdev indirection layer (used for device removal) sleeps for this many
+milliseconds during mapping generation. Intended for use with the test suite
+to throttle vdev removal speed.
+.sp
+Default value: \fB0\fR (no throttle).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_condense_indirect_vdevs_enable\fR (int)
+.ad
+.RS 12n
+Enable condensing indirect vdev mappings. When set to a non-zero value,
+attempt to condense indirect vdev mappings if the mapping uses more than
+\fBzfs_condense_min_mapping_bytes\fR bytes of memory and if the obsolete
+space map object uses more than \fBzfs_condense_max_obsolete_bytes\fR
+bytes on-disk. The condensing process is an attempt to save memory by
+removing obsolete mappings.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_condense_max_obsolete_bytes\fR (ulong)
+.ad
+.RS 12n
+Only attempt to condense indirect vdev mappings if the on-disk size
+of the obsolete space map object is greater than this number of bytes
+(see \fBzfs_condense_indirect_vdevs_enable\fR).
+.sp
+Default value: \fB1,073,741,824\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_condense_min_mapping_bytes\fR (ulong)
+.ad
+.RS 12n
+Minimum size vdev mapping to attempt to condense (see
+\fBzfs_condense_indirect_vdevs_enable\fR).
+.sp
+Default value: \fB131,072\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dbgmsg_enable\fR (int)
+.ad
+.RS 12n
+Internally ZFS keeps a small log to facilitate debugging. By default the log
+is disabled; to enable it, set this option to 1. The contents of the log can
+be accessed by reading the /proc/spl/kstat/zfs/dbgmsg file. Writing 0 to
+this proc file clears the log.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dbgmsg_maxsize\fR (int)
+.ad
+.RS 12n
+The maximum size in bytes of the internal ZFS debug log.
+.sp
+Default value: \fB4M\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dbuf_state_index\fR (int)
+.ad
+.RS 12n
+This feature is currently unused. It is normally used for controlling what
+reporting is available under /proc/spl/kstat/zfs.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_deadman_enabled\fR (int)
+.ad
+.RS 12n
+When a pool sync operation takes longer than \fBzfs_deadman_synctime_ms\fR
+milliseconds, or when an individual I/O takes longer than
+\fBzfs_deadman_ziotime_ms\fR milliseconds, then the operation is considered to
+be "hung". If \fBzfs_deadman_enabled\fR is set then the deadman behavior is
+invoked as described by the \fBzfs_deadman_failmode\fR module option.
+By default the deadman is enabled and configured to \fBwait\fR which results +in "hung" I/Os only being logged. The deadman is automatically disabled +when a pool gets suspended. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_deadman_failmode\fR (charp) +.ad +.RS 12n +Controls the failure behavior when the deadman detects a "hung" I/O. Valid +values are \fBwait\fR, \fBcontinue\fR, and \fBpanic\fR. +.sp +\fBwait\fR - Wait for a "hung" I/O to complete. For each "hung" I/O a +"deadman" event will be posted describing that I/O. +.sp +\fBcontinue\fR - Attempt to recover from a "hung" I/O by re-dispatching it +to the I/O pipeline if possible. +.sp +\fBpanic\fR - Panic the system. This can be used to facilitate an automatic +fail-over to a properly configured fail-over partner. +.sp +Default value: \fBwait\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_deadman_checktime_ms\fR (int) +.ad +.RS 12n +Check time in milliseconds. This defines the frequency at which we check +for hung I/O and potentially invoke the \fBzfs_deadman_failmode\fR behavior. +.sp +Default value: \fB60,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_deadman_synctime_ms\fR (ulong) +.ad +.RS 12n +Interval in milliseconds after which the deadman is triggered and also +the interval after which a pool sync operation is considered to be "hung". +Once this limit is exceeded the deadman will be invoked every +\fBzfs_deadman_checktime_ms\fR milliseconds until the pool sync completes. +.sp +Default value: \fB600,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_deadman_ziotime_ms\fR (ulong) +.ad +.RS 12n +Interval in milliseconds after which the deadman is triggered and an +individual I/O operation is considered to be "hung". As long as the I/O +remains "hung" the deadman will be invoked every \fBzfs_deadman_checktime_ms\fR +milliseconds until the I/O completes. +.sp +Default value: \fB300,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_dedup_prefetch\fR (int) +.ad +.RS 12n +Enable prefetching dedup-ed blks +.sp +Use \fB1\fR for yes and \fB0\fR to disable (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_delay_min_dirty_percent\fR (int) +.ad +.RS 12n +Start to delay each transaction once there is this amount of dirty data, +expressed as a percentage of \fBzfs_dirty_data_max\fR. +This value should be >= zfs_vdev_async_write_active_max_dirty_percent. +See the section "ZFS TRANSACTION DELAY". +.sp +Default value: \fB60\fR%. +.RE + +.sp +.ne 2 +.na +\fBzfs_delay_scale\fR (int) +.ad +.RS 12n +This controls how quickly the transaction delay approaches infinity. +Larger values cause longer delays for a given amount of dirty data. +.sp +For the smoothest delay, this value should be about 1 billion divided +by the maximum number of operations per second. This will smoothly +handle between 10x and 1/10th this number. +.sp +See the section "ZFS TRANSACTION DELAY". +.sp +Note: \fBzfs_delay_scale\fR * \fBzfs_dirty_data_max\fR must be < 2^64. +.sp +Default value: \fB500,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_disable_ivset_guid_check\fR (int) +.ad +.RS 12n +Disables requirement for IVset guids to be present and match when doing a raw +receive of encrypted datasets. Intended for users whose pools were created with +ZFS on Linux pre-release versions and now have compatibility issues. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_key_max_salt_uses\fR (ulong) +.ad +.RS 12n +Maximum number of uses of a single salt value before generating a new one for +encrypted datasets. The default value is also the maximum that will be +accepted. 
+.sp
+Default value: \fB400,000,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_object_mutex_size\fR (uint)
+.ad
+.RS 12n
+Size of the znode hashtable used for holds.
+
+Due to the need to hold locks on objects that may not exist yet, kernel mutexes
+are not created per-object and instead a hashtable is used where collisions
+will result in objects waiting when there is not actually contention on the
+same object.
+.sp
+Default value: \fB64\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_slow_io_events_per_second\fR (int)
+.ad
+.RS 12n
+Rate limit delay zevents (which report slow I/Os) to this many per second.
+.sp
+Default value: \fB20\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_unflushed_max_mem_amt\fR (ulong)
+.ad
+.RS 12n
+Upper-bound limit for unflushed metadata changes to be held by the
+log spacemap in memory (in bytes).
+.sp
+Default value: \fB1,073,741,824\fR (1GB).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_unflushed_max_mem_ppm\fR (ulong)
+.ad
+.RS 12n
+Fraction of the overall system memory that ZFS allows to be used for
+unflushed metadata changes by the log spacemap, expressed in parts per
+million for finer granularity.
+.sp
+Default value: \fB1000\fR (i.e. 1000 parts per million, a limit of
+\fB0.1\fR% of memory).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_unflushed_log_block_max\fR (ulong)
+.ad
+.RS 12n
+Describes the maximum number of log spacemap blocks allowed for each pool.
+The default value of 262144 means that the space in all the log spacemaps
+can add up to no more than 262144 blocks (which means 32GB of logical
+space before compression and ditto blocks, assuming that blocksize is
+128k).
+.sp
+This tunable is important because it involves a trade-off between import
+time after an unclean export and the frequency of flushing metaslabs.
+The higher this number is, the more log blocks we allow when the pool is
+active which means that we flush metaslabs less often and thus decrease
+the number of I/Os for spacemap updates per TXG.
+At the same time though, that means that in the event of an unclean export,
+there will be more log spacemap blocks for us to read, inducing overhead
+in the import time of the pool.
+The lower the number, the more aggressive the flushing becomes and the faster
+log blocks are destroyed as they become obsolete, which leaves fewer blocks
+to be read during import time after a crash.
+.sp
+Each log spacemap block existing during pool import leads to approximately
+one extra logical I/O issued.
+This is the reason why this tunable is exposed in terms of blocks rather
+than space used.
+.sp
+Default value: \fB262144\fR (256K).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_unflushed_log_block_min\fR (ulong)
+.ad
+.RS 12n
+If the number of metaslabs is small and our incoming rate is high, we could
+get into a situation where we are flushing all our metaslabs every TXG.
+Thus we always allow at least this many log blocks.
+.sp
+Default value: \fB1000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_unflushed_log_block_pct\fR (ulong)
+.ad
+.RS 12n
+Tunable used to determine the number of blocks that can be used for
+the spacemap log, expressed as a percentage of the total number of
+metaslabs in the pool.
+.sp
+Default value: \fB400\fR (read as \fB400\fR% - meaning that the number
+of log spacemap blocks is capped at 4 times the number of
+metaslabs in the pool).
+.RE
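+.sp
+.LP
+Assuming the three log spacemap block tunables above compose as a clamped
+percentage (an assumption for illustration; see the log spacemap code for
+the authoritative logic), the per-pool block limit looks like:
+.sp
+.nf
+#include <stdint.h>
+
+extern uint64_t zfs_unflushed_log_block_max;
+extern uint64_t zfs_unflushed_log_block_min;
+extern uint64_t zfs_unflushed_log_block_pct;
+
+/* Cap on log spacemap blocks for a pool with nmetaslabs metaslabs. */
+static uint64_t
+log_sm_block_limit(uint64_t nmetaslabs)
+{
+	uint64_t limit = nmetaslabs * zfs_unflushed_log_block_pct / 100;
+
+	if (limit < zfs_unflushed_log_block_min)
+		limit = zfs_unflushed_log_block_min;
+	if (limit > zfs_unflushed_log_block_max)
+		limit = zfs_unflushed_log_block_max;
+	return (limit);
+}
+.fi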
+
+.sp
+.ne 2
+.na
+\fBzfs_unlink_suspend_progress\fR (uint)
+.ad
+.RS 12n
+When enabled, files will not be asynchronously removed from the list of
+pending unlinks and the space they consume will be leaked. Once this option
+has been disabled and the dataset is remounted, the pending unlinks will be
+processed and the freed space returned to the pool.
+This option is used by the test suite to facilitate testing.
+.sp
+Use \fB0\fR (default) to allow progress and \fB1\fR to pause progress.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_delete_blocks\fR (ulong)
+.ad
+.RS 12n
+This value defines a large file for the purposes of delete. Files
+containing more than \fBzfs_delete_blocks\fR blocks will be deleted
+asynchronously while smaller files are deleted synchronously. Decreasing this
+value will reduce the time spent in an unlink(2) system call at the expense
+of a longer delay before the freed space is available.
+.sp
+Default value: \fB20,480\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max\fR (int)
+.ad
+.RS 12n
+Determines the dirty space limit in bytes. Once this limit is exceeded, new
+writes are halted until space frees up. This parameter takes precedence
+over \fBzfs_dirty_data_max_percent\fR.
+See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: \fB10\fR% of physical RAM, capped at
+\fBzfs_dirty_data_max_max\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max_max\fR (int)
+.ad
+.RS 12n
+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed in bytes.
+This limit is only enforced at module load time, and will be ignored if
+\fBzfs_dirty_data_max\fR is later changed. This parameter takes
+precedence over \fBzfs_dirty_data_max_max_percent\fR. See the section
+"ZFS TRANSACTION DELAY".
+.sp
+Default value: \fB25\fR% of physical RAM.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max_max_percent\fR (int)
+.ad
+.RS 12n
+Maximum allowable value of \fBzfs_dirty_data_max\fR, expressed as a
+percentage of physical RAM. This limit is only enforced at module load
+time, and will be ignored if \fBzfs_dirty_data_max\fR is later changed.
+The parameter \fBzfs_dirty_data_max_max\fR takes precedence over this
+one. See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: \fB25\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_max_percent\fR (int)
+.ad
+.RS 12n
+Determines the dirty space limit, expressed as a percentage of all
+memory. Once this limit is exceeded, new writes are halted until space frees
+up. The parameter \fBzfs_dirty_data_max\fR takes precedence over this
+one. See the section "ZFS TRANSACTION DELAY".
+.sp
+Default value: \fB10\fR%, subject to \fBzfs_dirty_data_max_max\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_dirty_data_sync_percent\fR (int)
+.ad
+.RS 12n
+Start syncing out a transaction group if there's at least this much dirty data
+as a percentage of \fBzfs_dirty_data_max\fR. This should be less than
+\fBzfs_vdev_async_write_active_min_dirty_percent\fR.
+.sp
+Default value: \fB20\fR% of \fBzfs_dirty_data_max\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_fallocate_reserve_percent\fR (uint)
+.ad
+.RS 12n
+Since ZFS is a copy-on-write filesystem with snapshots, blocks cannot be
+preallocated for a file in order to guarantee that later writes will not
+run out of space. Instead, fallocate() space preallocation only checks
+that sufficient space is currently available in the pool or the user's
+project quota allocation, and then creates a sparse file of the requested
+size. The requested space is multiplied by \fBzfs_fallocate_reserve_percent\fR
+to allow additional space for indirect blocks and other internal metadata.
+Setting this value to 0 disables support for fallocate(2) and returns
+EOPNOTSUPP for fallocate() space preallocation again.
+.sp +Default value: \fB110\fR% +.RE + +.sp +.ne 2 +.na +\fBzfs_fletcher_4_impl\fR (string) +.ad +.RS 12n +Select a fletcher 4 implementation. +.sp +Supported selectors are: \fBfastest\fR, \fBscalar\fR, \fBsse2\fR, \fBssse3\fR, +\fBavx2\fR, \fBavx512f\fR, \fBavx512bw\fR, and \fBaarch64_neon\fR. +All of the selectors except \fBfastest\fR and \fBscalar\fR require instruction +set extensions to be available and will only appear if ZFS detects that they are +present at runtime. If multiple implementations of fletcher 4 are available, +the \fBfastest\fR will be chosen using a micro benchmark. Selecting \fBscalar\fR +results in the original, CPU based calculation, being used. Selecting any option +other than \fBfastest\fR and \fBscalar\fR results in vector instructions from +the respective CPU instruction set being used. +.sp +Default value: \fBfastest\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_free_bpobj_enabled\fR (int) +.ad +.RS 12n +Enable/disable the processing of the free_bpobj object. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_async_block_max_blocks\fR (ulong) +.ad +.RS 12n +Maximum number of blocks freed in a single txg. +.sp +Default value: \fBULONG_MAX\fR (unlimited). +.RE + +.sp +.ne 2 +.na +\fBzfs_max_async_dedup_frees\fR (ulong) +.ad +.RS 12n +Maximum number of dedup blocks freed in a single txg. +.sp +Default value: \fB100,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_override_estimate_recordsize\fR (ulong) +.ad +.RS 12n +Record size calculation override for zfs send estimates. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_async_read_max_active\fR (int) +.ad +.RS 12n +Maximum asynchronous read I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB3\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_async_read_min_active\fR (int) +.ad +.RS 12n +Minimum asynchronous read I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_async_write_active_max_dirty_percent\fR (int) +.ad +.RS 12n +When the pool has more than +\fBzfs_vdev_async_write_active_max_dirty_percent\fR dirty data, use +\fBzfs_vdev_async_write_max_active\fR to limit active async writes. If +the dirty data is between min and max, the active I/O limit is linearly +interpolated. See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB60\fR%. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_async_write_active_min_dirty_percent\fR (int) +.ad +.RS 12n +When the pool has less than +\fBzfs_vdev_async_write_active_min_dirty_percent\fR dirty data, use +\fBzfs_vdev_async_write_min_active\fR to limit active async writes. If +the dirty data is between min and max, the active I/O limit is linearly +interpolated. See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB30\fR%. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_async_write_max_active\fR (int) +.ad +.RS 12n +Maximum asynchronous write I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_async_write_min_active\fR (int) +.ad +.RS 12n +Minimum asynchronous write I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Lower values are associated with better latency on rotational media but poorer +resilver performance. The default value of 2 was chosen as a compromise. A +value of 3 has been shown to improve resilver performance further at a cost of +further increasing latency. +.sp +Default value: \fB2\fR. 
+.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_initializing_max_active\fR (int) +.ad +.RS 12n +Maximum initializing I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_initializing_min_active\fR (int) +.ad +.RS 12n +Minimum initializing I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_max_active\fR (int) +.ad +.RS 12n +The maximum number of I/Os active to each device. Ideally, this will be >= +the sum of each queue's max_active. It must be at least the sum of each +queue's min_active. See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_rebuild_max_active\fR (int) +.ad +.RS 12n +Maximum sequential resilver I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB3\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_rebuild_min_active\fR (int) +.ad +.RS 12n +Minimum sequential resilver I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_removal_max_active\fR (int) +.ad +.RS 12n +Maximum removal I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB2\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_removal_min_active\fR (int) +.ad +.RS 12n +Minimum removal I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_scrub_max_active\fR (int) +.ad +.RS 12n +Maximum scrub I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB2\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_scrub_min_active\fR (int) +.ad +.RS 12n +Minimum scrub I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_sync_read_max_active\fR (int) +.ad +.RS 12n +Maximum synchronous read I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_sync_read_min_active\fR (int) +.ad +.RS 12n +Minimum synchronous read I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_sync_write_max_active\fR (int) +.ad +.RS 12n +Maximum synchronous write I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_sync_write_min_active\fR (int) +.ad +.RS 12n +Minimum synchronous write I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_trim_max_active\fR (int) +.ad +.RS 12n +Maximum trim/discard I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB2\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_trim_min_active\fR (int) +.ad +.RS 12n +Minimum trim/discard I/Os active to each device. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_queue_depth_pct\fR (int) +.ad +.RS 12n +Maximum number of queued allocations per top-level vdev expressed as +a percentage of \fBzfs_vdev_async_write_max_active\fR which allows the +system to detect devices that are more capable of handling allocations +and to allocate more blocks to those devices. 
It allows for dynamic +allocation distribution when devices are imbalanced as fuller devices +will tend to be slower than empty devices. + +See also \fBzio_dva_throttle_enabled\fR. +.sp +Default value: \fB1000\fR%. +.RE + +.sp +.ne 2 +.na +\fBzfs_expire_snapshot\fR (int) +.ad +.RS 12n +Seconds to expire .zfs/snapshot +.sp +Default value: \fB300\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_admin_snapshot\fR (int) +.ad +.RS 12n +Allow the creation, removal, or renaming of entries in the .zfs/snapshot +directory to cause the creation, destruction, or renaming of snapshots. +When enabled this functionality works both locally and over NFS exports +which have the 'no_root_squash' option set. This functionality is disabled +by default. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_flags\fR (int) +.ad +.RS 12n +Set additional debugging flags. The following flags may be bitwise-or'd +together. +.sp +.TS +box; +rB lB +lB lB +r l. +Value Symbolic Name + Description +_ +1 ZFS_DEBUG_DPRINTF + Enable dprintf entries in the debug log. +_ +2 ZFS_DEBUG_DBUF_VERIFY * + Enable extra dbuf verifications. +_ +4 ZFS_DEBUG_DNODE_VERIFY * + Enable extra dnode verifications. +_ +8 ZFS_DEBUG_SNAPNAMES + Enable snapshot name verification. +_ +16 ZFS_DEBUG_MODIFY + Check for illegally modified ARC buffers. +_ +64 ZFS_DEBUG_ZIO_FREE + Enable verification of block frees. +_ +128 ZFS_DEBUG_HISTOGRAM_VERIFY + Enable extra spacemap histogram verifications. +_ +256 ZFS_DEBUG_METASLAB_VERIFY + Verify space accounting on disk matches in-core range_trees. +_ +512 ZFS_DEBUG_SET_ERROR + Enable SET_ERROR and dprintf entries in the debug log. +_ +1024 ZFS_DEBUG_INDIRECT_REMAP + Verify split blocks created by device removal. +_ +2048 ZFS_DEBUG_TRIM + Verify TRIM ranges are always within the allocatable range tree. +_ +4096 ZFS_DEBUG_LOG_SPACEMAP + Verify that the log summary is consistent with the spacemap log + and enable zfs_dbgmsgs for metaslab loading and flushing. +.TE +.sp +* Requires debug build. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_free_leak_on_eio\fR (int) +.ad +.RS 12n +If destroy encounters an EIO while reading metadata (e.g. indirect +blocks), space referenced by the missing metadata can not be freed. +Normally this causes the background destroy to become "stalled", as +it is unable to make forward progress. While in this stalled state, +all remaining space to free from the error-encountering filesystem is +"temporarily leaked". Set this flag to cause it to ignore the EIO, +permanently leak the space from indirect blocks that can not be read, +and continue to free everything else that it can. + +The default, "stalling" behavior is useful if the storage partially +fails (i.e. some but not all i/os fail), and then later recovers. In +this case, we will be able to continue pool operations while it is +partially failed, and when it recovers, we can continue to free the +space, with no leaks. However, note that this case is actually +fairly rare. + +Typically pools either (a) fail completely (but perhaps temporarily, +e.g. a top-level vdev going offline), or (b) have localized, +permanent errors (e.g. disk returns the wrong data due to bit flip or +firmware bug). In case (a), this setting does not matter because the +pool will be suspended and the sync thread will not be able to make +forward progress regardless. In case (b), because the error is +permanent, the best we can do is leak the minimum amount of space, +which is what setting this flag will do. 
Therefore, it is reasonable
+for this flag to normally be set, but we chose the more conservative
+approach of not setting it, so that there is no possibility of
+leaking space in the "partial temporary" failure case.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_free_min_time_ms\fR (int)
+.ad
+.RS 12n
+During a \fBzfs destroy\fR operation using \fBfeature@async_destroy\fR a
+minimum of this much time will be spent working on freeing blocks per txg.
+.sp
+Default value: \fB1,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_obsolete_min_time_ms\fR (int)
+.ad
+.RS 12n
+Similar to \fBzfs_free_min_time_ms\fR but for cleanup of old indirection
+records for removed vdevs.
+.sp
+Default value: \fB500\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_immediate_write_sz\fR (long)
+.ad
+.RS 12n
+Largest data block to write to zil. Larger blocks will be treated as if the
+dataset being written to had the property setting \fBlogbias=throughput\fR.
+.sp
+Default value: \fB32,768\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_initialize_value\fR (ulong)
+.ad
+.RS 12n
+Pattern written to vdev free space by \fBzpool initialize\fR.
+.sp
+Default value: \fB16,045,690,984,833,335,022\fR (0xdeadbeefdeadbeee).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_initialize_chunk_size\fR (ulong)
+.ad
+.RS 12n
+Size of writes used by \fBzpool initialize\fR.
+This option is used by the test suite to facilitate testing.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_max_entries\fR (ulong)
+.ad
+.RS 12n
+The threshold size (in block pointers) at which we create a new sub-livelist.
+Larger sublists are more costly from a memory perspective but the fewer
+sublists there are, the lower the cost of insertion.
+.sp
+Default value: \fB500,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_min_percent_shared\fR (int)
+.ad
+.RS 12n
+If the amount of shared space between a snapshot and its clone drops below
+this threshold, the clone turns off the livelist and reverts to the old
+deletion method. This is in place because once a clone has been overwritten
+enough, livelists no longer give us a benefit.
+.sp
+Default value: \fB75\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_condense_new_alloc\fR (int)
+.ad
+.RS 12n
+Incremented each time an extra ALLOC blkptr is added to a livelist entry while
+it is being condensed.
+This option is used by the test suite to track race conditions.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_condense_sync_cancel\fR (int)
+.ad
+.RS 12n
+Incremented each time livelist condensing is canceled while in
+spa_livelist_condense_sync.
+This option is used by the test suite to track race conditions.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_condense_sync_pause\fR (int)
+.ad
+.RS 12n
+When set, the livelist condense process pauses indefinitely before
+executing the synctask - spa_livelist_condense_sync.
+This option is used by the test suite to trigger race conditions.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_condense_zthr_cancel\fR (int)
+.ad
+.RS 12n
+Incremented each time livelist condensing is canceled while in
+spa_livelist_condense_cb.
+This option is used by the test suite to track race conditions.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_livelist_condense_zthr_pause\fR (int)
+.ad
+.RS 12n
+When set, the livelist condense process pauses indefinitely before
+executing the open context condensing work in spa_livelist_condense_cb.
+This option is used by the test suite to trigger race conditions.
+.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_lua_max_instrlimit\fR (ulong) +.ad +.RS 12n +The maximum execution time limit that can be set for a ZFS channel program, +specified as a number of Lua instructions. +.sp +Default value: \fB100,000,000\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_lua_max_memlimit\fR (ulong) +.ad +.RS 12n +The maximum memory limit that can be set for a ZFS channel program, specified +in bytes. +.sp +Default value: \fB104,857,600\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_max_dataset_nesting\fR (int) +.ad +.RS 12n +The maximum depth of nested datasets. This value can be tuned temporarily to +fix existing datasets that exceed the predefined limit. +.sp +Default value: \fB50\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_max_log_walking\fR (ulong) +.ad +.RS 12n +The number of past TXGs that the flushing algorithm of the log spacemap +feature uses to estimate incoming log blocks. +.sp +Default value: \fB5\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_max_logsm_summary_length\fR (ulong) +.ad +.RS 12n +Maximum number of rows allowed in the summary of the spacemap log. +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_max_recordsize\fR (int) +.ad +.RS 12n +We currently support block sizes from 512 bytes to 16MB. The benefits of +larger blocks, and thus larger I/O, need to be weighed against the cost of +COWing a giant block to modify one byte. Additionally, very large blocks +can have an impact on i/o latency, and also potentially on the memory +allocator. Therefore, we do not allow the recordsize to be set larger than +zfs_max_recordsize (default 1MB). Larger blocks can be created by changing +this tunable, and pools with larger blocks can always be imported and used, +regardless of this setting. +.sp +Default value: \fB1,048,576\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_allow_redacted_dataset_mount\fR (int) +.ad +.RS 12n +Allow datasets received with redacted send/receive to be mounted. Normally +disabled because these datasets may be missing key data. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_min_metaslabs_to_flush\fR (ulong) +.ad +.RS 12n +Minimum number of metaslabs to flush per dirty TXG +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_metaslab_fragmentation_threshold\fR (int) +.ad +.RS 12n +Allow metaslabs to keep their active state as long as their fragmentation +percentage is less than or equal to this value. An active metaslab that +exceeds this threshold will no longer keep its active status allowing +better metaslabs to be selected. +.sp +Default value: \fB70\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_mg_fragmentation_threshold\fR (int) +.ad +.RS 12n +Metaslab groups are considered eligible for allocations if their +fragmentation metric (measured as a percentage) is less than or equal to +this value. If a metaslab group exceeds this threshold then it will be +skipped unless all metaslab groups within the metaslab class have also +crossed this threshold. +.sp +Default value: \fB95\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_mg_noalloc_threshold\fR (int) +.ad +.RS 12n +Defines a threshold at which metaslab groups should be eligible for +allocations. The value is expressed as a percentage of free space +beyond which a metaslab group is always eligible for allocations. +If a metaslab group's free space is less than or equal to the +threshold, the allocator will avoid allocating to that group +unless all groups in the pool have reached the threshold. Once all +groups have reached the threshold, all groups are allowed to accept +allocations. 
The default value of 0 disables the feature and causes
+all metaslab groups to be eligible for allocations.
+
+This parameter allows one to deal with pools having heavily imbalanced
+vdevs such as would be the case when a new vdev has been added.
+Setting the threshold to a non-zero percentage will stop allocations
+from being made to vdevs that aren't filled to the specified percentage
+and allow lesser filled vdevs to acquire more allocations than they
+otherwise would under the old \fBzfs_mg_alloc_failures\fR facility.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_ddt_data_is_special\fR (int)
+.ad
+.RS 12n
+If enabled, ZFS will place DDT data into the special allocation class.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_user_indirect_is_special\fR (int)
+.ad
+.RS 12n
+If enabled, ZFS will place user data (both file and zvol) indirect blocks
+into the special allocation class.
+.sp
+Default value: \fB1\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_multihost_history\fR (int)
+.ad
+.RS 12n
+Historical statistics for the last N multihost updates will be available in
+\fB/proc/spl/kstat/zfs/<pool>/multihost\fR.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_multihost_interval\fR (ulong)
+.ad
+.RS 12n
+Used to control the frequency of multihost writes which are performed when the
+\fBmultihost\fR pool property is on. This is one factor used to determine the
+length of the activity check during import.
+.sp
+The multihost write period is \fBzfs_multihost_interval / leaf-vdevs\fR
+milliseconds. On average a multihost write will be issued for each leaf vdev
+every \fBzfs_multihost_interval\fR milliseconds. In practice, the observed
+period can vary with the I/O load and this observed value is the delay which
+is stored in the uberblock.
+.sp
+Default value: \fB1000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_multihost_import_intervals\fR (uint)
+.ad
+.RS 12n
+Used to control the duration of the activity test on import. Smaller values
+of \fBzfs_multihost_import_intervals\fR will reduce the import time but
+increase the risk of failing to detect an active pool. The total activity
+check time is never allowed to drop below one second.
+.sp
+On import the activity check waits a minimum amount of time determined by
+\fBzfs_multihost_interval * zfs_multihost_import_intervals\fR, or the same
+product computed on the host which last had the pool imported (whichever is
+greater). The activity check time may be further extended if the value of mmp
+delay found in the best uberblock indicates actual multihost updates happened
+at longer intervals than \fBzfs_multihost_interval\fR. A minimum value of
+\fB100ms\fR is enforced.
+.sp
+A value of 0 is ignored and treated as if it was set to 1.
+.sp
+Default value: \fB20\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_multihost_fail_intervals\fR (uint)
+.ad
+.RS 12n
+Controls the behavior of the pool when multihost write failures or delays are
+detected.
+.sp
+When \fBzfs_multihost_fail_intervals = 0\fR, multihost write failures or
+delays are ignored. The failures will still be reported to the ZED which
+depending on its configuration may take action such as suspending the pool or
+offlining a device.
+
+.sp
+When \fBzfs_multihost_fail_intervals > 0\fR, the pool will be suspended if
+\fBzfs_multihost_fail_intervals * zfs_multihost_interval\fR milliseconds pass
+without a successful mmp write. This guarantees the activity test will see
+mmp writes if the pool is imported. A value of 1 is ignored and treated as
+if it was set to 2.
This is necessary to prevent the pool from being suspended +due to normal, small I/O latency variations. + +.sp +Default value: \fB10\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_no_scrub_io\fR (int) +.ad +.RS 12n +Set for no scrub I/O. This results in scrubs not actually scrubbing data and +simply doing a metadata crawl of the pool instead. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_no_scrub_prefetch\fR (int) +.ad +.RS 12n +Set to disable block prefetching for scrubs. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_nocacheflush\fR (int) +.ad +.RS 12n +Disable cache flush operations on disks when writing. Setting this will +cause pool corruption on power loss if a volatile out-of-order write cache +is enabled. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_nopwrite_enabled\fR (int) +.ad +.RS 12n +Enable NOP writes +.sp +Use \fB1\fR for yes (default) and \fB0\fR to disable. +.RE + +.sp +.ne 2 +.na +\fBzfs_dmu_offset_next_sync\fR (int) +.ad +.RS 12n +Enable forcing txg sync to find holes. When enabled forces ZFS to act +like prior versions when SEEK_HOLE or SEEK_DATA flags are used, which +when a dnode is dirty causes txg's to be synced so that this data can be +found. +.sp +Use \fB1\fR for yes and \fB0\fR to disable (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_pd_bytes_max\fR (int) +.ad +.RS 12n +The number of bytes which should be prefetched during a pool traversal +(eg: \fBzfs send\fR or other data crawling operations) +.sp +Default value: \fB52,428,800\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_per_txg_dirty_frees_percent \fR (ulong) +.ad +.RS 12n +Tunable to control percentage of dirtied indirect blocks from frees allowed +into one TXG. After this threshold is crossed, additional frees will wait until +the next TXG. +A value of zero will disable this throttle. +.sp +Default value: \fB5\fR, set to \fB0\fR to disable. +.RE + +.sp +.ne 2 +.na +\fBzfs_prefetch_disable\fR (int) +.ad +.RS 12n +This tunable disables predictive prefetch. Note that it leaves "prescient" +prefetch (e.g. prefetch for zfs send) intact. Unlike predictive prefetch, +prescient prefetch never issues i/os that end up not being needed, so it +can't hurt performance. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_qat_checksum_disable\fR (int) +.ad +.RS 12n +This tunable disables qat hardware acceleration for sha256 checksums. It +may be set after the zfs modules have been loaded to initialize the qat +hardware as long as support is compiled in and the qat driver is present. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_qat_compress_disable\fR (int) +.ad +.RS 12n +This tunable disables qat hardware acceleration for gzip compression. It +may be set after the zfs modules have been loaded to initialize the qat +hardware as long as support is compiled in and the qat driver is present. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_qat_encrypt_disable\fR (int) +.ad +.RS 12n +This tunable disables qat hardware acceleration for AES-GCM encryption. It +may be set after the zfs modules have been loaded to initialize the qat +hardware as long as support is compiled in and the qat driver is present. +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_read_chunk_size\fR (long) +.ad +.RS 12n +Bytes to read per chunk +.sp +Default value: \fB1,048,576\fR. 
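+.sp
+As with most of these tunables, the value can be inspected and changed at
+runtime; the following sketch assumes the Linux module parameter interface
+and is purely illustrative:
+.nf
+  # cat /sys/module/zfs/parameters/zfs_read_chunk_size
+  1048576
+  # echo 2097152 > /sys/module/zfs/parameters/zfs_read_chunk_size
+.fi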
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_read_history\fR (int)
+.ad
+.RS 12n
+Historical statistics for the last N reads will be available in
+\fB/proc/spl/kstat/zfs/<pool>/reads\fR
+.sp
+Default value: \fB0\fR (no data is kept).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_read_history_hits\fR (int)
+.ad
+.RS 12n
+Include cache hits in read history.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_rebuild_max_segment\fR (ulong)
+.ad
+.RS 12n
+Maximum read segment size to issue when sequentially resilvering a
+top-level vdev.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_reconstruct_indirect_combinations_max\fR (int)
+.ad
+.RS 12n
+If an indirect split block contains more than this many possible unique
+combinations when being reconstructed, consider it too computationally
+expensive to check them all. Instead, try at most
+\fBzfs_reconstruct_indirect_combinations_max\fR randomly-selected
+combinations each time the block is accessed. This allows all segment
+copies to participate fairly in the reconstruction when all combinations
+cannot be checked and prevents repeated use of one bad copy.
+.sp
+Default value: \fB4096\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_recover\fR (int)
+.ad
+.RS 12n
+Set to attempt to recover from fatal errors. This should only be used as a
+last resort, as it typically results in leaked space, or worse.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_removal_ignore_errors\fR (int)
+.ad
+.RS 12n
+.sp
+Ignore hard IO errors during device removal. When set, if a device encounters
+a hard IO error during the removal process the removal will not be cancelled.
+This can result in a normally recoverable block becoming permanently damaged
+and is not recommended. This should only be used as a last resort when the
+pool cannot be returned to a healthy state prior to removing the device.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_removal_suspend_progress\fR (int)
+.ad
+.RS 12n
+.sp
+This is used by the test suite so that it can ensure that certain actions
+happen while in the middle of a removal.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_remove_max_segment\fR (int)
+.ad
+.RS 12n
+.sp
+The largest contiguous segment that we will attempt to allocate when removing
+a device. This can be no larger than 16MB. If there is a performance
+problem with attempting to allocate large blocks, consider decreasing this.
+.sp
+Default value: \fB16,777,216\fR (16MB).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_resilver_disable_defer\fR (int)
+.ad
+.RS 12n
+Disables the \fBresilver_defer\fR feature, causing an operation that would
+start a resilver to immediately restart the one in progress.
+.sp
+Default value: \fB0\fR (feature enabled).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_resilver_min_time_ms\fR (int)
+.ad
+.RS 12n
+Resilvers are processed by the sync thread. While resilvering it will spend
+at least this much time working on a resilver between txg flushes.
+.sp
+Default value: \fB3,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_ignore_errors\fR (int)
+.ad
+.RS 12n
+If set to a nonzero value, remove the DTL (dirty time list) upon
+completion of a pool scan (scrub) even if there were unrepairable
+errors. It is intended to be used during pool repair or recovery to
+stop resilvering when the pool is next imported.
+.sp
+Default value: \fB0\fR.
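+.sp
+A sketch of the recovery workflow this tunable is intended for (pool name
+hypothetical, Linux module parameter interface assumed):
+.nf
+  # echo 1 > /sys/module/zfs/parameters/zfs_scan_ignore_errors
+  # zpool scrub tank
+.fi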
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scrub_min_time_ms\fR (int)
+.ad
+.RS 12n
+Scrubs are processed by the sync thread. While scrubbing it will spend
+at least this much time working on a scrub between txg flushes.
+.sp
+Default value: \fB1,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_checkpoint_intval\fR (int)
+.ad
+.RS 12n
+To preserve progress across reboots the sequential scan algorithm periodically
+needs to stop metadata scanning and issue all the verification I/Os to disk.
+The frequency of this flushing is determined by the
+\fBzfs_scan_checkpoint_intval\fR tunable.
+.sp
+Default value: \fB7200\fR seconds (every 2 hours).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_fill_weight\fR (int)
+.ad
+.RS 12n
+This tunable affects how scrub and resilver I/O segments are ordered. A higher
+number indicates that we care more about how filled in a segment is, while a
+lower number indicates we care more about the size of the extent without
+considering the gaps within a segment. This value is only tunable upon module
+insertion. Changing the value afterwards will have no effect on scrub or
+resilver performance.
+.sp
+Default value: \fB3\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_issue_strategy\fR (int)
+.ad
+.RS 12n
+Determines the order that data will be verified while scrubbing or resilvering.
+If set to \fB1\fR, data will be verified as sequentially as possible, given the
+amount of memory reserved for scrubbing (see \fBzfs_scan_mem_lim_fact\fR). This
+may improve scrub performance if the pool's data is very fragmented. If set to
+\fB2\fR, the largest mostly-contiguous chunk of found data will be verified
+first. By deferring scrubbing of small segments, we may later find adjacent data
+to coalesce and increase the segment size. If set to \fB0\fR, zfs will use
+strategy \fB1\fR during normal verification and strategy \fB2\fR while taking a
+checkpoint.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_legacy\fR (int)
+.ad
+.RS 12n
+A value of 0 indicates that scrubs and resilvers will gather metadata in
+memory before issuing sequential I/O. A value of 1 indicates that the legacy
+algorithm will be used where I/O is initiated as soon as it is discovered.
+Changing this value to 0 will not affect scrubs or resilvers that are already
+in progress.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_max_ext_gap\fR (int)
+.ad
+.RS 12n
+Indicates the largest gap in bytes between scrub / resilver I/Os that will still
+be considered sequential for sorting purposes. Changing this value will not
+affect scrubs or resilvers that are already in progress.
+.sp
+Default value: \fB2,097,152\fR (2MB).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_mem_lim_fact\fR (int)
+.ad
+.RS 12n
+Maximum fraction of RAM used for I/O sorting by the sequential scan algorithm.
+This tunable determines the hard limit for I/O sorting memory usage.
+When the hard limit is reached we stop scanning metadata and start issuing
+data verification I/O. This is done until we get below the soft limit.
+.sp
+Default value: \fB20\fR which is 5% of RAM (1/20).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_scan_mem_lim_soft_fact\fR (int)
+.ad
+.RS 12n
+The fraction of the hard limit used to determine the soft limit for I/O sorting
+by the sequential scan algorithm. When we cross this limit from below no action
+is taken. When we cross this limit from above it is because we are issuing
+verification I/O. In this case (unless the metadata scan is done) we stop
+issuing verification I/O and start scanning metadata again until we get to the
+hard limit.
+.sp
+Default value: \fB20\fR which is 5% of the hard limit (1/20).
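+.sp
+A worked example of how the two limits compose under the default values, on
+a hypothetical system with 64GB of RAM:
+.nf
+  hard limit = RAM / zfs_scan_mem_lim_fact        = 64GB / 20 ~= 3.2GB
+  soft limit = hard / zfs_scan_mem_lim_soft_fact  = 3.2GB / 20 ~= 160MB
+.fi
+When sorting memory usage reaches the hard limit, metadata scanning stops and
+verification I/O is issued until usage falls back below the soft limit.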
+.RE + +.sp +.ne 2 +.na +\fBzfs_scan_strict_mem_lim\fR (int) +.ad +.RS 12n +Enforces tight memory limits on pool scans when a sequential scan is in +progress. When disabled the memory limit may be exceeded by fast disks. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_scan_suspend_progress\fR (int) +.ad +.RS 12n +Freezes a scrub/resilver in progress without actually pausing it. Intended for +testing/debugging. +.sp +Default value: \fB0\fR. +.RE + + +.sp +.ne 2 +.na +\fBzfs_scan_vdev_limit\fR (int) +.ad +.RS 12n +Maximum amount of data that can be concurrently issued at once for scrubs and +resilvers per leaf device, given in bytes. +.sp +Default value: \fB41943040\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_corrupt_data\fR (int) +.ad +.RS 12n +Allow sending of corrupt data (ignore read/checksum errors when sending data) +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_send_unmodified_spill_blocks\fR (int) +.ad +.RS 12n +Include unmodified spill blocks in the send stream. Under certain circumstances +previous versions of ZFS could incorrectly remove the spill block from an +existing object. Including unmodified copies of the spill blocks creates a +backwards compatible stream which will recreate a spill block if it was +incorrectly removed. +.sp +Use \fB1\fR for yes (default) and \fB0\fR for no. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_no_prefetch_queue_ff\fR (int) +.ad +.RS 12n +The fill fraction of the \fBzfs send\fR internal queues. The fill fraction +controls the timing with which internal threads are woken up. +.sp +Default value: \fB20\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_no_prefetch_queue_length\fR (int) +.ad +.RS 12n +The maximum number of bytes allowed in \fBzfs send\fR's internal queues. +.sp +Default value: \fB1,048,576\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_queue_ff\fR (int) +.ad +.RS 12n +The fill fraction of the \fBzfs send\fR prefetch queue. The fill fraction +controls the timing with which internal threads are woken up. +.sp +Default value: \fB20\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_send_queue_length\fR (int) +.ad +.RS 12n +The maximum number of bytes allowed that will be prefetched by \fBzfs send\fR. +This value must be at least twice the maximum block size in use. +.sp +Default value: \fB16,777,216\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_recv_queue_ff\fR (int) +.ad +.RS 12n +The fill fraction of the \fBzfs receive\fR queue. The fill fraction +controls the timing with which internal threads are woken up. +.sp +Default value: \fB20\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_recv_queue_length\fR (int) +.ad +.RS 12n +The maximum number of bytes allowed in the \fBzfs receive\fR queue. This value +must be at least twice the maximum block size in use. +.sp +Default value: \fB16,777,216\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_recv_write_batch_size\fR (int) +.ad +.RS 12n +The maximum amount of data (in bytes) that \fBzfs receive\fR will write in +one DMU transaction. This is the uncompressed size, even when receiving a +compressed send stream. This setting will not reduce the write size below +a single block. Capped at a maximum of 32MB +.sp +Default value: \fB1MB\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_override_estimate_recordsize\fR (ulong) +.ad +.RS 12n +Setting this variable overrides the default logic for estimating block +sizes when doing a zfs send. The default heuristic is that the average +block size will be the current recordsize. 
+Override this value if most data
+in your dataset is not of that size and you require accurate zfs send size
+estimates.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_sync_pass_deferred_free\fR (int)
+.ad
+.RS 12n
+Flushing of data to disk is done in passes. Defer frees starting in this pass.
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_spa_discard_memory_limit\fR (int)
+.ad
+.RS 12n
+Maximum memory used for prefetching a checkpoint's space map on each
+vdev while discarding the checkpoint.
+.sp
+Default value: \fB16,777,216\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_special_class_metadata_reserve_pct\fR (int)
+.ad
+.RS 12n
+Only allow small data blocks to be allocated on the special and dedup vdev
+types when the available free space percentage on these vdevs exceeds this
+value. This ensures reserved space is available for pool metadata as the
+special vdevs approach capacity.
+.sp
+Default value: \fB25\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_sync_pass_dont_compress\fR (int)
+.ad
+.RS 12n
+Starting in this sync pass, we disable compression (including metadata).
+With the default setting, in practice, we don't have this many sync passes,
+so this has no effect.
+.sp
+The original intent was that disabling compression would help the sync passes
+to converge. However, in practice disabling compression increases the average
+number of sync passes, because when we turn compression off, many blocks'
+sizes will change and thus we have to re-allocate (not overwrite) them. It
+also increases the number of 128KB allocations (e.g. for indirect blocks and
+spacemaps) because these will not be compressed. The 128K allocations are
+especially detrimental to performance on highly fragmented systems, which may
+have very few free segments of this size, and may need to load new metaslabs
+to satisfy 128K allocations.
+.sp
+Default value: \fB8\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_sync_pass_rewrite\fR (int)
+.ad
+.RS 12n
+Rewrite new block pointers starting in this pass.
+.sp
+Default value: \fB2\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_sync_taskq_batch_pct\fR (int)
+.ad
+.RS 12n
+This controls the number of threads used by the dp_sync_taskq. The default
+value of 75% will create a maximum of one thread per CPU.
+.sp
+Default value: \fB75\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_extent_bytes_max\fR (uint)
+.ad
+.RS 12n
+Maximum size of TRIM command. Ranges larger than this will be split into
+chunks no larger than \fBzfs_trim_extent_bytes_max\fR bytes before being
+issued to the device.
+.sp
+Default value: \fB134,217,728\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_extent_bytes_min\fR (uint)
+.ad
+.RS 12n
+Minimum size of TRIM commands. TRIM ranges smaller than this will be skipped
+unless they're part of a larger range which was broken into chunks. This is
+done because it's common for these small TRIMs to negatively impact overall
+performance. This value can be set to 0 to TRIM all unallocated space.
+.sp
+Default value: \fB32,768\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_metaslab_skip\fR (uint)
+.ad
+.RS 12n
+Skip uninitialized metaslabs during the TRIM process. This option is useful
+for pools constructed from large thinly-provisioned devices where TRIM
+operations are slow. As a pool ages, an increasing fraction of the pool's
+metaslabs will be initialized, progressively degrading the usefulness of
+this option. This setting is stored when starting a manual TRIM and will
+persist for the duration of the requested TRIM.
+.sp
+Default value: \fB0\fR.
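+.sp
+For example, to request that a manual TRIM skip uninitialized metaslabs
+(pool name hypothetical, Linux module parameter interface assumed):
+.nf
+  # echo 1 > /sys/module/zfs/parameters/zfs_trim_metaslab_skip
+  # zpool trim tank
+.fi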
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_queue_limit\fR (uint)
+.ad
+.RS 12n
+Maximum number of queued TRIMs outstanding per leaf vdev. The number of
+concurrent TRIM commands issued to the device is controlled by the
+\fBzfs_vdev_trim_min_active\fR and \fBzfs_vdev_trim_max_active\fR module
+options.
+.sp
+Default value: \fB10\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_trim_txg_batch\fR (uint)
+.ad
+.RS 12n
+The number of transaction groups worth of frees which should be aggregated
+before TRIM operations are issued to the device. This setting represents a
+trade-off between issuing larger, more efficient TRIM operations and the
+delay before the recently trimmed space is available for use by the device.
+.sp
+Increasing this value will allow frees to be aggregated for a longer time.
+This will result in larger TRIM operations and potentially increased memory
+usage. Decreasing this value will have the opposite effect. The default
+value of 32 was determined to be a reasonable compromise.
+.sp
+Default value: \fB32\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_txg_history\fR (int)
+.ad
+.RS 12n
+Historical statistics for the last N txgs will be available in
+\fB/proc/spl/kstat/zfs/<pool>/txgs\fR
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_txg_timeout\fR (int)
+.ad
+.RS 12n
+Flush dirty data to disk at least every N seconds (maximum txg duration).
+.sp
+Default value: \fB5\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_aggregate_trim\fR (int)
+.ad
+.RS 12n
+Allow TRIM I/Os to be aggregated. This is normally not helpful because
+the extents to be trimmed will already have been aggregated by the
+metaslab. This option is provided for debugging and performance analysis.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_aggregation_limit\fR (int)
+.ad
+.RS 12n
+Max vdev I/O aggregation size.
+.sp
+Default value: \fB1,048,576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_aggregation_limit_non_rotating\fR (int)
+.ad
+.RS 12n
+Max vdev I/O aggregation size for non-rotating media.
+.sp
+Default value: \fB131,072\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_cache_bshift\fR (int)
+.ad
+.RS 12n
+Shift size to inflate reads to.
+.sp
+Default value: \fB16\fR (effectively 65536).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_cache_max\fR (int)
+.ad
+.RS 12n
+Inflate reads smaller than this value to meet the \fBzfs_vdev_cache_bshift\fR
+size (default 64k).
+.sp
+Default value: \fB16384\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_cache_size\fR (int)
+.ad
+.RS 12n
+Total size of the per-disk cache in bytes.
+.sp
+Currently this feature is disabled as it has been found to not be helpful
+for performance and in some cases harmful.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_mirror_rotating_inc\fR (int)
+.ad
+.RS 12n
+A number by which the balancing algorithm increments the load calculation, for
+the purpose of selecting the least busy mirror member, when an I/O immediately
+follows its predecessor on rotational vdevs.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_vdev_mirror_rotating_seek_inc\fR (int)
+.ad
+.RS 12n
+A number by which the balancing algorithm increments the load calculation for
+the purpose of selecting the least busy mirror member when an I/O lacks
+locality as defined by \fBzfs_vdev_mirror_rotating_seek_offset\fR. I/Os within
+this window that do not immediately follow the previous I/O increment the load
+calculation by half of this value.
+.sp
+Default value: \fB5\fR.
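+.sp
+An illustrative summary of how the rotating-vdev increments combine under the
+default values (0, 5, and half of 5 respectively):
+.nf
+  I/O immediately follows its predecessor:  load += 0 (zfs_vdev_mirror_rotating_inc)
+  within the seek offset, not contiguous:   load += 5 / 2
+  beyond the seek offset (no locality):     load += 5 (this tunable)
+.fi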
+.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_rotating_seek_offset\fR (int) +.ad +.RS 12n +The maximum distance for the last queued I/O in which the balancing algorithm +considers an I/O to have locality. +See the section "ZFS I/O SCHEDULER". +.sp +Default value: \fB1048576\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_non_rotating_inc\fR (int) +.ad +.RS 12n +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member on non-rotational vdevs +when I/Os do not immediately follow one another. +.sp +Default value: \fB0\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_mirror_non_rotating_seek_inc\fR (int) +.ad +.RS 12n +A number by which the balancing algorithm increments the load calculation for +the purpose of selecting the least busy mirror member when an I/O lacks +locality as defined by the zfs_vdev_mirror_rotating_seek_offset. I/Os within +this that are not immediately following the previous I/O are incremented by +half. +.sp +Default value: \fB1\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_read_gap_limit\fR (int) +.ad +.RS 12n +Aggregate read I/O operations if the gap on-disk between them is within this +threshold. +.sp +Default value: \fB32,768\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_write_gap_limit\fR (int) +.ad +.RS 12n +Aggregate write I/O over gap +.sp +Default value: \fB4,096\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_raidz_impl\fR (string) +.ad +.RS 12n +Parameter for selecting raidz parity implementation to use. + +Options marked (always) below may be selected on module load as they are +supported on all systems. +The remaining options may only be set after the module is loaded, as they +are available only if the implementations are compiled in and supported +on the running system. + +Once the module is loaded, the content of +/sys/module/zfs/parameters/zfs_vdev_raidz_impl will show available options +with the currently selected one enclosed in []. +Possible options are: + fastest - (always) implementation selected using built-in benchmark + original - (always) original raidz implementation + scalar - (always) scalar raidz implementation + sse2 - implementation using SSE2 instruction set (64bit x86 only) + ssse3 - implementation using SSSE3 instruction set (64bit x86 only) + avx2 - implementation using AVX2 instruction set (64bit x86 only) + avx512f - implementation using AVX512F instruction set (64bit x86 only) + avx512bw - implementation using AVX512F & AVX512BW instruction sets (64bit x86 only) + aarch64_neon - implementation using NEON (Aarch64/64 bit ARMv8 only) + aarch64_neonx2 - implementation using NEON with more unrolling (Aarch64/64 bit ARMv8 only) + powerpc_altivec - implementation using Altivec (PowerPC only) +.sp +Default value: \fBfastest\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_vdev_scheduler\fR (charp) +.ad +.RS 12n +\fBDEPRECATED\fR: This option exists for compatibility with older user +configurations. It does nothing except print a warning to the kernel log if +set. +.sp +.RE + +.sp +.ne 2 +.na +\fBzfs_zevent_cols\fR (int) +.ad +.RS 12n +When zevents are logged to the console use this as the word wrap width. +.sp +Default value: \fB80\fR. +.RE + +.sp +.ne 2 +.na +\fBzfs_zevent_console\fR (int) +.ad +.RS 12n +Log events to the console +.sp +Use \fB1\fR for yes and \fB0\fR for no (default). +.RE + +.sp +.ne 2 +.na +\fBzfs_zevent_len_max\fR (int) +.ad +.RS 12n +Max event queue length. A value of 0 will result in a calculated value which +increases with the number of CPUs in the system (minimum 64 events). 
+Events
+in the queue can be viewed with the \fBzpool events\fR command.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_zil_clean_taskq_maxalloc\fR (int)
+.ad
+.RS 12n
+The maximum number of taskq entries that are allowed to be cached. When this
+limit is exceeded, transaction records (itxs) will be cleaned synchronously.
+.sp
+Default value: \fB1048576\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_zil_clean_taskq_minalloc\fR (int)
+.ad
+.RS 12n
+The number of taskq entries that are pre-populated when the taskq is first
+created and are immediately available for use.
+.sp
+Default value: \fB1024\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzfs_zil_clean_taskq_nthr_pct\fR (int)
+.ad
+.RS 12n
+This controls the number of threads used by the dp_zil_clean_taskq. The default
+value of 100% will create a maximum of one thread per CPU.
+.sp
+Default value: \fB100\fR%.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzil_maxblocksize\fR (int)
+.ad
+.RS 12n
+This sets the maximum block size used by the ZIL. On very fragmented pools,
+lowering this (typically to 36KB) can improve performance.
+.sp
+Default value: \fB131072\fR (128KB).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzil_nocacheflush\fR (int)
+.ad
+.RS 12n
+Disable the cache flush commands that are normally sent to the disk(s) by
+the ZIL after an LWB write has completed. Setting this will cause ZIL
+corruption on power loss if a volatile out-of-order write cache is enabled.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzil_replay_disable\fR (int)
+.ad
+.RS 12n
+Disable intent logging replay. This can be set to allow recovery from a
+corrupted ZIL.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzil_slog_bulk\fR (ulong)
+.ad
+.RS 12n
+Limit SLOG write size per commit executed with synchronous priority.
+Any writes above that will be executed with lower (asynchronous) priority
+to limit potential SLOG device abuse by a single active ZIL writer.
+.sp
+Default value: \fB786,432\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_deadman_log_all\fR (int)
+.ad
+.RS 12n
+If non-zero, the zio deadman will produce debugging messages (see
+\fBzfs_dbgmsg_enable\fR) for all zios, rather than only for leaf
+zios possessing a vdev. This is meant to be used by developers to gain
+diagnostic information for hang conditions which don't involve a mutex
+or other locking primitive; typically conditions in which a thread in
+the zio pipeline is looping indefinitely.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_decompress_fail_fraction\fR (int)
+.ad
+.RS 12n
+If non-zero, this value represents the denominator of the probability that zfs
+should induce a decompression failure. For instance, for a 5% decompression
+failure rate, this value should be set to 20.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_slow_io_ms\fR (int)
+.ad
+.RS 12n
+When an I/O operation takes more than \fBzio_slow_io_ms\fR milliseconds to
+complete, it is marked as a slow I/O. Each slow I/O causes a delay zevent.
+Slow I/O counters can be seen with "zpool status -s".
+.sp
+Default value: \fB30,000\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_dva_throttle_enabled\fR (int)
+.ad
+.RS 12n
+Throttle block allocations in the I/O pipeline. This allows for
+dynamic allocation distribution when devices are imbalanced.
+When enabled, the maximum number of pending allocations per top-level vdev
+is limited by \fBzfs_vdev_queue_depth_pct\fR.
+.sp
+Default value: \fB1\fR.
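+.sp
+As a sketch (Linux module parameter interface assumed), the throttle can be
+temporarily disabled while benchmarking allocation behavior and re-enabled
+afterwards:
+.nf
+  # echo 0 > /sys/module/zfs/parameters/zio_dva_throttle_enabled
+  # echo 1 > /sys/module/zfs/parameters/zio_dva_throttle_enabled
+.fi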
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_requeue_io_start_cut_in_line\fR (int)
+.ad
+.RS 12n
+Prioritize requeued I/O.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzio_taskq_batch_pct\fR (uint)
+.ad
+.RS 12n
+Percentage of online CPUs (or CPU cores, etc.) which will run a worker thread
+for I/O. These workers are responsible for I/O work such as compression and
+checksum calculations. A fractional number of CPUs will be rounded down.
+.sp
+The default value of 75 was chosen to avoid using all CPUs which can result in
+latency issues and inconsistent application performance, especially when high
+compression is enabled.
+.sp
+Default value: \fB75\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_inhibit_dev\fR (uint)
+.ad
+.RS 12n
+Do not create zvol device nodes. This may slightly improve startup time on
+systems with a very large number of zvols.
+.sp
+Use \fB1\fR for yes and \fB0\fR for no (default).
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_major\fR (uint)
+.ad
+.RS 12n
+Major number for zvol block devices.
+.sp
+Default value: \fB230\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_max_discard_blocks\fR (ulong)
+.ad
+.RS 12n
+Discard (aka TRIM) operations done on zvols will be done in batches of this
+many blocks, where block size is determined by the \fBvolblocksize\fR property
+of a zvol.
+.sp
+Default value: \fB16,384\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_prefetch_bytes\fR (uint)
+.ad
+.RS 12n
+When adding a zvol to the system, prefetch \fBzvol_prefetch_bytes\fR
+from the start and end of the volume. Prefetching these regions
+of the volume is desirable because they are likely to be accessed
+immediately by \fBblkid(8)\fR or by the kernel scanning for a partition
+table.
+.sp
+Default value: \fB131,072\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_request_sync\fR (uint)
+.ad
+.RS 12n
+When processing I/O requests for a zvol, submit them synchronously. This
+effectively limits the queue depth to 1 for each I/O submitter. When set
+to 0 requests are handled asynchronously by a thread pool. The number of
+requests which can be handled concurrently is controlled by \fBzvol_threads\fR.
+.sp
+Default value: \fB0\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_threads\fR (uint)
+.ad
+.RS 12n
+Max number of threads which can handle zvol I/O requests concurrently.
+.sp
+Default value: \fB32\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzvol_volmode\fR (uint)
+.ad
+.RS 12n
+Defines zvol block device behaviour when \fBvolmode\fR is set to \fBdefault\fR.
+Valid values are \fB1\fR (full), \fB2\fR (dev) and \fB3\fR (none).
+.sp
+Default value: \fB1\fR.
+.RE
+
+.SH ZFS I/O SCHEDULER
+ZFS issues I/O operations to leaf vdevs to satisfy and complete I/Os.
+The I/O scheduler determines when and in what order those operations are
+issued. The I/O scheduler divides operations into five I/O classes
+prioritized in the following order: sync read, sync write, async read,
+async write, and scrub/resilver. Each queue defines the minimum and
+maximum number of concurrent operations that may be issued to the
+device. In addition, the device has an aggregate maximum,
+\fBzfs_vdev_max_active\fR. Note that the sum of the per-queue minimums
+must not exceed the aggregate maximum. If the sum of the per-queue
+maximums exceeds the aggregate maximum, then the number of active I/Os
+may reach \fBzfs_vdev_max_active\fR, in which case no further I/Os will
+be issued regardless of whether all per-queue minimums have been met.
+.sp
+For many physical devices, throughput increases with the number of
+concurrent operations, but latency typically suffers.
Further, physical +devices typically have a limit at which more concurrent operations have no +effect on throughput or can actually cause it to decrease. +.sp +The scheduler selects the next operation to issue by first looking for an +I/O class whose minimum has not been satisfied. Once all are satisfied and +the aggregate maximum has not been hit, the scheduler looks for classes +whose maximum has not been satisfied. Iteration through the I/O classes is +done in the order specified above. No further operations are issued if the +aggregate maximum number of concurrent operations has been hit or if there +are no operations queued for an I/O class that has not hit its maximum. +Every time an I/O is queued or an operation completes, the I/O scheduler +looks for new operations to issue. +.sp +In general, smaller max_active's will lead to lower latency of synchronous +operations. Larger max_active's may lead to higher overall throughput, +depending on underlying storage. +.sp +The ratio of the queues' max_actives determines the balance of performance +between reads, writes, and scrubs. E.g., increasing +\fBzfs_vdev_scrub_max_active\fR will cause the scrub or resilver to complete +more quickly, but reads and writes to have higher latency and lower throughput. +.sp +All I/O classes have a fixed maximum number of outstanding operations +except for the async write class. Asynchronous writes represent the data +that is committed to stable storage during the syncing stage for +transaction groups. Transaction groups enter the syncing state +periodically so the number of queued async writes will quickly burst up +and then bleed down to zero. Rather than servicing them as quickly as +possible, the I/O scheduler changes the maximum number of active async +write I/Os according to the amount of dirty data in the pool. Since +both throughput and latency typically increase with the number of +concurrent operations issued to physical devices, reducing the +burstiness in the number of concurrent operations also stabilizes the +response time of operations from other -- and in particular synchronous +-- queues. In broad strokes, the I/O scheduler will issue more +concurrent operations from the async write queue as there's more dirty +data in the pool. +.sp +Async Writes +.sp +The number of concurrent operations issued for the async write I/O class +follows a piece-wise linear function defined by a few adjustable points. +.nf + + | o---------| <-- zfs_vdev_async_write_max_active + ^ | /^ | + | | / | | +active | / | | + I/O | / | | +count | / | | + | / | | + |-------o | | <-- zfs_vdev_async_write_min_active + 0|_______^______|_________| + 0% | | 100% of zfs_dirty_data_max + | | + | `-- zfs_vdev_async_write_active_max_dirty_percent + `--------- zfs_vdev_async_write_active_min_dirty_percent + +.fi +Until the amount of dirty data exceeds a minimum percentage of the dirty +data allowed in the pool, the I/O scheduler will limit the number of +concurrent operations to the minimum. As that threshold is crossed, the +number of concurrent operations issued increases linearly to the maximum at +the specified maximum percentage of the dirty data allowed in the pool. +.sp +Ideally, the amount of dirty data on a busy pool will stay in the sloped +part of the function between \fBzfs_vdev_async_write_active_min_dirty_percent\fR +and \fBzfs_vdev_async_write_active_max_dirty_percent\fR. 
If it exceeds the +maximum percentage, this indicates that the rate of incoming data is +greater than the rate that the backend storage can handle. In this case, we +must further throttle incoming writes, as described in the next section. + +.SH ZFS TRANSACTION DELAY +We delay transactions when we've determined that the backend storage +isn't able to accommodate the rate of incoming writes. +.sp +If there is already a transaction waiting, we delay relative to when +that transaction will finish waiting. This way the calculated delay time +is independent of the number of threads concurrently executing +transactions. +.sp +If we are the only waiter, wait relative to when the transaction +started, rather than the current time. This credits the transaction for +"time already served", e.g. reading indirect blocks. +.sp +The minimum time for a transaction to take is calculated as: +.nf + min_time = zfs_delay_scale * (dirty - min) / (max - dirty) + min_time is then capped at 100 milliseconds. +.fi +.sp +The delay has two degrees of freedom that can be adjusted via tunables. The +percentage of dirty data at which we start to delay is defined by +\fBzfs_delay_min_dirty_percent\fR. This should typically be at or above +\fBzfs_vdev_async_write_active_max_dirty_percent\fR so that we only start to +delay after writing at full speed has failed to keep up with the incoming write +rate. The scale of the curve is defined by \fBzfs_delay_scale\fR. Roughly speaking, +this variable determines the amount of delay at the midpoint of the curve. +.sp +.nf +delay + 10ms +-------------------------------------------------------------*+ + | *| + 9ms + *+ + | *| + 8ms + *+ + | * | + 7ms + * + + | * | + 6ms + * + + | * | + 5ms + * + + | * | + 4ms + * + + | * | + 3ms + * + + | * | + 2ms + (midpoint) * + + | | ** | + 1ms + v *** + + | zfs_delay_scale ----------> ******** | + 0 +-------------------------------------*********----------------+ + 0% <- zfs_dirty_data_max -> 100% +.fi +.sp +Note that since the delay is added to the outstanding time remaining on the +most recent transaction, the delay is effectively the inverse of IOPS. +Here the midpoint of 500us translates to 2000 IOPS. The shape of the curve +was chosen such that small changes in the amount of accumulated dirty data +in the first 3/4 of the curve yield relatively small differences in the +amount of delay. +.sp +The effects can be easier to understand when the amount of delay is +represented on a log scale: +.sp +.nf +delay +100ms +-------------------------------------------------------------++ + + + + | | + + *+ + 10ms + *+ + + ** + + | (midpoint) ** | + + | ** + + 1ms + v **** + + + zfs_delay_scale ----------> ***** + + | **** | + + **** + +100us + ** + + + * + + | * | + + * + + 10us + * + + + + + | | + + + + +--------------------------------------------------------------+ + 0% <- zfs_dirty_data_max -> 100% +.fi +.sp +Note here that only as the amount of dirty data approaches its limit does +the delay start to increase rapidly. The goal of a properly tuned system +should be to keep the amount of dirty data out of that range by first +ensuring that the appropriate limits are set for the I/O scheduler to reach +optimal throughput on the backend storage, and then by changing the value +of \fBzfs_delay_scale\fR to increase the steepness of the curve. 
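+.sp
+A worked example with illustrative numbers: assume \fBzfs_dirty_data_max\fR
+is 4GB, \fBzfs_delay_min_dirty_percent\fR is 60% and \fBzfs_delay_scale\fR is
+500,000. A transaction entering when 3GB (75%) is dirty would be delayed by
+roughly:
+.nf
+  min_time = zfs_delay_scale * (dirty - min) / (max - dirty)
+           = 500,000 * (3GB - 2.4GB) / (4GB - 3GB)
+           = 300,000ns = 300us
+.fi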
diff --git a/man/man5/zpool-features.5 b/man/man5/zpool-features.5 new file mode 100644 index 000000000000..f65ef40a7e79 --- /dev/null +++ b/man/man5/zpool-features.5 @@ -0,0 +1,985 @@ +'\" te +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" The contents of this file are subject to the terms of the Common Development +.\" and Distribution License (the "License"). You may not use this file except +.\" in compliance with the License. You can obtain a copy of the license at +.\" usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing. +.\" +.\" See the License for the specific language governing permissions and +.\" limitations under the License. When distributing Covered Code, include this +.\" CDDL HEADER in each file and include the License file at +.\" usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this +.\" CDDL HEADER, with the fields enclosed by brackets "[]" replaced with your +.\" own identifying information: +.\" Portions Copyright [yyyy] [name of copyright owner] +.\" Copyright (c) 2019, Klara Inc. +.\" Copyright (c) 2019, Allan Jude +.TH ZPOOL-FEATURES 5 "Jun 8, 2018" +.SH NAME +zpool\-features \- ZFS pool feature descriptions +.SH DESCRIPTION +.sp +.LP +ZFS pool on\-disk format versions are specified via "features" which replace +the old on\-disk format numbers (the last supported on\-disk format number is +28). To enable a feature on a pool use the \fBupgrade\fR subcommand of the +zpool(8) command, or set the \fBfeature@\fR\fIfeature_name\fR property +to \fBenabled\fR. +.sp +.LP +The pool format does not affect file system version compatibility or the ability +to send file systems between pools. +.sp +.LP +Since most features can be enabled independently of each other the on\-disk +format of the pool is specified by the set of all features marked as +\fBactive\fR on the pool. If the pool was created by another software version +this set may include unsupported features. +.SS "Identifying features" +.sp +.LP +Every feature has a GUID of the form \fIcom.example:feature_name\fR. The +reversed DNS name ensures that the feature's GUID is unique across all ZFS +implementations. When unsupported features are encountered on a pool they will +be identified by their GUIDs. Refer to the documentation for the ZFS +implementation that created the pool for information about those features. +.sp +.LP +Each supported feature also has a short name. By convention a feature's short +name is the portion of its GUID which follows the ':' (e.g. +\fIcom.example:feature_name\fR would have the short name \fIfeature_name\fR), +however a feature's short name may differ across ZFS implementations if +following the convention would result in name conflicts. +.SS "Feature states" +.sp +.LP +Features can be in one of three states: +.sp +.ne 2 +.na +\fBactive\fR +.ad +.RS 12n +This feature's on\-disk format changes are in effect on the pool. Support for +this feature is required to import the pool in read\-write mode. If this +feature is not read-only compatible, support is also required to import the pool +in read\-only mode (see "Read\-only compatibility"). +.RE + +.sp +.ne 2 +.na +\fBenabled\fR +.ad +.RS 12n +An administrator has marked this feature as enabled on the pool, but the +feature's on\-disk format changes have not been made yet. 
The pool can still be +imported by software that does not support this feature, but changes may be made +to the on\-disk format at any time which will move the feature to the +\fBactive\fR state. Some features may support returning to the \fBenabled\fR +state after becoming \fBactive\fR. See feature\-specific documentation for +details. +.RE + +.sp +.ne 2 +.na +\fBdisabled\fR +.ad +.RS 12n +This feature's on\-disk format changes have not been made and will not be made +unless an administrator moves the feature to the \fBenabled\fR state. Features +cannot be disabled once they have been enabled. +.RE + +.sp +.LP +The state of supported features is exposed through pool properties of the form +\fIfeature@short_name\fR. +.SS "Read\-only compatibility" +.sp +.LP +Some features may make on\-disk format changes that do not interfere with other +software's ability to read from the pool. These features are referred to as +"read\-only compatible". If all unsupported features on a pool are read\-only +compatible, the pool can be imported in read\-only mode by setting the +\fBreadonly\fR property during import (see zpool(8) for details on +importing pools). +.SS "Unsupported features" +.sp +.LP +For each unsupported feature enabled on an imported pool a pool property +named \fIunsupported@feature_name\fR will indicate why the import was allowed +despite the unsupported feature. Possible values for this property are: + +.sp +.ne 2 +.na +\fBinactive\fR +.ad +.RS 12n +The feature is in the \fBenabled\fR state and therefore the pool's on\-disk +format is still compatible with software that does not support this feature. +.RE + +.sp +.ne 2 +.na +\fBreadonly\fR +.ad +.RS 12n +The feature is read\-only compatible and the pool has been imported in +read\-only mode. +.RE + +.SS "Feature dependencies" +.sp +.LP +Some features depend on other features being enabled in order to function +properly. Enabling a feature will automatically enable any features it +depends on. +.SH FEATURES +.sp +.LP +The following features are supported on this system: + +.sp +.ne 2 +.na +\fBallocation_classes\fR +.ad +.RS 4n +.TS +l l . +GUID org.zfsonlinux:allocation_classes +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature enables support for separate allocation classes. + +This feature becomes \fBactive\fR when a dedicated allocation class vdev +(dedup or special) is created with the \fBzpool create\fR or \fBzpool add\fR +subcommands. With device removal, it can be returned to the \fBenabled\fR +state if all the dedicated allocation class vdevs are removed. +.RE + +.sp +.ne 2 +.na +\fBasync_destroy\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:async_destroy +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +Destroying a file system requires traversing all of its data in order to +return its used space to the pool. Without \fBasync_destroy\fR the file system +is not fully removed until all space has been reclaimed. If the destroy +operation is interrupted by a reboot or power outage the next attempt to open +the pool will need to complete the destroy operation synchronously. + +When \fBasync_destroy\fR is enabled the file system's data will be reclaimed +by a background process, allowing the destroy operation to complete without +traversing the entire file system. The background process is able to resume +interrupted destroys after the pool has been opened, eliminating the need +to finish interrupted destroys as part of the open operation. 
The amount +of space remaining to be reclaimed by the background process is available +through the \fBfreeing\fR property. + +This feature is only \fBactive\fR while \fBfreeing\fR is non\-zero. +.RE + +.sp +.ne 2 +.na +\fBbookmarks\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:bookmarks +READ\-ONLY COMPATIBLE yes +DEPENDENCIES extensible_dataset +.TE + +This feature enables use of the \fBzfs bookmark\fR subcommand. + +This feature is \fBactive\fR while any bookmarks exist in the pool. +All bookmarks in the pool can be listed by running +\fBzfs list -t bookmark -r \fIpoolname\fR\fR. +.RE + +.sp +.ne 2 +.na +\fBbookmark_v2\fR +.ad +.RS 4n +.TS +l l . +GUID com.datto:bookmark_v2 +READ\-ONLY COMPATIBLE no +DEPENDENCIES bookmark, extensible_dataset +.TE + +This feature enables the creation and management of larger bookmarks which are +needed for other features in ZFS. + +This feature becomes \fBactive\fR when a v2 bookmark is created and will be +returned to the \fBenabled\fR state when all v2 bookmarks are destroyed. +.RE + +.sp +.ne 2 +.na +\fBbookmark_written\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:bookmark_written +READ\-ONLY COMPATIBLE no +DEPENDENCIES bookmark, extensible_dataset, bookmark_v2 +.TE + +This feature enables additional bookmark accounting fields, enabling the +written# property (space written since a bookmark) and estimates of +send stream sizes for incrementals from bookmarks. + +This feature becomes \fBactive\fR when a bookmark is created and will be +returned to the \fBenabled\fR state when all bookmarks with these fields are destroyed. +.RE + +.sp +.ne 2 +.na +\fBdevice_rebuild\fR +.ad +.RS 4n +.TS +l l . +GUID org.openzfs:device_rebuild +READ\-ONLY COMPATIBLE yes +DEPENDENCIES none +.TE + +This feature enables the ability for the \fBzpool attach\fR and \fBzpool +replace\fR subcommands to perform sequential reconstruction (instead of +healing reconstruction) when resilvering. + +Sequential reconstruction resilvers a device in LBA order without immediately +verifying the checksums. Once complete a scrub is started which then verifies +the checksums. This approach allows full redundancy to be restored to the pool +in the minimum amount of time. This two phase approach will take longer than a +healing resilver when the time to verify the checksums is included. However, +unless there is additional pool damage no checksum errors should be reported +by the scrub. This feature is incompatible with raidz configurations. + +This feature becomes \fBactive\fR while a sequential resilver is in progress, +and returns to \fBenabled\fR when the resilver completes. +.RE + +.sp +.ne 2 +.na +\fBdevice_removal\fR +.ad +.RS 4n +.TS +l l . +GUID com.delphix:device_removal +READ\-ONLY COMPATIBLE no +DEPENDENCIES none +.TE + +This feature enables the \fBzpool remove\fR subcommand to remove top-level +vdevs, evacuating them to reduce the total size of the pool. + +This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is used +on a top-level vdev, and will never return to being \fBenabled\fR. +.RE + +.sp +.ne 2 +.na +\fBedonr\fR +.ad +.RS 4n +.TS +l l . +GUID org.illumos:edonr +READ\-ONLY COMPATIBLE no +DEPENDENCIES extensible_dataset +.TE + +This feature enables the use of the Edon-R hash algorithm for checksum, +including for nopwrite (if compression is also enabled, an overwrite of +a block whose checksum matches the data being written will be ignored). +In an abundance of caution, Edon-R requires verification when used with +dedup: \fBzfs set dedup=edonr,verify\fR. 
+See \fBzfs\fR(8).
+
+Edon-R is a very high-performance hash algorithm that was part
+of the NIST SHA-3 competition. It provides extremely high hash
+performance (over 350% faster than SHA-256), but was not selected
+because of its unsuitability as a general purpose secure hash algorithm.
+This implementation utilizes the new salted checksumming functionality
+in ZFS, which means that the checksum is pre-seeded with a secret
+256-bit random key (stored on the pool) before being fed the data block
+to be checksummed. Thus the produced checksums are unique to a given
+pool.
+
+When the \fBedonr\fR feature is set to \fBenabled\fR, the administrator
+can turn on the \fBedonr\fR checksum on any dataset using the
+\fBzfs set checksum=edonr\fR command. See zfs(8). This feature becomes
+\fBactive\fR once a \fBchecksum\fR property has been set to \fBedonr\fR,
+and will return to being \fBenabled\fR once all filesystems that have
+ever had their checksum set to \fBedonr\fR are destroyed.
+
+FreeBSD does not support the \fBedonr\fR feature.
+.RE
+
+.sp
+.ne 2
+.na
+\fBembedded_data\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:embedded_data
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+This feature improves the performance and compression ratio of
+highly-compressible blocks. Blocks whose contents can compress to 112 bytes
+or smaller can take advantage of this feature.
+
+When this feature is enabled, the contents of highly-compressible blocks are
+stored in the block "pointer" itself (a misnomer in this case, as it contains
+the compressed data, rather than a pointer to its location on disk). Thus
+the space of the block (one sector, typically 512 bytes or 4KB) is saved,
+and no additional i/o is needed to read and write the data block.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will
+never return to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBempty_bpobj\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:empty_bpobj
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+This feature increases the performance of creating and using a large
+number of snapshots of a single filesystem or volume, and also reduces
+the disk space required.
+
+When there are many snapshots, each snapshot uses many Block Pointer
+Objects (bpobj's) to track blocks associated with that snapshot.
+However, in common use cases, most of these bpobj's are empty. This
+feature allows us to create each bpobj on-demand, thus eliminating the
+empty bpobjs.
+
+This feature is \fBactive\fR while there are any filesystems, volumes,
+or snapshots which were created after enabling this feature.
+.RE
+
+.sp
+.ne 2
+.na
+\fBenabled_txg\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:enabled_txg
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+Once this feature is enabled ZFS records the transaction group number
+in which new features are enabled. This has no user-visible impact,
+but other features may depend on this feature.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will
+never return to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBencryption\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.datto:encryption
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	bookmark_v2, extensible_dataset
+.TE
+
+This feature enables the creation and management of natively encrypted datasets.
+
+This feature becomes \fBactive\fR when an encrypted dataset is created and will
+be returned to the \fBenabled\fR state when all datasets that use this feature
+are destroyed.
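+
+For example (pool and dataset names hypothetical), the feature is activated
+by creating an encrypted dataset:
+.nf
+  # zpool set feature@encryption=enabled tank
+  # zfs create -o encryption=on -o keyformat=passphrase tank/secure
+.fi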
+.RE
+
+.sp
+.ne 2
+.na
+\fBextensible_dataset\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:extensible_dataset
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+This feature allows more flexible use of internal ZFS data structures,
+and exists for other features to depend on.
+
+This feature will be \fBactive\fR when the first dependent feature uses it,
+and will be returned to the \fBenabled\fR state when all datasets that use
+this feature are destroyed.
+.RE
+
+.sp
+.ne 2
+.na
+\fBfilesystem_limits\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.joyent:filesystem_limits
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	extensible_dataset
+.TE
+
+This feature enables filesystem and snapshot limits. These limits can be used
+to control how many filesystems and/or snapshots can be created at the point in
+the tree on which the limits are set.
+
+This feature is \fBactive\fR once either of the limit properties has been
+set on a dataset. Once activated the feature is never deactivated.
+.RE
+
+.sp
+.ne 2
+.na
+\fBhole_birth\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:hole_birth
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	enabled_txg
+.TE
+
+This feature has/had bugs, the result of which is that, if you do a
+\fBzfs send -i\fR (or \fB-R\fR, since it uses \fB-i\fR) from an affected
+dataset, the receiver will not see any checksum or other errors, but the
+resulting destination snapshot will not match the source. Its use by
+\fBzfs send -i\fR has been disabled by default. See the
+\fBsend_holes_without_birth_time\fR module parameter in
+zfs-module-parameters(5).
+
+This feature improves performance of incremental sends (\fBzfs send -i\fR)
+and receives for objects with many holes. The most common case of
+hole-filled objects is zvols.
+
+An incremental send stream from snapshot \fBA\fR to snapshot \fBB\fR
+contains information about every block that changed between \fBA\fR and
+\fBB\fR. Blocks which did not change between those snapshots can be
+identified and omitted from the stream using a piece of metadata called
+the 'block birth time', but birth times are not recorded for holes (blocks
+filled only with zeroes). Since holes created after \fBA\fR cannot be
+distinguished from holes created before \fBA\fR, information about every
+hole in the entire filesystem or zvol is included in the send stream.
+
+For workloads where holes are rare this is not a problem. However, when
+incrementally replicating filesystems or zvols with many holes (for
+example a zvol formatted with another filesystem) a lot of time will
+be spent sending and receiving unnecessary information about holes that
+already exist on the receiving side.
+
+Once the \fBhole_birth\fR feature has been enabled the block birth times
+of all new holes will be recorded. Incremental sends between snapshots
+created after this feature is enabled will use this new metadata to avoid
+sending information about holes that already exist on the receiving side.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will
+never return to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBlarge_blocks\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	org.open-zfs:large_blocks
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	extensible_dataset
+.TE
+
+The \fBlarge_blocks\fR feature allows the record size on a dataset to be
+set larger than 128KB.
+
+This feature becomes \fBactive\fR once a dataset contains a file with
+a block size larger than 128KB, and will return to being \fBenabled\fR once all
+filesystems that have ever had their recordsize larger than 128KB are destroyed.
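+
+For example (names hypothetical), the feature becomes \fBactive\fR once a
+dataset with a large recordsize writes a block larger than 128KB:
+.nf
+  # zpool set feature@large_blocks=enabled tank
+  # zfs set recordsize=1M tank/media
+.fi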
+.RE
+
+.sp
+.ne 2
+.na
+\fBlarge_dnode\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	org.zfsonlinux:large_dnode
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	extensible_dataset
+.TE
+
+The \fBlarge_dnode\fR feature allows the size of dnodes in a dataset to be
+set larger than 512B.
+
+This feature becomes \fBactive\fR once a dataset contains an object with
+a dnode larger than 512B, which occurs as a result of setting the
+\fBdnodesize\fR dataset property to a value other than \fBlegacy\fR. The
+feature will return to being \fBenabled\fR once all filesystems that
+have ever contained a dnode larger than 512B are destroyed. Large dnodes
+allow more data to be stored in the bonus buffer, thus potentially
+improving performance by avoiding the use of spill blocks.
+.RE
+
+.sp
+.ne 2
+.na
+\fBlivelist\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:livelist
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	none
+.TE
+
+This feature allows clones to be deleted faster than the traditional method
+when a large number of random/sparse writes have been made to the clone.
+All blocks allocated and freed after a clone is created are tracked by
+the clone's livelist which is referenced during the deletion of the clone.
+The feature is activated when a clone is created and remains active until all
+clones have been destroyed.
+.RE
+
+.sp
+.ne 2
+.na
+\fBlog_spacemap\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.delphix:log_spacemap
+READ\-ONLY COMPATIBLE	yes
+DEPENDENCIES	com.delphix:spacemap_v2
+.TE
+
+This feature improves performance for heavily-fragmented pools,
+especially when workloads are heavy in random-writes. It does so by
+logging all the metaslab changes on a single spacemap every TXG
+instead of scattering multiple writes to all the metaslab spacemaps.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will never
+return to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBlz4_compress\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	org.illumos:lz4_compress
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+\fBlz4\fR is a high-performance real-time compression algorithm that
+features significantly faster compression and decompression as well as a
+higher compression ratio than the older \fBlzjb\fR compression.
+Typically, \fBlz4\fR compression is approximately 50% faster on
+compressible data and 200% faster on incompressible data than
+\fBlzjb\fR. It is also approximately 80% faster on decompression, while
+giving approximately 10% better compression ratio.
+
+When the \fBlz4_compress\fR feature is set to \fBenabled\fR, the
+administrator can turn on \fBlz4\fR compression on any dataset on the
+pool using the zfs(8) command. Please note that doing so will
+immediately activate the \fBlz4_compress\fR feature on the underlying
+pool. Also, all newly written metadata will be compressed with the
+\fBlz4\fR algorithm. Since this feature is not read-only compatible,
+this operation will render the pool unimportable on systems without
+support for the \fBlz4_compress\fR feature.
+
+Booting off of \fBlz4\fR-compressed root pools is supported.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will
+never return to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBmulti_vdev_crash_dump\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID	com.joyent:multi_vdev_crash_dump
+READ\-ONLY COMPATIBLE	no
+DEPENDENCIES	none
+.TE
+
+This feature allows a dump device to be configured with a pool comprised
+of multiple vdevs. Those vdevs may be arranged in any mirrored or raidz
+configuration.
+
+When the \fBmulti_vdev_crash_dump\fR feature is set to \fBenabled\fR,
+the administrator can use the \fBdumpadm\fR(1M) command to configure a
+dump device on a pool composed of multiple vdevs.
+
+Under Linux this feature is registered for compatibility but not used.
+New pools created under Linux will have the feature \fBenabled\fR but
+will never transition to \fBactive\fR. This functionality is not
+required in order to support crash dumps under Linux. Existing pools
+where this feature is \fBactive\fR can be imported.
+.RE
+
+.sp
+.ne 2
+.na
+\fBobsolete_counts\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:obsolete_counts
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES device_removal
+.TE
+
+This feature is an enhancement of device_removal, which will over time
+reduce the memory used to track removed devices. When indirect blocks
+are freed or remapped, we note that their part of the indirect mapping
+is "obsolete", i.e. no longer needed.
+
+This feature becomes \fBactive\fR when the \fBzpool remove\fR subcommand is
+used on a top-level vdev, and will never return to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBproject_quota\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.zfsonlinux:project_quota
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES extensible_dataset
+.TE
+
+This feature allows administrators to account space and object usage
+against the project identifier (ID).
+
+The project ID is a new object-based attribute. When upgrading an existing
+filesystem, objects without a project ID attribute will be assigned a zero
+project ID. After this feature is enabled, newly created objects will inherit
+their parent directory's project ID if the parent's inherit flag is set (via
+\fBchattr +/-P\fR or \fBzfs project [-s|-C]\fR). Otherwise, the new object's
+project ID will be set to zero. An object's project ID can be changed at
+any time by the owner (or privileged user) via \fBchattr -p $prjid\fR or
+\fBzfs project -p $prjid\fR.
+
+This feature will become \fBactive\fR as soon as it is enabled and will never
+return to being \fBenabled\fR. Each filesystem will be upgraded automatically
+when remounted or when a new file is created under that filesystem. The
+upgrade can also be triggered on filesystems via `zfs set version=current
+<pool/fs>`. The upgrade process runs in the background and may take a while
+to complete for filesystems containing a large number of files.
+.RE
+
+.sp
+.ne 2
+.na
+\fBredaction_bookmarks\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:redaction_bookmarks
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES bookmarks, extensible_dataset
+.TE
+
+This feature enables the use of redacted \fBzfs send\fR. Redacted
+\fBzfs send\fR creates redaction bookmarks, which store the list of blocks
+redacted by the send that created them. For more information about redacted
+send, see \fBzfs\fR(8).
+.RE
+
+.sp
+.ne 2
+.na
+\fBredacted_datasets\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:redacted_datasets
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES extensible_dataset
+.TE
+
+This feature enables the receiving of redacted \fBzfs send\fR streams.
+Redacted \fBzfs send\fR streams create redacted datasets when received.
+These datasets are missing some of their blocks, and so cannot be safely
+mounted, and their contents cannot be safely read. For more information
+about redacted receive, see \fBzfs\fR(8).
+.RE
+
+.sp
+.ne 2
+.na
+\fBresilver_defer\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.datto:resilver_defer
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature allows ZFS to postpone new resilvers if an existing one is already
+in progress. Without this feature, any new resilvers will cause the currently
+running one to be immediately restarted from the beginning.
+
+This feature becomes \fBactive\fR once a resilver has been deferred, and
+returns to being \fBenabled\fR when the deferred resilver begins.
+.RE
+
+.sp
+.ne 2
+.na
+\fBsha512\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.illumos:sha512
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES extensible_dataset
+.TE
+
+This feature enables the use of the SHA-512/256 truncated hash algorithm
+(FIPS 180-4) for checksum and dedup. The native 64-bit arithmetic of
+SHA-512 provides an approximate 50% performance boost over SHA-256 on
+64-bit hardware and is thus a good minimum-change replacement candidate
+for systems where hash performance is important but which cannot, for
+whatever reason, utilize the faster \fBskein\fR and \fBedonr\fR algorithms.
+
+When the \fBsha512\fR feature is set to \fBenabled\fR, the administrator
+can turn on the \fBsha512\fR checksum on any dataset using
+\fBzfs set checksum=sha512\fR. See zfs(8). This feature becomes
+\fBactive\fR once a \fBchecksum\fR property has been set to \fBsha512\fR,
+and will return to being \fBenabled\fR once all filesystems that have
+ever had their checksum set to \fBsha512\fR are destroyed.
+.RE
+
+.sp
+.ne 2
+.na
+\fBskein\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.illumos:skein
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES extensible_dataset
+.TE
+
+This feature enables the use of the Skein hash algorithm for checksum
+and dedup. Skein is a high-performance secure hash algorithm that was a
+finalist in the NIST SHA-3 competition. It provides a very high security
+margin and high performance on 64-bit hardware (80% faster than
+SHA-256). This implementation also utilizes the new salted checksumming
+functionality in ZFS, which means that the checksum is pre-seeded with a
+secret 256-bit random key (stored on the pool) before being fed the data
+block to be checksummed. Thus the produced checksums are unique to a
+given pool, preventing hash collision attacks on systems with dedup.
+
+When the \fBskein\fR feature is set to \fBenabled\fR, the administrator
+can turn on the \fBskein\fR checksum on any dataset using
+\fBzfs set checksum=skein\fR. See zfs(8). This feature becomes
+\fBactive\fR once a \fBchecksum\fR property has been set to \fBskein\fR,
+and will return to being \fBenabled\fR once all filesystems that have
+ever had their checksum set to \fBskein\fR are destroyed.
+.RE
+
+.sp
+.ne 2
+.na
+\fBspacemap_histogram\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:spacemap_histogram
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature allows ZFS to maintain more information about how free space
+is organized within the pool. If this feature is \fBenabled\fR, ZFS will
+set this feature to \fBactive\fR when a new space map object is created or
+an existing space map is upgraded to the new format. Once the feature is
+\fBactive\fR, it will remain in that state until the pool is destroyed.
+.RE
+
+.sp
+.ne 2
+.na
+\fBspacemap_v2\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:spacemap_v2
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature enables the use of the new space map encoding which
+consists of two words (instead of one) whenever it is advantageous.
+The new encoding allows space maps to represent large regions of
+space more efficiently on-disk while also increasing their maximum
+addressable offset.
+
+This feature becomes \fBactive\fR once it is \fBenabled\fR, and never
+returns to being \fBenabled\fR.
+.RE
+
+.sp
+.ne 2
+.na
+\fBuserobj_accounting\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.zfsonlinux:userobj_accounting
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES extensible_dataset
+.TE
+
+This feature allows administrators to account object usage information
+by user and group.
+
+This feature becomes \fBactive\fR as soon as it is enabled and will never
+return to being \fBenabled\fR. Each filesystem will be upgraded automatically
+when remounted, or when new files are created under that filesystem.
+The upgrade can also be started manually on filesystems by running
+`zfs set version=current <pool/fs>`. The upgrade process runs in the
+background and may take a while to complete for filesystems containing a
+large number of files.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzpool_checkpoint\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID com.delphix:zpool_checkpoint
+READ\-ONLY COMPATIBLE yes
+DEPENDENCIES none
+.TE
+
+This feature enables the \fBzpool checkpoint\fR subcommand that can
+checkpoint the state of the pool at the time it is issued and later
+rewind back to it or discard it.
+
+This feature becomes \fBactive\fR when the \fBzpool checkpoint\fR subcommand
+is used to checkpoint the pool.
+The feature will only return to being \fBenabled\fR when the pool
+is rewound or the checkpoint has been discarded.
+.RE
+
+.sp
+.ne 2
+.na
+\fBzstd_compress\fR
+.ad
+.RS 4n
+.TS
+l l .
+GUID org.freebsd:zstd_compress
+READ\-ONLY COMPATIBLE no
+DEPENDENCIES extensible_dataset
+.TE
+
+\fBzstd\fR is a high-performance compression algorithm that features a
+combination of high compression ratios and high speed. Compared to \fBgzip\fR,
+\fBzstd\fR offers slightly better compression at much higher speeds. Compared
+to \fBlz4\fR, \fBzstd\fR offers much better compression while being only
+modestly slower. Typically, \fBzstd\fR compression speed ranges from 250 to 500
+MB/s per thread and decompression speed is over 1 GB/s per thread.
+
+When the \fBzstd\fR feature is set to \fBenabled\fR, the administrator can turn
+on \fBzstd\fR compression of any dataset by running
+`zfs set compress=zstd <pool/fs>`.
+
+This feature becomes \fBactive\fR once a \fBcompress\fR property has been set to
+\fBzstd\fR, and will return to being \fBenabled\fR once all filesystems that
+have ever had their compress property set to \fBzstd\fR are destroyed.
+
+Booting off of \fBzstd\fR-compressed root pools is not yet supported.
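+
+For example, assuming a pool named \fBtank\fR and a dataset \fBtank/data\fR
+(names are illustrative), \fBzstd\fR compression might be enabled as follows:
+.sp
+.nf
+# zpool set feature@zstd_compress=enabled tank
+# zfs set compress=zstd tank/data
+.fi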
+.RE + +.SH "SEE ALSO" +zpool(8) diff --git a/man/man8/.gitignore b/man/man8/.gitignore new file mode 100644 index 000000000000..f2fc702147e9 --- /dev/null +++ b/man/man8/.gitignore @@ -0,0 +1,2 @@ +/zed.8 +/zfs-mount-generator.8 diff --git a/man/man8/Makefile.am b/man/man8/Makefile.am new file mode 100644 index 000000000000..07f6aefa68e6 --- /dev/null +++ b/man/man8/Makefile.am @@ -0,0 +1,101 @@ +include $(top_srcdir)/config/Substfiles.am + +dist_man_MANS = \ + fsck.zfs.8 \ + mount.zfs.8 \ + vdev_id.8 \ + zdb.8 \ + zfs.8 \ + zfsconcepts.8 \ + zfsprops.8 \ + zfs-allow.8 \ + zfs-bookmark.8 \ + zfs-change-key.8 \ + zfs-clone.8 \ + zfs-create.8 \ + zfs-destroy.8 \ + zfs-diff.8 \ + zfs-get.8 \ + zfs-groupspace.8 \ + zfs-hold.8 \ + zfs-inherit.8 \ + zfs-jail.8 \ + zfs-list.8 \ + zfs-load-key.8 \ + zfs-mount.8 \ + zfs-program.8 \ + zfs-project.8 \ + zfs-projectspace.8 \ + zfs-promote.8 \ + zfs-receive.8 \ + zfs-recv.8 \ + zfs-redact.8 \ + zfs-release.8 \ + zfs-rename.8 \ + zfs-rollback.8 \ + zfs-send.8 \ + zfs-set.8 \ + zfs-share.8 \ + zfs-snapshot.8 \ + zfs-unallow.8 \ + zfs-unjail.8 \ + zfs-unload-key.8 \ + zfs-unmount.8 \ + zfs-upgrade.8 \ + zfs-userspace.8 \ + zfs-wait.8 \ + zfs_ids_to_path.8 \ + zgenhostid.8 \ + zinject.8 \ + zpool.8 \ + zpoolconcepts.8 \ + zpoolprops.8 \ + zpool-add.8 \ + zpool-attach.8 \ + zpool-checkpoint.8 \ + zpool-clear.8 \ + zpool-create.8 \ + zpool-destroy.8 \ + zpool-detach.8 \ + zpool-events.8 \ + zpool-export.8 \ + zpool-get.8 \ + zpool-history.8 \ + zpool-import.8 \ + zpool-initialize.8 \ + zpool-iostat.8 \ + zpool-labelclear.8 \ + zpool-list.8 \ + zpool-offline.8 \ + zpool-online.8 \ + zpool-reguid.8 \ + zpool-remove.8 \ + zpool-reopen.8 \ + zpool-replace.8 \ + zpool-resilver.8 \ + zpool-scrub.8 \ + zpool-set.8 \ + zpool-split.8 \ + zpool-status.8 \ + zpool-sync.8 \ + zpool-trim.8 \ + zpool-upgrade.8 \ + zpool-wait.8 \ + zstream.8 \ + zstreamdump.8 + +nodist_man_MANS = \ + zed.8 \ + zfs-mount-generator.8 + +SUBSTFILES += $(nodist_man_MANS) + +if BUILD_LINUX +# The man pager in most Linux distros defaults to BSD instead of Linux +# when .Os is blank, but leaving it blank makes things a lot easier on +# FreeBSD when OpenZFS is vendored in the base system. +install-data-hook: + cd $(DESTDIR)$(mandir)/man8; \ + $(SED) ${ac_inplace} -e 's/^\.Os$$/.Os Linux/' \ + $(dist_man_MANS) $(nodist_man_MANS) +endif diff --git a/man/man8/fsck.zfs.8 b/man/man8/fsck.zfs.8 new file mode 100644 index 000000000000..80e7a1ebb35f --- /dev/null +++ b/man/man8/fsck.zfs.8 @@ -0,0 +1,67 @@ +'\" t +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright 2013 Darik Horn . All rights reserved. 
+.\" +.TH fsck.zfs 8 "2013 MAR 16" "ZFS on Linux" "System Administration Commands" + +.SH NAME +fsck.zfs \- Dummy ZFS filesystem checker. + +.SH SYNOPSIS +.LP +.BI "fsck.zfs [" "options" "] <" "dataset" ">" + +.SH DESCRIPTION +.LP +\fBfsck.zfs\fR is a shell stub that does nothing and always returns +true. It is installed by ZoL because some Linux distributions expect +a fsck helper for all filesystems. + +.SH OPTIONS +.HP +All \fIoptions\fR and the \fIdataset\fR are ignored. + +.SH "NOTES" +.LP +ZFS datasets are checked by running \fBzpool scrub\fR on the +containing pool. An individual ZFS dataset is never checked +independently of its pool, which is unlike a regular filesystem. + +.SH "BUGS" +.LP +On some systems, if the \fIdataset\fR is in a degraded pool, then it +might be appropriate for \fBfsck.zfs\fR to return exit code 4 to +indicate an uncorrected filesystem error. +.LP +Similarly, if the \fIdataset\fR is in a faulted pool and has a legacy +/etc/fstab record, then \fBfsck.zfs\fR should return exit code 8 to +indicate a fatal operational error. + +.SH "AUTHORS" +.LP +Darik Horn . + +.SH "SEE ALSO" +.BR fsck (8), +.BR fstab (5), +.BR zpool-scrub (8) diff --git a/man/man8/mount.zfs.8 b/man/man8/mount.zfs.8 new file mode 100644 index 000000000000..4b71367e23e9 --- /dev/null +++ b/man/man8/mount.zfs.8 @@ -0,0 +1,144 @@ +'\" t +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright 2013 Darik Horn . All rights reserved. +.\" +.TH mount.zfs 8 "2013 FEB 28" "ZFS on Linux" "System Administration Commands" + +.SH NAME +mount.zfs \- mount a ZFS filesystem +.SH SYNOPSIS +.LP +.BI "mount.zfs [\-sfnvh] [\-o " options "]" " dataset mountpoint + +.SH DESCRIPTION +.BR mount.zfs +is part of the zfsutils package for Linux. It is a helper program that +is usually invoked by the +.BR mount (8) +or +.BR zfs (8) +commands to mount a ZFS dataset. + +All +.I options +are handled according to the FILESYSTEM INDEPENDENT MOUNT OPTIONS +section in the +.BR mount (8) +manual, except for those described below. + +The +.I dataset +parameter is a ZFS filesystem name, as output by the +.B "zfs list -H -o name +command. This parameter never has a leading slash character and is +not a device name. + +The +.I mountpoint +parameter is the path name of a directory. + + +.SH OPTIONS +.TP +.BI "\-s" +Ignore bad or sloppy mount options. +.TP +.BI "\-f" +Do a fake mount; do not perform the mount operation. +.TP +.BI "\-n" +Do not update the /etc/mtab file. +.TP +.BI "\-v" +Increase verbosity. +.TP +.BI "\-h" +Print the usage message. +.TP +.BI "\-o context" +This flag sets the SELinux context for all files in the filesystem +under that mountpoint. 
+.TP
+.BI "\-o fscontext"
+This flag sets the SELinux context for the filesystem being mounted.
+.TP
+.BI "\-o defcontext"
+This flag sets the SELinux context for unlabeled files.
+.TP
+.BI "\-o rootcontext"
+This flag sets the SELinux context for the root inode of the filesystem.
+.TP
+.BI "\-o legacy"
+This private flag indicates that the
+.I dataset
+has an entry in the /etc/fstab file.
+.TP
+.BI "\-o noxattr"
+This private flag disables extended attributes.
+.TP
+.BI "\-o xattr"
+This private flag enables directory-based extended attributes and, if
+appropriate, adds a ZFS context to the SELinux system policy.
+.TP
+.BI "\-o saxattr"
+This private flag enables system attribute-based extended attributes and, if
+appropriate, adds a ZFS context to the SELinux system policy.
+.TP
+.BI "\-o dirxattr"
+Equivalent to
+.BR xattr .
+.TP
+.BI "\-o zfsutil"
+This private flag indicates that
+.BR mount (8)
+is being called by the
+.BR zfs (8)
+command.
+
+.SH NOTES
+ZFS conventionally requires that the
+.I mountpoint
+be an empty directory, but the Linux implementation inconsistently
+enforces the requirement.
+
+The
+.BR mount.zfs
+helper does not mount the contents of zvols.
+
+.SH FILES
+.TP 18n
+.I /etc/fstab
+The static filesystem table.
+.TP
+.I /etc/mtab
+The mounted filesystem table.
+.SH "AUTHORS"
+The primary author of
+.BR mount.zfs
+is Brian Behlendorf .
+
+This man page was written by Darik Horn .
+.SH "SEE ALSO"
+.BR fstab (5),
+.BR mount (8),
+.BR zfs (8)
diff --git a/man/man8/vdev_id.8 b/man/man8/vdev_id.8
new file mode 100644
index 000000000000..70956c634f03
--- /dev/null
+++ b/man/man8/vdev_id.8
@@ -0,0 +1,77 @@
+.TH vdev_id 8
+.SH NAME
+vdev_id \- generate user-friendly names for JBOD disks
+.SH SYNOPSIS
+.LP
+.nf
+\fBvdev_id\fR <-d dev> [-c config_file] [-g sas_direct|sas_switch]
+	[-m] [-p phys_per_port]
+\fBvdev_id\fR -h
+.fi
+.SH DESCRIPTION
+The \fBvdev_id\fR command is a udev helper which parses the file
+.BR /etc/zfs/vdev_id.conf (5)
+to map a physical path in a storage topology to a channel name. The
+channel name is combined with a disk enclosure slot number to create an
+alias that reflects the physical location of the drive. This is
+particularly helpful when replacing failed drives. Slot numbers may also
+be re-mapped in case the default numbering is unsatisfactory. The drive
+aliases will be created as symbolic links in /dev/disk/by-vdev.
+
+The currently supported topologies are sas_direct and sas_switch. A
+multipath mode is supported in which dm-mpath devices are handled by
+examining the first-listed running component disk as reported by the
+.BR multipath (8)
+command. In multipath mode the configuration file should contain a
+channel definition with the same name for each path to a given
+enclosure.
+
+.BR vdev_id
+also supports creating aliases based on existing udev links in the /dev
+hierarchy using the \fIalias\fR configuration file keyword. See the
+.BR vdev_id.conf (5)
+man page for details.
+
+.SH OPTIONS
+.TP
+\fB\-c\fR
+Specifies the path to an alternate configuration file. The default is
+/etc/zfs/vdev_id.conf.
+.TP
+\fB\-d\fR
+This is the only mandatory argument. Specifies the name of a device
+in /dev, e.g. "sda".
+.TP
+\fB\-g\fR
+Identifies a physical topology that governs how physical paths are
+mapped to channels.
+
+\fIsas_direct\fR - in this mode a channel is uniquely identified by
+a PCI slot and an HBA port number.
+
+\fIsas_switch\fR - in this mode a channel is uniquely identified by
+a SAS switch port number.
+.TP
+\fB\-m\fR
+Specifies that
+.BR vdev_id (8)
+will handle only dm-multipath devices. It will examine the first
+running component disk of a dm-multipath device as listed by the
+.BR multipath (8)
+command to determine the physical path.
+.TP
+\fB\-p\fR
+Specifies the number of PHY devices associated with a SAS HBA port or SAS
+switch port.
+.BR vdev_id (8)
+internally uses this value to determine which HBA or switch port a
+device is connected to. The default is 4.
+.TP
+\fB\-h\fR
+Print a usage summary.
+.SH SEE ALSO
+.LP
+\fBvdev_id.conf\fR(5)
diff --git a/man/man8/zdb.8 b/man/man8/zdb.8
new file mode 100644
index 000000000000..79e42d29ffc1
--- /dev/null
+++ b/man/man8/zdb.8
@@ -0,0 +1,478 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2012, Richard Lowe.
+.\" Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Lawrence Livermore National Security, LLC.
+.\" Copyright (c) 2017 Intel Corporation.
+.\"
+.Dd April 14, 2019
+.Dt ZDB 8 SMM
+.Os
+.Sh NAME
+.Nm zdb
+.Nd display zpool debugging and consistency information
+.Sh SYNOPSIS
+.Nm
+.Op Fl AbcdDFGhikLMPsvXYy
+.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
+.Op Fl I Ar inflight I/Os
+.Oo Fl o Ar var Ns = Ns Ar value Oc Ns ...
+.Op Fl t Ar txg
+.Op Fl U Ar cache
+.Op Fl x Ar dumpdir
+.Op Ar poolname[/dataset | objset ID]
+.Op Ar object | range ...
+.Nm
+.Op Fl AdiPv
+.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
+.Op Fl U Ar cache
+.Ar poolname[/dataset | objset ID] Op Ar object | range ...
+.Nm
+.Fl C
+.Op Fl A
+.Op Fl U Ar cache
+.Nm
+.Fl E
+.Op Fl A
+.Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15
+.Nm
+.Fl l
+.Op Fl Aqu
+.Ar device
+.Nm
+.Fl m
+.Op Fl AFLPXY
+.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
+.Op Fl t Ar txg
+.Op Fl U Ar cache
+.Ar poolname Op Ar vdev Op Ar metaslab ...
+.Nm
+.Fl O
+.Ar dataset path
+.Nm
+.Fl R
+.Op Fl A
+.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
+.Op Fl U Ar cache
+.Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar [/] Ns Op : Ns Ar flags
+.Nm
+.Fl S
+.Op Fl AP
+.Op Fl e Oo Fl V Oc Op Fl p Ar path ...
+.Op Fl U Ar cache
+.Ar poolname
+.Sh DESCRIPTION
+The
+.Nm
+utility displays information about a ZFS pool useful for debugging and performs
+some amount of consistency checking.
+It is not a general purpose tool and options
+.Pq and facilities
+may change.
+This is not a
+.Xr fsck 8
+utility.
+.Pp
+The output of this command in general reflects the on-disk structure of a ZFS
+pool, and is inherently unstable.
+The precise output of most invocations is not documented; a knowledge of ZFS
+internals is assumed.
+.Pp
+If the
+.Ar dataset
+argument does not contain any
+.Qq Sy /
+or
+.Qq Sy @
+characters, it is interpreted as a pool name.
+The root dataset can be specified as
+.Ar pool Ns /
+.Pq pool name followed by a slash .
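+.Pp
+For example
+.Pq pool and dataset names are illustrative ,
+.Ql zdb -d rpool
+is interpreted as a pool name,
+.Ql zdb -d rpool/
+names the root dataset, and
+.Ql zdb -d rpool/export/home
+names a descendent file system.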
+.Pp
+When operating on an imported and active pool, it is possible, though unlikely,
+that zdb may interpret inconsistent pool data and behave erratically.
+.Sh OPTIONS
+Display options:
+.Bl -tag -width Ds
+.It Fl b
+Display statistics regarding the number, size
+.Pq logical, physical and allocated
+and deduplication of blocks.
+.It Fl c
+Verify the checksum of all metadata blocks while printing block statistics
+.Po see
+.Fl b
+.Pc .
+.Pp
+If specified multiple times, verify the checksums of all blocks.
+.It Fl C
+Display information about the configuration.
+If specified with no other options, instead display information about the cache
+file
+.Pq Pa /etc/zfs/zpool.cache .
+To specify the cache file to display, see
+.Fl U .
+.Pp
+If specified multiple times, and a pool name is also specified, display both
+the cached configuration and the on-disk configuration.
+If specified multiple times with
+.Fl e
+also display the configuration that would be used were the pool to be imported.
+.It Fl d
+Display information about datasets.
+Specified once, displays basic dataset information: ID, create transaction,
+size, and object count.
+.Pp
+If specified multiple times, provides greater and greater verbosity.
+.Pp
+If object IDs or object ID ranges are specified, display information about
+those specific objects or ranges only.
+.Pp
+An object ID range is specified in terms of a colon-separated tuple of
+the form
+.Ao start Ac Ns : Ns Ao end Ac Ns Op Ns : Ns Ao flags Ac Ns .
+The fields
+.Ar start
+and
+.Ar end
+are integer object identifiers that denote the lower and upper bounds
+of the range. An
+.Ar end
+value of -1 specifies a range with no upper bound. The
+.Ar flags
+field optionally specifies a set of flags, described below, that control
+which object types are dumped. By default, all object types are dumped. A minus
+sign
+.Pq -
+negates the effect of the flag that follows it and has no effect unless
+preceded by the
+.Ar A
+flag. For example, the range 0:-1:A-d will dump all object types except
+for directories.
+.Pp
+.Bl -tag -compact
+.It Sy A
+Dump all objects (this is the default)
+.It Sy d
+Dump ZFS directory objects
+.It Sy f
+Dump ZFS plain file objects
+.It Sy m
+Dump SPA space map objects
+.It Sy z
+Dump ZAP objects
+.It Sy -
+Negate the effect of next flag
+.El
+.It Fl D
+Display deduplication statistics, including the deduplication ratio
+.Pq Sy dedup ,
+compression ratio
+.Pq Sy compress ,
+inflation due to the zfs copies property
+.Pq Sy copies ,
+and an overall effective ratio
+.Pq Sy dedup No * Sy compress No / Sy copies .
+.It Fl DD
+Display a histogram of deduplication statistics, showing the allocated
+.Pq physically present on disk
+and referenced
+.Pq logically referenced in the pool
+block counts and sizes by reference count.
+.It Fl DDD
+Display the statistics independently for each deduplication table.
+.It Fl DDDD
+Dump the contents of the deduplication tables describing duplicate blocks.
+.It Fl DDDDD
+Also dump the contents of the deduplication tables describing unique blocks.
+.It Fl E Ar word0 Ns \&: Ns Ar word1 Ns :...: Ns Ar word15
+Decode and display a block from an embedded block pointer specified by the
+.Ar word
+arguments.
+.It Fl h
+Display pool history similar to
+.Nm zpool Cm history ,
+but include internal changes, transaction, and dataset information.
+.It Fl i
+Display information about intent log
+.Pq ZIL
+entries relating to each dataset.
+If specified multiple times, display counts of each intent log transaction type.
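+.Pp
+For example,
+.Ql zdb -ii rpool
+.Pq pool name is illustrative
+would additionally display per-type counts of intent log transactions for
+each dataset.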
+.It Fl k
+Examine the checkpointed state of the pool.
+Note that the on-disk format of the pool is not reverted to the checkpointed
+state.
+.It Fl l Ar device
+Read the vdev labels and L2ARC header from the specified device.
+.Nm Fl l
+will return 0 if a valid label was found, 1 if an error occurred, and 2 if no
+valid labels were found. The presence of an L2ARC header is indicated by a
+specific sequence (L2ARC_DEV_HDR_MAGIC). If there is an accounting error in
+the size or the number of L2ARC log blocks
+.Nm Fl l
+will return 1. Each unique configuration is displayed only
+once.
+.It Fl ll Ar device
+In addition, display label space usage stats. If a valid L2ARC header was
+found, also display the properties of log blocks used for restoring L2ARC
+contents (persistent L2ARC).
+.It Fl lll Ar device
+Display every configuration, unique or not. If a valid L2ARC header was
+found, also display the properties of log entries in log blocks used for
+restoring L2ARC contents (persistent L2ARC).
+.Pp
+If the
+.Fl q
+option is also specified, don't print the labels or the L2ARC header.
+.Pp
+If the
+.Fl u
+option is also specified, also display the uberblocks on this device. Specify
+multiple times to increase verbosity.
+.It Fl L
+Disable leak detection and the loading of space maps.
+By default,
+.Nm
+verifies that all non-free blocks are referenced, which can be very expensive.
+.It Fl m
+Display the offset, spacemap, and free space of each metaslab, as well as all
+the log spacemaps and their obsolete entry statistics.
+.It Fl mm
+Also display information about the on-disk free space histogram associated with
+each metaslab.
+.It Fl mmm
+Display the maximum contiguous free space, the in-core free space histogram, and
+the percentage of free space in each space map.
+.It Fl mmmm
+Display every spacemap record.
+.It Fl M
+Display the offset, spacemap, and free space of each metaslab.
+.It Fl MM
+Also display information about the maximum contiguous free space and the
+percentage of free space in each space map.
+.It Fl MMM
+Display every spacemap record.
+.It Fl O Ar dataset path
+Look up the specified
+.Ar path
+inside of the
+.Ar dataset
+and display its metadata and indirect blocks.
+Specified
+.Ar path
+must be relative to the root of
+.Ar dataset .
+This option can be combined with
+.Fl v
+for increasing verbosity.
+.It Xo
+.Fl R Ar poolname vdev Ns \&: Ns Ar offset Ns \&: Ns Ar [/] Ns Op : Ns Ar flags
+.Xc
+Read and display a block from the specified device.
+By default the block is displayed as a hex dump, but see the description of the
+.Sy r
+flag, below.
+.Pp
+The block is specified in terms of a colon-separated tuple
+.Ar vdev
+.Pq an integer vdev identifier
+.Ar offset
+.Pq the offset within the vdev
+.Ar size
+.Pq the physical size, or logical size / physical size
+of the block to read and, optionally,
+.Ar flags
+.Pq a set of flags, described below .
+.Pp
+.Bl -tag -compact -width "b offset"
+.It Sy b Ar offset
+Print block pointer at hex offset
+.It Sy c
+Calculate and display checksums
+.It Sy d
+Decompress the block. Set environment variable
+.Ev ZDB_NO_ZLE
+to skip zle when guessing.
+.It Sy e
+Byte swap the block
+.It Sy g
+Dump gang block header
+.It Sy i
+Dump indirect block
+.It Sy r
+Dump raw uninterpreted block data
+.It Sy v
+Verbose output for guessing compression algorithm
+.El
+.It Fl s
+Report statistics on
+.Nm zdb
+I/O.
+Display operation counts, bandwidth, and error counts of I/O to the pool from
+.Nm .
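+.Pp
+For example,
+.Ql zdb -b -s rpool
+.Pq pool name is illustrative
+would report block statistics for the pool together with the operation
+counts and bandwidth of the I/O that
+.Nm
+itself performed.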
+.It Fl S
+Simulate the effects of deduplication, constructing a DDT and then displaying
+that DDT as with
+.Fl DD .
+.It Fl u
+Display the current uberblock.
+.El
+.Pp
+Other options:
+.Bl -tag -width Ds
+.It Fl A
+Do not abort should any assertion fail.
+.It Fl AA
+Enable panic recovery; certain errors which would otherwise be fatal are
+demoted to warnings.
+.It Fl AAA
+Do not abort if asserts fail and also enable panic recovery.
+.It Fl e Op Fl p Ar path ...
+Operate on an exported pool, not present in
+.Pa /etc/zfs/zpool.cache .
+The
+.Fl p
+flag specifies the path under which devices are to be searched.
+.It Fl x Ar dumpdir
+All blocks accessed will be copied to files in the specified directory.
+The blocks will be placed in sparse files whose name is the same as
+that of the file or device read.
+.Nm
+can then be run on the generated files.
+Note that the
+.Fl bbc
+flags are sufficient to access
+.Pq and thus copy
+all metadata on the pool.
+.It Fl F
+Attempt to make an unreadable pool readable by trying progressively older
+transactions.
+.It Fl G
+Dump the contents of the zfs_dbgmsg buffer before exiting
+.Nm .
+zfs_dbgmsg is a buffer used by ZFS to dump advanced debug information.
+.It Fl I Ar inflight I/Os
+Limit the number of outstanding checksum I/Os to the specified value.
+The default value is 200.
+This option affects the performance of the
+.Fl c
+option.
+.It Fl o Ar var Ns = Ns Ar value ...
+Set the given global libzpool variable to the provided value.
+The value must be an unsigned 32-bit integer.
+Currently only little-endian systems are supported to avoid accidentally setting
+the high 32 bits of 64-bit variables.
+.It Fl P
+Print numbers in an unscaled form more amenable to parsing, e.g. 1000000 rather
+than 1M.
+.It Fl t Ar transaction
+Specify the highest transaction to use when searching for uberblocks.
+See also the
+.Fl u
+and
+.Fl l
+options for a means to see the available uberblocks and their associated
+transaction numbers.
+.It Fl U Ar cachefile
+Use a cache file other than
+.Pa /etc/zfs/zpool.cache .
+.It Fl v
+Enable verbosity.
+Specify multiple times for increased verbosity.
+.It Fl V
+Attempt verbatim import.
+This mimics the behavior of the kernel when loading a pool from a cachefile.
+Only usable with
+.Fl e .
+.It Fl X
+Attempt
+.Qq extreme
+transaction rewind, that is, attempt the same recovery as
+.Fl F
+but read transactions otherwise deemed too old.
+.It Fl Y
+Attempt all possible combinations when reconstructing indirect split blocks.
+This flag disables the individual I/O deadman timer in order to allow as
+much time as required for the attempted reconstruction.
+.It Fl y
+Perform validation for livelists that are being deleted.
+Scans through the livelist and metaslabs, checking for duplicate entries,
+and compares the two, checking for potential double frees.
+If it encounters issues, warnings will be printed, but the command will not
+necessarily fail.
+.El
+.Pp
+Specifying a display option more than once enables verbosity for only that
+option, with more occurrences enabling more verbosity.
+.Pp
+If no options are specified, all information about the named pool will be
+displayed at default verbosity.
+.Sh EXAMPLES
+.Bl -tag -width Ds
+.It Xo
+.Sy Example 1
+Display the configuration of imported pool
+.Pa rpool
+.Xc
+.Bd -literal
+# zdb -C rpool
+
+MOS Configuration:
+    version: 28
+    name: 'rpool'
+    ...
+.Ed
+.It Xo
+.Sy Example 2
+Display basic dataset information about
+.Pa rpool
+.Xc
+.Bd -literal
+# zdb -d rpool
+Dataset mos [META], ID 0, cr_txg 4, 26.9M, 1051 objects
+Dataset rpool/swap [ZVOL], ID 59, cr_txg 356, 486M, 2 objects
+    ...
+.Ed
+.It Xo
+.Sy Example 3
+Display basic information about object 0 in
+.Pa rpool/export/home
+.Xc
+.Bd -literal
+# zdb -d rpool/export/home 0
+Dataset rpool/export/home [ZPL], ID 137, cr_txg 1546, 32K, 8 objects
+
+    Object lvl iblk dblk dsize lsize %full type
+         0 7 16K 16K 15.0K 16K 25.00 DMU dnode
+.Ed
+.It Xo
+.Sy Example 4
+Display the predicted effect of enabling deduplication on
+.Pa rpool
+.Xc
+.Bd -literal
+# zdb -S rpool
+Simulated DDT histogram:
+
+bucket allocated referenced
+______ ______________________________ ______________________________
+refcnt blocks LSIZE PSIZE DSIZE blocks LSIZE PSIZE DSIZE
+------ ------ ----- ----- ----- ------ ----- ----- -----
+ 1 694K 27.1G 15.0G 15.0G 694K 27.1G 15.0G 15.0G
+ 2 35.0K 1.33G 699M 699M 74.7K 2.79G 1.45G 1.45G
+ ...
+dedup = 1.11, compress = 1.80, copies = 1.00, dedup * compress / copies = 2.00
+.Ed
+.El
+.Sh SEE ALSO
+.Xr zfs 8 ,
+.Xr zpool 8
diff --git a/man/man8/zed.8.in b/man/man8/zed.8.in
new file mode 100644
index 000000000000..2ca3935724d7
--- /dev/null
+++ b/man/man8/zed.8.in
@@ -0,0 +1,268 @@
+.\"
+.\" This file is part of the ZFS Event Daemon (ZED)
+.\" for ZFS on Linux (ZoL) .
+.\" Developed at Lawrence Livermore National Laboratory (LLNL-CODE-403049).
+.\" Copyright (C) 2013-2014 Lawrence Livermore National Security, LLC.
+.\" Refer to the ZoL git commit log for authoritative copyright attribution.
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License Version 1.0 (CDDL-1.0).
+.\" You can obtain a copy of the license from the top-level file
+.\" "OPENSOLARIS.LICENSE" or at .
+.\" You may not use this file except in compliance with the license.
+.\"
+.TH ZED 8 "October 1, 2013" "ZFS on Linux" "System Administration Commands"
+
+.SH NAME
+ZED \- ZFS Event Daemon
+
+.SH SYNOPSIS
+.HP
+.B zed
+.\" [\fB\-c\fR \fIconfigfile\fR]
+[\fB\-d\fR \fIzedletdir\fR]
+[\fB\-f\fR]
+[\fB\-F\fR]
+[\fB\-h\fR]
+[\fB\-I\fR]
+[\fB\-L\fR]
+[\fB\-M\fR]
+[\fB\-p\fR \fIpidfile\fR]
+[\fB\-P\fR \fIpath\fR]
+[\fB\-s\fR \fIstatefile\fR]
+[\fB\-v\fR]
+[\fB\-V\fR]
+[\fB\-Z\fR]
+
+.SH DESCRIPTION
+.PP
+\fBZED\fR (ZFS Event Daemon) monitors events generated by the ZFS kernel
+module. When a zevent (ZFS Event) is posted, \fBZED\fR will run any ZEDLETs
+(ZFS Event Daemon Linkage for Executable Tasks) that have been enabled for the
+corresponding zevent class.
+
+.SH OPTIONS
+.TP
+.BI \-h
+Display a summary of the command-line options.
+.TP
+.BI \-L
+Display license information.
+.TP
+.BI \-V
+Display version information.
+.TP
+.BI \-v
+Be verbose.
+.TP
+.BI \-f
+Force the daemon to run if at all possible, disabling security checks and
+throwing caution to the wind. Not recommended for use in production.
+.TP
+.BI \-F
+Run the daemon in the foreground.
+.TP
+.BI \-M
+Lock all current and future pages in the virtual memory address space.
+This may help the daemon remain responsive when the system is under heavy
+memory pressure.
+.TP
+.BI \-I
+Request that the daemon idle rather than exit when the kernel modules are
+not loaded. Processing of events will start, or resume, when the kernel
+modules are (re)loaded. Under Linux the kernel modules cannot be unloaded
+while the daemon is running.
+.TP
+.BI \-Z
+Zero the daemon's state, thereby allowing zevents still within the kernel
+to be reprocessed.
+.\" .TP
+.\" .BI \-c\ configfile
+.\" Read the configuration from the specified file.
+.TP
+.BI \-d\ zedletdir
+Read the enabled ZEDLETs from the specified directory.
+.TP
+.BI \-p\ pidfile
+Write the daemon's process ID to the specified file.
+.TP
+.BI \-P\ path
+Custom $PATH for ZEDLETs to use. Normally ZEDLETs run in a locked-down
+environment, with hardcoded paths to the ZFS commands ($ZFS, $ZPOOL, $ZED, ...),
+and a hardcoded $PATH. This is done for security reasons. However, the
+ZFS test suite uses a custom PATH for its ZFS commands, and passes it to zed
+with -P. In short, -P is only to be used by the ZFS test suite; never use
+it in production!
+.TP
+.BI \-s\ statefile
+Write the daemon's state to the specified file.
+.SH ZEVENTS
+.PP
+A zevent is composed of a list of nvpairs (name/value pairs). Each zevent
+contains an EID (Event IDentifier) that uniquely identifies it throughout
+the lifetime of the loaded ZFS kernel module; this EID is a monotonically
+increasing integer that resets to 1 each time the kernel module is loaded.
+Each zevent also contains a class string that identifies the type of event.
+For brevity, a subclass string is defined that omits the leading components
+of the class string. Additional nvpairs exist to provide event details.
+.PP
+The kernel maintains a list of recent zevents that can be viewed (along with
+their associated lists of nvpairs) using the "\fBzpool events \-v\fR" command.
+
+.SH CONFIGURATION
+.PP
+ZEDLETs to be invoked in response to zevents are located in the
+\fIenabled-zedlets\fR directory. These can be symlinked or copied from the
+\fIinstalled-zedlets\fR directory; symlinks allow for automatic updates
+from the installed ZEDLETs, whereas copies preserve local modifications.
+As a security measure, ZEDLETs must be owned by root. They must have
+execute permissions for the user, but they must not have write permissions
+for group or other. Dotfiles are ignored.
+.PP
+ZEDLETs are named after the zevent class for which they should be invoked.
+In particular, a ZEDLET will be invoked for a given zevent if either its
+class or subclass string is a prefix of its filename (and is followed by
+a non-alphabetic character). As a special case, the prefix "all" matches
+all zevents. Multiple ZEDLETs may be invoked for a given zevent.
+
+.SH ZEDLETS
+.PP
+ZEDLETs are executables invoked by the ZED in response to a given zevent.
+They should be written under the presumption they can be invoked concurrently,
+and they should use appropriate locking to access any shared resources.
+Common variables used by ZEDLETs can be stored in the default rc file which
+is sourced by scripts; these variables should be prefixed with "ZED_".
+.PP
+The zevent nvpairs are passed to ZEDLETs as environment variables.
+Each nvpair name is converted to an environment variable in the following
+manner: 1) it is prefixed with "ZEVENT_", 2) it is converted to uppercase,
+and 3) each non-alphanumeric character is converted to an underscore.
+Some additional environment variables have been defined to present certain
+nvpair values in a more convenient form. An incomplete list of zevent
+environment variables is as follows:
+.TP
+.B
+ZEVENT_EID
+The Event IDentifier.
+.TP
+.B
+ZEVENT_CLASS
+The zevent class string.
+.TP
+.B
+ZEVENT_SUBCLASS
+The zevent subclass string.
+.TP
+.B
+ZEVENT_TIME
+The time at which the zevent was posted as
+"\fIseconds\fR\ \fInanoseconds\fR" since the Epoch.
+.TP
+.B
+ZEVENT_TIME_SECS
+The \fIseconds\fR component of ZEVENT_TIME.
+.TP
+.B
+ZEVENT_TIME_NSECS
+The \fInanoseconds\fR component of ZEVENT_TIME.
+.TP
+.B
+ZEVENT_TIME_STRING
+An almost-RFC3339-compliant string for ZEVENT_TIME.
+.PP
+Additionally, the following ZED & ZFS variables are defined:
+.TP
+.B
+ZED_PID
+The daemon's process ID.
+.TP
+.B
+ZED_ZEDLET_DIR
+The daemon's current \fIenabled-zedlets\fR directory.
+.TP
+.B
+ZFS_ALIAS
+The ZFS alias (\fIname-version-release\fR) string used to build the daemon.
+.TP
+.B
+ZFS_VERSION
+The ZFS version used to build the daemon.
+.TP
+.B
+ZFS_RELEASE
+The ZFS release used to build the daemon.
+.PP
+ZEDLETs may need to call other ZFS commands. The installation paths of
+the following executables are defined: \fBZDB\fR, \fBZED\fR, \fBZFS\fR,
+\fBZINJECT\fR, and \fBZPOOL\fR. These variables can be overridden in the
+rc file if needed.
+
+.SH FILES
+.\" .TP
+.\" @sysconfdir@/zfs/zed.conf
+.\" The default configuration file for the daemon.
+.TP
+.I @sysconfdir@/zfs/zed.d
+The default directory for enabled ZEDLETs.
+.TP
+.I @sysconfdir@/zfs/zed.d/zed.rc
+The default rc file for common variables used by ZEDLETs.
+.TP
+.I @zfsexecdir@/zed.d
+The default directory for installed ZEDLETs.
+.TP
+.I @runstatedir@/zed.pid
+The default file containing the daemon's process ID.
+.TP
+.I @runstatedir@/zed.state
+The default file containing the daemon's state.
+
+.SH SIGNALS
+.TP
+.B HUP
+Reconfigure the daemon and rescan the directory for enabled ZEDLETs.
+.TP
+.B TERM
+Terminate the daemon.
+
+.SH NOTES
+.PP
+\fBZED\fR requires root privileges.
+.\" Do not taunt zed.
+
+.SH BUGS
+.PP
+Events are processed synchronously by a single thread. This can delay the
+processing of simultaneous zevents.
+.PP
+There is no maximum timeout for ZEDLET execution. Consequently, a misbehaving
+ZEDLET can delay the processing of subsequent zevents.
+.PP
+The ownership and permissions of the \fIenabled-zedlets\fR directory (along
+with all parent directories) are not checked. If any of these directories
+are improperly owned or permissioned, an unprivileged user could insert a
+ZEDLET to be executed as root. The requirement that ZEDLETs be owned by
+root mitigates this to some extent.
+.PP
+ZEDLETs are unable to return state/status information to the kernel.
+.PP
+Some zevent nvpair types are not handled. These are denoted by zevent
+environment variables having a "_NOT_IMPLEMENTED_" value.
+.PP
+Internationalization support via gettext has not been added.
+.PP
+The configuration file is not yet implemented.
+.PP
+The diagnosis engine is not yet implemented.
+
+.SH LICENSE
+.PP
+\fBZED\fR (ZFS Event Daemon) is distributed under the terms of the
+Common Development and Distribution License Version 1.0 (CDDL\-1.0).
+.PP
+Developed at Lawrence Livermore National Laboratory (LLNL\-CODE\-403049).
+
+.SH SEE ALSO
+.BR zfs (8),
+.BR zpool (8),
+.BR zpool-events (8)
diff --git a/man/man8/zfs-allow.8 b/man/man8/zfs-allow.8
new file mode 100644
index 000000000000..f32b29a72661
--- /dev/null
+++ b/man/man8/zfs-allow.8
@@ -0,0 +1,372 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-ALLOW 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm allow +.Nd Delegates ZFS administration permission for the file systems to non-privileged users. +.Sh SYNOPSIS +.Nm +.Cm allow +.Op Fl dglu +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm allow +.Op Fl dl +.Fl e Ns | Ns Sy everyone +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm allow +.Fl c +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm allow +.Fl s No @ Ns Ar setname +.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm unallow +.Op Fl dglru +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... Oc +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm unallow +.Op Fl dlr +.Fl e Ns | Ns Sy everyone +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... Oc +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm unallow +.Op Fl r +.Fl c +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... Oc +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm unallow +.Op Fl r +.Fl s No @ Ns Ar setname +.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns +.Ar setname Oc Ns ... Oc +.Ar filesystem Ns | Ns Ar volume +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm allow +.Ar filesystem Ns | Ns Ar volume +.Xc +Displays permissions that have been delegated on the specified filesystem or +volume. +See the other forms of +.Nm zfs Cm allow +for more information. +.Pp +Delegations are supported under Linux with the exception of +.Sy mount , +.Sy unmount , +.Sy mountpoint , +.Sy canmount , +.Sy rename , +and +.Sy share . +These permissions cannot be delegated because the Linux +.Xr mount 8 +command restricts modifications of the global namespace to the root user. +.It Xo +.Nm +.Cm allow +.Op Fl dglu +.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ... 
+.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ...
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+.It Xo
+.Nm
+.Cm allow
+.Op Fl dl
+.Fl e Ns | Ns Sy everyone
+.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ...
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+Delegates ZFS administration permission for the file systems to non-privileged
+users.
+.Bl -tag -width "-d"
+.It Fl d
+Allow only for the descendent file systems.
+.It Fl e Ns | Ns Sy everyone
+Specifies that the permissions be delegated to everyone.
+.It Fl g Ar group Ns Oo , Ns Ar group Oc Ns ...
+Explicitly specify that permissions are delegated to the group.
+.It Fl l
+Allow
+.Qq locally
+only for the specified file system.
+.It Fl u Ar user Ns Oo , Ns Ar user Oc Ns ...
+Explicitly specify that permissions are delegated to the user.
+.It Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ...
+Specifies to whom the permissions are delegated.
+Multiple entities can be specified as a comma-separated list.
+If neither of the
+.Fl gu
+options are specified, then the argument is interpreted preferentially as the
+keyword
+.Sy everyone ,
+then as a user name, and lastly as a group name.
+To specify a user or group named
+.Qq everyone ,
+use the
+.Fl g
+or
+.Fl u
+options.
+To specify a group with the same name as a user, use the
+.Fl g
+option.
+.It Xo
+.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ...
+.Xc
+The permissions to delegate.
+Multiple permissions may be specified as a comma-separated list.
+Permission names are the same as ZFS subcommand and property names.
+See the property list below.
+Property set names, which begin with
+.Sy @ ,
+may be specified.
+See the
+.Fl s
+form below for details.
+.El
+.Pp
+If neither of the
+.Fl dl
+options are specified, or both are, then the permissions are allowed for the
+file system or volume, and all of its descendents.
+.Pp
+Permissions are generally the ability to use a ZFS subcommand or change a ZFS
+property.
+The following permissions are available:
+.Bd -literal
+NAME TYPE NOTES
+allow subcommand Must also have the permission that is
+ being allowed
+clone subcommand Must also have the 'create' ability and
+ 'mount' ability in the origin file system
+create subcommand Must also have the 'mount' ability.
+ Must also have the 'refreservation' ability to
+ create a non-sparse volume.
+destroy subcommand Must also have the 'mount' ability
+diff subcommand Allows lookup of paths within a dataset
+ given an object number, and the ability
+ to create snapshots necessary to
+ 'zfs diff'.
+load-key subcommand Allows loading and unloading of encryption key
+ (see 'zfs load-key' and 'zfs unload-key').
+change-key subcommand Allows changing an encryption key via
+ 'zfs change-key'.
+mount subcommand Allows mount/umount of ZFS datasets
+promote subcommand Must also have the 'mount' and 'promote'
+ ability in the origin file system
+receive subcommand Must also have the 'mount' and 'create'
+ ability
+rename subcommand Must also have the 'mount' and 'create'
+ ability in the new parent
+rollback subcommand Must also have the 'mount' ability
+send subcommand
+share subcommand Allows sharing file systems over NFS
+ or SMB protocols
+snapshot subcommand Must also have the 'mount' ability
+
+groupquota other Allows accessing any groupquota@...
+ property
+groupused other Allows reading any groupused@...
property
+userprop other Allows changing any user property
+userquota other Allows accessing any userquota@...
+ property
+userused other Allows reading any userused@... property
+projectobjquota other Allows accessing any projectobjquota@...
+ property
+projectquota other Allows accessing any projectquota@... property
+projectobjused other Allows reading any projectobjused@... property
+projectused other Allows reading any projectused@... property
+
+aclinherit property
+acltype property
+atime property
+canmount property
+casesensitivity property
+checksum property
+compression property
+copies property
+devices property
+exec property
+filesystem_limit property
+mountpoint property
+nbmand property
+normalization property
+primarycache property
+quota property
+readonly property
+recordsize property
+refquota property
+refreservation property
+reservation property
+secondarycache property
+setuid property
+sharenfs property
+sharesmb property
+snapdir property
+snapshot_limit property
+utf8only property
+version property
+volblocksize property
+volsize property
+vscan property
+xattr property
+zoned property
+.Ed
+.It Xo
+.Nm
+.Cm allow
+.Fl c
+.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ...
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+Sets
+.Qq create time
+permissions.
+These permissions are granted
+.Pq locally
+to the creator of any newly-created descendent file system.
+.It Xo
+.Nm
+.Cm allow
+.Fl s No @ Ns Ar setname
+.Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ...
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+Defines or adds permissions to a permission set.
+The set can be used by other
+.Nm zfs Cm allow
+commands for the specified file system and its descendents.
+Sets are evaluated dynamically, so changes to a set are immediately reflected.
+Permission sets follow the same naming restrictions as ZFS file systems, but the
+name must begin with
+.Sy @ ,
+and can be no more than 64 characters long.
+.It Xo
+.Nm
+.Cm unallow
+.Op Fl dglru
+.Ar user Ns | Ns Ar group Ns Oo , Ns Ar user Ns | Ns Ar group Oc Ns ...
+.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+.It Xo
+.Nm
+.Cm unallow
+.Op Fl dlr
+.Fl e Ns | Ns Sy everyone
+.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+.It Xo
+.Nm
+.Cm unallow
+.Op Fl r
+.Fl c
+.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+Removes permissions that were granted with the
+.Nm zfs Cm allow
+command.
+No permissions are explicitly denied, so other permissions granted are still in
+effect; for example, the permission may still be granted by an ancestor.
+If no permissions are specified, then all permissions for the specified
+.Ar user ,
+.Ar group ,
+or
+.Sy everyone
+are removed.
+Specifying
+.Sy everyone
+.Po or using the
+.Fl e
+option
+.Pc
+only removes the permissions that were granted to everyone, not all permissions
+for every user and group.
+See the
+.Nm zfs Cm allow
+command for a description of the
+.Fl ldugec
+options.
+.Bl -tag -width "-r"
+.It Fl r
+Recursively remove the permissions from this file system and all descendents.
+.El
+.It Xo
+.Nm
+.Cm unallow
+.Op Fl r
+.Fl s No @ Ns Ar setname
+.Oo Ar perm Ns | Ns @ Ns Ar setname Ns Oo , Ns Ar perm Ns | Ns @ Ns
+.Ar setname Oc Ns ...
Oc
+.Ar filesystem Ns | Ns Ar volume
+.Xc
+Removes permissions from a permission set.
+If no permissions are specified, then all permissions are removed, thus removing
+the set entirely.
+.El
diff --git a/man/man8/zfs-bookmark.8 b/man/man8/zfs-bookmark.8
new file mode 100644
index 000000000000..3b2af815920d
--- /dev/null
+++ b/man/man8/zfs-bookmark.8
@@ -0,0 +1,69 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright 2011 Joshua M. Clulow
+.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\" Copyright (c) 2014 by Adam Stevko. All rights reserved.
+.\" Copyright (c) 2014 Integros [integros.com]
+.\" Copyright 2019 Richard Laager. All rights reserved.
+.\" Copyright 2018 Nexenta Systems, Inc.
+.\" Copyright 2019 Joyent, Inc.
+.\" Copyright (c) 2019, 2020 by Christian Schwarz. All Rights Reserved.
+.\"
+.Dd June 30, 2019
+.Dt ZFS-BOOKMARK 8 SMM
+.Os
+.Sh NAME
+.Nm zfs Ns Pf - Cm bookmark
+.Nd Creates a bookmark of the given snapshot.
+.Sh SYNOPSIS
+.Nm
+.Cm bookmark
+.Ar snapshot Ns | Ns Ar bookmark newbookmark
+.Sh DESCRIPTION
+.Bl -tag -width ""
+.It Xo
+.Nm
+.Cm bookmark
+.Ar snapshot Ns | Ns Ar bookmark newbookmark
+.Xc
+Creates a new bookmark of the given snapshot or bookmark.
+Bookmarks mark the point in time when the snapshot was created, and can be used
+as the incremental source for a
+.Xr zfs-send 8
+command.
+.Pp
+When creating a bookmark from an existing redaction bookmark, the resulting
+bookmark is
+.Sy not
+a redaction bookmark.
+.Pp
+This feature must be enabled to be used.
+See
+.Xr zpool-features 5
+for details on ZFS feature flags and the
+.Sy bookmarks
+feature.
+.El
+.Sh SEE ALSO
+.Xr zfs-destroy 8 ,
+.Xr zfs-send 8 ,
+.Xr zfs-snapshot 8
diff --git a/man/man8/zfs-change-key.8 b/man/man8/zfs-change-key.8
new file mode 120000
index 000000000000..d027a419d1e4
--- /dev/null
+++ b/man/man8/zfs-change-key.8
@@ -0,0 +1 @@
+zfs-load-key.8
\ No newline at end of file
diff --git a/man/man8/zfs-clone.8 b/man/man8/zfs-clone.8
new file mode 100644
index 000000000000..352a2392ff40
--- /dev/null
+++ b/man/man8/zfs-clone.8
@@ -0,0 +1,77 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-CLONE 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm clone +.Nd Creates a clone of the given snapshot. +.Sh SYNOPSIS +.Nm +.Cm clone +.Op Fl p +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Ar snapshot Ar filesystem Ns | Ns Ar volume +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm clone +.Op Fl p +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Ar snapshot Ar filesystem Ns | Ns Ar volume +.Xc +See the +.Em Clones +section of +.Xr zfsconcepts 8 +for details. +The target dataset can be located anywhere in the ZFS hierarchy, and is created +as the same type as the original. +.Bl -tag -width "-o" +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property; see +.Nm zfs Cm create +for details. +.It Fl p +Creates all the non-existing parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +If the target filesystem or volume already exists, the operation completes +successfully. +.El +.El +.Sh SEE ALSO +.Xr zfs-promote 8 , +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-create.8 b/man/man8/zfs-create.8 new file mode 100644 index 000000000000..c37d63305a4e --- /dev/null +++ b/man/man8/zfs-create.8 @@ -0,0 +1,244 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. 
+.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-CREATE 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm create +.Nd Creates a new ZFS file system. +.Sh SYNOPSIS +.Nm zfs +.Cm create +.Op Fl Pnpv +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Ar filesystem +.Nm +.Cm create +.Op Fl ps +.Op Fl b Ar blocksize +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Fl V Ar size Ar volume +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm create +.Op Fl Pnpv +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Ar filesystem +.Xc +Creates a new ZFS file system. +The file system is automatically mounted according to the +.Sy mountpoint +property inherited from the parent. +.Bl -tag -width "-o" +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property as if the command +.Nm zfs Cm set Ar property Ns = Ns Ar value +was invoked at the same time the dataset was created. +Any editable ZFS property can also be set at creation time. +Multiple +.Fl o +options can be specified. +An error results if the same property is specified in multiple +.Fl o +options. +.It Fl p +Creates all the non-existing parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +Any property specified on the command line using the +.Fl o +option is ignored. +If the target filesystem already exists, the operation completes successfully. +.It Fl n +Do a dry-run +.Pq Qq No-op +creation. +No datasets will be created. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to validate properties that are passed via +.Fl o +options and those implied by other options. +The actual dataset creation can still fail due to insufficient privileges or +available capacity. +.It Fl P +Print machine-parsable verbose information about the created dataset. +Each line of output contains a key and one or two values, all separated by tabs. +The +.Sy create_ancestors +and +.Sy create +keys have +.Em filesystem +as their only value. +The +.Sy create_ancestors +key only appears if the +.Fl p +option is used. +The +.Sy property +key has two values, a property name that property's value. +The +.Sy property +key may appear zero or more times, once for each property that will be set local +to +.Em filesystem +due to the use of the +.Fl o +option. +.It Fl v +Print verbose information about the created dataset. +.El +.It Xo +.Nm +.Cm create +.Op Fl ps +.Op Fl b Ar blocksize +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Fl V Ar size Ar volume +.Xc +Creates a volume of the given size. +The volume is exported as a block device in +.Pa /dev/zvol/path , +where +.Em path +is the name of the volume in the ZFS namespace. +The size represents the logical size as exported by the device. +By default, a reservation of equal size is created. +.Pp +.Ar size +is automatically rounded up to the nearest multiple of the +.Sy blocksize . +.Bl -tag -width "-b" +.It Fl b Ar blocksize +Equivalent to +.Fl o Sy volblocksize Ns = Ns Ar blocksize . +If this option is specified in conjunction with +.Fl o Sy volblocksize , +the resulting behavior is undefined. +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property as if the +.Nm zfs Cm set Ar property Ns = Ns Ar value +command was invoked at the same time the dataset was created. +Any editable ZFS property can also be set at creation time. +Multiple +.Fl o +options can be specified. +An error results if the same property is specified in multiple +.Fl o +options. 
+.It Fl p +Creates all the non-existing parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +Any property specified on the command line using the +.Fl o +option is ignored. +If the target filesystem already exists, the operation completes successfully. +.It Fl s +Creates a sparse volume with no reservation. +See +.Sy volsize +in the +.Em Native Properties +section of +.Xr zfsprops 8 +for more information about sparse volumes. +.It Fl n +Do a dry-run +.Pq Qq No-op +creation. +No datasets will be created. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to validate properties that are passed via +.Fl o +options and those implied by other options. +The actual dataset creation can still fail due to insufficient privileges or +available capacity. +.It Fl P +Print machine-parsable verbose information about the created dataset. +Each line of output contains a key and one or two values, all separated by tabs. +The +.Sy create_ancestors +and +.Sy create +keys have +.Em volume +as their only value. +The +.Sy create_ancestors +key only appears if the +.Fl p +option is used. +The +.Sy property +key has two values, a property name and that property's value. +The +.Sy property +key may appear zero or more times, once for each property that will be set local +to +.Em volume +due to the use of the +.Fl b +or +.Fl o +options, as well as +.Sy refreservation +if the volume is not sparse. +.It Fl v +Print verbose information about the created dataset. +.El +.El +.Ss ZFS Volumes as Swap +ZFS volumes may be used as swap devices. After creating the volume with the +.Nm zfs Cm create Fl V +command, set up and enable the swap area using the +.Xr mkswap 8 +and +.Xr swapon 8 +commands. Do not swap to a file on a ZFS file system. A ZFS swap file +configuration is not supported. +.Sh SEE ALSO +.Xr zfs-destroy 8 , +.Xr zfs-list 8 , +.Xr zpool-create 8 diff --git a/man/man8/zfs-destroy.8 b/man/man8/zfs-destroy.8 new file mode 100644 index 000000000000..99ae33d5ecf4 --- /dev/null +++ b/man/man8/zfs-destroy.8 @@ -0,0 +1,178 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc.
+.\" +.Dd June 30, 2019 +.Dt ZFS-DESTROY 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm destroy +.Nd Destroys the given dataset(s), snapshot(s), or bookmark. +.Sh SYNOPSIS +.Nm +.Cm destroy +.Op Fl Rfnprv +.Ar filesystem Ns | Ns Ar volume +.Nm +.Cm destroy +.Op Fl Rdnprv +.Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns +.Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns ... +.Nm +.Cm destroy +.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm destroy +.Op Fl Rfnprv +.Ar filesystem Ns | Ns Ar volume +.Xc +Destroys the given dataset. +By default, the command unshares any file systems that are currently shared, +unmounts any file systems that are currently mounted, and refuses to destroy a +dataset that has active dependents +.Pq children or clones . +.Bl -tag -width "-R" +.It Fl R +Recursively destroy all dependents, including cloned file systems outside the +target hierarchy. +.It Fl f +Force an unmount of any file systems using the +.Nm unmount Fl f +command. +This option has no effect on non-file systems or unmounted file systems. +.It Fl n +Do a dry-run +.Pq Qq No-op +deletion. +No data will be deleted. +This is useful in conjunction with the +.Fl v +or +.Fl p +flags to determine what data would be deleted. +.It Fl p +Print machine-parsable verbose information about the deleted data. +.It Fl r +Recursively destroy all children. +.It Fl v +Print verbose information about the deleted data. +.El +.Pp +Extreme care should be taken when applying either the +.Fl r +or the +.Fl R +options, as they can destroy large portions of a pool and cause unexpected +behavior for mounted file systems in use. +.It Xo +.Nm +.Cm destroy +.Op Fl Rdnprv +.Ar filesystem Ns | Ns Ar volume Ns @ Ns Ar snap Ns +.Oo % Ns Ar snap Ns Oo , Ns Ar snap Ns Oo % Ns Ar snap Oc Oc Oc Ns ... +.Xc +The given snapshots are destroyed immediately if and only if the +.Ql zfs destroy +command without the +.Fl d +option would have destroyed it. +Such immediate destruction would occur, for example, if the snapshot had no +clones and the user-initiated reference count were zero. +.Pp +If a snapshot does not qualify for immediate destruction, it is marked for +deferred deletion. +In this state, it exists as a usable, visible snapshot until both of the +preconditions listed above are met, at which point it is destroyed. +.Pp +An inclusive range of snapshots may be specified by separating the first and +last snapshots with a percent sign. +The first and/or last snapshots may be left blank, in which case the +filesystem's oldest or newest snapshot will be implied. +.Pp +Multiple snapshots +.Pq or ranges of snapshots +of the same filesystem or volume may be specified in a comma-separated list of +snapshots. +Only the snapshot's short name +.Po the part after the +.Sy @ +.Pc +should be specified when using a range or comma-separated list to identify +multiple snapshots. +.Bl -tag -width "-R" +.It Fl R +Recursively destroy all clones of these snapshots, including the clones, +snapshots, and children. +If this flag is specified, the +.Fl d +flag will have no effect. +.It Fl d +Destroy immediately. If a snapshot cannot be destroyed now, mark it for +deferred destruction. +.It Fl n +Do a dry-run +.Pq Qq No-op +deletion. +No data will be deleted. +This is useful in conjunction with the +.Fl p +or +.Fl v +flags to determine what data would be deleted. +.It Fl p +Print machine-parsable verbose information about the deleted data. 
+.It Fl r +Destroy +.Pq or mark for deferred deletion +all snapshots with this name in descendent file systems. +.It Fl v +Print verbose information about the deleted data. +.Pp +Extreme care should be taken when applying either the +.Fl r +or the +.Fl R +options, as they can destroy large portions of a pool and cause unexpected +behavior for mounted file systems in use. +.El +.It Xo +.Nm +.Cm destroy +.Ar filesystem Ns | Ns Ar volume Ns # Ns Ar bookmark +.Xc +The given bookmark is destroyed. +.El +.Sh SEE ALSO +.Xr zfs-create 8 , +.Xr zfs-hold 8 diff --git a/man/man8/zfs-diff.8 b/man/man8/zfs-diff.8 new file mode 100644 index 000000000000..bcc12f7cbba7 --- /dev/null +++ b/man/man8/zfs-diff.8 @@ -0,0 +1,91 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-DIFF 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm diff +.Nd Display the difference between two snapshots of a given filesystem. +.Sh SYNOPSIS +.Nm +.Cm diff +.Op Fl FHt +.Ar snapshot Ar snapshot Ns | Ns Ar filesystem +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm diff +.Op Fl FHt +.Ar snapshot Ar snapshot Ns | Ns Ar filesystem +.Xc +Display the difference between a snapshot of a given filesystem and another +snapshot of that filesystem from a later time or the current contents of the +filesystem. +The first column is a character indicating the type of change; the other columns +indicate pathname, new pathname +.Pq in case of rename , +change in link count, and optionally file type and/or change time. +The types of change are: +.Bd -literal +- The path has been removed ++ The path has been created +M The path has been modified +R The path has been renamed +.Ed +.Bl -tag -width "-F" +.It Fl F +Display an indication of the type of file, in a manner similar to the +.Fl F +option of +.Xr ls 1 . +.Bd -literal +B Block device +C Character device +/ Directory +> Door +| Named pipe +@ Symbolic link +P Event port += Socket +F Regular file +.Ed +.It Fl H +Give more parsable tab-separated output, without header lines and without +arrows. +.It Fl t +Display the path's inode change time as the first column of output.
+.El +.El +.Sh SEE ALSO +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-get.8 b/man/man8/zfs-get.8 new file mode 120000 index 000000000000..c70b41ae4064 --- /dev/null +++ b/man/man8/zfs-get.8 @@ -0,0 +1 @@ +zfs-set.8 \ No newline at end of file diff --git a/man/man8/zfs-groupspace.8 b/man/man8/zfs-groupspace.8 new file mode 120000 index 000000000000..8bc2f1df305e --- /dev/null +++ b/man/man8/zfs-groupspace.8 @@ -0,0 +1 @@ +zfs-userspace.8 \ No newline at end of file diff --git a/man/man8/zfs-hold.8 b/man/man8/zfs-hold.8 new file mode 100644 index 000000000000..de30caab153a --- /dev/null +++ b/man/man8/zfs-hold.8 @@ -0,0 +1,110 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-HOLD 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm hold +.Nd Hold a snapshot to prevent it from being removed with the zfs destroy command. +.Sh SYNOPSIS +.Nm +.Cm hold +.Op Fl r +.Ar tag Ar snapshot Ns ... +.Nm +.Cm holds +.Op Fl rH +.Ar snapshot Ns ... +.Nm +.Cm release +.Op Fl r +.Ar tag Ar snapshot Ns ... +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm hold +.Op Fl r +.Ar tag Ar snapshot Ns ... +.Xc +Adds a single reference, named with the +.Ar tag +argument, to the specified snapshot or snapshots. +Each snapshot has its own tag namespace, and tags must be unique within that +space. +.Pp +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +.Nm zfs Cm destroy +command return +.Er EBUSY . +.Bl -tag -width "-r" +.It Fl r +Specifies that a hold with the given tag is applied recursively to the snapshots +of all descendent file systems. +.El +.It Xo +.Nm +.Cm holds +.Op Fl rH +.Ar snapshot Ns ... +.Xc +Lists all existing user references for the given snapshot or snapshots. +.Bl -tag -width "-r" +.It Fl r +Lists the holds that are set on the named descendent snapshots, in addition to +listing the holds on the named snapshot. +.It Fl H +Do not print headers; use tab-delimited output. +.El +.It Xo +.Nm +.Cm release +.Op Fl r +.Ar tag Ar snapshot Ns ... +.Xc +Removes a single reference, named with the +.Ar tag +argument, from the specified snapshot or snapshots. +The tag must already exist for each snapshot.
+If a hold exists on a snapshot, attempts to destroy that snapshot by using the +.Nm zfs Cm destroy +command return +.Er EBUSY . +.Bl -tag -width "-r" +.It Fl r +Recursively releases a hold with the given tag on the snapshots of all +descendent file systems. +.El +.El +.Sh SEE ALSO +.Xr zfs-destroy 8 diff --git a/man/man8/zfs-inherit.8 b/man/man8/zfs-inherit.8 new file mode 120000 index 000000000000..c70b41ae4064 --- /dev/null +++ b/man/man8/zfs-inherit.8 @@ -0,0 +1 @@ +zfs-set.8 \ No newline at end of file diff --git a/man/man8/zfs-jail.8 b/man/man8/zfs-jail.8 new file mode 100644 index 000000000000..8274179bb089 --- /dev/null +++ b/man/man8/zfs-jail.8 @@ -0,0 +1,119 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2011, Pawel Jakub Dawidek +.\" Copyright (c) 2012, Glen Barber +.\" Copyright (c) 2012, Bryan Drewery +.\" Copyright (c) 2013, Steven Hartland +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright (c) 2014, Xin LI +.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. +.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd December 9, 2019 +.Dt ZFS-JAIL 8 +.Os FreeBSD +.Sh NAME +.Nm zfs Ns Pf - Cm jail +.Nd Attaches and detaches ZFS filesystems from FreeBSD jails. +.No A Tn ZFS +dataset can be attached to a jail by using the +.Qq Nm Cm jail +subcommand. You cannot attach a dataset to one jail and the children of the +same dataset to another jail. Nor can you attach the root file system +of the jail, or any dataset which needs to be mounted before the zfs rc script +is run inside the jail, as such a dataset would be attached unmounted until +the rc script inside the jail mounts it. To allow management of the +dataset from within a jail, the +.Sy jailed +property has to be set and the jail needs access to the +.Pa /dev/zfs +device. The +.Sy quota +property cannot be changed from within a jail. See +.Xr jail 8 +for information on how to allow mounting +.Tn ZFS +datasets from within a jail. +.Pp +.No A Tn ZFS +dataset can be detached from a jail using the +.Qq Nm Cm unjail +subcommand.
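+.Pp +For example, a dataset can be delegated to a jail and later detached as follows (the jail ID and dataset name below are illustrative): +.Bd -literal -offset indent +# zfs set jailed=on pool/jailed +# zfs jail 1 pool/jailed +# zfs unjail 1 pool/jailed +.Ed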
+.Pp +After a dataset is attached to a jail and the jailed property is set, a jailed +file system cannot be mounted outside the jail, since the jail administrator +might have set the mount point to an unacceptable value. +.Sh SYNOPSIS +.Nm +.Cm jail +.Ar jailid Ns | Ns Ar jailname filesystem +.Nm +.Cm unjail +.Ar jailid Ns | Ns Ar jailname filesystem +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm jail +.Ar jailid filesystem +.Xc +.Pp +Attaches the specified +.Ar filesystem +to the jail identified by JID +.Ar jailid . +From now on, this file system tree can be managed from within the jail if the +.Sy jailed +property has been set. To use this functionality, the jail needs the +.Va allow.mount +and +.Va allow.mount.zfs +parameters set to 1 and the +.Va enforce_statfs +parameter set to a value lower than 2. +.Pp +See +.Xr jail 8 +for more information on managing jails and configuring the parameters above. +.It Xo +.Nm +.Cm unjail +.Ar jailid filesystem +.Xc +.Pp +Detaches the specified +.Ar filesystem +from the jail identified by JID +.Ar jailid . +.El +.Sh SEE ALSO +.Xr jail 8 , +.Xr zfsprops 8 diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8 new file mode 100644 index 000000000000..ad2b57e6d664 --- /dev/null +++ b/man/man8/zfs-list.8 @@ -0,0 +1,169 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-LIST 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm list +.Nd Lists the property information for the given datasets in tabular form. +.Sh SYNOPSIS +.Nm +.Cm list +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... Oc +.Oo Fl s Ar property Oc Ns ... +.Oo Fl S Ar property Oc Ns ... +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns ... +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm list +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar property Ns Oo , Ns Ar property Oc Ns ... Oc +.Oo Fl s Ar property Oc Ns ... +.Oo Fl S Ar property Oc Ns ... +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Oc Ns ...
+.Xc +Datasets specified on the command line may be given by absolute or relative +pathname. +By default, all file systems and volumes are displayed. +Snapshots are displayed if the +.Sy listsnaps +property is +.Sy on +.Po the default is +.Sy off +.Pc . +The following fields are displayed: +.Sy name Ns \&, Sy used Ns \&, Sy available Ns \&, Sy referenced Ns \&, Sy mountpoint Ns . +.Bl -tag -width "-H" +.It Fl H +Used for scripting mode. +Do not print headers and separate fields by a single tab instead of arbitrary +white space. +.It Fl S Ar property +Same as the +.Fl s +option, but sorts by property in descending order. +.It Fl d Ar depth +Recursively display any children of the dataset, limiting the recursion to +.Ar depth . +A +.Ar depth +of +.Sy 1 +will display only the dataset and its direct children. +.It Fl o Ar property +A comma-separated list of properties to display. +The property must be: +.Bl -bullet +.It +One of the properties described in the +.Em Native Properties +section of +.Xr zfsprops 8 +.It +A user property +.It +The value +.Sy name +to display the dataset name +.It +The value +.Sy space +to display space usage properties on file systems and volumes. +This is a shortcut for specifying +.Fl o Sy name Ns \&, Ns Sy avail Ns \&, Ns Sy used Ns \&, Ns Sy usedsnap Ns \&, Ns +.Sy usedds Ns \&, Ns Sy usedrefreserv Ns \&, Ns Sy usedchild Fl t +.Sy filesystem Ns \&, Ns Sy volume +syntax. +.El +.It Fl p +Display numbers in parsable +.Pq exact +values. +.It Fl r +Recursively display any children of the dataset on the command line. +.It Fl s Ar property +A property for sorting the output by column in ascending order based on the +value of the property. +The property must be one of the properties described in the +.Em Properties +section of +.Xr zfsprops 8 +or the value +.Sy name +to sort by the dataset name. +Multiple properties can be specified at one time using multiple +.Fl s +property options. +Multiple +.Fl s +options are evaluated from left to right in decreasing order of importance. +The following is a list of sorting criteria: +.Bl -bullet +.It +Numeric types sort in numeric order. +.It +String types sort in alphabetical order. +.It +Types inappropriate for a row sort that row to the literal bottom, regardless of +the specified ordering. +.El +.Pp +If no sorting options are specified, the existing behavior of +.Nm zfs Cm list +is preserved. +.It Fl t Ar type +A comma-separated list of types to display, where +.Ar type +is one of +.Sy filesystem , +.Sy snapshot , +.Sy volume , +.Sy bookmark , +or +.Sy all . +For example, specifying +.Fl t Sy snapshot +displays only snapshots. +.El +.El +.Sh SEE ALSO +.Xr zfs-get 8 , +.Xr zfsprops 8 diff --git a/man/man8/zfs-load-key.8 b/man/man8/zfs-load-key.8 new file mode 100644 index 000000000000..72248b6962d9 --- /dev/null +++ b/man/man8/zfs-load-key.8 @@ -0,0 +1,295 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd January 13, 2020 +.Dt ZFS-LOAD-KEY 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm load-key +.Nd Load, unload, or change the encryption key used to access a dataset. +.Sh SYNOPSIS +.Nm +.Cm load-key +.Op Fl nr +.Op Fl L Ar keylocation +.Fl a | Ar filesystem +.Nm +.Cm unload-key +.Op Fl r +.Fl a | Ar filesystem +.Nm +.Cm change-key +.Op Fl l +.Op Fl o Ar keylocation Ns = Ns Ar value +.Op Fl o Ar keyformat Ns = Ns Ar value +.Op Fl o Ar pbkdf2iters Ns = Ns Ar value +.Ar filesystem +.Nm +.Cm change-key +.Fl i +.Op Fl l +.Ar filesystem +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm load-key +.Op Fl nr +.Op Fl L Ar keylocation +.Fl a | Ar filesystem +.Xc +Load the key for +.Ar filesystem , +allowing it and all children that inherit the +.Sy keylocation +property to be accessed. The key will be expected in the format specified by the +.Sy keyformat +and location specified by the +.Sy keylocation +property. Note that if the +.Sy keylocation +is set to +.Sy prompt +the terminal will interactively wait for the key to be entered. Loading a key +will not automatically mount the dataset. If that functionality is desired, +.Nm zfs Cm mount Sy -l +will ask for the key and mount the dataset +.Po +see +.Xr zfs-mount 8 +.Pc . +Once the key is loaded the +.Sy keystatus +property will become +.Sy available . +.Bl -tag -width "-r" +.It Fl r +Recursively loads the keys for the specified filesystem and all descendent +encryption roots. +.It Fl a +Loads the keys for all encryption roots in all imported pools. +.It Fl n +Do a dry-run +.Pq Qq No-op +load-key. This will cause zfs to simply check that the +provided key is correct. This command may be run even if the key is already +loaded. +.It Fl L Ar keylocation +Use +.Ar keylocation +instead of the +.Sy keylocation +property. This will not change the value of the property on the dataset. Note +that if used with either +.Fl r +or +.Fl a , +.Ar keylocation +may only be given as +.Sy prompt . +.El +.It Xo +.Nm +.Cm unload-key +.Op Fl r +.Fl a | Ar filesystem +.Xc +Unloads a key from ZFS, removing the ability to access the dataset and all of +its children that inherit the +.Sy keylocation +property. This requires that the dataset is not currently open or mounted. Once +the key is unloaded the +.Sy keystatus +property will become +.Sy unavailable . +.Bl -tag -width "-r" +.It Fl r +Recursively unloads the keys for the specified filesystem and all descendent +encryption roots. +.It Fl a +Unloads the keys for all encryption roots in all imported pools. 
+.El +.It Xo +.Nm +.Cm change-key +.Op Fl l +.Op Fl o Ar keylocation Ns = Ns Ar value +.Op Fl o Ar keyformat Ns = Ns Ar value +.Op Fl o Ar pbkdf2iters Ns = Ns Ar value +.Ar filesystem +.Xc +.It Xo +.Nm +.Cm change-key +.Fl i +.Op Fl l +.Ar filesystem +.Xc +Changes the user's key (e.g. a passphrase) used to access a dataset. This +command requires that the existing key for the dataset is already loaded into +ZFS. This command may also be used to change the +.Sy keylocation , +.Sy keyformat , +and +.Sy pbkdf2iters +properties as needed. If the dataset was not previously an encryption root it +will become one. Alternatively, the +.Fl i +flag may be provided to cause an encryption root to inherit the parent's key +instead. +.Pp +If the user's key is compromised, +.Nm zfs Cm change-key +does not necessarily protect existing or newly-written data from attack. +Newly-written data will continue to be encrypted with the same master key as +the existing data. The master key is compromised if an attacker obtains a +user key and the corresponding wrapped master key. Currently, +.Nm zfs Cm change-key +does not overwrite the previous wrapped master key on disk, so it is +accessible via forensic analysis for an indeterminate length of time. +.Pp +In the event of a master key compromise, ideally the drives should be securely +erased to remove all the old data (which is readable using the compromised +master key), a new pool created, and the data copied back. This can be +approximated in place by creating new datasets, copying the data +(e.g. using +.Nm zfs Cm send +| +.Nm zfs Cm recv Ns +), and then clearing the free space with +.Nm zpool Cm trim --secure +if supported by your hardware, otherwise +.Nm zpool Cm initialize Ns . +.Bl -tag -width "-r" +.It Fl l +Ensures the key is loaded before attempting to change the key. This is +effectively equivalent to +.Qq Nm zfs Cm load-key Ar filesystem ; Nm zfs Cm change-key Ar filesystem +.It Fl o Ar property Ns = Ns Ar value +Allows the user to set encryption key properties ( +.Sy keyformat , +.Sy keylocation , +and +.Sy pbkdf2iters +) while changing the key. This is the only way to alter +.Sy keyformat +and +.Sy pbkdf2iters +after the dataset has been created. +.It Fl i +Indicates that zfs should make +.Ar filesystem +inherit the key of its parent. Note that this command can only be run on an +encryption root that has an encrypted parent. +.El +.El +.Ss Encryption +Enabling the +.Sy encryption +feature allows for the creation of encrypted filesystems and volumes. ZFS +will encrypt file and zvol data, file attributes, ACLs, permission bits, +directory listings, FUID mappings, and +.Sy userused +/ +.Sy groupused +data. ZFS will not encrypt metadata related to the pool structure, including +dataset and snapshot names, dataset hierarchy, properties, file size, file +holes, and deduplication tables (though the deduplicated data itself is +encrypted). +.Pp +Key rotation is managed by ZFS. Changing the user's key (e.g. a passphrase) +does not require re-encrypting the entire dataset. Datasets can be scrubbed, +resilvered, renamed, and deleted without the encryption keys being loaded (see the +.Nm zfs Cm load-key +subcommand for more info on key loading). +.Pp +Creating an encrypted dataset requires specifying the +.Sy encryption +and +.Sy keyformat +properties at creation time, along with an optional +.Sy keylocation +and +.Sy pbkdf2iters . +After entering an encryption key, the +created dataset will become an encryption root. 
Any descendant datasets will +inherit their encryption key from the encryption root by default, meaning that +loading, unloading, or changing the key for the encryption root will implicitly +do the same for all inheriting datasets. If this inheritance is not desired, +simply supply a +.Sy keyformat +when creating the child dataset or use +.Nm zfs Cm change-key +to break an existing relationship, creating a new encryption root on the child. +Note that the child's +.Sy keyformat +may match that of the parent while still creating a new encryption root, and +that changing the +.Sy encryption +property alone does not create a new encryption root; this would simply use a +different cipher suite with the same key as its encryption root. The one +exception is that clones will always use their origin's encryption key. +As a result of this exception, some encryption-related properties (namely +.Sy keystatus , +.Sy keyformat , +.Sy keylocation , +and +.Sy pbkdf2iters ) +do not inherit like other ZFS properties and instead use the value determined +by their encryption root. Encryption root inheritance can be tracked via the +read-only +.Sy encryptionroot +property. +.Pp +Encryption changes the behavior of a few ZFS +operations. Encryption is applied after compression, so compression ratios are +preserved. Normally checksums in ZFS are 256 bits long, but for encrypted data +the checksum is 128 bits of the user-chosen checksum and 128 bits of MAC from +the encryption suite, which provides additional protection against maliciously +altered data. Deduplication is still possible with encryption enabled, but for +security, datasets will only dedup against themselves, their snapshots, and +their clones. +.Pp +There are a few limitations on encrypted datasets. Encrypted data cannot be +embedded via the +.Sy embedded_data +feature. Encrypted datasets may not have +.Sy copies Ns = Ns Em 3 +since the implementation stores some encryption metadata where the third copy +would normally be. Since compression is applied before encryption, datasets may +be vulnerable to a CRIME-like attack if applications accessing the data allow +for it. Deduplication with encryption will leak information about which blocks +are equivalent in a dataset and will incur an extra CPU cost per block written. +.Sh SEE ALSO +.Xr zfs-create 8 , +.Xr zfs-set 8 , +.Xr zfsprops 8 diff --git a/man/man8/zfs-mount-generator.8.in b/man/man8/zfs-mount-generator.8.in new file mode 100644 index 000000000000..41a2999f0f0a --- /dev/null +++ b/man/man8/zfs-mount-generator.8.in @@ -0,0 +1,248 @@ +.\" +.\" Copyright 2018 Antonio Russo +.\" Copyright 2019 Kjeld Schouten-Lebbing +.\" Copyright 2020 InsanePrawn +.\" +.\" Permission is hereby granted, free of charge, to any person obtaining +.\" a copy of this software and associated documentation files (the +.\" "Software"), to deal in the Software without restriction, including +.\" without limitation the rights to use, copy, modify, merge, publish, +.\" distribute, sublicense, and/or sell copies of the Software, and to +.\" permit persons to whom the Software is furnished to do so, subject to +.\" the following conditions: +.\" +.\" The above copyright notice and this permission notice shall be +.\" included in all copies or substantial portions of the Software. +.\" +.\" THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +.\" EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +.\" MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +.\" NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +.\" LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +.\" OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +.\" WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +.TH "ZFS\-MOUNT\-GENERATOR" "8" "2020-01-19" "ZFS" "zfs-mount-generator" "\"" + +.SH "NAME" +zfs\-mount\-generator \- generates systemd mount units for ZFS +.SH SYNOPSIS +.B @systemdgeneratordir@/zfs\-mount\-generator +.sp +.SH DESCRIPTION +zfs\-mount\-generator implements the \fBGenerators Specification\fP +of +.BR systemd (1), +and is called during early boot to generate +.BR systemd.mount (5) +units for automatically mounted datasets. Mount ordering and dependencies +are created for all tracked pools (see below). + +.SS ENCRYPTION KEYS +If the dataset is an encryption root, a service that loads the associated key (either from file or through a +.BR systemd\-ask\-password (1) +prompt) will be created. This service +.BR RequiresMountsFor +the path of the key (if file-based) and also copies the mount unit's +.BR After , +.BR Before +and +.BR Requires . +All mount units of encrypted datasets add the key\-load service for their encryption root to their +.BR Wants +and +.BR After . +The service will not be +.BR Want ed +or +.BR Require d +by +.BR local-fs.target +directly, and so will only be started manually or as a dependency of a started mount unit. + +.SS UNIT ORDERING AND DEPENDENCIES +mount unit's +.BR Before +\-> +key\-load service (if any) +\-> +mount unit +\-> +mount unit's +.BR After + +It is worth noting that when a mount unit is activated, it activates all available mount units for parent paths to its mountpoint, i.e. activating the mount unit for /tmp/foo/1/2/3 automatically activates all available mount units for /tmp, /tmp/foo, /tmp/foo/1, and /tmp/foo/1/2. This is true for any combination of mount units from any sources, not just ZFS. + +.SS CACHE FILE +Because ZFS pools may not be available very early in the boot process, +information on ZFS mountpoints must be stored separately. The output of the command +.PP +.RS 4 +zfs list -H -o name,mountpoint,canmount,atime,relatime,devices,exec,readonly,setuid,nbmand,encroot,keylocation,org.openzfs.systemd:requires,org.openzfs.systemd:requires-mounts-for,org.openzfs.systemd:before,org.openzfs.systemd:after,org.openzfs.systemd:wanted-by,org.openzfs.systemd:required-by,org.openzfs.systemd:nofail,org.openzfs.systemd:ignore + +.RE +.PP +for datasets that should be mounted by systemd, should be kept +separate from the pool, at +.PP +.RS 4 +.RI @sysconfdir@/zfs/zfs-list.cache/ POOLNAME +. +.RE +.PP +The cache file, if writeable, will be kept synchronized with the pool +state by the ZEDLET +.PP +.RS 4 +history_event-zfs-list-cacher.sh . +.RE +.PP +.sp +.SS PROPERTIES +The behavior of the generator script can be influenced by the following dataset properties: +.sp +.TP 4 +.BR canmount = on | off | noauto +If a dataset has +.BR mountpoint +set and +.BR canmount +is not +.BR off , +a mount unit will be generated. +Additionally, if +.BR canmount +is +.BR on , +.BR local-fs.target +will gain a dependency on the mount unit. + +This behavior is equivalent to the +.BR auto +and +.BR noauto +legacy mount options, see +.BR systemd.mount (5). + +Encryption roots always generate a key-load service, even for +.BR canmount=off . +.TP 4 +.BR org.openzfs.systemd:requires\-mounts\-for = \fIpath\fR...
+Space\-separated list of mountpoints to require to be mounted for this mount unit. +.TP 4 +.BR org.openzfs.systemd:before = \fIunit\fR... +The mount unit and associated key\-load service will be ordered before this space\-separated list of units. +.TP 4 +.BR org.openzfs.systemd:after = \fIunit\fR... +The mount unit and associated key\-load service will be ordered after this space\-separated list of units. +.TP 4 +.BR org.openzfs.systemd:wanted\-by = \fIunit\fR... +Space-separated list of units that will gain a +.BR Wants +dependency on this mount unit. +Setting this property implies +.BR noauto . +.TP 4 +.BR org.openzfs.systemd:required\-by = \fIunit\fR... +Space-separated list of units that will gain a +.BR Requires +dependency on this mount unit. +Setting this property implies +.BR noauto . +.TP 4 +.BR org.openzfs.systemd:nofail = unset | on | off +Toggles between a +.BR Wants +and +.BR Requires +type of dependency between the mount unit and +.BR local-fs.target , +if +.BR noauto +isn't set or implied. + +.BR on : +Mount will be +.BR WantedBy +local-fs.target + +.BR off : +Mount will be +.BR Before +and +.BR RequiredBy +local-fs.target + +.BR unset : +Mount will be +.BR Before +and +.BR WantedBy +local-fs.target +.TP 4 +.BR org.openzfs.systemd:ignore = on | off +If set to +.BR on , +do not generate a mount unit for this dataset. + +.RE +See also +.BR systemd.mount (5) + +.PP +.SH EXAMPLE +To begin, enable tracking for the pool: +.PP +.RS 4 +touch +.RI @sysconfdir@/zfs/zfs-list.cache/ POOLNAME +.RE +.PP +Then, enable the tracking ZEDLET: +.PP +.RS 4 +ln -s "@zfsexecdir@/zed.d/history_event-zfs-list-cacher.sh" "@sysconfdir@/zfs/zed.d" + +systemctl enable zfs-zed.service + +systemctl restart zfs-zed.service +.RE +.PP +Force the running of the ZEDLET by setting a monitored property, e.g. +.BR canmount , +for at least one dataset in the pool: +.PP +.RS 4 +zfs set canmount=on +.I DATASET +.RE +.PP +This forces an update to the stale cache file. + +To test the generator output, run +.PP +.RS 4 +@systemdgeneratordir@/zfs-mount-generator /tmp/zfs-mount-generator . . +.RE +.PP +This will generate units and dependencies in +.I /tmp/zfs-mount-generator +for you to inspect. The second and third arguments are ignored. + +If you're satisfied with the generated units, instruct systemd to re-run all generators: +.PP +.RS 4 +systemctl daemon-reload +.RE +.PP + +.sp +.SH SEE ALSO +.BR zfs (5) +.BR zfs-events (5) +.BR zed (8) +.BR zpool (5) +.BR systemd (1) +.BR systemd.target (5) +.BR systemd.special (7) +.BR systemd.mount (7) diff --git a/man/man8/zfs-mount.8 b/man/man8/zfs-mount.8 new file mode 100644 index 000000000000..feddafb28e2f --- /dev/null +++ b/man/man8/zfs-mount.8 @@ -0,0 +1,128 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd February 16, 2019 +.Dt ZFS-MOUNT 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm mount +.Nd Manage mount state of ZFS file systems. +.Sh SYNOPSIS +.Nm +.Cm mount +.Nm +.Cm mount +.Op Fl Oflv +.Op Fl o Ar options +.Fl a | Ar filesystem +.Nm +.Cm unmount +.Op Fl fu +.Fl a | Ar filesystem Ns | Ns Ar mountpoint +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm mount +.Xc +Displays all ZFS file systems currently mounted. +.It Xo +.Nm +.Cm mount +.Op Fl Oflv +.Op Fl o Ar options +.Fl a | Ar filesystem +.Xc +Mount ZFS filesystem on a path described by its +.Sy mountpoint +property, if the path exists and is empty. If +.Sy mountpoint +is set to +.Em legacy , +the filesystem should be instead mounted using +.Xr mount 8 . +.Bl -tag -width "-O" +.It Fl O +Perform an overlay mount. Allows mounting in non-empty +.Sy mountpoint . +See +.Xr mount 8 +for more information. +.It Fl a +Mount all available ZFS file systems. +Invoked automatically as part of the boot process if configured. +.It Ar filesystem +Mount the specified filesystem. +.It Fl o Ar options +An optional, comma-separated list of mount options to use temporarily for the +duration of the mount. +See the +.Em Temporary Mount Point Properties +section of +.Xr zfsprops 8 +for details. +.It Fl l +Load keys for encrypted filesystems as they are being mounted. This is +equivalent to executing +.Nm zfs Cm load-key +on each encryption root before mounting it. Note that if a filesystem has a +.Sy keylocation +of +.Sy prompt +this will cause the terminal to interactively block after asking for the key. +.It Fl v +Report mount progress. +.It Fl f +Attempt to force mounting of all filesystems, even those that couldn't normally be mounted (e.g. redacted datasets). +.El +.It Xo +.Nm +.Cm unmount +.Op Fl fu +.Fl a | Ar filesystem Ns | Ns Ar mountpoint +.Xc +Unmounts currently mounted ZFS file systems. +.Bl -tag -width "-a" +.It Fl a +Unmount all available ZFS file systems. +Invoked automatically as part of the shutdown process. +.It Fl f +Forcefully unmount the file system, even if it is currently in use. +This option is not supported on Linux. +.It Fl u +Unload keys for any encryption roots unmounted by this command. +.It Ar filesystem Ns | Ns Ar mountpoint +Unmount the specified filesystem. +The command can also be given a path to a ZFS file system mount point on the +system. +.El +.El diff --git a/man/man8/zfs-program.8 b/man/man8/zfs-program.8 new file mode 100644 index 000000000000..41d38587e547 --- /dev/null +++ b/man/man8/zfs-program.8 @@ -0,0 +1,634 @@ +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. 
+.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright (c) 2016, 2019 by Delphix. All Rights Reserved. +.\" Copyright (c) 2019, 2020 by Christian Schwarz. All Rights Reserved. +.\" Copyright 2020 Joyent, Inc. +.\" +.Dd February 3, 2020 +.Dt ZFS-PROGRAM 8 +.Os +.Sh NAME +.Nm zfs program +.Nd executes ZFS channel programs +.Sh SYNOPSIS +.Cm "zfs program" +.Op Fl jn +.Op Fl t Ar instruction-limit +.Op Fl m Ar memory-limit +.Ar pool +.Ar script +.\".Op Ar optional arguments to channel program +.Sh DESCRIPTION +The ZFS channel program interface allows ZFS administrative operations to be +run programmatically as a Lua script. +The entire script is executed atomically, with no other administrative +operations taking effect concurrently. +A library of ZFS calls is made available to channel program scripts. +Channel programs may only be run with root privileges. +.Pp +A modified version of the Lua 5.2 interpreter is used to run channel program +scripts. +The Lua 5.2 manual can be found at: +.Bd -centered -offset indent +.Lk http://www.lua.org/manual/5.2/ +.Ed +.Pp +The channel program given by +.Ar script +will be run on +.Ar pool , +and any attempts to access or modify other pools will cause an error. +.Sh OPTIONS +.Bl -tag -width "-t" +.It Fl j +Display channel program output in JSON format. When this flag is specified and +standard output is empty - channel program encountered an error. The details of +such an error will be printed to standard error in plain text. +.It Fl n +Executes a read-only channel program, which runs faster. +The program cannot change on-disk state by calling functions from the +zfs.sync submodule. +The program can be used to gather information such as properties and +determining if changes would succeed (zfs.check.*). +Without this flag, all pending changes must be synced to disk before a +channel program can complete. +.It Fl t Ar instruction-limit +Limit the number of Lua instructions to execute. +If a channel program executes more than the specified number of instructions, +it will be stopped and an error will be returned. +The default limit is 10 million instructions, and it can be set to a maximum of +100 million instructions. +.It Fl m Ar memory-limit +Memory limit, in bytes. +If a channel program attempts to allocate more memory than the given limit, it +will be stopped and an error returned. +The default memory limit is 10 MB, and can be set to a maximum of 100 MB. +.El +.Pp +All remaining argument strings will be passed directly to the Lua script as +described in the +.Sx LUA INTERFACE +section below. +.Sh LUA INTERFACE +A channel program can be invoked either from the command line, or via a library +call to +.Fn lzc_channel_program . +.Ss Arguments +Arguments passed to the channel program are converted to a Lua table. +If invoked from the command line, extra arguments to the Lua script will be +accessible as an array stored in the argument table with the key 'argv': +.Bd -literal -offset indent +args = ... +argv = args["argv"] +-- argv == {1="arg1", 2="arg2", ...} +.Ed +.Pp +If invoked from the libZFS interface, an arbitrary argument list can be +passed to the channel program, which is accessible via the same +"..." syntax in Lua: +.Bd -literal -offset indent +args = ... 
+-- args == {"foo"="bar", "baz"={...}, ...} +.Ed +.Pp +Note that because Lua arrays are 1-indexed, arrays passed to Lua from the +libZFS interface will have their indices incremented by 1. +That is, the element +in +.Va arr[0] +in a C array passed to a channel program will be stored in +.Va arr[1] +when accessed from Lua. +.Ss Return Values +Lua return statements take the form: +.Bd -literal -offset indent +return ret0, ret1, ret2, ... +.Ed +.Pp +Return statements returning multiple values are permitted internally in a +channel program script, but attempting to return more than one value from the +top level of the channel program is not permitted and will throw an error. +However, tables containing multiple values can still be returned. +If invoked from the command line, a return statement: +.Bd -literal -offset indent +a = {foo="bar", baz=2} +return a +.Ed +.Pp +Will be output formatted as: +.Bd -literal -offset indent +Channel program fully executed with return value: + return: + baz: 2 + foo: 'bar' +.Ed +.Ss Fatal Errors +If the channel program encounters a fatal error while running, a non-zero exit +status will be returned. +If more information about the error is available, a singleton list will be +returned detailing the error: +.Bd -literal -offset indent +error: "error string, including Lua stack trace" +.Ed +.Pp +If a fatal error is returned, the channel program may not have executed at all, +may have partially executed, or may have fully executed but failed to pass a +return value back to userland. +.Pp +If the channel program exhausts an instruction or memory limit, a fatal error +will be generated and the program will be stopped, leaving the program partially +executed. +No attempt is made to reverse or undo any operations already performed. +Note that because both the instruction count and amount of memory used by a +channel program are deterministic when run against the same inputs and +filesystem state, as long as a channel program has run successfully once, you +can guarantee that it will finish successfully against a similarly sized system. +.Pp +If a channel program attempts to return too large a value, the program will +fully execute but exit with a nonzero status code and no return value. +.Pp +.Em Note : +ZFS API functions do not generate Fatal Errors when correctly invoked; they +return an error code and the channel program continues executing. +See the +.Sx ZFS API +section below for function-specific details on error return codes. +.Ss Lua to C Value Conversion +When invoking a channel program via the libZFS interface, it is necessary to +translate arguments and return values from Lua values to their C equivalents, +and vice-versa. +.Pp +There is a correspondence between nvlist values in C and Lua tables. +A Lua table which is returned from the channel program will be recursively +converted to an nvlist, with table values converted to their natural +equivalents: +.Bd -literal -offset indent +string -> string +number -> int64 +boolean -> boolean_value +nil -> boolean (no value) +table -> nvlist +.Ed +.Pp +Likewise, table keys are replaced by string equivalents as follows: +.Bd -literal -offset indent +string -> no change +number -> signed decimal string ("%lld") +boolean -> "true" | "false" +.Ed +.Pp +Any collision of table key strings (for example, the string "true" and a +true boolean value) will cause a fatal error. +.Pp +Lua numbers are represented internally as signed 64-bit integers.
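+.Pp +For example, a channel program ending with the following return statement (the key names are illustrative) would produce an nvlist containing a string, an int64, and a boolean_value: +.Bd -literal -offset indent +return {pool="rpool", snapcount=3, recursive=true} +.Ed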
+.Sh LUA STANDARD LIBRARY
+The following Lua built-in base library functions are available:
+.Bd -literal -offset indent
+assert                rawlen
+collectgarbage        rawget
+error                 rawset
+getmetatable          select
+ipairs                setmetatable
+next                  tonumber
+pairs                 tostring
+rawequal              type
+.Ed
+.Pp
+All functions in the
+.Em coroutine ,
+.Em string ,
+and
+.Em table
+built-in submodules are also available.
+A complete list and documentation of these modules is available in the Lua
+manual.
+.Pp
+The following base library functions have been disabled and are
+not available for use in channel programs:
+.Bd -literal -offset indent
+dofile
+loadfile
+load
+pcall
+print
+xpcall
+.Ed
+.Sh ZFS API
+.Ss Function Arguments
+Each API function takes a fixed set of required positional arguments and
+optional keyword arguments.
+For example, the destroy function takes a single positional string argument
+(the name of the dataset to destroy) and an optional "defer" keyword boolean
+argument.
+When using parentheses to specify the arguments to a Lua function, only
+positional arguments can be used:
+.Bd -literal -offset indent
+zfs.sync.destroy("rpool@snap")
+.Ed
+.Pp
+To use keyword arguments, functions must be called with a single argument that
+is a Lua table containing entries mapping integers to positional arguments and
+strings to keyword arguments:
+.Bd -literal -offset indent
+zfs.sync.destroy({[1]="rpool@snap", defer=true})
+.Ed
+.Pp
+The Lua language allows curly braces to be used in place of parentheses as
+syntactic sugar for this calling convention:
+.Bd -literal -offset indent
+zfs.sync.destroy{"rpool@snap", defer=true}
+.Ed
+.Ss Function Return Values
+If an API function succeeds, it returns 0.
+If it fails, it returns an error code and the channel program continues
+executing.
+API functions do not generate Fatal Errors except in the case of an
+unrecoverable internal file system error.
+.Pp
+In addition to returning an error code, some functions also return extra
+details describing what caused the error.
+This extra description is given as a second return value, and will always be a
+Lua table, or Nil if no error details were returned.
+Different keys will exist in the error details table depending on the function
+and error case.
+Any such function may be called expecting a single return value:
+.Bd -literal -offset indent
+errno = zfs.sync.promote(dataset)
+.Ed
+.Pp
+Or, the error details can be retrieved:
+.Bd -literal -offset indent
+errno, details = zfs.sync.promote(dataset)
+if (errno == EEXIST) then
+    assert(details ~= Nil)
+    list_of_conflicting_snapshots = details
+end
+.Ed
+.Pp
+The following global aliases for API function error return codes are defined
+for use in channel programs:
+.Bd -literal -offset indent
+EPERM     ECHILD    ENODEV    ENOSPC
+ENOENT    EAGAIN    ENOTDIR   ESPIPE
+ESRCH     ENOMEM    EISDIR    EROFS
+EINTR     EACCES    EINVAL    EMLINK
+EIO       EFAULT    ENFILE    EPIPE
+ENXIO     ENOTBLK   EMFILE    EDOM
+E2BIG     EBUSY     ENOTTY    ERANGE
+ENOEXEC   EEXIST    ETXTBSY   EDQUOT
+EBADF     EXDEV     EFBIG
+.Ed
+.Ss API Functions
+For detailed descriptions of the exact behavior of any zfs administrative
+operations, see the main
+.Xr zfs 8
+manual page.
+.Bl -tag -width "xx"
+.It Em zfs.debug(msg)
+Record a debug message in the zfs_dbgmsg log.
+A log of these messages can be printed via mdb's "::zfs_dbgmsg" command, or
+can be monitored live by running:
+.Bd -literal -offset indent
+ dtrace -n 'zfs-dbgmsg{trace(stringof(arg0))}'
+.Ed
+.Pp
+msg (string)
+.Bd -ragged -compact -offset "xxxx"
+Debug message to be printed.
+.Ed +.It Em zfs.exists(dataset) +Returns true if the given dataset exists, or false if it doesn't. +A fatal error will be thrown if the dataset is not in the target pool. +That is, in a channel program running on rpool, +zfs.exists("rpool/nonexistent_fs") returns false, but +zfs.exists("somepool/fs_that_may_exist") will error. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Dataset to check for existence. +Must be in the target pool. +.Ed +.It Em zfs.get_prop(dataset, property) +Returns two values. +First, a string, number or table containing the property value for the given +dataset. +Second, a string containing the source of the property (i.e. the name of the +dataset in which it was set or nil if it is readonly). +Throws a Lua error if the dataset is invalid or the property doesn't exist. +Note that Lua only supports int64 number types whereas ZFS number properties +are uint64. +This means very large values (like guid) may wrap around and appear negative. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Filesystem or snapshot path to retrieve properties from. +.Ed +.Pp +property (string) +.Bd -ragged -compact -offset "xxxx" +Name of property to retrieve. +All filesystem, snapshot and volume properties are supported except +for 'mounted' and 'iscsioptions.' +Also supports the 'written@snap' and 'written#bookmark' properties and +the '@id' properties, though the id must be in numeric +form. +.Ed +.El +.Bl -tag -width "xx" +.It Sy zfs.sync submodule +The sync submodule contains functions that modify the on-disk state. +They are executed in "syncing context". +.Pp +The available sync submodule functions are as follows: +.Bl -tag -width "xx" +.It Em zfs.sync.destroy(dataset, [defer=true|false]) +Destroy the given dataset. +Returns 0 on successful destroy, or a nonzero error code if the dataset could +not be destroyed (for example, if the dataset has any active children or +clones). +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Filesystem or snapshot to be destroyed. +.Ed +.Pp +[optional] defer (boolean) +.Bd -ragged -compact -offset "xxxx" +Valid only for destroying snapshots. +If set to true, and the snapshot has holds or clones, allows the snapshot to be +marked for deferred deletion rather than failing. +.Ed +.It Em zfs.sync.inherit(dataset, property) +Clears the specified property in the given dataset, causing it to be inherited +from an ancestor, or restored to the default if no ancestor property is set. +The +.Ql zfs inherit -S +option has not been implemented. +Returns 0 on success, or a nonzero error code if the property could not be +cleared. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Filesystem or snapshot containing the property to clear. +.Ed +.Pp +property (string) +.Bd -ragged -compact -offset "xxxx" +The property to clear. +Allowed properties are the same as those for the +.Nm zfs Cm inherit +command. +.Ed +.It Em zfs.sync.promote(dataset) +Promote the given clone to a filesystem. +Returns 0 on successful promotion, or a nonzero error code otherwise. +If EEXIST is returned, the second return value will be an array of the clone's +snapshots whose names collide with snapshots of the parent filesystem. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Clone to be promoted. +.Ed +.It Em zfs.sync.rollback(filesystem) +Rollback to the previous snapshot for a dataset. +Returns 0 on successful rollback, or a nonzero error code otherwise. 
+Rollbacks can be performed on filesystems or zvols, but not on snapshots +or mounted datasets. +EBUSY is returned in the case where the filesystem is mounted. +.Pp +filesystem (string) +.Bd -ragged -compact -offset "xxxx" +Filesystem to rollback. +.Ed +.It Em zfs.sync.set_prop(dataset, property, value) +Sets the given property on a dataset. +Currently only user properties are supported. +Returns 0 if the property was set, or a nonzero error code otherwise. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +The dataset where the property will be set. +.Ed +.Pp +property (string) +.Bd -ragged -compact -offset "xxxx" +The property to set. +Only user properties are supported. +.Ed +.Pp +value (string) +.Bd -ragged -compact -offset "xxxx" +The value of the property to be set. +.Ed +.It Em zfs.sync.snapshot(dataset) +Create a snapshot of a filesystem. +Returns 0 if the snapshot was successfully created, +and a nonzero error code otherwise. +.Pp +Note: Taking a snapshot will fail on any pool older than legacy version 27. +To enable taking snapshots from ZCP scripts, the pool must be upgraded. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Name of snapshot to create. +.Ed +.It Em zfs.sync.bookmark(source, newbookmark) +Create a bookmark of an existing source snapshot or bookmark. +Returns 0 if the new bookmark was successfully created, +and a nonzero error code otherwise. +.Pp +Note: Bookmarking requires the corresponding pool feature to be enabled. +.Pp +source (string) +.Bd -ragged -compact -offset "xxxx" +Full name of the existing snapshot or bookmark. +.Ed +.Pp +newbookmark (string) +.Bd -ragged -compact -offset "xxxx" +Full name of the new bookmark. +.El +.It Sy zfs.check submodule +For each function in the zfs.sync submodule, there is a corresponding zfs.check +function which performs a "dry run" of the same operation. +Each takes the same arguments as its zfs.sync counterpart and returns 0 if the +operation would succeed, or a non-zero error code if it would fail, along with +any other error details. +That is, each has the same behavior as the corresponding sync function except +for actually executing the requested change. +For example, +.Em zfs.check.destroy("fs") +returns 0 if +.Em zfs.sync.destroy("fs") +would successfully destroy the dataset. +.Pp +The available zfs.check functions are: +.Bl -tag -width "xx" +.It Em zfs.check.destroy(dataset, [defer=true|false]) +.It Em zfs.check.promote(dataset) +.It Em zfs.check.rollback(filesystem) +.It Em zfs.check.set_property(dataset, property, value) +.It Em zfs.check.snapshot(dataset) +.El +.It Sy zfs.list submodule +The zfs.list submodule provides functions for iterating over datasets and +properties. +Rather than returning tables, these functions act as Lua iterators, and are +generally used as follows: +.Bd -literal -offset indent +for child in zfs.list.children("rpool") do + ... +end +.Ed +.Pp +The available zfs.list functions are: +.Bl -tag -width "xx" +.It Em zfs.list.clones(snapshot) +Iterate through all clones of the given snapshot. +.Pp +snapshot (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid snapshot path in the current pool. +.Ed +.It Em zfs.list.snapshots(dataset) +Iterate through all snapshots of the given dataset. +Each snapshot is returned as a string containing the full dataset name, e.g. +"pool/fs@snap". +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid filesystem or volume. 
+.Ed +.It Em zfs.list.children(dataset) +Iterate through all direct children of the given dataset. +Each child is returned as a string containing the full dataset name, e.g. +"pool/fs/child". +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid filesystem or volume. +.Ed +.It Em zfs.list.bookmarks(dataset) +Iterate through all bookmarks of the given dataset. Each bookmark is returned +as a string containing the full dataset name, e.g. "pool/fs#bookmark". +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid filesystem or volume. +.Ed +.It Em zfs.list.holds(snapshot) +Iterate through all user holds on the given snapshot. Each hold is returned +as a pair of the hold's tag and the timestamp (in seconds since the epoch) at +which it was created. +.Pp +snapshot (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid snapshot. +.Ed +.It Em zfs.list.properties(dataset) +An alias for zfs.list.user_properties (see relevant entry). +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid filesystem, snapshot, or volume. +.Ed +.It Em zfs.list.user_properties(dataset) +Iterate through all user properties for the given dataset. For each +step of the iteration, output the property name, its value, and its source. +Throws a Lua error if the dataset is invalid. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid filesystem, snapshot, or volume. +.Ed +.It Em zfs.list.system_properties(dataset) +Returns an array of strings, the names of the valid system (non-user defined) +properties for the given dataset. +Throws a Lua error if the dataset is invalid. +.Pp +dataset (string) +.Bd -ragged -compact -offset "xxxx" +Must be a valid filesystem, snapshot or volume. +.Ed +.El +.El +.Sh EXAMPLES +.Ss Example 1 +The following channel program recursively destroys a filesystem and all its +snapshots and children in a naive manner. +Note that this does not involve any error handling or reporting. +.Bd -literal -offset indent +function destroy_recursive(root) + for child in zfs.list.children(root) do + destroy_recursive(child) + end + for snap in zfs.list.snapshots(root) do + zfs.sync.destroy(snap) + end + zfs.sync.destroy(root) +end +destroy_recursive("pool/somefs") +.Ed +.Ss Example 2 +A more verbose and robust version of the same channel program, which +properly detects and reports errors, and also takes the dataset to destroy +as a command line argument, would be as follows: +.Bd -literal -offset indent +succeeded = {} +failed = {} + +function destroy_recursive(root) + for child in zfs.list.children(root) do + destroy_recursive(child) + end + for snap in zfs.list.snapshots(root) do + err = zfs.sync.destroy(snap) + if (err ~= 0) then + failed[snap] = err + else + succeeded[snap] = err + end + end + err = zfs.sync.destroy(root) + if (err ~= 0) then + failed[root] = err + else + succeeded[root] = err + end +end + +args = ... +argv = args["argv"] + +destroy_recursive(argv[1]) + +results = {} +results["succeeded"] = succeeded +results["failed"] = failed +return results +.Ed +.Ss Example 3 +The following function performs a forced promote operation by attempting to +promote the given clone and destroying any conflicting snapshots. +.Bd -literal -offset indent +function force_promote(ds) + errno, details = zfs.check.promote(ds) + if (errno == EEXIST) then + assert(details ~= Nil) + for i, snap in ipairs(details) do + zfs.sync.destroy(ds .. "@" .. 
snap)
+        end
+    elseif (errno ~= 0) then
+        return errno
+    end
+    return zfs.sync.promote(ds)
+end
+.Ed
diff --git a/man/man8/zfs-project.8 b/man/man8/zfs-project.8
new file mode 100644
index 000000000000..d3171c05c5bc
--- /dev/null
+++ b/man/man8/zfs-project.8
@@ -0,0 +1,153 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright 2011 Joshua M. Clulow
+.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\" Copyright (c) 2014 by Adam Stevko. All rights reserved.
+.\" Copyright (c) 2014 Integros [integros.com]
+.\" Copyright 2019 Richard Laager. All rights reserved.
+.\" Copyright 2018 Nexenta Systems, Inc.
+.\" Copyright 2019 Joyent, Inc.
+.\"
+.Dd June 30, 2019
+.Dt ZFS-PROJECT 8
+.Os
+.Sh NAME
+.Nm zfs Ns Pf - Cm project
+.Nd List, set, or clear project ID and/or inherit flag on files or directories.
+.Sh SYNOPSIS
+.Nm
+.Cm project
+.Oo Fl d Ns | Ns Fl r Ns Oc
+.Ar file Ns | Ns Ar directory Ns ...
+.Nm
+.Cm project
+.Fl C
+.Oo Fl kr Ns Oc
+.Ar file Ns | Ns Ar directory Ns ...
+.Nm
+.Cm project
+.Fl c
+.Oo Fl 0 Ns Oc
+.Oo Fl d Ns | Ns Fl r Ns Oc
+.Op Fl p Ar id
+.Ar file Ns | Ns Ar directory Ns ...
+.Nm
+.Cm project
+.Op Fl p Ar id
+.Oo Fl rs Ns Oc
+.Ar file Ns | Ns Ar directory Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width ""
+.It Xo
+.Nm
+.Cm project
+.Oo Fl d Ns | Ns Fl r Ns Oc
+.Ar file Ns | Ns Ar directory Ns ...
+.Xc
+List the project identifier (ID) and inherit flag of the given files or
+directories.
+.Bl -tag -width "-d"
+.It Fl d
+Show the directory's own project ID and inherit flag, not those of its
+children.
+Overrides a previously specified
+.Fl r
+option.
+.It Fl r
+Show all subdirectories recursively.
+Overrides a previously specified
+.Fl d
+option.
+.El
+.It Xo
+.Nm
+.Cm project
+.Fl C
+.Oo Fl kr Ns Oc
+.Ar file Ns | Ns Ar directory Ns ...
+.Xc
+Clear the project inherit flag and/or ID on the given files or directories.
+.Bl -tag -width "-k"
+.It Fl k
+Keep the project ID unchanged.
+If not specified, the project ID will be reset to zero.
+.It Fl r
+Clear recursively on subdirectories.
+.El
+.It Xo
+.Nm
+.Cm project
+.Fl c
+.Oo Fl 0 Ns Oc
+.Oo Fl d Ns | Ns Fl r Ns Oc
+.Op Fl p Ar id
+.Ar file Ns | Ns Ar directory Ns ...
+.Xc
+Check the project ID and inherit flag on the given files or directories;
+report entries that lack the project inherit flag, or whose project IDs differ
+from the value given via the
+.Fl p
+option or, if none is given, from the target directory's project ID.
+.Bl -tag -width "-0"
+.It Fl 0
+Print file names terminated by a NUL character instead of the default newline,
+as with
+"find -print0".
+.It Fl d
+Check the directory's own project ID and inherit flag, not those of its
+children.
+Overrides a previously specified
+.Fl r
+option.
+.It Fl p
+Specify the reference ID against which the project IDs of the target files or
+directories are compared.
+If not specified, the project ID of the target (top) directory is used as the
+reference.
+.It Fl r
+Check subdirectories recursively.
+Overrides a previously specified
+.Fl d
+option.
+.El
+.It Xo
+.Nm
+.Cm project
+.Op Fl p Ar id
+.Oo Fl rs Ns Oc
+.Ar file Ns | Ns Ar directory Ns ...
+.Xc
+Set the project ID and/or inherit flag on the given files or directories.
+.Bl -tag -width "-p"
+.It Fl p
+Set the project ID of the given files or directories to the given value.
+.It Fl r
+Set recursively on subdirectories.
+.It Fl s
+Set the project inherit flag on the given files or directories.
+This is usually used to set up a tree quota on a directory, together with the
+.Fl r
+option.
+When setting up a tree quota, the directory's project ID is, by default,
+applied to all of its descendants unless the project ID is specified
+explicitly via the
+.Fl p
+option.
+.El
+.El
+.Sh SEE ALSO
+.Xr zfs-projectspace 8
diff --git a/man/man8/zfs-projectspace.8 b/man/man8/zfs-projectspace.8
new file mode 120000
index 000000000000..8bc2f1df305e
--- /dev/null
+++ b/man/man8/zfs-projectspace.8
@@ -0,0 +1 @@
+zfs-userspace.8
\ No newline at end of file
diff --git a/man/man8/zfs-promote.8 b/man/man8/zfs-promote.8
new file mode 100644
index 000000000000..08cd8b2b94f2
--- /dev/null
+++ b/man/man8/zfs-promote.8
@@ -0,0 +1,69 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright 2011 Joshua M. Clulow
+.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\" Copyright (c) 2014 by Adam Stevko. All rights reserved.
+.\" Copyright (c) 2014 Integros [integros.com]
+.\" Copyright 2019 Richard Laager. All rights reserved.
+.\" Copyright 2018 Nexenta Systems, Inc.
+.\" Copyright 2019 Joyent, Inc.
+.\"
+.Dd June 30, 2019
+.Dt ZFS-PROMOTE 8
+.Os
+.Sh NAME
+.Nm zfs Ns Pf - Cm promote
+.Nd Promotes a clone file system to no longer be dependent on its origin snapshot.
+.Sh SYNOPSIS
+.Nm
+.Cm promote
+.Ar clone-filesystem
+.Sh DESCRIPTION
+.Bl -tag -width ""
+.It Xo
+.Nm
+.Cm promote
+.Ar clone-filesystem
+.Xc
+The
+.Cm promote
+command makes it possible to destroy the file system that the clone was created
+from.
+The clone parent-child dependency relationship is reversed, so that the origin
+file system becomes a clone of the specified file system.
+.Pp +The snapshot that was cloned, and any snapshots previous to this snapshot, are +now owned by the promoted clone. +The space they use moves from the origin file system to the promoted clone, so +enough space must be available to accommodate these snapshots. +No new space is consumed by this operation, but the space accounting is +adjusted. +The promoted clone must not have any conflicting snapshot names of its own. +The +.Xr zfs-rename 8 +subcommand can be used to rename any conflicting snapshots. +.El +.Sh SEE ALSO +.Xr zfs-clone 8 diff --git a/man/man8/zfs-receive.8 b/man/man8/zfs-receive.8 new file mode 100644 index 000000000000..3cd17fea4a2d --- /dev/null +++ b/man/man8/zfs-receive.8 @@ -0,0 +1,375 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd February 16, 2020 +.Dt ZFS-RECEIVE 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm receive +.Nd Creates a snapshot whose contents are as specified in the stream provided on standard input. +.Sh SYNOPSIS +.Nm +.Cm receive +.Op Fl FhMnsuv +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Nm +.Cm receive +.Op Fl FhMnsuv +.Op Fl d Ns | Ns Fl e +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem +.Nm +.Cm receive +.Fl A +.Ar filesystem Ns | Ns Ar volume +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm receive +.Op Fl FhMnsuv +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Xc +.It Xo +.Nm +.Cm receive +.Op Fl FhMnsuv +.Op Fl d Ns | Ns Fl e +.Op Fl o Sy origin Ns = Ns Ar snapshot +.Op Fl o Ar property Ns = Ns Ar value +.Op Fl x Ar property +.Ar filesystem +.Xc +Creates a snapshot whose contents are as specified in the stream provided on +standard input. +If a full stream is received, then a new file system is created as well. +Streams are created using the +.Nm zfs Cm send +subcommand, which by default creates a full stream. +.Nm zfs Cm recv +can be used as an alias for +.Nm zfs Cm receive. 
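+.Pp
+For example, a full stream followed by an incremental stream might be
+replicated into another pool as follows (names are hypothetical):
+.Bd -literal -offset indent
+# zfs send pool/fs@a | zfs receive otherpool/fs
+# zfs send -i pool/fs@a pool/fs@b | zfs receive otherpool/fs
+.Ed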
+.Pp
+If an incremental stream is received, then the destination file system must
+already exist, and its most recent snapshot must match the incremental stream's
+source.
+For
+.Sy zvols ,
+the destination device link is destroyed and recreated, which means the
+.Sy zvol
+cannot be accessed during the
+.Cm receive
+operation.
+.Pp
+When a snapshot replication package stream that is generated by using the
+.Nm zfs Cm send Fl R
+command is received, any snapshots that do not exist on the sending location
+are destroyed by using the
+.Nm zfs Cm destroy Fl d
+command.
+.Pp
+The ability to send and receive deduplicated send streams has been removed.
+However, a deduplicated send stream created with older software can be
+converted to a regular (non-deduplicated) stream by using the
+.Nm zstream Cm redup
+command.
+.Pp
+If
+.Fl o Em property Ns = Ns Ar value
+or
+.Fl x Em property
+is specified, it applies to the effective value of the property throughout
+the entire subtree of replicated datasets.
+Effective property values will be set
+.Pq Fl o
+or inherited
+.Pq Fl x
+on the topmost dataset in the replicated subtree.
+In descendant datasets, if the property is set by the send stream, it will be
+overridden by forcing the property to be inherited from the topmost file
+system.
+Received properties are retained in spite of being overridden and may be
+restored with
+.Nm zfs Cm inherit Fl S .
+Specifying
+.Fl o Sy origin Ns = Ns Em snapshot
+is a special case because, even though
+.Sy origin
+is a read-only property and cannot be set, it's allowed to receive the send
+stream as a clone of the given snapshot.
+.Pp
+Raw encrypted send streams (created with
+.Nm zfs Cm send Fl w
+) may only be received as is, and cannot be re-encrypted, decrypted, or
+recompressed by the receive process.
+Unencrypted streams can be received as encrypted datasets, either through
+inheritance or by specifying encryption parameters with the
+.Fl o
+options.
+Note that the
+.Sy keylocation
+property cannot be overridden to
+.Sy prompt
+during a receive.
+This is because the receive process itself is already using stdin for the send
+stream.
+Instead, the property can be overridden after the receive completes.
+.Pp
+The added security provided by raw sends adds some restrictions to the send
+and receive process.
+ZFS will not allow a mix of raw receives and non-raw receives.
+Specifically, any raw incremental receives that are attempted after a non-raw
+receive will fail.
+Non-raw receives do not have this restriction and, therefore, are always
+possible.
+Because of this, it is best practice to always use either raw sends for their
+security benefits or non-raw sends for their flexibility when working with
+encrypted datasets, but not a combination.
+.Pp
+The reason for this restriction stems from the inherent restrictions of the
+AEAD ciphers that ZFS uses to encrypt data.
+When using ZFS native encryption, each block of data is encrypted against a
+randomly generated number known as the "initialization vector" (IV), which is
+stored in the filesystem metadata.
+This number is required by the encryption algorithms whenever the data is to
+be decrypted.
+Together, all of the IVs provided for all of the blocks in a given snapshot
+are collectively called an "IV set".
+When ZFS performs a raw send, the IV set is transferred from the source to the
+destination in the send stream.
When ZFS performs a non-raw send, the data is decrypted by the source +system and re-encrypted by the destination system, creating a snapshot with +effectively the same data, but a different IV set. In order for decryption to +work after a raw send, ZFS must ensure that the IV set used on both the source +and destination side match. When an incremental raw receive is performed on +top of an existing snapshot, ZFS will check to confirm that the "from" +snapshot on both the source and destination were using the same IV set, +ensuring the new IV set is consistent. +.Pp +The name of the snapshot +.Pq and file system, if a full stream is received +that this subcommand creates depends on the argument type and the use of the +.Fl d +or +.Fl e +options. +.Pp +If the argument is a snapshot name, the specified +.Ar snapshot +is created. +If the argument is a file system or volume name, a snapshot with the same name +as the sent snapshot is created within the specified +.Ar filesystem +or +.Ar volume . +If neither of the +.Fl d +or +.Fl e +options are specified, the provided target snapshot name is used exactly as +provided. +.Pp +The +.Fl d +and +.Fl e +options cause the file system name of the target snapshot to be determined by +appending a portion of the sent snapshot's name to the specified target +.Ar filesystem . +If the +.Fl d +option is specified, all but the first element of the sent snapshot's file +system path +.Pq usually the pool name +is used and any required intermediate file systems within the specified one are +created. +If the +.Fl e +option is specified, then only the last element of the sent snapshot's file +system name +.Pq i.e. the name of the source file system itself +is used as the target file system name. +.Bl -tag -width "-F" +.It Fl F +Force a rollback of the file system to the most recent snapshot before +performing the receive operation. +If receiving an incremental replication stream +.Po for example, one generated by +.Nm zfs Cm send Fl R Op Fl i Ns | Ns Fl I +.Pc , +destroy snapshots and file systems that do not exist on the sending side. +.It Fl d +Discard the first element of the sent snapshot's file system name, using the +remaining elements to determine the name of the target file system for the new +snapshot as described in the paragraph above. +.It Fl e +Discard all but the last element of the sent snapshot's file system name, using +that element to determine the name of the target file system for the new +snapshot as described in the paragraph above. +.It Fl h +Skip the receive of holds. There is no effect if holds are not sent. +.It Fl M +Force an unmount of the file system while receiving a snapshot. +This option is not supported on Linux. +.It Fl n +Do not actually receive the stream. +This can be useful in conjunction with the +.Fl v +option to verify the name the receive operation would use. +.It Fl o Sy origin Ns = Ns Ar snapshot +Forces the stream to be received as a clone of the given snapshot. +If the stream is a full send stream, this will create the filesystem +described by the stream as a clone of the specified snapshot. +Which snapshot was specified will not affect the success or failure of the +receive, as long as the snapshot does exist. +If the stream is an incremental send stream, all the normal verification will be +performed. +.It Fl o Em property Ns = Ns Ar value +Sets the specified property as if the command +.Nm zfs Cm set Em property Ns = Ns Ar value +was invoked immediately before the receive. 
When receiving a stream from +.Nm zfs Cm send Fl R , +causes the property to be inherited by all descendant datasets, as through +.Nm zfs Cm inherit Em property +was run on any descendant datasets that have this property set on the +sending system. +.Pp +Any editable property can be set at receive time. Set-once properties bound +to the received data, such as +.Sy normalization +and +.Sy casesensitivity , +cannot be set at receive time even when the datasets are newly created by +.Nm zfs Cm receive . +Additionally both settable properties +.Sy version +and +.Sy volsize +cannot be set at receive time. +.Pp +The +.Fl o +option may be specified multiple times, for different properties. An error +results if the same property is specified in multiple +.Fl o +or +.Fl x +options. +.Pp +The +.Fl o +option may also be used to override encryption properties upon initial +receive. This allows unencrypted streams to be received as encrypted datasets. +To cause the received dataset (or root dataset of a recursive stream) to be +received as an encryption root, specify encryption properties in the same +manner as is required for +.Nm +.Cm create . +For instance: +.Bd -literal +# zfs send tank/test@snap1 | zfs recv -o encryption=on -o keyformat=passphrase -o keylocation=file:///path/to/keyfile +.Ed +.Pp +Note that +.Op Fl o Ar keylocation Ns = Ns Ar prompt +may not be specified here, since stdin is already being utilized for the send +stream. Once the receive has completed, you can use +.Nm +.Cm set +to change this setting after the fact. Similarly, you can receive a dataset as +an encrypted child by specifying +.Op Fl x Ar encryption +to force the property to be inherited. Overriding encryption properties (except +for +.Sy keylocation Ns ) +is not possible with raw send streams. +.It Fl s +If the receive is interrupted, save the partially received state, rather +than deleting it. +Interruption may be due to premature termination of the stream +.Po e.g. due to network failure or failure of the remote system +if the stream is being read over a network connection +.Pc , +a checksum error in the stream, termination of the +.Nm zfs Cm receive +process, or unclean shutdown of the system. +.Pp +The receive can be resumed with a stream generated by +.Nm zfs Cm send Fl t Ar token , +where the +.Ar token +is the value of the +.Sy receive_resume_token +property of the filesystem or volume which is received into. +.Pp +To use this flag, the storage pool must have the +.Sy extensible_dataset +feature enabled. +See +.Xr zpool-features 5 +for details on ZFS feature flags. +.It Fl u +File system that is associated with the received stream is not mounted. +.It Fl v +Print verbose information about the stream and the time required to perform the +receive operation. +.It Fl x Em property +Ensures that the effective value of the specified property after the +receive is unaffected by the value of that property in the send stream (if any), +as if the property had been excluded from the send stream. +.Pp +If the specified property is not present in the send stream, this option does +nothing. +.Pp +If a received property needs to be overridden, the effective value will be +set or inherited, depending on whether the property is inheritable or not. +.Pp +In the case of an incremental update, +.Fl x +leaves any existing local setting or explicit inheritance unchanged. +.Pp +All +.Fl o +restrictions (e.g. set-once) apply equally to +.Fl x . 
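+.Pp
+For example, the following sketch (names are hypothetical) receives a
+replication stream while ignoring any
+.Sy compression
+value carried in it:
+.Bd -literal -offset indent
+# zfs send -R pool/fs@snap | zfs receive -x compression otherpool/fs
+.Ed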
+.El +.It Xo +.Nm +.Cm receive +.Fl A +.Ar filesystem Ns | Ns Ar volume +.Xc +Abort an interrupted +.Nm zfs Cm receive Fl s , +deleting its saved partially received state. +.El +.Sh SEE ALSO +.Xr zfs-send 8 +.Xr zstream 8 diff --git a/man/man8/zfs-recv.8 b/man/man8/zfs-recv.8 new file mode 120000 index 000000000000..f11b7add7bcc --- /dev/null +++ b/man/man8/zfs-recv.8 @@ -0,0 +1 @@ +zfs-receive.8 \ No newline at end of file diff --git a/man/man8/zfs-redact.8 b/man/man8/zfs-redact.8 new file mode 120000 index 000000000000..f7c605788388 --- /dev/null +++ b/man/man8/zfs-redact.8 @@ -0,0 +1 @@ +zfs-send.8 \ No newline at end of file diff --git a/man/man8/zfs-release.8 b/man/man8/zfs-release.8 new file mode 120000 index 000000000000..58809d66a5a8 --- /dev/null +++ b/man/man8/zfs-release.8 @@ -0,0 +1 @@ +zfs-hold.8 \ No newline at end of file diff --git a/man/man8/zfs-rename.8 b/man/man8/zfs-rename.8 new file mode 100644 index 000000000000..9d650709a0ad --- /dev/null +++ b/man/man8/zfs-rename.8 @@ -0,0 +1,91 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-RENAME 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm rename +.Nd Renames the given dataset (filesystem or snapshot). +.Sh SYNOPSIS +.Nm +.Cm rename +.Op Fl f +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Nm +.Cm rename +.Op Fl fp +.Ar filesystem Ns | Ns Ar volume +.Ar filesystem Ns | Ns Ar volume +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm rename +.Op Fl f +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Xc +.It Xo +.Nm +.Cm rename +.Op Fl fp +.Ar filesystem Ns | Ns Ar volume +.Ar filesystem Ns | Ns Ar volume +.Xc +Renames the given dataset. +The new target can be located anywhere in the ZFS hierarchy, with the exception +of snapshots. +Snapshots can only be renamed within the parent file system or volume. +When renaming a snapshot, the parent file system of the snapshot does not need +to be specified as part of the second argument. +Renamed file systems can inherit new mount points, in which case they are +unmounted and remounted at the new mount point. 
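+.Pp
+For example (names are hypothetical):
+.Bd -literal -offset indent
+# zfs rename -p pool/home/old pool/archive/old
+# zfs rename pool/home@monday pool/home@tuesday
+.Ed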
+.Bl -tag -width "-a" +.It Fl f +Force unmount any filesystems that need to be unmounted in the process. +.It Fl p +Creates all the nonexistent parent datasets. +Datasets created in this manner are automatically mounted according to the +.Sy mountpoint +property inherited from their parent. +.El +.It Xo +.Nm +.Cm rename +.Fl r +.Ar snapshot Ar snapshot +.Xc +Recursively rename the snapshots of all descendent datasets. +Snapshots are the only dataset that can be renamed recursively. +.El diff --git a/man/man8/zfs-rollback.8 b/man/man8/zfs-rollback.8 new file mode 100644 index 000000000000..1078efd8a4b7 --- /dev/null +++ b/man/man8/zfs-rollback.8 @@ -0,0 +1,81 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-ROLLBACK 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm rollback +.Nd Roll back the given dataset to a previous snapshot. +.Sh SYNOPSIS +.Nm +.Cm rollback +.Op Fl Rfr +.Ar snapshot +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm rollback +.Op Fl Rfr +.Ar snapshot +.Xc +When a dataset is rolled back, all data that has changed since the snapshot is +discarded, and the dataset reverts to the state at the time of the snapshot. +By default, the command refuses to roll back to a snapshot other than the most +recent one. +In order to do so, all intermediate snapshots and bookmarks must be destroyed by +specifying the +.Fl r +option. +.Pp +The +.Fl rR +options do not recursively destroy the child snapshots of a recursive snapshot. +Only direct snapshots of the specified filesystem are destroyed by either of +these options. +To completely roll back a recursive snapshot, you must rollback the individual +child snapshots. +.Bl -tag -width "-R" +.It Fl R +Destroy any more recent snapshots and bookmarks, as well as any clones of those +snapshots. +.It Fl f +Used with the +.Fl R +option to force an unmount of any clone file systems that are to be destroyed. +.It Fl r +Destroy any snapshots and bookmarks more recent than the one specified. 
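+.Pp
+For example, the following sketch (names are hypothetical) rolls back to a
+snapshot older than the most recent one, destroying the intervening snapshots
+and bookmarks:
+.Bd -literal -offset indent
+# zfs rollback -r pool/home@monday
+.Ed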
+.El +.El +.Sh SEE ALSO +.Xr zfs-snapshot 8 diff --git a/man/man8/zfs-send.8 b/man/man8/zfs-send.8 new file mode 100644 index 000000000000..8b7f940eb793 --- /dev/null +++ b/man/man8/zfs-send.8 @@ -0,0 +1,609 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-SEND 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm send +.Nd Generate a send stream, which may be of a filesystem, and may be incremental from a bookmark. +.Sh SYNOPSIS +.Nm +.Cm send +.Op Fl DLPRbcehnpvw +.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot +.Ar snapshot +.Nm +.Cm send +.Op Fl DLPRcenpvw +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Nm +.Cm send +.Fl -redact Ar redaction_bookmark +.Op Fl DLPcenpv +.br +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar snapshot +.Nm +.Cm send +.Op Fl Penv +.Fl t +.Ar receive_resume_token +.Nm +.Cm send +.Op Fl Pnv +.Fl S Ar filesystem +.Nm +.Cm redact +.Ar snapshot redaction_bookmark +.Ar redaction_snapshot Ns ... +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm send +.Op Fl DLPRbcehnpvw +.Op Oo Fl I Ns | Ns Fl i Oc Ar snapshot +.Ar snapshot +.Xc +Creates a stream representation of the second +.Ar snapshot , +which is written to standard output. +The output can be redirected to a file or to a different system +.Po for example, using +.Xr ssh 1 +.Pc . +By default, a full stream is generated. +.Bl -tag -width "-D" +.It Fl D, -dedup +Deduplicated send is no longer supported. +This flag is accepted for backwards compatibility, but a regular, +non-deduplicated stream will be generated. +.It Fl I Ar snapshot +Generate a stream package that sends all intermediary snapshots from the first +snapshot to the second snapshot. +For example, +.Fl I Em @a Em fs@d +is similar to +.Fl i Em @a Em fs@b Ns \&; Fl i Em @b Em fs@c Ns \&; Fl i Em @c Em fs@d . +The incremental source may be specified as with the +.Fl i +option. +.It Fl L, -large-block +Generate a stream which may contain blocks larger than 128KB. +This flag has no effect if the +.Sy large_blocks +pool feature is disabled, or if the +.Sy recordsize +property of this filesystem has never been set above 128KB. 
+The receiving system must have the +.Sy large_blocks +pool feature enabled as well. +See +.Xr zpool-features 5 +for details on ZFS feature flags and the +.Sy large_blocks +feature. +.It Fl P, -parsable +Print machine-parsable verbose information about the stream package generated. +.It Fl R, -replicate +Generate a replication stream package, which will replicate the specified +file system, and all descendent file systems, up to the named snapshot. +When received, all properties, snapshots, descendent file systems, and clones +are preserved. +.Pp +If the +.Fl i +or +.Fl I +flags are used in conjunction with the +.Fl R +flag, an incremental replication stream is generated. +The current values of properties, and current snapshot and file system names are +set when the stream is received. +If the +.Fl F +flag is specified when this stream is received, snapshots and file systems that +do not exist on the sending side are destroyed. If the +.Fl R +flag is used to send encrypted datasets, then +.Fl w +must also be specified. +.It Fl e, -embed +Generate a more compact stream by using +.Sy WRITE_EMBEDDED +records for blocks which are stored more compactly on disk by the +.Sy embedded_data +pool feature. +This flag has no effect if the +.Sy embedded_data +feature is disabled. +The receiving system must have the +.Sy embedded_data +feature enabled. +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. Datasets that are sent with this flag may not be +received as an encrypted dataset, since encrypted datasets cannot use the +.Sy embedded_data +feature. +See +.Xr zpool-features 5 +for details on ZFS feature flags and the +.Sy embedded_data +feature. +.It Fl b, -backup +Sends only received property values whether or not they are overridden by local +settings, but only if the dataset has ever been received. Use this option when +you want +.Nm zfs Cm receive +to restore received properties backed up on the sent dataset and to avoid +sending local settings that may have nothing to do with the source dataset, +but only with how the data is backed up. +.It Fl c, -compressed +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory +.Po see the +.Sy compression +property for details +.Pc . +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. +If the +.Sy large_blocks +feature is enabled on the sending system but the +.Fl L +option is not supplied in conjunction with +.Fl c , +then the data will be decompressed before sending so it can be split into +smaller block sizes. +.It Fl w, -raw +For encrypted datasets, send data exactly as it exists on disk. This allows +backups to be taken even if encryption keys are not currently loaded. The +backup may then be received on an untrusted machine since that machine will +not have the encryption keys to read the protected data or alter it without +being detected. Upon being received, the dataset will have the same encryption +keys as it did on the send side, although the +.Sy keylocation +property will be defaulted to +.Sy prompt +if not otherwise provided. For unencrypted datasets, this flag will be +equivalent to +.Fl Lec . 
+Note that if you do not use this flag for sending encrypted datasets, data will +be sent unencrypted and may be re-encrypted with a different encryption key on +the receiving system, which will disable the ability to do a raw send to that +system for incrementals. +.It Fl h, -holds +Generate a stream package that includes any snapshot holds (created with the +.Sy zfs hold +command), and indicating to +.Sy zfs receive +that the holds be applied to the dataset on the receiving system. +.It Fl i Ar snapshot +Generate an incremental stream from the first +.Ar snapshot +.Pq the incremental source +to the second +.Ar snapshot +.Pq the incremental target . +The incremental source can be specified as the last component of the snapshot +name +.Po the +.Sy @ +character and following +.Pc +and it is assumed to be from the same file system as the incremental target. +.Pp +If the destination is a clone, the source may be the origin snapshot, which must +be fully specified +.Po for example, +.Em pool/fs@origin , +not just +.Em @origin +.Pc . +.It Fl n, -dryrun +Do a dry-run +.Pq Qq No-op +send. +Do not generate any actual send data. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to determine what data will be sent. +In this case, the verbose output will be written to standard output +.Po contrast with a non-dry-run, where the stream is written to standard output +and the verbose output goes to standard error +.Pc . +.It Fl p, -props +Include the dataset's properties in the stream. +This flag is implicit when +.Fl R +is specified. +The receiving system must also support this feature. Sends of encrypted datasets +must use +.Fl w +when using this flag. +.It Fl v, -verbose +Print verbose information about the stream package generated. +This information includes a per-second report of how much data has been sent. +.Pp +The format of the stream is committed. +You will be able to receive your streams on future versions of ZFS. +.El +.It Xo +.Nm +.Cm send +.Op Fl DLPRcenpvw +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot +.Xc +Generate a send stream, which may be of a filesystem, and may be incremental +from a bookmark. +If the destination is a filesystem or volume, the pool must be read-only, or the +filesystem must not be mounted. +When the stream generated from a filesystem or volume is received, the default +snapshot name will be +.Qq --head-- . +.Bl -tag -width "-L" +.It Fl L, -large-block +Generate a stream which may contain blocks larger than 128KB. +This flag has no effect if the +.Sy large_blocks +pool feature is disabled, or if the +.Sy recordsize +property of this filesystem has never been set above 128KB. +The receiving system must have the +.Sy large_blocks +pool feature enabled as well. +See +.Xr zpool-features 5 +for details on ZFS feature flags and the +.Sy large_blocks +feature. +.It Fl P, -parsable +Print machine-parsable verbose information about the stream package generated. +.It Fl c, -compressed +Generate a more compact stream by using compressed WRITE records for blocks +which are compressed on disk and in memory +.Po see the +.Sy compression +property for details +.Pc . +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. 
+If the +.Sy large_blocks +feature is enabled on the sending system but the +.Fl L +option is not supplied in conjunction with +.Fl c , +then the data will be decompressed before sending so it can be split into +smaller block sizes. +.It Fl w, -raw +For encrypted datasets, send data exactly as it exists on disk. This allows +backups to be taken even if encryption keys are not currently loaded. The +backup may then be received on an untrusted machine since that machine will +not have the encryption keys to read the protected data or alter it without +being detected. Upon being received, the dataset will have the same encryption +keys as it did on the send side, although the +.Sy keylocation +property will be defaulted to +.Sy prompt +if not otherwise provided. For unencrypted datasets, this flag will be +equivalent to +.Fl Lec . +Note that if you do not use this flag for sending encrypted datasets, data will +be sent unencrypted and may be re-encrypted with a different encryption key on +the receiving system, which will disable the ability to do a raw send to that +system for incrementals. +.It Fl e, -embed +Generate a more compact stream by using +.Sy WRITE_EMBEDDED +records for blocks which are stored more compactly on disk by the +.Sy embedded_data +pool feature. +This flag has no effect if the +.Sy embedded_data +feature is disabled. +The receiving system must have the +.Sy embedded_data +feature enabled. +If the +.Sy lz4_compress +feature is active on the sending system, then the receiving system must have +that feature enabled as well. Datasets that are sent with this flag may not be +received as an encrypted dataset, since encrypted datasets cannot use the +.Sy embedded_data +feature. +See +.Xr zpool-features 5 +for details on ZFS feature flags and the +.Sy embedded_data +feature. +.It Fl i Ar snapshot Ns | Ns Ar bookmark +Generate an incremental send stream. +The incremental source must be an earlier snapshot in the destination's history. +It will commonly be an earlier snapshot in the destination's file system, in +which case it can be specified as the last component of the name +.Po the +.Sy # +or +.Sy @ +character and following +.Pc . +.Pp +If the incremental target is a clone, the incremental source can be the origin +snapshot, or an earlier snapshot in the origin's filesystem, or the origin's +origin, etc. +.It Fl n, -dryrun +Do a dry-run +.Pq Qq No-op +send. +Do not generate any actual send data. +This is useful in conjunction with the +.Fl v +or +.Fl P +flags to determine what data will be sent. +In this case, the verbose output will be written to standard output +.Po contrast with a non-dry-run, where the stream is written to standard output +and the verbose output goes to standard error +.Pc . +.It Fl v, -verbose +Print verbose information about the stream package generated. +This information includes a per-second report of how much data has been sent. +.El +.It Xo +.Nm +.Cm send +.Fl -redact Ar redaction_bookmark +.Op Fl DLPcenpv +.br +.Op Fl i Ar snapshot Ns | Ns Ar bookmark +.Ar snapshot +.Xc +Generate a redacted send stream. +This send stream contains all blocks from the snapshot being sent that aren't +included in the redaction list contained in the bookmark specified by the +.Fl -redact +(or +.Fl -d +) flag. +The resulting send stream is said to be redacted with respect to the snapshots +the bookmark specified by the +.Fl -redact No flag was created with. +The bookmark must have been created by running +.Sy zfs redact +on the snapshot being sent. 
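+.sp
+A minimal sketch of the workflow, with hypothetical dataset, snapshot, and
+bookmark names:
+.Bd -literal -offset indent
+# zfs redact pool/fs@snap book1 pool/fs_clone@snap
+# zfs send --redact book1 pool/fs@snap | ssh host zfs receive pool/fs
+.Ed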
+.sp
+This feature can be used to allow clones of a filesystem to be made available
+on a remote system, in the case where their parent need not (or must not) be
+usable.
+For example, if a filesystem contains sensitive data, and it has clones where
+that sensitive data has been secured or replaced with dummy data, redacted
+sends can be used to replicate the secured data without replicating the
+original sensitive data, while still sharing all possible blocks.
+A snapshot that has been redacted with respect to a set of snapshots will
+contain all blocks referenced by at least one snapshot in the set, but none of
+the blocks referenced by no snapshot in the set.
+In other words, if all snapshots in the set have modified a given block in the
+parent, that block will not be sent; but if one or more snapshots have not
+modified a block in the parent, they will still reference the parent's block,
+so that block will be sent.
+Note that only user data will be redacted.
+.sp
+When the redacted send stream is received, we will generate a redacted
+snapshot.
+Due to the nature of redaction, a redacted dataset can only be used in the
+following ways:
+.sp
+1. To receive, as a clone, an incremental send from the original snapshot to
+one of the snapshots it was redacted with respect to.
+In this case, the stream will produce a valid dataset when received because
+all blocks that were redacted in the parent are guaranteed to be present in
+the child's send stream.
+This use case will produce a normal snapshot, which can be used just like
+other snapshots.
+.sp
+2. To receive an incremental send from the original snapshot to something
+redacted with respect to a subset of the set of snapshots the initial snapshot
+was redacted with respect to.
+In this case, each block that was redacted in the original is still redacted;
+redacting with respect to additional snapshots causes less data to be
+redacted, because the snapshots define what is permitted and everything else
+is redacted.
+This use case will produce a new redacted snapshot.
+.sp
+3. To receive, into anything else, an incremental send from a redaction
+bookmark of the original snapshot that was created when redacting with respect
+to a subset of the set of snapshots the initial snapshot was redacted with
+respect to.
+A send stream from such a redaction bookmark will contain all of the blocks
+necessary to fill in any redacted data, should it be needed, because the
+sending system is aware of what blocks were originally redacted.
+This will either produce a normal snapshot or a redacted one, depending on
+whether the new send stream is redacted.
+.sp
+4. To receive an incremental send from a redacted version of the initial
+snapshot that is redacted with respect to a subset of the set of snapshots the
+initial snapshot was redacted with respect to.
+A send stream from a compatible redacted dataset will contain all of the
+blocks necessary to fill in any redacted data.
+This will either produce a normal snapshot or a redacted one, depending on
+whether the new send stream is redacted.
+.sp
+5. To receive a full send as a clone of the redacted snapshot.
+Since the stream is a full send, it definitionally contains all the data
+needed to create a new dataset.
+This use case will either produce a normal snapshot or a redacted one,
+depending on whether the full send stream was redacted.
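+.sp
+As a sketch of case 1 above (names are hypothetical), if pool/fs@snap was
+redacted with respect to pool/fs_clone@snap, the redacted blocks can later be
+filled in on the target by receiving the incremental stream as a clone:
+.Bd -literal -offset indent
+# zfs send -i pool/fs@snap pool/fs_clone@snap | ssh host zfs receive pool/fs_clone
+.Ed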
+.sp
+These restrictions are detected and enforced by
+.Nm zfs Cm receive ;
+a redacted send stream will contain the list of snapshots that the stream is
+redacted with respect to.
+These are stored with the redacted snapshot, and are used to detect and
+correctly handle the cases above.
+Note that for technical reasons, raw sends and redacted sends cannot be
+combined at this time.
+.It Xo
+.Nm
+.Cm send
+.Op Fl Penv
+.Fl t
+.Ar receive_resume_token
+.Xc
+Creates a send stream which resumes an interrupted receive.
+The
+.Ar receive_resume_token
+is the value of the
+.Sy receive_resume_token
+property on the filesystem or volume that was being received into.
+See the documentation for
+.Nm zfs Cm receive Fl s
+for more details.
+.It Xo
+.Nm
+.Cm send
+.Op Fl Pnv
+.Op Fl i Ar snapshot Ns | Ns Ar bookmark
+.Fl S
+.Ar filesystem
+.Xc
+Generate a send stream from a dataset that has been partially received.
+.Bl -tag -width "-L"
+.It Fl S, -saved
+This flag requires that the specified filesystem previously received a resumable
+send that was interrupted before completion.
+In such scenarios, this flag enables the user to send the partially received
+state.
+When this flag is used, the last fully received snapshot, if one exists, is
+always used as the incremental source.
+.El
+.It Xo
+.Nm
+.Cm redact
+.Ar snapshot redaction_bookmark
+.Ar redaction_snapshot Ns ...
+.Xc
+Generate a new redaction bookmark.
+In addition to the typical bookmark information, a redaction bookmark contains
+the list of redacted blocks and the list of redaction snapshots specified.
+The redacted blocks are blocks in the snapshot which are not referenced by any
+of the redaction snapshots.
+These blocks are found by iterating over the metadata in each redaction snapshot
+to determine what has been changed since the target snapshot.
+Redaction is designed to support redacted zfs sends; see the entry for
+.Nm zfs Cm send
+for more information on the purpose of this operation.
+If a redact operation fails partway through (due to an error or a system
+failure), the redaction can be resumed by rerunning the same command.
+.El
+.Ss Redaction
+ZFS has support for a limited version of data subsetting, in the form of
+redaction.
+Using the
+.Nm zfs Cm redact
+command, a
+.Sy redaction bookmark
+can be created that stores a list of blocks containing sensitive information.
+When provided to
+.Nm zfs Cm send ,
+this causes a
+.Sy redacted send
+to occur.
+Redacted sends omit the blocks containing sensitive information, replacing them
+with REDACT records.
+When these send streams are received, a
+.Sy redacted dataset
+is created.
+A redacted dataset cannot be mounted by default, since it is incomplete.
+It can be used to receive other send streams.
+In this way datasets can be used for data backup and replication, with all the
+benefits that zfs send and receive have to offer, while protecting sensitive
+information from being stored on less-trusted machines or services.
+.Pp
+The redaction process consists of two steps: a redact step and a send/receive
+step.
+First, a redaction bookmark is created.
+This is done by providing the
+.Nm zfs Cm redact
+command with a parent snapshot, a bookmark to be created, and a number of
+redaction snapshots.
+These redaction snapshots must be descendants of the parent snapshot, and they
+should modify data that is considered sensitive in some way.
+Any blocks of data modified by all of the redaction snapshots will be listed in
+the redaction bookmark, because they represent the truly sensitive
+information.
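+For example, with a hypothetical parent snapshot
+.Em pool/ds@snap
+and redaction snapshots
+.Em pool/clone1@snap
+and
+.Em pool/clone2@snap
+taken on two sanitized clones, the bookmark could be created as follows:
+.Bd -literal
+# zfs redact pool/ds@snap book1 pool/clone1@snap pool/clone2@snap
+.Ed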
+When it comes to the send step, the send process will not send the blocks
+listed in the redaction bookmark, instead replacing them with REDACT records.
+When received on the target system, this will create a redacted dataset,
+missing the data that corresponds to the blocks in the redaction bookmark on
+the sending system.
+The incremental send streams from the original parent to the redaction
+snapshots can then also be received on the target system, and this will produce
+a complete snapshot that can be used normally.
+Incremental sends from one snapshot on the parent filesystem to another can
+also be performed by sending from the redaction bookmark, rather than from the
+snapshots themselves.
+.Pp
+To make the purpose of the feature clearer, an example is provided.
+Consider a zfs filesystem containing four files.
+These files represent information for an online shopping service.
+One file contains a list of usernames and passwords, another contains purchase
+histories, a third contains click tracking data, and a fourth contains user
+preferences.
+The owner of this data wants to make it available for their development teams
+to test against, and their market research teams to do analysis on.
+The development teams need information about user preferences and the click
+tracking data, while the market research teams need information about purchase
+histories and user preferences.
+Neither needs access to the usernames and passwords.
+However, because all of this data is stored in one ZFS filesystem, it must all
+be sent and received together.
+In addition, the owner of the data wants to take advantage of features like
+compression, checksumming, and snapshots, so they do want to continue to use
+ZFS to store and transmit their data.
+Redaction can help them do so.
+First, they would make two clones of a snapshot of the data on the source.
+In one clone, they create the setup they want their market research team to
+see; they delete the usernames and passwords file, and overwrite the click
+tracking data with dummy information.
+In another, they create the setup they want the development teams to see, by
+replacing the passwords with fake information and replacing the purchase
+histories with randomly generated ones.
+They would then create a redaction bookmark on the parent snapshot, using
+snapshots on the two clones as redaction snapshots.
+The parent can then be sent, redacted, to the target server where the research
+and development teams have access.
+Finally, incremental sends from the parent snapshot to each of the clones can
+be sent to and received on the target server; these snapshots are identical to
+the ones on the source, and are ready to be used, while the parent snapshot on
+the target contains none of the username and password data present on the
+source, because it was removed by the redacted send operation.
+.Sh SEE ALSO
+.Xr zfs-bookmark 8 ,
+.Xr zfs-receive 8 ,
+.Xr zfs-redact 8 ,
+.Xr zfs-snapshot 8
diff --git a/man/man8/zfs-set.8 b/man/man8/zfs-set.8
new file mode 100644
index 000000000000..5b4348ba94c0
--- /dev/null
+++ b/man/man8/zfs-set.8
@@ -0,0 +1,188 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-SET 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm set +.Nd Sets the property or list of properties to the given value(s) for each dataset. +.Sh SYNOPSIS +.Nm +.Cm set +.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... +.Nm +.Cm get +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc +.Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... Oc +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc +.Cm all | Ar property Ns Oo , Ns Ar property Oc Ns ... +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns ... +.Nm +.Cm inherit +.Op Fl rS +.Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm set +.Ar property Ns = Ns Ar value Oo Ar property Ns = Ns Ar value Oc Ns ... +.Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... +.Xc +Only some properties can be edited. +See +.Xr zfsprops 8 +for more information on what properties can be set and acceptable +values. +Numeric values can be specified as exact values, or in a human-readable form +with a suffix of +.Sy B , K , M , G , T , P , E , Z +.Po for bytes, kilobytes, megabytes, gigabytes, terabytes, petabytes, exabytes, +or zettabytes, respectively +.Pc . +User properties can be set on snapshots. +For more information, see the +.Em User Properties +section of +.Xr zfsprops 8 . +.It Xo +.Nm +.Cm get +.Op Fl r Ns | Ns Fl d Ar depth +.Op Fl Hp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc +.Oo Fl s Ar source Ns Oo , Ns Ar source Oc Ns ... Oc +.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc +.Cm all | Ar property Ns Oo , Ns Ar property Oc Ns ... +.Oo Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns | Ns Ar bookmark Oc Ns ... +.Xc +Displays properties for the given datasets. +If no datasets are specified, then the command displays properties for all +datasets on the system. +For each property, the following columns are displayed: +.Bd -literal + name Dataset name + property Property name + value Property value + source Property source \fBlocal\fP, \fBdefault\fP, \fBinherited\fP, + \fBtemporary\fP, \fBreceived\fP or none (\fB-\fP). +.Ed +.Pp +All columns are displayed by default, though this can be controlled by using the +.Fl o +option. 
+This command takes a comma-separated list of properties as described in the +.Em Native Properties +and +.Em User Properties +sections of +.Xr zfsprops 8 . +.Pp +The value +.Sy all +can be used to display all properties that apply to the given dataset's type +.Pq filesystem, volume, snapshot, or bookmark . +.Bl -tag -width "-H" +.It Fl H +Display output in a form more easily parsed by scripts. +Any headers are omitted, and fields are explicitly separated by a single tab +instead of an arbitrary amount of space. +.It Fl d Ar depth +Recursively display any children of the dataset, limiting the recursion to +.Ar depth . +A depth of +.Sy 1 +will display only the dataset and its direct children. +.It Fl o Ar field +A comma-separated list of columns to display. +.Sy name Ns \&, Ns Sy property Ns \&, Ns Sy value Ns \&, Ns Sy source +is the default value. +.It Fl p +Display numbers in parsable +.Pq exact +values. +.It Fl r +Recursively display properties for any children. +.It Fl s Ar source +A comma-separated list of sources to display. +Those properties coming from a source other than those in this list are ignored. +Each source must be one of the following: +.Sy local , +.Sy default , +.Sy inherited , +.Sy temporary , +.Sy received , +and +.Sy none . +The default value is all sources. +.It Fl t Ar type +A comma-separated list of types to display, where +.Ar type +is one of +.Sy filesystem , +.Sy snapshot , +.Sy volume , +.Sy bookmark , +or +.Sy all . +.El +.It Xo +.Nm +.Cm inherit +.Op Fl rS +.Ar property Ar filesystem Ns | Ns Ar volume Ns | Ns Ar snapshot Ns ... +.Xc +Clears the specified property, causing it to be inherited from an ancestor, +restored to default if no ancestor has the property set, or with the +.Fl S +option reverted to the received value if one exists. +See +.Xr zfsprops 8 +for a listing of default values, and details on which properties can be +inherited. +.Bl -tag -width "-r" +.It Fl r +Recursively inherit the given property for all children. +.It Fl S +Revert the property to the received value if one exists; otherwise operate as +if the +.Fl S +option was not specified. +.El +.El +.Sh SEE ALSO +.Xr zfs-list 8 , +.Xr zfsprops 8 diff --git a/man/man8/zfs-share.8 b/man/man8/zfs-share.8 new file mode 100644 index 000000000000..cb013103371c --- /dev/null +++ b/man/man8/zfs-share.8 @@ -0,0 +1,88 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. 
All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-SHARE 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm share +.Nd Shares and unshares available ZFS filesystems. +.Sh SYNOPSIS +.Nm +.Cm share +.Fl a | Ar filesystem +.Nm +.Cm unshare +.Fl a | Ar filesystem Ns | Ns Ar mountpoint +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm share +.Fl a | Ar filesystem +.Xc +Shares available ZFS file systems. +.Bl -tag -width "-a" +.It Fl a +Share all available ZFS file systems. +Invoked automatically as part of the boot process. +.It Ar filesystem +Share the specified filesystem according to the +.Sy sharenfs +and +.Sy sharesmb +properties. +File systems are shared when the +.Sy sharenfs +or +.Sy sharesmb +property is set. +.El +.It Xo +.Nm +.Cm unshare +.Fl a | Ar filesystem Ns | Ns Ar mountpoint +.Xc +Unshares currently shared ZFS file systems. +.Bl -tag -width "-a" +.It Fl a +Unshare all available ZFS file systems. +Invoked automatically as part of the shutdown process. +.It Ar filesystem Ns | Ns Ar mountpoint +Unshare the specified filesystem. +The command can also be given a path to a ZFS file system shared on the system. +.El +.El +.Sh SEE ALSO +.Xr exports 5 , +.Xr smb.conf 5 , +.Xr zfsprops 8 diff --git a/man/man8/zfs-snapshot.8 b/man/man8/zfs-snapshot.8 new file mode 100644 index 000000000000..618d41d22fe1 --- /dev/null +++ b/man/man8/zfs-snapshot.8 @@ -0,0 +1,83 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-SNAPSHOT 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm snapshot +.Nd Creates snapshots with the given names. +.Sh SYNOPSIS +.Nm +.Cm snapshot +.Op Fl r +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Ar filesystem Ns @ Ns Ar snapname Ns | Ns Ar volume Ns @ Ns Ar snapname Ns ... +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm snapshot +.Op Fl r +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Ar filesystem Ns @ Ns Ar snapname Ns | Ns Ar volume Ns @ Ns Ar snapname Ns ... 
+.Xc +All previous modifications by successful system calls to the file system are +part of the snapshots. +Snapshots are taken atomically, so that all snapshots correspond to the same +moment in time. +.Nm zfs Cm snap +can be used as an alias for +.Nm zfs Cm snapshot. +See the +.Em Snapshots +section of +.Xr zfsconcepts 8 +for details. +.Bl -tag -width "-o" +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property; see +.Nm zfs Cm create +for details. +.It Fl r +Recursively create snapshots of all descendent datasets +.El +.El +.Sh SEE ALSO +.Xr zfs-bookmark 8 , +.Xr zfs-clone 8 , +.Xr zfs-destroy 8 , +.Xr zfs-diff 8 , +.Xr zfs-hold 8 , +.Xr zfs-rename 8 , +.Xr zfs-rollback 8 , +.Xr zfs-send 8 diff --git a/man/man8/zfs-unallow.8 b/man/man8/zfs-unallow.8 new file mode 120000 index 000000000000..8886f334bffb --- /dev/null +++ b/man/man8/zfs-unallow.8 @@ -0,0 +1 @@ +zfs-allow.8 \ No newline at end of file diff --git a/man/man8/zfs-unjail.8 b/man/man8/zfs-unjail.8 new file mode 120000 index 000000000000..04cc05a00258 --- /dev/null +++ b/man/man8/zfs-unjail.8 @@ -0,0 +1 @@ +zfs-jail.8 \ No newline at end of file diff --git a/man/man8/zfs-unload-key.8 b/man/man8/zfs-unload-key.8 new file mode 120000 index 000000000000..d027a419d1e4 --- /dev/null +++ b/man/man8/zfs-unload-key.8 @@ -0,0 +1 @@ +zfs-load-key.8 \ No newline at end of file diff --git a/man/man8/zfs-unmount.8 b/man/man8/zfs-unmount.8 new file mode 120000 index 000000000000..be0d9dbf6c01 --- /dev/null +++ b/man/man8/zfs-unmount.8 @@ -0,0 +1 @@ +zfs-mount.8 \ No newline at end of file diff --git a/man/man8/zfs-upgrade.8 b/man/man8/zfs-upgrade.8 new file mode 100644 index 000000000000..742d5d7bd9e7 --- /dev/null +++ b/man/man8/zfs-upgrade.8 @@ -0,0 +1,106 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-UPGRADE 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm upgrade +.Nd Manage upgrading the on-disk version of filesystems. 
+.Sh SYNOPSIS +.Nm +.Cm upgrade +.Nm +.Cm upgrade +.Fl v +.Nm +.Cm upgrade +.Op Fl r +.Op Fl V Ar version +.Fl a | Ar filesystem +.Sh DESCRIPTION +.Bl -tag -width "" +.It Xo +.Nm +.Cm upgrade +.Xc +Displays a list of file systems that are not the most recent version. +.It Xo +.Nm +.Cm upgrade +.Fl v +.Xc +Displays a list of currently supported file system versions. +.It Xo +.Nm +.Cm upgrade +.Op Fl r +.Op Fl V Ar version +.Fl a | Ar filesystem +.Xc +Upgrades file systems to a new on-disk version. +Once this is done, the file systems will no longer be accessible on systems +running older versions of the software. +.Nm zfs Cm send +streams generated from new snapshots of these file systems cannot be accessed on +systems running older versions of the software. +.Pp +In general, the file system version is independent of the pool version. +See +.Xr zpool 8 +for information on the +.Nm zpool Cm upgrade +command. +.Pp +In some cases, the file system version and the pool version are interrelated and +the pool version must be upgraded before the file system version can be +upgraded. +.Bl -tag -width "-V" +.It Fl V Ar version +Upgrade to the specified +.Ar version . +If the +.Fl V +flag is not specified, this command upgrades to the most recent version. +This +option can only be used to increase the version number, and only up to the most +recent version supported by this software. +.It Fl a +Upgrade all file systems on all imported pools. +.It Ar filesystem +Upgrade the specified file system. +.It Fl r +Upgrade the specified file system and all descendent file systems. +.El +.El +.Sh SEE ALSO +.Xr zpool-upgrade 8 diff --git a/man/man8/zfs-userspace.8 b/man/man8/zfs-userspace.8 new file mode 100644 index 000000000000..a8477d16a1ae --- /dev/null +++ b/man/man8/zfs-userspace.8 @@ -0,0 +1,186 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS-USERSPACE 8 +.Os +.Sh NAME +.Nm zfs Ns Pf - Cm userspace +.Nd Displays space consumed by, and quotas on, each user or group in the specified filesystem or snapshot. +.Sh SYNOPSIS +.Nm +.Cm userspace +.Op Fl Hinp +.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc +.Oo Fl s Ar field Oc Ns ... 
+.Oo Fl S Ar field Oc Ns ...
+.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar snapshot
+.Nm
+.Cm groupspace
+.Op Fl Hinp
+.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc
+.Oo Fl s Ar field Oc Ns ...
+.Oo Fl S Ar field Oc Ns ...
+.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar snapshot
+.Nm
+.Cm projectspace
+.Op Fl Hp
+.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc
+.Oo Fl s Ar field Oc Ns ...
+.Oo Fl S Ar field Oc Ns ...
+.Ar filesystem Ns | Ns Ar snapshot
+.Sh DESCRIPTION
+.Bl -tag -width ""
+.It Xo
+.Nm
+.Cm userspace
+.Op Fl Hinp
+.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc
+.Oo Fl s Ar field Oc Ns ...
+.Oo Fl S Ar field Oc Ns ...
+.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar snapshot
+.Xc
+Displays space consumed by, and quotas on, each user in the specified filesystem
+or snapshot.
+This corresponds to the
+.Sy userused@ Ns Em user ,
+.Sy userobjused@ Ns Em user ,
+.Sy userquota@ Ns Em user ,
+and
+.Sy userobjquota@ Ns Em user
+properties.
+.Bl -tag -width "-H"
+.It Fl H
+Do not print headers; use tab-delimited output.
+.It Fl S Ar field
+Sort by this field in reverse order.
+See
+.Fl s .
+.It Fl i
+Translate SID to POSIX ID.
+The POSIX ID may be ephemeral if no mapping exists.
+Normal POSIX interfaces
+.Po for example,
+.Xr stat 2 ,
+.Nm ls Fl l
+.Pc
+perform this translation, so the
+.Fl i
+option allows the output from
+.Nm zfs Cm userspace
+to be compared directly with those utilities.
+However,
+.Fl i
+may lead to confusion if some files were created by an SMB user before an
+SMB-to-POSIX name mapping was established.
+In such a case, some files will be owned by the SMB entity and some by the POSIX
+entity.
+However, the
+.Fl i
+option will report that the POSIX entity has the total usage and quota for both.
+.It Fl n
+Print numeric ID instead of user/group name.
+.It Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
+Display only the specified fields from the following set:
+.Sy type ,
+.Sy name ,
+.Sy used ,
+.Sy quota .
+The default is to display all fields.
+.It Fl p
+Use exact
+.Pq parsable
+numeric output.
+.It Fl s Ar field
+Sort output by this field.
+The
+.Fl s
+and
+.Fl S
+flags may be specified multiple times to sort first by one field, then by
+another.
+The default is
+.Fl s Sy type Fl s Sy name .
+.It Fl t Ar type Ns Oo , Ns Ar type Oc Ns ...
+Print only the specified types from the following set:
+.Sy all ,
+.Sy posixuser ,
+.Sy smbuser ,
+.Sy posixgroup ,
+.Sy smbgroup .
+The default is
+.Fl t Sy posixuser Ns \&, Ns Sy smbuser .
+The default can be changed to include group types.
+.El
+.It Xo
+.Nm
+.Cm groupspace
+.Op Fl Hinp
+.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc
+.Oo Fl s Ar field Oc Ns ...
+.Oo Fl S Ar field Oc Ns ...
+.Oo Fl t Ar type Ns Oo , Ns Ar type Oc Ns ... Oc
+.Ar filesystem Ns | Ns Ar snapshot
+.Xc
+Displays space consumed by, and quotas on, each group in the specified
+filesystem or snapshot.
+This subcommand is identical to
+.Cm userspace ,
+except that the default types to display are
+.Fl t Sy posixgroup Ns \&, Ns Sy smbgroup .
+.It Xo
+.Nm
+.Cm projectspace
+.Op Fl Hp
+.Oo Fl o Ar field Ns Oo , Ns Ar field Oc Ns ... Oc
+.Oo Fl s Ar field Oc Ns ...
+.Oo Fl S Ar field Oc Ns ...
+.Ar filesystem Ns | Ns Ar snapshot
+.Xc
+Displays space consumed by, and quotas on, each project in the specified
+filesystem or snapshot.
+This subcommand is identical to
+.Cm userspace ,
+except that the project identifier is a numeric ID rather than a name.
+Therefore, neither the
+.Fl i
+option
+.Pq SID to POSIX ID translation ,
+nor the
+.Fl n
+option
+.Pq numeric ID ,
+nor the
+.Fl t
+option
+.Pq type filtering
+is supported.
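+.Pp
+As a brief illustration
+.Pq the dataset name here is hypothetical :
+.Bd -literal
+# zfs projectspace tank/fs
+.Ed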
+.El
+.Sh SEE ALSO
+.Xr zfs-set 8 ,
+.Xr zfsprops 8
diff --git a/man/man8/zfs-wait.8 b/man/man8/zfs-wait.8
new file mode 100644
index 000000000000..e7e3decb9da1
--- /dev/null
+++ b/man/man8/zfs-wait.8
@@ -0,0 +1,71 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZFS-WAIT 8
+.Os
+.Sh NAME
+.Nm zfs Ns Pf - Cm wait
+.Nd Wait for background activity to stop in a ZFS filesystem
+.Sh SYNOPSIS
+.Nm
+.Cm wait
+.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ...
+.Ar fs
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm wait
+.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ...
+.Ar fs
+.Xc
+Waits until all background activity of the given types has ceased in the given
+filesystem.
+The activity could cease because it has completed or because the filesystem has
+been destroyed or unmounted.
+If no activities are specified, the command waits until background activity of
+every type listed below has ceased.
+If there is no activity of the given types in progress, the command returns
+immediately.
+.Pp
+These are the possible values for
+.Ar activity ,
+along with what each one waits for:
+.Bd -literal
+        deleteq       The filesystem's internal delete queue to empty
+.Ed
+.Pp
+Note that the internal delete queue does not finish draining until
+all large files have had time to be fully destroyed and all open file
+handles to unlinked files are closed.
+.El
+.Sh SEE ALSO
+.Xr lsof 8
diff --git a/man/man8/zfs.8 b/man/man8/zfs.8
new file mode 100644
index 000000000000..92135c986e87
--- /dev/null
+++ b/man/man8/zfs.8
@@ -0,0 +1,741 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2011, Pawel Jakub Dawidek +.\" Copyright (c) 2012, Glen Barber +.\" Copyright (c) 2012, Bryan Drewery +.\" Copyright (c) 2013, Steven Hartland +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright (c) 2014, Xin LI +.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved. +.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved. +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFS 8 +.Os +.Sh NAME +.Nm zfs +.Nd configures ZFS file systems +.Sh SYNOPSIS +.Nm +.Fl ?V +.Nm +.Cm version +.Nm +.Cm +.Op Ar +.Sh DESCRIPTION +The +.Nm +command configures ZFS datasets within a ZFS storage pool, as described in +.Xr zpool 8 . +A dataset is identified by a unique path within the ZFS namespace. +For example: +.Bd -literal +pool/{filesystem,volume,snapshot} +.Ed +.Pp +where the maximum length of a dataset name is +.Dv MAXNAMELEN +.Pq 256 bytes +and the maximum amount of nesting allowed in a path is 50 levels deep. +.Pp +A dataset can be one of the following: +.Bl -tag -width "file system" +.It Sy file system +A ZFS dataset of type +.Sy filesystem +can be mounted within the standard system namespace and behaves like other file +systems. +While ZFS file systems are designed to be POSIX compliant, known issues exist +that prevent compliance in some cases. +Applications that depend on standards conformance might fail due to non-standard +behavior when checking file system free space. +.It Sy volume +A logical volume exported as a raw or block device. +This type of dataset should only be used when a block device is required. +File systems are typically used in most environments. +.It Sy snapshot +A read-only version of a file system or volume at a given point in time. +It is specified as +.Ar filesystem Ns @ Ns Ar name +or +.Ar volume Ns @ Ns Ar name . +.It Sy bookmark +Much like a +.Sy snapshot , +but without the hold on on-disk data. +It can be used as the source of a send (but not for a receive). It is specified as +.Ar filesystem Ns # Ns Ar name +or +.Ar volume Ns # Ns Ar name . +.El +.Pp +For details see +.Xr zfsconcepts 8 . +.Ss Properties +Properties are divided into two types, native properties and user-defined +.Po or +.Qq user +.Pc +properties. +Native properties either export internal statistics or control ZFS behavior. +In addition, native properties are either editable or read-only. +User properties have no effect on ZFS behavior, but you can use them to annotate +datasets in a way that is meaningful in your environment. +For more information about properties, see the +.Xr zfsprops 8 man page. +.Ss Encryption +Enabling the +.Sy encryption +feature allows for the creation of encrypted filesystems and volumes. 
+ZFS will encrypt file and zvol data, file attributes, ACLs, permission bits, +directory listings, FUID mappings, and +.Sy userused +/ +.Sy groupused +data. +For an overview of encryption see the +.Xr zfs-load-key 8 command manual. +.Sh SUBCOMMANDS +All subcommands that modify state are logged persistently to the pool in their +original form. +.Bl -tag -width "" +.It Nm Fl ? +Displays a help message. +.It Xo +.Nm +.Fl V , -version +.Xc +An alias for the +.Nm zfs Cm version +subcommand. +.It Xo +.Nm +.Cm version +.Xc +Displays the software version of the +.Nm +userland utility and the zfs kernel module. +.El +.Ss Dataset Management +.Bl -tag -width "" +.It Xr zfs-list 8 +Lists the property information for the given datasets in tabular form. +.It Xr zfs-create 8 +Creates a new ZFS file system or volume. +.It Xr zfs-destroy 8 +Destroys the given dataset(s), snapshot(s), or bookmark. +.It Xr zfs-rename 8 +Renames the given dataset (filesystem or snapshot). +.It Xr zfs-upgrade 8 +Manage upgrading the on-disk version of filesystems. +.El +.Ss Snapshots +.Bl -tag -width "" +.It Xr zfs-snapshot 8 +Creates snapshots with the given names. +.It Xr zfs-rollback 8 +Roll back the given dataset to a previous snapshot. +.It Xo +.Xr zfs-hold 8 / +.Xr zfs-release 8 +.Xc +Add or remove a hold reference to the specified snapshot or snapshots. +If a hold exists on a snapshot, attempts to destroy that snapshot by using the +.Nm zfs Cm destroy +command return +.Er EBUSY . +.It Xr zfs-diff 8 +Display the difference between a snapshot of a given filesystem and another +snapshot of that filesystem from a later time or the current contents of the +filesystem. +.El +.Ss Clones +.Bl -tag -width "" +.It Xr zfs-clone 8 +Creates a clone of the given snapshot. +.It Xr zfs-promote 8 +Promotes a clone file system to no longer be dependent on its +.Qq origin +snapshot. +.El +.Ss Send & Receive +.Bl -tag -width "" +.It Xr zfs-send 8 +Generate a send stream, which may be of a filesystem, and may be incremental +from a bookmark. +.It Xr zfs-receive 8 +Creates a snapshot whose contents are as specified in the stream provided on +standard input. +If a full stream is received, then a new file system is created as well. +Streams are created using the +.Xr zfs-send 8 +subcommand, which by default creates a full stream. +.It Xr zfs-bookmark 8 +Creates a new bookmark of the given snapshot or bookmark. +Bookmarks mark the point in time when the snapshot was created, and can be used +as the incremental source for a +.Nm zfs Cm send +command. +.It Xr zfs-redact 8 +Generate a new redaction bookmark. +This feature can be used to allow clones of a filesystem to be made available on +a remote system, in the case where their parent need not (or needs to not) be +usable. +.El +.Ss Properties +.Bl -tag -width "" +.It Xr zfs-get 8 +Displays properties for the given datasets. +.It Xr zfs-set 8 +Sets the property or list of properties to the given value(s) for each dataset. +.It Xr zfs-inherit 8 +Clears the specified property, causing it to be inherited from an ancestor, +restored to default if no ancestor has the property set, or with the +.Fl S +option reverted to the received value if one exists. +.El +.Ss Quotas +.Bl -tag -width "" +.It Xo +.Xr zfs-userspace 8 / +.Xr zfs-groupspace 8 / +.Xr zfs-projectspace 8 +.Xc +Displays space consumed by, and quotas on, each user, group, or project +in the specified filesystem or snapshot. +.It Xr zfs-project 8 +List, set, or clear project ID and/or inherit flag on the file(s) or directories. 
+.El +.Ss Mountpoints +.Bl -tag -width "" +.It Xr zfs-mount 8 +Displays all ZFS file systems currently mounted, or mount ZFS filesystem +on a path described by its +.Sy mountpoint +property. +.It Xr zfs-unmount 8 +Unmounts currently mounted ZFS file systems. +.El +.Ss Shares +.Bl -tag -width "" +.It Xr zfs-share 8 +Shares available ZFS file systems. +.It Xr zfs-unshare 8 +Unshares currently shared ZFS file systems. +.El +.Ss Delegated Administration +.Bl -tag -width "" +.It Xr zfs-allow 8 +Delegate permissions on the specified filesystem or volume. +.It Xr zfs-unallow 8 +Remove delegated permissions on the specified filesystem or volume. +.El +.Ss Encryption +.Bl -tag -width "" +.It Xr zfs-change-key 8 +Add or change an encryption key on the specified dataset. +.It Xr zfs-load-key 8 +Load the key for the specified encrypted dataset, enabling access. +.It Xr zfs-unload-key 8 +Unload a key for the specified dataset, removing the ability to access the dataset. +.El +.Ss Channel Programs +.Bl -tag -width "" +.It Xr zfs-program 8 +Execute ZFS administrative operations +programmatically via a Lua script-language channel program. +.El +.Ss Jails +.Bl -tag -width "" +.It Xr zfs-jail 8 +Attaches a filesystem to a jail. +.It Xr zfs-unjail 8 +Detaches a filesystem from a jail. +.El +.Ss Waiting +.Bl -tag -width "" +.It Xr zfs-wait 8 +Wait for background activity in a filesystem to complete. +.El +.Sh EXIT STATUS +The +.Nm +utility exits 0 on success, 1 if an error occurs, and 2 if invalid command line +options were specified. +.Sh EXAMPLES +.Bl -tag -width "" +.It Sy Example 1 No Creating a ZFS File System Hierarchy +The following commands create a file system named +.Em pool/home +and a file system named +.Em pool/home/bob . +The mount point +.Pa /export/home +is set for the parent file system, and is automatically inherited by the child +file system. +.Bd -literal +# zfs create pool/home +# zfs set mountpoint=/export/home pool/home +# zfs create pool/home/bob +.Ed +.It Sy Example 2 No Creating a ZFS Snapshot +The following command creates a snapshot named +.Sy yesterday . +This snapshot is mounted on demand in the +.Pa .zfs/snapshot +directory at the root of the +.Em pool/home/bob +file system. +.Bd -literal +# zfs snapshot pool/home/bob@yesterday +.Ed +.It Sy Example 3 No Creating and Destroying Multiple Snapshots +The following command creates snapshots named +.Sy yesterday +of +.Em pool/home +and all of its descendent file systems. +Each snapshot is mounted on demand in the +.Pa .zfs/snapshot +directory at the root of its file system. +The second command destroys the newly created snapshots. +.Bd -literal +# zfs snapshot -r pool/home@yesterday +# zfs destroy -r pool/home@yesterday +.Ed +.It Sy Example 4 No Disabling and Enabling File System Compression +The following command disables the +.Sy compression +property for all file systems under +.Em pool/home . +The next command explicitly enables +.Sy compression +for +.Em pool/home/anne . +.Bd -literal +# zfs set compression=off pool/home +# zfs set compression=on pool/home/anne +.Ed +.It Sy Example 5 No Listing ZFS Datasets +The following command lists all active file systems and volumes in the system. +Snapshots are displayed if the +.Sy listsnaps +property is +.Sy on . +The default is +.Sy off . +See +.Xr zpool 8 +for more information on pool properties. 
+.Bd -literal +# zfs list +NAME USED AVAIL REFER MOUNTPOINT +pool 450K 457G 18K /pool +pool/home 315K 457G 21K /export/home +pool/home/anne 18K 457G 18K /export/home/anne +pool/home/bob 276K 457G 276K /export/home/bob +.Ed +.It Sy Example 6 No Setting a Quota on a ZFS File System +The following command sets a quota of 50 Gbytes for +.Em pool/home/bob . +.Bd -literal +# zfs set quota=50G pool/home/bob +.Ed +.It Sy Example 7 No Listing ZFS Properties +The following command lists all properties for +.Em pool/home/bob . +.Bd -literal +# zfs get all pool/home/bob +NAME PROPERTY VALUE SOURCE +pool/home/bob type filesystem - +pool/home/bob creation Tue Jul 21 15:53 2009 - +pool/home/bob used 21K - +pool/home/bob available 20.0G - +pool/home/bob referenced 21K - +pool/home/bob compressratio 1.00x - +pool/home/bob mounted yes - +pool/home/bob quota 20G local +pool/home/bob reservation none default +pool/home/bob recordsize 128K default +pool/home/bob mountpoint /pool/home/bob default +pool/home/bob sharenfs off default +pool/home/bob checksum on default +pool/home/bob compression on local +pool/home/bob atime on default +pool/home/bob devices on default +pool/home/bob exec on default +pool/home/bob setuid on default +pool/home/bob readonly off default +pool/home/bob zoned off default +pool/home/bob snapdir hidden default +pool/home/bob acltype off default +pool/home/bob aclmode discard default +pool/home/bob aclinherit restricted default +pool/home/bob canmount on default +pool/home/bob xattr on default +pool/home/bob copies 1 default +pool/home/bob version 4 - +pool/home/bob utf8only off - +pool/home/bob normalization none - +pool/home/bob casesensitivity sensitive - +pool/home/bob vscan off default +pool/home/bob nbmand off default +pool/home/bob sharesmb off default +pool/home/bob refquota none default +pool/home/bob refreservation none default +pool/home/bob primarycache all default +pool/home/bob secondarycache all default +pool/home/bob usedbysnapshots 0 - +pool/home/bob usedbydataset 21K - +pool/home/bob usedbychildren 0 - +pool/home/bob usedbyrefreservation 0 - +.Ed +.Pp +The following command gets a single property value. +.Bd -literal +# zfs get -H -o value compression pool/home/bob +on +.Ed +The following command lists all properties with local settings for +.Em pool/home/bob . +.Bd -literal +# zfs get -r -s local -o name,property,value all pool/home/bob +NAME PROPERTY VALUE +pool/home/bob quota 20G +pool/home/bob compression on +.Ed +.It Sy Example 8 No Rolling Back a ZFS File System +The following command reverts the contents of +.Em pool/home/anne +to the snapshot named +.Sy yesterday , +deleting all intermediate snapshots. +.Bd -literal +# zfs rollback -r pool/home/anne@yesterday +.Ed +.It Sy Example 9 No Creating a ZFS Clone +The following command creates a writable file system whose initial contents are +the same as +.Em pool/home/bob@yesterday . 
+.Bd -literal +# zfs clone pool/home/bob@yesterday pool/clone +.Ed +.It Sy Example 10 No Promoting a ZFS Clone +The following commands illustrate how to test out changes to a file system, and +then replace the original file system with the changed one, using clones, clone +promotion, and renaming: +.Bd -literal +# zfs create pool/project/production + populate /pool/project/production with data +# zfs snapshot pool/project/production@today +# zfs clone pool/project/production@today pool/project/beta + make changes to /pool/project/beta and test them +# zfs promote pool/project/beta +# zfs rename pool/project/production pool/project/legacy +# zfs rename pool/project/beta pool/project/production + once the legacy version is no longer needed, it can be destroyed +# zfs destroy pool/project/legacy +.Ed +.It Sy Example 11 No Inheriting ZFS Properties +The following command causes +.Em pool/home/bob +and +.Em pool/home/anne +to inherit the +.Sy checksum +property from their parent. +.Bd -literal +# zfs inherit checksum pool/home/bob pool/home/anne +.Ed +.It Sy Example 12 No Remotely Replicating ZFS Data +The following commands send a full stream and then an incremental stream to a +remote machine, restoring them into +.Em poolB/received/fs@a +and +.Em poolB/received/fs@b , +respectively. +.Em poolB +must contain the file system +.Em poolB/received , +and must not initially contain +.Em poolB/received/fs . +.Bd -literal +# zfs send pool/fs@a | \e + ssh host zfs receive poolB/received/fs@a +# zfs send -i a pool/fs@b | \e + ssh host zfs receive poolB/received/fs +.Ed +.It Sy Example 13 No Using the zfs receive -d Option +The following command sends a full stream of +.Em poolA/fsA/fsB@snap +to a remote machine, receiving it into +.Em poolB/received/fsA/fsB@snap . +The +.Em fsA/fsB@snap +portion of the received snapshot's name is determined from the name of the sent +snapshot. +.Em poolB +must contain the file system +.Em poolB/received . +If +.Em poolB/received/fsA +does not exist, it is created as an empty file system. +.Bd -literal +# zfs send poolA/fsA/fsB@snap | \e + ssh host zfs receive -d poolB/received +.Ed +.It Sy Example 14 No Setting User Properties +The following example sets the user-defined +.Sy com.example:department +property for a dataset. +.Bd -literal +# zfs set com.example:department=12345 tank/accounting +.Ed +.It Sy Example 15 No Performing a Rolling Snapshot +The following example shows how to maintain a history of snapshots with a +consistent naming scheme. +To keep a week's worth of snapshots, the user destroys the oldest snapshot, +renames the remaining snapshots, and then creates a new snapshot, as follows: +.Bd -literal +# zfs destroy -r pool/users@7daysago +# zfs rename -r pool/users@6daysago @7daysago +# zfs rename -r pool/users@5daysago @6daysago +# zfs rename -r pool/users@4daysago @5daysago +# zfs rename -r pool/users@3daysago @4daysago +# zfs rename -r pool/users@2daysago @3daysago +# zfs rename -r pool/users@yesterday @2daysago +# zfs rename -r pool/users@today @yesterday +# zfs snapshot -r pool/users@today +.Ed +.It Sy Example 16 No Setting sharenfs Property Options on a ZFS File System +The following commands show how to set +.Sy sharenfs +property options to enable +.Sy rw +access for a set of +.Sy IP +addresses and to enable root access for system +.Sy neo +on the +.Em tank/home +file system. 
+.Bd -literal +# zfs set sharenfs='rw=@123.123.0.0/16,root=neo' tank/home +.Ed +.Pp +If you are using +.Sy DNS +for host name resolution, specify the fully qualified hostname. +.It Sy Example 17 No Delegating ZFS Administration Permissions on a ZFS Dataset +The following example shows how to set permissions so that user +.Sy cindys +can create, destroy, mount, and take snapshots on +.Em tank/cindys . +The permissions on +.Em tank/cindys +are also displayed. +.Bd -literal +# zfs allow cindys create,destroy,mount,snapshot tank/cindys +# zfs allow tank/cindys +---- Permissions on tank/cindys -------------------------------------- +Local+Descendent permissions: + user cindys create,destroy,mount,snapshot +.Ed +.Pp +Because the +.Em tank/cindys +mount point permission is set to 755 by default, user +.Sy cindys +will be unable to mount file systems under +.Em tank/cindys . +Add an ACE similar to the following syntax to provide mount point access: +.Bd -literal +# chmod A+user:cindys:add_subdirectory:allow /tank/cindys +.Ed +.It Sy Example 18 No Delegating Create Time Permissions on a ZFS Dataset +The following example shows how to grant anyone in the group +.Sy staff +to create file systems in +.Em tank/users . +This syntax also allows staff members to destroy their own file systems, but not +destroy anyone else's file system. +The permissions on +.Em tank/users +are also displayed. +.Bd -literal +# zfs allow staff create,mount tank/users +# zfs allow -c destroy tank/users +# zfs allow tank/users +---- Permissions on tank/users --------------------------------------- +Permission sets: + destroy +Local+Descendent permissions: + group staff create,mount +.Ed +.It Sy Example 19 No Defining and Granting a Permission Set on a ZFS Dataset +The following example shows how to define and grant a permission set on the +.Em tank/users +file system. +The permissions on +.Em tank/users +are also displayed. +.Bd -literal +# zfs allow -s @pset create,destroy,snapshot,mount tank/users +# zfs allow staff @pset tank/users +# zfs allow tank/users +---- Permissions on tank/users --------------------------------------- +Permission sets: + @pset create,destroy,mount,snapshot +Local+Descendent permissions: + group staff @pset +.Ed +.It Sy Example 20 No Delegating Property Permissions on a ZFS Dataset +The following example shows to grant the ability to set quotas and reservations +on the +.Em users/home +file system. +The permissions on +.Em users/home +are also displayed. +.Bd -literal +# zfs allow cindys quota,reservation users/home +# zfs allow users/home +---- Permissions on users/home --------------------------------------- +Local+Descendent permissions: + user cindys quota,reservation +cindys% zfs set quota=10G users/home/marks +cindys% zfs get quota users/home/marks +NAME PROPERTY VALUE SOURCE +users/home/marks quota 10G local +.Ed +.It Sy Example 21 No Removing ZFS Delegated Permissions on a ZFS Dataset +The following example shows how to remove the snapshot permission from the +.Sy staff +group on the +.Em tank/users +file system. +The permissions on +.Em tank/users +are also displayed. 
+.Bd -literal
+# zfs unallow staff snapshot tank/users
+# zfs allow tank/users
+---- Permissions on tank/users ---------------------------------------
+Permission sets:
+        @pset create,destroy,mount,snapshot
+Local+Descendent permissions:
+        group staff @pset
+.Ed
+.It Sy Example 22 No Showing the differences between a snapshot and a ZFS Dataset
+The following example shows how to see what has changed between a prior
+snapshot of a ZFS dataset and its current state.
+The
+.Fl F
+option is used to indicate type information for the files affected.
+.Bd -literal
+# zfs diff -F tank/test@before tank/test
+M       /       /tank/test/
+M       F       /tank/test/linked      (+1)
+R       F       /tank/test/oldname -> /tank/test/newname
+-       F       /tank/test/deleted
++       F       /tank/test/created
+M       F       /tank/test/modified
+.Ed
+.It Sy Example 23 No Creating a bookmark
+The following example creates a bookmark to a snapshot.
+This bookmark can then be used instead of a snapshot in send streams.
+.Bd -literal
+# zfs bookmark rpool@snapshot rpool#bookmark
+.Ed
+.It Sy Example 24 No Setting sharesmb Property Options on a ZFS File System
+The following example shows how to share an SMB filesystem through ZFS.
+Note that a user and their password must be given.
+.Bd -literal
+# smbmount //127.0.0.1/share_tmp /mnt/tmp \\
+    -o user=workgroup/turbo,password=obrut,uid=1000
+.Ed
+.Pp
+Minimal
+.Em /etc/samba/smb.conf
+configuration is required:
+.Pp
+Samba will need to listen to 'localhost' (127.0.0.1) for the ZFS utilities to
+communicate with Samba.
+This is the default behavior for most Linux distributions.
+.Pp
+Samba must be able to authenticate a user.
+This can be done in a number of ways, depending on whether you are using the
+system password file, LDAP, or the Samba-specific smbpasswd file.
+How to do this is outside the scope of this manual.
+Please refer to the
+.Xr smb.conf 5
+man page for more information.
+.Pp
+See the
+.Sy USERSHARE
+section of the
+.Xr smb.conf 5
+man page for all configuration options, in case you need to modify any options
+of the share afterwards.
+Do note that any changes done with the
+.Xr net 8
+command will be undone if the share is ever unshared
+.Pq such as at reboot .
+.El
+.Sh ENVIRONMENT VARIABLES
+.Bl -tag -width "ZFS_MOUNT_HELPER"
+.It Ev ZFS_MOUNT_HELPER
+Cause
+.Nm zfs mount
+to use
+.Em /bin/mount
+to mount ZFS datasets.
+This option is provided for backwards compatibility with older ZFS versions.
+.El
+.Sh INTERFACE STABILITY
+.Sy Committed .
+.Sh SEE ALSO
+.Xr attr 1 ,
+.Xr gzip 1 ,
+.Xr ssh 1 ,
+.Xr chmod 2 ,
+.Xr fsync 2 ,
+.Xr stat 2 ,
+.Xr write 2 ,
+.Xr acl 5 ,
+.Xr attributes 5 ,
+.Xr exports 5 ,
+.Xr exportfs 8 ,
+.Xr mount 8 ,
+.Xr net 8 ,
+.Xr selinux 8 ,
+.Xr zfsconcepts 8 ,
+.Xr zfsprops 8 ,
+.Xr zpool 8
diff --git a/man/man8/zfs_ids_to_path.8 b/man/man8/zfs_ids_to_path.8
new file mode 100644
index 000000000000..9d6a4976efa2
--- /dev/null
+++ b/man/man8/zfs_ids_to_path.8
@@ -0,0 +1,50 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2020 by Delphix. All rights reserved. +.Dd April 17, 2020 +.Dt ZFS_IDS_TO_PATH 8 +.Os +.Sh NAME +.Nm zfs_ids_to_path +.Nd convert objset and object ids to names and paths +.Sh SYNOPSIS +.Nm +.Op Fl v +.Ar pool +.Ar objset id +.Ar object id +.Nm +.Sh DESCRIPTION +.Pp +.LP +The +.Sy zfs_ids_to_path +utility converts a provided objset and object id into a path to the file that +those ids refer to. +.Bl -tag -width "-D" +.It Fl v +Verbose. +Print the dataset name and the file path within the dataset separately. This +will work correctly even if the dataset is not mounted. +.Sh SEE ALSO +.Xr zfs 8 , +.Xr zdb 8 diff --git a/man/man8/zfsconcepts.8 b/man/man8/zfsconcepts.8 new file mode 100644 index 000000000000..8e0e68678789 --- /dev/null +++ b/man/man8/zfsconcepts.8 @@ -0,0 +1,198 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright 2011 Joshua M. Clulow +.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved. +.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved. +.\" Copyright (c) 2014, Joyent, Inc. All rights reserved. +.\" Copyright (c) 2014 by Adam Stevko. All rights reserved. +.\" Copyright (c) 2014 Integros [integros.com] +.\" Copyright 2019 Richard Laager. All rights reserved. +.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" +.Dd June 30, 2019 +.Dt ZFSCONCEPTS 8 +.Os +.Sh NAME +.Nm zfsconcepts +.Nd An overview of ZFS concepts. +.Sh DESCRIPTION +.Ss ZFS File System Hierarchy +A ZFS storage pool is a logical collection of devices that provide space for +datasets. +A storage pool is also the root of the ZFS file system hierarchy. +.Pp +The root of the pool can be accessed as a file system, such as mounting and +unmounting, taking snapshots, and setting properties. +The physical storage characteristics, however, are managed by the +.Xr zpool 8 +command. +.Pp +See +.Xr zpool 8 +for more information on creating and administering pools. +.Ss Snapshots +A snapshot is a read-only copy of a file system or volume. +Snapshots can be created extremely quickly, and initially consume no additional +space within the pool. +As data within the active dataset changes, the snapshot consumes more data than +would otherwise be shared with the active dataset. +.Pp +Snapshots can have arbitrary names. 
+Snapshots of volumes can be cloned or rolled back; visibility is determined
+by the
+.Sy snapdev
+property of the parent volume.
+.Pp
+File system snapshots can be accessed under the
+.Pa .zfs/snapshot
+directory in the root of the file system.
+Snapshots are automatically mounted on demand and may be unmounted at regular
+intervals.
+The visibility of the
+.Pa .zfs
+directory can be controlled by the
+.Sy snapdir
+property.
+.Ss Bookmarks
+A bookmark is, like a snapshot, a read-only copy of a file system or volume.
+Bookmarks can be created extremely quickly, compared to snapshots, and they
+consume no additional space within the pool.
+Bookmarks can also have arbitrary names, much like snapshots.
+.Pp
+Unlike snapshots, bookmarks cannot be accessed through the filesystem in any
+way.
+From a storage standpoint, a bookmark just provides a way to reference, as a
+distinct object, the point in time when a snapshot was created.
+Bookmarks are initially tied to a snapshot, not the filesystem or volume, and
+they will survive even if the snapshot itself is destroyed.
+Since they are very lightweight, there's little incentive to destroy them.
+.Ss Clones
+A clone is a writable volume or file system whose initial contents are the same
+as another dataset.
+As with snapshots, creating a clone is nearly instantaneous, and initially
+consumes no additional space.
+.Pp
+Clones can only be created from a snapshot.
+When a snapshot is cloned, it creates an implicit dependency between the parent
+and child.
+Even though the clone is created somewhere else in the dataset hierarchy, the
+original snapshot cannot be destroyed as long as a clone exists.
+The
+.Sy origin
+property exposes this dependency, and the
+.Cm destroy
+command lists any such dependencies, if they exist.
+.Pp
+The clone parent-child dependency relationship can be reversed by using the
+.Cm promote
+subcommand.
+This causes the
+.Qq origin
+file system to become a clone of the specified file system, which makes it
+possible to destroy the file system that the clone was created from.
+.Ss "Mount Points"
+Creating a ZFS file system is a simple operation, so the number of file
+systems on a system is likely to be large.
+To cope with this, ZFS automatically manages mounting and unmounting file
+systems without the need to edit the
+.Pa /etc/fstab
+file.
+All automatically managed file systems are mounted by ZFS at boot time.
+.Pp
+By default, file systems are mounted under
+.Pa /path ,
+where
+.Ar path
+is the name of the file system in the ZFS namespace.
+Directories are created and destroyed as needed.
+.Pp
+A file system can also have a mount point set in the
+.Sy mountpoint
+property.
+This directory is created as needed, and ZFS automatically mounts the file
+system when the
+.Nm zfs Cm mount Fl a
+command is invoked
+.Po without editing
+.Pa /etc/fstab
+.Pc .
+The
+.Sy mountpoint
+property can be inherited, so if
+.Em pool/home
+has a mount point of
+.Pa /export/stuff ,
+then
+.Em pool/home/user
+automatically inherits a mount point of
+.Pa /export/stuff/user .
+.Pp
+A file system
+.Sy mountpoint
+property of
+.Sy none
+prevents the file system from being mounted.
+.Pp
+If needed, ZFS file systems can also be managed with traditional tools
+.Po
+.Nm mount ,
+.Nm umount ,
+.Pa /etc/fstab
+.Pc .
+If a file system's mount point is set to
+.Sy legacy ,
+ZFS makes no attempt to manage the file system, and the administrator is
+responsible for mounting and unmounting the file system.
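+.Pp
+For example, a legacy mount of a hypothetical file system
+.Em pool/data
+could be configured with an
+.Pa /etc/fstab
+entry such as:
+.Bd -literal
+pool/data  /mnt/data  zfs  defaults  0 0
+.Ed
+.Pp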
+Because pools must be imported before a legacy mount can succeed,
+administrators should ensure that legacy mounts are only attempted after the
+zpool import process finishes at boot time.
+For example, on machines using systemd, the mount option
+.Pp
+.Sy x-systemd.requires=zfs-import.target
+.Pp
+will ensure that zfs-import completes before systemd attempts to mount the
+filesystem.
+See
+.Xr systemd.mount 5
+for details.
+.Ss Deduplication
+Deduplication is the process of removing redundant data at the block level,
+reducing the total amount of data stored.
+If a file system has the
+.Sy dedup
+property enabled, duplicate data blocks are removed synchronously.
+The result is that only unique data is stored and common components are shared
+among files.
+.Pp
+Deduplicating data is a very resource-intensive operation.
+It is generally recommended that you have at least 1.25 GiB of RAM per 1 TiB
+of storage when you enable deduplication.
+Calculating the exact requirement depends heavily on the type of data stored
+in the pool.
+.Pp
+Enabling deduplication on an improperly-designed system can result in
+performance issues (slow IO and administrative operations).
+It can potentially lead to problems importing a pool due to memory exhaustion.
+Deduplication can consume significant processing power (CPU) and memory as
+well as generate additional disk IO.
+.Pp
+Before creating a pool with deduplication enabled, ensure that you have
+planned your hardware requirements appropriately and implemented appropriate
+recovery practices, such as regular backups.
+Consider using
+.Sy compression=on
+as a less resource-intensive alternative to deduplication.
diff --git a/man/man8/zfsprops.8 b/man/man8/zfsprops.8
new file mode 100644
index 000000000000..a4e2829e3fad
--- /dev/null
+++ b/man/man8/zfsprops.8
@@ -0,0 +1,1988 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright 2011 Joshua M. Clulow
+.\" Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+.\" Copyright (c) 2011, Pawel Jakub Dawidek
+.\" Copyright (c) 2012, Glen Barber
+.\" Copyright (c) 2012, Bryan Drewery
+.\" Copyright (c) 2013, Steven Hartland
+.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
+.\" Copyright (c) 2014 by Adam Stevko. All rights reserved.
+.\" Copyright (c) 2014 Integros [integros.com]
+.\" Copyright (c) 2016 Nexenta Systems, Inc. All Rights Reserved.
+.\" Copyright (c) 2014, Xin LI
+.\" Copyright (c) 2014-2015, The FreeBSD Foundation, All Rights Reserved.
+.\" Copyright 2019 Richard Laager. All rights reserved.
+.\" Copyright 2018 Nexenta Systems, Inc. +.\" Copyright 2019 Joyent, Inc. +.\" Copyright (c) 2019, Kjeld Schouten-Lebbing +.\" +.Dd January 30, 2020 +.Dt ZFSPROPS 8 +.Os +.Sh NAME +.Nm zfsprops +.Nd Native properties and user-defined of ZFS datasets. +.Sh DESCRIPTION +Properties are divided into two types, native properties and user-defined +.Po or +.Qq user +.Pc +properties. +Native properties either export internal statistics or control ZFS behavior. +In addition, native properties are either editable or read-only. +User properties have no effect on ZFS behavior, but you can use them to annotate +datasets in a way that is meaningful in your environment. +For more information about user properties, see the +.Sx User Properties +section, below. +.Ss Native Properties +Every dataset has a set of properties that export statistics about the dataset +as well as control various behaviors. +Properties are inherited from the parent unless overridden by the child. +Some properties apply only to certain types of datasets +.Pq file systems, volumes, or snapshots . +.Pp +The values of numeric properties can be specified using human-readable suffixes +.Po for example, +.Sy k , +.Sy KB , +.Sy M , +.Sy Gb , +and so forth, up to +.Sy Z +for zettabyte +.Pc . +The following are all valid +.Pq and equal +specifications: +.Li 1536M, 1.5g, 1.50GB . +.Pp +The values of non-numeric properties are case sensitive and must be lowercase, +except for +.Sy mountpoint , +.Sy sharenfs , +and +.Sy sharesmb . +.Pp +The following native properties consist of read-only statistics about the +dataset. +These properties can be neither set, nor inherited. +Native properties apply to all dataset types unless otherwise noted. +.Bl -tag -width "usedbyrefreservation" +.It Sy available +The amount of space available to the dataset and all its children, assuming that +there is no other activity in the pool. +Because space is shared within a pool, availability can be limited by any number +of factors, including physical pool size, quotas, reservations, or other +datasets within the pool. +.Pp +This property can also be referred to by its shortened column name, +.Sy avail . +.It Sy compressratio +For non-snapshots, the compression ratio achieved for the +.Sy used +space of this dataset, expressed as a multiplier. +The +.Sy used +property includes descendant datasets, and, for clones, does not include the +space shared with the origin snapshot. +For snapshots, the +.Sy compressratio +is the same as the +.Sy refcompressratio +property. +Compression can be turned on by running: +.Nm zfs Cm set Sy compression Ns = Ns Sy on Ar dataset . +The default value is +.Sy off . +.It Sy createtxg +The transaction group (txg) in which the dataset was created. Bookmarks have +the same +.Sy createtxg +as the snapshot they are initially tied to. This property is suitable for +ordering a list of snapshots, e.g. for incremental send and receive. +.It Sy creation +The time this dataset was created. +.It Sy clones +For snapshots, this property is a comma-separated list of filesystems or volumes +which are clones of this snapshot. +The clones' +.Sy origin +property is this snapshot. +If the +.Sy clones +property is not empty, then this snapshot can not be destroyed +.Po even with the +.Fl r +or +.Fl f +options +.Pc . +The roles of origin and clone can be swapped by promoting the clone with the +.Nm zfs Cm promote +command. 
+.It Sy defer_destroy
+This property is
+.Sy on
+if the snapshot has been marked for deferred destroy by using the
+.Nm zfs Cm destroy Fl d
+command.
+Otherwise, the property is
+.Sy off .
+.It Sy encryptionroot
+For encrypted datasets, indicates where the dataset is currently inheriting
+its encryption key from.
+Loading or unloading a key for the
+.Sy encryptionroot
+will implicitly load or unload the key for any inheriting datasets (see
+.Nm zfs Cm load-key
+and
+.Nm zfs Cm unload-key
+for details).
+Clones will always share an encryption key with their origin.
+See the
+.Em Encryption
+section of
+.Xr zfs-load-key 8
+for details.
+.It Sy filesystem_count
+The total number of filesystems and volumes that exist under this location in
+the dataset tree.
+This value is only available when a
+.Sy filesystem_limit
+has been set somewhere in the tree under which the dataset resides.
+.It Sy keystatus
+Indicates whether an encryption key is currently loaded into ZFS.
+The possible values are
+.Sy none ,
+.Sy available ,
+and
+.Sy unavailable .
+See
+.Nm zfs Cm load-key
+and
+.Nm zfs Cm unload-key .
+.It Sy guid
+The 64-bit GUID of this dataset or bookmark, which does not change over its
+entire lifetime.
+When a snapshot is sent to another pool, the received snapshot has the same
+GUID.
+Thus, the
+.Sy guid
+is suitable for identifying a snapshot across pools.
+.It Sy logicalreferenced
+The amount of space that is
+.Qq logically
+accessible by this dataset.
+See the
+.Sy referenced
+property.
+The logical space ignores the effect of the
+.Sy compression
+and
+.Sy copies
+properties, giving a quantity closer to the amount of data that applications
+see.
+However, it does include space consumed by metadata.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy lrefer .
+.It Sy logicalused
+The amount of space that is
+.Qq logically
+consumed by this dataset and all its descendents.
+See the
+.Sy used
+property.
+The logical space ignores the effect of the
+.Sy compression
+and
+.Sy copies
+properties, giving a quantity closer to the amount of data that applications
+see.
+However, it does include space consumed by metadata.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy lused .
+.It Sy mounted
+For file systems, indicates whether the file system is currently mounted.
+This property can be either
+.Sy yes
+or
+.Sy no .
+.It Sy objsetid
+A unique identifier for this dataset within the pool.
+Unlike the dataset's
+.Sy guid ,
+the
+.Sy objsetid
+of a dataset is not transferred to other pools when the snapshot is copied
+with a send/receive operation.
+The
+.Sy objsetid
+can be reused (for a new dataset) after the dataset is deleted.
+.It Sy origin
+For cloned file systems or volumes, the snapshot from which the clone was
+created.
+See also the
+.Sy clones
+property.
+.It Sy receive_resume_token
+For filesystems or volumes which have saved partially-completed state from
+.Sy zfs receive -s ,
+this opaque token can be provided to
+.Sy zfs send -t
+to resume and complete the
+.Sy zfs receive .
+.It Sy redact_snaps
+For bookmarks, this is the list of snapshot guids the bookmark contains a
+redaction list for.
+For snapshots, this is the list of snapshot guids the snapshot is redacted
+with respect to.
+.It Sy referenced
+The amount of data that is accessible by this dataset, which may or may not be
+shared with other datasets in the pool.
+When a snapshot or clone is created, it initially references the same amount
+of space as the file system or snapshot it was created from, since its
+contents are identical.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy refer .
+.It Sy refcompressratio
+The compression ratio achieved for the
+.Sy referenced
+space of this dataset, expressed as a multiplier.
+See also the
+.Sy compressratio
+property.
+.It Sy snapshot_count
+The total number of snapshots that exist under this location in the dataset
+tree.
+This value is only available when a
+.Sy snapshot_limit
+has been set somewhere in the tree under which the dataset resides.
+.It Sy type
+The type of dataset:
+.Sy filesystem ,
+.Sy volume ,
+.Sy snapshot ,
+or
+.Sy bookmark .
+.It Sy used
+The amount of space consumed by this dataset and all its descendents.
+This is the value that is checked against this dataset's quota and
+reservation.
+The space used does not include this dataset's reservation, but does take into
+account the reservations of any descendent datasets.
+The amount of space that a dataset consumes from its parent, as well as the
+amount of space that is freed if this dataset is recursively destroyed, is the
+greater of its space used and its reservation.
+.Pp
+The used space of a snapshot
+.Po see the
+.Em Snapshots
+section of
+.Xr zfsconcepts 8
+.Pc
+is space that is referenced exclusively by this snapshot.
+If this snapshot is destroyed, the amount of
+.Sy used
+space will be freed.
+Space that is shared by multiple snapshots isn't accounted for in this metric.
+When a snapshot is destroyed, space that was previously shared with this
+snapshot can become unique to snapshots adjacent to it, thus changing the used
+space of those snapshots.
+The used space of the latest snapshot can also be affected by changes in the
+file system.
+Note that the
+.Sy used
+space of a snapshot is a subset of the
+.Sy written
+space of the snapshot.
+.Pp
+The amount of space used, available, or referenced does not take into account
+pending changes.
+Pending changes are generally accounted for within a few seconds.
+Committing a change to a disk using
+.Xr fsync 2
+or
+.Dv O_SYNC
+does not necessarily guarantee that the space usage information is updated
+immediately.
+.It Sy usedby*
+The
+.Sy usedby*
+properties decompose the
+.Sy used
+properties into the various reasons that space is used.
+Specifically,
+.Sy used No =
+.Sy usedbychildren No +
+.Sy usedbydataset No +
+.Sy usedbyrefreservation No +
+.Sy usedbysnapshots .
+These properties are only available for datasets created on
+.Nm zpool
+.Qo version 13 Qc
+pools.
+.It Sy usedbychildren
+The amount of space used by children of this dataset, which would be freed if
+all the dataset's children were destroyed.
+.It Sy usedbydataset
+The amount of space used by this dataset itself, which would be freed if the
+dataset were destroyed
+.Po after first removing any
+.Sy refreservation
+and destroying any necessary snapshots or descendents
+.Pc .
+.It Sy usedbyrefreservation
+The amount of space used by a
+.Sy refreservation
+set on this dataset, which would be freed if the
+.Sy refreservation
+was removed.
+.It Sy usedbysnapshots
+The amount of space consumed by snapshots of this dataset.
+In particular, it is the amount of space that would be freed if all of this
+dataset's snapshots were destroyed.
+Note that this is not simply the sum of the snapshots'
+.Sy used
+properties because space can be shared by multiple snapshots.
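+.Pp
+The full
+.Sy usedby*
+breakdown for a dataset can be displayed with, for example
+.Pq pool name hypothetical :
+.Bd -literal
+# zfs list -o space rpool
+.Ed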
+.It Sy userused Ns @ Ns Em user
+The amount of space consumed by the specified user in this dataset.
+Space is charged to the owner of each file, as displayed by
+.Nm ls Fl l .
+The amount of space charged is displayed by
+.Nm du
+and
+.Nm ls Fl s .
+See the
+.Nm zfs Cm userspace
+subcommand for more information.
+.Pp
+Unprivileged users can access only their own space usage.
+The root user, or a user who has been granted the
+.Sy userused
+privilege with
+.Nm zfs Cm allow ,
+can access everyone's usage.
+.Pp
+The
+.Sy userused Ns @ Ns Em ...
+properties are not displayed by
+.Nm zfs Cm get Sy all .
+The user's name must be appended after the @ symbol, using one of the
+following forms:
+.Bl -bullet -width ""
+.It
+.Em POSIX name
+.Po for example,
+.Sy joe
+.Pc
+.It
+.Em POSIX numeric ID
+.Po for example,
+.Sy 789
+.Pc
+.It
+.Em SID name
+.Po for example,
+.Sy joe.smith@mydomain
+.Pc
+.It
+.Em SID numeric ID
+.Po for example,
+.Sy S-1-123-456-789
+.Pc
+.El
+.Pp
+Files created on Linux always have POSIX owners.
+.It Sy userobjused Ns @ Ns Em user
+The
+.Sy userobjused
+property is similar to
+.Sy userused
+but instead it counts the number of objects consumed by a user.
+This property counts all objects allocated on behalf of the user; it may
+differ from the results of system tools such as
+.Nm df Fl i .
+.Pp
+When the property
+.Sy xattr=on
+is set on a file system, additional objects will be created per file to store
+extended attributes.
+These additional objects are reflected in the
+.Sy userobjused
+value and are counted against the user's
+.Sy userobjquota .
+When a file system is configured to use
+.Sy xattr=sa ,
+no additional internal objects are normally required.
+.It Sy userrefs
+This property is set to the number of user holds on this snapshot.
+User holds are set by using the
+.Nm zfs Cm hold
+command.
+.It Sy groupused Ns @ Ns Em group
+The amount of space consumed by the specified group in this dataset.
+Space is charged to the group of each file, as displayed by
+.Nm ls Fl l .
+See the
+.Sy userused Ns @ Ns Em user
+property for more information.
+.Pp
+Unprivileged users can only access their own groups' space usage.
+The root user, or a user who has been granted the
+.Sy groupused
+privilege with
+.Nm zfs Cm allow ,
+can access all groups' usage.
+.It Sy groupobjused Ns @ Ns Em group
+The number of objects consumed by the specified group in this dataset.
+Multiple objects may be charged to the group for each file when extended
+attributes are in use.
+See the
+.Sy userobjused Ns @ Ns Em user
+property for more information.
+.Pp
+Unprivileged users can only access their own groups' space usage.
+The root user, or a user who has been granted the
+.Sy groupobjused
+privilege with
+.Nm zfs Cm allow ,
+can access all groups' usage.
+.It Sy projectused Ns @ Ns Em project
+The amount of space consumed by the specified project in this dataset.
+A project is identified via the project identifier (ID), a numerical attribute
+that is stored per object.
+An object can inherit the project ID from its parent object at creation time,
+if the parent object has the project ID inheritance flag set; that flag can be
+set and changed via
+.Nm chattr Fl /+P
+or
+.Nm zfs project Fl s .
+A privileged user can set and change an object's project ID via
+.Nm chattr Fl p
+or
+.Nm zfs project Fl s
+at any time.
+Space is charged to the project of each file, as displayed by
+.Nm lsattr Fl p
+or
+.Nm zfs project .
+See the
+.Sy userused Ns @ Ns Em user
+property for more information.
+.Pp
+The root user, or a user who has been granted the
+.Sy projectused
+privilege with
+.Nm zfs allow ,
+can access all projects' usage.
+.It Sy projectobjused Ns @ Ns Em project
+The
+.Sy projectobjused
+is similar to
+.Sy projectused
+but instead it counts the number of objects consumed by the project.
+When the property
+.Sy xattr=on
+is set on a file system, ZFS will create additional objects per file to store
+extended attributes.
+These additional objects are reflected in the
+.Sy projectobjused
+value and are counted against the project's
+.Sy projectobjquota .
+When a filesystem is configured to use
+.Sy xattr=sa ,
+no additional internal objects are required.
+See the
+.Sy userobjused Ns @ Ns Em user
+property for more information.
+.Pp
+The root user, or a user who has been granted the
+.Sy projectobjused
+privilege with
+.Nm zfs allow ,
+can access all projects' objects usage.
+.It Sy volblocksize
+For volumes, specifies the block size of the volume.
+The
+.Sy blocksize
+cannot be changed once the volume has been written, so it should be set at
+volume creation time.
+The default
+.Sy blocksize
+for volumes is 8 Kbytes.
+Any power of 2 from 512 bytes to 128 Kbytes is valid.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy volblock .
+.It Sy written
+The amount of space
+.Sy referenced
+by this dataset, that was written since the previous snapshot
+.Pq i.e. that is not referenced by the previous snapshot .
+.It Sy written Ns @ Ns Em snapshot
+The amount of
+.Sy referenced
+space written to this dataset since the specified snapshot.
+This is the space that is referenced by this dataset but was not referenced by
+the specified snapshot.
+.Pp
+The
+.Em snapshot
+may be specified as a short snapshot name
+.Po just the part after the
+.Sy @
+.Pc ,
+in which case it will be interpreted as a snapshot in the same filesystem as
+this dataset.
+The
+.Em snapshot
+may be a full snapshot name
+.Po Em filesystem Ns @ Ns Em snapshot Pc ,
+which for clones may be a snapshot in the origin's filesystem
+.Pq or the origin of the origin's filesystem, etc.
+.El
+.Pp
+The following native properties can be used to change the behavior of a ZFS
+dataset.
+.Bl -tag -width ""
+.It Xo
+.Sy aclinherit Ns = Ns Sy discard Ns | Ns Sy noallow Ns | Ns
+.Sy restricted Ns | Ns Sy passthrough Ns | Ns Sy passthrough-x
+.Xc
+Controls how ACEs are inherited when files and directories are created.
+.Bl -tag -width "passthrough-x"
+.It Sy discard
+does not inherit any ACEs.
+.It Sy noallow
+only inherits inheritable ACEs that specify
+.Qq deny
+permissions.
+.It Sy restricted
+default, removes the
+.Sy write_acl
+and
+.Sy write_owner
+permissions when the ACE is inherited.
+.It Sy passthrough
+inherits all inheritable ACEs without any modifications.
+.It Sy passthrough-x
+same meaning as
+.Sy passthrough ,
+except that the
+.Sy owner@ ,
+.Sy group@ ,
+and
+.Sy everyone@
+ACEs inherit the execute permission only if the file creation mode also
+requests the execute bit.
+.El
+.Pp
+When the property value is set to
+.Sy passthrough ,
+files are created with a mode determined by the inheritable ACEs.
+If no inheritable ACEs exist that affect the mode, then the mode is set in
+accordance with the requested mode from the application.
+.Pp
+The
+.Sy aclinherit
+property does not apply to POSIX ACLs.
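+.Pp
+For example, to have newly created files and directories inherit all
+inheritable ACEs unmodified on a hypothetical file system:
+.Bd -literal
+# zfs set aclinherit=passthrough pool/fs
+.Ed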
+.It Xo
+.Sy aclmode Ns = Ns Sy discard Ns | Ns Sy groupmask Ns | Ns
+.Sy passthrough Ns | Ns Sy restricted Ns
+.Xc
+Controls how an ACL is modified during
+.Xr chmod 2
+and how inherited ACEs are modified by the file creation mode.
+.Bl -tag -width "passthrough"
+.It Sy discard
+default, deletes all
+.Sy ACEs
+except for those representing the mode of the file or directory requested by
+.Xr chmod 2 .
+.It Sy groupmask
+reduces permissions granted in all
+.Sy ALLOW
+entries found in the
+.Sy ACL
+such that they are no greater than the group permissions specified by
+.Xr chmod 2 .
+.It Sy passthrough
+indicates that no changes are made to the
+.Tn ACL
+other than creating or updating the necessary
+.Tn ACL
+entries to represent the new mode of the file or directory.
+.It Sy restricted
+will cause the
+.Xr chmod 2
+operation to return an error when used on any file or directory which has
+a non-trivial
+.Tn ACL
+whose entries cannot be represented by a mode.
+.Xr chmod 2
+is required to change the set user ID, set group ID, or sticky bits on a file
+or directory, as they do not have equivalent
+.Tn ACL
+entries.
+In order to use
+.Xr chmod 2
+on a file or directory with a non-trivial
+.Tn ACL
+when
+.Sy aclmode
+is set to
+.Sy restricted ,
+you must first remove all
+.Tn ACL
+entries which do not represent the current mode.
+.El
+.It Sy acltype Ns = Ns Sy off Ns | Ns Sy noacl Ns | Ns Sy posixacl
+Controls whether ACLs are enabled and if so what type of ACL to use.
+This property is not visible on FreeBSD yet.
+.Bl -tag -width "posixacl"
+.It Sy off
+default, when a file system has the
+.Sy acltype
+property set to off then ACLs are disabled.
+.It Sy noacl
+an alias for
+.Sy off
+.It Sy posixacl
+indicates POSIX ACLs should be used.
+POSIX ACLs are specific to Linux and are not functional on other platforms.
+POSIX ACLs are stored as an extended attribute and therefore will not
+overwrite any existing NFSv4 ACLs which may be set.
+.El
+.Pp
+To obtain the best performance when setting
+.Sy posixacl ,
+users are strongly encouraged to set the
+.Sy xattr=sa
+property.
+This will result in the POSIX ACL being stored more efficiently on disk.
+However, as a consequence, all new extended attributes will only be accessible
+from OpenZFS implementations which support the
+.Sy xattr=sa
+property.
+See the
+.Sy xattr
+property for more details.
+.It Sy atime Ns = Ns Sy on Ns | Ns Sy off
+Controls whether the access time for files is updated when they are read.
+Turning this property off avoids producing write traffic when reading files
+and can result in significant performance gains, though it might confuse
+mailers and other similar utilities.
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy atime
+and
+.Sy noatime
+mount options.
+The default value is
+.Sy on .
+See also
+.Sy relatime
+below.
+.It Sy canmount Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy noauto
+If this property is set to
+.Sy off ,
+the file system cannot be mounted, and is ignored by
+.Nm zfs Cm mount Fl a .
+Setting this property to
+.Sy off
+is similar to setting the
+.Sy mountpoint
+property to
+.Sy none ,
+except that the dataset still has a normal
+.Sy mountpoint
+property, which can be inherited.
+Setting this property to
+.Sy off
+allows datasets to be used solely as a mechanism to inherit properties.
+One example of setting
+.Sy canmount Ns = Ns Sy off
+is to have two datasets with the same
+.Sy mountpoint ,
+so that the children of both datasets appear in the same directory, but might
+have different inherited characteristics.
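+.Pp
+A sketch of that arrangement, with hypothetical dataset names:
+.Bd -literal
+# zfs create -o canmount=off -o mountpoint=/export rpool/a
+# zfs create -o canmount=off -o mountpoint=/export rpool/b
+# zfs create rpool/a/docs
+# zfs create rpool/b/media
+.Ed
+.Pp
+Here
+.Em rpool/a/docs
+and
+.Em rpool/b/media
+inherit mount points of
+.Pa /export/docs
+and
+.Pa /export/media
+respectively, so the children of both datasets appear under
+.Pa /export
+while neither parent dataset is itself mounted.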
+.Pp
+When set to
+.Sy noauto ,
+a dataset can only be mounted and unmounted explicitly.
+The dataset is not mounted automatically when the dataset is created or
+imported, nor is it mounted by the
+.Nm zfs Cm mount Fl a
+command or unmounted by the
+.Nm zfs Cm unmount Fl a
+command.
+.Pp
+This property is not inherited.
+.It Xo
+.Sy checksum Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy fletcher2 Ns | Ns
+.Sy fletcher4 Ns | Ns Sy sha256 Ns | Ns Sy noparity Ns | Ns
+.Sy sha512 Ns | Ns Sy skein Ns | Ns Sy edonr
+.Xc
+Controls the checksum used to verify data integrity.
+The default value is
+.Sy on ,
+which automatically selects an appropriate algorithm
+.Po currently,
+.Sy fletcher4 ,
+but this may change in future releases
+.Pc .
+The value
+.Sy off
+disables integrity checking on user data.
+The value
+.Sy noparity
+not only disables integrity but also disables maintaining parity for user
+data.
+This setting is used internally by a dump device residing on a RAID-Z pool and
+should not be used by any other dataset.
+Disabling checksums is
+.Sy NOT
+a recommended practice.
+.Pp
+The
+.Sy sha512 ,
+.Sy skein ,
+and
+.Sy edonr
+checksum algorithms require enabling the appropriate features on the pool.
+FreeBSD does not support the
+.Sy edonr
+algorithm.
+.Pp
+Please see
+.Xr zpool-features 5
+for more information on these algorithms.
+.Pp
+Changing this property affects only newly-written data.
+.It Xo
+.Sy compression Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy gzip Ns | Ns
+.Sy gzip- Ns Em N Ns | Ns Sy lz4 Ns | Ns Sy lzjb Ns | Ns Sy zle Ns | Ns Sy zstd Ns | Ns
+.Sy zstd- Ns Em N Ns | Ns Sy zstd-fast Ns | Ns Sy zstd-fast- Ns Em N
+.Xc
+Controls the compression algorithm used for this dataset.
+.Pp
+Setting compression to
+.Sy on
+indicates that the current default compression algorithm should be used.
+The default balances compression and decompression speed with compression
+ratio, and is expected to work well on a wide variety of workloads.
+Unlike all other settings for this property,
+.Sy on
+does not select a fixed compression type.
+As new compression algorithms are added to ZFS and enabled on a pool, the
+default compression algorithm may change.
+The current default compression algorithm is either
+.Sy lzjb
+or, if the
+.Sy lz4_compress
+feature is enabled,
+.Sy lz4 .
+.Pp
+The
+.Sy lz4
+compression algorithm is a high-performance replacement for the
+.Sy lzjb
+algorithm.
+It features significantly faster compression and decompression, as well as a
+moderately higher compression ratio than
+.Sy lzjb ,
+but can only be used on pools with the
+.Sy lz4_compress
+feature set to
+.Sy enabled .
+See
+.Xr zpool-features 5
+for details on ZFS feature flags and the
+.Sy lz4_compress
+feature.
+.Pp
+The
+.Sy lzjb
+compression algorithm is optimized for performance while providing decent data
+compression.
+.Pp
+The
+.Sy gzip
+compression algorithm uses the same compression as the
+.Xr gzip 1
+command.
+You can specify the
+.Sy gzip
+level by using the value
+.Sy gzip- Ns Em N ,
+where
+.Em N
+is an integer from 1
+.Pq fastest
+to 9
+.Pq best compression ratio .
+Currently,
+.Sy gzip
+is equivalent to
+.Sy gzip-6
+.Po which is also the default for
+.Xr gzip 1
+.Pc .
+.Pp
+The
+.Sy zstd
+compression algorithm provides both high compression ratios and good
+performance.
+You can specify the
+.Sy zstd
+level by using the value
+.Sy zstd- Ns Em N ,
+where
+.Em N
+is an integer from 1
+.Pq fastest
+to 19
+.Pq best compression ratio .
+.Sy zstd
+is equivalent to
+.Sy zstd-3 .
+.Pp
+Faster speeds at the cost of the compression ratio can be requested by
+setting a negative
+.Sy zstd
+level.
+This is done using
+.Sy zstd-fast- Ns Em N ,
+where
+.Em N
+is an integer in [1-9,10,20,30,...,100,500,1000], which maps to a negative
+.Sy zstd
+level.
+The lower the level, the faster the compression; 1000 provides the fastest
+compression and the lowest compression ratio.
+.Sy zstd-fast
+is equivalent to
+.Sy zstd-fast-1 .
+.Pp
+The
+.Sy zle
+compression algorithm compresses runs of zeros.
+.Pp
+This property can also be referred to by its shortened column name
+.Sy compress .
+Changing this property affects only newly-written data.
+.Pp
+When any setting except
+.Sy off
+is selected, compression will explicitly check for blocks consisting of only
+zeroes (the NUL byte).
+When a zero-filled block is detected, it is stored as a hole and not
+compressed using the indicated compression algorithm.
+.Pp
+Any block being compressed must be no larger than 7/8 of its original size
+after compression; otherwise the compression will not be considered worthwhile
+and the block is saved uncompressed.
+Note that when the logical block is less than 8 times the disk sector size,
+this effectively reduces the necessary compression ratio; for example, 8k
+blocks on disks with 4k disk sectors must compress to 1/2 or less of their
+original size.
+.It Xo
+.Sy context Ns = Ns Sy none Ns | Ns
+.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level
+.Xc
+This flag sets the SELinux context for all files in the file system under
+a mount point for that file system.
+See
+.Xr selinux 8
+for more information.
+.It Xo
+.Sy fscontext Ns = Ns Sy none Ns | Ns
+.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level
+.Xc
+This flag sets the SELinux context for the file system being mounted.
+See
+.Xr selinux 8
+for more information.
+.It Xo
+.Sy defcontext Ns = Ns Sy none Ns | Ns
+.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level
+.Xc
+This flag sets the SELinux default context for unlabeled files.
+See
+.Xr selinux 8
+for more information.
+.It Xo
+.Sy rootcontext Ns = Ns Sy none Ns | Ns
+.Em SELinux_User:SElinux_Role:Selinux_Type:Sensitivity_Level
+.Xc
+This flag sets the SELinux context for the root inode of the file system.
+See
+.Xr selinux 8
+for more information.
+.It Sy copies Ns = Ns Sy 1 Ns | Ns Sy 2 Ns | Ns Sy 3
+Controls the number of copies of data stored for this dataset.
+These copies are in addition to any redundancy provided by the pool, for
+example, mirroring or RAID-Z.
+The copies are stored on different disks, if possible.
+The space used by multiple copies is charged to the associated file and
+dataset, changing the
+.Sy used
+property and counting against quotas and reservations.
+.Pp
+Changing this property only affects newly-written data.
+Therefore, set this property at file system creation time by using the
+.Fl o Sy copies Ns = Ns Ar N
+option.
+.Pp
+Remember that ZFS will not import a pool with a missing top-level vdev.
+Do
+.Sy NOT
+create, for example, a two-disk striped pool and set
+.Sy copies=2
+on some datasets thinking you have set up redundancy for them.
+When a disk fails, you will not be able to import the pool, and you will have
+lost all of your data.
+.Pp
+Encrypted datasets may not have
+.Sy copies Ns = Ns Em 3
+since the implementation stores some encryption metadata where the third copy
+would normally be.
+.It Sy devices Ns = Ns Sy on Ns | Ns Sy off
+Controls whether device nodes can be opened on this file system.
+The default value is
+.Sy on .
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy dev
+and
+.Sy nodev
+mount options.
+.It Xo
+.Sy dedup Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy verify Ns | Ns
+.Sy sha256[,verify] Ns | Ns Sy sha512[,verify] Ns | Ns Sy skein[,verify] Ns | Ns
+.Sy edonr,verify
+.Xc
+Configures deduplication for a dataset.
+The default value is
+.Sy off .
+The default deduplication checksum is
+.Sy sha256
+(this may change in the future).
+When
+.Sy dedup
+is enabled, the checksum defined here overrides the
+.Sy checksum
+property.
+Setting the value to
+.Sy verify
+has the same effect as setting
+.Sy sha256,verify .
+.Pp
+If set to
+.Sy verify ,
+ZFS will do a byte-by-byte comparison of any two blocks that have the same
+signature to make sure the block contents are identical.
+Specifying
+.Sy verify
+is mandatory for the
+.Sy edonr
+algorithm.
+.Pp
+Unless necessary, deduplication should NOT be enabled on a system.
+See the
+.Em Deduplication
+section of
+.Xr zfsconcepts 8 .
+.It Xo
+.Sy dnodesize Ns = Ns Sy legacy Ns | Ns Sy auto Ns | Ns Sy 1k Ns | Ns
+.Sy 2k Ns | Ns Sy 4k Ns | Ns Sy 8k Ns | Ns Sy 16k
+.Xc
+Specifies a compatibility mode or literal value for the size of dnodes in the
+file system.
+The default value is
+.Sy legacy .
+Setting this property to a value other than
+.Sy legacy
+requires the large_dnode pool feature to be enabled.
+.Pp
+Consider setting
+.Sy dnodesize
+to
+.Sy auto
+if the dataset uses the
+.Sy xattr=sa
+property setting and the workload makes heavy use of extended attributes.
+This may be applicable to SELinux-enabled systems, Lustre servers, and Samba
+servers, for example.
+Literal values are supported for cases where the optimal size is known in
+advance and for performance testing.
+.Pp
+Leave
+.Sy dnodesize
+set to
+.Sy legacy
+if you need to receive a send stream of this dataset on a pool that doesn't
+enable the large_dnode feature, or if you need to import this pool on a system
+that doesn't support the large_dnode feature.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy dnsize .
+.It Xo
+.Sy encryption Ns = Ns Sy off Ns | Ns Sy on Ns | Ns Sy aes-128-ccm Ns | Ns
+.Sy aes-192-ccm Ns | Ns Sy aes-256-ccm Ns | Ns Sy aes-128-gcm Ns | Ns
+.Sy aes-192-gcm Ns | Ns Sy aes-256-gcm
+.Xc
+Controls the encryption cipher suite (block cipher, key length, and mode) used
+for this dataset.
+Requires the
+.Sy encryption
+feature to be enabled on the pool.
+Requires a
+.Sy keyformat
+to be set at dataset creation time.
+.Pp
+Selecting
+.Sy encryption Ns = Ns Sy on
+when creating a dataset indicates that the default encryption suite will be
+selected, which is currently
+.Sy aes-256-gcm .
+In order to provide consistent data protection, encryption must be specified
+at dataset creation time and it cannot be changed afterwards.
+.Pp
+For more details and caveats about encryption see the
+.Sy Encryption
+section.
+.It Sy keyformat Ns = Ns Sy raw Ns | Ns Sy hex Ns | Ns Sy passphrase
+Controls the format in which the user's encryption key will be provided.
+This property is only set when the dataset is encrypted.
+.Pp
+Raw keys and hex keys must be 32 bytes long (regardless of the chosen
+encryption suite) and must be randomly generated.
+A raw key can be generated with the following command:
+.Bd -literal
+# dd if=/dev/urandom of=/path/to/output/key bs=32 count=1
+.Ed
+.Pp
+Passphrases must be between 8 and 512 bytes long and will be processed through
+PBKDF2 before being used (see the
+.Sy pbkdf2iters
+property).
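+.Pp
+For example, a passphrase-encrypted file system
+.Pq dataset name hypothetical
+could be created with:
+.Bd -literal
+# zfs create -o encryption=on -o keyformat=passphrase pool/secret
+.Ed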
+Even though the encryption suite cannot be changed after dataset creation,
+the keyformat can be changed with
+.Nm zfs Cm change-key .
+.It Xo
+.Sy keylocation Ns = Ns Sy prompt Ns | Ns Sy file:// Ns Em </absolute/file/path>
+.Xc
+Controls where the user's encryption key will be loaded from by default for
+commands such as
+.Nm zfs Cm load-key
+and
+.Nm zfs Cm mount Cm -l .
+This property is only set for encrypted datasets which are encryption roots.
+If unspecified, the default is
+.Sy prompt .
+.Pp
+Even though the encryption suite cannot be changed after dataset creation, the
+keylocation can be changed with either
+.Nm zfs Cm set
+or
+.Nm zfs Cm change-key .
+If
+.Sy prompt
+is selected, ZFS will ask for the key at the command prompt when it is
+required to access the encrypted data (see
+.Nm zfs Cm load-key
+for details).
+This setting will also allow the key to be passed in via STDIN, but users
+should be careful not to place keys which should be kept secret on the
+command line.
+If a file URI is selected, the key will be loaded from the specified absolute
+file path.
+.It Sy pbkdf2iters Ns = Ns Ar iterations
+Controls the number of PBKDF2 iterations that a
+.Sy passphrase
+encryption key should be run through when processing it into an encryption
+key.
+This property is only defined when encryption is enabled and a keyformat of
+.Sy passphrase
+is selected.
+The goal of PBKDF2 is to significantly increase the computational difficulty
+needed to brute force a user's passphrase.
+This is accomplished by forcing the attacker to run each passphrase through a
+computationally expensive hashing function many times before they arrive at
+the resulting key.
+A user who actually knows the passphrase will only have to pay this cost once.
+As CPUs become better at processing, this number should be raised to ensure
+that a brute force attack is still not possible.
+The current default is
+.Sy 350000
+and the minimum is
+.Sy 100000 .
+This property may be changed with
+.Nm zfs Cm change-key .
+.It Sy exec Ns = Ns Sy on Ns | Ns Sy off
+Controls whether processes can be executed from within this file system.
+The default value is
+.Sy on .
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy exec
+and
+.Sy noexec
+mount options.
+.It Sy filesystem_limit Ns = Ns Em count Ns | Ns Sy none
+Limits the number of filesystems and volumes that can exist under this point
+in the dataset tree.
+The limit is not enforced if the user is allowed to change the limit.
+Setting a
+.Sy filesystem_limit
+on a descendent of a filesystem that already has a
+.Sy filesystem_limit
+does not override the ancestor's
+.Sy filesystem_limit ,
+but rather imposes an additional limit.
+This feature must be enabled to be used
+.Po see
+.Xr zpool-features 5
+.Pc .
+.It Sy special_small_blocks Ns = Ns Em size
+This value represents the threshold block size for including small file
+blocks into the special allocation class.
+Blocks smaller than or equal to this value will be assigned to the special
+allocation class while larger blocks will be assigned to the regular class.
+Valid values are zero or a power of two from 512B up to 1M.
+The default size is 0, which means no small file blocks will be allocated in
+the special class.
+.Pp
+Before setting this property, a special class vdev must be added to the pool.
+See
+.Xr zpool 8
+for more details on the special allocation class.
+.It Sy mountpoint Ns = Ns Pa path Ns | Ns Sy none Ns | Ns Sy legacy
+Controls the mount point used for this file system.
+See the
+.Em Mount Points
+section of
+.Xr zfsconcepts 8
+for more information on how this property is used.
+.Pp
+When the
+.Sy mountpoint
+property is changed for a file system, the file system and any children that
+inherit the mount point are unmounted.
+If the new value is
+.Sy legacy ,
+then they remain unmounted.
+Otherwise, they are automatically remounted in the new location if the
+property was previously
+.Sy legacy
+or
+.Sy none ,
+or if they were mounted before the property was changed.
+In addition, any shared file systems are unshared and shared in the new
+location.
+.It Sy nbmand Ns = Ns Sy on Ns | Ns Sy off
+Controls whether the file system should be mounted with
+.Sy nbmand
+.Pq non-blocking mandatory locks .
+This is used for SMB clients.
+Changes to this property only take effect when the file system is unmounted
+and remounted.
+See
+.Xr mount 8
+for more information on
+.Sy nbmand
+mounts.
+This property is not used on Linux.
+.It Sy overlay Ns = Ns Sy on Ns | Ns Sy off
+Allows mounting on a busy directory or a directory which already contains
+files or directories.
+This is the default mount behavior for Linux and FreeBSD file systems.
+On these platforms, the property is
+.Sy on
+by default.
+Set to
+.Sy off
+to disable overlay mounts for consistency with OpenZFS on other platforms.
+.It Sy primarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata
+Controls what is cached in the primary cache
+.Pq ARC .
+If this property is set to
+.Sy all ,
+then both user data and metadata are cached.
+If this property is set to
+.Sy none ,
+then neither user data nor metadata is cached.
+If this property is set to
+.Sy metadata ,
+then only metadata is cached.
+The default value is
+.Sy all .
+.It Sy quota Ns = Ns Em size Ns | Ns Sy none
+Limits the amount of space a dataset and its descendents can consume.
+This property enforces a hard limit on the amount of space used.
+This includes all space consumed by descendents, including file systems and
+snapshots.
+Setting a quota on a descendent of a dataset that already has a quota does not
+override the ancestor's quota, but rather imposes an additional limit.
+.Pp
+Quotas cannot be set on volumes, as the
+.Sy volsize
+property acts as an implicit quota.
+.It Sy snapshot_limit Ns = Ns Em count Ns | Ns Sy none
+Limits the number of snapshots that can be created on a dataset and its
+descendents.
+Setting a
+.Sy snapshot_limit
+on a descendent of a dataset that already has a
+.Sy snapshot_limit
+does not override the ancestor's
+.Sy snapshot_limit ,
+but rather imposes an additional limit.
+The limit is not enforced if the user is allowed to change the limit.
+For example, this means that recursive snapshots taken from the global zone
+are counted against each delegated dataset within a zone.
+This feature must be enabled to be used
+.Po see
+.Xr zpool-features 5
+.Pc .
+.It Sy userquota@ Ns Em user Ns = Ns Em size Ns | Ns Sy none
+Limits the amount of space consumed by the specified user.
+User space consumption is identified by the
+.Sy userspace@ Ns Em user
+property.
+.Pp
+Enforcement of user quotas may be delayed by several seconds.
+This delay means that a user might exceed their quota before the system
+notices that they are over quota and begins to refuse additional writes with
+the
+.Er EDQUOT
+error message.
+See the
+.Nm zfs Cm userspace
+subcommand for more information.
+.Pp
+Unprivileged users can only access their own space usage.
+The root user, or a user who has been granted the
+.Sy userquota
+privilege with
+.Nm zfs Cm allow ,
+can get and set everyone's quota.
+.Pp
+This property is not available on volumes, on file systems before version 4,
+or on pools before version 15.
+The
+.Sy userquota@ Ns Em ...
+properties are not displayed by
+.Nm zfs Cm get Sy all .
+The user's name must be appended after the
+.Sy @
+symbol, using one of the following forms:
+.Bl -bullet
+.It
+.Em POSIX name
+.Po for example,
+.Sy joe
+.Pc
+.It
+.Em POSIX numeric ID
+.Po for example,
+.Sy 789
+.Pc
+.It
+.Em SID name
+.Po for example,
+.Sy joe.smith@mydomain
+.Pc
+.It
+.Em SID numeric ID
+.Po for example,
+.Sy S-1-123-456-789
+.Pc
+.El
+.Pp
+Files created on Linux always have POSIX owners.
+.It Sy userobjquota@ Ns Em user Ns = Ns Em size Ns | Ns Sy none
+The
+.Sy userobjquota
+is similar to
+.Sy userquota
+but it limits the number of objects a user can create.
+Please refer to
+.Sy userobjused
+for more information about how objects are counted.
+.It Sy groupquota@ Ns Em group Ns = Ns Em size Ns | Ns Sy none
+Limits the amount of space consumed by the specified group.
+Group space consumption is identified by the
+.Sy groupused@ Ns Em group
+property.
+.Pp
+Unprivileged users can access only their own groups' space usage.
+The root user, or a user who has been granted the
+.Sy groupquota
+privilege with
+.Nm zfs Cm allow ,
+can get and set all groups' quotas.
+.It Sy groupobjquota@ Ns Em group Ns = Ns Em size Ns | Ns Sy none
+The
+.Sy groupobjquota
+is similar to
+.Sy groupquota
+but it limits the number of objects a group can consume.
+Please refer to
+.Sy userobjused
+for more information about how objects are counted.
+.It Sy projectquota@ Ns Em project Ns = Ns Em size Ns | Ns Sy none
+Limits the amount of space consumed by the specified project.
+Project space consumption is identified by the
+.Sy projectused@ Ns Em project
+property.
+Please refer to
+.Sy projectused
+for more information about how a project is identified and how its ID is set
+or changed.
+.Pp
+The root user, or a user who has been granted the
+.Sy projectquota
+privilege with
+.Nm zfs allow ,
+can access all projects' quotas.
+.It Sy projectobjquota@ Ns Em project Ns = Ns Em size Ns | Ns Sy none
+The
+.Sy projectobjquota
+is similar to
+.Sy projectquota
+but it limits the number of objects a project can consume.
+Please refer to
+.Sy userobjused
+for more information about how objects are counted.
+.It Sy readonly Ns = Ns Sy on Ns | Ns Sy off
+Controls whether this dataset can be modified.
+The default value is
+.Sy off .
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy ro
+and
+.Sy rw
+mount options.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy rdonly .
+.It Sy recordsize Ns = Ns Em size
+Specifies a suggested block size for files in the file system.
+This property is designed solely for use with database workloads that access
+files in fixed-size records.
+ZFS automatically tunes block sizes according to internal algorithms optimized
+for typical access patterns.
+.Pp
+For databases that create very large files but access them in small random
+chunks, these algorithms may be suboptimal.
+Specifying a
+.Sy recordsize
+greater than or equal to the record size of the database can result in
+significant performance gains.
+Use of this property for general purpose file systems is strongly discouraged,
+and may adversely affect performance.
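+.Pp
+For example, a dataset for a database with a fixed 16K record size
+.Pq names hypothetical
+might be created with a matching
+.Sy recordsize :
+.Bd -literal
+# zfs create -o recordsize=16K pool/db
+.Ed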
+.Pp
+The size specified must be a power of two greater than or equal to 512 and
+less than or equal to 128 Kbytes.
+If the
+.Sy large_blocks
+feature is enabled on the pool, the size may be up to 1 Mbyte.
+See
+.Xr zpool-features 5
+for details on ZFS feature flags.
+.Pp
+Changing the file system's
+.Sy recordsize
+affects only files created afterward; existing files are unaffected.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy recsize .
+.It Sy redundant_metadata Ns = Ns Sy all Ns | Ns Sy most
+Controls what types of metadata are stored redundantly.
+ZFS stores an extra copy of metadata, so that if a single block is corrupted,
+the amount of user data lost is limited.
+This extra copy is in addition to any redundancy provided at the pool level
+.Pq e.g. by mirroring or RAID-Z ,
+and is in addition to an extra copy specified by the
+.Sy copies
+property
+.Pq up to a total of 3 copies .
+For example, if the pool is mirrored,
+.Sy copies Ns = Ns 2 ,
+and
+.Sy redundant_metadata Ns = Ns Sy most ,
+then ZFS stores 6 copies of most metadata, and 4 copies of data and some
+metadata.
+.Pp
+When set to
+.Sy all ,
+ZFS stores an extra copy of all metadata.
+If a single on-disk block is corrupt, at worst a single block of user data
+.Po which is
+.Sy recordsize
+bytes long
+.Pc
+can be lost.
+.Pp
+When set to
+.Sy most ,
+ZFS stores an extra copy of most types of metadata.
+This can improve performance of random writes, because less metadata must be
+written.
+In practice, at worst about 100 blocks
+.Po of
+.Sy recordsize
+bytes each
+.Pc
+of user data can be lost if a single on-disk block is corrupt.
+The exact behavior of which metadata blocks are stored redundantly may change
+in future releases.
+.Pp
+The default value is
+.Sy all .
+.It Sy refquota Ns = Ns Em size Ns | Ns Sy none
+Limits the amount of space a dataset can consume.
+This property enforces a hard limit on the amount of space used.
+This hard limit does not include space used by descendents, including file
+systems and snapshots.
+.It Sy refreservation Ns = Ns Em size Ns | Ns Sy none Ns | Ns Sy auto
+The minimum amount of space guaranteed to a dataset, not including its
+descendents.
+When the amount of space used is below this value, the dataset is treated as
+if it were taking up the amount of space specified by
+.Sy refreservation .
+The
+.Sy refreservation
+reservation is accounted for in the parent datasets' space used, and counts
+against the parent datasets' quotas and reservations.
+.Pp
+If
+.Sy refreservation
+is set, a snapshot is only allowed if there is enough free pool space outside
+of this reservation to accommodate the current number of
+.Qq referenced
+bytes in the dataset.
+.Pp
+If
+.Sy refreservation
+is set to
+.Sy auto ,
+a volume is thick provisioned
+.Po or
+.Qq not sparse
+.Pc .
+.Sy refreservation Ns = Ns Sy auto
+is only supported on volumes.
+See
+.Sy volsize
+in the
+.Sx Native Properties
+section for more information about sparse volumes.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy refreserv .
+.It Sy relatime Ns = Ns Sy on Ns | Ns Sy off
+Controls the manner in which the access time is updated when
+.Sy atime=on
+is set.
+Turning this property on causes the access time to be updated relative to the
+modify or change time.
+Access time is only updated if the previous access time was earlier than the
+current modify or change time or if the existing access time hasn't been
+updated within the past 24 hours.
+The default value is
+.Sy off .
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy relatime
+and
+.Sy norelatime
+mount options.
+.It Sy reservation Ns = Ns Em size Ns | Ns Sy none
+The minimum amount of space guaranteed to a dataset and its descendants.
+When the amount of space used is below this value, the dataset is treated as
+if it were taking up the amount of space specified by its reservation.
+Reservations are accounted for in the parent datasets' space used, and count
+against the parent datasets' quotas and reservations.
+.Pp
+This property can also be referred to by its shortened column name,
+.Sy reserv .
+.It Sy secondarycache Ns = Ns Sy all Ns | Ns Sy none Ns | Ns Sy metadata
+Controls what is cached in the secondary cache
+.Pq L2ARC .
+If this property is set to
+.Sy all ,
+then both user data and metadata are cached.
+If this property is set to
+.Sy none ,
+then neither user data nor metadata is cached.
+If this property is set to
+.Sy metadata ,
+then only metadata is cached.
+The default value is
+.Sy all .
+.It Sy setuid Ns = Ns Sy on Ns | Ns Sy off
+Controls whether the setuid bit is respected for the file system.
+The default value is
+.Sy on .
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy suid
+and
+.Sy nosuid
+mount options.
+.It Sy sharesmb Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Em opts
+Controls whether the file system is shared by using
+.Sy Samba USERSHARES
+and what options are to be used.
+Otherwise, the file system is automatically shared and unshared with the
+.Nm zfs Cm share
+and
+.Nm zfs Cm unshare
+commands.
+If the property is set to
+.Sy on ,
+the
+.Xr net 8
+command is invoked to create a
+.Sy USERSHARE .
+.Pp
+Because SMB shares require a resource name, a unique resource name is
+constructed from the dataset name.
+The constructed name is a copy of the dataset name except that any characters
+in the dataset name which would be invalid in the resource name are replaced
+with underscore (_) characters.
+Linux does not currently support additional options which might be available
+on Solaris.
+.Pp
+If the
+.Sy sharesmb
+property is set to
+.Sy off ,
+the file systems are unshared.
+.Pp
+The share is created with the ACL (Access Control List) "Everyone:F" ("F"
+stands for "full permissions", i.e. read and write permissions) and no guest
+access (which means Samba must be able to authenticate a real user via system
+passwd/shadow, LDAP, or smbpasswd) by default.
+This means that any additional access control (such as disallowing access for
+specific users) must be done on the underlying file system.
+.It Sy sharenfs Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Em opts
+Controls whether the file system is shared via NFS, and what options are to be
+used.
+A file system with a
+.Sy sharenfs
+property of
+.Sy off
+is managed with the
+.Xr exportfs 8
+command and entries in the
+.Em /etc/exports
+file.
+Otherwise, the file system is automatically shared and unshared with the
+.Nm zfs Cm share
+and
+.Nm zfs Cm unshare
+commands.
+If the property is set to
+.Sy on ,
+the dataset is shared using the default options:
+.Pp
+.Em sec=sys,rw,crossmnt,no_subtree_check
+.Pp
+See
+.Xr exports 5
+for the meaning of the default options.
+Otherwise, the
+.Xr exportfs 8
+command is invoked with options equivalent to the contents of this property.
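+.Pp
+For example, a hypothetical file system could be shared with the default
+options, or with an explicit option string that is passed through to
+.Xr exportfs 8 :
+.Bd -literal
+# zfs set sharenfs=on tank/home
+# zfs set sharenfs=rw,crossmnt tank/home
+.Ed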
+.Pp
+When the
+.Sy sharenfs
+property is changed for a dataset, the dataset and any children inheriting the
+property are re-shared with the new options only if the property was
+previously
+.Sy off ,
+or if they were shared before the property was changed.
+If the new property is
+.Sy off ,
+the file systems are unshared.
+.It Sy logbias Ns = Ns Sy latency Ns | Ns Sy throughput
+Provide a hint to ZFS about handling of synchronous requests in this dataset.
+If
+.Sy logbias
+is set to
+.Sy latency
+.Pq the default ,
+ZFS will use pool log devices
+.Pq if configured
+to handle the requests at low latency.
+If
+.Sy logbias
+is set to
+.Sy throughput ,
+ZFS will not use configured pool log devices.
+ZFS will instead optimize synchronous operations for global pool throughput
+and efficient use of resources.
+.It Sy snapdev Ns = Ns Sy hidden Ns | Ns Sy visible
+Controls whether the volume snapshot devices under
+.Em /dev/zvol/
+are hidden or visible.
+The default value is
+.Sy hidden .
+.It Sy snapdir Ns = Ns Sy hidden Ns | Ns Sy visible
+Controls whether the
+.Pa .zfs
+directory is hidden or visible in the root of the file system as discussed in
+the
+.Em Snapshots
+section of
+.Xr zfsconcepts 8 .
+The default value is
+.Sy hidden .
+.It Sy sync Ns = Ns Sy standard Ns | Ns Sy always Ns | Ns Sy disabled
+Controls the behavior of synchronous requests
+.Pq e.g. fsync, O_DSYNC .
+.Sy standard
+is the
+.Tn POSIX
+specified behavior of ensuring all synchronous requests are written to stable
+storage and all devices are flushed to ensure data is not cached by device
+controllers
+.Pq this is the default .
+.Sy always
+causes every file system transaction to be written and flushed before its
+system call returns.
+This has a large performance penalty.
+.Sy disabled
+disables synchronous requests.
+File system transactions are only committed to stable storage periodically.
+This option will give the highest performance.
+However, it is very dangerous as ZFS would be ignoring the synchronous
+transaction demands of applications such as databases or NFS.
+Administrators should only use this option when the risks are understood.
+.It Sy version Ns = Ns Em N Ns | Ns Sy current
+The on-disk version of this file system, which is independent of the pool
+version.
+This property can only be set to later supported versions.
+See the
+.Nm zfs Cm upgrade
+command.
+.It Sy volsize Ns = Ns Em size
+For volumes, specifies the logical size of the volume.
+By default, creating a volume establishes a reservation of equal size.
+For storage pools with a version number of 9 or higher, a
+.Sy refreservation
+is set instead.
+Any changes to
+.Sy volsize
+are reflected in an equivalent change to the reservation
+.Po or
+.Sy refreservation
+.Pc .
+The
+.Sy volsize
+can only be set to a multiple of
+.Sy volblocksize ,
+and cannot be zero.
+.Pp
+The reservation is kept equal to the volume's logical size to prevent
+unexpected behavior for consumers.
+Without the reservation, the volume could run out of space, resulting in
+undefined behavior or data corruption, depending on how the volume is used.
+These effects can also occur when the volume size is changed while it is in
+use
+.Pq particularly when shrinking the size .
+Extreme care should be used when adjusting the volume size.
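+.Pp
+For example, a thick-provisioned 10 GB volume
+.Pq name hypothetical
+could be created with:
+.Bd -literal
+# zfs create -V 10G pool/vol0
+.Ed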
+.Pp
+Though not recommended, a
+.Qq sparse volume
+.Po also known as
+.Qq thin provisioned
+.Pc
+can be created by specifying the
+.Fl s
+option to the
+.Nm zfs Cm create Fl V
+command, or by changing the value of the
+.Sy refreservation
+property
+.Po or
+.Sy reservation
+property on pool version 8 or earlier
+.Pc
+after the volume has been created.
+A
+.Qq sparse volume
+is a volume where the value of
+.Sy refreservation
+is less than the size of the volume plus the space required to store its
+metadata.
+Consequently, writes to a sparse volume can fail with
+.Er ENOSPC
+when the pool is low on space.
+For a sparse volume, changes to
+.Sy volsize
+are not reflected in the
+.Sy refreservation .
+A volume that is not sparse is said to be
+.Qq thick provisioned .
+A sparse volume can become thick provisioned by setting
+.Sy refreservation
+to
+.Sy auto .
+.It Sy volmode Ns = Ns Cm default | full | geom | dev | none
+This property specifies how volumes should be exposed to the OS.
+Setting it to
+.Sy full
+exposes volumes as fully fledged block devices, providing maximal
+functionality.
+The value
+.Sy geom
+is just an alias for
+.Sy full
+and is kept for compatibility.
+Setting it to
+.Sy dev
+hides the volume's partitions.
+Volumes with this property set to
+.Sy none
+are not exposed outside ZFS, but can still be snapshotted, cloned, and
+replicated, which can make them suitable for backup purposes.
+The value
+.Sy default
+means that volume exposure is controlled by the system-wide tunable
+.Va zvol_volmode ,
+where
+.Sy full ,
+.Sy dev ,
+and
+.Sy none
+are encoded as 1, 2, and 3 respectively.
+The default value is
+.Sy full .
+.It Sy vscan Ns = Ns Sy on Ns | Ns Sy off
+Controls whether regular files should be scanned for viruses when a file is
+opened and closed.
+In addition to enabling this property, the virus scan service must also be
+enabled for virus scanning to occur.
+The default value is
+.Sy off .
+This property is not used on Linux.
+.It Sy xattr Ns = Ns Sy on Ns | Ns Sy off Ns | Ns Sy sa
+Controls whether extended attributes are enabled for this file system.
+Two styles of extended attributes are supported: directory based and system
+attribute based.
+.Pp
+The default value of
+.Sy on
+enables directory based extended attributes.
+This style of extended attribute imposes no practical limit on either the size
+or number of attributes which can be set on a file, although under Linux the
+.Xr getxattr 2
+and
+.Xr setxattr 2
+system calls limit the maximum size to 64K.
+This is the most compatible style of extended attribute and is supported by
+all OpenZFS implementations.
+.Pp
+System attribute based xattrs can be enabled by setting the value to
+.Sy sa .
+The key advantage of this type of xattr is improved performance.
+Storing extended attributes as system attributes significantly decreases the
+amount of disk IO required.
+Up to 64K of data may be stored per-file in the space reserved for system
+attributes.
+If there is not enough space available for an extended attribute, then it will
+be automatically written as a directory based xattr.
+System attribute based extended attributes are not accessible on platforms
+which do not support the
+.Sy xattr=sa
+feature.
+.Pp
+The use of system attribute based xattrs is strongly encouraged for users of
+SELinux or POSIX ACLs.
+Both of these features heavily rely on extended attributes and benefit
+significantly from the reduced access time.
+.Pp
+The values
+.Sy on
+and
+.Sy off
+are equivalent to the
+.Sy xattr
+and
+.Sy noxattr
+mount options.
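+.Pp
+For example, system attribute based xattrs could be enabled on a hypothetical
+dataset as follows
+.Pq an illustrative sketch only :
+.Bd -literal
+# zfs set xattr=sa tank/fs
+# zfs get xattr tank/fs
+.Ed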
+.It Sy jailed Ns = Ns Cm off | on
+Controls whether the dataset is managed from a jail.
+See the
+.Qq Sx Jails
+section in
+.Xr zfs 8
+for more information.
+Jails are a FreeBSD feature and are not relevant on other platforms.
+The default value is
+.Cm off .
+.It Sy zoned Ns = Ns Sy on Ns | Ns Sy off
+Controls whether the dataset is managed from a non-global zone.
+Zones are a Solaris feature and are not relevant on other platforms.
+The default value is
+.Sy off .
+.El
+.Pp
+The following three properties cannot be changed after the file system is
+created, and therefore should be set when the file system is created.
+If the properties are not set with the
+.Nm zfs Cm create
+or
+.Nm zpool Cm create
+commands, these properties are inherited from the parent dataset.
+If the parent dataset lacks these properties due to having been created prior to
+these features being supported, the new file system will have the default values
+for these properties.
+.Bl -tag -width ""
+.It Xo
+.Sy casesensitivity Ns = Ns Sy sensitive Ns | Ns
+.Sy insensitive Ns | Ns Sy mixed
+.Xc
+Indicates whether the file name matching algorithm used by the file system
+should be case-sensitive, case-insensitive, or allow a combination of both
+styles of matching.
+The default value for the
+.Sy casesensitivity
+property is
+.Sy sensitive .
+Traditionally,
+.Ux
+and
+.Tn POSIX
+file systems have case-sensitive file names.
+.Pp
+The
+.Sy mixed
+value for the
+.Sy casesensitivity
+property indicates that the file system can support requests for both
+case-sensitive and case-insensitive matching behavior.
+Currently, case-insensitive matching behavior on a file system that supports
+mixed behavior is limited to the SMB server product.
+For more information about the
+.Sy mixed
+value behavior, see the "ZFS Administration Guide".
+.It Xo
+.Sy normalization Ns = Ns Sy none Ns | Ns Sy formC Ns | Ns
+.Sy formD Ns | Ns Sy formKC Ns | Ns Sy formKD
+.Xc
+Indicates whether the file system should perform a
+.Sy unicode
+normalization of file names whenever two file names are compared, and which
+normalization algorithm should be used.
+File names are always stored unmodified; names are normalized as part of any
+comparison process.
+If this property is set to a legal value other than
+.Sy none ,
+and the
+.Sy utf8only
+property was left unspecified, the
+.Sy utf8only
+property is automatically set to
+.Sy on .
+The default value of the
+.Sy normalization
+property is
+.Sy none .
+This property cannot be changed after the file system is created.
+.It Sy utf8only Ns = Ns Sy on Ns | Ns Sy off
+Indicates whether the file system should reject file names that include
+characters that are not present in the
+.Sy UTF-8
+character code set.
+If this property is explicitly set to
+.Sy off ,
+the normalization property must either not be explicitly set or be set to
+.Sy none .
+The default value for the
+.Sy utf8only
+property is
+.Sy off .
+This property cannot be changed after the file system is created.
+.El
+.Pp
+The
+.Sy casesensitivity ,
+.Sy normalization ,
+and
+.Sy utf8only
+properties can also be assigned as permissions to non-privileged users by
+using the ZFS delegated administration feature.
+.Ss "Temporary Mount Point Properties"
+When a file system is mounted, either through
+.Xr mount 8
+for legacy mounts or the
+.Nm zfs Cm mount
+command for normal file systems, its mount options are set according to its
+properties.
+The correlation between properties and mount options is as follows: +.Bd -literal + PROPERTY MOUNT OPTION + atime atime/noatime + canmount auto/noauto + devices dev/nodev + exec exec/noexec + readonly ro/rw + relatime relatime/norelatime + setuid suid/nosuid + xattr xattr/noxattr +.Ed +.Pp +In addition, these options can be set on a per-mount basis using the +.Fl o +option, without affecting the property that is stored on disk. +The values specified on the command line override the values stored in the +dataset. +The +.Sy nosuid +option is an alias for +.Sy nodevices Ns \&, Ns Sy nosetuid . +These properties are reported as +.Qq temporary +by the +.Nm zfs Cm get +command. +If the properties are changed while the dataset is mounted, the new setting +overrides any temporary settings. +.Ss "User Properties" +In addition to the standard native properties, ZFS supports arbitrary user +properties. +User properties have no effect on ZFS behavior, but applications or +administrators can use them to annotate datasets +.Pq file systems, volumes, and snapshots . +.Pp +User property names must contain a colon +.Pq Qq Sy \&: +character to distinguish them from native properties. +They may contain lowercase letters, numbers, and the following punctuation +characters: colon +.Pq Qq Sy \&: , +dash +.Pq Qq Sy - , +period +.Pq Qq Sy \&. , +and underscore +.Pq Qq Sy _ . +The expected convention is that the property name is divided into two portions +such as +.Em module Ns \&: Ns Em property , +but this namespace is not enforced by ZFS. +User property names can be at most 256 characters, and cannot begin with a dash +.Pq Qq Sy - . +.Pp +When making programmatic use of user properties, it is strongly suggested to use +a reversed +.Sy DNS +domain name for the +.Em module +component of property names to reduce the chance that two +independently-developed packages use the same property name for different +purposes. +.Pp +The values of user properties are arbitrary strings, are always inherited, and +are never validated. +All of the commands that operate on properties +.Po Nm zfs Cm list , +.Nm zfs Cm get , +.Nm zfs Cm set , +and so forth +.Pc +can be used to manipulate both native properties and user properties. +Use the +.Nm zfs Cm inherit +command to clear a user property. +If the property is not defined in any parent dataset, it is removed entirely. +Property values are limited to 8192 bytes. diff --git a/man/man8/zgenhostid.8 b/man/man8/zgenhostid.8 new file mode 100644 index 000000000000..ff3d2d960b12 --- /dev/null +++ b/man/man8/zgenhostid.8 @@ -0,0 +1,71 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2017 by Lawrence Livermore National Security, LLC. 
+.\"
+.Dd September 16, 2017
+.Dt ZGENHOSTID 8 SMM
+.Os
+.Sh NAME
+.Nm zgenhostid
+.Nd generate and store a hostid in
+.Em /etc/hostid
+.Sh SYNOPSIS
+.Nm
+.Op Ar hostid
+.Sh DESCRIPTION
+If
+.Em /etc/hostid
+does not exist, create it and store a hostid in it.
+If the user provides
+.Ar hostid
+on the command line, store that value.
+Otherwise, randomly generate a value to store.
+.Pp
+This emulates the
+.Xr genhostid 1
+utility and is provided for use on systems which do not include the utility.
+.Sh OPTIONS
+.Op Ar hostid
+Specifies the value to be placed in
+.Em /etc/hostid .
+It must be a number with a value between 1 and 2^32-1.
+This value
+.Sy must
+be unique among your systems.
+It must be expressed in hexadecimal and be exactly 8 digits long.
+.Sh EXAMPLES
+.Bl -tag -width Ds
+.It Generate a random hostid and store it
+.Bd -literal
+# zgenhostid
+.Ed
+.It Record the libc-generated hostid in Em /etc/hostid
+.Bd -literal
+# zgenhostid $(hostid)
+.Ed
+.It Record a custom hostid (0xdeadbeef) in Em /etc/hostid
+.Bd -literal
+# zgenhostid deadbeef
+.Ed
+.El
+.Sh SEE ALSO
+.Xr genhostid 1 ,
+.Xr hostid 1 ,
+.Xr spl-module-parameters 5
diff --git a/man/man8/zinject.8 b/man/man8/zinject.8
new file mode 100644
index 000000000000..f02e78ca207e
--- /dev/null
+++ b/man/man8/zinject.8
@@ -0,0 +1,198 @@
+'\" t
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright 2013 Darik Horn . All rights reserved.
+.\"
+.TH zinject 8 "2013 FEB 28" "ZFS on Linux" "System Administration Commands"

+.SH NAME
+zinject \- ZFS Fault Injector
+.SH DESCRIPTION
+.BR zinject
+creates artificial problems in a ZFS pool by simulating data corruption or
+device failures.
+This program is dangerous.
+.SH SYNOPSIS
+.TP
+.B "zinject"
+List injection records.
+.TP
+.B "zinject \-b \fIobjset:object:level:blkid\fB [\-f \fIfrequency\fB] [\-amu] \fIpool\fB"
+Force an error into the pool at a bookmark.
+.TP
+.B "zinject \-c <\fIid\fB | all>"
+Cancel injection records.
+.TP
+.B "zinject \-d \fIvdev\fB \-A \fIpool\fB"
+Force a vdev into the DEGRADED or FAULTED state.
+.TP
+.B "zinject -d \fIvdev\fB -D latency:lanes \fIpool\fB"
+Add an artificial delay to IO requests on a particular
+device, such that the requests take a minimum of 'latency'
+milliseconds to complete.
+Each delay has an associated number of 'lanes' which defines
+the number of concurrent IO requests that can be processed.
+
+For example, with a single lane delay of 10 ms (-D 10:1),
+the device will only be able to service a single IO request
+at a time with each request taking 10 ms to complete.
+So, if only a single request is submitted every 10 ms, the
+average latency will be 10 ms; but if more than one request
+is submitted every 10 ms, the average latency will be more
+than 10 ms.
+
+Similarly, if a delay of 10 ms is specified to have two
+lanes (-D 10:2), then the device will be able to service
+two requests at a time, each with a minimum latency of
+10 ms. So, if two requests are submitted every 10 ms, then
+the average latency will be 10 ms; but if more than two
+requests are submitted every 10 ms, the average latency
+will be more than 10 ms.
+
+Also note that these delays are additive.
+So, two invocations of '-D 10:1' are roughly equivalent to
+a single invocation of '-D 10:2'.
+This also means one can specify multiple lanes with
+differing target latencies.
+For example, an invocation of '-D 10:1' followed by
+'-D 25:2' will create 3 lanes on the device: one lane with
+a latency of 10 ms and two lanes with a 25 ms latency.

+.TP
+.B "zinject \-d \fIvdev\fB [\-e \fIdevice_error\fB] [\-L \fIlabel_error\fB] [\-T \fIfailure\fB] [\-f \fIfrequency\fB] [\-F] \fIpool\fB"
+Force a vdev error.
+.TP
+.B "zinject \-I [\-s \fIseconds\fB | \-g \fItxgs\fB] \fIpool\fB"
+Simulate a hardware failure that fails to honor a cache flush.
+.TP
+.B "zinject \-p \fIfunction\fB \fIpool\fB"
+Panic inside the specified function.
+.TP
+.B "zinject \-t data [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amq] \fIpath\fB"
+Force an error into the contents of a file.
+.TP
+.B "zinject \-t dnode [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-amq] \fIpath\fB"
+Force an error into the metadnode for a file or directory.
+.TP
+.B "zinject \-t \fImos_type\fB [\-C \fIdvas\fB] [\-e \fIdevice_error\fB] [\-f \fIfrequency\fB] [\-l \fIlevel\fB] [\-r \fIrange\fB] [\-amqu] \fIpool\fB"
+Force an error into the MOS of a pool.
+.SH OPTIONS
+.TP
+.BI "\-a"
+Flush the ARC before injection.
+.TP
+.BI "\-b" " objset:object:level:start:end"
+Force an error into the pool at this bookmark tuple.
+Each number is in hexadecimal, and only one block can be specified.
+.TP
+.BI "\-C" " dvas"
+Inject the given error only into specific DVAs.
+The mask should be specified as a list of 0-indexed DVAs
+separated by commas (e.g. '0,2').
+This option is not applicable to logical data errors such as
+.BR "decompress"
+and
+.BR "decrypt" .
+.TP
+.BI "\-d" " vdev"
+A vdev specified by path or GUID.
+.TP
+.BI "\-e" " device_error"
+Specify
+.BR "checksum" " for an ECKSUM error,"
+.BR "decompress" " for a data decompression error,"
+.BR "decrypt" " for a data decryption error,"
+.BR "corrupt" " to flip a bit in the data after a read,"
+.BR "dtl" " for an ECHILD error,"
+.BR "io" " for an EIO error where reopening the device will succeed, or"
+.BR "nxio" " for an ENXIO error where reopening the device will fail."
+For EIO and ENXIO, the "failed" reads or writes still occur.
+The probe simply sets the error value reported by the I/O pipeline
+so it appears the read or write failed.
+Decryption errors currently only work with file data.
+.TP
+.BI "\-f" " frequency"
+Only inject errors a fraction of the time.
+Expressed as a real number percentage between 0.0001 and 100.
+.TP
+.BI "\-F"
+Fail faster.
+Do fewer checks.
+.TP
+.BI "\-g" " txgs"
+Run for this many transaction groups before reporting failure.
+.TP
+.BI "\-h"
+Print the usage message.
+.TP
+.BI "\-l" " level"
+Inject an error at a particular block level.
+The default is 0.
+.TP
+.BI "\-L" " label_error"
+Set the label error region to one of
+.BR " nvlist" ","
+.BR " pad1" ","
+.BR " pad2" ", or"
+.BR " uber" "."
+.TP
+.BI "\-m"
+Automatically remount the underlying filesystem.
+.TP
+.BI "\-q"
+Quiet mode.
+Only print the handler number added.
+.TP
+.BI "\-r" " range"
+Inject an error over a particular logical range of an object, which
+will be translated to the appropriate blkid range according to the
+object's properties.
+.TP
+.BI "\-s" " seconds"
+Run for this many seconds before reporting failure.
+.TP
+.BI "\-T" " failure"
+Set the failure type to one of
+.BR " all" ","
+.BR " claim" ","
+.BR " free" ","
+.BR " read" ", or"
+.BR " write" "."
+.TP
+.BI "\-t" " mos_type"
+Set this to
+.BR "mos " "for any data in the MOS,"
+.BR "mosdir " "for an object directory,"
+.BR "config " "for the pool configuration,"
+.BR "bpobj " "for the block pointer list,"
+.BR "spacemap " "for the space map,"
+.BR "metaslab " "for the metaslab, or"
+.BR "errlog " "for the persistent error log."
+.TP
+.BI "\-u"
+Unload the pool after injection.

+.SH "ENVIRONMENT VARIABLES"
+.TP
+.B "ZINJECT_DEBUG"
+Run \fBzinject\fR in debug mode.

+.SH "AUTHORS"
+This man page was written by Darik Horn
+excerpting the \fBzinject\fR usage message and source code.

+.SH "SEE ALSO"
+.BR zpool (8),
+.BR zfs (8)
diff --git a/man/man8/zpool-add.8 b/man/man8/zpool-add.8
new file mode 100644
index 000000000000..e8adc353ac9c
--- /dev/null
+++ b/man/man8/zpool-add.8
@@ -0,0 +1,102 @@
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-ADD 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm add
+.Nd Adds specified virtual devices to a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm add
+.Op Fl fgLnP
+.Oo Fl o Ar property Ns = Ns Ar value Oc
+.Ar pool vdev Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm add
+.Op Fl fgLnP
+.Oo Fl o Ar property Ns = Ns Ar value Oc
+.Ar pool vdev Ns ...
+.Xc
+Adds the specified virtual devices to the given pool.
+The
+.Ar vdev
+specification is described in the
+.Em Virtual Devices
+section of
+.Xr zpoolconcepts 8 .
+The behavior of the
+.Fl f
+option and the device checks performed are described for the
+.Nm zpool Cm create
+subcommand.
+.Bl -tag -width Ds
+.It Fl f
+Forces use of
+.Ar vdev Ns s ,
+even if they appear in use or specify a conflicting replication level.
+Not all devices can be overridden in this manner.
+.It Fl g
+Display
+.Ar vdev
+GUIDs instead of the normal device names.
+These GUIDs can be used in place of device names for the zpool
+detach/offline/remove/replace commands.
+.It Fl L
+Display real paths for
+.Ar vdev Ns s ,
+resolving all symbolic links.
+This can be used to look up the current block device name regardless of the
+/dev/disk/ path used to open it.
+.It Fl n
+Displays the configuration that would be used without actually adding the
+.Ar vdev Ns s .
+The actual pool expansion can still fail due to insufficient privileges or
+device sharing.
+.It Fl P
+Display real paths for
+.Ar vdev Ns s
+instead of only the last component of the path.
+This can be used in conjunction with the
+.Fl L
+flag.
+.It Fl o Ar property Ns = Ns Ar value
+Sets the given pool properties.
+See the
+.Xr zpoolprops 8
+manual page for a list of valid properties that can be set.
+The only property supported at the moment is
+.Sy ashift .
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-remove 8 ,
+.Xr zpool-attach 8 ,
+.Xr zpool-import 8 ,
+.Xr zpool-initialize 8 ,
+.Xr zpool-online 8
diff --git a/man/man8/zpool-attach.8 b/man/man8/zpool-attach.8
new file mode 100644
index 000000000000..03e04436df81
--- /dev/null
+++ b/man/man8/zpool-attach.8
@@ -0,0 +1,103 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd May 15, 2020
+.Dt ZPOOL-ATTACH 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm attach
+.Nd Attach a new device to an existing ZFS virtual device (vdev)
+.Sh SYNOPSIS
+.Nm
+.Cm attach
+.Op Fl fsw
+.Oo Fl o Ar property Ns = Ns Ar value Oc
+.Ar pool device new_device
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm attach
+.Op Fl fsw
+.Oo Fl o Ar property Ns = Ns Ar value Oc
+.Ar pool device new_device
+.Xc
+Attaches
+.Ar new_device
+to the existing
+.Ar device .
+The existing device cannot be part of a raidz configuration.
+If
+.Ar device
+is not currently part of a mirrored configuration,
+.Ar device
+automatically transforms into a two-way mirror of
+.Ar device
+and
+.Ar new_device .
+If
+.Ar device
+is part of a two-way mirror, attaching
+.Ar new_device
+creates a three-way mirror, and so on.
+In either case,
+.Ar new_device
+begins to resilver immediately and any running scrub is cancelled.
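+.Pp
+For example, assuming a hypothetical pool
+.Em tank
+whose only disk is
+.Pa sda ,
+attaching a second disk converts the pool into a two-way mirror
+.Pq an illustrative sketch only :
+.Bd -literal
+# zpool attach tank sda sdb
+.Ed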
+.Bl -tag -width Ds
+.It Fl f
+Forces use of
+.Ar new_device ,
+even if it appears to be in use.
+Not all devices can be overridden in this manner.
+.It Fl o Ar property Ns = Ns Ar value
+Sets the given pool properties.
+See the
+.Xr zpoolprops 8
+manual page for a list of valid properties that can be set.
+The only property supported at the moment is
+.Sy ashift .
+.It Fl s
+The
+.Ar new_device
+is reconstructed sequentially to restore redundancy as quickly as possible.
+Checksums are not verified during sequential reconstruction so a scrub is
+started when the resilver completes.
+Sequential reconstruction is not supported for raidz configurations.
+.It Fl w
+Waits until
+.Ar new_device
+has finished resilvering before returning.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-detach 8 ,
+.Xr zpool-add 8 ,
+.Xr zpool-import 8 ,
+.Xr zpool-initialize 8 ,
+.Xr zpool-online 8 ,
+.Xr zpool-replace 8 ,
+.Xr zpool-resilver 8
diff --git a/man/man8/zpool-checkpoint.8 b/man/man8/zpool-checkpoint.8
new file mode 100644
index 000000000000..df04c8819146
--- /dev/null
+++ b/man/man8/zpool-checkpoint.8
@@ -0,0 +1,82 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-CHECKPOINT 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm checkpoint
+.Nd Checkpoints the current state of a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm checkpoint
+.Op Fl d, -discard Oo Fl w, -wait Oc
+.Ar pool
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm checkpoint
+.Op Fl d, -discard Oo Fl w, -wait Oc
+.Ar pool
+.Xc
+Checkpoints the current state of
+.Ar pool ,
+which can later be restored by
+.Nm zpool Cm import --rewind-to-checkpoint .
+The existence of a checkpoint in a pool prohibits the following
+.Nm zpool
+commands:
+.Cm remove ,
+.Cm attach ,
+.Cm detach ,
+.Cm split ,
+and
+.Cm reguid .
+In addition, it may break reservation boundaries if the pool lacks free
+space.
+The
+.Nm zpool Cm status
+command indicates the existence of a checkpoint or the progress of discarding a
+checkpoint from a pool.
+The
+.Nm zpool Cm list
+command reports how much space the checkpoint takes from the pool.
+.Bl -tag -width Ds
+.It Fl d, -discard
+Discards an existing checkpoint from
+.Ar pool .
+.It Fl w, -wait
+Waits until the checkpoint has finished being discarded before returning.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-import 8 ,
+.Xr zpool-status 8 ,
+.Xr zfs-snapshot 8
diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8
new file mode 100644
index 000000000000..ee7a6a255905
--- /dev/null
+++ b/man/man8/zpool-clear.8
@@ -0,0 +1,60 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-CLEAR 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm clear
+.Nd Clears device errors in a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm clear
+.Ar pool
+.Op Ar device
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm clear
+.Ar pool
+.Op Ar device
+.Xc
+Clears device errors in a pool.
+If no arguments are specified, all device errors within the pool are cleared.
+If one or more devices are specified, only those errors associated with the
+specified device or devices are cleared.
+If multihost is enabled and the pool has been suspended, this will not
+resume I/O.
+While the pool was suspended, it may have been imported on another host,
+and resuming I/O could result in pool damage.
+.El
+.Sh SEE ALSO
+.Xr zdb 8 ,
+.Xr zpool-reopen 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool-create.8 b/man/man8/zpool-create.8
new file mode 100644
index 000000000000..0676a67f9fc6
--- /dev/null
+++ b/man/man8/zpool-create.8
@@ -0,0 +1,207 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-CREATE 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm create
+.Nd Creates a new ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm create
+.Op Fl dfn
+.Op Fl m Ar mountpoint
+.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
+.Oo Fl o Ar feature@feature Ns = Ns Ar value Oc Ns ...
+.Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ...
+.Op Fl R Ar root
+.Op Fl t Ar tname
+.Ar pool vdev Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm create
+.Op Fl dfn
+.Op Fl m Ar mountpoint
+.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
+.Oo Fl o Ar feature@feature Ns = Ns Ar value Oc Ns ...
+.Oo Fl O Ar file-system-property Ns = Ns Ar value Oc Ns ...
+.Op Fl R Ar root
+.Op Fl t Ar tname
+.Ar pool vdev Ns ...
+.Xc
+Creates a new storage pool containing the virtual devices specified on the
+command line.
+The pool name must begin with a letter, and can only contain
+alphanumeric characters as well as underscore
+.Pq Qq Sy _ ,
+dash
+.Pq Qq Sy \&- ,
+colon
+.Pq Qq Sy \&: ,
+space
+.Pq Qq Sy \&\ ,
+and period
+.Pq Qq Sy \&. .
+The pool names
+.Sy mirror ,
+.Sy raidz ,
+.Sy spare
+and
+.Sy log
+are reserved, as are names beginning with
+.Sy mirror ,
+.Sy raidz ,
+.Sy spare ,
+and the pattern
+.Sy c[0-9] .
+The
+.Ar vdev
+specification is described in the
+.Em Virtual Devices
+section of
+.Xr zpoolconcepts 8 .
+.Pp
+The command attempts to verify that each device specified is accessible and not
+currently in use by another subsystem.
+However, this check is not robust enough to detect simultaneous attempts to
+use a new device in different pools, even if
+.Sy multihost
+is
+.Sy enabled .
+The administrator must ensure that simultaneous invocations of any combination
+of
+.Sy zpool replace ,
+.Sy zpool create ,
+.Sy zpool add ,
+or
+.Sy zpool labelclear
+do not refer to the same device.
+Using the same device in two pools will result in pool corruption.
+.Pp
+There are some uses, such as being currently mounted, or specified as the
+dedicated dump device, that prevent a device from ever being used by ZFS.
+Other uses, such as having a preexisting UFS file system, can be overridden with
+the
+.Fl f
+option.
+.Pp
+The command also checks that the replication strategy for the pool is
+consistent.
+An attempt to combine redundant and non-redundant storage in a single pool, or
+to mix disks and files, results in an error unless
+.Fl f
+is specified.
+The use of differently sized devices within a single raidz or mirror group is
+also flagged as an error unless
+.Fl f
+is specified.
+.Pp
+Unless the
+.Fl R
+option is specified, the default mount point is
+.Pa / Ns Ar pool .
+The mount point must not exist or must be empty, or else the root dataset
+cannot be mounted.
+This can be overridden with the
+.Fl m
+option.
+.Pp
+By default all supported features are enabled on the new pool unless the
+.Fl d
+option is specified.
+.Bl -tag -width Ds
+.It Fl d
+Do not enable any features on the new pool.
+Individual features can be enabled by setting their corresponding properties to
+.Sy enabled
+with the
+.Fl o
+option.
+See
+.Xr zpool-features 5
+for details about feature properties.
+.It Fl f
+Forces use of
+.Ar vdev Ns s ,
+even if they appear in use or specify a conflicting replication level.
+Not all devices can be overridden in this manner.
+.It Fl m Ar mountpoint
+Sets the mount point for the root dataset.
+The default mount point is
+.Pa /pool
+or
+.Pa altroot/pool
+if
+.Ar altroot
+is specified.
+The mount point must be an absolute path,
+.Sy legacy ,
+or
+.Sy none .
+For more information on dataset mount points, see
+.Xr zfs 8 .
+.It Fl n
+Displays the configuration that would be used without actually creating the
+pool.
+The actual pool creation can still fail due to insufficient privileges or
+device sharing.
+.It Fl o Ar property Ns = Ns Ar value
+Sets the given pool properties.
+See the
+.Xr zpoolprops 8
+manual page for a list of valid properties that can be set.
+.It Fl o Ar feature@feature Ns = Ns Ar value
+Sets the given pool feature.
+See the
+.Xr zpool-features 5
+manual page for a list of valid features that can be set.
+The value can be either
+.Sy enabled
+or
+.Sy disabled .
+.It Fl O Ar file-system-property Ns = Ns Ar value
+Sets the given file system properties in the root file system of the pool.
+See the
+.Xr zfsprops 8
+manual page for a list of valid properties that can be set.
+.It Fl R Ar root
+Equivalent to
+.Fl o Sy cachefile Ns = Ns Sy none Fl o Sy altroot Ns = Ns Ar root
+.It Fl t Ar tname
+Sets the in-core pool name to
+.Sy tname
+while the on-disk name will be the name specified as the pool name
+.Sy pool .
+This will set the default cachefile property to none.
+This is intended to handle namespace collisions when creating pools for other
+systems, such as virtual machines or physical machines whose pools live on
+network block devices.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-destroy 8 ,
+.Xr zpool-export 8 ,
+.Xr zpool-import 8
diff --git a/man/man8/zpool-destroy.8 b/man/man8/zpool-destroy.8
new file mode 100644
index 000000000000..9eace1983ebb
--- /dev/null
+++ b/man/man8/zpool-destroy.8
@@ -0,0 +1,55 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-DESTROY 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm destroy
+.Nd Destroys the given ZFS storage pool, freeing up any devices for other use
+.Sh SYNOPSIS
+.Nm
+.Cm destroy
+.Op Fl f
+.Ar pool
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm destroy
+.Op Fl f
+.Ar pool
+.Xc
+Destroys the given pool, freeing up any devices for other use.
+This command tries to unmount any active datasets before destroying the pool.
+.Bl -tag -width Ds
+.It Fl f
+Forces any active datasets contained within the pool to be unmounted.
+.El
+.El
diff --git a/man/man8/zpool-detach.8 b/man/man8/zpool-detach.8
new file mode 100644
index 000000000000..dab8871cea87
--- /dev/null
+++ b/man/man8/zpool-detach.8
@@ -0,0 +1,61 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-DETACH 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm detach
+.Nd Detaches a device from a ZFS mirror vdev (virtual device)
+.Sh SYNOPSIS
+.Nm
+.Cm detach
+.Ar pool device
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm detach
+.Ar pool device
+.Xc
+Detaches
+.Ar device
+from a mirror.
+The operation is refused if there are no other valid replicas of the data.
+If the device may be re-added to the pool later, consider the
+.Nm zpool Cm offline
+command instead.
+.El
+.Sh SEE ALSO
+.Xr zpool-attach 8 ,
+.Xr zpool-offline 8 ,
+.Xr zpool-labelclear 8 ,
+.Xr zpool-remove 8 ,
+.Xr zpool-replace 8 ,
+.Xr zpool-split 8
diff --git a/man/man8/zpool-events.8 b/man/man8/zpool-events.8
new file mode 100644
index 000000000000..e5887a1d820d
--- /dev/null
+++ b/man/man8/zpool-events.8
@@ -0,0 +1,71 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-EVENTS 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm events
+.Nd Lists all recent events generated by the ZFS kernel modules
+.Sh SYNOPSIS
+.Nm
+.Cm events
+.Op Fl vHf Oo Ar pool Oc | Fl c
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm events
+.Op Fl vHf Oo Ar pool Oc | Fl c
+.Xc
+Lists all recent events generated by the ZFS kernel modules.
+These events are consumed by the
+.Xr zed 8
+daemon and used to automate administrative tasks such as replacing a failed
+device with a hot spare.
+For more information about the subclasses and event payloads
+that can be generated, see the
+.Xr zfs-events 5
+man page.
+.Bl -tag -width Ds
+.It Fl c
+Clear all previous events.
+.It Fl f
+Follow mode.
+.It Fl H
+Scripted mode.
+Do not display headers, and separate fields by a single tab instead of
+arbitrary space.
+.It Fl v
+Print the entire payload for each event.
+.El
+.El
+.Sh SEE ALSO
+.Xr zed 8 ,
+.Xr zpool-wait 8 ,
+.Xr zfs-events 5 ,
+.Xr zfs-module-parameters 5
diff --git a/man/man8/zpool-export.8 b/man/man8/zpool-export.8
new file mode 100644
index 000000000000..afe185c98c58
--- /dev/null
+++ b/man/man8/zpool-export.8
@@ -0,0 +1,83 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd February 16, 2020
+.Dt ZPOOL-EXPORT 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm export
+.Nd Exports the given ZFS storage pools from the system
+.Sh SYNOPSIS
+.Nm
+.Cm export
+.Op Fl a
+.Op Fl f
+.Ar pool Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm export
+.Op Fl a
+.Op Fl f
+.Ar pool Ns ...
+.Xc
+Exports the given pools from the system.
+All devices are marked as exported, but are still considered in use by other
+subsystems.
+The devices can be moved between systems
+.Pq even those of different endianness
+and imported as long as a sufficient number of devices are present.
+.Pp
+Before exporting the pool, all datasets within the pool are unmounted.
+A pool cannot be exported if it has a shared spare that is currently being
+used.
+.Pp
+For pools to be portable, you must give the
+.Nm
+command whole disks, not just partitions, so that ZFS can label the disks with
+portable EFI labels.
+Otherwise, disk drivers on platforms of different endianness will not recognize
+the disks.
+.Bl -tag -width Ds
+.It Fl a
+Exports all pools imported on the system.
+.It Fl f
+Forcefully unmount all datasets, using the
+.Nm zfs Cm unmount Fl f
+command.
+This option is not supported on Linux.
+.Pp
+This command will forcefully export the pool even if it has a shared spare that
+is currently being used.
+This may lead to potential data corruption.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-import 8
diff --git a/man/man8/zpool-get.8 b/man/man8/zpool-get.8
new file mode 100644
index 000000000000..fbe341a8cddd
--- /dev/null
+++ b/man/man8/zpool-get.8
@@ -0,0 +1,102 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-GET 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm get
+.Nd Retrieves properties for the specified ZFS storage pool(s)
+.Sh SYNOPSIS
+.Nm
+.Cm get
+.Op Fl Hp
+.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
+.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ...
+.Oo Ar pool Oc Ns ...
+.Nm
+.Cm set
+.Ar property Ns = Ns Ar value
+.Ar pool
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm get
+.Op Fl Hp
+.Op Fl o Ar field Ns Oo , Ns Ar field Oc Ns ...
+.Sy all Ns | Ns Ar property Ns Oo , Ns Ar property Oc Ns ...
+.Oo Ar pool Oc Ns ...
+.Xc
+Retrieves the given list of properties
+.Po
+or all properties if
+.Sy all
+is used
+.Pc
+for the specified storage pool(s).
+These properties are displayed with the following fields:
+.Bd -literal
+        name          Name of storage pool
+        property      Property name
+        value         Property value
+        source        Property source, either 'default' or 'local'.
+.Ed
+.Pp
+See the
+.Xr zpoolprops 8
+manual page for more information on the available pool properties.
+.Bl -tag -width Ds
+.It Fl H
+Scripted mode.
+Do not display headers, and separate fields by a single tab instead of
+arbitrary space.
+.It Fl o Ar field
+A comma-separated list of columns to display.
+.Sy name Ns \&, Ns Sy property Ns \&, Ns Sy value Ns \&, Ns Sy source
+is the default value.
+.It Fl p
+Display numbers in parsable (exact) values.
+.El
+.It Xo
+.Nm
+.Cm set
+.Ar property Ns = Ns Ar value
+.Ar pool
+.Xc
+Sets the given property on the specified pool.
+See the
+.Xr zpoolprops 8
+manual page for more information on what properties can be set and acceptable
+values.
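+.Pp
+For example, assuming a hypothetical pool named
+.Em tank ,
+a pool property could be set and then read back
+.Pq an illustrative sketch only :
+.Bd -literal
+# zpool set autoexpand=on tank
+# zpool get autoexpand tank
+.Ed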
+.El
+.Sh SEE ALSO
+.Xr zpoolprops 8 ,
+.Xr zpool-list 8 ,
+.Xr zpool-features 5
diff --git a/man/man8/zpool-history.8 b/man/man8/zpool-history.8
new file mode 100644
index 000000000000..73fb27449d8c
--- /dev/null
+++ b/man/man8/zpool-history.8
@@ -0,0 +1,64 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-HISTORY 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm history
+.Nd Displays the command history of the specified ZFS storage pool(s)
+.Sh SYNOPSIS
+.Nm
+.Cm history
+.Op Fl il
+.Oo Ar pool Oc Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm history
+.Op Fl il
+.Oo Ar pool Oc Ns ...
+.Xc
+Displays the command history of the specified pool(s) or all pools if no pool is
+specified.
+.Bl -tag -width Ds
+.It Fl i
+Displays internally logged ZFS events in addition to user initiated events.
+.It Fl l
+Displays log records in long format, which, in addition to the standard format,
+includes the user name, the hostname, and the zone in which the operation was
+performed.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-checkpoint 8 ,
+.Xr zpool-events 8 ,
+.Xr zpool-status 8 ,
+.Xr zpool-wait 8
diff --git a/man/man8/zpool-import.8 b/man/man8/zpool-import.8
new file mode 100644
index 000000000000..e63db9363fd0
--- /dev/null
+++ b/man/man8/zpool-import.8
@@ -0,0 +1,388 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-IMPORT 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm import
+.Nd Lists ZFS storage pools available to import, or imports the specified pools
+.Sh SYNOPSIS
+.Nm
+.Cm import
+.Op Fl D
+.Op Fl d Ar dir Ns | Ns device
+.Nm
+.Cm import
+.Fl a
+.Op Fl DflmN
+.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
+.Op Fl -rewind-to-checkpoint
+.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device
+.Op Fl o Ar mntopts
+.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
+.Op Fl R Ar root
+.Nm
+.Cm import
+.Op Fl Dflm
+.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
+.Op Fl -rewind-to-checkpoint
+.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device
+.Op Fl o Ar mntopts
+.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
+.Op Fl R Ar root
+.Op Fl s
+.Ar pool Ns | Ns Ar id
+.Op Ar newpool Oo Fl t Oc
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm import
+.Op Fl D
+.Op Fl d Ar dir Ns | Ns device
+.Xc
+Lists pools available to import.
+If the
+.Fl d
+or
+.Fl c
+options are not specified, this command searches for devices using libblkid
+on Linux and geom on FreeBSD.
+The
+.Fl d
+option can be specified multiple times, and all directories are searched.
+If the device appears to be part of an exported pool, this command displays a
+summary of the pool, including the pool name, a numeric identifier, and the
+vdev layout and current health of each device or file.
+Pools that were previously destroyed with the
+.Nm zpool Cm destroy
+command are not listed unless the
+.Fl D
+option is specified.
+.Pp
+The numeric identifier is unique, and can be used instead of the pool name when
+multiple exported pools of the same name are available.
+.Bl -tag -width Ds
+.It Fl c Ar cachefile
+Reads configuration from the given
+.Ar cachefile
+that was created with the
+.Sy cachefile
+pool property.
+This
+.Ar cachefile
+is used instead of searching for devices.
+.It Fl d Ar dir Ns | Ns Ar device
+Uses
+.Ar device
+or searches for devices or files in
+.Ar dir .
+The
+.Fl d
+option can be specified multiple times.
+.It Fl D
+Lists destroyed pools only.
+.El
+.It Xo
+.Nm
+.Cm import
+.Fl a
+.Op Fl DflmN
+.Op Fl F Oo Fl n Oc Oo Fl T Oc Oo Fl X Oc
+.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device
+.Op Fl o Ar mntopts
+.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
+.Op Fl R Ar root
+.Op Fl s
+.Xc
+Imports all pools found in the search directories.
+Identical to the previous command, except that all pools with a sufficient
+number of devices available are imported.
+Pools that were previously destroyed with the
+.Nm zpool Cm destroy
+command will not be imported unless the
+.Fl D
+option is specified.
+.Bl -tag -width Ds
+.It Fl a
+Searches for and imports all pools found.
+.It Fl c Ar cachefile
+Reads configuration from the given
+.Ar cachefile
+that was created with the
+.Sy cachefile
+pool property.
+This
+.Ar cachefile
+is used instead of searching for devices.
+.It Fl d Ar dir Ns | Ns Ar device
+Uses
+.Ar device
+or searches for devices or files in
+.Ar dir .
+The
+.Fl d
+option can be specified multiple times.
+This option is incompatible with the
+.Fl c
+option.
+.It Fl D
+Imports destroyed pools only.
+The
+.Fl f
+option is also required.
+.It Fl f
+Forces import, even if the pool appears to be potentially active.
+.It Fl F
+Recovery mode for a non-importable pool.
+Attempt to return the pool to an importable state by discarding the last few
+transactions.
+Not all damaged pools can be recovered by using this option.
+If successful, the data from the discarded transactions is irretrievably lost.
+This option is ignored if the pool is importable or already imported.
+.It Fl l
+Indicates that this command will request encryption keys for all encrypted
+datasets it attempts to mount as it is bringing the pool online.
+Note that if any datasets have a
+.Sy keylocation
+of
+.Sy prompt ,
+this command will block waiting for the keys to be entered.
+Without this flag, encrypted datasets will be left unavailable until the keys
+are loaded.
+.It Fl m
+Allows a pool to import when there is a missing log device.
+Recent transactions can be lost because the log device will be discarded.
+.It Fl n
+Used with the
+.Fl F
+recovery option.
+Determines whether a non-importable pool can be made importable again, but does
+not actually perform the pool recovery.
+For more details about pool recovery mode, see the
+.Fl F
+option, above.
+.It Fl N
+Import the pool without mounting any file systems.
+.It Fl o Ar mntopts
+Comma-separated list of mount options to use when mounting datasets within the
+pool.
+See
+.Xr zfs 8
+for a description of dataset properties and mount options.
+.It Fl o Ar property Ns = Ns Ar value
+Sets the specified property on the imported pool.
+See the
+.Xr zpoolprops 8
+manual page for more information on the available pool properties.
+.It Fl R Ar root
+Sets the
+.Sy cachefile
+property to
+.Sy none
+and the
+.Sy altroot
+property to
+.Ar root .
+.It Fl -rewind-to-checkpoint
+Rewinds pool to the checkpointed state.
+Once the pool is imported with this flag there is no way to undo the rewind.
+All changes and data that were written after the checkpoint are lost!
+The only exception is when the
+.Sy readonly
+mounting option is enabled.
+In this case, the checkpointed state of the pool is opened and an
+administrator can see what the pool would look like if it were
+fully rewound.
+.It Fl s
+Scan using the default search path; the libblkid cache will not be
+consulted.
+A custom search path may be specified by setting the
+ZPOOL_IMPORT_PATH environment variable.
+.It Fl X
+Used with the
+.Fl F
+recovery option.
+Determines whether extreme measures to find a valid txg should take place.
+This allows the pool to be rolled back to a txg which is no longer guaranteed
+to be consistent.
+Pools imported at an inconsistent txg may contain uncorrectable
+checksum errors.
+For more details about pool recovery mode, see the
+.Fl F
+option, above.
+WARNING: This option can be extremely hazardous to the
+health of your pool and should only be used as a last resort.
+.It Fl T
+Specify the txg to use for rollback.
+Implies
+.Fl FX .
+For more details about pool recovery mode, see the
+.Fl X
+option, above.
+WARNING: This option can be extremely hazardous to the
+health of your pool and should only be used as a last resort.
+.El
+.It Xo
+.Nm
+.Cm import
+.Op Fl Dflm
+.Op Fl F Oo Fl n Oc Oo Fl t Oc Oo Fl T Oc Oo Fl X Oc
+.Op Fl c Ar cachefile Ns | Ns Fl d Ar dir Ns | Ns device
+.Op Fl o Ar mntopts
+.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ...
+.Op Fl R Ar root
+.Op Fl s
+.Ar pool Ns | Ns Ar id
+.Op Ar newpool
+.Xc
+Imports a specific pool.
+A pool can be identified by its name or the numeric identifier.
+If
+.Ar newpool
+is specified, the pool is imported using the name
+.Ar newpool .
+Otherwise, it is imported with its exported name.
+.Pp
+If a device is removed from a system without running
+.Nm zpool Cm export
+first, the device appears as potentially active.
+It cannot be determined whether this was a failed export, or whether the
+device is really in use by another host.
+To import a pool in this state, the
+.Fl f
+option is required.
+.Bl -tag -width Ds
+.It Fl c Ar cachefile
+Reads configuration from the given
+.Ar cachefile
+that was created with the
+.Sy cachefile
+pool property.
+This
+.Ar cachefile
+is used instead of searching for devices.
+.It Fl d Ar dir Ns | Ns Ar device
+Uses
+.Ar device
+or searches for devices or files in
+.Ar dir .
+The
+.Fl d
+option can be specified multiple times.
+This option is incompatible with the
+.Fl c
+option.
+.It Fl D
+Imports a destroyed pool.
+The
+.Fl f
+option is also required.
+.It Fl f
+Forces import, even if the pool appears to be potentially active.
+.It Fl F
+Recovery mode for a non-importable pool.
+Attempt to return the pool to an importable state by discarding the last few
+transactions.
+Not all damaged pools can be recovered by using this option.
+If successful, the data from the discarded transactions is irretrievably lost.
+This option is ignored if the pool is importable or already imported.
+.It Fl l
+Indicates that this command will request encryption keys for all encrypted
+datasets it attempts to mount as it is bringing the pool online. Note that if
+any datasets have a
+.Sy keylocation
+of
+.Sy prompt ,
+this command will block waiting for the keys to be entered. Without this flag,
+encrypted datasets will be left unavailable until the keys are loaded.
+.It Fl m
+Allows a pool to be imported when there is a missing log device.
+Recent transactions can be lost because the log device will be discarded.
+.It Fl n
+Used with the
+.Fl F
+recovery option.
+Determines whether a non-importable pool can be made importable again, but does
+not actually perform the pool recovery.
+For more details about pool recovery mode, see the
+.Fl F
+option, above.
+.It Fl o Ar mntopts
+Comma-separated list of mount options to use when mounting datasets within the
+pool.
+See
+.Xr zfs 8
+for a description of dataset properties and mount options.
+.It Fl o Ar property Ns = Ns Ar value
+Sets the specified property on the imported pool.
+See the
+.Xr zpoolprops 8
+manual page for more information on the available pool properties.
+.It Fl R Ar root
+Sets the
+.Sy cachefile
+property to
+.Sy none
+and the
+.Sy altroot
+property to
+.Ar root .
+.It Fl s
+Scan using the default search path; the libblkid cache will not be
+consulted. A custom search path may be specified by setting the
+ZPOOL_IMPORT_PATH environment variable.
+.It Fl X
+Used with the
+.Fl F
+recovery option. Determines whether extreme
+measures to find a valid txg should take place. This allows the pool to
+be rolled back to a txg which is no longer guaranteed to be consistent.
+Pools imported at an inconsistent txg may contain uncorrectable
+checksum errors. For more details about pool recovery mode, see the
+.Fl F
+option, above. WARNING: This option can be extremely hazardous to the
+health of your pool and should only be used as a last resort.
+.It Fl T
+Specify the txg to use for rollback. Implies
+.Fl FX .
+For more details
+about pool recovery mode, see the
+.Fl X
+option, above. WARNING: This option can be extremely hazardous to the
+health of your pool and should only be used as a last resort.
+.It Fl t
+Used with
+.Sy newpool .
+Specifies that
+.Sy newpool
+is temporary.
+Temporary pool names last until export.
+Ensures that the original pool name will be used in all label updates and
+therefore is retained upon export.
+This option will also set the
+.Sy cachefile
+property to
+.Sy none
+when not explicitly specified.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-export 8 ,
+.Xr zpool-list 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool-initialize.8 b/man/man8/zpool-initialize.8
new file mode 100644
index 000000000000..e8bf656e4b97
--- /dev/null
+++ b/man/man8/zpool-initialize.8
@@ -0,0 +1,81 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-INITIALIZE 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm initialize
+.Nd Write to all unallocated regions of eligible devices in a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm initialize
+.Op Fl c | Fl s
+.Op Fl w
+.Ar pool
+.Op Ar device Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm initialize
+.Op Fl c | Fl s
+.Op Fl w
+.Ar pool
+.Op Ar device Ns ...
+.Xc
+Begins initializing by writing to all unallocated regions on the specified
+devices, or all eligible devices in the pool if no individual devices are
+specified.
+Only leaf data or log devices may be initialized.
+.Bl -tag -width Ds
+.It Fl c, -cancel
+Cancel initializing on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+initialized, the command will fail and no cancellation will occur on any device.
+.It Fl s, -suspend
+Suspend initializing on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+initialized, the command will fail and no suspension will occur on any device.
+Initializing can then be resumed by running
+.Nm zpool Cm initialize
+with no flags on the relevant target devices.
+.It Fl w, -wait
+Wait until the devices have finished initializing before returning.
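+.Pp
+For example, initialization of every eligible device in a pool might be
+started and waited upon in a single step; the pool name
+.Em tank
+below is only a placeholder:
+.Bd -literal
+# zpool initialize -w tank
+.Ed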
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-add 8 ,
+.Xr zpool-attach 8 ,
+.Xr zpool-create 8 ,
+.Xr zpool-online 8 ,
+.Xr zpool-replace 8 ,
+.Xr zpool-trim 8
diff --git a/man/man8/zpool-iostat.8 b/man/man8/zpool-iostat.8
new file mode 100644
index 000000000000..c318dcd7478c
--- /dev/null
+++ b/man/man8/zpool-iostat.8
@@ -0,0 +1,247 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-IOSTAT 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm iostat
+.Nd Display logical I/O statistics for the given ZFS storage pools/vdevs
+.Sh SYNOPSIS
+.Nm
+.Cm iostat
+.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw
+.Op Fl T Sy u Ns | Ns Sy d
+.Op Fl ghHLnpPvy
+.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc
+.Op Ar interval Op Ar count
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm iostat
+.Op Oo Oo Fl c Ar SCRIPT Oc Oo Fl lq Oc Oc Ns | Ns Fl rw
+.Op Fl T Sy u Ns | Ns Sy d
+.Op Fl ghHLnpPvy
+.Oo Oo Ar pool Ns ... Oc Ns | Ns Oo Ar pool vdev Ns ... Oc Ns | Ns Oo Ar vdev Ns ... Oc Oc
+.Op Ar interval Op Ar count
+.Xc
+Displays logical I/O statistics for the given pools/vdevs. Physical I/Os may
+be observed via
+.Xr iostat 1 .
+If writes are located nearby, they may be merged into a single
+larger operation. Additional I/O may be generated depending on the level of
+vdev redundancy.
+To filter output, you may pass in a list of pools, a pool and list of vdevs
+in that pool, or a list of any vdevs from any pool. If no items are specified,
+statistics for every pool in the system are shown.
+When given an
+.Ar interval ,
+the statistics are printed every
+.Ar interval
+seconds until ^C is pressed. If the
+.Fl n
+flag is specified, the headers are displayed only once; otherwise, they are
+displayed periodically. If
+.Ar count
+is specified, the command exits after
+.Ar count
+reports are printed. The first report printed is always
+the statistics since boot, regardless of whether
+.Ar interval
+and
+.Ar count
+are passed. However, this behavior can be suppressed with the
+.Fl y
+flag. Also note that the units of
+.Sy K ,
+.Sy M ,
+.Sy G ...
+that are printed in the report are in base 1024. To get the raw
+values, use the
+.Fl p
+flag.
+.Bl -tag -width Ds
+.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ...
+Run a script (or scripts) on each vdev and include the output as a new column
+in the
+.Nm zpool Cm iostat
+output. Users can run any script found in their
+.Pa ~/.zpool.d
+directory or from the system
+.Pa /etc/zfs/zpool.d
+directory. Script names containing the slash (/) character are not allowed.
+The default search path can be overridden by setting the
+ZPOOL_SCRIPTS_PATH environment variable. A privileged user can run
+.Fl c
+if they have the ZPOOL_SCRIPTS_AS_ROOT
+environment variable set. If a script requires the use of a privileged
+command, like
+.Xr smartctl 8 ,
+then it is recommended that you allow the user access to it in
+.Pa /etc/sudoers
+or add the user to the
+.Pa /etc/sudoers.d/zfs
+file.
+.Pp
+If
+.Fl c
+is passed without a script name, it prints a list of all scripts.
+.Fl c
+also sets verbose mode
+.No \&( Ns Fl v Ns No \&).
+.Pp
+Script output should be in the form of "name=value". The column name is
+set to "name" and the value is set to "value". Multiple lines can be
+used to output multiple columns. The first line of output not in the
+"name=value" format is displayed without a column title, and no more
+output after that is displayed. This can be useful for printing error
+messages. Blank or NULL values are printed as a '-' to make output
+awk-able.
+.Pp
+The following environment variables are set before running each script:
+.Bl -tag -width "VDEV_ENC_SYSFS_PATH"
+.It Sy VDEV_PATH
+Full path to the vdev.
+.It Sy VDEV_UPATH
+Underlying path to the vdev (/dev/sd*). For use with device mapper,
+multipath, or partitioned vdevs.
+.It Sy VDEV_ENC_SYSFS_PATH
+The sysfs path to the enclosure for the vdev (if any).
+.El
+.It Fl T Sy u Ns | Ns Sy d
+Display a time stamp.
+Specify
+.Sy u
+for a printed representation of the internal representation of time.
+See
+.Xr time 2 .
+Specify
+.Sy d
+for standard date format.
+See
+.Xr date 1 .
+.It Fl g
+Display vdev GUIDs instead of the normal device names. These GUIDs
+can be used in place of device names for the zpool
+detach/offline/remove/replace commands.
+.It Fl H
+Scripted mode. Do not display headers, and separate fields by a
+single tab instead of arbitrary space.
+.It Fl L
+Display real paths for vdevs resolving all symbolic links. This can
+be used to look up the current block device name regardless of the
+.Pa /dev/disk/
+path used to open it.
+.It Fl n
+Print headers only once, rather than at every interval.
+.It Fl p
+Display numbers in parsable (exact) values. Time values are in
+nanoseconds.
+.It Fl P
+Display full paths for vdevs instead of only the last component of
+the path. This can be used in conjunction with the
+.Fl L
+flag.
+.It Fl r
+Print request size histograms for the leaf vdev's IO. This includes
+histograms of individual IOs (ind) and aggregate IOs (agg). These stats
+can be useful for observing how well IO aggregation is working. Note
+that TRIM IOs may exceed 16M, but will be counted as 16M.
+.It Fl v
+Verbose statistics. Reports usage statistics for individual vdevs within the
+pool, in addition to the pool-wide statistics.
+.It Fl y
+Omit statistics since boot.
+Normally the first line of output reports the statistics since boot.
+This option suppresses that first line of output.
+.It Fl w
+Display latency histograms:
+.Pp
+.Ar total_wait :
+Total IO time (queuing + disk IO time).
+.Ar disk_wait :
+Disk IO time (time reading/writing the disk).
+.Ar syncq_wait :
+Amount of time IO spent in synchronous priority queues.
Does not include
+disk time.
+.Ar asyncq_wait :
+Amount of time IO spent in asynchronous priority queues. Does not include
+disk time.
+.Ar scrub :
+Amount of time IO spent in scrub queue. Does not include disk time.
+.It Fl l
+Include average latency statistics:
+.Pp
+.Ar total_wait :
+Average total IO time (queuing + disk IO time).
+.Ar disk_wait :
+Average disk IO time (time reading/writing the disk).
+.Ar syncq_wait :
+Average amount of time IO spent in synchronous priority queues. Does
+not include disk time.
+.Ar asyncq_wait :
+Average amount of time IO spent in asynchronous priority queues.
+Does not include disk time.
+.Ar scrub :
+Average queuing time in scrub queue. Does not include disk time.
+.Ar trim :
+Average queuing time in trim queue. Does not include disk time.
+.It Fl q
+Include active queue statistics. Each priority queue has both pending
+.Pq Ar pend
+and active
+.Pq Ar activ
+IOs. Pending IOs are waiting to
+be issued to the disk, and active IOs have been issued to disk and are
+waiting for completion. These stats are broken out by priority queue:
+.Pp
+.Ar syncq_read/write :
+Current number of entries in synchronous priority
+queues.
+.Ar asyncq_read/write :
+Current number of entries in asynchronous priority queues.
+.Ar scrubq_read :
+Current number of entries in scrub queue.
+.Ar trimq_write :
+Current number of entries in trim queue.
+.Pp
+All queue statistics are instantaneous measurements of the number of
+entries in the queues. If you specify an
+.Ar interval ,
+the measurements will be sampled from the end of the interval.
+.El
+.El
+.Sh SEE ALSO
+.Xr iostat 1 ,
+.Xr smartctl 8 ,
+.Xr zpool-list 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool-labelclear.8 b/man/man8/zpool-labelclear.8
new file mode 100644
index 000000000000..52c80056142c
--- /dev/null
+++ b/man/man8/zpool-labelclear.8
@@ -0,0 +1,66 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-LABELCLEAR 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm labelclear
+.Nd Removes ZFS label information from the specified physical device
+.Sh SYNOPSIS
+.Nm
+.Cm labelclear
+.Op Fl f
+.Ar device
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm labelclear
+.Op Fl f
+.Ar device
+.Xc
+Removes ZFS label information from the specified
+.Ar device .
+If the
+.Ar device
+is a cache device, it also removes the L2ARC header
+(persistent L2ARC). The
+.Ar device
+must not be part of an active pool configuration.
+.Bl -tag -width Ds
+.It Fl f
+Treat exported or foreign devices as inactive.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-destroy 8 ,
+.Xr zpool-detach 8 ,
+.Xr zpool-remove 8 ,
+.Xr zpool-replace 8
diff --git a/man/man8/zpool-list.8 b/man/man8/zpool-list.8
new file mode 100644
index 000000000000..5e8c6fe5c9ea
--- /dev/null
+++ b/man/man8/zpool-list.8
@@ -0,0 +1,119 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-LIST 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm list
+.Nd Lists ZFS storage pools along with a health status and space usage
+.Sh SYNOPSIS
+.Nm
+.Cm list
+.Op Fl HgLpPv
+.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
+.Op Fl T Sy u Ns | Ns Sy d
+.Oo Ar pool Oc Ns ...
+.Op Ar interval Op Ar count
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm list
+.Op Fl HgLpPv
+.Op Fl o Ar property Ns Oo , Ns Ar property Oc Ns ...
+.Op Fl T Sy u Ns | Ns Sy d
+.Oo Ar pool Oc Ns ...
+.Op Ar interval Op Ar count
+.Xc
+Lists the given pools along with a health status and space usage.
+If no
+.Ar pool Ns s
+are specified, all pools in the system are listed.
+When given an
+.Ar interval ,
+the information is printed every
+.Ar interval
+seconds until ^C is pressed.
+If
+.Ar count
+is specified, the command exits after
+.Ar count
+reports are printed.
+.Bl -tag -width Ds
+.It Fl g
+Display vdev GUIDs instead of the normal device names. These GUIDs
+can be used in place of device names for the zpool
+detach/offline/remove/replace commands.
+.It Fl H
+Scripted mode.
+Do not display headers, and separate fields by a single tab instead of arbitrary
+space.
+.It Fl o Ar property
+Comma-separated list of properties to display.
+See the
+.Xr zpoolprops 8
+manual page for a list of valid properties.
+The default list is
+.Cm name , size , allocated , free , checkpoint , expandsize , fragmentation ,
+.Cm capacity , dedupratio , health , altroot .
+.It Fl L
+Display real paths for vdevs resolving all symbolic links. This can
+be used to look up the current block device name regardless of the
+.Pa /dev/disk/
+path used to open it.
+.It Fl p
+Display numbers in parsable
+.Pq exact
+values.
+.It Fl P
+Display full paths for vdevs instead of only the last component of
+the path. This can be used in conjunction with the
+.Fl L
+flag.
+.It Fl T Sy u Ns | Ns Sy d
+Display a time stamp.
+Specify
+.Sy u
+for a printed representation of the internal representation of time.
+See
+.Xr time 2 .
+Specify
+.Sy d
+for standard date format.
+See
+.Xr date 1 .
+.It Fl v
+Verbose statistics.
+Reports usage statistics for individual vdevs within the pool, in addition to
+the pool-wide statistics.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-import 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool-offline.8 b/man/man8/zpool-offline.8
new file mode 100644
index 000000000000..cdcda141f5eb
--- /dev/null
+++ b/man/man8/zpool-offline.8
@@ -0,0 +1,89 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-OFFLINE 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm offline
+.Nd Take a physical device in a ZFS storage pool offline
+.Sh SYNOPSIS
+.Nm
+.Cm offline
+.Op Fl f
+.Op Fl t
+.Ar pool Ar device Ns ...
+.Nm
+.Cm online
+.Op Fl e
+.Ar pool Ar device Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm offline
+.Op Fl f
+.Op Fl t
+.Ar pool Ar device Ns ...
+.Xc
+Takes the specified physical device offline.
+While the
+.Ar device
+is offline, no attempt is made to read or write to the device.
+This command is not applicable to spares.
+.Bl -tag -width Ds
+.It Fl f
+Force fault. Instead of offlining the disk, put it into a faulted
+state. The fault will persist across imports unless the
+.Fl t
+flag was specified.
+.It Fl t
+Temporary.
+Upon reboot, the specified physical device reverts to its previous state.
+.El
+.It Xo
+.Nm
+.Cm online
+.Op Fl e
+.Ar pool Ar device Ns ...
+.Xc
+Brings the specified physical device online.
+This command is not applicable to spares.
+.Bl -tag -width Ds
+.It Fl e
+Expand the device to use all available space.
+If the device is part of a mirror or raidz, then all devices must be expanded
+before the new space will become available to the pool.
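+.Pp
+For example, a disk might be taken offline before maintenance and brought
+back online with expansion afterwards; the pool and device names below are
+only placeholders:
+.Bd -literal
+# zpool offline tank sda
+# zpool online -e tank sda
+.Ed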
+.El +.El +.Sh SEE ALSO +.Xr zpool-detach 8 , +.Xr zpool-remove 8 , +.Xr zpool-reopen 8 , +.Xr zpool-resilver 8 diff --git a/man/man8/zpool-online.8 b/man/man8/zpool-online.8 new file mode 120000 index 000000000000..537e00e1c4b0 --- /dev/null +++ b/man/man8/zpool-online.8 @@ -0,0 +1 @@ +zpool-offline.8 \ No newline at end of file diff --git a/man/man8/zpool-reguid.8 b/man/man8/zpool-reguid.8 new file mode 100644 index 000000000000..e73f421ade78 --- /dev/null +++ b/man/man8/zpool-reguid.8 @@ -0,0 +1,53 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-REGUID 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm reguid +.Nd Generate a new unique identifier for a ZFS storage pool +.Sh SYNOPSIS +.Nm +.Cm reguid +.Ar pool +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm reguid +.Ar pool +.Xc +Generates a new unique identifier for the pool. +You must ensure that all devices in this pool are online and healthy before +performing this action. +.El +.Sh SEE ALSO +.Xr zpool-export 8 , +.Xr zpool-import 8 diff --git a/man/man8/zpool-remove.8 b/man/man8/zpool-remove.8 new file mode 100644 index 000000000000..055f852fc7cb --- /dev/null +++ b/man/man8/zpool-remove.8 @@ -0,0 +1,109 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. 
All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-REMOVE 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm remove
+.Nd Remove a device from a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm remove
+.Op Fl npw
+.Ar pool Ar device Ns ...
+.Nm
+.Cm remove
+.Fl s
+.Ar pool
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm remove
+.Op Fl npw
+.Ar pool Ar device Ns ...
+.Xc
+Removes the specified device from the pool.
+This command supports removing hot spare, cache, log, and both mirrored and
+non-redundant primary top-level vdevs, including dedup and special vdevs.
+When the primary pool storage includes a top-level raidz vdev, only hot spare,
+cache, and log devices can be removed.
+.Pp
+Removing a top-level vdev reduces the total amount of space in the storage
+pool.
+The specified device will be evacuated by copying all allocated space from it
+to the other devices in the pool.
+In this case, the
+.Nm zpool Cm remove
+command initiates the removal and returns, while the evacuation continues in
+the background.
+The removal progress can be monitored with
+.Nm zpool Cm status .
+If an IO error is encountered during the removal process, it will be
+cancelled.
+The
+.Sy device_removal
+feature flag must be enabled to remove a top-level vdev; see
+.Xr zpool-features 5 .
+.Pp
+A mirrored top-level device (log or data) can be removed by specifying the
+top-level mirror itself.
+Non-log devices or data devices that are part of a mirrored configuration can
+be removed using the
+.Nm zpool Cm detach
+command.
+.Bl -tag -width Ds
+.It Fl n
+Do not actually perform the removal ("no-op").
+Instead, print the estimated amount of memory that will be used by the
+mapping table after the removal completes.
+This is nonzero only for top-level vdevs.
+.It Fl p
+Used in conjunction with the
+.Fl n
+flag, displays numbers as parsable (exact) values.
+.It Fl w
+Waits until the removal has completed before returning.
+.El
+.It Xo
+.Nm
+.Cm remove
+.Fl s
+.Ar pool
+.Xc
+Stops and cancels an in-progress removal of a top-level vdev.
+.El
+.Sh SEE ALSO
+.Xr zpool-add 8 ,
+.Xr zpool-detach 8 ,
+.Xr zpool-labelclear 8 ,
+.Xr zpool-offline 8 ,
+.Xr zpool-replace 8 ,
+.Xr zpool-split 8
diff --git a/man/man8/zpool-reopen.8 b/man/man8/zpool-reopen.8
new file mode 100644
index 000000000000..206fedcc57d9
--- /dev/null
+++ b/man/man8/zpool-reopen.8
@@ -0,0 +1,55 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-REOPEN 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm reopen +.Nd Reopen all virtual devices (vdevs) associated with a ZFS storage pool +.Sh SYNOPSIS +.Nm +.Cm reopen +.Op Fl n +.Ar pool +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm reopen +.Op Fl n +.Ar pool +.Xc +Reopen all the vdevs associated with the pool. +.Bl -tag -width Ds +.It Fl n +Do not restart an in-progress scrub operation. This is not recommended and can +result in partially resilvered devices unless a second scrub is performed. +.El +.El diff --git a/man/man8/zpool-replace.8 b/man/man8/zpool-replace.8 new file mode 100644 index 000000000000..32eaf77c0c2c --- /dev/null +++ b/man/man8/zpool-replace.8 @@ -0,0 +1,105 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 15, 2020 +.Dt ZPOOL-REPLACE 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm replace +.Nd Replace one device with another in a ZFS storage pool +.Sh SYNOPSIS +.Nm +.Cm replace +.Op Fl fsw +.Oo Fl o Ar property Ns = Ns Ar value Oc +.Ar pool Ar device Op Ar new_device +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm replace +.Op Fl fsw +.Op Fl o Ar property Ns = Ns Ar value +.Ar pool Ar device Op Ar new_device +.Xc +Replaces +.Ar old_device +with +.Ar new_device . +This is equivalent to attaching +.Ar new_device , +waiting for it to resilver, and then detaching +.Ar old_device . +Any in progress scrub will be cancelled. +.Pp +The size of +.Ar new_device +must be greater than or equal to the minimum size of all the devices in a mirror +or raidz configuration. +.Pp +.Ar new_device +is required if the pool is not redundant. +If +.Ar new_device +is not specified, it defaults to +.Ar old_device . +This form of replacement is useful after an existing disk has failed and has +been physically replaced. +In this case, the new disk may have the same +.Pa /dev +path as the old device, even though it is actually a different disk. +ZFS recognizes this. +.Bl -tag -width Ds +.It Fl f +Forces use of +.Ar new_device , +even if it appears to be in use. +Not all devices can be overridden in this manner. 
+.It Fl o Ar property Ns = Ns Ar value
+Sets the given pool properties.
+See the
+.Xr zpoolprops 8
+manual page for a list of valid properties that can be set.
+The only property supported at the moment is
+.Sy ashift .
+.It Fl s
+The
+.Ar new_device
+is reconstructed sequentially to restore redundancy as quickly as possible.
+Checksums are not verified during sequential reconstruction, so a scrub is
+started when the resilver completes.
+Sequential reconstruction is not supported for raidz configurations.
+.It Fl w
+Waits until the replacement has completed before returning.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-detach 8 ,
+.Xr zpool-initialize 8 ,
+.Xr zpool-online 8 ,
+.Xr zpool-resilver 8
diff --git a/man/man8/zpool-resilver.8 b/man/man8/zpool-resilver.8
new file mode 100644
index 000000000000..a804bea4a3b8
--- /dev/null
+++ b/man/man8/zpool-resilver.8
@@ -0,0 +1,59 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-RESILVER 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm resilver
+.Nd Start a resilver of a device in a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm resilver
+.Ar pool Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm resilver
+.Ar pool Ns ...
+.Xc
+Starts a resilver.
+If an existing resilver is already running, it will be restarted from the
+beginning.
+Any drives that were scheduled for a deferred resilver will be added to the
+new one.
+This requires the
+.Sy resilver_defer
+feature.
+.El
+.Sh SEE ALSO
+.Xr zpool-iostat 8 ,
+.Xr zpool-online 8 ,
+.Xr zpool-reopen 8 ,
+.Xr zpool-replace 8 ,
+.Xr zpool-scrub 8 ,
+.Xr zpool-status 8
diff --git a/man/man8/zpool-scrub.8 b/man/man8/zpool-scrub.8
new file mode 100644
index 000000000000..a5624795544c
--- /dev/null
+++ b/man/man8/zpool-scrub.8
@@ -0,0 +1,105 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-SCRUB 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm scrub +.Nd Begin a scrub or resume a paused scrub of a ZFS storage pool +.Sh SYNOPSIS +.Nm +.Cm scrub +.Op Fl s | Fl p +.Op Fl w +.Ar pool Ns ... +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm scrub +.Op Fl s | Fl p +.Op Fl w +.Ar pool Ns ... +.Xc +Begins a scrub or resumes a paused scrub. +The scrub examines all data in the specified pools to verify that it checksums +correctly. +For replicated +.Pq mirror or raidz +devices, ZFS automatically repairs any damage discovered during the scrub. +The +.Nm zpool Cm status +command reports the progress of the scrub and summarizes the results of the +scrub upon completion. +.Pp +Scrubbing and resilvering are very similar operations. +The difference is that resilvering only examines data that ZFS knows to be out +of date +.Po +for example, when attaching a new device to a mirror or replacing an existing +device +.Pc , +whereas scrubbing examines all data to discover silent errors due to hardware +faults or disk failure. +.Pp +Because scrubbing and resilvering are I/O-intensive operations, ZFS only allows +one at a time. +If a scrub is paused, the +.Nm zpool Cm scrub +resumes it. +If a resilver is in progress, ZFS does not allow a scrub to be started until the +resilver completes. +.Pp +Note that, due to changes in pool data on a live system, it is possible for +scrubs to progress slightly beyond 100% completion. During this period, no +completion time estimate will be provided. +.Bl -tag -width Ds +.It Fl s +Stop scrubbing. +.El +.Bl -tag -width Ds +.It Fl p +Pause scrubbing. +Scrub pause state and progress are periodically synced to disk. +If the system is restarted or pool is exported during a paused scrub, +even after import, scrub will remain paused until it is resumed. +Once resumed the scrub will pick up from the place where it was last +checkpointed to disk. +To resume a paused scrub issue +.Nm zpool Cm scrub +again. +.It Fl w +Wait until scrub has completed before returning. +.El +.El +.Sh SEE ALSO +.Xr zpool-iostat 8 , +.Xr zpool-resilver 8 , +.Xr zpool-status 8 diff --git a/man/man8/zpool-set.8 b/man/man8/zpool-set.8 new file mode 120000 index 000000000000..2b8b8cfb7e1c --- /dev/null +++ b/man/man8/zpool-set.8 @@ -0,0 +1 @@ +zpool-get.8 \ No newline at end of file diff --git a/man/man8/zpool-split.8 b/man/man8/zpool-split.8 new file mode 100644 index 000000000000..d2943e6e374b --- /dev/null +++ b/man/man8/zpool-split.8 @@ -0,0 +1,124 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. 
+.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL-SPLIT 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm split +.Nd Split devices off a ZFS storage pool creating a new pool +.Sh SYNOPSIS +.Nm +.Cm split +.Op Fl gLlnP +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Op Fl R Ar root +.Ar pool newpool +.Oo Ar device Oc Ns ... +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm split +.Op Fl gLlnP +.Oo Fl o Ar property Ns = Ns Ar value Oc Ns ... +.Op Fl R Ar root +.Ar pool newpool +.Op Ar device ... +.Xc +Splits devices off +.Ar pool +creating +.Ar newpool . +All vdevs in +.Ar pool +must be mirrors and the pool must not be in the process of resilvering. +At the time of the split, +.Ar newpool +will be a replica of +.Ar pool . +By default, the +last device in each mirror is split from +.Ar pool +to create +.Ar newpool . +.Pp +The optional device specification causes the specified device(s) to be +included in the new +.Ar pool +and, should any devices remain unspecified, +the last device in each mirror is used as would be by default. +.Bl -tag -width Ds +.It Fl g +Display vdev GUIDs instead of the normal device names. These GUIDs +can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl L +Display real paths for vdevs resolving all symbolic links. This can +be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. +.It Fl l +Indicates that this command will request encryption keys for all encrypted +datasets it attempts to mount as it is bringing the new pool online. Note that +if any datasets have a +.Sy keylocation +of +.Sy prompt +this command will block waiting for the keys to be entered. Without this flag +encrypted datasets will be left unavailable until the keys are loaded. +.It Fl n +Do dry run, do not actually perform the split. +Print out the expected configuration of +.Ar newpool . +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. This can be used in conjunction with the +.Fl L +flag. +.It Fl o Ar property Ns = Ns Ar value +Sets the specified property for +.Ar newpool . +See the +.Xr zpoolprops +manual page for more information on the available pool properties. +.It Fl R Ar root +Set +.Sy altroot +for +.Ar newpool +to +.Ar root +and automatically import it. 
+.El +.El +.Sh SEE ALSO +.Xr zpool-import 8 , +.Xr zpool-list 8 , +.Xr zpool-remove 8 diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 new file mode 100644 index 000000000000..c2fe76c03812 --- /dev/null +++ b/man/man8/zpool-status.8 @@ -0,0 +1,138 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd May 15, 2020 +.Dt ZPOOL-STATUS 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm status +.Nd Display detailed health status for the given ZFS storage pools +.Sh SYNOPSIS +.Nm +.Cm status +.Oo Fl c Ar SCRIPT Oc +.Op Fl DigLpPstvx +.Op Fl T Sy u Ns | Ns Sy d +.Oo Ar pool Oc Ns ... +.Op Ar interval Op Ar count +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm status +.Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... +.Op Fl DigLpPstvx +.Op Fl T Sy u Ns | Ns Sy d +.Oo Ar pool Oc Ns ... +.Op Ar interval Op Ar count +.Xc +Displays the detailed health status for the given pools. +If no +.Ar pool +is specified, then the status of each pool in the system is displayed. +For more information on pool and device health, see the +.Em Device Failure and Recovery +section of +.Xr zpoolconcepts 8 . +.Pp +If a scrub or resilver is in progress, this command reports the percentage done +and the estimated time to completion. +Both of these are only approximate, because the amount of data in the pool and +the other workloads on the system can change. +.Bl -tag -width Ds +.It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns ... +Run a script (or scripts) on each vdev and include the output as a new column +in the +.Nm zpool Cm status +output. See the +.Fl c +option of +.Nm zpool Cm iostat +for complete details. +.It Fl i +Display vdev initialization status. +.It Fl g +Display vdev GUIDs instead of the normal device names. These GUIDs +can be used in place of device names for the zpool +detach/offline/remove/replace commands. +.It Fl L +Display real paths for vdevs resolving all symbolic links. This can +be used to look up the current block device name regardless of the +.Pa /dev/disk/ +path used to open it. +.It Fl p +Display numbers in parsable (exact) values. +.It Fl P +Display full paths for vdevs instead of only the last component of +the path. This can be used in conjunction with the +.Fl L +flag. 
+.It Fl D
+Display a histogram of deduplication statistics, showing the allocated
+.Pq physically present on disk
+and referenced
+.Pq logically referenced in the pool
+block counts and sizes by reference count.
+.It Fl s
+Display the number of leaf VDEV slow IOs.
+This is the number of IOs that did not complete in
+.Sy zio_slow_io_ms
+milliseconds (default 30 seconds).
+This does not necessarily mean the IOs failed to complete, just that they took
+an unreasonably long amount of time.
+This may indicate a problem with the underlying storage.
+.It Fl t
+Display vdev TRIM status.
+.It Fl T Sy u Ns | Ns Sy d
+Display a time stamp.
+Specify
+.Sy u
+for a printed representation of the internal representation of time.
+See
+.Xr time 2 .
+Specify
+.Sy d
+for standard date format.
+See
+.Xr date 1 .
+.It Fl v
+Displays verbose data error information, printing out a complete list of all
+data errors since the last complete pool scrub.
+.It Fl x
+Only display status for pools that are exhibiting errors or are otherwise
+unavailable.
+Warnings about pools not using the latest on-disk format will not be included.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-events 8 ,
+.Xr zpool-history 8 ,
+.Xr zpool-iostat 8 ,
+.Xr zpool-list 8 ,
+.Xr zpool-resilver 8 ,
+.Xr zpool-scrub 8 ,
+.Xr zpool-wait 8
diff --git a/man/man8/zpool-sync.8 b/man/man8/zpool-sync.8
new file mode 100644
index 000000000000..098fdde9e5c9
--- /dev/null
+++ b/man/man8/zpool-sync.8
@@ -0,0 +1,57 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-SYNC 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm sync
+.Nd Force data to be written to primary storage of a ZFS storage pool and update reporting data
+.Sh SYNOPSIS
+.Nm
+.Cm sync
+.Oo Ar pool Oc Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm sync
+.Op Ar pool ...
+.Xc
+This command forces all in-core dirty data to be written to the primary
+pool storage and not the ZIL.
+It will also update administrative information, including quota reporting.
+Without arguments,
+.Nm zpool Cm sync
+will sync all pools on the system.
+Otherwise, it will sync only the specified pool(s).
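+.Pp
+For example, dirty data in a single pool might be forced out to stable
+storage as follows; the pool name below is only a placeholder:
+.Bd -literal
+# zpool sync tank
+.Ed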
+.El
+.Sh SEE ALSO
+.Xr zpool-export 8 ,
+.Xr zpool-iostat 8 ,
+.Xr zpoolconcepts 8
diff --git a/man/man8/zpool-trim.8 b/man/man8/zpool-trim.8
new file mode 100644
index 000000000000..eef58aae5da6
--- /dev/null
+++ b/man/man8/zpool-trim.8
@@ -0,0 +1,94 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd February 25, 2020
+.Dt ZPOOL-TRIM 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm trim
+.Nd Initiate immediate TRIM operations for all free space in a ZFS storage pool
+.Sh SYNOPSIS
+.Nm
+.Cm trim
+.Op Fl dw
+.Op Fl r Ar rate
+.Op Fl c | Fl s
+.Ar pool
+.Op Ar device Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm trim
+.Op Fl dw
+.Op Fl r Ar rate
+.Op Fl c | Fl s
+.Ar pool
+.Op Ar device Ns ...
+.Xc
+Initiates an immediate on-demand TRIM operation for all of the free space in
+a pool. This operation informs the underlying storage devices of all blocks
+in the pool which are no longer allocated and allows thinly provisioned
+devices to reclaim the space.
+.Pp
+A manual on-demand TRIM operation can be initiated irrespective of the
+.Sy autotrim
+pool property setting. See the documentation for the
+.Sy autotrim
+property in the
+.Xr zpoolprops 8
+manual page for the types of vdev devices which can be trimmed.
+.Bl -tag -width Ds
+.It Fl d, -secure
+Causes a secure TRIM to be initiated. When performing a secure TRIM, the
+device guarantees that data stored on the trimmed blocks has been erased.
+This requires support from the device and is not supported by all SSDs.
+.It Fl r, -rate Ar rate
+Controls the rate at which the TRIM operation progresses. Without this
+option, TRIM is executed as quickly as possible. The rate, expressed in bytes
+per second, is applied on a per-vdev basis and may be set differently for
+each leaf vdev.
+.It Fl c, -cancel
+Cancel trimming on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+trimmed, the command will fail and no cancellation will occur on any device.
+.It Fl s, -suspend
+Suspend trimming on the specified devices, or all eligible devices if none
+are specified.
+If one or more target devices are invalid or are not currently being
+trimmed, the command will fail and no suspension will occur on any device.
+Trimming can then be resumed by running
+.Nm zpool Cm trim
+with no flags on the relevant target devices.
+.It Fl w, -wait
+Wait until the devices are done being trimmed before returning.
+.El
+.El
+.Sh SEE ALSO
+.Xr zpool-initialize 8 ,
+.Xr zpool-wait 8 ,
+.Xr zpoolprops 8
diff --git a/man/man8/zpool-upgrade.8 b/man/man8/zpool-upgrade.8
new file mode 100644
index 000000000000..d23d0e0e6d0c
--- /dev/null
+++ b/man/man8/zpool-upgrade.8
@@ -0,0 +1,96 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
+.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved.
+.\" Copyright (c) 2017 Datto Inc.
+.\" Copyright (c) 2018 George Melikov. All Rights Reserved.
+.\" Copyright 2017 Nexenta Systems, Inc.
+.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
+.\"
+.Dd August 9, 2019
+.Dt ZPOOL-UPGRADE 8
+.Os
+.Sh NAME
+.Nm zpool Ns Pf - Cm upgrade
+.Nd Manage version and feature flags of ZFS storage pools
+.Sh SYNOPSIS
+.Nm
+.Cm upgrade
+.Nm
+.Cm upgrade
+.Fl v
+.Nm
+.Cm upgrade
+.Op Fl V Ar version
+.Fl a Ns | Ns Ar pool Ns ...
+.Sh DESCRIPTION
+.Bl -tag -width Ds
+.It Xo
+.Nm
+.Cm upgrade
+.Xc
+Displays pools which do not have all supported features enabled and pools
+formatted using a legacy ZFS version number.
+These pools can continue to be used, but some features may not be available.
+Use
+.Nm zpool Cm upgrade Fl a
+to enable all features on all pools.
+.It Xo
+.Nm
+.Cm upgrade
+.Fl v
+.Xc
+Displays legacy ZFS versions supported by the current software.
+See
+.Xr zpool-features 5
+for a description of the feature flags supported by the current software.
+.It Xo
+.Nm
+.Cm upgrade
+.Op Fl V Ar version
+.Fl a Ns | Ns Ar pool Ns ...
+.Xc
+Enables all supported features on the given pool.
+Once this is done, the pool will no longer be accessible on systems that do not
+support feature flags.
+See
+.Xr zpool-features 5
+for details on compatibility with systems that support feature flags but do not
+support all features enabled on the pool.
+.Bl -tag -width Ds
+.It Fl a
+Enables all supported features on all pools.
+.It Fl V Ar version
+Upgrade to the specified legacy version.
+If the
+.Fl V
+flag is specified, no features will be enabled on the pool.
+This option can only be used to increase the version number up to the last
+supported legacy version number.
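+.Pp
+For example, a pool might be upgraded to a specific legacy version without
+enabling feature flags; the pool name below is only a placeholder:
+.Bd -literal
+# zpool upgrade -V 28 tank
+.Ed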
+.El +.El +.Sh SEE ALSO +.Xr zpool-features 5 , +.Xr zpoolconcepts 8 , +.Xr zpoolprops 8 , +.Xr zpool-history 8 diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 new file mode 100644 index 000000000000..a3bdba669e53 --- /dev/null +++ b/man/man8/zpool-wait.8 @@ -0,0 +1,114 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd February 25, 2020 +.Dt ZPOOL-WAIT 8 +.Os +.Sh NAME +.Nm zpool Ns Pf - Cm wait +.Nd Wait for background activity to stop in a ZFS storage pool +.Sh SYNOPSIS +.Nm +.Cm wait +.Op Fl Hp +.Op Fl T Sy u Ns | Ns Sy d +.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ... +.Ar pool +.Op Ar interval +.Sh DESCRIPTION +.Bl -tag -width Ds +.It Xo +.Nm +.Cm wait +.Op Fl Hp +.Op Fl T Sy u Ns | Ns Sy d +.Op Fl t Ar activity Ns Oo , Ns Ar activity Ns Oc Ns ... +.Ar pool +.Op Ar interval +.Xc +Waits until all background activity of the given types has ceased in the given +pool. +The activity could cease because it has completed, or because it has been +paused or canceled by a user, or because the pool has been exported or +destroyed. +If no activities are specified, the command waits until background activity of +every type listed below has ceased. +If there is no activity of the given types in progress, the command returns +immediately. +.Pp +These are the possible values for +.Ar activity , +along with what each one waits for: +.Bd -literal + discard Checkpoint to be discarded + free 'freeing' property to become 0 + initialize All initializations to cease + replace All device replacements to cease + remove Device removal to cease + resilver Resilver to cease + scrub Scrub to cease + trim Manual trim to cease +.Ed +.Pp +If an +.Ar interval +is provided, the amount of work remaining, in bytes, for each activity is +printed every +.Ar interval +seconds. +.Bl -tag -width Ds +.It Fl H +Scripted mode. +Do not display headers, and separate fields by a single tab instead of arbitrary +space. +.It Fl p +Display numbers in parsable (exact) values. +.It Fl T Sy u Ns | Ns Sy d +Display a time stamp. +Specify +.Sy u +for a printed representation of the internal representation of time. +See +.Xr time 2 . +Specify +.Sy d +for standard date format. +See +.Xr date 1 . 
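+.Pp
+For example, assuming a pool named
+.Em tank ,
+the remaining scrub work might be printed every 5 seconds with a
+standard date time stamp:
+.Bd -literal
+# zpool wait -T d -t scrub tank 5
+.Ed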
+.El +.El +.Sh SEE ALSO +.Xr zpool-status 8 , +.Xr zpool-checkpoint 8 , +.Xr zpool-initialize 8 , +.Xr zpool-replace 8 , +.Xr zpool-remove 8 , +.Xr zpool-resilver 8 , +.Xr zpool-scrub 8 , +.Xr zpool-trim 8 diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 new file mode 100644 index 000000000000..0fe6866f33da --- /dev/null +++ b/man/man8/zpool.8 @@ -0,0 +1,568 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOL 8 +.Os +.Sh NAME +.Nm zpool +.Nd configure ZFS storage pools +.Sh SYNOPSIS +.Nm +.Fl ?V +.Nm +.Cm version +.Nm +.Cm +.Op Ar +.Sh DESCRIPTION +The +.Nm +command configures ZFS storage pools. +A storage pool is a collection of devices that provides physical storage and +data replication for ZFS datasets. +All datasets within a storage pool share the same space. +See +.Xr zfs 8 +for information on managing datasets. +.Pp +For an overview of creating and managing ZFS storage pools see the +.Xr zpoolconcepts 8 +manual page. +.Sh SUBCOMMANDS +All subcommands that modify state are logged persistently to the pool in their +original form. +.Pp +The +.Nm +command provides subcommands to create and destroy storage pools, add capacity +to storage pools, and provide information about the storage pools. +The following subcommands are supported: +.Bl -tag -width Ds +.It Xo +.Nm +.Fl ? +.Xc +Displays a help message. +.It Xo +.Nm +.Fl V, -version +.Xc +An alias for the +.Nm zpool Cm version +subcommand. +.It Xo +.Nm +.Cm version +.Xc +Displays the software version of the +.Nm +userland utility and the zfs kernel module. +.El +.Ss Creation +.Bl -tag -width Ds +.It Xr zpool-create 8 +Creates a new storage pool containing the virtual devices specified on the +command line. +.It Xr zpool-initialize 8 +Begins initializing by writing to all unallocated regions on the specified +devices, or all eligible devices in the pool if no individual devices are +specified. +.El +.Ss Destruction +.Bl -tag -width Ds +.It Xr zpool-destroy 8 +Destroys the given pool, freeing up any devices for other use. +.It Xr zpool-labelclear 8 +Removes ZFS label information from the specified +.Ar device . 
+.El
+.Ss Virtual Devices
+.Bl -tag -width Ds
+.It Xo
+.Xr zpool-attach 8 /
+.Xr zpool-detach 8
+.Xc
+Increases or decreases redundancy by
+.Cm attach Ns -ing or
+.Cm detach Ns -ing a device on an existing vdev (virtual device).
+.It Xo
+.Xr zpool-add 8 /
+.Xr zpool-remove 8
+.Xc
+Adds the specified virtual devices to the given pool,
+or removes the specified device from the pool.
+.It Xr zpool-replace 8
+Replaces an existing device (which may be faulted) with a new one.
+.It Xr zpool-split 8
+Creates a new pool by splitting all mirrors in an existing pool (which decreases its redundancy).
+.El
+.Ss Properties
+Available pool properties are listed in the
+.Xr zpoolprops 8
+manual page.
+.Bl -tag -width Ds
+.It Xr zpool-list 8
+Lists the given pools along with a health status and space usage.
+.It Xo
+.Xr zpool-get 8 /
+.Xr zpool-set 8
+.Xc
+Retrieves the given list of properties
+.Po
+or all properties if
+.Sy all
+is used
+.Pc
+for the specified storage pool(s).
+.El
+.Ss Monitoring
+.Bl -tag -width Ds
+.It Xr zpool-status 8
+Displays the detailed health status for the given pools.
+.It Xr zpool-iostat 8
+Displays logical I/O statistics for the given pools/vdevs. Physical I/Os may
+be observed via
+.Xr iostat 1 .
+.It Xr zpool-events 8
+Lists all recent events generated by the ZFS kernel modules. These events
+are consumed by the
+.Xr zed 8
+daemon and used to automate administrative tasks such as replacing a failed
+device with a hot spare. For more information about the subclasses and event
+payloads that can be generated, see the
+.Xr zfs-events 5
+man page.
+.It Xr zpool-history 8
+Displays the command history of the specified pool(s) or all pools if no pool is
+specified.
+.El
+.Ss Maintenance
+.Bl -tag -width Ds
+.It Xr zpool-scrub 8
+Begins a scrub or resumes a paused scrub.
+.It Xr zpool-checkpoint 8
+Checkpoints the current state of
+.Ar pool ,
+which can later be restored by
+.Nm zpool Cm import --rewind-to-checkpoint .
+.It Xr zpool-trim 8
+Initiates an immediate on-demand TRIM operation for all of the free space in
+a pool. This operation informs the underlying storage devices of all blocks
+in the pool which are no longer allocated and allows thinly provisioned
+devices to reclaim the space.
+.It Xr zpool-sync 8
+Forces all in-core dirty data to be written to the primary
+pool storage and not the ZIL. It will also update administrative
+information including quota reporting. Without arguments,
+.Sy zpool sync
+will sync all pools on the system. Otherwise, it will sync only the
+specified pool(s).
+.It Xr zpool-upgrade 8
+Manages the on-disk format version of storage pools.
+.It Xr zpool-wait 8
+Waits until all background activity of the given types has ceased in the given
+pool.
+.El
+.Ss Fault Resolution
+.Bl -tag -width Ds
+.It Xo
+.Xr zpool-offline 8 /
+.Xr zpool-online 8
+.Xc
+Takes the specified physical device offline or brings it online.
+.It Xr zpool-resilver 8
+Starts a resilver. If an existing resilver is already running, it will be
+restarted from the beginning.
+.It Xr zpool-reopen 8
+Reopens all the vdevs associated with the pool.
+.It Xr zpool-clear 8
+Clears device errors in a pool.
+.El
+.Ss Import & Export
+.Bl -tag -width Ds
+.It Xr zpool-import 8
+Makes disks containing ZFS storage pools available for use on the system.
+.It Xr zpool-export 8
+Exports the given pools from the system.
+.It Xr zpool-reguid 8
+Generates a new unique identifier for the pool.
+.El +.Sh EXIT STATUS +The following exit values are returned: +.Bl -tag -width Ds +.It Sy 0 +Successful completion. +.It Sy 1 +An error occurred. +.It Sy 2 +Invalid command line options were specified. +.El +.Sh EXAMPLES +.Bl -tag -width Ds +.It Sy Example 1 No Creating a RAID-Z Storage Pool +The following command creates a pool with a single raidz root vdev that +consists of six disks. +.Bd -literal +# zpool create tank raidz sda sdb sdc sdd sde sdf +.Ed +.It Sy Example 2 No Creating a Mirrored Storage Pool +The following command creates a pool with two mirrors, where each mirror +contains two disks. +.Bd -literal +# zpool create tank mirror sda sdb mirror sdc sdd +.Ed +.It Sy Example 3 No Creating a ZFS Storage Pool by Using Partitions +The following command creates an unmirrored pool using two disk partitions. +.Bd -literal +# zpool create tank sda1 sdb2 +.Ed +.It Sy Example 4 No Creating a ZFS Storage Pool by Using Files +The following command creates an unmirrored pool using files. +While not recommended, a pool based on files can be useful for experimental +purposes. +.Bd -literal +# zpool create tank /path/to/file/a /path/to/file/b +.Ed +.It Sy Example 5 No Adding a Mirror to a ZFS Storage Pool +The following command adds two mirrored disks to the pool +.Em tank , +assuming the pool is already made up of two-way mirrors. +The additional space is immediately available to any datasets within the pool. +.Bd -literal +# zpool add tank mirror sda sdb +.Ed +.It Sy Example 6 No Listing Available ZFS Storage Pools +The following command lists all available pools on the system. +In this case, the pool +.Em zion +is faulted due to a missing device. +The results from this command are similar to the following: +.Bd -literal +# zpool list +NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT +rpool 19.9G 8.43G 11.4G - 33% 42% 1.00x ONLINE - +tank 61.5G 20.0G 41.5G - 48% 32% 1.00x ONLINE - +zion - - - - - - - FAULTED - +.Ed +.It Sy Example 7 No Destroying a ZFS Storage Pool +The following command destroys the pool +.Em tank +and any datasets contained within. +.Bd -literal +# zpool destroy -f tank +.Ed +.It Sy Example 8 No Exporting a ZFS Storage Pool +The following command exports the devices in pool +.Em tank +so that they can be relocated or later imported. +.Bd -literal +# zpool export tank +.Ed +.It Sy Example 9 No Importing a ZFS Storage Pool +The following command displays available pools, and then imports the pool +.Em tank +for use on the system. +The results from this command are similar to the following: +.Bd -literal +# zpool import + pool: tank + id: 15451357997522795478 + state: ONLINE +action: The pool can be imported using its name or numeric identifier. +config: + + tank ONLINE + mirror ONLINE + sda ONLINE + sdb ONLINE + +# zpool import tank +.Ed +.It Sy Example 10 No Upgrading All ZFS Storage Pools to the Current Version +The following command upgrades all ZFS Storage pools to the current version of +the software. +.Bd -literal +# zpool upgrade -a +This system is currently running ZFS version 2. +.Ed +.It Sy Example 11 No Managing Hot Spares +The following command creates a new pool with an available hot spare: +.Bd -literal +# zpool create tank mirror sda sdb spare sdc +.Ed +.Pp +If one of the disks were to fail, the pool would be reduced to the degraded +state. 
+The failed device can be replaced using the following command: +.Bd -literal +# zpool replace tank sda sdd +.Ed +.Pp +Once the data has been resilvered, the spare is automatically removed and is +made available for use should another device fail. +The hot spare can be permanently removed from the pool using the following +command: +.Bd -literal +# zpool remove tank sdc +.Ed +.It Sy Example 12 No Creating a ZFS Pool with Mirrored Separate Intent Logs +The following command creates a ZFS storage pool consisting of two, two-way +mirrors and mirrored log devices: +.Bd -literal +# zpool create pool mirror sda sdb mirror sdc sdd log mirror \\ + sde sdf +.Ed +.It Sy Example 13 No Adding Cache Devices to a ZFS Pool +The following command adds two disks for use as cache devices to a ZFS storage +pool: +.Bd -literal +# zpool add pool cache sdc sdd +.Ed +.Pp +Once added, the cache devices gradually fill with content from main memory. +Depending on the size of your cache devices, it could take over an hour for +them to fill. +Capacity and reads can be monitored using the +.Cm iostat +option as follows: +.Bd -literal +# zpool iostat -v pool 5 +.Ed +.It Sy Example 14 No Removing a Mirrored top-level (Log or Data) Device +The following commands remove the mirrored log device +.Sy mirror-2 +and mirrored top-level data device +.Sy mirror-1 . +.Pp +Given this configuration: +.Bd -literal + pool: tank + state: ONLINE + scrub: none requested +config: + + NAME STATE READ WRITE CKSUM + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + sda ONLINE 0 0 0 + sdb ONLINE 0 0 0 + mirror-1 ONLINE 0 0 0 + sdc ONLINE 0 0 0 + sdd ONLINE 0 0 0 + logs + mirror-2 ONLINE 0 0 0 + sde ONLINE 0 0 0 + sdf ONLINE 0 0 0 +.Ed +.Pp +The command to remove the mirrored log +.Sy mirror-2 +is: +.Bd -literal +# zpool remove tank mirror-2 +.Ed +.Pp +The command to remove the mirrored data +.Sy mirror-1 +is: +.Bd -literal +# zpool remove tank mirror-1 +.Ed +.It Sy Example 15 No Displaying expanded space on a device +The following command displays the detailed information for the pool +.Em data . +This pool is comprised of a single raidz vdev where one of its devices +increased its capacity by 10GB. +In this example, the pool will not be able to utilize this extra capacity until +all the devices under the raidz vdev have been expanded. +.Bd -literal +# zpool list -v data +NAME SIZE ALLOC FREE EXPANDSZ FRAG CAP DEDUP HEALTH ALTROOT +data 23.9G 14.6G 9.30G - 48% 61% 1.00x ONLINE - + raidz1 23.9G 14.6G 9.30G - 48% + sda - - - - - + sdb - - - 10G - + sdc - - - - - +.Ed +.It Sy Example 16 No Adding output columns +Additional columns can be added to the +.Nm zpool Cm status +and +.Nm zpool Cm iostat +output with +.Fl c +option. 
+.Bd -literal +# zpool status -c vendor,model,size + NAME STATE READ WRITE CKSUM vendor model size + tank ONLINE 0 0 0 + mirror-0 ONLINE 0 0 0 + U1 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T + U10 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T + U11 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T + U12 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T + U13 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T + U14 ONLINE 0 0 0 SEAGATE ST8000NM0075 7.3T + +# zpool iostat -vc size + capacity operations bandwidth +pool alloc free read write read write size +---------- ----- ----- ----- ----- ----- ----- ---- +rpool 14.6G 54.9G 4 55 250K 2.69M + sda1 14.6G 54.9G 4 55 250K 2.69M 70G +---------- ----- ----- ----- ----- ----- ----- ---- +.Ed +.El +.Sh ENVIRONMENT VARIABLES +.Bl -tag -width "ZFS_ABORT" +.It Ev ZFS_ABORT +Cause +.Nm zpool +to dump core on exit for the purposes of running +.Sy ::findleaks . +.El +.Bl -tag -width "ZFS_COLOR" +.It Ev ZFS_COLOR +Use ANSI color in +.Nm zpool status +output. +.El +.Bl -tag -width "ZPOOL_IMPORT_PATH" +.It Ev ZPOOL_IMPORT_PATH +The search path for devices or files to use with the pool. This is a colon-separated list of directories in which +.Nm zpool +looks for device nodes and files. +Similar to the +.Fl d +option in +.Nm zpool import . +.El +.Bl -tag -width "ZPOOL_IMPORT_UDEV_TIMEOUT_MS" +.It Ev ZPOOL_IMPORT_UDEV_TIMEOUT_MS +The maximum time in milliseconds that +.Nm zpool import +will wait for an expected device to be available. +.El +.Bl -tag -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE" +.It Ev ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE +If set, suppress warning about non-native vdev ashift in +.Nm zpool status . +The value is not used, only the presence or absence of the variable matters. +.El +.Bl -tag -width "ZPOOL_VDEV_NAME_GUID" +.It Ev ZPOOL_VDEV_NAME_GUID +Cause +.Nm zpool +subcommands to output vdev guids by default. This behavior is identical to the +.Nm zpool status -g +command line option. +.El +.Bl -tag -width "ZPOOL_VDEV_NAME_FOLLOW_LINKS" +.It Ev ZPOOL_VDEV_NAME_FOLLOW_LINKS +Cause +.Nm zpool +subcommands to follow links for vdev names by default. This behavior is identical to the +.Nm zpool status -L +command line option. +.El +.Bl -tag -width "ZPOOL_VDEV_NAME_PATH" +.It Ev ZPOOL_VDEV_NAME_PATH +Cause +.Nm zpool +subcommands to output full vdev path names by default. This +behavior is identical to the +.Nm zpool status -P +command line option. +.El +.Bl -tag -width "ZFS_VDEV_DEVID_OPT_OUT" +.It Ev ZFS_VDEV_DEVID_OPT_OUT +Older ZFS on Linux implementations had issues when attempting to display pool +config VDEV names if a +.Sy devid +NVP value is present in the pool's config. +.Pp +For example, a pool that originated on illumos platform would have a devid +value in the config and +.Nm zpool status +would fail when listing the config. +This would also be true for future Linux based pools. +.Pp +A pool can be stripped of any +.Sy devid +values on import or prevented from adding +them on +.Nm zpool create +or +.Nm zpool add +by setting +.Sy ZFS_VDEV_DEVID_OPT_OUT . +.El +.Bl -tag -width "ZPOOL_SCRIPTS_AS_ROOT" +.It Ev ZPOOL_SCRIPTS_AS_ROOT +Allow a privileged user to run the +.Nm zpool status/iostat +with the +.Fl c +option. Normally, only unprivileged users are allowed to run +.Fl c . +.El +.Bl -tag -width "ZPOOL_SCRIPTS_PATH" +.It Ev ZPOOL_SCRIPTS_PATH +The search path for scripts when running +.Nm zpool status/iostat +with the +.Fl c +option. This is a colon-separated list of directories and overrides the default +.Pa ~/.zpool.d +and +.Pa /etc/zfs/zpool.d +search paths. 
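+.Pp
+For example, assuming scripts in a hypothetical
+.Pa /opt/zpool.d
+directory, only that directory would be searched:
+.Bd -literal
+# ZPOOL_SCRIPTS_PATH=/opt/zpool.d zpool status -c size
+.Ed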
+.El +.Bl -tag -width "ZPOOL_SCRIPTS_ENABLED" +.It Ev ZPOOL_SCRIPTS_ENABLED +Allow a user to run +.Nm zpool status/iostat +with the +.Fl c +option. If +.Sy ZPOOL_SCRIPTS_ENABLED +is not set, it is assumed that the user is allowed to run +.Nm zpool status/iostat -c . +.El +.Sh INTERFACE STABILITY +.Sy Evolving +.Sh SEE ALSO +.Xr zpoolconcepts 8 , +.Xr zpoolprops 8 , +.Xr zfs-events 5 , +.Xr zfs-module-parameters 5 , +.Xr zpool-features 5 , +.Xr zed 8 , +.Xr zfs 8 diff --git a/man/man8/zpoolconcepts.8 b/man/man8/zpoolconcepts.8 new file mode 100644 index 000000000000..f9c262f4be42 --- /dev/null +++ b/man/man8/zpoolconcepts.8 @@ -0,0 +1,414 @@ +.\" +.\" CDDL HEADER START +.\" +.\" The contents of this file are subject to the terms of the +.\" Common Development and Distribution License (the "License"). +.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOLCONCEPTS 8 +.Os +.Sh NAME +.Nm zpoolconcepts +.Nd overview of ZFS storage pools +.Sh DESCRIPTION +.Ss Virtual Devices (vdevs) +A "virtual device" describes a single device or a collection of devices +organized according to certain performance and fault characteristics. +The following virtual devices are supported: +.Bl -tag -width Ds +.It Sy disk +A block device, typically located under +.Pa /dev . +ZFS can use individual slices or partitions, though the recommended mode of +operation is to use whole disks. +A disk can be specified by a full path, or it can be a shorthand name +.Po the relative portion of the path under +.Pa /dev +.Pc . +A whole disk can be specified by omitting the slice or partition designation. +For example, +.Pa sda +is equivalent to +.Pa /dev/sda . +When given a whole disk, ZFS automatically labels the disk, if necessary. +.It Sy file +A regular file. +The use of files as a backing store is strongly discouraged. +It is designed primarily for experimental purposes, as the fault tolerance of a +file is only as good as the file system of which it is a part. +A file must be specified by a full path. +.It Sy mirror +A mirror of two or more devices. +Data is replicated in an identical fashion across all components of a mirror. +A mirror with N disks of size X can hold X bytes and can withstand (N-1) devices +failing before data integrity is compromised. +.It Sy raidz , raidz1 , raidz2 , raidz3 +A variation on RAID-5 that allows for better distribution of parity and +eliminates the RAID-5 +.Qq write hole +.Pq in which data and parity become inconsistent after a power loss . 
+Data and parity is striped across all disks within a raidz group. +.Pp +A raidz group can have single-, double-, or triple-parity, meaning that the +raidz group can sustain one, two, or three failures, respectively, without +losing any data. +The +.Sy raidz1 +vdev type specifies a single-parity raidz group; the +.Sy raidz2 +vdev type specifies a double-parity raidz group; and the +.Sy raidz3 +vdev type specifies a triple-parity raidz group. +The +.Sy raidz +vdev type is an alias for +.Sy raidz1 . +.Pp +A raidz group with N disks of size X with P parity disks can hold approximately +(N-P)*X bytes and can withstand P device(s) failing before data integrity is +compromised. +The minimum number of devices in a raidz group is one more than the number of +parity disks. +The recommended number is between 3 and 9 to help increase performance. +.It Sy spare +A pseudo-vdev which keeps track of available hot spares for a pool. +For more information, see the +.Sx Hot Spares +section. +.It Sy log +A separate intent log device. +If more than one log device is specified, then writes are load-balanced between +devices. +Log devices can be mirrored. +However, raidz vdev types are not supported for the intent log. +For more information, see the +.Sx Intent Log +section. +.It Sy dedup +A device dedicated solely for deduplication tables. +The redundancy of this device should match the redundancy of the other normal +devices in the pool. If more than one dedup device is specified, then +allocations are load-balanced between those devices. +.It Sy special +A device dedicated solely for allocating various kinds of internal metadata, +and optionally small file blocks. +The redundancy of this device should match the redundancy of the other normal +devices in the pool. If more than one special device is specified, then +allocations are load-balanced between those devices. +.Pp +For more information on special allocations, see the +.Sx Special Allocation Class +section. +.It Sy cache +A device used to cache storage pool data. +A cache device cannot be configured as a mirror or raidz group. +For more information, see the +.Sx Cache Devices +section. +.El +.Pp +Virtual devices cannot be nested, so a mirror or raidz virtual device can only +contain files or disks. +Mirrors of mirrors +.Pq or other combinations +are not allowed. +.Pp +A pool can have any number of virtual devices at the top of the configuration +.Po known as +.Qq root vdevs +.Pc . +Data is dynamically distributed across all top-level devices to balance data +among devices. +As new virtual devices are added, ZFS automatically places data on the newly +available devices. +.Pp +Virtual devices are specified one at a time on the command line, separated by +whitespace. +The keywords +.Sy mirror +and +.Sy raidz +are used to distinguish where a group ends and another begins. +For example, the following creates two root vdevs, each a mirror of two disks: +.Bd -literal +# zpool create mypool mirror sda sdb mirror sdc sdd +.Ed +.Ss Device Failure and Recovery +ZFS supports a rich set of mechanisms for handling device failure and data +corruption. +All metadata and data is checksummed, and ZFS automatically repairs bad data +from a good copy when corruption is detected. +.Pp +In order to take advantage of these features, a pool must make use of some form +of redundancy, using either mirrored or raidz groups. +While ZFS supports running in a non-redundant configuration, where each root +vdev is simply a disk or file, this is strongly discouraged. 
+A single case of bit corruption can render some or all of your data unavailable. +.Pp +A pool's health status is described by one of three states: online, degraded, +or faulted. +An online pool has all devices operating normally. +A degraded pool is one in which one or more devices have failed, but the data is +still available due to a redundant configuration. +A faulted pool has corrupted metadata, or one or more faulted devices, and +insufficient replicas to continue functioning. +.Pp +The health of the top-level vdev, such as mirror or raidz device, is +potentially impacted by the state of its associated vdevs, or component +devices. +A top-level vdev or component device is in one of the following states: +.Bl -tag -width "DEGRADED" +.It Sy DEGRADED +One or more top-level vdevs is in the degraded state because one or more +component devices are offline. +Sufficient replicas exist to continue functioning. +.Pp +One or more component devices is in the degraded or faulted state, but +sufficient replicas exist to continue functioning. +The underlying conditions are as follows: +.Bl -bullet +.It +The number of checksum errors exceeds acceptable levels and the device is +degraded as an indication that something may be wrong. +ZFS continues to use the device as necessary. +.It +The number of I/O errors exceeds acceptable levels. +The device could not be marked as faulted because there are insufficient +replicas to continue functioning. +.El +.It Sy FAULTED +One or more top-level vdevs is in the faulted state because one or more +component devices are offline. +Insufficient replicas exist to continue functioning. +.Pp +One or more component devices is in the faulted state, and insufficient +replicas exist to continue functioning. +The underlying conditions are as follows: +.Bl -bullet +.It +The device could be opened, but the contents did not match expected values. +.It +The number of I/O errors exceeds acceptable levels and the device is faulted to +prevent further use of the device. +.El +.It Sy OFFLINE +The device was explicitly taken offline by the +.Nm zpool Cm offline +command. +.It Sy ONLINE +The device is online and functioning. +.It Sy REMOVED +The device was physically removed while the system was running. +Device removal detection is hardware-dependent and may not be supported on all +platforms. +.It Sy UNAVAIL +The device could not be opened. +If a pool is imported when a device was unavailable, then the device will be +identified by a unique identifier instead of its path since the path was never +correct in the first place. +.El +.Pp +If a device is removed and later re-attached to the system, ZFS attempts +to put the device online automatically. +Device attach detection is hardware-dependent and might not be supported on all +platforms. +.Ss Hot Spares +ZFS allows devices to be associated with pools as +.Qq hot spares . +These devices are not actively used in the pool, but when an active device +fails, it is automatically replaced by a hot spare. +To create a pool with hot spares, specify a +.Sy spare +vdev with any number of devices. +For example, +.Bd -literal +# zpool create pool mirror sda sdb spare sdc sdd +.Ed +.Pp +Spares can be shared across multiple pools, and can be added with the +.Nm zpool Cm add +command and removed with the +.Nm zpool Cm remove +command. +Once a spare replacement is initiated, a new +.Sy spare +vdev is created within the configuration that will remain there until the +original device is replaced. 
+At this point, the hot spare becomes available again if another device fails.
+.Pp
+If a pool has a shared spare that is currently being used, the pool cannot be
+exported since other pools may use this shared spare, which may lead to
+potential data corruption.
+.Pp
+Shared spares add some risk. If the pools are imported on different hosts, and
+both pools suffer a device failure at the same time, both could attempt to use
+the spare at the same time. This may not be detected, resulting in data
+corruption.
+.Pp
+An in-progress spare replacement can be cancelled by detaching the hot spare.
+If the original faulted device is detached, then the hot spare assumes its
+place in the configuration, and is removed from the spare list of all active
+pools.
+.Pp
+Spares cannot replace log devices.
+.Ss Intent Log
+The ZFS Intent Log (ZIL) satisfies POSIX requirements for synchronous
+transactions.
+For instance, databases often require their transactions to be on stable storage
+devices when returning from a system call.
+NFS and other applications can also use
+.Xr fsync 2
+to ensure data stability.
+By default, the intent log is allocated from blocks within the main pool.
+However, it might be possible to get better performance using separate intent
+log devices such as NVRAM or a dedicated disk.
+For example:
+.Bd -literal
+# zpool create pool sda sdb log sdc
+.Ed
+.Pp
+Multiple log devices can also be specified, and they can be mirrored.
+See the
+.Sx EXAMPLES
+section for an example of mirroring multiple log devices.
+.Pp
+Log devices can be added, replaced, attached, detached, and removed. In
+addition, log devices are imported and exported as part of the pool
+that contains them.
+Mirrored devices can be removed by specifying the top-level mirror vdev.
+.Ss Cache Devices
+Devices can be added to a storage pool as
+.Qq cache devices .
+These devices provide an additional layer of caching between main memory and
+disk.
+For read-heavy workloads, where the working set size is much larger than what
+can be cached in main memory, using cache devices allows much more of this
+working set to be served from low latency media.
+Using cache devices provides the greatest performance improvement for random
+read workloads of mostly static content.
+.Pp
+To create a pool with cache devices, specify a
+.Sy cache
+vdev with any number of devices.
+For example:
+.Bd -literal
+# zpool create pool sda sdb cache sdc sdd
+.Ed
+.Pp
+Cache devices cannot be mirrored or part of a raidz configuration.
+If a read error is encountered on a cache device, that read I/O is reissued to
+the original storage pool device, which might be part of a mirrored or raidz
+configuration.
+.Pp
+The content of the cache devices is persistent across reboots and restored
+asynchronously when importing the pool in L2ARC (persistent L2ARC).
+This can be disabled by setting
+.Sy l2arc_rebuild_enabled = 0 .
+For cache devices smaller than 1GB we do not write the metadata structures
+required for rebuilding the L2ARC in order not to waste space. This can be
+changed with
+.Sy l2arc_rebuild_blocks_min_l2size .
+The cache device header (512 bytes) is updated even if no metadata structures
+are written. Setting
+.Sy l2arc_headroom = 0
+will result in scanning the full-length ARC lists for cacheable content to be
+written in L2ARC (persistent ARC).
+If a cache device is added with
+.Nm zpool Cm add
+its label and header will be overwritten and its contents are not going to be
+restored in L2ARC, even if the device was previously part of the pool. If a
+cache device is onlined with
+.Nm zpool Cm online
+its contents will be restored in L2ARC. This is useful in case of memory pressure
+where the contents of the cache device are not fully restored in L2ARC.
+The user can offline/online the cache device when there is less memory pressure
+in order to fully restore its contents to L2ARC.
+.Ss Pool checkpoint
+Before starting critical procedures that include destructive actions (e.g.
+.Nm zfs Cm destroy
+), an administrator can checkpoint the pool's state and, in the case of a
+mistake or failure, rewind the entire pool back to the checkpoint.
+Otherwise, the checkpoint can be discarded when the procedure has completed
+successfully.
+.Pp
+A pool checkpoint can be thought of as a pool-wide snapshot and should be used
+with care as it contains every part of the pool's state, from properties to vdev
+configuration.
+Thus, while a pool has a checkpoint, certain operations are not allowed.
+Specifically, vdev removal/attach/detach, mirror splitting, and
+changing the pool's guid.
+Adding a new vdev is supported, but in the case of a rewind it will have to be
+added again.
+Finally, users of this feature should keep in mind that scrubs in a pool that
+has a checkpoint do not repair checkpointed data.
+.Pp
+To create a checkpoint for a pool:
+.Bd -literal
+# zpool checkpoint pool
+.Ed
+.Pp
+To later rewind to its checkpointed state, you need to first export it and
+then rewind it during import:
+.Bd -literal
+# zpool export pool
+# zpool import --rewind-to-checkpoint pool
+.Ed
+.Pp
+To discard the checkpoint from a pool:
+.Bd -literal
+# zpool checkpoint -d pool
+.Ed
+.Pp
+Dataset reservations (controlled by the
+.Nm reservation
+or
+.Nm refreservation
+zfs properties) may be unenforceable while a checkpoint exists, because the
+checkpoint is allowed to consume the dataset's reservation.
+Finally, data that is part of the checkpoint but has been freed in the
+current state of the pool won't be scanned during a scrub.
+.Ss Special Allocation Class
+The allocations in the special class are dedicated to specific block types.
+By default this includes all metadata, the indirect blocks of user data, and
+any deduplication tables. The class can also be provisioned to accept
+small file blocks.
+.Pp
+A pool must always have at least one normal (non-dedup/special) vdev before
+other devices can be assigned to the special class. If the special class
+becomes full, then allocations intended for it will spill back into the
+normal class.
+.Pp
+Deduplication tables can be excluded from the special class by setting the
+.Sy zfs_ddt_data_is_special
+zfs module parameter to false (0).
+.Pp
+Inclusion of small file blocks in the special class is opt-in. Each dataset
+can control the size of small file blocks allowed in the special class by
+setting the
+.Sy special_small_blocks
+dataset property. It defaults to zero, so you must opt in by setting it to a
+non-zero value. See
+.Xr zfs 8
+for more info on setting this property.
diff --git a/man/man8/zpoolprops.8 b/man/man8/zpoolprops.8
new file mode 100644
index 000000000000..5adbfb321115
--- /dev/null
+++ b/man/man8/zpoolprops.8
@@ -0,0 +1,368 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License. +.\" +.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +.\" or http://www.opensolaris.org/os/licensing. +.\" See the License for the specific language governing permissions +.\" and limitations under the License. +.\" +.\" When distributing Covered Code, include this CDDL HEADER in each +.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE. +.\" If applicable, add the following below this CDDL HEADER, with the +.\" fields enclosed by brackets "[]" replaced with your own identifying +.\" information: Portions Copyright [yyyy] [name of copyright owner] +.\" +.\" CDDL HEADER END +.\" +.\" +.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved. +.\" Copyright (c) 2012, 2018 by Delphix. All rights reserved. +.\" Copyright (c) 2012 Cyril Plisko. All Rights Reserved. +.\" Copyright (c) 2017 Datto Inc. +.\" Copyright (c) 2018 George Melikov. All Rights Reserved. +.\" Copyright 2017 Nexenta Systems, Inc. +.\" Copyright (c) 2017 Open-E, Inc. All Rights Reserved. +.\" +.Dd August 9, 2019 +.Dt ZPOOLPROPS 8 +.Os +.Sh NAME +.Nm zpoolprops +.Nd available properties for ZFS storage pools +.Sh DESCRIPTION +Each pool has several properties associated with it. +Some properties are read-only statistics while others are configurable and +change the behavior of the pool. +.Pp +The following are read-only properties: +.Bl -tag -width Ds +.It Cm allocated +Amount of storage used within the pool. +See +.Sy fragmentation +and +.Sy free +for more information. +.It Sy capacity +Percentage of pool space used. +This property can also be referred to by its shortened column name, +.Sy cap . +.It Sy expandsize +Amount of uninitialized space within the pool or device that can be used to +increase the total capacity of the pool. +Uninitialized space consists of any space on an EFI labeled vdev which has not +been brought online +.Po e.g, using +.Nm zpool Cm online Fl e +.Pc . +This space occurs when a LUN is dynamically expanded. +.It Sy fragmentation +The amount of fragmentation in the pool. As the amount of space +.Sy allocated +increases, it becomes more difficult to locate +.Sy free +space. This may result in lower write performance compared to pools with more +unfragmented free space. +.It Sy free +The amount of free space available in the pool. +By contrast, the +.Xr zfs 8 +.Sy available +property describes how much new data can be written to ZFS filesystems/volumes. +The zpool +.Sy free +property is not generally useful for this purpose, and can be substantially more than the zfs +.Sy available +space. This discrepancy is due to several factors, including raidz party; zfs +reservation, quota, refreservation, and refquota properties; and space set aside by +.Sy spa_slop_shift +(see +.Xr zfs-module-parameters 5 +for more information). +.It Sy freeing +After a file system or snapshot is destroyed, the space it was using is +returned to the pool asynchronously. +.Sy freeing +is the amount of space remaining to be reclaimed. +Over time +.Sy freeing +will decrease while +.Sy free +increases. +.It Sy health +The current health of the pool. +Health can be one of +.Sy ONLINE , DEGRADED , FAULTED , OFFLINE, REMOVED , UNAVAIL . +.It Sy guid +A unique identifier for the pool. +.It Sy load_guid +A unique identifier for the pool. +Unlike the +.Sy guid +property, this identifier is generated every time we load the pool (e.g. 
does +not persist across imports/exports) and never changes while the pool is loaded +(even if a +.Sy reguid +operation takes place). +.It Sy size +Total size of the storage pool. +.It Sy unsupported@ Ns Em feature_guid +Information about unsupported features that are enabled on the pool. +See +.Xr zpool-features 5 +for details. +.El +.Pp +The space usage properties report actual physical space available to the +storage pool. +The physical space can be different from the total amount of space that any +contained datasets can actually use. +The amount of space used in a raidz configuration depends on the characteristics +of the data being written. +In addition, ZFS reserves some space for internal accounting that the +.Xr zfs 8 +command takes into account, but the +.Nm +command does not. +For non-full pools of a reasonable size, these effects should be invisible. +For small pools, or pools that are close to being completely full, these +discrepancies may become more noticeable. +.Pp +The following property can be set at creation time and import time: +.Bl -tag -width Ds +.It Sy altroot +Alternate root directory. +If set, this directory is prepended to any mount points within the pool. +This can be used when examining an unknown pool where the mount points cannot be +trusted, or in an alternate boot environment, where the typical paths are not +valid. +.Sy altroot +is not a persistent property. +It is valid only while the system is up. +Setting +.Sy altroot +defaults to using +.Sy cachefile Ns = Ns Sy none , +though this may be overridden using an explicit setting. +.El +.Pp +The following property can be set only at import time: +.Bl -tag -width Ds +.It Sy readonly Ns = Ns Sy on Ns | Ns Sy off +If set to +.Sy on , +the pool will be imported in read-only mode. +This property can also be referred to by its shortened column name, +.Sy rdonly . +.El +.Pp +The following properties can be set at creation time and import time, and later +changed with the +.Nm zpool Cm set +command: +.Bl -tag -width Ds +.It Sy ashift Ns = Ns Sy ashift +Pool sector size exponent, to the power of +.Sy 2 +(internally referred to as +.Sy ashift +). Values from 9 to 16, inclusive, are valid; also, the +value 0 (the default) means to auto-detect using the kernel's block +layer and a ZFS internal exception list. I/O operations will be aligned +to the specified size boundaries. Additionally, the minimum (disk) +write size will be set to the specified size, so this represents a +space vs. performance trade-off. For optimal performance, the pool +sector size should be greater than or equal to the sector size of the +underlying disks. The typical case for setting this property is when +performance is important and the underlying disks use 4KiB sectors but +report 512B sectors to the OS (for compatibility reasons); in that +case, set +.Sy ashift=12 +(which is 1<<12 = 4096). When set, this property is +used as the default hint value in subsequent vdev operations (add, +attach and replace). Changing this value will not modify any existing +vdev, not even on disk replacement; however it can be used, for +instance, to replace a dying 512B sectors disk with a newer 4KiB +sectors device: this will probably result in bad performance but at the +same time could prevent loss of data. +.It Sy autoexpand Ns = Ns Sy on Ns | Ns Sy off +Controls automatic pool expansion when the underlying LUN is grown. +If set to +.Sy on , +the pool will be resized according to the size of the expanded device. 
+If the device is part of a mirror or raidz then all devices within that
+mirror/raidz group must be expanded before the new space is made available to
+the pool.
+The default behavior is
+.Sy off .
+This property can also be referred to by its shortened column name,
+.Sy expand .
+.It Sy autoreplace Ns = Ns Sy on Ns | Ns Sy off
+Controls automatic device replacement.
+If set to
+.Sy off ,
+device replacement must be initiated by the administrator by using the
+.Nm zpool Cm replace
+command.
+If set to
+.Sy on ,
+any new device, found in the same physical location as a device that previously
+belonged to the pool, is automatically formatted and replaced.
+The default behavior is
+.Sy off .
+This property can also be referred to by its shortened column name,
+.Sy replace .
+Autoreplace can also be used with virtual disks (like device
+mapper) provided that you use the /dev/disk/by-vdev paths set up by
+vdev_id.conf. See the
+.Xr vdev_id 8
+man page for more details.
+Autoreplace and autoonline require the ZFS Event Daemon be configured and
+running. See the
+.Xr zed 8
+man page for more details.
+.It Sy autotrim Ns = Ns Sy on Ns | Ns Sy off
+When set to
+.Sy on ,
+space which has been recently freed, and is no longer allocated by the pool,
+will be periodically trimmed. This allows block device vdevs which support
+BLKDISCARD, such as SSDs, or file vdevs on which the underlying file system
+supports hole-punching, to reclaim unused blocks. The default setting for
+this property is
+.Sy off .
+.Pp
+Automatic TRIM does not immediately reclaim blocks after a free. Instead,
+it will optimistically delay, allowing smaller ranges to be aggregated into
+a few larger ones. These can then be issued more efficiently to the storage.
+TRIM on L2ARC devices is enabled by setting
+.Sy l2arc_trim_ahead > 0 .
+.Pp
+Be aware that automatic trimming of recently freed data blocks can put
+significant stress on the underlying storage devices. This will vary
+depending on how well the specific device handles these commands. For
+lower-end devices it is often possible to achieve most of the benefits
+of automatic trimming by running an on-demand (manual) TRIM periodically
+using the
+.Nm zpool Cm trim
+command.
+.It Sy bootfs Ns = Ns Sy (unset) Ns | Ns Ar pool Ns / Ns Ar dataset
+Identifies the default bootable dataset for the root pool. This property is
+expected to be set mainly by the installation and upgrade programs.
+Not all Linux distribution boot processes use the bootfs property.
+.It Sy cachefile Ns = Ns Ar path Ns | Ns Sy none
+Controls the location where the pool configuration is cached.
+Discovering all pools on system startup requires a cached copy of the
+configuration data that is stored on the root file system.
+All pools in this cache are automatically imported when the system boots.
+Some environments, such as install and clustering, need to cache this
+information in a different location so that pools are not automatically
+imported.
+Setting this property caches the pool configuration in a different location that
+can later be imported with
+.Nm zpool Cm import Fl c .
+Setting it to the value
+.Sy none
+creates a temporary pool that is never cached, and the
+.Qq
+.Pq empty string
+uses the default location.
+.Pp
+Multiple pools can share the same cache file.
+Because the kernel destroys and recreates this file when pools are added and
+removed, care should be taken when attempting to access this file.
+When the last pool using a +.Sy cachefile +is exported or destroyed, the file will be empty. +.It Sy comment Ns = Ns Ar text +A text string consisting of printable ASCII characters that will be stored +such that it is available even if the pool becomes faulted. +An administrator can provide additional information about a pool using this +property. +.It Sy dedupditto Ns = Ns Ar number +This property is deprecated and no longer has any effect. +.It Sy delegation Ns = Ns Sy on Ns | Ns Sy off +Controls whether a non-privileged user is granted access based on the dataset +permissions defined on the dataset. +See +.Xr zfs 8 +for more information on ZFS delegated administration. +.It Sy failmode Ns = Ns Sy wait Ns | Ns Sy continue Ns | Ns Sy panic +Controls the system behavior in the event of catastrophic pool failure. +This condition is typically a result of a loss of connectivity to the underlying +storage device(s) or a failure of all devices within the pool. +The behavior of such an event is determined as follows: +.Bl -tag -width "continue" +.It Sy wait +Blocks all I/O access until the device connectivity is recovered and the errors +are cleared. +This is the default behavior. +.It Sy continue +Returns +.Er EIO +to any new write I/O requests but allows reads to any of the remaining healthy +devices. +Any write requests that have yet to be committed to disk would be blocked. +.It Sy panic +Prints out a message to the console and generates a system crash dump. +.El +.It Sy feature@ Ns Ar feature_name Ns = Ns Sy enabled +The value of this property is the current state of +.Ar feature_name . +The only valid value when setting this property is +.Sy enabled +which moves +.Ar feature_name +to the enabled state. +See +.Xr zpool-features 5 +for details on feature states. +.It Sy listsnapshots Ns = Ns Sy on Ns | Ns Sy off +Controls whether information about snapshots associated with this pool is +output when +.Nm zfs Cm list +is run without the +.Fl t +option. +The default value is +.Sy off . +This property can also be referred to by its shortened name, +.Sy listsnaps . +.It Sy multihost Ns = Ns Sy on Ns | Ns Sy off +Controls whether a pool activity check should be performed during +.Nm zpool Cm import . +When a pool is determined to be active it cannot be imported, even with the +.Fl f +option. This property is intended to be used in failover configurations +where multiple hosts have access to a pool on shared storage. +.Pp +Multihost provides protection on import only. It does not protect against an +individual device being used in multiple pools, regardless of the type of vdev. +See the discussion under +.Sy zpool create. +.Pp +When this property is on, periodic writes to storage occur to show the pool is +in use. See +.Sy zfs_multihost_interval +in the +.Xr zfs-module-parameters 5 +man page. In order to enable this property each host must set a unique hostid. +See +.Xr genhostid 1 +.Xr zgenhostid 8 +.Xr spl-module-parameters 5 +for additional details. The default value is +.Sy off . +.It Sy version Ns = Ns Ar version +The current on-disk version of the pool. +This can be increased, but never decreased. +The preferred method of updating pools is with the +.Nm zpool Cm upgrade +command, though this property can be used when a specific version is needed for +backwards compatibility. +Once feature flags are enabled on a pool this property will no longer have a +value. 
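+.Pp
+For example, assuming a pool named
+.Em tank
+still using legacy versioning, the on-disk version might be raised
+directly through the property:
+.Bd -literal
+# zpool set version=28 tank
+.Ed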
+.El
diff --git a/man/man8/zstream.8 b/man/man8/zstream.8
new file mode 100644
index 000000000000..6056e097b0ef
--- /dev/null
+++ b/man/man8/zstream.8
@@ -0,0 +1,110 @@
+.\"
+.\" CDDL HEADER START
+.\"
+.\" The contents of this file are subject to the terms of the
+.\" Common Development and Distribution License (the "License").
+.\" You may not use this file except in compliance with the License.
+.\"
+.\" You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+.\" or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions
+.\" and limitations under the License.
+.\"
+.\" When distributing Covered Code, include this CDDL HEADER in each
+.\" file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+.\" If applicable, add the following below this CDDL HEADER, with the
+.\" fields enclosed by brackets "[]" replaced with your own identifying
+.\" information: Portions Copyright [yyyy] [name of copyright owner]
+.\"
+.\" CDDL HEADER END
+.\"
+.\"
+.\" Copyright (c) 2020 by Delphix. All rights reserved.
+.Dd March 25, 2020
+.Dt ZSTREAM 8
+.Os
+.Sh NAME
+.Nm zstream
+.Nd manipulate zfs send streams
+.Sh SYNOPSIS
+.Nm
+.Cm dump
+.Op Fl Cvd
+.Op Ar file
+.Nm
+.Cm redup
+.Op Fl v
+.Ar file
+.Nm
+.Cm token
+.Ar resume_token
+.Sh DESCRIPTION
+The
+.Sy zstream
+utility manipulates zfs send streams, which are the output of the
+.Sy zfs send
+command.
+.Bl -tag -width ""
+.It Xo
+.Nm
+.Cm dump
+.Op Fl Cvd
+.Op Ar file
+.Xc
+Print information about the specified send stream, including headers and
+record counts.
+The send stream may either be in the specified
+.Ar file ,
+or provided on standard input.
+.Bl -tag -width "-D"
+.It Fl C
+Suppress the validation of checksums.
+.It Fl v
+Verbose.
+Print metadata for each record.
+.It Fl d
+Dump data contained in each record.
+Implies verbose.
+.El
+.It Xo
+.Nm
+.Cm token
+.Ar resume_token
+.Xc
+Dumps zfs resume token information.
+.It Xo
+.Nm
+.Cm redup
+.Op Fl v
+.Ar file
+.Xc
+Deduplicated send streams can be generated by using the
+.Nm zfs Cm send Fl D
+command.
+The ability to send deduplicated send streams is deprecated.
+In the future, the ability to receive a deduplicated send stream with
+.Nm zfs Cm receive
+will be removed.
+However, deduplicated send streams can still be received by utilizing
+.Nm zstream Cm redup .
+.Pp
+The
+.Nm zstream Cm redup
+command is provided a
+.Ar file
+containing a deduplicated send stream, and outputs an equivalent
+non-deduplicated send stream on standard output.
+Therefore, a deduplicated send stream can be received by running:
+.Bd -literal
+# zstream redup DEDUP_STREAM_FILE | zfs receive ...
+.Ed
+.Bl -tag -width "-D"
+.It Fl v
+Verbose.
+Print summary of converted records.
+.El
+.El
+.Sh SEE ALSO
+.Xr zfs 8 ,
+.Xr zfs-send 8 ,
+.Xr zfs-receive 8
diff --git a/man/man8/zstreamdump.8 b/man/man8/zstreamdump.8
new file mode 100644
index 000000000000..33cd047f5d78
--- /dev/null
+++ b/man/man8/zstreamdump.8
@@ -0,0 +1,58 @@
+'\" te
+.\" Copyright (c) 2009, Sun Microsystems, Inc. All Rights Reserved
+.\" The contents of this file are subject to the terms of the Common Development and Distribution License (the "License"). You may not use this file except in compliance with the License. You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE or http://www.opensolaris.org/os/licensing.
+.\" See the License for the specific language governing permissions and limitations under the License.
When distributing Covered Code, include this CDDL HEADER in each file and include the License file at usr/src/OPENSOLARIS.LICENSE. If applicable, add the following below this CDDL HEADER, with +.\" the fields enclosed by brackets "[]" replaced with your own identifying information: Portions Copyright [yyyy] [name of copyright owner] +.TH zstreamdump 8 "29 Aug 2012" "ZFS pool 28, filesystem 5" "System Administration Commands" +.SH NAME +zstreamdump \- filter data in zfs send stream +.SH SYNOPSIS +.LP +.nf +\fBzstreamdump\fR [\fB-C\fR] [\fB-v\fR] [\fB-d\fR] +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBzstreamdump\fR utility reads from the output of the \fBzfs send\fR +command, then displays headers and some statistics from that output. See +\fBzfs\fR(8). +.SH OPTIONS +.sp +.LP +The following options are supported: +.sp +.ne 2 +.na +\fB-C\fR +.ad +.sp .6 +.RS 4n +Suppress the validation of checksums. +.RE + +.sp +.ne 2 +.na +\fB-v\fR +.ad +.sp .6 +.RS 4n +Verbose. Dump all headers, not only begin and end headers. +.RE + +.sp +.ne 2 +.na +\fB-d\fR +.ad +.sp .6 +.RS 4n +Dump contents of blocks modified. Implies verbose. +.RE + +.SH SEE ALSO +.sp +.LP +\fBzfs\fR(8) diff --git a/module/.gitignore b/module/.gitignore new file mode 100644 index 000000000000..7a4bd3673e77 --- /dev/null +++ b/module/.gitignore @@ -0,0 +1,26 @@ +*.ko +*.ko.unsigned +*.ko.out +*.ko.out.sig +*.ko.debug +*.ko.full +*.dwo +.*.cmd +.*.d +*.mod + +/Kbuild +/.cache.mk +/.tmp_versions +/Module.markers +/Module.symvers +/vnode_if* +/bus_if.h +/device_if.h +/opt_global.h + +/export_syms +/machine +/x86 + +!Makefile.in diff --git a/module/Kbuild.in b/module/Kbuild.in new file mode 100644 index 000000000000..1507965c5750 --- /dev/null +++ b/module/Kbuild.in @@ -0,0 +1,47 @@ +# When integrated in to a monolithic kernel the spl module must appear +# first. This ensures its module initialization function is run before +# any of the other module initialization functions which depend on it. 
+ZFS_MODULES += spl/ +ZFS_MODULES += avl/ +ZFS_MODULES += icp/ +ZFS_MODULES += lua/ +ZFS_MODULES += nvpair/ +ZFS_MODULES += unicode/ +ZFS_MODULES += zcommon/ +ZFS_MODULES += zfs/ +ZFS_MODULES += zstd/ + +# The rest is only relevant when run by kbuild +ifneq ($(KERNELRELEASE),) + +obj-$(CONFIG_ZFS) := $(ZFS_MODULES) + +ZFS_MODULE_CFLAGS += -std=gnu99 -Wno-declaration-after-statement +ZFS_MODULE_CFLAGS += -Wmissing-prototypes +ZFS_MODULE_CFLAGS += @KERNEL_DEBUG_CFLAGS@ @NO_FORMAT_ZERO_LENGTH@ + +ifneq ($(KBUILD_EXTMOD),) +zfs_include = @abs_top_srcdir@/include +ZFS_MODULE_CFLAGS += -include @abs_top_builddir@/zfs_config.h +ZFS_MODULE_CFLAGS += -I@abs_top_builddir@/include +else +zfs_include = $(srctree)/include/zfs +ZFS_MODULE_CFLAGS += -include $(zfs_include)/zfs_config.h +endif + +ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/kernel +ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/spl +ZFS_MODULE_CFLAGS += -I$(zfs_include)/os/linux/zfs +ZFS_MODULE_CFLAGS += -I$(zfs_include) +ZFS_MODULE_CPPFLAGS += -D_KERNEL +ZFS_MODULE_CPPFLAGS += @KERNEL_DEBUG_CPPFLAGS@ + +ifneq ($(KBUILD_EXTMOD),) +@CONFIG_QAT_TRUE@ZFS_MODULE_CFLAGS += -I@QAT_SRC@/include +@CONFIG_QAT_TRUE@KBUILD_EXTRA_SYMBOLS += @QAT_SYMBOLS@ +endif + +subdir-asflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) +subdir-ccflags-y := $(ZFS_MODULE_CFLAGS) $(ZFS_MODULE_CPPFLAGS) + +endif diff --git a/module/Makefile.bsd b/module/Makefile.bsd new file mode 100644 index 000000000000..53b97dafd9e9 --- /dev/null +++ b/module/Makefile.bsd @@ -0,0 +1,359 @@ +.if !defined(WITH_CTF) +WITH_CTF=1 +.endif + +.include + +SRCDIR=${.CURDIR} +INCDIR=${.CURDIR:H}/include + +KMOD= openzfs + +.PATH: ${SRCDIR}/avl \ + ${SRCDIR}/lua \ + ${SRCDIR}/nvpair \ + ${SRCDIR}/os/freebsd/spl \ + ${SRCDIR}/os/freebsd/zfs \ + ${SRCDIR}/unicode \ + ${SRCDIR}/zcommon \ + ${SRCDIR}/zfs \ + ${SRCDIR}/zstd \ + ${SRCDIR}/zstd/lib + + + +CFLAGS+= -I${.OBJDIR:H}/include +CFLAGS+= -I${INCDIR} +CFLAGS+= -I${INCDIR}/spl +CFLAGS+= -I${INCDIR}/os/freebsd +CFLAGS+= -I${INCDIR}/os/freebsd/spl +CFLAGS+= -I${INCDIR}/os/freebsd/zfs +CFLAGS+= -I${SRCDIR}/zstd/include +CFLAGS+= -include ${INCDIR}/os/freebsd/spl/sys/ccompile.h + +CFLAGS+= -D__KERNEL__ -DFREEBSD_NAMECACHE -DBUILDING_ZFS -D__BSD_VISIBLE=1 \ + -DHAVE_UIO_ZEROCOPY -DWITHOUT_NETDUMP -D__KERNEL -D_SYS_CONDVAR_H_ \ + -D_SYS_VMEM_H_ -DKDTRACE_HOOKS -DSMP -DHAVE_KSID -DCOMPAT_FREEBSD11 + +.if ${MACHINE_ARCH} == "amd64" +CFLAGS+= -DHAVE_AVX2 -DHAVE_AVX -D__x86_64 -DHAVE_SSE2 -DHAVE_AVX512F -DHAVE_SSSE3 +.endif + +.if defined(WITH_DEBUG) && ${WITH_DEBUG} == "true" +CFLAGS+= -DINVARIANTS -DWITNESS -g -O0 -DZFS_DEBUG -DOPENSOLARIS_WITNESS +.else +CFLAGS += -DNDEBUG +.endif + +.if defined(WITH_VFS_DEBUG) && ${WITH_VFS_DEBUG} == "true" +# kernel must also be built with this option for this to work +CFLAGS+= -DDEBUG_VFS_LOCKS +.endif + +.if defined(WITH_GCOV) && ${WITH_GCOV} == "true" +CFLAGS+= -fprofile-arcs -ftest-coverage +.endif + +DEBUG_FLAGS=-g + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +CFLAGS+= -DBITS_PER_LONG=32 +.else +CFLAGS+= -DBITS_PER_LONG=64 +.endif + +SRCS= vnode_if.h device_if.h bus_if.h + +# avl +SRCS+= avl.c + +#lua +SRCS+= lapi.c \ + lauxlib.c \ + lbaselib.c \ + lcode.c \ + lcompat.c \ + lcorolib.c \ + lctype.c \ + ldebug.c \ + ldo.c \ + lfunc.c \ + lgc.c \ + llex.c \ + lmem.c \ + lobject.c \ + lopcodes.c \ + lparser.c \ + lstate.c \ + lstring.c \ + lstrlib.c \ + ltable.c \ + ltablib.c \ + ltm.c \ + lvm.c \ + lzio.c + +#nvpair +SRCS+= nvpair.c \ + fnvpair.c \ + 
nvpair_alloc_spl.c \ + nvpair_alloc_fixed.c + +#os/freebsd/spl +SRCS+= acl_common.c \ + btree.c \ + callb.c \ + list.c \ + spl_acl.c \ + spl_cmn_err.c \ + spl_dtrace.c \ + spl_kmem.c \ + spl_kstat.c \ + spl_misc.c \ + spl_policy.c \ + spl_string.c \ + spl_sunddi.c \ + spl_sysevent.c \ + spl_taskq.c \ + spl_uio.c \ + spl_vfs.c \ + spl_vm.c \ + spl_zone.c \ + sha256c.c \ + sha512c.c \ + spl_procfs_list.c \ + spl_zlib.c + + +.if ${MACHINE_ARCH} == "i386" || ${MACHINE_ARCH} == "powerpc" || \ + ${MACHINE_ARCH} == "arm" +SRCS+= spl_atomic.c +.endif + +#os/freebsd/zfs +SRCS+= abd_os.c \ + crypto_os.c \ + dmu_os.c \ + hkdf.c \ + kmod_core.c \ + spa_os.c \ + sysctl_os.c \ + vdev_file.c \ + vdev_label_os.c \ + vdev_geom.c \ + zfs_acl.c \ + zfs_ctldir.c \ + zfs_dir.c \ + zfs_ioctl_compat.c \ + zfs_ioctl_os.c \ + zfs_log.c \ + zfs_replay.c \ + zfs_vfsops.c \ + zfs_vnops.c \ + zfs_znode.c \ + zio_crypt.c \ + zvol_os.c + +#unicode +SRCS+= uconv.c \ + u8_textprep.c + +#zcommon +SRCS+= zfeature_common.c \ + zfs_comutil.c \ + zfs_deleg.c \ + zfs_fletcher.c \ + zfs_fletcher_avx512.c \ + zfs_fletcher_intel.c \ + zfs_fletcher_sse.c \ + zfs_fletcher_superscalar.c \ + zfs_fletcher_superscalar4.c \ + zfs_namecheck.c \ + zfs_prop.c \ + zpool_prop.c \ + zprop_common.c + +#zfs +SRCS+= abd.c \ + aggsum.c \ + arc.c \ + arc_os.c \ + blkptr.c \ + bplist.c \ + bpobj.c \ + cityhash.c \ + dbuf.c \ + dbuf_stats.c \ + bptree.c \ + bqueue.c \ + dataset_kstats.c \ + ddt.c \ + ddt_zap.c \ + dmu.c \ + dmu_diff.c \ + dmu_object.c \ + dmu_objset.c \ + dmu_recv.c \ + dmu_redact.c \ + dmu_send.c \ + dmu_traverse.c \ + dmu_tx.c \ + dmu_zfetch.c \ + dnode.c \ + dnode_sync.c \ + dsl_dataset.c \ + dsl_deadlist.c \ + dsl_deleg.c \ + dsl_bookmark.c \ + dsl_dir.c \ + dsl_crypt.c \ + dsl_destroy.c \ + dsl_pool.c \ + dsl_prop.c \ + dsl_scan.c \ + dsl_synctask.c \ + dsl_userhold.c \ + fm.c \ + gzip.c \ + lzjb.c \ + lz4.c \ + metaslab.c \ + mmp.c \ + multilist.c \ + objlist.c \ + pathname.c \ + range_tree.c \ + refcount.c \ + rrwlock.c \ + sa.c \ + sha256.c \ + skein_zfs.c \ + spa.c \ + spa_boot.c \ + spa_checkpoint.c \ + spa_config.c \ + spa_errlog.c \ + spa_history.c \ + spa_log_spacemap.c \ + spa_misc.c \ + spa_stats.c \ + space_map.c \ + space_reftree.c \ + txg.c \ + uberblock.c \ + unique.c \ + vdev.c \ + vdev_cache.c \ + vdev_indirect.c \ + vdev_indirect_births.c \ + vdev_indirect_mapping.c \ + vdev_initialize.c \ + vdev_label.c \ + vdev_mirror.c \ + vdev_missing.c \ + vdev_queue.c \ + vdev_raidz.c \ + vdev_raidz_math.c \ + vdev_raidz_math_scalar.c \ + vdev_rebuild.c \ + vdev_raidz_math_avx2.c \ + vdev_raidz_math_avx512bw.c \ + vdev_raidz_math_avx512f.c \ + vdev_raidz_math_sse2.c \ + vdev_raidz_math_ssse3.c \ + vdev_removal.c \ + vdev_root.c \ + vdev_trim.c \ + zap.c \ + zap_leaf.c \ + zap_micro.c \ + zcp.c \ + zcp_get.c \ + zcp_global.c \ + zcp_iter.c \ + zcp_set.c \ + zcp_synctask.c \ + zfeature.c \ + zfs_byteswap.c \ + zfs_debug.c \ + zfs_file_os.c \ + zfs_fm.c \ + zfs_fuid.c \ + zfs_ioctl.c \ + zfs_onexit.c \ + zfs_quota.c \ + zfs_ratelimit.c \ + zfs_rlock.c \ + zfs_sa.c \ + zil.c \ + zio.c \ + zio_checksum.c \ + zio_compress.c \ + zio_inject.c \ + zle.c \ + zrlock.c \ + zthr.c \ + zvol.c + +#zstd +SRCS+= zfs_zstd.c \ + zstd.c + +beforeinstall: +.if ${MK_DEBUG_FILES} != "no" + mtree -eu \ + -f /etc/mtree/BSD.debug.dist \ + -p ${DESTDIR}/usr/lib +.endif + +.include + + +CFLAGS.gcc+= -Wno-pointer-to-int-cast + +CFLAGS.lapi.c= -Wno-cast-qual +CFLAGS.lcompat.c= -Wno-cast-qual +CFLAGS.lobject.c= -Wno-cast-qual +CFLAGS.ltable.c= 
-Wno-cast-qual +CFLAGS.lvm.c= -Wno-cast-qual +CFLAGS.nvpair.c= -DHAVE_RPC_TYPES -Wno-cast-qual +CFLAGS.spl_string.c= -Wno-cast-qual +CFLAGS.spl_vm.c= -Wno-cast-qual +CFLAGS.spl_zlib.c= -Wno-cast-qual +CFLAGS.abd.c= -Wno-cast-qual +CFLAGS.zfs_log.c= -Wno-cast-qual +CFLAGS.zfs_vnops.c= -Wno-pointer-arith +CFLAGS.u8_textprep.c= -Wno-cast-qual +CFLAGS.zfs_fletcher.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_intel.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_sse.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zfs_fletcher_avx512.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zprop_common.c= -Wno-cast-qual +CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.dmu.c= -Wno-cast-qual +CFLAGS.dmu_traverse.c= -Wno-cast-qual +CFLAGS.dsl_dir.c= -Wno-cast-qual +CFLAGS.dsl_deadlist.c= -Wno-cast-qual +CFLAGS.dsl_prop.c= -Wno-cast-qual +CFLAGS.fm.c= -Wno-cast-qual +CFLAGS.lz4.c= -Wno-cast-qual +CFLAGS.spa.c= -Wno-cast-qual +CFLAGS.spa_misc.c= -Wno-cast-qual +CFLAGS.sysctl_os.c= -include ../zfs_config.h +CFLAGS.vdev_raidz.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_scalar.c= -Wno-cast-qual +CFLAGS.vdev_raidz_math_avx2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_avx512f.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.vdev_raidz_math_sse2.c= -Wno-cast-qual -Wno-duplicate-decl-specifier +CFLAGS.zap_leaf.c= -Wno-cast-qual +CFLAGS.zap_micro.c= -Wno-cast-qual +CFLAGS.zcp.c= -Wno-cast-qual +CFLAGS.zfs_fm.c= -Wno-cast-qual +CFLAGS.zfs_ioctl.c= -Wno-cast-qual +CFLAGS.zil.c= -Wno-cast-qual +CFLAGS.zio.c= -Wno-cast-qual +CFLAGS.zrlock.c= -Wno-cast-qual +CFLAGS.zfs_zstd.c= -Wno-cast-qual -Wno-pointer-arith +CFLAGS.zstd.c= -fno-tree-vectorize diff --git a/module/Makefile.in b/module/Makefile.in new file mode 100644 index 000000000000..ead4ff1360b2 --- /dev/null +++ b/module/Makefile.in @@ -0,0 +1,115 @@ +include Kbuild + +INSTALL_MOD_DIR ?= extra + +SUBDIR_TARGETS = icp lua zstd + +all: modules +distclean maintainer-clean: clean +install: modules_install +uninstall: modules_uninstall +check: + +.PHONY: all distclean maintainer-clean install uninstall check distdir \ + modules modules-Linux modules-FreeBSD modules-unknown \ + clean clean-Linux clean-FreeBSD \ + modules_install modules_install-Linux modules_install-FreeBSD \ + modules_uninstall modules_uninstall-Linux modules_uninstall-FreeBSD + +# Filter out options that FreeBSD make doesn't understand +getflags = ( \ +set -- \ + $(filter-out --%,$(firstword $(MFLAGS))) \ + $(filter -I%,$(MFLAGS)) \ + $(filter -j%,$(MFLAGS)); \ +fmakeflags=""; \ +while getopts :deiI:j:knqrstw flag; do \ + case $$flag in \ + \?) :;; \ + :) if [ $$OPTARG = "j" ]; then \ + ncpus=$$(sysctl -n kern.smp.cpus 2>/dev/null || :); \ + if [ -n "$$ncpus" ]; then fmakeflags="$$fmakeflags -j$$ncpus"; fi; \ + fi;; \ + d) fmakeflags="$$fmakeflags -dA";; \ + *) fmakeflags="$$fmakeflags -$$flag$$OPTARG";; \ + esac; \ +done; \ +echo $$fmakeflags \ +) +FMAKEFLAGS = -C @abs_srcdir@ -f Makefile.bsd $(shell $(getflags)) + +ifneq (@abs_srcdir@,@abs_builddir@) +FMAKEFLAGS += MAKEOBJDIR=@abs_builddir@ +endif +FMAKE = env -u MAKEFLAGS make $(FMAKEFLAGS) + +modules-Linux: + list='$(SUBDIR_TARGETS)'; for targetdir in $$list; do \ + $(MAKE) -C $$targetdir; \ + done + $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ CONFIG_ZFS=m modules + +modules-FreeBSD: + +$(FMAKE) + +modules-unknown: + @true + +modules: modules-@ac_system@ + +clean-Linux: + @# Only cleanup the kernel build directories when CONFIG_KERNEL + @# is defined. 
This indicates that kernel modules should be built. +@CONFIG_KERNEL_TRUE@ $(MAKE) -C @LINUX_OBJ@ M=`pwd` @KERNEL_MAKE@ clean + + if [ -f @LINUX_SYMBOLS@ ]; then $(RM) @LINUX_SYMBOLS@; fi + if [ -f Module.markers ]; then $(RM) Module.markers; fi + + find . -name '*.ur-safe' -type f -print | xargs $(RM) + +clean-FreeBSD: + +$(FMAKE) clean + +clean: clean-@ac_system@ + +modules_install-Linux: + @# Install the kernel modules + $(MAKE) -C @LINUX_OBJ@ M=`pwd` modules_install \ + INSTALL_MOD_PATH=$(DESTDIR)$(INSTALL_MOD_PATH) \ + INSTALL_MOD_DIR=$(INSTALL_MOD_DIR) \ + KERNELRELEASE=@LINUX_VERSION@ + @# Remove extraneous build products when packaging + kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@; \ + if [ -n "$(DESTDIR)" ]; then \ + find $$kmoddir -name 'modules.*' | xargs $(RM); \ + fi + sysmap=$(DESTDIR)$(INSTALL_MOD_PATH)/boot/System.map-@LINUX_VERSION@; \ + if [ -f $$sysmap ]; then \ + depmod -ae -F $$sysmap @LINUX_VERSION@; \ + fi + +modules_install-FreeBSD: + @# Install the kernel modules + +$(FMAKE) install + +modules_install: modules_install-@ac_system@ + +modules_uninstall-Linux: + @# Uninstall the kernel modules + kmoddir=$(DESTDIR)$(INSTALL_MOD_PATH)/lib/modules/@LINUX_VERSION@ \ + for objdir in $(ZFS_MODULES); do \ + $(RM) -R $$kmoddir/$(INSTALL_MOD_DIR)/$$objdir; \ + done + +modules_uninstall-FreeBSD: + @false + +modules_uninstall: modules_uninstall-@ac_system@ + +distdir: + (cd @srcdir@ && find $(ZFS_MODULES) os -name '*.[chS]') | \ + while read path; do \ + mkdir -p $$distdir/$${path%/*}; \ + cp @srcdir@/$$path $$distdir/$$path; \ + done; \ + cp @srcdir@/Makefile.bsd $$distdir/Makefile.bsd diff --git a/module/avl/Makefile.in b/module/avl/Makefile.in new file mode 100644 index 000000000000..991d5f95b8c0 --- /dev/null +++ b/module/avl/Makefile.in @@ -0,0 +1,10 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +endif + +MODULE := zavl + +obj-$(CONFIG_ZFS) := $(MODULE).o + +$(MODULE)-objs += avl.o diff --git a/module/avl/avl.c b/module/avl/avl.c new file mode 100644 index 000000000000..9cc8362399d9 --- /dev/null +++ b/module/avl/avl.c @@ -0,0 +1,1093 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +/* + * AVL - generic AVL tree implementation for kernel use + * + * A complete description of AVL trees can be found in many CS textbooks. + * + * Here is a very brief overview. An AVL tree is a binary search tree that is + * almost perfectly balanced. 
By "almost" perfectly balanced, we mean that at + * any given node, the left and right subtrees are allowed to differ in height + * by at most 1 level. + * + * This relaxation from a perfectly balanced binary tree allows doing + * insertion and deletion relatively efficiently. Searching the tree is + * still a fast operation, roughly O(log(N)). + * + * The key to insertion and deletion is a set of tree manipulations called + * rotations, which bring unbalanced subtrees back into the semi-balanced state. + * + * This implementation of AVL trees has the following peculiarities: + * + * - The AVL specific data structures are physically embedded as fields + * in the "using" data structures. To maintain generality the code + * must constantly translate between "avl_node_t *" and containing + * data structure "void *"s by adding/subtracting the avl_offset. + * + * - Since the AVL data is always embedded in other structures, there is + * no locking or memory allocation in the AVL routines. This must be + * provided for by the enclosing data structure's semantics. Typically, + * avl_insert()/_add()/_remove()/avl_insert_here() require some kind of + * exclusive write lock. Other operations require a read lock. + * + * - The implementation uses iteration instead of explicit recursion, + * since it is intended to run on limited size kernel stacks. Since + * there is no recursion stack present to move "up" in the tree, + * there is an explicit "parent" link in the avl_node_t. + * + * - The left/right children pointers of a node are in an array. + * In the code, variables (instead of constants) are used to represent + * left and right indices. The implementation is written as if it only + * dealt with left handed manipulations. By changing the value assigned + * to "left", the code also works for right handed trees. The + * following variables/terms are frequently used: + * + * int left; // 0 when dealing with left children, + * // 1 for dealing with right children + * + * int left_heavy; // -1 when left subtree is taller at some node, + * // +1 when right subtree is taller + * + * int right; // will be the opposite of left (0 or 1) + * int right_heavy;// will be the opposite of left_heavy (-1 or 1) + * + * int direction; // 0 for "<" (ie. left child); 1 for ">" (right) + * + * Though it is a little more confusing to read the code, the approach + * allows using half as much code (and hence cache footprint) for tree + * manipulations and eliminates many conditional branches. + * + * - The avl_index_t is an opaque "cookie" used to find nodes at or + * adjacent to where a new value would be inserted in the tree. The value + * is a modified "avl_node_t *". The bottom bit (normally 0 for a + * pointer) is set to indicate if that the new node has a value greater + * than the value of the indicated "avl_node_t *". + * + * Note - in addition to userland (e.g. libavl and libutil) and the kernel + * (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module, + * which each have their own compilation environments and subsequent + * requirements. Each of these environments must be considered when adding + * dependencies from avl.c. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Small arrays to translate between balance (or diff) values and child indices. + * + * Code that deals with binary tree data structures will randomly use + * left and right children when examining a tree. 
C "if()" statements + * which evaluate randomly suffer from very poor hardware branch prediction. + * In this code we avoid some of the branch mispredictions by using the + * following translation arrays. They replace random branches with an + * additional memory reference. Since the translation arrays are both very + * small the data should remain efficiently in cache. + */ +static const int avl_child2balance[2] = {-1, 1}; +static const int avl_balance2child[] = {0, 0, 1}; + + +/* + * Walk from one node to the previous valued node (ie. an infix walk + * towards the left). At any given node we do one of 2 things: + * + * - If there is a left child, go to it, then to it's rightmost descendant. + * + * - otherwise we return through parent nodes until we've come from a right + * child. + * + * Return Value: + * NULL - if at the end of the nodes + * otherwise next node + */ +void * +avl_walk(avl_tree_t *tree, void *oldnode, int left) +{ + size_t off = tree->avl_offset; + avl_node_t *node = AVL_DATA2NODE(oldnode, off); + int right = 1 - left; + int was_child; + + + /* + * nowhere to walk to if tree is empty + */ + if (node == NULL) + return (NULL); + + /* + * Visit the previous valued node. There are two possibilities: + * + * If this node has a left child, go down one left, then all + * the way right. + */ + if (node->avl_child[left] != NULL) { + for (node = node->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + /* + * Otherwise, return through left children as far as we can. + */ + } else { + for (;;) { + was_child = AVL_XCHILD(node); + node = AVL_XPARENT(node); + if (node == NULL) + return (NULL); + if (was_child == right) + break; + } + } + + return (AVL_NODE2DATA(node, off)); +} + +/* + * Return the lowest valued node in a tree or NULL. + * (leftmost child from root of tree) + */ +void * +avl_first(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[0]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Return the highest valued node in a tree or NULL. + * (rightmost child from root of tree) + */ +void * +avl_last(avl_tree_t *tree) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; node = node->avl_child[1]) + prev = node; + + if (prev != NULL) + return (AVL_NODE2DATA(prev, off)); + return (NULL); +} + +/* + * Access the node immediately before or after an insertion point. + * + * "avl_index_t" is a (avl_node_t *) with the bottom bit indicating a child + * + * Return value: + * NULL: no node in the given direction + * "void *" of the found tree node + */ +void * +avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) +{ + int child = AVL_INDEX2CHILD(where); + avl_node_t *node = AVL_INDEX2NODE(where); + void *data; + size_t off = tree->avl_offset; + + if (node == NULL) { + ASSERT(tree->avl_root == NULL); + return (NULL); + } + data = AVL_NODE2DATA(node, off); + if (child != direction) + return (data); + + return (avl_walk(tree, data, direction)); +} + + +/* + * Search for the node which contains "value". The algorithm is a + * simple binary tree search. 
+ * + * return value: + * NULL: the value is not in the AVL tree + * *where (if not NULL) is set to indicate the insertion point + * "void *" of the found tree node + */ +void * +avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) +{ + avl_node_t *node; + avl_node_t *prev = NULL; + int child = 0; + int diff; + size_t off = tree->avl_offset; + + for (node = tree->avl_root; node != NULL; + node = node->avl_child[child]) { + + prev = node; + + diff = tree->avl_compar(value, AVL_NODE2DATA(node, off)); + ASSERT(-1 <= diff && diff <= 1); + if (diff == 0) { +#ifdef ZFS_DEBUG + if (where != NULL) + *where = 0; +#endif + return (AVL_NODE2DATA(node, off)); + } + child = avl_balance2child[1 + diff]; + + } + + if (where != NULL) + *where = AVL_MKINDEX(prev, child); + + return (NULL); +} + + +/* + * Perform a rotation to restore balance at the subtree given by depth. + * + * This routine is used by both insertion and deletion. The return value + * indicates: + * 0 : subtree did not change height + * !0 : subtree was reduced in height + * + * The code is written as if handling left rotations, right rotations are + * symmetric and handled by swapping values of variables right/left[_heavy] + * + * On input balance is the "new" balance at "node". This value is either + * -2 or +2. + */ +static int +avl_rotation(avl_tree_t *tree, avl_node_t *node, int balance) +{ + int left = !(balance < 0); /* when balance = -2, left will be 0 */ + int right = 1 - left; + int left_heavy = balance >> 1; + int right_heavy = -left_heavy; + avl_node_t *parent = AVL_XPARENT(node); + avl_node_t *child = node->avl_child[left]; + avl_node_t *cright; + avl_node_t *gchild; + avl_node_t *gright; + avl_node_t *gleft; + int which_child = AVL_XCHILD(node); + int child_bal = AVL_XBALANCE(child); + + /* BEGIN CSTYLED */ + /* + * case 1 : node is overly left heavy, the left child is balanced or + * also left heavy. This requires the following rotation. + * + * (node bal:-2) + * / \ + * / \ + * (child bal:0 or -1) + * / \ + * / \ + * cright + * + * becomes: + * + * (child bal:1 or 0) + * / \ + * / \ + * (node bal:-1 or 0) + * / \ + * / \ + * cright + * + * we detect this situation by noting that child's balance is not + * right_heavy. + */ + /* END CSTYLED */ + if (child_bal != right_heavy) { + + /* + * compute new balance of nodes + * + * If child used to be left heavy (now balanced) we reduced + * the height of this sub-tree -- used in "return...;" below + */ + child_bal += right_heavy; /* adjust towards right */ + + /* + * move "cright" to be node's left child + */ + cright = child->avl_child[right]; + node->avl_child[left] = cright; + if (cright != NULL) { + AVL_SETPARENT(cright, node); + AVL_SETCHILD(cright, left); + } + + /* + * move node to be child's right child + */ + child->avl_child[right] = node; + AVL_SETBALANCE(node, -child_bal); + AVL_SETCHILD(node, right); + AVL_SETPARENT(node, child); + + /* + * update the pointer into this subtree + */ + AVL_SETBALANCE(child, child_bal); + AVL_SETCHILD(child, which_child); + AVL_SETPARENT(child, parent); + if (parent != NULL) + parent->avl_child[which_child] = child; + else + tree->avl_root = child; + + return (child_bal == 0); + } + + /* BEGIN CSTYLED */ + /* + * case 2 : When node is left heavy, but child is right heavy we use + * a different rotation. + * + * (node b:-2) + * / \ + * / \ + * / \ + * (child b:+1) + * / \ + * / \ + * (gchild b: != 0) + * / \ + * / \ + * gleft gright + * + * becomes: + * + * (gchild b:0) + * / \ + * / \ + * / \ + * (child b:?) (node b:?) 
+ * / \ / \ + * / \ / \ + * gleft gright + * + * computing the new balances is more complicated. As an example: + * if gchild was right_heavy, then child is now left heavy + * else it is balanced + */ + /* END CSTYLED */ + gchild = child->avl_child[right]; + gleft = gchild->avl_child[left]; + gright = gchild->avl_child[right]; + + /* + * move gright to left child of node and + * + * move gleft to right child of node + */ + node->avl_child[left] = gright; + if (gright != NULL) { + AVL_SETPARENT(gright, node); + AVL_SETCHILD(gright, left); + } + + child->avl_child[right] = gleft; + if (gleft != NULL) { + AVL_SETPARENT(gleft, child); + AVL_SETCHILD(gleft, right); + } + + /* + * move child to left child of gchild and + * + * move node to right child of gchild and + * + * fixup parent of all this to point to gchild + */ + balance = AVL_XBALANCE(gchild); + gchild->avl_child[left] = child; + AVL_SETBALANCE(child, (balance == right_heavy ? left_heavy : 0)); + AVL_SETPARENT(child, gchild); + AVL_SETCHILD(child, left); + + gchild->avl_child[right] = node; + AVL_SETBALANCE(node, (balance == left_heavy ? right_heavy : 0)); + AVL_SETPARENT(node, gchild); + AVL_SETCHILD(node, right); + + AVL_SETBALANCE(gchild, 0); + AVL_SETPARENT(gchild, parent); + AVL_SETCHILD(gchild, which_child); + if (parent != NULL) + parent->avl_child[which_child] = gchild; + else + tree->avl_root = gchild; + + return (1); /* the new tree is always shorter */ +} + + +/* + * Insert a new node into an AVL tree at the specified (from avl_find()) place. + * + * Newly inserted nodes are always leaf nodes in the tree, since avl_find() + * searches out to the leaf positions. The avl_index_t indicates the node + * which will be the parent of the new node. + * + * After the node is inserted, a single rotation further up the tree may + * be necessary to maintain an acceptable AVL balance. + */ +void +avl_insert(avl_tree_t *tree, void *new_data, avl_index_t where) +{ + avl_node_t *node; + avl_node_t *parent = AVL_INDEX2NODE(where); + int old_balance; + int new_balance; + int which_child = AVL_INDEX2CHILD(where); + size_t off = tree->avl_offset; + + ASSERT(tree); +#ifdef _LP64 + ASSERT(((uintptr_t)new_data & 0x7) == 0); +#endif + + node = AVL_DATA2NODE(new_data, off); + + /* + * First, add the node to the tree at the indicated position. + */ + ++tree->avl_numnodes; + + node->avl_child[0] = NULL; + node->avl_child[1] = NULL; + + AVL_SETCHILD(node, which_child); + AVL_SETBALANCE(node, 0); + AVL_SETPARENT(node, parent); + if (parent != NULL) { + ASSERT(parent->avl_child[which_child] == NULL); + parent->avl_child[which_child] = node; + } else { + ASSERT(tree->avl_root == NULL); + tree->avl_root = node; + } + /* + * Now, back up the tree modifying the balance of all nodes above the + * insertion point. If we get to a highly unbalanced ancestor, we + * need to do a rotation. If we back out of the tree we are done. + * If we brought any subtree into perfect balance (0), we are also done. + */ + for (;;) { + node = parent; + if (node == NULL) + return; + + /* + * Compute the new balance + */ + old_balance = AVL_XBALANCE(node); + new_balance = old_balance + avl_child2balance[which_child]; + + /* + * If we introduced equal balance, then we are done immediately + */ + if (new_balance == 0) { + AVL_SETBALANCE(node, 0); + return; + } + + /* + * If both old and new are not zero we went + * from -1 to -2 balance, do a rotation. 
+ */ + if (old_balance != 0) + break; + + AVL_SETBALANCE(node, new_balance); + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + } + + /* + * perform a rotation to fix the tree and return + */ + (void) avl_rotation(tree, node, new_balance); +} + +/* + * Insert "new_data" in "tree" in the given "direction" either after or + * before (AVL_AFTER, AVL_BEFORE) the data "here". + * + * Insertions can only be done at empty leaf points in the tree, therefore + * if the given child of the node is already present we move to either + * the AVL_PREV or AVL_NEXT and reverse the insertion direction. Since + * every other node in the tree is a leaf, this always works. + * + * To help developers using this interface, we assert that the new node + * is correctly ordered at every step of the way in DEBUG kernels. + */ +void +avl_insert_here( + avl_tree_t *tree, + void *new_data, + void *here, + int direction) +{ + avl_node_t *node; + int child = direction; /* rely on AVL_BEFORE == 0, AVL_AFTER == 1 */ +#ifdef ZFS_DEBUG + int diff; +#endif + + ASSERT(tree != NULL); + ASSERT(new_data != NULL); + ASSERT(here != NULL); + ASSERT(direction == AVL_BEFORE || direction == AVL_AFTER); + + /* + * If corresponding child of node is not NULL, go to the neighboring + * node and reverse the insertion direction. + */ + node = AVL_DATA2NODE(here, tree->avl_offset); + +#ifdef ZFS_DEBUG + diff = tree->avl_compar(new_data, here); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + + if (node->avl_child[child] != NULL) { + node = node->avl_child[child]; + child = 1 - child; + while (node->avl_child[child] != NULL) { +#ifdef ZFS_DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + node = node->avl_child[child]; + } +#ifdef ZFS_DEBUG + diff = tree->avl_compar(new_data, + AVL_NODE2DATA(node, tree->avl_offset)); + ASSERT(-1 <= diff && diff <= 1); + ASSERT(diff != 0); + ASSERT(diff > 0 ? child == 1 : child == 0); +#endif + } + ASSERT(node->avl_child[child] == NULL); + + avl_insert(tree, new_data, AVL_MKINDEX(node, child)); +} + +/* + * Add a new node to an AVL tree. Strictly enforce that no duplicates can + * be added to the tree with a VERIFY which is enabled for non-DEBUG builds. + */ +void +avl_add(avl_tree_t *tree, void *new_node) +{ + avl_index_t where = 0; + + VERIFY(avl_find(tree, new_node, &where) == NULL); + + avl_insert(tree, new_node, where); +} + +/* + * Delete a node from the AVL tree. Deletion is similar to insertion, but + * with 2 complications. + * + * First, we may be deleting an interior node. Consider the following subtree: + * + * d c c + * / \ / \ / \ + * b e b e b e + * / \ / \ / + * a c a a + * + * When we are deleting node (d), we find and bring up an adjacent valued leaf + * node, say (c), to take the interior node's place. In the code this is + * handled by temporarily swapping (d) and (c) in the tree and then using + * common code to delete (d) from the leaf position. + * + * Secondly, an interior deletion from a deep tree may require more than one + * rotation to fix the balance. This is handled by moving up the tree through + * parents and applying rotations as needed. The return value from + * avl_rotation() is used to detect when a subtree did not change overall + * height due to a rotation. 
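+ *
+ * As the overview at the top of this file notes, these routines do no
+ * locking of their own, so a removal is typically wrapped in the
+ * caller's exclusive lock; an illustrative pattern ("tree_lock" is a
+ * hypothetical caller-owned mutex, not part of this file):
+ *
+ *	mutex_enter(&tree_lock);
+ *	avl_remove(&tree, obj);
+ *	mutex_exit(&tree_lock);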
+ */ +void +avl_remove(avl_tree_t *tree, void *data) +{ + avl_node_t *delete; + avl_node_t *parent; + avl_node_t *node; + avl_node_t tmp; + int old_balance; + int new_balance; + int left; + int right; + int which_child; + size_t off = tree->avl_offset; + + ASSERT(tree); + + delete = AVL_DATA2NODE(data, off); + + /* + * Deletion is easiest with a node that has at most 1 child. + * We swap a node with 2 children with a sequentially valued + * neighbor node. That node will have at most 1 child. Note this + * has no effect on the ordering of the remaining nodes. + * + * As an optimization, we choose the greater neighbor if the tree + * is right heavy, otherwise the left neighbor. This reduces the + * number of rotations needed. + */ + if (delete->avl_child[0] != NULL && delete->avl_child[1] != NULL) { + + /* + * choose node to swap from whichever side is taller + */ + old_balance = AVL_XBALANCE(delete); + left = avl_balance2child[old_balance + 1]; + right = 1 - left; + + /* + * get to the previous value'd node + * (down 1 left, as far as possible right) + */ + for (node = delete->avl_child[left]; + node->avl_child[right] != NULL; + node = node->avl_child[right]) + ; + + /* + * create a temp placeholder for 'node' + * move 'node' to delete's spot in the tree + */ + tmp = *node; + + *node = *delete; + if (node->avl_child[left] == node) + node->avl_child[left] = &tmp; + + parent = AVL_XPARENT(node); + if (parent != NULL) + parent->avl_child[AVL_XCHILD(node)] = node; + else + tree->avl_root = node; + AVL_SETPARENT(node->avl_child[left], node); + AVL_SETPARENT(node->avl_child[right], node); + + /* + * Put tmp where node used to be (just temporary). + * It always has a parent and at most 1 child. + */ + delete = &tmp; + parent = AVL_XPARENT(delete); + parent->avl_child[AVL_XCHILD(delete)] = delete; + which_child = (delete->avl_child[1] != 0); + if (delete->avl_child[which_child] != NULL) + AVL_SETPARENT(delete->avl_child[which_child], delete); + } + + + /* + * Here we know "delete" is at least partially a leaf node. It can + * be easily removed from the tree. + */ + ASSERT(tree->avl_numnodes > 0); + --tree->avl_numnodes; + parent = AVL_XPARENT(delete); + which_child = AVL_XCHILD(delete); + if (delete->avl_child[0] != NULL) + node = delete->avl_child[0]; + else + node = delete->avl_child[1]; + + /* + * Connect parent directly to node (leaving out delete). + */ + if (node != NULL) { + AVL_SETPARENT(node, parent); + AVL_SETCHILD(node, which_child); + } + if (parent == NULL) { + tree->avl_root = node; + return; + } + parent->avl_child[which_child] = node; + + + /* + * Since the subtree is now shorter, begin adjusting parent balances + * and performing any needed rotations. + */ + do { + + /* + * Move up the tree and adjust the balance + * + * Capture the parent and which_child values for the next + * iteration before any rotations occur. + */ + node = parent; + old_balance = AVL_XBALANCE(node); + new_balance = old_balance - avl_child2balance[which_child]; + parent = AVL_XPARENT(node); + which_child = AVL_XCHILD(node); + + /* + * If a node was in perfect balance but isn't anymore then + * we can stop, since the height didn't change above this point + * due to a deletion. + */ + if (old_balance == 0) { + AVL_SETBALANCE(node, new_balance); + break; + } + + /* + * If the new balance is zero, we don't need to rotate + * else + * need a rotation to fix the balance. + * If the rotation doesn't change the height + * of the sub-tree we have finished adjusting. 
+ */ + if (new_balance == 0) + AVL_SETBALANCE(node, new_balance); + else if (!avl_rotation(tree, node, new_balance)) + break; + } while (parent != NULL); +} + +#define AVL_REINSERT(tree, obj) \ + avl_remove((tree), (obj)); \ + avl_add((tree), (obj)) + +boolean_t +avl_update_lt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_NEXT(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) <= 0)); + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update_gt(avl_tree_t *t, void *obj) +{ + void *neighbor; + + ASSERT(((neighbor = AVL_PREV(t, obj)) == NULL) || + (t->avl_compar(obj, neighbor) >= 0)); + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +boolean_t +avl_update(avl_tree_t *t, void *obj) +{ + void *neighbor; + + neighbor = AVL_PREV(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) < 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + neighbor = AVL_NEXT(t, obj); + if ((neighbor != NULL) && (t->avl_compar(obj, neighbor) > 0)) { + AVL_REINSERT(t, obj); + return (B_TRUE); + } + + return (B_FALSE); +} + +void +avl_swap(avl_tree_t *tree1, avl_tree_t *tree2) +{ + avl_node_t *temp_node; + ulong_t temp_numnodes; + + ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar); + ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset); + ASSERT3U(tree1->avl_size, ==, tree2->avl_size); + + temp_node = tree1->avl_root; + temp_numnodes = tree1->avl_numnodes; + tree1->avl_root = tree2->avl_root; + tree1->avl_numnodes = tree2->avl_numnodes; + tree2->avl_root = temp_node; + tree2->avl_numnodes = temp_numnodes; +} + +/* + * initialize a new AVL tree + */ +void +avl_create(avl_tree_t *tree, int (*compar) (const void *, const void *), + size_t size, size_t offset) +{ + ASSERT(tree); + ASSERT(compar); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (avl_node_t)); +#ifdef _LP64 + ASSERT((offset & 0x7) == 0); +#endif + + tree->avl_compar = compar; + tree->avl_root = NULL; + tree->avl_numnodes = 0; + tree->avl_size = size; + tree->avl_offset = offset; +} + +/* + * Delete a tree. + */ +/* ARGSUSED */ +void +avl_destroy(avl_tree_t *tree) +{ + ASSERT(tree); + ASSERT(tree->avl_numnodes == 0); + ASSERT(tree->avl_root == NULL); +} + + +/* + * Return the number of nodes in an AVL tree. + */ +ulong_t +avl_numnodes(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes); +} + +boolean_t +avl_is_empty(avl_tree_t *tree) +{ + ASSERT(tree); + return (tree->avl_numnodes == 0); +} + +#define CHILDBIT (1L) + +/* + * Post-order tree walk used to visit all tree nodes and destroy the tree + * in post order. This is used for removing all the nodes from a tree without + * paying any cost for rebalancing it. + * + * example: + * + * void *cookie = NULL; + * my_data_t *node; + * + * while ((node = avl_destroy_nodes(tree, &cookie)) != NULL) + * free(node); + * avl_destroy(tree); + * + * The cookie is really an avl_node_t to the current node's parent and + * an indication of which child you looked at last. + * + * On input, a cookie value of CHILDBIT indicates the tree is done. + */ +void * +avl_destroy_nodes(avl_tree_t *tree, void **cookie) +{ + avl_node_t *node; + avl_node_t *parent; + int child; + void *first; + size_t off = tree->avl_offset; + + /* + * Initial calls go to the first node or it's right descendant. 
+ */ + if (*cookie == NULL) { + first = avl_first(tree); + + /* + * deal with an empty tree + */ + if (first == NULL) { + *cookie = (void *)CHILDBIT; + return (NULL); + } + + node = AVL_DATA2NODE(first, off); + parent = AVL_XPARENT(node); + goto check_right_side; + } + + /* + * If there is no parent to return to we are done. + */ + parent = (avl_node_t *)((uintptr_t)(*cookie) & ~CHILDBIT); + if (parent == NULL) { + if (tree->avl_root != NULL) { + ASSERT(tree->avl_numnodes == 1); + tree->avl_root = NULL; + tree->avl_numnodes = 0; + } + return (NULL); + } + + /* + * Remove the child pointer we just visited from the parent and tree. + */ + child = (uintptr_t)(*cookie) & CHILDBIT; + parent->avl_child[child] = NULL; + ASSERT(tree->avl_numnodes > 1); + --tree->avl_numnodes; + + /* + * If we just did a right child or there isn't one, go up to parent. + */ + if (child == 1 || parent->avl_child[1] == NULL) { + node = parent; + parent = AVL_XPARENT(parent); + goto done; + } + + /* + * Do parent's right child, then leftmost descendent. + */ + node = parent->avl_child[1]; + while (node->avl_child[0] != NULL) { + parent = node; + node = node->avl_child[0]; + } + + /* + * If here, we moved to a left child. It may have one + * child on the right (when balance == +1). + */ +check_right_side: + if (node->avl_child[1] != NULL) { + ASSERT(AVL_XBALANCE(node) == 1); + parent = node; + node = node->avl_child[1]; + ASSERT(node->avl_child[0] == NULL && + node->avl_child[1] == NULL); + } else { + ASSERT(AVL_XBALANCE(node) <= 0); + } + +done: + if (parent == NULL) { + *cookie = (void *)CHILDBIT; + ASSERT(node == tree->avl_root); + } else { + *cookie = (void *)((uintptr_t)parent | AVL_XCHILD(node)); + } + + return (AVL_NODE2DATA(node, off)); +} + +#if defined(_KERNEL) + +static int __init +avl_init(void) +{ + return (0); +} + +static void __exit +avl_fini(void) +{ +} + +module_init(avl_init); +module_exit(avl_fini); +#endif + +ZFS_MODULE_DESCRIPTION("Generic AVL tree implementation"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); + +EXPORT_SYMBOL(avl_create); +EXPORT_SYMBOL(avl_find); +EXPORT_SYMBOL(avl_insert); +EXPORT_SYMBOL(avl_insert_here); +EXPORT_SYMBOL(avl_walk); +EXPORT_SYMBOL(avl_first); +EXPORT_SYMBOL(avl_last); +EXPORT_SYMBOL(avl_nearest); +EXPORT_SYMBOL(avl_add); +EXPORT_SYMBOL(avl_swap); +EXPORT_SYMBOL(avl_is_empty); +EXPORT_SYMBOL(avl_remove); +EXPORT_SYMBOL(avl_numnodes); +EXPORT_SYMBOL(avl_destroy_nodes); +EXPORT_SYMBOL(avl_destroy); +EXPORT_SYMBOL(avl_update_lt); +EXPORT_SYMBOL(avl_update_gt); +EXPORT_SYMBOL(avl_update); diff --git a/module/icp/Makefile.in b/module/icp/Makefile.in new file mode 100644 index 000000000000..7a01b2f08b8e --- /dev/null +++ b/module/icp/Makefile.in @@ -0,0 +1,96 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +icp_include = $(src)/include +else +icp_include = $(srctree)/$(src)/include +endif + +MODULE := icp + +obj-$(CONFIG_ZFS) := $(MODULE).o + +asflags-y := -I$(icp_include) +ccflags-y := -I$(icp_include) + +$(MODULE)-objs += illumos-crypto.o +$(MODULE)-objs += api/kcf_cipher.o +$(MODULE)-objs += api/kcf_digest.o +$(MODULE)-objs += api/kcf_mac.o +$(MODULE)-objs += api/kcf_miscapi.o +$(MODULE)-objs += api/kcf_ctxops.o +$(MODULE)-objs += core/kcf_callprov.o +$(MODULE)-objs += core/kcf_prov_tabs.o +$(MODULE)-objs += core/kcf_sched.o +$(MODULE)-objs += core/kcf_mech_tabs.o +$(MODULE)-objs += core/kcf_prov_lib.o +$(MODULE)-objs += spi/kcf_spi.o +$(MODULE)-objs += 
io/aes.o +$(MODULE)-objs += io/edonr_mod.o +$(MODULE)-objs += io/sha1_mod.o +$(MODULE)-objs += io/sha2_mod.o +$(MODULE)-objs += io/skein_mod.o +$(MODULE)-objs += os/modhash.o +$(MODULE)-objs += os/modconf.o +$(MODULE)-objs += algs/modes/cbc.o +$(MODULE)-objs += algs/modes/ccm.o +$(MODULE)-objs += algs/modes/ctr.o +$(MODULE)-objs += algs/modes/ecb.o +$(MODULE)-objs += algs/modes/gcm_generic.o +$(MODULE)-objs += algs/modes/gcm.o +$(MODULE)-objs += algs/modes/modes.o +$(MODULE)-objs += algs/aes/aes_impl_generic.o +$(MODULE)-objs += algs/aes/aes_impl.o +$(MODULE)-objs += algs/aes/aes_modes.o +$(MODULE)-objs += algs/edonr/edonr.o +$(MODULE)-objs += algs/sha1/sha1.o +$(MODULE)-objs += algs/sha2/sha2.o +$(MODULE)-objs += algs/sha1/sha1.o +$(MODULE)-objs += algs/skein/skein.o +$(MODULE)-objs += algs/skein/skein_block.o +$(MODULE)-objs += algs/skein/skein_iv.o + +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aeskey.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_amd64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/aes/aes_aesni.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/gcm_pclmulqdq.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/aesni-gcm-x86_64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/modes/ghash-x86_64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha1/sha1-x86_64.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha256_impl.o +$(MODULE)-$(CONFIG_X86_64) += asm-x86_64/sha2/sha512_impl.o + +$(MODULE)-$(CONFIG_X86) += algs/modes/gcm_pclmulqdq.o +$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_aesni.o +$(MODULE)-$(CONFIG_X86) += algs/aes/aes_impl_x86-64.o + +# Suppress objtool "can't find jump dest instruction at" warnings. They +# are caused by the constants which are defined in the text section of the +# assembly file using .byte instructions (e.g. bswap_mask). The objtool +# utility tries to interpret them as opcodes and obviously fails doing so. +OBJECT_FILES_NON_STANDARD_aesni-gcm-x86_64.o := y +OBJECT_FILES_NON_STANDARD_ghash-x86_64.o := y + +ICP_DIRS = \ + api \ + core \ + spi \ + io \ + os \ + algs \ + algs/aes \ + algs/edonr \ + algs/modes \ + algs/sha1 \ + algs/sha2 \ + algs/skein \ + asm-x86_64 \ + asm-x86_64/aes \ + asm-x86_64/modes \ + asm-x86_64/sha1 \ + asm-x86_64/sha2 \ + asm-i386 \ + asm-generic + +all: + mkdir -p $(ICP_DIRS) diff --git a/module/icp/algs/aes/aes_impl.c b/module/icp/algs/aes/aes_impl.c new file mode 100644 index 000000000000..037be0db60d7 --- /dev/null +++ b/module/icp/algs/aes/aes_impl.c @@ -0,0 +1,443 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Initialize AES encryption and decryption key schedules. 
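+ *
+ * An illustrative call sequence (the variable names are hypothetical;
+ * KM_SLEEP is the usual sleeping-allocation flag for kmem_alloc(9F)):
+ *
+ *	size_t size;
+ *	void *ks = aes_alloc_keysched(&size, KM_SLEEP);
+ *	if (ks != NULL)
+ *		aes_init_keysched(raw_key, 256, ks);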
+ * + * Parameters: + * cipherKey User key + * keyBits AES key size (128, 192, or 256 bits) + * keysched AES key schedule to be initialized, of type aes_key_t. + * Allocated by aes_alloc_keysched(). + */ +void +aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, void *keysched) +{ + const aes_impl_ops_t *ops = aes_impl_get_ops(); + aes_key_t *newbie = keysched; + uint_t keysize, i, j; + union { + uint64_t ka64[4]; + uint32_t ka32[8]; + } keyarr; + + switch (keyBits) { + case 128: + newbie->nr = 10; + break; + + case 192: + newbie->nr = 12; + break; + + case 256: + newbie->nr = 14; + break; + + default: + /* should never get here */ + return; + } + keysize = CRYPTO_BITS2BYTES(keyBits); + + /* + * Generic C implementation requires byteswap for little endian + * machines, various accelerated implementations for various + * architectures may not. + */ + if (!ops->needs_byteswap) { + /* no byteswap needed */ + if (IS_P2ALIGNED(cipherKey, sizeof (uint64_t))) { + for (i = 0, j = 0; j < keysize; i++, j += 8) { + /* LINTED: pointer alignment */ + keyarr.ka64[i] = *((uint64_t *)&cipherKey[j]); + } + } else { + bcopy(cipherKey, keyarr.ka32, keysize); + } + } else { + /* byte swap */ + for (i = 0, j = 0; j < keysize; i++, j += 4) { + keyarr.ka32[i] = + htonl(*(uint32_t *)(void *)&cipherKey[j]); + } + } + + ops->generate(newbie, keyarr.ka32, keyBits); + newbie->ops = ops; + + /* + * Note: if there are systems that need the AES_64BIT_KS type in the + * future, move setting key schedule type to individual implementations + */ + newbie->type = AES_32BIT_KS; +} + + +/* + * Encrypt one block using AES. + * Align if needed and (for x86 32-bit only) byte-swap. + * + * Parameters: + * ks Key schedule, of type aes_key_t + * pt Input block (plain text) + * ct Output block (crypto text). Can overlap with pt + */ +int +aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct) +{ + aes_key_t *ksch = (aes_key_t *)ks; + const aes_impl_ops_t *ops = ksch->ops; + + if (IS_P2ALIGNED2(pt, ct, sizeof (uint32_t)) && !ops->needs_byteswap) { + /* LINTED: pointer alignment */ + ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr, + /* LINTED: pointer alignment */ + (uint32_t *)pt, (uint32_t *)ct); + } else { + uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)]; + + /* Copy input block into buffer */ + if (ops->needs_byteswap) { + buffer[0] = htonl(*(uint32_t *)(void *)&pt[0]); + buffer[1] = htonl(*(uint32_t *)(void *)&pt[4]); + buffer[2] = htonl(*(uint32_t *)(void *)&pt[8]); + buffer[3] = htonl(*(uint32_t *)(void *)&pt[12]); + } else + bcopy(pt, &buffer, AES_BLOCK_LEN); + + ops->encrypt(&ksch->encr_ks.ks32[0], ksch->nr, buffer, buffer); + + /* Copy result from buffer to output block */ + if (ops->needs_byteswap) { + *(uint32_t *)(void *)&ct[0] = htonl(buffer[0]); + *(uint32_t *)(void *)&ct[4] = htonl(buffer[1]); + *(uint32_t *)(void *)&ct[8] = htonl(buffer[2]); + *(uint32_t *)(void *)&ct[12] = htonl(buffer[3]); + } else + bcopy(&buffer, ct, AES_BLOCK_LEN); + } + return (CRYPTO_SUCCESS); +} + + +/* + * Decrypt one block using AES. + * Align and byte-swap if needed. + * + * Parameters: + * ks Key schedule, of type aes_key_t + * ct Input block (crypto text) + * pt Output block (plain text). 
Can overlap with pt + */ +int +aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt) +{ + aes_key_t *ksch = (aes_key_t *)ks; + const aes_impl_ops_t *ops = ksch->ops; + + if (IS_P2ALIGNED2(ct, pt, sizeof (uint32_t)) && !ops->needs_byteswap) { + /* LINTED: pointer alignment */ + ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr, + /* LINTED: pointer alignment */ + (uint32_t *)ct, (uint32_t *)pt); + } else { + uint32_t buffer[AES_BLOCK_LEN / sizeof (uint32_t)]; + + /* Copy input block into buffer */ + if (ops->needs_byteswap) { + buffer[0] = htonl(*(uint32_t *)(void *)&ct[0]); + buffer[1] = htonl(*(uint32_t *)(void *)&ct[4]); + buffer[2] = htonl(*(uint32_t *)(void *)&ct[8]); + buffer[3] = htonl(*(uint32_t *)(void *)&ct[12]); + } else + bcopy(ct, &buffer, AES_BLOCK_LEN); + + ops->decrypt(&ksch->decr_ks.ks32[0], ksch->nr, buffer, buffer); + + /* Copy result from buffer to output block */ + if (ops->needs_byteswap) { + *(uint32_t *)(void *)&pt[0] = htonl(buffer[0]); + *(uint32_t *)(void *)&pt[4] = htonl(buffer[1]); + *(uint32_t *)(void *)&pt[8] = htonl(buffer[2]); + *(uint32_t *)(void *)&pt[12] = htonl(buffer[3]); + } else + bcopy(&buffer, pt, AES_BLOCK_LEN); + } + return (CRYPTO_SUCCESS); +} + + +/* + * Allocate key schedule for AES. + * + * Return the pointer and set size to the number of bytes allocated. + * Memory allocated must be freed by the caller when done. + * + * Parameters: + * size Size of key schedule allocated, in bytes + * kmflag Flag passed to kmem_alloc(9F); ignored in userland. + */ +/* ARGSUSED */ +void * +aes_alloc_keysched(size_t *size, int kmflag) +{ + aes_key_t *keysched; + + keysched = (aes_key_t *)kmem_alloc(sizeof (aes_key_t), kmflag); + if (keysched != NULL) { + *size = sizeof (aes_key_t); + return (keysched); + } + return (NULL); +} + +/* AES implementation that contains the fastest methods */ +static aes_impl_ops_t aes_fastest_impl = { + .name = "fastest" +}; + +/* All compiled in implementations */ +const aes_impl_ops_t *aes_all_impl[] = { + &aes_generic_impl, +#if defined(__x86_64) + &aes_x86_64_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AES) + &aes_aesni_impl, +#endif +}; + +/* Indicate that benchmark has been completed */ +static boolean_t aes_impl_initialized = B_FALSE; + +/* Select aes implementation */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) + +#define AES_IMPL_READ(i) (*(volatile uint32_t *) &(i)) + +static uint32_t icp_aes_impl = IMPL_FASTEST; +static uint32_t user_sel_impl = IMPL_FASTEST; + +/* Hold all supported implementations */ +static size_t aes_supp_impl_cnt = 0; +static aes_impl_ops_t *aes_supp_impl[ARRAY_SIZE(aes_all_impl)]; + +/* + * Returns the AES operations for encrypt/decrypt/key setup. When a + * SIMD implementation is not allowed in the current context, then + * fallback to the fastest generic implementation. 
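+ *
+ * (For example, when kfpu_allowed() returns B_FALSE -- a context in
+ * which FPU/SIMD state must not be touched -- this returns the generic
+ * implementation directly, regardless of the configured selection.)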
+ */ +const aes_impl_ops_t * +aes_impl_get_ops(void) +{ + if (!kfpu_allowed()) + return (&aes_generic_impl); + + const aes_impl_ops_t *ops = NULL; + const uint32_t impl = AES_IMPL_READ(icp_aes_impl); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(aes_impl_initialized); + ops = &aes_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through supported implementations */ + ASSERT(aes_impl_initialized); + ASSERT3U(aes_supp_impl_cnt, >, 0); + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % aes_supp_impl_cnt; + ops = aes_supp_impl[idx]; + break; + default: + ASSERT3U(impl, <, aes_supp_impl_cnt); + ASSERT3U(aes_supp_impl_cnt, >, 0); + if (impl < ARRAY_SIZE(aes_all_impl)) + ops = aes_supp_impl[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + + return (ops); +} + +/* + * Initialize all supported implementations. + */ +void +aes_impl_init(void) +{ + aes_impl_ops_t *curr_impl; + int i, c; + + /* Move supported implementations into aes_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(aes_all_impl); i++) { + curr_impl = (aes_impl_ops_t *)aes_all_impl[i]; + + if (curr_impl->is_supported()) + aes_supp_impl[c++] = (aes_impl_ops_t *)curr_impl; + } + aes_supp_impl_cnt = c; + + /* + * Set the fastest implementation given the assumption that the + * hardware accelerated version is the fastest. + */ +#if defined(__x86_64) +#if defined(HAVE_AES) + if (aes_aesni_impl.is_supported()) { + memcpy(&aes_fastest_impl, &aes_aesni_impl, + sizeof (aes_fastest_impl)); + } else +#endif + { + memcpy(&aes_fastest_impl, &aes_x86_64_impl, + sizeof (aes_fastest_impl)); + } +#else + memcpy(&aes_fastest_impl, &aes_generic_impl, + sizeof (aes_fastest_impl)); +#endif + + strlcpy(aes_fastest_impl.name, "fastest", AES_IMPL_NAME_MAX); + + /* Finish initialization */ + atomic_swap_32(&icp_aes_impl, user_sel_impl); + aes_impl_initialized = B_TRUE; +} + +static const struct { + char *name; + uint32_t sel; +} aes_impl_opts[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, +}; + +/* + * Function sets desired aes implementation. + * + * If we are called before init(), user preference will be saved in + * user_sel_impl, and applied in later init() call. This occurs when module + * parameter is specified on module load. Otherwise, directly update + * icp_aes_impl. + * + * @val Name of aes implementation to use + * @param Unused. 
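+ *
+ * An illustrative use ("aesni" is accepted only once aes_impl_init()
+ * has run on a CPU that supports it; "fastest" is always accepted):
+ *
+ *	if (aes_impl_set("aesni") != 0)
+ *		(void) aes_impl_set("fastest");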
+ */ +int +aes_impl_set(const char *val) +{ + int err = -EINVAL; + char req_name[AES_IMPL_NAME_MAX]; + uint32_t impl = AES_IMPL_READ(user_sel_impl); + size_t i; + + /* sanitize input */ + i = strnlen(val, AES_IMPL_NAME_MAX); + if (i == 0 || i >= AES_IMPL_NAME_MAX) + return (err); + + strlcpy(req_name, val, AES_IMPL_NAME_MAX); + while (i > 0 && isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + /* Check mandatory options */ + for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) { + if (strcmp(req_name, aes_impl_opts[i].name) == 0) { + impl = aes_impl_opts[i].sel; + err = 0; + break; + } + } + + /* check all supported impl if init() was already called */ + if (err != 0 && aes_impl_initialized) { + /* check all supported implementations */ + for (i = 0; i < aes_supp_impl_cnt; i++) { + if (strcmp(req_name, aes_supp_impl[i]->name) == 0) { + impl = i; + err = 0; + break; + } + } + } + + if (err == 0) { + if (aes_impl_initialized) + atomic_swap_32(&icp_aes_impl, impl); + else + atomic_swap_32(&user_sel_impl, impl); + } + + return (err); +} + +#if defined(_KERNEL) && defined(__linux__) + +static int +icp_aes_impl_set(const char *val, zfs_kernel_param_t *kp) +{ + return (aes_impl_set(val)); +} + +static int +icp_aes_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cnt = 0; + char *fmt; + const uint32_t impl = AES_IMPL_READ(icp_aes_impl); + + ASSERT(aes_impl_initialized); + + /* list mandatory options */ + for (i = 0; i < ARRAY_SIZE(aes_impl_opts); i++) { + fmt = (impl == aes_impl_opts[i].sel) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, aes_impl_opts[i].name); + } + + /* list all supported implementations */ + for (i = 0; i < aes_supp_impl_cnt; i++) { + fmt = (i == impl) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, aes_supp_impl[i]->name); + } + + return (cnt); +} + +module_param_call(icp_aes_impl, icp_aes_impl_set, icp_aes_impl_get, + NULL, 0644); +MODULE_PARM_DESC(icp_aes_impl, "Select aes implementation."); +#endif diff --git a/module/icp/algs/aes/aes_impl_aesni.c b/module/icp/algs/aes/aes_impl_aesni.c new file mode 100644 index 000000000000..4b5eefd71b17 --- /dev/null +++ b/module/icp/algs/aes/aes_impl_aesni.c @@ -0,0 +1,124 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. 
+ */ + +#if defined(__x86_64) && defined(HAVE_AES) + +#include +#include + +/* These functions are used to execute AES-NI instructions: */ +extern int rijndael_key_setup_enc_intel(uint32_t rk[], + const uint32_t cipherKey[], uint64_t keyBits); +extern int rijndael_key_setup_dec_intel(uint32_t rk[], + const uint32_t cipherKey[], uint64_t keyBits); +extern void aes_encrypt_intel(const uint32_t rk[], int Nr, + const uint32_t pt[4], uint32_t ct[4]); +extern void aes_decrypt_intel(const uint32_t rk[], int Nr, + const uint32_t ct[4], uint32_t pt[4]); + + +#include + +/* + * Expand the 32-bit AES cipher key array into the encryption and decryption + * key schedules. + * + * Parameters: + * key AES key schedule to be initialized + * keyarr32 User key + * keyBits AES key size (128, 192, or 256 bits) + */ +static void +aes_aesni_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits) +{ + kfpu_begin(); + key->nr = rijndael_key_setup_enc_intel(&(key->encr_ks.ks32[0]), + keyarr32, keybits); + key->nr = rijndael_key_setup_dec_intel(&(key->decr_ks.ks32[0]), + keyarr32, keybits); + kfpu_end(); +} + +/* + * Encrypt one block of data. The block is assumed to be an array + * of four uint32_t values, so copy for alignment (and byte-order + * reversal for little endian systems might be necessary on the + * input and output byte streams. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4*(Nr + 1). + * + * Parameters: + * rk Key schedule, of aes_ks_t (60 32-bit integers) + * Nr Number of rounds + * pt Input block (plain text) + * ct Output block (crypto text). Can overlap with pt + */ +static void +aes_aesni_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) +{ + kfpu_begin(); + aes_encrypt_intel(rk, Nr, pt, ct); + kfpu_end(); +} + +/* + * Decrypt one block of data. The block is assumed to be an array + * of four uint32_t values, so copy for alignment (and byte-order + * reversal for little endian systems might be necessary on the + * input and output byte streams. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4*(Nr + 1). + * + * Parameters: + * rk Key schedule, of aes_ks_t (60 32-bit integers) + * Nr Number of rounds + * ct Input block (crypto text) + * pt Output block (plain text). Can overlap with pt + */ +static void +aes_aesni_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) +{ + kfpu_begin(); + aes_decrypt_intel(rk, Nr, ct, pt); + kfpu_end(); +} + +static boolean_t +aes_aesni_will_work(void) +{ + return (kfpu_allowed() && zfs_aes_available()); +} + +const aes_impl_ops_t aes_aesni_impl = { + .generate = &aes_aesni_generate, + .encrypt = &aes_aesni_encrypt, + .decrypt = &aes_aesni_decrypt, + .is_supported = &aes_aesni_will_work, + .needs_byteswap = B_FALSE, + .name = "aesni" +}; + +#endif /* defined(__x86_64) && defined(HAVE_AES) */ diff --git a/module/icp/algs/aes/aes_impl_generic.c b/module/icp/algs/aes/aes_impl_generic.c new file mode 100644 index 000000000000..427c096c6ab3 --- /dev/null +++ b/module/icp/algs/aes/aes_impl_generic.c @@ -0,0 +1,1242 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
diff --git a/module/icp/algs/aes/aes_impl_generic.c b/module/icp/algs/aes/aes_impl_generic.c
new file mode 100644
index 000000000000..427c096c6ab3
--- /dev/null
+++ b/module/icp/algs/aes/aes_impl_generic.c
@@ -0,0 +1,1242 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#include <aes/aes_impl.h>
+
+/*
+ * This file is derived from the file rijndael-alg-fst.c taken from the
+ * "optimized C code v3.0" on the "rijndael home page"
+ * http://www.iaik.tu-graz.ac.at/research/krypto/AES/old/~rijmen/rijndael/
+ * pointed to by the NIST web-site http://csrc.nist.gov/archive/aes/
+ *
+ * The following note is from the original file:
+ */
+
+/*
+ * rijndael-alg-fst.c
+ *
+ * @version 3.0 (December 2000)
+ *
+ * Optimised ANSI C code for the Rijndael cipher (now AES)
+ *
+ * @author Vincent Rijmen
+ * @author Antoon Bosselaers
+ * @author Paulo Barreto
+ *
+ * This code is hereby placed in the public domain.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ''AS IS'' AND ANY EXPRESS
+ * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+ * BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+ * OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
+ * EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */ + +/* + * Constant tables + */ + +/* + * Te0[x] = S [x].[02, 01, 01, 03]; + * Te1[x] = S [x].[03, 02, 01, 01]; + * Te2[x] = S [x].[01, 03, 02, 01]; + * Te3[x] = S [x].[01, 01, 03, 02]; + * Te4[x] = S [x].[01, 01, 01, 01]; + * + * Td0[x] = Si[x].[0e, 09, 0d, 0b]; + * Td1[x] = Si[x].[0b, 0e, 09, 0d]; + * Td2[x] = Si[x].[0d, 0b, 0e, 09]; + * Td3[x] = Si[x].[09, 0d, 0b, 0e]; + * Td4[x] = Si[x].[01, 01, 01, 01]; + */ + +/* Encrypt Sbox constants (for the substitute bytes operation) */ + +static const uint32_t Te0[256] = +{ + 0xc66363a5U, 0xf87c7c84U, 0xee777799U, 0xf67b7b8dU, + 0xfff2f20dU, 0xd66b6bbdU, 0xde6f6fb1U, 0x91c5c554U, + 0x60303050U, 0x02010103U, 0xce6767a9U, 0x562b2b7dU, + 0xe7fefe19U, 0xb5d7d762U, 0x4dababe6U, 0xec76769aU, + 0x8fcaca45U, 0x1f82829dU, 0x89c9c940U, 0xfa7d7d87U, + 0xeffafa15U, 0xb25959ebU, 0x8e4747c9U, 0xfbf0f00bU, + 0x41adadecU, 0xb3d4d467U, 0x5fa2a2fdU, 0x45afafeaU, + 0x239c9cbfU, 0x53a4a4f7U, 0xe4727296U, 0x9bc0c05bU, + 0x75b7b7c2U, 0xe1fdfd1cU, 0x3d9393aeU, 0x4c26266aU, + 0x6c36365aU, 0x7e3f3f41U, 0xf5f7f702U, 0x83cccc4fU, + 0x6834345cU, 0x51a5a5f4U, 0xd1e5e534U, 0xf9f1f108U, + 0xe2717193U, 0xabd8d873U, 0x62313153U, 0x2a15153fU, + 0x0804040cU, 0x95c7c752U, 0x46232365U, 0x9dc3c35eU, + 0x30181828U, 0x379696a1U, 0x0a05050fU, 0x2f9a9ab5U, + 0x0e070709U, 0x24121236U, 0x1b80809bU, 0xdfe2e23dU, + 0xcdebeb26U, 0x4e272769U, 0x7fb2b2cdU, 0xea75759fU, + 0x1209091bU, 0x1d83839eU, 0x582c2c74U, 0x341a1a2eU, + 0x361b1b2dU, 0xdc6e6eb2U, 0xb45a5aeeU, 0x5ba0a0fbU, + 0xa45252f6U, 0x763b3b4dU, 0xb7d6d661U, 0x7db3b3ceU, + 0x5229297bU, 0xdde3e33eU, 0x5e2f2f71U, 0x13848497U, + 0xa65353f5U, 0xb9d1d168U, 0x00000000U, 0xc1eded2cU, + 0x40202060U, 0xe3fcfc1fU, 0x79b1b1c8U, 0xb65b5bedU, + 0xd46a6abeU, 0x8dcbcb46U, 0x67bebed9U, 0x7239394bU, + 0x944a4adeU, 0x984c4cd4U, 0xb05858e8U, 0x85cfcf4aU, + 0xbbd0d06bU, 0xc5efef2aU, 0x4faaaae5U, 0xedfbfb16U, + 0x864343c5U, 0x9a4d4dd7U, 0x66333355U, 0x11858594U, + 0x8a4545cfU, 0xe9f9f910U, 0x04020206U, 0xfe7f7f81U, + 0xa05050f0U, 0x783c3c44U, 0x259f9fbaU, 0x4ba8a8e3U, + 0xa25151f3U, 0x5da3a3feU, 0x804040c0U, 0x058f8f8aU, + 0x3f9292adU, 0x219d9dbcU, 0x70383848U, 0xf1f5f504U, + 0x63bcbcdfU, 0x77b6b6c1U, 0xafdada75U, 0x42212163U, + 0x20101030U, 0xe5ffff1aU, 0xfdf3f30eU, 0xbfd2d26dU, + 0x81cdcd4cU, 0x180c0c14U, 0x26131335U, 0xc3ecec2fU, + 0xbe5f5fe1U, 0x359797a2U, 0x884444ccU, 0x2e171739U, + 0x93c4c457U, 0x55a7a7f2U, 0xfc7e7e82U, 0x7a3d3d47U, + 0xc86464acU, 0xba5d5de7U, 0x3219192bU, 0xe6737395U, + 0xc06060a0U, 0x19818198U, 0x9e4f4fd1U, 0xa3dcdc7fU, + 0x44222266U, 0x542a2a7eU, 0x3b9090abU, 0x0b888883U, + 0x8c4646caU, 0xc7eeee29U, 0x6bb8b8d3U, 0x2814143cU, + 0xa7dede79U, 0xbc5e5ee2U, 0x160b0b1dU, 0xaddbdb76U, + 0xdbe0e03bU, 0x64323256U, 0x743a3a4eU, 0x140a0a1eU, + 0x924949dbU, 0x0c06060aU, 0x4824246cU, 0xb85c5ce4U, + 0x9fc2c25dU, 0xbdd3d36eU, 0x43acacefU, 0xc46262a6U, + 0x399191a8U, 0x319595a4U, 0xd3e4e437U, 0xf279798bU, + 0xd5e7e732U, 0x8bc8c843U, 0x6e373759U, 0xda6d6db7U, + 0x018d8d8cU, 0xb1d5d564U, 0x9c4e4ed2U, 0x49a9a9e0U, + 0xd86c6cb4U, 0xac5656faU, 0xf3f4f407U, 0xcfeaea25U, + 0xca6565afU, 0xf47a7a8eU, 0x47aeaee9U, 0x10080818U, + 0x6fbabad5U, 0xf0787888U, 0x4a25256fU, 0x5c2e2e72U, + 0x381c1c24U, 0x57a6a6f1U, 0x73b4b4c7U, 0x97c6c651U, + 0xcbe8e823U, 0xa1dddd7cU, 0xe874749cU, 0x3e1f1f21U, + 0x964b4bddU, 0x61bdbddcU, 0x0d8b8b86U, 0x0f8a8a85U, + 0xe0707090U, 0x7c3e3e42U, 0x71b5b5c4U, 0xcc6666aaU, + 0x904848d8U, 0x06030305U, 0xf7f6f601U, 0x1c0e0e12U, + 0xc26161a3U, 0x6a35355fU, 0xae5757f9U, 0x69b9b9d0U, + 0x17868691U, 0x99c1c158U, 0x3a1d1d27U, 0x279e9eb9U, + 
0xd9e1e138U, 0xebf8f813U, 0x2b9898b3U, 0x22111133U, + 0xd26969bbU, 0xa9d9d970U, 0x078e8e89U, 0x339494a7U, + 0x2d9b9bb6U, 0x3c1e1e22U, 0x15878792U, 0xc9e9e920U, + 0x87cece49U, 0xaa5555ffU, 0x50282878U, 0xa5dfdf7aU, + 0x038c8c8fU, 0x59a1a1f8U, 0x09898980U, 0x1a0d0d17U, + 0x65bfbfdaU, 0xd7e6e631U, 0x844242c6U, 0xd06868b8U, + 0x824141c3U, 0x299999b0U, 0x5a2d2d77U, 0x1e0f0f11U, + 0x7bb0b0cbU, 0xa85454fcU, 0x6dbbbbd6U, 0x2c16163aU +}; + + +static const uint32_t Te1[256] = +{ + 0xa5c66363U, 0x84f87c7cU, 0x99ee7777U, 0x8df67b7bU, + 0x0dfff2f2U, 0xbdd66b6bU, 0xb1de6f6fU, 0x5491c5c5U, + 0x50603030U, 0x03020101U, 0xa9ce6767U, 0x7d562b2bU, + 0x19e7fefeU, 0x62b5d7d7U, 0xe64dababU, 0x9aec7676U, + 0x458fcacaU, 0x9d1f8282U, 0x4089c9c9U, 0x87fa7d7dU, + 0x15effafaU, 0xebb25959U, 0xc98e4747U, 0x0bfbf0f0U, + 0xec41adadU, 0x67b3d4d4U, 0xfd5fa2a2U, 0xea45afafU, + 0xbf239c9cU, 0xf753a4a4U, 0x96e47272U, 0x5b9bc0c0U, + 0xc275b7b7U, 0x1ce1fdfdU, 0xae3d9393U, 0x6a4c2626U, + 0x5a6c3636U, 0x417e3f3fU, 0x02f5f7f7U, 0x4f83ccccU, + 0x5c683434U, 0xf451a5a5U, 0x34d1e5e5U, 0x08f9f1f1U, + 0x93e27171U, 0x73abd8d8U, 0x53623131U, 0x3f2a1515U, + 0x0c080404U, 0x5295c7c7U, 0x65462323U, 0x5e9dc3c3U, + 0x28301818U, 0xa1379696U, 0x0f0a0505U, 0xb52f9a9aU, + 0x090e0707U, 0x36241212U, 0x9b1b8080U, 0x3ddfe2e2U, + 0x26cdebebU, 0x694e2727U, 0xcd7fb2b2U, 0x9fea7575U, + 0x1b120909U, 0x9e1d8383U, 0x74582c2cU, 0x2e341a1aU, + 0x2d361b1bU, 0xb2dc6e6eU, 0xeeb45a5aU, 0xfb5ba0a0U, + 0xf6a45252U, 0x4d763b3bU, 0x61b7d6d6U, 0xce7db3b3U, + 0x7b522929U, 0x3edde3e3U, 0x715e2f2fU, 0x97138484U, + 0xf5a65353U, 0x68b9d1d1U, 0x00000000U, 0x2cc1ededU, + 0x60402020U, 0x1fe3fcfcU, 0xc879b1b1U, 0xedb65b5bU, + 0xbed46a6aU, 0x468dcbcbU, 0xd967bebeU, 0x4b723939U, + 0xde944a4aU, 0xd4984c4cU, 0xe8b05858U, 0x4a85cfcfU, + 0x6bbbd0d0U, 0x2ac5efefU, 0xe54faaaaU, 0x16edfbfbU, + 0xc5864343U, 0xd79a4d4dU, 0x55663333U, 0x94118585U, + 0xcf8a4545U, 0x10e9f9f9U, 0x06040202U, 0x81fe7f7fU, + 0xf0a05050U, 0x44783c3cU, 0xba259f9fU, 0xe34ba8a8U, + 0xf3a25151U, 0xfe5da3a3U, 0xc0804040U, 0x8a058f8fU, + 0xad3f9292U, 0xbc219d9dU, 0x48703838U, 0x04f1f5f5U, + 0xdf63bcbcU, 0xc177b6b6U, 0x75afdadaU, 0x63422121U, + 0x30201010U, 0x1ae5ffffU, 0x0efdf3f3U, 0x6dbfd2d2U, + 0x4c81cdcdU, 0x14180c0cU, 0x35261313U, 0x2fc3ececU, + 0xe1be5f5fU, 0xa2359797U, 0xcc884444U, 0x392e1717U, + 0x5793c4c4U, 0xf255a7a7U, 0x82fc7e7eU, 0x477a3d3dU, + 0xacc86464U, 0xe7ba5d5dU, 0x2b321919U, 0x95e67373U, + 0xa0c06060U, 0x98198181U, 0xd19e4f4fU, 0x7fa3dcdcU, + 0x66442222U, 0x7e542a2aU, 0xab3b9090U, 0x830b8888U, + 0xca8c4646U, 0x29c7eeeeU, 0xd36bb8b8U, 0x3c281414U, + 0x79a7dedeU, 0xe2bc5e5eU, 0x1d160b0bU, 0x76addbdbU, + 0x3bdbe0e0U, 0x56643232U, 0x4e743a3aU, 0x1e140a0aU, + 0xdb924949U, 0x0a0c0606U, 0x6c482424U, 0xe4b85c5cU, + 0x5d9fc2c2U, 0x6ebdd3d3U, 0xef43acacU, 0xa6c46262U, + 0xa8399191U, 0xa4319595U, 0x37d3e4e4U, 0x8bf27979U, + 0x32d5e7e7U, 0x438bc8c8U, 0x596e3737U, 0xb7da6d6dU, + 0x8c018d8dU, 0x64b1d5d5U, 0xd29c4e4eU, 0xe049a9a9U, + 0xb4d86c6cU, 0xfaac5656U, 0x07f3f4f4U, 0x25cfeaeaU, + 0xafca6565U, 0x8ef47a7aU, 0xe947aeaeU, 0x18100808U, + 0xd56fbabaU, 0x88f07878U, 0x6f4a2525U, 0x725c2e2eU, + 0x24381c1cU, 0xf157a6a6U, 0xc773b4b4U, 0x5197c6c6U, + 0x23cbe8e8U, 0x7ca1ddddU, 0x9ce87474U, 0x213e1f1fU, + 0xdd964b4bU, 0xdc61bdbdU, 0x860d8b8bU, 0x850f8a8aU, + 0x90e07070U, 0x427c3e3eU, 0xc471b5b5U, 0xaacc6666U, + 0xd8904848U, 0x05060303U, 0x01f7f6f6U, 0x121c0e0eU, + 0xa3c26161U, 0x5f6a3535U, 0xf9ae5757U, 0xd069b9b9U, + 0x91178686U, 0x5899c1c1U, 0x273a1d1dU, 0xb9279e9eU, + 0x38d9e1e1U, 0x13ebf8f8U, 0xb32b9898U, 0x33221111U, + 
0xbbd26969U, 0x70a9d9d9U, 0x89078e8eU, 0xa7339494U, + 0xb62d9b9bU, 0x223c1e1eU, 0x92158787U, 0x20c9e9e9U, + 0x4987ceceU, 0xffaa5555U, 0x78502828U, 0x7aa5dfdfU, + 0x8f038c8cU, 0xf859a1a1U, 0x80098989U, 0x171a0d0dU, + 0xda65bfbfU, 0x31d7e6e6U, 0xc6844242U, 0xb8d06868U, + 0xc3824141U, 0xb0299999U, 0x775a2d2dU, 0x111e0f0fU, + 0xcb7bb0b0U, 0xfca85454U, 0xd66dbbbbU, 0x3a2c1616U +}; + + +static const uint32_t Te2[256] = +{ + 0x63a5c663U, 0x7c84f87cU, 0x7799ee77U, 0x7b8df67bU, + 0xf20dfff2U, 0x6bbdd66bU, 0x6fb1de6fU, 0xc55491c5U, + 0x30506030U, 0x01030201U, 0x67a9ce67U, 0x2b7d562bU, + 0xfe19e7feU, 0xd762b5d7U, 0xabe64dabU, 0x769aec76U, + 0xca458fcaU, 0x829d1f82U, 0xc94089c9U, 0x7d87fa7dU, + 0xfa15effaU, 0x59ebb259U, 0x47c98e47U, 0xf00bfbf0U, + 0xadec41adU, 0xd467b3d4U, 0xa2fd5fa2U, 0xafea45afU, + 0x9cbf239cU, 0xa4f753a4U, 0x7296e472U, 0xc05b9bc0U, + 0xb7c275b7U, 0xfd1ce1fdU, 0x93ae3d93U, 0x266a4c26U, + 0x365a6c36U, 0x3f417e3fU, 0xf702f5f7U, 0xcc4f83ccU, + 0x345c6834U, 0xa5f451a5U, 0xe534d1e5U, 0xf108f9f1U, + 0x7193e271U, 0xd873abd8U, 0x31536231U, 0x153f2a15U, + 0x040c0804U, 0xc75295c7U, 0x23654623U, 0xc35e9dc3U, + 0x18283018U, 0x96a13796U, 0x050f0a05U, 0x9ab52f9aU, + 0x07090e07U, 0x12362412U, 0x809b1b80U, 0xe23ddfe2U, + 0xeb26cdebU, 0x27694e27U, 0xb2cd7fb2U, 0x759fea75U, + 0x091b1209U, 0x839e1d83U, 0x2c74582cU, 0x1a2e341aU, + 0x1b2d361bU, 0x6eb2dc6eU, 0x5aeeb45aU, 0xa0fb5ba0U, + 0x52f6a452U, 0x3b4d763bU, 0xd661b7d6U, 0xb3ce7db3U, + 0x297b5229U, 0xe33edde3U, 0x2f715e2fU, 0x84971384U, + 0x53f5a653U, 0xd168b9d1U, 0x00000000U, 0xed2cc1edU, + 0x20604020U, 0xfc1fe3fcU, 0xb1c879b1U, 0x5bedb65bU, + 0x6abed46aU, 0xcb468dcbU, 0xbed967beU, 0x394b7239U, + 0x4ade944aU, 0x4cd4984cU, 0x58e8b058U, 0xcf4a85cfU, + 0xd06bbbd0U, 0xef2ac5efU, 0xaae54faaU, 0xfb16edfbU, + 0x43c58643U, 0x4dd79a4dU, 0x33556633U, 0x85941185U, + 0x45cf8a45U, 0xf910e9f9U, 0x02060402U, 0x7f81fe7fU, + 0x50f0a050U, 0x3c44783cU, 0x9fba259fU, 0xa8e34ba8U, + 0x51f3a251U, 0xa3fe5da3U, 0x40c08040U, 0x8f8a058fU, + 0x92ad3f92U, 0x9dbc219dU, 0x38487038U, 0xf504f1f5U, + 0xbcdf63bcU, 0xb6c177b6U, 0xda75afdaU, 0x21634221U, + 0x10302010U, 0xff1ae5ffU, 0xf30efdf3U, 0xd26dbfd2U, + 0xcd4c81cdU, 0x0c14180cU, 0x13352613U, 0xec2fc3ecU, + 0x5fe1be5fU, 0x97a23597U, 0x44cc8844U, 0x17392e17U, + 0xc45793c4U, 0xa7f255a7U, 0x7e82fc7eU, 0x3d477a3dU, + 0x64acc864U, 0x5de7ba5dU, 0x192b3219U, 0x7395e673U, + 0x60a0c060U, 0x81981981U, 0x4fd19e4fU, 0xdc7fa3dcU, + 0x22664422U, 0x2a7e542aU, 0x90ab3b90U, 0x88830b88U, + 0x46ca8c46U, 0xee29c7eeU, 0xb8d36bb8U, 0x143c2814U, + 0xde79a7deU, 0x5ee2bc5eU, 0x0b1d160bU, 0xdb76addbU, + 0xe03bdbe0U, 0x32566432U, 0x3a4e743aU, 0x0a1e140aU, + 0x49db9249U, 0x060a0c06U, 0x246c4824U, 0x5ce4b85cU, + 0xc25d9fc2U, 0xd36ebdd3U, 0xacef43acU, 0x62a6c462U, + 0x91a83991U, 0x95a43195U, 0xe437d3e4U, 0x798bf279U, + 0xe732d5e7U, 0xc8438bc8U, 0x37596e37U, 0x6db7da6dU, + 0x8d8c018dU, 0xd564b1d5U, 0x4ed29c4eU, 0xa9e049a9U, + 0x6cb4d86cU, 0x56faac56U, 0xf407f3f4U, 0xea25cfeaU, + 0x65afca65U, 0x7a8ef47aU, 0xaee947aeU, 0x08181008U, + 0xbad56fbaU, 0x7888f078U, 0x256f4a25U, 0x2e725c2eU, + 0x1c24381cU, 0xa6f157a6U, 0xb4c773b4U, 0xc65197c6U, + 0xe823cbe8U, 0xdd7ca1ddU, 0x749ce874U, 0x1f213e1fU, + 0x4bdd964bU, 0xbddc61bdU, 0x8b860d8bU, 0x8a850f8aU, + 0x7090e070U, 0x3e427c3eU, 0xb5c471b5U, 0x66aacc66U, + 0x48d89048U, 0x03050603U, 0xf601f7f6U, 0x0e121c0eU, + 0x61a3c261U, 0x355f6a35U, 0x57f9ae57U, 0xb9d069b9U, + 0x86911786U, 0xc15899c1U, 0x1d273a1dU, 0x9eb9279eU, + 0xe138d9e1U, 0xf813ebf8U, 0x98b32b98U, 0x11332211U, + 0x69bbd269U, 0xd970a9d9U, 0x8e89078eU, 0x94a73394U, + 
0x9bb62d9bU, 0x1e223c1eU, 0x87921587U, 0xe920c9e9U, + 0xce4987ceU, 0x55ffaa55U, 0x28785028U, 0xdf7aa5dfU, + 0x8c8f038cU, 0xa1f859a1U, 0x89800989U, 0x0d171a0dU, + 0xbfda65bfU, 0xe631d7e6U, 0x42c68442U, 0x68b8d068U, + 0x41c38241U, 0x99b02999U, 0x2d775a2dU, 0x0f111e0fU, + 0xb0cb7bb0U, 0x54fca854U, 0xbbd66dbbU, 0x163a2c16U +}; + + +static const uint32_t Te3[256] = +{ + 0x6363a5c6U, 0x7c7c84f8U, 0x777799eeU, 0x7b7b8df6U, + 0xf2f20dffU, 0x6b6bbdd6U, 0x6f6fb1deU, 0xc5c55491U, + 0x30305060U, 0x01010302U, 0x6767a9ceU, 0x2b2b7d56U, + 0xfefe19e7U, 0xd7d762b5U, 0xababe64dU, 0x76769aecU, + 0xcaca458fU, 0x82829d1fU, 0xc9c94089U, 0x7d7d87faU, + 0xfafa15efU, 0x5959ebb2U, 0x4747c98eU, 0xf0f00bfbU, + 0xadadec41U, 0xd4d467b3U, 0xa2a2fd5fU, 0xafafea45U, + 0x9c9cbf23U, 0xa4a4f753U, 0x727296e4U, 0xc0c05b9bU, + 0xb7b7c275U, 0xfdfd1ce1U, 0x9393ae3dU, 0x26266a4cU, + 0x36365a6cU, 0x3f3f417eU, 0xf7f702f5U, 0xcccc4f83U, + 0x34345c68U, 0xa5a5f451U, 0xe5e534d1U, 0xf1f108f9U, + 0x717193e2U, 0xd8d873abU, 0x31315362U, 0x15153f2aU, + 0x04040c08U, 0xc7c75295U, 0x23236546U, 0xc3c35e9dU, + 0x18182830U, 0x9696a137U, 0x05050f0aU, 0x9a9ab52fU, + 0x0707090eU, 0x12123624U, 0x80809b1bU, 0xe2e23ddfU, + 0xebeb26cdU, 0x2727694eU, 0xb2b2cd7fU, 0x75759feaU, + 0x09091b12U, 0x83839e1dU, 0x2c2c7458U, 0x1a1a2e34U, + 0x1b1b2d36U, 0x6e6eb2dcU, 0x5a5aeeb4U, 0xa0a0fb5bU, + 0x5252f6a4U, 0x3b3b4d76U, 0xd6d661b7U, 0xb3b3ce7dU, + 0x29297b52U, 0xe3e33eddU, 0x2f2f715eU, 0x84849713U, + 0x5353f5a6U, 0xd1d168b9U, 0x00000000U, 0xeded2cc1U, + 0x20206040U, 0xfcfc1fe3U, 0xb1b1c879U, 0x5b5bedb6U, + 0x6a6abed4U, 0xcbcb468dU, 0xbebed967U, 0x39394b72U, + 0x4a4ade94U, 0x4c4cd498U, 0x5858e8b0U, 0xcfcf4a85U, + 0xd0d06bbbU, 0xefef2ac5U, 0xaaaae54fU, 0xfbfb16edU, + 0x4343c586U, 0x4d4dd79aU, 0x33335566U, 0x85859411U, + 0x4545cf8aU, 0xf9f910e9U, 0x02020604U, 0x7f7f81feU, + 0x5050f0a0U, 0x3c3c4478U, 0x9f9fba25U, 0xa8a8e34bU, + 0x5151f3a2U, 0xa3a3fe5dU, 0x4040c080U, 0x8f8f8a05U, + 0x9292ad3fU, 0x9d9dbc21U, 0x38384870U, 0xf5f504f1U, + 0xbcbcdf63U, 0xb6b6c177U, 0xdada75afU, 0x21216342U, + 0x10103020U, 0xffff1ae5U, 0xf3f30efdU, 0xd2d26dbfU, + 0xcdcd4c81U, 0x0c0c1418U, 0x13133526U, 0xecec2fc3U, + 0x5f5fe1beU, 0x9797a235U, 0x4444cc88U, 0x1717392eU, + 0xc4c45793U, 0xa7a7f255U, 0x7e7e82fcU, 0x3d3d477aU, + 0x6464acc8U, 0x5d5de7baU, 0x19192b32U, 0x737395e6U, + 0x6060a0c0U, 0x81819819U, 0x4f4fd19eU, 0xdcdc7fa3U, + 0x22226644U, 0x2a2a7e54U, 0x9090ab3bU, 0x8888830bU, + 0x4646ca8cU, 0xeeee29c7U, 0xb8b8d36bU, 0x14143c28U, + 0xdede79a7U, 0x5e5ee2bcU, 0x0b0b1d16U, 0xdbdb76adU, + 0xe0e03bdbU, 0x32325664U, 0x3a3a4e74U, 0x0a0a1e14U, + 0x4949db92U, 0x06060a0cU, 0x24246c48U, 0x5c5ce4b8U, + 0xc2c25d9fU, 0xd3d36ebdU, 0xacacef43U, 0x6262a6c4U, + 0x9191a839U, 0x9595a431U, 0xe4e437d3U, 0x79798bf2U, + 0xe7e732d5U, 0xc8c8438bU, 0x3737596eU, 0x6d6db7daU, + 0x8d8d8c01U, 0xd5d564b1U, 0x4e4ed29cU, 0xa9a9e049U, + 0x6c6cb4d8U, 0x5656faacU, 0xf4f407f3U, 0xeaea25cfU, + 0x6565afcaU, 0x7a7a8ef4U, 0xaeaee947U, 0x08081810U, + 0xbabad56fU, 0x787888f0U, 0x25256f4aU, 0x2e2e725cU, + 0x1c1c2438U, 0xa6a6f157U, 0xb4b4c773U, 0xc6c65197U, + 0xe8e823cbU, 0xdddd7ca1U, 0x74749ce8U, 0x1f1f213eU, + 0x4b4bdd96U, 0xbdbddc61U, 0x8b8b860dU, 0x8a8a850fU, + 0x707090e0U, 0x3e3e427cU, 0xb5b5c471U, 0x6666aaccU, + 0x4848d890U, 0x03030506U, 0xf6f601f7U, 0x0e0e121cU, + 0x6161a3c2U, 0x35355f6aU, 0x5757f9aeU, 0xb9b9d069U, + 0x86869117U, 0xc1c15899U, 0x1d1d273aU, 0x9e9eb927U, + 0xe1e138d9U, 0xf8f813ebU, 0x9898b32bU, 0x11113322U, + 0x6969bbd2U, 0xd9d970a9U, 0x8e8e8907U, 0x9494a733U, + 0x9b9bb62dU, 0x1e1e223cU, 0x87879215U, 0xe9e920c9U, + 
0xcece4987U, 0x5555ffaaU, 0x28287850U, 0xdfdf7aa5U, + 0x8c8c8f03U, 0xa1a1f859U, 0x89898009U, 0x0d0d171aU, + 0xbfbfda65U, 0xe6e631d7U, 0x4242c684U, 0x6868b8d0U, + 0x4141c382U, 0x9999b029U, 0x2d2d775aU, 0x0f0f111eU, + 0xb0b0cb7bU, 0x5454fca8U, 0xbbbbd66dU, 0x16163a2cU +}; + +static const uint32_t Te4[256] = +{ + 0x63636363U, 0x7c7c7c7cU, 0x77777777U, 0x7b7b7b7bU, + 0xf2f2f2f2U, 0x6b6b6b6bU, 0x6f6f6f6fU, 0xc5c5c5c5U, + 0x30303030U, 0x01010101U, 0x67676767U, 0x2b2b2b2bU, + 0xfefefefeU, 0xd7d7d7d7U, 0xababababU, 0x76767676U, + 0xcacacacaU, 0x82828282U, 0xc9c9c9c9U, 0x7d7d7d7dU, + 0xfafafafaU, 0x59595959U, 0x47474747U, 0xf0f0f0f0U, + 0xadadadadU, 0xd4d4d4d4U, 0xa2a2a2a2U, 0xafafafafU, + 0x9c9c9c9cU, 0xa4a4a4a4U, 0x72727272U, 0xc0c0c0c0U, + 0xb7b7b7b7U, 0xfdfdfdfdU, 0x93939393U, 0x26262626U, + 0x36363636U, 0x3f3f3f3fU, 0xf7f7f7f7U, 0xccccccccU, + 0x34343434U, 0xa5a5a5a5U, 0xe5e5e5e5U, 0xf1f1f1f1U, + 0x71717171U, 0xd8d8d8d8U, 0x31313131U, 0x15151515U, + 0x04040404U, 0xc7c7c7c7U, 0x23232323U, 0xc3c3c3c3U, + 0x18181818U, 0x96969696U, 0x05050505U, 0x9a9a9a9aU, + 0x07070707U, 0x12121212U, 0x80808080U, 0xe2e2e2e2U, + 0xebebebebU, 0x27272727U, 0xb2b2b2b2U, 0x75757575U, + 0x09090909U, 0x83838383U, 0x2c2c2c2cU, 0x1a1a1a1aU, + 0x1b1b1b1bU, 0x6e6e6e6eU, 0x5a5a5a5aU, 0xa0a0a0a0U, + 0x52525252U, 0x3b3b3b3bU, 0xd6d6d6d6U, 0xb3b3b3b3U, + 0x29292929U, 0xe3e3e3e3U, 0x2f2f2f2fU, 0x84848484U, + 0x53535353U, 0xd1d1d1d1U, 0x00000000U, 0xededededU, + 0x20202020U, 0xfcfcfcfcU, 0xb1b1b1b1U, 0x5b5b5b5bU, + 0x6a6a6a6aU, 0xcbcbcbcbU, 0xbebebebeU, 0x39393939U, + 0x4a4a4a4aU, 0x4c4c4c4cU, 0x58585858U, 0xcfcfcfcfU, + 0xd0d0d0d0U, 0xefefefefU, 0xaaaaaaaaU, 0xfbfbfbfbU, + 0x43434343U, 0x4d4d4d4dU, 0x33333333U, 0x85858585U, + 0x45454545U, 0xf9f9f9f9U, 0x02020202U, 0x7f7f7f7fU, + 0x50505050U, 0x3c3c3c3cU, 0x9f9f9f9fU, 0xa8a8a8a8U, + 0x51515151U, 0xa3a3a3a3U, 0x40404040U, 0x8f8f8f8fU, + 0x92929292U, 0x9d9d9d9dU, 0x38383838U, 0xf5f5f5f5U, + 0xbcbcbcbcU, 0xb6b6b6b6U, 0xdadadadaU, 0x21212121U, + 0x10101010U, 0xffffffffU, 0xf3f3f3f3U, 0xd2d2d2d2U, + 0xcdcdcdcdU, 0x0c0c0c0cU, 0x13131313U, 0xececececU, + 0x5f5f5f5fU, 0x97979797U, 0x44444444U, 0x17171717U, + 0xc4c4c4c4U, 0xa7a7a7a7U, 0x7e7e7e7eU, 0x3d3d3d3dU, + 0x64646464U, 0x5d5d5d5dU, 0x19191919U, 0x73737373U, + 0x60606060U, 0x81818181U, 0x4f4f4f4fU, 0xdcdcdcdcU, + 0x22222222U, 0x2a2a2a2aU, 0x90909090U, 0x88888888U, + 0x46464646U, 0xeeeeeeeeU, 0xb8b8b8b8U, 0x14141414U, + 0xdedededeU, 0x5e5e5e5eU, 0x0b0b0b0bU, 0xdbdbdbdbU, + 0xe0e0e0e0U, 0x32323232U, 0x3a3a3a3aU, 0x0a0a0a0aU, + 0x49494949U, 0x06060606U, 0x24242424U, 0x5c5c5c5cU, + 0xc2c2c2c2U, 0xd3d3d3d3U, 0xacacacacU, 0x62626262U, + 0x91919191U, 0x95959595U, 0xe4e4e4e4U, 0x79797979U, + 0xe7e7e7e7U, 0xc8c8c8c8U, 0x37373737U, 0x6d6d6d6dU, + 0x8d8d8d8dU, 0xd5d5d5d5U, 0x4e4e4e4eU, 0xa9a9a9a9U, + 0x6c6c6c6cU, 0x56565656U, 0xf4f4f4f4U, 0xeaeaeaeaU, + 0x65656565U, 0x7a7a7a7aU, 0xaeaeaeaeU, 0x08080808U, + 0xbabababaU, 0x78787878U, 0x25252525U, 0x2e2e2e2eU, + 0x1c1c1c1cU, 0xa6a6a6a6U, 0xb4b4b4b4U, 0xc6c6c6c6U, + 0xe8e8e8e8U, 0xddddddddU, 0x74747474U, 0x1f1f1f1fU, + 0x4b4b4b4bU, 0xbdbdbdbdU, 0x8b8b8b8bU, 0x8a8a8a8aU, + 0x70707070U, 0x3e3e3e3eU, 0xb5b5b5b5U, 0x66666666U, + 0x48484848U, 0x03030303U, 0xf6f6f6f6U, 0x0e0e0e0eU, + 0x61616161U, 0x35353535U, 0x57575757U, 0xb9b9b9b9U, + 0x86868686U, 0xc1c1c1c1U, 0x1d1d1d1dU, 0x9e9e9e9eU, + 0xe1e1e1e1U, 0xf8f8f8f8U, 0x98989898U, 0x11111111U, + 0x69696969U, 0xd9d9d9d9U, 0x8e8e8e8eU, 0x94949494U, + 0x9b9b9b9bU, 0x1e1e1e1eU, 0x87878787U, 0xe9e9e9e9U, + 0xcecececeU, 0x55555555U, 0x28282828U, 0xdfdfdfdfU, + 
0x8c8c8c8cU, 0xa1a1a1a1U, 0x89898989U, 0x0d0d0d0dU, + 0xbfbfbfbfU, 0xe6e6e6e6U, 0x42424242U, 0x68686868U, + 0x41414141U, 0x99999999U, 0x2d2d2d2dU, 0x0f0f0f0fU, + 0xb0b0b0b0U, 0x54545454U, 0xbbbbbbbbU, 0x16161616U +}; + +/* Decrypt Sbox constants (for the substitute bytes operation) */ + +static const uint32_t Td0[256] = +{ + 0x51f4a750U, 0x7e416553U, 0x1a17a4c3U, 0x3a275e96U, + 0x3bab6bcbU, 0x1f9d45f1U, 0xacfa58abU, 0x4be30393U, + 0x2030fa55U, 0xad766df6U, 0x88cc7691U, 0xf5024c25U, + 0x4fe5d7fcU, 0xc52acbd7U, 0x26354480U, 0xb562a38fU, + 0xdeb15a49U, 0x25ba1b67U, 0x45ea0e98U, 0x5dfec0e1U, + 0xc32f7502U, 0x814cf012U, 0x8d4697a3U, 0x6bd3f9c6U, + 0x038f5fe7U, 0x15929c95U, 0xbf6d7aebU, 0x955259daU, + 0xd4be832dU, 0x587421d3U, 0x49e06929U, 0x8ec9c844U, + 0x75c2896aU, 0xf48e7978U, 0x99583e6bU, 0x27b971ddU, + 0xbee14fb6U, 0xf088ad17U, 0xc920ac66U, 0x7dce3ab4U, + 0x63df4a18U, 0xe51a3182U, 0x97513360U, 0x62537f45U, + 0xb16477e0U, 0xbb6bae84U, 0xfe81a01cU, 0xf9082b94U, + 0x70486858U, 0x8f45fd19U, 0x94de6c87U, 0x527bf8b7U, + 0xab73d323U, 0x724b02e2U, 0xe31f8f57U, 0x6655ab2aU, + 0xb2eb2807U, 0x2fb5c203U, 0x86c57b9aU, 0xd33708a5U, + 0x302887f2U, 0x23bfa5b2U, 0x02036abaU, 0xed16825cU, + 0x8acf1c2bU, 0xa779b492U, 0xf307f2f0U, 0x4e69e2a1U, + 0x65daf4cdU, 0x0605bed5U, 0xd134621fU, 0xc4a6fe8aU, + 0x342e539dU, 0xa2f355a0U, 0x058ae132U, 0xa4f6eb75U, + 0x0b83ec39U, 0x4060efaaU, 0x5e719f06U, 0xbd6e1051U, + 0x3e218af9U, 0x96dd063dU, 0xdd3e05aeU, 0x4de6bd46U, + 0x91548db5U, 0x71c45d05U, 0x0406d46fU, 0x605015ffU, + 0x1998fb24U, 0xd6bde997U, 0x894043ccU, 0x67d99e77U, + 0xb0e842bdU, 0x07898b88U, 0xe7195b38U, 0x79c8eedbU, + 0xa17c0a47U, 0x7c420fe9U, 0xf8841ec9U, 0x00000000U, + 0x09808683U, 0x322bed48U, 0x1e1170acU, 0x6c5a724eU, + 0xfd0efffbU, 0x0f853856U, 0x3daed51eU, 0x362d3927U, + 0x0a0fd964U, 0x685ca621U, 0x9b5b54d1U, 0x24362e3aU, + 0x0c0a67b1U, 0x9357e70fU, 0xb4ee96d2U, 0x1b9b919eU, + 0x80c0c54fU, 0x61dc20a2U, 0x5a774b69U, 0x1c121a16U, + 0xe293ba0aU, 0xc0a02ae5U, 0x3c22e043U, 0x121b171dU, + 0x0e090d0bU, 0xf28bc7adU, 0x2db6a8b9U, 0x141ea9c8U, + 0x57f11985U, 0xaf75074cU, 0xee99ddbbU, 0xa37f60fdU, + 0xf701269fU, 0x5c72f5bcU, 0x44663bc5U, 0x5bfb7e34U, + 0x8b432976U, 0xcb23c6dcU, 0xb6edfc68U, 0xb8e4f163U, + 0xd731dccaU, 0x42638510U, 0x13972240U, 0x84c61120U, + 0x854a247dU, 0xd2bb3df8U, 0xaef93211U, 0xc729a16dU, + 0x1d9e2f4bU, 0xdcb230f3U, 0x0d8652ecU, 0x77c1e3d0U, + 0x2bb3166cU, 0xa970b999U, 0x119448faU, 0x47e96422U, + 0xa8fc8cc4U, 0xa0f03f1aU, 0x567d2cd8U, 0x223390efU, + 0x87494ec7U, 0xd938d1c1U, 0x8ccaa2feU, 0x98d40b36U, + 0xa6f581cfU, 0xa57ade28U, 0xdab78e26U, 0x3fadbfa4U, + 0x2c3a9de4U, 0x5078920dU, 0x6a5fcc9bU, 0x547e4662U, + 0xf68d13c2U, 0x90d8b8e8U, 0x2e39f75eU, 0x82c3aff5U, + 0x9f5d80beU, 0x69d0937cU, 0x6fd52da9U, 0xcf2512b3U, + 0xc8ac993bU, 0x10187da7U, 0xe89c636eU, 0xdb3bbb7bU, + 0xcd267809U, 0x6e5918f4U, 0xec9ab701U, 0x834f9aa8U, + 0xe6956e65U, 0xaaffe67eU, 0x21bccf08U, 0xef15e8e6U, + 0xbae79bd9U, 0x4a6f36ceU, 0xea9f09d4U, 0x29b07cd6U, + 0x31a4b2afU, 0x2a3f2331U, 0xc6a59430U, 0x35a266c0U, + 0x744ebc37U, 0xfc82caa6U, 0xe090d0b0U, 0x33a7d815U, + 0xf104984aU, 0x41ecdaf7U, 0x7fcd500eU, 0x1791f62fU, + 0x764dd68dU, 0x43efb04dU, 0xccaa4d54U, 0xe49604dfU, + 0x9ed1b5e3U, 0x4c6a881bU, 0xc12c1fb8U, 0x4665517fU, + 0x9d5eea04U, 0x018c355dU, 0xfa877473U, 0xfb0b412eU, + 0xb3671d5aU, 0x92dbd252U, 0xe9105633U, 0x6dd64713U, + 0x9ad7618cU, 0x37a10c7aU, 0x59f8148eU, 0xeb133c89U, + 0xcea927eeU, 0xb761c935U, 0xe11ce5edU, 0x7a47b13cU, + 0x9cd2df59U, 0x55f2733fU, 0x1814ce79U, 0x73c737bfU, + 0x53f7cdeaU, 0x5ffdaa5bU, 0xdf3d6f14U, 
0x7844db86U, + 0xcaaff381U, 0xb968c43eU, 0x3824342cU, 0xc2a3405fU, + 0x161dc372U, 0xbce2250cU, 0x283c498bU, 0xff0d9541U, + 0x39a80171U, 0x080cb3deU, 0xd8b4e49cU, 0x6456c190U, + 0x7bcb8461U, 0xd532b670U, 0x486c5c74U, 0xd0b85742U +}; + +static const uint32_t Td1[256] = +{ + 0x5051f4a7U, 0x537e4165U, 0xc31a17a4U, 0x963a275eU, + 0xcb3bab6bU, 0xf11f9d45U, 0xabacfa58U, 0x934be303U, + 0x552030faU, 0xf6ad766dU, 0x9188cc76U, 0x25f5024cU, + 0xfc4fe5d7U, 0xd7c52acbU, 0x80263544U, 0x8fb562a3U, + 0x49deb15aU, 0x6725ba1bU, 0x9845ea0eU, 0xe15dfec0U, + 0x02c32f75U, 0x12814cf0U, 0xa38d4697U, 0xc66bd3f9U, + 0xe7038f5fU, 0x9515929cU, 0xebbf6d7aU, 0xda955259U, + 0x2dd4be83U, 0xd3587421U, 0x2949e069U, 0x448ec9c8U, + 0x6a75c289U, 0x78f48e79U, 0x6b99583eU, 0xdd27b971U, + 0xb6bee14fU, 0x17f088adU, 0x66c920acU, 0xb47dce3aU, + 0x1863df4aU, 0x82e51a31U, 0x60975133U, 0x4562537fU, + 0xe0b16477U, 0x84bb6baeU, 0x1cfe81a0U, 0x94f9082bU, + 0x58704868U, 0x198f45fdU, 0x8794de6cU, 0xb7527bf8U, + 0x23ab73d3U, 0xe2724b02U, 0x57e31f8fU, 0x2a6655abU, + 0x07b2eb28U, 0x032fb5c2U, 0x9a86c57bU, 0xa5d33708U, + 0xf2302887U, 0xb223bfa5U, 0xba02036aU, 0x5ced1682U, + 0x2b8acf1cU, 0x92a779b4U, 0xf0f307f2U, 0xa14e69e2U, + 0xcd65daf4U, 0xd50605beU, 0x1fd13462U, 0x8ac4a6feU, + 0x9d342e53U, 0xa0a2f355U, 0x32058ae1U, 0x75a4f6ebU, + 0x390b83ecU, 0xaa4060efU, 0x065e719fU, 0x51bd6e10U, + 0xf93e218aU, 0x3d96dd06U, 0xaedd3e05U, 0x464de6bdU, + 0xb591548dU, 0x0571c45dU, 0x6f0406d4U, 0xff605015U, + 0x241998fbU, 0x97d6bde9U, 0xcc894043U, 0x7767d99eU, + 0xbdb0e842U, 0x8807898bU, 0x38e7195bU, 0xdb79c8eeU, + 0x47a17c0aU, 0xe97c420fU, 0xc9f8841eU, 0x00000000U, + 0x83098086U, 0x48322bedU, 0xac1e1170U, 0x4e6c5a72U, + 0xfbfd0effU, 0x560f8538U, 0x1e3daed5U, 0x27362d39U, + 0x640a0fd9U, 0x21685ca6U, 0xd19b5b54U, 0x3a24362eU, + 0xb10c0a67U, 0x0f9357e7U, 0xd2b4ee96U, 0x9e1b9b91U, + 0x4f80c0c5U, 0xa261dc20U, 0x695a774bU, 0x161c121aU, + 0x0ae293baU, 0xe5c0a02aU, 0x433c22e0U, 0x1d121b17U, + 0x0b0e090dU, 0xadf28bc7U, 0xb92db6a8U, 0xc8141ea9U, + 0x8557f119U, 0x4caf7507U, 0xbbee99ddU, 0xfda37f60U, + 0x9ff70126U, 0xbc5c72f5U, 0xc544663bU, 0x345bfb7eU, + 0x768b4329U, 0xdccb23c6U, 0x68b6edfcU, 0x63b8e4f1U, + 0xcad731dcU, 0x10426385U, 0x40139722U, 0x2084c611U, + 0x7d854a24U, 0xf8d2bb3dU, 0x11aef932U, 0x6dc729a1U, + 0x4b1d9e2fU, 0xf3dcb230U, 0xec0d8652U, 0xd077c1e3U, + 0x6c2bb316U, 0x99a970b9U, 0xfa119448U, 0x2247e964U, + 0xc4a8fc8cU, 0x1aa0f03fU, 0xd8567d2cU, 0xef223390U, + 0xc787494eU, 0xc1d938d1U, 0xfe8ccaa2U, 0x3698d40bU, + 0xcfa6f581U, 0x28a57adeU, 0x26dab78eU, 0xa43fadbfU, + 0xe42c3a9dU, 0x0d507892U, 0x9b6a5fccU, 0x62547e46U, + 0xc2f68d13U, 0xe890d8b8U, 0x5e2e39f7U, 0xf582c3afU, + 0xbe9f5d80U, 0x7c69d093U, 0xa96fd52dU, 0xb3cf2512U, + 0x3bc8ac99U, 0xa710187dU, 0x6ee89c63U, 0x7bdb3bbbU, + 0x09cd2678U, 0xf46e5918U, 0x01ec9ab7U, 0xa8834f9aU, + 0x65e6956eU, 0x7eaaffe6U, 0x0821bccfU, 0xe6ef15e8U, + 0xd9bae79bU, 0xce4a6f36U, 0xd4ea9f09U, 0xd629b07cU, + 0xaf31a4b2U, 0x312a3f23U, 0x30c6a594U, 0xc035a266U, + 0x37744ebcU, 0xa6fc82caU, 0xb0e090d0U, 0x1533a7d8U, + 0x4af10498U, 0xf741ecdaU, 0x0e7fcd50U, 0x2f1791f6U, + 0x8d764dd6U, 0x4d43efb0U, 0x54ccaa4dU, 0xdfe49604U, + 0xe39ed1b5U, 0x1b4c6a88U, 0xb8c12c1fU, 0x7f466551U, + 0x049d5eeaU, 0x5d018c35U, 0x73fa8774U, 0x2efb0b41U, + 0x5ab3671dU, 0x5292dbd2U, 0x33e91056U, 0x136dd647U, + 0x8c9ad761U, 0x7a37a10cU, 0x8e59f814U, 0x89eb133cU, + 0xeecea927U, 0x35b761c9U, 0xede11ce5U, 0x3c7a47b1U, + 0x599cd2dfU, 0x3f55f273U, 0x791814ceU, 0xbf73c737U, + 0xea53f7cdU, 0x5b5ffdaaU, 0x14df3d6fU, 0x867844dbU, + 0x81caaff3U, 0x3eb968c4U, 0x2c382434U, 
0x5fc2a340U, + 0x72161dc3U, 0x0cbce225U, 0x8b283c49U, 0x41ff0d95U, + 0x7139a801U, 0xde080cb3U, 0x9cd8b4e4U, 0x906456c1U, + 0x617bcb84U, 0x70d532b6U, 0x74486c5cU, 0x42d0b857U +}; + +static const uint32_t Td2[256] = +{ + 0xa75051f4U, 0x65537e41U, 0xa4c31a17U, 0x5e963a27U, + 0x6bcb3babU, 0x45f11f9dU, 0x58abacfaU, 0x03934be3U, + 0xfa552030U, 0x6df6ad76U, 0x769188ccU, 0x4c25f502U, + 0xd7fc4fe5U, 0xcbd7c52aU, 0x44802635U, 0xa38fb562U, + 0x5a49deb1U, 0x1b6725baU, 0x0e9845eaU, 0xc0e15dfeU, + 0x7502c32fU, 0xf012814cU, 0x97a38d46U, 0xf9c66bd3U, + 0x5fe7038fU, 0x9c951592U, 0x7aebbf6dU, 0x59da9552U, + 0x832dd4beU, 0x21d35874U, 0x692949e0U, 0xc8448ec9U, + 0x896a75c2U, 0x7978f48eU, 0x3e6b9958U, 0x71dd27b9U, + 0x4fb6bee1U, 0xad17f088U, 0xac66c920U, 0x3ab47dceU, + 0x4a1863dfU, 0x3182e51aU, 0x33609751U, 0x7f456253U, + 0x77e0b164U, 0xae84bb6bU, 0xa01cfe81U, 0x2b94f908U, + 0x68587048U, 0xfd198f45U, 0x6c8794deU, 0xf8b7527bU, + 0xd323ab73U, 0x02e2724bU, 0x8f57e31fU, 0xab2a6655U, + 0x2807b2ebU, 0xc2032fb5U, 0x7b9a86c5U, 0x08a5d337U, + 0x87f23028U, 0xa5b223bfU, 0x6aba0203U, 0x825ced16U, + 0x1c2b8acfU, 0xb492a779U, 0xf2f0f307U, 0xe2a14e69U, + 0xf4cd65daU, 0xbed50605U, 0x621fd134U, 0xfe8ac4a6U, + 0x539d342eU, 0x55a0a2f3U, 0xe132058aU, 0xeb75a4f6U, + 0xec390b83U, 0xefaa4060U, 0x9f065e71U, 0x1051bd6eU, + 0x8af93e21U, 0x063d96ddU, 0x05aedd3eU, 0xbd464de6U, + 0x8db59154U, 0x5d0571c4U, 0xd46f0406U, 0x15ff6050U, + 0xfb241998U, 0xe997d6bdU, 0x43cc8940U, 0x9e7767d9U, + 0x42bdb0e8U, 0x8b880789U, 0x5b38e719U, 0xeedb79c8U, + 0x0a47a17cU, 0x0fe97c42U, 0x1ec9f884U, 0x00000000U, + 0x86830980U, 0xed48322bU, 0x70ac1e11U, 0x724e6c5aU, + 0xfffbfd0eU, 0x38560f85U, 0xd51e3daeU, 0x3927362dU, + 0xd9640a0fU, 0xa621685cU, 0x54d19b5bU, 0x2e3a2436U, + 0x67b10c0aU, 0xe70f9357U, 0x96d2b4eeU, 0x919e1b9bU, + 0xc54f80c0U, 0x20a261dcU, 0x4b695a77U, 0x1a161c12U, + 0xba0ae293U, 0x2ae5c0a0U, 0xe0433c22U, 0x171d121bU, + 0x0d0b0e09U, 0xc7adf28bU, 0xa8b92db6U, 0xa9c8141eU, + 0x198557f1U, 0x074caf75U, 0xddbbee99U, 0x60fda37fU, + 0x269ff701U, 0xf5bc5c72U, 0x3bc54466U, 0x7e345bfbU, + 0x29768b43U, 0xc6dccb23U, 0xfc68b6edU, 0xf163b8e4U, + 0xdccad731U, 0x85104263U, 0x22401397U, 0x112084c6U, + 0x247d854aU, 0x3df8d2bbU, 0x3211aef9U, 0xa16dc729U, + 0x2f4b1d9eU, 0x30f3dcb2U, 0x52ec0d86U, 0xe3d077c1U, + 0x166c2bb3U, 0xb999a970U, 0x48fa1194U, 0x642247e9U, + 0x8cc4a8fcU, 0x3f1aa0f0U, 0x2cd8567dU, 0x90ef2233U, + 0x4ec78749U, 0xd1c1d938U, 0xa2fe8ccaU, 0x0b3698d4U, + 0x81cfa6f5U, 0xde28a57aU, 0x8e26dab7U, 0xbfa43fadU, + 0x9de42c3aU, 0x920d5078U, 0xcc9b6a5fU, 0x4662547eU, + 0x13c2f68dU, 0xb8e890d8U, 0xf75e2e39U, 0xaff582c3U, + 0x80be9f5dU, 0x937c69d0U, 0x2da96fd5U, 0x12b3cf25U, + 0x993bc8acU, 0x7da71018U, 0x636ee89cU, 0xbb7bdb3bU, + 0x7809cd26U, 0x18f46e59U, 0xb701ec9aU, 0x9aa8834fU, + 0x6e65e695U, 0xe67eaaffU, 0xcf0821bcU, 0xe8e6ef15U, + 0x9bd9bae7U, 0x36ce4a6fU, 0x09d4ea9fU, 0x7cd629b0U, + 0xb2af31a4U, 0x23312a3fU, 0x9430c6a5U, 0x66c035a2U, + 0xbc37744eU, 0xcaa6fc82U, 0xd0b0e090U, 0xd81533a7U, + 0x984af104U, 0xdaf741ecU, 0x500e7fcdU, 0xf62f1791U, + 0xd68d764dU, 0xb04d43efU, 0x4d54ccaaU, 0x04dfe496U, + 0xb5e39ed1U, 0x881b4c6aU, 0x1fb8c12cU, 0x517f4665U, + 0xea049d5eU, 0x355d018cU, 0x7473fa87U, 0x412efb0bU, + 0x1d5ab367U, 0xd25292dbU, 0x5633e910U, 0x47136dd6U, + 0x618c9ad7U, 0x0c7a37a1U, 0x148e59f8U, 0x3c89eb13U, + 0x27eecea9U, 0xc935b761U, 0xe5ede11cU, 0xb13c7a47U, + 0xdf599cd2U, 0x733f55f2U, 0xce791814U, 0x37bf73c7U, + 0xcdea53f7U, 0xaa5b5ffdU, 0x6f14df3dU, 0xdb867844U, + 0xf381caafU, 0xc43eb968U, 0x342c3824U, 0x405fc2a3U, + 0xc372161dU, 0x250cbce2U, 0x498b283cU, 
0x9541ff0dU, + 0x017139a8U, 0xb3de080cU, 0xe49cd8b4U, 0xc1906456U, + 0x84617bcbU, 0xb670d532U, 0x5c74486cU, 0x5742d0b8U +}; + +static const uint32_t Td3[256] = +{ + 0xf4a75051U, 0x4165537eU, 0x17a4c31aU, 0x275e963aU, + 0xab6bcb3bU, 0x9d45f11fU, 0xfa58abacU, 0xe303934bU, + 0x30fa5520U, 0x766df6adU, 0xcc769188U, 0x024c25f5U, + 0xe5d7fc4fU, 0x2acbd7c5U, 0x35448026U, 0x62a38fb5U, + 0xb15a49deU, 0xba1b6725U, 0xea0e9845U, 0xfec0e15dU, + 0x2f7502c3U, 0x4cf01281U, 0x4697a38dU, 0xd3f9c66bU, + 0x8f5fe703U, 0x929c9515U, 0x6d7aebbfU, 0x5259da95U, + 0xbe832dd4U, 0x7421d358U, 0xe0692949U, 0xc9c8448eU, + 0xc2896a75U, 0x8e7978f4U, 0x583e6b99U, 0xb971dd27U, + 0xe14fb6beU, 0x88ad17f0U, 0x20ac66c9U, 0xce3ab47dU, + 0xdf4a1863U, 0x1a3182e5U, 0x51336097U, 0x537f4562U, + 0x6477e0b1U, 0x6bae84bbU, 0x81a01cfeU, 0x082b94f9U, + 0x48685870U, 0x45fd198fU, 0xde6c8794U, 0x7bf8b752U, + 0x73d323abU, 0x4b02e272U, 0x1f8f57e3U, 0x55ab2a66U, + 0xeb2807b2U, 0xb5c2032fU, 0xc57b9a86U, 0x3708a5d3U, + 0x2887f230U, 0xbfa5b223U, 0x036aba02U, 0x16825cedU, + 0xcf1c2b8aU, 0x79b492a7U, 0x07f2f0f3U, 0x69e2a14eU, + 0xdaf4cd65U, 0x05bed506U, 0x34621fd1U, 0xa6fe8ac4U, + 0x2e539d34U, 0xf355a0a2U, 0x8ae13205U, 0xf6eb75a4U, + 0x83ec390bU, 0x60efaa40U, 0x719f065eU, 0x6e1051bdU, + 0x218af93eU, 0xdd063d96U, 0x3e05aeddU, 0xe6bd464dU, + 0x548db591U, 0xc45d0571U, 0x06d46f04U, 0x5015ff60U, + 0x98fb2419U, 0xbde997d6U, 0x4043cc89U, 0xd99e7767U, + 0xe842bdb0U, 0x898b8807U, 0x195b38e7U, 0xc8eedb79U, + 0x7c0a47a1U, 0x420fe97cU, 0x841ec9f8U, 0x00000000U, + 0x80868309U, 0x2bed4832U, 0x1170ac1eU, 0x5a724e6cU, + 0x0efffbfdU, 0x8538560fU, 0xaed51e3dU, 0x2d392736U, + 0x0fd9640aU, 0x5ca62168U, 0x5b54d19bU, 0x362e3a24U, + 0x0a67b10cU, 0x57e70f93U, 0xee96d2b4U, 0x9b919e1bU, + 0xc0c54f80U, 0xdc20a261U, 0x774b695aU, 0x121a161cU, + 0x93ba0ae2U, 0xa02ae5c0U, 0x22e0433cU, 0x1b171d12U, + 0x090d0b0eU, 0x8bc7adf2U, 0xb6a8b92dU, 0x1ea9c814U, + 0xf1198557U, 0x75074cafU, 0x99ddbbeeU, 0x7f60fda3U, + 0x01269ff7U, 0x72f5bc5cU, 0x663bc544U, 0xfb7e345bU, + 0x4329768bU, 0x23c6dccbU, 0xedfc68b6U, 0xe4f163b8U, + 0x31dccad7U, 0x63851042U, 0x97224013U, 0xc6112084U, + 0x4a247d85U, 0xbb3df8d2U, 0xf93211aeU, 0x29a16dc7U, + 0x9e2f4b1dU, 0xb230f3dcU, 0x8652ec0dU, 0xc1e3d077U, + 0xb3166c2bU, 0x70b999a9U, 0x9448fa11U, 0xe9642247U, + 0xfc8cc4a8U, 0xf03f1aa0U, 0x7d2cd856U, 0x3390ef22U, + 0x494ec787U, 0x38d1c1d9U, 0xcaa2fe8cU, 0xd40b3698U, + 0xf581cfa6U, 0x7ade28a5U, 0xb78e26daU, 0xadbfa43fU, + 0x3a9de42cU, 0x78920d50U, 0x5fcc9b6aU, 0x7e466254U, + 0x8d13c2f6U, 0xd8b8e890U, 0x39f75e2eU, 0xc3aff582U, + 0x5d80be9fU, 0xd0937c69U, 0xd52da96fU, 0x2512b3cfU, + 0xac993bc8U, 0x187da710U, 0x9c636ee8U, 0x3bbb7bdbU, + 0x267809cdU, 0x5918f46eU, 0x9ab701ecU, 0x4f9aa883U, + 0x956e65e6U, 0xffe67eaaU, 0xbccf0821U, 0x15e8e6efU, + 0xe79bd9baU, 0x6f36ce4aU, 0x9f09d4eaU, 0xb07cd629U, + 0xa4b2af31U, 0x3f23312aU, 0xa59430c6U, 0xa266c035U, + 0x4ebc3774U, 0x82caa6fcU, 0x90d0b0e0U, 0xa7d81533U, + 0x04984af1U, 0xecdaf741U, 0xcd500e7fU, 0x91f62f17U, + 0x4dd68d76U, 0xefb04d43U, 0xaa4d54ccU, 0x9604dfe4U, + 0xd1b5e39eU, 0x6a881b4cU, 0x2c1fb8c1U, 0x65517f46U, + 0x5eea049dU, 0x8c355d01U, 0x877473faU, 0x0b412efbU, + 0x671d5ab3U, 0xdbd25292U, 0x105633e9U, 0xd647136dU, + 0xd7618c9aU, 0xa10c7a37U, 0xf8148e59U, 0x133c89ebU, + 0xa927eeceU, 0x61c935b7U, 0x1ce5ede1U, 0x47b13c7aU, + 0xd2df599cU, 0xf2733f55U, 0x14ce7918U, 0xc737bf73U, + 0xf7cdea53U, 0xfdaa5b5fU, 0x3d6f14dfU, 0x44db8678U, + 0xaff381caU, 0x68c43eb9U, 0x24342c38U, 0xa3405fc2U, + 0x1dc37216U, 0xe2250cbcU, 0x3c498b28U, 0x0d9541ffU, + 0xa8017139U, 0x0cb3de08U, 0xb4e49cd8U, 
0x56c19064U, + 0xcb84617bU, 0x32b670d5U, 0x6c5c7448U, 0xb85742d0U +}; + +static const uint32_t Td4[256] = +{ + 0x52525252U, 0x09090909U, 0x6a6a6a6aU, 0xd5d5d5d5U, + 0x30303030U, 0x36363636U, 0xa5a5a5a5U, 0x38383838U, + 0xbfbfbfbfU, 0x40404040U, 0xa3a3a3a3U, 0x9e9e9e9eU, + 0x81818181U, 0xf3f3f3f3U, 0xd7d7d7d7U, 0xfbfbfbfbU, + 0x7c7c7c7cU, 0xe3e3e3e3U, 0x39393939U, 0x82828282U, + 0x9b9b9b9bU, 0x2f2f2f2fU, 0xffffffffU, 0x87878787U, + 0x34343434U, 0x8e8e8e8eU, 0x43434343U, 0x44444444U, + 0xc4c4c4c4U, 0xdedededeU, 0xe9e9e9e9U, 0xcbcbcbcbU, + 0x54545454U, 0x7b7b7b7bU, 0x94949494U, 0x32323232U, + 0xa6a6a6a6U, 0xc2c2c2c2U, 0x23232323U, 0x3d3d3d3dU, + 0xeeeeeeeeU, 0x4c4c4c4cU, 0x95959595U, 0x0b0b0b0bU, + 0x42424242U, 0xfafafafaU, 0xc3c3c3c3U, 0x4e4e4e4eU, + 0x08080808U, 0x2e2e2e2eU, 0xa1a1a1a1U, 0x66666666U, + 0x28282828U, 0xd9d9d9d9U, 0x24242424U, 0xb2b2b2b2U, + 0x76767676U, 0x5b5b5b5bU, 0xa2a2a2a2U, 0x49494949U, + 0x6d6d6d6dU, 0x8b8b8b8bU, 0xd1d1d1d1U, 0x25252525U, + 0x72727272U, 0xf8f8f8f8U, 0xf6f6f6f6U, 0x64646464U, + 0x86868686U, 0x68686868U, 0x98989898U, 0x16161616U, + 0xd4d4d4d4U, 0xa4a4a4a4U, 0x5c5c5c5cU, 0xccccccccU, + 0x5d5d5d5dU, 0x65656565U, 0xb6b6b6b6U, 0x92929292U, + 0x6c6c6c6cU, 0x70707070U, 0x48484848U, 0x50505050U, + 0xfdfdfdfdU, 0xededededU, 0xb9b9b9b9U, 0xdadadadaU, + 0x5e5e5e5eU, 0x15151515U, 0x46464646U, 0x57575757U, + 0xa7a7a7a7U, 0x8d8d8d8dU, 0x9d9d9d9dU, 0x84848484U, + 0x90909090U, 0xd8d8d8d8U, 0xababababU, 0x00000000U, + 0x8c8c8c8cU, 0xbcbcbcbcU, 0xd3d3d3d3U, 0x0a0a0a0aU, + 0xf7f7f7f7U, 0xe4e4e4e4U, 0x58585858U, 0x05050505U, + 0xb8b8b8b8U, 0xb3b3b3b3U, 0x45454545U, 0x06060606U, + 0xd0d0d0d0U, 0x2c2c2c2cU, 0x1e1e1e1eU, 0x8f8f8f8fU, + 0xcacacacaU, 0x3f3f3f3fU, 0x0f0f0f0fU, 0x02020202U, + 0xc1c1c1c1U, 0xafafafafU, 0xbdbdbdbdU, 0x03030303U, + 0x01010101U, 0x13131313U, 0x8a8a8a8aU, 0x6b6b6b6bU, + 0x3a3a3a3aU, 0x91919191U, 0x11111111U, 0x41414141U, + 0x4f4f4f4fU, 0x67676767U, 0xdcdcdcdcU, 0xeaeaeaeaU, + 0x97979797U, 0xf2f2f2f2U, 0xcfcfcfcfU, 0xcecececeU, + 0xf0f0f0f0U, 0xb4b4b4b4U, 0xe6e6e6e6U, 0x73737373U, + 0x96969696U, 0xacacacacU, 0x74747474U, 0x22222222U, + 0xe7e7e7e7U, 0xadadadadU, 0x35353535U, 0x85858585U, + 0xe2e2e2e2U, 0xf9f9f9f9U, 0x37373737U, 0xe8e8e8e8U, + 0x1c1c1c1cU, 0x75757575U, 0xdfdfdfdfU, 0x6e6e6e6eU, + 0x47474747U, 0xf1f1f1f1U, 0x1a1a1a1aU, 0x71717171U, + 0x1d1d1d1dU, 0x29292929U, 0xc5c5c5c5U, 0x89898989U, + 0x6f6f6f6fU, 0xb7b7b7b7U, 0x62626262U, 0x0e0e0e0eU, + 0xaaaaaaaaU, 0x18181818U, 0xbebebebeU, 0x1b1b1b1bU, + 0xfcfcfcfcU, 0x56565656U, 0x3e3e3e3eU, 0x4b4b4b4bU, + 0xc6c6c6c6U, 0xd2d2d2d2U, 0x79797979U, 0x20202020U, + 0x9a9a9a9aU, 0xdbdbdbdbU, 0xc0c0c0c0U, 0xfefefefeU, + 0x78787878U, 0xcdcdcdcdU, 0x5a5a5a5aU, 0xf4f4f4f4U, + 0x1f1f1f1fU, 0xddddddddU, 0xa8a8a8a8U, 0x33333333U, + 0x88888888U, 0x07070707U, 0xc7c7c7c7U, 0x31313131U, + 0xb1b1b1b1U, 0x12121212U, 0x10101010U, 0x59595959U, + 0x27272727U, 0x80808080U, 0xececececU, 0x5f5f5f5fU, + 0x60606060U, 0x51515151U, 0x7f7f7f7fU, 0xa9a9a9a9U, + 0x19191919U, 0xb5b5b5b5U, 0x4a4a4a4aU, 0x0d0d0d0dU, + 0x2d2d2d2dU, 0xe5e5e5e5U, 0x7a7a7a7aU, 0x9f9f9f9fU, + 0x93939393U, 0xc9c9c9c9U, 0x9c9c9c9cU, 0xefefefefU, + 0xa0a0a0a0U, 0xe0e0e0e0U, 0x3b3b3b3bU, 0x4d4d4d4dU, + 0xaeaeaeaeU, 0x2a2a2a2aU, 0xf5f5f5f5U, 0xb0b0b0b0U, + 0xc8c8c8c8U, 0xebebebebU, 0xbbbbbbbbU, 0x3c3c3c3cU, + 0x83838383U, 0x53535353U, 0x99999999U, 0x61616161U, + 0x17171717U, 0x2b2b2b2bU, 0x04040404U, 0x7e7e7e7eU, + 0xbabababaU, 0x77777777U, 0xd6d6d6d6U, 0x26262626U, + 0xe1e1e1e1U, 0x69696969U, 0x14141414U, 0x63636363U, + 0x55555555U, 0x21212121U, 0x0c0c0c0cU, 
0x7d7d7d7dU +}; + +/* Rcon is Round Constant; used for encryption key expansion */ +static const uint32_t rcon[RC_LENGTH] = +{ + /* for 128-bit blocks, Rijndael never uses more than 10 rcon values */ + 0x01000000, 0x02000000, 0x04000000, 0x08000000, + 0x10000000, 0x20000000, 0x40000000, 0x80000000, + 0x1B000000, 0x36000000 +}; + + +/* + * Expand the cipher key into the encryption key schedule. + * + * Return the number of rounds for the given cipher key size. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4*(Nr + 1). + * + * Parameters: + * rk AES key schedule 32-bit array to be initialized + * cipherKey User key + * keyBits AES key size (128, 192, or 256 bits) + */ +static int +rijndael_key_setup_enc(uint32_t rk[], const uint32_t cipherKey[], + int keyBits) +{ + int i = 0; + uint32_t temp; + + rk[0] = cipherKey[0]; + rk[1] = cipherKey[1]; + rk[2] = cipherKey[2]; + rk[3] = cipherKey[3]; + + if (keyBits == 128) { + for (;;) { + temp = rk[3]; + rk[4] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[temp & 0xff] & 0x0000ff00) ^ + (Te4[temp >> 24] & 0x000000ff) ^ + rcon[i]; + rk[5] = rk[1] ^ rk[4]; + rk[6] = rk[2] ^ rk[5]; + rk[7] = rk[3] ^ rk[6]; + + if (++i == 10) { + return (10); + } + rk += 4; + } + } + + rk[4] = cipherKey[4]; + rk[5] = cipherKey[5]; + + if (keyBits == 192) { + for (;;) { + temp = rk[5]; + rk[6] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[temp & 0xff] & 0x0000ff00) ^ + (Te4[temp >> 24] & 0x000000ff) ^ + rcon[i]; + rk[7] = rk[1] ^ rk[6]; + rk[8] = rk[2] ^ rk[7]; + rk[9] = rk[3] ^ rk[8]; + + if (++i == 8) { + return (12); + } + + rk[10] = rk[4] ^ rk[9]; + rk[11] = rk[5] ^ rk[10]; + rk += 6; + } + } + + rk[6] = cipherKey[6]; + rk[7] = cipherKey[7]; + + if (keyBits == 256) { + for (;;) { + temp = rk[7]; + rk[8] = rk[0] ^ + (Te4[(temp >> 16) & 0xff] & 0xff000000) ^ + (Te4[(temp >> 8) & 0xff] & 0x00ff0000) ^ + (Te4[temp & 0xff] & 0x0000ff00) ^ + (Te4[temp >> 24] & 0x000000ff) ^ + rcon[i]; + rk[9] = rk[1] ^ rk[8]; + rk[10] = rk[2] ^ rk[9]; + rk[11] = rk[3] ^ rk[10]; + + if (++i == 7) { + return (14); + } + temp = rk[11]; + rk[12] = rk[4] ^ + (Te4[temp >> 24] & 0xff000000) ^ + (Te4[(temp >> 16) & 0xff] & 0x00ff0000) ^ + (Te4[(temp >> 8) & 0xff] & 0x0000ff00) ^ + (Te4[temp & 0xff] & 0x000000ff); + rk[13] = rk[5] ^ rk[12]; + rk[14] = rk[6] ^ rk[13]; + rk[15] = rk[7] ^ rk[14]; + + rk += 8; + } + } + + return (0); +} + +/* + * Expand the cipher key into the decryption key schedule. + * Return the number of rounds for the given cipher key size. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4*(Nr + 1). 
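+ * For example, a 128-bit key gives Nr = 10 and a 4*11 = 44 word schedule,
+ * a 192-bit key gives Nr = 12 and 52 words, and a 256-bit key gives
+ * Nr = 14 and the 60-word maximum that sizes aes_ks_t.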
+ *
+ * Parameters:
+ * rk		AES key schedule 32-bit array to be initialized
+ * cipherKey	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+static int
+rijndael_key_setup_dec(uint32_t rk[], const uint32_t cipherKey[], int keyBits)
+{
+	int Nr, i, j;
+	uint32_t temp;
+
+	/* expand the cipher key: */
+	Nr = rijndael_key_setup_enc(rk, cipherKey, keyBits);
+
+	/* invert the order of the round keys: */
+	for (i = 0, j = 4 * Nr; i < j; i += 4, j -= 4) {
+		temp = rk[i];
+		rk[i] = rk[j];
+		rk[j] = temp;
+		temp = rk[i + 1];
+		rk[i + 1] = rk[j + 1];
+		rk[j + 1] = temp;
+		temp = rk[i + 2];
+		rk[i + 2] = rk[j + 2];
+		rk[j + 2] = temp;
+		temp = rk[i + 3];
+		rk[i + 3] = rk[j + 3];
+		rk[j + 3] = temp;
+	}
+
+	/*
+	 * apply the inverse MixColumn transform to all
+	 * round keys but the first and the last:
+	 */
+	for (i = 1; i < Nr; i++) {
+		rk += 4;
+		rk[0] = Td0[Te4[rk[0] >> 24] & 0xff] ^
+		    Td1[Te4[(rk[0] >> 16) & 0xff] & 0xff] ^
+		    Td2[Te4[(rk[0] >> 8) & 0xff] & 0xff] ^
+		    Td3[Te4[rk[0] & 0xff] & 0xff];
+		rk[1] = Td0[Te4[rk[1] >> 24] & 0xff] ^
+		    Td1[Te4[(rk[1] >> 16) & 0xff] & 0xff] ^
+		    Td2[Te4[(rk[1] >> 8) & 0xff] & 0xff] ^
+		    Td3[Te4[rk[1] & 0xff] & 0xff];
+		rk[2] = Td0[Te4[rk[2] >> 24] & 0xff] ^
+		    Td1[Te4[(rk[2] >> 16) & 0xff] & 0xff] ^
+		    Td2[Te4[(rk[2] >> 8) & 0xff] & 0xff] ^
+		    Td3[Te4[rk[2] & 0xff] & 0xff];
+		rk[3] = Td0[Te4[rk[3] >> 24] & 0xff] ^
+		    Td1[Te4[(rk[3] >> 16) & 0xff] & 0xff] ^
+		    Td2[Te4[(rk[3] >> 8) & 0xff] & 0xff] ^
+		    Td3[Te4[rk[3] & 0xff] & 0xff];
+	}
+
+	return (Nr);
+}
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key		AES key schedule to be initialized
+ * keyarr32	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_generic_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+	key->nr = rijndael_key_setup_enc(&(key->encr_ks.ks32[0]), keyarr32,
+	    keybits);
+	key->nr = rijndael_key_setup_dec(&(key->decr_ks.ks32[0]), keyarr32,
+	    keybits);
+}
+
+/*
+ * Encrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copying for alignment (and byte-order
+ * reversal on little-endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk		Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr		Number of rounds
+ * pt		Input block (plain text)
+ * ct		Output block (crypto text). Can overlap with pt
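+ *
+ * The tables Te0..Te3 fold the S-box lookup together with the MixColumns
+ * multiplication, one byte rotation per table; e.g. S[0x00] = 0x63, so
+ * Te0[0x00] = 63.[02, 01, 01, 03] = 0xc66363a5, matching the table above.
+ * A full round therefore costs four lookups and four XORs per output
+ * word; the loop below processes two rounds per iteration (r = Nr/2),
+ * and the final round uses Te4 alone because it omits MixColumns.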
+ */
+static void
+aes_generic_encrypt(const uint32_t rk[], int Nr, const uint32_t pt[4],
+    uint32_t ct[4])
+{
+	uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
+	int r;
+
+	/*
+	 * map byte array block to cipher state
+	 * and add initial round key:
+	 */
+
+	s0 = pt[0] ^ rk[0];
+	s1 = pt[1] ^ rk[1];
+	s2 = pt[2] ^ rk[2];
+	s3 = pt[3] ^ rk[3];
+
+	/*
+	 * Nr - 1 full rounds:
+	 */
+
+	r = Nr >> 1;
+
+	for (;;) {
+		t0 = Te0[s0 >> 24] ^
+		    Te1[(s1 >> 16) & 0xff] ^
+		    Te2[(s2 >> 8) & 0xff] ^
+		    Te3[s3 & 0xff] ^
+		    rk[4];
+
+		t1 = Te0[s1 >> 24] ^
+		    Te1[(s2 >> 16) & 0xff] ^
+		    Te2[(s3 >> 8) & 0xff] ^
+		    Te3[s0 & 0xff] ^
+		    rk[5];
+
+		t2 = Te0[s2 >> 24] ^
+		    Te1[(s3 >> 16) & 0xff] ^
+		    Te2[(s0 >> 8) & 0xff] ^
+		    Te3[s1 & 0xff] ^
+		    rk[6];
+
+		t3 = Te0[s3 >> 24] ^
+		    Te1[(s0 >> 16) & 0xff] ^
+		    Te2[(s1 >> 8) & 0xff] ^
+		    Te3[s2 & 0xff] ^
+		    rk[7];
+
+		rk += 8;
+
+		if (--r == 0) {
+			break;
+		}
+
+		s0 = Te0[t0 >> 24] ^
+		    Te1[(t1 >> 16) & 0xff] ^
+		    Te2[(t2 >> 8) & 0xff] ^
+		    Te3[t3 & 0xff] ^
+		    rk[0];
+
+		s1 = Te0[t1 >> 24] ^
+		    Te1[(t2 >> 16) & 0xff] ^
+		    Te2[(t3 >> 8) & 0xff] ^
+		    Te3[t0 & 0xff] ^
+		    rk[1];
+
+		s2 = Te0[t2 >> 24] ^
+		    Te1[(t3 >> 16) & 0xff] ^
+		    Te2[(t0 >> 8) & 0xff] ^
+		    Te3[t1 & 0xff] ^
+		    rk[2];
+
+		s3 = Te0[t3 >> 24] ^
+		    Te1[(t0 >> 16) & 0xff] ^
+		    Te2[(t1 >> 8) & 0xff] ^
+		    Te3[t2 & 0xff] ^
+		    rk[3];
+	}
+
+	/*
+	 * apply last round and
+	 * map cipher state to byte array block:
+	 */
+
+	s0 = (Te4[(t0 >> 24)] & 0xff000000) ^
+	    (Te4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Te4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Te4[t3 & 0xff] & 0x000000ff) ^
+	    rk[0];
+	ct[0] = s0;
+
+	s1 = (Te4[(t1 >> 24)] & 0xff000000) ^
+	    (Te4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Te4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Te4[t0 & 0xff] & 0x000000ff) ^
+	    rk[1];
+	ct[1] = s1;
+
+	s2 = (Te4[(t2 >> 24)] & 0xff000000) ^
+	    (Te4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Te4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Te4[t1 & 0xff] & 0x000000ff) ^
+	    rk[2];
+	ct[2] = s2;
+
+	s3 = (Te4[(t3 >> 24)] & 0xff000000) ^
+	    (Te4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Te4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Te4[t2 & 0xff] & 0x000000ff) ^
+	    rk[3];
+	ct[3] = s3;
+}
+
+
+/*
+ * Decrypt one block of data. The block is assumed to be an array
+ * of four uint32_t values, so copying for alignment (and byte-order
+ * reversal on little-endian systems) might be necessary on the
+ * input and output byte streams.
+ * The size of the key schedule depends on the number of rounds
+ * (which can be computed from the size of the key), i.e. 4*(Nr + 1).
+ *
+ * Parameters:
+ * rk		Key schedule, of aes_ks_t (60 32-bit integers)
+ * Nr		Number of rounds
+ * ct		Input block (crypto text)
+ * pt		Output block (plain text). Can overlap with ct
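+ *
+ * This is the "equivalent inverse cipher" arrangement: decryption can
+ * reuse the encryption round structure because rijndael_key_setup_dec()
+ * has already applied the inverse MixColumn transform to every round key
+ * except the first and the last.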
+ */
+static void
+aes_generic_decrypt(const uint32_t rk[], int Nr, const uint32_t ct[4],
+    uint32_t pt[4])
+{
+	uint32_t s0, s1, s2, s3, t0, t1, t2, t3;
+	int r;
+
+	/*
+	 * map byte array block to cipher state
+	 * and add initial round key:
+	 */
+	s0 = ct[0] ^ rk[0];
+	s1 = ct[1] ^ rk[1];
+	s2 = ct[2] ^ rk[2];
+	s3 = ct[3] ^ rk[3];
+
+	/*
+	 * Nr - 1 full rounds:
+	 */
+
+	r = Nr >> 1;
+
+	for (;;) {
+		t0 = Td0[s0 >> 24] ^
+		    Td1[(s3 >> 16) & 0xff] ^
+		    Td2[(s2 >> 8) & 0xff] ^
+		    Td3[s1 & 0xff] ^
+		    rk[4];
+
+		t1 = Td0[s1 >> 24] ^
+		    Td1[(s0 >> 16) & 0xff] ^
+		    Td2[(s3 >> 8) & 0xff] ^
+		    Td3[s2 & 0xff] ^
+		    rk[5];
+
+		t2 = Td0[s2 >> 24] ^
+		    Td1[(s1 >> 16) & 0xff] ^
+		    Td2[(s0 >> 8) & 0xff] ^
+		    Td3[s3 & 0xff] ^
+		    rk[6];
+
+		t3 = Td0[s3 >> 24] ^
+		    Td1[(s2 >> 16) & 0xff] ^
+		    Td2[(s1 >> 8) & 0xff] ^
+		    Td3[s0 & 0xff] ^
+		    rk[7];
+
+		rk += 8;
+
+		if (--r == 0) {
+			break;
+		}
+
+		s0 = Td0[t0 >> 24] ^
+		    Td1[(t3 >> 16) & 0xff] ^
+		    Td2[(t2 >> 8) & 0xff] ^
+		    Td3[t1 & 0xff] ^
+		    rk[0];
+
+		s1 = Td0[t1 >> 24] ^
+		    Td1[(t0 >> 16) & 0xff] ^
+		    Td2[(t3 >> 8) & 0xff] ^
+		    Td3[t2 & 0xff] ^
+		    rk[1];
+
+		s2 = Td0[t2 >> 24] ^
+		    Td1[(t1 >> 16) & 0xff] ^
+		    Td2[(t0 >> 8) & 0xff] ^
+		    Td3[t3 & 0xff] ^
+		    rk[2];
+
+		s3 = Td0[t3 >> 24] ^
+		    Td1[(t2 >> 16) & 0xff] ^
+		    Td2[(t1 >> 8) & 0xff] ^
+		    Td3[t0 & 0xff] ^
+		    rk[3];
+	}
+
+	/*
+	 * apply last round and
+	 * map cipher state to byte array block:
+	 */
+
+	s0 = (Td4[t0 >> 24] & 0xff000000) ^
+	    (Td4[(t3 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Td4[(t2 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Td4[t1 & 0xff] & 0x000000ff) ^
+	    rk[0];
+	pt[0] = s0;
+
+	s1 = (Td4[t1 >> 24] & 0xff000000) ^
+	    (Td4[(t0 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Td4[(t3 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Td4[t2 & 0xff] & 0x000000ff) ^
+	    rk[1];
+	pt[1] = s1;
+
+	s2 = (Td4[t2 >> 24] & 0xff000000) ^
+	    (Td4[(t1 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Td4[(t0 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Td4[t3 & 0xff] & 0x000000ff) ^
+	    rk[2];
+	pt[2] = s2;
+
+	s3 = (Td4[t3 >> 24] & 0xff000000) ^
+	    (Td4[(t2 >> 16) & 0xff] & 0x00ff0000) ^
+	    (Td4[(t1 >> 8) & 0xff] & 0x0000ff00) ^
+	    (Td4[t0 & 0xff] & 0x000000ff) ^
+	    rk[3];
+	pt[3] = s3;
+}
+
+static boolean_t
+aes_generic_will_work(void)
+{
+	return (B_TRUE);
+}
+
+/*
+ * For _LITTLE_ENDIAN machines, reverse every 4 bytes in the key.
+ * On _BIG_ENDIAN, copy the key without reversing bytes.
+ *
+ * SPARCv8/v9 uses a key schedule array with 64-bit elements.
+ * X86/AMD64 uses a key schedule array with 32-bit elements.
+ */
+const aes_impl_ops_t aes_generic_impl = {
+	.generate = &aes_generic_generate,
+	.encrypt = &aes_generic_encrypt,
+	.decrypt = &aes_generic_decrypt,
+	.is_supported = &aes_generic_will_work,
+#if defined(_ZFS_LITTLE_ENDIAN)
+	.needs_byteswap = B_TRUE,
+#else
+	.needs_byteswap = B_FALSE,
+#endif
+	.name = "generic"
+};
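The needs_byteswap flag above is how the generic backend signals that, on little-endian hosts, the caller must reverse each 4-byte group of the key before handing it to the generate() callback; the tables in this file are laid out for big-endian word interpretation. A sketch of what honoring the flag looks like -- the helper name is invented here purely to illustrate how the flag would be consumed:

#include <stdint.h>
#include <string.h>

/*
 * Load a raw key byte stream into the 32-bit words the table-based code
 * expects, reversing each 4-byte group when the implementation asks for
 * it. (Hypothetical helper, not part of the imported sources.)
 */
static void
aes_load_keyarr32(const uint8_t *keybytes, int nwords,
    int needs_byteswap, uint32_t *keyarr32)
{
	for (int i = 0; i < nwords; i++) {
		uint32_t w;

		memcpy(&w, &keybytes[4 * i], 4);	/* native word load */
		if (needs_byteswap) {			/* little-endian host */
			w = ((w & 0xff000000U) >> 24) |
			    ((w & 0x00ff0000U) >> 8) |
			    ((w & 0x0000ff00U) << 8) |
			    ((w & 0x000000ffU) << 24);
		}
		keyarr32[i] = w;
	}
}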
diff --git a/module/icp/algs/aes/aes_impl_x86-64.c b/module/icp/algs/aes/aes_impl_x86-64.c
new file mode 100644
index 000000000000..19f8fd5012cf
--- /dev/null
+++ b/module/icp/algs/aes/aes_impl_x86-64.c
@@ -0,0 +1,63 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+#if defined(__x86_64)
+
+#include <sys/simd.h>
+#include <aes/aes_impl.h>
+
+/*
+ * Expand the 32-bit AES cipher key array into the encryption and decryption
+ * key schedules.
+ *
+ * Parameters:
+ * key		AES key schedule to be initialized
+ * keyarr32	User key
+ * keyBits	AES key size (128, 192, or 256 bits)
+ */
+static void
+aes_x86_64_generate(aes_key_t *key, const uint32_t *keyarr32, int keybits)
+{
+	key->nr = rijndael_key_setup_enc_amd64(&(key->encr_ks.ks32[0]),
+	    keyarr32, keybits);
+	key->nr = rijndael_key_setup_dec_amd64(&(key->decr_ks.ks32[0]),
+	    keyarr32, keybits);
+}
+
+static boolean_t
+aes_x86_64_will_work(void)
+{
+	return (B_TRUE);
+}
+
+const aes_impl_ops_t aes_x86_64_impl = {
+	.generate = &aes_x86_64_generate,
+	.encrypt = &aes_encrypt_amd64,
+	.decrypt = &aes_decrypt_amd64,
+	.is_supported = &aes_x86_64_will_work,
+	.needs_byteswap = B_FALSE,
+	.name = "x86_64"
+};
+
+#endif /* defined(__x86_64) */
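Unlike the AES-NI backend, this table plugs the amd64 assembly routines in directly and needs no kfpu_begin()/kfpu_end() bracketing, so is_supported() can return B_TRUE unconditionally on x86-64. From the caller's side, driving a single 16-byte block through any of these ops tables looks the same -- a sketch, with the wrapper name invented here for illustration:

#include <aes/aes_impl.h>

/*
 * Encrypt one 16-byte block through a chosen implementation. 'key' must
 * already have been initialized with ops->generate(); the encr_ks.ks32
 * and nr fields are the ones the backends above fill in.
 * (Illustrative wrapper, not part of the imported sources.)
 */
static void
aes_encrypt_one_block(const aes_impl_ops_t *ops, const aes_key_t *key,
    const uint32_t pt[4], uint32_t ct[4])
{
	ops->encrypt(&key->encr_ks.ks32[0], key->nr, pt, ct);
}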
diff --git a/module/icp/algs/aes/aes_modes.c b/module/icp/algs/aes/aes_modes.c
new file mode 100644
index 000000000000..9e4b498fffcb
--- /dev/null
+++ b/module/icp/algs/aes/aes_modes.c
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <aes/aes_impl.h>
+
+/* Copy a 16-byte AES block from "in" to "out" */
+void
+aes_copy_block(uint8_t *in, uint8_t *out)
+{
+	if (IS_P2ALIGNED2(in, out, sizeof (uint32_t))) {
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&out[0] = *(uint32_t *)&in[0];
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&out[4] = *(uint32_t *)&in[4];
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&out[8] = *(uint32_t *)&in[8];
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&out[12] = *(uint32_t *)&in[12];
+	} else {
+		AES_COPY_BLOCK(in, out);
+	}
+}
+
+
+/* XOR a 16-byte AES block of data into dst */
+void
+aes_xor_block(uint8_t *data, uint8_t *dst)
+{
+	if (IS_P2ALIGNED2(dst, data, sizeof (uint32_t))) {
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&dst[0] ^= *(uint32_t *)&data[0];
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&dst[4] ^= *(uint32_t *)&data[4];
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&dst[8] ^= *(uint32_t *)&data[8];
+		/* LINTED: pointer alignment */
+		*(uint32_t *)&dst[12] ^= *(uint32_t *)&data[12];
+	} else {
+		AES_XOR_BLOCK(data, dst);
+	}
+}
+
+
+/*
+ * Encrypt multiple blocks of data according to mode.
+ */
+int
+aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length,
+    crypto_data_t *out)
+{
+	aes_ctx_t *aes_ctx = ctx;
+	int rv;
+
+	if (aes_ctx->ac_flags & CTR_MODE) {
+		rv = ctr_mode_contiguous_blocks(ctx, data, length, out,
+		    AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block);
+	} else if (aes_ctx->ac_flags & CCM_MODE) {
+		rv = ccm_mode_encrypt_contiguous_blocks(ctx, data, length,
+		    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+		    aes_xor_block);
+	} else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) {
+		rv = gcm_mode_encrypt_contiguous_blocks(ctx, data, length,
+		    out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
+		    aes_xor_block);
+	} else if (aes_ctx->ac_flags & CBC_MODE) {
+		rv = cbc_encrypt_contiguous_blocks(ctx,
+		    data, length, out, AES_BLOCK_LEN, aes_encrypt_block,
+		    aes_copy_block, aes_xor_block);
+	} else {
+		rv = ecb_cipher_contiguous_blocks(ctx, data, length, out,
+		    AES_BLOCK_LEN, aes_encrypt_block);
+	}
+	return (rv);
+}
+
+
+/*
+ * Decrypt multiple blocks of data according to mode.
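+ *
+ * Note that the CTR, CCM, and GCM/GMAC branches below pass the
+ * *encryption* primitive, exactly as the encrypt path above does: those
+ * modes XOR a keystream of encrypted counters onto the data, so the same
+ * block operation serves both directions, and only CBC and ECB ever call
+ * aes_decrypt_block. The CTR and ECB branches also remap
+ * CRYPTO_DATA_LEN_RANGE to CRYPTO_ENCRYPTED_DATA_LEN_RANGE, so a short
+ * input is reported as a ciphertext-length error on the decrypt side.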
+ */ +int +aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length, + crypto_data_t *out) +{ + aes_ctx_t *aes_ctx = ctx; + int rv; + + if (aes_ctx->ac_flags & CTR_MODE) { + rv = ctr_mode_contiguous_blocks(ctx, data, length, out, + AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + if (rv == CRYPTO_DATA_LEN_RANGE) + rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; + } else if (aes_ctx->ac_flags & CCM_MODE) { + rv = ccm_mode_decrypt_contiguous_blocks(ctx, data, length, + out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + rv = gcm_mode_decrypt_contiguous_blocks(ctx, data, length, + out, AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + } else if (aes_ctx->ac_flags & CBC_MODE) { + rv = cbc_decrypt_contiguous_blocks(ctx, data, length, out, + AES_BLOCK_LEN, aes_decrypt_block, aes_copy_block, + aes_xor_block); + } else { + rv = ecb_cipher_contiguous_blocks(ctx, data, length, out, + AES_BLOCK_LEN, aes_decrypt_block); + if (rv == CRYPTO_DATA_LEN_RANGE) + rv = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; + } + return (rv); +} diff --git a/module/icp/algs/edonr/edonr.c b/module/icp/algs/edonr/edonr.c new file mode 100644 index 000000000000..7c677095f1ef --- /dev/null +++ b/module/icp/algs/edonr/edonr.c @@ -0,0 +1,746 @@ +/* + * IDI,NTNU + * + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * Copyright (C) 2009, 2010, Jorn Amundsen + * Tweaked Edon-R implementation for SUPERCOP, based on NIST API. 
+ * + * $Id: edonr.c 517 2013-02-17 20:34:39Z joern $ + */ +/* + * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved + */ + +#include +#include +#include + +/* big endian support, provides no-op's if run on little endian hosts */ +#include "edonr_byteorder.h" + +#define hashState224(x) ((x)->pipe->p256) +#define hashState256(x) ((x)->pipe->p256) +#define hashState384(x) ((x)->pipe->p512) +#define hashState512(x) ((x)->pipe->p512) + +/* shift and rotate shortcuts */ +#define shl(x, n) ((x) << n) +#define shr(x, n) ((x) >> n) + +#define rotl32(x, n) (((x) << (n)) | ((x) >> (32 - (n)))) +#define rotr32(x, n) (((x) >> (n)) | ((x) << (32 - (n)))) + +#define rotl64(x, n) (((x) << (n)) | ((x) >> (64 - (n)))) +#define rotr64(x, n) (((x) >> (n)) | ((x) << (64 - (n)))) + +#if !defined(__C99_RESTRICT) +#define restrict /* restrict */ +#endif + +#define EDONR_VALID_HASHBITLEN(x) \ + ((x) == 512 || (x) == 384 || (x) == 256 || (x) == 224) + +/* EdonR224 initial double chaining pipe */ +static const uint32_t i224p2[16] = { + 0x00010203ul, 0x04050607ul, 0x08090a0bul, 0x0c0d0e0ful, + 0x10111213ul, 0x14151617ul, 0x18191a1bul, 0x1c1d1e1ful, + 0x20212223ul, 0x24252627ul, 0x28292a2bul, 0x2c2d2e2ful, + 0x30313233ul, 0x34353637ul, 0x38393a3bul, 0x3c3d3e3ful, +}; + +/* EdonR256 initial double chaining pipe */ +static const uint32_t i256p2[16] = { + 0x40414243ul, 0x44454647ul, 0x48494a4bul, 0x4c4d4e4ful, + 0x50515253ul, 0x54555657ul, 0x58595a5bul, 0x5c5d5e5ful, + 0x60616263ul, 0x64656667ul, 0x68696a6bul, 0x6c6d6e6ful, + 0x70717273ul, 0x74757677ul, 0x78797a7bul, 0x7c7d7e7ful, +}; + +/* EdonR384 initial double chaining pipe */ +static const uint64_t i384p2[16] = { + 0x0001020304050607ull, 0x08090a0b0c0d0e0full, + 0x1011121314151617ull, 0x18191a1b1c1d1e1full, + 0x2021222324252627ull, 0x28292a2b2c2d2e2full, + 0x3031323334353637ull, 0x38393a3b3c3d3e3full, + 0x4041424344454647ull, 0x48494a4b4c4d4e4full, + 0x5051525354555657ull, 0x58595a5b5c5d5e5full, + 0x6061626364656667ull, 0x68696a6b6c6d6e6full, + 0x7071727374757677ull, 0x78797a7b7c7d7e7full +}; + +/* EdonR512 initial double chaining pipe */ +static const uint64_t i512p2[16] = { + 0x8081828384858687ull, 0x88898a8b8c8d8e8full, + 0x9091929394959697ull, 0x98999a9b9c9d9e9full, + 0xa0a1a2a3a4a5a6a7ull, 0xa8a9aaabacadaeafull, + 0xb0b1b2b3b4b5b6b7ull, 0xb8b9babbbcbdbebfull, + 0xc0c1c2c3c4c5c6c7ull, 0xc8c9cacbcccdcecfull, + 0xd0d1d2d3d4d5d6d7ull, 0xd8d9dadbdcdddedfull, + 0xe0e1e2e3e4e5e6e7ull, 0xe8e9eaebecedeeefull, + 0xf0f1f2f3f4f5f6f7ull, 0xf8f9fafbfcfdfeffull +}; + +/* + * First Latin Square + * 0 7 1 3 2 4 6 5 + * 4 1 7 6 3 0 5 2 + * 7 0 4 2 5 3 1 6 + * 1 4 0 5 6 2 7 3 + * 2 3 6 7 1 5 0 4 + * 5 2 3 1 7 6 4 0 + * 3 6 5 0 4 7 2 1 + * 6 5 2 4 0 1 3 7 + */ +#define LS1_256(c, x0, x1, x2, x3, x4, x5, x6, x7) \ +{ \ + uint32_t x04, x17, x23, x56, x07, x26; \ + x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \ + s0 = c + x07 + x2; \ + s1 = rotl32(x07 + x3, 4); \ + s2 = rotl32(x07 + x6, 8); \ + x23 = x2 + x3; \ + s5 = rotl32(x04 + x23 + x5, 22); \ + x56 = x5 + x6; \ + s6 = rotl32(x17 + x56 + x0, 24); \ + x26 = x23+x56; \ + s3 = rotl32(x26 + x7, 13); \ + s4 = rotl32(x26 + x1, 17); \ + s7 = rotl32(x26 + x4, 29); \ +} + +#define LS1_512(c, x0, x1, x2, x3, x4, x5, x6, x7) \ +{ \ + uint64_t x04, x17, x23, x56, x07, x26; \ + x04 = x0+x4, x17 = x1+x7, x07 = x04+x17; \ + s0 = c + x07 + x2; \ + s1 = rotl64(x07 + x3, 5); \ + s2 = rotl64(x07 + x6, 15); \ + x23 = x2 + x3; \ + s5 = rotl64(x04 + x23 + x5, 40); \ + x56 = x5 + x6; \ + s6 = rotl64(x17 + x56 + x0, 50); \ + x26 = x23+x56; \ 
+ s3 = rotl64(x26 + x7, 22); \ + s4 = rotl64(x26 + x1, 31); \ + s7 = rotl64(x26 + x4, 59); \ +} + +/* + * Second Orthogonal Latin Square + * 0 4 2 3 1 6 5 7 + * 7 6 3 2 5 4 1 0 + * 5 3 1 6 0 2 7 4 + * 1 0 5 4 3 7 2 6 + * 2 1 0 7 4 5 6 3 + * 3 5 7 0 6 1 4 2 + * 4 7 6 1 2 0 3 5 + * 6 2 4 5 7 3 0 1 + */ +#define LS2_256(c, y0, y1, y2, y3, y4, y5, y6, y7) \ +{ \ + uint32_t y01, y25, y34, y67, y04, y05, y27, y37; \ + y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \ + t0 = ~c + y05 + y7; \ + t2 = rotl32(y05 + y3, 9); \ + y34 = y3+y4, y04 = y01+y34; \ + t1 = rotl32(y04 + y6, 5); \ + t4 = rotl32(y04 + y5, 15); \ + y67 = y6+y7, y37 = y34+y67; \ + t3 = rotl32(y37 + y2, 11); \ + t7 = rotl32(y37 + y0, 27); \ + y27 = y25+y67; \ + t5 = rotl32(y27 + y4, 20); \ + t6 = rotl32(y27 + y1, 25); \ +} + +#define LS2_512(c, y0, y1, y2, y3, y4, y5, y6, y7) \ +{ \ + uint64_t y01, y25, y34, y67, y04, y05, y27, y37; \ + y01 = y0+y1, y25 = y2+y5, y05 = y01+y25; \ + t0 = ~c + y05 + y7; \ + t2 = rotl64(y05 + y3, 19); \ + y34 = y3+y4, y04 = y01+y34; \ + t1 = rotl64(y04 + y6, 10); \ + t4 = rotl64(y04 + y5, 36); \ + y67 = y6+y7, y37 = y34+y67; \ + t3 = rotl64(y37 + y2, 29); \ + t7 = rotl64(y37 + y0, 55); \ + y27 = y25+y67; \ + t5 = rotl64(y27 + y4, 44); \ + t6 = rotl64(y27 + y1, 48); \ +} + +#define quasi_exform256(r0, r1, r2, r3, r4, r5, r6, r7) \ +{ \ + uint32_t s04, s17, s23, s56, t01, t25, t34, t67; \ + s04 = s0 ^ s4, t01 = t0 ^ t1; \ + r0 = (s04 ^ s1) + (t01 ^ t5); \ + t67 = t6 ^ t7; \ + r1 = (s04 ^ s7) + (t2 ^ t67); \ + s23 = s2 ^ s3; \ + r7 = (s23 ^ s5) + (t4 ^ t67); \ + t34 = t3 ^ t4; \ + r3 = (s23 ^ s4) + (t0 ^ t34); \ + s56 = s5 ^ s6; \ + r5 = (s3 ^ s56) + (t34 ^ t6); \ + t25 = t2 ^ t5; \ + r6 = (s2 ^ s56) + (t25 ^ t7); \ + s17 = s1 ^ s7; \ + r4 = (s0 ^ s17) + (t1 ^ t25); \ + r2 = (s17 ^ s6) + (t01 ^ t3); \ +} + +#define quasi_exform512(r0, r1, r2, r3, r4, r5, r6, r7) \ +{ \ + uint64_t s04, s17, s23, s56, t01, t25, t34, t67; \ + s04 = s0 ^ s4, t01 = t0 ^ t1; \ + r0 = (s04 ^ s1) + (t01 ^ t5); \ + t67 = t6 ^ t7; \ + r1 = (s04 ^ s7) + (t2 ^ t67); \ + s23 = s2 ^ s3; \ + r7 = (s23 ^ s5) + (t4 ^ t67); \ + t34 = t3 ^ t4; \ + r3 = (s23 ^ s4) + (t0 ^ t34); \ + s56 = s5 ^ s6; \ + r5 = (s3 ^ s56) + (t34 ^ t6); \ + t25 = t2 ^ t5; \ + r6 = (s2 ^ s56) + (t25 ^ t7); \ + s17 = s1 ^ s7; \ + r4 = (s0 ^ s17) + (t1 ^ t25); \ + r2 = (s17 ^ s6) + (t01 ^ t3); \ +} + +static size_t +Q256(size_t bitlen, const uint32_t *data, uint32_t *restrict p) +{ + size_t bl; + + for (bl = bitlen; bl >= EdonR256_BLOCK_BITSIZE; + bl -= EdonR256_BLOCK_BITSIZE, data += 16) { + uint32_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4, + t5, t6, t7; + uint32_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4, + q5, q6, q7; + const uint32_t defix = 0xaaaaaaaa; +#if defined(MACHINE_IS_BIG_ENDIAN) + uint32_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8, + swp9, swp10, swp11, swp12, swp13, swp14, swp15; +#define d(j) swp ## j +#define s32(j) ld_swap32((uint32_t *)data + j, swp ## j) +#else +#define d(j) data[j] +#endif + + /* First row of quasigroup e-transformations */ +#if defined(MACHINE_IS_BIG_ENDIAN) + s32(8); + s32(9); + s32(10); + s32(11); + s32(12); + s32(13); + s32(14); + s32(15); +#endif + LS1_256(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9), + d(8)); +#if defined(MACHINE_IS_BIG_ENDIAN) + s32(0); + s32(1); + s32(2); + s32(3); + s32(4); + s32(5); + s32(6); + s32(7); +#undef s32 +#endif + LS2_256(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7)); + quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7); + + LS1_256(defix, p0, p1, p2, p3, 
p4, p5, p6, p7);
+ LS2_256(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+ d(15));
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Second row of quasigroup e-transformations */
+ LS1_256(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+ p[15]);
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Third row of quasigroup e-transformations */
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Fourth row of quasigroup e-transformations */
+ LS1_256(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+ LS2_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform256(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_256(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_256(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform256(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Edon-R tweak on the original SHA-3 Edon-R submission. */
+ p[0] ^= d(8) ^ p0;
+ p[1] ^= d(9) ^ p1;
+ p[2] ^= d(10) ^ p2;
+ p[3] ^= d(11) ^ p3;
+ p[4] ^= d(12) ^ p4;
+ p[5] ^= d(13) ^ p5;
+ p[6] ^= d(14) ^ p6;
+ p[7] ^= d(15) ^ p7;
+ p[8] ^= d(0) ^ q0;
+ p[9] ^= d(1) ^ q1;
+ p[10] ^= d(2) ^ q2;
+ p[11] ^= d(3) ^ q3;
+ p[12] ^= d(4) ^ q4;
+ p[13] ^= d(5) ^ q5;
+ p[14] ^= d(6) ^ q6;
+ p[15] ^= d(7) ^ q7;
+ }
+
+#undef d
+ return (bitlen - bl);
+}
+
+/*
+ * Why is this #pragma here?
+ *
+ * Checksum functions like this one can go over the stack frame size check
+ * Linux imposes on 32-bit platforms (-Wframe-larger-than=1024). We can
+ * safely ignore the compiler warning since we know that in ZoL the
+ * function will be called from a worker thread that won't be using
+ * much stack. The only function that goes over the 1k limit is Q512(),
+ * which only goes over it by a hair (1248 bytes on ARM32).
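+ *
+ * A narrower, purely illustrative alternative on GCC (not what this
+ * file does) would be to scope the suppression to the Q512()
+ * definition alone:
+ *
+ *	#pragma GCC diagnostic push
+ *	#pragma GCC diagnostic ignored "-Wframe-larger-than="
+ *	... Q512() definition here ...
+ *	#pragma GCC diagnostic pop
+ *
+ * The simpler file-wide form is what follows.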
+ */
+#include <sys/isa_defs.h> /* for _ILP32 */
+#ifdef _ILP32 /* We're 32-bit, assume small stack frames */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+#endif
+
+#if defined(__IBMC__) && defined(_AIX) && defined(__64BIT__)
+static inline size_t
+#else
+static size_t
+#endif
+Q512(size_t bitlen, const uint64_t *data, uint64_t *restrict p)
+{
+ size_t bl;
+
+ for (bl = bitlen; bl >= EdonR512_BLOCK_BITSIZE;
+ bl -= EdonR512_BLOCK_BITSIZE, data += 16) {
+ uint64_t s0, s1, s2, s3, s4, s5, s6, s7, t0, t1, t2, t3, t4,
+ t5, t6, t7;
+ uint64_t p0, p1, p2, p3, p4, p5, p6, p7, q0, q1, q2, q3, q4,
+ q5, q6, q7;
+ const uint64_t defix = 0xaaaaaaaaaaaaaaaaull;
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t swp0, swp1, swp2, swp3, swp4, swp5, swp6, swp7, swp8,
+ swp9, swp10, swp11, swp12, swp13, swp14, swp15;
+#define d(j) swp##j
+#define s64(j) ld_swap64((uint64_t *)data+j, swp##j)
+#else
+#define d(j) data[j]
+#endif
+
+ /* First row of quasigroup e-transformations */
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s64(8);
+ s64(9);
+ s64(10);
+ s64(11);
+ s64(12);
+ s64(13);
+ s64(14);
+ s64(15);
+#endif
+ LS1_512(defix, d(15), d(14), d(13), d(12), d(11), d(10), d(9),
+ d(8));
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ s64(0);
+ s64(1);
+ s64(2);
+ s64(3);
+ s64(4);
+ s64(5);
+ s64(6);
+ s64(7);
+#undef s64
+#endif
+ LS2_512(defix, d(0), d(1), d(2), d(3), d(4), d(5), d(6), d(7));
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, d(8), d(9), d(10), d(11), d(12), d(13), d(14),
+ d(15));
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Second row of quasigroup e-transformations */
+ LS1_512(defix, p[8], p[9], p[10], p[11], p[12], p[13], p[14],
+ p[15]);
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Third row of quasigroup e-transformations */
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, p[0], p[1], p[2], p[3], p[4], p[5], p[6], p[7]);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Fourth row of quasigroup e-transformations */
+ LS1_512(defix, d(7), d(6), d(5), d(4), d(3), d(2), d(1), d(0));
+ LS2_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ quasi_exform512(p0, p1, p2, p3, p4, p5, p6, p7);
+
+ LS1_512(defix, p0, p1, p2, p3, p4, p5, p6, p7);
+ LS2_512(defix, q0, q1, q2, q3, q4, q5, q6, q7);
+ quasi_exform512(q0, q1, q2, q3, q4, q5, q6, q7);
+
+ /* Edon-R tweak on the original SHA-3 Edon-R submission.
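+ *
+ * In shorthand, the double pipe p[] absorbs both the message
+ * words and the last quasigroup row, i.e. for i = 0..7:
+ *
+ *	p[i] ^= d(i + 8) ^ pi;	(pi = local p0..p7)
+ *	p[i + 8] ^= d(i) ^ qi;	(qi = local q0..q7)
+ *
+ * which is exactly the unrolled code below.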
*/ + p[0] ^= d(8) ^ p0; + p[1] ^= d(9) ^ p1; + p[2] ^= d(10) ^ p2; + p[3] ^= d(11) ^ p3; + p[4] ^= d(12) ^ p4; + p[5] ^= d(13) ^ p5; + p[6] ^= d(14) ^ p6; + p[7] ^= d(15) ^ p7; + p[8] ^= d(0) ^ q0; + p[9] ^= d(1) ^ q1; + p[10] ^= d(2) ^ q2; + p[11] ^= d(3) ^ q3; + p[12] ^= d(4) ^ q4; + p[13] ^= d(5) ^ q5; + p[14] ^= d(6) ^ q6; + p[15] ^= d(7) ^ q7; + } + +#undef d + return (bitlen - bl); +} + +void +EdonRInit(EdonRState *state, size_t hashbitlen) +{ + ASSERT(EDONR_VALID_HASHBITLEN(hashbitlen)); + switch (hashbitlen) { + case 224: + state->hashbitlen = 224; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i224p2, hashState224(state)->DoublePipe, + 16 * sizeof (uint32_t)); + break; + + case 256: + state->hashbitlen = 256; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i256p2, hashState256(state)->DoublePipe, + 16 * sizeof (uint32_t)); + break; + + case 384: + state->hashbitlen = 384; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i384p2, hashState384(state)->DoublePipe, + 16 * sizeof (uint64_t)); + break; + + case 512: + state->hashbitlen = 512; + state->bits_processed = 0; + state->unprocessed_bits = 0; + bcopy(i512p2, hashState224(state)->DoublePipe, + 16 * sizeof (uint64_t)); + break; + } +} + + +void +EdonRUpdate(EdonRState *state, const uint8_t *data, size_t databitlen) +{ + uint32_t *data32; + uint64_t *data64; + + size_t bits_processed; + + ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen)); + switch (state->hashbitlen) { + case 224: + case 256: + if (state->unprocessed_bits > 0) { + /* LastBytes = databitlen / 8 */ + int LastBytes = (int)databitlen >> 3; + + ASSERT(state->unprocessed_bits + databitlen <= + EdonR256_BLOCK_SIZE * 8); + + bcopy(data, hashState256(state)->LastPart + + (state->unprocessed_bits >> 3), LastBytes); + state->unprocessed_bits += (int)databitlen; + databitlen = state->unprocessed_bits; + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data32 = (uint32_t *)hashState256(state)->LastPart; + } else + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data32 = (uint32_t *)data; + + bits_processed = Q256(databitlen, data32, + hashState256(state)->DoublePipe); + state->bits_processed += bits_processed; + databitlen -= bits_processed; + state->unprocessed_bits = (int)databitlen; + if (databitlen > 0) { + /* LastBytes = Ceil(databitlen / 8) */ + int LastBytes = + ((~(((-(int)databitlen) >> 3) & 0x01ff)) + + 1) & 0x01ff; + + data32 += bits_processed >> 5; /* byte size update */ + bcopy(data32, hashState256(state)->LastPart, LastBytes); + } + break; + + case 384: + case 512: + if (state->unprocessed_bits > 0) { + /* LastBytes = databitlen / 8 */ + int LastBytes = (int)databitlen >> 3; + + ASSERT(state->unprocessed_bits + databitlen <= + EdonR512_BLOCK_SIZE * 8); + + bcopy(data, hashState512(state)->LastPart + + (state->unprocessed_bits >> 3), LastBytes); + state->unprocessed_bits += (int)databitlen; + databitlen = state->unprocessed_bits; + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)hashState512(state)->LastPart; + } else + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)data; + + bits_processed = Q512(databitlen, data64, + hashState512(state)->DoublePipe); + state->bits_processed += bits_processed; + databitlen -= bits_processed; + state->unprocessed_bits = (int)databitlen; + if (databitlen > 0) { + /* LastBytes = Ceil(databitlen / 8) */ + int LastBytes = + ((~(((-(int)databitlen) >> 3) & 0x03ff)) + + 1) & 0x03ff; + + data64 += bits_processed >> 6; /* byte size update */ + bcopy(data64, 
hashState512(state)->LastPart, LastBytes); + } + break; + } +} + +void +EdonRFinal(EdonRState *state, uint8_t *hashval) +{ + uint32_t *data32; + uint64_t *data64, num_bits; + + size_t databitlen; + int LastByte, PadOnePosition; + + num_bits = state->bits_processed + state->unprocessed_bits; + ASSERT(EDONR_VALID_HASHBITLEN(state->hashbitlen)); + switch (state->hashbitlen) { + case 224: + case 256: + LastByte = (int)state->unprocessed_bits >> 3; + PadOnePosition = 7 - (state->unprocessed_bits & 0x07); + hashState256(state)->LastPart[LastByte] = + (hashState256(state)->LastPart[LastByte] + & (0xff << (PadOnePosition + 1))) ^ + (0x01 << PadOnePosition); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)hashState256(state)->LastPart; + + if (state->unprocessed_bits < 448) { + (void) memset((hashState256(state)->LastPart) + + LastByte + 1, 0x00, + EdonR256_BLOCK_SIZE - LastByte - 9); + databitlen = EdonR256_BLOCK_SIZE * 8; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 7); +#else + data64[7] = num_bits; +#endif + } else { + (void) memset((hashState256(state)->LastPart) + + LastByte + 1, 0x00, + EdonR256_BLOCK_SIZE * 2 - LastByte - 9); + databitlen = EdonR256_BLOCK_SIZE * 16; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 15); +#else + data64[15] = num_bits; +#endif + } + + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data32 = (uint32_t *)hashState256(state)->LastPart; + state->bits_processed += Q256(databitlen, data32, + hashState256(state)->DoublePipe); + break; + + case 384: + case 512: + LastByte = (int)state->unprocessed_bits >> 3; + PadOnePosition = 7 - (state->unprocessed_bits & 0x07); + hashState512(state)->LastPart[LastByte] = + (hashState512(state)->LastPart[LastByte] + & (0xff << (PadOnePosition + 1))) ^ + (0x01 << PadOnePosition); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + data64 = (uint64_t *)hashState512(state)->LastPart; + + if (state->unprocessed_bits < 960) { + (void) memset((hashState512(state)->LastPart) + + LastByte + 1, 0x00, + EdonR512_BLOCK_SIZE - LastByte - 9); + databitlen = EdonR512_BLOCK_SIZE * 8; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 15); +#else + data64[15] = num_bits; +#endif + } else { + (void) memset((hashState512(state)->LastPart) + + LastByte + 1, 0x00, + EdonR512_BLOCK_SIZE * 2 - LastByte - 9); + databitlen = EdonR512_BLOCK_SIZE * 16; +#if defined(MACHINE_IS_BIG_ENDIAN) + st_swap64(num_bits, data64 + 31); +#else + data64[31] = num_bits; +#endif + } + + state->bits_processed += Q512(databitlen, data64, + hashState512(state)->DoublePipe); + break; + } + + switch (state->hashbitlen) { + case 224: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint32_t *d32 = (uint32_t *)hashval; + uint32_t *s32 = hashState224(state)->DoublePipe + 9; + int j; + + for (j = 0; j < EdonR224_DIGEST_SIZE >> 2; j++) + st_swap32(s32[j], d32 + j); +#else + bcopy(hashState256(state)->DoublePipe + 9, hashval, + EdonR224_DIGEST_SIZE); +#endif + break; + } + case 256: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint32_t *d32 = (uint32_t *)hashval; + uint32_t *s32 = hashState224(state)->DoublePipe + 8; + int j; + + for (j = 0; j < EdonR256_DIGEST_SIZE >> 2; j++) + st_swap32(s32[j], d32 + j); +#else + bcopy(hashState256(state)->DoublePipe + 8, hashval, + EdonR256_DIGEST_SIZE); +#endif + break; + } + case 384: { +#if defined(MACHINE_IS_BIG_ENDIAN) + uint64_t *d64 = (uint64_t *)hashval; + uint64_t *s64 = hashState384(state)->DoublePipe + 10; + int j; + + for (j = 0; j < EdonR384_DIGEST_SIZE >> 3; j++) + st_swap64(s64[j], d64 + j); +#else + 
bcopy(hashState384(state)->DoublePipe + 10, hashval,
+ EdonR384_DIGEST_SIZE);
+#endif
+ break;
+ }
+ case 512: {
+#if defined(MACHINE_IS_BIG_ENDIAN)
+ uint64_t *d64 = (uint64_t *)hashval;
+ uint64_t *s64 = hashState512(state)->DoublePipe + 8;
+ int j;
+
+ for (j = 0; j < EdonR512_DIGEST_SIZE >> 3; j++)
+ st_swap64(s64[j], d64 + j);
+#else
+ bcopy(hashState512(state)->DoublePipe + 8, hashval,
+ EdonR512_DIGEST_SIZE);
+#endif
+ break;
+ }
+ }
+}
+
+
+void
+EdonRHash(size_t hashbitlen, const uint8_t *data, size_t databitlen,
+ uint8_t *hashval)
+{
+ EdonRState state;
+
+ EdonRInit(&state, hashbitlen);
+ EdonRUpdate(&state, data, databitlen);
+ EdonRFinal(&state, hashval);
+}
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(EdonRInit);
+EXPORT_SYMBOL(EdonRUpdate);
+EXPORT_SYMBOL(EdonRHash);
+EXPORT_SYMBOL(EdonRFinal);
+#endif
diff --git a/module/icp/algs/edonr/edonr_byteorder.h b/module/icp/algs/edonr/edonr_byteorder.h
new file mode 100644
index 000000000000..2b5d48287f26
--- /dev/null
+++ b/module/icp/algs/edonr/edonr_byteorder.h
@@ -0,0 +1,216 @@
+/*
+ * IDI,NTNU
+ *
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://opensource.org/licenses/CDDL-1.0.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ *
+ * Copyright (C) 2009, 2010, Jorn Amundsen
+ *
+ * C header file to determine compile machine byte order. Take care when cross
+ * compiling.
+ *
+ * $Id: byteorder.h 517 2013-02-17 20:34:39Z joern $
+ */
+/*
+ * Portions copyright (c) 2013, Saso Kiselkov, All rights reserved
+ */
+
+#ifndef _CRYPTO_EDONR_BYTEORDER_H
+#define _CRYPTO_EDONR_BYTEORDER_H
+
+#include <sys/sysmacros.h>
+#include <sys/param.h>
+
+#if defined(__BYTE_ORDER)
+#if (__BYTE_ORDER == __BIG_ENDIAN)
+#define MACHINE_IS_BIG_ENDIAN
+#elif (__BYTE_ORDER == __LITTLE_ENDIAN)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#elif defined(BYTE_ORDER)
+#if (BYTE_ORDER == BIG_ENDIAN)
+#define MACHINE_IS_BIG_ENDIAN
+#elif (BYTE_ORDER == LITTLE_ENDIAN)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* __BYTE_ORDER || BYTE_ORDER */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#if defined(_ZFS_BIG_ENDIAN) || defined(_MIPSEB)
+#define MACHINE_IS_BIG_ENDIAN
+#endif
+#if defined(_ZFS_LITTLE_ENDIAN) || defined(_MIPSEL)
+#define MACHINE_IS_LITTLE_ENDIAN
+#endif
+#endif /* !MACHINE_IS_BIG_ENDIAN && !MACHINE_IS_LITTLE_ENDIAN */
+
+#if !defined(MACHINE_IS_BIG_ENDIAN) && !defined(MACHINE_IS_LITTLE_ENDIAN)
+#error unknown machine byte sex
+#endif
+
+#define BYTEORDER_INCLUDED
+
+#if defined(MACHINE_IS_BIG_ENDIAN)
+/*
+ * Byte swapping macros for big endian architectures and compilers,
+ * add as appropriate for other architectures and/or compilers.
+ * + * ld_swap64(src,dst) : uint64_t dst = *(src) + * st_swap64(src,dst) : *(dst) = uint64_t src + */ + +#if defined(__PPC__) || defined(_ARCH_PPC) + +#if defined(__64BIT__) +#if defined(_ARCH_PWR7) +#define aix_ld_swap64(s64, d64)\ + __asm__("ldbrx %0,0,%1" : "=r"(d64) : "r"(s64)) +#define aix_st_swap64(s64, d64)\ + __asm__ volatile("stdbrx %1,0,%0" : : "r"(d64), "r"(s64)) +#else +#define aix_ld_swap64(s64, d64) \ +{ \ + uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \ + \ + __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0;rldimi %1,%2,32,0"\ + : "+r"(s4), "=r"(d64), "=r"(h) : "b"(s64)); \ +} + +#define aix_st_swap64(s64, d64) \ +{ \ + uint64_t *s4 = 0, h; /* initialize to zero for gcc warning */ \ + h = (s64) >> 32; \ + __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \ + : "+r"(s4) : "r"(s64), "r"(h), "b"(d64)); \ +} +#endif /* 64BIT && PWR7 */ +#else +#define aix_ld_swap64(s64, d64) \ +{ \ + uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\ + __asm__("addi %0,%3,4;lwbrx %1,0,%3;lwbrx %2,0,%0" \ + : "+r"(s4), "=r"(l), "=r"(h) : "b"(s64)); \ + d64 = ((uint64_t)h<<32) | l; \ +} + +#define aix_st_swap64(s64, d64) \ +{ \ + uint32_t *s4 = 0, h, l; /* initialize to zero for gcc warning */\ + l = (s64) & 0xfffffffful, h = (s64) >> 32; \ + __asm__ volatile("addi %0,%3,4;stwbrx %1,0,%3;stwbrx %2,0,%0" \ + : "+r"(s4) : "r"(l), "r"(h), "b"(d64)); \ +} +#endif /* __64BIT__ */ +#define aix_ld_swap32(s32, d32)\ + __asm__("lwbrx %0,0,%1" : "=r"(d32) : "r"(s32)) +#define aix_st_swap32(s32, d32)\ + __asm__ volatile("stwbrx %1,0,%0" : : "r"(d32), "r"(s32)) +#define ld_swap32(s, d) aix_ld_swap32(s, d) +#define st_swap32(s, d) aix_st_swap32(s, d) +#define ld_swap64(s, d) aix_ld_swap64(s, d) +#define st_swap64(s, d) aix_st_swap64(s, d) +#endif /* __PPC__ || _ARCH_PPC */ + +#if defined(__sparc) +#if !defined(__arch64__) && !defined(__sparcv8) && defined(__sparcv9) +#define __arch64__ +#endif +#if defined(__GNUC__) || (defined(__SUNPRO_C) && __SUNPRO_C > 0x590) +/* need Sun Studio C 5.10 and above for GNU inline assembly */ +#if defined(__arch64__) +#define sparc_ld_swap64(s64, d64) \ + __asm__("ldxa [%1]0x88,%0" : "=r"(d64) : "r"(s64)) +#define sparc_st_swap64(s64, d64) \ + __asm__ volatile("stxa %0,[%1]0x88" : : "r"(s64), "r"(d64)) +#define st_swap64(s, d) sparc_st_swap64(s, d) +#else +#define sparc_ld_swap64(s64, d64) \ +{ \ + uint32_t *s4, h, l; \ + __asm__("add %3,4,%0\n\tlda [%3]0x88,%1\n\tlda [%0]0x88,%2" \ + : "+r"(s4), "=r"(l), "=r"(h) : "r"(s64)); \ + d64 = ((uint64_t)h<<32) | l; \ +} +#define sparc_st_swap64(s64, d64) \ +{ \ + uint32_t *s4, h, l; \ + l = (s64) & 0xfffffffful, h = (s64) >> 32; \ + __asm__ volatile("add %3,4,%0\n\tsta %1,[%3]0x88\n\tsta %2,[%0]0x88"\ + : "+r"(s4) : "r"(l), "r"(h), "r"(d64)); \ +} +#endif /* sparc64 */ +#define sparc_ld_swap32(s32, d32)\ + __asm__("lda [%1]0x88,%0" : "=r"(d32) : "r"(s32)) +#define sparc_st_swap32(s32, d32)\ + __asm__ volatile("sta %0,[%1]0x88" : : "r"(s32), "r"(d32)) +#define ld_swap32(s, d) sparc_ld_swap32(s, d) +#define st_swap32(s, d) sparc_st_swap32(s, d) +#define ld_swap64(s, d) sparc_ld_swap64(s, d) +#define st_swap64(s, d) sparc_st_swap64(s, d) +#endif /* GCC || Sun Studio C > 5.9 */ +#endif /* sparc */ + +/* GCC fallback */ +#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap32) +#define ld_swap32(s, d) (d = __builtin_bswap32(*(s))) +#define st_swap32(s, d) (*(d) = __builtin_bswap32(s)) +#endif /* GCC4/PGIC && !swap32 */ +#if ((__GNUC__ >= 4) || defined(__PGIC__)) && !defined(ld_swap64) 
+#define ld_swap64(s, d) (d = __builtin_bswap64(*(s)))
+#define st_swap64(s, d) (*(d) = __builtin_bswap64(s))
+#endif /* GCC4/PGIC && !swap64 */
+
+/* generic fallback */
+#if !defined(ld_swap32)
+#define ld_swap32(s, d) \
+ (d = (*(s) >> 24) | (*(s) >> 8 & 0xff00) | \
+ (*(s) << 8 & 0xff0000) | (*(s) << 24))
+#define st_swap32(s, d) \
+ (*(d) = ((s) >> 24) | ((s) >> 8 & 0xff00) | \
+ ((s) << 8 & 0xff0000) | ((s) << 24))
+#endif
+#if !defined(ld_swap64)
+#define ld_swap64(s, d) \
+ (d = (*(s) >> 56) | (*(s) >> 40 & 0xff00) | \
+ (*(s) >> 24 & 0xff0000) | (*(s) >> 8 & 0xff000000) | \
+ (*(s) & 0xff000000) << 8 | (*(s) & 0xff0000) << 24 | \
+ (*(s) & 0xff00) << 40 | *(s) << 56)
+#define st_swap64(s, d) \
+ (*(d) = ((s) >> 56) | ((s) >> 40 & 0xff00) | \
+ ((s) >> 24 & 0xff0000) | ((s) >> 8 & 0xff000000) | \
+ ((s) & 0xff000000) << 8 | ((s) & 0xff0000) << 24 | \
+ ((s) & 0xff00) << 40 | (s) << 56)
+#endif
+
+#endif /* MACHINE_IS_BIG_ENDIAN */
+
+
+#if defined(MACHINE_IS_LITTLE_ENDIAN)
+/* replace swaps with simple assignments on little endian systems */
+#undef ld_swap32
+#undef st_swap32
+#define ld_swap32(s, d) (d = *(s))
+#define st_swap32(s, d) (*(d) = s)
+#undef ld_swap64
+#undef st_swap64
+#define ld_swap64(s, d) (d = *(s))
+#define st_swap64(s, d) (*(d) = s)
+#endif /* MACHINE_IS_LITTLE_ENDIAN */
+
+#endif /* _CRYPTO_EDONR_BYTEORDER_H */
diff --git a/module/icp/algs/modes/cbc.c b/module/icp/algs/modes/cbc.c
new file mode 100644
index 000000000000..85864f56dead
--- /dev/null
+++ b/module/icp/algs/modes/cbc.c
@@ -0,0 +1,273 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Algorithm independent CBC functions.
+ */
+int
+cbc_encrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*encrypt)(const void *, const uint8_t *, uint8_t *),
+ void (*copy_block)(uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+
+ if (length + ctx->cbc_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len,
+ length);
+ ctx->cbc_remainder_len += length;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->cbc_iv;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call.
*/ + if (ctx->cbc_remainder_len > 0) { + need = block_size - ctx->cbc_remainder_len; + + if (need > remainder) + return (CRYPTO_DATA_LEN_RANGE); + + bcopy(datap, &((uint8_t *)ctx->cbc_remainder) + [ctx->cbc_remainder_len], need); + + blockp = (uint8_t *)ctx->cbc_remainder; + } else { + blockp = datap; + } + + /* + * XOR the previous cipher block or IV with the + * current clear block. + */ + xor_block(blockp, lastp); + encrypt(ctx->cbc_keysched, lastp, lastp); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); + + /* copy block to where it belongs */ + if (out_data_1_len == block_size) { + copy_block(lastp, out_data_1); + } else { + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, + out_data_2, + block_size - out_data_1_len); + } + } + /* update offset */ + out->cd_offset += block_size; + + /* Update pointer to next block of data to be processed. */ + if (ctx->cbc_remainder_len != 0) { + datap += need; + ctx->cbc_remainder_len = 0; + } else { + datap += block_size; + } + + remainder = (size_t)&data[length] - (size_t)datap; + + /* Incomplete last block. */ + if (remainder > 0 && remainder < block_size) { + bcopy(datap, ctx->cbc_remainder, remainder); + ctx->cbc_remainder_len = remainder; + ctx->cbc_copy_to = datap; + goto out; + } + ctx->cbc_copy_to = NULL; + + } while (remainder > 0); + +out: + /* + * Save the last encrypted block in the context. + */ + if (ctx->cbc_lastp != NULL) { + copy_block((uint8_t *)ctx->cbc_lastp, (uint8_t *)ctx->cbc_iv); + ctx->cbc_lastp = (uint8_t *)ctx->cbc_iv; + } + + return (CRYPTO_SUCCESS); +} + +#define OTHER(a, ctx) \ + (((a) == (ctx)->cbc_lastblock) ? (ctx)->cbc_iv : (ctx)->cbc_lastblock) + +/* ARGSUSED */ +int +cbc_decrypt_contiguous_blocks(cbc_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*decrypt)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; + uint8_t *blockp; + uint8_t *lastp; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; + + if (length + ctx->cbc_remainder_len < block_size) { + /* accumulate bytes here and return */ + bcopy(datap, + (uint8_t *)ctx->cbc_remainder + ctx->cbc_remainder_len, + length); + ctx->cbc_remainder_len += length; + ctx->cbc_copy_to = datap; + return (CRYPTO_SUCCESS); + } + + lastp = ctx->cbc_lastp; + crypto_init_ptrs(out, &iov_or_mp, &offset); + + do { + /* Unprocessed data from last call. */ + if (ctx->cbc_remainder_len > 0) { + need = block_size - ctx->cbc_remainder_len; + + if (need > remainder) + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + + bcopy(datap, &((uint8_t *)ctx->cbc_remainder) + [ctx->cbc_remainder_len], need); + + blockp = (uint8_t *)ctx->cbc_remainder; + } else { + blockp = datap; + } + + /* LINTED: pointer alignment */ + copy_block(blockp, (uint8_t *)OTHER((uint64_t *)lastp, ctx)); + + decrypt(ctx->cbc_keysched, blockp, + (uint8_t *)ctx->cbc_remainder); + blockp = (uint8_t *)ctx->cbc_remainder; + + /* + * XOR the previous cipher block or IV with the + * currently decrypted block. 
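+ *
+ * In equation form, with D the block cipher decryption and
+ * C[-1] the IV:
+ *
+ *	P[i] = D(key, C[i]) ^ C[i-1]
+ *
+ * The OTHER() buffer rotation above keeps a copy of C[i] around
+ * so this XOR still sees the previous ciphertext block even when
+ * the caller decrypts in place.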
+ */
+ xor_block(lastp, blockp);
+
+ /* LINTED: pointer alignment */
+ lastp = (uint8_t *)OTHER((uint64_t *)lastp, ctx);
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ bcopy(blockp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(blockp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed. */
+ if (ctx->cbc_remainder_len != 0) {
+ datap += need;
+ ctx->cbc_remainder_len = 0;
+ } else {
+ datap += block_size;
+ }
+
+ remainder = (size_t)&data[length] - (size_t)datap;
+
+ /* Incomplete last block. */
+ if (remainder > 0 && remainder < block_size) {
+ bcopy(datap, ctx->cbc_remainder, remainder);
+ ctx->cbc_remainder_len = remainder;
+ ctx->cbc_lastp = lastp;
+ ctx->cbc_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+ ctx->cbc_copy_to = NULL;
+
+ } while (remainder > 0);
+
+ ctx->cbc_lastp = lastp;
+ return (CRYPTO_SUCCESS);
+}
+
+int
+cbc_init_ctx(cbc_ctx_t *cbc_ctx, char *param, size_t param_len,
+ size_t block_size, void (*copy_block)(uint8_t *, uint64_t *))
+{
+ /*
+ * Copy IV into context.
+ *
+ * If cm_param == NULL then the IV comes from the
+ * cd_miscdata field in the crypto_data structure.
+ */
+ if (param != NULL) {
+ ASSERT(param_len == block_size);
+ copy_block((uchar_t *)param, cbc_ctx->cbc_iv);
+ }
+
+ cbc_ctx->cbc_lastp = (uint8_t *)&cbc_ctx->cbc_iv[0];
+ cbc_ctx->cbc_flags |= CBC_MODE;
+ return (CRYPTO_SUCCESS);
+}
+
+/* ARGSUSED */
+void *
+cbc_alloc_ctx(int kmflag)
+{
+ cbc_ctx_t *cbc_ctx;
+
+ if ((cbc_ctx = kmem_zalloc(sizeof (cbc_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ cbc_ctx->cbc_flags = CBC_MODE;
+ return (cbc_ctx);
+}
diff --git a/module/icp/algs/modes/ccm.c b/module/icp/algs/modes/ccm.c
new file mode 100644
index 000000000000..5d6507c49db1
--- /dev/null
+++ b/module/icp/algs/modes/ccm.c
@@ -0,0 +1,907 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS
+#include <sys/byteorder.h>
+#define UNALIGNED_POINTERS_PERMITTED
+#endif
+
+/*
+ * Encrypt multiple blocks of data in CCM mode. Decrypt for CCM mode
+ * is done in another function.
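+ *
+ * As a sketch of the loop below (E is the block cipher, ctr the
+ * counter block, B one block_size chunk of plaintext):
+ *
+ *	mac_buf = E(key, mac_buf ^ B);		CBC-MAC half
+ *	C = B ^ E(key, ctr); ctr = ctr + 1;	CTR half
+ *
+ * Both halves run per block; the MAC itself is emitted by
+ * ccm_encrypt_final().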
+ */ +int +ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; + uint8_t *blockp; + uint8_t *lastp; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; + uint64_t counter; + uint8_t *mac_buf; + + if (length + ctx->ccm_remainder_len < block_size) { + /* accumulate bytes here and return */ + bcopy(datap, + (uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len, + length); + ctx->ccm_remainder_len += length; + ctx->ccm_copy_to = datap; + return (CRYPTO_SUCCESS); + } + + lastp = (uint8_t *)ctx->ccm_cb; + crypto_init_ptrs(out, &iov_or_mp, &offset); + + mac_buf = (uint8_t *)ctx->ccm_mac_buf; + + do { + /* Unprocessed data from last call. */ + if (ctx->ccm_remainder_len > 0) { + need = block_size - ctx->ccm_remainder_len; + + if (need > remainder) + return (CRYPTO_DATA_LEN_RANGE); + + bcopy(datap, &((uint8_t *)ctx->ccm_remainder) + [ctx->ccm_remainder_len], need); + + blockp = (uint8_t *)ctx->ccm_remainder; + } else { + blockp = datap; + } + + /* + * do CBC MAC + * + * XOR the previous cipher block current clear block. + * mac_buf always contain previous cipher block. + */ + xor_block(blockp, mac_buf); + encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); + + /* ccm_cb is the counter block */ + encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, + (uint8_t *)ctx->ccm_tmp); + + lastp = (uint8_t *)ctx->ccm_tmp; + + /* + * Increment counter. Counter bits are confined + * to the bottom 64 bits of the counter block. + */ +#ifdef _ZFS_LITTLE_ENDIAN + counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask); + counter = htonll(counter + 1); +#else + counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask; + counter++; +#endif /* _ZFS_LITTLE_ENDIAN */ + counter &= ctx->ccm_counter_mask; + ctx->ccm_cb[1] = + (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; + + /* + * XOR encrypted counter block with the current clear block. + */ + xor_block(blockp, lastp); + + ctx->ccm_processed_data_len += block_size; + + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); + + /* copy block to where it belongs */ + if (out_data_1_len == block_size) { + copy_block(lastp, out_data_1); + } else { + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, + out_data_2, + block_size - out_data_1_len); + } + } + /* update offset */ + out->cd_offset += block_size; + + /* Update pointer to next block of data to be processed. */ + if (ctx->ccm_remainder_len != 0) { + datap += need; + ctx->ccm_remainder_len = 0; + } else { + datap += block_size; + } + + remainder = (size_t)&data[length] - (size_t)datap; + + /* Incomplete last block. 
*/ + if (remainder > 0 && remainder < block_size) { + bcopy(datap, ctx->ccm_remainder, remainder); + ctx->ccm_remainder_len = remainder; + ctx->ccm_copy_to = datap; + goto out; + } + ctx->ccm_copy_to = NULL; + + } while (remainder > 0); + +out: + return (CRYPTO_SUCCESS); +} + +void +calculate_ccm_mac(ccm_ctx_t *ctx, uint8_t *ccm_mac, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) +{ + uint64_t counter; + uint8_t *counterp, *mac_buf; + int i; + + mac_buf = (uint8_t *)ctx->ccm_mac_buf; + + /* first counter block start with index 0 */ + counter = 0; + ctx->ccm_cb[1] = (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; + + counterp = (uint8_t *)ctx->ccm_tmp; + encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp); + + /* calculate XOR of MAC with first counter block */ + for (i = 0; i < ctx->ccm_mac_len; i++) { + ccm_mac[i] = mac_buf[i] ^ counterp[i]; + } +} + +/* ARGSUSED */ +int +ccm_encrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + uint8_t *lastp, *mac_buf, *ccm_mac_p, *macp = NULL; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; + int i; + + if (out->cd_length < (ctx->ccm_remainder_len + ctx->ccm_mac_len)) { + return (CRYPTO_DATA_LEN_RANGE); + } + + /* + * When we get here, the number of bytes of payload processed + * plus whatever data remains, if any, + * should be the same as the number of bytes that's being + * passed in the argument during init time. + */ + if ((ctx->ccm_processed_data_len + ctx->ccm_remainder_len) + != (ctx->ccm_data_len)) { + return (CRYPTO_DATA_LEN_RANGE); + } + + mac_buf = (uint8_t *)ctx->ccm_mac_buf; + + if (ctx->ccm_remainder_len > 0) { + + /* ccm_mac_input_buf is not used for encryption */ + macp = (uint8_t *)ctx->ccm_mac_input_buf; + bzero(macp, block_size); + + /* copy remainder to temporary buffer */ + bcopy(ctx->ccm_remainder, macp, ctx->ccm_remainder_len); + + /* calculate the CBC MAC */ + xor_block(macp, mac_buf); + encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); + + /* calculate the counter mode */ + lastp = (uint8_t *)ctx->ccm_tmp; + encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, lastp); + + /* XOR with counter block */ + for (i = 0; i < ctx->ccm_remainder_len; i++) { + macp[i] ^= lastp[i]; + } + ctx->ccm_processed_data_len += ctx->ccm_remainder_len; + } + + /* Calculate the CCM MAC */ + ccm_mac_p = (uint8_t *)ctx->ccm_tmp; + calculate_ccm_mac(ctx, ccm_mac_p, encrypt_block); + + crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, + ctx->ccm_remainder_len + ctx->ccm_mac_len); + + if (ctx->ccm_remainder_len > 0) { + + /* copy temporary block to where it belongs */ + if (out_data_2 == NULL) { + /* everything will fit in out_data_1 */ + bcopy(macp, out_data_1, ctx->ccm_remainder_len); + bcopy(ccm_mac_p, out_data_1 + ctx->ccm_remainder_len, + ctx->ccm_mac_len); + } else { + + if (out_data_1_len < ctx->ccm_remainder_len) { + + size_t data_2_len_used; + + bcopy(macp, out_data_1, out_data_1_len); + + data_2_len_used = ctx->ccm_remainder_len + - out_data_1_len; + + bcopy((uint8_t *)macp + out_data_1_len, + out_data_2, data_2_len_used); + bcopy(ccm_mac_p, out_data_2 + data_2_len_used, + ctx->ccm_mac_len); + } else { + bcopy(macp, out_data_1, out_data_1_len); + if (out_data_1_len == ctx->ccm_remainder_len) { + /* mac will be in out_data_2 */ + 
bcopy(ccm_mac_p, out_data_2, + ctx->ccm_mac_len); + } else { + size_t len_not_used = out_data_1_len - + ctx->ccm_remainder_len; + /* + * part of mac in will be in + * out_data_1, part of the mac will be + * in out_data_2 + */ + bcopy(ccm_mac_p, + out_data_1 + ctx->ccm_remainder_len, + len_not_used); + bcopy(ccm_mac_p + len_not_used, + out_data_2, + ctx->ccm_mac_len - len_not_used); + + } + } + } + } else { + /* copy block to where it belongs */ + bcopy(ccm_mac_p, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(ccm_mac_p + out_data_1_len, out_data_2, + block_size - out_data_1_len); + } + } + out->cd_offset += ctx->ccm_remainder_len + ctx->ccm_mac_len; + ctx->ccm_remainder_len = 0; + return (CRYPTO_SUCCESS); +} + +/* + * This will only deal with decrypting the last block of the input that + * might not be a multiple of block length. + */ +static void +ccm_decrypt_incomplete_block(ccm_ctx_t *ctx, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) +{ + uint8_t *datap, *outp, *counterp; + int i; + + datap = (uint8_t *)ctx->ccm_remainder; + outp = &((ctx->ccm_pt_buf)[ctx->ccm_processed_data_len]); + + counterp = (uint8_t *)ctx->ccm_tmp; + encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, counterp); + + /* XOR with counter block */ + for (i = 0; i < ctx->ccm_remainder_len; i++) { + outp[i] = datap[i] ^ counterp[i]; + } +} + +/* + * This will decrypt the cipher text. However, the plaintext won't be + * returned to the caller. It will be returned when decrypt_final() is + * called if the MAC matches + */ +/* ARGSUSED */ +int +ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; + uint8_t *blockp; + uint8_t *cbp; + uint64_t counter; + size_t pt_len, total_decrypted_len, mac_len, pm_len, pd_len; + uint8_t *resultp; + + + pm_len = ctx->ccm_processed_mac_len; + + if (pm_len > 0) { + uint8_t *tmp; + /* + * all ciphertext has been processed, just waiting for + * part of the value of the mac + */ + if ((pm_len + length) > ctx->ccm_mac_len) { + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + } + tmp = (uint8_t *)ctx->ccm_mac_input_buf; + + bcopy(datap, tmp + pm_len, length); + + ctx->ccm_processed_mac_len += length; + return (CRYPTO_SUCCESS); + } + + /* + * If we decrypt the given data, what total amount of data would + * have been decrypted? + */ + pd_len = ctx->ccm_processed_data_len; + total_decrypted_len = pd_len + length + ctx->ccm_remainder_len; + + if (total_decrypted_len > + (ctx->ccm_data_len + ctx->ccm_mac_len)) { + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + } + + pt_len = ctx->ccm_data_len; + + if (total_decrypted_len > pt_len) { + /* + * part of the input will be the MAC, need to isolate that + * to be dealt with later. The left-over data in + * ccm_remainder_len from last time will not be part of the + * MAC. Otherwise, it would have already been taken out + * when this call is made last time. 
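+ *
+ * A worked example with hypothetical sizes: ccm_data_len = 32,
+ * 16 bytes already decrypted (pd_len = 16), no remainder, and a
+ * 24-byte call. Then pt_part = 32 - 16 - 0 = 16, so the last
+ * mac_len = 24 - 16 = 8 bytes of this call are stashed in
+ * ccm_mac_input_buf rather than decrypted.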
+ */ + size_t pt_part = pt_len - pd_len - ctx->ccm_remainder_len; + + mac_len = length - pt_part; + + ctx->ccm_processed_mac_len = mac_len; + bcopy(data + pt_part, ctx->ccm_mac_input_buf, mac_len); + + if (pt_part + ctx->ccm_remainder_len < block_size) { + /* + * since this is last of the ciphertext, will + * just decrypt with it here + */ + bcopy(datap, &((uint8_t *)ctx->ccm_remainder) + [ctx->ccm_remainder_len], pt_part); + ctx->ccm_remainder_len += pt_part; + ccm_decrypt_incomplete_block(ctx, encrypt_block); + ctx->ccm_processed_data_len += ctx->ccm_remainder_len; + ctx->ccm_remainder_len = 0; + return (CRYPTO_SUCCESS); + } else { + /* let rest of the code handle this */ + length = pt_part; + } + } else if (length + ctx->ccm_remainder_len < block_size) { + /* accumulate bytes here and return */ + bcopy(datap, + (uint8_t *)ctx->ccm_remainder + ctx->ccm_remainder_len, + length); + ctx->ccm_remainder_len += length; + ctx->ccm_copy_to = datap; + return (CRYPTO_SUCCESS); + } + + do { + /* Unprocessed data from last call. */ + if (ctx->ccm_remainder_len > 0) { + need = block_size - ctx->ccm_remainder_len; + + if (need > remainder) + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + + bcopy(datap, &((uint8_t *)ctx->ccm_remainder) + [ctx->ccm_remainder_len], need); + + blockp = (uint8_t *)ctx->ccm_remainder; + } else { + blockp = datap; + } + + /* Calculate the counter mode, ccm_cb is the counter block */ + cbp = (uint8_t *)ctx->ccm_tmp; + encrypt_block(ctx->ccm_keysched, (uint8_t *)ctx->ccm_cb, cbp); + + /* + * Increment counter. + * Counter bits are confined to the bottom 64 bits + */ +#ifdef _ZFS_LITTLE_ENDIAN + counter = ntohll(ctx->ccm_cb[1] & ctx->ccm_counter_mask); + counter = htonll(counter + 1); +#else + counter = ctx->ccm_cb[1] & ctx->ccm_counter_mask; + counter++; +#endif /* _ZFS_LITTLE_ENDIAN */ + counter &= ctx->ccm_counter_mask; + ctx->ccm_cb[1] = + (ctx->ccm_cb[1] & ~(ctx->ccm_counter_mask)) | counter; + + /* XOR with the ciphertext */ + xor_block(blockp, cbp); + + /* Copy the plaintext to the "holding buffer" */ + resultp = (uint8_t *)ctx->ccm_pt_buf + + ctx->ccm_processed_data_len; + copy_block(cbp, resultp); + + ctx->ccm_processed_data_len += block_size; + + ctx->ccm_lastp = blockp; + + /* Update pointer to next block of data to be processed. 
*/ + if (ctx->ccm_remainder_len != 0) { + datap += need; + ctx->ccm_remainder_len = 0; + } else { + datap += block_size; + } + + remainder = (size_t)&data[length] - (size_t)datap; + + /* Incomplete last block */ + if (remainder > 0 && remainder < block_size) { + bcopy(datap, ctx->ccm_remainder, remainder); + ctx->ccm_remainder_len = remainder; + ctx->ccm_copy_to = datap; + if (ctx->ccm_processed_mac_len > 0) { + /* + * not expecting anymore ciphertext, just + * compute plaintext for the remaining input + */ + ccm_decrypt_incomplete_block(ctx, + encrypt_block); + ctx->ccm_processed_data_len += remainder; + ctx->ccm_remainder_len = 0; + } + goto out; + } + ctx->ccm_copy_to = NULL; + + } while (remainder > 0); + +out: + return (CRYPTO_SUCCESS); +} + +int +ccm_decrypt_final(ccm_ctx_t *ctx, crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + size_t mac_remain, pt_len; + uint8_t *pt, *mac_buf, *macp, *ccm_mac_p; + int rv; + + pt_len = ctx->ccm_data_len; + + /* Make sure output buffer can fit all of the plaintext */ + if (out->cd_length < pt_len) { + return (CRYPTO_DATA_LEN_RANGE); + } + + pt = ctx->ccm_pt_buf; + mac_remain = ctx->ccm_processed_data_len; + mac_buf = (uint8_t *)ctx->ccm_mac_buf; + + macp = (uint8_t *)ctx->ccm_tmp; + + while (mac_remain > 0) { + + if (mac_remain < block_size) { + bzero(macp, block_size); + bcopy(pt, macp, mac_remain); + mac_remain = 0; + } else { + copy_block(pt, macp); + mac_remain -= block_size; + pt += block_size; + } + + /* calculate the CBC MAC */ + xor_block(macp, mac_buf); + encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf); + } + + /* Calculate the CCM MAC */ + ccm_mac_p = (uint8_t *)ctx->ccm_tmp; + calculate_ccm_mac((ccm_ctx_t *)ctx, ccm_mac_p, encrypt_block); + + /* compare the input CCM MAC value with what we calculated */ + if (bcmp(ctx->ccm_mac_input_buf, ccm_mac_p, ctx->ccm_mac_len)) { + /* They don't match */ + return (CRYPTO_INVALID_MAC); + } else { + rv = crypto_put_output_data(ctx->ccm_pt_buf, out, pt_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + out->cd_offset += pt_len; + } + return (CRYPTO_SUCCESS); +} + +static int +ccm_validate_args(CK_AES_CCM_PARAMS *ccm_param, boolean_t is_encrypt_init) +{ + size_t macSize, nonceSize; + uint8_t q; + uint64_t maxValue; + + /* + * Check the length of the MAC. The only valid + * lengths for the MAC are: 4, 6, 8, 10, 12, 14, 16 + */ + macSize = ccm_param->ulMACSize; + if ((macSize < 4) || (macSize > 16) || ((macSize % 2) != 0)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + + /* Check the nonce length. 
Valid values are 7, 8, 9, 10, 11, 12, 13 */ + nonceSize = ccm_param->ulNonceSize; + if ((nonceSize < 7) || (nonceSize > 13)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + + /* q is the length of the field storing the length, in bytes */ + q = (uint8_t)((15 - nonceSize) & 0xFF); + + + /* + * If it is decrypt, need to make sure size of ciphertext is at least + * bigger than MAC len + */ + if ((!is_encrypt_init) && (ccm_param->ulDataSize < macSize)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + + /* + * Check to make sure the length of the payload is within the + * range of values allowed by q + */ + if (q < 8) { + maxValue = (1ULL << (q * 8)) - 1; + } else { + maxValue = ULONG_MAX; + } + + if (ccm_param->ulDataSize > maxValue) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + return (CRYPTO_SUCCESS); +} + +/* + * Format the first block used in CBC-MAC (B0) and the initial counter + * block based on formatting functions and counter generation functions + * specified in RFC 3610 and NIST publication 800-38C, appendix A + * + * b0 is the first block used in CBC-MAC + * cb0 is the first counter block + * + * It's assumed that the arguments b0 and cb0 are preallocated AES blocks + * + */ +static void +ccm_format_initial_blocks(uchar_t *nonce, ulong_t nonceSize, + ulong_t authDataSize, uint8_t *b0, ccm_ctx_t *aes_ctx) +{ + uint64_t payloadSize; + uint8_t t, q, have_adata = 0; + size_t limit; + int i, j, k; + uint64_t mask = 0; + uint8_t *cb; + + q = (uint8_t)((15 - nonceSize) & 0xFF); + t = (uint8_t)((aes_ctx->ccm_mac_len) & 0xFF); + + /* Construct the first octet of b0 */ + if (authDataSize > 0) { + have_adata = 1; + } + b0[0] = (have_adata << 6) | (((t - 2) / 2) << 3) | (q - 1); + + /* copy the nonce value into b0 */ + bcopy(nonce, &(b0[1]), nonceSize); + + /* store the length of the payload into b0 */ + bzero(&(b0[1+nonceSize]), q); + + payloadSize = aes_ctx->ccm_data_len; + limit = 8 < q ? 8 : q; + + for (i = 0, j = 0, k = 15; i < limit; i++, j += 8, k--) { + b0[k] = (uint8_t)((payloadSize >> j) & 0xFF); + } + + /* format the counter block */ + + cb = (uint8_t *)aes_ctx->ccm_cb; + + cb[0] = 0x07 & (q-1); /* first byte */ + + /* copy the nonce value into the counter block */ + bcopy(nonce, &(cb[1]), nonceSize); + + bzero(&(cb[1+nonceSize]), q); + + /* Create the mask for the counter field based on the size of nonce */ + q <<= 3; + while (q-- > 0) { + mask |= (1ULL << q); + } + +#ifdef _ZFS_LITTLE_ENDIAN + mask = htonll(mask); +#endif + aes_ctx->ccm_counter_mask = mask; + + /* + * During calculation, we start using counter block 1, we will + * set it up right here. + * We can just set the last byte to have the value 1, because + * even with the biggest nonce of 13, the last byte of the + * counter block will be used for the counter value. 
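+ *
+ * A concrete (illustrative) example: for a 13-byte nonce (q = 2),
+ * a 16-byte MAC (t = 16) and non-empty associated data, the b0
+ * flags octet computed above is
+ *
+ *	(1 << 6) | (((16 - 2) / 2) << 3) | (2 - 1) = 0x79
+ *
+ * and the counter block's first octet is 0x07 & (q - 1) = 0x01.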
+ */
+ cb[15] = 0x01;
+}
+
+/*
+ * Encode the length of the associated data as
+ * specified in RFC 3610 and NIST publication 800-38C, appendix A
+ */
+static void
+encode_adata_len(ulong_t auth_data_len, uint8_t *encoded, size_t *encoded_len)
+{
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ uint32_t *lencoded_ptr;
+#ifdef _LP64
+ uint64_t *llencoded_ptr;
+#endif
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+
+ if (auth_data_len < ((1ULL<<16) - (1ULL<<8))) {
+ /* 0 < a < (2^16-2^8) */
+ *encoded_len = 2;
+ encoded[0] = (auth_data_len & 0xff00) >> 8;
+ encoded[1] = auth_data_len & 0xff;
+
+ } else if ((auth_data_len >= ((1ULL<<16) - (1ULL<<8))) &&
+ (auth_data_len < (1ULL << 31))) {
+ /* (2^16-2^8) <= a < 2^31 */
+ *encoded_len = 6;
+ encoded[0] = 0xff;
+ encoded[1] = 0xfe;
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ lencoded_ptr = (uint32_t *)&encoded[2];
+ *lencoded_ptr = htonl(auth_data_len);
+#else
+ encoded[2] = (auth_data_len & 0xff000000) >> 24;
+ encoded[3] = (auth_data_len & 0xff0000) >> 16;
+ encoded[4] = (auth_data_len & 0xff00) >> 8;
+ encoded[5] = auth_data_len & 0xff;
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+
+#ifdef _LP64
+ } else {
+ /* 2^31 <= a < 2^64 */
+ *encoded_len = 10;
+ encoded[0] = 0xff;
+ encoded[1] = 0xff;
+#ifdef UNALIGNED_POINTERS_PERMITTED
+ llencoded_ptr = (uint64_t *)&encoded[2];
+ *llencoded_ptr = htonll(auth_data_len);
+#else
+ encoded[2] = (auth_data_len & 0xff00000000000000) >> 56;
+ encoded[3] = (auth_data_len & 0xff000000000000) >> 48;
+ encoded[4] = (auth_data_len & 0xff0000000000) >> 40;
+ encoded[5] = (auth_data_len & 0xff00000000) >> 32;
+ encoded[6] = (auth_data_len & 0xff000000) >> 24;
+ encoded[7] = (auth_data_len & 0xff0000) >> 16;
+ encoded[8] = (auth_data_len & 0xff00) >> 8;
+ encoded[9] = auth_data_len & 0xff;
+#endif /* UNALIGNED_POINTERS_PERMITTED */
+#endif /* _LP64 */
+ }
+}
+
+static int
+ccm_init(ccm_ctx_t *ctx, unsigned char *nonce, size_t nonce_len,
+ unsigned char *auth_data, size_t auth_data_len, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ uint8_t *mac_buf, *datap, *ivp, *authp;
+ size_t remainder, processed;
+ uint8_t encoded_a[10]; /* max encoded auth data length is 10 octets */
+ size_t encoded_a_len = 0;
+
+ mac_buf = (uint8_t *)&(ctx->ccm_mac_buf);
+
+ /*
+ * Format the 1st block for CBC-MAC and construct the
+ * 1st counter block.
+ *
+ * aes_ctx->ccm_iv is used for storing the counter block
+ * mac_buf will store b0 at this time.
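+ *
+ * For reference, encode_adata_len() above produces, e.g., for an
+ * associated data length of 0x1234 the two octets 0x12 0x34, and
+ * for 70000 (0x11170) the six octets 0xff 0xfe 0x00 0x01 0x11 0x70,
+ * per RFC 3610.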
+ */
+ ccm_format_initial_blocks(nonce, nonce_len,
+ auth_data_len, mac_buf, ctx);
+
+ /* The IV for CBC MAC for AES CCM mode is always zero */
+ ivp = (uint8_t *)ctx->ccm_tmp;
+ bzero(ivp, block_size);
+
+ xor_block(ivp, mac_buf);
+
+ /* encrypt b0 */
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ /* take care of the associated data, if any */
+ if (auth_data_len == 0) {
+ return (CRYPTO_SUCCESS);
+ }
+
+ encode_adata_len(auth_data_len, encoded_a, &encoded_a_len);
+
+ remainder = auth_data_len;
+
+ /* 1st block: it contains encoded associated data, and some data */
+ authp = (uint8_t *)ctx->ccm_tmp;
+ bzero(authp, block_size);
+ bcopy(encoded_a, authp, encoded_a_len);
+ processed = block_size - encoded_a_len;
+ if (processed > auth_data_len) {
+ /* in case auth_data is very small */
+ processed = auth_data_len;
+ }
+ bcopy(auth_data, authp+encoded_a_len, processed);
+ /* xor with previous buffer */
+ xor_block(authp, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+ remainder -= processed;
+ if (remainder == 0) {
+ /* a small amount of associated data, it's all done now */
+ return (CRYPTO_SUCCESS);
+ }
+
+ do {
+ if (remainder < block_size) {
+ /*
+ * There's not a block full of data, pad rest of
+ * buffer with zero
+ */
+ bzero(authp, block_size);
+ bcopy(&(auth_data[processed]), authp, remainder);
+ datap = (uint8_t *)authp;
+ remainder = 0;
+ } else {
+ datap = (uint8_t *)(&(auth_data[processed]));
+ processed += block_size;
+ remainder -= block_size;
+ }
+
+ xor_block(datap, mac_buf);
+ encrypt_block(ctx->ccm_keysched, mac_buf, mac_buf);
+
+ } while (remainder > 0);
+
+ return (CRYPTO_SUCCESS);
+}
+
+/*
+ * The following function should be called at encrypt or decrypt init time
+ * for AES CCM mode.
+ */
+int
+ccm_init_ctx(ccm_ctx_t *ccm_ctx, char *param, int kmflag,
+ boolean_t is_encrypt_init, size_t block_size,
+ int (*encrypt_block)(const void *, const uint8_t *, uint8_t *),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ int rv;
+ CK_AES_CCM_PARAMS *ccm_param;
+
+ if (param != NULL) {
+ ccm_param = (CK_AES_CCM_PARAMS *)param;
+
+ if ((rv = ccm_validate_args(ccm_param,
+ is_encrypt_init)) != 0) {
+ return (rv);
+ }
+
+ ccm_ctx->ccm_mac_len = ccm_param->ulMACSize;
+ if (is_encrypt_init) {
+ ccm_ctx->ccm_data_len = ccm_param->ulDataSize;
+ } else {
+ ccm_ctx->ccm_data_len =
+ ccm_param->ulDataSize - ccm_ctx->ccm_mac_len;
+ ccm_ctx->ccm_processed_mac_len = 0;
+ }
+ ccm_ctx->ccm_processed_data_len = 0;
+
+ ccm_ctx->ccm_flags |= CCM_MODE;
+ } else {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+
+ if (ccm_init(ccm_ctx, ccm_param->nonce, ccm_param->ulNonceSize,
+ ccm_param->authData, ccm_param->ulAuthDataSize, block_size,
+ encrypt_block, xor_block) != 0) {
+ return (CRYPTO_MECHANISM_PARAM_INVALID);
+ }
+ if (!is_encrypt_init) {
+ /* allocate buffer for storing decrypted plaintext */
+ ccm_ctx->ccm_pt_buf = vmem_alloc(ccm_ctx->ccm_data_len,
+ kmflag);
+ if (ccm_ctx->ccm_pt_buf == NULL) {
+ rv = CRYPTO_HOST_MEMORY;
+ }
+ }
+ return (rv);
+}
+
+void *
+ccm_alloc_ctx(int kmflag)
+{
+ ccm_ctx_t *ccm_ctx;
+
+ if ((ccm_ctx = kmem_zalloc(sizeof (ccm_ctx_t), kmflag)) == NULL)
+ return (NULL);
+
+ ccm_ctx->ccm_flags = CCM_MODE;
+ return (ccm_ctx);
+}
diff --git a/module/icp/algs/modes/ctr.c b/module/icp/algs/modes/ctr.c
new file mode 100644
index 000000000000..0188bdd395ff
--- /dev/null
+++ b/module/icp/algs/modes/ctr.c
@@ -0,0 +1,228 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and
Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/byteorder.h>
+
+/*
+ * Encrypt and decrypt multiple blocks of data in counter mode.
+ */
+int
+ctr_mode_contiguous_blocks(ctr_ctx_t *ctx, char *data, size_t length,
+ crypto_data_t *out, size_t block_size,
+ int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct),
+ void (*xor_block)(uint8_t *, uint8_t *))
+{
+ size_t remainder = length;
+ size_t need = 0;
+ uint8_t *datap = (uint8_t *)data;
+ uint8_t *blockp;
+ uint8_t *lastp;
+ void *iov_or_mp;
+ offset_t offset;
+ uint8_t *out_data_1;
+ uint8_t *out_data_2;
+ size_t out_data_1_len;
+ uint64_t lower_counter, upper_counter;
+
+ if (length + ctx->ctr_remainder_len < block_size) {
+ /* accumulate bytes here and return */
+ bcopy(datap,
+ (uint8_t *)ctx->ctr_remainder + ctx->ctr_remainder_len,
+ length);
+ ctx->ctr_remainder_len += length;
+ ctx->ctr_copy_to = datap;
+ return (CRYPTO_SUCCESS);
+ }
+
+ lastp = (uint8_t *)ctx->ctr_cb;
+ crypto_init_ptrs(out, &iov_or_mp, &offset);
+
+ do {
+ /* Unprocessed data from last call. */
+ if (ctx->ctr_remainder_len > 0) {
+ need = block_size - ctx->ctr_remainder_len;
+
+ if (need > remainder)
+ return (CRYPTO_DATA_LEN_RANGE);
+
+ bcopy(datap, &((uint8_t *)ctx->ctr_remainder)
+ [ctx->ctr_remainder_len], need);
+
+ blockp = (uint8_t *)ctx->ctr_remainder;
+ } else {
+ blockp = datap;
+ }
+
+ /* ctr_cb is the counter block */
+ cipher(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb,
+ (uint8_t *)ctx->ctr_tmp);
+
+ lastp = (uint8_t *)ctx->ctr_tmp;
+
+ /*
+ * Increment Counter.
+ */
+ lower_counter = ntohll(ctx->ctr_cb[1] & ctx->ctr_lower_mask);
+ lower_counter = htonll(lower_counter + 1);
+ lower_counter &= ctx->ctr_lower_mask;
+ ctx->ctr_cb[1] = (ctx->ctr_cb[1] & ~(ctx->ctr_lower_mask)) |
+ lower_counter;
+
+ /* wrap around */
+ if (lower_counter == 0) {
+ upper_counter =
+ ntohll(ctx->ctr_cb[0] & ctx->ctr_upper_mask);
+ upper_counter = htonll(upper_counter + 1);
+ upper_counter &= ctx->ctr_upper_mask;
+ ctx->ctr_cb[0] =
+ (ctx->ctr_cb[0] & ~(ctx->ctr_upper_mask)) |
+ upper_counter;
+ }
+
+ /*
+ * XOR encrypted counter block with the current clear block.
+ */
+ xor_block(blockp, lastp);
+
+ crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1,
+ &out_data_1_len, &out_data_2, block_size);
+
+ /* copy block to where it belongs */
+ bcopy(lastp, out_data_1, out_data_1_len);
+ if (out_data_2 != NULL) {
+ bcopy(lastp + out_data_1_len, out_data_2,
+ block_size - out_data_1_len);
+ }
+ /* update offset */
+ out->cd_offset += block_size;
+
+ /* Update pointer to next block of data to be processed.
*/ + if (ctx->ctr_remainder_len != 0) { + datap += need; + ctx->ctr_remainder_len = 0; + } else { + datap += block_size; + } + + remainder = (size_t)&data[length] - (size_t)datap; + + /* Incomplete last block. */ + if (remainder > 0 && remainder < block_size) { + bcopy(datap, ctx->ctr_remainder, remainder); + ctx->ctr_remainder_len = remainder; + ctx->ctr_copy_to = datap; + goto out; + } + ctx->ctr_copy_to = NULL; + + } while (remainder > 0); + +out: + return (CRYPTO_SUCCESS); +} + +int +ctr_mode_final(ctr_ctx_t *ctx, crypto_data_t *out, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)) +{ + uint8_t *lastp; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; + uint8_t *p; + int i; + + if (out->cd_length < ctx->ctr_remainder_len) + return (CRYPTO_DATA_LEN_RANGE); + + encrypt_block(ctx->ctr_keysched, (uint8_t *)ctx->ctr_cb, + (uint8_t *)ctx->ctr_tmp); + + lastp = (uint8_t *)ctx->ctr_tmp; + p = (uint8_t *)ctx->ctr_remainder; + for (i = 0; i < ctx->ctr_remainder_len; i++) { + p[i] ^= lastp[i]; + } + + crypto_init_ptrs(out, &iov_or_mp, &offset); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, ctx->ctr_remainder_len); + + bcopy(p, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy((uint8_t *)p + out_data_1_len, + out_data_2, ctx->ctr_remainder_len - out_data_1_len); + } + out->cd_offset += ctx->ctr_remainder_len; + ctx->ctr_remainder_len = 0; + return (CRYPTO_SUCCESS); +} + +int +ctr_init_ctx(ctr_ctx_t *ctr_ctx, ulong_t count, uint8_t *cb, + void (*copy_block)(uint8_t *, uint8_t *)) +{ + uint64_t upper_mask = 0; + uint64_t lower_mask = 0; + + if (count == 0 || count > 128) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + /* upper 64 bits of the mask */ + if (count >= 64) { + count -= 64; + upper_mask = (count == 64) ? UINT64_MAX : (1ULL << count) - 1; + lower_mask = UINT64_MAX; + } else { + /* now the lower 63 bits */ + lower_mask = (1ULL << count) - 1; + } + ctr_ctx->ctr_lower_mask = htonll(lower_mask); + ctr_ctx->ctr_upper_mask = htonll(upper_mask); + + copy_block(cb, (uchar_t *)ctr_ctx->ctr_cb); + ctr_ctx->ctr_lastp = (uint8_t *)&ctr_ctx->ctr_cb[0]; + ctr_ctx->ctr_flags |= CTR_MODE; + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +void * +ctr_alloc_ctx(int kmflag) +{ + ctr_ctx_t *ctr_ctx; + + if ((ctr_ctx = kmem_zalloc(sizeof (ctr_ctx_t), kmflag)) == NULL) + return (NULL); + + ctr_ctx->ctr_flags = CTR_MODE; + return (ctr_ctx); +} diff --git a/module/icp/algs/modes/ecb.c b/module/icp/algs/modes/ecb.c new file mode 100644 index 000000000000..025f5825cf04 --- /dev/null +++ b/module/icp/algs/modes/ecb.c @@ -0,0 +1,128 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include + +/* + * Algorithm independent ECB functions. + */ +int +ecb_cipher_contiguous_blocks(ecb_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*cipher)(const void *ks, const uint8_t *pt, uint8_t *ct)) +{ + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; + uint8_t *blockp; + uint8_t *lastp; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; + + if (length + ctx->ecb_remainder_len < block_size) { + /* accumulate bytes here and return */ + bcopy(datap, + (uint8_t *)ctx->ecb_remainder + ctx->ecb_remainder_len, + length); + ctx->ecb_remainder_len += length; + ctx->ecb_copy_to = datap; + return (CRYPTO_SUCCESS); + } + + lastp = (uint8_t *)ctx->ecb_iv; + crypto_init_ptrs(out, &iov_or_mp, &offset); + + do { + /* Unprocessed data from last call. */ + if (ctx->ecb_remainder_len > 0) { + need = block_size - ctx->ecb_remainder_len; + + if (need > remainder) + return (CRYPTO_DATA_LEN_RANGE); + + bcopy(datap, &((uint8_t *)ctx->ecb_remainder) + [ctx->ecb_remainder_len], need); + + blockp = (uint8_t *)ctx->ecb_remainder; + } else { + blockp = datap; + } + + cipher(ctx->ecb_keysched, blockp, lastp); + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); + + /* copy block to where it belongs */ + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, out_data_2, + block_size - out_data_1_len); + } + /* update offset */ + out->cd_offset += block_size; + + /* Update pointer to next block of data to be processed. */ + if (ctx->ecb_remainder_len != 0) { + datap += need; + ctx->ecb_remainder_len = 0; + } else { + datap += block_size; + } + + remainder = (size_t)&data[length] - (size_t)datap; + + /* Incomplete last block. */ + if (remainder > 0 && remainder < block_size) { + bcopy(datap, ctx->ecb_remainder, remainder); + ctx->ecb_remainder_len = remainder; + ctx->ecb_copy_to = datap; + goto out; + } + ctx->ecb_copy_to = NULL; + + } while (remainder > 0); + +out: + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +void * +ecb_alloc_ctx(int kmflag) +{ + ecb_ctx_t *ecb_ctx; + + if ((ecb_ctx = kmem_zalloc(sizeof (ecb_ctx_t), kmflag)) == NULL) + return (NULL); + + ecb_ctx->ecb_flags = ECB_MODE; + return (ecb_ctx); +} diff --git a/module/icp/algs/modes/gcm.c b/module/icp/algs/modes/gcm.c new file mode 100644 index 000000000000..5553c55e11cd --- /dev/null +++ b/module/icp/algs/modes/gcm.c @@ -0,0 +1,1543 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef CAN_USE_GCM_ASM +#include +#endif + +#define GHASH(c, d, t, o) \ + xor_block((uint8_t *)(d), (uint8_t *)(c)->gcm_ghash); \ + (o)->mul((uint64_t *)(void *)(c)->gcm_ghash, (c)->gcm_H, \ + (uint64_t *)(void *)(t)); + +/* Select GCM implementation */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX-1) +#ifdef CAN_USE_GCM_ASM +#define IMPL_AVX (UINT32_MAX-2) +#endif +#define GCM_IMPL_READ(i) (*(volatile uint32_t *) &(i)) +static uint32_t icp_gcm_impl = IMPL_FASTEST; +static uint32_t user_sel_impl = IMPL_FASTEST; + +#ifdef CAN_USE_GCM_ASM +/* Does the architecture we run on support the MOVBE instruction? */ +boolean_t gcm_avx_can_use_movbe = B_FALSE; +/* + * Whether to use the optimized openssl gcm and ghash implementations. + * Set to true if module parameter icp_gcm_impl == "avx". + */ +static boolean_t gcm_use_avx = B_FALSE; +#define GCM_IMPL_USE_AVX (*(volatile boolean_t *)&gcm_use_avx) + +static inline boolean_t gcm_avx_will_work(void); +static inline void gcm_set_avx(boolean_t); +static inline boolean_t gcm_toggle_avx(void); +extern boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); + +static int gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *, char *, size_t, + crypto_data_t *, size_t); + +static int gcm_encrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); +static int gcm_decrypt_final_avx(gcm_ctx_t *, crypto_data_t *, size_t); +static int gcm_init_avx(gcm_ctx_t *, unsigned char *, size_t, unsigned char *, + size_t, size_t); +#endif /* ifdef CAN_USE_GCM_ASM */ + +/* + * Encrypt multiple blocks of data in GCM mode. Decrypt for GCM mode + * is done in another function. + */ +int +gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ +#ifdef CAN_USE_GCM_ASM + if (ctx->gcm_use_avx == B_TRUE) + return (gcm_mode_encrypt_contiguous_blocks_avx( + ctx, data, length, out, block_size)); +#endif + + const gcm_impl_ops_t *gops; + size_t remainder = length; + size_t need = 0; + uint8_t *datap = (uint8_t *)data; + uint8_t *blockp; + uint8_t *lastp; + void *iov_or_mp; + offset_t offset; + uint8_t *out_data_1; + uint8_t *out_data_2; + size_t out_data_1_len; + uint64_t counter; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + + if (length + ctx->gcm_remainder_len < block_size) { + /* accumulate bytes here and return */ + bcopy(datap, + (uint8_t *)ctx->gcm_remainder + ctx->gcm_remainder_len, + length); + ctx->gcm_remainder_len += length; + if (ctx->gcm_copy_to == NULL) { + ctx->gcm_copy_to = datap; + } + return (CRYPTO_SUCCESS); + } + + lastp = (uint8_t *)ctx->gcm_cb; + crypto_init_ptrs(out, &iov_or_mp, &offset); + + gops = gcm_impl_get_ops(); + do { + /* Unprocessed data from last call. 
*/ + if (ctx->gcm_remainder_len > 0) { + need = block_size - ctx->gcm_remainder_len; + + if (need > remainder) + return (CRYPTO_DATA_LEN_RANGE); + + bcopy(datap, &((uint8_t *)ctx->gcm_remainder) + [ctx->gcm_remainder_len], need); + + blockp = (uint8_t *)ctx->gcm_remainder; + } else { + blockp = datap; + } + + /* + * Increment counter. Counter bits are confined + * to the bottom 32 bits of the counter block. + */ + counter = ntohll(ctx->gcm_cb[1] & counter_mask); + counter = htonll(counter + 1); + counter &= counter_mask; + ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; + + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, + (uint8_t *)ctx->gcm_tmp); + xor_block(blockp, (uint8_t *)ctx->gcm_tmp); + + lastp = (uint8_t *)ctx->gcm_tmp; + + ctx->gcm_processed_data_len += block_size; + + crypto_get_ptrs(out, &iov_or_mp, &offset, &out_data_1, + &out_data_1_len, &out_data_2, block_size); + + /* copy block to where it belongs */ + if (out_data_1_len == block_size) { + copy_block(lastp, out_data_1); + } else { + bcopy(lastp, out_data_1, out_data_1_len); + if (out_data_2 != NULL) { + bcopy(lastp + out_data_1_len, + out_data_2, + block_size - out_data_1_len); + } + } + /* update offset */ + out->cd_offset += block_size; + + /* add ciphertext to the hash */ + GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gops); + + /* Update pointer to next block of data to be processed. */ + if (ctx->gcm_remainder_len != 0) { + datap += need; + ctx->gcm_remainder_len = 0; + } else { + datap += block_size; + } + + remainder = (size_t)&data[length] - (size_t)datap; + + /* Incomplete last block. */ + if (remainder > 0 && remainder < block_size) { + bcopy(datap, ctx->gcm_remainder, remainder); + ctx->gcm_remainder_len = remainder; + ctx->gcm_copy_to = datap; + goto out; + } + ctx->gcm_copy_to = NULL; + + } while (remainder > 0); +out: + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +int +gcm_encrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ +#ifdef CAN_USE_GCM_ASM + if (ctx->gcm_use_avx == B_TRUE) + return (gcm_encrypt_final_avx(ctx, out, block_size)); +#endif + + const gcm_impl_ops_t *gops; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + uint8_t *ghash, *macp = NULL; + int i, rv; + + if (out->cd_length < + (ctx->gcm_remainder_len + ctx->gcm_tag_len)) { + return (CRYPTO_DATA_LEN_RANGE); + } + + gops = gcm_impl_get_ops(); + ghash = (uint8_t *)ctx->gcm_ghash; + + if (ctx->gcm_remainder_len > 0) { + uint64_t counter; + uint8_t *tmpp = (uint8_t *)ctx->gcm_tmp; + + /* + * Here is where we deal with data that is not a + * multiple of the block size. + */ + + /* + * Increment counter. 
+ */ + counter = ntohll(ctx->gcm_cb[1] & counter_mask); + counter = htonll(counter + 1); + counter &= counter_mask; + ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; + + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, + (uint8_t *)ctx->gcm_tmp); + + macp = (uint8_t *)ctx->gcm_remainder; + bzero(macp + ctx->gcm_remainder_len, + block_size - ctx->gcm_remainder_len); + + /* XOR with counter block */ + for (i = 0; i < ctx->gcm_remainder_len; i++) { + macp[i] ^= tmpp[i]; + } + + /* add ciphertext to the hash */ + GHASH(ctx, macp, ghash, gops); + + ctx->gcm_processed_data_len += ctx->gcm_remainder_len; + } + + ctx->gcm_len_a_len_c[1] = + htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); + GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, + (uint8_t *)ctx->gcm_J0); + xor_block((uint8_t *)ctx->gcm_J0, ghash); + + if (ctx->gcm_remainder_len > 0) { + rv = crypto_put_output_data(macp, out, ctx->gcm_remainder_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + } + out->cd_offset += ctx->gcm_remainder_len; + ctx->gcm_remainder_len = 0; + rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + out->cd_offset += ctx->gcm_tag_len; + + return (CRYPTO_SUCCESS); +} + +/* + * This will only deal with decrypting the last block of the input that + * might not be a multiple of block length. + */ +static void +gcm_decrypt_incomplete_block(gcm_ctx_t *ctx, size_t block_size, size_t index, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + uint8_t *datap, *outp, *counterp; + uint64_t counter; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + int i; + + /* + * Increment counter. + * Counter bits are confined to the bottom 32 bits + */ + counter = ntohll(ctx->gcm_cb[1] & counter_mask); + counter = htonll(counter + 1); + counter &= counter_mask; + ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; + + datap = (uint8_t *)ctx->gcm_remainder; + outp = &((ctx->gcm_pt_buf)[index]); + counterp = (uint8_t *)ctx->gcm_tmp; + + /* authentication tag */ + bzero((uint8_t *)ctx->gcm_tmp, block_size); + bcopy(datap, (uint8_t *)ctx->gcm_tmp, ctx->gcm_remainder_len); + + /* add ciphertext to the hash */ + GHASH(ctx, ctx->gcm_tmp, ctx->gcm_ghash, gcm_impl_get_ops()); + + /* decrypt remaining ciphertext */ + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, counterp); + + /* XOR with counter block */ + for (i = 0; i < ctx->gcm_remainder_len; i++) { + outp[i] = datap[i] ^ counterp[i]; + } +} + +/* ARGSUSED */ +int +gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *ctx, char *data, size_t length, + crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + size_t new_len; + uint8_t *new; + + /* + * Copy contiguous ciphertext input blocks to plaintext buffer. + * Ciphertext will be decrypted in the final. 
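A word on the decrypt-side flow implemented below: because the authentication tag occupies the last gcm_tag_len bytes of the stream and that boundary is only known once the caller issues the final call, the update path cannot decrypt anything; it can only accumulate ciphertext. A userland sketch of the same grow-and-append step, using realloc() where the kernel code open-codes the reallocation with vmem_alloc()/bcopy()/vmem_free():

#include <stdint.h>
#include <stdlib.h>
#include <string.h>

/* Hypothetical analogue of the buffering below; names are illustrative. */
static int
ct_accumulate(uint8_t **buf, size_t *buflen, const uint8_t *data, size_t len)
{
	uint8_t *nbuf = realloc(*buf, *buflen + len);

	if (nbuf == NULL)
		return (-1);
	memcpy(nbuf + *buflen, data, len);	/* append new ciphertext */
	*buf = nbuf;
	*buflen += len;
	return (0);
}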
+ */ + if (length > 0) { + new_len = ctx->gcm_pt_buf_len + length; + new = vmem_alloc(new_len, ctx->gcm_kmflag); + if (new == NULL) { + vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); + ctx->gcm_pt_buf = NULL; + return (CRYPTO_HOST_MEMORY); + } + bcopy(ctx->gcm_pt_buf, new, ctx->gcm_pt_buf_len); + vmem_free(ctx->gcm_pt_buf, ctx->gcm_pt_buf_len); + ctx->gcm_pt_buf = new; + ctx->gcm_pt_buf_len = new_len; + bcopy(data, &ctx->gcm_pt_buf[ctx->gcm_processed_data_len], + length); + ctx->gcm_processed_data_len += length; + } + + ctx->gcm_remainder_len = 0; + return (CRYPTO_SUCCESS); +} + +int +gcm_decrypt_final(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ +#ifdef CAN_USE_GCM_ASM + if (ctx->gcm_use_avx == B_TRUE) + return (gcm_decrypt_final_avx(ctx, out, block_size)); +#endif + + const gcm_impl_ops_t *gops; + size_t pt_len; + size_t remainder; + uint8_t *ghash; + uint8_t *blockp; + uint8_t *cbp; + uint64_t counter; + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + int processed = 0, rv; + + ASSERT(ctx->gcm_processed_data_len == ctx->gcm_pt_buf_len); + + gops = gcm_impl_get_ops(); + pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; + ghash = (uint8_t *)ctx->gcm_ghash; + blockp = ctx->gcm_pt_buf; + remainder = pt_len; + while (remainder > 0) { + /* Incomplete last block */ + if (remainder < block_size) { + bcopy(blockp, ctx->gcm_remainder, remainder); + ctx->gcm_remainder_len = remainder; + /* + * not expecting anymore ciphertext, just + * compute plaintext for the remaining input + */ + gcm_decrypt_incomplete_block(ctx, block_size, + processed, encrypt_block, xor_block); + ctx->gcm_remainder_len = 0; + goto out; + } + /* add ciphertext to the hash */ + GHASH(ctx, blockp, ghash, gops); + + /* + * Increment counter. + * Counter bits are confined to the bottom 32 bits + */ + counter = ntohll(ctx->gcm_cb[1] & counter_mask); + counter = htonll(counter + 1); + counter &= counter_mask; + ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; + + cbp = (uint8_t *)ctx->gcm_tmp; + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_cb, cbp); + + /* XOR with ciphertext */ + xor_block(cbp, blockp); + + processed += block_size; + blockp += block_size; + remainder -= block_size; + } +out: + ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); + GHASH(ctx, ctx->gcm_len_a_len_c, ghash, gops); + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_J0, + (uint8_t *)ctx->gcm_J0); + xor_block((uint8_t *)ctx->gcm_J0, ghash); + + /* compare the input authentication tag with what we calculated */ + if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { + /* They don't match */ + return (CRYPTO_INVALID_MAC); + } else { + rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + out->cd_offset += pt_len; + } + return (CRYPTO_SUCCESS); +} + +static int +gcm_validate_args(CK_AES_GCM_PARAMS *gcm_param) +{ + size_t tag_len; + + /* + * Check the length of the authentication tag (in bits). 
+ */ + tag_len = gcm_param->ulTagBits; + switch (tag_len) { + case 32: + case 64: + case 96: + case 104: + case 112: + case 120: + case 128: + break; + default: + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + + if (gcm_param->ulIvLen == 0) + return (CRYPTO_MECHANISM_PARAM_INVALID); + + return (CRYPTO_SUCCESS); +} + +static void +gcm_format_initial_blocks(uchar_t *iv, ulong_t iv_len, + gcm_ctx_t *ctx, size_t block_size, + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + const gcm_impl_ops_t *gops; + uint8_t *cb; + ulong_t remainder = iv_len; + ulong_t processed = 0; + uint8_t *datap, *ghash; + uint64_t len_a_len_c[2]; + + gops = gcm_impl_get_ops(); + ghash = (uint8_t *)ctx->gcm_ghash; + cb = (uint8_t *)ctx->gcm_cb; + if (iv_len == 12) { + bcopy(iv, cb, 12); + cb[12] = 0; + cb[13] = 0; + cb[14] = 0; + cb[15] = 1; + /* J0 will be used again in the final */ + copy_block(cb, (uint8_t *)ctx->gcm_J0); + } else { + /* GHASH the IV */ + do { + if (remainder < block_size) { + bzero(cb, block_size); + bcopy(&(iv[processed]), cb, remainder); + datap = (uint8_t *)cb; + remainder = 0; + } else { + datap = (uint8_t *)(&(iv[processed])); + processed += block_size; + remainder -= block_size; + } + GHASH(ctx, datap, ghash, gops); + } while (remainder > 0); + + len_a_len_c[0] = 0; + len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(iv_len)); + GHASH(ctx, len_a_len_c, ctx->gcm_J0, gops); + + /* J0 will be used again in the final */ + copy_block((uint8_t *)ctx->gcm_J0, (uint8_t *)cb); + } +} + +static int +gcm_init(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, + unsigned char *auth_data, size_t auth_data_len, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + const gcm_impl_ops_t *gops; + uint8_t *ghash, *datap, *authp; + size_t remainder, processed; + + /* encrypt zero block to get subkey H */ + bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); + encrypt_block(ctx->gcm_keysched, (uint8_t *)ctx->gcm_H, + (uint8_t *)ctx->gcm_H); + + gcm_format_initial_blocks(iv, iv_len, ctx, block_size, + copy_block, xor_block); + + gops = gcm_impl_get_ops(); + authp = (uint8_t *)ctx->gcm_tmp; + ghash = (uint8_t *)ctx->gcm_ghash; + bzero(authp, block_size); + bzero(ghash, block_size); + + processed = 0; + remainder = auth_data_len; + do { + if (remainder < block_size) { + /* + * There's not a block full of data, pad rest of + * buffer with zero + */ + bzero(authp, block_size); + bcopy(&(auth_data[processed]), authp, remainder); + datap = (uint8_t *)authp; + remainder = 0; + } else { + datap = (uint8_t *)(&(auth_data[processed])); + processed += block_size; + remainder -= block_size; + } + + /* add auth data to the hash */ + GHASH(ctx, datap, ghash, gops); + + } while (remainder > 0); + + return (CRYPTO_SUCCESS); +} + +/* + * The following function is called at encrypt or decrypt init time + * for AES GCM mode. + * + * Init the GCM context struct. Handle the cycle and avx implementations here. 
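Both the long-IV path in gcm_format_initial_blocks() and the AAD loop in gcm_init() above rely on the same zero-pad-then-hash treatment of a trailing partial block. Isolated as a hypothetical helper:

#include <stdint.h>
#include <string.h>

/* Zero-fill the block buffer, then copy in the short tail before hashing. */
static void
ghash_pad_partial(uint8_t block[16], const uint8_t *data, size_t remainder)
{
	memset(block, 0, 16);		/* pad rest of buffer with zero */
	memcpy(block, data, remainder);	/* remainder < 16 */
}

Zero padding is unambiguous here because the final GHASH block mixes in len(A) and len(C), so a padded short block cannot be confused with a full block that happens to end in zeros.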
+ */ +int +gcm_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + int rv; + CK_AES_GCM_PARAMS *gcm_param; + + if (param != NULL) { + gcm_param = (CK_AES_GCM_PARAMS *)(void *)param; + + if ((rv = gcm_validate_args(gcm_param)) != 0) { + return (rv); + } + + gcm_ctx->gcm_tag_len = gcm_param->ulTagBits; + gcm_ctx->gcm_tag_len >>= 3; + gcm_ctx->gcm_processed_data_len = 0; + + /* these values are in bits */ + gcm_ctx->gcm_len_a_len_c[0] + = htonll(CRYPTO_BYTES2BITS(gcm_param->ulAADLen)); + + rv = CRYPTO_SUCCESS; + gcm_ctx->gcm_flags |= GCM_MODE; + } else { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + +#ifdef CAN_USE_GCM_ASM + if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { + gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + } else { + /* + * Handle the "cycle" implementation by creating avx and + * non-avx contexts alternately. + */ + gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + /* + * We don't handle byte swapped key schedules in the avx + * code path. + */ + aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; + if (ks->ops->needs_byteswap == B_TRUE) { + gcm_ctx->gcm_use_avx = B_FALSE; + } + /* Use the MOVBE and the BSWAP variants alternately. */ + if (gcm_ctx->gcm_use_avx == B_TRUE && + zfs_movbe_available() == B_TRUE) { + (void) atomic_toggle_boolean_nv( + (volatile boolean_t *)&gcm_avx_can_use_movbe); + } + } + /* Avx and non avx context initialization differs from here on. */ + if (gcm_ctx->gcm_use_avx == B_FALSE) { +#endif /* ifdef CAN_USE_GCM_ASM */ + if (gcm_init(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, + gcm_param->pAAD, gcm_param->ulAADLen, block_size, + encrypt_block, copy_block, xor_block) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } +#ifdef CAN_USE_GCM_ASM + } else { + if (gcm_init_avx(gcm_ctx, gcm_param->pIv, gcm_param->ulIvLen, + gcm_param->pAAD, gcm_param->ulAADLen, block_size) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } + } +#endif /* ifdef CAN_USE_GCM_ASM */ + + return (rv); +} + +int +gmac_init_ctx(gcm_ctx_t *gcm_ctx, char *param, size_t block_size, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)) +{ + int rv; + CK_AES_GMAC_PARAMS *gmac_param; + + if (param != NULL) { + gmac_param = (CK_AES_GMAC_PARAMS *)(void *)param; + + gcm_ctx->gcm_tag_len = CRYPTO_BITS2BYTES(AES_GMAC_TAG_BITS); + gcm_ctx->gcm_processed_data_len = 0; + + /* these values are in bits */ + gcm_ctx->gcm_len_a_len_c[0] + = htonll(CRYPTO_BYTES2BITS(gmac_param->ulAADLen)); + + rv = CRYPTO_SUCCESS; + gcm_ctx->gcm_flags |= GMAC_MODE; + } else { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + +#ifdef CAN_USE_GCM_ASM + /* + * Handle the "cycle" implementation by creating avx and non avx + * contexts alternately. + */ + if (GCM_IMPL_READ(icp_gcm_impl) != IMPL_CYCLE) { + gcm_ctx->gcm_use_avx = GCM_IMPL_USE_AVX; + } else { + gcm_ctx->gcm_use_avx = gcm_toggle_avx(); + } + /* We don't handle byte swapped key schedules in the avx code path. */ + aes_key_t *ks = (aes_key_t *)gcm_ctx->gcm_keysched; + if (ks->ops->needs_byteswap == B_TRUE) { + gcm_ctx->gcm_use_avx = B_FALSE; + } + /* Avx and non avx context initialization differs from here on. 
*/ + if (gcm_ctx->gcm_use_avx == B_FALSE) { +#endif /* ifdef CAN_USE_GCM_ASM */ + if (gcm_init(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, + gmac_param->pAAD, gmac_param->ulAADLen, block_size, + encrypt_block, copy_block, xor_block) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } +#ifdef CAN_USE_GCM_ASM + } else { + if (gcm_init_avx(gcm_ctx, gmac_param->pIv, AES_GMAC_IV_LEN, + gmac_param->pAAD, gmac_param->ulAADLen, block_size) != 0) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } + } +#endif /* ifdef CAN_USE_GCM_ASM */ + + return (rv); +} + +void * +gcm_alloc_ctx(int kmflag) +{ + gcm_ctx_t *gcm_ctx; + + if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) + return (NULL); + + gcm_ctx->gcm_flags = GCM_MODE; + return (gcm_ctx); +} + +void * +gmac_alloc_ctx(int kmflag) +{ + gcm_ctx_t *gcm_ctx; + + if ((gcm_ctx = kmem_zalloc(sizeof (gcm_ctx_t), kmflag)) == NULL) + return (NULL); + + gcm_ctx->gcm_flags = GMAC_MODE; + return (gcm_ctx); +} + +void +gcm_set_kmflag(gcm_ctx_t *ctx, int kmflag) +{ + ctx->gcm_kmflag = kmflag; +} + +/* GCM implementation that contains the fastest methods */ +static gcm_impl_ops_t gcm_fastest_impl = { + .name = "fastest" +}; + +/* All compiled in implementations */ +const gcm_impl_ops_t *gcm_all_impl[] = { + &gcm_generic_impl, +#if defined(__x86_64) && defined(HAVE_PCLMULQDQ) + &gcm_pclmulqdq_impl, +#endif +}; + +/* Indicate that benchmark has been completed */ +static boolean_t gcm_impl_initialized = B_FALSE; + +/* Hold all supported implementations */ +static size_t gcm_supp_impl_cnt = 0; +static gcm_impl_ops_t *gcm_supp_impl[ARRAY_SIZE(gcm_all_impl)]; + +/* + * Returns the GCM operations for encrypt/decrypt/key setup. When a + * SIMD implementation is not allowed in the current context, then + * fallback to the fastest generic implementation. + */ +const gcm_impl_ops_t * +gcm_impl_get_ops() +{ + if (!kfpu_allowed()) + return (&gcm_generic_impl); + + const gcm_impl_ops_t *ops = NULL; + const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(gcm_impl_initialized); + ops = &gcm_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through supported implementations */ + ASSERT(gcm_impl_initialized); + ASSERT3U(gcm_supp_impl_cnt, >, 0); + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % gcm_supp_impl_cnt; + ops = gcm_supp_impl[idx]; + break; +#ifdef CAN_USE_GCM_ASM + case IMPL_AVX: + /* + * Make sure that we return a valid implementation while + * switching to the avx implementation since there still + * may be unfinished non-avx contexts around. + */ + ops = &gcm_generic_impl; + break; +#endif + default: + ASSERT3U(impl, <, gcm_supp_impl_cnt); + ASSERT3U(gcm_supp_impl_cnt, >, 0); + if (impl < ARRAY_SIZE(gcm_all_impl)) + ops = gcm_supp_impl[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + + return (ops); +} + +/* + * Initialize all supported implementations. + */ +void +gcm_impl_init(void) +{ + gcm_impl_ops_t *curr_impl; + int i, c; + + /* Move supported implementations into gcm_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(gcm_all_impl); i++) { + curr_impl = (gcm_impl_ops_t *)gcm_all_impl[i]; + + if (curr_impl->is_supported()) + gcm_supp_impl[c++] = (gcm_impl_ops_t *)curr_impl; + } + gcm_supp_impl_cnt = c; + + /* + * Set the fastest implementation given the assumption that the + * hardware accelerated version is the fastest. 
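gcm_impl_init() above, together with gcm_impl_get_ops() and gcm_impl_set() earlier, follows a small capability-registry pattern: each compiled-in implementation exposes an is_supported() probe, the survivors are collected once at init, and selection then happens by name or by index. A condensed sketch with illustrative names, not the module's types:

#include <stddef.h>

typedef struct {
	const char *name;
	int (*is_supported)(void);
} ops_t;

static int always(void) { return (1); }

static const ops_t generic = { "generic", always };
static const ops_t *all[] = { &generic /* , &pclmulqdq, ... */ };
static const ops_t *supp[sizeof (all) / sizeof (all[0])];
static size_t supp_cnt;

/* Probe every compiled-in candidate once; survivors become selectable. */
static void
ops_probe(void)
{
	for (size_t i = 0; i < sizeof (all) / sizeof (all[0]); i++) {
		if (all[i]->is_supported())
			supp[supp_cnt++] = all[i];
	}
}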
+ */ +#if defined(__x86_64) && defined(HAVE_PCLMULQDQ) + if (gcm_pclmulqdq_impl.is_supported()) { + memcpy(&gcm_fastest_impl, &gcm_pclmulqdq_impl, + sizeof (gcm_fastest_impl)); + } else +#endif + { + memcpy(&gcm_fastest_impl, &gcm_generic_impl, + sizeof (gcm_fastest_impl)); + } + + strlcpy(gcm_fastest_impl.name, "fastest", GCM_IMPL_NAME_MAX); + +#ifdef CAN_USE_GCM_ASM + /* + * Use the avx implementation if it's available and the implementation + * hasn't changed from its default value of fastest on module load. + */ + if (gcm_avx_will_work()) { +#ifdef HAVE_MOVBE + if (zfs_movbe_available() == B_TRUE) { + atomic_swap_32(&gcm_avx_can_use_movbe, B_TRUE); + } +#endif + if (GCM_IMPL_READ(user_sel_impl) == IMPL_FASTEST) { + gcm_set_avx(B_TRUE); + } + } +#endif + /* Finish initialization */ + atomic_swap_32(&icp_gcm_impl, user_sel_impl); + gcm_impl_initialized = B_TRUE; +} + +static const struct { + char *name; + uint32_t sel; +} gcm_impl_opts[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, +#ifdef CAN_USE_GCM_ASM + { "avx", IMPL_AVX }, +#endif +}; + +/* + * Function sets desired gcm implementation. + * + * If we are called before init(), user preference will be saved in + * user_sel_impl, and applied in later init() call. This occurs when module + * parameter is specified on module load. Otherwise, directly update + * icp_gcm_impl. + * + * @val Name of gcm implementation to use + * @param Unused. + */ +int +gcm_impl_set(const char *val) +{ + int err = -EINVAL; + char req_name[GCM_IMPL_NAME_MAX]; + uint32_t impl = GCM_IMPL_READ(user_sel_impl); + size_t i; + + /* sanitize input */ + i = strnlen(val, GCM_IMPL_NAME_MAX); + if (i == 0 || i >= GCM_IMPL_NAME_MAX) + return (err); + + strlcpy(req_name, val, GCM_IMPL_NAME_MAX); + while (i > 0 && isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + /* Check mandatory options */ + for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { +#ifdef CAN_USE_GCM_ASM + /* Ignore avx implementation if it won't work. */ + if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { + continue; + } +#endif + if (strcmp(req_name, gcm_impl_opts[i].name) == 0) { + impl = gcm_impl_opts[i].sel; + err = 0; + break; + } + } + + /* check all supported impl if init() was already called */ + if (err != 0 && gcm_impl_initialized) { + /* check all supported implementations */ + for (i = 0; i < gcm_supp_impl_cnt; i++) { + if (strcmp(req_name, gcm_supp_impl[i]->name) == 0) { + impl = i; + err = 0; + break; + } + } + } +#ifdef CAN_USE_GCM_ASM + /* + * Use the avx implementation if available and the requested one is + * avx or fastest. + */ + if (gcm_avx_will_work() == B_TRUE && + (impl == IMPL_AVX || impl == IMPL_FASTEST)) { + gcm_set_avx(B_TRUE); + } else { + gcm_set_avx(B_FALSE); + } +#endif + + if (err == 0) { + if (gcm_impl_initialized) + atomic_swap_32(&icp_gcm_impl, impl); + else + atomic_swap_32(&user_sel_impl, impl); + } + + return (err); +} + +#if defined(_KERNEL) && defined(__linux__) + +static int +icp_gcm_impl_set(const char *val, zfs_kernel_param_t *kp) +{ + return (gcm_impl_set(val)); +} + +static int +icp_gcm_impl_get(char *buffer, zfs_kernel_param_t *kp) +{ + int i, cnt = 0; + char *fmt; + const uint32_t impl = GCM_IMPL_READ(icp_gcm_impl); + + ASSERT(gcm_impl_initialized); + + /* list mandatory options */ + for (i = 0; i < ARRAY_SIZE(gcm_impl_opts); i++) { +#ifdef CAN_USE_GCM_ASM + /* Ignore avx implementation if it won't work. 
*/ + if (gcm_impl_opts[i].sel == IMPL_AVX && !gcm_avx_will_work()) { + continue; + } +#endif + fmt = (impl == gcm_impl_opts[i].sel) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, gcm_impl_opts[i].name); + } + + /* list all supported implementations */ + for (i = 0; i < gcm_supp_impl_cnt; i++) { + fmt = (i == impl) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, gcm_supp_impl[i]->name); + } + + return (cnt); +} + +module_param_call(icp_gcm_impl, icp_gcm_impl_set, icp_gcm_impl_get, + NULL, 0644); +MODULE_PARM_DESC(icp_gcm_impl, "Select gcm implementation."); +#endif /* defined(_KERNEL) && defined(__linux__) */ + +#ifdef CAN_USE_GCM_ASM +#define GCM_BLOCK_LEN 16 +/* + * The openssl asm routines are 6x aggregated and need that many bytes + * at minimum. + */ +#define GCM_AVX_MIN_DECRYPT_BYTES (GCM_BLOCK_LEN * 6) +#define GCM_AVX_MIN_ENCRYPT_BYTES (GCM_BLOCK_LEN * 6 * 3) +/* + * Ensure the chunk size is reasonable since we are allocating a + * GCM_AVX_MAX_CHUNK_SIZEd buffer and disabling preemption and interrupts. + */ +#define GCM_AVX_MAX_CHUNK_SIZE \ + (((128*1024)/GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES) + +/* Get the chunk size module parameter. */ +#define GCM_CHUNK_SIZE_READ *(volatile uint32_t *) &gcm_avx_chunk_size + +/* Clear the FPU registers since they hold sensitive internal state. */ +#define clear_fpu_regs() clear_fpu_regs_avx() +#define GHASH_AVX(ctx, in, len) \ + gcm_ghash_avx((ctx)->gcm_ghash, (const uint64_t (*)[2])(ctx)->gcm_Htable, \ + in, len) + +#define gcm_incr_counter_block(ctx) gcm_incr_counter_block_by(ctx, 1) + +/* + * Module parameter: number of bytes to process at once while owning the FPU. + * Rounded down to the next GCM_AVX_MIN_DECRYPT_BYTES byte boundary and is + * ensured to be greater than or equal to GCM_AVX_MIN_DECRYPT_BYTES. + */ +static uint32_t gcm_avx_chunk_size = + ((32 * 1024) / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; + +extern void clear_fpu_regs_avx(void); +extern void gcm_xor_avx(const uint8_t *src, uint8_t *dst); +extern void aes_encrypt_intel(const uint32_t rk[], int nr, + const uint32_t pt[4], uint32_t ct[4]); + +extern void gcm_init_htab_avx(uint64_t Htable[16][2], const uint64_t H[2]); +extern void gcm_ghash_avx(uint64_t ghash[2], const uint64_t Htable[16][2], + const uint8_t *in, size_t len); + +extern size_t aesni_gcm_encrypt(const uint8_t *, uint8_t *, size_t, + const void *, uint64_t *, uint64_t *); + +extern size_t aesni_gcm_decrypt(const uint8_t *, uint8_t *, size_t, + const void *, uint64_t *, uint64_t *); + +static inline boolean_t +gcm_avx_will_work(void) +{ + /* Avx should imply aes-ni and pclmulqdq, but make sure anyhow. */ + return (kfpu_allowed() && + zfs_avx_available() && zfs_aes_available() && + zfs_pclmulqdq_available()); +} + +static inline void +gcm_set_avx(boolean_t val) +{ + if (gcm_avx_will_work() == B_TRUE) { + atomic_swap_32(&gcm_use_avx, val); + } +} + +static inline boolean_t +gcm_toggle_avx(void) +{ + if (gcm_avx_will_work() == B_TRUE) { + return (atomic_toggle_boolean_nv(&GCM_IMPL_USE_AVX)); + } else { + return (B_FALSE); + } +} + +/* + * Clear sensitive data in the context. + * + * ctx->gcm_remainder may contain a plaintext remainder. ctx->gcm_H and + * ctx->gcm_Htable contain the hash sub key which protects authentication. + * + * Although extremely unlikely, ctx->gcm_J0 and ctx->gcm_tmp could be used for + * a known plaintext attack; they consist of the IV and the first and last + * counter, respectively. Whether they should be cleared is debatable. 
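One portability caveat alongside gcm_clear_ctx() below: in userland builds, a plain memset() of memory that is about to become dead can legally be elided by the optimizer, which is what explicit_bzero() and friends exist to prevent; the in-kernel bzero() calls used here are not affected in practice. A standard-C defensive sketch:

#include <stddef.h>
#include <string.h>

/* A volatile function pointer keeps the call from being optimized away. */
static void *(*const volatile memset_ptr)(void *, int, size_t) = memset;

static void
secure_zero(void *p, size_t n)
{
	(void) memset_ptr(p, 0, n);
}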
+ */ +static inline void +gcm_clear_ctx(gcm_ctx_t *ctx) +{ + bzero(ctx->gcm_remainder, sizeof (ctx->gcm_remainder)); + bzero(ctx->gcm_H, sizeof (ctx->gcm_H)); + bzero(ctx->gcm_Htable, sizeof (ctx->gcm_Htable)); + bzero(ctx->gcm_J0, sizeof (ctx->gcm_J0)); + bzero(ctx->gcm_tmp, sizeof (ctx->gcm_tmp)); +} + +/* Increment the GCM counter block by n. */ +static inline void +gcm_incr_counter_block_by(gcm_ctx_t *ctx, int n) +{ + uint64_t counter_mask = ntohll(0x00000000ffffffffULL); + uint64_t counter = ntohll(ctx->gcm_cb[1] & counter_mask); + + counter = htonll(counter + n); + counter &= counter_mask; + ctx->gcm_cb[1] = (ctx->gcm_cb[1] & ~counter_mask) | counter; +} + +/* + * Encrypt multiple blocks of data in GCM mode. + * This is done in gcm_avx_chunk_size chunks, utilizing AVX assembler routines + * if possible. While processing a chunk the FPU is "locked". + */ +static int +gcm_mode_encrypt_contiguous_blocks_avx(gcm_ctx_t *ctx, char *data, + size_t length, crypto_data_t *out, size_t block_size) +{ + size_t bleft = length; + size_t need = 0; + size_t done = 0; + uint8_t *datap = (uint8_t *)data; + size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); + uint64_t *ghash = ctx->gcm_ghash; + uint64_t *cb = ctx->gcm_cb; + uint8_t *ct_buf = NULL; + uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; + int rv = CRYPTO_SUCCESS; + + ASSERT(block_size == GCM_BLOCK_LEN); + /* + * If the last call left an incomplete block, try to fill + * it first. + */ + if (ctx->gcm_remainder_len > 0) { + need = block_size - ctx->gcm_remainder_len; + if (length < need) { + /* Accumulate bytes here and return. */ + bcopy(datap, (uint8_t *)ctx->gcm_remainder + + ctx->gcm_remainder_len, length); + + ctx->gcm_remainder_len += length; + if (ctx->gcm_copy_to == NULL) { + ctx->gcm_copy_to = datap; + } + return (CRYPTO_SUCCESS); + } else { + /* Complete incomplete block. */ + bcopy(datap, (uint8_t *)ctx->gcm_remainder + + ctx->gcm_remainder_len, need); + + ctx->gcm_copy_to = NULL; + } + } + + /* Allocate a buffer to encrypt to if there is enough input. */ + if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { + ct_buf = vmem_alloc(chunk_size, ctx->gcm_kmflag); + if (ct_buf == NULL) { + return (CRYPTO_HOST_MEMORY); + } + } + + /* If we completed an incomplete block, encrypt and write it out. */ + if (ctx->gcm_remainder_len > 0) { + kfpu_begin(); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, + (const uint32_t *)cb, (uint32_t *)tmp); + + gcm_xor_avx((const uint8_t *) ctx->gcm_remainder, tmp); + GHASH_AVX(ctx, tmp, block_size); + clear_fpu_regs(); + kfpu_end(); + rv = crypto_put_output_data(tmp, out, block_size); + out->cd_offset += block_size; + gcm_incr_counter_block(ctx); + ctx->gcm_processed_data_len += block_size; + bleft -= need; + datap += need; + ctx->gcm_remainder_len = 0; + } + + /* Do the bulk encryption in chunk_size blocks. */ + for (; bleft >= chunk_size; bleft -= chunk_size) { + kfpu_begin(); + done = aesni_gcm_encrypt( + datap, ct_buf, chunk_size, key, cb, ghash); + + clear_fpu_regs(); + kfpu_end(); + if (done != chunk_size) { + rv = CRYPTO_FAILED; + goto out_nofpu; + } + rv = crypto_put_output_data(ct_buf, out, chunk_size); + if (rv != CRYPTO_SUCCESS) { + goto out_nofpu; + } + out->cd_offset += chunk_size; + datap += chunk_size; + ctx->gcm_processed_data_len += chunk_size; + } + /* Check if we are already done. */ + if (bleft == 0) { + goto out_nofpu; + } + /* Bulk encrypt the remaining data. 
*/ + kfpu_begin(); + if (bleft >= GCM_AVX_MIN_ENCRYPT_BYTES) { + done = aesni_gcm_encrypt(datap, ct_buf, bleft, key, cb, ghash); + if (done == 0) { + rv = CRYPTO_FAILED; + goto out; + } + rv = crypto_put_output_data(ct_buf, out, done); + if (rv != CRYPTO_SUCCESS) { + goto out; + } + out->cd_offset += done; + ctx->gcm_processed_data_len += done; + datap += done; + bleft -= done; + + } + /* Less than GCM_AVX_MIN_ENCRYPT_BYTES remain, operate on blocks. */ + while (bleft > 0) { + if (bleft < block_size) { + bcopy(datap, ctx->gcm_remainder, bleft); + ctx->gcm_remainder_len = bleft; + ctx->gcm_copy_to = datap; + goto out; + } + /* Encrypt, hash and write out. */ + aes_encrypt_intel(key->encr_ks.ks32, key->nr, + (const uint32_t *)cb, (uint32_t *)tmp); + + gcm_xor_avx(datap, tmp); + GHASH_AVX(ctx, tmp, block_size); + rv = crypto_put_output_data(tmp, out, block_size); + if (rv != CRYPTO_SUCCESS) { + goto out; + } + out->cd_offset += block_size; + gcm_incr_counter_block(ctx); + ctx->gcm_processed_data_len += block_size; + datap += block_size; + bleft -= block_size; + } +out: + clear_fpu_regs(); + kfpu_end(); +out_nofpu: + if (ct_buf != NULL) { + vmem_free(ct_buf, chunk_size); + } + return (rv); +} + +/* + * Finalize the encryption: Zero fill, encrypt, hash and write out an eventual + * incomplete last block. Encrypt the ICB. Calculate the tag and write it out. + */ +static int +gcm_encrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) +{ + uint8_t *ghash = (uint8_t *)ctx->gcm_ghash; + uint32_t *J0 = (uint32_t *)ctx->gcm_J0; + uint8_t *remainder = (uint8_t *)ctx->gcm_remainder; + size_t rem_len = ctx->gcm_remainder_len; + const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; + int aes_rounds = ((aes_key_t *)keysched)->nr; + int rv; + + ASSERT(block_size == GCM_BLOCK_LEN); + + if (out->cd_length < (rem_len + ctx->gcm_tag_len)) { + return (CRYPTO_DATA_LEN_RANGE); + } + + kfpu_begin(); + /* Pad last incomplete block with zeros, encrypt and hash. */ + if (rem_len > 0) { + uint8_t *tmp = (uint8_t *)ctx->gcm_tmp; + const uint32_t *cb = (uint32_t *)ctx->gcm_cb; + + aes_encrypt_intel(keysched, aes_rounds, cb, (uint32_t *)tmp); + bzero(remainder + rem_len, block_size - rem_len); + for (int i = 0; i < rem_len; i++) { + remainder[i] ^= tmp[i]; + } + GHASH_AVX(ctx, remainder, block_size); + ctx->gcm_processed_data_len += rem_len; + /* No need to increment counter_block, it's the last block. */ + } + /* Finish tag. */ + ctx->gcm_len_a_len_c[1] = + htonll(CRYPTO_BYTES2BITS(ctx->gcm_processed_data_len)); + GHASH_AVX(ctx, (const uint8_t *)ctx->gcm_len_a_len_c, block_size); + aes_encrypt_intel(keysched, aes_rounds, J0, J0); + + gcm_xor_avx((uint8_t *)J0, ghash); + clear_fpu_regs(); + kfpu_end(); + + /* Output remainder. */ + if (rem_len > 0) { + rv = crypto_put_output_data(remainder, out, rem_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + } + out->cd_offset += rem_len; + ctx->gcm_remainder_len = 0; + rv = crypto_put_output_data(ghash, out, ctx->gcm_tag_len); + if (rv != CRYPTO_SUCCESS) + return (rv); + + out->cd_offset += ctx->gcm_tag_len; + /* Clear sensitive data in the context before returning. */ + gcm_clear_ctx(ctx); + return (CRYPTO_SUCCESS); +} + +/* + * Finalize decryption: we have only accumulated ciphertext so far, so now + * we decrypt it here in place. 
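gcm_decrypt_final_avx() below, like gcm_decrypt_final() earlier, compares the computed tag against the trailing input bytes with bcmp(). Its early exit could in principle reveal the length of the matching prefix through timing; the exposure is limited because all decryption work is already done at that point, but a constant-time comparison is the customary hedge. A sketch:

#include <stddef.h>
#include <stdint.h>

/* Returns 0 iff a and b are equal; runtime is independent of contents. */
static int
ct_memneq(const uint8_t *a, const uint8_t *b, size_t n)
{
	uint8_t acc = 0;

	for (size_t i = 0; i < n; i++)
		acc |= a[i] ^ b[i];	/* accumulate any differing bits */
	return (acc != 0);
}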
+ */ +static int +gcm_decrypt_final_avx(gcm_ctx_t *ctx, crypto_data_t *out, size_t block_size) +{ + ASSERT3U(ctx->gcm_processed_data_len, ==, ctx->gcm_pt_buf_len); + ASSERT3U(block_size, ==, 16); + + size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; + uint8_t *datap = ctx->gcm_pt_buf; + const aes_key_t *key = ((aes_key_t *)ctx->gcm_keysched); + uint32_t *cb = (uint32_t *)ctx->gcm_cb; + uint64_t *ghash = ctx->gcm_ghash; + uint32_t *tmp = (uint32_t *)ctx->gcm_tmp; + int rv = CRYPTO_SUCCESS; + size_t bleft, done; + + /* + * Decrypt in chunks of gcm_avx_chunk_size, which is asserted to be + * greater than or equal to GCM_AVX_MIN_ENCRYPT_BYTES, and a multiple of + * GCM_AVX_MIN_DECRYPT_BYTES. + */ + for (bleft = pt_len; bleft >= chunk_size; bleft -= chunk_size) { + kfpu_begin(); + done = aesni_gcm_decrypt(datap, datap, chunk_size, + (const void *)key, ctx->gcm_cb, ghash); + clear_fpu_regs(); + kfpu_end(); + if (done != chunk_size) { + return (CRYPTO_FAILED); + } + datap += done; + } + /* Decrypt remainder, which is less than chunk size, in one go. */ + kfpu_begin(); + if (bleft >= GCM_AVX_MIN_DECRYPT_BYTES) { + done = aesni_gcm_decrypt(datap, datap, bleft, + (const void *)key, ctx->gcm_cb, ghash); + if (done == 0) { + clear_fpu_regs(); + kfpu_end(); + return (CRYPTO_FAILED); + } + datap += done; + bleft -= done; + } + ASSERT(bleft < GCM_AVX_MIN_DECRYPT_BYTES); + + /* + * Now less than GCM_AVX_MIN_DECRYPT_BYTES bytes remain, + * decrypt them block by block. + */ + while (bleft > 0) { + /* Incomplete last block. */ + if (bleft < block_size) { + uint8_t *lastb = (uint8_t *)ctx->gcm_remainder; + + bzero(lastb, block_size); + bcopy(datap, lastb, bleft); + /* The GCM processing. */ + GHASH_AVX(ctx, lastb, block_size); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); + for (size_t i = 0; i < bleft; i++) { + datap[i] = lastb[i] ^ ((uint8_t *)tmp)[i]; + } + break; + } + /* The GCM processing. */ + GHASH_AVX(ctx, datap, block_size); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, cb, tmp); + gcm_xor_avx((uint8_t *)tmp, datap); + gcm_incr_counter_block(ctx); + + datap += block_size; + bleft -= block_size; + } + if (rv != CRYPTO_SUCCESS) { + clear_fpu_regs(); + kfpu_end(); + return (rv); + } + /* Decryption done, finish the tag. */ + ctx->gcm_len_a_len_c[1] = htonll(CRYPTO_BYTES2BITS(pt_len)); + GHASH_AVX(ctx, (uint8_t *)ctx->gcm_len_a_len_c, block_size); + aes_encrypt_intel(key->encr_ks.ks32, key->nr, (uint32_t *)ctx->gcm_J0, + (uint32_t *)ctx->gcm_J0); + + gcm_xor_avx((uint8_t *)ctx->gcm_J0, (uint8_t *)ghash); + + /* We are done with the FPU, restore its state. */ + clear_fpu_regs(); + kfpu_end(); + + /* Compare the input authentication tag with what we calculated. */ + if (bcmp(&ctx->gcm_pt_buf[pt_len], ghash, ctx->gcm_tag_len)) { + /* They don't match. */ + return (CRYPTO_INVALID_MAC); + } + rv = crypto_put_output_data(ctx->gcm_pt_buf, out, pt_len); + if (rv != CRYPTO_SUCCESS) { + return (rv); + } + out->cd_offset += pt_len; + gcm_clear_ctx(ctx); + return (CRYPTO_SUCCESS); +} + +/* + * Initialize the GCM params H, Htable and the counter block. Save the + * initial counter block. 
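gcm_init_avx() below repeats the standard construction of the pre-counter block J0 that gcm_format_initial_blocks() applied earlier: a 96-bit IV is suffixed with a big-endian 32-bit 1, while any other IV length is instead GHASHed down to a single block. The fast path, isolated as an illustrative helper:

#include <stdint.h>
#include <string.h>

/* NIST SP 800-38D: for a 96-bit IV, J0 = IV || 0^31 || 1. */
static void
gcm_make_j0(uint8_t j0[16], const uint8_t iv[12])
{
	memcpy(j0, iv, 12);
	j0[12] = 0;
	j0[13] = 0;
	j0[14] = 0;
	j0[15] = 1;
}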
+ */ +static int +gcm_init_avx(gcm_ctx_t *ctx, unsigned char *iv, size_t iv_len, + unsigned char *auth_data, size_t auth_data_len, size_t block_size) +{ + uint8_t *cb = (uint8_t *)ctx->gcm_cb; + uint64_t *H = ctx->gcm_H; + const void *keysched = ((aes_key_t *)ctx->gcm_keysched)->encr_ks.ks32; + int aes_rounds = ((aes_key_t *)ctx->gcm_keysched)->nr; + uint8_t *datap = auth_data; + size_t chunk_size = (size_t)GCM_CHUNK_SIZE_READ; + size_t bleft; + + ASSERT(block_size == GCM_BLOCK_LEN); + + /* Init H (encrypt zero block) and create the initial counter block. */ + bzero(ctx->gcm_ghash, sizeof (ctx->gcm_ghash)); + bzero(H, sizeof (ctx->gcm_H)); + kfpu_begin(); + aes_encrypt_intel(keysched, aes_rounds, + (const uint32_t *)H, (uint32_t *)H); + + gcm_init_htab_avx(ctx->gcm_Htable, H); + + if (iv_len == 12) { + bcopy(iv, cb, 12); + cb[12] = 0; + cb[13] = 0; + cb[14] = 0; + cb[15] = 1; + /* We need the ICB later. */ + bcopy(cb, ctx->gcm_J0, sizeof (ctx->gcm_J0)); + } else { + /* + * Most consumers use 12 byte IVs, so it's OK to use the + * original routines for other IV sizes, just avoid nesting + * kfpu_begin calls. + */ + clear_fpu_regs(); + kfpu_end(); + gcm_format_initial_blocks(iv, iv_len, ctx, block_size, + aes_copy_block, aes_xor_block); + kfpu_begin(); + } + + /* Openssl post increments the counter, adjust for that. */ + gcm_incr_counter_block(ctx); + + /* Ghash AAD in chunk_size blocks. */ + for (bleft = auth_data_len; bleft >= chunk_size; bleft -= chunk_size) { + GHASH_AVX(ctx, datap, chunk_size); + datap += chunk_size; + clear_fpu_regs(); + kfpu_end(); + kfpu_begin(); + } + /* Ghash the remainder and handle possible incomplete GCM block. */ + if (bleft > 0) { + size_t incomp = bleft % block_size; + + bleft -= incomp; + if (bleft > 0) { + GHASH_AVX(ctx, datap, bleft); + datap += bleft; + } + if (incomp > 0) { + /* Zero pad and hash incomplete last block. */ + uint8_t *authp = (uint8_t *)ctx->gcm_tmp; + + bzero(authp, block_size); + bcopy(datap, authp, incomp); + GHASH_AVX(ctx, authp, block_size); + } + } + clear_fpu_regs(); + kfpu_end(); + return (CRYPTO_SUCCESS); +} + +#if defined(_KERNEL) +static int +icp_gcm_avx_set_chunk_size(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + char val_rounded[16]; + int error = 0; + + error = kstrtoul(buf, 0, &val); + if (error) + return (error); + + val = (val / GCM_AVX_MIN_DECRYPT_BYTES) * GCM_AVX_MIN_DECRYPT_BYTES; + + if (val < GCM_AVX_MIN_ENCRYPT_BYTES || val > GCM_AVX_MAX_CHUNK_SIZE) + return (-EINVAL); + + snprintf(val_rounded, 16, "%u", (uint32_t)val); + error = param_set_uint(val_rounded, kp); + return (error); +} + +module_param_call(icp_gcm_avx_chunk_size, icp_gcm_avx_set_chunk_size, + param_get_uint, &gcm_avx_chunk_size, 0644); + +MODULE_PARM_DESC(icp_gcm_avx_chunk_size, + "How many bytes to process while owning the FPU"); + +#endif /* defined(__KERNEL) */ +#endif /* ifdef CAN_USE_GCM_ASM */ diff --git a/module/icp/algs/modes/gcm_generic.c b/module/icp/algs/modes/gcm_generic.c new file mode 100644 index 000000000000..16b57998a92f --- /dev/null +++ b/module/icp/algs/modes/gcm_generic.c @@ -0,0 +1,83 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include + +struct aes_block { + uint64_t a; + uint64_t b; +}; + +/* + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on *x_in and *y and place the result in *res. + * + * Byte swap the input (*x_in and *y) and the output (*res). + * + * Note: x_in, y, and res all point to 16-byte numbers (an array of two + * 64-bit integers). + */ +static void +gcm_generic_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) +{ + static const uint64_t R = 0xe100000000000000ULL; + struct aes_block z = {0, 0}; + struct aes_block v; + uint64_t x; + int i, j; + + v.a = ntohll(y[0]); + v.b = ntohll(y[1]); + + for (j = 0; j < 2; j++) { + x = ntohll(x_in[j]); + for (i = 0; i < 64; i++, x <<= 1) { + if (x & 0x8000000000000000ULL) { + z.a ^= v.a; + z.b ^= v.b; + } + if (v.b & 1ULL) { + v.b = (v.a << 63)|(v.b >> 1); + v.a = (v.a >> 1) ^ R; + } else { + v.b = (v.a << 63)|(v.b >> 1); + v.a = v.a >> 1; + } + } + } + res[0] = htonll(z.a); + res[1] = htonll(z.b); +} + +static boolean_t +gcm_generic_will_work(void) +{ + return (B_TRUE); +} + +const gcm_impl_ops_t gcm_generic_impl = { + .mul = &gcm_generic_mul, + .is_supported = &gcm_generic_will_work, + .name = "generic" +}; diff --git a/module/icp/algs/modes/gcm_pclmulqdq.c b/module/icp/algs/modes/gcm_pclmulqdq.c new file mode 100644 index 000000000000..05920115ce86 --- /dev/null +++ b/module/icp/algs/modes/gcm_pclmulqdq.c @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#if defined(__x86_64) && defined(HAVE_PCLMULQDQ) + +#include +#include + +/* These functions are used to execute pclmulqdq based assembly methods */ +extern void gcm_mul_pclmulqdq(uint64_t *, uint64_t *, uint64_t *); + +#include + +/* + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on *x_in and *y and place the result in *res. + * + * Byte swap the input (*x_in and *y) and the output (*res). + * + * Note: x_in, y, and res all point to 16-byte numbers (an array of two + * 64-bit integers). 
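For reference, the shift-and-XOR loop in gcm_generic_mul() above is a bit-serial multiplication in the GHASH field, which the pclmulqdq routine below computes with carry-less multiply instructions instead. In conventional notation the field is GF(2^128) reduced by

    p(x) = x^{128} + x^{7} + x^{2} + x + 1, \qquad x^{128} \equiv x^{7} + x^{2} + x + 1 \pmod{p(x)}

GCM stores field elements in bit-reflected order, so multiplying by x becomes a right shift; when the shifted-out bit is 1, the value is reduced by XORing in the constant R seen in the code, and R = 0xe1 shifted into the most significant byte works because 0xe1 = 11100001 in binary is exactly the reflected encoding of x^7 + x^2 + x + 1.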
+ */ +static void +gcm_pclmulqdq_mul(uint64_t *x_in, uint64_t *y, uint64_t *res) +{ + kfpu_begin(); + gcm_mul_pclmulqdq(x_in, y, res); + kfpu_end(); +} + +static boolean_t +gcm_pclmulqdq_will_work(void) +{ + return (kfpu_allowed() && zfs_pclmulqdq_available()); +} + +const gcm_impl_ops_t gcm_pclmulqdq_impl = { + .mul = &gcm_pclmulqdq_mul, + .is_supported = &gcm_pclmulqdq_will_work, + .name = "pclmulqdq" +}; + +#endif /* defined(__x86_64) && defined(HAVE_PCLMULQDQ) */ diff --git a/module/icp/algs/modes/modes.c b/module/icp/algs/modes/modes.c new file mode 100644 index 000000000000..f07876a478e2 --- /dev/null +++ b/module/icp/algs/modes/modes.c @@ -0,0 +1,157 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include + +/* + * Initialize by setting iov_or_mp to point to the current iovec or mp, + * and by setting current_offset to an offset within the current iovec or mp. + */ +void +crypto_init_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset) +{ + offset_t offset; + + switch (out->cd_format) { + case CRYPTO_DATA_RAW: + *current_offset = out->cd_offset; + break; + + case CRYPTO_DATA_UIO: { + uio_t *uiop = out->cd_uio; + uint_t vec_idx; + + offset = out->cd_offset; + offset = uio_index_at_offset(uiop, offset, &vec_idx); + + *current_offset = offset; + *iov_or_mp = (void *)(uintptr_t)vec_idx; + break; + } + } /* end switch */ +} + +/* + * Get pointers for where in the output to copy a block of encrypted or + * decrypted data. The iov_or_mp argument stores a pointer to the current + * iovec or mp, and offset stores an offset into the current iovec or mp. 
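The function body below hands back up to two output spans because a cipher block may straddle an iovec boundary. On the caller's side the write then follows the bcopy() pattern repeated throughout the mode implementations; as an illustrative helper, not the module's API:

#include <stdint.h>
#include <string.h>

/* Write one cipher block across at most two output spans. */
static void
write_split_block(const uint8_t *block, size_t block_size,
    uint8_t *out1, size_t out1_len, uint8_t *out2)
{
	memcpy(out1, block, out1_len);
	if (out2 != NULL)
		memcpy(out2, block + out1_len, block_size - out1_len);
}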
+ */ +void +crypto_get_ptrs(crypto_data_t *out, void **iov_or_mp, offset_t *current_offset, + uint8_t **out_data_1, size_t *out_data_1_len, uint8_t **out_data_2, + size_t amt) +{ + offset_t offset; + + switch (out->cd_format) { + case CRYPTO_DATA_RAW: { + iovec_t *iov; + + offset = *current_offset; + iov = &out->cd_raw; + if ((offset + amt) <= iov->iov_len) { + /* one block fits */ + *out_data_1 = (uint8_t *)iov->iov_base + offset; + *out_data_1_len = amt; + *out_data_2 = NULL; + *current_offset = offset + amt; + } + break; + } + + case CRYPTO_DATA_UIO: { + uio_t *uio = out->cd_uio; + offset_t offset; + uint_t vec_idx; + uint8_t *p; + uint64_t iov_len; + void *iov_base; + + offset = *current_offset; + vec_idx = (uintptr_t)(*iov_or_mp); + uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); + p = (uint8_t *)iov_base + offset; + *out_data_1 = p; + + if (offset + amt <= iov_len) { + /* can fit one block into this iov */ + *out_data_1_len = amt; + *out_data_2 = NULL; + *current_offset = offset + amt; + } else { + /* one block spans two iovecs */ + *out_data_1_len = iov_len - offset; + if (vec_idx == uio_iovcnt(uio)) + return; + vec_idx++; + uio_iov_at_index(uio, vec_idx, &iov_base, &iov_len); + *out_data_2 = (uint8_t *)iov_base; + *current_offset = amt - *out_data_1_len; + } + *iov_or_mp = (void *)(uintptr_t)vec_idx; + break; + } + } /* end switch */ +} + +void +crypto_free_mode_ctx(void *ctx) +{ + common_ctx_t *common_ctx = (common_ctx_t *)ctx; + + switch (common_ctx->cc_flags & + (ECB_MODE|CBC_MODE|CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) { + case ECB_MODE: + kmem_free(common_ctx, sizeof (ecb_ctx_t)); + break; + + case CBC_MODE: + kmem_free(common_ctx, sizeof (cbc_ctx_t)); + break; + + case CTR_MODE: + kmem_free(common_ctx, sizeof (ctr_ctx_t)); + break; + + case CCM_MODE: + if (((ccm_ctx_t *)ctx)->ccm_pt_buf != NULL) + vmem_free(((ccm_ctx_t *)ctx)->ccm_pt_buf, + ((ccm_ctx_t *)ctx)->ccm_data_len); + + kmem_free(ctx, sizeof (ccm_ctx_t)); + break; + + case GCM_MODE: + case GMAC_MODE: + if (((gcm_ctx_t *)ctx)->gcm_pt_buf != NULL) + vmem_free(((gcm_ctx_t *)ctx)->gcm_pt_buf, + ((gcm_ctx_t *)ctx)->gcm_pt_buf_len); + + kmem_free(ctx, sizeof (gcm_ctx_t)); + } +} diff --git a/module/icp/algs/sha1/sha1.c b/module/icp/algs/sha1/sha1.c new file mode 100644 index 000000000000..da34222c8fc3 --- /dev/null +++ b/module/icp/algs/sha1/sha1.c @@ -0,0 +1,835 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * The basic framework for this code came from the reference + * implementation for MD5. That implementation is Copyright (C) + * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. 
+ *
+ * NOTE: Cleaned-up and optimized version of SHA1, based on the FIPS 180-1
+ * standard, available at http://www.itl.nist.gov/fipspubs/fip180-1.htm
+ * Not as fast as one would like -- further optimizations are encouraged
+ * and appreciated.
+ */
+
+#include
+#include
+#include
+
+#ifdef _LITTLE_ENDIAN
+#include
+#define HAVE_HTONL
+#endif
+
+#define _RESTRICT_KYWD
+
+static void Encode(uint8_t *, const uint32_t *, size_t);
+
+#if defined(__sparc)
+
+#define SHA1_TRANSFORM(ctx, in) \
+	SHA1Transform((ctx)->state[0], (ctx)->state[1], (ctx)->state[2], \
+	    (ctx)->state[3], (ctx)->state[4], (ctx), (in))
+
+static void SHA1Transform(uint32_t, uint32_t, uint32_t, uint32_t, uint32_t,
+    SHA1_CTX *, const uint8_t *);
+
+#elif defined(__amd64)
+
+#define SHA1_TRANSFORM(ctx, in) sha1_block_data_order((ctx), (in), 1)
+#define SHA1_TRANSFORM_BLOCKS(ctx, in, num) sha1_block_data_order((ctx), \
+	(in), (num))
+
+void sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t num_blocks);
+
+#else
+
+#define SHA1_TRANSFORM(ctx, in) SHA1Transform((ctx), (in))
+
+static void SHA1Transform(SHA1_CTX *, const uint8_t *);
+
+#endif
+
+
+static uint8_t PADDING[64] = { 0x80, /* all zeros */ };
+
+/*
+ * F, G, and H are the basic SHA1 functions.
+ */
+#define F(b, c, d) (((b) & (c)) | ((~b) & (d)))
+#define G(b, c, d) ((b) ^ (c) ^ (d))
+#define H(b, c, d) (((b) & (c)) | (((b)|(c)) & (d)))
+
+/*
+ * SHA1Init()
+ *
+ * purpose: initializes the sha1 context and begins an sha1 digest operation
+ * input: SHA1_CTX * : the context to initialize.
+ * output: void
+ */
+
+void
+SHA1Init(SHA1_CTX *ctx)
+{
+	ctx->count[0] = ctx->count[1] = 0;
+
+	/*
+	 * load magic initialization constants. Tell lint
+	 * that these constants are unsigned by using U.
+	 */
+
+	ctx->state[0] = 0x67452301U;
+	ctx->state[1] = 0xefcdab89U;
+	ctx->state[2] = 0x98badcfeU;
+	ctx->state[3] = 0x10325476U;
+	ctx->state[4] = 0xc3d2e1f0U;
+}
+
+void
+SHA1Update(SHA1_CTX *ctx, const void *inptr, size_t input_len)
+{
+	uint32_t i, buf_index, buf_len;
+	const uint8_t *input = inptr;
+#if defined(__amd64)
+	uint32_t block_count;
+#endif	/* __amd64 */
+
+	/* check for noop */
+	if (input_len == 0)
+		return;
+
+	/* compute number of bytes mod 64 */
+	buf_index = (ctx->count[1] >> 3) & 0x3F;
+
+	/* update number of bits */
+	if ((ctx->count[1] += (input_len << 3)) < (input_len << 3))
+		ctx->count[0]++;
+
+	ctx->count[0] += (input_len >> 29);
+
+	buf_len = 64 - buf_index;
+
+	/* transform as many times as possible */
+	i = 0;
+	if (input_len >= buf_len) {
+
+		/*
+		 * general optimization:
+		 *
+		 * only do initial bcopy() and SHA1Transform() if
+		 * buf_index != 0. if buf_index == 0, we're just
+		 * wasting our time doing the bcopy() since there
+		 * wasn't any data left over from a previous call to
+		 * SHA1Update().
+		 */
+
+		if (buf_index) {
+			bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len);
+			SHA1_TRANSFORM(ctx, ctx->buf_un.buf8);
+			i = buf_len;
+		}
+
+#if !defined(__amd64)
+		for (; i + 63 < input_len; i += 64)
+			SHA1_TRANSFORM(ctx, &input[i]);
+#else
+		block_count = (input_len - i) >> 6;
+		if (block_count > 0) {
+			SHA1_TRANSFORM_BLOCKS(ctx, &input[i], block_count);
+			i += block_count << 6;
+		}
+#endif	/* !__amd64 */
+
+		/*
+		 * general optimization:
+		 *
+		 * if i and input_len are the same, return now instead
+		 * of calling bcopy(), since the bcopy() in this case
+		 * will be an expensive nop.
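+		 *
+		 * e.g. a caller that always feeds exact multiples of
+		 * 64 bytes ends every call with i == input_len and
+		 * never pays for the trailing bcopy(); a single
+		 * 100-byte update instead leaves 100 - 64 = 36 bytes
+		 * buffered for the next call.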
+ */ + + if (input_len == i) + return; + + buf_index = 0; + } + + /* buffer remaining input */ + bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i); +} + +/* + * SHA1Final() + * + * purpose: ends an sha1 digest operation, finalizing the message digest and + * zeroing the context. + * input: uchar_t * : A buffer to store the digest. + * : The function actually uses void* because many + * : callers pass things other than uchar_t here. + * SHA1_CTX * : the context to finalize, save, and zero + * output: void + */ + +void +SHA1Final(void *digest, SHA1_CTX *ctx) +{ + uint8_t bitcount_be[sizeof (ctx->count)]; + uint32_t index = (ctx->count[1] >> 3) & 0x3f; + + /* store bit count, big endian */ + Encode(bitcount_be, ctx->count, sizeof (bitcount_be)); + + /* pad out to 56 mod 64 */ + SHA1Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index); + + /* append length (before padding) */ + SHA1Update(ctx, bitcount_be, sizeof (bitcount_be)); + + /* store state in digest */ + Encode(digest, ctx->state, sizeof (ctx->state)); + + /* zeroize sensitive information */ + bzero(ctx, sizeof (*ctx)); +} + + +#if !defined(__amd64) + +typedef uint32_t sha1word; + +/* + * sparc optimization: + * + * on the sparc, we can load big endian 32-bit data easily. note that + * special care must be taken to ensure the address is 32-bit aligned. + * in the interest of speed, we don't check to make sure, since + * careful programming can guarantee this for us. + */ + +#if defined(_ZFS_BIG_ENDIAN) +#define LOAD_BIG_32(addr) (*(uint32_t *)(addr)) + +#elif defined(HAVE_HTONL) +#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr))) + +#else +#define LOAD_BIG_32(addr) BE_32(*((uint32_t *)(addr))) +#endif /* _BIG_ENDIAN */ + +/* + * SHA1Transform() + */ +#if defined(W_ARRAY) +#define W(n) w[n] +#else /* !defined(W_ARRAY) */ +#define W(n) w_ ## n +#endif /* !defined(W_ARRAY) */ + +/* + * ROTATE_LEFT rotates x left n bits. + */ + +#if defined(__GNUC__) && defined(_LP64) +static __inline__ uint64_t +ROTATE_LEFT(uint64_t value, uint32_t n) +{ + uint32_t t32; + + t32 = (uint32_t)value; + return ((t32 << n) | (t32 >> (32 - n))); +} + +#else + +#define ROTATE_LEFT(x, n) \ + (((x) << (n)) | ((x) >> ((sizeof (x) * NBBY)-(n)))) + +#endif + +#if defined(__sparc) + + +/* + * sparc register window optimization: + * + * `a', `b', `c', `d', and `e' are passed into SHA1Transform + * explicitly since it increases the number of registers available to + * the compiler. under this scheme, these variables can be held in + * %i0 - %i4, which leaves more local and out registers available. + * + * purpose: sha1 transformation -- updates the digest based on `block' + * input: uint32_t : bytes 1 - 4 of the digest + * uint32_t : bytes 5 - 8 of the digest + * uint32_t : bytes 9 - 12 of the digest + * uint32_t : bytes 12 - 16 of the digest + * uint32_t : bytes 16 - 20 of the digest + * SHA1_CTX * : the context to update + * uint8_t [64]: the block to use to update the digest + * output: void + */ + + +void +SHA1Transform(uint32_t a, uint32_t b, uint32_t c, uint32_t d, uint32_t e, + SHA1_CTX *ctx, const uint8_t blk[64]) +{ + /* + * sparc optimization: + * + * while it is somewhat counter-intuitive, on sparc, it is + * more efficient to place all the constants used in this + * function in an array and load the values out of the array + * than to manually load the constants. 
this is because + * setting a register to a 32-bit value takes two ops in most + * cases: a `sethi' and an `or', but loading a 32-bit value + * from memory only takes one `ld' (or `lduw' on v9). while + * this increases memory usage, the compiler can find enough + * other things to do while waiting to keep the pipeline does + * not stall. additionally, it is likely that many of these + * constants are cached so that later accesses do not even go + * out to the bus. + * + * this array is declared `static' to keep the compiler from + * having to bcopy() this array onto the stack frame of + * SHA1Transform() each time it is called -- which is + * unacceptably expensive. + * + * the `const' is to ensure that callers are good citizens and + * do not try to munge the array. since these routines are + * going to be called from inside multithreaded kernelland, + * this is a good safety check. -- `sha1_consts' will end up in + * .rodata. + * + * unfortunately, loading from an array in this manner hurts + * performance under Intel. So, there is a macro, + * SHA1_CONST(), used in SHA1Transform(), that either expands to + * a reference to this array, or to the actual constant, + * depending on what platform this code is compiled for. + */ + + + static const uint32_t sha1_consts[] = { + SHA1_CONST_0, SHA1_CONST_1, SHA1_CONST_2, SHA1_CONST_3 + }; + + + /* + * general optimization: + * + * use individual integers instead of using an array. this is a + * win, although the amount it wins by seems to vary quite a bit. + */ + + + uint32_t w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7; + uint32_t w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15; + + + /* + * sparc optimization: + * + * if `block' is already aligned on a 4-byte boundary, use + * LOAD_BIG_32() directly. otherwise, bcopy() into a + * buffer that *is* aligned on a 4-byte boundary and then do + * the LOAD_BIG_32() on that buffer. benchmarks have shown + * that using the bcopy() is better than loading the bytes + * individually and doing the endian-swap by hand. + * + * even though it's quite tempting to assign to do: + * + * blk = bcopy(ctx->buf_un.buf32, blk, sizeof (ctx->buf_un.buf32)); + * + * and only have one set of LOAD_BIG_32()'s, the compiler + * *does not* like that, so please resist the urge. + */ + + + if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? 
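+					 * if so, stage the block in
+					 * buf_un via the bcopy() below;
+					 * the union's uint32_t member
+					 * keeps that buffer 32-bit
+					 * aligned.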
*/ + bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32)); + w_15 = LOAD_BIG_32(ctx->buf_un.buf32 + 15); + w_14 = LOAD_BIG_32(ctx->buf_un.buf32 + 14); + w_13 = LOAD_BIG_32(ctx->buf_un.buf32 + 13); + w_12 = LOAD_BIG_32(ctx->buf_un.buf32 + 12); + w_11 = LOAD_BIG_32(ctx->buf_un.buf32 + 11); + w_10 = LOAD_BIG_32(ctx->buf_un.buf32 + 10); + w_9 = LOAD_BIG_32(ctx->buf_un.buf32 + 9); + w_8 = LOAD_BIG_32(ctx->buf_un.buf32 + 8); + w_7 = LOAD_BIG_32(ctx->buf_un.buf32 + 7); + w_6 = LOAD_BIG_32(ctx->buf_un.buf32 + 6); + w_5 = LOAD_BIG_32(ctx->buf_un.buf32 + 5); + w_4 = LOAD_BIG_32(ctx->buf_un.buf32 + 4); + w_3 = LOAD_BIG_32(ctx->buf_un.buf32 + 3); + w_2 = LOAD_BIG_32(ctx->buf_un.buf32 + 2); + w_1 = LOAD_BIG_32(ctx->buf_un.buf32 + 1); + w_0 = LOAD_BIG_32(ctx->buf_un.buf32 + 0); + } else { + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_15 = LOAD_BIG_32(blk + 60); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_14 = LOAD_BIG_32(blk + 56); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_13 = LOAD_BIG_32(blk + 52); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_12 = LOAD_BIG_32(blk + 48); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_11 = LOAD_BIG_32(blk + 44); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_10 = LOAD_BIG_32(blk + 40); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_9 = LOAD_BIG_32(blk + 36); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_8 = LOAD_BIG_32(blk + 32); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_7 = LOAD_BIG_32(blk + 28); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_6 = LOAD_BIG_32(blk + 24); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_5 = LOAD_BIG_32(blk + 20); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_4 = LOAD_BIG_32(blk + 16); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_3 = LOAD_BIG_32(blk + 12); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_2 = LOAD_BIG_32(blk + 8); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_1 = LOAD_BIG_32(blk + 4); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w_0 = LOAD_BIG_32(blk + 0); + } +#else /* !defined(__sparc) */ + +void /* CSTYLED */ +SHA1Transform(SHA1_CTX *ctx, const uint8_t blk[64]) +{ + /* CSTYLED */ + sha1word a = ctx->state[0]; + sha1word b = ctx->state[1]; + sha1word c = ctx->state[2]; + sha1word d = ctx->state[3]; + sha1word e = ctx->state[4]; + +#if defined(W_ARRAY) + sha1word w[16]; +#else /* !defined(W_ARRAY) */ + sha1word w_0, w_1, w_2, w_3, w_4, w_5, w_6, w_7; + sha1word w_8, w_9, w_10, w_11, w_12, w_13, w_14, w_15; +#endif /* !defined(W_ARRAY) */ + + W(0) = LOAD_BIG_32((void *)(blk + 0)); + W(1) = LOAD_BIG_32((void *)(blk + 4)); + W(2) = LOAD_BIG_32((void *)(blk + 8)); + W(3) = LOAD_BIG_32((void *)(blk + 12)); + W(4) = LOAD_BIG_32((void *)(blk + 16)); + W(5) = LOAD_BIG_32((void *)(blk + 20)); + W(6) = LOAD_BIG_32((void *)(blk + 24)); + W(7) = LOAD_BIG_32((void *)(blk + 28)); + W(8) = LOAD_BIG_32((void *)(blk + 32)); + W(9) = LOAD_BIG_32((void *)(blk + 36)); + W(10) = LOAD_BIG_32((void *)(blk + 40)); + W(11) = LOAD_BIG_32((void *)(blk + 44)); + W(12) = LOAD_BIG_32((void *)(blk + 48)); + W(13) = LOAD_BIG_32((void *)(blk + 52)); + W(14) = LOAD_BIG_32((void *)(blk + 56)); + W(15) = LOAD_BIG_32((void *)(blk + 60)); + +#endif /* !defined(__sparc) */ + + /* + * general optimization: + * + * even though this approach is described in the standard as + * being slower algorithmically, it is 30-40% faster than the + * "faster" version under SPARC, because this version has more + * of the constraints specified at compile-time and uses fewer + * variables (and therefore has better register utilization) + * than its "speedier" brother. 
(i've tried both, trust me) + * + * for either method given in the spec, there is an "assignment" + * phase where the following takes place: + * + * tmp = (main_computation); + * e = d; d = c; c = rotate_left(b, 30); b = a; a = tmp; + * + * we can make the algorithm go faster by not doing this work, + * but just pretending that `d' is now `e', etc. this works + * really well and obviates the need for a temporary variable. + * however, we still explicitly perform the rotate action, + * since it is cheaper on SPARC to do it once than to have to + * do it over and over again. + */ + + /* round 1 */ + e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(0) + SHA1_CONST(0); /* 0 */ + b = ROTATE_LEFT(b, 30); + + d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(1) + SHA1_CONST(0); /* 1 */ + a = ROTATE_LEFT(a, 30); + + c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(2) + SHA1_CONST(0); /* 2 */ + e = ROTATE_LEFT(e, 30); + + b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(3) + SHA1_CONST(0); /* 3 */ + d = ROTATE_LEFT(d, 30); + + a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(4) + SHA1_CONST(0); /* 4 */ + c = ROTATE_LEFT(c, 30); + + e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(5) + SHA1_CONST(0); /* 5 */ + b = ROTATE_LEFT(b, 30); + + d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(6) + SHA1_CONST(0); /* 6 */ + a = ROTATE_LEFT(a, 30); + + c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(7) + SHA1_CONST(0); /* 7 */ + e = ROTATE_LEFT(e, 30); + + b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(8) + SHA1_CONST(0); /* 8 */ + d = ROTATE_LEFT(d, 30); + + a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(9) + SHA1_CONST(0); /* 9 */ + c = ROTATE_LEFT(c, 30); + + e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(10) + SHA1_CONST(0); /* 10 */ + b = ROTATE_LEFT(b, 30); + + d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(11) + SHA1_CONST(0); /* 11 */ + a = ROTATE_LEFT(a, 30); + + c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(12) + SHA1_CONST(0); /* 12 */ + e = ROTATE_LEFT(e, 30); + + b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(13) + SHA1_CONST(0); /* 13 */ + d = ROTATE_LEFT(d, 30); + + a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(14) + SHA1_CONST(0); /* 14 */ + c = ROTATE_LEFT(c, 30); + + e = ROTATE_LEFT(a, 5) + F(b, c, d) + e + W(15) + SHA1_CONST(0); /* 15 */ + b = ROTATE_LEFT(b, 30); + + W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 16 */ + d = ROTATE_LEFT(e, 5) + F(a, b, c) + d + W(0) + SHA1_CONST(0); + a = ROTATE_LEFT(a, 30); + + W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 17 */ + c = ROTATE_LEFT(d, 5) + F(e, a, b) + c + W(1) + SHA1_CONST(0); + e = ROTATE_LEFT(e, 30); + + W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 18 */ + b = ROTATE_LEFT(c, 5) + F(d, e, a) + b + W(2) + SHA1_CONST(0); + d = ROTATE_LEFT(d, 30); + + W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 19 */ + a = ROTATE_LEFT(b, 5) + F(c, d, e) + a + W(3) + SHA1_CONST(0); + c = ROTATE_LEFT(c, 30); + + /* round 2 */ + W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 20 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(4) + SHA1_CONST(1); + b = ROTATE_LEFT(b, 30); + + W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 21 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(5) + SHA1_CONST(1); + a = ROTATE_LEFT(a, 30); + + W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 22 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(6) + SHA1_CONST(1); + e = ROTATE_LEFT(e, 30); + + W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 23 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(7) + SHA1_CONST(1); + d = ROTATE_LEFT(d, 30); + + W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 
1); /* 24 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(8) + SHA1_CONST(1); + c = ROTATE_LEFT(c, 30); + + W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 25 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(9) + SHA1_CONST(1); + b = ROTATE_LEFT(b, 30); + + W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 26 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(10) + SHA1_CONST(1); + a = ROTATE_LEFT(a, 30); + + W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 27 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(11) + SHA1_CONST(1); + e = ROTATE_LEFT(e, 30); + + W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 28 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(12) + SHA1_CONST(1); + d = ROTATE_LEFT(d, 30); + + W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 29 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(13) + SHA1_CONST(1); + c = ROTATE_LEFT(c, 30); + + W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 30 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(14) + SHA1_CONST(1); + b = ROTATE_LEFT(b, 30); + + W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 31 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(15) + SHA1_CONST(1); + a = ROTATE_LEFT(a, 30); + + W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 32 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(0) + SHA1_CONST(1); + e = ROTATE_LEFT(e, 30); + + W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 33 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(1) + SHA1_CONST(1); + d = ROTATE_LEFT(d, 30); + + W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 34 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(2) + SHA1_CONST(1); + c = ROTATE_LEFT(c, 30); + + W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 35 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(3) + SHA1_CONST(1); + b = ROTATE_LEFT(b, 30); + + W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 36 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(4) + SHA1_CONST(1); + a = ROTATE_LEFT(a, 30); + + W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 37 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(5) + SHA1_CONST(1); + e = ROTATE_LEFT(e, 30); + + W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 38 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(6) + SHA1_CONST(1); + d = ROTATE_LEFT(d, 30); + + W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 39 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(7) + SHA1_CONST(1); + c = ROTATE_LEFT(c, 30); + + /* round 3 */ + W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 40 */ + e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(8) + SHA1_CONST(2); + b = ROTATE_LEFT(b, 30); + + W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 41 */ + d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(9) + SHA1_CONST(2); + a = ROTATE_LEFT(a, 30); + + W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 42 */ + c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(10) + SHA1_CONST(2); + e = ROTATE_LEFT(e, 30); + + W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 43 */ + b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(11) + SHA1_CONST(2); + d = ROTATE_LEFT(d, 30); + + W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 44 */ + a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(12) + SHA1_CONST(2); + c = ROTATE_LEFT(c, 30); + + W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 45 */ + e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(13) + SHA1_CONST(2); + b = ROTATE_LEFT(b, 30); + + W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 46 */ + d = ROTATE_LEFT(e, 5) + H(a, b, c) 
+ d + W(14) + SHA1_CONST(2); + a = ROTATE_LEFT(a, 30); + + W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 47 */ + c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(15) + SHA1_CONST(2); + e = ROTATE_LEFT(e, 30); + + W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 48 */ + b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(0) + SHA1_CONST(2); + d = ROTATE_LEFT(d, 30); + + W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 49 */ + a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(1) + SHA1_CONST(2); + c = ROTATE_LEFT(c, 30); + + W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 50 */ + e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(2) + SHA1_CONST(2); + b = ROTATE_LEFT(b, 30); + + W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 51 */ + d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(3) + SHA1_CONST(2); + a = ROTATE_LEFT(a, 30); + + W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 52 */ + c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(4) + SHA1_CONST(2); + e = ROTATE_LEFT(e, 30); + + W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 53 */ + b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(5) + SHA1_CONST(2); + d = ROTATE_LEFT(d, 30); + + W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 54 */ + a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(6) + SHA1_CONST(2); + c = ROTATE_LEFT(c, 30); + + W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 55 */ + e = ROTATE_LEFT(a, 5) + H(b, c, d) + e + W(7) + SHA1_CONST(2); + b = ROTATE_LEFT(b, 30); + + W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 56 */ + d = ROTATE_LEFT(e, 5) + H(a, b, c) + d + W(8) + SHA1_CONST(2); + a = ROTATE_LEFT(a, 30); + + W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 57 */ + c = ROTATE_LEFT(d, 5) + H(e, a, b) + c + W(9) + SHA1_CONST(2); + e = ROTATE_LEFT(e, 30); + + W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 58 */ + b = ROTATE_LEFT(c, 5) + H(d, e, a) + b + W(10) + SHA1_CONST(2); + d = ROTATE_LEFT(d, 30); + + W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 59 */ + a = ROTATE_LEFT(b, 5) + H(c, d, e) + a + W(11) + SHA1_CONST(2); + c = ROTATE_LEFT(c, 30); + + /* round 4 */ + W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 60 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(12) + SHA1_CONST(3); + b = ROTATE_LEFT(b, 30); + + W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 61 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(13) + SHA1_CONST(3); + a = ROTATE_LEFT(a, 30); + + W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 62 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(14) + SHA1_CONST(3); + e = ROTATE_LEFT(e, 30); + + W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 63 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(15) + SHA1_CONST(3); + d = ROTATE_LEFT(d, 30); + + W(0) = ROTATE_LEFT((W(13) ^ W(8) ^ W(2) ^ W(0)), 1); /* 64 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(0) + SHA1_CONST(3); + c = ROTATE_LEFT(c, 30); + + W(1) = ROTATE_LEFT((W(14) ^ W(9) ^ W(3) ^ W(1)), 1); /* 65 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(1) + SHA1_CONST(3); + b = ROTATE_LEFT(b, 30); + + W(2) = ROTATE_LEFT((W(15) ^ W(10) ^ W(4) ^ W(2)), 1); /* 66 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(2) + SHA1_CONST(3); + a = ROTATE_LEFT(a, 30); + + W(3) = ROTATE_LEFT((W(0) ^ W(11) ^ W(5) ^ W(3)), 1); /* 67 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(3) + SHA1_CONST(3); + e = ROTATE_LEFT(e, 30); + + W(4) = ROTATE_LEFT((W(1) ^ W(12) ^ W(6) ^ W(4)), 1); /* 68 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(4) + SHA1_CONST(3); + d = ROTATE_LEFT(d, 30); + + 
W(5) = ROTATE_LEFT((W(2) ^ W(13) ^ W(7) ^ W(5)), 1); /* 69 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(5) + SHA1_CONST(3); + c = ROTATE_LEFT(c, 30); + + W(6) = ROTATE_LEFT((W(3) ^ W(14) ^ W(8) ^ W(6)), 1); /* 70 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(6) + SHA1_CONST(3); + b = ROTATE_LEFT(b, 30); + + W(7) = ROTATE_LEFT((W(4) ^ W(15) ^ W(9) ^ W(7)), 1); /* 71 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(7) + SHA1_CONST(3); + a = ROTATE_LEFT(a, 30); + + W(8) = ROTATE_LEFT((W(5) ^ W(0) ^ W(10) ^ W(8)), 1); /* 72 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(8) + SHA1_CONST(3); + e = ROTATE_LEFT(e, 30); + + W(9) = ROTATE_LEFT((W(6) ^ W(1) ^ W(11) ^ W(9)), 1); /* 73 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(9) + SHA1_CONST(3); + d = ROTATE_LEFT(d, 30); + + W(10) = ROTATE_LEFT((W(7) ^ W(2) ^ W(12) ^ W(10)), 1); /* 74 */ + a = ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(10) + SHA1_CONST(3); + c = ROTATE_LEFT(c, 30); + + W(11) = ROTATE_LEFT((W(8) ^ W(3) ^ W(13) ^ W(11)), 1); /* 75 */ + e = ROTATE_LEFT(a, 5) + G(b, c, d) + e + W(11) + SHA1_CONST(3); + b = ROTATE_LEFT(b, 30); + + W(12) = ROTATE_LEFT((W(9) ^ W(4) ^ W(14) ^ W(12)), 1); /* 76 */ + d = ROTATE_LEFT(e, 5) + G(a, b, c) + d + W(12) + SHA1_CONST(3); + a = ROTATE_LEFT(a, 30); + + W(13) = ROTATE_LEFT((W(10) ^ W(5) ^ W(15) ^ W(13)), 1); /* 77 */ + c = ROTATE_LEFT(d, 5) + G(e, a, b) + c + W(13) + SHA1_CONST(3); + e = ROTATE_LEFT(e, 30); + + W(14) = ROTATE_LEFT((W(11) ^ W(6) ^ W(0) ^ W(14)), 1); /* 78 */ + b = ROTATE_LEFT(c, 5) + G(d, e, a) + b + W(14) + SHA1_CONST(3); + d = ROTATE_LEFT(d, 30); + + W(15) = ROTATE_LEFT((W(12) ^ W(7) ^ W(1) ^ W(15)), 1); /* 79 */ + + ctx->state[0] += ROTATE_LEFT(b, 5) + G(c, d, e) + a + W(15) + + SHA1_CONST(3); + ctx->state[1] += b; + ctx->state[2] += ROTATE_LEFT(c, 30); + ctx->state[3] += d; + ctx->state[4] += e; + + /* zeroize sensitive information */ + W(0) = W(1) = W(2) = W(3) = W(4) = W(5) = W(6) = W(7) = W(8) = 0; + W(9) = W(10) = W(11) = W(12) = W(13) = W(14) = W(15) = 0; +} +#endif /* !__amd64 */ + + +/* + * Encode() + * + * purpose: to convert a list of numbers from little endian to big endian + * input: uint8_t * : place to store the converted big endian numbers + * uint32_t * : place to get numbers to convert from + * size_t : the length of the input in bytes + * output: void + */ + +static void +Encode(uint8_t *_RESTRICT_KYWD output, const uint32_t *_RESTRICT_KYWD input, + size_t len) +{ + size_t i, j; + +#if defined(__sparc) + if (IS_P2ALIGNED(output, sizeof (uint32_t))) { + for (i = 0, j = 0; j < len; i++, j += 4) { + /* LINTED E_BAD_PTR_CAST_ALIGN */ + *((uint32_t *)(output + j)) = input[i]; + } + } else { +#endif /* little endian -- will work on big endian, but slowly */ + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (input[i] >> 24) & 0xff; + output[j + 1] = (input[i] >> 16) & 0xff; + output[j + 2] = (input[i] >> 8) & 0xff; + output[j + 3] = input[i] & 0xff; + } +#if defined(__sparc) + } +#endif +} diff --git a/module/icp/algs/sha2/sha2.c b/module/icp/algs/sha2/sha2.c new file mode 100644 index 000000000000..75f6a3c1af4b --- /dev/null +++ b/module/icp/algs/sha2/sha2.c @@ -0,0 +1,956 @@ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +/* + * The basic framework for this code came from the reference + * implementation for MD5. That implementation is Copyright (C) + * 1991-2, RSA Data Security, Inc. Created 1991. All rights reserved. 
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * NOTE: Cleaned-up and optimized version of SHA2, based on the FIPS 180-2
+ * standard, available at
+ * http://csrc.nist.gov/publications/fips/fips180-2/fips180-2.pdf
+ * Not as fast as one would like -- further optimizations are encouraged
+ * and appreciated.
+ */
+
+#include
+#define _SHA2_IMPL
+#include
+#include
+
+#define _RESTRICT_KYWD
+
+#ifdef _ZFS_LITTLE_ENDIAN
+#include
+#define HAVE_HTONL
+#endif
+#include	/* for _ILP32 */
+
+static void Encode(uint8_t *, uint32_t *, size_t);
+static void Encode64(uint8_t *, uint64_t *, size_t);
+
+/* userspace only supports the generic version */
+#if defined(__amd64) && defined(_KERNEL)
+#define SHA512Transform(ctx, in) SHA512TransformBlocks((ctx), (in), 1)
+#define SHA256Transform(ctx, in) SHA256TransformBlocks((ctx), (in), 1)
+
+void SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+void SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num);
+
+#else
+static void SHA256Transform(SHA2_CTX *, const uint8_t *);
+static void SHA512Transform(SHA2_CTX *, const uint8_t *);
+#endif	/* __amd64 && _KERNEL */
+
+static uint8_t PADDING[128] = { 0x80, /* all zeros */ };
+
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stacks are enforced (like 32-bit kernel builds), insert compiler memory
+ * barriers to reduce stack frame size. This can reduce the SHA512Transform()
+ * stack frame usage from 3k to <1k on ARM32, for example.
+ */
+#if defined(_ILP32) || defined(__powerpc)	/* small stack */
+#define SMALL_STACK_MEMORY_BARRIER	asm volatile("": : :"memory");
+#else
+#define SMALL_STACK_MEMORY_BARRIER
+#endif
+
+/* Ch and Maj are the basic SHA2 functions. */
+#define Ch(b, c, d) (((b) & (c)) ^ ((~b) & (d)))
+#define Maj(b, c, d) (((b) & (c)) ^ ((b) & (d)) ^ ((c) & (d)))
+
+/* Rotates x right n bits.
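+ * The shift width is derived from sizeof (x), so the same macro serves
+ * both the 32-bit SHA-256 and the 64-bit SHA-384/512 word sizes; e.g.
+ * for a uint32_t x, ROTR(0x00000001U, 1) == 0x80000000U.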
*/ +#define ROTR(x, n) \ + (((x) >> (n)) | ((x) << ((sizeof (x) * NBBY)-(n)))) + +/* Shift x right n bits */ +#define SHR(x, n) ((x) >> (n)) + +/* SHA256 Functions */ +#define BIGSIGMA0_256(x) (ROTR((x), 2) ^ ROTR((x), 13) ^ ROTR((x), 22)) +#define BIGSIGMA1_256(x) (ROTR((x), 6) ^ ROTR((x), 11) ^ ROTR((x), 25)) +#define SIGMA0_256(x) (ROTR((x), 7) ^ ROTR((x), 18) ^ SHR((x), 3)) +#define SIGMA1_256(x) (ROTR((x), 17) ^ ROTR((x), 19) ^ SHR((x), 10)) + +#define SHA256ROUND(a, b, c, d, e, f, g, h, i, w) \ + T1 = h + BIGSIGMA1_256(e) + Ch(e, f, g) + SHA256_CONST(i) + w; \ + d += T1; \ + T2 = BIGSIGMA0_256(a) + Maj(a, b, c); \ + h = T1 + T2 + +/* SHA384/512 Functions */ +#define BIGSIGMA0(x) (ROTR((x), 28) ^ ROTR((x), 34) ^ ROTR((x), 39)) +#define BIGSIGMA1(x) (ROTR((x), 14) ^ ROTR((x), 18) ^ ROTR((x), 41)) +#define SIGMA0(x) (ROTR((x), 1) ^ ROTR((x), 8) ^ SHR((x), 7)) +#define SIGMA1(x) (ROTR((x), 19) ^ ROTR((x), 61) ^ SHR((x), 6)) +#define SHA512ROUND(a, b, c, d, e, f, g, h, i, w) \ + T1 = h + BIGSIGMA1(e) + Ch(e, f, g) + SHA512_CONST(i) + w; \ + d += T1; \ + T2 = BIGSIGMA0(a) + Maj(a, b, c); \ + h = T1 + T2; \ + SMALL_STACK_MEMORY_BARRIER; + +/* + * sparc optimization: + * + * on the sparc, we can load big endian 32-bit data easily. note that + * special care must be taken to ensure the address is 32-bit aligned. + * in the interest of speed, we don't check to make sure, since + * careful programming can guarantee this for us. + */ + +#if defined(_ZFS_BIG_ENDIAN) +#define LOAD_BIG_32(addr) (*(uint32_t *)(addr)) +#define LOAD_BIG_64(addr) (*(uint64_t *)(addr)) + +#elif defined(HAVE_HTONL) +#define LOAD_BIG_32(addr) htonl(*((uint32_t *)(addr))) +#define LOAD_BIG_64(addr) htonll(*((uint64_t *)(addr))) + +#else +/* little endian -- will work on big endian, but slowly */ +#define LOAD_BIG_32(addr) \ + (((addr)[0] << 24) | ((addr)[1] << 16) | ((addr)[2] << 8) | (addr)[3]) +#define LOAD_BIG_64(addr) \ + (((uint64_t)(addr)[0] << 56) | ((uint64_t)(addr)[1] << 48) | \ + ((uint64_t)(addr)[2] << 40) | ((uint64_t)(addr)[3] << 32) | \ + ((uint64_t)(addr)[4] << 24) | ((uint64_t)(addr)[5] << 16) | \ + ((uint64_t)(addr)[6] << 8) | (uint64_t)(addr)[7]) +#endif /* _BIG_ENDIAN */ + + +#if !defined(__amd64) || !defined(_KERNEL) +/* SHA256 Transform */ + +static void +SHA256Transform(SHA2_CTX *ctx, const uint8_t *blk) +{ + uint32_t a = ctx->state.s32[0]; + uint32_t b = ctx->state.s32[1]; + uint32_t c = ctx->state.s32[2]; + uint32_t d = ctx->state.s32[3]; + uint32_t e = ctx->state.s32[4]; + uint32_t f = ctx->state.s32[5]; + uint32_t g = ctx->state.s32[6]; + uint32_t h = ctx->state.s32[7]; + + uint32_t w0, w1, w2, w3, w4, w5, w6, w7; + uint32_t w8, w9, w10, w11, w12, w13, w14, w15; + uint32_t T1, T2; + +#if defined(__sparc) + static const uint32_t sha256_consts[] = { + SHA256_CONST_0, SHA256_CONST_1, SHA256_CONST_2, + SHA256_CONST_3, SHA256_CONST_4, SHA256_CONST_5, + SHA256_CONST_6, SHA256_CONST_7, SHA256_CONST_8, + SHA256_CONST_9, SHA256_CONST_10, SHA256_CONST_11, + SHA256_CONST_12, SHA256_CONST_13, SHA256_CONST_14, + SHA256_CONST_15, SHA256_CONST_16, SHA256_CONST_17, + SHA256_CONST_18, SHA256_CONST_19, SHA256_CONST_20, + SHA256_CONST_21, SHA256_CONST_22, SHA256_CONST_23, + SHA256_CONST_24, SHA256_CONST_25, SHA256_CONST_26, + SHA256_CONST_27, SHA256_CONST_28, SHA256_CONST_29, + SHA256_CONST_30, SHA256_CONST_31, SHA256_CONST_32, + SHA256_CONST_33, SHA256_CONST_34, SHA256_CONST_35, + SHA256_CONST_36, SHA256_CONST_37, SHA256_CONST_38, + SHA256_CONST_39, SHA256_CONST_40, SHA256_CONST_41, + SHA256_CONST_42, 
SHA256_CONST_43, SHA256_CONST_44, + SHA256_CONST_45, SHA256_CONST_46, SHA256_CONST_47, + SHA256_CONST_48, SHA256_CONST_49, SHA256_CONST_50, + SHA256_CONST_51, SHA256_CONST_52, SHA256_CONST_53, + SHA256_CONST_54, SHA256_CONST_55, SHA256_CONST_56, + SHA256_CONST_57, SHA256_CONST_58, SHA256_CONST_59, + SHA256_CONST_60, SHA256_CONST_61, SHA256_CONST_62, + SHA256_CONST_63 + }; +#endif /* __sparc */ + + if ((uintptr_t)blk & 0x3) { /* not 4-byte aligned? */ + bcopy(blk, ctx->buf_un.buf32, sizeof (ctx->buf_un.buf32)); + blk = (uint8_t *)ctx->buf_un.buf32; + } + + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w0 = LOAD_BIG_32(blk + 4 * 0); + SHA256ROUND(a, b, c, d, e, f, g, h, 0, w0); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w1 = LOAD_BIG_32(blk + 4 * 1); + SHA256ROUND(h, a, b, c, d, e, f, g, 1, w1); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w2 = LOAD_BIG_32(blk + 4 * 2); + SHA256ROUND(g, h, a, b, c, d, e, f, 2, w2); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w3 = LOAD_BIG_32(blk + 4 * 3); + SHA256ROUND(f, g, h, a, b, c, d, e, 3, w3); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w4 = LOAD_BIG_32(blk + 4 * 4); + SHA256ROUND(e, f, g, h, a, b, c, d, 4, w4); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w5 = LOAD_BIG_32(blk + 4 * 5); + SHA256ROUND(d, e, f, g, h, a, b, c, 5, w5); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w6 = LOAD_BIG_32(blk + 4 * 6); + SHA256ROUND(c, d, e, f, g, h, a, b, 6, w6); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w7 = LOAD_BIG_32(blk + 4 * 7); + SHA256ROUND(b, c, d, e, f, g, h, a, 7, w7); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w8 = LOAD_BIG_32(blk + 4 * 8); + SHA256ROUND(a, b, c, d, e, f, g, h, 8, w8); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w9 = LOAD_BIG_32(blk + 4 * 9); + SHA256ROUND(h, a, b, c, d, e, f, g, 9, w9); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w10 = LOAD_BIG_32(blk + 4 * 10); + SHA256ROUND(g, h, a, b, c, d, e, f, 10, w10); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w11 = LOAD_BIG_32(blk + 4 * 11); + SHA256ROUND(f, g, h, a, b, c, d, e, 11, w11); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w12 = LOAD_BIG_32(blk + 4 * 12); + SHA256ROUND(e, f, g, h, a, b, c, d, 12, w12); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w13 = LOAD_BIG_32(blk + 4 * 13); + SHA256ROUND(d, e, f, g, h, a, b, c, 13, w13); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w14 = LOAD_BIG_32(blk + 4 * 14); + SHA256ROUND(c, d, e, f, g, h, a, b, 14, w14); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w15 = LOAD_BIG_32(blk + 4 * 15); + SHA256ROUND(b, c, d, e, f, g, h, a, 15, w15); + + w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; + SHA256ROUND(a, b, c, d, e, f, g, h, 16, w0); + w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; + SHA256ROUND(h, a, b, c, d, e, f, g, 17, w1); + w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; + SHA256ROUND(g, h, a, b, c, d, e, f, 18, w2); + w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; + SHA256ROUND(f, g, h, a, b, c, d, e, 19, w3); + w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; + SHA256ROUND(e, f, g, h, a, b, c, d, 20, w4); + w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; + SHA256ROUND(d, e, f, g, h, a, b, c, 21, w5); + w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; + SHA256ROUND(c, d, e, f, g, h, a, b, 22, w6); + w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; + SHA256ROUND(b, c, d, e, f, g, h, a, 23, w7); + w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; + SHA256ROUND(a, b, c, d, e, f, g, h, 24, w8); + w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; + SHA256ROUND(h, a, b, c, d, e, f, g, 25, w9); + w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; + SHA256ROUND(g, h, a, b, c, d, e, f, 26, w10); + w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + 
w11; + SHA256ROUND(f, g, h, a, b, c, d, e, 27, w11); + w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; + SHA256ROUND(e, f, g, h, a, b, c, d, 28, w12); + w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; + SHA256ROUND(d, e, f, g, h, a, b, c, 29, w13); + w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; + SHA256ROUND(c, d, e, f, g, h, a, b, 30, w14); + w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; + SHA256ROUND(b, c, d, e, f, g, h, a, 31, w15); + + w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; + SHA256ROUND(a, b, c, d, e, f, g, h, 32, w0); + w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; + SHA256ROUND(h, a, b, c, d, e, f, g, 33, w1); + w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; + SHA256ROUND(g, h, a, b, c, d, e, f, 34, w2); + w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; + SHA256ROUND(f, g, h, a, b, c, d, e, 35, w3); + w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; + SHA256ROUND(e, f, g, h, a, b, c, d, 36, w4); + w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; + SHA256ROUND(d, e, f, g, h, a, b, c, 37, w5); + w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; + SHA256ROUND(c, d, e, f, g, h, a, b, 38, w6); + w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; + SHA256ROUND(b, c, d, e, f, g, h, a, 39, w7); + w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; + SHA256ROUND(a, b, c, d, e, f, g, h, 40, w8); + w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; + SHA256ROUND(h, a, b, c, d, e, f, g, 41, w9); + w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; + SHA256ROUND(g, h, a, b, c, d, e, f, 42, w10); + w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; + SHA256ROUND(f, g, h, a, b, c, d, e, 43, w11); + w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; + SHA256ROUND(e, f, g, h, a, b, c, d, 44, w12); + w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; + SHA256ROUND(d, e, f, g, h, a, b, c, 45, w13); + w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; + SHA256ROUND(c, d, e, f, g, h, a, b, 46, w14); + w15 = SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; + SHA256ROUND(b, c, d, e, f, g, h, a, 47, w15); + + w0 = SIGMA1_256(w14) + w9 + SIGMA0_256(w1) + w0; + SHA256ROUND(a, b, c, d, e, f, g, h, 48, w0); + w1 = SIGMA1_256(w15) + w10 + SIGMA0_256(w2) + w1; + SHA256ROUND(h, a, b, c, d, e, f, g, 49, w1); + w2 = SIGMA1_256(w0) + w11 + SIGMA0_256(w3) + w2; + SHA256ROUND(g, h, a, b, c, d, e, f, 50, w2); + w3 = SIGMA1_256(w1) + w12 + SIGMA0_256(w4) + w3; + SHA256ROUND(f, g, h, a, b, c, d, e, 51, w3); + w4 = SIGMA1_256(w2) + w13 + SIGMA0_256(w5) + w4; + SHA256ROUND(e, f, g, h, a, b, c, d, 52, w4); + w5 = SIGMA1_256(w3) + w14 + SIGMA0_256(w6) + w5; + SHA256ROUND(d, e, f, g, h, a, b, c, 53, w5); + w6 = SIGMA1_256(w4) + w15 + SIGMA0_256(w7) + w6; + SHA256ROUND(c, d, e, f, g, h, a, b, 54, w6); + w7 = SIGMA1_256(w5) + w0 + SIGMA0_256(w8) + w7; + SHA256ROUND(b, c, d, e, f, g, h, a, 55, w7); + w8 = SIGMA1_256(w6) + w1 + SIGMA0_256(w9) + w8; + SHA256ROUND(a, b, c, d, e, f, g, h, 56, w8); + w9 = SIGMA1_256(w7) + w2 + SIGMA0_256(w10) + w9; + SHA256ROUND(h, a, b, c, d, e, f, g, 57, w9); + w10 = SIGMA1_256(w8) + w3 + SIGMA0_256(w11) + w10; + SHA256ROUND(g, h, a, b, c, d, e, f, 58, w10); + w11 = SIGMA1_256(w9) + w4 + SIGMA0_256(w12) + w11; + SHA256ROUND(f, g, h, a, b, c, d, e, 59, w11); + w12 = SIGMA1_256(w10) + w5 + SIGMA0_256(w13) + w12; + SHA256ROUND(e, f, g, h, a, b, c, d, 60, w12); + w13 = SIGMA1_256(w11) + w6 + SIGMA0_256(w14) + w13; + SHA256ROUND(d, e, f, g, h, a, b, c, 61, w13); + w14 = SIGMA1_256(w12) + w7 + SIGMA0_256(w15) + w14; + SHA256ROUND(c, d, e, f, g, h, a, b, 62, w14); + w15 = 
SIGMA1_256(w13) + w8 + SIGMA0_256(w0) + w15; + SHA256ROUND(b, c, d, e, f, g, h, a, 63, w15); + + ctx->state.s32[0] += a; + ctx->state.s32[1] += b; + ctx->state.s32[2] += c; + ctx->state.s32[3] += d; + ctx->state.s32[4] += e; + ctx->state.s32[5] += f; + ctx->state.s32[6] += g; + ctx->state.s32[7] += h; +} + + +/* SHA384 and SHA512 Transform */ + +static void +SHA512Transform(SHA2_CTX *ctx, const uint8_t *blk) +{ + + uint64_t a = ctx->state.s64[0]; + uint64_t b = ctx->state.s64[1]; + uint64_t c = ctx->state.s64[2]; + uint64_t d = ctx->state.s64[3]; + uint64_t e = ctx->state.s64[4]; + uint64_t f = ctx->state.s64[5]; + uint64_t g = ctx->state.s64[6]; + uint64_t h = ctx->state.s64[7]; + + uint64_t w0, w1, w2, w3, w4, w5, w6, w7; + uint64_t w8, w9, w10, w11, w12, w13, w14, w15; + uint64_t T1, T2; + +#if defined(__sparc) + static const uint64_t sha512_consts[] = { + SHA512_CONST_0, SHA512_CONST_1, SHA512_CONST_2, + SHA512_CONST_3, SHA512_CONST_4, SHA512_CONST_5, + SHA512_CONST_6, SHA512_CONST_7, SHA512_CONST_8, + SHA512_CONST_9, SHA512_CONST_10, SHA512_CONST_11, + SHA512_CONST_12, SHA512_CONST_13, SHA512_CONST_14, + SHA512_CONST_15, SHA512_CONST_16, SHA512_CONST_17, + SHA512_CONST_18, SHA512_CONST_19, SHA512_CONST_20, + SHA512_CONST_21, SHA512_CONST_22, SHA512_CONST_23, + SHA512_CONST_24, SHA512_CONST_25, SHA512_CONST_26, + SHA512_CONST_27, SHA512_CONST_28, SHA512_CONST_29, + SHA512_CONST_30, SHA512_CONST_31, SHA512_CONST_32, + SHA512_CONST_33, SHA512_CONST_34, SHA512_CONST_35, + SHA512_CONST_36, SHA512_CONST_37, SHA512_CONST_38, + SHA512_CONST_39, SHA512_CONST_40, SHA512_CONST_41, + SHA512_CONST_42, SHA512_CONST_43, SHA512_CONST_44, + SHA512_CONST_45, SHA512_CONST_46, SHA512_CONST_47, + SHA512_CONST_48, SHA512_CONST_49, SHA512_CONST_50, + SHA512_CONST_51, SHA512_CONST_52, SHA512_CONST_53, + SHA512_CONST_54, SHA512_CONST_55, SHA512_CONST_56, + SHA512_CONST_57, SHA512_CONST_58, SHA512_CONST_59, + SHA512_CONST_60, SHA512_CONST_61, SHA512_CONST_62, + SHA512_CONST_63, SHA512_CONST_64, SHA512_CONST_65, + SHA512_CONST_66, SHA512_CONST_67, SHA512_CONST_68, + SHA512_CONST_69, SHA512_CONST_70, SHA512_CONST_71, + SHA512_CONST_72, SHA512_CONST_73, SHA512_CONST_74, + SHA512_CONST_75, SHA512_CONST_76, SHA512_CONST_77, + SHA512_CONST_78, SHA512_CONST_79 + }; +#endif /* __sparc */ + + + if ((uintptr_t)blk & 0x7) { /* not 8-byte aligned? 
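+					 * if so, stage the block in
+					 * buf_un via the bcopy() below;
+					 * the union's uint64_t member
+					 * keeps that buffer 64-bit
+					 * aligned.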
*/ + bcopy(blk, ctx->buf_un.buf64, sizeof (ctx->buf_un.buf64)); + blk = (uint8_t *)ctx->buf_un.buf64; + } + + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w0 = LOAD_BIG_64(blk + 8 * 0); + SHA512ROUND(a, b, c, d, e, f, g, h, 0, w0); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w1 = LOAD_BIG_64(blk + 8 * 1); + SHA512ROUND(h, a, b, c, d, e, f, g, 1, w1); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w2 = LOAD_BIG_64(blk + 8 * 2); + SHA512ROUND(g, h, a, b, c, d, e, f, 2, w2); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w3 = LOAD_BIG_64(blk + 8 * 3); + SHA512ROUND(f, g, h, a, b, c, d, e, 3, w3); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w4 = LOAD_BIG_64(blk + 8 * 4); + SHA512ROUND(e, f, g, h, a, b, c, d, 4, w4); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w5 = LOAD_BIG_64(blk + 8 * 5); + SHA512ROUND(d, e, f, g, h, a, b, c, 5, w5); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w6 = LOAD_BIG_64(blk + 8 * 6); + SHA512ROUND(c, d, e, f, g, h, a, b, 6, w6); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w7 = LOAD_BIG_64(blk + 8 * 7); + SHA512ROUND(b, c, d, e, f, g, h, a, 7, w7); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w8 = LOAD_BIG_64(blk + 8 * 8); + SHA512ROUND(a, b, c, d, e, f, g, h, 8, w8); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w9 = LOAD_BIG_64(blk + 8 * 9); + SHA512ROUND(h, a, b, c, d, e, f, g, 9, w9); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w10 = LOAD_BIG_64(blk + 8 * 10); + SHA512ROUND(g, h, a, b, c, d, e, f, 10, w10); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w11 = LOAD_BIG_64(blk + 8 * 11); + SHA512ROUND(f, g, h, a, b, c, d, e, 11, w11); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w12 = LOAD_BIG_64(blk + 8 * 12); + SHA512ROUND(e, f, g, h, a, b, c, d, 12, w12); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w13 = LOAD_BIG_64(blk + 8 * 13); + SHA512ROUND(d, e, f, g, h, a, b, c, 13, w13); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w14 = LOAD_BIG_64(blk + 8 * 14); + SHA512ROUND(c, d, e, f, g, h, a, b, 14, w14); + /* LINTED E_BAD_PTR_CAST_ALIGN */ + w15 = LOAD_BIG_64(blk + 8 * 15); + SHA512ROUND(b, c, d, e, f, g, h, a, 15, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 16, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 17, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 18, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 19, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 20, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 21, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 22, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 23, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 24, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 25, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 26, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 27, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 28, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 29, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 30, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 31, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 32, w0); + w1 = SIGMA1(w15) + w10 + 
SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 33, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 34, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 35, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 36, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 37, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 38, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 39, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 40, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 41, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 42, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 43, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 44, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 45, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 46, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 47, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 48, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 49, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 50, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 51, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 52, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 53, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 54, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 55, w7); + w8 = SIGMA1(w6) + w1 + SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 56, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 57, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 58, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 59, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 60, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 61, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 62, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 63, w15); + + w0 = SIGMA1(w14) + w9 + SIGMA0(w1) + w0; + SHA512ROUND(a, b, c, d, e, f, g, h, 64, w0); + w1 = SIGMA1(w15) + w10 + SIGMA0(w2) + w1; + SHA512ROUND(h, a, b, c, d, e, f, g, 65, w1); + w2 = SIGMA1(w0) + w11 + SIGMA0(w3) + w2; + SHA512ROUND(g, h, a, b, c, d, e, f, 66, w2); + w3 = SIGMA1(w1) + w12 + SIGMA0(w4) + w3; + SHA512ROUND(f, g, h, a, b, c, d, e, 67, w3); + w4 = SIGMA1(w2) + w13 + SIGMA0(w5) + w4; + SHA512ROUND(e, f, g, h, a, b, c, d, 68, w4); + w5 = SIGMA1(w3) + w14 + SIGMA0(w6) + w5; + SHA512ROUND(d, e, f, g, h, a, b, c, 69, w5); + w6 = SIGMA1(w4) + w15 + SIGMA0(w7) + w6; + SHA512ROUND(c, d, e, f, g, h, a, b, 70, w6); + w7 = SIGMA1(w5) + w0 + SIGMA0(w8) + w7; + SHA512ROUND(b, c, d, e, f, g, h, a, 71, w7); + w8 = SIGMA1(w6) + w1 + 
SIGMA0(w9) + w8; + SHA512ROUND(a, b, c, d, e, f, g, h, 72, w8); + w9 = SIGMA1(w7) + w2 + SIGMA0(w10) + w9; + SHA512ROUND(h, a, b, c, d, e, f, g, 73, w9); + w10 = SIGMA1(w8) + w3 + SIGMA0(w11) + w10; + SHA512ROUND(g, h, a, b, c, d, e, f, 74, w10); + w11 = SIGMA1(w9) + w4 + SIGMA0(w12) + w11; + SHA512ROUND(f, g, h, a, b, c, d, e, 75, w11); + w12 = SIGMA1(w10) + w5 + SIGMA0(w13) + w12; + SHA512ROUND(e, f, g, h, a, b, c, d, 76, w12); + w13 = SIGMA1(w11) + w6 + SIGMA0(w14) + w13; + SHA512ROUND(d, e, f, g, h, a, b, c, 77, w13); + w14 = SIGMA1(w12) + w7 + SIGMA0(w15) + w14; + SHA512ROUND(c, d, e, f, g, h, a, b, 78, w14); + w15 = SIGMA1(w13) + w8 + SIGMA0(w0) + w15; + SHA512ROUND(b, c, d, e, f, g, h, a, 79, w15); + + ctx->state.s64[0] += a; + ctx->state.s64[1] += b; + ctx->state.s64[2] += c; + ctx->state.s64[3] += d; + ctx->state.s64[4] += e; + ctx->state.s64[5] += f; + ctx->state.s64[6] += g; + ctx->state.s64[7] += h; + +} +#endif /* !__amd64 || !_KERNEL */ + + +/* + * Encode() + * + * purpose: to convert a list of numbers from little endian to big endian + * input: uint8_t * : place to store the converted big endian numbers + * uint32_t * : place to get numbers to convert from + * size_t : the length of the input in bytes + * output: void + */ + +static void +Encode(uint8_t *_RESTRICT_KYWD output, uint32_t *_RESTRICT_KYWD input, + size_t len) +{ + size_t i, j; + +#if defined(__sparc) + if (IS_P2ALIGNED(output, sizeof (uint32_t))) { + for (i = 0, j = 0; j < len; i++, j += 4) { + /* LINTED E_BAD_PTR_CAST_ALIGN */ + *((uint32_t *)(output + j)) = input[i]; + } + } else { +#endif /* little endian -- will work on big endian, but slowly */ + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (input[i] >> 24) & 0xff; + output[j + 1] = (input[i] >> 16) & 0xff; + output[j + 2] = (input[i] >> 8) & 0xff; + output[j + 3] = input[i] & 0xff; + } +#if defined(__sparc) + } +#endif +} + +static void +Encode64(uint8_t *_RESTRICT_KYWD output, uint64_t *_RESTRICT_KYWD input, + size_t len) +{ + size_t i, j; + +#if defined(__sparc) + if (IS_P2ALIGNED(output, sizeof (uint64_t))) { + for (i = 0, j = 0; j < len; i++, j += 8) { + /* LINTED E_BAD_PTR_CAST_ALIGN */ + *((uint64_t *)(output + j)) = input[i]; + } + } else { +#endif /* little endian -- will work on big endian, but slowly */ + for (i = 0, j = 0; j < len; i++, j += 8) { + + output[j] = (input[i] >> 56) & 0xff; + output[j + 1] = (input[i] >> 48) & 0xff; + output[j + 2] = (input[i] >> 40) & 0xff; + output[j + 3] = (input[i] >> 32) & 0xff; + output[j + 4] = (input[i] >> 24) & 0xff; + output[j + 5] = (input[i] >> 16) & 0xff; + output[j + 6] = (input[i] >> 8) & 0xff; + output[j + 7] = input[i] & 0xff; + } +#if defined(__sparc) + } +#endif +} + + +void +SHA2Init(uint64_t mech, SHA2_CTX *ctx) +{ + + switch (mech) { + case SHA256_MECH_INFO_TYPE: + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + ctx->state.s32[0] = 0x6a09e667U; + ctx->state.s32[1] = 0xbb67ae85U; + ctx->state.s32[2] = 0x3c6ef372U; + ctx->state.s32[3] = 0xa54ff53aU; + ctx->state.s32[4] = 0x510e527fU; + ctx->state.s32[5] = 0x9b05688cU; + ctx->state.s32[6] = 0x1f83d9abU; + ctx->state.s32[7] = 0x5be0cd19U; + break; + case SHA384_MECH_INFO_TYPE: + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + ctx->state.s64[0] = 0xcbbb9d5dc1059ed8ULL; + ctx->state.s64[1] = 0x629a292a367cd507ULL; + ctx->state.s64[2] = 0x9159015a3070dd17ULL; + ctx->state.s64[3] = 0x152fecd8f70e5939ULL; + ctx->state.s64[4] = 0x67332667ffc00b31ULL; + ctx->state.s64[5] = 
0x8eb44a8768581511ULL; + ctx->state.s64[6] = 0xdb0c2e0d64f98fa7ULL; + ctx->state.s64[7] = 0x47b5481dbefa4fa4ULL; + break; + case SHA512_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + ctx->state.s64[0] = 0x6a09e667f3bcc908ULL; + ctx->state.s64[1] = 0xbb67ae8584caa73bULL; + ctx->state.s64[2] = 0x3c6ef372fe94f82bULL; + ctx->state.s64[3] = 0xa54ff53a5f1d36f1ULL; + ctx->state.s64[4] = 0x510e527fade682d1ULL; + ctx->state.s64[5] = 0x9b05688c2b3e6c1fULL; + ctx->state.s64[6] = 0x1f83d9abfb41bd6bULL; + ctx->state.s64[7] = 0x5be0cd19137e2179ULL; + break; + case SHA512_224_MECH_INFO_TYPE: + ctx->state.s64[0] = 0x8C3D37C819544DA2ULL; + ctx->state.s64[1] = 0x73E1996689DCD4D6ULL; + ctx->state.s64[2] = 0x1DFAB7AE32FF9C82ULL; + ctx->state.s64[3] = 0x679DD514582F9FCFULL; + ctx->state.s64[4] = 0x0F6D2B697BD44DA8ULL; + ctx->state.s64[5] = 0x77E36F7304C48942ULL; + ctx->state.s64[6] = 0x3F9D85A86A1D36C8ULL; + ctx->state.s64[7] = 0x1112E6AD91D692A1ULL; + break; + case SHA512_256_MECH_INFO_TYPE: + ctx->state.s64[0] = 0x22312194FC2BF72CULL; + ctx->state.s64[1] = 0x9F555FA3C84C64C2ULL; + ctx->state.s64[2] = 0x2393B86B6F53B151ULL; + ctx->state.s64[3] = 0x963877195940EABDULL; + ctx->state.s64[4] = 0x96283EE2A88EFFE3ULL; + ctx->state.s64[5] = 0xBE5E1E2553863992ULL; + ctx->state.s64[6] = 0x2B0199FC2C85B8AAULL; + ctx->state.s64[7] = 0x0EB72DDC81C52CA2ULL; + break; +#ifdef _KERNEL + default: + cmn_err(CE_PANIC, + "sha2_init: failed to find a supported algorithm: 0x%x", + (uint32_t)mech); + +#endif /* _KERNEL */ + } + + ctx->algotype = (uint32_t)mech; + ctx->count.c64[0] = ctx->count.c64[1] = 0; +} + +#ifndef _KERNEL + +// #pragma inline(SHA256Init, SHA384Init, SHA512Init) +void +SHA256Init(SHA256_CTX *ctx) +{ + SHA2Init(SHA256, ctx); +} + +void +SHA384Init(SHA384_CTX *ctx) +{ + SHA2Init(SHA384, ctx); +} + +void +SHA512Init(SHA512_CTX *ctx) +{ + SHA2Init(SHA512, ctx); +} + +#endif /* _KERNEL */ + +/* + * SHA2Update() + * + * purpose: continues an sha2 digest operation, using the message block + * to update the context. + * input: SHA2_CTX * : the context to update + * void * : the message block + * size_t : the length of the message block, in bytes + * output: void + */ + +void +SHA2Update(SHA2_CTX *ctx, const void *inptr, size_t input_len) +{ + uint32_t i, buf_index, buf_len, buf_limit; + const uint8_t *input = inptr; + uint32_t algotype = ctx->algotype; + + /* check for noop */ + if (input_len == 0) + return; + + if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { + buf_limit = 64; + + /* compute number of bytes mod 64 */ + buf_index = (ctx->count.c32[1] >> 3) & 0x3F; + + /* update number of bits */ + if ((ctx->count.c32[1] += (input_len << 3)) < (input_len << 3)) + ctx->count.c32[0]++; + + ctx->count.c32[0] += (input_len >> 29); + + } else { + buf_limit = 128; + + /* compute number of bytes mod 128 */ + buf_index = (ctx->count.c64[1] >> 3) & 0x7F; + + /* update number of bits */ + if ((ctx->count.c64[1] += (input_len << 3)) < (input_len << 3)) + ctx->count.c64[0]++; + + ctx->count.c64[0] += (input_len >> 29); + } + + buf_len = buf_limit - buf_index; + + /* transform as many times as possible */ + i = 0; + if (input_len >= buf_len) { + + /* + * general optimization: + * + * only do initial bcopy() and SHA2Transform() if + * buf_index != 0. if buf_index == 0, we're just + * wasting our time doing the bcopy() since there + * wasn't any data left over from a previous call to + * SHA2Update(). 
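+		 *
+		 * the bookkeeping mirrors SHA1Update(), except that the
+		 * SHA-384/512 family uses 128-byte blocks: e.g. after a
+		 * total of 200 bytes, count.c64[1] holds 1600 (a bit
+		 * count), so buf_index = (1600 >> 3) & 0x7F = 72 and
+		 * buf_len = 128 - 72 = 56.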
+ */ + if (buf_index) { + bcopy(input, &ctx->buf_un.buf8[buf_index], buf_len); + if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) + SHA256Transform(ctx, ctx->buf_un.buf8); + else + SHA512Transform(ctx, ctx->buf_un.buf8); + + i = buf_len; + } + +#if !defined(__amd64) || !defined(_KERNEL) + if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { + for (; i + buf_limit - 1 < input_len; i += buf_limit) { + SHA256Transform(ctx, &input[i]); + } + } else { + for (; i + buf_limit - 1 < input_len; i += buf_limit) { + SHA512Transform(ctx, &input[i]); + } + } + +#else + uint32_t block_count; + if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { + block_count = (input_len - i) >> 6; + if (block_count > 0) { + SHA256TransformBlocks(ctx, &input[i], + block_count); + i += block_count << 6; + } + } else { + block_count = (input_len - i) >> 7; + if (block_count > 0) { + SHA512TransformBlocks(ctx, &input[i], + block_count); + i += block_count << 7; + } + } +#endif /* !__amd64 || !_KERNEL */ + + /* + * general optimization: + * + * if i and input_len are the same, return now instead + * of calling bcopy(), since the bcopy() in this case + * will be an expensive noop. + */ + + if (input_len == i) + return; + + buf_index = 0; + } + + /* buffer remaining input */ + bcopy(&input[i], &ctx->buf_un.buf8[buf_index], input_len - i); +} + + +/* + * SHA2Final() + * + * purpose: ends an sha2 digest operation, finalizing the message digest and + * zeroing the context. + * input: uchar_t * : a buffer to store the digest + * : The function actually uses void* because many + * : callers pass things other than uchar_t here. + * SHA2_CTX * : the context to finalize, save, and zero + * output: void + */ + +void +SHA2Final(void *digest, SHA2_CTX *ctx) +{ + uint8_t bitcount_be[sizeof (ctx->count.c32)]; + uint8_t bitcount_be64[sizeof (ctx->count.c64)]; + uint32_t index; + uint32_t algotype = ctx->algotype; + + if (algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { + index = (ctx->count.c32[1] >> 3) & 0x3f; + Encode(bitcount_be, ctx->count.c32, sizeof (bitcount_be)); + SHA2Update(ctx, PADDING, ((index < 56) ? 56 : 120) - index); + SHA2Update(ctx, bitcount_be, sizeof (bitcount_be)); + Encode(digest, ctx->state.s32, sizeof (ctx->state.s32)); + } else { + index = (ctx->count.c64[1] >> 3) & 0x7f; + Encode64(bitcount_be64, ctx->count.c64, + sizeof (bitcount_be64)); + SHA2Update(ctx, PADDING, ((index < 112) ? 
112 : 240) - index); + SHA2Update(ctx, bitcount_be64, sizeof (bitcount_be64)); + if (algotype <= SHA384_HMAC_GEN_MECH_INFO_TYPE) { + ctx->state.s64[6] = ctx->state.s64[7] = 0; + Encode64(digest, ctx->state.s64, + sizeof (uint64_t) * 6); + } else if (algotype == SHA512_224_MECH_INFO_TYPE) { + uint8_t last[sizeof (uint64_t)]; + /* + * Since SHA-512/224 doesn't align well to 64-bit + * boundaries, we must do the encoding in three steps: + * 1) encode the three 64-bit words that fit neatly + * 2) encode the last 64-bit word to a temp buffer + * 3) chop out the lower 32-bits from the temp buffer + * and append them to the digest + */ + Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 3); + Encode64(last, &ctx->state.s64[3], sizeof (uint64_t)); + bcopy(last, (uint8_t *)digest + 24, 4); + } else if (algotype == SHA512_256_MECH_INFO_TYPE) { + Encode64(digest, ctx->state.s64, sizeof (uint64_t) * 4); + } else { + Encode64(digest, ctx->state.s64, + sizeof (ctx->state.s64)); + } + } + + /* zeroize sensitive information */ + bzero(ctx, sizeof (*ctx)); +} + +#ifdef _KERNEL +EXPORT_SYMBOL(SHA2Init); +EXPORT_SYMBOL(SHA2Update); +EXPORT_SYMBOL(SHA2Final); +#endif diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE b/module/icp/algs/skein/THIRDPARTYLICENSE new file mode 100644 index 000000000000..b7434fd17872 --- /dev/null +++ b/module/icp/algs/skein/THIRDPARTYLICENSE @@ -0,0 +1,3 @@ +Implementation of the Skein hash function. +Source code author: Doug Whiting, 2008. +This algorithm and source code is released to the public domain. diff --git a/module/icp/algs/skein/THIRDPARTYLICENSE.descrip b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip new file mode 100644 index 000000000000..0ae89cfdf5ce --- /dev/null +++ b/module/icp/algs/skein/THIRDPARTYLICENSE.descrip @@ -0,0 +1 @@ +LICENSE TERMS OF SKEIN HASH ALGORITHM IMPLEMENTATION diff --git a/module/icp/algs/skein/skein.c b/module/icp/algs/skein/skein.c new file mode 100644 index 000000000000..83fe84260307 --- /dev/null +++ b/module/icp/algs/skein/skein.c @@ -0,0 +1,911 @@ +/* + * Implementation of the Skein hash function. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. 
*/ + +#include +#include +#include /* get the Skein API definitions */ +#include "skein_impl.h" /* get internal definitions */ + +/* 256-bit Skein */ +/* init the context for a straight hashing operation */ +int +Skein_256_Init(Skein_256_Ctxt_t *ctx, size_t hashBitLen) +{ + union { + uint8_t b[SKEIN_256_STATE_BYTES]; + uint64_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 256: + bcopy(SKEIN_256_IV_256, ctx->X, sizeof (ctx->X)); + break; + case 224: + bcopy(SKEIN_256_IV_224, ctx->X, sizeof (ctx->X)); + break; + case 160: + bcopy(SKEIN_256_IV_160, ctx->X, sizeof (ctx->X)); + break; + case 128: + bcopy(SKEIN_256_IV_128, ctx->X, sizeof (ctx->X)); + break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* + * build/process the config block, type == CONFIG (could be + * precomputed) + */ + /* set tweaks: T0=0; T1=CFG | FINAL */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + /* set the schema, version */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + /* zero pad config block */ + bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0])); + + /* compute the initial chaining values from config block */ + /* zero the chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } + /* + * The chaining vars ctx->X are now initialized for the given + * hashBitLen. + * Set up to process the data message portion of the hash (default) + */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ + + return (SKEIN_SUCCESS); +} + +/* init the context for a MAC and/or tree hash operation */ +/* + * [identical to Skein_256_Init() when keyBytes == 0 && + * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] + */ +int +Skein_256_InitExt(Skein_256_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo, + const uint8_t *key, size_t keyBytes) +{ + union { + uint8_t b[SKEIN_256_STATE_BYTES]; + uint64_t w[SKEIN_256_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) { /* is there a key? 
*/ + /* no key: use all zeroes as key for config block */ + bzero(ctx->X, sizeof (ctx->X)); + } else { /* here to pre-process a key */ + + Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X)); + /* do a mini-Init right here */ + /* set output hash bit count = state size */ + ctx->h.hashBitLen = 8 * sizeof (ctx->X); + /* set tweaks: T0 = 0; T1 = KEY type */ + Skein_Start_New_Type(ctx, KEY); + /* zero the initial chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + /* hash the key */ + (void) Skein_256_Update(ctx, key, keyBytes); + /* put result into cfg.b[] */ + (void) Skein_256_Final_Pad(ctx, cfg.b); + /* copy over into ctx->X[] */ + bcopy(cfg.b, ctx->X, sizeof (cfg.b)); +#if SKEIN_NEED_SWAP + { + uint_t i; + /* convert key bytes to context words */ + for (i = 0; i < SKEIN_256_STATE_WORDS; i++) + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* + * build/process the config block, type == CONFIG (could be + * precomputed for each key) + */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + cfg.w[2] = Skein_Swap64(treeInfo); + + Skein_Show_Key(256, &ctx->h, key, keyBytes); + + /* compute the initial chaining values from config block */ + Skein_256_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx, MSG); + + return (SKEIN_SUCCESS); +} + +/* process the input bytes */ +int +Skein_256_Update(Skein_256_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt) +{ + size_t n; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_256_BLOCK_BYTES) { + /* finish up any buffered message data */ + if (ctx->h.bCnt) { + /* # bytes free in buffer b[] */ + n = SKEIN_256_BLOCK_BYTES - ctx->h.bCnt; + if (n) { + /* check on our logic here */ + Skein_assert(n < msgByteCnt); + bcopy(msg, &ctx->b[ctx->h.bCnt], n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_256_BLOCK_BYTES); + Skein_256_Process_Block(ctx, ctx->b, 1, + SKEIN_256_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* + * now process any remaining full blocks, directly from input + * message data + */ + if (msgByteCnt > SKEIN_256_BLOCK_BYTES) { + /* number of full blocks to process */ + n = (msgByteCnt - 1) / SKEIN_256_BLOCK_BYTES; + Skein_256_Process_Block(ctx, msg, n, + SKEIN_256_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_256_BLOCK_BYTES; + msg += n * SKEIN_256_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES); + bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the result */ +int +Skein_256_Final(Skein_256_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_256_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= 
SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + + /* process the final block */ + Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_256_BLOCK_BYTES; + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, + hashVal + i * SKEIN_256_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* 512-bit Skein */ + +/* init the context for a straight hashing operation */ +int +Skein_512_Init(Skein_512_Ctxt_t *ctx, size_t hashBitLen) +{ + union { + uint8_t b[SKEIN_512_STATE_BYTES]; + uint64_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: + bcopy(SKEIN_512_IV_512, ctx->X, sizeof (ctx->X)); + break; + case 384: + bcopy(SKEIN_512_IV_384, ctx->X, sizeof (ctx->X)); + break; + case 256: + bcopy(SKEIN_512_IV_256, ctx->X, sizeof (ctx->X)); + break; + case 224: + bcopy(SKEIN_512_IV_224, ctx->X, sizeof (ctx->X)); + break; +#endif + default: + /* + * here if there is no precomputed IV value available + * build/process the config block, type == CONFIG (could be + * precomputed) + */ + /* set tweaks: T0=0; T1=CFG | FINAL */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + /* set the schema, version */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + /* zero pad config block */ + bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0])); + + /* compute the initial chaining values from config block */ + /* zero the chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } + + /* + * The chaining vars ctx->X are now initialized for the given + * hashBitLen. 
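+ * A straight-hash caller then drives the usual three-call
+ * sequence; for illustration only (caller-chosen names, with a
+ * 512-bit output assumed):
+ *
+ *	Skein_512_Ctxt_t c;
+ *	uint8_t md[64];
+ *	(void) Skein_512_Init(&c, 512);
+ *	(void) Skein_512_Update(&c, msg, msgLen);
+ *	(void) Skein_512_Final(&c, md);
+ *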
Set up to process the data message portion of the + * hash (default) + */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ + + return (SKEIN_SUCCESS); +} + +/* init the context for a MAC and/or tree hash operation */ +/* + * [identical to Skein_512_Init() when keyBytes == 0 && + * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] + */ +int +Skein_512_InitExt(Skein_512_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo, + const uint8_t *key, size_t keyBytes) +{ + union { + uint8_t b[SKEIN_512_STATE_BYTES]; + uint64_t w[SKEIN_512_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) { /* is there a key? */ + /* no key: use all zeroes as key for config block */ + bzero(ctx->X, sizeof (ctx->X)); + } else { /* here to pre-process a key */ + + Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X)); + /* do a mini-Init right here */ + /* set output hash bit count = state size */ + ctx->h.hashBitLen = 8 * sizeof (ctx->X); + /* set tweaks: T0 = 0; T1 = KEY type */ + Skein_Start_New_Type(ctx, KEY); + /* zero the initial chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + (void) Skein_512_Update(ctx, key, keyBytes); /* hash the key */ + /* put result into cfg.b[] */ + (void) Skein_512_Final_Pad(ctx, cfg.b); + /* copy over into ctx->X[] */ + bcopy(cfg.b, ctx->X, sizeof (cfg.b)); +#if SKEIN_NEED_SWAP + { + uint_t i; + /* convert key bytes to context words */ + for (i = 0; i < SKEIN_512_STATE_WORDS; i++) + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* + * build/process the config block, type == CONFIG (could be + * precomputed for each key) + */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + cfg.w[1] = Skein_Swap64(hashBitLen); /* hash result length in bits */ + /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + cfg.w[2] = Skein_Swap64(treeInfo); + + Skein_Show_Key(512, &ctx->h, key, keyBytes); + + /* compute the initial chaining values from config block */ + Skein_512_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx, MSG); + + return (SKEIN_SUCCESS); +} + +/* process the input bytes */ +int +Skein_512_Update(Skein_512_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt) +{ + size_t n; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN_512_BLOCK_BYTES) { + /* finish up any buffered message data */ + if (ctx->h.bCnt) { + /* # bytes free in buffer b[] */ + n = SKEIN_512_BLOCK_BYTES - ctx->h.bCnt; + if (n) { + /* check on our logic here */ + Skein_assert(n < msgByteCnt); + bcopy(msg, &ctx->b[ctx->h.bCnt], n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN_512_BLOCK_BYTES); + Skein_512_Process_Block(ctx, ctx->b, 1, + SKEIN_512_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* + * now process any remaining full blocks, directly from input + * message data + */ + if (msgByteCnt > SKEIN_512_BLOCK_BYTES) { + /* number of full blocks to process */ + n = (msgByteCnt 
- 1) / SKEIN_512_BLOCK_BYTES; + Skein_512_Process_Block(ctx, msg, n, + SKEIN_512_BLOCK_BYTES); + msgByteCnt -= n * SKEIN_512_BLOCK_BYTES; + msg += n * SKEIN_512_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES); + bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the result */ +int +Skein_512_Final(Skein_512_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + + /* process the final block */ + Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(512, &ctx->h, n, + hashVal + i * SKEIN_512_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* 1024-bit Skein */ + +/* init the context for a straight hashing operation */ +int +Skein1024_Init(Skein1024_Ctxt_t *ctx, size_t hashBitLen) +{ + union { + uint8_t b[SKEIN1024_STATE_BYTES]; + uint64_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + + switch (hashBitLen) { /* use pre-computed values, where available */ +#ifndef SKEIN_NO_PRECOMP + case 512: + bcopy(SKEIN1024_IV_512, ctx->X, sizeof (ctx->X)); + break; + case 384: + bcopy(SKEIN1024_IV_384, ctx->X, sizeof (ctx->X)); + break; + case 1024: + bcopy(SKEIN1024_IV_1024, ctx->X, sizeof (ctx->X)); + break; +#endif + default: + /* here if there is no precomputed IV value available */ + /* + * build/process the config block, type == CONFIG (could be + * precomputed) + */ + /* set tweaks: T0=0; T1=CFG | FINAL */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + /* set the schema, version */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + cfg.w[2] = Skein_Swap64(SKEIN_CFG_TREE_INFO_SEQUENTIAL); + /* zero pad config block */ + bzero(&cfg.w[3], sizeof (cfg) - 3 * sizeof (cfg.w[0])); + + /* compute the initial chaining values from config block */ + /* zero the chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + 
Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + break; + } + + /* + * The chaining vars ctx->X are now initialized for the given + * hashBitLen. Set up to process the data message portion of the hash + * (default) + */ + Skein_Start_New_Type(ctx, MSG); /* T0=0, T1= MSG type */ + + return (SKEIN_SUCCESS); +} + +/* init the context for a MAC and/or tree hash operation */ +/* + * [identical to Skein1024_Init() when keyBytes == 0 && + * treeInfo == SKEIN_CFG_TREE_INFO_SEQUENTIAL] + */ +int +Skein1024_InitExt(Skein1024_Ctxt_t *ctx, size_t hashBitLen, uint64_t treeInfo, + const uint8_t *key, size_t keyBytes) +{ + union { + uint8_t b[SKEIN1024_STATE_BYTES]; + uint64_t w[SKEIN1024_STATE_WORDS]; + } cfg; /* config block */ + + Skein_Assert(hashBitLen > 0, SKEIN_BAD_HASHLEN); + Skein_Assert(keyBytes == 0 || key != NULL, SKEIN_FAIL); + + /* compute the initial chaining values ctx->X[], based on key */ + if (keyBytes == 0) { /* is there a key? */ + /* no key: use all zeroes as key for config block */ + bzero(ctx->X, sizeof (ctx->X)); + } else { /* here to pre-process a key */ + Skein_assert(sizeof (cfg.b) >= sizeof (ctx->X)); + /* do a mini-Init right here */ + /* set output hash bit count = state size */ + ctx->h.hashBitLen = 8 * sizeof (ctx->X); + /* set tweaks: T0 = 0; T1 = KEY type */ + Skein_Start_New_Type(ctx, KEY); + /* zero the initial chaining variables */ + bzero(ctx->X, sizeof (ctx->X)); + (void) Skein1024_Update(ctx, key, keyBytes); /* hash the key */ + /* put result into cfg.b[] */ + (void) Skein1024_Final_Pad(ctx, cfg.b); + /* copy over into ctx->X[] */ + bcopy(cfg.b, ctx->X, sizeof (cfg.b)); +#if SKEIN_NEED_SWAP + { + uint_t i; + /* convert key bytes to context words */ + for (i = 0; i < SKEIN1024_STATE_WORDS; i++) + ctx->X[i] = Skein_Swap64(ctx->X[i]); + } +#endif + } + /* + * build/process the config block, type == CONFIG (could be + * precomputed for each key) + */ + ctx->h.hashBitLen = hashBitLen; /* output hash bit count */ + Skein_Start_New_Type(ctx, CFG_FINAL); + + bzero(&cfg.w, sizeof (cfg.w)); /* pre-pad cfg.w[] with zeroes */ + cfg.w[0] = Skein_Swap64(SKEIN_SCHEMA_VER); + /* hash result length in bits */ + cfg.w[1] = Skein_Swap64(hashBitLen); + /* tree hash config info (or SKEIN_CFG_TREE_INFO_SEQUENTIAL) */ + cfg.w[2] = Skein_Swap64(treeInfo); + + Skein_Show_Key(1024, &ctx->h, key, keyBytes); + + /* compute the initial chaining values from config block */ + Skein1024_Process_Block(ctx, cfg.b, 1, SKEIN_CFG_STR_LEN); + + /* The chaining vars ctx->X are now initialized */ + /* Set up to process the data message portion of the hash (default) */ + ctx->h.bCnt = 0; /* buffer b[] starts out empty */ + Skein_Start_New_Type(ctx, MSG); + + return (SKEIN_SUCCESS); +} + +/* process the input bytes */ +int +Skein1024_Update(Skein1024_Ctxt_t *ctx, const uint8_t *msg, size_t msgByteCnt) +{ + size_t n; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + /* process full blocks, if any */ + if (msgByteCnt + ctx->h.bCnt > SKEIN1024_BLOCK_BYTES) { + /* finish up any buffered message data */ + if (ctx->h.bCnt) { + /* # bytes free in buffer b[] */ + n = SKEIN1024_BLOCK_BYTES - ctx->h.bCnt; + if (n) { + /* check on our logic here */ + Skein_assert(n < msgByteCnt); + bcopy(msg, &ctx->b[ctx->h.bCnt], n); + msgByteCnt -= n; + msg += n; + ctx->h.bCnt += n; + } + Skein_assert(ctx->h.bCnt == SKEIN1024_BLOCK_BYTES); + Skein1024_Process_Block(ctx, ctx->b, 1, + SKEIN1024_BLOCK_BYTES); + ctx->h.bCnt = 0; + } + /* + * now process any 
remaining full blocks, directly from + * input message data + */ + if (msgByteCnt > SKEIN1024_BLOCK_BYTES) { + /* number of full blocks to process */ + n = (msgByteCnt - 1) / SKEIN1024_BLOCK_BYTES; + Skein1024_Process_Block(ctx, msg, n, + SKEIN1024_BLOCK_BYTES); + msgByteCnt -= n * SKEIN1024_BLOCK_BYTES; + msg += n * SKEIN1024_BLOCK_BYTES; + } + Skein_assert(ctx->h.bCnt == 0); + } + + /* copy any remaining source message data bytes into b[] */ + if (msgByteCnt) { + Skein_assert(msgByteCnt + ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES); + bcopy(msg, &ctx->b[ctx->h.bCnt], msgByteCnt); + ctx->h.bCnt += msgByteCnt; + } + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the result */ +int +Skein1024_Final(Skein1024_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN1024_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + + /* process the final block */ + Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN1024_BLOCK_BYTES; + if (n >= SKEIN1024_BLOCK_BYTES) + n = SKEIN1024_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(1024, &ctx->h, n, + hashVal + i * SKEIN1024_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* Functions to support MAC/tree hashing */ +/* (this code is identical for Optimized and Reference versions) */ + +/* finalize the hash computation and output the block, no OUTPUT stage */ +int +Skein_256_Final_Pad(Skein_256_Ctxt_t *ctx, uint8_t *hashVal) +{ + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_256_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN_256_BLOCK_BYTES - ctx->h.bCnt); + /* process the final block */ + Skein_256_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* "output" the state bytes */ + Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_256_BLOCK_BYTES); + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the block, no OUTPUT stage */ +int +Skein_512_Final_Pad(Skein_512_Ctxt_t *ctx, uint8_t *hashVal) +{ + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; /* tag as the final block */ + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN_512_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], 
+ SKEIN_512_BLOCK_BYTES - ctx->h.bCnt); + /* process the final block */ + Skein_512_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* "output" the state bytes */ + Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN_512_BLOCK_BYTES); + + return (SKEIN_SUCCESS); +} + +/* finalize the hash computation and output the block, no OUTPUT stage */ +int +Skein1024_Final_Pad(Skein1024_Ctxt_t *ctx, uint8_t *hashVal) +{ + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL); + + /* tag as the final block */ + ctx->h.T[1] |= SKEIN_T1_FLAG_FINAL; + /* zero pad b[] if necessary */ + if (ctx->h.bCnt < SKEIN1024_BLOCK_BYTES) + bzero(&ctx->b[ctx->h.bCnt], + SKEIN1024_BLOCK_BYTES - ctx->h.bCnt); + /* process the final block */ + Skein1024_Process_Block(ctx, ctx->b, 1, ctx->h.bCnt); + + /* "output" the state bytes */ + Skein_Put64_LSB_First(hashVal, ctx->X, SKEIN1024_BLOCK_BYTES); + + return (SKEIN_SUCCESS); +} + +#if SKEIN_TREE_HASH +/* just do the OUTPUT stage */ +int +Skein_256_Output(Skein_256_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_256_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_256_BLOCK_BYTES, SKEIN_FAIL); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_256_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_256_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_256_BLOCK_BYTES; + if (n >= SKEIN_256_BLOCK_BYTES) + n = SKEIN_256_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_256_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, + hashVal + i * SKEIN_256_BLOCK_BYTES); + /* restore the counter mode key for next time */ + bcopy(X, ctx->X, sizeof (X)); + } + return (SKEIN_SUCCESS); +} + +/* just do the OUTPUT stage */ +int +Skein_512_Output(Skein_512_Ctxt_t *ctx, uint8_t *hashVal) +{ + size_t i, n, byteCnt; + uint64_t X[SKEIN_512_STATE_WORDS]; + + /* catch uninitialized context */ + Skein_Assert(ctx->h.bCnt <= SKEIN_512_BLOCK_BYTES, SKEIN_FAIL); + + /* now output the result */ + /* total number of output bytes */ + byteCnt = (ctx->h.hashBitLen + 7) >> 3; + + /* run Threefish in "counter mode" to generate output */ + /* zero out b[], so it can hold the counter */ + bzero(ctx->b, sizeof (ctx->b)); + /* keep a local copy of counter mode "key" */ + bcopy(ctx->X, X, sizeof (X)); + for (i = 0; i * SKEIN_512_BLOCK_BYTES < byteCnt; i++) { + /* build the counter block */ + uint64_t tmp = Skein_Swap64((uint64_t)i); + bcopy(&tmp, ctx->b, sizeof (tmp)); + Skein_Start_New_Type(ctx, OUT_FINAL); + /* run "counter mode" */ + Skein_512_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t)); + /* number of output bytes left to go */ + n = byteCnt - i * SKEIN_512_BLOCK_BYTES; + if (n >= SKEIN_512_BLOCK_BYTES) + n = SKEIN_512_BLOCK_BYTES; + Skein_Put64_LSB_First(hashVal + i * SKEIN_512_BLOCK_BYTES, + ctx->X, n); /* "output" the ctr mode bytes */ + Skein_Show_Final(256, &ctx->h, n, + hashVal + i * SKEIN_512_BLOCK_BYTES); + /* 
restore the counter mode key for next time */
+		bcopy(X, ctx->X, sizeof (X));
+	}
+	return (SKEIN_SUCCESS);
+}
+
+/* just do the OUTPUT stage */
+int
+Skein1024_Output(Skein1024_Ctxt_t *ctx, uint8_t *hashVal)
+{
+	size_t i, n, byteCnt;
+	uint64_t X[SKEIN1024_STATE_WORDS];
+
+	/* catch uninitialized context */
+	Skein_Assert(ctx->h.bCnt <= SKEIN1024_BLOCK_BYTES, SKEIN_FAIL);
+
+	/* now output the result */
+	/* total number of output bytes */
+	byteCnt = (ctx->h.hashBitLen + 7) >> 3;
+
+	/* run Threefish in "counter mode" to generate output */
+	/* zero out b[], so it can hold the counter */
+	bzero(ctx->b, sizeof (ctx->b));
+	/* keep a local copy of counter mode "key" */
+	bcopy(ctx->X, X, sizeof (X));
+	for (i = 0; i * SKEIN1024_BLOCK_BYTES < byteCnt; i++) {
+		/* build the counter block */
+		uint64_t tmp = Skein_Swap64((uint64_t)i);
+		bcopy(&tmp, ctx->b, sizeof (tmp));
+		Skein_Start_New_Type(ctx, OUT_FINAL);
+		/* run "counter mode" */
+		Skein1024_Process_Block(ctx, ctx->b, 1, sizeof (uint64_t));
+		/* number of output bytes left to go */
+		n = byteCnt - i * SKEIN1024_BLOCK_BYTES;
+		if (n >= SKEIN1024_BLOCK_BYTES)
+			n = SKEIN1024_BLOCK_BYTES;
+		Skein_Put64_LSB_First(hashVal + i * SKEIN1024_BLOCK_BYTES,
+		    ctx->X, n);	/* "output" the ctr mode bytes */
+		Skein_Show_Final(256, &ctx->h, n,
+		    hashVal + i * SKEIN1024_BLOCK_BYTES);
+		/* restore the counter mode key for next time */
+		bcopy(X, ctx->X, sizeof (X));
+	}
+	return (SKEIN_SUCCESS);
+}
+#endif
+
+#ifdef _KERNEL
+EXPORT_SYMBOL(Skein_512_Init);
+EXPORT_SYMBOL(Skein_512_InitExt);
+EXPORT_SYMBOL(Skein_512_Update);
+EXPORT_SYMBOL(Skein_512_Final);
+#endif
diff --git a/module/icp/algs/skein/skein_block.c b/module/icp/algs/skein/skein_block.c
new file mode 100644
index 000000000000..7ba165a48511
--- /dev/null
+++ b/module/icp/algs/skein/skein_block.c
@@ -0,0 +1,790 @@
+/*
+ * Implementation of the Skein block functions.
+ * Source code author: Doug Whiting, 2008.
+ * This algorithm and source code is released to the public domain.
+ * Compile-time switches:
+ * SKEIN_USE_ASM -- set bits (256/512/1024) to select which
+ * versions use ASM code for block processing
+ * [default: use C for all block sizes]
+ */
+/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
+
+#include <sys/skein.h>
+#include "skein_impl.h"
+#include <sys/isa_defs.h>	/* for _ILP32 */
+
+#ifndef SKEIN_USE_ASM
+#define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */
+#endif
+
+#ifndef SKEIN_LOOP
+/*
+ * The low-level checksum routines use a lot of stack space. On systems where
+ * small stack frames are enforced (like 32-bit kernel builds), do not unroll
+ * checksum calculations to save stack space.
+ *
+ * Even with no loops unrolled, we still can exceed the 1k stack frame limit
+ * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can
+ * safely ignore it though, since the checksum functions will be called
+ * from a worker thread that won't be using much stack. That's why we have
+ * the #pragma here to ignore the warning.
+ */
+#if defined(_ILP32) || defined(__powerpc) /* Assume small stack */
+#pragma GCC diagnostic ignored "-Wframe-larger-than="
+/*
+ * We're running on 32-bit, don't unroll loops to save stack frame space
+ *
+ * Due to the way the calculations on SKEIN_LOOP are done in
+ * Skein_*_Process_Block(), a value of 111 disables unrolling loops
+ * in any of those functions.
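+ *
+ * (For reference: SKEIN_LOOP is read as three decimal digits, one
+ * per block size -- SKEIN_UNROLL_256 = (SKEIN_LOOP / 100) % 10,
+ * SKEIN_UNROLL_512 = (SKEIN_LOOP / 10) % 10 and
+ * SKEIN_UNROLL_1024 = SKEIN_LOOP % 10. A digit of 0 selects the
+ * fully unrolled code path; a nonzero digit N selects the looping
+ * path with N eight-round groups per loop iteration, and N must
+ * divide RCNT -- the "#error" checks below enforce that. Hence
+ * 111 keeps all three variants in their compact looping form.)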
+ */ +#define SKEIN_LOOP 111 +#else +/* We're compiling with large stacks */ +#define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ +#endif +#endif + +/* some useful definitions for code here */ +#define BLK_BITS (WCNT*64) +#define KW_TWK_BASE (0) +#define KW_KEY_BASE (3) +#define ks (kw + KW_KEY_BASE) +#define ts (kw + KW_TWK_BASE) + +/* no debugging in Illumos version */ +#define DebugSaveTweak(ctx) + +/* Skein_256 */ +#if !(SKEIN_USE_ASM & 256) +void +Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) +{ + enum { + WCNT = SKEIN_256_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) +#else +#define SKEIN_UNROLL_256 (0) +#endif + +#if SKEIN_UNROLL_256 +#if (RCNT % SKEIN_UNROLL_256) +#error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ +#endif + size_t r; + /* key schedule words : chaining vars + tweak + "rotation" */ + uint64_t kw[WCNT + 4 + RCNT * 2]; +#else + uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ +#endif + /* local copy of context vars, for speed */ + uint64_t X0, X1, X2, X3; + uint64_t w[WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + /* use for debugging (help compiler put Xn in registers) */ + const uint64_t *Xptr[4]; + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; +#endif + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* + * this implementation only supports 2**64 input bytes + * (no carry out here) + */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1] + ts[0]; + X2 = w[2] + ks[2] + ts[1]; + X3 = w[3] + ks[3]; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, + Xptr); /* show starting state values */ + + blkPtr += SKEIN_256_BLOCK_BYTES; + + /* run the rounds */ + +#define Round256(p0, p1, p2, p3, ROT, rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \ + +#if SKEIN_UNROLL_256 == 0 +#define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I256(R) \ + X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ + X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ + X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ + X3 += ks[((R) + 4) % 5] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R256(p0, p1, p2, p3, ROT, rNum) \ + Round256(p0, p1, p2, p3, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I256(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \ + X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \ + X3 += ks[r + (R) + 3] + r + (R); \ + ks[r + (R) + 4] = ks[r + (R) - 1]; /* 
rotate key schedule */ \ + ts[r + (R) + 2] = ts[r + (R) - 1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + /* loop through it */ + for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) +#endif + { +#define R256_8_rounds(R) \ + R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ + R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ + R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ + R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ + I256(2 * (R)); \ + R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ + R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ + R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ + R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ + I256(2 * (R) + 1); + + R256_8_rounds(0); + +#define R256_Unroll_R(NN) \ + ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \ + (SKEIN_UNROLL_256 > (NN))) + +#if R256_Unroll_R(1) + R256_8_rounds(1); +#endif +#if R256_Unroll_R(2) + R256_8_rounds(2); +#endif +#if R256_Unroll_R(3) + R256_8_rounds(3); +#endif +#if R256_Unroll_R(4) + R256_8_rounds(4); +#endif +#if R256_Unroll_R(5) + R256_8_rounds(5); +#endif +#if R256_Unroll_R(6) + R256_8_rounds(6); +#endif +#if R256_Unroll_R(7) + R256_8_rounds(7); +#endif +#if R256_Unroll_R(8) + R256_8_rounds(8); +#endif +#if R256_Unroll_R(9) + R256_8_rounds(9); +#endif +#if R256_Unroll_R(10) + R256_8_rounds(10); +#endif +#if R256_Unroll_R(11) + R256_8_rounds(11); +#endif +#if R256_Unroll_R(12) + R256_8_rounds(12); +#endif +#if R256_Unroll_R(13) + R256_8_rounds(13); +#endif +#if R256_Unroll_R(14) + R256_8_rounds(14); +#endif +#if (SKEIN_UNROLL_256 > 14) +#error "need more unrolling in Skein_256_Process_Block" +#endif + } + /* + * do the final "feedforward" xor, update context chaining vars + */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t +Skein_256_Process_Block_CodeSize(void) +{ + return ((uint8_t *)Skein_256_Process_Block_CodeSize) - + ((uint8_t *)Skein_256_Process_Block); +} + +uint_t +Skein_256_Unroll_Cnt(void) +{ + return (SKEIN_UNROLL_256); +} +#endif +#endif + +/* Skein_512 */ +#if !(SKEIN_USE_ASM & 512) +void +Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) +{ + enum { + WCNT = SKEIN_512_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN_512_ROUNDS_TOTAL / 8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10) +#else +#define SKEIN_UNROLL_512 (0) +#endif + +#if SKEIN_UNROLL_512 +#if (RCNT % SKEIN_UNROLL_512) +#error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ +#endif + size_t r; + /* key schedule words : chaining vars + tweak + "rotation" */ + uint64_t kw[WCNT + 4 + RCNT * 2]; +#else + uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ +#endif + /* local copy of vars, for speed */ + uint64_t X0, X1, X2, X3, X4, X5, X6, X7; + uint64_t w[WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + /* use for debugging (help compiler put Xn in registers) */ + const uint64_t *Xptr[8]; + Xptr[0] = &X0; + Xptr[1] = &X1; + Xptr[2] = &X2; + Xptr[3] = &X3; + Xptr[4] = &X4; + Xptr[5] = &X5; + Xptr[6] = &X6; + Xptr[7] = &X7; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
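+ *
+ * (ts[0] and ts[1] below are working copies of the two tweak
+ * words: ts[0] is the running input byte count, ts[1] carries the
+ * block type and the FIRST/FINAL flags, and ts[2] = ts[0] ^ ts[1]
+ * plays the same role for the tweak that the xor-parity word
+ * plays in the key schedule. byteCntAdd is added to ts[0] once
+ * per block, so Update() passes the full block size here while
+ * Final() passes the residual byte count of the padded last
+ * block and the output stage passes sizeof (uint64_t).)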
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* + * this implementation only supports 2**64 input bytes + * (no carry out here) + */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X0 = w[0] + ks[0]; /* do the first full key injection */ + X1 = w[1] + ks[1]; + X2 = w[2] + ks[2]; + X3 = w[3] + ks[3]; + X4 = w[4] + ks[4]; + X5 = w[5] + ks[5] + ts[0]; + X6 = w[6] + ks[6] + ts[1]; + X7 = w[7] + ks[7]; + + blkPtr += SKEIN_512_BLOCK_BYTES; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, + Xptr); + /* run the rounds */ +#define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ + X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ + X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6; + +#if SKEIN_UNROLL_512 == 0 +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); + +#define I512(R) \ + X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\ + X1 += ks[((R) + 2) % 9]; \ + X2 += ks[((R) + 3) % 9]; \ + X3 += ks[((R) + 4) % 9]; \ + X4 += ks[((R) + 5) % 9]; \ + X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ + X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ + X7 += ks[((R) + 8) % 9] + (R) + 1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); + +#define I512(R) \ + X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X1 += ks[r + (R) + 1]; \ + X2 += ks[r + (R) + 2]; \ + X3 += ks[r + (R) + 3]; \ + X4 += ks[r + (R) + 4]; \ + X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ + X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ + X7 += ks[r + (R) + 7] + r + (R); \ + ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\ + ts[r + (R)+2] = ts[r + (R) - 1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + /* loop through it */ + for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) +#endif /* end of looped code definitions */ + { +#define R512_8_rounds(R) /* do 8 full rounds */ \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ + I512(2 * (R)); \ + R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ + R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ + R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ + R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ + I512(2*(R) + 1); /* and key injection */ + + R512_8_rounds(0); + +#define R512_Unroll_R(NN) \ + ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \ + (SKEIN_UNROLL_512 > 
(NN))) + +#if R512_Unroll_R(1) + R512_8_rounds(1); +#endif +#if R512_Unroll_R(2) + R512_8_rounds(2); +#endif +#if R512_Unroll_R(3) + R512_8_rounds(3); +#endif +#if R512_Unroll_R(4) + R512_8_rounds(4); +#endif +#if R512_Unroll_R(5) + R512_8_rounds(5); +#endif +#if R512_Unroll_R(6) + R512_8_rounds(6); +#endif +#if R512_Unroll_R(7) + R512_8_rounds(7); +#endif +#if R512_Unroll_R(8) + R512_8_rounds(8); +#endif +#if R512_Unroll_R(9) + R512_8_rounds(9); +#endif +#if R512_Unroll_R(10) + R512_8_rounds(10); +#endif +#if R512_Unroll_R(11) + R512_8_rounds(11); +#endif +#if R512_Unroll_R(12) + R512_8_rounds(12); +#endif +#if R512_Unroll_R(13) + R512_8_rounds(13); +#endif +#if R512_Unroll_R(14) + R512_8_rounds(14); +#endif +#if (SKEIN_UNROLL_512 > 14) +#error "need more unrolling in Skein_512_Process_Block" +#endif + } + + /* + * do the final "feedforward" xor, update context chaining vars + */ + ctx->X[0] = X0 ^ w[0]; + ctx->X[1] = X1 ^ w[1]; + ctx->X[2] = X2 ^ w[2]; + ctx->X[3] = X3 ^ w[3]; + ctx->X[4] = X4 ^ w[4]; + ctx->X[5] = X5 ^ w[5]; + ctx->X[6] = X6 ^ w[6]; + ctx->X[7] = X7 ^ w[7]; + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + } while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t +Skein_512_Process_Block_CodeSize(void) +{ + return ((uint8_t *)Skein_512_Process_Block_CodeSize) - + ((uint8_t *)Skein_512_Process_Block); +} + +uint_t +Skein_512_Unroll_Cnt(void) +{ + return (SKEIN_UNROLL_512); +} +#endif +#endif + +/* Skein1024 */ +#if !(SKEIN_USE_ASM & 1024) +void +Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd) +{ + /* do it in C, always looping (unrolled is bigger AND slower!) */ + enum { + WCNT = SKEIN1024_STATE_WORDS + }; +#undef RCNT +#define RCNT (SKEIN1024_ROUNDS_TOTAL/8) + +#ifdef SKEIN_LOOP /* configure how much to unroll the loop */ +#define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) +#else +#define SKEIN_UNROLL_1024 (0) +#endif + +#if (SKEIN_UNROLL_1024 != 0) +#if (RCNT % SKEIN_UNROLL_1024) +#error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ +#endif + size_t r; + /* key schedule words : chaining vars + tweak + "rotation" */ + uint64_t kw[WCNT + 4 + RCNT * 2]; +#else + uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ +#endif + + /* local copy of vars, for speed */ + uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11, + X12, X13, X14, X15; + uint64_t w[WCNT]; /* local copy of input block */ +#ifdef SKEIN_DEBUG + /* use for debugging (help compiler put Xn in registers) */ + const uint64_t *Xptr[16]; + Xptr[0] = &X00; + Xptr[1] = &X01; + Xptr[2] = &X02; + Xptr[3] = &X03; + Xptr[4] = &X04; + Xptr[5] = &X05; + Xptr[6] = &X06; + Xptr[7] = &X07; + Xptr[8] = &X08; + Xptr[9] = &X09; + Xptr[10] = &X10; + Xptr[11] = &X11; + Xptr[12] = &X12; + Xptr[13] = &X13; + Xptr[14] = &X14; + Xptr[15] = &X15; +#endif + + Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! 
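+ *
+ * (Stack note: this variant keeps all sixteen chaining words
+ * X00..X15 in locals, plus the sixteen-word input copy w[] and a
+ * kw[] key-schedule array of WCNT + 4 words -- or RCNT * 2 more
+ * in the looping configuration -- which is what pushes the frame
+ * past 1k on 32-bit builds; see the SKEIN_LOOP comment in this
+ * file.)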
*/ + ts[0] = ctx->h.T[0]; + ts[1] = ctx->h.T[1]; + do { + /* + * this implementation only supports 2**64 input bytes + * (no carry out here) + */ + ts[0] += byteCntAdd; /* update processed length */ + + /* precompute the key schedule for this block */ + ks[0] = ctx->X[0]; + ks[1] = ctx->X[1]; + ks[2] = ctx->X[2]; + ks[3] = ctx->X[3]; + ks[4] = ctx->X[4]; + ks[5] = ctx->X[5]; + ks[6] = ctx->X[6]; + ks[7] = ctx->X[7]; + ks[8] = ctx->X[8]; + ks[9] = ctx->X[9]; + ks[10] = ctx->X[10]; + ks[11] = ctx->X[11]; + ks[12] = ctx->X[12]; + ks[13] = ctx->X[13]; + ks[14] = ctx->X[14]; + ks[15] = ctx->X[15]; + ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ + ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ + ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ + ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; + + ts[2] = ts[0] ^ ts[1]; + + /* get input block in little-endian format */ + Skein_Get64_LSB_First(w, blkPtr, WCNT); + DebugSaveTweak(ctx); + Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); + + X00 = w[0] + ks[0]; /* do the first full key injection */ + X01 = w[1] + ks[1]; + X02 = w[2] + ks[2]; + X03 = w[3] + ks[3]; + X04 = w[4] + ks[4]; + X05 = w[5] + ks[5]; + X06 = w[6] + ks[6]; + X07 = w[7] + ks[7]; + X08 = w[8] + ks[8]; + X09 = w[9] + ks[9]; + X10 = w[10] + ks[10]; + X11 = w[11] + ks[11]; + X12 = w[12] + ks[12]; + X13 = w[13] + ks[13] + ts[0]; + X14 = w[14] + ks[14] + ts[1]; + X15 = w[15] + ks[15]; + + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, + Xptr); + +#define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ + pD, pE, pF, ROT, rNum) \ + X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ + X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ + X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ + X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\ + X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\ + X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\ + X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\ + X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE; + +#if SKEIN_UNROLL_1024 == 0 +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ + pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ + pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr); + +#define I1024(R) \ + X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\ + X01 += ks[((R) + 2) % 17]; \ + X02 += ks[((R) + 3) % 17]; \ + X03 += ks[((R) + 4) % 17]; \ + X04 += ks[((R) + 5) % 17]; \ + X05 += ks[((R) + 6) % 17]; \ + X06 += ks[((R) + 7) % 17]; \ + X07 += ks[((R) + 8) % 17]; \ + X08 += ks[((R) + 9) % 17]; \ + X09 += ks[((R) + 10) % 17]; \ + X10 += ks[((R) + 11) % 17]; \ + X11 += ks[((R) + 12) % 17]; \ + X12 += ks[((R) + 13) % 17]; \ + X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ + X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ + X15 += ks[((R) + 16) % 17] + (R) +1; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); +#else /* looping version */ +#define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ + pE, pF, ROT, rn) \ + Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ + pD, pE, pF, ROT, rn) \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr); + +#define I1024(R) \ + X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \ + X01 += ks[r + (R) + 1]; \ + X02 += ks[r + (R) + 2]; \ + X03 += ks[r + (R) + 3]; \ + X04 += ks[r + (R) + 4]; \ + X05 += ks[r + (R) + 5]; 
\ + X06 += ks[r + (R) + 6]; \ + X07 += ks[r + (R) + 7]; \ + X08 += ks[r + (R) + 8]; \ + X09 += ks[r + (R) + 9]; \ + X10 += ks[r + (R) + 10]; \ + X11 += ks[r + (R) + 11]; \ + X12 += ks[r + (R) + 12]; \ + X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \ + X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ + X15 += ks[r + (R) + 15] + r + (R); \ + ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\ + ts[r + (R) + 2] = ts[r + (R) - 1]; \ + Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); + + /* loop through it */ + for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) +#endif + { +#define R1024_8_rounds(R) /* do 8 full rounds */ \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ + 14, 15, R1024_0, 8 * (R) + 1); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ + 08, 01, R1024_1, 8 * (R) + 2); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ + 10, 09, R1024_2, 8 * (R) + 3); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ + 12, 07, R1024_3, 8 * (R) + 4); \ + I1024(2 * (R)); \ + R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ + 14, 15, R1024_4, 8 * (R) + 5); \ + R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ + 08, 01, R1024_5, 8 * (R) + 6); \ + R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ + 10, 09, R1024_6, 8 * (R) + 7); \ + R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ + 12, 07, R1024_7, 8 * (R) + 8); \ + I1024(2 * (R) + 1); + + R1024_8_rounds(0); + +#define R1024_Unroll_R(NN) \ + ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \ + (SKEIN_UNROLL_1024 > (NN))) + +#if R1024_Unroll_R(1) + R1024_8_rounds(1); +#endif +#if R1024_Unroll_R(2) + R1024_8_rounds(2); +#endif +#if R1024_Unroll_R(3) + R1024_8_rounds(3); +#endif +#if R1024_Unroll_R(4) + R1024_8_rounds(4); +#endif +#if R1024_Unroll_R(5) + R1024_8_rounds(5); +#endif +#if R1024_Unroll_R(6) + R1024_8_rounds(6); +#endif +#if R1024_Unroll_R(7) + R1024_8_rounds(7); +#endif +#if R1024_Unroll_R(8) + R1024_8_rounds(8); +#endif +#if R1024_Unroll_R(9) + R1024_8_rounds(9); +#endif +#if R1024_Unroll_R(10) + R1024_8_rounds(10); +#endif +#if R1024_Unroll_R(11) + R1024_8_rounds(11); +#endif +#if R1024_Unroll_R(12) + R1024_8_rounds(12); +#endif +#if R1024_Unroll_R(13) + R1024_8_rounds(13); +#endif +#if R1024_Unroll_R(14) + R1024_8_rounds(14); +#endif +#if (SKEIN_UNROLL_1024 > 14) +#error "need more unrolling in Skein_1024_Process_Block" +#endif + } + /* + * do the final "feedforward" xor, update context chaining vars + */ + + ctx->X[0] = X00 ^ w[0]; + ctx->X[1] = X01 ^ w[1]; + ctx->X[2] = X02 ^ w[2]; + ctx->X[3] = X03 ^ w[3]; + ctx->X[4] = X04 ^ w[4]; + ctx->X[5] = X05 ^ w[5]; + ctx->X[6] = X06 ^ w[6]; + ctx->X[7] = X07 ^ w[7]; + ctx->X[8] = X08 ^ w[8]; + ctx->X[9] = X09 ^ w[9]; + ctx->X[10] = X10 ^ w[10]; + ctx->X[11] = X11 ^ w[11]; + ctx->X[12] = X12 ^ w[12]; + ctx->X[13] = X13 ^ w[13]; + ctx->X[14] = X14 ^ w[14]; + ctx->X[15] = X15 ^ w[15]; + + Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); + + ts[1] &= ~SKEIN_T1_FLAG_FIRST; + blkPtr += SKEIN1024_BLOCK_BYTES; + } while (--blkCnt); + ctx->h.T[0] = ts[0]; + ctx->h.T[1] = ts[1]; +} + +#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) +size_t +Skein1024_Process_Block_CodeSize(void) +{ + return ((uint8_t *)Skein1024_Process_Block_CodeSize) - + ((uint8_t *)Skein1024_Process_Block); +} + +uint_t +Skein1024_Unroll_Cnt(void) +{ + return (SKEIN_UNROLL_1024); +} +#endif +#endif diff --git a/module/icp/algs/skein/skein_impl.h 
b/module/icp/algs/skein/skein_impl.h new file mode 100644 index 000000000000..205a517d69db --- /dev/null +++ b/module/icp/algs/skein/skein_impl.h @@ -0,0 +1,292 @@ +/* + * Internal definitions for Skein hashing. + * Source code author: Doug Whiting, 2008. + * This algorithm and source code is released to the public domain. + * + * The following compile-time switches may be defined to control some + * tradeoffs between speed, code size, error checking, and security. + * + * The "default" note explains what happens when the switch is not defined. + * + * SKEIN_DEBUG -- make callouts from inside Skein code + * to examine/display intermediate values. + * [default: no callouts (no overhead)] + * + * SKEIN_ERR_CHECK -- how error checking is handled inside Skein + * code. If not defined, most error checking + * is disabled (for performance). Otherwise, + * the switch value is interpreted as: + * 0: use assert() to flag errors + * 1: return SKEIN_FAIL to flag errors + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ + +#ifndef _SKEIN_IMPL_H_ +#define _SKEIN_IMPL_H_ + +#include +#include +#include +#include "skein_impl.h" +#include "skein_port.h" + +/* + * "Internal" Skein definitions + * -- not needed for sequential hashing API, but will be + * helpful for other uses of Skein (e.g., tree hash mode). + * -- included here so that they can be shared between + * reference and optimized code. + */ + +/* tweak word T[1]: bit field starting positions */ +/* offset 64 because it's the second word */ +#define SKEIN_T1_BIT(BIT) ((BIT) - 64) + +/* bits 112..118: level in hash tree */ +#define SKEIN_T1_POS_TREE_LVL SKEIN_T1_BIT(112) +/* bit 119: partial final input byte */ +#define SKEIN_T1_POS_BIT_PAD SKEIN_T1_BIT(119) +/* bits 120..125: type field */ +#define SKEIN_T1_POS_BLK_TYPE SKEIN_T1_BIT(120) +/* bits 126: first block flag */ +#define SKEIN_T1_POS_FIRST SKEIN_T1_BIT(126) +/* bit 127: final block flag */ +#define SKEIN_T1_POS_FINAL SKEIN_T1_BIT(127) + +/* tweak word T[1]: flag bit definition(s) */ +#define SKEIN_T1_FLAG_FIRST (((uint64_t)1) << SKEIN_T1_POS_FIRST) +#define SKEIN_T1_FLAG_FINAL (((uint64_t)1) << SKEIN_T1_POS_FINAL) +#define SKEIN_T1_FLAG_BIT_PAD (((uint64_t)1) << SKEIN_T1_POS_BIT_PAD) + +/* tweak word T[1]: tree level bit field mask */ +#define SKEIN_T1_TREE_LVL_MASK (((uint64_t)0x7F) << SKEIN_T1_POS_TREE_LVL) +#define SKEIN_T1_TREE_LEVEL(n) (((uint64_t)(n)) << SKEIN_T1_POS_TREE_LVL) + +/* tweak word T[1]: block type field */ +#define SKEIN_BLK_TYPE_KEY (0) /* key, for MAC and KDF */ +#define SKEIN_BLK_TYPE_CFG (4) /* configuration block */ +#define SKEIN_BLK_TYPE_PERS (8) /* personalization string */ +#define SKEIN_BLK_TYPE_PK (12) /* public key (for signature hashing) */ +#define SKEIN_BLK_TYPE_KDF (16) /* key identifier for KDF */ +#define SKEIN_BLK_TYPE_NONCE (20) /* nonce for PRNG */ +#define SKEIN_BLK_TYPE_MSG (48) /* message processing */ +#define SKEIN_BLK_TYPE_OUT (63) /* output stage */ +#define SKEIN_BLK_TYPE_MASK (63) /* bit field mask */ + +#define SKEIN_T1_BLK_TYPE(T) \ + (((uint64_t)(SKEIN_BLK_TYPE_##T)) << SKEIN_T1_POS_BLK_TYPE) +/* key, for MAC and KDF */ +#define SKEIN_T1_BLK_TYPE_KEY SKEIN_T1_BLK_TYPE(KEY) +/* configuration block */ +#define SKEIN_T1_BLK_TYPE_CFG SKEIN_T1_BLK_TYPE(CFG) +/* personalization string */ +#define SKEIN_T1_BLK_TYPE_PERS SKEIN_T1_BLK_TYPE(PERS) +/* public key (for digital signature hashing) */ +#define SKEIN_T1_BLK_TYPE_PK SKEIN_T1_BLK_TYPE(PK) +/* key identifier for KDF */ +#define SKEIN_T1_BLK_TYPE_KDF 
SKEIN_T1_BLK_TYPE(KDF) +/* nonce for PRNG */ +#define SKEIN_T1_BLK_TYPE_NONCE SKEIN_T1_BLK_TYPE(NONCE) +/* message processing */ +#define SKEIN_T1_BLK_TYPE_MSG SKEIN_T1_BLK_TYPE(MSG) +/* output stage */ +#define SKEIN_T1_BLK_TYPE_OUT SKEIN_T1_BLK_TYPE(OUT) +/* field bit mask */ +#define SKEIN_T1_BLK_TYPE_MASK SKEIN_T1_BLK_TYPE(MASK) + +#define SKEIN_T1_BLK_TYPE_CFG_FINAL \ + (SKEIN_T1_BLK_TYPE_CFG | SKEIN_T1_FLAG_FINAL) +#define SKEIN_T1_BLK_TYPE_OUT_FINAL \ + (SKEIN_T1_BLK_TYPE_OUT | SKEIN_T1_FLAG_FINAL) + +#define SKEIN_VERSION (1) + +#ifndef SKEIN_ID_STRING_LE /* allow compile-time personalization */ +#define SKEIN_ID_STRING_LE (0x33414853) /* "SHA3" (little-endian) */ +#endif + +#define SKEIN_MK_64(hi32, lo32) ((lo32) + (((uint64_t)(hi32)) << 32)) +#define SKEIN_SCHEMA_VER SKEIN_MK_64(SKEIN_VERSION, SKEIN_ID_STRING_LE) +#define SKEIN_KS_PARITY SKEIN_MK_64(0x1BD11BDA, 0xA9FC1A22) + +#define SKEIN_CFG_STR_LEN (4*8) + +/* bit field definitions in config block treeInfo word */ +#define SKEIN_CFG_TREE_LEAF_SIZE_POS (0) +#define SKEIN_CFG_TREE_NODE_SIZE_POS (8) +#define SKEIN_CFG_TREE_MAX_LEVEL_POS (16) + +#define SKEIN_CFG_TREE_LEAF_SIZE_MSK \ + (((uint64_t)0xFF) << SKEIN_CFG_TREE_LEAF_SIZE_POS) +#define SKEIN_CFG_TREE_NODE_SIZE_MSK \ + (((uint64_t)0xFF) << SKEIN_CFG_TREE_NODE_SIZE_POS) +#define SKEIN_CFG_TREE_MAX_LEVEL_MSK \ + (((uint64_t)0xFF) << SKEIN_CFG_TREE_MAX_LEVEL_POS) + +#define SKEIN_CFG_TREE_INFO(leaf, node, maxLvl) \ + ((((uint64_t)(leaf)) << SKEIN_CFG_TREE_LEAF_SIZE_POS) | \ + (((uint64_t)(node)) << SKEIN_CFG_TREE_NODE_SIZE_POS) | \ + (((uint64_t)(maxLvl)) << SKEIN_CFG_TREE_MAX_LEVEL_POS)) + +/* use as treeInfo in InitExt() call for sequential processing */ +#define SKEIN_CFG_TREE_INFO_SEQUENTIAL SKEIN_CFG_TREE_INFO(0, 0, 0) + +/* + * Skein macros for getting/setting tweak words, etc. + * These are useful for partial input bytes, hash tree init/update, etc. + */ +#define Skein_Get_Tweak(ctxPtr, TWK_NUM) ((ctxPtr)->h.T[TWK_NUM]) +#define Skein_Set_Tweak(ctxPtr, TWK_NUM, tVal) \ + do { \ + (ctxPtr)->h.T[TWK_NUM] = (tVal); \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Get_T0(ctxPtr) Skein_Get_Tweak(ctxPtr, 0) +#define Skein_Get_T1(ctxPtr) Skein_Get_Tweak(ctxPtr, 1) +#define Skein_Set_T0(ctxPtr, T0) Skein_Set_Tweak(ctxPtr, 0, T0) +#define Skein_Set_T1(ctxPtr, T1) Skein_Set_Tweak(ctxPtr, 1, T1) + +/* set both tweak words at once */ +#define Skein_Set_T0_T1(ctxPtr, T0, T1) \ + do { \ + Skein_Set_T0(ctxPtr, (T0)); \ + Skein_Set_T1(ctxPtr, (T1)); \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Set_Type(ctxPtr, BLK_TYPE) \ + Skein_Set_T1(ctxPtr, SKEIN_T1_BLK_TYPE_##BLK_TYPE) + +/* + * set up for starting with a new type: h.T[0]=0; h.T[1] = NEW_TYPE; h.bCnt=0; + */ +#define Skein_Start_New_Type(ctxPtr, BLK_TYPE) \ + do { \ + Skein_Set_T0_T1(ctxPtr, 0, SKEIN_T1_FLAG_FIRST | \ + SKEIN_T1_BLK_TYPE_ ## BLK_TYPE); \ + (ctxPtr)->h.bCnt = 0; \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Clear_First_Flag(hdr) \ + do { \ + (hdr).T[1] &= ~SKEIN_T1_FLAG_FIRST; \ + _NOTE(CONSTCOND) \ + } while (0) +#define Skein_Set_Bit_Pad_Flag(hdr) \ + do { \ + (hdr).T[1] |= SKEIN_T1_FLAG_BIT_PAD; \ + _NOTE(CONSTCOND) \ + } while (0) + +#define Skein_Set_Tree_Level(hdr, height) \ + do { \ + (hdr).T[1] |= SKEIN_T1_TREE_LEVEL(height); \ + _NOTE(CONSTCOND) \ + } while (0) + +/* + * "Internal" Skein definitions for debugging and error checking + * Note: in Illumos we always disable debugging features. 
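+ * The Skein_Show_*() macros below therefore expand to nothing,
+ * so the callouts sprinkled through the block functions cost
+ * nothing at run time. Of the two assertion flavors,
+ * Skein_Assert() (capital A) guards caller-supplied parameters
+ * and, when error checking is enabled, fails by returning a
+ * Skein error code, while Skein_assert() flags internal
+ * invariants; with SKEIN_ERR_CHECK undefined both compile away.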
+ */ +#define Skein_Show_Block(bits, ctx, X, blkPtr, wPtr, ksEvenPtr, ksOddPtr) +#define Skein_Show_Round(bits, ctx, r, X) +#define Skein_Show_R_Ptr(bits, ctx, r, X_ptr) +#define Skein_Show_Final(bits, ctx, cnt, outPtr) +#define Skein_Show_Key(bits, ctx, key, keyBytes) + +/* run-time checks (e.g., bad params, uninitialized context)? */ +#ifndef SKEIN_ERR_CHECK +/* default: ignore all Asserts, for performance */ +#define Skein_Assert(x, retCode) +#define Skein_assert(x) +#elif defined(SKEIN_ASSERT) +#include +#define Skein_Assert(x, retCode) ASSERT(x) +#define Skein_assert(x) ASSERT(x) +#else +#include +/* caller error */ +#define Skein_Assert(x, retCode) \ + do { \ + if (!(x)) \ + return (retCode); \ + _NOTE(CONSTCOND) \ + } while (0) +/* internal error */ +#define Skein_assert(x) ASSERT(x) +#endif + +/* + * Skein block function constants (shared across Ref and Opt code) + */ +enum { + /* Skein_256 round rotation constants */ + R_256_0_0 = 14, R_256_0_1 = 16, + R_256_1_0 = 52, R_256_1_1 = 57, + R_256_2_0 = 23, R_256_2_1 = 40, + R_256_3_0 = 5, R_256_3_1 = 37, + R_256_4_0 = 25, R_256_4_1 = 33, + R_256_5_0 = 46, R_256_5_1 = 12, + R_256_6_0 = 58, R_256_6_1 = 22, + R_256_7_0 = 32, R_256_7_1 = 32, + + /* Skein_512 round rotation constants */ + R_512_0_0 = 46, R_512_0_1 = 36, R_512_0_2 = 19, R_512_0_3 = 37, + R_512_1_0 = 33, R_512_1_1 = 27, R_512_1_2 = 14, R_512_1_3 = 42, + R_512_2_0 = 17, R_512_2_1 = 49, R_512_2_2 = 36, R_512_2_3 = 39, + R_512_3_0 = 44, R_512_3_1 = 9, R_512_3_2 = 54, R_512_3_3 = 56, + R_512_4_0 = 39, R_512_4_1 = 30, R_512_4_2 = 34, R_512_4_3 = 24, + R_512_5_0 = 13, R_512_5_1 = 50, R_512_5_2 = 10, R_512_5_3 = 17, + R_512_6_0 = 25, R_512_6_1 = 29, R_512_6_2 = 39, R_512_6_3 = 43, + R_512_7_0 = 8, R_512_7_1 = 35, R_512_7_2 = 56, R_512_7_3 = 22, + + /* Skein1024 round rotation constants */ + R1024_0_0 = 24, R1024_0_1 = 13, R1024_0_2 = 8, R1024_0_3 = + 47, R1024_0_4 = 8, R1024_0_5 = 17, R1024_0_6 = 22, R1024_0_7 = 37, + R1024_1_0 = 38, R1024_1_1 = 19, R1024_1_2 = 10, R1024_1_3 = + 55, R1024_1_4 = 49, R1024_1_5 = 18, R1024_1_6 = 23, R1024_1_7 = 52, + R1024_2_0 = 33, R1024_2_1 = 4, R1024_2_2 = 51, R1024_2_3 = + 13, R1024_2_4 = 34, R1024_2_5 = 41, R1024_2_6 = 59, R1024_2_7 = 17, + R1024_3_0 = 5, R1024_3_1 = 20, R1024_3_2 = 48, R1024_3_3 = + 41, R1024_3_4 = 47, R1024_3_5 = 28, R1024_3_6 = 16, R1024_3_7 = 25, + R1024_4_0 = 41, R1024_4_1 = 9, R1024_4_2 = 37, R1024_4_3 = + 31, R1024_4_4 = 12, R1024_4_5 = 47, R1024_4_6 = 44, R1024_4_7 = 30, + R1024_5_0 = 16, R1024_5_1 = 34, R1024_5_2 = 56, R1024_5_3 = + 51, R1024_5_4 = 4, R1024_5_5 = 53, R1024_5_6 = 42, R1024_5_7 = 41, + R1024_6_0 = 31, R1024_6_1 = 44, R1024_6_2 = 47, R1024_6_3 = + 46, R1024_6_4 = 19, R1024_6_5 = 42, R1024_6_6 = 44, R1024_6_7 = 25, + R1024_7_0 = 9, R1024_7_1 = 48, R1024_7_2 = 35, R1024_7_3 = + 52, R1024_7_4 = 23, R1024_7_5 = 31, R1024_7_6 = 37, R1024_7_7 = 20 +}; + +/* number of rounds for the different block sizes */ +#define SKEIN_256_ROUNDS_TOTAL (72) +#define SKEIN_512_ROUNDS_TOTAL (72) +#define SKEIN1024_ROUNDS_TOTAL (80) + + +extern const uint64_t SKEIN_256_IV_128[]; +extern const uint64_t SKEIN_256_IV_160[]; +extern const uint64_t SKEIN_256_IV_224[]; +extern const uint64_t SKEIN_256_IV_256[]; +extern const uint64_t SKEIN_512_IV_128[]; +extern const uint64_t SKEIN_512_IV_160[]; +extern const uint64_t SKEIN_512_IV_224[]; +extern const uint64_t SKEIN_512_IV_256[]; +extern const uint64_t SKEIN_512_IV_384[]; +extern const uint64_t SKEIN_512_IV_512[]; +extern const uint64_t SKEIN1024_IV_384[]; +extern const uint64_t 
SKEIN1024_IV_512[]; +extern const uint64_t SKEIN1024_IV_1024[]; + +/* Functions to process blkCnt (nonzero) full block(s) of data. */ +void Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); +void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); +void Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, + size_t blkCnt, size_t byteCntAdd); + +#endif /* _SKEIN_IMPL_H_ */ diff --git a/module/icp/algs/skein/skein_iv.c b/module/icp/algs/skein/skein_iv.c new file mode 100644 index 000000000000..140d38f76547 --- /dev/null +++ b/module/icp/algs/skein/skein_iv.c @@ -0,0 +1,185 @@ +/* + * Pre-computed Skein IVs + * + * NOTE: these values are not "magic" constants, but + * are generated using the Threefish block function. + * They are pre-computed here only for speed; i.e., to + * avoid the need for a Threefish call during Init(). + * + * The IV for any fixed hash length may be pre-computed. + * Only the most common values are included here. + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. */ +/* + * Illumos implementation note: these constants are for Skein v1.3 as per: + * http://www.skein-hash.info/sites/default/files/skein1.3.pdf + */ + +#include /* get Skein macros and types */ +#include "skein_impl.h" /* get internal definitions */ + +#define MK_64 SKEIN_MK_64 + +/* blkSize = 256 bits. hashSize = 128 bits */ +const uint64_t SKEIN_256_IV_128[] = { + MK_64(0xE1111906, 0x964D7260), + MK_64(0x883DAAA7, 0x7C8D811C), + MK_64(0x10080DF4, 0x91960F7A), + MK_64(0xCCF7DDE5, 0xB45BC1C2) +}; + +/* blkSize = 256 bits. hashSize = 160 bits */ +const uint64_t SKEIN_256_IV_160[] = { + MK_64(0x14202314, 0x72825E98), + MK_64(0x2AC4E9A2, 0x5A77E590), + MK_64(0xD47A5856, 0x8838D63E), + MK_64(0x2DD2E496, 0x8586AB7D) +}; + +/* blkSize = 256 bits. hashSize = 224 bits */ +const uint64_t SKEIN_256_IV_224[] = { + MK_64(0xC6098A8C, 0x9AE5EA0B), + MK_64(0x876D5686, 0x08C5191C), + MK_64(0x99CB88D7, 0xD7F53884), + MK_64(0x384BDDB1, 0xAEDDB5DE) +}; + +/* blkSize = 256 bits. hashSize = 256 bits */ +const uint64_t SKEIN_256_IV_256[] = { + MK_64(0xFC9DA860, 0xD048B449), + MK_64(0x2FCA6647, 0x9FA7D833), + MK_64(0xB33BC389, 0x6656840F), + MK_64(0x6A54E920, 0xFDE8DA69) +}; + +/* blkSize = 512 bits. hashSize = 128 bits */ +const uint64_t SKEIN_512_IV_128[] = { + MK_64(0xA8BC7BF3, 0x6FBF9F52), + MK_64(0x1E9872CE, 0xBD1AF0AA), + MK_64(0x309B1790, 0xB32190D3), + MK_64(0xBCFBB854, 0x3F94805C), + MK_64(0x0DA61BCD, 0x6E31B11B), + MK_64(0x1A18EBEA, 0xD46A32E3), + MK_64(0xA2CC5B18, 0xCE84AA82), + MK_64(0x6982AB28, 0x9D46982D) +}; + +/* blkSize = 512 bits. hashSize = 160 bits */ +const uint64_t SKEIN_512_IV_160[] = { + MK_64(0x28B81A2A, 0xE013BD91), + MK_64(0xC2F11668, 0xB5BDF78F), + MK_64(0x1760D8F3, 0xF6A56F12), + MK_64(0x4FB74758, 0x8239904F), + MK_64(0x21EDE07F, 0x7EAF5056), + MK_64(0xD908922E, 0x63ED70B8), + MK_64(0xB8EC76FF, 0xECCB52FA), + MK_64(0x01A47BB8, 0xA3F27A6E) +}; + +/* blkSize = 512 bits. hashSize = 224 bits */ +const uint64_t SKEIN_512_IV_224[] = { + MK_64(0xCCD06162, 0x48677224), + MK_64(0xCBA65CF3, 0xA92339EF), + MK_64(0x8CCD69D6, 0x52FF4B64), + MK_64(0x398AED7B, 0x3AB890B4), + MK_64(0x0F59D1B1, 0x457D2BD0), + MK_64(0x6776FE65, 0x75D4EB3D), + MK_64(0x99FBC70E, 0x997413E9), + MK_64(0x9E2CFCCF, 0xE1C41EF7) +}; + +/* blkSize = 512 bits. 
hashSize = 256 bits */ +const uint64_t SKEIN_512_IV_256[] = { + MK_64(0xCCD044A1, 0x2FDB3E13), + MK_64(0xE8359030, 0x1A79A9EB), + MK_64(0x55AEA061, 0x4F816E6F), + MK_64(0x2A2767A4, 0xAE9B94DB), + MK_64(0xEC06025E, 0x74DD7683), + MK_64(0xE7A436CD, 0xC4746251), + MK_64(0xC36FBAF9, 0x393AD185), + MK_64(0x3EEDBA18, 0x33EDFC13) +}; + +/* blkSize = 512 bits. hashSize = 384 bits */ +const uint64_t SKEIN_512_IV_384[] = { + MK_64(0xA3F6C6BF, 0x3A75EF5F), + MK_64(0xB0FEF9CC, 0xFD84FAA4), + MK_64(0x9D77DD66, 0x3D770CFE), + MK_64(0xD798CBF3, 0xB468FDDA), + MK_64(0x1BC4A666, 0x8A0E4465), + MK_64(0x7ED7D434, 0xE5807407), + MK_64(0x548FC1AC, 0xD4EC44D6), + MK_64(0x266E1754, 0x6AA18FF8) +}; + +/* blkSize = 512 bits. hashSize = 512 bits */ +const uint64_t SKEIN_512_IV_512[] = { + MK_64(0x4903ADFF, 0x749C51CE), + MK_64(0x0D95DE39, 0x9746DF03), + MK_64(0x8FD19341, 0x27C79BCE), + MK_64(0x9A255629, 0xFF352CB1), + MK_64(0x5DB62599, 0xDF6CA7B0), + MK_64(0xEABE394C, 0xA9D5C3F4), + MK_64(0x991112C7, 0x1A75B523), + MK_64(0xAE18A40B, 0x660FCC33) +}; + +/* blkSize = 1024 bits. hashSize = 384 bits */ +const uint64_t SKEIN1024_IV_384[] = { + MK_64(0x5102B6B8, 0xC1894A35), + MK_64(0xFEEBC9E3, 0xFE8AF11A), + MK_64(0x0C807F06, 0xE32BED71), + MK_64(0x60C13A52, 0xB41A91F6), + MK_64(0x9716D35D, 0xD4917C38), + MK_64(0xE780DF12, 0x6FD31D3A), + MK_64(0x797846B6, 0xC898303A), + MK_64(0xB172C2A8, 0xB3572A3B), + MK_64(0xC9BC8203, 0xA6104A6C), + MK_64(0x65909338, 0xD75624F4), + MK_64(0x94BCC568, 0x4B3F81A0), + MK_64(0x3EBBF51E, 0x10ECFD46), + MK_64(0x2DF50F0B, 0xEEB08542), + MK_64(0x3B5A6530, 0x0DBC6516), + MK_64(0x484B9CD2, 0x167BBCE1), + MK_64(0x2D136947, 0xD4CBAFEA) +}; + +/* blkSize = 1024 bits. hashSize = 512 bits */ +const uint64_t SKEIN1024_IV_512[] = { + MK_64(0xCAEC0E5D, 0x7C1B1B18), + MK_64(0xA01B0E04, 0x5F03E802), + MK_64(0x33840451, 0xED912885), + MK_64(0x374AFB04, 0xEAEC2E1C), + MK_64(0xDF25A0E2, 0x813581F7), + MK_64(0xE4004093, 0x8B12F9D2), + MK_64(0xA662D539, 0xC2ED39B6), + MK_64(0xFA8B85CF, 0x45D8C75A), + MK_64(0x8316ED8E, 0x29EDE796), + MK_64(0x053289C0, 0x2E9F91B8), + MK_64(0xC3F8EF1D, 0x6D518B73), + MK_64(0xBDCEC3C4, 0xD5EF332E), + MK_64(0x549A7E52, 0x22974487), + MK_64(0x67070872, 0x5B749816), + MK_64(0xB9CD28FB, 0xF0581BD1), + MK_64(0x0E2940B8, 0x15804974) +}; + +/* blkSize = 1024 bits. hashSize = 1024 bits */ +const uint64_t SKEIN1024_IV_1024[] = { + MK_64(0xD593DA07, 0x41E72355), + MK_64(0x15B5E511, 0xAC73E00C), + MK_64(0x5180E5AE, 0xBAF2C4F0), + MK_64(0x03BD41D3, 0xFCBCAFAF), + MK_64(0x1CAEC6FD, 0x1983A898), + MK_64(0x6E510B8B, 0xCDD0589F), + MK_64(0x77E2BDFD, 0xC6394ADA), + MK_64(0xC11E1DB5, 0x24DCB0A3), + MK_64(0xD6D14AF9, 0xC6329AB5), + MK_64(0x6A9B0BFC, 0x6EB67E0D), + MK_64(0x9243C60D, 0xCCFF1332), + MK_64(0x1A1F1DDE, 0x743F02D4), + MK_64(0x0996753C, 0x10ED0BB8), + MK_64(0x6572DD22, 0xF2B4969A), + MK_64(0x61FD3062, 0xD00A579A), + MK_64(0x1DE0536E, 0x8682E539) +}; diff --git a/module/icp/algs/skein/skein_port.h b/module/icp/algs/skein/skein_port.h new file mode 100644 index 000000000000..ce4353082552 --- /dev/null +++ b/module/icp/algs/skein/skein_port.h @@ -0,0 +1,116 @@ +/* + * Platform-specific definitions for Skein hash function. + * + * Source code author: Doug Whiting, 2008. + * + * This algorithm and source code is released to the public domain. + * + * Many thanks to Brian Gladman for his portable header files. + * + * To port Skein to an "unsupported" platform, change the definitions + * in this file appropriately. + */ +/* Copyright 2013 Doug Whiting. This code is released to the public domain. 
 */

+#ifndef	_SKEIN_PORT_H_
+#define	_SKEIN_PORT_H_
+
+#include <sys/types.h>	/* get integer type definitions */
+
+#ifndef RotL_64
+#define	RotL_64(x, N)	(((x) << (N)) | ((x) >> (64 - (N))))
+#endif
+
+/*
+ * Skein is "natively" little-endian (unlike SHA-xxx), for optimal
+ * performance on x86 CPUs. The Skein code requires the following
+ * definitions for dealing with endianness:
+ *
+ *    SKEIN_NEED_SWAP:	0 for little-endian, 1 for big-endian
+ *    Skein_Put64_LSB_First
+ *    Skein_Get64_LSB_First
+ *    Skein_Swap64
+ *
+ * If SKEIN_NEED_SWAP is defined at compile time, it is used here
+ * along with the portable versions of Put64/Get64/Swap64, which
+ * are slow in general.
+ *
+ * Otherwise, an "auto-detect" of endianness is attempted below.
+ * If the default handling doesn't work well, the user may insert
+ * platform-specific code instead (e.g., for big-endian CPUs).
+ *
+ */
+#ifndef SKEIN_NEED_SWAP	/* compile-time "override" for endianness? */
+
+#include <sys/isa_defs.h>	/* get endianness selection */
+
+#if defined(_ZFS_BIG_ENDIAN)
+/* here for big-endian CPUs */
+#define	SKEIN_NEED_SWAP	(1)
+#else
+/* here for x86 and x86-64 CPUs (and other detected little-endian CPUs) */
+#define	SKEIN_NEED_SWAP	(0)
+#define	Skein_Put64_LSB_First(dst08, src64, bCnt) bcopy(src64, dst08, bCnt)
+#define	Skein_Get64_LSB_First(dst64, src08, wCnt) \
+	bcopy(src08, dst64, 8 * (wCnt))
+#endif
+
+#endif	/* ifndef SKEIN_NEED_SWAP */
+
+/*
+ * Provide any definitions still needed.
+ */
+#ifndef Skein_Swap64	/* swap for big-endian, nop for little-endian */
+#if SKEIN_NEED_SWAP
+#define	Skein_Swap64(w64) \
+	(((((uint64_t)(w64)) & 0xFF) << 56) | \
+	(((((uint64_t)(w64)) >> 8) & 0xFF) << 48) | \
+	(((((uint64_t)(w64)) >> 16) & 0xFF) << 40) | \
+	(((((uint64_t)(w64)) >> 24) & 0xFF) << 32) | \
+	(((((uint64_t)(w64)) >> 32) & 0xFF) << 24) | \
+	(((((uint64_t)(w64)) >> 40) & 0xFF) << 16) | \
+	(((((uint64_t)(w64)) >> 48) & 0xFF) << 8) | \
+	(((((uint64_t)(w64)) >> 56) & 0xFF)))
+#else
+#define	Skein_Swap64(w64)	(w64)
+#endif
+#endif	/* ifndef Skein_Swap64 */
+
+#ifndef Skein_Put64_LSB_First
+static inline void
+Skein_Put64_LSB_First(uint8_t *dst, const uint64_t *src, size_t bCnt)
+{
+	/*
+	 * this version is fully portable (big-endian or little-endian),
+	 * but slow
+	 */
+	size_t n;
+
+	for (n = 0; n < bCnt; n++)
+		dst[n] = (uint8_t)(src[n >> 3] >> (8 * (n & 7)));
+}
+#endif	/* ifndef Skein_Put64_LSB_First */
+
+#ifndef Skein_Get64_LSB_First
+static inline void
+Skein_Get64_LSB_First(uint64_t *dst, const uint8_t *src, size_t wCnt)
+{
+	/*
+	 * this version is fully portable (big-endian or little-endian),
+	 * but slow
+	 */
+	size_t n;
+
+	for (n = 0; n < 8 * wCnt; n += 8)
+		dst[n / 8] = (((uint64_t)src[n])) +
+		    (((uint64_t)src[n + 1]) << 8) +
+		    (((uint64_t)src[n + 2]) << 16) +
+		    (((uint64_t)src[n + 3]) << 24) +
+		    (((uint64_t)src[n + 4]) << 32) +
+		    (((uint64_t)src[n + 5]) << 40) +
+		    (((uint64_t)src[n + 6]) << 48) +
+		    (((uint64_t)src[n + 7]) << 56);
+}
+#endif	/* ifndef Skein_Get64_LSB_First */
+
+#endif	/* _SKEIN_PORT_H_ */
diff --git a/module/icp/api/kcf_cipher.c b/module/icp/api/kcf_cipher.c
new file mode 100644
index 000000000000..d6aa48147edb
--- /dev/null
+++ b/module/icp/api/kcf_cipher.c
@@ -0,0 +1,930 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Encryption and decryption routines.
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ *	CRYPTO_SUCCESS: The operation completed successfully.
+ *	CRYPTO_QUEUED: A request was submitted successfully. The callback
+ *	routine will be called when the operation is done.
+ *	CRYPTO_INVALID_MECH_NUMBER, CRYPTO_INVALID_MECH_PARAM, or
+ *	CRYPTO_INVALID_MECH for problems with the 'mech'.
+ *	CRYPTO_INVALID_DATA for bogus 'data'
+ *	CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ *	CRYPTO_INVALID_CONTEXT: Not a valid context.
+ *	CRYPTO_BUSY:	Cannot process the request now. Schedule a
+ *	crypto_bufcall(), or try later.
+ *	CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED: No provider is
+ *	capable of a function or a mechanism.
+ *	CRYPTO_INVALID_KEY: bogus 'key' argument.
+ *	CRYPTO_INVALID_PLAINTEXT: bogus 'plaintext' argument.
+ *	CRYPTO_INVALID_CIPHERTEXT: bogus 'ciphertext' argument.
+ */
+
+/*
+ * crypto_cipher_init_prov()
+ *
+ * Arguments:
+ *
+ *	pd:	provider descriptor
+ *	sid:	session id
+ *	mech:	crypto_mechanism_t pointer.
+ *		mech_type is a valid value previously returned by
+ *		crypto_mech2id();
+ *		When the mech's parameter is not NULL, its definition depends
+ *		on the standard definition of the mechanism.
+ *	key:	pointer to a crypto_key_t structure.
+ *	tmpl:	a crypto_ctx_template_t, opaque template of a context of an
+ *		encryption or decryption with the 'mech' using 'key'.
+ *		'tmpl' is created by a previous call to
+ *		crypto_create_ctx_template().
+ *	ctxp:	Pointer to a crypto_context_t.
+ *	func:	CRYPTO_FG_ENCRYPT or CRYPTO_FG_DECRYPT.
+ *	cr:	crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ *	This is a common function invoked internally by both
+ *	crypto_encrypt_init() and crypto_decrypt_init().
+ *	Asynchronously submits a request for, or synchronously performs the
+ *	initialization of an encryption or a decryption operation.
+ *	When possible and applicable, will internally use the pre-expanded key
+ *	schedule from the context template, tmpl.
+ *	When complete and successful, 'ctxp' will contain a crypto_context_t
+ *	valid for later calls to encrypt_update() and encrypt_final(), or
+ *	decrypt_update() and decrypt_final().
+ *	The caller should hold a reference on the specified provider
+ *	descriptor before calling this function.
+ *
+ * Context:
+ *	Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ *	See comment in the beginning of the file.
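+ *
+ * Example (a sketch only; synchronous use, error handling omitted,
+ * and the mech/key/data variables are hypothetical):
+ *
+ *	crypto_context_t cc;
+ *
+ *	if (crypto_encrypt_init_prov(pd, sid, &mech, &key, tmpl,
+ *	    &cc, NULL) == CRYPTO_SUCCESS) {
+ *		(void) crypto_encrypt_update(cc, &pt, &ct, NULL);
+ *		(void) crypto_encrypt_final(cc, &ct, NULL);
+ *	}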
+ */ +static int +crypto_cipher_init_prov(crypto_provider_t provider, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_key_t *key, + crypto_spi_ctx_template_t tmpl, crypto_context_t *ctxp, + crypto_call_req_t *crq, crypto_func_group_t func) +{ + int error; + crypto_ctx_t *ctx; + kcf_req_params_t params; + kcf_provider_desc_t *pd = provider; + kcf_provider_desc_t *real_provider = pd; + + ASSERT(KCF_PROV_REFHELD(pd)); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + if (func == CRYPTO_FG_ENCRYPT) { + error = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_ENCRYPT); + } else { + error = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_DECRYPT); + } + + if (error != CRYPTO_SUCCESS) + return (error); + } + + /* Allocate and initialize the canonical context */ + if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) { + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + return (CRYPTO_HOST_MEMORY); + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech); + + if (func == CRYPTO_FG_ENCRYPT) + error = KCF_PROV_ENCRYPT_INIT(real_provider, ctx, + &lmech, key, tmpl, KCF_SWFP_RHNDL(crq)); + else { + ASSERT(func == CRYPTO_FG_DECRYPT); + + error = KCF_PROV_DECRYPT_INIT(real_provider, ctx, + &lmech, key, tmpl, KCF_SWFP_RHNDL(crq)); + } + KCF_PROV_INCRSTATS(pd, error); + + goto done; + } + + /* Check if context sharing is possible */ + if (pd->pd_prov_type == CRYPTO_HW_PROVIDER && + key->ck_format == CRYPTO_KEY_RAW && + KCF_CAN_SHARE_OPSTATE(pd, mech->cm_type)) { + kcf_context_t *tctxp = (kcf_context_t *)ctx; + kcf_provider_desc_t *tpd = NULL; + crypto_mech_info_t *sinfo; + + if ((kcf_get_sw_prov(mech->cm_type, &tpd, &tctxp->kc_mech, + B_FALSE) == CRYPTO_SUCCESS)) { + int tlen; + + sinfo = &(KCF_TO_PROV_MECHINFO(tpd, mech->cm_type)); + /* + * key->ck_length from the consumer is always in bits. + * We convert it to be in the same unit registered by + * the provider in order to do a comparison. + */ + if (sinfo->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BYTES) + tlen = key->ck_length >> 3; + else + tlen = key->ck_length; + /* + * Check if the software provider can support context + * sharing and support this key length. + */ + if ((sinfo->cm_mech_flags & CRYPTO_CAN_SHARE_OPSTATE) && + (tlen >= sinfo->cm_min_key_length) && + (tlen <= sinfo->cm_max_key_length)) { + ctx->cc_flags = CRYPTO_INIT_OPSTATE; + tctxp->kc_sw_prov_desc = tpd; + } else + KCF_PROV_REFRELE(tpd); + } + } + + if (func == CRYPTO_FG_ENCRYPT) { + KCF_WRAP_ENCRYPT_OPS_PARAMS(¶ms, KCF_OP_INIT, sid, + mech, key, NULL, NULL, tmpl); + } else { + ASSERT(func == CRYPTO_FG_DECRYPT); + KCF_WRAP_DECRYPT_OPS_PARAMS(¶ms, KCF_OP_INIT, sid, + mech, key, NULL, NULL, tmpl); + } + + error = kcf_submit_request(real_provider, ctx, crq, ¶ms, + B_FALSE); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + +done: + if ((error == CRYPTO_SUCCESS) || (error == CRYPTO_QUEUED)) + *ctxp = (crypto_context_t)ctx; + else { + /* Release the hold done in kcf_new_ctx(). */ + KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private); + } + + return (error); +} + +/* + * Same as crypto_cipher_init_prov(), but relies on the scheduler to pick + * an appropriate provider. 
See crypto_cipher_init_prov() comments for more + * details. + */ +static int +crypto_cipher_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, + crypto_call_req_t *crq, crypto_func_group_t func) +{ + int error; + kcf_mech_entry_t *me; + kcf_provider_desc_t *pd; + kcf_ctx_template_t *ctx_tmpl; + crypto_spi_ctx_template_t spi_ctx_tmpl = NULL; + kcf_prov_tried_t *list = NULL; + +retry: + /* pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error, + list, func, CHECK_RESTRICT(crq), 0)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + /* + * For SW providers, check the validity of the context template + * It is very rare that the generation number mis-matches, so + * is acceptable to fail here, and let the consumer recover by + * freeing this tmpl and create a new one for the key and new SW + * provider + */ + if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) && + ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) { + if (ctx_tmpl->ct_generation != me->me_gen_swprov) { + if (list != NULL) + kcf_free_triedlist(list); + KCF_PROV_REFRELE(pd); + return (CRYPTO_OLD_CTX_TEMPLATE); + } else { + spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl; + } + } + + error = crypto_cipher_init_prov(pd, pd->pd_sid, mech, key, + spi_ctx_tmpl, ctxp, crq, func); + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + + KCF_PROV_REFRELE(pd); + return (error); +} + +/* + * crypto_encrypt_prov() + * + * Arguments: + * pd: provider descriptor + * sid: session id + * mech: crypto_mechanism_t pointer. + * mech_type is a valid value previously returned by + * crypto_mech2id(); + * When the mech's parameter is not NULL, its definition depends + * on the standard definition of the mechanism. + * key: pointer to a crypto_key_t structure. + * plaintext: The message to be encrypted + * ciphertext: Storage for the encrypted message. The length needed + * depends on the mechanism, and the plaintext's size. + * tmpl: a crypto_ctx_template_t, opaque template of a context of an + * encryption with the 'mech' using 'key'. 'tmpl' is created by + * a previous call to crypto_create_ctx_template(). + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs a + * single-part encryption of 'plaintext' with the mechanism 'mech', using + * the key 'key'. + * When complete and successful, 'ciphertext' will contain the encrypted + * message. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. 
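+ *
+ * Example of preparing the plaintext argument (a sketch; 'msg' and
+ * 'msglen' are hypothetical and error handling is omitted):
+ *
+ *	crypto_data_t pt;
+ *
+ *	pt.cd_format = CRYPTO_DATA_RAW;
+ *	pt.cd_offset = 0;
+ *	pt.cd_length = msglen;
+ *	pt.cd_raw.iov_base = (char *)msg;
+ *	pt.cd_raw.iov_len = msglen;
+ *
+ * with 'ciphertext' set up the same way over the output buffer before
+ * calling crypto_encrypt_prov().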
+ */ +int +crypto_encrypt_prov(crypto_provider_t provider, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_data_t *plaintext, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_data_t *ciphertext, + crypto_call_req_t *crq) +{ + kcf_req_params_t params; + kcf_provider_desc_t *pd = provider; + kcf_provider_desc_t *real_provider = pd; + int error; + + ASSERT(KCF_PROV_REFHELD(pd)); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + error = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_ENCRYPT_ATOMIC); + + if (error != CRYPTO_SUCCESS) + return (error); + } + + KCF_WRAP_ENCRYPT_OPS_PARAMS(¶ms, KCF_OP_ATOMIC, sid, mech, key, + plaintext, ciphertext, tmpl); + + error = kcf_submit_request(real_provider, NULL, crq, ¶ms, B_FALSE); + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + + return (error); +} + +/* + * Same as crypto_encrypt_prov(), but relies on the scheduler to pick + * a provider. See crypto_encrypt_prov() for more details. + */ +int +crypto_encrypt(crypto_mechanism_t *mech, crypto_data_t *plaintext, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *ciphertext, + crypto_call_req_t *crq) +{ + int error; + kcf_mech_entry_t *me; + kcf_req_params_t params; + kcf_provider_desc_t *pd; + kcf_ctx_template_t *ctx_tmpl; + crypto_spi_ctx_template_t spi_ctx_tmpl = NULL; + kcf_prov_tried_t *list = NULL; + +retry: + /* pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error, + list, CRYPTO_FG_ENCRYPT_ATOMIC, CHECK_RESTRICT(crq), + plaintext->cd_length)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + /* + * For SW providers, check the validity of the context template + * It is very rare that the generation number mis-matches, so + * is acceptable to fail here, and let the consumer recover by + * freeing this tmpl and create a new one for the key and new SW + * provider + */ + if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) && + ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) { + if (ctx_tmpl->ct_generation != me->me_gen_swprov) { + if (list != NULL) + kcf_free_triedlist(list); + KCF_PROV_REFRELE(pd); + return (CRYPTO_OLD_CTX_TEMPLATE); + } else { + spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl; + } + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech); + + error = KCF_PROV_ENCRYPT_ATOMIC(pd, pd->pd_sid, &lmech, key, + plaintext, ciphertext, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq)); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_ENCRYPT_OPS_PARAMS(¶ms, KCF_OP_ATOMIC, pd->pd_sid, + mech, key, plaintext, ciphertext, spi_ctx_tmpl); + error = kcf_submit_request(pd, NULL, crq, ¶ms, B_FALSE); + } + + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + + KCF_PROV_REFRELE(pd); + return (error); +} + +/* + * crypto_encrypt_init_prov() + * + * Calls crypto_cipher_init_prov() to initialize an encryption operation. 
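+ *
+ * The context returned through 'ctxp' is then passed to
+ * crypto_encrypt_update() zero or more times; the operation is
+ * completed, and the context released, by crypto_encrypt_final().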
+ */ +int +crypto_encrypt_init_prov(crypto_provider_t pd, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, + crypto_call_req_t *crq) +{ + return (crypto_cipher_init_prov(pd, sid, mech, key, tmpl, ctxp, crq, + CRYPTO_FG_ENCRYPT)); +} + +/* + * crypto_encrypt_init() + * + * Calls crypto_cipher_init() to initialize an encryption operation + */ +int +crypto_encrypt_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, + crypto_call_req_t *crq) +{ + return (crypto_cipher_init(mech, key, tmpl, ctxp, crq, + CRYPTO_FG_ENCRYPT)); +} + +/* + * crypto_encrypt_update() + * + * Arguments: + * context: A crypto_context_t initialized by encrypt_init(). + * plaintext: The message part to be encrypted + * ciphertext: Storage for the encrypted message part. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs a + * part of an encryption operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_encrypt_update(crypto_context_t context, crypto_data_t *plaintext, + crypto_data_t *ciphertext, crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_ENCRYPT_UPDATE(pd, ctx, plaintext, + ciphertext, NULL); + KCF_PROV_INCRSTATS(pd, error); + return (error); + } + + /* Check if we should use a software provider for small jobs */ + if ((ctx->cc_flags & CRYPTO_USE_OPSTATE) && cr == NULL) { + if (plaintext->cd_length < kcf_ctx->kc_mech->me_threshold && + kcf_ctx->kc_sw_prov_desc != NULL && + KCF_IS_PROV_USABLE(kcf_ctx->kc_sw_prov_desc)) { + pd = kcf_ctx->kc_sw_prov_desc; + } + } + + KCF_WRAP_ENCRYPT_OPS_PARAMS(¶ms, KCF_OP_UPDATE, + ctx->cc_session, NULL, NULL, plaintext, ciphertext, NULL); + error = kcf_submit_request(pd, ctx, cr, ¶ms, B_FALSE); + + return (error); +} + +/* + * crypto_encrypt_final() + * + * Arguments: + * context: A crypto_context_t initialized by encrypt_init(). + * ciphertext: Storage for the last part of encrypted message + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs the + * final part of an encryption operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_encrypt_final(crypto_context_t context, crypto_data_t *ciphertext, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. 
 */
+	if (CHECK_FASTPATH(cr, pd)) {
+		error = KCF_PROV_ENCRYPT_FINAL(pd, ctx, ciphertext, NULL);
+		KCF_PROV_INCRSTATS(pd, error);
+	} else {
+		KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_FINAL,
+		    ctx->cc_session, NULL, NULL, NULL, ciphertext, NULL);
+		error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+	}
+
+	/* Release the hold done in kcf_new_ctx() during init step. */
+	KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+	return (error);
+}
+
+/*
+ * crypto_decrypt_prov()
+ *
+ * Arguments:
+ *	pd:	provider descriptor
+ *	sid:	session id
+ *	mech:	crypto_mechanism_t pointer.
+ *		mech_type is a valid value previously returned by
+ *		crypto_mech2id();
+ *		When the mech's parameter is not NULL, its definition depends
+ *		on the standard definition of the mechanism.
+ *	key:	pointer to a crypto_key_t structure.
+ *	ciphertext: The message to be decrypted
+ *	plaintext: Storage for the decrypted message. The length needed
+ *		depends on the mechanism, and the ciphertext's size.
+ *	tmpl:	a crypto_ctx_template_t, opaque template of a context of a
+ *		decryption with the 'mech' using 'key'. 'tmpl' is created by
+ *		a previous call to crypto_create_ctx_template().
+ *	cr:	crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ *	Asynchronously submits a request for, or synchronously performs a
+ *	single-part decryption of 'ciphertext' with the mechanism 'mech', using
+ *	the key 'key'.
+ *	When complete and successful, 'plaintext' will contain the decrypted
+ *	message.
+ *
+ * Context:
+ *	Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ *	See comment in the beginning of the file.
+ */
+int
+crypto_decrypt_prov(crypto_provider_t provider, crypto_session_id_t sid,
+    crypto_mechanism_t *mech, crypto_data_t *ciphertext, crypto_key_t *key,
+    crypto_ctx_template_t tmpl, crypto_data_t *plaintext,
+    crypto_call_req_t *crq)
+{
+	kcf_req_params_t params;
+	kcf_provider_desc_t *pd = provider;
+	kcf_provider_desc_t *real_provider = pd;
+	int rv;
+
+	ASSERT(KCF_PROV_REFHELD(pd));
+
+	if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+		rv = kcf_get_hardware_provider(mech->cm_type,
+		    CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd,
+		    &real_provider, CRYPTO_FG_DECRYPT_ATOMIC);
+
+		if (rv != CRYPTO_SUCCESS)
+			return (rv);
+	}
+
+	KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key,
+	    ciphertext, plaintext, tmpl);
+
+	rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE);
+	if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)
+		KCF_PROV_REFRELE(real_provider);
+
+	return (rv);
+}
+
+/*
+ * Same as crypto_decrypt_prov(), but relies on the KCF scheduler to
+ * choose a provider. See crypto_decrypt_prov() comments for more
+ * information.
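+ *
+ * A NULL crypto_call_req_t makes the call synchronous, for example
+ * (a sketch; variables hypothetical):
+ *
+ *	rv = crypto_decrypt(&mech, &ct, &key, tmpl, &pt, NULL);
+ *
+ * With a non-NULL request the call may instead return CRYPTO_QUEUED
+ * and deliver the result through the supplied callback.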
+ */
+int
+crypto_decrypt(crypto_mechanism_t *mech, crypto_data_t *ciphertext,
+    crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *plaintext,
+    crypto_call_req_t *crq)
+{
+	int error;
+	kcf_mech_entry_t *me;
+	kcf_req_params_t params;
+	kcf_provider_desc_t *pd;
+	kcf_ctx_template_t *ctx_tmpl;
+	crypto_spi_ctx_template_t spi_ctx_tmpl = NULL;
+	kcf_prov_tried_t *list = NULL;
+
+retry:
+	/* pd is returned held */
+	if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error,
+	    list, CRYPTO_FG_DECRYPT_ATOMIC, CHECK_RESTRICT(crq),
+	    ciphertext->cd_length)) == NULL) {
+		if (list != NULL)
+			kcf_free_triedlist(list);
+		return (error);
+	}
+
+	/*
+	 * For SW providers, check the validity of the context template.
+	 * It is very rare that the generation number mismatches, so it
+	 * is acceptable to fail here and let the consumer recover by
+	 * freeing this tmpl and creating a new one for the key and the
+	 * new SW provider.
+	 */
+	if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) &&
+	    ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) {
+		if (ctx_tmpl->ct_generation != me->me_gen_swprov) {
+			if (list != NULL)
+				kcf_free_triedlist(list);
+			KCF_PROV_REFRELE(pd);
+			return (CRYPTO_OLD_CTX_TEMPLATE);
+		} else {
+			spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl;
+		}
+	}
+
+	/* The fast path for SW providers. */
+	if (CHECK_FASTPATH(crq, pd)) {
+		crypto_mechanism_t lmech;
+
+		lmech = *mech;
+		KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech);
+
+		error = KCF_PROV_DECRYPT_ATOMIC(pd, pd->pd_sid, &lmech, key,
+		    ciphertext, plaintext, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq));
+		KCF_PROV_INCRSTATS(pd, error);
+	} else {
+		KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_ATOMIC, pd->pd_sid,
+		    mech, key, ciphertext, plaintext, spi_ctx_tmpl);
+		error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);
+	}
+
+	if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED &&
+	    IS_RECOVERABLE(error)) {
+		/* Add pd to the linked list of providers tried. */
+		if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL)
+			goto retry;
+	}
+
+	if (list != NULL)
+		kcf_free_triedlist(list);
+
+	KCF_PROV_REFRELE(pd);
+	return (error);
+}
+
+/*
+ * crypto_decrypt_init_prov()
+ *
+ * Calls crypto_cipher_init_prov() to initialize a decryption operation.
+ */
+int
+crypto_decrypt_init_prov(crypto_provider_t pd, crypto_session_id_t sid,
+    crypto_mechanism_t *mech, crypto_key_t *key,
+    crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+    crypto_call_req_t *crq)
+{
+	return (crypto_cipher_init_prov(pd, sid, mech, key, tmpl, ctxp, crq,
+	    CRYPTO_FG_DECRYPT));
+}
+
+/*
+ * crypto_decrypt_init()
+ *
+ * Calls crypto_cipher_init() to initialize a decryption operation.
+ */
+int
+crypto_decrypt_init(crypto_mechanism_t *mech, crypto_key_t *key,
+    crypto_ctx_template_t tmpl, crypto_context_t *ctxp,
+    crypto_call_req_t *crq)
+{
+	return (crypto_cipher_init(mech, key, tmpl, ctxp, crq,
+	    CRYPTO_FG_DECRYPT));
+}
+
+/*
+ * crypto_decrypt_update()
+ *
+ * Arguments:
+ *	context: A crypto_context_t initialized by decrypt_init().
+ *	ciphertext: The message part to be decrypted
+ *	plaintext: Storage for the decrypted message part.
+ *	cr:	crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ *	Asynchronously submits a request for, or synchronously performs a
+ *	part of a decryption operation.
+ *
+ * Context:
+ *	Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ *	See comment in the beginning of the file.
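+ *
+ * A multi-part consumer typically loops over its input (a sketch;
+ * more_input() is a hypothetical helper that refills 'ct'):
+ *
+ *	while (more_input(&ct))
+ *		(void) crypto_decrypt_update(cc, &ct, &pt, NULL);
+ *	(void) crypto_decrypt_final(cc, &pt, NULL);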
+ */ +int +crypto_decrypt_update(crypto_context_t context, crypto_data_t *ciphertext, + crypto_data_t *plaintext, crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_DECRYPT_UPDATE(pd, ctx, ciphertext, + plaintext, NULL); + KCF_PROV_INCRSTATS(pd, error); + return (error); + } + + /* Check if we should use a software provider for small jobs */ + if ((ctx->cc_flags & CRYPTO_USE_OPSTATE) && cr == NULL) { + if (ciphertext->cd_length < kcf_ctx->kc_mech->me_threshold && + kcf_ctx->kc_sw_prov_desc != NULL && + KCF_IS_PROV_USABLE(kcf_ctx->kc_sw_prov_desc)) { + pd = kcf_ctx->kc_sw_prov_desc; + } + } + + KCF_WRAP_DECRYPT_OPS_PARAMS(¶ms, KCF_OP_UPDATE, + ctx->cc_session, NULL, NULL, ciphertext, plaintext, NULL); + error = kcf_submit_request(pd, ctx, cr, ¶ms, B_FALSE); + + return (error); +} + +/* + * crypto_decrypt_final() + * + * Arguments: + * context: A crypto_context_t initialized by decrypt_init(). + * plaintext: Storage for the last part of the decrypted message + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs the + * final part of a decryption operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_decrypt_final(crypto_context_t context, crypto_data_t *plaintext, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_DECRYPT_FINAL(pd, ctx, plaintext, + NULL); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_DECRYPT_OPS_PARAMS(¶ms, KCF_OP_FINAL, + ctx->cc_session, NULL, NULL, NULL, plaintext, NULL); + error = kcf_submit_request(pd, ctx, cr, ¶ms, B_FALSE); + } + + /* Release the hold done in kcf_new_ctx() during init step. */ + KCF_CONTEXT_COND_RELEASE(error, kcf_ctx); + return (error); +} + +/* + * See comments for crypto_encrypt_update(). + */ +int +crypto_encrypt_single(crypto_context_t context, crypto_data_t *plaintext, + crypto_data_t *ciphertext, crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + /* The fast path for SW providers. 
 */
+	if (CHECK_FASTPATH(cr, pd)) {
+		error = KCF_PROV_ENCRYPT(pd, ctx, plaintext,
+		    ciphertext, NULL);
+		KCF_PROV_INCRSTATS(pd, error);
+	} else {
+		KCF_WRAP_ENCRYPT_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+		    NULL, NULL, plaintext, ciphertext, NULL);
+		error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+	}
+
+	/* Release the hold done in kcf_new_ctx() during init step. */
+	KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+	return (error);
+}
+
+/*
+ * See comments for crypto_decrypt_update().
+ */
+int
+crypto_decrypt_single(crypto_context_t context, crypto_data_t *ciphertext,
+    crypto_data_t *plaintext, crypto_call_req_t *cr)
+{
+	crypto_ctx_t *ctx = (crypto_ctx_t *)context;
+	kcf_context_t *kcf_ctx;
+	kcf_provider_desc_t *pd;
+	int error;
+	kcf_req_params_t params;
+
+	if ((ctx == NULL) ||
+	    ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) ||
+	    ((pd = kcf_ctx->kc_prov_desc) == NULL)) {
+		return (CRYPTO_INVALID_CONTEXT);
+	}
+
+	/* The fast path for SW providers. */
+	if (CHECK_FASTPATH(cr, pd)) {
+		error = KCF_PROV_DECRYPT(pd, ctx, ciphertext,
+		    plaintext, NULL);
+		KCF_PROV_INCRSTATS(pd, error);
+	} else {
+		KCF_WRAP_DECRYPT_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid,
+		    NULL, NULL, ciphertext, plaintext, NULL);
+		error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE);
+	}
+
+	/* Release the hold done in kcf_new_ctx() during init step. */
+	KCF_CONTEXT_COND_RELEASE(error, kcf_ctx);
+	return (error);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_encrypt_prov);
+EXPORT_SYMBOL(crypto_encrypt);
+EXPORT_SYMBOL(crypto_encrypt_init_prov);
+EXPORT_SYMBOL(crypto_encrypt_init);
+EXPORT_SYMBOL(crypto_encrypt_update);
+EXPORT_SYMBOL(crypto_encrypt_final);
+EXPORT_SYMBOL(crypto_decrypt_prov);
+EXPORT_SYMBOL(crypto_decrypt);
+EXPORT_SYMBOL(crypto_decrypt_init_prov);
+EXPORT_SYMBOL(crypto_decrypt_init);
+EXPORT_SYMBOL(crypto_decrypt_update);
+EXPORT_SYMBOL(crypto_decrypt_final);
+EXPORT_SYMBOL(crypto_encrypt_single);
+EXPORT_SYMBOL(crypto_decrypt_single);
+#endif
diff --git a/module/icp/api/kcf_ctxops.c b/module/icp/api/kcf_ctxops.c
new file mode 100644
index 000000000000..21b0977d3634
--- /dev/null
+++ b/module/icp/api/kcf_ctxops.c
@@ -0,0 +1,151 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Crypto contexts manipulation routines
+ */
+
+/*
+ * crypto_create_ctx_template()
+ *
+ * Arguments:
+ *
+ *	mech:	crypto_mechanism_t pointer.
+ *		mech_type is a valid value previously returned by
+ *		crypto_mech2id();
+ *		When the mech's parameter is not NULL, its definition depends
+ *		on the standard definition of the mechanism.
+ *	key:	pointer to a crypto_key_t structure.
+ *	ptmpl:	storage for the opaque crypto_ctx_template_t, allocated and
+ *		initialized by the software provider this routine is
+ *		dispatched to.
+ *	kmflag: KM_SLEEP/KM_NOSLEEP mem. alloc. flag.
+ *
+ * Description:
+ *	Redirects the call to the software provider of the specified
+ *	mechanism. That provider will allocate and pre-compute/pre-expand
+ *	the context template, reusable by later calls to crypto_xxx_init().
+ *	The size and address of that provider context template are stored
+ *	in an internal structure, kcf_ctx_template_t. The address of that
+ *	structure is given back to the caller in *ptmpl.
+ *
+ * Context:
+ *	Process or interrupt.
+ *
+ * Returns:
+ *	CRYPTO_SUCCESS when the context template is successfully created.
+ *	CRYPTO_HOST_MEMORY: mem alloc failure
+ *	CRYPTO_ARGUMENTS_BAD: NULL storage for the ctx template.
+ *	CRYPTO_MECHANISM_INVALID: invalid mechanism 'mech'.
+ */
+int
+crypto_create_ctx_template(crypto_mechanism_t *mech, crypto_key_t *key,
+    crypto_ctx_template_t *ptmpl, int kmflag)
+{
+	int error;
+	kcf_mech_entry_t *me;
+	kcf_provider_desc_t *pd;
+	kcf_ctx_template_t *ctx_tmpl;
+	crypto_mechanism_t prov_mech;
+
+	/* A few args validation */
+
+	if (ptmpl == NULL)
+		return (CRYPTO_ARGUMENTS_BAD);
+
+	if (mech == NULL)
+		return (CRYPTO_MECHANISM_INVALID);
+
+	error = kcf_get_sw_prov(mech->cm_type, &pd, &me, B_TRUE);
+	if (error != CRYPTO_SUCCESS)
+		return (error);
+
+	if ((ctx_tmpl = (kcf_ctx_template_t *)kmem_alloc(
+	    sizeof (kcf_ctx_template_t), kmflag)) == NULL) {
+		KCF_PROV_REFRELE(pd);
+		return (CRYPTO_HOST_MEMORY);
+	}
+
+	/* Pass a mechtype that the provider understands */
+	prov_mech.cm_type = KCF_TO_PROV_MECHNUM(pd, mech->cm_type);
+	prov_mech.cm_param = mech->cm_param;
+	prov_mech.cm_param_len = mech->cm_param_len;
+
+	error = KCF_PROV_CREATE_CTX_TEMPLATE(pd, &prov_mech, key,
+	    &(ctx_tmpl->ct_prov_tmpl), &(ctx_tmpl->ct_size), KCF_RHNDL(kmflag));
+
+	if (error == CRYPTO_SUCCESS) {
+		ctx_tmpl->ct_generation = me->me_gen_swprov;
+		*ptmpl = ctx_tmpl;
+	} else {
+		kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t));
+	}
+	KCF_PROV_REFRELE(pd);
+
+	return (error);
+}
+
+/*
+ * crypto_destroy_ctx_template()
+ *
+ * Arguments:
+ *
+ *	tmpl:	an opaque crypto_ctx_template_t previously created by
+ *		crypto_create_ctx_template()
+ *
+ * Description:
+ *	Frees the embedded crypto_spi_ctx_template_t, then the
+ *	kcf_ctx_template_t.
+ *
+ * Context:
+ *	Process or interrupt.
+ *
+ */
+void
+crypto_destroy_ctx_template(crypto_ctx_template_t tmpl)
+{
+	kcf_ctx_template_t *ctx_tmpl = (kcf_ctx_template_t *)tmpl;
+
+	if (ctx_tmpl == NULL)
+		return;
+
+	ASSERT(ctx_tmpl->ct_prov_tmpl != NULL);
+
+	bzero(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size);
+	kmem_free(ctx_tmpl->ct_prov_tmpl, ctx_tmpl->ct_size);
+	kmem_free(ctx_tmpl, sizeof (kcf_ctx_template_t));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(crypto_create_ctx_template);
+EXPORT_SYMBOL(crypto_destroy_ctx_template);
+#endif
diff --git a/module/icp/api/kcf_digest.c b/module/icp/api/kcf_digest.c
new file mode 100644
index 000000000000..aa68d69bc162
--- /dev/null
+++ b/module/icp/api/kcf_digest.c
@@ -0,0 +1,491 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/sched_impl.h>
+
+/*
+ * Message digest routines
+ */
+
+/*
+ * The following are the possible returned values common to all the routines
+ * below. The applicability of some of these return values depends on the
+ * presence of the arguments.
+ *
+ *	CRYPTO_SUCCESS: The operation completed successfully.
+ *	CRYPTO_QUEUED: A request was submitted successfully. The callback
+ *	routine will be called when the operation is done.
+ *	CRYPTO_MECHANISM_INVALID or CRYPTO_INVALID_MECH_PARAM
+ *	for problems with the 'mech'.
+ *	CRYPTO_INVALID_DATA for bogus 'data'
+ *	CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work.
+ *	CRYPTO_INVALID_CONTEXT: Not a valid context.
+ *	CRYPTO_BUSY:	Cannot process the request now. Schedule a
+ *	crypto_bufcall(), or try later.
+ *	CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED:
+ *	No provider is capable of a function or a mechanism.
+ */
+
+
+/*
+ * crypto_digest_prov()
+ *
+ * Arguments:
+ *	pd:	pointer to the descriptor of the provider to use for this
+ *		operation.
+ *	sid:	provider session id.
+ *	mech:	crypto_mechanism_t pointer.
+ *		mech_type is a valid value previously returned by
+ *		crypto_mech2id();
+ *		When the mech's parameter is not NULL, its definition depends
+ *		on the standard definition of the mechanism.
+ *	data:	The message to be digested.
+ *	digest:	Storage for the digest. The length needed depends on the
+ *		mechanism.
+ *	cr:	crypto_call_req_t calling conditions and call back info.
+ *
+ * Description:
+ *	Asynchronously submits a request for, or synchronously performs the
+ *	digesting operation of 'data' on the specified
+ *	provider with the specified session.
+ *	When complete and successful, 'digest' will contain the digest value.
+ *	The caller should hold a reference on the specified provider
+ *	descriptor before calling this function.
+ *
+ * Context:
+ *	Process or interrupt, according to the semantics dictated by the 'cr'.
+ *
+ * Returns:
+ *	See comment in the beginning of the file.
+ */
+int
+crypto_digest_prov(crypto_provider_t provider, crypto_session_id_t sid,
+    crypto_mechanism_t *mech, crypto_data_t *data, crypto_data_t *digest,
+    crypto_call_req_t *crq)
+{
+	kcf_req_params_t params;
+	kcf_provider_desc_t *pd = provider;
+	kcf_provider_desc_t *real_provider = pd;
+	int rv;
+
+	ASSERT(KCF_PROV_REFHELD(pd));
+
+	if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) {
+		rv = kcf_get_hardware_provider(mech->cm_type,
+		    CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq),
+		    pd, &real_provider, CRYPTO_FG_DIGEST_ATOMIC);
+
+		if (rv != CRYPTO_SUCCESS)
+			return (rv);
+	}
+	KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, NULL,
+	    data, digest);
+
+	/* no crypto context to carry between multiple parts.
*/ + rv = kcf_submit_request(real_provider, NULL, crq, ¶ms, B_FALSE); + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + + return (rv); +} + + +/* + * Same as crypto_digest_prov(), but relies on the KCF scheduler to + * choose a provider. See crypto_digest_prov() comments for more information. + */ +int +crypto_digest(crypto_mechanism_t *mech, crypto_data_t *data, + crypto_data_t *digest, crypto_call_req_t *crq) +{ + int error; + kcf_provider_desc_t *pd; + kcf_req_params_t params; + kcf_prov_tried_t *list = NULL; + +retry: + /* The pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, NULL, &error, list, + CRYPTO_FG_DIGEST_ATOMIC, CHECK_RESTRICT(crq), + data->cd_length)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech); + error = KCF_PROV_DIGEST_ATOMIC(pd, pd->pd_sid, &lmech, data, + digest, KCF_SWFP_RHNDL(crq)); + KCF_PROV_INCRSTATS(pd, error); + } else { + if (pd->pd_prov_type == CRYPTO_HW_PROVIDER && + (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) && + (data->cd_length > pd->pd_hash_limit)) { + error = CRYPTO_BUFFER_TOO_BIG; + } else { + KCF_WRAP_DIGEST_OPS_PARAMS(¶ms, KCF_OP_ATOMIC, + pd->pd_sid, mech, NULL, data, digest); + + /* no crypto context to carry between multiple parts. */ + error = kcf_submit_request(pd, NULL, crq, ¶ms, + B_FALSE); + } + } + + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + + KCF_PROV_REFRELE(pd); + return (error); +} + +/* + * crypto_digest_init_prov() + * + * pd: pointer to the descriptor of the provider to use for this + * operation. + * sid: provider session id. + * mech: crypto_mechanism_t pointer. + * mech_type is a valid value previously returned by + * crypto_mech2id(); + * When the mech's parameter is not NULL, its definition depends + * on the standard definition of the mechanism. + * ctxp: Pointer to a crypto_context_t. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs the + * initialization of a message digest operation on the specified + * provider with the specified session. + * When complete and successful, 'ctxp' will contain a crypto_context_t + * valid for later calls to digest_update() and digest_final(). + * The caller should hold a reference on the specified provider + * descriptor before calling this function. 
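+ *
+ * Typical multi-part use (a sketch; synchronous, error handling
+ * omitted, variables hypothetical):
+ *
+ *	crypto_context_t cc;
+ *
+ *	if (crypto_digest_init_prov(pd, sid, &mech, &cc,
+ *	    NULL) == CRYPTO_SUCCESS) {
+ *		(void) crypto_digest_update(cc, &chunk, NULL);
+ *		(void) crypto_digest_final(cc, &digest, NULL);
+ *	}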
+ */ +int +crypto_digest_init_prov(crypto_provider_t provider, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_context_t *ctxp, crypto_call_req_t *crq) +{ + int error; + crypto_ctx_t *ctx; + kcf_req_params_t params; + kcf_provider_desc_t *pd = provider; + kcf_provider_desc_t *real_provider = pd; + + ASSERT(KCF_PROV_REFHELD(pd)); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + error = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_DIGEST); + + if (error != CRYPTO_SUCCESS) + return (error); + } + + /* Allocate and initialize the canonical context */ + if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) { + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + return (CRYPTO_HOST_MEMORY); + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech); + error = KCF_PROV_DIGEST_INIT(real_provider, ctx, &lmech, + KCF_SWFP_RHNDL(crq)); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_DIGEST_OPS_PARAMS(¶ms, KCF_OP_INIT, sid, + mech, NULL, NULL, NULL); + error = kcf_submit_request(real_provider, ctx, crq, ¶ms, + B_FALSE); + } + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + + if ((error == CRYPTO_SUCCESS) || (error == CRYPTO_QUEUED)) + *ctxp = (crypto_context_t)ctx; + else { + /* Release the hold done in kcf_new_ctx(). */ + KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private); + } + + return (error); +} + +/* + * Same as crypto_digest_init_prov(), but relies on the KCF scheduler + * to choose a provider. See crypto_digest_init_prov() comments for + * more information. + */ +int +crypto_digest_init(crypto_mechanism_t *mech, crypto_context_t *ctxp, + crypto_call_req_t *crq) +{ + int error; + kcf_provider_desc_t *pd; + kcf_prov_tried_t *list = NULL; + +retry: + /* The pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, NULL, &error, + list, CRYPTO_FG_DIGEST, CHECK_RESTRICT(crq), 0)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + if (pd->pd_prov_type == CRYPTO_HW_PROVIDER && + (pd->pd_flags & CRYPTO_HASH_NO_UPDATE)) { + /* + * The hardware provider has limited digest support. + * So, we fallback early here to using a software provider. + * + * XXX - need to enhance to do the fallback later in + * crypto_digest_update() if the size of accumulated input data + * exceeds the maximum size digestable by hardware provider. + */ + error = CRYPTO_BUFFER_TOO_BIG; + } else { + error = crypto_digest_init_prov(pd, pd->pd_sid, + mech, ctxp, crq); + } + + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + KCF_PROV_REFRELE(pd); + return (error); +} + +/* + * crypto_digest_update() + * + * Arguments: + * context: A crypto_context_t initialized by digest_init(). + * data: The part of message to be digested. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs a + * part of a message digest operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. 
+ * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_digest_update(crypto_context_t context, crypto_data_t *data, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_DIGEST_UPDATE(pd, ctx, data, NULL); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_DIGEST_OPS_PARAMS(¶ms, KCF_OP_UPDATE, + ctx->cc_session, NULL, NULL, data, NULL); + error = kcf_submit_request(pd, ctx, cr, ¶ms, B_FALSE); + } + + return (error); +} + +/* + * crypto_digest_final() + * + * Arguments: + * context: A crypto_context_t initialized by digest_init(). + * digest: The storage for the digest. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs the + * final part of a message digest operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_digest_final(crypto_context_t context, crypto_data_t *digest, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_DIGEST_FINAL(pd, ctx, digest, NULL); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_DIGEST_OPS_PARAMS(¶ms, KCF_OP_FINAL, + ctx->cc_session, NULL, NULL, NULL, digest); + error = kcf_submit_request(pd, ctx, cr, ¶ms, B_FALSE); + } + + /* Release the hold done in kcf_new_ctx() during init step. */ + KCF_CONTEXT_COND_RELEASE(error, kcf_ctx); + return (error); +} + +/* + * Performs a digest update on the specified key. Note that there is + * no k-API crypto_digest_key() equivalent of this function. + */ +int +crypto_digest_key_prov(crypto_context_t context, crypto_key_t *key, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_DIGEST_KEY(pd, ctx, key, NULL); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_DIGEST_OPS_PARAMS(¶ms, KCF_OP_DIGEST_KEY, + ctx->cc_session, NULL, key, NULL, NULL); + error = kcf_submit_request(pd, ctx, cr, ¶ms, B_FALSE); + } + + return (error); +} + +/* + * See comments for crypto_digest_update() and crypto_digest_final(). 
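+ * Completes a previously initialized digest context in a single call,
+ * combining the update and final steps. Like the _final() routines it
+ * releases the context on completion, so the context must not be
+ * reused afterwards.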
+ */ +int +crypto_digest_single(crypto_context_t context, crypto_data_t *data, + crypto_data_t *digest, crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_DIGEST(pd, ctx, data, digest, NULL); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid, + NULL, NULL, data, digest); + error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE); + } + + /* Release the hold done in kcf_new_ctx() during init step. */ + KCF_CONTEXT_COND_RELEASE(error, kcf_ctx); + return (error); +}
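+ +/* + * Illustrative usage sketch (an editorial addition, not part of the + * original file): a consumer normally drives the entry points above as + * an init/update/final sequence. The buffer names and digest size below + * are assumptions, and passing a NULL crypto_call_req_t selects the + * synchronous path: + * + * crypto_mechanism_t mech = { 0 }; + * crypto_context_t ctx; + * crypto_data_t in, digest; + * uint8_t out[32]; + * int rv; + * + * mech.cm_type = crypto_mech2id(SUN_CKM_SHA256); + * if (mech.cm_type == CRYPTO_MECH_INVALID) + * return (CRYPTO_MECHANISM_INVALID); + * + * in.cd_format = CRYPTO_DATA_RAW; + * in.cd_offset = 0; + * in.cd_length = buflen; + * in.cd_raw.iov_base = (char *)buf; + * in.cd_raw.iov_len = buflen; + * + * digest.cd_format = CRYPTO_DATA_RAW; + * digest.cd_offset = 0; + * digest.cd_length = sizeof (out); + * digest.cd_raw.iov_base = (char *)out; + * digest.cd_raw.iov_len = sizeof (out); + * + * if ((rv = crypto_digest_init(&mech, &ctx, NULL)) != CRYPTO_SUCCESS) + * return (rv); + * if ((rv = crypto_digest_update(ctx, &in, NULL)) != CRYPTO_SUCCESS) { + * crypto_cancel_ctx(ctx); + * return (rv); + * } + * return (crypto_digest_final(ctx, &digest, NULL)); + * + * crypto_cancel_ctx() is the illumos KCF context-teardown routine and is + * assumed to be available in this port for dropping the context hold + * after a failed update. + */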
+ +#if defined(_KERNEL) +EXPORT_SYMBOL(crypto_digest_prov); +EXPORT_SYMBOL(crypto_digest); +EXPORT_SYMBOL(crypto_digest_init_prov); +EXPORT_SYMBOL(crypto_digest_init); +EXPORT_SYMBOL(crypto_digest_update); +EXPORT_SYMBOL(crypto_digest_final); +EXPORT_SYMBOL(crypto_digest_key_prov); +EXPORT_SYMBOL(crypto_digest_single); +#endif diff --git a/module/icp/api/kcf_mac.c b/module/icp/api/kcf_mac.c new file mode 100644 index 000000000000..a7722d8f914c --- /dev/null +++ b/module/icp/api/kcf_mac.c @@ -0,0 +1,645 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Message authentication code routines. + */ + +/* + * The following are the possible returned values common to all the routines + * below. The applicability of some of these return values depends on the + * presence of the arguments. + * + * CRYPTO_SUCCESS: The operation completed successfully. + * CRYPTO_QUEUED: A request was submitted successfully. The callback + * routine will be called when the operation is done. + * CRYPTO_INVALID_MECH_NUMBER, CRYPTO_INVALID_MECH_PARAM, or + * CRYPTO_INVALID_MECH for problems with the 'mech'. + * CRYPTO_INVALID_DATA for bogus 'data' + * CRYPTO_HOST_MEMORY for failure to allocate memory to handle this work. + * CRYPTO_INVALID_CONTEXT: Not a valid context. + * CRYPTO_BUSY: Cannot process the request now. Schedule a + * crypto_bufcall(), or try later. + * CRYPTO_NOT_SUPPORTED and CRYPTO_MECH_NOT_SUPPORTED: No provider is + * capable of a function or a mechanism. + * CRYPTO_INVALID_KEY: bogus 'key' argument. + * CRYPTO_INVALID_MAC: bogus 'mac' argument. + */ + +/* + * crypto_mac_prov() + * + * Arguments: + * mech: crypto_mechanism_t pointer. + * mech_type is a valid value previously returned by + * crypto_mech2id(); + * When the mech's parameter is not NULL, its definition depends + * on the standard definition of the mechanism. + * key: pointer to a crypto_key_t structure. + * data: The message to compute the MAC for. + * mac: Storage for the MAC. The length needed depends on the mechanism. + * tmpl: a crypto_ctx_template_t, opaque template of a context of a + * MAC with the 'mech' using 'key'. 'tmpl' is created by + * a previous call to crypto_create_ctx_template(). + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs a + * single-part message authentication of 'data' with the mechanism + * 'mech', using the key 'key', on the specified provider with + * the specified session id. + * When complete and successful, 'mac' will contain the message + * authentication code. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'crq'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_mac_prov(crypto_provider_t provider, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_data_t *data, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_data_t *mac, crypto_call_req_t *crq) +{ + kcf_req_params_t params; + kcf_provider_desc_t *pd = provider; + kcf_provider_desc_t *real_provider = pd; + int rv; + + ASSERT(KCF_PROV_REFHELD(pd)); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + rv = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_MAC_ATOMIC); + + if (rv != CRYPTO_SUCCESS) + return (rv); + } + + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech, key, + data, mac, tmpl); + rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE); + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + + return (rv); +} + +/* + * Same as crypto_mac_prov(), but relies on the KCF scheduler to choose + * a provider. See crypto_mac_prov() comments for more information. + */ +int +crypto_mac(crypto_mechanism_t *mech, crypto_data_t *data, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac, + crypto_call_req_t *crq) +{ + int error; + kcf_mech_entry_t *me; + kcf_req_params_t params; + kcf_provider_desc_t *pd; + kcf_ctx_template_t *ctx_tmpl; + crypto_spi_ctx_template_t spi_ctx_tmpl = NULL; + kcf_prov_tried_t *list = NULL; + +retry: + /* The pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error, + list, CRYPTO_FG_MAC_ATOMIC, CHECK_RESTRICT(crq), + data->cd_length)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + /* + * For SW providers, check the validity of the context template. + * It is very rare that the generation number mismatches, so it + * is acceptable to fail here, and let the consumer recover by + * freeing this tmpl and creating a new one for the key and new SW + * provider. + */ + if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) && + ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) { + if (ctx_tmpl->ct_generation != me->me_gen_swprov) { + if (list != NULL) + kcf_free_triedlist(list); + KCF_PROV_REFRELE(pd); + return (CRYPTO_OLD_CTX_TEMPLATE); + } else { + spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl; + } + } + + /* The fast path for SW providers.
*/ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech); + + error = KCF_PROV_MAC_ATOMIC(pd, pd->pd_sid, &lmech, key, data, + mac, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq)); + KCF_PROV_INCRSTATS(pd, error); + } else { + if (pd->pd_prov_type == CRYPTO_HW_PROVIDER && + (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) && + (data->cd_length > pd->pd_hash_limit)) { + /* + * XXX - We need a check to see if this is indeed + * a HMAC. So far, all kernel clients use + * this interface only for HMAC. So, this is fine + * for now. + */ + error = CRYPTO_BUFFER_TOO_BIG; + } else { + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_ATOMIC, + pd->pd_sid, mech, key, data, mac, spi_ctx_tmpl); + + error = kcf_submit_request(pd, NULL, crq, &params, + KCF_ISDUALREQ(crq)); + } + } + + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + + KCF_PROV_REFRELE(pd); + return (error); +} + +/* + * Single part operation to compute the MAC corresponding to the specified + * 'data' and to verify that it matches the MAC specified by 'mac'. + * The other arguments are the same as the function crypto_mac_prov(). + */ +int +crypto_mac_verify_prov(crypto_provider_t provider, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_data_t *data, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_data_t *mac, crypto_call_req_t *crq) +{ + kcf_req_params_t params; + kcf_provider_desc_t *pd = provider; + kcf_provider_desc_t *real_provider = pd; + int rv; + + ASSERT(KCF_PROV_REFHELD(pd)); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + rv = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_MAC_ATOMIC); + + if (rv != CRYPTO_SUCCESS) + return (rv); + } + + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_MAC_VERIFY_ATOMIC, sid, mech, + key, data, mac, tmpl); + rv = kcf_submit_request(real_provider, NULL, crq, &params, B_FALSE); + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + + return (rv); +} + +/* + * Same as crypto_mac_verify_prov(), but relies on the KCF scheduler to choose + * a provider. See crypto_mac_verify_prov() comments for more information.
+ */ +int +crypto_mac_verify(crypto_mechanism_t *mech, crypto_data_t *data, + crypto_key_t *key, crypto_ctx_template_t tmpl, crypto_data_t *mac, + crypto_call_req_t *crq) +{ + int error; + kcf_mech_entry_t *me; + kcf_req_params_t params; + kcf_provider_desc_t *pd; + kcf_ctx_template_t *ctx_tmpl; + crypto_spi_ctx_template_t spi_ctx_tmpl = NULL; + kcf_prov_tried_t *list = NULL; + +retry: + /* The pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error, + list, CRYPTO_FG_MAC_ATOMIC, CHECK_RESTRICT(crq), + data->cd_length)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + /* + * For SW providers, check the validity of the context template. + * It is very rare that the generation number mismatches, so it + * is acceptable to fail here, and let the consumer recover by + * freeing this tmpl and creating a new one for the key and new SW + * provider. + */ + if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) && + ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) { + if (ctx_tmpl->ct_generation != me->me_gen_swprov) { + if (list != NULL) + kcf_free_triedlist(list); + KCF_PROV_REFRELE(pd); + return (CRYPTO_OLD_CTX_TEMPLATE); + } else { + spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl; + } + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, pd, &lmech); + + error = KCF_PROV_MAC_VERIFY_ATOMIC(pd, pd->pd_sid, &lmech, key, + data, mac, spi_ctx_tmpl, KCF_SWFP_RHNDL(crq)); + KCF_PROV_INCRSTATS(pd, error); + } else { + if (pd->pd_prov_type == CRYPTO_HW_PROVIDER && + (pd->pd_flags & CRYPTO_HASH_NO_UPDATE) && + (data->cd_length > pd->pd_hash_limit)) { + /* see comments in crypto_mac() */ + error = CRYPTO_BUFFER_TOO_BIG; + } else { + KCF_WRAP_MAC_OPS_PARAMS(&params, + KCF_OP_MAC_VERIFY_ATOMIC, pd->pd_sid, mech, + key, data, mac, spi_ctx_tmpl); + + error = kcf_submit_request(pd, NULL, crq, &params, + KCF_ISDUALREQ(crq)); + } + } + + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + + KCF_PROV_REFRELE(pd); + return (error); +}
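+ +/* + * Illustrative usage sketch (an editorial addition, not part of the + * original file): a one-shot caller of the two atomic entry points above, + * assuming 'key', 'data' and 'mac' have already been filled in as raw + * crypto_key_t/crypto_data_t objects (for CRYPTO_KEY_RAW keys, ck_length + * is expressed in bits): + * + * crypto_mechanism_t mech = { 0 }; + * int rv; + * + * mech.cm_type = crypto_mech2id(SUN_CKM_SHA256_HMAC); + * if (mech.cm_type == CRYPTO_MECH_INVALID) + * return (CRYPTO_MECHANISM_INVALID); + * + * rv = crypto_mac(&mech, &data, &key, NULL, &mac, NULL); + * + * crypto_mac() computes into 'mac'; calling crypto_mac_verify() with the + * same arguments instead compares the computed value against the one the + * caller supplies in 'mac'. + */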
+ +/* + * crypto_mac_init_prov() + * + * Arguments: + * pd: pointer to the descriptor of the provider to use for this + * operation. + * sid: provider session id. + * mech: crypto_mechanism_t pointer. + * mech_type is a valid value previously returned by + * crypto_mech2id(); + * When the mech's parameter is not NULL, its definition depends + * on the standard definition of the mechanism. + * key: pointer to a crypto_key_t structure. + * tmpl: a crypto_ctx_template_t, opaque template of a context of a + * MAC with the 'mech' using 'key'. 'tmpl' is created by + * a previous call to crypto_create_ctx_template(). + * ctxp: Pointer to a crypto_context_t. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs the + * initialization of a MAC operation on the specified provider with + * the specified session. + * When possible and applicable, will internally use the pre-computed MAC + * context from the context template, tmpl. + * When complete and successful, 'ctxp' will contain a crypto_context_t + * valid for later calls to mac_update() and mac_final(). + * The caller should hold a reference on the specified provider + * descriptor before calling this function. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_mac_init_prov(crypto_provider_t provider, crypto_session_id_t sid, + crypto_mechanism_t *mech, crypto_key_t *key, crypto_spi_ctx_template_t tmpl, + crypto_context_t *ctxp, crypto_call_req_t *crq) +{ + int rv; + crypto_ctx_t *ctx; + kcf_req_params_t params; + kcf_provider_desc_t *pd = provider; + kcf_provider_desc_t *real_provider = pd; + + ASSERT(KCF_PROV_REFHELD(pd)); + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + rv = kcf_get_hardware_provider(mech->cm_type, + CRYPTO_MECH_INVALID, CHECK_RESTRICT(crq), pd, + &real_provider, CRYPTO_FG_MAC); + + if (rv != CRYPTO_SUCCESS) + return (rv); + } + + /* Allocate and initialize the canonical context */ + if ((ctx = kcf_new_ctx(crq, real_provider, sid)) == NULL) { + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + return (CRYPTO_HOST_MEMORY); + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(crq, pd)) { + crypto_mechanism_t lmech; + + lmech = *mech; + KCF_SET_PROVIDER_MECHNUM(mech->cm_type, real_provider, &lmech); + rv = KCF_PROV_MAC_INIT(real_provider, ctx, &lmech, key, tmpl, + KCF_SWFP_RHNDL(crq)); + KCF_PROV_INCRSTATS(pd, rv); + } else { + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_INIT, sid, mech, key, + NULL, NULL, tmpl); + rv = kcf_submit_request(real_provider, ctx, crq, &params, + B_FALSE); + } + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) + KCF_PROV_REFRELE(real_provider); + + if ((rv == CRYPTO_SUCCESS) || (rv == CRYPTO_QUEUED)) + *ctxp = (crypto_context_t)ctx; + else { + /* Release the hold done in kcf_new_ctx(). */ + KCF_CONTEXT_REFRELE((kcf_context_t *)ctx->cc_framework_private); + } + + return (rv); +} + +/* + * Same as crypto_mac_init_prov(), but relies on the KCF scheduler to + * choose a provider. See crypto_mac_init_prov() comments for more + * information. + */ +int +crypto_mac_init(crypto_mechanism_t *mech, crypto_key_t *key, + crypto_ctx_template_t tmpl, crypto_context_t *ctxp, + crypto_call_req_t *crq) +{ + int error; + kcf_mech_entry_t *me; + kcf_provider_desc_t *pd; + kcf_ctx_template_t *ctx_tmpl; + crypto_spi_ctx_template_t spi_ctx_tmpl = NULL; + kcf_prov_tried_t *list = NULL; + +retry: + /* The pd is returned held */ + if ((pd = kcf_get_mech_provider(mech->cm_type, &me, &error, + list, CRYPTO_FG_MAC, CHECK_RESTRICT(crq), 0)) == NULL) { + if (list != NULL) + kcf_free_triedlist(list); + return (error); + } + + /* + * For SW providers, check the validity of the context template. + * It is very rare that the generation number mismatches, so it + * is acceptable to fail here, and let the consumer recover by + * freeing this tmpl and creating a new one for the key and new SW + * provider. + */ + + if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) && + ((ctx_tmpl = (kcf_ctx_template_t *)tmpl) != NULL)) { + if (ctx_tmpl->ct_generation != me->me_gen_swprov) { + if (list != NULL) + kcf_free_triedlist(list); + KCF_PROV_REFRELE(pd); + return (CRYPTO_OLD_CTX_TEMPLATE); + } else { + spi_ctx_tmpl = ctx_tmpl->ct_prov_tmpl; + } + } + + if (pd->pd_prov_type == CRYPTO_HW_PROVIDER && + (pd->pd_flags & CRYPTO_HASH_NO_UPDATE)) { + /* + * The hardware provider has limited HMAC support. + * So, we fall back early here to using a software provider.
+ * + * XXX - need to enhance to do the fallback later in + * crypto_mac_update() if the size of accumulated input data + * exceeds the maximum size digestible by the hardware provider. + */ + error = CRYPTO_BUFFER_TOO_BIG; + } else { + error = crypto_mac_init_prov(pd, pd->pd_sid, mech, key, + spi_ctx_tmpl, ctxp, crq); + } + if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED && + IS_RECOVERABLE(error)) { + /* Add pd to the linked list of providers tried. */ + if (kcf_insert_triedlist(&list, pd, KCF_KMFLAG(crq)) != NULL) + goto retry; + } + + if (list != NULL) + kcf_free_triedlist(list); + + KCF_PROV_REFRELE(pd); + return (error); +} + +/* + * crypto_mac_update() + * + * Arguments: + * context: A crypto_context_t initialized by mac_init(). + * data: The message part to be MAC'ed. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs a + * part of a MAC operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_mac_update(crypto_context_t context, crypto_data_t *data, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + kcf_req_params_t params; + int rv; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + rv = KCF_PROV_MAC_UPDATE(pd, ctx, data, NULL); + KCF_PROV_INCRSTATS(pd, rv); + } else { + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_UPDATE, + ctx->cc_session, NULL, NULL, data, NULL, NULL); + rv = kcf_submit_request(pd, ctx, cr, &params, B_FALSE); + } + + return (rv); +} + +/* + * crypto_mac_final() + * + * Arguments: + * context: A crypto_context_t initialized by mac_init(). + * mac: Storage for the message authentication code. + * cr: crypto_call_req_t calling conditions and call back info. + * + * Description: + * Asynchronously submits a request for, or synchronously performs the + * final part of a message authentication operation. + * + * Context: + * Process or interrupt, according to the semantics dictated by the 'cr'. + * + * Returns: + * See comment in the beginning of the file. + */ +int +crypto_mac_final(crypto_context_t context, crypto_data_t *mac, + crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + kcf_req_params_t params; + int rv; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + ASSERT(pd->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + rv = KCF_PROV_MAC_FINAL(pd, ctx, mac, NULL); + KCF_PROV_INCRSTATS(pd, rv); + } else { + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_FINAL, + ctx->cc_session, NULL, NULL, NULL, mac, NULL); + rv = kcf_submit_request(pd, ctx, cr, &params, B_FALSE); + } + + /* Release the hold done in kcf_new_ctx() during init step. */ + KCF_CONTEXT_COND_RELEASE(rv, kcf_ctx); + return (rv); +}
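+ +/* + * Illustrative usage sketch (an editorial addition, not part of the + * original file): a multi-part caller of crypto_mac_init(), + * crypto_mac_update() and crypto_mac_final(), with 'mech', 'key' and the + * data chunks assumed to be set up as in the sketches above; + * crypto_cancel_ctx() is the illumos KCF teardown routine and is assumed + * to be available here: + * + * if ((rv = crypto_mac_init(&mech, &key, NULL, &ctx, NULL)) != + * CRYPTO_SUCCESS) + * return (rv); + * if ((rv = crypto_mac_update(ctx, &part1, NULL)) != CRYPTO_SUCCESS || + * (rv = crypto_mac_update(ctx, &part2, NULL)) != CRYPTO_SUCCESS) { + * crypto_cancel_ctx(ctx); + * return (rv); + * } + * return (crypto_mac_final(ctx, &mac, NULL)); + */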
+ +/* + * See comments for crypto_mac_update() and crypto_mac_final(). + */ +int +crypto_mac_single(crypto_context_t context, crypto_data_t *data, + crypto_data_t *mac, crypto_call_req_t *cr) +{ + crypto_ctx_t *ctx = (crypto_ctx_t *)context; + kcf_context_t *kcf_ctx; + kcf_provider_desc_t *pd; + int error; + kcf_req_params_t params; + + if ((ctx == NULL) || + ((kcf_ctx = (kcf_context_t *)ctx->cc_framework_private) == NULL) || + ((pd = kcf_ctx->kc_prov_desc) == NULL)) { + return (CRYPTO_INVALID_CONTEXT); + } + + /* The fast path for SW providers. */ + if (CHECK_FASTPATH(cr, pd)) { + error = KCF_PROV_MAC(pd, ctx, data, mac, NULL); + KCF_PROV_INCRSTATS(pd, error); + } else { + KCF_WRAP_MAC_OPS_PARAMS(&params, KCF_OP_SINGLE, pd->pd_sid, + NULL, NULL, data, mac, NULL); + error = kcf_submit_request(pd, ctx, cr, &params, B_FALSE); + } + + /* Release the hold done in kcf_new_ctx() during init step. */ + KCF_CONTEXT_COND_RELEASE(error, kcf_ctx); + return (error); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(crypto_mac_prov); +EXPORT_SYMBOL(crypto_mac); +EXPORT_SYMBOL(crypto_mac_verify_prov); +EXPORT_SYMBOL(crypto_mac_verify); +EXPORT_SYMBOL(crypto_mac_init_prov); +EXPORT_SYMBOL(crypto_mac_init); +EXPORT_SYMBOL(crypto_mac_update); +EXPORT_SYMBOL(crypto_mac_final); +EXPORT_SYMBOL(crypto_mac_single); +#endif diff --git a/module/icp/api/kcf_miscapi.c b/module/icp/api/kcf_miscapi.c new file mode 100644 index 000000000000..c0f415b264a7 --- /dev/null +++ b/module/icp/api/kcf_miscapi.c @@ -0,0 +1,127 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include + +/* + * All event subscribers are put on a list. ntfy_list_lock + * protects changes to this list. + * + * The following locking order is maintained in the code - The + * global ntfy_list_lock followed by the individual lock + * in a kcf_ntfy_elem structure (kn_lock). + */ +kmutex_t ntfy_list_lock; +kcondvar_t ntfy_list_cv; /* cv the service thread waits on */ +static kcf_ntfy_elem_t *ntfy_list_head; + +/* + * crypto_mech2id() + * + * Arguments: + * . mechname: A null-terminated string identifying the mechanism name. + * + * Description: + * Walks the mechanism tables, looking for an entry that matches the + * mechname. Once it finds it, it builds the 64-bit mech_type and returns + * it. If there are no hardware or software providers for the mechanism, + * but there is an unloaded software provider, this routine will attempt + * to load it. + * + * Context: + * Process and interruption. + * + * Returns: + * The unique mechanism identified by 'mechname', if found. + * CRYPTO_MECH_INVALID otherwise.
+ */ +crypto_mech_type_t +crypto_mech2id(char *mechname) +{ + return (crypto_mech2id_common(mechname, B_TRUE)); +} + +/* + * We walk the notification list and do the callbacks. + */ +void +kcf_walk_ntfylist(uint32_t event, void *event_arg) +{ + kcf_ntfy_elem_t *nep; + int nelem = 0; + + mutex_enter(&ntfy_list_lock); + + /* + * Count how many clients are on the notification list. We need + * this count to ensure that clients which joined the list after we + * have started this walk, are not wrongly notified. + */ + for (nep = ntfy_list_head; nep != NULL; nep = nep->kn_next) + nelem++; + + for (nep = ntfy_list_head; (nep != NULL && nelem); nep = nep->kn_next) { + nelem--; + + /* + * Check if this client is interested in the + * event. + */ + if (!(nep->kn_event_mask & event)) + continue; + + mutex_enter(&nep->kn_lock); + nep->kn_state = NTFY_RUNNING; + mutex_exit(&nep->kn_lock); + mutex_exit(&ntfy_list_lock); + + /* + * We invoke the callback routine with no locks held. Another + * client could have joined the list meanwhile. This is fine + * as we maintain nelem as stated above. The NULL check in the + * for loop guards against shrinkage. Also, any callers of + * crypto_unnotify_events() at this point cv_wait till kn_state + * changes to NTFY_WAITING. Hence, nep is assured to be valid. + */ + (*nep->kn_func)(event, event_arg); + + mutex_enter(&nep->kn_lock); + nep->kn_state = NTFY_WAITING; + cv_broadcast(&nep->kn_cv); + mutex_exit(&nep->kn_lock); + + mutex_enter(&ntfy_list_lock); + } + + mutex_exit(&ntfy_list_lock); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(crypto_mech2id); +#endif diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman new file mode 100644 index 000000000000..48fea7bb333e --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman @@ -0,0 +1,23 @@ + --------------------------------------------------------------------------- + Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + + LICENSE TERMS + + The free distribution and use of this software is allowed (with or without + changes) provided that: + + 1. source code distributions include the above copyright notice, this + list of conditions and the following disclaimer; + + 2. binary distributions include the above copyright notice, this list + of conditions and the following disclaimer in their documentation; + + 3. the name of the copyright holder is not used to endorse products + built using this software without specific written permission. + + DISCLAIMER + + This software is provided 'as is' with no explicit or implied warranties + in respect of its properties, including, but not limited to, correctness + and/or fitness for purpose. + --------------------------------------------------------------------------- diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip new file mode 100644 index 000000000000..5f822cf27586 --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.gladman.descrip @@ -0,0 +1 @@ +PORTIONS OF AES FUNCTIONALITY diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl new file mode 100644 index 000000000000..92c9e196a318 --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl @@ -0,0 +1,127 @@ + + LICENSE ISSUES + ============== + + The OpenSSL toolkit stays under a dual license, i.e. 
both the conditions of + the OpenSSL License and the original SSLeay license apply to the toolkit. + See below for the actual license texts. Actually both licenses are BSD-style + Open Source licenses. In case of any license issues related to OpenSSL + please contact openssl-core@openssl.org. + + OpenSSL License + --------------- + +/* ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + * + * This product includes cryptographic software written by Eric Young + * (eay@cryptsoft.com). This product includes software written by Tim + * Hudson (tjh@cryptsoft.com). + * + */ + + Original SSLeay License + ----------------------- + +/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) + * All rights reserved. + * + * This package is an SSL implementation written + * by Eric Young (eay@cryptsoft.com). + * The implementation was written so as to conform with Netscapes SSL. + * + * This library is free for commercial and non-commercial use as long as + * the following conditions are aheared to. The following conditions + * apply to all code found in this distribution, be it the RC4, RSA, + * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation + * included with this distribution is covered by the same copyright terms + * except that the holder is Tim Hudson (tjh@cryptsoft.com). + * + * Copyright remains Eric Young's, and as such any Copyright notices in + * the code are not to be removed. + * If this package is used in a product, Eric Young should be given attribution + * as the author of the parts of the library used. + * This can be in the form of a textual message at program startup or + * in documentation (online or textual) provided with the package. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * "This product includes cryptographic software written by + * Eric Young (eay@cryptsoft.com)" + * The word 'cryptographic' can be left out if the routines from the library + * being used are not cryptographic related :-). + * 4. If you include any Windows specific code (or a derivative thereof) from + * the apps directory (application code) you must include an acknowledgement: + * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" + * + * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * The licence and distribution terms for any publically available version or + * derivative of this code cannot be changed. i.e. this code cannot simply be + * copied and put under another distribution licence + * [including the GNU Public Licence.] + */ + diff --git a/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip new file mode 100644 index 000000000000..5f822cf27586 --- /dev/null +++ b/module/icp/asm-x86_64/aes/THIRDPARTYLICENSE.openssl.descrip @@ -0,0 +1 @@ +PORTIONS OF AES FUNCTIONALITY diff --git a/module/icp/asm-x86_64/aes/aes_aesni.S b/module/icp/asm-x86_64/aes/aes_aesni.S new file mode 100644 index 000000000000..4a80c62097ae --- /dev/null +++ b/module/icp/asm-x86_64/aes/aes_aesni.S @@ -0,0 +1,748 @@ +/* + * ==================================================================== + * Written by Intel Corporation for the OpenSSL project to add support + * for Intel AES-NI instructions. Rights for redistribution and usage + * in source and binary forms are granted according to the OpenSSL + * license. 
+ * + * Author: Huang Ying + * Vinodh Gopal + * Kahraman Akdemir + * + * Intel AES-NI is a new set of Single Instruction Multiple Data (SIMD) + * instructions that are going to be introduced in the next generation + * of Intel processor, as of 2009. These instructions enable fast and + * secure data encryption and decryption, using the Advanced Encryption + * Standard (AES), defined by FIPS Publication number 197. The + * architecture introduces six instructions that offer full hardware + * support for AES. Four of them support high performance data + * encryption and decryption, and the other two instructions support + * the AES key expansion procedure. + * ==================================================================== + */ + +/* + * ==================================================================== + * Copyright (c) 1998-2008 The OpenSSL Project. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * + * 3. All advertising materials mentioning features or use of this + * software must display the following acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" + * + * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to + * endorse or promote products derived from this software without + * prior written permission. For written permission, please contact + * openssl-core@openssl.org. + * + * 5. Products derived from this software may not be called "OpenSSL" + * nor may "OpenSSL" appear in their names without prior written + * permission of the OpenSSL Project. + * + * 6. Redistributions of any form whatsoever must retain the following + * acknowledgment: + * "This product includes software developed by the OpenSSL Project + * for use in the OpenSSL Toolkit (http://www.openssl.org/)" + * + * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY + * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR + * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR + * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; + * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + * ==================================================================== + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as files aes-intel.S and eng_aesni_asm.pl, in + * patches sent Dec.
24, 2008, respectively, by + * Huang Ying of Intel to the openssl-dev mailing list under the subject + * of "Add support to Intel AES-NI instruction set for x86_64 platform". + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). + * If the TS bit is not set, Save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored by + * during kernel thread preemption). + * + * 4. Renamed functions, reordered parameters, and changed return value + * to match OpenSolaris: + * + * OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return values for above are non-zero on error, 0 on success. + * + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + * typedef struct aes_key_st { + * unsigned int rd_key[4 *(AES_MAXNR + 1)]; + * int rounds; + * unsigned int pad[3]; + * } AES_KEY; + * Note: AES_LONG is undefined (that is, Intel uses 32-bit key schedules + * (ks32) instead of 64-bit (ks64). + * Number of rounds (aka round count) is at offset 240 of AES_KEY. + * + * OpenSolaris OS interface (#ifdefs removed for readability): + * int rijndael_key_setup_dec_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * int rijndael_key_setup_enc_intel(uint32_t rk[], + * const uint32_t cipherKey[], uint64_t keyBits); + * Return values for above are 0 on error, number of rounds on success. + * + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]); + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]; + * uint32_t ks32[(MAX_AES_NR + 1) * 4]; } aes_ks_t; + * + * typedef union { + * uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; + * } aes_ks_t; + * typedef struct aes_key { + * aes_ks_t encr_ks, decr_ks; + * long double align128; + * int flags, nr, type; + * } aes_key_t; + * + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. + * + * Note2: aes_ks_t must be aligned on a 0 mod 128 byte boundary. 
+ * + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) + +#include + +/* ARGSUSED */ +void +aes_encrypt_intel(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_intel(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} +/* ARGSUSED */ +int +rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} +/* ARGSUSED */ +int +rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + uint64_t keyBits) { + return (0); +} + + +#elif defined(HAVE_AES) /* guard by instruction set */ + +#define _ASM +#include + +/* + * _key_expansion_128(), * _key_expansion_192a(), _key_expansion_192b(), + * _key_expansion_256a(), _key_expansion_256b() + * + * Helper functions called by rijndael_key_setup_inc_intel(). + * Also used indirectly by rijndael_key_setup_dec_intel(). + * + * Input: + * %xmm0 User-provided cipher key + * %xmm1 Round constant + * Output: + * (%rcx) AES key + */ + +ENTRY_NP2(_key_expansion_128, _key_expansion_256a) +_key_expansion_128_local: +_key_expansion_256a_local: + pshufd $0b11111111, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + movups %xmm0, (%rcx) + add $0x10, %rcx + ret + nop +SET_SIZE(_key_expansion_128) +SET_SIZE(_key_expansion_256a) + + +ENTRY_NP(_key_expansion_192a) +_key_expansion_192a_local: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movups %xmm2, %xmm5 + movups %xmm2, %xmm6 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movups %xmm0, %xmm1 + shufps $0b01000100, %xmm0, %xmm6 + movups %xmm6, (%rcx) + shufps $0b01001110, %xmm2, %xmm1 + movups %xmm1, 0x10(%rcx) + add $0x20, %rcx + ret +SET_SIZE(_key_expansion_192a) + + +ENTRY_NP(_key_expansion_192b) +_key_expansion_192b_local: + pshufd $0b01010101, %xmm1, %xmm1 + shufps $0b00010000, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + shufps $0b10001100, %xmm0, %xmm4 + pxor %xmm4, %xmm0 + pxor %xmm1, %xmm0 + + movups %xmm2, %xmm5 + pslldq $4, %xmm5 + pshufd $0b11111111, %xmm0, %xmm3 + pxor %xmm3, %xmm2 + pxor %xmm5, %xmm2 + + movups %xmm0, (%rcx) + add $0x10, %rcx + ret +SET_SIZE(_key_expansion_192b) + + +ENTRY_NP(_key_expansion_256b) +_key_expansion_256b_local: + pshufd $0b10101010, %xmm1, %xmm1 + shufps $0b00010000, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + shufps $0b10001100, %xmm2, %xmm4 + pxor %xmm4, %xmm2 + pxor %xmm1, %xmm2 + movups %xmm2, (%rcx) + add $0x10, %rcx + ret +SET_SIZE(_key_expansion_256b) + + +/* + * rijndael_key_setup_enc_intel() + * Expand the cipher key into the encryption key schedule. + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_enc_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. 
+ * + * Original Intel OpenSSL interface: + * int intel_AES_set_encrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. + */ + +#ifdef OPENSSL_INTERFACE +#define rijndael_key_setup_enc_intel intel_AES_set_encrypt_key +#define rijndael_key_setup_dec_intel intel_AES_set_decrypt_key + +#define USERCIPHERKEY rdi /* P1, 64 bits */ +#define KEYSIZE32 esi /* P2, 32 bits */ +#define KEYSIZE64 rsi /* P2, 64 bits */ +#define AESKEY rdx /* P3, 64 bits */ + +#else /* OpenSolaris Interface */ +#define AESKEY rdi /* P1, 64 bits */ +#define USERCIPHERKEY rsi /* P2, 64 bits */ +#define KEYSIZE32 edx /* P3, 32 bits */ +#define KEYSIZE64 rdx /* P3, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define ROUNDS32 KEYSIZE32 /* temp */ +#define ROUNDS64 KEYSIZE64 /* temp */ +#define ENDAESKEY USERCIPHERKEY /* temp */ + +ENTRY_NP(rijndael_key_setup_enc_intel) +rijndael_key_setup_enc_intel_local: + FRAME_BEGIN + // NULL pointer sanity check + test %USERCIPHERKEY, %USERCIPHERKEY + jz .Lenc_key_invalid_param + test %AESKEY, %AESKEY + jz .Lenc_key_invalid_param + + movups (%USERCIPHERKEY), %xmm0 // user key (first 16 bytes) + movups %xmm0, (%AESKEY) + lea 0x10(%AESKEY), %rcx // key addr + pxor %xmm4, %xmm4 // xmm4 is assumed 0 in _key_expansion_x + + cmp $256, %KEYSIZE32 + jnz .Lenc_key192 + + // AES 256: 14 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $14, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 14 +#endif /* OPENSSL_INTERFACE */ + + movups 0x10(%USERCIPHERKEY), %xmm2 // other user key (2nd 16 bytes) + movups %xmm2, (%rcx) + add $0x10, %rcx + + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + aeskeygenassist $0x1, %xmm0, %xmm1 + call _key_expansion_256b_local + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + aeskeygenassist $0x2, %xmm0, %xmm1 + call _key_expansion_256b_local + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + aeskeygenassist $0x4, %xmm0, %xmm1 + call _key_expansion_256b_local + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + aeskeygenassist $0x8, %xmm0, %xmm1 + call _key_expansion_256b_local + aeskeygenassist $0x10, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + aeskeygenassist $0x10, %xmm0, %xmm1 + call _key_expansion_256b_local + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + aeskeygenassist $0x20, %xmm0, %xmm1 + call _key_expansion_256b_local + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_256a_local + +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* Open Solaris Interface */ + mov $14, %rax // return # rounds = 14 +#endif + FRAME_END + ret + +.align 4 +.Lenc_key192: + cmp $192, %KEYSIZE32 + jnz .Lenc_key128 + + // AES 192: 12 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $12, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 12 +#endif /* OPENSSL_INTERFACE */ + + movq 0x10(%USERCIPHERKEY), %xmm2 // other user key + aeskeygenassist $0x1, %xmm2, %xmm1 // expand the key + call _key_expansion_192a_local + aeskeygenassist $0x2, %xmm2, %xmm1 // expand the key + call _key_expansion_192b_local + aeskeygenassist $0x4, %xmm2, %xmm1 // expand the key + call _key_expansion_192a_local + aeskeygenassist $0x8, %xmm2, %xmm1 // expand the key + call _key_expansion_192b_local + aeskeygenassist $0x10, %xmm2, %xmm1 
// expand the key + call _key_expansion_192a_local + aeskeygenassist $0x20, %xmm2, %xmm1 // expand the key + call _key_expansion_192b_local + aeskeygenassist $0x40, %xmm2, %xmm1 // expand the key + call _key_expansion_192a_local + aeskeygenassist $0x80, %xmm2, %xmm1 // expand the key + call _key_expansion_192b_local + +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $12, %rax // return # rounds = 12 +#endif + FRAME_END + ret + +.align 4 +.Lenc_key128: + cmp $128, %KEYSIZE32 + jnz .Lenc_key_invalid_key_bits + + // AES 128: 10 rounds in encryption key schedule +#ifdef OPENSSL_INTERFACE + mov $10, %ROUNDS32 + movl %ROUNDS32, 240(%AESKEY) // key.rounds = 10 +#endif /* OPENSSL_INTERFACE */ + + aeskeygenassist $0x1, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x2, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x4, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x8, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x10, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x20, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x40, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x80, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x1b, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + aeskeygenassist $0x36, %xmm0, %xmm1 // expand the key + call _key_expansion_128_local + +#ifdef OPENSSL_INTERFACE + xor %rax, %rax // return 0 (OK) +#else /* OpenSolaris Interface */ + mov $10, %rax // return # rounds = 10 +#endif + FRAME_END + ret + +.Lenc_key_invalid_param: +#ifdef OPENSSL_INTERFACE + mov $-1, %rax // user key or AES key pointer is NULL + FRAME_END + ret +#else + /* FALLTHROUGH */ +#endif /* OPENSSL_INTERFACE */ + +.Lenc_key_invalid_key_bits: +#ifdef OPENSSL_INTERFACE + mov $-2, %rax // keysize is invalid +#else /* Open Solaris Interface */ + xor %rax, %rax // a key pointer is NULL or invalid keysize +#endif /* OPENSSL_INTERFACE */ + FRAME_END + ret + SET_SIZE(rijndael_key_setup_enc_intel) + + +/* + * rijndael_key_setup_dec_intel() + * Expand the cipher key into the decryption key schedule. + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. + * + * OpenSolaris interface: + * int rijndael_key_setup_dec_intel(uint32_t rk[], const uint32_t cipherKey[], + * uint64_t keyBits); + * Return value is 0 on error, number of rounds on success. + * P1->P2, P2->P3, P3->P1 + * + * Original Intel OpenSSL interface: + * int intel_AES_set_decrypt_key(const unsigned char *userKey, + * const int bits, AES_KEY *key); + * Return value is non-zero on error, 0 on success. 
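+ * + * Illustrative sketch (an editorial addition, not from the original + * source): with the AES-NI intrinsics from <wmmintrin.h>, the conversion + * performed below amounts to reversing the nr+1 round keys and applying + * AESIMC (inverse MixColumns) to every key except the two outermost: + * + * for (lo = 0, hi = nr; lo < hi; lo++, hi--) { + * __m128i t = rk[lo]; rk[lo] = rk[hi]; rk[hi] = t; + * } + * for (i = 1; i < nr; i++) + * rk[i] = _mm_aesimc_si128(rk[i]); + *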
+ */ + +ENTRY_NP(rijndael_key_setup_dec_intel) +FRAME_BEGIN + // Generate round keys used for encryption + call rijndael_key_setup_enc_intel_local + test %rax, %rax +#ifdef OPENSSL_INTERFACE + jnz .Ldec_key_exit // Failed if returned non-0 +#else /* OpenSolaris Interface */ + jz .Ldec_key_exit // Failed if returned 0 +#endif /* OPENSSL_INTERFACE */ + + /* + * Convert round keys used for encryption + * to a form usable for decryption + */ +#ifndef OPENSSL_INTERFACE /* OpenSolaris Interface */ + mov %rax, %ROUNDS64 // set # rounds (10, 12, or 14) + // (already set for OpenSSL) +#endif + + lea 0x10(%AESKEY), %rcx // key addr + shl $4, %ROUNDS32 + add %AESKEY, %ROUNDS64 + mov %ROUNDS64, %ENDAESKEY + +.align 4 +.Ldec_key_reorder_loop: + movups (%AESKEY), %xmm0 + movups (%ROUNDS64), %xmm1 + movups %xmm0, (%ROUNDS64) + movups %xmm1, (%AESKEY) + lea 0x10(%AESKEY), %AESKEY + lea -0x10(%ROUNDS64), %ROUNDS64 + cmp %AESKEY, %ROUNDS64 + ja .Ldec_key_reorder_loop + +.align 4 +.Ldec_key_inv_loop: + movups (%rcx), %xmm0 + // Convert an encryption round key to a form usable for decryption + // with the "AES Inverse Mix Columns" instruction + aesimc %xmm0, %xmm1 + movups %xmm1, (%rcx) + lea 0x10(%rcx), %rcx + cmp %ENDAESKEY, %rcx + jnz .Ldec_key_inv_loop + +.Ldec_key_exit: + // OpenSolaris: rax = # rounds (10, 12, or 14) or 0 for error + // OpenSSL: rax = 0 for OK, or non-zero for error + FRAME_END + ret + SET_SIZE(rijndael_key_setup_dec_intel) + + +/* + * aes_encrypt_intel() + * Encrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_encrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4]) + * + * Original Intel OpenSSL Interface: + * void intel_AES_encrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key) + */ + +#ifdef OPENSSL_INTERFACE +#define aes_encrypt_intel intel_AES_encrypt +#define aes_decrypt_intel intel_AES_decrypt + +#define INP rdi /* P1, 64 bits */ +#define OUTP rsi /* P2, 64 bits */ +#define KEYP rdx /* P3, 64 bits */ + +/* No NROUNDS parameter--offset 240 from KEYP saved in %ecx: */ +#define NROUNDS32 ecx /* temporary, 32 bits */ +#define NROUNDS cl /* temporary, 8 bits */ + +#else /* OpenSolaris Interface */ +#define KEYP rdi /* P1, 64 bits */ +#define NROUNDS esi /* P2, 32 bits */ +#define INP rdx /* P3, 64 bits */ +#define OUTP rcx /* P4, 64 bits */ +#endif /* OPENSSL_INTERFACE */ + +#define STATE xmm0 /* temporary, 128 bits */ +#define KEY xmm1 /* temporary, 128 bits */ + + +ENTRY_NP(aes_encrypt_intel) + + movups (%INP), %STATE // input + movups (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Lenc128 + lea 0x20(%KEYP), %KEYP + je .Lenc192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movups -0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x50(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc192: + // AES 192 and 256 + movups -0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x30(%KEYP), %KEY + aesenc %KEY, %STATE + +.align 4 +.Lenc128: + // AES 128, 192, and 256 + movups -0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movups -0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movups (%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x10(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x20(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x30(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x40(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x50(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x60(%KEYP), %KEY + aesenc %KEY, %STATE + movups 0x70(%KEYP), %KEY + aesenclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + ret + SET_SIZE(aes_encrypt_intel) + + +/* + * aes_decrypt_intel() + * Decrypt a single block (in and out can overlap). + * + * For kernel code, caller is responsible for ensuring kpreempt_disable() + * has been called. This is because %xmm registers are not saved/restored. + * Clear and set the CR0.TS bit on entry and exit, respectively, if TS is set + * on entry. Otherwise, if TS is not set, save and restore %xmm registers + * on the stack. 
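+ * + * Illustrative sketch (an editorial addition, not from the original + * source): with the AES-NI intrinsics from <wmmintrin.h>, and rk[] + * holding the nr+1 decryption round keys in the order stored by + * rijndael_key_setup_dec_intel(), the round structure below is: + * + * __m128i s = _mm_xor_si128(ct, rk[0]); + * for (i = 1; i < nr; i++) + * s = _mm_aesdec_si128(s, rk[i]); + * pt = _mm_aesdeclast_si128(s, rk[nr]);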
+ * + * Temporary register usage: + * %xmm0 State + * %xmm1 Key + * + * Original OpenSolaris Interface: + * void aes_decrypt_intel(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original Intel OpenSSL Interface: + * void intel_AES_decrypt(const unsigned char *in, unsigned char *out, + * const AES_KEY *key); + */ +ENTRY_NP(aes_decrypt_intel) + + movups (%INP), %STATE // input + movups (%KEYP), %KEY // key +#ifdef OPENSSL_INTERFACE + mov 240(%KEYP), %NROUNDS32 // round count +#else /* OpenSolaris Interface */ + /* Round count is already present as P2 in %rsi/%esi */ +#endif /* OPENSSL_INTERFACE */ + + pxor %KEY, %STATE // round 0 + lea 0x30(%KEYP), %KEYP + cmp $12, %NROUNDS + jb .Ldec128 + lea 0x20(%KEYP), %KEYP + je .Ldec192 + + // AES 256 + lea 0x20(%KEYP), %KEYP + movups -0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x50(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec192: + // AES 192 and 256 + movups -0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x30(%KEYP), %KEY + aesdec %KEY, %STATE + +.align 4 +.Ldec128: + // AES 128, 192, and 256 + movups -0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movups -0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movups (%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x10(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x20(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x30(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x40(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x50(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x60(%KEYP), %KEY + aesdec %KEY, %STATE + movups 0x70(%KEYP), %KEY + aesdeclast %KEY, %STATE // last round + movups %STATE, (%OUTP) // output + + ret + SET_SIZE(aes_decrypt_intel) + +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/aes/aes_amd64.S b/module/icp/asm-x86_64/aes/aes_amd64.S new file mode 100644 index 000000000000..9db3a3179230 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aes_amd64.S @@ -0,0 +1,906 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue 20/12/2007 + * + * I am grateful to Dag Arne Osvik for many discussions of the techniques that + * can be used to optimise AES assembler code on AMD64/EM64T architectures. + * Some of the techniques used in this implementation are the result of + * suggestions made by him for which I am most grateful. + * + * An AES implementation for AMD64 processors using the YASM assembler. This + * implementation provides only encryption, decryption and hence requires key + * scheduling support in C. 
It uses 8k bytes of tables but its encryption and
+ * decryption performance is very close to that obtained using large tables.
+ * It can use either MS Windows or Gnu/Linux/OpenSolaris OS calling conventions,
+ * which are as follows:
+ * ms windows gnu/linux/opensolaris os
+ *
+ * in_blk rcx rdi
+ * out_blk rdx rsi
+ * context (cx) r8 rdx
+ *
+ * preserved rsi - + rbx, rbp, rsp, r12, r13, r14 & r15
+ * registers rdi - on both
+ *
+ * destroyed - rsi + rax, rcx, rdx, r8, r9, r10 & r11
+ * registers - rdi on both
+ *
+ * The convention used here is that for gnu/linux/opensolaris os.
+ *
+ * This code provides the standard AES block size (128 bits, 16 bytes) and the
+ * three standard AES key sizes (128, 192 and 256 bits). It has the same call
+ * interface as my C implementation. It uses the Microsoft C AMD64 calling
+ * conventions in which the three parameters are placed in rcx, rdx and r8
+ * respectively. The rbx, rsi, rdi, rbp and r12..r15 registers are preserved.
+ *
+ * OpenSolaris Note:
+ * Modified to use GNU/Linux/Solaris calling conventions.
+ * That is parameters are placed in rdi, rsi, rdx, and rcx, respectively.
+ *
+ * AES_RETURN aes_encrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_encrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt(const unsigned char in_blk[],
+ * unsigned char out_blk[], const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_encrypt_key<NNN>(const unsigned char key[],
+ * const aes_encrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt_key<NNN>(const unsigned char key[],
+ * const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_encrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * AES_RETURN aes_decrypt_key(const unsigned char key[],
+ * unsigned int len, const aes_decrypt_ctx cx[1])/
+ *
+ * where <NNN> is 128, 192 or 256. In the last two calls the length can be in
+ * either bits or bytes.
+ *
+ * Comment in/out the following lines to obtain the desired subroutines. These
+ * selections MUST match those in the C header file aesopt.h
+ */
+#define AES_REV_DKS /* define if key decryption schedule is reversed */
+
+#define LAST_ROUND_TABLES /* define for the faster version using extra tables */
+
+/*
+ * The encryption key schedule has the following in memory layout where N is the
+ * number of rounds (10, 12 or 14):
+ *
+ * lo: | input key (round 0) | / each round is four 32-bit words
+ * | encryption round 1 |
+ * | encryption round 2 |
+ * ....
+ * | encryption round N-1 |
+ * hi: | encryption round N |
+ *
+ * The decryption key schedule is normally set up so that it has the same
+ * layout as above by actually reversing the order of the encryption key
+ * schedule in memory (this happens when AES_REV_DKS is set):
+ *
+ * lo: | decryption round 0 | = | encryption round N |
+ * | decryption round 1 | = INV_MIX_COL[ | encryption round N-1 | ]
+ * | decryption round 2 | = INV_MIX_COL[ | encryption round N-2 | ]
+ * .... ....
+ * | decryption round N-1 | = INV_MIX_COL[ | encryption round 1 | ] + * hi: | decryption round N | = | input key (round 0) | + * + * with rounds except the first and last modified using inv_mix_column() + * But if AES_REV_DKS is NOT set the order of keys is left as it is for + * encryption so that it has to be accessed in reverse when used for + * decryption (although the inverse mix column modifications are done) + * + * lo: | decryption round 0 | = | input key (round 0) | + * | decryption round 1 | = INV_MIX_COL[ | encryption round 1 | ] + * | decryption round 2 | = INV_MIX_COL[ | encryption round 2 | ] + * .... .... + * | decryption round N-1 | = INV_MIX_COL[ | encryption round N-1 | ] + * hi: | decryption round N | = | encryption round N | + * + * This layout is faster when the assembler key scheduling provided here + * is used. + * + * End of user defines + */ + +/* + * --------------------------------------------------------------------------- + * OpenSolaris OS modifications + * + * This source originates from Brian Gladman file aes_amd64.asm + * in http://fp.gladman.plus.com/AES/aes-src-04-03-08.zip + * with these changes: + * + * 1. Removed MS Windows-specific code within DLL_EXPORT, _SEH_, and + * !__GNUC__ ifdefs. Also removed ENCRYPTION, DECRYPTION, + * AES_128, AES_192, AES_256, AES_VAR ifdefs. + * + * 2. Translate yasm/nasm %define and .macro definitions to cpp(1) #define + * + * 3. Translate yasm/nasm %ifdef/%ifndef to cpp(1) #ifdef + * + * 4. Translate Intel/yasm/nasm syntax to ATT/OpenSolaris as(1) syntax + * (operands reversed, literals prefixed with "$", registers prefixed with "%", + * and "[register+offset]", addressing changed to "offset(register)", + * parenthesis in constant expressions "()" changed to square brackets "[]", + * "." removed from local (numeric) labels, and other changes. + * Examples: + * Intel/yasm/nasm Syntax ATT/OpenSolaris Syntax + * mov rax,(4*20h) mov $[4*0x20],%rax + * mov rax,[ebx+20h] mov 0x20(%ebx),%rax + * lea rax,[ebx+ecx] lea (%ebx,%ecx),%rax + * sub rax,[ebx+ecx*4-20h] sub -0x20(%ebx,%ecx,4),%rax + * + * 5. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and dummy C function + * definitions for lint. + * + * 6. Renamed functions and reordered parameters to match OpenSolaris: + * Original Gladman interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + * Note: aes_encrypt_ctx contains ks, a 60 element array of uint32_t, + * and a union type, inf., containing inf.l, a uint32_t and + * inf.b, a 4-element array of uint32_t. Only b[0] in the array (aka "l") is + * used and contains the key schedule length * 16 where key schedule length is + * 10, 12, or 14 bytes. + * + * OpenSolaris OS interface: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * typedef union {uint64_t ks64[(MAX_AES_NR + 1) * 4]/ + * uint32_t ks32[(MAX_AES_NR + 1) * 4]/ } aes_ks_t/ + * Note: ks is the AES key schedule, Nr is number of rounds, pt is plain text, + * ct is crypto text, and MAX_AES_NR is 14. + * For the x86 64-bit architecture, OpenSolaris OS uses ks32 instead of ks64. 
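+ *
+ * Hedged usage sketch (caller side, illustration only; "keybuf" is a
+ * hypothetical 128-bit user key held as uint32_t[4], while the routines
+ * are those named above, with the key setup defined in aeskey.c):
+ *
+ *	aes_ks_t ks;
+ *	uint32_t pt[4], ct[4];
+ *	int nr = rijndael_key_setup_enc_amd64(ks.ks32, keybuf, 128);
+ *	aes_encrypt_amd64(&ks, nr, pt, ct);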
+ */ + +#if defined(lint) || defined(__lint) + +#include +/* ARGSUSED */ +void +aes_encrypt_amd64(const uint32_t rk[], int Nr, const uint32_t pt[4], + uint32_t ct[4]) { +} +/* ARGSUSED */ +void +aes_decrypt_amd64(const uint32_t rk[], int Nr, const uint32_t ct[4], + uint32_t pt[4]) { +} + + +#else + +#define _ASM +#include + +#define KS_LENGTH 60 + +#define raxd eax +#define rdxd edx +#define rcxd ecx +#define rbxd ebx +#define rsid esi +#define rdid edi + +#define raxb al +#define rdxb dl +#define rcxb cl +#define rbxb bl +#define rsib sil +#define rdib dil + +// finite field multiplies by {02}, {04} and {08} + +#define f2(x) [[x<<1]^[[[x>>7]&1]*0x11b]] +#define f4(x) [[x<<2]^[[[x>>6]&1]*0x11b]^[[[x>>6]&2]*0x11b]] +#define f8(x) [[x<<3]^[[[x>>5]&1]*0x11b]^[[[x>>5]&2]*0x11b]^[[[x>>5]&4]*0x11b]] + +// finite field multiplies required in table generation + +#define f3(x) [[f2(x)] ^ [x]] +#define f9(x) [[f8(x)] ^ [x]] +#define fb(x) [[f8(x)] ^ [f2(x)] ^ [x]] +#define fd(x) [[f8(x)] ^ [f4(x)] ^ [x]] +#define fe(x) [[f8(x)] ^ [f4(x)] ^ [f2(x)]] + +// macros for expanding S-box data + +#define u8(x) [f2(x)], [x], [x], [f3(x)], [f2(x)], [x], [x], [f3(x)] +#define v8(x) [fe(x)], [f9(x)], [fd(x)], [fb(x)], [fe(x)], [f9(x)], [fd(x)], [x] +#define w8(x) [x], 0, 0, 0, [x], 0, 0, 0 + +#define enc_vals(x) \ + .byte x(0x63),x(0x7c),x(0x77),x(0x7b),x(0xf2),x(0x6b),x(0x6f),x(0xc5); \ + .byte x(0x30),x(0x01),x(0x67),x(0x2b),x(0xfe),x(0xd7),x(0xab),x(0x76); \ + .byte x(0xca),x(0x82),x(0xc9),x(0x7d),x(0xfa),x(0x59),x(0x47),x(0xf0); \ + .byte x(0xad),x(0xd4),x(0xa2),x(0xaf),x(0x9c),x(0xa4),x(0x72),x(0xc0); \ + .byte x(0xb7),x(0xfd),x(0x93),x(0x26),x(0x36),x(0x3f),x(0xf7),x(0xcc); \ + .byte x(0x34),x(0xa5),x(0xe5),x(0xf1),x(0x71),x(0xd8),x(0x31),x(0x15); \ + .byte x(0x04),x(0xc7),x(0x23),x(0xc3),x(0x18),x(0x96),x(0x05),x(0x9a); \ + .byte x(0x07),x(0x12),x(0x80),x(0xe2),x(0xeb),x(0x27),x(0xb2),x(0x75); \ + .byte x(0x09),x(0x83),x(0x2c),x(0x1a),x(0x1b),x(0x6e),x(0x5a),x(0xa0); \ + .byte x(0x52),x(0x3b),x(0xd6),x(0xb3),x(0x29),x(0xe3),x(0x2f),x(0x84); \ + .byte x(0x53),x(0xd1),x(0x00),x(0xed),x(0x20),x(0xfc),x(0xb1),x(0x5b); \ + .byte x(0x6a),x(0xcb),x(0xbe),x(0x39),x(0x4a),x(0x4c),x(0x58),x(0xcf); \ + .byte x(0xd0),x(0xef),x(0xaa),x(0xfb),x(0x43),x(0x4d),x(0x33),x(0x85); \ + .byte x(0x45),x(0xf9),x(0x02),x(0x7f),x(0x50),x(0x3c),x(0x9f),x(0xa8); \ + .byte x(0x51),x(0xa3),x(0x40),x(0x8f),x(0x92),x(0x9d),x(0x38),x(0xf5); \ + .byte x(0xbc),x(0xb6),x(0xda),x(0x21),x(0x10),x(0xff),x(0xf3),x(0xd2); \ + .byte x(0xcd),x(0x0c),x(0x13),x(0xec),x(0x5f),x(0x97),x(0x44),x(0x17); \ + .byte x(0xc4),x(0xa7),x(0x7e),x(0x3d),x(0x64),x(0x5d),x(0x19),x(0x73); \ + .byte x(0x60),x(0x81),x(0x4f),x(0xdc),x(0x22),x(0x2a),x(0x90),x(0x88); \ + .byte x(0x46),x(0xee),x(0xb8),x(0x14),x(0xde),x(0x5e),x(0x0b),x(0xdb); \ + .byte x(0xe0),x(0x32),x(0x3a),x(0x0a),x(0x49),x(0x06),x(0x24),x(0x5c); \ + .byte x(0xc2),x(0xd3),x(0xac),x(0x62),x(0x91),x(0x95),x(0xe4),x(0x79); \ + .byte x(0xe7),x(0xc8),x(0x37),x(0x6d),x(0x8d),x(0xd5),x(0x4e),x(0xa9); \ + .byte x(0x6c),x(0x56),x(0xf4),x(0xea),x(0x65),x(0x7a),x(0xae),x(0x08); \ + .byte x(0xba),x(0x78),x(0x25),x(0x2e),x(0x1c),x(0xa6),x(0xb4),x(0xc6); \ + .byte x(0xe8),x(0xdd),x(0x74),x(0x1f),x(0x4b),x(0xbd),x(0x8b),x(0x8a); \ + .byte x(0x70),x(0x3e),x(0xb5),x(0x66),x(0x48),x(0x03),x(0xf6),x(0x0e); \ + .byte x(0x61),x(0x35),x(0x57),x(0xb9),x(0x86),x(0xc1),x(0x1d),x(0x9e); \ + .byte x(0xe1),x(0xf8),x(0x98),x(0x11),x(0x69),x(0xd9),x(0x8e),x(0x94); \ + .byte 
x(0x9b),x(0x1e),x(0x87),x(0xe9),x(0xce),x(0x55),x(0x28),x(0xdf); \ + .byte x(0x8c),x(0xa1),x(0x89),x(0x0d),x(0xbf),x(0xe6),x(0x42),x(0x68); \ + .byte x(0x41),x(0x99),x(0x2d),x(0x0f),x(0xb0),x(0x54),x(0xbb),x(0x16) + +#define dec_vals(x) \ + .byte x(0x52),x(0x09),x(0x6a),x(0xd5),x(0x30),x(0x36),x(0xa5),x(0x38); \ + .byte x(0xbf),x(0x40),x(0xa3),x(0x9e),x(0x81),x(0xf3),x(0xd7),x(0xfb); \ + .byte x(0x7c),x(0xe3),x(0x39),x(0x82),x(0x9b),x(0x2f),x(0xff),x(0x87); \ + .byte x(0x34),x(0x8e),x(0x43),x(0x44),x(0xc4),x(0xde),x(0xe9),x(0xcb); \ + .byte x(0x54),x(0x7b),x(0x94),x(0x32),x(0xa6),x(0xc2),x(0x23),x(0x3d); \ + .byte x(0xee),x(0x4c),x(0x95),x(0x0b),x(0x42),x(0xfa),x(0xc3),x(0x4e); \ + .byte x(0x08),x(0x2e),x(0xa1),x(0x66),x(0x28),x(0xd9),x(0x24),x(0xb2); \ + .byte x(0x76),x(0x5b),x(0xa2),x(0x49),x(0x6d),x(0x8b),x(0xd1),x(0x25); \ + .byte x(0x72),x(0xf8),x(0xf6),x(0x64),x(0x86),x(0x68),x(0x98),x(0x16); \ + .byte x(0xd4),x(0xa4),x(0x5c),x(0xcc),x(0x5d),x(0x65),x(0xb6),x(0x92); \ + .byte x(0x6c),x(0x70),x(0x48),x(0x50),x(0xfd),x(0xed),x(0xb9),x(0xda); \ + .byte x(0x5e),x(0x15),x(0x46),x(0x57),x(0xa7),x(0x8d),x(0x9d),x(0x84); \ + .byte x(0x90),x(0xd8),x(0xab),x(0x00),x(0x8c),x(0xbc),x(0xd3),x(0x0a); \ + .byte x(0xf7),x(0xe4),x(0x58),x(0x05),x(0xb8),x(0xb3),x(0x45),x(0x06); \ + .byte x(0xd0),x(0x2c),x(0x1e),x(0x8f),x(0xca),x(0x3f),x(0x0f),x(0x02); \ + .byte x(0xc1),x(0xaf),x(0xbd),x(0x03),x(0x01),x(0x13),x(0x8a),x(0x6b); \ + .byte x(0x3a),x(0x91),x(0x11),x(0x41),x(0x4f),x(0x67),x(0xdc),x(0xea); \ + .byte x(0x97),x(0xf2),x(0xcf),x(0xce),x(0xf0),x(0xb4),x(0xe6),x(0x73); \ + .byte x(0x96),x(0xac),x(0x74),x(0x22),x(0xe7),x(0xad),x(0x35),x(0x85); \ + .byte x(0xe2),x(0xf9),x(0x37),x(0xe8),x(0x1c),x(0x75),x(0xdf),x(0x6e); \ + .byte x(0x47),x(0xf1),x(0x1a),x(0x71),x(0x1d),x(0x29),x(0xc5),x(0x89); \ + .byte x(0x6f),x(0xb7),x(0x62),x(0x0e),x(0xaa),x(0x18),x(0xbe),x(0x1b); \ + .byte x(0xfc),x(0x56),x(0x3e),x(0x4b),x(0xc6),x(0xd2),x(0x79),x(0x20); \ + .byte x(0x9a),x(0xdb),x(0xc0),x(0xfe),x(0x78),x(0xcd),x(0x5a),x(0xf4); \ + .byte x(0x1f),x(0xdd),x(0xa8),x(0x33),x(0x88),x(0x07),x(0xc7),x(0x31); \ + .byte x(0xb1),x(0x12),x(0x10),x(0x59),x(0x27),x(0x80),x(0xec),x(0x5f); \ + .byte x(0x60),x(0x51),x(0x7f),x(0xa9),x(0x19),x(0xb5),x(0x4a),x(0x0d); \ + .byte x(0x2d),x(0xe5),x(0x7a),x(0x9f),x(0x93),x(0xc9),x(0x9c),x(0xef); \ + .byte x(0xa0),x(0xe0),x(0x3b),x(0x4d),x(0xae),x(0x2a),x(0xf5),x(0xb0); \ + .byte x(0xc8),x(0xeb),x(0xbb),x(0x3c),x(0x83),x(0x53),x(0x99),x(0x61); \ + .byte x(0x17),x(0x2b),x(0x04),x(0x7e),x(0xba),x(0x77),x(0xd6),x(0x26); \ + .byte x(0xe1),x(0x69),x(0x14),x(0x63),x(0x55),x(0x21),x(0x0c),x(0x7d) + +#define tptr %rbp /* table pointer */ +#define kptr %r8 /* key schedule pointer */ +#define fofs 128 /* adjust offset in key schedule to keep |disp| < 128 */ +#define fk_ref(x, y) -16*x+fofs+4*y(kptr) + +#ifdef AES_REV_DKS +#define rofs 128 +#define ik_ref(x, y) -16*x+rofs+4*y(kptr) + +#else +#define rofs -128 +#define ik_ref(x, y) 16*x+rofs+4*y(kptr) +#endif /* AES_REV_DKS */ + +#define tab_0(x) (tptr,x,8) +#define tab_1(x) 3(tptr,x,8) +#define tab_2(x) 2(tptr,x,8) +#define tab_3(x) 1(tptr,x,8) +#define tab_f(x) 1(tptr,x,8) +#define tab_i(x) 7(tptr,x,8) + +#define ff_rnd(p1, p2, p3, p4, round) /* normal forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor 
tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + add $2048, tptr; \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p1 + +#else + +#define fl_rnd(p1, p2, p3, p4, round) /* last forward round */ \ + mov fk_ref(round,0), p1; \ + mov fk_ref(round,1), p2; \ + mov fk_ref(round,2), p3; \ + mov fk_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p2; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p3; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p4; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_f(%rsi), %esi; \ + movzx tab_f(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p1 + +#endif /* LAST_ROUND_TABLES */ + +#define ii_rnd(p1, p2, p3, p4, round) /* normal inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + 
mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3; \ + \ + mov p1, %eax; \ + mov p2, %ebx; \ + mov p3, %ecx; \ + mov p4, %edx + +#ifdef LAST_ROUND_TABLES + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + add $2048, tptr; \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + shr $16, %eax; \ + xor tab_0(%rsi), p1; \ + xor tab_1(%rdi), p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + xor tab_2(%rsi), p3; \ + xor tab_3(%rdi), p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + shr $16, %ebx; \ + xor tab_0(%rsi), p2; \ + xor tab_1(%rdi), p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + xor tab_2(%rsi), p4; \ + xor tab_3(%rdi), p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + shr $16, %ecx; \ + xor tab_0(%rsi), p3; \ + xor tab_1(%rdi), p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + xor tab_2(%rsi), p1; \ + xor tab_3(%rdi), p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + shr $16, %edx; \ + xor tab_0(%rsi), p4; \ + xor tab_1(%rdi), p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + xor tab_2(%rsi), p2; \ + xor tab_3(%rdi), p3 + +#else + +#define il_rnd(p1, p2, p3, p4, round) /* last inverse round */ \ + mov ik_ref(round,0), p1; \ + mov ik_ref(round,1), p2; \ + mov ik_ref(round,2), p3; \ + mov ik_ref(round,3), p4; \ + \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %eax; \ + xor %esi, p1; \ + rol $8, %edi; \ + xor %edi, p2; \ + movzx %al, %esi; \ + movzx %ah, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p3; \ + xor %edi, p4; \ + \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ebx; \ + xor %esi, p2; \ + rol $8, %edi; \ + xor %edi, p3; \ + movzx %bl, %esi; \ + movzx %bh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p4; \ + xor %edi, p1; \ + \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %ecx; \ + xor %esi, p3; \ + rol $8, %edi; \ + xor %edi, p4; \ + movzx %cl, %esi; \ + movzx %ch, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; \ + rol $24, %edi; \ + xor %esi, p1; \ + xor %edi, p2; \ + \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + shr $16, %edx; \ + xor %esi, p4; \ + rol $8, %edi; \ + xor %edi, p1; \ + movzx %dl, %esi; \ + movzx %dh, %edi; \ + movzx tab_i(%rsi), %esi; \ + movzx tab_i(%rdi), %edi; \ + rol $16, %esi; 
\ + rol $24, %edi; \ + xor %esi, p2; \ + xor %edi, p3 + +#endif /* LAST_ROUND_TABLES */ + +/* + * OpenSolaris OS: + * void aes_encrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_encrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ +.data +.align 64 +enc_tab: + enc_vals(u8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + enc_vals(w8) +#endif + + +ENTRY_NP(aes_encrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // OpenSolaris OS interface + sub $[4*8], %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea enc_tab(%rip), tptr + sub $fofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + + xor fofs(kptr), %eax + xor fofs+4(kptr), %ebx + xor fofs+8(kptr), %ecx + xor fofs+12(kptr), %edx + + lea (kptr,%rsi), kptr + // Jump based on byte key length * 16: + cmp $[10*16], %esi + je 3f + cmp $[12*16], %esi + je 2f + cmp $[14*16], %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal forward rounds +1: ff_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ff_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ff_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ff_rnd(%r9d, %r10d, %r11d, %r12d, 1) + fl_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $[4*8], %rsp + ret + + SET_SIZE(aes_encrypt_amd64) + +/* + * OpenSolaris OS: + * void aes_decrypt_amd64(const aes_ks_t *ks, int Nr, + * const uint32_t pt[4], uint32_t ct[4])/ + * + * Original interface: + * int aes_decrypt(const unsigned char *in, + * unsigned char *out, const aes_encrypt_ctx cx[1])/ + */ +.data +.align 64 +dec_tab: + dec_vals(v8) +#ifdef LAST_ROUND_TABLES + // Last Round Tables: + dec_vals(w8) +#endif + + +ENTRY_NP(aes_decrypt_amd64) +#ifdef GLADMAN_INTERFACE + // Original interface + sub $[4*8], %rsp // gnu/linux/opensolaris binary interface + mov %rsi, (%rsp) // output pointer (P2) + mov %rdx, %r8 // context (P3) + + mov %rbx, 1*8(%rsp) // P1: input pointer in rdi + mov %rbp, 2*8(%rsp) // P2: output pointer in (rsp) + mov %r12, 3*8(%rsp) // P3: context in r8 + movzx 4*KS_LENGTH(kptr), %esi // Get byte key length * 16 + +#else + // 
OpenSolaris OS interface + sub $[4*8], %rsp // Make room on stack to save registers + mov %rcx, (%rsp) // Save output pointer (P4) on stack + mov %rdi, %r8 // context (P1) + mov %rdx, %rdi // P3: save input pointer + shl $4, %esi // P2: esi byte key length * 16 + + mov %rbx, 1*8(%rsp) // Save registers + mov %rbp, 2*8(%rsp) + mov %r12, 3*8(%rsp) + // P1: context in r8 + // P2: byte key length * 16 in esi + // P3: input pointer in rdi + // P4: output pointer in (rsp) +#endif /* GLADMAN_INTERFACE */ + + lea dec_tab(%rip), tptr + sub $rofs, kptr + + // Load input block into registers + mov (%rdi), %eax + mov 1*4(%rdi), %ebx + mov 2*4(%rdi), %ecx + mov 3*4(%rdi), %edx + +#ifdef AES_REV_DKS + mov kptr, %rdi + lea (kptr,%rsi), kptr +#else + lea (kptr,%rsi), %rdi +#endif + + xor rofs(%rdi), %eax + xor rofs+4(%rdi), %ebx + xor rofs+8(%rdi), %ecx + xor rofs+12(%rdi), %edx + + // Jump based on byte key length * 16: + cmp $[10*16], %esi + je 3f + cmp $[12*16], %esi + je 2f + cmp $[14*16], %esi + je 1f + mov $-1, %rax // error + jmp 4f + + // Perform normal inverse rounds +1: ii_rnd(%r9d, %r10d, %r11d, %r12d, 13) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 12) +2: ii_rnd(%r9d, %r10d, %r11d, %r12d, 11) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 10) +3: ii_rnd(%r9d, %r10d, %r11d, %r12d, 9) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 8) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 7) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 6) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 5) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 4) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 3) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 2) + ii_rnd(%r9d, %r10d, %r11d, %r12d, 1) + il_rnd(%r9d, %r10d, %r11d, %r12d, 0) + + // Copy results + mov (%rsp), %rbx + mov %r9d, (%rbx) + mov %r10d, 4(%rbx) + mov %r11d, 8(%rbx) + mov %r12d, 12(%rbx) + xor %rax, %rax +4: // Restore registers + mov 1*8(%rsp), %rbx + mov 2*8(%rsp), %rbp + mov 3*8(%rsp), %r12 + add $[4*8], %rsp + ret + + SET_SIZE(aes_decrypt_amd64) +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/aes/aeskey.c b/module/icp/asm-x86_64/aes/aeskey.c new file mode 100644 index 000000000000..c3d1f2990874 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aeskey.c @@ -0,0 +1,580 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue Date: 20/12/2007 + */ + +#include +#include "aesopt.h" +#include "aestab.h" +#include "aestab2.h" + +/* + * Initialise the key schedule from the user supplied key. The key + * length can be specified in bytes, with legal values of 16, 24 + * and 32, or in bits, with legal values of 128, 192 and 256. 
These + * values correspond with Nk values of 4, 6 and 8 respectively. + * + * The following macros implement a single cycle in the key + * schedule generation process. The number of cycles needed + * for each cx->n_col and nk value is: + * + * nk = 4 5 6 7 8 + * ------------------------------ + * cx->n_col = 4 10 9 8 7 7 + * cx->n_col = 5 14 11 10 9 9 + * cx->n_col = 6 19 15 12 11 11 + * cx->n_col = 7 21 19 16 13 14 + * cx->n_col = 8 29 23 19 17 14 + */ + +/* + * OpenSolaris changes + * 1. Added header files aes_impl.h and aestab2.h + * 2. Changed uint_8t and uint_32t to uint8_t and uint32_t + * 3. Remove code under ifdef USE_VIA_ACE_IF_PRESENT (always undefined) + * 4. Removed always-defined ifdefs FUNCS_IN_C, ENC_KEYING_IN_C, + * AES_128, AES_192, AES_256, AES_VAR defines + * 5. Changed aes_encrypt_key* aes_decrypt_key* functions to "static void" + * 6. Changed N_COLS to MAX_AES_NB + * 7. Replaced functions aes_encrypt_key and aes_decrypt_key with + * OpenSolaris-compatible functions rijndael_key_setup_enc_amd64 and + * rijndael_key_setup_dec_amd64 + * 8. cstyled code and removed lint warnings + */ + +#if defined(REDUCE_CODE_SIZE) +#define ls_box ls_sub + uint32_t ls_sub(const uint32_t t, const uint32_t n); +#define inv_mcol im_sub + uint32_t im_sub(const uint32_t x); +#ifdef ENC_KS_UNROLL +#undef ENC_KS_UNROLL +#endif +#ifdef DEC_KS_UNROLL +#undef DEC_KS_UNROLL +#endif +#endif /* REDUCE_CODE_SIZE */ + + +#define ke4(k, i) \ +{ k[4 * (i) + 4] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[4 * (i) + 5] = ss[1] ^= ss[0]; \ + k[4 * (i) + 6] = ss[2] ^= ss[1]; \ + k[4 * (i) + 7] = ss[3] ^= ss[2]; \ +} + +static void +aes_encrypt_key128(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[4]; + + rk[0] = ss[0] = word_in(key, 0); + rk[1] = ss[1] = word_in(key, 1); + rk[2] = ss[2] = word_in(key, 2); + rk[3] = ss[3] = word_in(key, 3); + +#ifdef ENC_KS_UNROLL + ke4(rk, 0); ke4(rk, 1); + ke4(rk, 2); ke4(rk, 3); + ke4(rk, 4); ke4(rk, 5); + ke4(rk, 6); ke4(rk, 7); + ke4(rk, 8); +#else + { + uint32_t i; + for (i = 0; i < 9; ++i) + ke4(rk, i); + } +#endif /* ENC_KS_UNROLL */ + ke4(rk, 9); +} + + +#define kef6(k, i) \ +{ k[6 * (i) + 6] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[6 * (i) + 7] = ss[1] ^= ss[0]; \ + k[6 * (i) + 8] = ss[2] ^= ss[1]; \ + k[6 * (i) + 9] = ss[3] ^= ss[2]; \ +} + +#define ke6(k, i) \ +{ kef6(k, i); \ + k[6 * (i) + 10] = ss[4] ^= ss[3]; \ + k[6 * (i) + 11] = ss[5] ^= ss[4]; \ +} + +static void +aes_encrypt_key192(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[6]; + + rk[0] = ss[0] = word_in(key, 0); + rk[1] = ss[1] = word_in(key, 1); + rk[2] = ss[2] = word_in(key, 2); + rk[3] = ss[3] = word_in(key, 3); + rk[4] = ss[4] = word_in(key, 4); + rk[5] = ss[5] = word_in(key, 5); + +#ifdef ENC_KS_UNROLL + ke6(rk, 0); ke6(rk, 1); + ke6(rk, 2); ke6(rk, 3); + ke6(rk, 4); ke6(rk, 5); + ke6(rk, 6); +#else + { + uint32_t i; + for (i = 0; i < 7; ++i) + ke6(rk, i); + } +#endif /* ENC_KS_UNROLL */ + kef6(rk, 7); +} + + + +#define kef8(k, i) \ +{ k[8 * (i) + 8] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[8 * (i) + 9] = ss[1] ^= ss[0]; \ + k[8 * (i) + 10] = ss[2] ^= ss[1]; \ + k[8 * (i) + 11] = ss[3] ^= ss[2]; \ +} + +#define ke8(k, i) \ +{ kef8(k, i); \ + k[8 * (i) + 12] = ss[4] ^= ls_box(ss[3], 0); \ + k[8 * (i) + 13] = ss[5] ^= ss[4]; \ + k[8 * (i) + 14] = ss[6] ^= ss[5]; \ + k[8 * (i) + 15] = ss[7] ^= ss[6]; \ +} + +static void +aes_encrypt_key256(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[8]; + + rk[0] = ss[0] = word_in(key, 0); + rk[1] = 
ss[1] = word_in(key, 1); + rk[2] = ss[2] = word_in(key, 2); + rk[3] = ss[3] = word_in(key, 3); + rk[4] = ss[4] = word_in(key, 4); + rk[5] = ss[5] = word_in(key, 5); + rk[6] = ss[6] = word_in(key, 6); + rk[7] = ss[7] = word_in(key, 7); + +#ifdef ENC_KS_UNROLL + ke8(rk, 0); ke8(rk, 1); + ke8(rk, 2); ke8(rk, 3); + ke8(rk, 4); ke8(rk, 5); +#else + { + uint32_t i; + for (i = 0; i < 6; ++i) + ke8(rk, i); + } +#endif /* ENC_KS_UNROLL */ + kef8(rk, 6); +} + + +/* + * Expand the cipher key into the encryption key schedule. + * + * Return the number of rounds for the given cipher key size. + * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4 * (Nr + 1). + * + * Parameters: + * rk AES key schedule 32-bit array to be initialized + * cipherKey User key + * keyBits AES key size (128, 192, or 256 bits) + */ +int +rijndael_key_setup_enc_amd64(uint32_t rk[], const uint32_t cipherKey[], + int keyBits) +{ + switch (keyBits) { + case 128: + aes_encrypt_key128((unsigned char *)&cipherKey[0], rk); + return (10); + case 192: + aes_encrypt_key192((unsigned char *)&cipherKey[0], rk); + return (12); + case 256: + aes_encrypt_key256((unsigned char *)&cipherKey[0], rk); + return (14); + default: /* should never get here */ + break; + } + + return (0); +} + + +/* this is used to store the decryption round keys */ +/* in forward or reverse order */ + +#ifdef AES_REV_DKS +#define v(n, i) ((n) - (i) + 2 * ((i) & 3)) +#else +#define v(n, i) (i) +#endif + +#if DEC_ROUND == NO_TABLES +#define ff(x) (x) +#else +#define ff(x) inv_mcol(x) +#if defined(dec_imvars) +#define d_vars dec_imvars +#endif +#endif /* FUNCS_IN_C & DEC_KEYING_IN_C */ + + +#define k4e(k, i) \ +{ k[v(40, (4 * (i)) + 4)] = ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[v(40, (4 * (i)) + 5)] = ss[1] ^= ss[0]; \ + k[v(40, (4 * (i)) + 6)] = ss[2] ^= ss[1]; \ + k[v(40, (4 * (i)) + 7)] = ss[3] ^= ss[2]; \ +} + +#if 1 + +#define kdf4(k, i) \ +{ ss[0] = ss[0] ^ ss[2] ^ ss[1] ^ ss[3]; \ + ss[1] = ss[1] ^ ss[3]; \ + ss[2] = ss[2] ^ ss[3]; \ + ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ + ss[i % 4] ^= ss[4]; \ + ss[4] ^= k[v(40, (4 * (i)))]; k[v(40, (4 * (i)) + 4)] = ff(ss[4]); \ + ss[4] ^= k[v(40, (4 * (i)) + 1)]; k[v(40, (4 * (i)) + 5)] = ff(ss[4]); \ + ss[4] ^= k[v(40, (4 * (i)) + 2)]; k[v(40, (4 * (i)) + 6)] = ff(ss[4]); \ + ss[4] ^= k[v(40, (4 * (i)) + 3)]; k[v(40, (4 * (i)) + 7)] = ff(ss[4]); \ +} + +#define kd4(k, i) \ +{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ + ss[i % 4] ^= ss[4]; ss[4] = ff(ss[4]); \ + k[v(40, (4 * (i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \ + k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \ + k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \ + k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \ +} + +#define kdl4(k, i) \ +{ ss[4] = ls_box(ss[(i + 3) % 4], 3) ^ t_use(r, c)[i]; \ + ss[i % 4] ^= ss[4]; \ + k[v(40, (4 * (i)) + 4)] = (ss[0] ^= ss[1]) ^ ss[2] ^ ss[3]; \ + k[v(40, (4 * (i)) + 5)] = ss[1] ^ ss[3]; \ + k[v(40, (4 * (i)) + 6)] = ss[0]; \ + k[v(40, (4 * (i)) + 7)] = ss[1]; \ +} + +#else + +#define kdf4(k, i) \ +{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[v(40, (4 * (i)) + 4)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ff(ss[3]); \ +} + +#define kd4(k, i) \ +{ ss[4] = ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + ss[0] ^= ss[4]; \ + ss[4] = ff(ss[4]); \ + k[v(40, (4 * 
(i)) + 4)] = ss[4] ^= k[v(40, (4 * (i)))]; \ + ss[1] ^= ss[0]; \ + k[v(40, (4 * (i)) + 5)] = ss[4] ^= k[v(40, (4 * (i)) + 1)]; \ + ss[2] ^= ss[1]; \ + k[v(40, (4 * (i)) + 6)] = ss[4] ^= k[v(40, (4 * (i)) + 2)]; \ + ss[3] ^= ss[2]; \ + k[v(40, (4 * (i)) + 7)] = ss[4] ^= k[v(40, (4 * (i)) + 3)]; \ +} + +#define kdl4(k, i) \ +{ ss[0] ^= ls_box(ss[3], 3) ^ t_use(r, c)[i]; \ + k[v(40, (4 * (i)) + 4)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(40, (4 * (i)) + 5)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(40, (4 * (i)) + 6)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(40, (4 * (i)) + 7)] = ss[3]; \ +} + +#endif + +static void +aes_decrypt_key128(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[5]; +#if defined(d_vars) + d_vars; +#endif + rk[v(40, (0))] = ss[0] = word_in(key, 0); + rk[v(40, (1))] = ss[1] = word_in(key, 1); + rk[v(40, (2))] = ss[2] = word_in(key, 2); + rk[v(40, (3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + kdf4(rk, 0); kd4(rk, 1); + kd4(rk, 2); kd4(rk, 3); + kd4(rk, 4); kd4(rk, 5); + kd4(rk, 6); kd4(rk, 7); + kd4(rk, 8); kdl4(rk, 9); +#else + { + uint32_t i; + for (i = 0; i < 10; ++i) + k4e(rk, i); +#if !(DEC_ROUND == NO_TABLES) + for (i = MAX_AES_NB; i < 10 * MAX_AES_NB; ++i) + rk[i] = inv_mcol(rk[i]); +#endif + } +#endif /* DEC_KS_UNROLL */ +} + + + +#define k6ef(k, i) \ +{ k[v(48, (6 * (i)) + 6)] = ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[v(48, (6 * (i)) + 7)] = ss[1] ^= ss[0]; \ + k[v(48, (6 * (i)) + 8)] = ss[2] ^= ss[1]; \ + k[v(48, (6 * (i)) + 9)] = ss[3] ^= ss[2]; \ +} + +#define k6e(k, i) \ +{ k6ef(k, i); \ + k[v(48, (6 * (i)) + 10)] = ss[4] ^= ss[3]; \ + k[v(48, (6 * (i)) + 11)] = ss[5] ^= ss[4]; \ +} + +#define kdf6(k, i) \ +{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[v(48, (6 * (i)) + 6)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ff(ss[3]); \ + ss[4] ^= ss[3]; k[v(48, (6 * (i)) + 10)] = ff(ss[4]); \ + ss[5] ^= ss[4]; k[v(48, (6 * (i)) + 11)] = ff(ss[5]); \ +} + +#define kd6(k, i) \ +{ ss[6] = ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + ss[0] ^= ss[6]; ss[6] = ff(ss[6]); \ + k[v(48, (6 * (i)) + 6)] = ss[6] ^= k[v(48, (6 * (i)))]; \ + ss[1] ^= ss[0]; \ + k[v(48, (6 * (i)) + 7)] = ss[6] ^= k[v(48, (6 * (i)) + 1)]; \ + ss[2] ^= ss[1]; \ + k[v(48, (6 * (i)) + 8)] = ss[6] ^= k[v(48, (6 * (i)) + 2)]; \ + ss[3] ^= ss[2]; \ + k[v(48, (6 * (i)) + 9)] = ss[6] ^= k[v(48, (6 * (i)) + 3)]; \ + ss[4] ^= ss[3]; \ + k[v(48, (6 * (i)) + 10)] = ss[6] ^= k[v(48, (6 * (i)) + 4)]; \ + ss[5] ^= ss[4]; \ + k[v(48, (6 * (i)) + 11)] = ss[6] ^= k[v(48, (6 * (i)) + 5)]; \ +} + +#define kdl6(k, i) \ +{ ss[0] ^= ls_box(ss[5], 3) ^ t_use(r, c)[i]; \ + k[v(48, (6 * (i)) + 6)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(48, (6 * (i)) + 7)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(48, (6 * (i)) + 8)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(48, (6 * (i)) + 9)] = ss[3]; \ +} + +static void +aes_decrypt_key192(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[7]; +#if defined(d_vars) + d_vars; +#endif + rk[v(48, (0))] = ss[0] = word_in(key, 0); + rk[v(48, (1))] = ss[1] = word_in(key, 1); + rk[v(48, (2))] = ss[2] = word_in(key, 2); + rk[v(48, (3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + ss[4] = word_in(key, 4); + rk[v(48, (4))] = ff(ss[4]); + ss[5] = word_in(key, 5); + rk[v(48, (5))] = ff(ss[5]); + kdf6(rk, 0); kd6(rk, 1); + kd6(rk, 2); kd6(rk, 3); + kd6(rk, 4); kd6(rk, 5); + kd6(rk, 6); kdl6(rk, 7); +#else + rk[v(48, (4))] = ss[4] = word_in(key, 4); + rk[v(48, (5))] = ss[5] = 
word_in(key, 5); + { + uint32_t i; + + for (i = 0; i < 7; ++i) + k6e(rk, i); + k6ef(rk, 7); +#if !(DEC_ROUND == NO_TABLES) + for (i = MAX_AES_NB; i < 12 * MAX_AES_NB; ++i) + rk[i] = inv_mcol(rk[i]); +#endif + } +#endif +} + + + +#define k8ef(k, i) \ +{ k[v(56, (8 * (i)) + 8)] = ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[v(56, (8 * (i)) + 9)] = ss[1] ^= ss[0]; \ + k[v(56, (8 * (i)) + 10)] = ss[2] ^= ss[1]; \ + k[v(56, (8 * (i)) + 11)] = ss[3] ^= ss[2]; \ +} + +#define k8e(k, i) \ +{ k8ef(k, i); \ + k[v(56, (8 * (i)) + 12)] = ss[4] ^= ls_box(ss[3], 0); \ + k[v(56, (8 * (i)) + 13)] = ss[5] ^= ss[4]; \ + k[v(56, (8 * (i)) + 14)] = ss[6] ^= ss[5]; \ + k[v(56, (8 * (i)) + 15)] = ss[7] ^= ss[6]; \ +} + +#define kdf8(k, i) \ +{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[v(56, (8 * (i)) + 8)] = ff(ss[0]); \ + ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ff(ss[1]); \ + ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ff(ss[2]); \ + ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ff(ss[3]); \ + ss[4] ^= ls_box(ss[3], 0); k[v(56, (8 * (i)) + 12)] = ff(ss[4]); \ + ss[5] ^= ss[4]; k[v(56, (8 * (i)) + 13)] = ff(ss[5]); \ + ss[6] ^= ss[5]; k[v(56, (8 * (i)) + 14)] = ff(ss[6]); \ + ss[7] ^= ss[6]; k[v(56, (8 * (i)) + 15)] = ff(ss[7]); \ +} + +#define kd8(k, i) \ +{ ss[8] = ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + ss[0] ^= ss[8]; \ + ss[8] = ff(ss[8]); \ + k[v(56, (8 * (i)) + 8)] = ss[8] ^= k[v(56, (8 * (i)))]; \ + ss[1] ^= ss[0]; \ + k[v(56, (8 * (i)) + 9)] = ss[8] ^= k[v(56, (8 * (i)) + 1)]; \ + ss[2] ^= ss[1]; \ + k[v(56, (8 * (i)) + 10)] = ss[8] ^= k[v(56, (8 * (i)) + 2)]; \ + ss[3] ^= ss[2]; \ + k[v(56, (8 * (i)) + 11)] = ss[8] ^= k[v(56, (8 * (i)) + 3)]; \ + ss[8] = ls_box(ss[3], 0); \ + ss[4] ^= ss[8]; \ + ss[8] = ff(ss[8]); \ + k[v(56, (8 * (i)) + 12)] = ss[8] ^= k[v(56, (8 * (i)) + 4)]; \ + ss[5] ^= ss[4]; \ + k[v(56, (8 * (i)) + 13)] = ss[8] ^= k[v(56, (8 * (i)) + 5)]; \ + ss[6] ^= ss[5]; \ + k[v(56, (8 * (i)) + 14)] = ss[8] ^= k[v(56, (8 * (i)) + 6)]; \ + ss[7] ^= ss[6]; \ + k[v(56, (8 * (i)) + 15)] = ss[8] ^= k[v(56, (8 * (i)) + 7)]; \ +} + +#define kdl8(k, i) \ +{ ss[0] ^= ls_box(ss[7], 3) ^ t_use(r, c)[i]; \ + k[v(56, (8 * (i)) + 8)] = ss[0]; \ + ss[1] ^= ss[0]; k[v(56, (8 * (i)) + 9)] = ss[1]; \ + ss[2] ^= ss[1]; k[v(56, (8 * (i)) + 10)] = ss[2]; \ + ss[3] ^= ss[2]; k[v(56, (8 * (i)) + 11)] = ss[3]; \ +} + +static void +aes_decrypt_key256(const unsigned char *key, uint32_t rk[]) +{ + uint32_t ss[9]; +#if defined(d_vars) + d_vars; +#endif + rk[v(56, (0))] = ss[0] = word_in(key, 0); + rk[v(56, (1))] = ss[1] = word_in(key, 1); + rk[v(56, (2))] = ss[2] = word_in(key, 2); + rk[v(56, (3))] = ss[3] = word_in(key, 3); + +#ifdef DEC_KS_UNROLL + ss[4] = word_in(key, 4); + rk[v(56, (4))] = ff(ss[4]); + ss[5] = word_in(key, 5); + rk[v(56, (5))] = ff(ss[5]); + ss[6] = word_in(key, 6); + rk[v(56, (6))] = ff(ss[6]); + ss[7] = word_in(key, 7); + rk[v(56, (7))] = ff(ss[7]); + kdf8(rk, 0); kd8(rk, 1); + kd8(rk, 2); kd8(rk, 3); + kd8(rk, 4); kd8(rk, 5); + kdl8(rk, 6); +#else + rk[v(56, (4))] = ss[4] = word_in(key, 4); + rk[v(56, (5))] = ss[5] = word_in(key, 5); + rk[v(56, (6))] = ss[6] = word_in(key, 6); + rk[v(56, (7))] = ss[7] = word_in(key, 7); + { + uint32_t i; + + for (i = 0; i < 6; ++i) + k8e(rk, i); + k8ef(rk, 6); +#if !(DEC_ROUND == NO_TABLES) + for (i = MAX_AES_NB; i < 14 * MAX_AES_NB; ++i) + rk[i] = inv_mcol(rk[i]); +#endif + } +#endif /* DEC_KS_UNROLL */ +} + + +/* + * Expand the cipher key into the decryption key schedule. + * + * Return the number of rounds for the given cipher key size. 
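+ * (Worked example: a 256-bit key gives Nr = 14, so the schedule holds
+ * 4 * (14 + 1) = 60 32-bit words, matching the KS_LENGTH of 60 used in
+ * aes_amd64.S.)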
+ * The size of the key schedule depends on the number of rounds + * (which can be computed from the size of the key), i.e. 4 * (Nr + 1). + * + * Parameters: + * rk AES key schedule 32-bit array to be initialized + * cipherKey User key + * keyBits AES key size (128, 192, or 256 bits) + */ +int +rijndael_key_setup_dec_amd64(uint32_t rk[], const uint32_t cipherKey[], + int keyBits) +{ + switch (keyBits) { + case 128: + aes_decrypt_key128((unsigned char *)&cipherKey[0], rk); + return (10); + case 192: + aes_decrypt_key192((unsigned char *)&cipherKey[0], rk); + return (12); + case 256: + aes_decrypt_key256((unsigned char *)&cipherKey[0], rk); + return (14); + default: /* should never get here */ + break; + } + + return (0); +} diff --git a/module/icp/asm-x86_64/aes/aesopt.h b/module/icp/asm-x86_64/aes/aesopt.h new file mode 100644 index 000000000000..472111f96e59 --- /dev/null +++ b/module/icp/asm-x86_64/aes/aesopt.h @@ -0,0 +1,770 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue Date: 20/12/2007 + * + * This file contains the compilation options for AES (Rijndael) and code + * that is common across encryption, key scheduling and table generation. + * + * OPERATION + * + * These source code files implement the AES algorithm Rijndael designed by + * Joan Daemen and Vincent Rijmen. This version is designed for the standard + * block size of 16 bytes and for key sizes of 128, 192 and 256 bits (16, 24 + * and 32 bytes). + * + * This version is designed for flexibility and speed using operations on + * 32-bit words rather than operations on bytes. It can be compiled with + * either big or little endian internal byte order but is faster when the + * native byte order for the processor is used. + * + * THE CIPHER INTERFACE + * + * The cipher interface is implemented as an array of bytes in which lower + * AES bit sequence indexes map to higher numeric significance within bytes. + */ + +/* + * OpenSolaris changes + * 1. Added __cplusplus and _AESTAB_H header guards + * 2. Added header files sys/types.h and aes_impl.h + * 3. Added defines for AES_ENCRYPT, AES_DECRYPT, AES_REV_DKS, and ASM_AMD64_C + * 4. Moved defines for IS_BIG_ENDIAN, IS_LITTLE_ENDIAN, PLATFORM_BYTE_ORDER + * from brg_endian.h + * 5. Undefined VIA_ACE_POSSIBLE and ASSUME_VIA_ACE_PRESENT + * 6. Changed uint_8t and uint_32t to uint8_t and uint32_t + * 7. Defined aes_sw32 as htonl() for byte swapping + * 8. 
Cstyled and hdrchk code + * + */ + +#ifndef _AESOPT_H +#define _AESOPT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* SUPPORT FEATURES */ +#define AES_ENCRYPT /* if support for encryption is needed */ +#define AES_DECRYPT /* if support for decryption is needed */ + +/* PLATFORM-SPECIFIC FEATURES */ +#define IS_BIG_ENDIAN 4321 /* byte 0 is most significant (mc68k) */ +#define IS_LITTLE_ENDIAN 1234 /* byte 0 is least significant (i386) */ +#define PLATFORM_BYTE_ORDER IS_LITTLE_ENDIAN +#define AES_REV_DKS /* define to reverse decryption key schedule */ + + +/* + * CONFIGURATION - THE USE OF DEFINES + * Later in this section there are a number of defines that control the + * operation of the code. In each section, the purpose of each define is + * explained so that the relevant form can be included or excluded by + * setting either 1's or 0's respectively on the branches of the related + * #if clauses. The following local defines should not be changed. + */ + +#define ENCRYPTION_IN_C 1 +#define DECRYPTION_IN_C 2 +#define ENC_KEYING_IN_C 4 +#define DEC_KEYING_IN_C 8 + +#define NO_TABLES 0 +#define ONE_TABLE 1 +#define FOUR_TABLES 4 +#define NONE 0 +#define PARTIAL 1 +#define FULL 2 + +/* --- START OF USER CONFIGURED OPTIONS --- */ + +/* + * 1. BYTE ORDER WITHIN 32 BIT WORDS + * + * The fundamental data processing units in Rijndael are 8-bit bytes. The + * input, output and key input are all enumerated arrays of bytes in which + * bytes are numbered starting at zero and increasing to one less than the + * number of bytes in the array in question. This enumeration is only used + * for naming bytes and does not imply any adjacency or order relationship + * from one byte to another. When these inputs and outputs are considered + * as bit sequences, bits 8*n to 8*n+7 of the bit sequence are mapped to + * byte[n] with bit 8n+i in the sequence mapped to bit 7-i within the byte. + * In this implementation bits are numbered from 0 to 7 starting at the + * numerically least significant end of each byte. Bit n represents 2^n. + * + * However, Rijndael can be implemented more efficiently using 32-bit + * words by packing bytes into words so that bytes 4*n to 4*n+3 are placed + * into word[n]. While in principle these bytes can be assembled into words + * in any positions, this implementation only supports the two formats in + * which bytes in adjacent positions within words also have adjacent byte + * numbers. This order is called big-endian if the lowest numbered bytes + * in words have the highest numeric significance and little-endian if the + * opposite applies. + * + * This code can work in either order irrespective of the order used by the + * machine on which it runs. Normally the internal byte order will be set + * to the order of the processor on which the code is to be run but this + * define can be used to reverse this in special situations + * + * WARNING: Assembler code versions rely on PLATFORM_BYTE_ORDER being set. + * This define will hence be redefined later (in section 4) if necessary + */ + +#if 1 +#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER +#elif 0 +#define ALGORITHM_BYTE_ORDER IS_LITTLE_ENDIAN +#elif 0 +#define ALGORITHM_BYTE_ORDER IS_BIG_ENDIAN +#else +#error The algorithm byte order is not defined +#endif + +/* 2. 
VIA ACE SUPPORT */ + +#if defined(__GNUC__) && defined(__i386__) || \ + defined(_WIN32) && defined(_M_IX86) && \ + !(defined(_WIN64) || defined(_WIN32_WCE) || \ + defined(_MSC_VER) && (_MSC_VER <= 800)) +#define VIA_ACE_POSSIBLE +#endif + +/* + * Define this option if support for the VIA ACE is required. This uses + * inline assembler instructions and is only implemented for the Microsoft, + * Intel and GCC compilers. If VIA ACE is known to be present, then defining + * ASSUME_VIA_ACE_PRESENT will remove the ordinary encryption/decryption + * code. If USE_VIA_ACE_IF_PRESENT is defined then VIA ACE will be used if + * it is detected (both present and enabled) but the normal AES code will + * also be present. + * + * When VIA ACE is to be used, all AES encryption contexts MUST be 16 byte + * aligned; other input/output buffers do not need to be 16 byte aligned + * but there are very large performance gains if this can be arranged. + * VIA ACE also requires the decryption key schedule to be in reverse + * order (which later checks below ensure). + */ + +/* VIA ACE is not used here for OpenSolaris: */ +#undef VIA_ACE_POSSIBLE +#undef ASSUME_VIA_ACE_PRESENT + +#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(USE_VIA_ACE_IF_PRESENT) +#define USE_VIA_ACE_IF_PRESENT +#endif + +#if 0 && defined(VIA_ACE_POSSIBLE) && !defined(ASSUME_VIA_ACE_PRESENT) +#define ASSUME_VIA_ACE_PRESENT +#endif + + +/* + * 3. ASSEMBLER SUPPORT + * + * This define (which can be on the command line) enables the use of the + * assembler code routines for encryption, decryption and key scheduling + * as follows: + * + * ASM_X86_V1C uses the assembler (aes_x86_v1.asm) with large tables for + * encryption and decryption and but with key scheduling in C + * ASM_X86_V2 uses assembler (aes_x86_v2.asm) with compressed tables for + * encryption, decryption and key scheduling + * ASM_X86_V2C uses assembler (aes_x86_v2.asm) with compressed tables for + * encryption and decryption and but with key scheduling in C + * ASM_AMD64_C uses assembler (aes_amd64.asm) with compressed tables for + * encryption and decryption and but with key scheduling in C + * + * Change one 'if 0' below to 'if 1' to select the version or define + * as a compilation option. + */ + +#if 0 && !defined(ASM_X86_V1C) +#define ASM_X86_V1C +#elif 0 && !defined(ASM_X86_V2) +#define ASM_X86_V2 +#elif 0 && !defined(ASM_X86_V2C) +#define ASM_X86_V2C +#elif 1 && !defined(ASM_AMD64_C) +#define ASM_AMD64_C +#endif + +#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2) || defined(ASM_X86_V2C)) && \ + !defined(_M_IX86) || defined(ASM_AMD64_C) && !defined(_M_X64) && \ + !defined(__amd64) +#error Assembler code is only available for x86 and AMD64 systems +#endif + +/* + * 4. FAST INPUT/OUTPUT OPERATIONS. + * + * On some machines it is possible to improve speed by transferring the + * bytes in the input and output arrays to and from the internal 32-bit + * variables by addressing these arrays as if they are arrays of 32-bit + * words. On some machines this will always be possible but there may + * be a large performance penalty if the byte arrays are not aligned on + * the normal word boundaries. On other machines this technique will + * lead to memory access errors when such 32-bit word accesses are not + * properly aligned. The option SAFE_IO avoids such problems but will + * often be slower on those machines that support misaligned access + * (especially so if care is taken to align the input and output byte + * arrays on 32-bit word boundaries). 
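+ * (For concreteness, a hedged sketch rather than code from this header:
+ * with SAFE_IO a word is assembled byte by byte, as in
+ *	bytes2word(p[0], p[1], p[2], p[3]),
+ * while the unsafe variant simply casts, as in
+ *	*((const uint32_t *)p),
+ * where "p" stands for a possibly unaligned byte pointer into the array.)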
If SAFE_IO is not defined it is + * assumed that access to byte arrays as if they are arrays of 32-bit + * words will not cause problems when such accesses are misaligned. + */ +#if 1 && !defined(_MSC_VER) +#define SAFE_IO +#endif + +/* + * 5. LOOP UNROLLING + * + * The code for encryption and decryption cycles through a number of rounds + * that can be implemented either in a loop or by expanding the code into a + * long sequence of instructions, the latter producing a larger program but + * one that will often be much faster. The latter is called loop unrolling. + * There are also potential speed advantages in expanding two iterations in + * a loop with half the number of iterations, which is called partial loop + * unrolling. The following options allow partial or full loop unrolling + * to be set independently for encryption and decryption + */ +#if 1 +#define ENC_UNROLL FULL +#elif 0 +#define ENC_UNROLL PARTIAL +#else +#define ENC_UNROLL NONE +#endif + +#if 1 +#define DEC_UNROLL FULL +#elif 0 +#define DEC_UNROLL PARTIAL +#else +#define DEC_UNROLL NONE +#endif + +#if 1 +#define ENC_KS_UNROLL +#endif + +#if 1 +#define DEC_KS_UNROLL +#endif + +/* + * 6. FAST FINITE FIELD OPERATIONS + * + * If this section is included, tables are used to provide faster finite + * field arithmetic. This has no effect if FIXED_TABLES is defined. + */ +#if 1 +#define FF_TABLES +#endif + +/* + * 7. INTERNAL STATE VARIABLE FORMAT + * + * The internal state of Rijndael is stored in a number of local 32-bit + * word variables which can be defined either as an array or as individual + * names variables. Include this section if you want to store these local + * variables in arrays. Otherwise individual local variables will be used. + */ +#if 1 +#define ARRAYS +#endif + +/* + * 8. FIXED OR DYNAMIC TABLES + * + * When this section is included the tables used by the code are compiled + * statically into the binary file. Otherwise the subroutine aes_init() + * must be called to compute them before the code is first used. + */ +#if 1 && !(defined(_MSC_VER) && (_MSC_VER <= 800)) +#define FIXED_TABLES +#endif + +/* + * 9. MASKING OR CASTING FROM LONGER VALUES TO BYTES + * + * In some systems it is better to mask longer values to extract bytes + * rather than using a cast. This option allows this choice. + */ +#if 0 +#define to_byte(x) ((uint8_t)(x)) +#else +#define to_byte(x) ((x) & 0xff) +#endif + +/* + * 10. TABLE ALIGNMENT + * + * On some systems speed will be improved by aligning the AES large lookup + * tables on particular boundaries. This define should be set to a power of + * two giving the desired alignment. It can be left undefined if alignment + * is not needed. This option is specific to the Microsoft VC++ compiler - + * it seems to sometimes cause trouble for the VC++ version 6 compiler. + */ + +#if 1 && defined(_MSC_VER) && (_MSC_VER >= 1300) +#define TABLE_ALIGN 32 +#endif + +/* + * 11. REDUCE CODE AND TABLE SIZE + * + * This replaces some expanded macros with function calls if AES_ASM_V2 or + * AES_ASM_V2C are defined + */ + +#if 1 && (defined(ASM_X86_V2) || defined(ASM_X86_V2C)) +#define REDUCE_CODE_SIZE +#endif + +/* + * 12. TABLE OPTIONS + * + * This cipher proceeds by repeating in a number of cycles known as rounds + * which are implemented by a round function which is optionally be speeded + * up using tables. The basic tables are 256 32-bit words, with either + * one or four tables being required for each round function depending on + * how much speed is required. 
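+ * (Each such table is 256 entries of 4 bytes, i.e. 1024 bytes, which is
+ * where the 0, 1024 and 4096 byte figures below come from.)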
Encryption and decryption round functions + * are different and the last encryption and decryption round functions are + * different again making four different round functions in all. + * + * This means that: + * 1. Normal encryption and decryption rounds can each use either 0, 1 + * or 4 tables and table spaces of 0, 1024 or 4096 bytes each. + * 2. The last encryption and decryption rounds can also use either 0, 1 + * or 4 tables and table spaces of 0, 1024 or 4096 bytes each. + * + * Include or exclude the appropriate definitions below to set the number + * of tables used by this implementation. + */ + +#if 1 /* set tables for the normal encryption round */ +#define ENC_ROUND FOUR_TABLES +#elif 0 +#define ENC_ROUND ONE_TABLE +#else +#define ENC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the last encryption round */ +#define LAST_ENC_ROUND FOUR_TABLES +#elif 0 +#define LAST_ENC_ROUND ONE_TABLE +#else +#define LAST_ENC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the normal decryption round */ +#define DEC_ROUND FOUR_TABLES +#elif 0 +#define DEC_ROUND ONE_TABLE +#else +#define DEC_ROUND NO_TABLES +#endif + +#if 1 /* set tables for the last decryption round */ +#define LAST_DEC_ROUND FOUR_TABLES +#elif 0 +#define LAST_DEC_ROUND ONE_TABLE +#else +#define LAST_DEC_ROUND NO_TABLES +#endif + +/* + * The decryption key schedule can be speeded up with tables in the same + * way that the round functions can. Include or exclude the following + * defines to set this requirement. + */ +#if 1 +#define KEY_SCHED FOUR_TABLES +#elif 0 +#define KEY_SCHED ONE_TABLE +#else +#define KEY_SCHED NO_TABLES +#endif + +/* ---- END OF USER CONFIGURED OPTIONS ---- */ + +/* VIA ACE support is only available for VC++ and GCC */ + +#if !defined(_MSC_VER) && !defined(__GNUC__) +#if defined(ASSUME_VIA_ACE_PRESENT) +#undef ASSUME_VIA_ACE_PRESENT +#endif +#if defined(USE_VIA_ACE_IF_PRESENT) +#undef USE_VIA_ACE_IF_PRESENT +#endif +#endif + +#if defined(ASSUME_VIA_ACE_PRESENT) && !defined(USE_VIA_ACE_IF_PRESENT) +#define USE_VIA_ACE_IF_PRESENT +#endif + +#if defined(USE_VIA_ACE_IF_PRESENT) && !defined(AES_REV_DKS) +#define AES_REV_DKS +#endif + +/* Assembler support requires the use of platform byte order */ + +#if (defined(ASM_X86_V1C) || defined(ASM_X86_V2C) || defined(ASM_AMD64_C)) && \ + (ALGORITHM_BYTE_ORDER != PLATFORM_BYTE_ORDER) +#undef ALGORITHM_BYTE_ORDER +#define ALGORITHM_BYTE_ORDER PLATFORM_BYTE_ORDER +#endif + +/* + * In this implementation the columns of the state array are each held in + * 32-bit words. The state array can be held in various ways: in an array + * of words, in a number of individual word variables or in a number of + * processor registers. The following define maps a variable name x and + * a column number c to the way the state array variable is to be held. + * The first define below maps the state into an array x[c] whereas the + * second form maps the state into a number of individual variables x0, + * x1, etc. Another form could map individual state columns to machine + * register names. + */ + +#if defined(ARRAYS) +#define s(x, c) x[c] +#else +#define s(x, c) x##c +#endif + +/* + * This implementation provides subroutines for encryption, decryption + * and for setting the three key lengths (separately) for encryption + * and decryption. 
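+ * (With ASM_AMD64_C defined above, the masks that follow reduce to
+ * ENC_KEYING_IN_C | DEC_KEYING_IN_C, i.e. only the key scheduling code
+ * in aeskey.c is compiled in C.)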
Since not all functions are needed, masks are set + * up here to determine which will be implemented in C + */ + +#if !defined(AES_ENCRYPT) +#define EFUNCS_IN_C 0 +#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \ + defined(ASM_X86_V2C) || defined(ASM_AMD64_C) +#define EFUNCS_IN_C ENC_KEYING_IN_C +#elif !defined(ASM_X86_V2) +#define EFUNCS_IN_C (ENCRYPTION_IN_C | ENC_KEYING_IN_C) +#else +#define EFUNCS_IN_C 0 +#endif + +#if !defined(AES_DECRYPT) +#define DFUNCS_IN_C 0 +#elif defined(ASSUME_VIA_ACE_PRESENT) || defined(ASM_X86_V1C) || \ + defined(ASM_X86_V2C) || defined(ASM_AMD64_C) +#define DFUNCS_IN_C DEC_KEYING_IN_C +#elif !defined(ASM_X86_V2) +#define DFUNCS_IN_C (DECRYPTION_IN_C | DEC_KEYING_IN_C) +#else +#define DFUNCS_IN_C 0 +#endif + +#define FUNCS_IN_C (EFUNCS_IN_C | DFUNCS_IN_C) + +/* END OF CONFIGURATION OPTIONS */ + +/* Disable or report errors on some combinations of options */ + +#if ENC_ROUND == NO_TABLES && LAST_ENC_ROUND != NO_TABLES +#undef LAST_ENC_ROUND +#define LAST_ENC_ROUND NO_TABLES +#elif ENC_ROUND == ONE_TABLE && LAST_ENC_ROUND == FOUR_TABLES +#undef LAST_ENC_ROUND +#define LAST_ENC_ROUND ONE_TABLE +#endif + +#if ENC_ROUND == NO_TABLES && ENC_UNROLL != NONE +#undef ENC_UNROLL +#define ENC_UNROLL NONE +#endif + +#if DEC_ROUND == NO_TABLES && LAST_DEC_ROUND != NO_TABLES +#undef LAST_DEC_ROUND +#define LAST_DEC_ROUND NO_TABLES +#elif DEC_ROUND == ONE_TABLE && LAST_DEC_ROUND == FOUR_TABLES +#undef LAST_DEC_ROUND +#define LAST_DEC_ROUND ONE_TABLE +#endif + +#if DEC_ROUND == NO_TABLES && DEC_UNROLL != NONE +#undef DEC_UNROLL +#define DEC_UNROLL NONE +#endif + +#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define aes_sw32 htonl +#elif defined(bswap32) +#define aes_sw32 bswap32 +#elif defined(bswap_32) +#define aes_sw32 bswap_32 +#else +#define brot(x, n) (((uint32_t)(x) << (n)) | ((uint32_t)(x) >> (32 - (n)))) +#define aes_sw32(x) ((brot((x), 8) & 0x00ff00ff) | (brot((x), 24) & 0xff00ff00)) +#endif + + +/* + * upr(x, n): rotates bytes within words by n positions, moving bytes to + * higher index positions with wrap around into low positions + * ups(x, n): moves bytes by n positions to higher index positions in + * words but without wrap around + * bval(x, n): extracts a byte from a word + * + * WARNING: The definitions given here are intended only for use with + * unsigned variables and with shift counts that are compile + * time constants + */ + +#if (ALGORITHM_BYTE_ORDER == IS_LITTLE_ENDIAN) +#define upr(x, n) (((uint32_t)(x) << (8 * (n))) | \ + ((uint32_t)(x) >> (32 - 8 * (n)))) +#define ups(x, n) ((uint32_t)(x) << (8 * (n))) +#define bval(x, n) to_byte((x) >> (8 * (n))) +#define bytes2word(b0, b1, b2, b3) \ + (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | \ + ((uint32_t)(b1) << 8) | (b0)) +#endif + +#if (ALGORITHM_BYTE_ORDER == IS_BIG_ENDIAN) +#define upr(x, n) (((uint32_t)(x) >> (8 * (n))) | \ + ((uint32_t)(x) << (32 - 8 * (n)))) +#define ups(x, n) ((uint32_t)(x) >> (8 * (n))) +#define bval(x, n) to_byte((x) >> (24 - 8 * (n))) +#define bytes2word(b0, b1, b2, b3) \ + (((uint32_t)(b0) << 24) | ((uint32_t)(b1) << 16) | \ + ((uint32_t)(b2) << 8) | (b3)) +#endif + +#if defined(SAFE_IO) +#define word_in(x, c) bytes2word(((const uint8_t *)(x) + 4 * c)[0], \ + ((const uint8_t *)(x) + 4 * c)[1], \ + ((const uint8_t *)(x) + 4 * c)[2], \ + ((const uint8_t *)(x) + 4 * c)[3]) +#define word_out(x, c, v) { ((uint8_t *)(x) + 4 * c)[0] = bval(v, 0); \ + ((uint8_t *)(x) + 4 * c)[1] = bval(v, 1); \ + ((uint8_t *)(x) + 4 * c)[2] = bval(v, 2); \ + ((uint8_t *)(x) + 
4 * c)[3] = bval(v, 3); } +#elif (ALGORITHM_BYTE_ORDER == PLATFORM_BYTE_ORDER) +#define word_in(x, c) (*((uint32_t *)(x) + (c))) +#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = (v)) +#else +#define word_in(x, c) aes_sw32(*((uint32_t *)(x) + (c))) +#define word_out(x, c, v) (*((uint32_t *)(x) + (c)) = aes_sw32(v)) +#endif + +/* the finite field modular polynomial and elements */ + +#define WPOLY 0x011b +#define BPOLY 0x1b + +/* multiply four bytes in GF(2^8) by 'x' {02} in parallel */ + +#define m1 0x80808080 +#define m2 0x7f7f7f7f +#define gf_mulx(x) ((((x) & m2) << 1) ^ ((((x) & m1) >> 7) * BPOLY)) + +/* + * The following defines provide alternative definitions of gf_mulx that might + * give improved performance if a fast 32-bit multiply is not available. Note + * that a temporary variable u needs to be defined where gf_mulx is used. + * + * #define gf_mulx(x) (u = (x) & m1, u |= (u >> 1), ((x) & m2) << 1) ^ \ + * ((u >> 3) | (u >> 6)) + * #define m4 (0x01010101 * BPOLY) + * #define gf_mulx(x) (u = (x) & m1, ((x) & m2) << 1) ^ ((u - (u >> 7)) \ + * & m4) + */ + +/* Work out which tables are needed for the different options */ + +#if defined(ASM_X86_V1C) +#if defined(ENC_ROUND) +#undef ENC_ROUND +#endif +#define ENC_ROUND FOUR_TABLES +#if defined(LAST_ENC_ROUND) +#undef LAST_ENC_ROUND +#endif +#define LAST_ENC_ROUND FOUR_TABLES +#if defined(DEC_ROUND) +#undef DEC_ROUND +#endif +#define DEC_ROUND FOUR_TABLES +#if defined(LAST_DEC_ROUND) +#undef LAST_DEC_ROUND +#endif +#define LAST_DEC_ROUND FOUR_TABLES +#if defined(KEY_SCHED) +#undef KEY_SCHED +#define KEY_SCHED FOUR_TABLES +#endif +#endif + +#if (FUNCS_IN_C & ENCRYPTION_IN_C) || defined(ASM_X86_V1C) +#if ENC_ROUND == ONE_TABLE +#define FT1_SET +#elif ENC_ROUND == FOUR_TABLES +#define FT4_SET +#else +#define SBX_SET +#endif +#if LAST_ENC_ROUND == ONE_TABLE +#define FL1_SET +#elif LAST_ENC_ROUND == FOUR_TABLES +#define FL4_SET +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif + +#if (FUNCS_IN_C & DECRYPTION_IN_C) || defined(ASM_X86_V1C) +#if DEC_ROUND == ONE_TABLE +#define IT1_SET +#elif DEC_ROUND == FOUR_TABLES +#define IT4_SET +#else +#define ISB_SET +#endif +#if LAST_DEC_ROUND == ONE_TABLE +#define IL1_SET +#elif LAST_DEC_ROUND == FOUR_TABLES +#define IL4_SET +#elif !defined(ISB_SET) +#define ISB_SET +#endif +#endif + + +#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \ + defined(ASM_X86_V2C))) +#if ((FUNCS_IN_C & ENC_KEYING_IN_C) || (FUNCS_IN_C & DEC_KEYING_IN_C)) +#if KEY_SCHED == ONE_TABLE +#if !defined(FL1_SET) && !defined(FL4_SET) +#define LS1_SET +#endif +#elif KEY_SCHED == FOUR_TABLES +#if !defined(FL4_SET) +#define LS4_SET +#endif +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif +#if (FUNCS_IN_C & DEC_KEYING_IN_C) +#if KEY_SCHED == ONE_TABLE +#define IM1_SET +#elif KEY_SCHED == FOUR_TABLES +#define IM4_SET +#elif !defined(SBX_SET) +#define SBX_SET +#endif +#endif +#endif + +/* generic definitions of Rijndael macros that use tables */ + +#define no_table(x, box, vf, rf, c) bytes2word(\ + box[bval(vf(x, 0, c), rf(0, c))], \ + box[bval(vf(x, 1, c), rf(1, c))], \ + box[bval(vf(x, 2, c), rf(2, c))], \ + box[bval(vf(x, 3, c), rf(3, c))]) + +#define one_table(x, op, tab, vf, rf, c) \ + (tab[bval(vf(x, 0, c), rf(0, c))] \ + ^ op(tab[bval(vf(x, 1, c), rf(1, c))], 1) \ + ^ op(tab[bval(vf(x, 2, c), rf(2, c))], 2) \ + ^ op(tab[bval(vf(x, 3, c), rf(3, c))], 3)) + +#define four_tables(x, tab, vf, rf, c) \ + (tab[0][bval(vf(x, 0, c), rf(0, c))] \ + ^ tab[1][bval(vf(x, 1, c), rf(1, c))] \ + ^ tab[2][bval(vf(x, 
2, c), rf(2, c))] \ + ^ tab[3][bval(vf(x, 3, c), rf(3, c))]) + +#define vf1(x, r, c) (x) +#define rf1(r, c) (r) +#define rf2(r, c) ((8+r-c)&3) + +/* + * Perform forward and inverse column mix operation on four bytes in long word + * x in parallel. NOTE: x must be a simple variable, NOT an expression in + * these macros. + */ + +#if !(defined(REDUCE_CODE_SIZE) && (defined(ASM_X86_V2) || \ + defined(ASM_X86_V2C))) + +#if defined(FM4_SET) /* not currently used */ +#define fwd_mcol(x) four_tables(x, t_use(f, m), vf1, rf1, 0) +#elif defined(FM1_SET) /* not currently used */ +#define fwd_mcol(x) one_table(x, upr, t_use(f, m), vf1, rf1, 0) +#else +#define dec_fmvars uint32_t g2 +#define fwd_mcol(x) (g2 = gf_mulx(x), g2 ^ upr((x) ^ g2, 3) ^ \ + upr((x), 2) ^ upr((x), 1)) +#endif + +#if defined(IM4_SET) +#define inv_mcol(x) four_tables(x, t_use(i, m), vf1, rf1, 0) +#elif defined(IM1_SET) +#define inv_mcol(x) one_table(x, upr, t_use(i, m), vf1, rf1, 0) +#else +#define dec_imvars uint32_t g2, g4, g9 +#define inv_mcol(x) (g2 = gf_mulx(x), g4 = gf_mulx(g2), g9 = \ + (x) ^ gf_mulx(g4), g4 ^= g9, \ + (x) ^ g2 ^ g4 ^ upr(g2 ^ g9, 3) ^ \ + upr(g4, 2) ^ upr(g9, 1)) +#endif + +#if defined(FL4_SET) +#define ls_box(x, c) four_tables(x, t_use(f, l), vf1, rf2, c) +#elif defined(LS4_SET) +#define ls_box(x, c) four_tables(x, t_use(l, s), vf1, rf2, c) +#elif defined(FL1_SET) +#define ls_box(x, c) one_table(x, upr, t_use(f, l), vf1, rf2, c) +#elif defined(LS1_SET) +#define ls_box(x, c) one_table(x, upr, t_use(l, s), vf1, rf2, c) +#else +#define ls_box(x, c) no_table(x, t_use(s, box), vf1, rf2, c) +#endif + +#endif + +#if defined(ASM_X86_V1C) && defined(AES_DECRYPT) && !defined(ISB_SET) +#define ISB_SET +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* _AESOPT_H */ diff --git a/module/icp/asm-x86_64/aes/aestab.h b/module/icp/asm-x86_64/aes/aestab.h new file mode 100644 index 000000000000..33cdb6c6f9fe --- /dev/null +++ b/module/icp/asm-x86_64/aes/aestab.h @@ -0,0 +1,165 @@ +/* + * --------------------------------------------------------------------------- + * Copyright (c) 1998-2007, Brian Gladman, Worcester, UK. All rights reserved. + * + * LICENSE TERMS + * + * The free distribution and use of this software is allowed (with or without + * changes) provided that: + * + * 1. source code distributions include the above copyright notice, this + * list of conditions and the following disclaimer; + * + * 2. binary distributions include the above copyright notice, this list + * of conditions and the following disclaimer in their documentation; + * + * 3. the name of the copyright holder is not used to endorse products + * built using this software without specific written permission. + * + * DISCLAIMER + * + * This software is provided 'as is' with no explicit or implied warranties + * in respect of its properties, including, but not limited to, correctness + * and/or fitness for purpose. + * --------------------------------------------------------------------------- + * Issue Date: 20/12/2007 + * + * This file contains the code for declaring the tables needed to implement + * AES. The file aesopt.h is assumed to be included before this header file. + * If there are no global variables, the definitions here can be used to put + * the AES tables in a structure so that a pointer can then be added to the + * AES context to pass them to the AES routines that need them. If this + * facility is used, the calling program has to ensure that this pointer is + * managed appropriately. 
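+ * (An added explanatory note, not original text: the t_dec, t_set and
+ * t_use constructions defined below are plain token-pasting macros, so
+ * for instance t_dec(in, it) names the variable t_init and t_dec(f, n)
+ * names the array t_fn.)
+ *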
In particular, the value of the t_dec(in, it) item
+ * in the table structure must be set to zero in order to ensure that the
+ * tables are initialised. In practice, the three code sequences in aeskey.c
+ * that control the calls to aes_init() and the aes_init() routine itself will
+ * have to be changed for a specific implementation. If global variables are
+ * available it will generally be preferable to use them with the precomputed
+ * FIXED_TABLES option that uses static global tables.
+ *
+ * The following defines can be used to control the way the tables
+ * are defined, initialised and used in embedded environments that
+ * require special features for these purposes:
+ *
+ *	the 't_dec' construction is used to declare fixed table arrays
+ *	the 't_set' construction is used to set fixed table values
+ *	the 't_use' construction is used to access fixed table values
+ *
+ * 256 byte tables:
+ *
+ *	t_xxx(s, box) => forward S box
+ *	t_xxx(i, box) => inverse S box
+ *
+ * 256 32-bit word OR 4 x 256 32-bit word tables:
+ *
+ *	t_xxx(f, n) => forward normal round
+ *	t_xxx(f, l) => forward last round
+ *	t_xxx(i, n) => inverse normal round
+ *	t_xxx(i, l) => inverse last round
+ *	t_xxx(l, s) => key schedule table
+ *	t_xxx(i, m) => key schedule table
+ *
+ * Other variables and tables:
+ *
+ *	t_xxx(r, c) => the rcon table
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * 1. Added __cplusplus and _AESTAB_H header guards
+ * 2. Added header file sys/types.h
+ * 3. Removed code defined for _MSC_VER
+ * 4. Changed all variables to "static const"
+ * 5. Changed uint_8t and uint_32t to uint8_t and uint32_t
+ * 6. Cstyled and hdrchk code
+ */
+
+#ifndef _AESTAB_H
+#define _AESTAB_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/types.h>
+
+#define t_dec(m, n) t_##m##n
+#define t_set(m, n) t_##m##n
+#define t_use(m, n) t_##m##n
+
+#if defined(DO_TABLES) && defined(FIXED_TABLES)
+#define d_1(t, n, b, e) static const t n[256] = b(e)
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256] = \
+	{b(e), b(f), b(g), b(h)}
+static const uint32_t t_dec(r, c)[RC_LENGTH] = rc_data(w0);
+#else
+#define d_1(t, n, b, e) static const t n[256]
+#define d_4(t, n, b, e, f, g, h) static const t n[4][256]
+static const uint32_t t_dec(r, c)[RC_LENGTH];
+#endif
+
+#if defined(SBX_SET)
+	d_1(uint8_t, t_dec(s, box), sb_data, h0);
+#endif
+#if defined(ISB_SET)
+	d_1(uint8_t, t_dec(i, box), isb_data, h0);
+#endif
+
+#if defined(FT1_SET)
+	d_1(uint32_t, t_dec(f, n), sb_data, u0);
+#endif
+#if defined(FT4_SET)
+	d_4(uint32_t, t_dec(f, n), sb_data, u0, u1, u2, u3);
+#endif
+
+#if defined(FL1_SET)
+	d_1(uint32_t, t_dec(f, l), sb_data, w0);
+#endif
+#if defined(FL4_SET)
+	d_4(uint32_t, t_dec(f, l), sb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(IT1_SET)
+	d_1(uint32_t, t_dec(i, n), isb_data, v0);
+#endif
+#if defined(IT4_SET)
+	d_4(uint32_t, t_dec(i, n), isb_data, v0, v1, v2, v3);
+#endif
+
+#if defined(IL1_SET)
+	d_1(uint32_t, t_dec(i, l), isb_data, w0);
+#endif
+#if defined(IL4_SET)
+	d_4(uint32_t, t_dec(i, l), isb_data, w0, w1, w2, w3);
+#endif
+
+#if defined(LS1_SET)
+#if defined(FL1_SET)
+#undef LS1_SET
+#else
+	d_1(uint32_t, t_dec(l, s), sb_data, w0);
+#endif
+#endif
+
+#if defined(LS4_SET)
+#if defined(FL4_SET)
+#undef LS4_SET
+#else
+	d_4(uint32_t, t_dec(l, s), sb_data, w0, w1, w2, w3);
+#endif
+#endif
+
+#if defined(IM1_SET)
+	d_1(uint32_t, t_dec(i, m), mm_data, v0);
+#endif
+#if defined(IM4_SET)
+	d_4(uint32_t, t_dec(i, m), mm_data, v0, v1, v2, v3);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /*
_AESTAB_H */
diff --git a/module/icp/asm-x86_64/aes/aestab2.h b/module/icp/asm-x86_64/aes/aestab2.h
new file mode 100644
index 000000000000..eb13f72b10d8
--- /dev/null
+++ b/module/icp/asm-x86_64/aes/aestab2.h
@@ -0,0 +1,594 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _AESTAB2_H
+#define _AESTAB2_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*
+ * To create this file for OpenSolaris:
+ * 1. Compile and run tablegen.c, from aes-src-04-03-08.zip,
+ *	after defining ASM_AMD64_C
+ * 2. mv aestab2.c aestab2.h
+ * 3. Add __cplusplus and _AESTAB2_H header guards
+ * 4. Add #include <sys/types.h>
+ * 5. Change "uint_32t" to "uint32_t"
+ * 6. Change all variables to "static const"
+ * 7. Cstyle and hdrchk this file
+ */
+
+#include <sys/types.h>
+
+static const uint32_t t_rc[RC_LENGTH] =
+{
+	0x00000001, 0x00000002, 0x00000004, 0x00000008,
+	0x00000010, 0x00000020, 0x00000040, 0x00000080,
+	0x0000001b, 0x00000036
+};
+
+static const uint32_t t_ls[4][256] =
+{
+	{
+		0x00000063, 0x0000007c, 0x00000077, 0x0000007b,
+		0x000000f2, 0x0000006b, 0x0000006f, 0x000000c5,
+		0x00000030, 0x00000001, 0x00000067, 0x0000002b,
+		0x000000fe, 0x000000d7, 0x000000ab, 0x00000076,
+		0x000000ca, 0x00000082, 0x000000c9, 0x0000007d,
+		0x000000fa, 0x00000059, 0x00000047, 0x000000f0,
+		0x000000ad, 0x000000d4, 0x000000a2, 0x000000af,
+		0x0000009c, 0x000000a4, 0x00000072, 0x000000c0,
+		0x000000b7, 0x000000fd, 0x00000093, 0x00000026,
+		0x00000036, 0x0000003f, 0x000000f7, 0x000000cc,
+		0x00000034, 0x000000a5, 0x000000e5, 0x000000f1,
+		0x00000071, 0x000000d8, 0x00000031, 0x00000015,
+		0x00000004, 0x000000c7, 0x00000023, 0x000000c3,
+		0x00000018, 0x00000096, 0x00000005, 0x0000009a,
+		0x00000007, 0x00000012, 0x00000080, 0x000000e2,
+		0x000000eb, 0x00000027, 0x000000b2, 0x00000075,
+		0x00000009, 0x00000083, 0x0000002c, 0x0000001a,
+		0x0000001b, 0x0000006e, 0x0000005a, 0x000000a0,
+		0x00000052, 0x0000003b, 0x000000d6, 0x000000b3,
+		0x00000029, 0x000000e3, 0x0000002f, 0x00000084,
+		0x00000053, 0x000000d1, 0x00000000, 0x000000ed,
+		0x00000020, 0x000000fc, 0x000000b1, 0x0000005b,
+		0x0000006a, 0x000000cb, 0x000000be, 0x00000039,
+		0x0000004a, 0x0000004c, 0x00000058, 0x000000cf,
+		0x000000d0, 0x000000ef, 0x000000aa, 0x000000fb,
+		0x00000043, 0x0000004d, 0x00000033, 0x00000085,
+		0x00000045, 0x000000f9, 0x00000002, 0x0000007f,
+		0x00000050, 0x0000003c, 0x0000009f, 0x000000a8,
+		0x00000051, 0x000000a3, 0x00000040, 0x0000008f,
+		0x00000092, 0x0000009d, 0x00000038, 0x000000f5,
+		0x000000bc, 0x000000b6, 0x000000da, 0x00000021,
+		0x00000010, 0x000000ff, 0x000000f3, 0x000000d2,
+		0x000000cd, 0x0000000c, 0x00000013, 0x000000ec,
+		0x0000005f, 0x00000097, 0x00000044,
0x00000017, + 0x000000c4, 0x000000a7, 0x0000007e, 0x0000003d, + 0x00000064, 0x0000005d, 0x00000019, 0x00000073, + 0x00000060, 0x00000081, 0x0000004f, 0x000000dc, + 0x00000022, 0x0000002a, 0x00000090, 0x00000088, + 0x00000046, 0x000000ee, 0x000000b8, 0x00000014, + 0x000000de, 0x0000005e, 0x0000000b, 0x000000db, + 0x000000e0, 0x00000032, 0x0000003a, 0x0000000a, + 0x00000049, 0x00000006, 0x00000024, 0x0000005c, + 0x000000c2, 0x000000d3, 0x000000ac, 0x00000062, + 0x00000091, 0x00000095, 0x000000e4, 0x00000079, + 0x000000e7, 0x000000c8, 0x00000037, 0x0000006d, + 0x0000008d, 0x000000d5, 0x0000004e, 0x000000a9, + 0x0000006c, 0x00000056, 0x000000f4, 0x000000ea, + 0x00000065, 0x0000007a, 0x000000ae, 0x00000008, + 0x000000ba, 0x00000078, 0x00000025, 0x0000002e, + 0x0000001c, 0x000000a6, 0x000000b4, 0x000000c6, + 0x000000e8, 0x000000dd, 0x00000074, 0x0000001f, + 0x0000004b, 0x000000bd, 0x0000008b, 0x0000008a, + 0x00000070, 0x0000003e, 0x000000b5, 0x00000066, + 0x00000048, 0x00000003, 0x000000f6, 0x0000000e, + 0x00000061, 0x00000035, 0x00000057, 0x000000b9, + 0x00000086, 0x000000c1, 0x0000001d, 0x0000009e, + 0x000000e1, 0x000000f8, 0x00000098, 0x00000011, + 0x00000069, 0x000000d9, 0x0000008e, 0x00000094, + 0x0000009b, 0x0000001e, 0x00000087, 0x000000e9, + 0x000000ce, 0x00000055, 0x00000028, 0x000000df, + 0x0000008c, 0x000000a1, 0x00000089, 0x0000000d, + 0x000000bf, 0x000000e6, 0x00000042, 0x00000068, + 0x00000041, 0x00000099, 0x0000002d, 0x0000000f, + 0x000000b0, 0x00000054, 0x000000bb, 0x00000016 + }, + { + 0x00006300, 0x00007c00, 0x00007700, 0x00007b00, + 0x0000f200, 0x00006b00, 0x00006f00, 0x0000c500, + 0x00003000, 0x00000100, 0x00006700, 0x00002b00, + 0x0000fe00, 0x0000d700, 0x0000ab00, 0x00007600, + 0x0000ca00, 0x00008200, 0x0000c900, 0x00007d00, + 0x0000fa00, 0x00005900, 0x00004700, 0x0000f000, + 0x0000ad00, 0x0000d400, 0x0000a200, 0x0000af00, + 0x00009c00, 0x0000a400, 0x00007200, 0x0000c000, + 0x0000b700, 0x0000fd00, 0x00009300, 0x00002600, + 0x00003600, 0x00003f00, 0x0000f700, 0x0000cc00, + 0x00003400, 0x0000a500, 0x0000e500, 0x0000f100, + 0x00007100, 0x0000d800, 0x00003100, 0x00001500, + 0x00000400, 0x0000c700, 0x00002300, 0x0000c300, + 0x00001800, 0x00009600, 0x00000500, 0x00009a00, + 0x00000700, 0x00001200, 0x00008000, 0x0000e200, + 0x0000eb00, 0x00002700, 0x0000b200, 0x00007500, + 0x00000900, 0x00008300, 0x00002c00, 0x00001a00, + 0x00001b00, 0x00006e00, 0x00005a00, 0x0000a000, + 0x00005200, 0x00003b00, 0x0000d600, 0x0000b300, + 0x00002900, 0x0000e300, 0x00002f00, 0x00008400, + 0x00005300, 0x0000d100, 0x00000000, 0x0000ed00, + 0x00002000, 0x0000fc00, 0x0000b100, 0x00005b00, + 0x00006a00, 0x0000cb00, 0x0000be00, 0x00003900, + 0x00004a00, 0x00004c00, 0x00005800, 0x0000cf00, + 0x0000d000, 0x0000ef00, 0x0000aa00, 0x0000fb00, + 0x00004300, 0x00004d00, 0x00003300, 0x00008500, + 0x00004500, 0x0000f900, 0x00000200, 0x00007f00, + 0x00005000, 0x00003c00, 0x00009f00, 0x0000a800, + 0x00005100, 0x0000a300, 0x00004000, 0x00008f00, + 0x00009200, 0x00009d00, 0x00003800, 0x0000f500, + 0x0000bc00, 0x0000b600, 0x0000da00, 0x00002100, + 0x00001000, 0x0000ff00, 0x0000f300, 0x0000d200, + 0x0000cd00, 0x00000c00, 0x00001300, 0x0000ec00, + 0x00005f00, 0x00009700, 0x00004400, 0x00001700, + 0x0000c400, 0x0000a700, 0x00007e00, 0x00003d00, + 0x00006400, 0x00005d00, 0x00001900, 0x00007300, + 0x00006000, 0x00008100, 0x00004f00, 0x0000dc00, + 0x00002200, 0x00002a00, 0x00009000, 0x00008800, + 0x00004600, 0x0000ee00, 0x0000b800, 0x00001400, + 0x0000de00, 0x00005e00, 0x00000b00, 0x0000db00, + 0x0000e000, 0x00003200, 
0x00003a00, 0x00000a00, + 0x00004900, 0x00000600, 0x00002400, 0x00005c00, + 0x0000c200, 0x0000d300, 0x0000ac00, 0x00006200, + 0x00009100, 0x00009500, 0x0000e400, 0x00007900, + 0x0000e700, 0x0000c800, 0x00003700, 0x00006d00, + 0x00008d00, 0x0000d500, 0x00004e00, 0x0000a900, + 0x00006c00, 0x00005600, 0x0000f400, 0x0000ea00, + 0x00006500, 0x00007a00, 0x0000ae00, 0x00000800, + 0x0000ba00, 0x00007800, 0x00002500, 0x00002e00, + 0x00001c00, 0x0000a600, 0x0000b400, 0x0000c600, + 0x0000e800, 0x0000dd00, 0x00007400, 0x00001f00, + 0x00004b00, 0x0000bd00, 0x00008b00, 0x00008a00, + 0x00007000, 0x00003e00, 0x0000b500, 0x00006600, + 0x00004800, 0x00000300, 0x0000f600, 0x00000e00, + 0x00006100, 0x00003500, 0x00005700, 0x0000b900, + 0x00008600, 0x0000c100, 0x00001d00, 0x00009e00, + 0x0000e100, 0x0000f800, 0x00009800, 0x00001100, + 0x00006900, 0x0000d900, 0x00008e00, 0x00009400, + 0x00009b00, 0x00001e00, 0x00008700, 0x0000e900, + 0x0000ce00, 0x00005500, 0x00002800, 0x0000df00, + 0x00008c00, 0x0000a100, 0x00008900, 0x00000d00, + 0x0000bf00, 0x0000e600, 0x00004200, 0x00006800, + 0x00004100, 0x00009900, 0x00002d00, 0x00000f00, + 0x0000b000, 0x00005400, 0x0000bb00, 0x00001600 + }, + { + 0x00630000, 0x007c0000, 0x00770000, 0x007b0000, + 0x00f20000, 0x006b0000, 0x006f0000, 0x00c50000, + 0x00300000, 0x00010000, 0x00670000, 0x002b0000, + 0x00fe0000, 0x00d70000, 0x00ab0000, 0x00760000, + 0x00ca0000, 0x00820000, 0x00c90000, 0x007d0000, + 0x00fa0000, 0x00590000, 0x00470000, 0x00f00000, + 0x00ad0000, 0x00d40000, 0x00a20000, 0x00af0000, + 0x009c0000, 0x00a40000, 0x00720000, 0x00c00000, + 0x00b70000, 0x00fd0000, 0x00930000, 0x00260000, + 0x00360000, 0x003f0000, 0x00f70000, 0x00cc0000, + 0x00340000, 0x00a50000, 0x00e50000, 0x00f10000, + 0x00710000, 0x00d80000, 0x00310000, 0x00150000, + 0x00040000, 0x00c70000, 0x00230000, 0x00c30000, + 0x00180000, 0x00960000, 0x00050000, 0x009a0000, + 0x00070000, 0x00120000, 0x00800000, 0x00e20000, + 0x00eb0000, 0x00270000, 0x00b20000, 0x00750000, + 0x00090000, 0x00830000, 0x002c0000, 0x001a0000, + 0x001b0000, 0x006e0000, 0x005a0000, 0x00a00000, + 0x00520000, 0x003b0000, 0x00d60000, 0x00b30000, + 0x00290000, 0x00e30000, 0x002f0000, 0x00840000, + 0x00530000, 0x00d10000, 0x00000000, 0x00ed0000, + 0x00200000, 0x00fc0000, 0x00b10000, 0x005b0000, + 0x006a0000, 0x00cb0000, 0x00be0000, 0x00390000, + 0x004a0000, 0x004c0000, 0x00580000, 0x00cf0000, + 0x00d00000, 0x00ef0000, 0x00aa0000, 0x00fb0000, + 0x00430000, 0x004d0000, 0x00330000, 0x00850000, + 0x00450000, 0x00f90000, 0x00020000, 0x007f0000, + 0x00500000, 0x003c0000, 0x009f0000, 0x00a80000, + 0x00510000, 0x00a30000, 0x00400000, 0x008f0000, + 0x00920000, 0x009d0000, 0x00380000, 0x00f50000, + 0x00bc0000, 0x00b60000, 0x00da0000, 0x00210000, + 0x00100000, 0x00ff0000, 0x00f30000, 0x00d20000, + 0x00cd0000, 0x000c0000, 0x00130000, 0x00ec0000, + 0x005f0000, 0x00970000, 0x00440000, 0x00170000, + 0x00c40000, 0x00a70000, 0x007e0000, 0x003d0000, + 0x00640000, 0x005d0000, 0x00190000, 0x00730000, + 0x00600000, 0x00810000, 0x004f0000, 0x00dc0000, + 0x00220000, 0x002a0000, 0x00900000, 0x00880000, + 0x00460000, 0x00ee0000, 0x00b80000, 0x00140000, + 0x00de0000, 0x005e0000, 0x000b0000, 0x00db0000, + 0x00e00000, 0x00320000, 0x003a0000, 0x000a0000, + 0x00490000, 0x00060000, 0x00240000, 0x005c0000, + 0x00c20000, 0x00d30000, 0x00ac0000, 0x00620000, + 0x00910000, 0x00950000, 0x00e40000, 0x00790000, + 0x00e70000, 0x00c80000, 0x00370000, 0x006d0000, + 0x008d0000, 0x00d50000, 0x004e0000, 0x00a90000, + 0x006c0000, 0x00560000, 0x00f40000, 0x00ea0000, + 0x00650000, 
0x007a0000, 0x00ae0000, 0x00080000, + 0x00ba0000, 0x00780000, 0x00250000, 0x002e0000, + 0x001c0000, 0x00a60000, 0x00b40000, 0x00c60000, + 0x00e80000, 0x00dd0000, 0x00740000, 0x001f0000, + 0x004b0000, 0x00bd0000, 0x008b0000, 0x008a0000, + 0x00700000, 0x003e0000, 0x00b50000, 0x00660000, + 0x00480000, 0x00030000, 0x00f60000, 0x000e0000, + 0x00610000, 0x00350000, 0x00570000, 0x00b90000, + 0x00860000, 0x00c10000, 0x001d0000, 0x009e0000, + 0x00e10000, 0x00f80000, 0x00980000, 0x00110000, + 0x00690000, 0x00d90000, 0x008e0000, 0x00940000, + 0x009b0000, 0x001e0000, 0x00870000, 0x00e90000, + 0x00ce0000, 0x00550000, 0x00280000, 0x00df0000, + 0x008c0000, 0x00a10000, 0x00890000, 0x000d0000, + 0x00bf0000, 0x00e60000, 0x00420000, 0x00680000, + 0x00410000, 0x00990000, 0x002d0000, 0x000f0000, + 0x00b00000, 0x00540000, 0x00bb0000, 0x00160000 + }, + { + 0x63000000, 0x7c000000, 0x77000000, 0x7b000000, + 0xf2000000, 0x6b000000, 0x6f000000, 0xc5000000, + 0x30000000, 0x01000000, 0x67000000, 0x2b000000, + 0xfe000000, 0xd7000000, 0xab000000, 0x76000000, + 0xca000000, 0x82000000, 0xc9000000, 0x7d000000, + 0xfa000000, 0x59000000, 0x47000000, 0xf0000000, + 0xad000000, 0xd4000000, 0xa2000000, 0xaf000000, + 0x9c000000, 0xa4000000, 0x72000000, 0xc0000000, + 0xb7000000, 0xfd000000, 0x93000000, 0x26000000, + 0x36000000, 0x3f000000, 0xf7000000, 0xcc000000, + 0x34000000, 0xa5000000, 0xe5000000, 0xf1000000, + 0x71000000, 0xd8000000, 0x31000000, 0x15000000, + 0x04000000, 0xc7000000, 0x23000000, 0xc3000000, + 0x18000000, 0x96000000, 0x05000000, 0x9a000000, + 0x07000000, 0x12000000, 0x80000000, 0xe2000000, + 0xeb000000, 0x27000000, 0xb2000000, 0x75000000, + 0x09000000, 0x83000000, 0x2c000000, 0x1a000000, + 0x1b000000, 0x6e000000, 0x5a000000, 0xa0000000, + 0x52000000, 0x3b000000, 0xd6000000, 0xb3000000, + 0x29000000, 0xe3000000, 0x2f000000, 0x84000000, + 0x53000000, 0xd1000000, 0x00000000, 0xed000000, + 0x20000000, 0xfc000000, 0xb1000000, 0x5b000000, + 0x6a000000, 0xcb000000, 0xbe000000, 0x39000000, + 0x4a000000, 0x4c000000, 0x58000000, 0xcf000000, + 0xd0000000, 0xef000000, 0xaa000000, 0xfb000000, + 0x43000000, 0x4d000000, 0x33000000, 0x85000000, + 0x45000000, 0xf9000000, 0x02000000, 0x7f000000, + 0x50000000, 0x3c000000, 0x9f000000, 0xa8000000, + 0x51000000, 0xa3000000, 0x40000000, 0x8f000000, + 0x92000000, 0x9d000000, 0x38000000, 0xf5000000, + 0xbc000000, 0xb6000000, 0xda000000, 0x21000000, + 0x10000000, 0xff000000, 0xf3000000, 0xd2000000, + 0xcd000000, 0x0c000000, 0x13000000, 0xec000000, + 0x5f000000, 0x97000000, 0x44000000, 0x17000000, + 0xc4000000, 0xa7000000, 0x7e000000, 0x3d000000, + 0x64000000, 0x5d000000, 0x19000000, 0x73000000, + 0x60000000, 0x81000000, 0x4f000000, 0xdc000000, + 0x22000000, 0x2a000000, 0x90000000, 0x88000000, + 0x46000000, 0xee000000, 0xb8000000, 0x14000000, + 0xde000000, 0x5e000000, 0x0b000000, 0xdb000000, + 0xe0000000, 0x32000000, 0x3a000000, 0x0a000000, + 0x49000000, 0x06000000, 0x24000000, 0x5c000000, + 0xc2000000, 0xd3000000, 0xac000000, 0x62000000, + 0x91000000, 0x95000000, 0xe4000000, 0x79000000, + 0xe7000000, 0xc8000000, 0x37000000, 0x6d000000, + 0x8d000000, 0xd5000000, 0x4e000000, 0xa9000000, + 0x6c000000, 0x56000000, 0xf4000000, 0xea000000, + 0x65000000, 0x7a000000, 0xae000000, 0x08000000, + 0xba000000, 0x78000000, 0x25000000, 0x2e000000, + 0x1c000000, 0xa6000000, 0xb4000000, 0xc6000000, + 0xe8000000, 0xdd000000, 0x74000000, 0x1f000000, + 0x4b000000, 0xbd000000, 0x8b000000, 0x8a000000, + 0x70000000, 0x3e000000, 0xb5000000, 0x66000000, + 0x48000000, 0x03000000, 0xf6000000, 0x0e000000, + 
0x61000000, 0x35000000, 0x57000000, 0xb9000000, + 0x86000000, 0xc1000000, 0x1d000000, 0x9e000000, + 0xe1000000, 0xf8000000, 0x98000000, 0x11000000, + 0x69000000, 0xd9000000, 0x8e000000, 0x94000000, + 0x9b000000, 0x1e000000, 0x87000000, 0xe9000000, + 0xce000000, 0x55000000, 0x28000000, 0xdf000000, + 0x8c000000, 0xa1000000, 0x89000000, 0x0d000000, + 0xbf000000, 0xe6000000, 0x42000000, 0x68000000, + 0x41000000, 0x99000000, 0x2d000000, 0x0f000000, + 0xb0000000, 0x54000000, 0xbb000000, 0x16000000 + } +}; + +static const uint32_t t_im[4][256] = +{ + { + 0x00000000, 0x0b0d090e, 0x161a121c, 0x1d171b12, + 0x2c342438, 0x27392d36, 0x3a2e3624, 0x31233f2a, + 0x58684870, 0x5365417e, 0x4e725a6c, 0x457f5362, + 0x745c6c48, 0x7f516546, 0x62467e54, 0x694b775a, + 0xb0d090e0, 0xbbdd99ee, 0xa6ca82fc, 0xadc78bf2, + 0x9ce4b4d8, 0x97e9bdd6, 0x8afea6c4, 0x81f3afca, + 0xe8b8d890, 0xe3b5d19e, 0xfea2ca8c, 0xf5afc382, + 0xc48cfca8, 0xcf81f5a6, 0xd296eeb4, 0xd99be7ba, + 0x7bbb3bdb, 0x70b632d5, 0x6da129c7, 0x66ac20c9, + 0x578f1fe3, 0x5c8216ed, 0x41950dff, 0x4a9804f1, + 0x23d373ab, 0x28de7aa5, 0x35c961b7, 0x3ec468b9, + 0x0fe75793, 0x04ea5e9d, 0x19fd458f, 0x12f04c81, + 0xcb6bab3b, 0xc066a235, 0xdd71b927, 0xd67cb029, + 0xe75f8f03, 0xec52860d, 0xf1459d1f, 0xfa489411, + 0x9303e34b, 0x980eea45, 0x8519f157, 0x8e14f859, + 0xbf37c773, 0xb43ace7d, 0xa92dd56f, 0xa220dc61, + 0xf66d76ad, 0xfd607fa3, 0xe07764b1, 0xeb7a6dbf, + 0xda595295, 0xd1545b9b, 0xcc434089, 0xc74e4987, + 0xae053edd, 0xa50837d3, 0xb81f2cc1, 0xb31225cf, + 0x82311ae5, 0x893c13eb, 0x942b08f9, 0x9f2601f7, + 0x46bde64d, 0x4db0ef43, 0x50a7f451, 0x5baafd5f, + 0x6a89c275, 0x6184cb7b, 0x7c93d069, 0x779ed967, + 0x1ed5ae3d, 0x15d8a733, 0x08cfbc21, 0x03c2b52f, + 0x32e18a05, 0x39ec830b, 0x24fb9819, 0x2ff69117, + 0x8dd64d76, 0x86db4478, 0x9bcc5f6a, 0x90c15664, + 0xa1e2694e, 0xaaef6040, 0xb7f87b52, 0xbcf5725c, + 0xd5be0506, 0xdeb30c08, 0xc3a4171a, 0xc8a91e14, + 0xf98a213e, 0xf2872830, 0xef903322, 0xe49d3a2c, + 0x3d06dd96, 0x360bd498, 0x2b1ccf8a, 0x2011c684, + 0x1132f9ae, 0x1a3ff0a0, 0x0728ebb2, 0x0c25e2bc, + 0x656e95e6, 0x6e639ce8, 0x737487fa, 0x78798ef4, + 0x495ab1de, 0x4257b8d0, 0x5f40a3c2, 0x544daacc, + 0xf7daec41, 0xfcd7e54f, 0xe1c0fe5d, 0xeacdf753, + 0xdbeec879, 0xd0e3c177, 0xcdf4da65, 0xc6f9d36b, + 0xafb2a431, 0xa4bfad3f, 0xb9a8b62d, 0xb2a5bf23, + 0x83868009, 0x888b8907, 0x959c9215, 0x9e919b1b, + 0x470a7ca1, 0x4c0775af, 0x51106ebd, 0x5a1d67b3, + 0x6b3e5899, 0x60335197, 0x7d244a85, 0x7629438b, + 0x1f6234d1, 0x146f3ddf, 0x097826cd, 0x02752fc3, + 0x335610e9, 0x385b19e7, 0x254c02f5, 0x2e410bfb, + 0x8c61d79a, 0x876cde94, 0x9a7bc586, 0x9176cc88, + 0xa055f3a2, 0xab58faac, 0xb64fe1be, 0xbd42e8b0, + 0xd4099fea, 0xdf0496e4, 0xc2138df6, 0xc91e84f8, + 0xf83dbbd2, 0xf330b2dc, 0xee27a9ce, 0xe52aa0c0, + 0x3cb1477a, 0x37bc4e74, 0x2aab5566, 0x21a65c68, + 0x10856342, 0x1b886a4c, 0x069f715e, 0x0d927850, + 0x64d90f0a, 0x6fd40604, 0x72c31d16, 0x79ce1418, + 0x48ed2b32, 0x43e0223c, 0x5ef7392e, 0x55fa3020, + 0x01b79aec, 0x0aba93e2, 0x17ad88f0, 0x1ca081fe, + 0x2d83bed4, 0x268eb7da, 0x3b99acc8, 0x3094a5c6, + 0x59dfd29c, 0x52d2db92, 0x4fc5c080, 0x44c8c98e, + 0x75ebf6a4, 0x7ee6ffaa, 0x63f1e4b8, 0x68fcedb6, + 0xb1670a0c, 0xba6a0302, 0xa77d1810, 0xac70111e, + 0x9d532e34, 0x965e273a, 0x8b493c28, 0x80443526, + 0xe90f427c, 0xe2024b72, 0xff155060, 0xf418596e, + 0xc53b6644, 0xce366f4a, 0xd3217458, 0xd82c7d56, + 0x7a0ca137, 0x7101a839, 0x6c16b32b, 0x671bba25, + 0x5638850f, 0x5d358c01, 0x40229713, 0x4b2f9e1d, + 0x2264e947, 0x2969e049, 0x347efb5b, 0x3f73f255, + 0x0e50cd7f, 0x055dc471, 0x184adf63, 0x1347d66d, + 
0xcadc31d7, 0xc1d138d9, 0xdcc623cb, 0xd7cb2ac5, + 0xe6e815ef, 0xede51ce1, 0xf0f207f3, 0xfbff0efd, + 0x92b479a7, 0x99b970a9, 0x84ae6bbb, 0x8fa362b5, + 0xbe805d9f, 0xb58d5491, 0xa89a4f83, 0xa397468d + }, + { + 0x00000000, 0x0d090e0b, 0x1a121c16, 0x171b121d, + 0x3424382c, 0x392d3627, 0x2e36243a, 0x233f2a31, + 0x68487058, 0x65417e53, 0x725a6c4e, 0x7f536245, + 0x5c6c4874, 0x5165467f, 0x467e5462, 0x4b775a69, + 0xd090e0b0, 0xdd99eebb, 0xca82fca6, 0xc78bf2ad, + 0xe4b4d89c, 0xe9bdd697, 0xfea6c48a, 0xf3afca81, + 0xb8d890e8, 0xb5d19ee3, 0xa2ca8cfe, 0xafc382f5, + 0x8cfca8c4, 0x81f5a6cf, 0x96eeb4d2, 0x9be7bad9, + 0xbb3bdb7b, 0xb632d570, 0xa129c76d, 0xac20c966, + 0x8f1fe357, 0x8216ed5c, 0x950dff41, 0x9804f14a, + 0xd373ab23, 0xde7aa528, 0xc961b735, 0xc468b93e, + 0xe757930f, 0xea5e9d04, 0xfd458f19, 0xf04c8112, + 0x6bab3bcb, 0x66a235c0, 0x71b927dd, 0x7cb029d6, + 0x5f8f03e7, 0x52860dec, 0x459d1ff1, 0x489411fa, + 0x03e34b93, 0x0eea4598, 0x19f15785, 0x14f8598e, + 0x37c773bf, 0x3ace7db4, 0x2dd56fa9, 0x20dc61a2, + 0x6d76adf6, 0x607fa3fd, 0x7764b1e0, 0x7a6dbfeb, + 0x595295da, 0x545b9bd1, 0x434089cc, 0x4e4987c7, + 0x053eddae, 0x0837d3a5, 0x1f2cc1b8, 0x1225cfb3, + 0x311ae582, 0x3c13eb89, 0x2b08f994, 0x2601f79f, + 0xbde64d46, 0xb0ef434d, 0xa7f45150, 0xaafd5f5b, + 0x89c2756a, 0x84cb7b61, 0x93d0697c, 0x9ed96777, + 0xd5ae3d1e, 0xd8a73315, 0xcfbc2108, 0xc2b52f03, + 0xe18a0532, 0xec830b39, 0xfb981924, 0xf691172f, + 0xd64d768d, 0xdb447886, 0xcc5f6a9b, 0xc1566490, + 0xe2694ea1, 0xef6040aa, 0xf87b52b7, 0xf5725cbc, + 0xbe0506d5, 0xb30c08de, 0xa4171ac3, 0xa91e14c8, + 0x8a213ef9, 0x872830f2, 0x903322ef, 0x9d3a2ce4, + 0x06dd963d, 0x0bd49836, 0x1ccf8a2b, 0x11c68420, + 0x32f9ae11, 0x3ff0a01a, 0x28ebb207, 0x25e2bc0c, + 0x6e95e665, 0x639ce86e, 0x7487fa73, 0x798ef478, + 0x5ab1de49, 0x57b8d042, 0x40a3c25f, 0x4daacc54, + 0xdaec41f7, 0xd7e54ffc, 0xc0fe5de1, 0xcdf753ea, + 0xeec879db, 0xe3c177d0, 0xf4da65cd, 0xf9d36bc6, + 0xb2a431af, 0xbfad3fa4, 0xa8b62db9, 0xa5bf23b2, + 0x86800983, 0x8b890788, 0x9c921595, 0x919b1b9e, + 0x0a7ca147, 0x0775af4c, 0x106ebd51, 0x1d67b35a, + 0x3e58996b, 0x33519760, 0x244a857d, 0x29438b76, + 0x6234d11f, 0x6f3ddf14, 0x7826cd09, 0x752fc302, + 0x5610e933, 0x5b19e738, 0x4c02f525, 0x410bfb2e, + 0x61d79a8c, 0x6cde9487, 0x7bc5869a, 0x76cc8891, + 0x55f3a2a0, 0x58faacab, 0x4fe1beb6, 0x42e8b0bd, + 0x099fead4, 0x0496e4df, 0x138df6c2, 0x1e84f8c9, + 0x3dbbd2f8, 0x30b2dcf3, 0x27a9ceee, 0x2aa0c0e5, + 0xb1477a3c, 0xbc4e7437, 0xab55662a, 0xa65c6821, + 0x85634210, 0x886a4c1b, 0x9f715e06, 0x9278500d, + 0xd90f0a64, 0xd406046f, 0xc31d1672, 0xce141879, + 0xed2b3248, 0xe0223c43, 0xf7392e5e, 0xfa302055, + 0xb79aec01, 0xba93e20a, 0xad88f017, 0xa081fe1c, + 0x83bed42d, 0x8eb7da26, 0x99acc83b, 0x94a5c630, + 0xdfd29c59, 0xd2db9252, 0xc5c0804f, 0xc8c98e44, + 0xebf6a475, 0xe6ffaa7e, 0xf1e4b863, 0xfcedb668, + 0x670a0cb1, 0x6a0302ba, 0x7d1810a7, 0x70111eac, + 0x532e349d, 0x5e273a96, 0x493c288b, 0x44352680, + 0x0f427ce9, 0x024b72e2, 0x155060ff, 0x18596ef4, + 0x3b6644c5, 0x366f4ace, 0x217458d3, 0x2c7d56d8, + 0x0ca1377a, 0x01a83971, 0x16b32b6c, 0x1bba2567, + 0x38850f56, 0x358c015d, 0x22971340, 0x2f9e1d4b, + 0x64e94722, 0x69e04929, 0x7efb5b34, 0x73f2553f, + 0x50cd7f0e, 0x5dc47105, 0x4adf6318, 0x47d66d13, + 0xdc31d7ca, 0xd138d9c1, 0xc623cbdc, 0xcb2ac5d7, + 0xe815efe6, 0xe51ce1ed, 0xf207f3f0, 0xff0efdfb, + 0xb479a792, 0xb970a999, 0xae6bbb84, 0xa362b58f, + 0x805d9fbe, 0x8d5491b5, 0x9a4f83a8, 0x97468da3 + }, + { + 0x00000000, 0x090e0b0d, 0x121c161a, 0x1b121d17, + 0x24382c34, 0x2d362739, 0x36243a2e, 0x3f2a3123, + 0x48705868, 0x417e5365, 0x5a6c4e72, 
0x5362457f, + 0x6c48745c, 0x65467f51, 0x7e546246, 0x775a694b, + 0x90e0b0d0, 0x99eebbdd, 0x82fca6ca, 0x8bf2adc7, + 0xb4d89ce4, 0xbdd697e9, 0xa6c48afe, 0xafca81f3, + 0xd890e8b8, 0xd19ee3b5, 0xca8cfea2, 0xc382f5af, + 0xfca8c48c, 0xf5a6cf81, 0xeeb4d296, 0xe7bad99b, + 0x3bdb7bbb, 0x32d570b6, 0x29c76da1, 0x20c966ac, + 0x1fe3578f, 0x16ed5c82, 0x0dff4195, 0x04f14a98, + 0x73ab23d3, 0x7aa528de, 0x61b735c9, 0x68b93ec4, + 0x57930fe7, 0x5e9d04ea, 0x458f19fd, 0x4c8112f0, + 0xab3bcb6b, 0xa235c066, 0xb927dd71, 0xb029d67c, + 0x8f03e75f, 0x860dec52, 0x9d1ff145, 0x9411fa48, + 0xe34b9303, 0xea45980e, 0xf1578519, 0xf8598e14, + 0xc773bf37, 0xce7db43a, 0xd56fa92d, 0xdc61a220, + 0x76adf66d, 0x7fa3fd60, 0x64b1e077, 0x6dbfeb7a, + 0x5295da59, 0x5b9bd154, 0x4089cc43, 0x4987c74e, + 0x3eddae05, 0x37d3a508, 0x2cc1b81f, 0x25cfb312, + 0x1ae58231, 0x13eb893c, 0x08f9942b, 0x01f79f26, + 0xe64d46bd, 0xef434db0, 0xf45150a7, 0xfd5f5baa, + 0xc2756a89, 0xcb7b6184, 0xd0697c93, 0xd967779e, + 0xae3d1ed5, 0xa73315d8, 0xbc2108cf, 0xb52f03c2, + 0x8a0532e1, 0x830b39ec, 0x981924fb, 0x91172ff6, + 0x4d768dd6, 0x447886db, 0x5f6a9bcc, 0x566490c1, + 0x694ea1e2, 0x6040aaef, 0x7b52b7f8, 0x725cbcf5, + 0x0506d5be, 0x0c08deb3, 0x171ac3a4, 0x1e14c8a9, + 0x213ef98a, 0x2830f287, 0x3322ef90, 0x3a2ce49d, + 0xdd963d06, 0xd498360b, 0xcf8a2b1c, 0xc6842011, + 0xf9ae1132, 0xf0a01a3f, 0xebb20728, 0xe2bc0c25, + 0x95e6656e, 0x9ce86e63, 0x87fa7374, 0x8ef47879, + 0xb1de495a, 0xb8d04257, 0xa3c25f40, 0xaacc544d, + 0xec41f7da, 0xe54ffcd7, 0xfe5de1c0, 0xf753eacd, + 0xc879dbee, 0xc177d0e3, 0xda65cdf4, 0xd36bc6f9, + 0xa431afb2, 0xad3fa4bf, 0xb62db9a8, 0xbf23b2a5, + 0x80098386, 0x8907888b, 0x9215959c, 0x9b1b9e91, + 0x7ca1470a, 0x75af4c07, 0x6ebd5110, 0x67b35a1d, + 0x58996b3e, 0x51976033, 0x4a857d24, 0x438b7629, + 0x34d11f62, 0x3ddf146f, 0x26cd0978, 0x2fc30275, + 0x10e93356, 0x19e7385b, 0x02f5254c, 0x0bfb2e41, + 0xd79a8c61, 0xde94876c, 0xc5869a7b, 0xcc889176, + 0xf3a2a055, 0xfaacab58, 0xe1beb64f, 0xe8b0bd42, + 0x9fead409, 0x96e4df04, 0x8df6c213, 0x84f8c91e, + 0xbbd2f83d, 0xb2dcf330, 0xa9ceee27, 0xa0c0e52a, + 0x477a3cb1, 0x4e7437bc, 0x55662aab, 0x5c6821a6, + 0x63421085, 0x6a4c1b88, 0x715e069f, 0x78500d92, + 0x0f0a64d9, 0x06046fd4, 0x1d1672c3, 0x141879ce, + 0x2b3248ed, 0x223c43e0, 0x392e5ef7, 0x302055fa, + 0x9aec01b7, 0x93e20aba, 0x88f017ad, 0x81fe1ca0, + 0xbed42d83, 0xb7da268e, 0xacc83b99, 0xa5c63094, + 0xd29c59df, 0xdb9252d2, 0xc0804fc5, 0xc98e44c8, + 0xf6a475eb, 0xffaa7ee6, 0xe4b863f1, 0xedb668fc, + 0x0a0cb167, 0x0302ba6a, 0x1810a77d, 0x111eac70, + 0x2e349d53, 0x273a965e, 0x3c288b49, 0x35268044, + 0x427ce90f, 0x4b72e202, 0x5060ff15, 0x596ef418, + 0x6644c53b, 0x6f4ace36, 0x7458d321, 0x7d56d82c, + 0xa1377a0c, 0xa8397101, 0xb32b6c16, 0xba25671b, + 0x850f5638, 0x8c015d35, 0x97134022, 0x9e1d4b2f, + 0xe9472264, 0xe0492969, 0xfb5b347e, 0xf2553f73, + 0xcd7f0e50, 0xc471055d, 0xdf63184a, 0xd66d1347, + 0x31d7cadc, 0x38d9c1d1, 0x23cbdcc6, 0x2ac5d7cb, + 0x15efe6e8, 0x1ce1ede5, 0x07f3f0f2, 0x0efdfbff, + 0x79a792b4, 0x70a999b9, 0x6bbb84ae, 0x62b58fa3, + 0x5d9fbe80, 0x5491b58d, 0x4f83a89a, 0x468da397 + }, + { + 0x00000000, 0x0e0b0d09, 0x1c161a12, 0x121d171b, + 0x382c3424, 0x3627392d, 0x243a2e36, 0x2a31233f, + 0x70586848, 0x7e536541, 0x6c4e725a, 0x62457f53, + 0x48745c6c, 0x467f5165, 0x5462467e, 0x5a694b77, + 0xe0b0d090, 0xeebbdd99, 0xfca6ca82, 0xf2adc78b, + 0xd89ce4b4, 0xd697e9bd, 0xc48afea6, 0xca81f3af, + 0x90e8b8d8, 0x9ee3b5d1, 0x8cfea2ca, 0x82f5afc3, + 0xa8c48cfc, 0xa6cf81f5, 0xb4d296ee, 0xbad99be7, + 0xdb7bbb3b, 0xd570b632, 0xc76da129, 0xc966ac20, + 0xe3578f1f, 0xed5c8216, 
0xff41950d, 0xf14a9804, + 0xab23d373, 0xa528de7a, 0xb735c961, 0xb93ec468, + 0x930fe757, 0x9d04ea5e, 0x8f19fd45, 0x8112f04c, + 0x3bcb6bab, 0x35c066a2, 0x27dd71b9, 0x29d67cb0, + 0x03e75f8f, 0x0dec5286, 0x1ff1459d, 0x11fa4894, + 0x4b9303e3, 0x45980eea, 0x578519f1, 0x598e14f8, + 0x73bf37c7, 0x7db43ace, 0x6fa92dd5, 0x61a220dc, + 0xadf66d76, 0xa3fd607f, 0xb1e07764, 0xbfeb7a6d, + 0x95da5952, 0x9bd1545b, 0x89cc4340, 0x87c74e49, + 0xddae053e, 0xd3a50837, 0xc1b81f2c, 0xcfb31225, + 0xe582311a, 0xeb893c13, 0xf9942b08, 0xf79f2601, + 0x4d46bde6, 0x434db0ef, 0x5150a7f4, 0x5f5baafd, + 0x756a89c2, 0x7b6184cb, 0x697c93d0, 0x67779ed9, + 0x3d1ed5ae, 0x3315d8a7, 0x2108cfbc, 0x2f03c2b5, + 0x0532e18a, 0x0b39ec83, 0x1924fb98, 0x172ff691, + 0x768dd64d, 0x7886db44, 0x6a9bcc5f, 0x6490c156, + 0x4ea1e269, 0x40aaef60, 0x52b7f87b, 0x5cbcf572, + 0x06d5be05, 0x08deb30c, 0x1ac3a417, 0x14c8a91e, + 0x3ef98a21, 0x30f28728, 0x22ef9033, 0x2ce49d3a, + 0x963d06dd, 0x98360bd4, 0x8a2b1ccf, 0x842011c6, + 0xae1132f9, 0xa01a3ff0, 0xb20728eb, 0xbc0c25e2, + 0xe6656e95, 0xe86e639c, 0xfa737487, 0xf478798e, + 0xde495ab1, 0xd04257b8, 0xc25f40a3, 0xcc544daa, + 0x41f7daec, 0x4ffcd7e5, 0x5de1c0fe, 0x53eacdf7, + 0x79dbeec8, 0x77d0e3c1, 0x65cdf4da, 0x6bc6f9d3, + 0x31afb2a4, 0x3fa4bfad, 0x2db9a8b6, 0x23b2a5bf, + 0x09838680, 0x07888b89, 0x15959c92, 0x1b9e919b, + 0xa1470a7c, 0xaf4c0775, 0xbd51106e, 0xb35a1d67, + 0x996b3e58, 0x97603351, 0x857d244a, 0x8b762943, + 0xd11f6234, 0xdf146f3d, 0xcd097826, 0xc302752f, + 0xe9335610, 0xe7385b19, 0xf5254c02, 0xfb2e410b, + 0x9a8c61d7, 0x94876cde, 0x869a7bc5, 0x889176cc, + 0xa2a055f3, 0xacab58fa, 0xbeb64fe1, 0xb0bd42e8, + 0xead4099f, 0xe4df0496, 0xf6c2138d, 0xf8c91e84, + 0xd2f83dbb, 0xdcf330b2, 0xceee27a9, 0xc0e52aa0, + 0x7a3cb147, 0x7437bc4e, 0x662aab55, 0x6821a65c, + 0x42108563, 0x4c1b886a, 0x5e069f71, 0x500d9278, + 0x0a64d90f, 0x046fd406, 0x1672c31d, 0x1879ce14, + 0x3248ed2b, 0x3c43e022, 0x2e5ef739, 0x2055fa30, + 0xec01b79a, 0xe20aba93, 0xf017ad88, 0xfe1ca081, + 0xd42d83be, 0xda268eb7, 0xc83b99ac, 0xc63094a5, + 0x9c59dfd2, 0x9252d2db, 0x804fc5c0, 0x8e44c8c9, + 0xa475ebf6, 0xaa7ee6ff, 0xb863f1e4, 0xb668fced, + 0x0cb1670a, 0x02ba6a03, 0x10a77d18, 0x1eac7011, + 0x349d532e, 0x3a965e27, 0x288b493c, 0x26804435, + 0x7ce90f42, 0x72e2024b, 0x60ff1550, 0x6ef41859, + 0x44c53b66, 0x4ace366f, 0x58d32174, 0x56d82c7d, + 0x377a0ca1, 0x397101a8, 0x2b6c16b3, 0x25671bba, + 0x0f563885, 0x015d358c, 0x13402297, 0x1d4b2f9e, + 0x472264e9, 0x492969e0, 0x5b347efb, 0x553f73f2, + 0x7f0e50cd, 0x71055dc4, 0x63184adf, 0x6d1347d6, + 0xd7cadc31, 0xd9c1d138, 0xcbdcc623, 0xc5d7cb2a, + 0xefe6e815, 0xe1ede51c, 0xf3f0f207, 0xfdfbff0e, + 0xa792b479, 0xa999b970, 0xbb84ae6b, 0xb58fa362, + 0x9fbe805d, 0x91b58d54, 0x83a89a4f, 0x8da39746 + } +}; + +#ifdef __cplusplus +} +#endif + +#endif /* _AESTAB2_H */ diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams new file mode 100644 index 000000000000..0de1883dc81b --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams @@ -0,0 +1,36 @@ +Copyright (c) 2006-2017, CRYPTOGAMS by +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain copyright notices, + this list of conditions and the following disclaimer. 
+ + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials + provided with the distribution. + + * Neither the name of the CRYPTOGAMS nor the names of its + copyright holder and contributors may be used to endorse or + promote products derived from this software without specific + prior written permission. + +ALTERNATIVELY, provided that this notice is retained in full, this +product may be distributed under the terms of the GNU General Public +License (GPL), in which case the provisions of the GPL apply INSTEAD OF +those given above. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip new file mode 100644 index 000000000000..6184759c8b74 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.cryptogams.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl new file mode 100644 index 000000000000..49cc83d2ee29 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl @@ -0,0 +1,177 @@ + + Apache License + Version 2.0, January 2004 + https://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS diff --git a/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip new file mode 100644 index 000000000000..6184759c8b74 --- /dev/null +++ b/module/icp/asm-x86_64/modes/THIRDPARTYLICENSE.openssl.descrip @@ -0,0 +1 @@ +PORTIONS OF GCM and GHASH FUNCTIONALITY diff --git a/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S new file mode 100644 index 000000000000..ed9f660fce5b --- /dev/null +++ b/module/icp/asm-x86_64/modes/aesni-gcm-x86_64.S @@ -0,0 +1,1245 @@ +# Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# +# AES-NI-CTR+GHASH stitch. +# +# February 2013 +# +# OpenSSL GCM implementation is organized in such way that its +# performance is rather close to the sum of its streamed components, +# in the context parallelized AES-NI CTR and modulo-scheduled +# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation +# was observed to perform significantly better than the sum of the +# components on contemporary CPUs, the effort was deemed impossible to +# justify. This module is based on combination of Intel submissions, +# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max +# Locktyukhin of Intel Corp. who verified that it reduces shuffles +# pressure with notable relative improvement, achieving 1.0 cycle per +# byte processed with 128-bit key on Haswell processor, 0.74 - on +# Broadwell, 0.63 - on Skylake... 
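	# Added annotation (not in the original source): the movbeq loads
+	# above fetch the next input qwords byte-swapped for the GHASH half
+	# of the stitch, avoiding extra byte shuffles (see the MOVBE note
+	# in the header comment).
+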
[Mentioned results are raw profiled +# measurements for favourable packet size, one divisible by 96. +# Applications using the EVP interface will observe a few percent +# worse performance.] +# +# Knights Landing processes 1 byte in 1.25 cycles (measured with EVP). +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest +# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf + +# Generated once from +# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/aesni-gcm-x86_64.pl +# and modified for ICP. Modification are kept at a bare minimum to ease later +# upstream merges. + +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) + +.extern gcm_avx_can_use_movbe + +.text + +#ifdef HAVE_MOVBE +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + 
vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + cmpl $14,%ebp // ICP does not zero key schedule. 
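+ // With the ICP round counts noted above (10/12/14 rather than OpenSSL's + // 9/11/13), AES-192 branches to .Lenc_tail here; only AES-256 falls + // through and runs the final two rounds.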
+ jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +#endif /* ifdef HAVE_MOVBE */ + +.type _aesni_ctr32_ghash_no_movbe_6x,@function +.align 32 +_aesni_ctr32_ghash_no_movbe_6x: + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x_nmb + +.align 32 +.Loop6x_nmb: + addl $100663296,%ebx + jc .Lhandle_ctr32_nmb + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32_nmb: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor 
%xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movq 88(%r14),%r13 + bswapq %r13 + vaesenc %xmm15,%xmm11,%xmm11 + movq 80(%r14),%r12 + bswapq %r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movq 72(%r14),%r13 + bswapq %r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movq 64(%r14),%r12 + bswapq %r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movq 56(%r14),%r13 + bswapq %r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movq 48(%r14),%r12 + bswapq %r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movq 40(%r14),%r13 + bswapq %r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movq 32(%r14),%r12 + bswapq %r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 
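+ // In this no-MOVBE variant, each movq/bswapq pair below stands in for the + // movbeq that _aesni_ctr32_ghash_6x uses on CPUs with the MOVBE extension.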
+ movq 24(%r14),%r13 + bswapq %r13 + vaesenc %xmm15,%xmm11,%xmm11 + movq 16(%r14),%r12 + bswapq %r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movq 8(%r14),%r13 + bswapq %r13 + vaesenc %xmm1,%xmm13,%xmm13 + movq 0(%r14),%r12 + bswapq %r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $12,%ebp // ICP uses 10,12,14 not 9,11,13 for rounds. + jb .Lenc_tail_nmb + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + cmpl $14,%ebp // ICP does not zero key schedule. + jb .Lenc_tail_nmb + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail_nmb + +.align 32 +.Lhandle_ctr32_nmb: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32_nmb + +.align 32 +.Lenc_tail_nmb: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%r10 + subq $0x6,%rdx + jc .L6x_done_nmb + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + 
vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x_nmb + +.L6x_done_nmb: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + .byte 0xf3,0xc3 +.size _aesni_ctr32_ghash_no_movbe_6x,.-_aesni_ctr32_ghash_no_movbe_6x + +.globl aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r9),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32+32(%r9),%r9 + movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds. + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + leaq (%rdi),%r14 + vmovdqu 64(%rdi),%xmm4 + leaq -192(%rdi,%rdx,1),%r15 + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %r10,%r10 + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + +#ifdef HAVE_MOVBE +#ifdef _KERNEL + testl $1,gcm_avx_can_use_movbe(%rip) +#else + testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) +#endif + jz 1f + call _aesni_ctr32_ghash_6x + jmp 2f +1: +#endif + call _aesni_ctr32_ghash_no_movbe_6x +2: + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_dec_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -2(%rbp),%r13 // ICP uses 10,12,14 not 9,11,13 for rounds. 
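+ // %r13 = rounds - 2: .Loop_ctr32 below applies round keys 1..rounds-2; the + // next-to-last vaesenc and the vaesenclast (whose round key is pre-XORed + // into the input blocks) are issued after the loop.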
+ vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + .byte 0xf3,0xc3 +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + xorq %r10,%r10 + cmpq $288,%rdx + jb .Lgcm_enc_abort + + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 504-128(%rcx),%ebp // ICP has a larger offset for rounds.
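+ // The 0xf80 masking below compares the in-page offsets of the stack frame + // and the expanded key schedule; if the stack sits less than 768 bytes + // above the key schedule, %rsp is lowered to avoid aliasing with it + // (see .Lenc_no_key_aliasing).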
+ + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp +.Lenc_no_key_aliasing: + + leaq (%rsi),%r14 + leaq -192(%rsi,%rdx,1),%r15 + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + vmovdqu (%r9),%xmm8 + leaq 32+32(%r9),%r9 + subq $12,%rdx + movq $192,%r10 + vpshufb %xmm0,%xmm8,%xmm8 + +#ifdef HAVE_MOVBE +#ifdef _KERNEL + testl $1,gcm_avx_can_use_movbe(%rip) +#else + testl $1,gcm_avx_can_use_movbe@GOTPCREL(%rip) +#endif + jz 1f + call _aesni_ctr32_ghash_6x + jmp 2f +1: +#endif + call _aesni_ctr32_ghash_no_movbe_6x +2: + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor 
%xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,-64(%r9) + + vzeroupper + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lgcm_enc_abort: + movq %r10,%rax + .byte 0xf3,0xc3 +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt + +/* Some utility routines */ + +/* + * clear all fpu registers + * void clear_fpu_regs_avx(void); + */ +.globl clear_fpu_regs_avx +.type clear_fpu_regs_avx,@function +.align 32 +clear_fpu_regs_avx: + vzeroall + ret +.size clear_fpu_regs_avx,.-clear_fpu_regs_avx + +/* + * void gcm_xor_avx(const uint8_t *src, uint8_t *dst); + * + * XORs one pair of unaligned 128-bit blocks from `src' and `dst' and + * stores the result at `dst'. The XOR is performed using FPU registers, + * so make sure FPU state is saved when running this in the kernel. + */ +.globl gcm_xor_avx +.type gcm_xor_avx,@function +.align 32 +gcm_xor_avx: + movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 + pxor %xmm1, %xmm0 + movdqu %xmm0, (%rsi) + ret +.size gcm_xor_avx,.-gcm_xor_avx + +/* + * Toggle a boolean_t value atomically and return the new value. 
+ * boolean_t atomic_toggle_boolean_nv(volatile boolean_t *); + */ +.globl atomic_toggle_boolean_nv +.type atomic_toggle_boolean_nv,@function +.align 32 +atomic_toggle_boolean_nv: + xorl %eax, %eax + lock + xorl $1, (%rdi) + jz 1f + movl $1, %eax +1: + ret +.size atomic_toggle_boolean_nv,.-atomic_toggle_boolean_nv + +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S new file mode 100644 index 000000000000..59edc4c8d56c --- /dev/null +++ b/module/icp/asm-x86_64/modes/gcm_pclmulqdq.S @@ -0,0 +1,254 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009 Intel Corporation + * All Rights Reserved. + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Accelerated GHASH implementation with Intel PCLMULQDQ-NI + * instructions. This file contains an accelerated + * Galois Field Multiplication implementation. + * + * PCLMULQDQ is used to accelerate the most time-consuming part of GHASH, + * carry-less multiplication. More information about PCLMULQDQ can be + * found at: + * http://software.intel.com/en-us/articles/ + * carry-less-multiplication-and-its-usage-for-computing-the-gcm-mode/ + * + */ + +/* + * ==================================================================== + * OpenSolaris OS modifications + * + * This source originates as file galois_hash_asm.c from + * Intel Corporation dated September 21, 2009. + * + * This OpenSolaris version has these major changes from the original source: + * + * 1. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, lint(1B) guards, and a dummy C function + * definition for lint. + * + * 2. Formatted code, added comments, and added #includes and #defines. + * + * 3. If bit CR0.TS is set, clear and set the TS bit, after and before + * calling kpreempt_disable() and kpreempt_enable(). 
+ * If the TS bit is not set, save and restore %xmm registers at the beginning + * and end of function calls (%xmm* registers are not saved and restored + * during kernel thread preemption). + * + * 4. Removed code to perform hashing. This is already done with C macro + * GHASH in gcm.c. For better performance, this removed code should be + * reintegrated in the future to replace the C GHASH macro. + * + * 5. Added code to byte swap 16-byte input and output. + * + * 6. Folded in comments from the original C source with embedded assembly + * (SB_w_shift_xor.c) + * + * 7. Renamed function and reordered parameters to match OpenSolaris: + * Intel interface: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * OpenSolaris OS interface: + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * ==================================================================== + */ + + +#if defined(lint) || defined(__lint) /* lint */ + +#include <sys/types.h> + +/* ARGSUSED */ +void +gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res) { +} + +#elif defined(HAVE_PCLMULQDQ) /* guard by instruction set */ + +#define _ASM +#include <sys/asm_linkage.h> + +/* + * Use this mask to byte-swap a 16-byte integer with the pshufb instruction + */ + +// static uint8_t byte_swap16_mask[] = { +// 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 }; +.data +.align XMM_ALIGN +.Lbyte_swap16_mask: + .byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + + +/* + * void gcm_mul_pclmulqdq(uint64_t *x_in, uint64_t *y, uint64_t *res); + * + * Perform a carry-less multiplication (that is, use XOR instead of the + * multiply operator) on P1 and P2 and place the result in P3. + * + * Byte swap the input and the output. + * + * Note: x_in, y, and res all point to a block of 16-byte numbers + * (an array of two 64-bit integers). + * + * Note2: For kernel code, caller is responsible for ensuring + * kpreempt_disable() has been called. This is because %xmm registers are + * not saved/restored. Clear and set the CR0.TS bit on entry and exit, + * respectively, if TS is set on entry. Otherwise, if TS is not set, + * save and restore %xmm registers on the stack.
+ * + * Note3: Original Intel definition: + * void galois_hash_asm(unsigned char *hk, unsigned char *s, + * unsigned char *d, int length) + * + * Note4: Register/parameter mapping: + * Intel: + * Parameter 1: %rcx (copied to %xmm0) hk or x_in + * Parameter 2: %rdx (copied to %xmm1) s or y + * Parameter 3: %rdi (result) d or res + * OpenSolaris: + * Parameter 1: %rdi (copied to %xmm0) x_in + * Parameter 2: %rsi (copied to %xmm1) y + * Parameter 3: %rdx (result) res + */ + +ENTRY_NP(gcm_mul_pclmulqdq) + // + // Copy Parameters + // + movdqu (%rdi), %xmm0 // P1 + movdqu (%rsi), %xmm1 // P2 + + // + // Byte swap 16-byte input + // + lea .Lbyte_swap16_mask(%rip), %rax + movups (%rax), %xmm10 + pshufb %xmm10, %xmm0 + pshufb %xmm10, %xmm1 + + + // + // Multiply with the hash key + // + movdqu %xmm0, %xmm3 + pclmulqdq $0, %xmm1, %xmm3 // xmm3 holds a0*b0 + + movdqu %xmm0, %xmm4 + pclmulqdq $16, %xmm1, %xmm4 // xmm4 holds a0*b1 + + movdqu %xmm0, %xmm5 + pclmulqdq $1, %xmm1, %xmm5 // xmm5 holds a1*b0 + movdqu %xmm0, %xmm6 + pclmulqdq $17, %xmm1, %xmm6 // xmm6 holds a1*b1 + + pxor %xmm5, %xmm4 // xmm4 holds a0*b1 + a1*b0 + + movdqu %xmm4, %xmm5 // move the contents of xmm4 to xmm5 + psrldq $8, %xmm4 // shift xmm4 right by 64 bits + pslldq $8, %xmm5 // shift xmm5 left by 64 bits + pxor %xmm5, %xmm3 + pxor %xmm4, %xmm6 // Register pair <xmm6:xmm3> holds the result + // of the carry-less multiplication of + // xmm0 by xmm1. + + // We shift the result of the multiplication by one bit position + // to the left to cope with the fact that the bits are reversed. + movdqu %xmm3, %xmm7 + movdqu %xmm6, %xmm8 + pslld $1, %xmm3 + pslld $1, %xmm6 + psrld $31, %xmm7 + psrld $31, %xmm8 + movdqu %xmm7, %xmm9 + pslldq $4, %xmm8 + pslldq $4, %xmm7 + psrldq $12, %xmm9 + por %xmm7, %xmm3 + por %xmm8, %xmm6 + por %xmm9, %xmm6 + + // + // First phase of the reduction + // + // Move xmm3 into xmm7, xmm8, xmm9 in order to perform the shifts + // independently. + movdqu %xmm3, %xmm7 + movdqu %xmm3, %xmm8 + movdqu %xmm3, %xmm9 + pslld $31, %xmm7 // packed left shift by 31 bits + pslld $30, %xmm8 // packed left shift by 30 bits + pslld $25, %xmm9 // packed left shift by 25 bits + pxor %xmm8, %xmm7 // xor the shifted versions + pxor %xmm9, %xmm7 + movdqu %xmm7, %xmm8 + pslldq $12, %xmm7 + psrldq $4, %xmm8 + pxor %xmm7, %xmm3 // first phase of the reduction complete + + // + // Second phase of the reduction + // + // Make 3 copies of xmm3 in xmm2, xmm4, xmm5 for doing these + // shift operations. + movdqu %xmm3, %xmm2 + movdqu %xmm3, %xmm4 + movdqu %xmm3, %xmm5 + psrld $1, %xmm2 // packed right shift by 1 bit + psrld $2, %xmm4 // packed right shift by 2 bits + psrld $7, %xmm5 // packed right shift by 7 bits + pxor %xmm4, %xmm2 // xor the shifted versions + pxor %xmm5, %xmm2 + pxor %xmm8, %xmm2 + pxor %xmm2, %xmm3 + pxor %xmm3, %xmm6 // the result is in xmm6 + + // + // Byte swap 16-byte result + // + pshufb %xmm10, %xmm6 // %xmm10 has the swap mask + + // + // Store the result + // + movdqu %xmm6, (%rdx) // P3 + + + // + // Return + // + ret + SET_SIZE(gcm_mul_pclmulqdq) + +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/modes/ghash-x86_64.S b/module/icp/asm-x86_64/modes/ghash-x86_64.S new file mode 100644 index 000000000000..90cc36b43a78 --- /dev/null +++ b/module/icp/asm-x86_64/modes/ghash-x86_64.S @@ -0,0 +1,714 @@ +# Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved.
+# +# Licensed under the Apache License 2.0 (the "License"). You may not use +# this file except in compliance with the License. You can obtain a copy +# in the file LICENSE in the source distribution or at +# https://www.openssl.org/source/license.html + +# +# ==================================================================== +# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +# project. The module is, however, dual licensed under OpenSSL and +# CRYPTOGAMS licenses depending on where you obtain it. For further +# details see http://www.openssl.org/~appro/cryptogams/. +# ==================================================================== +# +# March, June 2010 +# +# The module implements the "4-bit" GCM GHASH function and the underlying +# single multiplication operation in GF(2^128). "4-bit" means that +# it uses a 256-byte per-key table [+128 bytes shared table]. The GHASH +# function features a so-called "528B" variant utilizing an additional +# 256+16 bytes of per-key storage [+512 bytes shared table]. +# Performance results are for this streamed GHASH subroutine and are +# expressed in cycles per processed byte, less is better: +# +# gcc 3.4.x(*) assembler +# +# P4 28.6 14.0 +100% +# Opteron 19.3 7.7 +150% +# Core2 17.8 8.1(**) +120% +# Atom 31.6 16.8 +88% +# VIA Nano 21.8 10.1 +115% +# +# (*) comparison is not completely fair, because C results are +# for the vanilla "256B" implementation, while assembler results +# are for "528B";-) +# (**) it's a mystery [to me] why the Core2 result is not the same as for +# Opteron; + +# May 2010 +# +# Add PCLMULQDQ version performing at 2.02 cycles per processed byte. +# See ghash-x86.pl for background information and details about coding +# techniques. +# +# Special thanks to David Woodhouse <dwmw2@infradead.org> for providing +# access to a Westmere-based system on behalf of Intel Open Source +# Technology Centre. + +# December 2012 +# +# Overhaul: aggregate Karatsuba post-processing, improve ILP in +# reduction_alg9, increase reduction aggregate factor to 4x. As for +# the latter: ghash-x86.pl discusses why it makes less sense to +# increase the aggregate factor there. Then why increase it here? The +# critical path consists of 3 independent pclmulqdq instructions, +# Karatsuba post-processing and reduction. "On top" of this we lay down +# aggregated multiplication operations, triplets of independent +# pclmulqdq's. As the issue rate for pclmulqdq is limited, it makes less +# sense to aggregate more multiplications than it takes to perform the +# remaining non-multiplication operations. 2x is a near-optimal +# coefficient for contemporary Intel CPUs (hence the modest improvement +# coefficient), but not for Bulldozer. The latter is because its logical +# SIMD operations are twice as slow as Intel's, so the critical path is +# longer. A CPU with a higher pclmulqdq issue rate would also benefit +# from a higher aggregate factor... +# +# Westmere 1.78(+13%) +# Sandy Bridge 1.80(+8%) +# Ivy Bridge 1.80(+7%) +# Haswell 0.55(+93%) (if system doesn't support AVX) +# Broadwell 0.45(+110%)(if system doesn't support AVX) +# Skylake 0.44(+110%)(if system doesn't support AVX) +# Bulldozer 1.49(+27%) +# Silvermont 2.88(+13%) +# Knights L 2.12(-) (if system doesn't support AVX) +# Goldmont 1.08(+24%) + +# March 2013 +# +# ... the 8x aggregate factor AVX code path uses the reduction algorithm +# suggested by Shay Gueron [1]. Even though contemporary AVX-capable +# CPUs such as Sandy and Ivy Bridge can execute it, the code performs +# sub-optimally in comparison to the above-mentioned version.
But thanks +# to Ilya Albrekht and Max Locktyukhin of Intel Corp. we knew that +# it performs at 0.41 cycles per byte on a Haswell processor, at +# 0.29 on Broadwell, and at 0.36 on Skylake. +# +# Knights Landing achieves 1.09 cpb. +# +# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest + +# Generated once from +# https://github.com/openssl/openssl/blob/5ffc3324/crypto/modes/asm/ghash-x86_64.pl +# and modified for ICP. Modifications are kept to a bare minimum to ease later +# upstream merges. + +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) + +.text + +.globl gcm_gmult_clmul +.type gcm_gmult_clmul,@function +.align 16 +gcm_gmult_clmul: +.cfi_startproc +.L_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa .Lbswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 +.byte 102,15,56,0,197 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 +.byte 102,15,58,68,194,0 +.byte 102,15,58,68,202,17 +.byte 102,15,58,68,220,0 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.byte 102,15,56,0,197 + movdqu %xmm0,(%rdi) + .byte 0xf3,0xc3 +.cfi_endproc +.size gcm_gmult_clmul,.-gcm_gmult_clmul + +.globl gcm_init_htab_avx +.type gcm_init_htab_avx,@function +.align 32 +gcm_init_htab_avx: +.cfi_startproc + vzeroupper + + vmovdqu (%rsi),%xmm2 + // KCF/ICP stores H in network byte order with the hi qword first + // so we need to swap all bytes, not the 2 qwords.
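+ // gcm_init_htab_avx precomputes powers of the hash key H (up to H^8, + // matching the eight blocks gcm_ghash_avx aggregates per iteration) along + // with XOR-combined qword halves for its Karatsuba multiplications. + // GHASH itself is the GF(2^128) recurrence Xi = (Xi-1 ^ Ci) * H, reduced + // modulo x^128 + x^7 + x^2 + x + 1; .L0x1c2_polynomial below is the + // bit-reflected form of that reduction polynomial.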
+ vmovdqu .Lbswap_mask(%rip),%xmm4 + vpshufb %xmm4,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 +.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size gcm_init_htab_avx,.-gcm_init_htab_avx + +.globl gcm_gmult_avx +.type gcm_gmult_avx,@function +.align 32 +gcm_gmult_avx: +.cfi_startproc + jmp .L_gmult_clmul +.cfi_endproc +.size gcm_gmult_avx,.-gcm_gmult_avx +.globl gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: +.cfi_startproc + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 
64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps 
%xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq 
%xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + .byte 0xf3,0xc3 +.cfi_endproc +.size gcm_ghash_avx,.-gcm_ghash_avx +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.L7_mask_poly: +.long 7,0,450,0 +.align 64 +.type .Lrem_4bit,@object +.Lrem_4bit: +.long 0,0,0,471859200,0,943718400,0,610271232 +.long 0,1887436800,0,1822425088,0,1220542464,0,1423966208 +.long 0,3774873600,0,4246732800,0,3644850176,0,3311403008 +.long 0,2441084928,0,2376073216,0,2847932416,0,3051356160 +.type .Lrem_8bit,@object +.Lrem_8bit: +.value 0x0000,0x01C2,0x0384,0x0246,0x0708,0x06CA,0x048C,0x054E +.value 0x0E10,0x0FD2,0x0D94,0x0C56,0x0918,0x08DA,0x0A9C,0x0B5E +.value 0x1C20,0x1DE2,0x1FA4,0x1E66,0x1B28,0x1AEA,0x18AC,0x196E +.value 0x1230,0x13F2,0x11B4,0x1076,0x1538,0x14FA,0x16BC,0x177E +.value 0x3840,0x3982,0x3BC4,0x3A06,0x3F48,0x3E8A,0x3CCC,0x3D0E +.value 0x3650,0x3792,0x35D4,0x3416,0x3158,0x309A,0x32DC,0x331E +.value 0x2460,0x25A2,0x27E4,0x2626,0x2368,0x22AA,0x20EC,0x212E +.value 0x2A70,0x2BB2,0x29F4,0x2836,0x2D78,0x2CBA,0x2EFC,0x2F3E +.value 0x7080,0x7142,0x7304,0x72C6,0x7788,0x764A,0x740C,0x75CE +.value 0x7E90,0x7F52,0x7D14,0x7CD6,0x7998,0x785A,0x7A1C,0x7BDE +.value 0x6CA0,0x6D62,0x6F24,0x6EE6,0x6BA8,0x6A6A,0x682C,0x69EE +.value 0x62B0,0x6372,0x6134,0x60F6,0x65B8,0x647A,0x663C,0x67FE +.value 0x48C0,0x4902,0x4B44,0x4A86,0x4FC8,0x4E0A,0x4C4C,0x4D8E +.value 0x46D0,0x4712,0x4554,0x4496,0x41D8,0x401A,0x425C,0x439E +.value 0x54E0,0x5522,0x5764,0x56A6,0x53E8,0x522A,0x506C,0x51AE +.value 0x5AF0,0x5B32,0x5974,0x58B6,0x5DF8,0x5C3A,0x5E7C,0x5FBE +.value 0xE100,0xE0C2,0xE284,0xE346,0xE608,0xE7CA,0xE58C,0xE44E +.value 0xEF10,0xEED2,0xEC94,0xED56,0xE818,0xE9DA,0xEB9C,0xEA5E +.value 0xFD20,0xFCE2,0xFEA4,0xFF66,0xFA28,0xFBEA,0xF9AC,0xF86E +.value 
0xF330,0xF2F2,0xF0B4,0xF176,0xF438,0xF5FA,0xF7BC,0xF67E +.value 0xD940,0xD882,0xDAC4,0xDB06,0xDE48,0xDF8A,0xDDCC,0xDC0E +.value 0xD750,0xD692,0xD4D4,0xD516,0xD058,0xD19A,0xD3DC,0xD21E +.value 0xC560,0xC4A2,0xC6E4,0xC726,0xC268,0xC3AA,0xC1EC,0xC02E +.value 0xCB70,0xCAB2,0xC8F4,0xC936,0xCC78,0xCDBA,0xCFFC,0xCE3E +.value 0x9180,0x9042,0x9204,0x93C6,0x9688,0x974A,0x950C,0x94CE +.value 0x9F90,0x9E52,0x9C14,0x9DD6,0x9898,0x995A,0x9B1C,0x9ADE +.value 0x8DA0,0x8C62,0x8E24,0x8FE6,0x8AA8,0x8B6A,0x892C,0x88EE +.value 0x83B0,0x8272,0x8034,0x81F6,0x84B8,0x857A,0x873C,0x86FE +.value 0xA9C0,0xA802,0xAA44,0xAB86,0xAEC8,0xAF0A,0xAD4C,0xAC8E +.value 0xA7D0,0xA612,0xA454,0xA596,0xA0D8,0xA11A,0xA35C,0xA29E +.value 0xB5E0,0xB422,0xB664,0xB7A6,0xB2E8,0xB32A,0xB16C,0xB0AE +.value 0xBBF0,0xBA32,0xB874,0xB9B6,0xBCF8,0xBD3A,0xBF7C,0xBEBE + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 + +/* Mark the stack non-executable. */ +#if defined(__linux__) && defined(__ELF__) +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* defined(__x86_64__) && defined(HAVE_AVX) && defined(HAVE_AES) ... */ diff --git a/module/icp/asm-x86_64/sha1/sha1-x86_64.S b/module/icp/asm-x86_64/sha1/sha1-x86_64.S new file mode 100644 index 000000000000..cb923784a730 --- /dev/null +++ b/module/icp/asm-x86_64/sha1/sha1-x86_64.S @@ -0,0 +1,1353 @@ +/* + * !/usr/bin/env perl + * + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. The module is, however, dual licensed under OpenSSL and + * CRYPTOGAMS licenses depending on where you obtain it. For further + * details see http://www.openssl.org/~appro/cryptogams/. + * ==================================================================== + * + * sha1_block procedure for x86_64. + * + * It was brought to my attention that on EM64T, compiler-generated code + * was far behind the 32-bit assembler implementation. This is unlike + * Opteron, where compiler-generated code was only 15% behind the 32-bit + * assembler, which originally made it hard to motivate the effort. + * There was a suggestion to mechanically translate the 32-bit code, but I + * dismissed it, reasoning that x86_64 offers enough register bank + * capacity to fully utilize SHA-1 parallelism. Therefore this fresh + * implementation:-) However! While 64-bit code does perform better + * on Opteron, I failed to beat the 32-bit assembler on the EM64T core. + * Well, x86_64 does offer a larger *addressable* bank, but the + * out-of-order core reaches for even more registers through dynamic + * aliasing, and the EM64T core must have managed to run-time optimize + * even 32-bit code just as well as the 64-bit one. The performance + * improvement is summarized in the following table: + * + * gcc 3.4 32-bit asm cycles/byte + * Opteron +45% +20% 6.8 + * Xeon P4 +65% +0% 9.9 + * Core2 +60% +10% 7.0 + * + * + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha1-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3.
Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). + * + */ + +/* + * This file was generated by a perl script (sha1-x86_64.pl). The comments from + * the original file have been pasted above. + */ + +#if defined(lint) || defined(__lint) +#include +#include + + +/* ARGSUSED */ +void +sha1_block_data_order(SHA1_CTX *ctx, const void *inpp, size_t blocks) +{ +} + +#else +#define _ASM +#include +ENTRY_NP(sha1_block_data_order) + push %rbx + push %rbp + push %r12 + mov %rsp,%rax + mov %rdi,%r8 # reassigned argument + sub $72,%rsp + mov %rsi,%r9 # reassigned argument + and $-64,%rsp + mov %rdx,%r10 # reassigned argument + mov %rax,64(%rsp) + + mov 0(%r8),%edx + mov 4(%r8),%esi + mov 8(%r8),%edi + mov 12(%r8),%ebp + mov 16(%r8),%r11d +.align 4 +.Lloop: + mov 0(%r9),%eax + bswap %eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 4(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,4(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 8(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,8(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 12(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,12(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 16(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,16(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 20(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,20(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 24(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,24(%rsp) + add %r12d,%edx + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 28(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,28(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 32(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,32(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 36(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,36(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov %r12d,%ebx + mov 40(%r9),%eax + mov %ebp,%esi + xor %edx,%ebx + bswap %eax + rol $5,%esi + and %r11d,%ebx + mov %eax,40(%rsp) + add %esi,%edi + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + lea 0x5a827999(%eax,%edx),%esi + mov %r11d,%ebx + mov 44(%r9),%eax + mov %edi,%edx + xor %r12d,%ebx + bswap %eax + rol $5,%edx + and %ebp,%ebx + mov %eax,44(%rsp) + add %edx,%esi + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + lea 0x5a827999(%eax,%r12d),%edx + mov %ebp,%ebx + mov 48(%r9),%eax + mov %esi,%r12d + xor %r11d,%ebx + bswap %eax + rol $5,%r12d + and %edi,%ebx + mov %eax,48(%rsp) + add %r12d,%edx + xor 
%r11d,%ebx + rol $30,%edi + add %ebx,%edx + lea 0x5a827999(%eax,%r11d),%r12d + mov %edi,%ebx + mov 52(%r9),%eax + mov %edx,%r11d + xor %ebp,%ebx + bswap %eax + rol $5,%r11d + and %esi,%ebx + mov %eax,52(%rsp) + add %r11d,%r12d + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + lea 0x5a827999(%eax,%ebp),%r11d + mov %esi,%ebx + mov 56(%r9),%eax + mov %r12d,%ebp + xor %edi,%ebx + bswap %eax + rol $5,%ebp + and %edx,%ebx + mov %eax,56(%rsp) + add %ebp,%r11d + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + lea 0x5a827999(%eax,%edi),%ebp + mov %edx,%ebx + mov 60(%r9),%eax + mov %r11d,%edi + xor %esi,%ebx + bswap %eax + rol $5,%edi + and %r12d,%ebx + mov %eax,60(%rsp) + add %edi,%ebp + xor %esi,%ebx + rol $30,%r12d + add %ebx,%ebp + lea 0x5a827999(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%esi + xor 32(%rsp),%eax + and %r11d,%ebx + add %esi,%edi + xor 52(%rsp),%eax + xor %edx,%ebx + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea 0x5a827999(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edx + xor 36(%rsp),%eax + and %ebp,%ebx + add %edx,%esi + xor 56(%rsp),%eax + xor %r12d,%ebx + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea 0x5a827999(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + and %edi,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + xor %r11d,%ebx + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea 0x5a827999(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + and %esi,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + xor %ebp,%ebx + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea 0x5a827999(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + and %edx,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + xor %edi,%ebx + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 0(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov 
%eax,36(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,52(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,56(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,60(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 0(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 8(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 32(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 52(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,0(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 4(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 12(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 36(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 56(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,4(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 8(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 16(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 40(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 60(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,8(%rsp) + lea 0x6ed9eba1(%eax,%edx),%esi + mov 12(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 20(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 44(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 0(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,12(%rsp) + lea 0x6ed9eba1(%eax,%r12d),%edx + mov 16(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 24(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 48(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 4(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,16(%rsp) + lea 0x6ed9eba1(%eax,%r11d),%r12d + mov 20(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 28(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 52(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 8(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,20(%rsp) + lea 0x6ed9eba1(%eax,%ebp),%r11d + mov 24(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 32(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 56(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 12(%rsp),%eax + rol $30,%edx + add %ebx,%r11d 
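+
+	/*
+	 * For reference: every unrolled step in this loop is one round of
+	 * the standard SHA-1 compression function; only the boolean
+	 * function and the additive constant change every 20 rounds.  A
+	 * rough C sketch of what one step computes (names here are
+	 * illustrative, not part of the generated code; rotl is a 32-bit
+	 * left-rotate):
+	 *
+	 *	if (t >= 16)		// the "rol $1,%eax" schedule updates
+	 *		W[t % 16] = rotl(W[(t+13) % 16] ^ W[(t+8) % 16] ^
+	 *		    W[(t+2) % 16] ^ W[t % 16], 1);
+	 *	if (t < 20)      k = 0x5a827999, f = (b & c) | (~b & d);
+	 *	else if (t < 40) k = 0x6ed9eba1, f = b ^ c ^ d;
+	 *	else if (t < 60) k = 0x8f1bbcdc, f = (b & c) | (b & d) | (c & d);
+	 *	else             k = 0xca62c1d6, f = b ^ c ^ d;
+	 *	tmp = rotl(a, 5) + f + e + k + W[t % 16];
+	 *	e = d; d = c; c = rotl(b, 30); b = a; a = tmp;
+	 *
+	 * 0x8f1bbcdc and 0xca62c1d6 appear in the "lea" instructions below
+	 * as the sign-extended displacements -0x70e44324 and -0x359d3e2a.
+	 */
+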
+ rol $1,%eax + mov %eax,24(%rsp) + lea 0x6ed9eba1(%eax,%edi),%ebp + mov 28(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 36(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 60(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 16(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,28(%rsp) + lea 0x6ed9eba1(%eax,%esi),%edi + mov 32(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 40(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 0(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 20(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,32(%rsp) + lea -0x70e44324(%eax,%edx),%esi + mov 36(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 44(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 4(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 24(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,36(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 40(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 48(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 8(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 28(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,40(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 44(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 52(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 12(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 32(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,44(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 48(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 56(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 16(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 36(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,48(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 52(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 60(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 20(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 40(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,52(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 56(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 0(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 24(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 44(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,56(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 60(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 4(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 28(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 48(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,60(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 0(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 8(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 32(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 52(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,0(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 4(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 12(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 36(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 56(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,4(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 8(%rsp),%eax + mov 
%edx,%ebx + mov %edx,%ecx + xor 16(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 40(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 60(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,8(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 12(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 20(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 44(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 0(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,12(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 16(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 24(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 48(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 4(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,16(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 20(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 28(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 52(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 8(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,20(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 24(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 32(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 56(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 12(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,24(%rsp) + add %ebx,%edx + lea -0x70e44324(%eax,%r11d),%r12d + mov 28(%rsp),%eax + mov %esi,%ebx + mov %esi,%ecx + xor 36(%rsp),%eax + mov %edx,%r11d + and %edi,%ebx + xor 60(%rsp),%eax + or %edi,%ecx + rol $5,%r11d + xor 16(%rsp),%eax + and %ebp,%ecx + add %r11d,%r12d + rol $1,%eax + or %ecx,%ebx + rol $30,%esi + mov %eax,28(%rsp) + add %ebx,%r12d + lea -0x70e44324(%eax,%ebp),%r11d + mov 32(%rsp),%eax + mov %edx,%ebx + mov %edx,%ecx + xor 40(%rsp),%eax + mov %r12d,%ebp + and %esi,%ebx + xor 0(%rsp),%eax + or %esi,%ecx + rol $5,%ebp + xor 20(%rsp),%eax + and %edi,%ecx + add %ebp,%r11d + rol $1,%eax + or %ecx,%ebx + rol $30,%edx + mov %eax,32(%rsp) + add %ebx,%r11d + lea -0x70e44324(%eax,%edi),%ebp + mov 36(%rsp),%eax + mov %r12d,%ebx + mov %r12d,%ecx + xor 44(%rsp),%eax + mov %r11d,%edi + and %edx,%ebx + xor 4(%rsp),%eax + or %edx,%ecx + rol $5,%edi + xor 24(%rsp),%eax + and %esi,%ecx + add %edi,%ebp + rol $1,%eax + or %ecx,%ebx + rol $30,%r12d + mov %eax,36(%rsp) + add %ebx,%ebp + lea -0x70e44324(%eax,%esi),%edi + mov 40(%rsp),%eax + mov %r11d,%ebx + mov %r11d,%ecx + xor 48(%rsp),%eax + mov %ebp,%esi + and %r12d,%ebx + xor 8(%rsp),%eax + or %r12d,%ecx + rol $5,%esi + xor 28(%rsp),%eax + and %edx,%ecx + add %esi,%edi + rol $1,%eax + or %ecx,%ebx + rol $30,%r11d + mov %eax,40(%rsp) + add %ebx,%edi + lea -0x70e44324(%eax,%edx),%esi + mov 44(%rsp),%eax + mov %ebp,%ebx + mov %ebp,%ecx + xor 52(%rsp),%eax + mov %edi,%edx + and %r11d,%ebx + xor 12(%rsp),%eax + or %r11d,%ecx + rol $5,%edx + xor 32(%rsp),%eax + and %r12d,%ecx + add %edx,%esi + rol $1,%eax + or %ecx,%ebx + rol $30,%ebp + mov %eax,44(%rsp) + add %ebx,%esi + lea -0x70e44324(%eax,%r12d),%edx + mov 48(%rsp),%eax + mov %edi,%ebx + mov %edi,%ecx + xor 56(%rsp),%eax + mov %esi,%r12d + and %ebp,%ebx + xor 16(%rsp),%eax + or %ebp,%ecx + rol $5,%r12d + xor 36(%rsp),%eax + and %r11d,%ecx + add %r12d,%edx + rol $1,%eax + or %ecx,%ebx + rol $30,%edi + mov %eax,48(%rsp) + add %ebx,%edx + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 52(%rsp),%eax + mov 
%edi,%ebx + mov %edx,%r11d + xor 60(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 20(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 40(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,52(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 56(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 0(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 24(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 44(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,56(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 60(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 4(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 28(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 48(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,60(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 0(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 8(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 32(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 52(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,0(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 4(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 12(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 36(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 56(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,4(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 8(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 16(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 40(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 60(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,8(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 12(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 20(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 44(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 0(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,12(%rsp) + lea -0x359d3e2a(%eax,%ebp),%r11d + mov 16(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 24(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 48(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 4(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,16(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 20(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 28(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 52(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 8(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,20(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 24(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 32(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 56(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 12(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,24(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 28(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 36(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 60(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 16(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + mov %eax,28(%rsp) + lea -0x359d3e2a(%eax,%r12d),%edx + mov 32(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 40(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 0(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 20(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + mov %eax,32(%rsp) + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 36(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 44(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 4(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 24(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + mov %eax,36(%rsp) + lea 
-0x359d3e2a(%eax,%ebp),%r11d + mov 40(%rsp),%eax + mov %esi,%ebx + mov %r12d,%ebp + xor 48(%rsp),%eax + xor %edx,%ebx + rol $5,%ebp + xor 8(%rsp),%eax + xor %edi,%ebx + add %ebp,%r11d + xor 28(%rsp),%eax + rol $30,%edx + add %ebx,%r11d + rol $1,%eax + mov %eax,40(%rsp) + lea -0x359d3e2a(%eax,%edi),%ebp + mov 44(%rsp),%eax + mov %edx,%ebx + mov %r11d,%edi + xor 52(%rsp),%eax + xor %r12d,%ebx + rol $5,%edi + xor 12(%rsp),%eax + xor %esi,%ebx + add %edi,%ebp + xor 32(%rsp),%eax + rol $30,%r12d + add %ebx,%ebp + rol $1,%eax + mov %eax,44(%rsp) + lea -0x359d3e2a(%eax,%esi),%edi + mov 48(%rsp),%eax + mov %r12d,%ebx + mov %ebp,%esi + xor 56(%rsp),%eax + xor %r11d,%ebx + rol $5,%esi + xor 16(%rsp),%eax + xor %edx,%ebx + add %esi,%edi + xor 36(%rsp),%eax + rol $30,%r11d + add %ebx,%edi + rol $1,%eax + mov %eax,48(%rsp) + lea -0x359d3e2a(%eax,%edx),%esi + mov 52(%rsp),%eax + mov %r11d,%ebx + mov %edi,%edx + xor 60(%rsp),%eax + xor %ebp,%ebx + rol $5,%edx + xor 20(%rsp),%eax + xor %r12d,%ebx + add %edx,%esi + xor 40(%rsp),%eax + rol $30,%ebp + add %ebx,%esi + rol $1,%eax + lea -0x359d3e2a(%eax,%r12d),%edx + mov 56(%rsp),%eax + mov %ebp,%ebx + mov %esi,%r12d + xor 0(%rsp),%eax + xor %edi,%ebx + rol $5,%r12d + xor 24(%rsp),%eax + xor %r11d,%ebx + add %r12d,%edx + xor 44(%rsp),%eax + rol $30,%edi + add %ebx,%edx + rol $1,%eax + lea -0x359d3e2a(%eax,%r11d),%r12d + mov 60(%rsp),%eax + mov %edi,%ebx + mov %edx,%r11d + xor 4(%rsp),%eax + xor %esi,%ebx + rol $5,%r11d + xor 28(%rsp),%eax + xor %ebp,%ebx + add %r11d,%r12d + xor 48(%rsp),%eax + rol $30,%esi + add %ebx,%r12d + rol $1,%eax + lea -0x359d3e2a(%eax,%ebp),%r11d + mov %esi,%ebx + mov %r12d,%ebp + xor %edx,%ebx + rol $5,%ebp + xor %edi,%ebx + add %ebp,%r11d + rol $30,%edx + add %ebx,%r11d + // Update and save state information in SHA-1 context + add 0(%r8),%r11d + add 4(%r8),%r12d + add 8(%r8),%edx + add 12(%r8),%esi + add 16(%r8),%edi + mov %r11d,0(%r8) + mov %r12d,4(%r8) + mov %edx,8(%r8) + mov %esi,12(%r8) + mov %edi,16(%r8) + + xchg %r11d,%edx # mov %r11d,%edx + xchg %r12d,%esi # mov %r12d,%esi + xchg %r11d,%edi # mov %edx,%edi + xchg %r12d,%ebp # mov %esi,%ebp + # mov %edi,%r11d + lea 64(%r9),%r9 + sub $1,%r10 + jnz .Lloop + mov 64(%rsp),%rsp + pop %r12 + pop %rbp + pop %rbx + ret +SET_SIZE(sha1_block_data_order) + +.data +.asciz "SHA1 block transform for x86_64, CRYPTOGAMS by " + +#endif /* lint || __lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/sha2/sha256_impl.S b/module/icp/asm-x86_64/sha2/sha256_impl.S new file mode 100644 index 000000000000..766b75355f0b --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha256_impl.S @@ -0,0 +1,2063 @@ +/* + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. 
In
+ * the former case the instructions operate on 32-bit operands, while in
+ * the latter, on 64-bit ones. All I had to do was get one flavor right;
+ * the other one passed the test right away:-)
+ *
+ * sha256_block runs in ~1005 cycles on Opteron, which gives you
+ * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
+ * frequency in GHz. sha512_block runs in ~1275 cycles, which results
+ * in 128*1000/1275=100MBps per GHz. Is there room for improvement?
+ * Well, if you compare it to the IA-64 implementation, which maintains
+ * X[16] in the register bank[!], tends to 4 instructions per CPU clock
+ * cycle and runs in 1003 cycles, 1275 is a very good result for the
+ * 3-way issue Opteron pipeline with X[16] maintained in memory. So *if*
+ * there is a way to improve it, *then* the only way would be to try to
+ * offload X[16] updates to the SSE unit, but that would require "deeper"
+ * loop unroll, which in turn would naturally cause size blow-up, not
+ * to mention increased complexity! And once again, only *if* it's
+ * actually possible to noticeably improve overall ILP (instruction
+ * level parallelism) on a given CPU implementation in this case.
+ *
+ * Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
+ * performance ratio of 1.5 between 64- and 32-bit flavors [see above],
+ * [currently available] EM64T CPUs apparently are far from it. On the
+ * contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
+ * 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
+ * apparently are not atomic instructions, but are implemented in microcode.
+ */
+
+/*
+ * OpenSolaris OS modifications
+ *
+ * Sun elects to use this software under the BSD license.
+ *
+ * This source originates from OpenSSL file sha512-x86_64.pl at
+ * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz
+ * (presumably for future OpenSSL release 0.9.8h), with these changes:
+ *
+ * 1. Added perl "use strict" and declared variables.
+ *
+ * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from
+ * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards.
+ *
+ * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1)
+ * assemblers). Replaced the .picmeup macro with assembler code.
+ *
+ * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype",
+ * at the beginning of SHA2_CTX (the next field is 8-byte aligned).
+ */
+
+/*
+ * This file was generated by a perl script (sha512-x86_64.pl) that was
+ * used to generate the sha256 and sha512 variants from the same code base.
+ * The comments from the original file have been pasted above.
+ */
+
+#if defined(lint) || defined(__lint)
+#include <sys/stdint.h>
+#include <sha2/sha2.h>
+
+/* ARGSUSED */
+void
+SHA256TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num)
+{
+}
+
+
+#else
+#define _ASM
+#include <sys/asm_linkage.h>
+
+ENTRY_NP(SHA256TransformBlocks)
+	push	%rbx
+	push	%rbp
+	push	%r12
+	push	%r13
+	push	%r14
+	push	%r15
+	mov	%rsp,%rbp	# copy %rsp
+	shl	$4,%rdx		# num*16
+	sub	$16*4+4*8,%rsp
+	lea	(%rsi,%rdx,4),%rdx	# inp+num*16*4
+	and	$-64,%rsp	# align stack frame
+	add	$8,%rdi		# Skip OpenSolaris field, "algotype"
+	mov	%rdi,16*4+0*8(%rsp)	# save ctx, 1st arg
+	mov	%rsi,16*4+1*8(%rsp)	# save inp, 2nd arg
+	mov	%rdx,16*4+2*8(%rsp)	# save end pointer, "3rd" arg
+	mov	%rbp,16*4+3*8(%rsp)	# save copy of %rsp
+
+	#.picmeup %rbp
+	# The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts
+	# the address of the "next" instruction into the target register
+	# (%rbp). 
This generates these 2 instructions: + lea .Llea(%rip),%rbp + #nop # .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K256-.(%rbp),%rbp + + mov 4*0(%rdi),%eax + mov 4*1(%rdi),%ebx + mov 4*2(%rdi),%ecx + mov 4*3(%rdi),%edx + mov 4*4(%rdi),%r8d + mov 4*5(%rdi),%r9d + mov 4*6(%rdi),%r10d + mov 4*7(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + xor %rdi,%rdi + mov 4*0(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*1(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*2(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*3(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # 
h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*4(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*5(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*6(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*7(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 
1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 4*8(%rsi),%r12d + bswap %r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 4*9(%rsi),%r12d + bswap %r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 4*10(%rsi),%r12d + bswap %r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 4*11(%rsi),%r12d + bswap %r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 4*12(%rsi),%r12d + bswap %r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + 
ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 4*13(%rsi),%r12d + bswap %r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 4*14(%rsi),%r12d + bswap %r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 4*15(%rsi),%r12d + bswap %r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + mov 4(%rsp),%r13d + mov 56(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor 
%r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 36(%rsp),%r12d + + add 0(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,0(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 8(%rsp),%r13d + mov 60(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 40(%rsp),%r12d + + add 4(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,4(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 12(%rsp),%r13d + mov 0(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 44(%rsp),%r12d + + add 8(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,8(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 16(%rsp),%r13d + mov 4(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + 
ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 48(%rsp),%r12d + + add 12(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,12(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 20(%rsp),%r13d + mov 8(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 52(%rsp),%r12d + + add 16(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,16(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 24(%rsp),%r13d + mov 12(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 56(%rsp),%r12d + + add 20(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,20(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 28(%rsp),%r13d + mov 16(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor 
%r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 60(%rsp),%r12d + + add 24(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,24(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 32(%rsp),%r13d + mov 20(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 0(%rsp),%r12d + + add 28(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,28(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + mov 36(%rsp),%r13d + mov 24(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 4(%rsp),%r12d + + add 32(%rsp),%r12d + mov %r8d,%r13d + mov %r8d,%r14d + mov %r9d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r10d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r8d,%r15d # (f^g)&e + mov %r12d,32(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r10d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r11d,%r12d # T1+=h + + mov %eax,%r11d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %eax,%r13d + mov %eax,%r14d + + ror $2,%r11d + ror $13,%r13d + mov %eax,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r11d + ror $9,%r13d + or %ecx,%r14d # a|c + + xor %r13d,%r11d # h=Sigma0(a) + and %ecx,%r15d # a&c + add %r12d,%edx # d+=T1 + + and %ebx,%r14d # (a|c)&b + add %r12d,%r11d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r11d # h+=Maj(a,b,c) + mov 40(%rsp),%r13d + mov 28(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # 
sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 8(%rsp),%r12d + + add 36(%rsp),%r12d + mov %edx,%r13d + mov %edx,%r14d + mov %r8d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r9d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %edx,%r15d # (f^g)&e + mov %r12d,36(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r9d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r10d,%r12d # T1+=h + + mov %r11d,%r10d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r11d,%r13d + mov %r11d,%r14d + + ror $2,%r10d + ror $13,%r13d + mov %r11d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r10d + ror $9,%r13d + or %ebx,%r14d # a|c + + xor %r13d,%r10d # h=Sigma0(a) + and %ebx,%r15d # a&c + add %r12d,%ecx # d+=T1 + + and %eax,%r14d # (a|c)&b + add %r12d,%r10d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r10d # h+=Maj(a,b,c) + mov 44(%rsp),%r13d + mov 32(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 12(%rsp),%r12d + + add 40(%rsp),%r12d + mov %ecx,%r13d + mov %ecx,%r14d + mov %edx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r8d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ecx,%r15d # (f^g)&e + mov %r12d,40(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r8d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r9d,%r12d # T1+=h + + mov %r10d,%r9d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r10d,%r13d + mov %r10d,%r14d + + ror $2,%r9d + ror $13,%r13d + mov %r10d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r9d + ror $9,%r13d + or %eax,%r14d # a|c + + xor %r13d,%r9d # h=Sigma0(a) + and %eax,%r15d # a&c + add %r12d,%ebx # d+=T1 + + and %r11d,%r14d # (a|c)&b + add %r12d,%r9d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r9d # h+=Maj(a,b,c) + mov 48(%rsp),%r13d + mov 36(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 16(%rsp),%r12d + + add 44(%rsp),%r12d + mov %ebx,%r13d + mov %ebx,%r14d + mov %ecx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %edx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %ebx,%r15d # (f^g)&e + mov %r12d,44(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %edx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %r8d,%r12d # T1+=h + + mov %r9d,%r8d + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r9d,%r13d + mov %r9d,%r14d + + ror $2,%r8d + ror $13,%r13d + mov %r9d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%r8d + ror $9,%r13d + or %r11d,%r14d # a|c + + xor %r13d,%r8d # h=Sigma0(a) + and %r11d,%r15d # a&c + add %r12d,%eax # d+=T1 + + and %r10d,%r14d # (a|c)&b + add %r12d,%r8d # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%r8d # h+=Maj(a,b,c) + mov 52(%rsp),%r13d + mov 40(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # 
sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 20(%rsp),%r12d + + add 48(%rsp),%r12d + mov %eax,%r13d + mov %eax,%r14d + mov %ebx,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ecx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %eax,%r15d # (f^g)&e + mov %r12d,48(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ecx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %edx,%r12d # T1+=h + + mov %r8d,%edx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %r8d,%r13d + mov %r8d,%r14d + + ror $2,%edx + ror $13,%r13d + mov %r8d,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%edx + ror $9,%r13d + or %r10d,%r14d # a|c + + xor %r13d,%edx # h=Sigma0(a) + and %r10d,%r15d # a&c + add %r12d,%r11d # d+=T1 + + and %r9d,%r14d # (a|c)&b + add %r12d,%edx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%edx # h+=Maj(a,b,c) + mov 56(%rsp),%r13d + mov 44(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 24(%rsp),%r12d + + add 52(%rsp),%r12d + mov %r11d,%r13d + mov %r11d,%r14d + mov %eax,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %ebx,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r11d,%r15d # (f^g)&e + mov %r12d,52(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %ebx,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ecx,%r12d # T1+=h + + mov %edx,%ecx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %edx,%r13d + mov %edx,%r14d + + ror $2,%ecx + ror $13,%r13d + mov %edx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ecx + ror $9,%r13d + or %r9d,%r14d # a|c + + xor %r13d,%ecx # h=Sigma0(a) + and %r9d,%r15d # a&c + add %r12d,%r10d # d+=T1 + + and %r8d,%r14d # (a|c)&b + add %r12d,%ecx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ecx # h+=Maj(a,b,c) + mov 60(%rsp),%r13d + mov 48(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) + + add %r13d,%r12d + + add 28(%rsp),%r12d + + add 56(%rsp),%r12d + mov %r10d,%r13d + mov %r10d,%r14d + mov %r11d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %eax,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r10d,%r15d # (f^g)&e + mov %r12d,56(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %eax,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %ebx,%r12d # T1+=h + + mov %ecx,%ebx + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ecx,%r13d + mov %ecx,%r14d + + ror $2,%ebx + ror $13,%r13d + mov %ecx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%ebx + ror $9,%r13d + or %r8d,%r14d # a|c + + xor %r13d,%ebx # h=Sigma0(a) + and %r8d,%r15d # a&c + add %r12d,%r9d # d+=T1 + + and %edx,%r14d # (a|c)&b + add %r12d,%ebx # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%ebx # h+=Maj(a,b,c) + mov 0(%rsp),%r13d + mov 52(%rsp),%r12d + + mov %r13d,%r15d + + shr $3,%r13d + ror $7,%r15d + + xor %r15d,%r13d + ror $11,%r15d + + xor %r15d,%r13d # sigma0(X[(i+1)&0xf]) + mov %r12d,%r14d + + shr $10,%r12d + ror $17,%r14d + + xor %r14d,%r12d + ror $2,%r14d + + xor %r14d,%r12d # sigma1(X[(i+14)&0xf]) 
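+
+	/*
+	 * For reference: the shr/ror/xor clusters in these rounds are the
+	 * SHA-256 message schedule and round functions.  A rough C sketch
+	 * of one scheduled round (names here are illustrative, not part of
+	 * the generated code; ROTR is a 32-bit right-rotate):
+	 *
+	 *	s0 = ROTR(W[(i+1) % 16], 7) ^ ROTR(W[(i+1) % 16], 18) ^
+	 *	    (W[(i+1) % 16] >> 3);		// sigma0(X[(i+1)&0xf])
+	 *	s1 = ROTR(W[(i+14) % 16], 17) ^ ROTR(W[(i+14) % 16], 19) ^
+	 *	    (W[(i+14) % 16] >> 10);		// sigma1(X[(i+14)&0xf])
+	 *	W[i % 16] += s0 + s1 + W[(i+9) % 16];	// W[i] = sigma0(W[i-15])
+	 *						// + W[i-7] + sigma1(W[i-2])
+	 *						// + W[i-16]
+	 *	T1 = h + Sigma1(e) + Ch(e,f,g) + K256[i] + W[i % 16];
+	 *	T2 = Sigma0(a) + Maj(a,b,c);
+	 *	h = g; g = f; f = e; e = d + T1;
+	 *	d = c; c = b; b = a; a = T1 + T2;
+	 *
+	 * with Sigma1(e) = ROTR(e,6)^ROTR(e,11)^ROTR(e,25) and
+	 * Sigma0(a) = ROTR(a,2)^ROTR(a,13)^ROTR(a,22).  The ror counts in
+	 * the code compose (11+14=25, 13+9=22, 7+11=18, 17+2=19) because
+	 * each xor tap is taken between successive rotates of one register.
+	 */
+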
+ + add %r13d,%r12d + + add 32(%rsp),%r12d + + add 60(%rsp),%r12d + mov %r9d,%r13d + mov %r9d,%r14d + mov %r10d,%r15d + + ror $6,%r13d + ror $11,%r14d + xor %r11d,%r15d # f^g + + xor %r14d,%r13d + ror $14,%r14d + and %r9d,%r15d # (f^g)&e + mov %r12d,60(%rsp) + + xor %r14d,%r13d # Sigma1(e) + xor %r11d,%r15d # Ch(e,f,g)=((f^g)&e)^g + add %eax,%r12d # T1+=h + + mov %ebx,%eax + add %r13d,%r12d # T1+=Sigma1(e) + + add %r15d,%r12d # T1+=Ch(e,f,g) + mov %ebx,%r13d + mov %ebx,%r14d + + ror $2,%eax + ror $13,%r13d + mov %ebx,%r15d + add (%rbp,%rdi,4),%r12d # T1+=K[round] + + xor %r13d,%eax + ror $9,%r13d + or %edx,%r14d # a|c + + xor %r13d,%eax # h=Sigma0(a) + and %edx,%r15d # a&c + add %r12d,%r8d # d+=T1 + + and %ecx,%r14d # (a|c)&b + add %r12d,%eax # h+=T1 + + or %r15d,%r14d # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14d,%eax # h+=Maj(a,b,c) + cmp $64,%rdi + jb .Lrounds_16_xx + + mov 16*4+0*8(%rsp),%rdi + lea 16*4(%rsi),%rsi + + add 4*0(%rdi),%eax + add 4*1(%rdi),%ebx + add 4*2(%rdi),%ecx + add 4*3(%rdi),%edx + add 4*4(%rdi),%r8d + add 4*5(%rdi),%r9d + add 4*6(%rdi),%r10d + add 4*7(%rdi),%r11d + + cmp 16*4+2*8(%rsp),%rsi + + mov %eax,4*0(%rdi) + mov %ebx,4*1(%rdi) + mov %ecx,4*2(%rdi) + mov %edx,4*3(%rdi) + mov %r8d,4*4(%rdi) + mov %r9d,4*5(%rdi) + mov %r10d,4*6(%rdi) + mov %r11d,4*7(%rdi) + jb .Lloop + + mov 16*4+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA256TransformBlocks) + +.data +.align 64 +.type K256,@object +K256: + .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +#endif /* !lint && !__lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/asm-x86_64/sha2/sha512_impl.S b/module/icp/asm-x86_64/sha2/sha512_impl.S new file mode 100644 index 000000000000..6e37618761b2 --- /dev/null +++ b/module/icp/asm-x86_64/sha2/sha512_impl.S @@ -0,0 +1,2088 @@ +/* + * ==================================================================== + * Written by Andy Polyakov for the OpenSSL + * project. Rights for redistribution and usage in source and binary + * forms are granted according to the OpenSSL license. + * ==================================================================== + * + * sha256/512_block procedure for x86_64. + * + * 40% improvement over compiler-generated code on Opteron. On EM64T + * sha256 was observed to run >80% faster and sha512 - >40%. No magical + * tricks, just straight implementation... I really wonder why gcc + * [being armed with inline assembler] fails to generate as fast code. + * The only thing which is cool about this module is that it's very + * same instruction sequence used for both SHA-256 and SHA-512. 
In + * former case the instructions operate on 32-bit operands, while in + * latter - on 64-bit ones. All I had to do is to get one flavor right, + * the other one passed the test right away:-) + * + * sha256_block runs in ~1005 cycles on Opteron, which gives you + * asymptotic performance of 64*1000/1005=63.7MBps times CPU clock + * frequency in GHz. sha512_block runs in ~1275 cycles, which results + * in 128*1000/1275=100MBps per GHz. Is there room for improvement? + * Well, if you compare it to IA-64 implementation, which maintains + * X[16] in register bank[!], tends to 4 instructions per CPU clock + * cycle and runs in 1003 cycles, 1275 is very good result for 3-way + * issue Opteron pipeline and X[16] maintained in memory. So that *if* + * there is a way to improve it, *then* the only way would be to try to + * offload X[16] updates to SSE unit, but that would require "deeper" + * loop unroll, which in turn would naturally cause size blow-up, not + * to mention increased complexity! And once again, only *if* it's + * actually possible to noticeably improve overall ILP, instruction + * level parallelism, on a given CPU implementation in this case. + * + * Special note on Intel EM64T. While Opteron CPU exhibits perfect + * performance ratio of 1.5 between 64- and 32-bit flavors [see above], + * [currently available] EM64T CPUs apparently are far from it. On the + * contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit + * sha256_block:-( This is presumably because 64-bit shifts/rotates + * apparently are not atomic instructions, but implemented in microcode. + */ + +/* + * OpenSolaris OS modifications + * + * Sun elects to use this software under the BSD license. + * + * This source originates from OpenSSL file sha512-x86_64.pl at + * ftp://ftp.openssl.org/snapshot/openssl-0.9.8-stable-SNAP-20080131.tar.gz + * (presumably for future OpenSSL release 0.9.8h), with these changes: + * + * 1. Added perl "use strict" and declared variables. + * + * 2. Added OpenSolaris ENTRY_NP/SET_SIZE macros from + * /usr/include/sys/asm_linkage.h, .ident keywords, and lint(1B) guards. + * + * 3. Removed x86_64-xlate.pl script (not needed for as(1) or gas(1) + * assemblers). Replaced the .picmeup macro with assembler code. + * + * 4. Added 8 to $ctx, as OpenSolaris OS has an extra 4-byte field, "algotype", + * at the beginning of SHA2_CTX (the next field is 8-byte aligned). + */ + +/* + * This file was generated by a perl script (sha512-x86_64.pl) that were + * used to generate sha256 and sha512 variants from the same code base. + * The comments from the original file have been pasted above. + */ + + +#if defined(lint) || defined(__lint) +#include +#include + +/* ARGSUSED */ +void +SHA512TransformBlocks(SHA2_CTX *ctx, const void *in, size_t num) +{ +} + + +#else +#define _ASM +#include + +ENTRY_NP(SHA512TransformBlocks) + push %rbx + push %rbp + push %r12 + push %r13 + push %r14 + push %r15 + mov %rsp,%rbp # copy %rsp + shl $4,%rdx # num*16 + sub $16*8+4*8,%rsp + lea (%rsi,%rdx,8),%rdx # inp+num*16*8 + and $-64,%rsp # align stack frame + add $8,%rdi # Skip OpenSolaris field, "algotype" + mov %rdi,16*8+0*8(%rsp) # save ctx, 1st arg + mov %rsi,16*8+1*8(%rsp) # save inp, 2nd arg + mov %rdx,16*8+2*8(%rsp) # save end pointer, "3rd" arg + mov %rbp,16*8+3*8(%rsp) # save copy of %rsp + + #.picmeup %rbp + # The .picmeup pseudo-directive, from perlasm/x86_64_xlate.pl, puts + # the address of the "next" instruction into the target register + # (%rbp). 
This generates these 2 instructions: + lea .Llea(%rip),%rbp + #nop # .picmeup generates a nop for mod 8 alignment--not needed here + +.Llea: + lea K512-.(%rbp),%rbp + + mov 8*0(%rdi),%rax + mov 8*1(%rdi),%rbx + mov 8*2(%rdi),%rcx + mov 8*3(%rdi),%rdx + mov 8*4(%rdi),%r8 + mov 8*5(%rdi),%r9 + mov 8*6(%rdi),%r10 + mov 8*7(%rdi),%r11 + jmp .Lloop + +.align 16 +.Lloop: + xor %rdi,%rdi + mov 8*0(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*1(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*2(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*3(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add 
%r14,%r8 # h+=Maj(a,b,c) + mov 8*4(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*5(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*6(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*7(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 8*8(%rsi),%r12 + bswap %r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + 
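+ # A hedged reading of this point in the round: %r12 already holds
+ # h + W[round] (the bswap'd input word), and the two adds below plus
+ # the indexed add of K512 complete
+ #	T1 = h + Sigma1(e) + Ch(e,f,g) + K512[round] + W[round]
+ # while %r11, just seeded from %rax (a), becomes Sigma0(a) and then
+ # has T1 and Maj(a,b,c) added to form the new h; d gains T1 as well.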
add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 8*9(%rsi),%r12 + bswap %r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 8*10(%rsi),%r12 + bswap %r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 8*11(%rsi),%r12 + bswap %r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 8*12(%rsi),%r12 + bswap %r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx 
# h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 8*13(%rsi),%r12 + bswap %r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 8*14(%rsi),%r12 + bswap %r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 8*15(%rsi),%r12 + bswap %r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + mov 8(%rsp),%r13 + mov 112(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 72(%rsp),%r12 + + add 0(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,0(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b 
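+ # The or below completes Maj(a,b,c) as ((a|c)&b)|(a&c); this is
+ # bit-for-bit equal to the textbook (a&b)^(a&c)^(b&c), since each
+ # form is 1 exactly when at least two of the three inputs are 1,
+ # and it is convenient for this register allocation.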
+ add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 16(%rsp),%r13 + mov 120(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 80(%rsp),%r12 + + add 8(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,8(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 24(%rsp),%r13 + mov 0(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 88(%rsp),%r12 + + add 16(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,16(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 32(%rsp),%r13 + mov 8(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 96(%rsp),%r12 + + add 24(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,24(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 40(%rsp),%r13 + mov 16(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor 
%r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 104(%rsp),%r12 + + add 32(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,32(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 48(%rsp),%r13 + mov 24(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 112(%rsp),%r12 + + add 40(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,40(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 56(%rsp),%r13 + mov 32(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 120(%rsp),%r12 + + add 48(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,48(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 64(%rsp),%r13 + mov 40(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 
0(%rsp),%r12 + + add 56(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,56(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + mov 72(%rsp),%r13 + mov 48(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 8(%rsp),%r12 + + add 64(%rsp),%r12 + mov %r8,%r13 + mov %r8,%r14 + mov %r9,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r10,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r8,%r15 # (f^g)&e + mov %r12,64(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r10,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r11,%r12 # T1+=h + + mov %rax,%r11 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rax,%r13 + mov %rax,%r14 + + ror $28,%r11 + ror $34,%r13 + mov %rax,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r11 + ror $5,%r13 + or %rcx,%r14 # a|c + + xor %r13,%r11 # h=Sigma0(a) + and %rcx,%r15 # a&c + add %r12,%rdx # d+=T1 + + and %rbx,%r14 # (a|c)&b + add %r12,%r11 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r11 # h+=Maj(a,b,c) + mov 80(%rsp),%r13 + mov 56(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 16(%rsp),%r12 + + add 72(%rsp),%r12 + mov %rdx,%r13 + mov %rdx,%r14 + mov %r8,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r9,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rdx,%r15 # (f^g)&e + mov %r12,72(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r9,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r10,%r12 # T1+=h + + mov %r11,%r10 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r11,%r13 + mov %r11,%r14 + + ror $28,%r10 + ror $34,%r13 + mov %r11,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r10 + ror $5,%r13 + or %rbx,%r14 # a|c + + xor %r13,%r10 # h=Sigma0(a) + and %rbx,%r15 # a&c + add %r12,%rcx # d+=T1 + + and %rax,%r14 # (a|c)&b + add %r12,%r10 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r10 # h+=Maj(a,b,c) + mov 88(%rsp),%r13 + mov 64(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 24(%rsp),%r12 + + add 80(%rsp),%r12 + mov %rcx,%r13 + mov %rcx,%r14 + mov %rdx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r8,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rcx,%r15 # (f^g)&e + mov %r12,80(%rsp) + + 
xor %r14,%r13 # Sigma1(e) + xor %r8,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r9,%r12 # T1+=h + + mov %r10,%r9 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r10,%r13 + mov %r10,%r14 + + ror $28,%r9 + ror $34,%r13 + mov %r10,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r9 + ror $5,%r13 + or %rax,%r14 # a|c + + xor %r13,%r9 # h=Sigma0(a) + and %rax,%r15 # a&c + add %r12,%rbx # d+=T1 + + and %r11,%r14 # (a|c)&b + add %r12,%r9 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r9 # h+=Maj(a,b,c) + mov 96(%rsp),%r13 + mov 72(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 32(%rsp),%r12 + + add 88(%rsp),%r12 + mov %rbx,%r13 + mov %rbx,%r14 + mov %rcx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rdx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rbx,%r15 # (f^g)&e + mov %r12,88(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rdx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %r8,%r12 # T1+=h + + mov %r9,%r8 + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r9,%r13 + mov %r9,%r14 + + ror $28,%r8 + ror $34,%r13 + mov %r9,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%r8 + ror $5,%r13 + or %r11,%r14 # a|c + + xor %r13,%r8 # h=Sigma0(a) + and %r11,%r15 # a&c + add %r12,%rax # d+=T1 + + and %r10,%r14 # (a|c)&b + add %r12,%r8 # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%r8 # h+=Maj(a,b,c) + mov 104(%rsp),%r13 + mov 80(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 40(%rsp),%r12 + + add 96(%rsp),%r12 + mov %rax,%r13 + mov %rax,%r14 + mov %rbx,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rcx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %rax,%r15 # (f^g)&e + mov %r12,96(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rcx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rdx,%r12 # T1+=h + + mov %r8,%rdx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %r8,%r13 + mov %r8,%r14 + + ror $28,%rdx + ror $34,%r13 + mov %r8,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rdx + ror $5,%r13 + or %r10,%r14 # a|c + + xor %r13,%rdx # h=Sigma0(a) + and %r10,%r15 # a&c + add %r12,%r11 # d+=T1 + + and %r9,%r14 # (a|c)&b + add %r12,%rdx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rdx # h+=Maj(a,b,c) + mov 112(%rsp),%r13 + mov 88(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 48(%rsp),%r12 + + add 104(%rsp),%r12 + mov %r11,%r13 + mov %r11,%r14 + mov %rax,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rbx,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r11,%r15 # (f^g)&e + mov %r12,104(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rbx,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rcx,%r12 # T1+=h + + mov %rdx,%rcx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rdx,%r13 + mov %rdx,%r14 + + ror $28,%rcx + 
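+ # (The rotate by 28 above and the 34/5 pair below realize SHA-512's
+ # Sigma0(a) = ror(a,28) ^ ror(a,34) ^ ror(a,39); the third amount is
+ # reached by rotating the ror-34 copy a further 5 bits, 34 + 5 = 39.)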
ror $34,%r13 + mov %rdx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rcx + ror $5,%r13 + or %r9,%r14 # a|c + + xor %r13,%rcx # h=Sigma0(a) + and %r9,%r15 # a&c + add %r12,%r10 # d+=T1 + + and %r8,%r14 # (a|c)&b + add %r12,%rcx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rcx # h+=Maj(a,b,c) + mov 120(%rsp),%r13 + mov 96(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 56(%rsp),%r12 + + add 112(%rsp),%r12 + mov %r10,%r13 + mov %r10,%r14 + mov %r11,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %rax,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r10,%r15 # (f^g)&e + mov %r12,112(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %rax,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rbx,%r12 # T1+=h + + mov %rcx,%rbx + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rcx,%r13 + mov %rcx,%r14 + + ror $28,%rbx + ror $34,%r13 + mov %rcx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rbx + ror $5,%r13 + or %r8,%r14 # a|c + + xor %r13,%rbx # h=Sigma0(a) + and %r8,%r15 # a&c + add %r12,%r9 # d+=T1 + + and %rdx,%r14 # (a|c)&b + add %r12,%rbx # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rbx # h+=Maj(a,b,c) + mov 0(%rsp),%r13 + mov 104(%rsp),%r12 + + mov %r13,%r15 + + shr $7,%r13 + ror $1,%r15 + + xor %r15,%r13 + ror $7,%r15 + + xor %r15,%r13 # sigma0(X[(i+1)&0xf]) + mov %r12,%r14 + + shr $6,%r12 + ror $19,%r14 + + xor %r14,%r12 + ror $42,%r14 + + xor %r14,%r12 # sigma1(X[(i+14)&0xf]) + + add %r13,%r12 + + add 64(%rsp),%r12 + + add 120(%rsp),%r12 + mov %r9,%r13 + mov %r9,%r14 + mov %r10,%r15 + + ror $14,%r13 + ror $18,%r14 + xor %r11,%r15 # f^g + + xor %r14,%r13 + ror $23,%r14 + and %r9,%r15 # (f^g)&e + mov %r12,120(%rsp) + + xor %r14,%r13 # Sigma1(e) + xor %r11,%r15 # Ch(e,f,g)=((f^g)&e)^g + add %rax,%r12 # T1+=h + + mov %rbx,%rax + add %r13,%r12 # T1+=Sigma1(e) + + add %r15,%r12 # T1+=Ch(e,f,g) + mov %rbx,%r13 + mov %rbx,%r14 + + ror $28,%rax + ror $34,%r13 + mov %rbx,%r15 + add (%rbp,%rdi,8),%r12 # T1+=K[round] + + xor %r13,%rax + ror $5,%r13 + or %rdx,%r14 # a|c + + xor %r13,%rax # h=Sigma0(a) + and %rdx,%r15 # a&c + add %r12,%r8 # d+=T1 + + and %rcx,%r14 # (a|c)&b + add %r12,%rax # h+=T1 + + or %r15,%r14 # Maj(a,b,c)=((a|c)&b)|(a&c) + lea 1(%rdi),%rdi # round++ + + add %r14,%rax # h+=Maj(a,b,c) + cmp $80,%rdi + jb .Lrounds_16_xx + + mov 16*8+0*8(%rsp),%rdi + lea 16*8(%rsi),%rsi + + add 8*0(%rdi),%rax + add 8*1(%rdi),%rbx + add 8*2(%rdi),%rcx + add 8*3(%rdi),%rdx + add 8*4(%rdi),%r8 + add 8*5(%rdi),%r9 + add 8*6(%rdi),%r10 + add 8*7(%rdi),%r11 + + cmp 16*8+2*8(%rsp),%rsi + + mov %rax,8*0(%rdi) + mov %rbx,8*1(%rdi) + mov %rcx,8*2(%rdi) + mov %rdx,8*3(%rdi) + mov %r8,8*4(%rdi) + mov %r9,8*5(%rdi) + mov %r10,8*6(%rdi) + mov %r11,8*7(%rdi) + jb .Lloop + + mov 16*8+3*8(%rsp),%rsp + pop %r15 + pop %r14 + pop %r13 + pop %r12 + pop %rbp + pop %rbx + + ret +SET_SIZE(SHA512TransformBlocks) + +.data +.align 64 +.type K512,@object +K512: + .quad 0x428a2f98d728ae22,0x7137449123ef65cd + .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + .quad 0x3956c25bf348b538,0x59f111f1b605d019 + .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + .quad 0xd807aa98a3030242,0x12835b0145706fbe + .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + 
.quad 0x9bdc06a725c71235,0xc19bf174cf692694 + .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + .quad 0x983e5152ee66dfab,0xa831c66d2db43210 + .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 + .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 + .quad 0x06ca6351e003826f,0x142929670a0e6e70 + .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 + .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + .quad 0x650a73548baf63de,0x766a0abb3c77b2a8 + .quad 0x81c2c92e47edaee6,0x92722c851482353b + .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 + .quad 0xc24b8b70d0f89791,0xc76c51a30654be30 + .quad 0xd192e819d6ef5218,0xd69906245565a910 + .quad 0xf40e35855771202a,0x106aa07032bbd1b8 + .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + .quad 0x748f82ee5defb2fc,0x78a5636f43172f60 + .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec + .quad 0x90befffa23631e28,0xa4506cebde82bde9 + .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b + .quad 0xca273eceea26619c,0xd186b8c721c0c207 + .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 + .quad 0x113f9804bef90dae,0x1b710b35131c471b + .quad 0x28db77f523047d84,0x32caab7b40c72493 + .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +#endif /* !lint && !__lint */ + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/icp/core/kcf_callprov.c b/module/icp/core/kcf_callprov.c new file mode 100644 index 000000000000..fd2f7e1aac3d --- /dev/null +++ b/module/icp/core/kcf_callprov.c @@ -0,0 +1,1567 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include + +static int kcf_emulate_dual(kcf_provider_desc_t *, crypto_ctx_t *, + kcf_req_params_t *); + +void +kcf_free_triedlist(kcf_prov_tried_t *list) +{ + kcf_prov_tried_t *l; + + while ((l = list) != NULL) { + list = list->pt_next; + KCF_PROV_REFRELE(l->pt_pd); + kmem_free(l, sizeof (kcf_prov_tried_t)); + } +} + +kcf_prov_tried_t * +kcf_insert_triedlist(kcf_prov_tried_t **list, kcf_provider_desc_t *pd, + int kmflag) +{ + kcf_prov_tried_t *l; + + l = kmem_alloc(sizeof (kcf_prov_tried_t), kmflag); + if (l == NULL) + return (NULL); + + l->pt_pd = pd; + l->pt_next = *list; + *list = l; + + return (l); +} + +static boolean_t +is_in_triedlist(kcf_provider_desc_t *pd, kcf_prov_tried_t *triedl) +{ + while (triedl != NULL) { + if (triedl->pt_pd == pd) + return (B_TRUE); + triedl = triedl->pt_next; + }; + + return (B_FALSE); +} + +/* + * Search a mech entry's hardware provider list for the specified + * provider. Return true if found. + */ +static boolean_t +is_valid_provider_for_mech(kcf_provider_desc_t *pd, kcf_mech_entry_t *me, + crypto_func_group_t fg) +{ + kcf_prov_mech_desc_t *prov_chain; + + prov_chain = me->me_hw_prov_chain; + if (prov_chain != NULL) { + ASSERT(me->me_num_hwprov > 0); + for (; prov_chain != NULL; prov_chain = prov_chain->pm_next) { + if (prov_chain->pm_prov_desc == pd && + IS_FG_SUPPORTED(prov_chain, fg)) { + return (B_TRUE); + } + } + } + return (B_FALSE); +} + +/* + * This routine, given a logical provider, returns the least loaded + * provider belonging to the logical provider. The provider must be + * able to do the specified mechanism, i.e. check that the mechanism + * hasn't been disabled. In addition, just in case providers are not + * entirely equivalent, the provider's entry point is checked for + * non-nullness. This is accomplished by having the caller pass, as + * arguments, the offset of the function group (offset_1), and the + * offset of the function within the function group (offset_2). + * Returns NULL if no provider can be found. + */ +int +kcf_get_hardware_provider(crypto_mech_type_t mech_type_1, + crypto_mech_type_t mech_type_2, boolean_t call_restrict, + kcf_provider_desc_t *old, kcf_provider_desc_t **new, crypto_func_group_t fg) +{ + kcf_provider_desc_t *provider, *real_pd = old; + kcf_provider_desc_t *gpd = NULL; /* good provider */ + kcf_provider_desc_t *bpd = NULL; /* busy provider */ + kcf_provider_list_t *p; + kcf_ops_class_t class; + kcf_mech_entry_t *me; + kcf_mech_entry_tab_t *me_tab; + int index, len, gqlen = INT_MAX, rv = CRYPTO_SUCCESS; + + /* get the mech entry for the specified mechanism */ + class = KCF_MECH2CLASS(mech_type_1); + if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) { + return (CRYPTO_MECHANISM_INVALID); + } + + me_tab = &kcf_mech_tabs_tab[class]; + index = KCF_MECH2INDEX(mech_type_1); + if ((index < 0) || (index >= me_tab->met_size)) { + return (CRYPTO_MECHANISM_INVALID); + } + + me = &((me_tab->met_tab)[index]); + mutex_enter(&me->me_mutex); + + /* + * We assume the provider descriptor will not go away because + * it is being held somewhere, i.e. its reference count has been + * incremented. In the case of the crypto module, the provider + * descriptor is held by the session structure. + */ + if (old->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + if (old->pd_provider_list == NULL) { + real_pd = NULL; + rv = CRYPTO_DEVICE_ERROR; + goto out; + } + /* + * Find the least loaded real provider. KCF_PROV_LOAD gives + * the load (number of pending requests) of the provider. 
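+ * As a hedged C sketch of the walk below (illustrative only; usable()
+ * stands in for the restriction, mechanism, and KCF_PROV_READY checks
+ * that are done inline):
+ *
+ *	for (p = old->pd_provider_list; p != NULL; p = p->pl_next) {
+ *		len = KCF_PROV_LOAD(p->pl_provider);
+ *		if (usable(p->pl_provider) && len < gqlen) {
+ *			gqlen = len;
+ *			gpd = p->pl_provider;
+ *		}
+ *	}
+ *
+ * A BUSY (not READY) provider is remembered separately in bpd and is
+ * used only if no READY provider qualifies.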
+ */ + mutex_enter(&old->pd_lock); + p = old->pd_provider_list; + while (p != NULL) { + provider = p->pl_provider; + + ASSERT(provider->pd_prov_type != + CRYPTO_LOGICAL_PROVIDER); + + if (call_restrict && + (provider->pd_flags & KCF_PROV_RESTRICTED)) { + p = p->pl_next; + continue; + } + + if (!is_valid_provider_for_mech(provider, me, fg)) { + p = p->pl_next; + continue; + } + + /* provider does second mech */ + if (mech_type_2 != CRYPTO_MECH_INVALID) { + int i; + + i = KCF_TO_PROV_MECH_INDX(provider, + mech_type_2); + if (i == KCF_INVALID_INDX) { + p = p->pl_next; + continue; + } + } + + if (provider->pd_state != KCF_PROV_READY) { + /* choose BUSY if no READY providers */ + if (provider->pd_state == KCF_PROV_BUSY) + bpd = provider; + p = p->pl_next; + continue; + } + + len = KCF_PROV_LOAD(provider); + if (len < gqlen) { + gqlen = len; + gpd = provider; + } + + p = p->pl_next; + } + + if (gpd != NULL) { + real_pd = gpd; + KCF_PROV_REFHOLD(real_pd); + } else if (bpd != NULL) { + real_pd = bpd; + KCF_PROV_REFHOLD(real_pd); + } else { + /* can't find provider */ + real_pd = NULL; + rv = CRYPTO_MECHANISM_INVALID; + } + mutex_exit(&old->pd_lock); + + } else { + if (!KCF_IS_PROV_USABLE(old) || + (call_restrict && (old->pd_flags & KCF_PROV_RESTRICTED))) { + real_pd = NULL; + rv = CRYPTO_DEVICE_ERROR; + goto out; + } + + if (!is_valid_provider_for_mech(old, me, fg)) { + real_pd = NULL; + rv = CRYPTO_MECHANISM_INVALID; + goto out; + } + + KCF_PROV_REFHOLD(real_pd); + } +out: + mutex_exit(&me->me_mutex); + *new = real_pd; + return (rv); +} + +/* + * Return the best provider for the specified mechanism. The provider + * is held and it is the caller's responsibility to release it when done. + * The fg input argument is used as a search criterion to pick a provider. + * A provider has to support this function group to be picked. + * + * Find the least loaded provider in the list of providers. We do a linear + * search to find one. This is fine as we assume there are only a few + * number of providers in this list. If this assumption ever changes, + * we should revisit this. + * + * call_restrict represents if the caller should not be allowed to + * use restricted providers. + */ +kcf_provider_desc_t * +kcf_get_mech_provider(crypto_mech_type_t mech_type, kcf_mech_entry_t **mepp, + int *error, kcf_prov_tried_t *triedl, crypto_func_group_t fg, + boolean_t call_restrict, size_t data_size) +{ + kcf_provider_desc_t *pd = NULL, *gpd = NULL; + kcf_prov_mech_desc_t *prov_chain, *mdesc; + int len, gqlen = INT_MAX; + kcf_ops_class_t class; + int index; + kcf_mech_entry_t *me; + kcf_mech_entry_tab_t *me_tab; + + class = KCF_MECH2CLASS(mech_type); + if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) { + *error = CRYPTO_MECHANISM_INVALID; + return (NULL); + } + + me_tab = &kcf_mech_tabs_tab[class]; + index = KCF_MECH2INDEX(mech_type); + if ((index < 0) || (index >= me_tab->met_size)) { + *error = CRYPTO_MECHANISM_INVALID; + return (NULL); + } + + me = &((me_tab->met_tab)[index]); + if (mepp != NULL) + *mepp = me; + + mutex_enter(&me->me_mutex); + + prov_chain = me->me_hw_prov_chain; + + /* + * We check for the threshold for using a hardware provider for + * this amount of data. If there is no software provider available + * for the mechanism, then the threshold is ignored. 
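+ * For a concrete (purely illustrative) reading: with me_threshold set
+ * to 512, a 64-byte request goes to a usable software provider, while
+ * a 4096-byte request, or one whose size is unknown (data_size == 0),
+ * is offered to the hardware chain first.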
+ */ + if ((prov_chain != NULL) && + ((data_size == 0) || (me->me_threshold == 0) || + (data_size >= me->me_threshold) || + ((mdesc = me->me_sw_prov) == NULL) || + (!IS_FG_SUPPORTED(mdesc, fg)) || + (!KCF_IS_PROV_USABLE(mdesc->pm_prov_desc)))) { + ASSERT(me->me_num_hwprov > 0); + /* there is at least one provider */ + + /* + * Find the least loaded real provider. KCF_PROV_LOAD gives + * the load (number of pending requests) of the provider. + */ + while (prov_chain != NULL) { + pd = prov_chain->pm_prov_desc; + + if (!IS_FG_SUPPORTED(prov_chain, fg) || + !KCF_IS_PROV_USABLE(pd) || + IS_PROVIDER_TRIED(pd, triedl) || + (call_restrict && + (pd->pd_flags & KCF_PROV_RESTRICTED))) { + prov_chain = prov_chain->pm_next; + continue; + } + + if ((len = KCF_PROV_LOAD(pd)) < gqlen) { + gqlen = len; + gpd = pd; + } + + prov_chain = prov_chain->pm_next; + } + + pd = gpd; + } + + /* No HW provider for this mech, is there a SW provider? */ + if (pd == NULL && (mdesc = me->me_sw_prov) != NULL) { + pd = mdesc->pm_prov_desc; + if (!IS_FG_SUPPORTED(mdesc, fg) || + !KCF_IS_PROV_USABLE(pd) || + IS_PROVIDER_TRIED(pd, triedl) || + (call_restrict && (pd->pd_flags & KCF_PROV_RESTRICTED))) + pd = NULL; + } + + if (pd == NULL) { + /* + * We do not want to report CRYPTO_MECH_NOT_SUPPORTED, when + * we are in the "fallback to the next provider" case. Rather + * we preserve the error, so that the client gets the right + * error code. + */ + if (triedl == NULL) + *error = CRYPTO_MECH_NOT_SUPPORTED; + } else + KCF_PROV_REFHOLD(pd); + + mutex_exit(&me->me_mutex); + return (pd); +} + +/* + * Very similar to kcf_get_mech_provider(). Finds the best provider capable of + * a dual operation with both me1 and me2. + * When no dual-ops capable providers are available, return the best provider + * for me1 only, and sets *prov_mt2 to CRYPTO_INVALID_MECHID; + * We assume/expect that a slower HW capable of the dual is still + * faster than the 2 fastest providers capable of the individual ops + * separately. + */ +kcf_provider_desc_t * +kcf_get_dual_provider(crypto_mechanism_t *mech1, crypto_mechanism_t *mech2, + kcf_mech_entry_t **mepp, crypto_mech_type_t *prov_mt1, + crypto_mech_type_t *prov_mt2, int *error, kcf_prov_tried_t *triedl, + crypto_func_group_t fg1, crypto_func_group_t fg2, boolean_t call_restrict, + size_t data_size) +{ + kcf_provider_desc_t *pd = NULL, *pdm1 = NULL, *pdm1m2 = NULL; + kcf_prov_mech_desc_t *prov_chain, *mdesc; + int len, gqlen = INT_MAX, dgqlen = INT_MAX; + crypto_mech_info_list_t *mil; + crypto_mech_type_t m2id = mech2->cm_type; + kcf_mech_entry_t *me; + + /* when mech is a valid mechanism, me will be its mech_entry */ + if (kcf_get_mech_entry(mech1->cm_type, &me) != KCF_SUCCESS) { + *error = CRYPTO_MECHANISM_INVALID; + return (NULL); + } + + *prov_mt2 = CRYPTO_MECH_INVALID; + + if (mepp != NULL) + *mepp = me; + mutex_enter(&me->me_mutex); + + prov_chain = me->me_hw_prov_chain; + /* + * We check the threshold for using a hardware provider for + * this amount of data. If there is no software provider available + * for the first mechanism, then the threshold is ignored. 
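+ * The walk below keeps two candidates: pdm1m2, the least loaded
+ * provider capable of both me1 and me2 (preferred, per the assumption
+ * above that one dual-capable provider beats two separate ones), and
+ * pdm1, the least loaded provider capable of me1 alone, used as the
+ * fallback with *prov_mt2 left at CRYPTO_MECH_INVALID.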
+ */ + if ((prov_chain != NULL) && + ((data_size == 0) || (me->me_threshold == 0) || + (data_size >= me->me_threshold) || + ((mdesc = me->me_sw_prov) == NULL) || + (!IS_FG_SUPPORTED(mdesc, fg1)) || + (!KCF_IS_PROV_USABLE(mdesc->pm_prov_desc)))) { + /* there is at least one provider */ + ASSERT(me->me_num_hwprov > 0); + + /* + * Find the least loaded provider capable of the combo + * me1 + me2, and save a pointer to the least loaded + * provider capable of me1 only. + */ + while (prov_chain != NULL) { + pd = prov_chain->pm_prov_desc; + len = KCF_PROV_LOAD(pd); + + if (!IS_FG_SUPPORTED(prov_chain, fg1) || + !KCF_IS_PROV_USABLE(pd) || + IS_PROVIDER_TRIED(pd, triedl) || + (call_restrict && + (pd->pd_flags & KCF_PROV_RESTRICTED))) { + prov_chain = prov_chain->pm_next; + continue; + } + + /* Save the best provider capable of m1 */ + if (len < gqlen) { + *prov_mt1 = + prov_chain->pm_mech_info.cm_mech_number; + gqlen = len; + pdm1 = pd; + } + + /* See if pd can do me2 too */ + for (mil = prov_chain->pm_mi_list; + mil != NULL; mil = mil->ml_next) { + if ((mil->ml_mech_info.cm_func_group_mask & + fg2) == 0) + continue; + + if ((mil->ml_kcf_mechid == m2id) && + (len < dgqlen)) { + /* Bingo! */ + dgqlen = len; + pdm1m2 = pd; + *prov_mt2 = + mil->ml_mech_info.cm_mech_number; + *prov_mt1 = prov_chain-> + pm_mech_info.cm_mech_number; + break; + } + } + + prov_chain = prov_chain->pm_next; + } + + pd = (pdm1m2 != NULL) ? pdm1m2 : pdm1; + } + + /* no HW provider for this mech, is there a SW provider? */ + if (pd == NULL && (mdesc = me->me_sw_prov) != NULL) { + pd = mdesc->pm_prov_desc; + if (!IS_FG_SUPPORTED(mdesc, fg1) || + !KCF_IS_PROV_USABLE(pd) || + IS_PROVIDER_TRIED(pd, triedl) || + (call_restrict && (pd->pd_flags & KCF_PROV_RESTRICTED))) + pd = NULL; + else { + /* See if pd can do me2 too */ + for (mil = me->me_sw_prov->pm_mi_list; + mil != NULL; mil = mil->ml_next) { + if ((mil->ml_mech_info.cm_func_group_mask & + fg2) == 0) + continue; + + if (mil->ml_kcf_mechid == m2id) { + /* Bingo! */ + *prov_mt2 = + mil->ml_mech_info.cm_mech_number; + break; + } + } + *prov_mt1 = me->me_sw_prov->pm_mech_info.cm_mech_number; + } + } + + if (pd == NULL) + *error = CRYPTO_MECH_NOT_SUPPORTED; + else + KCF_PROV_REFHOLD(pd); + + mutex_exit(&me->me_mutex); + return (pd); +} + +/* + * Do the actual work of calling the provider routines. + * + * pd - Provider structure + * ctx - Context for this operation + * params - Parameters for this operation + * rhndl - Request handle to use for notification + * + * The return values are the same as that of the respective SPI. + */ +int +common_submit_request(kcf_provider_desc_t *pd, crypto_ctx_t *ctx, + kcf_req_params_t *params, crypto_req_handle_t rhndl) +{ + int err = CRYPTO_ARGUMENTS_BAD; + kcf_op_type_t optype; + + optype = params->rp_optype; + + switch (params->rp_opgrp) { + case KCF_OG_DIGEST: { + kcf_digest_ops_params_t *dops = ¶ms->rp_u.digest_params; + + switch (optype) { + case KCF_OP_INIT: + /* + * We should do this only here and not in KCF_WRAP_* + * macros. This is because we may want to try other + * providers, in case we recover from a failure. 
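+ * That is (a hedged reading): the framework-wide mechanism type is
+ * translated to the chosen provider's private mechanism number right
+ * before each submission, so a retry against a different provider
+ * re-translates for that provider instead of reusing a stale number.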
+ */ + KCF_SET_PROVIDER_MECHNUM(dops->do_framework_mechtype, + pd, &dops->do_mech); + + err = KCF_PROV_DIGEST_INIT(pd, ctx, &dops->do_mech, + rhndl); + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_DIGEST(pd, ctx, dops->do_data, + dops->do_digest, rhndl); + break; + + case KCF_OP_UPDATE: + err = KCF_PROV_DIGEST_UPDATE(pd, ctx, + dops->do_data, rhndl); + break; + + case KCF_OP_FINAL: + err = KCF_PROV_DIGEST_FINAL(pd, ctx, + dops->do_digest, rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(dops->do_framework_mechtype, + pd, &dops->do_mech); + err = KCF_PROV_DIGEST_ATOMIC(pd, dops->do_sid, + &dops->do_mech, dops->do_data, dops->do_digest, + rhndl); + break; + + case KCF_OP_DIGEST_KEY: + err = KCF_PROV_DIGEST_KEY(pd, ctx, dops->do_digest_key, + rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_MAC: { + kcf_mac_ops_params_t *mops = ¶ms->rp_u.mac_params; + + switch (optype) { + case KCF_OP_INIT: + KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype, + pd, &mops->mo_mech); + + err = KCF_PROV_MAC_INIT(pd, ctx, &mops->mo_mech, + mops->mo_key, mops->mo_templ, rhndl); + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_MAC(pd, ctx, mops->mo_data, + mops->mo_mac, rhndl); + break; + + case KCF_OP_UPDATE: + err = KCF_PROV_MAC_UPDATE(pd, ctx, mops->mo_data, + rhndl); + break; + + case KCF_OP_FINAL: + err = KCF_PROV_MAC_FINAL(pd, ctx, mops->mo_mac, rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype, + pd, &mops->mo_mech); + + err = KCF_PROV_MAC_ATOMIC(pd, mops->mo_sid, + &mops->mo_mech, mops->mo_key, mops->mo_data, + mops->mo_mac, mops->mo_templ, rhndl); + break; + + case KCF_OP_MAC_VERIFY_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(mops->mo_framework_mechtype, + pd, &mops->mo_mech); + + err = KCF_PROV_MAC_VERIFY_ATOMIC(pd, mops->mo_sid, + &mops->mo_mech, mops->mo_key, mops->mo_data, + mops->mo_mac, mops->mo_templ, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_ENCRYPT: { + kcf_encrypt_ops_params_t *eops = ¶ms->rp_u.encrypt_params; + + switch (optype) { + case KCF_OP_INIT: + KCF_SET_PROVIDER_MECHNUM(eops->eo_framework_mechtype, + pd, &eops->eo_mech); + + err = KCF_PROV_ENCRYPT_INIT(pd, ctx, &eops->eo_mech, + eops->eo_key, eops->eo_templ, rhndl); + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_ENCRYPT(pd, ctx, eops->eo_plaintext, + eops->eo_ciphertext, rhndl); + break; + + case KCF_OP_UPDATE: + err = KCF_PROV_ENCRYPT_UPDATE(pd, ctx, + eops->eo_plaintext, eops->eo_ciphertext, rhndl); + break; + + case KCF_OP_FINAL: + err = KCF_PROV_ENCRYPT_FINAL(pd, ctx, + eops->eo_ciphertext, rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(eops->eo_framework_mechtype, + pd, &eops->eo_mech); + + err = KCF_PROV_ENCRYPT_ATOMIC(pd, eops->eo_sid, + &eops->eo_mech, eops->eo_key, eops->eo_plaintext, + eops->eo_ciphertext, eops->eo_templ, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_DECRYPT: { + kcf_decrypt_ops_params_t *dcrops = ¶ms->rp_u.decrypt_params; + + switch (optype) { + case KCF_OP_INIT: + KCF_SET_PROVIDER_MECHNUM(dcrops->dop_framework_mechtype, + pd, &dcrops->dop_mech); + + err = KCF_PROV_DECRYPT_INIT(pd, ctx, &dcrops->dop_mech, + dcrops->dop_key, dcrops->dop_templ, rhndl); + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_DECRYPT(pd, ctx, dcrops->dop_ciphertext, + dcrops->dop_plaintext, rhndl); + break; + + case KCF_OP_UPDATE: + err = KCF_PROV_DECRYPT_UPDATE(pd, ctx, + 
dcrops->dop_ciphertext, dcrops->dop_plaintext, + rhndl); + break; + + case KCF_OP_FINAL: + err = KCF_PROV_DECRYPT_FINAL(pd, ctx, + dcrops->dop_plaintext, rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(dcrops->dop_framework_mechtype, + pd, &dcrops->dop_mech); + + err = KCF_PROV_DECRYPT_ATOMIC(pd, dcrops->dop_sid, + &dcrops->dop_mech, dcrops->dop_key, + dcrops->dop_ciphertext, dcrops->dop_plaintext, + dcrops->dop_templ, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_SIGN: { + kcf_sign_ops_params_t *sops = ¶ms->rp_u.sign_params; + + switch (optype) { + case KCF_OP_INIT: + KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype, + pd, &sops->so_mech); + + err = KCF_PROV_SIGN_INIT(pd, ctx, &sops->so_mech, + sops->so_key, sops->so_templ, rhndl); + break; + + case KCF_OP_SIGN_RECOVER_INIT: + KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype, + pd, &sops->so_mech); + + err = KCF_PROV_SIGN_RECOVER_INIT(pd, ctx, + &sops->so_mech, sops->so_key, sops->so_templ, + rhndl); + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_SIGN(pd, ctx, sops->so_data, + sops->so_signature, rhndl); + break; + + case KCF_OP_SIGN_RECOVER: + err = KCF_PROV_SIGN_RECOVER(pd, ctx, + sops->so_data, sops->so_signature, rhndl); + break; + + case KCF_OP_UPDATE: + err = KCF_PROV_SIGN_UPDATE(pd, ctx, sops->so_data, + rhndl); + break; + + case KCF_OP_FINAL: + err = KCF_PROV_SIGN_FINAL(pd, ctx, sops->so_signature, + rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype, + pd, &sops->so_mech); + + err = KCF_PROV_SIGN_ATOMIC(pd, sops->so_sid, + &sops->so_mech, sops->so_key, sops->so_data, + sops->so_templ, sops->so_signature, rhndl); + break; + + case KCF_OP_SIGN_RECOVER_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(sops->so_framework_mechtype, + pd, &sops->so_mech); + + err = KCF_PROV_SIGN_RECOVER_ATOMIC(pd, sops->so_sid, + &sops->so_mech, sops->so_key, sops->so_data, + sops->so_templ, sops->so_signature, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_VERIFY: { + kcf_verify_ops_params_t *vops = ¶ms->rp_u.verify_params; + + switch (optype) { + case KCF_OP_INIT: + KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype, + pd, &vops->vo_mech); + + err = KCF_PROV_VERIFY_INIT(pd, ctx, &vops->vo_mech, + vops->vo_key, vops->vo_templ, rhndl); + break; + + case KCF_OP_VERIFY_RECOVER_INIT: + KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype, + pd, &vops->vo_mech); + + err = KCF_PROV_VERIFY_RECOVER_INIT(pd, ctx, + &vops->vo_mech, vops->vo_key, vops->vo_templ, + rhndl); + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_VERIFY(pd, ctx, vops->vo_data, + vops->vo_signature, rhndl); + break; + + case KCF_OP_VERIFY_RECOVER: + err = KCF_PROV_VERIFY_RECOVER(pd, ctx, + vops->vo_signature, vops->vo_data, rhndl); + break; + + case KCF_OP_UPDATE: + err = KCF_PROV_VERIFY_UPDATE(pd, ctx, vops->vo_data, + rhndl); + break; + + case KCF_OP_FINAL: + err = KCF_PROV_VERIFY_FINAL(pd, ctx, vops->vo_signature, + rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype, + pd, &vops->vo_mech); + + err = KCF_PROV_VERIFY_ATOMIC(pd, vops->vo_sid, + &vops->vo_mech, vops->vo_key, vops->vo_data, + vops->vo_templ, vops->vo_signature, rhndl); + break; + + case KCF_OP_VERIFY_RECOVER_ATOMIC: + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(vops->vo_framework_mechtype, + pd, &vops->vo_mech); + + err = KCF_PROV_VERIFY_RECOVER_ATOMIC(pd, vops->vo_sid, + 
&vops->vo_mech, vops->vo_key, vops->vo_signature, + vops->vo_templ, vops->vo_data, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_ENCRYPT_MAC: { + kcf_encrypt_mac_ops_params_t *eops = + ¶ms->rp_u.encrypt_mac_params; + kcf_context_t *kcf_secondctx; + + switch (optype) { + case KCF_OP_INIT: + kcf_secondctx = ((kcf_context_t *) + (ctx->cc_framework_private))->kc_secondctx; + + if (kcf_secondctx != NULL) { + err = kcf_emulate_dual(pd, ctx, params); + break; + } + KCF_SET_PROVIDER_MECHNUM( + eops->em_framework_encr_mechtype, + pd, &eops->em_encr_mech); + + KCF_SET_PROVIDER_MECHNUM( + eops->em_framework_mac_mechtype, + pd, &eops->em_mac_mech); + + err = KCF_PROV_ENCRYPT_MAC_INIT(pd, ctx, + &eops->em_encr_mech, eops->em_encr_key, + &eops->em_mac_mech, eops->em_mac_key, + eops->em_encr_templ, eops->em_mac_templ, + rhndl); + + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_ENCRYPT_MAC(pd, ctx, + eops->em_plaintext, eops->em_ciphertext, + eops->em_mac, rhndl); + break; + + case KCF_OP_UPDATE: + kcf_secondctx = ((kcf_context_t *) + (ctx->cc_framework_private))->kc_secondctx; + if (kcf_secondctx != NULL) { + err = kcf_emulate_dual(pd, ctx, params); + break; + } + err = KCF_PROV_ENCRYPT_MAC_UPDATE(pd, ctx, + eops->em_plaintext, eops->em_ciphertext, rhndl); + break; + + case KCF_OP_FINAL: + kcf_secondctx = ((kcf_context_t *) + (ctx->cc_framework_private))->kc_secondctx; + if (kcf_secondctx != NULL) { + err = kcf_emulate_dual(pd, ctx, params); + break; + } + err = KCF_PROV_ENCRYPT_MAC_FINAL(pd, ctx, + eops->em_ciphertext, eops->em_mac, rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + + KCF_SET_PROVIDER_MECHNUM( + eops->em_framework_encr_mechtype, + pd, &eops->em_encr_mech); + + KCF_SET_PROVIDER_MECHNUM( + eops->em_framework_mac_mechtype, + pd, &eops->em_mac_mech); + + err = KCF_PROV_ENCRYPT_MAC_ATOMIC(pd, eops->em_sid, + &eops->em_encr_mech, eops->em_encr_key, + &eops->em_mac_mech, eops->em_mac_key, + eops->em_plaintext, eops->em_ciphertext, + eops->em_mac, + eops->em_encr_templ, eops->em_mac_templ, + rhndl); + + break; + + default: + break; + } + break; + } + + case KCF_OG_MAC_DECRYPT: { + kcf_mac_decrypt_ops_params_t *dops = + ¶ms->rp_u.mac_decrypt_params; + kcf_context_t *kcf_secondctx; + + switch (optype) { + case KCF_OP_INIT: + kcf_secondctx = ((kcf_context_t *) + (ctx->cc_framework_private))->kc_secondctx; + + if (kcf_secondctx != NULL) { + err = kcf_emulate_dual(pd, ctx, params); + break; + } + KCF_SET_PROVIDER_MECHNUM( + dops->md_framework_mac_mechtype, + pd, &dops->md_mac_mech); + + KCF_SET_PROVIDER_MECHNUM( + dops->md_framework_decr_mechtype, + pd, &dops->md_decr_mech); + + err = KCF_PROV_MAC_DECRYPT_INIT(pd, ctx, + &dops->md_mac_mech, dops->md_mac_key, + &dops->md_decr_mech, dops->md_decr_key, + dops->md_mac_templ, dops->md_decr_templ, + rhndl); + + break; + + case KCF_OP_SINGLE: + err = KCF_PROV_MAC_DECRYPT(pd, ctx, + dops->md_ciphertext, dops->md_mac, + dops->md_plaintext, rhndl); + break; + + case KCF_OP_UPDATE: + kcf_secondctx = ((kcf_context_t *) + (ctx->cc_framework_private))->kc_secondctx; + if (kcf_secondctx != NULL) { + err = kcf_emulate_dual(pd, ctx, params); + break; + } + err = KCF_PROV_MAC_DECRYPT_UPDATE(pd, ctx, + dops->md_ciphertext, dops->md_plaintext, rhndl); + break; + + case KCF_OP_FINAL: + kcf_secondctx = ((kcf_context_t *) + (ctx->cc_framework_private))->kc_secondctx; + if (kcf_secondctx != NULL) { + err = kcf_emulate_dual(pd, ctx, params); + break; + } + err = KCF_PROV_MAC_DECRYPT_FINAL(pd, ctx, + dops->md_mac, 
dops->md_plaintext, rhndl); + break; + + case KCF_OP_ATOMIC: + ASSERT(ctx == NULL); + + KCF_SET_PROVIDER_MECHNUM( + dops->md_framework_mac_mechtype, + pd, &dops->md_mac_mech); + + KCF_SET_PROVIDER_MECHNUM( + dops->md_framework_decr_mechtype, + pd, &dops->md_decr_mech); + + err = KCF_PROV_MAC_DECRYPT_ATOMIC(pd, dops->md_sid, + &dops->md_mac_mech, dops->md_mac_key, + &dops->md_decr_mech, dops->md_decr_key, + dops->md_ciphertext, dops->md_mac, + dops->md_plaintext, + dops->md_mac_templ, dops->md_decr_templ, + rhndl); + + break; + + case KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC: + ASSERT(ctx == NULL); + + KCF_SET_PROVIDER_MECHNUM( + dops->md_framework_mac_mechtype, + pd, &dops->md_mac_mech); + + KCF_SET_PROVIDER_MECHNUM( + dops->md_framework_decr_mechtype, + pd, &dops->md_decr_mech); + + err = KCF_PROV_MAC_VERIFY_DECRYPT_ATOMIC(pd, + dops->md_sid, &dops->md_mac_mech, dops->md_mac_key, + &dops->md_decr_mech, dops->md_decr_key, + dops->md_ciphertext, dops->md_mac, + dops->md_plaintext, + dops->md_mac_templ, dops->md_decr_templ, + rhndl); + + break; + + default: + break; + } + break; + } + + case KCF_OG_KEY: { + kcf_key_ops_params_t *kops = ¶ms->rp_u.key_params; + + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(kops->ko_framework_mechtype, pd, + &kops->ko_mech); + + switch (optype) { + case KCF_OP_KEY_GENERATE: + err = KCF_PROV_KEY_GENERATE(pd, kops->ko_sid, + &kops->ko_mech, + kops->ko_key_template, kops->ko_key_attribute_count, + kops->ko_key_object_id_ptr, rhndl); + break; + + case KCF_OP_KEY_GENERATE_PAIR: + err = KCF_PROV_KEY_GENERATE_PAIR(pd, kops->ko_sid, + &kops->ko_mech, + kops->ko_key_template, kops->ko_key_attribute_count, + kops->ko_private_key_template, + kops->ko_private_key_attribute_count, + kops->ko_key_object_id_ptr, + kops->ko_private_key_object_id_ptr, rhndl); + break; + + case KCF_OP_KEY_WRAP: + err = KCF_PROV_KEY_WRAP(pd, kops->ko_sid, + &kops->ko_mech, + kops->ko_key, kops->ko_key_object_id_ptr, + kops->ko_wrapped_key, kops->ko_wrapped_key_len_ptr, + rhndl); + break; + + case KCF_OP_KEY_UNWRAP: + err = KCF_PROV_KEY_UNWRAP(pd, kops->ko_sid, + &kops->ko_mech, + kops->ko_key, kops->ko_wrapped_key, + kops->ko_wrapped_key_len_ptr, + kops->ko_key_template, kops->ko_key_attribute_count, + kops->ko_key_object_id_ptr, rhndl); + break; + + case KCF_OP_KEY_DERIVE: + err = KCF_PROV_KEY_DERIVE(pd, kops->ko_sid, + &kops->ko_mech, + kops->ko_key, kops->ko_key_template, + kops->ko_key_attribute_count, + kops->ko_key_object_id_ptr, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_RANDOM: { + kcf_random_number_ops_params_t *rops = + ¶ms->rp_u.random_number_params; + + ASSERT(ctx == NULL); + + switch (optype) { + case KCF_OP_RANDOM_SEED: + err = KCF_PROV_SEED_RANDOM(pd, rops->rn_sid, + rops->rn_buf, rops->rn_buflen, rops->rn_entropy_est, + rops->rn_flags, rhndl); + break; + + case KCF_OP_RANDOM_GENERATE: + err = KCF_PROV_GENERATE_RANDOM(pd, rops->rn_sid, + rops->rn_buf, rops->rn_buflen, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_SESSION: { + kcf_session_ops_params_t *sops = ¶ms->rp_u.session_params; + + ASSERT(ctx == NULL); + switch (optype) { + case KCF_OP_SESSION_OPEN: + /* + * so_pd may be a logical provider, in which case + * we need to check whether it has been removed. 
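+			 * (A logical provider only routes requests to the
+			 * real providers grouped beneath it, so it can be
+			 * removed independently of the provider actually
+			 * doing the work; KCF_IS_PROV_REMOVED() catches
+			 * that case.)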
+ */ + if (KCF_IS_PROV_REMOVED(sops->so_pd)) { + err = CRYPTO_DEVICE_ERROR; + break; + } + err = KCF_PROV_SESSION_OPEN(pd, sops->so_sid_ptr, + rhndl, sops->so_pd); + break; + + case KCF_OP_SESSION_CLOSE: + /* + * so_pd may be a logical provider, in which case + * we need to check whether it has been removed. + */ + if (KCF_IS_PROV_REMOVED(sops->so_pd)) { + err = CRYPTO_DEVICE_ERROR; + break; + } + err = KCF_PROV_SESSION_CLOSE(pd, sops->so_sid, + rhndl, sops->so_pd); + break; + + case KCF_OP_SESSION_LOGIN: + err = KCF_PROV_SESSION_LOGIN(pd, sops->so_sid, + sops->so_user_type, sops->so_pin, + sops->so_pin_len, rhndl); + break; + + case KCF_OP_SESSION_LOGOUT: + err = KCF_PROV_SESSION_LOGOUT(pd, sops->so_sid, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_OBJECT: { + kcf_object_ops_params_t *jops = ¶ms->rp_u.object_params; + + ASSERT(ctx == NULL); + switch (optype) { + case KCF_OP_OBJECT_CREATE: + err = KCF_PROV_OBJECT_CREATE(pd, jops->oo_sid, + jops->oo_template, jops->oo_attribute_count, + jops->oo_object_id_ptr, rhndl); + break; + + case KCF_OP_OBJECT_COPY: + err = KCF_PROV_OBJECT_COPY(pd, jops->oo_sid, + jops->oo_object_id, + jops->oo_template, jops->oo_attribute_count, + jops->oo_object_id_ptr, rhndl); + break; + + case KCF_OP_OBJECT_DESTROY: + err = KCF_PROV_OBJECT_DESTROY(pd, jops->oo_sid, + jops->oo_object_id, rhndl); + break; + + case KCF_OP_OBJECT_GET_SIZE: + err = KCF_PROV_OBJECT_GET_SIZE(pd, jops->oo_sid, + jops->oo_object_id, jops->oo_object_size, rhndl); + break; + + case KCF_OP_OBJECT_GET_ATTRIBUTE_VALUE: + err = KCF_PROV_OBJECT_GET_ATTRIBUTE_VALUE(pd, + jops->oo_sid, jops->oo_object_id, + jops->oo_template, jops->oo_attribute_count, rhndl); + break; + + case KCF_OP_OBJECT_SET_ATTRIBUTE_VALUE: + err = KCF_PROV_OBJECT_SET_ATTRIBUTE_VALUE(pd, + jops->oo_sid, jops->oo_object_id, + jops->oo_template, jops->oo_attribute_count, rhndl); + break; + + case KCF_OP_OBJECT_FIND_INIT: + err = KCF_PROV_OBJECT_FIND_INIT(pd, jops->oo_sid, + jops->oo_template, jops->oo_attribute_count, + jops->oo_find_init_pp_ptr, rhndl); + break; + + case KCF_OP_OBJECT_FIND: + err = KCF_PROV_OBJECT_FIND(pd, jops->oo_find_pp, + jops->oo_object_id_ptr, jops->oo_max_object_count, + jops->oo_object_count_ptr, rhndl); + break; + + case KCF_OP_OBJECT_FIND_FINAL: + err = KCF_PROV_OBJECT_FIND_FINAL(pd, jops->oo_find_pp, + rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_PROVMGMT: { + kcf_provmgmt_ops_params_t *pops = ¶ms->rp_u.provmgmt_params; + + ASSERT(ctx == NULL); + switch (optype) { + case KCF_OP_MGMT_EXTINFO: + /* + * po_pd may be a logical provider, in which case + * we need to check whether it has been removed. 
+ */ + if (KCF_IS_PROV_REMOVED(pops->po_pd)) { + err = CRYPTO_DEVICE_ERROR; + break; + } + err = KCF_PROV_EXT_INFO(pd, pops->po_ext_info, rhndl, + pops->po_pd); + break; + + case KCF_OP_MGMT_INITTOKEN: + err = KCF_PROV_INIT_TOKEN(pd, pops->po_pin, + pops->po_pin_len, pops->po_label, rhndl); + break; + + case KCF_OP_MGMT_INITPIN: + err = KCF_PROV_INIT_PIN(pd, pops->po_sid, pops->po_pin, + pops->po_pin_len, rhndl); + break; + + case KCF_OP_MGMT_SETPIN: + err = KCF_PROV_SET_PIN(pd, pops->po_sid, + pops->po_old_pin, pops->po_old_pin_len, + pops->po_pin, pops->po_pin_len, rhndl); + break; + + default: + break; + } + break; + } + + case KCF_OG_NOSTORE_KEY: { + kcf_key_ops_params_t *kops = ¶ms->rp_u.key_params; + + ASSERT(ctx == NULL); + KCF_SET_PROVIDER_MECHNUM(kops->ko_framework_mechtype, pd, + &kops->ko_mech); + + switch (optype) { + case KCF_OP_KEY_GENERATE: + err = KCF_PROV_NOSTORE_KEY_GENERATE(pd, kops->ko_sid, + &kops->ko_mech, kops->ko_key_template, + kops->ko_key_attribute_count, + kops->ko_out_template1, + kops->ko_out_attribute_count1, rhndl); + break; + + case KCF_OP_KEY_GENERATE_PAIR: + err = KCF_PROV_NOSTORE_KEY_GENERATE_PAIR(pd, + kops->ko_sid, &kops->ko_mech, + kops->ko_key_template, kops->ko_key_attribute_count, + kops->ko_private_key_template, + kops->ko_private_key_attribute_count, + kops->ko_out_template1, + kops->ko_out_attribute_count1, + kops->ko_out_template2, + kops->ko_out_attribute_count2, + rhndl); + break; + + case KCF_OP_KEY_DERIVE: + err = KCF_PROV_NOSTORE_KEY_DERIVE(pd, kops->ko_sid, + &kops->ko_mech, kops->ko_key, + kops->ko_key_template, + kops->ko_key_attribute_count, + kops->ko_out_template1, + kops->ko_out_attribute_count1, rhndl); + break; + + default: + break; + } + break; + } + default: + break; + } /* end of switch(params->rp_opgrp) */ + + KCF_PROV_INCRSTATS(pd, err); + return (err); +} + + +/* + * Emulate the call for a multipart dual ops with 2 single steps. + * This routine is always called in the context of a working thread + * running kcf_svc_do_run(). + * The single steps are submitted in a pure synchronous way (blocking). + * When this routine returns, kcf_svc_do_run() will call kcf_aop_done() + * so the originating consumer's callback gets invoked. kcf_aop_done() + * takes care of freeing the operation context. So, this routine does + * not free the operation context. + * + * The provider descriptor is assumed held by the callers. 
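+ *
+ * Roughly, an ENCRYPT_MAC KCF_OP_UPDATE is emulated as:
+ *
+ *	1. submit a plain encrypt KCF_OP_UPDATE to 'pd' synchronously;
+ *	2. re-point the dual ciphertext buffer at the portion just
+ *	   produced (dd_offset2/dd_len2) and feed it to crypto_mac_update();
+ *	3. restore dd_offset1/dd_len1 before returning.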
+ */ +static int +kcf_emulate_dual(kcf_provider_desc_t *pd, crypto_ctx_t *ctx, + kcf_req_params_t *params) +{ + int err = CRYPTO_ARGUMENTS_BAD; + kcf_op_type_t optype; + size_t save_len; + off_t save_offset; + + optype = params->rp_optype; + + switch (params->rp_opgrp) { + case KCF_OG_ENCRYPT_MAC: { + kcf_encrypt_mac_ops_params_t *cmops = + ¶ms->rp_u.encrypt_mac_params; + kcf_context_t *encr_kcf_ctx; + crypto_ctx_t *mac_ctx; + kcf_req_params_t encr_params; + + encr_kcf_ctx = (kcf_context_t *)(ctx->cc_framework_private); + + switch (optype) { + case KCF_OP_INIT: { + encr_kcf_ctx->kc_secondctx = NULL; + + KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_INIT, + pd->pd_sid, &cmops->em_encr_mech, + cmops->em_encr_key, NULL, NULL, + cmops->em_encr_templ); + + err = kcf_submit_request(pd, ctx, NULL, &encr_params, + B_FALSE); + + /* It can't be CRYPTO_QUEUED */ + if (err != CRYPTO_SUCCESS) { + break; + } + + err = crypto_mac_init(&cmops->em_mac_mech, + cmops->em_mac_key, cmops->em_mac_templ, + (crypto_context_t *)&mac_ctx, NULL); + + if (err == CRYPTO_SUCCESS) { + encr_kcf_ctx->kc_secondctx = (kcf_context_t *) + mac_ctx->cc_framework_private; + KCF_CONTEXT_REFHOLD((kcf_context_t *) + mac_ctx->cc_framework_private); + } + + break; + + } + case KCF_OP_UPDATE: { + crypto_dual_data_t *ct = cmops->em_ciphertext; + crypto_data_t *pt = cmops->em_plaintext; + kcf_context_t *mac_kcf_ctx = encr_kcf_ctx->kc_secondctx; + crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx; + + KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_UPDATE, + pd->pd_sid, NULL, NULL, pt, (crypto_data_t *)ct, + NULL); + + err = kcf_submit_request(pd, ctx, NULL, &encr_params, + B_FALSE); + + /* It can't be CRYPTO_QUEUED */ + if (err != CRYPTO_SUCCESS) { + break; + } + + save_offset = ct->dd_offset1; + save_len = ct->dd_len1; + if (ct->dd_len2 == 0) { + /* + * The previous encrypt step was an + * accumulation only and didn't produce any + * partial output + */ + if (ct->dd_len1 == 0) + break; + + } else { + ct->dd_offset1 = ct->dd_offset2; + ct->dd_len1 = ct->dd_len2; + } + err = crypto_mac_update((crypto_context_t)mac_ctx, + (crypto_data_t *)ct, NULL); + + ct->dd_offset1 = save_offset; + ct->dd_len1 = save_len; + + break; + } + case KCF_OP_FINAL: { + crypto_dual_data_t *ct = cmops->em_ciphertext; + crypto_data_t *mac = cmops->em_mac; + kcf_context_t *mac_kcf_ctx = encr_kcf_ctx->kc_secondctx; + crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx; + crypto_context_t mac_context = mac_ctx; + + KCF_WRAP_ENCRYPT_OPS_PARAMS(&encr_params, KCF_OP_FINAL, + pd->pd_sid, NULL, NULL, NULL, (crypto_data_t *)ct, + NULL); + + err = kcf_submit_request(pd, ctx, NULL, &encr_params, + B_FALSE); + + /* It can't be CRYPTO_QUEUED */ + if (err != CRYPTO_SUCCESS) { + crypto_cancel_ctx(mac_context); + break; + } + + if (ct->dd_len2 > 0) { + save_offset = ct->dd_offset1; + save_len = ct->dd_len1; + ct->dd_offset1 = ct->dd_offset2; + ct->dd_len1 = ct->dd_len2; + + err = crypto_mac_update(mac_context, + (crypto_data_t *)ct, NULL); + + ct->dd_offset1 = save_offset; + ct->dd_len1 = save_len; + + if (err != CRYPTO_SUCCESS) { + crypto_cancel_ctx(mac_context); + return (err); + } + } + + /* and finally, collect the MAC */ + err = crypto_mac_final(mac_context, mac, NULL); + break; + } + + default: + break; + } + KCF_PROV_INCRSTATS(pd, err); + break; + } + case KCF_OG_MAC_DECRYPT: { + kcf_mac_decrypt_ops_params_t *mdops = + ¶ms->rp_u.mac_decrypt_params; + kcf_context_t *decr_kcf_ctx; + crypto_ctx_t *mac_ctx; + kcf_req_params_t decr_params; + + decr_kcf_ctx = (kcf_context_t 
*)(ctx->cc_framework_private);
+
+		switch (optype) {
+		case KCF_OP_INIT: {
+			decr_kcf_ctx->kc_secondctx = NULL;
+
+			err = crypto_mac_init(&mdops->md_mac_mech,
+			    mdops->md_mac_key, mdops->md_mac_templ,
+			    (crypto_context_t *)&mac_ctx, NULL);
+
+			/* It can't be CRYPTO_QUEUED */
+			if (err != CRYPTO_SUCCESS) {
+				break;
+			}
+
+			KCF_WRAP_DECRYPT_OPS_PARAMS(&decr_params, KCF_OP_INIT,
+			    pd->pd_sid, &mdops->md_decr_mech,
+			    mdops->md_decr_key, NULL, NULL,
+			    mdops->md_decr_templ);
+
+			err = kcf_submit_request(pd, ctx, NULL, &decr_params,
+			    B_FALSE);
+
+			/* It can't be CRYPTO_QUEUED */
+			if (err != CRYPTO_SUCCESS) {
+				crypto_cancel_ctx((crypto_context_t)mac_ctx);
+				break;
+			}
+
+			decr_kcf_ctx->kc_secondctx = (kcf_context_t *)
+			    mac_ctx->cc_framework_private;
+			KCF_CONTEXT_REFHOLD((kcf_context_t *)
+			    mac_ctx->cc_framework_private);
+
+			break;
+		}
+		case KCF_OP_UPDATE: {
+			crypto_dual_data_t *ct = mdops->md_ciphertext;
+			crypto_data_t *pt = mdops->md_plaintext;
+			kcf_context_t *mac_kcf_ctx = decr_kcf_ctx->kc_secondctx;
+			crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+			err = crypto_mac_update((crypto_context_t)mac_ctx,
+			    (crypto_data_t *)ct, NULL);
+
+			if (err != CRYPTO_SUCCESS)
+				break;
+
+			save_offset = ct->dd_offset1;
+			save_len = ct->dd_len1;
+
+			/* zero ct->dd_len2 means decrypt everything */
+			if (ct->dd_len2 > 0) {
+				ct->dd_offset1 = ct->dd_offset2;
+				ct->dd_len1 = ct->dd_len2;
+			}
+
+			err = crypto_decrypt_update((crypto_context_t)ctx,
+			    (crypto_data_t *)ct, pt, NULL);
+
+			ct->dd_offset1 = save_offset;
+			ct->dd_len1 = save_len;
+
+			break;
+		}
+		case KCF_OP_FINAL: {
+			crypto_data_t *pt = mdops->md_plaintext;
+			crypto_data_t *mac = mdops->md_mac;
+			kcf_context_t *mac_kcf_ctx = decr_kcf_ctx->kc_secondctx;
+			crypto_ctx_t *mac_ctx = &mac_kcf_ctx->kc_glbl_ctx;
+
+			err = crypto_mac_final((crypto_context_t)mac_ctx,
+			    mac, NULL);
+
+			if (err != CRYPTO_SUCCESS) {
+				crypto_cancel_ctx(ctx);
+				break;
+			}
+
+			/* Get the last chunk of plaintext */
+			KCF_CONTEXT_REFHOLD(decr_kcf_ctx);
+			err = crypto_decrypt_final((crypto_context_t)ctx, pt,
+			    NULL);
+
+			break;
+		}
+
+		default:
+			break;
+		}
+		break;
+	}
+	default:
+		break;
+
+	} /* end of switch(params->rp_opgrp) */
+
+	return (err);
+}
diff --git a/module/icp/core/kcf_mech_tabs.c b/module/icp/core/kcf_mech_tabs.c
new file mode 100644
index 000000000000..2642b317d698
--- /dev/null
+++ b/module/icp/core/kcf_mech_tabs.c
@@ -0,0 +1,791 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/modhash.h>
+
+/* Cryptographic mechanisms tables and their access functions */
+
+/*
+ * Internal numbers assigned to mechanisms are coded as follows:
+ *
+ * +----------------+----------------+
+ * | mech. class    | mech. index    |
+ * <--- 32-bits --->+<--- 32-bits --->
+ *
+ * The mech_class identifies the table the mechanism belongs to.
+ * mech_index is the index for that mechanism in the table.
+ * A mechanism belongs to exactly 1 table.
+ * The tables are:
+ * . digest_mechs_tab[] for the msg digest mechs.
+ * . cipher_mechs_tab[] for encrypt/decrypt and wrap/unwrap mechs.
+ * . mac_mechs_tab[] for MAC mechs.
+ * . sign_mechs_tab[] for sign & verify mechs.
+ * . keyops_mechs_tab[] for key/key pair generation, and key derivation.
+ * . misc_mechs_tab[] for mechs that don't belong to any of the above.
+ *
+ * There are no holes in the tables.
+ */
+
+/*
+ * Locking conventions:
+ * --------------------
+ * A global mutex, kcf_mech_tabs_lock, serializes writes to the
+ * mechanism table via kcf_create_mech_entry().
+ *
+ * A mutex is associated with every entry of the tables.
+ * The mutex is acquired whenever the entry is accessed for
+ * 1) retrieving the mech_id (comparing the mech name)
+ * 2) finding a provider for an xxx_init() or atomic operation.
+ * 3) altering the mechs entry to add or remove a provider.
+ *
+ * In 2), after a provider is chosen, its prov_desc is held and the
+ * entry's mutex must be dropped. The provider's working function (SPI) is
+ * called outside the mech_entry's mutex.
+ *
+ * The number of providers for a particular mechanism is not expected to be
+ * large enough to justify the cost of using rwlocks, so the per-mechanism
+ * entry mutex won't be very *hot*.
+ *
+ * When both kcf_mech_tabs_lock and a mech_entry mutex need to be held,
+ * kcf_mech_tabs_lock must always be acquired first.
+ *
+ */
+
+ /* Mechanisms tables */
+
+
+/* RFE 4687834 Will deal with the extensibility of these tables later */
+
+kcf_mech_entry_t kcf_digest_mechs_tab[KCF_MAXDIGEST];
+kcf_mech_entry_t kcf_cipher_mechs_tab[KCF_MAXCIPHER];
+kcf_mech_entry_t kcf_mac_mechs_tab[KCF_MAXMAC];
+kcf_mech_entry_t kcf_sign_mechs_tab[KCF_MAXSIGN];
+kcf_mech_entry_t kcf_keyops_mechs_tab[KCF_MAXKEYOPS];
+kcf_mech_entry_t kcf_misc_mechs_tab[KCF_MAXMISC];
+
+kcf_mech_entry_tab_t kcf_mech_tabs_tab[KCF_LAST_OPSCLASS + 1] = {
+	{0, NULL},				/* No class zero */
+	{KCF_MAXDIGEST, kcf_digest_mechs_tab},
+	{KCF_MAXCIPHER, kcf_cipher_mechs_tab},
+	{KCF_MAXMAC, kcf_mac_mechs_tab},
+	{KCF_MAXSIGN, kcf_sign_mechs_tab},
+	{KCF_MAXKEYOPS, kcf_keyops_mechs_tab},
+	{KCF_MAXMISC, kcf_misc_mechs_tab}
+};
+
+/*
+ * Per-algorithm internal thresholds for the minimum input size before
+ * offloading to a hardware provider.
+ * Dispatching a crypto operation to a hardware provider entails paying the
+ * cost of an additional context switch. Measurements with Sun Accelerator 4000
+ * show that 512-byte jobs or smaller are better handled in software.
+ * There is room for refinement here.
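+ *
+ * The provider-selection code earlier in this patch applies the
+ * threshold roughly as follows (illustrative sketch only):
+ *
+ *	if (data_size != 0 && data_size < me->me_threshold &&
+ *	    a usable software provider is registered)
+ *		prefer the software provider;
+ *	else
+ *		pick the least loaded usable hardware provider;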
+ * + */ +int kcf_md5_threshold = 512; +int kcf_sha1_threshold = 512; +int kcf_des_threshold = 512; +int kcf_des3_threshold = 512; +int kcf_aes_threshold = 512; +int kcf_bf_threshold = 512; +int kcf_rc4_threshold = 512; + +kmutex_t kcf_mech_tabs_lock; +static uint32_t kcf_gen_swprov = 0; + +int kcf_mech_hash_size = 256; +mod_hash_t *kcf_mech_hash; /* mech name to id hash */ + +static crypto_mech_type_t +kcf_mech_hash_find(char *mechname) +{ + mod_hash_val_t hv; + crypto_mech_type_t mt; + + mt = CRYPTO_MECH_INVALID; + if (mod_hash_find(kcf_mech_hash, (mod_hash_key_t)mechname, &hv) == 0) { + mt = *(crypto_mech_type_t *)hv; + ASSERT(mt != CRYPTO_MECH_INVALID); + } + + return (mt); +} + +void +kcf_destroy_mech_tabs(void) +{ + int i, max; + kcf_ops_class_t class; + kcf_mech_entry_t *me_tab; + + if (kcf_mech_hash) + mod_hash_destroy_hash(kcf_mech_hash); + + mutex_destroy(&kcf_mech_tabs_lock); + + for (class = KCF_FIRST_OPSCLASS; class <= KCF_LAST_OPSCLASS; class++) { + max = kcf_mech_tabs_tab[class].met_size; + me_tab = kcf_mech_tabs_tab[class].met_tab; + for (i = 0; i < max; i++) + mutex_destroy(&(me_tab[i].me_mutex)); + } +} + +/* + * kcf_init_mech_tabs() + * + * Called by the misc/kcf's _init() routine to initialize the tables + * of mech_entry's. + */ +void +kcf_init_mech_tabs(void) +{ + int i, max; + kcf_ops_class_t class; + kcf_mech_entry_t *me_tab; + + /* Initializes the mutex locks. */ + + mutex_init(&kcf_mech_tabs_lock, NULL, MUTEX_DEFAULT, NULL); + + /* Then the pre-defined mechanism entries */ + + /* Two digests */ + (void) strncpy(kcf_digest_mechs_tab[0].me_name, SUN_CKM_MD5, + CRYPTO_MAX_MECH_NAME); + kcf_digest_mechs_tab[0].me_threshold = kcf_md5_threshold; + + (void) strncpy(kcf_digest_mechs_tab[1].me_name, SUN_CKM_SHA1, + CRYPTO_MAX_MECH_NAME); + kcf_digest_mechs_tab[1].me_threshold = kcf_sha1_threshold; + + /* The symmetric ciphers in various modes */ + (void) strncpy(kcf_cipher_mechs_tab[0].me_name, SUN_CKM_DES_CBC, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[0].me_threshold = kcf_des_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[1].me_name, SUN_CKM_DES3_CBC, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[1].me_threshold = kcf_des3_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[2].me_name, SUN_CKM_DES_ECB, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[2].me_threshold = kcf_des_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[3].me_name, SUN_CKM_DES3_ECB, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[3].me_threshold = kcf_des3_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[4].me_name, SUN_CKM_BLOWFISH_CBC, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[4].me_threshold = kcf_bf_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[5].me_name, SUN_CKM_BLOWFISH_ECB, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[5].me_threshold = kcf_bf_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[6].me_name, SUN_CKM_AES_CBC, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[6].me_threshold = kcf_aes_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[7].me_name, SUN_CKM_AES_ECB, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[7].me_threshold = kcf_aes_threshold; + + (void) strncpy(kcf_cipher_mechs_tab[8].me_name, SUN_CKM_RC4, + CRYPTO_MAX_MECH_NAME); + kcf_cipher_mechs_tab[8].me_threshold = kcf_rc4_threshold; + + + /* 4 HMACs */ + (void) strncpy(kcf_mac_mechs_tab[0].me_name, SUN_CKM_MD5_HMAC, + CRYPTO_MAX_MECH_NAME); + kcf_mac_mechs_tab[0].me_threshold = kcf_md5_threshold; + + (void) strncpy(kcf_mac_mechs_tab[1].me_name, SUN_CKM_MD5_HMAC_GENERAL, + CRYPTO_MAX_MECH_NAME); 
+ kcf_mac_mechs_tab[1].me_threshold = kcf_md5_threshold; + + (void) strncpy(kcf_mac_mechs_tab[2].me_name, SUN_CKM_SHA1_HMAC, + CRYPTO_MAX_MECH_NAME); + kcf_mac_mechs_tab[2].me_threshold = kcf_sha1_threshold; + + (void) strncpy(kcf_mac_mechs_tab[3].me_name, SUN_CKM_SHA1_HMAC_GENERAL, + CRYPTO_MAX_MECH_NAME); + kcf_mac_mechs_tab[3].me_threshold = kcf_sha1_threshold; + + + /* 1 random number generation pseudo mechanism */ + (void) strncpy(kcf_misc_mechs_tab[0].me_name, SUN_RANDOM, + CRYPTO_MAX_MECH_NAME); + + kcf_mech_hash = mod_hash_create_strhash_nodtr("kcf mech2id hash", + kcf_mech_hash_size, mod_hash_null_valdtor); + + for (class = KCF_FIRST_OPSCLASS; class <= KCF_LAST_OPSCLASS; class++) { + max = kcf_mech_tabs_tab[class].met_size; + me_tab = kcf_mech_tabs_tab[class].met_tab; + for (i = 0; i < max; i++) { + mutex_init(&(me_tab[i].me_mutex), NULL, + MUTEX_DEFAULT, NULL); + if (me_tab[i].me_name[0] != 0) { + me_tab[i].me_mechid = KCF_MECHID(class, i); + (void) mod_hash_insert(kcf_mech_hash, + (mod_hash_key_t)me_tab[i].me_name, + (mod_hash_val_t)&(me_tab[i].me_mechid)); + } + } + } +} + +/* + * kcf_create_mech_entry() + * + * Arguments: + * . The class of mechanism. + * . the name of the new mechanism. + * + * Description: + * Creates a new mech_entry for a mechanism not yet known to the + * framework. + * This routine is called by kcf_add_mech_provider, which is + * in turn invoked for each mechanism supported by a provider. + * The'class' argument depends on the crypto_func_group_t bitmask + * in the registering provider's mech_info struct for this mechanism. + * When there is ambiguity in the mapping between the crypto_func_group_t + * and a class (dual ops, ...) the KCF_MISC_CLASS should be used. + * + * Context: + * User context only. + * + * Returns: + * KCF_INVALID_MECH_CLASS or KCF_INVALID_MECH_NAME if the class or + * the mechname is bogus. + * KCF_MECH_TAB_FULL when there is no room left in the mech. tabs. + * KCF_SUCCESS otherwise. + */ +static int +kcf_create_mech_entry(kcf_ops_class_t class, char *mechname) +{ + crypto_mech_type_t mt; + kcf_mech_entry_t *me_tab; + int i = 0, size; + + if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) + return (KCF_INVALID_MECH_CLASS); + + if ((mechname == NULL) || (mechname[0] == 0)) + return (KCF_INVALID_MECH_NAME); + /* + * First check if the mechanism is already in one of the tables. + * The mech_entry could be in another class. + */ + mutex_enter(&kcf_mech_tabs_lock); + mt = kcf_mech_hash_find(mechname); + if (mt != CRYPTO_MECH_INVALID) { + /* Nothing to do, regardless the suggested class. */ + mutex_exit(&kcf_mech_tabs_lock); + return (KCF_SUCCESS); + } + /* Now take the next unused mech entry in the class's tab */ + me_tab = kcf_mech_tabs_tab[class].met_tab; + size = kcf_mech_tabs_tab[class].met_size; + + while (i < size) { + mutex_enter(&(me_tab[i].me_mutex)); + if (me_tab[i].me_name[0] == 0) { + /* Found an empty spot */ + (void) strlcpy(me_tab[i].me_name, mechname, + CRYPTO_MAX_MECH_NAME); + me_tab[i].me_name[CRYPTO_MAX_MECH_NAME-1] = '\0'; + me_tab[i].me_mechid = KCF_MECHID(class, i); + /* + * No a-priori information about the new mechanism, so + * the threshold is set to zero. 
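+			 * (A zero threshold means the size-based preference
+			 * for a software provider never applies, so hardware
+			 * providers are eligible for any input size.)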
+ */ + me_tab[i].me_threshold = 0; + + mutex_exit(&(me_tab[i].me_mutex)); + /* Add the new mechanism to the hash table */ + (void) mod_hash_insert(kcf_mech_hash, + (mod_hash_key_t)me_tab[i].me_name, + (mod_hash_val_t)&(me_tab[i].me_mechid)); + break; + } + mutex_exit(&(me_tab[i].me_mutex)); + i++; + } + + mutex_exit(&kcf_mech_tabs_lock); + + if (i == size) { + return (KCF_MECH_TAB_FULL); + } + + return (KCF_SUCCESS); +} + +/* + * kcf_add_mech_provider() + * + * Arguments: + * . An index in to the provider mechanism array + * . A pointer to the provider descriptor + * . A storage for the kcf_prov_mech_desc_t the entry was added at. + * + * Description: + * Adds a new provider of a mechanism to the mechanism's mech_entry + * chain. + * + * Context: + * User context only. + * + * Returns + * KCF_SUCCESS on success + * KCF_MECH_TAB_FULL otherwise. + */ +int +kcf_add_mech_provider(short mech_indx, + kcf_provider_desc_t *prov_desc, kcf_prov_mech_desc_t **pmdpp) +{ + int error; + kcf_mech_entry_t *mech_entry = NULL; + crypto_mech_info_t *mech_info; + crypto_mech_type_t kcf_mech_type, mt; + kcf_prov_mech_desc_t *prov_mech, *prov_mech2; + crypto_func_group_t simple_fg_mask, dual_fg_mask; + crypto_mech_info_t *dmi; + crypto_mech_info_list_t *mil, *mil2; + kcf_mech_entry_t *me; + int i; + + ASSERT(prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + mech_info = &prov_desc->pd_mechanisms[mech_indx]; + + /* + * A mechanism belongs to exactly one mechanism table. + * Find the class corresponding to the function group flag of + * the mechanism. + */ + kcf_mech_type = kcf_mech_hash_find(mech_info->cm_mech_name); + if (kcf_mech_type == CRYPTO_MECH_INVALID) { + crypto_func_group_t fg = mech_info->cm_func_group_mask; + kcf_ops_class_t class; + + if (fg & CRYPTO_FG_DIGEST || fg & CRYPTO_FG_DIGEST_ATOMIC) + class = KCF_DIGEST_CLASS; + else if (fg & CRYPTO_FG_ENCRYPT || fg & CRYPTO_FG_DECRYPT || + fg & CRYPTO_FG_ENCRYPT_ATOMIC || + fg & CRYPTO_FG_DECRYPT_ATOMIC) + class = KCF_CIPHER_CLASS; + else if (fg & CRYPTO_FG_MAC || fg & CRYPTO_FG_MAC_ATOMIC) + class = KCF_MAC_CLASS; + else if (fg & CRYPTO_FG_SIGN || fg & CRYPTO_FG_VERIFY || + fg & CRYPTO_FG_SIGN_ATOMIC || + fg & CRYPTO_FG_VERIFY_ATOMIC || + fg & CRYPTO_FG_SIGN_RECOVER || + fg & CRYPTO_FG_VERIFY_RECOVER) + class = KCF_SIGN_CLASS; + else if (fg & CRYPTO_FG_GENERATE || + fg & CRYPTO_FG_GENERATE_KEY_PAIR || + fg & CRYPTO_FG_WRAP || fg & CRYPTO_FG_UNWRAP || + fg & CRYPTO_FG_DERIVE) + class = KCF_KEYOPS_CLASS; + else + class = KCF_MISC_CLASS; + + /* + * Attempt to create a new mech_entry for the specified + * mechanism. kcf_create_mech_entry() can handle the case + * where such an entry already exists. 
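+		 * For example, a mechanism registered with CRYPTO_FG_ENCRYPT
+		 * or CRYPTO_FG_DECRYPT (say, an AES-CBC implementation) is
+		 * filed under KCF_CIPHER_CLASS by the mapping above.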
+ */ + if ((error = kcf_create_mech_entry(class, + mech_info->cm_mech_name)) != KCF_SUCCESS) { + return (error); + } + /* get the KCF mech type that was assigned to the mechanism */ + kcf_mech_type = kcf_mech_hash_find(mech_info->cm_mech_name); + ASSERT(kcf_mech_type != CRYPTO_MECH_INVALID); + } + + error = kcf_get_mech_entry(kcf_mech_type, &mech_entry); + ASSERT(error == KCF_SUCCESS); + + /* allocate and initialize new kcf_prov_mech_desc */ + prov_mech = kmem_zalloc(sizeof (kcf_prov_mech_desc_t), KM_SLEEP); + bcopy(mech_info, &prov_mech->pm_mech_info, sizeof (crypto_mech_info_t)); + prov_mech->pm_prov_desc = prov_desc; + prov_desc->pd_mech_indx[KCF_MECH2CLASS(kcf_mech_type)] + [KCF_MECH2INDEX(kcf_mech_type)] = mech_indx; + + KCF_PROV_REFHOLD(prov_desc); + KCF_PROV_IREFHOLD(prov_desc); + + dual_fg_mask = mech_info->cm_func_group_mask & CRYPTO_FG_DUAL_MASK; + + if (dual_fg_mask == ((crypto_func_group_t)0)) + goto add_entry; + + simple_fg_mask = (mech_info->cm_func_group_mask & + CRYPTO_FG_SIMPLEOP_MASK) | CRYPTO_FG_RANDOM; + + for (i = 0; i < prov_desc->pd_mech_list_count; i++) { + dmi = &prov_desc->pd_mechanisms[i]; + + /* skip self */ + if (dmi->cm_mech_number == mech_info->cm_mech_number) + continue; + + /* skip if not a dual operation mechanism */ + if (!(dmi->cm_func_group_mask & dual_fg_mask) || + (dmi->cm_func_group_mask & simple_fg_mask)) + continue; + + mt = kcf_mech_hash_find(dmi->cm_mech_name); + if (mt == CRYPTO_MECH_INVALID) + continue; + + if (kcf_get_mech_entry(mt, &me) != KCF_SUCCESS) + continue; + + mil = kmem_zalloc(sizeof (*mil), KM_SLEEP); + mil2 = kmem_zalloc(sizeof (*mil2), KM_SLEEP); + + /* + * Ignore hard-coded entries in the mech table + * if the provider hasn't registered. + */ + mutex_enter(&me->me_mutex); + if (me->me_hw_prov_chain == NULL && me->me_sw_prov == NULL) { + mutex_exit(&me->me_mutex); + kmem_free(mil, sizeof (*mil)); + kmem_free(mil2, sizeof (*mil2)); + continue; + } + + /* + * Add other dual mechanisms that have registered + * with the framework to this mechanism's + * cross-reference list. + */ + mil->ml_mech_info = *dmi; /* struct assignment */ + mil->ml_kcf_mechid = mt; + + /* add to head of list */ + mil->ml_next = prov_mech->pm_mi_list; + prov_mech->pm_mi_list = mil; + + if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER) + prov_mech2 = me->me_hw_prov_chain; + else + prov_mech2 = me->me_sw_prov; + + if (prov_mech2 == NULL) { + kmem_free(mil2, sizeof (*mil2)); + mutex_exit(&me->me_mutex); + continue; + } + + /* + * Update all other cross-reference lists by + * adding this new mechanism. + */ + while (prov_mech2 != NULL) { + if (prov_mech2->pm_prov_desc == prov_desc) { + /* struct assignment */ + mil2->ml_mech_info = *mech_info; + mil2->ml_kcf_mechid = kcf_mech_type; + + /* add to head of list */ + mil2->ml_next = prov_mech2->pm_mi_list; + prov_mech2->pm_mi_list = mil2; + break; + } + prov_mech2 = prov_mech2->pm_next; + } + if (prov_mech2 == NULL) + kmem_free(mil2, sizeof (*mil2)); + + mutex_exit(&me->me_mutex); + } + +add_entry: + /* + * Add new kcf_prov_mech_desc at the front of HW providers + * chain. 
+ */ + switch (prov_desc->pd_prov_type) { + + case CRYPTO_HW_PROVIDER: + mutex_enter(&mech_entry->me_mutex); + prov_mech->pm_me = mech_entry; + prov_mech->pm_next = mech_entry->me_hw_prov_chain; + mech_entry->me_hw_prov_chain = prov_mech; + mech_entry->me_num_hwprov++; + mutex_exit(&mech_entry->me_mutex); + break; + + case CRYPTO_SW_PROVIDER: + mutex_enter(&mech_entry->me_mutex); + if (mech_entry->me_sw_prov != NULL) { + /* + * There is already a SW provider for this mechanism. + * Since we allow only one SW provider per mechanism, + * report this condition. + */ + cmn_err(CE_WARN, "The cryptographic software provider " + "\"%s\" will not be used for %s. The provider " + "\"%s\" will be used for this mechanism " + "instead.", prov_desc->pd_description, + mech_info->cm_mech_name, + mech_entry->me_sw_prov->pm_prov_desc-> + pd_description); + KCF_PROV_REFRELE(prov_desc); + kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t)); + prov_mech = NULL; + } else { + /* + * Set the provider as the software provider for + * this mechanism. + */ + mech_entry->me_sw_prov = prov_mech; + + /* We'll wrap around after 4 billion registrations! */ + mech_entry->me_gen_swprov = kcf_gen_swprov++; + } + mutex_exit(&mech_entry->me_mutex); + break; + default: + break; + } + + *pmdpp = prov_mech; + + return (KCF_SUCCESS); +} + +/* + * kcf_remove_mech_provider() + * + * Arguments: + * . mech_name: the name of the mechanism. + * . prov_desc: The provider descriptor + * + * Description: + * Removes a provider from chain of provider descriptors. + * The provider is made unavailable to kernel consumers for the specified + * mechanism. + * + * Context: + * User context only. + */ +void +kcf_remove_mech_provider(char *mech_name, kcf_provider_desc_t *prov_desc) +{ + crypto_mech_type_t mech_type; + kcf_prov_mech_desc_t *prov_mech = NULL, *prov_chain; + kcf_prov_mech_desc_t **prev_entry_next; + kcf_mech_entry_t *mech_entry; + crypto_mech_info_list_t *mil, *mil2, *next, **prev_next; + + ASSERT(prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER); + + /* get the KCF mech type that was assigned to the mechanism */ + if ((mech_type = kcf_mech_hash_find(mech_name)) == + CRYPTO_MECH_INVALID) { + /* + * Provider was not allowed for this mech due to policy or + * configuration. + */ + return; + } + + /* get a ptr to the mech_entry that was created */ + if (kcf_get_mech_entry(mech_type, &mech_entry) != KCF_SUCCESS) { + /* + * Provider was not allowed for this mech due to policy or + * configuration. 
+ */ + return; + } + + mutex_enter(&mech_entry->me_mutex); + + switch (prov_desc->pd_prov_type) { + + case CRYPTO_HW_PROVIDER: + /* find the provider in the mech_entry chain */ + prev_entry_next = &mech_entry->me_hw_prov_chain; + prov_mech = mech_entry->me_hw_prov_chain; + while (prov_mech != NULL && + prov_mech->pm_prov_desc != prov_desc) { + prev_entry_next = &prov_mech->pm_next; + prov_mech = prov_mech->pm_next; + } + + if (prov_mech == NULL) { + /* entry not found, simply return */ + mutex_exit(&mech_entry->me_mutex); + return; + } + + /* remove provider entry from mech_entry chain */ + *prev_entry_next = prov_mech->pm_next; + ASSERT(mech_entry->me_num_hwprov > 0); + mech_entry->me_num_hwprov--; + break; + + case CRYPTO_SW_PROVIDER: + if (mech_entry->me_sw_prov == NULL || + mech_entry->me_sw_prov->pm_prov_desc != prov_desc) { + /* not the software provider for this mechanism */ + mutex_exit(&mech_entry->me_mutex); + return; + } + prov_mech = mech_entry->me_sw_prov; + mech_entry->me_sw_prov = NULL; + break; + default: + /* unexpected crypto_provider_type_t */ + mutex_exit(&mech_entry->me_mutex); + return; + } + + mutex_exit(&mech_entry->me_mutex); + + /* Free the dual ops cross-reference lists */ + mil = prov_mech->pm_mi_list; + while (mil != NULL) { + next = mil->ml_next; + if (kcf_get_mech_entry(mil->ml_kcf_mechid, + &mech_entry) != KCF_SUCCESS) { + mil = next; + continue; + } + + mutex_enter(&mech_entry->me_mutex); + if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER) + prov_chain = mech_entry->me_hw_prov_chain; + else + prov_chain = mech_entry->me_sw_prov; + + while (prov_chain != NULL) { + if (prov_chain->pm_prov_desc == prov_desc) { + prev_next = &prov_chain->pm_mi_list; + mil2 = prov_chain->pm_mi_list; + while (mil2 != NULL && + mil2->ml_kcf_mechid != mech_type) { + prev_next = &mil2->ml_next; + mil2 = mil2->ml_next; + } + if (mil2 != NULL) { + *prev_next = mil2->ml_next; + kmem_free(mil2, sizeof (*mil2)); + } + break; + } + prov_chain = prov_chain->pm_next; + } + + mutex_exit(&mech_entry->me_mutex); + kmem_free(mil, sizeof (crypto_mech_info_list_t)); + mil = next; + } + + /* free entry */ + KCF_PROV_REFRELE(prov_mech->pm_prov_desc); + KCF_PROV_IREFRELE(prov_mech->pm_prov_desc); + kmem_free(prov_mech, sizeof (kcf_prov_mech_desc_t)); +} + +/* + * kcf_get_mech_entry() + * + * Arguments: + * . The framework mechanism type + * . Storage for the mechanism entry + * + * Description: + * Retrieves the mechanism entry for the mech. + * + * Context: + * User and interrupt contexts. + * + * Returns: + * KCF_MECHANISM_XXX appropriate error code. + * KCF_SUCCESS otherwise. + */ +int +kcf_get_mech_entry(crypto_mech_type_t mech_type, kcf_mech_entry_t **mep) +{ + kcf_ops_class_t class; + int index; + kcf_mech_entry_tab_t *me_tab; + + ASSERT(mep != NULL); + + class = KCF_MECH2CLASS(mech_type); + + if ((class < KCF_FIRST_OPSCLASS) || (class > KCF_LAST_OPSCLASS)) { + /* the caller won't need to know it's an invalid class */ + return (KCF_INVALID_MECH_NUMBER); + } + + me_tab = &kcf_mech_tabs_tab[class]; + index = KCF_MECH2INDEX(mech_type); + + if ((index < 0) || (index >= me_tab->met_size)) { + return (KCF_INVALID_MECH_NUMBER); + } + + *mep = &((me_tab->met_tab)[index]); + + return (KCF_SUCCESS); +} + +/* CURRENTLY UNSUPPORTED: attempting to load the module if it isn't found */ +/* + * Lookup the hash table for an entry that matches the mechname. 
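+ * A typical lookup, in sketch form:
+ *
+ *	crypto_mech_type_t mt =
+ *	    crypto_mech2id_common(SUN_CKM_AES_CBC, B_TRUE);
+ *
+ * returns CRYPTO_MECH_INVALID when no such mechanism is known.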
+ * If there are no hardware or software providers for the mechanism,
+ * but there is an unloaded software provider, this routine will attempt
+ * to load it.
+ *
+ * If the MOD_NOAUTOUNLOAD flag is not set, a software provider is
+ * in constant danger of being unloaded. For consumers that call
+ * crypto_mech2id() only once, the provider will not be reloaded
+ * if it becomes unloaded. If a provider gets loaded elsewhere
+ * without the MOD_NOAUTOUNLOAD flag being set, we set it now.
+ */
+crypto_mech_type_t
+crypto_mech2id_common(char *mechname, boolean_t load_module)
+{
+	crypto_mech_type_t mt = kcf_mech_hash_find(mechname);
+	return (mt);
+}
diff --git a/module/icp/core/kcf_prov_lib.c b/module/icp/core/kcf_prov_lib.c
new file mode 100644
index 000000000000..905ef6657336
--- /dev/null
+++ b/module/icp/core/kcf_prov_lib.c
@@ -0,0 +1,227 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include <sys/zfs_context.h>
+#include <modes/modes.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+
+/*
+ * Utility routine to copy a buffer to a crypto_data structure.
+ */
+
+/*
+ * Utility routine to apply the command, 'cmd', to the
+ * data in the uio structure.
+ */
+int
+crypto_uio_data(crypto_data_t *data, uchar_t *buf, int len, cmd_type_t cmd,
+    void *digest_ctx, void (*update)(void))
+{
+	uio_t *uiop = data->cd_uio;
+	off_t offset = data->cd_offset;
+	size_t length = len;
+	uint_t vec_idx;
+	size_t cur_len;
+	uchar_t *datap;
+
+	ASSERT(data->cd_format == CRYPTO_DATA_UIO);
+	if (uio_segflg(uiop) != UIO_SYSSPACE) {
+		return (CRYPTO_ARGUMENTS_BAD);
+	}
+
+	/*
+	 * Jump to the first iovec containing data to be
+	 * processed.
+	 */
+	offset = uio_index_at_offset(uiop, offset, &vec_idx);
+
+	if (vec_idx == uio_iovcnt(uiop) && length > 0) {
+		/*
+		 * The caller specified an offset that is larger than
+		 * the total size of the buffers it provided.
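+		 * For example, with two iovecs of 16 and 32 bytes, an
+		 * offset of 40 resolves to vec_idx 1 at a residual offset
+		 * of 24, while an offset of 48 with a nonzero length lands
+		 * past the last iovec and is rejected here.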
+ */ + return (CRYPTO_DATA_LEN_RANGE); + } + + while (vec_idx < uio_iovcnt(uiop) && length > 0) { + cur_len = MIN(uio_iovlen(uiop, vec_idx) - + offset, length); + + datap = (uchar_t *)(uio_iovbase(uiop, vec_idx) + offset); + switch (cmd) { + case COPY_FROM_DATA: + bcopy(datap, buf, cur_len); + buf += cur_len; + break; + case COPY_TO_DATA: + bcopy(buf, datap, cur_len); + buf += cur_len; + break; + case COMPARE_TO_DATA: + if (bcmp(datap, buf, cur_len)) + return (CRYPTO_SIGNATURE_INVALID); + buf += cur_len; + break; + case MD5_DIGEST_DATA: + case SHA1_DIGEST_DATA: + case SHA2_DIGEST_DATA: + case GHASH_DATA: + return (CRYPTO_ARGUMENTS_BAD); + } + + length -= cur_len; + vec_idx++; + offset = 0; + } + + if (vec_idx == uio_iovcnt(uiop) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed. + */ + switch (cmd) { + case COPY_TO_DATA: + data->cd_length = len; + return (CRYPTO_BUFFER_TOO_SMALL); + default: + return (CRYPTO_DATA_LEN_RANGE); + } + } + + return (CRYPTO_SUCCESS); +} + +int +crypto_put_output_data(uchar_t *buf, crypto_data_t *output, int len) +{ + switch (output->cd_format) { + case CRYPTO_DATA_RAW: + if (output->cd_raw.iov_len < len) { + output->cd_length = len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + bcopy(buf, (uchar_t *)(output->cd_raw.iov_base + + output->cd_offset), len); + break; + + case CRYPTO_DATA_UIO: + return (crypto_uio_data(output, buf, len, + COPY_TO_DATA, NULL, NULL)); + default: + return (CRYPTO_ARGUMENTS_BAD); + } + + return (CRYPTO_SUCCESS); +} + +int +crypto_update_iov(void *ctx, crypto_data_t *input, crypto_data_t *output, + int (*cipher)(void *, caddr_t, size_t, crypto_data_t *), + void (*copy_block)(uint8_t *, uint64_t *)) +{ + common_ctx_t *common_ctx = ctx; + int rv; + + ASSERT(input != output); + if (input->cd_miscdata != NULL) { + copy_block((uint8_t *)input->cd_miscdata, + &common_ctx->cc_iv[0]); + } + + if (input->cd_raw.iov_len < input->cd_length) + return (CRYPTO_ARGUMENTS_BAD); + + rv = (cipher)(ctx, input->cd_raw.iov_base + input->cd_offset, + input->cd_length, output); + + return (rv); +} + +int +crypto_update_uio(void *ctx, crypto_data_t *input, crypto_data_t *output, + int (*cipher)(void *, caddr_t, size_t, crypto_data_t *), + void (*copy_block)(uint8_t *, uint64_t *)) +{ + common_ctx_t *common_ctx = ctx; + uio_t *uiop = input->cd_uio; + off_t offset = input->cd_offset; + size_t length = input->cd_length; + uint_t vec_idx; + size_t cur_len; + + ASSERT(input != output); + if (input->cd_miscdata != NULL) { + copy_block((uint8_t *)input->cd_miscdata, + &common_ctx->cc_iv[0]); + } + + if (uio_segflg(input->cd_uio) != UIO_SYSSPACE) { + return (CRYPTO_ARGUMENTS_BAD); + } + + /* + * Jump to the first iovec containing data to be + * processed. + */ + offset = uio_index_at_offset(uiop, offset, &vec_idx); + if (vec_idx == uio_iovcnt(uiop) && length > 0) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + /* + * Now process the iovecs. 
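+	 * Each contiguous chunk is handed to 'cipher' separately, so any
+	 * chaining state (for example the IV copied into
+	 * common_ctx->cc_iv above) must live in 'ctx' to carry across the
+	 * per-iovec calls.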
+	 */
+	while (vec_idx < uio_iovcnt(uiop) && length > 0) {
+		cur_len = MIN(uio_iovlen(uiop, vec_idx) -
+		    offset, length);
+
+		int rv = (cipher)(ctx, uio_iovbase(uiop, vec_idx) + offset,
+		    cur_len, output);
+
+		if (rv != CRYPTO_SUCCESS) {
+			return (rv);
+		}
+		length -= cur_len;
+		vec_idx++;
+		offset = 0;
+	}
+
+	if (vec_idx == uio_iovcnt(uiop) && length > 0) {
+		/*
+		 * The end of the specified iovecs was reached but
+		 * the length requested could not be processed, i.e.,
+		 * the caller requested to digest more data than it provided.
+		 */
+
+		return (CRYPTO_DATA_LEN_RANGE);
+	}
+
+	return (CRYPTO_SUCCESS);
+}
diff --git a/module/icp/core/kcf_prov_tabs.c b/module/icp/core/kcf_prov_tabs.c
new file mode 100644
index 000000000000..94e6937bcd76
--- /dev/null
+++ b/module/icp/core/kcf_prov_tabs.c
@@ -0,0 +1,645 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file is part of the core Kernel Cryptographic Framework.
+ * It implements the management of tables of Providers. Entries are
+ * added and removed when cryptographic providers register with
+ * and unregister from the framework, respectively. The KCF scheduler
+ * and ioctl pseudo driver call these functions to obtain the list
+ * of available providers.
+ *
+ * The provider table is indexed by crypto_provider_id_t. Each
+ * element of the table contains a pointer to a provider descriptor,
+ * or NULL if the entry is free.
+ *
+ * This file also implements helper functions to allocate and free
+ * provider descriptors.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/spi.h>
+
+#define	KCF_MAX_PROVIDERS	512	/* max number of providers */
+
+/*
+ * Prov_tab is an array of providers which is updated when
+ * a crypto provider registers with kcf. The provider calls the
+ * SPI routine, crypto_register_provider(), which in turn calls
+ * kcf_prov_tab_add_provider().
+ *
+ * A provider unregisters by calling crypto_unregister_provider()
+ * which triggers the removal of the prov_tab entry.
+ * It also calls kcf_remove_mech_provider().
+ *
+ * prov_tab entries are not updated from kcf.conf or by cryptoadm(1M).
+ */
+static kcf_provider_desc_t **prov_tab = NULL;
+static kmutex_t prov_tab_mutex; /* ensure exclusive access to the table */
+static uint_t prov_tab_num = 0; /* number of providers in table */
+static uint_t prov_tab_max = KCF_MAX_PROVIDERS;
+
+void
+kcf_prov_tab_destroy(void)
+{
+	mutex_destroy(&prov_tab_mutex);
+
+	if (prov_tab)
+		kmem_free(prov_tab, prov_tab_max *
+		    sizeof (kcf_provider_desc_t *));
+}
+
+/*
+ * Initialize a mutex and the KCF providers table, prov_tab.
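+ * (Slot 0 of the table is never handed out: kcf_prov_tab_add_provider()
+ * searches for a free entry starting at index 1, so valid provider ids
+ * are always nonzero.)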
+ * The providers table is dynamically allocated with prov_tab_max entries. + * Called from kcf module _init(). + */ +void +kcf_prov_tab_init(void) +{ + mutex_init(&prov_tab_mutex, NULL, MUTEX_DEFAULT, NULL); + + prov_tab = kmem_zalloc(prov_tab_max * sizeof (kcf_provider_desc_t *), + KM_SLEEP); +} + +/* + * Add a provider to the provider table. If no free entry can be found + * for the new provider, returns CRYPTO_HOST_MEMORY. Otherwise, add + * the provider to the table, initialize the pd_prov_id field + * of the specified provider descriptor to the index in that table, + * and return CRYPTO_SUCCESS. Note that a REFHOLD is done on the + * provider when pointed to by a table entry. + */ +int +kcf_prov_tab_add_provider(kcf_provider_desc_t *prov_desc) +{ + uint_t i; + + ASSERT(prov_tab != NULL); + + mutex_enter(&prov_tab_mutex); + + /* find free slot in providers table */ + for (i = 1; i < KCF_MAX_PROVIDERS && prov_tab[i] != NULL; i++) + ; + if (i == KCF_MAX_PROVIDERS) { + /* ran out of providers entries */ + mutex_exit(&prov_tab_mutex); + cmn_err(CE_WARN, "out of providers entries"); + return (CRYPTO_HOST_MEMORY); + } + + /* initialize entry */ + prov_tab[i] = prov_desc; + KCF_PROV_REFHOLD(prov_desc); + KCF_PROV_IREFHOLD(prov_desc); + prov_tab_num++; + + mutex_exit(&prov_tab_mutex); + + /* update provider descriptor */ + prov_desc->pd_prov_id = i; + + /* + * The KCF-private provider handle is defined as the internal + * provider id. + */ + prov_desc->pd_kcf_prov_handle = + (crypto_kcf_provider_handle_t)prov_desc->pd_prov_id; + + return (CRYPTO_SUCCESS); +} + +/* + * Remove the provider specified by its id. A REFRELE is done on the + * corresponding provider descriptor before this function returns. + * Returns CRYPTO_UNKNOWN_PROVIDER if the provider id is not valid. + */ +int +kcf_prov_tab_rem_provider(crypto_provider_id_t prov_id) +{ + kcf_provider_desc_t *prov_desc; + + ASSERT(prov_tab != NULL); + ASSERT(prov_tab_num >= 0); + + /* + * Validate provider id, since it can be specified by a 3rd-party + * provider. + */ + + mutex_enter(&prov_tab_mutex); + if (prov_id >= KCF_MAX_PROVIDERS || + ((prov_desc = prov_tab[prov_id]) == NULL)) { + mutex_exit(&prov_tab_mutex); + return (CRYPTO_INVALID_PROVIDER_ID); + } + mutex_exit(&prov_tab_mutex); + + /* + * The provider id must remain valid until the associated provider + * descriptor is freed. For this reason, we simply release our + * reference to the descriptor here. When the reference count + * reaches zero, kcf_free_provider_desc() will be invoked and + * the associated entry in the providers table will be released + * at that time. + */ + + KCF_PROV_REFRELE(prov_desc); + KCF_PROV_IREFRELE(prov_desc); + + return (CRYPTO_SUCCESS); +} + +/* + * Returns the provider descriptor corresponding to the specified + * provider id. A REFHOLD is done on the descriptor before it is + * returned to the caller. It is the responsibility of the caller + * to do a REFRELE once it is done with the provider descriptor. 
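+ *
+ * A typical caller does, in sketch form:
+ *
+ *	kcf_provider_desc_t *pd = kcf_prov_tab_lookup(id);
+ *	if (pd != NULL) {
+ *		... use the descriptor ...
+ *		KCF_PROV_REFRELE(pd);
+ *	}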
+ */ +kcf_provider_desc_t * +kcf_prov_tab_lookup(crypto_provider_id_t prov_id) +{ + kcf_provider_desc_t *prov_desc; + + mutex_enter(&prov_tab_mutex); + + prov_desc = prov_tab[prov_id]; + + if (prov_desc == NULL) { + mutex_exit(&prov_tab_mutex); + return (NULL); + } + + KCF_PROV_REFHOLD(prov_desc); + + mutex_exit(&prov_tab_mutex); + + return (prov_desc); +} + +static void +allocate_ops_v1(crypto_ops_t *src, crypto_ops_t *dst, uint_t *mech_list_count) +{ + if (src->co_control_ops != NULL) + dst->co_control_ops = kmem_alloc(sizeof (crypto_control_ops_t), + KM_SLEEP); + + if (src->co_digest_ops != NULL) + dst->co_digest_ops = kmem_alloc(sizeof (crypto_digest_ops_t), + KM_SLEEP); + + if (src->co_cipher_ops != NULL) + dst->co_cipher_ops = kmem_alloc(sizeof (crypto_cipher_ops_t), + KM_SLEEP); + + if (src->co_mac_ops != NULL) + dst->co_mac_ops = kmem_alloc(sizeof (crypto_mac_ops_t), + KM_SLEEP); + + if (src->co_sign_ops != NULL) + dst->co_sign_ops = kmem_alloc(sizeof (crypto_sign_ops_t), + KM_SLEEP); + + if (src->co_verify_ops != NULL) + dst->co_verify_ops = kmem_alloc(sizeof (crypto_verify_ops_t), + KM_SLEEP); + + if (src->co_dual_ops != NULL) + dst->co_dual_ops = kmem_alloc(sizeof (crypto_dual_ops_t), + KM_SLEEP); + + if (src->co_dual_cipher_mac_ops != NULL) + dst->co_dual_cipher_mac_ops = kmem_alloc( + sizeof (crypto_dual_cipher_mac_ops_t), KM_SLEEP); + + if (src->co_random_ops != NULL) { + dst->co_random_ops = kmem_alloc( + sizeof (crypto_random_number_ops_t), KM_SLEEP); + + /* + * Allocate storage to store the array of supported mechanisms + * specified by provider. We allocate extra mechanism storage + * if the provider has random_ops since we keep an internal + * mechanism, SUN_RANDOM, in this case. + */ + (*mech_list_count)++; + } + + if (src->co_session_ops != NULL) + dst->co_session_ops = kmem_alloc(sizeof (crypto_session_ops_t), + KM_SLEEP); + + if (src->co_object_ops != NULL) + dst->co_object_ops = kmem_alloc(sizeof (crypto_object_ops_t), + KM_SLEEP); + + if (src->co_key_ops != NULL) + dst->co_key_ops = kmem_alloc(sizeof (crypto_key_ops_t), + KM_SLEEP); + + if (src->co_provider_ops != NULL) + dst->co_provider_ops = kmem_alloc( + sizeof (crypto_provider_management_ops_t), KM_SLEEP); + + if (src->co_ctx_ops != NULL) + dst->co_ctx_ops = kmem_alloc(sizeof (crypto_ctx_ops_t), + KM_SLEEP); +} + +static void +allocate_ops_v2(crypto_ops_t *src, crypto_ops_t *dst) +{ + if (src->co_mech_ops != NULL) + dst->co_mech_ops = kmem_alloc(sizeof (crypto_mech_ops_t), + KM_SLEEP); +} + +static void +allocate_ops_v3(crypto_ops_t *src, crypto_ops_t *dst) +{ + if (src->co_nostore_key_ops != NULL) + dst->co_nostore_key_ops = + kmem_alloc(sizeof (crypto_nostore_key_ops_t), KM_SLEEP); +} + +/* + * Allocate a provider descriptor. mech_list_count specifies the + * number of mechanisms supported by the providers, and is used + * to allocate storage for the mechanism table. + * This function may sleep while allocating memory, which is OK + * since it is invoked from user context during provider registration. + */ +kcf_provider_desc_t * +kcf_alloc_provider_desc(crypto_provider_info_t *info) +{ + int i, j; + kcf_provider_desc_t *desc; + uint_t mech_list_count = info->pi_mech_list_count; + crypto_ops_t *src_ops = info->pi_ops_vector; + + desc = kmem_zalloc(sizeof (kcf_provider_desc_t), KM_SLEEP); + + /* + * pd_description serves two purposes + * - Appears as a blank padded PKCS#11 style string, that will be + * returned to applications in CK_SLOT_INFO.slotDescription. 
+ * This means that we should not have a null character in the
+ * first CRYPTO_PROVIDER_DESCR_MAX_LEN bytes.
+ * - Appears as a null-terminated string that can be used by
+ * other kcf routines.
+ *
+ * So, we allocate enough room for one extra null terminator
+ * which keeps everyone happy.
+ */
+ desc->pd_description = kmem_alloc(CRYPTO_PROVIDER_DESCR_MAX_LEN + 1,
+ KM_SLEEP);
+ (void) memset(desc->pd_description, ' ',
+ CRYPTO_PROVIDER_DESCR_MAX_LEN);
+ desc->pd_description[CRYPTO_PROVIDER_DESCR_MAX_LEN] = '\0';
+
+ /*
+ * Since the framework does not require the ops vector specified
+ * by the providers during registration to be persistent,
+ * KCF needs to allocate storage into which copies of the ops
+ * vectors are made.
+ */
+ desc->pd_ops_vector = kmem_zalloc(sizeof (crypto_ops_t), KM_SLEEP);
+
+ if (info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) {
+ allocate_ops_v1(src_ops, desc->pd_ops_vector, &mech_list_count);
+ if (info->pi_interface_version >= CRYPTO_SPI_VERSION_2)
+ allocate_ops_v2(src_ops, desc->pd_ops_vector);
+ if (info->pi_interface_version == CRYPTO_SPI_VERSION_3)
+ allocate_ops_v3(src_ops, desc->pd_ops_vector);
+ }
+
+ desc->pd_mech_list_count = mech_list_count;
+ desc->pd_mechanisms = kmem_zalloc(sizeof (crypto_mech_info_t) *
+ mech_list_count, KM_SLEEP);
+ for (i = 0; i < KCF_OPS_CLASSSIZE; i++)
+ for (j = 0; j < KCF_MAXMECHTAB; j++)
+ desc->pd_mech_indx[i][j] = KCF_INVALID_INDX;
+
+ desc->pd_prov_id = KCF_PROVID_INVALID;
+ desc->pd_state = KCF_PROV_ALLOCATED;
+
+ mutex_init(&desc->pd_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&desc->pd_resume_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&desc->pd_remove_cv, NULL, CV_DEFAULT, NULL);
+
+ return (desc);
+}
+
+/*
+ * Called by KCF_PROV_REFRELE when a provider's reference count drops
+ * to zero. We free the descriptor when the last reference is released.
+ * However, for software providers, we do not free it when there is an
+ * unregister thread waiting. We signal that thread in this case and
+ * that thread is responsible for freeing the descriptor.
+ */
+void
+kcf_provider_zero_refcnt(kcf_provider_desc_t *desc)
+{
+ mutex_enter(&desc->pd_lock);
+ switch (desc->pd_prov_type) {
+ case CRYPTO_SW_PROVIDER:
+ if (desc->pd_state == KCF_PROV_REMOVED ||
+ desc->pd_state == KCF_PROV_DISABLED) {
+ desc->pd_state = KCF_PROV_FREED;
+ cv_broadcast(&desc->pd_remove_cv);
+ mutex_exit(&desc->pd_lock);
+ break;
+ }
+ /* FALLTHRU */
+
+ case CRYPTO_HW_PROVIDER:
+ case CRYPTO_LOGICAL_PROVIDER:
+ mutex_exit(&desc->pd_lock);
+ kcf_free_provider_desc(desc);
+ }
+}
+
+/*
+ * Free a provider descriptor.
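The pd_description double format above (blank-padded PKCS#11 string and C string in one buffer) can be shown compactly. A minimal userspace sketch, where DESCR_MAX is a made-up stand-in for CRYPTO_PROVIDER_DESCR_MAX_LEN:

    #include <string.h>
    #include <stdlib.h>

    #define DESCR_MAX 64    /* stand-in for CRYPTO_PROVIDER_DESCR_MAX_LEN */

    /*
     * The first DESCR_MAX bytes are blank-padded with no embedded NUL
     * (valid as a PKCS#11 slotDescription); the byte one past that
     * area holds a NUL, so the same pointer also works as a C string.
     */
    char *
    make_descr(const char *src)
    {
            char *d = malloc(DESCR_MAX + 1);

            if (d == NULL)
                    return (NULL);
            memset(d, ' ', DESCR_MAX);
            d[DESCR_MAX] = '\0';
            memcpy(d, src, strnlen(src, DESCR_MAX));
            return (d);
    }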
+ */ +void +kcf_free_provider_desc(kcf_provider_desc_t *desc) +{ + if (desc == NULL) + return; + + mutex_enter(&prov_tab_mutex); + if (desc->pd_prov_id != KCF_PROVID_INVALID) { + /* release the associated providers table entry */ + ASSERT(prov_tab[desc->pd_prov_id] != NULL); + prov_tab[desc->pd_prov_id] = NULL; + prov_tab_num--; + } + mutex_exit(&prov_tab_mutex); + + /* free the kernel memory associated with the provider descriptor */ + + if (desc->pd_description != NULL) + kmem_free(desc->pd_description, + CRYPTO_PROVIDER_DESCR_MAX_LEN + 1); + + if (desc->pd_ops_vector != NULL) { + + if (desc->pd_ops_vector->co_control_ops != NULL) + kmem_free(desc->pd_ops_vector->co_control_ops, + sizeof (crypto_control_ops_t)); + + if (desc->pd_ops_vector->co_digest_ops != NULL) + kmem_free(desc->pd_ops_vector->co_digest_ops, + sizeof (crypto_digest_ops_t)); + + if (desc->pd_ops_vector->co_cipher_ops != NULL) + kmem_free(desc->pd_ops_vector->co_cipher_ops, + sizeof (crypto_cipher_ops_t)); + + if (desc->pd_ops_vector->co_mac_ops != NULL) + kmem_free(desc->pd_ops_vector->co_mac_ops, + sizeof (crypto_mac_ops_t)); + + if (desc->pd_ops_vector->co_sign_ops != NULL) + kmem_free(desc->pd_ops_vector->co_sign_ops, + sizeof (crypto_sign_ops_t)); + + if (desc->pd_ops_vector->co_verify_ops != NULL) + kmem_free(desc->pd_ops_vector->co_verify_ops, + sizeof (crypto_verify_ops_t)); + + if (desc->pd_ops_vector->co_dual_ops != NULL) + kmem_free(desc->pd_ops_vector->co_dual_ops, + sizeof (crypto_dual_ops_t)); + + if (desc->pd_ops_vector->co_dual_cipher_mac_ops != NULL) + kmem_free(desc->pd_ops_vector->co_dual_cipher_mac_ops, + sizeof (crypto_dual_cipher_mac_ops_t)); + + if (desc->pd_ops_vector->co_random_ops != NULL) + kmem_free(desc->pd_ops_vector->co_random_ops, + sizeof (crypto_random_number_ops_t)); + + if (desc->pd_ops_vector->co_session_ops != NULL) + kmem_free(desc->pd_ops_vector->co_session_ops, + sizeof (crypto_session_ops_t)); + + if (desc->pd_ops_vector->co_object_ops != NULL) + kmem_free(desc->pd_ops_vector->co_object_ops, + sizeof (crypto_object_ops_t)); + + if (desc->pd_ops_vector->co_key_ops != NULL) + kmem_free(desc->pd_ops_vector->co_key_ops, + sizeof (crypto_key_ops_t)); + + if (desc->pd_ops_vector->co_provider_ops != NULL) + kmem_free(desc->pd_ops_vector->co_provider_ops, + sizeof (crypto_provider_management_ops_t)); + + if (desc->pd_ops_vector->co_ctx_ops != NULL) + kmem_free(desc->pd_ops_vector->co_ctx_ops, + sizeof (crypto_ctx_ops_t)); + + if (desc->pd_ops_vector->co_mech_ops != NULL) + kmem_free(desc->pd_ops_vector->co_mech_ops, + sizeof (crypto_mech_ops_t)); + + if (desc->pd_ops_vector->co_nostore_key_ops != NULL) + kmem_free(desc->pd_ops_vector->co_nostore_key_ops, + sizeof (crypto_nostore_key_ops_t)); + + kmem_free(desc->pd_ops_vector, sizeof (crypto_ops_t)); + } + + if (desc->pd_mechanisms != NULL) + /* free the memory associated with the mechanism info's */ + kmem_free(desc->pd_mechanisms, sizeof (crypto_mech_info_t) * + desc->pd_mech_list_count); + + if (desc->pd_sched_info.ks_taskq != NULL) + taskq_destroy(desc->pd_sched_info.ks_taskq); + + mutex_destroy(&desc->pd_lock); + cv_destroy(&desc->pd_resume_cv); + cv_destroy(&desc->pd_remove_cv); + + kmem_free(desc, sizeof (kcf_provider_desc_t)); +} + +/* + * Returns an array of hardware and logical provider descriptors, + * a.k.a the PKCS#11 slot list. A REFHOLD is done on each descriptor + * before the array is returned. The entire table can be freed by + * calling kcf_free_provider_tab(). 
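The lifetime rule used throughout this file -- the table entry, lookups, and in-flight requests each hold a reference, and the descriptor is freed only once the count drops to zero -- reduces to the usual refcount idiom. A minimal sketch with an invented ref_obj_t, using C11 atomics rather than the kernel's REFHOLD/REFRELE macros:

    #include <stdatomic.h>
    #include <stdlib.h>

    typedef struct ref_obj {
            atomic_int ro_refcnt;   /* starts at 1 for the creator */
    } ref_obj_t;

    static void
    ref_hold(ref_obj_t *rp)
    {
            atomic_fetch_add(&rp->ro_refcnt, 1);
    }

    static void
    ref_rele(ref_obj_t *rp)
    {
            /* The thread that drops the last reference frees the object. */
            if (atomic_fetch_sub(&rp->ro_refcnt, 1) == 1)
                    free(rp);
    }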
+ */ +int +kcf_get_slot_list(uint_t *count, kcf_provider_desc_t ***array, + boolean_t unverified) +{ + kcf_provider_desc_t *prov_desc; + kcf_provider_desc_t **p = NULL; + char *last; + uint_t cnt = 0; + uint_t i, j; + int rval = CRYPTO_SUCCESS; + size_t n, final_size; + + /* count the providers */ + mutex_enter(&prov_tab_mutex); + for (i = 0; i < KCF_MAX_PROVIDERS; i++) { + if ((prov_desc = prov_tab[i]) != NULL && + ((prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER && + (prov_desc->pd_flags & CRYPTO_HIDE_PROVIDER) == 0) || + prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)) { + if (KCF_IS_PROV_USABLE(prov_desc) || + (unverified && KCF_IS_PROV_UNVERIFIED(prov_desc))) { + cnt++; + } + } + } + mutex_exit(&prov_tab_mutex); + + if (cnt == 0) + goto out; + + n = cnt * sizeof (kcf_provider_desc_t *); +again: + p = kmem_zalloc(n, KM_SLEEP); + + /* pointer to last entry in the array */ + last = (char *)&p[cnt-1]; + + mutex_enter(&prov_tab_mutex); + /* fill the slot list */ + for (i = 0, j = 0; i < KCF_MAX_PROVIDERS; i++) { + if ((prov_desc = prov_tab[i]) != NULL && + ((prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER && + (prov_desc->pd_flags & CRYPTO_HIDE_PROVIDER) == 0) || + prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER)) { + if (KCF_IS_PROV_USABLE(prov_desc) || + (unverified && KCF_IS_PROV_UNVERIFIED(prov_desc))) { + if ((char *)&p[j] > last) { + mutex_exit(&prov_tab_mutex); + kcf_free_provider_tab(cnt, p); + n = n << 1; + cnt = cnt << 1; + goto again; + } + p[j++] = prov_desc; + KCF_PROV_REFHOLD(prov_desc); + } + } + } + mutex_exit(&prov_tab_mutex); + + final_size = j * sizeof (kcf_provider_desc_t *); + cnt = j; + ASSERT(final_size <= n); + + /* check if buffer we allocated is too large */ + if (final_size < n) { + char *final_buffer = NULL; + + if (final_size > 0) { + final_buffer = kmem_alloc(final_size, KM_SLEEP); + bcopy(p, final_buffer, final_size); + } + kmem_free(p, n); + p = (kcf_provider_desc_t **)final_buffer; + } +out: + *count = cnt; + *array = p; + return (rval); +} + +/* + * Free an array of hardware provider descriptors. A REFRELE + * is done on each descriptor before the table is freed. + */ +void +kcf_free_provider_tab(uint_t count, kcf_provider_desc_t **array) +{ + kcf_provider_desc_t *prov_desc; + int i; + + for (i = 0; i < count; i++) { + if ((prov_desc = array[i]) != NULL) { + KCF_PROV_REFRELE(prov_desc); + } + } + kmem_free(array, count * sizeof (kcf_provider_desc_t *)); +} + +/* + * Returns in the location pointed to by pd a pointer to the descriptor + * for the software provider for the specified mechanism. + * The provider descriptor is returned held and it is the caller's + * responsibility to release it when done. The mechanism entry + * is returned if the optional argument mep is non NULL. + * + * Returns one of the CRYPTO_ * error codes on failure, and + * CRYPTO_SUCCESS on success. + */ +int +kcf_get_sw_prov(crypto_mech_type_t mech_type, kcf_provider_desc_t **pd, + kcf_mech_entry_t **mep, boolean_t log_warn) +{ + kcf_mech_entry_t *me; + + /* get the mechanism entry for this mechanism */ + if (kcf_get_mech_entry(mech_type, &me) != KCF_SUCCESS) + return (CRYPTO_MECHANISM_INVALID); + + /* + * Get the software provider for this mechanism. + * Lock the mech_entry until we grab the 'pd'. 
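The count-then-fill loop in kcf_get_slot_list() above drops the table lock between the counting pass and the filling pass, so it must cope with entries that appear in between; it does so by doubling the buffer and retrying. The shape of that loop, as a self-contained userspace sketch over a plain int table (locking and allocation-failure handling elided):

    #include <stdlib.h>

    #define TAB_SZ 128
    static int tab[TAB_SZ];         /* 0 means "empty slot" */

    static size_t
    snapshot(int **out)
    {
            size_t cnt = 0, n, i, j;
            int *p;

            for (i = 0; i < TAB_SZ; i++)    /* pass 1: count */
                    if (tab[i] != 0)
                            cnt++;
            if (cnt == 0) {
                    *out = NULL;
                    return (0);
            }
            n = cnt;
    again:
            p = calloc(n, sizeof (int));
            for (i = 0, j = 0; i < TAB_SZ; i++) {
                    if (tab[i] == 0)
                            continue;
                    if (j == n) {           /* raced: table grew */
                            free(p);
                            n *= 2;
                            goto again;
                    }
                    p[j++] = tab[i];
            }
            *out = p;                       /* j <= n entries used */
            return (j);
    }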
+ */
+ mutex_enter(&me->me_mutex);
+
+ if (me->me_sw_prov == NULL ||
+ (*pd = me->me_sw_prov->pm_prov_desc) == NULL) {
+ /* no SW provider for this mechanism */
+ if (log_warn)
+ cmn_err(CE_WARN, "no SW provider for \"%s\"\n",
+ me->me_name);
+ mutex_exit(&me->me_mutex);
+ return (CRYPTO_MECH_NOT_SUPPORTED);
+ }
+
+ KCF_PROV_REFHOLD(*pd);
+ mutex_exit(&me->me_mutex);
+
+ if (mep != NULL)
+ *mep = me;
+
+ return (CRYPTO_SUCCESS);
+}
diff --git a/module/icp/core/kcf_sched.c b/module/icp/core/kcf_sched.c
new file mode 100644
index 000000000000..40d50553d67e
--- /dev/null
+++ b/module/icp/core/kcf_sched.c
@@ -0,0 +1,1782 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file contains the core framework routines for the
+ * kernel cryptographic framework. These routines sit at the
+ * layer between the kernel API/ioctls and the SPI.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/crypto/api.h>
+
+kcf_global_swq_t *gswq; /* Global software queue */
+
+/* Thread pool related variables */
+static kcf_pool_t *kcfpool; /* Thread pool of kcfd LWPs */
+int kcf_maxthreads = 2;
+int kcf_minthreads = 1;
+int kcf_thr_multiple = 2; /* Boot-time tunable for experimentation */
+static ulong_t kcf_idlethr_timeout;
+#define KCF_DEFAULT_THRTIMEOUT 60000000 /* 60 seconds */
+
+/* kmem caches used by the scheduler */
+static kmem_cache_t *kcf_sreq_cache;
+static kmem_cache_t *kcf_areq_cache;
+static kmem_cache_t *kcf_context_cache;
+
+/* Global request ID table */
+static kcf_reqid_table_t *kcf_reqid_table[REQID_TABLES];
+
+/* KCF stats. Not protected. */
+static kcf_stats_t kcf_ksdata = {
+ { "total threads in pool", KSTAT_DATA_UINT32},
+ { "idle threads in pool", KSTAT_DATA_UINT32},
+ { "min threads in pool", KSTAT_DATA_UINT32},
+ { "max threads in pool", KSTAT_DATA_UINT32},
+ { "requests in gswq", KSTAT_DATA_UINT32},
+ { "max requests in gswq", KSTAT_DATA_UINT32},
+ { "threads for HW taskq", KSTAT_DATA_UINT32},
+ { "minalloc for HW taskq", KSTAT_DATA_UINT32},
+ { "maxalloc for HW taskq", KSTAT_DATA_UINT32}
+};
+
+static kstat_t *kcf_misc_kstat = NULL;
+ulong_t kcf_swprov_hndl = 0;
+
+static kcf_areq_node_t *kcf_areqnode_alloc(kcf_provider_desc_t *,
+ kcf_context_t *, crypto_call_req_t *, kcf_req_params_t *, boolean_t);
+static int kcf_disp_sw_request(kcf_areq_node_t *);
+static void process_req_hwp(void *);
+static int kcf_enqueue(kcf_areq_node_t *);
+static void kcfpool_alloc(void);
+static void kcf_reqid_delete(kcf_areq_node_t *areq);
+static crypto_req_id_t kcf_reqid_insert(kcf_areq_node_t *areq);
+static int kcf_misc_kstat_update(kstat_t *ksp, int rw);
+
+/*
+ * Create a new context.
+ */ +crypto_ctx_t * +kcf_new_ctx(crypto_call_req_t *crq, kcf_provider_desc_t *pd, + crypto_session_id_t sid) +{ + crypto_ctx_t *ctx; + kcf_context_t *kcf_ctx; + + kcf_ctx = kmem_cache_alloc(kcf_context_cache, + (crq == NULL) ? KM_SLEEP : KM_NOSLEEP); + if (kcf_ctx == NULL) + return (NULL); + + /* initialize the context for the consumer */ + kcf_ctx->kc_refcnt = 1; + kcf_ctx->kc_req_chain_first = NULL; + kcf_ctx->kc_req_chain_last = NULL; + kcf_ctx->kc_secondctx = NULL; + KCF_PROV_REFHOLD(pd); + kcf_ctx->kc_prov_desc = pd; + kcf_ctx->kc_sw_prov_desc = NULL; + kcf_ctx->kc_mech = NULL; + + ctx = &kcf_ctx->kc_glbl_ctx; + ctx->cc_provider = pd->pd_prov_handle; + ctx->cc_session = sid; + ctx->cc_provider_private = NULL; + ctx->cc_framework_private = (void *)kcf_ctx; + ctx->cc_flags = 0; + ctx->cc_opstate = NULL; + + return (ctx); +} + +/* + * Allocate a new async request node. + * + * ictx - Framework private context pointer + * crq - Has callback function and argument. Should be non NULL. + * req - The parameters to pass to the SPI + */ +static kcf_areq_node_t * +kcf_areqnode_alloc(kcf_provider_desc_t *pd, kcf_context_t *ictx, + crypto_call_req_t *crq, kcf_req_params_t *req, boolean_t isdual) +{ + kcf_areq_node_t *arptr, *areq; + + ASSERT(crq != NULL); + arptr = kmem_cache_alloc(kcf_areq_cache, KM_NOSLEEP); + if (arptr == NULL) + return (NULL); + + arptr->an_state = REQ_ALLOCATED; + arptr->an_reqarg = *crq; + arptr->an_params = *req; + arptr->an_context = ictx; + arptr->an_isdual = isdual; + + arptr->an_next = arptr->an_prev = NULL; + KCF_PROV_REFHOLD(pd); + arptr->an_provider = pd; + arptr->an_tried_plist = NULL; + arptr->an_refcnt = 1; + arptr->an_idnext = arptr->an_idprev = NULL; + + /* + * Requests for context-less operations do not use the + * fields - an_is_my_turn, and an_ctxchain_next. + */ + if (ictx == NULL) + return (arptr); + + KCF_CONTEXT_REFHOLD(ictx); + /* + * Chain this request to the context. + */ + mutex_enter(&ictx->kc_in_use_lock); + arptr->an_ctxchain_next = NULL; + if ((areq = ictx->kc_req_chain_last) == NULL) { + arptr->an_is_my_turn = B_TRUE; + ictx->kc_req_chain_last = + ictx->kc_req_chain_first = arptr; + } else { + ASSERT(ictx->kc_req_chain_first != NULL); + arptr->an_is_my_turn = B_FALSE; + /* Insert the new request to the end of the chain. */ + areq->an_ctxchain_next = arptr; + ictx->kc_req_chain_last = arptr; + } + mutex_exit(&ictx->kc_in_use_lock); + + return (arptr); +} + +/* + * Queue the request node and do one of the following: + * - If there is an idle thread signal it to run. + * - If there is no idle thread and max running threads is not + * reached, signal the creator thread for more threads. + * + * If the two conditions above are not met, we don't need to do + * anything. The request will be picked up by one of the + * worker threads when it becomes available. + */ +static int +kcf_disp_sw_request(kcf_areq_node_t *areq) +{ + int err; + int cnt = 0; + + if ((err = kcf_enqueue(areq)) != 0) + return (err); + + if (kcfpool->kp_idlethreads > 0) { + /* Signal an idle thread to run */ + mutex_enter(&gswq->gs_lock); + cv_signal(&gswq->gs_cv); + mutex_exit(&gswq->gs_lock); + + return (CRYPTO_QUEUED); + } + + /* + * We keep the number of running threads to be at + * kcf_minthreads to reduce gs_lock contention. + */ + cnt = kcf_minthreads - + (kcfpool->kp_threads - kcfpool->kp_blockedthreads); + if (cnt > 0) { + /* + * The following ensures the number of threads in pool + * does not exceed kcf_maxthreads. 
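The dispatch policy in kcf_disp_sw_request() -- wake an idle worker if one exists, otherwise ask a creator thread for at most the number of workers the pool may still grow by -- looks like this in a simplified userspace sketch (invented pool_t; pthreads in place of kmutex/kcondvar; the real code additionally aims for kcf_minthreads runnable threads):

    #include <pthread.h>

    typedef struct pool {
            pthread_mutex_t p_lock;
            pthread_cond_t  p_work_cv;      /* idle workers wait here */
            pthread_cond_t  p_create_cv;    /* creator thread waits here */
            int             p_idle;
            int             p_threads;
            int             p_max;
            int             p_wanted;       /* threads asked of the creator */
    } pool_t;

    void
    dispatch_work(pool_t *pp)
    {
            pthread_mutex_lock(&pp->p_lock);
            if (pp->p_idle > 0) {
                    pthread_cond_signal(&pp->p_work_cv);
            } else if (pp->p_threads < pp->p_max) {
                    /* cap the request at what the pool may still grow by */
                    pp->p_wanted = pp->p_max - pp->p_threads;
                    pthread_cond_signal(&pp->p_create_cv);
            }
            pthread_mutex_unlock(&pp->p_lock);
    }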
+ */ + cnt = MIN(cnt, kcf_maxthreads - (int)kcfpool->kp_threads); + if (cnt > 0) { + /* Signal the creator thread for more threads */ + mutex_enter(&kcfpool->kp_user_lock); + if (!kcfpool->kp_signal_create_thread) { + kcfpool->kp_signal_create_thread = B_TRUE; + kcfpool->kp_nthrs = cnt; + cv_signal(&kcfpool->kp_user_cv); + } + mutex_exit(&kcfpool->kp_user_lock); + } + } + + return (CRYPTO_QUEUED); +} + +/* + * This routine is called by the taskq associated with + * each hardware provider. We notify the kernel consumer + * via the callback routine in case of CRYPTO_SUCCESS or + * a failure. + * + * A request can be of type kcf_areq_node_t or of type + * kcf_sreq_node_t. + */ +static void +process_req_hwp(void *ireq) +{ + int error = 0; + crypto_ctx_t *ctx; + kcf_call_type_t ctype; + kcf_provider_desc_t *pd; + kcf_areq_node_t *areq = (kcf_areq_node_t *)ireq; + kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)ireq; + + pd = ((ctype = GET_REQ_TYPE(ireq)) == CRYPTO_SYNCH) ? + sreq->sn_provider : areq->an_provider; + + /* + * Wait if flow control is in effect for the provider. A + * CRYPTO_PROVIDER_READY or CRYPTO_PROVIDER_FAILED + * notification will signal us. We also get signaled if + * the provider is unregistering. + */ + if (pd->pd_state == KCF_PROV_BUSY) { + mutex_enter(&pd->pd_lock); + while (pd->pd_state == KCF_PROV_BUSY) + cv_wait(&pd->pd_resume_cv, &pd->pd_lock); + mutex_exit(&pd->pd_lock); + } + + /* + * Bump the internal reference count while the request is being + * processed. This is how we know when it's safe to unregister + * a provider. This step must precede the pd_state check below. + */ + KCF_PROV_IREFHOLD(pd); + + /* + * Fail the request if the provider has failed. We return a + * recoverable error and the notified clients attempt any + * recovery. For async clients this is done in kcf_aop_done() + * and for sync clients it is done in the k-api routines. + */ + if (pd->pd_state >= KCF_PROV_FAILED) { + error = CRYPTO_DEVICE_ERROR; + goto bail; + } + + if (ctype == CRYPTO_SYNCH) { + mutex_enter(&sreq->sn_lock); + sreq->sn_state = REQ_INPROGRESS; + mutex_exit(&sreq->sn_lock); + + ctx = sreq->sn_context ? &sreq->sn_context->kc_glbl_ctx : NULL; + error = common_submit_request(sreq->sn_provider, ctx, + sreq->sn_params, sreq); + } else { + kcf_context_t *ictx; + ASSERT(ctype == CRYPTO_ASYNCH); + + /* + * We are in the per-hardware provider thread context and + * hence can sleep. Note that the caller would have done + * a taskq_dispatch(..., TQ_NOSLEEP) and would have returned. + */ + ctx = (ictx = areq->an_context) ? &ictx->kc_glbl_ctx : NULL; + + mutex_enter(&areq->an_lock); + /* + * We need to maintain ordering for multi-part requests. + * an_is_my_turn is set to B_TRUE initially for a request + * when it is enqueued and there are no other requests + * for that context. It is set later from kcf_aop_done() when + * the request before us in the chain of requests for the + * context completes. We get signaled at that point. + */ + if (ictx != NULL) { + ASSERT(ictx->kc_prov_desc == areq->an_provider); + + while (areq->an_is_my_turn == B_FALSE) { + cv_wait(&areq->an_turn_cv, &areq->an_lock); + } + } + areq->an_state = REQ_INPROGRESS; + mutex_exit(&areq->an_lock); + + error = common_submit_request(areq->an_provider, ctx, + &areq->an_params, areq); + } + +bail: + if (error == CRYPTO_QUEUED) { + /* + * The request is queued by the provider and we should + * get a crypto_op_notification() from the provider later. + * We notify the consumer at that time. 
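The flow-control gate at the top of process_req_hwp() is the classic "cheap unlocked check, then recheck under the lock in a loop" condition-variable pattern. A compact userspace rendering (invented struct dev; pthreads for kmutex/kcondvar):

    #include <pthread.h>

    enum dstate { DEV_READY, DEV_BUSY, DEV_FAILED };

    struct dev {
            pthread_mutex_t d_lock;
            pthread_cond_t  d_resume_cv;    /* broadcast on state change */
            enum dstate     d_state;
    };

    static void
    wait_while_busy(struct dev *dp)
    {
            if (dp->d_state != DEV_BUSY)    /* cheap unlocked peek */
                    return;
            pthread_mutex_lock(&dp->d_lock);
            while (dp->d_state == DEV_BUSY)
                    pthread_cond_wait(&dp->d_resume_cv, &dp->d_lock);
            pthread_mutex_unlock(&dp->d_lock);
    }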
+ */
+ return;
+ } else { /* CRYPTO_SUCCESS or other failure */
+ KCF_PROV_IREFRELE(pd);
+ if (ctype == CRYPTO_SYNCH)
+ kcf_sop_done(sreq, error);
+ else
+ kcf_aop_done(areq, error);
+ }
+}
+
+/*
+ * This routine checks if a request can be retried on another
+ * provider. If true, mech1 is initialized to point to the mechanism
+ * structure. mech2 is also initialized in case of a dual operation. fg
+ * is initialized to the correct crypto_func_group_t bit flag. They are
+ * initialized by this routine, so that the caller can pass them to
+ * kcf_get_mech_provider() or kcf_get_dual_provider() with no further change.
+ *
+ * We check that the request is for an init or atomic routine and that
+ * it is for one of the operation groups used from the k-api.
+ */
+static boolean_t
+can_resubmit(kcf_areq_node_t *areq, crypto_mechanism_t **mech1,
+ crypto_mechanism_t **mech2, crypto_func_group_t *fg)
+{
+ kcf_req_params_t *params;
+ kcf_op_type_t optype;
+
+ params = &areq->an_params;
+ optype = params->rp_optype;
+
+ if (!(IS_INIT_OP(optype) || IS_ATOMIC_OP(optype)))
+ return (B_FALSE);
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_DIGEST: {
+ kcf_digest_ops_params_t *dops = &params->rp_u.digest_params;
+
+ dops->do_mech.cm_type = dops->do_framework_mechtype;
+ *mech1 = &dops->do_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_DIGEST :
+ CRYPTO_FG_DIGEST_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &params->rp_u.mac_params;
+
+ mops->mo_mech.cm_type = mops->mo_framework_mechtype;
+ *mech1 = &mops->mo_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_MAC :
+ CRYPTO_FG_MAC_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_SIGN: {
+ kcf_sign_ops_params_t *sops = &params->rp_u.sign_params;
+
+ sops->so_mech.cm_type = sops->so_framework_mechtype;
+ *mech1 = &sops->so_mech;
+ switch (optype) {
+ case KCF_OP_INIT:
+ *fg = CRYPTO_FG_SIGN;
+ break;
+ case KCF_OP_ATOMIC:
+ *fg = CRYPTO_FG_SIGN_ATOMIC;
+ break;
+ default:
+ ASSERT(optype == KCF_OP_SIGN_RECOVER_ATOMIC);
+ *fg = CRYPTO_FG_SIGN_RECOVER_ATOMIC;
+ }
+ break;
+ }
+
+ case KCF_OG_VERIFY: {
+ kcf_verify_ops_params_t *vops = &params->rp_u.verify_params;
+
+ vops->vo_mech.cm_type = vops->vo_framework_mechtype;
+ *mech1 = &vops->vo_mech;
+ switch (optype) {
+ case KCF_OP_INIT:
+ *fg = CRYPTO_FG_VERIFY;
+ break;
+ case KCF_OP_ATOMIC:
+ *fg = CRYPTO_FG_VERIFY_ATOMIC;
+ break;
+ default:
+ ASSERT(optype == KCF_OP_VERIFY_RECOVER_ATOMIC);
+ *fg = CRYPTO_FG_VERIFY_RECOVER_ATOMIC;
+ }
+ break;
+ }
+
+ case KCF_OG_ENCRYPT: {
+ kcf_encrypt_ops_params_t *eops = &params->rp_u.encrypt_params;
+
+ eops->eo_mech.cm_type = eops->eo_framework_mechtype;
+ *mech1 = &eops->eo_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_ENCRYPT :
+ CRYPTO_FG_ENCRYPT_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops = &params->rp_u.decrypt_params;
+
+ dcrops->dop_mech.cm_type = dcrops->dop_framework_mechtype;
+ *mech1 = &dcrops->dop_mech;
+ *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_DECRYPT :
+ CRYPTO_FG_DECRYPT_ATOMIC;
+ break;
+ }
+
+ case KCF_OG_ENCRYPT_MAC: {
+ kcf_encrypt_mac_ops_params_t *eops =
+ &params->rp_u.encrypt_mac_params;
+
+ eops->em_encr_mech.cm_type = eops->em_framework_encr_mechtype;
+ *mech1 = &eops->em_encr_mech;
+ eops->em_mac_mech.cm_type = eops->em_framework_mac_mechtype;
+ *mech2 = &eops->em_mac_mech;
+ *fg = (optype == KCF_OP_INIT) ?
CRYPTO_FG_ENCRYPT_MAC : + CRYPTO_FG_ENCRYPT_MAC_ATOMIC; + break; + } + + case KCF_OG_MAC_DECRYPT: { + kcf_mac_decrypt_ops_params_t *dops = + ¶ms->rp_u.mac_decrypt_params; + + dops->md_mac_mech.cm_type = dops->md_framework_mac_mechtype; + *mech1 = &dops->md_mac_mech; + dops->md_decr_mech.cm_type = dops->md_framework_decr_mechtype; + *mech2 = &dops->md_decr_mech; + *fg = (optype == KCF_OP_INIT) ? CRYPTO_FG_MAC_DECRYPT : + CRYPTO_FG_MAC_DECRYPT_ATOMIC; + break; + } + + default: + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * This routine is called when a request to a provider has failed + * with a recoverable error. This routine tries to find another provider + * and dispatches the request to the new provider, if one is available. + * We reuse the request structure. + * + * A return value of NULL from kcf_get_mech_provider() indicates + * we have tried the last provider. + */ +static int +kcf_resubmit_request(kcf_areq_node_t *areq) +{ + int error = CRYPTO_FAILED; + kcf_context_t *ictx; + kcf_provider_desc_t *old_pd; + kcf_provider_desc_t *new_pd; + crypto_mechanism_t *mech1 = NULL, *mech2 = NULL; + crypto_mech_type_t prov_mt1, prov_mt2; + crypto_func_group_t fg = 0; + + if (!can_resubmit(areq, &mech1, &mech2, &fg)) + return (error); + + old_pd = areq->an_provider; + /* + * Add old_pd to the list of providers already tried. We release + * the hold on old_pd (from the earlier kcf_get_mech_provider()) in + * kcf_free_triedlist(). + */ + if (kcf_insert_triedlist(&areq->an_tried_plist, old_pd, + KM_NOSLEEP) == NULL) + return (error); + + if (mech1 && !mech2) { + new_pd = kcf_get_mech_provider(mech1->cm_type, NULL, &error, + areq->an_tried_plist, fg, + (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), 0); + } else { + ASSERT(mech1 != NULL && mech2 != NULL); + + new_pd = kcf_get_dual_provider(mech1, mech2, NULL, &prov_mt1, + &prov_mt2, &error, areq->an_tried_plist, fg, fg, + (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), 0); + } + + if (new_pd == NULL) + return (error); + + /* + * We reuse the old context by resetting provider specific + * fields in it. + */ + if ((ictx = areq->an_context) != NULL) { + crypto_ctx_t *ctx; + + ASSERT(old_pd == ictx->kc_prov_desc); + KCF_PROV_REFRELE(ictx->kc_prov_desc); + KCF_PROV_REFHOLD(new_pd); + ictx->kc_prov_desc = new_pd; + + ctx = &ictx->kc_glbl_ctx; + ctx->cc_provider = new_pd->pd_prov_handle; + ctx->cc_session = new_pd->pd_sid; + ctx->cc_provider_private = NULL; + } + + /* We reuse areq. by resetting the provider and context fields. */ + KCF_PROV_REFRELE(old_pd); + KCF_PROV_REFHOLD(new_pd); + areq->an_provider = new_pd; + mutex_enter(&areq->an_lock); + areq->an_state = REQ_WAITING; + mutex_exit(&areq->an_lock); + + switch (new_pd->pd_prov_type) { + case CRYPTO_SW_PROVIDER: + error = kcf_disp_sw_request(areq); + break; + + case CRYPTO_HW_PROVIDER: { + taskq_t *taskq = new_pd->pd_sched_info.ks_taskq; + + if (taskq_dispatch(taskq, process_req_hwp, areq, TQ_NOSLEEP) == + TASKQID_INVALID) { + error = CRYPTO_HOST_MEMORY; + } else { + error = CRYPTO_QUEUED; + } + + break; + default: + break; + } + } + + return (error); +} + +static inline int EMPTY_TASKQ(taskq_t *tq) +{ +#ifdef _KERNEL + return (tq->tq_lowest_id == tq->tq_next_id); +#else + return (tq->tq_task.tqent_next == &tq->tq_task || tq->tq_active == 0); +#endif +} + +/* + * Routine called by both ioctl and k-api. The consumer should + * bundle the parameters into a kcf_req_params_t structure. A bunch + * of macros are available in ops_impl.h for this bundling. 
They are: + * + * KCF_WRAP_DIGEST_OPS_PARAMS() + * KCF_WRAP_MAC_OPS_PARAMS() + * KCF_WRAP_ENCRYPT_OPS_PARAMS() + * KCF_WRAP_DECRYPT_OPS_PARAMS() ... etc. + * + * It is the caller's responsibility to free the ctx argument when + * appropriate. See the KCF_CONTEXT_COND_RELEASE macro for details. + */ +int +kcf_submit_request(kcf_provider_desc_t *pd, crypto_ctx_t *ctx, + crypto_call_req_t *crq, kcf_req_params_t *params, boolean_t cont) +{ + int error = CRYPTO_SUCCESS; + kcf_areq_node_t *areq; + kcf_sreq_node_t *sreq; + kcf_context_t *kcf_ctx; + taskq_t *taskq = pd->pd_sched_info.ks_taskq; + + kcf_ctx = ctx ? (kcf_context_t *)ctx->cc_framework_private : NULL; + + /* Synchronous cases */ + if (crq == NULL) { + switch (pd->pd_prov_type) { + case CRYPTO_SW_PROVIDER: + error = common_submit_request(pd, ctx, params, + KCF_RHNDL(KM_SLEEP)); + break; + + case CRYPTO_HW_PROVIDER: + /* + * Special case for CRYPTO_SYNCHRONOUS providers that + * never return a CRYPTO_QUEUED error. We skip any + * request allocation and call the SPI directly. + */ + if ((pd->pd_flags & CRYPTO_SYNCHRONOUS) && + EMPTY_TASKQ(taskq)) { + KCF_PROV_IREFHOLD(pd); + if (pd->pd_state == KCF_PROV_READY) { + error = common_submit_request(pd, ctx, + params, KCF_RHNDL(KM_SLEEP)); + KCF_PROV_IREFRELE(pd); + ASSERT(error != CRYPTO_QUEUED); + break; + } + KCF_PROV_IREFRELE(pd); + } + + sreq = kmem_cache_alloc(kcf_sreq_cache, KM_SLEEP); + sreq->sn_state = REQ_ALLOCATED; + sreq->sn_rv = CRYPTO_FAILED; + sreq->sn_params = params; + + /* + * Note that we do not need to hold the context + * for synchronous case as the context will never + * become invalid underneath us. We do not need to hold + * the provider here either as the caller has a hold. + */ + sreq->sn_context = kcf_ctx; + ASSERT(KCF_PROV_REFHELD(pd)); + sreq->sn_provider = pd; + + ASSERT(taskq != NULL); + /* + * Call the SPI directly if the taskq is empty and the + * provider is not busy, else dispatch to the taskq. + * Calling directly is fine as this is the synchronous + * case. This is unlike the asynchronous case where we + * must always dispatch to the taskq. + */ + if (EMPTY_TASKQ(taskq) && + pd->pd_state == KCF_PROV_READY) { + process_req_hwp(sreq); + } else { + /* + * We can not tell from taskq_dispatch() return + * value if we exceeded maxalloc. Hence the + * check here. Since we are allowed to wait in + * the synchronous case, we wait for the taskq + * to become empty. + */ + if (taskq->tq_nalloc >= crypto_taskq_maxalloc) { + taskq_wait(taskq); + } + + (void) taskq_dispatch(taskq, process_req_hwp, + sreq, TQ_SLEEP); + } + + /* + * Wait for the notification to arrive, + * if the operation is not done yet. + * Bug# 4722589 will make the wait a cv_wait_sig(). + */ + mutex_enter(&sreq->sn_lock); + while (sreq->sn_state < REQ_DONE) + cv_wait(&sreq->sn_cv, &sreq->sn_lock); + mutex_exit(&sreq->sn_lock); + + error = sreq->sn_rv; + kmem_cache_free(kcf_sreq_cache, sreq); + + break; + + default: + error = CRYPTO_FAILED; + break; + } + + } else { /* Asynchronous cases */ + switch (pd->pd_prov_type) { + case CRYPTO_SW_PROVIDER: + if (!(crq->cr_flag & CRYPTO_ALWAYS_QUEUE)) { + /* + * This case has less overhead since there is + * no switching of context. + */ + error = common_submit_request(pd, ctx, params, + KCF_RHNDL(KM_NOSLEEP)); + } else { + /* + * CRYPTO_ALWAYS_QUEUE is set. We need to + * queue the request and return. 
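The synchronous CRYPTO_HW_PROVIDER branch above boils down to: run inline when the queue is empty, otherwise enqueue and sleep on a per-request condition variable until a worker marks the request done. A minimal userspace sketch of that shape (invented struct req and callbacks; not the ICP's actual types):

    #include <pthread.h>
    #include <stdbool.h>

    struct req {
            pthread_mutex_t r_lock;
            pthread_cond_t  r_cv;
            bool            r_done;         /* set by the worker */
            int             r_rv;           /* result, set by the worker */
    };

    int
    submit_sync(struct req *rp, bool queue_empty,
        int (*do_op)(struct req *), void (*enqueue)(struct req *))
    {
            if (queue_empty)
                    return (do_op(rp));     /* fast path: no context switch */

            enqueue(rp);                    /* worker runs do_op, then
                                               sets r_done and signals */
            pthread_mutex_lock(&rp->r_lock);
            while (!rp->r_done)
                    pthread_cond_wait(&rp->r_cv, &rp->r_lock);
            pthread_mutex_unlock(&rp->r_lock);
            return (rp->r_rv);
    }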
+ */ + areq = kcf_areqnode_alloc(pd, kcf_ctx, crq, + params, cont); + if (areq == NULL) + error = CRYPTO_HOST_MEMORY; + else { + if (!(crq->cr_flag + & CRYPTO_SKIP_REQID)) { + /* + * Set the request handle. This handle + * is used for any crypto_cancel_req(9f) + * calls from the consumer. We have to + * do this before dispatching the + * request. + */ + crq->cr_reqid = kcf_reqid_insert(areq); + } + + error = kcf_disp_sw_request(areq); + /* + * There is an error processing this + * request. Remove the handle and + * release the request structure. + */ + if (error != CRYPTO_QUEUED) { + if (!(crq->cr_flag + & CRYPTO_SKIP_REQID)) + kcf_reqid_delete(areq); + KCF_AREQ_REFRELE(areq); + } + } + } + break; + + case CRYPTO_HW_PROVIDER: + /* + * We need to queue the request and return. + */ + areq = kcf_areqnode_alloc(pd, kcf_ctx, crq, params, + cont); + if (areq == NULL) { + error = CRYPTO_HOST_MEMORY; + goto done; + } + + ASSERT(taskq != NULL); + /* + * We can not tell from taskq_dispatch() return + * value if we exceeded maxalloc. Hence the check + * here. + */ + if (taskq->tq_nalloc >= crypto_taskq_maxalloc) { + error = CRYPTO_BUSY; + KCF_AREQ_REFRELE(areq); + goto done; + } + + if (!(crq->cr_flag & CRYPTO_SKIP_REQID)) { + /* + * Set the request handle. This handle is used + * for any crypto_cancel_req(9f) calls from the + * consumer. We have to do this before dispatching + * the request. + */ + crq->cr_reqid = kcf_reqid_insert(areq); + } + + if (taskq_dispatch(taskq, + process_req_hwp, areq, TQ_NOSLEEP) == + TASKQID_INVALID) { + error = CRYPTO_HOST_MEMORY; + if (!(crq->cr_flag & CRYPTO_SKIP_REQID)) + kcf_reqid_delete(areq); + KCF_AREQ_REFRELE(areq); + } else { + error = CRYPTO_QUEUED; + } + break; + + default: + error = CRYPTO_FAILED; + break; + } + } + +done: + return (error); +} + +/* + * We're done with this framework context, so free it. Note that freeing + * framework context (kcf_context) frees the global context (crypto_ctx). + * + * The provider is responsible for freeing provider private context after a + * final or single operation and resetting the cc_provider_private field + * to NULL. It should do this before it notifies the framework of the + * completion. We still need to call KCF_PROV_FREE_CONTEXT to handle cases + * like crypto_cancel_ctx(9f). + */ +void +kcf_free_context(kcf_context_t *kcf_ctx) +{ + kcf_provider_desc_t *pd = kcf_ctx->kc_prov_desc; + crypto_ctx_t *gctx = &kcf_ctx->kc_glbl_ctx; + kcf_context_t *kcf_secondctx = kcf_ctx->kc_secondctx; + + /* Release the second context, if any */ + + if (kcf_secondctx != NULL) + KCF_CONTEXT_REFRELE(kcf_secondctx); + + if (gctx->cc_provider_private != NULL) { + mutex_enter(&pd->pd_lock); + if (!KCF_IS_PROV_REMOVED(pd)) { + /* + * Increment the provider's internal refcnt so it + * doesn't unregister from the framework while + * we're calling the entry point. + */ + KCF_PROV_IREFHOLD(pd); + mutex_exit(&pd->pd_lock); + (void) KCF_PROV_FREE_CONTEXT(pd, gctx); + KCF_PROV_IREFRELE(pd); + } else { + mutex_exit(&pd->pd_lock); + } + } + + /* kcf_ctx->kc_prov_desc has a hold on pd */ + KCF_PROV_REFRELE(kcf_ctx->kc_prov_desc); + + /* check if this context is shared with a software provider */ + if ((gctx->cc_flags & CRYPTO_INIT_OPSTATE) && + kcf_ctx->kc_sw_prov_desc != NULL) { + KCF_PROV_REFRELE(kcf_ctx->kc_sw_prov_desc); + } + + kmem_cache_free(kcf_context_cache, kcf_ctx); +} + +/* + * Free the request after releasing all the holds. 
+ */ +void +kcf_free_req(kcf_areq_node_t *areq) +{ + KCF_PROV_REFRELE(areq->an_provider); + if (areq->an_context != NULL) + KCF_CONTEXT_REFRELE(areq->an_context); + + if (areq->an_tried_plist != NULL) + kcf_free_triedlist(areq->an_tried_plist); + kmem_cache_free(kcf_areq_cache, areq); +} + +/* + * Utility routine to remove a request from the chain of requests + * hanging off a context. + */ +static void +kcf_removereq_in_ctxchain(kcf_context_t *ictx, kcf_areq_node_t *areq) +{ + kcf_areq_node_t *cur, *prev; + + /* + * Get context lock, search for areq in the chain and remove it. + */ + ASSERT(ictx != NULL); + mutex_enter(&ictx->kc_in_use_lock); + prev = cur = ictx->kc_req_chain_first; + + while (cur != NULL) { + if (cur == areq) { + if (prev == cur) { + if ((ictx->kc_req_chain_first = + cur->an_ctxchain_next) == NULL) + ictx->kc_req_chain_last = NULL; + } else { + if (cur == ictx->kc_req_chain_last) + ictx->kc_req_chain_last = prev; + prev->an_ctxchain_next = cur->an_ctxchain_next; + } + + break; + } + prev = cur; + cur = cur->an_ctxchain_next; + } + mutex_exit(&ictx->kc_in_use_lock); +} + +/* + * Remove the specified node from the global software queue. + * + * The caller must hold the queue lock and request lock (an_lock). + */ +static void +kcf_remove_node(kcf_areq_node_t *node) +{ + kcf_areq_node_t *nextp = node->an_next; + kcf_areq_node_t *prevp = node->an_prev; + + if (nextp != NULL) + nextp->an_prev = prevp; + else + gswq->gs_last = prevp; + + if (prevp != NULL) + prevp->an_next = nextp; + else + gswq->gs_first = nextp; + + node->an_state = REQ_CANCELED; +} + +/* + * Add the request node to the end of the global software queue. + * + * The caller should not hold the queue lock. Returns 0 if the + * request is successfully queued. Returns CRYPTO_BUSY if the limit + * on the number of jobs is exceeded. + */ +static int +kcf_enqueue(kcf_areq_node_t *node) +{ + kcf_areq_node_t *tnode; + + mutex_enter(&gswq->gs_lock); + + if (gswq->gs_njobs >= gswq->gs_maxjobs) { + mutex_exit(&gswq->gs_lock); + return (CRYPTO_BUSY); + } + + if (gswq->gs_last == NULL) { + gswq->gs_first = gswq->gs_last = node; + } else { + ASSERT(gswq->gs_last->an_next == NULL); + tnode = gswq->gs_last; + tnode->an_next = node; + gswq->gs_last = node; + node->an_prev = tnode; + } + + gswq->gs_njobs++; + + /* an_lock not needed here as we hold gs_lock */ + node->an_state = REQ_WAITING; + + mutex_exit(&gswq->gs_lock); + + return (0); +} + +/* + * kmem_cache_alloc constructor for sync request structure. + */ +/* ARGSUSED */ +static int +kcf_sreq_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)buf; + + sreq->sn_type = CRYPTO_SYNCH; + cv_init(&sreq->sn_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&sreq->sn_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +/* ARGSUSED */ +static void +kcf_sreq_cache_destructor(void *buf, void *cdrarg) +{ + kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)buf; + + mutex_destroy(&sreq->sn_lock); + cv_destroy(&sreq->sn_cv); +} + +/* + * kmem_cache_alloc constructor for async request structure. 
+ */ +/* ARGSUSED */ +static int +kcf_areq_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + kcf_areq_node_t *areq = (kcf_areq_node_t *)buf; + + areq->an_type = CRYPTO_ASYNCH; + areq->an_refcnt = 0; + mutex_init(&areq->an_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&areq->an_done, NULL, CV_DEFAULT, NULL); + cv_init(&areq->an_turn_cv, NULL, CV_DEFAULT, NULL); + + return (0); +} + +/* ARGSUSED */ +static void +kcf_areq_cache_destructor(void *buf, void *cdrarg) +{ + kcf_areq_node_t *areq = (kcf_areq_node_t *)buf; + + ASSERT(areq->an_refcnt == 0); + mutex_destroy(&areq->an_lock); + cv_destroy(&areq->an_done); + cv_destroy(&areq->an_turn_cv); +} + +/* + * kmem_cache_alloc constructor for kcf_context structure. + */ +/* ARGSUSED */ +static int +kcf_context_cache_constructor(void *buf, void *cdrarg, int kmflags) +{ + kcf_context_t *kctx = (kcf_context_t *)buf; + + kctx->kc_refcnt = 0; + mutex_init(&kctx->kc_in_use_lock, NULL, MUTEX_DEFAULT, NULL); + + return (0); +} + +/* ARGSUSED */ +static void +kcf_context_cache_destructor(void *buf, void *cdrarg) +{ + kcf_context_t *kctx = (kcf_context_t *)buf; + + ASSERT(kctx->kc_refcnt == 0); + mutex_destroy(&kctx->kc_in_use_lock); +} + +void +kcf_sched_destroy(void) +{ + int i; + + if (kcf_misc_kstat) + kstat_delete(kcf_misc_kstat); + + if (kcfpool) { + mutex_destroy(&kcfpool->kp_thread_lock); + cv_destroy(&kcfpool->kp_nothr_cv); + mutex_destroy(&kcfpool->kp_user_lock); + cv_destroy(&kcfpool->kp_user_cv); + + kmem_free(kcfpool, sizeof (kcf_pool_t)); + } + + for (i = 0; i < REQID_TABLES; i++) { + if (kcf_reqid_table[i]) { + mutex_destroy(&(kcf_reqid_table[i]->rt_lock)); + kmem_free(kcf_reqid_table[i], + sizeof (kcf_reqid_table_t)); + } + } + + if (gswq) { + mutex_destroy(&gswq->gs_lock); + cv_destroy(&gswq->gs_cv); + kmem_free(gswq, sizeof (kcf_global_swq_t)); + } + + if (kcf_context_cache) + kmem_cache_destroy(kcf_context_cache); + if (kcf_areq_cache) + kmem_cache_destroy(kcf_areq_cache); + if (kcf_sreq_cache) + kmem_cache_destroy(kcf_sreq_cache); + + mutex_destroy(&ntfy_list_lock); + cv_destroy(&ntfy_list_cv); +} + +/* + * Creates and initializes all the structures needed by the framework. + */ +void +kcf_sched_init(void) +{ + int i; + kcf_reqid_table_t *rt; + + /* + * Create all the kmem caches needed by the framework. We set the + * align argument to 64, to get a slab aligned to 64-byte as well as + * have the objects (cache_chunksize) to be a 64-byte multiple. + * This helps to avoid false sharing as this is the size of the + * CPU cache line. 
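As an aside, the false-sharing concern described above is easy to demonstrate: two counters that share a cache line force the hardware to bounce that line between cores on every write. A minimal C11 sketch of the cure, under the same assumption of 64-byte cache lines as the align=64 caches:

    #include <stdalign.h>

    /*
     * Each counter gets its own 64-byte cache line, so writes by one
     * thread do not invalidate the line the other thread is using.
     */
    struct padded_ctr {
            alignas(64) unsigned long pc_count;
    };

    static struct padded_ctr ctrs[2];   /* one per worker thread */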
+ */
+ kcf_sreq_cache = kmem_cache_create("kcf_sreq_cache",
+ sizeof (struct kcf_sreq_node), 64, kcf_sreq_cache_constructor,
+ kcf_sreq_cache_destructor, NULL, NULL, NULL, 0);
+
+ kcf_areq_cache = kmem_cache_create("kcf_areq_cache",
+ sizeof (struct kcf_areq_node), 64, kcf_areq_cache_constructor,
+ kcf_areq_cache_destructor, NULL, NULL, NULL, 0);
+
+ kcf_context_cache = kmem_cache_create("kcf_context_cache",
+ sizeof (struct kcf_context), 64, kcf_context_cache_constructor,
+ kcf_context_cache_destructor, NULL, NULL, NULL, 0);
+
+ gswq = kmem_alloc(sizeof (kcf_global_swq_t), KM_SLEEP);
+
+ mutex_init(&gswq->gs_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&gswq->gs_cv, NULL, CV_DEFAULT, NULL);
+ gswq->gs_njobs = 0;
+ gswq->gs_maxjobs = kcf_maxthreads * crypto_taskq_maxalloc;
+ gswq->gs_first = gswq->gs_last = NULL;
+
+ /* Initialize the global reqid table */
+ for (i = 0; i < REQID_TABLES; i++) {
+ rt = kmem_zalloc(sizeof (kcf_reqid_table_t), KM_SLEEP);
+ kcf_reqid_table[i] = rt;
+ mutex_init(&rt->rt_lock, NULL, MUTEX_DEFAULT, NULL);
+ rt->rt_curid = i;
+ }
+
+ /* Allocate and initialize the thread pool */
+ kcfpool_alloc();
+
+ /* Initialize the event notification list variables */
+ mutex_init(&ntfy_list_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ntfy_list_cv, NULL, CV_DEFAULT, NULL);
+
+ /* Create the kcf kstat */
+ kcf_misc_kstat = kstat_create("kcf", 0, "framework_stats", "crypto",
+ KSTAT_TYPE_NAMED, sizeof (kcf_stats_t) / sizeof (kstat_named_t),
+ KSTAT_FLAG_VIRTUAL);
+
+ if (kcf_misc_kstat != NULL) {
+ kcf_misc_kstat->ks_data = &kcf_ksdata;
+ kcf_misc_kstat->ks_update = kcf_misc_kstat_update;
+ kstat_install(kcf_misc_kstat);
+ }
+}
+
+/*
+ * Signal the waiting sync client.
+ */
+void
+kcf_sop_done(kcf_sreq_node_t *sreq, int error)
+{
+ mutex_enter(&sreq->sn_lock);
+ sreq->sn_state = REQ_DONE;
+ sreq->sn_rv = error;
+ cv_signal(&sreq->sn_cv);
+ mutex_exit(&sreq->sn_lock);
+}
+
+/*
+ * Callback the async client with the operation status.
+ * We free the async request node and possibly the context.
+ * We also handle any chain of requests hanging off of
+ * the context.
+ */
+void
+kcf_aop_done(kcf_areq_node_t *areq, int error)
+{
+ kcf_op_type_t optype;
+ boolean_t skip_notify = B_FALSE;
+ kcf_context_t *ictx;
+ kcf_areq_node_t *nextreq;
+
+ /*
+ * Handle recoverable errors. This has to be done before
+ * doing anything else in this routine so that we do not
+ * change the state of the request.
+ */
+ if (error != CRYPTO_SUCCESS && IS_RECOVERABLE(error)) {
+ /*
+ * We try another provider, if one is available. Else
+ * we continue with the failure notification to the
+ * client.
+ */
+ if (kcf_resubmit_request(areq) == CRYPTO_QUEUED)
+ return;
+ }
+
+ mutex_enter(&areq->an_lock);
+ areq->an_state = REQ_DONE;
+ mutex_exit(&areq->an_lock);
+
+ optype = (&areq->an_params)->rp_optype;
+ if ((ictx = areq->an_context) != NULL) {
+ /*
+ * After a request is removed from the request
+ * queue, it still stays on a chain of requests hanging
+ * off its context structure. It needs to be removed
+ * from this chain at this point.
+ */ + mutex_enter(&ictx->kc_in_use_lock); + nextreq = areq->an_ctxchain_next; + if (nextreq != NULL) { + mutex_enter(&nextreq->an_lock); + nextreq->an_is_my_turn = B_TRUE; + cv_signal(&nextreq->an_turn_cv); + mutex_exit(&nextreq->an_lock); + } + + ictx->kc_req_chain_first = nextreq; + if (nextreq == NULL) + ictx->kc_req_chain_last = NULL; + mutex_exit(&ictx->kc_in_use_lock); + + if (IS_SINGLE_OP(optype) || IS_FINAL_OP(optype)) { + ASSERT(nextreq == NULL); + KCF_CONTEXT_REFRELE(ictx); + } else if (error != CRYPTO_SUCCESS && IS_INIT_OP(optype)) { + /* + * NOTE - We do not release the context in case of update + * operations. We require the consumer to free it explicitly, + * in case it wants to abandon an update operation. This is done + * as there may be mechanisms in ECB mode that can continue + * even if an operation on a block fails. + */ + KCF_CONTEXT_REFRELE(ictx); + } + } + + /* Deal with the internal continuation to this request first */ + + if (areq->an_isdual) { + kcf_dual_req_t *next_arg; + next_arg = (kcf_dual_req_t *)areq->an_reqarg.cr_callback_arg; + next_arg->kr_areq = areq; + KCF_AREQ_REFHOLD(areq); + areq->an_isdual = B_FALSE; + + NOTIFY_CLIENT(areq, error); + return; + } + + /* + * If CRYPTO_NOTIFY_OPDONE flag is set, we should notify + * always. If this flag is clear, we skip the notification + * provided there are no errors. We check this flag for only + * init or update operations. It is ignored for single, final or + * atomic operations. + */ + skip_notify = (IS_UPDATE_OP(optype) || IS_INIT_OP(optype)) && + (!(areq->an_reqarg.cr_flag & CRYPTO_NOTIFY_OPDONE)) && + (error == CRYPTO_SUCCESS); + + if (!skip_notify) { + NOTIFY_CLIENT(areq, error); + } + + if (!(areq->an_reqarg.cr_flag & CRYPTO_SKIP_REQID)) + kcf_reqid_delete(areq); + + KCF_AREQ_REFRELE(areq); +} + +/* + * Allocate the thread pool and initialize all the fields. + */ +static void +kcfpool_alloc() +{ + kcfpool = kmem_alloc(sizeof (kcf_pool_t), KM_SLEEP); + + kcfpool->kp_threads = kcfpool->kp_idlethreads = 0; + kcfpool->kp_blockedthreads = 0; + kcfpool->kp_signal_create_thread = B_FALSE; + kcfpool->kp_nthrs = 0; + kcfpool->kp_user_waiting = B_FALSE; + + mutex_init(&kcfpool->kp_thread_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&kcfpool->kp_nothr_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&kcfpool->kp_user_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&kcfpool->kp_user_cv, NULL, CV_DEFAULT, NULL); + + kcf_idlethr_timeout = KCF_DEFAULT_THRTIMEOUT; +} + +/* + * Insert the async request in the hash table after assigning it + * an ID. Returns the ID. + * + * The ID is used by the caller to pass as an argument to a + * cancel_req() routine later. + */ +static crypto_req_id_t +kcf_reqid_insert(kcf_areq_node_t *areq) +{ + int indx; + crypto_req_id_t id; + kcf_areq_node_t *headp; + kcf_reqid_table_t *rt; + + kpreempt_disable(); + rt = kcf_reqid_table[CPU_SEQID & REQID_TABLE_MASK]; + kpreempt_enable(); + + mutex_enter(&rt->rt_lock); + + rt->rt_curid = id = + (rt->rt_curid - REQID_COUNTER_LOW) | REQID_COUNTER_HIGH; + SET_REQID(areq, id); + indx = REQID_HASH(id); + headp = areq->an_idnext = rt->rt_idhash[indx]; + areq->an_idprev = NULL; + if (headp != NULL) + headp->an_idprev = areq; + + rt->rt_idhash[indx] = areq; + mutex_exit(&rt->rt_lock); + + return (id); +} + +/* + * Delete the async request from the hash table. 
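The id arithmetic in kcf_reqid_insert() packs three fields into one value: the low bits keep the table index (a subtraction whose operand has zero low bits cannot disturb them), the middle bits count down on each allocation, and the high bit marks the value as a live id. A simplified standalone version with invented constants, for illustration only:

    #include <stdint.h>

    #define ID_TABLE_MASK   0x3u            /* low bits: table index */
    #define ID_COUNTER_LOW  (1u << 2)       /* lowest counter bit */
    #define ID_COUNTER_HIGH (1u << 31)      /* "valid id" marker */

    /* curid starts at the table index; each call yields a fresh id. */
    static uint32_t
    next_id(uint32_t *curid)
    {
            *curid = (*curid - ID_COUNTER_LOW) | ID_COUNTER_HIGH;
            return (*curid);
    }

    /* A consumer recovers the owning table from the id's low bits. */
    static uint32_t
    id_table(uint32_t id)
    {
            return (id & ID_TABLE_MASK);
    }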
+ */ +static void +kcf_reqid_delete(kcf_areq_node_t *areq) +{ + int indx; + kcf_areq_node_t *nextp, *prevp; + crypto_req_id_t id = GET_REQID(areq); + kcf_reqid_table_t *rt; + + rt = kcf_reqid_table[id & REQID_TABLE_MASK]; + indx = REQID_HASH(id); + + mutex_enter(&rt->rt_lock); + + nextp = areq->an_idnext; + prevp = areq->an_idprev; + if (nextp != NULL) + nextp->an_idprev = prevp; + if (prevp != NULL) + prevp->an_idnext = nextp; + else + rt->rt_idhash[indx] = nextp; + + SET_REQID(areq, 0); + cv_broadcast(&areq->an_done); + + mutex_exit(&rt->rt_lock); +} + +/* + * Cancel a single asynchronous request. + * + * We guarantee that no problems will result from calling + * crypto_cancel_req() for a request which is either running, or + * has already completed. We remove the request from any queues + * if it is possible. We wait for request completion if the + * request is dispatched to a provider. + * + * Calling context: + * Can be called from user context only. + * + * NOTE: We acquire the following locks in this routine (in order): + * - rt_lock (kcf_reqid_table_t) + * - gswq->gs_lock + * - areq->an_lock + * - ictx->kc_in_use_lock (from kcf_removereq_in_ctxchain()) + * + * This locking order MUST be maintained in code every where else. + */ +void +crypto_cancel_req(crypto_req_id_t id) +{ + int indx; + kcf_areq_node_t *areq; + kcf_provider_desc_t *pd; + kcf_context_t *ictx; + kcf_reqid_table_t *rt; + + rt = kcf_reqid_table[id & REQID_TABLE_MASK]; + indx = REQID_HASH(id); + + mutex_enter(&rt->rt_lock); + for (areq = rt->rt_idhash[indx]; areq; areq = areq->an_idnext) { + if (GET_REQID(areq) == id) { + /* + * We found the request. It is either still waiting + * in the framework queues or running at the provider. + */ + pd = areq->an_provider; + ASSERT(pd != NULL); + + switch (pd->pd_prov_type) { + case CRYPTO_SW_PROVIDER: + mutex_enter(&gswq->gs_lock); + mutex_enter(&areq->an_lock); + + /* This request can be safely canceled. */ + if (areq->an_state <= REQ_WAITING) { + /* Remove from gswq, global software queue. */ + kcf_remove_node(areq); + if ((ictx = areq->an_context) != NULL) + kcf_removereq_in_ctxchain(ictx, areq); + + mutex_exit(&areq->an_lock); + mutex_exit(&gswq->gs_lock); + mutex_exit(&rt->rt_lock); + + /* Remove areq from hash table and free it. */ + kcf_reqid_delete(areq); + KCF_AREQ_REFRELE(areq); + return; + } + + mutex_exit(&areq->an_lock); + mutex_exit(&gswq->gs_lock); + break; + + case CRYPTO_HW_PROVIDER: + /* + * There is no interface to remove an entry + * once it is on the taskq. So, we do not do + * anything for a hardware provider. + */ + break; + default: + break; + } + + /* + * The request is running. Wait for the request completion + * to notify us. + */ + KCF_AREQ_REFHOLD(areq); + while (GET_REQID(areq) == id) + cv_wait(&areq->an_done, &rt->rt_lock); + KCF_AREQ_REFRELE(areq); + break; + } + } + + mutex_exit(&rt->rt_lock); +} + +/* + * Cancel all asynchronous requests associated with the + * passed in crypto context and free it. + * + * A client SHOULD NOT call this routine after calling a crypto_*_final + * routine. This routine is called only during intermediate operations. + * The client should not use the crypto context after this function returns + * since we destroy it. + * + * Calling context: + * Can be called from user context only. 
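The lock-ordering NOTE above is the standard deadlock-avoidance discipline: any code path that needs several of these locks must take them in the one documented order. Reduced to two pthread mutexes, the rule looks like this (names invented; the point is the fixed order, not the payload):

    #include <pthread.h>

    static pthread_mutex_t outer_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t inner_lock = PTHREAD_MUTEX_INITIALIZER;

    /*
     * Every caller that wants both locks goes through here, so the
     * acquisition order (outer, then inner) can never invert and two
     * threads can never each hold the lock the other is waiting for.
     */
    void
    with_both_locks(void (*fn)(void *), void *arg)
    {
            pthread_mutex_lock(&outer_lock);
            pthread_mutex_lock(&inner_lock);
            fn(arg);
            pthread_mutex_unlock(&inner_lock);
            pthread_mutex_unlock(&outer_lock);
    }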
+ */ +void +crypto_cancel_ctx(crypto_context_t ctx) +{ + kcf_context_t *ictx; + kcf_areq_node_t *areq; + + if (ctx == NULL) + return; + + ictx = (kcf_context_t *)((crypto_ctx_t *)ctx)->cc_framework_private; + + mutex_enter(&ictx->kc_in_use_lock); + + /* Walk the chain and cancel each request */ + while ((areq = ictx->kc_req_chain_first) != NULL) { + /* + * We have to drop the lock here as we may have + * to wait for request completion. We hold the + * request before dropping the lock though, so that it + * won't be freed underneath us. + */ + KCF_AREQ_REFHOLD(areq); + mutex_exit(&ictx->kc_in_use_lock); + + crypto_cancel_req(GET_REQID(areq)); + KCF_AREQ_REFRELE(areq); + + mutex_enter(&ictx->kc_in_use_lock); + } + + mutex_exit(&ictx->kc_in_use_lock); + KCF_CONTEXT_REFRELE(ictx); +} + +/* + * Update kstats. + */ +static int +kcf_misc_kstat_update(kstat_t *ksp, int rw) +{ + uint_t tcnt; + kcf_stats_t *ks_data; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ks_data = ksp->ks_data; + + ks_data->ks_thrs_in_pool.value.ui32 = kcfpool->kp_threads; + /* + * The failover thread is counted in kp_idlethreads in + * some corner cases. This is done to avoid doing more checks + * when submitting a request. We account for those cases below. + */ + if ((tcnt = kcfpool->kp_idlethreads) == (kcfpool->kp_threads + 1)) + tcnt--; + ks_data->ks_idle_thrs.value.ui32 = tcnt; + ks_data->ks_minthrs.value.ui32 = kcf_minthreads; + ks_data->ks_maxthrs.value.ui32 = kcf_maxthreads; + ks_data->ks_swq_njobs.value.ui32 = gswq->gs_njobs; + ks_data->ks_swq_maxjobs.value.ui32 = gswq->gs_maxjobs; + ks_data->ks_taskq_threads.value.ui32 = crypto_taskq_threads; + ks_data->ks_taskq_minalloc.value.ui32 = crypto_taskq_minalloc; + ks_data->ks_taskq_maxalloc.value.ui32 = crypto_taskq_maxalloc; + + return (0); +} + +/* + * Allocate and initialize a kcf_dual_req, used for saving the arguments of + * a dual operation or an atomic operation that has to be internally + * simulated with multiple single steps. + * crq determines the memory allocation flags. + */ + +kcf_dual_req_t * +kcf_alloc_req(crypto_call_req_t *crq) +{ + kcf_dual_req_t *kcr; + + kcr = kmem_alloc(sizeof (kcf_dual_req_t), KCF_KMFLAG(crq)); + + if (kcr == NULL) + return (NULL); + + /* Copy the whole crypto_call_req struct, as it isn't persistent */ + if (crq != NULL) + kcr->kr_callreq = *crq; + else + bzero(&(kcr->kr_callreq), sizeof (crypto_call_req_t)); + kcr->kr_areq = NULL; + kcr->kr_saveoffset = 0; + kcr->kr_savelen = 0; + + return (kcr); +} + +/* + * Callback routine for the next part of a simulated dual part. + * Schedules the next step. + * + * This routine can be called from interrupt context. + */ +void +kcf_next_req(void *next_req_arg, int status) +{ + kcf_dual_req_t *next_req = (kcf_dual_req_t *)next_req_arg; + kcf_req_params_t *params = &(next_req->kr_params); + kcf_areq_node_t *areq = next_req->kr_areq; + int error = status; + kcf_provider_desc_t *pd = NULL; + crypto_dual_data_t *ct = NULL; + + /* Stop the processing if an error occurred at this step */ + if (error != CRYPTO_SUCCESS) { +out: + areq->an_reqarg = next_req->kr_callreq; + KCF_AREQ_REFRELE(areq); + kmem_free(next_req, sizeof (kcf_dual_req_t)); + areq->an_isdual = B_FALSE; + kcf_aop_done(areq, error); + return; + } + + switch (params->rp_opgrp) { + case KCF_OG_MAC: { + + /* + * The next req is submitted with the same reqid as the + * first part. The consumer only got back that reqid, and + * should still be able to cancel the operation during its + * second step. 
+ */ + kcf_mac_ops_params_t *mops = &(params->rp_u.mac_params); + crypto_ctx_template_t mac_tmpl; + kcf_mech_entry_t *me; + + ct = (crypto_dual_data_t *)mops->mo_data; + mac_tmpl = (crypto_ctx_template_t)mops->mo_templ; + + /* No expected recoverable failures, so no retry list */ + pd = kcf_get_mech_provider(mops->mo_framework_mechtype, + &me, &error, NULL, CRYPTO_FG_MAC_ATOMIC, + (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), ct->dd_len2); + + if (pd == NULL) { + error = CRYPTO_MECH_NOT_SUPPORTED; + goto out; + } + /* Validate the MAC context template here */ + if ((pd->pd_prov_type == CRYPTO_SW_PROVIDER) && + (mac_tmpl != NULL)) { + kcf_ctx_template_t *ctx_mac_tmpl; + + ctx_mac_tmpl = (kcf_ctx_template_t *)mac_tmpl; + + if (ctx_mac_tmpl->ct_generation != me->me_gen_swprov) { + KCF_PROV_REFRELE(pd); + error = CRYPTO_OLD_CTX_TEMPLATE; + goto out; + } + mops->mo_templ = ctx_mac_tmpl->ct_prov_tmpl; + } + + break; + } + case KCF_OG_DECRYPT: { + kcf_decrypt_ops_params_t *dcrops = + &(params->rp_u.decrypt_params); + + ct = (crypto_dual_data_t *)dcrops->dop_ciphertext; + /* No expected recoverable failures, so no retry list */ + pd = kcf_get_mech_provider(dcrops->dop_framework_mechtype, + NULL, &error, NULL, CRYPTO_FG_DECRYPT_ATOMIC, + (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED), ct->dd_len1); + + if (pd == NULL) { + error = CRYPTO_MECH_NOT_SUPPORTED; + goto out; + } + break; + } + default: + break; + } + + /* The second step uses len2 and offset2 of the dual_data */ + next_req->kr_saveoffset = ct->dd_offset1; + next_req->kr_savelen = ct->dd_len1; + ct->dd_offset1 = ct->dd_offset2; + ct->dd_len1 = ct->dd_len2; + + /* preserve if the caller is restricted */ + if (areq->an_reqarg.cr_flag & CRYPTO_RESTRICTED) { + areq->an_reqarg.cr_flag = CRYPTO_RESTRICTED; + } else { + areq->an_reqarg.cr_flag = 0; + } + + areq->an_reqarg.cr_callback_func = kcf_last_req; + areq->an_reqarg.cr_callback_arg = next_req; + areq->an_isdual = B_TRUE; + + /* + * We would like to call kcf_submit_request() here. But, + * that is not possible as that routine allocates a new + * kcf_areq_node_t request structure, while we need to + * reuse the existing request structure. + */ + switch (pd->pd_prov_type) { + case CRYPTO_SW_PROVIDER: + error = common_submit_request(pd, NULL, params, + KCF_RHNDL(KM_NOSLEEP)); + break; + + case CRYPTO_HW_PROVIDER: { + kcf_provider_desc_t *old_pd; + taskq_t *taskq = pd->pd_sched_info.ks_taskq; + + /* + * Set the params for the second step in the + * dual-ops. + */ + areq->an_params = *params; + old_pd = areq->an_provider; + KCF_PROV_REFRELE(old_pd); + KCF_PROV_REFHOLD(pd); + areq->an_provider = pd; + + /* + * Note that we have to do a taskq_dispatch() + * here as we may be in interrupt context. + */ + if (taskq_dispatch(taskq, process_req_hwp, areq, + TQ_NOSLEEP) == (taskqid_t)0) { + error = CRYPTO_HOST_MEMORY; + } else { + error = CRYPTO_QUEUED; + } + break; + } + default: + break; + } + + /* + * We have to release the holds on the request and the provider + * in all cases. + */ + KCF_AREQ_REFRELE(areq); + KCF_PROV_REFRELE(pd); + + if (error != CRYPTO_QUEUED) { + /* restore, clean up, and invoke the client's callback */ + + ct->dd_offset1 = next_req->kr_saveoffset; + ct->dd_len1 = next_req->kr_savelen; + areq->an_reqarg = next_req->kr_callreq; + kmem_free(next_req, sizeof (kcf_dual_req_t)); + areq->an_isdual = B_FALSE; + kcf_aop_done(areq, error); + } +} + +/* + * Last part of an emulated dual operation. + * Clean up and restore ... 
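The save/swap/restore dance that kcf_next_req() and kcf_last_req() perform on the dual_data offsets is easier to see with the bookkeeping isolated. A standalone sketch (struct names invented; it mirrors dd_offset1/dd_len1 acting as the "active" view):

    #include <stddef.h>

    struct dual_view {
            size_t off1, len1;      /* active view (first part) */
            size_t off2, len2;      /* second part */
    };

    struct saved_view { size_t off, len; };

    /* Step 2 of an emulated dual op: point the active view at part 2. */
    static void
    enter_second_step(struct dual_view *d, struct saved_view *s)
    {
            s->off = d->off1;
            s->len = d->len1;
            d->off1 = d->off2;
            d->len1 = d->len2;
    }

    /* Final callback: put the caller's original view back. */
    static void
    restore_view(struct dual_view *d, const struct saved_view *s)
    {
            d->off1 = s->off;
            d->len1 = s->len;
    }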
+ */
+void
+kcf_last_req(void *last_req_arg, int status)
+{
+ kcf_dual_req_t *last_req = (kcf_dual_req_t *)last_req_arg;
+
+ kcf_req_params_t *params = &(last_req->kr_params);
+ kcf_areq_node_t *areq = last_req->kr_areq;
+ crypto_dual_data_t *ct = NULL;
+
+ switch (params->rp_opgrp) {
+ case KCF_OG_MAC: {
+ kcf_mac_ops_params_t *mops = &(params->rp_u.mac_params);
+
+ ct = (crypto_dual_data_t *)mops->mo_data;
+ break;
+ }
+ case KCF_OG_DECRYPT: {
+ kcf_decrypt_ops_params_t *dcrops =
+ &(params->rp_u.decrypt_params);
+
+ ct = (crypto_dual_data_t *)dcrops->dop_ciphertext;
+ break;
+ }
+ default: {
+ panic("invalid kcf_op_group_t %d", (int)params->rp_opgrp);
+ return;
+ }
+ }
+ ct->dd_offset1 = last_req->kr_saveoffset;
+ ct->dd_len1 = last_req->kr_savelen;
+
+ /* The submitter used kcf_last_req as its callback */
+
+ if (areq == NULL) {
+ crypto_call_req_t *cr = &last_req->kr_callreq;
+
+ (*(cr->cr_callback_func))(cr->cr_callback_arg, status);
+ kmem_free(last_req, sizeof (kcf_dual_req_t));
+ return;
+ }
+ areq->an_reqarg = last_req->kr_callreq;
+ KCF_AREQ_REFRELE(areq);
+ kmem_free(last_req, sizeof (kcf_dual_req_t));
+ areq->an_isdual = B_FALSE;
+ kcf_aop_done(areq, status);
+}
diff --git a/module/icp/illumos-crypto.c b/module/icp/illumos-crypto.c
new file mode 100644
index 000000000000..3c5ef4393940
--- /dev/null
+++ b/module/icp/illumos-crypto.c
@@ -0,0 +1,158 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#ifdef _KERNEL
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/init.h>
+#else
+#define __exit
+#define __init
+#endif
+
+#include <sys/crypto/common.h>
+#include <sys/crypto/api.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/sched_impl.h>
+#include <sys/modhash_impl.h>
+#include <sys/crypto/icp.h>
+
+/*
+ * Changes made to the original Illumos Crypto Layer for the ICP:
+ *
+ * Several changes were needed to allow the Illumos Crypto Layer
+ * to work in the Linux kernel. Almost all of the changes fall into
+ * one of the following categories:
+ *
+ * 1) Moving the syntax to C90: This was mostly a matter of
+ * changing func() definitions to func(void). In a few cases,
+ * initializations of structs with unions needed to have brackets
+ * added.
+ *
+ * 2) Changes to allow userspace compilation: The ICP is meant to be
+ * compiled and used in both userspace and kernel space (for ztest and
+ * libzfs), so the _KERNEL macros did not make sense anymore. For the
+ * same reason, many header includes were also changed to use
+ * sys/zfs_context.h
+ *
+ * 3) Moving to a statically compiled architecture: At some point in
+ * the future it may make sense to have encryption algorithms that are
+ * loadable into the ICP at runtime via separate kernel modules.
+ * However, considering that this code will probably not see much use
+ * outside of zfs, and zfs encryption only requires the aes and sha256
+ * algorithms, it seemed like more trouble than it was worth to port over
+ * Illumos's kernel module structure to a Linux kernel module. In
+ * addition, the Illumos code related to keeping track of kernel modules
+ * is very much tied to the Illumos OS and proved difficult to port to
+ * Linux. Therefore, the structure of the ICP was simplified to work
+ * statically and several pieces of code responsible for keeping track
+ * of Illumos kernel modules were removed and simplified. All module
+ * initialization and destruction is now called in this file during
+ * Linux kernel module loading and unloading.
+ *
+ * 4) Adding destructors: The Illumos Crypto Layer is built into
+ * the Illumos kernel and is not meant to be unloaded. Some destructors
+ * were added to allow the ICP to be unloaded without leaking
+ * structures.
+ *
+ * 5) Removing CRYPTO_DATA_MBLK related structures and code:
+ * crypto_data_t can have 3 formats, CRYPTO_DATA_RAW, CRYPTO_DATA_UIO,
+ * and CRYPTO_DATA_MBLK. ZFS only requires the first 2 formats, as the
+ * last one is related to streamed data. To simplify the port, code
+ * related to this format was removed.
+ *
+ * 6) Changes for architecture specific code: Some changes were needed
+ * to make architecture specific assembly compile. The biggest change
+ * here was to functions related to detecting CPU capabilities for amd64.
+ * The Illumos Crypto Layer used to call into the Illumos kernel's API
+ * to discover these. Those calls have been converted to instead use the
+ * 'cpuid' instruction as per the Intel spec. In addition, references to
+ * the sun4u and sparc architectures have been removed so that these
+ * will use the generic implementation.
+ *
+ * 7) Removing sha384 and sha512 code: The sha code was actually very
+ * easy to port. However, the generic sha384 and sha512 code exceeds
+ * the stack size on arm and powerpc architectures. In an effort
+ * to silence those warnings, this code was removed.
+ *
+ * 8) Changing large allocations from kmem_alloc() to vmem_alloc(): In
+ * testing the ICP with the ZFS encryption code, a few allocations were
+ * found that could potentially be very large. These caused the SPL to
+ * throw warnings and so they were changed to use vmem_alloc().
+ *
+ * 9) Makefiles: Makefiles were added that would work with the existing
+ * ZFS Makefiles.
+ */
+
+void __exit
+icp_fini(void)
+{
+	skein_mod_fini();
+	sha2_mod_fini();
+	sha1_mod_fini();
+	edonr_mod_fini();
+	aes_mod_fini();
+	kcf_sched_destroy();
+	kcf_prov_tab_destroy();
+	kcf_destroy_mech_tabs();
+	mod_hash_fini();
+}
+
+/* roughly equivalent to kcf.c: _init() */
+int __init
+icp_init(void)
+{
+	/* initialize the mod hash module */
+	mod_hash_init();
+
+	/* initialize the mechanisms tables supported out-of-the-box */
+	kcf_init_mech_tabs();
+
+	/* initialize the providers tables */
+	kcf_prov_tab_init();
+
+	/*
+	 * Initialize scheduling structures. Note that this does NOT
+	 * start any threads since it might not be safe to do so.
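[Editorial sketches for the change list above, not part of the vendored sources.] Two of the items benefit from a concrete illustration. First, item 1: in C, an empty parameter list in a declaration is not the same as (void), which is why the func() to func(void) conversion matters:

int old_style();	/* K&R style: parameters unspecified, calls unchecked */
int new_style(void);	/* C90 prototype: no arguments, calls type-checked */

/*
 * old_style(42) compiles without complaint; new_style(42) is a
 * compile-time error. Converting func() to func(void) gives the
 * compiler enough information to reject bad calls.
 */

Second, item 6 mentions replacing Illumos kernel API calls with direct use of the cpuid instruction. As an illustration only (this is not the ICP's actual detection code), a userspace probe for the AES-NI and PCLMULQDQ feature bits using GCC/Clang's <cpuid.h> helper; the bit positions are from the Intel SDM (CPUID leaf 1, ECX bits 25 and 1), and the snippet is x86-only by nature:

#include <stdio.h>
#include <cpuid.h>	/* GCC/Clang helper; x86 only */

int
main(void)
{
	unsigned int eax, ebx, ecx, edx;

	if (!__get_cpuid(1, &eax, &ebx, &ecx, &edx))
		return (1);	/* leaf 1 not supported */
	printf("AES-NI:    %s\n", (ecx & (1U << 25)) ? "yes" : "no");
	printf("PCLMULQDQ: %s\n", (ecx & (1U << 1)) ? "yes" : "no");
	return (0);
}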
+ */ + kcf_sched_init(); + + /* initialize algorithms */ + aes_mod_init(); + edonr_mod_init(); + sha1_mod_init(); + sha2_mod_init(); + skein_mod_init(); + + return (0); +} + +#if defined(_KERNEL) +module_exit(icp_fini); +module_init(icp_init); +MODULE_AUTHOR(ZFS_META_AUTHOR); +MODULE_LICENSE(ZFS_META_LICENSE); +MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); +#endif diff --git a/module/icp/include/aes/aes_impl.h b/module/icp/include/aes/aes_impl.h new file mode 100644 index 000000000000..41dccaa3848a --- /dev/null +++ b/module/icp/include/aes/aes_impl.h @@ -0,0 +1,227 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _AES_IMPL_H +#define _AES_IMPL_H + +/* + * Common definitions used by AES. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* Similar to sysmacros.h IS_P2ALIGNED, but checks two pointers: */ +#define IS_P2ALIGNED2(v, w, a) \ + ((((uintptr_t)(v) | (uintptr_t)(w)) & ((uintptr_t)(a) - 1)) == 0) + +#define AES_BLOCK_LEN 16 /* bytes */ +/* Round constant length, in number of 32-bit elements: */ +#define RC_LENGTH (5 * ((AES_BLOCK_LEN) / 4 - 2)) + +#define AES_COPY_BLOCK(src, dst) \ + (dst)[0] = (src)[0]; \ + (dst)[1] = (src)[1]; \ + (dst)[2] = (src)[2]; \ + (dst)[3] = (src)[3]; \ + (dst)[4] = (src)[4]; \ + (dst)[5] = (src)[5]; \ + (dst)[6] = (src)[6]; \ + (dst)[7] = (src)[7]; \ + (dst)[8] = (src)[8]; \ + (dst)[9] = (src)[9]; \ + (dst)[10] = (src)[10]; \ + (dst)[11] = (src)[11]; \ + (dst)[12] = (src)[12]; \ + (dst)[13] = (src)[13]; \ + (dst)[14] = (src)[14]; \ + (dst)[15] = (src)[15] + +#define AES_XOR_BLOCK(src, dst) \ + (dst)[0] ^= (src)[0]; \ + (dst)[1] ^= (src)[1]; \ + (dst)[2] ^= (src)[2]; \ + (dst)[3] ^= (src)[3]; \ + (dst)[4] ^= (src)[4]; \ + (dst)[5] ^= (src)[5]; \ + (dst)[6] ^= (src)[6]; \ + (dst)[7] ^= (src)[7]; \ + (dst)[8] ^= (src)[8]; \ + (dst)[9] ^= (src)[9]; \ + (dst)[10] ^= (src)[10]; \ + (dst)[11] ^= (src)[11]; \ + (dst)[12] ^= (src)[12]; \ + (dst)[13] ^= (src)[13]; \ + (dst)[14] ^= (src)[14]; \ + (dst)[15] ^= (src)[15] + +/* AES key size definitions */ +#define AES_MINBITS 128 +#define AES_MINBYTES ((AES_MINBITS) >> 3) +#define AES_MAXBITS 256 +#define AES_MAXBYTES ((AES_MAXBITS) >> 3) + +#define AES_MIN_KEY_BYTES ((AES_MINBITS) >> 3) +#define AES_MAX_KEY_BYTES ((AES_MAXBITS) >> 3) +#define AES_192_KEY_BYTES 24 +#define AES_IV_LEN 16 + +/* AES key schedule may be implemented with 32- or 64-bit elements: */ +#define AES_32BIT_KS 32 +#define AES_64BIT_KS 64 + +#define MAX_AES_NR 14 /* Maximum number of rounds */ +#define MAX_AES_NB 4 /* Number of columns comprising a state */ + +typedef union { +#ifdef sun4u + uint64_t ks64[((MAX_AES_NR) + 
1) * (MAX_AES_NB)]; +#endif + uint32_t ks32[((MAX_AES_NR) + 1) * (MAX_AES_NB)]; +} aes_ks_t; + +typedef struct aes_impl_ops aes_impl_ops_t; + +/* + * The absolute offset of the encr_ks (0) and the nr (504) fields are hard + * coded in aesni-gcm-x86_64, so please don't change (or adjust accordingly). + */ +typedef struct aes_key aes_key_t; +struct aes_key { + aes_ks_t encr_ks; /* encryption key schedule */ + aes_ks_t decr_ks; /* decryption key schedule */ +#ifdef __amd64 + long double align128; /* Align fields above for Intel AES-NI */ +#endif /* __amd64 */ + const aes_impl_ops_t *ops; /* ops associated with this schedule */ + int nr; /* number of rounds (10, 12, or 14) */ + int type; /* key schedule size (32 or 64 bits) */ +}; + +/* + * Core AES functions. + * ks and keysched are pointers to aes_key_t. + * They are declared void* as they are intended to be opaque types. + * Use function aes_alloc_keysched() to allocate memory for ks and keysched. + */ +extern void *aes_alloc_keysched(size_t *size, int kmflag); +extern void aes_init_keysched(const uint8_t *cipherKey, uint_t keyBits, + void *keysched); +extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct); +extern int aes_decrypt_block(const void *ks, const uint8_t *ct, uint8_t *pt); + +/* + * AES mode functions. + * The first 2 functions operate on 16-byte AES blocks. + */ +extern void aes_copy_block(uint8_t *in, uint8_t *out); +extern void aes_xor_block(uint8_t *data, uint8_t *dst); + +/* Note: ctx is a pointer to aes_ctx_t defined in modes.h */ +extern int aes_encrypt_contiguous_blocks(void *ctx, char *data, size_t length, + crypto_data_t *out); +extern int aes_decrypt_contiguous_blocks(void *ctx, char *data, size_t length, + crypto_data_t *out); + +/* + * The following definitions and declarations are only used by AES FIPS POST + */ +#ifdef _AES_IMPL + +typedef enum aes_mech_type { + AES_ECB_MECH_INFO_TYPE, /* SUN_CKM_AES_ECB */ + AES_CBC_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC */ + AES_CBC_PAD_MECH_INFO_TYPE, /* SUN_CKM_AES_CBC_PAD */ + AES_CTR_MECH_INFO_TYPE, /* SUN_CKM_AES_CTR */ + AES_CCM_MECH_INFO_TYPE, /* SUN_CKM_AES_CCM */ + AES_GCM_MECH_INFO_TYPE, /* SUN_CKM_AES_GCM */ + AES_GMAC_MECH_INFO_TYPE /* SUN_CKM_AES_GMAC */ +} aes_mech_type_t; + +#endif /* _AES_IMPL */ + +/* + * Methods used to define AES implementation + * + * @aes_gen_f Key generation + * @aes_enc_f Function encrypts one block + * @aes_dec_f Function decrypts one block + * @aes_will_work_f Function tests whether method will function + */ +typedef void (*aes_generate_f)(aes_key_t *, const uint32_t *, int); +typedef void (*aes_encrypt_f)(const uint32_t[], int, + const uint32_t[4], uint32_t[4]); +typedef void (*aes_decrypt_f)(const uint32_t[], int, + const uint32_t[4], uint32_t[4]); +typedef boolean_t (*aes_will_work_f)(void); + +#define AES_IMPL_NAME_MAX (16) + +struct aes_impl_ops { + aes_generate_f generate; + aes_encrypt_f encrypt; + aes_decrypt_f decrypt; + aes_will_work_f is_supported; + boolean_t needs_byteswap; + char name[AES_IMPL_NAME_MAX]; +}; + +extern const aes_impl_ops_t aes_generic_impl; +#if defined(__x86_64) +extern const aes_impl_ops_t aes_x86_64_impl; + +/* These functions are used to execute amd64 instructions for AMD or Intel: */ +extern int rijndael_key_setup_enc_amd64(uint32_t rk[], + const uint32_t cipherKey[], int keyBits); +extern int rijndael_key_setup_dec_amd64(uint32_t rk[], + const uint32_t cipherKey[], int keyBits); +extern void aes_encrypt_amd64(const uint32_t rk[], int Nr, + const uint32_t pt[4], uint32_t ct[4]); 
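[Editorial sketch, not part of the vendored sources.] The core AES entry points declared above are enough to encrypt a single block. A sketch of their intended use, assuming the program links against the ICP; the prototypes are restated so the snippet is self-contained at compile time, with uint_t spelled out as unsigned int, and the kmflag of 0 standing in for the kernel allocation flag:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

/* Prototypes restated from aes_impl.h above. */
extern void *aes_alloc_keysched(size_t *size, int kmflag);
extern void aes_init_keysched(const uint8_t *cipherKey, unsigned int keyBits,
    void *keysched);
extern int aes_encrypt_block(const void *ks, const uint8_t *pt, uint8_t *ct);

int
main(void)
{
	const uint8_t key[16] = { 0 };	/* AES-128, all-zero key */
	const uint8_t pt[16] = { 0 };
	uint8_t ct[16];
	size_t size;
	void *ks = aes_alloc_keysched(&size, 0 /* kmflag */);

	if (ks == NULL)
		return (1);
	aes_init_keysched(key, 128, ks);
	aes_encrypt_block(ks, pt, ct);
	for (int i = 0; i < 16; i++)
		printf("%02x", ct[i]);
	printf("\n");
	return (0);	/* ks is leaked here for brevity; real callers free it */
}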
+extern void aes_decrypt_amd64(const uint32_t rk[], int Nr, + const uint32_t ct[4], uint32_t pt[4]); +#endif +#if defined(__x86_64) && defined(HAVE_AES) +extern const aes_impl_ops_t aes_aesni_impl; +#endif + +/* + * Initializes fastest implementation + */ +void aes_impl_init(void); + +/* + * Returns optimal allowed AES implementation + */ +const struct aes_impl_ops *aes_impl_get_ops(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _AES_IMPL_H */ diff --git a/module/icp/include/modes/gcm_impl.h b/module/icp/include/modes/gcm_impl.h new file mode 100644 index 000000000000..28c8f63a7d46 --- /dev/null +++ b/module/icp/include/modes/gcm_impl.h @@ -0,0 +1,75 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#ifndef _GCM_IMPL_H +#define _GCM_IMPL_H + +/* + * GCM function dispatcher. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +/* + * Methods used to define GCM implementation + * + * @gcm_mul_f Perform carry-less multiplication + * @gcm_will_work_f Function tests whether implementation will function + */ +typedef void (*gcm_mul_f)(uint64_t *, uint64_t *, uint64_t *); +typedef boolean_t (*gcm_will_work_f)(void); + +#define GCM_IMPL_NAME_MAX (16) + +typedef struct gcm_impl_ops { + gcm_mul_f mul; + gcm_will_work_f is_supported; + char name[GCM_IMPL_NAME_MAX]; +} gcm_impl_ops_t; + +extern const gcm_impl_ops_t gcm_generic_impl; +#if defined(__x86_64) && defined(HAVE_PCLMULQDQ) +extern const gcm_impl_ops_t gcm_pclmulqdq_impl; +#endif + +/* + * Initializes fastest implementation + */ +void gcm_impl_init(void); + +/* + * Returns optimal allowed GCM implementation + */ +const struct gcm_impl_ops *gcm_impl_get_ops(void); + +#ifdef __cplusplus +} +#endif + +#endif /* _GCM_IMPL_H */ diff --git a/module/icp/include/modes/modes.h b/module/icp/include/modes/modes.h new file mode 100644 index 000000000000..57a211ccf1bf --- /dev/null +++ b/module/icp/include/modes/modes.h @@ -0,0 +1,411 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
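[Editorial sketch, not part of the vendored sources.] The per-implementation is_supported hooks and the *_impl_get_ops() accessors suggest a fastest-first dispatch: try the accelerated implementation, fall back to the generic one. The actual selection logic lives in the impl .c files and is more involved (per-CPU state, user override); this is only a minimal sketch of the shape, with made-up candidate functions:

#include <stdio.h>
#include <stddef.h>

typedef int boolean_t;
#define B_TRUE 1

/* Simplified mirror of the gcm_impl_ops_t/aes_impl_ops_t shape. */
typedef struct impl_ops {
	boolean_t (*is_supported)(void);
	const char *name;
} impl_ops_t;

static boolean_t generic_ok(void) { return (B_TRUE); }
static boolean_t pclmulqdq_ok(void) { return (0); /* pretend unsupported */ }

static const impl_ops_t impl_pclmulqdq = { pclmulqdq_ok, "pclmulqdq" };
static const impl_ops_t impl_generic = { generic_ok, "generic" };

/* Fastest first; generic is last because it always works. */
static const impl_ops_t *candidates[] = { &impl_pclmulqdq, &impl_generic };

static const impl_ops_t *
impl_get_ops(void)
{
	for (size_t i = 0; i < sizeof (candidates) / sizeof (candidates[0]); i++)
		if (candidates[i]->is_supported())
			return (candidates[i]);
	return (NULL);	/* unreachable: generic is always supported */
}

int
main(void)
{
	printf("selected: %s\n", impl_get_ops()->name);
	return (0);
}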
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _COMMON_CRYPTO_MODES_H +#define _COMMON_CRYPTO_MODES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include + +/* + * Does the build chain support all instructions needed for the GCM assembler + * routines. AVX support should imply AES-NI and PCLMULQDQ, but make sure + * anyhow. + */ +#if defined(__x86_64__) && defined(HAVE_AVX) && \ + defined(HAVE_AES) && defined(HAVE_PCLMULQDQ) +#define CAN_USE_GCM_ASM +extern boolean_t gcm_avx_can_use_movbe; +#endif + +#define ECB_MODE 0x00000002 +#define CBC_MODE 0x00000004 +#define CTR_MODE 0x00000008 +#define CCM_MODE 0x00000010 +#define GCM_MODE 0x00000020 +#define GMAC_MODE 0x00000040 + +/* + * cc_keysched: Pointer to key schedule. + * + * cc_keysched_len: Length of the key schedule. + * + * cc_remainder: This is for residual data, i.e. data that can't + * be processed because there are too few bytes. + * Must wait until more data arrives. + * + * cc_remainder_len: Number of bytes in cc_remainder. + * + * cc_iv: Scratch buffer that sometimes contains the IV. + * + * cc_lastp: Pointer to previous block of ciphertext. + * + * cc_copy_to: Pointer to where encrypted residual data needs + * to be copied. + * + * cc_flags: PROVIDER_OWNS_KEY_SCHEDULE + * When a context is freed, it is necessary + * to know whether the key schedule was allocated + * by the caller, or internally, e.g. an init routine. + * If allocated by the latter, then it needs to be freed. + * + * ECB_MODE, CBC_MODE, CTR_MODE, or CCM_MODE + */ +struct common_ctx { + void *cc_keysched; + size_t cc_keysched_len; + uint64_t cc_iv[2]; + uint64_t cc_remainder[2]; + size_t cc_remainder_len; + uint8_t *cc_lastp; + uint8_t *cc_copy_to; + uint32_t cc_flags; +}; + +typedef struct common_ctx common_ctx_t; + +typedef struct ecb_ctx { + struct common_ctx ecb_common; + uint64_t ecb_lastblock[2]; +} ecb_ctx_t; + +#define ecb_keysched ecb_common.cc_keysched +#define ecb_keysched_len ecb_common.cc_keysched_len +#define ecb_iv ecb_common.cc_iv +#define ecb_remainder ecb_common.cc_remainder +#define ecb_remainder_len ecb_common.cc_remainder_len +#define ecb_lastp ecb_common.cc_lastp +#define ecb_copy_to ecb_common.cc_copy_to +#define ecb_flags ecb_common.cc_flags + +typedef struct cbc_ctx { + struct common_ctx cbc_common; + uint64_t cbc_lastblock[2]; +} cbc_ctx_t; + +#define cbc_keysched cbc_common.cc_keysched +#define cbc_keysched_len cbc_common.cc_keysched_len +#define cbc_iv cbc_common.cc_iv +#define cbc_remainder cbc_common.cc_remainder +#define cbc_remainder_len cbc_common.cc_remainder_len +#define cbc_lastp cbc_common.cc_lastp +#define cbc_copy_to cbc_common.cc_copy_to +#define cbc_flags cbc_common.cc_flags + +/* + * ctr_lower_mask Bit-mask for lower 8 bytes of counter block. + * ctr_upper_mask Bit-mask for upper 8 bytes of counter block. + */ +typedef struct ctr_ctx { + struct common_ctx ctr_common; + uint64_t ctr_lower_mask; + uint64_t ctr_upper_mask; + uint32_t ctr_tmp[4]; +} ctr_ctx_t; + +/* + * ctr_cb Counter block. 
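[Editorial sketch, not part of the vendored sources.] The cc_remainder/cc_remainder_len fields documented above solve the classic streaming problem: callers hand in arbitrary lengths, but the cipher only consumes whole 16-byte blocks, so a partial tail must be buffered across calls. A self-contained sketch of that accumulation pattern (simplified; the real *_contiguous_blocks routines also track cc_lastp and cc_copy_to):

#include <stdio.h>
#include <string.h>

#define BLOCK_LEN 16

typedef struct stream_ctx {
	unsigned char	remainder[BLOCK_LEN];	/* cf. cc_remainder */
	size_t		remainder_len;		/* cf. cc_remainder_len */
} stream_ctx_t;

/* Consume input, emitting only whole blocks; buffer the tail. */
static void
process(stream_ctx_t *cx, const unsigned char *data, size_t len,
    void (*do_block)(const unsigned char *))
{
	/* Top up a partial block left over from the previous call. */
	if (cx->remainder_len > 0) {
		size_t need = BLOCK_LEN - cx->remainder_len;

		if (need > len)
			need = len;
		memcpy(cx->remainder + cx->remainder_len, data, need);
		cx->remainder_len += need;
		data += need;
		len -= need;
		if (cx->remainder_len == BLOCK_LEN) {
			do_block(cx->remainder);
			cx->remainder_len = 0;
		} else {
			return;	/* input exhausted before a full block */
		}
	}
	/* Whole blocks straight from the input. */
	while (len >= BLOCK_LEN) {
		do_block(data);
		data += BLOCK_LEN;
		len -= BLOCK_LEN;
	}
	/* Buffer whatever is left for next time. */
	if (len > 0) {
		memcpy(cx->remainder, data, len);
		cx->remainder_len = len;
	}
}

static void
print_block(const unsigned char *b)
{
	printf("block: %.16s\n", b);
}

int
main(void)
{
	stream_ctx_t cx = { .remainder_len = 0 };

	process(&cx, (const unsigned char *)"0123456789", 10, print_block);
	process(&cx, (const unsigned char *)"abcdefghij", 10, print_block);
	printf("buffered tail: %zu bytes\n", cx.remainder_len);	/* 4 */
	return (0);
}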
+ */ +#define ctr_keysched ctr_common.cc_keysched +#define ctr_keysched_len ctr_common.cc_keysched_len +#define ctr_cb ctr_common.cc_iv +#define ctr_remainder ctr_common.cc_remainder +#define ctr_remainder_len ctr_common.cc_remainder_len +#define ctr_lastp ctr_common.cc_lastp +#define ctr_copy_to ctr_common.cc_copy_to +#define ctr_flags ctr_common.cc_flags + +/* + * + * ccm_mac_len: Stores length of the MAC in CCM mode. + * ccm_mac_buf: Stores the intermediate value for MAC in CCM encrypt. + * In CCM decrypt, stores the input MAC value. + * ccm_data_len: Length of the plaintext for CCM mode encrypt, or + * length of the ciphertext for CCM mode decrypt. + * ccm_processed_data_len: + * Length of processed plaintext in CCM mode encrypt, + * or length of processed ciphertext for CCM mode decrypt. + * ccm_processed_mac_len: + * Length of MAC data accumulated in CCM mode decrypt. + * + * ccm_pt_buf: Only used in CCM mode decrypt. It stores the + * decrypted plaintext to be returned when + * MAC verification succeeds in decrypt_final. + * Memory for this should be allocated in the AES module. + * + */ +typedef struct ccm_ctx { + struct common_ctx ccm_common; + uint32_t ccm_tmp[4]; + size_t ccm_mac_len; + uint64_t ccm_mac_buf[2]; + size_t ccm_data_len; + size_t ccm_processed_data_len; + size_t ccm_processed_mac_len; + uint8_t *ccm_pt_buf; + uint64_t ccm_mac_input_buf[2]; + uint64_t ccm_counter_mask; +} ccm_ctx_t; + +#define ccm_keysched ccm_common.cc_keysched +#define ccm_keysched_len ccm_common.cc_keysched_len +#define ccm_cb ccm_common.cc_iv +#define ccm_remainder ccm_common.cc_remainder +#define ccm_remainder_len ccm_common.cc_remainder_len +#define ccm_lastp ccm_common.cc_lastp +#define ccm_copy_to ccm_common.cc_copy_to +#define ccm_flags ccm_common.cc_flags + +/* + * gcm_tag_len: Length of authentication tag. + * + * gcm_ghash: Stores output from the GHASH function. + * + * gcm_processed_data_len: + * Length of processed plaintext (encrypt) or + * length of processed ciphertext (decrypt). + * + * gcm_pt_buf: Stores the decrypted plaintext returned by + * decrypt_final when the computed authentication + * tag matches the user supplied tag. + * + * gcm_pt_buf_len: Length of the plaintext buffer. + * + * gcm_H: Subkey. + * + * gcm_Htable: Pre-computed and pre-shifted H, H^2, ... H^6 for the + * Karatsuba Algorithm in host byte order. + * + * gcm_J0: Pre-counter block generated from the IV. + * + * gcm_len_a_len_c: 64-bit representations of the bit lengths of + * AAD and ciphertext. + * + * gcm_kmflag: Current value of kmflag. Used for allocating + * the plaintext buffer during decryption and a + * gcm_avx_chunk_size'd buffer for avx enabled encryption. + */ +typedef struct gcm_ctx { + struct common_ctx gcm_common; + size_t gcm_tag_len; + size_t gcm_processed_data_len; + size_t gcm_pt_buf_len; + uint32_t gcm_tmp[4]; + /* + * The relative positions of gcm_ghash, gcm_H and pre-computed + * gcm_Htable are hard coded in aesni-gcm-x86_64.S and ghash-x86_64.S, + * so please don't change (or adjust accordingly). 
+ */ + uint64_t gcm_ghash[2]; + uint64_t gcm_H[2]; +#ifdef CAN_USE_GCM_ASM + uint64_t gcm_Htable[12][2]; +#endif + uint64_t gcm_J0[2]; + uint64_t gcm_len_a_len_c[2]; + uint8_t *gcm_pt_buf; + int gcm_kmflag; +#ifdef CAN_USE_GCM_ASM + boolean_t gcm_use_avx; +#endif +} gcm_ctx_t; + +#define gcm_keysched gcm_common.cc_keysched +#define gcm_keysched_len gcm_common.cc_keysched_len +#define gcm_cb gcm_common.cc_iv +#define gcm_remainder gcm_common.cc_remainder +#define gcm_remainder_len gcm_common.cc_remainder_len +#define gcm_lastp gcm_common.cc_lastp +#define gcm_copy_to gcm_common.cc_copy_to +#define gcm_flags gcm_common.cc_flags + +#define AES_GMAC_IV_LEN 12 +#define AES_GMAC_TAG_BITS 128 + +typedef struct aes_ctx { + union { + ecb_ctx_t acu_ecb; + cbc_ctx_t acu_cbc; + ctr_ctx_t acu_ctr; + ccm_ctx_t acu_ccm; + gcm_ctx_t acu_gcm; + } acu; +} aes_ctx_t; + +#define ac_flags acu.acu_ecb.ecb_common.cc_flags +#define ac_remainder_len acu.acu_ecb.ecb_common.cc_remainder_len +#define ac_keysched acu.acu_ecb.ecb_common.cc_keysched +#define ac_keysched_len acu.acu_ecb.ecb_common.cc_keysched_len +#define ac_iv acu.acu_ecb.ecb_common.cc_iv +#define ac_lastp acu.acu_ecb.ecb_common.cc_lastp +#define ac_pt_buf acu.acu_ccm.ccm_pt_buf +#define ac_mac_len acu.acu_ccm.ccm_mac_len +#define ac_data_len acu.acu_ccm.ccm_data_len +#define ac_processed_mac_len acu.acu_ccm.ccm_processed_mac_len +#define ac_processed_data_len acu.acu_ccm.ccm_processed_data_len +#define ac_tag_len acu.acu_gcm.gcm_tag_len + +typedef struct blowfish_ctx { + union { + ecb_ctx_t bcu_ecb; + cbc_ctx_t bcu_cbc; + } bcu; +} blowfish_ctx_t; + +#define bc_flags bcu.bcu_ecb.ecb_common.cc_flags +#define bc_remainder_len bcu.bcu_ecb.ecb_common.cc_remainder_len +#define bc_keysched bcu.bcu_ecb.ecb_common.cc_keysched +#define bc_keysched_len bcu.bcu_ecb.ecb_common.cc_keysched_len +#define bc_iv bcu.bcu_ecb.ecb_common.cc_iv +#define bc_lastp bcu.bcu_ecb.ecb_common.cc_lastp + +typedef struct des_ctx { + union { + ecb_ctx_t dcu_ecb; + cbc_ctx_t dcu_cbc; + } dcu; +} des_ctx_t; + +#define dc_flags dcu.dcu_ecb.ecb_common.cc_flags +#define dc_remainder_len dcu.dcu_ecb.ecb_common.cc_remainder_len +#define dc_keysched dcu.dcu_ecb.ecb_common.cc_keysched +#define dc_keysched_len dcu.dcu_ecb.ecb_common.cc_keysched_len +#define dc_iv dcu.dcu_ecb.ecb_common.cc_iv +#define dc_lastp dcu.dcu_ecb.ecb_common.cc_lastp + +extern int ecb_cipher_contiguous_blocks(ecb_ctx_t *, char *, size_t, + crypto_data_t *, size_t, int (*cipher)(const void *, const uint8_t *, + uint8_t *)); + +extern int cbc_encrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t, + crypto_data_t *, size_t, + int (*encrypt)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int cbc_decrypt_contiguous_blocks(cbc_ctx_t *, char *, size_t, + crypto_data_t *, size_t, + int (*decrypt)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int ctr_mode_contiguous_blocks(ctr_ctx_t *, char *, size_t, + crypto_data_t *, size_t, + int (*cipher)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int ccm_mode_encrypt_contiguous_blocks(ccm_ctx_t *, char *, size_t, + crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int ccm_mode_decrypt_contiguous_blocks(ccm_ctx_t *, 
char *, size_t, + crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int gcm_mode_encrypt_contiguous_blocks(gcm_ctx_t *, char *, size_t, + crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int gcm_mode_decrypt_contiguous_blocks(gcm_ctx_t *, char *, size_t, + crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +int ccm_encrypt_final(ccm_ctx_t *, crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +int gcm_encrypt_final(gcm_ctx_t *, crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int ccm_decrypt_final(ccm_ctx_t *, crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int gcm_decrypt_final(gcm_ctx_t *, crypto_data_t *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int ctr_mode_final(ctr_ctx_t *, crypto_data_t *, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)); + +extern int cbc_init_ctx(cbc_ctx_t *, char *, size_t, size_t, + void (*copy_block)(uint8_t *, uint64_t *)); + +extern int ctr_init_ctx(ctr_ctx_t *, ulong_t, uint8_t *, + void (*copy_block)(uint8_t *, uint8_t *)); + +extern int ccm_init_ctx(ccm_ctx_t *, char *, int, boolean_t, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int gcm_init_ctx(gcm_ctx_t *, char *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern int gmac_init_ctx(gcm_ctx_t *, char *, size_t, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *), + void (*copy_block)(uint8_t *, uint8_t *), + void (*xor_block)(uint8_t *, uint8_t *)); + +extern void calculate_ccm_mac(ccm_ctx_t *, uint8_t *, + int (*encrypt_block)(const void *, const uint8_t *, uint8_t *)); + +extern void gcm_mul(uint64_t *, uint64_t *, uint64_t *); + +extern void crypto_init_ptrs(crypto_data_t *, void **, offset_t *); +extern void crypto_get_ptrs(crypto_data_t *, void **, offset_t *, + uint8_t **, size_t *, uint8_t **, size_t); + +extern void *ecb_alloc_ctx(int); +extern void *cbc_alloc_ctx(int); +extern void *ctr_alloc_ctx(int); +extern void *ccm_alloc_ctx(int); +extern void *gcm_alloc_ctx(int); +extern void *gmac_alloc_ctx(int); +extern void crypto_free_mode_ctx(void *); +extern void gcm_set_kmflag(gcm_ctx_t *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _COMMON_CRYPTO_MODES_H */ diff --git a/module/icp/include/sha1/sha1.h b/module/icp/include/sha1/sha1.h new file mode 100644 index 000000000000..251b64fcaeee --- /dev/null +++ b/module/icp/include/sha1/sha1.h @@ -0,0 +1,61 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
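[Editorial sketch, not part of the vendored sources.] The long function-pointer parameter lists above are the mode/cipher split: the mode drivers (ecb, cbc, ctr, ccm, gcm) stay cipher-agnostic and receive the one-block primitives from the caller. A sketch of how an AES consumer would plug aes_encrypt_block and friends into the CBC driver, assuming the ICP headers (modes.h and aes_impl.h) are in scope; the signatures match the declarations above:

/* Wire AES's one-block primitives into the cipher-agnostic CBC driver. */
static int
aes_cbc_encrypt(cbc_ctx_t *ctx, char *data, size_t len, crypto_data_t *out)
{
	return (cbc_encrypt_contiguous_blocks(ctx, data, len, out,
	    AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block,
	    aes_xor_block));
}

The same shape works for the other modes: only the primitives passed in change, so one CBC (or CTR, CCM, GCM) implementation serves every block cipher.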
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SHA1_H +#define _SYS_SHA1_H + +#include /* for uint_* */ + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * NOTE: n2rng (Niagara2 RNG driver) accesses the state field of + * SHA1_CTX directly. NEVER change this structure without verifying + * compatibility with n2rng. The important thing is that the state + * must be in a field declared as uint32_t state[5]. + */ +/* SHA-1 context. */ +typedef struct { + uint32_t state[5]; /* state (ABCDE) */ + uint32_t count[2]; /* number of bits, modulo 2^64 (msb first) */ + union { + uint8_t buf8[64]; /* undigested input */ + uint32_t buf32[16]; /* realigned input */ + } buf_un; +} SHA1_CTX; + +#define SHA1_DIGEST_LENGTH 20 + +void SHA1Init(SHA1_CTX *); +void SHA1Update(SHA1_CTX *, const void *, size_t); +void SHA1Final(void *, SHA1_CTX *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA1_H */ diff --git a/module/icp/include/sha1/sha1_consts.h b/module/icp/include/sha1/sha1_consts.h new file mode 100644 index 000000000000..848d25ef050f --- /dev/null +++ b/module/icp/include/sha1/sha1_consts.h @@ -0,0 +1,65 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1998, by Sun Microsystems, Inc. + * All rights reserved. + */ + +#ifndef _SYS_SHA1_CONSTS_H +#define _SYS_SHA1_CONSTS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * as explained in sha1.c, loading 32-bit constants on a sparc is expensive + * since it involves both a `sethi' and an `or'. thus, we instead use `ld' + * to load the constants from an array called `sha1_consts'. however, on + * intel (and perhaps other processors), it is cheaper to load the constant + * directly. thus, the c code in SHA1Transform() uses the macro SHA1_CONST() + * which either expands to a constant or an array reference, depending on + * the architecture the code is being compiled for. 
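[Editorial sketch, not part of the vendored sources.] The SHA1Init/SHA1Update/SHA1Final triple declared above follows the usual init/update/final hashing discipline. A small usage sketch, assuming linkage against the ICP's SHA-1 implementation; the expected output for "abc" is the FIPS 180-1 test vector:

#include <stdio.h>
#include <string.h>
/* assumes the ICP's sha1.h is included and its SHA-1 code is linked in */

int
main(void)
{
	SHA1_CTX ctx;
	unsigned char digest[SHA1_DIGEST_LENGTH];
	const char *msg = "abc";

	SHA1Init(&ctx);
	SHA1Update(&ctx, msg, strlen(msg));
	SHA1Final(digest, &ctx);

	for (int i = 0; i < SHA1_DIGEST_LENGTH; i++)
		printf("%02x", digest[i]);
	printf("\n");	/* "abc" -> a9993e364706816aba3e25717850c26c9cd0d89d */
	return (0);
}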
+ */ + +#include /* uint32_t */ + +extern const uint32_t sha1_consts[]; + +#if defined(__sparc) +#define SHA1_CONST(x) (sha1_consts[x]) +#else +#define SHA1_CONST(x) (SHA1_CONST_ ## x) +#endif + +/* constants, as provided in FIPS 180-1 */ + +#define SHA1_CONST_0 0x5a827999U +#define SHA1_CONST_1 0x6ed9eba1U +#define SHA1_CONST_2 0x8f1bbcdcU +#define SHA1_CONST_3 0xca62c1d6U + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA1_CONSTS_H */ diff --git a/module/icp/include/sha1/sha1_impl.h b/module/icp/include/sha1/sha1_impl.h new file mode 100644 index 000000000000..1c1f8728f9b5 --- /dev/null +++ b/module/icp/include/sha1/sha1_impl.h @@ -0,0 +1,73 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SHA1_IMPL_H +#define _SHA1_IMPL_H + + +#ifdef __cplusplus +extern "C" { +#endif + +#define SHA1_HASH_SIZE 20 /* SHA_1 digest length in bytes */ +#define SHA1_DIGEST_LENGTH 20 /* SHA1 digest length in bytes */ +#define SHA1_HMAC_BLOCK_SIZE 64 /* SHA1-HMAC block size */ +#define SHA1_HMAC_MIN_KEY_LEN 1 /* SHA1-HMAC min key length in bytes */ +#define SHA1_HMAC_MAX_KEY_LEN INT_MAX /* SHA1-HMAC max key length in bytes */ +#define SHA1_HMAC_INTS_PER_BLOCK (SHA1_HMAC_BLOCK_SIZE/sizeof (uint32_t)) + +/* + * CSPI information (entry points, provider info, etc.) + */ +typedef enum sha1_mech_type { + SHA1_MECH_INFO_TYPE, /* SUN_CKM_SHA1 */ + SHA1_HMAC_MECH_INFO_TYPE, /* SUN_CKM_SHA1_HMAC */ + SHA1_HMAC_GEN_MECH_INFO_TYPE /* SUN_CKM_SHA1_HMAC_GENERAL */ +} sha1_mech_type_t; + +/* + * Context for SHA1 mechanism. + */ +typedef struct sha1_ctx { + sha1_mech_type_t sc_mech_type; /* type of context */ + SHA1_CTX sc_sha1_ctx; /* SHA1 context */ +} sha1_ctx_t; + +/* + * Context for SHA1-HMAC and SHA1-HMAC-GENERAL mechanisms. + */ +typedef struct sha1_hmac_ctx { + sha1_mech_type_t hc_mech_type; /* type of context */ + uint32_t hc_digest_len; /* digest len in bytes */ + SHA1_CTX hc_icontext; /* inner SHA1 context */ + SHA1_CTX hc_ocontext; /* outer SHA1 context */ +} sha1_hmac_ctx_t; + + +#ifdef __cplusplus +} +#endif + +#endif /* _SHA1_IMPL_H */ diff --git a/module/icp/include/sha2/sha2_consts.h b/module/icp/include/sha2/sha2_consts.h new file mode 100644 index 000000000000..3a6645508fe9 --- /dev/null +++ b/module/icp/include/sha2/sha2_consts.h @@ -0,0 +1,219 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. 
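[Editorial sketch, not part of the vendored sources.] The inner/outer context pair in sha1_hmac_ctx_t above is the standard RFC 2104 HMAC construction: one hash keyed with the ipad, a second keyed with the opad. A sketch of that construction built on the declared SHA-1 API; this is the textbook algorithm, not the ICP's exact code path, and it assumes sha1.h as above:

#include <string.h>

#define HMAC_BLOCK 64	/* cf. SHA1_HMAC_BLOCK_SIZE */

static void
hmac_sha1(const unsigned char *key, size_t keylen,
    const unsigned char *msg, size_t msglen,
    unsigned char digest[SHA1_DIGEST_LENGTH])
{
	SHA1_CTX inner, outer;	/* cf. hc_icontext / hc_ocontext */
	unsigned char pad[HMAC_BLOCK];
	unsigned char kbuf[HMAC_BLOCK] = { 0 };

	if (keylen > HMAC_BLOCK) {	/* long keys are hashed first */
		SHA1_CTX kctx;
		SHA1Init(&kctx);
		SHA1Update(&kctx, key, keylen);
		SHA1Final(kbuf, &kctx);
	} else {
		memcpy(kbuf, key, keylen);
	}

	for (int i = 0; i < HMAC_BLOCK; i++)
		pad[i] = kbuf[i] ^ 0x36;	/* ipad */
	SHA1Init(&inner);
	SHA1Update(&inner, pad, HMAC_BLOCK);
	SHA1Update(&inner, msg, msglen);
	SHA1Final(digest, &inner);

	for (int i = 0; i < HMAC_BLOCK; i++)
		pad[i] = kbuf[i] ^ 0x5c;	/* opad */
	SHA1Init(&outer);
	SHA1Update(&outer, pad, HMAC_BLOCK);
	SHA1Update(&outer, digest, SHA1_DIGEST_LENGTH);
	SHA1Final(digest, &outer);
}

Keeping the two contexts in the ctx struct lets an implementation pre-key them once per key and clone them per message, which is why the header stores both rather than recomputing the pads on every call.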
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_SHA2_CONSTS_H +#define _SYS_SHA2_CONSTS_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Loading 32-bit constants on a sparc is expensive since it involves both + * a `sethi' and an `or'. thus, we instead use `ld' to load the constants + * from an array called `sha2_consts'. however, on intel (and perhaps other + * processors), it is cheaper to load the constant directly. thus, the c + * code in SHA transform functions uses the macro SHA2_CONST() which either + * expands to a constant or an array reference, depending on + * the architecture the code is being compiled for. + * + * SHA512 constants are used for SHA384 + */ + +#include /* uint32_t */ + +extern const uint32_t sha256_consts[]; +extern const uint64_t sha512_consts[]; + +#if defined(__sparc) +#define SHA256_CONST(x) (sha256_consts[x]) +#define SHA512_CONST(x) (sha512_consts[x]) +#else +#define SHA256_CONST(x) (SHA256_CONST_ ## x) +#define SHA512_CONST(x) (SHA512_CONST_ ## x) +#endif + +/* constants, as provided in FIPS 180-2 */ + +#define SHA256_CONST_0 0x428a2f98U +#define SHA256_CONST_1 0x71374491U +#define SHA256_CONST_2 0xb5c0fbcfU +#define SHA256_CONST_3 0xe9b5dba5U +#define SHA256_CONST_4 0x3956c25bU +#define SHA256_CONST_5 0x59f111f1U +#define SHA256_CONST_6 0x923f82a4U +#define SHA256_CONST_7 0xab1c5ed5U + +#define SHA256_CONST_8 0xd807aa98U +#define SHA256_CONST_9 0x12835b01U +#define SHA256_CONST_10 0x243185beU +#define SHA256_CONST_11 0x550c7dc3U +#define SHA256_CONST_12 0x72be5d74U +#define SHA256_CONST_13 0x80deb1feU +#define SHA256_CONST_14 0x9bdc06a7U +#define SHA256_CONST_15 0xc19bf174U + +#define SHA256_CONST_16 0xe49b69c1U +#define SHA256_CONST_17 0xefbe4786U +#define SHA256_CONST_18 0x0fc19dc6U +#define SHA256_CONST_19 0x240ca1ccU +#define SHA256_CONST_20 0x2de92c6fU +#define SHA256_CONST_21 0x4a7484aaU +#define SHA256_CONST_22 0x5cb0a9dcU +#define SHA256_CONST_23 0x76f988daU + +#define SHA256_CONST_24 0x983e5152U +#define SHA256_CONST_25 0xa831c66dU +#define SHA256_CONST_26 0xb00327c8U +#define SHA256_CONST_27 0xbf597fc7U +#define SHA256_CONST_28 0xc6e00bf3U +#define SHA256_CONST_29 0xd5a79147U +#define SHA256_CONST_30 0x06ca6351U +#define SHA256_CONST_31 0x14292967U + +#define SHA256_CONST_32 0x27b70a85U +#define SHA256_CONST_33 0x2e1b2138U +#define SHA256_CONST_34 0x4d2c6dfcU +#define SHA256_CONST_35 0x53380d13U +#define SHA256_CONST_36 0x650a7354U +#define SHA256_CONST_37 0x766a0abbU +#define SHA256_CONST_38 0x81c2c92eU +#define SHA256_CONST_39 0x92722c85U + +#define SHA256_CONST_40 0xa2bfe8a1U +#define SHA256_CONST_41 0xa81a664bU +#define SHA256_CONST_42 0xc24b8b70U +#define SHA256_CONST_43 0xc76c51a3U +#define SHA256_CONST_44 0xd192e819U +#define SHA256_CONST_45 0xd6990624U +#define SHA256_CONST_46 0xf40e3585U +#define SHA256_CONST_47 0x106aa070U + +#define SHA256_CONST_48 
0x19a4c116U +#define SHA256_CONST_49 0x1e376c08U +#define SHA256_CONST_50 0x2748774cU +#define SHA256_CONST_51 0x34b0bcb5U +#define SHA256_CONST_52 0x391c0cb3U +#define SHA256_CONST_53 0x4ed8aa4aU +#define SHA256_CONST_54 0x5b9cca4fU +#define SHA256_CONST_55 0x682e6ff3U + +#define SHA256_CONST_56 0x748f82eeU +#define SHA256_CONST_57 0x78a5636fU +#define SHA256_CONST_58 0x84c87814U +#define SHA256_CONST_59 0x8cc70208U +#define SHA256_CONST_60 0x90befffaU +#define SHA256_CONST_61 0xa4506cebU +#define SHA256_CONST_62 0xbef9a3f7U +#define SHA256_CONST_63 0xc67178f2U + +#define SHA512_CONST_0 0x428a2f98d728ae22ULL +#define SHA512_CONST_1 0x7137449123ef65cdULL +#define SHA512_CONST_2 0xb5c0fbcfec4d3b2fULL +#define SHA512_CONST_3 0xe9b5dba58189dbbcULL +#define SHA512_CONST_4 0x3956c25bf348b538ULL +#define SHA512_CONST_5 0x59f111f1b605d019ULL +#define SHA512_CONST_6 0x923f82a4af194f9bULL +#define SHA512_CONST_7 0xab1c5ed5da6d8118ULL +#define SHA512_CONST_8 0xd807aa98a3030242ULL +#define SHA512_CONST_9 0x12835b0145706fbeULL +#define SHA512_CONST_10 0x243185be4ee4b28cULL +#define SHA512_CONST_11 0x550c7dc3d5ffb4e2ULL +#define SHA512_CONST_12 0x72be5d74f27b896fULL +#define SHA512_CONST_13 0x80deb1fe3b1696b1ULL +#define SHA512_CONST_14 0x9bdc06a725c71235ULL +#define SHA512_CONST_15 0xc19bf174cf692694ULL +#define SHA512_CONST_16 0xe49b69c19ef14ad2ULL +#define SHA512_CONST_17 0xefbe4786384f25e3ULL +#define SHA512_CONST_18 0x0fc19dc68b8cd5b5ULL +#define SHA512_CONST_19 0x240ca1cc77ac9c65ULL +#define SHA512_CONST_20 0x2de92c6f592b0275ULL +#define SHA512_CONST_21 0x4a7484aa6ea6e483ULL +#define SHA512_CONST_22 0x5cb0a9dcbd41fbd4ULL +#define SHA512_CONST_23 0x76f988da831153b5ULL +#define SHA512_CONST_24 0x983e5152ee66dfabULL +#define SHA512_CONST_25 0xa831c66d2db43210ULL +#define SHA512_CONST_26 0xb00327c898fb213fULL +#define SHA512_CONST_27 0xbf597fc7beef0ee4ULL +#define SHA512_CONST_28 0xc6e00bf33da88fc2ULL +#define SHA512_CONST_29 0xd5a79147930aa725ULL +#define SHA512_CONST_30 0x06ca6351e003826fULL +#define SHA512_CONST_31 0x142929670a0e6e70ULL +#define SHA512_CONST_32 0x27b70a8546d22ffcULL +#define SHA512_CONST_33 0x2e1b21385c26c926ULL +#define SHA512_CONST_34 0x4d2c6dfc5ac42aedULL +#define SHA512_CONST_35 0x53380d139d95b3dfULL +#define SHA512_CONST_36 0x650a73548baf63deULL +#define SHA512_CONST_37 0x766a0abb3c77b2a8ULL +#define SHA512_CONST_38 0x81c2c92e47edaee6ULL +#define SHA512_CONST_39 0x92722c851482353bULL +#define SHA512_CONST_40 0xa2bfe8a14cf10364ULL +#define SHA512_CONST_41 0xa81a664bbc423001ULL +#define SHA512_CONST_42 0xc24b8b70d0f89791ULL +#define SHA512_CONST_43 0xc76c51a30654be30ULL +#define SHA512_CONST_44 0xd192e819d6ef5218ULL +#define SHA512_CONST_45 0xd69906245565a910ULL +#define SHA512_CONST_46 0xf40e35855771202aULL +#define SHA512_CONST_47 0x106aa07032bbd1b8ULL +#define SHA512_CONST_48 0x19a4c116b8d2d0c8ULL +#define SHA512_CONST_49 0x1e376c085141ab53ULL +#define SHA512_CONST_50 0x2748774cdf8eeb99ULL +#define SHA512_CONST_51 0x34b0bcb5e19b48a8ULL +#define SHA512_CONST_52 0x391c0cb3c5c95a63ULL +#define SHA512_CONST_53 0x4ed8aa4ae3418acbULL +#define SHA512_CONST_54 0x5b9cca4f7763e373ULL +#define SHA512_CONST_55 0x682e6ff3d6b2b8a3ULL +#define SHA512_CONST_56 0x748f82ee5defb2fcULL +#define SHA512_CONST_57 0x78a5636f43172f60ULL +#define SHA512_CONST_58 0x84c87814a1f0ab72ULL +#define SHA512_CONST_59 0x8cc702081a6439ecULL +#define SHA512_CONST_60 0x90befffa23631e28ULL +#define SHA512_CONST_61 0xa4506cebde82bde9ULL +#define SHA512_CONST_62 0xbef9a3f7b2c67915ULL +#define SHA512_CONST_63 
0xc67178f2e372532bULL +#define SHA512_CONST_64 0xca273eceea26619cULL +#define SHA512_CONST_65 0xd186b8c721c0c207ULL +#define SHA512_CONST_66 0xeada7dd6cde0eb1eULL +#define SHA512_CONST_67 0xf57d4f7fee6ed178ULL +#define SHA512_CONST_68 0x06f067aa72176fbaULL +#define SHA512_CONST_69 0x0a637dc5a2c898a6ULL +#define SHA512_CONST_70 0x113f9804bef90daeULL +#define SHA512_CONST_71 0x1b710b35131c471bULL +#define SHA512_CONST_72 0x28db77f523047d84ULL +#define SHA512_CONST_73 0x32caab7b40c72493ULL +#define SHA512_CONST_74 0x3c9ebe0a15c9bebcULL +#define SHA512_CONST_75 0x431d67c49c100d4cULL +#define SHA512_CONST_76 0x4cc5d4becb3e42b6ULL +#define SHA512_CONST_77 0x597f299cfc657e2aULL +#define SHA512_CONST_78 0x5fcb6fab3ad6faecULL +#define SHA512_CONST_79 0x6c44198c4a475817ULL + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_SHA2_CONSTS_H */ diff --git a/module/icp/include/sha2/sha2_impl.h b/module/icp/include/sha2/sha2_impl.h new file mode 100644 index 000000000000..b9768d344e95 --- /dev/null +++ b/module/icp/include/sha2/sha2_impl.h @@ -0,0 +1,64 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SHA2_IMPL_H +#define _SHA2_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum { + SHA1_TYPE, + SHA256_TYPE, + SHA384_TYPE, + SHA512_TYPE +} sha2_mech_t; + +/* + * Context for SHA2 mechanism. + */ +typedef struct sha2_ctx { + sha2_mech_type_t sc_mech_type; /* type of context */ + SHA2_CTX sc_sha2_ctx; /* SHA2 context */ +} sha2_ctx_t; + +/* + * Context for SHA2 HMAC and HMAC GENERAL mechanisms. + */ +typedef struct sha2_hmac_ctx { + sha2_mech_type_t hc_mech_type; /* type of context */ + uint32_t hc_digest_len; /* digest len in bytes */ + SHA2_CTX hc_icontext; /* inner SHA2 context */ + SHA2_CTX hc_ocontext; /* outer SHA2 context */ +} sha2_hmac_ctx_t; + +#ifdef __cplusplus +} +#endif + +#endif /* _SHA2_IMPL_H */ diff --git a/module/icp/include/sys/asm_linkage.h b/module/icp/include/sys/asm_linkage.h new file mode 100644 index 000000000000..49a494b46e0b --- /dev/null +++ b/module/icp/include/sys/asm_linkage.h @@ -0,0 +1,46 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
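[Editorial sketch, not part of the vendored sources.] The sha2_mech_t dispatch type above selects among digests with different output sizes. For reference, a small helper mapping each mechanism to its FIPS 180-2 digest length in bytes; the enum is restated so the demo stands alone, and the helper itself is illustrative only:

/* Mirror of the sha2_mech_t dispatch enum, restated for a standalone demo. */
typedef enum {
	SHA1_TYPE,
	SHA256_TYPE,
	SHA384_TYPE,
	SHA512_TYPE
} sha2_mech_t;

/* FIPS 180 digest sizes in bytes for each mechanism. */
static unsigned int
mech_digest_len(sha2_mech_t mech)
{
	switch (mech) {
	case SHA1_TYPE:		return (20);
	case SHA256_TYPE:	return (32);
	case SHA384_TYPE:	return (48);
	case SHA512_TYPE:	return (64);
	}
	return (0);
}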
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_ASM_LINKAGE_H +#define _SYS_ASM_LINKAGE_H + +#if defined(__i386) || defined(__amd64) + +#include /* XX64 x86/sys/asm_linkage.h */ + +#endif + +#if defined(_KERNEL) && defined(HAVE_KERNEL_OBJTOOL) + +#include + +#else /* userspace */ +#define FRAME_BEGIN +#define FRAME_END +#endif + + +#endif /* _SYS_ASM_LINKAGE_H */ diff --git a/module/icp/include/sys/bitmap.h b/module/icp/include/sys/bitmap.h new file mode 100644 index 000000000000..4e86ee70ed9e --- /dev/null +++ b/module/icp/include/sys/bitmap.h @@ -0,0 +1,183 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + + +#ifndef _SYS_BITMAP_H +#define _SYS_BITMAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +#if defined(__GNUC__) && defined(_ASM_INLINES) && \ + (defined(__i386) || defined(__amd64)) +#include +#endif + +/* + * Operations on bitmaps of arbitrary size + * A bitmap is a vector of 1 or more ulong_t's. + * The user of the package is responsible for range checks and keeping + * track of sizes. + */ + +#ifdef _LP64 +#define BT_ULSHIFT 6 /* log base 2 of BT_NBIPUL, to extract word index */ +#define BT_ULSHIFT32 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#else +#define BT_ULSHIFT 5 /* log base 2 of BT_NBIPUL, to extract word index */ +#endif + +#define BT_NBIPUL (1 << BT_ULSHIFT) /* n bits per ulong_t */ +#define BT_ULMASK (BT_NBIPUL - 1) /* to extract bit index */ + +#ifdef _LP64 +#define BT_NBIPUL32 (1 << BT_ULSHIFT32) /* n bits per ulong_t */ +#define BT_ULMASK32 (BT_NBIPUL32 - 1) /* to extract bit index */ +#define BT_ULMAXMASK 0xffffffffffffffff /* used by bt_getlowbit */ +#else +#define BT_ULMAXMASK 0xffffffff +#endif + +/* + * bitmap is a ulong_t *, bitindex an index_t + * + * The macros BT_WIM and BT_BIW internal; there is no need + * for users of this package to use them. 
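[Editorial sketch, not part of the vendored sources.] The shift/mask constants above encode the usual two-level bitmap addressing: shift right by BT_ULSHIFT to find the containing word, mask with BT_ULMASK to find the bit within that word. A worked example on LP64 (64-bit words), runnable standalone:

#include <stdio.h>

#define BT_ULSHIFT	6	/* LP64: log2 of 64 bits per ulong */
#define BT_NBIPUL	(1 << BT_ULSHIFT)
#define BT_ULMASK	(BT_NBIPUL - 1)

int
main(void)
{
	unsigned long bitindex = 75;

	printf("word index:  %lu\n", bitindex >> BT_ULSHIFT);	/* 1 */
	printf("bit in word: %lu\n", bitindex & BT_ULMASK);	/* 11 */
	printf("mask:        %#lx\n", 1UL << (bitindex & BT_ULMASK)); /* 0x800 */
	return (0);
}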
+ */ + +/* + * word in map + */ +#define BT_WIM(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT]) +/* + * bit in word + */ +#define BT_BIW(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK)) + +#ifdef _LP64 +#define BT_WIM32(bitmap, bitindex) \ + ((bitmap)[(bitindex) >> BT_ULSHIFT32]) + +#define BT_BIW32(bitindex) \ + (1UL << ((bitindex) & BT_ULMASK32)) +#endif + +/* + * These are public macros + * + * BT_BITOUL == n bits to n ulong_t's + */ +#define BT_BITOUL(nbits) \ + (((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL) +#define BT_SIZEOFMAP(nbits) \ + (BT_BITOUL(nbits) * sizeof (ulong_t)) +#define BT_TEST(bitmap, bitindex) \ + ((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0) +#define BT_SET(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); } +#define BT_CLEAR(bitmap, bitindex) \ + { BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); } + +#ifdef _LP64 +#define BT_BITOUL32(nbits) \ + (((nbits) + BT_NBIPUL32 - 1l) / BT_NBIPUL32) +#define BT_SIZEOFMAP32(nbits) \ + (BT_BITOUL32(nbits) * sizeof (uint_t)) +#define BT_TEST32(bitmap, bitindex) \ + ((BT_WIM32((bitmap), (bitindex)) & BT_BIW32(bitindex)) ? 1 : 0) +#define BT_SET32(bitmap, bitindex) \ + { BT_WIM32((bitmap), (bitindex)) |= BT_BIW32(bitindex); } +#define BT_CLEAR32(bitmap, bitindex) \ + { BT_WIM32((bitmap), (bitindex)) &= ~BT_BIW32(bitindex); } +#endif /* _LP64 */ + + +/* + * BIT_ONLYONESET is a private macro not designed for bitmaps of + * arbitrary size. u must be an unsigned integer/long. It returns + * true if one and only one bit is set in u. + */ +#define BIT_ONLYONESET(u) \ + ((((u) == 0) ? 0 : ((u) & ((u) - 1)) == 0)) + +#ifndef _ASM + +/* + * return next available bit index from map with specified number of bits + */ +extern index_t bt_availbit(ulong_t *bitmap, size_t nbits); +/* + * find the highest order bit that is on, and is within or below + * the word specified by wx + */ +extern int bt_gethighbit(ulong_t *mapp, int wx); +extern int bt_range(ulong_t *bitmap, size_t *pos1, size_t *pos2, + size_t end_pos); +extern int bt_getlowbit(ulong_t *bitmap, size_t start, size_t stop); +extern void bt_copy(ulong_t *, ulong_t *, ulong_t); + +/* + * find the parity + */ +extern int odd_parity(ulong_t); + +/* + * Atomically set/clear bits + * Atomic exclusive operations will set "result" to "-1" + * if the bit is already set/cleared. "result" will be set + * to 0 otherwise. + */ +#define BT_ATOMIC_SET(bitmap, bitindex) \ + { atomic_or_ulong(&(BT_WIM(bitmap, bitindex)), BT_BIW(bitindex)); } +#define BT_ATOMIC_CLEAR(bitmap, bitindex) \ + { atomic_and_ulong(&(BT_WIM(bitmap, bitindex)), ~BT_BIW(bitindex)); } + +#define BT_ATOMIC_SET_EXCL(bitmap, bitindex, result) \ + { result = atomic_set_long_excl(&(BT_WIM(bitmap, bitindex)), \ + (bitindex) % BT_NBIPUL); } +#define BT_ATOMIC_CLEAR_EXCL(bitmap, bitindex, result) \ + { result = atomic_clear_long_excl(&(BT_WIM(bitmap, bitindex)), \ + (bitindex) % BT_NBIPUL); } + +/* + * Extracts bits between index h (high, inclusive) and l (low, exclusive) from + * u, which must be an unsigned integer. 
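[Editorial sketch, not part of the vendored sources.] Since the header leaves range checking and map sizing to the caller, a short usage sketch may help; the macros are copied verbatim from the header above so the demo compiles on its own:

#include <stdio.h>
#include <stdlib.h>

typedef unsigned long ulong_t;

#define BT_ULSHIFT	6	/* LP64 */
#define BT_NBIPUL	(1 << BT_ULSHIFT)
#define BT_ULMASK	(BT_NBIPUL - 1)
#define BT_WIM(bitmap, bitindex)	((bitmap)[(bitindex) >> BT_ULSHIFT])
#define BT_BIW(bitindex)	(1UL << ((bitindex) & BT_ULMASK))
#define BT_BITOUL(nbits)	(((nbits) + BT_NBIPUL - 1l) / BT_NBIPUL)
#define BT_SIZEOFMAP(nbits)	(BT_BITOUL(nbits) * sizeof (ulong_t))
#define BT_TEST(bitmap, bitindex) \
	((BT_WIM((bitmap), (bitindex)) & BT_BIW(bitindex)) ? 1 : 0)
#define BT_SET(bitmap, bitindex) \
	{ BT_WIM((bitmap), (bitindex)) |= BT_BIW(bitindex); }
#define BT_CLEAR(bitmap, bitindex) \
	{ BT_WIM((bitmap), (bitindex)) &= ~BT_BIW(bitindex); }

int
main(void)
{
	/* Caller sizes the map: 200 bits -> 4 words on LP64. */
	ulong_t *map = calloc(1, BT_SIZEOFMAP(200));

	if (map == NULL)
		return (1);
	BT_SET(map, 75);
	printf("bit 75: %d, bit 76: %d\n", BT_TEST(map, 75), BT_TEST(map, 76));
	BT_CLEAR(map, 75);
	printf("bit 75 after clear: %d\n", BT_TEST(map, 75));
	free(map);
	return (0);
}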
+ */ +#define BITX(u, h, l) (((u) >> (l)) & ((1LU << ((h) - (l) + 1LU)) - 1LU)) + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_BITMAP_H */ diff --git a/module/icp/include/sys/crypto/elfsign.h b/module/icp/include/sys/crypto/elfsign.h new file mode 100644 index 000000000000..5432f0c8d607 --- /dev/null +++ b/module/icp/include/sys/crypto/elfsign.h @@ -0,0 +1,137 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CRYPTO_ELFSIGN_H +#define _SYS_CRYPTO_ELFSIGN_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Consolidation Private Interface for elfsign/libpkcs11/kcfd + */ + +#include + +/* + * Project Private structures and types used for communication between kcfd + * and KCF over the door. + */ + +typedef enum ELFsign_status_e { + ELFSIGN_UNKNOWN, + ELFSIGN_SUCCESS, + ELFSIGN_FAILED, + ELFSIGN_NOTSIGNED, + ELFSIGN_INVALID_CERTPATH, + ELFSIGN_INVALID_ELFOBJ, + ELFSIGN_RESTRICTED +} ELFsign_status_t; + +#define KCF_KCFD_VERSION1 1 +#define SIG_MAX_LENGTH 1024 + +#define ELF_SIGNATURE_SECTION ".SUNW_signature" + +typedef struct kcf_door_arg_s { + short da_version; + boolean_t da_iskernel; + + union { + char filename[MAXPATHLEN]; /* For request */ + + struct kcf_door_result_s { /* For response */ + ELFsign_status_t status; + uint32_t siglen; + uchar_t signature[1]; + } result; + } da_u; +} kcf_door_arg_t; + +typedef uint32_t filesig_vers_t; + +/* + * File Signature Structure + * Applicable to ELF and other file formats + */ +struct filesignatures { + uint32_t filesig_cnt; /* count of signatures */ + uint32_t filesig_pad; /* unused */ + union { + char filesig_data[1]; + struct filesig { /* one of these for each signature */ + uint32_t filesig_size; + filesig_vers_t filesig_version; + union { + struct filesig_version1 { + uint32_t filesig_v1_dnsize; + uint32_t filesig_v1_sigsize; + uint32_t filesig_v1_oidsize; + char filesig_v1_data[1]; + } filesig_v1; + struct filesig_version3 { + uint64_t filesig_v3_time; + uint32_t filesig_v3_dnsize; + uint32_t filesig_v3_sigsize; + uint32_t filesig_v3_oidsize; + char filesig_v3_data[1]; + } filesig_v3; + } _u2; + } filesig_sig; + uint64_t filesig_align; + } _u1; +}; +#define filesig_sig _u1.filesig_sig + +#define filesig_v1_dnsize _u2.filesig_v1.filesig_v1_dnsize +#define filesig_v1_sigsize _u2.filesig_v1.filesig_v1_sigsize +#define filesig_v1_oidsize _u2.filesig_v1.filesig_v1_oidsize +#define filesig_v1_data _u2.filesig_v1.filesig_v1_data + +#define filesig_v3_time _u2.filesig_v3.filesig_v3_time +#define filesig_v3_dnsize _u2.filesig_v3.filesig_v3_dnsize +#define filesig_v3_sigsize _u2.filesig_v3.filesig_v3_sigsize +#define 
filesig_v3_oidsize _u2.filesig_v3.filesig_v3_oidsize +#define filesig_v3_data _u2.filesig_v3.filesig_v3_data + +#define filesig_ALIGN(s) (((s) + sizeof (uint64_t) - 1) & \ + (-sizeof (uint64_t))) +#define filesig_next(ptr) (struct filesig *)((void *)((char *)(ptr) + \ + filesig_ALIGN((ptr)->filesig_size))) + +#define FILESIG_UNKNOWN 0 /* unrecognized version */ +#define FILESIG_VERSION1 1 /* version1, all but sig section */ +#define FILESIG_VERSION2 2 /* version1 format, SHF_ALLOC only */ +#define FILESIG_VERSION3 3 /* version3, all but sig section */ +#define FILESIG_VERSION4 4 /* version3 format, SHF_ALLOC only */ + +#define _PATH_KCFD_DOOR "/etc/svc/volatile/kcfd_door" + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_ELFSIGN_H */ diff --git a/module/icp/include/sys/crypto/impl.h b/module/icp/include/sys/crypto/impl.h new file mode 100644 index 000000000000..0f37f3f63532 --- /dev/null +++ b/module/icp/include/sys/crypto/impl.h @@ -0,0 +1,1363 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CRYPTO_IMPL_H +#define _SYS_CRYPTO_IMPL_H + +/* + * Kernel Cryptographic Framework private implementation definitions. + */ + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +#define KCF_MODULE "kcf" + +/* + * Prefixes convention: structures internal to the kernel cryptographic + * framework start with 'kcf_'. Exposed structure start with 'crypto_'. + */ + +/* Provider stats. Not protected. */ +typedef struct kcf_prov_stats { + kstat_named_t ps_ops_total; + kstat_named_t ps_ops_passed; + kstat_named_t ps_ops_failed; + kstat_named_t ps_ops_busy_rval; +} kcf_prov_stats_t; + +/* Various kcf stats. Not protected. */ +typedef struct kcf_stats { + kstat_named_t ks_thrs_in_pool; + kstat_named_t ks_idle_thrs; + kstat_named_t ks_minthrs; + kstat_named_t ks_maxthrs; + kstat_named_t ks_swq_njobs; + kstat_named_t ks_swq_maxjobs; + kstat_named_t ks_taskq_threads; + kstat_named_t ks_taskq_minalloc; + kstat_named_t ks_taskq_maxalloc; +} kcf_stats_t; + +/* + * Keep all the information needed by the scheduler from + * this provider. + */ +typedef struct kcf_sched_info { + /* The number of operations dispatched. */ + uint64_t ks_ndispatches; + + /* The number of operations that failed. */ + uint64_t ks_nfails; + + /* The number of operations that returned CRYPTO_BUSY. */ + uint64_t ks_nbusy_rval; + + /* taskq used to dispatch crypto requests */ + taskq_t *ks_taskq; +} kcf_sched_info_t; + +/* + * pd_irefcnt approximates the number of inflight requests to the + * provider. 
Though we increment this counter during registration for + * other purposes, that base value is mostly same across all providers. + * So, it is a good measure of the load on a provider when it is not + * in a busy state. Once a provider notifies it is busy, requests + * backup in the taskq. So, we use tq_nalloc in that case which gives + * the number of task entries in the task queue. Note that we do not + * acquire any locks here as it is not critical to get the exact number + * and the lock contention may be too costly for this code path. + */ +#define KCF_PROV_LOAD(pd) ((pd)->pd_state != KCF_PROV_BUSY ? \ + (pd)->pd_irefcnt : (pd)->pd_sched_info.ks_taskq->tq_nalloc) + +#define KCF_PROV_INCRSTATS(pd, error) { \ + (pd)->pd_sched_info.ks_ndispatches++; \ + if (error == CRYPTO_BUSY) \ + (pd)->pd_sched_info.ks_nbusy_rval++; \ + else if (error != CRYPTO_SUCCESS && error != CRYPTO_QUEUED) \ + (pd)->pd_sched_info.ks_nfails++; \ +} + + +/* + * The following two macros should be + * #define KCF_OPS_CLASSSIZE (KCF_LAST_OPSCLASS - KCF_FIRST_OPSCLASS + 2) + * #define KCF_MAXMECHTAB KCF_MAXCIPHER + * + * However, doing that would involve reorganizing the header file a bit. + * When impl.h is broken up (bug# 4703218), this will be done. For now, + * we hardcode these values. + */ +#define KCF_OPS_CLASSSIZE 8 +#define KCF_MAXMECHTAB 32 + +/* + * Valid values for the state of a provider. The order of + * the elements is important. + * + * Routines which get a provider or the list of providers + * should pick only those that are either in KCF_PROV_READY state + * or in KCF_PROV_BUSY state. + */ +typedef enum { + KCF_PROV_ALLOCATED = 1, + KCF_PROV_UNVERIFIED, + KCF_PROV_VERIFICATION_FAILED, + /* + * state < KCF_PROV_READY means the provider can not + * be used at all. + */ + KCF_PROV_READY, + KCF_PROV_BUSY, + /* + * state > KCF_PROV_BUSY means the provider can not + * be used for new requests. + */ + KCF_PROV_FAILED, + /* + * Threads setting the following two states should do so only + * if the current state < KCF_PROV_DISABLED. + */ + KCF_PROV_DISABLED, + KCF_PROV_REMOVED, + KCF_PROV_FREED +} kcf_prov_state_t; + +#define KCF_IS_PROV_UNVERIFIED(pd) ((pd)->pd_state == KCF_PROV_UNVERIFIED) +#define KCF_IS_PROV_USABLE(pd) ((pd)->pd_state == KCF_PROV_READY || \ + (pd)->pd_state == KCF_PROV_BUSY) +#define KCF_IS_PROV_REMOVED(pd) ((pd)->pd_state >= KCF_PROV_REMOVED) + +/* Internal flags valid for pd_flags field */ +#define KCF_PROV_RESTRICTED 0x40000000 +#define KCF_LPROV_MEMBER 0x80000000 /* is member of a logical provider */ + +/* + * A provider descriptor structure. There is one such structure per + * provider. It is allocated and initialized at registration time and + * freed when the provider unregisters. + * + * pd_prov_type: Provider type, hardware or software + * pd_sid: Session ID of the provider used by kernel clients. + * This is valid only for session-oriented providers. + * pd_refcnt: Reference counter to this provider descriptor + * pd_irefcnt: References held by the framework internal structs + * pd_lock: lock protects pd_state and pd_provider_list + * pd_state: State value of the provider + * pd_provider_list: Used to cross-reference logical providers and their + * members. Not used for software providers. 
+ * pd_resume_cv: cv to wait for state to change from KCF_PROV_BUSY + * pd_prov_handle: Provider handle specified by provider + * pd_ops_vector: The ops vector specified by Provider + * pd_mech_indx: Lookup table which maps a core framework mechanism + * number to an index in pd_mechanisms array + * pd_mechanisms: Array of mechanisms supported by the provider, specified + * by the provider during registration + * pd_sched_info: Scheduling information associated with the provider + * pd_mech_list_count: The number of entries in pi_mechanisms, specified + * by the provider during registration + * pd_name: Device name or module name + * pd_instance: Device instance + * pd_module_id: Module ID returned by modload + * pd_mctlp: Pointer to modctl structure for this provider + * pd_remove_cv: cv to wait on while the provider queue drains + * pd_description: Provider description string + * pd_flags bitwise OR of pi_flags from crypto_provider_info_t + * and other internal flags defined above. + * pd_hash_limit Maximum data size that hash mechanisms of this provider + * can support. + * pd_kcf_prov_handle: KCF-private handle assigned by KCF + * pd_prov_id: Identification # assigned by KCF to provider + * pd_kstat: kstat associated with the provider + * pd_ks_data: kstat data + */ +typedef struct kcf_provider_desc { + crypto_provider_type_t pd_prov_type; + crypto_session_id_t pd_sid; + uint_t pd_refcnt; + uint_t pd_irefcnt; + kmutex_t pd_lock; + kcf_prov_state_t pd_state; + struct kcf_provider_list *pd_provider_list; + kcondvar_t pd_resume_cv; + crypto_provider_handle_t pd_prov_handle; + crypto_ops_t *pd_ops_vector; + ushort_t pd_mech_indx[KCF_OPS_CLASSSIZE]\ + [KCF_MAXMECHTAB]; + crypto_mech_info_t *pd_mechanisms; + kcf_sched_info_t pd_sched_info; + uint_t pd_mech_list_count; + // char *pd_name; + // uint_t pd_instance; + // int pd_module_id; + // struct modctl *pd_mctlp; + kcondvar_t pd_remove_cv; + char *pd_description; + uint_t pd_flags; + uint_t pd_hash_limit; + crypto_kcf_provider_handle_t pd_kcf_prov_handle; + crypto_provider_id_t pd_prov_id; + kstat_t *pd_kstat; + kcf_prov_stats_t pd_ks_data; +} kcf_provider_desc_t; + +/* useful for making a list of providers */ +typedef struct kcf_provider_list { + struct kcf_provider_list *pl_next; + struct kcf_provider_desc *pl_provider; +} kcf_provider_list_t; + +/* atomic operations in linux implicitly form a memory barrier */ +#define membar_exit() + +/* + * If a component has a reference to a kcf_provider_desc_t, + * it REFHOLD()s. A new provider descriptor which is referenced only + * by the providers table has a reference counter of one. 
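+ *
+ * A minimal, illustrative hold/release pairing (a sketch only; prov_id
+ * is a hypothetical caller-supplied id, and it assumes the lookup
+ * routines declared later in this header return a held descriptor):
+ *
+ *	kcf_provider_desc_t *pd = kcf_prov_tab_lookup(prov_id);
+ *	if (pd != NULL) {
+ *		... use pd; the hold keeps the descriptor from being freed ...
+ *		KCF_PROV_REFRELE(pd);
+ *	}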
+ */ +#define KCF_PROV_REFHOLD(desc) { \ + atomic_add_32(&(desc)->pd_refcnt, 1); \ + ASSERT((desc)->pd_refcnt != 0); \ +} + +#define KCF_PROV_IREFHOLD(desc) { \ + atomic_add_32(&(desc)->pd_irefcnt, 1); \ + ASSERT((desc)->pd_irefcnt != 0); \ +} + +#define KCF_PROV_IREFRELE(desc) { \ + ASSERT((desc)->pd_irefcnt != 0); \ + membar_exit(); \ + if (atomic_add_32_nv(&(desc)->pd_irefcnt, -1) == 0) { \ + cv_broadcast(&(desc)->pd_remove_cv); \ + } \ +} + +#define KCF_PROV_REFHELD(desc) ((desc)->pd_refcnt >= 1) + +#define KCF_PROV_REFRELE(desc) { \ + ASSERT((desc)->pd_refcnt != 0); \ + membar_exit(); \ + if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) { \ + kcf_provider_zero_refcnt((desc)); \ + } \ +} + + +/* list of crypto_mech_info_t valid as the second mech in a dual operation */ + +typedef struct crypto_mech_info_list { + struct crypto_mech_info_list *ml_next; + crypto_mech_type_t ml_kcf_mechid; /* KCF's id */ + crypto_mech_info_t ml_mech_info; +} crypto_mech_info_list_t; + +/* + * An element in a mechanism provider descriptors chain. + * The kcf_prov_mech_desc_t is duplicated in every chain the provider belongs + * to. This is a small tradeoff memory vs mutex spinning time to access the + * common provider field. + */ + +typedef struct kcf_prov_mech_desc { + struct kcf_mech_entry *pm_me; /* Back to the head */ + struct kcf_prov_mech_desc *pm_next; /* Next in the chain */ + crypto_mech_info_t pm_mech_info; /* Provider mech info */ + crypto_mech_info_list_t *pm_mi_list; /* list for duals */ + kcf_provider_desc_t *pm_prov_desc; /* Common desc. */ +} kcf_prov_mech_desc_t; + +/* and the notation shortcuts ... */ +#define pm_provider_type pm_prov_desc.pd_provider_type +#define pm_provider_handle pm_prov_desc.pd_provider_handle +#define pm_ops_vector pm_prov_desc.pd_ops_vector + +/* + * A mechanism entry in an xxx_mech_tab[]. me_pad was deemed + * to be unnecessary and removed. + */ +typedef struct kcf_mech_entry { + crypto_mech_name_t me_name; /* mechanism name */ + crypto_mech_type_t me_mechid; /* Internal id for mechanism */ + kmutex_t me_mutex; /* access protection */ + kcf_prov_mech_desc_t *me_hw_prov_chain; /* list of HW providers */ + kcf_prov_mech_desc_t *me_sw_prov; /* SW provider */ + /* + * Number of HW providers in the chain. There is only one + * SW provider. So, we need only a count of HW providers. + */ + int me_num_hwprov; + /* + * When a SW provider is present, this is the generation number that + * ensures no objects from old SW providers are used in the new one + */ + uint32_t me_gen_swprov; + /* + * threshold for using hardware providers for this mech + */ + size_t me_threshold; +} kcf_mech_entry_t; + +/* + * A policy descriptor structure. It is allocated and initialized + * when administrative ioctls load disabled mechanisms. + * + * pd_prov_type: Provider type, hardware or software + * pd_name: Device name or module name. + * pd_instance: Device instance. + * pd_refcnt: Reference counter for this policy descriptor + * pd_mutex: Protects array and count of disabled mechanisms. + * pd_disabled_count: Count of disabled mechanisms. + * pd_disabled_mechs: Array of disabled mechanisms. + */ +typedef struct kcf_policy_desc { + crypto_provider_type_t pd_prov_type; + char *pd_name; + uint_t pd_instance; + uint_t pd_refcnt; + kmutex_t pd_mutex; + uint_t pd_disabled_count; + crypto_mech_name_t *pd_disabled_mechs; +} kcf_policy_desc_t; + +/* + * If a component has a reference to a kcf_policy_desc_t, + * it REFHOLD()s. 
A new policy descriptor which is referenced only + * by the policy table has a reference count of one. + */ +#define KCF_POLICY_REFHOLD(desc) { \ + atomic_add_32(&(desc)->pd_refcnt, 1); \ + ASSERT((desc)->pd_refcnt != 0); \ +} + +/* + * Releases a reference to a policy descriptor. When the last + * reference is released, the descriptor is freed. + */ +#define KCF_POLICY_REFRELE(desc) { \ + ASSERT((desc)->pd_refcnt != 0); \ + membar_exit(); \ + if (atomic_add_32_nv(&(desc)->pd_refcnt, -1) == 0) \ + kcf_policy_free_desc(desc); \ +} + +/* + * This entry stores the name of a software module and its + * mechanisms. The mechanisms are 'hints' that are used to + * trigger loading of the module. + */ +typedef struct kcf_soft_conf_entry { + struct kcf_soft_conf_entry *ce_next; + char *ce_name; + crypto_mech_name_t *ce_mechs; + uint_t ce_count; +} kcf_soft_conf_entry_t; + +extern kmutex_t soft_config_mutex; +extern kcf_soft_conf_entry_t *soft_config_list; + +/* + * Global tables. The sizes are from the predefined PKCS#11 v2.20 mechanisms, + * with a margin of few extra empty entry points + */ + +#define KCF_MAXDIGEST 16 /* Digests */ +#define KCF_MAXCIPHER 64 /* Ciphers */ +#define KCF_MAXMAC 40 /* Message authentication codes */ +#define KCF_MAXSIGN 24 /* Sign/Verify */ +#define KCF_MAXKEYOPS 116 /* Key generation and derivation */ +#define KCF_MAXMISC 16 /* Others ... */ + +#define KCF_MAXMECHS KCF_MAXDIGEST + KCF_MAXCIPHER + KCF_MAXMAC + \ + KCF_MAXSIGN + KCF_MAXKEYOPS + \ + KCF_MAXMISC + +extern kcf_mech_entry_t kcf_digest_mechs_tab[]; +extern kcf_mech_entry_t kcf_cipher_mechs_tab[]; +extern kcf_mech_entry_t kcf_mac_mechs_tab[]; +extern kcf_mech_entry_t kcf_sign_mechs_tab[]; +extern kcf_mech_entry_t kcf_keyops_mechs_tab[]; +extern kcf_mech_entry_t kcf_misc_mechs_tab[]; + +extern kmutex_t kcf_mech_tabs_lock; + +typedef enum { + KCF_DIGEST_CLASS = 1, + KCF_CIPHER_CLASS, + KCF_MAC_CLASS, + KCF_SIGN_CLASS, + KCF_KEYOPS_CLASS, + KCF_MISC_CLASS +} kcf_ops_class_t; + +#define KCF_FIRST_OPSCLASS KCF_DIGEST_CLASS +#define KCF_LAST_OPSCLASS KCF_MISC_CLASS + +/* The table of all the kcf_xxx_mech_tab[]s, indexed by kcf_ops_class */ + +typedef struct kcf_mech_entry_tab { + int met_size; /* Size of the met_tab[] */ + kcf_mech_entry_t *met_tab; /* the table */ +} kcf_mech_entry_tab_t; + +extern kcf_mech_entry_tab_t kcf_mech_tabs_tab[]; + +#define KCF_MECHID(class, index) \ + (((crypto_mech_type_t)(class) << 32) | (crypto_mech_type_t)(index)) + +#define KCF_MECH2CLASS(mech_type) ((kcf_ops_class_t)((mech_type) >> 32)) + +#define KCF_MECH2INDEX(mech_type) ((int)(mech_type)) + +#define KCF_TO_PROV_MECH_INDX(pd, mech_type) \ + ((pd)->pd_mech_indx[KCF_MECH2CLASS(mech_type)] \ + [KCF_MECH2INDEX(mech_type)]) + +#define KCF_TO_PROV_MECHINFO(pd, mech_type) \ + ((pd)->pd_mechanisms[KCF_TO_PROV_MECH_INDX(pd, mech_type)]) + +#define KCF_TO_PROV_MECHNUM(pd, mech_type) \ + (KCF_TO_PROV_MECHINFO(pd, mech_type).cm_mech_number) + +#define KCF_CAN_SHARE_OPSTATE(pd, mech_type) \ + ((KCF_TO_PROV_MECHINFO(pd, mech_type).cm_mech_flags) & \ + CRYPTO_CAN_SHARE_OPSTATE) + +/* ps_refcnt is protected by cm_lock in the crypto_minor structure */ +typedef struct crypto_provider_session { + struct crypto_provider_session *ps_next; + crypto_session_id_t ps_session; + kcf_provider_desc_t *ps_provider; + kcf_provider_desc_t *ps_real_provider; + uint_t ps_refcnt; +} crypto_provider_session_t; + +typedef struct crypto_session_data { + kmutex_t sd_lock; + kcondvar_t sd_cv; + uint32_t sd_flags; + int sd_pre_approved_amount; + crypto_ctx_t 
*sd_digest_ctx; + crypto_ctx_t *sd_encr_ctx; + crypto_ctx_t *sd_decr_ctx; + crypto_ctx_t *sd_sign_ctx; + crypto_ctx_t *sd_verify_ctx; + crypto_ctx_t *sd_sign_recover_ctx; + crypto_ctx_t *sd_verify_recover_ctx; + kcf_provider_desc_t *sd_provider; + void *sd_find_init_cookie; + crypto_provider_session_t *sd_provider_session; +} crypto_session_data_t; + +#define CRYPTO_SESSION_IN_USE 0x00000001 +#define CRYPTO_SESSION_IS_BUSY 0x00000002 +#define CRYPTO_SESSION_IS_CLOSED 0x00000004 + +#define KCF_MAX_PIN_LEN 1024 + +/* + * Per-minor info. + * + * cm_lock protects everything in this structure except for cm_refcnt. + */ +typedef struct crypto_minor { + uint_t cm_refcnt; + kmutex_t cm_lock; + kcondvar_t cm_cv; + crypto_session_data_t **cm_session_table; + uint_t cm_session_table_count; + kcf_provider_desc_t **cm_provider_array; + uint_t cm_provider_count; + crypto_provider_session_t *cm_provider_session; +} crypto_minor_t; + +/* + * Return codes for internal functions + */ +#define KCF_SUCCESS 0x0 /* Successful call */ +#define KCF_INVALID_MECH_NUMBER 0x1 /* invalid mechanism number */ +#define KCF_INVALID_MECH_NAME 0x2 /* invalid mechanism name */ +#define KCF_INVALID_MECH_CLASS 0x3 /* invalid mechanism class */ +#define KCF_MECH_TAB_FULL 0x4 /* Need more room in the mech tabs. */ +#define KCF_INVALID_INDX ((ushort_t)-1) + +/* + * kCF internal mechanism and function group for tracking RNG providers. + */ +#define SUN_RANDOM "random" +#define CRYPTO_FG_RANDOM 0x80000000 /* generate_random() */ + +/* + * Wrappers for ops vectors. In the wrapper definitions below, the pd + * argument always corresponds to a pointer to a provider descriptor + * of type kcf_prov_desc_t. + */ + +#define KCF_PROV_CONTROL_OPS(pd) ((pd)->pd_ops_vector->co_control_ops) +#define KCF_PROV_CTX_OPS(pd) ((pd)->pd_ops_vector->co_ctx_ops) +#define KCF_PROV_DIGEST_OPS(pd) ((pd)->pd_ops_vector->co_digest_ops) +#define KCF_PROV_CIPHER_OPS(pd) ((pd)->pd_ops_vector->co_cipher_ops) +#define KCF_PROV_MAC_OPS(pd) ((pd)->pd_ops_vector->co_mac_ops) +#define KCF_PROV_SIGN_OPS(pd) ((pd)->pd_ops_vector->co_sign_ops) +#define KCF_PROV_VERIFY_OPS(pd) ((pd)->pd_ops_vector->co_verify_ops) +#define KCF_PROV_DUAL_OPS(pd) ((pd)->pd_ops_vector->co_dual_ops) +#define KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) \ + ((pd)->pd_ops_vector->co_dual_cipher_mac_ops) +#define KCF_PROV_RANDOM_OPS(pd) ((pd)->pd_ops_vector->co_random_ops) +#define KCF_PROV_SESSION_OPS(pd) ((pd)->pd_ops_vector->co_session_ops) +#define KCF_PROV_OBJECT_OPS(pd) ((pd)->pd_ops_vector->co_object_ops) +#define KCF_PROV_KEY_OPS(pd) ((pd)->pd_ops_vector->co_key_ops) +#define KCF_PROV_PROVIDER_OPS(pd) ((pd)->pd_ops_vector->co_provider_ops) +#define KCF_PROV_MECH_OPS(pd) ((pd)->pd_ops_vector->co_mech_ops) +#define KCF_PROV_NOSTORE_KEY_OPS(pd) \ + ((pd)->pd_ops_vector->co_nostore_key_ops) + +/* + * Wrappers for crypto_control_ops(9S) entry points. + */ + +#define KCF_PROV_STATUS(pd, status) ( \ + (KCF_PROV_CONTROL_OPS(pd) && \ + KCF_PROV_CONTROL_OPS(pd)->provider_status) ? \ + KCF_PROV_CONTROL_OPS(pd)->provider_status( \ + (pd)->pd_prov_handle, status) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_ctx_ops(9S) entry points. + */ + +#define KCF_PROV_CREATE_CTX_TEMPLATE(pd, mech, key, template, size, req) ( \ + (KCF_PROV_CTX_OPS(pd) && KCF_PROV_CTX_OPS(pd)->create_ctx_template) ? 
\ + KCF_PROV_CTX_OPS(pd)->create_ctx_template( \ + (pd)->pd_prov_handle, mech, key, template, size, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_FREE_CONTEXT(pd, ctx) ( \ + (KCF_PROV_CTX_OPS(pd) && KCF_PROV_CTX_OPS(pd)->free_context) ? \ + KCF_PROV_CTX_OPS(pd)->free_context(ctx) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_COPYIN_MECH(pd, umech, kmech, errorp, mode) ( \ + (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->copyin_mechanism) ? \ + KCF_PROV_MECH_OPS(pd)->copyin_mechanism( \ + (pd)->pd_prov_handle, umech, kmech, errorp, mode) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_COPYOUT_MECH(pd, kmech, umech, errorp, mode) ( \ + (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->copyout_mechanism) ? \ + KCF_PROV_MECH_OPS(pd)->copyout_mechanism( \ + (pd)->pd_prov_handle, kmech, umech, errorp, mode) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_FREE_MECH(pd, prov_mech) ( \ + (KCF_PROV_MECH_OPS(pd) && KCF_PROV_MECH_OPS(pd)->free_mechanism) ? \ + KCF_PROV_MECH_OPS(pd)->free_mechanism( \ + (pd)->pd_prov_handle, prov_mech) : CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_digest_ops(9S) entry points. + */ + +#define KCF_PROV_DIGEST_INIT(pd, ctx, mech, req) ( \ + (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_init) ? \ + KCF_PROV_DIGEST_OPS(pd)->digest_init(ctx, mech, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * The _ (underscore) in _digest is needed to avoid replacing the + * function digest(). + */ +#define KCF_PROV_DIGEST(pd, ctx, data, _digest, req) ( \ + (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest) ? \ + KCF_PROV_DIGEST_OPS(pd)->digest(ctx, data, _digest, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DIGEST_UPDATE(pd, ctx, data, req) ( \ + (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_update) ? \ + KCF_PROV_DIGEST_OPS(pd)->digest_update(ctx, data, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DIGEST_KEY(pd, ctx, key, req) ( \ + (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_key) ? \ + KCF_PROV_DIGEST_OPS(pd)->digest_key(ctx, key, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DIGEST_FINAL(pd, ctx, digest, req) ( \ + (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_final) ? \ + KCF_PROV_DIGEST_OPS(pd)->digest_final(ctx, digest, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DIGEST_ATOMIC(pd, session, mech, data, digest, req) ( \ + (KCF_PROV_DIGEST_OPS(pd) && KCF_PROV_DIGEST_OPS(pd)->digest_atomic) ? \ + KCF_PROV_DIGEST_OPS(pd)->digest_atomic( \ + (pd)->pd_prov_handle, session, mech, data, digest, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_cipher_ops(9S) entry points. + */ + +#define KCF_PROV_ENCRYPT_INIT(pd, ctx, mech, key, template, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_init) ? \ + KCF_PROV_CIPHER_OPS(pd)->encrypt_init(ctx, mech, key, template, \ + req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT(pd, ctx, plaintext, ciphertext, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt) ? \ + KCF_PROV_CIPHER_OPS(pd)->encrypt(ctx, plaintext, ciphertext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_UPDATE(pd, ctx, plaintext, ciphertext, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_update) ? \ + KCF_PROV_CIPHER_OPS(pd)->encrypt_update(ctx, plaintext, \ + ciphertext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_FINAL(pd, ctx, ciphertext, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_final) ? 
\ + KCF_PROV_CIPHER_OPS(pd)->encrypt_final(ctx, ciphertext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_ATOMIC(pd, session, mech, key, plaintext, ciphertext, \ + template, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic) ? \ + KCF_PROV_CIPHER_OPS(pd)->encrypt_atomic( \ + (pd)->pd_prov_handle, session, mech, key, plaintext, ciphertext, \ + template, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT_INIT(pd, ctx, mech, key, template, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_init) ? \ + KCF_PROV_CIPHER_OPS(pd)->decrypt_init(ctx, mech, key, template, \ + req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT(pd, ctx, ciphertext, plaintext, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt) ? \ + KCF_PROV_CIPHER_OPS(pd)->decrypt(ctx, ciphertext, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT_UPDATE(pd, ctx, ciphertext, plaintext, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_update) ? \ + KCF_PROV_CIPHER_OPS(pd)->decrypt_update(ctx, ciphertext, \ + plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT_FINAL(pd, ctx, plaintext, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_final) ? \ + KCF_PROV_CIPHER_OPS(pd)->decrypt_final(ctx, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT_ATOMIC(pd, session, mech, key, ciphertext, plaintext, \ + template, req) ( \ + (KCF_PROV_CIPHER_OPS(pd) && KCF_PROV_CIPHER_OPS(pd)->decrypt_atomic) ? \ + KCF_PROV_CIPHER_OPS(pd)->decrypt_atomic( \ + (pd)->pd_prov_handle, session, mech, key, ciphertext, plaintext, \ + template, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_mac_ops(9S) entry points. + */ + +#define KCF_PROV_MAC_INIT(pd, ctx, mech, key, template, req) ( \ + (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_init) ? \ + KCF_PROV_MAC_OPS(pd)->mac_init(ctx, mech, key, template, req) \ + : CRYPTO_NOT_SUPPORTED) + +/* + * The _ (underscore) in _mac is needed to avoid replacing the + * function mac(). + */ +#define KCF_PROV_MAC(pd, ctx, data, _mac, req) ( \ + (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac) ? \ + KCF_PROV_MAC_OPS(pd)->mac(ctx, data, _mac, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_UPDATE(pd, ctx, data, req) ( \ + (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_update) ? \ + KCF_PROV_MAC_OPS(pd)->mac_update(ctx, data, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_FINAL(pd, ctx, mac, req) ( \ + (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_final) ? \ + KCF_PROV_MAC_OPS(pd)->mac_final(ctx, mac, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_ATOMIC(pd, session, mech, key, data, mac, template, \ + req) ( \ + (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_atomic) ? \ + KCF_PROV_MAC_OPS(pd)->mac_atomic( \ + (pd)->pd_prov_handle, session, mech, key, data, mac, template, \ + req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_VERIFY_ATOMIC(pd, session, mech, key, data, mac, \ + template, req) ( \ + (KCF_PROV_MAC_OPS(pd) && KCF_PROV_MAC_OPS(pd)->mac_verify_atomic) ? \ + KCF_PROV_MAC_OPS(pd)->mac_verify_atomic( \ + (pd)->pd_prov_handle, session, mech, key, data, mac, template, \ + req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_sign_ops(9S) entry points. + */ + +#define KCF_PROV_SIGN_INIT(pd, ctx, mech, key, template, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_init) ? 
\ + KCF_PROV_SIGN_OPS(pd)->sign_init( \ + ctx, mech, key, template, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN(pd, ctx, data, sig, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign) ? \ + KCF_PROV_SIGN_OPS(pd)->sign(ctx, data, sig, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_UPDATE(pd, ctx, data, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_update) ? \ + KCF_PROV_SIGN_OPS(pd)->sign_update(ctx, data, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_FINAL(pd, ctx, sig, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_final) ? \ + KCF_PROV_SIGN_OPS(pd)->sign_final(ctx, sig, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_ATOMIC(pd, session, mech, key, data, template, \ + sig, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_atomic) ? \ + KCF_PROV_SIGN_OPS(pd)->sign_atomic( \ + (pd)->pd_prov_handle, session, mech, key, data, sig, template, \ + req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_RECOVER_INIT(pd, ctx, mech, key, template, \ + req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_recover_init) ? \ + KCF_PROV_SIGN_OPS(pd)->sign_recover_init(ctx, mech, key, template, \ + req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_RECOVER(pd, ctx, data, sig, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && KCF_PROV_SIGN_OPS(pd)->sign_recover) ? \ + KCF_PROV_SIGN_OPS(pd)->sign_recover(ctx, data, sig, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_RECOVER_ATOMIC(pd, session, mech, key, data, template, \ + sig, req) ( \ + (KCF_PROV_SIGN_OPS(pd) && \ + KCF_PROV_SIGN_OPS(pd)->sign_recover_atomic) ? \ + KCF_PROV_SIGN_OPS(pd)->sign_recover_atomic( \ + (pd)->pd_prov_handle, session, mech, key, data, sig, template, \ + req) : CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_verify_ops(9S) entry points. + */ + +#define KCF_PROV_VERIFY_INIT(pd, ctx, mech, key, template, req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_init) ? \ + KCF_PROV_VERIFY_OPS(pd)->verify_init(ctx, mech, key, template, \ + req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_VERIFY(pd, ctx, data, sig, req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->do_verify) ? \ + KCF_PROV_VERIFY_OPS(pd)->do_verify(ctx, data, sig, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_VERIFY_UPDATE(pd, ctx, data, req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_update) ? \ + KCF_PROV_VERIFY_OPS(pd)->verify_update(ctx, data, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_VERIFY_FINAL(pd, ctx, sig, req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_final) ? \ + KCF_PROV_VERIFY_OPS(pd)->verify_final(ctx, sig, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_VERIFY_ATOMIC(pd, session, mech, key, data, template, sig, \ + req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_atomic) ? \ + KCF_PROV_VERIFY_OPS(pd)->verify_atomic( \ + (pd)->pd_prov_handle, session, mech, key, data, sig, template, \ + req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_VERIFY_RECOVER_INIT(pd, ctx, mech, key, template, \ + req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && \ + KCF_PROV_VERIFY_OPS(pd)->verify_recover_init) ? \ + KCF_PROV_VERIFY_OPS(pd)->verify_recover_init(ctx, mech, key, \ + template, req) : CRYPTO_NOT_SUPPORTED) + +/* verify_recover() CSPI routine has different argument order than verify() */ +#define KCF_PROV_VERIFY_RECOVER(pd, ctx, sig, data, req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && KCF_PROV_VERIFY_OPS(pd)->verify_recover) ? 
\ + KCF_PROV_VERIFY_OPS(pd)->verify_recover(ctx, sig, data, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * verify_recover_atomic() CSPI routine has different argument order + * than verify_atomic(). + */ +#define KCF_PROV_VERIFY_RECOVER_ATOMIC(pd, session, mech, key, sig, \ + template, data, req) ( \ + (KCF_PROV_VERIFY_OPS(pd) && \ + KCF_PROV_VERIFY_OPS(pd)->verify_recover_atomic) ? \ + KCF_PROV_VERIFY_OPS(pd)->verify_recover_atomic( \ + (pd)->pd_prov_handle, session, mech, key, sig, data, template, \ + req) : CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_dual_ops(9S) entry points. + */ + +#define KCF_PROV_DIGEST_ENCRYPT_UPDATE(digest_ctx, encrypt_ctx, plaintext, \ + ciphertext, req) ( \ + (KCF_PROV_DUAL_OPS(pd) && \ + KCF_PROV_DUAL_OPS(pd)->digest_encrypt_update) ? \ + KCF_PROV_DUAL_OPS(pd)->digest_encrypt_update( \ + digest_ctx, encrypt_ctx, plaintext, ciphertext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT_DIGEST_UPDATE(decrypt_ctx, digest_ctx, ciphertext, \ + plaintext, req) ( \ + (KCF_PROV_DUAL_OPS(pd) && \ + KCF_PROV_DUAL_OPS(pd)->decrypt_digest_update) ? \ + KCF_PROV_DUAL_OPS(pd)->decrypt_digest_update( \ + decrypt_ctx, digest_ctx, ciphertext, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SIGN_ENCRYPT_UPDATE(sign_ctx, encrypt_ctx, plaintext, \ + ciphertext, req) ( \ + (KCF_PROV_DUAL_OPS(pd) && \ + KCF_PROV_DUAL_OPS(pd)->sign_encrypt_update) ? \ + KCF_PROV_DUAL_OPS(pd)->sign_encrypt_update( \ + sign_ctx, encrypt_ctx, plaintext, ciphertext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_DECRYPT_VERIFY_UPDATE(decrypt_ctx, verify_ctx, ciphertext, \ + plaintext, req) ( \ + (KCF_PROV_DUAL_OPS(pd) && \ + KCF_PROV_DUAL_OPS(pd)->decrypt_verify_update) ? \ + KCF_PROV_DUAL_OPS(pd)->decrypt_verify_update( \ + decrypt_ctx, verify_ctx, ciphertext, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_dual_cipher_mac_ops(9S) entry points. + */ + +#define KCF_PROV_ENCRYPT_MAC_INIT(pd, ctx, encr_mech, encr_key, mac_mech, \ + mac_key, encr_ctx_template, mac_ctx_template, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_init) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_init( \ + ctx, encr_mech, encr_key, mac_mech, mac_key, encr_ctx_template, \ + mac_ctx_template, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_MAC(pd, ctx, plaintext, ciphertext, mac, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac( \ + ctx, plaintext, ciphertext, mac, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_MAC_UPDATE(pd, ctx, plaintext, ciphertext, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_update) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_update( \ + ctx, plaintext, ciphertext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_MAC_FINAL(pd, ctx, ciphertext, mac, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_final) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_final( \ + ctx, ciphertext, mac, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_ENCRYPT_MAC_ATOMIC(pd, session, encr_mech, encr_key, \ + mac_mech, mac_key, plaintext, ciphertext, mac, \ + encr_ctx_template, mac_ctx_template, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_atomic) ? 
\ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->encrypt_mac_atomic( \ + (pd)->pd_prov_handle, session, encr_mech, encr_key, \ + mac_mech, mac_key, plaintext, ciphertext, mac, \ + encr_ctx_template, mac_ctx_template, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_DECRYPT_INIT(pd, ctx, mac_mech, mac_key, decr_mech, \ + decr_key, mac_ctx_template, decr_ctx_template, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_init) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_init( \ + ctx, mac_mech, mac_key, decr_mech, decr_key, mac_ctx_template, \ + decr_ctx_template, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_DECRYPT(pd, ctx, ciphertext, mac, plaintext, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt( \ + ctx, ciphertext, mac, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_DECRYPT_UPDATE(pd, ctx, ciphertext, plaintext, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_update) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_update( \ + ctx, ciphertext, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_DECRYPT_FINAL(pd, ctx, mac, plaintext, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_final) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_final( \ + ctx, mac, plaintext, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_DECRYPT_ATOMIC(pd, session, mac_mech, mac_key, \ + decr_mech, decr_key, ciphertext, mac, plaintext, \ + mac_ctx_template, decr_ctx_template, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_atomic) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_decrypt_atomic( \ + (pd)->pd_prov_handle, session, mac_mech, mac_key, \ + decr_mech, decr_key, ciphertext, mac, plaintext, \ + mac_ctx_template, decr_ctx_template, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_MAC_VERIFY_DECRYPT_ATOMIC(pd, session, mac_mech, mac_key, \ + decr_mech, decr_key, ciphertext, mac, plaintext, \ + mac_ctx_template, decr_ctx_template, req) ( \ + (KCF_PROV_DUAL_CIPHER_MAC_OPS(pd) && \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_verify_decrypt_atomic \ + != NULL) ? \ + KCF_PROV_DUAL_CIPHER_MAC_OPS(pd)->mac_verify_decrypt_atomic( \ + (pd)->pd_prov_handle, session, mac_mech, mac_key, \ + decr_mech, decr_key, ciphertext, mac, plaintext, \ + mac_ctx_template, decr_ctx_template, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_random_number_ops(9S) entry points. + */ + +#define KCF_PROV_SEED_RANDOM(pd, session, buf, len, est, flags, req) ( \ + (KCF_PROV_RANDOM_OPS(pd) && KCF_PROV_RANDOM_OPS(pd)->seed_random) ? \ + KCF_PROV_RANDOM_OPS(pd)->seed_random((pd)->pd_prov_handle, \ + session, buf, len, est, flags, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_GENERATE_RANDOM(pd, session, buf, len, req) ( \ + (KCF_PROV_RANDOM_OPS(pd) && \ + KCF_PROV_RANDOM_OPS(pd)->generate_random) ? \ + KCF_PROV_RANDOM_OPS(pd)->generate_random((pd)->pd_prov_handle, \ + session, buf, len, req) : CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_session_ops(9S) entry points. + * + * ops_pd is the provider descriptor that supplies the ops_vector. + * pd is the descriptor that supplies the provider handle. + * Only session open/close needs two handles. 
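+ *
+ * A hypothetical call (illustrative sketch, not a real call site):
+ * opening a session through a logical provider, where logical_pd
+ * supplies the session ops vector and real_pd (a member of that
+ * logical provider) supplies the provider handle:
+ *
+ *	rv = KCF_PROV_SESSION_OPEN(logical_pd, &session_id, req, real_pd);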
+ */ + +#define KCF_PROV_SESSION_OPEN(ops_pd, session, req, pd) ( \ + (KCF_PROV_SESSION_OPS(ops_pd) && \ + KCF_PROV_SESSION_OPS(ops_pd)->session_open) ? \ + KCF_PROV_SESSION_OPS(ops_pd)->session_open((pd)->pd_prov_handle, \ + session, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SESSION_CLOSE(ops_pd, session, req, pd) ( \ + (KCF_PROV_SESSION_OPS(ops_pd) && \ + KCF_PROV_SESSION_OPS(ops_pd)->session_close) ? \ + KCF_PROV_SESSION_OPS(ops_pd)->session_close((pd)->pd_prov_handle, \ + session, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SESSION_LOGIN(pd, session, user_type, pin, len, req) ( \ + (KCF_PROV_SESSION_OPS(pd) && \ + KCF_PROV_SESSION_OPS(pd)->session_login) ? \ + KCF_PROV_SESSION_OPS(pd)->session_login((pd)->pd_prov_handle, \ + session, user_type, pin, len, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SESSION_LOGOUT(pd, session, req) ( \ + (KCF_PROV_SESSION_OPS(pd) && \ + KCF_PROV_SESSION_OPS(pd)->session_logout) ? \ + KCF_PROV_SESSION_OPS(pd)->session_logout((pd)->pd_prov_handle, \ + session, req) : CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_object_ops(9S) entry points. + */ + +#define KCF_PROV_OBJECT_CREATE(pd, session, template, count, object, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_create) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_create((pd)->pd_prov_handle, \ + session, template, count, object, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_COPY(pd, session, object, template, count, \ + new_object, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_copy) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_copy((pd)->pd_prov_handle, \ + session, object, template, count, new_object, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_DESTROY(pd, session, object, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_destroy) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_destroy((pd)->pd_prov_handle, \ + session, object, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_GET_SIZE(pd, session, object, size, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && \ + KCF_PROV_OBJECT_OPS(pd)->object_get_size) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_get_size((pd)->pd_prov_handle, \ + session, object, size, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_GET_ATTRIBUTE_VALUE(pd, session, object, template, \ + count, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && \ + KCF_PROV_OBJECT_OPS(pd)->object_get_attribute_value) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_get_attribute_value( \ + (pd)->pd_prov_handle, session, object, template, count, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_SET_ATTRIBUTE_VALUE(pd, session, object, template, \ + count, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && \ + KCF_PROV_OBJECT_OPS(pd)->object_set_attribute_value) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_set_attribute_value( \ + (pd)->pd_prov_handle, session, object, template, count, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_FIND_INIT(pd, session, template, count, ppriv, \ + req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && \ + KCF_PROV_OBJECT_OPS(pd)->object_find_init) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_find_init((pd)->pd_prov_handle, \ + session, template, count, ppriv, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_FIND(pd, ppriv, objects, max_objects, object_count, \ + req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && KCF_PROV_OBJECT_OPS(pd)->object_find) ? 
\ + KCF_PROV_OBJECT_OPS(pd)->object_find( \ + (pd)->pd_prov_handle, ppriv, objects, max_objects, object_count, \ + req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_OBJECT_FIND_FINAL(pd, ppriv, req) ( \ + (KCF_PROV_OBJECT_OPS(pd) && \ + KCF_PROV_OBJECT_OPS(pd)->object_find_final) ? \ + KCF_PROV_OBJECT_OPS(pd)->object_find_final( \ + (pd)->pd_prov_handle, ppriv, req) : CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_key_ops(9S) entry points. + */ + +#define KCF_PROV_KEY_GENERATE(pd, session, mech, template, count, object, \ + req) ( \ + (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_generate) ? \ + KCF_PROV_KEY_OPS(pd)->key_generate((pd)->pd_prov_handle, \ + session, mech, template, count, object, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_KEY_GENERATE_PAIR(pd, session, mech, pub_template, \ + pub_count, priv_template, priv_count, pub_key, priv_key, req) ( \ + (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_generate_pair) ? \ + KCF_PROV_KEY_OPS(pd)->key_generate_pair((pd)->pd_prov_handle, \ + session, mech, pub_template, pub_count, priv_template, \ + priv_count, pub_key, priv_key, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_KEY_WRAP(pd, session, mech, wrapping_key, key, wrapped_key, \ + wrapped_key_len, req) ( \ + (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_wrap) ? \ + KCF_PROV_KEY_OPS(pd)->key_wrap((pd)->pd_prov_handle, \ + session, mech, wrapping_key, key, wrapped_key, wrapped_key_len, \ + req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_KEY_UNWRAP(pd, session, mech, unwrapping_key, wrapped_key, \ + wrapped_key_len, template, count, key, req) ( \ + (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_unwrap) ? \ + KCF_PROV_KEY_OPS(pd)->key_unwrap((pd)->pd_prov_handle, \ + session, mech, unwrapping_key, wrapped_key, wrapped_key_len, \ + template, count, key, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_KEY_DERIVE(pd, session, mech, base_key, template, count, \ + key, req) ( \ + (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_derive) ? \ + KCF_PROV_KEY_OPS(pd)->key_derive((pd)->pd_prov_handle, \ + session, mech, base_key, template, count, key, req) : \ + CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_KEY_CHECK(pd, mech, key) ( \ + (KCF_PROV_KEY_OPS(pd) && KCF_PROV_KEY_OPS(pd)->key_check) ? \ + KCF_PROV_KEY_OPS(pd)->key_check((pd)->pd_prov_handle, mech, key) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_provider_management_ops(9S) entry points. + * + * ops_pd is the provider descriptor that supplies the ops_vector. + * pd is the descriptor that supplies the provider handle. + * Only ext_info needs two handles. + */ + +#define KCF_PROV_EXT_INFO(ops_pd, provext_info, req, pd) ( \ + (KCF_PROV_PROVIDER_OPS(ops_pd) && \ + KCF_PROV_PROVIDER_OPS(ops_pd)->ext_info) ? \ + KCF_PROV_PROVIDER_OPS(ops_pd)->ext_info((pd)->pd_prov_handle, \ + provext_info, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_INIT_TOKEN(pd, pin, pin_len, label, req) ( \ + (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->init_token) ? \ + KCF_PROV_PROVIDER_OPS(pd)->init_token((pd)->pd_prov_handle, \ + pin, pin_len, label, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_INIT_PIN(pd, session, pin, pin_len, req) ( \ + (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->init_pin) ? \ + KCF_PROV_PROVIDER_OPS(pd)->init_pin((pd)->pd_prov_handle, \ + session, pin, pin_len, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_SET_PIN(pd, session, old_pin, old_len, new_pin, new_len, \ + req) ( \ + (KCF_PROV_PROVIDER_OPS(pd) && KCF_PROV_PROVIDER_OPS(pd)->set_pin) ? 
\ + KCF_PROV_PROVIDER_OPS(pd)->set_pin((pd)->pd_prov_handle, \ + session, old_pin, old_len, new_pin, new_len, req) : \ + CRYPTO_NOT_SUPPORTED) + +/* + * Wrappers for crypto_nostore_key_ops(9S) entry points. + */ + +#define KCF_PROV_NOSTORE_KEY_GENERATE(pd, session, mech, template, count, \ + out_template, out_count, req) ( \ + (KCF_PROV_NOSTORE_KEY_OPS(pd) && \ + KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate) ? \ + KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate( \ + (pd)->pd_prov_handle, session, mech, template, count, \ + out_template, out_count, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_NOSTORE_KEY_GENERATE_PAIR(pd, session, mech, pub_template, \ + pub_count, priv_template, priv_count, out_pub_template, \ + out_pub_count, out_priv_template, out_priv_count, req) ( \ + (KCF_PROV_NOSTORE_KEY_OPS(pd) && \ + KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate_pair) ? \ + KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_generate_pair( \ + (pd)->pd_prov_handle, session, mech, pub_template, pub_count, \ + priv_template, priv_count, out_pub_template, out_pub_count, \ + out_priv_template, out_priv_count, req) : CRYPTO_NOT_SUPPORTED) + +#define KCF_PROV_NOSTORE_KEY_DERIVE(pd, session, mech, base_key, template, \ + count, out_template, out_count, req) ( \ + (KCF_PROV_NOSTORE_KEY_OPS(pd) && \ + KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_derive) ? \ + KCF_PROV_NOSTORE_KEY_OPS(pd)->nostore_key_derive( \ + (pd)->pd_prov_handle, session, mech, base_key, template, count, \ + out_template, out_count, req) : CRYPTO_NOT_SUPPORTED) + +/* + * The following routines are exported by the kcf module (/kernel/misc/kcf) + * to the crypto and cryptoadmin modules. + */ + +/* Digest/mac/cipher entry points that take a provider descriptor and session */ +extern int crypto_digest_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +extern int crypto_mac_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +extern int crypto_encrypt_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +extern int crypto_decrypt_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + + +/* Other private digest/mac/cipher entry points not exported through k-API */ +extern int crypto_digest_key_prov(crypto_context_t, crypto_key_t *, + crypto_call_req_t *); + +/* Private sign entry points exported by KCF */ +extern int crypto_sign_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +extern int crypto_sign_recover_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +/* Private verify entry points exported by KCF */ +extern int crypto_verify_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +extern int crypto_verify_recover_single(crypto_context_t, crypto_data_t *, + crypto_data_t *, crypto_call_req_t *); + +/* Private dual operations entry points exported by KCF */ +extern int crypto_digest_encrypt_update(crypto_context_t, crypto_context_t, + crypto_data_t *, crypto_data_t *, crypto_call_req_t *); +extern int crypto_decrypt_digest_update(crypto_context_t, crypto_context_t, + crypto_data_t *, crypto_data_t *, crypto_call_req_t *); +extern int crypto_sign_encrypt_update(crypto_context_t, crypto_context_t, + crypto_data_t *, crypto_data_t *, crypto_call_req_t *); +extern int crypto_decrypt_verify_update(crypto_context_t, crypto_context_t, + crypto_data_t *, crypto_data_t *, crypto_call_req_t *); + +/* Random Number 
Generation */ +int crypto_seed_random(crypto_provider_handle_t provider, uchar_t *buf, + size_t len, crypto_call_req_t *req); +int crypto_generate_random(crypto_provider_handle_t provider, uchar_t *buf, + size_t len, crypto_call_req_t *req); + +/* Provider Management */ +int crypto_get_provider_info(crypto_provider_id_t id, + crypto_provider_info_t **info, crypto_call_req_t *req); +int crypto_get_provider_mechanisms(crypto_minor_t *, crypto_provider_id_t id, + uint_t *count, crypto_mech_name_t **list); +int crypto_init_token(crypto_provider_handle_t provider, char *pin, + size_t pin_len, char *label, crypto_call_req_t *); +int crypto_init_pin(crypto_provider_handle_t provider, char *pin, + size_t pin_len, crypto_call_req_t *req); +int crypto_set_pin(crypto_provider_handle_t provider, char *old_pin, + size_t old_len, char *new_pin, size_t new_len, crypto_call_req_t *req); +void crypto_free_provider_list(crypto_provider_entry_t *list, uint_t count); +void crypto_free_provider_info(crypto_provider_info_t *info); + +/* Administrative */ +int crypto_get_dev_list(uint_t *count, crypto_dev_list_entry_t **list); +int crypto_get_soft_list(uint_t *count, char **list, size_t *len); +int crypto_get_dev_info(char *name, uint_t instance, uint_t *count, + crypto_mech_name_t **list); +int crypto_get_soft_info(caddr_t name, uint_t *count, + crypto_mech_name_t **list); +int crypto_load_dev_disabled(char *name, uint_t instance, uint_t count, + crypto_mech_name_t *list); +int crypto_load_soft_disabled(caddr_t name, uint_t count, + crypto_mech_name_t *list); +int crypto_unload_soft_module(caddr_t path); +int crypto_load_soft_config(caddr_t name, uint_t count, + crypto_mech_name_t *list); +int crypto_load_door(uint_t did); +void crypto_free_mech_list(crypto_mech_name_t *list, uint_t count); +void crypto_free_dev_list(crypto_dev_list_entry_t *list, uint_t count); + +/* Miscellaneous */ +int crypto_get_mechanism_number(caddr_t name, crypto_mech_type_t *number); +int crypto_get_function_list(crypto_provider_id_t id, + crypto_function_list_t **list, int kmflag); +void crypto_free_function_list(crypto_function_list_t *list); +int crypto_build_permitted_mech_names(kcf_provider_desc_t *, + crypto_mech_name_t **, uint_t *, int); +extern void kcf_destroy_mech_tabs(void); +extern void kcf_init_mech_tabs(void); +extern int kcf_add_mech_provider(short, kcf_provider_desc_t *, + kcf_prov_mech_desc_t **); +extern void kcf_remove_mech_provider(char *, kcf_provider_desc_t *); +extern int kcf_get_mech_entry(crypto_mech_type_t, kcf_mech_entry_t **); +extern kcf_provider_desc_t *kcf_alloc_provider_desc(crypto_provider_info_t *); +extern void kcf_provider_zero_refcnt(kcf_provider_desc_t *); +extern void kcf_free_provider_desc(kcf_provider_desc_t *); +extern void kcf_soft_config_init(void); +extern int get_sw_provider_for_mech(crypto_mech_name_t, char **); +extern crypto_mech_type_t crypto_mech2id_common(char *, boolean_t); +extern void undo_register_provider(kcf_provider_desc_t *, boolean_t); +extern void redo_register_provider(kcf_provider_desc_t *); +extern void kcf_rnd_init(void); +extern boolean_t kcf_rngprov_check(void); +extern int kcf_rnd_get_pseudo_bytes(uint8_t *, size_t); +extern int kcf_rnd_get_bytes(uint8_t *, size_t, boolean_t, boolean_t); +extern int random_add_pseudo_entropy(uint8_t *, size_t, uint_t); +extern void kcf_rnd_schedule_timeout(boolean_t); +extern int crypto_uio_data(crypto_data_t *, uchar_t *, int, cmd_type_t, + void *, void (*update)(void)); +extern int crypto_mblk_data(crypto_data_t *, uchar_t *, int, 
cmd_type_t, + void *, void (*update)(void)); +extern int crypto_put_output_data(uchar_t *, crypto_data_t *, int); +extern int crypto_get_input_data(crypto_data_t *, uchar_t **, uchar_t *); +extern int crypto_copy_key_to_ctx(crypto_key_t *, crypto_key_t **, size_t *, + int kmflag); +extern int crypto_digest_data(crypto_data_t *, void *, uchar_t *, + void (*update)(void), void (*final)(void), uchar_t); +extern int crypto_update_iov(void *, crypto_data_t *, crypto_data_t *, + int (*cipher)(void *, caddr_t, size_t, crypto_data_t *), + void (*copy_block)(uint8_t *, uint64_t *)); +extern int crypto_update_uio(void *, crypto_data_t *, crypto_data_t *, + int (*cipher)(void *, caddr_t, size_t, crypto_data_t *), + void (*copy_block)(uint8_t *, uint64_t *)); +extern int crypto_update_mp(void *, crypto_data_t *, crypto_data_t *, + int (*cipher)(void *, caddr_t, size_t, crypto_data_t *), + void (*copy_block)(uint8_t *, uint64_t *)); +extern int crypto_get_key_attr(crypto_key_t *, crypto_attr_type_t, uchar_t **, + ssize_t *); + +/* Access to the provider's table */ +extern void kcf_prov_tab_destroy(void); +extern void kcf_prov_tab_init(void); +extern int kcf_prov_tab_add_provider(kcf_provider_desc_t *); +extern int kcf_prov_tab_rem_provider(crypto_provider_id_t); +extern kcf_provider_desc_t *kcf_prov_tab_lookup_by_name(char *); +extern kcf_provider_desc_t *kcf_prov_tab_lookup_by_dev(char *, uint_t); +extern int kcf_get_hw_prov_tab(uint_t *, kcf_provider_desc_t ***, int, + char *, uint_t, boolean_t); +extern int kcf_get_slot_list(uint_t *, kcf_provider_desc_t ***, boolean_t); +extern void kcf_free_provider_tab(uint_t, kcf_provider_desc_t **); +extern kcf_provider_desc_t *kcf_prov_tab_lookup(crypto_provider_id_t); +extern int kcf_get_sw_prov(crypto_mech_type_t, kcf_provider_desc_t **, + kcf_mech_entry_t **, boolean_t); + +/* Access to the policy table */ +extern boolean_t is_mech_disabled(kcf_provider_desc_t *, crypto_mech_name_t); +extern boolean_t is_mech_disabled_byname(crypto_provider_type_t, char *, + uint_t, crypto_mech_name_t); +extern void kcf_policy_tab_init(void); +extern void kcf_policy_free_desc(kcf_policy_desc_t *); +extern void kcf_policy_remove_by_name(char *, uint_t *, crypto_mech_name_t **); +extern void kcf_policy_remove_by_dev(char *, uint_t, uint_t *, + crypto_mech_name_t **); +extern kcf_policy_desc_t *kcf_policy_lookup_by_name(char *); +extern kcf_policy_desc_t *kcf_policy_lookup_by_dev(char *, uint_t); +extern int kcf_policy_load_soft_disabled(char *, uint_t, crypto_mech_name_t *, + uint_t *, crypto_mech_name_t **); +extern int kcf_policy_load_dev_disabled(char *, uint_t, uint_t, + crypto_mech_name_t *, uint_t *, crypto_mech_name_t **); +extern boolean_t in_soft_config_list(char *); + + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_IMPL_H */ diff --git a/module/icp/include/sys/crypto/ioctl.h b/module/icp/include/sys/crypto/ioctl.h new file mode 100644 index 000000000000..6e371e343945 --- /dev/null +++ b/module/icp/include/sys/crypto/ioctl.h @@ -0,0 +1,1480 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CRYPTO_IOCTL_H +#define _SYS_CRYPTO_IOCTL_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <sys/zfs_context.h> +#include <sys/crypto/api.h> +#include <sys/crypto/spi.h> +#include <sys/crypto/common.h> + +#define CRYPTO_MAX_ATTRIBUTE_COUNT 128 + +#define CRYPTO_IOFLAGS_RW_SESSION 0x00000001 + +#define CRYPTO(x) (('y' << 8) | (x)) + +#define MAX_NUM_THRESHOLD 7 + +/* the PKCS11 Mechanisms */ +#define CKM_RC4 0x00000111 +#define CKM_DES3_ECB 0x00000132 +#define CKM_DES3_CBC 0x00000133 +#define CKM_MD5 0x00000210 +#define CKM_SHA_1 0x00000220 +#define CKM_AES_ECB 0x00001081 +#define CKM_AES_CBC 0x00001082 + +/* + * General Purpose Ioctls + */ + +typedef struct fl_mechs_threshold { + int mech_type; + uint32_t mech_threshold; +} fl_mechs_threshold_t; + +typedef struct crypto_function_list { + boolean_t fl_digest_init; + boolean_t fl_digest; + boolean_t fl_digest_update; + boolean_t fl_digest_key; + boolean_t fl_digest_final; + + boolean_t fl_encrypt_init; + boolean_t fl_encrypt; + boolean_t fl_encrypt_update; + boolean_t fl_encrypt_final; + + boolean_t fl_decrypt_init; + boolean_t fl_decrypt; + boolean_t fl_decrypt_update; + boolean_t fl_decrypt_final; + + boolean_t fl_mac_init; + boolean_t fl_mac; + boolean_t fl_mac_update; + boolean_t fl_mac_final; + + boolean_t fl_sign_init; + boolean_t fl_sign; + boolean_t fl_sign_update; + boolean_t fl_sign_final; + boolean_t fl_sign_recover_init; + boolean_t fl_sign_recover; + + boolean_t fl_verify_init; + boolean_t fl_verify; + boolean_t fl_verify_update; + boolean_t fl_verify_final; + boolean_t fl_verify_recover_init; + boolean_t fl_verify_recover; + + boolean_t fl_digest_encrypt_update; + boolean_t fl_decrypt_digest_update; + boolean_t fl_sign_encrypt_update; + boolean_t fl_decrypt_verify_update; + + boolean_t fl_seed_random; + boolean_t fl_generate_random; + + boolean_t fl_session_open; + boolean_t fl_session_close; + boolean_t fl_session_login; + boolean_t fl_session_logout; + + boolean_t fl_object_create; + boolean_t fl_object_copy; + boolean_t fl_object_destroy; + boolean_t fl_object_get_size; + boolean_t fl_object_get_attribute_value; + boolean_t fl_object_set_attribute_value; + boolean_t fl_object_find_init; + boolean_t fl_object_find; + boolean_t fl_object_find_final; + + boolean_t fl_key_generate; + boolean_t fl_key_generate_pair; + boolean_t fl_key_wrap; + boolean_t fl_key_unwrap; + boolean_t fl_key_derive; + + boolean_t fl_init_token; + boolean_t fl_init_pin; + boolean_t fl_set_pin; + + boolean_t prov_is_limited; + uint32_t prov_hash_threshold; + uint32_t prov_hash_limit; + + int total_threshold_count; + fl_mechs_threshold_t fl_threshold[MAX_NUM_THRESHOLD]; +} crypto_function_list_t; + +typedef struct crypto_get_function_list { + uint_t fl_return_value; + crypto_provider_id_t fl_provider_id; + crypto_function_list_t fl_list; +} crypto_get_function_list_t; + +typedef struct crypto_get_mechanism_number { + uint_t pn_return_value; + caddr_t pn_mechanism_string; + size_t pn_mechanism_len; + crypto_mech_type_t pn_internal_number; +} crypto_get_mechanism_number_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +#if 
_LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_get_mechanism_number32 { + uint32_t pn_return_value; + caddr32_t pn_mechanism_string; + size32_t pn_mechanism_len; + crypto_mech_type_t pn_internal_number; +} crypto_get_mechanism_number32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_GET_FUNCTION_LIST CRYPTO(20) +#define CRYPTO_GET_MECHANISM_NUMBER CRYPTO(21) + +/* + * Session Ioctls + */ + +typedef uint32_t crypto_flags_t; + +typedef struct crypto_open_session { + uint_t os_return_value; + crypto_session_id_t os_session; + crypto_flags_t os_flags; + crypto_provider_id_t os_provider_id; +} crypto_open_session_t; + +typedef struct crypto_close_session { + uint_t cs_return_value; + crypto_session_id_t cs_session; +} crypto_close_session_t; + +typedef struct crypto_close_all_sessions { + uint_t as_return_value; + crypto_provider_id_t as_provider_id; +} crypto_close_all_sessions_t; + +#define CRYPTO_OPEN_SESSION CRYPTO(30) +#define CRYPTO_CLOSE_SESSION CRYPTO(31) +#define CRYPTO_CLOSE_ALL_SESSIONS CRYPTO(32) + +/* + * Login Ioctls + */ +typedef struct crypto_login { + uint_t co_return_value; + crypto_session_id_t co_session; + uint_t co_user_type; + uint_t co_pin_len; + caddr_t co_pin; +} crypto_login_t; + +typedef struct crypto_logout { + uint_t cl_return_value; + crypto_session_id_t cl_session; +} crypto_logout_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_login32 { + uint32_t co_return_value; + crypto_session_id_t co_session; + uint32_t co_user_type; + uint32_t co_pin_len; + caddr32_t co_pin; +} crypto_login32_t; + +typedef struct crypto_logout32 { + uint32_t cl_return_value; + crypto_session_id_t cl_session; +} crypto_logout32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_LOGIN CRYPTO(40) +#define CRYPTO_LOGOUT CRYPTO(41) + +/* + * Cryptographic Ioctls + */ +typedef struct crypto_encrypt { + uint_t ce_return_value; + crypto_session_id_t ce_session; + size_t ce_datalen; + caddr_t ce_databuf; + size_t ce_encrlen; + caddr_t ce_encrbuf; + uint_t ce_flags; +} crypto_encrypt_t; + +typedef struct crypto_encrypt_init { + uint_t ei_return_value; + crypto_session_id_t ei_session; + crypto_mechanism_t ei_mech; + crypto_key_t ei_key; +} crypto_encrypt_init_t; + +typedef struct crypto_encrypt_update { + uint_t eu_return_value; + crypto_session_id_t eu_session; + size_t eu_datalen; + caddr_t eu_databuf; + size_t eu_encrlen; + caddr_t eu_encrbuf; +} crypto_encrypt_update_t; + +typedef struct crypto_encrypt_final { + uint_t ef_return_value; + crypto_session_id_t ef_session; + size_t ef_encrlen; + caddr_t ef_encrbuf; +} crypto_encrypt_final_t; + +typedef struct crypto_decrypt { + uint_t cd_return_value; + crypto_session_id_t cd_session; + size_t cd_encrlen; + caddr_t cd_encrbuf; + size_t cd_datalen; + caddr_t cd_databuf; + uint_t cd_flags; +} crypto_decrypt_t; + +typedef struct crypto_decrypt_init { + uint_t di_return_value; + crypto_session_id_t di_session; + crypto_mechanism_t di_mech; + crypto_key_t di_key; +} crypto_decrypt_init_t; + +typedef struct crypto_decrypt_update { + uint_t du_return_value; + crypto_session_id_t du_session; + size_t du_encrlen; + caddr_t du_encrbuf; + size_t du_datalen; + caddr_t du_databuf; +} crypto_decrypt_update_t; + +typedef struct crypto_decrypt_final { + uint_t df_return_value; + crypto_session_id_t df_session; + size_t df_datalen; + caddr_t df_databuf; +} 
crypto_decrypt_final_t; + +typedef struct crypto_digest { + uint_t cd_return_value; + crypto_session_id_t cd_session; + size_t cd_datalen; + caddr_t cd_databuf; + size_t cd_digestlen; + caddr_t cd_digestbuf; +} crypto_digest_t; + +typedef struct crypto_digest_init { + uint_t di_return_value; + crypto_session_id_t di_session; + crypto_mechanism_t di_mech; +} crypto_digest_init_t; + +typedef struct crypto_digest_update { + uint_t du_return_value; + crypto_session_id_t du_session; + size_t du_datalen; + caddr_t du_databuf; +} crypto_digest_update_t; + +typedef struct crypto_digest_key { + uint_t dk_return_value; + crypto_session_id_t dk_session; + crypto_key_t dk_key; +} crypto_digest_key_t; + +typedef struct crypto_digest_final { + uint_t df_return_value; + crypto_session_id_t df_session; + size_t df_digestlen; + caddr_t df_digestbuf; +} crypto_digest_final_t; + +typedef struct crypto_mac { + uint_t cm_return_value; + crypto_session_id_t cm_session; + size_t cm_datalen; + caddr_t cm_databuf; + size_t cm_maclen; + caddr_t cm_macbuf; +} crypto_mac_t; + +typedef struct crypto_mac_init { + uint_t mi_return_value; + crypto_session_id_t mi_session; + crypto_mechanism_t mi_mech; + crypto_key_t mi_key; +} crypto_mac_init_t; + +typedef struct crypto_mac_update { + uint_t mu_return_value; + crypto_session_id_t mu_session; + size_t mu_datalen; + caddr_t mu_databuf; +} crypto_mac_update_t; + +typedef struct crypto_mac_final { + uint_t mf_return_value; + crypto_session_id_t mf_session; + size_t mf_maclen; + caddr_t mf_macbuf; +} crypto_mac_final_t; + +typedef struct crypto_sign { + uint_t cs_return_value; + crypto_session_id_t cs_session; + size_t cs_datalen; + caddr_t cs_databuf; + size_t cs_signlen; + caddr_t cs_signbuf; +} crypto_sign_t; + +typedef struct crypto_sign_init { + uint_t si_return_value; + crypto_session_id_t si_session; + crypto_mechanism_t si_mech; + crypto_key_t si_key; +} crypto_sign_init_t; + +typedef struct crypto_sign_update { + uint_t su_return_value; + crypto_session_id_t su_session; + size_t su_datalen; + caddr_t su_databuf; +} crypto_sign_update_t; + +typedef struct crypto_sign_final { + uint_t sf_return_value; + crypto_session_id_t sf_session; + size_t sf_signlen; + caddr_t sf_signbuf; +} crypto_sign_final_t; + +typedef struct crypto_sign_recover_init { + uint_t ri_return_value; + crypto_session_id_t ri_session; + crypto_mechanism_t ri_mech; + crypto_key_t ri_key; +} crypto_sign_recover_init_t; + +typedef struct crypto_sign_recover { + uint_t sr_return_value; + crypto_session_id_t sr_session; + size_t sr_datalen; + caddr_t sr_databuf; + size_t sr_signlen; + caddr_t sr_signbuf; +} crypto_sign_recover_t; + +typedef struct crypto_verify { + uint_t cv_return_value; + crypto_session_id_t cv_session; + size_t cv_datalen; + caddr_t cv_databuf; + size_t cv_signlen; + caddr_t cv_signbuf; +} crypto_verify_t; + +typedef struct crypto_verify_init { + uint_t vi_return_value; + crypto_session_id_t vi_session; + crypto_mechanism_t vi_mech; + crypto_key_t vi_key; +} crypto_verify_init_t; + +typedef struct crypto_verify_update { + uint_t vu_return_value; + crypto_session_id_t vu_session; + size_t vu_datalen; + caddr_t vu_databuf; +} crypto_verify_update_t; + +typedef struct crypto_verify_final { + uint_t vf_return_value; + crypto_session_id_t vf_session; + size_t vf_signlen; + caddr_t vf_signbuf; +} crypto_verify_final_t; + +typedef struct crypto_verify_recover_init { + uint_t ri_return_value; + crypto_session_id_t ri_session; + crypto_mechanism_t ri_mech; + crypto_key_t ri_key; +} 
crypto_verify_recover_init_t; + +typedef struct crypto_verify_recover { + uint_t vr_return_value; + crypto_session_id_t vr_session; + size_t vr_signlen; + caddr_t vr_signbuf; + size_t vr_datalen; + caddr_t vr_databuf; +} crypto_verify_recover_t; + +typedef struct crypto_digest_encrypt_update { + uint_t eu_return_value; + crypto_session_id_t eu_session; + size_t eu_datalen; + caddr_t eu_databuf; + size_t eu_encrlen; + caddr_t eu_encrbuf; +} crypto_digest_encrypt_update_t; + +typedef struct crypto_decrypt_digest_update { + uint_t du_return_value; + crypto_session_id_t du_session; + size_t du_encrlen; + caddr_t du_encrbuf; + size_t du_datalen; + caddr_t du_databuf; +} crypto_decrypt_digest_update_t; + +typedef struct crypto_sign_encrypt_update { + uint_t eu_return_value; + crypto_session_id_t eu_session; + size_t eu_datalen; + caddr_t eu_databuf; + size_t eu_encrlen; + caddr_t eu_encrbuf; +} crypto_sign_encrypt_update_t; + +typedef struct crypto_decrypt_verify_update { + uint_t vu_return_value; + crypto_session_id_t vu_session; + size_t vu_encrlen; + caddr_t vu_encrbuf; + size_t vu_datalen; + caddr_t vu_databuf; +} crypto_decrypt_verify_update_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_encrypt32 { + uint32_t ce_return_value; + crypto_session_id_t ce_session; + size32_t ce_datalen; + caddr32_t ce_databuf; + size32_t ce_encrlen; + caddr32_t ce_encrbuf; + uint32_t ce_flags; +} crypto_encrypt32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_encrypt_init32 { + uint32_t ei_return_value; + crypto_session_id_t ei_session; + crypto_mechanism32_t ei_mech; + crypto_key32_t ei_key; +} crypto_encrypt_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_encrypt_update32 { + uint32_t eu_return_value; + crypto_session_id_t eu_session; + size32_t eu_datalen; + caddr32_t eu_databuf; + size32_t eu_encrlen; + caddr32_t eu_encrbuf; +} crypto_encrypt_update32_t; + +typedef struct crypto_encrypt_final32 { + uint32_t ef_return_value; + crypto_session_id_t ef_session; + size32_t ef_encrlen; + caddr32_t ef_encrbuf; +} crypto_encrypt_final32_t; + +typedef struct crypto_decrypt32 { + uint32_t cd_return_value; + crypto_session_id_t cd_session; + size32_t cd_encrlen; + caddr32_t cd_encrbuf; + size32_t cd_datalen; + caddr32_t cd_databuf; + uint32_t cd_flags; +} crypto_decrypt32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_decrypt_init32 { + uint32_t di_return_value; + crypto_session_id_t di_session; + crypto_mechanism32_t di_mech; + crypto_key32_t di_key; +} crypto_decrypt_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_decrypt_update32 { + uint32_t du_return_value; + crypto_session_id_t du_session; + size32_t du_encrlen; + caddr32_t du_encrbuf; + size32_t du_datalen; + caddr32_t du_databuf; +} crypto_decrypt_update32_t; + +typedef struct crypto_decrypt_final32 { + uint32_t df_return_value; + crypto_session_id_t df_session; + size32_t df_datalen; + caddr32_t df_databuf; +} crypto_decrypt_final32_t; + +typedef struct crypto_digest32 { + uint32_t cd_return_value; + crypto_session_id_t cd_session; + size32_t cd_datalen; + caddr32_t cd_databuf; + size32_t cd_digestlen; + caddr32_t cd_digestbuf; +} crypto_digest32_t; + +typedef struct crypto_digest_init32 { + uint32_t di_return_value; + crypto_session_id_t di_session; + 
crypto_mechanism32_t di_mech; +} crypto_digest_init32_t; + +typedef struct crypto_digest_update32 { + uint32_t du_return_value; + crypto_session_id_t du_session; + size32_t du_datalen; + caddr32_t du_databuf; +} crypto_digest_update32_t; + +typedef struct crypto_digest_key32 { + uint32_t dk_return_value; + crypto_session_id_t dk_session; + crypto_key32_t dk_key; +} crypto_digest_key32_t; + +typedef struct crypto_digest_final32 { + uint32_t df_return_value; + crypto_session_id_t df_session; + size32_t df_digestlen; + caddr32_t df_digestbuf; +} crypto_digest_final32_t; + +typedef struct crypto_mac32 { + uint32_t cm_return_value; + crypto_session_id_t cm_session; + size32_t cm_datalen; + caddr32_t cm_databuf; + size32_t cm_maclen; + caddr32_t cm_macbuf; +} crypto_mac32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_mac_init32 { + uint32_t mi_return_value; + crypto_session_id_t mi_session; + crypto_mechanism32_t mi_mech; + crypto_key32_t mi_key; +} crypto_mac_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_mac_update32 { + uint32_t mu_return_value; + crypto_session_id_t mu_session; + size32_t mu_datalen; + caddr32_t mu_databuf; +} crypto_mac_update32_t; + +typedef struct crypto_mac_final32 { + uint32_t mf_return_value; + crypto_session_id_t mf_session; + size32_t mf_maclen; + caddr32_t mf_macbuf; +} crypto_mac_final32_t; + +typedef struct crypto_sign32 { + uint32_t cs_return_value; + crypto_session_id_t cs_session; + size32_t cs_datalen; + caddr32_t cs_databuf; + size32_t cs_signlen; + caddr32_t cs_signbuf; +} crypto_sign32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_sign_init32 { + uint32_t si_return_value; + crypto_session_id_t si_session; + crypto_mechanism32_t si_mech; + crypto_key32_t si_key; +} crypto_sign_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_sign_update32 { + uint32_t su_return_value; + crypto_session_id_t su_session; + size32_t su_datalen; + caddr32_t su_databuf; +} crypto_sign_update32_t; + +typedef struct crypto_sign_final32 { + uint32_t sf_return_value; + crypto_session_id_t sf_session; + size32_t sf_signlen; + caddr32_t sf_signbuf; +} crypto_sign_final32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_sign_recover_init32 { + uint32_t ri_return_value; + crypto_session_id_t ri_session; + crypto_mechanism32_t ri_mech; + crypto_key32_t ri_key; +} crypto_sign_recover_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_sign_recover32 { + uint32_t sr_return_value; + crypto_session_id_t sr_session; + size32_t sr_datalen; + caddr32_t sr_databuf; + size32_t sr_signlen; + caddr32_t sr_signbuf; +} crypto_sign_recover32_t; + +typedef struct crypto_verify32 { + uint32_t cv_return_value; + crypto_session_id_t cv_session; + size32_t cv_datalen; + caddr32_t cv_databuf; + size32_t cv_signlen; + caddr32_t cv_signbuf; +} crypto_verify32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_verify_init32 { + uint32_t vi_return_value; + crypto_session_id_t vi_session; + crypto_mechanism32_t vi_mech; + crypto_key32_t vi_key; +} crypto_verify_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && 
_LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_verify_update32 { + uint32_t vu_return_value; + crypto_session_id_t vu_session; + size32_t vu_datalen; + caddr32_t vu_databuf; +} crypto_verify_update32_t; + +typedef struct crypto_verify_final32 { + uint32_t vf_return_value; + crypto_session_id_t vf_session; + size32_t vf_signlen; + caddr32_t vf_signbuf; +} crypto_verify_final32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_verify_recover_init32 { + uint32_t ri_return_value; + crypto_session_id_t ri_session; + crypto_mechanism32_t ri_mech; + crypto_key32_t ri_key; +} crypto_verify_recover_init32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_verify_recover32 { + uint32_t vr_return_value; + crypto_session_id_t vr_session; + size32_t vr_signlen; + caddr32_t vr_signbuf; + size32_t vr_datalen; + caddr32_t vr_databuf; +} crypto_verify_recover32_t; + +typedef struct crypto_digest_encrypt_update32 { + uint32_t eu_return_value; + crypto_session_id_t eu_session; + size32_t eu_datalen; + caddr32_t eu_databuf; + size32_t eu_encrlen; + caddr32_t eu_encrbuf; +} crypto_digest_encrypt_update32_t; + +typedef struct crypto_decrypt_digest_update32 { + uint32_t du_return_value; + crypto_session_id_t du_session; + size32_t du_encrlen; + caddr32_t du_encrbuf; + size32_t du_datalen; + caddr32_t du_databuf; +} crypto_decrypt_digest_update32_t; + +typedef struct crypto_sign_encrypt_update32 { + uint32_t eu_return_value; + crypto_session_id_t eu_session; + size32_t eu_datalen; + caddr32_t eu_databuf; + size32_t eu_encrlen; + caddr32_t eu_encrbuf; +} crypto_sign_encrypt_update32_t; + +typedef struct crypto_decrypt_verify_update32 { + uint32_t vu_return_value; + crypto_session_id_t vu_session; + size32_t vu_encrlen; + caddr32_t vu_encrbuf; + size32_t vu_datalen; + caddr32_t vu_databuf; +} crypto_decrypt_verify_update32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_ENCRYPT CRYPTO(50) +#define CRYPTO_ENCRYPT_INIT CRYPTO(51) +#define CRYPTO_ENCRYPT_UPDATE CRYPTO(52) +#define CRYPTO_ENCRYPT_FINAL CRYPTO(53) +#define CRYPTO_DECRYPT CRYPTO(54) +#define CRYPTO_DECRYPT_INIT CRYPTO(55) +#define CRYPTO_DECRYPT_UPDATE CRYPTO(56) +#define CRYPTO_DECRYPT_FINAL CRYPTO(57) + +#define CRYPTO_DIGEST CRYPTO(58) +#define CRYPTO_DIGEST_INIT CRYPTO(59) +#define CRYPTO_DIGEST_UPDATE CRYPTO(60) +#define CRYPTO_DIGEST_KEY CRYPTO(61) +#define CRYPTO_DIGEST_FINAL CRYPTO(62) +#define CRYPTO_MAC CRYPTO(63) +#define CRYPTO_MAC_INIT CRYPTO(64) +#define CRYPTO_MAC_UPDATE CRYPTO(65) +#define CRYPTO_MAC_FINAL CRYPTO(66) + +#define CRYPTO_SIGN CRYPTO(67) +#define CRYPTO_SIGN_INIT CRYPTO(68) +#define CRYPTO_SIGN_UPDATE CRYPTO(69) +#define CRYPTO_SIGN_FINAL CRYPTO(70) +#define CRYPTO_SIGN_RECOVER_INIT CRYPTO(71) +#define CRYPTO_SIGN_RECOVER CRYPTO(72) +#define CRYPTO_VERIFY CRYPTO(73) +#define CRYPTO_VERIFY_INIT CRYPTO(74) +#define CRYPTO_VERIFY_UPDATE CRYPTO(75) +#define CRYPTO_VERIFY_FINAL CRYPTO(76) +#define CRYPTO_VERIFY_RECOVER_INIT CRYPTO(77) +#define CRYPTO_VERIFY_RECOVER CRYPTO(78) + +#define CRYPTO_DIGEST_ENCRYPT_UPDATE CRYPTO(79) +#define CRYPTO_DECRYPT_DIGEST_UPDATE CRYPTO(80) +#define CRYPTO_SIGN_ENCRYPT_UPDATE CRYPTO(81) +#define CRYPTO_DECRYPT_VERIFY_UPDATE CRYPTO(82) + +/* + * Random Number Ioctls + */ +typedef struct crypto_seed_random { + uint_t sr_return_value; + crypto_session_id_t sr_session; + size_t sr_seedlen; + caddr_t sr_seedbuf; +} 
crypto_seed_random_t; + +typedef struct crypto_generate_random { + uint_t gr_return_value; + crypto_session_id_t gr_session; + caddr_t gr_buf; + size_t gr_buflen; +} crypto_generate_random_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_seed_random32 { + uint32_t sr_return_value; + crypto_session_id_t sr_session; + size32_t sr_seedlen; + caddr32_t sr_seedbuf; +} crypto_seed_random32_t; + +typedef struct crypto_generate_random32 { + uint32_t gr_return_value; + crypto_session_id_t gr_session; + caddr32_t gr_buf; + size32_t gr_buflen; +} crypto_generate_random32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_SEED_RANDOM CRYPTO(90) +#define CRYPTO_GENERATE_RANDOM CRYPTO(91) + +/* + * Object Management Ioctls + */ +typedef struct crypto_object_create { + uint_t oc_return_value; + crypto_session_id_t oc_session; + crypto_object_id_t oc_handle; + uint_t oc_count; + caddr_t oc_attributes; +} crypto_object_create_t; + +typedef struct crypto_object_copy { + uint_t oc_return_value; + crypto_session_id_t oc_session; + crypto_object_id_t oc_handle; + crypto_object_id_t oc_new_handle; + uint_t oc_count; + caddr_t oc_new_attributes; +} crypto_object_copy_t; + +typedef struct crypto_object_destroy { + uint_t od_return_value; + crypto_session_id_t od_session; + crypto_object_id_t od_handle; +} crypto_object_destroy_t; + +typedef struct crypto_object_get_attribute_value { + uint_t og_return_value; + crypto_session_id_t og_session; + crypto_object_id_t og_handle; + uint_t og_count; + caddr_t og_attributes; +} crypto_object_get_attribute_value_t; + +typedef struct crypto_object_get_size { + uint_t gs_return_value; + crypto_session_id_t gs_session; + crypto_object_id_t gs_handle; + size_t gs_size; +} crypto_object_get_size_t; + +typedef struct crypto_object_set_attribute_value { + uint_t sa_return_value; + crypto_session_id_t sa_session; + crypto_object_id_t sa_handle; + uint_t sa_count; + caddr_t sa_attributes; +} crypto_object_set_attribute_value_t; + +typedef struct crypto_object_find_init { + uint_t fi_return_value; + crypto_session_id_t fi_session; + uint_t fi_count; + caddr_t fi_attributes; +} crypto_object_find_init_t; + +typedef struct crypto_object_find_update { + uint_t fu_return_value; + crypto_session_id_t fu_session; + uint_t fu_max_count; + uint_t fu_count; + caddr_t fu_handles; +} crypto_object_find_update_t; + +typedef struct crypto_object_find_final { + uint_t ff_return_value; + crypto_session_id_t ff_session; +} crypto_object_find_final_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_object_create32 { + uint32_t oc_return_value; + crypto_session_id_t oc_session; + crypto_object_id_t oc_handle; + uint32_t oc_count; + caddr32_t oc_attributes; +} crypto_object_create32_t; + +typedef struct crypto_object_copy32 { + uint32_t oc_return_value; + crypto_session_id_t oc_session; + crypto_object_id_t oc_handle; + crypto_object_id_t oc_new_handle; + uint32_t oc_count; + caddr32_t oc_new_attributes; +} crypto_object_copy32_t; + +typedef struct crypto_object_destroy32 { + uint32_t od_return_value; + crypto_session_id_t od_session; + crypto_object_id_t od_handle; +} crypto_object_destroy32_t; + +typedef struct crypto_object_get_attribute_value32 { + uint32_t og_return_value; + crypto_session_id_t og_session; + crypto_object_id_t og_handle; + uint32_t og_count; + caddr32_t og_attributes; +} crypto_object_get_attribute_value32_t; + +typedef struct crypto_object_get_size32 { + uint32_t gs_return_value; + crypto_session_id_t gs_session; + 
crypto_object_id_t gs_handle; + size32_t gs_size; +} crypto_object_get_size32_t; + +typedef struct crypto_object_set_attribute_value32 { + uint32_t sa_return_value; + crypto_session_id_t sa_session; + crypto_object_id_t sa_handle; + uint32_t sa_count; + caddr32_t sa_attributes; +} crypto_object_set_attribute_value32_t; + +typedef struct crypto_object_find_init32 { + uint32_t fi_return_value; + crypto_session_id_t fi_session; + uint32_t fi_count; + caddr32_t fi_attributes; +} crypto_object_find_init32_t; + +typedef struct crypto_object_find_update32 { + uint32_t fu_return_value; + crypto_session_id_t fu_session; + uint32_t fu_max_count; + uint32_t fu_count; + caddr32_t fu_handles; +} crypto_object_find_update32_t; + +typedef struct crypto_object_find_final32 { + uint32_t ff_return_value; + crypto_session_id_t ff_session; +} crypto_object_find_final32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_OBJECT_CREATE CRYPTO(100) +#define CRYPTO_OBJECT_COPY CRYPTO(101) +#define CRYPTO_OBJECT_DESTROY CRYPTO(102) +#define CRYPTO_OBJECT_GET_ATTRIBUTE_VALUE CRYPTO(103) +#define CRYPTO_OBJECT_GET_SIZE CRYPTO(104) +#define CRYPTO_OBJECT_SET_ATTRIBUTE_VALUE CRYPTO(105) +#define CRYPTO_OBJECT_FIND_INIT CRYPTO(106) +#define CRYPTO_OBJECT_FIND_UPDATE CRYPTO(107) +#define CRYPTO_OBJECT_FIND_FINAL CRYPTO(108) + +/* + * Key Generation Ioctls + */ +typedef struct crypto_object_generate_key { + uint_t gk_return_value; + crypto_session_id_t gk_session; + crypto_object_id_t gk_handle; + crypto_mechanism_t gk_mechanism; + uint_t gk_count; + caddr_t gk_attributes; +} crypto_object_generate_key_t; + +typedef struct crypto_object_generate_key_pair { + uint_t kp_return_value; + crypto_session_id_t kp_session; + crypto_object_id_t kp_public_handle; + crypto_object_id_t kp_private_handle; + uint_t kp_public_count; + uint_t kp_private_count; + caddr_t kp_public_attributes; + caddr_t kp_private_attributes; + crypto_mechanism_t kp_mechanism; +} crypto_object_generate_key_pair_t; + +typedef struct crypto_object_wrap_key { + uint_t wk_return_value; + crypto_session_id_t wk_session; + crypto_mechanism_t wk_mechanism; + crypto_key_t wk_wrapping_key; + crypto_object_id_t wk_object_handle; + size_t wk_wrapped_key_len; + caddr_t wk_wrapped_key; +} crypto_object_wrap_key_t; + +typedef struct crypto_object_unwrap_key { + uint_t uk_return_value; + crypto_session_id_t uk_session; + crypto_mechanism_t uk_mechanism; + crypto_key_t uk_unwrapping_key; + crypto_object_id_t uk_object_handle; + size_t uk_wrapped_key_len; + caddr_t uk_wrapped_key; + uint_t uk_count; + caddr_t uk_attributes; +} crypto_object_unwrap_key_t; + +typedef struct crypto_derive_key { + uint_t dk_return_value; + crypto_session_id_t dk_session; + crypto_mechanism_t dk_mechanism; + crypto_key_t dk_base_key; + crypto_object_id_t dk_object_handle; + uint_t dk_count; + caddr_t dk_attributes; +} crypto_derive_key_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_object_generate_key32 { + uint32_t gk_return_value; + crypto_session_id_t gk_session; + crypto_object_id_t gk_handle; + crypto_mechanism32_t gk_mechanism; + uint32_t gk_count; + caddr32_t gk_attributes; +} crypto_object_generate_key32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +typedef struct crypto_object_generate_key_pair32 { + uint32_t kp_return_value; + crypto_session_id_t kp_session; + crypto_object_id_t kp_public_handle; + 
crypto_object_id_t kp_private_handle; + uint32_t kp_public_count; + uint32_t kp_private_count; + caddr32_t kp_public_attributes; + caddr32_t kp_private_attributes; + crypto_mechanism32_t kp_mechanism; +} crypto_object_generate_key_pair32_t; + +typedef struct crypto_object_wrap_key32 { + uint32_t wk_return_value; + crypto_session_id_t wk_session; + crypto_mechanism32_t wk_mechanism; + crypto_key32_t wk_wrapping_key; + crypto_object_id_t wk_object_handle; + size32_t wk_wrapped_key_len; + caddr32_t wk_wrapped_key; +} crypto_object_wrap_key32_t; + +typedef struct crypto_object_unwrap_key32 { + uint32_t uk_return_value; + crypto_session_id_t uk_session; + crypto_mechanism32_t uk_mechanism; + crypto_key32_t uk_unwrapping_key; + crypto_object_id_t uk_object_handle; + size32_t uk_wrapped_key_len; + caddr32_t uk_wrapped_key; + uint32_t uk_count; + caddr32_t uk_attributes; +} crypto_object_unwrap_key32_t; + +typedef struct crypto_derive_key32 { + uint32_t dk_return_value; + crypto_session_id_t dk_session; + crypto_mechanism32_t dk_mechanism; + crypto_key32_t dk_base_key; + crypto_object_id_t dk_object_handle; + uint32_t dk_count; + caddr32_t dk_attributes; +} crypto_derive_key32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_GENERATE_KEY CRYPTO(110) +#define CRYPTO_GENERATE_KEY_PAIR CRYPTO(111) +#define CRYPTO_WRAP_KEY CRYPTO(112) +#define CRYPTO_UNWRAP_KEY CRYPTO(113) +#define CRYPTO_DERIVE_KEY CRYPTO(114) + +/* + * Provider Management Ioctls + */ + +typedef struct crypto_get_provider_list { + uint_t pl_return_value; + uint_t pl_count; + crypto_provider_entry_t pl_list[1]; +} crypto_get_provider_list_t; + +typedef struct crypto_provider_data { + uchar_t pd_prov_desc[CRYPTO_PROVIDER_DESCR_MAX_LEN]; + uchar_t pd_label[CRYPTO_EXT_SIZE_LABEL]; + uchar_t pd_manufacturerID[CRYPTO_EXT_SIZE_MANUF]; + uchar_t pd_model[CRYPTO_EXT_SIZE_MODEL]; + uchar_t pd_serial_number[CRYPTO_EXT_SIZE_SERIAL]; + ulong_t pd_flags; + ulong_t pd_max_session_count; + ulong_t pd_session_count; + ulong_t pd_max_rw_session_count; + ulong_t pd_rw_session_count; + ulong_t pd_max_pin_len; + ulong_t pd_min_pin_len; + ulong_t pd_total_public_memory; + ulong_t pd_free_public_memory; + ulong_t pd_total_private_memory; + ulong_t pd_free_private_memory; + crypto_version_t pd_hardware_version; + crypto_version_t pd_firmware_version; + uchar_t pd_time[CRYPTO_EXT_SIZE_TIME]; +} crypto_provider_data_t; + +typedef struct crypto_get_provider_info { + uint_t gi_return_value; + crypto_provider_id_t gi_provider_id; + crypto_provider_data_t gi_provider_data; +} crypto_get_provider_info_t; + +typedef struct crypto_get_provider_mechanisms { + uint_t pm_return_value; + crypto_provider_id_t pm_provider_id; + uint_t pm_count; + crypto_mech_name_t pm_list[1]; +} crypto_get_provider_mechanisms_t; + +typedef struct crypto_get_provider_mechanism_info { + uint_t mi_return_value; + crypto_provider_id_t mi_provider_id; + crypto_mech_name_t mi_mechanism_name; + uint32_t mi_min_key_size; + uint32_t mi_max_key_size; + uint32_t mi_flags; +} crypto_get_provider_mechanism_info_t; + +typedef struct crypto_init_token { + uint_t it_return_value; + crypto_provider_id_t it_provider_id; + caddr_t it_pin; + size_t it_pin_len; + caddr_t it_label; +} crypto_init_token_t; + +typedef struct crypto_init_pin { + uint_t ip_return_value; + crypto_session_id_t ip_session; + caddr_t ip_pin; + size_t ip_pin_len; +} crypto_init_pin_t; + +typedef struct crypto_set_pin { + uint_t sp_return_value; + crypto_session_id_t sp_session; + caddr_t sp_old_pin; + size_t 
sp_old_len; + caddr_t sp_new_pin; + size_t sp_new_len; +} crypto_set_pin_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_get_provider_list32 { + uint32_t pl_return_value; + uint32_t pl_count; + crypto_provider_entry_t pl_list[1]; +} crypto_get_provider_list32_t; + +typedef struct crypto_version32 { + uchar_t cv_major; + uchar_t cv_minor; +} crypto_version32_t; + +typedef struct crypto_provider_data32 { + uchar_t pd_prov_desc[CRYPTO_PROVIDER_DESCR_MAX_LEN]; + uchar_t pd_label[CRYPTO_EXT_SIZE_LABEL]; + uchar_t pd_manufacturerID[CRYPTO_EXT_SIZE_MANUF]; + uchar_t pd_model[CRYPTO_EXT_SIZE_MODEL]; + uchar_t pd_serial_number[CRYPTO_EXT_SIZE_SERIAL]; + uint32_t pd_flags; + uint32_t pd_max_session_count; + uint32_t pd_session_count; + uint32_t pd_max_rw_session_count; + uint32_t pd_rw_session_count; + uint32_t pd_max_pin_len; + uint32_t pd_min_pin_len; + uint32_t pd_total_public_memory; + uint32_t pd_free_public_memory; + uint32_t pd_total_private_memory; + uint32_t pd_free_private_memory; + crypto_version32_t pd_hardware_version; + crypto_version32_t pd_firmware_version; + uchar_t pd_time[CRYPTO_EXT_SIZE_TIME]; +} crypto_provider_data32_t; + +typedef struct crypto_get_provider_info32 { + uint32_t gi_return_value; + crypto_provider_id_t gi_provider_id; + crypto_provider_data32_t gi_provider_data; +} crypto_get_provider_info32_t; + +typedef struct crypto_get_provider_mechanisms32 { + uint32_t pm_return_value; + crypto_provider_id_t pm_provider_id; + uint32_t pm_count; + crypto_mech_name_t pm_list[1]; +} crypto_get_provider_mechanisms32_t; + +typedef struct crypto_init_token32 { + uint32_t it_return_value; + crypto_provider_id_t it_provider_id; + caddr32_t it_pin; + size32_t it_pin_len; + caddr32_t it_label; +} crypto_init_token32_t; + +typedef struct crypto_init_pin32 { + uint32_t ip_return_value; + crypto_session_id_t ip_session; + caddr32_t ip_pin; + size32_t ip_pin_len; +} crypto_init_pin32_t; + +typedef struct crypto_set_pin32 { + uint32_t sp_return_value; + crypto_session_id_t sp_session; + caddr32_t sp_old_pin; + size32_t sp_old_len; + caddr32_t sp_new_pin; + size32_t sp_new_len; +} crypto_set_pin32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_GET_PROVIDER_LIST CRYPTO(120) +#define CRYPTO_GET_PROVIDER_INFO CRYPTO(121) +#define CRYPTO_GET_PROVIDER_MECHANISMS CRYPTO(122) +#define CRYPTO_GET_PROVIDER_MECHANISM_INFO CRYPTO(123) +#define CRYPTO_INIT_TOKEN CRYPTO(124) +#define CRYPTO_INIT_PIN CRYPTO(125) +#define CRYPTO_SET_PIN CRYPTO(126) + +/* + * No (Key) Store Key Generation Ioctls + */ +typedef struct crypto_nostore_generate_key { + uint_t ngk_return_value; + crypto_session_id_t ngk_session; + crypto_mechanism_t ngk_mechanism; + uint_t ngk_in_count; + uint_t ngk_out_count; + caddr_t ngk_in_attributes; + caddr_t ngk_out_attributes; +} crypto_nostore_generate_key_t; + +typedef struct crypto_nostore_generate_key_pair { + uint_t nkp_return_value; + crypto_session_id_t nkp_session; + uint_t nkp_in_public_count; + uint_t nkp_in_private_count; + uint_t nkp_out_public_count; + uint_t nkp_out_private_count; + caddr_t nkp_in_public_attributes; + caddr_t nkp_in_private_attributes; + caddr_t nkp_out_public_attributes; + caddr_t nkp_out_private_attributes; + crypto_mechanism_t nkp_mechanism; +} crypto_nostore_generate_key_pair_t; + +typedef struct crypto_nostore_derive_key { + uint_t ndk_return_value; + crypto_session_id_t ndk_session; + crypto_mechanism_t ndk_mechanism; + crypto_key_t ndk_base_key; + uint_t ndk_in_count; + uint_t ndk_out_count; + caddr_t 
ndk_in_attributes; + caddr_t ndk_out_attributes; +} crypto_nostore_derive_key_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_nostore_generate_key32 { + uint32_t ngk_return_value; + crypto_session_id_t ngk_session; + crypto_mechanism32_t ngk_mechanism; + uint32_t ngk_in_count; + uint32_t ngk_out_count; + caddr32_t ngk_in_attributes; + caddr32_t ngk_out_attributes; +} crypto_nostore_generate_key32_t; + +typedef struct crypto_nostore_generate_key_pair32 { + uint32_t nkp_return_value; + crypto_session_id_t nkp_session; + uint32_t nkp_in_public_count; + uint32_t nkp_in_private_count; + uint32_t nkp_out_public_count; + uint32_t nkp_out_private_count; + caddr32_t nkp_in_public_attributes; + caddr32_t nkp_in_private_attributes; + caddr32_t nkp_out_public_attributes; + caddr32_t nkp_out_private_attributes; + crypto_mechanism32_t nkp_mechanism; +} crypto_nostore_generate_key_pair32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack(4) +#endif + +typedef struct crypto_nostore_derive_key32 { + uint32_t ndk_return_value; + crypto_session_id_t ndk_session; + crypto_mechanism32_t ndk_mechanism; + crypto_key32_t ndk_base_key; + uint32_t ndk_in_count; + uint32_t ndk_out_count; + caddr32_t ndk_in_attributes; + caddr32_t ndk_out_attributes; +} crypto_nostore_derive_key32_t; + +#if _LONG_LONG_ALIGNMENT == 8 && _LONG_LONG_ALIGNMENT_32 == 4 +#pragma pack() +#endif + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_NOSTORE_GENERATE_KEY CRYPTO(127) +#define CRYPTO_NOSTORE_GENERATE_KEY_PAIR CRYPTO(128) +#define CRYPTO_NOSTORE_DERIVE_KEY CRYPTO(129) + +/* + * Mechanism Ioctls + */ + +typedef struct crypto_get_mechanism_list { + uint_t ml_return_value; + uint_t ml_count; + crypto_mech_name_t ml_list[1]; +} crypto_get_mechanism_list_t; + +typedef struct crypto_get_all_mechanism_info { + uint_t mi_return_value; + crypto_mech_name_t mi_mechanism_name; + uint_t mi_count; + crypto_mechanism_info_t mi_list[1]; +} crypto_get_all_mechanism_info_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_get_mechanism_list32 { + uint32_t ml_return_value; + uint32_t ml_count; + crypto_mech_name_t ml_list[1]; +} crypto_get_mechanism_list32_t; + +typedef struct crypto_get_all_mechanism_info32 { + uint32_t mi_return_value; + crypto_mech_name_t mi_mechanism_name; + uint32_t mi_count; + crypto_mechanism_info32_t mi_list[1]; +} crypto_get_all_mechanism_info32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_GET_MECHANISM_LIST CRYPTO(140) +#define CRYPTO_GET_ALL_MECHANISM_INFO CRYPTO(141) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_IOCTL_H */ diff --git a/module/icp/include/sys/crypto/ioctladmin.h b/module/icp/include/sys/crypto/ioctladmin.h new file mode 100644 index 000000000000..24babd7755cc --- /dev/null +++ b/module/icp/include/sys/crypto/ioctladmin.h @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
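As a rough sketch of how a userland consumer drives the ioctl interface declared above — assuming the conventional Solaris crypto pseudo-device node /dev/crypto, which this header does not itself name — the caller issues the ioctl and then inspects the embedded fl_return_value, which carries the CRYPTO_* status (from sys/crypto/common.h) independently of the ioctl(2) return code:

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
/* plus this header, for crypto_get_function_list_t and CRYPTO_GET_FUNCTION_LIST */

int
main(void)
{
	crypto_get_function_list_t fl = { 0 };
	int fd = open("/dev/crypto", O_RDWR);	/* assumed device path */

	if (fd < 0)
		return (1);
	fl.fl_provider_id = 0;	/* hypothetical: first registered provider */
	if (ioctl(fd, CRYPTO_GET_FUNCTION_LIST, &fl) < 0)
		return (1);
	if (fl.fl_return_value == CRYPTO_SUCCESS && fl.fl_list.fl_digest)
		printf("provider supports single-part digest\n");
	return (0);
}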
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CRYPTO_IOCTLADMIN_H +#define _SYS_CRYPTO_IOCTLADMIN_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include + +#define ADMIN_IOCTL_DEVICE "/dev/cryptoadm" + +#define CRYPTOADMIN(x) (('y' << 8) | (x)) + +/* + * Administrative IOCTLs + */ + +typedef struct crypto_get_dev_list { + uint_t dl_return_value; + uint_t dl_dev_count; + crypto_dev_list_entry_t dl_devs[1]; +} crypto_get_dev_list_t; + +typedef struct crypto_get_soft_list { + uint_t sl_return_value; + uint_t sl_soft_count; + size_t sl_soft_len; + caddr_t sl_soft_names; +} crypto_get_soft_list_t; + +typedef struct crypto_get_dev_info { + uint_t di_return_value; + char di_dev_name[MAXNAMELEN]; + uint_t di_dev_instance; + uint_t di_count; + crypto_mech_name_t di_list[1]; +} crypto_get_dev_info_t; + +typedef struct crypto_get_soft_info { + uint_t si_return_value; + char si_name[MAXNAMELEN]; + uint_t si_count; + crypto_mech_name_t si_list[1]; +} crypto_get_soft_info_t; + +typedef struct crypto_load_dev_disabled { + uint_t dd_return_value; + char dd_dev_name[MAXNAMELEN]; + uint_t dd_dev_instance; + uint_t dd_count; + crypto_mech_name_t dd_list[1]; +} crypto_load_dev_disabled_t; + +typedef struct crypto_load_soft_disabled { + uint_t sd_return_value; + char sd_name[MAXNAMELEN]; + uint_t sd_count; + crypto_mech_name_t sd_list[1]; +} crypto_load_soft_disabled_t; + +typedef struct crypto_unload_soft_module { + uint_t sm_return_value; + char sm_name[MAXNAMELEN]; +} crypto_unload_soft_module_t; + +typedef struct crypto_load_soft_config { + uint_t sc_return_value; + char sc_name[MAXNAMELEN]; + uint_t sc_count; + crypto_mech_name_t sc_list[1]; +} crypto_load_soft_config_t; + +typedef struct crypto_load_door { + uint_t ld_return_value; + uint_t ld_did; +} crypto_load_door_t; + +#ifdef _KERNEL +#ifdef _SYSCALL32 + +typedef struct crypto_get_soft_list32 { + uint32_t sl_return_value; + uint32_t sl_soft_count; + size32_t sl_soft_len; + caddr32_t sl_soft_names; +} crypto_get_soft_list32_t; + +#endif /* _SYSCALL32 */ +#endif /* _KERNEL */ + +#define CRYPTO_GET_VERSION CRYPTOADMIN(1) +#define CRYPTO_GET_DEV_LIST CRYPTOADMIN(2) +#define CRYPTO_GET_SOFT_LIST CRYPTOADMIN(3) +#define CRYPTO_GET_DEV_INFO CRYPTOADMIN(4) +#define CRYPTO_GET_SOFT_INFO CRYPTOADMIN(5) +#define CRYPTO_LOAD_DEV_DISABLED CRYPTOADMIN(8) +#define CRYPTO_LOAD_SOFT_DISABLED CRYPTOADMIN(9) +#define CRYPTO_UNLOAD_SOFT_MODULE CRYPTOADMIN(10) +#define CRYPTO_LOAD_SOFT_CONFIG CRYPTOADMIN(11) +#define CRYPTO_POOL_CREATE CRYPTOADMIN(12) +#define CRYPTO_POOL_WAIT CRYPTOADMIN(13) +#define CRYPTO_POOL_RUN CRYPTOADMIN(14) +#define CRYPTO_LOAD_DOOR CRYPTOADMIN(15) + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_IOCTLADMIN_H */ diff --git a/module/icp/include/sys/crypto/ops_impl.h b/module/icp/include/sys/crypto/ops_impl.h new file mode 100644 index 000000000000..230d74b063fc --- /dev/null +++ b/module/icp/include/sys/crypto/ops_impl.h @@ -0,0 +1,630 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
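The variable-length dl_devs[1] tail in crypto_get_dev_list_t above follows the usual grow-and-retry convention. A hedged sketch of walking the device list through ADMIN_IOCTL_DEVICE — assuming, as the cryptoadm utility does, that the kernel writes the required entry count back into dl_dev_count and flags a short buffer with CRYPTO_BUFFER_TOO_SMALL (same includes as the previous sketch, plus <stdlib.h>):

crypto_get_dev_list_t *dl;
uint_t count = 16;	/* initial guess; grown on retry */
int fd = open(ADMIN_IOCTL_DEVICE, O_RDWR);

for (;;) {
	size_t sz = sizeof (*dl) +
	    (count - 1) * sizeof (crypto_dev_list_entry_t);
	if ((dl = calloc(1, sz)) == NULL)
		break;
	dl->dl_dev_count = count;	/* capacity on input */
	if (ioctl(fd, CRYPTO_GET_DEV_LIST, dl) < 0 ||
	    dl->dl_return_value != CRYPTO_BUFFER_TOO_SMALL)
		break;			/* success, or a hard error */
	count = dl->dl_dev_count;	/* kernel wrote back the real count */
	free(dl);
}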
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CRYPTO_OPS_IMPL_H +#define _SYS_CRYPTO_OPS_IMPL_H + +/* + * Scheduler internal structures. + */ + +#ifdef __cplusplus +extern "C" { +#endif + +#include +#include +#include +#include +#include + +/* + * The parameters needed for each function group are batched + * in one structure. This is much simpler than having a + * separate structure for each function. + * + * In some cases, a field is generically named to keep the + * structure small. The comments indicate these cases. + */ +typedef struct kcf_digest_ops_params { + crypto_session_id_t do_sid; + crypto_mech_type_t do_framework_mechtype; + crypto_mechanism_t do_mech; + crypto_data_t *do_data; + crypto_data_t *do_digest; + crypto_key_t *do_digest_key; /* Argument for digest_key() */ +} kcf_digest_ops_params_t; + +typedef struct kcf_mac_ops_params { + crypto_session_id_t mo_sid; + crypto_mech_type_t mo_framework_mechtype; + crypto_mechanism_t mo_mech; + crypto_key_t *mo_key; + crypto_data_t *mo_data; + crypto_data_t *mo_mac; + crypto_spi_ctx_template_t mo_templ; +} kcf_mac_ops_params_t; + +typedef struct kcf_encrypt_ops_params { + crypto_session_id_t eo_sid; + crypto_mech_type_t eo_framework_mechtype; + crypto_mechanism_t eo_mech; + crypto_key_t *eo_key; + crypto_data_t *eo_plaintext; + crypto_data_t *eo_ciphertext; + crypto_spi_ctx_template_t eo_templ; +} kcf_encrypt_ops_params_t; + +typedef struct kcf_decrypt_ops_params { + crypto_session_id_t dop_sid; + crypto_mech_type_t dop_framework_mechtype; + crypto_mechanism_t dop_mech; + crypto_key_t *dop_key; + crypto_data_t *dop_ciphertext; + crypto_data_t *dop_plaintext; + crypto_spi_ctx_template_t dop_templ; +} kcf_decrypt_ops_params_t; + +typedef struct kcf_sign_ops_params { + crypto_session_id_t so_sid; + crypto_mech_type_t so_framework_mechtype; + crypto_mechanism_t so_mech; + crypto_key_t *so_key; + crypto_data_t *so_data; + crypto_data_t *so_signature; + crypto_spi_ctx_template_t so_templ; +} kcf_sign_ops_params_t; + +typedef struct kcf_verify_ops_params { + crypto_session_id_t vo_sid; + crypto_mech_type_t vo_framework_mechtype; + crypto_mechanism_t vo_mech; + crypto_key_t *vo_key; + crypto_data_t *vo_data; + crypto_data_t *vo_signature; + crypto_spi_ctx_template_t vo_templ; +} kcf_verify_ops_params_t; + +typedef struct kcf_encrypt_mac_ops_params { + crypto_session_id_t em_sid; + crypto_mech_type_t em_framework_encr_mechtype; + crypto_mechanism_t em_encr_mech; + crypto_key_t *em_encr_key; + crypto_mech_type_t em_framework_mac_mechtype; + crypto_mechanism_t em_mac_mech; + crypto_key_t *em_mac_key; + crypto_data_t *em_plaintext; + crypto_dual_data_t *em_ciphertext; + crypto_data_t *em_mac; + crypto_spi_ctx_template_t em_encr_templ; + crypto_spi_ctx_template_t em_mac_templ; +} kcf_encrypt_mac_ops_params_t; + +typedef struct kcf_mac_decrypt_ops_params { + 
crypto_session_id_t md_sid; + crypto_mech_type_t md_framework_mac_mechtype; + crypto_mechanism_t md_mac_mech; + crypto_key_t *md_mac_key; + crypto_mech_type_t md_framework_decr_mechtype; + crypto_mechanism_t md_decr_mech; + crypto_key_t *md_decr_key; + crypto_dual_data_t *md_ciphertext; + crypto_data_t *md_mac; + crypto_data_t *md_plaintext; + crypto_spi_ctx_template_t md_mac_templ; + crypto_spi_ctx_template_t md_decr_templ; +} kcf_mac_decrypt_ops_params_t; + +typedef struct kcf_random_number_ops_params { + crypto_session_id_t rn_sid; + uchar_t *rn_buf; + size_t rn_buflen; + uint_t rn_entropy_est; + uint32_t rn_flags; +} kcf_random_number_ops_params_t; + +/* + * so_pd is useful when the provider descriptor (pd) supplying the + * provider handle is different from the pd supplying the ops vector. + * This is the case for session open/close where so_pd can be the pd + * of a logical provider. The pd supplying the ops vector is passed + * as an argument to kcf_submit_request(). + */ +typedef struct kcf_session_ops_params { + crypto_session_id_t *so_sid_ptr; + crypto_session_id_t so_sid; + crypto_user_type_t so_user_type; + char *so_pin; + size_t so_pin_len; + kcf_provider_desc_t *so_pd; +} kcf_session_ops_params_t; + +typedef struct kcf_object_ops_params { + crypto_session_id_t oo_sid; + crypto_object_id_t oo_object_id; + crypto_object_attribute_t *oo_template; + uint_t oo_attribute_count; + crypto_object_id_t *oo_object_id_ptr; + size_t *oo_object_size; + void **oo_find_init_pp_ptr; + void *oo_find_pp; + uint_t oo_max_object_count; + uint_t *oo_object_count_ptr; +} kcf_object_ops_params_t; + +/* + * ko_key is used to encode wrapping key in key_wrap() and + * unwrapping key in key_unwrap(). ko_key_template and + * ko_key_attribute_count are used to encode public template + * and public template attr count in key_generate_pair(). + * kops->ko_key_object_id_ptr is used to encode public key + * in key_generate_pair(). + */ +typedef struct kcf_key_ops_params { + crypto_session_id_t ko_sid; + crypto_mech_type_t ko_framework_mechtype; + crypto_mechanism_t ko_mech; + crypto_object_attribute_t *ko_key_template; + uint_t ko_key_attribute_count; + crypto_object_id_t *ko_key_object_id_ptr; + crypto_object_attribute_t *ko_private_key_template; + uint_t ko_private_key_attribute_count; + crypto_object_id_t *ko_private_key_object_id_ptr; + crypto_key_t *ko_key; + uchar_t *ko_wrapped_key; + size_t *ko_wrapped_key_len_ptr; + crypto_object_attribute_t *ko_out_template1; + crypto_object_attribute_t *ko_out_template2; + uint_t ko_out_attribute_count1; + uint_t ko_out_attribute_count2; +} kcf_key_ops_params_t; + +/* + * po_pin and po_pin_len are used to encode new_pin and new_pin_len + * when wrapping set_pin() function parameters. + * + * po_pd is useful when the provider descriptor (pd) supplying the + * provider handle is different from the pd supplying the ops vector. + * This is true for the ext_info provider entry point where po_pd + * can be the pd of a logical provider. The pd supplying the ops vector + * is passed as an argument to kcf_submit_request(). + */ +typedef struct kcf_provmgmt_ops_params { + crypto_session_id_t po_sid; + char *po_pin; + size_t po_pin_len; + char *po_old_pin; + size_t po_old_pin_len; + char *po_label; + crypto_provider_ext_info_t *po_ext_info; + kcf_provider_desc_t *po_pd; +} kcf_provmgmt_ops_params_t; + +/* + * The operation type within a function group. + */ +typedef enum kcf_op_type { + /* common ops for all mechanisms */ + KCF_OP_INIT = 1, + KCF_OP_SINGLE, /* pkcs11 sense. 
So, INIT is already done */ + KCF_OP_UPDATE, + KCF_OP_FINAL, + KCF_OP_ATOMIC, + + /* digest_key op */ + KCF_OP_DIGEST_KEY, + + /* mac specific op */ + KCF_OP_MAC_VERIFY_ATOMIC, + + /* mac/cipher specific op */ + KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC, + + /* sign_recover ops */ + KCF_OP_SIGN_RECOVER_INIT, + KCF_OP_SIGN_RECOVER, + KCF_OP_SIGN_RECOVER_ATOMIC, + + /* verify_recover ops */ + KCF_OP_VERIFY_RECOVER_INIT, + KCF_OP_VERIFY_RECOVER, + KCF_OP_VERIFY_RECOVER_ATOMIC, + + /* random number ops */ + KCF_OP_RANDOM_SEED, + KCF_OP_RANDOM_GENERATE, + + /* session management ops */ + KCF_OP_SESSION_OPEN, + KCF_OP_SESSION_CLOSE, + KCF_OP_SESSION_LOGIN, + KCF_OP_SESSION_LOGOUT, + + /* object management ops */ + KCF_OP_OBJECT_CREATE, + KCF_OP_OBJECT_COPY, + KCF_OP_OBJECT_DESTROY, + KCF_OP_OBJECT_GET_SIZE, + KCF_OP_OBJECT_GET_ATTRIBUTE_VALUE, + KCF_OP_OBJECT_SET_ATTRIBUTE_VALUE, + KCF_OP_OBJECT_FIND_INIT, + KCF_OP_OBJECT_FIND, + KCF_OP_OBJECT_FIND_FINAL, + + /* key management ops */ + KCF_OP_KEY_GENERATE, + KCF_OP_KEY_GENERATE_PAIR, + KCF_OP_KEY_WRAP, + KCF_OP_KEY_UNWRAP, + KCF_OP_KEY_DERIVE, + KCF_OP_KEY_CHECK, + + /* provider management ops */ + KCF_OP_MGMT_EXTINFO, + KCF_OP_MGMT_INITTOKEN, + KCF_OP_MGMT_INITPIN, + KCF_OP_MGMT_SETPIN +} kcf_op_type_t; + +/* + * The operation groups that need wrapping of parameters. This is somewhat + * similar to the function group type in spi.h except that this also includes + * all the functions that don't have a mechanism. + * + * The wrapper macros should never take these enum values as an argument. + * Rather, they are assigned in the macro itself since they are known + * from the macro name. + */ +typedef enum kcf_op_group { + KCF_OG_DIGEST = 1, + KCF_OG_MAC, + KCF_OG_ENCRYPT, + KCF_OG_DECRYPT, + KCF_OG_SIGN, + KCF_OG_VERIFY, + KCF_OG_ENCRYPT_MAC, + KCF_OG_MAC_DECRYPT, + KCF_OG_RANDOM, + KCF_OG_SESSION, + KCF_OG_OBJECT, + KCF_OG_KEY, + KCF_OG_PROVMGMT, + KCF_OG_NOSTORE_KEY +} kcf_op_group_t; + +/* + * The kcf_op_type_t enum values used here should be only for those + * operations for which there is a k-api routine in sys/crypto/api.h. + */ +#define IS_INIT_OP(ftype) ((ftype) == KCF_OP_INIT) +#define IS_SINGLE_OP(ftype) ((ftype) == KCF_OP_SINGLE) +#define IS_UPDATE_OP(ftype) ((ftype) == KCF_OP_UPDATE) +#define IS_FINAL_OP(ftype) ((ftype) == KCF_OP_FINAL) +#define IS_ATOMIC_OP(ftype) ( \ + (ftype) == KCF_OP_ATOMIC || (ftype) == KCF_OP_MAC_VERIFY_ATOMIC || \ + (ftype) == KCF_OP_MAC_VERIFY_DECRYPT_ATOMIC || \ + (ftype) == KCF_OP_SIGN_RECOVER_ATOMIC || \ + (ftype) == KCF_OP_VERIFY_RECOVER_ATOMIC) + +/* + * Keep the parameters associated with a request around. + * We need to pass them to the SPI. + */ +typedef struct kcf_req_params { + kcf_op_group_t rp_opgrp; + kcf_op_type_t rp_optype; + + union { + kcf_digest_ops_params_t digest_params; + kcf_mac_ops_params_t mac_params; + kcf_encrypt_ops_params_t encrypt_params; + kcf_decrypt_ops_params_t decrypt_params; + kcf_sign_ops_params_t sign_params; + kcf_verify_ops_params_t verify_params; + kcf_encrypt_mac_ops_params_t encrypt_mac_params; + kcf_mac_decrypt_ops_params_t mac_decrypt_params; + kcf_random_number_ops_params_t random_number_params; + kcf_session_ops_params_t session_params; + kcf_object_ops_params_t object_params; + kcf_key_ops_params_t key_params; + kcf_provmgmt_ops_params_t provmgmt_params; + } rp_u; +} kcf_req_params_t; + + +/* + * The ioctl/k-api code should bundle the parameters into a kcf_req_params_t + * structure before calling a scheduler routine. The following macros are + * available for that purpose. 
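For instance, a k-api digest entry point might bundle its arguments along these lines. This is a sketch only: kcf_submit_request() is the routine named in the comments here, but its exact signature is assumed, and sid, mech, data, digest, pd and crq stand in for caller-supplied values. The key argument is NULL because do_digest_key is only used by digest_key():

kcf_req_params_t params;
int error;

KCF_WRAP_DIGEST_OPS_PARAMS(&params, KCF_OP_ATOMIC, sid, mech,
    NULL, data, digest);
error = kcf_submit_request(pd, NULL, crq, &params, B_FALSE);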
+ * + * For the most part, the macro arguments closely correspond to the + * function parameters. In some cases, we use generic names. The comments + * for the structure should indicate these cases. + */ +#define KCF_WRAP_DIGEST_OPS_PARAMS(req, ftype, _sid, _mech, _key, \ + _data, _digest) { \ + kcf_digest_ops_params_t *dops = &(req)->rp_u.digest_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_DIGEST; \ + (req)->rp_optype = ftype; \ + dops->do_sid = _sid; \ + if (mechp != NULL) { \ + dops->do_mech = *mechp; \ + dops->do_framework_mechtype = mechp->cm_type; \ + } \ + dops->do_digest_key = _key; \ + dops->do_data = _data; \ + dops->do_digest = _digest; \ +} + +#define KCF_WRAP_MAC_OPS_PARAMS(req, ftype, _sid, _mech, _key, \ + _data, _mac, _templ) { \ + kcf_mac_ops_params_t *mops = &(req)->rp_u.mac_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_MAC; \ + (req)->rp_optype = ftype; \ + mops->mo_sid = _sid; \ + if (mechp != NULL) { \ + mops->mo_mech = *mechp; \ + mops->mo_framework_mechtype = mechp->cm_type; \ + } \ + mops->mo_key = _key; \ + mops->mo_data = _data; \ + mops->mo_mac = _mac; \ + mops->mo_templ = _templ; \ +} + +#define KCF_WRAP_ENCRYPT_OPS_PARAMS(req, ftype, _sid, _mech, _key, \ + _plaintext, _ciphertext, _templ) { \ + kcf_encrypt_ops_params_t *cops = &(req)->rp_u.encrypt_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_ENCRYPT; \ + (req)->rp_optype = ftype; \ + cops->eo_sid = _sid; \ + if (mechp != NULL) { \ + cops->eo_mech = *mechp; \ + cops->eo_framework_mechtype = mechp->cm_type; \ + } \ + cops->eo_key = _key; \ + cops->eo_plaintext = _plaintext; \ + cops->eo_ciphertext = _ciphertext; \ + cops->eo_templ = _templ; \ +} + +#define KCF_WRAP_DECRYPT_OPS_PARAMS(req, ftype, _sid, _mech, _key, \ + _ciphertext, _plaintext, _templ) { \ + kcf_decrypt_ops_params_t *cops = &(req)->rp_u.decrypt_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_DECRYPT; \ + (req)->rp_optype = ftype; \ + cops->dop_sid = _sid; \ + if (mechp != NULL) { \ + cops->dop_mech = *mechp; \ + cops->dop_framework_mechtype = mechp->cm_type; \ + } \ + cops->dop_key = _key; \ + cops->dop_ciphertext = _ciphertext; \ + cops->dop_plaintext = _plaintext; \ + cops->dop_templ = _templ; \ +} + +#define KCF_WRAP_SIGN_OPS_PARAMS(req, ftype, _sid, _mech, _key, \ + _data, _signature, _templ) { \ + kcf_sign_ops_params_t *sops = &(req)->rp_u.sign_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_SIGN; \ + (req)->rp_optype = ftype; \ + sops->so_sid = _sid; \ + if (mechp != NULL) { \ + sops->so_mech = *mechp; \ + sops->so_framework_mechtype = mechp->cm_type; \ + } \ + sops->so_key = _key; \ + sops->so_data = _data; \ + sops->so_signature = _signature; \ + sops->so_templ = _templ; \ +} + +#define KCF_WRAP_VERIFY_OPS_PARAMS(req, ftype, _sid, _mech, _key, \ + _data, _signature, _templ) { \ + kcf_verify_ops_params_t *vops = &(req)->rp_u.verify_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_VERIFY; \ + (req)->rp_optype = ftype; \ + vops->vo_sid = _sid; \ + if (mechp != NULL) { \ + vops->vo_mech = *mechp; \ + vops->vo_framework_mechtype = mechp->cm_type; \ + } \ + vops->vo_key = _key; \ + vops->vo_data = _data; \ + vops->vo_signature = _signature; \ + vops->vo_templ = _templ; \ +} + +#define KCF_WRAP_ENCRYPT_MAC_OPS_PARAMS(req, ftype, _sid, _encr_key, \ + _mac_key, _plaintext, _ciphertext, _mac, _encr_templ, _mac_templ) { \ + kcf_encrypt_mac_ops_params_t 
*cmops = &(req)->rp_u.encrypt_mac_params; \ + \ + (req)->rp_opgrp = KCF_OG_ENCRYPT_MAC; \ + (req)->rp_optype = ftype; \ + cmops->em_sid = _sid; \ + cmops->em_encr_key = _encr_key; \ + cmops->em_mac_key = _mac_key; \ + cmops->em_plaintext = _plaintext; \ + cmops->em_ciphertext = _ciphertext; \ + cmops->em_mac = _mac; \ + cmops->em_encr_templ = _encr_templ; \ + cmops->em_mac_templ = _mac_templ; \ +} + +#define KCF_WRAP_MAC_DECRYPT_OPS_PARAMS(req, ftype, _sid, _mac_key, \ + _decr_key, _ciphertext, _mac, _plaintext, _mac_templ, _decr_templ) { \ + kcf_mac_decrypt_ops_params_t *cmops = &(req)->rp_u.mac_decrypt_params; \ + \ + (req)->rp_opgrp = KCF_OG_MAC_DECRYPT; \ + (req)->rp_optype = ftype; \ + cmops->md_sid = _sid; \ + cmops->md_mac_key = _mac_key; \ + cmops->md_decr_key = _decr_key; \ + cmops->md_ciphertext = _ciphertext; \ + cmops->md_mac = _mac; \ + cmops->md_plaintext = _plaintext; \ + cmops->md_mac_templ = _mac_templ; \ + cmops->md_decr_templ = _decr_templ; \ +} + +#define KCF_WRAP_RANDOM_OPS_PARAMS(req, ftype, _sid, _buf, _buflen, \ + _est, _flags) { \ + kcf_random_number_ops_params_t *rops = \ + &(req)->rp_u.random_number_params; \ + \ + (req)->rp_opgrp = KCF_OG_RANDOM; \ + (req)->rp_optype = ftype; \ + rops->rn_sid = _sid; \ + rops->rn_buf = _buf; \ + rops->rn_buflen = _buflen; \ + rops->rn_entropy_est = _est; \ + rops->rn_flags = _flags; \ +} + +#define KCF_WRAP_SESSION_OPS_PARAMS(req, ftype, _sid_ptr, _sid, \ + _user_type, _pin, _pin_len, _pd) { \ + kcf_session_ops_params_t *sops = &(req)->rp_u.session_params; \ + \ + (req)->rp_opgrp = KCF_OG_SESSION; \ + (req)->rp_optype = ftype; \ + sops->so_sid_ptr = _sid_ptr; \ + sops->so_sid = _sid; \ + sops->so_user_type = _user_type; \ + sops->so_pin = _pin; \ + sops->so_pin_len = _pin_len; \ + sops->so_pd = _pd; \ +} + +#define KCF_WRAP_OBJECT_OPS_PARAMS(req, ftype, _sid, _object_id, \ + _template, _attribute_count, _object_id_ptr, _object_size, \ + _find_init_pp_ptr, _find_pp, _max_object_count, _object_count_ptr) { \ + kcf_object_ops_params_t *jops = &(req)->rp_u.object_params; \ + \ + (req)->rp_opgrp = KCF_OG_OBJECT; \ + (req)->rp_optype = ftype; \ + jops->oo_sid = _sid; \ + jops->oo_object_id = _object_id; \ + jops->oo_template = _template; \ + jops->oo_attribute_count = _attribute_count; \ + jops->oo_object_id_ptr = _object_id_ptr; \ + jops->oo_object_size = _object_size; \ + jops->oo_find_init_pp_ptr = _find_init_pp_ptr; \ + jops->oo_find_pp = _find_pp; \ + jops->oo_max_object_count = _max_object_count; \ + jops->oo_object_count_ptr = _object_count_ptr; \ +} + +#define KCF_WRAP_KEY_OPS_PARAMS(req, ftype, _sid, _mech, _key_template, \ + _key_attribute_count, _key_object_id_ptr, _private_key_template, \ + _private_key_attribute_count, _private_key_object_id_ptr, \ + _key, _wrapped_key, _wrapped_key_len_ptr) { \ + kcf_key_ops_params_t *kops = &(req)->rp_u.key_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_KEY; \ + (req)->rp_optype = ftype; \ + kops->ko_sid = _sid; \ + if (mechp != NULL) { \ + kops->ko_mech = *mechp; \ + kops->ko_framework_mechtype = mechp->cm_type; \ + } \ + kops->ko_key_template = _key_template; \ + kops->ko_key_attribute_count = _key_attribute_count; \ + kops->ko_key_object_id_ptr = _key_object_id_ptr; \ + kops->ko_private_key_template = _private_key_template; \ + kops->ko_private_key_attribute_count = _private_key_attribute_count; \ + kops->ko_private_key_object_id_ptr = _private_key_object_id_ptr; \ + kops->ko_key = _key; \ + kops->ko_wrapped_key = _wrapped_key; \ + 
kops->ko_wrapped_key_len_ptr = _wrapped_key_len_ptr; \ +} + +#define KCF_WRAP_PROVMGMT_OPS_PARAMS(req, ftype, _sid, _old_pin, \ + _old_pin_len, _pin, _pin_len, _label, _ext_info, _pd) { \ + kcf_provmgmt_ops_params_t *pops = &(req)->rp_u.provmgmt_params; \ + \ + (req)->rp_opgrp = KCF_OG_PROVMGMT; \ + (req)->rp_optype = ftype; \ + pops->po_sid = _sid; \ + pops->po_pin = _pin; \ + pops->po_pin_len = _pin_len; \ + pops->po_old_pin = _old_pin; \ + pops->po_old_pin_len = _old_pin_len; \ + pops->po_label = _label; \ + pops->po_ext_info = _ext_info; \ + pops->po_pd = _pd; \ +} + +#define KCF_WRAP_NOSTORE_KEY_OPS_PARAMS(req, ftype, _sid, _mech, \ + _key_template, _key_attribute_count, _private_key_template, \ + _private_key_attribute_count, _key, _out_template1, \ + _out_attribute_count1, _out_template2, _out_attribute_count2) { \ + kcf_key_ops_params_t *kops = &(req)->rp_u.key_params; \ + crypto_mechanism_t *mechp = _mech; \ + \ + (req)->rp_opgrp = KCF_OG_NOSTORE_KEY; \ + (req)->rp_optype = ftype; \ + kops->ko_sid = _sid; \ + if (mechp != NULL) { \ + kops->ko_mech = *mechp; \ + kops->ko_framework_mechtype = mechp->cm_type; \ + } \ + kops->ko_key_template = _key_template; \ + kops->ko_key_attribute_count = _key_attribute_count; \ + kops->ko_key_object_id_ptr = NULL; \ + kops->ko_private_key_template = _private_key_template; \ + kops->ko_private_key_attribute_count = _private_key_attribute_count; \ + kops->ko_private_key_object_id_ptr = NULL; \ + kops->ko_key = _key; \ + kops->ko_wrapped_key = NULL; \ + kops->ko_wrapped_key_len_ptr = 0; \ + kops->ko_out_template1 = _out_template1; \ + kops->ko_out_template2 = _out_template2; \ + kops->ko_out_attribute_count1 = _out_attribute_count1; \ + kops->ko_out_attribute_count2 = _out_attribute_count2; \ +} + +#define KCF_SET_PROVIDER_MECHNUM(fmtype, pd, mechp) \ + (mechp)->cm_type = \ + KCF_TO_PROV_MECHNUM(pd, fmtype); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_OPS_IMPL_H */ diff --git a/module/icp/include/sys/crypto/sched_impl.h b/module/icp/include/sys/crypto/sched_impl.h new file mode 100644 index 000000000000..85ea0ba1d092 --- /dev/null +++ b/module/icp/include/sys/crypto/sched_impl.h @@ -0,0 +1,531 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_CRYPTO_SCHED_IMPL_H +#define _SYS_CRYPTO_SCHED_IMPL_H + +/* + * Scheduler internal structures. 
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+typedef void (kcf_func_t)(void *, int);
+
+typedef enum kcf_req_status {
+	REQ_ALLOCATED = 1,
+	REQ_WAITING,		/* At the framework level */
+	REQ_INPROGRESS,		/* At the provider level */
+	REQ_DONE,
+	REQ_CANCELED
+} kcf_req_status_t;
+
+typedef enum kcf_call_type {
+	CRYPTO_SYNCH = 1,
+	CRYPTO_ASYNCH
+} kcf_call_type_t;
+
+#define	CHECK_RESTRICT(crq) (crq != NULL && \
+	((crq)->cr_flag & CRYPTO_RESTRICTED))
+
+#define	CHECK_RESTRICT_FALSE	B_FALSE
+
+#define	CHECK_FASTPATH(crq, pd) ((crq) == NULL ||	\
+	!((crq)->cr_flag & CRYPTO_ALWAYS_QUEUE)) &&	\
+	(pd)->pd_prov_type == CRYPTO_SW_PROVIDER
+
+#define	KCF_KMFLAG(crq)	(((crq) == NULL) ? KM_SLEEP : KM_NOSLEEP)
+
+/*
+ * The framework keeps an internal handle to use in the adaptive
+ * asynchronous case. This is the case when a client has the
+ * CRYPTO_ALWAYS_QUEUE bit clear and a software provider is used for
+ * the request. The request is completed in the context of the calling
+ * thread and kernel memory must be allocated with KM_NOSLEEP.
+ *
+ * The framework passes a pointer to the handle in crypto_req_handle_t
+ * argument when it calls the SPI of the software provider. The macros
+ * KCF_RHNDL() and KCF_SWFP_RHNDL() are used to do this.
+ *
+ * When a provider asks the framework for kmflag value via
+ * crypto_kmflag(9S) we use REQHNDL2_KMFLAG() macro.
+ */
+extern ulong_t kcf_swprov_hndl;
+#define	KCF_RHNDL(kmflag) (((kmflag) == KM_SLEEP) ? NULL : &kcf_swprov_hndl)
+#define	KCF_SWFP_RHNDL(crq) (((crq) == NULL) ? NULL : &kcf_swprov_hndl)
+#define	REQHNDL2_KMFLAG(rhndl) \
+	((rhndl == &kcf_swprov_hndl) ? KM_NOSLEEP : KM_SLEEP)
+
+/* Internal call_req flags. They start after the public ones in api.h */
+
+#define	CRYPTO_SETDUAL	0x00001000	/* Set the 'cont' boolean before */
+					/* submitting the request */
+#define	KCF_ISDUALREQ(crq)	\
+	(((crq) == NULL) ? B_FALSE : (crq->cr_flag & CRYPTO_SETDUAL))
+
+typedef struct kcf_prov_tried {
+	kcf_provider_desc_t	*pt_pd;
+	struct kcf_prov_tried	*pt_next;
+} kcf_prov_tried_t;
+
+#define	IS_FG_SUPPORTED(mdesc, fg)		\
+	(((mdesc)->pm_mech_info.cm_func_group_mask & (fg)) != 0)
+
+#define	IS_PROVIDER_TRIED(pd, tlist)		\
+	(tlist != NULL && is_in_triedlist(pd, tlist))
+
+#define	IS_RECOVERABLE(error)			\
+	(error == CRYPTO_BUFFER_TOO_BIG ||	\
+	error == CRYPTO_BUSY ||			\
+	error == CRYPTO_DEVICE_ERROR ||		\
+	error == CRYPTO_DEVICE_MEMORY ||	\
+	error == CRYPTO_KEY_SIZE_RANGE ||	\
+	error == CRYPTO_NO_PERMISSION)
+
+#define	KCF_ATOMIC_INCR(x)	atomic_add_32(&(x), 1)
+#define	KCF_ATOMIC_DECR(x)	atomic_add_32(&(x), -1)
+
+/*
+ * Node structure for synchronous requests.
+ */
+typedef struct kcf_sreq_node {
+	/* Should always be the first field in this structure */
+	kcf_call_type_t		sn_type;
+	/*
+	 * sn_cv and sn_lock are used to wait for the
+	 * operation to complete. sn_lock also protects
+	 * the sn_state field.
+	 */
+	kcondvar_t		sn_cv;
+	kmutex_t		sn_lock;
+	kcf_req_status_t	sn_state;
+
+	/*
+	 * Return value from the operation. This will be
+	 * one of the CRYPTO_* errors defined in common.h.
+	 */
+	int			sn_rv;
+
+	/*
+	 * parameters to call the SPI with. This can be
+	 * a pointer as we know the caller context/stack stays.
+	 */
+	struct kcf_req_params	*sn_params;
+
+	/* Internal context for this request */
+	struct kcf_context	*sn_context;
+
+	/* Provider handling this request */
+	kcf_provider_desc_t	*sn_provider;
+} kcf_sreq_node_t;
+
+/*
+ * Node structure for asynchronous requests.
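(First, a usage note on the synchronous node just defined: a minimal sketch, reconstructed from the field comments above and not taken from the patch, of how a submitting thread would block on the sn_cv/sn_lock pair until the request leaves the waiting and in-progress states.)

kcf_sreq_node_t *sreq;	/* assume allocated and already submitted */
int rv;

mutex_enter(&sreq->sn_lock);
while (sreq->sn_state == REQ_WAITING || sreq->sn_state == REQ_INPROGRESS)
	cv_wait(&sreq->sn_cv, &sreq->sn_lock);
rv = sreq->sn_rv;	/* one of the CRYPTO_* codes from common.h */
mutex_exit(&sreq->sn_lock);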
+/*
+ * Node structure for asynchronous requests. A node can be on
+ * a chain of requests hanging off the internal context
+ * structure and can be in the global software provider queue.
+ */
+typedef struct kcf_areq_node {
+	/* Should always be the first field in this structure */
+	kcf_call_type_t		an_type;
+
+	/* an_lock protects the field an_state */
+	kmutex_t		an_lock;
+	kcf_req_status_t	an_state;
+	crypto_call_req_t	an_reqarg;
+
+	/*
+	 * Parameters to call the SPI with. We need to
+	 * save the params since the caller stack can go away.
+	 */
+	struct kcf_req_params	an_params;
+
+	/*
+	 * The next two fields should be NULL for operations that
+	 * don't need a context.
+	 */
+	/* Internal context for this request */
+	struct kcf_context	*an_context;
+
+	/* next in chain of requests for context */
+	struct kcf_areq_node	*an_ctxchain_next;
+
+	kcondvar_t	an_turn_cv;
+	boolean_t	an_is_my_turn;
+	boolean_t	an_isdual;	/* for internal reuse */
+
+	/*
+	 * Next and previous nodes in the global software
+	 * queue. These fields are NULL for a hardware
+	 * provider since we use a taskq there.
+	 */
+	struct kcf_areq_node	*an_next;
+	struct kcf_areq_node	*an_prev;
+
+	/* Provider handling this request */
+	kcf_provider_desc_t	*an_provider;
+	kcf_prov_tried_t	*an_tried_plist;
+
+	struct kcf_areq_node	*an_idnext;	/* Next in ID hash */
+	struct kcf_areq_node	*an_idprev;	/* Prev in ID hash */
+	kcondvar_t		an_done;	/* Signal request completion */
+	uint_t			an_refcnt;
+} kcf_areq_node_t;
+
+#define	KCF_AREQ_REFHOLD(areq) {		\
+	atomic_add_32(&(areq)->an_refcnt, 1);	\
+	ASSERT((areq)->an_refcnt != 0);		\
+}
+
+#define	KCF_AREQ_REFRELE(areq) {				\
+	ASSERT((areq)->an_refcnt != 0);				\
+	membar_exit();						\
+	if (atomic_add_32_nv(&(areq)->an_refcnt, -1) == 0)	\
+		kcf_free_req(areq);				\
+}
+
+#define	GET_REQ_TYPE(arg) *((kcf_call_type_t *)(arg))
+
+#define	NOTIFY_CLIENT(areq, err) (*(areq)->an_reqarg.cr_callback_func)(\
+	(areq)->an_reqarg.cr_callback_arg, err);
+
+/* For internally generated call requests for dual operations */
+typedef struct kcf_call_req {
+	crypto_call_req_t	kr_callreq;	/* external client call req */
+	kcf_req_params_t	kr_params;	/* Params saved for next call */
+	kcf_areq_node_t		*kr_areq;	/* Use this areq */
+	off_t			kr_saveoffset;
+	size_t			kr_savelen;
+} kcf_dual_req_t;
+
+/*
+ * The following are somewhat similar to macros in callo.h, which implement
+ * callout tables.
+ *
+ * The lower four bits of the ID are used to encode the table ID to
+ * index into. The REQID_COUNTER_HIGH bit is used to avoid any check for
+ * wrap around when generating an ID. We assume that there won't be a request
+ * which takes more time than 2^^(sizeof (long) - 5) other requests submitted
+ * after it. This ensures there won't be any ID collision.
+ */
+#define	REQID_COUNTER_HIGH	(1UL << (8 * sizeof (long) - 1))
+#define	REQID_COUNTER_SHIFT	4
+#define	REQID_COUNTER_LOW	(1 << REQID_COUNTER_SHIFT)
+#define	REQID_TABLES		16
+#define	REQID_TABLE_MASK	(REQID_TABLES - 1)
+
+#define	REQID_BUCKETS		512
+#define	REQID_BUCKET_MASK	(REQID_BUCKETS - 1)
+#define	REQID_HASH(id)	(((id) >> REQID_COUNTER_SHIFT) & REQID_BUCKET_MASK)
+
+#define	GET_REQID(areq) (areq)->an_reqarg.cr_reqid
+#define	SET_REQID(areq, val)	GET_REQID(areq) = val
+
+/*
+ * Hash table for async requests.
+ */
+typedef struct kcf_reqid_table {
+	kmutex_t		rt_lock;
+	crypto_req_id_t		rt_curid;
+	kcf_areq_node_t		*rt_idhash[REQID_BUCKETS];
+} kcf_reqid_table_t;
+
+/*
+ * Global software provider queue structure. Requests to be
+ * handled by a SW provider that have the ALWAYS_QUEUE flag set
+ * get queued here.
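+ *
+ * A client opts a request into this queue through its call request
+ * (editor's sketch; my_done_cb and my_state are hypothetical):
+ *
+ *	crypto_call_req_t cr;
+ *
+ *	cr.cr_flag = CRYPTO_ALWAYS_QUEUE;
+ *	cr.cr_callback_func = my_done_cb;
+ *	cr.cr_callback_arg = my_state;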
+ */ +typedef struct kcf_global_swq { + /* + * gs_cv and gs_lock are used to wait for new requests. + * gs_lock protects the changes to the queue. + */ + kcondvar_t gs_cv; + kmutex_t gs_lock; + uint_t gs_njobs; + uint_t gs_maxjobs; + kcf_areq_node_t *gs_first; + kcf_areq_node_t *gs_last; +} kcf_global_swq_t; + + +/* + * Internal representation of a canonical context. We contain crypto_ctx_t + * structure in order to have just one memory allocation. The SPI + * ((crypto_ctx_t *)ctx)->cc_framework_private maps to this structure. + */ +typedef struct kcf_context { + crypto_ctx_t kc_glbl_ctx; + uint_t kc_refcnt; + kmutex_t kc_in_use_lock; + /* + * kc_req_chain_first and kc_req_chain_last are used to chain + * multiple async requests using the same context. They should be + * NULL for sync requests. + */ + kcf_areq_node_t *kc_req_chain_first; + kcf_areq_node_t *kc_req_chain_last; + kcf_provider_desc_t *kc_prov_desc; /* Prov. descriptor */ + kcf_provider_desc_t *kc_sw_prov_desc; /* Prov. descriptor */ + kcf_mech_entry_t *kc_mech; + struct kcf_context *kc_secondctx; /* for dual contexts */ +} kcf_context_t; + +/* + * Bump up the reference count on the framework private context. A + * global context or a request that references this structure should + * do a hold. + */ +#define KCF_CONTEXT_REFHOLD(ictx) { \ + atomic_add_32(&(ictx)->kc_refcnt, 1); \ + ASSERT((ictx)->kc_refcnt != 0); \ +} + +/* + * Decrement the reference count on the framework private context. + * When the last reference is released, the framework private + * context structure is freed along with the global context. + */ +#define KCF_CONTEXT_REFRELE(ictx) { \ + ASSERT((ictx)->kc_refcnt != 0); \ + membar_exit(); \ + if (atomic_add_32_nv(&(ictx)->kc_refcnt, -1) == 0) \ + kcf_free_context(ictx); \ +} + +/* + * Check if we can release the context now. In case of CRYPTO_QUEUED + * we do not release it as we can do it only after the provider notified + * us. In case of CRYPTO_BUSY, the client can retry the request using + * the context, so we do not release the context. + * + * This macro should be called only from the final routine in + * an init/update/final sequence. We do not release the context in case + * of update operations. We require the consumer to free it + * explicitly, in case it wants to abandon the operation. This is done + * as there may be mechanisms in ECB mode that can continue even if + * an operation on a block fails. + */ +#define KCF_CONTEXT_COND_RELEASE(rv, kcf_ctx) { \ + if (KCF_CONTEXT_DONE(rv)) \ + KCF_CONTEXT_REFRELE(kcf_ctx); \ +} + +/* + * This macro determines whether we're done with a context. + */ +#define KCF_CONTEXT_DONE(rv) \ + ((rv) != CRYPTO_QUEUED && (rv) != CRYPTO_BUSY && \ + (rv) != CRYPTO_BUFFER_TOO_SMALL) + +/* + * A crypto_ctx_template_t is internally a pointer to this struct + */ +typedef struct kcf_ctx_template { + crypto_kcf_provider_handle_t ct_prov_handle; /* provider handle */ + uint_t ct_generation; /* generation # */ + size_t ct_size; /* for freeing */ + crypto_spi_ctx_template_t ct_prov_tmpl; /* context template */ + /* from the SW prov */ +} kcf_ctx_template_t; + +/* + * Structure for pool of threads working on global software queue. + */ +typedef struct kcf_pool { + uint32_t kp_threads; /* Number of threads in pool */ + uint32_t kp_idlethreads; /* Idle threads in pool */ + uint32_t kp_blockedthreads; /* Blocked threads in pool */ + + /* + * cv & lock to monitor the condition when no threads + * are around. In this case the failover thread kicks in. 
+ */ + kcondvar_t kp_nothr_cv; + kmutex_t kp_thread_lock; + + /* Userspace thread creator variables. */ + boolean_t kp_signal_create_thread; /* Create requested flag */ + int kp_nthrs; /* # of threads to create */ + boolean_t kp_user_waiting; /* Thread waiting for work */ + + /* + * cv & lock for the condition where more threads need to be + * created. kp_user_lock also protects the three fields above. + */ + kcondvar_t kp_user_cv; /* Creator cond. variable */ + kmutex_t kp_user_lock; /* Creator lock */ +} kcf_pool_t; + + +/* + * State of a crypto bufcall element. + */ +typedef enum cbuf_state { + CBUF_FREE = 1, + CBUF_WAITING, + CBUF_RUNNING +} cbuf_state_t; + +/* + * Structure of a crypto bufcall element. + */ +typedef struct kcf_cbuf_elem { + /* + * lock and cv to wait for CBUF_RUNNING to be done + * kc_lock also protects kc_state. + */ + kmutex_t kc_lock; + kcondvar_t kc_cv; + cbuf_state_t kc_state; + + struct kcf_cbuf_elem *kc_next; + struct kcf_cbuf_elem *kc_prev; + + void (*kc_func)(void *arg); + void *kc_arg; +} kcf_cbuf_elem_t; + +/* + * State of a notify element. + */ +typedef enum ntfy_elem_state { + NTFY_WAITING = 1, + NTFY_RUNNING +} ntfy_elem_state_t; + +/* + * Structure of a notify list element. + */ +typedef struct kcf_ntfy_elem { + /* + * lock and cv to wait for NTFY_RUNNING to be done. + * kn_lock also protects kn_state. + */ + kmutex_t kn_lock; + kcondvar_t kn_cv; + ntfy_elem_state_t kn_state; + + struct kcf_ntfy_elem *kn_next; + struct kcf_ntfy_elem *kn_prev; + + crypto_notify_callback_t kn_func; + uint32_t kn_event_mask; +} kcf_ntfy_elem_t; + + +/* + * The following values are based on the assumption that it would + * take around eight cpus to load a hardware provider (This is true for + * at least one product) and a kernel client may come from different + * low-priority interrupt levels. We will have CRYPTO_TASKQ_MIN number + * of cached taskq entries. The CRYPTO_TASKQ_MAX number is based on + * a throughput of 1GB/s using 512-byte buffers. These are just + * reasonable estimates and might need to change in future. + */ +#define CRYPTO_TASKQ_THREADS 8 +#define CRYPTO_TASKQ_MIN 64 +#define CRYPTO_TASKQ_MAX 2 * 1024 * 1024 + +extern int crypto_taskq_threads; +extern int crypto_taskq_minalloc; +extern int crypto_taskq_maxalloc; +extern kcf_global_swq_t *gswq; +extern int kcf_maxthreads; +extern int kcf_minthreads; + +/* + * All pending crypto bufcalls are put on a list. cbuf_list_lock + * protects changes to this list. + */ +extern kmutex_t cbuf_list_lock; +extern kcondvar_t cbuf_list_cv; + +/* + * All event subscribers are put on a list. kcf_notify_list_lock + * protects changes to this list. 
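+ *
+ * A simplified sketch of the fan-out done by kcf_walk_ntfylist()
+ * (editor's illustration; the list head name is hypothetical and the
+ * real code also tracks the NTFY_RUNNING state of each element):
+ *
+ *	mutex_enter(&ntfy_list_lock);
+ *	for (nep = ntfy_list_head; nep != NULL; nep = nep->kn_next) {
+ *		if (nep->kn_event_mask & event)
+ *			(*nep->kn_func)(event, event_arg);
+ *	}
+ *	mutex_exit(&ntfy_list_lock);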
+ */ +extern kmutex_t ntfy_list_lock; +extern kcondvar_t ntfy_list_cv; + +boolean_t kcf_get_next_logical_provider_member(kcf_provider_desc_t *, + kcf_provider_desc_t *, kcf_provider_desc_t **); +extern int kcf_get_hardware_provider(crypto_mech_type_t, crypto_mech_type_t, + boolean_t, kcf_provider_desc_t *, kcf_provider_desc_t **, + crypto_func_group_t); +extern int kcf_get_hardware_provider_nomech(offset_t, offset_t, + boolean_t, kcf_provider_desc_t *, kcf_provider_desc_t **); +extern void kcf_free_triedlist(kcf_prov_tried_t *); +extern kcf_prov_tried_t *kcf_insert_triedlist(kcf_prov_tried_t **, + kcf_provider_desc_t *, int); +extern kcf_provider_desc_t *kcf_get_mech_provider(crypto_mech_type_t, + kcf_mech_entry_t **, int *, kcf_prov_tried_t *, crypto_func_group_t, + boolean_t, size_t); +extern kcf_provider_desc_t *kcf_get_dual_provider(crypto_mechanism_t *, + crypto_mechanism_t *, kcf_mech_entry_t **, crypto_mech_type_t *, + crypto_mech_type_t *, int *, kcf_prov_tried_t *, + crypto_func_group_t, crypto_func_group_t, boolean_t, size_t); +extern crypto_ctx_t *kcf_new_ctx(crypto_call_req_t *, kcf_provider_desc_t *, + crypto_session_id_t); +extern int kcf_submit_request(kcf_provider_desc_t *, crypto_ctx_t *, + crypto_call_req_t *, kcf_req_params_t *, boolean_t); +extern void kcf_sched_destroy(void); +extern void kcf_sched_init(void); +extern void kcf_sched_start(void); +extern void kcf_sop_done(kcf_sreq_node_t *, int); +extern void kcf_aop_done(kcf_areq_node_t *, int); +extern int common_submit_request(kcf_provider_desc_t *, + crypto_ctx_t *, kcf_req_params_t *, crypto_req_handle_t); +extern void kcf_free_context(kcf_context_t *); + +extern int kcf_svc_wait(int *); +extern int kcf_svc_do_run(void); +extern int kcf_need_signature_verification(kcf_provider_desc_t *); +extern void kcf_verify_signature(void *); +extern struct modctl *kcf_get_modctl(crypto_provider_info_t *); +extern void verify_unverified_providers(void); +extern void kcf_free_req(kcf_areq_node_t *areq); +extern void crypto_bufcall_service(void); + +extern void kcf_walk_ntfylist(uint32_t, void *); +extern void kcf_do_notify(kcf_provider_desc_t *, boolean_t); + +extern kcf_dual_req_t *kcf_alloc_req(crypto_call_req_t *); +extern void kcf_next_req(void *, int); +extern void kcf_last_req(void *, int); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_CRYPTO_SCHED_IMPL_H */ diff --git a/module/icp/include/sys/crypto/spi.h b/module/icp/include/sys/crypto/spi.h new file mode 100644 index 000000000000..2c62b5706651 --- /dev/null +++ b/module/icp/include/sys/crypto/spi.h @@ -0,0 +1,726 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#ifndef	_SYS_CRYPTO_SPI_H
+#define	_SYS_CRYPTO_SPI_H
+
+/*
+ * CSPI: Cryptographic Service Provider Interface.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+
+#ifdef	__cplusplus
+extern "C" {
+#endif
+
+#ifdef CONSTIFY_PLUGIN
+#define	__no_const __attribute__((no_const))
+#else
+#define	__no_const
+#endif /* CONSTIFY_PLUGIN */
+
+#define	CRYPTO_SPI_VERSION_1	1
+#define	CRYPTO_SPI_VERSION_2	2
+#define	CRYPTO_SPI_VERSION_3	3
+
+/*
+ * Provider-private handle. This handle is specified by a provider
+ * when it registers by means of the pi_provider_handle field of
+ * the crypto_provider_info structure, and passed to the provider
+ * when its entry points are invoked.
+ */
+typedef void *crypto_provider_handle_t;
+
+/*
+ * Context templates can be used by software providers to pre-process
+ * keying material, such as key schedules. They are allocated by
+ * a software provider's create_ctx_template(9E) entry point, and passed
+ * as an argument to initialization and atomic provider entry points.
+ */
+typedef void *crypto_spi_ctx_template_t;
+
+/*
+ * Request handles are used by the kernel to identify an asynchronous
+ * request being processed by a provider. It is passed by the kernel
+ * to a hardware provider when submitting a request, and must be
+ * specified by a provider when calling crypto_op_notification(9F).
+ */
+typedef void *crypto_req_handle_t;
+
+/* Values for cc_flags field */
+#define	CRYPTO_INIT_OPSTATE	0x00000001 /* allocate and init cc_opstate */
+#define	CRYPTO_USE_OPSTATE	0x00000002 /* .. start using it as context */
+
+/*
+ * The context structure is passed from the kernel to a provider.
+ * It contains the information needed to process a multi-part or
+ * single part operation. The context structure is not used
+ * by atomic operations.
+ *
+ * Parameters needed to perform a cryptographic operation, such
+ * as keys, mechanisms, input and output buffers, are passed
+ * as separate arguments to Provider routines.
+ */
+typedef struct crypto_ctx {
+	crypto_provider_handle_t cc_provider;
+	crypto_session_id_t cc_session;
+	void *cc_provider_private;	/* owned by provider */
+	void *cc_framework_private;	/* owned by framework */
+	uint32_t cc_flags;		/* flags */
+	void *cc_opstate;		/* state */
+} crypto_ctx_t;
+
+/*
+ * Extended provider information.
+ */
+
+/*
+ * Valid values for the ei_flags field of the extended info structure.
+ * They match the RSA Security, Inc. PKCS#11 tokenInfo flags.
+ */
+#define	CRYPTO_EXTF_RNG					0x00000001
+#define	CRYPTO_EXTF_WRITE_PROTECTED			0x00000002
+#define	CRYPTO_EXTF_LOGIN_REQUIRED			0x00000004
+#define	CRYPTO_EXTF_USER_PIN_INITIALIZED		0x00000008
+#define	CRYPTO_EXTF_CLOCK_ON_TOKEN			0x00000040
+#define	CRYPTO_EXTF_PROTECTED_AUTHENTICATION_PATH	0x00000100
+#define	CRYPTO_EXTF_DUAL_CRYPTO_OPERATIONS		0x00000200
+#define	CRYPTO_EXTF_TOKEN_INITIALIZED			0x00000400
+#define	CRYPTO_EXTF_USER_PIN_COUNT_LOW			0x00010000
+#define	CRYPTO_EXTF_USER_PIN_FINAL_TRY			0x00020000
+#define	CRYPTO_EXTF_USER_PIN_LOCKED			0x00040000
+#define	CRYPTO_EXTF_USER_PIN_TO_BE_CHANGED		0x00080000
+#define	CRYPTO_EXTF_SO_PIN_COUNT_LOW			0x00100000
+#define	CRYPTO_EXTF_SO_PIN_FINAL_TRY			0x00200000
+#define	CRYPTO_EXTF_SO_PIN_LOCKED			0x00400000
+#define	CRYPTO_EXTF_SO_PIN_TO_BE_CHANGED		0x00800000
+
+/*
+ * The crypto_control_ops structure contains pointers to control
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
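+ *
+ * The single entry point is trivial for most software providers
+ * (editor's sketch; the function name is hypothetical):
+ *
+ *	static void
+ *	example_provider_status(crypto_provider_handle_t pd, uint_t *status)
+ *	{
+ *		*status = CRYPTO_PROVIDER_READY;
+ *	}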
+ */
+typedef struct crypto_control_ops {
+	void (*provider_status)(crypto_provider_handle_t, uint_t *);
+} __no_const crypto_control_ops_t;
+
+/*
+ * The crypto_ctx_ops structure contains pointers to context and
+ * context-template management operations for cryptographic providers.
+ * It is passed through the crypto_ops(9S) structure when providers
+ * register with the kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_ctx_ops {
+	int (*create_ctx_template)(crypto_provider_handle_t,
+	    crypto_mechanism_t *, crypto_key_t *,
+	    crypto_spi_ctx_template_t *, size_t *, crypto_req_handle_t);
+	int (*free_context)(crypto_ctx_t *);
+} __no_const crypto_ctx_ops_t;
+
+/*
+ * The crypto_digest_ops structure contains pointers to digest
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_digest_ops {
+	int (*digest_init)(crypto_ctx_t *, crypto_mechanism_t *,
+	    crypto_req_handle_t);
+	int (*digest)(crypto_ctx_t *, crypto_data_t *, crypto_data_t *,
+	    crypto_req_handle_t);
+	int (*digest_update)(crypto_ctx_t *, crypto_data_t *,
+	    crypto_req_handle_t);
+	int (*digest_key)(crypto_ctx_t *, crypto_key_t *, crypto_req_handle_t);
+	int (*digest_final)(crypto_ctx_t *, crypto_data_t *,
+	    crypto_req_handle_t);
+	int (*digest_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+	    crypto_mechanism_t *, crypto_data_t *,
+	    crypto_data_t *, crypto_req_handle_t);
+} __no_const crypto_digest_ops_t;
+
+/*
+ * The crypto_cipher_ops structure contains pointers to encryption
+ * and decryption operations for cryptographic providers. It is
+ * passed through the crypto_ops(9S) structure when providers register
+ * with the kernel using crypto_register_provider(9F).
+ */
+typedef struct crypto_cipher_ops {
+	int (*encrypt_init)(crypto_ctx_t *,
+	    crypto_mechanism_t *, crypto_key_t *,
+	    crypto_spi_ctx_template_t, crypto_req_handle_t);
+	int (*encrypt)(crypto_ctx_t *,
+	    crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+	int (*encrypt_update)(crypto_ctx_t *,
+	    crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+	int (*encrypt_final)(crypto_ctx_t *,
+	    crypto_data_t *, crypto_req_handle_t);
+	int (*encrypt_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+	    crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+	    crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+
+	int (*decrypt_init)(crypto_ctx_t *,
+	    crypto_mechanism_t *, crypto_key_t *,
+	    crypto_spi_ctx_template_t, crypto_req_handle_t);
+	int (*decrypt)(crypto_ctx_t *,
+	    crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+	int (*decrypt_update)(crypto_ctx_t *,
+	    crypto_data_t *, crypto_data_t *, crypto_req_handle_t);
+	int (*decrypt_final)(crypto_ctx_t *,
+	    crypto_data_t *, crypto_req_handle_t);
+	int (*decrypt_atomic)(crypto_provider_handle_t, crypto_session_id_t,
+	    crypto_mechanism_t *, crypto_key_t *, crypto_data_t *,
+	    crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t);
+} __no_const crypto_cipher_ops_t;
+
+/*
+ * The crypto_mac_ops structure contains pointers to MAC
+ * operations for cryptographic providers. It is passed through
+ * the crypto_ops(9S) structure when providers register with the
+ * kernel using crypto_register_provider(9F).
+ */ +typedef struct crypto_mac_ops { + int (*mac_init)(crypto_ctx_t *, + crypto_mechanism_t *, crypto_key_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); + int (*mac)(crypto_ctx_t *, + crypto_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*mac_update)(crypto_ctx_t *, + crypto_data_t *, crypto_req_handle_t); + int (*mac_final)(crypto_ctx_t *, + crypto_data_t *, crypto_req_handle_t); + int (*mac_atomic)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*mac_verify_atomic)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); +} __no_const crypto_mac_ops_t; + +/* + * The crypto_sign_ops structure contains pointers to signing + * operations for cryptographic providers. It is passed through + * the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). + */ +typedef struct crypto_sign_ops { + int (*sign_init)(crypto_ctx_t *, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*sign)(crypto_ctx_t *, + crypto_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*sign_update)(crypto_ctx_t *, + crypto_data_t *, crypto_req_handle_t); + int (*sign_final)(crypto_ctx_t *, + crypto_data_t *, crypto_req_handle_t); + int (*sign_atomic)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*sign_recover_init)(crypto_ctx_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*sign_recover)(crypto_ctx_t *, + crypto_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*sign_recover_atomic)(crypto_provider_handle_t, + crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, + crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); +} __no_const crypto_sign_ops_t; + +/* + * The crypto_verify_ops structure contains pointers to verify + * operations for cryptographic providers. It is passed through + * the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). 
+ */ +typedef struct crypto_verify_ops { + int (*verify_init)(crypto_ctx_t *, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*do_verify)(crypto_ctx_t *, + crypto_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*verify_update)(crypto_ctx_t *, + crypto_data_t *, crypto_req_handle_t); + int (*verify_final)(crypto_ctx_t *, + crypto_data_t *, crypto_req_handle_t); + int (*verify_atomic)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*verify_recover_init)(crypto_ctx_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); + int (*verify_recover)(crypto_ctx_t *, + crypto_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*verify_recover_atomic)(crypto_provider_handle_t, + crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, + crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, + crypto_req_handle_t); +} __no_const crypto_verify_ops_t; + +/* + * The crypto_dual_ops structure contains pointers to dual + * cipher and sign/verify operations for cryptographic providers. + * It is passed through the crypto_ops(9S) structure when + * providers register with the kernel using + * crypto_register_provider(9F). + */ +typedef struct crypto_dual_ops { + int (*digest_encrypt_update)( + crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *, + crypto_data_t *, crypto_req_handle_t); + int (*decrypt_digest_update)( + crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *, + crypto_data_t *, crypto_req_handle_t); + int (*sign_encrypt_update)( + crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *, + crypto_data_t *, crypto_req_handle_t); + int (*decrypt_verify_update)( + crypto_ctx_t *, crypto_ctx_t *, crypto_data_t *, + crypto_data_t *, crypto_req_handle_t); +} __no_const crypto_dual_ops_t; + +/* + * The crypto_dual_cipher_mac_ops structure contains pointers to dual + * cipher and MAC operations for cryptographic providers. + * It is passed through the crypto_ops(9S) structure when + * providers register with the kernel using + * crypto_register_provider(9F). 
+ */ +typedef struct crypto_dual_cipher_mac_ops { + int (*encrypt_mac_init)(crypto_ctx_t *, + crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, + crypto_spi_ctx_template_t, crypto_req_handle_t); + int (*encrypt_mac)(crypto_ctx_t *, + crypto_data_t *, crypto_dual_data_t *, crypto_data_t *, + crypto_req_handle_t); + int (*encrypt_mac_update)(crypto_ctx_t *, + crypto_data_t *, crypto_dual_data_t *, crypto_req_handle_t); + int (*encrypt_mac_final)(crypto_ctx_t *, + crypto_dual_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*encrypt_mac_atomic)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_data_t *, crypto_dual_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, + crypto_spi_ctx_template_t, crypto_req_handle_t); + + int (*mac_decrypt_init)(crypto_ctx_t *, + crypto_mechanism_t *, crypto_key_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, + crypto_spi_ctx_template_t, crypto_req_handle_t); + int (*mac_decrypt)(crypto_ctx_t *, + crypto_dual_data_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); + int (*mac_decrypt_update)(crypto_ctx_t *, + crypto_dual_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*mac_decrypt_final)(crypto_ctx_t *, + crypto_data_t *, crypto_data_t *, crypto_req_handle_t); + int (*mac_decrypt_atomic)(crypto_provider_handle_t, + crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, + crypto_mechanism_t *, crypto_key_t *, crypto_dual_data_t *, + crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, + crypto_spi_ctx_template_t, crypto_req_handle_t); + int (*mac_verify_decrypt_atomic)(crypto_provider_handle_t, + crypto_session_id_t, crypto_mechanism_t *, crypto_key_t *, + crypto_mechanism_t *, crypto_key_t *, crypto_dual_data_t *, + crypto_data_t *, crypto_data_t *, crypto_spi_ctx_template_t, + crypto_spi_ctx_template_t, crypto_req_handle_t); +} __no_const crypto_dual_cipher_mac_ops_t; + +/* + * The crypto_random_number_ops structure contains pointers to random + * number operations for cryptographic providers. It is passed through + * the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). + */ +typedef struct crypto_random_number_ops { + int (*seed_random)(crypto_provider_handle_t, crypto_session_id_t, + uchar_t *, size_t, uint_t, uint32_t, crypto_req_handle_t); + int (*generate_random)(crypto_provider_handle_t, crypto_session_id_t, + uchar_t *, size_t, crypto_req_handle_t); +} __no_const crypto_random_number_ops_t; + +/* + * Flag values for seed_random. + */ +#define CRYPTO_SEED_NOW 0x00000001 + +/* + * The crypto_session_ops structure contains pointers to session + * operations for cryptographic providers. It is passed through + * the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). 
+ */ +typedef struct crypto_session_ops { + int (*session_open)(crypto_provider_handle_t, crypto_session_id_t *, + crypto_req_handle_t); + int (*session_close)(crypto_provider_handle_t, crypto_session_id_t, + crypto_req_handle_t); + int (*session_login)(crypto_provider_handle_t, crypto_session_id_t, + crypto_user_type_t, char *, size_t, crypto_req_handle_t); + int (*session_logout)(crypto_provider_handle_t, crypto_session_id_t, + crypto_req_handle_t); +} __no_const crypto_session_ops_t; + +/* + * The crypto_object_ops structure contains pointers to object + * operations for cryptographic providers. It is passed through + * the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). + */ +typedef struct crypto_object_ops { + int (*object_create)(crypto_provider_handle_t, crypto_session_id_t, + crypto_object_attribute_t *, uint_t, crypto_object_id_t *, + crypto_req_handle_t); + int (*object_copy)(crypto_provider_handle_t, crypto_session_id_t, + crypto_object_id_t, crypto_object_attribute_t *, uint_t, + crypto_object_id_t *, crypto_req_handle_t); + int (*object_destroy)(crypto_provider_handle_t, crypto_session_id_t, + crypto_object_id_t, crypto_req_handle_t); + int (*object_get_size)(crypto_provider_handle_t, crypto_session_id_t, + crypto_object_id_t, size_t *, crypto_req_handle_t); + int (*object_get_attribute_value)(crypto_provider_handle_t, + crypto_session_id_t, crypto_object_id_t, + crypto_object_attribute_t *, uint_t, crypto_req_handle_t); + int (*object_set_attribute_value)(crypto_provider_handle_t, + crypto_session_id_t, crypto_object_id_t, + crypto_object_attribute_t *, uint_t, crypto_req_handle_t); + int (*object_find_init)(crypto_provider_handle_t, crypto_session_id_t, + crypto_object_attribute_t *, uint_t, void **, + crypto_req_handle_t); + int (*object_find)(crypto_provider_handle_t, void *, + crypto_object_id_t *, uint_t, uint_t *, crypto_req_handle_t); + int (*object_find_final)(crypto_provider_handle_t, void *, + crypto_req_handle_t); +} __no_const crypto_object_ops_t; + +/* + * The crypto_key_ops structure contains pointers to key + * operations for cryptographic providers. It is passed through + * the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). 
+ */ +typedef struct crypto_key_ops { + int (*key_generate)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_object_attribute_t *, uint_t, + crypto_object_id_t *, crypto_req_handle_t); + int (*key_generate_pair)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_object_attribute_t *, uint_t, + crypto_object_attribute_t *, uint_t, crypto_object_id_t *, + crypto_object_id_t *, crypto_req_handle_t); + int (*key_wrap)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_object_id_t *, + uchar_t *, size_t *, crypto_req_handle_t); + int (*key_unwrap)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, uchar_t *, size_t *, + crypto_object_attribute_t *, uint_t, + crypto_object_id_t *, crypto_req_handle_t); + int (*key_derive)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *, + uint_t, crypto_object_id_t *, crypto_req_handle_t); + int (*key_check)(crypto_provider_handle_t, crypto_mechanism_t *, + crypto_key_t *); +} __no_const crypto_key_ops_t; + +/* + * The crypto_provider_management_ops structure contains pointers + * to management operations for cryptographic providers. It is passed + * through the crypto_ops(9S) structure when providers register with the + * kernel using crypto_register_provider(9F). + */ +typedef struct crypto_provider_management_ops { + int (*ext_info)(crypto_provider_handle_t, + crypto_provider_ext_info_t *, crypto_req_handle_t); + int (*init_token)(crypto_provider_handle_t, char *, size_t, + char *, crypto_req_handle_t); + int (*init_pin)(crypto_provider_handle_t, crypto_session_id_t, + char *, size_t, crypto_req_handle_t); + int (*set_pin)(crypto_provider_handle_t, crypto_session_id_t, + char *, size_t, char *, size_t, crypto_req_handle_t); +} __no_const crypto_provider_management_ops_t; + +typedef struct crypto_mech_ops { + int (*copyin_mechanism)(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_mechanism_t *, int *, int); + int (*copyout_mechanism)(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_mechanism_t *, int *, int); + int (*free_mechanism)(crypto_provider_handle_t, crypto_mechanism_t *); +} __no_const crypto_mech_ops_t; + +typedef struct crypto_nostore_key_ops { + int (*nostore_key_generate)(crypto_provider_handle_t, + crypto_session_id_t, crypto_mechanism_t *, + crypto_object_attribute_t *, uint_t, crypto_object_attribute_t *, + uint_t, crypto_req_handle_t); + int (*nostore_key_generate_pair)(crypto_provider_handle_t, + crypto_session_id_t, crypto_mechanism_t *, + crypto_object_attribute_t *, uint_t, crypto_object_attribute_t *, + uint_t, crypto_object_attribute_t *, uint_t, + crypto_object_attribute_t *, uint_t, crypto_req_handle_t); + int (*nostore_key_derive)(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_object_attribute_t *, + uint_t, crypto_object_attribute_t *, uint_t, crypto_req_handle_t); +} __no_const crypto_nostore_key_ops_t; + +/* + * The crypto_ops(9S) structure contains the structures containing + * the pointers to functions implemented by cryptographic providers. + * It is specified as part of the crypto_provider_info(9S) + * supplied by a provider when it registers with the kernel + * by calling crypto_register_provider(9F). 
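+ *
+ * A digest-only software provider, for instance, might fill in just
+ * two of the vectors and leave the rest NULL (editor's sketch;
+ * example_digest_ops and example_ctx_ops are hypothetical):
+ *
+ *	static crypto_ops_t example_crypto_ops = {
+ *		.co_digest_ops = &example_digest_ops,
+ *		.co_ctx_ops = &example_ctx_ops,
+ *	};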
+ */
+typedef struct crypto_ops_v1 {
+	crypto_control_ops_t			*co_control_ops;
+	crypto_digest_ops_t			*co_digest_ops;
+	crypto_cipher_ops_t			*co_cipher_ops;
+	crypto_mac_ops_t			*co_mac_ops;
+	crypto_sign_ops_t			*co_sign_ops;
+	crypto_verify_ops_t			*co_verify_ops;
+	crypto_dual_ops_t			*co_dual_ops;
+	crypto_dual_cipher_mac_ops_t		*co_dual_cipher_mac_ops;
+	crypto_random_number_ops_t		*co_random_ops;
+	crypto_session_ops_t			*co_session_ops;
+	crypto_object_ops_t			*co_object_ops;
+	crypto_key_ops_t			*co_key_ops;
+	crypto_provider_management_ops_t	*co_provider_ops;
+	crypto_ctx_ops_t			*co_ctx_ops;
+} crypto_ops_v1_t;
+
+typedef struct crypto_ops_v2 {
+	crypto_ops_v1_t				v1_ops;
+	crypto_mech_ops_t			*co_mech_ops;
+} crypto_ops_v2_t;
+
+typedef struct crypto_ops_v3 {
+	crypto_ops_v2_t				v2_ops;
+	crypto_nostore_key_ops_t		*co_nostore_key_ops;
+} crypto_ops_v3_t;
+
+typedef struct crypto_ops {
+	union {
+		crypto_ops_v3_t	cou_v3;
+		crypto_ops_v2_t	cou_v2;
+		crypto_ops_v1_t	cou_v1;
+	} cou;
+} crypto_ops_t;
+
+#define	co_control_ops			cou.cou_v1.co_control_ops
+#define	co_digest_ops			cou.cou_v1.co_digest_ops
+#define	co_cipher_ops			cou.cou_v1.co_cipher_ops
+#define	co_mac_ops			cou.cou_v1.co_mac_ops
+#define	co_sign_ops			cou.cou_v1.co_sign_ops
+#define	co_verify_ops			cou.cou_v1.co_verify_ops
+#define	co_dual_ops			cou.cou_v1.co_dual_ops
+#define	co_dual_cipher_mac_ops		cou.cou_v1.co_dual_cipher_mac_ops
+#define	co_random_ops			cou.cou_v1.co_random_ops
+#define	co_session_ops			cou.cou_v1.co_session_ops
+#define	co_object_ops			cou.cou_v1.co_object_ops
+#define	co_key_ops			cou.cou_v1.co_key_ops
+#define	co_provider_ops			cou.cou_v1.co_provider_ops
+#define	co_ctx_ops			cou.cou_v1.co_ctx_ops
+#define	co_mech_ops			cou.cou_v2.co_mech_ops
+#define	co_nostore_key_ops		cou.cou_v3.co_nostore_key_ops
+
+/*
+ * The mechanism info structure crypto_mech_info_t contains a function group
+ * bit mask cm_func_group_mask. This field, of type crypto_func_group_t,
+ * specifies the provider entry points that can be used with a particular
+ * mechanism. The function group mask is a combination of the following values.
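+ *
+ * For example, a digest-only mechanism would be advertised with
+ * (editor's illustration):
+ *
+ *	cm_func_group_mask = CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC;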
+ */ + +typedef uint32_t crypto_func_group_t; + + +#define CRYPTO_FG_ENCRYPT 0x00000001 /* encrypt_init() */ +#define CRYPTO_FG_DECRYPT 0x00000002 /* decrypt_init() */ +#define CRYPTO_FG_DIGEST 0x00000004 /* digest_init() */ +#define CRYPTO_FG_SIGN 0x00000008 /* sign_init() */ +#define CRYPTO_FG_SIGN_RECOVER 0x00000010 /* sign_recover_init() */ +#define CRYPTO_FG_VERIFY 0x00000020 /* verify_init() */ +#define CRYPTO_FG_VERIFY_RECOVER 0x00000040 /* verify_recover_init() */ +#define CRYPTO_FG_GENERATE 0x00000080 /* key_generate() */ +#define CRYPTO_FG_GENERATE_KEY_PAIR 0x00000100 /* key_generate_pair() */ +#define CRYPTO_FG_WRAP 0x00000200 /* key_wrap() */ +#define CRYPTO_FG_UNWRAP 0x00000400 /* key_unwrap() */ +#define CRYPTO_FG_DERIVE 0x00000800 /* key_derive() */ +#define CRYPTO_FG_MAC 0x00001000 /* mac_init() */ +#define CRYPTO_FG_ENCRYPT_MAC 0x00002000 /* encrypt_mac_init() */ +#define CRYPTO_FG_MAC_DECRYPT 0x00004000 /* decrypt_mac_init() */ +#define CRYPTO_FG_ENCRYPT_ATOMIC 0x00008000 /* encrypt_atomic() */ +#define CRYPTO_FG_DECRYPT_ATOMIC 0x00010000 /* decrypt_atomic() */ +#define CRYPTO_FG_MAC_ATOMIC 0x00020000 /* mac_atomic() */ +#define CRYPTO_FG_DIGEST_ATOMIC 0x00040000 /* digest_atomic() */ +#define CRYPTO_FG_SIGN_ATOMIC 0x00080000 /* sign_atomic() */ +#define CRYPTO_FG_SIGN_RECOVER_ATOMIC 0x00100000 /* sign_recover_atomic() */ +#define CRYPTO_FG_VERIFY_ATOMIC 0x00200000 /* verify_atomic() */ +#define CRYPTO_FG_VERIFY_RECOVER_ATOMIC 0x00400000 /* verify_recover_atomic() */ +#define CRYPTO_FG_ENCRYPT_MAC_ATOMIC 0x00800000 /* encrypt_mac_atomic() */ +#define CRYPTO_FG_MAC_DECRYPT_ATOMIC 0x01000000 /* mac_decrypt_atomic() */ +#define CRYPTO_FG_RESERVED 0x80000000 + +/* + * Maximum length of the pi_provider_description field of the + * crypto_provider_info structure. + */ +#define CRYPTO_PROVIDER_DESCR_MAX_LEN 64 + + +/* Bit mask for all the simple operations */ +#define CRYPTO_FG_SIMPLEOP_MASK (CRYPTO_FG_ENCRYPT | CRYPTO_FG_DECRYPT | \ + CRYPTO_FG_DIGEST | CRYPTO_FG_SIGN | CRYPTO_FG_VERIFY | CRYPTO_FG_MAC | \ + CRYPTO_FG_ENCRYPT_ATOMIC | CRYPTO_FG_DECRYPT_ATOMIC | \ + CRYPTO_FG_MAC_ATOMIC | CRYPTO_FG_DIGEST_ATOMIC | CRYPTO_FG_SIGN_ATOMIC | \ + CRYPTO_FG_VERIFY_ATOMIC) + +/* Bit mask for all the dual operations */ +#define CRYPTO_FG_MAC_CIPHER_MASK (CRYPTO_FG_ENCRYPT_MAC | \ + CRYPTO_FG_MAC_DECRYPT | CRYPTO_FG_ENCRYPT_MAC_ATOMIC | \ + CRYPTO_FG_MAC_DECRYPT_ATOMIC) + +/* Add other combos to CRYPTO_FG_DUAL_MASK */ +#define CRYPTO_FG_DUAL_MASK CRYPTO_FG_MAC_CIPHER_MASK + +/* + * The crypto_mech_info structure specifies one of the mechanisms + * supported by a cryptographic provider. The pi_mechanisms field of + * the crypto_provider_info structure contains a pointer to an array + * of crypto_mech_info's. + */ +typedef struct crypto_mech_info { + crypto_mech_name_t cm_mech_name; + crypto_mech_type_t cm_mech_number; + crypto_func_group_t cm_func_group_mask; + ssize_t cm_min_key_length; + ssize_t cm_max_key_length; + uint32_t cm_mech_flags; +} crypto_mech_info_t; + +/* Alias the old name to the new name for compatibility. */ +#define cm_keysize_unit cm_mech_flags + +/* + * The following is used by a provider that sets + * CRYPTO_HASH_NO_UPDATE. It needs to specify the maximum + * input data size it can digest in this field. + */ +#define cm_max_input_length cm_max_key_length + +/* + * crypto_kcf_provider_handle_t is a handle allocated by the kernel. 
+ * It is returned after the provider registers with
+ * crypto_register_provider(), and must be specified by the provider
+ * when calling crypto_unregister_provider() and
+ * crypto_provider_notification().
+ */
+typedef uint_t crypto_kcf_provider_handle_t;
+
+/*
+ * Provider information. Passed as argument to crypto_register_provider(9F).
+ * Describes the provider and its capabilities. Multiple providers can
+ * register for the same device instance. In this case, the same
+ * pi_provider_dev must be specified with a different pi_provider_handle.
+ */
+typedef struct crypto_provider_info_v1 {
+	uint_t				pi_interface_version;
+	char				*pi_provider_description;
+	crypto_provider_type_t		pi_provider_type;
+	crypto_provider_handle_t	pi_provider_handle;
+	crypto_ops_t			*pi_ops_vector;
+	uint_t				pi_mech_list_count;
+	crypto_mech_info_t		*pi_mechanisms;
+	uint_t				pi_logical_provider_count;
+	crypto_kcf_provider_handle_t	*pi_logical_providers;
+} crypto_provider_info_v1_t;
+
+typedef struct crypto_provider_info_v2 {
+	crypto_provider_info_v1_t	v1_info;
+	uint_t				pi_flags;
+} crypto_provider_info_v2_t;
+
+typedef struct crypto_provider_info {
+	union {
+		crypto_provider_info_v2_t piu_v2;
+		crypto_provider_info_v1_t piu_v1;
+	} piu;
+} crypto_provider_info_t;
+
+#define	pi_interface_version		piu.piu_v1.pi_interface_version
+#define	pi_provider_description		piu.piu_v1.pi_provider_description
+#define	pi_provider_type		piu.piu_v1.pi_provider_type
+#define	pi_provider_handle		piu.piu_v1.pi_provider_handle
+#define	pi_ops_vector			piu.piu_v1.pi_ops_vector
+#define	pi_mech_list_count		piu.piu_v1.pi_mech_list_count
+#define	pi_mechanisms			piu.piu_v1.pi_mechanisms
+#define	pi_logical_provider_count	piu.piu_v1.pi_logical_provider_count
+#define	pi_logical_providers		piu.piu_v1.pi_logical_providers
+#define	pi_flags			piu.piu_v2.pi_flags
+
+/* hidden providers can only be accessed via a logical provider */
+#define	CRYPTO_HIDE_PROVIDER		0x00000001
+/*
+ * provider cannot do multi-part digest (updates) and has a limit
+ * on the maximum input data that it can digest.
+ */
+#define	CRYPTO_HASH_NO_UPDATE		0x00000002
+
+/* provider can handle the request without returning a CRYPTO_QUEUED */
+#define	CRYPTO_SYNCHRONOUS		0x00000004
+
+#define	CRYPTO_PIFLAGS_RESERVED2	0x40000000
+#define	CRYPTO_PIFLAGS_RESERVED1	0x80000000
+
+/*
+ * Provider status passed by a provider to crypto_provider_notification(9F)
+ * and returned by the provider_status(9E) entry point.
+ */
+#define	CRYPTO_PROVIDER_READY		0
+#define	CRYPTO_PROVIDER_BUSY		1
+#define	CRYPTO_PROVIDER_FAILED		2
+
+/*
+ * Functions exported by Solaris to cryptographic providers. Providers
+ * call these functions to register and unregister, notify the kernel
+ * of state changes, and notify the kernel when an asynchronous request
+ * has completed.
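+ *
+ * A typical registration sequence (editor's sketch; "my_info" stands
+ * for a fully initialized crypto_provider_info_t):
+ *
+ *	crypto_kcf_provider_handle_t handle;
+ *
+ *	if (crypto_register_provider(&my_info, &handle) != CRYPTO_SUCCESS)
+ *		return (ENXIO);
+ *	...
+ *	(void) crypto_unregister_provider(handle);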
+ */
+extern int crypto_register_provider(crypto_provider_info_t *,
+    crypto_kcf_provider_handle_t *);
+extern int crypto_unregister_provider(crypto_kcf_provider_handle_t);
+extern void crypto_provider_notification(crypto_kcf_provider_handle_t, uint_t);
+extern void crypto_op_notification(crypto_req_handle_t, int);
+extern int crypto_kmflag(crypto_req_handle_t);
+
+
+#ifdef	__cplusplus
+}
+#endif
+
+#endif	/* _SYS_CRYPTO_SPI_H */
diff --git a/module/icp/include/sys/ia32/asm_linkage.h b/module/icp/include/sys/ia32/asm_linkage.h
new file mode 100644
index 000000000000..f2dae7093b94
--- /dev/null
+++ b/module/icp/include/sys/ia32/asm_linkage.h
@@ -0,0 +1,307 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _IA32_SYS_ASM_LINKAGE_H
+#define _IA32_SYS_ASM_LINKAGE_H
+
+#include <sys/stack.h>
+#include <sys/trap.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifdef _ASM	/* The remainder of this file is only for assembly files */
+
+/*
+ * make annoying differences in assembler syntax go away
+ */
+
+/*
+ * D16 and A16 are used to insert instruction prefixes; the
+ * macros help the assembler code be slightly more portable.
+ */
+#if !defined(__GNUC_AS__)
+/*
+ * /usr/ccs/bin/as prefixes are parsed as separate instructions
+ */
+#define	D16	data16;
+#define	A16	addr16;
+
+/*
+ * (There are some weird constructs in constant expressions)
+ */
+#define	_CONST(const)		[const]
+#define	_BITNOT(const)		-1!_CONST(const)
+#define	_MUL(a, b)		_CONST(a \* b)
+
+#else
+/*
+ * Why not use the 'data16' and 'addr16' prefixes .. well, the
+ * assembler doesn't quite believe in real mode, and thus argues with
+ * us about what we're trying to do.
+ */
+#define	D16	.byte	0x66;
+#define	A16	.byte	0x67;
+
+#define	_CONST(const)		(const)
+#define	_BITNOT(const)		~_CONST(const)
+#define	_MUL(a, b)		_CONST(a * b)
+
+#endif
+
+/*
+ * C pointers are different sizes between i386 and amd64.
+ * These constants can be used to compute offsets into pointer arrays.
+ */
+#if defined(__amd64)
+#define	CLONGSHIFT	3
+#define	CLONGSIZE	8
+#define	CLONGMASK	7
+#elif defined(__i386)
+#define	CLONGSHIFT	2
+#define	CLONGSIZE	4
+#define	CLONGMASK	3
+#endif
+
+/*
+ * Since we know we're either ILP32 or LP64 ..
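+ *
+ * e.g. loading the i'th entry of a pointer array on amd64 (editor's
+ * sketch; %rcx holds the index, which is first scaled to a byte
+ * offset with CPTRSHIFT):
+ *
+ *	salq	$CPTRSHIFT, %rcx
+ *	movq	(%rdi, %rcx), %rax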
+ */ +#define CPTRSHIFT CLONGSHIFT +#define CPTRSIZE CLONGSIZE +#define CPTRMASK CLONGMASK + +#if CPTRSIZE != (1 << CPTRSHIFT) || CLONGSIZE != (1 << CLONGSHIFT) +#error "inconsistent shift constants" +#endif + +#if CPTRMASK != (CPTRSIZE - 1) || CLONGMASK != (CLONGSIZE - 1) +#error "inconsistent mask constants" +#endif + +#define ASM_ENTRY_ALIGN 16 + +/* + * SSE register alignment and save areas + */ + +#define XMM_SIZE 16 +#define XMM_ALIGN 16 + +#if defined(__amd64) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp; \ + movq %rsp, sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addq $_CONST(_MUL(XMM_SIZE, nreg)), %rsp + +#elif defined(__i386) + +#define SAVE_XMM_PROLOG(sreg, nreg) \ + subl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; \ + movl %esp, sreg; \ + addl $XMM_ALIGN, sreg; \ + andl $_BITNOT(XMM_ALIGN-1), sreg + +#define RSTOR_XMM_EPILOG(sreg, nreg) \ + addl $_CONST(_MUL(XMM_SIZE, nreg) + XMM_ALIGN), %esp; + +#endif /* __i386 */ + +/* + * profiling causes definitions of the MCOUNT and RTMCOUNT + * particular to the type + */ +#ifdef GPROF + +#define MCOUNT(x) \ + pushl %ebp; \ + movl %esp, %ebp; \ + call _mcount; \ + popl %ebp + +#endif /* GPROF */ + +#ifdef PROF + +#define MCOUNT(x) \ +/* CSTYLED */ \ + .lcomm .L_/**/x/**/1, 4, 4; \ + pushl %ebp; \ + movl %esp, %ebp; \ +/* CSTYLED */ \ + movl $.L_/**/x/**/1, %edx; \ + call _mcount; \ + popl %ebp + +#endif /* PROF */ + +/* + * if we are not profiling, MCOUNT should be defined to nothing + */ +#if !defined(PROF) && !defined(GPROF) +#define MCOUNT(x) +#endif /* !defined(PROF) && !defined(GPROF) */ + +#define RTMCOUNT(x) MCOUNT(x) + +/* + * Macro to define weak symbol aliases. These are similar to the ANSI-C + * #pragma weak _name = name + * except a compiler can determine type. The assembler must be told. Hence, + * the second parameter must be the type of the symbol (i.e.: function,...) + */ +#define ANSI_PRAGMA_WEAK(sym, stype) \ +/* CSTYLED */ \ + .weak _/**/sym; \ +/* CSTYLED */ \ + .type _/**/sym, @stype; \ +/* CSTYLED */ \ +_/**/sym = sym + +/* + * Like ANSI_PRAGMA_WEAK(), but for unrelated names, as in: + * #pragma weak sym1 = sym2 + */ +#define ANSI_PRAGMA_WEAK2(sym1, sym2, stype) \ + .weak sym1; \ + .type sym1, @stype; \ +sym1 = sym2 + +/* + * ENTRY provides the standard procedure entry code and an easy way to + * insert the calls to mcount for profiling. ENTRY_NP is identical, but + * never calls mcount. + */ +#define ENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: MCOUNT(x) + +#define ENTRY_NP(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: + +#define RTENTRY(x) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x; \ + .type x, @function; \ +x: RTMCOUNT(x) + +/* + * ENTRY2 is identical to ENTRY but provides two labels for the entry point. + */ +#define ENTRY2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ + .type x, @function; \ + .type y, @function; \ +/* CSTYLED */ \ +x: ; \ +y: MCOUNT(x) + +#define ENTRY_NP2(x, y) \ + .text; \ + .align ASM_ENTRY_ALIGN; \ + .globl x, y; \ + .type x, @function; \ + .type y, @function; \ +/* CSTYLED */ \ +x: ; \ +y: + + +/* + * ALTENTRY provides for additional entry points. + */ +#define ALTENTRY(x) \ + .globl x; \ + .type x, @function; \ +x: + +/* + * DGDEF and DGDEF2 provide global data declarations. + * + * DGDEF provides a word aligned word of storage. + * + * DGDEF2 allocates "sz" bytes of storage with **NO** alignment. 
This + * implies this macro is best used for byte arrays. + * + * DGDEF3 allocates "sz" bytes of storage with "algn" alignment. + */ +#define DGDEF2(name, sz) \ + .data; \ + .globl name; \ + .type name, @object; \ + .size name, sz; \ +name: + +#define DGDEF3(name, sz, algn) \ + .data; \ + .align algn; \ + .globl name; \ + .type name, @object; \ + .size name, sz; \ +name: + +#define DGDEF(name) DGDEF3(name, 4, 4) + +/* + * SET_SIZE trails a function and set the size for the ELF symbol table. + */ +#define SET_SIZE(x) \ + .size x, [.-x] + +/* + * NWORD provides native word value. + */ +#if defined(__amd64) + +/*CSTYLED*/ +#define NWORD quad + +#elif defined(__i386) + +#define NWORD long + +#endif /* __i386 */ + +#endif /* _ASM */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_ASM_LINKAGE_H */ diff --git a/module/icp/include/sys/ia32/stack.h b/module/icp/include/sys/ia32/stack.h new file mode 100644 index 000000000000..9e7c089e1182 --- /dev/null +++ b/module/icp/include/sys/ia32/stack.h @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _IA32_SYS_STACK_H +#define _IA32_SYS_STACK_H + +#if !defined(_ASM) + +#include + +#endif + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * In the x86 world, a stack frame looks like this: + * + * |--------------------------| + * 4n+8(%ebp) ->| argument word n | + * | ... | (Previous frame) + * 8(%ebp) ->| argument word 0 | + * |--------------------------|-------------------- + * 4(%ebp) ->| return address | + * |--------------------------| + * 0(%ebp) ->| previous %ebp (optional) | + * |--------------------------| + * -4(%ebp) ->| unspecified | (Current frame) + * | ... | + * 0(%esp) ->| variable size | + * |--------------------------| + */ + +/* + * Stack alignment macros. + */ + +#define STACK_ALIGN32 4 +#define STACK_ENTRY_ALIGN32 4 +#define STACK_BIAS32 0 +#define SA32(x) (((x)+(STACK_ALIGN32-1)) & ~(STACK_ALIGN32-1)) +#define STACK_RESERVE32 0 +#define MINFRAME32 0 + +#if defined(__amd64) + +/* + * In the amd64 world, a stack frame looks like this: + * + * |--------------------------| + * 8n+16(%rbp)->| argument word n | + * | ... | (Previous frame) + * 16(%rbp) ->| argument word 0 | + * |--------------------------|-------------------- + * 8(%rbp) ->| return address | + * |--------------------------| + * 0(%rbp) ->| previous %rbp | + * |--------------------------| + * -8(%rbp) ->| unspecified | (Current frame) + * | ... 
| + * 0(%rsp) ->| variable size | + * |--------------------------| + * -128(%rsp) ->| reserved for function | + * |--------------------------| + * + * The end of the input argument area must be aligned on a 16-byte + * boundary; i.e. (%rsp - 8) % 16 == 0 at function entry. + * + * The 128-byte location beyond %rsp is considered to be reserved for + * functions and is NOT modified by signal handlers. It can be used + * to store temporary data that is not needed across function calls. + */ + +/* + * Stack alignment macros. + */ + +#define STACK_ALIGN64 16 +#define STACK_ENTRY_ALIGN64 8 +#define STACK_BIAS64 0 +#define SA64(x) (((x)+(STACK_ALIGN64-1)) & ~(STACK_ALIGN64-1)) +#define STACK_RESERVE64 128 +#define MINFRAME64 0 + +#define STACK_ALIGN STACK_ALIGN64 +#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN64 +#define STACK_BIAS STACK_BIAS64 +#define SA(x) SA64(x) +#define STACK_RESERVE STACK_RESERVE64 +#define MINFRAME MINFRAME64 + +#elif defined(__i386) + +#define STACK_ALIGN STACK_ALIGN32 +#define STACK_ENTRY_ALIGN STACK_ENTRY_ALIGN32 +#define STACK_BIAS STACK_BIAS32 +#define SA(x) SA32(x) +#define STACK_RESERVE STACK_RESERVE32 +#define MINFRAME MINFRAME32 + +#endif /* __i386 */ + +#if defined(_KERNEL) && !defined(_ASM) + +#if defined(ZFS_DEBUG) +#if STACK_ALIGN == 4 +#define ASSERT_STACK_ALIGNED() \ + { \ + uint32_t __tmp; \ + ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \ + } +#elif (STACK_ALIGN == 16) && (_LONG_DOUBLE_ALIGNMENT == 16) +#define ASSERT_STACK_ALIGNED() \ + { \ + long double __tmp; \ + ASSERT((((uintptr_t)&__tmp) & (STACK_ALIGN - 1)) == 0); \ + } +#endif +#else /* DEBUG */ +#define ASSERT_STACK_ALIGNED() +#endif /* DEBUG */ + +struct regs; + +void traceregs(struct regs *); +void traceback(caddr_t); + +#endif /* defined(_KERNEL) && !defined(_ASM) */ + +#define STACK_GROWTH_DOWN /* stacks grow from high to low addresses */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_STACK_H */ diff --git a/module/icp/include/sys/ia32/trap.h b/module/icp/include/sys/ia32/trap.h new file mode 100644 index 000000000000..55b94969b80b --- /dev/null +++ b/module/icp/include/sys/ia32/trap.h @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* Copyright (c) 1990, 1991 UNIX System Laboratories, Inc. */ +/* Copyright (c) 1984, 1986, 1987, 1988, 1989, 1990 AT&T */ +/* All Rights Reserved */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#ifndef _IA32_SYS_TRAP_H +#define _IA32_SYS_TRAP_H + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Trap type values + */ + +#define T_ZERODIV 0x0 /* #de divide by 0 error */ +#define T_SGLSTP 0x1 /* #db single step */ +#define T_NMIFLT 0x2 /* NMI */ +#define T_BPTFLT 0x3 /* #bp breakpoint fault, INT3 insn */ +#define T_OVFLW 0x4 /* #of INTO overflow fault */ +#define T_BOUNDFLT 0x5 /* #br BOUND insn fault */ +#define T_ILLINST 0x6 /* #ud invalid opcode fault */ +#define T_NOEXTFLT 0x7 /* #nm device not available: x87 */ +#define T_DBLFLT 0x8 /* #df double fault */ +#define T_EXTOVRFLT 0x9 /* [not generated: 386 only] */ +#define T_TSSFLT 0xa /* #ts invalid TSS fault */ +#define T_SEGFLT 0xb /* #np segment not present fault */ +#define T_STKFLT 0xc /* #ss stack fault */ +#define T_GPFLT 0xd /* #gp general protection fault */ +#define T_PGFLT 0xe /* #pf page fault */ +#define T_EXTERRFLT 0x10 /* #mf x87 FPU error fault */ +#define T_ALIGNMENT 0x11 /* #ac alignment check error */ +#define T_MCE 0x12 /* #mc machine check exception */ +#define T_SIMDFPE 0x13 /* #xm SSE/SSE exception */ +#define T_DBGENTR 0x14 /* debugger entry */ +#define T_ENDPERR 0x21 /* emulated extension error flt */ +#define T_ENOEXTFLT 0x20 /* emulated ext not present */ +#define T_FASTTRAP 0xd2 /* fast system call */ +#define T_SYSCALLINT 0x91 /* general system call */ +#define T_DTRACE_RET 0x7f /* DTrace pid return */ +#define T_INT80 0x80 /* int80 handler for linux emulation */ +#define T_SOFTINT 0x50fd /* pseudo softint trap type */ + +/* + * Pseudo traps. + */ +#define T_INTERRUPT 0x100 +#define T_FAULT 0x200 +#define T_AST 0x400 +#define T_SYSCALL 0x180 + + +/* + * Values of error code on stack in case of page fault + */ + +#define PF_ERR_MASK 0x01 /* Mask for error bit */ +#define PF_ERR_PAGE 0x00 /* page not present */ +#define PF_ERR_PROT 0x01 /* protection error */ +#define PF_ERR_WRITE 0x02 /* fault caused by write (else read) */ +#define PF_ERR_USER 0x04 /* processor was in user mode */ + /* (else supervisor) */ +#define PF_ERR_EXEC 0x10 /* attempt to execute a No eXec page (AMD) */ + +/* + * Definitions for fast system call subfunctions + */ +#define T_FNULL 0 /* Null trap for testing */ +#define T_FGETFP 1 /* Get emulated FP context */ +#define T_FSETFP 2 /* Set emulated FP context */ +#define T_GETHRTIME 3 /* Get high resolution time */ +#define T_GETHRVTIME 4 /* Get high resolution virtual time */ +#define T_GETHRESTIME 5 /* Get high resolution time */ +#define T_GETLGRP 6 /* Get home lgrpid */ + +#define T_LASTFAST 6 /* Last valid subfunction */ + +#ifdef __cplusplus +} +#endif + +#endif /* _IA32_SYS_TRAP_H */ diff --git a/module/icp/include/sys/modctl.h b/module/icp/include/sys/modctl.h new file mode 100644 index 000000000000..6c26ad618c93 --- /dev/null +++ b/module/icp/include/sys/modctl.h @@ -0,0 +1,477 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODCTL_H
+#define	_SYS_MODCTL_H
+
+/*
+ * loadable module support.
+ */
+
+#include <sys/zfs_context.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct modlmisc;
+struct modlinkage;
+
+/*
+ * The following structure defines the operations used by modctl
+ * to load and unload modules. Each supported loadable module type
+ * requires a set of mod_ops.
+ */
+struct mod_ops {
+	int	(*modm_install)(struct modlmisc *, struct modlinkage *);
+	int	(*modm_remove)(struct modlmisc *, struct modlinkage *);
+	int	(*modm_info)(void *, struct modlinkage *, int *);
+};
+
+/*
+ * The defined set of mod_ops structures for each loadable module type
+ * Defined in modctl.c
+ */
+extern struct mod_ops mod_brandops;
+#if defined(__i386) || defined(__amd64)
+extern struct mod_ops mod_cpuops;
+#endif
+extern struct mod_ops mod_cryptoops;
+extern struct mod_ops mod_driverops;
+extern struct mod_ops mod_execops;
+extern struct mod_ops mod_fsops;
+extern struct mod_ops mod_miscops;
+extern struct mod_ops mod_schedops;
+extern struct mod_ops mod_strmodops;
+extern struct mod_ops mod_syscallops;
+extern struct mod_ops mod_sockmodops;
+#ifdef _SYSCALL32_IMPL
+extern struct mod_ops mod_syscallops32;
+#endif
+extern struct mod_ops mod_dacfops;
+extern struct mod_ops mod_ippops;
+extern struct mod_ops mod_pcbeops;
+extern struct mod_ops mod_devfsops;
+extern struct mod_ops mod_kiconvops;
+
+/*
+ * Definitions for the module specific linkage structures.
+ * The first two fields are the same in all of the structures.
+ * The linkinfo is for informational purposes only and is returned by
+ * modctl with the MODINFO cmd.
+ */
+
+/* For cryptographic providers */
+struct modlcrypto {
+	struct mod_ops	*crypto_modops;
+	char	*crypto_linkinfo;
+};
+
+/* For misc */
+struct modlmisc {
+	struct mod_ops	*misc_modops;
+	char	*misc_linkinfo;
+};
+
+/*
+ * Revision number of loadable modules support. This is the value
+ * that must be used in the modlinkage structure.
+ */
+#define	MODREV_1	1
+
+/*
+ * The modlinkage structure is the structure that the module writer
+ * provides to the routines to install, remove, and stat a module.
+ * The ml_linkage element is an array of pointers to linkage structures.
+ * For most modules there is only one linkage structure. We allocate
+ * enough space for 3 linkage structures which happens to be the most
+ * we have in any sun supplied module. For those modules with more
+ * than 3 linkage structures (which is very unlikely), a modlinkage
+ * structure must be kmem_alloc'd in the module wrapper to be big enough
+ * for all of the linkage structures.
+ */
+struct modlinkage {
+	int	ml_rev;	/* rev of loadable modules system */
+#ifdef _LP64
+	void	*ml_linkage[7];	/* more space in 64-bit OS */
+#else
+	void	*ml_linkage[4];	/* NULL terminated list of */
+			/* linkage structures */
+#endif
+};
+
+/*
+ * commands. These are the commands supported by the modctl system call.
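+ */
+
+/*
+ * Illustrative sketch, not part of the original sources: the canonical
+ * wiring of a module-specific linkage (here a modlmisc) into a modlinkage,
+ * and the usual _init()/_fini() entry points built on mod_install() and
+ * mod_remove() declared later in this header. All example_* names are
+ * hypothetical, and the guard macro keeps this from ever being compiled.
+ */
+#ifdef	MODCTL_USAGE_EXAMPLE
+static struct modlmisc example_modlmisc = {
+	&mod_miscops,		/* mod_ops for misc modules (declared above) */
+	"example misc module"	/* linkinfo string returned by MODINFO */
+};
+
+static struct modlinkage example_modlinkage = {
+	MODREV_1,		/* ml_rev: loadable modules revision */
+	{ (void *)&example_modlmisc, NULL }	/* NULL-terminated list */
+};
+
+int
+_init(void)
+{
+	return (mod_install(&example_modlinkage));
+}
+
+int
+_fini(void)
+{
+	return (mod_remove(&example_modlinkage));
+}
+#endif	/* MODCTL_USAGE_EXAMPLE */
+
+/*
+ * The command values: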
+ */ +#define MODLOAD 0 +#define MODUNLOAD 1 +#define MODINFO 2 +#define MODRESERVED 3 +#define MODSETMINIROOT 4 +#define MODADDMAJBIND 5 +#define MODGETPATH 6 +#define MODREADSYSBIND 7 +#define MODGETMAJBIND 8 +#define MODGETNAME 9 +#define MODSIZEOF_DEVID 10 +#define MODGETDEVID 11 +#define MODSIZEOF_MINORNAME 12 +#define MODGETMINORNAME 13 +#define MODGETPATHLEN 14 +#define MODEVENTS 15 +#define MODGETFBNAME 16 +#define MODREREADDACF 17 +#define MODLOADDRVCONF 18 +#define MODUNLOADDRVCONF 19 +#define MODREMMAJBIND 20 +#define MODDEVT2INSTANCE 21 +#define MODGETDEVFSPATH_LEN 22 +#define MODGETDEVFSPATH 23 +#define MODDEVID2PATHS 24 +#define MODSETDEVPOLICY 26 +#define MODGETDEVPOLICY 27 +#define MODALLOCPRIV 28 +#define MODGETDEVPOLICYBYNAME 29 +#define MODLOADMINORPERM 31 +#define MODADDMINORPERM 32 +#define MODREMMINORPERM 33 +#define MODREMDRVCLEANUP 34 +#define MODDEVEXISTS 35 +#define MODDEVREADDIR 36 +#define MODDEVNAME 37 +#define MODGETDEVFSPATH_MI_LEN 38 +#define MODGETDEVFSPATH_MI 39 +#define MODRETIRE 40 +#define MODUNRETIRE 41 +#define MODISRETIRED 42 +#define MODDEVEMPTYDIR 43 +#define MODREMDRVALIAS 44 + +/* + * sub cmds for MODEVENTS + */ +#define MODEVENTS_FLUSH 0 +#define MODEVENTS_FLUSH_DUMP 1 +#define MODEVENTS_SET_DOOR_UPCALL_FILENAME 2 +#define MODEVENTS_GETDATA 3 +#define MODEVENTS_FREEDATA 4 +#define MODEVENTS_POST_EVENT 5 +#define MODEVENTS_REGISTER_EVENT 6 + +/* + * devname subcmds for MODDEVNAME + */ +#define MODDEVNAME_LOOKUPDOOR 0 +#define MODDEVNAME_DEVFSADMNODE 1 +#define MODDEVNAME_NSMAPS 2 +#define MODDEVNAME_PROFILE 3 +#define MODDEVNAME_RECONFIG 4 +#define MODDEVNAME_SYSAVAIL 5 + + +/* + * Data structure passed to modconfig command in kernel to build devfs tree + */ + +struct aliases { + struct aliases *a_next; + char *a_name; + int a_len; +}; + +#define MAXMODCONFNAME 256 + +struct modconfig { + char drvname[MAXMODCONFNAME]; + char drvclass[MAXMODCONFNAME]; + int major; + int flags; + int num_aliases; + struct aliases *ap; +}; + +#if defined(_SYSCALL32) + +struct aliases32 { + caddr32_t a_next; + caddr32_t a_name; + int32_t a_len; +}; + +struct modconfig32 { + char drvname[MAXMODCONFNAME]; + char drvclass[MAXMODCONFNAME]; + int32_t major; + int32_t flags; + int32_t num_aliases; + caddr32_t ap; +}; + +#endif /* _SYSCALL32 */ + +/* flags for modconfig */ +#define MOD_UNBIND_OVERRIDE 0x01 /* fail unbind if in use */ + +/* + * Max module path length + */ +#define MOD_MAXPATH 256 + +/* + * Default search path for modules ADDITIONAL to the directory + * where the kernel components we booted from are. + * + * Most often, this will be "/platform/{platform}/kernel /kernel /usr/kernel", + * but we don't wire it down here. + */ +#define MOD_DEFPATH "/kernel /usr/kernel" + +/* + * Default file name extension for autoloading modules. + */ +#define MOD_DEFEXT "" + +/* + * Parameters for modinfo + */ +#define MODMAXNAMELEN 32 /* max module name length */ +#define MODMAXLINKINFOLEN 32 /* max link info length */ + +/* + * Module specific information. + */ +struct modspecific_info { + char msi_linkinfo[MODMAXLINKINFOLEN]; /* name in linkage struct */ + int msi_p0; /* module specific information */ +}; + +/* + * Structure returned by modctl with MODINFO command. 
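+ */
+
+/*
+ * Illustrative sketch, not part of the original sources: populating the
+ * modconfig/aliases structures defined above. The driver name, alias
+ * string, and field values are hypothetical, and the guard macro keeps
+ * this from ever being compiled.
+ */
+#ifdef	MODCTL_USAGE_EXAMPLE
+static struct aliases example_alias = {
+	.a_next = NULL,			/* end of the alias chain */
+	.a_name = "pci1234,5678",	/* hypothetical alias to bind */
+	.a_len = 12			/* length of a_name */
+};
+
+static struct modconfig example_mc = {
+	.drvname = "exdrv",		/* hypothetical driver name */
+	.drvclass = "",
+	.major = 0,
+	.flags = 0,
+	.num_aliases = 1,
+	.ap = &example_alias		/* head of the aliases list */
+};
+#endif	/* MODCTL_USAGE_EXAMPLE */
+
+/*
+ * The modinfo structure (returned for the MODINFO command) and its limits: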
+ */ +#define MODMAXLINK 10 /* max linkages modinfo can handle */ + +struct modinfo { + int mi_info; /* Flags for info wanted */ + int mi_state; /* Flags for module state */ + int mi_id; /* id of this loaded module */ + int mi_nextid; /* id of next module or -1 */ + caddr_t mi_base; /* virtual addr of text */ + size_t mi_size; /* size of module in bytes */ + int mi_rev; /* loadable modules rev */ + int mi_loadcnt; /* # of times loaded */ + char mi_name[MODMAXNAMELEN]; /* name of module */ + struct modspecific_info mi_msinfo[MODMAXLINK]; + /* mod specific info */ +}; + + +#if defined(_SYSCALL32) + +#define MODMAXNAMELEN32 32 /* max module name length */ +#define MODMAXLINKINFOLEN32 32 /* max link info length */ +#define MODMAXLINK32 10 /* max linkages modinfo can handle */ + +struct modspecific_info32 { + char msi_linkinfo[MODMAXLINKINFOLEN32]; /* name in linkage struct */ + int32_t msi_p0; /* module specific information */ +}; + +struct modinfo32 { + int32_t mi_info; /* Flags for info wanted */ + int32_t mi_state; /* Flags for module state */ + int32_t mi_id; /* id of this loaded module */ + int32_t mi_nextid; /* id of next module or -1 */ + caddr32_t mi_base; /* virtual addr of text */ + uint32_t mi_size; /* size of module in bytes */ + int32_t mi_rev; /* loadable modules rev */ + int32_t mi_loadcnt; /* # of times loaded */ + char mi_name[MODMAXNAMELEN32]; /* name of module */ + struct modspecific_info32 mi_msinfo[MODMAXLINK32]; + /* mod specific info */ +}; + +#endif /* _SYSCALL32 */ + +/* Values for mi_info flags */ +#define MI_INFO_ONE 1 +#define MI_INFO_ALL 2 +#define MI_INFO_CNT 4 +#define MI_INFO_LINKAGE 8 /* used internally to extract modlinkage */ +/* + * MI_INFO_NOBASE indicates caller does not need mi_base. Failure to use this + * flag may lead 32-bit apps to receive an EOVERFLOW error from modctl(MODINFO) + * when used with a 64-bit kernel. + */ +#define MI_INFO_NOBASE 16 + +/* Values for mi_state */ +#define MI_LOADED 1 +#define MI_INSTALLED 2 + +/* + * Macros to vector to the appropriate module specific routine. + */ +#define MODL_INSTALL(MODL, MODLP) \ + (*(MODL)->misc_modops->modm_install)(MODL, MODLP) +#define MODL_REMOVE(MODL, MODLP) \ + (*(MODL)->misc_modops->modm_remove)(MODL, MODLP) +#define MODL_INFO(MODL, MODLP, P0) \ + (*(MODL)->misc_modops->modm_info)(MODL, MODLP, P0) + +/* + * Definitions for stubs + */ +struct mod_stub_info { + uintptr_t mods_func_adr; + struct mod_modinfo *mods_modinfo; + uintptr_t mods_stub_adr; + int (*mods_errfcn)(void); + int mods_flag; /* flags defined below */ +}; + +/* + * Definitions for mods_flag. + */ +#define MODS_WEAK 0x01 /* weak stub (not loaded if called) */ +#define MODS_NOUNLOAD 0x02 /* module not unloadable (no _fini()) */ +#define MODS_INSTALLED 0x10 /* module installed */ + +struct mod_modinfo { + char *modm_module_name; + struct modctl *mp; + struct mod_stub_info modm_stubs[1]; +}; + +struct modctl_list { + struct modctl_list *modl_next; + struct modctl *modl_modp; +}; + +/* + * Structure to manage a loadable module. + * Note: the module (mod_mp) structure's "text" and "text_size" information + * are replicated in the modctl structure so that mod_containing_pc() + * doesn't have to grab any locks (modctls are persistent; modules are not.) 
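+ */
+
+/*
+ * Illustrative sketch, not part of the original sources: how a generic
+ * linkage pointer is vectored through its mod_ops with the MODL_* macros
+ * above. example_install_first() is hypothetical and never compiled.
+ */
+#ifdef	MODCTL_USAGE_EXAMPLE
+static int
+example_install_first(struct modlinkage *mlp)
+{
+	struct modlmisc *ml = (struct modlmisc *)mlp->ml_linkage[0];
+
+	if (ml == NULL)
+		return (EINVAL);
+	/* Expands to (*ml->misc_modops->modm_install)(ml, mlp). */
+	return (MODL_INSTALL(ml, mlp));
+}
+#endif	/* MODCTL_USAGE_EXAMPLE */
+
+/*
+ * The modctl structure itself: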
+ */ +typedef struct modctl { + struct modctl *mod_next; /* &modules based list */ + struct modctl *mod_prev; + int mod_id; + void *mod_mp; + kthread_t *mod_inprogress_thread; + struct mod_modinfo *mod_modinfo; + struct modlinkage *mod_linkage; + char *mod_filename; + char *mod_modname; + + char mod_busy; /* inprogress_thread has locked */ + char mod_want; /* someone waiting for unlock */ + char mod_prim; /* primary module */ + + int mod_ref; /* ref count - from dependent or stub */ + + char mod_loaded; /* module in memory */ + char mod_installed; /* post _init pre _fini */ + char mod_loadflags; + char mod_delay_unload; /* deferred unload */ + + struct modctl_list *mod_requisites; /* mods this one depends on. */ + void *____unused; /* NOTE: reuse (same size) is OK, */ + /* deletion causes mdb.vs.core issues */ + int mod_loadcnt; /* number of times mod was loaded */ + int mod_nenabled; /* # of enabled DTrace probes in mod */ + char *mod_text; + size_t mod_text_size; + + int mod_gencount; /* # times loaded/unloaded */ + struct modctl *mod_requisite_loading; /* mod circular dependency */ +} modctl_t; + +/* + * mod_loadflags + */ + +#define MOD_NOAUTOUNLOAD 0x1 /* Auto mod-unloader skips this mod */ +#define MOD_NONOTIFY 0x2 /* No krtld notifications on (un)load */ +#define MOD_NOUNLOAD 0x4 /* Assume EBUSY for all _fini's */ + +#define MOD_BIND_HASHSIZE 64 +#define MOD_BIND_HASHMASK (MOD_BIND_HASHSIZE-1) + +typedef int modid_t; + +/* + * global function and data declarations + */ +extern kmutex_t mod_lock; + +extern char *systemfile; +extern char **syscallnames; +extern int moddebug; + +/* + * this is the head of a doubly linked list. Only the next and prev + * pointers are used + */ +extern modctl_t modules; + +/* + * Only the following are part of the DDI/DKI + */ +extern int mod_install(struct modlinkage *); +extern int mod_remove(struct modlinkage *); +extern int mod_info(struct modlinkage *, struct modinfo *); + +/* + * bit definitions for moddebug. + */ +#define MODDEBUG_LOADMSG 0x80000000 /* print "[un]loading..." 
msg */
+#define	MODDEBUG_ERRMSG	0x40000000	/* print detailed error msgs */
+#define	MODDEBUG_LOADMSG2	0x20000000	/* print 2nd level msgs */
+#define	MODDEBUG_RETIRE	0x10000000	/* print retire msgs */
+#define	MODDEBUG_BINDING	0x00040000	/* driver/alias binding */
+#define	MODDEBUG_FINI_EBUSY	0x00020000	/* pretend fini returns EBUSY */
+#define	MODDEBUG_NOAUL_IPP	0x00010000	/* no Autounloading ipp mods */
+#define	MODDEBUG_NOAUL_DACF	0x00008000	/* no Autounloading dacf mods */
+#define	MODDEBUG_KEEPTEXT	0x00004000	/* keep text after unloading */
+#define	MODDEBUG_NOAUL_DRV	0x00001000	/* no Autounloading Drivers */
+#define	MODDEBUG_NOAUL_EXEC	0x00000800	/* no Autounloading Execs */
+#define	MODDEBUG_NOAUL_FS	0x00000400	/* no Autounloading File sys */
+#define	MODDEBUG_NOAUL_MISC	0x00000200	/* no Autounloading misc */
+#define	MODDEBUG_NOAUL_SCHED	0x00000100	/* no Autounloading scheds */
+#define	MODDEBUG_NOAUL_STR	0x00000080	/* no Autounloading streams */
+#define	MODDEBUG_NOAUL_SYS	0x00000040	/* no Autounloading syscalls */
+#define	MODDEBUG_NOCTF	0x00000020	/* do not load CTF debug data */
+#define	MODDEBUG_NOAUTOUNLOAD	0x00000010	/* no autounloading at all */
+#define	MODDEBUG_DDI_MOD	0x00000008	/* ddi_mod{open,sym,close} */
+#define	MODDEBUG_MP_MATCH	0x00000004	/* dev_minorperm */
+#define	MODDEBUG_MINORPERM	0x00000002	/* minor perm modctls */
+#define	MODDEBUG_USERDEBUG	0x00000001	/* bpt after init_module() */
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif	/* _SYS_MODCTL_H */
diff --git a/module/icp/include/sys/modhash.h b/module/icp/include/sys/modhash.h
new file mode 100644
index 000000000000..06b52ff02604
--- /dev/null
+++ b/module/icp/include/sys/modhash.h
@@ -0,0 +1,147 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODHASH_H
+#define	_SYS_MODHASH_H
+
+/*
+ * Generic hash implementation for the kernel.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+
+/*
+ * Opaque data types for storing keys and values
+ */
+typedef void *mod_hash_val_t;
+typedef void *mod_hash_key_t;
+
+/*
+ * Opaque data type for reservation
+ */
+typedef void *mod_hash_hndl_t;
+
+/*
+ * Opaque type for hash itself.
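+ */
+
+/*
+ * Illustrative sketch, not part of the original sources: typical use of
+ * the string-keyed constructors and basic operations declared below. The
+ * _nodtr variant installs no key destructor, so string literals are safe
+ * as keys here. example_strhash_use() is hypothetical and the guard macro
+ * keeps it from ever being compiled.
+ */
+#ifdef	MODHASH_USAGE_EXAMPLE
+static void
+example_strhash_use(void)
+{
+	mod_hash_t *h;
+	mod_hash_val_t val;
+
+	/* 64 chains; stored values need no destructor here either. */
+	h = mod_hash_create_strhash_nodtr("example hash", 64,
+	    mod_hash_null_valdtor);
+
+	(void) mod_hash_insert(h, (mod_hash_key_t)"key",
+	    (mod_hash_val_t)"value");
+	if (mod_hash_find(h, (mod_hash_key_t)"key", &val) == 0) {
+		/* val now holds the stored value */
+	}
+	(void) mod_hash_destroy(h, (mod_hash_key_t)"key");
+
+	mod_hash_destroy_strhash(h);
+}
+#endif	/* MODHASH_USAGE_EXAMPLE */
+
+/*
+ * The opaque hash type and the operations on it: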
+ */
+struct mod_hash;
+typedef struct mod_hash mod_hash_t;
+
+/*
+ * String hash table
+ */
+mod_hash_t *mod_hash_create_strhash_nodtr(char *, size_t,
+    void (*)(mod_hash_val_t));
+mod_hash_t *mod_hash_create_strhash(char *, size_t, void (*)(mod_hash_val_t));
+void mod_hash_destroy_strhash(mod_hash_t *);
+int mod_hash_strkey_cmp(mod_hash_key_t, mod_hash_key_t);
+void mod_hash_strkey_dtor(mod_hash_key_t);
+void mod_hash_strval_dtor(mod_hash_val_t);
+uint_t mod_hash_bystr(void *, mod_hash_key_t);
+
+/*
+ * Pointer hash table
+ */
+mod_hash_t *mod_hash_create_ptrhash(char *, size_t, void (*)(mod_hash_val_t),
+    size_t);
+void mod_hash_destroy_ptrhash(mod_hash_t *);
+int mod_hash_ptrkey_cmp(mod_hash_key_t, mod_hash_key_t);
+uint_t mod_hash_byptr(void *, mod_hash_key_t);
+
+/*
+ * ID hash table
+ */
+mod_hash_t *mod_hash_create_idhash(char *, size_t, void (*)(mod_hash_val_t));
+void mod_hash_destroy_idhash(mod_hash_t *);
+int mod_hash_idkey_cmp(mod_hash_key_t, mod_hash_key_t);
+uint_t mod_hash_byid(void *, mod_hash_key_t);
+uint_t mod_hash_iddata_gen(size_t);
+
+/*
+ * Hash management functions
+ */
+mod_hash_t *mod_hash_create_extended(char *, size_t, void (*)(mod_hash_key_t),
+    void (*)(mod_hash_val_t), uint_t (*)(void *, mod_hash_key_t), void *,
+    int (*)(mod_hash_key_t, mod_hash_key_t), int);
+
+void mod_hash_destroy_hash(mod_hash_t *);
+void mod_hash_clear(mod_hash_t *);
+
+/*
+ * Null key and value destructors
+ */
+void mod_hash_null_keydtor(mod_hash_key_t);
+void mod_hash_null_valdtor(mod_hash_val_t);
+
+/*
+ * Basic hash operations
+ */
+
+/*
+ * Error codes for insert, remove, find, destroy.
+ */
+#define	MH_ERR_NOMEM	-1
+#define	MH_ERR_DUPLICATE	-2
+#define	MH_ERR_NOTFOUND	-3
+
+/*
+ * Return codes for hash walkers
+ */
+#define	MH_WALK_CONTINUE	0
+#define	MH_WALK_TERMINATE	1
+
+/*
+ * Basic hash operations
+ */
+int mod_hash_insert(mod_hash_t *, mod_hash_key_t, mod_hash_val_t);
+int mod_hash_replace(mod_hash_t *, mod_hash_key_t, mod_hash_val_t);
+int mod_hash_remove(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int mod_hash_destroy(mod_hash_t *, mod_hash_key_t);
+int mod_hash_find(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int mod_hash_find_cb(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+    void (*)(mod_hash_key_t, mod_hash_val_t));
+int mod_hash_find_cb_rval(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *,
+    int (*)(mod_hash_key_t, mod_hash_val_t), int *);
+void mod_hash_walk(mod_hash_t *,
+    uint_t (*)(mod_hash_key_t, mod_hash_val_t *, void *), void *);
+
+/*
+ * Reserving hash operations
+ */
+int mod_hash_reserve(mod_hash_t *, mod_hash_hndl_t *);
+int mod_hash_reserve_nosleep(mod_hash_t *, mod_hash_hndl_t *);
+void mod_hash_cancel(mod_hash_t *, mod_hash_hndl_t *);
+int mod_hash_insert_reserve(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+    mod_hash_hndl_t);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODHASH_H */
diff --git a/module/icp/include/sys/modhash_impl.h b/module/icp/include/sys/modhash_impl.h
new file mode 100644
index 000000000000..3130773aa196
--- /dev/null
+++ b/module/icp/include/sys/modhash_impl.h
@@ -0,0 +1,108 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_MODHASH_IMPL_H
+#define	_SYS_MODHASH_IMPL_H
+
+/*
+ * Internal details for the kernel's generic hash implementation.
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <sys/zfs_context.h>
+#include <sys/modhash.h>
+
+struct mod_hash_entry {
+	mod_hash_key_t mhe_key;	/* stored hash key */
+	mod_hash_val_t mhe_val;	/* stored hash value */
+	struct mod_hash_entry *mhe_next;	/* next item in chain */
+};
+
+struct mod_hash_stat {
+	ulong_t mhs_hit;	/* tried a 'find' and it succeeded */
+	ulong_t mhs_miss;	/* tried a 'find' but it failed */
+	ulong_t mhs_coll;	/* occur when insert fails because of dup's */
+	ulong_t mhs_nelems;	/* total number of stored key/value pairs */
+	ulong_t mhs_nomem;	/* number of times kmem_alloc failed */
+};
+
+struct mod_hash {
+	krwlock_t	mh_contents;	/* lock protecting contents */
+	char	*mh_name;	/* hash name */
+	int	mh_sleep;	/* kmem_alloc flag */
+	size_t	mh_nchains;	/* # of elements in mh_entries */
+
+	/* key and val destructor */
+	void	(*mh_kdtor)(mod_hash_key_t);
+	void	(*mh_vdtor)(mod_hash_val_t);
+
+	/* key comparator */
+	int	(*mh_keycmp)(mod_hash_key_t, mod_hash_key_t);
+
+	/* hash algorithm, and algorithm-private data */
+	uint_t	(*mh_hashalg)(void *, mod_hash_key_t);
+	void	*mh_hashalg_data;
+
+	struct mod_hash	*mh_next;	/* next hash in list */
+
+	struct mod_hash_stat mh_stat;
+
+	struct mod_hash_entry *mh_entries[1];
+};
+
+/*
+ * MH_SIZE()
+ *	Compute the size of a mod_hash_t, in bytes, given the number of
+ *	elements it contains.
+ */
+#define	MH_SIZE(n) \
+	(sizeof (mod_hash_t) + ((n) - 1) * (sizeof (struct mod_hash_entry *)))
+
+/*
+ * Module initialization; called once.
+ */
+void mod_hash_fini(void);
+void mod_hash_init(void);
+
+/*
+ * Internal routines. Use directly with care.
+ */
+uint_t i_mod_hash(mod_hash_t *, mod_hash_key_t);
+int i_mod_hash_insert_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t,
+    mod_hash_hndl_t);
+int i_mod_hash_remove_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+int i_mod_hash_find_nosync(mod_hash_t *, mod_hash_key_t, mod_hash_val_t *);
+void i_mod_hash_walk_nosync(mod_hash_t *, uint_t (*)(mod_hash_key_t,
+    mod_hash_val_t *, void *), void *);
+void i_mod_hash_clear_nosync(mod_hash_t *hash);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* _SYS_MODHASH_IMPL_H */
diff --git a/module/icp/include/sys/stack.h b/module/icp/include/sys/stack.h
new file mode 100644
index 000000000000..64fecf409b5c
--- /dev/null
+++ b/module/icp/include/sys/stack.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_STACK_H
+#define	_SYS_STACK_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/stack.h>	/* XX64	x86/sys/stack.h */
+
+#endif
+
+#endif /* _SYS_STACK_H */
diff --git a/module/icp/include/sys/trap.h b/module/icp/include/sys/trap.h
new file mode 100644
index 000000000000..7f9fd375805f
--- /dev/null
+++ b/module/icp/include/sys/trap.h
@@ -0,0 +1,36 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License, Version 1.0 only
+ * (the "License"). You may not use this file except in compliance
+ * with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2005 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#ifndef _SYS_TRAP_H
+#define	_SYS_TRAP_H
+
+#if defined(__i386) || defined(__amd64)
+
+#include <sys/ia32/trap.h>	/* XX64	x86/sys/trap.h */
+
+#endif
+
+#endif /* _SYS_TRAP_H */
diff --git a/module/icp/io/aes.c b/module/icp/io/aes.c
new file mode 100644
index 000000000000..96fb6bb1af30
--- /dev/null
+++ b/module/icp/io/aes.c
@@ -0,0 +1,1439 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ */
+
+/*
+ * AES provider for the Kernel Cryptographic Framework (KCF)
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/crypto/common.h>
+#include <sys/crypto/impl.h>
+#include <sys/crypto/spi.h>
+#include <sys/crypto/icp.h>
+#include <modes/modes.h>
+#include <sys/modctl.h>
+#define	_AES_IMPL
+#include <aes/aes_impl.h>
+#include <modes/gcm.h>
+
+#define	CRYPTO_PROVIDER_NAME "aes"
+
+extern struct mod_ops mod_cryptoops;
+
+/*
+ * Module linkage information for the kernel.
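+ * (The modlcrypto/modlinkage pair below is what aes_mod_init() hands to
+ * mod_install(); see sys/modctl.h earlier in this import.)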
+ */ +static struct modlcrypto modlcrypto = { + &mod_cryptoops, + "AES Kernel SW Provider" +}; + +static struct modlinkage modlinkage = { + MODREV_1, { (void *)&modlcrypto, NULL } +}; + +/* + * Mechanism info structure passed to KCF during registration. + */ +static crypto_mech_info_t aes_mech_info_tab[] = { + /* AES_ECB */ + {SUN_CKM_AES_ECB, AES_ECB_MECH_INFO_TYPE, + CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | + CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, + AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* AES_CBC */ + {SUN_CKM_AES_CBC, AES_CBC_MECH_INFO_TYPE, + CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | + CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, + AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* AES_CTR */ + {SUN_CKM_AES_CTR, AES_CTR_MECH_INFO_TYPE, + CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | + CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, + AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* AES_CCM */ + {SUN_CKM_AES_CCM, AES_CCM_MECH_INFO_TYPE, + CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | + CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, + AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* AES_GCM */ + {SUN_CKM_AES_GCM, AES_GCM_MECH_INFO_TYPE, + CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | + CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC, + AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* AES_GMAC */ + {SUN_CKM_AES_GMAC, AES_GMAC_MECH_INFO_TYPE, + CRYPTO_FG_ENCRYPT | CRYPTO_FG_ENCRYPT_ATOMIC | + CRYPTO_FG_DECRYPT | CRYPTO_FG_DECRYPT_ATOMIC | + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC | + CRYPTO_FG_SIGN | CRYPTO_FG_SIGN_ATOMIC | + CRYPTO_FG_VERIFY | CRYPTO_FG_VERIFY_ATOMIC, + AES_MIN_KEY_BYTES, AES_MAX_KEY_BYTES, CRYPTO_KEYSIZE_UNIT_IN_BYTES} +}; + +static void aes_provider_status(crypto_provider_handle_t, uint_t *); + +static crypto_control_ops_t aes_control_ops = { + aes_provider_status +}; + +static int aes_encrypt_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); +static int aes_decrypt_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); +static int aes_common_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_key_t *, crypto_spi_ctx_template_t, crypto_req_handle_t, boolean_t); +static int aes_common_init_ctx(aes_ctx_t *, crypto_spi_ctx_template_t *, + crypto_mechanism_t *, crypto_key_t *, int, boolean_t); +static int aes_encrypt_final(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int aes_decrypt_final(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); + +static int aes_encrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); +static int aes_encrypt_update(crypto_ctx_t *, crypto_data_t *, + crypto_data_t *, crypto_req_handle_t); +static int aes_encrypt_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); + +static int aes_decrypt(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); +static int aes_decrypt_update(crypto_ctx_t *, crypto_data_t *, + crypto_data_t *, crypto_req_handle_t); +static int aes_decrypt_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, + crypto_data_t *, crypto_spi_ctx_template_t, crypto_req_handle_t); + +static crypto_cipher_ops_t aes_cipher_ops = { + 
.encrypt_init = aes_encrypt_init, + .encrypt = aes_encrypt, + .encrypt_update = aes_encrypt_update, + .encrypt_final = aes_encrypt_final, + .encrypt_atomic = aes_encrypt_atomic, + .decrypt_init = aes_decrypt_init, + .decrypt = aes_decrypt, + .decrypt_update = aes_decrypt_update, + .decrypt_final = aes_decrypt_final, + .decrypt_atomic = aes_decrypt_atomic +}; + +static int aes_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int aes_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); + +static crypto_mac_ops_t aes_mac_ops = { + .mac_init = NULL, + .mac = NULL, + .mac_update = NULL, + .mac_final = NULL, + .mac_atomic = aes_mac_atomic, + .mac_verify_atomic = aes_mac_verify_atomic +}; + +static int aes_create_ctx_template(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, + size_t *, crypto_req_handle_t); +static int aes_free_context(crypto_ctx_t *); + +static crypto_ctx_ops_t aes_ctx_ops = { + .create_ctx_template = aes_create_ctx_template, + .free_context = aes_free_context +}; + +static crypto_ops_t aes_crypto_ops = {{{{{ + &aes_control_ops, + NULL, + &aes_cipher_ops, + &aes_mac_ops, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &aes_ctx_ops +}}}}}; + +static crypto_provider_info_t aes_prov_info = {{{{ + CRYPTO_SPI_VERSION_1, + "AES Software Provider", + CRYPTO_SW_PROVIDER, + NULL, + &aes_crypto_ops, + sizeof (aes_mech_info_tab)/sizeof (crypto_mech_info_t), + aes_mech_info_tab +}}}}; + +static crypto_kcf_provider_handle_t aes_prov_handle = 0; +static crypto_data_t null_crypto_data = { CRYPTO_DATA_RAW }; + +int +aes_mod_init(void) +{ + int ret; + + /* Determine the fastest available implementation. */ + aes_impl_init(); + gcm_impl_init(); + + if ((ret = mod_install(&modlinkage)) != 0) + return (ret); + + /* Register with KCF. If the registration fails, remove the module. 
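+ * The mod_remove() undoes the mod_install() that just succeeded, so a
+ * failed registration does not leave the module loaded.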
*/ + if (crypto_register_provider(&aes_prov_info, &aes_prov_handle)) { + (void) mod_remove(&modlinkage); + return (EACCES); + } + + return (0); +} + +int +aes_mod_fini(void) +{ + /* Unregister from KCF if module is registered */ + if (aes_prov_handle != 0) { + if (crypto_unregister_provider(aes_prov_handle)) + return (EBUSY); + + aes_prov_handle = 0; + } + + return (mod_remove(&modlinkage)); +} + +static int +aes_check_mech_param(crypto_mechanism_t *mechanism, aes_ctx_t **ctx, int kmflag) +{ + void *p = NULL; + boolean_t param_required = B_TRUE; + size_t param_len; + void *(*alloc_fun)(int); + int rv = CRYPTO_SUCCESS; + + switch (mechanism->cm_type) { + case AES_ECB_MECH_INFO_TYPE: + param_required = B_FALSE; + alloc_fun = ecb_alloc_ctx; + break; + case AES_CBC_MECH_INFO_TYPE: + param_len = AES_BLOCK_LEN; + alloc_fun = cbc_alloc_ctx; + break; + case AES_CTR_MECH_INFO_TYPE: + param_len = sizeof (CK_AES_CTR_PARAMS); + alloc_fun = ctr_alloc_ctx; + break; + case AES_CCM_MECH_INFO_TYPE: + param_len = sizeof (CK_AES_CCM_PARAMS); + alloc_fun = ccm_alloc_ctx; + break; + case AES_GCM_MECH_INFO_TYPE: + param_len = sizeof (CK_AES_GCM_PARAMS); + alloc_fun = gcm_alloc_ctx; + break; + case AES_GMAC_MECH_INFO_TYPE: + param_len = sizeof (CK_AES_GMAC_PARAMS); + alloc_fun = gmac_alloc_ctx; + break; + default: + rv = CRYPTO_MECHANISM_INVALID; + return (rv); + } + if (param_required && mechanism->cm_param != NULL && + mechanism->cm_param_len != param_len) { + rv = CRYPTO_MECHANISM_PARAM_INVALID; + } + if (ctx != NULL) { + p = (alloc_fun)(kmflag); + *ctx = p; + } + return (rv); +} + +/* + * Initialize key schedules for AES + */ +static int +init_keysched(crypto_key_t *key, void *newbie) +{ + /* + * Only keys by value are supported by this module. + */ + switch (key->ck_format) { + case CRYPTO_KEY_RAW: + if (key->ck_length < AES_MINBITS || + key->ck_length > AES_MAXBITS) { + return (CRYPTO_KEY_SIZE_RANGE); + } + + /* key length must be either 128, 192, or 256 */ + if ((key->ck_length & 63) != 0) + return (CRYPTO_KEY_SIZE_RANGE); + break; + default: + return (CRYPTO_KEY_TYPE_INCONSISTENT); + } + + aes_init_keysched(key->ck_data, key->ck_length, newbie); + return (CRYPTO_SUCCESS); +} + +/* + * KCF software provider control entry points. + */ +/* ARGSUSED */ +static void +aes_provider_status(crypto_provider_handle_t provider, uint_t *status) +{ + *status = CRYPTO_PROVIDER_READY; +} + +static int +aes_encrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_spi_ctx_template_t template, + crypto_req_handle_t req) +{ + return (aes_common_init(ctx, mechanism, key, template, req, B_TRUE)); +} + +static int +aes_decrypt_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_spi_ctx_template_t template, + crypto_req_handle_t req) +{ + return (aes_common_init(ctx, mechanism, key, template, req, B_FALSE)); +} + + + +/* + * KCF software provider encrypt entry points. + */ +static int +aes_common_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_spi_ctx_template_t template, + crypto_req_handle_t req, boolean_t is_encrypt_init) +{ + aes_ctx_t *aes_ctx; + int rv; + int kmflag; + + /* + * Only keys by value are supported by this module. 
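+ * That is, ck_format must be CRYPTO_KEY_RAW; any other key format is
+ * rejected with CRYPTO_KEY_TYPE_INCONSISTENT.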
+ */ + if (key->ck_format != CRYPTO_KEY_RAW) { + return (CRYPTO_KEY_TYPE_INCONSISTENT); + } + + kmflag = crypto_kmflag(req); + if ((rv = aes_check_mech_param(mechanism, &aes_ctx, kmflag)) + != CRYPTO_SUCCESS) + return (rv); + + rv = aes_common_init_ctx(aes_ctx, template, mechanism, key, kmflag, + is_encrypt_init); + if (rv != CRYPTO_SUCCESS) { + crypto_free_mode_ctx(aes_ctx); + return (rv); + } + + ctx->cc_provider_private = aes_ctx; + + return (CRYPTO_SUCCESS); +} + +static void +aes_copy_block64(uint8_t *in, uint64_t *out) +{ + if (IS_P2ALIGNED(in, sizeof (uint64_t))) { + /* LINTED: pointer alignment */ + out[0] = *(uint64_t *)&in[0]; + /* LINTED: pointer alignment */ + out[1] = *(uint64_t *)&in[8]; + } else { + uint8_t *iv8 = (uint8_t *)&out[0]; + + AES_COPY_BLOCK(in, iv8); + } +} + + +static int +aes_encrypt(crypto_ctx_t *ctx, crypto_data_t *plaintext, + crypto_data_t *ciphertext, crypto_req_handle_t req) +{ + int ret = CRYPTO_FAILED; + + aes_ctx_t *aes_ctx; + size_t saved_length, saved_offset, length_needed; + + ASSERT(ctx->cc_provider_private != NULL); + aes_ctx = ctx->cc_provider_private; + + /* + * For block ciphers, plaintext must be a multiple of AES block size. + * This test is only valid for ciphers whose blocksize is a power of 2. + */ + if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) + == 0) && (plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) + return (CRYPTO_DATA_LEN_RANGE); + + ASSERT(ciphertext != NULL); + + /* + * We need to just return the length needed to store the output. + * We should not destroy the context for the following case. + */ + switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) { + case CCM_MODE: + length_needed = plaintext->cd_length + aes_ctx->ac_mac_len; + break; + case GCM_MODE: + length_needed = plaintext->cd_length + aes_ctx->ac_tag_len; + break; + case GMAC_MODE: + if (plaintext->cd_length != 0) + return (CRYPTO_ARGUMENTS_BAD); + + length_needed = aes_ctx->ac_tag_len; + break; + default: + length_needed = plaintext->cd_length; + } + + if (ciphertext->cd_length < length_needed) { + ciphertext->cd_length = length_needed; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + saved_length = ciphertext->cd_length; + saved_offset = ciphertext->cd_offset; + + /* + * Do an update on the specified input data. + */ + ret = aes_encrypt_update(ctx, plaintext, ciphertext, req); + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + + /* + * For CCM mode, aes_ccm_encrypt_final() will take care of any + * left-over unprocessed data, and compute the MAC + */ + if (aes_ctx->ac_flags & CCM_MODE) { + /* + * ccm_encrypt_final() will compute the MAC and append + * it to existing ciphertext. So, need to adjust the left over + * length value accordingly + */ + + /* order of following 2 lines MUST not be reversed */ + ciphertext->cd_offset = ciphertext->cd_length; + ciphertext->cd_length = saved_length - ciphertext->cd_length; + ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, ciphertext, + AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + + if (plaintext != ciphertext) { + ciphertext->cd_length = + ciphertext->cd_offset - saved_offset; + } + ciphertext->cd_offset = saved_offset; + } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + /* + * gcm_encrypt_final() will compute the MAC and append + * it to existing ciphertext. 
So, need to adjust the left over + * length value accordingly + */ + + /* order of following 2 lines MUST not be reversed */ + ciphertext->cd_offset = ciphertext->cd_length; + ciphertext->cd_length = saved_length - ciphertext->cd_length; + ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, ciphertext, + AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + + if (plaintext != ciphertext) { + ciphertext->cd_length = + ciphertext->cd_offset - saved_offset; + } + ciphertext->cd_offset = saved_offset; + } + + ASSERT(aes_ctx->ac_remainder_len == 0); + (void) aes_free_context(ctx); + + return (ret); +} + + +static int +aes_decrypt(crypto_ctx_t *ctx, crypto_data_t *ciphertext, + crypto_data_t *plaintext, crypto_req_handle_t req) +{ + int ret = CRYPTO_FAILED; + + aes_ctx_t *aes_ctx; + off_t saved_offset; + size_t saved_length, length_needed; + + ASSERT(ctx->cc_provider_private != NULL); + aes_ctx = ctx->cc_provider_private; + + /* + * For block ciphers, plaintext must be a multiple of AES block size. + * This test is only valid for ciphers whose blocksize is a power of 2. + */ + if (((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) + == 0) && (ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) { + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + } + + ASSERT(plaintext != NULL); + + /* + * Return length needed to store the output. + * Do not destroy context when plaintext buffer is too small. + * + * CCM: plaintext is MAC len smaller than cipher text + * GCM: plaintext is TAG len smaller than cipher text + * GMAC: plaintext length must be zero + */ + switch (aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) { + case CCM_MODE: + length_needed = aes_ctx->ac_processed_data_len; + break; + case GCM_MODE: + length_needed = ciphertext->cd_length - aes_ctx->ac_tag_len; + break; + case GMAC_MODE: + if (plaintext->cd_length != 0) + return (CRYPTO_ARGUMENTS_BAD); + + length_needed = 0; + break; + default: + length_needed = ciphertext->cd_length; + } + + if (plaintext->cd_length < length_needed) { + plaintext->cd_length = length_needed; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + saved_offset = plaintext->cd_offset; + saved_length = plaintext->cd_length; + + /* + * Do an update on the specified input data. 
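+ * (aes_decrypt_update() consumes all full blocks; for CCM/GCM/GMAC the
+ * plaintext is only released by the final call once the tag is verified.)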
+ */ + ret = aes_decrypt_update(ctx, ciphertext, plaintext, req); + if (ret != CRYPTO_SUCCESS) { + goto cleanup; + } + + if (aes_ctx->ac_flags & CCM_MODE) { + ASSERT(aes_ctx->ac_processed_data_len == aes_ctx->ac_data_len); + ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len); + + /* order of following 2 lines MUST not be reversed */ + plaintext->cd_offset = plaintext->cd_length; + plaintext->cd_length = saved_length - plaintext->cd_length; + + ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, plaintext, + AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + if (ret == CRYPTO_SUCCESS) { + if (plaintext != ciphertext) { + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } + } else { + plaintext->cd_length = saved_length; + } + + plaintext->cd_offset = saved_offset; + } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + /* order of following 2 lines MUST not be reversed */ + plaintext->cd_offset = plaintext->cd_length; + plaintext->cd_length = saved_length - plaintext->cd_length; + + ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, plaintext, + AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + if (ret == CRYPTO_SUCCESS) { + if (plaintext != ciphertext) { + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } + } else { + plaintext->cd_length = saved_length; + } + + plaintext->cd_offset = saved_offset; + } + + ASSERT(aes_ctx->ac_remainder_len == 0); + +cleanup: + (void) aes_free_context(ctx); + + return (ret); +} + + +/* ARGSUSED */ +static int +aes_encrypt_update(crypto_ctx_t *ctx, crypto_data_t *plaintext, + crypto_data_t *ciphertext, crypto_req_handle_t req) +{ + off_t saved_offset; + size_t saved_length, out_len; + int ret = CRYPTO_SUCCESS; + aes_ctx_t *aes_ctx; + + ASSERT(ctx->cc_provider_private != NULL); + aes_ctx = ctx->cc_provider_private; + + ASSERT(ciphertext != NULL); + + /* compute number of bytes that will hold the ciphertext */ + out_len = aes_ctx->ac_remainder_len; + out_len += plaintext->cd_length; + out_len &= ~(AES_BLOCK_LEN - 1); + + /* return length needed to store the output */ + if (ciphertext->cd_length < out_len) { + ciphertext->cd_length = out_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + saved_offset = ciphertext->cd_offset; + saved_length = ciphertext->cd_length; + + /* + * Do the AES update on the specified input data. + */ + switch (plaintext->cd_format) { + case CRYPTO_DATA_RAW: + ret = crypto_update_iov(ctx->cc_provider_private, + plaintext, ciphertext, aes_encrypt_contiguous_blocks, + aes_copy_block64); + break; + case CRYPTO_DATA_UIO: + ret = crypto_update_uio(ctx->cc_provider_private, + plaintext, ciphertext, aes_encrypt_contiguous_blocks, + aes_copy_block64); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* + * Since AES counter mode is a stream cipher, we call + * ctr_mode_final() to pick up any remaining bytes. + * It is an internal function that does not destroy + * the context like *normal* final routines. 
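+ * (Only the partial block counted by ac_remainder_len is flushed here.)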
+ */ + if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) { + ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, + ciphertext, aes_encrypt_block); + } + + if (ret == CRYPTO_SUCCESS) { + if (plaintext != ciphertext) + ciphertext->cd_length = + ciphertext->cd_offset - saved_offset; + } else { + ciphertext->cd_length = saved_length; + } + ciphertext->cd_offset = saved_offset; + + return (ret); +} + + +static int +aes_decrypt_update(crypto_ctx_t *ctx, crypto_data_t *ciphertext, + crypto_data_t *plaintext, crypto_req_handle_t req) +{ + off_t saved_offset; + size_t saved_length, out_len; + int ret = CRYPTO_SUCCESS; + aes_ctx_t *aes_ctx; + + ASSERT(ctx->cc_provider_private != NULL); + aes_ctx = ctx->cc_provider_private; + + ASSERT(plaintext != NULL); + + /* + * Compute number of bytes that will hold the plaintext. + * This is not necessary for CCM, GCM, and GMAC since these + * mechanisms never return plaintext for update operations. + */ + if ((aes_ctx->ac_flags & (CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) { + out_len = aes_ctx->ac_remainder_len; + out_len += ciphertext->cd_length; + out_len &= ~(AES_BLOCK_LEN - 1); + + /* return length needed to store the output */ + if (plaintext->cd_length < out_len) { + plaintext->cd_length = out_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + } + + saved_offset = plaintext->cd_offset; + saved_length = plaintext->cd_length; + + if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) + gcm_set_kmflag((gcm_ctx_t *)aes_ctx, crypto_kmflag(req)); + + /* + * Do the AES update on the specified input data. + */ + switch (ciphertext->cd_format) { + case CRYPTO_DATA_RAW: + ret = crypto_update_iov(ctx->cc_provider_private, + ciphertext, plaintext, aes_decrypt_contiguous_blocks, + aes_copy_block64); + break; + case CRYPTO_DATA_UIO: + ret = crypto_update_uio(ctx->cc_provider_private, + ciphertext, plaintext, aes_decrypt_contiguous_blocks, + aes_copy_block64); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* + * Since AES counter mode is a stream cipher, we call + * ctr_mode_final() to pick up any remaining bytes. + * It is an internal function that does not destroy + * the context like *normal* final routines. 
+ */ + if ((aes_ctx->ac_flags & CTR_MODE) && (aes_ctx->ac_remainder_len > 0)) { + ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, plaintext, + aes_encrypt_block); + if (ret == CRYPTO_DATA_LEN_RANGE) + ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; + } + + if (ret == CRYPTO_SUCCESS) { + if (ciphertext != plaintext) + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } else { + plaintext->cd_length = saved_length; + } + plaintext->cd_offset = saved_offset; + + + return (ret); +} + +/* ARGSUSED */ +static int +aes_encrypt_final(crypto_ctx_t *ctx, crypto_data_t *data, + crypto_req_handle_t req) +{ + aes_ctx_t *aes_ctx; + int ret; + + ASSERT(ctx->cc_provider_private != NULL); + aes_ctx = ctx->cc_provider_private; + + if (data->cd_format != CRYPTO_DATA_RAW && + data->cd_format != CRYPTO_DATA_UIO) { + return (CRYPTO_ARGUMENTS_BAD); + } + + if (aes_ctx->ac_flags & CTR_MODE) { + if (aes_ctx->ac_remainder_len > 0) { + ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data, + aes_encrypt_block); + if (ret != CRYPTO_SUCCESS) + return (ret); + } + } else if (aes_ctx->ac_flags & CCM_MODE) { + ret = ccm_encrypt_final((ccm_ctx_t *)aes_ctx, data, + AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + size_t saved_offset = data->cd_offset; + + ret = gcm_encrypt_final((gcm_ctx_t *)aes_ctx, data, + AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + data->cd_length = data->cd_offset - saved_offset; + data->cd_offset = saved_offset; + } else { + /* + * There must be no unprocessed plaintext. + * This happens if the length of the last data is + * not a multiple of the AES block length. + */ + if (aes_ctx->ac_remainder_len > 0) { + return (CRYPTO_DATA_LEN_RANGE); + } + data->cd_length = 0; + } + + (void) aes_free_context(ctx); + + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +static int +aes_decrypt_final(crypto_ctx_t *ctx, crypto_data_t *data, + crypto_req_handle_t req) +{ + aes_ctx_t *aes_ctx; + int ret; + off_t saved_offset; + size_t saved_length; + + ASSERT(ctx->cc_provider_private != NULL); + aes_ctx = ctx->cc_provider_private; + + if (data->cd_format != CRYPTO_DATA_RAW && + data->cd_format != CRYPTO_DATA_UIO) { + return (CRYPTO_ARGUMENTS_BAD); + } + + /* + * There must be no unprocessed ciphertext. + * This happens if the length of the last ciphertext is + * not a multiple of the AES block length. 
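+ * (CTR is the exception: any leftover bytes are flushed through
+ * ctr_mode_final() below instead of being treated as an error.)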
+ */ + if (aes_ctx->ac_remainder_len > 0) { + if ((aes_ctx->ac_flags & CTR_MODE) == 0) + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + else { + ret = ctr_mode_final((ctr_ctx_t *)aes_ctx, data, + aes_encrypt_block); + if (ret == CRYPTO_DATA_LEN_RANGE) + ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; + if (ret != CRYPTO_SUCCESS) + return (ret); + } + } + + if (aes_ctx->ac_flags & CCM_MODE) { + /* + * This is where all the plaintext is returned, make sure + * the plaintext buffer is big enough + */ + size_t pt_len = aes_ctx->ac_data_len; + if (data->cd_length < pt_len) { + data->cd_length = pt_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + ASSERT(aes_ctx->ac_processed_data_len == pt_len); + ASSERT(aes_ctx->ac_processed_mac_len == aes_ctx->ac_mac_len); + saved_offset = data->cd_offset; + saved_length = data->cd_length; + ret = ccm_decrypt_final((ccm_ctx_t *)aes_ctx, data, + AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + if (ret == CRYPTO_SUCCESS) { + data->cd_length = data->cd_offset - saved_offset; + } else { + data->cd_length = saved_length; + } + + data->cd_offset = saved_offset; + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + } else if (aes_ctx->ac_flags & (GCM_MODE|GMAC_MODE)) { + /* + * This is where all the plaintext is returned, make sure + * the plaintext buffer is big enough + */ + gcm_ctx_t *ctx = (gcm_ctx_t *)aes_ctx; + size_t pt_len = ctx->gcm_processed_data_len - ctx->gcm_tag_len; + + if (data->cd_length < pt_len) { + data->cd_length = pt_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + saved_offset = data->cd_offset; + saved_length = data->cd_length; + ret = gcm_decrypt_final((gcm_ctx_t *)aes_ctx, data, + AES_BLOCK_LEN, aes_encrypt_block, aes_xor_block); + if (ret == CRYPTO_SUCCESS) { + data->cd_length = data->cd_offset - saved_offset; + } else { + data->cd_length = saved_length; + } + + data->cd_offset = saved_offset; + if (ret != CRYPTO_SUCCESS) { + return (ret); + } + } + + + if ((aes_ctx->ac_flags & (CTR_MODE|CCM_MODE|GCM_MODE|GMAC_MODE)) == 0) { + data->cd_length = 0; + } + + (void) aes_free_context(ctx); + + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +static int +aes_encrypt_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *plaintext, crypto_data_t *ciphertext, + crypto_spi_ctx_template_t template, crypto_req_handle_t req) +{ + aes_ctx_t aes_ctx; /* on the stack */ + off_t saved_offset; + size_t saved_length; + size_t length_needed; + int ret; + + ASSERT(ciphertext != NULL); + + /* + * CTR, CCM, GCM, and GMAC modes do not require that plaintext + * be a multiple of AES block size. 
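+ * (CTR is a stream cipher and the AEAD modes accept arbitrary-length
+ * data, so only ECB and CBC enforce the block-multiple check below.)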
+ */ + switch (mechanism->cm_type) { + case AES_CTR_MECH_INFO_TYPE: + case AES_CCM_MECH_INFO_TYPE: + case AES_GCM_MECH_INFO_TYPE: + case AES_GMAC_MECH_INFO_TYPE: + break; + default: + if ((plaintext->cd_length & (AES_BLOCK_LEN - 1)) != 0) + return (CRYPTO_DATA_LEN_RANGE); + } + + if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS) + return (ret); + + bzero(&aes_ctx, sizeof (aes_ctx_t)); + + ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key, + crypto_kmflag(req), B_TRUE); + if (ret != CRYPTO_SUCCESS) + return (ret); + + switch (mechanism->cm_type) { + case AES_CCM_MECH_INFO_TYPE: + length_needed = plaintext->cd_length + aes_ctx.ac_mac_len; + break; + case AES_GMAC_MECH_INFO_TYPE: + if (plaintext->cd_length != 0) + return (CRYPTO_ARGUMENTS_BAD); + /* FALLTHRU */ + case AES_GCM_MECH_INFO_TYPE: + length_needed = plaintext->cd_length + aes_ctx.ac_tag_len; + break; + default: + length_needed = plaintext->cd_length; + } + + /* return size of buffer needed to store output */ + if (ciphertext->cd_length < length_needed) { + ciphertext->cd_length = length_needed; + ret = CRYPTO_BUFFER_TOO_SMALL; + goto out; + } + + saved_offset = ciphertext->cd_offset; + saved_length = ciphertext->cd_length; + + /* + * Do an update on the specified input data. + */ + switch (plaintext->cd_format) { + case CRYPTO_DATA_RAW: + ret = crypto_update_iov(&aes_ctx, plaintext, ciphertext, + aes_encrypt_contiguous_blocks, aes_copy_block64); + break; + case CRYPTO_DATA_UIO: + ret = crypto_update_uio(&aes_ctx, plaintext, ciphertext, + aes_encrypt_contiguous_blocks, aes_copy_block64); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) { + if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) { + ret = ccm_encrypt_final((ccm_ctx_t *)&aes_ctx, + ciphertext, AES_BLOCK_LEN, aes_encrypt_block, + aes_xor_block); + if (ret != CRYPTO_SUCCESS) + goto out; + ASSERT(aes_ctx.ac_remainder_len == 0); + } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || + mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) { + ret = gcm_encrypt_final((gcm_ctx_t *)&aes_ctx, + ciphertext, AES_BLOCK_LEN, aes_encrypt_block, + aes_copy_block, aes_xor_block); + if (ret != CRYPTO_SUCCESS) + goto out; + ASSERT(aes_ctx.ac_remainder_len == 0); + } else if (mechanism->cm_type == AES_CTR_MECH_INFO_TYPE) { + if (aes_ctx.ac_remainder_len > 0) { + ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx, + ciphertext, aes_encrypt_block); + if (ret != CRYPTO_SUCCESS) + goto out; + } + } else { + ASSERT(aes_ctx.ac_remainder_len == 0); + } + + if (plaintext != ciphertext) { + ciphertext->cd_length = + ciphertext->cd_offset - saved_offset; + } + } else { + ciphertext->cd_length = saved_length; + } + ciphertext->cd_offset = saved_offset; + +out: + if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { + bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); + kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); + } + + return (ret); +} + +/* ARGSUSED */ +static int +aes_decrypt_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *ciphertext, crypto_data_t *plaintext, + crypto_spi_ctx_template_t template, crypto_req_handle_t req) +{ + aes_ctx_t aes_ctx; /* on the stack */ + off_t saved_offset; + size_t saved_length; + size_t length_needed; + int ret; + + ASSERT(plaintext != NULL); + + /* + * CCM, GCM, CTR, and GMAC modes do not require that ciphertext + * be a multiple of AES block size. 
+ */ + switch (mechanism->cm_type) { + case AES_CTR_MECH_INFO_TYPE: + case AES_CCM_MECH_INFO_TYPE: + case AES_GCM_MECH_INFO_TYPE: + case AES_GMAC_MECH_INFO_TYPE: + break; + default: + if ((ciphertext->cd_length & (AES_BLOCK_LEN - 1)) != 0) + return (CRYPTO_ENCRYPTED_DATA_LEN_RANGE); + } + + if ((ret = aes_check_mech_param(mechanism, NULL, 0)) != CRYPTO_SUCCESS) + return (ret); + + bzero(&aes_ctx, sizeof (aes_ctx_t)); + + ret = aes_common_init_ctx(&aes_ctx, template, mechanism, key, + crypto_kmflag(req), B_FALSE); + if (ret != CRYPTO_SUCCESS) + return (ret); + + switch (mechanism->cm_type) { + case AES_CCM_MECH_INFO_TYPE: + length_needed = aes_ctx.ac_data_len; + break; + case AES_GCM_MECH_INFO_TYPE: + length_needed = ciphertext->cd_length - aes_ctx.ac_tag_len; + break; + case AES_GMAC_MECH_INFO_TYPE: + if (plaintext->cd_length != 0) + return (CRYPTO_ARGUMENTS_BAD); + length_needed = 0; + break; + default: + length_needed = ciphertext->cd_length; + } + + /* return size of buffer needed to store output */ + if (plaintext->cd_length < length_needed) { + plaintext->cd_length = length_needed; + ret = CRYPTO_BUFFER_TOO_SMALL; + goto out; + } + + saved_offset = plaintext->cd_offset; + saved_length = plaintext->cd_length; + + if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || + mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) + gcm_set_kmflag((gcm_ctx_t *)&aes_ctx, crypto_kmflag(req)); + + /* + * Do an update on the specified input data. + */ + switch (ciphertext->cd_format) { + case CRYPTO_DATA_RAW: + ret = crypto_update_iov(&aes_ctx, ciphertext, plaintext, + aes_decrypt_contiguous_blocks, aes_copy_block64); + break; + case CRYPTO_DATA_UIO: + ret = crypto_update_uio(&aes_ctx, ciphertext, plaintext, + aes_decrypt_contiguous_blocks, aes_copy_block64); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) { + if (mechanism->cm_type == AES_CCM_MECH_INFO_TYPE) { + ASSERT(aes_ctx.ac_processed_data_len + == aes_ctx.ac_data_len); + ASSERT(aes_ctx.ac_processed_mac_len + == aes_ctx.ac_mac_len); + ret = ccm_decrypt_final((ccm_ctx_t *)&aes_ctx, + plaintext, AES_BLOCK_LEN, aes_encrypt_block, + aes_copy_block, aes_xor_block); + ASSERT(aes_ctx.ac_remainder_len == 0); + if ((ret == CRYPTO_SUCCESS) && + (ciphertext != plaintext)) { + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } else { + plaintext->cd_length = saved_length; + } + } else if (mechanism->cm_type == AES_GCM_MECH_INFO_TYPE || + mechanism->cm_type == AES_GMAC_MECH_INFO_TYPE) { + ret = gcm_decrypt_final((gcm_ctx_t *)&aes_ctx, + plaintext, AES_BLOCK_LEN, aes_encrypt_block, + aes_xor_block); + ASSERT(aes_ctx.ac_remainder_len == 0); + if ((ret == CRYPTO_SUCCESS) && + (ciphertext != plaintext)) { + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } else { + plaintext->cd_length = saved_length; + } + } else if (mechanism->cm_type != AES_CTR_MECH_INFO_TYPE) { + ASSERT(aes_ctx.ac_remainder_len == 0); + if (ciphertext != plaintext) + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } else { + if (aes_ctx.ac_remainder_len > 0) { + ret = ctr_mode_final((ctr_ctx_t *)&aes_ctx, + plaintext, aes_encrypt_block); + if (ret == CRYPTO_DATA_LEN_RANGE) + ret = CRYPTO_ENCRYPTED_DATA_LEN_RANGE; + if (ret != CRYPTO_SUCCESS) + goto out; + } + if (ciphertext != plaintext) + plaintext->cd_length = + plaintext->cd_offset - saved_offset; + } + } else { + plaintext->cd_length = saved_length; + } + plaintext->cd_offset = saved_offset; + +out: + if (aes_ctx.ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { + 
bzero(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); + kmem_free(aes_ctx.ac_keysched, aes_ctx.ac_keysched_len); + } + + if (aes_ctx.ac_flags & CCM_MODE) { + if (aes_ctx.ac_pt_buf != NULL) { + vmem_free(aes_ctx.ac_pt_buf, aes_ctx.ac_data_len); + } + } else if (aes_ctx.ac_flags & (GCM_MODE|GMAC_MODE)) { + if (((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf != NULL) { + vmem_free(((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf, + ((gcm_ctx_t *)&aes_ctx)->gcm_pt_buf_len); + } + } + + return (ret); +} + +/* + * KCF software provider context template entry points. + */ +/* ARGSUSED */ +static int +aes_create_ctx_template(crypto_provider_handle_t provider, + crypto_mechanism_t *mechanism, crypto_key_t *key, + crypto_spi_ctx_template_t *tmpl, size_t *tmpl_size, crypto_req_handle_t req) +{ + void *keysched; + size_t size; + int rv; + + if (mechanism->cm_type != AES_ECB_MECH_INFO_TYPE && + mechanism->cm_type != AES_CBC_MECH_INFO_TYPE && + mechanism->cm_type != AES_CTR_MECH_INFO_TYPE && + mechanism->cm_type != AES_CCM_MECH_INFO_TYPE && + mechanism->cm_type != AES_GCM_MECH_INFO_TYPE && + mechanism->cm_type != AES_GMAC_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + if ((keysched = aes_alloc_keysched(&size, + crypto_kmflag(req))) == NULL) { + return (CRYPTO_HOST_MEMORY); + } + + /* + * Initialize key schedule. Key length information is stored + * in the key. + */ + if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) { + bzero(keysched, size); + kmem_free(keysched, size); + return (rv); + } + + *tmpl = keysched; + *tmpl_size = size; + + return (CRYPTO_SUCCESS); +} + + +static int +aes_free_context(crypto_ctx_t *ctx) +{ + aes_ctx_t *aes_ctx = ctx->cc_provider_private; + + if (aes_ctx != NULL) { + if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { + ASSERT(aes_ctx->ac_keysched_len != 0); + bzero(aes_ctx->ac_keysched, aes_ctx->ac_keysched_len); + kmem_free(aes_ctx->ac_keysched, + aes_ctx->ac_keysched_len); + } + crypto_free_mode_ctx(aes_ctx); + ctx->cc_provider_private = NULL; + } + + return (CRYPTO_SUCCESS); +} + + +static int +aes_common_init_ctx(aes_ctx_t *aes_ctx, crypto_spi_ctx_template_t *template, + crypto_mechanism_t *mechanism, crypto_key_t *key, int kmflag, + boolean_t is_encrypt_init) +{ + int rv = CRYPTO_SUCCESS; + void *keysched; + size_t size = 0; + + if (template == NULL) { + if ((keysched = aes_alloc_keysched(&size, kmflag)) == NULL) + return (CRYPTO_HOST_MEMORY); + /* + * Initialize key schedule. + * Key length is stored in the key. 
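+ * (This matches the layout produced by aes_create_ctx_template(),
+ * which is why a caller-supplied template is used verbatim in the
+ * else branch below.)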
+ */ + if ((rv = init_keysched(key, keysched)) != CRYPTO_SUCCESS) { + kmem_free(keysched, size); + return (rv); + } + + aes_ctx->ac_flags |= PROVIDER_OWNS_KEY_SCHEDULE; + aes_ctx->ac_keysched_len = size; + } else { + keysched = template; + } + aes_ctx->ac_keysched = keysched; + + switch (mechanism->cm_type) { + case AES_CBC_MECH_INFO_TYPE: + rv = cbc_init_ctx((cbc_ctx_t *)aes_ctx, mechanism->cm_param, + mechanism->cm_param_len, AES_BLOCK_LEN, aes_copy_block64); + break; + case AES_CTR_MECH_INFO_TYPE: { + CK_AES_CTR_PARAMS *pp; + + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (CK_AES_CTR_PARAMS)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + pp = (CK_AES_CTR_PARAMS *)(void *)mechanism->cm_param; + rv = ctr_init_ctx((ctr_ctx_t *)aes_ctx, pp->ulCounterBits, + pp->cb, aes_copy_block); + break; + } + case AES_CCM_MECH_INFO_TYPE: + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (CK_AES_CCM_PARAMS)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + rv = ccm_init_ctx((ccm_ctx_t *)aes_ctx, mechanism->cm_param, + kmflag, is_encrypt_init, AES_BLOCK_LEN, aes_encrypt_block, + aes_xor_block); + break; + case AES_GCM_MECH_INFO_TYPE: + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (CK_AES_GCM_PARAMS)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + rv = gcm_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param, + AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + break; + case AES_GMAC_MECH_INFO_TYPE: + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + rv = gmac_init_ctx((gcm_ctx_t *)aes_ctx, mechanism->cm_param, + AES_BLOCK_LEN, aes_encrypt_block, aes_copy_block, + aes_xor_block); + break; + case AES_ECB_MECH_INFO_TYPE: + aes_ctx->ac_flags |= ECB_MODE; + } + + if (rv != CRYPTO_SUCCESS) { + if (aes_ctx->ac_flags & PROVIDER_OWNS_KEY_SCHEDULE) { + bzero(keysched, size); + kmem_free(keysched, size); + } + } + + return (rv); +} + +static int +process_gmac_mech(crypto_mechanism_t *mech, crypto_data_t *data, + CK_AES_GCM_PARAMS *gcm_params) +{ + /* LINTED: pointer alignment */ + CK_AES_GMAC_PARAMS *params = (CK_AES_GMAC_PARAMS *)mech->cm_param; + + if (mech->cm_type != AES_GMAC_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + if (mech->cm_param_len != sizeof (CK_AES_GMAC_PARAMS)) + return (CRYPTO_MECHANISM_PARAM_INVALID); + + if (params->pIv == NULL) + return (CRYPTO_MECHANISM_PARAM_INVALID); + + gcm_params->pIv = params->pIv; + gcm_params->ulIvLen = AES_GMAC_IV_LEN; + gcm_params->ulTagBits = AES_GMAC_TAG_BITS; + + if (data == NULL) + return (CRYPTO_SUCCESS); + + if (data->cd_format != CRYPTO_DATA_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + gcm_params->pAAD = (uchar_t *)data->cd_raw.iov_base; + gcm_params->ulAADLen = data->cd_length; + return (CRYPTO_SUCCESS); +} + +static int +aes_mac_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t template, crypto_req_handle_t req) +{ + CK_AES_GCM_PARAMS gcm_params; + crypto_mechanism_t gcm_mech; + int rv; + + if ((rv = process_gmac_mech(mechanism, data, &gcm_params)) + != CRYPTO_SUCCESS) + return (rv); + + gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE; + gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + gcm_mech.cm_param = (char *)&gcm_params; + + return (aes_encrypt_atomic(provider, session_id, &gcm_mech, + key, &null_crypto_data, 
mac, template, req)); +} + +static int +aes_mac_verify_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t template, crypto_req_handle_t req) +{ + CK_AES_GCM_PARAMS gcm_params; + crypto_mechanism_t gcm_mech; + int rv; + + if ((rv = process_gmac_mech(mechanism, data, &gcm_params)) + != CRYPTO_SUCCESS) + return (rv); + + gcm_mech.cm_type = AES_GCM_MECH_INFO_TYPE; + gcm_mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + gcm_mech.cm_param = (char *)&gcm_params; + + return (aes_decrypt_atomic(provider, session_id, &gcm_mech, + key, mac, &null_crypto_data, template, req)); +} diff --git a/module/icp/io/edonr_mod.c b/module/icp/io/edonr_mod.c new file mode 100644 index 000000000000..a806af610629 --- /dev/null +++ b/module/icp/io/edonr_mod.c @@ -0,0 +1,63 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Unlike sha2 or skein, we won't expose edonr via the Kernel Cryptographic + * Framework (KCF), because Edon-R is *NOT* suitable for general-purpose + * cryptographic use. Users of Edon-R must interface directly to this module. + */ + +static struct modlmisc modlmisc = { + &mod_cryptoops, + "Edon-R Message-Digest Algorithm" +}; + +static struct modlinkage modlinkage = { + MODREV_1, {&modlmisc, NULL} +}; + +int +edonr_mod_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) + return (error); + + return (0); +} + +int +edonr_mod_fini(void) +{ + return (mod_remove(&modlinkage)); +} diff --git a/module/icp/io/sha1_mod.c b/module/icp/io/sha1_mod.c new file mode 100644 index 000000000000..ffae143cded0 --- /dev/null +++ b/module/icp/io/sha1_mod.c @@ -0,0 +1,1230 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include + +#include +#include + +/* + * The sha1 module is created with two modlinkages: + * - a modlmisc that allows consumers to directly call the entry points + * SHA1Init, SHA1Update, and SHA1Final. + * - a modlcrypto that allows the module to register with the Kernel + * Cryptographic Framework (KCF) as a software provider for the SHA1 + * mechanisms. + */ + +static struct modlcrypto modlcrypto = { + &mod_cryptoops, + "SHA1 Kernel SW Provider 1.1" +}; + +static struct modlinkage modlinkage = { + MODREV_1, { &modlcrypto, NULL } +}; + + +/* + * Macros to access the SHA1 or SHA1-HMAC contexts from a context passed + * by KCF to one of the entry points. + */ + +#define PROV_SHA1_CTX(ctx) ((sha1_ctx_t *)(ctx)->cc_provider_private) +#define PROV_SHA1_HMAC_CTX(ctx) ((sha1_hmac_ctx_t *)(ctx)->cc_provider_private) + +/* to extract the digest length passed as mechanism parameter */ +#define PROV_SHA1_GET_DIGEST_LEN(m, len) { \ + if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t))) \ + (len) = (uint32_t)*((ulong_t *)(void *)mechanism->cm_param); \ + else { \ + ulong_t tmp_ulong; \ + bcopy((m)->cm_param, &tmp_ulong, sizeof (ulong_t)); \ + (len) = (uint32_t)tmp_ulong; \ + } \ +} + +#define PROV_SHA1_DIGEST_KEY(ctx, key, len, digest) { \ + SHA1Init(ctx); \ + SHA1Update(ctx, key, len); \ + SHA1Final(digest, ctx); \ +} + +/* + * Mechanism info structure passed to KCF during registration. 
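+ * Each entry carries the mechanism name, its type constant, the
+ * function groups it supports (digest and/or MAC), and the min/max
+ * key sizes together with their unit.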
+ */ +static crypto_mech_info_t sha1_mech_info_tab[] = { + /* SHA1 */ + {SUN_CKM_SHA1, SHA1_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + /* SHA1-HMAC */ + {SUN_CKM_SHA1_HMAC, SHA1_HMAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA1_HMAC_MIN_KEY_LEN, SHA1_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA1-HMAC GENERAL */ + {SUN_CKM_SHA1_HMAC_GENERAL, SHA1_HMAC_GEN_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA1_HMAC_MIN_KEY_LEN, SHA1_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES} +}; + +static void sha1_provider_status(crypto_provider_handle_t, uint_t *); + +static crypto_control_ops_t sha1_control_ops = { + sha1_provider_status +}; + +static int sha1_digest_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_req_handle_t); +static int sha1_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha1_digest_update(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha1_digest_final(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha1_digest_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); + +static crypto_digest_ops_t sha1_digest_ops = { + .digest_init = sha1_digest_init, + .digest = sha1_digest, + .digest_update = sha1_digest_update, + .digest_key = NULL, + .digest_final = sha1_digest_final, + .digest_atomic = sha1_digest_atomic +}; + +static int sha1_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int sha1_mac_update(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha1_mac_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); +static int sha1_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int sha1_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); + +static crypto_mac_ops_t sha1_mac_ops = { + .mac_init = sha1_mac_init, + .mac = NULL, + .mac_update = sha1_mac_update, + .mac_final = sha1_mac_final, + .mac_atomic = sha1_mac_atomic, + .mac_verify_atomic = sha1_mac_verify_atomic +}; + +static int sha1_create_ctx_template(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, + size_t *, crypto_req_handle_t); +static int sha1_free_context(crypto_ctx_t *); + +static crypto_ctx_ops_t sha1_ctx_ops = { + .create_ctx_template = sha1_create_ctx_template, + .free_context = sha1_free_context +}; + +static crypto_ops_t sha1_crypto_ops = {{{{{ + &sha1_control_ops, + &sha1_digest_ops, + NULL, + &sha1_mac_ops, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &sha1_ctx_ops, +}}}}}; + +static crypto_provider_info_t sha1_prov_info = {{{{ + CRYPTO_SPI_VERSION_1, + "SHA1 Software Provider", + CRYPTO_SW_PROVIDER, + NULL, + &sha1_crypto_ops, + sizeof (sha1_mech_info_tab)/sizeof (crypto_mech_info_t), + sha1_mech_info_tab +}}}}; + +static crypto_kcf_provider_handle_t sha1_prov_handle = 0; + +int +sha1_mod_init(void) +{ + int ret; + + if ((ret = mod_install(&modlinkage)) != 0) + return (ret); + + /* + * Register with KCF. 
If the registration fails, log an + * error but do not uninstall the module, since the functionality + * provided by misc/sha1 should still be available. + */ + if ((ret = crypto_register_provider(&sha1_prov_info, + &sha1_prov_handle)) != CRYPTO_SUCCESS) + cmn_err(CE_WARN, "sha1 _init: " + "crypto_register_provider() failed (0x%x)", ret); + + return (0); +} + +int +sha1_mod_fini(void) +{ + int ret; + + if (sha1_prov_handle != 0) { + if ((ret = crypto_unregister_provider(sha1_prov_handle)) != + CRYPTO_SUCCESS) { + cmn_err(CE_WARN, + "sha1 _fini: crypto_unregister_provider() " + "failed (0x%x)", ret); + return (EBUSY); + } + sha1_prov_handle = 0; + } + + return (mod_remove(&modlinkage)); +} + +/* + * KCF software provider control entry points. + */ +/* ARGSUSED */ +static void +sha1_provider_status(crypto_provider_handle_t provider, uint_t *status) +{ + *status = CRYPTO_PROVIDER_READY; +} + +/* + * KCF software provider digest entry points. + */ + +static int +sha1_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_req_handle_t req) +{ + if (mechanism->cm_type != SHA1_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + /* + * Allocate and initialize SHA1 context. + */ + ctx->cc_provider_private = kmem_alloc(sizeof (sha1_ctx_t), + crypto_kmflag(req)); + if (ctx->cc_provider_private == NULL) + return (CRYPTO_HOST_MEMORY); + + PROV_SHA1_CTX(ctx)->sc_mech_type = SHA1_MECH_INFO_TYPE; + SHA1Init(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx); + + return (CRYPTO_SUCCESS); +} + +/* + * Helper SHA1 digest update function for uio data. + */ +static int +sha1_digest_update_uio(SHA1_CTX *sha1_ctx, crypto_data_t *data) +{ + off_t offset = data->cd_offset; + size_t length = data->cd_length; + uint_t vec_idx = 0; + size_t cur_len; + + /* we support only kernel buffer */ + if (uio_segflg(data->cd_uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing data to be + * digested. + */ + offset = uio_index_at_offset(data->cd_uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(data->cd_uio)) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + /* + * Now do the digesting on the iovecs. + */ + while (vec_idx < uio_iovcnt(data->cd_uio) && length > 0) { + cur_len = MIN(uio_iovlen(data->cd_uio, vec_idx) - + offset, length); + + SHA1Update(sha1_ctx, + (uint8_t *)uio_iovbase(data->cd_uio, vec_idx) + offset, + cur_len); + + length -= cur_len; + vec_idx++; + offset = 0; + } + + if (vec_idx == uio_iovcnt(data->cd_uio) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + return (CRYPTO_SUCCESS); +} + +/* + * Helper SHA1 digest final function for uio data. + * digest_len is the length of the desired digest. If digest_len + * is smaller than the default SHA1 digest length, the caller + * must pass a scratch buffer, digest_scratch, which must + * be at least SHA1_DIGEST_LENGTH bytes. + */ +static int +sha1_digest_final_uio(SHA1_CTX *sha1_ctx, crypto_data_t *digest, + ulong_t digest_len, uchar_t *digest_scratch) +{ + off_t offset = digest->cd_offset; + uint_t vec_idx = 0; + + /* we support only kernel buffer */ + if (uio_segflg(digest->cd_uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing ptr to the digest to + * be returned. 
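+ * E.g. with two iovecs of 16 and 64 bytes and cd_offset 20, this
+ * lands at vec_idx 1 with a residual offset of 4 into that iovec.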
+ */ + offset = uio_index_at_offset(digest->cd_uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(digest->cd_uio)) { + /* + * The caller specified an offset that is + * larger than the total size of the buffers + * it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + if (offset + digest_len <= + uio_iovlen(digest->cd_uio, vec_idx)) { + /* + * The computed SHA1 digest will fit in the current + * iovec. + */ + if (digest_len != SHA1_DIGEST_LENGTH) { + /* + * The caller requested a short digest. Digest + * into a scratch buffer and return to + * the user only what was requested. + */ + SHA1Final(digest_scratch, sha1_ctx); + bcopy(digest_scratch, (uchar_t *)uio_iovbase(digest-> + cd_uio, vec_idx) + offset, + digest_len); + } else { + SHA1Final((uchar_t *)uio_iovbase(digest-> + cd_uio, vec_idx) + offset, + sha1_ctx); + } + } else { + /* + * The computed digest will be crossing one or more iovec's. + * This is bad performance-wise but we need to support it. + * Allocate a small scratch buffer on the stack and + * copy it piece meal to the specified digest iovec's. + */ + uchar_t digest_tmp[SHA1_DIGEST_LENGTH]; + off_t scratch_offset = 0; + size_t length = digest_len; + size_t cur_len; + + SHA1Final(digest_tmp, sha1_ctx); + + while (vec_idx < uio_iovcnt(digest->cd_uio) && length > 0) { + cur_len = MIN(uio_iovlen(digest->cd_uio, vec_idx) - + offset, length); + bcopy(digest_tmp + scratch_offset, + uio_iovbase(digest->cd_uio, vec_idx) + offset, + cur_len); + + length -= cur_len; + vec_idx++; + scratch_offset += cur_len; + offset = 0; + } + + if (vec_idx == uio_iovcnt(digest->cd_uio) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it + * provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + } + + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +static int +sha1_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + + ASSERT(ctx->cc_provider_private != NULL); + + /* + * We need to just return the length needed to store the output. + * We should not destroy the context for the following cases. + */ + if ((digest->cd_length == 0) || + (digest->cd_length < SHA1_DIGEST_LENGTH)) { + digest->cd_length = SHA1_DIGEST_LENGTH; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + /* + * Do the SHA1 update on the specified input data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Update(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_update_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx, + data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret != CRYPTO_SUCCESS) { + /* the update failed, free context and bail */ + kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t)); + ctx->cc_provider_private = NULL; + digest->cd_length = 0; + return (ret); + } + + /* + * Do a SHA1 final, must be done separately since the digest + * type can be different than the input data type. 
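+ * (The caller may digest raw data yet ask for the result in a uio,
+ * or vice versa.)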
+ */ + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Final((unsigned char *)digest->cd_raw.iov_base + + digest->cd_offset, &PROV_SHA1_CTX(ctx)->sc_sha1_ctx); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_final_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx, + digest, SHA1_DIGEST_LENGTH, NULL); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* all done, free context and return */ + + if (ret == CRYPTO_SUCCESS) { + digest->cd_length = SHA1_DIGEST_LENGTH; + } else { + digest->cd_length = 0; + } + + kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t)); + ctx->cc_provider_private = NULL; + return (ret); +} + +/* ARGSUSED */ +static int +sha1_digest_update(crypto_ctx_t *ctx, crypto_data_t *data, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + + ASSERT(ctx->cc_provider_private != NULL); + + /* + * Do the SHA1 update on the specified input data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Update(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_update_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx, + data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + return (ret); +} + +/* ARGSUSED */ +static int +sha1_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + + ASSERT(ctx->cc_provider_private != NULL); + + /* + * We need to just return the length needed to store the output. + * We should not destroy the context for the following cases. + */ + if ((digest->cd_length == 0) || + (digest->cd_length < SHA1_DIGEST_LENGTH)) { + digest->cd_length = SHA1_DIGEST_LENGTH; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + /* + * Do a SHA1 final. + */ + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Final((unsigned char *)digest->cd_raw.iov_base + + digest->cd_offset, &PROV_SHA1_CTX(ctx)->sc_sha1_ctx); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_final_uio(&PROV_SHA1_CTX(ctx)->sc_sha1_ctx, + digest, SHA1_DIGEST_LENGTH, NULL); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* all done, free context and return */ + + if (ret == CRYPTO_SUCCESS) { + digest->cd_length = SHA1_DIGEST_LENGTH; + } else { + digest->cd_length = 0; + } + + kmem_free(ctx->cc_provider_private, sizeof (sha1_ctx_t)); + ctx->cc_provider_private = NULL; + + return (ret); +} + +/* ARGSUSED */ +static int +sha1_digest_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_data_t *data, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + SHA1_CTX sha1_ctx; + + if (mechanism->cm_type != SHA1_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + /* + * Do the SHA1 init. + */ + SHA1Init(&sha1_ctx); + + /* + * Do the SHA1 update on the specified input data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Update(&sha1_ctx, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_update_uio(&sha1_ctx, data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret != CRYPTO_SUCCESS) { + /* the update failed, bail */ + digest->cd_length = 0; + return (ret); + } + + /* + * Do a SHA1 final, must be done separately since the digest + * type can be different than the input data type. 
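+ * Unlike sha1_digest(), the context here lives on the stack, so
+ * there is nothing to free on failure.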
+ */ + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Final((unsigned char *)digest->cd_raw.iov_base + + digest->cd_offset, &sha1_ctx); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_final_uio(&sha1_ctx, digest, + SHA1_DIGEST_LENGTH, NULL); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) { + digest->cd_length = SHA1_DIGEST_LENGTH; + } else { + digest->cd_length = 0; + } + + return (ret); +} + +/* + * KCF software provider mac entry points. + * + * SHA1 HMAC is: SHA1(key XOR opad, SHA1(key XOR ipad, text)) + * + * Init: + * The initialization routine initializes what we denote + * as the inner and outer contexts by doing + * - for inner context: SHA1(key XOR ipad) + * - for outer context: SHA1(key XOR opad) + * + * Update: + * Each subsequent SHA1 HMAC update will result in an + * update of the inner context with the specified data. + * + * Final: + * The SHA1 HMAC final will do a SHA1 final operation on the + * inner context, and the resulting digest will be used + * as the data for an update on the outer context. Last + * but not least, a SHA1 final on the outer context will + * be performed to obtain the SHA1 HMAC digest to return + * to the user. + */ + +/* + * Initialize a SHA1-HMAC context. + */ +static void +sha1_mac_init_ctx(sha1_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes) +{ + uint32_t ipad[SHA1_HMAC_INTS_PER_BLOCK]; + uint32_t opad[SHA1_HMAC_INTS_PER_BLOCK]; + uint_t i; + + bzero(ipad, SHA1_HMAC_BLOCK_SIZE); + bzero(opad, SHA1_HMAC_BLOCK_SIZE); + + bcopy(keyval, ipad, length_in_bytes); + bcopy(keyval, opad, length_in_bytes); + + /* XOR key with ipad (0x36) and opad (0x5c) */ + for (i = 0; i < SHA1_HMAC_INTS_PER_BLOCK; i++) { + ipad[i] ^= 0x36363636; + opad[i] ^= 0x5c5c5c5c; + } + + /* perform SHA1 on ipad */ + SHA1Init(&ctx->hc_icontext); + SHA1Update(&ctx->hc_icontext, (uint8_t *)ipad, SHA1_HMAC_BLOCK_SIZE); + + /* perform SHA1 on opad */ + SHA1Init(&ctx->hc_ocontext); + SHA1Update(&ctx->hc_ocontext, (uint8_t *)opad, SHA1_HMAC_BLOCK_SIZE); +} + +/* + */ +static int +sha1_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_spi_ctx_template_t ctx_template, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + + if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE && + mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + /* Add support for key by attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + ctx->cc_provider_private = kmem_alloc(sizeof (sha1_hmac_ctx_t), + crypto_kmflag(req)); + if (ctx->cc_provider_private == NULL) + return (CRYPTO_HOST_MEMORY); + + if (ctx_template != NULL) { + /* reuse context template */ + bcopy(ctx_template, PROV_SHA1_HMAC_CTX(ctx), + sizeof (sha1_hmac_ctx_t)); + } else { + /* no context template, compute context */ + if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) { + uchar_t digested_key[SHA1_DIGEST_LENGTH]; + sha1_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private; + + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. 
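+ * Per RFC 2104, a key longer than the 64-byte block size is first
+ * reduced to its 20-byte SHA1 digest.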
+ */ + PROV_SHA1_DIGEST_KEY(&hmac_ctx->hc_icontext, + key->ck_data, keylen_in_bytes, digested_key); + sha1_mac_init_ctx(PROV_SHA1_HMAC_CTX(ctx), + digested_key, SHA1_DIGEST_LENGTH); + } else { + sha1_mac_init_ctx(PROV_SHA1_HMAC_CTX(ctx), + key->ck_data, keylen_in_bytes); + } + } + + /* + * Get the mechanism parameters, if applicable. + */ + PROV_SHA1_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type; + if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) { + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (ulong_t)) + ret = CRYPTO_MECHANISM_PARAM_INVALID; + PROV_SHA1_GET_DIGEST_LEN(mechanism, + PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len); + if (PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len > + SHA1_DIGEST_LENGTH) + ret = CRYPTO_MECHANISM_PARAM_INVALID; + } + + if (ret != CRYPTO_SUCCESS) { + bzero(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t)); + kmem_free(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t)); + ctx->cc_provider_private = NULL; + } + + return (ret); +} + +/* ARGSUSED */ +static int +sha1_mac_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + + ASSERT(ctx->cc_provider_private != NULL); + + /* + * Do a SHA1 update of the inner context using the specified + * data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA1Update(&PROV_SHA1_HMAC_CTX(ctx)->hc_icontext, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_update_uio( + &PROV_SHA1_HMAC_CTX(ctx)->hc_icontext, data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + return (ret); +} + +/* ARGSUSED */ +static int +sha1_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uchar_t digest[SHA1_DIGEST_LENGTH]; + uint32_t digest_len = SHA1_DIGEST_LENGTH; + + ASSERT(ctx->cc_provider_private != NULL); + + if (PROV_SHA1_HMAC_CTX(ctx)->hc_mech_type == + SHA1_HMAC_GEN_MECH_INFO_TYPE) + digest_len = PROV_SHA1_HMAC_CTX(ctx)->hc_digest_len; + + /* + * We need to just return the length needed to store the output. + * We should not destroy the context for the following cases. + */ + if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) { + mac->cd_length = digest_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + /* + * Do a SHA1 final on the inner context. + */ + SHA1Final(digest, &PROV_SHA1_HMAC_CTX(ctx)->hc_icontext); + + /* + * Do a SHA1 update on the outer context, feeding the inner + * digest as data. + */ + SHA1Update(&PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext, digest, + SHA1_DIGEST_LENGTH); + + /* + * Do a SHA1 final on the outer context, storing the computing + * digest in the users buffer. + */ + switch (mac->cd_format) { + case CRYPTO_DATA_RAW: + if (digest_len != SHA1_DIGEST_LENGTH) { + /* + * The caller requested a short digest. Digest + * into a scratch buffer and return to + * the user only what was requested. 
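+ * (HMAC-SHA1-96 as used by IPsec, for instance, keeps only the
+ * first 12 of the 20 digest bytes.)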
+ */ + SHA1Final(digest, + &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext); + bcopy(digest, (unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, digest_len); + } else { + SHA1Final((unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, + &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext); + } + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_final_uio( + &PROV_SHA1_HMAC_CTX(ctx)->hc_ocontext, mac, + digest_len, digest); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) { + mac->cd_length = digest_len; + } else { + mac->cd_length = 0; + } + + bzero(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t)); + kmem_free(ctx->cc_provider_private, sizeof (sha1_hmac_ctx_t)); + ctx->cc_provider_private = NULL; + + return (ret); +} + +#define SHA1_MAC_UPDATE(data, ctx, ret) { \ + switch (data->cd_format) { \ + case CRYPTO_DATA_RAW: \ + SHA1Update(&(ctx).hc_icontext, \ + (uint8_t *)data->cd_raw.iov_base + \ + data->cd_offset, data->cd_length); \ + break; \ + case CRYPTO_DATA_UIO: \ + ret = sha1_digest_update_uio(&(ctx).hc_icontext, data); \ + break; \ + default: \ + ret = CRYPTO_ARGUMENTS_BAD; \ + } \ +} + +/* ARGSUSED */ +static int +sha1_mac_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uchar_t digest[SHA1_DIGEST_LENGTH]; + sha1_hmac_ctx_t sha1_hmac_ctx; + uint32_t digest_len = SHA1_DIGEST_LENGTH; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + + if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE && + mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + /* Add support for key by attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + if (ctx_template != NULL) { + /* reuse context template */ + bcopy(ctx_template, &sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t)); + } else { + /* no context template, initialize context */ + if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) { + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. + */ + PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx.hc_icontext, + key->ck_data, keylen_in_bytes, digest); + sha1_mac_init_ctx(&sha1_hmac_ctx, digest, + SHA1_DIGEST_LENGTH); + } else { + sha1_mac_init_ctx(&sha1_hmac_ctx, key->ck_data, + keylen_in_bytes); + } + } + + /* get the mechanism parameters, if applicable */ + if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) { + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (ulong_t)) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + PROV_SHA1_GET_DIGEST_LEN(mechanism, digest_len); + if (digest_len > SHA1_DIGEST_LENGTH) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + } + + /* do a SHA1 update of the inner context using the specified data */ + SHA1_MAC_UPDATE(data, sha1_hmac_ctx, ret); + if (ret != CRYPTO_SUCCESS) + /* the update failed, free context and bail */ + goto bail; + + /* + * Do a SHA1 final on the inner context. + */ + SHA1Final(digest, &sha1_hmac_ctx.hc_icontext); + + /* + * Do an SHA1 update on the outer context, feeding the inner + * digest as data. + */ + SHA1Update(&sha1_hmac_ctx.hc_ocontext, digest, SHA1_DIGEST_LENGTH); + + /* + * Do a SHA1 final on the outer context, storing the computed + * digest in the users buffer. 
+ */ + switch (mac->cd_format) { + case CRYPTO_DATA_RAW: + if (digest_len != SHA1_DIGEST_LENGTH) { + /* + * The caller requested a short digest. Digest + * into a scratch buffer and return to + * the user only what was requested. + */ + SHA1Final(digest, &sha1_hmac_ctx.hc_ocontext); + bcopy(digest, (unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, digest_len); + } else { + SHA1Final((unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, &sha1_hmac_ctx.hc_ocontext); + } + break; + case CRYPTO_DATA_UIO: + ret = sha1_digest_final_uio(&sha1_hmac_ctx.hc_ocontext, mac, + digest_len, digest); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) { + mac->cd_length = digest_len; + } else { + mac->cd_length = 0; + } + /* Extra paranoia: zeroize the context on the stack */ + bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t)); + + return (ret); +bail: + bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t)); + mac->cd_length = 0; + return (ret); +} + +/* ARGSUSED */ +static int +sha1_mac_verify_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uchar_t digest[SHA1_DIGEST_LENGTH]; + sha1_hmac_ctx_t sha1_hmac_ctx; + uint32_t digest_len = SHA1_DIGEST_LENGTH; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + + if (mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE && + mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE) + return (CRYPTO_MECHANISM_INVALID); + + /* Add support for key by attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + if (ctx_template != NULL) { + /* reuse context template */ + bcopy(ctx_template, &sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t)); + } else { + /* no context template, initialize context */ + if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) { + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. + */ + PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx.hc_icontext, + key->ck_data, keylen_in_bytes, digest); + sha1_mac_init_ctx(&sha1_hmac_ctx, digest, + SHA1_DIGEST_LENGTH); + } else { + sha1_mac_init_ctx(&sha1_hmac_ctx, key->ck_data, + keylen_in_bytes); + } + } + + /* get the mechanism parameters, if applicable */ + if (mechanism->cm_type == SHA1_HMAC_GEN_MECH_INFO_TYPE) { + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (ulong_t)) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + PROV_SHA1_GET_DIGEST_LEN(mechanism, digest_len); + if (digest_len > SHA1_DIGEST_LENGTH) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + } + + if (mac->cd_length != digest_len) { + ret = CRYPTO_INVALID_MAC; + goto bail; + } + + /* do a SHA1 update of the inner context using the specified data */ + SHA1_MAC_UPDATE(data, sha1_hmac_ctx, ret); + if (ret != CRYPTO_SUCCESS) + /* the update failed, free context and bail */ + goto bail; + + /* do a SHA1 final on the inner context */ + SHA1Final(digest, &sha1_hmac_ctx.hc_icontext); + + /* + * Do an SHA1 update on the outer context, feeding the inner + * digest as data. + */ + SHA1Update(&sha1_hmac_ctx.hc_ocontext, digest, SHA1_DIGEST_LENGTH); + + /* + * Do a SHA1 final on the outer context, storing the computed + * digest in the users buffer. 
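+ * (In this verify path the computed digest stays in the local
+ * scratch buffer and is only compared against the supplied mac,
+ * never copied out.)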
+ */ + SHA1Final(digest, &sha1_hmac_ctx.hc_ocontext); + + /* + * Compare the computed digest against the expected digest passed + * as argument. + */ + + switch (mac->cd_format) { + + case CRYPTO_DATA_RAW: + if (bcmp(digest, (unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, digest_len) != 0) + ret = CRYPTO_INVALID_MAC; + break; + + case CRYPTO_DATA_UIO: { + off_t offset = mac->cd_offset; + uint_t vec_idx = 0; + off_t scratch_offset = 0; + size_t length = digest_len; + size_t cur_len; + + /* we support only kernel buffer */ + if (uio_segflg(mac->cd_uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* jump to the first iovec containing the expected digest */ + offset = uio_index_at_offset(mac->cd_uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(mac->cd_uio)) { + /* + * The caller specified an offset that is + * larger than the total size of the buffers + * it provided. + */ + ret = CRYPTO_DATA_LEN_RANGE; + break; + } + + /* do the comparison of computed digest vs specified one */ + while (vec_idx < uio_iovcnt(mac->cd_uio) && length > 0) { + cur_len = MIN(uio_iovlen(mac->cd_uio, vec_idx) - + offset, length); + + if (bcmp(digest + scratch_offset, + uio_iovbase(mac->cd_uio, vec_idx) + offset, + cur_len) != 0) { + ret = CRYPTO_INVALID_MAC; + break; + } + + length -= cur_len; + vec_idx++; + scratch_offset += cur_len; + offset = 0; + } + break; + } + + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t)); + return (ret); +bail: + bzero(&sha1_hmac_ctx, sizeof (sha1_hmac_ctx_t)); + mac->cd_length = 0; + return (ret); +} + +/* + * KCF software provider context management entry points. + */ + +/* ARGSUSED */ +static int +sha1_create_ctx_template(crypto_provider_handle_t provider, + crypto_mechanism_t *mechanism, crypto_key_t *key, + crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size, + crypto_req_handle_t req) +{ + sha1_hmac_ctx_t *sha1_hmac_ctx_tmpl; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + + if ((mechanism->cm_type != SHA1_HMAC_MECH_INFO_TYPE) && + (mechanism->cm_type != SHA1_HMAC_GEN_MECH_INFO_TYPE)) { + return (CRYPTO_MECHANISM_INVALID); + } + + /* Add support for key by attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Allocate and initialize SHA1 context. + */ + sha1_hmac_ctx_tmpl = kmem_alloc(sizeof (sha1_hmac_ctx_t), + crypto_kmflag(req)); + if (sha1_hmac_ctx_tmpl == NULL) + return (CRYPTO_HOST_MEMORY); + + if (keylen_in_bytes > SHA1_HMAC_BLOCK_SIZE) { + uchar_t digested_key[SHA1_DIGEST_LENGTH]; + + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. + */ + PROV_SHA1_DIGEST_KEY(&sha1_hmac_ctx_tmpl->hc_icontext, + key->ck_data, keylen_in_bytes, digested_key); + sha1_mac_init_ctx(sha1_hmac_ctx_tmpl, digested_key, + SHA1_DIGEST_LENGTH); + } else { + sha1_mac_init_ctx(sha1_hmac_ctx_tmpl, key->ck_data, + keylen_in_bytes); + } + + sha1_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type; + *ctx_template = (crypto_spi_ctx_template_t)sha1_hmac_ctx_tmpl; + *ctx_template_size = sizeof (sha1_hmac_ctx_t); + + + return (CRYPTO_SUCCESS); +} + +static int +sha1_free_context(crypto_ctx_t *ctx) +{ + uint_t ctx_len; + sha1_mech_type_t mech_type; + + if (ctx->cc_provider_private == NULL) + return (CRYPTO_SUCCESS); + + /* + * We have to free either SHA1 or SHA1-HMAC contexts, which + * have different lengths. 
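+ * Reading sc_mech_type first is safe because both context
+ * structures begin with the mechanism type field.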
+ */ + + mech_type = PROV_SHA1_CTX(ctx)->sc_mech_type; + if (mech_type == SHA1_MECH_INFO_TYPE) + ctx_len = sizeof (sha1_ctx_t); + else { + ASSERT(mech_type == SHA1_HMAC_MECH_INFO_TYPE || + mech_type == SHA1_HMAC_GEN_MECH_INFO_TYPE); + ctx_len = sizeof (sha1_hmac_ctx_t); + } + + bzero(ctx->cc_provider_private, ctx_len); + kmem_free(ctx->cc_provider_private, ctx_len); + ctx->cc_provider_private = NULL; + + return (CRYPTO_SUCCESS); +} diff --git a/module/icp/io/sha2_mod.c b/module/icp/io/sha2_mod.c new file mode 100644 index 000000000000..a4a5c6041dd0 --- /dev/null +++ b/module/icp/io/sha2_mod.c @@ -0,0 +1,1399 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#define _SHA2_IMPL +#include +#include + +/* + * The sha2 module is created with two modlinkages: + * - a modlmisc that allows consumers to directly call the entry points + * SHA2Init, SHA2Update, and SHA2Final. + * - a modlcrypto that allows the module to register with the Kernel + * Cryptographic Framework (KCF) as a software provider for the SHA2 + * mechanisms. + */ + +static struct modlcrypto modlcrypto = { + &mod_cryptoops, + "SHA2 Kernel SW Provider" +}; + +static struct modlinkage modlinkage = { + MODREV_1, {&modlcrypto, NULL} +}; + +/* + * Macros to access the SHA2 or SHA2-HMAC contexts from a context passed + * by KCF to one of the entry points. + */ + +#define PROV_SHA2_CTX(ctx) ((sha2_ctx_t *)(ctx)->cc_provider_private) +#define PROV_SHA2_HMAC_CTX(ctx) ((sha2_hmac_ctx_t *)(ctx)->cc_provider_private) + +/* to extract the digest length passed as mechanism parameter */ +#define PROV_SHA2_GET_DIGEST_LEN(m, len) { \ + if (IS_P2ALIGNED((m)->cm_param, sizeof (ulong_t))) \ + (len) = (uint32_t)*((ulong_t *)(m)->cm_param); \ + else { \ + ulong_t tmp_ulong; \ + bcopy((m)->cm_param, &tmp_ulong, sizeof (ulong_t)); \ + (len) = (uint32_t)tmp_ulong; \ + } \ +} + +#define PROV_SHA2_DIGEST_KEY(mech, ctx, key, len, digest) { \ + SHA2Init(mech, ctx); \ + SHA2Update(ctx, key, len); \ + SHA2Final(digest, ctx); \ +} + +/* + * Mechanism info structure passed to KCF during registration. 
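+ * Nine entries in all: a plain digest, an HMAC, and a
+ * general-length HMAC flavor for each of SHA256, SHA384 and SHA512.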
+ */ +static crypto_mech_info_t sha2_mech_info_tab[] = { + /* SHA256 */ + {SUN_CKM_SHA256, SHA256_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + /* SHA256-HMAC */ + {SUN_CKM_SHA256_HMAC, SHA256_HMAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA256-HMAC GENERAL */ + {SUN_CKM_SHA256_HMAC_GENERAL, SHA256_HMAC_GEN_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA384 */ + {SUN_CKM_SHA384, SHA384_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + /* SHA384-HMAC */ + {SUN_CKM_SHA384_HMAC, SHA384_HMAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA384-HMAC GENERAL */ + {SUN_CKM_SHA384_HMAC_GENERAL, SHA384_HMAC_GEN_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA512 */ + {SUN_CKM_SHA512, SHA512_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + /* SHA512-HMAC */ + {SUN_CKM_SHA512_HMAC, SHA512_HMAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + /* SHA512-HMAC GENERAL */ + {SUN_CKM_SHA512_HMAC_GENERAL, SHA512_HMAC_GEN_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, + SHA2_HMAC_MIN_KEY_LEN, SHA2_HMAC_MAX_KEY_LEN, + CRYPTO_KEYSIZE_UNIT_IN_BYTES} +}; + +static void sha2_provider_status(crypto_provider_handle_t, uint_t *); + +static crypto_control_ops_t sha2_control_ops = { + sha2_provider_status +}; + +static int sha2_digest_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_req_handle_t); +static int sha2_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha2_digest_update(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha2_digest_final(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha2_digest_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); + +static crypto_digest_ops_t sha2_digest_ops = { + .digest_init = sha2_digest_init, + .digest = sha2_digest, + .digest_update = sha2_digest_update, + .digest_key = NULL, + .digest_final = sha2_digest_final, + .digest_atomic = sha2_digest_atomic +}; + +static int sha2_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int sha2_mac_update(crypto_ctx_t *, crypto_data_t *, + crypto_req_handle_t); +static int sha2_mac_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); +static int sha2_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int sha2_mac_verify_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); + +static crypto_mac_ops_t sha2_mac_ops = { + .mac_init = sha2_mac_init, + .mac = NULL, + .mac_update = sha2_mac_update, + .mac_final = sha2_mac_final, + .mac_atomic = sha2_mac_atomic, + .mac_verify_atomic = 
sha2_mac_verify_atomic +}; + +static int sha2_create_ctx_template(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, + size_t *, crypto_req_handle_t); +static int sha2_free_context(crypto_ctx_t *); + +static crypto_ctx_ops_t sha2_ctx_ops = { + .create_ctx_template = sha2_create_ctx_template, + .free_context = sha2_free_context +}; + +static crypto_ops_t sha2_crypto_ops = {{{{{ + &sha2_control_ops, + &sha2_digest_ops, + NULL, + &sha2_mac_ops, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &sha2_ctx_ops +}}}}}; + +static crypto_provider_info_t sha2_prov_info = {{{{ + CRYPTO_SPI_VERSION_1, + "SHA2 Software Provider", + CRYPTO_SW_PROVIDER, + NULL, + &sha2_crypto_ops, + sizeof (sha2_mech_info_tab)/sizeof (crypto_mech_info_t), + sha2_mech_info_tab +}}}}; + +static crypto_kcf_provider_handle_t sha2_prov_handle = 0; + +int +sha2_mod_init(void) +{ + int ret; + + if ((ret = mod_install(&modlinkage)) != 0) + return (ret); + + /* + * Register with KCF. If the registration fails, log an + * error but do not uninstall the module, since the functionality + * provided by misc/sha2 should still be available. + */ + if ((ret = crypto_register_provider(&sha2_prov_info, + &sha2_prov_handle)) != CRYPTO_SUCCESS) + cmn_err(CE_WARN, "sha2 _init: " + "crypto_register_provider() failed (0x%x)", ret); + + return (0); +} + +int +sha2_mod_fini(void) +{ + int ret; + + if (sha2_prov_handle != 0) { + if ((ret = crypto_unregister_provider(sha2_prov_handle)) != + CRYPTO_SUCCESS) { + cmn_err(CE_WARN, + "sha2 _fini: crypto_unregister_provider() " + "failed (0x%x)", ret); + return (EBUSY); + } + sha2_prov_handle = 0; + } + + return (mod_remove(&modlinkage)); +} + +/* + * KCF software provider control entry points. + */ +/* ARGSUSED */ +static void +sha2_provider_status(crypto_provider_handle_t provider, uint_t *status) +{ + *status = CRYPTO_PROVIDER_READY; +} + +/* + * KCF software provider digest entry points. + */ + +static int +sha2_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_req_handle_t req) +{ + + /* + * Allocate and initialize SHA2 context. + */ + ctx->cc_provider_private = kmem_alloc(sizeof (sha2_ctx_t), + crypto_kmflag(req)); + if (ctx->cc_provider_private == NULL) + return (CRYPTO_HOST_MEMORY); + + PROV_SHA2_CTX(ctx)->sc_mech_type = mechanism->cm_type; + SHA2Init(mechanism->cm_type, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); + + return (CRYPTO_SUCCESS); +} + +/* + * Helper SHA2 digest update function for uio data. + */ +static int +sha2_digest_update_uio(SHA2_CTX *sha2_ctx, crypto_data_t *data) +{ + off_t offset = data->cd_offset; + size_t length = data->cd_length; + uint_t vec_idx = 0; + size_t cur_len; + + /* we support only kernel buffer */ + if (uio_segflg(data->cd_uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing data to be + * digested. + */ + offset = uio_index_at_offset(data->cd_uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(data->cd_uio)) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + /* + * Now do the digesting on the iovecs. 
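+ * Each pass consumes MIN(bytes left in the iovec, bytes left to
+ * digest); after the first iovec the intra-iovec offset drops to 0.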
+ */ + while (vec_idx < uio_iovcnt(data->cd_uio) && length > 0) { + cur_len = MIN(uio_iovlen(data->cd_uio, vec_idx) - + offset, length); + + SHA2Update(sha2_ctx, (uint8_t *)uio_iovbase(data->cd_uio, + vec_idx) + offset, cur_len); + length -= cur_len; + vec_idx++; + offset = 0; + } + + if (vec_idx == uio_iovcnt(data->cd_uio) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + return (CRYPTO_SUCCESS); +} + +/* + * Helper SHA2 digest final function for uio data. + * digest_len is the length of the desired digest. If digest_len + * is smaller than the default SHA2 digest length, the caller + * must pass a scratch buffer, digest_scratch, which must + * be at least the algorithm's digest length bytes. + */ +static int +sha2_digest_final_uio(SHA2_CTX *sha2_ctx, crypto_data_t *digest, + ulong_t digest_len, uchar_t *digest_scratch) +{ + off_t offset = digest->cd_offset; + uint_t vec_idx = 0; + + /* we support only kernel buffer */ + if (uio_segflg(digest->cd_uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing ptr to the digest to + * be returned. + */ + offset = uio_index_at_offset(digest->cd_uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(digest->cd_uio)) { + /* + * The caller specified an offset that is + * larger than the total size of the buffers + * it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + if (offset + digest_len <= + uio_iovlen(digest->cd_uio, vec_idx)) { + /* + * The computed SHA2 digest will fit in the current + * iovec. + */ + if (((sha2_ctx->algotype <= SHA256_HMAC_GEN_MECH_INFO_TYPE) && + (digest_len != SHA256_DIGEST_LENGTH)) || + ((sha2_ctx->algotype > SHA256_HMAC_GEN_MECH_INFO_TYPE) && + (digest_len != SHA512_DIGEST_LENGTH))) { + /* + * The caller requested a short digest. Digest + * into a scratch buffer and return to + * the user only what was requested. + */ + SHA2Final(digest_scratch, sha2_ctx); + + bcopy(digest_scratch, (uchar_t *)uio_iovbase(digest-> + cd_uio, vec_idx) + offset, + digest_len); + } else { + SHA2Final((uchar_t *)uio_iovbase(digest-> + cd_uio, vec_idx) + offset, + sha2_ctx); + + } + } else { + /* + * The computed digest will be crossing one or more iovec's. + * This is bad performance-wise but we need to support it. + * Allocate a small scratch buffer on the stack and + * copy it piece meal to the specified digest iovec's. + */ + uchar_t digest_tmp[SHA512_DIGEST_LENGTH]; + off_t scratch_offset = 0; + size_t length = digest_len; + size_t cur_len; + + SHA2Final(digest_tmp, sha2_ctx); + + while (vec_idx < uio_iovcnt(digest->cd_uio) && length > 0) { + cur_len = + MIN(uio_iovlen(digest->cd_uio, vec_idx) - + offset, length); + bcopy(digest_tmp + scratch_offset, + uio_iovbase(digest->cd_uio, vec_idx) + offset, + cur_len); + + length -= cur_len; + vec_idx++; + scratch_offset += cur_len; + offset = 0; + } + + if (vec_idx == uio_iovcnt(digest->cd_uio) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it + * provided. 
+ */ + return (CRYPTO_DATA_LEN_RANGE); + } + } + + return (CRYPTO_SUCCESS); +} + +/* ARGSUSED */ +static int +sha2_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uint_t sha_digest_len; + + ASSERT(ctx->cc_provider_private != NULL); + + switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { + case SHA256_MECH_INFO_TYPE: + sha_digest_len = SHA256_DIGEST_LENGTH; + break; + case SHA384_MECH_INFO_TYPE: + sha_digest_len = SHA384_DIGEST_LENGTH; + break; + case SHA512_MECH_INFO_TYPE: + sha_digest_len = SHA512_DIGEST_LENGTH; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + + /* + * We need to just return the length needed to store the output. + * We should not destroy the context for the following cases. + */ + if ((digest->cd_length == 0) || + (digest->cd_length < sha_digest_len)) { + digest->cd_length = sha_digest_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + /* + * Do the SHA2 update on the specified input data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, + data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret != CRYPTO_SUCCESS) { + /* the update failed, free context and bail */ + kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); + ctx->cc_provider_private = NULL; + digest->cd_length = 0; + return (ret); + } + + /* + * Do a SHA2 final, must be done separately since the digest + * type can be different than the input data type. + */ + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Final((unsigned char *)digest->cd_raw.iov_base + + digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, + digest, sha_digest_len, NULL); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* all done, free context and return */ + + if (ret == CRYPTO_SUCCESS) + digest->cd_length = sha_digest_len; + else + digest->cd_length = 0; + + kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); + ctx->cc_provider_private = NULL; + return (ret); +} + +/* ARGSUSED */ +static int +sha2_digest_update(crypto_ctx_t *ctx, crypto_data_t *data, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + + ASSERT(ctx->cc_provider_private != NULL); + + /* + * Do the SHA2 update on the specified input data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Update(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_update_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, + data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + return (ret); +} + +/* ARGSUSED */ +static int +sha2_digest_final(crypto_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uint_t sha_digest_len; + + ASSERT(ctx->cc_provider_private != NULL); + + switch (PROV_SHA2_CTX(ctx)->sc_mech_type) { + case SHA256_MECH_INFO_TYPE: + sha_digest_len = SHA256_DIGEST_LENGTH; + break; + case SHA384_MECH_INFO_TYPE: + sha_digest_len = SHA384_DIGEST_LENGTH; + break; + case SHA512_MECH_INFO_TYPE: + sha_digest_len = SHA512_DIGEST_LENGTH; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + + /* + * We need to just return the length needed to store the output. 
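+ * CRYPTO_BUFFER_TOO_SMALL is recoverable: cd_length is set to the
+ * required size so the caller can retry the final with a larger
+ * buffer.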
+ * We should not destroy the context for the following cases. + */ + if ((digest->cd_length == 0) || + (digest->cd_length < sha_digest_len)) { + digest->cd_length = sha_digest_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + /* + * Do a SHA2 final. + */ + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Final((unsigned char *)digest->cd_raw.iov_base + + digest->cd_offset, &PROV_SHA2_CTX(ctx)->sc_sha2_ctx); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_final_uio(&PROV_SHA2_CTX(ctx)->sc_sha2_ctx, + digest, sha_digest_len, NULL); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* all done, free context and return */ + + if (ret == CRYPTO_SUCCESS) + digest->cd_length = sha_digest_len; + else + digest->cd_length = 0; + + kmem_free(ctx->cc_provider_private, sizeof (sha2_ctx_t)); + ctx->cc_provider_private = NULL; + + return (ret); +} + +/* ARGSUSED */ +static int +sha2_digest_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_data_t *data, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + SHA2_CTX sha2_ctx; + uint32_t sha_digest_len; + + /* + * Do the SHA inits. + */ + + SHA2Init(mechanism->cm_type, &sha2_ctx); + + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Update(&sha2_ctx, (uint8_t *)data-> + cd_raw.iov_base + data->cd_offset, data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_update_uio(&sha2_ctx, data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + /* + * Do the SHA updates on the specified input data. + */ + + if (ret != CRYPTO_SUCCESS) { + /* the update failed, bail */ + digest->cd_length = 0; + return (ret); + } + + if (mechanism->cm_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) + sha_digest_len = SHA256_DIGEST_LENGTH; + else + sha_digest_len = SHA512_DIGEST_LENGTH; + + /* + * Do a SHA2 final, must be done separately since the digest + * type can be different than the input data type. + */ + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Final((unsigned char *)digest->cd_raw.iov_base + + digest->cd_offset, &sha2_ctx); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_final_uio(&sha2_ctx, digest, + sha_digest_len, NULL); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) + digest->cd_length = sha_digest_len; + else + digest->cd_length = 0; + + return (ret); +} + +/* + * KCF software provider mac entry points. + * + * SHA2 HMAC is: SHA2(key XOR opad, SHA2(key XOR ipad, text)) + * + * Init: + * The initialization routine initializes what we denote + * as the inner and outer contexts by doing + * - for inner context: SHA2(key XOR ipad) + * - for outer context: SHA2(key XOR opad) + * + * Update: + * Each subsequent SHA2 HMAC update will result in an + * update of the inner context with the specified data. + * + * Final: + * The SHA2 HMAC final will do a SHA2 final operation on the + * inner context, and the resulting digest will be used + * as the data for an update on the outer context. Last + * but not least, a SHA2 final on the outer context will + * be performed to obtain the SHA2 HMAC digest to return + * to the user. + */ + +/* + * Initialize a SHA2-HMAC context. 
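+ *
+ * To make the scheme above concrete, here is a minimal sketch of the
+ * whole construction in terms of a hypothetical one-shot hash H2()
+ * over two concatenated buffers, with made-up block size B and digest
+ * length DLEN (illustration only, not part of this provider):
+ */
+#if 0	/* illustrative sketch, never compiled */
+static void
+hmac_sketch(const uint8_t *key, size_t keylen,	/* keylen <= B assumed */
+    const uint8_t *msg, size_t msglen, uint8_t *out)
+{
+	uint8_t ipad[B], opad[B], inner[DLEN];
+	int i;
+
+	bzero(ipad, B);
+	bzero(opad, B);
+	bcopy(key, ipad, keylen);
+	bcopy(key, opad, keylen);
+	for (i = 0; i < B; i++) {
+		ipad[i] ^= 0x36;
+		opad[i] ^= 0x5c;
+	}
+	H2(ipad, B, msg, msglen, inner);	/* inner = H(K^ipad | text) */
+	H2(opad, B, inner, DLEN, out);		/* out = H(K^opad | inner) */
+}
+#endif
+/*
+ * The function below performs the same keyed-pad setup, XORing the
+ * pads one 64-bit word at a time.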
+ */ +static void +sha2_mac_init_ctx(sha2_hmac_ctx_t *ctx, void *keyval, uint_t length_in_bytes) +{ + uint64_t ipad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)]; + uint64_t opad[SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t)]; + int i, block_size, blocks_per_int64; + + /* Determine the block size */ + if (ctx->hc_mech_type <= SHA256_HMAC_GEN_MECH_INFO_TYPE) { + block_size = SHA256_HMAC_BLOCK_SIZE; + blocks_per_int64 = SHA256_HMAC_BLOCK_SIZE / sizeof (uint64_t); + } else { + block_size = SHA512_HMAC_BLOCK_SIZE; + blocks_per_int64 = SHA512_HMAC_BLOCK_SIZE / sizeof (uint64_t); + } + + (void) bzero(ipad, block_size); + (void) bzero(opad, block_size); + (void) bcopy(keyval, ipad, length_in_bytes); + (void) bcopy(keyval, opad, length_in_bytes); + + /* XOR key with ipad (0x36) and opad (0x5c) */ + for (i = 0; i < blocks_per_int64; i ++) { + ipad[i] ^= 0x3636363636363636; + opad[i] ^= 0x5c5c5c5c5c5c5c5c; + } + + /* perform SHA2 on ipad */ + SHA2Init(ctx->hc_mech_type, &ctx->hc_icontext); + SHA2Update(&ctx->hc_icontext, (uint8_t *)ipad, block_size); + + /* perform SHA2 on opad */ + SHA2Init(ctx->hc_mech_type, &ctx->hc_ocontext); + SHA2Update(&ctx->hc_ocontext, (uint8_t *)opad, block_size); + +} + +/* + */ +static int +sha2_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_spi_ctx_template_t ctx_template, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + uint_t sha_digest_len, sha_hmac_block_size; + + /* + * Set the digest length and block size to values appropriate to the + * mechanism + */ + switch (mechanism->cm_type) { + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = SHA256_DIGEST_LENGTH; + sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; + break; + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = SHA512_DIGEST_LENGTH; + sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + ctx->cc_provider_private = kmem_alloc(sizeof (sha2_hmac_ctx_t), + crypto_kmflag(req)); + if (ctx->cc_provider_private == NULL) + return (CRYPTO_HOST_MEMORY); + + PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type = mechanism->cm_type; + if (ctx_template != NULL) { + /* reuse context template */ + bcopy(ctx_template, PROV_SHA2_HMAC_CTX(ctx), + sizeof (sha2_hmac_ctx_t)); + } else { + /* no context template, compute context */ + if (keylen_in_bytes > sha_hmac_block_size) { + uchar_t digested_key[SHA512_DIGEST_LENGTH]; + sha2_hmac_ctx_t *hmac_ctx = ctx->cc_provider_private; + + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. + */ + PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, + &hmac_ctx->hc_icontext, + key->ck_data, keylen_in_bytes, digested_key); + sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx), + digested_key, sha_digest_len); + } else { + sha2_mac_init_ctx(PROV_SHA2_HMAC_CTX(ctx), + key->ck_data, keylen_in_bytes); + } + } + + /* + * Get the mechanism parameters, if applicable. 
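+ *
+ * Parameters apply only to the *_HMAC_GEN_* mechanisms, which carry
+ * the requested (possibly truncated) MAC length as a ulong_t in
+ * cm_param. A hypothetical caller-side setup, for illustration:
+ */
+#if 0	/* illustrative sketch, never compiled */
+static void
+example_gen_mech_setup(crypto_mechanism_t *mech, ulong_t *lenp)
+{
+	*lenp = 16;				/* want a 16-byte MAC */
+	mech->cm_type = SHA256_HMAC_GEN_MECH_INFO_TYPE;
+	mech->cm_param = (caddr_t)lenp;
+	mech->cm_param_len = sizeof (*lenp);
+}
+#endif
+/*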
+ */ + if (mechanism->cm_type % 3 == 2) { + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (ulong_t)) + ret = CRYPTO_MECHANISM_PARAM_INVALID; + PROV_SHA2_GET_DIGEST_LEN(mechanism, + PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len); + if (PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len > sha_digest_len) + ret = CRYPTO_MECHANISM_PARAM_INVALID; + } + + if (ret != CRYPTO_SUCCESS) { + bzero(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); + kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); + ctx->cc_provider_private = NULL; + } + + return (ret); +} + +/* ARGSUSED */ +static int +sha2_mac_update(crypto_ctx_t *ctx, crypto_data_t *data, + crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + + ASSERT(ctx->cc_provider_private != NULL); + + /* + * Do a SHA2 update of the inner context using the specified + * data. + */ + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_icontext, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_update_uio( + &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext, data); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + return (ret); +} + +/* ARGSUSED */ +static int +sha2_mac_final(crypto_ctx_t *ctx, crypto_data_t *mac, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uchar_t digest[SHA512_DIGEST_LENGTH]; + uint32_t digest_len, sha_digest_len; + + ASSERT(ctx->cc_provider_private != NULL); + + /* Set the digest lengths to values appropriate to the mechanism */ + switch (PROV_SHA2_HMAC_CTX(ctx)->hc_mech_type) { + case SHA256_HMAC_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; + break; + case SHA384_HMAC_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA384_DIGEST_LENGTH; + break; + case SHA512_HMAC_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; + break; + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = SHA256_DIGEST_LENGTH; + digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len; + break; + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = SHA512_DIGEST_LENGTH; + digest_len = PROV_SHA2_HMAC_CTX(ctx)->hc_digest_len; + break; + default: + return (CRYPTO_ARGUMENTS_BAD); + } + + /* + * We need to just return the length needed to store the output. + * We should not destroy the context for the following cases. + */ + if ((mac->cd_length == 0) || (mac->cd_length < digest_len)) { + mac->cd_length = digest_len; + return (CRYPTO_BUFFER_TOO_SMALL); + } + + /* + * Do a SHA2 final on the inner context. + */ + SHA2Final(digest, &PROV_SHA2_HMAC_CTX(ctx)->hc_icontext); + + /* + * Do a SHA2 update on the outer context, feeding the inner + * digest as data. + */ + SHA2Update(&PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, digest, + sha_digest_len); + + /* + * Do a SHA2 final on the outer context, storing the computing + * digest in the users buffer. + */ + switch (mac->cd_format) { + case CRYPTO_DATA_RAW: + if (digest_len != sha_digest_len) { + /* + * The caller requested a short digest. Digest + * into a scratch buffer and return to + * the user only what was requested. 
+ */ + SHA2Final(digest, + &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext); + bcopy(digest, (unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, digest_len); + } else { + SHA2Final((unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, + &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext); + } + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_final_uio( + &PROV_SHA2_HMAC_CTX(ctx)->hc_ocontext, mac, + digest_len, digest); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) + mac->cd_length = digest_len; + else + mac->cd_length = 0; + + bzero(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); + kmem_free(ctx->cc_provider_private, sizeof (sha2_hmac_ctx_t)); + ctx->cc_provider_private = NULL; + + return (ret); +} + +#define SHA2_MAC_UPDATE(data, ctx, ret) { \ + switch (data->cd_format) { \ + case CRYPTO_DATA_RAW: \ + SHA2Update(&(ctx).hc_icontext, \ + (uint8_t *)data->cd_raw.iov_base + \ + data->cd_offset, data->cd_length); \ + break; \ + case CRYPTO_DATA_UIO: \ + ret = sha2_digest_update_uio(&(ctx).hc_icontext, data); \ + break; \ + default: \ + ret = CRYPTO_ARGUMENTS_BAD; \ + } \ +} + +/* ARGSUSED */ +static int +sha2_mac_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uchar_t digest[SHA512_DIGEST_LENGTH]; + sha2_hmac_ctx_t sha2_hmac_ctx; + uint32_t sha_digest_len, digest_len, sha_hmac_block_size; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + + /* + * Set the digest length and block size to values appropriate to the + * mechanism + */ + switch (mechanism->cm_type) { + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; + sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; + break; + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; + sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + + /* Add support for key by attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + if (ctx_template != NULL) { + /* reuse context template */ + bcopy(ctx_template, &sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t)); + } else { + sha2_hmac_ctx.hc_mech_type = mechanism->cm_type; + /* no context template, initialize context */ + if (keylen_in_bytes > sha_hmac_block_size) { + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. 
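+ *
+ * This is the standard HMAC rule for over-long keys: a key longer
+ * than the hash block size is first replaced by its own digest.
+ * Reduced to a sketch with hypothetical names:
+ */
+#if 0	/* illustrative sketch, never compiled */
+	if (keylen > block_size) {
+		hash(key, keylen, digested_key);	/* K' = H(K) */
+		key = digested_key;
+		keylen = digest_len;	/* digest_len <= block_size */
+	}
+#endif
+/*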
+ */ + PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, + &sha2_hmac_ctx.hc_icontext, + key->ck_data, keylen_in_bytes, digest); + sha2_mac_init_ctx(&sha2_hmac_ctx, digest, + sha_digest_len); + } else { + sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data, + keylen_in_bytes); + } + } + + /* get the mechanism parameters, if applicable */ + if ((mechanism->cm_type % 3) == 2) { + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (ulong_t)) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len); + if (digest_len > sha_digest_len) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + } + + /* do a SHA2 update of the inner context using the specified data */ + SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret); + if (ret != CRYPTO_SUCCESS) + /* the update failed, free context and bail */ + goto bail; + + /* + * Do a SHA2 final on the inner context. + */ + SHA2Final(digest, &sha2_hmac_ctx.hc_icontext); + + /* + * Do an SHA2 update on the outer context, feeding the inner + * digest as data. + * + * HMAC-SHA384 needs special handling as the outer hash needs only 48 + * bytes of the inner hash value. + */ + if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE || + mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE) + SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, + SHA384_DIGEST_LENGTH); + else + SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); + + /* + * Do a SHA2 final on the outer context, storing the computed + * digest in the users buffer. + */ + switch (mac->cd_format) { + case CRYPTO_DATA_RAW: + if (digest_len != sha_digest_len) { + /* + * The caller requested a short digest. Digest + * into a scratch buffer and return to + * the user only what was requested. + */ + SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext); + bcopy(digest, (unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, digest_len); + } else { + SHA2Final((unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, &sha2_hmac_ctx.hc_ocontext); + } + break; + case CRYPTO_DATA_UIO: + ret = sha2_digest_final_uio(&sha2_hmac_ctx.hc_ocontext, mac, + digest_len, digest); + break; + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + if (ret == CRYPTO_SUCCESS) { + mac->cd_length = digest_len; + return (CRYPTO_SUCCESS); + } +bail: + bzero(&sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t)); + mac->cd_length = 0; + return (ret); +} + +/* ARGSUSED */ +static int +sha2_mac_verify_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) +{ + int ret = CRYPTO_SUCCESS; + uchar_t digest[SHA512_DIGEST_LENGTH]; + sha2_hmac_ctx_t sha2_hmac_ctx; + uint32_t sha_digest_len, digest_len, sha_hmac_block_size; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + + /* + * Set the digest length and block size to values appropriate to the + * mechanism + */ + switch (mechanism->cm_type) { + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA256_DIGEST_LENGTH; + sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; + break; + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = digest_len = SHA512_DIGEST_LENGTH; + sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + + /* Add support for key by 
attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + if (ctx_template != NULL) { + /* reuse context template */ + bcopy(ctx_template, &sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t)); + } else { + sha2_hmac_ctx.hc_mech_type = mechanism->cm_type; + /* no context template, initialize context */ + if (keylen_in_bytes > sha_hmac_block_size) { + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. + */ + PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, + &sha2_hmac_ctx.hc_icontext, + key->ck_data, keylen_in_bytes, digest); + sha2_mac_init_ctx(&sha2_hmac_ctx, digest, + sha_digest_len); + } else { + sha2_mac_init_ctx(&sha2_hmac_ctx, key->ck_data, + keylen_in_bytes); + } + } + + /* get the mechanism parameters, if applicable */ + if (mechanism->cm_type % 3 == 2) { + if (mechanism->cm_param == NULL || + mechanism->cm_param_len != sizeof (ulong_t)) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + PROV_SHA2_GET_DIGEST_LEN(mechanism, digest_len); + if (digest_len > sha_digest_len) { + ret = CRYPTO_MECHANISM_PARAM_INVALID; + goto bail; + } + } + + if (mac->cd_length != digest_len) { + ret = CRYPTO_INVALID_MAC; + goto bail; + } + + /* do a SHA2 update of the inner context using the specified data */ + SHA2_MAC_UPDATE(data, sha2_hmac_ctx, ret); + if (ret != CRYPTO_SUCCESS) + /* the update failed, free context and bail */ + goto bail; + + /* do a SHA2 final on the inner context */ + SHA2Final(digest, &sha2_hmac_ctx.hc_icontext); + + /* + * Do an SHA2 update on the outer context, feeding the inner + * digest as data. + * + * HMAC-SHA384 needs special handling as the outer hash needs only 48 + * bytes of the inner hash value. + */ + if (mechanism->cm_type == SHA384_HMAC_MECH_INFO_TYPE || + mechanism->cm_type == SHA384_HMAC_GEN_MECH_INFO_TYPE) + SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, + SHA384_DIGEST_LENGTH); + else + SHA2Update(&sha2_hmac_ctx.hc_ocontext, digest, sha_digest_len); + + /* + * Do a SHA2 final on the outer context, storing the computed + * digest in the users buffer. + */ + SHA2Final(digest, &sha2_hmac_ctx.hc_ocontext); + + /* + * Compare the computed digest against the expected digest passed + * as argument. + */ + + switch (mac->cd_format) { + + case CRYPTO_DATA_RAW: + if (bcmp(digest, (unsigned char *)mac->cd_raw.iov_base + + mac->cd_offset, digest_len) != 0) + ret = CRYPTO_INVALID_MAC; + break; + + case CRYPTO_DATA_UIO: { + off_t offset = mac->cd_offset; + uint_t vec_idx = 0; + off_t scratch_offset = 0; + size_t length = digest_len; + size_t cur_len; + + /* we support only kernel buffer */ + if (uio_segflg(mac->cd_uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* jump to the first iovec containing the expected digest */ + offset = uio_index_at_offset(mac->cd_uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(mac->cd_uio)) { + /* + * The caller specified an offset that is + * larger than the total size of the buffers + * it provided. 
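+ *
+ * Otherwise, the loop below compares the expected MAC one iovec at a
+ * time. The same scatter/gather walk, reduced to plain iovecs with
+ * hypothetical names, looks like this:
+ */
+#if 0	/* illustrative sketch, never compiled */
+	while (vec_idx < iovcnt && length > 0) {
+		size_t cur = MIN(iov[vec_idx].iov_len - offset, length);
+
+		consume((uint8_t *)iov[vec_idx].iov_base + offset, cur);
+		length -= cur;
+		vec_idx++;
+		offset = 0;	/* only the first iovec starts mid-buffer */
+	}
+#endif
+/*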
+ */ + ret = CRYPTO_DATA_LEN_RANGE; + break; + } + + /* do the comparison of computed digest vs specified one */ + while (vec_idx < uio_iovcnt(mac->cd_uio) && length > 0) { + cur_len = MIN(uio_iovlen(mac->cd_uio, vec_idx) - + offset, length); + + if (bcmp(digest + scratch_offset, + uio_iovbase(mac->cd_uio, vec_idx) + offset, + cur_len) != 0) { + ret = CRYPTO_INVALID_MAC; + break; + } + + length -= cur_len; + vec_idx++; + scratch_offset += cur_len; + offset = 0; + } + break; + } + + default: + ret = CRYPTO_ARGUMENTS_BAD; + } + + return (ret); +bail: + bzero(&sha2_hmac_ctx, sizeof (sha2_hmac_ctx_t)); + mac->cd_length = 0; + return (ret); +} + +/* + * KCF software provider context management entry points. + */ + +/* ARGSUSED */ +static int +sha2_create_ctx_template(crypto_provider_handle_t provider, + crypto_mechanism_t *mechanism, crypto_key_t *key, + crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size, + crypto_req_handle_t req) +{ + sha2_hmac_ctx_t *sha2_hmac_ctx_tmpl; + uint_t keylen_in_bytes = CRYPTO_BITS2BYTES(key->ck_length); + uint32_t sha_digest_len, sha_hmac_block_size; + + /* + * Set the digest length and block size to values appropriate to the + * mechanism + */ + switch (mechanism->cm_type) { + case SHA256_HMAC_MECH_INFO_TYPE: + case SHA256_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = SHA256_DIGEST_LENGTH; + sha_hmac_block_size = SHA256_HMAC_BLOCK_SIZE; + break; + case SHA384_HMAC_MECH_INFO_TYPE: + case SHA384_HMAC_GEN_MECH_INFO_TYPE: + case SHA512_HMAC_MECH_INFO_TYPE: + case SHA512_HMAC_GEN_MECH_INFO_TYPE: + sha_digest_len = SHA512_DIGEST_LENGTH; + sha_hmac_block_size = SHA512_HMAC_BLOCK_SIZE; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + + /* Add support for key by attributes (RFE 4706552) */ + if (key->ck_format != CRYPTO_KEY_RAW) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Allocate and initialize SHA2 context. + */ + sha2_hmac_ctx_tmpl = kmem_alloc(sizeof (sha2_hmac_ctx_t), + crypto_kmflag(req)); + if (sha2_hmac_ctx_tmpl == NULL) + return (CRYPTO_HOST_MEMORY); + + sha2_hmac_ctx_tmpl->hc_mech_type = mechanism->cm_type; + + if (keylen_in_bytes > sha_hmac_block_size) { + uchar_t digested_key[SHA512_DIGEST_LENGTH]; + + /* + * Hash the passed-in key to get a smaller key. + * The inner context is used since it hasn't been + * initialized yet. + */ + PROV_SHA2_DIGEST_KEY(mechanism->cm_type / 3, + &sha2_hmac_ctx_tmpl->hc_icontext, + key->ck_data, keylen_in_bytes, digested_key); + sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, digested_key, + sha_digest_len); + } else { + sha2_mac_init_ctx(sha2_hmac_ctx_tmpl, key->ck_data, + keylen_in_bytes); + } + + *ctx_template = (crypto_spi_ctx_template_t)sha2_hmac_ctx_tmpl; + *ctx_template_size = sizeof (sha2_hmac_ctx_t); + + return (CRYPTO_SUCCESS); +} + +static int +sha2_free_context(crypto_ctx_t *ctx) +{ + uint_t ctx_len; + + if (ctx->cc_provider_private == NULL) + return (CRYPTO_SUCCESS); + + /* + * We have to free either SHA2 or SHA2-HMAC contexts, which + * have different lengths. + * + * Note: Below is dependent on the mechanism ordering. 
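+ *
+ * The ordering assumption: mechanism types come in triples per hash
+ * family, so (type % 3) distinguishes plain digest (0), HMAC (1) and
+ * HMAC-general (2), while (type / 3) selects the family. Roughly,
+ * with values shown for illustration only:
+ */
+#if 0	/* illustrative layout, never compiled */
+enum {
+	SHA256_MECH_INFO_TYPE,		/* 0: % 3 == 0 -> plain digest */
+	SHA256_HMAC_MECH_INFO_TYPE,	/* 1: % 3 == 1 -> HMAC */
+	SHA256_HMAC_GEN_MECH_INFO_TYPE,	/* 2: % 3 == 2 -> HMAC general */
+	SHA384_MECH_INFO_TYPE,		/* 3: / 3 == 1 -> SHA384 family */
+	/* ... */
+};
+#endif
+/*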
+ */ + + if (PROV_SHA2_CTX(ctx)->sc_mech_type % 3 == 0) + ctx_len = sizeof (sha2_ctx_t); + else + ctx_len = sizeof (sha2_hmac_ctx_t); + + bzero(ctx->cc_provider_private, ctx_len); + kmem_free(ctx->cc_provider_private, ctx_len); + ctx->cc_provider_private = NULL; + + return (CRYPTO_SUCCESS); +} diff --git a/module/icp/io/skein_mod.c b/module/icp/io/skein_mod.c new file mode 100644 index 000000000000..18026807fd84 --- /dev/null +++ b/module/icp/io/skein_mod.c @@ -0,0 +1,729 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +#include +#include +#include +#include +#include +#define SKEIN_MODULE_IMPL +#include + +/* + * Like the sha2 module, we create the skein module with two modlinkages: + * - modlmisc to allow direct calls to Skein_* API functions. + * - modlcrypto to integrate well into the Kernel Crypto Framework (KCF). + */ +static struct modlmisc modlmisc = { + &mod_cryptoops, + "Skein Message-Digest Algorithm" +}; + +static struct modlcrypto modlcrypto = { + &mod_cryptoops, + "Skein Kernel SW Provider" +}; + +static struct modlinkage modlinkage = { + MODREV_1, {&modlmisc, &modlcrypto, NULL} +}; + +static crypto_mech_info_t skein_mech_info_tab[] = { + {CKM_SKEIN_256, SKEIN_256_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + {CKM_SKEIN_256_MAC, SKEIN_256_MAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + {CKM_SKEIN_512, SKEIN_512_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + {CKM_SKEIN_512_MAC, SKEIN_512_MAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, + CRYPTO_KEYSIZE_UNIT_IN_BYTES}, + {CKM_SKEIN1024, SKEIN1024_MECH_INFO_TYPE, + CRYPTO_FG_DIGEST | CRYPTO_FG_DIGEST_ATOMIC, + 0, 0, CRYPTO_KEYSIZE_UNIT_IN_BITS}, + {CKM_SKEIN1024_MAC, SKEIN1024_MAC_MECH_INFO_TYPE, + CRYPTO_FG_MAC | CRYPTO_FG_MAC_ATOMIC, 1, INT_MAX, + CRYPTO_KEYSIZE_UNIT_IN_BYTES} +}; + +static void skein_provider_status(crypto_provider_handle_t, uint_t *); + +static crypto_control_ops_t skein_control_ops = { + skein_provider_status +}; + +static int skein_digest_init(crypto_ctx_t *, crypto_mechanism_t *, + crypto_req_handle_t); +static int skein_digest(crypto_ctx_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); +static int skein_update(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); +static int skein_final(crypto_ctx_t *, crypto_data_t *, crypto_req_handle_t); +static int skein_digest_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_data_t *, crypto_data_t *, + crypto_req_handle_t); + +static crypto_digest_ops_t skein_digest_ops = { + 
.digest_init = skein_digest_init, + .digest = skein_digest, + .digest_update = skein_update, + .digest_key = NULL, + .digest_final = skein_final, + .digest_atomic = skein_digest_atomic +}; + +static int skein_mac_init(crypto_ctx_t *, crypto_mechanism_t *, crypto_key_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); +static int skein_mac_atomic(crypto_provider_handle_t, crypto_session_id_t, + crypto_mechanism_t *, crypto_key_t *, crypto_data_t *, crypto_data_t *, + crypto_spi_ctx_template_t, crypto_req_handle_t); + +static crypto_mac_ops_t skein_mac_ops = { + .mac_init = skein_mac_init, + .mac = NULL, + .mac_update = skein_update, /* using regular digest update is OK here */ + .mac_final = skein_final, /* using regular digest final is OK here */ + .mac_atomic = skein_mac_atomic, + .mac_verify_atomic = NULL +}; + +static int skein_create_ctx_template(crypto_provider_handle_t, + crypto_mechanism_t *, crypto_key_t *, crypto_spi_ctx_template_t *, + size_t *, crypto_req_handle_t); +static int skein_free_context(crypto_ctx_t *); + +static crypto_ctx_ops_t skein_ctx_ops = { + .create_ctx_template = skein_create_ctx_template, + .free_context = skein_free_context +}; + +static crypto_ops_t skein_crypto_ops = {{{{{ + &skein_control_ops, + &skein_digest_ops, + NULL, + &skein_mac_ops, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + &skein_ctx_ops, +}}}}}; + +static crypto_provider_info_t skein_prov_info = {{{{ + CRYPTO_SPI_VERSION_1, + "Skein Software Provider", + CRYPTO_SW_PROVIDER, + NULL, + &skein_crypto_ops, + sizeof (skein_mech_info_tab) / sizeof (crypto_mech_info_t), + skein_mech_info_tab +}}}}; + +static crypto_kcf_provider_handle_t skein_prov_handle = 0; + +typedef struct skein_ctx { + skein_mech_type_t sc_mech_type; + size_t sc_digest_bitlen; + /*LINTED(E_ANONYMOUS_UNION_DECL)*/ + union { + Skein_256_Ctxt_t sc_256; + Skein_512_Ctxt_t sc_512; + Skein1024_Ctxt_t sc_1024; + }; +} skein_ctx_t; +#define SKEIN_CTX(_ctx_) ((skein_ctx_t *)((_ctx_)->cc_provider_private)) +#define SKEIN_CTX_LVALUE(_ctx_) (_ctx_)->cc_provider_private +#define SKEIN_OP(_skein_ctx, _op, ...) 
\ + do { \ + skein_ctx_t *sc = (_skein_ctx); \ + switch (sc->sc_mech_type) { \ + case SKEIN_256_MECH_INFO_TYPE: \ + case SKEIN_256_MAC_MECH_INFO_TYPE: \ + (void) Skein_256_ ## _op(&sc->sc_256, __VA_ARGS__);\ + break; \ + case SKEIN_512_MECH_INFO_TYPE: \ + case SKEIN_512_MAC_MECH_INFO_TYPE: \ + (void) Skein_512_ ## _op(&sc->sc_512, __VA_ARGS__);\ + break; \ + case SKEIN1024_MECH_INFO_TYPE: \ + case SKEIN1024_MAC_MECH_INFO_TYPE: \ + (void) Skein1024_ ## _op(&sc->sc_1024, __VA_ARGS__);\ + break; \ + } \ + _NOTE(CONSTCOND) \ + } while (0) + +static int +skein_get_digest_bitlen(const crypto_mechanism_t *mechanism, size_t *result) +{ + if (mechanism->cm_param != NULL) { + /*LINTED(E_BAD_PTR_CAST_ALIGN)*/ + skein_param_t *param = (skein_param_t *)mechanism->cm_param; + + if (mechanism->cm_param_len != sizeof (*param) || + param->sp_digest_bitlen == 0) { + return (CRYPTO_MECHANISM_PARAM_INVALID); + } + *result = param->sp_digest_bitlen; + } else { + switch (mechanism->cm_type) { + case SKEIN_256_MECH_INFO_TYPE: + *result = 256; + break; + case SKEIN_512_MECH_INFO_TYPE: + *result = 512; + break; + case SKEIN1024_MECH_INFO_TYPE: + *result = 1024; + break; + default: + return (CRYPTO_MECHANISM_INVALID); + } + } + return (CRYPTO_SUCCESS); +} + +int +skein_mod_init(void) +{ + int error; + + if ((error = mod_install(&modlinkage)) != 0) + return (error); + + /* + * Try to register with KCF - failure shouldn't unload us, since we + * still may want to continue providing misc/skein functionality. + */ + (void) crypto_register_provider(&skein_prov_info, &skein_prov_handle); + + return (0); +} + +int +skein_mod_fini(void) +{ + int ret; + + if (skein_prov_handle != 0) { + if ((ret = crypto_unregister_provider(skein_prov_handle)) != + CRYPTO_SUCCESS) { + cmn_err(CE_WARN, + "skein _fini: crypto_unregister_provider() " + "failed (0x%x)", ret); + return (EBUSY); + } + skein_prov_handle = 0; + } + + return (mod_remove(&modlinkage)); +} + +/* + * KCF software provider control entry points. + */ +/* ARGSUSED */ +static void +skein_provider_status(crypto_provider_handle_t provider, uint_t *status) +{ + *status = CRYPTO_PROVIDER_READY; +} + +/* + * General Skein hashing helper functions. + */ + +/* + * Performs an Update on a context with uio input data. + */ +static int +skein_digest_update_uio(skein_ctx_t *ctx, const crypto_data_t *data) +{ + off_t offset = data->cd_offset; + size_t length = data->cd_length; + uint_t vec_idx = 0; + size_t cur_len; + uio_t *uio = data->cd_uio; + + /* we support only kernel buffer */ + if (uio_segflg(uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing data to be + * digested. + */ + offset = uio_index_at_offset(uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(uio)) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + + /* + * Now do the digesting on the iovecs. + */ + while (vec_idx < uio_iovcnt(uio) && length > 0) { + cur_len = MIN(uio_iovlen(uio, vec_idx) - offset, length); + SKEIN_OP(ctx, Update, (uint8_t *)uio_iovbase(uio, vec_idx) + + offset, cur_len); + length -= cur_len; + vec_idx++; + offset = 0; + } + + if (vec_idx == uio_iovcnt(uio) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it provided. 
+ */ + return (CRYPTO_DATA_LEN_RANGE); + } + + return (CRYPTO_SUCCESS); +} + +/* + * Performs a Final on a context and writes to a uio digest output. + */ +static int +skein_digest_final_uio(skein_ctx_t *ctx, crypto_data_t *digest, + crypto_req_handle_t req) +{ + off_t offset = digest->cd_offset; + uint_t vec_idx = 0; + uio_t *uio = digest->cd_uio; + + /* we support only kernel buffer */ + if (uio_segflg(uio) != UIO_SYSSPACE) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Jump to the first iovec containing ptr to the digest to be returned. + */ + offset = uio_index_at_offset(uio, offset, &vec_idx); + if (vec_idx == uio_iovcnt(uio)) { + /* + * The caller specified an offset that is larger than the + * total size of the buffers it provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + if (offset + CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen) <= + uio_iovlen(uio, vec_idx)) { + /* The computed digest will fit in the current iovec. */ + SKEIN_OP(ctx, Final, + (uchar_t *)uio_iovbase(uio, vec_idx) + offset); + } else { + uint8_t *digest_tmp; + off_t scratch_offset = 0; + size_t length = CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen); + size_t cur_len; + + digest_tmp = kmem_alloc(CRYPTO_BITS2BYTES( + ctx->sc_digest_bitlen), crypto_kmflag(req)); + if (digest_tmp == NULL) + return (CRYPTO_HOST_MEMORY); + SKEIN_OP(ctx, Final, digest_tmp); + while (vec_idx < uio_iovcnt(uio) && length > 0) { + cur_len = MIN(uio_iovlen(uio, vec_idx) - offset, + length); + bcopy(digest_tmp + scratch_offset, + uio_iovbase(uio, vec_idx) + offset, cur_len); + + length -= cur_len; + vec_idx++; + scratch_offset += cur_len; + offset = 0; + } + kmem_free(digest_tmp, CRYPTO_BITS2BYTES(ctx->sc_digest_bitlen)); + + if (vec_idx == uio_iovcnt(uio) && length > 0) { + /* + * The end of the specified iovec's was reached but + * the length requested could not be processed, i.e. + * The caller requested to digest more data than it + * provided. + */ + return (CRYPTO_DATA_LEN_RANGE); + } + } + + return (CRYPTO_SUCCESS); +} + +/* + * KCF software provider digest entry points. + */ + +/* + * Initializes a skein digest context to the configuration in `mechanism'. + * The mechanism cm_type must be one of SKEIN_*_MECH_INFO_TYPE. The cm_param + * field may contain a skein_param_t structure indicating the length of the + * digest the algorithm should produce. Otherwise the default output lengths + * are applied (32 bytes for Skein-256, 64 bytes for Skein-512 and 128 bytes + * for Skein-1024). + */ +static int +skein_digest_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism, + crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type)) + return (CRYPTO_MECHANISM_INVALID); + + SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)), + crypto_kmflag(req)); + if (SKEIN_CTX(ctx) == NULL) + return (CRYPTO_HOST_MEMORY); + + SKEIN_CTX(ctx)->sc_mech_type = mechanism->cm_type; + error = skein_get_digest_bitlen(mechanism, + &SKEIN_CTX(ctx)->sc_digest_bitlen); + if (error != CRYPTO_SUCCESS) + goto errout; + SKEIN_OP(SKEIN_CTX(ctx), Init, SKEIN_CTX(ctx)->sc_digest_bitlen); + + return (CRYPTO_SUCCESS); +errout: + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + SKEIN_CTX_LVALUE(ctx) = NULL; + return (error); +} + +/* + * Executes a skein_update and skein_digest on a pre-initialized crypto + * context in a single step. See the documentation to these functions to + * see what to pass here. 
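+ *
+ * For example, a caller that wants a non-default output length would
+ * set up the mechanism roughly as follows (hypothetical sketch):
+ */
+#if 0	/* illustrative sketch, never compiled */
+static void
+example_skein_mech_setup(crypto_mechanism_t *mech, skein_param_t *param)
+{
+	param->sp_digest_bitlen = 160;		/* 160-bit Skein-512 output */
+	mech->cm_type = SKEIN_512_MECH_INFO_TYPE;
+	mech->cm_param = (caddr_t)param;
+	mech->cm_param_len = sizeof (*param);
+}
+#endif
+/*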
+ */ +static int +skein_digest(crypto_ctx_t *ctx, crypto_data_t *data, crypto_data_t *digest, + crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + ASSERT(SKEIN_CTX(ctx) != NULL); + + if (digest->cd_length < + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { + digest->cd_length = + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); + return (CRYPTO_BUFFER_TOO_SMALL); + } + + error = skein_update(ctx, data, req); + if (error != CRYPTO_SUCCESS) { + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + SKEIN_CTX_LVALUE(ctx) = NULL; + digest->cd_length = 0; + return (error); + } + error = skein_final(ctx, digest, req); + + return (error); +} + +/* + * Performs a skein Update with the input message in `data' (successive calls + * can push more data). This is used both for digest and MAC operation. + * Supported input data formats are raw, uio and mblk. + */ +/*ARGSUSED*/ +static int +skein_update(crypto_ctx_t *ctx, crypto_data_t *data, crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + ASSERT(SKEIN_CTX(ctx) != NULL); + + switch (data->cd_format) { + case CRYPTO_DATA_RAW: + SKEIN_OP(SKEIN_CTX(ctx), Update, + (uint8_t *)data->cd_raw.iov_base + data->cd_offset, + data->cd_length); + break; + case CRYPTO_DATA_UIO: + error = skein_digest_update_uio(SKEIN_CTX(ctx), data); + break; + default: + error = CRYPTO_ARGUMENTS_BAD; + } + + return (error); +} + +/* + * Performs a skein Final, writing the output to `digest'. This is used both + * for digest and MAC operation. + * Supported output digest formats are raw, uio and mblk. + */ +/*ARGSUSED*/ +static int +skein_final(crypto_ctx_t *ctx, crypto_data_t *digest, crypto_req_handle_t req) +{ + int error = CRYPTO_SUCCESS; + + ASSERT(SKEIN_CTX(ctx) != NULL); + + if (digest->cd_length < + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen)) { + digest->cd_length = + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); + return (CRYPTO_BUFFER_TOO_SMALL); + } + + switch (digest->cd_format) { + case CRYPTO_DATA_RAW: + SKEIN_OP(SKEIN_CTX(ctx), Final, + (uint8_t *)digest->cd_raw.iov_base + digest->cd_offset); + break; + case CRYPTO_DATA_UIO: + error = skein_digest_final_uio(SKEIN_CTX(ctx), digest, req); + break; + default: + error = CRYPTO_ARGUMENTS_BAD; + } + + if (error == CRYPTO_SUCCESS) + digest->cd_length = + CRYPTO_BITS2BYTES(SKEIN_CTX(ctx)->sc_digest_bitlen); + else + digest->cd_length = 0; + + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*(SKEIN_CTX(ctx)))); + SKEIN_CTX_LVALUE(ctx) = NULL; + + return (error); +} + +/* + * Performs a full skein digest computation in a single call, configuring the + * algorithm according to `mechanism', reading the input to be digested from + * `data' and writing the output to `digest'. + * Supported input/output formats are raw, uio and mblk. 
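+ *
+ * For reference, a raw (CRYPTO_DATA_RAW) buffer is described to these
+ * entry points like this (hypothetical caller-side sketch):
+ */
+#if 0	/* illustrative sketch, never compiled */
+static void
+example_raw_data(crypto_data_t *cd, void *buf, size_t len)
+{
+	cd->cd_format = CRYPTO_DATA_RAW;
+	cd->cd_offset = 0;
+	cd->cd_length = len;
+	cd->cd_raw.iov_base = (caddr_t)buf;
+	cd->cd_raw.iov_len = len;
+}
+#endif
+/*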
+ */
+/*ARGSUSED*/
+static int
+skein_digest_atomic(crypto_provider_handle_t provider,
+    crypto_session_id_t session_id, crypto_mechanism_t *mechanism,
+    crypto_data_t *data, crypto_data_t *digest, crypto_req_handle_t req)
+{
+	int error;
+	skein_ctx_t skein_ctx;
+	crypto_ctx_t ctx;
+	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;
+
+	/* Init */
+	if (!VALID_SKEIN_DIGEST_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+	skein_ctx.sc_mech_type = mechanism->cm_type;
+	error = skein_get_digest_bitlen(mechanism, &skein_ctx.sc_digest_bitlen);
+	if (error != CRYPTO_SUCCESS)
+		goto out;
+	SKEIN_OP(&skein_ctx, Init, skein_ctx.sc_digest_bitlen);
+
+	/* digest the data, then write the result to the caller's buffer */
+	if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS)
+		goto out;
+	if ((error = skein_final(&ctx, digest, req)) != CRYPTO_SUCCESS)
+		goto out;
+
+out:
+	if (error == CRYPTO_SUCCESS)
+		digest->cd_length =
+		    CRYPTO_BITS2BYTES(skein_ctx.sc_digest_bitlen);
+	else
+		digest->cd_length = 0;
+	bzero(&skein_ctx, sizeof (skein_ctx));
+
+	return (error);
+}
+
+/*
+ * Helper function that builds a Skein MAC context from the provided
+ * mechanism and key.
+ */
+static int
+skein_mac_ctx_build(skein_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_key_t *key)
+{
+	int error;
+
+	if (!VALID_SKEIN_MAC_MECH(mechanism->cm_type))
+		return (CRYPTO_MECHANISM_INVALID);
+	if (key->ck_format != CRYPTO_KEY_RAW)
+		return (CRYPTO_ARGUMENTS_BAD);
+	ctx->sc_mech_type = mechanism->cm_type;
+	error = skein_get_digest_bitlen(mechanism, &ctx->sc_digest_bitlen);
+	if (error != CRYPTO_SUCCESS)
+		return (error);
+	SKEIN_OP(ctx, InitExt, ctx->sc_digest_bitlen, 0, key->ck_data,
+	    CRYPTO_BITS2BYTES(key->ck_length));
+
+	return (CRYPTO_SUCCESS);
+}
+
+/*
+ * KCF software provider mac entry points.
+ */
+/*
+ * Initializes a skein MAC context. You may pass a ctx_template, in which
+ * case the template will be reused to make initialization more efficient.
+ * Otherwise a new context will be constructed. The mechanism cm_type must
+ * be one of SKEIN_*_MAC_MECH_INFO_TYPE. Same as in skein_digest_init, you
+ * may pass a skein_param_t in cm_param to configure the length of the
+ * digest. The key must be in raw format.
+ */
+static int
+skein_mac_init(crypto_ctx_t *ctx, crypto_mechanism_t *mechanism,
+    crypto_key_t *key, crypto_spi_ctx_template_t ctx_template,
+    crypto_req_handle_t req)
+{
+	int error;
+
+	SKEIN_CTX_LVALUE(ctx) = kmem_alloc(sizeof (*SKEIN_CTX(ctx)),
+	    crypto_kmflag(req));
+	if (SKEIN_CTX(ctx) == NULL)
+		return (CRYPTO_HOST_MEMORY);
+
+	if (ctx_template != NULL) {
+		bcopy(ctx_template, SKEIN_CTX(ctx),
+		    sizeof (*SKEIN_CTX(ctx)));
+	} else {
+		error = skein_mac_ctx_build(SKEIN_CTX(ctx), mechanism, key);
+		if (error != CRYPTO_SUCCESS)
+			goto errout;
+	}
+
+	return (CRYPTO_SUCCESS);
+errout:
+	bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx)));
+	return (error);
+}
+
+/*
+ * The MAC update and final calls are reused from the regular digest code.
+ */
+
+/*ARGSUSED*/
+/*
+ * Same as skein_digest_atomic, performs an atomic Skein MAC operation in
+ * one step. All the same properties apply to the arguments of this
+ * function as to those of the partial operations above.
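+ *
+ * Note the trick both atomic entry points share: a stack-allocated
+ * crypto_ctx_t is pointed at a stack-allocated skein_ctx_t, so the
+ * multi-part skein_update()/skein_final() code can be reused
+ * unchanged:
+ */
+#if 0	/* illustrative sketch, never compiled */
+	crypto_ctx_t ctx;
+	skein_ctx_t skein_ctx;
+
+	SKEIN_CTX_LVALUE(&ctx) = &skein_ctx;	/* ctx.cc_provider_private */
+	/* ... skein_update(&ctx, ...) and skein_final(&ctx, ...) now work */
+#endif
+/*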
+ */ +static int +skein_mac_atomic(crypto_provider_handle_t provider, + crypto_session_id_t session_id, crypto_mechanism_t *mechanism, + crypto_key_t *key, crypto_data_t *data, crypto_data_t *mac, + crypto_spi_ctx_template_t ctx_template, crypto_req_handle_t req) +{ + /* faux crypto context just for skein_digest_{update,final} */ + int error; + crypto_ctx_t ctx; + skein_ctx_t skein_ctx; + SKEIN_CTX_LVALUE(&ctx) = &skein_ctx; + + if (ctx_template != NULL) { + bcopy(ctx_template, &skein_ctx, sizeof (skein_ctx)); + } else { + error = skein_mac_ctx_build(&skein_ctx, mechanism, key); + if (error != CRYPTO_SUCCESS) + goto errout; + } + + if ((error = skein_update(&ctx, data, req)) != CRYPTO_SUCCESS) + goto errout; + if ((error = skein_final(&ctx, mac, req)) != CRYPTO_SUCCESS) + goto errout; + + return (CRYPTO_SUCCESS); +errout: + bzero(&skein_ctx, sizeof (skein_ctx)); + return (error); +} + +/* + * KCF software provider context management entry points. + */ + +/* + * Constructs a context template for the Skein MAC algorithm. The same + * properties apply to the arguments of this function as to those of + * skein_mac_init. + */ +/*ARGSUSED*/ +static int +skein_create_ctx_template(crypto_provider_handle_t provider, + crypto_mechanism_t *mechanism, crypto_key_t *key, + crypto_spi_ctx_template_t *ctx_template, size_t *ctx_template_size, + crypto_req_handle_t req) +{ + int error; + skein_ctx_t *ctx_tmpl; + + ctx_tmpl = kmem_alloc(sizeof (*ctx_tmpl), crypto_kmflag(req)); + if (ctx_tmpl == NULL) + return (CRYPTO_HOST_MEMORY); + error = skein_mac_ctx_build(ctx_tmpl, mechanism, key); + if (error != CRYPTO_SUCCESS) + goto errout; + *ctx_template = ctx_tmpl; + *ctx_template_size = sizeof (*ctx_tmpl); + + return (CRYPTO_SUCCESS); +errout: + bzero(ctx_tmpl, sizeof (*ctx_tmpl)); + kmem_free(ctx_tmpl, sizeof (*ctx_tmpl)); + return (error); +} + +/* + * Frees a skein context in a parent crypto context. + */ +static int +skein_free_context(crypto_ctx_t *ctx) +{ + if (SKEIN_CTX(ctx) != NULL) { + bzero(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + kmem_free(SKEIN_CTX(ctx), sizeof (*SKEIN_CTX(ctx))); + SKEIN_CTX_LVALUE(ctx) = NULL; + } + + return (CRYPTO_SUCCESS); +} diff --git a/module/icp/os/modconf.c b/module/icp/os/modconf.c new file mode 100644 index 000000000000..3743416ed951 --- /dev/null +++ b/module/icp/os/modconf.c @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include + +/* + * Null operations; used for uninitialized and "misc" modules. 
+ */ +static int mod_null(struct modlmisc *, struct modlinkage *); +static int mod_infonull(void *, struct modlinkage *, int *); + +/* + * Cryptographic Modules + */ +struct mod_ops mod_cryptoops = { + .modm_install = mod_null, + .modm_remove = mod_null, + .modm_info = mod_infonull +}; + +/* + * Null operation; return 0. + */ +static int +mod_null(struct modlmisc *modl, struct modlinkage *modlp) +{ + return (0); +} + +/* + * Status for User modules. + */ +static int +mod_infonull(void *modl, struct modlinkage *modlp, int *p0) +{ + *p0 = -1; /* for modinfo display */ + return (0); +} + +/* + * Install a module. + * (This routine is in the Solaris SPARC DDI/DKI) + */ +int +mod_install(struct modlinkage *modlp) +{ + int retval = -1; /* No linkage structures */ + struct modlmisc **linkpp; + struct modlmisc **linkpp1; + + if (modlp->ml_rev != MODREV_1) { + cmn_err(CE_WARN, "mod_install: " + "modlinkage structure is not MODREV_1\n"); + return (EINVAL); + } + linkpp = (struct modlmisc **)&modlp->ml_linkage[0]; + + while (*linkpp != NULL) { + if ((retval = MODL_INSTALL(*linkpp, modlp)) != 0) { + linkpp1 = (struct modlmisc **)&modlp->ml_linkage[0]; + + while (linkpp1 != linkpp) { + MODL_REMOVE(*linkpp1, modlp); /* clean up */ + linkpp1++; + } + break; + } + linkpp++; + } + return (retval); +} + +static char *reins_err = + "Could not reinstall %s\nReboot to correct the problem"; + +/* + * Remove a module. This is called by the module wrapper routine. + * (This routine is in the Solaris SPARC DDI/DKI) + */ +int +mod_remove(struct modlinkage *modlp) +{ + int retval = 0; + struct modlmisc **linkpp, *last_linkp; + + linkpp = (struct modlmisc **)&modlp->ml_linkage[0]; + + while (*linkpp != NULL) { + if ((retval = MODL_REMOVE(*linkpp, modlp)) != 0) { + last_linkp = *linkpp; + linkpp = (struct modlmisc **)&modlp->ml_linkage[0]; + while (*linkpp != last_linkp) { + if (MODL_INSTALL(*linkpp, modlp) != 0) { + cmn_err(CE_WARN, reins_err, + (*linkpp)->misc_linkinfo); + break; + } + linkpp++; + } + break; + } + linkpp++; + } + return (retval); +} + +/* + * Get module status. + * (This routine is in the Solaris SPARC DDI/DKI) + */ +int +mod_info(struct modlinkage *modlp, struct modinfo *modinfop) +{ + int i; + int retval = 0; + struct modspecific_info *msip; + struct modlmisc **linkpp; + + modinfop->mi_rev = modlp->ml_rev; + + linkpp = (struct modlmisc **)modlp->ml_linkage; + msip = &modinfop->mi_msinfo[0]; + + for (i = 0; i < MODMAXLINK; i++) { + if (*linkpp == NULL) { + msip->msi_linkinfo[0] = '\0'; + } else { + (void) strlcpy(msip->msi_linkinfo, + (*linkpp)->misc_linkinfo, MODMAXLINKINFOLEN); + retval = MODL_INFO(*linkpp, modlp, &msip->msi_p0); + if (retval != 0) + break; + linkpp++; + } + msip++; + } + + if (modinfop->mi_info == MI_INFO_LINKAGE) { + /* + * Slight kludge used to extract the address of the + * modlinkage structure from the module (just after + * loading a module for the very first time) + */ + modinfop->mi_base = (void *)modlp; + } + + if (retval == 0) + return (1); + return (0); +} diff --git a/module/icp/os/modhash.c b/module/icp/os/modhash.c new file mode 100644 index 000000000000..a897871001ce --- /dev/null +++ b/module/icp/os/modhash.c @@ -0,0 +1,927 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * mod_hash: flexible hash table implementation. + * + * This is a reasonably fast, reasonably flexible hash table implementation + * which features pluggable hash algorithms to support storing arbitrary keys + * and values. It is designed to handle small (< 100,000 items) amounts of + * data. The hash uses chaining to resolve collisions, and does not feature a + * mechanism to grow the hash. Care must be taken to pick nchains to be large + * enough for the application at hand, or lots of time will be wasted searching + * hash chains. + * + * The client of the hash is required to supply a number of items to support + * the various hash functions: + * + * - Destructor functions for the key and value being hashed. + * A destructor is responsible for freeing an object when the hash + * table is no longer storing it. Since keys and values can be of + * arbitrary type, separate destructors for keys & values are used. + * These may be mod_hash_null_keydtor and mod_hash_null_valdtor if no + * destructor is needed for either a key or value. + * + * - A hashing algorithm which returns a uint_t representing a hash index + * The number returned need _not_ be between 0 and nchains. The mod_hash + * code will take care of doing that. The second argument (after the + * key) to the hashing function is a void * that represents + * hash_alg_data-- this is provided so that the hashing algorithm can + * maintain some state across calls, or keep algorithm-specific + * constants associated with the hash table. + * + * A pointer-hashing and a string-hashing algorithm are supplied in + * this file. + * + * - A key comparator (a la qsort). + * This is used when searching the hash chain. The key comparator + * determines if two keys match. It should follow the return value + * semantics of strcmp. + * + * string and pointer comparators are supplied in this file. + * + * mod_hash_create_strhash() and mod_hash_create_ptrhash() provide good + * examples of how to create a customized hash table. + * + * Basic hash operations: + * + * mod_hash_create_strhash(name, nchains, dtor), + * create a hash using strings as keys. + * NOTE: This create a hash which automatically cleans up the string + * values it is given for keys. + * + * mod_hash_create_ptrhash(name, nchains, dtor, key_elem_size): + * create a hash using pointers as keys. + * + * mod_hash_create_extended(name, nchains, kdtor, vdtor, + * hash_alg, hash_alg_data, + * keycmp, sleep) + * create a customized hash table. + * + * mod_hash_destroy_hash(hash): + * destroy the given hash table, calling the key and value destructors + * on each key-value pair stored in the hash. + * + * mod_hash_insert(hash, key, val): + * place a key, value pair into the given hash. + * duplicate keys are rejected. 
+ * + * mod_hash_insert_reserve(hash, key, val, handle): + * place a key, value pair into the given hash, using handle to indicate + * the reserved storage for the pair. (no memory allocation is needed + * during a mod_hash_insert_reserve.) duplicate keys are rejected. + * + * mod_hash_reserve(hash, *handle): + * reserve storage for a key-value pair using the memory allocation + * policy of 'hash', returning the storage handle in 'handle'. + * + * mod_hash_reserve_nosleep(hash, *handle): reserve storage for a key-value + * pair ignoring the memory allocation policy of 'hash' and always without + * sleep, returning the storage handle in 'handle'. + * + * mod_hash_remove(hash, key, *val): + * remove a key-value pair with key 'key' from 'hash', destroying the + * stored key, and returning the value in val. + * + * mod_hash_replace(hash, key, val) + * atomically remove an existing key-value pair from a hash, and replace + * the key and value with the ones supplied. The removed key and value + * (if any) are destroyed. + * + * mod_hash_destroy(hash, key): + * remove a key-value pair with key 'key' from 'hash', destroying both + * stored key and stored value. + * + * mod_hash_find(hash, key, val): + * find a value in the hash table corresponding to the given key. + * + * mod_hash_find_cb(hash, key, val, found_callback) + * find a value in the hash table corresponding to the given key. + * If a value is found, call specified callback passing key and val to it. + * The callback is called with the hash lock held. + * It is intended to be used in situations where the act of locating the + * data must also modify it - such as in reference counting schemes. + * + * mod_hash_walk(hash, callback(key, elem, arg), arg) + * walks all the elements in the hashtable and invokes the callback + * function with the key/value pair for each element. the hashtable + * is locked for readers so the callback function should not attempt + * to do any updates to the hashable. the callback function should + * return MH_WALK_CONTINUE to continue walking the hashtable or + * MH_WALK_TERMINATE to abort the walk of the hashtable. + * + * mod_hash_clear(hash): + * clears the given hash table of entries, calling the key and value + * destructors for every element in the hash. + */ + +#include +#include +#include +#include + +/* + * MH_KEY_DESTROY() + * Invoke the key destructor. + */ +#define MH_KEY_DESTROY(hash, key) ((hash->mh_kdtor)(key)) + +/* + * MH_VAL_DESTROY() + * Invoke the value destructor. + */ +#define MH_VAL_DESTROY(hash, val) ((hash->mh_vdtor)(val)) + +/* + * MH_KEYCMP() + * Call the key comparator for the given hash keys. + */ +#define MH_KEYCMP(hash, key1, key2) ((hash->mh_keycmp)(key1, key2)) + +/* + * Cache for struct mod_hash_entry + */ +kmem_cache_t *mh_e_cache = NULL; +mod_hash_t *mh_head = NULL; +kmutex_t mh_head_lock; + +/* + * mod_hash_null_keydtor() + * mod_hash_null_valdtor() + * no-op key and value destructors. + */ +/*ARGSUSED*/ +void +mod_hash_null_keydtor(mod_hash_key_t key) +{ +} + +/*ARGSUSED*/ +void +mod_hash_null_valdtor(mod_hash_val_t val) +{ +} + +/* + * mod_hash_bystr() + * mod_hash_strkey_cmp() + * mod_hash_strkey_dtor() + * mod_hash_strval_dtor() + * Hash and key comparison routines for hashes with string keys. 
+ * + * mod_hash_create_strhash() + * Create a hash using strings as keys + * + * The string hashing algorithm is from the "Dragon Book" -- + * "Compilers: Principles, Tools & Techniques", by Aho, Sethi, Ullman + */ + +/*ARGSUSED*/ +uint_t +mod_hash_bystr(void *hash_data, mod_hash_key_t key) +{ + uint_t hash = 0; + uint_t g; + char *p, *k = (char *)key; + + ASSERT(k); + for (p = k; *p != '\0'; p++) { + hash = (hash << 4) + *p; + if ((g = (hash & 0xf0000000)) != 0) { + hash ^= (g >> 24); + hash ^= g; + } + } + return (hash); +} + +int +mod_hash_strkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2) +{ + return (strcmp((char *)key1, (char *)key2)); +} + +void +mod_hash_strkey_dtor(mod_hash_key_t key) +{ + char *c = (char *)key; + kmem_free(c, strlen(c) + 1); +} + +void +mod_hash_strval_dtor(mod_hash_val_t val) +{ + char *c = (char *)val; + kmem_free(c, strlen(c) + 1); +} + +mod_hash_t * +mod_hash_create_strhash_nodtr(char *name, size_t nchains, + void (*val_dtor)(mod_hash_val_t)) +{ + return mod_hash_create_extended(name, nchains, mod_hash_null_keydtor, + val_dtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); +} + +mod_hash_t * +mod_hash_create_strhash(char *name, size_t nchains, + void (*val_dtor)(mod_hash_val_t)) +{ + return mod_hash_create_extended(name, nchains, mod_hash_strkey_dtor, + val_dtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); +} + +void +mod_hash_destroy_strhash(mod_hash_t *strhash) +{ + ASSERT(strhash); + mod_hash_destroy_hash(strhash); +} + + +/* + * mod_hash_byptr() + * mod_hash_ptrkey_cmp() + * Hash and key comparison routines for hashes with pointer keys. + * + * mod_hash_create_ptrhash() + * mod_hash_destroy_ptrhash() + * Create a hash that uses pointers as keys. This hash algorithm + * picks an appropriate set of middle bits in the address to hash on + * based on the size of the hash table and a hint about the size of + * the items pointed at. + */ +uint_t +mod_hash_byptr(void *hash_data, mod_hash_key_t key) +{ + uintptr_t k = (uintptr_t)key; + k >>= (int)(uintptr_t)hash_data; + + return ((uint_t)k); +} + +int +mod_hash_ptrkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2) +{ + uintptr_t k1 = (uintptr_t)key1; + uintptr_t k2 = (uintptr_t)key2; + if (k1 > k2) + return (-1); + else if (k1 < k2) + return (1); + else + return (0); +} + +mod_hash_t * +mod_hash_create_ptrhash(char *name, size_t nchains, + void (*val_dtor)(mod_hash_val_t), size_t key_elem_size) +{ + size_t rshift; + + /* + * We want to hash on the bits in the middle of the address word + * Bits far to the right in the word have little significance, and + * are likely to all look the same (for example, an array of + * 256-byte structures will have the bottom 8 bits of address + * words the same). So we want to right-shift each address to + * ignore the bottom bits. + * + * The high bits, which are also unused, will get taken out when + * mod_hash takes hashkey % nchains. + */ + rshift = highbit64(key_elem_size); + + return mod_hash_create_extended(name, nchains, mod_hash_null_keydtor, + val_dtor, mod_hash_byptr, (void *)rshift, mod_hash_ptrkey_cmp, + KM_SLEEP); +} + +void +mod_hash_destroy_ptrhash(mod_hash_t *hash) +{ + ASSERT(hash); + mod_hash_destroy_hash(hash); +} + +/* + * mod_hash_byid() + * mod_hash_idkey_cmp() + * Hash and key comparison routines for hashes with 32-bit unsigned keys. + * + * mod_hash_create_idhash() + * mod_hash_destroy_idhash() + * mod_hash_iddata_gen() + * Create a hash that uses numeric keys. 
+ * + * The hash algorithm is documented in "Introduction to Algorithms" + * (Cormen, Leiserson, Rivest); when the hash table is created, it + * attempts to find the next largest prime above the number of hash + * slots. The hash index is then this number times the key modulo + * the hash size, or (key * prime) % nchains. + */ +uint_t +mod_hash_byid(void *hash_data, mod_hash_key_t key) +{ + uint_t kval = (uint_t)(uintptr_t)hash_data; + return ((uint_t)(uintptr_t)key * (uint_t)kval); +} + +int +mod_hash_idkey_cmp(mod_hash_key_t key1, mod_hash_key_t key2) +{ + return ((uint_t)(uintptr_t)key1 - (uint_t)(uintptr_t)key2); +} + +/* + * Generate the next largest prime number greater than nchains; this value + * is intended to be later passed in to mod_hash_create_extended() as the + * hash_data. + */ +uint_t +mod_hash_iddata_gen(size_t nchains) +{ + uint_t kval, i, prime; + + /* + * Pick the first (odd) prime greater than nchains. Make sure kval is + * odd (so start with nchains +1 or +2 as appropriate). + */ + kval = (nchains % 2 == 0) ? nchains + 1 : nchains + 2; + + for (;;) { + prime = 1; + for (i = 3; i * i <= kval; i += 2) { + if (kval % i == 0) + prime = 0; + } + if (prime == 1) + break; + kval += 2; + } + return (kval); +} + +mod_hash_t * +mod_hash_create_idhash(char *name, size_t nchains, + void (*val_dtor)(mod_hash_val_t)) +{ + uint_t kval = mod_hash_iddata_gen(nchains); + + return (mod_hash_create_extended(name, nchains, mod_hash_null_keydtor, + val_dtor, mod_hash_byid, (void *)(uintptr_t)kval, + mod_hash_idkey_cmp, KM_SLEEP)); +} + +void +mod_hash_destroy_idhash(mod_hash_t *hash) +{ + ASSERT(hash); + mod_hash_destroy_hash(hash); +} + +void +mod_hash_fini(void) +{ + mutex_destroy(&mh_head_lock); + + if (mh_e_cache) { + kmem_cache_destroy(mh_e_cache); + mh_e_cache = NULL; + } +} + +/* + * mod_hash_init() + * sets up globals, etc for mod_hash_* + */ +void +mod_hash_init(void) +{ + ASSERT(mh_e_cache == NULL); + mh_e_cache = kmem_cache_create("mod_hash_entries", + sizeof (struct mod_hash_entry), 0, NULL, NULL, NULL, NULL, + NULL, 0); + + mutex_init(&mh_head_lock, NULL, MUTEX_DEFAULT, NULL); +} + +/* + * mod_hash_create_extended() + * The full-blown hash creation function. + * + * notes: + * nchains - how many hash slots to create. More hash slots will + * result in shorter hash chains, but will consume + * slightly more memory up front. + * sleep - should be KM_SLEEP or KM_NOSLEEP, to indicate whether + * to sleep for memory, or fail in low-memory conditions. + * + * Fails only if KM_NOSLEEP was specified, and no memory was available. 
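+ *
+ * As a sketch (the name and chain count are illustrative), an id-keyed
+ * table equivalent to what mod_hash_create_idhash() builds above would
+ * be created with:
+ *
+ *	uint_t kval = mod_hash_iddata_gen(nchains);
+ *
+ *	hash = mod_hash_create_extended("example_ids", nchains,
+ *	    mod_hash_null_keydtor, val_dtor, mod_hash_byid,
+ *	    (void *)(uintptr_t)kval, mod_hash_idkey_cmp, KM_SLEEP);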
+ */ +mod_hash_t * +mod_hash_create_extended( + char *hname, /* descriptive name for hash */ + size_t nchains, /* number of hash slots */ + void (*kdtor)(mod_hash_key_t), /* key destructor */ + void (*vdtor)(mod_hash_val_t), /* value destructor */ + uint_t (*hash_alg)(void *, mod_hash_key_t), /* hash algorithm */ + void *hash_alg_data, /* pass-thru arg for hash_alg */ + int (*keycmp)(mod_hash_key_t, mod_hash_key_t), /* key comparator */ + int sleep) /* whether to sleep for mem */ +{ + mod_hash_t *mod_hash; + size_t size; + ASSERT(hname && keycmp && hash_alg && vdtor && kdtor); + + if ((mod_hash = kmem_zalloc(MH_SIZE(nchains), sleep)) == NULL) + return (NULL); + + size = strlen(hname) + 1; + mod_hash->mh_name = kmem_alloc(size, sleep); + if (mod_hash->mh_name == NULL) { + kmem_free(mod_hash, MH_SIZE(nchains)); + return (NULL); + } + (void) strlcpy(mod_hash->mh_name, hname, size); + + rw_init(&mod_hash->mh_contents, NULL, RW_DEFAULT, NULL); + mod_hash->mh_sleep = sleep; + mod_hash->mh_nchains = nchains; + mod_hash->mh_kdtor = kdtor; + mod_hash->mh_vdtor = vdtor; + mod_hash->mh_hashalg = hash_alg; + mod_hash->mh_hashalg_data = hash_alg_data; + mod_hash->mh_keycmp = keycmp; + + /* + * Link the hash up on the list of hashes + */ + mutex_enter(&mh_head_lock); + mod_hash->mh_next = mh_head; + mh_head = mod_hash; + mutex_exit(&mh_head_lock); + + return (mod_hash); +} + +/* + * mod_hash_destroy_hash() + * destroy a hash table, destroying all of its stored keys and values + * as well. + */ +void +mod_hash_destroy_hash(mod_hash_t *hash) +{ + mod_hash_t *mhp, *mhpp; + + mutex_enter(&mh_head_lock); + /* + * Remove the hash from the hash list + */ + if (hash == mh_head) { /* removing 1st list elem */ + mh_head = mh_head->mh_next; + } else { + /* + * mhpp can start out NULL since we know the 1st elem isn't the + * droid we're looking for. + */ + mhpp = NULL; + for (mhp = mh_head; mhp != NULL; mhp = mhp->mh_next) { + if (mhp == hash) { + mhpp->mh_next = mhp->mh_next; + break; + } + mhpp = mhp; + } + } + mutex_exit(&mh_head_lock); + + /* + * Clean out keys and values. + */ + mod_hash_clear(hash); + + rw_destroy(&hash->mh_contents); + kmem_free(hash->mh_name, strlen(hash->mh_name) + 1); + kmem_free(hash, MH_SIZE(hash->mh_nchains)); +} + +/* + * i_mod_hash() + * Call the hashing algorithm for this hash table, with the given key. + */ +uint_t +i_mod_hash(mod_hash_t *hash, mod_hash_key_t key) +{ + uint_t h; + /* + * Prevent div by 0 problems; + * Also a nice shortcut when using a hash as a list + */ + if (hash->mh_nchains == 1) + return (0); + + h = (hash->mh_hashalg)(hash->mh_hashalg_data, key); + return (h % (hash->mh_nchains - 1)); +} + +/* + * i_mod_hash_insert_nosync() + * mod_hash_insert() + * mod_hash_insert_reserve() + * insert 'val' into the hash table, using 'key' as its key. If 'key' is + * already a key in the hash, an error will be returned, and the key-val + * pair will not be inserted. i_mod_hash_insert_nosync() supports a simple + * handle abstraction, allowing hash entry allocation to be separated from + * the hash insertion. this abstraction allows simple use of the mod_hash + * structure in situations where mod_hash_insert() with a KM_SLEEP + * allocation policy would otherwise be unsafe. 
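+ *
+ * A minimal sketch of the reservation pattern (the lock here is
+ * hypothetical, standing in for whatever context makes sleeping
+ * illegal): reserve while blocking is still permitted, then insert
+ * without allocating:
+ *
+ *	mod_hash_hndl_t hndl;
+ *
+ *	if (mod_hash_reserve(hash, &hndl) != 0)
+ *		return (MH_ERR_NOMEM);
+ *	mutex_enter(&nosleep_lock);
+ *	if (mod_hash_insert_reserve(hash, key, val, hndl) != 0)
+ *		mod_hash_cancel(hash, &hndl);
+ *	mutex_exit(&nosleep_lock);
+ *
+ * On failure (for example MH_ERR_DUPLICATE) the handle is not consumed,
+ * so it must be released with mod_hash_cancel().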
+ */ +int +i_mod_hash_insert_nosync(mod_hash_t *hash, mod_hash_key_t key, + mod_hash_val_t val, mod_hash_hndl_t handle) +{ + uint_t hashidx; + struct mod_hash_entry *entry; + + ASSERT(hash); + + /* + * If we've not been given reserved storage, allocate storage directly, + * using the hash's allocation policy. + */ + if (handle == (mod_hash_hndl_t)0) { + entry = kmem_cache_alloc(mh_e_cache, hash->mh_sleep); + if (entry == NULL) { + hash->mh_stat.mhs_nomem++; + return (MH_ERR_NOMEM); + } + } else { + entry = (struct mod_hash_entry *)handle; + } + + hashidx = i_mod_hash(hash, key); + entry->mhe_key = key; + entry->mhe_val = val; + entry->mhe_next = hash->mh_entries[hashidx]; + + hash->mh_entries[hashidx] = entry; + hash->mh_stat.mhs_nelems++; + + return (0); +} + +int +mod_hash_insert(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t val) +{ + int res; + mod_hash_val_t v; + + rw_enter(&hash->mh_contents, RW_WRITER); + + /* + * Disallow duplicate keys in the hash + */ + if (i_mod_hash_find_nosync(hash, key, &v) == 0) { + rw_exit(&hash->mh_contents); + hash->mh_stat.mhs_coll++; + return (MH_ERR_DUPLICATE); + } + + res = i_mod_hash_insert_nosync(hash, key, val, (mod_hash_hndl_t)0); + rw_exit(&hash->mh_contents); + + return (res); +} + +int +mod_hash_insert_reserve(mod_hash_t *hash, mod_hash_key_t key, + mod_hash_val_t val, mod_hash_hndl_t handle) +{ + int res; + mod_hash_val_t v; + + rw_enter(&hash->mh_contents, RW_WRITER); + + /* + * Disallow duplicate keys in the hash + */ + if (i_mod_hash_find_nosync(hash, key, &v) == 0) { + rw_exit(&hash->mh_contents); + hash->mh_stat.mhs_coll++; + return (MH_ERR_DUPLICATE); + } + res = i_mod_hash_insert_nosync(hash, key, val, handle); + rw_exit(&hash->mh_contents); + + return (res); +} + +/* + * mod_hash_reserve() + * mod_hash_reserve_nosleep() + * mod_hash_cancel() + * Make or cancel a mod_hash_entry_t reservation. Reservations are used in + * mod_hash_insert_reserve() above. + */ +int +mod_hash_reserve(mod_hash_t *hash, mod_hash_hndl_t *handlep) +{ + *handlep = kmem_cache_alloc(mh_e_cache, hash->mh_sleep); + if (*handlep == NULL) { + hash->mh_stat.mhs_nomem++; + return (MH_ERR_NOMEM); + } + + return (0); +} + +int +mod_hash_reserve_nosleep(mod_hash_t *hash, mod_hash_hndl_t *handlep) +{ + *handlep = kmem_cache_alloc(mh_e_cache, KM_NOSLEEP); + if (*handlep == NULL) { + hash->mh_stat.mhs_nomem++; + return (MH_ERR_NOMEM); + } + + return (0); + +} + +/*ARGSUSED*/ +void +mod_hash_cancel(mod_hash_t *hash, mod_hash_hndl_t *handlep) +{ + kmem_cache_free(mh_e_cache, *handlep); + *handlep = (mod_hash_hndl_t)0; +} + +/* + * i_mod_hash_remove_nosync() + * mod_hash_remove() + * Remove an element from the hash table. + */ +int +i_mod_hash_remove_nosync(mod_hash_t *hash, mod_hash_key_t key, + mod_hash_val_t *val) +{ + int hashidx; + struct mod_hash_entry *e, *ep; + + hashidx = i_mod_hash(hash, key); + ep = NULL; /* e's parent */ + + for (e = hash->mh_entries[hashidx]; e != NULL; e = e->mhe_next) { + if (MH_KEYCMP(hash, e->mhe_key, key) == 0) + break; + ep = e; + } + + if (e == NULL) { /* not found */ + return (MH_ERR_NOTFOUND); + } + + if (ep == NULL) /* special case 1st element in bucket */ + hash->mh_entries[hashidx] = e->mhe_next; + else + ep->mhe_next = e->mhe_next; + + /* + * Clean up resources used by the node's key. 
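+ * The stored key is destroyed here; the stored value is not, since it
+ * is handed back to the caller through 'val'.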
+ */ + MH_KEY_DESTROY(hash, e->mhe_key); + + *val = e->mhe_val; + kmem_cache_free(mh_e_cache, e); + hash->mh_stat.mhs_nelems--; + + return (0); +} + +int +mod_hash_remove(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) +{ + int res; + + rw_enter(&hash->mh_contents, RW_WRITER); + res = i_mod_hash_remove_nosync(hash, key, val); + rw_exit(&hash->mh_contents); + + return (res); +} + +/* + * mod_hash_replace() + * atomically remove an existing key-value pair from a hash, and replace + * the key and value with the ones supplied. The removed key and value + * (if any) are destroyed. + */ +int +mod_hash_replace(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t val) +{ + int res; + mod_hash_val_t v; + + rw_enter(&hash->mh_contents, RW_WRITER); + + if (i_mod_hash_remove_nosync(hash, key, &v) == 0) { + /* + * mod_hash_remove() takes care of freeing up the key resources. + */ + MH_VAL_DESTROY(hash, v); + } + res = i_mod_hash_insert_nosync(hash, key, val, (mod_hash_hndl_t)0); + + rw_exit(&hash->mh_contents); + + return (res); +} + +/* + * mod_hash_destroy() + * Remove an element from the hash table matching 'key', and destroy it. + */ +int +mod_hash_destroy(mod_hash_t *hash, mod_hash_key_t key) +{ + mod_hash_val_t val; + int rv; + + rw_enter(&hash->mh_contents, RW_WRITER); + + if ((rv = i_mod_hash_remove_nosync(hash, key, &val)) == 0) { + /* + * mod_hash_remove() takes care of freeing up the key resources. + */ + MH_VAL_DESTROY(hash, val); + } + + rw_exit(&hash->mh_contents); + return (rv); +} + +/* + * i_mod_hash_find_nosync() + * mod_hash_find() + * Find a value in the hash table corresponding to the given key. + */ +int +i_mod_hash_find_nosync(mod_hash_t *hash, mod_hash_key_t key, + mod_hash_val_t *val) +{ + uint_t hashidx; + struct mod_hash_entry *e; + + hashidx = i_mod_hash(hash, key); + + for (e = hash->mh_entries[hashidx]; e != NULL; e = e->mhe_next) { + if (MH_KEYCMP(hash, e->mhe_key, key) == 0) { + *val = e->mhe_val; + hash->mh_stat.mhs_hit++; + return (0); + } + } + hash->mh_stat.mhs_miss++; + return (MH_ERR_NOTFOUND); +} + +int +mod_hash_find(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val) +{ + int res; + + rw_enter(&hash->mh_contents, RW_READER); + res = i_mod_hash_find_nosync(hash, key, val); + rw_exit(&hash->mh_contents); + + return (res); +} + +int +mod_hash_find_cb(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, + void (*find_cb)(mod_hash_key_t, mod_hash_val_t)) +{ + int res; + + rw_enter(&hash->mh_contents, RW_READER); + res = i_mod_hash_find_nosync(hash, key, val); + if (res == 0) { + find_cb(key, *val); + } + rw_exit(&hash->mh_contents); + + return (res); +} + +int +mod_hash_find_cb_rval(mod_hash_t *hash, mod_hash_key_t key, mod_hash_val_t *val, + int (*find_cb)(mod_hash_key_t, mod_hash_val_t), int *cb_rval) +{ + int res; + + rw_enter(&hash->mh_contents, RW_READER); + res = i_mod_hash_find_nosync(hash, key, val); + if (res == 0) { + *cb_rval = find_cb(key, *val); + } + rw_exit(&hash->mh_contents); + + return (res); +} + +void +i_mod_hash_walk_nosync(mod_hash_t *hash, + uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg) +{ + struct mod_hash_entry *e; + uint_t hashidx; + int res = MH_WALK_CONTINUE; + + for (hashidx = 0; + (hashidx < (hash->mh_nchains - 1)) && (res == MH_WALK_CONTINUE); + hashidx++) { + e = hash->mh_entries[hashidx]; + while ((e != NULL) && (res == MH_WALK_CONTINUE)) { + res = callback(e->mhe_key, e->mhe_val, arg); + e = e->mhe_next; + } + } +} + +/* + * mod_hash_walk() + * Walks all the elements in the hashtable 
and invokes the callback
+ * function with the key/value pair for each element.  The hashtable
+ * is locked for readers so the callback function should not attempt
+ * to do any updates to the hashtable.  The callback function should
+ * return MH_WALK_CONTINUE to continue walking the hashtable or
+ * MH_WALK_TERMINATE to abort the walk of the hashtable.
+ */
+void
+mod_hash_walk(mod_hash_t *hash,
+    uint_t (*callback)(mod_hash_key_t, mod_hash_val_t *, void *), void *arg)
+{
+	rw_enter(&hash->mh_contents, RW_READER);
+	i_mod_hash_walk_nosync(hash, callback, arg);
+	rw_exit(&hash->mh_contents);
+}
+
+
+/*
+ * i_mod_hash_clear_nosync()
+ * mod_hash_clear()
+ *	Clears the given hash table by calling the destructor of every hash
+ *	element and freeing up all mod_hash_entry's.
+ */
+void
+i_mod_hash_clear_nosync(mod_hash_t *hash)
+{
+	int i;
+	struct mod_hash_entry *e, *old_e;
+
+	for (i = 0; i < hash->mh_nchains; i++) {
+		e = hash->mh_entries[i];
+		while (e != NULL) {
+			MH_KEY_DESTROY(hash, e->mhe_key);
+			MH_VAL_DESTROY(hash, e->mhe_val);
+			old_e = e;
+			e = e->mhe_next;
+			kmem_cache_free(mh_e_cache, old_e);
+		}
+		hash->mh_entries[i] = NULL;
+	}
+	hash->mh_stat.mhs_nelems = 0;
+}
+
+void
+mod_hash_clear(mod_hash_t *hash)
+{
+	ASSERT(hash);
+	rw_enter(&hash->mh_contents, RW_WRITER);
+	i_mod_hash_clear_nosync(hash);
+	rw_exit(&hash->mh_contents);
+}
diff --git a/module/icp/spi/kcf_spi.c b/module/icp/spi/kcf_spi.c
new file mode 100644
index 000000000000..34b36b81c0ab
--- /dev/null
+++ b/module/icp/spi/kcf_spi.c
@@ -0,0 +1,925 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * This file is part of the core Kernel Cryptographic Framework.
+ * It implements the SPI functions exported to cryptographic
+ * providers.
+ */
+
+
+#include
+#include
+#include
+#include
+#include
+
+/*
+ * minalloc and maxalloc values to be used for taskq_create().
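+ * These tunables feed the per-provider taskq that is created for
+ * hardware providers in crypto_register_provider() below:
+ *
+ *	taskq_create("kcf_taskq", crypto_taskq_threads, minclsyspri,
+ *	    crypto_taskq_minalloc, crypto_taskq_maxalloc,
+ *	    TASKQ_PREPOPULATE);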
+ */ +int crypto_taskq_threads = CRYPTO_TASKQ_THREADS; +int crypto_taskq_minalloc = CRYPTO_TASKQ_MIN; +int crypto_taskq_maxalloc = CRYPTO_TASKQ_MAX; + +static void remove_provider(kcf_provider_desc_t *); +static void process_logical_providers(crypto_provider_info_t *, + kcf_provider_desc_t *); +static int init_prov_mechs(crypto_provider_info_t *, kcf_provider_desc_t *); +static int kcf_prov_kstat_update(kstat_t *, int); +static void delete_kstat(kcf_provider_desc_t *); + +static kcf_prov_stats_t kcf_stats_ks_data_template = { + { "kcf_ops_total", KSTAT_DATA_UINT64 }, + { "kcf_ops_passed", KSTAT_DATA_UINT64 }, + { "kcf_ops_failed", KSTAT_DATA_UINT64 }, + { "kcf_ops_returned_busy", KSTAT_DATA_UINT64 } +}; + +#define KCF_SPI_COPY_OPS(src, dst, ops) if ((src)->ops != NULL) \ + *((dst)->ops) = *((src)->ops); + +/* + * Copy an ops vector from src to dst. Used during provider registration + * to copy the ops vector from the provider info structure to the + * provider descriptor maintained by KCF. + * Copying the ops vector specified by the provider is needed since the + * framework does not require the provider info structure to be + * persistent. + */ +static void +copy_ops_vector_v1(crypto_ops_t *src_ops, crypto_ops_t *dst_ops) +{ + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_control_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_digest_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_cipher_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_mac_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_sign_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_verify_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_dual_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_dual_cipher_mac_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_random_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_session_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_object_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_key_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_provider_ops); + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_ctx_ops); +} + +static void +copy_ops_vector_v2(crypto_ops_t *src_ops, crypto_ops_t *dst_ops) +{ + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_mech_ops); +} + +static void +copy_ops_vector_v3(crypto_ops_t *src_ops, crypto_ops_t *dst_ops) +{ + KCF_SPI_COPY_OPS(src_ops, dst_ops, co_nostore_key_ops); +} + +/* + * This routine is used to add cryptographic providers to the KEF framework. + * Providers pass a crypto_provider_info structure to crypto_register_provider() + * and get back a handle. The crypto_provider_info structure contains a + * list of mechanisms supported by the provider and an ops vector containing + * provider entry points. Hardware providers call this routine in their attach + * routines. Software providers call this routine in their _init() routine. + */ +int +crypto_register_provider(crypto_provider_info_t *info, + crypto_kcf_provider_handle_t *handle) +{ + char *ks_name; + + kcf_provider_desc_t *prov_desc = NULL; + int ret = CRYPTO_ARGUMENTS_BAD; + + if (info->pi_interface_version > CRYPTO_SPI_VERSION_3) + return (CRYPTO_VERSION_MISMATCH); + + /* + * Check provider type, must be software, hardware, or logical. + */ + if (info->pi_provider_type != CRYPTO_HW_PROVIDER && + info->pi_provider_type != CRYPTO_SW_PROVIDER && + info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) + return (CRYPTO_ARGUMENTS_BAD); + + /* + * Allocate and initialize a new provider descriptor. We also + * hold it and release it when done. 
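+ * The KCF_PROV_REFHOLD() taken just below is paired with the
+ * KCF_PROV_REFRELE() at the 'bail' label, which runs on both the
+ * success and failure paths.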
+ */ + prov_desc = kcf_alloc_provider_desc(info); + KCF_PROV_REFHOLD(prov_desc); + + prov_desc->pd_prov_type = info->pi_provider_type; + + /* provider-private handle, opaque to KCF */ + prov_desc->pd_prov_handle = info->pi_provider_handle; + + /* copy provider description string */ + if (info->pi_provider_description != NULL) { + /* + * pi_provider_descriptor is a string that can contain + * up to CRYPTO_PROVIDER_DESCR_MAX_LEN + 1 characters + * INCLUDING the terminating null character. A bcopy() + * is necessary here as pd_description should not have + * a null character. See comments in kcf_alloc_provider_desc() + * for details on pd_description field. + */ + bcopy(info->pi_provider_description, prov_desc->pd_description, + MIN(strlen(info->pi_provider_description), + (size_t)CRYPTO_PROVIDER_DESCR_MAX_LEN)); + } + + if (info->pi_provider_type != CRYPTO_LOGICAL_PROVIDER) { + if (info->pi_ops_vector == NULL) { + goto bail; + } + copy_ops_vector_v1(info->pi_ops_vector, + prov_desc->pd_ops_vector); + if (info->pi_interface_version >= CRYPTO_SPI_VERSION_2) { + copy_ops_vector_v2(info->pi_ops_vector, + prov_desc->pd_ops_vector); + prov_desc->pd_flags = info->pi_flags; + } + if (info->pi_interface_version == CRYPTO_SPI_VERSION_3) { + copy_ops_vector_v3(info->pi_ops_vector, + prov_desc->pd_ops_vector); + } + } + + /* object_ops and nostore_key_ops are mutually exclusive */ + if (prov_desc->pd_ops_vector->co_object_ops && + prov_desc->pd_ops_vector->co_nostore_key_ops) { + goto bail; + } + + /* process the mechanisms supported by the provider */ + if ((ret = init_prov_mechs(info, prov_desc)) != CRYPTO_SUCCESS) + goto bail; + + /* + * Add provider to providers tables, also sets the descriptor + * pd_prov_id field. + */ + if ((ret = kcf_prov_tab_add_provider(prov_desc)) != CRYPTO_SUCCESS) { + undo_register_provider(prov_desc, B_FALSE); + goto bail; + } + + /* + * We create a taskq only for a hardware provider. The global + * software queue is used for software providers. We handle ordering + * of multi-part requests in the taskq routine. So, it is safe to + * have multiple threads for the taskq. We pass TASKQ_PREPOPULATE flag + * to keep some entries cached to improve performance. + */ + if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER) + prov_desc->pd_sched_info.ks_taskq = taskq_create("kcf_taskq", + crypto_taskq_threads, minclsyspri, + crypto_taskq_minalloc, crypto_taskq_maxalloc, + TASKQ_PREPOPULATE); + else + prov_desc->pd_sched_info.ks_taskq = NULL; + + /* no kernel session to logical providers */ + if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) { + /* + * Open a session for session-oriented providers. This session + * is used for all kernel consumers. This is fine as a provider + * is required to support multiple thread access to a session. + * We can do this only after the taskq has been created as we + * do a kcf_submit_request() to open the session. + */ + if (KCF_PROV_SESSION_OPS(prov_desc) != NULL) { + kcf_req_params_t params; + + KCF_WRAP_SESSION_OPS_PARAMS(¶ms, + KCF_OP_SESSION_OPEN, &prov_desc->pd_sid, 0, + CRYPTO_USER, NULL, 0, prov_desc); + ret = kcf_submit_request(prov_desc, NULL, NULL, ¶ms, + B_FALSE); + + if (ret != CRYPTO_SUCCESS) { + undo_register_provider(prov_desc, B_TRUE); + ret = CRYPTO_FAILED; + goto bail; + } + } + } + + if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) { + /* + * Create the kstat for this provider. There is a kstat + * installed for each successfully registered provider. + * This kstat is deleted, when the provider unregisters. 
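+ * (The deletion happens in delete_kstat(), called from
+ * crypto_unregister_provider().)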
+ */ + if (prov_desc->pd_prov_type == CRYPTO_SW_PROVIDER) { + ks_name = kmem_asprintf("%s_%s", + "NONAME", "provider_stats"); + } else { + ks_name = kmem_asprintf("%s_%d_%u_%s", + "NONAME", 0, prov_desc->pd_prov_id, + "provider_stats"); + } + + prov_desc->pd_kstat = kstat_create("kcf", 0, ks_name, "crypto", + KSTAT_TYPE_NAMED, sizeof (kcf_prov_stats_t) / + sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + + if (prov_desc->pd_kstat != NULL) { + bcopy(&kcf_stats_ks_data_template, + &prov_desc->pd_ks_data, + sizeof (kcf_stats_ks_data_template)); + prov_desc->pd_kstat->ks_data = &prov_desc->pd_ks_data; + KCF_PROV_REFHOLD(prov_desc); + KCF_PROV_IREFHOLD(prov_desc); + prov_desc->pd_kstat->ks_private = prov_desc; + prov_desc->pd_kstat->ks_update = kcf_prov_kstat_update; + kstat_install(prov_desc->pd_kstat); + } + kmem_strfree(ks_name); + } + + if (prov_desc->pd_prov_type == CRYPTO_HW_PROVIDER) + process_logical_providers(info, prov_desc); + + mutex_enter(&prov_desc->pd_lock); + prov_desc->pd_state = KCF_PROV_READY; + mutex_exit(&prov_desc->pd_lock); + kcf_do_notify(prov_desc, B_TRUE); + + *handle = prov_desc->pd_kcf_prov_handle; + ret = CRYPTO_SUCCESS; + +bail: + KCF_PROV_REFRELE(prov_desc); + return (ret); +} + +/* + * This routine is used to notify the framework when a provider is being + * removed. Hardware providers call this routine in their detach routines. + * Software providers call this routine in their _fini() routine. + */ +int +crypto_unregister_provider(crypto_kcf_provider_handle_t handle) +{ + uint_t mech_idx; + kcf_provider_desc_t *desc; + kcf_prov_state_t saved_state; + + /* lookup provider descriptor */ + if ((desc = kcf_prov_tab_lookup((crypto_provider_id_t)handle)) == NULL) + return (CRYPTO_UNKNOWN_PROVIDER); + + mutex_enter(&desc->pd_lock); + /* + * Check if any other thread is disabling or removing + * this provider. We return if this is the case. + */ + if (desc->pd_state >= KCF_PROV_DISABLED) { + mutex_exit(&desc->pd_lock); + /* Release reference held by kcf_prov_tab_lookup(). */ + KCF_PROV_REFRELE(desc); + return (CRYPTO_BUSY); + } + + saved_state = desc->pd_state; + desc->pd_state = KCF_PROV_REMOVED; + + if (saved_state == KCF_PROV_BUSY) { + /* + * The per-provider taskq threads may be waiting. We + * signal them so that they can start failing requests. + */ + cv_broadcast(&desc->pd_resume_cv); + } + + if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) { + /* + * Check if this provider is currently being used. + * pd_irefcnt is the number of holds from the internal + * structures. We add one to account for the above lookup. + */ + if (desc->pd_refcnt > desc->pd_irefcnt + 1) { + desc->pd_state = saved_state; + mutex_exit(&desc->pd_lock); + /* Release reference held by kcf_prov_tab_lookup(). */ + KCF_PROV_REFRELE(desc); + /* + * The administrator presumably will stop the clients + * thus removing the holds, when they get the busy + * return value. Any retry will succeed then. + */ + return (CRYPTO_BUSY); + } + } + mutex_exit(&desc->pd_lock); + + if (desc->pd_prov_type != CRYPTO_SW_PROVIDER) { + remove_provider(desc); + } + + if (desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) { + /* remove the provider from the mechanisms tables */ + for (mech_idx = 0; mech_idx < desc->pd_mech_list_count; + mech_idx++) { + kcf_remove_mech_provider( + desc->pd_mechanisms[mech_idx].cm_mech_name, desc); + } + } + + /* remove provider from providers table */ + if (kcf_prov_tab_rem_provider((crypto_provider_id_t)handle) != + CRYPTO_SUCCESS) { + /* Release reference held by kcf_prov_tab_lookup(). 
*/ + KCF_PROV_REFRELE(desc); + return (CRYPTO_UNKNOWN_PROVIDER); + } + + delete_kstat(desc); + + if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) { + /* Release reference held by kcf_prov_tab_lookup(). */ + KCF_PROV_REFRELE(desc); + + /* + * Wait till the existing requests complete. + */ + mutex_enter(&desc->pd_lock); + while (desc->pd_state != KCF_PROV_FREED) + cv_wait(&desc->pd_remove_cv, &desc->pd_lock); + mutex_exit(&desc->pd_lock); + } else { + /* + * Wait until requests that have been sent to the provider + * complete. + */ + mutex_enter(&desc->pd_lock); + while (desc->pd_irefcnt > 0) + cv_wait(&desc->pd_remove_cv, &desc->pd_lock); + mutex_exit(&desc->pd_lock); + } + + kcf_do_notify(desc, B_FALSE); + + if (desc->pd_prov_type == CRYPTO_SW_PROVIDER) { + /* + * This is the only place where kcf_free_provider_desc() + * is called directly. KCF_PROV_REFRELE() should free the + * structure in all other places. + */ + ASSERT(desc->pd_state == KCF_PROV_FREED && + desc->pd_refcnt == 0); + kcf_free_provider_desc(desc); + } else { + KCF_PROV_REFRELE(desc); + } + + return (CRYPTO_SUCCESS); +} + +/* + * This routine is used to notify the framework that the state of + * a cryptographic provider has changed. Valid state codes are: + * + * CRYPTO_PROVIDER_READY + * The provider indicates that it can process more requests. A provider + * will notify with this event if it previously has notified us with a + * CRYPTO_PROVIDER_BUSY. + * + * CRYPTO_PROVIDER_BUSY + * The provider can not take more requests. + * + * CRYPTO_PROVIDER_FAILED + * The provider encountered an internal error. The framework will not + * be sending any more requests to the provider. The provider may notify + * with a CRYPTO_PROVIDER_READY, if it is able to recover from the error. + * + * This routine can be called from user or interrupt context. + */ +void +crypto_provider_notification(crypto_kcf_provider_handle_t handle, uint_t state) +{ + kcf_provider_desc_t *pd; + + /* lookup the provider from the given handle */ + if ((pd = kcf_prov_tab_lookup((crypto_provider_id_t)handle)) == NULL) + return; + + mutex_enter(&pd->pd_lock); + + if (pd->pd_state <= KCF_PROV_VERIFICATION_FAILED) + goto out; + + if (pd->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + cmn_err(CE_WARN, "crypto_provider_notification: " + "logical provider (%x) ignored\n", handle); + goto out; + } + switch (state) { + case CRYPTO_PROVIDER_READY: + switch (pd->pd_state) { + case KCF_PROV_BUSY: + pd->pd_state = KCF_PROV_READY; + /* + * Signal the per-provider taskq threads that they + * can start submitting requests. + */ + cv_broadcast(&pd->pd_resume_cv); + break; + + case KCF_PROV_FAILED: + /* + * The provider recovered from the error. Let us + * use it now. + */ + pd->pd_state = KCF_PROV_READY; + break; + default: + break; + } + break; + + case CRYPTO_PROVIDER_BUSY: + switch (pd->pd_state) { + case KCF_PROV_READY: + pd->pd_state = KCF_PROV_BUSY; + break; + default: + break; + } + break; + + case CRYPTO_PROVIDER_FAILED: + /* + * We note the failure and return. The per-provider taskq + * threads check this flag and start failing the + * requests, if it is set. See process_req_hwp() for details. + */ + switch (pd->pd_state) { + case KCF_PROV_READY: + pd->pd_state = KCF_PROV_FAILED; + break; + + case KCF_PROV_BUSY: + pd->pd_state = KCF_PROV_FAILED; + /* + * The per-provider taskq threads may be waiting. We + * signal them so that they can start failing requests. 
+ */ + cv_broadcast(&pd->pd_resume_cv); + break; + default: + break; + } + break; + default: + break; + } +out: + mutex_exit(&pd->pd_lock); + KCF_PROV_REFRELE(pd); +} + +/* + * This routine is used to notify the framework the result of + * an asynchronous request handled by a provider. Valid error + * codes are the same as the CRYPTO_* errors defined in common.h. + * + * This routine can be called from user or interrupt context. + */ +void +crypto_op_notification(crypto_req_handle_t handle, int error) +{ + kcf_call_type_t ctype; + + if (handle == NULL) + return; + + if ((ctype = GET_REQ_TYPE(handle)) == CRYPTO_SYNCH) { + kcf_sreq_node_t *sreq = (kcf_sreq_node_t *)handle; + + if (error != CRYPTO_SUCCESS) + sreq->sn_provider->pd_sched_info.ks_nfails++; + KCF_PROV_IREFRELE(sreq->sn_provider); + kcf_sop_done(sreq, error); + } else { + kcf_areq_node_t *areq = (kcf_areq_node_t *)handle; + + ASSERT(ctype == CRYPTO_ASYNCH); + if (error != CRYPTO_SUCCESS) + areq->an_provider->pd_sched_info.ks_nfails++; + KCF_PROV_IREFRELE(areq->an_provider); + kcf_aop_done(areq, error); + } +} + +/* + * This routine is used by software providers to determine + * whether to use KM_SLEEP or KM_NOSLEEP during memory allocation. + * Note that hardware providers can always use KM_SLEEP. So, + * they do not need to call this routine. + * + * This routine can be called from user or interrupt context. + */ +int +crypto_kmflag(crypto_req_handle_t handle) +{ + return (REQHNDL2_KMFLAG(handle)); +} + +/* + * Process the mechanism info structures specified by the provider + * during registration. A NULL crypto_provider_info_t indicates + * an already initialized provider descriptor. + * + * Mechanisms are not added to the kernel's mechanism table if the + * provider is a logical provider. + * + * Returns CRYPTO_SUCCESS on success, CRYPTO_ARGUMENTS if one + * of the specified mechanisms was malformed, or CRYPTO_HOST_MEMORY + * if the table of mechanisms is full. + */ +static int +init_prov_mechs(crypto_provider_info_t *info, kcf_provider_desc_t *desc) +{ + uint_t mech_idx; + uint_t cleanup_idx; + int err = CRYPTO_SUCCESS; + kcf_prov_mech_desc_t *pmd; + int desc_use_count = 0; + int mcount = desc->pd_mech_list_count; + + if (desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER) { + if (info != NULL) { + ASSERT(info->pi_mechanisms != NULL); + bcopy(info->pi_mechanisms, desc->pd_mechanisms, + sizeof (crypto_mech_info_t) * mcount); + } + return (CRYPTO_SUCCESS); + } + + /* + * Copy the mechanism list from the provider info to the provider + * descriptor. desc->pd_mechanisms has an extra crypto_mech_info_t + * element if the provider has random_ops since we keep an internal + * mechanism, SUN_RANDOM, in this case. + */ + if (info != NULL) { + if (info->pi_ops_vector->co_random_ops != NULL) { + crypto_mech_info_t *rand_mi; + + /* + * Need the following check as it is possible to have + * a provider that implements just random_ops and has + * pi_mechanisms == NULL. 
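+ * For a random-only provider, mcount is 1 (the count includes the
+ * extra internal slot), the bcopy below is skipped, and the single
+ * pd_mechanisms entry is filled in with SUN_RANDOM.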
+ */ + if (info->pi_mechanisms != NULL) { + bcopy(info->pi_mechanisms, desc->pd_mechanisms, + sizeof (crypto_mech_info_t) * (mcount - 1)); + } + rand_mi = &desc->pd_mechanisms[mcount - 1]; + + bzero(rand_mi, sizeof (crypto_mech_info_t)); + (void) strncpy(rand_mi->cm_mech_name, SUN_RANDOM, + CRYPTO_MAX_MECH_NAME); + rand_mi->cm_func_group_mask = CRYPTO_FG_RANDOM; + } else { + ASSERT(info->pi_mechanisms != NULL); + bcopy(info->pi_mechanisms, desc->pd_mechanisms, + sizeof (crypto_mech_info_t) * mcount); + } + } + + /* + * For each mechanism support by the provider, add the provider + * to the corresponding KCF mechanism mech_entry chain. + */ + for (mech_idx = 0; mech_idx < desc->pd_mech_list_count; mech_idx++) { + crypto_mech_info_t *mi = &desc->pd_mechanisms[mech_idx]; + + if ((mi->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BITS) && + (mi->cm_mech_flags & CRYPTO_KEYSIZE_UNIT_IN_BYTES)) { + err = CRYPTO_ARGUMENTS_BAD; + break; + } + + if (desc->pd_flags & CRYPTO_HASH_NO_UPDATE && + mi->cm_func_group_mask & CRYPTO_FG_DIGEST) { + /* + * We ask the provider to specify the limit + * per hash mechanism. But, in practice, a + * hardware limitation means all hash mechanisms + * will have the same maximum size allowed for + * input data. So, we make it a per provider + * limit to keep it simple. + */ + if (mi->cm_max_input_length == 0) { + err = CRYPTO_ARGUMENTS_BAD; + break; + } else { + desc->pd_hash_limit = mi->cm_max_input_length; + } + } + + if ((err = kcf_add_mech_provider(mech_idx, desc, &pmd)) != + KCF_SUCCESS) + break; + + if (pmd == NULL) + continue; + + /* The provider will be used for this mechanism */ + desc_use_count++; + } + + /* + * Don't allow multiple software providers with disabled mechanisms + * to register. Subsequent enabling of mechanisms will result in + * an unsupported configuration, i.e. multiple software providers + * per mechanism. + */ + if (desc_use_count == 0 && desc->pd_prov_type == CRYPTO_SW_PROVIDER) + return (CRYPTO_ARGUMENTS_BAD); + + if (err == KCF_SUCCESS) + return (CRYPTO_SUCCESS); + + /* + * An error occurred while adding the mechanism, cleanup + * and bail. + */ + for (cleanup_idx = 0; cleanup_idx < mech_idx; cleanup_idx++) { + kcf_remove_mech_provider( + desc->pd_mechanisms[cleanup_idx].cm_mech_name, desc); + } + + if (err == KCF_MECH_TAB_FULL) + return (CRYPTO_HOST_MEMORY); + + return (CRYPTO_ARGUMENTS_BAD); +} + +/* + * Update routine for kstat. Only privileged users are allowed to + * access this information, since this information is sensitive. + * There are some cryptographic attacks (e.g. traffic analysis) + * which can use this information. + */ +static int +kcf_prov_kstat_update(kstat_t *ksp, int rw) +{ + kcf_prov_stats_t *ks_data; + kcf_provider_desc_t *pd = (kcf_provider_desc_t *)ksp->ks_private; + + if (rw == KSTAT_WRITE) + return (EACCES); + + ks_data = ksp->ks_data; + + ks_data->ps_ops_total.value.ui64 = pd->pd_sched_info.ks_ndispatches; + ks_data->ps_ops_failed.value.ui64 = pd->pd_sched_info.ks_nfails; + ks_data->ps_ops_busy_rval.value.ui64 = pd->pd_sched_info.ks_nbusy_rval; + ks_data->ps_ops_passed.value.ui64 = + pd->pd_sched_info.ks_ndispatches - + pd->pd_sched_info.ks_nfails - + pd->pd_sched_info.ks_nbusy_rval; + + return (0); +} + + +/* + * Utility routine called from failure paths in crypto_register_provider() + * and from crypto_load_soft_disabled(). 
+ */
+void
+undo_register_provider(kcf_provider_desc_t *desc, boolean_t remove_prov)
+{
+	uint_t mech_idx;
+
+	/* remove the provider from the mechanisms tables */
+	for (mech_idx = 0; mech_idx < desc->pd_mech_list_count;
+	    mech_idx++) {
+		kcf_remove_mech_provider(
+		    desc->pd_mechanisms[mech_idx].cm_mech_name, desc);
+	}
+
+	/* remove provider from providers table */
+	if (remove_prov)
+		(void) kcf_prov_tab_rem_provider(desc->pd_prov_id);
+}
+
+/*
+ * Utility routine called from crypto_load_soft_disabled(). Callers
+ * should have done a prior undo_register_provider().
+ */
+void
+redo_register_provider(kcf_provider_desc_t *pd)
+{
+	/* process the mechanisms supported by the provider */
+	(void) init_prov_mechs(NULL, pd);
+
+	/*
+	 * Hold provider in providers table. We should not call
+	 * kcf_prov_tab_add_provider() here as the provider descriptor
+	 * is still valid which means it has an entry in the provider
+	 * table.
+	 */
+	KCF_PROV_REFHOLD(pd);
+	KCF_PROV_IREFHOLD(pd);
+}
+
+/*
+ * Add provider (p1) to another provider's array of providers (p2).
+ * Hardware and logical providers use this array to cross-reference
+ * each other.
+ */
+static void
+add_provider_to_array(kcf_provider_desc_t *p1, kcf_provider_desc_t *p2)
+{
+	kcf_provider_list_t *new;
+
+	new = kmem_alloc(sizeof (kcf_provider_list_t), KM_SLEEP);
+	mutex_enter(&p2->pd_lock);
+	new->pl_next = p2->pd_provider_list;
+	p2->pd_provider_list = new;
+	KCF_PROV_IREFHOLD(p1);
+	new->pl_provider = p1;
+	mutex_exit(&p2->pd_lock);
+}
+
+/*
+ * Remove provider (p1) from another provider's array of providers (p2).
+ * Hardware and logical providers use this array to cross-reference
+ * each other.
+ */
+static void
+remove_provider_from_array(kcf_provider_desc_t *p1, kcf_provider_desc_t *p2)
+{
+	kcf_provider_list_t *pl = NULL, **prev;
+
+	mutex_enter(&p2->pd_lock);
+	for (pl = p2->pd_provider_list, prev = &p2->pd_provider_list;
+	    pl != NULL; prev = &pl->pl_next, pl = pl->pl_next) {
+		if (pl->pl_provider == p1) {
+			break;
+		}
+	}
+
+	if (pl == NULL) {	/* p1 was not on the list */
+		mutex_exit(&p2->pd_lock);
+		return;
+	}
+
+	/* detach and free kcf_provider_list structure */
+	KCF_PROV_IREFRELE(p1);
+	*prev = pl->pl_next;
+	kmem_free(pl, sizeof (*pl));
+	mutex_exit(&p2->pd_lock);
+}
+
+/*
+ * Convert an array of logical provider handles (crypto_provider_id)
+ * stored in a crypto_provider_info structure into an array of provider
+ * descriptors (kcf_provider_desc_t) attached to a logical provider.
+ */
+static void
+process_logical_providers(crypto_provider_info_t *info, kcf_provider_desc_t *hp)
+{
+	kcf_provider_desc_t *lp;
+	crypto_provider_id_t handle;
+	int count = info->pi_logical_provider_count;
+	int i;
+
+	/* add hardware provider to each logical provider */
+	for (i = 0; i < count; i++) {
+		handle = info->pi_logical_providers[i];
+		lp = kcf_prov_tab_lookup((crypto_provider_id_t)handle);
+		if (lp == NULL) {
+			continue;
+		}
+		add_provider_to_array(hp, lp);
+		hp->pd_flags |= KCF_LPROV_MEMBER;
+
+		/*
+		 * A hardware provider has to have the provider descriptor of
+		 * every logical provider it belongs to, so it can be removed
+		 * from the logical provider if the hardware provider
+		 * unregisters from the framework.
+		 */
+		add_provider_to_array(lp, hp);
+		KCF_PROV_REFRELE(lp);
+	}
+}
+
+/*
+ * This routine removes a provider from all of the logical or
+ * hardware providers it belongs to, and frees the provider's
+ * array of pointers to providers.
+ */ +static void +remove_provider(kcf_provider_desc_t *pp) +{ + kcf_provider_desc_t *p; + kcf_provider_list_t *e, *next; + + mutex_enter(&pp->pd_lock); + for (e = pp->pd_provider_list; e != NULL; e = next) { + p = e->pl_provider; + remove_provider_from_array(pp, p); + if (p->pd_prov_type == CRYPTO_HW_PROVIDER && + p->pd_provider_list == NULL) + p->pd_flags &= ~KCF_LPROV_MEMBER; + KCF_PROV_IREFRELE(p); + next = e->pl_next; + kmem_free(e, sizeof (*e)); + } + pp->pd_provider_list = NULL; + mutex_exit(&pp->pd_lock); +} + +/* + * Dispatch events as needed for a provider. is_added flag tells + * whether the provider is registering or unregistering. + */ +void +kcf_do_notify(kcf_provider_desc_t *prov_desc, boolean_t is_added) +{ + int i; + crypto_notify_event_change_t ec; + + ASSERT(prov_desc->pd_state > KCF_PROV_VERIFICATION_FAILED); + + /* + * Inform interested clients of the mechanisms becoming + * available/unavailable. We skip this for logical providers + * as they do not affect mechanisms. + */ + if (prov_desc->pd_prov_type != CRYPTO_LOGICAL_PROVIDER) { + ec.ec_provider_type = prov_desc->pd_prov_type; + ec.ec_change = is_added ? CRYPTO_MECH_ADDED : + CRYPTO_MECH_REMOVED; + for (i = 0; i < prov_desc->pd_mech_list_count; i++) { + (void) strlcpy(ec.ec_mech_name, + prov_desc->pd_mechanisms[i].cm_mech_name, + CRYPTO_MAX_MECH_NAME); + kcf_walk_ntfylist(CRYPTO_EVENT_MECHS_CHANGED, &ec); + } + + } + + /* + * Inform interested clients about the new or departing provider. + * In case of a logical provider, we need to notify the event only + * for the logical provider and not for the underlying + * providers which are known by the KCF_LPROV_MEMBER bit. + */ + if (prov_desc->pd_prov_type == CRYPTO_LOGICAL_PROVIDER || + (prov_desc->pd_flags & KCF_LPROV_MEMBER) == 0) { + kcf_walk_ntfylist(is_added ? 
CRYPTO_EVENT_PROVIDER_REGISTERED : + CRYPTO_EVENT_PROVIDER_UNREGISTERED, prov_desc); + } +} + +static void +delete_kstat(kcf_provider_desc_t *desc) +{ + /* destroy the kstat created for this provider */ + if (desc->pd_kstat != NULL) { + kcf_provider_desc_t *kspd = desc->pd_kstat->ks_private; + + /* release reference held by desc->pd_kstat->ks_private */ + ASSERT(desc == kspd); + kstat_delete(kspd->pd_kstat); + desc->pd_kstat = NULL; + KCF_PROV_REFRELE(kspd); + KCF_PROV_IREFRELE(kspd); + } +} diff --git a/module/lua/Makefile.in b/module/lua/Makefile.in new file mode 100644 index 000000000000..0a74c17e64e8 --- /dev/null +++ b/module/lua/Makefile.in @@ -0,0 +1,39 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +endif + +MODULE := zlua + +obj-$(CONFIG_ZFS) := $(MODULE).o + +ccflags-y := -DLUA_USE_LONGLONG + +$(MODULE)-objs += lapi.o +$(MODULE)-objs += lauxlib.o +$(MODULE)-objs += lbaselib.o +$(MODULE)-objs += lcode.o +$(MODULE)-objs += lcompat.o +$(MODULE)-objs += lcorolib.o +$(MODULE)-objs += lctype.o +$(MODULE)-objs += ldebug.o +$(MODULE)-objs += ldo.o +$(MODULE)-objs += lfunc.o +$(MODULE)-objs += lgc.o +$(MODULE)-objs += llex.o +$(MODULE)-objs += lmem.o +$(MODULE)-objs += lobject.o +$(MODULE)-objs += lopcodes.o +$(MODULE)-objs += lparser.o +$(MODULE)-objs += lstate.o +$(MODULE)-objs += lstring.o +$(MODULE)-objs += lstrlib.o +$(MODULE)-objs += ltable.o +$(MODULE)-objs += ltablib.o +$(MODULE)-objs += ltm.o +$(MODULE)-objs += lvm.o +$(MODULE)-objs += lzio.o +$(MODULE)-objs += setjmp/setjmp.o + +all: + mkdir -p setjmp diff --git a/module/lua/README.zfs b/module/lua/README.zfs new file mode 100644 index 000000000000..0e22de7a4a18 --- /dev/null +++ b/module/lua/README.zfs @@ -0,0 +1,80 @@ +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +Introduction +------------ + +This README describes the Lua interpreter source code that lives in the ZFS +source tree to enable execution of ZFS channel programs, including its +maintenance policy, the modifications that have been made to it, and how it +should (and should not) be used. + +For a description of the Lua language and features exposed by ZFS channel +programs, please refer to the zfs-program(1m) man page instead. + + +Maintenance policy +------------------ + +The Lua runtime is considered stable software. Channel programs don't need much +complicated logic, so updates to the Lua runtime from upstream are viewed as +nice-to-have, but not required for channel programs to be well-supported. As +such, the Lua runtime in ZFS should be updated on an as-needed basis for +security vulnerabilities, but not much else. + + +Modifications to Lua +-------------------- + +The version of the Lua runtime we're using in ZFS has been modified in a variety +of ways to make it more useful for the specific purpose of running channel +programs. These changes include: + +1. "Normal" Lua uses floating point for all numbers it stores, but those aren't + useful inside ZFS / the kernel. We have changed the runtime to use int64_t + throughout for all numbers. +2. 
Some of the Lua standard libraries do file I/O or spawn processes, but + neither of these make sense from inside channel programs. We have removed + those libraries rather than reimplementing them using kernel APIs. +3. The "normal" Lua runtime handles errors by failing fatally, but since this + version of Lua runs inside the kernel we must handle these failures and + return meaningful error codes to userland. We have customized the Lua + failure paths so that they aren't fatal. +4. Running poorly-vetted code inside the kernel is always a risk; even if the + ability to do so is restricted to the root user, it's still possible to write + an incorrect program that results in an infinite loop or massive memory use. + We've added new protections into the Lua interpreter to limit the runtime + (measured in number of Lua instructions run) and memory overhead of running + a channel program. +5. The Lua bytecode is not designed to be secure / safe, so it would be easy to + pass invalid bytecode which can panic the kernel. By comparison, the parser + is hardened and fails gracefully on invalid input. Therefore, we only accept + Lua source code at the ioctl level and then interpret it inside the kernel. + +Each of these modifications have been tested in the zfs-test suite. If / when +new modifications are made, new tests should be added to the suite located in +zfs-tests/tests/functional/channel_program/lua_core. + + +How to use this Lua interpreter +------------------------------- + +From the above, it should be clear that this is not a general-purpose Lua +interpreter. Additional work would be required to extricate this custom version +of Lua from ZFS and make it usable by other areas of the kernel. diff --git a/module/lua/lapi.c b/module/lua/lapi.c new file mode 100644 index 000000000000..8f072531fde5 --- /dev/null +++ b/module/lua/lapi.c @@ -0,0 +1,1345 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lapi.c,v 2.171.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua API +** See Copyright Notice in lua.h +*/ + + +#define lapi_c +#define LUA_CORE + +#include + +#include "lapi.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" +#include "lvm.h" + + + +const char lua_ident[] = + "$LuaVersion: " LUA_COPYRIGHT " $" + "$LuaAuthors: " LUA_AUTHORS " $"; + + +/* value at a non-valid index */ +#define NONVALIDVALUE cast(TValue *, luaO_nilobject) + +/* corresponding test */ +#define isvalid(o) ((o) != luaO_nilobject) + +/* test for pseudo index */ +#define ispseudo(i) ((i) <= LUA_REGISTRYINDEX) + +/* test for valid but not pseudo index */ +#define isstackindex(i, o) (isvalid(o) && !ispseudo(i)) + +#define api_checkvalidindex(L, o) api_check(L, isvalid(o), "invalid index") + +#define api_checkstackindex(L, i, o) \ + api_check(L, isstackindex(i, o), "index not in the stack") + + +static TValue *index2addr (lua_State *L, int idx) { + CallInfo *ci = L->ci; + if (idx > 0) { + TValue *o = ci->func + idx; + api_check(L, idx <= ci->top - (ci->func + 1), "unacceptable index"); + if (o >= L->top) return NONVALIDVALUE; + else return o; + } + else if (!ispseudo(idx)) { /* negative index */ + api_check(L, idx != 0 && -idx <= L->top - (ci->func + 1), "invalid index"); + return L->top + idx; + } + else if (idx == LUA_REGISTRYINDEX) + return &G(L)->l_registry; + else { /* upvalues */ + idx = LUA_REGISTRYINDEX - idx; + api_check(L, idx <= MAXUPVAL + 1, "upvalue index too large"); + if 
(ttislcf(ci->func)) /* light C function? */ + return NONVALIDVALUE; /* it has no upvalues */ + else { + CClosure *func = clCvalue(ci->func); + return (idx <= func->nupvalues) ? &func->upvalue[idx-1] : NONVALIDVALUE; + } + } +} + + +/* +** to be called by 'lua_checkstack' in protected mode, to grow stack +** capturing memory errors +*/ +static void growstack (lua_State *L, void *ud) { + int size = *(int *)ud; + luaD_growstack(L, size); +} + + +LUA_API int lua_checkstack (lua_State *L, int size) { + int res; + CallInfo *ci = L->ci; + lua_lock(L); + if (L->stack_last - L->top > size) /* stack large enough? */ + res = 1; /* yes; check is OK */ + else { /* no; need to grow stack */ + int inuse = cast_int(L->top - L->stack) + EXTRA_STACK; + if (inuse > LUAI_MAXSTACK - size) /* can grow without overflow? */ + res = 0; /* no */ + else /* try to grow stack */ + res = (luaD_rawrunprotected(L, &growstack, &size) == LUA_OK); + } + if (res && ci->top < L->top + size) + ci->top = L->top + size; /* adjust frame top */ + lua_unlock(L); + return res; +} + + +LUA_API void lua_xmove (lua_State *from, lua_State *to, int n) { + int i; + if (from == to) return; + lua_lock(to); + api_checknelems(from, n); + api_check(from, G(from) == G(to), "moving among independent states"); + api_check(from, to->ci->top - to->top >= n, "not enough elements to move"); + from->top -= n; + for (i = 0; i < n; i++) { + setobj2s(to, to->top++, from->top + i); + } + lua_unlock(to); +} + + +LUA_API lua_CFunction lua_atpanic (lua_State *L, lua_CFunction panicf) { + lua_CFunction old; + lua_lock(L); + old = G(L)->panic; + G(L)->panic = panicf; + lua_unlock(L); + return old; +} + + +LUA_API const lua_Number *lua_version (lua_State *L) { + static const lua_Number version = LUA_VERSION_NUM; + if (L == NULL) return &version; + else return G(L)->version; +} + + + +/* +** basic stack manipulation +*/ + + +/* +** convert an acceptable stack index into an absolute index +*/ +LUA_API int lua_absindex (lua_State *L, int idx) { + return (idx > 0 || ispseudo(idx)) + ? idx + : cast_int(L->top - L->ci->func + idx); +} + + +LUA_API int lua_gettop (lua_State *L) { + return cast_int(L->top - (L->ci->func + 1)); +} + + +LUA_API void lua_settop (lua_State *L, int idx) { + StkId func = L->ci->func; + lua_lock(L); + if (idx >= 0) { + api_check(L, idx <= L->stack_last - (func + 1), "new top too large"); + while (L->top < (func + 1) + idx) + setnilvalue(L->top++); + L->top = (func + 1) + idx; + } + else { + api_check(L, -(idx+1) <= (L->top - (func + 1)), "invalid new top"); + L->top += idx+1; /* `subtract' index (index is negative) */ + } + lua_unlock(L); +} + + +LUA_API void lua_remove (lua_State *L, int idx) { + StkId p; + lua_lock(L); + p = index2addr(L, idx); + api_checkstackindex(L, idx, p); + while (++p < L->top) setobjs2s(L, p-1, p); + L->top--; + lua_unlock(L); +} + + +LUA_API void lua_insert (lua_State *L, int idx) { + StkId p; + StkId q; + lua_lock(L); + p = index2addr(L, idx); + api_checkstackindex(L, idx, p); + for (q = L->top; q > p; q--) /* use L->top as a temporary */ + setobjs2s(L, q, q - 1); + setobjs2s(L, p, L->top); + lua_unlock(L); +} + + +static void moveto (lua_State *L, TValue *fr, int idx) { + TValue *to = index2addr(L, idx); + api_checkvalidindex(L, to); + setobj(L, to, fr); + if (idx < LUA_REGISTRYINDEX) /* function upvalue? 
*/ + luaC_barrier(L, clCvalue(L->ci->func), fr); + /* LUA_REGISTRYINDEX does not need gc barrier + (collector revisits it before finishing collection) */ +} + + +LUA_API void lua_replace (lua_State *L, int idx) { + lua_lock(L); + api_checknelems(L, 1); + moveto(L, L->top - 1, idx); + L->top--; + lua_unlock(L); +} + + +LUA_API void lua_copy (lua_State *L, int fromidx, int toidx) { + TValue *fr; + lua_lock(L); + fr = index2addr(L, fromidx); + moveto(L, fr, toidx); + lua_unlock(L); +} + + +LUA_API void lua_pushvalue (lua_State *L, int idx) { + lua_lock(L); + setobj2s(L, L->top, index2addr(L, idx)); + api_incr_top(L); + lua_unlock(L); +} + + + +/* +** access functions (stack -> C) +*/ + + +LUA_API int lua_type (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + return (isvalid(o) ? ttypenv(o) : LUA_TNONE); +} + + +LUA_API const char *lua_typename (lua_State *L, int t) { + UNUSED(L); + return ttypename(t); +} + + +LUA_API int lua_iscfunction (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + return (ttislcf(o) || (ttisCclosure(o))); +} + + +LUA_API int lua_isnumber (lua_State *L, int idx) { + TValue n; + const TValue *o = index2addr(L, idx); + return tonumber(o, &n); +} + + +LUA_API int lua_isstring (lua_State *L, int idx) { + int t = lua_type(L, idx); + return (t == LUA_TSTRING || t == LUA_TNUMBER); +} + + +LUA_API int lua_isuserdata (lua_State *L, int idx) { + const TValue *o = index2addr(L, idx); + return (ttisuserdata(o) || ttislightuserdata(o)); +} + + +LUA_API int lua_rawequal (lua_State *L, int index1, int index2) { + StkId o1 = index2addr(L, index1); + StkId o2 = index2addr(L, index2); + return (isvalid(o1) && isvalid(o2)) ? luaV_rawequalobj(o1, o2) : 0; +} + + +LUA_API void lua_arith (lua_State *L, int op) { + StkId o1; /* 1st operand */ + StkId o2; /* 2nd operand */ + lua_lock(L); + if (op != LUA_OPUNM) /* all other operations expect two operands */ + api_checknelems(L, 2); + else { /* for unary minus, add fake 2nd operand */ + api_checknelems(L, 1); + setobjs2s(L, L->top, L->top - 1); + L->top++; + } + o1 = L->top - 2; + o2 = L->top - 1; + if (ttisnumber(o1) && ttisnumber(o2)) { + setnvalue(o1, luaO_arith(op, nvalue(o1), nvalue(o2))); + } + else + luaV_arith(L, o1, o1, o2, cast(TMS, op - LUA_OPADD + TM_ADD)); + L->top--; + lua_unlock(L); +} + + +LUA_API int lua_compare (lua_State *L, int index1, int index2, int op) { + StkId o1, o2; + int i = 0; + lua_lock(L); /* may call tag method */ + o1 = index2addr(L, index1); + o2 = index2addr(L, index2); + if (isvalid(o1) && isvalid(o2)) { + switch (op) { + case LUA_OPEQ: i = equalobj(L, o1, o2); break; + case LUA_OPLT: i = luaV_lessthan(L, o1, o2); break; + case LUA_OPLE: i = luaV_lessequal(L, o1, o2); break; + default: api_check(L, 0, "invalid option"); + } + } + lua_unlock(L); + return i; +} + + +LUA_API lua_Number lua_tonumberx (lua_State *L, int idx, int *isnum) { + TValue n; + const TValue *o = index2addr(L, idx); + if (tonumber(o, &n)) { + if (isnum) *isnum = 1; + return nvalue(o); + } + else { + if (isnum) *isnum = 0; + return 0; + } +} + + +LUA_API lua_Integer lua_tointegerx (lua_State *L, int idx, int *isnum) { + TValue n; + const TValue *o = index2addr(L, idx); + if (tonumber(o, &n)) { + lua_Integer res; + lua_Number num = nvalue(o); + lua_number2integer(res, num); + if (isnum) *isnum = 1; + return res; + } + else { + if (isnum) *isnum = 0; + return 0; + } +} + + +LUA_API lua_Unsigned lua_tounsignedx (lua_State *L, int idx, int *isnum) { + TValue n; + const TValue *o = index2addr(L, idx); + if (tonumber(o, &n)) { + 
lua_Unsigned res; + lua_Number num = nvalue(o); + lua_number2unsigned(res, num); + if (isnum) *isnum = 1; + return res; + } + else { + if (isnum) *isnum = 0; + return 0; + } +} + + +LUA_API int lua_toboolean (lua_State *L, int idx) { + const TValue *o = index2addr(L, idx); + return !l_isfalse(o); +} + + +LUA_API const char *lua_tolstring (lua_State *L, int idx, size_t *len) { + StkId o = index2addr(L, idx); + if (!ttisstring(o)) { + lua_lock(L); /* `luaV_tostring' may create a new string */ + if (!luaV_tostring(L, o)) { /* conversion failed? */ + if (len != NULL) *len = 0; + lua_unlock(L); + return NULL; + } + luaC_checkGC(L); + o = index2addr(L, idx); /* previous call may reallocate the stack */ + lua_unlock(L); + } + if (len != NULL) *len = tsvalue(o)->len; + return svalue(o); +} + + +LUA_API size_t lua_rawlen (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + switch (ttypenv(o)) { + case LUA_TSTRING: return tsvalue(o)->len; + case LUA_TUSERDATA: return uvalue(o)->len; + case LUA_TTABLE: return luaH_getn(hvalue(o)); + default: return 0; + } +} + + +LUA_API lua_CFunction lua_tocfunction (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + if (ttislcf(o)) return fvalue(o); + else if (ttisCclosure(o)) + return clCvalue(o)->f; + else return NULL; /* not a C function */ +} + + +LUA_API void *lua_touserdata (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + switch (ttypenv(o)) { + case LUA_TUSERDATA: return ((void *)(rawuvalue(o) + 1)); + case LUA_TLIGHTUSERDATA: return pvalue(o); + default: return NULL; + } +} + + +LUA_API lua_State *lua_tothread (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + return (!ttisthread(o)) ? NULL : thvalue(o); +} + + +LUA_API const void *lua_topointer (lua_State *L, int idx) { + StkId o = index2addr(L, idx); + switch (ttype(o)) { + case LUA_TTABLE: return hvalue(o); + case LUA_TLCL: return clLvalue(o); + case LUA_TCCL: return clCvalue(o); + case LUA_TLCF: return cast(void *, cast(size_t, fvalue(o))); + case LUA_TTHREAD: return thvalue(o); + case LUA_TUSERDATA: + case LUA_TLIGHTUSERDATA: + return lua_touserdata(L, idx); + default: return NULL; + } +} + + + +/* +** push functions (C -> stack) +*/ + + +LUA_API void lua_pushnil (lua_State *L) { + lua_lock(L); + setnilvalue(L->top); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushnumber (lua_State *L, lua_Number n) { + lua_lock(L); + setnvalue(L->top, n); + luai_checknum(L, L->top, + luaG_runerror(L, "C API - attempt to push a signaling NaN")); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushinteger (lua_State *L, lua_Integer n) { + lua_lock(L); + setnvalue(L->top, cast_num(n)); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushunsigned (lua_State *L, lua_Unsigned u) { + lua_Number n; + lua_lock(L); + n = lua_unsigned2number(u); + setnvalue(L->top, n); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API const char *lua_pushlstring (lua_State *L, const char *s, size_t len) { + TString *ts; + lua_lock(L); + luaC_checkGC(L); + ts = luaS_newlstr(L, s, len); + setsvalue2s(L, L->top, ts); + api_incr_top(L); + lua_unlock(L); + return getstr(ts); +} + + +LUA_API const char *lua_pushstring (lua_State *L, const char *s) { + if (s == NULL) { + lua_pushnil(L); + return NULL; + } + else { + TString *ts; + lua_lock(L); + luaC_checkGC(L); + ts = luaS_new(L, s); + setsvalue2s(L, L->top, ts); + api_incr_top(L); + lua_unlock(L); + return getstr(ts); + } +} + + +LUA_API const char *lua_pushvfstring (lua_State *L, const char *fmt, + va_list argp) { + 
const char *ret; + lua_lock(L); + luaC_checkGC(L); + ret = luaO_pushvfstring(L, fmt, argp); + lua_unlock(L); + return ret; +} + + +LUA_API const char *lua_pushfstring (lua_State *L, const char *fmt, ...) { + const char *ret; + va_list argp; + lua_lock(L); + luaC_checkGC(L); + va_start(argp, fmt); + ret = luaO_pushvfstring(L, fmt, argp); + va_end(argp); + lua_unlock(L); + return ret; +} + + +LUA_API void lua_pushcclosure (lua_State *L, lua_CFunction fn, int n) { + lua_lock(L); + if (n == 0) { + setfvalue(L->top, fn); + } + else { + Closure *cl; + api_checknelems(L, n); + api_check(L, n <= MAXUPVAL, "upvalue index too large"); + luaC_checkGC(L); + cl = luaF_newCclosure(L, n); + cl->c.f = fn; + L->top -= n; + while (n--) + setobj2n(L, &cl->c.upvalue[n], L->top + n); + setclCvalue(L, L->top, cl); + } + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushboolean (lua_State *L, int b) { + lua_lock(L); + setbvalue(L->top, (b != 0)); /* ensure that true is 1 */ + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_pushlightuserdata (lua_State *L, void *p) { + lua_lock(L); + setpvalue(L->top, p); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API int lua_pushthread (lua_State *L) { + lua_lock(L); + setthvalue(L, L->top, L); + api_incr_top(L); + lua_unlock(L); + return (G(L)->mainthread == L); +} + + + +/* +** get functions (Lua -> stack) +*/ + + +LUA_API void lua_getglobal (lua_State *L, const char *var) { + Table *reg = hvalue(&G(L)->l_registry); + const TValue *gt; /* global table */ + lua_lock(L); + gt = luaH_getint(reg, LUA_RIDX_GLOBALS); + setsvalue2s(L, L->top++, luaS_new(L, var)); + luaV_gettable(L, gt, L->top - 1, L->top - 1); + lua_unlock(L); +} + + +LUA_API void lua_gettable (lua_State *L, int idx) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + luaV_gettable(L, t, L->top - 1, L->top - 1); + lua_unlock(L); +} + + +LUA_API void lua_getfield (lua_State *L, int idx, const char *k) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + setsvalue2s(L, L->top, luaS_new(L, k)); + api_incr_top(L); + luaV_gettable(L, t, L->top - 1, L->top - 1); + lua_unlock(L); +} + + +LUA_API void lua_rawget (lua_State *L, int idx) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setobj2s(L, L->top - 1, luaH_get(hvalue(t), L->top - 1)); + lua_unlock(L); +} + + +LUA_API void lua_rawgeti (lua_State *L, int idx, int n) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setobj2s(L, L->top, luaH_getint(hvalue(t), n)); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_rawgetp (lua_State *L, int idx, const void *p) { + StkId t; + TValue k; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setpvalue(&k, cast(void *, p)); + setobj2s(L, L->top, luaH_get(hvalue(t), &k)); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API void lua_createtable (lua_State *L, int narray, int nrec) { + Table *t; + lua_lock(L); + luaC_checkGC(L); + t = luaH_new(L); + sethvalue(L, L->top, t); + api_incr_top(L); + if (narray > 0 || nrec > 0) + luaH_resize(L, t, narray, nrec); + lua_unlock(L); +} + + +LUA_API int lua_getmetatable (lua_State *L, int objindex) { + const TValue *obj; + Table *mt = NULL; + int res; + lua_lock(L); + obj = index2addr(L, objindex); + switch (ttypenv(obj)) { + case LUA_TTABLE: + mt = hvalue(obj)->metatable; + break; + case LUA_TUSERDATA: + mt = uvalue(obj)->metatable; + break; + default: + mt = G(L)->mt[ttypenv(obj)]; + break; + } + if 
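+ /* only tables and full userdata carry per-object metatables; all
+ ** other types share the per-type entry in G(L)->mt, which may be NULL */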
(mt == NULL) + res = 0; + else { + sethvalue(L, L->top, mt); + api_incr_top(L); + res = 1; + } + lua_unlock(L); + return res; +} + + +LUA_API void lua_getuservalue (lua_State *L, int idx) { + StkId o; + lua_lock(L); + o = index2addr(L, idx); + api_check(L, ttisuserdata(o), "userdata expected"); + if (uvalue(o)->env) { + sethvalue(L, L->top, uvalue(o)->env); + } else + setnilvalue(L->top); + api_incr_top(L); + lua_unlock(L); +} + + +/* +** set functions (stack -> Lua) +*/ + + +LUA_API void lua_setglobal (lua_State *L, const char *var) { + Table *reg = hvalue(&G(L)->l_registry); + const TValue *gt; /* global table */ + lua_lock(L); + api_checknelems(L, 1); + gt = luaH_getint(reg, LUA_RIDX_GLOBALS); + setsvalue2s(L, L->top++, luaS_new(L, var)); + luaV_settable(L, gt, L->top - 1, L->top - 2); + L->top -= 2; /* pop value and key */ + lua_unlock(L); +} + + +LUA_API void lua_settable (lua_State *L, int idx) { + StkId t; + lua_lock(L); + api_checknelems(L, 2); + t = index2addr(L, idx); + luaV_settable(L, t, L->top - 2, L->top - 1); + L->top -= 2; /* pop index and value */ + lua_unlock(L); +} + + +LUA_API void lua_setfield (lua_State *L, int idx, const char *k) { + StkId t; + lua_lock(L); + api_checknelems(L, 1); + t = index2addr(L, idx); + setsvalue2s(L, L->top++, luaS_new(L, k)); + luaV_settable(L, t, L->top - 1, L->top - 2); + L->top -= 2; /* pop value and key */ + lua_unlock(L); +} + + +LUA_API void lua_rawset (lua_State *L, int idx) { + StkId t; + lua_lock(L); + api_checknelems(L, 2); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setobj2t(L, luaH_set(L, hvalue(t), L->top-2), L->top-1); + invalidateTMcache(hvalue(t)); + luaC_barrierback(L, gcvalue(t), L->top-1); + L->top -= 2; + lua_unlock(L); +} + + +LUA_API void lua_rawseti (lua_State *L, int idx, int n) { + StkId t; + lua_lock(L); + api_checknelems(L, 1); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + luaH_setint(L, hvalue(t), n, L->top - 1); + luaC_barrierback(L, gcvalue(t), L->top-1); + L->top--; + lua_unlock(L); +} + + +LUA_API void lua_rawsetp (lua_State *L, int idx, const void *p) { + StkId t; + TValue k; + lua_lock(L); + api_checknelems(L, 1); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + setpvalue(&k, cast(void *, p)); + setobj2t(L, luaH_set(L, hvalue(t), &k), L->top - 1); + luaC_barrierback(L, gcvalue(t), L->top - 1); + L->top--; + lua_unlock(L); +} + + +LUA_API int lua_setmetatable (lua_State *L, int objindex) { + TValue *obj; + Table *mt; + lua_lock(L); + api_checknelems(L, 1); + obj = index2addr(L, objindex); + if (ttisnil(L->top - 1)) + mt = NULL; + else { + api_check(L, ttistable(L->top - 1), "table expected"); + mt = hvalue(L->top - 1); + } + switch (ttypenv(obj)) { + case LUA_TTABLE: { + hvalue(obj)->metatable = mt; + if (mt) { + luaC_objbarrierback(L, gcvalue(obj), mt); + luaC_checkfinalizer(L, gcvalue(obj), mt); + } + break; + } + case LUA_TUSERDATA: { + uvalue(obj)->metatable = mt; + if (mt) { + luaC_objbarrier(L, rawuvalue(obj), mt); + luaC_checkfinalizer(L, gcvalue(obj), mt); + } + break; + } + default: { + G(L)->mt[ttypenv(obj)] = mt; + break; + } + } + L->top--; + lua_unlock(L); + return 1; +} + + +LUA_API void lua_setuservalue (lua_State *L, int idx) { + StkId o; + lua_lock(L); + api_checknelems(L, 1); + o = index2addr(L, idx); + api_check(L, ttisuserdata(o), "userdata expected"); + if (ttisnil(L->top - 1)) + uvalue(o)->env = NULL; + else { + api_check(L, ttistable(L->top - 1), "table expected"); + uvalue(o)->env = hvalue(L->top - 
1); + luaC_objbarrier(L, gcvalue(o), hvalue(L->top - 1)); + } + L->top--; + lua_unlock(L); +} + + +/* +** `load' and `call' functions (run Lua code) +*/ + + +#define checkresults(L,na,nr) \ + api_check(L, (nr) == LUA_MULTRET || (L->ci->top - L->top >= (nr) - (na)), \ + "results from function overflow current stack size") + + +LUA_API int lua_getctx (lua_State *L, int *ctx) { + if (L->ci->callstatus & CIST_YIELDED) { + if (ctx) *ctx = L->ci->u.c.ctx; + return L->ci->u.c.status; + } + else return LUA_OK; +} + + +LUA_API void lua_callk (lua_State *L, int nargs, int nresults, int ctx, + lua_CFunction k) { + StkId func; + lua_lock(L); + api_check(L, k == NULL || !isLua(L->ci), + "cannot use continuations inside hooks"); + api_checknelems(L, nargs+1); + api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); + checkresults(L, nargs, nresults); + func = L->top - (nargs+1); + if (k != NULL && L->nny == 0) { /* need to prepare continuation? */ + L->ci->u.c.k = k; /* save continuation */ + L->ci->u.c.ctx = ctx; /* save context */ + luaD_call(L, func, nresults, 1); /* do the call */ + } + else /* no continuation or no yieldable */ + luaD_call(L, func, nresults, 0); /* just do the call */ + adjustresults(L, nresults); + lua_unlock(L); +} + + + +/* +** Execute a protected call. +*/ +struct CallS { /* data to `f_call' */ + StkId func; + int nresults; +}; + + +static void f_call (lua_State *L, void *ud) { + struct CallS *c = cast(struct CallS *, ud); + luaD_call(L, c->func, c->nresults, 0); +} + + + +LUA_API int lua_pcallk (lua_State *L, int nargs, int nresults, int errfunc, + int ctx, lua_CFunction k) { + struct CallS c; + int status; + ptrdiff_t func; + lua_lock(L); + api_check(L, k == NULL || !isLua(L->ci), + "cannot use continuations inside hooks"); + api_checknelems(L, nargs+1); + api_check(L, L->status == LUA_OK, "cannot do calls on non-normal thread"); + checkresults(L, nargs, nresults); + if (errfunc == 0) + func = 0; + else { + StkId o = index2addr(L, errfunc); + api_checkstackindex(L, errfunc, o); + func = savestack(L, o); + } + c.func = L->top - (nargs+1); /* function to be called */ + if (k == NULL || L->nny > 0) { /* no continuation or no yieldable? */ + c.nresults = nresults; /* do a 'conventional' protected call */ + status = luaD_pcall(L, f_call, &c, savestack(L, c.func), func); + } + else { /* prepare continuation (call is already protected by 'resume') */ + CallInfo *ci = L->ci; + ci->u.c.k = k; /* save continuation */ + ci->u.c.ctx = ctx; /* save context */ + /* save information for error recovery */ + ci->extra = savestack(L, c.func); + ci->u.c.old_allowhook = L->allowhook; + ci->u.c.old_errfunc = L->errfunc; + L->errfunc = func; + /* mark that function may do error recovery */ + ci->callstatus |= CIST_YPCALL; + luaD_call(L, c.func, nresults, 1); /* do the call */ + ci->callstatus &= ~CIST_YPCALL; + L->errfunc = ci->u.c.old_errfunc; + status = LUA_OK; /* if it is here, there were no errors */ + } + adjustresults(L, nresults); + lua_unlock(L); + return status; +} + + +LUA_API int lua_load (lua_State *L, lua_Reader reader, void *data, + const char *chunkname, const char *mode) { + ZIO z; + int status; + lua_lock(L); + if (!chunkname) chunkname = "?"; + luaZ_init(L, &z, reader, data); + status = luaD_protectedparser(L, &z, chunkname, mode); + if (status == LUA_OK) { /* no errors? */ + LClosure *f = clLvalue(L->top - 1); /* get newly created function */ + if (f->nupvalues == 1) { /* does it have one upvalue? 
*/ + /* get global table from registry */ + Table *reg = hvalue(&G(L)->l_registry); + const TValue *gt = luaH_getint(reg, LUA_RIDX_GLOBALS); + /* set global table as 1st upvalue of 'f' (may be LUA_ENV) */ + setobj(L, f->upvals[0]->v, gt); + luaC_barrier(L, f->upvals[0], gt); + } + } + lua_unlock(L); + return status; +} + +#if defined(LUA_USE_DUMP) +LUA_API int lua_dump (lua_State *L, lua_Writer writer, void *data) { + int status; + TValue *o; + lua_lock(L); + api_checknelems(L, 1); + o = L->top - 1; + if (isLfunction(o)) + status = luaU_dump(L, getproto(o), writer, data, 0); + else + status = 1; + lua_unlock(L); + return status; +} +#endif + +LUA_API int lua_status (lua_State *L) { + return L->status; +} + + +/* +** Garbage-collection function +*/ + +LUA_API int lua_gc (lua_State *L, int what, int data) { + int res = 0; + global_State *g; + lua_lock(L); + g = G(L); + switch (what) { + case LUA_GCSTOP: { + g->gcrunning = 0; + break; + } + case LUA_GCRESTART: { + luaE_setdebt(g, 0); + g->gcrunning = 1; + break; + } + case LUA_GCCOLLECT: { + luaC_fullgc(L, 0); + break; + } + case LUA_GCCOUNT: { + /* GC values are expressed in Kbytes: #bytes/2^10 */ + res = cast_int(gettotalbytes(g) >> 10); + break; + } + case LUA_GCCOUNTB: { + res = cast_int(gettotalbytes(g) & 0x3ff); + break; + } + case LUA_GCSTEP: { + if (g->gckind == KGC_GEN) { /* generational mode? */ + res = (g->GCestimate == 0); /* true if it will do major collection */ + luaC_forcestep(L); /* do a single step */ + } + else { + lu_mem debt = cast(lu_mem, data) * 1024 - GCSTEPSIZE; + if (g->gcrunning) + debt += g->GCdebt; /* include current debt */ + luaE_setdebt(g, debt); + luaC_forcestep(L); + if (g->gcstate == GCSpause) /* end of cycle? */ + res = 1; /* signal it */ + } + break; + } + case LUA_GCSETPAUSE: { + res = g->gcpause; + g->gcpause = data; + break; + } + case LUA_GCSETMAJORINC: { + res = g->gcmajorinc; + g->gcmajorinc = data; + break; + } + case LUA_GCSETSTEPMUL: { + res = g->gcstepmul; + g->gcstepmul = data; + break; + } + case LUA_GCISRUNNING: { + res = g->gcrunning; + break; + } + case LUA_GCGEN: { /* change collector to generational mode */ + luaC_changemode(L, KGC_GEN); + break; + } + case LUA_GCINC: { /* change collector to incremental mode */ + luaC_changemode(L, KGC_NORMAL); + break; + } + default: res = -1; /* invalid option */ + } + lua_unlock(L); + return res; +} + + + +/* +** miscellaneous functions +*/ + + +LUA_API int lua_error (lua_State *L) { + lua_lock(L); + api_checknelems(L, 1); + luaG_errormsg(L); + /* code unreachable; will unlock when control actually leaves the kernel */ + return 0; /* to avoid warnings */ +} + + +LUA_API int lua_next (lua_State *L, int idx) { + StkId t; + int more; + lua_lock(L); + t = index2addr(L, idx); + api_check(L, ttistable(t), "table expected"); + more = luaH_next(L, hvalue(t), L->top - 1); + if (more) { + api_incr_top(L); + } + else /* no more elements */ + L->top -= 1; /* remove key */ + lua_unlock(L); + return more; +} + + +LUA_API void lua_concat (lua_State *L, int n) { + lua_lock(L); + api_checknelems(L, n); + if (n >= 2) { + luaC_checkGC(L); + luaV_concat(L, n); + } + else if (n == 0) { /* push empty string */ + setsvalue2s(L, L->top, luaS_newlstr(L, "", 0)); + api_incr_top(L); + } + /* else n == 1; nothing to do */ + lua_unlock(L); +} + + +LUA_API void lua_len (lua_State *L, int idx) { + StkId t; + lua_lock(L); + t = index2addr(L, idx); + luaV_objlen(L, L->top, t); + api_incr_top(L); + lua_unlock(L); +} + + +LUA_API lua_Alloc lua_getallocf (lua_State *L, void **ud) { + 
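+ /* accessor for the embedder-supplied allocator installed at
+ ** lua_newstate() time; a minimal userland-style allocator following
+ ** the lua_Alloc contract would look like this sketch (illustrative
+ ** only, not part of this source):
+ **   static void *alloc (void *ud, void *ptr, size_t osize, size_t nsize) {
+ **     if (nsize == 0) { free(ptr); return NULL; }
+ **     else return realloc(ptr, nsize);
+ **   }
+ */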
lua_Alloc f; + lua_lock(L); + if (ud) *ud = G(L)->ud; + f = G(L)->frealloc; + lua_unlock(L); + return f; +} + + +LUA_API void lua_setallocf (lua_State *L, lua_Alloc f, void *ud) { + lua_lock(L); + G(L)->ud = ud; + G(L)->frealloc = f; + lua_unlock(L); +} + + +LUA_API void *lua_newuserdata (lua_State *L, size_t size) { + Udata *u; + lua_lock(L); + luaC_checkGC(L); + u = luaS_newudata(L, size, NULL); + setuvalue(L, L->top, u); + api_incr_top(L); + lua_unlock(L); + return u + 1; +} + + + +static const char *aux_upvalue (StkId fi, int n, TValue **val, + GCObject **owner) { + switch (ttype(fi)) { + case LUA_TCCL: { /* C closure */ + CClosure *f = clCvalue(fi); + if (!(1 <= n && n <= f->nupvalues)) return NULL; + *val = &f->upvalue[n-1]; + if (owner) *owner = obj2gco(f); + return ""; + } + case LUA_TLCL: { /* Lua closure */ + LClosure *f = clLvalue(fi); + TString *name; + Proto *p = f->p; + if (!(1 <= n && n <= p->sizeupvalues)) return NULL; + *val = f->upvals[n-1]->v; + if (owner) *owner = obj2gco(f->upvals[n - 1]); + name = p->upvalues[n-1].name; + return (name == NULL) ? "" : getstr(name); + } + default: return NULL; /* not a closure */ + } +} + + +LUA_API const char *lua_getupvalue (lua_State *L, int funcindex, int n) { + const char *name; + TValue *val = NULL; /* to avoid warnings */ + lua_lock(L); + name = aux_upvalue(index2addr(L, funcindex), n, &val, NULL); + if (name) { + setobj2s(L, L->top, val); + api_incr_top(L); + } + lua_unlock(L); + return name; +} + + +LUA_API const char *lua_setupvalue (lua_State *L, int funcindex, int n) { + const char *name; + TValue *val = NULL; /* to avoid warnings */ + GCObject *owner = NULL; /* to avoid warnings */ + StkId fi; + lua_lock(L); + fi = index2addr(L, funcindex); + api_checknelems(L, 1); + name = aux_upvalue(fi, n, &val, &owner); + if (name) { + L->top--; + setobj(L, val, L->top); + luaC_barrier(L, owner, L->top); + } + lua_unlock(L); + return name; +} + + +static UpVal **getupvalref (lua_State *L, int fidx, int n, LClosure **pf) { + LClosure *f; + StkId fi = index2addr(L, fidx); + api_check(L, ttisLclosure(fi), "Lua function expected"); + f = clLvalue(fi); + api_check(L, (1 <= n && n <= f->p->sizeupvalues), "invalid upvalue index"); + if (pf) *pf = f; + return &f->upvals[n - 1]; /* get its upvalue pointer */ +} + + +LUA_API void *lua_upvalueid (lua_State *L, int fidx, int n) { + StkId fi = index2addr(L, fidx); + switch (ttype(fi)) { + case LUA_TLCL: { /* lua closure */ + return *getupvalref(L, fidx, n, NULL); + } + case LUA_TCCL: { /* C closure */ + CClosure *f = clCvalue(fi); + api_check(L, 1 <= n && n <= f->nupvalues, "invalid upvalue index"); + return &f->upvalue[n - 1]; + } + default: { + api_check(L, 0, "closure expected"); + return NULL; + } + } +} + + +LUA_API void lua_upvaluejoin (lua_State *L, int fidx1, int n1, + int fidx2, int n2) { + LClosure *f1; + UpVal **up1 = getupvalref(L, fidx1, n1, &f1); + UpVal **up2 = getupvalref(L, fidx2, n2, NULL); + *up1 = *up2; + luaC_objbarrier(L, f1, *up2); +} + +#if defined(_KERNEL) + +static int __init +lua_init(void) +{ + return (0); +} + +static void __exit +lua_fini(void) +{ +} + +module_init(lua_init); +module_exit(lua_fini); + +#endif +/* END CSTYLED */ + +ZFS_MODULE_DESCRIPTION("Lua Interpreter for ZFS"); +ZFS_MODULE_AUTHOR("Lua.org"); +ZFS_MODULE_LICENSE("MIT"); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); + +EXPORT_SYMBOL(lua_absindex); +EXPORT_SYMBOL(lua_atpanic); +EXPORT_SYMBOL(lua_checkstack); +EXPORT_SYMBOL(lua_close); +EXPORT_SYMBOL(lua_createtable); 
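+/* the symbols above and below form the kernel-facing subset of the
+ * Lua C API consumed by the ZFS channel program code */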
+EXPORT_SYMBOL(lua_error); +EXPORT_SYMBOL(lua_getfield); +EXPORT_SYMBOL(lua_gettable); +EXPORT_SYMBOL(lua_gettop); +EXPORT_SYMBOL(lua_isnumber); +EXPORT_SYMBOL(lua_isstring); +EXPORT_SYMBOL(lua_newstate); +EXPORT_SYMBOL(lua_newuserdata); +EXPORT_SYMBOL(lua_next); +EXPORT_SYMBOL(lua_pcallk); +EXPORT_SYMBOL(lua_pushboolean); +EXPORT_SYMBOL(lua_pushcclosure); +EXPORT_SYMBOL(lua_pushfstring); +EXPORT_SYMBOL(lua_pushinteger); +EXPORT_SYMBOL(lua_pushlightuserdata); +EXPORT_SYMBOL(lua_pushnil); +EXPORT_SYMBOL(lua_pushnumber); +EXPORT_SYMBOL(lua_pushstring); +EXPORT_SYMBOL(lua_pushvalue); +EXPORT_SYMBOL(lua_pushvfstring); +EXPORT_SYMBOL(lua_remove); +EXPORT_SYMBOL(lua_replace); +EXPORT_SYMBOL(lua_setfield); +EXPORT_SYMBOL(lua_setglobal); +EXPORT_SYMBOL(lua_sethook); +EXPORT_SYMBOL(lua_setmetatable); +EXPORT_SYMBOL(lua_settable); +EXPORT_SYMBOL(lua_settop); +EXPORT_SYMBOL(lua_toboolean); +EXPORT_SYMBOL(lua_tointegerx); +EXPORT_SYMBOL(lua_tolstring); +EXPORT_SYMBOL(lua_tonumberx); +EXPORT_SYMBOL(lua_touserdata); +EXPORT_SYMBOL(lua_type); +EXPORT_SYMBOL(lua_typename); diff --git a/module/lua/lapi.h b/module/lua/lapi.h new file mode 100644 index 000000000000..509f46f692a7 --- /dev/null +++ b/module/lua/lapi.h @@ -0,0 +1,26 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lapi.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions from Lua API +** See Copyright Notice in lua.h +*/ + +#ifndef lapi_h +#define lapi_h + + +#include "llimits.h" +#include "lstate.h" + +#define api_incr_top(L) {L->top++; api_check(L, L->top <= L->ci->top, \ + "stack overflow");} + +#define adjustresults(L,nres) \ + { if ((nres) == LUA_MULTRET && L->ci->top < L->top) L->ci->top = L->top; } + +#define api_checknelems(L,n) api_check(L, (n) < (L->top - L->ci->func), \ + "not enough elements in the stack") + + +#endif +/* END CSTYLED */ diff --git a/module/lua/lauxlib.c b/module/lua/lauxlib.c new file mode 100644 index 000000000000..1e0356e7c00e --- /dev/null +++ b/module/lua/lauxlib.c @@ -0,0 +1,800 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lauxlib.c,v 1.248.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions for building Lua libraries +** See Copyright Notice in lua.h +*/ + + +/* This file uses only the official API of Lua. +** Any function declared here could be written as an application function. +*/ + +#define lauxlib_c +#define LUA_LIB + +#include + +#include + + +/* +** {====================================================== +** Traceback +** ======================================================= +*/ + + +#define LEVELS1 12 /* size of the first part of the stack */ +#define LEVELS2 10 /* size of the second part of the stack */ + + + +/* +** search for 'objidx' in table at index -1. +** return 1 + string at top if find a good name. +*/ +static int findfield (lua_State *L, int objidx, int level) { + if (level == 0 || !lua_istable(L, -1)) + return 0; /* not found */ + lua_pushnil(L); /* start 'next' loop */ + while (lua_next(L, -2)) { /* for each pair in table */ + if (lua_type(L, -2) == LUA_TSTRING) { /* ignore non-string keys */ + if (lua_rawequal(L, objidx, -1)) { /* found object? */ + lua_pop(L, 1); /* remove value (but keep name) */ + return 1; + } + else if (findfield(L, objidx, level - 1)) { /* try recursively */ + lua_remove(L, -2); /* remove table (but keep name) */ + lua_pushliteral(L, "."); + lua_insert(L, -2); /* place '.' 
between the two names */ + lua_concat(L, 3); + return 1; + } + } + lua_pop(L, 1); /* remove value */ + } + return 0; /* not found */ +} + + +static int pushglobalfuncname (lua_State *L, lua_Debug *ar) { + int top = lua_gettop(L); + lua_getinfo(L, "f", ar); /* push function */ + lua_pushglobaltable(L); + if (findfield(L, top + 1, 2)) { + lua_copy(L, -1, top + 1); /* move name to proper place */ + lua_pop(L, 2); /* remove pushed values */ + return 1; + } + else { + lua_settop(L, top); /* remove function and global table */ + return 0; + } +} + + +static void pushfuncname (lua_State *L, lua_Debug *ar) { + if (*ar->namewhat != '\0') /* is there a name? */ + lua_pushfstring(L, "function " LUA_QS, ar->name); + else if (*ar->what == 'm') /* main? */ + lua_pushliteral(L, "main chunk"); + else if (*ar->what == 'C') { + if (pushglobalfuncname(L, ar)) { + lua_pushfstring(L, "function " LUA_QS, lua_tostring(L, -1)); + lua_remove(L, -2); /* remove name */ + } + else + lua_pushliteral(L, "?"); + } + else + lua_pushfstring(L, "function <%s:%d>", ar->short_src, ar->linedefined); +} + + +static int countlevels (lua_State *L) { + lua_Debug ar; + int li = 1, le = 1; + /* find an upper bound */ + while (lua_getstack(L, le, &ar)) { li = le; le *= 2; } + /* do a binary search */ + while (li < le) { + int m = (li + le)/2; + if (lua_getstack(L, m, &ar)) li = m + 1; + else le = m; + } + return le - 1; +} + + +LUALIB_API void luaL_traceback (lua_State *L, lua_State *L1, + const char *msg, int level) { + lua_Debug ar; + int top = lua_gettop(L); + int numlevels = countlevels(L1); + int mark = (numlevels > LEVELS1 + LEVELS2) ? LEVELS1 : 0; + if (msg) lua_pushfstring(L, "%s\n", msg); + lua_pushliteral(L, "stack traceback:"); + while (lua_getstack(L1, level++, &ar)) { + if (level == mark) { /* too many levels? */ + lua_pushliteral(L, "\n\t..."); /* add a '...' */ + level = numlevels - LEVELS2; /* and skip to last ones */ + } + else { + lua_getinfo(L1, "Slnt", &ar); + lua_pushfstring(L, "\n\t%s:", ar.short_src); + if (ar.currentline > 0) + lua_pushfstring(L, "%d:", ar.currentline); + lua_pushliteral(L, " in "); + pushfuncname(L, &ar); + if (ar.istailcall) + lua_pushliteral(L, "\n\t(...tail calls...)"); + lua_concat(L, lua_gettop(L) - top); + } + } + lua_concat(L, lua_gettop(L) - top); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Error-report functions +** ======================================================= +*/ + +LUALIB_API int luaL_argerror (lua_State *L, int narg, const char *extramsg) { + lua_Debug ar; + if (!lua_getstack(L, 0, &ar)) /* no stack frame? */ + return luaL_error(L, "bad argument #%d (%s)", narg, extramsg); + lua_getinfo(L, "n", &ar); + if (strcmp(ar.namewhat, "method") == 0) { + narg--; /* do not count `self' */ + if (narg == 0) /* error is in the self argument itself? */ + return luaL_error(L, "calling " LUA_QS " on bad self (%s)", + ar.name, extramsg); + } + if (ar.name == NULL) + ar.name = (pushglobalfuncname(L, &ar)) ? 
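+ /* no name in the debug info: try to recover a global name, else '?' */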
lua_tostring(L, -1) : "?"; + return luaL_error(L, "bad argument #%d to " LUA_QS " (%s)", + narg, ar.name, extramsg); +} + + +static int typeerror (lua_State *L, int narg, const char *tname) { + const char *msg = lua_pushfstring(L, "%s expected, got %s", + tname, luaL_typename(L, narg)); + return luaL_argerror(L, narg, msg); +} + + +static void tag_error (lua_State *L, int narg, int tag) { + typeerror(L, narg, lua_typename(L, tag)); +} + + +LUALIB_API void luaL_where (lua_State *L, int level) { + lua_Debug ar; + if (lua_getstack(L, level, &ar)) { /* check function at level */ + lua_getinfo(L, "Sl", &ar); /* get info about it */ + if (ar.currentline > 0) { /* is there info? */ + lua_pushfstring(L, "%s:%d: ", ar.short_src, ar.currentline); + return; + } + } + lua_pushliteral(L, ""); /* else, no information available... */ +} + + +LUALIB_API int luaL_error (lua_State *L, const char *fmt, ...) { + va_list argp; + va_start(argp, fmt); + luaL_where(L, 1); + lua_pushvfstring(L, fmt, argp); + va_end(argp); + lua_concat(L, 2); + return lua_error(L); +} + + +#if !defined(inspectstat) /* { */ + +#if defined(LUA_USE_POSIX) + +#include + +/* +** use appropriate macros to interpret 'pclose' return status +*/ +#define inspectstat(stat,what) \ + if (WIFEXITED(stat)) { stat = WEXITSTATUS(stat); } \ + else if (WIFSIGNALED(stat)) { stat = WTERMSIG(stat); what = "signal"; } + +#else + +#define inspectstat(stat,what) /* no op */ + +#endif + +#endif /* } */ + + +/* }====================================================== */ + + +/* +** {====================================================== +** Userdata's metatable manipulation +** ======================================================= +*/ + +LUALIB_API int luaL_newmetatable (lua_State *L, const char *tname) { + luaL_getmetatable(L, tname); /* try to get metatable */ + if (!lua_isnil(L, -1)) /* name already in use? */ + return 0; /* leave previous value on top, but return 0 */ + lua_pop(L, 1); + lua_newtable(L); /* create metatable */ + lua_pushvalue(L, -1); + lua_setfield(L, LUA_REGISTRYINDEX, tname); /* registry.name = metatable */ + return 1; +} + + +LUALIB_API void luaL_setmetatable (lua_State *L, const char *tname) { + luaL_getmetatable(L, tname); + lua_setmetatable(L, -2); +} + + +LUALIB_API void *luaL_testudata (lua_State *L, int ud, const char *tname) { + void *p = lua_touserdata(L, ud); + if (p != NULL) { /* value is a userdata? */ + if (lua_getmetatable(L, ud)) { /* does it have a metatable? */ + luaL_getmetatable(L, tname); /* get correct metatable */ + if (!lua_rawequal(L, -1, -2)) /* not the same? */ + p = NULL; /* value is a userdata with wrong metatable */ + lua_pop(L, 2); /* remove both metatables */ + return p; + } + } + return NULL; /* value is not a userdata with a metatable */ +} + + +LUALIB_API void *luaL_checkudata (lua_State *L, int ud, const char *tname) { + void *p = luaL_testudata(L, ud, tname); + if (p == NULL) typeerror(L, ud, tname); + return p; +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Argument check functions +** ======================================================= +*/ + +LUALIB_API int luaL_checkoption (lua_State *L, int narg, const char *def, + const char *const lst[]) { + const char *name = (def) ? 
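+ /* a non-NULL default makes the option argument optional */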
luaL_optstring(L, narg, def) : + luaL_checkstring(L, narg); + int i; + for (i=0; lst[i]; i++) + if (strcmp(lst[i], name) == 0) + return i; + return luaL_argerror(L, narg, + lua_pushfstring(L, "invalid option " LUA_QS, name)); +} + + +LUALIB_API void luaL_checkstack (lua_State *L, int space, const char *msg) { + /* keep some extra space to run error routines, if needed */ + const int extra = LUA_MINSTACK; + if (!lua_checkstack(L, space + extra)) { + if (msg) + luaL_error(L, "stack overflow (%s)", msg); + else + luaL_error(L, "stack overflow"); + } +} + + +LUALIB_API void luaL_checktype (lua_State *L, int narg, int t) { + if (lua_type(L, narg) != t) + tag_error(L, narg, t); +} + + +LUALIB_API void luaL_checkany (lua_State *L, int narg) { + if (lua_type(L, narg) == LUA_TNONE) + luaL_argerror(L, narg, "value expected"); +} + + +LUALIB_API const char *luaL_checklstring (lua_State *L, int narg, size_t *len) { + const char *s = lua_tolstring(L, narg, len); + if (!s) tag_error(L, narg, LUA_TSTRING); + return s; +} + + +LUALIB_API const char *luaL_optlstring (lua_State *L, int narg, + const char *def, size_t *len) { + if (lua_isnoneornil(L, narg)) { + if (len) + *len = (def ? strlen(def) : 0); + return def; + } + else return luaL_checklstring(L, narg, len); +} + + +LUALIB_API lua_Number luaL_checknumber (lua_State *L, int narg) { + int isnum; + lua_Number d = lua_tonumberx(L, narg, &isnum); + if (!isnum) + tag_error(L, narg, LUA_TNUMBER); + return d; +} + + +LUALIB_API lua_Number luaL_optnumber (lua_State *L, int narg, lua_Number def) { + return luaL_opt(L, luaL_checknumber, narg, def); +} + + +LUALIB_API lua_Integer luaL_checkinteger (lua_State *L, int narg) { + int isnum; + lua_Integer d = lua_tointegerx(L, narg, &isnum); + if (!isnum) + tag_error(L, narg, LUA_TNUMBER); + return d; +} + + +LUALIB_API lua_Unsigned luaL_checkunsigned (lua_State *L, int narg) { + int isnum; + lua_Unsigned d = lua_tounsignedx(L, narg, &isnum); + if (!isnum) + tag_error(L, narg, LUA_TNUMBER); + return d; +} + + +LUALIB_API lua_Integer luaL_optinteger (lua_State *L, int narg, + lua_Integer def) { + return luaL_opt(L, luaL_checkinteger, narg, def); +} + + +LUALIB_API lua_Unsigned luaL_optunsigned (lua_State *L, int narg, + lua_Unsigned def) { + return luaL_opt(L, luaL_checkunsigned, narg, def); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Generic Buffer manipulation +** ======================================================= +*/ + +/* +** check whether buffer is using a userdata on the stack as a temporary +** buffer +*/ +#define buffonstack(B) ((B)->b != (B)->initb) + + +/* +** returns a pointer to a free area with at least 'sz' bytes +*/ +LUALIB_API char *luaL_prepbuffsize (luaL_Buffer *B, size_t sz) { + lua_State *L = B->L; + if (B->size - B->n < sz) { /* not enough space? */ + char *newbuff; + size_t newsize = B->size * 2; /* double buffer size */ + if (newsize - B->n < sz) /* not big enough? 
*/ + newsize = B->n + sz; + if (newsize < B->n || newsize - B->n < sz) + luaL_error(L, "buffer too large"); + /* create larger buffer */ + newbuff = (char *)lua_newuserdata(L, newsize * sizeof(char)); + /* move content to new buffer */ + memcpy(newbuff, B->b, B->n * sizeof(char)); + if (buffonstack(B)) + lua_remove(L, -2); /* remove old buffer */ + B->b = newbuff; + B->size = newsize; + } + return &B->b[B->n]; +} + + +LUALIB_API void luaL_addlstring (luaL_Buffer *B, const char *s, size_t l) { + char *b = luaL_prepbuffsize(B, l); + memcpy(b, s, l * sizeof(char)); + luaL_addsize(B, l); +} + + +LUALIB_API void luaL_addstring (luaL_Buffer *B, const char *s) { + luaL_addlstring(B, s, strlen(s)); +} + + +LUALIB_API void luaL_pushresult (luaL_Buffer *B) { + lua_State *L = B->L; + lua_pushlstring(L, B->b, B->n); + if (buffonstack(B)) + lua_remove(L, -2); /* remove old buffer */ +} + + +LUALIB_API void luaL_pushresultsize (luaL_Buffer *B, size_t sz) { + luaL_addsize(B, sz); + luaL_pushresult(B); +} + + +LUALIB_API void luaL_addvalue (luaL_Buffer *B) { + lua_State *L = B->L; + size_t l; + const char *s = lua_tolstring(L, -1, &l); + if (buffonstack(B)) + lua_insert(L, -2); /* put value below buffer */ + luaL_addlstring(B, s, l); + lua_remove(L, (buffonstack(B)) ? -2 : -1); /* remove value */ +} + + +LUALIB_API void luaL_buffinit (lua_State *L, luaL_Buffer *B) { + B->L = L; + B->b = B->initb; + B->n = 0; + B->size = LUAL_BUFFERSIZE; +} + + +LUALIB_API char *luaL_buffinitsize (lua_State *L, luaL_Buffer *B, size_t sz) { + luaL_buffinit(L, B); + return luaL_prepbuffsize(B, sz); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Reference system +** ======================================================= +*/ + +/* index of free-list header */ +#define freelist 0 + + +LUALIB_API int luaL_ref (lua_State *L, int t) { + int ref; + if (lua_isnil(L, -1)) { + lua_pop(L, 1); /* remove from stack */ + return LUA_REFNIL; /* `nil' has a unique fixed reference */ + } + t = lua_absindex(L, t); + lua_rawgeti(L, t, freelist); /* get first free element */ + ref = (int)lua_tointeger(L, -1); /* ref = t[freelist] */ + lua_pop(L, 1); /* remove it from stack */ + if (ref != 0) { /* any free element? 
*/ + lua_rawgeti(L, t, ref); /* remove it from list */ + lua_rawseti(L, t, freelist); /* (t[freelist] = t[ref]) */ + } + else /* no free elements */ + ref = (int)lua_rawlen(L, t) + 1; /* get a new reference */ + lua_rawseti(L, t, ref); + return ref; +} + + +LUALIB_API void luaL_unref (lua_State *L, int t, int ref) { + if (ref >= 0) { + t = lua_absindex(L, t); + lua_rawgeti(L, t, freelist); + lua_rawseti(L, t, ref); /* t[ref] = t[freelist] */ + lua_pushinteger(L, ref); + lua_rawseti(L, t, freelist); /* t[freelist] = ref */ + } +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Load functions +** ======================================================= +*/ + +typedef struct LoadS { + const char *s; + size_t size; +} LoadS; + + +static const char *getS (lua_State *L, void *ud, size_t *size) { + LoadS *ls = (LoadS *)ud; + (void)L; /* not used */ + if (ls->size == 0) return NULL; + *size = ls->size; + ls->size = 0; + return ls->s; +} + + +LUALIB_API int luaL_loadbufferx (lua_State *L, const char *buff, size_t size, + const char *name, const char *mode) { + LoadS ls; + ls.s = buff; + ls.size = size; + return lua_load(L, getS, &ls, name, mode); +} + + +LUALIB_API int luaL_loadstring (lua_State *L, const char *s) { + return luaL_loadbuffer(L, s, strlen(s), s); +} + +/* }====================================================== */ + + + +LUALIB_API int luaL_getmetafield (lua_State *L, int obj, const char *event) { + if (!lua_getmetatable(L, obj)) /* no metatable? */ + return 0; + lua_pushstring(L, event); + lua_rawget(L, -2); + if (lua_isnil(L, -1)) { + lua_pop(L, 2); /* remove metatable and metafield */ + return 0; + } + else { + lua_remove(L, -2); /* remove only metatable */ + return 1; + } +} + + +LUALIB_API int luaL_callmeta (lua_State *L, int obj, const char *event) { + obj = lua_absindex(L, obj); + if (!luaL_getmetafield(L, obj, event)) /* no metafield? */ + return 0; + lua_pushvalue(L, obj); + lua_call(L, 1, 1); + return 1; +} + + +LUALIB_API int luaL_len (lua_State *L, int idx) { + int l; + int isnum; + lua_len(L, idx); + l = (int)lua_tointegerx(L, -1, &isnum); + if (!isnum) + luaL_error(L, "object length is not a number"); + lua_pop(L, 1); /* remove object */ + return l; +} + + +LUALIB_API const char *luaL_tolstring (lua_State *L, int idx, size_t *len) { + if (!luaL_callmeta(L, idx, "__tostring")) { /* no metafield? */ + switch (lua_type(L, idx)) { + case LUA_TNUMBER: + case LUA_TSTRING: + lua_pushvalue(L, idx); + break; + case LUA_TBOOLEAN: + lua_pushstring(L, (lua_toboolean(L, idx) ? "true" : "false")); + break; + case LUA_TNIL: + lua_pushliteral(L, "nil"); + break; + default: + lua_pushfstring(L, "%s: %p", luaL_typename(L, idx), + lua_topointer(L, idx)); + break; + } + } + return lua_tolstring(L, -1, len); +} + + +/* +** {====================================================== +** Compatibility with 5.1 module functions +** ======================================================= +*/ +#if defined(LUA_COMPAT_MODULE) + +static const char *luaL_findtable (lua_State *L, int idx, + const char *fname, int szhint) { + const char *e; + if (idx) lua_pushvalue(L, idx); + do { + e = strchr(fname, '.'); + if (e == NULL) e = fname + strlen(fname); + lua_pushlstring(L, fname, e - fname); + lua_rawget(L, -2); + if (lua_isnil(L, -1)) { /* no such field? */ + lua_pop(L, 1); /* remove this nil */ + lua_createtable(L, 0, (*e == '.' ? 
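+ /* more components follow ('.'): intermediate tables start small;
+ ** otherwise this is the final component and gets the size hint */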
1 : szhint)); /* new table for field */ + lua_pushlstring(L, fname, e - fname); + lua_pushvalue(L, -2); + lua_settable(L, -4); /* set new table into field */ + } + else if (!lua_istable(L, -1)) { /* field has a non-table value? */ + lua_pop(L, 2); /* remove table and value */ + return fname; /* return problematic part of the name */ + } + lua_remove(L, -2); /* remove previous table */ + fname = e + 1; + } while (*e == '.'); + return NULL; +} + + +/* +** Count number of elements in a luaL_Reg list. +*/ +static int libsize (const luaL_Reg *l) { + int size = 0; + for (; l && l->name; l++) size++; + return size; +} + + +/* +** Find or create a module table with a given name. The function +** first looks at the _LOADED table and, if that fails, try a +** global variable with that name. In any case, leaves on the stack +** the module table. +*/ +LUALIB_API void luaL_pushmodule (lua_State *L, const char *modname, + int sizehint) { + luaL_findtable(L, LUA_REGISTRYINDEX, "_LOADED", 1); /* get _LOADED table */ + lua_getfield(L, -1, modname); /* get _LOADED[modname] */ + if (!lua_istable(L, -1)) { /* not found? */ + lua_pop(L, 1); /* remove previous result */ + /* try global variable (and create one if it does not exist) */ + lua_pushglobaltable(L); + if (luaL_findtable(L, 0, modname, sizehint) != NULL) + luaL_error(L, "name conflict for module " LUA_QS, modname); + lua_pushvalue(L, -1); + lua_setfield(L, -3, modname); /* _LOADED[modname] = new table */ + } + lua_remove(L, -2); /* remove _LOADED table */ +} + + +LUALIB_API void luaL_openlib (lua_State *L, const char *libname, + const luaL_Reg *l, int nup) { + luaL_checkversion(L); + if (libname) { + luaL_pushmodule(L, libname, libsize(l)); /* get/create library table */ + lua_insert(L, -(nup + 1)); /* move library table to below upvalues */ + } + if (l) + luaL_setfuncs(L, l, nup); + else + lua_pop(L, nup); /* remove upvalues */ +} + +#endif +/* }====================================================== */ + +/* +** set functions from list 'l' into table at top - 'nup'; each +** function gets the 'nup' elements at the top as upvalues. +** Returns with only the table at the stack. +*/ +LUALIB_API void luaL_setfuncs (lua_State *L, const luaL_Reg *l, int nup) { + luaL_checkversion(L); + luaL_checkstack(L, nup, "too many upvalues"); + for (; l->name != NULL; l++) { /* fill the table with given functions */ + int i; + for (i = 0; i < nup; i++) /* copy upvalues to the top */ + lua_pushvalue(L, -nup); + lua_pushcclosure(L, l->func, nup); /* closure with those upvalues */ + lua_setfield(L, -(nup + 2), l->name); + } + lua_pop(L, nup); /* remove upvalues */ +} + + +/* +** ensure that stack[idx][fname] has a table and push that table +** into the stack +*/ +LUALIB_API int luaL_getsubtable (lua_State *L, int idx, const char *fname) { + lua_getfield(L, idx, fname); + if (lua_istable(L, -1)) return 1; /* table already there */ + else { + lua_pop(L, 1); /* remove previous result */ + idx = lua_absindex(L, idx); + lua_newtable(L); + lua_pushvalue(L, -1); /* copy to be left at top */ + lua_setfield(L, idx, fname); /* assign new table to field */ + return 0; /* false, because did not find table there */ + } +} + + +/* +** stripped-down 'require'. Calls 'openf' to open a module, +** registers the result in 'package.loaded' table and, if 'glb' +** is true, also registers the result in the global table. +** Leaves resulting module on the top. 
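+** Illustrative use (hypothetical): luaL_requiref(L, "_G", luaopen_base, 1)
+** followed by lua_pop(L, 1) to drop the module copy left on top.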
+*/ +LUALIB_API void luaL_requiref (lua_State *L, const char *modname, + lua_CFunction openf, int glb) { + lua_pushcfunction(L, openf); + lua_pushstring(L, modname); /* argument to open function */ + lua_call(L, 1, 1); /* open module */ + luaL_getsubtable(L, LUA_REGISTRYINDEX, "_LOADED"); + lua_pushvalue(L, -2); /* make copy of module (call result) */ + lua_setfield(L, -2, modname); /* _LOADED[modname] = module */ + lua_pop(L, 1); /* remove _LOADED table */ + if (glb) { + lua_pushvalue(L, -1); /* copy of 'mod' */ + lua_setglobal(L, modname); /* _G[modname] = module */ + } +} + + +LUALIB_API const char *luaL_gsub (lua_State *L, const char *s, const char *p, + const char *r) { + const char *wild; + size_t l = strlen(p); + luaL_Buffer b; + luaL_buffinit(L, &b); + while ((wild = strstr(s, p)) != NULL) { + luaL_addlstring(&b, s, wild - s); /* push prefix */ + luaL_addstring(&b, r); /* push replacement in place of pattern */ + s = wild + l; /* continue after `p' */ + } + luaL_addstring(&b, s); /* push last suffix */ + luaL_pushresult(&b); + return lua_tostring(L, -1); +} + + +LUALIB_API void luaL_checkversion_ (lua_State *L, lua_Number ver) { + const lua_Number *v = lua_version(L); + if (v != lua_version(NULL)) + luaL_error(L, "multiple Lua VMs detected"); + else if (*v != ver) + luaL_error(L, "version mismatch: app. needs %f, Lua core provides %f", + ver, *v); + /* check conversions number -> integer types */ + lua_pushnumber(L, -(lua_Number)0x1234); + if (lua_tointeger(L, -1) != -0x1234 || + lua_tounsigned(L, -1) != (lua_Unsigned)-0x1234) + luaL_error(L, "bad conversion number->int;" + " must recompile Lua with proper settings"); + lua_pop(L, 1); +} + +#if defined(_KERNEL) + +EXPORT_SYMBOL(luaL_argerror); +EXPORT_SYMBOL(luaL_error); +EXPORT_SYMBOL(luaL_loadbufferx); +EXPORT_SYMBOL(luaL_newmetatable); +EXPORT_SYMBOL(luaL_traceback); + +#endif +/* END CSTYLED */ diff --git a/module/lua/lbaselib.c b/module/lua/lbaselib.c new file mode 100644 index 000000000000..854649a0fb4d --- /dev/null +++ b/module/lua/lbaselib.c @@ -0,0 +1,296 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lbaselib.c,v 1.276.1.1 2013/04/12 18:48:47 roberto Exp $ +** Basic library +** See Copyright Notice in lua.h +*/ + +/* The following built-in lua functions have been removed and are not available + * for use in ZFS channel programs: + * + * dofile + * loadfile + * load + * pcall + * print + * xpcall + */ + + +#define lbaselib_c +#define LUA_LIB + +#include + +#include +#include + +#define SPACECHARS " \f\n\r\t\v" + +static int luaB_tonumber (lua_State *L) { + if (lua_isnoneornil(L, 2)) { /* standard conversion */ + int isnum; + lua_Number n = lua_tonumberx(L, 1, &isnum); + if (isnum) { + lua_pushnumber(L, n); + return 1; + } /* else not a number; must be something */ + luaL_checkany(L, 1); + } + else { + size_t l; + const char *s = luaL_checklstring(L, 1, &l); + const char *e = s + l; /* end point for 's' */ + int base = luaL_checkint(L, 2); + int neg = 0; + luaL_argcheck(L, 2 <= base && base <= 36, 2, "base out of range"); + s += strspn(s, SPACECHARS); /* skip initial spaces */ + if (*s == '-') { s++; neg = 1; } /* handle signal */ + else if (*s == '+') s++; + if (isalnum((unsigned char)*s)) { + lua_Number n = 0; + do { + int digit = (isdigit((unsigned char)*s)) ? 
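+ /* '0'-'9' map to 0-9; letters continue at 10, so e.g. 'f' is 15 */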
*s - '0' + : toupper((unsigned char)*s) - 'A' + 10; + if (digit >= base) break; /* invalid numeral; force a fail */ + n = n * (lua_Number)base + (lua_Number)digit; + s++; + } while (isalnum((unsigned char)*s)); + s += strspn(s, SPACECHARS); /* skip trailing spaces */ + if (s == e) { /* no invalid trailing characters? */ + lua_pushnumber(L, (neg) ? -n : n); + return 1; + } /* else not a number */ + } /* else not a number */ + } + lua_pushnil(L); /* not a number */ + return 1; +} + + +static int luaB_error (lua_State *L) { + int level = luaL_optint(L, 2, 1); + lua_settop(L, 1); + if (lua_isstring(L, 1) && level > 0) { /* add extra information? */ + luaL_where(L, level); + lua_pushvalue(L, 1); + lua_concat(L, 2); + } + return lua_error(L); +} + + +static int luaB_getmetatable (lua_State *L) { + luaL_checkany(L, 1); + if (!lua_getmetatable(L, 1)) { + lua_pushnil(L); + return 1; /* no metatable */ + } + luaL_getmetafield(L, 1, "__metatable"); + return 1; /* returns either __metatable field (if present) or metatable */ +} + + +static int luaB_setmetatable (lua_State *L) { + int t = lua_type(L, 2); + luaL_checktype(L, 1, LUA_TTABLE); + luaL_argcheck(L, t == LUA_TNIL || t == LUA_TTABLE, 2, + "nil or table expected"); + if (luaL_getmetafield(L, 1, "__metatable")) + return luaL_error(L, "cannot change a protected metatable"); + lua_settop(L, 2); + lua_setmetatable(L, 1); + return 1; +} + + +static int luaB_rawequal (lua_State *L) { + luaL_checkany(L, 1); + luaL_checkany(L, 2); + lua_pushboolean(L, lua_rawequal(L, 1, 2)); + return 1; +} + + +static int luaB_rawlen (lua_State *L) { + int t = lua_type(L, 1); + luaL_argcheck(L, t == LUA_TTABLE || t == LUA_TSTRING, 1, + "table or string expected"); + lua_pushinteger(L, lua_rawlen(L, 1)); + return 1; +} + + +static int luaB_rawget (lua_State *L) { + luaL_checktype(L, 1, LUA_TTABLE); + luaL_checkany(L, 2); + lua_settop(L, 2); + lua_rawget(L, 1); + return 1; +} + +static int luaB_rawset (lua_State *L) { + luaL_checktype(L, 1, LUA_TTABLE); + luaL_checkany(L, 2); + luaL_checkany(L, 3); + lua_settop(L, 3); + lua_rawset(L, 1); + return 1; +} + + +static int luaB_collectgarbage (lua_State *L) { + static const char *const opts[] = {"stop", "restart", "collect", + "count", "step", "setpause", "setstepmul", + "setmajorinc", "isrunning", "generational", "incremental", NULL}; + static const int optsnum[] = {LUA_GCSTOP, LUA_GCRESTART, LUA_GCCOLLECT, + LUA_GCCOUNT, LUA_GCSTEP, LUA_GCSETPAUSE, LUA_GCSETSTEPMUL, + LUA_GCSETMAJORINC, LUA_GCISRUNNING, LUA_GCGEN, LUA_GCINC}; + int o = optsnum[luaL_checkoption(L, 1, "collect", opts)]; + int ex = luaL_optint(L, 2, 0); + int res = lua_gc(L, o, ex); + switch (o) { + case LUA_GCCOUNT: { + int b = lua_gc(L, LUA_GCCOUNTB, 0); + lua_pushnumber(L, res + ((lua_Number)b/1024)); + lua_pushinteger(L, b); + return 2; + } + case LUA_GCSTEP: case LUA_GCISRUNNING: { + lua_pushboolean(L, res); + return 1; + } + default: { + lua_pushinteger(L, res); + return 1; + } + } +} + + +static int luaB_type (lua_State *L) { + luaL_checkany(L, 1); + lua_pushstring(L, luaL_typename(L, 1)); + return 1; +} + + +static int pairsmeta (lua_State *L, const char *method, int iszero, + lua_CFunction iter) { + if (!luaL_getmetafield(L, 1, method)) { /* no metamethod? 
*/ + luaL_checktype(L, 1, LUA_TTABLE); /* argument must be a table */ + lua_pushcfunction(L, iter); /* will return generator, */ + lua_pushvalue(L, 1); /* state, */ + if (iszero) lua_pushinteger(L, 0); /* and initial value */ + else lua_pushnil(L); + } + else { + lua_pushvalue(L, 1); /* argument 'self' to metamethod */ + lua_call(L, 1, 3); /* get 3 values from metamethod */ + } + return 3; +} + + +static int luaB_next (lua_State *L) { + luaL_checktype(L, 1, LUA_TTABLE); + lua_settop(L, 2); /* create a 2nd argument if there isn't one */ + if (lua_next(L, 1)) + return 2; + else { + lua_pushnil(L); + return 1; + } +} + + +static int luaB_pairs (lua_State *L) { + return pairsmeta(L, "__pairs", 0, luaB_next); +} + + +static int ipairsaux (lua_State *L) { + int i = luaL_checkint(L, 2); + luaL_checktype(L, 1, LUA_TTABLE); + i++; /* next value */ + lua_pushinteger(L, i); + lua_rawgeti(L, 1, i); + return (lua_isnil(L, -1)) ? 1 : 2; +} + + +static int luaB_ipairs (lua_State *L) { + return pairsmeta(L, "__ipairs", 1, ipairsaux); +} + + +static int luaB_assert (lua_State *L) { + if (!lua_toboolean(L, 1)) + return luaL_error(L, "%s", luaL_optstring(L, 2, "assertion failed!")); + return lua_gettop(L); +} + + +static int luaB_select (lua_State *L) { + int n = lua_gettop(L); + if (lua_type(L, 1) == LUA_TSTRING && *lua_tostring(L, 1) == '#') { + lua_pushinteger(L, n-1); + return 1; + } + else { + int i = luaL_checkint(L, 1); + if (i < 0) i = n + i; + else if (i > n) i = n; + luaL_argcheck(L, 1 <= i, 1, "index out of range"); + return n - i; + } +} + +static int luaB_tostring (lua_State *L) { + luaL_checkany(L, 1); + luaL_tolstring(L, 1, NULL); + return 1; +} + +static const luaL_Reg base_funcs[] = { + {"assert", luaB_assert}, + {"collectgarbage", luaB_collectgarbage}, + {"error", luaB_error}, + {"getmetatable", luaB_getmetatable}, + {"ipairs", luaB_ipairs}, +#if defined(LUA_COMPAT_LOADSTRING) + {"loadstring", luaB_load}, +#endif + {"next", luaB_next}, + {"pairs", luaB_pairs}, + {"rawequal", luaB_rawequal}, + {"rawlen", luaB_rawlen}, + {"rawget", luaB_rawget}, + {"rawset", luaB_rawset}, + {"select", luaB_select}, + {"setmetatable", luaB_setmetatable}, + {"tonumber", luaB_tonumber}, + {"tostring", luaB_tostring}, + {"type", luaB_type}, + {NULL, NULL} +}; + + +LUAMOD_API int luaopen_base (lua_State *L) { + /* set global _G */ + lua_pushglobaltable(L); + lua_pushglobaltable(L); + lua_setfield(L, -2, "_G"); + /* open lib into global table */ + luaL_setfuncs(L, base_funcs, 0); + lua_pushliteral(L, LUA_VERSION); + lua_setfield(L, -2, "_VERSION"); /* set global _VERSION */ + return 1; +} + +#if defined(_KERNEL) + +EXPORT_SYMBOL(luaopen_base); + +#endif +/* END CSTYLED */ diff --git a/module/lua/lcode.c b/module/lua/lcode.c new file mode 100644 index 000000000000..ae9a3d91d810 --- /dev/null +++ b/module/lua/lcode.c @@ -0,0 +1,884 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lcode.c,v 2.62.1.1 2013/04/12 18:48:47 roberto Exp $ +** Code generator for Lua +** See Copyright Notice in lua.h +*/ + +#define lcode_c +#define LUA_CORE + +#include + +#include "lcode.h" +#include "ldebug.h" +#include "ldo.h" +#include "lgc.h" +#include "llex.h" +#include "lmem.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" +#include "lstring.h" +#include "ltable.h" +#include "lvm.h" + + +#define hasjumps(e) ((e)->t != (e)->f) + + +static int isnumeral(expdesc *e) { + return (e->k == VKNUM && e->t == NO_JUMP && e->f == NO_JUMP); +} + + +void luaK_nil (FuncState *fs, int from, int n) { + Instruction *previous; + int l = from + n 
- 1; /* last register to set nil */ + if (fs->pc > fs->lasttarget) { /* no jumps to current position? */ + previous = &fs->f->code[fs->pc-1]; + if (GET_OPCODE(*previous) == OP_LOADNIL) { + int pfrom = GETARG_A(*previous); + int pl = pfrom + GETARG_B(*previous); + if ((pfrom <= from && from <= pl + 1) || + (from <= pfrom && pfrom <= l + 1)) { /* can connect both? */ + if (pfrom < from) from = pfrom; /* from = min(from, pfrom) */ + if (pl > l) l = pl; /* l = max(l, pl) */ + SETARG_A(*previous, from); + SETARG_B(*previous, l - from); + return; + } + } /* else go through */ + } + luaK_codeABC(fs, OP_LOADNIL, from, n - 1, 0); /* else no optimization */ +} + + +int luaK_jump (FuncState *fs) { + int jpc = fs->jpc; /* save list of jumps to here */ + int j; + fs->jpc = NO_JUMP; + j = luaK_codeAsBx(fs, OP_JMP, 0, NO_JUMP); + luaK_concat(fs, &j, jpc); /* keep them on hold */ + return j; +} + + +void luaK_ret (FuncState *fs, int first, int nret) { + luaK_codeABC(fs, OP_RETURN, first, nret+1, 0); +} + + +static int condjump (FuncState *fs, OpCode op, int A, int B, int C) { + luaK_codeABC(fs, op, A, B, C); + return luaK_jump(fs); +} + + +static void fixjump (FuncState *fs, int pc, int dest) { + Instruction *jmp = &fs->f->code[pc]; + int offset = dest-(pc+1); + lua_assert(dest != NO_JUMP); + if (abs(offset) > MAXARG_sBx) + luaX_syntaxerror(fs->ls, "control structure too long"); + SETARG_sBx(*jmp, offset); +} + + +/* +** returns current `pc' and marks it as a jump target (to avoid wrong +** optimizations with consecutive instructions not in the same basic block). +*/ +int luaK_getlabel (FuncState *fs) { + fs->lasttarget = fs->pc; + return fs->pc; +} + + +static int getjump (FuncState *fs, int pc) { + int offset = GETARG_sBx(fs->f->code[pc]); + if (offset == NO_JUMP) /* point to itself represents end of list */ + return NO_JUMP; /* end of list */ + else + return (pc+1)+offset; /* turn offset into absolute position */ +} + + +static Instruction *getjumpcontrol (FuncState *fs, int pc) { + Instruction *pi = &fs->f->code[pc]; + if (pc >= 1 && testTMode(GET_OPCODE(*(pi-1)))) + return pi-1; + else + return pi; +} + + +/* +** check whether list has any jump that do not produce a value +** (or produce an inverted value) +*/ +static int need_value (FuncState *fs, int list) { + for (; list != NO_JUMP; list = getjump(fs, list)) { + Instruction i = *getjumpcontrol(fs, list); + if (GET_OPCODE(i) != OP_TESTSET) return 1; + } + return 0; /* not found */ +} + + +static int patchtestreg (FuncState *fs, int node, int reg) { + Instruction *i = getjumpcontrol(fs, node); + if (GET_OPCODE(*i) != OP_TESTSET) + return 0; /* cannot patch other instructions */ + if (reg != NO_REG && reg != GETARG_B(*i)) + SETARG_A(*i, reg); + else /* no register to put value or register already has the value */ + *i = CREATE_ABC(OP_TEST, GETARG_B(*i), 0, GETARG_C(*i)); + + return 1; +} + + +static void removevalues (FuncState *fs, int list) { + for (; list != NO_JUMP; list = getjump(fs, list)) + patchtestreg(fs, list, NO_REG); +} + + +static void patchlistaux (FuncState *fs, int list, int vtarget, int reg, + int dtarget) { + while (list != NO_JUMP) { + int next = getjump(fs, list); + if (patchtestreg(fs, list, reg)) + fixjump(fs, list, vtarget); + else + fixjump(fs, list, dtarget); /* jump to default target */ + list = next; + } +} + + +static void dischargejpc (FuncState *fs) { + patchlistaux(fs, fs->jpc, fs->pc, NO_REG, fs->pc); + fs->jpc = NO_JUMP; +} + + +void luaK_patchlist (FuncState *fs, int list, int target) { + if (target == fs->pc) + 
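+ /* jump target is the current position: chain the list onto 'jpc' so
+ ** it is patched when the next instruction is actually emitted */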
luaK_patchtohere(fs, list); + else { + lua_assert(target < fs->pc); + patchlistaux(fs, list, target, NO_REG, target); + } +} + + +LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level) { + level++; /* argument is +1 to reserve 0 as non-op */ + while (list != NO_JUMP) { + int next = getjump(fs, list); + lua_assert(GET_OPCODE(fs->f->code[list]) == OP_JMP && + (GETARG_A(fs->f->code[list]) == 0 || + GETARG_A(fs->f->code[list]) >= level)); + SETARG_A(fs->f->code[list], level); + list = next; + } +} + + +void luaK_patchtohere (FuncState *fs, int list) { + luaK_getlabel(fs); + luaK_concat(fs, &fs->jpc, list); +} + + +void luaK_concat (FuncState *fs, int *l1, int l2) { + if (l2 == NO_JUMP) return; + else if (*l1 == NO_JUMP) + *l1 = l2; + else { + int list = *l1; + int next; + while ((next = getjump(fs, list)) != NO_JUMP) /* find last element */ + list = next; + fixjump(fs, list, l2); + } +} + + +static int luaK_code (FuncState *fs, Instruction i) { + Proto *f = fs->f; + dischargejpc(fs); /* `pc' will change */ + /* put new instruction in code array */ + luaM_growvector(fs->ls->L, f->code, fs->pc, f->sizecode, Instruction, + MAX_INT, "opcodes"); + f->code[fs->pc] = i; + /* save corresponding line information */ + luaM_growvector(fs->ls->L, f->lineinfo, fs->pc, f->sizelineinfo, int, + MAX_INT, "opcodes"); + f->lineinfo[fs->pc] = fs->ls->lastline; + return fs->pc++; +} + + +int luaK_codeABC (FuncState *fs, OpCode o, int a, int b, int c) { + lua_assert(getOpMode(o) == iABC); + lua_assert(getBMode(o) != OpArgN || b == 0); + lua_assert(getCMode(o) != OpArgN || c == 0); + lua_assert(a <= MAXARG_A && b <= MAXARG_B && c <= MAXARG_C); + return luaK_code(fs, CREATE_ABC(o, a, b, c)); +} + + +int luaK_codeABx (FuncState *fs, OpCode o, int a, unsigned int bc) { + lua_assert(getOpMode(o) == iABx || getOpMode(o) == iAsBx); + lua_assert(getCMode(o) == OpArgN); + lua_assert(a <= MAXARG_A && bc <= MAXARG_Bx); + return luaK_code(fs, CREATE_ABx(o, a, bc)); +} + + +static int codeextraarg (FuncState *fs, int a) { + lua_assert(a <= MAXARG_Ax); + return luaK_code(fs, CREATE_Ax(OP_EXTRAARG, a)); +} + + +int luaK_codek (FuncState *fs, int reg, int k) { + if (k <= MAXARG_Bx) + return luaK_codeABx(fs, OP_LOADK, reg, k); + else { + int p = luaK_codeABx(fs, OP_LOADKX, reg, 0); + codeextraarg(fs, k); + return p; + } +} + + +void luaK_checkstack (FuncState *fs, int n) { + int newstack = fs->freereg + n; + if (newstack > fs->f->maxstacksize) { + if (newstack >= MAXSTACK) + luaX_syntaxerror(fs->ls, "function or expression too complex"); + fs->f->maxstacksize = cast_byte(newstack); + } +} + + +void luaK_reserveregs (FuncState *fs, int n) { + luaK_checkstack(fs, n); + fs->freereg += n; +} + + +static void freereg (FuncState *fs, int reg) { + if (!ISK(reg) && reg >= fs->nactvar) { + fs->freereg--; + lua_assert(reg == fs->freereg); + } +} + + +static void freeexp (FuncState *fs, expdesc *e) { + if (e->k == VNONRELOC) + freereg(fs, e->u.info); +} + + +static int addk (FuncState *fs, TValue *key, TValue *v) { + lua_State *L = fs->ls->L; + TValue *idx = luaH_set(L, fs->h, key); + Proto *f = fs->f; + int k, oldsize; + if (ttisnumber(idx)) { + lua_Number n = nvalue(idx); + lua_number2int(k, n); + if (luaV_rawequalobj(&f->k[k], v)) + return k; + /* else may be a collision (e.g., between 0.0 and "\0\0\0\0\0\0\0\0"); + go through and create a new entry for this value */ + } + /* constant not found; create a new entry */ + oldsize = f->sizek; + k = fs->nk; + /* numerical value does not need GC barrier; + table has no metatable, so 
it does not need to invalidate cache */ + setnvalue(idx, cast_num(k)); + luaM_growvector(L, f->k, k, f->sizek, TValue, MAXARG_Ax, "constants"); + while (oldsize < f->sizek) setnilvalue(&f->k[oldsize++]); + setobj(L, &f->k[k], v); + fs->nk++; + luaC_barrier(L, f, v); + return k; +} + + +int luaK_stringK (FuncState *fs, TString *s) { + TValue o; + setsvalue(fs->ls->L, &o, s); + return addk(fs, &o, &o); +} + + +int luaK_numberK (FuncState *fs, lua_Number r) { + int n; + lua_State *L = fs->ls->L; + TValue o; + setnvalue(&o, r); + if (r == 0 || luai_numisnan(NULL, r)) { /* handle -0 and NaN */ + /* use raw representation as key to avoid numeric problems */ + setsvalue(L, L->top++, luaS_newlstr(L, (char *)&r, sizeof(r))); + n = addk(fs, L->top - 1, &o); + L->top--; + } + else + n = addk(fs, &o, &o); /* regular case */ + return n; +} + + +static int boolK (FuncState *fs, int b) { + TValue o; + setbvalue(&o, b); + return addk(fs, &o, &o); +} + + +static int nilK (FuncState *fs) { + TValue k, v; + setnilvalue(&v); + /* cannot use nil as key; instead use table itself to represent nil */ + sethvalue(fs->ls->L, &k, fs->h); + return addk(fs, &k, &v); +} + + +void luaK_setreturns (FuncState *fs, expdesc *e, int nresults) { + if (e->k == VCALL) { /* expression is an open function call? */ + SETARG_C(getcode(fs, e), nresults+1); + } + else if (e->k == VVARARG) { + SETARG_B(getcode(fs, e), nresults+1); + SETARG_A(getcode(fs, e), fs->freereg); + luaK_reserveregs(fs, 1); + } +} + + +void luaK_setoneret (FuncState *fs, expdesc *e) { + if (e->k == VCALL) { /* expression is an open function call? */ + e->k = VNONRELOC; + e->u.info = GETARG_A(getcode(fs, e)); + } + else if (e->k == VVARARG) { + SETARG_B(getcode(fs, e), 2); + e->k = VRELOCABLE; /* can relocate its simple result */ + } +} + + +void luaK_dischargevars (FuncState *fs, expdesc *e) { + switch (e->k) { + case VLOCAL: { + e->k = VNONRELOC; + break; + } + case VUPVAL: { + e->u.info = luaK_codeABC(fs, OP_GETUPVAL, 0, e->u.info, 0); + e->k = VRELOCABLE; + break; + } + case VINDEXED: { + OpCode op = OP_GETTABUP; /* assume 't' is in an upvalue */ + freereg(fs, e->u.ind.idx); + if (e->u.ind.vt == VLOCAL) { /* 't' is in a register? */ + freereg(fs, e->u.ind.t); + op = OP_GETTABLE; + } + e->u.info = luaK_codeABC(fs, op, 0, e->u.ind.t, e->u.ind.idx); + e->k = VRELOCABLE; + break; + } + case VVARARG: + case VCALL: { + luaK_setoneret(fs, e); + break; + } + default: break; /* there is one value available (somewhere) */ + } +} + + +static int code_label (FuncState *fs, int A, int b, int jump) { + luaK_getlabel(fs); /* those instructions may be jump targets */ + return luaK_codeABC(fs, OP_LOADBOOL, A, b, jump); +} + + +static void discharge2reg (FuncState *fs, expdesc *e, int reg) { + luaK_dischargevars(fs, e); + switch (e->k) { + case VNIL: { + luaK_nil(fs, reg, 1); + break; + } + case VFALSE: case VTRUE: { + luaK_codeABC(fs, OP_LOADBOOL, reg, e->k == VTRUE, 0); + break; + } + case VK: { + luaK_codek(fs, reg, e->u.info); + break; + } + case VKNUM: { + luaK_codek(fs, reg, luaK_numberK(fs, e->u.nval)); + break; + } + case VRELOCABLE: { + Instruction *pc = &getcode(fs, e); + SETARG_A(*pc, reg); + break; + } + case VNONRELOC: { + if (reg != e->u.info) + luaK_codeABC(fs, OP_MOVE, reg, e->u.info, 0); + break; + } + default: { + lua_assert(e->k == VVOID || e->k == VJMP); + return; /* nothing to do... 
*/ + } + } + e->u.info = reg; + e->k = VNONRELOC; +} + + +static void discharge2anyreg (FuncState *fs, expdesc *e) { + if (e->k != VNONRELOC) { + luaK_reserveregs(fs, 1); + discharge2reg(fs, e, fs->freereg-1); + } +} + + +static void exp2reg (FuncState *fs, expdesc *e, int reg) { + discharge2reg(fs, e, reg); + if (e->k == VJMP) + luaK_concat(fs, &e->t, e->u.info); /* put this jump in `t' list */ + if (hasjumps(e)) { + int final; /* position after whole expression */ + int p_f = NO_JUMP; /* position of an eventual LOAD false */ + int p_t = NO_JUMP; /* position of an eventual LOAD true */ + if (need_value(fs, e->t) || need_value(fs, e->f)) { + int fj = (e->k == VJMP) ? NO_JUMP : luaK_jump(fs); + p_f = code_label(fs, reg, 0, 1); + p_t = code_label(fs, reg, 1, 0); + luaK_patchtohere(fs, fj); + } + final = luaK_getlabel(fs); + patchlistaux(fs, e->f, final, reg, p_f); + patchlistaux(fs, e->t, final, reg, p_t); + } + e->f = e->t = NO_JUMP; + e->u.info = reg; + e->k = VNONRELOC; +} + + +void luaK_exp2nextreg (FuncState *fs, expdesc *e) { + luaK_dischargevars(fs, e); + freeexp(fs, e); + luaK_reserveregs(fs, 1); + exp2reg(fs, e, fs->freereg - 1); +} + + +int luaK_exp2anyreg (FuncState *fs, expdesc *e) { + luaK_dischargevars(fs, e); + if (e->k == VNONRELOC) { + if (!hasjumps(e)) return e->u.info; /* exp is already in a register */ + if (e->u.info >= fs->nactvar) { /* reg. is not a local? */ + exp2reg(fs, e, e->u.info); /* put value on it */ + return e->u.info; + } + } + luaK_exp2nextreg(fs, e); /* default */ + return e->u.info; +} + + +void luaK_exp2anyregup (FuncState *fs, expdesc *e) { + if (e->k != VUPVAL || hasjumps(e)) + luaK_exp2anyreg(fs, e); +} + + +void luaK_exp2val (FuncState *fs, expdesc *e) { + if (hasjumps(e)) + luaK_exp2anyreg(fs, e); + else + luaK_dischargevars(fs, e); +} + + +int luaK_exp2RK (FuncState *fs, expdesc *e) { + luaK_exp2val(fs, e); + switch (e->k) { + case VTRUE: + case VFALSE: + case VNIL: { + if (fs->nk <= MAXINDEXRK) { /* constant fits in RK operand? */ + e->u.info = (e->k == VNIL) ? nilK(fs) : boolK(fs, (e->k == VTRUE)); + e->k = VK; + return RKASK(e->u.info); + } + else break; + } + case VKNUM: { + e->u.info = luaK_numberK(fs, e->u.nval); + e->k = VK; + /* go through */ + } + case VK: { + if (e->u.info <= MAXINDEXRK) /* constant fits in argC? */ + return RKASK(e->u.info); + else break; + } + default: break; + } + /* not a constant in the right range: put it in a register */ + return luaK_exp2anyreg(fs, e); +} + + +void luaK_storevar (FuncState *fs, expdesc *var, expdesc *ex) { + switch (var->k) { + case VLOCAL: { + freeexp(fs, ex); + exp2reg(fs, ex, var->u.info); + return; + } + case VUPVAL: { + int e = luaK_exp2anyreg(fs, ex); + luaK_codeABC(fs, OP_SETUPVAL, e, var->u.info, 0); + break; + } + case VINDEXED: { + OpCode op = (var->u.ind.vt == VLOCAL) ? 
OP_SETTABLE : OP_SETTABUP; + int e = luaK_exp2RK(fs, ex); + luaK_codeABC(fs, op, var->u.ind.t, var->u.ind.idx, e); + break; + } + default: { + lua_assert(0); /* invalid var kind to store */ + break; + } + } + freeexp(fs, ex); +} + + +void luaK_self (FuncState *fs, expdesc *e, expdesc *key) { + int ereg; + luaK_exp2anyreg(fs, e); + ereg = e->u.info; /* register where 'e' was placed */ + freeexp(fs, e); + e->u.info = fs->freereg; /* base register for op_self */ + e->k = VNONRELOC; + luaK_reserveregs(fs, 2); /* function and 'self' produced by op_self */ + luaK_codeABC(fs, OP_SELF, e->u.info, ereg, luaK_exp2RK(fs, key)); + freeexp(fs, key); +} + + +static void invertjump (FuncState *fs, expdesc *e) { + Instruction *pc = getjumpcontrol(fs, e->u.info); + lua_assert(testTMode(GET_OPCODE(*pc)) && GET_OPCODE(*pc) != OP_TESTSET && + GET_OPCODE(*pc) != OP_TEST); + SETARG_A(*pc, !(GETARG_A(*pc))); +} + + +static int jumponcond (FuncState *fs, expdesc *e, int cond) { + if (e->k == VRELOCABLE) { + Instruction ie = getcode(fs, e); + if (GET_OPCODE(ie) == OP_NOT) { + fs->pc--; /* remove previous OP_NOT */ + return condjump(fs, OP_TEST, GETARG_B(ie), 0, !cond); + } + /* else go through */ + } + discharge2anyreg(fs, e); + freeexp(fs, e); + return condjump(fs, OP_TESTSET, NO_REG, e->u.info, cond); +} + + +void luaK_goiftrue (FuncState *fs, expdesc *e) { + int pc; /* pc of last jump */ + luaK_dischargevars(fs, e); + switch (e->k) { + case VJMP: { + invertjump(fs, e); + pc = e->u.info; + break; + } + case VK: case VKNUM: case VTRUE: { + pc = NO_JUMP; /* always true; do nothing */ + break; + } + default: { + pc = jumponcond(fs, e, 0); + break; + } + } + luaK_concat(fs, &e->f, pc); /* insert last jump in `f' list */ + luaK_patchtohere(fs, e->t); + e->t = NO_JUMP; +} + + +void luaK_goiffalse (FuncState *fs, expdesc *e) { + int pc; /* pc of last jump */ + luaK_dischargevars(fs, e); + switch (e->k) { + case VJMP: { + pc = e->u.info; + break; + } + case VNIL: case VFALSE: { + pc = NO_JUMP; /* always false; do nothing */ + break; + } + default: { + pc = jumponcond(fs, e, 1); + break; + } + } + luaK_concat(fs, &e->t, pc); /* insert last jump in `t' list */ + luaK_patchtohere(fs, e->f); + e->f = NO_JUMP; +} + + +static void codenot (FuncState *fs, expdesc *e) { + luaK_dischargevars(fs, e); + switch (e->k) { + case VNIL: case VFALSE: { + e->k = VTRUE; + break; + } + case VK: case VKNUM: case VTRUE: { + e->k = VFALSE; + break; + } + case VJMP: { + invertjump(fs, e); + break; + } + case VRELOCABLE: + case VNONRELOC: { + discharge2anyreg(fs, e); + freeexp(fs, e); + e->u.info = luaK_codeABC(fs, OP_NOT, 0, e->u.info, 0); + e->k = VRELOCABLE; + break; + } + default: { + lua_assert(0); /* cannot happen */ + break; + } + } + /* interchange true and false lists */ + { int temp = e->f; e->f = e->t; e->t = temp; } + removevalues(fs, e->f); + removevalues(fs, e->t); +} + + +void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k) { + lua_assert(!hasjumps(t)); + t->u.ind.t = t->u.info; + t->u.ind.idx = luaK_exp2RK(fs, k); + t->u.ind.vt = (t->k == VUPVAL) ? 
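/*
 * Editor's note (illustration, not part of the upstream import): the
 * RK operands that luaK_exp2RK() and luaK_indexed() traffic in pack
 * "register or constant" into a single instruction field by flagging
 * constants with a high bit. A standalone sketch of that encoding,
 * assuming Lua 5.2's lopcodes.h values (SIZE_B == 9, so the flag bit
 * is 1 << 8); compiles on its own:
 *
 *   #include <assert.h>
 *   #include <stdio.h>
 *
 *   #define BITRK (1 << 8)             // set: operand indexes the K table
 *   #define ISK(x) ((x) & BITRK)       // is this operand a constant?
 *   #define INDEXK(r) ((int)(r) & ~BITRK)
 *   #define RKASK(x) ((x) | BITRK)     // flag a constant index
 *
 *   int main(void) {
 *     int reg = 7;                     // plain register operand
 *     int k = RKASK(3);                // constant #3, flagged with BITRK
 *     assert(!ISK(reg) && ISK(k));
 *     printf("%d\n", INDEXK(k));       // prints 3
 *     return 0;
 *   }
 */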
VUPVAL + : check_exp(vkisinreg(t->k), VLOCAL); + t->k = VINDEXED; +} + + +static int constfolding (OpCode op, expdesc *e1, expdesc *e2) { + lua_Number r; + if (!isnumeral(e1) || !isnumeral(e2)) return 0; + if ((op == OP_DIV || op == OP_MOD) && e2->u.nval == 0) + return 0; /* do not attempt to divide by 0 */ + /* + * Patched: check for MIN_INT / -1 + */ + if (op == OP_DIV && e1->u.nval == INT64_MIN && e2->u.nval == -1) + return 0; + r = luaO_arith(op - OP_ADD + LUA_OPADD, e1->u.nval, e2->u.nval); + e1->u.nval = r; + return 1; +} + + +static void codearith (FuncState *fs, OpCode op, + expdesc *e1, expdesc *e2, int line) { + if (constfolding(op, e1, e2)) + return; + else { + int o2 = (op != OP_UNM && op != OP_LEN) ? luaK_exp2RK(fs, e2) : 0; + int o1 = luaK_exp2RK(fs, e1); + if (o1 > o2) { + freeexp(fs, e1); + freeexp(fs, e2); + } + else { + freeexp(fs, e2); + freeexp(fs, e1); + } + e1->u.info = luaK_codeABC(fs, op, 0, o1, o2); + e1->k = VRELOCABLE; + luaK_fixline(fs, line); + } +} + + +static void codecomp (FuncState *fs, OpCode op, int cond, expdesc *e1, + expdesc *e2) { + int o1 = luaK_exp2RK(fs, e1); + int o2 = luaK_exp2RK(fs, e2); + freeexp(fs, e2); + freeexp(fs, e1); + if (cond == 0 && op != OP_EQ) { + int temp; /* exchange args to replace by `<' or `<=' */ + temp = o1; o1 = o2; o2 = temp; /* o1 <==> o2 */ + cond = 1; + } + e1->u.info = condjump(fs, op, cond, o1, o2); + e1->k = VJMP; +} + + +void luaK_prefix (FuncState *fs, UnOpr op, expdesc *e, int line) { + expdesc e2; + e2.t = e2.f = NO_JUMP; e2.k = VKNUM; e2.u.nval = 0; + switch (op) { + case OPR_MINUS: { + if (isnumeral(e)) /* minus constant? */ + e->u.nval = luai_numunm(NULL, e->u.nval); /* fold it */ + else { + luaK_exp2anyreg(fs, e); + codearith(fs, OP_UNM, e, &e2, line); + } + break; + } + case OPR_NOT: codenot(fs, e); break; + case OPR_LEN: { + luaK_exp2anyreg(fs, e); /* cannot operate on constants */ + codearith(fs, OP_LEN, e, &e2, line); + break; + } + default: lua_assert(0); + } +} + + +void luaK_infix (FuncState *fs, BinOpr op, expdesc *v) { + switch (op) { + case OPR_AND: { + luaK_goiftrue(fs, v); + break; + } + case OPR_OR: { + luaK_goiffalse(fs, v); + break; + } + case OPR_CONCAT: { + luaK_exp2nextreg(fs, v); /* operand must be on the `stack' */ + break; + } + case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: + case OPR_MOD: case OPR_POW: { + if (!isnumeral(v)) luaK_exp2RK(fs, v); + break; + } + default: { + luaK_exp2RK(fs, v); + break; + } + } +} + + +void luaK_posfix (FuncState *fs, BinOpr op, + expdesc *e1, expdesc *e2, int line) { + switch (op) { + case OPR_AND: { + lua_assert(e1->t == NO_JUMP); /* list must be closed */ + luaK_dischargevars(fs, e2); + luaK_concat(fs, &e2->f, e1->f); + *e1 = *e2; + break; + } + case OPR_OR: { + lua_assert(e1->f == NO_JUMP); /* list must be closed */ + luaK_dischargevars(fs, e2); + luaK_concat(fs, &e2->t, e1->t); + *e1 = *e2; + break; + } + case OPR_CONCAT: { + luaK_exp2val(fs, e2); + if (e2->k == VRELOCABLE && GET_OPCODE(getcode(fs, e2)) == OP_CONCAT) { + lua_assert(e1->u.info == GETARG_B(getcode(fs, e2))-1); + freeexp(fs, e1); + SETARG_B(getcode(fs, e2), e1->u.info); + e1->k = VRELOCABLE; e1->u.info = e2->u.info; + } + else { + luaK_exp2nextreg(fs, e2); /* operand must be on the 'stack' */ + codearith(fs, OP_CONCAT, e1, e2, line); + } + break; + } + case OPR_ADD: case OPR_SUB: case OPR_MUL: case OPR_DIV: + case OPR_MOD: case OPR_POW: { + codearith(fs, cast(OpCode, op - OPR_ADD + OP_ADD), e1, e2, line); + break; + } + case OPR_EQ: case OPR_LT: case OPR_LE: { + codecomp(fs, 
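/*
 * Editor's note (illustration, not part of the upstream import): the
 * "Patched" INT64_MIN / -1 check in constfolding() above matters
 * because this Lua build uses a 64-bit integer lua_Number: that one
 * quotient (2^63) is unrepresentable, and on x86 the division traps
 * rather than wrapping. A standalone sketch of the same guard
 * (safe_div is my name, not an OpenZFS function):
 *
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   static int safe_div(int64_t a, int64_t b, int64_t *q) {
 *     if (b == 0)
 *       return 0;                      // never fold a division by zero
 *     if (a == INT64_MIN && b == -1)
 *       return 0;                      // overflow: leave it for runtime
 *     *q = a / b;
 *     return 1;
 *   }
 *
 *   int main(void) {
 *     int64_t q;
 *     if (!safe_div(INT64_MIN, -1, &q))
 *       printf("not folded\n");        // this branch runs
 *     return 0;
 *   }
 */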
cast(OpCode, op - OPR_EQ + OP_EQ), 1, e1, e2); + break; + } + case OPR_NE: case OPR_GT: case OPR_GE: { + codecomp(fs, cast(OpCode, op - OPR_NE + OP_EQ), 0, e1, e2); + break; + } + default: lua_assert(0); + } +} + + +void luaK_fixline (FuncState *fs, int line) { + fs->f->lineinfo[fs->pc - 1] = line; +} + + +void luaK_setlist (FuncState *fs, int base, int nelems, int tostore) { + int c = (nelems - 1)/LFIELDS_PER_FLUSH + 1; + int b = (tostore == LUA_MULTRET) ? 0 : tostore; + lua_assert(tostore != 0); + if (c <= MAXARG_C) + luaK_codeABC(fs, OP_SETLIST, base, b, c); + else if (c <= MAXARG_Ax) { + luaK_codeABC(fs, OP_SETLIST, base, b, 0); + codeextraarg(fs, c); + } + else + luaX_syntaxerror(fs->ls, "constructor too long"); + fs->freereg = base + 1; /* free registers with list values */ +} +/* END CSTYLED */ diff --git a/module/lua/lcode.h b/module/lua/lcode.h new file mode 100644 index 000000000000..fd5fad00df3d --- /dev/null +++ b/module/lua/lcode.h @@ -0,0 +1,85 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lcode.h,v 1.58.1.1 2013/04/12 18:48:47 roberto Exp $ +** Code generator for Lua +** See Copyright Notice in lua.h +*/ + +#ifndef lcode_h +#define lcode_h + +#include "llex.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" + + +/* +** Marks the end of a patch list. It is an invalid value both as an absolute +** address, and as a list link (would link an element to itself). +*/ +#define NO_JUMP (-1) + + +/* +** grep "ORDER OPR" if you change these enums (ORDER OP) +*/ +typedef enum BinOpr { + OPR_ADD, OPR_SUB, OPR_MUL, OPR_DIV, OPR_MOD, OPR_POW, + OPR_CONCAT, + OPR_EQ, OPR_LT, OPR_LE, + OPR_NE, OPR_GT, OPR_GE, + OPR_AND, OPR_OR, + OPR_NOBINOPR +} BinOpr; + + +typedef enum UnOpr { OPR_MINUS, OPR_NOT, OPR_LEN, OPR_NOUNOPR } UnOpr; + + +#define getcode(fs,e) ((fs)->f->code[(e)->u.info]) + +#define luaK_codeAsBx(fs,o,A,sBx) luaK_codeABx(fs,o,A,(sBx)+MAXARG_sBx) + +#define luaK_setmultret(fs,e) luaK_setreturns(fs, e, LUA_MULTRET) + +#define luaK_jumpto(fs,t) luaK_patchlist(fs, luaK_jump(fs), t) + +LUAI_FUNC int luaK_codeABx (FuncState *fs, OpCode o, int A, unsigned int Bx); +LUAI_FUNC int luaK_codeABC (FuncState *fs, OpCode o, int A, int B, int C); +LUAI_FUNC int luaK_codek (FuncState *fs, int reg, int k); +LUAI_FUNC void luaK_fixline (FuncState *fs, int line); +LUAI_FUNC void luaK_nil (FuncState *fs, int from, int n); +LUAI_FUNC void luaK_reserveregs (FuncState *fs, int n); +LUAI_FUNC void luaK_checkstack (FuncState *fs, int n); +LUAI_FUNC int luaK_stringK (FuncState *fs, TString *s); +LUAI_FUNC int luaK_numberK (FuncState *fs, lua_Number r); +LUAI_FUNC void luaK_dischargevars (FuncState *fs, expdesc *e); +LUAI_FUNC int luaK_exp2anyreg (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_exp2anyregup (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_exp2nextreg (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_exp2val (FuncState *fs, expdesc *e); +LUAI_FUNC int luaK_exp2RK (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_self (FuncState *fs, expdesc *e, expdesc *key); +LUAI_FUNC void luaK_indexed (FuncState *fs, expdesc *t, expdesc *k); +LUAI_FUNC void luaK_goiftrue (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_goiffalse (FuncState *fs, expdesc *e); +LUAI_FUNC void luaK_storevar (FuncState *fs, expdesc *var, expdesc *e); +LUAI_FUNC void luaK_setreturns (FuncState *fs, expdesc *e, int nresults); +LUAI_FUNC void luaK_setoneret (FuncState *fs, expdesc *e); +LUAI_FUNC int luaK_jump (FuncState *fs); +LUAI_FUNC void luaK_ret (FuncState *fs, int first, int nret); +LUAI_FUNC void 
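/*
 * Editor's note (illustration, not part of the upstream import): the
 * patch-list functions declared below (luaK_patchlist, luaK_concat,
 * luaK_patchtohere) keep pending jumps as a linked list threaded
 * through the instruction array itself, terminated by NO_JUMP (-1).
 * A toy model of that idea, compilable standalone; the real code
 * stores the link as a signed offset inside the JMP instruction via
 * getjump()/fixjump(), and luaK_concat appends at the tail rather
 * than the head:
 *
 *   #include <stdio.h>
 *
 *   #define NO_JUMP (-1)
 *
 *   static int code[16];               // stand-in for f->code
 *
 *   static void add_jump(int *list, int pc) {
 *     code[pc] = *list;                // link field lives in the instruction
 *     *list = pc;
 *   }
 *
 *   static void patch_all(int list, int target) {
 *     while (list != NO_JUMP) {        // walk the chain, resolving each jump
 *       int next = code[list];
 *       code[list] = target;
 *       list = next;
 *     }
 *   }
 *
 *   int main(void) {
 *     int list = NO_JUMP;
 *     add_jump(&list, 2);
 *     add_jump(&list, 5);
 *     add_jump(&list, 9);
 *     patch_all(list, 12);
 *     printf("%d %d %d\n", code[2], code[5], code[9]);   // 12 12 12
 *     return 0;
 *   }
 */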
luaK_patchlist (FuncState *fs, int list, int target);
+LUAI_FUNC void luaK_patchtohere (FuncState *fs, int list);
+LUAI_FUNC void luaK_patchclose (FuncState *fs, int list, int level);
+LUAI_FUNC void luaK_concat (FuncState *fs, int *l1, int l2);
+LUAI_FUNC int luaK_getlabel (FuncState *fs);
+LUAI_FUNC void luaK_prefix (FuncState *fs, UnOpr op, expdesc *v, int line);
+LUAI_FUNC void luaK_infix (FuncState *fs, BinOpr op, expdesc *v);
+LUAI_FUNC void luaK_posfix (FuncState *fs, BinOpr op, expdesc *v1,
+                            expdesc *v2, int line);
+LUAI_FUNC void luaK_setlist (FuncState *fs, int base, int nelems, int tostore);
+
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/lcompat.c b/module/lua/lcompat.c
new file mode 100644
index 000000000000..c0a27182c7d8
--- /dev/null
+++ b/module/lua/lcompat.c
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+
+
+ssize_t
+lcompat_sprintf(char *buf, size_t size, const char *fmt, ...)
+{
+	ssize_t res;
+	va_list args;
+
+	va_start(args, fmt);
+	res = vsnprintf(buf, size, fmt, args);
+	va_end(args);
+
+	return (res);
+}
+
+int64_t
+lcompat_strtoll(const char *str, char **ptr)
+{
+	int base;
+	const char *cp;
+	int digits;
+	int64_t value;
+	boolean_t is_negative;
+
+	cp = str;
+	while (*cp == ' ' || *cp == '\t' || *cp == '\n') {
+		cp++;
+	}
+	is_negative = (*cp == '-');
+	if (is_negative) {
+		cp++;
+	}
+	base = 10;
+
+	if (*cp == '0') {
+		base = 8;
+		cp++;
+		if (*cp == 'x' || *cp == 'X') {
+			base = 16;
+			cp++;
+		}
+	}
+
+	value = 0;
+	for (; *cp != '\0'; cp++) {
+		if (*cp >= '0' && *cp <= '9') {
+			digits = *cp - '0';
+		} else if (*cp >= 'a' && *cp <= 'f') {
+			digits = *cp - 'a' + 10;
+		} else if (*cp >= 'A' && *cp <= 'F') {
+			digits = *cp - 'A' + 10;
+		} else {
+			break;
+		}
+		if (digits >= base) {
+			break;
+		}
+		value = (value * base) + digits;
+	}
+
+	if (ptr != NULL) {
+		*ptr = (char *)cp;
+	}
+	if (is_negative) {
+		value = -value;
+	}
+	return (value);
+}
+
+int64_t
+lcompat_pow(int64_t x, int64_t y)
+{
+	int64_t result = 1;
+	if (y < 0)
+		return (0);
+
+	while (y) {
+		if (y & 1)
+			result *= x;
+		y >>= 1;
+		x *= x;
+	}
+	return (result);
+}
+
+int
+lcompat_hashnum(int64_t x)
+{
+	x = (~x) + (x << 18);
+	x = x ^ (x >> 31);
+	x = x * 21;
+	x = x ^ (x >> 11);
+	x = x + (x << 6);
+	x = x ^ (x >> 22);
+	return ((int)x);
+}
diff --git a/module/lua/lcorolib.c b/module/lua/lcorolib.c
new file mode 100644
index 000000000000..0300e7ee17d5
--- /dev/null
+++ b/module/lua/lcorolib.c
@@ -0,0 +1,159 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lcorolib.c,v 1.5.1.1 2013/04/12 18:48:47 roberto Exp $
+** Coroutine Library
+** See Copyright Notice in lua.h
+*/
+
+
+#define lcorolib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+static int auxresume (lua_State *L, lua_State *co, int narg) {
+  int status;
+  if (!lua_checkstack(co, narg)) {
+    lua_pushliteral(L, "too many arguments to resume");
+    return -1;  /* error flag */
+  }
+  if (lua_status(co) == LUA_OK && lua_gettop(co) == 0) {
+    lua_pushliteral(L, "cannot resume dead coroutine");
+    return -1;  /* error flag */
+  }
+  lua_xmove(L, co, narg);
+  status = lua_resume(co, L, narg);
+  if (status == LUA_OK || status == LUA_YIELD) {
+    int nres = lua_gettop(co);
+    if (!lua_checkstack(L, nres + 1)) {
+      lua_pop(co, nres);  /* remove results anyway */
+      lua_pushliteral(L, "too many results to resume");
+      return -1;  /* error flag */
+    }
+    lua_xmove(co, L, nres);  /* move yielded values */
+    return nres;
+  }
+  else {
+    lua_xmove(co, L, 1);  /* move error message */
+    return -1;  /* error flag */
+  }
+}
+
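/*
 * Editor's note (illustration, not part of the upstream import):
 * lcompat_pow() in lcompat.c above computes x^y by binary
 * exponentiation, O(log y) multiplications instead of y. The same
 * loop as a standalone, compilable sketch (ipow is my name):
 *
 *   #include <stdint.h>
 *   #include <stdio.h>
 *
 *   static int64_t ipow(int64_t x, int64_t y) {
 *     int64_t result = 1;              // invariant: result * x^y is unchanged
 *     if (y < 0)
 *       return 0;                      // negative exponents yield 0, as above
 *     while (y) {
 *       if (y & 1)                     // low bit set: fold one factor of x in
 *         result *= x;
 *       y >>= 1;                       // halve the exponent...
 *       x *= x;                        // ...and square the base
 *     }
 *     return result;
 *   }
 *
 *   int main(void) {
 *     printf("%lld\n", (long long)ipow(3, 13));   // prints 1594323
 *     return 0;
 *   }
 */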
+ +static int luaB_coresume (lua_State *L) { + lua_State *co = lua_tothread(L, 1); + int r; + luaL_argcheck(L, co, 1, "coroutine expected"); + r = auxresume(L, co, lua_gettop(L) - 1); + if (r < 0) { + lua_pushboolean(L, 0); + lua_insert(L, -2); + return 2; /* return false + error message */ + } + else { + lua_pushboolean(L, 1); + lua_insert(L, -(r + 1)); + return r + 1; /* return true + 'resume' returns */ + } +} + + +static int luaB_auxwrap (lua_State *L) { + lua_State *co = lua_tothread(L, lua_upvalueindex(1)); + int r = auxresume(L, co, lua_gettop(L)); + if (r < 0) { + if (lua_isstring(L, -1)) { /* error object is a string? */ + luaL_where(L, 1); /* add extra info */ + lua_insert(L, -2); + lua_concat(L, 2); + } + return lua_error(L); /* propagate error */ + } + return r; +} + + +static int luaB_cocreate (lua_State *L) { + lua_State *NL; + luaL_checktype(L, 1, LUA_TFUNCTION); + NL = lua_newthread(L); + lua_pushvalue(L, 1); /* move function to top */ + lua_xmove(L, NL, 1); /* move function from L to NL */ + return 1; +} + + +static int luaB_cowrap (lua_State *L) { + luaB_cocreate(L); + lua_pushcclosure(L, luaB_auxwrap, 1); + return 1; +} + + +static int luaB_yield (lua_State *L) { + return lua_yield(L, lua_gettop(L)); +} + + +static int luaB_costatus (lua_State *L) { + lua_State *co = lua_tothread(L, 1); + luaL_argcheck(L, co, 1, "coroutine expected"); + if (L == co) lua_pushliteral(L, "running"); + else { + switch (lua_status(co)) { + case LUA_YIELD: + lua_pushliteral(L, "suspended"); + break; + case LUA_OK: { + lua_Debug ar; + if (lua_getstack(co, 0, &ar) > 0) /* does it have frames? */ + lua_pushliteral(L, "normal"); /* it is running */ + else if (lua_gettop(co) == 0) + lua_pushliteral(L, "dead"); + else + lua_pushliteral(L, "suspended"); /* initial state */ + break; + } + default: /* some error occurred */ + lua_pushliteral(L, "dead"); + break; + } + } + return 1; +} + + +static int luaB_corunning (lua_State *L) { + int ismain = lua_pushthread(L); + lua_pushboolean(L, ismain); + return 2; +} + + +static const luaL_Reg co_funcs[] = { + {"create", luaB_cocreate}, + {"resume", luaB_coresume}, + {"running", luaB_corunning}, + {"status", luaB_costatus}, + {"wrap", luaB_cowrap}, + {"yield", luaB_yield}, + {NULL, NULL} +}; + + + +LUAMOD_API int luaopen_coroutine (lua_State *L) { + luaL_newlib(L, co_funcs); + return 1; +} + +#if defined(_KERNEL) + +EXPORT_SYMBOL(luaopen_coroutine); + +#endif +/* END CSTYLED */ diff --git a/module/lua/lctype.c b/module/lua/lctype.c new file mode 100644 index 000000000000..028d278ae4da --- /dev/null +++ b/module/lua/lctype.c @@ -0,0 +1,52 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lctype.c,v 1.11.1.1 2013/04/12 18:48:47 roberto Exp $ +** 'ctype' functions for Lua +** See Copyright Notice in lua.h +*/ + +#define lctype_c +#define LUA_CORE + +#include "lctype.h" + +#if !LUA_USE_CTYPE /* { */ + +LUAI_DDEF const lu_byte luai_ctype_[UCHAR_MAX + 2] = { + 0x00, /* EOZ */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 0. */ + 0x00, 0x08, 0x08, 0x08, 0x08, 0x08, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, /* 1. */ + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x0c, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, /* 2. */ + 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, 0x16, /* 3. */ + 0x16, 0x16, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, + 0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05, /* 4. */ + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, + 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, /* 5. 
*/
+  0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x05,
+  0x04, 0x15, 0x15, 0x15, 0x15, 0x15, 0x15, 0x05,  /* 6. */
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
+  0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,  /* 7. */
+  0x05, 0x05, 0x05, 0x04, 0x04, 0x04, 0x04, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 8. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* 9. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* a. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* b. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* c. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* d. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* e. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,  /* f. */
+  0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+};
+
+#endif  /* } */
+/* END CSTYLED */
diff --git a/module/lua/lctype.h b/module/lua/lctype.h
new file mode 100644
index 000000000000..b16b6bc7dab3
--- /dev/null
+++ b/module/lua/lctype.h
@@ -0,0 +1,94 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lctype.h,v 1.12.1.1 2013/04/12 18:48:47 roberto Exp $
+** 'ctype' functions for Lua
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lctype_h
+#define lctype_h
+
+#include <sys/lua/lua.h>
+
+
+/*
+** WARNING: the functions defined here do not necessarily correspond
+** to the similar functions in the standard C ctype.h. They are
+** optimized for the specific needs of Lua
+*/
+
+#if !defined(LUA_USE_CTYPE)
+
+#if 'A' == 65 && '0' == 48
+/* ASCII case: can use its own tables; faster and fixed */
+#define LUA_USE_CTYPE 0
+#else
+/* must use standard C ctype */
+#define LUA_USE_CTYPE 1
+#endif
+
+#endif
+
+
+#if !LUA_USE_CTYPE  /* { */
+
+#include "llimits.h"
+
+
+#define ALPHABIT  0
+#define DIGITBIT  1
+#define PRINTBIT  2
+#define SPACEBIT  3
+#define XDIGITBIT 4
+
+
+#define MASK(B)   (1 << (B))
+
+
+/*
+** add 1 to char to allow index -1 (EOZ)
+*/
+#define testprop(c,p)  (luai_ctype_[(lu_byte)(c)+1] & (p))
+
+/*
+** 'lalpha' (Lua alphabetic) and 'lalnum' (Lua alphanumeric) both include '_'
+*/
+#define lislalpha(c)  testprop(c, MASK(ALPHABIT))
+#define lislalnum(c)  testprop(c, (MASK(ALPHABIT) | MASK(DIGITBIT)))
+#define lisdigit(c)   testprop(c, MASK(DIGITBIT))
+#define lisspace(c)   testprop(c, MASK(SPACEBIT))
+#define lisprint(c)   testprop(c, MASK(PRINTBIT))
+#define lisxdigit(c)  testprop(c, MASK(XDIGITBIT))
+
+/*
+** this 'ltolower' only works for alphabetic characters
+*/
+#define ltolower(c)  ((c) | ('A' ^ 'a'))
+
+
+/* two more entries for 0 and -1 (EOZ) */
+LUAI_DDEC const lu_byte luai_ctype_[UCHAR_MAX + 2];
+
+
+#else  /* }{ */
+
+/*
+** use standard C ctypes
+*/
+
+#include <ctype.h>
+
+
+#define lislalpha(c)  (isalpha(c) || (c) == '_')
+#define lislalnum(c)  (isalnum(c) || (c) == '_')
+#define lisdigit(c)   (isdigit(c))
+#define lisspace(c)   (isspace(c))
+#define lisprint(c)   (isprint(c))
+#define lisxdigit(c)  (isxdigit(c))
+
+#define ltolower(c)   (tolower(c))
+
+#endif  /* } */
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/ldebug.c b/module/lua/ldebug.c
new file mode 100644
index 000000000000..2e1efa4e7250
--- /dev/null
+++ b/module/lua/ldebug.c
@@ -0,0 +1,609 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldebug.c,v 2.90.1.4 2015/02/19 17:05:13 roberto Exp $
+** Debug Interface
+** See Copyright Notice in lua.h
+*/
+
+
+#define ldebug_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "lcode.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+
+
+#define noLuaClosure(f)  ((f) == NULL || (f)->c.tt == LUA_TCCL)
+
+
+static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name);
+
+
+static int currentpc (CallInfo *ci) {
+  lua_assert(isLua(ci));
+  return pcRel(ci->u.l.savedpc, ci_func(ci)->p);
+}
+
+
+static int currentline (CallInfo *ci) {
+  return getfuncline(ci_func(ci)->p, currentpc(ci));
+}
+
+
+static void swapextra (lua_State *L) {
+  if (L->status == LUA_YIELD) {
+    CallInfo *ci = L->ci;  /* get function that yielded */
+    StkId temp = ci->func;  /* exchange its 'func' and 'extra' values */
+    ci->func = restorestack(L, ci->extra);
+    ci->extra = savestack(L, temp);
+  }
+}
+
+
+/*
+** this function can be called asynchronously (e.g. during a signal)
+*/
+LUA_API int lua_sethook (lua_State *L, lua_Hook func, int mask, int count) {
+  if (func == NULL || mask == 0) {  /* turn off hooks? */
+    mask = 0;
+    func = NULL;
+  }
+  if (isLua(L->ci))
+    L->oldpc = L->ci->u.l.savedpc;
+  L->hook = func;
+  L->basehookcount = count;
+  resethookcount(L);
+  L->hookmask = cast_byte(mask);
+  return 1;
+}
+
+
+LUA_API lua_Hook lua_gethook (lua_State *L) {
+  return L->hook;
+}
+
+
+LUA_API int lua_gethookmask (lua_State *L) {
+  return L->hookmask;
+}
+
+
+LUA_API int lua_gethookcount (lua_State *L) {
+  return L->basehookcount;
+}
+
+
+LUA_API int lua_getstack (lua_State *L, int level, lua_Debug *ar) {
+  int status;
+  CallInfo *ci;
+  if (level < 0) return 0;  /* invalid (negative) level */
+  lua_lock(L);
+  for (ci = L->ci; level > 0 && ci != &L->base_ci; ci = ci->previous)
+    level--;
+  if (level == 0 && ci != &L->base_ci) {  /* level found? */
+    status = 1;
+    ar->i_ci = ci;
+  }
+  else status = 0;  /* no such level */
+  lua_unlock(L);
+  return status;
+}
+
+
+static const char *upvalname (Proto *p, int uv) {
+  TString *s = check_exp(uv < p->sizeupvalues, p->upvalues[uv].name);
+  if (s == NULL) return "?";
+  else return getstr(s);
+}
+
+
+static const char *findvararg (CallInfo *ci, int n, StkId *pos) {
+  int nparams = clLvalue(ci->func)->p->numparams;
+  if (n >= ci->u.l.base - ci->func - nparams)
+    return NULL;  /* no such vararg */
+  else {
+    *pos = ci->func + nparams + n;
+    return "(*vararg)";  /* generic name for any vararg */
+  }
+}
+
+
+static const char *findlocal (lua_State *L, CallInfo *ci, int n,
+                              StkId *pos) {
+  const char *name = NULL;
+  StkId base;
+  if (isLua(ci)) {
+    if (n < 0)  /* access to vararg values? */
+      return findvararg(ci, -n, pos);
+    else {
+      base = ci->u.l.base;
+      name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));
+    }
+  }
+  else
+    base = ci->func + 1;
+  if (name == NULL) {  /* no 'standard' name? */
+    StkId limit = (ci == L->ci) ? L->top : ci->next->func;
+    if (limit - base >= n && n > 0)  /* is 'n' inside 'ci' stack? */
+      name = "(*temporary)";  /* generic name for any valid slot */
+    else
+      return NULL;  /* no name */
+  }
+  *pos = base + (n - 1);
+  return name;
+}
+
+
+LUA_API const char *lua_getlocal (lua_State *L, const lua_Debug *ar, int n) {
+  const char *name;
+  lua_lock(L);
+  swapextra(L);
+  if (ar == NULL) {  /* information about non-active function? */
+    if (!isLfunction(L->top - 1))  /* not a Lua function?
*/ + name = NULL; + else /* consider live variables at function start (parameters) */ + name = luaF_getlocalname(clLvalue(L->top - 1)->p, n, 0); + } + else { /* active function; get information through 'ar' */ + StkId pos = 0; /* to avoid warnings */ + name = findlocal(L, ar->i_ci, n, &pos); + if (name) { + setobj2s(L, L->top, pos); + api_incr_top(L); + } + } + swapextra(L); + lua_unlock(L); + return name; +} + + +LUA_API const char *lua_setlocal (lua_State *L, const lua_Debug *ar, int n) { + StkId pos = 0; /* to avoid warnings */ + const char *name; + lua_lock(L); + swapextra(L); + name = findlocal(L, ar->i_ci, n, &pos); + if (name) + setobjs2s(L, pos, L->top - 1); + L->top--; /* pop value */ + swapextra(L); + lua_unlock(L); + return name; +} + + +static void funcinfo (lua_Debug *ar, Closure *cl) { + if (noLuaClosure(cl)) { + ar->source = "=[C]"; + ar->linedefined = -1; + ar->lastlinedefined = -1; + ar->what = "C"; + } + else { + Proto *p = cl->l.p; + ar->source = p->source ? getstr(p->source) : "=?"; + ar->linedefined = p->linedefined; + ar->lastlinedefined = p->lastlinedefined; + ar->what = (ar->linedefined == 0) ? "main" : "Lua"; + } + luaO_chunkid(ar->short_src, ar->source, LUA_IDSIZE); +} + + +static void collectvalidlines (lua_State *L, Closure *f) { + if (noLuaClosure(f)) { + setnilvalue(L->top); + api_incr_top(L); + } + else { + int i; + TValue v; + int *lineinfo = f->l.p->lineinfo; + Table *t = luaH_new(L); /* new table to store active lines */ + sethvalue(L, L->top, t); /* push it on stack */ + api_incr_top(L); + setbvalue(&v, 1); /* boolean 'true' to be the value of all indices */ + for (i = 0; i < f->l.p->sizelineinfo; i++) /* for all lines with code */ + luaH_setint(L, t, lineinfo[i], &v); /* table[line] = true */ + } +} + + +static int auxgetinfo (lua_State *L, const char *what, lua_Debug *ar, + Closure *f, CallInfo *ci) { + int status = 1; + for (; *what; what++) { + switch (*what) { + case 'S': { + funcinfo(ar, f); + break; + } + case 'l': { + ar->currentline = (ci && isLua(ci)) ? currentline(ci) : -1; + break; + } + case 'u': { + ar->nups = (f == NULL) ? 0 : f->c.nupvalues; + if (noLuaClosure(f)) { + ar->isvararg = 1; + ar->nparams = 0; + } + else { + ar->isvararg = f->l.p->is_vararg; + ar->nparams = f->l.p->numparams; + } + break; + } + case 't': { + ar->istailcall = (ci) ? ci->callstatus & CIST_TAIL : 0; + break; + } + case 'n': { + /* calling function is a known Lua function? */ + if (ci && !(ci->callstatus & CIST_TAIL) && isLua(ci->previous)) + ar->namewhat = getfuncname(L, ci->previous, &ar->name); + else + ar->namewhat = NULL; + if (ar->namewhat == NULL) { + ar->namewhat = ""; /* not found */ + ar->name = NULL; + } + break; + } + case 'L': + case 'f': /* handled by lua_getinfo */ + break; + default: status = 0; /* invalid option */ + } + } + return status; +} + + +LUA_API int lua_getinfo (lua_State *L, const char *what, lua_Debug *ar) { + int status; + Closure *cl; + CallInfo *ci; + StkId func; + lua_lock(L); + swapextra(L); + if (*what == '>') { + ci = NULL; + func = L->top - 1; + api_check(L, ttisfunction(func), "function expected"); + what++; /* skip the '>' */ + L->top--; /* pop function */ + } + else { + ci = ar->i_ci; + func = ci->func; + lua_assert(ttisfunction(ci->func)); + } + cl = ttisclosure(func) ? 
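/*
 * Editor's note (usage sketch, not part of the upstream import): a
 * typical host-side call into lua_getinfo() via the standard Lua 5.2
 * debug API; each letter in 'what' fills the matching lua_Debug
 * fields ('S' source, 'l' currentline, 'n' name), and a leading '>'
 * takes the function from the stack top instead of a call level:
 *
 *   lua_Debug ar;
 *   if (lua_getstack(L, 1, &ar) &&     // level 1 = caller of this frame
 *       lua_getinfo(L, "Sln", &ar))
 *     printf("%s:%d in %s\n", ar.short_src, ar.currentline,
 *         ar.name != NULL ? ar.name : "?");
 */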
clvalue(func) : NULL; + status = auxgetinfo(L, what, ar, cl, ci); + if (strchr(what, 'f')) { + setobjs2s(L, L->top, func); + api_incr_top(L); + } + swapextra(L); + if (strchr(what, 'L')) + collectvalidlines(L, cl); + lua_unlock(L); + return status; +} + + +/* +** {====================================================== +** Symbolic Execution +** ======================================================= +*/ + +static const char *getobjname (Proto *p, int lastpc, int reg, + const char **name); + + +/* +** find a "name" for the RK value 'c' +*/ +static void kname (Proto *p, int pc, int c, const char **name) { + if (ISK(c)) { /* is 'c' a constant? */ + TValue *kvalue = &p->k[INDEXK(c)]; + if (ttisstring(kvalue)) { /* literal constant? */ + // cppcheck-suppress autoVariables + *name = svalue(kvalue); /* it is its own name */ + return; + } + /* else no reasonable name found */ + } + else { /* 'c' is a register */ + const char *what = getobjname(p, pc, c, name); /* search for 'c' */ + if (what && *what == 'c') { /* found a constant name? */ + return; /* 'name' already filled */ + } + /* else no reasonable name found */ + } + *name = "?"; /* no reasonable name found */ +} + + +static int filterpc (int pc, int jmptarget) { + if (pc < jmptarget) /* is code conditional (inside a jump)? */ + return -1; /* cannot know who sets that register */ + else return pc; /* current position sets that register */ +} + + +/* +** try to find last instruction before 'lastpc' that modified register 'reg' +*/ +static int findsetreg (Proto *p, int lastpc, int reg) { + int pc; + int setreg = -1; /* keep last instruction that changed 'reg' */ + int jmptarget = 0; /* any code before this address is conditional */ + for (pc = 0; pc < lastpc; pc++) { + Instruction i = p->code[pc]; + OpCode op = GET_OPCODE(i); + int a = GETARG_A(i); + switch (op) { + case OP_LOADNIL: { + int b = GETARG_B(i); + if (a <= reg && reg <= a + b) /* set registers from 'a' to 'a+b' */ + setreg = filterpc(pc, jmptarget); + break; + } + case OP_TFORCALL: { + if (reg >= a + 2) /* affect all regs above its base */ + setreg = filterpc(pc, jmptarget); + break; + } + case OP_CALL: + case OP_TAILCALL: { + if (reg >= a) /* affect all registers above base */ + setreg = filterpc(pc, jmptarget); + break; + } + case OP_JMP: { + int b = GETARG_sBx(i); + int dest = pc + 1 + b; + /* jump is forward and do not skip `lastpc'? */ + if (pc < dest && dest <= lastpc) { + if (dest > jmptarget) + jmptarget = dest; /* update 'jmptarget' */ + } + break; + } + case OP_TEST: { + if (reg == a) /* jumped code can change 'a' */ + setreg = filterpc(pc, jmptarget); + break; + } + default: + if (testAMode(op) && reg == a) /* any instruction that set A */ + setreg = filterpc(pc, jmptarget); + break; + } + } + return setreg; +} + + +static const char *getobjname (Proto *p, int lastpc, int reg, + const char **name) { + int pc; + *name = luaF_getlocalname(p, reg + 1, lastpc); + if (*name) /* is a local? */ + return "local"; + /* else try symbolic execution */ + pc = findsetreg(p, lastpc, reg); + if (pc != -1) { /* could find instruction? */ + Instruction i = p->code[pc]; + OpCode op = GET_OPCODE(i); + switch (op) { + case OP_MOVE: { + int b = GETARG_B(i); /* move from 'b' to 'a' */ + if (b < GETARG_A(i)) + return getobjname(p, pc, b, name); /* get name for 'b' */ + break; + } + case OP_GETTABUP: + case OP_GETTABLE: { + int k = GETARG_C(i); /* key index */ + int t = GETARG_B(i); /* table index */ + const char *vn = (op == OP_GETTABLE) /* name of indexed variable */ + ? 
luaF_getlocalname(p, t + 1, pc) + : upvalname(p, t); + kname(p, pc, k, name); + return (vn && strcmp(vn, LUA_ENV) == 0) ? "global" : "field"; + } + case OP_GETUPVAL: { + *name = upvalname(p, GETARG_B(i)); + return "upvalue"; + } + case OP_LOADK: + case OP_LOADKX: { + int b = (op == OP_LOADK) ? GETARG_Bx(i) + : GETARG_Ax(p->code[pc + 1]); + if (ttisstring(&p->k[b])) { + *name = svalue(&p->k[b]); + return "constant"; + } + break; + } + case OP_SELF: { + int k = GETARG_C(i); /* key index */ + kname(p, pc, k, name); + return "method"; + } + default: break; /* go through to return NULL */ + } + } + return NULL; /* could not find reasonable name */ +} + + +static const char *getfuncname (lua_State *L, CallInfo *ci, const char **name) { + TMS tm; + Proto *p = ci_func(ci)->p; /* calling function */ + int pc = currentpc(ci); /* calling instruction index */ + Instruction i = p->code[pc]; /* calling instruction */ + switch (GET_OPCODE(i)) { + case OP_CALL: + case OP_TAILCALL: /* get function name */ + return getobjname(p, pc, GETARG_A(i), name); + case OP_TFORCALL: { /* for iterator */ + *name = "for iterator"; + return "for iterator"; + } + /* all other instructions can call only through metamethods */ + case OP_SELF: + case OP_GETTABUP: + case OP_GETTABLE: tm = TM_INDEX; break; + case OP_SETTABUP: + case OP_SETTABLE: tm = TM_NEWINDEX; break; + case OP_EQ: tm = TM_EQ; break; + case OP_ADD: tm = TM_ADD; break; + case OP_SUB: tm = TM_SUB; break; + case OP_MUL: tm = TM_MUL; break; + case OP_DIV: tm = TM_DIV; break; + case OP_MOD: tm = TM_MOD; break; + case OP_POW: tm = TM_POW; break; + case OP_UNM: tm = TM_UNM; break; + case OP_LEN: tm = TM_LEN; break; + case OP_LT: tm = TM_LT; break; + case OP_LE: tm = TM_LE; break; + case OP_CONCAT: tm = TM_CONCAT; break; + default: + return NULL; /* else no useful name can be found */ + } + *name = getstr(G(L)->tmname[tm]); + return "metamethod"; +} + +/* }====================================================== */ + + + +/* +** only ANSI way to check whether a pointer points to an array +** (used only for error messages, so efficiency is not a big concern) +*/ +static int isinstack (CallInfo *ci, const TValue *o) { + StkId p; + for (p = ci->u.l.base; p < ci->top; p++) + if (o == p) return 1; + return 0; +} + + +static const char *getupvalname (CallInfo *ci, const TValue *o, + const char **name) { + LClosure *c = ci_func(ci); + int i; + for (i = 0; i < c->nupvalues; i++) { + if (c->upvals[i]->v == o) { + *name = upvalname(c->p, i); + return "upvalue"; + } + } + return NULL; +} + + +l_noret luaG_typeerror (lua_State *L, const TValue *o, const char *op) { + CallInfo *ci = L->ci; + const char *name = NULL; + const char *t = objtypename(o); + const char *kind = NULL; + if (isLua(ci)) { + kind = getupvalname(ci, o, &name); /* check whether 'o' is an upvalue */ + if (!kind && isinstack(ci, o)) /* no? 
try a register */ + kind = getobjname(ci_func(ci)->p, currentpc(ci), + cast_int(o - ci->u.l.base), &name); + } + if (kind) + luaG_runerror(L, "attempt to %s %s " LUA_QS " (a %s value)", + op, kind, name, t); + else + luaG_runerror(L, "attempt to %s a %s value", op, t); +} + + +l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2) { + if (ttisstring(p1) || ttisnumber(p1)) p1 = p2; + lua_assert(!ttisstring(p1) && !ttisnumber(p1)); + luaG_typeerror(L, p1, "concatenate"); +} + + +l_noret luaG_aritherror (lua_State *L, const TValue *p1, const TValue *p2) { + TValue temp; + if (luaV_tonumber(p1, &temp) == NULL) + p2 = p1; /* first operand is wrong */ + luaG_typeerror(L, p2, "perform arithmetic on"); +} + + +l_noret luaG_ordererror (lua_State *L, const TValue *p1, const TValue *p2) { + const char *t1 = objtypename(p1); + const char *t2 = objtypename(p2); + if (t1 == t2) + luaG_runerror(L, "attempt to compare two %s values", t1); + else + luaG_runerror(L, "attempt to compare %s with %s", t1, t2); +} + + +static void addinfo (lua_State *L, const char *msg) { + CallInfo *ci = L->ci; + if (isLua(ci)) { /* is Lua code? */ + char buff[LUA_IDSIZE]; /* add file:line information */ + int line = currentline(ci); + TString *src = ci_func(ci)->p->source; + if (src) + luaO_chunkid(buff, getstr(src), LUA_IDSIZE); + else { /* no source available; use "?" instead */ + buff[0] = '?'; buff[1] = '\0'; + } + luaO_pushfstring(L, "%s:%d: %s", buff, line, msg); + } +} + + +l_noret luaG_errormsg (lua_State *L) { + if (L->errfunc != 0) { /* is there an error handling function? */ + StkId errfunc = restorestack(L, L->errfunc); + if (!ttisfunction(errfunc)) luaD_throw(L, LUA_ERRERR); + setobjs2s(L, L->top, L->top - 1); /* move argument */ + setobjs2s(L, L->top - 1, errfunc); /* push function */ + L->top++; + luaD_call(L, L->top - 2, 1, 0); /* call it */ + } + luaD_throw(L, LUA_ERRRUN); +} + + +l_noret luaG_runerror (lua_State *L, const char *fmt, ...) { + L->runerror++; + va_list argp; + va_start(argp, fmt); + addinfo(L, luaO_pushvfstring(L, fmt, argp)); + va_end(argp); + luaG_errormsg(L); + L->runerror--; +} +/* END CSTYLED */ diff --git a/module/lua/ldebug.h b/module/lua/ldebug.h new file mode 100644 index 000000000000..36ed396f26c9 --- /dev/null +++ b/module/lua/ldebug.h @@ -0,0 +1,36 @@ +/* BEGIN CSTYLED */ +/* +** $Id: ldebug.h,v 2.7.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions from Debug Interface module +** See Copyright Notice in lua.h +*/ + +#ifndef ldebug_h +#define ldebug_h + + +#include "lstate.h" + + +#define pcRel(pc, p) (cast(int, (pc) - (p)->code) - 1) + +#define getfuncline(f,pc) (((f)->lineinfo) ? 
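/*
 * Editor's note (not part of the upstream import): pcRel() subtracts
 * one because ci->u.l.savedpc always points just past the instruction
 * being executed, while lineinfo[] is indexed by the instruction
 * itself. A toy reading, assuming some Proto *f:
 *
 *   const Instruction *savedpc = f->code + 3;       // code[2] is executing
 *   int pc = (int)(savedpc - f->code) - 1;          // pcRel gives 2
 *   int line = f->lineinfo ? f->lineinfo[pc] : 0;   // getfuncline
 */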
(f)->lineinfo[pc] : 0)
+
+#define resethookcount(L)  (L->hookcount = L->basehookcount)
+
+/* Active Lua function (given call info) */
+#define ci_func(ci)  (clLvalue((ci)->func))
+
+
+LUAI_FUNC l_noret luaG_typeerror (lua_State *L, const TValue *o,
+                                  const char *opname);
+LUAI_FUNC l_noret luaG_concaterror (lua_State *L, StkId p1, StkId p2);
+LUAI_FUNC l_noret luaG_aritherror (lua_State *L, const TValue *p1,
+                                   const TValue *p2);
+LUAI_FUNC l_noret luaG_ordererror (lua_State *L, const TValue *p1,
+                                   const TValue *p2);
+LUAI_FUNC l_noret luaG_runerror (lua_State *L, const char *fmt, ...);
+LUAI_FUNC l_noret luaG_errormsg (lua_State *L);
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/ldo.c b/module/lua/ldo.c
new file mode 100644
index 000000000000..0344a29fd8e8
--- /dev/null
+++ b/module/lua/ldo.c
@@ -0,0 +1,748 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ldo.c,v 2.108.1.3 2013/11/08 18:22:50 roberto Exp $
+** Stack and Call structure of Lua
+** See Copyright Notice in lua.h
+*/
+
+
+#define ldo_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lapi.h"
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lparser.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+#include "lzio.h"
+
+
+
+/* Return the number of bytes available on the stack. */
+#if defined (_KERNEL) && defined(__linux__)
+#include <asm/current.h>
+static intptr_t stack_remaining(void) {
+  char local;
+  return (intptr_t)(&local - (char *)current->stack);
+}
+#elif defined (_KERNEL) && defined(__FreeBSD__)
+#include <sys/pcpu.h>
+static intptr_t stack_remaining(void) {
+  char local;
+  return (intptr_t)(&local - (char *)curthread->td_kstack);
+}
+#else
+static intptr_t stack_remaining(void) {
+  return INTPTR_MAX;
+}
+#endif
+
+/*
+** {======================================================
+** Error-recovery functions
+** =======================================================
+*/
+
+/*
+** LUAI_THROW/LUAI_TRY define how Lua does exception handling. By
+** default, Lua handles errors with exceptions when compiling as
+** C++ code, with _longjmp/_setjmp when asked to use them, and with
+** longjmp/setjmp otherwise.
+*/ +#if !defined(LUAI_THROW) + +#ifdef _KERNEL + +#ifdef __linux__ +#if defined(__i386__) +#define JMP_BUF_CNT 6 +#elif defined(__x86_64__) +#define JMP_BUF_CNT 8 +#elif defined(__sparc__) && defined(__arch64__) +#define JMP_BUF_CNT 6 +#elif defined(__powerpc__) +#define JMP_BUF_CNT 26 +#elif defined(__aarch64__) +#define JMP_BUF_CNT 64 +#elif defined(__arm__) +#define JMP_BUF_CNT 65 +#elif defined(__mips__) +#define JMP_BUF_CNT 12 +#elif defined(__s390x__) +#define JMP_BUF_CNT 18 +#elif defined(__riscv) +#define JMP_BUF_CNT 64 +#else +#define JMP_BUF_CNT 1 +#endif + +typedef struct _label_t { long long unsigned val[JMP_BUF_CNT]; } label_t; + +int setjmp(label_t *) __attribute__ ((__nothrow__)); +extern void longjmp(label_t *) __attribute__((__noreturn__)); + +#define LUAI_THROW(L,c) longjmp(&(c)->b) +#define LUAI_TRY(L,c,a) if (setjmp(&(c)->b) == 0) { a } +#define luai_jmpbuf label_t + +/* unsupported arches will build but not be able to run lua programs */ +#if JMP_BUF_CNT == 1 +int setjmp (label_t *buf) { + return 1; +} + +void longjmp (label_t * buf) { + for (;;); +} +#endif +#else +#define LUAI_THROW(L,c) longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf +#endif + +#else /* _KERNEL */ + +#if defined(__cplusplus) && !defined(LUA_USE_LONGJMP) +/* C++ exceptions */ +#define LUAI_THROW(L,c) throw(c) +#define LUAI_TRY(L,c,a) \ + try { a } catch(...) { if ((c)->status == 0) (c)->status = -1; } +#define luai_jmpbuf int /* dummy variable */ + +#elif defined(LUA_USE_ULONGJMP) +/* in Unix, try _longjmp/_setjmp (more efficient) */ +#define LUAI_THROW(L,c) _longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (_setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf + +#else +/* default handling with long jumps */ +#define LUAI_THROW(L,c) longjmp((c)->b, 1) +#define LUAI_TRY(L,c,a) if (setjmp((c)->b) == 0) { a } +#define luai_jmpbuf jmp_buf + +#endif + +#endif /* _KERNEL */ + +#endif /* LUAI_THROW */ + + +/* chain list of long jump buffers */ +struct lua_longjmp { + struct lua_longjmp *previous; + luai_jmpbuf b; + volatile int status; /* error code */ +}; + + +static void seterrorobj (lua_State *L, int errcode, StkId oldtop) { + switch (errcode) { + case LUA_ERRMEM: { /* memory error? */ + setsvalue2s(L, oldtop, G(L)->memerrmsg); /* reuse preregistered msg. */ + break; + } + case LUA_ERRERR: { + setsvalue2s(L, oldtop, luaS_newliteral(L, "error in error handling")); + break; + } + default: { + setobjs2s(L, oldtop, L->top - 1); /* error message on current top */ + break; + } + } + L->top = oldtop + 1; +} + + +l_noret luaD_throw (lua_State *L, int errcode) { + if (L->errorJmp) { /* thread has an error handler? */ + L->errorJmp->status = errcode; /* set status */ + LUAI_THROW(L, L->errorJmp); /* jump to it */ + } + else { /* thread has no error handler */ + L->status = cast_byte(errcode); /* mark it as dead */ + if (G(L)->mainthread->errorJmp) { /* main thread has a handler? */ + setobjs2s(L, G(L)->mainthread->top++, L->top - 1); /* copy error obj. */ + luaD_throw(G(L)->mainthread, errcode); /* re-throw in main thread */ + } + else { /* no handler at all; abort */ + if (G(L)->panic) { /* panic function? 
*/ + lua_unlock(L); + G(L)->panic(L); /* call it (last chance to jump out) */ + } + panic("no error handler"); + } + } +} + + +int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud) { + unsigned short oldnCcalls = L->nCcalls; + struct lua_longjmp lj; + lj.status = LUA_OK; + lj.previous = L->errorJmp; /* chain new error handler */ + // cppcheck-suppress autoVariables + L->errorJmp = &lj; + LUAI_TRY(L, &lj, + (*f)(L, ud); + ); + L->errorJmp = lj.previous; /* restore old error handler */ + L->nCcalls = oldnCcalls; + return lj.status; +} + +/* }====================================================== */ + + +static void correctstack (lua_State *L, TValue *oldstack) { + CallInfo *ci; + GCObject *up; + L->top = (L->top - oldstack) + L->stack; + for (up = L->openupval; up != NULL; up = up->gch.next) + gco2uv(up)->v = (gco2uv(up)->v - oldstack) + L->stack; + for (ci = L->ci; ci != NULL; ci = ci->previous) { + ci->top = (ci->top - oldstack) + L->stack; + ci->func = (ci->func - oldstack) + L->stack; + if (isLua(ci)) + ci->u.l.base = (ci->u.l.base - oldstack) + L->stack; + } +} + + +/* some space for error handling */ +#define ERRORSTACKSIZE (LUAI_MAXSTACK + 200) + + +void luaD_reallocstack (lua_State *L, int newsize) { + TValue *oldstack = L->stack; + int lim = L->stacksize; + lua_assert(newsize <= LUAI_MAXSTACK || newsize == ERRORSTACKSIZE); + lua_assert(L->stack_last - L->stack == L->stacksize - EXTRA_STACK); + luaM_reallocvector(L, L->stack, L->stacksize, newsize, TValue); + for (; lim < newsize; lim++) + setnilvalue(L->stack + lim); /* erase new segment */ + L->stacksize = newsize; + L->stack_last = L->stack + newsize - EXTRA_STACK; + correctstack(L, oldstack); +} + + +void luaD_growstack (lua_State *L, int n) { + int size = L->stacksize; + if (size > LUAI_MAXSTACK) /* error after extra size? */ + luaD_throw(L, LUA_ERRERR); + else { + int needed = cast_int(L->top - L->stack) + n + EXTRA_STACK; + int newsize = 2 * size; + if (newsize > LUAI_MAXSTACK) newsize = LUAI_MAXSTACK; + if (newsize < needed) newsize = needed; + if (newsize > LUAI_MAXSTACK) { /* stack overflow? */ + luaD_reallocstack(L, ERRORSTACKSIZE); + luaG_runerror(L, "stack overflow"); + } + else + luaD_reallocstack(L, newsize); + } +} + + +static int stackinuse (lua_State *L) { + CallInfo *ci; + StkId lim = L->top; + for (ci = L->ci; ci != NULL; ci = ci->previous) { + lua_assert(ci->top <= L->stack_last); + if (lim < ci->top) lim = ci->top; + } + return cast_int(lim - L->stack) + 1; /* part of stack in use */ +} + + +void luaD_shrinkstack (lua_State *L) { + int inuse = stackinuse(L); + int goodsize = inuse + (inuse / 8) + 2*EXTRA_STACK; + if (goodsize > LUAI_MAXSTACK) goodsize = LUAI_MAXSTACK; + if (inuse > LUAI_MAXSTACK || /* handling stack overflow? */ + goodsize >= L->stacksize) /* would grow instead of shrink? 
*/
+    condmovestack(L);  /* don't change stack (change only for debugging) */
+  else
+    luaD_reallocstack(L, goodsize);  /* shrink it */
+}
+
+
+void luaD_hook (lua_State *L, int event, int line) {
+  lua_Hook hook = L->hook;
+  if (hook && L->allowhook) {
+    CallInfo *ci = L->ci;
+    ptrdiff_t top = savestack(L, L->top);
+    ptrdiff_t ci_top = savestack(L, ci->top);
+    lua_Debug ar;
+    ar.event = event;
+    ar.currentline = line;
+    ar.i_ci = ci;
+    luaD_checkstack(L, LUA_MINSTACK);  /* ensure minimum stack size */
+    ci->top = L->top + LUA_MINSTACK;
+    lua_assert(ci->top <= L->stack_last);
+    L->allowhook = 0;  /* cannot call hooks inside a hook */
+    ci->callstatus |= CIST_HOOKED;
+    lua_unlock(L);
+    (*hook)(L, &ar);
+    lua_lock(L);
+    lua_assert(!L->allowhook);
+    L->allowhook = 1;
+    ci->top = restorestack(L, ci_top);
+    L->top = restorestack(L, top);
+    ci->callstatus &= ~CIST_HOOKED;
+  }
+}
+
+
+static void callhook (lua_State *L, CallInfo *ci) {
+  int hook = LUA_HOOKCALL;
+  ci->u.l.savedpc++;  /* hooks assume 'pc' is already incremented */
+  if (isLua(ci->previous) &&
+      GET_OPCODE(*(ci->previous->u.l.savedpc - 1)) == OP_TAILCALL) {
+    ci->callstatus |= CIST_TAIL;
+    hook = LUA_HOOKTAILCALL;
+  }
+  luaD_hook(L, hook, -1);
+  ci->u.l.savedpc--;  /* correct 'pc' */
+}
+
+
+static StkId adjust_varargs (lua_State *L, Proto *p, int actual) {
+  int i;
+  int nfixargs = p->numparams;
+  StkId base, fixed;
+  lua_assert(actual >= nfixargs);
+  /* move fixed parameters to final position */
+  luaD_checkstack(L, p->maxstacksize);  /* check again for new 'base' */
+  fixed = L->top - actual;  /* first fixed argument */
+  base = L->top;  /* final position of first argument */
+  for (i=0; i<nfixargs; i++) {
+    setobjs2s(L, L->top++, fixed + i);
+    setnilvalue(fixed + i);
+  }
+  return base;
+}
+
+
+static StkId tryfuncTM (lua_State *L, StkId func) {
+  const TValue *tm = luaT_gettmbyobj(L, func, TM_CALL);
+  StkId p;
+  ptrdiff_t funcr = savestack(L, func);
+  if (!ttisfunction(tm))
+    luaG_typeerror(L, func, "call");
+  /* Open a hole inside the stack at `func' */
+  for (p = L->top; p > func; p--) setobjs2s(L, p, p-1);
+  incr_top(L);
+  func = restorestack(L, funcr);  /* previous call may change stack */
+  setobj2s(L, func, tm);  /* tag method is the new function to be called */
+  return func;
+}
+
+
+
+#define next_ci(L) (L->ci = (L->ci->next ?
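/*
 * Editor's note (illustration, not part of the upstream import): what
 * adjust_varargs() above does to the stack, as a standalone toy using
 * ints (0 standing in for nil). For a call f(1, 2, 3) where f has two
 * fixed parameters, the fixed arguments are copied to the top and
 * their old slots nil'ed, leaving the varargs reachable below base:
 *
 *   #include <stdio.h>
 *
 *   int main(void) {
 *     int stack[8] = { -1, 1, 2, 3 };  // slot 0 plays the function
 *     int top = 4, actual = 3, nfixargs = 2;
 *     int fixed = top - actual;        // first fixed argument
 *     int base = top;                  // final position of first argument
 *     for (int i = 0; i < nfixargs; i++) {
 *       stack[top++] = stack[fixed + i];
 *       stack[fixed + i] = 0;          // nil out the old slot
 *     }
 *     // fixed args start at base; vararg n sits at func + nfixargs + n
 *     printf("%d %d | %d\n", stack[base], stack[base + 1],
 *         stack[nfixargs + 1]);        // prints "1 2 | 3"
 *     return 0;
 *   }
 */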
L->ci->next : luaE_extendCI(L))) + + +/* +** returns true if function has been executed (C function) +*/ +int luaD_precall (lua_State *L, StkId func, int nresults) { + lua_CFunction f; + CallInfo *ci; + int n; /* number of arguments (Lua) or returns (C) */ + ptrdiff_t funcr = savestack(L, func); + switch (ttype(func)) { + case LUA_TLCF: /* light C function */ + f = fvalue(func); + goto Cfunc; + case LUA_TCCL: { /* C closure */ + f = clCvalue(func)->f; + Cfunc: + luaD_checkstack(L, LUA_MINSTACK); /* ensure minimum stack size */ + ci = next_ci(L); /* now 'enter' new function */ + ci->nresults = nresults; + ci->func = restorestack(L, funcr); + ci->top = L->top + LUA_MINSTACK; + lua_assert(ci->top <= L->stack_last); + ci->callstatus = 0; + luaC_checkGC(L); /* stack grow uses memory */ + if (L->hookmask & LUA_MASKCALL) + luaD_hook(L, LUA_HOOKCALL, -1); + lua_unlock(L); + n = (*f)(L); /* do the actual call */ + lua_lock(L); + api_checknelems(L, n); + luaD_poscall(L, L->top - n); + return 1; + } + case LUA_TLCL: { /* Lua function: prepare its call */ + StkId base; + Proto *p = clLvalue(func)->p; + n = cast_int(L->top - func) - 1; /* number of real arguments */ + luaD_checkstack(L, p->maxstacksize); + for (; n < p->numparams; n++) + setnilvalue(L->top++); /* complete missing arguments */ + if (!p->is_vararg) { + func = restorestack(L, funcr); + base = func + 1; + } + else { + base = adjust_varargs(L, p, n); + func = restorestack(L, funcr); /* previous call can change stack */ + } + ci = next_ci(L); /* now 'enter' new function */ + ci->nresults = nresults; + ci->func = func; + ci->u.l.base = base; + ci->top = base + p->maxstacksize; + lua_assert(ci->top <= L->stack_last); + ci->u.l.savedpc = p->code; /* starting point */ + ci->callstatus = CIST_LUA; + L->top = ci->top; + luaC_checkGC(L); /* stack grow uses memory */ + if (L->hookmask & LUA_MASKCALL) + callhook(L, ci); + return 0; + } + default: { /* not a function */ + func = tryfuncTM(L, func); /* retry with 'function' tag method */ + return luaD_precall(L, func, nresults); /* now it must be a function */ + } + } +} + + +int luaD_poscall (lua_State *L, StkId firstResult) { + StkId res; + int wanted, i; + CallInfo *ci = L->ci; + if (L->hookmask & (LUA_MASKRET | LUA_MASKLINE)) { + if (L->hookmask & LUA_MASKRET) { + ptrdiff_t fr = savestack(L, firstResult); /* hook may change stack */ + luaD_hook(L, LUA_HOOKRET, -1); + firstResult = restorestack(L, fr); + } + L->oldpc = ci->previous->u.l.savedpc; /* 'oldpc' for caller function */ + } + res = ci->func; /* res == final position of 1st result */ + wanted = ci->nresults; + L->ci = ci = ci->previous; /* back to caller */ + /* move results to correct place */ + for (i = wanted; i != 0 && firstResult < L->top; i--) + setobjs2s(L, res++, firstResult++); + while (i-- > 0) + setnilvalue(res++); + L->top = res; + return (wanted - LUA_MULTRET); /* 0 iff wanted == LUA_MULTRET */ +} + + +/* +** Call a function (C or Lua). The function to be called is at *func. +** The arguments are on the stack, right after the function. +** When returns, all the results are on the stack, starting at the original +** function position. 
+*/ +void luaD_call (lua_State *L, StkId func, int nResults, int allowyield) { + if (++L->nCcalls >= LUAI_MAXCCALLS) { + if (L->nCcalls == LUAI_MAXCCALLS) + luaG_runerror(L, "C stack overflow"); + else if (L->nCcalls >= (LUAI_MAXCCALLS + (LUAI_MAXCCALLS>>3))) + luaD_throw(L, LUA_ERRERR); /* error while handling stack error */ + } + intptr_t remaining = stack_remaining(); + if (L->runerror == 0 && remaining < LUAI_MINCSTACK) + luaG_runerror(L, "C stack overflow"); + if (L->runerror != 0 && remaining < LUAI_MINCSTACK / 2) + luaD_throw(L, LUA_ERRERR); /* error while handling stack error */ + if (!allowyield) L->nny++; + if (!luaD_precall(L, func, nResults)) /* is a Lua function? */ + luaV_execute(L); /* call it */ + if (!allowyield) L->nny--; + L->nCcalls--; +} + + +static void finishCcall (lua_State *L) { + CallInfo *ci = L->ci; + int n; + lua_assert(ci->u.c.k != NULL); /* must have a continuation */ + lua_assert(L->nny == 0); + if (ci->callstatus & CIST_YPCALL) { /* was inside a pcall? */ + ci->callstatus &= ~CIST_YPCALL; /* finish 'lua_pcall' */ + L->errfunc = ci->u.c.old_errfunc; + } + /* finish 'lua_callk'/'lua_pcall' */ + adjustresults(L, ci->nresults); + /* call continuation function */ + if (!(ci->callstatus & CIST_STAT)) /* no call status? */ + ci->u.c.status = LUA_YIELD; /* 'default' status */ + lua_assert(ci->u.c.status != LUA_OK); + ci->callstatus = (ci->callstatus & ~(CIST_YPCALL | CIST_STAT)) | CIST_YIELDED; + lua_unlock(L); + n = (*ci->u.c.k)(L); + lua_lock(L); + api_checknelems(L, n); + /* finish 'luaD_precall' */ + luaD_poscall(L, L->top - n); +} + + +static void unroll (lua_State *L, void *ud) { + UNUSED(ud); + for (;;) { + if (L->ci == &L->base_ci) /* stack is empty? */ + return; /* coroutine finished normally */ + if (!isLua(L->ci)) /* C function? */ + finishCcall(L); + else { /* Lua function */ + luaV_finishOp(L); /* finish interrupted instruction */ + luaV_execute(L); /* execute down to higher C 'boundary' */ + } + } +} + + +/* +** check whether thread has a suspended protected call +*/ +static CallInfo *findpcall (lua_State *L) { + CallInfo *ci; + for (ci = L->ci; ci != NULL; ci = ci->previous) { /* search for a pcall */ + if (ci->callstatus & CIST_YPCALL) + return ci; + } + return NULL; /* no pending pcall */ +} + + +static int recover (lua_State *L, int status) { + StkId oldtop; + CallInfo *ci = findpcall(L); + if (ci == NULL) return 0; /* no recovery point */ + /* "finish" luaD_pcall */ + oldtop = restorestack(L, ci->extra); + luaF_close(L, oldtop); + seterrorobj(L, status, oldtop); + L->ci = ci; + L->allowhook = ci->u.c.old_allowhook; + L->nny = 0; /* should be zero to be yieldable */ + luaD_shrinkstack(L); + L->errfunc = ci->u.c.old_errfunc; + ci->callstatus |= CIST_STAT; /* call has error status */ + ci->u.c.status = status; /* (here it is) */ + return 1; /* continue running the coroutine */ +} + + +/* +** signal an error in the call to 'resume', not in the execution of the +** coroutine itself. (Such errors should not be handled by any coroutine +** error handler and should not kill the coroutine.) 
+*/ +static l_noret resume_error (lua_State *L, const char *msg, StkId firstArg) { + L->top = firstArg; /* remove args from the stack */ + setsvalue2s(L, L->top, luaS_new(L, msg)); /* push error message */ + api_incr_top(L); + luaD_throw(L, -1); /* jump back to 'lua_resume' */ +} + + +/* +** do the work for 'lua_resume' in protected mode +*/ +static void resume_cb (lua_State *L, void *ud) { + int nCcalls = L->nCcalls; + StkId firstArg = cast(StkId, ud); + CallInfo *ci = L->ci; + if (nCcalls >= LUAI_MAXCCALLS) + resume_error(L, "C stack overflow", firstArg); + if (L->status == LUA_OK) { /* may be starting a coroutine */ + if (ci != &L->base_ci) /* not in base level? */ + resume_error(L, "cannot resume non-suspended coroutine", firstArg); + /* coroutine is in base level; start running it */ + if (!luaD_precall(L, firstArg - 1, LUA_MULTRET)) /* Lua function? */ + luaV_execute(L); /* call it */ + } + else if (L->status != LUA_YIELD) + resume_error(L, "cannot resume dead coroutine", firstArg); + else { /* resuming from previous yield */ + L->status = LUA_OK; + ci->func = restorestack(L, ci->extra); + if (isLua(ci)) /* yielded inside a hook? */ + luaV_execute(L); /* just continue running Lua code */ + else { /* 'common' yield */ + if (ci->u.c.k != NULL) { /* does it have a continuation? */ + int n; + ci->u.c.status = LUA_YIELD; /* 'default' status */ + ci->callstatus |= CIST_YIELDED; + lua_unlock(L); + n = (*ci->u.c.k)(L); /* call continuation */ + lua_lock(L); + api_checknelems(L, n); + firstArg = L->top - n; /* yield results come from continuation */ + } + luaD_poscall(L, firstArg); /* finish 'luaD_precall' */ + } + unroll(L, NULL); + } + lua_assert(nCcalls == L->nCcalls); +} + + +LUA_API int lua_resume (lua_State *L, lua_State *from, int nargs) { + int status; + int oldnny = L->nny; /* save 'nny' */ + lua_lock(L); + luai_userstateresume(L, nargs); + L->nCcalls = (from) ? from->nCcalls + 1 : 1; + L->nny = 0; /* allow yields */ + api_checknelems(L, (L->status == LUA_OK) ? nargs + 1 : nargs); + status = luaD_rawrunprotected(L, resume_cb, L->top - nargs); + if (status == -1) /* error calling 'lua_resume'? */ + status = LUA_ERRRUN; + else { /* yield or regular error */ + while (status != LUA_OK && status != LUA_YIELD) { /* error? */ + if (recover(L, status)) /* recover point? */ + status = luaD_rawrunprotected(L, unroll, NULL); /* run continuation */ + else { /* unrecoverable error */ + L->status = cast_byte(status); /* mark thread as `dead' */ + seterrorobj(L, status, L->top); + L->ci->top = L->top; + break; + } + } + lua_assert(status == L->status); + } + L->nny = oldnny; /* restore 'nny' */ + L->nCcalls--; + lua_assert(L->nCcalls == ((from) ? from->nCcalls : 0)); + lua_unlock(L); + return status; +} + + +LUA_API int lua_yieldk (lua_State *L, int nresults, int ctx, lua_CFunction k) { + CallInfo *ci = L->ci; + luai_userstateyield(L, nresults); + lua_lock(L); + api_checknelems(L, nresults); + if (L->nny > 0) { + if (L != G(L)->mainthread) + luaG_runerror(L, "attempt to yield across a C-call boundary"); + else + luaG_runerror(L, "attempt to yield from outside a coroutine"); + } + L->status = LUA_YIELD; + ci->extra = savestack(L, ci->func); /* save current 'func' */ + if (isLua(ci)) { /* inside a hook? */ + api_check(L, k == NULL, "hooks cannot continue after yielding"); + } + else { + if ((ci->u.c.k = k) != NULL) /* is there a continuation? 
*/ + ci->u.c.ctx = ctx; /* save context */ + ci->func = L->top - nresults - 1; /* protect stack below results */ + luaD_throw(L, LUA_YIELD); + } + lua_assert(ci->callstatus & CIST_HOOKED); /* must be inside a hook */ + lua_unlock(L); + return 0; /* return to 'luaD_hook' */ +} + + +int luaD_pcall (lua_State *L, Pfunc func, void *u, + ptrdiff_t old_top, ptrdiff_t ef) { + int status; + CallInfo *old_ci = L->ci; + lu_byte old_allowhooks = L->allowhook; + unsigned short old_nny = L->nny; + ptrdiff_t old_errfunc = L->errfunc; + L->errfunc = ef; + status = luaD_rawrunprotected(L, func, u); + if (status != LUA_OK) { /* an error occurred? */ + StkId oldtop = restorestack(L, old_top); + luaF_close(L, oldtop); /* close possible pending closures */ + seterrorobj(L, status, oldtop); + L->ci = old_ci; + L->allowhook = old_allowhooks; + L->nny = old_nny; + luaD_shrinkstack(L); + } + L->errfunc = old_errfunc; + return status; +} + + + +/* +** Execute a protected parser. +*/ +struct SParser { /* data to `f_parser' */ + ZIO *z; + Mbuffer buff; /* dynamic structure used by the scanner */ + Dyndata dyd; /* dynamic structures used by the parser */ + const char *mode; + const char *name; +}; + + +static void checkmode (lua_State *L, const char *mode, const char *x) { + if (mode && strchr(mode, x[0]) == NULL) { + luaO_pushfstring(L, + "attempt to load a %s chunk (mode is " LUA_QS ")", x, mode); + luaD_throw(L, LUA_ERRSYNTAX); + } +} + + +static void f_parser (lua_State *L, void *ud) { + int i; + Closure *cl; + struct SParser *p = cast(struct SParser *, ud); + int c = zgetc(p->z); /* read first character */ + lua_assert(c != LUA_SIGNATURE[0]); /* binary not supported */ + checkmode(L, p->mode, "text"); + cl = luaY_parser(L, p->z, &p->buff, &p->dyd, p->name, c); + lua_assert(cl->l.nupvalues == cl->l.p->sizeupvalues); + for (i = 0; i < cl->l.nupvalues; i++) { /* initialize upvalues */ + UpVal *up = luaF_newupval(L); + cl->l.upvals[i] = up; + luaC_objbarrier(L, cl, up); + } +} + + +int luaD_protectedparser (lua_State *L, ZIO *z, const char *name, + const char *mode) { + struct SParser p; + int status; + L->nny++; /* cannot yield during parsing */ + p.z = z; p.name = name; p.mode = mode; + p.dyd.actvar.arr = NULL; p.dyd.actvar.size = 0; + p.dyd.gt.arr = NULL; p.dyd.gt.size = 0; + p.dyd.label.arr = NULL; p.dyd.label.size = 0; + luaZ_initbuffer(L, &p.buff); + status = luaD_pcall(L, f_parser, &p, savestack(L, L->top), L->errfunc); + luaZ_freebuffer(L, &p.buff); + luaM_freearray(L, p.dyd.actvar.arr, p.dyd.actvar.size); + luaM_freearray(L, p.dyd.gt.arr, p.dyd.gt.size); + luaM_freearray(L, p.dyd.label.arr, p.dyd.label.size); + L->nny--; + return status; +} +/* END CSTYLED */ diff --git a/module/lua/ldo.h b/module/lua/ldo.h new file mode 100644 index 000000000000..2c0e1704d072 --- /dev/null +++ b/module/lua/ldo.h @@ -0,0 +1,47 @@ +/* BEGIN CSTYLED */ +/* +** $Id: ldo.h,v 2.20.1.1 2013/04/12 18:48:47 roberto Exp $ +** Stack and Call structure of Lua +** See Copyright Notice in lua.h +*/ + +#ifndef ldo_h +#define ldo_h + + +#include "lobject.h" +#include "lstate.h" +#include "lzio.h" + + +#define luaD_checkstack(L,n) if (L->stack_last - L->top <= (n)) \ + luaD_growstack(L, n); else condmovestack(L); + + +#define incr_top(L) {L->top++; luaD_checkstack(L,0);} + +#define savestack(L,p) ((char *)(p) - (char *)L->stack) +#define restorestack(L,n) ((TValue *)((char *)L->stack + (n))) + + +/* type of protected functions, to be ran by `runprotected' */ +typedef void (*Pfunc) (lua_State *L, void *ud); + +LUAI_FUNC int 
luaD_protectedparser (lua_State *L, ZIO *z, const char *name, + const char *mode); +LUAI_FUNC void luaD_hook (lua_State *L, int event, int line); +LUAI_FUNC int luaD_precall (lua_State *L, StkId func, int nresults); +LUAI_FUNC void luaD_call (lua_State *L, StkId func, int nResults, + int allowyield); +LUAI_FUNC int luaD_pcall (lua_State *L, Pfunc func, void *u, + ptrdiff_t oldtop, ptrdiff_t ef); +LUAI_FUNC int luaD_poscall (lua_State *L, StkId firstResult); +LUAI_FUNC void luaD_reallocstack (lua_State *L, int newsize); +LUAI_FUNC void luaD_growstack (lua_State *L, int n); +LUAI_FUNC void luaD_shrinkstack (lua_State *L); + +LUAI_FUNC l_noret luaD_throw (lua_State *L, int errcode); +LUAI_FUNC int luaD_rawrunprotected (lua_State *L, Pfunc f, void *ud); + +#endif +/* END CSTYLED */ diff --git a/module/lua/lfunc.c b/module/lua/lfunc.c new file mode 100644 index 000000000000..1a510831259c --- /dev/null +++ b/module/lua/lfunc.c @@ -0,0 +1,160 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lfunc.c,v 2.30.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions to manipulate prototypes and closures +** See Copyright Notice in lua.h +*/ + + +#define lfunc_c +#define LUA_CORE + +#include <sys/lua/lua.h> + +#include "lfunc.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" + + + +Closure *luaF_newCclosure (lua_State *L, int n) { + Closure *c = &luaC_newobj(L, LUA_TCCL, sizeCclosure(n), NULL, 0)->cl; + c->c.nupvalues = cast_byte(n); + return c; +} + + +Closure *luaF_newLclosure (lua_State *L, int n) { + Closure *c = &luaC_newobj(L, LUA_TLCL, sizeLclosure(n), NULL, 0)->cl; + c->l.p = NULL; + c->l.nupvalues = cast_byte(n); + while (n--) c->l.upvals[n] = NULL; + return c; +} + + +UpVal *luaF_newupval (lua_State *L) { + UpVal *uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), NULL, 0)->uv; + uv->v = &uv->u.value; + setnilvalue(uv->v); + return uv; +} + + +UpVal *luaF_findupval (lua_State *L, StkId level) { + global_State *g = G(L); + GCObject **pp = &L->openupval; + UpVal *p; + UpVal *uv; + while (*pp != NULL && (p = gco2uv(*pp))->v >= level) { + GCObject *o = obj2gco(p); + lua_assert(p->v != &p->u.value); + lua_assert(!isold(o) || isold(obj2gco(L))); + if (p->v == level) { /* found a corresponding upvalue? */ + if (isdead(g, o)) /* is it dead? */ + changewhite(o); /* resurrect it */ + return p; + } + pp = &p->next; + } + /* not found: create a new one */ + uv = &luaC_newobj(L, LUA_TUPVAL, sizeof(UpVal), pp, 0)->uv; + uv->v = level; /* current value lives in the stack */ + uv->u.l.prev = &g->uvhead; /* double link it in `uvhead' list */ + uv->u.l.next = g->uvhead.u.l.next; + uv->u.l.next->u.l.prev = uv; + g->uvhead.u.l.next = uv; + lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); + return uv; +} + + +static void unlinkupval (UpVal *uv) { + lua_assert(uv->u.l.next->u.l.prev == uv && uv->u.l.prev->u.l.next == uv); + uv->u.l.next->u.l.prev = uv->u.l.prev; /* remove from `uvhead' list */ + uv->u.l.prev->u.l.next = uv->u.l.next; +} + + +void luaF_freeupval (lua_State *L, UpVal *uv) { + if (uv->v != &uv->u.value) /* is it open? 
*/ + unlinkupval(uv); /* remove from open list */ + luaM_free(L, uv); /* free upvalue */ +} + + +void luaF_close (lua_State *L, StkId level) { + UpVal *uv; + global_State *g = G(L); + while (L->openupval != NULL && (uv = gco2uv(L->openupval))->v >= level) { + GCObject *o = obj2gco(uv); + lua_assert(!isblack(o) && uv->v != &uv->u.value); + L->openupval = uv->next; /* remove from `open' list */ + if (isdead(g, o)) + luaF_freeupval(L, uv); /* free upvalue */ + else { + unlinkupval(uv); /* remove upvalue from 'uvhead' list */ + setobj(L, &uv->u.value, uv->v); /* move value to upvalue slot */ + uv->v = &uv->u.value; /* now current value lives here */ + gch(o)->next = g->allgc; /* link upvalue into 'allgc' list */ + g->allgc = o; + luaC_checkupvalcolor(g, uv); + } + } +} + + +Proto *luaF_newproto (lua_State *L) { + Proto *f = &luaC_newobj(L, LUA_TPROTO, sizeof(Proto), NULL, 0)->p; + f->k = NULL; + f->sizek = 0; + f->p = NULL; + f->sizep = 0; + f->code = NULL; + f->cache = NULL; + f->sizecode = 0; + f->lineinfo = NULL; + f->sizelineinfo = 0; + f->upvalues = NULL; + f->sizeupvalues = 0; + f->numparams = 0; + f->is_vararg = 0; + f->maxstacksize = 0; + f->locvars = NULL; + f->sizelocvars = 0; + f->linedefined = 0; + f->lastlinedefined = 0; + f->source = NULL; + return f; +} + + +void luaF_freeproto (lua_State *L, Proto *f) { + luaM_freearray(L, f->code, f->sizecode); + luaM_freearray(L, f->p, f->sizep); + luaM_freearray(L, f->k, f->sizek); + luaM_freearray(L, f->lineinfo, f->sizelineinfo); + luaM_freearray(L, f->locvars, f->sizelocvars); + luaM_freearray(L, f->upvalues, f->sizeupvalues); + luaM_free(L, f); +} + + +/* +** Look for n-th local variable at line `line' in function `func'. +** Returns NULL if not found. +*/ +const char *luaF_getlocalname (const Proto *f, int local_number, int pc) { + int i; + for (i = 0; i<f->sizelocvars && f->locvars[i].startpc <= pc; i++) { + if (pc < f->locvars[i].endpc) { /* is variable active? 
*/ + local_number--; + if (local_number == 0) + return getstr(f->locvars[i].varname); + } + } + return NULL; /* not found */ +} +/* END CSTYLED */ diff --git a/module/lua/lfunc.h b/module/lua/lfunc.h new file mode 100644 index 000000000000..59a4fa75c46e --- /dev/null +++ b/module/lua/lfunc.h @@ -0,0 +1,35 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lfunc.h,v 2.8.1.1 2013/04/12 18:48:47 roberto Exp $ +** Auxiliary functions to manipulate prototypes and closures +** See Copyright Notice in lua.h +*/ + +#ifndef lfunc_h +#define lfunc_h + + +#include "lobject.h" + + +#define sizeCclosure(n) (cast(int, sizeof(CClosure)) + \ + cast(int, sizeof(TValue)*((n)-1))) + +#define sizeLclosure(n) (cast(int, sizeof(LClosure)) + \ + cast(int, sizeof(TValue *)*((n)-1))) + + +LUAI_FUNC Proto *luaF_newproto (lua_State *L); +LUAI_FUNC Closure *luaF_newCclosure (lua_State *L, int nelems); +LUAI_FUNC Closure *luaF_newLclosure (lua_State *L, int nelems); +LUAI_FUNC UpVal *luaF_newupval (lua_State *L); +LUAI_FUNC UpVal *luaF_findupval (lua_State *L, StkId level); +LUAI_FUNC void luaF_close (lua_State *L, StkId level); +LUAI_FUNC void luaF_freeproto (lua_State *L, Proto *f); +LUAI_FUNC void luaF_freeupval (lua_State *L, UpVal *uv); +LUAI_FUNC const char *luaF_getlocalname (const Proto *func, int local_number, + int pc); + + +#endif +/* END CSTYLED */ diff --git a/module/lua/lgc.c b/module/lua/lgc.c new file mode 100644 index 000000000000..55feb24119d3 --- /dev/null +++ b/module/lua/lgc.c @@ -0,0 +1,1218 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lgc.c,v 2.140.1.3 2014/09/01 16:55:08 roberto Exp $ +** Garbage Collector +** See Copyright Notice in lua.h +*/ + +#define lgc_c +#define LUA_CORE + +#include <sys/lua/lua.h> + +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" + + + +/* +** cost of sweeping one element (the size of a small object divided +** by some adjust for the sweep speed) +*/ +#define GCSWEEPCOST ((sizeof(TString) + 4) / 4) + +/* maximum number of elements to sweep in each single step */ +#define GCSWEEPMAX (cast_int((GCSTEPSIZE / GCSWEEPCOST) / 4)) + +/* maximum number of finalizers to call in each GC step */ +#define GCFINALIZENUM 4 + + +/* +** macro to adjust 'stepmul': 'stepmul' is actually used like +** 'stepmul / STEPMULADJ' (value chosen by tests) +*/ +#define STEPMULADJ 200 + + +/* +** macro to adjust 'pause': 'pause' is actually used like +** 'pause / PAUSEADJ' (value chosen by tests) +*/ +#define PAUSEADJ 100 + + +/* +** 'makewhite' erases all color bits plus the old bit and then +** sets only the current white bit +*/ +#define maskcolors (~(bit2mask(BLACKBIT, OLDBIT) | WHITEBITS)) +#define makewhite(g,x) \ + (gch(x)->marked = cast_byte((gch(x)->marked & maskcolors) | luaC_white(g))) + +#define white2gray(x) resetbits(gch(x)->marked, WHITEBITS) +#define black2gray(x) resetbit(gch(x)->marked, BLACKBIT) + + +#define isfinalized(x) testbit(gch(x)->marked, FINALIZEDBIT) + +#define checkdeadkey(n) lua_assert(!ttisdeadkey(gkey(n)) || ttisnil(gval(n))) + + +#define checkconsistency(obj) \ + lua_longassert(!iscollectable(obj) || righttt(obj)) + + +#define markvalue(g,o) { checkconsistency(o); \ + if (valiswhite(o)) reallymarkobject(g,gcvalue(o)); } + +#define markobject(g,t) { if ((t) && iswhite(obj2gco(t))) \ + reallymarkobject(g, obj2gco(t)); } + +static void reallymarkobject (global_State *g, GCObject *o); + + +/* +** {====================================================== +** 
Generic functions +** ======================================================= +*/ + + +/* +** one after last element in a hash array +*/ +#define gnodelast(h) gnode(h, cast(size_t, sizenode(h))) + + +/* +** link table 'h' into list pointed by 'p' +*/ +#define linktable(h,p) ((h)->gclist = *(p), *(p) = obj2gco(h)) + + +/* +** if key is not marked, mark its entry as dead (therefore removing it +** from the table) +*/ +static void removeentry (Node *n) { + lua_assert(ttisnil(gval(n))); + if (valiswhite(gkey(n))) + setdeadvalue(gkey(n)); /* unused and unmarked key; remove it */ +} + + +/* +** tells whether a key or value can be cleared from a weak +** table. Non-collectable objects are never removed from weak +** tables. Strings behave as `values', so are never removed too. for +** other objects: if really collected, cannot keep them; for objects +** being finalized, keep them in keys, but not in values +*/ +static int iscleared (global_State *g, const TValue *o) { + if (!iscollectable(o)) return 0; + else if (ttisstring(o)) { + markobject(g, rawtsvalue(o)); /* strings are `values', so are never weak */ + return 0; + } + else return iswhite(gcvalue(o)); +} + + +/* +** barrier that moves collector forward, that is, mark the white object +** being pointed by a black object. +*/ +void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v) { + global_State *g = G(L); + lua_assert(isblack(o) && iswhite(v) && !isdead(g, v) && !isdead(g, o)); + lua_assert(g->gcstate != GCSpause); + lua_assert(gch(o)->tt != LUA_TTABLE); + if (keepinvariantout(g)) /* must keep invariant? */ + reallymarkobject(g, v); /* restore invariant */ + else { /* sweep phase */ + lua_assert(issweepphase(g)); + makewhite(g, o); /* mark main obj. as white to avoid other barriers */ + } +} + + +/* +** barrier that moves collector backward, that is, mark the black object +** pointing to a white object as gray again. (Current implementation +** only works for tables; access to 'gclist' is not uniform across +** different types.) +*/ +void luaC_barrierback_ (lua_State *L, GCObject *o) { + global_State *g = G(L); + lua_assert(isblack(o) && !isdead(g, o) && gch(o)->tt == LUA_TTABLE); + black2gray(o); /* make object gray (again) */ + gco2t(o)->gclist = g->grayagain; + g->grayagain = o; +} + + +/* +** barrier for prototypes. When creating first closure (cache is +** NULL), use a forward barrier; this may be the only closure of the +** prototype (if it is a "regular" function, with a single instance) +** and the prototype may be big, so it is better to avoid traversing +** it again. Otherwise, use a backward barrier, to avoid marking all +** possible instances. +*/ +LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c) { + global_State *g = G(L); + lua_assert(isblack(obj2gco(p))); + if (p->cache == NULL) { /* first time? 
*/ + luaC_objbarrier(L, p, c); + } + else { /* use a backward barrier */ + black2gray(obj2gco(p)); /* make prototype gray (again) */ + p->gclist = g->grayagain; + g->grayagain = obj2gco(p); + } +} + + +/* +** check color (and invariants) for an upvalue that was closed, +** i.e., moved into the 'allgc' list +*/ +void luaC_checkupvalcolor (global_State *g, UpVal *uv) { + GCObject *o = obj2gco(uv); + lua_assert(!isblack(o)); /* open upvalues are never black */ + if (isgray(o)) { + if (keepinvariant(g)) { + resetoldbit(o); /* see MOVE OLD rule */ + gray2black(o); /* it is being visited now */ + markvalue(g, uv->v); + } + else { + lua_assert(issweepphase(g)); + makewhite(g, o); + } + } +} + + +/* +** create a new collectable object (with given type and size) and link +** it to '*list'. 'offset' tells how many bytes to allocate before the +** object itself (used only by states). +*/ +GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, GCObject **list, + int offset) { + global_State *g = G(L); + char *raw = cast(char *, luaM_newobject(L, novariant(tt), sz)); + GCObject *o = obj2gco(raw + offset); + if (list == NULL) + list = &g->allgc; /* standard list for collectable objects */ + gch(o)->marked = luaC_white(g); + gch(o)->tt = tt; + gch(o)->next = *list; + *list = o; + return o; +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** Mark functions +** ======================================================= +*/ + + +/* +** mark an object. Userdata, strings, and closed upvalues are visited +** and turned black here. Other objects are marked gray and added +** to appropriate list to be visited (and turned black) later. (Open +** upvalues are already linked in 'headuv' list.) +*/ +static void reallymarkobject (global_State *g, GCObject *o) { + lu_mem size; + white2gray(o); + switch (gch(o)->tt) { + case LUA_TSHRSTR: + case LUA_TLNGSTR: { + size = sizestring(gco2ts(o)); + break; /* nothing else to mark; make it black */ + } + case LUA_TUSERDATA: { + Table *mt = gco2u(o)->metatable; + markobject(g, mt); + markobject(g, gco2u(o)->env); + size = sizeudata(gco2u(o)); + break; + } + case LUA_TUPVAL: { + UpVal *uv = gco2uv(o); + markvalue(g, uv->v); + if (uv->v != &uv->u.value) /* open? */ + return; /* open upvalues remain gray */ + size = sizeof(UpVal); + break; + } + case LUA_TLCL: { + gco2lcl(o)->gclist = g->gray; + g->gray = o; + return; + } + case LUA_TCCL: { + gco2ccl(o)->gclist = g->gray; + g->gray = o; + return; + } + case LUA_TTABLE: { + linktable(gco2t(o), &g->gray); + return; + } + case LUA_TTHREAD: { + gco2th(o)->gclist = g->gray; + g->gray = o; + return; + } + case LUA_TPROTO: { + gco2p(o)->gclist = g->gray; + g->gray = o; + return; + } + default: lua_assert(0); return; + } + gray2black(o); + g->GCmemtrav += size; +} + + +/* +** mark metamethods for basic types +*/ +static void markmt (global_State *g) { + int i; + for (i=0; i < LUA_NUMTAGS; i++) + markobject(g, g->mt[i]); +} + + +/* +** mark all objects in list of being-finalized +*/ +static void markbeingfnz (global_State *g) { + GCObject *o; + for (o = g->tobefnz; o != NULL; o = gch(o)->next) { + makewhite(g, o); + reallymarkobject(g, o); + } +} + + +/* +** mark all values stored in marked open upvalues. (See comment in +** 'lstate.h'.) 
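+** Open upvalues point into some thread's stack and are kept gray while
+** open (see 'reallymarkobject' above), so the stack slots they reference
+** must be revisited here during the atomic phase.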
+*/ +static void remarkupvals (global_State *g) { + UpVal *uv; + for (uv = g->uvhead.u.l.next; uv != &g->uvhead; uv = uv->u.l.next) { + if (isgray(obj2gco(uv))) + markvalue(g, uv->v); + } +} + + +/* +** mark root set and reset all gray lists, to start a new +** incremental (or full) collection +*/ +static void restartcollection (global_State *g) { + g->gray = g->grayagain = NULL; + g->weak = g->allweak = g->ephemeron = NULL; + markobject(g, g->mainthread); + markvalue(g, &g->l_registry); + markmt(g); + markbeingfnz(g); /* mark any finalizing object left from previous cycle */ +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Traverse functions +** ======================================================= +*/ + +static void traverseweakvalue (global_State *g, Table *h) { + Node *n, *limit = gnodelast(h); + /* if there is array part, assume it may have white values (do not + traverse it just to check) */ + int hasclears = (h->sizearray > 0); + for (n = gnode(h, 0); n < limit; n++) { + checkdeadkey(n); + if (ttisnil(gval(n))) /* entry is empty? */ + removeentry(n); /* remove it */ + else { + lua_assert(!ttisnil(gkey(n))); + markvalue(g, gkey(n)); /* mark key */ + if (!hasclears && iscleared(g, gval(n))) /* is there a white value? */ + hasclears = 1; /* table will have to be cleared */ + } + } + if (hasclears) + linktable(h, &g->weak); /* has to be cleared later */ + else /* no white values */ + linktable(h, &g->grayagain); /* no need to clean */ +} + + +static int traverseephemeron (global_State *g, Table *h) { + int marked = 0; /* true if an object is marked in this traversal */ + int hasclears = 0; /* true if table has white keys */ + int prop = 0; /* true if table has entry "white-key -> white-value" */ + Node *n, *limit = gnodelast(h); + int i; + /* traverse array part (numeric keys are 'strong') */ + for (i = 0; i < h->sizearray; i++) { + if (valiswhite(&h->array[i])) { + marked = 1; + reallymarkobject(g, gcvalue(&h->array[i])); + } + } + /* traverse hash part */ + for (n = gnode(h, 0); n < limit; n++) { + checkdeadkey(n); + if (ttisnil(gval(n))) /* entry is empty? */ + removeentry(n); /* remove it */ + else if (iscleared(g, gkey(n))) { /* key is not marked (yet)? */ + hasclears = 1; /* table must be cleared */ + if (valiswhite(gval(n))) /* value not marked yet? */ + prop = 1; /* must propagate again */ + } + else if (valiswhite(gval(n))) { /* value not marked yet? */ + marked = 1; + reallymarkobject(g, gcvalue(gval(n))); /* mark it now */ + } + } + if (g->gcstate != GCSatomic || prop) + linktable(h, &g->ephemeron); /* have to propagate again */ + else if (hasclears) /* does table have white keys? */ + linktable(h, &g->allweak); /* may have to clean white keys */ + else /* no white keys */ + linktable(h, &g->grayagain); /* no need to clean */ + return marked; +} + + +static void traversestrongtable (global_State *g, Table *h) { + Node *n, *limit = gnodelast(h); + int i; + for (i = 0; i < h->sizearray; i++) /* traverse array part */ + markvalue(g, &h->array[i]); + for (n = gnode(h, 0); n < limit; n++) { /* traverse hash part */ + checkdeadkey(n); + if (ttisnil(gval(n))) /* entry is empty? 
*/ + removeentry(n); /* remove it */ + else { + lua_assert(!ttisnil(gkey(n))); + markvalue(g, gkey(n)); /* mark key */ + markvalue(g, gval(n)); /* mark value */ + } + } +} + + +static lu_mem traversetable (global_State *g, Table *h) { + const char *weakkey, *weakvalue; + const TValue *mode = gfasttm(g, h->metatable, TM_MODE); + markobject(g, h->metatable); + if (mode && ttisstring(mode) && /* is there a weak mode? */ + ((weakkey = strchr(svalue(mode), 'k')), + (weakvalue = strchr(svalue(mode), 'v')), + (weakkey || weakvalue))) { /* is really weak? */ + black2gray(obj2gco(h)); /* keep table gray */ + if (!weakkey) /* strong keys? */ + traverseweakvalue(g, h); + else if (!weakvalue) /* strong values? */ + traverseephemeron(g, h); + else /* all weak */ + linktable(h, &g->allweak); /* nothing to traverse now */ + } + else /* not weak */ + traversestrongtable(g, h); + return sizeof(Table) + sizeof(TValue) * h->sizearray + + sizeof(Node) * cast(size_t, sizenode(h)); +} + + +static int traverseproto (global_State *g, Proto *f) { + int i; + if (f->cache && iswhite(obj2gco(f->cache))) + f->cache = NULL; /* allow cache to be collected */ + markobject(g, f->source); + for (i = 0; i < f->sizek; i++) /* mark literals */ + markvalue(g, &f->k[i]); + for (i = 0; i < f->sizeupvalues; i++) /* mark upvalue names */ + markobject(g, f->upvalues[i].name); + for (i = 0; i < f->sizep; i++) /* mark nested protos */ + markobject(g, f->p[i]); + for (i = 0; i < f->sizelocvars; i++) /* mark local-variable names */ + markobject(g, f->locvars[i].varname); + return sizeof(Proto) + sizeof(Instruction) * f->sizecode + + sizeof(Proto *) * f->sizep + + sizeof(TValue) * f->sizek + + sizeof(int) * f->sizelineinfo + + sizeof(LocVar) * f->sizelocvars + + sizeof(Upvaldesc) * f->sizeupvalues; +} + + +static lu_mem traverseCclosure (global_State *g, CClosure *cl) { + int i; + for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ + markvalue(g, &cl->upvalue[i]); + return sizeCclosure(cl->nupvalues); +} + +static lu_mem traverseLclosure (global_State *g, LClosure *cl) { + int i; + markobject(g, cl->p); /* mark its prototype */ + for (i = 0; i < cl->nupvalues; i++) /* mark its upvalues */ + markobject(g, cl->upvals[i]); + return sizeLclosure(cl->nupvalues); +} + + +static lu_mem traversestack (global_State *g, lua_State *th) { + int n = 0; + StkId o = th->stack; + if (o == NULL) + return 1; /* stack not completely built yet */ + for (; o < th->top; o++) /* mark live elements in the stack */ + markvalue(g, o); + if (g->gcstate == GCSatomic) { /* final traversal? */ + StkId lim = th->stack + th->stacksize; /* real end of stack */ + for (; o < lim; o++) /* clear not-marked stack slice */ + setnilvalue(o); + } + else { /* count call infos to compute size */ + CallInfo *ci; + for (ci = &th->base_ci; ci != th->ci; ci = ci->next) + n++; + } + return sizeof(lua_State) + sizeof(TValue) * th->stacksize + + sizeof(CallInfo) * n; +} + + +/* +** traverse one gray object, turning it to black (except for threads, +** which are always gray). 
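+** (Threads stay gray because their stacks are written without barriers;
+** 'propagatemark' relinks them into 'grayagain' so they are traversed
+** once more in the atomic phase.)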
+*/ +static void propagatemark (global_State *g) { + lu_mem size; + GCObject *o = g->gray; + lua_assert(isgray(o)); + gray2black(o); + switch (gch(o)->tt) { + case LUA_TTABLE: { + Table *h = gco2t(o); + g->gray = h->gclist; /* remove from 'gray' list */ + size = traversetable(g, h); + break; + } + case LUA_TLCL: { + LClosure *cl = gco2lcl(o); + g->gray = cl->gclist; /* remove from 'gray' list */ + size = traverseLclosure(g, cl); + break; + } + case LUA_TCCL: { + CClosure *cl = gco2ccl(o); + g->gray = cl->gclist; /* remove from 'gray' list */ + size = traverseCclosure(g, cl); + break; + } + case LUA_TTHREAD: { + lua_State *th = gco2th(o); + g->gray = th->gclist; /* remove from 'gray' list */ + th->gclist = g->grayagain; + g->grayagain = o; /* insert into 'grayagain' list */ + black2gray(o); + size = traversestack(g, th); + break; + } + case LUA_TPROTO: { + Proto *p = gco2p(o); + g->gray = p->gclist; /* remove from 'gray' list */ + size = traverseproto(g, p); + break; + } + default: lua_assert(0); return; + } + g->GCmemtrav += size; +} + + +static void propagateall (global_State *g) { + while (g->gray) propagatemark(g); +} + + +static void propagatelist (global_State *g, GCObject *l) { + lua_assert(g->gray == NULL); /* no grays left */ + g->gray = l; + propagateall(g); /* traverse all elements from 'l' */ +} + +/* +** retraverse all gray lists. Because tables may be reinserted in other +** lists when traversed, traverse the original lists to avoid traversing +** twice the same table (which is not wrong, but inefficient) +*/ +static void retraversegrays (global_State *g) { + GCObject *weak = g->weak; /* save original lists */ + GCObject *grayagain = g->grayagain; + GCObject *ephemeron = g->ephemeron; + g->weak = g->grayagain = g->ephemeron = NULL; + propagateall(g); /* traverse main gray list */ + propagatelist(g, grayagain); + propagatelist(g, weak); + propagatelist(g, ephemeron); +} + + +static void convergeephemerons (global_State *g) { + int changed; + do { + GCObject *w; + GCObject *next = g->ephemeron; /* get ephemeron list */ + g->ephemeron = NULL; /* tables will return to this list when traversed */ + changed = 0; + while ((w = next) != NULL) { + next = gco2t(w)->gclist; + if (traverseephemeron(g, gco2t(w))) { /* traverse marked some value? */ + propagateall(g); /* propagate changes */ + changed = 1; /* will have to revisit all ephemeron tables */ + } + } + } while (changed); +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Sweep Functions +** ======================================================= +*/ + + +/* +** clear entries with unmarked keys from all weaktables in list 'l' up +** to element 'f' +*/ +static void clearkeys (global_State *g, GCObject *l, GCObject *f) { + for (; l != f; l = gco2t(l)->gclist) { + Table *h = gco2t(l); + Node *n, *limit = gnodelast(h); + for (n = gnode(h, 0); n < limit; n++) { + if (!ttisnil(gval(n)) && (iscleared(g, gkey(n)))) { + setnilvalue(gval(n)); /* remove value ... */ + removeentry(n); /* and remove entry from table */ + } + } + } +} + + +/* +** clear entries with unmarked values from all weaktables in list 'l' up +** to element 'f' +*/ +static void clearvalues (global_State *g, GCObject *l, GCObject *f) { + for (; l != f; l = gco2t(l)->gclist) { + Table *h = gco2t(l); + Node *n, *limit = gnodelast(h); + int i; + for (i = 0; i < h->sizearray; i++) { + TValue *o = &h->array[i]; + if (iscleared(g, o)) /* value was collected? 
*/ + setnilvalue(o); /* remove value */ + } + for (n = gnode(h, 0); n < limit; n++) { + if (!ttisnil(gval(n)) && iscleared(g, gval(n))) { + setnilvalue(gval(n)); /* remove value ... */ + removeentry(n); /* and remove entry from table */ + } + } + } +} + + +static void freeobj (lua_State *L, GCObject *o) { + switch (gch(o)->tt) { + case LUA_TPROTO: luaF_freeproto(L, gco2p(o)); break; + case LUA_TLCL: { + luaM_freemem(L, o, sizeLclosure(gco2lcl(o)->nupvalues)); + break; + } + case LUA_TCCL: { + luaM_freemem(L, o, sizeCclosure(gco2ccl(o)->nupvalues)); + break; + } + case LUA_TUPVAL: luaF_freeupval(L, gco2uv(o)); break; + case LUA_TTABLE: luaH_free(L, gco2t(o)); break; + case LUA_TTHREAD: luaE_freethread(L, gco2th(o)); break; + case LUA_TUSERDATA: luaM_freemem(L, o, sizeudata(gco2u(o))); break; + case LUA_TSHRSTR: + G(L)->strt.nuse--; + /* FALLTHROUGH */ + case LUA_TLNGSTR: { + luaM_freemem(L, o, sizestring(gco2ts(o))); + break; + } + default: lua_assert(0); + } +} + + +#define sweepwholelist(L,p) sweeplist(L,p,MAX_LUMEM) +static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count); + + +/* +** sweep the (open) upvalues of a thread and resize its stack and +** list of call-info structures. +*/ +static void sweepthread (lua_State *L, lua_State *L1) { + if (L1->stack == NULL) return; /* stack not completely built yet */ + sweepwholelist(L, &L1->openupval); /* sweep open upvalues */ + luaE_freeCI(L1); /* free extra CallInfo slots */ + /* should not change the stack during an emergency gc cycle */ + if (G(L)->gckind != KGC_EMERGENCY) + luaD_shrinkstack(L1); +} + + +/* +** sweep at most 'count' elements from a list of GCObjects erasing dead +** objects, where a dead (not alive) object is one marked with the "old" +** (non current) white and not fixed. +** In non-generational mode, change all non-dead objects back to white, +** preparing for next collection cycle. +** In generational mode, keep black objects black, and also mark them as +** old; stop when hitting an old object, as all objects after that +** one will be old too. +** When object is a thread, sweep its list of open upvalues too. +*/ +static GCObject **sweeplist (lua_State *L, GCObject **p, lu_mem count) { + global_State *g = G(L); + int ow = otherwhite(g); + int toclear, toset; /* bits to clear and to set in all live objects */ + int tostop; /* stop sweep when this is true */ + if (isgenerational(g)) { /* generational mode? */ + toclear = ~0; /* clear nothing */ + toset = bitmask(OLDBIT); /* set the old bit of all surviving objects */ + tostop = bitmask(OLDBIT); /* do not sweep old generation */ + } + else { /* normal mode */ + toclear = maskcolors; /* clear all color bits + old bit */ + toset = luaC_white(g); /* make object white */ + tostop = 0; /* do not stop */ + } + while (*p != NULL && count-- > 0) { + GCObject *curr = *p; + int marked = gch(curr)->marked; + if (isdeadm(ow, marked)) { /* is 'curr' dead? */ + *p = gch(curr)->next; /* remove 'curr' from list */ + freeobj(L, curr); /* erase 'curr' */ + } + else { + if (testbits(marked, tostop)) + return NULL; /* stop sweeping this list */ + if (gch(curr)->tt == LUA_TTHREAD) + sweepthread(L, gco2th(curr)); /* sweep thread's upvalues */ + /* update marks */ + gch(curr)->marked = cast_byte((marked & toclear) | toset); + p = &gch(curr)->next; /* go to next element */ + } + } + return (*p == NULL) ? 
NULL : p; +} + + +/* +** sweep a list until a live object (or end of list) +*/ +static GCObject **sweeptolive (lua_State *L, GCObject **p, int *n) { + GCObject ** old = p; + int i = 0; + do { + i++; + p = sweeplist(L, p, 1); + } while (p == old); + if (n) *n += i; + return p; +} + +/* }====================================================== */ + + +/* +** {====================================================== +** Finalization +** ======================================================= +*/ + +static void checkSizes (lua_State *L) { + global_State *g = G(L); + if (g->gckind != KGC_EMERGENCY) { /* do not change sizes in emergency */ + int hs = g->strt.size / 2; /* half the size of the string table */ + if (g->strt.nuse < cast(lu_int32, hs)) /* using less than that half? */ + luaS_resize(L, hs); /* halve its size */ + luaZ_freebuffer(L, &g->buff); /* free concatenation buffer */ + } +} + + +static GCObject *udata2finalize (global_State *g) { + GCObject *o = g->tobefnz; /* get first element */ + lua_assert(isfinalized(o)); + g->tobefnz = gch(o)->next; /* remove it from 'tobefnz' list */ + gch(o)->next = g->allgc; /* return it to 'allgc' list */ + g->allgc = o; + resetbit(gch(o)->marked, SEPARATED); /* mark that it is not in 'tobefnz' */ + lua_assert(!isold(o)); /* see MOVE OLD rule */ + if (!keepinvariantout(g)) /* not keeping invariant? */ + makewhite(g, o); /* "sweep" object */ + return o; +} + + +static void dothecall (lua_State *L, void *ud) { + UNUSED(ud); + luaD_call(L, L->top - 2, 0, 0); +} + + +static void GCTM (lua_State *L, int propagateerrors) { + global_State *g = G(L); + const TValue *tm; + TValue v; + setgcovalue(L, &v, udata2finalize(g)); + tm = luaT_gettmbyobj(L, &v, TM_GC); + if (tm != NULL && ttisfunction(tm)) { /* is there a finalizer? */ + int status; + lu_byte oldah = L->allowhook; + int running = g->gcrunning; + L->allowhook = 0; /* stop debug hooks during GC metamethod */ + g->gcrunning = 0; /* avoid GC steps */ + setobj2s(L, L->top, tm); /* push finalizer... */ + setobj2s(L, L->top + 1, &v); /* ... and its argument */ + L->top += 2; /* and (next line) call the finalizer */ + status = luaD_pcall(L, dothecall, NULL, savestack(L, L->top - 2), 0); + L->allowhook = oldah; /* restore hooks */ + g->gcrunning = running; /* restore state */ + if (status != LUA_OK && propagateerrors) { /* error while running __gc? */ + if (status == LUA_ERRRUN) { /* is there an error object? */ + const char *msg = (ttisstring(L->top - 1)) + ? svalue(L->top - 1) + : "no message"; + luaO_pushfstring(L, "error in __gc metamethod (%s)", msg); + status = LUA_ERRGCMM; /* error in __gc metamethod */ + } + luaD_throw(L, status); /* re-throw error */ + } + } +} + + +/* +** move all unreachable objects (or 'all' objects) that need +** finalization from list 'finobj' to list 'tobefnz' (to be finalized) +*/ +static void separatetobefnz (lua_State *L, int all) { + global_State *g = G(L); + GCObject **p = &g->finobj; + GCObject *curr; + GCObject **lastnext = &g->tobefnz; + /* find last 'next' field in 'tobefnz' list (to add elements in its end) */ + while (*lastnext != NULL) + lastnext = &gch(*lastnext)->next; + while ((curr = *p) != NULL) { /* traverse all finalizable objects */ + lua_assert(!isfinalized(curr)); + lua_assert(testbit(gch(curr)->marked, SEPARATED)); + if (!(iswhite(curr) || all)) /* not being collected? 
*/ + p = &gch(curr)->next; /* don't bother with it */ + else { + l_setbit(gch(curr)->marked, FINALIZEDBIT); /* won't be finalized again */ + *p = gch(curr)->next; /* remove 'curr' from 'finobj' list */ + gch(curr)->next = *lastnext; /* link at the end of 'tobefnz' list */ + *lastnext = curr; + lastnext = &gch(curr)->next; + } + } +} + + +/* +** if object 'o' has a finalizer, remove it from 'allgc' list (must +** search the list to find it) and link it in 'finobj' list. +*/ +void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt) { + global_State *g = G(L); + if (testbit(gch(o)->marked, SEPARATED) || /* obj. is already separated... */ + isfinalized(o) || /* ... or is finalized... */ + gfasttm(g, mt, TM_GC) == NULL) /* or has no finalizer? */ + return; /* nothing to be done */ + else { /* move 'o' to 'finobj' list */ + GCObject **p; + GCheader *ho = gch(o); + if (g->sweepgc == &ho->next) { /* avoid removing current sweep object */ + lua_assert(issweepphase(g)); + g->sweepgc = sweeptolive(L, g->sweepgc, NULL); + } + /* search for pointer pointing to 'o' */ + for (p = &g->allgc; *p != o; p = &gch(*p)->next) { /* empty */ } + *p = ho->next; /* remove 'o' from root list */ + ho->next = g->finobj; /* link it in list 'finobj' */ + g->finobj = o; + l_setbit(ho->marked, SEPARATED); /* mark it as such */ + if (!keepinvariantout(g)) /* not keeping invariant? */ + makewhite(g, o); /* "sweep" object */ + else + resetoldbit(o); /* see MOVE OLD rule */ + } +} + +/* }====================================================== */ + + +/* +** {====================================================== +** GC control +** ======================================================= +*/ + + +/* +** set a reasonable "time" to wait before starting a new GC cycle; +** cycle will start when memory use hits threshold +*/ +static void setpause (global_State *g, l_mem estimate) { + l_mem debt, threshold; + estimate = estimate / PAUSEADJ; /* adjust 'estimate' */ + threshold = (g->gcpause < MAX_LMEM / estimate) /* overflow? */ + ? estimate * g->gcpause /* no overflow */ + : MAX_LMEM; /* overflow; truncate to maximum */ + debt = -cast(l_mem, threshold - gettotalbytes(g)); + luaE_setdebt(g, debt); +} + + +#define sweepphases \ + (bitmask(GCSsweepstring) | bitmask(GCSsweepudata) | bitmask(GCSsweep)) + + +/* +** enter first sweep phase (strings) and prepare pointers for other +** sweep phases. The calls to 'sweeptolive' make pointers point to an +** object inside the list (instead of to the header), so that the real +** sweep do not need to skip objects created between "now" and the start +** of the real sweep. +** Returns how many objects it swept. 
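+** (The caller charges this work against the GC debt at GCSWEEPCOST per
+** object swept; see the GCSpropagate case in 'singlestep'.)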
+*/ +static int entersweep (lua_State *L) { + global_State *g = G(L); + int n = 0; + g->gcstate = GCSsweepstring; + lua_assert(g->sweepgc == NULL && g->sweepfin == NULL); + /* prepare to sweep strings, finalizable objects, and regular objects */ + g->sweepstrgc = 0; + g->sweepfin = sweeptolive(L, &g->finobj, &n); + g->sweepgc = sweeptolive(L, &g->allgc, &n); + return n; +} + + +/* +** change GC mode +*/ +void luaC_changemode (lua_State *L, int mode) { + global_State *g = G(L); + if (mode == g->gckind) return; /* nothing to change */ + if (mode == KGC_GEN) { /* change to generational mode */ + /* make sure gray lists are consistent */ + luaC_runtilstate(L, bitmask(GCSpropagate)); + g->GCestimate = gettotalbytes(g); + g->gckind = KGC_GEN; + } + else { /* change to incremental mode */ + /* sweep all objects to turn them back to white + (as white has not changed, nothing extra will be collected) */ + g->gckind = KGC_NORMAL; + entersweep(L); + luaC_runtilstate(L, ~sweepphases); + } +} + + +/* +** call all pending finalizers +*/ +static void callallpendingfinalizers (lua_State *L, int propagateerrors) { + global_State *g = G(L); + while (g->tobefnz) { + resetoldbit(g->tobefnz); + GCTM(L, propagateerrors); + } +} + + +void luaC_freeallobjects (lua_State *L) { + global_State *g = G(L); + int i; + separatetobefnz(L, 1); /* separate all objects with finalizers */ + lua_assert(g->finobj == NULL); + callallpendingfinalizers(L, 0); + g->currentwhite = WHITEBITS; /* this "white" makes all objects look dead */ + g->gckind = KGC_NORMAL; + sweepwholelist(L, &g->finobj); /* finalizers can create objs. in 'finobj' */ + sweepwholelist(L, &g->allgc); + for (i = 0; i < g->strt.size; i++) /* free all string lists */ + sweepwholelist(L, &g->strt.hash[i]); + lua_assert(g->strt.nuse == 0); +} + + +static l_mem atomic (lua_State *L) { + global_State *g = G(L); + l_mem work = -cast(l_mem, g->GCmemtrav); /* start counting work */ + GCObject *origweak, *origall; + lua_assert(!iswhite(obj2gco(g->mainthread))); + markobject(g, L); /* mark running thread */ + /* registry and global metatables may be changed by API */ + markvalue(g, &g->l_registry); + markmt(g); /* mark basic metatables */ + /* remark occasional upvalues of (maybe) dead threads */ + remarkupvals(g); + propagateall(g); /* propagate changes */ + work += g->GCmemtrav; /* stop counting (do not (re)count grays) */ + /* traverse objects caught by write barrier and by 'remarkupvals' */ + retraversegrays(g); + work -= g->GCmemtrav; /* restart counting */ + convergeephemerons(g); + /* at this point, all strongly accessible objects are marked. */ + /* clear values from weak tables, before checking finalizers */ + clearvalues(g, g->weak, NULL); + clearvalues(g, g->allweak, NULL); + origweak = g->weak; origall = g->allweak; + work += g->GCmemtrav; /* stop counting (objects being finalized) */ + separatetobefnz(L, 0); /* separate objects to be finalized */ + markbeingfnz(g); /* mark objects that will be finalized */ + propagateall(g); /* remark, to propagate `preserveness' */ + work -= g->GCmemtrav; /* restart counting */ + convergeephemerons(g); + /* at this point, all resurrected objects are marked. 
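+     (resurrected objects are those reachable only through 'tobefnz'; they
+     were remarked above so their finalizers can run against live values)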
*/ + /* remove dead objects from weak tables */ + clearkeys(g, g->ephemeron, NULL); /* clear keys from all ephemeron tables */ + clearkeys(g, g->allweak, NULL); /* clear keys from all allweak tables */ + /* clear values from resurrected weak tables */ + clearvalues(g, g->weak, origweak); + clearvalues(g, g->allweak, origall); + g->currentwhite = cast_byte(otherwhite(g)); /* flip current white */ + work += g->GCmemtrav; /* complete counting */ + return work; /* estimate of memory marked by 'atomic' */ +} + + +static lu_mem singlestep (lua_State *L) { + global_State *g = G(L); + switch (g->gcstate) { + case GCSpause: { + /* start to count memory traversed */ + g->GCmemtrav = g->strt.size * sizeof(GCObject*); + lua_assert(!isgenerational(g)); + restartcollection(g); + g->gcstate = GCSpropagate; + return g->GCmemtrav; + } + case GCSpropagate: { + if (g->gray) { + lu_mem oldtrav = g->GCmemtrav; + propagatemark(g); + return g->GCmemtrav - oldtrav; /* memory traversed in this step */ + } + else { /* no more `gray' objects */ + lu_mem work; + int sw; + g->gcstate = GCSatomic; /* finish mark phase */ + g->GCestimate = g->GCmemtrav; /* save what was counted */; + work = atomic(L); /* add what was traversed by 'atomic' */ + g->GCestimate += work; /* estimate of total memory traversed */ + sw = entersweep(L); + return work + sw * GCSWEEPCOST; + } + } + case GCSsweepstring: { + int i; + for (i = 0; i < GCSWEEPMAX && g->sweepstrgc + i < g->strt.size; i++) + sweepwholelist(L, &g->strt.hash[g->sweepstrgc + i]); + g->sweepstrgc += i; + if (g->sweepstrgc >= g->strt.size) /* no more strings to sweep? */ + g->gcstate = GCSsweepudata; + return i * GCSWEEPCOST; + } + case GCSsweepudata: { + if (g->sweepfin) { + g->sweepfin = sweeplist(L, g->sweepfin, GCSWEEPMAX); + return GCSWEEPMAX*GCSWEEPCOST; + } + else { + g->gcstate = GCSsweep; + return 0; + } + } + case GCSsweep: { + if (g->sweepgc) { + g->sweepgc = sweeplist(L, g->sweepgc, GCSWEEPMAX); + return GCSWEEPMAX*GCSWEEPCOST; + } + else { + /* sweep main thread */ + GCObject *mt = obj2gco(g->mainthread); + sweeplist(L, &mt, 1); + checkSizes(L); + g->gcstate = GCSpause; /* finish collection */ + return GCSWEEPCOST; + } + } + default: lua_assert(0); return 0; + } +} + + +/* +** advances the garbage collector until it reaches a state allowed +** by 'statemask' +*/ +void luaC_runtilstate (lua_State *L, int statesmask) { + global_State *g = G(L); + while (!testbit(statesmask, g->gcstate)) + singlestep(L); +} + + +static void generationalcollection (lua_State *L) { + global_State *g = G(L); + lua_assert(g->gcstate == GCSpropagate); + if (g->GCestimate == 0) { /* signal for another major collection? */ + luaC_fullgc(L, 0); /* perform a full regular collection */ + g->GCestimate = gettotalbytes(g); /* update control */ + } + else { + lu_mem estimate = g->GCestimate; + luaC_runtilstate(L, bitmask(GCSpause)); /* run complete (minor) cycle */ + g->gcstate = GCSpropagate; /* skip restart */ + if (gettotalbytes(g) > (estimate / 100) * g->gcmajorinc) + g->GCestimate = 0; /* signal for a major collection */ + else + g->GCestimate = estimate; /* keep estimate from last major coll. 
*/ + + } + setpause(g, gettotalbytes(g)); + lua_assert(g->gcstate == GCSpropagate); +} + + +static void incstep (lua_State *L) { + global_State *g = G(L); + l_mem debt = g->GCdebt; + int stepmul = g->gcstepmul; + if (stepmul < 40) stepmul = 40; /* avoid ridiculous low values (and 0) */ + /* convert debt from Kb to 'work units' (avoid zero debt and overflows) */ + debt = (debt / STEPMULADJ) + 1; + debt = (debt < MAX_LMEM / stepmul) ? debt * stepmul : MAX_LMEM; + do { /* always perform at least one single step */ + lu_mem work = singlestep(L); /* do some work */ + debt -= work; + } while (debt > -GCSTEPSIZE && g->gcstate != GCSpause); + if (g->gcstate == GCSpause) + setpause(g, g->GCestimate); /* pause until next cycle */ + else { + debt = (debt / stepmul) * STEPMULADJ; /* convert 'work units' to Kb */ + luaE_setdebt(g, debt); + } +} + + +/* +** performs a basic GC step +*/ +void luaC_forcestep (lua_State *L) { + global_State *g = G(L); + int i; + if (isgenerational(g)) generationalcollection(L); + else incstep(L); + /* run a few finalizers (or all of them at the end of a collect cycle) */ + for (i = 0; g->tobefnz && (i < GCFINALIZENUM || g->gcstate == GCSpause); i++) + GCTM(L, 1); /* call one finalizer */ +} + + +/* +** performs a basic GC step only if collector is running +*/ +void luaC_step (lua_State *L) { + global_State *g = G(L); + if (g->gcrunning) luaC_forcestep(L); + else luaE_setdebt(g, -GCSTEPSIZE); /* avoid being called too often */ +} + + + +/* +** performs a full GC cycle; if "isemergency", does not call +** finalizers (which could change stack positions) +*/ +void luaC_fullgc (lua_State *L, int isemergency) { + global_State *g = G(L); + int origkind = g->gckind; + lua_assert(origkind != KGC_EMERGENCY); + if (isemergency) /* do not run finalizers during emergency GC */ + g->gckind = KGC_EMERGENCY; + else { + g->gckind = KGC_NORMAL; + callallpendingfinalizers(L, 1); + } + if (keepinvariant(g)) { /* may there be some black objects? */ + /* must sweep all objects to turn them back to white + (as white has not changed, nothing will be collected) */ + entersweep(L); + } + /* finish any pending sweep phase to start a new cycle */ + luaC_runtilstate(L, bitmask(GCSpause)); + luaC_runtilstate(L, ~bitmask(GCSpause)); /* start new collection */ + luaC_runtilstate(L, bitmask(GCSpause)); /* run entire collection */ + if (origkind == KGC_GEN) { /* generational mode? */ + /* generational mode must be kept in propagate phase */ + luaC_runtilstate(L, bitmask(GCSpropagate)); + } + g->gckind = origkind; + setpause(g, gettotalbytes(g)); + if (!isemergency) /* do not run finalizers during emergency GC */ + callallpendingfinalizers(L, 1); +} + +/* }====================================================== */ +/* END CSTYLED */ diff --git a/module/lua/lgc.h b/module/lua/lgc.h new file mode 100644 index 000000000000..34097a45edfc --- /dev/null +++ b/module/lua/lgc.h @@ -0,0 +1,159 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lgc.h,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ +** Garbage Collector +** See Copyright Notice in lua.h +*/ + +#ifndef lgc_h +#define lgc_h + + +#include "lobject.h" +#include "lstate.h" + +/* +** Collectable objects may have one of three colors: white, which +** means the object is not marked; gray, which means the +** object is marked, but its references may be not marked; and +** black, which means that the object and all its references are marked. +** The main invariant of the garbage collector, while marking objects, +** is that a black object can never point to a white one. 
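+** (The barrier macros near the end of this header restore the invariant
+** whenever the mutator stores a white object into a black one.)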
Moreover, +** any gray object must be in a "gray list" (gray, grayagain, weak, +** allweak, ephemeron) so that it can be visited again before finishing +** the collection cycle. These lists have no meaning when the invariant +** is not being enforced (e.g., sweep phase). +*/ + + + +/* how much to allocate before next GC step */ +#if !defined(GCSTEPSIZE) +/* ~100 small strings */ +#define GCSTEPSIZE (cast_int(100 * sizeof(TString))) +#endif + + +/* +** Possible states of the Garbage Collector +*/ +#define GCSpropagate 0 +#define GCSatomic 1 +#define GCSsweepstring 2 +#define GCSsweepudata 3 +#define GCSsweep 4 +#define GCSpause 5 + + +#define issweepphase(g) \ + (GCSsweepstring <= (g)->gcstate && (g)->gcstate <= GCSsweep) + +#define isgenerational(g) ((g)->gckind == KGC_GEN) + +/* +** macros to tell when main invariant (white objects cannot point to black +** ones) must be kept. During a non-generational collection, the sweep +** phase may break the invariant, as objects turned white may point to +** still-black objects. The invariant is restored when sweep ends and +** all objects are white again. During a generational collection, the +** invariant must be kept all times. +*/ + +#define keepinvariant(g) (isgenerational(g) || g->gcstate <= GCSatomic) + + +/* +** Outside the collector, the state in generational mode is kept in +** 'propagate', so 'keepinvariant' is always true. +*/ +#define keepinvariantout(g) \ + check_exp(g->gcstate == GCSpropagate || !isgenerational(g), \ + g->gcstate <= GCSatomic) + + +/* +** some useful bit tricks +*/ +#define resetbits(x,m) ((x) &= cast(lu_byte, ~(m))) +#define setbits(x,m) ((x) |= (m)) +#define testbits(x,m) ((x) & (m)) +#define bitmask(b) (1<<(b)) +#define bit2mask(b1,b2) (bitmask(b1) | bitmask(b2)) +#define l_setbit(x,b) setbits(x, bitmask(b)) +#define resetbit(x,b) resetbits(x, bitmask(b)) +#define testbit(x,b) testbits(x, bitmask(b)) + + +/* Layout for bit use in `marked' field: */ +#define WHITE0BIT 0 /* object is white (type 0) */ +#define WHITE1BIT 1 /* object is white (type 1) */ +#define BLACKBIT 2 /* object is black */ +#define FINALIZEDBIT 3 /* object has been separated for finalization */ +#define SEPARATED 4 /* object is in 'finobj' list or in 'tobefnz' */ +#define FIXEDBIT 5 /* object is fixed (should not be collected) */ +#define OLDBIT 6 /* object is old (only in generational mode) */ +/* bit 7 is currently used by tests (luaL_checkmemory) */ + +#define WHITEBITS bit2mask(WHITE0BIT, WHITE1BIT) + + +#define iswhite(x) testbits((x)->gch.marked, WHITEBITS) +#define isblack(x) testbit((x)->gch.marked, BLACKBIT) +#define isgray(x) /* neither white nor black */ \ + (!testbits((x)->gch.marked, WHITEBITS | bitmask(BLACKBIT))) + +#define isold(x) testbit((x)->gch.marked, OLDBIT) + +/* MOVE OLD rule: whenever an object is moved to the beginning of + a GC list, its old bit must be cleared */ +#define resetoldbit(o) resetbit((o)->gch.marked, OLDBIT) + +#define otherwhite(g) (g->currentwhite ^ WHITEBITS) +#define isdeadm(ow,m) (!(((m) ^ WHITEBITS) & (ow))) +#define isdead(g,v) isdeadm(otherwhite(g), (v)->gch.marked) + +#define changewhite(x) ((x)->gch.marked ^= WHITEBITS) +#define gray2black(x) l_setbit((x)->gch.marked, BLACKBIT) + +#define valiswhite(x) (iscollectable(x) && iswhite(gcvalue(x))) + +#define luaC_white(g) cast(lu_byte, (g)->currentwhite & WHITEBITS) + + +#define luaC_condGC(L,c) \ + {if (G(L)->GCdebt > 0) {c;}; condchangemem(L);} +#define luaC_checkGC(L) luaC_condGC(L, luaC_step(L);) + + +#define luaC_barrier(L,p,v) { if 
(valiswhite(v) && isblack(obj2gco(p))) \ + luaC_barrier_(L,obj2gco(p),gcvalue(v)); } + +#define luaC_barrierback(L,p,v) { if (valiswhite(v) && isblack(obj2gco(p))) \ + luaC_barrierback_(L,p); } + +#define luaC_objbarrier(L,p,o) \ + { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) \ + luaC_barrier_(L,obj2gco(p),obj2gco(o)); } + +#define luaC_objbarrierback(L,p,o) \ + { if (iswhite(obj2gco(o)) && isblack(obj2gco(p))) luaC_barrierback_(L,p); } + +#define luaC_barrierproto(L,p,c) \ + { if (isblack(obj2gco(p))) luaC_barrierproto_(L,p,c); } + +LUAI_FUNC void luaC_freeallobjects (lua_State *L); +LUAI_FUNC void luaC_step (lua_State *L); +LUAI_FUNC void luaC_forcestep (lua_State *L); +LUAI_FUNC void luaC_runtilstate (lua_State *L, int statesmask); +LUAI_FUNC void luaC_fullgc (lua_State *L, int isemergency); +LUAI_FUNC GCObject *luaC_newobj (lua_State *L, int tt, size_t sz, + GCObject **list, int offset); +LUAI_FUNC void luaC_barrier_ (lua_State *L, GCObject *o, GCObject *v); +LUAI_FUNC void luaC_barrierback_ (lua_State *L, GCObject *o); +LUAI_FUNC void luaC_barrierproto_ (lua_State *L, Proto *p, Closure *c); +LUAI_FUNC void luaC_checkfinalizer (lua_State *L, GCObject *o, Table *mt); +LUAI_FUNC void luaC_checkupvalcolor (global_State *g, UpVal *uv); +LUAI_FUNC void luaC_changemode (lua_State *L, int mode); + +#endif +/* END CSTYLED */ diff --git a/module/lua/llex.c b/module/lua/llex.c new file mode 100644 index 000000000000..50c301f599f1 --- /dev/null +++ b/module/lua/llex.c @@ -0,0 +1,531 @@ +/* BEGIN CSTYLED */ +/* +** $Id: llex.c,v 2.63.1.3 2015/02/09 17:56:34 roberto Exp $ +** Lexical Analyzer +** See Copyright Notice in lua.h +*/ + +#define llex_c +#define LUA_CORE + +#include <sys/lua/lua.h> + +#include "lctype.h" +#include "ldo.h" +#include "llex.h" +#include "lobject.h" +#include "lparser.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "lzio.h" + + + +#define next(ls) (ls->current = zgetc(ls->z)) + + + +#define currIsNewline(ls) (ls->current == '\n' || ls->current == '\r') + + +/* ORDER RESERVED */ +static const char *const luaX_tokens [] = { + "and", "break", "do", "else", "elseif", + "end", "false", "for", "function", "goto", "if", + "in", "local", "nil", "not", "or", "repeat", + "return", "then", "true", "until", "while", + "..", "...", "==", ">=", "<=", "~=", "::", "<eof>", + "<number>", "<name>", "<string>" +}; + + +#define save_and_next(ls) (save(ls, ls->current), next(ls)) + + +static l_noret lexerror (LexState *ls, const char *msg, int token); + + +static void save (LexState *ls, int c) { + Mbuffer *b = ls->buff; + if (luaZ_bufflen(b) + 1 > luaZ_sizebuffer(b)) { + size_t newsize; + if (luaZ_sizebuffer(b) >= MAX_SIZET/2) + lexerror(ls, "lexical element too long", 0); + newsize = luaZ_sizebuffer(b) * 2; + luaZ_resizebuffer(ls->L, b, newsize); + } + b->buffer[luaZ_bufflen(b)++] = cast(char, c); +} + + +void luaX_init (lua_State *L) { + int i; + for (i=0; i<NUM_RESERVED; i++) { + TString *ts = luaS_new(L, luaX_tokens[i]); + luaS_fix(ts); /* reserved words are never collected */ + ts->tsv.extra = cast_byte(i+1); /* reserved word */ + } +} + + +const char *luaX_token2str (LexState *ls, int token) { + if (token < FIRST_RESERVED) { /* single-byte symbols? */ + lua_assert(token == cast(unsigned char, token)); + return (lisprint(token)) ? luaO_pushfstring(ls->L, LUA_QL("%c"), token) : + luaO_pushfstring(ls->L, "char(%d)", token); + } + else { + const char *s = luaX_tokens[token - FIRST_RESERVED]; + if (token < TK_EOS) /* fixed format (symbols and reserved words)? 
*/ + return luaO_pushfstring(ls->L, LUA_QS, s); + else /* names, strings, and numerals */ + return s; + } +} + + +static const char *txtToken (LexState *ls, int token) { + switch (token) { + case TK_NAME: + case TK_STRING: + case TK_NUMBER: + save(ls, '\0'); + return luaO_pushfstring(ls->L, LUA_QS, luaZ_buffer(ls->buff)); + default: + return luaX_token2str(ls, token); + } +} + + +static l_noret lexerror (LexState *ls, const char *msg, int token) { + char buff[LUA_IDSIZE]; + luaO_chunkid(buff, getstr(ls->source), LUA_IDSIZE); + msg = luaO_pushfstring(ls->L, "%s:%d: %s", buff, ls->linenumber, msg); + if (token) + luaO_pushfstring(ls->L, "%s near %s", msg, txtToken(ls, token)); + luaD_throw(ls->L, LUA_ERRSYNTAX); +} + + +l_noret luaX_syntaxerror (LexState *ls, const char *msg) { + lexerror(ls, msg, ls->t.token); +} + + +/* +** creates a new string and anchors it in function's table so that +** it will not be collected until the end of the function's compilation +** (by that time it should be anchored in function's prototype) +*/ +TString *luaX_newstring (LexState *ls, const char *str, size_t l) { + lua_State *L = ls->L; + TValue *o; /* entry for `str' */ + TString *ts = luaS_newlstr(L, str, l); /* create new string */ + setsvalue2s(L, L->top++, ts); /* temporarily anchor it in stack */ + o = luaH_set(L, ls->fs->h, L->top - 1); + if (ttisnil(o)) { /* not in use yet? (see 'addK') */ + /* boolean value does not need GC barrier; + table has no metatable, so it does not need to invalidate cache */ + setbvalue(o, 1); /* t[string] = true */ + luaC_checkGC(L); + } + else { /* string already present */ + ts = rawtsvalue(keyfromval(o)); /* re-use value previously stored */ + } + L->top--; /* remove string from stack */ + return ts; +} + + +/* +** increment line number and skips newline sequence (any of +** \n, \r, \n\r, or \r\n) +*/ +static void inclinenumber (LexState *ls) { + int old = ls->current; + lua_assert(currIsNewline(ls)); + next(ls); /* skip `\n' or `\r' */ + if (currIsNewline(ls) && ls->current != old) + next(ls); /* skip `\n\r' or `\r\n' */ + if (++ls->linenumber >= MAX_INT) + lexerror(ls, "chunk has too many lines", 0); +} + + +void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, TString *source, + int firstchar) { + ls->decpoint = '.'; + ls->L = L; + ls->current = firstchar; + ls->lookahead.token = TK_EOS; /* no look-ahead token */ + ls->z = z; + ls->fs = NULL; + ls->linenumber = 1; + ls->lastline = 1; + ls->source = source; + ls->envn = luaS_new(L, LUA_ENV); /* create env name */ + luaS_fix(ls->envn); /* never collect this name */ + luaZ_resizebuffer(ls->L, ls->buff, LUA_MINBUFFER); /* initialize buffer */ +} + + + +/* +** ======================================================= +** LEXICAL ANALYZER +** ======================================================= +*/ + + + +static int check_next (LexState *ls, const char *set) { + if (ls->current == '\0' || !strchr(set, ls->current)) + return 0; + save_and_next(ls); + return 1; +} + + +/* +** change all characters 'from' in buffer to 'to' +*/ +static void buffreplace (LexState *ls, char from, char to) { + size_t n = luaZ_bufflen(ls->buff); + char *p = luaZ_buffer(ls->buff); + while (n--) + if (p[n] == from) p[n] = to; +} + + +#if !defined(getlocaledecpoint) +#define getlocaledecpoint() (localeconv()->decimal_point[0]) +#endif + + +#define buff2d(b,e) luaO_str2d(luaZ_buffer(b), luaZ_bufflen(b) - 1, e) + +/* +** in case of format error, try to change decimal point separator to +** the one defined in the current locale and check again +*/ 
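+/*
+** (Aside, not part of the vendored Lua source: in stock userland Lua,
+** 'buff2d' above reaches 'luaO_str2d', which relies on the C library's
+** 'strtod', and 'strtod' honors the locale's decimal separator. A
+** minimal userland sketch of the behavior compensated for here,
+** assuming a "de_DE.UTF-8" locale with ',' as decimal point is
+** installed:)
+*/
+#if 0 /* illustrative only; never compiled into this module */
+#include <locale.h>
+#include <stdio.h>
+#include <stdlib.h>
+int main (void) {
+  setlocale(LC_NUMERIC, "de_DE.UTF-8");
+  printf("%f\n", strtod("3.14", NULL)); /* stops at '.': prints 3.000000 */
+  printf("%f\n", strtod("3,14", NULL)); /* locale form: prints 3.140000 */
+  return 0;
+}
+#endif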
+static void trydecpoint (LexState *ls, SemInfo *seminfo) { + char old = ls->decpoint; + ls->decpoint = getlocaledecpoint(); + buffreplace(ls, old, ls->decpoint); /* try new decimal separator */ + if (!buff2d(ls->buff, &seminfo->r)) { + /* format error with correct decimal point: no more options */ + buffreplace(ls, ls->decpoint, '.'); /* undo change (for error message) */ + lexerror(ls, "malformed number", TK_NUMBER); + } +} + + +/* LUA_NUMBER */ +/* +** this function is quite liberal in what it accepts, as 'luaO_str2d' +** will reject ill-formed numerals. +*/ +static void read_numeral (LexState *ls, SemInfo *seminfo) { + const char *expo = "Ee"; + int first = ls->current; + lua_assert(lisdigit(ls->current)); + save_and_next(ls); + if (first == '0' && check_next(ls, "Xx")) /* hexadecimal? */ + expo = "Pp"; + for (;;) { + if (check_next(ls, expo)) /* exponent part? */ + (void) check_next(ls, "+-"); /* optional exponent sign */ + if (lisxdigit(ls->current) || ls->current == '.') + save_and_next(ls); + else break; + } + save(ls, '\0'); + buffreplace(ls, '.', ls->decpoint); /* follow locale for decimal point */ + if (!buff2d(ls->buff, &seminfo->r)) /* format error? */ + trydecpoint(ls, seminfo); /* try to update decimal point separator */ +} + + +/* +** skip a sequence '[=*[' or ']=*]' and return its number of '='s or +** -1 if sequence is malformed +*/ +static int skip_sep (LexState *ls) { + int count = 0; + int s = ls->current; + lua_assert(s == '[' || s == ']'); + save_and_next(ls); + while (ls->current == '=') { + save_and_next(ls); + count++; + } + return (ls->current == s) ? count : (-count) - 1; +} + + +static void read_long_string (LexState *ls, SemInfo *seminfo, int sep) { + save_and_next(ls); /* skip 2nd `[' */ + if (currIsNewline(ls)) /* string starts with a newline? */ + inclinenumber(ls); /* skip it */ + for (;;) { + switch (ls->current) { + case EOZ: + lexerror(ls, (seminfo) ? 
"unfinished long string" : + "unfinished long comment", TK_EOS); + break; /* to avoid warnings */ + case ']': { + if (skip_sep(ls) == sep) { + save_and_next(ls); /* skip 2nd `]' */ + goto endloop; + } + break; + } + case '\n': case '\r': { + save(ls, '\n'); + inclinenumber(ls); + if (!seminfo) luaZ_resetbuffer(ls->buff); /* avoid wasting space */ + break; + } + default: { + if (seminfo) save_and_next(ls); + else next(ls); + } + } + } endloop: + if (seminfo) + seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + (2 + sep), + luaZ_bufflen(ls->buff) - 2*(2 + sep)); +} + + +static void escerror (LexState *ls, int *c, int n, const char *msg) { + int i; + luaZ_resetbuffer(ls->buff); /* prepare error message */ + save(ls, '\\'); + for (i = 0; i < n && c[i] != EOZ; i++) + save(ls, c[i]); + lexerror(ls, msg, TK_STRING); +} + + +static int readhexaesc (LexState *ls) { + int c[3], i; /* keep input for error message */ + int r = 0; /* result accumulator */ + c[0] = 'x'; /* for error message */ + for (i = 1; i < 3; i++) { /* read two hexadecimal digits */ + c[i] = next(ls); + if (!lisxdigit(c[i])) + escerror(ls, c, i + 1, "hexadecimal digit expected"); + r = (r << 4) + luaO_hexavalue(c[i]); + } + return r; +} + + +static int readdecesc (LexState *ls) { + int c[3], i; + int r = 0; /* result accumulator */ + for (i = 0; i < 3 && lisdigit(ls->current); i++) { /* read up to 3 digits */ + c[i] = ls->current; + r = 10*r + c[i] - '0'; + next(ls); + } + if (r > UCHAR_MAX) + escerror(ls, c, i, "decimal escape too large"); + return r; +} + + +static void read_string (LexState *ls, int del, SemInfo *seminfo) { + save_and_next(ls); /* keep delimiter (for error messages) */ + while (ls->current != del) { + switch (ls->current) { + case EOZ: + lexerror(ls, "unfinished string", TK_EOS); + break; /* to avoid warnings */ + case '\n': + case '\r': + lexerror(ls, "unfinished string", TK_STRING); + break; /* to avoid warnings */ + case '\\': { /* escape sequences */ + int c; /* final character to be saved */ + next(ls); /* do not save the `\' */ + switch (ls->current) { + case 'a': c = '\a'; goto read_save; + case 'b': c = '\b'; goto read_save; + case 'f': c = '\f'; goto read_save; + case 'n': c = '\n'; goto read_save; + case 'r': c = '\r'; goto read_save; + case 't': c = '\t'; goto read_save; + case 'v': c = '\v'; goto read_save; + case 'x': c = readhexaesc(ls); goto read_save; + case '\n': case '\r': + inclinenumber(ls); c = '\n'; goto only_save; + case '\\': case '\"': case '\'': + c = ls->current; goto read_save; + case EOZ: goto no_save; /* will raise an error next loop */ + case 'z': { /* zap following span of spaces */ + next(ls); /* skip the 'z' */ + while (lisspace(ls->current)) { + if (currIsNewline(ls)) inclinenumber(ls); + else next(ls); + } + goto no_save; + } + default: { + if (!lisdigit(ls->current)) + escerror(ls, &ls->current, 1, "invalid escape sequence"); + /* digital escape \ddd */ + c = readdecesc(ls); + goto only_save; + } + } + read_save: next(ls); /* read next character */ + only_save: save(ls, c); /* save 'c' */ + no_save: break; + } + default: + save_and_next(ls); + } + } + save_and_next(ls); /* skip delimiter */ + seminfo->ts = luaX_newstring(ls, luaZ_buffer(ls->buff) + 1, + luaZ_bufflen(ls->buff) - 2); +} + + +static int llex (LexState *ls, SemInfo *seminfo) { + luaZ_resetbuffer(ls->buff); + for (;;) { + switch (ls->current) { + case '\n': case '\r': { /* line breaks */ + inclinenumber(ls); + break; + } + case ' ': case '\f': case '\t': case '\v': { /* spaces */ + next(ls); + break; + } + 
case '-': { /* '-' or '--' (comment) */ + next(ls); + if (ls->current != '-') return '-'; + /* else is a comment */ + next(ls); + if (ls->current == '[') { /* long comment? */ + int sep = skip_sep(ls); + luaZ_resetbuffer(ls->buff); /* `skip_sep' may dirty the buffer */ + if (sep >= 0) { + read_long_string(ls, NULL, sep); /* skip long comment */ + luaZ_resetbuffer(ls->buff); /* previous call may dirty the buff. */ + break; + } + } + /* else short comment */ + while (!currIsNewline(ls) && ls->current != EOZ) + next(ls); /* skip until end of line (or end of file) */ + break; + } + case '[': { /* long string or simply '[' */ + int sep = skip_sep(ls); + if (sep >= 0) { + read_long_string(ls, seminfo, sep); + return TK_STRING; + } else if (sep == -1) { + return '['; + } else { + lexerror(ls, "invalid long string delimiter", TK_STRING); + break; + } + } + case '=': { + next(ls); + if (ls->current != '=') return '='; + else { next(ls); return TK_EQ; } + } + case '<': { + next(ls); + if (ls->current != '=') return '<'; + else { next(ls); return TK_LE; } + } + case '>': { + next(ls); + if (ls->current != '=') return '>'; + else { next(ls); return TK_GE; } + } + case '~': { + next(ls); + if (ls->current != '=') return '~'; + else { next(ls); return TK_NE; } + } + case ':': { + next(ls); + if (ls->current != ':') return ':'; + else { next(ls); return TK_DBCOLON; } + } + case '"': case '\'': { /* short literal strings */ + read_string(ls, ls->current, seminfo); + return TK_STRING; + } + case '.': { /* '.', '..', '...', or number */ + save_and_next(ls); + if (check_next(ls, ".")) { + if (check_next(ls, ".")) + return TK_DOTS; /* '...' */ + else return TK_CONCAT; /* '..' */ + } + else if (!lisdigit(ls->current)) return '.'; + /* else go through */ + } + /* FALLTHROUGH */ + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': { + read_numeral(ls, seminfo); + return TK_NUMBER; + } + case EOZ: { + return TK_EOS; + } + default: { + if (lislalpha(ls->current)) { /* identifier or reserved word? */ + TString *ts; + do { + save_and_next(ls); + } while (lislalnum(ls->current)); + ts = luaX_newstring(ls, luaZ_buffer(ls->buff), + luaZ_bufflen(ls->buff)); + seminfo->ts = ts; + if (isreserved(ts)) /* reserved word? */ + return ts->tsv.extra - 1 + FIRST_RESERVED; + else { + return TK_NAME; + } + } + else { /* single-char tokens (+ - / ...) */ + int c = ls->current; + next(ls); + return c; + } + } + } + } +} + + +void luaX_next (LexState *ls) { + ls->lastline = ls->linenumber; + if (ls->lookahead.token != TK_EOS) { /* is there a look-ahead token? 
*/ + ls->t = ls->lookahead; /* use this one */ + ls->lookahead.token = TK_EOS; /* and discharge it */ + } + else + ls->t.token = llex(ls, &ls->t.seminfo); /* read next token */ +} + + +int luaX_lookahead (LexState *ls) { + lua_assert(ls->lookahead.token == TK_EOS); + ls->lookahead.token = llex(ls, &ls->lookahead.seminfo); + return ls->lookahead.token; +} +/* END CSTYLED */ diff --git a/module/lua/llex.h b/module/lua/llex.h new file mode 100644 index 000000000000..da58203e8dc8 --- /dev/null +++ b/module/lua/llex.h @@ -0,0 +1,83 @@ +/* BEGIN CSTYLED */ +/* +** $Id: llex.h,v 1.72.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lexical Analyzer +** See Copyright Notice in lua.h +*/ + +#ifndef llex_h +#define llex_h + +#include "lobject.h" +#include "lzio.h" + + +#define FIRST_RESERVED 257 + + + +/* +* WARNING: if you change the order of this enumeration, +* grep "ORDER RESERVED" +*/ +enum RESERVED { + /* terminal symbols denoted by reserved words */ + TK_AND = FIRST_RESERVED, TK_BREAK, + TK_DO, TK_ELSE, TK_ELSEIF, TK_END, TK_FALSE, TK_FOR, TK_FUNCTION, + TK_GOTO, TK_IF, TK_IN, TK_LOCAL, TK_NIL, TK_NOT, TK_OR, TK_REPEAT, + TK_RETURN, TK_THEN, TK_TRUE, TK_UNTIL, TK_WHILE, + /* other terminal symbols */ + TK_CONCAT, TK_DOTS, TK_EQ, TK_GE, TK_LE, TK_NE, TK_DBCOLON, TK_EOS, + TK_NUMBER, TK_NAME, TK_STRING +}; + +/* number of reserved words */ +#define NUM_RESERVED (cast(int, TK_WHILE-FIRST_RESERVED+1)) + + +typedef union { + lua_Number r; + TString *ts; +} SemInfo; /* semantics information */ + + +typedef struct Token { + int token; + SemInfo seminfo; +} Token; + +#ifdef current +#undef current +#endif + +/* state of the lexer plus state of the parser when shared by all + functions */ +typedef struct LexState { + int current; /* current character (charint) */ + int linenumber; /* input line counter */ + int lastline; /* line of last token `consumed' */ + Token t; /* current token */ + Token lookahead; /* look ahead token */ + struct FuncState *fs; /* current function (parser) */ + struct lua_State *L; + ZIO *z; /* input stream */ + Mbuffer *buff; /* buffer for tokens */ + struct Dyndata *dyd; /* dynamic structures used by the parser */ + TString *source; /* current source name */ + TString *envn; /* environment variable name */ + char decpoint; /* locale decimal point */ +} LexState; + + +LUAI_FUNC void luaX_init (lua_State *L); +LUAI_FUNC void luaX_setinput (lua_State *L, LexState *ls, ZIO *z, + TString *source, int firstchar); +LUAI_FUNC TString *luaX_newstring (LexState *ls, const char *str, size_t l); +LUAI_FUNC void luaX_next (LexState *ls); +LUAI_FUNC int luaX_lookahead (LexState *ls); +LUAI_FUNC l_noret luaX_syntaxerror (LexState *ls, const char *s); +LUAI_FUNC const char *luaX_token2str (LexState *ls, int token); + + +#endif +/* END CSTYLED */ diff --git a/module/lua/llimits.h b/module/lua/llimits.h new file mode 100644 index 000000000000..25466f14edca --- /dev/null +++ b/module/lua/llimits.h @@ -0,0 +1,323 @@ +/* BEGIN CSTYLED */ +/* +** $Id: llimits.h,v 1.103.1.1 2013/04/12 18:48:47 roberto Exp $ +** Limits, basic types, and some other `installation-dependent' definitions +** See Copyright Notice in lua.h +*/ + +#ifndef llimits_h +#define llimits_h + + +#include + + +typedef unsigned LUA_INT32 lu_int32; + +typedef LUAI_UMEM lu_mem; + +typedef LUAI_MEM l_mem; + + + +/* chars used as small naturals (so that `char' is reserved for characters) */ +typedef unsigned char lu_byte; + + +#define MAX_SIZET ((size_t)(~(size_t)0)-2) + +#define MAX_LUMEM ((lu_mem)(~(lu_mem)0)-2) + +#define MAX_LMEM ((l_mem) 
((MAX_LUMEM >> 1) - 2)) + + +#define MAX_INT (INT_MAX-2) /* maximum value of an int (-2 for safety) */ + +/* +** conversion of pointer to integer +** this is for hashing only; there is no problem if the integer +** cannot hold the whole pointer value +*/ +#define IntPoint(p) ((unsigned int)(lu_mem)(p)) + + + +/* type to ensure maximum alignment */ +#if !defined(LUAI_USER_ALIGNMENT_T) +#define LUAI_USER_ALIGNMENT_T union { double u; void *s; long l; } +#endif + +typedef LUAI_USER_ALIGNMENT_T L_Umaxalign; + + +/* result of a `usual argument conversion' over lua_Number */ +typedef LUAI_UACNUMBER l_uacNumber; + + +/* internal assertions for in-house debugging */ +#if defined(lua_assert) +#define check_exp(c,e) (lua_assert(c), (e)) +/* to avoid problems with conditions too long */ +#define lua_longassert(c) { if (!(c)) lua_assert(0); } +#else +#define lua_assert(c) ((void)0) +#define check_exp(c,e) (e) +#define lua_longassert(c) ((void)0) +#endif + +/* +** assertion for checking API calls +*/ +#if !defined(luai_apicheck) + +#if defined(LUA_USE_APICHECK) +#include +#define luai_apicheck(L,e) assert(e) +#else +#define luai_apicheck(L,e) lua_assert(e) +#endif + +#endif + +#define api_check(l,e,msg) luai_apicheck(l,(e) && msg) + + +#if !defined(UNUSED) +#define UNUSED(x) ((void)(x)) /* to avoid warnings */ +#endif + + +#define cast(t, exp) ((t)(exp)) + +#define cast_byte(i) cast(lu_byte, (i)) +#define cast_num(i) cast(lua_Number, (i)) +#define cast_int(i) cast(int, (i)) +#define cast_uchar(i) cast(unsigned char, (i)) + + +/* +** non-return type +** +** Suppress noreturn attribute in kernel builds to avoid objtool check warnings +*/ +#if defined(__GNUC__) && !defined(_KERNEL) +#define l_noret void __attribute__((noreturn)) +#elif defined(_MSC_VER) +#define l_noret void __declspec(noreturn) +#else +#define l_noret void +#endif + + + +/* +** maximum depth for nested C calls and syntactical nested non-terminals +** in a program. (Value must fit in an unsigned short int.) +** +** Note: On amd64 platform, the limit has been measured to be 45. We set +** the maximum lower to give a margin for changing the amount of stack +** used by various functions involved in parsing and executing code. +*/ +#if !defined(LUAI_MAXCCALLS) +#define LUAI_MAXCCALLS 20 +#endif + +/* + * Minimum amount of available stack space (in bytes) to make a C call. With + * gsub() recursion, the stack space between each luaD_call() is 1256 bytes. + */ +#if defined(__FreeBSD__) +/* + * FreeBSD needs a few extra bytes in unoptimized debug builds to avoid a + * double-fault handling the error when the max call depth is exceeded just + * before the C stack runs out. 64 bytes seems to do the trick. + */ +#define LUAI_MINCSTACK 4160 +#else +#define LUAI_MINCSTACK 4096 +#endif + +/* +** maximum number of upvalues in a closure (both C and Lua). (Value +** must fit in an unsigned char.) 
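+** MAXUPVAL below is therefore defined as UCHAR_MAX, which the C
+** standard guarantees to be at least 255.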
+*/ +#define MAXUPVAL UCHAR_MAX + + +/* +** type for virtual-machine instructions +** must be an unsigned with (at least) 4 bytes (see details in lopcodes.h) +*/ +typedef lu_int32 Instruction; + + + +/* maximum stack for a Lua function */ +#define MAXSTACK 250 + + + +/* minimum size for the string table (must be power of 2) */ +#if !defined(MINSTRTABSIZE) +#define MINSTRTABSIZE 32 +#endif + + +/* minimum size for string buffer */ +#if !defined(LUA_MINBUFFER) +#define LUA_MINBUFFER 32 +#endif + + +#if !defined(lua_lock) +#define lua_lock(L) ((void) 0) +#define lua_unlock(L) ((void) 0) +#endif + +#if !defined(luai_threadyield) +#define luai_threadyield(L) {lua_unlock(L); lua_lock(L);} +#endif + + +/* +** these macros allow user-specific actions on threads when you defined +** LUAI_EXTRASPACE and need to do something extra when a thread is +** created/deleted/resumed/yielded. +*/ +#if !defined(luai_userstateopen) +#define luai_userstateopen(L) ((void)L) +#endif + +#if !defined(luai_userstateclose) +#define luai_userstateclose(L) ((void)L) +#endif + +#if !defined(luai_userstatethread) +#define luai_userstatethread(L,L1) ((void)L) +#endif + +#if !defined(luai_userstatefree) +#define luai_userstatefree(L,L1) ((void)L) +#endif + +#if !defined(luai_userstateresume) +#define luai_userstateresume(L,n) ((void)L) +#endif + +#if !defined(luai_userstateyield) +#define luai_userstateyield(L,n) ((void)L) +#endif + +/* +** lua_number2int is a macro to convert lua_Number to int. +** lua_number2integer is a macro to convert lua_Number to lua_Integer. +** lua_number2unsigned is a macro to convert a lua_Number to a lua_Unsigned. +** lua_unsigned2number is a macro to convert a lua_Unsigned to a lua_Number. +** luai_hashnum is a macro to hash a lua_Number value into an integer. +** The hash must be deterministic and give reasonable values for +** both small and large values (outside the range of integers). 
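+**
+** As a rough sketch of the portable fallbacks defined further below:
+** lua_number2int(i, 3.75) reduces to (i)=(int)(3.75), leaving 3 in 'i',
+** while the fallback lua_number2unsigned() first subtracts
+** floor((n)/SUPUNSIGNED)*SUPUNSIGNED to keep modulo-2^n behavior for
+** out-of-range doubles.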
+*/ + +#if defined(MS_ASMTRICK) || defined(LUA_MSASMTRICK) /* { */ +/* trick with Microsoft assembler for X86 */ + +#define lua_number2int(i,n) __asm {__asm fld n __asm fistp i} +#define lua_number2integer(i,n) lua_number2int(i, n) +#define lua_number2unsigned(i,n) \ + {__int64 l; __asm {__asm fld n __asm fistp l} i = (unsigned int)l;} + + +#elif defined(LUA_IEEE754TRICK) /* }{ */ +/* the next trick should work on any machine using IEEE754 with + a 32-bit int type */ + +union luai_Cast { double l_d; LUA_INT32 l_p[2]; }; + +#if !defined(LUA_IEEEENDIAN) /* { */ +#define LUAI_EXTRAIEEE \ + static const union luai_Cast ieeeendian = {-(33.0 + 6755399441055744.0)}; +#define LUA_IEEEENDIANLOC (ieeeendian.l_p[1] == 33) +#else +#define LUA_IEEEENDIANLOC LUA_IEEEENDIAN +#define LUAI_EXTRAIEEE /* empty */ +#endif /* } */ + +#define lua_number2int32(i,n,t) \ + { LUAI_EXTRAIEEE \ + volatile union luai_Cast u; u.l_d = (n) + 6755399441055744.0; \ + (i) = (t)u.l_p[LUA_IEEEENDIANLOC]; } + +#define luai_hashnum(i,n) \ + { volatile union luai_Cast u; u.l_d = (n) + 1.0; /* avoid -0 */ \ + (i) = u.l_p[0]; (i) += u.l_p[1]; } /* add double bits for his hash */ + +#define lua_number2int(i,n) lua_number2int32(i, n, int) +#define lua_number2unsigned(i,n) lua_number2int32(i, n, lua_Unsigned) + +/* the trick can be expanded to lua_Integer when it is a 32-bit value */ +#if defined(LUA_IEEELL) +#define lua_number2integer(i,n) lua_number2int32(i, n, lua_Integer) +#endif + +#endif /* } */ + + +/* the following definitions always work, but may be slow */ + +#if !defined(lua_number2int) +#define lua_number2int(i,n) ((i)=(int)(n)) +#endif + +#if !defined(lua_number2integer) +#define lua_number2integer(i,n) ((i)=(lua_Integer)(n)) +#endif + +#if !defined(lua_number2unsigned) /* { */ +/* the following definition assures proper modulo behavior */ +#if defined(LUA_NUMBER_DOUBLE) || defined(LUA_NUMBER_FLOAT) +#include +#define SUPUNSIGNED ((lua_Number)(~(lua_Unsigned)0) + 1) +#define lua_number2unsigned(i,n) \ + ((i)=(lua_Unsigned)((n) - floor((n)/SUPUNSIGNED)*SUPUNSIGNED)) +#else +#define lua_number2unsigned(i,n) ((i)=(lua_Unsigned)(n)) +#endif +#endif /* } */ + + +#if !defined(lua_unsigned2number) +/* on several machines, coercion from unsigned to double is slow, + so it may be worth to avoid */ +#define lua_unsigned2number(u) \ + (((u) <= (lua_Unsigned)INT_MAX) ? 
(lua_Number)(int)(u) : (lua_Number)(u)) +#endif + + + +#if defined(ltable_c) && !defined(luai_hashnum) + +#define luai_hashnum(i,n) (i = lcompat_hashnum(n)) + +#endif + + + +/* +** macro to control inclusion of some hard tests on stack reallocation +*/ +#if !defined(HARDSTACKTESTS) +#define condmovestack(L) ((void)0) +#else +/* realloc stack keeping its size */ +#define condmovestack(L) luaD_reallocstack((L), (L)->stacksize) +#endif + +#if !defined(HARDMEMTESTS) +#define condchangemem(L) condmovestack(L) +#else +#define condchangemem(L) \ + ((void)(!(G(L)->gcrunning) || (luaC_fullgc(L, 0), 1))) +#endif + +#endif +/* END CSTYLED */ diff --git a/module/lua/lmem.c b/module/lua/lmem.c new file mode 100644 index 000000000000..18bb2514cb01 --- /dev/null +++ b/module/lua/lmem.c @@ -0,0 +1,98 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lmem.c,v 1.84.1.1 2013/04/12 18:48:47 roberto Exp $ +** Interface to Memory Manager +** See Copyright Notice in lua.h +*/ + + +#define lmem_c +#define LUA_CORE + +#include + +#include "ldebug.h" +#include "ldo.h" +#include "lgc.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" + + + +/* +** About the realloc function: +** void * frealloc (void *ud, void *ptr, size_t osize, size_t nsize); +** (`osize' is the old size, `nsize' is the new size) +** +** * frealloc(ud, NULL, x, s) creates a new block of size `s' (no +** matter 'x'). +** +** * frealloc(ud, p, x, 0) frees the block `p' +** (in this specific case, frealloc must return NULL); +** particularly, frealloc(ud, NULL, 0, 0) does nothing +** (which is equivalent to free(NULL) in ANSI C) +** +** frealloc returns NULL if it cannot create or reallocate the area +** (any reallocation to an equal or smaller size cannot fail!) +*/ + + + +#define MINSIZEARRAY 4 + + +void *luaM_growaux_ (lua_State *L, void *block, int *size, size_t size_elems, + int limit, const char *what) { + void *newblock; + int newsize; + if (*size >= limit/2) { /* cannot double it? */ + if (*size >= limit) /* cannot grow even a little? */ + luaG_runerror(L, "too many %s (limit is %d)", what, limit); + newsize = limit; /* still have at least one free place */ + } + else { + newsize = (*size)*2; + if (newsize < MINSIZEARRAY) + newsize = MINSIZEARRAY; /* minimum size */ + } + newblock = luaM_reallocv(L, block, *size, newsize, size_elems); + *size = newsize; /* update only when everything else is OK */ + return newblock; +} + + +l_noret luaM_toobig (lua_State *L) { + luaG_runerror(L, "memory allocation error: block too big"); +} + + + +/* +** generic allocation routine. +*/ +void *luaM_realloc_ (lua_State *L, void *block, size_t osize, size_t nsize) { + void *newblock; + global_State *g = G(L); + size_t realosize = (block) ? osize : 0; + lua_assert((realosize == 0) == (block == NULL)); +#if defined(HARDMEMTESTS) + if (nsize > realosize && g->gcrunning) + luaC_fullgc(L, 1); /* force a GC whenever possible */ +#endif + newblock = (*g->frealloc)(g->ud, block, osize, nsize); + if (newblock == NULL && nsize > 0) { + api_check(L, nsize > realosize, + "realloc cannot fail when shrinking a block"); + if (g->gcrunning) { + luaC_fullgc(L, 1); /* try to free some memory... 
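(an emergency collection; note the nonzero 'isemergency' argument)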
*/ + newblock = (*g->frealloc)(g->ud, block, osize, nsize); /* try again */ + } + if (newblock == NULL) + luaD_throw(L, LUA_ERRMEM); + } + lua_assert((nsize == 0) == (newblock == NULL)); + g->GCdebt = (g->GCdebt + nsize) - realosize; + return newblock; +} +/* END CSTYLED */ diff --git a/module/lua/lmem.h b/module/lua/lmem.h new file mode 100644 index 000000000000..22c04c98c863 --- /dev/null +++ b/module/lua/lmem.h @@ -0,0 +1,56 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lmem.h,v 1.40.1.1 2013/04/12 18:48:47 roberto Exp $ +** Interface to Memory Manager +** See Copyright Notice in lua.h +*/ + +#ifndef lmem_h +#define lmem_h + + +#include "llimits.h" +#include + + +/* +** This macro avoids the runtime division MAX_SIZET/(e), as 'e' is +** always constant. +** The macro is somewhat complex to avoid warnings: +** +1 avoids warnings of "comparison has constant result"; +** cast to 'void' avoids warnings of "value unused". +*/ +#define luaM_reallocv(L,b,on,n,e) \ + (cast(void, \ + (cast(size_t, (n)+1) > MAX_SIZET/(e)) ? (luaM_toobig(L), 0) : 0), \ + luaM_realloc_(L, (b), (on)*(e), (n)*(e))) + +#define luaM_freemem(L, b, s) luaM_realloc_(L, (b), (s), 0) +#define luaM_free(L, b) luaM_realloc_(L, (b), sizeof(*(b)), 0) +#define luaM_freearray(L, b, n) luaM_reallocv(L, (b), n, 0, sizeof((b)[0])) + +#define luaM_malloc(L,s) luaM_realloc_(L, NULL, 0, (s)) +#define luaM_new(L,t) cast(t *, luaM_malloc(L, sizeof(t))) +#define luaM_newvector(L,n,t) \ + cast(t *, luaM_reallocv(L, NULL, 0, n, sizeof(t))) + +#define luaM_newobject(L,tag,s) luaM_realloc_(L, NULL, tag, (s)) + +#define luaM_growvector(L,v,nelems,size,t,limit,e) \ + if ((nelems)+1 > (size)) \ + ((v)=cast(t *, luaM_growaux_(L,v,&(size),sizeof(t),limit,e))) + +#define luaM_reallocvector(L, v,oldn,n,t) \ + ((v)=cast(t *, luaM_reallocv(L, v, oldn, n, sizeof(t)))) + +LUAI_FUNC l_noret luaM_toobig (lua_State *L); + +/* not to be called directly */ +LUAI_FUNC void *luaM_realloc_ (lua_State *L, void *block, size_t oldsize, + size_t size); +LUAI_FUNC void *luaM_growaux_ (lua_State *L, void *block, int *size, + size_t size_elem, int limit, + const char *what); + +#endif +/* END CSTYLED */ diff --git a/module/lua/lobject.c b/module/lua/lobject.c new file mode 100644 index 000000000000..024d3199fe24 --- /dev/null +++ b/module/lua/lobject.c @@ -0,0 +1,282 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lobject.c,v 2.58.1.1 2013/04/12 18:48:47 roberto Exp $ +** Some generic functions over Lua objects +** See Copyright Notice in lua.h +*/ + +#define lobject_c +#define LUA_CORE + +#include + +#include "lctype.h" +#include "ldebug.h" +#include "ldo.h" +#include "lmem.h" +#include "lobject.h" +#include "lstate.h" +#include "lstring.h" +#include "lvm.h" + + + +LUAI_DDEF const TValue luaO_nilobject_ = {NILCONSTANT}; + + +/* +** converts an integer to a "floating point byte", represented as +** (eeeeexxx), where the real value is (1xxx) * 2^(eeeee - 1) if +** eeeee != 0 and (xxx) otherwise. 
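+**
+** A worked example: luaO_int2fb(1000) halves x (rounding up) until it
+** drops below 16, ending with x == 8 and e == 7, so it returns
+** ((7+1)<<3) | (8-8) == 64; luaO_fb2int(64) maps back to 1024. The
+** round trip only ever rounds sizes up.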
+*/ +int luaO_int2fb (unsigned int x) { + int e = 0; /* exponent */ + if (x < 8) return x; + while (x >= 0x10) { + x = (x+1) >> 1; + e++; + } + return ((e+1) << 3) | (cast_int(x) - 8); +} + + +/* converts back */ +int luaO_fb2int (int x) { + int e = (x >> 3) & 0x1f; + if (e == 0) return x; + else return ((x & 7) + 8) << (e - 1); +} + + +int luaO_ceillog2 (unsigned int x) { + static const lu_byte log_2[256] = { + 0,1,2,2,3,3,3,3,4,4,4,4,4,4,4,4,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5, + 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8, + 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8 + }; + int l = 0; + x--; + while (x >= 256) { l += 8; x >>= 8; } + return l + log_2[x]; +} + + +lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2) { + switch (op) { + case LUA_OPADD: return luai_numadd(NULL, v1, v2); + case LUA_OPSUB: return luai_numsub(NULL, v1, v2); + case LUA_OPMUL: return luai_nummul(NULL, v1, v2); + case LUA_OPDIV: return luai_numdiv(NULL, v1, v2); + case LUA_OPMOD: return luai_nummod(NULL, v1, v2); + case LUA_OPPOW: return luai_numpow(NULL, v1, v2); + case LUA_OPUNM: return luai_numunm(NULL, v1); + default: lua_assert(0); return 0; + } +} + + +int luaO_hexavalue (int c) { + if (lisdigit(c)) return c - '0'; + else return ltolower(c) - 'a' + 10; +} + + +#if !defined(lua_strx2number) + + + +static int isneg (const char **s) { + if (**s == '-') { (*s)++; return 1; } + else if (**s == '+') (*s)++; + return 0; +} + + +static lua_Number readhexa (const char **s, lua_Number r, int *count) { + for (; lisxdigit(cast_uchar(**s)); (*s)++) { /* read integer part */ + r = (r * cast_num(16.0)) + cast_num(luaO_hexavalue(cast_uchar(**s))); + (*count)++; + } + return r; +} + + +/* +** convert an hexadecimal numeric string to a number, following +** C99 specification for 'strtod' +*/ +static lua_Number lua_strx2number (const char *s, char **endptr) { + lua_Number r = 0.0; + int e = 0, i = 0; + int neg = 0; /* 1 if number is negative */ + *endptr = cast(char *, s); /* nothing is valid yet */ + while (lisspace(cast_uchar(*s))) s++; /* skip initial spaces */ + neg = isneg(&s); /* check signal */ + if (!(*s == '0' && (*(s + 1) == 'x' || *(s + 1) == 'X'))) /* check '0x' */ + return 0.0; /* invalid format (no '0x') */ + s += 2; /* skip '0x' */ + r = readhexa(&s, r, &i); /* read integer part */ + if (*s == '.') { + s++; /* skip dot */ + r = readhexa(&s, r, &e); /* read fractional part */ + } + if (i == 0 && e == 0) + return 0.0; /* invalid format (no digit) */ + e *= -4; /* each fractional digit divides value by 2^-4 */ + *endptr = cast(char *, s); /* valid up to here */ + if (*s == 'p' || *s == 'P') { /* exponent part? 
*/ + int exp1 = 0; + int neg1; + s++; /* skip 'p' */ + neg1 = isneg(&s); /* signal */ + if (!lisdigit(cast_uchar(*s))) + goto ret; /* must have at least one digit */ + while (lisdigit(cast_uchar(*s))) /* read exponent */ + exp1 = exp1 * 10 + *(s++) - '0'; + if (neg1) exp1 = -exp1; + e += exp1; + } + *endptr = cast(char *, s); /* valid up to here */ + ret: + if (neg) r = -r; + return (r * (1 << e)); +} + +#endif + + +int luaO_str2d (const char *s, size_t len, lua_Number *result) { + char *endptr; + if (strpbrk(s, "nN")) /* reject 'inf' and 'nan' */ + return 0; + else if (strpbrk(s, "xX")) /* hexa? */ + *result = lua_strx2number(s, &endptr); + else + *result = lua_str2number(s, &endptr); + if (endptr == s) return 0; /* nothing recognized */ + while (lisspace(cast_uchar(*endptr))) endptr++; + return (endptr == s + len); /* OK if no trailing characters */ +} + + + +static void pushstr (lua_State *L, const char *str, size_t l) { + setsvalue2s(L, L->top++, luaS_newlstr(L, str, l)); +} + + +/* this function handles only `%d', `%c', %f, %p, and `%s' formats */ +const char *luaO_pushvfstring (lua_State *L, const char *fmt, va_list argp) { + int n = 0; + for (;;) { + const char *e = strchr(fmt, '%'); + if (e == NULL) break; + luaD_checkstack(L, 2); /* fmt + item */ + pushstr(L, fmt, e - fmt); + switch (*(e+1)) { + case 's': { + const char *s = va_arg(argp, char *); + if (s == NULL) s = "(null)"; + pushstr(L, s, strlen(s)); + break; + } + case 'c': { + char buff; + buff = cast(char, va_arg(argp, int)); + pushstr(L, &buff, 1); + break; + } + case 'd': { + setnvalue(L->top++, cast_num(va_arg(argp, int))); + break; + } + case 'f': { + setnvalue(L->top++, cast_num(va_arg(argp, l_uacNumber))); + break; + } + case 'p': { + char buff[4*sizeof(void *) + 8]; /* should be enough space for a `%p' */ + int l = lcompat_sprintf(buff, sizeof(buff), "%p", va_arg(argp, void *)); + pushstr(L, buff, l); + break; + } + case '%': { + pushstr(L, "%", 1); + break; + } + default: { + luaG_runerror(L, + "invalid option " LUA_QL("%%%c") " to " LUA_QL("lua_pushfstring"), + *(e + 1)); + } + } + n += 2; + fmt = e+2; + } + luaD_checkstack(L, 1); + pushstr(L, fmt, strlen(fmt)); + if (n > 0) luaV_concat(L, n + 1); + return svalue(L->top - 1); +} + + +const char *luaO_pushfstring (lua_State *L, const char *fmt, ...) { + const char *msg; + va_list argp; + va_start(argp, fmt); + msg = luaO_pushvfstring(L, fmt, argp); + va_end(argp); + return msg; +} + + +/* number of chars of a literal string without the ending \0 */ +#define LL(x) (sizeof(x)/sizeof(char) - 1) + +#define RETS "..." +#define PRE "[string \"" +#define POS "\"]" + +#define addstr(a,b,l) ( memcpy(a,b,(l) * sizeof(char)), a += (l) ) + +void luaO_chunkid (char *out, const char *source, size_t bufflen) { + size_t l = strlen(source); + if (*source == '=') { /* 'literal' source */ + if (l <= bufflen) /* small enough? */ + memcpy(out, source + 1, l * sizeof(char)); + else { /* truncate it */ + addstr(out, source + 1, bufflen - 1); + *out = '\0'; + } + } + else if (*source == '@') { /* file name */ + if (l <= bufflen) /* small enough? */ + memcpy(out, source + 1, l * sizeof(char)); + else { /* add '...' 
before rest of name */ + addstr(out, RETS, LL(RETS)); + bufflen -= LL(RETS); + memcpy(out, source + 1 + l - bufflen, bufflen * sizeof(char)); + } + } + else { /* string; format as [string "source"] */ + const char *nl = strchr(source, '\n'); /* find first new line (if any) */ + addstr(out, PRE, LL(PRE)); /* add prefix */ + bufflen -= LL(PRE RETS POS) + 1; /* save space for prefix+suffix+'\0' */ + if (l < bufflen && nl == NULL) { /* small one-line source? */ + addstr(out, source, l); /* keep it */ + } + else { + if (nl != NULL) l = nl - source; /* stop at first newline */ + if (l > bufflen) l = bufflen; + addstr(out, source, l); + addstr(out, RETS, LL(RETS)); + } + memcpy(out, POS, (LL(POS) + 1) * sizeof(char)); + } +} +/* END CSTYLED */ diff --git a/module/lua/lobject.h b/module/lua/lobject.h new file mode 100644 index 000000000000..a16b8d62eb4b --- /dev/null +++ b/module/lua/lobject.h @@ -0,0 +1,605 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lobject.h,v 2.71.1.2 2014/05/07 14:14:58 roberto Exp $ +** Type definitions for Lua objects +** See Copyright Notice in lua.h +*/ + + +#ifndef lobject_h +#define lobject_h + + +#include "llimits.h" +#include + + +/* +** Extra tags for non-values +*/ +#define LUA_TPROTO LUA_NUMTAGS +#define LUA_TUPVAL (LUA_NUMTAGS+1) +#define LUA_TDEADKEY (LUA_NUMTAGS+2) + +/* +** number of all possible tags (including LUA_TNONE but excluding DEADKEY) +*/ +#define LUA_TOTALTAGS (LUA_TUPVAL+2) + + +/* +** tags for Tagged Values have the following use of bits: +** bits 0-3: actual tag (a LUA_T* value) +** bits 4-5: variant bits +** bit 6: whether value is collectable +*/ + +#define VARBITS (3 << 4) + + +/* +** LUA_TFUNCTION variants: +** 0 - Lua function +** 1 - light C function +** 2 - regular C function (closure) +*/ + +/* Variant tags for functions */ +#define LUA_TLCL (LUA_TFUNCTION | (0 << 4)) /* Lua closure */ +#define LUA_TLCF (LUA_TFUNCTION | (1 << 4)) /* light C function */ +#define LUA_TCCL (LUA_TFUNCTION | (2 << 4)) /* C closure */ + + +/* Variant tags for strings */ +#define LUA_TSHRSTR (LUA_TSTRING | (0 << 4)) /* short strings */ +#define LUA_TLNGSTR (LUA_TSTRING | (1 << 4)) /* long strings */ + + +/* Bit mark for collectable types */ +#define BIT_ISCOLLECTABLE (1 << 6) + +/* mark a tag as collectable */ +#define ctb(t) ((t) | BIT_ISCOLLECTABLE) + + +/* +** Union of all collectable objects +*/ +typedef union GCObject GCObject; + + +/* +** Common Header for all collectable objects (in macro form, to be +** included in other objects) +*/ +#define CommonHeader GCObject *next; lu_byte tt; lu_byte marked + + +/* +** Common header in struct form +*/ +typedef struct GCheader { + CommonHeader; +} GCheader; + + + +/* +** Union of all Lua values +*/ +typedef union Value Value; + + +#define numfield lua_Number n; /* numbers */ + + + +/* +** Tagged Values. This is the basic representation of values in Lua, +** an actual value plus a tag with its type. 
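+** For instance, a table value keeps its GCObject pointer in 'value_'
+** and the tag ctb(LUA_TTABLE) in 'tt_' (LUA_TTABLE plus
+** BIT_ISCOLLECTABLE), which is exactly what the ttistable() macro
+** tests for.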
+*/ + +#define TValuefields Value value_; int tt_ + +typedef struct lua_TValue TValue; + + +/* macro defining a nil value */ +#define NILCONSTANT {NULL}, LUA_TNIL + + +#define val_(o) ((o)->value_) +#define num_(o) (val_(o).n) + + +/* raw type tag of a TValue */ +#define rttype(o) ((o)->tt_) + +/* tag with no variants (bits 0-3) */ +#define novariant(x) ((x) & 0x0F) + +/* type tag of a TValue (bits 0-3 for tags + variant bits 4-5) */ +#define ttype(o) (rttype(o) & 0x3F) + +/* type tag of a TValue with no variants (bits 0-3) */ +#define ttypenv(o) (novariant(rttype(o))) + + +/* Macros to test type */ +#define checktag(o,t) (rttype(o) == (t)) +#define checktype(o,t) (ttypenv(o) == (t)) +#define ttisnumber(o) checktag((o), LUA_TNUMBER) +#define ttisnil(o) checktag((o), LUA_TNIL) +#define ttisboolean(o) checktag((o), LUA_TBOOLEAN) +#define ttislightuserdata(o) checktag((o), LUA_TLIGHTUSERDATA) +#define ttisstring(o) checktype((o), LUA_TSTRING) +#define ttisshrstring(o) checktag((o), ctb(LUA_TSHRSTR)) +#define ttislngstring(o) checktag((o), ctb(LUA_TLNGSTR)) +#define ttistable(o) checktag((o), ctb(LUA_TTABLE)) +#define ttisfunction(o) checktype(o, LUA_TFUNCTION) +#define ttisclosure(o) ((rttype(o) & 0x1F) == LUA_TFUNCTION) +#define ttisCclosure(o) checktag((o), ctb(LUA_TCCL)) +#define ttisLclosure(o) checktag((o), ctb(LUA_TLCL)) +#define ttislcf(o) checktag((o), LUA_TLCF) +#define ttisuserdata(o) checktag((o), ctb(LUA_TUSERDATA)) +#define ttisthread(o) checktag((o), ctb(LUA_TTHREAD)) +#define ttisdeadkey(o) checktag((o), LUA_TDEADKEY) + +#define ttisequal(o1,o2) (rttype(o1) == rttype(o2)) + +/* Macros to access values */ +#define nvalue(o) check_exp(ttisnumber(o), num_(o)) +#define gcvalue(o) check_exp(iscollectable(o), val_(o).gc) +#define pvalue(o) check_exp(ttislightuserdata(o), val_(o).p) +#define rawtsvalue(o) check_exp(ttisstring(o), &val_(o).gc->ts) +#define tsvalue(o) (&rawtsvalue(o)->tsv) +#define rawuvalue(o) check_exp(ttisuserdata(o), &val_(o).gc->u) +#define uvalue(o) (&rawuvalue(o)->uv) +#define clvalue(o) check_exp(ttisclosure(o), &val_(o).gc->cl) +#define clLvalue(o) check_exp(ttisLclosure(o), &val_(o).gc->cl.l) +#define clCvalue(o) check_exp(ttisCclosure(o), &val_(o).gc->cl.c) +#define fvalue(o) check_exp(ttislcf(o), val_(o).f) +#define hvalue(o) check_exp(ttistable(o), &val_(o).gc->h) +#define bvalue(o) check_exp(ttisboolean(o), val_(o).b) +#define thvalue(o) check_exp(ttisthread(o), &val_(o).gc->th) +/* a dead value may get the 'gc' field, but cannot access its contents */ +#define deadvalue(o) check_exp(ttisdeadkey(o), cast(void *, val_(o).gc)) + +#define l_isfalse(o) (ttisnil(o) || (ttisboolean(o) && bvalue(o) == 0)) + + +#define iscollectable(o) (rttype(o) & BIT_ISCOLLECTABLE) + + +/* Macros for internal tests */ +#define righttt(obj) (ttype(obj) == gcvalue(obj)->gch.tt) + +#define checkliveness(g,obj) \ + lua_longassert(!iscollectable(obj) || \ + (righttt(obj) && !isdead(g,gcvalue(obj)))) + + +/* Macros to set values */ +#define settt_(o,t) ((o)->tt_=(t)) + +#define setnvalue(obj,x) \ + { TValue *io=(obj); num_(io)=(x); settt_(io, LUA_TNUMBER); } + +#define setnilvalue(obj) settt_(obj, LUA_TNIL) + +#define setfvalue(obj,x) \ + { TValue *io=(obj); val_(io).f=(x); settt_(io, LUA_TLCF); } + +#define setpvalue(obj,x) \ + { TValue *io=(obj); val_(io).p=(x); settt_(io, LUA_TLIGHTUSERDATA); } + +#define setbvalue(obj,x) \ + { TValue *io=(obj); val_(io).b=(x); settt_(io, LUA_TBOOLEAN); } + +#define setgcovalue(L,obj,x) \ + { TValue *io=(obj); GCObject *i_g=(x); \ + 
val_(io).gc=i_g; settt_(io, ctb(gch(i_g)->tt)); } + +#define setsvalue(L,obj,x) \ + { TValue *io=(obj); \ + TString *x_ = (x); \ + val_(io).gc=cast(GCObject *, x_); settt_(io, ctb(x_->tsv.tt)); \ + checkliveness(G(L),io); } + +#define setuvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TUSERDATA)); \ + checkliveness(G(L),io); } + +#define setthvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTHREAD)); \ + checkliveness(G(L),io); } + +#define setclLvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TLCL)); \ + checkliveness(G(L),io); } + +#define setclCvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TCCL)); \ + checkliveness(G(L),io); } + +#define sethvalue(L,obj,x) \ + { TValue *io=(obj); \ + val_(io).gc=cast(GCObject *, (x)); settt_(io, ctb(LUA_TTABLE)); \ + checkliveness(G(L),io); } + +#define setdeadvalue(obj) settt_(obj, LUA_TDEADKEY) + + + +#define setobj(L,obj1,obj2) \ + { const TValue *io2=(obj2); TValue *io1=(obj1); \ + io1->value_ = io2->value_; io1->tt_ = io2->tt_; \ + checkliveness(G(L),io1); } + + +/* +** different types of assignments, according to destination +*/ + +/* from stack to (same) stack */ +#define setobjs2s setobj +/* to stack (not from same stack) */ +#define setobj2s setobj +#define setsvalue2s setsvalue +#define sethvalue2s sethvalue +#define setptvalue2s setptvalue +/* from table to same table */ +#define setobjt2t setobj +/* to table */ +#define setobj2t setobj +/* to new object */ +#define setobj2n setobj +#define setsvalue2n setsvalue + + +/* check whether a number is valid (useful only for NaN trick) */ +#define luai_checknum(L,o,c) { /* empty */ } + + +/* +** {====================================================== +** NaN Trick +** ======================================================= +*/ +#if defined(LUA_NANTRICK) + +/* +** numbers are represented in the 'd_' field. All other values have the +** value (NNMARK | tag) in 'tt__'. 
A number with such pattern would be +** a "signaled NaN", which is never generated by regular operations by +** the CPU (nor by 'strtod') +*/ + +/* allows for external implementation for part of the trick */ +#if !defined(NNMARK) /* { */ + + +#if !defined(LUA_IEEEENDIAN) +#error option 'LUA_NANTRICK' needs 'LUA_IEEEENDIAN' +#endif + + +#define NNMARK 0x7FF7A500 +#define NNMASK 0x7FFFFF00 + +#undef TValuefields +#undef NILCONSTANT + +#if (LUA_IEEEENDIAN == 0) /* { */ + +/* little endian */ +#define TValuefields \ + union { struct { Value v__; int tt__; } i; double d__; } u +#define NILCONSTANT {{{NULL}, tag2tt(LUA_TNIL)}} +/* field-access macros */ +#define v_(o) ((o)->u.i.v__) +#define d_(o) ((o)->u.d__) +#define tt_(o) ((o)->u.i.tt__) + +#else /* }{ */ + +/* big endian */ +#define TValuefields \ + union { struct { int tt__; Value v__; } i; double d__; } u +#define NILCONSTANT {{tag2tt(LUA_TNIL), {NULL}}} +/* field-access macros */ +#define v_(o) ((o)->u.i.v__) +#define d_(o) ((o)->u.d__) +#define tt_(o) ((o)->u.i.tt__) + +#endif /* } */ + +#endif /* } */ + + +/* correspondence with standard representation */ +#undef val_ +#define val_(o) v_(o) +#undef num_ +#define num_(o) d_(o) + + +#undef numfield +#define numfield /* no such field; numbers are the entire struct */ + +/* basic check to distinguish numbers from non-numbers */ +#undef ttisnumber +#define ttisnumber(o) ((tt_(o) & NNMASK) != NNMARK) + +#define tag2tt(t) (NNMARK | (t)) + +#undef rttype +#define rttype(o) (ttisnumber(o) ? LUA_TNUMBER : tt_(o) & 0xff) + +#undef settt_ +#define settt_(o,t) (tt_(o) = tag2tt(t)) + +#undef setnvalue +#define setnvalue(obj,x) \ + { TValue *io_=(obj); num_(io_)=(x); lua_assert(ttisnumber(io_)); } + +#undef setobj +#define setobj(L,obj1,obj2) \ + { const TValue *o2_=(obj2); TValue *o1_=(obj1); \ + o1_->u = o2_->u; \ + checkliveness(G(L),o1_); } + + +/* +** these redefinitions are not mandatory, but these forms are more efficient +*/ + +#undef checktag +#undef checktype +#define checktag(o,t) (tt_(o) == tag2tt(t)) +#define checktype(o,t) (ctb(tt_(o) | VARBITS) == ctb(tag2tt(t) | VARBITS)) + +#undef ttisequal +#define ttisequal(o1,o2) \ + (ttisnumber(o1) ? 
ttisnumber(o2) : (tt_(o1) == tt_(o2))) + + +#undef luai_checknum +#define luai_checknum(L,o,c) { if (!ttisnumber(o)) c; } + +#endif +/* }====================================================== */ + + + +/* +** {====================================================== +** types and prototypes +** ======================================================= +*/ + + +union Value { + GCObject *gc; /* collectable objects */ + void *p; /* light userdata */ + int b; /* booleans */ + lua_CFunction f; /* light C functions */ + numfield /* numbers */ +}; + + +struct lua_TValue { + TValuefields; +}; + + +typedef TValue *StkId; /* index to stack elements */ + + + + +/* +** Header for string value; string bytes follow the end of this structure +*/ +typedef union TString { + L_Umaxalign dummy; /* ensures maximum alignment for strings */ + struct { + CommonHeader; + lu_byte extra; /* reserved words for short strings; "has hash" for longs */ + unsigned int hash; + size_t len; /* number of characters in string */ + } tsv; +} TString; + + +/* get the actual string (array of bytes) from a TString */ +#define getstr(ts) cast(const char *, (ts) + 1) + +/* get the actual string (array of bytes) from a Lua value */ +#define svalue(o) getstr(rawtsvalue(o)) + + +/* +** Header for userdata; memory area follows the end of this structure +*/ +typedef union Udata { + L_Umaxalign dummy; /* ensures maximum alignment for `local' udata */ + struct { + CommonHeader; + struct Table *metatable; + struct Table *env; + size_t len; /* number of bytes */ + } uv; +} Udata; + + + +/* +** Description of an upvalue for function prototypes +*/ +typedef struct Upvaldesc { + TString *name; /* upvalue name (for debug information) */ + lu_byte instack; /* whether it is in stack */ + lu_byte idx; /* index of upvalue (in stack or in outer function's list) */ +} Upvaldesc; + + +/* +** Description of a local variable for function prototypes +** (used for debug information) +*/ +typedef struct LocVar { + TString *varname; + int startpc; /* first point where variable is active */ + int endpc; /* first point where variable is dead */ +} LocVar; + + +/* +** Function Prototypes +*/ +typedef struct Proto { + CommonHeader; + TValue *k; /* constants used by the function */ + Instruction *code; + struct Proto **p; /* functions defined inside the function */ + int *lineinfo; /* map from opcodes to source lines (debug information) */ + LocVar *locvars; /* information about local variables (debug information) */ + Upvaldesc *upvalues; /* upvalue information */ + union Closure *cache; /* last created closure with this prototype */ + TString *source; /* used for debug information */ + int sizeupvalues; /* size of 'upvalues' */ + int sizek; /* size of `k' */ + int sizecode; + int sizelineinfo; + int sizep; /* size of `p' */ + int sizelocvars; + int linedefined; + int lastlinedefined; + GCObject *gclist; + lu_byte numparams; /* number of fixed parameters */ + lu_byte is_vararg; + lu_byte maxstacksize; /* maximum stack used by this function */ +} Proto; + + + +/* +** Lua Upvalues +*/ +typedef struct UpVal { + CommonHeader; + TValue *v; /* points to stack or to its own value */ + union { + TValue value; /* the value (when closed) */ + struct { /* double linked list (when open) */ + struct UpVal *prev; + struct UpVal *next; + } l; + } u; +} UpVal; + + +/* +** Closures +*/ + +#define ClosureHeader \ + CommonHeader; lu_byte nupvalues; GCObject *gclist + +typedef struct CClosure { + ClosureHeader; + lua_CFunction f; + TValue upvalue[1]; /* list of upvalues */ +} 
CClosure; + + +typedef struct LClosure { + ClosureHeader; + struct Proto *p; + UpVal *upvals[1]; /* list of upvalues */ +} LClosure; + + +typedef union Closure { + CClosure c; + LClosure l; +} Closure; + + +#define isLfunction(o) ttisLclosure(o) + +#define getproto(o) (clLvalue(o)->p) + + +/* +** Tables +*/ + +typedef union TKey { + struct { + TValuefields; + struct Node *next; /* for chaining */ + } nk; + TValue tvk; +} TKey; + + +typedef struct Node { + TValue i_val; + TKey i_key; +} Node; + + +typedef struct Table { + CommonHeader; + lu_byte flags; /* 1<
<p means tagmethod(p) is not present */
+  lu_byte lsizenode;  /* log2 of size of `node' array */
+  struct Table *metatable;
+  TValue *array;  /* array part */
+  Node *node;
+  Node *lastfree;  /* any free position is before this position */
+  GCObject *gclist;
+  int sizearray;  /* size of `array' array */
+} Table;
+
+
+
+/*
+** `module' operation for hashing (size is always a power of 2)
+*/
+#define lmod(s,size) \
+	(check_exp((size&(size-1))==0, (cast(int, (s) & ((size)-1)))))
+
+
+#define twoto(x)	(1<<(x))
+#define sizenode(t)	(twoto((t)->
lsizenode)) + + +/* +** (address of) a fixed nil value +*/ +#define luaO_nilobject (&luaO_nilobject_) + + +LUAI_DDEC const TValue luaO_nilobject_; + + +LUAI_FUNC int luaO_int2fb (unsigned int x); +LUAI_FUNC int luaO_fb2int (int x); +LUAI_FUNC int luaO_ceillog2 (unsigned int x); +LUAI_FUNC lua_Number luaO_arith (int op, lua_Number v1, lua_Number v2); +LUAI_FUNC int luaO_str2d (const char *s, size_t len, lua_Number *result); +LUAI_FUNC int luaO_hexavalue (int c); +LUAI_FUNC const char *luaO_pushvfstring (lua_State *L, const char *fmt, + va_list argp); +LUAI_FUNC const char *luaO_pushfstring (lua_State *L, const char *fmt, ...); +LUAI_FUNC void luaO_chunkid (char *out, const char *source, size_t len); + + +#endif +/* END CSTYLED */ diff --git a/module/lua/lopcodes.c b/module/lua/lopcodes.c new file mode 100644 index 000000000000..5f34e6d90515 --- /dev/null +++ b/module/lua/lopcodes.c @@ -0,0 +1,108 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lopcodes.c,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $ +** Opcodes for Lua virtual machine +** See Copyright Notice in lua.h +*/ + + +#define lopcodes_c +#define LUA_CORE + + +#include "lopcodes.h" + + +/* ORDER OP */ + +LUAI_DDEF const char *const luaP_opnames[NUM_OPCODES+1] = { + "MOVE", + "LOADK", + "LOADKX", + "LOADBOOL", + "LOADNIL", + "GETUPVAL", + "GETTABUP", + "GETTABLE", + "SETTABUP", + "SETUPVAL", + "SETTABLE", + "NEWTABLE", + "SELF", + "ADD", + "SUB", + "MUL", + "DIV", + "MOD", + "POW", + "UNM", + "NOT", + "LEN", + "CONCAT", + "JMP", + "EQ", + "LT", + "LE", + "TEST", + "TESTSET", + "CALL", + "TAILCALL", + "RETURN", + "FORLOOP", + "FORPREP", + "TFORCALL", + "TFORLOOP", + "SETLIST", + "CLOSURE", + "VARARG", + "EXTRAARG", + NULL +}; + + +#define opmode(t,a,b,c,m) (((t)<<7) | ((a)<<6) | ((b)<<4) | ((c)<<2) | (m)) + +LUAI_DDEF const lu_byte luaP_opmodes[NUM_OPCODES] = { +/* T A B C mode opcode */ + opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_MOVE */ + ,opmode(0, 1, OpArgK, OpArgN, iABx) /* OP_LOADK */ + ,opmode(0, 1, OpArgN, OpArgN, iABx) /* OP_LOADKX */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_LOADBOOL */ + ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_LOADNIL */ + ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_GETUPVAL */ + ,opmode(0, 1, OpArgU, OpArgK, iABC) /* OP_GETTABUP */ + ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_GETTABLE */ + ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABUP */ + ,opmode(0, 0, OpArgU, OpArgN, iABC) /* OP_SETUPVAL */ + ,opmode(0, 0, OpArgK, OpArgK, iABC) /* OP_SETTABLE */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_NEWTABLE */ + ,opmode(0, 1, OpArgR, OpArgK, iABC) /* OP_SELF */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_ADD */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_SUB */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MUL */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_DIV */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_MOD */ + ,opmode(0, 1, OpArgK, OpArgK, iABC) /* OP_POW */ + ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_UNM */ + ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_NOT */ + ,opmode(0, 1, OpArgR, OpArgN, iABC) /* OP_LEN */ + ,opmode(0, 1, OpArgR, OpArgR, iABC) /* OP_CONCAT */ + ,opmode(0, 0, OpArgR, OpArgN, iAsBx) /* OP_JMP */ + ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_EQ */ + ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LT */ + ,opmode(1, 0, OpArgK, OpArgK, iABC) /* OP_LE */ + ,opmode(1, 0, OpArgN, OpArgU, iABC) /* OP_TEST */ + ,opmode(1, 1, OpArgR, OpArgU, iABC) /* OP_TESTSET */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_CALL */ + ,opmode(0, 1, OpArgU, OpArgU, iABC) /* OP_TAILCALL */ + ,opmode(0, 0, OpArgU, OpArgN, iABC) /* 
OP_RETURN */ + ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORLOOP */ + ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_FORPREP */ + ,opmode(0, 0, OpArgN, OpArgU, iABC) /* OP_TFORCALL */ + ,opmode(0, 1, OpArgR, OpArgN, iAsBx) /* OP_TFORLOOP */ + ,opmode(0, 0, OpArgU, OpArgU, iABC) /* OP_SETLIST */ + ,opmode(0, 1, OpArgU, OpArgN, iABx) /* OP_CLOSURE */ + ,opmode(0, 1, OpArgU, OpArgN, iABC) /* OP_VARARG */ + ,opmode(0, 0, OpArgU, OpArgU, iAx) /* OP_EXTRAARG */ +}; +/* END CSTYLED */ diff --git a/module/lua/lopcodes.h b/module/lua/lopcodes.h new file mode 100644 index 000000000000..02eeec1ecd06 --- /dev/null +++ b/module/lua/lopcodes.h @@ -0,0 +1,290 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lopcodes.h,v 1.142.1.2 2014/10/20 18:32:09 roberto Exp $ +** Opcodes for Lua virtual machine +** See Copyright Notice in lua.h +*/ + +#ifndef lopcodes_h +#define lopcodes_h + +#include "llimits.h" + + +/*=========================================================================== + We assume that instructions are unsigned numbers. + All instructions have an opcode in the first 6 bits. + Instructions can have the following fields: + `A' : 8 bits + `B' : 9 bits + `C' : 9 bits + 'Ax' : 26 bits ('A', 'B', and 'C' together) + `Bx' : 18 bits (`B' and `C' together) + `sBx' : signed Bx + + A signed argument is represented in excess K; that is, the number + value is the unsigned value minus K. K is exactly the maximum value + for that argument (so that -max is represented by 0, and +max is + represented by 2*max), which is half the maximum for the corresponding + unsigned argument. +===========================================================================*/ + + +enum OpMode {iABC, iABx, iAsBx, iAx}; /* basic instruction format */ + + +/* +** size and position of opcode arguments. +*/ +#define SIZE_C 9 +#define SIZE_B 9 +#define SIZE_Bx (SIZE_C + SIZE_B) +#define SIZE_A 8 +#define SIZE_Ax (SIZE_C + SIZE_B + SIZE_A) + +#define SIZE_OP 6 + +#define POS_OP 0 +#define POS_A (POS_OP + SIZE_OP) +#define POS_C (POS_A + SIZE_A) +#define POS_B (POS_C + SIZE_C) +#define POS_Bx POS_C +#define POS_Ax POS_A + + +/* +** limits for opcode arguments. +** we use (signed) int to manipulate most arguments, +** so they must fit in LUAI_BITSINT-1 bits (-1 for sign) +*/ +#if SIZE_Bx < LUAI_BITSINT-1 +#define MAXARG_Bx ((1<>1) /* `sBx' is signed */ +#else +#define MAXARG_Bx MAX_INT +#define MAXARG_sBx MAX_INT +#endif + +#if SIZE_Ax < LUAI_BITSINT-1 +#define MAXARG_Ax ((1<>POS_OP) & MASK1(SIZE_OP,0))) +#define SET_OPCODE(i,o) ((i) = (((i)&MASK0(SIZE_OP,POS_OP)) | \ + ((cast(Instruction, o)<>pos) & MASK1(size,0))) +#define setarg(i,v,pos,size) ((i) = (((i)&MASK0(size,pos)) | \ + ((cast(Instruction, v)<= R(A - 1) */ +OP_EQ,/* A B C if ((RK(B) == RK(C)) ~= A) then pc++ */ +OP_LT,/* A B C if ((RK(B) < RK(C)) ~= A) then pc++ */ +OP_LE,/* A B C if ((RK(B) <= RK(C)) ~= A) then pc++ */ + +OP_TEST,/* A C if not (R(A) <=> C) then pc++ */ +OP_TESTSET,/* A B C if (R(B) <=> C) then R(A) := R(B) else pc++ */ + +OP_CALL,/* A B C R(A), ... ,R(A+C-2) := R(A)(R(A+1), ... ,R(A+B-1)) */ +OP_TAILCALL,/* A B C return R(A)(R(A+1), ... ,R(A+B-1)) */ +OP_RETURN,/* A B return R(A), ... 
,R(A+B-2) (see note) */ + +OP_FORLOOP,/* A sBx R(A)+=R(A+2); + if R(A) > 4) & 3)) +#define getCMode(m) (cast(enum OpArgMask, (luaP_opmodes[m] >> 2) & 3)) +#define testAMode(m) (luaP_opmodes[m] & (1 << 6)) +#define testTMode(m) (luaP_opmodes[m] & (1 << 7)) + + +LUAI_DDEC const char *const luaP_opnames[NUM_OPCODES+1]; /* opcode names */ + + +/* number of list items to accumulate before a SETLIST instruction */ +#define LFIELDS_PER_FLUSH 50 + + +#endif +/* END CSTYLED */ diff --git a/module/lua/lparser.c b/module/lua/lparser.c new file mode 100644 index 000000000000..e1dd88f2f654 --- /dev/null +++ b/module/lua/lparser.c @@ -0,0 +1,1643 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lparser.c,v 2.130.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua Parser +** See Copyright Notice in lua.h +*/ + +#define lparser_c +#define LUA_CORE + +#include + +#include "lcode.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "llex.h" +#include "lmem.h" +#include "lobject.h" +#include "lopcodes.h" +#include "lparser.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" + + + +/* maximum number of local variables per function (must be smaller + than 250, due to the bytecode format) */ +#define MAXVARS 200 + + +#define hasmultret(k) ((k) == VCALL || (k) == VVARARG) + + + +/* +** nodes for block list (list of active blocks) +*/ +typedef struct BlockCnt { + struct BlockCnt *previous; /* chain */ + short firstlabel; /* index of first label in this block */ + short firstgoto; /* index of first pending goto in this block */ + lu_byte nactvar; /* # active locals outside the block */ + lu_byte upval; /* true if some variable in the block is an upvalue */ + lu_byte isloop; /* true if `block' is a loop */ +} BlockCnt; + + + +/* +** prototypes for recursive non-terminal functions +*/ +static void statement (LexState *ls); +static void expr (LexState *ls, expdesc *v); + + +static void anchor_token (LexState *ls) { + /* last token from outer function must be EOS */ + lua_assert(ls->fs != NULL || ls->t.token == TK_EOS); + if (ls->t.token == TK_NAME || ls->t.token == TK_STRING) { + TString *ts = ls->t.seminfo.ts; + luaX_newstring(ls, getstr(ts), ts->tsv.len); + } +} + + +/* semantic error */ +static l_noret semerror (LexState *ls, const char *msg) { + ls->t.token = 0; /* remove 'near to' from final message */ + luaX_syntaxerror(ls, msg); +} + + +static l_noret error_expected (LexState *ls, int token) { + luaX_syntaxerror(ls, + luaO_pushfstring(ls->L, "%s expected", luaX_token2str(ls, token))); +} + + +static l_noret errorlimit (FuncState *fs, int limit, const char *what) { + lua_State *L = fs->ls->L; + const char *msg; + int line = fs->f->linedefined; + const char *where = (line == 0) + ? 
"main function" + : luaO_pushfstring(L, "function at line %d", line); + msg = luaO_pushfstring(L, "too many %s (limit is %d) in %s", + what, limit, where); + luaX_syntaxerror(fs->ls, msg); +} + + +static void checklimit (FuncState *fs, int v, int l, const char *what) { + if (v > l) errorlimit(fs, l, what); +} + + +static int testnext (LexState *ls, int c) { + if (ls->t.token == c) { + luaX_next(ls); + return 1; + } + else return 0; +} + + +static void check (LexState *ls, int c) { + if (ls->t.token != c) + error_expected(ls, c); +} + + +static void checknext (LexState *ls, int c) { + check(ls, c); + luaX_next(ls); +} + + +#define check_condition(ls,c,msg) { if (!(c)) luaX_syntaxerror(ls, msg); } + + + +static void check_match (LexState *ls, int what, int who, int where) { + if (!testnext(ls, what)) { + if (where == ls->linenumber) + error_expected(ls, what); + else { + luaX_syntaxerror(ls, luaO_pushfstring(ls->L, + "%s expected (to close %s at line %d)", + luaX_token2str(ls, what), luaX_token2str(ls, who), where)); + } + } +} + + +static TString *str_checkname (LexState *ls) { + TString *ts; + check(ls, TK_NAME); + ts = ls->t.seminfo.ts; + luaX_next(ls); + return ts; +} + + +static void init_exp (expdesc *e, expkind k, int i) { + e->f = e->t = NO_JUMP; + e->k = k; + e->u.info = i; +} + + +static void codestring (LexState *ls, expdesc *e, TString *s) { + init_exp(e, VK, luaK_stringK(ls->fs, s)); +} + + +static void checkname (LexState *ls, expdesc *e) { + codestring(ls, e, str_checkname(ls)); +} + + +static int registerlocalvar (LexState *ls, TString *varname) { + FuncState *fs = ls->fs; + Proto *f = fs->f; + int oldsize = f->sizelocvars; + luaM_growvector(ls->L, f->locvars, fs->nlocvars, f->sizelocvars, + LocVar, SHRT_MAX, "local variables"); + while (oldsize < f->sizelocvars) f->locvars[oldsize++].varname = NULL; + f->locvars[fs->nlocvars].varname = varname; + luaC_objbarrier(ls->L, f, varname); + return fs->nlocvars++; +} + + +static void new_localvar (LexState *ls, TString *name) { + FuncState *fs = ls->fs; + Dyndata *dyd = ls->dyd; + int reg = registerlocalvar(ls, name); + checklimit(fs, dyd->actvar.n + 1 - fs->firstlocal, + MAXVARS, "local variables"); + luaM_growvector(ls->L, dyd->actvar.arr, dyd->actvar.n + 1, + dyd->actvar.size, Vardesc, MAX_INT, "local variables"); + dyd->actvar.arr[dyd->actvar.n++].idx = cast(short, reg); +} + + +static void new_localvarliteral_ (LexState *ls, const char *name, size_t sz) { + new_localvar(ls, luaX_newstring(ls, name, sz)); +} + +#define new_localvarliteral(ls,v) \ + new_localvarliteral_(ls, "" v, (sizeof(v)/sizeof(char))-1) + + +static LocVar *getlocvar (FuncState *fs, int i) { + int idx = fs->ls->dyd->actvar.arr[fs->firstlocal + i].idx; + lua_assert(idx < fs->nlocvars); + return &fs->f->locvars[idx]; +} + + +static void adjustlocalvars (LexState *ls, int nvars) { + FuncState *fs = ls->fs; + fs->nactvar = cast_byte(fs->nactvar + nvars); + for (; nvars; nvars--) { + getlocvar(fs, fs->nactvar - nvars)->startpc = fs->pc; + } +} + + +static void removevars (FuncState *fs, int tolevel) { + fs->ls->dyd->actvar.n -= (fs->nactvar - tolevel); + while (fs->nactvar > tolevel) + getlocvar(fs, --fs->nactvar)->endpc = fs->pc; +} + + +static int searchupvalue (FuncState *fs, TString *name) { + int i; + Upvaldesc *up = fs->f->upvalues; + for (i = 0; i < fs->nups; i++) { + if (luaS_eqstr(up[i].name, name)) return i; + } + return -1; /* not found */ +} + + +static int newupvalue (FuncState *fs, TString *name, expdesc *v) { + Proto *f = fs->f; + int oldsize = 
f->sizeupvalues; + checklimit(fs, fs->nups + 1, MAXUPVAL, "upvalues"); + luaM_growvector(fs->ls->L, f->upvalues, fs->nups, f->sizeupvalues, + Upvaldesc, MAXUPVAL, "upvalues"); + while (oldsize < f->sizeupvalues) f->upvalues[oldsize++].name = NULL; + f->upvalues[fs->nups].instack = (v->k == VLOCAL); + f->upvalues[fs->nups].idx = cast_byte(v->u.info); + f->upvalues[fs->nups].name = name; + luaC_objbarrier(fs->ls->L, f, name); + return fs->nups++; +} + + +static int searchvar (FuncState *fs, TString *n) { + int i; + for (i = cast_int(fs->nactvar) - 1; i >= 0; i--) { + if (luaS_eqstr(n, getlocvar(fs, i)->varname)) + return i; + } + return -1; /* not found */ +} + + +/* + Mark block where variable at given level was defined + (to emit close instructions later). +*/ +static void markupval (FuncState *fs, int level) { + BlockCnt *bl = fs->bl; + while (bl->nactvar > level) bl = bl->previous; + bl->upval = 1; +} + + +/* + Find variable with given name 'n'. If it is an upvalue, add this + upvalue into all intermediate functions. +*/ +static int singlevaraux (FuncState *fs, TString *n, expdesc *var, int base) { + if (fs == NULL) /* no more levels? */ + return VVOID; /* default is global */ + else { + int v = searchvar(fs, n); /* look up locals at current level */ + if (v >= 0) { /* found? */ + init_exp(var, VLOCAL, v); /* variable is local */ + if (!base) + markupval(fs, v); /* local will be used as an upval */ + return VLOCAL; + } + else { /* not found as local at current level; try upvalues */ + int idx = searchupvalue(fs, n); /* try existing upvalues */ + if (idx < 0) { /* not found? */ + if (singlevaraux(fs->prev, n, var, 0) == VVOID) /* try upper levels */ + return VVOID; /* not found; is a global */ + /* else was LOCAL or UPVAL */ + idx = newupvalue(fs, n, var); /* will be a new upvalue */ + } + init_exp(var, VUPVAL, idx); + return VUPVAL; + } + } +} + + +static void singlevar (LexState *ls, expdesc *var) { + TString *varname = str_checkname(ls); + FuncState *fs = ls->fs; + if (singlevaraux(fs, varname, var, 1) == VVOID) { /* global name? */ + expdesc key; + singlevaraux(fs, ls->envn, var, 1); /* get environment variable */ + lua_assert(var->k == VLOCAL || var->k == VUPVAL); + codestring(ls, &key, varname); /* key is variable name */ + luaK_indexed(fs, var, &key); /* env[varname] */ + } +} + + +static void adjust_assign (LexState *ls, int nvars, int nexps, expdesc *e) { + FuncState *fs = ls->fs; + int extra = nvars - nexps; + if (hasmultret(e->k)) { + extra++; /* includes call itself */ + if (extra < 0) extra = 0; + luaK_setreturns(fs, e, extra); /* last exp. 
provides the difference */ + if (extra > 1) luaK_reserveregs(fs, extra-1); + } + else { + if (e->k != VVOID) luaK_exp2nextreg(fs, e); /* close last expression */ + if (extra > 0) { + int reg = fs->freereg; + luaK_reserveregs(fs, extra); + luaK_nil(fs, reg, extra); + } + } +} + + +static void enterlevel (LexState *ls) { + lua_State *L = ls->L; + ++L->nCcalls; + checklimit(ls->fs, L->nCcalls, LUAI_MAXCCALLS, "C levels"); +} + + +#define leavelevel(ls) ((ls)->L->nCcalls--) + + +static void closegoto (LexState *ls, int g, Labeldesc *label) { + int i; + FuncState *fs = ls->fs; + Labellist *gl = &ls->dyd->gt; + Labeldesc *gt = &gl->arr[g]; + lua_assert(luaS_eqstr(gt->name, label->name)); + if (gt->nactvar < label->nactvar) { + TString *vname = getlocvar(fs, gt->nactvar)->varname; + const char *msg = luaO_pushfstring(ls->L, + "<goto %s> at line %d jumps into the scope of local " LUA_QS, + getstr(gt->name), gt->line, getstr(vname)); + semerror(ls, msg); + } + luaK_patchlist(fs, gt->pc, label->pc); + /* remove goto from pending list */ + for (i = g; i < gl->n - 1; i++) + gl->arr[i] = gl->arr[i + 1]; + gl->n--; +} + + +/* +** try to close a goto with existing labels; this solves backward jumps +*/ +static int findlabel (LexState *ls, int g) { + int i; + BlockCnt *bl = ls->fs->bl; + Dyndata *dyd = ls->dyd; + Labeldesc *gt = &dyd->gt.arr[g]; + /* check labels in current block for a match */ + for (i = bl->firstlabel; i < dyd->label.n; i++) { + Labeldesc *lb = &dyd->label.arr[i]; + if (luaS_eqstr(lb->name, gt->name)) { /* correct label? */ + if (gt->nactvar > lb->nactvar && + (bl->upval || dyd->label.n > bl->firstlabel)) + luaK_patchclose(ls->fs, gt->pc, lb->nactvar); + closegoto(ls, g, lb); /* close it */ + return 1; + } + } + return 0; /* label not found; cannot close goto */ +} + + +static int newlabelentry (LexState *ls, Labellist *l, TString *name, + int line, int pc) { + int n = l->n; + luaM_growvector(ls->L, l->arr, n, l->size, + Labeldesc, SHRT_MAX, "labels/gotos"); + l->arr[n].name = name; + l->arr[n].line = line; + l->arr[n].nactvar = ls->fs->nactvar; + l->arr[n].pc = pc; + l->n++; + return n; +} + + +/* +** check whether new label 'lb' matches any pending gotos in current +** block; solves forward jumps +*/ +static void findgotos (LexState *ls, Labeldesc *lb) { + Labellist *gl = &ls->dyd->gt; + int i = ls->fs->bl->firstgoto; + while (i < gl->n) { + if (luaS_eqstr(gl->arr[i].name, lb->name)) + closegoto(ls, i, lb); + else + i++; + } +} + + +/* +** "export" pending gotos to outer level, to check them against +** outer labels; if the block being exited has upvalues, and +** the goto exits the scope of any variable (which can be the +** upvalue), close those variables being exited.
+*/ +static void movegotosout (FuncState *fs, BlockCnt *bl) { + int i = bl->firstgoto; + Labellist *gl = &fs->ls->dyd->gt; + /* correct pending gotos to current block and try to close it + with visible labels */ + while (i < gl->n) { + Labeldesc *gt = &gl->arr[i]; + if (gt->nactvar > bl->nactvar) { + if (bl->upval) + luaK_patchclose(fs, gt->pc, bl->nactvar); + gt->nactvar = bl->nactvar; + } + if (!findlabel(fs->ls, i)) + i++; /* move to next one */ + } +} + + +static void enterblock (FuncState *fs, BlockCnt *bl, lu_byte isloop) { + bl->isloop = isloop; + bl->nactvar = fs->nactvar; + bl->firstlabel = fs->ls->dyd->label.n; + bl->firstgoto = fs->ls->dyd->gt.n; + bl->upval = 0; + bl->previous = fs->bl; + fs->bl = bl; + lua_assert(fs->freereg == fs->nactvar); +} + + +/* +** create a label named "break" to resolve break statements +*/ +static void breaklabel (LexState *ls) { + TString *n = luaS_new(ls->L, "break"); + int l = newlabelentry(ls, &ls->dyd->label, n, 0, ls->fs->pc); + findgotos(ls, &ls->dyd->label.arr[l]); +} + +/* +** generates an error for an undefined 'goto'; choose appropriate +** message when label name is a reserved word (which can only be 'break') +*/ +static l_noret undefgoto (LexState *ls, Labeldesc *gt) { + const char *msg = isreserved(gt->name) + ? "<%s> at line %d not inside a loop" + : "no visible label " LUA_QS " for <goto> at line %d"; + msg = luaO_pushfstring(ls->L, msg, getstr(gt->name), gt->line); + semerror(ls, msg); +} + + +static void leaveblock (FuncState *fs) { + BlockCnt *bl = fs->bl; + LexState *ls = fs->ls; + if (bl->previous && bl->upval) { + /* create a 'jump to here' to close upvalues */ + int j = luaK_jump(fs); + luaK_patchclose(fs, j, bl->nactvar); + luaK_patchtohere(fs, j); + } + if (bl->isloop) + breaklabel(ls); /* close pending breaks */ + fs->bl = bl->previous; + removevars(fs, bl->nactvar); + lua_assert(bl->nactvar == fs->nactvar); + fs->freereg = fs->nactvar; /* free registers */ + ls->dyd->label.n = bl->firstlabel; /* remove local labels */ + if (bl->previous) /* inner block? */ + movegotosout(fs, bl); /* update pending gotos to outer block */ + else if (bl->firstgoto < ls->dyd->gt.n) /* pending gotos in outer block? */ + undefgoto(ls, &ls->dyd->gt.arr[bl->firstgoto]); /* error */ +} + + +/* +** adds a new prototype into list of prototypes +*/ +static Proto *addprototype (LexState *ls) { + Proto *clp; + lua_State *L = ls->L; + FuncState *fs = ls->fs; + Proto *f = fs->f; /* prototype of current function */ + if (fs->np >= f->sizep) { + int oldsize = f->sizep; + luaM_growvector(L, f->p, fs->np, f->sizep, Proto *, MAXARG_Bx, "functions"); + while (oldsize < f->sizep) f->p[oldsize++] = NULL; + } + f->p[fs->np++] = clp = luaF_newproto(L); + luaC_objbarrier(L, f, clp); + return clp; +} + + +/* +** codes instruction to create new closure in parent function. +** The OP_CLOSURE instruction must use the last available register, +** so that, if it invokes the GC, the GC knows which registers +** are in use at that time.
+*/ +static void codeclosure (LexState *ls, expdesc *v) { + FuncState *fs = ls->fs->prev; + init_exp(v, VRELOCABLE, luaK_codeABx(fs, OP_CLOSURE, 0, fs->np - 1)); + luaK_exp2nextreg(fs, v); /* fix it at the last register */ +} + + +static void open_func (LexState *ls, FuncState *fs, BlockCnt *bl) { + lua_State *L = ls->L; + Proto *f; + fs->prev = ls->fs; /* linked list of funcstates */ + fs->ls = ls; + ls->fs = fs; + fs->pc = 0; + fs->lasttarget = 0; + fs->jpc = NO_JUMP; + fs->freereg = 0; + fs->nk = 0; + fs->np = 0; + fs->nups = 0; + fs->nlocvars = 0; + fs->nactvar = 0; + fs->firstlocal = ls->dyd->actvar.n; + fs->bl = NULL; + f = fs->f; + f->source = ls->source; + f->maxstacksize = 2; /* registers 0/1 are always valid */ + fs->h = luaH_new(L); + /* anchor table of constants (to avoid being collected) */ + sethvalue2s(L, L->top, fs->h); + incr_top(L); + enterblock(fs, bl, 0); +} + + +static void close_func (LexState *ls) { + lua_State *L = ls->L; + FuncState *fs = ls->fs; + Proto *f = fs->f; + luaK_ret(fs, 0, 0); /* final return */ + leaveblock(fs); + luaM_reallocvector(L, f->code, f->sizecode, fs->pc, Instruction); + f->sizecode = fs->pc; + luaM_reallocvector(L, f->lineinfo, f->sizelineinfo, fs->pc, int); + f->sizelineinfo = fs->pc; + luaM_reallocvector(L, f->k, f->sizek, fs->nk, TValue); + f->sizek = fs->nk; + luaM_reallocvector(L, f->p, f->sizep, fs->np, Proto *); + f->sizep = fs->np; + luaM_reallocvector(L, f->locvars, f->sizelocvars, fs->nlocvars, LocVar); + f->sizelocvars = fs->nlocvars; + luaM_reallocvector(L, f->upvalues, f->sizeupvalues, fs->nups, Upvaldesc); + f->sizeupvalues = fs->nups; + lua_assert(fs->bl == NULL); + ls->fs = fs->prev; + /* last token read was anchored in defunct function; must re-anchor it */ + anchor_token(ls); + L->top--; /* pop table of constants */ + luaC_checkGC(L); +} + + + +/*============================================================*/ +/* GRAMMAR RULES */ +/*============================================================*/ + + +/* +** check whether current token is in the follow set of a block. +** 'until' closes syntactical blocks, but does not close scope, +** so it is handled separately. +*/ +static int block_follow (LexState *ls, int withuntil) { + switch (ls->t.token) { + case TK_ELSE: case TK_ELSEIF: + case TK_END: case TK_EOS: + return 1; + case TK_UNTIL: return withuntil; + default: return 0; + } +} + + +/* + * by inlining statlist() and test_then_block() we cut back the + * native stack usage per nested C call from 272 bytes to 152 + * which allows us to stay within budget for 8K kernel stacks + */ +__attribute__((always_inline)) inline +static void statlist (LexState *ls) { + /* statlist -> { stat [`;'] } */ + while (!block_follow(ls, 1)) { + if (ls->t.token == TK_RETURN) { + statement(ls); + return; /* 'return' must be last statement */ + } + statement(ls); + } +} + + +static void fieldsel (LexState *ls, expdesc *v) { + /* fieldsel -> ['.'
| ':'] NAME */ + FuncState *fs = ls->fs; + expdesc key; + luaK_exp2anyregup(fs, v); + luaX_next(ls); /* skip the dot or colon */ + checkname(ls, &key); + luaK_indexed(fs, v, &key); +} + + +static void yindex (LexState *ls, expdesc *v) { + /* index -> '[' expr ']' */ + luaX_next(ls); /* skip the '[' */ + expr(ls, v); + luaK_exp2val(ls->fs, v); + checknext(ls, ']'); +} + + +/* +** {====================================================================== +** Rules for Constructors +** ======================================================================= +*/ + + +struct ConsControl { + expdesc v; /* last list item read */ + expdesc *t; /* table descriptor */ + int nh; /* total number of `record' elements */ + int na; /* total number of array elements */ + int tostore; /* number of array elements pending to be stored */ +}; + + +static void recfield (LexState *ls, struct ConsControl *cc) { + /* recfield -> (NAME | `['exp1`]') = exp1 */ + FuncState *fs = ls->fs; + int reg = ls->fs->freereg; + expdesc key, val; + int rkkey; + if (ls->t.token == TK_NAME) { + checklimit(fs, cc->nh, MAX_INT, "items in a constructor"); + checkname(ls, &key); + } + else /* ls->t.token == '[' */ + yindex(ls, &key); + cc->nh++; + checknext(ls, '='); + rkkey = luaK_exp2RK(fs, &key); + expr(ls, &val); + luaK_codeABC(fs, OP_SETTABLE, cc->t->u.info, rkkey, luaK_exp2RK(fs, &val)); + fs->freereg = reg; /* free registers */ +} + + +static void closelistfield (FuncState *fs, struct ConsControl *cc) { + if (cc->v.k == VVOID) return; /* there is no list item */ + luaK_exp2nextreg(fs, &cc->v); + cc->v.k = VVOID; + if (cc->tostore == LFIELDS_PER_FLUSH) { + luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); /* flush */ + cc->tostore = 0; /* no more items pending */ + } +} + + +static void lastlistfield (FuncState *fs, struct ConsControl *cc) { + if (cc->tostore == 0) return; + if (hasmultret(cc->v.k)) { + luaK_setmultret(fs, &cc->v); + luaK_setlist(fs, cc->t->u.info, cc->na, LUA_MULTRET); + cc->na--; /* do not count last expression (unknown number of elements) */ + } + else { + if (cc->v.k != VVOID) + luaK_exp2nextreg(fs, &cc->v); + luaK_setlist(fs, cc->t->u.info, cc->na, cc->tostore); + } +} + + +static void listfield (LexState *ls, struct ConsControl *cc) { + /* listfield -> exp */ + expr(ls, &cc->v); + checklimit(ls->fs, cc->na, MAX_INT, "items in a constructor"); + cc->na++; + cc->tostore++; +} + + +static void field (LexState *ls, struct ConsControl *cc) { + /* field -> listfield | recfield */ + switch(ls->t.token) { + case TK_NAME: { /* may be 'listfield' or 'recfield' */ + if (luaX_lookahead(ls) != '=') /* expression? 
*/ + listfield(ls, cc); + else + recfield(ls, cc); + break; + } + case '[': { + recfield(ls, cc); + break; + } + default: { + listfield(ls, cc); + break; + } + } +} + + +static void constructor (LexState *ls, expdesc *t) { + /* constructor -> '{' [ field { sep field } [sep] ] '}' + sep -> ',' | ';' */ + FuncState *fs = ls->fs; + int line = ls->linenumber; + int pc = luaK_codeABC(fs, OP_NEWTABLE, 0, 0, 0); + struct ConsControl cc; + cc.na = cc.nh = cc.tostore = 0; + cc.t = t; + init_exp(t, VRELOCABLE, pc); + init_exp(&cc.v, VVOID, 0); /* no value (yet) */ + luaK_exp2nextreg(ls->fs, t); /* fix it at stack top */ + checknext(ls, '{'); + do { + lua_assert(cc.v.k == VVOID || cc.tostore > 0); + if (ls->t.token == '}') break; + closelistfield(fs, &cc); + field(ls, &cc); + } while (testnext(ls, ',') || testnext(ls, ';')); + check_match(ls, '}', '{', line); + lastlistfield(fs, &cc); + SETARG_B(fs->f->code[pc], luaO_int2fb(cc.na)); /* set initial array size */ + SETARG_C(fs->f->code[pc], luaO_int2fb(cc.nh)); /* set initial table size */ +} + +/* }====================================================================== */ + + + +static void parlist (LexState *ls) { + /* parlist -> [ param { `,' param } ] */ + FuncState *fs = ls->fs; + Proto *f = fs->f; + int nparams = 0; + f->is_vararg = 0; + if (ls->t.token != ')') { /* is `parlist' not empty? */ + do { + switch (ls->t.token) { + case TK_NAME: { /* param -> NAME */ + new_localvar(ls, str_checkname(ls)); + nparams++; + break; + } + case TK_DOTS: { /* param -> `...' */ + luaX_next(ls); + f->is_vararg = 1; + break; + } + default: luaX_syntaxerror(ls, "<name> or " LUA_QL("...") " expected"); + } + } while (!f->is_vararg && testnext(ls, ',')); + } + adjustlocalvars(ls, nparams); + f->numparams = cast_byte(fs->nactvar); + luaK_reserveregs(fs, fs->nactvar); /* reserve register for parameters */ +} + + +static void body (LexState *ls, expdesc *e, int ismethod, int line) { + /* body -> `(' parlist `)' block END */ + FuncState new_fs; + BlockCnt bl; + new_fs.f = addprototype(ls); + new_fs.f->linedefined = line; + open_func(ls, &new_fs, &bl); + checknext(ls, '('); + if (ismethod) { + new_localvarliteral(ls, "self"); /* create 'self' parameter */ + adjustlocalvars(ls, 1); + } + parlist(ls); + checknext(ls, ')'); + statlist(ls); + new_fs.f->lastlinedefined = ls->linenumber; + check_match(ls, TK_END, TK_FUNCTION, line); + codeclosure(ls, e); + close_func(ls); +} + + +static int explist (LexState *ls, expdesc *v) { + /* explist -> expr { `,' expr } */ + int n = 1; /* at least one expression */ + expr(ls, v); + while (testnext(ls, ',')) { + luaK_exp2nextreg(ls->fs, v); + expr(ls, v); + n++; + } + return n; +} + + +static void funcargs (LexState *ls, expdesc *f, int line) { + FuncState *fs = ls->fs; + expdesc args; + int base, nparams; + switch (ls->t.token) { + case '(': { /* funcargs -> `(' [ explist ] `)' */ + luaX_next(ls); + if (ls->t.token == ')') /* arg list is empty?
*/ + args.k = VVOID; + else { + explist(ls, &args); + luaK_setmultret(fs, &args); + } + check_match(ls, ')', '(', line); + break; + } + case '{': { /* funcargs -> constructor */ + constructor(ls, &args); + break; + } + case TK_STRING: { /* funcargs -> STRING */ + codestring(ls, &args, ls->t.seminfo.ts); + luaX_next(ls); /* must use `seminfo' before `next' */ + break; + } + default: { + luaX_syntaxerror(ls, "function arguments expected"); + } + } + lua_assert(f->k == VNONRELOC); + base = f->u.info; /* base register for call */ + if (hasmultret(args.k)) + nparams = LUA_MULTRET; /* open call */ + else { + if (args.k != VVOID) + luaK_exp2nextreg(fs, &args); /* close last argument */ + nparams = fs->freereg - (base+1); + } + init_exp(f, VCALL, luaK_codeABC(fs, OP_CALL, base, nparams+1, 2)); + luaK_fixline(fs, line); + fs->freereg = base+1; /* call removes function and arguments and leaves + (unless changed) one result */ +} + + + + +/* +** {====================================================================== +** Expression parsing +** ======================================================================= +*/ + + +static void primaryexp (LexState *ls, expdesc *v) { + /* primaryexp -> NAME | '(' expr ')' */ + switch (ls->t.token) { + case '(': { + int line = ls->linenumber; + luaX_next(ls); + expr(ls, v); + check_match(ls, ')', '(', line); + luaK_dischargevars(ls->fs, v); + return; + } + case TK_NAME: { + singlevar(ls, v); + return; + } + default: { + luaX_syntaxerror(ls, "unexpected symbol"); + } + } +} + + +static void suffixedexp (LexState *ls, expdesc *v) { + /* suffixedexp -> + primaryexp { '.' NAME | '[' exp ']' | ':' NAME funcargs | funcargs } */ + FuncState *fs = ls->fs; + int line = ls->linenumber; + primaryexp(ls, v); + for (;;) { + switch (ls->t.token) { + case '.': { /* fieldsel */ + fieldsel(ls, v); + break; + } + case '[': { /* `[' exp1 `]' */ + expdesc key; + luaK_exp2anyregup(fs, v); + yindex(ls, &key); + luaK_indexed(fs, v, &key); + break; + } + case ':': { /* `:' NAME funcargs */ + expdesc key; + luaX_next(ls); + checkname(ls, &key); + luaK_self(fs, v, &key); + funcargs(ls, v, line); + break; + } + case '(': case TK_STRING: case '{': { /* funcargs */ + luaK_exp2nextreg(fs, v); + funcargs(ls, v, line); + break; + } + default: return; + } + } +} + + +static void simpleexp (LexState *ls, expdesc *v) { + /* simpleexp -> NUMBER | STRING | NIL | TRUE | FALSE | ...
| + constructor | FUNCTION body | suffixedexp */ + switch (ls->t.token) { + case TK_NUMBER: { + init_exp(v, VKNUM, 0); + v->u.nval = ls->t.seminfo.r; + break; + } + case TK_STRING: { + codestring(ls, v, ls->t.seminfo.ts); + break; + } + case TK_NIL: { + init_exp(v, VNIL, 0); + break; + } + case TK_TRUE: { + init_exp(v, VTRUE, 0); + break; + } + case TK_FALSE: { + init_exp(v, VFALSE, 0); + break; + } + case TK_DOTS: { /* vararg */ + FuncState *fs = ls->fs; + check_condition(ls, fs->f->is_vararg, + "cannot use " LUA_QL("...") " outside a vararg function"); + init_exp(v, VVARARG, luaK_codeABC(fs, OP_VARARG, 0, 1, 0)); + break; + } + case '{': { /* constructor */ + constructor(ls, v); + return; + } + case TK_FUNCTION: { + luaX_next(ls); + body(ls, v, 0, ls->linenumber); + return; + } + default: { + suffixedexp(ls, v); + return; + } + } + luaX_next(ls); +} + + +static UnOpr getunopr (int op) { + switch (op) { + case TK_NOT: return OPR_NOT; + case '-': return OPR_MINUS; + case '#': return OPR_LEN; + default: return OPR_NOUNOPR; + } +} + + +static BinOpr getbinopr (int op) { + switch (op) { + case '+': return OPR_ADD; + case '-': return OPR_SUB; + case '*': return OPR_MUL; + case '/': return OPR_DIV; + case '%': return OPR_MOD; + case '^': return OPR_POW; + case TK_CONCAT: return OPR_CONCAT; + case TK_NE: return OPR_NE; + case TK_EQ: return OPR_EQ; + case '<': return OPR_LT; + case TK_LE: return OPR_LE; + case '>': return OPR_GT; + case TK_GE: return OPR_GE; + case TK_AND: return OPR_AND; + case TK_OR: return OPR_OR; + default: return OPR_NOBINOPR; + } +} + + +static const struct { + lu_byte left; /* left priority for each binary operator */ + lu_byte right; /* right priority */ +} priority[] = { /* ORDER OPR */ + {6, 6}, {6, 6}, {7, 7}, {7, 7}, {7, 7}, /* `+' `-' `*' `/' `%' */ + {10, 9}, {5, 4}, /* ^, .. 
(right associative) */ + {3, 3}, {3, 3}, {3, 3}, /* ==, <, <= */ + {3, 3}, {3, 3}, {3, 3}, /* ~=, >, >= */ + {2, 2}, {1, 1} /* and, or */ +}; + +#define UNARY_PRIORITY 8 /* priority for unary operators */ + + +/* +** subexpr -> (simpleexp | unop subexpr) { binop subexpr } +** where `binop' is any binary operator with a priority higher than `limit' +*/ +static BinOpr subexpr (LexState *ls, expdesc *v, int limit) { + BinOpr op; + UnOpr uop; + enterlevel(ls); + uop = getunopr(ls->t.token); + if (uop != OPR_NOUNOPR) { + int line = ls->linenumber; + luaX_next(ls); + subexpr(ls, v, UNARY_PRIORITY); + luaK_prefix(ls->fs, uop, v, line); + } + else simpleexp(ls, v); + /* expand while operators have priorities higher than `limit' */ + op = getbinopr(ls->t.token); + while (op != OPR_NOBINOPR && priority[op].left > limit) { + expdesc v2; + BinOpr nextop; + int line = ls->linenumber; + luaX_next(ls); + luaK_infix(ls->fs, op, v); + /* read sub-expression with higher priority */ + nextop = subexpr(ls, &v2, priority[op].right); + luaK_posfix(ls->fs, op, v, &v2, line); + op = nextop; + } + leavelevel(ls); + return op; /* return first untreated operator */ +} + + +static void expr (LexState *ls, expdesc *v) { + subexpr(ls, v, 0); +} + +/* }==================================================================== */ + + + +/* +** {====================================================================== +** Rules for Statements +** ======================================================================= +*/ + + +static void block (LexState *ls) { + /* block -> statlist */ + FuncState *fs = ls->fs; + BlockCnt bl; + enterblock(fs, &bl, 0); + statlist(ls); + leaveblock(fs); +} + + +/* +** structure to chain all variables in the left-hand side of an +** assignment +*/ +struct LHS_assign { + struct LHS_assign *prev; + expdesc v; /* variable (global, local, upvalue, or indexed) */ +}; + + +/* +** check whether, in an assignment to an upvalue/local variable, the +** upvalue/local variable is being used in a previous assignment to a +** table. If so, save original upvalue/local value in a safe place and +** use this safe copy in the previous assignment. +*/ +static void check_conflict (LexState *ls, struct LHS_assign *lh, expdesc *v) { + FuncState *fs = ls->fs; + int extra = fs->freereg; /* eventual position to save local variable */ + int conflict = 0; + for (; lh; lh = lh->prev) { /* check all previous assignments */ + if (lh->v.k == VINDEXED) { /* assigning to a table? */ + /* table is the upvalue/local being assigned now? */ + if (lh->v.u.ind.vt == v->k && lh->v.u.ind.t == v->u.info) { + conflict = 1; + lh->v.u.ind.vt = VLOCAL; + lh->v.u.ind.t = extra; /* previous assignment will use safe copy */ + } + /* index is the local being assigned? (index cannot be upvalue) */ + if (v->k == VLOCAL && lh->v.u.ind.idx == v->u.info) { + conflict = 1; + lh->v.u.ind.idx = extra; /* previous assignment will use safe copy */ + } + } + } + if (conflict) { + /* copy upvalue/local value to a temporary (in position 'extra') */ + OpCode op = (v->k == VLOCAL) ?
OP_MOVE : OP_GETUPVAL; + luaK_codeABC(fs, op, extra, v->u.info, 0); + luaK_reserveregs(fs, 1); + } +} + + +static void assignment (LexState *ls, struct LHS_assign *lh, int nvars) { + expdesc e; + check_condition(ls, vkisvar(lh->v.k), "syntax error"); + if (testnext(ls, ',')) { /* assignment -> ',' suffixedexp assignment */ + struct LHS_assign nv; + nv.prev = lh; + suffixedexp(ls, &nv.v); + if (nv.v.k != VINDEXED) + check_conflict(ls, lh, &nv.v); + checklimit(ls->fs, nvars + ls->L->nCcalls, LUAI_MAXCCALLS, + "C levels"); + assignment(ls, &nv, nvars+1); + } + else { /* assignment -> `=' explist */ + int nexps; + checknext(ls, '='); + nexps = explist(ls, &e); + if (nexps != nvars) { + adjust_assign(ls, nvars, nexps, &e); + if (nexps > nvars) + ls->fs->freereg -= nexps - nvars; /* remove extra values */ + } + else { + luaK_setoneret(ls->fs, &e); /* close last expression */ + luaK_storevar(ls->fs, &lh->v, &e); + return; /* avoid default */ + } + } + init_exp(&e, VNONRELOC, ls->fs->freereg-1); /* default assignment */ + luaK_storevar(ls->fs, &lh->v, &e); +} + + +static int cond (LexState *ls) { + /* cond -> exp */ + expdesc v; + expr(ls, &v); /* read condition */ + if (v.k == VNIL) v.k = VFALSE; /* `falses' are all equal here */ + luaK_goiftrue(ls->fs, &v); + return v.f; +} + + +static void gotostat (LexState *ls, int pc) { + int line = ls->linenumber; + TString *label; + int g; + if (testnext(ls, TK_GOTO)) + label = str_checkname(ls); + else { + luaX_next(ls); /* skip break */ + label = luaS_new(ls->L, "break"); + } + g = newlabelentry(ls, &ls->dyd->gt, label, line, pc); + findlabel(ls, g); /* close it if label already defined */ +} + + +/* check for repeated labels on the same block */ +static void checkrepeated (FuncState *fs, Labellist *ll, TString *label) { + int i; + for (i = fs->bl->firstlabel; i < ll->n; i++) { + if (luaS_eqstr(label, ll->arr[i].name)) { + const char *msg = luaO_pushfstring(fs->ls->L, + "label " LUA_QS " already defined on line %d", + getstr(label), ll->arr[i].line); + semerror(fs->ls, msg); + } + } +} + + +/* skip no-op statements */ +static void skipnoopstat (LexState *ls) { + while (ls->t.token == ';' || ls->t.token == TK_DBCOLON) + statement(ls); +} + + +static void labelstat (LexState *ls, TString *label, int line) { + /* label -> '::' NAME '::' */ + FuncState *fs = ls->fs; + Labellist *ll = &ls->dyd->label; + int l; /* index of new label being created */ + checkrepeated(fs, ll, label); /* check for repeated labels */ + checknext(ls, TK_DBCOLON); /* skip double colon */ + /* create new entry for this label */ + l = newlabelentry(ls, ll, label, line, fs->pc); + skipnoopstat(ls); /* skip other no-op statements */ + if (block_follow(ls, 0)) { /* label is last no-op statement in the block? 
*/ + /* assume that locals are already out of scope */ + ll->arr[l].nactvar = fs->bl->nactvar; + } + findgotos(ls, &ll->arr[l]); +} + + +static void whilestat (LexState *ls, int line) { + /* whilestat -> WHILE cond DO block END */ + FuncState *fs = ls->fs; + int whileinit; + int condexit; + BlockCnt bl; + luaX_next(ls); /* skip WHILE */ + whileinit = luaK_getlabel(fs); + condexit = cond(ls); + enterblock(fs, &bl, 1); + checknext(ls, TK_DO); + block(ls); + luaK_jumpto(fs, whileinit); + check_match(ls, TK_END, TK_WHILE, line); + leaveblock(fs); + luaK_patchtohere(fs, condexit); /* false conditions finish the loop */ +} + + +static void repeatstat (LexState *ls, int line) { + /* repeatstat -> REPEAT block UNTIL cond */ + int condexit; + FuncState *fs = ls->fs; + int repeat_init = luaK_getlabel(fs); + BlockCnt bl1, bl2; + enterblock(fs, &bl1, 1); /* loop block */ + enterblock(fs, &bl2, 0); /* scope block */ + luaX_next(ls); /* skip REPEAT */ + statlist(ls); + check_match(ls, TK_UNTIL, TK_REPEAT, line); + condexit = cond(ls); /* read condition (inside scope block) */ + if (bl2.upval) /* upvalues? */ + luaK_patchclose(fs, condexit, bl2.nactvar); + leaveblock(fs); /* finish scope */ + luaK_patchlist(fs, condexit, repeat_init); /* close the loop */ + leaveblock(fs); /* finish loop */ +} + + +static int exp1 (LexState *ls) { + expdesc e; + int reg; + expr(ls, &e); + luaK_exp2nextreg(ls->fs, &e); + lua_assert(e.k == VNONRELOC); + reg = e.u.info; + return reg; +} + + +static void forbody (LexState *ls, int base, int line, int nvars, int isnum) { + /* forbody -> DO block */ + BlockCnt bl; + FuncState *fs = ls->fs; + int prep, endfor; + adjustlocalvars(ls, 3); /* control variables */ + checknext(ls, TK_DO); + prep = isnum ? luaK_codeAsBx(fs, OP_FORPREP, base, NO_JUMP) : luaK_jump(fs); + enterblock(fs, &bl, 0); /* scope for declared variables */ + adjustlocalvars(ls, nvars); + luaK_reserveregs(fs, nvars); + block(ls); + leaveblock(fs); /* end of scope for declared variables */ + luaK_patchtohere(fs, prep); + if (isnum) /* numeric for? 
*/ + endfor = luaK_codeAsBx(fs, OP_FORLOOP, base, NO_JUMP); + else { /* generic for */ + luaK_codeABC(fs, OP_TFORCALL, base, 0, nvars); + luaK_fixline(fs, line); + endfor = luaK_codeAsBx(fs, OP_TFORLOOP, base + 2, NO_JUMP); + } + luaK_patchlist(fs, endfor, prep + 1); + luaK_fixline(fs, line); +} + + +static void fornum (LexState *ls, TString *varname, int line) { + /* fornum -> NAME = exp1,exp1[,exp1] forbody */ + FuncState *fs = ls->fs; + int base = fs->freereg; + new_localvarliteral(ls, "(for index)"); + new_localvarliteral(ls, "(for limit)"); + new_localvarliteral(ls, "(for step)"); + new_localvar(ls, varname); + checknext(ls, '='); + exp1(ls); /* initial value */ + checknext(ls, ','); + exp1(ls); /* limit */ + if (testnext(ls, ',')) + exp1(ls); /* optional step */ + else { /* default step = 1 */ + luaK_codek(fs, fs->freereg, luaK_numberK(fs, 1)); + luaK_reserveregs(fs, 1); + } + forbody(ls, base, line, 1, 1); +} + + +static void forlist (LexState *ls, TString *indexname) { + /* forlist -> NAME {,NAME} IN explist forbody */ + FuncState *fs = ls->fs; + expdesc e; + int nvars = 4; /* gen, state, control, plus at least one declared var */ + int line; + int base = fs->freereg; + /* create control variables */ + new_localvarliteral(ls, "(for generator)"); + new_localvarliteral(ls, "(for state)"); + new_localvarliteral(ls, "(for control)"); + /* create declared variables */ + new_localvar(ls, indexname); + while (testnext(ls, ',')) { + new_localvar(ls, str_checkname(ls)); + nvars++; + } + checknext(ls, TK_IN); + line = ls->linenumber; + adjust_assign(ls, 3, explist(ls, &e), &e); + luaK_checkstack(fs, 3); /* extra space to call generator */ + forbody(ls, base, line, nvars - 3, 0); +} + + +static void forstat (LexState *ls, int line) { + /* forstat -> FOR (fornum | forlist) END */ + FuncState *fs = ls->fs; + TString *varname; + BlockCnt bl; + enterblock(fs, &bl, 1); /* scope for loop and control variables */ + luaX_next(ls); /* skip `for' */ + varname = str_checkname(ls); /* first variable name */ + switch (ls->t.token) { + case '=': fornum(ls, varname, line); break; + case ',': case TK_IN: forlist(ls, varname); break; + default: luaX_syntaxerror(ls, LUA_QL("=") " or " LUA_QL("in") " expected"); + } + check_match(ls, TK_END, TK_FOR, line); + leaveblock(fs); /* loop scope (`break' jumps to this point) */ +} + + +__attribute__((always_inline)) inline +static void test_then_block (LexState *ls, int *escapelist) { + /* test_then_block -> [IF | ELSEIF] cond THEN block */ + BlockCnt bl; + FuncState *fs = ls->fs; + expdesc v; + int jf; /* instruction to skip 'then' code (if condition is false) */ + luaX_next(ls); /* skip IF or ELSEIF */ + expr(ls, &v); /* read condition */ + checknext(ls, TK_THEN); + if (ls->t.token == TK_GOTO || ls->t.token == TK_BREAK) { + luaK_goiffalse(ls->fs, &v); /* will jump to label if condition is true */ + enterblock(fs, &bl, 0); /* must enter block before 'goto' */ + gotostat(ls, v.t); /* handle goto/break */ + skipnoopstat(ls); /* skip other no-op statements */ + if (block_follow(ls, 0)) { /* 'goto' is the entire block? */ + leaveblock(fs); + return; /* and that is it */ + } + else /* must skip over 'then' part if condition is false */ + jf = luaK_jump(fs); + } + else { /* regular case (not goto/break) */ + luaK_goiftrue(ls->fs, &v); /* skip over block if condition is false */ + enterblock(fs, &bl, 0); + jf = v.f; + } + statlist(ls); /* `then' part */ + leaveblock(fs); + if (ls->t.token == TK_ELSE || + ls->t.token == TK_ELSEIF) /* followed by 'else'/'elseif'? 
*/ + luaK_concat(fs, escapelist, luaK_jump(fs)); /* must jump over it */ + luaK_patchtohere(fs, jf); +} + + +static void ifstat (LexState *ls, int line) { + /* ifstat -> IF cond THEN block {ELSEIF cond THEN block} [ELSE block] END */ + FuncState *fs = ls->fs; + int escapelist = NO_JUMP; /* exit list for finished parts */ + test_then_block(ls, &escapelist); /* IF cond THEN block */ + while (ls->t.token == TK_ELSEIF) + test_then_block(ls, &escapelist); /* ELSEIF cond THEN block */ + if (testnext(ls, TK_ELSE)) + block(ls); /* `else' part */ + check_match(ls, TK_END, TK_IF, line); + luaK_patchtohere(fs, escapelist); /* patch escape list to 'if' end */ +} + + +static void localfunc (LexState *ls) { + expdesc b; + FuncState *fs = ls->fs; + new_localvar(ls, str_checkname(ls)); /* new local variable */ + adjustlocalvars(ls, 1); /* enter its scope */ + body(ls, &b, 0, ls->linenumber); /* function created in next register */ + /* debug information will only see the variable after this point! */ + getlocvar(fs, b.u.info)->startpc = fs->pc; +} + + +static void localstat (LexState *ls) { + /* stat -> LOCAL NAME {`,' NAME} [`=' explist] */ + int nvars = 0; + int nexps; + expdesc e; + do { + new_localvar(ls, str_checkname(ls)); + nvars++; + } while (testnext(ls, ',')); + if (testnext(ls, '=')) + nexps = explist(ls, &e); + else { + e.k = VVOID; + nexps = 0; + } + adjust_assign(ls, nvars, nexps, &e); + adjustlocalvars(ls, nvars); +} + + +static int funcname (LexState *ls, expdesc *v) { + /* funcname -> NAME {fieldsel} [`:' NAME] */ + int ismethod = 0; + singlevar(ls, v); + while (ls->t.token == '.') + fieldsel(ls, v); + if (ls->t.token == ':') { + ismethod = 1; + fieldsel(ls, v); + } + return ismethod; +} + + +static void funcstat (LexState *ls, int line) { + /* funcstat -> FUNCTION funcname body */ + int ismethod; + expdesc v, b; + luaX_next(ls); /* skip FUNCTION */ + ismethod = funcname(ls, &v); + body(ls, &b, ismethod, line); + luaK_storevar(ls->fs, &v, &b); + luaK_fixline(ls->fs, line); /* definition `happens' in the first line */ +} + + +static void exprstat (LexState *ls) { + /* stat -> func | assignment */ + FuncState *fs = ls->fs; + struct LHS_assign v; + suffixedexp(ls, &v.v); + if (ls->t.token == '=' || ls->t.token == ',') { /* stat -> assignment ? */ + v.prev = NULL; + assignment(ls, &v, 1); + } + else { /* stat -> func */ + check_condition(ls, v.v.k == VCALL, "syntax error"); + SETARG_C(getcode(fs, &v.v), 1); /* call statement uses no results */ + } +} + + +static void retstat (LexState *ls) { + /* stat -> RETURN [explist] [';'] */ + FuncState *fs = ls->fs; + expdesc e; + int first, nret; /* registers with returned values */ + if (block_follow(ls, 1) || ls->t.token == ';') + first = nret = 0; /* return no values */ + else { + nret = explist(ls, &e); /* optional return values */ + if (hasmultret(e.k)) { + luaK_setmultret(fs, &e); + if (e.k == VCALL && nret == 1) { /* tail call? */ + SET_OPCODE(getcode(fs,&e), OP_TAILCALL); + lua_assert(GETARG_A(getcode(fs,&e)) == fs->nactvar); + } + first = fs->nactvar; + nret = LUA_MULTRET; /* return all values */ + } + else { + if (nret == 1) /* only one single value? 
*/ + first = luaK_exp2anyreg(fs, &e); + else { + luaK_exp2nextreg(fs, &e); /* values must go to the `stack' */ + first = fs->nactvar; /* return all `active' values */ + lua_assert(nret == fs->freereg - first); + } + } + } + luaK_ret(fs, first, nret); + (void) testnext(ls, ';'); /* skip optional semicolon */ +} + + +static void statement (LexState *ls) { + int line = ls->linenumber; /* may be needed for error messages */ + enterlevel(ls); + switch (ls->t.token) { + case ';': { /* stat -> ';' (empty statement) */ + luaX_next(ls); /* skip ';' */ + break; + } + case TK_IF: { /* stat -> ifstat */ + ifstat(ls, line); + break; + } + case TK_WHILE: { /* stat -> whilestat */ + whilestat(ls, line); + break; + } + case TK_DO: { /* stat -> DO block END */ + luaX_next(ls); /* skip DO */ + block(ls); + check_match(ls, TK_END, TK_DO, line); + break; + } + case TK_FOR: { /* stat -> forstat */ + forstat(ls, line); + break; + } + case TK_REPEAT: { /* stat -> repeatstat */ + repeatstat(ls, line); + break; + } + case TK_FUNCTION: { /* stat -> funcstat */ + funcstat(ls, line); + break; + } + case TK_LOCAL: { /* stat -> localstat */ + luaX_next(ls); /* skip LOCAL */ + if (testnext(ls, TK_FUNCTION)) /* local function? */ + localfunc(ls); + else + localstat(ls); + break; + } + case TK_DBCOLON: { /* stat -> label */ + luaX_next(ls); /* skip double colon */ + labelstat(ls, str_checkname(ls), line); + break; + } + case TK_RETURN: { /* stat -> retstat */ + luaX_next(ls); /* skip RETURN */ + retstat(ls); + break; + } + case TK_BREAK: /* stat -> breakstat */ + case TK_GOTO: { /* stat -> 'goto' NAME */ + gotostat(ls, luaK_jump(ls->fs)); + break; + } + default: { /* stat -> func | assignment */ + exprstat(ls); + break; + } + } + lua_assert(ls->fs->f->maxstacksize >= ls->fs->freereg && + ls->fs->freereg >= ls->fs->nactvar); + ls->fs->freereg = ls->fs->nactvar; /* free registers */ + leavelevel(ls); +} + +/* }====================================================================== */ + + +/* +** compiles the main function, which is a regular vararg function with an +** upvalue named LUA_ENV +*/ +static void mainfunc (LexState *ls, FuncState *fs) { + BlockCnt bl; + expdesc v; + open_func(ls, fs, &bl); + fs->f->is_vararg = 1; /* main function is always vararg */ + init_exp(&v, VLOCAL, 0); /* create and... 
*/ + newupvalue(fs, ls->envn, &v); /* ...set environment upvalue */ + luaX_next(ls); /* read first token */ + statlist(ls); /* parse main body */ + check(ls, TK_EOS); + close_func(ls); +} + + +Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, + Dyndata *dyd, const char *name, int firstchar) { + LexState lexstate; + FuncState funcstate; + Closure *cl = luaF_newLclosure(L, 1); /* create main closure */ + /* anchor closure (to avoid being collected) */ + setclLvalue(L, L->top, cl); + incr_top(L); + funcstate.f = cl->l.p = luaF_newproto(L); + funcstate.f->source = luaS_new(L, name); /* create and anchor TString */ + lexstate.buff = buff; + lexstate.dyd = dyd; + dyd->actvar.n = dyd->gt.n = dyd->label.n = 0; + luaX_setinput(L, &lexstate, z, funcstate.f->source, firstchar); + mainfunc(&lexstate, &funcstate); + lua_assert(!funcstate.prev && funcstate.nups == 1 && !lexstate.fs); + /* all scopes should be correctly finished */ + lua_assert(dyd->actvar.n == 0 && dyd->gt.n == 0 && dyd->label.n == 0); + return cl; /* it's on the stack too */ +} +/* END CSTYLED */ diff --git a/module/lua/lparser.h b/module/lua/lparser.h new file mode 100644 index 000000000000..8aea0523f3e3 --- /dev/null +++ b/module/lua/lparser.h @@ -0,0 +1,121 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lparser.h,v 1.70.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua Parser +** See Copyright Notice in lua.h +*/ + +#ifndef lparser_h +#define lparser_h + +#include "llimits.h" +#include "lobject.h" +#include "lzio.h" + + +/* +** Expression descriptor +*/ + +typedef enum { + VVOID, /* no value */ + VNIL, + VTRUE, + VFALSE, + VK, /* info = index of constant in `k' */ + VKNUM, /* nval = numerical value */ + VNONRELOC, /* info = result register */ + VLOCAL, /* info = local register */ + VUPVAL, /* info = index of upvalue in 'upvalues' */ + VINDEXED, /* t = table register/upvalue; idx = index R/K */ + VJMP, /* info = instruction pc */ + VRELOCABLE, /* info = instruction pc */ + VCALL, /* info = instruction pc */ + VVARARG /* info = instruction pc */ +} expkind; + + +#define vkisvar(k) (VLOCAL <= (k) && (k) <= VINDEXED) +#define vkisinreg(k) ((k) == VNONRELOC || (k) == VLOCAL) + +typedef struct expdesc { + expkind k; + union { + struct { /* for indexed variables (VINDEXED) */ + short idx; /* index (R/K) */ + lu_byte t; /* table (register or upvalue) */ + lu_byte vt; /* whether 't' is register (VLOCAL) or upvalue (VUPVAL) */ + } ind; + int info; /* for generic use */ + lua_Number nval; /* for VKNUM */ + } u; + int t; /* patch list of `exit when true' */ + int f; /* patch list of `exit when false' */ +} expdesc; + + +/* description of active local variable */ +typedef struct Vardesc { + short idx; /* variable index in stack */ +} Vardesc; + + +/* description of pending goto statements and label statements */ +typedef struct Labeldesc { + TString *name; /* label identifier */ + int pc; /* position in code */ + int line; /* line where it appeared */ + lu_byte nactvar; /* local level where it appears in current block */ +} Labeldesc; + + +/* list of labels or gotos */ +typedef struct Labellist { + Labeldesc *arr; /* array */ + int n; /* number of entries in use */ + int size; /* array size */ +} Labellist; + + +/* dynamic structures used by the parser */ +typedef struct Dyndata { + struct { /* list of active local variables */ + Vardesc *arr; + int n; + int size; + } actvar; + Labellist gt; /* list of pending gotos */ + Labellist label; /* list of active labels */ +} Dyndata; + + +/* control of blocks */ +struct BlockCnt; /* defined in lparser.c */ 
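An aside on the technique (an editorial illustration, not part of the imported sources): subexpr() above is a textbook precedence-climbing parser. It reads one operand, then, while the upcoming binary operator binds tighter than the caller's limit, it consumes the operator and recurses with that operator's right priority; right-associative operators (^ and ..) simply carry a right priority lower than their left one in priority[]. The hypothetical standalone sketch below (parse_expr, lbp, rbp and the digits-plus-'+'/'*' grammar are all made up for the example) shows the same loop in miniature:

/*
 * Illustrative only -- not from this import.  A minimal
 * precedence-climbing evaluator; parse_expr() mirrors the loop in
 * subexpr(), with lbp()/rbp() standing in for priority[].left/.right.
 */
#include <stdio.h>
#include <ctype.h>

static const char *src;  /* cursor into the expression being parsed */

static int peek(void) {
  while (isspace((unsigned char)*src)) src++;
  return (unsigned char)*src;
}

static long parse_expr(int limit);  /* forward declaration */

static long parse_primary(void) {  /* plays the role of simpleexp() */
  long v = 0;
  while (isdigit(peek())) v = v * 10 + (*src++ - '0');
  return v;
}

/* left/right binding powers, analogous to priority[].left/.right */
static int lbp(int op) { return op == '+' ? 6 : op == '*' ? 7 : 0; }
static int rbp(int op) { return lbp(op); }  /* both left-associative */

static long parse_expr(int limit) {  /* plays the role of subexpr() */
  long v = parse_primary();
  int op;
  /* consume operators that bind tighter than the caller's limit */
  while ((op = peek()) != 0 && lbp(op) > limit) {
    long rhs;
    src++;                       /* skip the operator */
    rhs = parse_expr(rbp(op));   /* read right operand */
    v = (op == '+') ? v + rhs : v * rhs;
  }
  return v;
}

int main(void) {
  src = "1 + 2 * 3 + 4";
  printf("%ld\n", parse_expr(0));  /* prints 11 */
  return 0;
}

Because the recursion uses the operator's right binding power, a second '+' of equal priority terminates the inner call and is picked up by the outer loop, which yields left associativity; giving an operator a smaller right value than its left one flips it to right-associative, exactly as {10, 9} does for '^' in priority[].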
+ + +/* state needed to generate code for a given function */ +typedef struct FuncState { + Proto *f; /* current function header */ + Table *h; /* table to find (and reuse) elements in `k' */ + struct FuncState *prev; /* enclosing function */ + struct LexState *ls; /* lexical state */ + struct BlockCnt *bl; /* chain of current blocks */ + int pc; /* next position to code (equivalent to `ncode') */ + int lasttarget; /* 'label' of last 'jump label' */ + int jpc; /* list of pending jumps to `pc' */ + int nk; /* number of elements in `k' */ + int np; /* number of elements in `p' */ + int firstlocal; /* index of first local var (in Dyndata array) */ + short nlocvars; /* number of elements in 'f->locvars' */ + lu_byte nactvar; /* number of active local variables */ + lu_byte nups; /* number of upvalues */ + lu_byte freereg; /* first free register */ +} FuncState; + + +LUAI_FUNC Closure *luaY_parser (lua_State *L, ZIO *z, Mbuffer *buff, + Dyndata *dyd, const char *name, int firstchar); + + +#endif +/* END CSTYLED */ diff --git a/module/lua/lstate.c b/module/lua/lstate.c new file mode 100644 index 000000000000..4d196eced6a3 --- /dev/null +++ b/module/lua/lstate.c @@ -0,0 +1,320 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lstate.c,v 2.99.1.2 2013/11/08 17:45:31 roberto Exp $ +** Global State +** See Copyright Notice in lua.h +*/ + + +#define lstate_c +#define LUA_CORE + +#include <sys/lua/lua.h> + +#include "lapi.h" +#include "ldebug.h" +#include "ldo.h" +#include "lfunc.h" +#include "lgc.h" +#include "llex.h" +#include "lmem.h" +#include "lstate.h" +#include "lstring.h" +#include "ltable.h" +#include "ltm.h" + + +#if !defined(LUAI_GCPAUSE) +#define LUAI_GCPAUSE 200 /* 200% */ +#endif + +#if !defined(LUAI_GCMAJOR) +#define LUAI_GCMAJOR 200 /* 200% */ +#endif + +#if !defined(LUAI_GCMUL) +#define LUAI_GCMUL 200 /* GC runs 'twice the speed' of memory allocation */ +#endif + + +#define MEMERRMSG "not enough memory" + + +/* +** a macro to help the creation of a unique random seed when a state is +** created; the seed is used to randomize hashes. +*/ +#if !defined(luai_makeseed) +#define luai_makeseed() cast(unsigned int, gethrtime()) +#endif + + + +/* +** thread state + extra space +*/ +typedef struct LX { +#if defined(LUAI_EXTRASPACE) + char buff[LUAI_EXTRASPACE]; +#endif + lua_State l; +} LX; + + +/* +** Main thread combines a thread state and the global state +*/ +typedef struct LG { + LX l; + global_State g; +} LG; + + + +#define fromstate(L) (cast(LX *, cast(lu_byte *, (L)) - offsetof(LX, l))) + + +/* +** Compute an initial seed as random as possible. In ANSI, rely on +** Address Space Layout Randomization (if present) to increase +** randomness.
+*/ +#define addbuff(b,p,e) \ + { size_t t = cast(size_t, e); \ + memcpy(buff + p, &t, sizeof(t)); p += sizeof(t); } + +static unsigned int makeseed (lua_State *L) { + char buff[4 * sizeof(size_t)]; + unsigned int h = luai_makeseed(); + int p = 0; + addbuff(buff, p, L); /* heap variable */ + addbuff(buff, p, &h); /* local variable */ + addbuff(buff, p, luaO_nilobject); /* global variable */ + addbuff(buff, p, &lua_newstate); /* public function */ + lua_assert(p == sizeof(buff)); + return luaS_hash(buff, p, h); +} + + +/* +** set GCdebt to a new value keeping the value (totalbytes + GCdebt) +** invariant +*/ +void luaE_setdebt (global_State *g, l_mem debt) { + g->totalbytes -= (debt - g->GCdebt); + g->GCdebt = debt; +} + + +CallInfo *luaE_extendCI (lua_State *L) { + CallInfo *ci = luaM_new(L, CallInfo); + lua_assert(L->ci->next == NULL); + L->ci->next = ci; + ci->previous = L->ci; + ci->next = NULL; + return ci; +} + + +void luaE_freeCI (lua_State *L) { + CallInfo *ci = L->ci; + CallInfo *next = ci->next; + ci->next = NULL; + while ((ci = next) != NULL) { + next = ci->next; + luaM_free(L, ci); + } +} + + +static void stack_init (lua_State *L1, lua_State *L) { + int i; CallInfo *ci; + /* initialize stack array */ + L1->stack = luaM_newvector(L, BASIC_STACK_SIZE, TValue); + L1->stacksize = BASIC_STACK_SIZE; + for (i = 0; i < BASIC_STACK_SIZE; i++) + setnilvalue(L1->stack + i); /* erase new stack */ + L1->top = L1->stack; + L1->stack_last = L1->stack + L1->stacksize - EXTRA_STACK; + /* initialize first ci */ + ci = &L1->base_ci; + ci->next = ci->previous = NULL; + ci->callstatus = 0; + ci->func = L1->top; + setnilvalue(L1->top++); /* 'function' entry for this 'ci' */ + ci->top = L1->top + LUA_MINSTACK; + L1->ci = ci; +} + + +static void freestack (lua_State *L) { + if (L->stack == NULL) + return; /* stack not completely built yet */ + L->ci = &L->base_ci; /* free the entire 'ci' list */ + luaE_freeCI(L); + luaM_freearray(L, L->stack, L->stacksize); /* free stack array */ +} + + +/* +** Create registry table and its predefined values +*/ +static void init_registry (lua_State *L, global_State *g) { + TValue mt; + /* create registry */ + Table *registry = luaH_new(L); + sethvalue(L, &g->l_registry, registry); + luaH_resize(L, registry, LUA_RIDX_LAST, 0); + /* registry[LUA_RIDX_MAINTHREAD] = L */ + setthvalue(L, &mt, L); + luaH_setint(L, registry, LUA_RIDX_MAINTHREAD, &mt); + /* registry[LUA_RIDX_GLOBALS] = table of globals */ + sethvalue(L, &mt, luaH_new(L)); + luaH_setint(L, registry, LUA_RIDX_GLOBALS, &mt); +} + + +/* +** open parts of the state that may cause memory-allocation errors +*/ +static void f_luaopen (lua_State *L, void *ud) { + global_State *g = G(L); + UNUSED(ud); + stack_init(L, L); /* init stack */ + init_registry(L, g); + luaS_resize(L, MINSTRTABSIZE); /* initial size of string table */ + luaT_init(L); + luaX_init(L); + /* pre-create memory-error message */ + g->memerrmsg = luaS_newliteral(L, MEMERRMSG); + luaS_fix(g->memerrmsg); /* it should never be collected */ + g->gcrunning = 1; /* allow gc */ + g->version = lua_version(NULL); + luai_userstateopen(L); +} + + +/* +** preinitialize a state with consistent values without allocating +** any memory (to avoid errors) +*/ +static void preinit_state (lua_State *L, global_State *g) { + G(L) = g; + L->stack = NULL; + L->ci = NULL; + L->stacksize = 0; + L->errorJmp = NULL; + L->nCcalls = 0; + L->hook = NULL; + L->hookmask = 0; + L->basehookcount = 0; + L->allowhook = 1; + resethookcount(L); + L->openupval = NULL; + L->nny = 1; + 
L->status = LUA_OK; + L->errfunc = 0; + L->runerror = 0; +} + + +static void close_state (lua_State *L) { + global_State *g = G(L); + luaF_close(L, L->stack); /* close all upvalues for this thread */ + luaC_freeallobjects(L); /* collect all objects */ + if (g->version) /* closing a fully built state? */ + luai_userstateclose(L); + luaM_freearray(L, G(L)->strt.hash, G(L)->strt.size); + luaZ_freebuffer(L, &g->buff); + freestack(L); + lua_assert(gettotalbytes(g) == sizeof(LG)); + (*g->frealloc)(g->ud, fromstate(L), sizeof(LG), 0); /* free main block */ +} + + +LUA_API lua_State *lua_newthread (lua_State *L) { + lua_State *L1; + lua_lock(L); + luaC_checkGC(L); + L1 = &luaC_newobj(L, LUA_TTHREAD, sizeof(LX), NULL, offsetof(LX, l))->th; + setthvalue(L, L->top, L1); + api_incr_top(L); + preinit_state(L1, G(L)); + L1->hookmask = L->hookmask; + L1->basehookcount = L->basehookcount; + L1->hook = L->hook; + resethookcount(L1); + luai_userstatethread(L, L1); + stack_init(L1, L); /* init stack */ + lua_unlock(L); + return L1; +} + + +void luaE_freethread (lua_State *L, lua_State *L1) { + LX *l = fromstate(L1); + luaF_close(L1, L1->stack); /* close all upvalues for this thread */ + lua_assert(L1->openupval == NULL); + luai_userstatefree(L, L1); + freestack(L1); + luaM_free(L, l); +} + + +LUA_API lua_State *lua_newstate (lua_Alloc f, void *ud) { + int i; + lua_State *L; + global_State *g; + LG *l = cast(LG *, (*f)(ud, NULL, LUA_TTHREAD, sizeof(LG))); + if (l == NULL) return NULL; + L = &l->l.l; + g = &l->g; + L->next = NULL; + L->tt = LUA_TTHREAD; + g->currentwhite = bit2mask(WHITE0BIT, FIXEDBIT); + L->marked = luaC_white(g); + g->gckind = KGC_NORMAL; + preinit_state(L, g); + g->frealloc = f; + g->ud = ud; + g->mainthread = L; + g->seed = makeseed(L); + g->uvhead.u.l.prev = &g->uvhead; + g->uvhead.u.l.next = &g->uvhead; + g->gcrunning = 0; /* no GC while building state */ + g->GCestimate = 0; + g->strt.size = 0; + g->strt.nuse = 0; + g->strt.hash = NULL; + setnilvalue(&g->l_registry); + luaZ_initbuffer(L, &g->buff); + g->panic = NULL; + g->version = NULL; + g->gcstate = GCSpause; + g->allgc = NULL; + g->finobj = NULL; + g->tobefnz = NULL; + g->sweepgc = g->sweepfin = NULL; + g->gray = g->grayagain = NULL; + g->weak = g->ephemeron = g->allweak = NULL; + g->totalbytes = sizeof(LG); + g->GCdebt = 0; + g->gcpause = LUAI_GCPAUSE; + g->gcmajorinc = LUAI_GCMAJOR; + g->gcstepmul = LUAI_GCMUL; + for (i=0; i < LUA_NUMTAGS; i++) g->mt[i] = NULL; + if (luaD_rawrunprotected(L, f_luaopen, NULL) != LUA_OK) { + /* memory allocation error: free partial state */ + close_state(L); + L = NULL; + } + return L; +} + + +LUA_API void lua_close (lua_State *L) { + L = G(L)->mainthread; /* only the main thread can be closed */ + lua_lock(L); + close_state(L); +} +/* END CSTYLED */ diff --git a/module/lua/lstate.h b/module/lua/lstate.h new file mode 100644 index 000000000000..b636396a6015 --- /dev/null +++ b/module/lua/lstate.h @@ -0,0 +1,230 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lstate.h,v 2.82.1.1 2013/04/12 18:48:47 roberto Exp $ +** Global State +** See Copyright Notice in lua.h +*/ + +#ifndef lstate_h +#define lstate_h + +#include <sys/lua/lua.h> + +#include "lobject.h" +#include "ltm.h" +#include "lzio.h" + + +/* + +** Some notes about garbage-collected objects: All objects in Lua must +** be kept somehow accessible until being freed. +** +** Lua keeps most objects linked in list g->allgc. The link uses field +** 'next' of the CommonHeader. +** +** Strings are kept in several lists headed by the array g->strt.hash.
+** +** Open upvalues are not subject to independent garbage collection. They +** are collected together with their respective threads. Lua keeps a +** double-linked list with all open upvalues (g->uvhead) so that it can +** mark objects referred by them. (They are always gray, so they must +** be remarked in the atomic step. Usually their contents would be marked +** when traversing the respective threads, but the thread may already be +** dead, while the upvalue is still accessible through closures.) +** +** Objects with finalizers are kept in the list g->finobj. +** +** The list g->tobefnz links all objects being finalized. + +*/ + + +struct lua_longjmp; /* defined in ldo.c */ + + + +/* extra stack space to handle TM calls and some other extras */ +#define EXTRA_STACK 5 + + +#define BASIC_STACK_SIZE (2*LUA_MINSTACK) + + +/* kinds of Garbage Collection */ +#define KGC_NORMAL 0 +#define KGC_EMERGENCY 1 /* gc was forced by an allocation failure */ +#define KGC_GEN 2 /* generational collection */ + + +typedef struct stringtable { + GCObject **hash; + lu_int32 nuse; /* number of elements */ + int size; +} stringtable; + + +/* +** information about a call +*/ +typedef struct CallInfo { + StkId func; /* function index in the stack */ + StkId top; /* top for this function */ + struct CallInfo *previous, *next; /* dynamic call link */ + short nresults; /* expected number of results from this function */ + lu_byte callstatus; + ptrdiff_t extra; + union { + struct { /* only for Lua functions */ + StkId base; /* base for this function */ + const Instruction *savedpc; + } l; + struct { /* only for C functions */ + int ctx; /* context info. in case of yields */ + lua_CFunction k; /* continuation in case of yields */ + ptrdiff_t old_errfunc; + lu_byte old_allowhook; + lu_byte status; + } c; + } u; +} CallInfo; + + +/* +** Bits in CallInfo status +*/ +#define CIST_LUA (1<<0) /* call is running a Lua function */ +#define CIST_HOOKED (1<<1) /* call is running a debug hook */ +#define CIST_REENTRY (1<<2) /* call is running on same invocation of + luaV_execute of previous call */ +#define CIST_YIELDED (1<<3) /* call reentered after suspension */ +#define CIST_YPCALL (1<<4) /* call is a yieldable protected call */ +#define CIST_STAT (1<<5) /* call has an error status (pcall) */ +#define CIST_TAIL (1<<6) /* call was tail called */ +#define CIST_HOOKYIELD (1<<7) /* last hook called yielded */ + + +#define isLua(ci) ((ci)->callstatus & CIST_LUA) + + +/* +** `global state', shared by all threads of this state +*/ +typedef struct global_State { + lua_Alloc frealloc; /* function to reallocate memory */ + void *ud; /* auxiliary data to `frealloc' */ + lu_mem totalbytes; /* number of bytes currently allocated - GCdebt */ + l_mem GCdebt; /* bytes allocated not yet compensated by the collector */ + lu_mem GCmemtrav; /* memory traversed by the GC */ + lu_mem GCestimate; /* an estimate of the non-garbage memory in use */ + stringtable strt; /* hash table for strings */ + TValue l_registry; + unsigned int seed; /* randomized seed for hashes */ + lu_byte currentwhite; + lu_byte gcstate; /* state of garbage collector */ + lu_byte gckind; /* kind of GC running */ + lu_byte gcrunning; /* true if GC is running */ + int sweepstrgc; /* position of sweep in `strt' */ + GCObject *allgc; /* list of all collectable objects */ + GCObject *finobj; /* list of collectable objects with finalizers */ + GCObject **sweepgc; /* current position of sweep in list 'allgc' */ + GCObject **sweepfin; /* current position of sweep in list 'finobj' 
*/ + GCObject *gray; /* list of gray objects */ + GCObject *grayagain; /* list of objects to be traversed atomically */ + GCObject *weak; /* list of tables with weak values */ + GCObject *ephemeron; /* list of ephemeron tables (weak keys) */ + GCObject *allweak; /* list of all-weak tables */ + GCObject *tobefnz; /* list of userdata to be GC */ + UpVal uvhead; /* head of double-linked list of all open upvalues */ + Mbuffer buff; /* temporary buffer for string concatenation */ + int gcpause; /* size of pause between successive GCs */ + int gcmajorinc; /* pause between major collections (only in gen. mode) */ + int gcstepmul; /* GC `granularity' */ + lua_CFunction panic; /* to be called in unprotected errors */ + struct lua_State *mainthread; + const lua_Number *version; /* pointer to version number */ + TString *memerrmsg; /* memory-error message */ + TString *tmname[TM_N]; /* array with tag-method names */ + struct Table *mt[LUA_NUMTAGS]; /* metatables for basic types */ +} global_State; + + +/* +** `per thread' state +*/ +struct lua_State { + CommonHeader; + lu_byte status; + StkId top; /* first free slot in the stack */ + global_State *l_G; + CallInfo *ci; /* call info for current function */ + const Instruction *oldpc; /* last pc traced */ + StkId stack_last; /* last free slot in the stack */ + StkId stack; /* stack base */ + int stacksize; + unsigned short nny; /* number of non-yieldable calls in stack */ + unsigned short nCcalls; /* number of nested C calls */ + lu_byte hookmask; + lu_byte allowhook; + lu_byte runerror; /* handling a runtime error */ + int basehookcount; + int hookcount; + lua_Hook hook; + GCObject *openupval; /* list of open upvalues in this stack */ + GCObject *gclist; + struct lua_longjmp *errorJmp; /* current error recover point */ + ptrdiff_t errfunc; /* current error handling function (stack index) */ + CallInfo base_ci; /* CallInfo for first level (C calling Lua) */ +}; + + +#define G(L) (L->l_G) + + +/* +** Union of all collectable objects +*/ +union GCObject { + GCheader gch; /* common header */ + union TString ts; + union Udata u; + union Closure cl; + struct Table h; + struct Proto p; + struct UpVal uv; + struct lua_State th; /* thread */ +}; + + +#define gch(o) (&(o)->gch) + +/* macros to convert a GCObject into a specific value */ +#define rawgco2ts(o) \ + check_exp(novariant((o)->gch.tt) == LUA_TSTRING, &((o)->ts)) +#define gco2ts(o) (&rawgco2ts(o)->tsv) +#define rawgco2u(o) check_exp((o)->gch.tt == LUA_TUSERDATA, &((o)->u)) +#define gco2u(o) (&rawgco2u(o)->uv) +#define gco2lcl(o) check_exp((o)->gch.tt == LUA_TLCL, &((o)->cl.l)) +#define gco2ccl(o) check_exp((o)->gch.tt == LUA_TCCL, &((o)->cl.c)) +#define gco2cl(o) \ + check_exp(novariant((o)->gch.tt) == LUA_TFUNCTION, &((o)->cl)) +#define gco2t(o) check_exp((o)->gch.tt == LUA_TTABLE, &((o)->h)) +#define gco2p(o) check_exp((o)->gch.tt == LUA_TPROTO, &((o)->p)) +#define gco2uv(o) check_exp((o)->gch.tt == LUA_TUPVAL, &((o)->uv)) +#define gco2th(o) check_exp((o)->gch.tt == LUA_TTHREAD, &((o)->th)) + +/* macro to convert any Lua object into a GCObject */ +#define obj2gco(v) (cast(GCObject *, (v))) + + +/* actual number of total bytes allocated */ +#define gettotalbytes(g) ((g)->totalbytes + (g)->GCdebt) + +LUAI_FUNC void luaE_setdebt (global_State *g, l_mem debt); +LUAI_FUNC void luaE_freethread (lua_State *L, lua_State *L1); +LUAI_FUNC CallInfo *luaE_extendCI (lua_State *L); +LUAI_FUNC void luaE_freeCI (lua_State *L); + + +#endif +/* END CSTYLED */ diff --git a/module/lua/lstring.c b/module/lua/lstring.c new 
file mode 100644
index 000000000000..7fcef3d88aa3
--- /dev/null
+++ b/module/lua/lstring.c
@@ -0,0 +1,186 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstring.c,v 2.26.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keeps all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstring_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+
+
+/*
+** Lua will use at most ~(2^LUAI_HASHLIMIT) bytes from a string to
+** compute its hash
+*/
+#if !defined(LUAI_HASHLIMIT)
+#define LUAI_HASHLIMIT		5
+#endif
+
+
+/*
+** equality for long strings
+*/
+int luaS_eqlngstr (TString *a, TString *b) {
+  size_t len = a->tsv.len;
+  lua_assert(a->tsv.tt == LUA_TLNGSTR && b->tsv.tt == LUA_TLNGSTR);
+  return (a == b) ||  /* same instance or... */
+    ((len == b->tsv.len) &&  /* equal length and ... */
+     (memcmp(getstr(a), getstr(b), len) == 0));  /* equal contents */
+}
+
+
+/*
+** equality for strings
+*/
+int luaS_eqstr (TString *a, TString *b) {
+  return (a->tsv.tt == b->tsv.tt) &&
+         (a->tsv.tt == LUA_TSHRSTR ? eqshrstr(a, b) : luaS_eqlngstr(a, b));
+}
+
+
+unsigned int luaS_hash (const char *str, size_t l, unsigned int seed) {
+  unsigned int h = seed ^ cast(unsigned int, l);
+  size_t l1;
+  size_t step = (l >> LUAI_HASHLIMIT) + 1;
+  for (l1 = l; l1 >= step; l1 -= step)
+    h = h ^ ((h<<5) + (h>>2) + cast_byte(str[l1 - 1]));
+  return h;
+}
+
+
+/*
+** resizes the string table
+*/
+void luaS_resize (lua_State *L, int newsize) {
+  int i;
+  stringtable *tb = &G(L)->strt;
+  /* cannot resize while GC is traversing strings */
+  luaC_runtilstate(L, ~bitmask(GCSsweepstring));
+  if (newsize > tb->size) {
+    luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+    for (i = tb->size; i < newsize; i++) tb->hash[i] = NULL;
+  }
+  /* rehash */
+  for (i=0; i<tb->size; i++) {
+    GCObject *p = tb->hash[i];
+    tb->hash[i] = NULL;
+    while (p) {  /* for each node in the list */
+      GCObject *next = gch(p)->next;  /* save next */
+      unsigned int h = lmod(gco2ts(p)->hash, newsize);  /* new position */
+      gch(p)->next = tb->hash[h];  /* chain it */
+      tb->hash[h] = p;
+      resetoldbit(p);  /* see MOVE OLD rule */
+      p = next;
+    }
+  }
+  if (newsize < tb->size) {
+    /* shrinking slice must be empty */
+    lua_assert(tb->hash[newsize] == NULL && tb->hash[tb->size - 1] == NULL);
+    luaM_reallocvector(L, tb->hash, tb->size, newsize, GCObject *);
+  }
+  tb->size = newsize;
+}
+
+
+/*
+** creates a new string object
+*/
+static TString *createstrobj (lua_State *L, const char *str, size_t l,
+                              int tag, unsigned int h, GCObject **list) {
+  TString *ts;
+  char *sbuf;
+  size_t totalsize;  /* total size of TString object */
+  totalsize = sizeof(TString) + ((l + 1) * sizeof(char));
+  ts = &luaC_newobj(L, tag, totalsize, list, 0)->ts;
+  ts->tsv.len = l;
+  ts->tsv.hash = h;
+  ts->tsv.extra = 0;
+  sbuf = (char *)(TString *)(ts + 1);
+  memcpy(sbuf, str, l*sizeof(char));
+  sbuf[l] = '\0';  /* ending 0 */
+  return ts;
+}
+
+
+/*
+** creates a new short string, inserting it into string table
+*/
+static TString *newshrstr (lua_State *L, const char *str, size_t l,
+                           unsigned int h) {
+  GCObject **list;  /* (pointer to) list where it will be inserted */
+  stringtable *tb = &G(L)->strt;
+  TString *s;
+  if (tb->nuse >= cast(lu_int32, tb->size) && tb->size <= MAX_INT/2)
+    luaS_resize(L, tb->size*2);  /* too crowded */
+  list = &tb->hash[lmod(h, tb->size)];
+  s = createstrobj(L, str, l, LUA_TSHRSTR, h, list);
+  tb->nuse++;
+  return s;
+}
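+
+/*
+** Worked example (illustrative note, not part of the upstream file): with
+** the default LUAI_HASHLIMIT of 5, luaS_hash() above samples long inputs
+** instead of reading every byte.  For l == 100, step = (100 >> 5) + 1 = 4,
+** so the loop mixes in bytes 100, 96, ..., 4 -- about 25 of the 100 bytes;
+** only strings shorter than 32 bytes are hashed in full.  Sampling keeps
+** hashing cheap for huge keys at the cost of extra collisions, which the
+** chained string table resolves with the memcmp in internshrstr() below.
+*/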
+
+
+/*
+** checks whether short string exists and reuses it or creates a new one
+*/
+static TString *internshrstr (lua_State *L, const char *str, size_t l) {
+  GCObject *o;
+  global_State *g = G(L);
+  unsigned int h = luaS_hash(str, l, g->seed);
+  for (o = g->strt.hash[lmod(h, g->strt.size)];
+       o != NULL;
+       o = gch(o)->next) {
+    TString *ts = rawgco2ts(o);
+    if (h == ts->tsv.hash &&
+        l == ts->tsv.len &&
+        (memcmp(str, getstr(ts), l * sizeof(char)) == 0)) {
+      if (isdead(G(L), o))  /* string is dead (but was not collected yet)? */
+        changewhite(o);  /* resurrect it */
+      return ts;
+    }
+  }
+  return newshrstr(L, str, l, h);  /* not found; create a new string */
+}
+
+
+/*
+** new string (with explicit length)
+*/
+TString *luaS_newlstr (lua_State *L, const char *str, size_t l) {
+  if (l <= LUAI_MAXSHORTLEN)  /* short string? */
+    return internshrstr(L, str, l);
+  else {
+    if (l + 1 > (MAX_SIZET - sizeof(TString))/sizeof(char))
+      luaM_toobig(L);
+    return createstrobj(L, str, l, LUA_TLNGSTR, G(L)->seed, NULL);
+  }
+}
+
+
+/*
+** new zero-terminated string
+*/
+TString *luaS_new (lua_State *L, const char *str) {
+  return luaS_newlstr(L, str, strlen(str));
+}
+
+
+Udata *luaS_newudata (lua_State *L, size_t s, Table *e) {
+  Udata *u;
+  if (s > MAX_SIZET - sizeof(Udata))
+    luaM_toobig(L);
+  u = &luaC_newobj(L, LUA_TUSERDATA, sizeof(Udata) + s, NULL, 0)->u;
+  u->uv.len = s;
+  u->uv.metatable = NULL;
+  u->uv.env = e;
+  return u;
+}
+/* END CSTYLED */
diff --git a/module/lua/lstring.h b/module/lua/lstring.h
new file mode 100644
index 000000000000..66e65379b8e7
--- /dev/null
+++ b/module/lua/lstring.h
@@ -0,0 +1,48 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstring.h,v 1.49.1.1 2013/04/12 18:48:47 roberto Exp $
+** String table (keep all strings handled by Lua)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef lstring_h
+#define lstring_h
+
+#include "lgc.h"
+#include "lobject.h"
+#include "lstate.h"
+
+
+#define sizestring(s)	(sizeof(union TString)+((s)->len+1)*sizeof(char))
+
+#define sizeudata(u)	(sizeof(union Udata)+(u)->len)
+
+#define luaS_newliteral(L, s)	(luaS_newlstr(L, "" s, \
+                                 (sizeof(s)/sizeof(char))-1))
+
+#define luaS_fix(s)	l_setbit((s)->tsv.marked, FIXEDBIT)
+
+
+/*
+** test whether a string is a reserved word
+*/
+#define isreserved(s)	((s)->tsv.tt == LUA_TSHRSTR && (s)->tsv.extra > 0)
+
+
+/*
+** equality for short strings, which are always internalized
+*/
+#define eqshrstr(a,b)	check_exp((a)->tsv.tt == LUA_TSHRSTR, (a) == (b))
+
+
+LUAI_FUNC unsigned int luaS_hash (const char *str, size_t l, unsigned int seed);
+LUAI_FUNC int luaS_eqlngstr (TString *a, TString *b);
+LUAI_FUNC int luaS_eqstr (TString *a, TString *b);
+LUAI_FUNC void luaS_resize (lua_State *L, int newsize);
+LUAI_FUNC Udata *luaS_newudata (lua_State *L, size_t s, Table *e);
+LUAI_FUNC TString *luaS_newlstr (lua_State *L, const char *str, size_t l);
+LUAI_FUNC TString *luaS_new (lua_State *L, const char *str);
+
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/lstrlib.c b/module/lua/lstrlib.c
new file mode 100644
index 000000000000..12027757bf53
--- /dev/null
+++ b/module/lua/lstrlib.c
@@ -0,0 +1,1040 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lstrlib.c,v 1.178.1.1 2013/04/12 18:48:47 roberto Exp $
+** Standard library for string operations and pattern-matching
+** See Copyright Notice in lua.h
+*/
+
+
+#define lstrlib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+/*
+** maximum number of captures that a pattern can do during
+** pattern-matching. This limit is arbitrary.
+*/
+#if !defined(LUA_MAXCAPTURES)
+#define LUA_MAXCAPTURES		16
+#endif
+
+
+/* macro to `unsign' a character */
+#define uchar(c)	((unsigned char)(c))
+
+/*
+ * The provided version of sprintf returns a char *, but str_format expects
+ * it to return the number of characters printed. This version has the expected
+ * behavior.
+ */
+static size_t str_sprintf(char *buf, const char *fmt, ...) {
+  va_list args;
+  size_t len;
+
+  va_start(args, fmt);
+  len = vsnprintf(buf, INT_MAX, fmt, args);
+  va_end(args);
+
+  return len;
+}
+
+
+static int str_len (lua_State *L) {
+  size_t l;
+  luaL_checklstring(L, 1, &l);
+  lua_pushinteger(L, (lua_Integer)l);
+  return 1;
+}
+
+
+/* translate a relative string position: negative means back from end */
+static size_t posrelat (ptrdiff_t pos, size_t len) {
+  if (pos >= 0) return (size_t)pos;
+  else if (0u - (size_t)pos > len) return 0;
+  else return len - ((size_t)-pos) + 1;
+}
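+
+/*
+** Worked example (illustrative note, not part of the upstream file):
+** posrelat() is what makes negative indices count from the end of the
+** string.  For "hello" (len == 5), posrelat(-3, 5) = 5 - 3 + 1 = 3, so
+** string.sub("hello", -3) starts at byte 3 and yields "llo".  A position
+** such as -9 underflows the length and is mapped to 0; callers like
+** str_sub() below then clamp positions into the valid range [1, len].
+*/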
+
+
+static int str_sub (lua_State *L) {
+  size_t l;
+  const char *s = luaL_checklstring(L, 1, &l);
+  size_t start = posrelat(luaL_checkinteger(L, 2), l);
+  size_t end = posrelat(luaL_optinteger(L, 3, -1), l);
+  if (start < 1) start = 1;
+  if (end > l) end = l;
+  if (start <= end)
+    lua_pushlstring(L, s + start - 1, end - start + 1);
+  else lua_pushliteral(L, "");
+  return 1;
+}
+
+
+static int str_reverse (lua_State *L) {
+  size_t l, i;
+  luaL_Buffer b;
+  const char *s = luaL_checklstring(L, 1, &l);
+  char *p = luaL_buffinitsize(L, &b, l);
+  for (i = 0; i < l; i++)
+    p[i] = s[l - i - 1];
+  luaL_pushresultsize(&b, l);
+  return 1;
+}
+
+
+static int str_lower (lua_State *L) {
+  size_t l;
+  size_t i;
+  luaL_Buffer b;
+  const char *s = luaL_checklstring(L, 1, &l);
+  char *p = luaL_buffinitsize(L, &b, l);
+  for (i=0; i<l; i++)
+    p[i] = tolower(uchar(s[i]));
+  luaL_pushresultsize(&b, l);
+  return 1;
+}
+
+
+static int str_upper (lua_State *L) {
+  size_t l;
+  size_t i;
+  luaL_Buffer b;
+  const char *s = luaL_checklstring(L, 1, &l);
+  char *p = luaL_buffinitsize(L, &b, l);
+  for (i=0; i<l; i++)
+    p[i] = toupper(uchar(s[i]));
+  luaL_pushresultsize(&b, l);
+  return 1;
+}
+
+
+/* reasonable limit to avoid arithmetic overflow */
+#define MAXSIZE		((~(size_t)0) >> 1)
+
+static int str_rep (lua_State *L) {
+  size_t l, lsep;
+  const char *s = luaL_checklstring(L, 1, &l);
+  int n = luaL_checkint(L, 2);
+  const char *sep = luaL_optlstring(L, 3, "", &lsep);
+  if (n <= 0) lua_pushliteral(L, "");
+  else if (l + lsep < l || l + lsep >= MAXSIZE / n)  /* may overflow? */
+    return luaL_error(L, "resulting string too large");
+  else {
+    size_t totallen = n * l + (n - 1) * lsep;
+    luaL_Buffer b;
+    char *p = luaL_buffinitsize(L, &b, totallen);
+    while (n-- > 1) {  /* first n-1 copies (followed by separator) */
+      memcpy(p, s, l * sizeof(char)); p += l;
+      if (lsep > 0) {  /* avoid empty 'memcpy' (may be expensive) */
+        memcpy(p, sep, lsep * sizeof(char)); p += lsep;
+      }
+    }
+    memcpy(p, s, l * sizeof(char));  /* last copy (not followed by separator) */
+    luaL_pushresultsize(&b, totallen);
+  }
+  return 1;
+}
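+
+/*
+** Worked example (illustrative note, not part of the upstream file): the
+** overflow guard in str_rep() rejects arguments before computing
+** n*l + (n-1)*lsep.  On a 32-bit size_t, MAXSIZE is 0x7fffffff; for
+** string.rep("xy", 0x7fffffff) we get l + lsep == 2 and MAXSIZE / n == 1,
+** so "l + lsep >= MAXSIZE / n" trips and raises an error before the
+** multiplication n * l (about 2^32) could wrap around.
+*/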
+
+
+static int str_byte (lua_State *L) {
+  size_t l;
+  const char *s = luaL_checklstring(L, 1, &l);
+  size_t posi = posrelat(luaL_optinteger(L, 2, 1), l);
+  size_t pose = posrelat(luaL_optinteger(L, 3, posi), l);
+  int n, i;
+  if (posi < 1) posi = 1;
+  if (pose > l) pose = l;
+  if (posi > pose) return 0;  /* empty interval; return no values */
+  n = (int)(pose - posi + 1);
+  if (posi + n <= pose)  /* (size_t -> int) overflow? */
+    return luaL_error(L, "string slice too long");
+  luaL_checkstack(L, n, "string slice too long");
+  for (i=0; i<n; i++)
+    lua_pushinteger(L, uchar(s[posi+i-1]));
+  return n;
+}
+
+
+static int str_char (lua_State *L) {
+  int n = lua_gettop(L);  /* number of arguments */
+  int i;
+  luaL_Buffer b;
+  char *p = luaL_buffinitsize(L, &b, n);
+  for (i=1; i<=n; i++) {
+    int c = luaL_checkint(L, i);
+    luaL_argcheck(L, uchar(c) == c, i, "value out of range");
+    p[i-1] = uchar(c);
+  }
+  luaL_pushresultsize(&b, n);
+  return 1;
+}
+
+
+#if defined(LUA_USE_DUMP)
+static int writer (lua_State *L, const void* b, size_t size, void* B) {
+  (void)L;
+  luaL_addlstring((luaL_Buffer*) B, (const char *)b, size);
+  return 0;
+}
+
+
+static int str_dump (lua_State *L) {
+  luaL_Buffer b;
+  luaL_checktype(L, 1, LUA_TFUNCTION);
+  lua_settop(L, 1);
+  luaL_buffinit(L,&b);
+  if (lua_dump(L, writer, &b) != 0)
+    return luaL_error(L, "unable to dump given function");
+  luaL_pushresult(&b);
+  return 1;
+}
+#endif
+
+
+
+/*
+** {======================================================
+** PATTERN MATCHING
+** =======================================================
+*/
+
+
+#define CAP_UNFINISHED	(-1)
+#define CAP_POSITION	(-2)
+
+
+typedef struct MatchState {
+  int matchdepth;  /* control for recursive depth (to avoid C stack overflow) */
+  const char *src_init;  /* init of source string */
+  const char *src_end;  /* end ('\0') of source string */
+  const char *p_end;  /* end ('\0') of pattern */
+  lua_State *L;
+  int level;  /* total number of captures (finished or unfinished) */
+  struct {
+    const char *init;
+    ptrdiff_t len;
+  } capture[LUA_MAXCAPTURES];
+} MatchState;
+
+
+/* recursive function */
+static const char *match (MatchState *ms, const char *s, const char *p);
+
+
+/* maximum recursion depth for 'match' */
+#if !defined(MAXCCALLS)
+#define MAXCCALLS	200
+#endif
+
+
+#define L_ESC		'%'
+#define SPECIALS	"^$*+?.([%-"
+
+
+static int check_capture (MatchState *ms, int l) {
+  l -= '1';
+  if (l < 0 || l >= ms->level || ms->capture[l].len == CAP_UNFINISHED)
+    return luaL_error(ms->L, "invalid capture index %%%d", l + 1);
+  return l;
+}
+
+
+static int capture_to_close (MatchState *ms) {
+  int level = ms->level;
+  for (level--; level>=0; level--)
+    if (ms->capture[level].len == CAP_UNFINISHED) return level;
+  return luaL_error(ms->L, "invalid pattern capture");
+}
+
+
+static const char *classend (MatchState *ms, const char *p) {
+  switch (*p++) {
+    case L_ESC: {
+      if (p == ms->p_end)
+        luaL_error(ms->L, "malformed pattern (ends with " LUA_QL("%%") ")");
+      return p+1;
+    }
+    case '[': {
+      if (*p == '^') p++;
+      do {  /* look for a `]' */
+        if (p == ms->p_end)
+          luaL_error(ms->L, "malformed pattern (missing " LUA_QL("]") ")");
+        if (*(p++) == L_ESC && p < ms->p_end)
+          p++;  /* skip escapes (e.g. `%]') */
+      } while (*p != ']');
+      return p+1;
+    }
+    default: {
+      return p;
+    }
+  }
+}
+
+
+static int match_class (int c, int cl) {
+  int res;
+  switch (tolower(cl)) {
+    case 'a' : res = isalpha(c); break;
+    case 'c' : res = iscntrl(c); break;
+    case 'd' : res = isdigit(c); break;
+    case 'g' : res = isgraph(c); break;
+    case 'l' : res = islower(c); break;
+    case 'p' : res = ispunct(c); break;
+    case 's' : res = isspace(c); break;
+    case 'u' : res = isupper(c); break;
+    case 'w' : res = isalnum(c); break;
+    case 'x' : res = isxdigit(c); break;
+    case 'z' : res = (c == 0); break;  /* deprecated option */
+    default: return (cl == c);
+  }
+  return (islower(cl) ? res : !res);
+}
+
+
+static int matchbracketclass (int c, const char *p, const char *ec) {
+  int sig = 1;
+  if (*(p+1) == '^') {
+    sig = 0;
+    p++;  /* skip the `^' */
+  }
+  while (++p < ec) {
+    if (*p == L_ESC) {
+      p++;
+      if (match_class(c, uchar(*p)))
+        return sig;
+    }
+    else if ((*(p+1) == '-') && (p+2 < ec)) {
+      p+=2;
+      if (uchar(*(p-2)) <= c && c <= uchar(*p))
+        return sig;
+    }
+    else if (uchar(*p) == c) return sig;
+  }
+  return !sig;
+}
+
+
+static int singlematch (MatchState *ms, const char *s, const char *p,
+                        const char *ep) {
+  if (s >= ms->src_end)
+    return 0;
+  else {
+    int c = uchar(*s);
+    switch (*p) {
+      case '.': return 1;  /* matches any char */
+      case L_ESC: return match_class(c, uchar(*(p+1)));
+      case '[': return matchbracketclass(c, p, ep-1);
+      default:  return (uchar(*p) == c);
+    }
+  }
+}
+
+
+static const char *matchbalance (MatchState *ms, const char *s,
+                                 const char *p) {
+  if (p >= ms->p_end - 1)
+    luaL_error(ms->L, "malformed pattern "
+                      "(missing arguments to " LUA_QL("%%b") ")");
+  if (*s != *p) return NULL;
+  else {
+    int b = *p;
+    int e = *(p+1);
+    int cont = 1;
+    while (++s < ms->src_end) {
+      if (*s == e) {
+        if (--cont == 0) return s+1;
+      }
+      else if (*s == b) cont++;
+    }
+  }
+  return NULL;  /* string ends out of balance */
+}
+
+
+static const char *max_expand (MatchState *ms, const char *s,
+                               const char *p, const char *ep) {
+  ptrdiff_t i = 0;  /* counts maximum expand for item */
+  while (singlematch(ms, s + i, p, ep))
+    i++;
+  /* keeps trying to match with the maximum repetitions */
+  while (i>=0) {
+    const char *res = match(ms, (s+i), ep+1);
+    if (res) return res;
+    i--;  /* else didn't match; reduce 1 repetition to try again */
+  }
+  return NULL;
+}
+
+
+static const char *min_expand (MatchState *ms, const char *s,
+                               const char *p, const char *ep) {
+  for (;;) {
+    const char *res = match(ms, s, ep+1);
+    if (res != NULL)
+      return res;
+    else if (singlematch(ms, s, p, ep))
+      s++;  /* try with one more repetition
*/ + else return NULL; + } +} + + +static const char *start_capture (MatchState *ms, const char *s, + const char *p, int what) { + const char *res; + int level = ms->level; + if (level >= LUA_MAXCAPTURES) luaL_error(ms->L, "too many captures"); + ms->capture[level].init = s; + ms->capture[level].len = what; + ms->level = level+1; + if ((res=match(ms, s, p)) == NULL) /* match failed? */ + ms->level--; /* undo capture */ + return res; +} + + +static const char *end_capture (MatchState *ms, const char *s, + const char *p) { + int l = capture_to_close(ms); + const char *res; + ms->capture[l].len = s - ms->capture[l].init; /* close capture */ + if ((res = match(ms, s, p)) == NULL) /* match failed? */ + ms->capture[l].len = CAP_UNFINISHED; /* undo capture */ + return res; +} + + +static const char *match_capture (MatchState *ms, const char *s, int l) { + size_t len; + l = check_capture(ms, l); + len = ms->capture[l].len; + if ((size_t)(ms->src_end-s) >= len && + memcmp(ms->capture[l].init, s, len) == 0) + return s+len; + else return NULL; +} + + +static const char *match (MatchState *ms, const char *s, const char *p) { + if (ms->matchdepth-- == 0) + luaL_error(ms->L, "pattern too complex"); + init: /* using goto's to optimize tail recursion */ + if (p != ms->p_end) { /* end of pattern? */ + switch (*p) { + case '(': { /* start capture */ + if (*(p + 1) == ')') /* position capture? */ + s = start_capture(ms, s, p + 2, CAP_POSITION); + else + s = start_capture(ms, s, p + 1, CAP_UNFINISHED); + break; + } + case ')': { /* end capture */ + s = end_capture(ms, s, p + 1); + break; + } + case '$': { + if ((p + 1) != ms->p_end) /* is the `$' the last char in pattern? */ + goto dflt; /* no; go to default */ + s = (s == ms->src_end) ? s : NULL; /* check end of string */ + break; + } + case L_ESC: { /* escaped sequences not in the format class[*+?-]? */ + switch (*(p + 1)) { + case 'b': { /* balanced string? */ + s = matchbalance(ms, s, p + 2); + if (s != NULL) { + p += 4; goto init; /* return match(ms, s, p + 4); */ + } /* else fail (s == NULL) */ + break; + } + case 'f': { /* frontier? */ + const char *ep; char previous; + p += 2; + if (*p != '[') + luaL_error(ms->L, "missing " LUA_QL("[") " after " + LUA_QL("%%f") " in pattern"); + ep = classend(ms, p); /* points to what is next */ + previous = (s == ms->src_init) ? '\0' : *(s - 1); + if (!matchbracketclass(uchar(previous), p, ep - 1) && + matchbracketclass(uchar(*s), p, ep - 1)) { + p = ep; goto init; /* return match(ms, s, ep); */ + } + s = NULL; /* match failed */ + break; + } + case '0': case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + case '8': case '9': { /* capture results (%0-%9)? */ + s = match_capture(ms, s, uchar(*(p + 1))); + if (s != NULL) { + p += 2; goto init; /* return match(ms, s, p + 2) */ + } + break; + } + default: goto dflt; + } + break; + } + default: dflt: { /* pattern class plus optional suffix */ + const char *ep = classend(ms, p); /* points to optional suffix */ + /* does not match at least once? */ + if (!singlematch(ms, s, p, ep)) { + if (*ep == '*' || *ep == '?' || *ep == '-') { /* accept empty? 
*/ + p = ep + 1; goto init; /* return match(ms, s, ep + 1); */ + } + else /* '+' or no suffix */ + s = NULL; /* fail */ + } + else { /* matched once */ + switch (*ep) { /* handle optional suffix */ + case '?': { /* optional */ + const char *res; + if ((res = match(ms, s + 1, ep + 1)) != NULL) + s = res; + else { + p = ep + 1; goto init; /* else return match(ms, s, ep + 1); */ + } + break; + } + case '+': /* 1 or more repetitions */ + s++; /* 1 match already done */ + /* FALLTHROUGH */ + case '*': /* 0 or more repetitions */ + s = max_expand(ms, s, p, ep); + break; + case '-': /* 0 or more repetitions (minimum) */ + s = min_expand(ms, s, p, ep); + break; + default: /* no suffix */ + s++; p = ep; goto init; /* return match(ms, s + 1, ep); */ + } + } + break; + } + } + } + ms->matchdepth++; + return s; +} + + + +static const char *lmemfind (const char *s1, size_t l1, + const char *s2, size_t l2) { + if (l2 == 0) return s1; /* empty strings are everywhere */ + else if (l2 > l1) return NULL; /* avoids a negative `l1' */ + else { + const char *init; /* to search for a `*s2' inside `s1' */ + l2--; /* 1st char will be checked by `memchr' */ + l1 = l1-l2; /* `s2' cannot be found after that */ + while (l1 > 0 && (init = (const char *)memchr(s1, *s2, l1)) != NULL) { + init++; /* 1st char is already checked */ + if (memcmp(init, s2+1, l2) == 0) + return init-1; + else { /* correct `l1' and `s1' to try again */ + l1 -= init-s1; + s1 = init; + } + } + return NULL; /* not found */ + } +} + + +static void push_onecapture (MatchState *ms, int i, const char *s, + const char *e) { + if (i >= ms->level) { + if (i == 0) /* ms->level == 0, too */ + lua_pushlstring(ms->L, s, e - s); /* add whole match */ + else + luaL_error(ms->L, "invalid capture index"); + } + else { + ptrdiff_t l = ms->capture[i].len; + if (l == CAP_UNFINISHED) luaL_error(ms->L, "unfinished capture"); + if (l == CAP_POSITION) + lua_pushinteger(ms->L, ms->capture[i].init - ms->src_init + 1); + else + lua_pushlstring(ms->L, ms->capture[i].init, l); + } +} + + +static int push_captures (MatchState *ms, const char *s, const char *e) { + int i; + int nlevels = (ms->level == 0 && s) ? 1 : ms->level; + luaL_checkstack(ms->L, nlevels, "too many captures"); + for (i = 0; i < nlevels; i++) + push_onecapture(ms, i, s, e); + return nlevels; /* number of strings pushed */ +} + + +/* check whether pattern has no special characters */ +static int nospecials (const char *p, size_t l) { + size_t upto = 0; + do { + if (strpbrk(p + upto, SPECIALS)) + return 0; /* pattern has a special character */ + upto += strlen(p + upto) + 1; /* may have more after \0 */ + } while (upto <= l); + return 1; /* no special chars found */ +} + + +static int str_find_aux (lua_State *L, int find) { + size_t ls, lp; + const char *s = luaL_checklstring(L, 1, &ls); + const char *p = luaL_checklstring(L, 2, &lp); + size_t init = posrelat(luaL_optinteger(L, 3, 1), ls); + if (init < 1) init = 1; + else if (init > ls + 1) { /* start after string's end? */ + lua_pushnil(L); /* cannot find anything */ + return 1; + } + /* explicit request or no special characters? 
*/ + if (find && (lua_toboolean(L, 4) || nospecials(p, lp))) { + /* do a plain search */ + const char *s2 = lmemfind(s + init - 1, ls - init + 1, p, lp); + if (s2) { + lua_pushinteger(L, s2 - s + 1); + lua_pushinteger(L, s2 - s + lp); + return 2; + } + } + else { + MatchState ms; + const char *s1 = s + init - 1; + int anchor = (*p == '^'); + if (anchor) { + p++; lp--; /* skip anchor character */ + } + ms.L = L; + ms.matchdepth = MAXCCALLS; + ms.src_init = s; + ms.src_end = s + ls; + ms.p_end = p + lp; + do { + const char *res; + ms.level = 0; + lua_assert(ms.matchdepth == MAXCCALLS); + if ((res=match(&ms, s1, p)) != NULL) { + if (find) { + lua_pushinteger(L, s1 - s + 1); /* start */ + lua_pushinteger(L, res - s); /* end */ + return push_captures(&ms, NULL, 0) + 2; + } + else + return push_captures(&ms, s1, res); + } + } while (s1++ < ms.src_end && !anchor); + } + lua_pushnil(L); /* not found */ + return 1; +} + + +static int str_find (lua_State *L) { + return str_find_aux(L, 1); +} + + +static int str_match (lua_State *L) { + return str_find_aux(L, 0); +} + + +static int gmatch_aux (lua_State *L) { + MatchState ms; + size_t ls, lp; + const char *s = lua_tolstring(L, lua_upvalueindex(1), &ls); + const char *p = lua_tolstring(L, lua_upvalueindex(2), &lp); + const char *src; + ms.L = L; + ms.matchdepth = MAXCCALLS; + ms.src_init = s; + ms.src_end = s+ls; + ms.p_end = p + lp; + for (src = s + (size_t)lua_tointeger(L, lua_upvalueindex(3)); + src <= ms.src_end; + src++) { + const char *e; + ms.level = 0; + lua_assert(ms.matchdepth == MAXCCALLS); + if ((e = match(&ms, src, p)) != NULL) { + lua_Integer newstart = e-s; + if (e == src) newstart++; /* empty match? go at least one position */ + lua_pushinteger(L, newstart); + lua_replace(L, lua_upvalueindex(3)); + return push_captures(&ms, src, e); + } + } + return 0; /* not found */ +} + + +static int str_gmatch (lua_State *L) { + luaL_checkstring(L, 1); + luaL_checkstring(L, 2); + lua_settop(L, 2); + lua_pushinteger(L, 0); + lua_pushcclosure(L, gmatch_aux, 3); + return 1; +} + + +static void add_s (MatchState *ms, luaL_Buffer *b, const char *s, + const char *e) { + size_t l, i; + const char *news = lua_tolstring(ms->L, 3, &l); + for (i = 0; i < l; i++) { + if (news[i] != L_ESC) + luaL_addchar(b, news[i]); + else { + i++; /* skip ESC */ + if (!isdigit(uchar(news[i]))) { + if (news[i] != L_ESC) + luaL_error(ms->L, "invalid use of " LUA_QL("%c") + " in replacement string", L_ESC); + luaL_addchar(b, news[i]); + } + else if (news[i] == '0') + luaL_addlstring(b, s, e - s); + else { + push_onecapture(ms, news[i] - '1', s, e); + luaL_addvalue(b); /* add capture to accumulated result */ + } + } + } +} + + +static void add_value (MatchState *ms, luaL_Buffer *b, const char *s, + const char *e, int tr) { + lua_State *L = ms->L; + switch (tr) { + case LUA_TFUNCTION: { + int n; + lua_pushvalue(L, 3); + n = push_captures(ms, s, e); + lua_call(L, n, 1); + break; + } + case LUA_TTABLE: { + push_onecapture(ms, 0, s, e); + lua_gettable(L, 3); + break; + } + default: { /* LUA_TNUMBER or LUA_TSTRING */ + add_s(ms, b, s, e); + return; + } + } + if (!lua_toboolean(L, -1)) { /* nil or false? 
*/ + lua_pop(L, 1); + lua_pushlstring(L, s, e - s); /* keep original text */ + } + else if (!lua_isstring(L, -1)) + luaL_error(L, "invalid replacement value (a %s)", luaL_typename(L, -1)); + luaL_addvalue(b); /* add result to accumulator */ +} + + +static int str_gsub (lua_State *L) { + size_t srcl, lp; + const char *src = luaL_checklstring(L, 1, &srcl); + const char *p = luaL_checklstring(L, 2, &lp); + int tr = lua_type(L, 3); + size_t max_s = luaL_optinteger(L, 4, srcl+1); + int anchor = (*p == '^'); + size_t n = 0; + MatchState ms; + luaL_Buffer b; + luaL_argcheck(L, tr == LUA_TNUMBER || tr == LUA_TSTRING || + tr == LUA_TFUNCTION || tr == LUA_TTABLE, 3, + "string/function/table expected"); + luaL_buffinit(L, &b); + if (anchor) { + p++; lp--; /* skip anchor character */ + } + ms.L = L; + ms.matchdepth = MAXCCALLS; + ms.src_init = src; + ms.src_end = src+srcl; + ms.p_end = p + lp; + while (n < max_s) { + const char *e; + ms.level = 0; + lua_assert(ms.matchdepth == MAXCCALLS); + e = match(&ms, src, p); + if (e) { + n++; + add_value(&ms, &b, src, e, tr); + } + if (e && e>src) /* non empty match? */ + src = e; /* skip it */ + else if (src < ms.src_end) + luaL_addchar(&b, *src++); + else break; + if (anchor) break; + } + luaL_addlstring(&b, src, ms.src_end-src); + luaL_pushresult(&b); + lua_pushinteger(L, n); /* number of substitutions */ + return 2; +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** STRING FORMAT +** ======================================================= +*/ + +/* +** LUA_INTFRMLEN is the length modifier for integer conversions in +** 'string.format'; LUA_INTFRM_T is the integer type corresponding to +** the previous length +*/ +#if !defined(LUA_INTFRMLEN) /* { */ +#if defined(LUA_USE_LONGLONG) + +#define LUA_INTFRMLEN "ll" +#define LUA_INTFRM_T long long + +#else + +#define LUA_INTFRMLEN "l" +#define LUA_INTFRM_T long + +#endif +#endif /* } */ + + +/* +** LUA_FLTFRMLEN is the length modifier for float conversions in +** 'string.format'; LUA_FLTFRM_T is the float type corresponding to +** the previous length +*/ +#if !defined(LUA_FLTFRMLEN) + +#define LUA_FLTFRMLEN "" +#define LUA_FLTFRM_T double + +#endif + + +/* maximum size of each formatted item (> len(format('%99.99f', -1e308))) */ +#define MAX_ITEM 512 +/* valid flags in a format specification */ +#define FLAGS "-+ #0" +/* +** maximum size of each format specification (such as '%-099.99d') +** (+10 accounts for %99.99x plus margin of error) +*/ +#define MAX_FORMAT (sizeof(FLAGS) + sizeof(LUA_INTFRMLEN) + 10) + + +static void addquoted (lua_State *L, luaL_Buffer *b, int arg) { + size_t l; + const char *s = luaL_checklstring(L, arg, &l); + luaL_addchar(b, '"'); + while (l--) { + if (*s == '"' || *s == '\\' || *s == '\n') { + luaL_addchar(b, '\\'); + luaL_addchar(b, *s); + } + else if (*s == '\0' || iscntrl(uchar(*s))) { + char buff[10]; + if (!isdigit(uchar(*(s+1)))) + snprintf(buff, sizeof(buff), "\\%d", (int)uchar(*s)); + else + snprintf(buff, sizeof(buff), "\\%03d", (int)uchar(*s)); + luaL_addstring(b, buff); + } + else + luaL_addchar(b, *s); + s++; + } + luaL_addchar(b, '"'); +} + +static const char *scanformat (lua_State *L, const char *strfrmt, char *form) { + const char *p = strfrmt; + while (*p != '\0' && strchr(FLAGS, *p) != NULL) p++; /* skip flags */ + if ((size_t)(p - strfrmt) >= sizeof(FLAGS)/sizeof(char)) + luaL_error(L, "invalid format (repeated flags)"); + if (isdigit(uchar(*p))) p++; /* skip width */ + if 
(isdigit(uchar(*p))) p++; /* (2 digits at most) */ + if (*p == '.') { + p++; + if (isdigit(uchar(*p))) p++; /* skip precision */ + if (isdigit(uchar(*p))) p++; /* (2 digits at most) */ + } + if (isdigit(uchar(*p))) + luaL_error(L, "invalid format (width or precision too long)"); + *(form++) = '%'; + memcpy(form, strfrmt, (p - strfrmt + 1) * sizeof(char)); + form += p - strfrmt + 1; + *form = '\0'; + return p; +} + + +/* +** add length modifier into formats +*/ +static void addlenmod (char *form, const char *lenmod, size_t size) { + size_t l = strlen(form); + size_t lm = strlen(lenmod); + char spec = form[l - 1]; + strlcpy(form + l - 1, lenmod, size - (l - 1)); + form[l + lm - 1] = spec; + form[l + lm] = '\0'; +} + + +static int str_format (lua_State *L) { + int top = lua_gettop(L); + int arg = 1; + size_t sfl; + const char *strfrmt = luaL_checklstring(L, arg, &sfl); + const char *strfrmt_end = strfrmt+sfl; + luaL_Buffer b; + luaL_buffinit(L, &b); + while (strfrmt < strfrmt_end) { + if (*strfrmt != L_ESC) + luaL_addchar(&b, *strfrmt++); + else if (*++strfrmt == L_ESC) + luaL_addchar(&b, *strfrmt++); /* %% */ + else { /* format item */ + char form[MAX_FORMAT]; /* to store the format (`%...') */ + char *buff = luaL_prepbuffsize(&b, MAX_ITEM); /* to put formatted item */ + int nb = 0; /* number of bytes in added item */ + if (++arg > top) + luaL_argerror(L, arg, "no value"); + strfrmt = scanformat(L, strfrmt, form); + switch (*strfrmt++) { + case 'c': { + nb = str_sprintf(buff, form, luaL_checkint(L, arg)); + break; + } + case 'd': case 'i': { + lua_Number n = luaL_checknumber(L, arg); + LUA_INTFRM_T ni = (LUA_INTFRM_T)n; + lua_Number diff = n - (lua_Number)ni; + luaL_argcheck(L, -1 < diff && diff < 1, arg, + "not a number in proper range"); + addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT); + nb = str_sprintf(buff, form, ni); + break; + } + case 'o': case 'u': case 'x': case 'X': { + lua_Number n = luaL_checknumber(L, arg); + unsigned LUA_INTFRM_T ni = (unsigned LUA_INTFRM_T)n; + lua_Number diff = n - (lua_Number)ni; + luaL_argcheck(L, -1 < diff && diff < 1, arg, + "not a non-negative number in proper range"); + addlenmod(form, LUA_INTFRMLEN, MAX_FORMAT); + nb = str_sprintf(buff, form, ni); + break; + } +#if defined(LUA_USE_FLOAT_FORMATS) + case 'e': case 'E': case 'f': +#if defined(LUA_USE_AFORMAT) + case 'a': case 'A': +#endif + case 'g': case 'G': { + addlenmod(form, LUA_FLTFRMLEN, MAX_FORMAT); + nb = str_sprintf(buff, form, (LUA_FLTFRM_T)luaL_checknumber(L, arg)); + break; + } +#endif + case 'q': { + addquoted(L, &b, arg); + break; + } + case 's': { + size_t l; + const char *s = luaL_tolstring(L, arg, &l); + if (!strchr(form, '.') && l >= 100) { + /* no precision and string is too long to be formatted; + keep original string */ + luaL_addvalue(&b); + break; + } + else { + nb = str_sprintf(buff, form, s); + lua_pop(L, 1); /* remove result from 'luaL_tolstring' */ + break; + } + } + default: { /* also treat cases `pnLlh' */ + return luaL_error(L, "invalid option " LUA_QL("%%%c") " to " + LUA_QL("format"), *(strfrmt - 1)); + } + } + luaL_addsize(&b, nb); + } + } + luaL_pushresult(&b); + return 1; +} + +/* }====================================================== */ + + +static const luaL_Reg strlib[] = { + {"byte", str_byte}, + {"char", str_char}, +#if defined(LUA_USE_DUMP) + {"dump", str_dump}, +#endif + {"find", str_find}, + {"format", str_format}, + {"gmatch", str_gmatch}, + {"gsub", str_gsub}, + {"len", str_len}, + {"lower", str_lower}, + {"match", str_match}, + {"rep", str_rep}, + {"reverse", 
str_reverse},
+  {"sub", str_sub},
+  {"upper", str_upper},
+  {NULL, NULL}
+};
+
+
+static void createmetatable (lua_State *L) {
+  lua_createtable(L, 0, 1);  /* table to be metatable for strings */
+  lua_pushliteral(L, "");  /* dummy string */
+  lua_pushvalue(L, -2);  /* copy table */
+  lua_setmetatable(L, -2);  /* set table as metatable for strings */
+  lua_pop(L, 1);  /* pop dummy string */
+  lua_pushvalue(L, -2);  /* get string library */
+  lua_setfield(L, -2, "__index");  /* metatable.__index = string */
+  lua_pop(L, 1);  /* pop metatable */
+}
+
+
+/*
+** Open string library
+*/
+LUAMOD_API int luaopen_string (lua_State *L) {
+  luaL_newlib(L, strlib);
+  createmetatable(L);
+  return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_string);
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/ltable.c b/module/lua/ltable.c
new file mode 100644
index 000000000000..f60418721bef
--- /dev/null
+++ b/module/lua/ltable.c
@@ -0,0 +1,592 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltable.c,v 2.72.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+
+/*
+** Implementation of tables (aka arrays, objects, or hash tables).
+** Tables keep its elements in two parts: an array part and a hash part.
+** Non-negative integer keys are all candidates to be kept in the array
+** part. The actual size of the array is the largest `n' such that at
+** least half the slots between 0 and n are in use.
+** Hash uses a mix of chained scatter table with Brent's variation.
+** A main invariant of these tables is that, if an element is not
+** in its main position (i.e. the `original' position that its hash gives
+** to it), then the colliding element is in its own main position.
+** Hence even when the load factor reaches 100%, performance remains good.
+*/
+
+
+#define ltable_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lgc.h"
+#include "lmem.h"
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "lvm.h"
+
+
+/*
+** max size of array part is 2^MAXBITS
+*/
+#if LUAI_BITSINT >= 32
+#define MAXBITS		30
+#else
+#define MAXBITS		(LUAI_BITSINT-2)
+#endif
+
+#define MAXASIZE	(1 << MAXBITS)
+
+
+#define hashpow2(t,n)		(gnode(t, lmod((n), sizenode(t))))
+
+#define hashstr(t,str)		hashpow2(t, (str)->tsv.hash)
+#define hashboolean(t,p)	hashpow2(t, p)
+
+
+/*
+** for some types, it is better to avoid modulus by power of 2, as
+** they tend to have many 2 factors.
+*/
+#define hashmod(t,n)	(gnode(t, ((n) % ((sizenode(t)-1)|1))))
+
+
+#define hashpointer(t,p)	hashmod(t, IntPoint(p))
+
+
+#define dummynode		(&dummynode_)
+
+#define isdummy(n)		((n) == dummynode)
+
+static const Node dummynode_ = {
+  {NILCONSTANT},  /* value */
+  {{NILCONSTANT, NULL}}  /* key */
+};
+
+
+/*
+** hash for lua_Numbers
+*/
+static Node *hashnum (const Table *t, lua_Number n) {
+  int i;
+  luai_hashnum(i, n);
+  if (i < 0) {
+    if (cast(unsigned int, i) == 0u - i)  /* use unsigned to avoid overflows */
+      i = 0;  /* handle INT_MIN */
+    i = -i;  /* must be a positive value */
+  }
+  return hashmod(t, i);
+}
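+
+/*
+** Worked example (illustrative note, not part of the upstream file):
+** hashmod() reduces modulo ((sizenode(t)-1)|1), an odd number, instead of
+** masking with the power-of-two node count.  With 16 hash slots, pointer
+** keys aligned to 8 bytes would fall into only two buckets under "% 16",
+** while "% 15" spreads consecutive 8-byte-aligned addresses over all
+** buckets (0, 8, 1, 9, 2, ...).
+*/
+
+
+/*
+** returns the `main' position of an element in a table (that is, the index
+** of its hash value)
+*/
+static Node *mainposition (const Table *t, const TValue *key) {
+  switch (ttype(key)) {
+    case LUA_TNUMBER:
+      return hashnum(t, nvalue(key));
+    case LUA_TLNGSTR: {
+      TString *s = rawtsvalue(key);
+      if (s->tsv.extra == 0) {  /* no hash?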
*/ + s->tsv.hash = luaS_hash(getstr(s), s->tsv.len, s->tsv.hash); + s->tsv.extra = 1; /* now it has its hash */ + } + return hashstr(t, rawtsvalue(key)); + } + case LUA_TSHRSTR: + return hashstr(t, rawtsvalue(key)); + case LUA_TBOOLEAN: + return hashboolean(t, bvalue(key)); + case LUA_TLIGHTUSERDATA: + return hashpointer(t, pvalue(key)); + case LUA_TLCF: + return hashpointer(t, fvalue(key)); + default: + return hashpointer(t, gcvalue(key)); + } +} + + +/* +** returns the index for `key' if `key' is an appropriate key to live in +** the array part of the table, -1 otherwise. +*/ +static int arrayindex (const TValue *key) { + if (ttisnumber(key)) { + lua_Number n = nvalue(key); + int k; + lua_number2int(k, n); + if (luai_numeq(cast_num(k), n)) + return k; + } + return -1; /* `key' did not match some condition */ +} + + +/* +** returns the index of a `key' for table traversals. First goes all +** elements in the array part, then elements in the hash part. The +** beginning of a traversal is signaled by -1. +*/ +static int findindex (lua_State *L, Table *t, StkId key) { + int i; + if (ttisnil(key)) return -1; /* first iteration */ + i = arrayindex(key); + if (0 < i && i <= t->sizearray) /* is `key' inside array part? */ + return i-1; /* yes; that's the index (corrected to C) */ + else { + Node *n = mainposition(t, key); + for (;;) { /* check whether `key' is somewhere in the chain */ + /* key may be dead already, but it is ok to use it in `next' */ + if (luaV_rawequalobj(gkey(n), key) || + (ttisdeadkey(gkey(n)) && iscollectable(key) && + deadvalue(gkey(n)) == gcvalue(key))) { + i = cast_int(n - gnode(t, 0)); /* key index in hash table */ + /* hash elements are numbered after array ones */ + return i + t->sizearray; + } + else n = gnext(n); + if (n == NULL) + luaG_runerror(L, "invalid key to " LUA_QL("next")); /* key not found */ + } + } +} + + +int luaH_next (lua_State *L, Table *t, StkId key) { + int i = findindex(L, t, key); /* find original element */ + for (i++; i < t->sizearray; i++) { /* try first array part */ + if (!ttisnil(&t->array[i])) { /* a non-nil value? */ + setnvalue(key, cast_num(i+1)); + setobj2s(L, key+1, &t->array[i]); + return 1; + } + } + for (i -= t->sizearray; i < sizenode(t); i++) { /* then hash part */ + if (!ttisnil(gval(gnode(t, i)))) { /* a non-nil value? */ + setobj2s(L, key, gkey(gnode(t, i))); + setobj2s(L, key+1, gval(gnode(t, i))); + return 1; + } + } + return 0; /* no more elements */ +} + + +/* +** {============================================================= +** Rehash +** ============================================================== +*/ + + +static int computesizes (int nums[], int *narray) { + int i; + int twotoi; /* 2^i */ + int a = 0; /* number of elements smaller than 2^i */ + int na = 0; /* number of elements to go to array part */ + int n = 0; /* optimal size for array part */ + for (i = 0, twotoi = 1; twotoi/2 < *narray; i++, twotoi *= 2) { + if (nums[i] > 0) { + a += nums[i]; + if (a > twotoi/2) { /* more than half elements present? */ + n = twotoi; /* optimal size (till now) */ + na = a; /* all elements smaller than n will go to array part */ + } + } + if (a == *narray) break; /* all elements already counted */ + } + *narray = n; + lua_assert(*narray/2 <= na && na <= *narray); + return na; +} + + +static int countint (const TValue *key, int *nums) { + int k = arrayindex(key); + if (0 < k && k <= MAXASIZE) { /* is `key' an appropriate array index? 
*/
+    nums[luaO_ceillog2(k)]++;  /* count as such */
+    return 1;
+  }
+  else
+    return 0;
+}
+
+
+static int numusearray (const Table *t, int *nums) {
+  int lg;
+  int ttlg;  /* 2^lg */
+  int ause = 0;  /* summation of `nums' */
+  int i = 1;  /* count to traverse all array keys */
+  for (lg=0, ttlg=1; lg<=MAXBITS; lg++, ttlg*=2) {  /* for each slice */
+    int lc = 0;  /* counter */
+    int lim = ttlg;
+    if (lim > t->sizearray) {
+      lim = t->sizearray;  /* adjust upper limit */
+      if (i > lim)
+        break;  /* no more elements to count */
+    }
+    /* count elements in range (2^(lg-1), 2^lg] */
+    for (; i <= lim; i++) {
+      if (!ttisnil(&t->array[i-1]))
+        lc++;
+    }
+    nums[lg] += lc;
+    ause += lc;
+  }
+  return ause;
+}
+
+
+static int numusehash (const Table *t, int *nums, int *pnasize) {
+  int totaluse = 0;  /* total number of elements */
+  int ause = 0;  /* summation of `nums' */
+  int i = sizenode(t);
+  while (i--) {
+    Node *n = &t->node[i];
+    if (!ttisnil(gval(n))) {
+      ause += countint(gkey(n), nums);
+      totaluse++;
+    }
+  }
+  *pnasize += ause;
+  return totaluse;
+}
+
+
+static void setarrayvector (lua_State *L, Table *t, int size) {
+  int i;
+  luaM_reallocvector(L, t->array, t->sizearray, size, TValue);
+  for (i=t->sizearray; i<size; i++)
+    setnilvalue(&t->array[i]);
+  t->sizearray = size;
+}
+
+
+static void setnodevector (lua_State *L, Table *t, int size) {
+  int lsize;
+  if (size == 0) {  /* no elements to hash part? */
+    t->node = cast(Node *, dummynode);  /* use common `dummynode' */
+    lsize = 0;
+  }
+  else {
+    int i;
+    lsize = luaO_ceillog2(size);
+    if (lsize > MAXBITS)
+      luaG_runerror(L, "table overflow");
+    size = twoto(lsize);
+    t->node = luaM_newvector(L, size, Node);
+    for (i=0; i<size; i++) {
+      Node *n = gnode(t, i);
+      gnext(n) = NULL;
+      setnilvalue(gkey(n));
+      setnilvalue(gval(n));
+    }
+  }
+  t->lsizenode = cast_byte(lsize);
+  t->lastfree = gnode(t, size);  /* all positions are free */
+}
+
+
+void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize) {
+  int i;
+  int oldasize = t->sizearray;
+  int oldhsize = t->lsizenode;
+  Node *nold = t->node;  /* save old hash ... */
+  if (nasize > oldasize)  /* array part must grow? */
+    setarrayvector(L, t, nasize);
+  /* create new hash part with appropriate size */
+  setnodevector(L, t, nhsize);
+  if (nasize < oldasize) {  /* array part must shrink? */
+    t->sizearray = nasize;
+    /* re-insert elements from vanishing slice */
+    for (i=nasize; i<oldasize; i++) {
+      if (!ttisnil(&t->array[i]))
+        luaH_setint(L, t, i + 1, &t->array[i]);
+    }
+    /* shrink array */
+    luaM_reallocvector(L, t->array, oldasize, nasize, TValue);
+  }
+  /* re-insert elements from hash part */
+  for (i = twoto(oldhsize) - 1; i >= 0; i--) {
+    Node *old = nold+i;
+    if (!ttisnil(gval(old))) {
+      /* doesn't need barrier/invalidate cache, as entry was
+         already present in the table */
+      setobjt2t(L, luaH_set(L, t, gkey(old)), gval(old));
+    }
+  }
+  if (!isdummy(nold))
+    luaM_freearray(L, nold, cast(size_t, twoto(oldhsize)));  /* free old array */
+}
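+
+/*
+** Worked example (illustrative note, not part of the upstream file): the
+** sizes fed into luaH_resize() come from computesizes() above.  For the
+** integer keys {1, 2, 3, 20}, three keys lie in [1, 4], so a 4-slot array
+** part is at least half full and the array size becomes 4, while key 20
+** would require a 32-slot array that only it occupies, so it is counted
+** into the hash part instead.
+*/
+
+
+void luaH_resizearray (lua_State *L, Table *t, int nasize) {
+  int nsize = isdummy(t->node) ?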
0 : sizenode(t); + luaH_resize(L, t, nasize, nsize); +} + + +static void rehash (lua_State *L, Table *t, const TValue *ek) { + int nasize, na; + int nums[MAXBITS+1]; /* nums[i] = number of keys with 2^(i-1) < k <= 2^i */ + int i; + int totaluse; + for (i=0; i<=MAXBITS; i++) nums[i] = 0; /* reset counts */ + nasize = numusearray(t, nums); /* count keys in array part */ + totaluse = nasize; /* all those keys are integer keys */ + totaluse += numusehash(t, nums, &nasize); /* count keys in hash part */ + /* count extra key */ + nasize += countint(ek, nums); + totaluse++; + /* compute new size for array part */ + na = computesizes(nums, &nasize); + /* resize the table to new computed sizes */ + luaH_resize(L, t, nasize, totaluse - na); +} + + + +/* +** }============================================================= +*/ + + +Table *luaH_new (lua_State *L) { + Table *t = &luaC_newobj(L, LUA_TTABLE, sizeof(Table), NULL, 0)->h; + t->metatable = NULL; + t->flags = cast_byte(~0); + t->array = NULL; + t->sizearray = 0; + setnodevector(L, t, 0); + return t; +} + + +void luaH_free (lua_State *L, Table *t) { + if (!isdummy(t->node)) + luaM_freearray(L, t->node, cast(size_t, sizenode(t))); + luaM_freearray(L, t->array, t->sizearray); + luaM_free(L, t); +} + + +static Node *getfreepos (Table *t) { + while (t->lastfree > t->node) { + t->lastfree--; + if (ttisnil(gkey(t->lastfree))) + return t->lastfree; + } + return NULL; /* could not find a free place */ +} + + + +/* +** inserts a new key into a hash table; first, check whether key's main +** position is free. If not, check whether colliding node is in its main +** position or not: if it is not, move colliding node to an empty place and +** put new key in its main position; otherwise (colliding node is in its main +** position), new key goes to an empty position. +*/ +TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key) { + Node *mp; + if (ttisnil(key)) luaG_runerror(L, "table index is nil"); +#if defined LUA_HAS_FLOAT_NUMBERS + else if (ttisnumber(key) && luai_numisnan(L, nvalue(key))) + luaG_runerror(L, "table index is NaN"); +#endif + mp = mainposition(t, key); + if (!ttisnil(gval(mp)) || isdummy(mp)) { /* main position is taken? */ + Node *othern; + Node *n = getfreepos(t); /* get a free place */ + if (n == NULL) { /* cannot find a free place? */ + rehash(L, t, key); /* grow table */ + /* whatever called 'newkey' take care of TM cache and GC barrier */ + return luaH_set(L, t, key); /* insert key into grown table */ + } + lua_assert(!isdummy(n)); + othern = mainposition(t, gkey(mp)); + if (othern != mp) { /* is colliding node out of its main position? */ + /* yes; move colliding node into free position */ + while (gnext(othern) != mp) othern = gnext(othern); /* find previous */ + gnext(othern) = n; /* redo the chain with `n' in place of `mp' */ + *n = *mp; /* copy colliding node into free pos. 
(mp->next also goes) */ + gnext(mp) = NULL; /* now `mp' is free */ + setnilvalue(gval(mp)); + } + else { /* colliding node is in its own main position */ + /* new node will go into free position */ + gnext(n) = gnext(mp); /* chain new position */ + gnext(mp) = n; + mp = n; + } + } + setobj2t(L, gkey(mp), key); + luaC_barrierback(L, obj2gco(t), key); + lua_assert(ttisnil(gval(mp))); + return gval(mp); +} + + +/* +** search function for integers +*/ +const TValue *luaH_getint (Table *t, int key) { + /* (1 <= key && key <= t->sizearray) */ + if (cast(unsigned int, key-1) < cast(unsigned int, t->sizearray)) + return &t->array[key-1]; + else { + lua_Number nk = cast_num(key); + Node *n = hashnum(t, nk); + do { /* check whether `key' is somewhere in the chain */ + if (ttisnumber(gkey(n)) && luai_numeq(nvalue(gkey(n)), nk)) + return gval(n); /* that's it */ + else n = gnext(n); + } while (n); + return luaO_nilobject; + } +} + + +/* +** search function for short strings +*/ +const TValue *luaH_getstr (Table *t, TString *key) { + Node *n = hashstr(t, key); + lua_assert(key->tsv.tt == LUA_TSHRSTR); + do { /* check whether `key' is somewhere in the chain */ + if (ttisshrstring(gkey(n)) && eqshrstr(rawtsvalue(gkey(n)), key)) + return gval(n); /* that's it */ + else n = gnext(n); + } while (n); + return luaO_nilobject; +} + + +/* +** main search function +*/ +const TValue *luaH_get (Table *t, const TValue *key) { + switch (ttype(key)) { + case LUA_TSHRSTR: return luaH_getstr(t, rawtsvalue(key)); + case LUA_TNIL: return luaO_nilobject; + case LUA_TNUMBER: { + int k; + lua_Number n = nvalue(key); + lua_number2int(k, n); + if (luai_numeq(cast_num(k), n)) /* index is int? */ + return luaH_getint(t, k); /* use specialized version */ + /* else go through */ + } + /* FALLTHROUGH */ + default: { + Node *n = mainposition(t, key); + do { /* check whether `key' is somewhere in the chain */ + if (luaV_rawequalobj(gkey(n), key)) + return gval(n); /* that's it */ + else n = gnext(n); + } while (n); + return luaO_nilobject; + } + } +} + + +/* +** beware: when using this function you probably need to check a GC +** barrier and invalidate the TM cache. +*/ +TValue *luaH_set (lua_State *L, Table *t, const TValue *key) { + const TValue *p = luaH_get(t, key); + if (p != luaO_nilobject) + return cast(TValue *, p); + else return luaH_newkey(L, t, key); +} + + +void luaH_setint (lua_State *L, Table *t, int key, TValue *value) { + const TValue *p = luaH_getint(t, key); + TValue *cell; + if (p != luaO_nilobject) + cell = cast(TValue *, p); + else { + TValue k; + setnvalue(&k, cast_num(key)); + cell = luaH_newkey(L, t, &k); + } + setobj2t(L, cell, value); +} + + +static int unbound_search (Table *t, unsigned int j) { + unsigned int i = j; /* i is zero or a present index */ + j++; + /* find `i' and `j' such that i is present and j is not */ + while (!ttisnil(luaH_getint(t, j))) { + i = j; + j *= 2; + if (j > cast(unsigned int, MAX_INT)) { /* overflow? */ + /* table was built with bad purposes: resort to linear search */ + i = 1; + while (!ttisnil(luaH_getint(t, i))) i++; + return i - 1; + } + } + /* now do a binary search between them */ + while (j - i > 1) { + unsigned int m = (i+j)/2; + if (ttisnil(luaH_getint(t, m))) j = m; + else i = m; + } + return i; +} + + +/* +** Try to find a boundary in table `t'. A `boundary' is an integer index +** such that t[i] is non-nil and t[i+1] is nil (and 0 if t[1] is nil). 
+*/
+int luaH_getn (Table *t) {
+  unsigned int j = t->sizearray;
+  if (j > 0 && ttisnil(&t->array[j - 1])) {
+    /* there is a boundary in the array part: (binary) search for it */
+    unsigned int i = 0;
+    while (j - i > 1) {
+      unsigned int m = (i+j)/2;
+      if (ttisnil(&t->array[m - 1])) j = m;
+      else i = m;
+    }
+    return i;
+  }
+  /* else must find a boundary in hash part */
+  else if (isdummy(t->node))  /* hash part is empty? */
+    return j;  /* that is easy... */
+  else return unbound_search(t, j);
+}
+
+
+
+#if defined(LUA_DEBUG)
+
+Node *luaH_mainposition (const Table *t, const TValue *key) {
+  return mainposition(t, key);
+}
+
+int luaH_isdummy (Node *n) { return isdummy(n); }
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/ltable.h b/module/lua/ltable.h
new file mode 100644
index 000000000000..ea877ebf4eb0
--- /dev/null
+++ b/module/lua/ltable.h
@@ -0,0 +1,47 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltable.h,v 2.16.1.2 2013/08/30 15:49:41 roberto Exp $
+** Lua tables (hash)
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltable_h
+#define ltable_h
+
+#include "lobject.h"
+
+
+#define gnode(t,i)	((Node *)&(t)->node[i])
+#define gkey(n)		(&(n)->i_key.tvk)
+#define gval(n)		(&(n)->i_val)
+#define gnext(n)	((n)->i_key.nk.next)
+
+#define invalidateTMcache(t)	((t)->flags = 0)
+
+/* returns the key, given the value of a table entry */
+#define keyfromval(v) \
+  (gkey(cast(Node *, cast(char *, (v)) - offsetof(Node, i_val))))
+
+
+LUAI_FUNC const TValue *luaH_getint (Table *t, int key);
+LUAI_FUNC void luaH_setint (lua_State *L, Table *t, int key, TValue *value);
+LUAI_FUNC const TValue *luaH_getstr (Table *t, TString *key);
+LUAI_FUNC const TValue *luaH_get (Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_newkey (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC TValue *luaH_set (lua_State *L, Table *t, const TValue *key);
+LUAI_FUNC Table *luaH_new (lua_State *L);
+LUAI_FUNC void luaH_resize (lua_State *L, Table *t, int nasize, int nhsize);
+LUAI_FUNC void luaH_resizearray (lua_State *L, Table *t, int nasize);
+LUAI_FUNC void luaH_free (lua_State *L, Table *t);
+LUAI_FUNC int luaH_next (lua_State *L, Table *t, StkId key);
+LUAI_FUNC int luaH_getn (Table *t);
+
+
+#if defined(LUA_DEBUG)
+LUAI_FUNC Node *luaH_mainposition (const Table *t, const TValue *key);
+LUAI_FUNC int luaH_isdummy (Node *n);
+#endif
+
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/ltablib.c b/module/lua/ltablib.c
new file mode 100644
index 000000000000..51cafffaafcd
--- /dev/null
+++ b/module/lua/ltablib.c
@@ -0,0 +1,289 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltablib.c,v 1.65.1.2 2014/05/07 16:32:55 roberto Exp $
+** Library for Table Manipulation
+** See Copyright Notice in lua.h
+*/
+
+
+#define ltablib_c
+#define LUA_LIB
+
+#include <sys/lua/lua.h>
+
+#include <sys/lua/lauxlib.h>
+#include <sys/lua/lualib.h>
+
+
+#define aux_getn(L,n)	(luaL_checktype(L, n, LUA_TTABLE), luaL_len(L, n))
+
+
+
+#if defined(LUA_COMPAT_MAXN)
+static int maxn (lua_State *L) {
+  lua_Number max = 0;
+  luaL_checktype(L, 1, LUA_TTABLE);
+  lua_pushnil(L);  /* first key */
+  while (lua_next(L, 1)) {
+    lua_pop(L, 1);  /* remove value */
+    if (lua_type(L, -1) == LUA_TNUMBER) {
+      lua_Number v = lua_tonumber(L, -1);
+      if (v > max) max = v;
+    }
+  }
+  lua_pushnumber(L, max);
+  return 1;
+}
+#endif
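+
+/*
+** Worked example (illustrative note, not part of the upstream file):
+** maxn() differs from the '#' border operator.  For
+** t = {1, 2, nil, [10] = true}, '#t' may legitimately report 2 (any
+** border is a valid answer once the sequence has a hole), whereas maxn(t)
+** scans every key with lua_next() and returns 10, the largest numeric key.
+*/
+
+
+static int tinsert (lua_State *L) {
+  int e = aux_getn(L, 1) + 1;  /* first empty element */
+  int pos;  /* where to insert new element */
+  switch (lua_gettop(L)) {
+    case 2: {  /* called with only 2 arguments */
+      pos = e;  /* insert new element at the end */
+      break;
+    }
+    case 3: {
+      int i;
+      pos =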
luaL_checkint(L, 2); /* 2nd argument is the position */ + luaL_argcheck(L, 1 <= pos && pos <= e, 2, "position out of bounds"); + for (i = e; i > pos; i--) { /* move up elements */ + lua_rawgeti(L, 1, i-1); + lua_rawseti(L, 1, i); /* t[i] = t[i-1] */ + } + break; + } + default: { + return luaL_error(L, "wrong number of arguments to " LUA_QL("insert")); + } + } + lua_rawseti(L, 1, pos); /* t[pos] = v */ + return 0; +} + + +static int tremove (lua_State *L) { + int size = aux_getn(L, 1); + int pos = luaL_optint(L, 2, size); + if (pos != size) /* validate 'pos' if given */ + luaL_argcheck(L, 1 <= pos && pos <= size + 1, 1, "position out of bounds"); + lua_rawgeti(L, 1, pos); /* result = t[pos] */ + for ( ; pos < size; pos++) { + lua_rawgeti(L, 1, pos+1); + lua_rawseti(L, 1, pos); /* t[pos] = t[pos+1] */ + } + lua_pushnil(L); + lua_rawseti(L, 1, pos); /* t[pos] = nil */ + return 1; +} + + +static void addfield (lua_State *L, luaL_Buffer *b, int i) { + lua_rawgeti(L, 1, i); + if (!lua_isstring(L, -1)) + luaL_error(L, "invalid value (%s) at index %d in table for " + LUA_QL("concat"), luaL_typename(L, -1), i); + luaL_addvalue(b); +} + + +static int tconcat (lua_State *L) { + luaL_Buffer b; + size_t lsep; + int i, last; + const char *sep = luaL_optlstring(L, 2, "", &lsep); + luaL_checktype(L, 1, LUA_TTABLE); + i = luaL_optint(L, 3, 1); + last = luaL_opt(L, luaL_checkint, 4, luaL_len(L, 1)); + luaL_buffinit(L, &b); + for (; i < last; i++) { + addfield(L, &b, i); + luaL_addlstring(&b, sep, lsep); + } + if (i == last) /* add last value (if interval was not empty) */ + addfield(L, &b, i); + luaL_pushresult(&b); + return 1; +} + + +/* +** {====================================================== +** Pack/unpack +** ======================================================= +*/ + +static int pack (lua_State *L) { + int n = lua_gettop(L); /* number of elements to pack */ + lua_createtable(L, n, 1); /* create result table */ + lua_pushinteger(L, n); + lua_setfield(L, -2, "n"); /* t.n = number of elements */ + if (n > 0) { /* at least one element? */ + int i; + lua_pushvalue(L, 1); + lua_rawseti(L, -2, 1); /* insert first element */ + lua_replace(L, 1); /* move table into index 1 */ + for (i = n; i >= 2; i--) /* assign other elements */ + lua_rawseti(L, 1, i); + } + return 1; /* return table */ +} + + +static int unpack (lua_State *L) { + int i, e; + unsigned int n; + luaL_checktype(L, 1, LUA_TTABLE); + i = luaL_optint(L, 2, 1); + e = luaL_opt(L, luaL_checkint, 3, luaL_len(L, 1)); + if (i > e) return 0; /* empty range */ + n = (unsigned int)e - (unsigned int)i; /* number of elements minus 1 */ + if (n > (INT_MAX - 10) || !lua_checkstack(L, ++n)) + return luaL_error(L, "too many results to unpack"); + lua_rawgeti(L, 1, i); /* push arg[i] (avoiding overflow problems) */ + while (i++ < e) /* push arg[i + 1...e] */ + lua_rawgeti(L, 1, i); + return n; +} + +/* }====================================================== */ + + + +/* +** {====================================================== +** Quicksort +** (based on `Algorithms in MODULA-3', Robert Sedgewick; +** Addison-Wesley, 1993.) +** ======================================================= +*/ + + +static void set2 (lua_State *L, int i, int j) { + lua_rawseti(L, 1, i); + lua_rawseti(L, 1, j); +} + +static int sort_comp (lua_State *L, int a, int b) { + if (!lua_isnil(L, 2)) { /* function? 
*/
+    int res;
+    lua_pushvalue(L, 2);
+    lua_pushvalue(L, a-1);  /* -1 to compensate function */
+    lua_pushvalue(L, b-2);  /* -2 to compensate function and `a' */
+    lua_call(L, 2, 1);
+    res = lua_toboolean(L, -1);
+    lua_pop(L, 1);
+    return res;
+  }
+  else  /* a < b? */
+    return lua_compare(L, a, b, LUA_OPLT);
+}
+
+static void auxsort (lua_State *L, int l, int u) {
+  while (l < u) {  /* for tail recursion */
+    int i, j;
+    /* sort elements a[l], a[(l+u)/2] and a[u] */
+    lua_rawgeti(L, 1, l);
+    lua_rawgeti(L, 1, u);
+    if (sort_comp(L, -1, -2))  /* a[u] < a[l]? */
+      set2(L, l, u);  /* swap a[l] - a[u] */
+    else
+      lua_pop(L, 2);
+    if (u-l == 1) break;  /* only 2 elements */
+    i = (l+u)/2;
+    lua_rawgeti(L, 1, i);
+    lua_rawgeti(L, 1, l);
+    if (sort_comp(L, -2, -1))  /* a[i]<a[l]? */
+      set2(L, i, l);
+    else {
+      lua_pop(L, 1);  /* remove second element */
+      lua_rawgeti(L, 1, u);
+      if (sort_comp(L, -1, -2))  /* a[u]<a[i]? */
+        set2(L, i, u);
+      else
+        lua_pop(L, 2);
+    }
+    if (u-l == 2) break;  /* only 3 elements */
+    lua_rawgeti(L, 1, i);  /* Pivot */
+    lua_pushvalue(L, -1);
+    lua_rawgeti(L, 1, u-1);
+    set2(L, i, u-1);
+    /* a[l] <= P == a[u-1] <= a[u], only need to sort from l+1 to u-2 */
+    i = l; j = u-1;
+    for (;;) {  /* invariant: a[l..i] <= P <= a[j..u] */
+      /* repeat ++i until a[i] >= P */
+      while (lua_rawgeti(L, 1, ++i), sort_comp(L, -1, -2)) {
+        if (i>=u) luaL_error(L, "invalid order function for sorting");
+        lua_pop(L, 1);  /* remove a[i] */
+      }
+      /* repeat --j until a[j] <= P */
+      while (lua_rawgeti(L, 1, --j), sort_comp(L, -3, -1)) {
+        if (j<=l) luaL_error(L, "invalid order function for sorting");
+        lua_pop(L, 1);  /* remove a[j] */
+      }
+      if (j<i) {
+        lua_pop(L, 3);  /* pop pivot, a[i], a[j] */
+        break;
+      }
+      set2(L, i, j);
+    }
+    lua_rawgeti(L, 1, u-1);
+    lua_rawgeti(L, 1, i);
+    set2(L, u-1, i);  /* swap pivot (a[u-1]) with a[i] */
+    /* a[l..i-1] <= a[i] == P <= a[i+1..u] */
+    /* adjust so that smaller half is in [j..i] and larger one in [i..u] */
+    if (i-l < u-i) {
+      j=l; i=i-1; l=i+2;
+    }
+    else {
+      j=i+1; i=u; u=j-2;
+    }
+    auxsort(L, j, i);  /* call recursively the smaller one */
+  }  /* repeat the routine for the larger one */
+}
+
+
+static int sort (lua_State *L) {
+  int n = aux_getn(L, 1);
+  luaL_checkstack(L, 40, "");  /* assume array is smaller than 2^40 */
+  if (!lua_isnoneornil(L, 2))  /* is there a 2nd argument? */
+    luaL_checktype(L, 2, LUA_TFUNCTION);
+  lua_settop(L, 2);  /* make sure there is two arguments */
+  auxsort(L, 1, n);
+  return 0;
+}
+
+/* }====================================================== */
+
+
+static const luaL_Reg tab_funcs[] = {
+  {"concat", tconcat},
+  {"insert", tinsert},
+#if defined(LUA_COMPAT_MAXN)
+  {"maxn", maxn},
+#endif
+  {"pack", pack},
+  {"remove", tremove},
+  {"sort", sort},
+  {"unpack", unpack},
+  {NULL, NULL}
+};
+
+
+LUAMOD_API int luaopen_table (lua_State *L) {
+  luaL_newlib(L, tab_funcs);
+#if defined(LUA_COMPAT_UNPACK)
+  /* _G.unpack = table.unpack */
+  lua_getfield(L, -1, "unpack");
+  lua_setglobal(L, "unpack");
+#endif
+  return 1;
+}
+
+#if defined(_KERNEL)
+
+EXPORT_SYMBOL(luaopen_table);
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/ltm.c b/module/lua/ltm.c
new file mode 100644
--- /dev/null
+++ b/module/lua/ltm.c
+/* BEGIN CSTYLED */
+/*
+** $Id: ltm.c,v 2.14.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+
+#define ltm_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "lobject.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+
+
+static const char udatatypename[] = "userdata";
+
+LUAI_DDEF const char *const luaT_typenames_[LUA_TOTALTAGS] = {
+  "no value",
+  "nil", "boolean", udatatypename, "number",
+  "string", "table", "function", udatatypename, "thread",
+  "proto", "upval"  /* these last two cases are used for tests only */
+};
+
+
+void luaT_init (lua_State *L) {
+  static const char *const luaT_eventname[] = {  /* ORDER TM */
+    "__index", "__newindex",
+    "__gc", "__mode", "__len", "__eq",
+    "__add", "__sub", "__mul", "__div", "__mod",
+    "__pow", "__unm", "__lt", "__le",
+    "__concat", "__call"
+  };
+  int i;
+  for (i=0; i<TM_N; i++) {
+    G(L)->tmname[i] = luaS_new(L, luaT_eventname[i]);
+    luaS_fix(G(L)->tmname[i]);  /* never collect these names */
+  }
+}
+
+
+/*
+** function to be used with macro "fasttm": optimized for absence of
+** tag methods
+*/
+const TValue *luaT_gettm (Table *events, TMS event, TString *ename) {
+  const TValue *tm = luaH_getstr(events, ename);
+  lua_assert(event <= TM_EQ);
+  if (ttisnil(tm)) {  /* no tag method? */
+    events->flags |= cast_byte(1u<<event);  /* cache this fact */
+    return NULL;
+  }
+  else return tm;
+}
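+
+/*
+** Worked example (illustrative note, not part of the upstream file): the
+** flags byte caches *absent* tag methods for the fast events
+** (TM_INDEX..TM_EQ).  If a metatable has no "__index", the first miss in
+** luaT_gettm() sets bit (1u << TM_INDEX) in events->flags; from then on
+** gfasttm() (see ltm.h below) answers NULL from that bit alone, with no
+** string lookup.  Writes that could add a metamethod clear the cache via
+** invalidateTMcache(), so the bits are recomputed on the next miss.
+*/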
+
+
+const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o, TMS event) {
+  Table *mt;
+  switch (ttypenv(o)) {
+    case LUA_TTABLE:
+      mt = hvalue(o)->metatable;
+      break;
+    case LUA_TUSERDATA:
+      mt = uvalue(o)->metatable;
+      break;
+    default:
+      mt = G(L)->mt[ttypenv(o)];
+  }
+  return (mt ? luaH_getstr(mt, G(L)->tmname[event]) : luaO_nilobject);
+}
+/* END CSTYLED */
diff --git a/module/lua/ltm.h b/module/lua/ltm.h
new file mode 100644
index 000000000000..c056f4637353
--- /dev/null
+++ b/module/lua/ltm.h
@@ -0,0 +1,59 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: ltm.h,v 2.11.1.1 2013/04/12 18:48:47 roberto Exp $
+** Tag methods
+** See Copyright Notice in lua.h
+*/
+
+#ifndef ltm_h
+#define ltm_h
+
+
+#include "lobject.h"
+
+
+/*
+* WARNING: if you change the order of this enumeration,
+* grep "ORDER TM"
+*/
+typedef enum {
+  TM_INDEX,
+  TM_NEWINDEX,
+  TM_GC,
+  TM_MODE,
+  TM_LEN,
+  TM_EQ,  /* last tag method with `fast' access */
+  TM_ADD,
+  TM_SUB,
+  TM_MUL,
+  TM_DIV,
+  TM_MOD,
+  TM_POW,
+  TM_UNM,
+  TM_LT,
+  TM_LE,
+  TM_CONCAT,
+  TM_CALL,
+  TM_N		/* number of elements in the enum */
+} TMS;
+
+
+
+#define gfasttm(g,et,e) ((et) == NULL ? NULL : \
+  ((et)->flags & (1u<<(e))) ? NULL : luaT_gettm(et, e, (g)->tmname[e]))
+
+#define fasttm(l,et,e)	gfasttm(G(l), et, e)
+
+#define ttypename(x)	luaT_typenames_[(x) + 1]
+#define objtypename(x)	ttypename(ttypenv(x))
+
+LUAI_DDEC const char *const luaT_typenames_[LUA_TOTALTAGS];
+
+
+LUAI_FUNC const TValue *luaT_gettm (Table *events, TMS event, TString *ename);
+LUAI_FUNC const TValue *luaT_gettmbyobj (lua_State *L, const TValue *o,
+                                         TMS event);
+LUAI_FUNC void luaT_init (lua_State *L);
+
+#endif
+/* END CSTYLED */
diff --git a/module/lua/lvm.c b/module/lua/lvm.c
new file mode 100644
index 000000000000..4685be52b449
--- /dev/null
+++ b/module/lua/lvm.c
@@ -0,0 +1,932 @@
+/* BEGIN CSTYLED */
+/*
+** $Id: lvm.c,v 2.155.1.1 2013/04/12 18:48:47 roberto Exp $
+** Lua virtual machine
+** See Copyright Notice in lua.h
+*/
+
+
+#define lvm_c
+#define LUA_CORE
+
+#include <sys/lua/lua.h>
+
+#include "ldebug.h"
+#include "ldo.h"
+#include "lfunc.h"
+#include "lgc.h"
+#include "lobject.h"
+#include "lopcodes.h"
+#include "lstate.h"
+#include "lstring.h"
+#include "ltable.h"
+#include "ltm.h"
+#include "lvm.h"
+
+#ifdef _KERNEL
+#define strcoll(l,r) (strcmp((l),(r)))
+#endif
+
+/* limit for table tag-method chains (to avoid loops) */
+#define MAXTAGLOOP	100
+
+
+const TValue *luaV_tonumber (const TValue *obj, TValue *n) {
+  lua_Number num;
+  if (ttisnumber(obj)) return obj;
+  if (ttisstring(obj) && luaO_str2d(svalue(obj), tsvalue(obj)->len, &num)) {
+    setnvalue(n, num);
+    return n;
+  }
+  else
+    return NULL;
+}
+
+
+int luaV_tostring (lua_State *L, StkId obj) {
+  if (!ttisnumber(obj))
+    return 0;
+  else {
+    char s[LUAI_MAXNUMBER2STR];
+    lua_Number n = nvalue(obj);
+    int l = lua_number2str(s, n);
+    setsvalue2s(L, obj, luaS_newlstr(L, s, l));
+    return 1;
+  }
+}
+
+
+static void traceexec (lua_State *L) {
+  CallInfo *ci = L->ci;
+  lu_byte mask = L->hookmask;
+  int counthook = ((mask & LUA_MASKCOUNT) && L->hookcount == 0);
+  if (counthook)
+    resethookcount(L);  /* reset count */
+  if (ci->callstatus & CIST_HOOKYIELD) {  /* called hook last time? */
+    ci->callstatus &= ~CIST_HOOKYIELD;  /* erase mark */
+    return;  /* do not call hook again (VM yielded, so it did not move) */
+  }
+  if (counthook)
+    luaD_hook(L, LUA_HOOKCOUNT, -1);  /* call count hook */
+  if (mask & LUA_MASKLINE) {
+    Proto *p = ci_func(ci)->p;
+    int npc = pcRel(ci->u.l.savedpc, p);
+    int newline = getfuncline(p, npc);
+    if (npc == 0 ||  /* call linehook when enter a new function, */
+        ci->u.l.savedpc <= L->oldpc ||  /* when jump back (loop), or when */
+        newline != getfuncline(p, pcRel(L->oldpc, p)))  /* enter a new line */
+      luaD_hook(L, LUA_HOOKLINE, newline);  /* call line hook */
+  }
+  L->oldpc = ci->u.l.savedpc;
+  if (L->status == LUA_YIELD) {  /* did hook yield? */
+    if (counthook)
+      L->hookcount = 1;  /* undo decrement to zero */
+    ci->u.l.savedpc--;  /* undo increment (resume will increment it again) */
+    ci->callstatus |= CIST_HOOKYIELD;  /* mark that it yielded */
+    ci->func = L->top - 1;  /* protect stack below results */
+    luaD_throw(L, LUA_YIELD);
+  }
+}
+
+
+static void callTM (lua_State *L, const TValue *f, const TValue *p1,
+                    const TValue *p2, TValue *p3, int hasres) {
+  if (L == NULL) return;
+
+  ptrdiff_t result = savestack(L, p3);
+  setobj2s(L, L->top++, f);  /* push function */
+  setobj2s(L, L->top++, p1);  /* 1st argument */
+  setobj2s(L, L->top++, p2);  /* 2nd argument */
+  if (!hasres)  /* no result?
'p3' is third argument */ + setobj2s(L, L->top++, p3); /* 3rd argument */ + /* metamethod may yield only when called from Lua code */ + luaD_call(L, L->top - (4 - hasres), hasres, isLua(L->ci)); + if (hasres) { /* if has result, move it to its place */ + p3 = restorestack(L, result); + setobjs2s(L, p3, --L->top); + } +} + + +void luaV_gettable (lua_State *L, const TValue *t, TValue *key, StkId val) { + int loop; + for (loop = 0; loop < MAXTAGLOOP; loop++) { + const TValue *tm; + if (ttistable(t)) { /* `t' is a table? */ + Table *h = hvalue(t); + const TValue *res = luaH_get(h, key); /* do a primitive get */ + if (!ttisnil(res) || /* result is not nil? */ + (tm = fasttm(L, h->metatable, TM_INDEX)) == NULL) { /* or no TM? */ + setobj2s(L, val, res); + return; + } + /* else will try the tag method */ + } + else if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_INDEX))) + luaG_typeerror(L, t, "index"); + if (ttisfunction(tm)) { + callTM(L, tm, t, key, val, 1); + return; + } + t = tm; /* else repeat with 'tm' */ + } + luaG_runerror(L, "loop in gettable"); +} + + +void luaV_settable (lua_State *L, const TValue *t, TValue *key, StkId val) { + int loop; + for (loop = 0; loop < MAXTAGLOOP; loop++) { + const TValue *tm; + if (ttistable(t)) { /* `t' is a table? */ + Table *h = hvalue(t); + TValue *oldval = cast(TValue *, luaH_get(h, key)); + /* if previous value is not nil, there must be a previous entry + in the table; moreover, a metamethod has no relevance */ + if (!ttisnil(oldval) || + /* previous value is nil; must check the metamethod */ + ((tm = fasttm(L, h->metatable, TM_NEWINDEX)) == NULL && + /* no metamethod; is there a previous entry in the table? */ + (oldval != luaO_nilobject || + /* no previous entry; must create one. (The next test is + always true; we only need the assignment.) */ + (oldval = luaH_newkey(L, h, key), 1)))) { + /* no metamethod and (now) there is an entry with given key */ + setobj2t(L, oldval, val); /* assign new value to that entry */ + invalidateTMcache(h); + luaC_barrierback(L, obj2gco(h), val); + return; + } + /* else will try the metamethod */ + } + else /* not a table; check metamethod */ + if (ttisnil(tm = luaT_gettmbyobj(L, t, TM_NEWINDEX))) + luaG_typeerror(L, t, "index"); + /* there is a metamethod */ + if (ttisfunction(tm)) { + callTM(L, tm, t, key, val, 0); + return; + } + t = tm; /* else repeat with 'tm' */ + } + luaG_runerror(L, "loop in settable"); +} + + +static int call_binTM (lua_State *L, const TValue *p1, const TValue *p2, + StkId res, TMS event) { + const TValue *tm = luaT_gettmbyobj(L, p1, event); /* try first operand */ + if (ttisnil(tm)) + tm = luaT_gettmbyobj(L, p2, event); /* try second operand */ + if (ttisnil(tm)) return 0; + callTM(L, tm, p1, p2, res, 1); + return 1; +} + + +static const TValue *get_equalTM (lua_State *L, Table *mt1, Table *mt2, + TMS event) { + const TValue *tm1 = fasttm(L, mt1, event); + const TValue *tm2; + if (tm1 == NULL) return NULL; /* no metamethod */ + if (mt1 == mt2) return tm1; /* same metatables => same metamethods */ + tm2 = fasttm(L, mt2, event); + if (tm2 == NULL) return NULL; /* no metamethod */ + if (luaV_rawequalobj(tm1, tm2)) /* same metamethods? 
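(both operands' metatables must agree on __eq before it is used)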
*/ + return tm1; + return NULL; +} + + +static int call_orderTM (lua_State *L, const TValue *p1, const TValue *p2, + TMS event) { + if (!call_binTM(L, p1, p2, L->top, event)) + return -1; /* no metamethod */ + else + return !l_isfalse(L->top); +} + + +static int l_strcmp (const TString *ls, const TString *rs) { + const char *l = getstr(ls); + size_t ll = ls->tsv.len; + const char *r = getstr(rs); + size_t lr = rs->tsv.len; + for (;;) { + int temp = strcoll(l, r); + if (temp != 0) return temp; + else { /* strings are equal up to a `\0' */ + size_t len = strlen(l); /* index of first `\0' in both strings */ + if (len == lr) /* r is finished? */ + return (len == ll) ? 0 : 1; + else if (len == ll) /* l is finished? */ + return -1; /* l is smaller than r (because r is not finished) */ + /* both strings longer than `len'; go on comparing (after the `\0') */ + len++; + l += len; ll -= len; r += len; lr -= len; + } + } +} + + +int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r) { + int res; + if (ttisnumber(l) && ttisnumber(r)) + return luai_numlt(L, nvalue(l), nvalue(r)); + else if (ttisstring(l) && ttisstring(r)) + return l_strcmp(rawtsvalue(l), rawtsvalue(r)) < 0; + else if ((res = call_orderTM(L, l, r, TM_LT)) < 0) + luaG_ordererror(L, l, r); + return res; +} + + +int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r) { + int res; + if (ttisnumber(l) && ttisnumber(r)) + return luai_numle(L, nvalue(l), nvalue(r)); + else if (ttisstring(l) && ttisstring(r)) + return l_strcmp(rawtsvalue(l), rawtsvalue(r)) <= 0; + else if ((res = call_orderTM(L, l, r, TM_LE)) >= 0) /* first try `le' */ + return res; + else if ((res = call_orderTM(L, r, l, TM_LT)) < 0) /* else try `lt' */ + luaG_ordererror(L, l, r); + return !res; +} + + +/* +** equality of Lua values. L == NULL means raw equality (no metamethods) +*/ +int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2) { + const TValue *tm; + lua_assert(ttisequal(t1, t2)); + switch (ttype(t1)) { + case LUA_TNIL: return 1; + case LUA_TNUMBER: return luai_numeq(nvalue(t1), nvalue(t2)); + case LUA_TBOOLEAN: return bvalue(t1) == bvalue(t2); /* true must be 1 !! */ + case LUA_TLIGHTUSERDATA: return pvalue(t1) == pvalue(t2); + case LUA_TLCF: return fvalue(t1) == fvalue(t2); + case LUA_TSHRSTR: return eqshrstr(rawtsvalue(t1), rawtsvalue(t2)); + case LUA_TLNGSTR: return luaS_eqlngstr(rawtsvalue(t1), rawtsvalue(t2)); + case LUA_TUSERDATA: { + if (uvalue(t1) == uvalue(t2)) return 1; + else if (L == NULL) return 0; + tm = get_equalTM(L, uvalue(t1)->metatable, uvalue(t2)->metatable, TM_EQ); + break; /* will try TM */ + } + case LUA_TTABLE: { + if (hvalue(t1) == hvalue(t2)) return 1; + else if (L == NULL) return 0; + tm = get_equalTM(L, hvalue(t1)->metatable, hvalue(t2)->metatable, TM_EQ); + break; /* will try TM */ + } + default: + lua_assert(iscollectable(t1)); + return gcvalue(t1) == gcvalue(t2); + } + if (tm == NULL || L == NULL) return 0; /* no TM? */ + callTM(L, tm, t1, t2, L->top, 1); /* call TM */ + return !l_isfalse(L->top); +} + + +void luaV_concat (lua_State *L, int total) { + lua_assert(total >= 2); + do { + StkId top = L->top; + int n = 2; /* number of elements handled in this pass (at least 2) */ + if (!(ttisstring(top-2) || ttisnumber(top-2)) || !tostring(L, top-1)) { + if (!call_binTM(L, top-2, top-1, top-2, TM_CONCAT)) + luaG_concaterror(L, top-2, top-1); + } + else if (tsvalue(top-1)->len == 0) /* second operand is empty? 
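(concatenation with an empty string is the identity, so)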
*/ + (void)tostring(L, top - 2); /* result is first operand */ + else if (ttisstring(top-2) && tsvalue(top-2)->len == 0) { + setobjs2s(L, top - 2, top - 1); /* result is second op. */ + } + else { + /* at least two non-empty string values; get as many as possible */ + size_t tl = tsvalue(top-1)->len; + char *buffer; + int i; + /* collect total length */ + for (i = 1; i < total && tostring(L, top-i-1); i++) { + size_t l = tsvalue(top-i-1)->len; + if (l >= (MAX_SIZET/sizeof(char)) - tl) + luaG_runerror(L, "string length overflow"); + tl += l; + } + buffer = luaZ_openspace(L, &G(L)->buff, tl); + tl = 0; + n = i; + do { /* concat all strings */ + size_t l = tsvalue(top-i)->len; + memcpy(buffer+tl, svalue(top-i), l * sizeof(char)); + tl += l; + } while (--i > 0); + setsvalue2s(L, top-n, luaS_newlstr(L, buffer, tl)); + } + total -= n-1; /* got 'n' strings to create 1 new */ + L->top -= n-1; /* popped 'n' strings and pushed one */ + } while (total > 1); /* repeat until only 1 result left */ +} + + +void luaV_objlen (lua_State *L, StkId ra, const TValue *rb) { + const TValue *tm; + switch (ttypenv(rb)) { + case LUA_TTABLE: { + Table *h = hvalue(rb); + tm = fasttm(L, h->metatable, TM_LEN); + if (tm) break; /* metamethod? break switch to call it */ + setnvalue(ra, cast_num(luaH_getn(h))); /* else primitive len */ + return; + } + case LUA_TSTRING: { + setnvalue(ra, cast_num(tsvalue(rb)->len)); + return; + } + default: { /* try metamethod */ + tm = luaT_gettmbyobj(L, rb, TM_LEN); + if (ttisnil(tm)) /* no metamethod? */ + luaG_typeerror(L, rb, "get length of"); + break; + } + } + callTM(L, tm, rb, rb, ra, 1); +} + +/* + * luaV_div and luaV_mod patched in from Lua 5.3.2 in order to properly handle + * div/mod by zero (instead of crashing, which is the default behavior in + * Lua 5.2) + */ + +/* +** Integer division; return 'm // n', that is, floor(m/n). +** C division truncates its result (rounds towards zero). +** 'floor(q) == trunc(q)' when 'q >= 0' or when 'q' is integer, +** otherwise 'floor(q) == trunc(q) - 1'. +*/ +static lua_Number luaV_div (lua_State *L, lua_Number m, lua_Number n) { + if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ + if (n == 0) + luaG_runerror(L, "attempt to divide by zero"); + return (0 - m); /* n==-1; avoid overflow with 0x80000...//-1 */ + } + else { + lua_Number q = m / n; /* perform C division */ + if ((m ^ n) < 0 && m % n != 0) /* 'm/n' would be negative non-integer? */ + q -= 1; /* correct result for different rounding */ + return q; + } +} + + +/* +** Integer modulus; return 'm % n'. (Assume that C '%' with +** negative operands follows C99 behavior. See previous comment +** about luaV_div.) +*/ +static lua_Number luaV_mod (lua_State *L, lua_Number m, lua_Number n) { + if ((lua_Unsigned)(n) + 1u <= 1u) { /* special cases: -1 or 0 */ + if (n == 0) + luaG_runerror(L, "attempt to perform 'n%%0'"); + return 0; /* m % -1 == 0; avoid overflow with 0x80000...%-1 */ + } + else { + lua_Number r = m % n; + if (r != 0 && (m ^ n) < 0) /* 'm/n' would be non-integer negative? 
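(e.g. m = -7, n = 2: C computes r = -1; since r != 0 and the signs differ, adding n gives 1, the floor-mod result)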
*/ + r += n; /* correct result for different rounding */ + return r; + } +} + +/* + * End patch from 5.3.2 + */ + +void luaV_arith (lua_State *L, StkId ra, const TValue *rb, + const TValue *rc, TMS op) { + TValue tempb, tempc; + const TValue *b, *c; + if ((b = luaV_tonumber(rb, &tempb)) != NULL && + (c = luaV_tonumber(rc, &tempc)) != NULL) { + /* + * Patched: if dividing or modding, use patched functions from 5.3 + */ + lua_Number res; + int lop = op - TM_ADD + LUA_OPADD; + if (lop == LUA_OPDIV) { + res = luaV_div(L, nvalue(b), nvalue(c)); + } else if (lop == LUA_OPMOD) { + res = luaV_mod(L, nvalue(b), nvalue(c)); + } else { + res = luaO_arith(op - TM_ADD + LUA_OPADD, nvalue(b), nvalue(c)); + } + setnvalue(ra, res); + } + else if (!call_binTM(L, rb, rc, ra, op)) + luaG_aritherror(L, rb, rc); +} + + +/* +** check whether cached closure in prototype 'p' may be reused, that is, +** whether there is a cached closure with the same upvalues needed by +** new closure to be created. +*/ +static Closure *getcached (Proto *p, UpVal **encup, StkId base) { + Closure *c = p->cache; + if (c != NULL) { /* is there a cached closure? */ + int nup = p->sizeupvalues; + Upvaldesc *uv = p->upvalues; + int i; + for (i = 0; i < nup; i++) { /* check whether it has right upvalues */ + TValue *v = uv[i].instack ? base + uv[i].idx : encup[uv[i].idx]->v; + if (c->l.upvals[i]->v != v) + return NULL; /* wrong upvalue; cannot reuse closure */ + } + } + return c; /* return cached closure (or NULL if no cached closure) */ +} + + +/* +** create a new Lua closure, push it in the stack, and initialize +** its upvalues. Note that the call to 'luaC_barrierproto' must come +** before the assignment to 'p->cache', as the function needs the +** original value of that field. +*/ +static void pushclosure (lua_State *L, Proto *p, UpVal **encup, StkId base, + StkId ra) { + int nup = p->sizeupvalues; + Upvaldesc *uv = p->upvalues; + int i; + Closure *ncl = luaF_newLclosure(L, nup); + ncl->l.p = p; + setclLvalue(L, ra, ncl); /* anchor new closure in stack */ + for (i = 0; i < nup; i++) { /* fill in its upvalues */ + if (uv[i].instack) /* upvalue refers to local variable? */ + ncl->l.upvals[i] = luaF_findupval(L, base + uv[i].idx); + else /* get upvalue from enclosing function */ + ncl->l.upvals[i] = encup[uv[i].idx]; + } + luaC_barrierproto(L, p, ncl); + p->cache = ncl; /* save it on cache for reuse */ +} + + +/* +** finish execution of an opcode interrupted by an yield +*/ +void luaV_finishOp (lua_State *L) { + CallInfo *ci = L->ci; + StkId base = ci->u.l.base; + Instruction inst = *(ci->u.l.savedpc - 1); /* interrupted instruction */ + OpCode op = GET_OPCODE(inst); + switch (op) { /* finish its execution */ + case OP_ADD: case OP_SUB: case OP_MUL: case OP_DIV: + case OP_MOD: case OP_POW: case OP_UNM: case OP_LEN: + case OP_GETTABUP: case OP_GETTABLE: case OP_SELF: { + setobjs2s(L, base + GETARG_A(inst), --L->top); + break; + } + case OP_LE: case OP_LT: case OP_EQ: { + int res = !l_isfalse(L->top - 1); + L->top--; + /* metamethod should not be called when operand is K */ + lua_assert(!ISK(GETARG_B(inst))); + if (op == OP_LE && /* "<=" using "<" instead? */ + ttisnil(luaT_gettmbyobj(L, base + GETARG_B(inst), TM_LE))) + res = !res; /* invert result */ + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_JMP); + if (res != GETARG_A(inst)) /* condition failed? 
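(the metamethod's boolean result does not match argument A)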
*/ + ci->u.l.savedpc++; /* skip jump instruction */ + break; + } + case OP_CONCAT: { + StkId top = L->top - 1; /* top when 'call_binTM' was called */ + int b = GETARG_B(inst); /* first element to concatenate */ + int total = cast_int(top - 1 - (base + b)); /* yet to concatenate */ + setobj2s(L, top - 2, top); /* put TM result in proper position */ + if (total > 1) { /* are there elements to concat? */ + L->top = top - 1; /* top is one after last element (at top-2) */ + luaV_concat(L, total); /* concat them (may yield again) */ + } + /* move final result to final position */ + setobj2s(L, ci->u.l.base + GETARG_A(inst), L->top - 1); + L->top = ci->top; /* restore top */ + break; + } + case OP_TFORCALL: { + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_TFORLOOP); + L->top = ci->top; /* correct top */ + break; + } + case OP_CALL: { + if (GETARG_C(inst) - 1 >= 0) /* nresults >= 0? */ + L->top = ci->top; /* adjust results */ + break; + } + case OP_TAILCALL: case OP_SETTABUP: case OP_SETTABLE: + break; + default: lua_assert(0); + } +} + + + +/* +** some macros for common tasks in `luaV_execute' +*/ + +#if !defined luai_runtimecheck +#define luai_runtimecheck(L, c) /* void */ +#endif + + +#define RA(i) (base+GETARG_A(i)) +/* to be used after possible stack reallocation */ +#define RB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgR, base+GETARG_B(i)) +#define RC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgR, base+GETARG_C(i)) +#define RKB(i) check_exp(getBMode(GET_OPCODE(i)) == OpArgK, \ + ISK(GETARG_B(i)) ? k+INDEXK(GETARG_B(i)) : base+GETARG_B(i)) +#define RKC(i) check_exp(getCMode(GET_OPCODE(i)) == OpArgK, \ + ISK(GETARG_C(i)) ? k+INDEXK(GETARG_C(i)) : base+GETARG_C(i)) +#define KBx(i) \ + (k + (GETARG_Bx(i) != 0 ? GETARG_Bx(i) - 1 : GETARG_Ax(*ci->u.l.savedpc++))) + + +/* execute a jump instruction */ +#define dojump(ci,i,e) \ + { int a = GETARG_A(i); \ + if (a > 0) luaF_close(L, ci->u.l.base + a - 1); \ + ci->u.l.savedpc += GETARG_sBx(i) + e; } + +/* for test instructions, execute the jump instruction that follows it */ +#define donextjump(ci) { i = *ci->u.l.savedpc; dojump(ci, i, 1); } + + +#define Protect(x) { {x;}; base = ci->u.l.base; } + +#define checkGC(L,c) \ + Protect( luaC_condGC(L,{L->top = (c); /* limit of live values */ \ + luaC_step(L); \ + L->top = ci->top;}) /* restore top */ \ + luai_threadyield(L); ) + + +#define arith_op(op,tm) { \ + TValue *rb = RKB(i); \ + TValue *rc = RKC(i); \ + if (ttisnumber(rb) && ttisnumber(rc)) { \ + lua_Number nb = nvalue(rb), nc = nvalue(rc); \ + setnvalue(ra, op(L, nb, nc)); \ + } \ + else { Protect(luaV_arith(L, ra, rb, rc, tm)); } } + + +#define vmdispatch(o) switch(o) +#define vmcase(l,b) case l: {b} break; +#define vmcasenb(l,b) case l: {b} /* nb = no break */ + +void luaV_execute (lua_State *L) { + CallInfo *ci = L->ci; + LClosure *cl; + TValue *k; + StkId base; + newframe: /* reentry point when frame changes (call/return) */ + lua_assert(ci == L->ci); + cl = clLvalue(ci->func); + k = cl->p->k; + base = ci->u.l.base; + /* main loop of interpreter */ + for (;;) { + Instruction i = *(ci->u.l.savedpc++); + StkId ra; + if ((L->hookmask & (LUA_MASKLINE | LUA_MASKCOUNT)) && + (--L->hookcount == 0 || L->hookmask & LUA_MASKLINE)) { + Protect(traceexec(L)); + } + /* WARNING: several calls may realloc the stack and invalidate `ra' */ + ra = RA(i); + lua_assert(base == ci->u.l.base); + lua_assert(base <= L->top && L->top < L->stack + L->stacksize); + vmdispatch (GET_OPCODE(i)) { + vmcase(OP_MOVE, + setobjs2s(L, ra, RB(i)); + ) + vmcase(OP_LOADK, + 
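/* R(A) := K(Bx): copy constant Bx from the prototype's constant table 'k' into register A */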
TValue *rb = k + GETARG_Bx(i); + setobj2s(L, ra, rb); + ) + vmcase(OP_LOADKX, + TValue *rb; + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); + rb = k + GETARG_Ax(*ci->u.l.savedpc++); + setobj2s(L, ra, rb); + ) + vmcase(OP_LOADBOOL, + setbvalue(ra, GETARG_B(i)); + if (GETARG_C(i)) ci->u.l.savedpc++; /* skip next instruction (if C) */ + ) + vmcase(OP_LOADNIL, + int b = GETARG_B(i); + do { + setnilvalue(ra++); + } while (b--); + ) + vmcase(OP_GETUPVAL, + int b = GETARG_B(i); + setobj2s(L, ra, cl->upvals[b]->v); + ) + vmcase(OP_GETTABUP, + int b = GETARG_B(i); + Protect(luaV_gettable(L, cl->upvals[b]->v, RKC(i), ra)); + ) + vmcase(OP_GETTABLE, + Protect(luaV_gettable(L, RB(i), RKC(i), ra)); + ) + vmcase(OP_SETTABUP, + int a = GETARG_A(i); + Protect(luaV_settable(L, cl->upvals[a]->v, RKB(i), RKC(i))); + ) + vmcase(OP_SETUPVAL, + UpVal *uv = cl->upvals[GETARG_B(i)]; + setobj(L, uv->v, ra); + luaC_barrier(L, uv, ra); + ) + vmcase(OP_SETTABLE, + Protect(luaV_settable(L, ra, RKB(i), RKC(i))); + ) + vmcase(OP_NEWTABLE, + int b = GETARG_B(i); + int c = GETARG_C(i); + Table *t = luaH_new(L); + sethvalue(L, ra, t); + if (b != 0 || c != 0) + luaH_resize(L, t, luaO_fb2int(b), luaO_fb2int(c)); + checkGC(L, ra + 1); + ) + vmcase(OP_SELF, + StkId rb = RB(i); + setobjs2s(L, ra+1, rb); + Protect(luaV_gettable(L, rb, RKC(i), ra)); + ) + vmcase(OP_ADD, + arith_op(luai_numadd, TM_ADD); + ) + vmcase(OP_SUB, + arith_op(luai_numsub, TM_SUB); + ) + vmcase(OP_MUL, + arith_op(luai_nummul, TM_MUL); + ) + /* + * Patched: use luaV_* instead of luai_* to handle div/mod by 0 + */ + vmcase(OP_DIV, + arith_op(luaV_div, TM_DIV); + ) + vmcase(OP_MOD, + arith_op(luaV_mod, TM_MOD); + ) + vmcase(OP_POW, + arith_op(luai_numpow, TM_POW); + ) + vmcase(OP_UNM, + TValue *rb = RB(i); + if (ttisnumber(rb)) { + lua_Number nb = nvalue(rb); + setnvalue(ra, luai_numunm(L, nb)); + } + else { + Protect(luaV_arith(L, ra, rb, rb, TM_UNM)); + } + ) + vmcase(OP_NOT, + TValue *rb = RB(i); + int res = l_isfalse(rb); /* next assignment may change this value */ + setbvalue(ra, res); + ) + vmcase(OP_LEN, + Protect(luaV_objlen(L, ra, RB(i))); + ) + vmcase(OP_CONCAT, + int b = GETARG_B(i); + int c = GETARG_C(i); + StkId rb; + L->top = base + c + 1; /* mark the end of concat operands */ + Protect(luaV_concat(L, c - b + 1)); + ra = RA(i); /* 'luav_concat' may invoke TMs and move the stack */ + rb = b + base; + setobjs2s(L, ra, rb); + checkGC(L, (ra >= rb ? ra + 1 : rb)); + L->top = ci->top; /* restore top */ + ) + vmcase(OP_JMP, + dojump(ci, i, 0); + ) + vmcase(OP_EQ, + TValue *rb = RKB(i); + TValue *rc = RKC(i); + Protect( + if (cast_int(equalobj(L, rb, rc)) != GETARG_A(i)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + ) + vmcase(OP_LT, + Protect( + if (luaV_lessthan(L, RKB(i), RKC(i)) != GETARG_A(i)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + ) + vmcase(OP_LE, + Protect( + if (luaV_lessequal(L, RKB(i), RKC(i)) != GETARG_A(i)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + ) + vmcase(OP_TEST, + if (GETARG_C(i) ? l_isfalse(ra) : !l_isfalse(ra)) + ci->u.l.savedpc++; + else + donextjump(ci); + ) + vmcase(OP_TESTSET, + TValue *rb = RB(i); + if (GETARG_C(i) ? l_isfalse(rb) : !l_isfalse(rb)) + ci->u.l.savedpc++; + else { + setobjs2s(L, ra, rb); + donextjump(ci); + } + ) + vmcase(OP_CALL, + int b = GETARG_B(i); + int nresults = GETARG_C(i) - 1; + if (b != 0) L->top = ra+b; /* else previous instruction set top */ + if (luaD_precall(L, ra, nresults)) { /* C function? 
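(if so, luaD_precall already ran it to completion)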
*/ + if (nresults >= 0) L->top = ci->top; /* adjust results */ + base = ci->u.l.base; + } + else { /* Lua function */ + ci = L->ci; + ci->callstatus |= CIST_REENTRY; + goto newframe; /* restart luaV_execute over new Lua function */ + } + ) + vmcase(OP_TAILCALL, + int b = GETARG_B(i); + if (b != 0) L->top = ra+b; /* else previous instruction set top */ + lua_assert(GETARG_C(i) - 1 == LUA_MULTRET); + if (luaD_precall(L, ra, LUA_MULTRET)) /* C function? */ + base = ci->u.l.base; + else { + /* tail call: put called frame (n) in place of caller one (o) */ + CallInfo *nci = L->ci; /* called frame */ + CallInfo *oci = nci->previous; /* caller frame */ + StkId nfunc = nci->func; /* called function */ + StkId ofunc = oci->func; /* caller function */ + /* last stack slot filled by 'precall' */ + StkId lim = nci->u.l.base + getproto(nfunc)->numparams; + int aux; + /* close all upvalues from previous call */ + if (cl->p->sizep > 0) luaF_close(L, oci->u.l.base); + /* move new frame into old one */ + for (aux = 0; nfunc + aux < lim; aux++) + setobjs2s(L, ofunc + aux, nfunc + aux); + oci->u.l.base = ofunc + (nci->u.l.base - nfunc); /* correct base */ + oci->top = L->top = ofunc + (L->top - nfunc); /* correct top */ + oci->u.l.savedpc = nci->u.l.savedpc; + oci->callstatus |= CIST_TAIL; /* function was tail called */ + ci = L->ci = oci; /* remove new frame */ + lua_assert(L->top == oci->u.l.base + getproto(ofunc)->maxstacksize); + goto newframe; /* restart luaV_execute over new Lua function */ + } + ) + vmcasenb(OP_RETURN, + int b = GETARG_B(i); + if (b != 0) L->top = ra+b-1; + if (cl->p->sizep > 0) luaF_close(L, base); + b = luaD_poscall(L, ra); + if (!(ci->callstatus & CIST_REENTRY)) /* 'ci' still the called one */ + return; /* external invocation: return */ + else { /* invocation via reentry: continue execution */ + ci = L->ci; + if (b) L->top = ci->top; + lua_assert(isLua(ci)); + lua_assert(GET_OPCODE(*((ci)->u.l.savedpc - 1)) == OP_CALL); + goto newframe; /* restart luaV_execute over new Lua function */ + } + ) + vmcase(OP_FORLOOP, + lua_Number step = nvalue(ra+2); + lua_Number idx = luai_numadd(L, nvalue(ra), step); /* increment index */ + lua_Number limit = nvalue(ra+1); + if (luai_numlt(L, 0, step) ? luai_numle(L, idx, limit) + : luai_numle(L, limit, idx)) { + ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ + setnvalue(ra, idx); /* update internal index... */ + setnvalue(ra+3, idx); /* ...and external index */ + } + ) + vmcase(OP_FORPREP, + const TValue *init = ra; + const TValue *plimit = ra+1; + const TValue *pstep = ra+2; + if (!tonumber(init, ra)) + luaG_runerror(L, LUA_QL("for") " initial value must be a number"); + else if (!tonumber(plimit, ra+1)) + luaG_runerror(L, LUA_QL("for") " limit must be a number"); + else if (!tonumber(pstep, ra+2)) + luaG_runerror(L, LUA_QL("for") " step must be a number"); + setnvalue(ra, luai_numsub(L, nvalue(ra), nvalue(pstep))); + ci->u.l.savedpc += GETARG_sBx(i); + ) + vmcasenb(OP_TFORCALL, + StkId cb = ra + 3; /* call base */ + setobjs2s(L, cb+2, ra+2); + setobjs2s(L, cb+1, ra+1); + setobjs2s(L, cb, ra); + L->top = cb + 3; /* func. + 2 args (state and index) */ + Protect(luaD_call(L, cb, GETARG_C(i), 1)); + L->top = ci->top; + i = *(ci->u.l.savedpc++); /* go to next instruction */ + ra = RA(i); + lua_assert(GET_OPCODE(i) == OP_TFORLOOP); + goto l_tforloop; + ) + vmcase(OP_TFORLOOP, + l_tforloop: + if (!ttisnil(ra + 1)) { /* continue loop? 
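(the iterator returned a non-nil first value)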
*/ + setobjs2s(L, ra, ra + 1); /* save control variable */ + ci->u.l.savedpc += GETARG_sBx(i); /* jump back */ + } + ) + vmcase(OP_SETLIST, + int n = GETARG_B(i); + int c = GETARG_C(i); + int last; + Table *h; + if (n == 0) n = cast_int(L->top - ra) - 1; + if (c == 0) { + lua_assert(GET_OPCODE(*ci->u.l.savedpc) == OP_EXTRAARG); + c = GETARG_Ax(*ci->u.l.savedpc++); + } + luai_runtimecheck(L, ttistable(ra)); + h = hvalue(ra); + last = ((c-1)*LFIELDS_PER_FLUSH) + n; + if (last > h->sizearray) /* needs more space? */ + luaH_resizearray(L, h, last); /* pre-allocate it at once */ + for (; n > 0; n--) { + TValue *val = ra+n; + luaH_setint(L, h, last--, val); + luaC_barrierback(L, obj2gco(h), val); + } + L->top = ci->top; /* correct top (in case of previous open call) */ + ) + vmcase(OP_CLOSURE, + Proto *p = cl->p->p[GETARG_Bx(i)]; + Closure *ncl = getcached(p, cl->upvals, base); /* cached closure */ + if (ncl == NULL) /* no match? */ + pushclosure(L, p, cl->upvals, base, ra); /* create a new one */ + else + setclLvalue(L, ra, ncl); /* push cashed closure */ + checkGC(L, ra + 1); + ) + vmcase(OP_VARARG, + int b = GETARG_B(i) - 1; + int j; + int n = cast_int(base - ci->func) - cl->p->numparams - 1; + if (b < 0) { /* B == 0? */ + b = n; /* get all var. arguments */ + Protect(luaD_checkstack(L, n)); + ra = RA(i); /* previous call may change the stack */ + L->top = ra + n; + } + for (j = 0; j < b; j++) { + if (j < n) { + setobjs2s(L, ra + j, base - n + j); + } + else { + setnilvalue(ra + j); + } + } + ) + vmcase(OP_EXTRAARG, + lua_assert(0); + ) + } + } +} + +/* END CSTYLED */ diff --git a/module/lua/lvm.h b/module/lua/lvm.h new file mode 100644 index 000000000000..2d2be9836f69 --- /dev/null +++ b/module/lua/lvm.h @@ -0,0 +1,46 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lvm.h,v 2.18.1.1 2013/04/12 18:48:47 roberto Exp $ +** Lua virtual machine +** See Copyright Notice in lua.h +*/ + +#ifndef lvm_h +#define lvm_h + + +#include "ldo.h" +#include "lobject.h" +#include "ltm.h" + + +#define tostring(L,o) (ttisstring(o) || (luaV_tostring(L, o))) + +#define tonumber(o,n) (ttisnumber(o) || (((o) = luaV_tonumber(o,n)) != NULL)) + +#define equalobj(L,o1,o2) (ttisequal(o1, o2) && luaV_equalobj_(L, o1, o2)) + +#define luaV_rawequalobj(o1,o2) equalobj(NULL,o1,o2) + + +/* not to called directly */ +LUAI_FUNC int luaV_equalobj_ (lua_State *L, const TValue *t1, const TValue *t2); + + +LUAI_FUNC int luaV_lessthan (lua_State *L, const TValue *l, const TValue *r); +LUAI_FUNC int luaV_lessequal (lua_State *L, const TValue *l, const TValue *r); +LUAI_FUNC const TValue *luaV_tonumber (const TValue *obj, TValue *n); +LUAI_FUNC int luaV_tostring (lua_State *L, StkId obj); +LUAI_FUNC void luaV_gettable (lua_State *L, const TValue *t, TValue *key, + StkId val); +LUAI_FUNC void luaV_settable (lua_State *L, const TValue *t, TValue *key, + StkId val); +LUAI_FUNC void luaV_finishOp (lua_State *L); +LUAI_FUNC void luaV_execute (lua_State *L); +LUAI_FUNC void luaV_concat (lua_State *L, int total); +LUAI_FUNC void luaV_arith (lua_State *L, StkId ra, const TValue *rb, + const TValue *rc, TMS op); +LUAI_FUNC void luaV_objlen (lua_State *L, StkId ra, const TValue *rb); + +#endif +/* END CSTYLED */ diff --git a/module/lua/lzio.c b/module/lua/lzio.c new file mode 100644 index 000000000000..bfbb41cf8ed3 --- /dev/null +++ b/module/lua/lzio.c @@ -0,0 +1,74 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lzio.c,v 1.35.1.1 2013/04/12 18:48:47 roberto Exp $ +** Buffered streams +** See Copyright Notice in lua.h +*/ + + +#define lzio_c +#define LUA_CORE + 
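+/*
+** A ZIO is a small buffer layered over a lua_Reader callback. To sketch
+** the protocol (an illustration only, not part of the imported sources):
+** a reader returns a pointer to its next block of input and its size, or
+** NULL at end of input, e.g.
+**
+**	typedef struct { const char *s; size_t len; } StrSrc;
+**
+**	static const char *str_reader (lua_State *L, void *ud, size_t *size) {
+**		StrSrc *src = ud;
+**		if (src->len == 0) return NULL;	/* no more input */
+**		*size = src->len;
+**		src->len = 0;			/* hand everything out once */
+**		return src->s;
+**	}
+**
+** luaZ_init() records the callback, and luaZ_fill()/zgetc() then pull
+** bytes from whatever block the reader last returned.
+*/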
+#include + +#include "llimits.h" +#include "lmem.h" +#include "lstate.h" +#include "lzio.h" + + +int luaZ_fill (ZIO *z) { + size_t size; + lua_State *L = z->L; + const char *buff; + lua_unlock(L); + buff = z->reader(L, z->data, &size); + lua_lock(L); + if (buff == NULL || size == 0) + return EOZ; + z->n = size - 1; /* discount char being returned */ + z->p = buff; + return cast_uchar(*(z->p++)); +} + + +void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, void *data) { + z->L = L; + z->reader = reader; + z->data = data; + z->n = 0; + z->p = NULL; +} + + +/* --------------------------------------------------------------- read --- */ +size_t luaZ_read (ZIO *z, void *b, size_t n) { + while (n) { + size_t m; + if (z->n == 0) { /* no bytes in buffer? */ + if (luaZ_fill(z) == EOZ) /* try to read more */ + return n; /* no more input; return number of missing bytes */ + else { + z->n++; /* luaZ_fill consumed first byte; put it back */ + z->p--; + } + } + m = (n <= z->n) ? n : z->n; /* min. between n and z->n */ + memcpy(b, z->p, m); + z->n -= m; + z->p += m; + b = (char *)b + m; + n -= m; + } + return 0; +} + +/* ------------------------------------------------------------------------ */ +char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n) { + if (n > buff->buffsize) { + if (n < LUA_MINBUFFER) n = LUA_MINBUFFER; + luaZ_resizebuffer(L, buff, n); + } + return buff->buffer; +} +/* END CSTYLED */ diff --git a/module/lua/lzio.h b/module/lua/lzio.h new file mode 100644 index 000000000000..27908759d509 --- /dev/null +++ b/module/lua/lzio.h @@ -0,0 +1,67 @@ +/* BEGIN CSTYLED */ +/* +** $Id: lzio.h,v 1.26.1.1 2013/04/12 18:48:47 roberto Exp $ +** Buffered streams +** See Copyright Notice in lua.h +*/ + + +#ifndef lzio_h +#define lzio_h + +#include + +#include "lmem.h" + + +#define EOZ (-1) /* end of stream */ + +typedef struct Zio ZIO; + +#define zgetc(z) (((z)->n--)>0 ? 
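/* byte still buffered? consume it, else refill via luaZ_fill */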
cast_uchar(*(z)->p++) : luaZ_fill(z)) + + +typedef struct Mbuffer { + char *buffer; + size_t n; + size_t buffsize; +} Mbuffer; + +#define luaZ_initbuffer(L, buff) ((buff)->buffer = NULL, (buff)->buffsize = 0) + +#define luaZ_buffer(buff) ((buff)->buffer) +#define luaZ_sizebuffer(buff) ((buff)->buffsize) +#define luaZ_bufflen(buff) ((buff)->n) + +#define luaZ_resetbuffer(buff) ((buff)->n = 0) + + +#define luaZ_resizebuffer(L, buff, size) \ + (luaM_reallocvector(L, (buff)->buffer, (buff)->buffsize, size, char), \ + (buff)->buffsize = size) + +#define luaZ_freebuffer(L, buff) luaZ_resizebuffer(L, buff, 0) + + +LUAI_FUNC char *luaZ_openspace (lua_State *L, Mbuffer *buff, size_t n); +LUAI_FUNC void luaZ_init (lua_State *L, ZIO *z, lua_Reader reader, + void *data); +LUAI_FUNC size_t luaZ_read (ZIO* z, void* b, size_t n); /* read next n bytes */ + + + +/* --------- Private Part ------------------ */ + +struct Zio { + size_t n; /* bytes still unread */ + const char *p; /* current position in buffer */ + lua_Reader reader; /* reader function */ + void* data; /* additional data */ + lua_State *L; /* Lua state (for reader) */ +}; + + +LUAI_FUNC int luaZ_fill (ZIO *z); + +#endif +/* END CSTYLED */ diff --git a/module/lua/setjmp/setjmp.S b/module/lua/setjmp/setjmp.S new file mode 100644 index 000000000000..1f461a0a4ef3 --- /dev/null +++ b/module/lua/setjmp/setjmp.S @@ -0,0 +1,19 @@ +#if defined(__x86_64__) +#include "setjmp_x86_64.S" +#elif defined(__i386__) +#include "setjmp_i386.S" +#elif defined(__aarch64__) +#include "setjmp_aarch64.S" +#elif defined(__arm__) +#include "setjmp_arm.S" +#elif defined(__sparc__) && defined(__arch64__) +#include "setjmp_sparc64.S" +#elif defined(__powerpc__) +#include "setjmp_ppc.S" +#elif defined(__mips__) +#include "setjmp_mips.S" +#elif defined(__s390x__) +#include "setjmp_s390x.S" +#elif defined(__riscv) +#include "setjmp_rv64g.S" +#endif diff --git a/module/lua/setjmp/setjmp_aarch64.S b/module/lua/setjmp/setjmp_aarch64.S new file mode 100644 index 000000000000..a5a9a85fd57e --- /dev/null +++ b/module/lua/setjmp/setjmp_aarch64.S @@ -0,0 +1,86 @@ +/*- + * Copyright (c) 2014 Andrew Turner + * Copyright (c) 2014-2015 The FreeBSD Foundation + * All rights reserved. + * + * Portions of this software were developed by Andrew Turner + * under sponsorship from the FreeBSD Foundation + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + + +#ifdef __aarch64__ + +#define ENTRY(sym) \ + .text; \ + .globl sym; \ + .align 2; \ + .type sym,#function; \ +sym: + +#define END(sym) \ + .size sym, . - sym + + +ENTRY(setjmp) + /* Store the stack pointer */ + mov x8, sp + str x8, [x0], #8 + + /* Store the general purpose registers and lr */ + stp x19, x20, [x0], #16 + stp x21, x22, [x0], #16 + stp x23, x24, [x0], #16 + stp x25, x26, [x0], #16 + stp x27, x28, [x0], #16 + stp x29, x30, [x0], #16 + + /* Return value */ + mov x0, #0 + ret +END(setjmp) + +ENTRY(longjmp) + /* Restore the stack pointer */ + ldr x8, [x0], #8 + mov sp, x8 + + /* Restore the general purpose registers and lr */ + ldp x19, x20, [x0], #16 + ldp x21, x22, [x0], #16 + ldp x23, x24, [x0], #16 + ldp x25, x26, [x0], #16 + ldp x27, x28, [x0], #16 + ldp x29, x30, [x0], #16 + + /* Load the return value */ + mov x0, x1 + ret +END(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* __aarch64__ */ diff --git a/module/lua/setjmp/setjmp_arm.S b/module/lua/setjmp/setjmp_arm.S new file mode 100644 index 000000000000..78bc3e0b347d --- /dev/null +++ b/module/lua/setjmp/setjmp_arm.S @@ -0,0 +1,84 @@ +/*- + * Copyright 2004-2014 Olivier Houchard + * Copyright 2012-2014 Ian Lepore + * Copyright 2013-2014 Andrew Turner + * Copyright 2014 Svatopluk Kraus + * Copyright 2014 Michal Meloun + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + + +#if defined(__arm__) && !defined(__aarch64__) + +#if defined(__thumb2__) +#define _FUNC_MODE .code 16; .thumb_func +#else +#define _FUNC_MODE .code 32 +#endif + +#define ENTRY(x) \ + .text; \ + .syntax unified; \ + .align 2; \ + .global x; \ + .type x,#function; \ + _FUNC_MODE; \ +x: + +#define END(x) \ + .size x, . 
- x; + +#define RET bx lr + + +/* + * setjump + longjmp + */ +ENTRY(setjmp) +#if defined(__thumb2__) + mov ip, sp + stmia r0, {r4-r12,r14} +#else + stmia r0, {r4-r14} +#endif + mov r0, #0x00000000 + RET +END(setjmp) + +ENTRY(longjmp) +#if defined(__thumb2__) + ldmia r0, {r4-r12,r14} + mov sp, ip +#else + ldmia r0, {r4-r14} +#endif + mov r0, #0x00000001 + RET +END(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +#endif diff --git a/module/lua/setjmp/setjmp_i386.S b/module/lua/setjmp/setjmp_i386.S new file mode 100644 index 000000000000..6d6a5f332688 --- /dev/null +++ b/module/lua/setjmp/setjmp_i386.S @@ -0,0 +1,69 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#define ENTRY(x) \ + .text; \ + .align 8; \ + .globl x; \ + .type x, @function; \ +x: + +#define SET_SIZE(x) \ + .size x, [.-x] + +/* + * Setjmp and longjmp implement non-local gotos using state vectors + * type label_t. + */ +#ifdef __i386__ + + ENTRY(setjmp) /* save area is passed in eax */ + movl %ebp, 0(%eax) /* save ebp */ + movl %ebx, 4(%eax) /* save ebx */ + movl %esi, 8(%eax) /* save esi */ + movl %edi, 12(%eax) /* save edi */ + movl %esp, 16(%eax) /* save esp */ + movl (%esp), %ecx /* %eip (return address) */ + movl %ecx, 20(%eax) /* save eip */ + subl %eax, %eax /* return 0 */ + ret + SET_SIZE(setjmp) + + ENTRY(longjmp) /* save area is passed in eax */ + movl 0(%eax), %ebp /* restore ebp */ + movl 4(%eax), %ebx /* restore ebx */ + movl 8(%eax), %esi /* restore esi */ + movl 12(%eax), %edi /* restore edi */ + movl 16(%eax), %esp /* restore esp */ + movl 20(%eax), %ecx /* %eip (return address) */ + addl $4, %esp /* pop ret adr */ + jmp *%ecx /* indirect jump */ + SET_SIZE(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* __i386__ */ diff --git a/module/lua/setjmp/setjmp_mips.S b/module/lua/setjmp/setjmp_mips.S new file mode 100644 index 000000000000..0084fbfa4bec --- /dev/null +++ b/module/lua/setjmp/setjmp_mips.S @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2000, 2001, 2002, 2003, 2004, 2005, 2008, 2009 + * The President and Fellows of Harvard College. + * Copyright (c) 2017 MIPS Technologies, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE UNIVERSITY OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +#include + +/* + * setjmp and longjmp for MIPS. + */ + + .text + .set noreorder + + /* + * int setjmp(jmp_buf jb); + * + * Save the current state so we can return again from the call later + * if/when longjmp is called. (If the function that called setjmp + * returns before longjmp is called, the results are undefined. We + * only need to save registers, not the whole contents of the stack.) + */ +LEAF(setjmp) + /* + * jmp_buf is in a0. We need to save s0-s8, sp, gp, and ra in it. + * Don't store more registers without adjusting machine/setjmp.h. + */ + + REG_S sp, 0(a0) /* save registers */ + REG_S ra, 1*SZREG(a0) + REG_S gp, 2*SZREG(a0) + REG_S s0, 3*SZREG(a0) + REG_S s1, 4*SZREG(a0) + REG_S s2, 5*SZREG(a0) + REG_S s3, 6*SZREG(a0) + REG_S s4, 7*SZREG(a0) + REG_S s5, 8*SZREG(a0) + REG_S s6, 9*SZREG(a0) + REG_S s7, 10*SZREG(a0) + REG_S s8, 11*SZREG(a0) + + jr ra /* done */ + move v0, zero /* return 0 (in delay slot) */ +END(setjmp) + + + /* + * void longjmp(jmp_buf jb, int code); + */ +LEAF(longjmp) + /* + * jmp_buf is in a0. Return code is in a1. + * We need to restore s0-s8, sp, gp, and ra from the jmp_buf. + * The return code is forced to 1 if 0 is passed in. + */ + + sltiu t0, a1, 1 /* set t0 to 1 if return code is 0... 
otherwise 0 */ + addu a1, a1, t0 /* update the return code */ + + REG_L sp, 0(a0) /* restore registers */ + REG_L ra, 1*SZREG(a0) + REG_L gp, 2*SZREG(a0) + REG_L s0, 3*SZREG(a0) + REG_L s1, 4*SZREG(a0) + REG_L s2, 5*SZREG(a0) + REG_L s3, 6*SZREG(a0) + REG_L s4, 7*SZREG(a0) + REG_L s5, 8*SZREG(a0) + REG_L s6, 9*SZREG(a0) + REG_L s7, 10*SZREG(a0) + REG_L s8, 11*SZREG(a0) + + jr ra /* return, to where setjmp was called from */ + move v0, a1 /* set return value */ +END(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/lua/setjmp/setjmp_ppc.S b/module/lua/setjmp/setjmp_ppc.S new file mode 100644 index 000000000000..f787ef349136 --- /dev/null +++ b/module/lua/setjmp/setjmp_ppc.S @@ -0,0 +1,165 @@ +/* $FreeBSD$ */ +/* from: NetBSD: setjmp.S,v 1.1 1998/01/27 15:13:12 sakamoto Exp $ */ +/* from: OpenBSD: setjmp.S,v 1.2 1996/12/28 06:22:18 rahnds Exp */ +/* kernel version of this file, does not have signal goop */ +/* int setjmp(jmp_buf env) */ + +#define _ASM +#include + +#ifdef __powerpc64__ +#if !defined(PPC64_ELF_ABI_v2) && !defined(PPC64_ELF_ABI_v1) +#if defined(_CALL_ELF) && _CALL_ELF == 2 +#define PPC64_ELF_ABI_v2 +#endif /* _CALL_ELF */ +#endif /* PPC64_ELF_ABI_ */ +#endif /* __powerpc64__ */ + +#ifdef __powerpc64__ +#define LD_REG ld +#define ST_REG std +#define REGWIDTH 8 +#else +#define LD_REG lwz +#define ST_REG stw +#define REGWIDTH 4 +#endif /* __powerpc64__ */ + +#define JMP_r1 1*REGWIDTH +#define JMP_r2 2*REGWIDTH +#define JMP_r14 3*REGWIDTH +#define JMP_r15 4*REGWIDTH +#define JMP_r16 5*REGWIDTH +#define JMP_r17 6*REGWIDTH +#define JMP_r18 7*REGWIDTH +#define JMP_r19 8*REGWIDTH +#define JMP_r20 9*REGWIDTH +#define JMP_r21 10*REGWIDTH +#define JMP_r22 11*REGWIDTH +#define JMP_r23 12*REGWIDTH +#define JMP_r24 13*REGWIDTH +#define JMP_r25 14*REGWIDTH +#define JMP_r26 15*REGWIDTH +#define JMP_r27 16*REGWIDTH +#define JMP_r28 17*REGWIDTH +#define JMP_r29 18*REGWIDTH +#define JMP_r30 19*REGWIDTH +#define JMP_r31 20*REGWIDTH +#define JMP_lr 21*REGWIDTH +#define JMP_cr 22*REGWIDTH +#define JMP_ctr 23*REGWIDTH +#define JMP_xer 24*REGWIDTH + +#ifdef __powerpc64__ +#ifdef PPC64_ELF_ABI_v2 + +#define ENTRY(name) \ + .align 2 ; \ + .type name,@function; \ + .globl name; \ +name: + +#else /* PPC64_ELF_ABI_v1 */ + +#define XGLUE(a,b) a##b +#define GLUE(a,b) XGLUE(a,b) +#define ENTRY(name) \ + .align 2 ; \ + .globl name; \ + .globl GLUE(.,name); \ + .pushsection ".opd","aw"; \ +name: \ + .quad GLUE(.,name); \ + .quad .TOC.@tocbase; \ + .quad 0; \ + .popsection; \ + .type GLUE(.,name),@function; \ +GLUE(.,name): + +#endif /* PPC64_ELF_ABI_v2 */ + +#else /* 32-bit */ + +#define ENTRY(name) \ + .text; \ + .p2align 4; \ + .globl name; \ + .type name,@function; \ +name: + +#endif /* __powerpc64__ */ + + +ENTRY(setjmp) + ST_REG 31, JMP_r31(3) + /* r1, r2, r14-r30 */ + ST_REG 1, JMP_r1 (3) + ST_REG 2, JMP_r2 (3) + ST_REG 14, JMP_r14(3) + ST_REG 15, JMP_r15(3) + ST_REG 16, JMP_r16(3) + ST_REG 17, JMP_r17(3) + ST_REG 18, JMP_r18(3) + ST_REG 19, JMP_r19(3) + ST_REG 20, JMP_r20(3) + ST_REG 21, JMP_r21(3) + ST_REG 22, JMP_r22(3) + ST_REG 23, JMP_r23(3) + ST_REG 24, JMP_r24(3) + ST_REG 25, JMP_r25(3) + ST_REG 26, JMP_r26(3) + ST_REG 27, JMP_r27(3) + ST_REG 28, JMP_r28(3) + ST_REG 29, JMP_r29(3) + ST_REG 30, JMP_r30(3) + /* cr, lr, ctr, xer */ + mfcr 0 + ST_REG 0, JMP_cr(3) + mflr 0 + ST_REG 0, JMP_lr(3) + mfctr 0 + ST_REG 0, JMP_ctr(3) + mfxer 0 + ST_REG 0, JMP_xer(3) + /* f14-f31, fpscr */ + li 3, 0 + blr + +ENTRY(longjmp) + LD_REG 31, JMP_r31(3) + /* r1, r2, r14-r30 
*/ + LD_REG 1, JMP_r1 (3) + LD_REG 2, JMP_r2 (3) + LD_REG 14, JMP_r14(3) + LD_REG 15, JMP_r15(3) + LD_REG 16, JMP_r16(3) + LD_REG 17, JMP_r17(3) + LD_REG 18, JMP_r18(3) + LD_REG 19, JMP_r19(3) + LD_REG 20, JMP_r20(3) + LD_REG 21, JMP_r21(3) + LD_REG 22, JMP_r22(3) + LD_REG 23, JMP_r23(3) + LD_REG 24, JMP_r24(3) + LD_REG 25, JMP_r25(3) + LD_REG 26, JMP_r26(3) + LD_REG 27, JMP_r27(3) + LD_REG 28, JMP_r28(3) + LD_REG 29, JMP_r29(3) + LD_REG 30, JMP_r30(3) + /* cr, lr, ctr, xer */ + LD_REG 0, JMP_cr(3) + mtcr 0 + LD_REG 0, JMP_lr(3) + mtlr 0 + LD_REG 0, JMP_ctr(3) + mtctr 0 + LD_REG 0, JMP_xer(3) + mtxer 0 + /* f14-f31, fpscr */ + mr 3, 4 + blr + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/lua/setjmp/setjmp_rv64g.S b/module/lua/setjmp/setjmp_rv64g.S new file mode 100644 index 000000000000..7f6c50d25a4c --- /dev/null +++ b/module/lua/setjmp/setjmp_rv64g.S @@ -0,0 +1,91 @@ +/*- + * Copyright (c) 2015-2016 Ruslan Bukin + * All rights reserved. + * + * Portions of this software were developed by SRI International and the + * University of Cambridge Computer Laboratory under DARPA/AFRL contract + * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme. + * + * Portions of this software were developed by the University of Cambridge + * Computer Laboratory as part of the CTSRD Project, with support from the + * UK Higher Education Innovation Fund (HEIF). + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#define ENTRY(sym) \ + .text; .globl sym; .type sym,@function; sym: +#define END(sym) .size sym, . 
- sym + + +ENTRY(setjmp) + /* Store the stack pointer */ + sd sp, (0 * 8)(a0) + addi a0, a0, (1 * 8) + + /* Store the general purpose registers and ra */ + sd s0, (0 * 8)(a0) + sd s1, (1 * 8)(a0) + sd s2, (2 * 8)(a0) + sd s3, (3 * 8)(a0) + sd s4, (4 * 8)(a0) + sd s5, (5 * 8)(a0) + sd s6, (6 * 8)(a0) + sd s7, (7 * 8)(a0) + sd s8, (8 * 8)(a0) + sd s9, (9 * 8)(a0) + sd s10, (10 * 8)(a0) + sd s11, (11 * 8)(a0) + sd ra, (12 * 8)(a0) + addi a0, a0, (13 * 8) + + /* Return value */ + li a0, 0 + ret +END(setjmp) + +ENTRY(longjmp) + /* Restore the stack pointer */ + ld t0, 0(a0) + mv sp, t0 + addi a0, a0, (1 * 8) + + /* Restore the general purpose registers and ra */ + ld s0, (0 * 8)(a0) + ld s1, (1 * 8)(a0) + ld s2, (2 * 8)(a0) + ld s3, (3 * 8)(a0) + ld s4, (4 * 8)(a0) + ld s5, (5 * 8)(a0) + ld s6, (6 * 8)(a0) + ld s7, (7 * 8)(a0) + ld s8, (8 * 8)(a0) + ld s9, (9 * 8)(a0) + ld s10, (10 * 8)(a0) + ld s11, (11 * 8)(a0) + ld ra, (12 * 8)(a0) + addi a0, a0, (13 * 8) + + /* Load the return value */ + mv a0, a1 + ret +END(longjmp) diff --git a/module/lua/setjmp/setjmp_s390x.S b/module/lua/setjmp/setjmp_s390x.S new file mode 100644 index 000000000000..336c66c08b51 --- /dev/null +++ b/module/lua/setjmp/setjmp_s390x.S @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2005-2014 Rich Felker, et al. + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + * IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + * CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + */ + + .global setjmp + .type setjmp,@function +setjmp: + stmg %r6, %r15, 0(%r2) + + std %f8, 10*8(%r2) + std %f9, 11*8(%r2) + std %f10, 12*8(%r2) + std %f11, 13*8(%r2) + std %f12, 14*8(%r2) + std %f13, 15*8(%r2) + std %f14, 16*8(%r2) + std %f15, 17*8(%r2) + + lghi %r2, 0 + br %r14 + + .global longjmp + .type longjmp,@function +longjmp: + +1: + lmg %r6, %r15, 0(%r2) + + ld %f8, 10*8(%r2) + ld %f9, 11*8(%r2) + ld %f10, 12*8(%r2) + ld %f11, 13*8(%r2) + ld %f12, 14*8(%r2) + ld %f13, 15*8(%r2) + ld %f14, 16*8(%r2) + ld %f15, 17*8(%r2) + + ltgr %r2, %r3 + bnzr %r14 + lhi %r2, 1 + br %r14 + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/lua/setjmp/setjmp_sparc64.S b/module/lua/setjmp/setjmp_sparc64.S new file mode 100644 index 000000000000..a37a71cbce33 --- /dev/null +++ b/module/lua/setjmp/setjmp_sparc64.S @@ -0,0 +1,105 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Header: _setjmp.s,v 1.1 91/07/06 16:45:53 torek Exp + */ + +#if defined(LIBC_SCCS) && !defined(lint) +#if 0 + .asciz "@(#)_setjmp.s 8.1 (Berkeley) 6/4/93" +#else + RCSID("$NetBSD: _setjmp.S,v 1.4 1998/10/08 02:27:59 eeh Exp $") +#endif +#endif /* LIBC_SCCS and not lint */ + +#define _JB_FP 0x0 +#define _JB_PC 0x8 +#define _JB_SP 0x10 + + .register %g2,#ignore + .register %g3,#ignore + +#define ENTRY(x) \ + .text ; \ + .align 32 ; \ + .globl x ; \ + .type x,@function ; \ +x: + +#define END(x) \ + .size x, . - x + +/* + * C library -- setjmp, longjmp + * + * longjmp(a,v) + * will generate a "return(v?v:1)" from + * the last call to + * setjmp(a) + * by restoring the previous context. + */ + +ENTRY(setjmp) + stx %sp, [%o0 + _JB_SP] + stx %o7, [%o0 + _JB_PC] + stx %fp, [%o0 + _JB_FP] + retl + clr %o0 +END(setjmp) + +ENTRY(longjmp) + mov 1, %g1 + movrnz %o1, %o1, %g1 + mov %o0, %g2 + ldx [%g2 + _JB_FP], %g3 +1: cmp %fp, %g3 + bl,a 1b + restore + be,a 2f + ldx [%g2 + _JB_SP], %o0 + +.Lbotch: + illtrap + +2: cmp %o0, %sp + bge,a 3f + mov %o0, %sp + b,a .Lbotch + nop +3: ldx [%g2 + _JB_PC], %o7 + retl + mov %g1, %o0 +END(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif diff --git a/module/lua/setjmp/setjmp_x86_64.S b/module/lua/setjmp/setjmp_x86_64.S new file mode 100644 index 000000000000..a469cbad780e --- /dev/null +++ b/module/lua/setjmp/setjmp_x86_64.S @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 1992, 2010, Oracle and/or its affiliates. All rights reserved. + */ + + +#define ENTRY(x) \ + .text; \ + .align 8; \ + .globl x; \ + .type x, @function; \ +x: + +#define SET_SIZE(x) \ + .size x, [.-x] + + +/* + * Setjmp and longjmp implement non-local gotos using state vectors + * type label_t. + */ +#ifdef __x86_64__ + + ENTRY(setjmp) + movq %rsp, 0(%rdi) + movq %rbp, 8(%rdi) + movq %rbx, 16(%rdi) + movq %r12, 24(%rdi) + movq %r13, 32(%rdi) + movq %r14, 40(%rdi) + movq %r15, 48(%rdi) + movq 0(%rsp), %rdx /* return address */ + movq %rdx, 56(%rdi) /* rip */ + xorl %eax, %eax /* return 0 */ + ret + SET_SIZE(setjmp) + + ENTRY(longjmp) + movq 0(%rdi), %rsp + movq 8(%rdi), %rbp + movq 16(%rdi), %rbx + movq 24(%rdi), %r12 + movq 32(%rdi), %r13 + movq 40(%rdi), %r14 + movq 48(%rdi), %r15 + movq 56(%rdi), %rdx /* return address */ + movq %rdx, 0(%rsp) + xorl %eax, %eax + incl %eax /* return 1 */ + ret + SET_SIZE(longjmp) + +#ifdef __ELF__ +.section .note.GNU-stack,"",%progbits +#endif + +#endif /* __x86_64__ */ diff --git a/module/nvpair/Makefile.in b/module/nvpair/Makefile.in new file mode 100644 index 000000000000..d8145236674b --- /dev/null +++ b/module/nvpair/Makefile.in @@ -0,0 +1,13 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +endif + +MODULE := znvpair + +obj-$(CONFIG_ZFS) := $(MODULE).o + +$(MODULE)-objs += nvpair.o +$(MODULE)-objs += fnvpair.o +$(MODULE)-objs += nvpair_alloc_spl.o +$(MODULE)-objs += nvpair_alloc_fixed.o diff --git a/module/nvpair/fnvpair.c b/module/nvpair/fnvpair.c new file mode 100644 index 000000000000..dc8257e48594 --- /dev/null +++ b/module/nvpair/fnvpair.c @@ -0,0 +1,660 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#ifndef _KERNEL +#include +#endif + +/* + * "Force" nvlist wrapper. + * + * These functions wrap the nvlist_* functions with assertions that assume + * the operation is successful. This allows the caller's code to be much + * more readable, especially for the fnvlist_lookup_* and fnvpair_value_* + * functions, which can return the requested value (rather than filling in + * a pointer). + * + * These functions use NV_UNIQUE_NAME, encoding NV_ENCODE_NATIVE, and allocate + * with KM_SLEEP. + * + * More wrappers should be added as needed -- for example + * nvlist_lookup_*_array and nvpair_value_*_array. 
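+ *
+ * As a quick illustration of the readability win (a sketch; the nvlist
+ * handle and the "guid" name are hypothetical), an error-checked lookup
+ * such as
+ *
+ *	uint64_t val;
+ *	VERIFY0(nvlist_lookup_uint64(nvl, "guid", &val));
+ *
+ * becomes
+ *
+ *	uint64_t val = fnvlist_lookup_uint64(nvl, "guid");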
+ */ + +nvlist_t * +fnvlist_alloc(void) +{ + nvlist_t *nvl; + VERIFY0(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP)); + return (nvl); +} + +void +fnvlist_free(nvlist_t *nvl) +{ + nvlist_free(nvl); +} + +size_t +fnvlist_size(nvlist_t *nvl) +{ + size_t size; + VERIFY0(nvlist_size(nvl, &size, NV_ENCODE_NATIVE)); + return (size); +} + +/* + * Returns allocated buffer of size *sizep. Caller must free the buffer with + * fnvlist_pack_free(). + */ +char * +fnvlist_pack(nvlist_t *nvl, size_t *sizep) +{ + char *packed = 0; + VERIFY3U(nvlist_pack(nvl, &packed, sizep, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + return (packed); +} + +/*ARGSUSED*/ +void +fnvlist_pack_free(char *pack, size_t size) +{ +#ifdef _KERNEL + kmem_free(pack, size); +#else + free(pack); +#endif +} + +nvlist_t * +fnvlist_unpack(char *buf, size_t buflen) +{ + nvlist_t *rv; + VERIFY0(nvlist_unpack(buf, buflen, &rv, KM_SLEEP)); + return (rv); +} + +nvlist_t * +fnvlist_dup(nvlist_t *nvl) +{ + nvlist_t *rv; + VERIFY0(nvlist_dup(nvl, &rv, KM_SLEEP)); + return (rv); +} + +void +fnvlist_merge(nvlist_t *dst, nvlist_t *src) +{ + VERIFY0(nvlist_merge(dst, src, KM_SLEEP)); +} + +size_t +fnvlist_num_pairs(nvlist_t *nvl) +{ + size_t count = 0; + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(nvl, 0); pair != NULL; + pair = nvlist_next_nvpair(nvl, pair)) + count++; + return (count); +} + +void +fnvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + VERIFY0(nvlist_add_boolean(nvl, name)); +} + +void +fnvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + VERIFY0(nvlist_add_boolean_value(nvl, name, val)); +} + +void +fnvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + VERIFY0(nvlist_add_byte(nvl, name, val)); +} + +void +fnvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + VERIFY0(nvlist_add_int8(nvl, name, val)); +} + +void +fnvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + VERIFY0(nvlist_add_uint8(nvl, name, val)); +} + +void +fnvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + VERIFY0(nvlist_add_int16(nvl, name, val)); +} + +void +fnvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + VERIFY0(nvlist_add_uint16(nvl, name, val)); +} + +void +fnvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + VERIFY0(nvlist_add_int32(nvl, name, val)); +} + +void +fnvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + VERIFY0(nvlist_add_uint32(nvl, name, val)); +} + +void +fnvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + VERIFY0(nvlist_add_int64(nvl, name, val)); +} + +void +fnvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + VERIFY0(nvlist_add_uint64(nvl, name, val)); +} + +void +fnvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + VERIFY0(nvlist_add_string(nvl, name, val)); +} + +void +fnvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + VERIFY0(nvlist_add_nvlist(nvl, name, val)); +} + +void +fnvlist_add_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY0(nvlist_add_nvpair(nvl, pair)); +} + +void +fnvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *val, uint_t n) +{ + VERIFY0(nvlist_add_boolean_array(nvl, name, val, n)); +} + +void +fnvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *val, uint_t n) +{ + VERIFY0(nvlist_add_byte_array(nvl, name, val, n)); +} + +void +fnvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int8_array(nvl, name, val, n)); +} + +void 
+fnvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint8_array(nvl, name, val, n)); +} + +void +fnvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int16_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint16_array(nvlist_t *nvl, const char *name, + uint16_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint16_array(nvl, name, val, n)); +} + +void +fnvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int32_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint32_array(nvlist_t *nvl, const char *name, + uint32_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint32_array(nvl, name, val, n)); +} + +void +fnvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *val, uint_t n) +{ + VERIFY0(nvlist_add_int64_array(nvl, name, val, n)); +} + +void +fnvlist_add_uint64_array(nvlist_t *nvl, const char *name, + uint64_t *val, uint_t n) +{ + VERIFY0(nvlist_add_uint64_array(nvl, name, val, n)); +} + +void +fnvlist_add_string_array(nvlist_t *nvl, const char *name, + char * const *val, uint_t n) +{ + VERIFY0(nvlist_add_string_array(nvl, name, val, n)); +} + +void +fnvlist_add_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t **val, uint_t n) +{ + VERIFY0(nvlist_add_nvlist_array(nvl, name, val, n)); +} + +void +fnvlist_remove(nvlist_t *nvl, const char *name) +{ + VERIFY0(nvlist_remove_all(nvl, name)); +} + +void +fnvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *pair) +{ + VERIFY0(nvlist_remove_nvpair(nvl, pair)); +} + +nvpair_t * +fnvlist_lookup_nvpair(nvlist_t *nvl, const char *name) +{ + nvpair_t *rv; + VERIFY0(nvlist_lookup_nvpair(nvl, name, &rv)); + return (rv); +} + +/* returns B_TRUE if the entry exists */ +boolean_t +fnvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_boolean(nvl, name) == 0); +} + +boolean_t +fnvlist_lookup_boolean_value(nvlist_t *nvl, const char *name) +{ + boolean_t rv; + VERIFY0(nvlist_lookup_boolean_value(nvl, name, &rv)); + return (rv); +} + +uchar_t +fnvlist_lookup_byte(nvlist_t *nvl, const char *name) +{ + uchar_t rv; + VERIFY0(nvlist_lookup_byte(nvl, name, &rv)); + return (rv); +} + +int8_t +fnvlist_lookup_int8(nvlist_t *nvl, const char *name) +{ + int8_t rv; + VERIFY0(nvlist_lookup_int8(nvl, name, &rv)); + return (rv); +} + +int16_t +fnvlist_lookup_int16(nvlist_t *nvl, const char *name) +{ + int16_t rv; + VERIFY0(nvlist_lookup_int16(nvl, name, &rv)); + return (rv); +} + +int32_t +fnvlist_lookup_int32(nvlist_t *nvl, const char *name) +{ + int32_t rv; + VERIFY0(nvlist_lookup_int32(nvl, name, &rv)); + return (rv); +} + +int64_t +fnvlist_lookup_int64(nvlist_t *nvl, const char *name) +{ + int64_t rv; + VERIFY0(nvlist_lookup_int64(nvl, name, &rv)); + return (rv); +} + +uint8_t +fnvlist_lookup_uint8(nvlist_t *nvl, const char *name) +{ + uint8_t rv; + VERIFY0(nvlist_lookup_uint8(nvl, name, &rv)); + return (rv); +} + +uint16_t +fnvlist_lookup_uint16(nvlist_t *nvl, const char *name) +{ + uint16_t rv; + VERIFY0(nvlist_lookup_uint16(nvl, name, &rv)); + return (rv); +} + +uint32_t +fnvlist_lookup_uint32(nvlist_t *nvl, const char *name) +{ + uint32_t rv; + VERIFY0(nvlist_lookup_uint32(nvl, name, &rv)); + return (rv); +} + +uint64_t +fnvlist_lookup_uint64(nvlist_t *nvl, const char *name) +{ + uint64_t rv; + VERIFY0(nvlist_lookup_uint64(nvl, name, &rv)); + return (rv); +} + +char * +fnvlist_lookup_string(nvlist_t *nvl, const char *name) +{ + char *rv; + VERIFY0(nvlist_lookup_string(nvl, name, 
&rv)); + return (rv); +} + +nvlist_t * +fnvlist_lookup_nvlist(nvlist_t *nvl, const char *name) +{ + nvlist_t *rv; + VERIFY0(nvlist_lookup_nvlist(nvl, name, &rv)); + return (rv); +} +boolean_t * +fnvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + boolean_t *rv; + VERIFY0(nvlist_lookup_boolean_array(nvl, name, &rv, n)); + return (rv); +} + +uchar_t * +fnvlist_lookup_byte_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uchar_t *rv; + VERIFY0(nvlist_lookup_byte_array(nvl, name, &rv, n)); + return (rv); +} + +int8_t * +fnvlist_lookup_int8_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int8_t *rv; + VERIFY0(nvlist_lookup_int8_array(nvl, name, &rv, n)); + return (rv); +} + +uint8_t * +fnvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint8_t *rv; + VERIFY0(nvlist_lookup_uint8_array(nvl, name, &rv, n)); + return (rv); +} + +int16_t * +fnvlist_lookup_int16_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int16_t *rv; + VERIFY0(nvlist_lookup_int16_array(nvl, name, &rv, n)); + return (rv); +} + +uint16_t * +fnvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint16_t *rv; + VERIFY0(nvlist_lookup_uint16_array(nvl, name, &rv, n)); + return (rv); +} + +int32_t * +fnvlist_lookup_int32_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int32_t *rv; + VERIFY0(nvlist_lookup_int32_array(nvl, name, &rv, n)); + return (rv); +} + +uint32_t * +fnvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint32_t *rv; + VERIFY0(nvlist_lookup_uint32_array(nvl, name, &rv, n)); + return (rv); +} + +int64_t * +fnvlist_lookup_int64_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + int64_t *rv; + VERIFY0(nvlist_lookup_int64_array(nvl, name, &rv, n)); + return (rv); +} + +uint64_t * +fnvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, uint_t *n) +{ + uint64_t *rv; + VERIFY0(nvlist_lookup_uint64_array(nvl, name, &rv, n)); + return (rv); +} + +boolean_t +fnvpair_value_boolean_value(nvpair_t *nvp) +{ + boolean_t rv; + VERIFY0(nvpair_value_boolean_value(nvp, &rv)); + return (rv); +} + +uchar_t +fnvpair_value_byte(nvpair_t *nvp) +{ + uchar_t rv; + VERIFY0(nvpair_value_byte(nvp, &rv)); + return (rv); +} + +int8_t +fnvpair_value_int8(nvpair_t *nvp) +{ + int8_t rv; + VERIFY0(nvpair_value_int8(nvp, &rv)); + return (rv); +} + +int16_t +fnvpair_value_int16(nvpair_t *nvp) +{ + int16_t rv; + VERIFY0(nvpair_value_int16(nvp, &rv)); + return (rv); +} + +int32_t +fnvpair_value_int32(nvpair_t *nvp) +{ + int32_t rv; + VERIFY0(nvpair_value_int32(nvp, &rv)); + return (rv); +} + +int64_t +fnvpair_value_int64(nvpair_t *nvp) +{ + int64_t rv; + VERIFY0(nvpair_value_int64(nvp, &rv)); + return (rv); +} + +uint8_t +fnvpair_value_uint8(nvpair_t *nvp) +{ + uint8_t rv; + VERIFY0(nvpair_value_uint8(nvp, &rv)); + return (rv); +} + +uint16_t +fnvpair_value_uint16(nvpair_t *nvp) +{ + uint16_t rv; + VERIFY0(nvpair_value_uint16(nvp, &rv)); + return (rv); +} + +uint32_t +fnvpair_value_uint32(nvpair_t *nvp) +{ + uint32_t rv; + VERIFY0(nvpair_value_uint32(nvp, &rv)); + return (rv); +} + +uint64_t +fnvpair_value_uint64(nvpair_t *nvp) +{ + uint64_t rv; + VERIFY0(nvpair_value_uint64(nvp, &rv)); + return (rv); +} + +char * +fnvpair_value_string(nvpair_t *nvp) +{ + char *rv; + VERIFY0(nvpair_value_string(nvp, &rv)); + return (rv); +} + +nvlist_t * +fnvpair_value_nvlist(nvpair_t *nvp) +{ + nvlist_t *rv; + VERIFY0(nvpair_value_nvlist(nvp, &rv)); + return (rv); +} + +#if defined(_KERNEL) + +EXPORT_SYMBOL(fnvlist_alloc); 
+EXPORT_SYMBOL(fnvlist_free); +EXPORT_SYMBOL(fnvlist_size); +EXPORT_SYMBOL(fnvlist_pack); +EXPORT_SYMBOL(fnvlist_pack_free); +EXPORT_SYMBOL(fnvlist_unpack); +EXPORT_SYMBOL(fnvlist_dup); +EXPORT_SYMBOL(fnvlist_merge); + +EXPORT_SYMBOL(fnvlist_add_nvpair); +EXPORT_SYMBOL(fnvlist_add_boolean); +EXPORT_SYMBOL(fnvlist_add_boolean_value); +EXPORT_SYMBOL(fnvlist_add_byte); +EXPORT_SYMBOL(fnvlist_add_int8); +EXPORT_SYMBOL(fnvlist_add_uint8); +EXPORT_SYMBOL(fnvlist_add_int16); +EXPORT_SYMBOL(fnvlist_add_uint16); +EXPORT_SYMBOL(fnvlist_add_int32); +EXPORT_SYMBOL(fnvlist_add_uint32); +EXPORT_SYMBOL(fnvlist_add_int64); +EXPORT_SYMBOL(fnvlist_add_uint64); +EXPORT_SYMBOL(fnvlist_add_string); +EXPORT_SYMBOL(fnvlist_add_nvlist); +EXPORT_SYMBOL(fnvlist_add_boolean_array); +EXPORT_SYMBOL(fnvlist_add_byte_array); +EXPORT_SYMBOL(fnvlist_add_int8_array); +EXPORT_SYMBOL(fnvlist_add_uint8_array); +EXPORT_SYMBOL(fnvlist_add_int16_array); +EXPORT_SYMBOL(fnvlist_add_uint16_array); +EXPORT_SYMBOL(fnvlist_add_int32_array); +EXPORT_SYMBOL(fnvlist_add_uint32_array); +EXPORT_SYMBOL(fnvlist_add_int64_array); +EXPORT_SYMBOL(fnvlist_add_uint64_array); +EXPORT_SYMBOL(fnvlist_add_string_array); +EXPORT_SYMBOL(fnvlist_add_nvlist_array); + +EXPORT_SYMBOL(fnvlist_remove); +EXPORT_SYMBOL(fnvlist_remove_nvpair); + +EXPORT_SYMBOL(fnvlist_lookup_nvpair); +EXPORT_SYMBOL(fnvlist_lookup_boolean); +EXPORT_SYMBOL(fnvlist_lookup_boolean_value); +EXPORT_SYMBOL(fnvlist_lookup_byte); +EXPORT_SYMBOL(fnvlist_lookup_int8); +EXPORT_SYMBOL(fnvlist_lookup_uint8); +EXPORT_SYMBOL(fnvlist_lookup_int16); +EXPORT_SYMBOL(fnvlist_lookup_uint16); +EXPORT_SYMBOL(fnvlist_lookup_int32); +EXPORT_SYMBOL(fnvlist_lookup_uint32); +EXPORT_SYMBOL(fnvlist_lookup_int64); +EXPORT_SYMBOL(fnvlist_lookup_uint64); +EXPORT_SYMBOL(fnvlist_lookup_string); +EXPORT_SYMBOL(fnvlist_lookup_nvlist); + +EXPORT_SYMBOL(fnvpair_value_boolean_value); +EXPORT_SYMBOL(fnvpair_value_byte); +EXPORT_SYMBOL(fnvpair_value_int8); +EXPORT_SYMBOL(fnvpair_value_uint8); +EXPORT_SYMBOL(fnvpair_value_int16); +EXPORT_SYMBOL(fnvpair_value_uint16); +EXPORT_SYMBOL(fnvpair_value_int32); +EXPORT_SYMBOL(fnvpair_value_uint32); +EXPORT_SYMBOL(fnvpair_value_int64); +EXPORT_SYMBOL(fnvpair_value_uint64); +EXPORT_SYMBOL(fnvpair_value_string); +EXPORT_SYMBOL(fnvpair_value_nvlist); +EXPORT_SYMBOL(fnvlist_num_pairs); + +#endif diff --git a/module/nvpair/nvpair.c b/module/nvpair/nvpair.c new file mode 100644 index 000000000000..20a58802f6b7 --- /dev/null +++ b/module/nvpair/nvpair.c @@ -0,0 +1,3729 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2000, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. + * Copyright 2018 RackTop Systems. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_KERNEL) +#include +#include +#else +#include +#include +#include +#endif + +#define skip_whitespace(p) while ((*(p) == ' ') || (*(p) == '\t')) p++ + +/* + * nvpair.c - Provides kernel & userland interfaces for manipulating + * name-value pairs. + * + * Overview Diagram + * + * +--------------+ + * | nvlist_t | + * |--------------| + * | nvl_version | + * | nvl_nvflag | + * | nvl_priv -+-+ + * | nvl_flag | | + * | nvl_pad | | + * +--------------+ | + * V + * +--------------+ last i_nvp in list + * | nvpriv_t | +---------------------> + * |--------------| | + * +--+- nvp_list | | +------------+ + * | | nvp_last -+--+ + nv_alloc_t | + * | | nvp_curr | |------------| + * | | nvp_nva -+----> | nva_ops | + * | | nvp_stat | | nva_arg | + * | +--------------+ +------------+ + * | + * +-------+ + * V + * +---------------------+ +-------------------+ + * | i_nvp_t | +-->| i_nvp_t | +--> + * |---------------------| | |-------------------| | + * | nvi_next -+--+ | nvi_next -+--+ + * | nvi_prev (NULL) | <----+ nvi_prev | + * | . . . . . . . . . . | | . . . . . . . . . | + * | nvp (nvpair_t) | | nvp (nvpair_t) | + * | - nvp_size | | - nvp_size | + * | - nvp_name_sz | | - nvp_name_sz | + * | - nvp_value_elem | | - nvp_value_elem | + * | - nvp_type | | - nvp_type | + * | - data ... | | - data ... | + * +---------------------+ +-------------------+ + * + * + * + * +---------------------+ +---------------------+ + * | i_nvp_t | +--> +-->| i_nvp_t (last) | + * |---------------------| | | |---------------------| + * | nvi_next -+--+ ... --+ | nvi_next (NULL) | + * <-+- nvi_prev |<-- ... <----+ nvi_prev | + * | . . . . . . . . . | | . . . . . . . . . | + * | nvp (nvpair_t) | | nvp (nvpair_t) | + * | - nvp_size | | - nvp_size | + * | - nvp_name_sz | | - nvp_name_sz | + * | - nvp_value_elem | | - nvp_value_elem | + * | - DATA_TYPE_NVLIST | | - nvp_type | + * | - data (embedded) | | - data ... | + * | nvlist name | +---------------------+ + * | +--------------+ | + * | | nvlist_t | | + * | |--------------| | + * | | nvl_version | | + * | | nvl_nvflag | | + * | | nvl_priv --+---+----> + * | | nvl_flag | | + * | | nvl_pad | | + * | +--------------+ | + * +---------------------+ + * + * + * N.B. nvpair_t may be aligned on 4 byte boundary, so +4 will + * allow value to be aligned on 8 byte boundary + * + * name_len is the length of the name string including the null terminator + * so it must be >= 1 + */ +#define NVP_SIZE_CALC(name_len, data_len) \ + (NV_ALIGN((sizeof (nvpair_t)) + name_len) + NV_ALIGN(data_len)) + +static int i_get_value_size(data_type_t type, const void *data, uint_t nelem); +static int nvlist_add_common(nvlist_t *nvl, const char *name, data_type_t type, + uint_t nelem, const void *data); + +#define NV_STAT_EMBEDDED 0x1 +#define EMBEDDED_NVL(nvp) ((nvlist_t *)(void *)NVP_VALUE(nvp)) +#define EMBEDDED_NVL_ARRAY(nvp) ((nvlist_t **)(void *)NVP_VALUE(nvp)) + +#define NVP_VALOFF(nvp) (NV_ALIGN(sizeof (nvpair_t) + (nvp)->nvp_name_sz)) +#define NVPAIR2I_NVP(nvp) \ + ((i_nvp_t *)((size_t)(nvp) - offsetof(i_nvp_t, nvi_nvp))) + +#ifdef _KERNEL +int nvpair_max_recursion = 20; +#else +int nvpair_max_recursion = 100; +#endif + +uint64_t nvlist_hashtable_init_size = (1 << 4); + +int +nv_alloc_init(nv_alloc_t *nva, const nv_alloc_ops_t *nvo, /* args */ ...) 
+{ + va_list valist; + int err = 0; + + nva->nva_ops = nvo; + nva->nva_arg = NULL; + + va_start(valist, nvo); + if (nva->nva_ops->nv_ao_init != NULL) + err = nva->nva_ops->nv_ao_init(nva, valist); + va_end(valist); + + return (err); +} + +void +nv_alloc_reset(nv_alloc_t *nva) +{ + if (nva->nva_ops->nv_ao_reset != NULL) + nva->nva_ops->nv_ao_reset(nva); +} + +void +nv_alloc_fini(nv_alloc_t *nva) +{ + if (nva->nva_ops->nv_ao_fini != NULL) + nva->nva_ops->nv_ao_fini(nva); +} + +nv_alloc_t * +nvlist_lookup_nv_alloc(nvlist_t *nvl) +{ + nvpriv_t *priv; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + return (priv->nvp_nva); +} + +static void * +nv_mem_zalloc(nvpriv_t *nvp, size_t size) +{ + nv_alloc_t *nva = nvp->nvp_nva; + void *buf; + + if ((buf = nva->nva_ops->nv_ao_alloc(nva, size)) != NULL) + bzero(buf, size); + + return (buf); +} + +static void +nv_mem_free(nvpriv_t *nvp, void *buf, size_t size) +{ + nv_alloc_t *nva = nvp->nvp_nva; + + nva->nva_ops->nv_ao_free(nva, buf, size); +} + +static void +nv_priv_init(nvpriv_t *priv, nv_alloc_t *nva, uint32_t stat) +{ + bzero(priv, sizeof (nvpriv_t)); + + priv->nvp_nva = nva; + priv->nvp_stat = stat; +} + +static nvpriv_t * +nv_priv_alloc(nv_alloc_t *nva) +{ + nvpriv_t *priv; + + /* + * nv_mem_alloc() cannot called here because it needs the priv + * argument. + */ + if ((priv = nva->nva_ops->nv_ao_alloc(nva, sizeof (nvpriv_t))) == NULL) + return (NULL); + + nv_priv_init(priv, nva, 0); + + return (priv); +} + +/* + * Embedded lists need their own nvpriv_t's. We create a new + * nvpriv_t using the parameters and allocator from the parent + * list's nvpriv_t. + */ +static nvpriv_t * +nv_priv_alloc_embedded(nvpriv_t *priv) +{ + nvpriv_t *emb_priv; + + if ((emb_priv = nv_mem_zalloc(priv, sizeof (nvpriv_t))) == NULL) + return (NULL); + + nv_priv_init(emb_priv, priv->nvp_nva, NV_STAT_EMBEDDED); + + return (emb_priv); +} + +static int +nvt_tab_alloc(nvpriv_t *priv, uint64_t buckets) +{ + ASSERT3P(priv->nvp_hashtable, ==, NULL); + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + + i_nvp_t **tab = nv_mem_zalloc(priv, buckets * sizeof (i_nvp_t *)); + if (tab == NULL) + return (ENOMEM); + + priv->nvp_hashtable = tab; + priv->nvp_nbuckets = buckets; + return (0); +} + +static void +nvt_tab_free(nvpriv_t *priv) +{ + i_nvp_t **tab = priv->nvp_hashtable; + if (tab == NULL) { + ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + return; + } + + nv_mem_free(priv, tab, priv->nvp_nbuckets * sizeof (i_nvp_t *)); + + priv->nvp_hashtable = NULL; + priv->nvp_nbuckets = 0; + priv->nvp_nentries = 0; +} + +static uint32_t +nvt_hash(const char *p) +{ + uint32_t g, hval = 0; + + while (*p) { + hval = (hval << 4) + *p++; + if ((g = (hval & 0xf0000000)) != 0) + hval ^= g >> 24; + hval &= ~g; + } + return (hval); +} + +static boolean_t +nvt_nvpair_match(nvpair_t *nvp1, nvpair_t *nvp2, uint32_t nvflag) +{ + boolean_t match = B_FALSE; + if (nvflag & NV_UNIQUE_NAME_TYPE) { + if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0 && + NVP_TYPE(nvp1) == NVP_TYPE(nvp2)) + match = B_TRUE; + } else { + ASSERT(nvflag == 0 || nvflag & NV_UNIQUE_NAME); + if (strcmp(NVP_NAME(nvp1), NVP_NAME(nvp2)) == 0) + match = B_TRUE; + } + return (match); +} + +static nvpair_t * +nvt_lookup_name_type(nvlist_t *nvl, const char *name, data_type_t type) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + ASSERT(priv != NULL); + + i_nvp_t **tab = priv->nvp_hashtable; + + if (tab == NULL) { + ASSERT3P(priv->nvp_list, ==, NULL); + 
ASSERT0(priv->nvp_nbuckets); + ASSERT0(priv->nvp_nentries); + return (NULL); + } else { + ASSERT(priv->nvp_nbuckets != 0); + } + + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *entry = tab[index]; + + for (i_nvp_t *e = entry; e != NULL; e = e->nvi_hashtable_next) { + if (strcmp(NVP_NAME(&e->nvi_nvp), name) == 0 && + (type == DATA_TYPE_DONTCARE || + NVP_TYPE(&e->nvi_nvp) == type)) + return (&e->nvi_nvp); + } + return (NULL); +} + +static nvpair_t * +nvt_lookup_name(nvlist_t *nvl, const char *name) +{ + return (nvt_lookup_name_type(nvl, name, DATA_TYPE_DONTCARE)); +} + +static int +nvt_resize(nvpriv_t *priv, uint32_t new_size) +{ + i_nvp_t **tab = priv->nvp_hashtable; + + /* + * Migrate all the entries from the current table + * to a newly-allocated table with the new size by + * re-adjusting the pointers of their entries. + */ + uint32_t size = priv->nvp_nbuckets; + uint32_t new_mask = new_size - 1; + ASSERT(ISP2(new_size)); + + i_nvp_t **new_tab = nv_mem_zalloc(priv, new_size * sizeof (i_nvp_t *)); + if (new_tab == NULL) + return (ENOMEM); + + uint32_t nentries = 0; + for (uint32_t i = 0; i < size; i++) { + i_nvp_t *next, *e = tab[i]; + + while (e != NULL) { + next = e->nvi_hashtable_next; + + uint32_t hash = nvt_hash(NVP_NAME(&e->nvi_nvp)); + uint32_t index = hash & new_mask; + + e->nvi_hashtable_next = new_tab[index]; + new_tab[index] = e; + nentries++; + + e = next; + } + tab[i] = NULL; + } + ASSERT3U(nentries, ==, priv->nvp_nentries); + + nvt_tab_free(priv); + + priv->nvp_hashtable = new_tab; + priv->nvp_nbuckets = new_size; + priv->nvp_nentries = nentries; + + return (0); +} + +static boolean_t +nvt_needs_togrow(nvpriv_t *priv) +{ + /* + * Grow only when we have more elements than buckets + * and the # of buckets doesn't overflow. + */ + return (priv->nvp_nentries > priv->nvp_nbuckets && + (UINT32_MAX >> 1) >= priv->nvp_nbuckets); +} + +/* + * Allocate a new table that's twice the size of the old one, + * and migrate all the entries from the old one to the new + * one by re-adjusting their pointers. + */ +static int +nvt_grow(nvpriv_t *priv) +{ + uint32_t current_size = priv->nvp_nbuckets; + /* ensure we won't overflow */ + ASSERT3U(UINT32_MAX >> 1, >=, current_size); + return (nvt_resize(priv, current_size << 1)); +} + +static boolean_t +nvt_needs_toshrink(nvpriv_t *priv) +{ + /* + * Shrink only when the # of elements is less than or + * equal to 1/4 the # of buckets. Never shrink less than + * nvlist_hashtable_init_size. + */ + ASSERT3U(priv->nvp_nbuckets, >=, nvlist_hashtable_init_size); + if (priv->nvp_nbuckets == nvlist_hashtable_init_size) + return (B_FALSE); + return (priv->nvp_nentries <= (priv->nvp_nbuckets >> 2)); +} + +/* + * Allocate a new table that's half the size of the old one, + * and migrate all the entries from the old one to the new + * one by re-adjusting their pointers. 
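+ *
+ * Worked example (editorial addition, derived from the checks above):
+ * with nvlist_hashtable_init_size = 16, a 64-bucket table shrinks to
+ * 32 buckets once it holds at most 64 >> 2 = 16 entries, while a
+ * table already at 16 buckets is never shrunk further.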
+ */ +static int +nvt_shrink(nvpriv_t *priv) +{ + uint32_t current_size = priv->nvp_nbuckets; + /* ensure we won't overflow */ + ASSERT3U(current_size, >=, nvlist_hashtable_init_size); + return (nvt_resize(priv, current_size >> 1)); +} + +static int +nvt_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + + if (nvt_needs_toshrink(priv)) { + int err = nvt_shrink(priv); + if (err != 0) + return (err); + } + i_nvp_t **tab = priv->nvp_hashtable; + + char *name = NVP_NAME(nvp); + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *bucket = tab[index]; + + for (i_nvp_t *prev = NULL, *e = bucket; + e != NULL; prev = e, e = e->nvi_hashtable_next) { + if (nvt_nvpair_match(&e->nvi_nvp, nvp, nvl->nvl_nvflag)) { + if (prev != NULL) { + prev->nvi_hashtable_next = + e->nvi_hashtable_next; + } else { + ASSERT3P(e, ==, bucket); + tab[index] = e->nvi_hashtable_next; + } + e->nvi_hashtable_next = NULL; + priv->nvp_nentries--; + break; + } + } + + return (0); +} + +static int +nvt_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + + /* initialize nvpair table now if it doesn't exist. */ + if (priv->nvp_hashtable == NULL) { + int err = nvt_tab_alloc(priv, nvlist_hashtable_init_size); + if (err != 0) + return (err); + } + + /* + * if we don't allow duplicate entries, make sure to + * unlink any existing entries from the table. + */ + if (nvl->nvl_nvflag != 0) { + int err = nvt_remove_nvpair(nvl, nvp); + if (err != 0) + return (err); + } + + if (nvt_needs_togrow(priv)) { + int err = nvt_grow(priv); + if (err != 0) + return (err); + } + i_nvp_t **tab = priv->nvp_hashtable; + + char *name = NVP_NAME(nvp); + uint64_t hash = nvt_hash(name); + uint64_t index = hash & (priv->nvp_nbuckets - 1); + + ASSERT3U(index, <, priv->nvp_nbuckets); + i_nvp_t *bucket = tab[index]; + + /* insert link at the beginning of the bucket */ + i_nvp_t *new_entry = NVPAIR2I_NVP(nvp); + ASSERT3P(new_entry->nvi_hashtable_next, ==, NULL); + new_entry->nvi_hashtable_next = bucket; + tab[index] = new_entry; + + priv->nvp_nentries++; + return (0); +} + +static void +nvlist_init(nvlist_t *nvl, uint32_t nvflag, nvpriv_t *priv) +{ + nvl->nvl_version = NV_VERSION; + nvl->nvl_nvflag = nvflag & (NV_UNIQUE_NAME|NV_UNIQUE_NAME_TYPE); + nvl->nvl_priv = (uint64_t)(uintptr_t)priv; + nvl->nvl_flag = 0; + nvl->nvl_pad = 0; +} + +uint_t +nvlist_nvflag(nvlist_t *nvl) +{ + return (nvl->nvl_nvflag); +} + +static nv_alloc_t * +nvlist_nv_alloc(int kmflag) +{ +#if defined(_KERNEL) + switch (kmflag) { + case KM_SLEEP: + return (nv_alloc_sleep); + case KM_NOSLEEP: + return (nv_alloc_nosleep); + default: + return (nv_alloc_pushpage); + } +#else + return (nv_alloc_nosleep); +#endif /* _KERNEL */ +} + +/* + * nvlist_alloc - Allocate nvlist. + */ +int +nvlist_alloc(nvlist_t **nvlp, uint_t nvflag, int kmflag) +{ + return (nvlist_xalloc(nvlp, nvflag, nvlist_nv_alloc(kmflag))); +} + +int +nvlist_xalloc(nvlist_t **nvlp, uint_t nvflag, nv_alloc_t *nva) +{ + nvpriv_t *priv; + + if (nvlp == NULL || nva == NULL) + return (EINVAL); + + if ((priv = nv_priv_alloc(nva)) == NULL) + return (ENOMEM); + + if ((*nvlp = nv_mem_zalloc(priv, + NV_ALIGN(sizeof (nvlist_t)))) == NULL) { + nv_mem_free(priv, priv, sizeof (nvpriv_t)); + return (ENOMEM); + } + + nvlist_init(*nvlp, nvflag, priv); + + return (0); +} + +/* + * nvp_buf_alloc - Allocate i_nvp_t for storing a new nv pair. 
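+ *
+ * (Editorial note: the nvpair_t is embedded inside an i_nvp_t so that
+ * the list and hash-chain linkage travels with the pair; hence the
+ * allocation size below is len + offsetof(i_nvp_t, nvi_nvp), and
+ * NVPAIR2I_NVP() recovers the containing i_nvp_t from an nvpair_t.)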
+ */
+static nvpair_t *
+nvp_buf_alloc(nvlist_t *nvl, size_t len)
+{
+	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+	i_nvp_t *buf;
+	nvpair_t *nvp;
+	size_t nvsize;
+
+	/*
+	 * Allocate the buffer
+	 */
+	nvsize = len + offsetof(i_nvp_t, nvi_nvp);
+
+	if ((buf = nv_mem_zalloc(priv, nvsize)) == NULL)
+		return (NULL);
+
+	nvp = &buf->nvi_nvp;
+	nvp->nvp_size = len;
+
+	return (nvp);
+}
+
+/*
+ * nvp_buf_free - de-allocate an i_nvp_t.
+ */
+static void
+nvp_buf_free(nvlist_t *nvl, nvpair_t *nvp)
+{
+	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+	size_t nvsize = nvp->nvp_size + offsetof(i_nvp_t, nvi_nvp);
+
+	nv_mem_free(priv, NVPAIR2I_NVP(nvp), nvsize);
+}
+
+/*
+ * nvp_buf_link - link a new nv pair into the nvlist.
+ */
+static void
+nvp_buf_link(nvlist_t *nvl, nvpair_t *nvp)
+{
+	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+	i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+	/* Put element at end of nvlist */
+	if (priv->nvp_list == NULL) {
+		priv->nvp_list = priv->nvp_last = curr;
+	} else {
+		curr->nvi_prev = priv->nvp_last;
+		priv->nvp_last->nvi_next = curr;
+		priv->nvp_last = curr;
+	}
+}
+
+/*
+ * nvp_buf_unlink - unlink a removed nvpair from the nvlist.
+ */
+static void
+nvp_buf_unlink(nvlist_t *nvl, nvpair_t *nvp)
+{
+	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+	i_nvp_t *curr = NVPAIR2I_NVP(nvp);
+
+	/*
+	 * protect nvlist_next_nvpair() against walking on freed memory.
+	 */
+	if (priv->nvp_curr == curr)
+		priv->nvp_curr = curr->nvi_next;
+
+	if (curr == priv->nvp_list)
+		priv->nvp_list = curr->nvi_next;
+	else
+		curr->nvi_prev->nvi_next = curr->nvi_next;
+
+	if (curr == priv->nvp_last)
+		priv->nvp_last = curr->nvi_prev;
+	else
+		curr->nvi_next->nvi_prev = curr->nvi_prev;
+}
+
+/*
+ * take an nvpair type and number of elements and make sure they are valid
+ */
+static int
+i_validate_type_nelem(data_type_t type, uint_t nelem)
+{
+	switch (type) {
+	case DATA_TYPE_BOOLEAN:
+		if (nelem != 0)
+			return (EINVAL);
+		break;
+	case DATA_TYPE_BOOLEAN_VALUE:
+	case DATA_TYPE_BYTE:
+	case DATA_TYPE_INT8:
+	case DATA_TYPE_UINT8:
+	case DATA_TYPE_INT16:
+	case DATA_TYPE_UINT16:
+	case DATA_TYPE_INT32:
+	case DATA_TYPE_UINT32:
+	case DATA_TYPE_INT64:
+	case DATA_TYPE_UINT64:
+	case DATA_TYPE_STRING:
+	case DATA_TYPE_HRTIME:
+	case DATA_TYPE_NVLIST:
+#if !defined(_KERNEL)
+	case DATA_TYPE_DOUBLE:
+#endif
+		if (nelem != 1)
+			return (EINVAL);
+		break;
+	case DATA_TYPE_BOOLEAN_ARRAY:
+	case DATA_TYPE_BYTE_ARRAY:
+	case DATA_TYPE_INT8_ARRAY:
+	case DATA_TYPE_UINT8_ARRAY:
+	case DATA_TYPE_INT16_ARRAY:
+	case DATA_TYPE_UINT16_ARRAY:
+	case DATA_TYPE_INT32_ARRAY:
+	case DATA_TYPE_UINT32_ARRAY:
+	case DATA_TYPE_INT64_ARRAY:
+	case DATA_TYPE_UINT64_ARRAY:
+	case DATA_TYPE_STRING_ARRAY:
+	case DATA_TYPE_NVLIST_ARRAY:
+		/* we allow arrays with 0 elements */
+		break;
+	default:
+		return (EINVAL);
+	}
+	return (0);
+}
+
+/*
+ * Verify nvp_name_sz and check the name string length.
+ */
+static int
+i_validate_nvpair_name(nvpair_t *nvp)
+{
+	if ((nvp->nvp_name_sz <= 0) ||
+	    (nvp->nvp_size < NVP_SIZE_CALC(nvp->nvp_name_sz, 0)))
+		return (EFAULT);
+
+	/* verify the name string, make sure it's terminated */
+	if (NVP_NAME(nvp)[nvp->nvp_name_sz - 1] != '\0')
+		return (EFAULT);
+
+	return (strlen(NVP_NAME(nvp)) == nvp->nvp_name_sz - 1 ?
+	    0 : EFAULT);
+}
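+
+/*
+ * Editorial example (not upstream code): the size identity that the
+ * validators rely on, assuming the usual 16-byte nvpair_t header and
+ * 8-byte NV_ALIGN, for a pair named "guid" (nvp_name_sz = 5) holding
+ * one uint64_t:
+ *
+ *	NVP_SIZE_CALC(5, 8)
+ *	    = NV_ALIGN(sizeof (nvpair_t) + 5) + NV_ALIGN(8)
+ *	    = NV_ALIGN(21) + 8 = 24 + 8 = 32 bytes
+ */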
+
+static int
+i_validate_nvpair_value(data_type_t type, uint_t nelem, const void *data)
+{
+	switch (type) {
+	case DATA_TYPE_BOOLEAN_VALUE:
+		if (*(boolean_t *)data != B_TRUE &&
+		    *(boolean_t *)data != B_FALSE)
+			return (EINVAL);
+		break;
+	case DATA_TYPE_BOOLEAN_ARRAY: {
+		int i;
+
+		for (i = 0; i < nelem; i++)
+			if (((boolean_t *)data)[i] != B_TRUE &&
+			    ((boolean_t *)data)[i] != B_FALSE)
+				return (EINVAL);
+		break;
+	}
+	default:
+		break;
+	}
+
+	return (0);
+}
+
+/*
+ * This function takes a pointer to what should be an nvpair and its size
+ * and then verifies that all the nvpair fields make sense and can be
+ * trusted. This function is used when decoding packed nvpairs.
+ */
+static int
+i_validate_nvpair(nvpair_t *nvp)
+{
+	data_type_t type = NVP_TYPE(nvp);
+	int size1, size2;
+
+	/* verify nvp_name_sz, check the name string length */
+	if (i_validate_nvpair_name(nvp) != 0)
+		return (EFAULT);
+
+	if (i_validate_nvpair_value(type, NVP_NELEM(nvp), NVP_VALUE(nvp)) != 0)
+		return (EFAULT);
+
+	/*
+	 * verify nvp_type, nvp_value_elem, and also possibly
+	 * verify string values and get the value size.
+	 */
+	size2 = i_get_value_size(type, NVP_VALUE(nvp), NVP_NELEM(nvp));
+	size1 = nvp->nvp_size - NVP_VALOFF(nvp);
+	if (size2 < 0 || size1 != NV_ALIGN(size2))
+		return (EFAULT);
+
+	return (0);
+}
+
+static int
+nvlist_copy_pairs(nvlist_t *snvl, nvlist_t *dnvl)
+{
+	nvpriv_t *priv;
+	i_nvp_t *curr;
+
+	if ((priv = (nvpriv_t *)(uintptr_t)snvl->nvl_priv) == NULL)
+		return (EINVAL);
+
+	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) {
+		nvpair_t *nvp = &curr->nvi_nvp;
+		int err;
+
+		if ((err = nvlist_add_common(dnvl, NVP_NAME(nvp), NVP_TYPE(nvp),
+		    NVP_NELEM(nvp), NVP_VALUE(nvp))) != 0)
+			return (err);
+	}
+
+	return (0);
+}
+
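+/*
+ * Illustrative sketch (editorial, not upstream code):
+ * nvlist_copy_pairs() is what gives nvlist_dup()/nvlist_xdup() below
+ * their deep-copy semantics -- every pair, embedded nvlists included,
+ * is re-added to the destination, so the copy shares no memory with
+ * the source:
+ *
+ *	nvlist_t *copy;
+ *	if (nvlist_dup(src, &copy, KM_SLEEP) == 0) {
+ *		... use copy independently of src ...
+ *		nvlist_free(copy);
+ *	}
+ */
+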
+/*
+ * Frees all memory allocated for an nvpair (like embedded lists) with
+ * the exception of the nvpair buffer itself.
+ */
+static void
+nvpair_free(nvpair_t *nvp)
+{
+	switch (NVP_TYPE(nvp)) {
+	case DATA_TYPE_NVLIST:
+		nvlist_free(EMBEDDED_NVL(nvp));
+		break;
+	case DATA_TYPE_NVLIST_ARRAY: {
+		nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp);
+		int i;
+
+		for (i = 0; i < NVP_NELEM(nvp); i++)
+			if (nvlp[i] != NULL)
+				nvlist_free(nvlp[i]);
+		break;
+	}
+	default:
+		break;
+	}
+}
+
+/*
+ * nvlist_free - free an unpacked nvlist
+ */
+void
+nvlist_free(nvlist_t *nvl)
+{
+	nvpriv_t *priv;
+	i_nvp_t *curr;
+
+	if (nvl == NULL ||
+	    (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL)
+		return;
+
+	/*
+	 * Unpacked nvlists are linked through i_nvp_t
+	 */
+	curr = priv->nvp_list;
+	while (curr != NULL) {
+		nvpair_t *nvp = &curr->nvi_nvp;
+		curr = curr->nvi_next;
+
+		nvpair_free(nvp);
+		nvp_buf_free(nvl, nvp);
+	}
+
+	if (!(priv->nvp_stat & NV_STAT_EMBEDDED))
+		nv_mem_free(priv, nvl, NV_ALIGN(sizeof (nvlist_t)));
+	else
+		nvl->nvl_priv = 0;
+
+	nvt_tab_free(priv);
+	nv_mem_free(priv, priv, sizeof (nvpriv_t));
+}
+
+static int
+nvlist_contains_nvp(nvlist_t *nvl, nvpair_t *nvp)
+{
+	nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv;
+	i_nvp_t *curr;
+
+	if (nvp == NULL)
+		return (0);
+
+	for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next)
+		if (&curr->nvi_nvp == nvp)
+			return (1);
+
+	return (0);
+}
+
+/*
+ * Make a copy of an nvlist
+ */
+int
+nvlist_dup(nvlist_t *nvl, nvlist_t **nvlp, int kmflag)
+{
+	return (nvlist_xdup(nvl, nvlp, nvlist_nv_alloc(kmflag)));
+}
+
+int
+nvlist_xdup(nvlist_t *nvl, nvlist_t **nvlp, nv_alloc_t *nva)
+{
+	int err;
+	nvlist_t *ret;
+
+	if (nvl == NULL || nvlp == NULL)
+		return (EINVAL);
+
+	if ((err = nvlist_xalloc(&ret, nvl->nvl_nvflag, nva)) != 0)
+		return (err);
+
+	if ((err = nvlist_copy_pairs(nvl, ret)) != 0)
+		nvlist_free(ret);
+	else
+		*nvlp = ret;
+
+	return (err);
+}
+
+/*
+ * Remove all pairs with a matching name
+ */
+int
+nvlist_remove_all(nvlist_t *nvl, const char *name)
+{
+	int error = ENOENT;
+
+	if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+		return (EINVAL);
+
+	nvpair_t *nvp;
+	while ((nvp = nvt_lookup_name(nvl, name)) != NULL) {
+		VERIFY0(nvlist_remove_nvpair(nvl, nvp));
+		error = 0;
+	}
+
+	return (error);
+}
+
+/*
+ * Remove the first pair with matching name and type
+ */
+int
+nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type)
+{
+	if (nvl == NULL || name == NULL || nvl->nvl_priv == 0)
+		return (EINVAL);
+
+	nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type);
+	if (nvp == NULL)
+		return (ENOENT);
+
+	return (nvlist_remove_nvpair(nvl, nvp));
+}
+
+int
+nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp)
+{
+	if (nvl == NULL || nvp == NULL)
+		return (EINVAL);
+
+	int err = nvt_remove_nvpair(nvl, nvp);
+	if (err != 0)
+		return (err);
+
+	nvp_buf_unlink(nvl, nvp);
+	nvpair_free(nvp);
+	nvp_buf_free(nvl, nvp);
+	return (0);
+}
+
+/*
+ * This function calculates the size of an nvpair value.
+ *
+ * The data argument controls the behavior in case of the data types
+ *	DATA_TYPE_STRING and
+ *	DATA_TYPE_STRING_ARRAY
+ * If data == NULL, then the size of the string(s) is excluded.
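+ *
+ * Editorial examples, derived from the switch below: DATA_TYPE_STRING
+ * with data = "tank" yields strlen("tank") + 1 = 5, but 0 when
+ * data == NULL; DATA_TYPE_UINT64_ARRAY with nelem = 4 yields
+ * 4 * sizeof (uint64_t) = 32 whether or not data is supplied.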
+ */ +static int +i_get_value_size(data_type_t type, const void *data, uint_t nelem) +{ + uint64_t value_sz; + + if (i_validate_type_nelem(type, nelem) != 0) + return (-1); + + /* Calculate required size for holding value */ + switch (type) { + case DATA_TYPE_BOOLEAN: + value_sz = 0; + break; + case DATA_TYPE_BOOLEAN_VALUE: + value_sz = sizeof (boolean_t); + break; + case DATA_TYPE_BYTE: + value_sz = sizeof (uchar_t); + break; + case DATA_TYPE_INT8: + value_sz = sizeof (int8_t); + break; + case DATA_TYPE_UINT8: + value_sz = sizeof (uint8_t); + break; + case DATA_TYPE_INT16: + value_sz = sizeof (int16_t); + break; + case DATA_TYPE_UINT16: + value_sz = sizeof (uint16_t); + break; + case DATA_TYPE_INT32: + value_sz = sizeof (int32_t); + break; + case DATA_TYPE_UINT32: + value_sz = sizeof (uint32_t); + break; + case DATA_TYPE_INT64: + value_sz = sizeof (int64_t); + break; + case DATA_TYPE_UINT64: + value_sz = sizeof (uint64_t); + break; +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: + value_sz = sizeof (double); + break; +#endif + case DATA_TYPE_STRING: + if (data == NULL) + value_sz = 0; + else + value_sz = strlen(data) + 1; + break; + case DATA_TYPE_BOOLEAN_ARRAY: + value_sz = (uint64_t)nelem * sizeof (boolean_t); + break; + case DATA_TYPE_BYTE_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uchar_t); + break; + case DATA_TYPE_INT8_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int8_t); + break; + case DATA_TYPE_UINT8_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint8_t); + break; + case DATA_TYPE_INT16_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int16_t); + break; + case DATA_TYPE_UINT16_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint16_t); + break; + case DATA_TYPE_INT32_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int32_t); + break; + case DATA_TYPE_UINT32_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint32_t); + break; + case DATA_TYPE_INT64_ARRAY: + value_sz = (uint64_t)nelem * sizeof (int64_t); + break; + case DATA_TYPE_UINT64_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint64_t); + break; + case DATA_TYPE_STRING_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint64_t); + + if (data != NULL) { + char *const *strs = data; + uint_t i; + + /* no alignment requirement for strings */ + for (i = 0; i < nelem; i++) { + if (strs[i] == NULL) + return (-1); + value_sz += strlen(strs[i]) + 1; + } + } + break; + case DATA_TYPE_HRTIME: + value_sz = sizeof (hrtime_t); + break; + case DATA_TYPE_NVLIST: + value_sz = NV_ALIGN(sizeof (nvlist_t)); + break; + case DATA_TYPE_NVLIST_ARRAY: + value_sz = (uint64_t)nelem * sizeof (uint64_t) + + (uint64_t)nelem * NV_ALIGN(sizeof (nvlist_t)); + break; + default: + return (-1); + } + + return (value_sz > INT32_MAX ? 
-1 : (int)value_sz); +} + +static int +nvlist_copy_embedded(nvlist_t *nvl, nvlist_t *onvl, nvlist_t *emb_nvl) +{ + nvpriv_t *priv; + int err; + + if ((priv = nv_priv_alloc_embedded((nvpriv_t *)(uintptr_t) + nvl->nvl_priv)) == NULL) + return (ENOMEM); + + nvlist_init(emb_nvl, onvl->nvl_nvflag, priv); + + if ((err = nvlist_copy_pairs(onvl, emb_nvl)) != 0) { + nvlist_free(emb_nvl); + emb_nvl->nvl_priv = 0; + } + + return (err); +} + +/* + * nvlist_add_common - Add new pair to nvlist + */ +static int +nvlist_add_common(nvlist_t *nvl, const char *name, + data_type_t type, uint_t nelem, const void *data) +{ + nvpair_t *nvp; + uint_t i; + + int nvp_sz, name_sz, value_sz; + int err = 0; + + if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) + return (EINVAL); + + if (nelem != 0 && data == NULL) + return (EINVAL); + + /* + * Verify type and nelem and get the value size. + * In case of data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY + * is the size of the string(s) included. + */ + if ((value_sz = i_get_value_size(type, data, nelem)) < 0) + return (EINVAL); + + if (i_validate_nvpair_value(type, nelem, data) != 0) + return (EINVAL); + + /* + * If we're adding an nvlist or nvlist array, ensure that we are not + * adding the input nvlist to itself, which would cause recursion, + * and ensure that no NULL nvlist pointers are present. + */ + switch (type) { + case DATA_TYPE_NVLIST: + if (data == nvl || data == NULL) + return (EINVAL); + break; + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **onvlp = (nvlist_t **)data; + for (i = 0; i < nelem; i++) { + if (onvlp[i] == nvl || onvlp[i] == NULL) + return (EINVAL); + } + break; + } + default: + break; + } + + /* calculate sizes of the nvpair elements and the nvpair itself */ + name_sz = strlen(name) + 1; + if (name_sz >= 1ULL << (sizeof (nvp->nvp_name_sz) * NBBY - 1)) + return (EINVAL); + + nvp_sz = NVP_SIZE_CALC(name_sz, value_sz); + + if ((nvp = nvp_buf_alloc(nvl, nvp_sz)) == NULL) + return (ENOMEM); + + ASSERT(nvp->nvp_size == nvp_sz); + nvp->nvp_name_sz = name_sz; + nvp->nvp_value_elem = nelem; + nvp->nvp_type = type; + bcopy(name, NVP_NAME(nvp), name_sz); + + switch (type) { + case DATA_TYPE_BOOLEAN: + break; + case DATA_TYPE_STRING_ARRAY: { + char *const *strs = data; + char *buf = NVP_VALUE(nvp); + char **cstrs = (void *)buf; + + /* skip pre-allocated space for pointer array */ + buf += nelem * sizeof (uint64_t); + for (i = 0; i < nelem; i++) { + int slen = strlen(strs[i]) + 1; + bcopy(strs[i], buf, slen); + cstrs[i] = buf; + buf += slen; + } + break; + } + case DATA_TYPE_NVLIST: { + nvlist_t *nnvl = EMBEDDED_NVL(nvp); + nvlist_t *onvl = (nvlist_t *)data; + + if ((err = nvlist_copy_embedded(nvl, onvl, nnvl)) != 0) { + nvp_buf_free(nvl, nvp); + return (err); + } + break; + } + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **onvlp = (nvlist_t **)data; + nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); + nvlist_t *embedded = (nvlist_t *) + ((uintptr_t)nvlp + nelem * sizeof (uint64_t)); + + for (i = 0; i < nelem; i++) { + if ((err = nvlist_copy_embedded(nvl, + onvlp[i], embedded)) != 0) { + /* + * Free any successfully created lists + */ + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } + + nvlp[i] = embedded++; + } + break; + } + default: + bcopy(data, NVP_VALUE(nvp), value_sz); + } + + /* if unique name, remove before add */ + if (nvl->nvl_nvflag & NV_UNIQUE_NAME) + (void) nvlist_remove_all(nvl, name); + else if (nvl->nvl_nvflag & NV_UNIQUE_NAME_TYPE) + (void) nvlist_remove(nvl, name, type); + + err = nvt_add_nvpair(nvl, nvp); + if (err != 
0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } + nvp_buf_link(nvl, nvp); + + return (0); +} + +int +nvlist_add_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN, 0, NULL)); +} + +int +nvlist_add_boolean_value(nvlist_t *nvl, const char *name, boolean_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_VALUE, 1, &val)); +} + +int +nvlist_add_byte(nvlist_t *nvl, const char *name, uchar_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE, 1, &val)); +} + +int +nvlist_add_int8(nvlist_t *nvl, const char *name, int8_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT8, 1, &val)); +} + +int +nvlist_add_uint8(nvlist_t *nvl, const char *name, uint8_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8, 1, &val)); +} + +int +nvlist_add_int16(nvlist_t *nvl, const char *name, int16_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT16, 1, &val)); +} + +int +nvlist_add_uint16(nvlist_t *nvl, const char *name, uint16_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16, 1, &val)); +} + +int +nvlist_add_int32(nvlist_t *nvl, const char *name, int32_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT32, 1, &val)); +} + +int +nvlist_add_uint32(nvlist_t *nvl, const char *name, uint32_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32, 1, &val)); +} + +int +nvlist_add_int64(nvlist_t *nvl, const char *name, int64_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT64, 1, &val)); +} + +int +nvlist_add_uint64(nvlist_t *nvl, const char *name, uint64_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64, 1, &val)); +} + +#if !defined(_KERNEL) +int +nvlist_add_double(nvlist_t *nvl, const char *name, double val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_DOUBLE, 1, &val)); +} +#endif + +int +nvlist_add_string(nvlist_t *nvl, const char *name, const char *val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_STRING, 1, (void *)val)); +} + +int +nvlist_add_boolean_array(nvlist_t *nvl, const char *name, + boolean_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BOOLEAN_ARRAY, n, a)); +} + +int +nvlist_add_byte_array(nvlist_t *nvl, const char *name, uchar_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); +} + +int +nvlist_add_int8_array(nvlist_t *nvl, const char *name, int8_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); +} + +int +nvlist_add_uint8_array(nvlist_t *nvl, const char *name, uint8_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); +} + +int +nvlist_add_int16_array(nvlist_t *nvl, const char *name, int16_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); +} + +int +nvlist_add_uint16_array(nvlist_t *nvl, const char *name, uint16_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); +} + +int +nvlist_add_int32_array(nvlist_t *nvl, const char *name, int32_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); +} + +int +nvlist_add_uint32_array(nvlist_t *nvl, const char *name, uint32_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); +} + +int +nvlist_add_int64_array(nvlist_t *nvl, const char *name, int64_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); +} + +int +nvlist_add_uint64_array(nvlist_t 
*nvl, const char *name, uint64_t *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); +} + +int +nvlist_add_string_array(nvlist_t *nvl, const char *name, + char *const *a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); +} + +int +nvlist_add_hrtime(nvlist_t *nvl, const char *name, hrtime_t val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_HRTIME, 1, &val)); +} + +int +nvlist_add_nvlist(nvlist_t *nvl, const char *name, nvlist_t *val) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST, 1, val)); +} + +int +nvlist_add_nvlist_array(nvlist_t *nvl, const char *name, nvlist_t **a, uint_t n) +{ + return (nvlist_add_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); +} + +/* reading name-value pairs */ +nvpair_t * +nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + curr = NVPAIR2I_NVP(nvp); + + /* + * Ensure that nvp is a valid nvpair on this nvlist. + * NB: nvp_curr is used only as a hint so that we don't always + * have to walk the list to determine if nvp is still on the list. + */ + if (nvp == NULL) + curr = priv->nvp_list; + else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) + curr = curr->nvi_next; + else + curr = NULL; + + priv->nvp_curr = curr; + + return (curr != NULL ? &curr->nvi_nvp : NULL); +} + +nvpair_t * +nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + curr = NVPAIR2I_NVP(nvp); + + if (nvp == NULL) + curr = priv->nvp_last; + else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) + curr = curr->nvi_prev; + else + curr = NULL; + + priv->nvp_curr = curr; + + return (curr != NULL ? &curr->nvi_nvp : NULL); +} + +boolean_t +nvlist_empty(nvlist_t *nvl) +{ + nvpriv_t *priv; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (B_TRUE); + + return (priv->nvp_list == NULL); +} + +char * +nvpair_name(nvpair_t *nvp) +{ + return (NVP_NAME(nvp)); +} + +data_type_t +nvpair_type(nvpair_t *nvp) +{ + return (NVP_TYPE(nvp)); +} + +int +nvpair_type_is_array(nvpair_t *nvp) +{ + data_type_t type = NVP_TYPE(nvp); + + if ((type == DATA_TYPE_BYTE_ARRAY) || + (type == DATA_TYPE_INT8_ARRAY) || + (type == DATA_TYPE_UINT8_ARRAY) || + (type == DATA_TYPE_INT16_ARRAY) || + (type == DATA_TYPE_UINT16_ARRAY) || + (type == DATA_TYPE_INT32_ARRAY) || + (type == DATA_TYPE_UINT32_ARRAY) || + (type == DATA_TYPE_INT64_ARRAY) || + (type == DATA_TYPE_UINT64_ARRAY) || + (type == DATA_TYPE_BOOLEAN_ARRAY) || + (type == DATA_TYPE_STRING_ARRAY) || + (type == DATA_TYPE_NVLIST_ARRAY)) + return (1); + return (0); + +} + +static int +nvpair_value_common(nvpair_t *nvp, data_type_t type, uint_t *nelem, void *data) +{ + int value_sz; + + if (nvp == NULL || nvpair_type(nvp) != type) + return (EINVAL); + + /* + * For non-array types, we copy the data. + * For array types (including string), we set a pointer. 
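+	 *
+	 * (Editorial note: the pointers returned for string and array
+	 * types alias the nvpair's own buffer, so they remain valid
+	 * only while the pair stays on its nvlist and must never be
+	 * freed by the caller.)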
+ */ + switch (type) { + case DATA_TYPE_BOOLEAN: + if (nelem != NULL) + *nelem = 0; + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + case DATA_TYPE_HRTIME: +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: +#endif + if (data == NULL) + return (EINVAL); + if ((value_sz = i_get_value_size(type, NULL, 1)) < 0) + return (EINVAL); + bcopy(NVP_VALUE(nvp), data, (size_t)value_sz); + if (nelem != NULL) + *nelem = 1; + break; + + case DATA_TYPE_NVLIST: + case DATA_TYPE_STRING: + if (data == NULL) + return (EINVAL); + *(void **)data = (void *)NVP_VALUE(nvp); + if (nelem != NULL) + *nelem = 1; + break; + + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_NVLIST_ARRAY: + if (nelem == NULL || data == NULL) + return (EINVAL); + if ((*nelem = NVP_NELEM(nvp)) != 0) + *(void **)data = (void *)NVP_VALUE(nvp); + else + *(void **)data = NULL; + break; + + default: + return (ENOTSUP); + } + + return (0); +} + +static int +nvlist_lookup_common(nvlist_t *nvl, const char *name, data_type_t type, + uint_t *nelem, void *data) +{ + if (name == NULL || nvl == NULL || nvl->nvl_priv == 0) + return (EINVAL); + + if (!(nvl->nvl_nvflag & (NV_UNIQUE_NAME | NV_UNIQUE_NAME_TYPE))) + return (ENOTSUP); + + nvpair_t *nvp = nvt_lookup_name_type(nvl, name, type); + if (nvp == NULL) + return (ENOENT); + + return (nvpair_value_common(nvp, type, nelem, data)); +} + +int +nvlist_lookup_boolean(nvlist_t *nvl, const char *name) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_BOOLEAN, NULL, NULL)); +} + +int +nvlist_lookup_boolean_value(nvlist_t *nvl, const char *name, boolean_t *val) +{ + return (nvlist_lookup_common(nvl, name, + DATA_TYPE_BOOLEAN_VALUE, NULL, val)); +} + +int +nvlist_lookup_byte(nvlist_t *nvl, const char *name, uchar_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE, NULL, val)); +} + +int +nvlist_lookup_int8(nvlist_t *nvl, const char *name, int8_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8, NULL, val)); +} + +int +nvlist_lookup_uint8(nvlist_t *nvl, const char *name, uint8_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8, NULL, val)); +} + +int +nvlist_lookup_int16(nvlist_t *nvl, const char *name, int16_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16, NULL, val)); +} + +int +nvlist_lookup_uint16(nvlist_t *nvl, const char *name, uint16_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16, NULL, val)); +} + +int +nvlist_lookup_int32(nvlist_t *nvl, const char *name, int32_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32, NULL, val)); +} + +int +nvlist_lookup_uint32(nvlist_t *nvl, const char *name, uint32_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32, NULL, val)); +} + +int +nvlist_lookup_int64(nvlist_t *nvl, const char *name, int64_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64, NULL, val)); +} + +int +nvlist_lookup_uint64(nvlist_t *nvl, const char *name, uint64_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64, NULL, val)); +} + 
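+/*
+ * Illustrative sketch (editorial, not upstream code) of the typed
+ * lookups above; they return ENOTSUP unless the list was allocated
+ * with NV_UNIQUE_NAME or NV_UNIQUE_NAME_TYPE, and ENOENT when the
+ * name (or name/type pair) is absent:
+ *
+ *	uint64_t guid;
+ *	char *name;
+ *	if (nvlist_lookup_uint64(nvl, "guid", &guid) == 0 &&
+ *	    nvlist_lookup_string(nvl, "name", &name) == 0) {
+ *		... both pairs were present with matching types ...
+ *	}
+ */
+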
+#if !defined(_KERNEL) +int +nvlist_lookup_double(nvlist_t *nvl, const char *name, double *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_DOUBLE, NULL, val)); +} +#endif + +int +nvlist_lookup_string(nvlist_t *nvl, const char *name, char **val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING, NULL, val)); +} + +int +nvlist_lookup_nvlist(nvlist_t *nvl, const char *name, nvlist_t **val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST, NULL, val)); +} + +int +nvlist_lookup_boolean_array(nvlist_t *nvl, const char *name, + boolean_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, + DATA_TYPE_BOOLEAN_ARRAY, n, a)); +} + +int +nvlist_lookup_byte_array(nvlist_t *nvl, const char *name, + uchar_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_BYTE_ARRAY, n, a)); +} + +int +nvlist_lookup_int8_array(nvlist_t *nvl, const char *name, int8_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT8_ARRAY, n, a)); +} + +int +nvlist_lookup_uint8_array(nvlist_t *nvl, const char *name, + uint8_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT8_ARRAY, n, a)); +} + +int +nvlist_lookup_int16_array(nvlist_t *nvl, const char *name, + int16_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT16_ARRAY, n, a)); +} + +int +nvlist_lookup_uint16_array(nvlist_t *nvl, const char *name, + uint16_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT16_ARRAY, n, a)); +} + +int +nvlist_lookup_int32_array(nvlist_t *nvl, const char *name, + int32_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT32_ARRAY, n, a)); +} + +int +nvlist_lookup_uint32_array(nvlist_t *nvl, const char *name, + uint32_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT32_ARRAY, n, a)); +} + +int +nvlist_lookup_int64_array(nvlist_t *nvl, const char *name, + int64_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_INT64_ARRAY, n, a)); +} + +int +nvlist_lookup_uint64_array(nvlist_t *nvl, const char *name, + uint64_t **a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_UINT64_ARRAY, n, a)); +} + +int +nvlist_lookup_string_array(nvlist_t *nvl, const char *name, + char ***a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_STRING_ARRAY, n, a)); +} + +int +nvlist_lookup_nvlist_array(nvlist_t *nvl, const char *name, + nvlist_t ***a, uint_t *n) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_NVLIST_ARRAY, n, a)); +} + +int +nvlist_lookup_hrtime(nvlist_t *nvl, const char *name, hrtime_t *val) +{ + return (nvlist_lookup_common(nvl, name, DATA_TYPE_HRTIME, NULL, val)); +} + +int +nvlist_lookup_pairs(nvlist_t *nvl, int flag, ...) +{ + va_list ap; + char *name; + int noentok = (flag & NV_FLAG_NOENTOK ? 
1 : 0); + int ret = 0; + + va_start(ap, flag); + while (ret == 0 && (name = va_arg(ap, char *)) != NULL) { + data_type_t type; + void *val; + uint_t *nelem; + + switch (type = va_arg(ap, data_type_t)) { + case DATA_TYPE_BOOLEAN: + ret = nvlist_lookup_common(nvl, name, type, NULL, NULL); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + case DATA_TYPE_BYTE: + case DATA_TYPE_INT8: + case DATA_TYPE_UINT8: + case DATA_TYPE_INT16: + case DATA_TYPE_UINT16: + case DATA_TYPE_INT32: + case DATA_TYPE_UINT32: + case DATA_TYPE_INT64: + case DATA_TYPE_UINT64: + case DATA_TYPE_HRTIME: + case DATA_TYPE_STRING: + case DATA_TYPE_NVLIST: +#if !defined(_KERNEL) + case DATA_TYPE_DOUBLE: +#endif + val = va_arg(ap, void *); + ret = nvlist_lookup_common(nvl, name, type, NULL, val); + break; + + case DATA_TYPE_BYTE_ARRAY: + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_INT8_ARRAY: + case DATA_TYPE_UINT8_ARRAY: + case DATA_TYPE_INT16_ARRAY: + case DATA_TYPE_UINT16_ARRAY: + case DATA_TYPE_INT32_ARRAY: + case DATA_TYPE_UINT32_ARRAY: + case DATA_TYPE_INT64_ARRAY: + case DATA_TYPE_UINT64_ARRAY: + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_NVLIST_ARRAY: + val = va_arg(ap, void *); + nelem = va_arg(ap, uint_t *); + ret = nvlist_lookup_common(nvl, name, type, nelem, val); + break; + + default: + ret = EINVAL; + } + + if (ret == ENOENT && noentok) + ret = 0; + } + va_end(ap); + + return (ret); +} + +/* + * Find the 'name'ed nvpair in the nvlist 'nvl'. If 'name' found, the function + * returns zero and a pointer to the matching nvpair is returned in '*ret' + * (given 'ret' is non-NULL). If 'sep' is specified then 'name' will penitrate + * multiple levels of embedded nvlists, with 'sep' as the separator. As an + * example, if sep is '.', name might look like: "a" or "a.b" or "a.c[3]" or + * "a.d[3].e[1]". This matches the C syntax for array embed (for convenience, + * code also supports "a.d[3]e[1]" syntax). + * + * If 'ip' is non-NULL and the last name component is an array, return the + * value of the "...[index]" array index in *ip. For an array reference that + * is not indexed, *ip will be returned as -1. If there is a syntax error in + * 'name', and 'ep' is non-NULL then *ep will be set to point to the location + * inside the 'name' string where the syntax error was detected. + */ +static int +nvlist_lookup_nvpair_ei_sep(nvlist_t *nvl, const char *name, const char sep, + nvpair_t **ret, int *ip, char **ep) +{ + nvpair_t *nvp; + const char *np; + char *sepp = NULL; + char *idxp, *idxep; + nvlist_t **nva; + long idx = 0; + int n; + + if (ip) + *ip = -1; /* not indexed */ + if (ep) + *ep = NULL; + + if ((nvl == NULL) || (name == NULL)) + return (EINVAL); + + sepp = NULL; + idx = 0; + /* step through components of name */ + for (np = name; np && *np; np = sepp) { + /* ensure unique names */ + if (!(nvl->nvl_nvflag & NV_UNIQUE_NAME)) + return (ENOTSUP); + + /* skip white space */ + skip_whitespace(np); + if (*np == 0) + break; + + /* set 'sepp' to end of current component 'np' */ + if (sep) + sepp = strchr(np, sep); + else + sepp = NULL; + + /* find start of next "[ index ]..." */ + idxp = strchr(np, '['); + + /* if sepp comes first, set idxp to NULL */ + if (sepp && idxp && (sepp < idxp)) + idxp = NULL; + + /* + * At this point 'idxp' is set if there is an index + * expected for the current component. 
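+		 *
+		 * (Editorial example: while parsing "a.d[3].e[1]" with
+		 * sep == '.', the component "d" leaves 'np' at
+		 * "d[3]...", 'idxp' just past the '[', and the code
+		 * below parses the index value 3.)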
+ */ + if (idxp) { + /* set 'n' to length of current 'np' name component */ + n = idxp++ - np; + + /* keep sepp up to date for *ep use as we advance */ + skip_whitespace(idxp); + sepp = idxp; + + /* determine the index value */ +#if defined(_KERNEL) + if (ddi_strtol(idxp, &idxep, 0, &idx)) + goto fail; +#else + idx = strtol(idxp, &idxep, 0); +#endif + if (idxep == idxp) + goto fail; + + /* keep sepp up to date for *ep use as we advance */ + sepp = idxep; + + /* skip white space index value and check for ']' */ + skip_whitespace(sepp); + if (*sepp++ != ']') + goto fail; + + /* for embedded arrays, support C syntax: "a[1].b" */ + skip_whitespace(sepp); + if (sep && (*sepp == sep)) + sepp++; + } else if (sepp) { + n = sepp++ - np; + } else { + n = strlen(np); + } + + /* trim trailing whitespace by reducing length of 'np' */ + if (n == 0) + goto fail; + for (n--; (np[n] == ' ') || (np[n] == '\t'); n--) + ; + n++; + + /* skip whitespace, and set sepp to NULL if complete */ + if (sepp) { + skip_whitespace(sepp); + if (*sepp == 0) + sepp = NULL; + } + + /* + * At this point: + * o 'n' is the length of current 'np' component. + * o 'idxp' is set if there was an index, and value 'idx'. + * o 'sepp' is set to the beginning of the next component, + * and set to NULL if we have no more components. + * + * Search for nvpair with matching component name. + */ + for (nvp = nvlist_next_nvpair(nvl, NULL); nvp != NULL; + nvp = nvlist_next_nvpair(nvl, nvp)) { + + /* continue if no match on name */ + if (strncmp(np, nvpair_name(nvp), n) || + (strlen(nvpair_name(nvp)) != n)) + continue; + + /* if indexed, verify type is array oriented */ + if (idxp && !nvpair_type_is_array(nvp)) + goto fail; + + /* + * Full match found, return nvp and idx if this + * was the last component. + */ + if (sepp == NULL) { + if (ret) + *ret = nvp; + if (ip && idxp) + *ip = (int)idx; /* return index */ + return (0); /* found */ + } + + /* + * More components: current match must be + * of DATA_TYPE_NVLIST or DATA_TYPE_NVLIST_ARRAY + * to support going deeper. + */ + if (nvpair_type(nvp) == DATA_TYPE_NVLIST) { + nvl = EMBEDDED_NVL(nvp); + break; + } else if (nvpair_type(nvp) == DATA_TYPE_NVLIST_ARRAY) { + (void) nvpair_value_nvlist_array(nvp, + &nva, (uint_t *)&n); + if ((n < 0) || (idx >= n)) + goto fail; + nvl = nva[idx]; + break; + } + + /* type does not support more levels */ + goto fail; + } + if (nvp == NULL) + goto fail; /* 'name' not found */ + + /* search for match of next component in embedded 'nvl' list */ + } + +fail: if (ep && sepp) + *ep = sepp; + return (EINVAL); +} + +/* + * Return pointer to nvpair with specified 'name'. + */ +int +nvlist_lookup_nvpair(nvlist_t *nvl, const char *name, nvpair_t **ret) +{ + return (nvlist_lookup_nvpair_ei_sep(nvl, name, 0, ret, NULL, NULL)); +} + +/* + * Determine if named nvpair exists in nvlist (use embedded separator of '.' + * and return array index). See nvlist_lookup_nvpair_ei_sep for more detailed + * description. 
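+ *
+ * Illustrative sketch (the nvlist 'cfg' and the name "vdevs[2]" are
+ * hypothetical, not part of this file):
+ *
+ *     nvpair_t *nvp;
+ *     int idx;
+ *     if (nvlist_lookup_nvpair_embedded_index(cfg, "vdevs[2]",
+ *         &nvp, &idx, NULL) == 0)
+ *             ... nvp now names the "vdevs" array pair and idx is 2,
+ *             since the last (and only) component is indexed ...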
+ */ +int nvlist_lookup_nvpair_embedded_index(nvlist_t *nvl, + const char *name, nvpair_t **ret, int *ip, char **ep) +{ + return (nvlist_lookup_nvpair_ei_sep(nvl, name, '.', ret, ip, ep)); +} + +boolean_t +nvlist_exists(nvlist_t *nvl, const char *name) +{ + nvpriv_t *priv; + nvpair_t *nvp; + i_nvp_t *curr; + + if (name == NULL || nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (B_FALSE); + + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { + nvp = &curr->nvi_nvp; + + if (strcmp(name, NVP_NAME(nvp)) == 0) + return (B_TRUE); + } + + return (B_FALSE); +} + +int +nvpair_value_boolean_value(nvpair_t *nvp, boolean_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_VALUE, NULL, val)); +} + +int +nvpair_value_byte(nvpair_t *nvp, uchar_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BYTE, NULL, val)); +} + +int +nvpair_value_int8(nvpair_t *nvp, int8_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT8, NULL, val)); +} + +int +nvpair_value_uint8(nvpair_t *nvp, uint8_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT8, NULL, val)); +} + +int +nvpair_value_int16(nvpair_t *nvp, int16_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT16, NULL, val)); +} + +int +nvpair_value_uint16(nvpair_t *nvp, uint16_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT16, NULL, val)); +} + +int +nvpair_value_int32(nvpair_t *nvp, int32_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT32, NULL, val)); +} + +int +nvpair_value_uint32(nvpair_t *nvp, uint32_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT32, NULL, val)); +} + +int +nvpair_value_int64(nvpair_t *nvp, int64_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT64, NULL, val)); +} + +int +nvpair_value_uint64(nvpair_t *nvp, uint64_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT64, NULL, val)); +} + +#if !defined(_KERNEL) +int +nvpair_value_double(nvpair_t *nvp, double *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_DOUBLE, NULL, val)); +} +#endif + +int +nvpair_value_string(nvpair_t *nvp, char **val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_STRING, NULL, val)); +} + +int +nvpair_value_nvlist(nvpair_t *nvp, nvlist_t **val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_NVLIST, NULL, val)); +} + +int +nvpair_value_boolean_array(nvpair_t *nvp, boolean_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BOOLEAN_ARRAY, nelem, val)); +} + +int +nvpair_value_byte_array(nvpair_t *nvp, uchar_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_BYTE_ARRAY, nelem, val)); +} + +int +nvpair_value_int8_array(nvpair_t *nvp, int8_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT8_ARRAY, nelem, val)); +} + +int +nvpair_value_uint8_array(nvpair_t *nvp, uint8_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT8_ARRAY, nelem, val)); +} + +int +nvpair_value_int16_array(nvpair_t *nvp, int16_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT16_ARRAY, nelem, val)); +} + +int +nvpair_value_uint16_array(nvpair_t *nvp, uint16_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT16_ARRAY, nelem, val)); +} + +int +nvpair_value_int32_array(nvpair_t *nvp, int32_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT32_ARRAY, nelem, val)); +} + +int +nvpair_value_uint32_array(nvpair_t *nvp, uint32_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, 
DATA_TYPE_UINT32_ARRAY, nelem, val)); +} + +int +nvpair_value_int64_array(nvpair_t *nvp, int64_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_INT64_ARRAY, nelem, val)); +} + +int +nvpair_value_uint64_array(nvpair_t *nvp, uint64_t **val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_UINT64_ARRAY, nelem, val)); +} + +int +nvpair_value_string_array(nvpair_t *nvp, char ***val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_STRING_ARRAY, nelem, val)); +} + +int +nvpair_value_nvlist_array(nvpair_t *nvp, nvlist_t ***val, uint_t *nelem) +{ + return (nvpair_value_common(nvp, DATA_TYPE_NVLIST_ARRAY, nelem, val)); +} + +int +nvpair_value_hrtime(nvpair_t *nvp, hrtime_t *val) +{ + return (nvpair_value_common(nvp, DATA_TYPE_HRTIME, NULL, val)); +} + +/* + * Add specified pair to the list. + */ +int +nvlist_add_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + if (nvl == NULL || nvp == NULL) + return (EINVAL); + + return (nvlist_add_common(nvl, NVP_NAME(nvp), NVP_TYPE(nvp), + NVP_NELEM(nvp), NVP_VALUE(nvp))); +} + +/* + * Merge the supplied nvlists and put the result in dst. + * The merged list will contain all names specified in both lists, + * the values are taken from nvl in the case of duplicates. + * Return 0 on success. + */ +/*ARGSUSED*/ +int +nvlist_merge(nvlist_t *dst, nvlist_t *nvl, int flag) +{ + if (nvl == NULL || dst == NULL) + return (EINVAL); + + if (dst != nvl) + return (nvlist_copy_pairs(nvl, dst)); + + return (0); +} + +/* + * Encoding related routines + */ +#define NVS_OP_ENCODE 0 +#define NVS_OP_DECODE 1 +#define NVS_OP_GETSIZE 2 + +typedef struct nvs_ops nvs_ops_t; + +typedef struct { + int nvs_op; + const nvs_ops_t *nvs_ops; + void *nvs_private; + nvpriv_t *nvs_priv; + int nvs_recursion; +} nvstream_t; + +/* + * nvs operations are: + * - nvs_nvlist + * encoding / decoding of an nvlist header (nvlist_t) + * calculates the size used for header and end detection + * + * - nvs_nvpair + * responsible for the first part of encoding / decoding of an nvpair + * calculates the decoded size of an nvpair + * + * - nvs_nvp_op + * second part of encoding / decoding of an nvpair + * + * - nvs_nvp_size + * calculates the encoding size of an nvpair + * + * - nvs_nvl_fini + * encodes the end detection mark (zeros). 
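+ *
+ * As a sketch of how these hook together: an encode pass runs
+ * nvs_operation(), which emits the header via nvs_nvlist(), then
+ * calls nvs_nvpair() (which in turn calls nvs_nvp_op()) for each
+ * pair in the list, and finally nvs_nvl_fini() to write the end
+ * detection mark.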
+ */ +struct nvs_ops { + int (*nvs_nvlist)(nvstream_t *, nvlist_t *, size_t *); + int (*nvs_nvpair)(nvstream_t *, nvpair_t *, size_t *); + int (*nvs_nvp_op)(nvstream_t *, nvpair_t *); + int (*nvs_nvp_size)(nvstream_t *, nvpair_t *, size_t *); + int (*nvs_nvl_fini)(nvstream_t *); +}; + +typedef struct { + char nvh_encoding; /* nvs encoding method */ + char nvh_endian; /* nvs endian */ + char nvh_reserved1; /* reserved for future use */ + char nvh_reserved2; /* reserved for future use */ +} nvs_header_t; + +static int +nvs_encode_pairs(nvstream_t *nvs, nvlist_t *nvl) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr; + + /* + * Walk nvpair in list and encode each nvpair + */ + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) + if (nvs->nvs_ops->nvs_nvpair(nvs, &curr->nvi_nvp, NULL) != 0) + return (EFAULT); + + return (nvs->nvs_ops->nvs_nvl_fini(nvs)); +} + +static int +nvs_decode_pairs(nvstream_t *nvs, nvlist_t *nvl) +{ + nvpair_t *nvp; + size_t nvsize; + int err; + + /* + * Get decoded size of next pair in stream, alloc + * memory for nvpair_t, then decode the nvpair + */ + while ((err = nvs->nvs_ops->nvs_nvpair(nvs, NULL, &nvsize)) == 0) { + if (nvsize == 0) /* end of list */ + break; + + /* make sure len makes sense */ + if (nvsize < NVP_SIZE_CALC(1, 0)) + return (EFAULT); + + if ((nvp = nvp_buf_alloc(nvl, nvsize)) == NULL) + return (ENOMEM); + + if ((err = nvs->nvs_ops->nvs_nvp_op(nvs, nvp)) != 0) { + nvp_buf_free(nvl, nvp); + return (err); + } + + if (i_validate_nvpair(nvp) != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (EFAULT); + } + + err = nvt_add_nvpair(nvl, nvp); + if (err != 0) { + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (err); + } + nvp_buf_link(nvl, nvp); + } + return (err); +} + +static int +nvs_getsize_pairs(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) +{ + nvpriv_t *priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv; + i_nvp_t *curr; + uint64_t nvsize = *buflen; + size_t size; + + /* + * Get encoded size of nvpairs in nvlist + */ + for (curr = priv->nvp_list; curr != NULL; curr = curr->nvi_next) { + if (nvs->nvs_ops->nvs_nvp_size(nvs, &curr->nvi_nvp, &size) != 0) + return (EINVAL); + + if ((nvsize += size) > INT32_MAX) + return (EINVAL); + } + + *buflen = nvsize; + return (0); +} + +static int +nvs_operation(nvstream_t *nvs, nvlist_t *nvl, size_t *buflen) +{ + int err; + + if (nvl->nvl_priv == 0) + return (EFAULT); + + /* + * Perform the operation, starting with header, then each nvpair + */ + if ((err = nvs->nvs_ops->nvs_nvlist(nvs, nvl, buflen)) != 0) + return (err); + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + err = nvs_encode_pairs(nvs, nvl); + break; + + case NVS_OP_DECODE: + err = nvs_decode_pairs(nvs, nvl); + break; + + case NVS_OP_GETSIZE: + err = nvs_getsize_pairs(nvs, nvl, buflen); + break; + + default: + err = EINVAL; + } + + return (err); +} + +static int +nvs_embedded(nvstream_t *nvs, nvlist_t *embedded) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: { + int err; + + if (nvs->nvs_recursion >= nvpair_max_recursion) + return (EINVAL); + nvs->nvs_recursion++; + err = nvs_operation(nvs, embedded, NULL); + nvs->nvs_recursion--; + return (err); + } + case NVS_OP_DECODE: { + nvpriv_t *priv; + int err; + + if (embedded->nvl_version != NV_VERSION) + return (ENOTSUP); + + if ((priv = nv_priv_alloc_embedded(nvs->nvs_priv)) == NULL) + return (ENOMEM); + + nvlist_init(embedded, embedded->nvl_nvflag, priv); + + if (nvs->nvs_recursion >= nvpair_max_recursion) { + nvlist_free(embedded); + return 
(EINVAL); + } + nvs->nvs_recursion++; + if ((err = nvs_operation(nvs, embedded, NULL)) != 0) + nvlist_free(embedded); + nvs->nvs_recursion--; + return (err); + } + default: + break; + } + + return (EINVAL); +} + +static int +nvs_embedded_nvl_array(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + size_t nelem = NVP_NELEM(nvp); + nvlist_t **nvlp = EMBEDDED_NVL_ARRAY(nvp); + int i; + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + for (i = 0; i < nelem; i++) + if (nvs_embedded(nvs, nvlp[i]) != 0) + return (EFAULT); + break; + + case NVS_OP_DECODE: { + size_t len = nelem * sizeof (uint64_t); + nvlist_t *embedded = (nvlist_t *)((uintptr_t)nvlp + len); + + bzero(nvlp, len); /* don't trust packed data */ + for (i = 0; i < nelem; i++) { + if (nvs_embedded(nvs, embedded) != 0) { + nvpair_free(nvp); + return (EFAULT); + } + + nvlp[i] = embedded++; + } + break; + } + case NVS_OP_GETSIZE: { + uint64_t nvsize = 0; + + for (i = 0; i < nelem; i++) { + size_t nvp_sz = 0; + + if (nvs_operation(nvs, nvlp[i], &nvp_sz) != 0) + return (EINVAL); + + if ((nvsize += nvp_sz) > INT32_MAX) + return (EINVAL); + } + + *size = nvsize; + break; + } + default: + return (EINVAL); + } + + return (0); +} + +static int nvs_native(nvstream_t *, nvlist_t *, char *, size_t *); +static int nvs_xdr(nvstream_t *, nvlist_t *, char *, size_t *); + +/* + * Common routine for nvlist operations: + * encode, decode, getsize (encoded size). + */ +static int +nvlist_common(nvlist_t *nvl, char *buf, size_t *buflen, int encoding, + int nvs_op) +{ + int err = 0; + nvstream_t nvs; + int nvl_endian; +#if defined(_ZFS_LITTLE_ENDIAN) + int host_endian = 1; +#elif defined(_ZFS_BIG_ENDIAN) + int host_endian = 0; +#else +#error "No endian defined!" +#endif /* _ZFS_LITTLE_ENDIAN */ + nvs_header_t *nvh; + + if (buflen == NULL || nvl == NULL || + (nvs.nvs_priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (EINVAL); + + nvs.nvs_op = nvs_op; + nvs.nvs_recursion = 0; + + /* + * For NVS_OP_ENCODE and NVS_OP_DECODE make sure an nvlist and + * a buffer is allocated. The first 4 bytes in the buffer are + * used for encoding method and host endian. 
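+ * Concretely, given the nvs_header_t layout above: byte 0 carries
+ * the encoding method (NV_ENCODE_NATIVE or NV_ENCODE_XDR), byte 1
+ * the endianness of the encoder (1 on little endian hosts, 0 on big
+ * endian hosts), and bytes 2-3 are reserved and written as zero.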
+ */ + switch (nvs_op) { + case NVS_OP_ENCODE: + if (buf == NULL || *buflen < sizeof (nvs_header_t)) + return (EINVAL); + + nvh = (void *)buf; + nvh->nvh_encoding = encoding; + nvh->nvh_endian = nvl_endian = host_endian; + nvh->nvh_reserved1 = 0; + nvh->nvh_reserved2 = 0; + break; + + case NVS_OP_DECODE: + if (buf == NULL || *buflen < sizeof (nvs_header_t)) + return (EINVAL); + + /* get method of encoding from first byte */ + nvh = (void *)buf; + encoding = nvh->nvh_encoding; + nvl_endian = nvh->nvh_endian; + break; + + case NVS_OP_GETSIZE: + nvl_endian = host_endian; + + /* + * add the size for encoding + */ + *buflen = sizeof (nvs_header_t); + break; + + default: + return (ENOTSUP); + } + + /* + * Create an nvstream with proper encoding method + */ + switch (encoding) { + case NV_ENCODE_NATIVE: + /* + * check endianness, in case we are unpacking + * from a file + */ + if (nvl_endian != host_endian) + return (ENOTSUP); + err = nvs_native(&nvs, nvl, buf, buflen); + break; + case NV_ENCODE_XDR: + err = nvs_xdr(&nvs, nvl, buf, buflen); + break; + default: + err = ENOTSUP; + break; + } + + return (err); +} + +int +nvlist_size(nvlist_t *nvl, size_t *size, int encoding) +{ + return (nvlist_common(nvl, NULL, size, encoding, NVS_OP_GETSIZE)); +} + +/* + * Pack nvlist into contiguous memory + */ +int +nvlist_pack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, + int kmflag) +{ + return (nvlist_xpack(nvl, bufp, buflen, encoding, + nvlist_nv_alloc(kmflag))); +} + +int +nvlist_xpack(nvlist_t *nvl, char **bufp, size_t *buflen, int encoding, + nv_alloc_t *nva) +{ + nvpriv_t nvpriv; + size_t alloc_size; + char *buf; + int err; + + if (nva == NULL || nvl == NULL || bufp == NULL || buflen == NULL) + return (EINVAL); + + if (*bufp != NULL) + return (nvlist_common(nvl, *bufp, buflen, encoding, + NVS_OP_ENCODE)); + + /* + * Here is a difficult situation: + * 1. The nvlist has fixed allocator properties. + * All other nvlist routines (like nvlist_add_*, ...) use + * these properties. + * 2. When using nvlist_pack() the user can specify their own + * allocator properties (e.g. by using KM_NOSLEEP). + * + * We use the user specified properties (2). A clearer solution + * will be to remove the kmflag from nvlist_pack(), but we will + * not change the interface. + */ + nv_priv_init(&nvpriv, nva, 0); + + if ((err = nvlist_size(nvl, &alloc_size, encoding))) + return (err); + + if ((buf = nv_mem_zalloc(&nvpriv, alloc_size)) == NULL) + return (ENOMEM); + + if ((err = nvlist_common(nvl, buf, &alloc_size, encoding, + NVS_OP_ENCODE)) != 0) { + nv_mem_free(&nvpriv, buf, alloc_size); + } else { + *buflen = alloc_size; + *bufp = buf; + } + + return (err); +} + +/* + * Unpack buf into an nvlist_t + */ +int +nvlist_unpack(char *buf, size_t buflen, nvlist_t **nvlp, int kmflag) +{ + return (nvlist_xunpack(buf, buflen, nvlp, nvlist_nv_alloc(kmflag))); +} + +int +nvlist_xunpack(char *buf, size_t buflen, nvlist_t **nvlp, nv_alloc_t *nva) +{ + nvlist_t *nvl; + int err; + + if (nvlp == NULL) + return (EINVAL); + + if ((err = nvlist_xalloc(&nvl, 0, nva)) != 0) + return (err); + + if ((err = nvlist_common(nvl, buf, &buflen, NV_ENCODE_NATIVE, + NVS_OP_DECODE)) != 0) + nvlist_free(nvl); + else + *nvlp = nvl; + + return (err); +} + +/* + * Native encoding functions + */ +typedef struct { + /* + * This structure is used when decoding a packed nvpair in + * the native format. n_base points to a buffer containing the + * packed nvpair. n_end is a pointer to the end of the buffer. 
+ * (n_end actually points to the first byte past the end of the + * buffer.) n_curr is a pointer that lies between n_base and n_end. + * It points to the current data that we are decoding. + * The amount of data left in the buffer is equal to n_end - n_curr. + * n_flag is used to recognize a packed embedded list. + */ + caddr_t n_base; + caddr_t n_end; + caddr_t n_curr; + uint_t n_flag; +} nvs_native_t; + +static int +nvs_native_create(nvstream_t *nvs, nvs_native_t *native, char *buf, + size_t buflen) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: + nvs->nvs_private = native; + native->n_curr = native->n_base = buf; + native->n_end = buf + buflen; + native->n_flag = 0; + return (0); + + case NVS_OP_GETSIZE: + nvs->nvs_private = native; + native->n_curr = native->n_base = native->n_end = NULL; + native->n_flag = 0; + return (0); + default: + return (EINVAL); + } +} + +/*ARGSUSED*/ +static void +nvs_native_destroy(nvstream_t *nvs) +{ +} + +static int +native_cp(nvstream_t *nvs, void *buf, size_t size) +{ + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + + if (native->n_curr + size > native->n_end) + return (EFAULT); + + /* + * The bcopy() below eliminates alignment requirement + * on the buffer (stream) and is preferred over direct access. + */ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + bcopy(buf, native->n_curr, size); + break; + case NVS_OP_DECODE: + bcopy(native->n_curr, buf, size); + break; + default: + return (EINVAL); + } + + native->n_curr += size; + return (0); +} + +/* + * operate on nvlist_t header + */ +static int +nvs_native_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size) +{ + nvs_native_t *native = nvs->nvs_private; + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + case NVS_OP_DECODE: + if (native->n_flag) + return (0); /* packed embedded list */ + + native->n_flag = 1; + + /* copy version and nvflag of the nvlist_t */ + if (native_cp(nvs, &nvl->nvl_version, sizeof (int32_t)) != 0 || + native_cp(nvs, &nvl->nvl_nvflag, sizeof (int32_t)) != 0) + return (EFAULT); + + return (0); + + case NVS_OP_GETSIZE: + /* + * if calculate for packed embedded list + * 4 for end of the embedded list + * else + * 2 * sizeof (int32_t) for nvl_version and nvl_nvflag + * and 4 for end of the entire list + */ + if (native->n_flag) { + *size += 4; + } else { + native->n_flag = 1; + *size += 2 * sizeof (int32_t) + 4; + } + + return (0); + + default: + return (EINVAL); + } +} + +static int +nvs_native_nvl_fini(nvstream_t *nvs) +{ + if (nvs->nvs_op == NVS_OP_ENCODE) { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + /* + * Add 4 zero bytes at end of nvlist. They are used + * for end detection by the decode routine. + */ + if (native->n_curr + sizeof (int) > native->n_end) + return (EFAULT); + + bzero(native->n_curr, sizeof (int)); + native->n_curr += sizeof (int); + } + + return (0); +} + +static int +nvpair_native_embedded(nvstream_t *nvs, nvpair_t *nvp) +{ + if (nvs->nvs_op == NVS_OP_ENCODE) { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + nvlist_t *packed = (void *) + (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp)); + /* + * Null out the pointer that is meaningless in the packed + * structure. The address may not be aligned, so we have + * to use bzero. 
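+ * (nvl_priv holds a pointer to the in-memory allocator state,
+ * which is meaningless outside this address space; zeroing it
+ * also avoids embedding a raw pointer value in the packed
+ * stream.)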
+ */
+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv),
+ sizeof (uint64_t));
+ }
+
+ return (nvs_embedded(nvs, EMBEDDED_NVL(nvp)));
+}
+
+static int
+nvpair_native_embedded_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ char *value = native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp);
+ size_t len = NVP_NELEM(nvp) * sizeof (uint64_t);
+ nvlist_t *packed = (nvlist_t *)((uintptr_t)value + len);
+ int i;
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(value, len);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++, packed++)
+ /*
+ * Null out the pointer that is meaningless in the
+ * packed structure. The address may not be aligned,
+ * so we have to use bzero.
+ */
+ bzero((char *)packed + offsetof(nvlist_t, nvl_priv),
+ sizeof (uint64_t));
+ }
+
+ return (nvs_embedded_nvl_array(nvs, nvp, NULL));
+}
+
+static void
+nvpair_native_string_array(nvstream_t *nvs, nvpair_t *nvp)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE: {
+ nvs_native_t *native = (nvs_native_t *)nvs->nvs_private;
+ uint64_t *strp = (void *)
+ (native->n_curr - nvp->nvp_size + NVP_VALOFF(nvp));
+ /*
+ * Null out pointers that are meaningless in the packed
+ * structure. The addresses may not be aligned, so we have
+ * to use bzero.
+ */
+ bzero(strp, NVP_NELEM(nvp) * sizeof (uint64_t));
+ break;
+ }
+ case NVS_OP_DECODE: {
+ char **strp = (void *)NVP_VALUE(nvp);
+ char *buf = ((char *)strp + NVP_NELEM(nvp) * sizeof (uint64_t));
+ int i;
+
+ for (i = 0; i < NVP_NELEM(nvp); i++) {
+ strp[i] = buf;
+ buf += strlen(buf) + 1;
+ }
+ break;
+ }
+ }
+}
+
+static int
+nvs_native_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ int value_sz;
+ int ret = 0;
+
+ /*
+ * We do the initial bcopy of the data before we look at
+ * the nvpair type, because when we're decoding, we won't
+ * have the correct values for the pair until we do the bcopy.
+ */
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ if (native_cp(nvs, nvp, nvp->nvp_size) != 0)
+ return (EFAULT);
+ break;
+ default:
+ return (EINVAL);
+ }
+
+ /* verify nvp_name_sz, check the name string length */
+ if (i_validate_nvpair_name(nvp) != 0)
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY the
+ * size of the string(s) is excluded.
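+ * (For instance, a DATA_TYPE_UINT64_ARRAY with four elements yields
+ * a value_sz of 32 bytes, while a string array is counted here only
+ * as its nelem pointer slots; the string bytes themselves are
+ * already accounted for in nvp_size.)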
+ */ + if ((value_sz = i_get_value_size(type, NULL, NVP_NELEM(nvp))) < 0) + return (EFAULT); + + if (NVP_SIZE_CALC(nvp->nvp_name_sz, value_sz) > nvp->nvp_size) + return (EFAULT); + + switch (type) { + case DATA_TYPE_NVLIST: + ret = nvpair_native_embedded(nvs, nvp); + break; + case DATA_TYPE_NVLIST_ARRAY: + ret = nvpair_native_embedded_array(nvs, nvp); + break; + case DATA_TYPE_STRING_ARRAY: + nvpair_native_string_array(nvs, nvp); + break; + default: + break; + } + + return (ret); +} + +static int +nvs_native_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + uint64_t nvp_sz = nvp->nvp_size; + + switch (NVP_TYPE(nvp)) { + case DATA_TYPE_NVLIST: { + size_t nvsize = 0; + + if (nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize) != 0) + return (EINVAL); + + nvp_sz += nvsize; + break; + } + case DATA_TYPE_NVLIST_ARRAY: { + size_t nvsize; + + if (nvs_embedded_nvl_array(nvs, nvp, &nvsize) != 0) + return (EINVAL); + + nvp_sz += nvsize; + break; + } + default: + break; + } + + if (nvp_sz > INT32_MAX) + return (EINVAL); + + *size = nvp_sz; + + return (0); +} + +static int +nvs_native_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: + return (nvs_native_nvp_op(nvs, nvp)); + + case NVS_OP_DECODE: { + nvs_native_t *native = (nvs_native_t *)nvs->nvs_private; + int32_t decode_len; + + /* try to read the size value from the stream */ + if (native->n_curr + sizeof (int32_t) > native->n_end) + return (EFAULT); + bcopy(native->n_curr, &decode_len, sizeof (int32_t)); + + /* sanity check the size value */ + if (decode_len < 0 || + decode_len > native->n_end - native->n_curr) + return (EFAULT); + + *size = decode_len; + + /* + * If at the end of the stream then move the cursor + * forward, otherwise nvpair_native_op() will read + * the entire nvpair at the same cursor position. 
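+ * (A decode_len of zero is the end-of-list sentinel written by
+ * nvs_native_nvl_fini() during encoding.)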
+ */
+ if (*size == 0)
+ native->n_curr += sizeof (int32_t);
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ return (0);
+}
+
+static const nvs_ops_t nvs_native_ops = {
+ .nvs_nvlist = nvs_native_nvlist,
+ .nvs_nvpair = nvs_native_nvpair,
+ .nvs_nvp_op = nvs_native_nvp_op,
+ .nvs_nvp_size = nvs_native_nvp_size,
+ .nvs_nvl_fini = nvs_native_nvl_fini
+};
+
+static int
+nvs_native(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen)
+{
+ nvs_native_t native;
+ int err;
+
+ nvs->nvs_ops = &nvs_native_ops;
+
+ if ((err = nvs_native_create(nvs, &native, buf + sizeof (nvs_header_t),
+ *buflen - sizeof (nvs_header_t))) != 0)
+ return (err);
+
+ err = nvs_operation(nvs, nvl, buflen);
+
+ nvs_native_destroy(nvs);
+
+ return (err);
+}
+
+/*
+ * XDR encoding functions
+ *
+ * An xdr packed nvlist is encoded as:
+ *
+ * - encoding method and host endian (4 bytes)
+ * - nvl_version (4 bytes)
+ * - nvl_nvflag (4 bytes)
+ *
+ * - encoded nvpairs, the format of one xdr encoded nvpair is:
+ * - encoded size of the nvpair (4 bytes)
+ * - decoded size of the nvpair (4 bytes)
+ * - name string (4 + NV_ALIGN4(strlen(string)) bytes);
+ * a string is coded as size (4 bytes) and data
+ * - data type (4 bytes)
+ * - number of elements in the nvpair (4 bytes)
+ * - data
+ *
+ * - 2 zeros for end of the entire list (8 bytes)
+ */
+static int
+nvs_xdr_create(nvstream_t *nvs, XDR *xdr, char *buf, size_t buflen)
+{
+ /* xdr data must be 4 byte aligned */
+ if ((ulong_t)buf % 4 != 0)
+ return (EFAULT);
+
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_ENCODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_DECODE:
+ xdrmem_create(xdr, buf, (uint_t)buflen, XDR_DECODE);
+ nvs->nvs_private = xdr;
+ return (0);
+ case NVS_OP_GETSIZE:
+ nvs->nvs_private = NULL;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+}
+
+static void
+nvs_xdr_destroy(nvstream_t *nvs)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE:
+ xdr_destroy((XDR *)nvs->nvs_private);
+ break;
+ default:
+ break;
+ }
+}
+
+static int
+nvs_xdr_nvlist(nvstream_t *nvs, nvlist_t *nvl, size_t *size)
+{
+ switch (nvs->nvs_op) {
+ case NVS_OP_ENCODE:
+ case NVS_OP_DECODE: {
+ XDR *xdr = nvs->nvs_private;
+
+ if (!xdr_int(xdr, &nvl->nvl_version) ||
+ !xdr_u_int(xdr, &nvl->nvl_nvflag))
+ return (EFAULT);
+ break;
+ }
+ case NVS_OP_GETSIZE: {
+ /*
+ * 2 * 4 for nvl_version + nvl_nvflag
+ * and 8 for end of the entire list
+ */
+ *size += 2 * 4 + 8;
+ break;
+ }
+ default:
+ return (EINVAL);
+ }
+ return (0);
+}
+
+static int
+nvs_xdr_nvl_fini(nvstream_t *nvs)
+{
+ if (nvs->nvs_op == NVS_OP_ENCODE) {
+ XDR *xdr = nvs->nvs_private;
+ int zero = 0;
+
+ if (!xdr_int(xdr, &zero) || !xdr_int(xdr, &zero))
+ return (EFAULT);
+ }
+
+ return (0);
+}
+
+/*
+ * The format of xdr encoded nvpair is:
+ * encode_size, decode_size, name string, data type, nelem, data
+ */
+static int
+nvs_xdr_nvp_op(nvstream_t *nvs, nvpair_t *nvp)
+{
+ data_type_t type;
+ char *buf;
+ char *buf_end = (char *)nvp + nvp->nvp_size;
+ int value_sz;
+ uint_t nelem, buflen;
+ bool_t ret = FALSE;
+ XDR *xdr = nvs->nvs_private;
+
+ ASSERT(xdr != NULL && nvp != NULL);
+
+ /* name string */
+ if ((buf = NVP_NAME(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (!xdr_string(xdr, &buf, buflen - 1))
+ return (EFAULT);
+ nvp->nvp_name_sz = strlen(buf) + 1;
+
+ /* type and nelem */
+ if (!xdr_int(xdr, (int *)&nvp->nvp_type) ||
+ !xdr_int(xdr, &nvp->nvp_value_elem))
+ return (EFAULT);
+
+ type = NVP_TYPE(nvp);
+ nelem = nvp->nvp_value_elem;
+
+ /*
+ * Verify type and nelem and get the value size.
+ * For data types DATA_TYPE_STRING and DATA_TYPE_STRING_ARRAY the
+ * size of the string(s) is excluded.
+ */
+ if ((value_sz = i_get_value_size(type, NULL, nelem)) < 0)
+ return (EFAULT);
+
+ /* if there is no data to extract then return */
+ if (nelem == 0)
+ return (0);
+
+ /* value */
+ if ((buf = NVP_VALUE(nvp)) >= buf_end)
+ return (EFAULT);
+ buflen = buf_end - buf;
+
+ if (buflen < value_sz)
+ return (EFAULT);
+
+ switch (type) {
+ case DATA_TYPE_NVLIST:
+ if (nvs_embedded(nvs, (void *)buf) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_NVLIST_ARRAY:
+ if (nvs_embedded_nvl_array(nvs, nvp, NULL) == 0)
+ return (0);
+ break;
+
+ case DATA_TYPE_BOOLEAN:
+ ret = TRUE;
+ break;
+
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ ret = xdr_char(xdr, buf);
+ break;
+
+ case DATA_TYPE_INT16:
+ ret = xdr_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT16:
+ ret = xdr_u_short(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_INT32:
+ ret = xdr_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT32:
+ ret = xdr_u_int(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_INT64:
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_UINT64:
+ ret = xdr_u_longlong_t(xdr, (void *)buf);
+ break;
+
+ case DATA_TYPE_HRTIME:
+ /*
+ * NOTE: must expose the definition of hrtime_t here
+ */
+ ret = xdr_longlong_t(xdr, (void *)buf);
+ break;
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+ ret = xdr_double(xdr, (void *)buf);
+ break;
+#endif
+ case DATA_TYPE_STRING:
+ ret = xdr_string(xdr, &buf, buflen - 1);
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ ret = xdr_opaque(xdr, buf, nelem);
+ break;
+
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen, sizeof (int8_t),
+ (xdrproc_t)xdr_char);
+ break;
+
+ case DATA_TYPE_INT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int16_t),
+ sizeof (int16_t), (xdrproc_t)xdr_short);
+ break;
+
+ case DATA_TYPE_UINT16_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint16_t),
+ sizeof (uint16_t), (xdrproc_t)xdr_u_short);
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int32_t),
+ sizeof (int32_t), (xdrproc_t)xdr_int);
+ break;
+
+ case DATA_TYPE_UINT32_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint32_t),
+ sizeof (uint32_t), (xdrproc_t)xdr_u_int);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (int64_t),
+ sizeof (int64_t), (xdrproc_t)xdr_longlong_t);
+ break;
+
+ case DATA_TYPE_UINT64_ARRAY:
+ ret = xdr_array(xdr, &buf, &nelem, buflen / sizeof (uint64_t),
+ sizeof (uint64_t), (xdrproc_t)xdr_u_longlong_t);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ size_t len = nelem * sizeof (uint64_t);
+ char **strp = (void *)buf;
+ int i;
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ bzero(buf, len); /* don't trust packed data */
+
+ for (i = 0; i < nelem; i++) {
+ if (buflen <= len)
+ return (EFAULT);
+
+ buf += len;
+ buflen -= len;
+
+ if (xdr_string(xdr, &buf, buflen - 1) != TRUE)
+ return (EFAULT);
+
+ if (nvs->nvs_op == NVS_OP_DECODE)
+ strp[i] = buf;
+ len = strlen(buf) + 1;
+ }
+ ret = TRUE;
+ break;
+ }
+ default:
+ break;
+ }
+
+ return (ret == TRUE ? 0 : EFAULT);
+}
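+
+/*
+ * Worked example for the size computation below, using a hypothetical
+ * DATA_TYPE_UINT32 pair named "prop": 4 (encode_size) + 4 (decode_size)
+ * + 4 + NV_ALIGN4(4) (name string) + 4 (type) + 4 (nelem) + 4 (value)
+ * = 28 bytes of xdr encoding.
+ */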
+
+static int
+nvs_xdr_nvp_size(nvstream_t *nvs, nvpair_t *nvp, size_t *size)
+{
+ data_type_t type = NVP_TYPE(nvp);
+ /*
+ * encode_size + decode_size + name string size + data type + nelem
+ * where name string size = 4 + NV_ALIGN4(strlen(NVP_NAME(nvp)))
+ */
+ uint64_t nvp_sz = 4 + 4 + 4 + NV_ALIGN4(strlen(NVP_NAME(nvp))) + 4 + 4;
+
+ switch (type) {
+ case DATA_TYPE_BOOLEAN:
+ break;
+
+ case DATA_TYPE_BOOLEAN_VALUE:
+ case DATA_TYPE_BYTE:
+ case DATA_TYPE_INT8:
+ case DATA_TYPE_UINT8:
+ case DATA_TYPE_INT16:
+ case DATA_TYPE_UINT16:
+ case DATA_TYPE_INT32:
+ case DATA_TYPE_UINT32:
+ nvp_sz += 4; /* 4 is the minimum xdr unit */
+ break;
+
+ case DATA_TYPE_INT64:
+ case DATA_TYPE_UINT64:
+ case DATA_TYPE_HRTIME:
+#if !defined(_KERNEL)
+ case DATA_TYPE_DOUBLE:
+#endif
+ nvp_sz += 8;
+ break;
+
+ case DATA_TYPE_STRING:
+ nvp_sz += 4 + NV_ALIGN4(strlen((char *)NVP_VALUE(nvp)));
+ break;
+
+ case DATA_TYPE_BYTE_ARRAY:
+ nvp_sz += NV_ALIGN4(NVP_NELEM(nvp));
+ break;
+
+ case DATA_TYPE_BOOLEAN_ARRAY:
+ case DATA_TYPE_INT8_ARRAY:
+ case DATA_TYPE_UINT8_ARRAY:
+ case DATA_TYPE_INT16_ARRAY:
+ case DATA_TYPE_UINT16_ARRAY:
+ case DATA_TYPE_INT32_ARRAY:
+ case DATA_TYPE_UINT32_ARRAY:
+ nvp_sz += 4 + 4 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_INT64_ARRAY:
+ case DATA_TYPE_UINT64_ARRAY:
+ nvp_sz += 4 + 8 * (uint64_t)NVP_NELEM(nvp);
+ break;
+
+ case DATA_TYPE_STRING_ARRAY: {
+ int i;
+ char **strs = (void *)NVP_VALUE(nvp);
+
+ for (i = 0; i < NVP_NELEM(nvp); i++)
+ nvp_sz += 4 + NV_ALIGN4(strlen(strs[i]));
+
+ break;
+ }
+
+ case DATA_TYPE_NVLIST:
+ case DATA_TYPE_NVLIST_ARRAY: {
+ size_t nvsize = 0;
+ int old_nvs_op = nvs->nvs_op;
+ int err;
+
+ nvs->nvs_op = NVS_OP_GETSIZE;
+ if (type == DATA_TYPE_NVLIST)
+ err = nvs_operation(nvs, EMBEDDED_NVL(nvp), &nvsize);
+ else
+ err = nvs_embedded_nvl_array(nvs, nvp, &nvsize);
+ nvs->nvs_op = old_nvs_op;
+
+ if (err != 0)
+ return (EINVAL);
+
+ nvp_sz += nvsize;
+ break;
+ }
+
+ default:
+ return (EINVAL);
+ }
+
+ if (nvp_sz > INT32_MAX)
+ return (EINVAL);
+
+ *size = nvp_sz;
+
+ return (0);
+}
+
+
+/*
+ * The NVS_XDR_MAX_LEN macro takes a packed xdr buffer of size x and estimates
+ * the largest nvpair that could be encoded in the buffer.
+ *
+ * See comments above nvs_xdr_nvp_op() for the format of xdr encoding.
+ * The size of an xdr packed nvpair without any data is 5 words.
+ *
+ * Using the size of the data directly as an estimate would be ok
+ * in all cases except one. If the data type is DATA_TYPE_STRING_ARRAY
+ * then the actual nvpair has space for an array of pointers to index
+ * the strings. These pointers are not encoded into the packed xdr buffer.
+ *
+ * If the data is of type DATA_TYPE_STRING_ARRAY and all the strings are
+ * of length 0, then each string is encoded in xdr format as a single word.
+ * Therefore when expanded to an nvpair there will be 2.25 words used for
+ * each string. (an int64_t allocated for pointer usage, and a single char
+ * for the null termination.)
+ *
+ * This is the calculation performed by the NVS_XDR_MAX_LEN macro.
+ */
+#define NVS_XDR_HDR_LEN ((size_t)(5 * 4))
+#define NVS_XDR_DATA_LEN(y) (((size_t)(y) <= NVS_XDR_HDR_LEN) ?
\ + 0 : ((size_t)(y) - NVS_XDR_HDR_LEN)) +#define NVS_XDR_MAX_LEN(x) (NVP_SIZE_CALC(1, 0) + \ + (NVS_XDR_DATA_LEN(x) * 2) + \ + NV_ALIGN4((NVS_XDR_DATA_LEN(x) / 4))) + +static int +nvs_xdr_nvpair(nvstream_t *nvs, nvpair_t *nvp, size_t *size) +{ + XDR *xdr = nvs->nvs_private; + int32_t encode_len, decode_len; + + switch (nvs->nvs_op) { + case NVS_OP_ENCODE: { + size_t nvsize; + + if (nvs_xdr_nvp_size(nvs, nvp, &nvsize) != 0) + return (EFAULT); + + decode_len = nvp->nvp_size; + encode_len = nvsize; + if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) + return (EFAULT); + + return (nvs_xdr_nvp_op(nvs, nvp)); + } + case NVS_OP_DECODE: { + struct xdr_bytesrec bytesrec; + + /* get the encode and decode size */ + if (!xdr_int(xdr, &encode_len) || !xdr_int(xdr, &decode_len)) + return (EFAULT); + *size = decode_len; + + /* are we at the end of the stream? */ + if (*size == 0) + return (0); + + /* sanity check the size parameter */ + if (!xdr_control(xdr, XDR_GET_BYTES_AVAIL, &bytesrec)) + return (EFAULT); + + if (*size > NVS_XDR_MAX_LEN(bytesrec.xc_num_avail)) + return (EFAULT); + break; + } + + default: + return (EINVAL); + } + return (0); +} + +static const struct nvs_ops nvs_xdr_ops = { + .nvs_nvlist = nvs_xdr_nvlist, + .nvs_nvpair = nvs_xdr_nvpair, + .nvs_nvp_op = nvs_xdr_nvp_op, + .nvs_nvp_size = nvs_xdr_nvp_size, + .nvs_nvl_fini = nvs_xdr_nvl_fini +}; + +static int +nvs_xdr(nvstream_t *nvs, nvlist_t *nvl, char *buf, size_t *buflen) +{ + XDR xdr; + int err; + + nvs->nvs_ops = &nvs_xdr_ops; + + if ((err = nvs_xdr_create(nvs, &xdr, buf + sizeof (nvs_header_t), + *buflen - sizeof (nvs_header_t))) != 0) + return (err); + + err = nvs_operation(nvs, nvl, buflen); + + nvs_xdr_destroy(nvs); + + return (err); +} + +#if defined(_KERNEL) +static int __init +nvpair_init(void) +{ + return (0); +} + +static void __exit +nvpair_fini(void) +{ +} + +module_init(nvpair_init); +module_exit(nvpair_fini); +#endif + +ZFS_MODULE_DESCRIPTION("Generic name/value pair implementation"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); + +EXPORT_SYMBOL(nv_alloc_init); +EXPORT_SYMBOL(nv_alloc_reset); +EXPORT_SYMBOL(nv_alloc_fini); + +/* list management */ +EXPORT_SYMBOL(nvlist_alloc); +EXPORT_SYMBOL(nvlist_free); +EXPORT_SYMBOL(nvlist_size); +EXPORT_SYMBOL(nvlist_pack); +EXPORT_SYMBOL(nvlist_unpack); +EXPORT_SYMBOL(nvlist_dup); +EXPORT_SYMBOL(nvlist_merge); + +EXPORT_SYMBOL(nvlist_xalloc); +EXPORT_SYMBOL(nvlist_xpack); +EXPORT_SYMBOL(nvlist_xunpack); +EXPORT_SYMBOL(nvlist_xdup); +EXPORT_SYMBOL(nvlist_lookup_nv_alloc); + +EXPORT_SYMBOL(nvlist_add_nvpair); +EXPORT_SYMBOL(nvlist_add_boolean); +EXPORT_SYMBOL(nvlist_add_boolean_value); +EXPORT_SYMBOL(nvlist_add_byte); +EXPORT_SYMBOL(nvlist_add_int8); +EXPORT_SYMBOL(nvlist_add_uint8); +EXPORT_SYMBOL(nvlist_add_int16); +EXPORT_SYMBOL(nvlist_add_uint16); +EXPORT_SYMBOL(nvlist_add_int32); +EXPORT_SYMBOL(nvlist_add_uint32); +EXPORT_SYMBOL(nvlist_add_int64); +EXPORT_SYMBOL(nvlist_add_uint64); +EXPORT_SYMBOL(nvlist_add_string); +EXPORT_SYMBOL(nvlist_add_nvlist); +EXPORT_SYMBOL(nvlist_add_boolean_array); +EXPORT_SYMBOL(nvlist_add_byte_array); +EXPORT_SYMBOL(nvlist_add_int8_array); +EXPORT_SYMBOL(nvlist_add_uint8_array); +EXPORT_SYMBOL(nvlist_add_int16_array); +EXPORT_SYMBOL(nvlist_add_uint16_array); +EXPORT_SYMBOL(nvlist_add_int32_array); +EXPORT_SYMBOL(nvlist_add_uint32_array); +EXPORT_SYMBOL(nvlist_add_int64_array); +EXPORT_SYMBOL(nvlist_add_uint64_array); 
+EXPORT_SYMBOL(nvlist_add_string_array); +EXPORT_SYMBOL(nvlist_add_nvlist_array); +EXPORT_SYMBOL(nvlist_next_nvpair); +EXPORT_SYMBOL(nvlist_prev_nvpair); +EXPORT_SYMBOL(nvlist_empty); +EXPORT_SYMBOL(nvlist_add_hrtime); + +EXPORT_SYMBOL(nvlist_remove); +EXPORT_SYMBOL(nvlist_remove_nvpair); +EXPORT_SYMBOL(nvlist_remove_all); + +EXPORT_SYMBOL(nvlist_lookup_boolean); +EXPORT_SYMBOL(nvlist_lookup_boolean_value); +EXPORT_SYMBOL(nvlist_lookup_byte); +EXPORT_SYMBOL(nvlist_lookup_int8); +EXPORT_SYMBOL(nvlist_lookup_uint8); +EXPORT_SYMBOL(nvlist_lookup_int16); +EXPORT_SYMBOL(nvlist_lookup_uint16); +EXPORT_SYMBOL(nvlist_lookup_int32); +EXPORT_SYMBOL(nvlist_lookup_uint32); +EXPORT_SYMBOL(nvlist_lookup_int64); +EXPORT_SYMBOL(nvlist_lookup_uint64); +EXPORT_SYMBOL(nvlist_lookup_string); +EXPORT_SYMBOL(nvlist_lookup_nvlist); +EXPORT_SYMBOL(nvlist_lookup_boolean_array); +EXPORT_SYMBOL(nvlist_lookup_byte_array); +EXPORT_SYMBOL(nvlist_lookup_int8_array); +EXPORT_SYMBOL(nvlist_lookup_uint8_array); +EXPORT_SYMBOL(nvlist_lookup_int16_array); +EXPORT_SYMBOL(nvlist_lookup_uint16_array); +EXPORT_SYMBOL(nvlist_lookup_int32_array); +EXPORT_SYMBOL(nvlist_lookup_uint32_array); +EXPORT_SYMBOL(nvlist_lookup_int64_array); +EXPORT_SYMBOL(nvlist_lookup_uint64_array); +EXPORT_SYMBOL(nvlist_lookup_string_array); +EXPORT_SYMBOL(nvlist_lookup_nvlist_array); +EXPORT_SYMBOL(nvlist_lookup_hrtime); +EXPORT_SYMBOL(nvlist_lookup_pairs); + +EXPORT_SYMBOL(nvlist_lookup_nvpair); +EXPORT_SYMBOL(nvlist_exists); + +/* processing nvpair */ +EXPORT_SYMBOL(nvpair_name); +EXPORT_SYMBOL(nvpair_type); +EXPORT_SYMBOL(nvpair_value_boolean_value); +EXPORT_SYMBOL(nvpair_value_byte); +EXPORT_SYMBOL(nvpair_value_int8); +EXPORT_SYMBOL(nvpair_value_uint8); +EXPORT_SYMBOL(nvpair_value_int16); +EXPORT_SYMBOL(nvpair_value_uint16); +EXPORT_SYMBOL(nvpair_value_int32); +EXPORT_SYMBOL(nvpair_value_uint32); +EXPORT_SYMBOL(nvpair_value_int64); +EXPORT_SYMBOL(nvpair_value_uint64); +EXPORT_SYMBOL(nvpair_value_string); +EXPORT_SYMBOL(nvpair_value_nvlist); +EXPORT_SYMBOL(nvpair_value_boolean_array); +EXPORT_SYMBOL(nvpair_value_byte_array); +EXPORT_SYMBOL(nvpair_value_int8_array); +EXPORT_SYMBOL(nvpair_value_uint8_array); +EXPORT_SYMBOL(nvpair_value_int16_array); +EXPORT_SYMBOL(nvpair_value_uint16_array); +EXPORT_SYMBOL(nvpair_value_int32_array); +EXPORT_SYMBOL(nvpair_value_uint32_array); +EXPORT_SYMBOL(nvpair_value_int64_array); +EXPORT_SYMBOL(nvpair_value_uint64_array); +EXPORT_SYMBOL(nvpair_value_string_array); +EXPORT_SYMBOL(nvpair_value_nvlist_array); +EXPORT_SYMBOL(nvpair_value_hrtime); diff --git a/module/nvpair/nvpair_alloc_fixed.c b/module/nvpair/nvpair_alloc_fixed.c new file mode 100644 index 000000000000..c8a604a2bfac --- /dev/null +++ b/module/nvpair/nvpair_alloc_fixed.c @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include + +/* + * This allocator is very simple. + * - it uses a pre-allocated buffer for memory allocations. + * - it does _not_ free memory in the pre-allocated buffer. + * + * The reason for the selected implementation is simplicity. + * This allocator is designed for the usage in interrupt context when + * the caller may not wait for free memory. + */ + +/* pre-allocated buffer for memory allocations */ +typedef struct nvbuf { + uintptr_t nvb_buf; /* address of pre-allocated buffer */ + uintptr_t nvb_lim; /* limit address in the buffer */ + uintptr_t nvb_cur; /* current address in the buffer */ +} nvbuf_t; + +/* + * Initialize the pre-allocated buffer allocator. The caller needs to supply + * + * buf address of pre-allocated buffer + * bufsz size of pre-allocated buffer + * + * nv_fixed_init() calculates the remaining members of nvbuf_t. + */ +static int +nv_fixed_init(nv_alloc_t *nva, va_list valist) +{ + uintptr_t base = va_arg(valist, uintptr_t); + uintptr_t lim = base + va_arg(valist, size_t); + nvbuf_t *nvb = (nvbuf_t *)P2ROUNDUP(base, sizeof (uintptr_t)); + + if (base == 0 || (uintptr_t)&nvb[1] > lim) + return (EINVAL); + + nvb->nvb_buf = (uintptr_t)&nvb[0]; + nvb->nvb_cur = (uintptr_t)&nvb[1]; + nvb->nvb_lim = lim; + nva->nva_arg = nvb; + + return (0); +} + +static void * +nv_fixed_alloc(nv_alloc_t *nva, size_t size) +{ + nvbuf_t *nvb = nva->nva_arg; + uintptr_t new = nvb->nvb_cur; + + if (size == 0 || new + size > nvb->nvb_lim) + return (NULL); + + nvb->nvb_cur = P2ROUNDUP(new + size, sizeof (uintptr_t)); + + return ((void *)new); +} + +/*ARGSUSED*/ +static void +nv_fixed_free(nv_alloc_t *nva, void *buf, size_t size) +{ + /* don't free memory in the pre-allocated buffer */ +} + +static void +nv_fixed_reset(nv_alloc_t *nva) +{ + nvbuf_t *nvb = nva->nva_arg; + + nvb->nvb_cur = (uintptr_t)&nvb[1]; +} + +const nv_alloc_ops_t nv_fixed_ops_def = { + .nv_ao_init = nv_fixed_init, + .nv_ao_fini = NULL, + .nv_ao_alloc = nv_fixed_alloc, + .nv_ao_free = nv_fixed_free, + .nv_ao_reset = nv_fixed_reset +}; + +const nv_alloc_ops_t *nv_fixed_ops = &nv_fixed_ops_def; + +#if defined(_KERNEL) +EXPORT_SYMBOL(nv_fixed_ops); +#endif diff --git a/module/nvpair/nvpair_alloc_spl.c b/module/nvpair/nvpair_alloc_spl.c new file mode 100644 index 000000000000..ed8fa4d09402 --- /dev/null +++ b/module/nvpair/nvpair_alloc_spl.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at * usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include + +static void * +nv_alloc_sleep_spl(nv_alloc_t *nva, size_t size) +{ + return (vmem_alloc(size, KM_SLEEP)); +} + +static void * +nv_alloc_pushpage_spl(nv_alloc_t *nva, size_t size) +{ + return (vmem_alloc(size, KM_PUSHPAGE)); +} + +static void * +nv_alloc_nosleep_spl(nv_alloc_t *nva, size_t size) +{ + return (kmem_alloc(size, KM_NOSLEEP)); +} + +static void +nv_free_spl(nv_alloc_t *nva, void *buf, size_t size) +{ + kmem_free(buf, size); +} + +const nv_alloc_ops_t spl_sleep_ops_def = { + .nv_ao_init = NULL, + .nv_ao_fini = NULL, + .nv_ao_alloc = nv_alloc_sleep_spl, + .nv_ao_free = nv_free_spl, + .nv_ao_reset = NULL +}; + +const nv_alloc_ops_t spl_pushpage_ops_def = { + .nv_ao_init = NULL, + .nv_ao_fini = NULL, + .nv_ao_alloc = nv_alloc_pushpage_spl, + .nv_ao_free = nv_free_spl, + .nv_ao_reset = NULL +}; + +const nv_alloc_ops_t spl_nosleep_ops_def = { + .nv_ao_init = NULL, + .nv_ao_fini = NULL, + .nv_ao_alloc = nv_alloc_nosleep_spl, + .nv_ao_free = nv_free_spl, + .nv_ao_reset = NULL +}; + +nv_alloc_t nv_alloc_sleep_def = { + &spl_sleep_ops_def, + NULL +}; + +nv_alloc_t nv_alloc_pushpage_def = { + &spl_pushpage_ops_def, + NULL +}; + +nv_alloc_t nv_alloc_nosleep_def = { + &spl_nosleep_ops_def, + NULL +}; + +nv_alloc_t *nv_alloc_sleep = &nv_alloc_sleep_def; +nv_alloc_t *nv_alloc_pushpage = &nv_alloc_pushpage_def; +nv_alloc_t *nv_alloc_nosleep = &nv_alloc_nosleep_def; diff --git a/module/os/freebsd/spl/acl_common.c b/module/os/freebsd/spl/acl_common.c new file mode 100644 index 000000000000..66e27cefa396 --- /dev/null +++ b/module/os/freebsd/spl/acl_common.c @@ -0,0 +1,1709 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#if defined(_KERNEL) +#include +#include +#include +#include +#include +#else +#include +#include +#include +#include +#include +#include +#include +#include +#include +#define ASSERT assert +#endif + +#define ACE_POSIX_SUPPORTED_BITS (ACE_READ_DATA | \ + ACE_WRITE_DATA | ACE_APPEND_DATA | ACE_EXECUTE | \ + ACE_READ_ATTRIBUTES | ACE_READ_ACL | ACE_WRITE_ACL) + + +#define ACL_SYNCHRONIZE_SET_DENY 0x0000001 +#define ACL_SYNCHRONIZE_SET_ALLOW 0x0000002 +#define ACL_SYNCHRONIZE_ERR_DENY 0x0000004 +#define ACL_SYNCHRONIZE_ERR_ALLOW 0x0000008 + +#define ACL_WRITE_OWNER_SET_DENY 0x0000010 +#define ACL_WRITE_OWNER_SET_ALLOW 0x0000020 +#define ACL_WRITE_OWNER_ERR_DENY 0x0000040 +#define ACL_WRITE_OWNER_ERR_ALLOW 0x0000080 + +#define ACL_DELETE_SET_DENY 0x0000100 +#define ACL_DELETE_SET_ALLOW 0x0000200 +#define ACL_DELETE_ERR_DENY 0x0000400 +#define ACL_DELETE_ERR_ALLOW 0x0000800 + +#define ACL_WRITE_ATTRS_OWNER_SET_DENY 0x0001000 +#define ACL_WRITE_ATTRS_OWNER_SET_ALLOW 0x0002000 +#define ACL_WRITE_ATTRS_OWNER_ERR_DENY 0x0004000 +#define ACL_WRITE_ATTRS_OWNER_ERR_ALLOW 0x0008000 + +#define ACL_WRITE_ATTRS_WRITER_SET_DENY 0x0010000 +#define ACL_WRITE_ATTRS_WRITER_SET_ALLOW 0x0020000 +#define ACL_WRITE_ATTRS_WRITER_ERR_DENY 0x0040000 +#define ACL_WRITE_ATTRS_WRITER_ERR_ALLOW 0x0080000 + +#define ACL_WRITE_NAMED_WRITER_SET_DENY 0x0100000 +#define ACL_WRITE_NAMED_WRITER_SET_ALLOW 0x0200000 +#define ACL_WRITE_NAMED_WRITER_ERR_DENY 0x0400000 +#define ACL_WRITE_NAMED_WRITER_ERR_ALLOW 0x0800000 + +#define ACL_READ_NAMED_READER_SET_DENY 0x1000000 +#define ACL_READ_NAMED_READER_SET_ALLOW 0x2000000 +#define ACL_READ_NAMED_READER_ERR_DENY 0x4000000 +#define ACL_READ_NAMED_READER_ERR_ALLOW 0x8000000 + + +#define ACE_VALID_MASK_BITS (\ + ACE_READ_DATA | \ + ACE_LIST_DIRECTORY | \ + ACE_WRITE_DATA | \ + ACE_ADD_FILE | \ + ACE_APPEND_DATA | \ + ACE_ADD_SUBDIRECTORY | \ + ACE_READ_NAMED_ATTRS | \ + ACE_WRITE_NAMED_ATTRS | \ + ACE_EXECUTE | \ + ACE_DELETE_CHILD | \ + ACE_READ_ATTRIBUTES | \ + ACE_WRITE_ATTRIBUTES | \ + ACE_DELETE | \ + ACE_READ_ACL | \ + ACE_WRITE_ACL | \ + ACE_WRITE_OWNER | \ + ACE_SYNCHRONIZE) + +#define ACE_MASK_UNDEFINED 0x80000000 + +#define ACE_VALID_FLAG_BITS (ACE_FILE_INHERIT_ACE | \ + ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE | ACE_INHERIT_ONLY_ACE | \ + ACE_SUCCESSFUL_ACCESS_ACE_FLAG | ACE_FAILED_ACCESS_ACE_FLAG | \ + ACE_IDENTIFIER_GROUP | ACE_OWNER | ACE_GROUP | ACE_EVERYONE) + +/* + * ACL conversion helpers + */ + +typedef enum { + ace_unused, + ace_user_obj, + ace_user, + ace_group, /* includes GROUP and GROUP_OBJ */ + ace_other_obj +} ace_to_aent_state_t; + +typedef struct acevals { + uid_t key; + avl_node_t avl; + uint32_t mask; + uint32_t allowed; + uint32_t denied; + int aent_type; +} acevals_t; + +typedef struct ace_list { + acevals_t user_obj; + avl_tree_t user; + int numusers; + acevals_t group_obj; + avl_tree_t group; + int numgroups; + acevals_t other_obj; + uint32_t acl_mask; + int hasmask; + int dfacl_flag; + ace_to_aent_state_t state; + int seen; /* bitmask of all aclent_t a_type values seen */ +} ace_list_t; + +/* + * Generic shellsort, from K&R (1st ed, p 58.), somewhat modified. 
+ * v = Ptr to array/vector of objs + * n = # objs in the array + * s = size of each obj (must be multiples of a word size) + * f = ptr to function to compare two objs + * returns (-1 = less than, 0 = equal, 1 = greater than + */ +void +ksort(caddr_t v, int n, int s, int (*f)(void *, void *)) +{ + int g, i, j, ii; + unsigned int *p1, *p2; + unsigned int tmp; + + /* No work to do */ + if (v == NULL || n <= 1) + return; + + /* Sanity check on arguments */ + ASSERT(((uintptr_t)v & 0x3) == 0 && (s & 0x3) == 0); + ASSERT(s > 0); + for (g = n / 2; g > 0; g /= 2) { + for (i = g; i < n; i++) { + for (j = i - g; j >= 0 && + (*f)(v + j * s, v + (j + g) * s) == 1; + j -= g) { + p1 = (void *)(v + j * s); + p2 = (void *)(v + (j + g) * s); + for (ii = 0; ii < s / 4; ii++) { + tmp = *p1; + *p1++ = *p2; + *p2++ = tmp; + } + } + } + } +} + +/* + * Compare two acls, all fields. Returns: + * -1 (less than) + * 0 (equal) + * +1 (greater than) + */ +int +cmp2acls(void *a, void *b) +{ + aclent_t *x = (aclent_t *)a; + aclent_t *y = (aclent_t *)b; + + /* Compare types */ + if (x->a_type < y->a_type) + return (-1); + if (x->a_type > y->a_type) + return (1); + /* Equal types; compare id's */ + if (x->a_id < y->a_id) + return (-1); + if (x->a_id > y->a_id) + return (1); + /* Equal ids; compare perms */ + if (x->a_perm < y->a_perm) + return (-1); + if (x->a_perm > y->a_perm) + return (1); + /* Totally equal */ + return (0); +} + +static int +cacl_malloc(void **ptr, size_t size) +{ + *ptr = kmem_zalloc(size, KM_SLEEP); + return (0); +} + + +#if !defined(_KERNEL) +acl_t * +acl_alloc(enum acl_type type) +{ + acl_t *aclp; + + if (cacl_malloc((void **)&aclp, sizeof (acl_t)) != 0) + return (NULL); + + aclp->acl_aclp = NULL; + aclp->acl_cnt = 0; + + switch (type) { + case ACE_T: + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + break; + case ACLENT_T: + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + break; + default: + acl_free(aclp); + aclp = NULL; + } + return (aclp); +} + +/* + * Free acl_t structure + */ +void +acl_free(acl_t *aclp) +{ + int acl_size; + + if (aclp == NULL) + return; + + if (aclp->acl_aclp) { + acl_size = aclp->acl_cnt * aclp->acl_entry_size; + cacl_free(aclp->acl_aclp, acl_size); + } + + cacl_free(aclp, sizeof (acl_t)); +} + +static uint32_t +access_mask_set(int haswriteperm, int hasreadperm, int isowner, int isallow) +{ + uint32_t access_mask = 0; + int acl_produce; + int synchronize_set = 0, write_owner_set = 0; + int delete_set = 0, write_attrs_set = 0; + int read_named_set = 0, write_named_set = 0; + + acl_produce = (ACL_SYNCHRONIZE_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_WRITER_SET_DENY); + + if (isallow) { + synchronize_set = ACL_SYNCHRONIZE_SET_ALLOW; + write_owner_set = ACL_WRITE_OWNER_SET_ALLOW; + delete_set = ACL_DELETE_SET_ALLOW; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_ALLOW; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + else if (haswriteperm) + write_attrs_set = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + } else { + + synchronize_set = ACL_SYNCHRONIZE_SET_DENY; + write_owner_set = ACL_WRITE_OWNER_SET_DENY; + delete_set = ACL_DELETE_SET_DENY; + if (hasreadperm) + read_named_set = ACL_READ_NAMED_READER_SET_DENY; + if (haswriteperm) + write_named_set = ACL_WRITE_NAMED_WRITER_SET_DENY; + if (isowner) + write_attrs_set = ACL_WRITE_ATTRS_OWNER_SET_DENY; + else if (haswriteperm) + write_attrs_set = 
ACL_WRITE_ATTRS_WRITER_SET_DENY; + else + /* + * If the entity is not the owner and does not + * have write permissions ACE_WRITE_ATTRIBUTES will + * always go in the DENY ACE. + */ + access_mask |= ACE_WRITE_ATTRIBUTES; + } + + if (acl_produce & synchronize_set) + access_mask |= ACE_SYNCHRONIZE; + if (acl_produce & write_owner_set) + access_mask |= ACE_WRITE_OWNER; + if (acl_produce & delete_set) + access_mask |= ACE_DELETE; + if (acl_produce & write_attrs_set) + access_mask |= ACE_WRITE_ATTRIBUTES; + if (acl_produce & read_named_set) + access_mask |= ACE_READ_NAMED_ATTRS; + if (acl_produce & write_named_set) + access_mask |= ACE_WRITE_NAMED_ATTRS; + + return (access_mask); +} + +/* + * Given an mode_t, convert it into an access_mask as used + * by nfsace, assuming aclent_t -> nfsace semantics. + */ +static uint32_t +mode_to_ace_access(mode_t mode, boolean_t isdir, int isowner, int isallow) +{ + uint32_t access = 0; + int haswriteperm = 0; + int hasreadperm = 0; + + if (isallow) { + haswriteperm = (mode & S_IWOTH); + hasreadperm = (mode & S_IROTH); + } else { + haswriteperm = !(mode & S_IWOTH); + hasreadperm = !(mode & S_IROTH); + } + + /* + * The following call takes care of correctly setting the following + * mask bits in the access_mask: + * ACE_SYNCHRONIZE, ACE_WRITE_OWNER, ACE_DELETE, + * ACE_WRITE_ATTRIBUTES, ACE_WRITE_NAMED_ATTRS, ACE_READ_NAMED_ATTRS + */ + access = access_mask_set(haswriteperm, hasreadperm, isowner, isallow); + + if (isallow) { + access |= ACE_READ_ACL | ACE_READ_ATTRIBUTES; + if (isowner) + access |= ACE_WRITE_ACL; + } else { + if (! isowner) + access |= ACE_WRITE_ACL; + } + + /* read */ + if (mode & S_IROTH) { + access |= ACE_READ_DATA; + } + /* write */ + if (mode & S_IWOTH) { + access |= ACE_WRITE_DATA | + ACE_APPEND_DATA; + if (isdir) + access |= ACE_DELETE_CHILD; + } + /* exec */ + if (mode & S_IXOTH) { + access |= ACE_EXECUTE; + } + + return (access); +} + +/* + * Given an nfsace (presumably an ALLOW entry), make a + * corresponding DENY entry at the address given. + */ +static void +ace_make_deny(ace_t *allow, ace_t *deny, int isdir, int isowner) +{ + (void) memcpy(deny, allow, sizeof (ace_t)); + + deny->a_who = allow->a_who; + + deny->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + deny->a_access_mask ^= ACE_POSIX_SUPPORTED_BITS; + if (isdir) + deny->a_access_mask ^= ACE_DELETE_CHILD; + + deny->a_access_mask &= ~(ACE_SYNCHRONIZE | ACE_WRITE_OWNER | + ACE_DELETE | ACE_WRITE_ATTRIBUTES | ACE_READ_NAMED_ATTRS | + ACE_WRITE_NAMED_ATTRS); + deny->a_access_mask |= access_mask_set((allow->a_access_mask & + ACE_WRITE_DATA), (allow->a_access_mask & ACE_READ_DATA), isowner, + B_FALSE); +} +/* + * Make an initial pass over an array of aclent_t's. Gather + * information such as an ACL_MASK (if any), number of users, + * number of groups, and whether the array needs to be sorted. + */ +static int +ln_aent_preprocess(aclent_t *aclent, int n, + int *hasmask, mode_t *mask, + int *numuser, int *numgroup, int *needsort) +{ + int error = 0; + int i; + int curtype = 0; + + *hasmask = 0; + *mask = 07; + *needsort = 0; + *numuser = 0; + *numgroup = 0; + + for (i = 0; i < n; i++) { + if (aclent[i].a_type < curtype) + *needsort = 1; + else if (aclent[i].a_type > curtype) + curtype = aclent[i].a_type; + if (aclent[i].a_type & USER) + (*numuser)++; + if (aclent[i].a_type & (GROUP | GROUP_OBJ)) + (*numgroup)++; + if (aclent[i].a_type & CLASS_OBJ) { + if (*hasmask) { + error = EINVAL; + goto out; + } else { + *hasmask = 1; + *mask = aclent[i].a_perm; + } + } + } + + if ((! 
*hasmask) && (*numuser + *numgroup > 1)) { + error = EINVAL; + goto out; + } + +out: + return (error); +} + +/* + * Convert an array of aclent_t into an array of nfsace entries, + * following POSIX draft -> nfsv4 conversion semantics as outlined in + * the IETF draft. + */ +static int +ln_aent_to_ace(aclent_t *aclent, int n, ace_t **acepp, int *rescount, int isdir) +{ + int error = 0; + mode_t mask; + int numuser, numgroup, needsort; + int resultsize = 0; + int i, groupi = 0, skip; + ace_t *acep, *result = NULL; + int hasmask; + + error = ln_aent_preprocess(aclent, n, &hasmask, &mask, + &numuser, &numgroup, &needsort); + if (error != 0) + goto out; + + /* allow + deny for each aclent */ + resultsize = n * 2; + if (hasmask) { + /* + * stick extra deny on the group_obj and on each + * user|group for the mask (the group_obj was added + * into the count for numgroup) + */ + resultsize += numuser + numgroup; + /* ... and don't count the mask itself */ + resultsize -= 2; + } + + /* sort the source if necessary */ + if (needsort) + ksort((caddr_t)aclent, n, sizeof (aclent_t), cmp2acls); + + if (cacl_malloc((void **)&result, resultsize * sizeof (ace_t)) != 0) + goto out; + + acep = result; + + for (i = 0; i < n; i++) { + /* + * don't process CLASS_OBJ (mask); mask was grabbed in + * ln_aent_preprocess() + */ + if (aclent[i].a_type & CLASS_OBJ) + continue; + + /* If we need an ACL_MASK emulator, prepend it now */ + if ((hasmask) && + (aclent[i].a_type & (USER | GROUP | GROUP_OBJ))) { + acep->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + acep->a_flags = 0; + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= + (ACE_IDENTIFIER_GROUP|ACE_GROUP); + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + } else { + acep->a_who = aclent[i].a_id; + acep->a_flags |= ACE_IDENTIFIER_GROUP; + } + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + /* + * Set the access mask for the prepended deny + * ace. To do this, we invert the mask (found + * in ln_aent_preprocess()) then convert it to an + * DENY ace access_mask. + */ + acep->a_access_mask = mode_to_ace_access((mask ^ 07), + isdir, 0, 0); + acep += 1; + } + + /* handle a_perm -> access_mask */ + acep->a_access_mask = mode_to_ace_access(aclent[i].a_perm, + isdir, aclent[i].a_type & USER_OBJ, 1); + + /* emulate a default aclent */ + if (aclent[i].a_type & ACL_DEFAULT) { + acep->a_flags |= ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE; + } + + /* + * handle a_perm and a_id + * + * this must be done last, since it involves the + * corresponding deny aces, which are handled + * differently for each different a_type. + */ + if (aclent[i].a_type & USER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_OWNER; + ace_make_deny(acep, acep + 1, isdir, B_TRUE); + acep += 2; + } else if (aclent[i].a_type & USER) { + acep->a_who = aclent[i].a_id; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else if (aclent[i].a_type & (GROUP_OBJ | GROUP)) { + if (aclent[i].a_type & GROUP_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_GROUP; + } else { + acep->a_who = aclent[i].a_id; + } + acep->a_flags |= ACE_IDENTIFIER_GROUP; + /* + * Set the corresponding deny for the group ace. + * + * The deny aces go after all of the groups, unlike + * everything else, where they immediately follow + * the allow ace. 
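+	 * (Deferring the group denies keeps the deny emitted for one
+	 * group from masking a later group's allow for a caller who
+	 * belongs to both groups.)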
+ * + * We calculate "skip", the number of slots to + * skip ahead for the deny ace, here. + * + * The pattern is: + * MD1 A1 MD2 A2 MD3 A3 D1 D2 D3 + * thus, skip is + * (2 * numgroup) - 1 - groupi + * (2 * numgroup) to account for MD + A + * - 1 to account for the fact that we're on the + * access (A), not the mask (MD) + * - groupi to account for the fact that we have + * passed up groupi number of MD's. + */ + skip = (2 * numgroup) - 1 - groupi; + ace_make_deny(acep, acep + skip, isdir, B_FALSE); + /* + * If we just did the last group, skip acep past + * all of the denies; else, just move ahead one. + */ + if (++groupi >= numgroup) + acep += numgroup + 1; + else + acep += 1; + } else if (aclent[i].a_type & OTHER_OBJ) { + acep->a_who = (uid_t)-1; + acep->a_flags |= ACE_EVERYONE; + ace_make_deny(acep, acep + 1, isdir, B_FALSE); + acep += 2; + } else { + error = EINVAL; + goto out; + } + } + + *acepp = result; + *rescount = resultsize; + +out: + if (error != 0) { + if ((result != NULL) && (resultsize > 0)) { + cacl_free(result, resultsize * sizeof (ace_t)); + } + } + + return (error); +} + +static int +convert_aent_to_ace(aclent_t *aclentp, int aclcnt, boolean_t isdir, + ace_t **retacep, int *retacecnt) +{ + ace_t *acep; + ace_t *dfacep; + int acecnt = 0; + int dfacecnt = 0; + int dfaclstart = 0; + int dfaclcnt = 0; + aclent_t *aclp; + int i; + int error; + int acesz, dfacesz; + + ksort((caddr_t)aclentp, aclcnt, sizeof (aclent_t), cmp2acls); + + for (i = 0, aclp = aclentp; i < aclcnt; aclp++, i++) { + if (aclp->a_type & ACL_DEFAULT) + break; + } + + if (i < aclcnt) { + dfaclstart = i; + dfaclcnt = aclcnt - i; + } + + if (dfaclcnt && !isdir) { + return (EINVAL); + } + + error = ln_aent_to_ace(aclentp, i, &acep, &acecnt, isdir); + if (error) + return (error); + + if (dfaclcnt) { + error = ln_aent_to_ace(&aclentp[dfaclstart], dfaclcnt, + &dfacep, &dfacecnt, isdir); + if (error) { + if (acep) { + cacl_free(acep, acecnt * sizeof (ace_t)); + } + return (error); + } + } + + if (dfacecnt != 0) { + acesz = sizeof (ace_t) * acecnt; + dfacesz = sizeof (ace_t) * dfacecnt; + acep = cacl_realloc(acep, acesz, acesz + dfacesz); + if (acep == NULL) + return (ENOMEM); + if (dfaclcnt) { + (void) memcpy(acep + acecnt, dfacep, dfacesz); + } + } + if (dfaclcnt) + cacl_free(dfacep, dfacecnt * sizeof (ace_t)); + + *retacecnt = acecnt + dfacecnt; + *retacep = acep; + return (0); +} + +static int +ace_mask_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + int error = 0; + o_mode_t mode = 0; + uint32_t bits, wantbits; + + /* read */ + if (mask & ACE_READ_DATA) + mode |= S_IROTH; + + /* write */ + wantbits = (ACE_WRITE_DATA | ACE_APPEND_DATA); + if (isdir) + wantbits |= ACE_DELETE_CHILD; + bits = mask & wantbits; + if (bits != 0) { + if (bits != wantbits) { + error = ENOTSUP; + goto out; + } + mode |= S_IWOTH; + } + + /* exec */ + if (mask & ACE_EXECUTE) { + mode |= S_IXOTH; + } + + *modep = mode; + +out: + return (error); +} + +static void +acevals_init(acevals_t *vals, uid_t key) +{ + bzero(vals, sizeof (*vals)); + vals->allowed = ACE_MASK_UNDEFINED; + vals->denied = ACE_MASK_UNDEFINED; + vals->mask = ACE_MASK_UNDEFINED; + vals->key = key; +} + +static void +ace_list_init(ace_list_t *al, int dfacl_flag) +{ + acevals_init(&al->user_obj, 0); + acevals_init(&al->group_obj, 0); + acevals_init(&al->other_obj, 0); + al->numusers = 0; + al->numgroups = 0; + al->acl_mask = 0; + al->hasmask = 0; + al->state = ace_unused; + al->seen = 0; + al->dfacl_flag = dfacl_flag; +} + +/* + * Find or create an acevals 
holder for a given id and avl tree. + * + * Note that only one thread will ever touch these avl trees, so + * there is no need for locking. + */ +static acevals_t * +acevals_find(ace_t *ace, avl_tree_t *avl, int *num) +{ + acevals_t key, *rc; + avl_index_t where; + + key.key = ace->a_who; + rc = avl_find(avl, &key, &where); + if (rc != NULL) + return (rc); + + /* this memory is freed by ln_ace_to_aent()->ace_list_free() */ + if (cacl_malloc((void **)&rc, sizeof (acevals_t)) != 0) + return (NULL); + + acevals_init(rc, ace->a_who); + avl_insert(avl, rc, where); + (*num)++; + + return (rc); +} + +static int +access_mask_check(ace_t *acep, int mask_bit, int isowner) +{ + int set_deny, err_deny; + int set_allow, err_allow; + int acl_consume; + int haswriteperm, hasreadperm; + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 0 : 1; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 0 : 1; + } else { + haswriteperm = (acep->a_access_mask & ACE_WRITE_DATA) ? 1 : 0; + hasreadperm = (acep->a_access_mask & ACE_READ_DATA) ? 1 : 0; + } + + acl_consume = (ACL_SYNCHRONIZE_ERR_DENY | + ACL_DELETE_ERR_DENY | + ACL_WRITE_OWNER_ERR_DENY | + ACL_WRITE_OWNER_ERR_ALLOW | + ACL_WRITE_ATTRS_OWNER_SET_ALLOW | + ACL_WRITE_ATTRS_OWNER_ERR_DENY | + ACL_WRITE_ATTRS_WRITER_SET_DENY | + ACL_WRITE_ATTRS_WRITER_ERR_ALLOW | + ACL_WRITE_NAMED_WRITER_ERR_DENY | + ACL_READ_NAMED_READER_ERR_DENY); + + if (mask_bit == ACE_SYNCHRONIZE) { + set_deny = ACL_SYNCHRONIZE_SET_DENY; + err_deny = ACL_SYNCHRONIZE_ERR_DENY; + set_allow = ACL_SYNCHRONIZE_SET_ALLOW; + err_allow = ACL_SYNCHRONIZE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_OWNER) { + set_deny = ACL_WRITE_OWNER_SET_DENY; + err_deny = ACL_WRITE_OWNER_ERR_DENY; + set_allow = ACL_WRITE_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_OWNER_ERR_ALLOW; + } else if (mask_bit == ACE_DELETE) { + set_deny = ACL_DELETE_SET_DENY; + err_deny = ACL_DELETE_ERR_DENY; + set_allow = ACL_DELETE_SET_ALLOW; + err_allow = ACL_DELETE_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_ATTRIBUTES) { + if (isowner) { + set_deny = ACL_WRITE_ATTRS_OWNER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_OWNER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_OWNER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_OWNER_ERR_ALLOW; + } else if (haswriteperm) { + set_deny = ACL_WRITE_ATTRS_WRITER_SET_DENY; + err_deny = ACL_WRITE_ATTRS_WRITER_ERR_DENY; + set_allow = ACL_WRITE_ATTRS_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_ATTRS_WRITER_ERR_ALLOW; + } else { + if ((acep->a_access_mask & mask_bit) && + (acep->a_type & ACE_ACCESS_ALLOWED_ACE_TYPE)) { + return (ENOTSUP); + } + return (0); + } + } else if (mask_bit == ACE_READ_NAMED_ATTRS) { + if (!hasreadperm) + return (0); + + set_deny = ACL_READ_NAMED_READER_SET_DENY; + err_deny = ACL_READ_NAMED_READER_ERR_DENY; + set_allow = ACL_READ_NAMED_READER_SET_ALLOW; + err_allow = ACL_READ_NAMED_READER_ERR_ALLOW; + } else if (mask_bit == ACE_WRITE_NAMED_ATTRS) { + if (!haswriteperm) + return (0); + + set_deny = ACL_WRITE_NAMED_WRITER_SET_DENY; + err_deny = ACL_WRITE_NAMED_WRITER_ERR_DENY; + set_allow = ACL_WRITE_NAMED_WRITER_SET_ALLOW; + err_allow = ACL_WRITE_NAMED_WRITER_ERR_ALLOW; + } else { + return (EINVAL); + } + + if (acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) { + if (acl_consume & set_deny) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_deny) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } else { + /* ACE_ACCESS_ALLOWED_ACE_TYPE */ + if (acl_consume & 
set_allow) { + if (!(acep->a_access_mask & mask_bit)) { + return (ENOTSUP); + } + } else if (acl_consume & err_allow) { + if (acep->a_access_mask & mask_bit) { + return (ENOTSUP); + } + } + } + return (0); +} + +static int +ace_to_aent_legal(ace_t *acep) +{ + int error = 0; + int isowner; + + /* only ALLOW or DENY */ + if ((acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE) && + (acep->a_type != ACE_ACCESS_DENIED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid flags */ + if (acep->a_flags & ~(ACE_VALID_FLAG_BITS)) { + error = EINVAL; + goto out; + } + + /* some flags are illegal */ + if (acep->a_flags & (ACE_SUCCESSFUL_ACCESS_ACE_FLAG | + ACE_FAILED_ACCESS_ACE_FLAG | + ACE_NO_PROPAGATE_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + + /* check for invalid masks */ + if (acep->a_access_mask & ~(ACE_VALID_MASK_BITS)) { + error = EINVAL; + goto out; + } + + if ((acep->a_flags & ACE_OWNER)) { + isowner = 1; + } else { + isowner = 0; + } + + error = access_mask_check(acep, ACE_SYNCHRONIZE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_OWNER, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_DELETE, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_ATTRIBUTES, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_READ_NAMED_ATTRS, isowner); + if (error) + goto out; + + error = access_mask_check(acep, ACE_WRITE_NAMED_ATTRS, isowner); + if (error) + goto out; + + /* more detailed checking of masks */ + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + if (! (acep->a_access_mask & ACE_READ_ATTRIBUTES)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_access_mask & ACE_WRITE_DATA) && + (! (acep->a_access_mask & ACE_APPEND_DATA))) { + error = ENOTSUP; + goto out; + } + if ((! (acep->a_access_mask & ACE_WRITE_DATA)) && + (acep->a_access_mask & ACE_APPEND_DATA)) { + error = ENOTSUP; + goto out; + } + } + + /* ACL enforcement */ + if ((acep->a_access_mask & ACE_READ_ACL) && + (acep->a_type != ACE_ACCESS_ALLOWED_ACE_TYPE)) { + error = ENOTSUP; + goto out; + } + if (acep->a_access_mask & ACE_WRITE_ACL) { + if ((acep->a_type == ACE_ACCESS_DENIED_ACE_TYPE) && + (isowner)) { + error = ENOTSUP; + goto out; + } + if ((acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) && + (! 
isowner)) { + error = ENOTSUP; + goto out; + } + } + +out: + return (error); +} + +static int +ace_allow_to_mode(uint32_t mask, o_mode_t *modep, boolean_t isdir) +{ + /* ACE_READ_ACL and ACE_READ_ATTRIBUTES must both be set */ + if ((mask & (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) != + (ACE_READ_ACL | ACE_READ_ATTRIBUTES)) { + return (ENOTSUP); + } + + return (ace_mask_to_mode(mask, modep, isdir)); +} + +static int +acevals_to_aent(acevals_t *vals, aclent_t *dest, ace_list_t *list, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error; + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + + if (isdir) + flips |= ACE_DELETE_CHILD; + if (vals->allowed != (vals->denied ^ flips)) { + error = ENOTSUP; + goto out; + } + if ((list->hasmask) && (list->acl_mask != vals->mask) && + (vals->aent_type & (USER | GROUP | GROUP_OBJ))) { + error = ENOTSUP; + goto out; + } + error = ace_allow_to_mode(vals->allowed, &dest->a_perm, isdir); + if (error != 0) + goto out; + dest->a_type = vals->aent_type; + if (dest->a_type & (USER | GROUP)) { + dest->a_id = vals->key; + } else if (dest->a_type & USER_OBJ) { + dest->a_id = owner; + } else if (dest->a_type & GROUP_OBJ) { + dest->a_id = group; + } else if (dest->a_type & OTHER_OBJ) { + dest->a_id = 0; + } else { + error = EINVAL; + goto out; + } + +out: + return (error); +} + + +static int +ace_list_to_aent(ace_list_t *list, aclent_t **aclentp, int *aclcnt, + uid_t owner, gid_t group, boolean_t isdir) +{ + int error = 0; + aclent_t *aent, *result = NULL; + acevals_t *vals; + int resultcount; + + if ((list->seen & (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) != + (USER_OBJ | GROUP_OBJ | OTHER_OBJ)) { + error = ENOTSUP; + goto out; + } + if ((! list->hasmask) && (list->numusers + list->numgroups > 0)) { + error = ENOTSUP; + goto out; + } + + resultcount = 3 + list->numusers + list->numgroups; + /* + * This must be the same condition as below, when we add the CLASS_OBJ + * (aka ACL mask) + */ + if ((list->hasmask) || (! list->dfacl_flag)) + resultcount += 1; + + if (cacl_malloc((void **)&result, + resultcount * sizeof (aclent_t)) != 0) { + error = ENOMEM; + goto out; + } + aent = result; + + /* USER_OBJ */ + if (!(list->user_obj.aent_type & USER_OBJ)) { + error = EINVAL; + goto out; + } + + error = acevals_to_aent(&list->user_obj, aent, list, owner, group, + isdir); + + if (error != 0) + goto out; + ++aent; + /* USER */ + vals = NULL; + for (vals = avl_first(&list->user); vals != NULL; + vals = AVL_NEXT(&list->user, vals)) { + if (!(vals->aent_type & USER)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* GROUP_OBJ */ + if (!(list->group_obj.aent_type & GROUP_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->group_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + /* GROUP */ + vals = NULL; + for (vals = avl_first(&list->group); vals != NULL; + vals = AVL_NEXT(&list->group, vals)) { + if (!(vals->aent_type & GROUP)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(vals, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + } + /* + * CLASS_OBJ (aka ACL_MASK) + * + * An ACL_MASK is not fabricated if the ACL is a default ACL. + * This is to follow UFS's behavior. + */ + if ((list->hasmask) || (! 
list->dfacl_flag)) { + if (list->hasmask) { + uint32_t flips = ACE_POSIX_SUPPORTED_BITS; + if (isdir) + flips |= ACE_DELETE_CHILD; + error = ace_mask_to_mode(list->acl_mask ^ flips, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } else { + /* fabricate the ACL_MASK from the group permissions */ + error = ace_mask_to_mode(list->group_obj.allowed, + &aent->a_perm, isdir); + if (error != 0) + goto out; + } + aent->a_id = 0; + aent->a_type = CLASS_OBJ | list->dfacl_flag; + ++aent; + } + /* OTHER_OBJ */ + if (!(list->other_obj.aent_type & OTHER_OBJ)) { + error = EINVAL; + goto out; + } + error = acevals_to_aent(&list->other_obj, aent, list, owner, group, + isdir); + if (error != 0) + goto out; + ++aent; + + *aclentp = result; + *aclcnt = resultcount; + +out: + if (error != 0) { + if (result != NULL) + cacl_free(result, resultcount * sizeof (aclent_t)); + } + + return (error); +} + + +/* + * free all data associated with an ace_list + */ +static void +ace_list_free(ace_list_t *al) +{ + acevals_t *node; + void *cookie; + + if (al == NULL) + return; + + cookie = NULL; + while ((node = avl_destroy_nodes(&al->user, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + cookie = NULL; + while ((node = avl_destroy_nodes(&al->group, &cookie)) != NULL) + cacl_free(node, sizeof (acevals_t)); + + avl_destroy(&al->user); + avl_destroy(&al->group); + + /* free the container itself */ + cacl_free(al, sizeof (ace_list_t)); +} + +static int +acevals_compare(const void *va, const void *vb) +{ + const acevals_t *a = va, *b = vb; + + if (a->key == b->key) + return (0); + + if (a->key > b->key) + return (1); + + else + return (-1); +} + +/* + * Convert a list of ace_t entries to equivalent regular and default + * aclent_t lists. Return error (ENOTSUP) when conversion is not possible. + */ +static int +ln_ace_to_aent(ace_t *ace, int n, uid_t owner, gid_t group, + aclent_t **aclentp, int *aclcnt, aclent_t **dfaclentp, int *dfaclcnt, + boolean_t isdir) +{ + int error = 0; + ace_t *acep; + uint32_t bits; + int i; + ace_list_t *normacl = NULL, *dfacl = NULL, *acl; + acevals_t *vals; + + *aclentp = NULL; + *aclcnt = 0; + *dfaclentp = NULL; + *dfaclcnt = 0; + + /* we need at least user_obj, group_obj, and other_obj */ + if (n < 6) { + error = ENOTSUP; + goto out; + } + if (ace == NULL) { + error = EINVAL; + goto out; + } + + error = cacl_malloc((void **)&normacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&normacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&normacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + + ace_list_init(normacl, 0); + + error = cacl_malloc((void **)&dfacl, sizeof (ace_list_t)); + if (error != 0) + goto out; + + avl_create(&dfacl->user, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + avl_create(&dfacl->group, acevals_compare, sizeof (acevals_t), + offsetof(acevals_t, avl)); + ace_list_init(dfacl, ACL_DEFAULT); + + /* process every ace_t... */ + for (i = 0; i < n; i++) { + acep = &ace[i]; + + /* rule out certain cases quickly */ + error = ace_to_aent_legal(acep); + if (error != 0) + goto out; + + /* + * Turn off these bits in order to not have to worry about + * them when doing the checks for compliments. 
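+	 * (acevals_to_aent() later requires allowed ==
+	 * (denied ^ ACE_POSIX_SUPPORTED_BITS); the bits cleared below
+	 * are the ones access_mask_set() synthesizes, which would
+	 * otherwise spoil that equality.)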
+ */ + acep->a_access_mask &= ~(ACE_WRITE_OWNER | ACE_DELETE | + ACE_SYNCHRONIZE | ACE_WRITE_ATTRIBUTES | + ACE_READ_NAMED_ATTRS | ACE_WRITE_NAMED_ATTRS); + + /* see if this should be a regular or default acl */ + bits = acep->a_flags & + (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE); + if (bits != 0) { + /* all or nothing on these inherit bits */ + if (bits != (ACE_INHERIT_ONLY_ACE | + ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) { + error = ENOTSUP; + goto out; + } + acl = dfacl; + } else { + acl = normacl; + } + + if ((acep->a_flags & ACE_OWNER)) { + if (acl->state > ace_user_obj) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user_obj; + acl->seen |= USER_OBJ; + vals = &acl->user_obj; + vals->aent_type = USER_OBJ | acl->dfacl_flag; + } else if ((acep->a_flags & ACE_EVERYONE)) { + acl->state = ace_other_obj; + acl->seen |= OTHER_OBJ; + vals = &acl->other_obj; + vals->aent_type = OTHER_OBJ | acl->dfacl_flag; + } else if (acep->a_flags & ACE_IDENTIFIER_GROUP) { + if (acl->state > ace_group) { + error = ENOTSUP; + goto out; + } + if ((acep->a_flags & ACE_GROUP)) { + acl->seen |= GROUP_OBJ; + vals = &acl->group_obj; + vals->aent_type = GROUP_OBJ | acl->dfacl_flag; + } else { + acl->seen |= GROUP; + vals = acevals_find(acep, &acl->group, + &acl->numgroups); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = GROUP | acl->dfacl_flag; + } + acl->state = ace_group; + } else { + if (acl->state > ace_user) { + error = ENOTSUP; + goto out; + } + acl->state = ace_user; + acl->seen |= USER; + vals = acevals_find(acep, &acl->user, + &acl->numusers); + if (vals == NULL) { + error = ENOMEM; + goto out; + } + vals->aent_type = USER | acl->dfacl_flag; + } + + if (!(acl->state > ace_unused)) { + error = EINVAL; + goto out; + } + + if (acep->a_type == ACE_ACCESS_ALLOWED_ACE_TYPE) { + /* no more than one allowed per aclent_t */ + if (vals->allowed != ACE_MASK_UNDEFINED) { + error = ENOTSUP; + goto out; + } + vals->allowed = acep->a_access_mask; + } else { + /* + * it's a DENY; if there was a previous DENY, it + * must have been an ACL_MASK. + */ + if (vals->denied != ACE_MASK_UNDEFINED) { + /* ACL_MASK is for USER and GROUP only */ + if ((acl->state != ace_user) && + (acl->state != ace_group)) { + error = ENOTSUP; + goto out; + } + + if (! 
acl->hasmask) { + acl->hasmask = 1; + acl->acl_mask = vals->denied; + /* check for mismatched ACL_MASK emulations */ + } else if (acl->acl_mask != vals->denied) { + error = ENOTSUP; + goto out; + } + vals->mask = vals->denied; + } + vals->denied = acep->a_access_mask; + } + } + + /* done collating; produce the aclent_t lists */ + if (normacl->state != ace_unused) { + error = ace_list_to_aent(normacl, aclentp, aclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + if (dfacl->state != ace_unused) { + error = ace_list_to_aent(dfacl, dfaclentp, dfaclcnt, + owner, group, isdir); + if (error != 0) { + goto out; + } + } + +out: + if (normacl != NULL) + ace_list_free(normacl); + if (dfacl != NULL) + ace_list_free(dfacl); + + return (error); +} + +static int +convert_ace_to_aent(ace_t *acebufp, int acecnt, boolean_t isdir, + uid_t owner, gid_t group, aclent_t **retaclentp, int *retaclcnt) +{ + int error = 0; + aclent_t *aclentp, *dfaclentp; + int aclcnt, dfaclcnt; + int aclsz, dfaclsz; + + error = ln_ace_to_aent(acebufp, acecnt, owner, group, + &aclentp, &aclcnt, &dfaclentp, &dfaclcnt, isdir); + + if (error) + return (error); + + + if (dfaclcnt != 0) { + /* + * Slap aclentp and dfaclentp into a single array. + */ + aclsz = sizeof (aclent_t) * aclcnt; + dfaclsz = sizeof (aclent_t) * dfaclcnt; + aclentp = cacl_realloc(aclentp, aclsz, aclsz + dfaclsz); + if (aclentp != NULL) { + (void) memcpy(aclentp + aclcnt, dfaclentp, dfaclsz); + } else { + error = ENOMEM; + } + } + + if (aclentp) { + *retaclentp = aclentp; + *retaclcnt = aclcnt + dfaclcnt; + } + + if (dfaclentp) + cacl_free(dfaclentp, dfaclsz); + + return (error); +} + + +int +acl_translate(acl_t *aclp, int target_flavor, boolean_t isdir, uid_t owner, + gid_t group) +{ + int aclcnt; + void *acldata; + int error; + + /* + * See if we need to translate + */ + if ((target_flavor == _ACL_ACE_ENABLED && aclp->acl_type == ACE_T) || + (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACLENT_T)) + return (0); + + if (target_flavor == -1) { + error = EINVAL; + goto out; + } + + if (target_flavor == _ACL_ACE_ENABLED && + aclp->acl_type == ACLENT_T) { + error = convert_aent_to_ace(aclp->acl_aclp, + aclp->acl_cnt, isdir, (ace_t **)&acldata, &aclcnt); + if (error) + goto out; + + } else if (target_flavor == _ACL_ACLENT_ENABLED && + aclp->acl_type == ACE_T) { + error = convert_ace_to_aent(aclp->acl_aclp, aclp->acl_cnt, + isdir, owner, group, (aclent_t **)&acldata, &aclcnt); + if (error) + goto out; + } else { + error = ENOTSUP; + goto out; + } + + /* + * replace old acl with newly translated acl + */ + cacl_free(aclp->acl_aclp, aclp->acl_cnt * aclp->acl_entry_size); + aclp->acl_aclp = acldata; + aclp->acl_cnt = aclcnt; + if (target_flavor == _ACL_ACE_ENABLED) { + aclp->acl_type = ACE_T; + aclp->acl_entry_size = sizeof (ace_t); + } else { + aclp->acl_type = ACLENT_T; + aclp->acl_entry_size = sizeof (aclent_t); + } + return (0); + +out: + +#if !defined(_KERNEL) + errno = error; + return (-1); +#else + return (error); +#endif +} +#endif /* !_KERNEL */ + +#define SET_ACE(acl, index, who, mask, type, flags) { \ + acl[0][index].a_who = (uint32_t)who; \ + acl[0][index].a_type = type; \ + acl[0][index].a_flags = flags; \ + acl[0][index++].a_access_mask = mask; \ +} + +void +acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks) +{ + uint32_t read_mask = ACE_READ_DATA; + uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA; + uint32_t execute_mask = ACE_EXECUTE; + + (void) isdir; /* will need this later */ + + 
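+	/*
+	 * The masks computed below map one-to-one onto the entries that
+	 * acl_trivial_create() emits: an optional owner@ allow (allow0),
+	 * optional owner@ and group@ denies (deny1, deny2), then the
+	 * owner@, group@, and everyone@ allow entries.
+	 */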
masks->deny1 = 0; + if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH))) + masks->deny1 |= read_mask; + if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH))) + masks->deny1 |= write_mask; + if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH))) + masks->deny1 |= execute_mask; + + masks->deny2 = 0; + if (!(mode & S_IRGRP) && (mode & S_IROTH)) + masks->deny2 |= read_mask; + if (!(mode & S_IWGRP) && (mode & S_IWOTH)) + masks->deny2 |= write_mask; + if (!(mode & S_IXGRP) && (mode & S_IXOTH)) + masks->deny2 |= execute_mask; + + masks->allow0 = 0; + if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH))) + masks->allow0 |= read_mask; + if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH))) + masks->allow0 |= write_mask; + if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH))) + masks->allow0 |= execute_mask; + + masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL| + ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES| + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE; + if (mode & S_IRUSR) + masks->owner |= read_mask; + if (mode & S_IWUSR) + masks->owner |= write_mask; + if (mode & S_IXUSR) + masks->owner |= execute_mask; + + masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IRGRP) + masks->group |= read_mask; + if (mode & S_IWGRP) + masks->group |= write_mask; + if (mode & S_IXGRP) + masks->group |= execute_mask; + + masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS| + ACE_SYNCHRONIZE; + if (mode & S_IROTH) + masks->everyone |= read_mask; + if (mode & S_IWOTH) + masks->everyone |= write_mask; + if (mode & S_IXOTH) + masks->everyone |= execute_mask; +} + +int +acl_trivial_create(mode_t mode, boolean_t isdir, ace_t **acl, int *count) +{ + int index = 0; + int error; + trivial_acl_t masks; + + *count = 3; + acl_trivial_access_masks(mode, isdir, &masks); + + if (masks.allow0) + (*count)++; + if (masks.deny1) + (*count)++; + if (masks.deny2) + (*count)++; + + if ((error = cacl_malloc((void **)acl, *count * sizeof (ace_t))) != 0) + return (error); + + if (masks.allow0) { + SET_ACE(acl, index, -1, masks.allow0, + ACE_ACCESS_ALLOWED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny1) { + SET_ACE(acl, index, -1, masks.deny1, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_OWNER); + } + if (masks.deny2) { + SET_ACE(acl, index, -1, masks.deny2, + ACE_ACCESS_DENIED_ACE_TYPE, ACE_GROUP|ACE_IDENTIFIER_GROUP); + } + + SET_ACE(acl, index, -1, masks.owner, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_OWNER); + SET_ACE(acl, index, -1, masks.group, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_IDENTIFIER_GROUP|ACE_GROUP); + SET_ACE(acl, index, -1, masks.everyone, ACE_ACCESS_ALLOWED_ACE_TYPE, + ACE_EVERYONE); + + return (0); +} + +/* + * ace_trivial: + * determine whether an ace_t acl is trivial + * + * Trivialness implies that the acl is composed of only + * owner, group, everyone entries. ACL can't + * have read_acl denied, and write_owner/write_acl/write_attributes + * can only be owner@ entry. 
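+ * A trivial ACL therefore carries no information beyond the file's
+ * mode bits, so callers can fall back to plain mode-bit handling.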
+ */ +int +ace_trivial_common(void *acep, int aclcnt, + uint64_t (*walk)(void *, uint64_t, int aclcnt, + uint16_t *, uint16_t *, uint32_t *)) +{ + uint16_t flags; + uint32_t mask; + uint16_t type; + uint64_t cookie = 0; + + while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) { + switch (flags & ACE_TYPE_FLAGS) { + case ACE_OWNER: + case ACE_GROUP|ACE_IDENTIFIER_GROUP: + case ACE_EVERYONE: + break; + default: + return (1); + + } + + if (flags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE| + ACE_INHERIT_ONLY_ACE)) + return (1); + + /* + * Special check for some special bits + * + * Don't allow anybody to deny reading basic + * attributes or a files ACL. + */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permissions are never set by default + */ + if (mask & (ACE_DELETE|ACE_DELETE_CHILD)) + return (1); + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + return (0); +} diff --git a/module/os/freebsd/spl/callb.c b/module/os/freebsd/spl/callb.c new file mode 100644 index 000000000000..fffa85b6b91b --- /dev/null +++ b/module/os/freebsd/spl/callb.c @@ -0,0 +1,373 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for delay() */ +#include /* For TASKQ_NAMELEN */ +#include + +#define CB_MAXNAME TASKQ_NAMELEN + +/* + * The callb mechanism provides generic event scheduling/echoing. + * A callb function is registered and called on behalf of the event. + */ +typedef struct callb { + struct callb *c_next; /* next in class or on freelist */ + kthread_id_t c_thread; /* ptr to caller's thread struct */ + char c_flag; /* info about the callb state */ + uchar_t c_class; /* this callb's class */ + kcondvar_t c_done_cv; /* signal callb completion */ + boolean_t (*c_func)(void *, int); + /* cb function: returns true if ok */ + void *c_arg; /* arg to c_func */ + char c_name[CB_MAXNAME+1]; /* debug:max func name length */ +} callb_t; + +/* + * callb c_flag bitmap definitions + */ +#define CALLB_FREE 0x0 +#define CALLB_TAKEN 0x1 +#define CALLB_EXECUTING 0x2 + +/* + * Basic structure for a callb table. + * All callbs are organized into different class groups described + * by ct_class array. 
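+ * (Classes are the CB_CL_* constants from sys/callb.h;
+ * callb_execute_class() below runs every callback registered under
+ * a given class.)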
+ * The callbs within a class are singly linked and normally run
+ * serially.
+ */
+typedef struct callb_table {
+	kmutex_t ct_lock;		/* protect all callb states */
+	callb_t	*ct_freelist;		/* free callb structures */
+	int	ct_busy;		/* != 0 prevents additions */
+	kcondvar_t ct_busy_cv;		/* to wait for not busy */
+	int	ct_ncallb;		/* num of callbs allocated */
+	callb_t	*ct_first_cb[NCBCLASS];	/* ptr to 1st callb in a class */
+} callb_table_t;
+
+int callb_timeout_sec = CPR_KTHREAD_TIMEOUT_SEC;
+
+static callb_id_t callb_add_common(boolean_t (*)(void *, int),
+    void *, int, char *, kthread_id_t);
+
+static callb_table_t callb_table;	/* system level callback table */
+static callb_table_t *ct = &callb_table;
+static kmutex_t callb_safe_mutex;
+callb_cpr_t callb_cprinfo_safe = {
+	&callb_safe_mutex, CALLB_CPR_ALWAYS_SAFE, 0, {0, 0} };
+
+/*
+ * Init all callb tables in the system.
+ */
+static void
+callb_init(void *dummy __unused)
+{
+	callb_table.ct_busy = 0;	/* mark table open for additions */
+	mutex_init(&callb_safe_mutex, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&callb_table.ct_lock, NULL, MUTEX_DEFAULT, NULL);
+}
+
+static void
+callb_fini(void *dummy __unused)
+{
+	callb_t *cp;
+	int i;
+
+	mutex_enter(&ct->ct_lock);
+	for (i = 0; i < 16; i++) {
+		while ((cp = ct->ct_freelist) != NULL) {
+			ct->ct_freelist = cp->c_next;
+			ct->ct_ncallb--;
+			kmem_free(cp, sizeof (callb_t));
+		}
+		if (ct->ct_ncallb == 0)
+			break;
+		/* Not all callbacks finished, waiting for the rest. */
+		mutex_exit(&ct->ct_lock);
+		tsleep(ct, 0, "callb", hz / 4);
+		mutex_enter(&ct->ct_lock);
+	}
+	if (ct->ct_ncallb > 0)
+		printf("%s: Leaked %d callbacks!\n", __func__, ct->ct_ncallb);
+	mutex_exit(&ct->ct_lock);
+	mutex_destroy(&callb_safe_mutex);
+	mutex_destroy(&callb_table.ct_lock);
+}
+
+/*
+ * callb_add_common() registers func() to be called later.
+ */
+static callb_id_t
+callb_add_common(boolean_t (*func)(void *arg, int code),
+    void *arg, int class, char *name, kthread_id_t t)
+{
+	callb_t *cp;
+
+	ASSERT(class < NCBCLASS);
+
+	mutex_enter(&ct->ct_lock);
+	while (ct->ct_busy)
+		cv_wait(&ct->ct_busy_cv, &ct->ct_lock);
+	if ((cp = ct->ct_freelist) == NULL) {
+		ct->ct_ncallb++;
+		cp = (callb_t *)kmem_zalloc(sizeof (callb_t), KM_SLEEP);
+	}
+	ct->ct_freelist = cp->c_next;
+	cp->c_thread = t;
+	cp->c_func = func;
+	cp->c_arg = arg;
+	cp->c_class = (uchar_t)class;
+	cp->c_flag |= CALLB_TAKEN;
+#ifdef ZFS_DEBUG
+	if (strlen(name) > CB_MAXNAME)
+		cmn_err(CE_WARN, "callb_add: name of callback function '%s' "
+		    "too long -- truncated to %d chars",
+		    name, CB_MAXNAME);
+#endif
+	(void) strncpy(cp->c_name, name, CB_MAXNAME);
+	cp->c_name[CB_MAXNAME] = '\0';
+
+	/*
+	 * Insert the new callb at the head of its class list.
+	 */
+	cp->c_next = ct->ct_first_cb[class];
+	ct->ct_first_cb[class] = cp;
+
+	mutex_exit(&ct->ct_lock);
+	return ((callb_id_t)cp);
+}
+
+/*
+ * The default function to add an entry to the callback table. Since
+ * it uses curthread as the thread identifier to store in the table,
+ * it should be used for the normal case of a thread which is calling
+ * to add ITSELF to the table.
+ */
+callb_id_t
+callb_add(boolean_t (*func)(void *arg, int code),
+    void *arg, int class, char *name)
+{
+	return (callb_add_common(func, arg, class, name, curthread));
+}
+
+/*
+ * A special version of callb_add() above for use by threads which
+ * might be adding an entry to the table on behalf of some other
+ * thread (for example, one which is constructed but not yet running).
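+ * In this version the thread id is an argument. A minimal sketch
+ * (hypothetical caller; "cpr_info" and "worker_thread" are
+ * placeholders):
+ *
+ *	(void) callb_add_thread(callb_generic_cpr, &cpr_info,
+ *	    CB_CL_CPR_DAEMON, "worker", worker_thread);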
+ */
+callb_id_t
+callb_add_thread(boolean_t (*func)(void *arg, int code),
+    void *arg, int class, char *name, kthread_id_t t)
+{
+	return (callb_add_common(func, arg, class, name, t));
+}
+
+/*
+ * callb_delete() removes an entry, identified by id, that was
+ * originally placed in the table by a call to callb_add().
+ * Returns -1 if it fails to delete the callb entry; otherwise
+ * returns 0.
+ */
+int
+callb_delete(callb_id_t id)
+{
+	callb_t **pp;
+	callb_t *me = (callb_t *)id;
+
+	mutex_enter(&ct->ct_lock);
+
+	for (;;) {
+		pp = &ct->ct_first_cb[me->c_class];
+		while (*pp != NULL && *pp != me)
+			pp = &(*pp)->c_next;
+
+#ifdef ZFS_DEBUG
+		if (*pp != me) {
+			cmn_err(CE_WARN, "callb delete bogus entry 0x%p",
+			    (void *)me);
+			mutex_exit(&ct->ct_lock);
+			return (-1);
+		}
+#endif /* ZFS_DEBUG */
+
+		/*
+		 * A callb must not be deleted while it is executing;
+		 * otherwise callb_execute_class() would be confused.
+		 */
+		if (!(me->c_flag & CALLB_EXECUTING))
+			break;
+
+		cv_wait(&me->c_done_cv, &ct->ct_lock);
+	}
+	/* relink the class list */
+	*pp = me->c_next;
+
+	/* clean up myself and return the free callb to the head of freelist */
+	me->c_flag = CALLB_FREE;
+	me->c_next = ct->ct_freelist;
+	ct->ct_freelist = me;
+
+	mutex_exit(&ct->ct_lock);
+	return (0);
+}
+
+/*
+ * class:	indicates to execute all callbs in the same class;
+ * code:	optional argument for the callb functions.
+ * return:	 = 0: success
+ *		!= 0: ptr to string supplied when callback was registered
+ */
+void *
+callb_execute_class(int class, int code)
+{
+	callb_t *cp;
+	void *ret = NULL;
+
+	ASSERT(class < NCBCLASS);
+
+	mutex_enter(&ct->ct_lock);
+
+	for (cp = ct->ct_first_cb[class];
+	    cp != NULL && ret == 0; cp = cp->c_next) {
+		while (cp->c_flag & CALLB_EXECUTING)
+			cv_wait(&cp->c_done_cv, &ct->ct_lock);
+		/*
+		 * continue if the callb was deleted while we were sleeping
+		 */
+		if (cp->c_flag == CALLB_FREE)
+			continue;
+		cp->c_flag |= CALLB_EXECUTING;
+
+#ifdef CALLB_DEBUG
+		printf("callb_execute: name=%s func=%p arg=%p\n",
+		    cp->c_name, (void *)cp->c_func, (void *)cp->c_arg);
+#endif /* CALLB_DEBUG */
+
+		mutex_exit(&ct->ct_lock);
+		/* If the callback function fails, pass back the client's name */
+		if (!(*cp->c_func)(cp->c_arg, code))
+			ret = cp->c_name;
+		mutex_enter(&ct->ct_lock);
+
+		cp->c_flag &= ~CALLB_EXECUTING;
+		cv_broadcast(&cp->c_done_cv);
+	}
+	mutex_exit(&ct->ct_lock);
+	return (ret);
+}
+
+/*
+ * Callers must make sure there are no recursive entries to this function.
+ * cp->cc_lockp is registered by callb_add to protect the callb_cpr_t
+ * structure.
+ *
+ * When calling to stop a kernel thread (code == CB_CODE_CPR_CHKPT) we
+ * use a timed condition-variable wait in case the kernel thread is blocked.
+ *
+ * Note that this is a generic callback handler for daemon CPR and
+ * should NOT be changed to accommodate any specific requirement in a daemon.
+ * Individual daemons that require changes to the handler shall write
+ * callback routines in their own daemon modules.
+ */
+boolean_t
+callb_generic_cpr(void *arg, int code)
+{
+	callb_cpr_t *cp = (callb_cpr_t *)arg;
+	clock_t ret = 0;			/* assume success */
+
+	mutex_enter(cp->cc_lockp);
+
+	switch (code) {
+	case CB_CODE_CPR_CHKPT:
+		cp->cc_events |= CALLB_CPR_START;
+#ifdef CPR_NOT_THREAD_SAFE
+		while (!(cp->cc_events & CALLB_CPR_SAFE))
+			/* cv_reltimedwait() returns -1 if it times out.
*/ + if ((ret = cv_reltimedwait(&cp->cc_callb_cv, + cp->cc_lockp, (callb_timeout_sec * hz), + TR_CLOCK_TICK)) == -1) + break; +#endif + break; + + case CB_CODE_CPR_RESUME: + cp->cc_events &= ~CALLB_CPR_START; + cv_signal(&cp->cc_stop_cv); + break; + } + mutex_exit(cp->cc_lockp); + return (ret != -1); +} + +/* + * The generic callback function associated with kernel threads which + * are always considered safe. + */ +/* ARGSUSED */ +boolean_t +callb_generic_cpr_safe(void *arg, int code) +{ + return (B_TRUE); +} +/* + * Prevent additions to callback table. + */ +void +callb_lock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy == 0); + ct->ct_busy = 1; + mutex_exit(&ct->ct_lock); +} + +/* + * Allow additions to callback table. + */ +void +callb_unlock_table(void) +{ + mutex_enter(&ct->ct_lock); + ASSERT(ct->ct_busy != 0); + ct->ct_busy = 0; + cv_broadcast(&ct->ct_busy_cv); + mutex_exit(&ct->ct_lock); +} + +SYSINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_init, NULL); +SYSUNINIT(sol_callb, SI_SUB_DRIVERS, SI_ORDER_FIRST, callb_fini, NULL); diff --git a/module/os/freebsd/spl/list.c b/module/os/freebsd/spl/list.c new file mode 100644 index 000000000000..21230b2adddb --- /dev/null +++ b/module/os/freebsd/spl/list.c @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +/* + * Generic doubly-linked list implementation + */ + +#include +#include +#include +#include +#include + +#define list_d2l(a, obj) ((list_node_t *)(((char *)obj) + (a)->list_offset)) +#define list_object(a, node) ((void *)(((char *)node) - (a)->list_offset)) +#define list_empty(a) ((a)->list_head.list_next == &(a)->list_head) + +#define list_insert_after_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_prev = (node); \ + lnew->list_next = (node)->list_next; \ + (node)->list_next->list_prev = lnew; \ + (node)->list_next = lnew; \ +} + +#define list_insert_before_node(list, node, object) { \ + list_node_t *lnew = list_d2l(list, object); \ + lnew->list_next = (node); \ + lnew->list_prev = (node)->list_prev; \ + (node)->list_prev->list_next = lnew; \ + (node)->list_prev = lnew; \ +} + +#define list_remove_node(node) \ + (node)->list_prev->list_next = (node)->list_next; \ + (node)->list_next->list_prev = (node)->list_prev; \ + (node)->list_next = (node)->list_prev = NULL + +void +list_create(list_t *list, size_t size, size_t offset) +{ + ASSERT(list); + ASSERT(size > 0); + ASSERT(size >= offset + sizeof (list_node_t)); + + list->list_size = size; + list->list_offset = offset; + list->list_head.list_next = list->list_head.list_prev = + &list->list_head; +} + +void +list_destroy(list_t *list) +{ + list_node_t *node = &list->list_head; + + ASSERT(list); + ASSERT(list->list_head.list_next == node); + ASSERT(list->list_head.list_prev == node); + + node->list_next = node->list_prev = NULL; +} + +void +list_insert_after(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_head(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_after_node(list, lold, nobject); + } +} + +void +list_insert_before(list_t *list, void *object, void *nobject) +{ + if (object == NULL) { + list_insert_tail(list, nobject); + } else { + list_node_t *lold = list_d2l(list, object); + list_insert_before_node(list, lold, nobject); + } +} + +void +list_insert_head(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_after_node(list, lold, object); +} + +void +list_insert_tail(list_t *list, void *object) +{ + list_node_t *lold = &list->list_head; + list_insert_before_node(list, lold, object); +} + +void +list_remove(list_t *list, void *object) +{ + list_node_t *lold = list_d2l(list, object); + ASSERT(!list_empty(list)); + ASSERT(lold->list_next != NULL); + list_remove_node(lold); +} + +void * +list_remove_head(list_t *list) +{ + list_node_t *head = list->list_head.list_next; + if (head == &list->list_head) + return (NULL); + list_remove_node(head); + return (list_object(list, head)); +} + +void * +list_remove_tail(list_t *list) +{ + list_node_t *tail = list->list_head.list_prev; + if (tail == &list->list_head) + return (NULL); + list_remove_node(tail); + return (list_object(list, tail)); +} + +void * +list_head(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_next)); +} + +void * +list_tail(list_t *list) +{ + if (list_empty(list)) + return (NULL); + return (list_object(list, list->list_head.list_prev)); +} + +void * +list_next(list_t *list, void *object) +{ + list_node_t *node = list_d2l(list, object); + + if (node->list_next != &list->list_head) + return (list_object(list, node->list_next)); + + return (NULL); +} + +void * +list_prev(list_t *list, void *object) +{ + list_node_t *node = 
list_d2l(list, object); + + if (node->list_prev != &list->list_head) + return (list_object(list, node->list_prev)); + + return (NULL); +} + +/* + * Insert src list after dst list. Empty src list thereafter. + */ +void +list_move_tail(list_t *dst, list_t *src) +{ + list_node_t *dstnode = &dst->list_head; + list_node_t *srcnode = &src->list_head; + + ASSERT(dst->list_size == src->list_size); + ASSERT(dst->list_offset == src->list_offset); + + if (list_empty(src)) + return; + + dstnode->list_prev->list_next = srcnode->list_next; + srcnode->list_next->list_prev = dstnode->list_prev; + dstnode->list_prev = srcnode->list_prev; + srcnode->list_prev->list_next = dstnode; + + /* empty src list */ + srcnode->list_next = srcnode->list_prev = srcnode; +} + +void +list_link_replace(list_node_t *lold, list_node_t *lnew) +{ + ASSERT(list_link_active(lold)); + ASSERT(!list_link_active(lnew)); + + lnew->list_next = lold->list_next; + lnew->list_prev = lold->list_prev; + lold->list_prev->list_next = lnew; + lold->list_next->list_prev = lnew; + lold->list_next = lold->list_prev = NULL; +} + +void +list_link_init(list_node_t *link) +{ + link->list_next = NULL; + link->list_prev = NULL; +} + +int +list_link_active(list_node_t *link) +{ + EQUIV(link->list_next == NULL, link->list_prev == NULL); + return (link->list_next != NULL); +} + +int +list_is_empty(list_t *list) +{ + return (list_empty(list)); +} diff --git a/module/os/freebsd/spl/sha224.h b/module/os/freebsd/spl/sha224.h new file mode 100644 index 000000000000..0abd43068708 --- /dev/null +++ b/module/os/freebsd/spl/sha224.h @@ -0,0 +1,96 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SHA224_H_ +#define _SHA224_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA224_BLOCK_LENGTH 64 +#define SHA224_DIGEST_LENGTH 28 +#define SHA224_DIGEST_STRING_LENGTH (SHA224_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA224Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA224_BLOCK_LENGTH]; +} SHA224_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA224_Init +#define SHA224_Init _libmd_SHA224_Init +#endif +#ifndef SHA224_Update +#define SHA224_Update _libmd_SHA224_Update +#endif +#ifndef SHA224_Final +#define SHA224_Final _libmd_SHA224_Final +#endif +#ifndef SHA224_End +#define SHA224_End _libmd_SHA224_End +#endif +#ifndef SHA224_Fd +#define SHA224_Fd _libmd_SHA224_Fd +#endif +#ifndef SHA224_FdChunk +#define SHA224_FdChunk _libmd_SHA224_FdChunk +#endif +#ifndef SHA224_File +#define SHA224_File _libmd_SHA224_File +#endif +#ifndef SHA224_FileChunk +#define SHA224_FileChunk _libmd_SHA224_FileChunk +#endif +#ifndef SHA224_Data +#define SHA224_Data _libmd_SHA224_Data +#endif + +#ifndef SHA224_version +#define SHA224_version _libmd_SHA224_version +#endif + +void SHA224_Init(SHA224_CTX *); +void SHA224_Update(SHA224_CTX *, const void *, size_t); +void SHA224_Final(unsigned char [__min_size(SHA224_DIGEST_LENGTH)], + SHA224_CTX *); +#ifndef _KERNEL +char *SHA224_End(SHA224_CTX *, char *); +char *SHA224_Data(const void *, unsigned int, char *); +char *SHA224_Fd(int, char *); +char *SHA224_FdChunk(int, char *, off_t, off_t); +char *SHA224_File(const char *, char *); +char *SHA224_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA224_H_ */ diff --git a/module/os/freebsd/spl/sha256.h b/module/os/freebsd/spl/sha256.h new file mode 100644 index 000000000000..193c0c025120 --- /dev/null +++ b/module/os/freebsd/spl/sha256.h @@ -0,0 +1,99 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +#ifndef _SHA256_H_ +#define _SHA256_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA256_BLOCK_LENGTH 64 +#define SHA256_DIGEST_LENGTH 32 +#define SHA256_DIGEST_STRING_LENGTH (SHA256_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA256Context { + uint32_t state[8]; + uint64_t count; + uint8_t buf[SHA256_BLOCK_LENGTH]; +} SHA256_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ + +#ifndef SHA256_Init +#define SHA256_Init _libmd_SHA256_Init +#endif +#ifndef SHA256_Update +#define SHA256_Update _libmd_SHA256_Update +#endif +#ifndef SHA256_Final +#define SHA256_Final _libmd_SHA256_Final +#endif +#ifndef SHA256_End +#define SHA256_End _libmd_SHA256_End +#endif +#ifndef SHA256_Fd +#define SHA256_Fd _libmd_SHA256_Fd +#endif +#ifndef SHA256_FdChunk +#define SHA256_FdChunk _libmd_SHA256_FdChunk +#endif +#ifndef SHA256_File +#define SHA256_File _libmd_SHA256_File +#endif +#ifndef SHA256_FileChunk +#define SHA256_FileChunk _libmd_SHA256_FileChunk +#endif +#ifndef SHA256_Data +#define SHA256_Data _libmd_SHA256_Data +#endif + +#ifndef SHA256_Transform +#define SHA256_Transform _libmd_SHA256_Transform +#endif +#ifndef SHA256_version +#define SHA256_version _libmd_SHA256_version +#endif + +void SHA256_Init(SHA256_CTX *); +void SHA256_Update(SHA256_CTX *, const void *, size_t); +void SHA256_Final(unsigned char [__min_size(SHA256_DIGEST_LENGTH)], + SHA256_CTX *); +#ifndef _KERNEL +char *SHA256_End(SHA256_CTX *, char *); +char *SHA256_Data(const void *, unsigned int, char *); +char *SHA256_Fd(int, char *); +char *SHA256_FdChunk(int, char *, off_t, off_t); +char *SHA256_File(const char *, char *); +char *SHA256_FileChunk(const char *, char *, off_t, off_t); +#endif +__END_DECLS + +#endif /* !_SHA256_H_ */ diff --git a/module/os/freebsd/spl/sha256c.c b/module/os/freebsd/spl/sha256c.c new file mode 100644 index 000000000000..241cf8c9ae76 --- /dev/null +++ b/module/os/freebsd/spl/sha256c.c @@ -0,0 +1,378 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include + +#ifdef _KERNEL +#include +#else +#include +#endif + + +#include +#include +#include "sha224.h" +#include "sha256.h" + +#if BYTE_ORDER == BIG_ENDIAN + +/* Copy a vector of big-endian uint32_t into a vector of bytes */ +#define be32enc_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +/* Copy a vector of bytes into a vector of big-endian uint32_t */ +#define be32dec_vect(dst, src, len) \ + memcpy((void *)dst, (const void *)src, (size_t)len) + +#else /* BYTE_ORDER != BIG_ENDIAN */ + +/* + * Encode a length len/4 vector of (uint32_t) into a length len vector of + * (unsigned char) in big-endian form. Assumes len is a multiple of 4. + */ +static void +be32enc_vect(unsigned char *dst, const uint32_t *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + be32enc(dst + i * 4, src[i]); +} + +/* + * Decode a big-endian length len vector of (unsigned char) into a length + * len/4 vector of (uint32_t). Assumes len is a multiple of 4. + */ +static void +be32dec_vect(uint32_t *dst, const unsigned char *src, size_t len) +{ + size_t i; + + for (i = 0; i < len / 4; i++) + dst[i] = be32dec(src + i * 4); +} + +#endif /* BYTE_ORDER != BIG_ENDIAN */ + +/* SHA256 round constants. */ +static const uint32_t K[64] = { + 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, + 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, + 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, + 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, + 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, + 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, + 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, + 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, + 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, + 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, + 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, + 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, + 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, + 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, + 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, + 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 +}; + +/* Elementary functions used by SHA256 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (32 - n))) +#define S0(x) (ROTR(x, 2) ^ ROTR(x, 13) ^ ROTR(x, 22)) +#define S1(x) (ROTR(x, 6) ^ ROTR(x, 11) ^ ROTR(x, 25)) +#define s0(x) (ROTR(x, 7) ^ ROTR(x, 18) ^ SHR(x, 3)) +#define s1(x) (ROTR(x, 17) ^ ROTR(x, 19) ^ SHR(x, 10)) + +/* SHA256 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(64 - i) % 8], S[(65 - i) % 8], \ + S[(66 - i) % 8], S[(67 - i) % 8], \ + S[(68 - i) % 8], S[(69 - i) % 8], \ + S[(70 - i) % 8], S[(71 - i) % 8], \ + W[i + ii] + K[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA256 block compression function. The 256-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA256_Transform(uint32_t *state, const unsigned char block[64]) +{ + uint32_t W[64]; + uint32_t S[8]; + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be32dec_vect(W, block, 64); + + /* 2. Initialize working variables. */ + memcpy(S, state, 32); + + /* 3. Mix. 
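+	 * Sixty-four rounds in all; after each group of sixteen, the
+	 * MSCH macros extend the message schedule in place (skipped
+	 * after the final group).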
*/ + for (i = 0; i < 64; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 48) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static unsigned char PAD[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA256_Pad(SHA256_CTX * ctx) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count >> 3) & 0x3f; + + /* Pad to 56 mod 64, transforming if we finish a block en route. */ + if (r < 56) { + /* Pad to 56 mod 64. */ + memcpy(&ctx->buf[r], PAD, 56 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 56); + } + + /* Add the terminating bit-count. */ + be64enc(&ctx->buf[56], ctx->count); + + /* Mix in the final block. */ + SHA256_Transform(ctx->state, ctx->buf); +} + +/* SHA-256 initialization. Begins a SHA-256 operation. */ +void +SHA256_Init(SHA256_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6A09E667; + ctx->state[1] = 0xBB67AE85; + ctx->state[2] = 0x3C6EF372; + ctx->state[3] = 0xA54FF53A; + ctx->state[4] = 0x510E527F; + ctx->state[5] = 0x9B05688C; + ctx->state[6] = 0x1F83D9AB; + ctx->state[7] = 0x5BE0CD19; +} + +/* Add bytes into the hash */ +void +SHA256_Update(SHA256_CTX * ctx, const void *in, size_t len) +{ + uint64_t bitlen; + uint32_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count >> 3) & 0x3f; + + /* Convert the length into a number of bits */ + bitlen = len << 3; + + /* Update number of bits */ + ctx->count += bitlen; + + /* Handle the case where we don't need to perform any transforms */ + if (len < 64 - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, 64 - r); + SHA256_Transform(ctx->state, ctx->buf); + src += 64 - r; + len -= 64 - r; + + /* Perform complete blocks */ + while (len >= 64) { + SHA256_Transform(ctx->state, src); + src += 64; + len -= 64; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-256 finalization. Pads the input data, exports the hash value, + * and clears the context state. 
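[Because SHA256_Update() above buffers partial blocks in ctx->buf and tracks the bit count in ctx->count, input may arrive in arbitrarily sized pieces without changing the result. A small userland sketch of that streaming property:]

#include <assert.h>
#include <string.h>
#include "sha256.h"

static void
check_streaming(const unsigned char *msg, size_t len)
{
	SHA256_CTX ctx;
	unsigned char one[SHA256_DIGEST_LENGTH], two[SHA256_DIGEST_LENGTH];

	/* Hash everything in one call. */
	SHA256_Init(&ctx);
	SHA256_Update(&ctx, msg, len);
	SHA256_Final(one, &ctx);

	/* Hash the same bytes in two pieces. */
	SHA256_Init(&ctx);
	SHA256_Update(&ctx, msg, len / 2);
	SHA256_Update(&ctx, msg + len / 2, len - len / 2);
	SHA256_Final(two, &ctx);

	assert(memcmp(one, two, SHA256_DIGEST_LENGTH) == 0);
}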
+ */ +void +SHA256_Final(unsigned char digest[static SHA256_DIGEST_LENGTH], SHA256_CTX *ctx) +{ + + /* Add padding */ + SHA256_Pad(ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, SHA256_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* SHA-224: ******************************************************* */ +/* + * the SHA224 and SHA256 transforms are identical + */ + +/* SHA-224 initialization. Begins a SHA-224 operation. */ +void +SHA224_Init(SHA224_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0xC1059ED8; + ctx->state[1] = 0x367CD507; + ctx->state[2] = 0x3070DD17; + ctx->state[3] = 0xF70E5939; + ctx->state[4] = 0xFFC00B31; + ctx->state[5] = 0x68581511; + ctx->state[6] = 0x64f98FA7; + ctx->state[7] = 0xBEFA4FA4; +} + +/* Add bytes into the SHA-224 hash */ +void +SHA224_Update(SHA224_CTX * ctx, const void *in, size_t len) +{ + + SHA256_Update((SHA256_CTX *)ctx, in, len); +} + +/* + * SHA-224 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA224_Final(unsigned char digest[static SHA224_DIGEST_LENGTH], SHA224_CTX *ctx) +{ + + /* Add padding */ + SHA256_Pad((SHA256_CTX *)ctx); + + /* Write the hash */ + be32enc_vect(digest, ctx->state, SHA224_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +#ifdef WEAK_REFS +/* + * When building libmd, provide weak references. Note: this is not + * activated in the context of compiling these sources for internal + * use in libcrypt. + */ +#undef SHA256_Init +__weak_reference(_libmd_SHA256_Init, SHA256_Init); +#undef SHA256_Update +__weak_reference(_libmd_SHA256_Update, SHA256_Update); +#undef SHA256_Final +__weak_reference(_libmd_SHA256_Final, SHA256_Final); +#undef SHA256_Transform +__weak_reference(_libmd_SHA256_Transform, SHA256_Transform); + +#undef SHA224_Init +__weak_reference(_libmd_SHA224_Init, SHA224_Init); +#undef SHA224_Update +__weak_reference(_libmd_SHA224_Update, SHA224_Update); +#undef SHA224_Final +__weak_reference(_libmd_SHA224_Final, SHA224_Final); +#endif diff --git a/module/os/freebsd/spl/sha384.h b/module/os/freebsd/spl/sha384.h new file mode 100644 index 000000000000..67250cee0313 --- /dev/null +++ b/module/os/freebsd/spl/sha384.h @@ -0,0 +1,96 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
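[SHA-224, implemented above, is the SHA-256 engine with a different initialization vector and a 28-byte output: be32enc_vect() with SHA224_DIGEST_LENGTH copies exactly seven of the eight state words. A usage sketch, assuming sha224.h declares SHA224_CTX the way sha256.h declares SHA256_CTX:]

#include <stddef.h>
#include "sha224.h"

static void
sha224_of(const void *data, size_t datalen,
    unsigned char digest[SHA224_DIGEST_LENGTH])
{
	SHA224_CTX ctx;

	SHA224_Init(&ctx);
	SHA224_Update(&ctx, data, datalen);
	SHA224_Final(digest, &ctx);	/* emits state[0..6], big-endian */
}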
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA384_H_ +#define _SHA384_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA384_BLOCK_LENGTH 128 +#define SHA384_DIGEST_LENGTH 48 +#define SHA384_DIGEST_STRING_LENGTH (SHA384_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA384Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA384_BLOCK_LENGTH]; +} SHA384_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA384_Init +#define SHA384_Init _libmd_SHA384_Init +#endif +#ifndef SHA384_Update +#define SHA384_Update _libmd_SHA384_Update +#endif +#ifndef SHA384_Final +#define SHA384_Final _libmd_SHA384_Final +#endif +#ifndef SHA384_End +#define SHA384_End _libmd_SHA384_End +#endif +#ifndef SHA384_Fd +#define SHA384_Fd _libmd_SHA384_Fd +#endif +#ifndef SHA384_FdChunk +#define SHA384_FdChunk _libmd_SHA384_FdChunk +#endif +#ifndef SHA384_File +#define SHA384_File _libmd_SHA384_File +#endif +#ifndef SHA384_FileChunk +#define SHA384_FileChunk _libmd_SHA384_FileChunk +#endif +#ifndef SHA384_Data +#define SHA384_Data _libmd_SHA384_Data +#endif + +#ifndef SHA384_version +#define SHA384_version _libmd_SHA384_version +#endif + +void SHA384_Init(SHA384_CTX *); +void SHA384_Update(SHA384_CTX *, const void *, size_t); +void SHA384_Final(unsigned char [__min_size(SHA384_DIGEST_LENGTH)], + SHA384_CTX *); +#ifndef _KERNEL +char *SHA384_End(SHA384_CTX *, char *); +char *SHA384_Data(const void *, unsigned int, char *); +char *SHA384_Fd(int, char *); +char *SHA384_FdChunk(int, char *, off_t, off_t); +char *SHA384_File(const char *, char *); +char *SHA384_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA384_H_ */ diff --git a/module/os/freebsd/spl/sha512.h b/module/os/freebsd/spl/sha512.h new file mode 100644 index 000000000000..b6fb733ca54e --- /dev/null +++ b/module/os/freebsd/spl/sha512.h @@ -0,0 +1,101 @@ +/* + * Copyright 2005 Colin Percival + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
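[SHA384_DIGEST_STRING_LENGTH above reserves two hex digits per digest byte plus a terminating NUL, which is the shape of the string the userland SHA384_End()/SHA384_Data() helpers return. A sketch of that encoding under those assumptions; this is not the libmd implementation itself:]

#include <stddef.h>

static char *
digest_to_string(const unsigned char *digest, size_t len, char *buf)
{
	static const char hex[] = "0123456789abcdef";

	for (size_t i = 0; i < len; i++) {
		buf[i * 2] = hex[digest[i] >> 4];
		buf[i * 2 + 1] = hex[digest[i] & 0x0f];
	}
	buf[len * 2] = '\0';	/* hence len * 2 + 1 bytes total */
	return (buf);
}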
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA512_H_ +#define _SHA512_H_ + +#ifndef _KERNEL +#include +#endif + +#define SHA512_BLOCK_LENGTH 128 +#define SHA512_DIGEST_LENGTH 64 +#define SHA512_DIGEST_STRING_LENGTH (SHA512_DIGEST_LENGTH * 2 + 1) + +typedef struct SHA512Context { + uint64_t state[8]; + uint64_t count[2]; + uint8_t buf[SHA512_BLOCK_LENGTH]; +} SHA512_CTX; + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#if 0 +#ifndef SHA512_Init +#define SHA512_Init _libmd_SHA512_Init +#endif +#ifndef SHA512_Update +#define SHA512_Update _libmd_SHA512_Update +#endif +#ifndef SHA512_Final +#define SHA512_Final _libmd_SHA512_Final +#endif +#endif +#ifndef SHA512_End +#define SHA512_End _libmd_SHA512_End +#endif +#ifndef SHA512_Fd +#define SHA512_Fd _libmd_SHA512_Fd +#endif +#ifndef SHA512_FdChunk +#define SHA512_FdChunk _libmd_SHA512_FdChunk +#endif +#ifndef SHA512_File +#define SHA512_File _libmd_SHA512_File +#endif +#ifndef SHA512_FileChunk +#define SHA512_FileChunk _libmd_SHA512_FileChunk +#endif +#ifndef SHA512_Data +#define SHA512_Data _libmd_SHA512_Data +#endif + +#ifndef SHA512_Transform +#define SHA512_Transform _libmd_SHA512_Transform +#endif +#ifndef SHA512_version +#define SHA512_version _libmd_SHA512_version +#endif + +void SHA512_Init(SHA512_CTX *); +void SHA512_Update(SHA512_CTX *, const void *, size_t); +void SHA512_Final(unsigned char [__min_size(SHA512_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_End(SHA512_CTX *, char *); +char *SHA512_Data(const void *, unsigned int, char *); +char *SHA512_Fd(int, char *); +char *SHA512_FdChunk(int, char *, off_t, off_t); +char *SHA512_File(const char *, char *); +char *SHA512_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512_H_ */ diff --git a/module/os/freebsd/spl/sha512c.c b/module/os/freebsd/spl/sha512c.c new file mode 100644 index 000000000000..146f338f0ed4 --- /dev/null +++ b/module/os/freebsd/spl/sha512c.c @@ -0,0 +1,508 @@ +/* + * Copyright 2005 Colin Percival + * Copyright (c) 2015 Allan Jude + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include <sys/endian.h>
+#include <sys/types.h>
+
+#ifdef _KERNEL
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include "sha512.h"
+#include "sha512t.h"
+#include "sha384.h"
+
+#if BYTE_ORDER == BIG_ENDIAN
+
+/* Copy a vector of big-endian uint64_t into a vector of bytes */
+#define be64enc_vect(dst, src, len) \
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+/* Copy a vector of bytes into a vector of big-endian uint64_t */
+#define be64dec_vect(dst, src, len) \
+	memcpy((void *)dst, (const void *)src, (size_t)len)
+
+#else /* BYTE_ORDER != BIG_ENDIAN */
+
+/*
+ * Encode a length len/8 vector of (uint64_t) into a length len vector of
+ * (unsigned char) in big-endian form.  Assumes len is a multiple of 8.
+ */
+static void
+be64enc_vect(unsigned char *dst, const uint64_t *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 8; i++)
+		be64enc(dst + i * 8, src[i]);
+}
+
+/*
+ * Decode a big-endian length len vector of (unsigned char) into a length
+ * len/8 vector of (uint64_t).  Assumes len is a multiple of 8.
+ */
+static void
+be64dec_vect(uint64_t *dst, const unsigned char *src, size_t len)
+{
+	size_t i;
+
+	for (i = 0; i < len / 8; i++)
+		dst[i] = be64dec(src + i * 8);
+}
+
+#endif /* BYTE_ORDER != BIG_ENDIAN */
+
+/* SHA512 round constants.
*/ +static const uint64_t K[80] = { + 0x428a2f98d728ae22ULL, 0x7137449123ef65cdULL, + 0xb5c0fbcfec4d3b2fULL, 0xe9b5dba58189dbbcULL, + 0x3956c25bf348b538ULL, 0x59f111f1b605d019ULL, + 0x923f82a4af194f9bULL, 0xab1c5ed5da6d8118ULL, + 0xd807aa98a3030242ULL, 0x12835b0145706fbeULL, + 0x243185be4ee4b28cULL, 0x550c7dc3d5ffb4e2ULL, + 0x72be5d74f27b896fULL, 0x80deb1fe3b1696b1ULL, + 0x9bdc06a725c71235ULL, 0xc19bf174cf692694ULL, + 0xe49b69c19ef14ad2ULL, 0xefbe4786384f25e3ULL, + 0x0fc19dc68b8cd5b5ULL, 0x240ca1cc77ac9c65ULL, + 0x2de92c6f592b0275ULL, 0x4a7484aa6ea6e483ULL, + 0x5cb0a9dcbd41fbd4ULL, 0x76f988da831153b5ULL, + 0x983e5152ee66dfabULL, 0xa831c66d2db43210ULL, + 0xb00327c898fb213fULL, 0xbf597fc7beef0ee4ULL, + 0xc6e00bf33da88fc2ULL, 0xd5a79147930aa725ULL, + 0x06ca6351e003826fULL, 0x142929670a0e6e70ULL, + 0x27b70a8546d22ffcULL, 0x2e1b21385c26c926ULL, + 0x4d2c6dfc5ac42aedULL, 0x53380d139d95b3dfULL, + 0x650a73548baf63deULL, 0x766a0abb3c77b2a8ULL, + 0x81c2c92e47edaee6ULL, 0x92722c851482353bULL, + 0xa2bfe8a14cf10364ULL, 0xa81a664bbc423001ULL, + 0xc24b8b70d0f89791ULL, 0xc76c51a30654be30ULL, + 0xd192e819d6ef5218ULL, 0xd69906245565a910ULL, + 0xf40e35855771202aULL, 0x106aa07032bbd1b8ULL, + 0x19a4c116b8d2d0c8ULL, 0x1e376c085141ab53ULL, + 0x2748774cdf8eeb99ULL, 0x34b0bcb5e19b48a8ULL, + 0x391c0cb3c5c95a63ULL, 0x4ed8aa4ae3418acbULL, + 0x5b9cca4f7763e373ULL, 0x682e6ff3d6b2b8a3ULL, + 0x748f82ee5defb2fcULL, 0x78a5636f43172f60ULL, + 0x84c87814a1f0ab72ULL, 0x8cc702081a6439ecULL, + 0x90befffa23631e28ULL, 0xa4506cebde82bde9ULL, + 0xbef9a3f7b2c67915ULL, 0xc67178f2e372532bULL, + 0xca273eceea26619cULL, 0xd186b8c721c0c207ULL, + 0xeada7dd6cde0eb1eULL, 0xf57d4f7fee6ed178ULL, + 0x06f067aa72176fbaULL, 0x0a637dc5a2c898a6ULL, + 0x113f9804bef90daeULL, 0x1b710b35131c471bULL, + 0x28db77f523047d84ULL, 0x32caab7b40c72493ULL, + 0x3c9ebe0a15c9bebcULL, 0x431d67c49c100d4cULL, + 0x4cc5d4becb3e42b6ULL, 0x597f299cfc657e2aULL, + 0x5fcb6fab3ad6faecULL, 0x6c44198c4a475817ULL +}; + +/* Elementary functions used by SHA512 */ +#define Ch(x, y, z) ((x & (y ^ z)) ^ z) +#define Maj(x, y, z) ((x & (y | z)) | (y & z)) +#define SHR(x, n) (x >> n) +#define ROTR(x, n) ((x >> n) | (x << (64 - n))) +#define S0(x) (ROTR(x, 28) ^ ROTR(x, 34) ^ ROTR(x, 39)) +#define S1(x) (ROTR(x, 14) ^ ROTR(x, 18) ^ ROTR(x, 41)) +#define s0(x) (ROTR(x, 1) ^ ROTR(x, 8) ^ SHR(x, 7)) +#define s1(x) (ROTR(x, 19) ^ ROTR(x, 61) ^ SHR(x, 6)) + +/* SHA512 round function */ +#define RND(a, b, c, d, e, f, g, h, k) \ + h += S1(e) + Ch(e, f, g) + k; \ + d += h; \ + h += S0(a) + Maj(a, b, c); + +/* Adjusted round function for rotating state */ +#define RNDr(S, W, i, ii) \ + RND(S[(80 - i) % 8], S[(81 - i) % 8], \ + S[(82 - i) % 8], S[(83 - i) % 8], \ + S[(84 - i) % 8], S[(85 - i) % 8], \ + S[(86 - i) % 8], S[(87 - i) % 8], \ + W[i + ii] + K[i + ii]) + +/* Message schedule computation */ +#define MSCH(W, ii, i) \ + W[i + ii + 16] = s1(W[i + ii + 14]) + W[i + ii + 9] + \ + s0(W[i + ii + 1]) + W[i + ii] + +/* + * SHA512 block compression function. The 512-bit state is transformed via + * the 512-bit input block to produce a new state. + */ +static void +SHA512_Transform(uint64_t *state, + const unsigned char block[SHA512_BLOCK_LENGTH]) +{ + uint64_t W[80]; + uint64_t S[8]; + int i; + + /* 1. Prepare the first part of the message schedule W. */ + be64dec_vect(W, block, SHA512_BLOCK_LENGTH); + + /* 2. Initialize working variables. */ + memcpy(S, state, SHA512_DIGEST_LENGTH); + + /* 3. Mix. 
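[The MSCH macro above batches the message-schedule recurrence sixteen words at a time so it can be interleaved with the rounds below. Written as the plain FIPS 180-4 loop, the same expansion is the following illustrative sketch, reusing the s0/s1 macros defined above; W[0..15] are preloaded by be64dec_vect():]

static void
sha512_schedule_ref(uint64_t W[80])
{
	int t;

	/* W[t] = s1(W[t-2]) + W[t-7] + s0(W[t-15]) + W[t-16] */
	for (t = 16; t < 80; t++)
		W[t] = s1(W[t - 2]) + W[t - 7] +
		    s0(W[t - 15]) + W[t - 16];
}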
*/ + for (i = 0; i < 80; i += 16) { + RNDr(S, W, 0, i); + RNDr(S, W, 1, i); + RNDr(S, W, 2, i); + RNDr(S, W, 3, i); + RNDr(S, W, 4, i); + RNDr(S, W, 5, i); + RNDr(S, W, 6, i); + RNDr(S, W, 7, i); + RNDr(S, W, 8, i); + RNDr(S, W, 9, i); + RNDr(S, W, 10, i); + RNDr(S, W, 11, i); + RNDr(S, W, 12, i); + RNDr(S, W, 13, i); + RNDr(S, W, 14, i); + RNDr(S, W, 15, i); + + if (i == 64) + break; + MSCH(W, 0, i); + MSCH(W, 1, i); + MSCH(W, 2, i); + MSCH(W, 3, i); + MSCH(W, 4, i); + MSCH(W, 5, i); + MSCH(W, 6, i); + MSCH(W, 7, i); + MSCH(W, 8, i); + MSCH(W, 9, i); + MSCH(W, 10, i); + MSCH(W, 11, i); + MSCH(W, 12, i); + MSCH(W, 13, i); + MSCH(W, 14, i); + MSCH(W, 15, i); + } + + /* 4. Mix local working variables into global state */ + for (i = 0; i < 8; i++) + state[i] += S[i]; +} + +static unsigned char PAD[SHA512_BLOCK_LENGTH] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* Add padding and terminating bit-count. */ +static void +SHA512_Pad(SHA512_CTX * ctx) +{ + size_t r; + + /* Figure out how many bytes we have buffered. */ + r = (ctx->count[1] >> 3) & 0x7f; + + /* Pad to 112 mod 128, transforming if we finish a block en route. */ + if (r < 112) { + /* Pad to 112 mod 128. */ + memcpy(&ctx->buf[r], PAD, 112 - r); + } else { + /* Finish the current block and mix. */ + memcpy(&ctx->buf[r], PAD, 128 - r); + SHA512_Transform(ctx->state, ctx->buf); + + /* The start of the final block is all zeroes. */ + memset(&ctx->buf[0], 0, 112); + } + + /* Add the terminating bit-count. */ + be64enc_vect(&ctx->buf[112], ctx->count, 16); + + /* Mix in the final block. */ + SHA512_Transform(ctx->state, ctx->buf); +} + +/* SHA-512 initialization. Begins a SHA-512 operation. 
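[SHA512_Pad() above pads to 112 mod 128 because the trailing length field is 16 bytes (a 128-bit bit count), where SHA-256 pads to 56 mod 64 ahead of its 8-byte count. A small illustrative helper showing the total padded size for an n-byte message:]

#include <stddef.h>

static size_t
sha512_padded_len(size_t n)
{
	/*
	 * One mandatory 0x80 byte, k zero bytes so that
	 * (n + 1 + k) % 128 == 112, then the 16-byte bit count:
	 * round n + 1 + 16 up to the next multiple of 128.
	 */
	return ((n + 1 + 16 + 127) & ~(size_t)127);
}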
*/ +void +SHA512_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x6a09e667f3bcc908ULL; + ctx->state[1] = 0xbb67ae8584caa73bULL; + ctx->state[2] = 0x3c6ef372fe94f82bULL; + ctx->state[3] = 0xa54ff53a5f1d36f1ULL; + ctx->state[4] = 0x510e527fade682d1ULL; + ctx->state[5] = 0x9b05688c2b3e6c1fULL; + ctx->state[6] = 0x1f83d9abfb41bd6bULL; + ctx->state[7] = 0x5be0cd19137e2179ULL; +} + +/* Add bytes into the hash */ +void +SHA512_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + uint64_t bitlen[2]; + uint64_t r; + const unsigned char *src = in; + + /* Number of bytes left in the buffer from previous updates */ + r = (ctx->count[1] >> 3) & 0x7f; + + /* Convert the length into a number of bits */ + bitlen[1] = ((uint64_t)len) << 3; + bitlen[0] = ((uint64_t)len) >> 61; + + /* Update number of bits */ + if ((ctx->count[1] += bitlen[1]) < bitlen[1]) + ctx->count[0]++; + ctx->count[0] += bitlen[0]; + + /* Handle the case where we don't need to perform any transforms */ + if (len < SHA512_BLOCK_LENGTH - r) { + memcpy(&ctx->buf[r], src, len); + return; + } + + /* Finish the current block */ + memcpy(&ctx->buf[r], src, SHA512_BLOCK_LENGTH - r); + SHA512_Transform(ctx->state, ctx->buf); + src += SHA512_BLOCK_LENGTH - r; + len -= SHA512_BLOCK_LENGTH - r; + + /* Perform complete blocks */ + while (len >= SHA512_BLOCK_LENGTH) { + SHA512_Transform(ctx->state, src); + src += SHA512_BLOCK_LENGTH; + len -= SHA512_BLOCK_LENGTH; + } + + /* Copy left over data into buffer */ + memcpy(ctx->buf, src, len); +} + +/* + * SHA-512 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA512_Final(unsigned char digest[static SHA512_DIGEST_LENGTH], SHA512_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* SHA-512t: ******************************************************** */ +/* + * the SHA512t transforms are identical to SHA512 so reuse the existing function + */ +void +SHA512_224_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x8c3d37c819544da2ULL; + ctx->state[1] = 0x73e1996689dcd4d6ULL; + ctx->state[2] = 0x1dfab7ae32ff9c82ULL; + ctx->state[3] = 0x679dd514582f9fcfULL; + ctx->state[4] = 0x0f6d2b697bd44da8ULL; + ctx->state[5] = 0x77e36f7304c48942ULL; + ctx->state[6] = 0x3f9d85a86a1d36c8ULL; + ctx->state[7] = 0x1112e6ad91d692a1ULL; +} + +void +SHA512_224_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update(ctx, in, len); +} + +void +SHA512_224_Final(unsigned char digest[static SHA512_224_DIGEST_LENGTH], + SHA512_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_224_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +void +SHA512_256_Init(SHA512_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0x22312194fc2bf72cULL; + ctx->state[1] = 0x9f555fa3c84c64c2ULL; + ctx->state[2] = 0x2393b86b6f53b151ULL; + ctx->state[3] = 0x963877195940eabdULL; + ctx->state[4] = 0x96283ee2a88effe3ULL; + ctx->state[5] = 0xbe5e1e2553863992ULL; + ctx->state[6] = 0x2b0199fc2c85b8aaULL; + 
ctx->state[7] = 0x0eb72ddc81c52ca2ULL; +} + +void +SHA512_256_Update(SHA512_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update(ctx, in, len); +} + +void +SHA512_256_Final(unsigned char digest[static SHA512_256_DIGEST_LENGTH], + SHA512_CTX * ctx) +{ + + /* Add padding */ + SHA512_Pad(ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA512_256_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +/* ** SHA-384: ******************************************************** */ +/* + * the SHA384 and SHA512 transforms are identical, so SHA384 is skipped + */ + +/* SHA-384 initialization. Begins a SHA-384 operation. */ +void +SHA384_Init(SHA384_CTX * ctx) +{ + + /* Zero bits processed so far */ + ctx->count[0] = ctx->count[1] = 0; + + /* Magic initialization constants */ + ctx->state[0] = 0xcbbb9d5dc1059ed8ULL; + ctx->state[1] = 0x629a292a367cd507ULL; + ctx->state[2] = 0x9159015a3070dd17ULL; + ctx->state[3] = 0x152fecd8f70e5939ULL; + ctx->state[4] = 0x67332667ffc00b31ULL; + ctx->state[5] = 0x8eb44a8768581511ULL; + ctx->state[6] = 0xdb0c2e0d64f98fa7ULL; + ctx->state[7] = 0x47b5481dbefa4fa4ULL; +} + +/* Add bytes into the SHA-384 hash */ +void +SHA384_Update(SHA384_CTX * ctx, const void *in, size_t len) +{ + + SHA512_Update((SHA512_CTX *)ctx, in, len); +} + +/* + * SHA-384 finalization. Pads the input data, exports the hash value, + * and clears the context state. + */ +void +SHA384_Final(unsigned char digest[static SHA384_DIGEST_LENGTH], SHA384_CTX *ctx) +{ + + /* Add padding */ + SHA512_Pad((SHA512_CTX *)ctx); + + /* Write the hash */ + be64enc_vect(digest, ctx->state, SHA384_DIGEST_LENGTH); + + /* Clear the context state */ + explicit_bzero(ctx, sizeof (*ctx)); +} + +#if 0 +/* + * When building libmd, provide weak references. Note: this is not + * activated in the context of compiling these sources for internal + * use in libcrypt. + */ +#undef SHA512_Init +__weak_reference(_libmd_SHA512_Init, SHA512_Init); +#undef SHA512_Update +__weak_reference(_libmd_SHA512_Update, SHA512_Update); +#undef SHA512_Final +__weak_reference(_libmd_SHA512_Final, SHA512_Final); +#undef SHA512_Transform +__weak_reference(_libmd_SHA512_Transform, SHA512_Transform); + +#undef SHA512_224_Init +__weak_reference(_libmd_SHA512_224_Init, SHA512_224_Init); +#undef SHA512_224_Update +__weak_reference(_libmd_SHA512_224_Update, SHA512_224_Update); +#undef SHA512_224_Final +__weak_reference(_libmd_SHA512_224_Final, SHA512_224_Final); + +#undef SHA512_256_Init +__weak_reference(_libmd_SHA512_256_Init, SHA512_256_Init); +#undef SHA512_256_Update +__weak_reference(_libmd_SHA512_256_Update, SHA512_256_Update); +#undef SHA512_256_Final +__weak_reference(_libmd_SHA512_256_Final, SHA512_256_Final); + +#undef SHA384_Init +__weak_reference(_libmd_SHA384_Init, SHA384_Init); +#undef SHA384_Update +__weak_reference(_libmd_SHA384_Update, SHA384_Update); +#undef SHA384_Final +__weak_reference(_libmd_SHA384_Final, SHA384_Final); +#endif diff --git a/module/os/freebsd/spl/sha512t.h b/module/os/freebsd/spl/sha512t.h new file mode 100644 index 000000000000..703867fc0288 --- /dev/null +++ b/module/os/freebsd/spl/sha512t.h @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2015 Allan Jude + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
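[The SHA-512/t variants above reuse SHA512_CTX wholesale: only the initialization constants and the number of state bytes emitted differ. A usage sketch for SHA-512/256, which is the digest behind ZFS's sha512 checksum property:]

#include <stddef.h>
#include "sha512t.h"

static void
sha512_256_of(const void *data, size_t datalen,
    unsigned char digest[SHA512_256_DIGEST_LENGTH])
{
	SHA512_CTX ctx;	/* shared with plain SHA-512 */

	SHA512_256_Init(&ctx);
	SHA512_256_Update(&ctx, data, datalen);
	SHA512_256_Final(digest, &ctx);	/* first 32 of 64 state bytes */
}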
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +#ifndef _SHA512T_H_ +#define _SHA512T_H_ + +#include "sha512.h" + +#ifndef _KERNEL +#include +#endif + +#define SHA512_224_DIGEST_LENGTH 28 +#define SHA512_224_DIGEST_STRING_LENGTH (SHA512_224_DIGEST_LENGTH * 2 + 1) +#define SHA512_256_DIGEST_LENGTH 32 +#define SHA512_256_DIGEST_STRING_LENGTH (SHA512_256_DIGEST_LENGTH * 2 + 1) + +__BEGIN_DECLS + +/* Ensure libmd symbols do not clash with libcrypto */ +#ifndef SHA512_224_Init +#define SHA512_224_Init _libmd_SHA512_224_Init +#endif +#ifndef SHA512_224_Update +#define SHA512_224_Update _libmd_SHA512_224_Update +#endif +#ifndef SHA512_224_Final +#define SHA512_224_Final _libmd_SHA512_224_Final +#endif +#ifndef SHA512_224_End +#define SHA512_224_End _libmd_SHA512_224_End +#endif +#ifndef SHA512_224_Fd +#define SHA512_224_Fd _libmd_SHA512_224_Fd +#endif +#ifndef SHA512_224_FdChunk +#define SHA512_224_FdChunk _libmd_SHA512_224_FdChunk +#endif +#ifndef SHA512_224_File +#define SHA512_224_File _libmd_SHA512_224_File +#endif +#ifndef SHA512_224_FileChunk +#define SHA512_224_FileChunk _libmd_SHA512_224_FileChunk +#endif +#ifndef SHA512_224_Data +#define SHA512_224_Data _libmd_SHA512_224_Data +#endif + +#ifndef SHA512_224_Transform +#define SHA512_224_Transform _libmd_SHA512_224_Transform +#endif +#ifndef SHA512_224_version +#define SHA512_224_version _libmd_SHA512_224_version +#endif + +#ifndef SHA512_256_Init +#define SHA512_256_Init _libmd_SHA512_256_Init +#endif +#ifndef SHA512_256_Update +#define SHA512_256_Update _libmd_SHA512_256_Update +#endif +#ifndef SHA512_256_Final +#define SHA512_256_Final _libmd_SHA512_256_Final +#endif +#ifndef SHA512_256_End +#define SHA512_256_End _libmd_SHA512_256_End +#endif +#ifndef SHA512_256_Fd +#define SHA512_256_Fd _libmd_SHA512_256_Fd +#endif +#ifndef SHA512_256_FdChunk +#define SHA512_256_FdChunk _libmd_SHA512_256_FdChunk +#endif +#ifndef SHA512_256_File +#define SHA512_256_File _libmd_SHA512_256_File +#endif +#ifndef SHA512_256_FileChunk +#define SHA512_256_FileChunk _libmd_SHA512_256_FileChunk +#endif +#ifndef SHA512_256_Data +#define SHA512_256_Data _libmd_SHA512_256_Data +#endif + +#ifndef SHA512_256_Transform +#define SHA512_256_Transform _libmd_SHA512_256_Transform +#endif +#ifndef SHA512_256_version +#define SHA512_256_version _libmd_SHA512_256_version +#endif + +void SHA512_224_Init(SHA512_CTX *); +void SHA512_224_Update(SHA512_CTX *, const void *, size_t); +void 
SHA512_224_Final(unsigned char [__min_size(SHA512_224_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_224_End(SHA512_CTX *, char *); +char *SHA512_224_Data(const void *, unsigned int, char *); +char *SHA512_224_Fd(int, char *); +char *SHA512_224_FdChunk(int, char *, off_t, off_t); +char *SHA512_224_File(const char *, char *); +char *SHA512_224_FileChunk(const char *, char *, off_t, off_t); +#endif +void SHA512_256_Init(SHA512_CTX *); +void SHA512_256_Update(SHA512_CTX *, const void *, size_t); +void SHA512_256_Final(unsigned char [__min_size(SHA512_256_DIGEST_LENGTH)], + SHA512_CTX *); +#ifndef _KERNEL +char *SHA512_256_End(SHA512_CTX *, char *); +char *SHA512_256_Data(const void *, unsigned int, char *); +char *SHA512_256_Fd(int, char *); +char *SHA512_256_FdChunk(int, char *, off_t, off_t); +char *SHA512_256_File(const char *, char *); +char *SHA512_256_FileChunk(const char *, char *, off_t, off_t); +#endif + +__END_DECLS + +#endif /* !_SHA512T_H_ */ diff --git a/module/os/freebsd/spl/spl_acl.c b/module/os/freebsd/spl/spl_acl.c new file mode 100644 index 000000000000..18188ca0adec --- /dev/null +++ b/module/os/freebsd/spl/spl_acl.c @@ -0,0 +1,223 @@ +/* + * Copyright (c) 2008, 2009 Edward Tomasz Napierała + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +struct zfs2bsd { + uint32_t zb_zfs; + int zb_bsd; +}; + +struct zfs2bsd perms[] = {{ACE_READ_DATA, ACL_READ_DATA}, + {ACE_WRITE_DATA, ACL_WRITE_DATA}, + {ACE_EXECUTE, ACL_EXECUTE}, + {ACE_APPEND_DATA, ACL_APPEND_DATA}, + {ACE_DELETE_CHILD, ACL_DELETE_CHILD}, + {ACE_DELETE, ACL_DELETE}, + {ACE_READ_ATTRIBUTES, ACL_READ_ATTRIBUTES}, + {ACE_WRITE_ATTRIBUTES, ACL_WRITE_ATTRIBUTES}, + {ACE_READ_NAMED_ATTRS, ACL_READ_NAMED_ATTRS}, + {ACE_WRITE_NAMED_ATTRS, ACL_WRITE_NAMED_ATTRS}, + {ACE_READ_ACL, ACL_READ_ACL}, + {ACE_WRITE_ACL, ACL_WRITE_ACL}, + {ACE_WRITE_OWNER, ACL_WRITE_OWNER}, + {ACE_SYNCHRONIZE, ACL_SYNCHRONIZE}, + {0, 0}}; + +struct zfs2bsd flags[] = {{ACE_FILE_INHERIT_ACE, + ACL_ENTRY_FILE_INHERIT}, + {ACE_DIRECTORY_INHERIT_ACE, + ACL_ENTRY_DIRECTORY_INHERIT}, + {ACE_NO_PROPAGATE_INHERIT_ACE, + ACL_ENTRY_NO_PROPAGATE_INHERIT}, + {ACE_INHERIT_ONLY_ACE, + ACL_ENTRY_INHERIT_ONLY}, + {ACE_INHERITED_ACE, + ACL_ENTRY_INHERITED}, + {ACE_SUCCESSFUL_ACCESS_ACE_FLAG, + ACL_ENTRY_SUCCESSFUL_ACCESS}, + {ACE_FAILED_ACCESS_ACE_FLAG, + ACL_ENTRY_FAILED_ACCESS}, + {0, 0}}; + +static int +_bsd_from_zfs(uint32_t zfs, const struct zfs2bsd *table) +{ + const struct zfs2bsd *tmp; + int bsd = 0; + + for (tmp = table; tmp->zb_zfs != 0; tmp++) { + if (zfs & tmp->zb_zfs) + bsd |= tmp->zb_bsd; + } + + return (bsd); +} + +static uint32_t +_zfs_from_bsd(int bsd, const struct zfs2bsd *table) +{ + const struct zfs2bsd *tmp; + uint32_t zfs = 0; + + for (tmp = table; tmp->zb_bsd != 0; tmp++) { + if (bsd & tmp->zb_bsd) + zfs |= tmp->zb_zfs; + } + + return (zfs); +} + +int +acl_from_aces(struct acl *aclp, const ace_t *aces, int nentries) +{ + int i; + struct acl_entry *entry; + const ace_t *ace; + + if (nentries < 1) { + printf("acl_from_aces: empty ZFS ACL; returning EINVAL.\n"); + return (EINVAL); + } + + if (nentries > ACL_MAX_ENTRIES) { + /* + * I believe it may happen only when moving a pool + * from SunOS to FreeBSD. 
+ */ + printf("acl_from_aces: ZFS ACL too big to fit " + "into 'struct acl'; returning EINVAL.\n"); + return (EINVAL); + } + + bzero(aclp, sizeof (*aclp)); + aclp->acl_maxcnt = ACL_MAX_ENTRIES; + aclp->acl_cnt = nentries; + + for (i = 0; i < nentries; i++) { + entry = &(aclp->acl_entry[i]); + ace = &(aces[i]); + + if (ace->a_flags & ACE_OWNER) + entry->ae_tag = ACL_USER_OBJ; + else if (ace->a_flags & ACE_GROUP) + entry->ae_tag = ACL_GROUP_OBJ; + else if (ace->a_flags & ACE_EVERYONE) + entry->ae_tag = ACL_EVERYONE; + else if (ace->a_flags & ACE_IDENTIFIER_GROUP) + entry->ae_tag = ACL_GROUP; + else + entry->ae_tag = ACL_USER; + + if (entry->ae_tag == ACL_USER || entry->ae_tag == ACL_GROUP) + entry->ae_id = ace->a_who; + else + entry->ae_id = ACL_UNDEFINED_ID; + + entry->ae_perm = _bsd_from_zfs(ace->a_access_mask, perms); + entry->ae_flags = _bsd_from_zfs(ace->a_flags, flags); + + switch (ace->a_type) { + case ACE_ACCESS_ALLOWED_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_ALLOW; + break; + case ACE_ACCESS_DENIED_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_DENY; + break; + case ACE_SYSTEM_AUDIT_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_AUDIT; + break; + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry->ae_entry_type = ACL_ENTRY_TYPE_ALARM; + break; + default: + panic("acl_from_aces: a_type is 0x%x", ace->a_type); + } + } + + return (0); +} + +void +aces_from_acl(ace_t *aces, int *nentries, const struct acl *aclp) +{ + int i; + const struct acl_entry *entry; + ace_t *ace; + + bzero(aces, sizeof (*aces) * aclp->acl_cnt); + + *nentries = aclp->acl_cnt; + + for (i = 0; i < aclp->acl_cnt; i++) { + entry = &(aclp->acl_entry[i]); + ace = &(aces[i]); + + ace->a_who = entry->ae_id; + + if (entry->ae_tag == ACL_USER_OBJ) + ace->a_flags = ACE_OWNER; + else if (entry->ae_tag == ACL_GROUP_OBJ) + ace->a_flags = (ACE_GROUP | ACE_IDENTIFIER_GROUP); + else if (entry->ae_tag == ACL_GROUP) + ace->a_flags = ACE_IDENTIFIER_GROUP; + else if (entry->ae_tag == ACL_EVERYONE) + ace->a_flags = ACE_EVERYONE; + else /* ACL_USER */ + ace->a_flags = 0; + + ace->a_access_mask = _zfs_from_bsd(entry->ae_perm, perms); + ace->a_flags |= _zfs_from_bsd(entry->ae_flags, flags); + + switch (entry->ae_entry_type) { + case ACL_ENTRY_TYPE_ALLOW: + ace->a_type = ACE_ACCESS_ALLOWED_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_DENY: + ace->a_type = ACE_ACCESS_DENIED_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_ALARM: + ace->a_type = ACE_SYSTEM_ALARM_ACE_TYPE; + break; + case ACL_ENTRY_TYPE_AUDIT: + ace->a_type = ACE_SYSTEM_AUDIT_ACE_TYPE; + break; + default: + panic("aces_from_acl: ae_entry_type is 0x%x", + entry->ae_entry_type); + } + } +} diff --git a/module/os/freebsd/spl/spl_atomic.c b/module/os/freebsd/spl/spl_atomic.c new file mode 100644 index 000000000000..80040fc6a3e3 --- /dev/null +++ b/module/os/freebsd/spl/spl_atomic.c @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
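[acl_from_aces() and its inverse above are table-driven through the perms[] and flags[] arrays, so an entry whose bits survive the mask translation round-trips exactly. A hypothetical kernel-side check, with values invented purely for illustration:]

static int
acl_roundtrip_example(void)
{
	ace_t ace = {
		.a_who = 1001,
		.a_access_mask = ACE_READ_DATA | ACE_EXECUTE,
		.a_flags = 0,	/* no OWNER/GROUP/EVERYONE bits: plain user */
		.a_type = ACE_ACCESS_ALLOWED_ACE_TYPE,
	};
	struct acl bsd;
	ace_t back;
	int n;

	if (acl_from_aces(&bsd, &ace, 1) != 0)
		return (EINVAL);
	aces_from_acl(&back, &n, &bsd);
	/* n == 1 and the mask translated back unchanged. */
	return (back.a_access_mask == ace.a_access_mask ? 0 : EINVAL);
}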
+ * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +#if !defined(__LP64__) && !defined(__mips_n32) && \ + !defined(ARM_HAVE_ATOMIC64) && !defined(I386_HAVE_ATOMIC64) && \ + !defined(HAS_EMULATED_ATOMIC64) + +#ifdef _KERNEL +#include + +struct mtx atomic_mtx; +MTX_SYSINIT(atomic, &atomic_mtx, "atomic", MTX_DEF); +#else +#include + +#define mtx_lock(lock) pthread_mutex_lock(lock) +#define mtx_unlock(lock) pthread_mutex_unlock(lock) + +static pthread_mutex_t atomic_mtx; + +static __attribute__((constructor)) void +atomic_init(void) +{ + pthread_mutex_init(&atomic_mtx, NULL); +} +#endif + +void +atomic_add_64(volatile uint64_t *target, int64_t delta) +{ + + mtx_lock(&atomic_mtx); + *target += delta; + mtx_unlock(&atomic_mtx); +} + +void +atomic_dec_64(volatile uint64_t *target) +{ + + mtx_lock(&atomic_mtx); + *target -= 1; + mtx_unlock(&atomic_mtx); +} + +uint64_t +atomic_swap_64(volatile uint64_t *a, uint64_t value) +{ + uint64_t ret; + + mtx_lock(&atomic_mtx); + ret = *a; + *a = value; + mtx_unlock(&atomic_mtx); + return (ret); +} + +uint64_t +atomic_load_64(volatile uint64_t *a) +{ + uint64_t ret; + + mtx_lock(&atomic_mtx); + ret = *a; + mtx_unlock(&atomic_mtx); + return (ret); +} + +uint64_t +atomic_add_64_nv(volatile uint64_t *target, int64_t delta) +{ + uint64_t newval; + + mtx_lock(&atomic_mtx); + newval = (*target += delta); + mtx_unlock(&atomic_mtx); + return (newval); +} + +uint64_t +atomic_cas_64(volatile uint64_t *target, uint64_t cmp, uint64_t newval) +{ + uint64_t oldval; + + mtx_lock(&atomic_mtx); + oldval = *target; + if (oldval == cmp) + *target = newval; + mtx_unlock(&atomic_mtx); + return (oldval); +} +#endif diff --git a/module/os/freebsd/spl/spl_cmn_err.c b/module/os/freebsd/spl/spl_cmn_err.c new file mode 100644 index 000000000000..22c7338b7399 --- /dev/null +++ b/module/os/freebsd/spl/spl_cmn_err.c @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
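[On 32-bit targets without native 64-bit atomics, the routines above serialize every operation through one global mutex: correct, if slow, and adequate for the rare platforms that need it. Further primitives reduce to these building blocks; a hypothetical increment-and-fetch is shown below (the SPL provides its own set elsewhere):]

static inline uint64_t
atomic_inc_64_nv_example(volatile uint64_t *target)
{
	/* Delegates to the mutex-protected add above. */
	return (atomic_add_64_nv(target, 1));
}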
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 John Birrell . All rights reserved. + * Copyright 2012 Martin Matuska . All rights reserved. + */ + +#include +#include +#include +#include + +void +vcmn_err(int ce, const char *fmt, va_list adx) +{ + char buf[256]; + const char *prefix; + + prefix = NULL; /* silence unwitty compilers */ + switch (ce) { + case CE_CONT: + prefix = "Solaris(cont): "; + break; + case CE_NOTE: + prefix = "Solaris: NOTICE: "; + break; + case CE_WARN: + prefix = "Solaris: WARNING: "; + break; + case CE_PANIC: + prefix = "Solaris(panic): "; + break; + case CE_IGNORE: + break; + default: + panic("Solaris: unknown severity level"); + } + if (ce == CE_PANIC) { + vsnprintf(buf, sizeof (buf), fmt, adx); + panic("%s%s", prefix, buf); + } + if (ce != CE_IGNORE) { + printf("%s", prefix); + vprintf(fmt, adx); + printf("\n"); + } +} + +void +cmn_err(int type, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(type, fmt, ap); + va_end(ap); +} diff --git a/module/os/freebsd/spl/spl_dtrace.c b/module/os/freebsd/spl/spl_dtrace.c new file mode 100644 index 000000000000..6b2872bcc066 --- /dev/null +++ b/module/os/freebsd/spl/spl_dtrace.c @@ -0,0 +1,38 @@ +/* + * Copyright 2014 The FreeBSD Project. + * All rights reserved. + * + * This software was developed by Steven Hartland. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include + +/* CSTYLED */ +SDT_PROBE_DEFINE1(sdt, , , set__error, "int"); diff --git a/module/os/freebsd/spl/spl_kmem.c b/module/os/freebsd/spl/spl_kmem.c new file mode 100644 index 000000000000..cfc61dd7fc2a --- /dev/null +++ b/module/os/freebsd/spl/spl_kmem.c @@ -0,0 +1,352 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
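[vcmn_err() above maps illumos severity levels onto FreeBSD's console output and panic(9); CE_PANIC formats into a 256-byte buffer before panicking, and CE_IGNORE is swallowed. Usage sketch:]

cmn_err(CE_WARN, "pool '%s' has too many errors", "tank");
/* console: "Solaris: WARNING: pool 'tank' has too many errors" */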
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#include +#include +#include +#include + +#ifdef KMEM_DEBUG +#include +#include +#endif + +#ifdef _KERNEL +MALLOC_DEFINE(M_SOLARIS, "solaris", "Solaris"); +#else +#define malloc(size, type, flags) malloc(size) +#define free(addr, type) free(addr) +#endif + +#ifdef KMEM_DEBUG +struct kmem_item { + struct stack stack; + LIST_ENTRY(kmem_item) next; +}; +static LIST_HEAD(, kmem_item) kmem_items; +static struct mtx kmem_items_mtx; +MTX_SYSINIT(kmem_items_mtx, &kmem_items_mtx, "kmem_items", MTX_DEF); +#endif /* KMEM_DEBUG */ + +#include + +void * +zfs_kmem_alloc(size_t size, int kmflags) +{ + void *p; +#ifdef KMEM_DEBUG + struct kmem_item *i; + + size += sizeof (struct kmem_item); +#endif + p = malloc(MAX(size, 16), M_SOLARIS, kmflags); +#ifndef _KERNEL + if (kmflags & KM_SLEEP) + assert(p != NULL); +#endif +#ifdef KMEM_DEBUG + if (p != NULL) { + i = p; + p = (uint8_t *)p + sizeof (struct kmem_item); + stack_save(&i->stack); + mtx_lock(&kmem_items_mtx); + LIST_INSERT_HEAD(&kmem_items, i, next); + mtx_unlock(&kmem_items_mtx); + } +#endif + return (p); +} + +void +zfs_kmem_free(void *buf, size_t size __unused) +{ +#ifdef KMEM_DEBUG + if (buf == NULL) { + printf("%s: attempt to free NULL\n", __func__); + return; + } + struct kmem_item *i; + + buf = (uint8_t *)buf - sizeof (struct kmem_item); + mtx_lock(&kmem_items_mtx); + LIST_FOREACH(i, &kmem_items, next) { + if (i == buf) + break; + } + ASSERT(i != NULL); + LIST_REMOVE(i, next); + mtx_unlock(&kmem_items_mtx); + memset(buf, 0xDC, MAX(size, 16)); +#endif + free(buf, M_SOLARIS); +} + +static uint64_t kmem_size_val; + +static void +kmem_size_init(void *unused __unused) +{ + + kmem_size_val = (uint64_t)vm_cnt.v_page_count * PAGE_SIZE; + if (kmem_size_val > vm_kmem_size) + kmem_size_val = vm_kmem_size; +} +SYSINIT(kmem_size_init, SI_SUB_KMEM, SI_ORDER_ANY, kmem_size_init, NULL); + +uint64_t +kmem_size(void) +{ + + return (kmem_size_val); +} + +static int +kmem_std_constructor(void *mem, int size __unused, void *private, int flags) +{ + struct kmem_cache *cache = private; + + return (cache->kc_constructor(mem, cache->kc_private, flags)); +} + +static void +kmem_std_destructor(void *mem, int size __unused, void *private) +{ + struct kmem_cache *cache = private; + + 
cache->kc_destructor(mem, cache->kc_private); +} + +kmem_cache_t * +kmem_cache_create(char *name, size_t bufsize, size_t align, + int (*constructor)(void *, void *, int), void (*destructor)(void *, void *), + void (*reclaim)(void *) __unused, void *private, vmem_t *vmp, int cflags) +{ + kmem_cache_t *cache; + + ASSERT(vmp == NULL); + + cache = kmem_alloc(sizeof (*cache), KM_SLEEP); + strlcpy(cache->kc_name, name, sizeof (cache->kc_name)); + cache->kc_constructor = constructor; + cache->kc_destructor = destructor; + cache->kc_private = private; +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + cache->kc_zone = uma_zcreate(cache->kc_name, bufsize, + constructor != NULL ? kmem_std_constructor : NULL, + destructor != NULL ? kmem_std_destructor : NULL, + NULL, NULL, align > 0 ? align - 1 : 0, cflags); +#else + cache->kc_size = bufsize; +#endif + + return (cache); +} + +void +kmem_cache_destroy(kmem_cache_t *cache) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + uma_zdestroy(cache->kc_zone); +#endif + kmem_free(cache, sizeof (*cache)); +} + +void * +kmem_cache_alloc(kmem_cache_t *cache, int flags) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + return (uma_zalloc_arg(cache->kc_zone, cache, flags)); +#else + void *p; + + p = kmem_alloc(cache->kc_size, flags); + if (p != NULL && cache->kc_constructor != NULL) + kmem_std_constructor(p, cache->kc_size, cache, flags); + return (p); +#endif +} + +void +kmem_cache_free(kmem_cache_t *cache, void *buf) +{ +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + uma_zfree_arg(cache->kc_zone, buf, cache); +#else + if (cache->kc_destructor != NULL) + kmem_std_destructor(buf, cache->kc_size, cache); + kmem_free(buf, cache->kc_size); +#endif +} + +/* + * Allow our caller to determine if there are running reaps. + * + * This call is very conservative and may return B_TRUE even when + * reaping activity isn't active. If it returns B_FALSE, then reaping + * activity is definitely inactive. + */ +boolean_t +kmem_cache_reap_active(void) +{ + + return (B_FALSE); +} + +/* + * Reap (almost) everything soon. + * + * Note: this does not wait for the reap-tasks to complete. Caller + * should use kmem_cache_reap_active() (above) and/or moderation to + * avoid scheduling too many reap-tasks. + */ +#ifdef _KERNEL +void +kmem_cache_reap_soon(kmem_cache_t *cache) +{ +#ifndef KMEM_DEBUG +#if __FreeBSD_version >= 1300043 + uma_zone_reclaim(cache->kc_zone, UMA_RECLAIM_DRAIN); +#else + zone_drain(cache->kc_zone); +#endif +#endif +} + +void +kmem_reap(void) +{ +#if __FreeBSD_version >= 1300043 + uma_reclaim(UMA_RECLAIM_TRIM); +#else + uma_reclaim(); +#endif +} +#else +void +kmem_cache_reap_soon(kmem_cache_t *cache __unused) +{ +} + +void +kmem_reap(void) +{ +} +#endif + +int +kmem_debugging(void) +{ + return (0); +} + +void * +calloc(size_t n, size_t s) +{ + return (kmem_zalloc(n * s, KM_NOSLEEP)); +} + +char * +kmem_vasprintf(const char *fmt, va_list adx) +{ + char *msg; + va_list adx2; + + va_copy(adx2, adx); + msg = kmem_alloc(vsnprintf(NULL, 0, fmt, adx) + 1, KM_SLEEP); + (void) vsprintf(msg, fmt, adx2); + va_end(adx2); + + return (msg); +} + +#include +#include +#ifdef KMEM_DEBUG +#error "KMEM_DEBUG not currently supported" +#endif + +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (uma_zone_get_cur(cache->kc_zone)); +} + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->kc_zone->uz_size); +} + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. 
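[In kernel builds without KMEM_DEBUG, a kmem cache above is a thin wrapper over a UMA zone, with kmem_std_constructor()/kmem_std_destructor() adapting the illumos ctor/dtor signatures to UMA's. A usage sketch with a hypothetical object type:]

typedef struct { uint64_t a, b; } example_obj_t;

static void
example_cache_use(void)
{
	kmem_cache_t *zc;
	example_obj_t *o;

	/* No ctor/dtor/reclaim, default alignment, no flags. */
	zc = kmem_cache_create("example_obj", sizeof (example_obj_t), 0,
	    NULL, NULL, NULL, NULL, NULL, 0);
	o = kmem_cache_alloc(zc, KM_SLEEP);
	kmem_cache_free(zc, o);
	kmem_cache_destroy(zc);
}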
+ */ +void +spl_kmem_cache_set_move(kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} + +#ifdef KMEM_DEBUG +void kmem_show(void *); +void +kmem_show(void *dummy __unused) +{ + struct kmem_item *i; + + mtx_lock(&kmem_items_mtx); + if (LIST_EMPTY(&kmem_items)) + printf("KMEM_DEBUG: No leaked elements.\n"); + else { + printf("KMEM_DEBUG: Leaked elements:\n\n"); + LIST_FOREACH(i, &kmem_items, next) { + printf("address=%p\n", i); + stack_print_ddb(&i->stack); + printf("\n"); + } + } + mtx_unlock(&kmem_items_mtx); +} + +SYSUNINIT(sol_kmem, SI_SUB_CPU, SI_ORDER_FIRST, kmem_show, NULL); +#endif /* KMEM_DEBUG */ diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c new file mode 100644 index 000000000000..756667045b17 --- /dev/null +++ b/module/os/freebsd/spl/spl_kstat.c @@ -0,0 +1,351 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include + +static MALLOC_DEFINE(M_KSTAT, "kstat_data", "Kernel statistics"); + +SYSCTL_ROOT_NODE(OID_AUTO, kstat, CTLFLAG_RW, 0, "Kernel statistics"); + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t *ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +kstat_t * +__kstat_create(const char *module, int instance, const char *name, + const char *class, uchar_t ks_type, uint_t ks_ndata, uchar_t flags) +{ + struct sysctl_oid *root; + kstat_t *ksp; + + KASSERT(instance == 0, ("instance=%d", instance)); + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT(ks_ndata == 1); + + /* + * Allocate the main structure. We don't need to copy module/class/name + * stuff in here, because it is only used for sysctl node creation + * done in this function. 
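[__kstat_create() in this file materializes each kstat as a sysctl subtree, kstat.<module>.<class>.<name>, and kstat_install() later adds one read-only leaf per named entry. A sketch, with kstat_named_t field names as assumed from the SPL's kstat.h and all other names hypothetical:]

static kstat_named_t example_stats[1];

static void
example_kstat_init(void)
{
	kstat_t *ksp;

	(void) strlcpy(example_stats[0].name, "hits",
	    sizeof (example_stats[0].name));
	example_stats[0].data_type = KSTAT_DATA_UINT64;

	ksp = __kstat_create("zfs", 0, "example", "misc",
	    KSTAT_TYPE_NAMED, 1, KSTAT_FLAG_VIRTUAL);
	if (ksp != NULL) {
		ksp->ks_data = example_stats;	/* virtual: caller's data */
		kstat_install(ksp);
		/* -> sysctl kstat.zfs.misc.example.hits */
	}
}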
+ */ + ksp = malloc(sizeof (*ksp), M_KSTAT, M_WAITOK|M_ZERO); + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_instance = instance; + strncpy(ksp->ks_name, name, KSTAT_STRLEN); + strncpy(ksp->ks_class, class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = flags; + ksp->ks_update = kstat_default_update; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + panic("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + ksp->ks_data = NULL; + } else { + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + if (ksp->ks_data == NULL) { + kmem_free(ksp, sizeof (*ksp)); + ksp = NULL; + } + } + /* + * Create sysctl tree for those statistics: + * + * kstat.... + */ + sysctl_ctx_init(&ksp->ks_sysctl_ctx); + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, + SYSCTL_STATIC_CHILDREN(_kstat), OID_AUTO, module, CTLFLAG_RW, 0, + ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s tree!\n", __func__, module); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), + OID_AUTO, class, CTLFLAG_RW, 0, ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s.%s tree!\n", __func__, + module, class); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + root = SYSCTL_ADD_NODE(&ksp->ks_sysctl_ctx, SYSCTL_CHILDREN(root), + OID_AUTO, name, CTLFLAG_RW, 0, ""); + if (root == NULL) { + printf("%s: Cannot create kstat.%s.%s.%s tree!\n", __func__, + module, class, name); + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); + return (NULL); + } + ksp->ks_sysctl_root = root; + + return (ksp); +} + +static int +kstat_sysctl(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent = ksp->ks_data; + uint64_t val; + + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = ksent->value.ui64; + + return (sysctl_handle_64(oidp, &val, 0, req)); +} + +static int +kstat_sysctl_string(SYSCTL_HANDLER_ARGS) +{ + kstat_t *ksp = arg1; + kstat_named_t *ksent = ksp->ks_data; + char *val; + uint32_t len = 0; + + /* Select the correct element */ + ksent += arg2; + /* Update the aggsums before reading */ + (void) ksp->ks_update(ksp, KSTAT_READ); + val = KSTAT_NAMED_STR_PTR(ksent); + len = KSTAT_NAMED_STR_BUFLEN(ksent); + val[len-1] = '\0'; + + return (sysctl_handle_string(oidp, val, len, req)); +} + +void +kstat_install(kstat_t *ksp) +{ + kstat_named_t *ksent; + char *namelast; + int typelast; + + ksent = ksp->ks_data; + if (ksp->ks_ndata == UINT32_MAX) { +#ifdef INVARIANTS + printf("can't handle raw ops yet!!!\n"); +#endif + return; + } + if (ksent == NULL) { + printf("%s ksp->ks_data == NULL!!!!\n", __func__); + return; + } + typelast = 0; + namelast = NULL; + for (int i = 0; i < ksp->ks_ndata; i++, ksent++) { + if (ksent->data_type != 0) { + typelast = ksent->data_type; + 
namelast = ksent->name; + } + switch (typelast) { + case KSTAT_DATA_CHAR: + /* Not Implemented */ + break; + case KSTAT_DATA_INT32: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_S32 | CTLFLAG_RD, ksp, i, + kstat_sysctl, "I", namelast); + break; + case KSTAT_DATA_UINT32: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_U32 | CTLFLAG_RD, ksp, i, + kstat_sysctl, "IU", namelast); + break; + case KSTAT_DATA_INT64: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_S64 | CTLFLAG_RD, ksp, i, + kstat_sysctl, "Q", namelast); + break; + case KSTAT_DATA_UINT64: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_U64 | CTLFLAG_RD, ksp, i, + kstat_sysctl, "QU", namelast); + break; + case KSTAT_DATA_LONG: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_LONG | CTLFLAG_RD, ksp, i, + kstat_sysctl, "L", namelast); + break; + case KSTAT_DATA_ULONG: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_ULONG | CTLFLAG_RD, ksp, i, + kstat_sysctl, "LU", namelast); + break; + case KSTAT_DATA_STRING: + SYSCTL_ADD_PROC(&ksp->ks_sysctl_ctx, + SYSCTL_CHILDREN(ksp->ks_sysctl_root), + OID_AUTO, namelast, + CTLTYPE_STRING | CTLFLAG_RD, ksp, i, + kstat_sysctl_string, "A", namelast); + break; + default: + panic("unsupported type: %d", typelast); + } + + } +} + +void +kstat_delete(kstat_t *ksp) +{ + + sysctl_ctx_free(&ksp->ks_sysctl_ctx); + free(ksp, M_KSTAT); +} + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} diff --git a/module/os/freebsd/spl/spl_misc.c b/module/os/freebsd/spl/spl_misc.c new file mode 100644 index 000000000000..0354b986cd5f --- /dev/null +++ b/module/os/freebsd/spl/spl_misc.c @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static struct opensolaris_utsname hw_utsname = { + .machine = MACHINE +}; + +#ifndef KERNEL_STATIC +char hw_serial[11] = "0"; + +utsname_t * +utsname(void) +{ + return (&hw_utsname); +} +#endif + +static void +opensolaris_utsname_init(void *arg) +{ + + hw_utsname.sysname = ostype; + hw_utsname.nodename = prison0.pr_hostname; + hw_utsname.release = osrelease; + snprintf(hw_utsname.version, sizeof (hw_utsname.version), + "%d", osreldate); +} + +char * +kmem_strdup(const char *s) +{ + char *buf; + + buf = kmem_alloc(strlen(s) + 1, KM_SLEEP); + strcpy(buf, s); + return (buf); +} + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, to, len)); +} + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vpanic(fmt, ap); + va_end(ap); +} + + +SYSINIT(opensolaris_utsname_init, SI_SUB_TUNABLES, SI_ORDER_ANY, + opensolaris_utsname_init, NULL); diff --git a/module/os/freebsd/spl/spl_policy.c b/module/os/freebsd/spl/spl_policy.c new file mode 100644 index 000000000000..5cd5c69efa71 --- /dev/null +++ b/module/os/freebsd/spl/spl_policy.c @@ -0,0 +1,437 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +int +secpolicy_nfs(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_NFS_DAEMON)); +} + +int +secpolicy_zfs(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_zfs_proc(cred_t *cr, proc_t *proc) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_sys_config(cred_t *cr, int checkonly __unused) +{ + + return (spl_priv_check_cred(cr, PRIV_ZFS_POOL_CONFIG)); +} + +int +secpolicy_zinject(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_ZFS_INJECT)); +} + +int +secpolicy_fs_unmount(cred_t *cr, struct mount *vfsp __unused) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_UNMOUNT)); +} + +int +secpolicy_fs_owner(struct mount *mp, cred_t *cr) +{ + + if (zfs_super_owner) { + if (cr->cr_uid == mp->mnt_cred->cr_uid && + cr->cr_prison == mp->mnt_cred->cr_prison) { + return (0); + } + } + return (EPERM); +} + +/* + * This check is done in kern_link(), so we could just return 0 here. + */ +extern int hardlink_check_uid; +int +secpolicy_basic_link(vnode_t *vp, cred_t *cr) +{ + + if (!hardlink_check_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_LINK)); +} + +int +secpolicy_vnode_stky_modify(cred_t *cr) +{ + + return (EPERM); +} + +int +secpolicy_vnode_remove(vnode_t *vp, cred_t *cr) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); +} + +int +secpolicy_vnode_access(cred_t *cr, vnode_t *vp, uid_t owner, accmode_t accmode) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + if ((accmode & VREAD) && spl_priv_check_cred(cr, PRIV_VFS_READ) != 0) + return (EACCES); + if ((accmode & VWRITE) && + spl_priv_check_cred(cr, PRIV_VFS_WRITE) != 0) { + return (EACCES); + } + if (accmode & VEXEC) { + if (vp->v_type == VDIR) { + if (spl_priv_check_cred(cr, PRIV_VFS_LOOKUP) != 0) + return (EACCES); + } else { + if (spl_priv_check_cred(cr, PRIV_VFS_EXEC) != 0) + return (EACCES); + } + } + return (0); +} + +/* + * Like secpolicy_vnode_access() but we get the actual wanted mode and the + * current mode of the file, not the missing bits. 
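+ *
+ * For example, if curmode already grants VREAD and the caller wants
+ * VREAD|VWRITE, then mode = ~curmode & wantmode reduces to just VWRITE,
+ * so only the still-missing write bit is checked below.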
+ */ +int +secpolicy_vnode_access2(cred_t *cr, vnode_t *vp, uid_t owner, + accmode_t curmode, accmode_t wantmode) +{ + accmode_t mode; + + mode = ~curmode & wantmode; + + if (mode == 0) + return (0); + + return (secpolicy_vnode_access(cr, vp, owner, mode)); +} + +int +secpolicy_vnode_any_access(cred_t *cr, vnode_t *vp, uid_t owner) +{ + static int privs[] = { + PRIV_VFS_ADMIN, + PRIV_VFS_READ, + PRIV_VFS_WRITE, + PRIV_VFS_EXEC, + PRIV_VFS_LOOKUP + }; + int i; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* Same as secpolicy_vnode_setdac */ + if (owner == cr->cr_uid) + return (0); + + for (i = 0; i < sizeof (privs)/sizeof (int); i++) { + int priv; + + switch (priv = privs[i]) { + case PRIV_VFS_EXEC: + if (vp->v_type == VDIR) + continue; + break; + case PRIV_VFS_LOOKUP: + if (vp->v_type != VDIR) + continue; + break; + } + if (spl_priv_check_cred(cr, priv) == 0) + return (0); + } + return (EPERM); +} + +int +secpolicy_vnode_setdac(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (owner == cr->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_ADMIN)); +} + +int +secpolicy_vnode_setattr(cred_t *cr, vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + int mask = vap->va_mask; + int error; + + if (mask & AT_SIZE) { + if (vp->v_type == VDIR) + return (EISDIR); + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + if (mask & AT_MODE) { + /* + * If not the owner of the file then check privilege + * for two things: the privilege to set the mode at all + * and, if we're setting setuid, we also need permissions + * to add the set-uid bit, if we're not the owner. + * In the specific case of creating a set-uid root + * file, we need even more permissions. + */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + error = secpolicy_setid_setsticky_clear(vp, vap, ovap, cr); + if (error) + return (error); + } else { + vap->va_mode = ovap->va_mode; + } + if (mask & (AT_UID | AT_GID)) { + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error) + return (error); + + /* + * To change the owner of a file, or change the group of + * a file to a group of which we are not a member, the + * caller must have privilege. + */ + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid && + !groupmember(vap->va_gid, cr))) { + if (secpolicy_fs_owner(vp->v_mount, cr) != 0) { + error = spl_priv_check_cred(cr, PRIV_VFS_CHOWN); + if (error) + return (error); + } + } + + if (((mask & AT_UID) && vap->va_uid != ovap->va_uid) || + ((mask & AT_GID) && vap->va_gid != ovap->va_gid)) { + secpolicy_setid_clear(vap, vp, cr); + } + } + if (mask & (AT_ATIME | AT_MTIME)) { + /* + * From utimes(2): + * If times is NULL, ... The caller must be the owner of + * the file, have permission to write the file, or be the + * super-user. + * If times is non-NULL, ... The caller must be the owner of + * the file or be the super-user. 
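+		 *
+		 * Concretely (hypothetical commands): "touch file" by a
+		 * non-owner with write access succeeds through the
+		 * VA_UTIMES_NULL fallback below, while
+		 * "touch -t 202001010000 file" requires ownership or
+		 * privilege, because an explicit time was supplied.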
+ */ + error = secpolicy_vnode_setdac(vp, cr, ovap->va_uid); + if (error && (vap->va_vaflags & VA_UTIMES_NULL)) + error = unlocked_access(node, VWRITE, cr); + if (error) + return (error); + } + return (0); +} + +int +secpolicy_vnode_create_gid(cred_t *cr) +{ + + return (EPERM); +} + +int +secpolicy_vnode_setids_setgids(vnode_t *vp, cred_t *cr, gid_t gid) +{ + + if (groupmember(gid, cr)) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_SETGID)); +} + +int +secpolicy_vnode_setid_retain(vnode_t *vp, cred_t *cr, + boolean_t issuidroot __unused) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)); +} + +void +secpolicy_setid_clear(struct vattr *vap, vnode_t *vp, cred_t *cr) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return; + + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0) { + if (spl_priv_check_cred(cr, PRIV_VFS_RETAINSUGID)) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } + } +} + +int +secpolicy_setid_setsticky_clear(vnode_t *vp, struct vattr *vap, + const struct vattr *ovap, cred_t *cr) +{ + int error; + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* + * Privileged processes may set the sticky bit on non-directories, + * as well as set the setgid bit on a file with a group that the process + * is not a member of. Both of these are allowed in jail(8). + */ + if (vp->v_type != VDIR && (vap->va_mode & S_ISTXT)) { + if (spl_priv_check_cred(cr, PRIV_VFS_STICKYFILE)) + return (EFTYPE); + } + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0) { + error = secpolicy_vnode_setids_setgids(vp, cr, ovap->va_gid); + if (error) + return (error); + } + /* + * Deny setting setuid if we are not the file owner. + */ + if ((vap->va_mode & S_ISUID) && ovap->va_uid != cr->cr_uid) { + error = spl_priv_check_cred(cr, PRIV_VFS_ADMIN); + if (error) + return (error); + } + return (0); +} + +int +secpolicy_fs_mount(cred_t *cr, vnode_t *mvp, struct mount *vfsp) +{ + + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT)); +} + +int +secpolicy_vnode_owner(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (owner == cr->cr_uid) + return (0); + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + + /* XXX: vfs_suser()? 
*/ + return (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_OWNER)); +} + +int +secpolicy_vnode_chown(vnode_t *vp, cred_t *cr, uid_t owner) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_CHOWN)); +} + +void +secpolicy_fs_mount_clearopts(cred_t *cr, struct mount *vfsp) +{ + + if (spl_priv_check_cred(cr, PRIV_VFS_MOUNT_NONUSER) != 0) { + MNT_ILOCK(vfsp); + vfsp->vfs_flag |= VFS_NOSETUID | MNT_USER; + vfs_clearmntopt(vfsp, MNTOPT_SETUID); + vfs_setmntopt(vfsp, MNTOPT_NOSETUID, NULL, 0); + MNT_IUNLOCK(vfsp); + } +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(vnode_t *vp, xvattr_t *xvap, uid_t owner, cred_t *cr, + vtype_t vtype) +{ + + if (secpolicy_fs_owner(vp->v_mount, cr) == 0) + return (0); + return (spl_priv_check_cred(cr, PRIV_VFS_SYSFLAGS)); +} + +int +secpolicy_smb(cred_t *cr) +{ + + return (spl_priv_check_cred(cr, PRIV_NETSMB)); +} diff --git a/module/os/freebsd/spl/spl_procfs_list.c b/module/os/freebsd/spl/spl_procfs_list.c new file mode 100644 index 000000000000..7b4ae9d0e357 --- /dev/null +++ b/module/os/freebsd/spl/spl_procfs_list.c @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include + +void +seq_printf(struct seq_file *m, const char *fmt, ...) 
+{} + +void +procfs_list_install(const char *module, + const char *name, + mode_t mode, + procfs_list_t *procfs_list, + int (*show)(struct seq_file *f, void *p), + int (*show_header)(struct seq_file *f), + int (*clear)(procfs_list_t *procfs_list), + size_t procfs_list_node_off) +{ + mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&procfs_list->pl_list, + procfs_list_node_off + sizeof (procfs_list_node_t), + procfs_list_node_off + offsetof(procfs_list_node_t, pln_link)); + procfs_list->pl_next_id = 1; + procfs_list->pl_node_offset = procfs_list_node_off; +} + +void +procfs_list_uninstall(procfs_list_t *procfs_list) +{} + +void +procfs_list_destroy(procfs_list_t *procfs_list) +{ + ASSERT(list_is_empty(&procfs_list->pl_list)); + list_destroy(&procfs_list->pl_list); + mutex_destroy(&procfs_list->pl_lock); +} + +#define NODE_ID(procfs_list, obj) \ + (((procfs_list_node_t *)(((char *)obj) + \ + (procfs_list)->pl_node_offset))->pln_id) + +void +procfs_list_add(procfs_list_t *procfs_list, void *p) +{ + ASSERT(MUTEX_HELD(&procfs_list->pl_lock)); + NODE_ID(procfs_list, p) = procfs_list->pl_next_id++; + list_insert_tail(&procfs_list->pl_list, p); +} diff --git a/module/os/freebsd/spl/spl_string.c b/module/os/freebsd/spl/spl_string.c new file mode 100644 index 000000000000..d13b64b4cd26 --- /dev/null +++ b/module/os/freebsd/spl/spl_string.c @@ -0,0 +1,107 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + * + * $FreeBSD$ + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include + +#define IS_DIGIT(c) ((c) >= '0' && (c) <= '9') + +#define IS_ALPHA(c) \ + (((c) >= 'a' && (c) <= 'z') || ((c) >= 'A' && (c) <= 'Z')) + +char * +strpbrk(const char *s, const char *b) +{ + const char *p; + + do { + for (p = b; *p != '\0' && *p != *s; ++p) + ; + if (*p != '\0') + return ((char *)s); + } while (*s++); + + return (NULL); +} + +/* + * Convert a string into a valid C identifier by replacing invalid + * characters with '_'. Also makes sure the string is nul-terminated + * and takes up at most n bytes. + */ +void +strident_canon(char *s, size_t n) +{ + char c; + char *end = s + n - 1; + + if ((c = *s) == 0) + return; + + if (!IS_ALPHA(c) && c != '_') + *s = '_'; + + while (s < end && ((c = *(++s)) != 0)) { + if (!IS_ALPHA(c) && !IS_DIGIT(c) && c != '_') + *s = '_'; + } + *s = 0; +} + +/* + * Do not change the length of the returned string; it must be freed + * with strfree(). + */ +char * +kmem_asprintf(const char *fmt, ...) 
+{ + int size; + va_list adx; + char *buf; + + va_start(adx, fmt); + size = vsnprintf(NULL, 0, fmt, adx) + 1; + va_end(adx); + + buf = kmem_alloc(size, KM_SLEEP); + + va_start(adx, fmt); + (void) vsnprintf(buf, size, fmt, adx); + va_end(adx); + + return (buf); +} + +void +kmem_strfree(char *str) +{ + ASSERT(str != NULL); + kmem_free(str, strlen(str) + 1); +} diff --git a/module/os/freebsd/spl/spl_sunddi.c b/module/os/freebsd/spl/spl_sunddi.c new file mode 100644 index 000000000000..ebec77bdb37f --- /dev/null +++ b/module/os/freebsd/spl/spl_sunddi.c @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +ddi_strtol(const char *str, char **nptr, int base, long *result) +{ + + *result = strtol(str, nptr, base); + return (0); +} + +int +ddi_strtoul(const char *str, char **nptr, int base, unsigned long *result) +{ + + if (str == hw_serial) { + *result = prison0.pr_hostid; + return (0); + } + + *result = strtoul(str, nptr, base); + return (0); +} + +int +ddi_strtoull(const char *str, char **nptr, int base, unsigned long long *result) +{ + + *result = (unsigned long long)strtouq(str, nptr, base); + return (0); +} + +int +ddi_strtoll(const char *str, char **nptr, int base, long long *result) +{ + + *result = (long long)strtoq(str, nptr, base); + return (0); +} diff --git a/module/os/freebsd/spl/spl_sysevent.c b/module/os/freebsd/spl/spl_sysevent.c new file mode 100644 index 000000000000..8c0e495681e9 --- /dev/null +++ b/module/os/freebsd/spl/spl_sysevent.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 2010 Pawel Jakub Dawidek + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static int +log_sysevent(nvlist_t *event) +{ + struct sbuf *sb; + const char *type; + char typestr[128]; + nvpair_t *elem = NULL; + + sb = sbuf_new_auto(); + if (sb == NULL) + return (ENOMEM); + type = NULL; + + while ((elem = nvlist_next_nvpair(event, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN: + { + boolean_t value; + + (void) nvpair_value_boolean_value(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), + value ? "true" : "false"); + break; + } + case DATA_TYPE_UINT8: + { + uint8_t value; + + (void) nvpair_value_uint8(elem, &value); + sbuf_printf(sb, " %s=%hhu", nvpair_name(elem), value); + break; + } + case DATA_TYPE_INT32: + { + int32_t value; + + (void) nvpair_value_int32(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT32: + { + uint32_t value; + + (void) nvpair_value_uint32(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_INT64: + { + int64_t value; + + (void) nvpair_value_int64(elem, &value); + sbuf_printf(sb, " %s=%jd", nvpair_name(elem), + (intmax_t)value); + break; + } + case DATA_TYPE_UINT64: + { + uint64_t value; + + (void) nvpair_value_uint64(elem, &value); + sbuf_printf(sb, " %s=%ju", nvpair_name(elem), + (uintmax_t)value); + break; + } + case DATA_TYPE_STRING: + { + char *value; + + (void) nvpair_value_string(elem, &value); + sbuf_printf(sb, " %s=%s", nvpair_name(elem), value); + if (strcmp(FM_CLASS, nvpair_name(elem)) == 0) + type = value; + break; + } + case DATA_TYPE_UINT8_ARRAY: + { + uint8_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint8_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%02hhx", value[ii]); + break; + } + case DATA_TYPE_UINT16_ARRAY: + { + uint16_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint16_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%04hx", value[ii]); + break; + } + case DATA_TYPE_UINT32_ARRAY: + { + uint32_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint32_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%08jx", (uintmax_t)value[ii]); + 
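+			/*
+			 * e.g. a hypothetical two-element uint32 array
+			 * {1, 2} named "sizes" is rendered as
+			 * " sizes=0000000100000002": fixed-width hex with
+			 * no separator between elements.
+			 */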
break; + } + case DATA_TYPE_INT64_ARRAY: + { + int64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_int64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016lld", + (long long)value[ii]); + break; + } + case DATA_TYPE_UINT64_ARRAY: + { + uint64_t *value; + uint_t ii, nelem; + + (void) nvpair_value_uint64_array(elem, &value, &nelem); + sbuf_printf(sb, " %s=", nvpair_name(elem)); + for (ii = 0; ii < nelem; ii++) + sbuf_printf(sb, "%016jx", (uintmax_t)value[ii]); + break; + } + case DATA_TYPE_STRING_ARRAY: + { + char **strarr; + uint_t ii, nelem; + + (void) nvpair_value_string_array(elem, &strarr, &nelem); + + for (ii = 0; ii < nelem; ii++) { + if (strarr[ii] == NULL) { + sbuf_printf(sb, " "); + continue; + } + + sbuf_printf(sb, " %s", strarr[ii]); + if (strcmp(FM_CLASS, strarr[ii]) == 0) + type = strarr[ii]; + } + break; + } + case DATA_TYPE_NVLIST: + /* XXX - requires recursing in log_sysevent */ + break; + default: + printf("%s: type %d is not implemented\n", __func__, + nvpair_type(elem)); + break; + } + } + + if (sbuf_finish(sb) != 0) { + sbuf_delete(sb); + return (ENOMEM); + } + + if (type == NULL) + type = ""; + if (strncmp(type, "ESC_ZFS_", 8) == 0) { + snprintf(typestr, sizeof (typestr), "misc.fs.zfs.%s", type + 8); + type = typestr; + } + devctl_notify("ZFS", "ZFS", type, sbuf_data(sb)); + sbuf_delete(sb); + + return (0); +} + +static void +sysevent_worker(void *arg __unused) +{ + zfs_zevent_t *ze; + nvlist_t *event; + uint64_t dropped = 0; + uint64_t dst_size; + int error; + + zfs_zevent_init(&ze); + for (;;) { + dst_size = 131072; + dropped = 0; + event = NULL; + error = zfs_zevent_next(ze, &event, + &dst_size, &dropped); + if (error) { + error = zfs_zevent_wait(ze); + if (error == ESHUTDOWN) + break; + } else { + VERIFY(event != NULL); + log_sysevent(event); + nvlist_free(event); + } + } + zfs_zevent_destroy(ze); + kthread_exit(); +} + +void +ddi_sysevent_init(void) +{ + kproc_kthread_add(sysevent_worker, NULL, &system_proc, NULL, 0, 0, + "zfskern", "sysevent"); +} diff --git a/module/os/freebsd/spl/spl_taskq.c b/module/os/freebsd/spl/spl_taskq.c new file mode 100644 index 000000000000..049e889cf304 --- /dev/null +++ b/module/os/freebsd/spl/spl_taskq.c @@ -0,0 +1,409 @@ +/* + * Copyright (c) 2009 Pawel Jakub Dawidek + * All rights reserved. + * + * Copyright (c) 2012 Spectra Logic Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __FreeBSD_version < 1201522 +#define taskqueue_start_threads_in_proc(tqp, count, pri, proc, name, ...) \ + taskqueue_start_threads(tqp, count, pri, name, __VA_ARGS__) +#endif + +static uint_t taskq_tsd; +static uma_zone_t taskq_zone; + +taskq_t *system_taskq = NULL; +taskq_t *system_delay_taskq = NULL; +taskq_t *dynamic_taskq = NULL; + +proc_t *system_proc; + +extern int uma_align_cache; + +static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures"); + +static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl; +static unsigned long tqenthash; +static unsigned long tqenthashlock; +static struct sx *tqenthashtbl_lock; + +static uint32_t tqidnext = 1; + +#define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash]) +#define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)]) + +#define TIMEOUT_TASK 1 +#define NORMAL_TASK 2 + +static void +system_taskq_init(void *arg) +{ + int i; + + tsd_create(&taskq_tsd, NULL); + tqenthashtbl = hashinit(mp_ncpus * 8, M_TASKQ, &tqenthash); + tqenthashlock = (tqenthash + 1) / 8; + if (tqenthashlock > 0) + tqenthashlock--; + tqenthashtbl_lock = + malloc(sizeof (*tqenthashtbl_lock) * (tqenthashlock + 1), + M_TASKQ, M_WAITOK | M_ZERO); + for (i = 0; i < tqenthashlock + 1; i++) + sx_init_flags(&tqenthashtbl_lock[i], "tqenthash", SX_DUPOK); + tqidnext = 1; + taskq_zone = uma_zcreate("taskq_zone", sizeof (taskq_ent_t), + NULL, NULL, NULL, NULL, + UMA_ALIGN_CACHE, 0); + system_taskq = taskq_create("system_taskq", mp_ncpus, minclsyspri, + 0, 0, 0); + system_delay_taskq = taskq_create("system_delay_taskq", mp_ncpus, + minclsyspri, 0, 0, 0); +} +SYSINIT(system_taskq_init, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_init, + NULL); + +static void +system_taskq_fini(void *arg) +{ + int i; + + taskq_destroy(system_delay_taskq); + taskq_destroy(system_taskq); + uma_zdestroy(taskq_zone); + tsd_destroy(&taskq_tsd); + for (i = 0; i < tqenthashlock + 1; i++) + sx_destroy(&tqenthashtbl_lock[i]); + for (i = 0; i < tqenthash + 1; i++) + VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i])); + free(tqenthashtbl_lock, M_TASKQ); + free(tqenthashtbl, M_TASKQ); +} +SYSUNINIT(system_taskq_fini, SI_SUB_CONFIGURE, SI_ORDER_ANY, system_taskq_fini, + NULL); + +static taskq_ent_t * +taskq_lookup(taskqid_t tqid) +{ + taskq_ent_t *ent = NULL; + + sx_xlock(TQIDHASHLOCK(tqid)); + CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) { + if (ent->tqent_id == tqid) + break; + } + if (ent != NULL) + refcount_acquire(&ent->tqent_rc); + sx_xunlock(TQIDHASHLOCK(tqid)); + return (ent); +} + +static taskqid_t +taskq_insert(taskq_ent_t *ent) +{ + taskqid_t tqid = atomic_fetchadd_int(&tqidnext, 1); + + ent->tqent_id = tqid; + ent->tqent_registered = B_TRUE; + sx_xlock(TQIDHASHLOCK(tqid)); + CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash); + sx_xunlock(TQIDHASHLOCK(tqid)); + return (tqid); +} + +static 
void +taskq_remove(taskq_ent_t *ent) +{ + taskqid_t tqid = ent->tqent_id; + + if (!ent->tqent_registered) + return; + + sx_xlock(TQIDHASHLOCK(tqid)); + CK_LIST_REMOVE(ent, tqent_hash); + sx_xunlock(TQIDHASHLOCK(tqid)); + ent->tqent_registered = B_FALSE; +} + +static void +taskq_tsd_set(void *context) +{ + taskq_t *tq = context; + + tsd_set(taskq_tsd, tq); +} + +static taskq_t * +taskq_create_impl(const char *name, int nthreads, pri_t pri, + proc_t *proc __maybe_unused, uint_t flags) +{ + taskq_t *tq; + + if ((flags & TASKQ_THREADS_CPU_PCT) != 0) + nthreads = MAX((mp_ncpus * nthreads) / 100, 1); + + tq = kmem_alloc(sizeof (*tq), KM_SLEEP); + tq->tq_queue = taskqueue_create(name, M_WAITOK, + taskqueue_thread_enqueue, &tq->tq_queue); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_INIT, + taskq_tsd_set, tq); + taskqueue_set_callback(tq->tq_queue, TASKQUEUE_CALLBACK_TYPE_SHUTDOWN, + taskq_tsd_set, NULL); + (void) taskqueue_start_threads_in_proc(&tq->tq_queue, nthreads, pri, + proc, "%s", name); + + return ((taskq_t *)tq); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, int minalloc __unused, + int maxalloc __unused, uint_t flags) +{ + return (taskq_create_impl(name, nthreads, pri, system_proc, flags)); +} + +taskq_t * +taskq_create_proc(const char *name, int nthreads, pri_t pri, + int minalloc __unused, int maxalloc __unused, proc_t *proc, uint_t flags) +{ + return (taskq_create_impl(name, nthreads, pri, proc, flags)); +} + +void +taskq_destroy(taskq_t *tq) +{ + + taskqueue_free(tq->tq_queue); + kmem_free(tq, sizeof (*tq)); +} + +int +taskq_member(taskq_t *tq, kthread_t *thread) +{ + + return (taskqueue_member(tq->tq_queue, thread)); +} + +taskq_t * +taskq_of_curthread(void) +{ + return (tsd_get(taskq_tsd)); +} + +static void +taskq_free(taskq_ent_t *task) +{ + taskq_remove(task); + if (refcount_release(&task->tqent_rc)) + uma_zfree(taskq_zone, task); +} + +int +taskq_cancel_id(taskq_t *tq, taskqid_t tid) +{ + uint32_t pend; + int rc; + taskq_ent_t *ent; + + if (tid == 0) + return (0); + + if ((ent = taskq_lookup(tid)) == NULL) + return (0); + + ent->tqent_cancelled = B_TRUE; + if (ent->tqent_type == TIMEOUT_TASK) { + rc = taskqueue_cancel_timeout(tq->tq_queue, + &ent->tqent_timeout_task, &pend); + } else + rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend); + if (rc == EBUSY) { + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + } else if (pend) { + /* + * Tasks normally free themselves when run, but here the task + * was cancelled so it did not free itself. + */ + taskq_free(ent); + } + /* Free the extra reference we added with taskq_lookup. 
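+	 * Each entry is created with one reference held on behalf of the
+	 * queue (refcount_init(..., 1) at dispatch time) and taskq_lookup()
+	 * above took a second one, so the entry goes back to taskq_zone
+	 * only after both taskq_free() calls have dropped their reference.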
*/ + taskq_free(ent); + return (rc); +} + +static void +taskq_run(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + if (!task->tqent_cancelled) + task->tqent_func(task->tqent_arg); + taskq_free(task); +} + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskq_ent_t *task; + taskqid_t tid; + clock_t timo; + int mflag; + + timo = expire_time - ddi_get_lbolt(); + if (timo <= 0) + return (taskq_dispatch(tq, func, arg, flags)); + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_type = TIMEOUT_TASK; + task->tqent_cancelled = B_FALSE; + refcount_init(&task->tqent_rc, 1); + tid = taskq_insert(task); + TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0, + taskq_run, task); + + taskqueue_enqueue_timeout(tq->tq_queue, &task->tqent_timeout_task, + timo); + return (tid); +} + +taskqid_t +taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags) +{ + taskq_ent_t *task; + int mflag, prio; + taskqid_t tid; + + if ((flags & (TQ_SLEEP | TQ_NOQUEUE)) == TQ_SLEEP) + mflag = M_WAITOK; + else + mflag = M_NOWAIT; + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. + */ + prio = !!(flags & TQ_FRONT); + + task = uma_zalloc(taskq_zone, mflag); + if (task == NULL) + return (0); + refcount_init(&task->tqent_rc, 1); + task->tqent_func = func; + task->tqent_arg = arg; + task->tqent_cancelled = B_FALSE; + task->tqent_type = NORMAL_TASK; + tid = taskq_insert(task); + TASK_INIT(&task->tqent_task, prio, taskq_run, task); + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); + VERIFY(tid); + return (tid); +} + +static void +taskq_run_ent(void *arg, int pending __unused) +{ + taskq_ent_t *task = arg; + + task->tqent_func(task->tqent_arg); +} + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags, + taskq_ent_t *task) +{ + int prio; + + /* + * If TQ_FRONT is given, we want higher priority for this task, so it + * can go at the front of the queue. + */ + prio = !!(flags & TQ_FRONT); + task->tqent_cancelled = B_FALSE; + task->tqent_registered = B_FALSE; + task->tqent_id = 0; + task->tqent_func = func; + task->tqent_arg = arg; + + TASK_INIT(&task->tqent_task, prio, taskq_run_ent, task); + taskqueue_enqueue(tq->tq_queue, &task->tqent_task); +} + +void +taskq_wait(taskq_t *tq) +{ + taskqueue_quiesce(tq->tq_queue); +} + +void +taskq_wait_id(taskq_t *tq, taskqid_t tid) +{ + taskq_ent_t *ent; + + if (tid == 0) + return; + if ((ent = taskq_lookup(tid)) == NULL) + return; + + taskqueue_drain(tq->tq_queue, &ent->tqent_task); + taskq_free(ent); +} + +void +taskq_wait_outstanding(taskq_t *tq, taskqid_t id __unused) +{ + taskqueue_drain_all(tq->tq_queue); +} + +int +taskq_empty_ent(taskq_ent_t *t) +{ + return (t->tqent_task.ta_pending == 0); +} diff --git a/module/os/freebsd/spl/spl_uio.c b/module/os/freebsd/spl/spl_uio.c new file mode 100644 index 000000000000..c6b610394718 --- /dev/null +++ b/module/os/freebsd/spl/spl_uio.c @@ -0,0 +1,92 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + +/* + * $FreeBSD$ + */ + +#include +#include +#include + +/* + * same as uiomove() but doesn't modify uio structure. + * return in cbytes how many bytes were copied. + */ +int +uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) +{ + struct iovec small_iovec[1]; + struct uio small_uio_clone; + struct uio *uio_clone; + int error; + + ASSERT3U(uio->uio_rw, ==, rw); + if (uio->uio_iovcnt == 1) { + small_uio_clone = *uio; + small_iovec[0] = *uio->uio_iov; + small_uio_clone.uio_iov = small_iovec; + uio_clone = &small_uio_clone; + } else { + uio_clone = cloneuio(uio); + } + + error = vn_io_fault_uiomove(p, n, uio_clone); + *cbytes = uio->uio_resid - uio_clone->uio_resid; + if (uio_clone != &small_uio_clone) + free(uio_clone, M_IOV); + return (error); +} + +/* + * Drop the next n chars out of *uiop. + */ +void +uioskip(uio_t *uio, size_t n) +{ + enum uio_seg segflg; + + /* For the full compatibility with illumos. */ + if (n > uio->uio_resid) + return; + + segflg = uio->uio_segflg; + uio->uio_segflg = UIO_NOCOPY; + uiomove(NULL, n, uio->uio_rw, uio); + uio->uio_segflg = segflg; +} diff --git a/module/os/freebsd/spl/spl_vfs.c b/module/os/freebsd/spl/spl_vfs.c new file mode 100644 index 000000000000..991a11fe2baf --- /dev/null +++ b/module/os/freebsd/spl/spl_vfs.c @@ -0,0 +1,285 @@ +/* + * Copyright (c) 2006-2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +MALLOC_DECLARE(M_MOUNT); + +void +vfs_setmntopt(vfs_t *vfsp, const char *name, const char *arg, + int flags __unused) +{ + struct vfsopt *opt; + size_t namesize; + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + + if (vfsp->mnt_opt == NULL) { + void *opts; + + MNT_IUNLOCK(vfsp); + opts = malloc(sizeof (*vfsp->mnt_opt), M_MOUNT, M_WAITOK); + MNT_ILOCK(vfsp); + if (vfsp->mnt_opt == NULL) { + vfsp->mnt_opt = opts; + TAILQ_INIT(vfsp->mnt_opt); + } else { + free(opts, M_MOUNT); + } + } + + MNT_IUNLOCK(vfsp); + + opt = malloc(sizeof (*opt), M_MOUNT, M_WAITOK); + namesize = strlen(name) + 1; + opt->name = malloc(namesize, M_MOUNT, M_WAITOK); + strlcpy(opt->name, name, namesize); + opt->pos = -1; + opt->seen = 1; + if (arg == NULL) { + opt->value = NULL; + opt->len = 0; + } else { + opt->len = strlen(arg) + 1; + opt->value = malloc(opt->len, M_MOUNT, M_WAITOK); + bcopy(arg, opt->value, opt->len); + } + + MNT_ILOCK(vfsp); + TAILQ_INSERT_TAIL(vfsp->mnt_opt, opt, link); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +void +vfs_clearmntopt(vfs_t *vfsp, const char *name) +{ + int locked; + + if (!(locked = mtx_owned(MNT_MTX(vfsp)))) + MNT_ILOCK(vfsp); + vfs_deleteopt(vfsp->mnt_opt, name); + if (!locked) + MNT_IUNLOCK(vfsp); +} + +int +vfs_optionisset(const vfs_t *vfsp, const char *opt, char **argp) +{ + struct vfsoptlist *opts = vfsp->mnt_optnew; + int error; + + if (opts == NULL) + return (0); + error = vfs_getopt(opts, opt, (void **)argp, NULL); + return (error != 0 ? 0 : 1); +} + +int +mount_snapshot(kthread_t *td, vnode_t **vpp, const char *fstype, char *fspath, + char *fspec, int fsflags) +{ + struct vfsconf *vfsp; + struct mount *mp; + vnode_t *vp, *mvp; + struct ucred *cr; + int error; + + ASSERT_VOP_ELOCKED(*vpp, "mount_snapshot"); + + vp = *vpp; + *vpp = NULL; + error = 0; + + /* + * Be ultra-paranoid about making sure the type and fspath + * variables will fit in our mp buffers, including the + * terminating NUL. + */ + if (strlen(fstype) >= MFSNAMELEN || strlen(fspath) >= MNAMELEN) + error = ENAMETOOLONG; + if (error == 0 && (vfsp = vfs_byname_kld(fstype, td, &error)) == NULL) + error = ENODEV; + if (error == 0 && vp->v_type != VDIR) + error = ENOTDIR; + /* + * We need vnode lock to protect v_mountedhere and vnode interlock + * to protect v_iflag. + */ + if (error == 0) { + VI_LOCK(vp); + if ((vp->v_iflag & VI_MOUNT) == 0 && vp->v_mountedhere == NULL) + vp->v_iflag |= VI_MOUNT; + else + error = EBUSY; + VI_UNLOCK(vp); + } + if (error != 0) { + vput(vp); + return (error); + } + vn_seqc_write_begin(vp); + VOP_UNLOCK1(vp); + + /* + * Allocate and initialize the filesystem. + * We don't want regular user that triggered snapshot mount to be able + * to unmount it, so pass credentials of the parent mount. 
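+	 *
+	 * For example, a lookup of the hypothetical snapshot
+	 * tank/home@monday under .zfs/snapshot arrives here with fstype
+	 * "zfs", fspath naming the snapshot directory and fspec
+	 * "tank/home@monday".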
+	 */
+	mp = vfs_mount_alloc(vp, vfsp, fspath, vp->v_mount->mnt_cred);
+
+	mp->mnt_optnew = NULL;
+	vfs_setmntopt(mp, "from", fspec, 0);
+	mp->mnt_optnew = mp->mnt_opt;
+	mp->mnt_opt = NULL;
+
+	/*
+	 * Set the mount level flags.
+	 */
+	mp->mnt_flag = fsflags & MNT_UPDATEMASK;
+	/*
+	 * Snapshots are always read-only.
+	 */
+	mp->mnt_flag |= MNT_RDONLY;
+	/*
+	 * We don't want snapshots to allow access to vulnerable setuid
+	 * programs, so we turn off setuid when mounting snapshots.
+	 */
+	mp->mnt_flag |= MNT_NOSUID;
+	/*
+	 * We don't want snapshots to be visible in regular
+	 * mount(8) and df(1) output.
+	 */
+	mp->mnt_flag |= MNT_IGNORE;
+	/*
+	 * XXX: This is evil, but we can't mount a snapshot as a regular user.
+	 * XXX: Is it safe when the snapshot is mounted from within a jail?
+	 */
+	cr = td->td_ucred;
+	td->td_ucred = kcred;
+	error = VFS_MOUNT(mp);
+	td->td_ucred = cr;
+
+	if (error != 0) {
+		/*
+		 * Clear VI_MOUNT and decrement the use count "atomically",
+		 * under the vnode lock.  This is not strictly required,
+		 * but makes it easier to reason about the life-cycle and
+		 * ownership of the covered vnode.
+		 */
+		vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+		VI_LOCK(vp);
+		vp->v_iflag &= ~VI_MOUNT;
+		VI_UNLOCK(vp);
+		vn_seqc_write_end(vp);
+		vput(vp);
+		vfs_unbusy(mp);
+		vfs_freeopts(mp->mnt_optnew);
+		mp->mnt_vnodecovered = NULL;
+		vfs_mount_destroy(mp);
+		return (error);
+	}
+
+	if (mp->mnt_opt != NULL)
+		vfs_freeopts(mp->mnt_opt);
+	mp->mnt_opt = mp->mnt_optnew;
+	(void) VFS_STATFS(mp, &mp->mnt_stat);
+
+	/*
+	 * Prevent external consumers of mount options from reading
+	 * mnt_optnew.
+	 */
+	mp->mnt_optnew = NULL;
+
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
+#ifdef FREEBSD_NAMECACHE
+	cache_purge(vp);
+#endif
+	VI_LOCK(vp);
+	vp->v_iflag &= ~VI_MOUNT;
+	VI_UNLOCK(vp);
+
+	vp->v_mountedhere = mp;
+	/* Put the new filesystem on the mount list. */
+	mtx_lock(&mountlist_mtx);
+	TAILQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+	mtx_unlock(&mountlist_mtx);
+	vfs_event_signal(NULL, VQ_MOUNT, 0);
+	if (VFS_ROOT(mp, LK_EXCLUSIVE, &mvp))
+		panic("mount: lost mount");
+	vn_seqc_write_end(vp);
+	VOP_UNLOCK1(vp);
+#if __FreeBSD_version >= 1300048
+	vfs_op_exit(mp);
+#endif
+	vfs_unbusy(mp);
+	*vpp = mvp;
+	return (0);
+}
+
+/*
+ * Like vn_rele() except if we are going to call VOP_INACTIVE() then do it
+ * asynchronously using a taskq.  This can avoid deadlocks caused by
+ * re-entering the file system as a result of releasing the vnode.  Note that
+ * file systems already have to handle the race where the vnode is
+ * incremented before the inactive routine is called and does its locking.
+ *
+ * Warning: Excessive use of this routine can lead to performance problems.
+ * This is because taskqs throttle back allocation if too many are created.
+ */
+void
+vn_rele_async(vnode_t *vp, taskq_t *taskq)
+{
+	VERIFY(vp->v_count > 0);
+	if (refcount_release_if_not_last(&vp->v_usecount)) {
+#if __FreeBSD_version < 1300045
+		vdrop(vp);
+#endif
+		return;
+	}
+	VERIFY(taskq_dispatch((taskq_t *)taskq,
+	    (task_func_t *)vrele, vp, TQ_SLEEP) != 0);
+}
diff --git a/module/os/freebsd/spl/spl_vm.c b/module/os/freebsd/spl/spl_vm.c
new file mode 100644
index 000000000000..739ddb05e895
--- /dev/null
+++ b/module/os/freebsd/spl/spl_vm.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2013 EMC Corp.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1.
Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+
+const int zfs_vm_pagerret_bad = VM_PAGER_BAD;
+const int zfs_vm_pagerret_error = VM_PAGER_ERROR;
+const int zfs_vm_pagerret_ok = VM_PAGER_OK;
+const int zfs_vm_pagerput_sync = VM_PAGER_PUT_SYNC;
+const int zfs_vm_pagerput_inval = VM_PAGER_PUT_INVAL;
+
+void
+zfs_vmobject_assert_wlocked(vm_object_t object)
+{
+
+	/*
+	 * This is not ideal because the FILE/LINE used by the assertion
+	 * will not be very helpful, but it must be a hard (non-inline)
+	 * function for compatibility reasons.
+	 */
+	VM_OBJECT_ASSERT_WLOCKED(object);
+}
+
+void
+zfs_vmobject_wlock(vm_object_t object)
+{
+
+	VM_OBJECT_WLOCK(object);
+}
+
+void
+zfs_vmobject_wunlock(vm_object_t object)
+{
+
+	VM_OBJECT_WUNLOCK(object);
+}
diff --git a/module/os/freebsd/spl/spl_zlib.c b/module/os/freebsd/spl/spl_zlib.c
new file mode 100644
index 000000000000..3644eba77ca1
--- /dev/null
+++ b/module/os/freebsd/spl/spl_zlib.c
@@ -0,0 +1,242 @@
+/*
+ * Copyright (c) 2020 iXsystems, Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#if __FreeBSD_version >= 1300041
+#include
+#else
+#include
+#endif
+#include
+
+
+/*ARGSUSED*/
+static void *
+zcalloc(void *opaque, uint_t items, uint_t size)
+{
+
+	return (malloc((size_t)items*size, M_SOLARIS, M_NOWAIT));
+}
+
+/*ARGSUSED*/
+static void
+zcfree(void *opaque, void *ptr)
+{
+
+	free(ptr, M_SOLARIS);
+}
+
+static int
+zlib_deflateInit(z_stream *stream, int level)
+{
+
+	stream->zalloc = zcalloc;
+	stream->opaque = NULL;
+	stream->zfree = zcfree;
+
+	return (deflateInit(stream, level));
+}
+
+static int
+zlib_deflate(z_stream *stream, int flush)
+{
+	return (deflate(stream, flush));
+}
+
+static int
+zlib_deflateEnd(z_stream *stream)
+{
+	return (deflateEnd(stream));
+}
+
+static int
+zlib_inflateInit(z_stream *stream)
+{
+	stream->zalloc = zcalloc;
+	stream->opaque = NULL;
+	stream->zfree = zcfree;
+
+	return (inflateInit(stream));
+}
+
+static int
+zlib_inflate(z_stream *stream, int finish)
+{
+#if __FreeBSD_version >= 1300024
+	return (inflate(stream, finish));
+#else
+	return (_zlib104_inflate(stream, finish));
+#endif
+}
+
+
+static int
+zlib_inflateEnd(z_stream *stream)
+{
+	return (inflateEnd(stream));
+}
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call.  Using a kmem_cache also has the advantage that
+ * it improves the odds that the memory used will be local to this CPU.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use.  This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ * (On FreeBSD the workspace cache is currently stubbed out: the allocator
+ * below returns NULL and zlib instead allocates through zcalloc() above.)
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+	// return (kmem_cache_alloc(zlib_workspace_cache, flags));
+	return (NULL);
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+	// kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer.  The level
+ * parameter has the same meaning as in deflateInit.  sourceLen is the byte
+ * length of the source buffer.  Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes.  Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK on success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer, or
+ * Z_STREAM_ERROR if the level parameter is invalid.
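+ *
+ * A hypothetical in-kernel caller sized to the contract above:
+ *
+ *	size_t dlen = slen + slen / 1000 + 12;
+ *	void *dst = kmem_alloc(dlen, KM_SLEEP);
+ *	int err = z_compress_level(dst, &dlen, src, slen, Z_BEST_SPEED);
+ *
+ * On Z_OK, dlen has been reduced to the actual compressed size; the
+ * caller must remember the original allocation size for kmem_free().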
+ */ +int +z_compress_level(void *dest, size_t *destLen, const void *source, + size_t sourceLen, int level) +{ + z_stream stream; + int err; + + bzero(&stream, sizeof (stream)); + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + stream.opaque = NULL; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.opaque = zlib_workspace_alloc(KM_SLEEP); +#if 0 + if (!stream.opaque) + return (Z_MEM_ERROR); +#endif + err = zlib_deflateInit(&stream, level); + if (err != Z_OK) { + zlib_workspace_free(stream.opaque); + return (err); + } + + err = zlib_deflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_deflateEnd(&stream); + zlib_workspace_free(stream.opaque); + return (err == Z_OK ? Z_BUF_ERROR : err); + } + *destLen = stream.total_out; + + err = zlib_deflateEnd(&stream); + zlib_workspace_free(stream.opaque); + return (err); +} + +/* + * Decompresses the source buffer into the destination buffer. sourceLen is + * the byte length of the source buffer. Upon entry, destLen is the total + * size of the destination buffer, which must be large enough to hold the + * entire uncompressed data. (The size of the uncompressed data must have + * been saved previously by the compressor and transmitted to the decompressor + * by some mechanism outside the scope of this compression library.) + * Upon exit, destLen is the actual size of the compressed buffer. + * This function can be used to decompress a whole file at once if the + * input file is mmap'ed. + * + * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not + * enough memory, Z_BUF_ERROR if there was not enough room in the output + * buffer, or Z_DATA_ERROR if the input data was corrupted. + */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + bzero(&stream, sizeof (stream)); + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.opaque = zlib_workspace_alloc(KM_SLEEP); +#if 0 + if (!stream.opaque) + return (Z_MEM_ERROR); +#endif + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.opaque); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.opaque); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.opaque); + + return (err); +} diff --git a/module/os/freebsd/spl/spl_zone.c b/module/os/freebsd/spl/spl_zone.c new file mode 100644 index 000000000000..0b3b04d2a73e --- /dev/null +++ b/module/os/freebsd/spl/spl_zone.c @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2007 Pawel Jakub Dawidek + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +static MALLOC_DEFINE(M_ZONES, "zones_data", "Zones data"); + +/* + * Structure to record list of ZFS datasets exported to a zone. + */ +typedef struct zone_dataset { + LIST_ENTRY(zone_dataset) zd_next; + char zd_dataset[0]; +} zone_dataset_t; + +LIST_HEAD(zone_dataset_head, zone_dataset); + +static int zone_slot; + +int +zone_dataset_attach(struct ucred *cred, const char *dataset, int jailid) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd, *zd2; + struct prison *pr; + int dofree, error; + + if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) + return (error); + + /* Allocate memory before we grab prison's mutex. */ + zd = malloc(sizeof (*zd) + strlen(dataset) + 1, M_ZONES, M_WAITOK); + + sx_slock(&allprison_lock); + pr = prison_find(jailid); /* Locks &pr->pr_mtx. 
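+	 * The prison is returned with pr_mtx held; every path out of
+	 * this function must either drop that mutex or hand it to
+	 * prison_free_locked().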
*/ + sx_sunlock(&allprison_lock); + if (pr == NULL) { + free(zd, M_ZONES); + return (ENOENT); + } + + head = osd_jail_get(pr, zone_slot); + if (head != NULL) { + dofree = 0; + LIST_FOREACH(zd2, head, zd_next) { + if (strcmp(dataset, zd2->zd_dataset) == 0) { + free(zd, M_ZONES); + error = EEXIST; + goto end; + } + } + } else { + dofree = 1; + prison_hold_locked(pr); + mtx_unlock(&pr->pr_mtx); + head = malloc(sizeof (*head), M_ZONES, M_WAITOK); + LIST_INIT(head); + mtx_lock(&pr->pr_mtx); + error = osd_jail_set(pr, zone_slot, head); + KASSERT(error == 0, ("osd_jail_set() failed (error=%d)", + error)); + } + strcpy(zd->zd_dataset, dataset); + LIST_INSERT_HEAD(head, zd, zd_next); +end: + if (dofree) + prison_free_locked(pr); + else + mtx_unlock(&pr->pr_mtx); + return (error); +} + +int +zone_dataset_detach(struct ucred *cred, const char *dataset, int jailid) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + struct prison *pr; + int error; + + if ((error = spl_priv_check_cred(cred, PRIV_ZFS_JAIL)) != 0) + return (error); + + sx_slock(&allprison_lock); + pr = prison_find(jailid); + sx_sunlock(&allprison_lock); + if (pr == NULL) + return (ENOENT); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) { + error = ENOENT; + goto end; + } + LIST_FOREACH(zd, head, zd_next) { + if (strcmp(dataset, zd->zd_dataset) == 0) + break; + } + if (zd == NULL) + error = ENOENT; + else { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + if (LIST_EMPTY(head)) + osd_jail_del(pr, zone_slot); + error = 0; + } +end: + mtx_unlock(&pr->pr_mtx); + return (error); +} + +/* + * Returns true if the named dataset is visible in the current zone. + * The 'write' parameter is set to 1 if the dataset is also writable. + */ +int +zone_dataset_visible(const char *dataset, int *write) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + struct prison *pr; + size_t len; + int ret = 0; + + if (dataset[0] == '\0') + return (0); + if (INGLOBALZONE(curproc)) { + if (write != NULL) + *write = 1; + return (1); + } + pr = curthread->td_ucred->cr_prison; + mtx_lock(&pr->pr_mtx); + head = osd_jail_get(pr, zone_slot); + if (head == NULL) + goto end; + + /* + * Walk the list once, looking for datasets which match exactly, or + * specify a dataset underneath an exported dataset. If found, return + * true and note that it is writable. + */ + LIST_FOREACH(zd, head, zd_next) { + len = strlen(zd->zd_dataset); + if (strlen(dataset) >= len && + bcmp(dataset, zd->zd_dataset, len) == 0 && + (dataset[len] == '\0' || dataset[len] == '/' || + dataset[len] == '@')) { + if (write) + *write = 1; + ret = 1; + goto end; + } + } + + /* + * Walk the list a second time, searching for datasets which are parents + * of exported datasets. These should be visible, but read-only. + * + * Note that we also have to support forms such as 'pool/dataset/', with + * a trailing slash. 
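+	 *
+	 * For example, with "tank/jail1" exported, "tank" (or "tank/")
+	 * matches this walk and is visible read-only, while "tank/jail1"
+	 * itself, "tank/jail1/data" and "tank/jail1@snap" were already
+	 * accepted as writable by the first walk. "tank/jail10" matches
+	 * neither, since the byte after the common prefix must be '/'
+	 * here, or '\0', '/' or '@' above.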
+ */ + LIST_FOREACH(zd, head, zd_next) { + len = strlen(dataset); + if (dataset[len - 1] == '/') + len--; /* Ignore trailing slash */ + if (len < strlen(zd->zd_dataset) && + bcmp(dataset, zd->zd_dataset, len) == 0 && + zd->zd_dataset[len] == '/') { + if (write) + *write = 0; + ret = 1; + goto end; + } + } +end: + mtx_unlock(&pr->pr_mtx); + return (ret); +} + +static void +zone_destroy(void *arg) +{ + struct zone_dataset_head *head; + zone_dataset_t *zd; + + head = arg; + while ((zd = LIST_FIRST(head)) != NULL) { + LIST_REMOVE(zd, zd_next); + free(zd, M_ZONES); + } + free(head, M_ZONES); +} + +uint32_t +zone_get_hostid(void *ptr) +{ + + KASSERT(ptr == NULL, ("only NULL pointer supported in %s", __func__)); + + return ((uint32_t)curthread->td_ucred->cr_prison->pr_hostid); +} + +boolean_t +in_globalzone(struct proc *p) +{ + return (!jailed(FIRST_THREAD_IN_PROC((p))->td_ucred)); +} + +static void +zone_sysinit(void *arg __unused) +{ + + zone_slot = osd_jail_register(zone_destroy, NULL); +} + +static void +zone_sysuninit(void *arg __unused) +{ + + osd_jail_deregister(zone_slot); +} + +SYSINIT(zone_sysinit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysinit, NULL); +SYSUNINIT(zone_sysuninit, SI_SUB_DRIVERS, SI_ORDER_ANY, zone_sysuninit, NULL); diff --git a/module/os/freebsd/zfs/abd_os.c b/module/os/freebsd/zfs/abd_os.c new file mode 100644 index 000000000000..a7bda509bf54 --- /dev/null +++ b/module/os/freebsd/zfs/abd_os.c @@ -0,0 +1,498 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Using a large proportion of scattered ABDs decreases ARC fragmentation since + * when we are at the limit of allocatable space, using equal-size chunks will + * allow us to quickly reclaim enough space for a new large allocation (assuming + * it is also scattered). + * + * ABDs are allocated scattered by default unless the caller uses + * abd_alloc_linear() or zfs_abd_scatter_enabled is disabled. + */ + +#include +#include +#include +#include +#include +#include + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of scatter ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset()). + */ + { "scatter_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all scatter ABDs tracked by scatter_cnt */ + { "scatter_data_size", KSTAT_DATA_UINT64 }, + /* + * The amount of space wasted at the end of the last chunk across all + * scatter ABDs tracked by scatter_cnt. 
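+	 * For example, a 10000-byte scatter ABD with the default 4096-byte
+	 * chunk size occupies P2ROUNDUP(10000, 4096) / 4096 = 3 chunks
+	 * (12288 bytes) and therefore contributes 2288 bytes here.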
+ */ + { "scatter_chunk_waste", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. + */ + { "linear_cnt", KSTAT_DATA_UINT64 }, + /* Amount of data stored in all linear ABDs tracked by linear_cnt */ + { "linear_data_size", KSTAT_DATA_UINT64 }, +}; + +/* + * The size of the chunks ABD allocates. Because the sizes allocated from the + * kmem_cache can't change, this tunable can only be modified at boot. Changing + * it at runtime would cause ABD iteration to work incorrectly for ABDs which + * were allocated with the old size, so a safeguard has been put in place which + * will cause the machine to panic if you change it and try to access the data + * within a scattered ABD. + */ +size_t zfs_abd_chunk_size = 4096; + +#if defined(_KERNEL) +SYSCTL_DECL(_vfs_zfs); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, abd_scatter_enabled, CTLFLAG_RWTUN, + &zfs_abd_scatter_enabled, 0, "Enable scattered ARC data buffers"); +SYSCTL_ULONG(_vfs_zfs, OID_AUTO, abd_chunk_size, CTLFLAG_RDTUN, + &zfs_abd_chunk_size, 0, "The size of the chunks ABD allocates"); +#endif + +kmem_cache_t *abd_chunk_cache; +static kstat_t *abd_ksp; + +/* + * We use a scattered SPA_MAXBLOCKSIZE sized ABD whose chunks are + * just a single zero'd sized zfs_abd_chunk_size buffer. This + * allows us to conserve memory by only using a single zero buffer + * for the scatter chunks. + */ +abd_t *abd_zero_scatter = NULL; +static char *abd_zero_buf = NULL; + +static void +abd_free_chunk(void *c) +{ + kmem_cache_free(abd_chunk_cache, c); +} + +static size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, zfs_abd_chunk_size) / zfs_abd_chunk_size); +} + +static inline size_t +abd_scatter_chunkcnt(abd_t *abd) +{ + ASSERT(!abd_is_linear(abd)); + return (abd_chunkcnt_for_bytes( + ABD_SCATTER(abd).abd_offset + abd->abd_size)); +} + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size <= zfs_abd_chunk_size ? B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + size_t n = abd_scatter_chunkcnt(abd); + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + int waste = n * zfs_abd_chunk_size - abd->abd_size; + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); + arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); + arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + /* + * There is no scatter linear pages in FreeBSD so there is an + * if an error if the ABD has been marked as a linear page. 
+ */ + VERIFY(!abd_is_linear_page(abd)); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + zfs_abd_chunk_size); + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + ASSERT3P( + ABD_SCATTER(abd).abd_chunks[i], !=, NULL); + } +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + size_t n = abd_chunkcnt_for_bytes(size); + for (int i = 0; i < n; i++) { + void *c = kmem_cache_alloc(abd_chunk_cache, KM_PUSHPAGE); + ASSERT3P(c, !=, NULL); + ABD_SCATTER(abd).abd_chunks[i] = c; + } + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; +} + +void +abd_free_chunks(abd_t *abd) +{ + size_t n = abd_scatter_chunkcnt(abd); + for (int i = 0; i < n; i++) { + abd_free_chunk(ABD_SCATTER(abd).abd_chunks[i]); + } +} + +abd_t * +abd_alloc_struct(size_t size) +{ + size_t chunkcnt = abd_chunkcnt_for_bytes(size); + /* + * In the event we are allocating a gang ABD, the size passed in + * will be 0. We must make sure to set abd_size to the size of an + * ABD struct as opposed to an ABD scatter with 0 chunks. The gang + * ABD struct allocation accounts for an additional 24 bytes over + * a scatter ABD with 0 chunks. + */ + size_t abd_size = MAX(sizeof (abd_t), + offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); + abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, abd_size); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + size_t chunkcnt = abd_is_linear(abd) || abd_is_gang(abd) ? 0 : + abd_scatter_chunkcnt(abd); + int size = MAX(sizeof (abd_t), + offsetof(abd_t, abd_u.abd_scatter.abd_chunks[chunkcnt])); + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_gang_link)); + kmem_free(abd, size); + ABDSTAT_INCR(abdstat_struct_size, -size); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where + * each chunk in the scatterlist will be set to abd_zero_buf. 
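+ *
+ * Assuming the default 4096-byte chunk size, the full 16M
+ * (SPA_MAXBLOCKSIZE) run of zeros costs one zeroed chunk plus an array
+ * of 4096 chunk pointers that all alias it, rather than 16M of
+ * dedicated memory.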
+ */
+static void
+abd_alloc_zero_scatter(void)
+{
+	size_t n = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE);
+	abd_zero_buf = kmem_zalloc(zfs_abd_chunk_size, KM_SLEEP);
+	abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE);
+
+	abd_zero_scatter->abd_flags = ABD_FLAG_OWNER | ABD_FLAG_ZEROS;
+	abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE;
+	abd_zero_scatter->abd_parent = NULL;
+	zfs_refcount_create(&abd_zero_scatter->abd_children);
+
+	ABD_SCATTER(abd_zero_scatter).abd_offset = 0;
+	ABD_SCATTER(abd_zero_scatter).abd_chunk_size = zfs_abd_chunk_size;
+
+	for (int i = 0; i < n; i++) {
+		ABD_SCATTER(abd_zero_scatter).abd_chunks[i] = abd_zero_buf;
+	}
+
+	ABDSTAT_BUMP(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, zfs_abd_chunk_size);
+}
+
+static void
+abd_free_zero_scatter(void)
+{
+	zfs_refcount_destroy(&abd_zero_scatter->abd_children);
+	ABDSTAT_BUMPDOWN(abdstat_scatter_cnt);
+	ABDSTAT_INCR(abdstat_scatter_data_size, -(int)zfs_abd_chunk_size);
+
+	abd_free_struct(abd_zero_scatter);
+	abd_zero_scatter = NULL;
+	kmem_free(abd_zero_buf, zfs_abd_chunk_size);
+}
+
+void
+abd_init(void)
+{
+	abd_chunk_cache = kmem_cache_create("abd_chunk", zfs_abd_chunk_size, 0,
+	    NULL, NULL, NULL, NULL, 0, KMC_NODEBUG);
+
+	abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (abd_ksp != NULL) {
+		abd_ksp->ks_data = &abd_stats;
+		kstat_install(abd_ksp);
+	}
+
+	abd_alloc_zero_scatter();
+}
+
+void
+abd_fini(void)
+{
+	abd_free_zero_scatter();
+
+	if (abd_ksp != NULL) {
+		kstat_delete(abd_ksp);
+		abd_ksp = NULL;
+	}
+
+	kmem_cache_destroy(abd_chunk_cache);
+	abd_chunk_cache = NULL;
+}
+
+void
+abd_free_linear_page(abd_t *abd)
+{
+	/*
+	 * FreeBSD does not have scatter linear pages,
+	 * so this is an error.
+	 */
+	VERIFY(0);
+}
+
+/*
+ * If we're going to use this ABD for doing I/O using the block layer, the
+ * consumer of the ABD data doesn't care if it's scattered or not, and we don't
+ * plan to store this ABD in memory for a long period of time, we should
+ * allocate the ABD type that requires the least data copying to do the I/O.
+ *
+ * Currently this is linear ABDs, however if ldi_strategy() can ever issue I/Os
+ * using a scatter/gather list we should switch to that and replace this call
+ * with vanilla abd_alloc().
+ */
+abd_t *
+abd_alloc_for_io(size_t size, boolean_t is_metadata)
+{
+	return (abd_alloc_linear(size, is_metadata));
+}
+
+/*
+ * This is just a helper function to abd_get_offset_scatter() to alloc a
+ * scatter ABD using the calculated chunkcnt based on the offset within the
+ * parent ABD.
+ */
+static abd_t *
+abd_alloc_scatter_offset_chunkcnt(size_t chunkcnt)
+{
+	size_t abd_size = offsetof(abd_t,
+	    abd_u.abd_scatter.abd_chunks[chunkcnt]);
+	abd_t *abd = kmem_alloc(abd_size, KM_PUSHPAGE);
+	ASSERT3P(abd, !=, NULL);
+	list_link_init(&abd->abd_gang_link);
+	mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL);
+	ABDSTAT_INCR(abdstat_struct_size, abd_size);
+
+	return (abd);
+}
+
+abd_t *
+abd_get_offset_scatter(abd_t *sabd, size_t off)
+{
+	abd_t *abd = NULL;
+
+	abd_verify(sabd);
+	ASSERT3U(off, <=, sabd->abd_size);
+
+	size_t new_offset = ABD_SCATTER(sabd).abd_offset + off;
+	size_t chunkcnt = abd_scatter_chunkcnt(sabd) -
+	    (new_offset / zfs_abd_chunk_size);
+
+	abd = abd_alloc_scatter_offset_chunkcnt(chunkcnt);
+
+	/*
+	 * Even if this buf is filesystem metadata, we only track that
+	 * if we own the underlying data buffer, which is not true in this case.
Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + ABD_SCATTER(abd).abd_offset = new_offset % zfs_abd_chunk_size; + ABD_SCATTER(abd).abd_chunk_size = zfs_abd_chunk_size; + + /* Copy the scatterlist starting at the correct offset */ + (void) memcpy(&ABD_SCATTER(abd).abd_chunks, + &ABD_SCATTER(sabd).abd_chunks[new_offset / + zfs_abd_chunk_size], + chunkcnt * sizeof (void *)); + + return (abd); +} + +static inline size_t +abd_iter_scatter_chunk_offset(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) % zfs_abd_chunk_size); +} + +static inline size_t +abd_iter_scatter_chunk_index(struct abd_iter *aiter) +{ + ASSERT(!abd_is_linear(aiter->iter_abd)); + return ((ABD_SCATTER(aiter->iter_abd).abd_offset + + aiter->iter_pos) / zfs_abd_chunk_size); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_pos = 0; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* Panic if someone has changed zfs_abd_chunk_size */ + IMPLY(!abd_is_linear(aiter->iter_abd), zfs_abd_chunk_size == + ABD_SCATTER(aiter->iter_abd).abd_chunk_size); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + offset = aiter->iter_pos; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + size_t index = abd_iter_scatter_chunk_index(aiter); + offset = abd_iter_scatter_chunk_offset(aiter); + aiter->iter_mapsize = MIN(zfs_abd_chunk_size - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + paddr = ABD_SCATTER(aiter->iter_abd).abd_chunks[index]; + } + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
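+ *
+ * Taken together, a consumer drives these primitives the way the
+ * generic abd_iterate_func() does; roughly (sketch only, process() is
+ * a made-up callback):
+ *
+ *	abd_iter_init(&aiter, abd);
+ *	while (!abd_iter_at_end(&aiter)) {
+ *		abd_iter_map(&aiter);
+ *		size_t len = aiter.iter_mapsize;
+ *		process(aiter.iter_mapaddr, len);
+ *		abd_iter_unmap(&aiter);
+ *		abd_iter_advance(&aiter, len);
+ *	}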
+ */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ + kmem_cache_reap_soon(abd_chunk_cache); +} diff --git a/module/os/freebsd/zfs/arc_os.c b/module/os/freebsd/zfs/arc_os.c new file mode 100644 index 000000000000..94df750035a4 --- /dev/null +++ b/module/os/freebsd/zfs/arc_os.c @@ -0,0 +1,245 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +extern struct vfsops zfs_vfsops; + +uint_t zfs_arc_free_target = 0; + +static void +arc_free_target_init(void *unused __unused) +{ + zfs_arc_free_target = vm_cnt.v_free_target; +} +SYSINIT(arc_free_target_init, SI_SUB_KTHREAD_PAGE, SI_ORDER_ANY, + arc_free_target_init, NULL); + +/* + * We don't have a tunable for arc_free_target due to the dependency on + * pagedaemon initialisation. + */ +static int +sysctl_vfs_zfs_arc_free_target(SYSCTL_HANDLER_ARGS) +{ + uint_t val; + int err; + + val = zfs_arc_free_target; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < minfree) + return (EINVAL); + if (val > vm_cnt.v_page_count) + return (EINVAL); + + zfs_arc_free_target = val; + + return (0); +} +SYSCTL_DECL(_vfs_zfs); +/* BEGIN CSTYLED */ +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_free_target, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof (uint_t), + sysctl_vfs_zfs_arc_free_target, "IU", + "Desired number of free pages below which ARC triggers reclaim"); +/* END CSTYLED */ + +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + int64_t n __unused; + + /* + * Cooperate with pagedaemon when it's time for it to scan + * and reclaim some pages. + */ + n = PAGESIZE * ((int64_t)freemem - zfs_arc_free_target); + if (n < lowest) { + lowest = n; + } +#if defined(__i386) || !defined(UMA_MD_SMALL_ALLOC) + /* + * If we're on an i386 platform, it's possible that we'll exhaust the + * kernel heap space before we ever run out of available physical + * memory. Most checks of the size of the heap_area compare against + * tune.t_minarmem, which is the minimum available real memory that we + * can have in the system. However, this is generally fixed at 25 pages + * which is so low that it's useless. 
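+	 * For example, with 4 KB pages that floor works out to only 100 KB.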
+	 * In this comparison, we seek to calculate the total heap-size,
+	 * and reclaim if more than 3/4ths of the heap is allocated.
+	 * (Or, in the calculation, if less than 1/4th is free.)
+	 */
+	n = uma_avail() - (long)(uma_limit() / 4);
+	if (n < lowest) {
+		lowest = n;
+	}
+#endif
+
+	DTRACE_PROBE1(arc__available_memory, int64_t, lowest);
+	return (lowest);
+}
+
+/*
+ * Return a default max arc size based on the amount of physical memory.
+ */
+uint64_t
+arc_default_max(uint64_t min, uint64_t allmem)
+{
+	uint64_t size;
+
+	if (allmem >= 1 << 30)
+		size = allmem - (1 << 30);
+	else
+		size = min;
+	return (MAX(allmem * 5 / 8, size));
+}
+
+/*
+ * Helper function for arc_prune_async(); it is responsible for safely
+ * handling the execution of a registered arc_prune_func_t.
+ */
+static void
+arc_prune_task(void *arg)
+{
+	int64_t nr_scan = *(int64_t *)arg;
+
+	arc_reduce_target_size(ptob(nr_scan));
+	free(arg, M_TEMP);
+	vnlru_free(nr_scan, &zfs_vfsops);
+}
+
+/*
+ * Notify registered consumers that they must drop holds on a portion of the
+ * ARC buffers they reference. This provides a mechanism to ensure the ARC
+ * can honor the arc_meta_limit and reclaim otherwise pinned ARC buffers.
+ * This is analogous to dnlc_reduce_cache() but more generic.
+ *
+ * This operation is performed asynchronously so it may be safely called
+ * in the context of the arc_reclaim_thread(). A reference is taken here
+ * for each registered arc_prune_t and the arc_prune_task() is responsible
+ * for releasing it once the registered arc_prune_func_t has completed.
+ */
+void
+arc_prune_async(int64_t adjust)
+{
+	int64_t *adjustptr;
+
+	if ((adjustptr = malloc(sizeof (int64_t), M_TEMP, M_NOWAIT)) == NULL)
+		return;
+
+	*adjustptr = adjust;
+	taskq_dispatch(arc_prune_taskq, arc_prune_task, adjustptr, TQ_SLEEP);
+	ARCSTAT_BUMP(arcstat_prune);
+}
+
+uint64_t
+arc_all_memory(void)
+{
+	return (ptob(physmem));
+}
+
+int
+arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	return (0);
+}
+
+uint64_t
+arc_free_memory(void)
+{
+	return (ptob(freemem));
+}
+
+static eventhandler_tag arc_event_lowmem = NULL;
+
+static void
+arc_lowmem(void *arg __unused, int howto __unused)
+{
+	int64_t free_memory, to_free;
+
+	arc_no_grow = B_TRUE;
+	arc_warm = B_TRUE;
+	arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+	free_memory = arc_available_memory();
+	to_free = (arc_c >> arc_shrink_shift) - MIN(free_memory, 0);
+	DTRACE_PROBE2(arc__needfree, int64_t, free_memory, int64_t, to_free);
+	arc_reduce_target_size(to_free);
+
+	/*
+	 * It is unsafe to block here in arbitrary threads, because we can come
+	 * here from ARC itself and may hold ARC locks and thus risk a deadlock
+	 * with the ARC reclaim thread.
+	 */
+	if (curproc == pageproc)
+		arc_wait_for_eviction(to_free);
+	else
+		arc_wait_for_eviction(0);
+}
+
+void
+arc_lowmem_init(void)
+{
+	arc_event_lowmem = EVENTHANDLER_REGISTER(vm_lowmem, arc_lowmem, NULL,
+	    EVENTHANDLER_PRI_FIRST);
+}
+
+void
+arc_lowmem_fini(void)
+{
+	if (arc_event_lowmem != NULL)
+		EVENTHANDLER_DEREGISTER(vm_lowmem, arc_event_lowmem);
+}
diff --git a/module/os/freebsd/zfs/crypto_os.c b/module/os/freebsd/zfs/crypto_os.c
new file mode 100644
index 000000000000..b86ffc59a21d
--- /dev/null
+++ b/module/os/freebsd/zfs/crypto_os.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (c) 2005-2010 Pawel Jakub Dawidek
+ * Copyright (c) 2018 Sean Eric Fagan
+ * All rights reserved.
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Portions of this file are derived from sys/geom/eli/g_eli_hmac.c + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#else +#include +#endif + +#include +#include +#include + +#include + +#define SHA512_HMAC_BLOCK_SIZE 128 + +static int crypt_sessions = 0; +SYSCTL_DECL(_vfs_zfs); +SYSCTL_INT(_vfs_zfs, OID_AUTO, crypt_sessions, CTLFLAG_RD, + &crypt_sessions, 0, "Number of cryptographic sessions created"); + +void +crypto_mac_init(struct hmac_ctx *ctx, const crypto_key_t *c_key) +{ + uint8_t k_ipad[SHA512_HMAC_BLOCK_SIZE], + k_opad[SHA512_HMAC_BLOCK_SIZE], + key[SHA512_HMAC_BLOCK_SIZE]; + SHA512_CTX lctx; + int i; + size_t cl_bytes = CRYPTO_BITS2BYTES(c_key->ck_length); + + /* + * This code is based on the similar code in geom/eli/g_eli_hmac.c + */ + explicit_bzero(key, sizeof (key)); + if (c_key->ck_length == 0) + /* do nothing */; + else if (cl_bytes <= SHA512_HMAC_BLOCK_SIZE) + bcopy(c_key->ck_data, key, cl_bytes); + else { + /* + * If key is longer than 128 bytes reset it to + * key = SHA512(key). + */ + SHA512_Init(&lctx); + SHA512_Update(&lctx, c_key->ck_data, cl_bytes); + SHA512_Final(key, &lctx); + } + + /* XOR key with ipad and opad values. */ + for (i = 0; i < sizeof (key); i++) { + k_ipad[i] = key[i] ^ 0x36; + k_opad[i] = key[i] ^ 0x5c; + } + explicit_bzero(key, sizeof (key)); + + /* Start inner SHA512. */ + SHA512_Init(&ctx->innerctx); + SHA512_Update(&ctx->innerctx, k_ipad, sizeof (k_ipad)); + explicit_bzero(k_ipad, sizeof (k_ipad)); + /* Start outer SHA512. 
*/ + SHA512_Init(&ctx->outerctx); + SHA512_Update(&ctx->outerctx, k_opad, sizeof (k_opad)); + explicit_bzero(k_opad, sizeof (k_opad)); +} + +void +crypto_mac_update(struct hmac_ctx *ctx, const void *data, size_t datasize) +{ + SHA512_Update(&ctx->innerctx, data, datasize); +} + +void +crypto_mac_final(struct hmac_ctx *ctx, void *md, size_t mdsize) +{ + uint8_t digest[SHA512_DIGEST_LENGTH]; + + /* Complete inner hash */ + SHA512_Final(digest, &ctx->innerctx); + + /* Complete outer hash */ + SHA512_Update(&ctx->outerctx, digest, sizeof (digest)); + SHA512_Final(digest, &ctx->outerctx); + + explicit_bzero(ctx, sizeof (*ctx)); + /* mdsize == 0 means "Give me the whole hash!" */ + if (mdsize == 0) + mdsize = SHA512_DIGEST_LENGTH; + bcopy(digest, md, mdsize); + explicit_bzero(digest, sizeof (digest)); +} + +void +crypto_mac(const crypto_key_t *key, const void *in_data, size_t in_data_size, + void *out_data, size_t out_data_size) +{ + struct hmac_ctx ctx; + + crypto_mac_init(&ctx, key); + crypto_mac_update(&ctx, in_data, in_data_size); + crypto_mac_final(&ctx, out_data, out_data_size); +} + +static int +freebsd_zfs_crypt_done(struct cryptop *crp) +{ + freebsd_crypt_session_t *ses; + + ses = crp->crp_opaque; + mtx_lock(&ses->fs_lock); + ses->fs_done = true; + mtx_unlock(&ses->fs_lock); + wakeup(crp); + return (0); +} + +void +freebsd_crypt_freesession(freebsd_crypt_session_t *sess) +{ + mtx_destroy(&sess->fs_lock); + crypto_freesession(sess->fs_sid); + explicit_bzero(sess, sizeof (*sess)); +} + +static int +zfs_crypto_dispatch(freebsd_crypt_session_t *session, struct cryptop *crp) +{ + int error; + + crp->crp_opaque = session; + crp->crp_callback = freebsd_zfs_crypt_done; + for (;;) { + error = crypto_dispatch(crp); + if (error) + break; + mtx_lock(&session->fs_lock); + while (session->fs_done == false) + msleep(crp, &session->fs_lock, PRIBIO, + "zfs_crypto", hz/5); + mtx_unlock(&session->fs_lock); + + if (crp->crp_etype != EAGAIN) { + error = crp->crp_etype; + break; + } + crp->crp_etype = 0; + crp->crp_flags &= ~CRYPTO_F_DONE; + session->fs_done = false; +#if __FreeBSD_version < 1300087 + /* + * Session ID changed, so we should record that, + * and try again + */ + session->fs_sid = crp->crp_session; +#endif + } + return (error); +} +static void +freebsd_crypt_uio_debug_log(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ +#ifdef FCRYPTO_DEBUG + struct cryptodesc *crd; + uint8_t *p = NULL; + size_t total = 0; + + printf("%s(%s, %p, { %s, %d, %d, %s }, %p, { %d, %p, %u }, " + "%p, %u, %u)\n", + __FUNCTION__, encrypt ? "encrypt" : "decrypt", input_sessionp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + data_uio, key->ck_format, key->ck_data, + (unsigned int)key->ck_length, + ivbuf, (unsigned int)datalen, (unsigned int)auth_len); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); + for (int i = 0; i < data_uio->uio_iovcnt; i++) { + printf("\tiovec #%d: <%p, %u>\n", i, + data_uio->uio_iov[i].iov_base, + (unsigned int)data_uio->uio_iov[i].iov_len); + total += data_uio->uio_iov[i].iov_len; + } + data_uio->uio_resid = total; +#endif +} +/* + * Create a new cryptographic session. This should + * happen every time the key changes (including when + * it's first loaded). 
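+ *
+ * A typical round trip through this API (sketch only; the locals are
+ * illustrative, not taken from a real caller):
+ *
+ *	freebsd_crypt_session_t ses;
+ *	int error;
+ *
+ *	error = freebsd_crypt_newsession(&ses, &c_info, &key);
+ *	if (error == 0) {
+ *		error = freebsd_crypt_uio(B_TRUE, &ses, &c_info, &uio,
+ *		    &key, ivbuf, datalen, auth_len);
+ *		freebsd_crypt_freesession(&ses);
+ *	}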
+ */ +#if __FreeBSD_version >= 1300087 +int +freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *c_info, crypto_key_t *key) +{ + struct crypto_session_params csp; + int error = 0; + +#ifdef FCRYPTO_DEBUG + printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n", + __FUNCTION__, sessp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + key->ck_format, key->ck_data, (unsigned int)key->ck_length); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); +#endif + bzero(&csp, sizeof (csp)); + csp.csp_mode = CSP_MODE_AEAD; + csp.csp_cipher_key = key->ck_data; + csp.csp_cipher_klen = key->ck_length / 8; + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + csp.csp_cipher_alg = CRYPTO_AES_NIST_GCM_16; + csp.csp_ivlen = AES_GCM_IV_LEN; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + case AES_192_GMAC_KEY_LEN: + case AES_256_GMAC_KEY_LEN: + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + csp.csp_cipher_alg = CRYPTO_AES_CCM_16; + csp.csp_ivlen = AES_CCM_IV_LEN; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + case AES_192_CBC_MAC_KEY_LEN: + case AES_256_CBC_MAC_KEY_LEN: + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } + error = crypto_newsession(&sessp->fs_sid, &csp, + CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); + mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", + NULL, MTX_DEF); + crypt_sessions++; +bad: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + return (error); +} + +int +freebsd_crypt_uio(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ + struct cryptop *crp; + freebsd_crypt_session_t *session = NULL; + int error = 0; + size_t total = 0; + + freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, + key, ivbuf, datalen, auth_len); + for (int i = 0; i < data_uio->uio_iovcnt; i++) + total += data_uio->uio_iov[i].iov_len; + data_uio->uio_resid = total; + if (input_sessionp == NULL) { + session = kmem_zalloc(sizeof (*session), KM_SLEEP); + error = freebsd_crypt_newsession(session, c_info, key); + if (error) + goto out; + } else + session = input_sessionp; + + crp = crypto_getreq(session->fs_sid, M_WAITOK); + if (encrypt) { + crp->crp_op = CRYPTO_OP_ENCRYPT | + CRYPTO_OP_COMPUTE_DIGEST; + } else { + crp->crp_op = CRYPTO_OP_DECRYPT | + CRYPTO_OP_VERIFY_DIGEST; + } + crp->crp_flags = CRYPTO_F_CBIFSYNC | CRYPTO_F_IV_SEPARATE; + crypto_use_uio(crp, data_uio); + + crp->crp_aad_start = 0; + crp->crp_aad_length = auth_len; + crp->crp_payload_start = auth_len; + crp->crp_payload_length = datalen; + crp->crp_digest_start = auth_len + datalen; + + bcopy(ivbuf, crp->crp_iv, ZIO_DATA_IV_LEN); + error = zfs_crypto_dispatch(session, crp); + crypto_freereq(crp); +out: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + if (input_sessionp == NULL) { + freebsd_crypt_freesession(session); + kmem_free(session, sizeof (*session)); + } + return (error); +} + +#else +int +freebsd_crypt_newsession(freebsd_crypt_session_t *sessp, + struct zio_crypt_info *c_info, crypto_key_t *key) +{ + struct cryptoini cria, crie, *crip; + struct 
enc_xform *xform; + struct auth_hash *xauth; + int error = 0; + crypto_session_t sid; + +#ifdef FCRYPTO_DEBUG + printf("%s(%p, { %s, %d, %d, %s }, { %d, %p, %u })\n", + __FUNCTION__, sessp, + c_info->ci_algname, c_info->ci_crypt_type, + (unsigned int)c_info->ci_keylen, c_info->ci_name, + key->ck_format, key->ck_data, (unsigned int)key->ck_length); + printf("\tkey = { "); + for (int i = 0; i < key->ck_length / 8; i++) { + uint8_t *b = (uint8_t *)key->ck_data; + printf("%02x ", b[i]); + } + printf("}\n"); +#endif + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + xform = &enc_xform_aes_nist_gcm; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_128; + break; + case AES_192_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_192; + break; + case AES_256_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_256; + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + xform = &enc_xform_ccm; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_128; + break; + case AES_192_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_192; + break; + case AES_256_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_256; + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " + "auth %s (key length %d)\n", + __FUNCTION__, __LINE__, + xform->name, (unsigned int)key->ck_length, + (unsigned int)key->ck_length/8, + xauth->name, xauth->keysize); +#endif + + bzero(&crie, sizeof (crie)); + bzero(&cria, sizeof (cria)); + + crie.cri_alg = xform->type; + crie.cri_key = key->ck_data; + crie.cri_klen = key->ck_length; + + cria.cri_alg = xauth->type; + cria.cri_key = key->ck_data; + cria.cri_klen = key->ck_length; + + cria.cri_next = &crie; + crie.cri_next = NULL; + crip = &cria; + // Everything else is bzero'd + + error = crypto_newsession(&sid, crip, + CRYPTOCAP_F_HARDWARE | CRYPTOCAP_F_SOFTWARE); + if (error != 0) { + printf("%s(%d): crypto_newsession failed with %d\n", + __FUNCTION__, __LINE__, error); + goto bad; + } + sessp->fs_sid = sid; + mtx_init(&sessp->fs_lock, "FreeBSD Cryptographic Session Lock", + NULL, MTX_DEF); + crypt_sessions++; +bad: + return (error); +} + +/* + * The meat of encryption/decryption. + * If sessp is NULL, then it will create a + * temporary cryptographic session, and release + * it when done. 
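+ *
+ * In both implementations the uio is laid out as [ AAD | payload | MAC ]:
+ * the first auth_len bytes are authenticated but not encrypted, the next
+ * datalen bytes are transformed in place, and the digest lives at offset
+ * auth_len + datalen. That is exactly how the crd_skip/crd_len/crd_inject
+ * fields are filled in below, and how crp_aad_start/crp_payload_start/
+ * crp_digest_start are set in the 13.0+ variant above.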
+ */ +int +freebsd_crypt_uio(boolean_t encrypt, + freebsd_crypt_session_t *input_sessionp, + struct zio_crypt_info *c_info, + uio_t *data_uio, + crypto_key_t *key, + uint8_t *ivbuf, + size_t datalen, + size_t auth_len) +{ + struct cryptop *crp; + struct cryptodesc *enc_desc, *auth_desc; + struct enc_xform *xform; + struct auth_hash *xauth; + freebsd_crypt_session_t *session = NULL; + int error; + + freebsd_crypt_uio_debug_log(encrypt, input_sessionp, c_info, data_uio, + key, ivbuf, datalen, auth_len); + switch (c_info->ci_crypt_type) { + case ZC_TYPE_GCM: + xform = &enc_xform_aes_nist_gcm; + switch (key->ck_length/8) { + case AES_128_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_128; + break; + case AES_192_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_192; + break; + case AES_256_GMAC_KEY_LEN: + xauth = &auth_hash_nist_gmac_aes_256; + break; + default: + error = EINVAL; + goto bad; + } + break; + case ZC_TYPE_CCM: + xform = &enc_xform_ccm; + switch (key->ck_length/8) { + case AES_128_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_128; + break; + case AES_192_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_192; + break; + case AES_256_CBC_MAC_KEY_LEN: + xauth = &auth_hash_ccm_cbc_mac_256; + break; + default: + error = EINVAL; + goto bad; + break; + } + break; + default: + error = ENOTSUP; + goto bad; + } + +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Using crypt %s (key length %u [%u bytes]), " + "auth %s (key length %d)\n", + __FUNCTION__, __LINE__, + xform->name, (unsigned int)key->ck_length, + (unsigned int)key->ck_length/8, + xauth->name, xauth->keysize); +#endif + + if (input_sessionp == NULL) { + session = kmem_zalloc(sizeof (*session), KM_SLEEP); + error = freebsd_crypt_newsession(session, c_info, key); + if (error) + goto out; + } else + session = input_sessionp; + + crp = crypto_getreq(2); + if (crp == NULL) { + error = ENOMEM; + goto bad; + } + + auth_desc = crp->crp_desc; + enc_desc = auth_desc->crd_next; + + crp->crp_session = session->fs_sid; + crp->crp_ilen = auth_len + datalen; + crp->crp_buf = (void*)data_uio; + crp->crp_flags = CRYPTO_F_IOV | CRYPTO_F_CBIFSYNC; + + auth_desc->crd_skip = 0; + auth_desc->crd_len = auth_len; + auth_desc->crd_inject = auth_len + datalen; + auth_desc->crd_alg = xauth->type; +#ifdef FCRYPTO_DEBUG + printf("%s: auth: skip = %u, len = %u, inject = %u\n", + __FUNCTION__, auth_desc->crd_skip, auth_desc->crd_len, + auth_desc->crd_inject); +#endif + + enc_desc->crd_skip = auth_len; + enc_desc->crd_len = datalen; + enc_desc->crd_inject = auth_len; + enc_desc->crd_alg = xform->type; + enc_desc->crd_flags = CRD_F_IV_EXPLICIT | CRD_F_IV_PRESENT; + bcopy(ivbuf, enc_desc->crd_iv, ZIO_DATA_IV_LEN); + enc_desc->crd_next = NULL; + +#ifdef FCRYPTO_DEBUG + printf("%s: enc: skip = %u, len = %u, inject = %u\n", + __FUNCTION__, enc_desc->crd_skip, enc_desc->crd_len, + enc_desc->crd_inject); +#endif + + if (encrypt) + enc_desc->crd_flags |= CRD_F_ENCRYPT; + + error = zfs_crypto_dispatch(session, crp); + crypto_freereq(crp); +out: + if (input_sessionp == NULL) { + freebsd_crypt_freesession(session); + kmem_free(session, sizeof (*session)); + } +bad: +#ifdef FCRYPTO_DEBUG + if (error) + printf("%s: returning error %d\n", __FUNCTION__, error); +#endif + return (error); +} +#endif diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c new file mode 100644 index 000000000000..8e412d9c1359 --- /dev/null +++ b/module/os/freebsd/zfs/dmu_os.c @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifndef IDX_TO_OFF +#define IDX_TO_OFF(idx) (((vm_ooffset_t)(idx)) << PAGE_SHIFT) +#endif + +#if __FreeBSD_version < 1300051 +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_NOBUSY +#else +#define VM_ALLOC_BUSY_FLAGS VM_ALLOC_SBUSY | VM_ALLOC_IGN_SBUSY +#endif + + +#if __FreeBSD_version < 1300072 +#define dmu_page_lock(m) vm_page_lock(m) +#define dmu_page_unlock(m) vm_page_unlock(m) +#else +#define dmu_page_lock(m) +#define dmu_page_unlock(m) +#endif + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + vm_page_t *ma, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + struct sf_buf *sf; + int numbufs, i; + int err; + + if (size == 0) + return (0); + + err = dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + int tocpy, copied, thiscpy; + int bufoff; + dmu_buf_t *db = dbp[i]; + caddr_t va; + + ASSERT(size > 0); + ASSERT3U(db->db_size, >=, PAGESIZE); + + bufoff = offset - db->db_offset; + tocpy = (int)MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + for (copied = 0; copied < tocpy; copied += PAGESIZE) { + ASSERT3U(ptoa((*ma)->pindex), ==, + db->db_offset + bufoff); + thiscpy = MIN(PAGESIZE, tocpy - copied); + va = zfs_map_page(*ma, &sf); + bcopy(va, (char *)db->db_data + bufoff, thiscpy); + zfs_unmap_page(sf); + ma += 1; + bufoff += 
PAGESIZE; + } + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +int +dmu_read_pages(objset_t *os, uint64_t object, vm_page_t *ma, int count, + int *rbehind, int *rahead, int last_size) +{ + struct sf_buf *sf; + vm_object_t vmobj; + vm_page_t m; + dmu_buf_t **dbp; + dmu_buf_t *db; + caddr_t va; + int numbufs, i; + int bufoff, pgoff, tocpy; + int mi, di; + int err; + + ASSERT3U(ma[0]->pindex + count - 1, ==, ma[count - 1]->pindex); + ASSERT(last_size <= PAGE_SIZE); + + err = dmu_buf_hold_array(os, object, IDX_TO_OFF(ma[0]->pindex), + IDX_TO_OFF(count - 1) + last_size, TRUE, FTAG, &numbufs, &dbp); + if (err != 0) + return (err); + +#ifdef ZFS_DEBUG + IMPLY(last_size < PAGE_SIZE, *rahead == 0); + if (dbp[0]->db_offset != 0 || numbufs > 1) { + for (i = 0; i < numbufs; i++) { + ASSERT(ISP2(dbp[i]->db_size)); + ASSERT((dbp[i]->db_offset % dbp[i]->db_size) == 0); + ASSERT3U(dbp[i]->db_size, ==, dbp[0]->db_size); + } + } +#endif + + vmobj = ma[0]->object; + zfs_vmobject_wlock_12(vmobj); + + db = dbp[0]; + for (i = 0; i < *rbehind; i++) { + m = vm_page_grab_unlocked(vmobj, ma[0]->pindex - 1 - i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_write_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, PAGESIZE); + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rbehind = i; + + bufoff = IDX_TO_OFF(ma[0]->pindex) % db->db_size; + pgoff = 0; + for (mi = 0, di = 0; mi < count && di < numbufs; ) { + if (pgoff == 0) { + m = ma[mi]; + if (m != bogus_page) { + vm_page_assert_xbusied(m); + ASSERT(vm_page_none_valid(m)); + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_write_mapped(m)); + va = zfs_map_page(m, &sf); + } + } + if (bufoff == 0) + db = dbp[di]; + + if (m != bogus_page) { + ASSERT3U(IDX_TO_OFF(m->pindex) + pgoff, ==, + db->db_offset + bufoff); + } + + /* + * We do not need to clamp the copy size by the file + * size as the last block is zero-filled beyond the + * end of file anyway. + */ + tocpy = MIN(db->db_size - bufoff, PAGESIZE - pgoff); + if (m != bogus_page) + bcopy((char *)db->db_data + bufoff, va + pgoff, tocpy); + + pgoff += tocpy; + ASSERT(pgoff <= PAGESIZE); + if (pgoff == PAGESIZE) { + if (m != bogus_page) { + zfs_unmap_page(sf); + vm_page_valid(m); + } + ASSERT(mi < count); + mi++; + pgoff = 0; + } + + bufoff += tocpy; + ASSERT(bufoff <= db->db_size); + if (bufoff == db->db_size) { + ASSERT(di < numbufs); + di++; + bufoff = 0; + } + } + +#ifdef ZFS_DEBUG + /* + * Three possibilities: + * - last requested page ends at a buffer boundary and , thus, + * all pages and buffers have been iterated; + * - all requested pages are filled, but the last buffer + * has not been exhausted; + * the read-ahead is possible only in this case; + * - all buffers have been read, but the last page has not been + * fully filled; + * this is only possible if the file has only a single buffer + * with a size that is not a multiple of the page size. 
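+	 *
+	 * As a concrete instance of the last case: a file whose only
+	 * block is 6K, read into two 4K pages, exhausts the buffers
+	 * (di == numbufs) while the second page is only half filled
+	 * (pgoff != 0); the bzero() below supplies the remaining zeros.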
+ */ + if (mi == count) { + ASSERT(di >= numbufs - 1); + IMPLY(*rahead != 0, di == numbufs - 1); + IMPLY(*rahead != 0, bufoff != 0); + ASSERT(pgoff == 0); + } + if (di == numbufs) { + ASSERT(mi >= count - 1); + ASSERT(*rahead == 0); + IMPLY(pgoff == 0, mi == count); + if (pgoff != 0) { + ASSERT(mi == count - 1); + ASSERT((dbp[0]->db_size & PAGE_MASK) != 0); + } + } +#endif + if (pgoff != 0) { + ASSERT(m != bogus_page); + bzero(va + pgoff, PAGESIZE - pgoff); + zfs_unmap_page(sf); + vm_page_valid(m); + } + + for (i = 0; i < *rahead; i++) { + m = vm_page_grab_unlocked(vmobj, ma[count - 1]->pindex + 1 + i, + VM_ALLOC_NORMAL | VM_ALLOC_NOWAIT | VM_ALLOC_BUSY_FLAGS); + if (m == NULL) + break; + if (!vm_page_none_valid(m)) { + ASSERT3U(m->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(m); + break; + } + ASSERT(m->dirty == 0); + ASSERT(!pmap_page_is_mapped(m)); + + ASSERT(db->db_size > PAGE_SIZE); + bufoff = IDX_TO_OFF(m->pindex) % db->db_size; + tocpy = MIN(db->db_size - bufoff, PAGESIZE); + va = zfs_map_page(m, &sf); + bcopy((char *)db->db_data + bufoff, va, tocpy); + if (tocpy < PAGESIZE) { + ASSERT(i == *rahead - 1); + ASSERT((db->db_size & PAGE_MASK) != 0); + bzero(va + tocpy, PAGESIZE - tocpy); + } + zfs_unmap_page(sf); + vm_page_valid(m); + dmu_page_lock(m); + if ((m->busy_lock & VPB_BIT_WAITERS) != 0) + vm_page_activate(m); + else + vm_page_deactivate(m); + dmu_page_unlock(m); + vm_page_do_sunbusy(m); + } + *rahead = i; + zfs_vmobject_wunlock_12(vmobj); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (0); +} diff --git a/module/os/freebsd/zfs/hkdf.c b/module/os/freebsd/zfs/hkdf.c new file mode 100644 index 000000000000..8324ff2319b6 --- /dev/null +++ b/module/os/freebsd/zfs/hkdf.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include +#include +#include +#include + +static int +hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material, + uint_t km_len, uint8_t *out_buf) +{ + crypto_key_t key; + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(salt_len); + key.ck_data = salt; + + crypto_mac(&key, key_material, km_len, out_buf, SHA512_DIGEST_LENGTH); + + return (0); +} + +static int +hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len, + uint8_t *out_buf, uint_t out_len) +{ + struct hmac_ctx ctx; + crypto_key_t key; + uint_t i, T_len = 0, pos = 0; + uint8_t c; + uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH; + uint8_t T[SHA512_DIGEST_LENGTH]; + + if (N > 255) + return (SET_ERROR(EINVAL)); + + /* initialize the salt as a crypto key */ + key.ck_format = CRYPTO_KEY_RAW; + key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH); + key.ck_data = extract_key; + + for (i = 1; i <= N; i++) { + c = i; + + crypto_mac_init(&ctx, &key); + crypto_mac_update(&ctx, T, T_len); + crypto_mac_update(&ctx, info, info_len); + crypto_mac_update(&ctx, &c, 1); + crypto_mac_final(&ctx, T, SHA512_DIGEST_LENGTH); + bcopy(T, out_buf + pos, + (i != N) ? 
SHA512_DIGEST_LENGTH : (out_len - pos)); + pos += SHA512_DIGEST_LENGTH; + } + + return (0); +} + +/* + * HKDF is designed to be a relatively fast function for deriving keys from a + * master key + a salt. We use this function to generate new encryption keys + * so as to avoid hitting the cryptographic limits of the underlying + * encryption modes. Note that, for the sake of deriving encryption keys, the + * info parameter is called the "salt" everywhere else in the code. + */ +int +hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt, + uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key, + uint_t out_len) +{ + int ret; + uint8_t extract_key[SHA512_DIGEST_LENGTH]; + + ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len, + extract_key); + if (ret != 0) + return (ret); + + ret = hkdf_sha512_expand(extract_key, info, info_len, output_key, + out_len); + if (ret != 0) + return (ret); + + return (0); +} diff --git a/module/os/freebsd/zfs/kmod_core.c b/module/os/freebsd/zfs/kmod_core.c new file mode 100644 index 000000000000..dce73577eacd --- /dev/null +++ b/module/os/freebsd/zfs/kmod_core.c @@ -0,0 +1,381 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_comutil.h" + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_DECL(_vfs_zfs_vdev); + + +static int zfs_version_ioctl = ZFS_IOCVER_OZFS; +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, ioctl, CTLFLAG_RD, &zfs_version_ioctl, + 0, "ZFS_IOCTL_VERSION"); + +static struct cdev *zfsdev; + +static struct root_hold_token *zfs_root_token; + +extern uint_t rrw_tsd_key; +extern uint_t zfs_allow_log_key; +extern uint_t zfs_geom_probe_vdev_key; + +static int zfs__init(void); +static int zfs__fini(void); +static void zfs_shutdown(void *, int); + +static eventhandler_tag zfs_shutdown_event_tag; +extern zfsdev_state_t *zfsdev_state_list; + +#define ZFS_MIN_KSTACK_PAGES 4 + + +static int +zfsdev_ioctl(struct cdev *dev, ulong_t zcmd, caddr_t arg, int flag, + struct thread *td) +{ + uint_t len; + int vecnum; + zfs_iocparm_t *zp; + zfs_cmd_t *zc; + zfs_cmd_legacy_t *zcl; + int rc, error; + void *uaddr; + + len = IOCPARM_LEN(zcmd); + vecnum = zcmd & 0xff; + zp = (void *)arg; + uaddr = (void *)zp->zfs_cmd; + error = 0; + zcl = NULL; + + if (len != sizeof (zfs_iocparm_t)) { + printf("len %d vecnum: %d sizeof (zfs_cmd_t) %ju\n", + len, vecnum, (uintmax_t)sizeof (zfs_cmd_t)); + return (EINVAL); + } + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + /* + * Remap ioctl code for legacy user binaries + */ + if (zp->zfs_ioctl_version == ZFS_IOCVER_LEGACY) { + vecnum = zfs_ioctl_legacy_to_ozfs(vecnum); + if (vecnum < 0) { + kmem_free(zc, sizeof (zfs_cmd_t)); + return (ENOTSUP); + } + zcl = kmem_zalloc(sizeof (zfs_cmd_legacy_t), KM_SLEEP); + if (copyin(uaddr, zcl, sizeof (zfs_cmd_legacy_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + zfs_cmd_legacy_to_ozfs(zcl, zc); + } else if (copyin(uaddr, zc, sizeof (zfs_cmd_t))) { + error = SET_ERROR(EFAULT); + goto out; + } + error = zfsdev_ioctl_common(vecnum, zc, 0); + if (zcl) { + zfs_cmd_ozfs_to_legacy(zc, zcl); + rc = copyout(zcl, uaddr, sizeof (*zcl)); + } else { + rc = copyout(zc, uaddr, sizeof (*zc)); + } + if (error == 0 && rc != 0) + error = SET_ERROR(EFAULT); +out: + if (zcl) + kmem_free(zcl, sizeof (zfs_cmd_legacy_t)); + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); +} + +static void +zfsdev_close(void *data) +{ + zfsdev_state_t *zs, *zsp = data; + + mutex_enter(&zfsdev_state_lock); + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs == zsp) + break; + } + if (zs == NULL || zs->zs_minor <= 0) { + mutex_exit(&zfsdev_state_lock); + return; + } + zs->zs_minor = -1; + zfs_onexit_destroy(zs->zs_onexit); + zfs_zevent_destroy(zs->zs_zevent); + mutex_exit(&zfsdev_state_lock); + zs->zs_onexit = NULL; + zs->zs_zevent = NULL; +} + +static int +zfs_ctldev_init(struct cdev *devp) +{ + boolean_t newzs = B_FALSE; + minor_t minor; + zfsdev_state_t *zs, *zsprev = NULL; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + 
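+
+	/*
+	 * The list below is scanned for a retired slot (zs_minor == -1)
+	 * and only ever grows at the tail; the wmb() calls order the
+	 * stores so that lock-free readers never observe a live
+	 * zs_minor on a partially initialized entry.
+	 */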
minor = zfsdev_minor_alloc();
+	if (minor == 0)
+		return (SET_ERROR(ENXIO));
+
+	for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) {
+		if (zs->zs_minor == -1)
+			break;
+		zsprev = zs;
+	}
+
+	if (!zs) {
+		zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP);
+		newzs = B_TRUE;
+	}
+
+	devfs_set_cdevpriv(zs, zfsdev_close);
+
+	zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit);
+	zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent);
+
+	if (newzs) {
+		zs->zs_minor = minor;
+		wmb();
+		zsprev->zs_next = zs;
+	} else {
+		wmb();
+		zs->zs_minor = minor;
+	}
+	return (0);
+}
+
+static int
+zfsdev_open(struct cdev *devp, int flag, int mode, struct thread *td)
+{
+	int error;
+
+	mutex_enter(&zfsdev_state_lock);
+	error = zfs_ctldev_init(devp);
+	mutex_exit(&zfsdev_state_lock);
+
+	return (error);
+}
+
+static struct cdevsw zfs_cdevsw = {
+	.d_version = D_VERSION,
+	.d_open = zfsdev_open,
+	.d_ioctl = zfsdev_ioctl,
+	.d_name = ZFS_DRIVER
+};
+
+int
+zfsdev_attach(void)
+{
+	zfsdev = make_dev(&zfs_cdevsw, 0x0, UID_ROOT, GID_OPERATOR, 0666,
+	    ZFS_DRIVER);
+	return (0);
+}
+
+void
+zfsdev_detach(void)
+{
+	if (zfsdev != NULL)
+		destroy_dev(zfsdev);
+}
+
+int
+zfs__init(void)
+{
+	int error;
+
+#if KSTACK_PAGES < ZFS_MIN_KSTACK_PAGES
+	printf("ZFS NOTICE: KSTACK_PAGES is %d which could result in stack "
+	    "overflow panic!\nPlease consider adding "
+	    "'options KSTACK_PAGES=%d' to your kernel config\n", KSTACK_PAGES,
+	    ZFS_MIN_KSTACK_PAGES);
+#endif
+	zfs_root_token = root_mount_hold("ZFS");
+	if ((error = zfs_kmod_init()) != 0) {
+		printf("ZFS: Failed to Load ZFS Filesystem"
+		    ", rc = %d\n", error);
+		root_mount_rel(zfs_root_token);
+		return (error);
+	}
+
+	tsd_create(&zfs_geom_probe_vdev_key, NULL);
+
+	printf("ZFS storage pool version: features support ("
+	    SPA_VERSION_STRING ")\n");
+	root_mount_rel(zfs_root_token);
+	ddi_sysevent_init();
+	return (0);
+}
+
+int
+zfs__fini(void)
+{
+	if (zfs_busy() || zvol_busy() ||
+	    zio_injection_enabled) {
+		return (EBUSY);
+	}
+	zfs_kmod_fini();
+	tsd_destroy(&zfs_geom_probe_vdev_key);
+	return (0);
+}
+
+static void
+zfs_shutdown(void *arg __unused, int howto __unused)
+{
+
+	/*
+	 * ZFS fini routines cannot work properly in a panicked system.
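+	 * Locks may already be held and no further I/O can complete
+	 * after a panic, so the teardown is simply skipped.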
+ */ + if (panicstr == NULL) + zfs__fini(); +} + + +static int +zfs_modevent(module_t mod, int type, void *unused __unused) +{ + int err; + + switch (type) { + case MOD_LOAD: + err = zfs__init(); + if (err == 0) + zfs_shutdown_event_tag = EVENTHANDLER_REGISTER( + shutdown_post_sync, zfs_shutdown, NULL, + SHUTDOWN_PRI_FIRST); + return (err); + case MOD_UNLOAD: + err = zfs__fini(); + if (err == 0 && zfs_shutdown_event_tag != NULL) + EVENTHANDLER_DEREGISTER(shutdown_post_sync, + zfs_shutdown_event_tag); + return (err); + case MOD_SHUTDOWN: + return (0); + default: + break; + } + return (EOPNOTSUPP); +} + +static moduledata_t zfs_mod = { + "zfsctrl", + zfs_modevent, + 0 +}; + +#ifdef _KERNEL +EVENTHANDLER_DEFINE(mountroot, spa_boot_init, NULL, 0); +#endif + +DECLARE_MODULE(zfsctrl, zfs_mod, SI_SUB_CLOCKS, SI_ORDER_ANY); +MODULE_VERSION(zfsctrl, 1); +#if __FreeBSD_version > 1300092 +MODULE_DEPEND(zfsctrl, xdr, 1, 1, 1); +#else +MODULE_DEPEND(zfsctrl, krpc, 1, 1, 1); +#endif +MODULE_DEPEND(zfsctrl, acl_nfs4, 1, 1, 1); +MODULE_DEPEND(zfsctrl, crypto, 1, 1, 1); +MODULE_DEPEND(zfsctrl, cryptodev, 1, 1, 1); diff --git a/module/os/freebsd/zfs/spa_os.c b/module/os/freebsd/zfs/spa_os.c new file mode 100644 index 000000000000..2bc78cb451e8 --- /dev/null +++ b/module/os/freebsd/zfs/spa_os.c @@ -0,0 +1,281 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska . All rights reserved. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +static nvlist_t * +spa_generate_rootconf(const char *name) +{ + nvlist_t **configs, **tops; + nvlist_t *config; + nvlist_t *best_cfg, *nvtop, *nvroot; + uint64_t *holes; + uint64_t best_txg; + uint64_t nchildren; + uint64_t pgid; + uint64_t count; + uint64_t i; + uint_t nholes; + + if (vdev_geom_read_pool_label(name, &configs, &count) != 0) + return (NULL); + + ASSERT3U(count, !=, 0); + best_txg = 0; + for (i = 0; i < count; i++) { + uint64_t txg; + + VERIFY(nvlist_lookup_uint64(configs[i], ZPOOL_CONFIG_POOL_TXG, + &txg) == 0); + if (txg > best_txg) { + best_txg = txg; + best_cfg = configs[i]; + } + } + + nchildren = 1; + nvlist_lookup_uint64(best_cfg, ZPOOL_CONFIG_VDEV_CHILDREN, &nchildren); + holes = NULL; + nvlist_lookup_uint64_array(best_cfg, ZPOOL_CONFIG_HOLE_ARRAY, + &holes, &nholes); + + tops = kmem_zalloc(nchildren * sizeof (void *), KM_SLEEP); + for (i = 0; i < nchildren; i++) { + if (i >= count) + break; + if (configs[i] == NULL) + continue; + VERIFY(nvlist_lookup_nvlist(configs[i], ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + nvlist_dup(nvtop, &tops[i], KM_SLEEP); + } + for (i = 0; holes != NULL && i < nholes; i++) { + if (i >= nchildren) + continue; + if (tops[holes[i]] != NULL) + continue; + nvlist_alloc(&tops[holes[i]], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[holes[i]], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_ID, + holes[i]) == 0); + VERIFY(nvlist_add_uint64(tops[holes[i]], ZPOOL_CONFIG_GUID, + 0) == 0); + } + for (i = 0; i < nchildren; i++) { + if (tops[i] != NULL) + continue; + nvlist_alloc(&tops[i], NV_UNIQUE_NAME, KM_SLEEP); + VERIFY(nvlist_add_string(tops[i], ZPOOL_CONFIG_TYPE, + VDEV_TYPE_MISSING) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_ID, + i) == 0); + VERIFY(nvlist_add_uint64(tops[i], ZPOOL_CONFIG_GUID, + 0) == 0); + } + + /* + * Create pool config based on the best vdev config. + */ + nvlist_dup(best_cfg, &config, KM_SLEEP); + + /* + * Put this pool's top-level vdevs into a root vdev. + */ + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + tops, nchildren) == 0); + + /* + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + + /* + * Drop vdev config elements that should not be present at pool level. 
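+ * (The ZPOOL_CONFIG_GUID and ZPOOL_CONFIG_TOP_GUID pairs removed
+ * below are per-vdev values inherited from the duplicated label.)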
+ */ + nvlist_remove(config, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64); + nvlist_remove(config, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64); + + for (i = 0; i < count; i++) + nvlist_free(configs[i]); + kmem_free(configs, count * sizeof (void *)); + for (i = 0; i < nchildren; i++) + nvlist_free(tops[i]); + kmem_free(tops, nchildren * sizeof (void *)); + nvlist_free(nvroot); + return (config); +} + +int +spa_import_rootpool(const char *name, bool checkpointrewind) +{ + spa_t *spa; + vdev_t *rvd; + nvlist_t *config, *nvtop; + uint64_t txg; + char *pname; + int error; + + /* + * Read the label from the boot device and generate a configuration. + */ + config = spa_generate_rootconf(name); + + mutex_enter(&spa_namespace_lock); + if (config != NULL) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0 && strcmp(name, pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) + == 0); + + if ((spa = spa_lookup(pname)) != NULL) { + /* + * The pool could already be imported, + * e.g., after reboot -r. + */ + if (spa->spa_state == POOL_STATE_ACTIVE) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + return (0); + } + + /* + * Remove the existing root pool from the namespace so + * that we can replace it with the correct config + * we just read in. + */ + spa_remove(spa); + } + spa = spa_add(pname, config, NULL); + + /* + * Set spa_ubsync.ub_version as it can be used in vdev_alloc() + * via spa_version(). + */ + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + } else if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Cannot find the pool label for '%s'", + name); + return (EIO); + } else { + VERIFY(nvlist_dup(spa->spa_config, &config, KM_SLEEP) == 0); + } + spa->spa_is_root = B_TRUE; + spa->spa_import_flags = ZFS_IMPORT_VERBATIM; + if (checkpointrewind) { + spa->spa_import_flags |= ZFS_IMPORT_CHECKPOINT; + } + + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (0); +} + +const char * +spa_history_zone(void) +{ + return ("freebsd"); +} diff --git a/module/os/freebsd/zfs/spa_stats.c b/module/os/freebsd/zfs/spa_stats.c new file mode 100644 index 000000000000..45c880ada24d --- /dev/null +++ b/module/os/freebsd/zfs/spa_stats.c @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include + +void +spa_stats_init(spa_t *spa) +{ + +} + +void +spa_stats_destroy(spa_t *spa) +{ + +} + +void +spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed) +{ +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) +{ +} + +void +spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) +{ + +} +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + return (0); +} + +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + return (NULL); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + +} + +void +spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs) +{ + +} + +void +spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp, + uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id, + int error) +{ + +} + +int +spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error, + hrtime_t duration) +{ + return (0); +} + +int +spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id) +{ + return (0); +} diff --git a/module/os/freebsd/zfs/sysctl_os.c b/module/os/freebsd/zfs/sysctl_os.c new file mode 100644 index 000000000000..200bbf43d757 --- /dev/null +++ b/module/os/freebsd/zfs/sysctl_os.c @@ -0,0 +1,693 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include + + +/* BEGIN CSTYLED */ +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, arc, CTLFLAG_RW, 0, "ZFS adaptive replacement cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, condense, CTLFLAG_RW, 0, "ZFS condense"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dbuf_cache, CTLFLAG_RW, 0, "ZFS disk buf cache"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, deadman, CTLFLAG_RW, 0, "ZFS deadman"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, dedup, CTLFLAG_RW, 0, "ZFS dedup"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, l2arc, CTLFLAG_RW, 0, "ZFS l2arc"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, livelist, CTLFLAG_RW, 0, "ZFS livelist"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, lua, CTLFLAG_RW, 0, "ZFS lua"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, metaslab, CTLFLAG_RW, 0, "ZFS metaslab"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, mg, CTLFLAG_RW, 0, "ZFS metaslab group"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, multihost, CTLFLAG_RW, 0, "ZFS multihost protection"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, prefetch, CTLFLAG_RW, 0, "ZFS prefetch"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, reconstruct, CTLFLAG_RW, 0, "ZFS reconstruct"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, recv, CTLFLAG_RW, 0, "ZFS receive"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, send, CTLFLAG_RW, 0, "ZFS send"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, spa, CTLFLAG_RW, 0, "ZFS space allocation"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, trim, CTLFLAG_RW, 0, "ZFS TRIM"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, txg, CTLFLAG_RW, 0, "ZFS transaction group"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zevent, CTLFLAG_RW, 0, "ZFS event"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zil, CTLFLAG_RW, 0, "ZFS ZIL"); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zio, CTLFLAG_RW, 0, "ZFS ZIO"); + +SYSCTL_NODE(_vfs_zfs_livelist, OID_AUTO, condense, CTLFLAG_RW, 0, + "ZFS livelist condense"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, cache, CTLFLAG_RW, 0, "ZFS VDEV Cache"); +SYSCTL_NODE(_vfs_zfs_vdev, OID_AUTO, mirror, CTLFLAG_RD, 0, + "ZFS VDEV mirror"); + +SYSCTL_DECL(_vfs_zfs_version); +SYSCTL_CONST_STRING(_vfs_zfs_version, OID_AUTO, module, CTLFLAG_RD, + (ZFS_META_VERSION "-" ZFS_META_RELEASE), "OpenZFS module version"); + +extern arc_state_t ARC_anon; +extern arc_state_t ARC_mru; +extern arc_state_t ARC_mru_ghost; +extern arc_state_t ARC_mfu; +extern arc_state_t ARC_mfu_ghost; +extern arc_state_t 
ARC_l2c_only; + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ + +/* arc.c */ + +/* legacy compat */ +extern uint64_t l2arc_write_max; /* def max write size */ +extern uint64_t l2arc_write_boost; /* extra warmup write */ +extern uint64_t l2arc_headroom; /* # of dev writes */ +extern uint64_t l2arc_headroom_boost; +extern uint64_t l2arc_feed_secs; /* interval seconds */ +extern uint64_t l2arc_feed_min_ms; /* min interval msecs */ +extern int l2arc_noprefetch; /* don't cache prefetch bufs */ +extern int l2arc_feed_again; /* turbo warmup */ +extern int l2arc_norw; /* no reads during writes */ + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_max, CTLFLAG_RW, + &l2arc_write_max, 0, "max write size (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_write_boost, CTLFLAG_RW, + &l2arc_write_boost, 0, "extra write during warmup (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_headroom, CTLFLAG_RW, + &l2arc_headroom, 0, "number of dev writes (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_secs, CTLFLAG_RW, + &l2arc_feed_secs, 0, "interval seconds (LEGACY)"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2arc_feed_min_ms, CTLFLAG_RW, + &l2arc_feed_min_ms, 0, "min interval milliseconds (LEGACY)"); + +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_noprefetch, CTLFLAG_RW, + &l2arc_noprefetch, 0, "don't cache prefetch bufs (LEGACY)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_feed_again, CTLFLAG_RW, + &l2arc_feed_again, 0, "turbo warmup (LEGACY)"); +SYSCTL_INT(_vfs_zfs, OID_AUTO, l2arc_norw, CTLFLAG_RW, + &l2arc_norw, 0, "no reads during writes (LEGACY)"); +#if 0 +extern int zfs_compressed_arc_enabled; +SYSCTL_INT(_vfs_zfs, OID_AUTO, compressed_arc_enabled, CTLFLAG_RW, + &zfs_compressed_arc_enabled, 1, "compressed arc buffers (LEGACY)"); +#endif + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_size, CTLFLAG_RD, + &ARC_anon.arcs_size.rc_count, 0, "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_metadata_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of anonymous state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, anon_data_esize, CTLFLAG_RD, + &ARC_anon.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of anonymous state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_size, CTLFLAG_RD, + &ARC_mru.arcs_size.rc_count, 0, "size of mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_metadata_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_data_esize, CTLFLAG_RD, + &ARC_mru.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_size, CTLFLAG_RD, + &ARC_mru_ghost.arcs_size.rc_count, 0, "size of mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mru ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mru_ghost_data_esize, CTLFLAG_RD, + &ARC_mru_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mru ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_size, CTLFLAG_RD, + &ARC_mfu.arcs_size.rc_count, 0, "size of mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_metadata_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_data_esize, CTLFLAG_RD, + &ARC_mfu.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, 
mfu_ghost_size, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_size.rc_count, 0, "size of mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_metadata_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_METADATA].rc_count, 0, + "size of metadata in mfu ghost state"); +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, mfu_ghost_data_esize, CTLFLAG_RD, + &ARC_mfu_ghost.arcs_esize[ARC_BUFC_DATA].rc_count, 0, + "size of data in mfu ghost state"); + +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, l2c_only_size, CTLFLAG_RD, + &ARC_l2c_only.arcs_size.rc_count, 0, "size of mru state"); + +static int +sysctl_vfs_zfs_arc_no_grow_shift(SYSCTL_HANDLER_ARGS) +{ + uint32_t val; + int err; + + val = arc_no_grow_shift; + err = sysctl_handle_32(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val >= arc_shrink_shift) + return (EINVAL); + + arc_no_grow_shift = val; + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_no_grow_shift, CTLTYPE_U32 | CTLFLAG_RWTUN, + 0, sizeof (uint32_t), sysctl_vfs_zfs_arc_no_grow_shift, "U", + "log2(fraction of ARC which must be free to allow growing)"); + +int +param_set_arc_long(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_arc_int(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_int(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + arc_tuning_update(B_TRUE); + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_min, CTLTYPE_ULONG | CTLFLAG_RWTUN, + &zfs_arc_min, sizeof (zfs_arc_min), param_set_arc_long, "LU", + "min arc size (LEGACY)"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, arc_max, CTLTYPE_ULONG | CTLFLAG_RWTUN, + &zfs_arc_max, sizeof (zfs_arc_max), param_set_arc_long, "LU", + "max arc size (LEGACY)"); + +/* dbuf.c */ + + +/* dmu.c */ + +/* dmu_zfetch.c */ +SYSCTL_NODE(_vfs_zfs, OID_AUTO, zfetch, CTLFLAG_RW, 0, "ZFS ZFETCH (LEGACY)"); + +/* max bytes to prefetch per stream (default 8MB) */ +extern uint32_t zfetch_max_distance; +SYSCTL_UINT(_vfs_zfs_zfetch, OID_AUTO, max_distance, CTLFLAG_RWTUN, + &zfetch_max_distance, 0, "Max bytes to prefetch per stream (LEGACY)"); + +/* max bytes to prefetch indirects for per stream (default 64MB) */ +extern uint32_t zfetch_max_idistance; +SYSCTL_UINT(_vfs_zfs_prefetch, OID_AUTO, max_idistance, CTLFLAG_RWTUN, + &zfetch_max_idistance, 0, "Max bytes to prefetch indirects for per stream"); + +/* dsl_pool.c */ + +/* dnode.c */ +extern int zfs_default_bs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_bs, CTLFLAG_RWTUN, + &zfs_default_bs, 0, "Default dnode block shift"); + +extern int zfs_default_ibs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, default_ibs, CTLFLAG_RWTUN, + &zfs_default_ibs, 0, "Default dnode indirect block shift"); + + +/* dsl_scan.c */ + +/* metaslab.c */ + +/* + * In pools where the log space map feature is not enabled we touch + * multiple metaslabs (and their respective space maps) with each + * transaction group. Thus, we benefit from having a small space map + * block size since it allows us to issue more I/O operations scattered + * around the disk. So a sane default for the space map block size + * is 8~16K. + */ +extern int zfs_metaslab_sm_blksz_no_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_no_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_no_log, 0, + "Block size for space map in pools with log space map disabled. 
" + "Power of 2 and greater than 4096."); + +/* + * When the log space map feature is enabled, we accumulate a lot of + * changes per metaslab that are flushed once in a while so we benefit + * from a bigger block size like 128K for the metaslab space maps. + */ +extern int zfs_metaslab_sm_blksz_with_log; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, sm_blksz_with_log, CTLFLAG_RDTUN, + &zfs_metaslab_sm_blksz_with_log, 0, + "Block size for space map in pools with log space map enabled. " + "Power of 2 and greater than 4096."); + +/* + * The in-core space map representation is more compact than its on-disk form. + * The zfs_condense_pct determines how much more compact the in-core + * space map representation must be before we compact it on-disk. + * Values should be greater than or equal to 100. + */ +extern int zfs_condense_pct; +SYSCTL_INT(_vfs_zfs, OID_AUTO, condense_pct, CTLFLAG_RWTUN, + &zfs_condense_pct, 0, + "Condense on-disk spacemap when it is more than this many percents" + " of in-memory counterpart"); + +extern int zfs_remove_max_segment; +SYSCTL_INT(_vfs_zfs, OID_AUTO, remove_max_segment, CTLFLAG_RWTUN, + &zfs_remove_max_segment, 0, "Largest contiguous segment ZFS will attempt to" + " allocate when removing a device"); + +extern int zfs_removal_suspend_progress; +SYSCTL_INT(_vfs_zfs, OID_AUTO, removal_suspend_progress, CTLFLAG_RWTUN, + &zfs_removal_suspend_progress, 0, "Ensures certain actions can happen while" + " in the middle of a removal"); + + +/* + * Minimum size which forces the dynamic allocator to change + * it's allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using more + * aggressive strategy (i.e search by size rather than offset). + */ +extern uint64_t metaslab_df_alloc_threshold; +SYSCTL_QUAD(_vfs_zfs_metaslab, OID_AUTO, df_alloc_threshold, CTLFLAG_RWTUN, + &metaslab_df_alloc_threshold, 0, + "Minimum size which forces the dynamic allocator to change it's allocation strategy"); + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space map's free space drops below this level we dynamically + * switch to using best-fit allocations. + */ +extern int metaslab_df_free_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, df_free_pct, CTLFLAG_RWTUN, + &metaslab_df_free_pct, 0, + "The minimum free space, in percent, which must be available in a " + "space map to continue allocations in a first-fit fashion"); + +/* + * Percentage of all cpus that can be used by the metaslab taskq. + */ +extern int metaslab_load_pct; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, load_pct, CTLFLAG_RWTUN, + &metaslab_load_pct, 0, + "Percentage of cpus that can be used by the metaslab taskq"); + +/* + * Max number of metaslabs per group to preload. 
+ */ +extern int metaslab_preload_limit; +SYSCTL_INT(_vfs_zfs_metaslab, OID_AUTO, preload_limit, CTLFLAG_RWTUN, + &metaslab_preload_limit, 0, + "Max number of metaslabs per group to preload"); + +/* refcount.c */ +extern int reference_tracking_enable; +SYSCTL_INT(_vfs_zfs, OID_AUTO, reference_tracking_enable, CTLFLAG_RDTUN, + &reference_tracking_enable, 0, + "Track reference holders to refcount_t objects, used mostly by ZFS"); + +/* spa.c */ +extern int zfs_ccw_retry_interval; +SYSCTL_INT(_vfs_zfs, OID_AUTO, ccw_retry_interval, CTLFLAG_RWTUN, + &zfs_ccw_retry_interval, 0, + "Configuration cache file write, retry after failure, interval (seconds)"); + +extern uint64_t zfs_max_missing_tvds_cachefile; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_cachefile, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_cachefile, 0, + "allow importing pools with missing top-level vdevs in cache file"); + +extern uint64_t zfs_max_missing_tvds_scan; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, max_missing_tvds_scan, CTLFLAG_RWTUN, + &zfs_max_missing_tvds_scan, 0, + "allow importing pools with missing top-level vdevs during scan"); + +/* spa_misc.c */ +extern int zfs_flags; +static int +sysctl_vfs_zfs_debug_flags(SYSCTL_HANDLER_ARGS) +{ + int err, val; + + val = zfs_flags; + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + /* + * ZFS_DEBUG_MODIFY must be enabled prior to boot so all + * arc buffers in the system have the necessary additional + * checksum data. However, it is safe to disable at any + * time. + */ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + val &= ~ZFS_DEBUG_MODIFY; + zfs_flags = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, debugflags, + CTLTYPE_UINT | CTLFLAG_MPSAFE | CTLFLAG_RWTUN, NULL, 0, + sysctl_vfs_zfs_debug_flags, "IU", "Debug flags for ZFS testing."); + +int +param_set_deadman_synctime(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_deadman_synctime_ms; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + zfs_deadman_synctime_ms = val; + + spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +int +param_set_deadman_ziotime(SYSCTL_HANDLER_ARGS) +{ + unsigned long val; + int err; + + val = zfs_deadman_ziotime_ms; + err = sysctl_handle_long(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + zfs_deadman_ziotime_ms = val; + + spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +int +param_set_deadman_failmode(SYSCTL_HANDLER_ARGS) +{ + char buf[16]; + int rc; + + if (req->newptr == NULL) + strlcpy(buf, zfs_deadman_failmode, sizeof (buf)); + + rc = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (rc || req->newptr == NULL) + return (rc); + if (strcmp(buf, zfs_deadman_failmode) == 0) + return (0); + if (!strcmp(buf, "wait")) + zfs_deadman_failmode = "wait"; + if (!strcmp(buf, "continue")) + zfs_deadman_failmode = "continue"; + if (!strcmp(buf, "panic")) + zfs_deadman_failmode = "panic"; + + return (-param_set_deadman_failmode_common(buf)); +} + + +/* spacemap.c */ +extern int space_map_ibs; +SYSCTL_INT(_vfs_zfs, OID_AUTO, space_map_ibs, CTLFLAG_RWTUN, + &space_map_ibs, 0, "Space map indirect block shift"); + + +/* vdev.c */ +int +param_set_min_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_vdev_min_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val < ASHIFT_MIN || 
val > zfs_vdev_max_auto_ashift) + return (SET_ERROR(EINVAL)); + + zfs_vdev_min_auto_ashift = val; + + return (0); +} + +int +param_set_max_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_vdev_max_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (SET_ERROR(err)); + + if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) + return (SET_ERROR(EINVAL)); + + zfs_vdev_max_auto_ashift = val; + + return (0); +} + +SYSCTL_PROC(_vfs_zfs, OID_AUTO, min_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN, + &zfs_vdev_min_auto_ashift, sizeof (zfs_vdev_min_auto_ashift), + param_set_min_auto_ashift, "QU", + "Min ashift used when creating new top-level vdev. (LEGACY)"); +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, CTLTYPE_U64 | CTLFLAG_RWTUN, + &zfs_vdev_max_auto_ashift, sizeof (zfs_vdev_max_auto_ashift), + param_set_max_auto_ashift, "QU", + "Max ashift used when optimizing for logical -> physical sector size on " + "new top-level vdevs. (LEGACY)"); + +/* + * Since the DTL space map of a vdev is not expected to have a lot of + * entries, we default its block size to 4K. + */ +extern int zfs_vdev_dtl_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, dtl_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_dtl_sm_blksz, 0, + "Block size for DTL space map. Power of 2 and greater than 4096."); + +/* + * vdev-wide space maps that have lots of entries written to them at + * the end of each transaction can benefit from a higher I/O bandwidth + * (e.g. vdev_obsolete_sm), thus we default their block size to 128K. + */ +extern int zfs_vdev_standard_sm_blksz; +SYSCTL_INT(_vfs_zfs, OID_AUTO, standard_sm_blksz, CTLFLAG_RDTUN, + &zfs_vdev_standard_sm_blksz, 0, + "Block size for standard space map. Power of 2 and greater than 4096."); + +extern int vdev_validate_skip; +SYSCTL_INT(_vfs_zfs, OID_AUTO, validate_skip, CTLFLAG_RDTUN, + &vdev_validate_skip, 0, + "Enable to bypass vdev_validate()."); + + +/* vdev_cache.c */ + +/* vdev_mirror.c */ +/* + * The load configuration settings below are tuned by default for + * the case where all devices are of the same rotational type. + * + * If there is a mixture of rotating and non-rotating media, setting + * non_rotating_seek_inc to 0 may well provide better results as it + * will direct more reads to the non-rotating vdevs which are more + * likely to have a higher performance. + */ + + +/* vdev_queue.c */ +#define ZFS_VDEV_QUEUE_KNOB_MIN(name) \ +extern uint32_t zfs_vdev_ ## name ## _min_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _min_active, CTLFLAG_RWTUN,\ + &zfs_vdev_ ## name ## _min_active, 0, \ + "Initial number of I/O requests of type " #name \ + " active for each device"); + +#define ZFS_VDEV_QUEUE_KNOB_MAX(name) \ +extern uint32_t zfs_vdev_ ## name ## _max_active; \ +SYSCTL_UINT(_vfs_zfs_vdev, OID_AUTO, name ## _max_active, CTLFLAG_RWTUN, \ + &zfs_vdev_ ## name ## _max_active, 0, \ + "Maximum number of I/O requests of type " #name \ + " active for each device"); + + +#undef ZFS_VDEV_QUEUE_KNOB + +extern uint32_t zfs_vdev_max_active; +SYSCTL_UINT(_vfs_zfs, OID_AUTO, top_maxinflight, CTLFLAG_RWTUN, + &zfs_vdev_max_active, 0, + "The maximum number of I/Os of all types active for each device. 
(LEGACY)"); + +extern int zfs_vdev_def_queue_depth; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, def_queue_depth, CTLFLAG_RWTUN, + &zfs_vdev_def_queue_depth, 0, + "Default queue depth for each allocator"); + +/*extern uint64_t zfs_multihost_history; +SYSCTL_UQUAD(_vfs_zfs, OID_AUTO, multihost_history, CTLFLAG_RWTUN, + &zfs_multihost_history, 0, + "Historical staticists for the last N multihost updates");*/ + +#ifdef notyet +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, trim_on_init, CTLFLAG_RW, + &vdev_trim_on_init, 0, "Enable/disable full vdev trim on initialisation"); +#endif + + +/* zio.c */ +#if defined(__LP64__) +int zio_use_uma = 1; +#else +int zio_use_uma = 0; +#endif + +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, use_uma, CTLFLAG_RDTUN, &zio_use_uma, 0, + "Use uma(9) for ZIO allocations"); +SYSCTL_INT(_vfs_zfs_zio, OID_AUTO, exclude_metadata, CTLFLAG_RDTUN, &zio_exclude_metadata, 0, + "Exclude metadata buffers from dumps as well"); + +int +param_set_slop_shift(SYSCTL_HANDLER_ARGS) +{ + int val; + int err; + + val = *(int *)arg1; + + err = sysctl_handle_int(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val < 1 || val > 31) + return (EINVAL); + + *(int *)arg1 = val; + + return (0); +} + +int +param_set_multihost_interval(SYSCTL_HANDLER_ARGS) +{ + int err; + + err = sysctl_handle_long(oidp, arg1, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (spa_mode_global != SPA_MODE_UNINIT) + mmp_signal_all_threads(); + + return (0); +} diff --git a/module/os/freebsd/zfs/vdev_file.c b/module/os/freebsd/zfs/vdev_file.c new file mode 100644 index 000000000000..4d27751c8893 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_file.c @@ -0,0 +1,328 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Virtual device vector for files. 
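+ * A file vdev backs the pool with a regular file rather than a
+ * block device; it is intended mainly for testing.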
+ */ + +static taskq_t *vdev_file_taskq; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(max_ncpus, 16), + minclsyspri, max_ncpus, INT_MAX, 0); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int error; + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + + error = zfs_file_open(vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (zfs_file_getattr(fp, &zfa)) { + return (SET_ERROR(ENODEV)); + } + if (!S_ISREG(zfa.zfa_mode)) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (SET_ERROR(ENODEV)); + } +#endif + +skip_open: + + error = zfs_file_getattr(vf->vf_file, &zfa); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = zfa.zfa_size; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_file != NULL) { + zfs_file_close(vf->vf_file); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +/* + * Implements the interrupt side for file vdev types. This routine will be + * called when the I/O completes allowing us to transfer the I/O to the + * interrupt taskqs. 
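+ * (zio_delay_interrupt() also honors any injected delay recorded in
+ * io_target_timestamp before handing off the zio.)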
For consistency, the code structure mimics disk vdev + * types. + */ +static void +vdev_file_io_intr(zio_t *zio) +{ + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf; + void *buf; + ssize_t resid; + loff_t off; + ssize_t size; + int err; + + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + vf = vd->vdev_tsd; + + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + if (zio->io_type == ZIO_TYPE_READ) { + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); + abd_return_buf_copy(zio->io_abd, buf, size); + } else { + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + abd_return_buf(zio->io_abd, buf, size); + } + if (resid != 0 && zio->io_error == 0) + zio->io_error = ENOSPC; + + vdev_file_io_intr(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC|O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { +#ifdef notyet + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); + + /* XXX FreeBSD has no fallocate routine in file ops */ + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); +#endif + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); + return; + } + ASSERT(zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE); + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, 0); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_FILE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + vdev_file_open, + vdev_file_close, + vdev_default_asize, + vdev_file_io_start, + vdev_file_io_done, + NULL, + NULL, + vdev_file_hold, + vdev_file_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/freebsd/zfs/vdev_geom.c b/module/os/freebsd/zfs/vdev_geom.c new file mode 100644 index 000000000000..bf06f69192d9 --- /dev/null +++ b/module/os/freebsd/zfs/vdev_geom.c @@ -0,0 +1,1206 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2006 Pawel Jakub Dawidek + * All rights reserved. + * + * Portions Copyright (c) 2012 Martin Matuska + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef g_topology_locked +#define g_topology_locked() sx_xlocked(&topology_lock) +#endif + +/* + * Virtual device vector for GEOM. + */ + +static g_attrchanged_t vdev_geom_attrchanged; +struct g_class zfs_vdev_class = { + .name = "ZFS::VDEV", + .version = G_VERSION, + .attrchanged = vdev_geom_attrchanged, +}; + +struct consumer_vdev_elem { + SLIST_ENTRY(consumer_vdev_elem) elems; + vdev_t *vd; +}; + +SLIST_HEAD(consumer_priv_t, consumer_vdev_elem); +/* BEGIN CSTYLED */ +_Static_assert(sizeof (((struct g_consumer *)NULL)->private) + == sizeof (struct consumer_priv_t*), + "consumer_priv_t* can't be stored in g_consumer.private"); + +DECLARE_GEOM_CLASS(zfs_vdev_class, zfs_vdev); + +SYSCTL_DECL(_vfs_zfs_vdev); +/* Don't send BIO_FLUSH. */ +static int vdev_geom_bio_flush_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_flush_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_flush_disable, 0, "Disable BIO_FLUSH"); +/* Don't send BIO_DELETE. */ +static int vdev_geom_bio_delete_disable; +SYSCTL_INT(_vfs_zfs_vdev, OID_AUTO, bio_delete_disable, CTLFLAG_RWTUN, + &vdev_geom_bio_delete_disable, 0, "Disable BIO_DELETE"); +/* END CSTYLED */ + +/* Declare local functions */ +static void vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read); + +/* + * Thread local storage used to indicate when a thread is probing geoms + * for their guids. If NULL, this thread is not tasting geoms. If non NULL, + * it is looking for a replacement for the vdev_t* that is its value. + */ +uint_t zfs_geom_probe_vdev_key; + +static void +vdev_geom_set_physpath(vdev_t *vd, struct g_consumer *cp, + boolean_t do_null_update) +{ + boolean_t needs_update = B_FALSE; + char *physpath; + int error, physpath_len; + + physpath_len = MAXPATHLEN; + physpath = g_malloc(physpath_len, M_WAITOK|M_ZERO); + error = g_io_getattr("GEOM::physpath", cp, &physpath_len, physpath); + if (error == 0) { + char *old_physpath; + + /* g_topology lock ensures that vdev has not been closed */ + g_topology_assert(); + old_physpath = vd->vdev_physpath; + vd->vdev_physpath = spa_strdup(physpath); + + if (old_physpath != NULL) { + needs_update = (strcmp(old_physpath, + vd->vdev_physpath) != 0); + spa_strfree(old_physpath); + } else + needs_update = do_null_update; + } + g_free(physpath); + + /* + * If the physical path changed, update the config. + * Only request an update for previously unset physpaths if + * requested by the caller. 
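+ * (Callers control the latter case via do_null_update;
+ * vdev_geom_attrchanged(), for one, passes B_TRUE.)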
+ */ + if (needs_update) + spa_async_request(vd->vdev_spa, SPA_ASYNC_CONFIG_UPDATE); + +} + +static void +vdev_geom_attrchanged(struct g_consumer *cp, const char *attr) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vdev_t *vd = elem->vd; + if (strcmp(attr, "GEOM::physpath") == 0) { + vdev_geom_set_physpath(vd, cp, /* null_update */B_TRUE); + return; + } + } +} + +static void +vdev_geom_resize(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + spa_t *spa; + vdev_t *vd; + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + return; + + SLIST_FOREACH(elem, priv, elems) { + vd = elem->vd; + if (vd->vdev_state != VDEV_STATE_HEALTHY) + continue; + spa = vd->vdev_spa; + if (!spa->spa_autoexpand) + continue; + vdev_online(spa, vd->vdev_guid, ZFS_ONLINE_EXPAND, NULL); + } +} + +static void +vdev_geom_orphan(struct g_consumer *cp) +{ + struct consumer_priv_t *priv; + // cppcheck-suppress uninitvar + struct consumer_vdev_elem *elem; + + g_topology_assert(); + + priv = (struct consumer_priv_t *)&cp->private; + if (SLIST_EMPTY(priv)) + /* Vdev close in progress. Ignore the event. */ + return; + + /* + * Orphan callbacks occur from the GEOM event thread. + * Concurrent with this call, new I/O requests may be + * working their way through GEOM about to find out + * (only once executed by the g_down thread) that we've + * been orphaned from our disk provider. These I/Os + * must be retired before we can detach our consumer. + * This is most easily achieved by acquiring the + * SPA ZIO configuration lock as a writer, but doing + * so with the GEOM topology lock held would cause + * a lock order reversal. Instead, rely on the SPA's + * async removal support to invoke a close on this + * vdev once it is safe to do so. + */ + // cppcheck-suppress All + SLIST_FOREACH(elem, priv, elems) { + // cppcheck-suppress uninitvar + vdev_t *vd = elem->vd; + + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(vd->vdev_spa, SPA_ASYNC_REMOVE); + } +} + +static struct g_consumer * +vdev_geom_attach(struct g_provider *pp, vdev_t *vd, boolean_t sanity) +{ + struct g_geom *gp; + struct g_consumer *cp; + int error; + + g_topology_assert(); + + ZFS_LOG(1, "Attaching to %s.", pp->name); + + if (sanity) { + if (pp->sectorsize > VDEV_PAD_SIZE || !ISP2(pp->sectorsize)) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible sectorsize %d\n", + pp->name, pp->sectorsize); + return (NULL); + } else if (pp->mediasize < SPA_MINDEVSIZE) { + ZFS_LOG(1, "Failing attach of %s. " + "Incompatible mediasize %ju\n", + pp->name, pp->mediasize); + return (NULL); + } + } + + /* Do we have geom already? No? Create one. 
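+ * A single "zfs::vdev" geom is shared by all vdev consumers: it is
+ * created on the first attach and withered in vdev_geom_detach()
+ * once its last consumer goes away.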
*/ + LIST_FOREACH(gp, &zfs_vdev_class.geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + if (strcmp(gp->name, "zfs::vdev") != 0) + continue; + break; + } + if (gp == NULL) { + gp = g_new_geomf(&zfs_vdev_class, "zfs::vdev"); + gp->orphan = vdev_geom_orphan; + gp->attrchanged = vdev_geom_attrchanged; + gp->resize = vdev_geom_resize; + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", __func__, + __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created geom and consumer for %s.", pp->name); + } else { + /* Check if we are already connected to this provider. */ + LIST_FOREACH(cp, &gp->consumer, consumer) { + if (cp->provider == pp) { + ZFS_LOG(1, "Found consumer for %s.", pp->name); + break; + } + } + if (cp == NULL) { + cp = g_new_consumer(gp); + error = g_attach(cp, pp); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_attach failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + vdev_geom_detach(cp, B_FALSE); + return (NULL); + } + ZFS_LOG(1, "Created consumer for %s.", pp->name); + } else { + error = g_access(cp, 1, 0, 1); + if (error != 0) { + ZFS_LOG(1, "%s(%d): g_access failed: %d\n", + __func__, __LINE__, error); + return (NULL); + } + ZFS_LOG(1, "Used existing consumer for %s.", pp->name); + } + } + + if (vd != NULL) + vd->vdev_tsd = cp; + + cp->flags |= G_CF_DIRECT_SEND | G_CF_DIRECT_RECEIVE; + return (cp); +} + +static void +vdev_geom_detach(struct g_consumer *cp, boolean_t open_for_read) +{ + struct g_geom *gp; + + g_topology_assert(); + + ZFS_LOG(1, "Detaching from %s.", + cp->provider && cp->provider->name ? cp->provider->name : "NULL"); + + gp = cp->geom; + if (open_for_read) + g_access(cp, -1, 0, -1); + /* Destroy consumer on last close. */ + if (cp->acr == 0 && cp->ace == 0) { + if (cp->acw > 0) + g_access(cp, 0, -cp->acw, 0); + if (cp->provider != NULL) { + ZFS_LOG(1, "Destroying consumer for %s.", + cp->provider->name ? cp->provider->name : "NULL"); + g_detach(cp); + } + g_destroy_consumer(cp); + } + /* Destroy geom if there are no consumers left. */ + if (LIST_EMPTY(&gp->consumer)) { + ZFS_LOG(1, "Destroyed geom %s.", gp->name); + g_wither_geom(gp, ENXIO); + } +} + +static void +vdev_geom_close_locked(vdev_t *vd) +{ + struct g_consumer *cp; + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem, *elem_temp; + + g_topology_assert(); + + cp = vd->vdev_tsd; + vd->vdev_delayed_close = B_FALSE; + if (cp == NULL) + return; + + ZFS_LOG(1, "Closing access to %s.", cp->provider->name); + KASSERT(cp->private != NULL, ("%s: cp->private is NULL", __func__)); + priv = (struct consumer_priv_t *)&cp->private; + vd->vdev_tsd = NULL; + SLIST_FOREACH_SAFE(elem, priv, elems, elem_temp) { + if (elem->vd == vd) { + SLIST_REMOVE(priv, elem, consumer_vdev_elem, elems); + g_free(elem); + } + } + + vdev_geom_detach(cp, B_TRUE); +} + +/* + * Issue one or more bios to the vdev in parallel + * cmds, datas, offsets, errors, and sizes are arrays of length ncmds. Each IO + * operation is described by parallel entries from each array. 
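+ * Requests larger than MAXPHYS are split into sector-aligned chunks
+ * of at most MAXPHYS bytes.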
There may be + * more bios actually issued than entries in the array + */ +static void +vdev_geom_io(struct g_consumer *cp, int *cmds, void **datas, off_t *offsets, + off_t *sizes, int *errors, int ncmds) +{ + struct bio **bios; + uint8_t *p; + off_t off, maxio, s, end; + int i, n_bios, j; + size_t bios_size; + + maxio = MAXPHYS - (MAXPHYS % cp->provider->sectorsize); + n_bios = 0; + + /* How many bios are required for all commands ? */ + for (i = 0; i < ncmds; i++) + n_bios += (sizes[i] + maxio - 1) / maxio; + + /* Allocate memory for the bios */ + bios_size = n_bios * sizeof (struct bio *); + bios = kmem_zalloc(bios_size, KM_SLEEP); + + /* Prepare and issue all of the bios */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + p = datas[i]; + s = sizes[i]; + end = off + s; + ASSERT((off % cp->provider->sectorsize) == 0); + ASSERT((s % cp->provider->sectorsize) == 0); + + for (; off < end; off += maxio, p += maxio, s -= maxio, j++) { + bios[j] = g_alloc_bio(); + bios[j]->bio_cmd = cmds[i]; + bios[j]->bio_done = NULL; + bios[j]->bio_offset = off; + bios[j]->bio_length = MIN(s, maxio); + bios[j]->bio_data = (caddr_t)p; + g_io_request(bios[j], cp); + } + } + ASSERT(j == n_bios); + + /* Wait for all of the bios to complete, and clean them up */ + for (i = j = 0; i < ncmds; i++) { + off = offsets[i]; + s = sizes[i]; + end = off + s; + + for (; off < end; off += maxio, s -= maxio, j++) { + errors[i] = biowait(bios[j], "vdev_geom_io") || + errors[i]; + g_destroy_bio(bios[j]); + } + } + kmem_free(bios, bios_size); +} + +/* + * Read the vdev config from a device. Return the number of valid labels that + * were found. The vdev config will be returned in config if and only if at + * least one valid label was found. + */ +static int +vdev_geom_read_config(struct g_consumer *cp, nvlist_t **configp) +{ + struct g_provider *pp; + nvlist_t *config; + vdev_phys_t *vdev_lists[VDEV_LABELS]; + char *buf; + size_t buflen; + uint64_t psize, state, txg; + off_t offsets[VDEV_LABELS]; + off_t size; + off_t sizes[VDEV_LABELS]; + int cmds[VDEV_LABELS]; + int errors[VDEV_LABELS]; + int l, nlabels; + + g_topology_assert_not(); + + pp = cp->provider; + ZFS_LOG(1, "Reading config from %s...", pp->name); + + psize = pp->mediasize; + psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); + + size = sizeof (*vdev_lists[0]) + pp->sectorsize - + ((sizeof (*vdev_lists[0]) - 1) % pp->sectorsize) - 1; + + buflen = sizeof (vdev_lists[0]->vp_nvlist); + + /* Create all of the IO requests */ + for (l = 0; l < VDEV_LABELS; l++) { + cmds[l] = BIO_READ; + vdev_lists[l] = kmem_alloc(size, KM_SLEEP); + offsets[l] = vdev_label_offset(psize, l, 0) + VDEV_SKIP_SIZE; + sizes[l] = size; + errors[l] = 0; + ASSERT(offsets[l] % pp->sectorsize == 0); + } + + /* Issue the IO requests */ + vdev_geom_io(cp, cmds, (void**)vdev_lists, offsets, sizes, errors, + VDEV_LABELS); + + /* Parse the labels */ + config = *configp = NULL; + nlabels = 0; + for (l = 0; l < VDEV_LABELS; l++) { + if (errors[l] != 0) + continue; + + buf = vdev_lists[l]->vp_nvlist; + + if (nvlist_unpack(buf, buflen, &config, 0) != 0) + continue; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || state > POOL_STATE_L2CACHE) { + nvlist_free(config); + continue; + } + + if (state != POOL_STATE_SPARE && + state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0 || txg == 0)) { + nvlist_free(config); + continue; + } + + if (*configp != NULL) + nvlist_free(*configp); + *configp = config; + nlabels++; + } + + /* 
Free the label storage */ + for (l = 0; l < VDEV_LABELS; l++) + kmem_free(vdev_lists[l], size); + + return (nlabels); +} + +static void +resize_configs(nvlist_t ***configs, uint64_t *count, uint64_t id) +{ + nvlist_t **new_configs; + uint64_t i; + + if (id < *count) + return; + new_configs = kmem_zalloc((id + 1) * sizeof (nvlist_t *), + KM_SLEEP); + for (i = 0; i < *count; i++) + new_configs[i] = (*configs)[i]; + if (*configs != NULL) + kmem_free(*configs, *count * sizeof (void *)); + *configs = new_configs; + *count = id + 1; +} + +static void +process_vdev_config(nvlist_t ***configs, uint64_t *count, nvlist_t *cfg, + const char *name, uint64_t *known_pool_guid) +{ + nvlist_t *vdev_tree; + uint64_t pool_guid; + uint64_t vdev_guid; + uint64_t id, txg, known_txg; + char *pname; + + if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &pname) != 0 || + strcmp(pname, name) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + goto ignore; + + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_TOP_GUID, &vdev_guid) != 0) + goto ignore; + + if (nvlist_lookup_nvlist(cfg, ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) + goto ignore; + + if (nvlist_lookup_uint64(vdev_tree, ZPOOL_CONFIG_ID, &id) != 0) + goto ignore; + + VERIFY(nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + if (*known_pool_guid != 0) { + if (pool_guid != *known_pool_guid) + goto ignore; + } else + *known_pool_guid = pool_guid; + + resize_configs(configs, count, id); + + if ((*configs)[id] != NULL) { + VERIFY(nvlist_lookup_uint64((*configs)[id], + ZPOOL_CONFIG_POOL_TXG, &known_txg) == 0); + if (txg <= known_txg) + goto ignore; + nvlist_free((*configs)[id]); + } + + (*configs)[id] = cfg; + return; + +ignore: + nvlist_free(cfg); +} + +int +vdev_geom_read_pool_label(const char *name, + nvlist_t ***configs, uint64_t *count) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp; + struct g_consumer *zcp; + nvlist_t *vdev_cfg; + uint64_t pool_guid; + int nlabels; + + DROP_GIANT(); + g_topology_lock(); + + *configs = NULL; + *count = 0; + pool_guid = 0; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + if (pp->flags & G_PF_WITHER) + continue; + zcp = vdev_geom_attach(pp, NULL, B_TRUE); + if (zcp == NULL) + continue; + g_topology_unlock(); + nlabels = vdev_geom_read_config(zcp, &vdev_cfg); + g_topology_lock(); + vdev_geom_detach(zcp, B_TRUE); + if (nlabels == 0) + continue; + ZFS_LOG(1, "successfully read vdev config"); + + process_vdev_config(configs, count, + vdev_cfg, name, &pool_guid); + } + } + } + g_topology_unlock(); + PICKUP_GIANT(); + + return (*count > 0 ? 
0 : ENOENT); +} + +enum match { + NO_MATCH = 0, /* No matching labels found */ + TOPGUID_MATCH = 1, /* Labels match top guid, not vdev guid */ + ZERO_MATCH = 1, /* Should never be returned */ + ONE_MATCH = 2, /* 1 label matching the vdev_guid */ + TWO_MATCH = 3, /* 2 labels matching the vdev_guid */ + THREE_MATCH = 4, /* 3 labels matching the vdev_guid */ + FULL_MATCH = 5 /* all labels match the vdev_guid */ +}; + +static enum match +vdev_attach_ok(vdev_t *vd, struct g_provider *pp) +{ + nvlist_t *config; + uint64_t pool_guid, top_guid, vdev_guid; + struct g_consumer *cp; + int nlabels; + + cp = vdev_geom_attach(pp, NULL, B_TRUE); + if (cp == NULL) { + ZFS_LOG(1, "Unable to attach tasting instance to %s.", + pp->name); + return (NO_MATCH); + } + g_topology_unlock(); + nlabels = vdev_geom_read_config(cp, &config); + g_topology_lock(); + vdev_geom_detach(cp, B_TRUE); + if (nlabels == 0) { + ZFS_LOG(1, "Unable to read config from %s.", pp->name); + return (NO_MATCH); + } + + pool_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid); + top_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TOP_GUID, &top_guid); + vdev_guid = 0; + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, &vdev_guid); + nvlist_free(config); + + /* + * Check that the label's pool guid matches the desired guid. + * Inactive spares and L2ARCs do not have any pool guid in the label. + */ + if (pool_guid != 0 && pool_guid != spa_guid(vd->vdev_spa)) { + ZFS_LOG(1, "pool guid mismatch for provider %s: %ju != %ju.", + pp->name, + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)pool_guid); + return (NO_MATCH); + } + + /* + * Check that the label's vdev guid matches the desired guid. + * The second condition handles a possible race on vdev detach, when + * the remaining vdev receives the GUID of the destroyed top-level + * mirror vdev.
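+ * A match on the top-level guid alone ranks below any direct vdev
+ * guid match (TOPGUID_MATCH < ONE_MATCH), so it is used only as a
+ * last resort.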
+ */ + if (vdev_guid == vd->vdev_guid) { + ZFS_LOG(1, "guids match for provider %s.", pp->name); + return (ZERO_MATCH + nlabels); + } else if (top_guid == vd->vdev_guid && vd == vd->vdev_top) { + ZFS_LOG(1, "top vdev guid match for provider %s.", pp->name); + return (TOPGUID_MATCH); + } + ZFS_LOG(1, "vdev guid mismatch for provider %s: %ju != %ju.", + pp->name, (uintmax_t)vd->vdev_guid, (uintmax_t)vdev_guid); + return (NO_MATCH); +} + +static struct g_consumer * +vdev_geom_attach_by_guids(vdev_t *vd) +{ + struct g_class *mp; + struct g_geom *gp; + struct g_provider *pp, *best_pp; + struct g_consumer *cp; + const char *vdpath; + enum match match, best_match; + + g_topology_assert(); + + vdpath = vd->vdev_path + sizeof ("/dev/") - 1; + cp = NULL; + best_pp = NULL; + best_match = NO_MATCH; + LIST_FOREACH(mp, &g_classes, class) { + if (mp == &zfs_vdev_class) + continue; + LIST_FOREACH(gp, &mp->geom, geom) { + if (gp->flags & G_GEOM_WITHER) + continue; + LIST_FOREACH(pp, &gp->provider, provider) { + match = vdev_attach_ok(vd, pp); + if (match > best_match) { + best_match = match; + best_pp = pp; + } else if (match == best_match) { + if (strcmp(pp->name, vdpath) == 0) { + best_pp = pp; + } + } + if (match == FULL_MATCH) + goto out; + } + } + } + +out: + if (best_pp) { + cp = vdev_geom_attach(best_pp, vd, B_TRUE); + if (cp == NULL) { + printf("ZFS WARNING: Unable to attach to %s.\n", + best_pp->name); + } + } + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_guids(vdev_t *vd) +{ + struct g_consumer *cp; + char *buf; + size_t len; + + g_topology_assert(); + + ZFS_LOG(1, "Searching by guids [%ju:%ju].", + (uintmax_t)spa_guid(vd->vdev_spa), (uintmax_t)vd->vdev_guid); + cp = vdev_geom_attach_by_guids(vd); + if (cp != NULL) { + len = strlen(cp->provider->name) + strlen("/dev/") + 1; + buf = kmem_alloc(len, KM_SLEEP); + + snprintf(buf, len, "/dev/%s", cp->provider->name); + spa_strfree(vd->vdev_path); + vd->vdev_path = buf; + + ZFS_LOG(1, "Attach by guid [%ju:%ju] succeeded, provider %s.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid, cp->provider->name); + } else { + ZFS_LOG(1, "Search by guid [%ju:%ju] failed.", + (uintmax_t)spa_guid(vd->vdev_spa), + (uintmax_t)vd->vdev_guid); + } + + return (cp); +} + +static struct g_consumer * +vdev_geom_open_by_path(vdev_t *vd, int check_guid) +{ + struct g_provider *pp; + struct g_consumer *cp; + + g_topology_assert(); + + cp = NULL; + pp = g_provider_by_name(vd->vdev_path + sizeof ("/dev/") - 1); + if (pp != NULL) { + ZFS_LOG(1, "Found provider by name %s.", vd->vdev_path); + if (!check_guid || vdev_attach_ok(vd, pp) == FULL_MATCH) + cp = vdev_geom_attach(pp, vd, B_FALSE); + } + + return (cp); +} + +static int +vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + struct g_provider *pp; + struct g_consumer *cp; + int error, has_trim; + uint16_t rate; + + /* + * Set the TLS to indicate downstack that we + * should not access zvols + */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, vd) == 0); + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || strncmp(vd->vdev_path, "/dev/", 5) != 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (EINVAL); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. 
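+ * A reopen keeps the existing consumer and skips straight to the
+ * size and transfer-size probing below.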
+ */ + if ((cp = vd->vdev_tsd) != NULL) { + ASSERT(vd->vdev_reopening); + goto skip_open; + } + + DROP_GIANT(); + g_topology_lock(); + error = 0; + + if (vd->vdev_spa->spa_is_splitting || + ((vd->vdev_prevstate == VDEV_STATE_UNKNOWN && + (vd->vdev_spa->spa_load_state == SPA_LOAD_NONE || + vd->vdev_spa->spa_load_state == SPA_LOAD_CREATE)))) { + /* + * We are dealing with a vdev that hasn't been previously + * opened (since boot), and we are not loading an + * existing pool configuration. This looks like a + * vdev add operation to a new or existing pool. + * Assume the user really wants to do this, and find + * GEOM provider by its name, ignoring GUID mismatches. + * + * XXPOLICY: It would be safer to only allow a device + * that is unlabeled or labeled but missing + * GUID information to be opened in this fashion, + * unless we are doing a split, in which case we + * should allow any guid. + */ + cp = vdev_geom_open_by_path(vd, 0); + } else { + /* + * Try using the recorded path for this device, but only + * accept it if its label data contains the expected GUIDs. + */ + cp = vdev_geom_open_by_path(vd, 1); + if (cp == NULL) { + /* + * The device at vd->vdev_path doesn't have the + * expected GUIDs. The disks might have merely + * moved around so try all other GEOM providers + * to find one with the right GUIDs. + */ + cp = vdev_geom_open_by_guids(vd); + } + } + + /* Clear the TLS now that tasting is done */ + VERIFY(tsd_set(zfs_geom_probe_vdev_key, NULL) == 0); + + if (cp == NULL) { + ZFS_LOG(1, "Vdev %s not found.", vd->vdev_path); + error = ENOENT; + } else { + struct consumer_priv_t *priv; + struct consumer_vdev_elem *elem; + int spamode; + + priv = (struct consumer_priv_t *)&cp->private; + if (cp->private == NULL) + SLIST_INIT(priv); + elem = g_malloc(sizeof (*elem), M_WAITOK|M_ZERO); + elem->vd = vd; + SLIST_INSERT_HEAD(priv, elem, elems); + + spamode = spa_mode(vd->vdev_spa); + if (cp->provider->sectorsize > VDEV_PAD_SIZE || + !ISP2(cp->provider->sectorsize)) { + ZFS_LOG(1, "Provider %s has unsupported sectorsize.", + cp->provider->name); + + vdev_geom_close_locked(vd); + error = EINVAL; + cp = NULL; + } else if (cp->acw == 0 && (spamode & FWRITE) != 0) { + int i; + + for (i = 0; i < 5; i++) { + error = g_access(cp, 0, 1, 0); + if (error == 0) + break; + g_topology_unlock(); + tsleep(vd, 0, "vdev", hz / 2); + g_topology_lock(); + } + if (error != 0) { + printf("ZFS WARNING: Unable to open %s for " + "writing (error=%d).\n", + cp->provider->name, error); + vdev_geom_close_locked(vd); + cp = NULL; + } + } + } + + /* Fetch initial physical path information for this device. */ + if (cp != NULL) { + vdev_geom_attrchanged(cp, "GEOM::physpath"); + + /* Set other GEOM characteristics */ + vdev_geom_set_physpath(vd, cp, /* do_null_update */B_FALSE); + } + + g_topology_unlock(); + PICKUP_GIANT(); + if (cp == NULL) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + vdev_dbgmsg(vd, "vdev_geom_open: failed to open [error=%d]", + error); + return (error); + } +skip_open: + pp = cp->provider; + + /* + * Determine the actual size of the device. + */ + *max_psize = *psize = pp->mediasize; + + /* + * Determine the device's minimum transfer size and preferred + * transfer size. 
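+ * For example, a provider with 512-byte sectors that reports a 4 KiB
+ * stripesize and a zero stripeoffset yields a logical_ashift of 9 and
+ * a physical_ashift of 12.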
+ */ + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = 0; + if (pp->stripesize && pp->stripesize > (1 << *logical_ashift) && + ISP2(pp->stripesize) && pp->stripesize <= (1 << ASHIFT_MAX) && + pp->stripeoffset == 0) + *physical_ashift = highbit(pp->stripesize) - 1; + + /* + * Clear the nowritecache settings, so that on a vdev_reopen() + * we will try again. + */ + vd->vdev_nowritecache = B_FALSE; + + /* Inform the ZIO pipeline that we are non-rotational. */ + error = g_getattr("GEOM::rotation_rate", cp, &rate); + if (error == 0 && rate == DISK_RR_NON_ROTATING) + vd->vdev_nonrot = B_TRUE; + else + vd->vdev_nonrot = B_FALSE; + + /* Set when device reports it supports TRIM. */ + error = g_getattr("GEOM::candelete", cp, &has_trim); + vd->vdev_has_trim = (error == 0 && has_trim); + + /* Set when device reports it supports secure TRIM. */ + /* unavailable on FreeBSD */ + vd->vdev_has_securetrim = B_FALSE; + + return (0); +} + +static void +vdev_geom_close(vdev_t *vd) +{ + struct g_consumer *cp; + boolean_t locked; + + cp = vd->vdev_tsd; + + DROP_GIANT(); + locked = g_topology_locked(); + if (!locked) + g_topology_lock(); + + if (!vd->vdev_reopening || + (cp != NULL && ((cp->flags & G_CF_ORPHAN) != 0 || + (cp->provider != NULL && cp->provider->error != 0)))) + vdev_geom_close_locked(vd); + + if (!locked) + g_topology_unlock(); + PICKUP_GIANT(); +} + +static void +vdev_geom_io_intr(struct bio *bp) +{ + vdev_t *vd; + zio_t *zio; + + zio = bp->bio_caller1; + vd = zio->io_vd; + zio->io_error = bp->bio_error; + if (zio->io_error == 0 && bp->bio_resid != 0) + zio->io_error = SET_ERROR(EIO); + + switch (zio->io_error) { + case ENOTSUP: + /* + * If we get ENOTSUP for BIO_FLUSH or BIO_DELETE we know + * that future attempts will never succeed. In this case + * we set a persistent flag so that we don't bother with + * requests in the future. + */ + switch (bp->bio_cmd) { + case BIO_FLUSH: + vd->vdev_nowritecache = B_TRUE; + break; + case BIO_DELETE: + break; + } + break; + case ENXIO: + if (!vd->vdev_remove_wanted) { + /* + * If provider's error is set we assume it is being + * removed. + */ + if (bp->bio_to->error != 0) { + vd->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, + SPA_ASYNC_REMOVE); + } else if (!vd->vdev_delayed_close) { + vd->vdev_delayed_close = B_TRUE; + } + } + break; + } + + /* + * We have to split bio freeing into two parts, because the ABD code + * cannot be called in this context and vdev_op_io_done is not called + * for ZIO_TYPE_IOCTL zio-s. 
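+ * Reads and writes must first return their borrowed ABD buffers, so
+ * their bios are destroyed in vdev_geom_io_done(); FLUSH and DELETE
+ * bios are destroyed here instead.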
+ */ + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + g_destroy_bio(bp); + zio->io_bio = NULL; + } + zio_delay_interrupt(zio); +} + +static void +vdev_geom_io_start(zio_t *zio) +{ + vdev_t *vd; + struct g_consumer *cp; + struct bio *bp; + + vd = zio->io_vd; + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } else { + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + if (zfs_nocacheflush || + vdev_geom_bio_flush_disable) + break; + if (vd->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + goto sendreq; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + } + + zio_execute(zio); + return; + case ZIO_TYPE_TRIM: + if (!vdev_geom_bio_delete_disable) { + goto sendreq; + } + zio_execute(zio); + return; + default: + ; + /* PASSTHROUGH --- placate compiler */ + } +sendreq: + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || + zio->io_type == ZIO_TYPE_TRIM || + zio->io_type == ZIO_TYPE_IOCTL); + + cp = vd->vdev_tsd; + if (cp == NULL) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + bp = g_alloc_bio(); + bp->bio_caller1 = zio; + switch (zio->io_type) { + case ZIO_TYPE_READ: + case ZIO_TYPE_WRITE: + zio->io_target_timestamp = zio_handle_io_delay(zio); + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + if (zio->io_type == ZIO_TYPE_READ) { + bp->bio_cmd = BIO_READ; + bp->bio_data = + abd_borrow_buf(zio->io_abd, zio->io_size); + } else { + bp->bio_cmd = BIO_WRITE; + bp->bio_data = + abd_borrow_buf_copy(zio->io_abd, zio->io_size); + } + break; + case ZIO_TYPE_TRIM: + bp->bio_cmd = BIO_DELETE; + bp->bio_data = NULL; + bp->bio_offset = zio->io_offset; + bp->bio_length = zio->io_size; + break; + case ZIO_TYPE_IOCTL: + bp->bio_cmd = BIO_FLUSH; + bp->bio_flags |= BIO_ORDERED; + bp->bio_data = NULL; + bp->bio_offset = cp->provider->mediasize; + bp->bio_length = 0; + break; + default: + panic("invalid zio->io_type: %d\n", zio->io_type); + } + bp->bio_done = vdev_geom_io_intr; + zio->io_bio = bp; + + g_io_request(bp, cp); +} + +static void +vdev_geom_io_done(zio_t *zio) +{ + struct bio *bp = zio->io_bio; + + if (zio->io_type != ZIO_TYPE_READ && zio->io_type != ZIO_TYPE_WRITE) { + ASSERT(bp == NULL); + return; + } + + if (bp == NULL) { + ASSERT3S(zio->io_error, ==, ENXIO); + return; + } + + if (zio->io_type == ZIO_TYPE_READ) + abd_return_buf_copy(zio->io_abd, bp->bio_data, zio->io_size); + else + abd_return_buf(zio->io_abd, bp->bio_data, zio->io_size); + + g_destroy_bio(bp); + zio->io_bio = NULL; +} + +static void +vdev_geom_hold(vdev_t *vd) +{ +} + +static void +vdev_geom_rele(vdev_t *vd) +{ +} + +vdev_ops_t vdev_disk_ops = { + vdev_geom_open, + vdev_geom_close, + vdev_default_asize, + vdev_geom_io_start, + vdev_geom_io_done, + NULL, + NULL, + vdev_geom_hold, + vdev_geom_rele, + NULL, + vdev_default_xlate, + VDEV_TYPE_DISK, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/module/os/freebsd/zfs/vdev_label_os.c b/module/os/freebsd/zfs/vdev_label_os.c new file mode 100644 index 000000000000..97cb201934dc --- /dev/null +++ b/module/os/freebsd/zfs/vdev_label_os.c @@ -0,0 +1,74 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +int +vdev_label_write_pad2(vdev_t *vd, const char *buf, size_t size) +{ + spa_t *spa = vd->vdev_spa; + zio_t *zio; + abd_t *pad2; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error; + + if (size > VDEV_PAD_SIZE) + return (EINVAL); + + if (!vd->vdev_ops->vdev_op_leaf) + return (ENODEV); + if (vdev_is_dead(vd)) + return (ENXIO); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + pad2 = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(pad2, VDEV_PAD_SIZE); + abd_copy_from_buf(pad2, buf, size); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + vdev_label_write(zio, vd, 0, pad2, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(pad2); + return (error); +} diff --git a/module/os/freebsd/zfs/zfs_acl.c b/module/os/freebsd/zfs/zfs_acl.c new file mode 100644 index 000000000000..018120c82ab3 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_acl.c @@ -0,0 +1,2700 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + zfs_ace_v0_get_mask, + zfs_ace_v0_set_mask, + zfs_ace_v0_get_flags, + zfs_ace_v0_set_flags, + zfs_ace_v0_get_type, + zfs_ace_v0_set_type, + 
zfs_ace_v0_get_who, + zfs_ace_v0_set_who, + zfs_ace_v0_size, + zfs_ace_v0_abstract_size, + zfs_ace_v0_mask_off, + zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + zfs_ace_fuid_get_mask, + zfs_ace_fuid_set_mask, + zfs_ace_fuid_get_flags, + zfs_ace_fuid_set_flags, + zfs_ace_fuid_get_type, + zfs_ace_fuid_set_type, + zfs_ace_fuid_get_who, + zfs_ace_fuid_set_who, + zfs_ace_fuid_size, + zfs_ace_fuid_abstract_size, + zfs_ace_fuid_mask_off, + zfs_ace_fuid_data +}; + +/* + * The following three functions are provided for compatibility with + * older ZPL version in order to determine if the file use to have + * an external ACL and what version of ACL previously existed on the + * file. Would really be nice to not need this, sigh. 
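+ * zfs_external_acl() returns the object number of the external ACL,
+ * or 0 when the ACL is SA-based or embedded in the znode.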
+ */ +uint64_t +zfs_external_acl(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + int error; + + if (zp->z_is_sa) + return (0); + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_extern_obj); + else { + /* + * after upgrade the SA_ZPL_ZNODE_ACL should have been + * removed + */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (0); + } +} + +/* + * Determine size of ACL in bytes + * + * This is more complicated than it should be since we have to deal + * with old external ACLs. + */ +static int +zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount, + zfs_acl_phys_t *aclphys) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t acl_count; + int size; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_is_sa) { + if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs), + &size)) != 0) + return (error); + *aclsize = size; + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs), + &acl_count, sizeof (acl_count))) != 0) + return (error); + *aclcount = acl_count; + } else { + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + aclphys, sizeof (*aclphys))) != 0) + return (error); + + if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) { + *aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size); + *aclcount = aclphys->z_acl_size; + } else { + *aclsize = aclphys->z_acl_size; + *aclcount = aclphys->z_acl_count; + } + } + return (0); +} + +int +zfs_znode_acl_version(znode_t *zp) +{ + zfs_acl_phys_t acl_phys; + + if (zp->z_is_sa) + return (ZFS_ACL_VERSION_FUID); + else { + int error; + + /* + * Need to deal with a potential + * race where zfs_sa_upgrade could cause + * z_isa_sa to change. + * + * If the lookup fails then the state of z_is_sa should have + * changed. + */ + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_ZNODE_ACL(zp->z_zfsvfs), + &acl_phys, sizeof (acl_phys))) == 0) + return (acl_phys.z_acl_version); + else { + /* + * After upgrade SA_ZPL_ZNODE_ACL should have + * been removed. 
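+ * An ENOENT from the lookup therefore implies the znode has already
+ * been upgraded to the SA format.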
+ */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(zp->z_zfsvfs->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(vtype_t obj_type, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (obj_type == VDIR && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if 
(aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +static int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_type, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * everytime. 
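+ * The contiguous copy is handed to zfs_copy_ace_2_fuid() below and
+ * the converted node then replaces all existing ACL nodes.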
+ */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
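+ * ACEs are evaluated in order and the first ACE to mention a mode bit
+ * decides it: the bit is set only if that first mention is an ALLOW,
+ * so an earlier DENY shadows any later ALLOW of the same bit.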
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
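+ * Read-only callers share (and populate) z_acl_cached; callers that
+ * intend to modify always receive a private copy.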
+ */ +int +zfs_acl_node_read(znode_t *zp, boolean_t have_lock, zfs_acl_t **aclpp, + boolean_t will_modify) +{ + zfs_acl_t *aclp; + int aclsize; + int acl_count; + zfs_acl_node_t *aclnode; + zfs_acl_phys_t znode_acl; + int version; + int error; + + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + + version = zfs_znode_acl_version(zp); + + if ((error = zfs_acl_znode_info(zp, &aclsize, + &acl_count, &znode_acl)) != 0) { + goto done; + } + + aclp = zfs_acl_alloc(version); + + aclp->z_acl_count = acl_count; + aclp->z_acl_bytes = aclsize; + + aclnode = zfs_acl_node_alloc(aclsize); + aclnode->z_ace_count = aclp->z_acl_count; + aclnode->z_size = aclsize; + + if (!zp->z_is_sa) { + if (znode_acl.z_acl_extern_obj) { + error = dmu_read(zp->z_zfsvfs->z_os, + znode_acl.z_acl_extern_obj, 0, aclnode->z_size, + aclnode->z_acldata, DMU_READ_PREFETCH); + } else { + bcopy(znode_acl.z_ace_data, aclnode->z_acldata, + aclnode->z_size); + } + } else { + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zp->z_zfsvfs), + aclnode->z_acldata, aclnode->z_size); + } + + if (error != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + goto done; + } + + list_insert_head(&aclp->z_acl, aclnode); + + *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; +done: + return (error); +} + +/*ARGSUSED*/ +void +zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen, + boolean_t start, void *userdata) +{ + zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata; + + if (start) { + cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl); + } else { + cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl, + cb->cb_acl_node); + } + *dataptr = cb->cb_acl_node->z_acldata; + *length = cb->cb_acl_node->z_size; +} + +int +zfs_acl_chown_setattr(znode_t *zp) +{ + int error; + zfs_acl_t *aclp; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + ASSERT_VOP_IN_SEQC(ZTOV(zp)); + + if ((error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE)) == 0) + zp->z_mode = zfs_mode_compute(zp->z_mode, aclp, + &zp->z_pflags, zp->z_uid, zp->z_gid); + return (error); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + ASSERT_VOP_IN_SEQC(ZTOV(zp)); + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + zp->z_uid, zp->z_gid); + + zp->z_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? 
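+ * Old v0 ACLs are transformed to the FUID format once the file system
+ * version supports FUIDs.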
+ */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? + DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. 
+ */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(vtype_t vtype, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + boolean_t isdir; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + isdir = (vtype == VDIR); + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. + */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions granted by ACEs to be no greater + * than permissions of the requested group mode. + * Applies when the "aclmode" property is set to + * "groupmask". 
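+ * For example, if the new group mode is r-x, an ALLOW ACE granting
+ * write_data to a named user has that bit masked away.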
+ */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(ZTOV(zp)->v_type, mode, B_TRUE, + (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? + */ +static int +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!((vtype == VDIR) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = (vtype == VDIR); + boolean_t isreg = (vtype == VREG); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || vtype == VLNK) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(vtype, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. 
+ */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + if (zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + aclp->z_acl_count != 0) { + *need_chmod = B_FALSE; + } + + return (aclp); +} + +/* + * Create file system object initial permissions + * including inheritable ACEs. + * Also, create FUIDs for owner and group. + */ +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) +{ + int error; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zfs_acl_t *paclp; + gid_t gid; + boolean_t need_chmod = B_TRUE; + boolean_t trim = B_FALSE; + boolean_t inherited = B_FALSE; + + if ((flag & IS_ROOT_NODE) == 0) { + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + } else + ASSERT(dzp->z_vnode == NULL); + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); + + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); + /* + * Determine uid and gid. 
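+ * Replayed transactions, the root node, and extended-attribute + * directories take both ids directly from the vattr; everything else + * derives the owner from the cred and the group from AT_GID (subject + * to the groupmember()/secpolicy checks below) or, failing that, + * from the parent directory.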
+ */ + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || + ((flag & IS_XATTR) && (vap->va_type == VDIR))) { + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + } else { + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; + if (vap->va_mask & AT_GID) { + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); + gid = vap->va_gid; + if (acl_ids->z_fgid != dzp->z_gid && + !groupmember(vap->va_gid, cr) && + secpolicy_vnode_create_gid(cr) != 0) + acl_ids->z_fgid = 0; + } + if (acl_ids->z_fgid == 0) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, + cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = + zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } + } + } + + /* + * If we're creating a directory, and the parent directory has the + * set-GID bit set, set it on the new directory. + * Otherwise, if the user is neither privileged nor a member of the + * file's new group, clear the file's set-GID bit. + */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(ZTOV(dzp), cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY0(zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (vap->va_type == VDIR) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_type, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv,
DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_type, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_type, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + 
aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. + */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: + mutex_enter(&zp->z_acl_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && + (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common(). 
+ */ + if ((v4_mode & WRITE_MASK_DATA) && + (zp->z_pflags & ZFS_IMMUTABLE)) { + return (SET_ERROR(EPERM)); + } + + /* + * In FreeBSD we allow modifying a directory's contents if ZFS_NOUNLINK + * (sunlnk) is set. We just don't allow directory removal, which is + * handled in zfs_zaccess_delete(). + */ + if ((v4_mode & ACE_DELETE) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (EPERM); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCES if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner.
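+ * + * Worked example: a caller asking for ACE_READ_DATA|ACE_WRITE_DATA + * starts with both bits in *working_mode. A matching read_data:allow + * ACE clears the read bit; a later write_data:deny ACE clears the + * write bit but records it in deny_mask, so the function returns + * EACCES with the write bit placed back into *working_mode.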
+ */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + if (zp->z_zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(zp), __func__); + error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != UID_NOBODY && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOV(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. 
This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + /* + * Note: ZFS_READONLY represents the "DOS R/O" attribute. + * When that flag is set, we should behave as if write access + * were not granted by anything in the ACL. In particular: + * We _must_ allow writes after opening the file r/w, then + * setting the DOS R/O attribute, and writing some more. + * (Similar to how you can write after fchmod(fd, 0444).) + * + * Therefore ZFS_READONLY is ignored in the dataset check + * above, and checked here as if part of the ACL check. + * Also note: DOS R/O is ignored for directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + (ZTOV(zp)->v_type != VDIR) && + (zp->z_pflags & ZFS_READONLY)) { + return (SET_ERROR(EPERM)); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +/* + * Check if VEXEC is allowed. + * + * This routine is based on zfs_fastaccesschk_execute, which has a slowpath + * calling zfs_zaccess. That would be incorrect on FreeBSD (see + * zfs_freebsd_access for the difference). Thus this variant lets the + * caller handle the slowpath (if necessary). + * + * On top of that we perform a lockless check for ZFS_NO_EXECS_DENIED. + * + * Safe access to znode_t is provided by the vnode lock. + */ +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t is_attr; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (1); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + return (1); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) + return (0); + + return (1); +} + + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsystem is always consulted as a basic privilege + * can define any form of access. + */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp = NULL; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && (ZTOV(zp)->v_type == VDIR)); + +#ifdef __FreeBSD_kernel__ + /* + * In FreeBSD, we don't care about permissions of individual ADS. + * Note that not checking them is not just an optimization - without + * this shortcut, EA operations may bogusly fail with EACCES.
+ */ + if (zp->z_pflags & ZFS_XATTR) + return (0); +#else + /* + * If attribute then validate against base file + */ + if (is_attr) { + uint64_t parent; + + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zp->z_zfsvfs), &parent, + sizeof (parent))) != 0) + return (error); + + if ((error = zfs_zget(zp->z_zfsvfs, + parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } +#endif + + owner = zfs_fuid_map_id(zp->z_zfsvfs, zp->z_uid, cr, ZFS_OWNER); + /* + * Map the bits required to the standard vnode flags VREAD|VWRITE|VEXEC + * in needed_bits. Map the bits mapped by working_mode (currently + * missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= VWRITE; + if (working_mode & ACE_EXECUTE) + needed_bits |= VEXEC; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + VN_RELE(ZTOV(xzp)); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + vnode_t *check_vp = ZTOV(check_zp); + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VREAD; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= VWRITE; + if (working_mode & ACE_EXECUTE) + checkmode |= VEXEC; + + error = secpolicy_vnode_access2(cr, check_vp, owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(check_vp, cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(check_vp, cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(check_vp, cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(check_vp, cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so, then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOV(zp), owner, + needed_bits, needed_bits); + } + + + if (is_attr) + VN_RELE(ZTOV(xzp)); + + return (error); +} + +/* + * Translate traditional unix VREAD/VWRITE/VEXEC mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +static int +zfs_delete_final_check(znode_t *zp, znode_t *dzp, + mode_t available_perms, cred_t *cr) +{ + int error; + uid_t downer; + + downer = zfs_fuid_map_id(dzp->z_zfsvfs, dzp->z_uid, cr, ZFS_OWNER); + + error = secpolicy_vnode_access2(cr, ZTOV(dzp), + downer, available_perms, VWRITE|VEXEC); + + if (error == 0) + error = zfs_sticky_remove_access(dzp, zp, cr); + + return (error); +} + +/* + * Determine whether Access should be granted/denied, without + * consulting the least priv subsystem. + * + * The following chart is the recommended NFSv4 enforcement for + * ability to delete an object. + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Permit | Permit | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Permit | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * No search privilege, can't even look up file? + * + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + mode_t available_perms; + boolean_t dzpcheck_privs = B_TRUE; + boolean_t zpcheck_privs = B_TRUE; + + /* + * We want specific DELETE permissions to + * take precedence over WRITE/EXECUTE. We don't + * want an ACL such as this to mess us up. + * user:joe:write_data:deny,user:joe:delete:allow + * + * However, deny permissions may ultimately be overridden + * by secpolicy_vnode_access(). + * + * We will ask for all of the necessary permissions and then + * look at the working modes from the directory and target object + * to determine what was found. + */ + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * First row + * If the directory permissions allow the delete, we are done.
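+ * This is the top row of the chart above: an explicit allow of + * delete_child on the parent short-circuits every other check.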
+ */ + if ((dzp_error = zfs_zaccess_common(dzp, ACE_DELETE_CHILD, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + /* + * If target object has delete permission then we are done + */ + if ((zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr)) == 0) + return (0); + + ASSERT(dzp_error && zp_error); + + if (!dzpcheck_privs) + return (dzp_error); + if (!zpcheck_privs) + return (zp_error); + + /* + * Second row + * + * If directory returns EACCES then delete_child was denied + * due to deny delete_child. In this case send the request through + * secpolicy_vnode_remove(). We don't use zfs_delete_final_check() + * since that *could* allow the delete based on write/execute permission + * and we want delete permissions to override write/execute. + */ + + if (dzp_error == EACCES) { + /* XXXPJD: s/dzp/zp/ ? */ + return (secpolicy_vnode_remove(ZTOV(dzp), cr)); + } + /* + * Third Row + * only need to see if we have write/execute on directory. + */ + + dzp_error = zfs_zaccess_common(dzp, ACE_EXECUTE|ACE_WRITE_DATA, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + + if (dzp_error != 0 && !dzpcheck_privs) + return (dzp_error); + + /* + * Fourth row + */ + + available_perms = (dzp_working_mode & ACE_WRITE_DATA) ? 0 : VWRITE; + available_perms |= (dzp_working_mode & ACE_EXECUTE) ? 0 : VEXEC; + + return (zfs_delete_final_check(zp, dzp, available_perms, cr)); + +} + +int +zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp, + znode_t *tzp, cred_t *cr) +{ + int add_perm; + int error; + + if (szp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + add_perm = (ZTOV(szp)->v_type == VDIR) ? + ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE; + + /* + * Rename permissions are combination of delete permission + + * add file/subdir permission. + * + * BSD operating systems also require write permission + * on the directory being moved from one parent directory + * to another. + */ + if (ZTOV(szp)->v_type == VDIR && ZTOV(sdzp) != ZTOV(tdzp)) { + if ((error = zfs_zaccess(szp, ACE_WRITE_DATA, 0, B_FALSE, cr))) + return (error); + } + + /* + * first make sure we do the delete portion. + * + * If that succeeds then check for add_file/add_subdir permissions + */ + + if ((error = zfs_zaccess_delete(sdzp, szp, cr))) + return (error); + + /* + * If we have a tzp, see if we can delete it? + */ + if (tzp && (error = zfs_zaccess_delete(tdzp, tzp, cr))) + return (error); + + /* + * Now check for add permissions + */ + error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr); + + return (error); +} diff --git a/module/os/freebsd/zfs/zfs_ctldir.c b/module/os/freebsd/zfs/zfs_ctldir.c new file mode 100644 index 000000000000..0fe32b19520c --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ctldir.c @@ -0,0 +1,1350 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + */ + +/* + * ZFS control directory (a.k.a. ".zfs") + * + * This directory provides a common location for all ZFS meta-objects. + * Currently, this is only the 'snapshot' directory, but this may expand in the + * future. The elements are built using the GFS primitives, as the hierarchy + * does not actually exist on disk. + * + * For 'snapshot', we don't want to have all snapshots always mounted, because + * this would take up a huge amount of space in /etc/mnttab. We have three + * types of objects: + * + * ctldir ------> snapshotdir -------> snapshot + * | + * | + * V + * mounted fs + * + * The 'snapshot' node contains just enough information to lookup '..' and act + * as a mountpoint for the snapshot. Whenever we lookup a specific snapshot, we + * perform an automount of the underlying filesystem and return the + * corresponding vnode. + * + * All mounts are handled automatically by the kernel, but unmounts are + * (currently) handled from user land. The main reason is that there is no + * reliable way to auto-unmount the filesystem when it's "no longer in use". + * When the user unmounts a filesystem, we call zfsctl_unmount(), which + * unmounts any snapshots within the snapshot directory. + * + * The '.zfs', '.zfs/snapshot', and all directories created under + * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') are all GFS nodes and + * share the same vfs_t as the head filesystem (what '.zfs' lives under). + * + * File systems mounted on top of the GFS nodes '.zfs/snapshot/<snapname>' + * (ie: snapshots) are ZFS nodes and have their own unique vfs_t. + * However, vnodes within these mounted on file systems have their v_vfsp + * fields set to the head filesystem to make NFS happy (see + * zfsctl_snapdir_lookup()). We VFS_HOLD the head filesystem's vfs_t + * so that it cannot be freed until all snapshots have been unmounted. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" + +#include +#include + +/* Common access mode for all virtual directories under the ctldir */ +const uint16_t zfsctl_ctldir_mode = S_IRUSR | S_IXUSR | S_IRGRP | S_IXGRP | + S_IROTH | S_IXOTH; + +/* + * "Synthetic" filesystem implementation. + */ + +/* + * Assert that A implies B. + */ +#define KASSERT_IMPLY(A, B, msg) KASSERT(!(A) || (B), (msg)); + +static MALLOC_DEFINE(M_SFSNODES, "sfs_nodes", "synthetic-fs nodes"); + +typedef struct sfs_node { + char sn_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t sn_parent_id; + uint64_t sn_id; +} sfs_node_t; + +/* + * Check the parent's ID as well as the node's to account for a chance + * that IDs originating from different domains (snapshot IDs, artificial + * IDs, znode IDs) may clash. + */ +static int +sfs_compare_ids(struct vnode *vp, void *arg) +{ + sfs_node_t *n1 = vp->v_data; + sfs_node_t *n2 = arg; + bool equal; + + equal = n1->sn_id == n2->sn_id && + n1->sn_parent_id == n2->sn_parent_id; + + /* Zero means equality.
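+ * vfs_hash_get() treats the comparator like bcmp(): nonzero means + * "no match", hence the negation below.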
*/ + return (!equal); +} + +static int +sfs_vnode_get(const struct mount *mp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + sfs_node_t search; + int err; + + search.sn_id = id; + search.sn_parent_id = parent_id; + err = vfs_hash_get(mp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, &search); + return (err); +} + +static int +sfs_vnode_insert(struct vnode *vp, int flags, uint64_t parent_id, + uint64_t id, struct vnode **vpp) +{ + int err; + + KASSERT(vp->v_data != NULL, ("sfs_vnode_insert with NULL v_data")); + err = vfs_hash_insert(vp, (uint32_t)id, flags, curthread, vpp, + sfs_compare_ids, vp->v_data); + return (err); +} + +static void +sfs_vnode_remove(struct vnode *vp) +{ + vfs_hash_remove(vp); +} + +typedef void sfs_vnode_setup_fn(vnode_t *vp, void *arg); + +static int +sfs_vgetx(struct mount *mp, int flags, uint64_t parent_id, uint64_t id, + const char *tag, struct vop_vector *vops, + sfs_vnode_setup_fn setup, void *arg, + struct vnode **vpp) +{ + struct vnode *vp; + int error; + + error = sfs_vnode_get(mp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + /* Allocate a new vnode/inode. */ + error = getnewvnode(tag, mp, vops, &vp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + /* + * Exclusively lock the vnode while it's being constructed. + */ + lockmgr(vp->v_vnlock, LK_EXCLUSIVE, NULL); + error = insmntque(vp, mp); + if (error != 0) { + *vpp = NULL; + return (error); + } + + setup(vp, arg); + + error = sfs_vnode_insert(vp, flags, parent_id, id, vpp); + if (error != 0 || *vpp != NULL) { + KASSERT_IMPLY(error == 0, (*vpp)->v_data != NULL, + "sfs vnode with no data"); + return (error); + } + + *vpp = vp; + return (0); +} + +static void +sfs_print_node(sfs_node_t *node) +{ + printf("\tname = %s\n", node->sn_name); + printf("\tparent_id = %ju\n", (uintmax_t)node->sn_parent_id); + printf("\tid = %ju\n", (uintmax_t)node->sn_id); +} + +static sfs_node_t * +sfs_alloc_node(size_t size, const char *name, uint64_t parent_id, uint64_t id) +{ + struct sfs_node *node; + + KASSERT(strlen(name) < sizeof (node->sn_name), + ("sfs node name is too long")); + KASSERT(size >= sizeof (*node), ("sfs node size is too small")); + node = malloc(size, M_SFSNODES, M_WAITOK | M_ZERO); + strlcpy(node->sn_name, name, sizeof (node->sn_name)); + node->sn_parent_id = parent_id; + node->sn_id = id; + + return (node); +} + +static void +sfs_destroy_node(sfs_node_t *node) +{ + free(node, M_SFSNODES); +} + +static void * +sfs_reclaim_vnode(vnode_t *vp) +{ + void *data; + + sfs_vnode_remove(vp); + data = vp->v_data; + vp->v_data = NULL; + return (data); +} + +static int +sfs_readdir_common(uint64_t parent_id, uint64_t id, struct vop_readdir_args *ap, + uio_t *uio, off_t *offp) +{ + struct dirent entry; + int error; + + /* Reset ncookies for subsequent use of vfs_read_dirent.
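+ * vfs_read_dirent() increments *a_ncookies for each entry it emits, + * so the count must start from zero.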
*/ + if (ap->a_ncookies != NULL) + *ap->a_ncookies = 0; + + if (uio->uio_resid < sizeof (entry)) + return (SET_ERROR(EINVAL)); + + if (uio->uio_offset < 0) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == 0) { + entry.d_fileno = id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '\0'; + entry.d_namlen = 1; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (uio->uio_offset < sizeof (entry)) + return (SET_ERROR(EINVAL)); + if (uio->uio_offset == sizeof (entry)) { + entry.d_fileno = parent_id; + entry.d_type = DT_DIR; + entry.d_name[0] = '.'; + entry.d_name[1] = '.'; + entry.d_name[2] = '\0'; + entry.d_namlen = 2; + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) + return (SET_ERROR(error)); + } + + if (offp != NULL) + *offp = 2 * sizeof (entry); + return (0); +} + + +/* + * .zfs inode namespace + * + * We need to generate unique inode numbers for all files and directories + * within the .zfs pseudo-filesystem. We use the following scheme: + * + * ENTRY ZFSCTL_INODE + * .zfs 1 + * .zfs/snapshot 2 + * .zfs/snapshot/ objectid(snap) + */ +#define ZFSCTL_INO_SNAP(id) (id) + +static struct vop_vector zfsctl_ops_root; +static struct vop_vector zfsctl_ops_snapdir; +static struct vop_vector zfsctl_ops_snapshot; +static struct vop_vector zfsctl_ops_shares_dir; + +void +zfsctl_init(void) +{ +} + +void +zfsctl_fini(void) +{ +} + +boolean_t +zfsctl_is_node(vnode_t *vp) +{ + return (vn_matchops(vp, zfsctl_ops_root) || + vn_matchops(vp, zfsctl_ops_snapdir) || + vn_matchops(vp, zfsctl_ops_snapshot) || + vn_matchops(vp, zfsctl_ops_shares_dir)); + +} + +typedef struct zfsctl_root { + sfs_node_t node; + sfs_node_t *snapdir; + timestruc_t cmtime; +} zfsctl_root_t; + + +/* + * Create the '.zfs' directory. + */ +void +zfsctl_create(zfsvfs_t *zfsvfs) +{ + zfsctl_root_t *dot_zfs; + sfs_node_t *snapdir; + vnode_t *rvp; + uint64_t crtime[2]; + + ASSERT(zfsvfs->z_ctldir == NULL); + + snapdir = sfs_alloc_node(sizeof (*snapdir), "snapshot", ZFSCTL_INO_ROOT, + ZFSCTL_INO_SNAPDIR); + dot_zfs = (zfsctl_root_t *)sfs_alloc_node(sizeof (*dot_zfs), ".zfs", 0, + ZFSCTL_INO_ROOT); + dot_zfs->snapdir = snapdir; + + VERIFY(VFS_ROOT(zfsvfs->z_vfs, LK_EXCLUSIVE, &rvp) == 0); + VERIFY(0 == sa_lookup(VTOZ(rvp)->z_sa_hdl, SA_ZPL_CRTIME(zfsvfs), + &crtime, sizeof (crtime))); + ZFS_TIME_DECODE(&dot_zfs->cmtime, crtime); + vput(rvp); + + zfsvfs->z_ctldir = dot_zfs; +} + +/* + * Destroy the '.zfs' directory. Only called when the filesystem is unmounted. + * The nodes must not have any associated vnodes by now as they should be + * vflush-ed. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + sfs_destroy_node(zfsvfs->z_ctldir->snapdir); + sfs_destroy_node((sfs_node_t *)zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; +} + +static int +zfsctl_fs_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + return (VFS_ROOT(mp, flags, vpp)); +} + +static void +zfsctl_common_vnode_setup(vnode_t *vp, void *arg) +{ + ASSERT_VOP_ELOCKED(vp, __func__); + + /* We support shared locking. 
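+ * The node data is immutable after setup, so concurrent shared + * lookups are safe.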
*/ + VN_LOCK_ASHARE(vp); + vp->v_type = VDIR; + vp->v_data = arg; +} + +static int +zfsctl_root_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir; + err = sfs_vgetx(mp, flags, 0, ZFSCTL_INO_ROOT, "zfs", &zfsctl_ops_root, + zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +static int +zfsctl_snapdir_vnode(struct mount *mp, void *arg __unused, int flags, + struct vnode **vpp) +{ + void *node; + int err; + + node = ((zfsvfs_t *)mp->mnt_data)->z_ctldir->snapdir; + err = sfs_vgetx(mp, flags, ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, "zfs", + &zfsctl_ops_snapdir, zfsctl_common_vnode_setup, node, vpp); + return (err); +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +int +zfsctl_root(zfsvfs_t *zfsvfs, int flags, vnode_t **vpp) +{ + int error; + + error = zfsctl_root_vnode(zfsvfs->z_vfs, NULL, flags, vpp); + return (error); +} + +/* + * Common open routine. Disallow any write access. + */ +static int +zfsctl_common_open(struct vop_open_args *ap) +{ + int flags = ap->a_mode; + + if (flags & FWRITE) + return (SET_ERROR(EACCES)); + + return (0); +} + +/* + * Common close routine. Nothing to do here. + */ +/* ARGSUSED */ +static int +zfsctl_common_close(struct vop_close_args *ap) +{ + return (0); +} + +/* + * Common access routine. Disallow writes. + */ +static int +zfsctl_common_access(struct vop_access_args *ap) +{ + accmode_t accmode = ap->a_accmode; + + if (accmode & VWRITE) + return (SET_ERROR(EACCES)); + return (0); +} + +/* + * Common getattr function. Fill in basic information. + */ +static void +zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) +{ + timestruc_t now; + sfs_node_t *node; + + node = vp->v_data; + + vap->va_uid = 0; + vap->va_gid = 0; + vap->va_rdev = 0; + /* + * We are a purely virtual object, so we have no + * blocksize or allocated blocks. + */ + vap->va_blksize = 0; + vap->va_nblocks = 0; + vap->va_seq = 0; + vn_fsid(vp, vap); + vap->va_mode = zfsctl_ctldir_mode; + vap->va_type = VDIR; + /* + * We live in the now (for atime). + */ + gethrestime(&now); + vap->va_atime = now; + /* FreeBSD: Reset chflags(2) flags. */ + vap->va_flags = 0; + + vap->va_nodeid = node->sn_id; + + /* At least '.' and '..'. 
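+ * Callers add their own entries on top of this base: the root + * directory counts its snapdir child and the snapdir counts one + * link per snapshot.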
*/ + vap->va_nlink = 2; +} + +#ifndef _OPENSOLARIS_SYS_VNODE_H_ +struct vop_fid_args { + struct vnode *a_vp; + struct fid *a_fid; +}; +#endif + +static int +zfsctl_common_fid(struct vop_fid_args *ap) +{ + vnode_t *vp = ap->a_vp; + fid_t *fidp = (void *)ap->a_fid; + sfs_node_t *node = vp->v_data; + uint64_t object = node->sn_id; + zfid_short_t *zfid; + int i; + + zfid = (zfid_short_t *)fidp; + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs nodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_reclaim_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfsctl_common_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + + (void) sfs_reclaim_vnode(vp); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_print_args { + struct vnode *a_vp; +}; +#endif + +static int +zfsctl_common_print(struct vop_print_args *ap) +{ + sfs_print_node(ap->a_vp->v_data); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +/* + * Get root directory attributes. + */ +static int +zfsctl_root_getattr(struct vop_getattr_args *ap) +{ + struct vnode *vp = ap->a_vp; + struct vattr *vap = ap->a_vap; + zfsctl_root_t *node = vp->v_data; + + zfsctl_common_getattr(vp, vap); + vap->va_ctime = node->cmtime; + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + vap->va_nlink += 1; /* snapdir */ + vap->va_size = vap->va_nlink; + return (0); +} + +/* + * Even a lookup of "." can ask us to lock the vnode with a different + * lock type than the one we already hold. + */ +static int +zfsctl_relock_dot(vnode_t *dvp, int ltype) +{ + vref(dvp); + if (ltype != VOP_ISLOCKED(dvp)) { + if (ltype == LK_EXCLUSIVE) + vn_lock(dvp, LK_UPGRADE | LK_RETRY); + else /* if (ltype == LK_SHARED) */ + vn_lock(dvp, LK_DOWNGRADE | LK_RETRY); + + /* Relocking for the "." case may have left us with a reclaimed vnode. */ + if (VN_IS_DOOMED(dvp)) { + vrele(dvp); + return (SET_ERROR(ENOENT)); + } + } + return (0); +} + +/* + * Special case the handling of "..".
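+ * For the ctldir root, ".." escapes into the filesystem the ctldir + * lives in, so the lookup below hands it to zfsctl_fs_root_vnode() + * via vn_vget_ino_gen(), which handles the lock order safely.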
+ */ +static int +zfsctl_root_lookup(struct vop_lookup_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + int flags = ap->a_cnp->cn_flags; + int lkflags = ap->a_cnp->cn_lkflags; + int nameiop = ap->a_cnp->cn_nameiop; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + } else if ((flags & ISDOTDOT) != 0) { + err = vn_vget_ino_gen(dvp, zfsctl_fs_root_vnode, NULL, + lkflags, vpp); + } else if (strncmp(cnp->cn_nameptr, "snapshot", cnp->cn_namelen) == 0) { + err = zfsctl_snapdir_vnode(dvp->v_mount, NULL, lkflags, vpp); + } else { + err = SET_ERROR(ENOENT); + } + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_root_readdir(struct vop_readdir_args *ap) +{ + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_root_t *node = vp->v_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(zfsvfs->z_root, ZFSCTL_INO_ROOT, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + if (uio->uio_offset != dots_offset) + return (SET_ERROR(EINVAL)); + + CTASSERT(sizeof (node->snapdir->sn_name) <= sizeof (entry.d_name)); + entry.d_fileno = node->snapdir->sn_id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, node->snapdir->sn_name); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + return (SET_ERROR(error)); + } + if (eofp != NULL) + *eofp = 1; + return (0); +} + +static int +zfsctl_root_vptocnp(struct vop_vptocnp_args *ap) +{ + static const char dotzfs_name[4] = ".zfs"; + vnode_t *dvp; + int error; + + if (*ap->a_buflen < sizeof (dotzfs_name)) + return (SET_ERROR(ENOMEM)); + + error = vn_vget_ino_gen(ap->a_vp, zfsctl_fs_root_vnode, NULL, + LK_SHARED, &dvp); + if (error != 0) + return (SET_ERROR(error)); + + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= sizeof (dotzfs_name); + bcopy(dotzfs_name, ap->a_buf + *ap->a_buflen, sizeof (dotzfs_name)); + return (0); +} + +static int +zfsctl_common_pathconf(struct vop_pathconf_args *ap) +{ + /* + * We care about ACL variables so that user land utilities like ls + * can display them correctly. Since the ctldir's st_dev is set to be + * the same as the parent dataset, we must support all variables that + * it supports. 
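+ * (For example, FreeBSD's ls(1) is expected to probe _PC_ACL_NFS4 + * before deciding whether to display the "+" ACL marker; the exact + * set of consumers may vary.)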
+ */ + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *ap->a_retval = 64; + return (0); + + case _PC_MIN_HOLE_SIZE: + *ap->a_retval = (int)SPA_MINBLOCKSIZE; + return (0); + + case _PC_ACL_EXTENDED: + *ap->a_retval = 0; + return (0); + + case _PC_ACL_NFS4: + *ap->a_retval = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *ap->a_retval = ACL_MAX_ENTRIES; + return (0); + + case _PC_NAME_MAX: + *ap->a_retval = NAME_MAX; + return (0); + + default: + return (vop_stdpathconf(ap)); + } +} + +/* + * Returns a trivial ACL + */ +static int +zfsctl_common_getacl(struct vop_getacl_args *ap) +{ + int i; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + acl_nfs4_sync_acl_from_mode(ap->a_aclp, zfsctl_ctldir_mode, 0); + /* + * acl_nfs4_sync_acl_from_mode assumes that the owner can always modify + * attributes. That is not the case for the ctldir, so we must clear + * those bits. We also must clear ACL_READ_NAMED_ATTRS, because xattrs + * aren't supported by the ctldir. + */ + for (i = 0; i < ap->a_aclp->acl_cnt; i++) { + struct acl_entry *entry; + entry = &(ap->a_aclp->acl_entry[i]); + entry->ae_perm &= ~(ACL_WRITE_ACL | ACL_WRITE_OWNER | + ACL_WRITE_ATTRIBUTES | ACL_WRITE_NAMED_ATTRS | + ACL_READ_NAMED_ATTRS); + } + + return (0); +} + +static struct vop_vector zfsctl_ops_root = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_ioctl = VOP_EINVAL, + .vop_getattr = zfsctl_root_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_root_readdir, + .vop_lookup = zfsctl_root_lookup, + .vop_inactive = VOP_NULL, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_vptocnp = zfsctl_root_vptocnp, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_root); + +static int +zfsctl_snapshot_zname(vnode_t *vp, const char *name, int len, char *zname) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + + dmu_objset_name(os, zname); + if (strlen(zname) + 1 + strlen(name) >= len) + return (SET_ERROR(ENAMETOOLONG)); + (void) strcat(zname, "@"); + (void) strcat(zname, name); + return (0); +} + +static int +zfsctl_snapshot_lookup(vnode_t *vp, const char *name, uint64_t *id) +{ + objset_t *os = ((zfsvfs_t *)((vp)->v_vfsp->vfs_data))->z_os; + int err; + + err = dsl_dataset_snap_lookup(dmu_objset_ds(os), name, id); + return (err); +} + +/* + * Given a vnode, get the root vnode of a filesystem mounted on top of + * the vnode, if any. The root vnode is referenced and locked. + * If no filesystem is mounted then the original vnode remains referenced + * and locked. If any error happens the original vnode is unlocked and + * released.
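+ * EJUSTRETURN is used as the "nothing mounted here" indicator so + * that callers can tell it apart from a real error.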
+ */ +static int +zfsctl_mounted_here(vnode_t **vpp, int flags) +{ + struct mount *mp; + int err; + + ASSERT_VOP_LOCKED(*vpp, __func__); + ASSERT3S((*vpp)->v_type, ==, VDIR); + + if ((mp = (*vpp)->v_mountedhere) != NULL) { + err = vfs_busy(mp, 0); + KASSERT(err == 0, ("vfs_busy(mp, 0) failed with %d", err)); + KASSERT(vrefcnt(*vpp) > 1, ("unreferenced mountpoint")); + vput(*vpp); + err = VFS_ROOT(mp, flags, vpp); + vfs_unbusy(mp); + return (err); + } + return (EJUSTRETURN); +} + +typedef struct { + const char *snap_name; + uint64_t snap_id; +} snapshot_setup_arg_t; + +static void +zfsctl_snapshot_vnode_setup(vnode_t *vp, void *arg) +{ + snapshot_setup_arg_t *ssa = arg; + sfs_node_t *node; + + ASSERT_VOP_ELOCKED(vp, __func__); + + node = sfs_alloc_node(sizeof (sfs_node_t), + ssa->snap_name, ZFSCTL_INO_SNAPDIR, ssa->snap_id); + zfsctl_common_vnode_setup(vp, node); + + /* We have to support recursive locking. */ + VN_LOCK_AREC(vp); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exists, creating the pseudo filesystem vnode as necessary. + * Perform a mount of the associated dataset on top of the vnode. + * There are four possibilities: + * - the snapshot node and vnode do not exist + * - the snapshot vnode is covered by the mounted snapshot + * - the snapshot vnode is not covered yet, the mount operation is in progress + * - the snapshot vnode is not covered, because the snapshot has been unmounted + * The last two states are transient and should be relatively short-lived. + */ +static int +zfsctl_snapdir_lookup(struct vop_lookup_args *ap) +{ + vnode_t *dvp = ap->a_dvp; + vnode_t **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + char name[NAME_MAX + 1]; + char fullname[ZFS_MAX_DATASET_NAME_LEN]; + char *mountpoint; + size_t mountpoint_len; + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + uint64_t snap_id; + int nameiop = cnp->cn_nameiop; + int lkflags = cnp->cn_lkflags; + int flags = cnp->cn_flags; + int err; + + ASSERT(dvp->v_type == VDIR); + + if ((flags & ISLASTCN) != 0 && nameiop != LOOKUP) + return (SET_ERROR(ENOTSUP)); + + if (cnp->cn_namelen == 1 && *cnp->cn_nameptr == '.') { + err = zfsctl_relock_dot(dvp, lkflags & LK_TYPE_MASK); + if (err == 0) + *vpp = dvp; + return (err); + } + if (flags & ISDOTDOT) { + err = vn_vget_ino_gen(dvp, zfsctl_root_vnode, NULL, lkflags, + vpp); + return (err); + } + + if (cnp->cn_namelen >= sizeof (name)) + return (SET_ERROR(ENAMETOOLONG)); + + strlcpy(name, ap->a_cnp->cn_nameptr, ap->a_cnp->cn_namelen + 1); + err = zfsctl_snapshot_lookup(dvp, name, &snap_id); + if (err != 0) + return (SET_ERROR(ENOENT)); + + for (;;) { + snapshot_setup_arg_t ssa; + + ssa.snap_name = name; + ssa.snap_id = snap_id; + err = sfs_vgetx(dvp->v_mount, LK_SHARED, ZFSCTL_INO_SNAPDIR, + snap_id, "zfs", &zfsctl_ops_snapshot, + zfsctl_snapshot_vnode_setup, &ssa, vpp); + if (err != 0) + return (err); + + /* Check if a new vnode has just been created. */ + if (VOP_ISLOCKED(*vpp) == LK_EXCLUSIVE) + break; + + /* + * Check if a snapshot is already mounted on top of the vnode. + */ + err = zfsctl_mounted_here(vpp, lkflags); + if (err != EJUSTRETURN) + return (err); + + /* + * If the vnode is not covered, then either the mount operation + * is in progress or the snapshot has already been unmounted + * but the vnode hasn't been inactivated and reclaimed yet. + * We can try to re-use the vnode in the latter case.
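+ * VI_MOUNT still being set below means that mount_snapshot() is + * in flight, in which case we drop the vnode and retry after + * yielding.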
+ */ + VI_LOCK(*vpp); + if (((*vpp)->v_iflag & VI_MOUNT) == 0) { + /* + * Upgrade to exclusive lock in order to: + * - avoid race conditions + * - satisfy the contract of mount_snapshot() + */ + err = VOP_LOCK(*vpp, LK_TRYUPGRADE | LK_INTERLOCK); + if (err == 0) + break; + } else { + VI_UNLOCK(*vpp); + } + + /* + * In this state we can loop on uncontested locks and starve + * the thread doing the lengthy, non-trivial mount operation. + * So, yield to prevent that from happening. + */ + vput(*vpp); + kern_yield(PRI_USER); + } + + VERIFY0(zfsctl_snapshot_zname(dvp, name, sizeof (fullname), fullname)); + + mountpoint_len = strlen(dvp->v_vfsp->mnt_stat.f_mntonname) + + strlen("/" ZFS_CTLDIR_NAME "/snapshot/") + strlen(name) + 1; + mountpoint = kmem_alloc(mountpoint_len, KM_SLEEP); + (void) snprintf(mountpoint, mountpoint_len, + "%s/" ZFS_CTLDIR_NAME "/snapshot/%s", + dvp->v_vfsp->mnt_stat.f_mntonname, name); + + err = mount_snapshot(curthread, vpp, "zfs", mountpoint, fullname, 0); + kmem_free(mountpoint, mountpoint_len); + if (err == 0) { + /* + * Fix up the root vnode mounted on .zfs/snapshot/. + * + * This is where we lie about our v_vfsp in order to + * make .zfs/snapshot/ accessible over NFS + * without requiring manual mounts of . + */ + ASSERT(VTOZ(*vpp)->z_zfsvfs != zfsvfs); + VTOZ(*vpp)->z_zfsvfs->z_parent = zfsvfs; + + /* Clear the root flag (set via VFS_ROOT) as well. */ + (*vpp)->v_vflag &= ~VV_ROOT; + } + + if (err != 0) + *vpp = NULL; + return (err); +} + +static int +zfsctl_snapdir_readdir(struct vop_readdir_args *ap) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + struct dirent entry; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + uio_t *uio = ap->a_uio; + int *eofp = ap->a_eofflag; + off_t dots_offset; + int error; + + ASSERT(vp->v_type == VDIR); + + error = sfs_readdir_common(ZFSCTL_INO_ROOT, ZFSCTL_INO_SNAPDIR, ap, uio, + &dots_offset); + if (error != 0) { + if (error == ENAMETOOLONG) /* ran out of destination space */ + error = 0; + return (error); + } + + ZFS_ENTER(zfsvfs); + for (;;) { + uint64_t cookie; + uint64_t id; + + cookie = uio->uio_offset - dots_offset; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) { + if (eofp != NULL) + *eofp = 1; + error = 0; + } + ZFS_EXIT(zfsvfs); + return (error); + } + + entry.d_fileno = id; + entry.d_type = DT_DIR; + strcpy(entry.d_name, snapname); + entry.d_namlen = strlen(entry.d_name); + entry.d_reclen = sizeof (entry); + error = vfs_read_dirent(ap, &entry, uio->uio_offset); + if (error != 0) { + if (error == ENAMETOOLONG) + error = 0; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(error)); + } + uio->uio_offset = cookie + dots_offset; + } + /* NOTREACHED */ +} + +static int +zfsctl_snapdir_getattr(struct vop_getattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + dsl_dataset_t *ds; + uint64_t snap_count; + int err; + + ZFS_ENTER(zfsvfs); + ds = dmu_objset_ds(zfsvfs->z_os); + zfsctl_common_getattr(vp, vap); + vap->va_ctime = dmu_objset_snap_cmtime(zfsvfs->z_os); + vap->va_mtime = vap->va_ctime; + vap->va_birthtime = vap->va_ctime; + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0) { + err = zap_count(dmu_objset_pool(ds->ds_objset)->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, &snap_count); + if (err != 0) { + 
ZFS_EXIT(zfsvfs); + return (err); + } + vap->va_nlink += snap_count; + } + vap->va_size = vap->va_nlink; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static struct vop_vector zfsctl_ops_snapdir = { + .vop_default = &default_vnodeops, + .vop_open = zfsctl_common_open, + .vop_close = zfsctl_common_close, + .vop_getattr = zfsctl_snapdir_getattr, + .vop_access = zfsctl_common_access, + .vop_readdir = zfsctl_snapdir_readdir, + .vop_lookup = zfsctl_snapdir_lookup, + .vop_reclaim = zfsctl_common_reclaim, + .vop_fid = zfsctl_common_fid, + .vop_print = zfsctl_common_print, + .vop_pathconf = zfsctl_common_pathconf, + .vop_getacl = zfsctl_common_getacl, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapdir); + + +static int +zfsctl_snapshot_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + VERIFY(vrecycle(vp) == 1); + return (0); +} + +static int +zfsctl_snapshot_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + void *data = vp->v_data; + + sfs_reclaim_vnode(vp); + sfs_destroy_node(data); + return (0); +} + +static int +zfsctl_snapshot_vptocnp(struct vop_vptocnp_args *ap) +{ + struct mount *mp; + vnode_t *dvp; + vnode_t *vp; + sfs_node_t *node; + size_t len; + int locked; + int error; + + vp = ap->a_vp; + node = vp->v_data; + len = strlen(node->sn_name); + if (*ap->a_buflen < len) + return (SET_ERROR(ENOMEM)); + + /* + * Prevent unmounting of the snapshot while the vnode lock + * is not held. That is not strictly required, but allows + * us to assert that an uncovered snapshot vnode is never + * "leaked". + */ + mp = vp->v_mountedhere; + if (mp == NULL) + return (SET_ERROR(ENOENT)); + error = vfs_busy(mp, 0); + KASSERT(error == 0, ("vfs_busy(mp, 0) failed with %d", error)); + + /* + * We can vput the vnode as we can now depend on the reference owned + * by the busied mp. But we also need to hold the vnode, because + * the reference may go after vfs_unbusy() which has to be called + * before we can lock the vnode again. + */ + locked = VOP_ISLOCKED(vp); +#if __FreeBSD_version >= 1300045 + enum vgetstate vs = vget_prep(vp); +#else + vhold(vp); +#endif + vput(vp); + + /* Look up .zfs/snapshot, our parent. */ + error = zfsctl_snapdir_vnode(vp->v_mount, NULL, LK_SHARED, &dvp); + if (error == 0) { + VOP_UNLOCK1(dvp); + *ap->a_vpp = dvp; + *ap->a_buflen -= len; + bcopy(node->sn_name, ap->a_buf + *ap->a_buflen, len); + } + vfs_unbusy(mp); +#if __FreeBSD_version >= 1300045 + vget_finish(vp, locked | LK_RETRY, vs); +#else + vget(vp, locked | LK_VNHELD | LK_RETRY, curthread); +#endif + return (error); +} + +/* + * These VP's should never see the light of day. They should always + * be covered. 
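+ * vop_default is deliberately left NULL below, so these vnodes never
+ * inherit generic fallbacks for operations that are not explicitly
+ * listed in the vector.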
+ */ +static struct vop_vector zfsctl_ops_snapshot = { + .vop_default = NULL, /* ensure very restricted access */ + .vop_inactive = zfsctl_snapshot_inactive, +#if __FreeBSD_version >= 1300045 + .vop_need_inactive = vop_stdneed_inactive, +#endif + .vop_reclaim = zfsctl_snapshot_reclaim, + .vop_vptocnp = zfsctl_snapshot_vptocnp, + .vop_lock1 = vop_stdlock, + .vop_unlock = vop_stdunlock, + .vop_islocked = vop_stdislocked, + .vop_advlockpurge = vop_stdadvlockpurge, /* called by vgone */ + .vop_print = zfsctl_common_print, +}; +VFS_VOP_VECTOR_REGISTER(zfsctl_ops_snapshot); + +int +zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp) +{ + zfsvfs_t *zfsvfs __unused = vfsp->vfs_data; + vnode_t *vp; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + *zfsvfsp = NULL; + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, objsetid, &vp); + if (error == 0 && vp != NULL) { + /* + * XXX Probably need to at least reference, if not busy, the mp. + */ + if (vp->v_mountedhere != NULL) + *zfsvfsp = vp->v_mountedhere->mnt_data; + vput(vp); + } + if (*zfsvfsp == NULL) + return (SET_ERROR(EINVAL)); + return (0); +} + +/* + * Unmount any snapshots for the given filesystem. This is called from + * zfs_umount() - if we have a ctldir, then go through and unmount all the + * snapshots. + */ +int +zfsctl_umount_snapshots(vfs_t *vfsp, int fflags, cred_t *cr) +{ + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct mount *mp; + vnode_t *vp; + uint64_t cookie; + int error; + + ASSERT(zfsvfs->z_ctldir != NULL); + + cookie = 0; + for (;;) { + uint64_t id; + + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, sizeof (snapname), + snapname, &id, &cookie, NULL); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error != 0) { + if (error == ENOENT) + error = 0; + break; + } + + for (;;) { + error = sfs_vnode_get(vfsp, LK_EXCLUSIVE, + ZFSCTL_INO_SNAPDIR, id, &vp); + if (error != 0 || vp == NULL) + break; + + mp = vp->v_mountedhere; + + /* + * v_mountedhere being NULL means that the + * (uncovered) vnode is in a transient state + * (mounting or unmounting), so loop until it + * settles down. + */ + if (mp != NULL) + break; + vput(vp); + } + if (error != 0) + break; + if (vp == NULL) + continue; /* no mountpoint, nothing to do */ + + /* + * The mount-point vnode is kept locked to avoid spurious EBUSY + * from a concurrent umount. + * The vnode lock must have recursive locking enabled. 
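+		 * (Recursion is possible because zfsctl_snapshot_vnode_setup()
+		 * enabled VN_LOCK_AREC() on this vnode when it was created.)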
+ */ + vfs_ref(mp); + error = dounmount(mp, fflags, curthread); + KASSERT_IMPLY(error == 0, vrefcnt(vp) == 1, + ("extra references after unmount")); + vput(vp); + if (error != 0) + break; + } + KASSERT_IMPLY((fflags & MS_FORCE) != 0, error == 0, + ("force unmounting failed")); + return (error); +} + +int +zfsctl_snapshot_unmount(char *snapname, int flags __unused) +{ + vfs_t *vfsp = NULL; + zfsvfs_t *zfsvfs = NULL; + + if (strchr(snapname, '@') == NULL) + return (0); + + int err = getzfsvfs(snapname, &zfsvfs); + if (err != 0) { + ASSERT3P(zfsvfs, ==, NULL); + return (0); + } + vfsp = zfsvfs->z_vfs; + + ASSERT(!dsl_pool_config_held(dmu_objset_pool(zfsvfs->z_os))); + + vfs_ref(vfsp); + vfs_unbusy(vfsp); + return (dounmount(vfsp, MS_FORCE, curthread)); +} diff --git a/module/os/freebsd/zfs/zfs_debug.c b/module/os/freebsd/zfs/zfs_debug.c new file mode 100644 index 000000000000..74742ad3669f --- /dev/null +++ b/module/os/freebsd/zfs/zfs_debug.c @@ -0,0 +1,251 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include +#include + +typedef struct zfs_dbgmsg { + list_node_t zdm_node; + time_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +list_t zfs_dbgmsgs; +int zfs_dbgmsg_size = 0; +kmutex_t zfs_dbgmsgs_lock; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ +kstat_t *zfs_dbgmsg_kstat; + +/* + * Internal ZFS debug messages are enabled by default. + * + * # Print debug messages + * dtrace -n 'zfs-dbgmsg { print(stringof(arg0)); }' + * + * # Disable the kernel debug message log. 
+ * sysctl vfs.zfs.dbgmsg_enable=0 + */ +int zfs_dbgmsg_enable = 1; + +static int +zfs_dbgmsg_headers(char *buf, size_t size) +{ + (void) snprintf(buf, size, "%-12s %-8s\n", "timestamp", "message"); + + return (0); +} + +static int +zfs_dbgmsg_data(char *buf, size_t size, void *data) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)data; + + (void) snprintf(buf, size, "%-12llu %-s\n", + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); + + return (0); +} + +static void * +zfs_dbgmsg_addr(kstat_t *ksp, loff_t n) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)ksp->ks_private; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + if (n == 0) + ksp->ks_private = list_head(&zfs_dbgmsgs); + else if (zdm) + ksp->ks_private = list_next(&zfs_dbgmsgs, zdm); + + return (ksp->ks_private); +} + +static void +zfs_dbgmsg_purge(int max_size) +{ + zfs_dbgmsg_t *zdm; + int size; + + ASSERT(MUTEX_HELD(&zfs_dbgmsgs_lock)); + + while (zfs_dbgmsg_size > max_size) { + zdm = list_remove_head(&zfs_dbgmsgs); + if (zdm == NULL) + return; + + size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + +static int +zfs_dbgmsg_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) + zfs_dbgmsg_purge(0); + + return (0); +} + +void +zfs_dbgmsg_init(void) +{ + list_create(&zfs_dbgmsgs, sizeof (zfs_dbgmsg_t), + offsetof(zfs_dbgmsg_t, zdm_node)); + mutex_init(&zfs_dbgmsgs_lock, NULL, MUTEX_DEFAULT, NULL); + + zfs_dbgmsg_kstat = kstat_create("zfs", 0, "dbgmsg", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (zfs_dbgmsg_kstat) { + zfs_dbgmsg_kstat->ks_lock = &zfs_dbgmsgs_lock; + zfs_dbgmsg_kstat->ks_ndata = UINT32_MAX; + zfs_dbgmsg_kstat->ks_private = NULL; + zfs_dbgmsg_kstat->ks_update = zfs_dbgmsg_update; + kstat_set_raw_ops(zfs_dbgmsg_kstat, zfs_dbgmsg_headers, + zfs_dbgmsg_data, zfs_dbgmsg_addr); + kstat_install(zfs_dbgmsg_kstat); + } +} + +void +zfs_dbgmsg_fini(void) +{ + if (zfs_dbgmsg_kstat) + kstat_delete(zfs_dbgmsg_kstat); + /* + * TODO - decide how to make this permanent + */ +#ifdef _KERNEL + mutex_enter(&zfs_dbgmsgs_lock); + zfs_dbgmsg_purge(0); + mutex_exit(&zfs_dbgmsgs_lock); + mutex_destroy(&zfs_dbgmsgs_lock); +#endif +} + +void +__zfs_dbgmsg(char *buf) +{ + zfs_dbgmsg_t *zdm; + int size; + + DTRACE_PROBE1(zfs__dbgmsg, char *, buf); + + size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strcpy(zdm->zdm_msg, buf); + + mutex_enter(&zfs_dbgmsgs_lock); + list_insert_tail(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs_lock); +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +#ifdef _KERNEL +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + size_t size; + char *buf; + char *nl; + int i; + + size = 1024; + buf = kmem_alloc(size, KM_SLEEP); + + /* + * Get rid of annoying prefix to filename. 
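+	 * E.g. "module/os/freebsd/zfs/zfs_debug.c" becomes "zfs_debug.c".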
+ */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + i = snprintf(buf, size, "%s:%d:%s(): ", newfile, line, func); + + if (i < size) { + va_start(adx, fmt); + (void) vsnprintf(buf + i, size - i, fmt, adx); + va_end(adx); + } + + /* + * Get rid of trailing newline. + */ + nl = strrchr(buf, '\n'); + if (nl != NULL) + *nl = '\0'; + + __zfs_dbgmsg(buf); + + kmem_free(buf, size); +} + +#else + +void +zfs_dbgmsg_print(const char *tag) +{ + zfs_dbgmsg_t *zdm; + + (void) printf("ZFS_DBGMSG(%s):\n", tag); + mutex_enter(&zfs_dbgmsgs_lock); + for (zdm = list_head(&zfs_dbgmsgs); zdm; + zdm = list_next(&zfs_dbgmsgs, zdm)) + (void) printf("%s\n", zdm->zdm_msg); + mutex_exit(&zfs_dbgmsgs_lock); +} +#endif /* _KERNEL */ + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_enable, INT, ZMOD_RW, + "Enable ZFS debug message log"); + +ZFS_MODULE_PARAM(zfs, zfs_, dbgmsg_maxsize, INT, ZMOD_RW, + "Maximum ZFS debug log size"); +/* END CSTYLED */ diff --git a/module/os/freebsd/zfs/zfs_dir.c b/module/os/freebsd/zfs/zfs_dir.c new file mode 100644 index 000000000000..4b1f4a8832e0 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_dir.c @@ -0,0 +1,967 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +/* + * zfs_match_find() is used by zfs_dirent_lookup() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, const char *name, + matchtype_t mt, uint64_t *zoid) +{ + int error; + + if (zfsvfs->z_norm) { + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. + */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, NULL, 0, NULL); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Look up a directory entry under a locked vnode. + * dvp being locked gives us a guarantee that there are no concurrent + * modification of the directory and, thus, if a node can be found in + * the directory, then it must not be unlinked. 
+ * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZXATTR: we want dzp's xattr directory + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + */ +int +zfs_dirent_lookup(znode_t *dzp, const char *name, znode_t **zpp, int flag) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + matchtype_t mt = 0; + uint64_t zoid; + int error = 0; + + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + + *zpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if (name[0] == '.' && + (((name[1] == '\0') || (name[1] == '.' && name[2] == '\0')) || + (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0))) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect how we perform zap lookups. + * + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if (zfsvfs->z_case == ZFS_CASE_MIXED) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * NB: we do not need to worry about this flag for ZFS_CASE_SENSITIVE + * because in that case MT_EXACT and MT_FIRST should produce exactly + * the same result. + */ + + if (dzp->z_unlinked && !(flag & ZXATTR)) + return (ENOENT); + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? 
ENOENT : 0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + return (error); + } + } else { + if (flag & ZNEW) { + return (SET_ERROR(EEXIST)); + } + error = zfs_zget(zfsvfs, zoid, &zp); + if (error) + return (error); + ASSERT(!zp->z_unlinked); + *zpp = zp; + } + + return (0); +} + +static int +zfs_dd_lookup(znode_t *dzp, znode_t **zpp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + znode_t *zp; + uint64_t parent; + int error; + + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); + + if (dzp->z_unlinked) + return (ENOENT); + + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + return (error); +} + +int +zfs_dirlook(znode_t *dzp, const char *name, znode_t **zpp) +{ + zfsvfs_t *zfsvfs __unused = dzp->z_zfsvfs; + znode_t *zp = NULL; + int error = 0; + +#ifdef ZFS_DEBUG + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_LOCKED(ZTOV(dzp), __func__); + ASSERT(RRM_READ_HELD(&zfsvfs->z_teardown_lock)); +#endif + if (dzp->z_unlinked) + return (SET_ERROR(ENOENT)); + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + error = zfs_dd_lookup(dzp, &zp); + if (error == 0) + *zpp = zp; + } else { + error = zfs_dirent_lookup(dzp, name, &zp, ZEXISTS); + if (error == 0) { + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + *zpp = zp; + } + } + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp->z_unlinked); + ASSERT(zp->z_links == 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + dmu_tx_t *tx; + int error; + + /* + * Iterate over the contents of the unlinked set. 
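+	 * Each entry names an object whose link count was zero when we
+	 * crashed or unmounted; re-marking it below lets the normal
+	 * inactivation path finish reclaiming it.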
+ */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + vn_lock(ZTOV(zp), LK_EXCLUSIVE | LK_RETRY); + + /* + * Due to changes in zfs_rmnode we need to make sure the + * link count is set to zero here. + */ + if (zp->z_links != 0) { + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + vput(ZTOV(zp)); + continue; + } + zp->z_links = 0; + VERIFY0(sa_update(zp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &zp->z_links, sizeof (zp->z_links), tx)); + dmu_tx_commit(tx); + } + + zp->z_unlinked = B_TRUE; + vput(ZTOV(zp)); + } + zap_cursor_fini(&zc); +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + vn_lock(ZTOV(xzp), LK_EXCLUSIVE | LK_RETRY); + ASSERT((ZTOV(xzp)->v_type == VREG) || + (ZTOV(xzp)->v_type == VLNK)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? */ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + vput(ZTOV(xzp)); + skipped += 1; + continue; + } + + error = zfs_link_destroy(dzp, zap.za_name, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + vput(ZTOV(xzp)); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +extern taskq_t *zfsvfs_taskq; + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + uint64_t count; + int error; + + ASSERT(zp->z_links == 0); + if (zfsvfs->z_replay == B_FALSE) + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + + /* + * If this is an attribute directory, purge its contents. 
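+	 * zfs_purgedir() returns the number of entries it failed to
+	 * remove, so any non-zero result leaves this znode on the
+	 * unlinked set for a later retry.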
+ */ + if (ZTOV(zp) != NULL && ZTOV(zp)->v_type == VDIR && + (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } else { + /* + * Free up all the data in the file. We don't do this for + * XATTR directories because we need truncate and remove to be + * in the same tx, like in zfs_znode_delete(). Otherwise, if + * we crash here we'll end up with an inconsistent truncated + * zap object in the delete queue. Note a truncated file is + * harmless since it only contains user data. + */ + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error) + xattr_obj = 0; + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xattr_obj) + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + return; + } + + /* + * FreeBSD's implementation of zfs_zget requires a vnode to back it. + * This means that we could end up calling into getnewvnode while + * calling zfs_rmnode as a result of a prior call to getnewvnode + * trying to clear vnodes out of the cache. If this repeats we can + * recurse enough that we overflow our stack. To avoid this, we + * avoid calling zfs_zget on the xattr znode and instead simply add + * it to the unlinked set and schedule a call to zfs_unlinked_drain. + */ + if (xattr_obj) { + /* Add extended attribute directory to the unlinked set. */ + VERIFY3U(0, ==, + zap_add_int(os, zfsvfs->z_unlinkedobj, xattr_obj, tx)); + } + + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + /* Remove this znode from the unlinked set */ + VERIFY3U(0, ==, + zap_remove_int(os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); + + if (xattr_obj) { + /* + * We're using the FreeBSD taskqueue API here instead of + * the Solaris taskq API since the FreeBSD API allows for a + * task to be enqueued multiple times but executed once. + */ + taskqueue_enqueue(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + } +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (zp->z_zfsvfs->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dzp. Can only fail if zp has been unlinked. 
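+ * (It can also fail with EMLINK if the link count would overflow, or
+ * if the zap_add() of the new directory entry itself fails; see below.)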
+ */ +int +zfs_link_create(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + uint64_t value; + int zp_is_dir = (vp->v_type == VDIR); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (zfsvfs->z_replay == B_FALSE) { + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + } + if (zp_is_dir) { + if (dzp->z_links >= ZFS_LINK_MAX) + return (SET_ERROR(EMLINK)); + } + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + return (SET_ERROR(ENOENT)); + } + if (zp->z_links >= ZFS_LINK_MAX - zp_is_dir) { + return (SET_ERROR(EMLINK)); + } + zp->z_links++; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + + } else { + ASSERT(zp->z_unlinked == 0); + } + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(zp->z_zfsvfs->z_os, dzp->z_id, name, + 8, 1, &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will rollback the SA updates done above. + */ + if (error != 0) { + if (!(flag & ZRENAMING) && !(flag & ZNEW)) + zp->z_links--; + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + dzp->z_size++; + dzp->z_links += zp_is_dir; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode 
normalization form set on fs creation + */ +static int +zfs_dropname(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (zp->z_zfsvfs->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if (zp->z_zfsvfs->z_case == ZFS_CASE_MIXED) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(zp->z_zfsvfs->z_os, dzp->z_id, + name, mt, tx); + } else { + error = zap_remove(zp->z_zfsvfs->z_os, dzp->z_id, name, tx); + } + + return (error); +} + +/* + * Unlink zp from dzp, and mark zp for deletion if this was the last link. + * Can fail if zp is a mount point (EBUSY) or a non-empty directory (EEXIST). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. + */ +int +zfs_link_destroy(znode_t *dzp, const char *name, znode_t *zp, dmu_tx_t *tx, + int flag, boolean_t *unlinkedp) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + int zp_is_dir = (vp->v_type == VDIR); + boolean_t unlinked = B_FALSE; + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + int count = 0; + int error; + + if (zfsvfs->z_replay == B_FALSE) { + ASSERT_VOP_ELOCKED(ZTOV(dzp), __func__); + ASSERT_VOP_ELOCKED(ZTOV(zp), __func__); + } + if (!(flag & ZRENAMING)) { + + if (zp_is_dir && !zfs_dirempty(zp)) + return (SET_ERROR(ENOTEMPTY)); + + /* + * If we get here, we are going to try to remove the object. + * First try removing the name from the directory; if that + * fails, return the error. + */ + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) { + return (error); + } + + if (zp->z_links <= zp_is_dir) { + zfs_panic_recover("zfs: link count on vnode %p is %u, " + "should be at least %u", zp->z_vnode, + (int)zp->z_links, + zp_is_dir + 1); + zp->z_links = zp_is_dir + 1; + } + if (--zp->z_links == zp_is_dir) { + zp->z_unlinked = B_TRUE; + zp->z_links = 0; + unlinked = B_TRUE; + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, sizeof (zp->z_pflags)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &zp->z_links, sizeof (zp->z_links)); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + count = 0; + ASSERT0(error); + } else { + ASSERT(zp->z_unlinked == 0); + error = zfs_dropname(dzp, name, zp, tx, flag); + if (error != 0) + return (error); + } + + dzp->z_size--; /* one dirent removed */ + dzp->z_links -= zp_is_dir; /* ".." link from zp */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &dzp->z_links, sizeof (dzp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), + NULL, ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT0(error); + + if (unlinkedp != NULL) + *unlinkedp = unlinked; + else if (unlinked) + zfs_unlinked_add(zp, tx); + + return (0); +} + +/* + * Indicate whether the directory is empty. 
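+ * A directory's z_size counts its entries and starts at 2 for the
+ * implied "." and ".." entries, so a size of 2 means no real entries.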
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + return (dzp->z_size == 2); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xvpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t parent __unused; + + *xvpp = NULL; + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, 0)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + getnewvnode_reserve_(); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef ZFS_DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + *xvpp = xzp; + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xzpp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + znode_t *xzp; + vattr_t va; + int error; +top: + error = zfs_dirent_lookup(zp, "", &xzp, ZXATTR); + if (error) + return (error); + + if (xzp != NULL) { + *xzpp = xzp; + return (0); + } + + + if (!(flags & CREATE_XATTR_DIR)) + return (SET_ERROR(ENOATTR)); + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. + */ + va.va_mask = AT_MODE | AT_UID | AT_GID; + va.va_type = VDIR; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + error = zfs_make_xattrdir(zp, &va, xzpp, cr); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + if (error == 0) + VOP_UNLOCK1(ZTOV(*xzpp)); + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. 
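+ * (i.e. a directory with the S_ISVTX bit set, such as /tmp).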
+ * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * the entry is a plain file and you have write access, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = zdp->z_zfsvfs; + + if (zdp->z_zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, zdp->z_uid, cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, zp->z_uid, cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + (ZTOV(zp)->v_type == VREG && + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0)) + return (0); + else + return (secpolicy_vnode_remove(ZTOV(zp), cr)); +} diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c new file mode 100644 index 000000000000..ec7c04717c84 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -0,0 +1,309 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ */
+
+#include
+__FBSDID("$FreeBSD$");
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+	struct thread *td;
+	int rc, fd;
+
+	td = curthread;
+	pwd_ensure_dirs();
+	/* 12.x doesn't take a const char * */
+	rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path),
+	    UIO_SYSSPACE, flags, mode);
+	if (rc)
+		return (SET_ERROR(rc));
+	fd = td->td_retval[0];
+	td->td_retval[0] = 0;
+	rc = fget(curthread, fd, &cap_no_rights, fpp);
+	if (rc) {
+		/* Don't leak the descriptor on failure. */
+		kern_close(td, fd);
+		return (SET_ERROR(rc));
+	}
+	return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+	fo_close(fp, curthread);
+}
+
+static int
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_WRITE;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FWRITE) == 0)
+		return (SET_ERROR(EBADF));
+
+	if (fp->f_type == DTYPE_VNODE)
+		bwillwrite();
+
+	rc = fo_write(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	if (resid)
+		*resid = auio.uio_resid;
+	else if (auio.uio_resid)
+		return (SET_ERROR(EIO));
+	*offp += count - auio.uio_resid;
+	return (rc);
+}
+
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_write_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+
+	return (SET_ERROR(rc));
+}
+
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_write_impl(fp, buf, count, &off, resid));
+}
+
+static int
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *offp,
+    ssize_t *resid)
+{
+	ssize_t rc;
+	struct uio auio;
+	struct thread *td;
+	struct iovec aiov;
+
+	td = curthread;
+	aiov.iov_base = (void *)(uintptr_t)buf;
+	aiov.iov_len = count;
+	auio.uio_iov = &aiov;
+	auio.uio_iovcnt = 1;
+	auio.uio_segflg = UIO_SYSSPACE;
+	auio.uio_resid = count;
+	auio.uio_rw = UIO_READ;
+	auio.uio_td = td;
+	auio.uio_offset = *offp;
+
+	if ((fp->f_flag & FREAD) == 0)
+		return (SET_ERROR(EBADF));
+
+	rc = fo_read(fp, &auio, td->td_ucred, FOF_OFFSET, td);
+	if (rc)
+		return (SET_ERROR(rc));
+	*resid = auio.uio_resid;
+	*offp += count - auio.uio_resid;
+	return (0);
+}
+
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_offset;
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off, resid);
+	if (rc == 0)
+		fp->f_offset = off;
+	return (rc);
+}
+
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	return (zfs_file_read_impl(fp, buf, count, &off, resid));
+}
+
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+	int rc;
+	struct thread *td;
+
+	td = curthread;
+	if ((fp->f_ops->fo_flags & DFLAG_SEEKABLE) == 0)
+		return (SET_ERROR(ESPIPE));
+	rc = fo_seek(fp, *offp, whence, td);
+	if (rc == 0)
+		*offp = td->td_uretoff.tdu_off;
+	return (SET_ERROR(rc));
+}
+
+int
+zfs_file_getattr(zfs_file_t *fp, zfs_file_attr_t *zfattr)
+{
+	struct thread *td;
+	struct stat sb;
+	int
rc; + + td = curthread; + + rc = fo_stat(fp, &sb, td->td_ucred, td); + if (rc) + return (SET_ERROR(rc)); + zfattr->zfa_size = sb.st_size; + zfattr->zfa_mode = sb.st_mode; + + return (0); +} + +static __inline int +zfs_vop_fsync(vnode_t *vp) +{ + struct mount *mp; + int error; + + if ((error = vn_start_write(vp, &mp, V_WAIT | PCATCH)) != 0) + goto drop; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + error = VOP_FSYNC(vp, MNT_WAIT, curthread); + VOP_UNLOCK1(vp); + vn_finished_write(mp); +drop: + return (SET_ERROR(error)); +} + +int +zfs_file_fsync(zfs_file_t *fp, int flags) +{ + struct vnode *v; + + if (fp->f_type != DTYPE_VNODE) + return (EINVAL); + + v = fp->f_data; + return (zfs_vop_fsync(v)); +} + +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + struct file *fp; + + if (fget(curthread, fd, &cap_no_rights, &fp)) + return (SET_ERROR(EBADF)); + + *fpp = fp; + return (0); +} + +void +zfs_file_put(int fd) +{ + struct file *fp; + + /* No CAP_ rights required, as we're only releasing. */ + if (fget(curthread, fd, &cap_no_rights, &fp) == 0) { + fdrop(fp, curthread); + fdrop(fp, curthread); + } +} + +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_offset); +} + +void * +zfs_file_private(zfs_file_t *fp) +{ + file_t *tmpfp; + void *data; + int error; + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + curthread->td_fpop = tmpfp; + if (error != 0) + return (NULL); + return (data); +} + +int +zfs_file_unlink(const char *fnamep) +{ + enum uio_seg seg = UIO_SYSSPACE; + int rc; + +#if __FreeBSD_version >= 1300018 + rc = kern_funlinkat(curthread, AT_FDCWD, fnamep, FD_NONE, seg, 0, 0); +#else +#ifdef AT_BENEATH + rc = kern_unlinkat(curthread, AT_FDCWD, fnamep, seg, 0, 0); +#else + rc = kern_unlinkat(curthread, AT_FDCWD, __DECONST(char *, fnamep), + seg, 0); +#endif +#endif + return (SET_ERROR(rc)); +} diff --git a/module/os/freebsd/zfs/zfs_ioctl_compat.c b/module/os/freebsd/zfs/zfs_ioctl_compat.c new file mode 100644 index 000000000000..8dec8644c06e --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ioctl_compat.c @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +enum zfs_ioc_legacy { + ZFS_IOC_LEGACY_NONE = -1, + ZFS_IOC_LEGACY_FIRST = 0, + ZFS_LEGACY_IOC = ZFS_IOC_LEGACY_FIRST, + ZFS_IOC_LEGACY_POOL_CREATE = ZFS_IOC_LEGACY_FIRST, + ZFS_IOC_LEGACY_POOL_DESTROY, + ZFS_IOC_LEGACY_POOL_IMPORT, + ZFS_IOC_LEGACY_POOL_EXPORT, + ZFS_IOC_LEGACY_POOL_CONFIGS, + ZFS_IOC_LEGACY_POOL_STATS, + ZFS_IOC_LEGACY_POOL_TRYIMPORT, + ZFS_IOC_LEGACY_POOL_SCAN, + ZFS_IOC_LEGACY_POOL_FREEZE, + ZFS_IOC_LEGACY_POOL_UPGRADE, + ZFS_IOC_LEGACY_POOL_GET_HISTORY, + ZFS_IOC_LEGACY_VDEV_ADD, + ZFS_IOC_LEGACY_VDEV_REMOVE, + ZFS_IOC_LEGACY_VDEV_SET_STATE, + ZFS_IOC_LEGACY_VDEV_ATTACH, + ZFS_IOC_LEGACY_VDEV_DETACH, + ZFS_IOC_LEGACY_VDEV_SETPATH, + ZFS_IOC_LEGACY_VDEV_SETFRU, + ZFS_IOC_LEGACY_OBJSET_STATS, + ZFS_IOC_LEGACY_OBJSET_ZPLPROPS, + ZFS_IOC_LEGACY_DATASET_LIST_NEXT, + ZFS_IOC_LEGACY_SNAPSHOT_LIST_NEXT, + ZFS_IOC_LEGACY_SET_PROP, + ZFS_IOC_LEGACY_CREATE, + ZFS_IOC_LEGACY_DESTROY, + ZFS_IOC_LEGACY_ROLLBACK, + ZFS_IOC_LEGACY_RENAME, + ZFS_IOC_LEGACY_RECV, + ZFS_IOC_LEGACY_SEND, + ZFS_IOC_LEGACY_INJECT_FAULT, + ZFS_IOC_LEGACY_CLEAR_FAULT, + ZFS_IOC_LEGACY_INJECT_LIST_NEXT, + ZFS_IOC_LEGACY_ERROR_LOG, + ZFS_IOC_LEGACY_CLEAR, + ZFS_IOC_LEGACY_PROMOTE, + ZFS_IOC_LEGACY_DESTROY_SNAPS, + ZFS_IOC_LEGACY_SNAPSHOT, + ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, + ZFS_IOC_LEGACY_OBJ_TO_PATH, + ZFS_IOC_LEGACY_POOL_SET_PROPS, + ZFS_IOC_LEGACY_POOL_GET_PROPS, + ZFS_IOC_LEGACY_SET_FSACL, + ZFS_IOC_LEGACY_GET_FSACL, + ZFS_IOC_LEGACY_SHARE, + ZFS_IOC_LEGACY_INHERIT_PROP, + ZFS_IOC_LEGACY_SMB_ACL, + ZFS_IOC_LEGACY_USERSPACE_ONE, + ZFS_IOC_LEGACY_USERSPACE_MANY, + ZFS_IOC_LEGACY_USERSPACE_UPGRADE, + ZFS_IOC_LEGACY_HOLD, + ZFS_IOC_LEGACY_RELEASE, + ZFS_IOC_LEGACY_GET_HOLDS, + ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, + ZFS_IOC_LEGACY_VDEV_SPLIT, + ZFS_IOC_LEGACY_NEXT_OBJ, + ZFS_IOC_LEGACY_DIFF, + ZFS_IOC_LEGACY_TMP_SNAPSHOT, + ZFS_IOC_LEGACY_OBJ_TO_STATS, + ZFS_IOC_LEGACY_JAIL, + ZFS_IOC_LEGACY_UNJAIL, + ZFS_IOC_LEGACY_POOL_REGUID, + ZFS_IOC_LEGACY_SPACE_WRITTEN, + ZFS_IOC_LEGACY_SPACE_SNAPS, + ZFS_IOC_LEGACY_SEND_PROGRESS, + ZFS_IOC_LEGACY_POOL_REOPEN, + ZFS_IOC_LEGACY_LOG_HISTORY, + ZFS_IOC_LEGACY_SEND_NEW, + ZFS_IOC_LEGACY_SEND_SPACE, + ZFS_IOC_LEGACY_CLONE, + ZFS_IOC_LEGACY_BOOKMARK, + ZFS_IOC_LEGACY_GET_BOOKMARKS, + ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, + ZFS_IOC_LEGACY_NEXTBOOT, + ZFS_IOC_LEGACY_CHANNEL_PROGRAM, + ZFS_IOC_LEGACY_REMAP, + ZFS_IOC_LEGACY_POOL_CHECKPOINT, + ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, + ZFS_IOC_LEGACY_POOL_INITIALIZE, + ZFS_IOC_LEGACY_POOL_SYNC, + ZFS_IOC_LEGACY_LAST +}; + +unsigned static long zfs_ioctl_legacy_to_ozfs_[] = { + ZFS_IOC_POOL_CREATE, /* 0x00 */ + ZFS_IOC_POOL_DESTROY, /* 0x01 */ + ZFS_IOC_POOL_IMPORT, /* 0x02 */ + ZFS_IOC_POOL_EXPORT, /* 0x03 */ + ZFS_IOC_POOL_CONFIGS, /* 0x04 */ + ZFS_IOC_POOL_STATS, /* 0x05 */ + ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ + ZFS_IOC_POOL_SCAN, /* 0x07 */ + ZFS_IOC_POOL_FREEZE, /* 0x08 */ + ZFS_IOC_POOL_UPGRADE, /* 0x09 */ + ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ + ZFS_IOC_VDEV_ADD, /* 0x0b */ + ZFS_IOC_VDEV_REMOVE, /* 0x0c */ + ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ + ZFS_IOC_VDEV_ATTACH, /* 0x0e */ + ZFS_IOC_VDEV_DETACH, /* 0x0f */ + ZFS_IOC_VDEV_SETPATH, /* 0x10 */ + ZFS_IOC_VDEV_SETFRU, /* 0x11 */ + ZFS_IOC_OBJSET_STATS, /* 0x12 */ + ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ + ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ + ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ + ZFS_IOC_SET_PROP, /* 0x16 */ + ZFS_IOC_CREATE, /* 0x17 
*/ + ZFS_IOC_DESTROY, /* 0x18 */ + ZFS_IOC_ROLLBACK, /* 0x19 */ + ZFS_IOC_RENAME, /* 0x1a */ + ZFS_IOC_RECV, /* 0x1b */ + ZFS_IOC_SEND, /* 0x1c */ + ZFS_IOC_INJECT_FAULT, /* 0x1d */ + ZFS_IOC_CLEAR_FAULT, /* 0x1e */ + ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ + ZFS_IOC_ERROR_LOG, /* 0x20 */ + ZFS_IOC_CLEAR, /* 0x21 */ + ZFS_IOC_PROMOTE, /* 0x22 */ + /* start of mismatch */ + + ZFS_IOC_DESTROY_SNAPS, /* 0x23:0x3b */ + ZFS_IOC_SNAPSHOT, /* 0x24:0x23 */ + ZFS_IOC_DSOBJ_TO_DSNAME, /* 0x25:0x24 */ + ZFS_IOC_OBJ_TO_PATH, /* 0x26:0x25 */ + ZFS_IOC_POOL_SET_PROPS, /* 0x27:0x26 */ + ZFS_IOC_POOL_GET_PROPS, /* 0x28:0x27 */ + ZFS_IOC_SET_FSACL, /* 0x29:0x28 */ + ZFS_IOC_GET_FSACL, /* 0x30:0x29 */ + ZFS_IOC_SHARE, /* 0x2b:0x2a */ + ZFS_IOC_INHERIT_PROP, /* 0x2c:0x2b */ + ZFS_IOC_SMB_ACL, /* 0x2d:0x2c */ + ZFS_IOC_USERSPACE_ONE, /* 0x2e:0x2d */ + ZFS_IOC_USERSPACE_MANY, /* 0x2f:0x2e */ + ZFS_IOC_USERSPACE_UPGRADE, /* 0x30:0x2f */ + ZFS_IOC_HOLD, /* 0x31:0x30 */ + ZFS_IOC_RELEASE, /* 0x32:0x31 */ + ZFS_IOC_GET_HOLDS, /* 0x33:0x32 */ + ZFS_IOC_OBJSET_RECVD_PROPS, /* 0x34:0x33 */ + ZFS_IOC_VDEV_SPLIT, /* 0x35:0x34 */ + ZFS_IOC_NEXT_OBJ, /* 0x36:0x35 */ + ZFS_IOC_DIFF, /* 0x37:0x36 */ + ZFS_IOC_TMP_SNAPSHOT, /* 0x38:0x37 */ + ZFS_IOC_OBJ_TO_STATS, /* 0x39:0x38 */ + ZFS_IOC_JAIL, /* 0x3a:0xc2 */ + ZFS_IOC_UNJAIL, /* 0x3b:0xc3 */ + ZFS_IOC_POOL_REGUID, /* 0x3c:0x3c */ + ZFS_IOC_SPACE_WRITTEN, /* 0x3d:0x39 */ + ZFS_IOC_SPACE_SNAPS, /* 0x3e:0x3a */ + ZFS_IOC_SEND_PROGRESS, /* 0x3f:0x3e */ + ZFS_IOC_POOL_REOPEN, /* 0x40:0x3d */ + ZFS_IOC_LOG_HISTORY, /* 0x41:0x3f */ + ZFS_IOC_SEND_NEW, /* 0x42:0x40 */ + ZFS_IOC_SEND_SPACE, /* 0x43:0x41 */ + ZFS_IOC_CLONE, /* 0x44:0x42 */ + ZFS_IOC_BOOKMARK, /* 0x45:0x43 */ + ZFS_IOC_GET_BOOKMARKS, /* 0x46:0x44 */ + ZFS_IOC_DESTROY_BOOKMARKS, /* 0x47:0x45 */ + ZFS_IOC_NEXTBOOT, /* 0x48:0xc1 */ + ZFS_IOC_CHANNEL_PROGRAM, /* 0x49:0x48 */ + ZFS_IOC_REMAP, /* 0x4a:0x4c */ + ZFS_IOC_POOL_CHECKPOINT, /* 0x4b:0x4d */ + ZFS_IOC_POOL_DISCARD_CHECKPOINT, /* 0x4c:0x4e */ + ZFS_IOC_POOL_INITIALIZE, /* 0x4d:0x4f */ +}; + +unsigned static long zfs_ioctl_ozfs_to_legacy_common_[] = { + ZFS_IOC_POOL_CREATE, /* 0x00 */ + ZFS_IOC_POOL_DESTROY, /* 0x01 */ + ZFS_IOC_POOL_IMPORT, /* 0x02 */ + ZFS_IOC_POOL_EXPORT, /* 0x03 */ + ZFS_IOC_POOL_CONFIGS, /* 0x04 */ + ZFS_IOC_POOL_STATS, /* 0x05 */ + ZFS_IOC_POOL_TRYIMPORT, /* 0x06 */ + ZFS_IOC_POOL_SCAN, /* 0x07 */ + ZFS_IOC_POOL_FREEZE, /* 0x08 */ + ZFS_IOC_POOL_UPGRADE, /* 0x09 */ + ZFS_IOC_POOL_GET_HISTORY, /* 0x0a */ + ZFS_IOC_VDEV_ADD, /* 0x0b */ + ZFS_IOC_VDEV_REMOVE, /* 0x0c */ + ZFS_IOC_VDEV_SET_STATE, /* 0x0d */ + ZFS_IOC_VDEV_ATTACH, /* 0x0e */ + ZFS_IOC_VDEV_DETACH, /* 0x0f */ + ZFS_IOC_VDEV_SETPATH, /* 0x10 */ + ZFS_IOC_VDEV_SETFRU, /* 0x11 */ + ZFS_IOC_OBJSET_STATS, /* 0x12 */ + ZFS_IOC_OBJSET_ZPLPROPS, /* 0x13 */ + ZFS_IOC_DATASET_LIST_NEXT, /* 0x14 */ + ZFS_IOC_SNAPSHOT_LIST_NEXT, /* 0x15 */ + ZFS_IOC_SET_PROP, /* 0x16 */ + ZFS_IOC_CREATE, /* 0x17 */ + ZFS_IOC_DESTROY, /* 0x18 */ + ZFS_IOC_ROLLBACK, /* 0x19 */ + ZFS_IOC_RENAME, /* 0x1a */ + ZFS_IOC_RECV, /* 0x1b */ + ZFS_IOC_SEND, /* 0x1c */ + ZFS_IOC_INJECT_FAULT, /* 0x1d */ + ZFS_IOC_CLEAR_FAULT, /* 0x1e */ + ZFS_IOC_INJECT_LIST_NEXT, /* 0x1f */ + ZFS_IOC_ERROR_LOG, /* 0x20 */ + ZFS_IOC_CLEAR, /* 0x21 */ + ZFS_IOC_PROMOTE, /* 0x22 */ + /* start of mismatch */ + ZFS_IOC_LEGACY_SNAPSHOT, /* 0x23 */ + ZFS_IOC_LEGACY_DSOBJ_TO_DSNAME, /* 0x24 */ + ZFS_IOC_LEGACY_OBJ_TO_PATH, /* 0x25 */ + ZFS_IOC_LEGACY_POOL_SET_PROPS, /* 0x26 */ + ZFS_IOC_LEGACY_POOL_GET_PROPS, /* 0x27 */ + ZFS_IOC_LEGACY_SET_FSACL, 
/* 0x28 */ + ZFS_IOC_LEGACY_GET_FSACL, /* 0x29 */ + ZFS_IOC_LEGACY_SHARE, /* 0x2a */ + ZFS_IOC_LEGACY_INHERIT_PROP, /* 0x2b */ + ZFS_IOC_LEGACY_SMB_ACL, /* 0x2c */ + ZFS_IOC_LEGACY_USERSPACE_ONE, /* 0x2d */ + ZFS_IOC_LEGACY_USERSPACE_MANY, /* 0x2e */ + ZFS_IOC_LEGACY_USERSPACE_UPGRADE, /* 0x2f */ + ZFS_IOC_LEGACY_HOLD, /* 0x30 */ + ZFS_IOC_LEGACY_RELEASE, /* 0x31 */ + ZFS_IOC_LEGACY_GET_HOLDS, /* 0x32 */ + ZFS_IOC_LEGACY_OBJSET_RECVD_PROPS, /* 0x33 */ + ZFS_IOC_LEGACY_VDEV_SPLIT, /* 0x34 */ + ZFS_IOC_LEGACY_NEXT_OBJ, /* 0x35 */ + ZFS_IOC_LEGACY_DIFF, /* 0x36 */ + ZFS_IOC_LEGACY_TMP_SNAPSHOT, /* 0x37 */ + ZFS_IOC_LEGACY_OBJ_TO_STATS, /* 0x38 */ + ZFS_IOC_LEGACY_SPACE_WRITTEN, /* 0x39 */ + ZFS_IOC_LEGACY_SPACE_SNAPS, /* 0x3a */ + ZFS_IOC_LEGACY_DESTROY_SNAPS, /* 0x3b */ + ZFS_IOC_LEGACY_POOL_REGUID, /* 0x3c */ + ZFS_IOC_LEGACY_POOL_REOPEN, /* 0x3d */ + ZFS_IOC_LEGACY_SEND_PROGRESS, /* 0x3e */ + ZFS_IOC_LEGACY_LOG_HISTORY, /* 0x3f */ + ZFS_IOC_LEGACY_SEND_NEW, /* 0x40 */ + ZFS_IOC_LEGACY_SEND_SPACE, /* 0x41 */ + ZFS_IOC_LEGACY_CLONE, /* 0x42 */ + ZFS_IOC_LEGACY_BOOKMARK, /* 0x43 */ + ZFS_IOC_LEGACY_GET_BOOKMARKS, /* 0x44 */ + ZFS_IOC_LEGACY_DESTROY_BOOKMARKS, /* 0x45 */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_RECV_NEW */ + ZFS_IOC_LEGACY_POOL_SYNC, /* 0x47 */ + ZFS_IOC_LEGACY_CHANNEL_PROGRAM, /* 0x48 */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_LOAD_KEY */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_UNLOAD_KEY */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_CHANGE_KEY */ + ZFS_IOC_LEGACY_REMAP, /* 0x4c */ + ZFS_IOC_LEGACY_POOL_CHECKPOINT, /* 0x4d */ + ZFS_IOC_LEGACY_POOL_DISCARD_CHECKPOINT, /* 0x4e */ + ZFS_IOC_LEGACY_POOL_INITIALIZE, /* 0x4f */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_POOL_TRIM */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_REDACT */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOKMARK_PROPS */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_WAIT_FS */ +}; + +unsigned static long zfs_ioctl_ozfs_to_legacy_platform_[] = { + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_NEXT */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_CLEAR */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_EVENTS_SEEK */ + ZFS_IOC_LEGACY_NEXTBOOT, + ZFS_IOC_LEGACY_JAIL, + ZFS_IOC_LEGACY_UNJAIL, + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_SET_BOOTENV */ + ZFS_IOC_LEGACY_NONE, /* ZFS_IOC_GET_BOOTENV */ +}; + +int +zfs_ioctl_legacy_to_ozfs(int request) +{ + if (request >= sizeof (zfs_ioctl_legacy_to_ozfs_)/sizeof (long)) + return (-1); + return (zfs_ioctl_legacy_to_ozfs_[request]); +} + +int +zfs_ioctl_ozfs_to_legacy(int request) +{ + if (request > ZFS_IOC_LAST) + return (-1); + + if (request > ZFS_IOC_PLATFORM) + return (zfs_ioctl_ozfs_to_legacy_platform_[request]); + if (request >= sizeof (zfs_ioctl_ozfs_to_legacy_common_)/sizeof (long)) + return (-1); + return (zfs_ioctl_ozfs_to_legacy_common_[request]); +} + +void +zfs_cmd_legacy_to_ozfs(zfs_cmd_legacy_t *src, zfs_cmd_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + memcpy(&dst->zc_begin_record, &src->zc_begin_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_begin_record)); + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_zoneid = src->zc_jailid; +} + +void +zfs_cmd_ozfs_to_legacy(zfs_cmd_t *src, zfs_cmd_legacy_t *dst) +{ + memcpy(dst, src, offsetof(zfs_cmd_t, zc_objset_stats)); + *&dst->zc_objset_stats = *&src->zc_objset_stats; + *&dst->zc_begin_record.drr_u.drr_begin = *&src->zc_begin_record; + dst->zc_begin_record.drr_payloadlen = 0; + 
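+	/* The enclosing record's header is not carried by the new format. */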
dst->zc_begin_record.drr_type = 0; + + memcpy(&dst->zc_inject_record, &src->zc_inject_record, + offsetof(zfs_cmd_t, zc_sendobj) - + offsetof(zfs_cmd_t, zc_inject_record)); + dst->zc_resumable = B_FALSE; + memcpy(&dst->zc_sendobj, &src->zc_sendobj, + sizeof (zfs_cmd_t) - 8 - offsetof(zfs_cmd_t, zc_sendobj)); + dst->zc_jailid = src->zc_zoneid; +} diff --git a/module/os/freebsd/zfs/zfs_ioctl_os.c b/module/os/freebsd/zfs/zfs_ioctl_os.c new file mode 100644 index 000000000000..0e0c16033b15 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_ioctl_os.c @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2020 iXsystems, Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHORS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + */ + +#include +__FBSDID("$FreeBSD$"); + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#if __FreeBSD_version < 1201517 +#define vm_page_max_user_wired vm_page_max_wired +#endif + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + int error = 0; + + if (*zfvp == NULL) + return (SET_ERROR(ESRCH)); + + error = vfs_busy((*zfvp)->z_vfs, 0); + if (error != 0) { + *zfvp = NULL; + error = SET_ERROR(ESRCH); + } + return (error); +} + +int +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_vfs != NULL); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + vfs_unbusy(zfsvfs->z_vfs); +} + +static const zfs_ioc_key_t zfs_keys_nextboot[] = { + {"command", DATA_TYPE_STRING, 0}, + { ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, 0}, + { ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, 0} +}; + +static int +zfs_ioc_jail(zfs_cmd_t *zc) +{ + + return (zone_dataset_attach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_unjail(zfs_cmd_t *zc) +{ + + return (zone_dataset_detach(curthread->td_ucred, zc->zc_name, + (int)zc->zc_zoneid)); +} + +static int +zfs_ioc_nextboot(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) +{ + char name[MAXNAMELEN]; + spa_t *spa; + vdev_t *vd; + char *command; + uint64_t pool_guid; + uint64_t vdev_guid; + int error; + + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) != 0) + return (EINVAL); + if (nvlist_lookup_uint64(innvl, + ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (EINVAL); + if (nvlist_lookup_string(innvl, + "command", &command) != 0) + return (EINVAL); + + mutex_enter(&spa_namespace_lock); + spa = spa_by_guid(pool_guid, vdev_guid); + if (spa != NULL) + strcpy(name, spa_name(spa)); + mutex_exit(&spa_namespace_lock); + if (spa == NULL) + return (ENOENT); + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + spa_vdev_state_enter(spa, SCL_ALL); + vd = spa_lookup_by_guid(spa, vdev_guid, B_TRUE); + if (vd == NULL) { + (void) spa_vdev_state_exit(spa, NULL, ENXIO); + spa_close(spa, FTAG); + return (ENODEV); + } + error = vdev_label_write_pad2(vd, command, strlen(command)); + (void) spa_vdev_state_exit(spa, NULL, 0); + txg_wait_synced(spa->spa_dsl_pool, 0); + spa_close(spa, FTAG); + return (error); +} + +uint64_t +zfs_max_nvlist_src_size_os(void) +{ + if (zfs_max_nvlist_src_size != 0) + return (zfs_max_nvlist_src_size); + + return (ptob(vm_page_max_user_wired) / 4); +} + +void +zfs_ioctl_init_os(void) +{ + zfs_ioctl_register_dataset_nolog(ZFS_IOC_JAIL, zfs_ioc_jail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_UNJAIL, zfs_ioc_unjail, + zfs_secpolicy_config, POOL_CHECK_NONE); + zfs_ioctl_register("fbsd_nextboot", ZFS_IOC_NEXTBOOT, + zfs_ioc_nextboot, zfs_secpolicy_config, NO_NAME, + POOL_CHECK_NONE, B_FALSE, B_FALSE, zfs_keys_nextboot, 3); + +} diff --git a/module/os/freebsd/zfs/zfs_onexit_os.c b/module/os/freebsd/zfs/zfs_onexit_os.c new file mode 100644 index 000000000000..8b22f2fdc3b3 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_onexit_os.c @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_state(minor, ZST_ONEXIT); + if (*zo == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + file_t *fp, *tmpfp; + zfs_onexit_t *zo; + void *data; + int error; + + if ((error = zfs_file_get(fd, &fp))) + return (error); + + tmpfp = curthread->td_fpop; + curthread->td_fpop = fp; + error = devfs_get_cdevpriv(&data); + if (error == 0) + *minorp = (minor_t)(uintptr_t)data; + curthread->td_fpop = tmpfp; + if (error != 0) + return (SET_ERROR(EBADF)); + return (zfs_onexit_minor_to_state(*minorp, &zo)); +} + +void +zfs_onexit_fd_rele(int fd) +{ + zfs_file_put(fd); +} diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c new file mode 100644 index 000000000000..f94ea44335c6 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -0,0 +1,2482 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_comutil.h" + +#ifndef MNTK_VMSETSIZE_BUG +#define MNTK_VMSETSIZE_BUG 0 +#endif +#ifndef MNTK_NOMSYNC +#define MNTK_NOMSYNC 8 +#endif + +/* BEGIN CSTYLED */ +struct mtx zfs_debug_mtx; +MTX_SYSINIT(zfs_debug_mtx, &zfs_debug_mtx, "zfs_debug", MTX_DEF); + +SYSCTL_NODE(_vfs, OID_AUTO, zfs, CTLFLAG_RW, 0, "ZFS file system"); + +int zfs_super_owner; +SYSCTL_INT(_vfs_zfs, OID_AUTO, super_owner, CTLFLAG_RW, &zfs_super_owner, 0, + "File system owner can perform privileged operation on his file systems"); + +int zfs_debug_level; +SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, + "Debug level"); + +SYSCTL_NODE(_vfs_zfs, OID_AUTO, version, CTLFLAG_RD, 0, "ZFS versions"); +static int zfs_version_acl = ZFS_ACL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, acl, CTLFLAG_RD, &zfs_version_acl, 0, + "ZFS_ACL_VERSION"); +static int zfs_version_spa = SPA_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, spa, CTLFLAG_RD, &zfs_version_spa, 0, + "SPA_VERSION"); +static int zfs_version_zpl = ZPL_VERSION; +SYSCTL_INT(_vfs_zfs_version, OID_AUTO, zpl, CTLFLAG_RD, &zfs_version_zpl, 0, + "ZPL_VERSION"); +/* END CSTYLED */ + +static int zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg); +static int zfs_mount(vfs_t *vfsp); +static int zfs_umount(vfs_t *vfsp, int fflag); +static int zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp); +static int zfs_statfs(vfs_t *vfsp, struct statfs *statp); +static int zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp); +static int zfs_sync(vfs_t *vfsp, int waitfor); +#if __FreeBSD_version >= 1300098 +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp, + struct ucred **credanonp, int *numsecflavors, int *secflavors); +#else +static int zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp, + struct ucred **credanonp, int *numsecflavors, int **secflavors); +#endif +static int zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp); +static void zfs_freevfs(vfs_t *vfsp); + +struct vfsops zfs_vfsops = { + .vfs_mount = zfs_mount, + .vfs_unmount = zfs_umount, +#if __FreeBSD_version >= 1300049 + .vfs_root = vfs_cache_root, + .vfs_cachedroot = zfs_root, +#else + .vfs_root = zfs_root, +#endif + .vfs_statfs = zfs_statfs, + .vfs_vget = zfs_vget, + .vfs_sync = zfs_sync, + .vfs_checkexp = zfs_checkexp, + .vfs_fhtovp = zfs_fhtovp, + .vfs_quotactl = zfs_quotactl, +}; + +VFS_SET(zfs_vfsops, zfs, VFCF_JAIL | VFCF_DELEGADMIN); + +/* + * We need to keep a count of active fs's. 
* This is necessary to prevent our module + * from being unloaded after a umount -f + */ +static uint32_t zfs_active_fs_count = 0; + +int +zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val, + char *setpoint) +{ + int error; + zfsvfs_t *zfvp; + vfs_t *vfsp; + objset_t *os; + uint64_t tmp = *val; + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + + error = getzfsvfs_impl(os, &zfvp); + if (error != 0) + return (error); + if (zfvp == NULL) + return (ENOENT); + vfsp = zfvp->z_vfs; + switch (zfs_prop) { + case ZFS_PROP_ATIME: + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) + tmp = 1; + break; + case ZFS_PROP_DEVICES: + if (vfs_optionisset(vfsp, MNTOPT_NODEVICES, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_DEVICES, NULL)) + tmp = 1; + break; + case ZFS_PROP_EXEC: + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) + tmp = 1; + break; + case ZFS_PROP_SETUID: + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) + tmp = 1; + break; + case ZFS_PROP_READONLY: + if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL)) + tmp = 1; + break; + case ZFS_PROP_XATTR: + if (zfvp->z_flags & ZSB_XATTR) + tmp = zfvp->z_xattr; + break; + case ZFS_PROP_NBMAND: + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) + tmp = 0; + if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) + tmp = 1; + break; + default: + vfs_unbusy(vfsp); + return (ENOENT); + } + + vfs_unbusy(vfsp); + if (tmp != *val) { + (void) strcpy(setpoint, "temporary"); + *val = tmp; + } + return (0); +} + +static int +zfs_getquota(zfsvfs_t *zfsvfs, uid_t id, int isgroup, struct dqblk64 *dqp) +{ + int error = 0; + char buf[32]; + uint64_t usedobj, quotaobj; + uint64_t quota, used = 0; + timespec_t now; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) { + error = ENOENT; + goto done; + } + (void) sprintf(buf, "%llx", (longlong_t)id); + if ((error = zap_lookup(zfsvfs->z_os, quotaobj, + buf, sizeof (quota), 1, &quota)) != 0) { + dprintf("%s(%d): quotaobj lookup failed\n", + __FUNCTION__, __LINE__); + goto done; + } + /* + * quota(8) uses bsoftlimit as "quota", and hardlimit as "limit". + * So we set them to be the same. + */ + dqp->dqb_bsoftlimit = dqp->dqb_bhardlimit = btodb(quota); + error = zap_lookup(zfsvfs->z_os, usedobj, buf, sizeof (used), 1, &used); + if (error && error != ENOENT) { + dprintf("%s(%d): usedobj failed; %d\n", + __FUNCTION__, __LINE__, error); + goto done; + } + dqp->dqb_curblocks = btodb(used); + dqp->dqb_ihardlimit = dqp->dqb_isoftlimit = 0; + vfs_timestamp(&now); + /* + * Setting this to 0 causes FreeBSD quota(8) to print + * the number of days since the epoch, which isn't + * particularly useful.
+ */ + dqp->dqb_btime = dqp->dqb_itime = now.tv_sec; +done: + return (error); +} + +static int +zfs_quotactl(vfs_t *vfsp, int cmds, uid_t id, void *arg) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + struct thread *td; + int cmd, type, error = 0; + int bitsize; + zfs_userquota_prop_t quota_type; + struct dqblk64 dqblk = { 0 }; + + td = curthread; + cmd = cmds >> SUBCMDSHIFT; + type = cmds & SUBCMDMASK; + + ZFS_ENTER(zfsvfs); + if (id == -1) { + switch (type) { + case USRQUOTA: + id = td->td_ucred->cr_ruid; + break; + case GRPQUOTA: + id = td->td_ucred->cr_rgid; + break; + default: + error = EINVAL; + if (cmd == Q_QUOTAON || cmd == Q_QUOTAOFF) + vfs_unbusy(vfsp); + goto done; + } + } + /* + * Map BSD type to: + * ZFS_PROP_USERUSED, + * ZFS_PROP_USERQUOTA, + * ZFS_PROP_GROUPUSED, + * ZFS_PROP_GROUPQUOTA + */ + switch (cmd) { + case Q_SETQUOTA: + case Q_SETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERQUOTA; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPQUOTA; + else + error = EINVAL; + break; + case Q_GETQUOTA: + case Q_GETQUOTA32: + if (type == USRQUOTA) + quota_type = ZFS_PROP_USERUSED; + else if (type == GRPQUOTA) + quota_type = ZFS_PROP_GROUPUSED; + else + error = EINVAL; + break; + } + + /* + * Depending on the cmd, we may need to get + * the ruid and domain (see fuidstr_to_sid?), + * the fuid (how?), or other information. + * Create fuid using zfs_fuid_create(zfsvfs, id, + * ZFS_OWNER or ZFS_GROUP, cr, &fuidp)? + * I think I can use just the id? + * + * Look at zfs_id_overquota() to look up a quota. + * zap_lookup(something, quotaobj, fuidstring, + * sizeof (long long), 1, &quota) + * + * See zfs_set_userquota() to set a quota. + */ + if ((uint32_t)type >= MAXQUOTAS) { + error = EINVAL; + goto done; + } + + switch (cmd) { + case Q_GETQUOTASIZE: + bitsize = 64; + error = copyout(&bitsize, arg, sizeof (int)); + break; + case Q_QUOTAON: + // As far as I can tell, you can't turn quotas on or off on zfs + error = 0; + vfs_unbusy(vfsp); + break; + case Q_QUOTAOFF: + error = ENOTSUP; + vfs_unbusy(vfsp); + break; + case Q_SETQUOTA: + error = copyin(arg, &dqblk, sizeof (dqblk)); + if (error == 0) + error = zfs_set_userquota(zfsvfs, quota_type, + "", id, dbtob(dqblk.dqb_bhardlimit)); + break; + case Q_GETQUOTA: + error = zfs_getquota(zfsvfs, id, type == GRPQUOTA, &dqblk); + if (error == 0) + error = copyout(&dqblk, arg, sizeof (dqblk)); + break; + default: + error = EINVAL; + break; + } +done: + ZFS_EXIT(zfsvfs); + return (error); +} + + +boolean_t +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY)); +} + +/*ARGSUSED*/ +static int +zfs_sync(vfs_t *vfsp, int waitfor) +{ + + /* + * Data integrity is job one. We don't want a compromised kernel + * writing to the storage pool, so we never sync during panic. + */ + if (panicstr) + return (0); + + /* + * Ignore the system syncher. ZFS already commits async data + * at zfs_txg_timeout intervals. + */ + if (waitfor == MNT_LAZY) + return (0); + + if (vfsp != NULL) { + /* + * Sync a specific filesystem. + */ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; + int error; + + error = vfs_stdsync(vfsp, waitfor); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool.
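+ * (A suspended pool can make no further progress, so the zil_commit() + * below could otherwise block the shutdown path indefinitely.)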
+ */ + if (rebooting && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == TRUE) { + zfsvfs->z_atime = TRUE; + zfsvfs->z_vfs->vfs_flag &= ~MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_ATIME, NULL, 0); + } else { + zfsvfs->z_atime = FALSE; + zfsvfs->z_vfs->vfs_flag |= MNT_NOATIME; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_ATIME); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOATIME, NULL, 0); + } +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_flags &= ~ZSB_XATTR; + } else { + zfsvfs->z_flags |= ZSB_XATTR; + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; + zfsvfs->z_vfs->mnt_stat.f_iosize = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval) { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RW); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RO, NULL, 0); + } else { + /* XXX locking on vfs_flag? */ + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_RO); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_RW, NULL, 0); + } +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_SETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOSETUID; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOSETUID); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_SETUID, NULL, 0); + } +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == FALSE) { + zfsvfs->z_vfs->vfs_flag |= VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_EXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC, NULL, 0); + } else { + zfsvfs->z_vfs->vfs_flag &= ~VFS_NOEXEC; + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NOEXEC); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_EXEC, NULL, 0); + } +} + +/* + * The nbmand mount option can be changed at mount time. 
+ * We can't allow it to be toggled on live file systems or incorrect + * behavior may be seen from cifs clients + * + * This property isn't registered via dsl_prop_register(), but this callback + * will be called when a file system is first mounted + */ +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + if (newval == FALSE) { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND, NULL, 0); + } else { + vfs_clearmntopt(zfsvfs->z_vfs, MNTOPT_NONBMAND); + vfs_setmntopt(zfsvfs->z_vfs, MNTOPT_NBMAND, NULL, 0); + } +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + uint64_t nbmand; + boolean_t readonly = B_FALSE; + boolean_t do_readonly = B_FALSE; + boolean_t setuid = B_FALSE; + boolean_t do_setuid = B_FALSE; + boolean_t exec = B_FALSE; + boolean_t do_exec = B_FALSE; + boolean_t xattr = B_FALSE; + boolean_t atime = B_FALSE; + boolean_t do_atime = B_FALSE; + boolean_t do_xattr = B_FALSE; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * This function can be called for a snapshot when we update snapshot's + * mount point, which isn't really supported. + */ + if (dmu_objset_is_snapshot(os)) + return (EOPNOTSUPP); + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. 
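+ * For example, a "mount -o noatime" of a dataset with atime=on is + * remembered via do_atime/atime below and re-applied through + * atime_changed_cb() once the callbacks are registered.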
+ */ + if (vfs_optionisset(vfsp, MNTOPT_RO, NULL) || + !spa_writeable(dmu_objset_spa(os))) { + readonly = B_TRUE; + do_readonly = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_RW, NULL)) { + readonly = B_FALSE; + do_readonly = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOSETUID, NULL)) { + setuid = B_FALSE; + do_setuid = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SETUID, NULL)) { + setuid = B_TRUE; + do_setuid = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOEXEC, NULL)) { + exec = B_FALSE; + do_exec = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_EXEC, NULL)) { + exec = B_TRUE; + do_exec = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_OFF; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_XATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_DIRXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_DIR; + do_xattr = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_SAXATTR, NULL)) { + zfsvfs->z_xattr = xattr = ZFS_XATTR_SA; + do_xattr = B_TRUE; + } + if (vfs_optionisset(vfsp, MNTOPT_NOATIME, NULL)) { + atime = B_FALSE; + do_atime = B_TRUE; + } else if (vfs_optionisset(vfsp, MNTOPT_ATIME, NULL)) { + atime = B_TRUE; + do_atime = B_TRUE; + } + + /* + * We need to enter pool configuration here, so that we can use + * dsl_prop_get_int_ds() to handle the special nbmand property below. + * dsl_prop_get_integer() can not be used, because it has to acquire + * spa_namespace_lock and we can not do that because we already hold + * z_teardown_lock. The problem is that spa_write_cachefile() is called + * with spa_namespace_lock held and the function calls ZFS vnode + * operations to write the cache file and thus z_teardown_lock is + * acquired after spa_namespace_lock. + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + + /* + * nbmand is a special property. It can only be changed at + * mount time. + * + * This is weird, but it is documented to only be changeable + * at mount time. + */ + if (vfs_optionisset(vfsp, MNTOPT_NONBMAND, NULL)) { + nbmand = B_FALSE; + } else if (vfs_optionisset(vfsp, MNTOPT_NBMAND, NULL)) { + nbmand = B_TRUE; + } else if ((error = dsl_prop_get_int_ds(ds, "nbmand", &nbmand) != 0)) { + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + return (error); + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (do_readonly) + readonly_changed_cb(zfsvfs, readonly); + if (do_setuid) + setuid_changed_cb(zfsvfs, setuid); + if (do_exec) + exec_changed_cb(zfsvfs, exec); + if (do_xattr) + xattr_changed_cb(zfsvfs, xattr); + if (do_atime) + atime_changed_cb(zfsvfs, atime); + + nbmand_changed_cb(zfsvfs, nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Associate this zfsvfs with the given objset, which must be owned. + * This will cache a bunch of on-disk state from the objset in the + * zfsvfs. + */ +static int +zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + uint64_t val; + + zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error != 0) + return (error); + if (zfsvfs->z_version > + zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) { + (void) printf("Can't mount a version %lld file system " + "on a version %lld pool. Pool must be upgraded to mount " + "this file system.\n", (u_longlong_t)zfsvfs->z_version, + (u_longlong_t)spa_version(dmu_objset_spa(os))); + return (SET_ERROR(ENOTSUP)); + } + error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val); + if (error != 0) + return (error); + zfsvfs->z_norm = (int)val; + + error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val); + if (error != 0) + return (error); + zfsvfs->z_utf8 = (val != 0); + + error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val); + if (error != 0) + return (error); + zfsvfs->z_case = (uint_t)val; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive.
+ */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + } + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], + 8, 1, &zfsvfs->z_projectquota_obj); + if (error == ENOENT) + zfsvfs->z_projectquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], + 8, 1, &zfsvfs->z_userobjquota_obj); + if (error == ENOENT) + zfsvfs->z_userobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], + 8, 1, &zfsvfs->z_groupobjquota_obj); + if (error == ENOENT) + zfsvfs->z_groupobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], + 8, 1, &zfsvfs->z_projectobjquota_obj); + if (error == ENOENT) + zfsvfs->z_projectobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + /* + * Only use the name cache if we are looking for a + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name (which is always the case on + * FreeBSD). 
+ */ + zfsvfs->z_use_namecache = !zfsvfs->z_norm || + ((zfsvfs->z_case == ZFS_CASE_MIXED) && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER)); + + return (0); +} + +taskq_t *zfsvfs_taskq; + +static void +zfsvfs_task_unlinked_drain(void *context, int pending __unused) +{ + + zfs_unlinked_drain((zfsvfs_t *)context); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); + + /* + * XXX: Fix struct statfs so this isn't necessary! + * + * The 'osname' is used as the filesystem's special node, which means + * it must fit in statfs.f_mntfromname, or else it can't be + * enumerated, so libzfs_mnttab_find() returns NULL, which causes + * 'zfs unmount' to think it's not mounted when it is. + */ + if (strlen(osname) >= MNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, + &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + + return (error); +} + + +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + TASK_INIT(&zfsvfs->z_unlinked_drain_task, 0, + zfsvfs_task_unlinked_drain, zfsvfs); +#ifdef DIAGNOSTIC + rrm_init(&zfsvfs->z_teardown_lock, B_TRUE); +#else + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); +#endif + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (int i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. + */ + if (!(zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && + dmu_objset_incompatible_encryption_version(zfsvfs->z_os)) + return (SET_ERROR(EROFS)); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + boolean_t readonly; + + ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + + /* + * During replay we remove the read only flag to + * allow replays to succeed. 
+ */ + readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; + if (readonly != 0) { + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + } else { + dsl_dir_t *dd; + zap_stats_t zs; + + if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + &zs) == 0) { + dataset_kstats_update_nunlinks_kstat( + &zfsvfs->z_kstat, zs.zs_num_entries); + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, + "num_entries in unlinked set: %llu", + zs.zs_num_entries); + } + + zfs_unlinked_drain(zfsvfs); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; + } + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) { + if (zil_replay_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + } else { + boolean_t use_nc = zfsvfs->z_use_namecache; + zfsvfs->z_use_namecache = B_FALSE; + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, + zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + zfsvfs->z_use_namecache = use_nc; + } + } + + /* restore readonly bit */ + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag |= VFS_RDONLY; + } + + /* + * Set the objset user_ptr to track its zfsvfs. + */ + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + return (0); +} + +extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + +void +zfsvfs_free(zfsvfs_t *zfsvfs) +{ + int i; + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. 
+ */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + + mutex_destroy(&zfsvfs->z_znodes_lock); + mutex_destroy(&zfsvfs->z_lock); + ASSERT(zfsvfs->z_nr_znodes == 0); + list_destroy(&zfsvfs->z_all_znodes); + rrm_destroy(&zfsvfs->z_teardown_lock); + rw_destroy(&zfsvfs->z_teardown_inactive_lock); + rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + dataset_kstats_destroy(&zfsvfs->z_kstat); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} + +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_vfs) { + if (zfsvfs->z_use_fuids) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } else { + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_clear_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } + } + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); +} + +static int +zfs_domount(vfs_t *vfsp, char *osname) +{ + uint64_t recordsize, fsid_guid; + int error = 0; + zfsvfs_t *zfsvfs; + + ASSERT(vfsp); + ASSERT(osname); + + error = zfsvfs_create(osname, vfsp->mnt_flag & MNT_RDONLY, &zfsvfs); + if (error) + return (error); + zfsvfs->z_vfs = vfsp; + + if ((error = dsl_prop_get_integer(osname, + "recordsize", &recordsize, NULL))) + goto out; + zfsvfs->z_vfs->vfs_bsize = SPA_MINBLOCKSIZE; + zfsvfs->z_vfs->mnt_stat.f_iosize = recordsize; + + vfsp->vfs_data = zfsvfs; + vfsp->mnt_flag |= MNT_LOCAL; + vfsp->mnt_kern_flag |= MNTK_LOOKUP_SHARED; + vfsp->mnt_kern_flag |= MNTK_SHARED_WRITES; + vfsp->mnt_kern_flag |= MNTK_EXTENDED_SHARED; + /* + * This can cause a loss of coherence between ARC and page cache + * on ZoF - unclear if the problem is in FreeBSD or ZoF + */ + vfsp->mnt_kern_flag |= MNTK_NO_IOPF; /* vn_io_fault can be used */ + vfsp->mnt_kern_flag |= MNTK_NOMSYNC; + vfsp->mnt_kern_flag |= MNTK_VMSETSIZE_BUG; + +#if defined(_KERNEL) && !defined(KMEM_DEBUG) + vfsp->mnt_kern_flag |= MNTK_FPLOOKUP; +#endif + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + (vfsp->mnt_vfc->vfc_typenum & 0xFF); + + /* + * Set features for file system. 
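+ * (First the FUID-dependent attribute features via zfs_set_fuid_feature(), + * then the case-insensitivity flags derived from the dataset's z_case + * value below.)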
+ */ + zfs_set_fuid_feature(zfsvfs); + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + vfs_set_feature(vfsp, VFSFT_NOCASESENSITIVE); + } else if (zfsvfs->z_case == ZFS_CASE_MIXED) { + vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); + vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); + } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); + + if (dmu_objset_is_snapshot(zfsvfs->z_os)) { + uint64_t pval; + + atime_changed_cb(zfsvfs, B_FALSE); + readonly_changed_cb(zfsvfs, B_TRUE); + if ((error = dsl_prop_get_integer(osname, + "xattr", &pval, NULL))) + goto out; + xattr_changed_cb(zfsvfs, pval); + zfsvfs->z_issnap = B_TRUE; + zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + } else { + if ((error = zfsvfs_setup(zfsvfs, B_TRUE))) + goto out; + } + + vfs_mountedfrom(vfsp, osname); + + if (!zfsvfs->z_issnap) + zfsctl_create(zfsvfs); +out: + if (error) { + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } else { + atomic_inc_32(&zfs_active_fs_count); + } + + return (error); +} + +static void +zfs_unregister_callbacks(zfsvfs_t *zfsvfs) +{ + objset_t *os = zfsvfs->z_os; + + if (!dmu_objset_is_snapshot(os)) + dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs); +} + +#ifdef SECLABEL +/* + * Convert a decimal digit string to a uint64_t integer. + */ +static int +str_to_uint64(char *str, uint64_t *objnum) +{ + uint64_t num = 0; + + while (*str) { + if (*str < '0' || *str > '9') + return (SET_ERROR(EINVAL)); + + num = num*10 + *str++ - '0'; + } + + *objnum = num; + return (0); +} + +/* + * The boot path passed from the boot loader is in the form of + * "rootpool-name/root-filesystem-object-number". Convert this + * string to a dataset name: "rootpool-name/root-filesystem-name". + */ +static int +zfs_parse_bootfs(char *bpath, char *outpath) +{ + char *slashp; + uint64_t objnum; + int error; + + if (*bpath == 0 || *bpath == '/') + return (SET_ERROR(EINVAL)); + + (void) strcpy(outpath, bpath); + + slashp = strchr(bpath, '/'); + + /* if no '/', just return the pool name */ + if (slashp == NULL) { + return (0); + } + + /* if not a number, just return the root dataset name */ + if (str_to_uint64(slashp+1, &objnum)) { + return (0); + } + + *slashp = '\0'; + error = dsl_dsobj_to_dsname(bpath, objnum, outpath); + *slashp = '/'; + + return (error); +} + +/* + * Check that the hex label string is appropriate for the dataset being + * mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (SET_ERROR(EACCES)); + return (rdonly ? 0 : EACCES); + } + return (SET_ERROR(EACCES)); +} + +/* + * Determine whether the mount is allowed according to MAC check, + * by comparing (where appropriate) the label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one.
+ * + * Returns 0 if access allowed, error otherwise (e.g. EACCES) + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (SET_ERROR(EACCES)); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (SET_ERROR(EACCES)); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(vfsp->vfs_mntpt, B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (SET_ERROR(EACCES)); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set_string(osname, + zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} +#endif /* SECLABEL */ + +static int +getpoolname(const char *osname, char *poolname) +{ + char *p; + + p = strchr(osname, '/'); + if (p == NULL) { + if (strlen(osname) >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strcpy(poolname, osname); + } else { + if (p - osname >= MAXNAMELEN) + return (ENAMETOOLONG); + (void) strncpy(poolname, osname, p - osname); + poolname[p - osname] = '\0'; + } + return (0); +} + +/*ARGSUSED*/ +static int +zfs_mount(vfs_t *vfsp) +{ + kthread_t *td = curthread; + vnode_t *mvp = vfsp->mnt_vnodecovered; + cred_t *cr = td->td_ucred; + char *osname; + int error = 0; + int canwrite; + + if (vfs_getopt(vfsp->mnt_optnew, "from", (void **)&osname, NULL)) + return (SET_ERROR(EINVAL)); + + /* + * If full-owner-access is enabled and delegated administration is + * turned on, we must set nosuid. 
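+ * (secpolicy_fs_mount_clearopts() below enforces this by stripping the + * privileged mount options from a delegated, non-root owner's mount.)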
+ */ + if (zfs_super_owner && + dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != ECANCELED) { + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Check for mount privilege? + * + * If we don't have privilege then see if + * we have local permission to allow it + */ + error = secpolicy_fs_mount(cr, mvp, vfsp); + if (error) { + if (dsl_deleg_access(osname, ZFS_DELEG_PERM_MOUNT, cr) != 0) + goto out; + + if (!(vfsp->vfs_flag & MS_REMOUNT)) { + vattr_t vattr; + + /* + * Make sure user is the owner of the mount point + * or has sufficient privileges. + */ + + vattr.va_mask = AT_UID; + + vn_lock(mvp, LK_SHARED | LK_RETRY); + if (VOP_GETATTR(mvp, &vattr, cr)) { + VOP_UNLOCK1(mvp); + goto out; + } + + if (secpolicy_vnode_owner(mvp, cr, vattr.va_uid) != 0 && + VOP_ACCESS(mvp, VWRITE, cr, td) != 0) { + VOP_UNLOCK1(mvp); + goto out; + } + VOP_UNLOCK1(mvp); + } + + secpolicy_fs_mount_clearopts(cr, vfsp); + } + + /* + * Refuse to mount a filesystem if we are in a local zone and the + * dataset is not visible. + */ + if (!INGLOBALZONE(curproc) && + (!zone_dataset_visible(osname, &canwrite) || !canwrite)) { + error = SET_ERROR(EPERM); + goto out; + } + +#ifdef SECLABEL + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; +#endif + + vfsp->vfs_flag |= MNT_NFS4ACLS; + + /* + * When doing a remount, we simply refresh our temporary properties + * according to those options set in the current VFS options. + */ + if (vfsp->vfs_flag & MS_REMOUNT) { + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + /* + * Refresh mount options with z_teardown_lock blocking I/O while + * the filesystem is in an inconsistent state. + * The lock also serializes this code with filesystem + * manipulations between entry to zfs_suspend_fs() and return + * from zfs_resume_fs(). + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfs_unregister_callbacks(zfsvfs); + error = zfs_register_callbacks(vfsp); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + goto out; + } + + /* Initial root mount: try hard to import the requested root pool. */ + if ((vfsp->vfs_flag & MNT_ROOTFS) != 0 && + (vfsp->vfs_flag & MNT_UPDATE) == 0) { + char pname[MAXNAMELEN]; + + error = getpoolname(osname, pname); + if (error == 0) + error = spa_import_rootpool(pname, false); + if (error) + goto out; + } + DROP_GIANT(); + error = zfs_domount(vfsp, osname); + PICKUP_GIANT(); + +out: + return (error); +} + +static int +zfs_statfs(vfs_t *vfsp, struct statfs *statp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + uint64_t refdbytes, availbytes, usedobjs, availobjs; + + statp->f_version = STATFS_VERSION; + + ZFS_ENTER(zfsvfs); + + dmu_objset_space(zfsvfs->z_os, + &refdbytes, &availbytes, &usedobjs, &availobjs); + + /* + * The underlying storage pool actually uses multiple block sizes. + * We report the fragsize as the smallest block size we support, + * and we report our blocksize as the filesystem's maximum blocksize. + */ + statp->f_bsize = SPA_MINBLOCKSIZE; + statp->f_iosize = zfsvfs->z_vfs->mnt_stat.f_iosize; + + /* + * The following report "total" blocks of various kinds in the + * file system, but reported in terms of f_frsize - the + * "fragment" size. + */ + + statp->f_blocks = (refdbytes + availbytes) >> SPA_MINBLOCKSHIFT; + statp->f_bfree = availbytes / statp->f_bsize; + statp->f_bavail = statp->f_bfree; /* no root reservation */ + + /* + * statvfs() should really be called statufs(), because it assumes + * static metadata. 
ZFS doesn't preallocate files, so the best + * we can do is report the max that could possibly fit in f_files, + * and that minus the number actually used in f_ffree. + * For f_ffree, report the smaller of the number of objects available + * and the number of blocks (each object will take at least a block). + */ + statp->f_ffree = MIN(availobjs, statp->f_bfree); + statp->f_files = statp->f_ffree + usedobjs; + + /* + * We're a zfs filesystem. + */ + strlcpy(statp->f_fstypename, "zfs", + sizeof (statp->f_fstypename)); + + strlcpy(statp->f_mntfromname, vfsp->mnt_stat.f_mntfromname, + sizeof (statp->f_mntfromname)); + strlcpy(statp->f_mntonname, vfsp->mnt_stat.f_mntonname, + sizeof (statp->f_mntonname)); + + statp->f_namemax = MAXNAMELEN - 1; + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_root(vfs_t *vfsp, int flags, vnode_t **vpp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *vpp = ZTOV(rootzp); + + ZFS_EXIT(zfsvfs); + + if (error == 0) { + error = vn_lock(*vpp, flags); + if (error != 0) { + VN_RELE(*vpp); + *vpp = NULL; + } + } + return (error); +} + +/* + * Teardown the zfsvfs::z_os. + * + * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock' + * and 'z_teardown_inactive_lock' held. + */ +static int +zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) +{ + znode_t *zp; + dsl_dir_t *dd; + + /* + * If someone has not already unmounted this file system, + * drain the zrele_taskq to ensure all active references to the + * zfsvfs_t have been handled; only then can it be safely destroyed. + */ + if (zfsvfs->z_os) { + /* + * If we're unmounting we have to wait for the list to + * drain completely. + * + * If we're not unmounting there's no guarantee the list + * will drain completely, but zreles run from the taskq + * may add the parents of dir-based xattrs to the taskq + * so we want to wait for these. + * + * We can safely read z_nr_znodes without locking because the + * VFS has already blocked operations which add to the + * z_all_znodes list and thus increment z_nr_znodes. + */ + int round = 0; + while (zfsvfs->z_nr_znodes > 0) { + taskq_wait_outstanding(dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os)), 0); + if (++round > 1 && !unmounting) + break; + } + } + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + + if (!unmounting) { + /* + * We purge the parent filesystem's vfsp as the parent + * filesystem and all of its snapshots have their vnode's + * v_vfsp set to the parent's filesystem's vfsp. Note, + * 'z_parent' is self referential for non-snapshots. + */ +#ifdef FREEBSD_NAMECACHE + cache_purgevfs(zfsvfs->z_parent->z_vfs, true); +#endif + } + + /* + * Close the zil. NB: Can't close the zil while zfs_inactive + * threads are blocked as zil_close can call zfs_inactive. + */ + if (zfsvfs->z_log) { + zil_close(zfsvfs->z_log); + zfsvfs->z_log = NULL; + } + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER); + + /* + * If we are not unmounting (ie: online recv) and someone already + * unmounted this file system while we were doing the switcheroo, + * or a reopen of z_os failed, then just bail out now.
+ */ + if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) { + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + return (SET_ERROR(EIO)); + } + + /* + * At this point there are no vops active, and any new vops will + * fail with EIO since we have z_teardown_lock for writer (only + * relevant for forced unmount). + * + * Release all holds on dbufs. + */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL; + zp = list_next(&zfsvfs->z_all_znodes, zp)) + if (zp->z_sa_hdl) { + ASSERT(ZTOV(zp)->v_count >= 0); + zfs_znode_dmu_fini(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * If we are unmounting, set the unmounted flag and let new vops + * unblock. zfs_inactive will have the unmounted behavior, and all + * other vops will fail with EIO. + */ + if (unmounting) { + zfsvfs->z_unmounted = B_TRUE; + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * z_os will be NULL if there was an error in attempting to reopen + * zfsvfs, so just return as the properties had already been + * unregistered and cached data had been evicted before. + */ + if (zfsvfs->z_os == NULL) + return (0); + + /* + * Unregister properties. + */ + zfs_unregister_callbacks(zfsvfs); + + /* + * Evict cached data + */ + if (!zfs_is_readonly(zfsvfs)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + dmu_objset_evict_dbufs(zfsvfs->z_os); + dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dsl_dir_cancel_waiters(dd); + + return (0); +} + +/*ARGSUSED*/ +static int +zfs_umount(vfs_t *vfsp, int fflag) +{ + kthread_t *td = curthread; + zfsvfs_t *zfsvfs = vfsp->vfs_data; + objset_t *os; + cred_t *cr = td->td_ucred; + int ret; + + ret = secpolicy_fs_unmount(cr, vfsp); + if (ret) { + if (dsl_deleg_access((char *)vfsp->vfs_resource, + ZFS_DELEG_PERM_MOUNT, cr)) + return (ret); + } + + /* + * Unmount any snapshots mounted under .zfs before unmounting the + * dataset itself. + */ + if (zfsvfs->z_ctldir != NULL) { + if ((ret = zfsctl_umount_snapshots(vfsp, fflag, cr)) != 0) + return (ret); + } + + if (fflag & MS_FORCE) { + /* + * Mark file system as unmounted before calling + * vflush(FORCECLOSE). This way we ensure no future vnops + * will be called and risk operating on DOOMED vnodes. + */ + rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG); + zfsvfs->z_unmounted = B_TRUE; + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + } + + /* + * Flush all the files. + */ + ret = vflush(vfsp, 0, (fflag & MS_FORCE) ? FORCECLOSE : 0, td); + if (ret != 0) + return (ret); + while (taskqueue_cancel(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task, NULL) != 0) + taskqueue_drain(zfsvfs_taskq->tq_queue, + &zfsvfs->z_unlinked_drain_task); + + VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0); + os = zfsvfs->z_os; + + /* + * z_os will be NULL if there was an error in + * attempting to reopen zfsvfs. + */ + if (os != NULL) { + /* + * Unset the objset user_ptr. + */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + /* + * We can now safely destroy the '.zfs' directory node. 
+	 */
+	if (zfsvfs->z_ctldir != NULL)
+		zfsctl_destroy(zfsvfs);
+	zfs_freevfs(vfsp);
+
+	return (0);
+}
+
+static int
+zfs_vget(vfs_t *vfsp, ino_t ino, int flags, vnode_t **vpp)
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *zp;
+	int err;
+
+	/*
+	 * zfs_zget() can't operate on virtual entries like .zfs/ or
+	 * .zfs/snapshot/ directories, which is why we return EOPNOTSUPP.
+	 * This will make NFS switch to LOOKUP instead of using VGET.
+	 */
+	if (ino == ZFSCTL_INO_ROOT || ino == ZFSCTL_INO_SNAPDIR ||
+	    (zfsvfs->z_shares_dir != 0 && ino == zfsvfs->z_shares_dir))
+		return (EOPNOTSUPP);
+
+	ZFS_ENTER(zfsvfs);
+	err = zfs_zget(zfsvfs, ino, &zp);
+	if (err == 0 && zp->z_unlinked) {
+		vrele(ZTOV(zp));
+		err = EINVAL;
+	}
+	if (err == 0)
+		*vpp = ZTOV(zp);
+	ZFS_EXIT(zfsvfs);
+	if (err == 0) {
+		err = vn_lock(*vpp, flags);
+		if (err != 0)
+			vrele(*vpp);
+	}
+	if (err != 0)
+		*vpp = NULL;
+	return (err);
+}
+
+static int
+#if __FreeBSD_version >= 1300098
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, uint64_t *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int *secflavors)
+#else
+zfs_checkexp(vfs_t *vfsp, struct sockaddr *nam, int *extflagsp,
+    struct ucred **credanonp, int *numsecflavors, int **secflavors)
+#endif
+{
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+
+	/*
+	 * If this is a regular file system, vfsp is the same as
+	 * zfsvfs->z_parent->z_vfs; but if it is a snapshot,
+	 * zfsvfs->z_parent->z_vfs represents the parent file system,
+	 * which we have to use here, because only that file system
+	 * has mnt_export configured.
+	 */
+	return (vfs_stdcheckexp(zfsvfs->z_parent->z_vfs, nam, extflagsp,
+	    credanonp, numsecflavors, secflavors));
+}
+
+CTASSERT(SHORT_FID_LEN <= sizeof (struct fid));
+CTASSERT(LONG_FID_LEN <= sizeof (struct fid));
+
+static int
+zfs_fhtovp(vfs_t *vfsp, fid_t *fidp, int flags, vnode_t **vpp)
+{
+	struct componentname cn;
+	zfsvfs_t *zfsvfs = vfsp->vfs_data;
+	znode_t *zp;
+	vnode_t *dvp;
+	uint64_t object = 0;
+	uint64_t fid_gen = 0;
+	uint64_t gen_mask;
+	uint64_t zp_gen;
+	int i, err;
+
+	*vpp = NULL;
+
+	ZFS_ENTER(zfsvfs);
+
+	/*
+	 * On FreeBSD we can get the snapshot's mount point or its parent
+	 * file system's mount point, depending on whether the snapshot is
+	 * already mounted.
+	 */
+	if (zfsvfs->z_parent == zfsvfs && fidp->fid_len == LONG_FID_LEN) {
+		zfid_long_t *zlfid = (zfid_long_t *)fidp;
+		uint64_t objsetid = 0;
+		uint64_t setgen = 0;
+
+		for (i = 0; i < sizeof (zlfid->zf_setid); i++)
+			objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zlfid->zf_setgen); i++)
+			setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i);
+
+		ZFS_EXIT(zfsvfs);
+
+		err = zfsctl_lookup_objset(vfsp, objsetid, &zfsvfs);
+		if (err)
+			return (SET_ERROR(EINVAL));
+		ZFS_ENTER(zfsvfs);
+	}
+
+	if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) {
+		zfid_short_t *zfid = (zfid_short_t *)fidp;
+
+		for (i = 0; i < sizeof (zfid->zf_object); i++)
+			object |= ((uint64_t)zfid->zf_object[i]) << (8 * i);
+
+		for (i = 0; i < sizeof (zfid->zf_gen); i++)
+			fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i);
+	} else {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * A zero fid_gen means we are in .zfs or the .zfs/snapshot
+	 * directory tree. If the object == zfsvfs->z_shares_dir, then
+	 * we are in the .zfs/shares directory tree.
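+	 *
+	 * (Note: the zf_object/zf_gen byte arrays above are decoded
+	 * least-significant byte first; e.g., zf_object[] beginning
+	 * { 0x34, 0x12, 0x00, ... } yields object number 0x1234.)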
+ */ + if ((fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) || + (zfsvfs->z_shares_dir != 0 && object == zfsvfs->z_shares_dir)) { + ZFS_EXIT(zfsvfs); + VERIFY0(zfsctl_root(zfsvfs, LK_SHARED, &dvp)); + if (object == ZFSCTL_INO_SNAPDIR) { + cn.cn_nameptr = "snapshot"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN | LOCKLEAF; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else if (object == zfsvfs->z_shares_dir) { + /* + * XXX This branch must not be taken, + * if it is, then the lookup below will + * explode. + */ + cn.cn_nameptr = "shares"; + cn.cn_namelen = strlen(cn.cn_nameptr); + cn.cn_nameiop = LOOKUP; + cn.cn_flags = ISLASTCN; + cn.cn_lkflags = flags; + VERIFY0(VOP_LOOKUP(dvp, vpp, &cn)); + vput(dvp); + } else { + *vpp = dvp; + } + return (err); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%u mask %llx]\n", object, fid_gen, gen_mask); + if ((err = zfs_zget(zfsvfs, object, &zp))) { + ZFS_EXIT(zfsvfs); + return (err); + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%u) != fid gen (%u)\n", zp_gen, fid_gen); + vrele(ZTOV(zp)); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + *vpp = ZTOV(zp); + ZFS_EXIT(zfsvfs); + err = vn_lock(*vpp, flags); + if (err == 0) + vnode_create_vobject(*vpp, zp->z_size, curthread); + else + *vpp = NULL; + return (err); +} + +/* + * Block out VOPs and close zfsvfs_t::z_os + * + * Note, if successful, then we return with the 'z_teardown_lock' and + * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying + * dataset and objset intact so that they can be atomically handed off during + * a subsequent rollback or recv operation and the resume thereafter. + */ +int +zfs_suspend_fs(zfsvfs_t *zfsvfs) +{ + int error; + + if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) + return (error); + + return (0); +} + +/* + * Rebuild SA and release VOPs. Note that ownership of the underlying dataset + * is an invariant across any of the operations that can be performed while the + * filesystem was suspended. Whether it succeeded or failed, the preconditions + * are the same: the relevant objset and associated dataset are owned by + * zfsvfs, held, and long held on entry. + */ +int +zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + int err; + znode_t *zp; + + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just update the objset_t, as the one we + * had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + + err = zfsvfs_init(zfsvfs, os); + if (err != 0) + goto bail; + + ds->ds_dir->dd_activity_cancelled = B_FALSE; + VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0); + + zfs_set_fuid_feature(zfsvfs); + + /* + * Attempt to re-establish all the active znodes with + * their dbufs. If a zfs_rezget() fails, then we'll let + * any potential callers discover that via ZFS_ENTER_VERIFY_VP + * when they try to use their znode. 
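+	 * (A znode whose zfs_rezget() failed is left without its SA
+	 * handle, which is the condition those checks look for.)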
+ */ + mutex_enter(&zfsvfs->z_znodes_lock); + for (zp = list_head(&zfsvfs->z_all_znodes); zp; + zp = list_next(&zfsvfs->z_all_znodes, zp)) { + (void) zfs_rezget(zp); + } + mutex_exit(&zfsvfs->z_znodes_lock); + +bail: + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ + if (vn_vfswlock(zfsvfs->z_vfs->vfs_vnodecovered) == 0) { + vfs_ref(zfsvfs->z_vfs); + (void) dounmount(zfsvfs->z_vfs, MS_FORCE, curthread); + } + } + return (err); +} + +static void +zfs_freevfs(vfs_t *vfsp) +{ + zfsvfs_t *zfsvfs = vfsp->vfs_data; + + zfsvfs_free(zfsvfs); + + atomic_dec_32(&zfs_active_fs_count); +} + +#ifdef __i386__ +static int desiredvnodes_backup; +#include + + +#include +#include +#include +#include +#endif + +static void +zfs_vnodes_adjust(void) +{ +#ifdef __i386__ + int newdesiredvnodes; + + desiredvnodes_backup = desiredvnodes; + + /* + * We calculate newdesiredvnodes the same way it is done in + * vntblinit(). If it is equal to desiredvnodes, it means that + * it wasn't tuned by the administrator and we can tune it down. + */ + newdesiredvnodes = min(maxproc + vm_cnt.v_page_count / 4, 2 * + vm_kmem_size / (5 * (sizeof (struct vm_object) + + sizeof (struct vnode)))); + if (newdesiredvnodes == desiredvnodes) + desiredvnodes = (3 * newdesiredvnodes) / 4; +#endif +} + +static void +zfs_vnodes_adjust_back(void) +{ + +#ifdef __i386__ + desiredvnodes = desiredvnodes_backup; +#endif +} + +void +zfs_init(void) +{ + + printf("ZFS filesystem version: " ZPL_VERSION_STRING "\n"); + + /* + * Initialize .zfs directory structures + */ + zfsctl_init(); + + /* + * Initialize znode cache, vnode ops, etc... + */ + zfs_znode_init(); + + /* + * Reduce number of vnodes. Originally number of vnodes is calculated + * with UFS inode in mind. We reduce it here, because it's too big for + * ZFS/i386. + */ + zfs_vnodes_adjust(); + + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); + + zfsvfs_taskq = taskq_create("zfsvfs", 1, minclsyspri, 0, 0, 0); +} + +void +zfs_fini(void) +{ + taskq_destroy(zfsvfs_taskq); + zfsctl_fini(); + zfs_znode_fini(); + zfs_vnodes_adjust_back(); +} + +int +zfs_busy(void) +{ + return (zfs_active_fs_count != 0); +} + +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. 
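+	 * (The return value is deliberately ignored; z_unmounted is set
+	 * below either way, so any remaining vops will fail with EIO.)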
+ */ + (void) zfs_umount(zfsvfs->z_vfs, 0); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %ju to %ju", (uintmax_t)zfsvfs->z_version, + (uintmax_t)newvers); + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. + */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) { + pname = ZPL_VERSION_STR; + } else { + pname = zfs_prop_to_name(prop); + } + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the corresponding vfs's unmounted flag is set. 
+ * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_vfs != NULL && + (zfvp->z_vfs->mnt_kern_flag & MNTK_UNMOUNT)) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +#ifdef _KERNEL +void +zfsvfs_update_fromname(const char *oldname, const char *newname) +{ + char tmpbuf[MAXPATHLEN]; + struct mount *mp; + char *fromname; + size_t oldlen; + + oldlen = strlen(oldname); + + mtx_lock(&mountlist_mtx); + TAILQ_FOREACH(mp, &mountlist, mnt_list) { + fromname = mp->mnt_stat.f_mntfromname; + if (strcmp(fromname, oldname) == 0) { + (void) strlcpy(fromname, newname, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + if (strncmp(fromname, oldname, oldlen) == 0 && + (fromname[oldlen] == '/' || fromname[oldlen] == '@')) { + (void) snprintf(tmpbuf, sizeof (tmpbuf), "%s%s", + newname, fromname + oldlen); + (void) strlcpy(fromname, tmpbuf, + sizeof (mp->mnt_stat.f_mntfromname)); + continue; + } + } + mtx_unlock(&mountlist_mtx); +} +#endif diff --git a/module/os/freebsd/zfs/zfs_vnops.c b/module/os/freebsd/zfs/zfs_vnops.c new file mode 100644 index 000000000000..2a4acf21582f --- /dev/null +++ b/module/os/freebsd/zfs/zfs_vnops.c @@ -0,0 +1,6629 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2017 Nexenta Systems, Inc. 
+ */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#if __FreeBSD_version >= 1300102 +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#ifndef VN_OPEN_INVFS +#define VN_OPEN_INVFS 0x0 +#endif + +VFS_SMR_DECLARE; + +#if __FreeBSD_version >= 1300047 +#define vm_page_wire_lock(pp) +#define vm_page_wire_unlock(pp) +#else +#define vm_page_wire_lock(pp) vm_page_lock(pp) +#define vm_page_wire_unlock(pp) vm_page_unlock(pp) +#endif + +static int +zfs_u8_validate(const char *u8str, size_t n, char **list, int flag, int *errnum) +{ + + return (u8_validate(__DECONST(char *, u8str), n, list, flag, errnum)); +} +#define u8_validate zfs_u8_validate + +#ifdef DEBUG_VFS_LOCKS +#define VNCHECKREF(vp) \ + VNASSERT((vp)->v_holdcnt > 0 && (vp)->v_usecount > 0, vp, \ + ("%s: wrong ref counts", __func__)); +#else +#define VNCHECKREF(vp) +#endif + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) VN_RELE() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). + * + * Note, in particular, that if a lock is sometimes acquired before + * the tx assigns, and sometimes after (e.g. 
z_lock), then failing + * to use a non-blocking assign can deadlock the system. The scenario: + * + * Thread A has grabbed a lock before calling dmu_tx_assign(). + * Thread B is in an already-assigned tx, and blocks for this lock. + * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open() + * forever, because the previous txg can't quiesce until B's tx commits. + * + * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT, + * then drop all locks, call dmu_tx_wait(), and try again. On subsequent + * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT, + * to indicate that this operation has already called dmu_tx_wait(). + * This will ensure that we don't retry forever, waiting a short bit + * each time. + * + * (5) If the operation succeeded, generate the intent log entry for it + * before dropping locks. This ensures that the ordering of events + * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. + * + * (6) At the end of each vnode op, the DMU tx must always commit, + * regardless of whether there were any errors. + * + * (7) After dropping all locks, invoke zil_commit(zilog, foid) + * to ensure that synchronous semantics are provided when necessary. + * + * In general, this is how things should be ordered in each vnode op: + * + * ZFS_ENTER(zfsvfs); // exit if unmounted + * top: + * zfs_dirent_lookup(&dl, ...) // lock directory entry (may VN_HOLD()) + * rw_enter(...); // grab any other locks you need + * tx = dmu_tx_create(...); // get DMU tx + * dmu_tx_hold_*(); // hold each object you might modify + * error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + * if (error) { + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * if (error == ERESTART) { + * waited = B_TRUE; + * dmu_tx_wait(tx); + * dmu_tx_abort(tx); + * goto top; + * } + * dmu_tx_abort(tx); // abort DMU tx + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // really out of space + * } + * error = do_real_work(); // do whatever this VOP does + * if (error == 0) + * zfs_log_*(...); // on success, make ZIL entry + * dmu_tx_commit(tx); // commit DMU tx -- error or not + * rw_exit(...); // drop locks + * zfs_dirent_unlock(dl); // unlock directory entry + * VN_RELE(...); // release held vnodes + * zil_commit(zilog, foid); // synchronous when necessary + * ZFS_EXIT(zfsvfs); // finished in zfs + * return (error); // done, report error + */ + +/* ARGSUSED */ +static int +zfs_open(vnode_t **vpp, int flag, cred_t *cr) +{ + znode_t *zp = VTOZ(*vpp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((flag & FWRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & FAPPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && + ZTOV(zp)->v_type == VREG && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (fs_vscan(*vpp, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & (FSYNC | FDSYNC)) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +static int +zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + 
ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	/* Decrement the synchronous opens in the znode */
+	if ((flag & (FSYNC | FDSYNC)) && (count == 1))
+		atomic_dec_32(&zp->z_sync_cnt);
+
+	if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan &&
+	    ZTOV(zp)->v_type == VREG &&
+	    !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0)
+		VERIFY(fs_vscan(vp, cr, 1) == 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Lseek support for finding holes (cmd == _FIO_SEEK_HOLE) and
+ * data (cmd == _FIO_SEEK_DATA). "off" is an in/out parameter.
+ */
+static int
+zfs_holey(vnode_t *vp, ulong_t cmd, offset_t *off)
+{
+	znode_t *zp = VTOZ(vp);
+	uint64_t noff = (uint64_t)*off; /* new offset */
+	uint64_t file_sz;
+	int error;
+	boolean_t hole;
+
+	file_sz = zp->z_size;
+	if (noff >= file_sz) {
+		return (SET_ERROR(ENXIO));
+	}
+
+	if (cmd == _FIO_SEEK_HOLE)
+		hole = B_TRUE;
+	else
+		hole = B_FALSE;
+
+	error = dmu_offset_next(zp->z_zfsvfs->z_os, zp->z_id, hole, &noff);
+
+	if (error == ESRCH)
+		return (SET_ERROR(ENXIO));
+
+	/* file was dirty, so fall back to using generic logic */
+	if (error == EBUSY) {
+		if (hole)
+			*off = file_sz;
+
+		return (0);
+	}
+
+	/*
+	 * We could find a hole that begins after the logical end-of-file,
+	 * because dmu_offset_next() only works on whole blocks. If the
+	 * EOF falls mid-block, then indicate that the "virtual hole"
+	 * at the end of the file begins at the logical EOF, rather than
+	 * at the end of the last block.
+	 */
+	if (noff > file_sz) {
+		ASSERT(hole);
+		noff = file_sz;
+	}
+
+	if (noff < *off)
+		return (error);
+	*off = noff;
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zfs_ioctl(vnode_t *vp, ulong_t com, intptr_t data, int flag, cred_t *cred,
+    int *rvalp)
+{
+	offset_t off;
+	int error;
+	zfsvfs_t *zfsvfs;
+	znode_t *zp;
+
+	switch (com) {
+	case _FIOFFS:
+	{
+		return (0);
+
+		/*
+		 * The following two ioctls are used by bfu. Faking them
+		 * out is necessary to avoid bfu errors.
+		 */
+	}
+	case _FIOGDIO:
+	case _FIOSDIO:
+	{
+		return (0);
+	}
+
+	case _FIO_SEEK_DATA:
+	case _FIO_SEEK_HOLE:
+	{
+		off = *(offset_t *)data;
+		zp = VTOZ(vp);
+		zfsvfs = zp->z_zfsvfs;
+		ZFS_ENTER(zfsvfs);
+		ZFS_VERIFY_ZP(zp);
+
+		/* offset parameter is in/out */
+		error = zfs_holey(vp, com, &off);
+		ZFS_EXIT(zfsvfs);
+		if (error)
+			return (error);
+		*(offset_t *)data = off;
+		return (0);
+	}
+	}
+	return (SET_ERROR(ENOTTY));
+}
+
+static vm_page_t
+page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes)
+{
+	vm_object_t obj;
+	vm_page_t pp;
+	int64_t end;
+
+	/*
+	 * At present vm_page_clear_dirty extends the cleared range to DEV_BSIZE
+	 * aligned boundaries, if the range is not aligned. As a result a
+	 * DEV_BSIZE subrange with partially dirty data may get marked as clean.
+	 * It may happen that all DEV_BSIZE subranges are marked clean and thus
+	 * the whole page would be considered clean despite having some
+	 * dirty data.
+	 * For this reason we should shrink the range to DEV_BSIZE aligned
+	 * boundaries before calling vm_page_clear_dirty.
+	 */
+	end = rounddown2(off + nbytes, DEV_BSIZE);
+	off = roundup2(off, DEV_BSIZE);
+	nbytes = end - off;
+
+	obj = vp->v_object;
+	zfs_vmobject_assert_wlocked_12(obj);
+#if __FreeBSD_version < 1300050
+	for (;;) {
+		if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL &&
+		    pp->valid) {
+			if (vm_page_xbusied(pp)) {
+				/*
+				 * Reference the page before unlocking and
+				 * sleeping so that the page daemon is less
+				 * likely to reclaim it.
+ */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + vm_page_sbusy(pp); + } else if (pp != NULL) { + ASSERT(!pp->valid); + pp = NULL; + } + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } + break; + } +#else + vm_page_grab_valid_unlocked(&pp, obj, OFF_TO_IDX(start), + VM_ALLOC_NOCREAT | VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | + VM_ALLOC_IGN_SBUSY); + if (pp != NULL) { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_object_pip_add(obj, 1); + pmap_remove_write(pp); + if (nbytes != 0) + vm_page_clear_dirty(pp, off, nbytes); + } +#endif + return (pp); +} + +static void +page_unbusy(vm_page_t pp) +{ + + vm_page_sunbusy(pp); +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(pp->object); +#else + vm_object_pip_subtract(pp->object, 1); +#endif +} + +#if __FreeBSD_version > 1300051 +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t m; + + obj = vp->v_object; + vm_page_grab_valid_unlocked(&m, obj, OFF_TO_IDX(start), + VM_ALLOC_NOCREAT | VM_ALLOC_WIRED | VM_ALLOC_IGN_SBUSY | + VM_ALLOC_NOBUSY); + return (m); +} +#else +static vm_page_t +page_hold(vnode_t *vp, int64_t start) +{ + vm_object_t obj; + vm_page_t pp; + + obj = vp->v_object; + zfs_vmobject_assert_wlocked(obj); + + for (;;) { + if ((pp = vm_page_lookup(obj, OFF_TO_IDX(start))) != NULL && + pp->valid) { + if (vm_page_xbusied(pp)) { + /* + * Reference the page before unlocking and + * sleeping so that the page daemon is less + * likely to reclaim it. + */ + vm_page_reference(pp); + vm_page_lock(pp); + zfs_vmobject_wunlock(obj); + vm_page_busy_sleep(pp, "zfsmwb", true); + zfs_vmobject_wlock(obj); + continue; + } + + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_wire_lock(pp); + vm_page_hold(pp); + vm_page_wire_unlock(pp); + + } else + pp = NULL; + break; + } + return (pp); +} +#endif + +static void +page_unhold(vm_page_t pp) +{ + + vm_page_wire_lock(pp); +#if __FreeBSD_version >= 1300035 + vm_page_unwire(pp, PQ_ACTIVE); +#else + vm_page_unhold(pp); +#endif + vm_page_wire_unlock(pp); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. 
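+ *
+ * (The loop below implements this by re-reading each just-written
+ * range back out of the DMU with dmu_read() and copying it over the
+ * busied page, so later mmap(2) readers observe the new data.)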
+ */ +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid, + int segflg, dmu_tx_t *tx) +{ + vm_object_t obj; + struct sf_buf *sf; + caddr_t va; + int off; + + ASSERT(segflg != UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + off = start & PAGEOFFSET; + zfs_vmobject_wlock_12(obj); +#if __FreeBSD_version >= 1300041 + vm_object_pip_add(obj, 1); +#endif + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + int nbytes = imin(PAGESIZE - off, len); + + if ((pp = page_busy(vp, start, off, nbytes)) != NULL) { + zfs_vmobject_wunlock_12(obj); + + va = zfs_map_page(pp, &sf); + (void) dmu_read(os, oid, start+off, nbytes, + va+off, DMU_READ_PREFETCH); + zfs_unmap_page(sf); + + zfs_vmobject_wlock_12(obj); + page_unbusy(pp); + } + len -= nbytes; + off = 0; + } +#if __FreeBSD_version >= 1300041 + vm_object_pip_wakeup(obj); +#else + vm_object_pip_wakeupn(obj, 0); +#endif + zfs_vmobject_wunlock_12(obj); +} + +/* + * Read with UIO_NOCOPY flag means that sendfile(2) requests + * ZFS to populate a range of page cache pages with data. + * + * NOTE: this function could be optimized to pre-allocate + * all pages in advance, drain exclusive busy on all of them, + * map them into contiguous KVA region and populate them + * in one single dmu_read() call. + */ +static int +mappedread_sf(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + objset_t *os = zp->z_zfsvfs->z_os; + struct sf_buf *sf; + vm_object_t obj; + vm_page_t pp; + int64_t start; + caddr_t va; + int len = nbytes; + int error = 0; + + ASSERT(uio->uio_segflg == UIO_NOCOPY); + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + ASSERT((uio->uio_loffset & PAGEOFFSET) == 0); + + zfs_vmobject_wlock_12(obj); + for (start = uio->uio_loffset; len > 0; start += PAGESIZE) { + int bytes = MIN(PAGESIZE, len); + + pp = vm_page_grab_unlocked(obj, OFF_TO_IDX(start), + VM_ALLOC_SBUSY | VM_ALLOC_NORMAL | VM_ALLOC_IGN_SBUSY); + if (vm_page_none_valid(pp)) { + zfs_vmobject_wunlock_12(obj); + va = zfs_map_page(pp, &sf); + error = dmu_read(os, zp->z_id, start, bytes, va, + DMU_READ_PREFETCH); + if (bytes != PAGESIZE && error == 0) + bzero(va + bytes, PAGESIZE - bytes); + zfs_unmap_page(sf); + zfs_vmobject_wlock_12(obj); +#if __FreeBSD_version >= 1300081 + if (error == 0) { + vm_page_valid(pp); + vm_page_activate(pp); + vm_page_do_sunbusy(pp); + } else { + zfs_vmobject_wlock(obj); + if (!vm_page_wired(pp) && pp->valid == 0 && + vm_page_busy_tryupgrade(pp)) + vm_page_free(pp); + else + vm_page_sunbusy(pp); + zfs_vmobject_wunlock(obj); + } +#else + vm_page_do_sunbusy(pp); + vm_page_lock(pp); + if (error) { + if (pp->wire_count == 0 && pp->valid == 0 && + !vm_page_busied(pp)) + vm_page_free(pp); + } else { + pp->valid = VM_PAGE_BITS_ALL; + vm_page_activate(pp); + } + vm_page_unlock(pp); +#endif + } else { + ASSERT3U(pp->valid, ==, VM_PAGE_BITS_ALL); + vm_page_do_sunbusy(pp); + } + if (error) + break; + uio->uio_resid -= bytes; + uio->uio_offset += bytes; + len -= bytes; + } + zfs_vmobject_wunlock_12(obj); + return (error); +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. 
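+ *
+ * (For example, with 4KB pages a 64KB read of a mapped file proceeds
+ * as sixteen PAGESIZE chunks: each chunk is copied from the resident
+ * page when page_hold() finds one, and is read through
+ * dmu_read_uio_dbuf() otherwise.)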
+ */ +static int +mappedread(vnode_t *vp, int nbytes, uio_t *uio) +{ + znode_t *zp = VTOZ(vp); + vm_object_t obj; + int64_t start; + int len = nbytes; + int off; + int error = 0; + + ASSERT(vp->v_mount != NULL); + obj = vp->v_object; + ASSERT(obj != NULL); + + start = uio->uio_loffset; + off = start & PAGEOFFSET; + zfs_vmobject_wlock_12(obj); + for (start &= PAGEMASK; len > 0; start += PAGESIZE) { + vm_page_t pp; + uint64_t bytes = MIN(PAGESIZE - off, len); + + if ((pp = page_hold(vp, start))) { + struct sf_buf *sf; + caddr_t va; + + zfs_vmobject_wunlock_12(obj); + va = zfs_map_page(pp, &sf); + error = vn_io_fault_uiomove(va + off, bytes, uio); + zfs_unmap_page(sf); + zfs_vmobject_wlock_12(obj); + page_unhold(pp); + } else { + zfs_vmobject_wunlock_12(obj); + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + zfs_vmobject_wlock_12(obj); + } + len -= bytes; + off = 0; + if (error) + break; + } + zfs_vmobject_wunlock_12(obj); + return (error); +} + +offset_t zfs_read_chunk_size = 1024 * 1024; /* Tunable */ + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: vp - vnode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - SYNC flags; used to provide FRSYNC semantics. + * cr - credentials of caller. + * ct - caller context + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * vp - atime updated if byte count > 0 + */ +/* ARGSUSED */ +static int +zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + ssize_t n, nbytes, start_resid; + int error = 0; + int64_t nread; + zfs_locked_range_t *lr; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* We don't copy out anything useful for directories. */ + if (vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + */ + if (zfsvfs->z_log && + (ioflag & FRSYNC || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, uio->uio_loffset, + uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. 
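+	 * (Hence the 'goto out' below rather than an early return: the
+	 * ZFS_ACCESSTIME_STAMP() at 'out:' still runs.)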
+ */ + if (uio->uio_loffset >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_size); + n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + start_resid = n; + + while (n > 0) { + nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (uio->uio_segflg == UIO_NOCOPY) + error = mappedread_sf(vp, nbytes, uio); + else if (vn_has_cached_data(vp)) { + error = mappedread(vp, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } + + nread = start_resid - n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); + +out: + zfs_rangelock_exit(lr); + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: vp - vnode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - FAPPEND, FSYNC, and/or FDSYNC. FAPPEND is + * set if in append mode. + * cr - credentials of caller. + * ct - caller context (NFS/CIFS fem monitor only) + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +static int +zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + rlim64_t limit = MAXOFFSET_T; + ssize_t start_resid = uio->uio_resid; + ssize_t tx_bytes; + uint64_t end_size; + dmu_buf_impl_t *db; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog; + offset_t woff; + ssize_t n, nbytes; + zfs_locked_range_t *lr; + int max_blksz = zfsvfs->z_max_blksz; + int error = 0; + arc_buf_t *abuf; + iovec_t *aiov = NULL; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt __unused = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; + int count = 0; + sa_bulk_attr_t bulk[4]; + uint64_t mtime[2], ctime[2]; + uint64_t uid, gid, projid; + int64_t nwritten; + + /* + * Fasttrack empty write + */ + n = start_resid; + if (n == 0) + return (0); + + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM. + * Intentionally allow ZFS_READONLY through here. + * See zfs_zaccess_common() + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & FAPPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + zilog = zfsvfs->z_log; + + /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If in append mode, set the io offset pointer to eof. 
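+	 * (The RL_APPEND range lock taken below is granted at the
+	 * then-current end of file, so concurrent appenders serialize
+	 * and each one reads its offset only after the lock is held.)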
+ */ + if (ioflag & FAPPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio->uio_loffset = woff; + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (vn_rlimit_fsize(vp, uio, uio->uio_td)) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (EFBIG); + } + + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_size); + + end_size = MAX(zp->z_size, woff + n); + + uid = zp->z_uid; + gid = zp->z_gid; + projid = zp->z_projid; + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio->uio_loffset; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, gid) || + (projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + abuf = NULL; + if (xuio) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (n >= max_blksz && + woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. "Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If zfs_range_lock() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since zfs_range_reduce() will + * shrink down r_len to the appropriate size. 
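+		 * (For example, a write that must grow the file's block
+		 * size, say from 512 bytes toward the dataset's
+		 * recordsize, briefly holds the whole-file lock, calls
+		 * zfs_grow_blocksize(), and then uses
+		 * zfs_rangelock_reduce() to shrink the lock to just
+		 * [woff, woff + n).)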
+		 */
+		if (lr->lr_length == UINT64_MAX) {
+			uint64_t new_blksz;
+
+			if (zp->z_blksz > max_blksz) {
+				/*
+				 * File's blocksize is already larger than the
+				 * "recordsize" property. Only let it grow to
+				 * the next power of 2.
+				 */
+				ASSERT(!ISP2(zp->z_blksz));
+				new_blksz = MIN(end_size,
+				    1 << highbit64(zp->z_blksz));
+			} else {
+				new_blksz = MIN(end_size, max_blksz);
+			}
+			zfs_grow_blocksize(zp, new_blksz, tx);
+			zfs_rangelock_reduce(lr, woff, n);
+		}
+
+		/*
+		 * XXX - should we really limit each write to z_max_blksz?
+		 * Perhaps we should use SPA_MAXBLOCKSIZE chunks?
+		 */
+		nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz));
+
+		if (woff + nbytes > zp->z_size)
+			vnode_pager_setsize(vp, woff + nbytes);
+
+		if (abuf == NULL) {
+			tx_bytes = uio->uio_resid;
+			error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl),
+			    uio, nbytes, tx);
+			tx_bytes -= uio->uio_resid;
+		} else {
+			tx_bytes = nbytes;
+			ASSERT(xuio == NULL || tx_bytes == aiov->iov_len);
+			/*
+			 * If this is not a full block write, but we are
+			 * extending the file past EOF and this data starts
+			 * block-aligned, use assign_arcbuf(). Otherwise,
+			 * write via dmu_write().
+			 */
+			if (tx_bytes < max_blksz && (!write_eof ||
+			    aiov->iov_base != abuf->b_data)) {
+				ASSERT(xuio);
+				dmu_write(zfsvfs->z_os, zp->z_id, woff,
+				    aiov->iov_len, aiov->iov_base, tx);
+				dmu_return_arcbuf(abuf);
+				xuio_stat_wbuf_copied();
+			} else {
+				ASSERT(xuio || tx_bytes == max_blksz);
+				dmu_assign_arcbuf(sa_get_db(zp->z_sa_hdl), woff,
+				    abuf, tx);
+			}
+			ASSERT(tx_bytes <= uio->uio_resid);
+			uioskip(uio, tx_bytes);
+		}
+		if (tx_bytes && vn_has_cached_data(vp)) {
+			update_pages(vp, woff, tx_bytes, zfsvfs->z_os,
+			    zp->z_id, uio->uio_segflg, tx);
+		}
+
+		/*
+		 * If we made no progress, we're done. If we made even
+		 * partial progress, update the znode and ZIL accordingly.
+		 */
+		if (tx_bytes == 0) {
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+			    (void *)&zp->z_size, sizeof (uint64_t), tx);
+			dmu_tx_commit(tx);
+			ASSERT(error != 0);
+			break;
+		}
+
+		/*
+		 * Clear Set-UID/Set-GID bits on successful write if not
+		 * privileged and at least one of the execute bits is set.
+		 *
+		 * It would be nice to do this after all writes have
+		 * been done, but that would still expose the ISUID/ISGID
+		 * to another app after the partial write is committed.
+		 *
+		 * Note: we don't call zfs_fuid_map_id() here because
+		 * user 0 is not an ephemeral uid.
+		 */
+		mutex_enter(&zp->z_acl_lock);
+		if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) |
+		    (S_IXUSR >> 6))) != 0 &&
+		    (zp->z_mode & (S_ISUID | S_ISGID)) != 0 &&
+		    secpolicy_vnode_setid_retain(vp, cr,
+		    (zp->z_mode & S_ISUID) != 0 && zp->z_uid == 0) != 0) {
+			uint64_t newmode;
+			zp->z_mode &= ~(S_ISUID | S_ISGID);
+			newmode = zp->z_mode;
+			(void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs),
+			    (void *)&newmode, sizeof (uint64_t), tx);
+		}
+		mutex_exit(&zp->z_acl_lock);
+
+		zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime);
+
+		/*
+		 * Update the file size (zp_size) if it has changed;
+		 * account for possible concurrent updates.
+		 */
+		while ((end_size = zp->z_size) < uio->uio_loffset) {
+			(void) atomic_cas_64(&zp->z_size, end_size,
+			    uio->uio_loffset);
+			ASSERT(error == 0 || error == EFAULT);
+		}
+		/*
+		 * If we are replaying and eof is non-zero then force
+		 * the file size to the specified eof. Note, there's no
+		 * concurrency during replay.
+ */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + if (error == 0) + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + else + (void) sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, + ioflag, NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + + } + + zfs_rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * EFAULT means that at least one page of the source buffer was not + * available. VFS will re-try remaining I/O upon this error. + */ + if (error == EFAULT) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (FSYNC | FDSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + nwritten = start_resid - uio->uio_resid; + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); + + ZFS_EXIT(zfsvfs); + return (0); +} + +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *presid) +{ + int error = 0; + ssize_t resid; + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), __DECONST(void *, data), len, pos, + UIO_SYSSPACE, IO_SYNC, kcred, NOCRED, &resid, curthread); + + if (error) { + return (SET_ERROR(error)); + } else if (presid == NULL) { + if (resid != 0) { + error = SET_ERROR(EIO); + } + } else { + *presid = resid; + } + return (error); +} + +static void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_zrele_taskq(dmu_objset_pool(os))); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef ZFS_DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq(dmu_objset_pool(os))); + return (SET_ERROR(ENOENT)); + } + + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. 
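+	 *
+	 * (Here the choice has already been made by the caller: a non-NULL
+	 * 'buf' below means an immediate write and we copy the data into
+	 * it with dmu_read(); buf == NULL means an indirect write and we
+	 * dmu_sync() the block instead.)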
+	 */
+	if (buf != NULL) { /* immediate write */
+		zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, offset,
+		    size, RL_READER);
+		/* test for truncation needs to be done while range locked */
+		if (offset >= zp->z_size) {
+			error = SET_ERROR(ENOENT);
+		} else {
+			error = dmu_read(os, object, offset, size, buf,
+			    DMU_READ_NO_PREFETCH);
+		}
+		ASSERT(error == 0 || error == ENOENT);
+	} else { /* indirect write */
+		/*
+		 * Have to lock the whole block to ensure when it's
+		 * written out and its checksum is being calculated
+		 * that no one can change the data. We need to re-check
+		 * blocksize after we get the lock in case it's changed!
+		 */
+		for (;;) {
+			uint64_t blkoff;
+			size = zp->z_blksz;
+			blkoff = ISP2(size) ? P2PHASE(offset, size) : offset;
+			offset -= blkoff;
+			zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock,
+			    offset, size, RL_READER);
+			if (zp->z_blksz == size)
+				break;
+			offset += blkoff;
+			zfs_rangelock_exit(zgd->zgd_lr);
+		}
+		/* test for truncation needs to be done while range locked */
+		if (lr->lr_offset >= zp->z_size)
+			error = SET_ERROR(ENOENT);
+#ifdef ZFS_DEBUG
+		if (zil_fault_io) {
+			error = SET_ERROR(EIO);
+			zil_fault_io = 0;
+		}
+#endif
+		if (error == 0)
+			error = dmu_buf_hold(os, object, offset, zgd, &db,
+			    DMU_READ_NO_PREFETCH);
+
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zfs_get_done, zgd);
+			ASSERT(error || lr->lr_length <= size);
+
+			/*
+			 * On success, we need to wait for the write I/O
+			 * initiated by dmu_sync() to complete before we can
+			 * release this dbuf. We will finish everything up
+			 * in the zfs_get_done() callback.
+			 */
+			if (error == 0)
+				return (0);
+
+			if (error == EALREADY) {
+				lr->lr_common.lrc_txtype = TX_WRITE2;
+				/*
+				 * TX_WRITE2 relies on the data previously
+				 * written by the TX_WRITE that caused
+				 * EALREADY. We zero out the BP because
+				 * it is the old, currently-on-disk BP,
+				 * so there's no need to zio_flush() its
+				 * vdevs (flushing would needlessly hurt
+				 * performance, and doesn't work on
+				 * indirect vdevs).
+				 */
+				zgd->zgd_bp = NULL;
+				BP_ZERO(bp);
+				error = 0;
+			}
+		}
+	}
+
+	zfs_get_done(zgd, error);
+
+	return (error);
+}
+
+/*ARGSUSED*/
+static int
+zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr,
+    caller_context_t *ct)
+{
+	znode_t *zp = VTOZ(vp);
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	int error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (flag & V_ACE_MASK)
+		error = zfs_zaccess(zp, mode, flag, B_FALSE, cr);
+	else
+		error = zfs_zaccess_rwx(zp, mode, flag, cr);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+static int
+zfs_dd_callback(struct mount *mp, void *arg, int lkflags, struct vnode **vpp)
+{
+	int error;
+
+	*vpp = arg;
+	error = vn_lock(*vpp, lkflags);
+	if (error != 0)
+		vrele(*vpp);
+	return (error);
+}
+
+static int
+zfs_lookup_lock(vnode_t *dvp, vnode_t *vp, const char *name, int lkflags)
+{
+	znode_t *zdp = VTOZ(dvp);
+	zfsvfs_t *zfsvfs __unused = zdp->z_zfsvfs;
+	int error;
+	int ltype;
+
+	if (zfsvfs->z_replay == B_FALSE)
+		ASSERT_VOP_LOCKED(dvp, __func__);
+#ifdef DIAGNOSTIC
+	if ((zdp->z_pflags & ZFS_XATTR) == 0)
+		VERIFY(!RRM_LOCK_HELD(&zfsvfs->z_teardown_lock));
+#endif
+
+	if (name[0] == 0 || (name[0] == '.'
&& name[1] == 0)) {
+		ASSERT3P(dvp, ==, vp);
+		vref(dvp);
+		ltype = lkflags & LK_TYPE_MASK;
+		if (ltype != VOP_ISLOCKED(dvp)) {
+			if (ltype == LK_EXCLUSIVE)
+				vn_lock(dvp, LK_UPGRADE | LK_RETRY);
+			else /* if (ltype == LK_SHARED) */
+				vn_lock(dvp, LK_DOWNGRADE | LK_RETRY);
+
+			/*
+			 * Relocking for the "." case could leave us with
+			 * a reclaimed vnode.
+			 */
+			if (VN_IS_DOOMED(dvp)) {
+				vrele(dvp);
+				return (SET_ERROR(ENOENT));
+			}
+		}
+		return (0);
+	} else if (name[0] == '.' && name[1] == '.' && name[2] == 0) {
+		/*
+		 * Note that in this case, dvp is the child vnode, and we
+		 * are looking up the parent vnode - exactly reverse from
+		 * normal operation. Unlocking dvp requires some rather
+		 * tricky unlock/relock dance to prevent mp from being freed;
+		 * use vn_vget_ino_gen() which takes care of all that.
+		 *
+		 * XXX Note that there is a time window when both vnodes are
+		 * unlocked. It is possible, although highly unlikely, that
+		 * during that window the parent-child relationship between
+		 * the vnodes may change, for example, get reversed.
+		 * In that case we would have a wrong lock order for the vnodes.
+		 * All other filesystems seem to ignore this problem, so we
+		 * do the same here.
+		 * A potential solution could be implemented as follows:
+		 * - using LK_NOWAIT when locking the second vnode and retrying
+		 *   if necessary
+		 * - checking that the parent-child relationship still holds
+		 *   after locking both vnodes and retrying if it doesn't
+		 */
+		error = vn_vget_ino_gen(dvp, zfs_dd_callback, vp, lkflags, &vp);
+		return (error);
+	} else {
+		error = vn_lock(vp, lkflags);
+		if (error != 0)
+			vrele(vp);
+		return (error);
+	}
+}
+
+/*
+ * Lookup an entry in a directory, or an extended attribute directory.
+ * If it exists, return a held vnode reference for it.
+ *
+ * IN:	dvp	- vnode of directory to search.
+ *	nm	- name of entry to lookup.
+ *	pnp	- full pathname to lookup [UNUSED].
+ *	flags	- LOOKUP_XATTR set if looking for an attribute.
+ *	rdir	- root directory vnode [UNUSED].
+ *	cr	- credentials of caller.
+ *	ct	- caller context
+ *
+ * OUT:	vpp	- vnode of located entry, NULL if not found.
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	NA
+ */
+/* ARGSUSED */
+static int
+zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct componentname *cnp,
+    int nameiop, cred_t *cr, kthread_t *td, int flags, boolean_t cached)
+{
+	znode_t *zdp = VTOZ(dvp);
+	znode_t *zp;
+	zfsvfs_t *zfsvfs = zdp->z_zfsvfs;
+	int error = 0;
+
+	/*
+	 * Fast path lookup, however we must skip DNLC lookup
+	 * for case folding or normalizing lookups because the
+	 * DNLC code only stores the passed in name. This means
+	 * creating 'a' and removing 'A' on a case insensitive
+	 * file system would work, but DNLC still thinks 'a'
+	 * exists and won't let you create it again on the next
+	 * pass through fast path.
+	 */
+	if (!(flags & LOOKUP_XATTR)) {
+		if (dvp->v_type != VDIR) {
+			return (SET_ERROR(ENOTDIR));
+		} else if (zdp->z_sa_hdl == NULL) {
+			return (SET_ERROR(EIO));
+		}
+	}
+
+	DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zdp);
+
+	*vpp = NULL;
+
+	if (flags & LOOKUP_XATTR) {
+		/*
+		 * If the xattr property is off, refuse the lookup request.
+		 */
+		if (!(zfsvfs->z_flags & ZSB_XATTR)) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EOPNOTSUPP));
+		}
+
+		/*
+		 * We don't allow recursive attributes.
+		 * Maybe someday we will.
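+		 * (An xattr directory inside an xattr directory would
+		 * recurse; the ZFS_XATTR pflag check just below rejects
+		 * that with EINVAL.)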
+		 */
+		if (zdp->z_pflags & ZFS_XATTR) {
+			ZFS_EXIT(zfsvfs);
+			return (SET_ERROR(EINVAL));
+		}
+
+		if ((error = zfs_get_xattrdir(VTOZ(dvp), &zp, cr, flags))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+		*vpp = ZTOV(zp);
+
+		/*
+		 * Do we have permission to get into attribute directory?
+		 */
+		error = zfs_zaccess(zp, ACE_EXECUTE, 0, B_FALSE, cr);
+		if (error) {
+			vrele(ZTOV(zp));
+		}
+
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Check accessibility of directory if we're not coming in via
+	 * VOP_CACHEDLOOKUP.
+	 */
+	if (!cached) {
+#ifdef NOEXECCHECK
+		if ((cnp->cn_flags & NOEXECCHECK) != 0) {
+			cnp->cn_flags &= ~NOEXECCHECK;
+		} else
+#endif
+		if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) {
+			ZFS_EXIT(zfsvfs);
+			return (error);
+		}
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+
+	/*
+	 * First handle the special cases.
+	 */
+	if ((cnp->cn_flags & ISDOTDOT) != 0) {
+		/*
+		 * If we are a snapshot mounted under .zfs, return
+		 * the vp for the snapshot directory.
+		 */
+		if (zdp->z_id == zfsvfs->z_root && zfsvfs->z_parent != zfsvfs) {
+			struct componentname cn;
+			vnode_t *zfsctl_vp;
+			int ltype;
+
+			ZFS_EXIT(zfsvfs);
+			ltype = VOP_ISLOCKED(dvp);
+			VOP_UNLOCK1(dvp);
+			error = zfsctl_root(zfsvfs->z_parent, LK_SHARED,
+			    &zfsctl_vp);
+			if (error == 0) {
+				cn.cn_nameptr = "snapshot";
+				cn.cn_namelen = strlen(cn.cn_nameptr);
+				cn.cn_nameiop = cnp->cn_nameiop;
+				cn.cn_flags = cnp->cn_flags & ~ISDOTDOT;
+				cn.cn_lkflags = cnp->cn_lkflags;
+				error = VOP_LOOKUP(zfsctl_vp, vpp, &cn);
+				vput(zfsctl_vp);
+			}
+			vn_lock(dvp, ltype | LK_RETRY);
+			return (error);
+		}
+	}
+	if (zfs_has_ctldir(zdp) && strcmp(nm, ZFS_CTLDIR_NAME) == 0) {
+		ZFS_EXIT(zfsvfs);
+		if ((cnp->cn_flags & ISLASTCN) != 0 && nameiop != LOOKUP)
+			return (SET_ERROR(ENOTSUP));
+		error = zfsctl_root(zfsvfs, cnp->cn_lkflags, vpp);
+		return (error);
+	}
+
+	/*
+	 * The loop retries the lookup if the parent-child relationship
+	 * changes during the dot-dot locking complexities.
+	 */
+	for (;;) {
+		uint64_t parent;
+
+		error = zfs_dirlook(zdp, nm, &zp);
+		if (error == 0)
+			*vpp = ZTOV(zp);
+
+		ZFS_EXIT(zfsvfs);
+		if (error != 0)
+			break;
+
+		error = zfs_lookup_lock(dvp, *vpp, nm, cnp->cn_lkflags);
+		if (error != 0) {
+			/*
+			 * If we've got a locking error, then the vnode
+			 * got reclaimed because of a force unmount.
+			 * We never enter doomed vnodes into the name cache.
+			 */
+			*vpp = NULL;
+			return (error);
+		}
+
+		if ((cnp->cn_flags & ISDOTDOT) == 0)
+			break;
+
+		ZFS_ENTER(zfsvfs);
+		if (zdp->z_sa_hdl == NULL) {
+			error = SET_ERROR(EIO);
+		} else {
+			error = sa_lookup(zdp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+			    &parent, sizeof (parent));
+		}
+		if (error != 0) {
+			ZFS_EXIT(zfsvfs);
+			vput(ZTOV(zp));
+			break;
+		}
+		if (zp->z_id == parent) {
+			ZFS_EXIT(zfsvfs);
+			break;
+		}
+		vput(ZTOV(zp));
+	}
+
+	if (error != 0)
+		*vpp = NULL;
+
+	/* Translate errors and add SAVENAME when needed. */
+	if (cnp->cn_flags & ISLASTCN) {
+		switch (nameiop) {
+		case CREATE:
+		case RENAME:
+			if (error == ENOENT) {
+				error = EJUSTRETURN;
+				cnp->cn_flags |= SAVENAME;
+				break;
+			}
+			/* FALLTHROUGH */
+		case DELETE:
+			if (error == 0)
+				cnp->cn_flags |= SAVENAME;
+			break;
+		}
+	}
+
+	/* Insert name into cache (as non-existent) if appropriate. */
+	if (zfsvfs->z_use_namecache && !zfsvfs->z_replay &&
+	    error == ENOENT && (cnp->cn_flags & MAKEENTRY) != 0)
+		cache_enter(dvp, NULL, cnp);
+
+	/* Insert name into cache if appropriate.
*/ + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay && + error == 0 && (cnp->cn_flags & MAKEENTRY)) { + if (!(cnp->cn_flags & ISLASTCN) || + (nameiop != DELETE && nameiop != RENAME)) { + cache_enter(dvp, *vpp, cnp); + } + } + + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the vp of the created or trunc'd file. + * + * IN: dvp - vnode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - large file flag [UNUSED]. + * ct - caller context + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated if new entry created + * vp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, int mode, + znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + objset_t *os; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + uint64_t projid = ZFS_DEFAULT_PROJID; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + uint64_t txtype; +#ifdef DEBUG_VFS_LOCKS + vnode_t *dvp = ZTOV(dzp); +#endif + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || (vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + *zpp = NULL; + + if ((vap->va_mode & S_ISVTX) && secpolicy_vnode_stky_modify(cr)) + vap->va_mode &= ~S_ISVTX; + + error = zfs_dirent_lookup(dzp, name, &zp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. 
+ */ + + if ((dzp->z_pflags & ZFS_XATTR) && + (vap->va_type != VREG)) { + error = SET_ERROR(EINVAL); + goto out; + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + getnewvnode_reserve_(); + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + (void) zfs_link_create(dzp, name, zp, tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + +out: + VNCHECKREF(dvp); + if (error == 0) { + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dvp - vnode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime + * vp - ctime (if nlink > 0) + */ + +/*ARGSUSED*/ +static int +zfs_remove_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t xattr_obj; + uint64_t obj = 0; + dmu_tx_t *tx; + boolean_t unlinked; + uint64_t txtype; + int error; + + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zp = VTOZ(vp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + xattr_obj = 0; + xzp = NULL; + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (vp->v_type == VDIR) { + error = SET_ERROR(EPERM); + goto out; + } + + vnevent_remove(vp, dvp, name, ct); + + obj = zp->z_id; + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + } + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the vnode. So we dmu_tx_hold() the right things to + * allow for either case. 
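 *
 * The transaction below follows the usual DMU shape (an editorial
 * sketch of the code that follows, with placeholder arguments, not
 * an addition to it):
 *
 *	tx = dmu_tx_create(os);
 *	dmu_tx_hold_zap(tx, dir_obj, ...);	declare what may change
 *	dmu_tx_hold_sa(tx, sa_hdl, ...);
 *	err = dmu_tx_assign(tx, TXG_WAIT);	join an open txg
 *	if (err) { dmu_tx_abort(tx); ... }	nothing is dirtied yet
 *	...apply exactly the declared changes...
 *	dmu_tx_commit(tx);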
+ */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + + if (xzp) { + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. + */ + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + zfs_unlinked_add(zp, tx); + vp->v_vflag |= VV_NOSYNC; + } + /* XXX check changes to linux vnops */ + txtype = TX_REMOVE; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + + if (xzp) + vrele(ZTOV(xzp)); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + + ZFS_EXIT(zfsvfs); + return (error); +} + + +static int +zfs_lookup_internal(znode_t *dzp, char *name, vnode_t **vpp, + struct componentname *cnp, int nameiop) +{ + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + cnp->cn_nameptr = name; + cnp->cn_namelen = strlen(name); + cnp->cn_nameiop = nameiop; + cnp->cn_flags = ISLASTCN | SAVENAME; + cnp->cn_lkflags = LK_EXCLUSIVE | LK_RETRY; + cnp->cn_cred = kcred; + cnp->cn_thread = curthread; + + if (zfsvfs->z_use_namecache && !zfsvfs->z_replay) { + struct vop_lookup_args a; + + a.a_gen.a_desc = &vop_lookup_desc; + a.a_dvp = ZTOV(dzp); + a.a_vpp = vpp; + a.a_cnp = cnp; + error = vfs_cache_lookup(&a); + } else { + error = zfs_lookup(ZTOV(dzp), name, vpp, cnp, nameiop, kcred, + curthread, 0, B_FALSE); + } +#ifdef ZFS_DEBUG + if (error) { + printf("got error %d on name %s on op %d\n", error, name, + nameiop); + kdb_backtrace(); + } +#endif + return (error); +} + +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + vnode_t *vp; + int error; + struct componentname cn; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_remove_(ZTOV(dzp), vp, name, cr); + vput(vp); + return (error); +} +/* + * Create a new directory and insert it into dvp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dvp - vnode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * vsecp - ACL to be set + * + * OUT: vpp - vnode of created directory. + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * dvp - ctime|mtime updated + * vp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, cred_t *cr, + int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + uint64_t txtype; + dmu_tx_t *tx; + int error; + ksid_t *ksid; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + + ASSERT(vap->va_type == VDIR); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + ksid = crgetsid(cr, KSID_OWNER); + if (ksid) + uid = ksid_getid(ksid); + else + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + ((vap->va_mask & AT_XVATTR) || + IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & AT_XVATTR) { + if ((error = secpolicy_xvattr(ZTOV(dzp), (xvattr_t *)vap, + crgetuid(cr), cr, vap->va_type)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + NULL, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ + *zpp = NULL; + + if ((error = zfs_dirent_lookup(dzp, dirname, &zp, ZNEW))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + ASSERT3P(zp, ==, NULL); + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + getnewvnode_reserve_(); + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + getnewvnode_drop_reserve(); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* + * Now put new name in parent dir. + */ + (void) zfs_link_create(dzp, dirname, zp, tx, ZNEW); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, NULL, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, NULL, + acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + getnewvnode_drop_reserve(); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Remove a directory subdir entry. 
If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dvp - vnode of directory to remove from. + * name - name of directory to be removed. + * cwd - vnode of current working directory. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rmdir_(vnode_t *dvp, vnode_t *vp, char *name, cred_t *cr) +{ + znode_t *dzp = VTOZ(dvp); + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + zilog_t *zilog; + dmu_tx_t *tx; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + ZFS_VERIFY_ZP(zp); + zilog = zfsvfs->z_log; + + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (vp->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + vnevent_rmdir(vp, dvp, name, ct); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + cache_purge(dvp); + + error = zfs_link_destroy(dzp, name, zp, tx, ZEXISTS, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + zfs_log_remove(zilog, tx, txtype, dzp, name, + ZFS_NO_OBJECT, B_FALSE); + } + + dmu_tx_commit(tx); + + cache_purge(vp); +out: + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, int flags) +{ + struct componentname cn; + vnode_t *vp; + int error; + + if ((error = zfs_lookup_internal(dzp, name, &vp, &cn, DELETE))) + return (error); + + error = zfs_rmdir_(ZTOV(dzp), vp, name, cr); + vput(vp); + return (error); +} + +/* + * Read as many directory entries as will fit into the provided + * buffer from the given directory cursor position (specified in + * the uio structure). + * + * IN: vp - vnode of directory to read. + * uio - structure supplying read location, range info, + * and return buffer. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * OUT: uio - updated offset and range, buffer filled. + * eofp - set to true if end-of-file detected. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. 
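 *
 * To illustrate (an editorial sketch, not code added by this change):
 * offsets 0, 1, and 2 name the synthetic entries above, while any
 * larger offset is a serialized ZAP cursor, so a later call can
 * resume exactly where the previous one stopped:
 *
 *	zap_cursor_init_serialized(&zc, os, zp->z_id, offset);
 *	...iterate with zap_cursor_retrieve()/zap_cursor_advance()...
 *	offset = zap_cursor_serialize(&zc);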
+ */ +/* ARGSUSED */ +static int +zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, + int *ncookies, ulong_t **cookies) +{ + znode_t *zp = VTOZ(vp); + iovec_t *iovp; + edirent_t *eodp; + dirent64_t *odp; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os; + caddr_t outbuf; + size_t bufsize; + zap_cursor_t zc; + zap_attribute_t zap; + uint_t bytes_wanted; + uint64_t offset; /* must be unsigned; checks for < 1 */ + uint64_t parent; + int local_eof; + int outcount; + int error; + uint8_t prefetch; + boolean_t check_sysattrs; + uint8_t type; + int ncooks; + ulong_t *cooks = NULL; + int flags = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If we are not given an eof variable, + * use a local one. + */ + if (eofp == NULL) + eofp = &local_eof; + + /* + * Check for valid iov_len. + */ + if (uio->uio_iov->iov_len <= 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Quit if directory has been removed (posix) + */ + if ((*eofp = zp->z_unlinked) != 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + error = 0; + os = zfsvfs->z_os; + offset = uio->uio_loffset; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Get space to change directory entries into fs independent format. + */ + iovp = uio->uio_iov; + bytes_wanted = iovp->iov_len; + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) { + bufsize = bytes_wanted; + outbuf = kmem_alloc(bufsize, KM_SLEEP); + odp = (struct dirent64 *)outbuf; + } else { + bufsize = bytes_wanted; + outbuf = NULL; + odp = (struct dirent64 *)iovp->iov_base; + } + eodp = (struct edirent *)odp; + + if (ncookies != NULL) { + /* + * Minimum entry size is dirent size and 1 byte for a file name. + */ + ncooks = uio->uio_resid / (sizeof (struct dirent) - + sizeof (((struct dirent *)NULL)->d_name) + 1); + cooks = malloc(ncooks * sizeof (ulong_t), M_TEMP, M_WAITOK); + *cookies = cooks; + *ncookies = ncooks; + } + /* + * If this VFS supports the system attribute view interface; and + * we're looking at an extended attribute directory; and we care + * about normalization conflicts on this vfs; then we must check + * for normalization conflicts with the sysattr name space. + */ +#ifdef TODO + check_sysattrs = vfs_has_feature(vp->v_vfsp, VFSFT_SYSATTR_VIEWS) && + (vp->v_flag & V_XATTRDIR) && zfsvfs->z_norm && + (flags & V_RDDIR_ENTFLAGS); +#else + check_sysattrs = 0; +#endif + + /* + * Transform to file-system independent format + */ + outcount = 0; + while (outcount < bytes_wanted) { + ino64_t objnum; + ushort_t reclen; + off64_t *next = NULL; + + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. 
+ */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if ((*eofp = (error == ENOENT)) != 0) + break; + else + goto update; + } + + if (zap.za_integer_length != 8 || + zap.za_num_integers != 1) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + /* + * MacOS X can extract the object type here such as: + * uint8_t type = ZFS_DIRENT_TYPE(zap.za_first_integer); + */ + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + + if (check_sysattrs && !zap.za_normalization_conflict) { +#ifdef TODO + zap.za_normalization_conflict = + xattr_sysattr_casechk(zap.za_name); +#else + panic("%s:%u: TODO", __func__, __LINE__); +#endif + } + } + + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + vrele(ZTOV(ezp)); + goto skip_entry; + } + vrele(ZTOV(ezp)); + } + + if (flags & V_RDDIR_ENTFLAGS) + reclen = EDIRENT_RECLEN(strlen(zap.za_name)); + else + reclen = DIRENT64_RECLEN(strlen(zap.za_name)); + + /* + * Will this entry fit in the buffer? + */ + if (outcount + reclen > bufsize) { + /* + * Did we manage to fit anything in the buffer? + */ + if (!outcount) { + error = SET_ERROR(EINVAL); + goto update; + } + break; + } + if (flags & V_RDDIR_ENTFLAGS) { + /* + * Add extended flag entry: + */ + eodp->ed_ino = objnum; + eodp->ed_reclen = reclen; + /* NOTE: ed_off is the offset for the *next* entry */ + next = &(eodp->ed_off); + eodp->ed_eflags = zap.za_normalization_conflict ? + ED_CASE_CONFLICT : 0; + (void) strncpy(eodp->ed_name, zap.za_name, + EDIRENT_NAMELEN(reclen)); + eodp = (edirent_t *)((intptr_t)eodp + reclen); + } else { + /* + * Add normal entry: + */ + odp->d_ino = objnum; + odp->d_reclen = reclen; + odp->d_namlen = strlen(zap.za_name); + /* NOTE: d_off is the offset for the *next* entry. */ + next = &odp->d_off; + strlcpy(odp->d_name, zap.za_name, odp->d_namlen + 1); + odp->d_type = type; + dirent_terminate(odp); + odp = (dirent64_t *)((intptr_t)odp + reclen); + } + outcount += reclen; + + ASSERT(outcount <= bufsize); + + /* Prefetch znode */ + if (prefetch) + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + + skip_entry: + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + + /* Fill the offset right after advancing the cursor. */ + if (next != NULL) + *next = offset; + if (cooks != NULL) { + *cooks++ = offset; + ncooks--; + KASSERT(ncooks >= 0, ("ncookies=%d", ncooks)); + } + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + + /* Subtract unused cookies */ + if (ncookies != NULL) + *ncookies -= ncooks; + + if (uio->uio_segflg == UIO_SYSSPACE && uio->uio_iovcnt == 1) { + iovp->iov_base += outcount; + iovp->iov_len -= outcount; + uio->uio_resid -= outcount; + } else if ((error = uiomove(outbuf, (long)outcount, UIO_READ, uio))) { + /* + * Reset the pointer. 
+ */ + offset = uio->uio_loffset; + } + +update: + zap_cursor_fini(&zc); + if (uio->uio_segflg != UIO_SYSSPACE || uio->uio_iovcnt != 1) + kmem_free(outbuf, bufsize); + + if (error == ENOENT) + error = 0; + + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + + uio->uio_loffset = offset; + ZFS_EXIT(zfsvfs); + if (error != 0 && cookies != NULL) { + free(*cookies, M_TEMP); + *cookies = NULL; + *ncookies = 0; + } + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +static int +zfs_fsync(vnode_t *vp, int syncflag, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + return (0); +} + + +/* + * Get the requested file attributes and place them in the provided + * vattr structure. + * + * IN: vp - vnode of file. + * vap - va_mask identifies requested attributes. + * If AT_XVATTR set, then optional attrs are requested + * flags - ATTR_NOACLCHECK (CIFS server context) + * cr - credentials of caller. + * + * OUT: vap - attribute values. + * + * RETURN: 0 (always succeeds). + */ +/* ARGSUSED */ +static int +zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error = 0; + uint32_t blksize; + u_longlong_t nblocks; + uint64_t mtime[2], ctime[2], crtime[2], rdev; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap = NULL; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + sa_bulk_attr_t bulk[4]; + int count = 0; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + if (vp->v_type == VBLK || vp->v_type == VCHR) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if ((error = sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. + * Also, if we are the owner don't bother, since owner should + * always be allowed to read basic attributes of file. + */ + if (!(zp->z_pflags & ZFS_ACL_TRIVIAL) && + (vap->va_uid != crgetuid(cr))) { + if ((error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, + skipaclchk, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + /* + * Return all attributes. It's cheaper to provide the answer + * than to determine whether we were asked the question. + */ + + vap->va_type = IFTOVT(zp->z_mode); + vap->va_mode = zp->z_mode & ~S_IFMT; + vn_fsid(vp, vap); + vap->va_nodeid = zp->z_id; + vap->va_nlink = zp->z_links; + if ((vp->v_flag & VROOT) && zfs_show_ctldir(zp) && + zp->z_links < ZFS_LINK_MAX) + vap->va_nlink++; + vap->va_size = zp->z_size; + if (vp->v_type == VBLK || vp->v_type == VCHR) + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_seq = zp->z_seq; + vap->va_flags = 0; /* FreeBSD: Reset chflags(2) flags. */ + vap->va_filerev = zp->z_seq; + + /* + * Add in any requested optional attributes and the create time. + * Also set the corresponding bits in the returned attribute bitmap. 
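 *
 * (The request/return protocol: the caller marks each optional
 * attribute it wants in the xvattr request bitmap, we test those
 * marks with XVA_ISSET_REQ(), fill in xoap->xoa_* from z_pflags,
 * and acknowledge every answered attribute with XVA_SET_RTN().)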
+ */ + if ((xoap = xva_getxoptattr(xvap)) != NULL && zfsvfs->z_use_fuids) { + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) { + xoap->xoa_archive = + ((zp->z_pflags & ZFS_ARCHIVE) != 0); + XVA_SET_RTN(xvap, XAT_ARCHIVE); + } + + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) { + xoap->xoa_readonly = + ((zp->z_pflags & ZFS_READONLY) != 0); + XVA_SET_RTN(xvap, XAT_READONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) { + xoap->xoa_system = + ((zp->z_pflags & ZFS_SYSTEM) != 0); + XVA_SET_RTN(xvap, XAT_SYSTEM); + } + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) { + xoap->xoa_hidden = + ((zp->z_pflags & ZFS_HIDDEN) != 0); + XVA_SET_RTN(xvap, XAT_HIDDEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + xoap->xoa_nounlink = + ((zp->z_pflags & ZFS_NOUNLINK) != 0); + XVA_SET_RTN(xvap, XAT_NOUNLINK); + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + xoap->xoa_immutable = + ((zp->z_pflags & ZFS_IMMUTABLE) != 0); + XVA_SET_RTN(xvap, XAT_IMMUTABLE); + } + + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + xoap->xoa_appendonly = + ((zp->z_pflags & ZFS_APPENDONLY) != 0); + XVA_SET_RTN(xvap, XAT_APPENDONLY); + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + xoap->xoa_nodump = + ((zp->z_pflags & ZFS_NODUMP) != 0); + XVA_SET_RTN(xvap, XAT_NODUMP); + } + + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) { + xoap->xoa_opaque = + ((zp->z_pflags & ZFS_OPAQUE) != 0); + XVA_SET_RTN(xvap, XAT_OPAQUE); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + xoap->xoa_av_quarantined = + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0); + XVA_SET_RTN(xvap, XAT_AV_QUARANTINED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + xoap->xoa_av_modified = + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0); + XVA_SET_RTN(xvap, XAT_AV_MODIFIED); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) && + vp->v_type == VREG) { + zfs_sa_get_scanstamp(zp, xvap); + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = ((zp->z_pflags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } + if (XVA_ISSET_REQ(xvap, XAT_GEN)) { + xoap->xoa_generation = zp->z_gen; + XVA_SET_RTN(xvap, XAT_GEN); + } + + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) { + xoap->xoa_offline = + ((zp->z_pflags & ZFS_OFFLINE) != 0); + XVA_SET_RTN(xvap, XAT_OFFLINE); + } + + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) { + xoap->xoa_sparse = + ((zp->z_pflags & ZFS_SPARSE) != 0); + XVA_SET_RTN(xvap, XAT_SPARSE); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + xoap->xoa_projinherit = + ((zp->z_pflags & ZFS_PROJINHERIT) != 0); + XVA_SET_RTN(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + xoap->xoa_projid = zp->z_projid; + XVA_SET_RTN(xvap, XAT_PROJID); + } + } + + ZFS_TIME_DECODE(&vap->va_atime, zp->z_atime); + ZFS_TIME_DECODE(&vap->va_mtime, mtime); + ZFS_TIME_DECODE(&vap->va_ctime, ctime); + ZFS_TIME_DECODE(&vap->va_birthtime, crtime); + + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + vap->va_blksize = blksize; + vap->va_bytes = nblocks << 9; /* nblocks * 512 */ + + if (zp->z_blksz == 0) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + vap->va_blksize = zfsvfs->z_max_blksz; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If AT_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. 
+ * ct - caller context + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * vp - ctime updated, mtime updated if size changed. + */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + vnode_t *vp = ZTOV(zp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + uint64_t saved_mode; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + sa_bulk_attr_t bulk[7], xattr_bulk[7]; + int count = 0, xattr_count = 0; + + if (mask == 0) + return (0); + + if (mask & AT_NOSET) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that the file system is at the proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & AT_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & AT_SIZE && vp->v_type == VDIR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & AT_SIZE && vp->v_type != VREG && vp->v_type != VFIFO) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * If this is an xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + + xva_init(&tmpxvattr); + + /* + * Immutable files can only alter the immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (AT_SIZE|AT_UID|AT_GID|AT_MTIME|AT_MODE)) || + ((mask & AT_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Note: ZFS_READONLY is handled in zfs_zaccess_common. + */ + + /* + * Verify timestamps don't overflow 32 bits. + * ZFS can handle large timestamps, but 32-bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported.
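 *
 * (TIMESPEC_OVERFLOW() is assumed here to flag a tv_sec value that
 * does not fit in a signed 32-bit time_t, the limit referred to
 * above.)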
+ */ + if (mask & (AT_ATIME | AT_MTIME)) { + if (((mask & AT_ATIME) && TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & AT_MTIME) && TIMESPEC_OVERFLOW(&vap->va_mtime))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + } + if (xoap != NULL && (mask & AT_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME) && + TIMESPEC_OVERFLOW(&vap->va_birthtime)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOVERFLOW)); + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(zp->z_mode) && !S_ISDIR(zp->z_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EOPNOTSUPP)); + } + } + + attrzp = NULL; + aclp = NULL; + + if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * First validate permissions + */ + + if (mask & AT_SIZE) { + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + } + + if (mask & (AT_ATIME|AT_MTIME) || + ((mask & AT_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (AT_UID|AT_GID)) { + int idmask = (mask & (AT_UID|AT_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & AT_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & AT_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & AT_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both AT_UID and AT_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (AT_UID|AT_GID)) && take_owner && take_group) || + ((idmask == AT_UID) && take_owner) || + ((idmask == AT_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + secpolicy_setid_clear(vap, vp, cr); + trim_mask = (mask & (AT_UID|AT_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & AT_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. 
+ * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. + */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(&tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. + */ + + if (trim_mask) { + saved_mask = vap->va_mask; + vap->va_mask &= ~trim_mask; + if (trim_mask & AT_MODE) { + /* + * Save the mode, as secpolicy_vnode_setattr() + * will overwrite it with ova.va_mode. + */ + saved_mode = vap->va_mode; + } + } + err = secpolicy_vnode_setattr(cr, vp, vap, &oldva, flags, + (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + + if (trim_mask) { + vap->va_mask |= saved_mask; + if (trim_mask & AT_MODE) { + /* + * Recover the mode after + * secpolicy_vnode_setattr(). 
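 * (This completes the save/trim/restore pattern: va_mask and va_mode
 * were saved and trimmed above so that secpolicy_vnode_setattr()
 * would not revoke bits we had already authorized; the saved values
 * are put back so the caller sees the mask it originally requested.)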
+ */ + vap->va_mode = saved_mode; + } + } + } + + /* + * secpolicy_vnode_setattr() or taking ownership may have + * changed va_mask + */ + mask = vap->va_mask; + + if ((mask & (AT_UID | AT_GID)) || projid != ZFS_INVALID_PROJID) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + + if (err == 0 && xattr_obj) { + err = zfs_zget(zp->z_zfsvfs, xattr_obj, &attrzp); + if (err == 0) { + err = vn_lock(ZTOV(attrzp), LK_EXCLUSIVE); + if (err != 0) + vrele(ZTOV(attrzp)); + } + if (err) + goto out2; + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != zp->z_uid && + zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT, + new_uid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != zp->z_gid && + zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT, + new_gid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + + if (projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) { + if (attrzp) + vput(ZTOV(attrzp)); + err = SET_ERROR(EDQUOT); + goto out2; + } + } + tx = dmu_tx_create(os); + + if (mask & AT_MODE) { + uint64_t pmode = zp->z_mode; + uint64_t acl_obj; + new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); + + if (zp->z_zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED && + !(zp->z_pflags & ZFS_ACL_TRIVIAL)) { + err = SET_ERROR(EPERM); + goto out; + } + + if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode))) + goto out; + + if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) { + /* + * Are we upgrading ACL from old V0 format + * to V1 format? + */ + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) == + ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, + aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, aclp->z_acl_bytes); + } + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + } else { + if (((mask & AT_XVATTR) && + XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || + (projid != ZFS_INVALID_PROJID && + !(zp->z_pflags & ZFS_PROJID))) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + } + + if (attrzp) { + dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE); + } + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + zfs_sa_upgrade_txholds(tx, zp); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + goto out; + + count = 0; + /* + * Set each attribute requested. + * We group settings according to the locks they need to acquire. + * + * Note: you cannot set ctime directly, although it will be + * updated as a side-effect of calling this function. + */ + + if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) { + /* + * For an existing object upgraded from an old system, the + * on-disk layout has no slot for the project ID attribute, + * but the quota accounting logic needs to access that slot + * by a fixed offset. So we adjust the old object's layout + * to move the project ID to a unified, fixed offset.
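 * (sa_add_projid() can return EEXIST when the layout already has the
 * slot; the code below treats that as success.)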
+ */ + if (attrzp) + err = sa_add_projid(attrzp->z_sa_hdl, tx, projid); + if (err == 0) + err = sa_add_projid(zp->z_sa_hdl, tx, projid); + + if (unlikely(err == EEXIST)) + err = 0; + else if (err != 0) + goto out; + else + projid = ZFS_INVALID_PROJID; + } + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&zp->z_acl_lock); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_enter(&attrzp->z_acl_lock); + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags, + sizeof (attrzp->z_pflags)); + if (projid != ZFS_INVALID_PROJID) { + attrzp->z_projid = projid; + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid, + sizeof (attrzp->z_projid)); + } + } + + if (mask & (AT_UID|AT_GID)) { + + if (mask & AT_UID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &new_uid, sizeof (new_uid)); + zp->z_uid = new_uid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_UID(zfsvfs), NULL, &new_uid, + sizeof (new_uid)); + attrzp->z_uid = new_uid; + } + } + + if (mask & AT_GID) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), + NULL, &new_gid, sizeof (new_gid)); + zp->z_gid = new_gid; + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_GID(zfsvfs), NULL, &new_gid, + sizeof (new_gid)); + attrzp->z_gid = new_gid; + } + } + if (!(mask & AT_MODE)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), + NULL, &new_mode, sizeof (new_mode)); + new_mode = zp->z_mode; + } + err = zfs_acl_chown_setattr(zp); + ASSERT(err == 0); + if (attrzp) { + err = zfs_acl_chown_setattr(attrzp); + ASSERT(err == 0); + } + } + + if (mask & AT_MODE) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &new_mode, sizeof (new_mode)); + zp->z_mode = new_mode; + ASSERT3U((uintptr_t)aclp, !=, 0); + err = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT0(err); + if (zp->z_acl_cached) + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = aclp; + aclp = NULL; + } + + + if (mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, zp->z_atime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + } + + if (mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + } + + if (projid != ZFS_INVALID_PROJID) { + zp->z_projid = projid; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid, + sizeof (zp->z_projid)); + } + + /* XXX - shouldn't this be done *before* the ATIME/MTIME checks? 
*/ + if (mask & AT_SIZE && !(mask & AT_MTIME)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), + NULL, mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + } else if (mask != 0) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, ctime); + if (attrzp) { + SA_ADD_BULK_ATTR(xattr_bulk, xattr_count, + SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + zfs_tstamp_update_setup(attrzp, STATE_CHANGED, + mtime, ctime); + } + } + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & AT_XVATTR)) { + + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + xoap->xoa_createtime = vap->va_birthtime; + /* + * restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(vp->v_type == VREG); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (AT_UID|AT_GID|AT_MODE)) + mutex_exit(&attrzp->z_acl_lock); + } +out: + if (err == 0 && attrzp) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (attrzp) + vput(ZTOV(attrzp)); + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + } else { + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * We acquire all but fdvp locks using non-blocking acquisitions. If we + * fail to acquire any lock in the path we will drop all held locks, + * acquire the new lock in a blocking fashion, and then release it and + * restart the rename. This acquire/release step ensures that we do not + * spin on a lock waiting for release. On error release all vnode locks + * and decrement references the way tmpfs_rename() would do. 
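 *
 * The backoff step follows this pattern (an editorial sketch of the
 * code below, not an addition to it):
 *
 *	error = vn_lock(vp, LK_EXCLUSIVE | LK_NOWAIT);
 *	if (error == EBUSY) {
 *		...drop every lock held so far...
 *		vn_lock(vp, LK_EXCLUSIVE);	wait without deadlock risk
 *		VOP_UNLOCK1(vp);		drop it again
 *		goto relock;			retry from the top
 *	}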
+ */ +static int +zfs_rename_relock(struct vnode *sdvp, struct vnode **svpp, + struct vnode *tdvp, struct vnode **tvpp, + const struct componentname *scnp, const struct componentname *tcnp) +{ + zfsvfs_t *zfsvfs; + struct vnode *nvp, *svp, *tvp; + znode_t *sdzp, *tdzp, *szp, *tzp; + const char *snm = scnp->cn_nameptr; + const char *tnm = tcnp->cn_nameptr; + int error; + + VOP_UNLOCK1(tdvp); + if (*tvpp != NULL && *tvpp != tdvp) + VOP_UNLOCK1(*tvpp); + +relock: + error = vn_lock(sdvp, LK_EXCLUSIVE); + if (error) + goto out; + sdzp = VTOZ(sdvp); + + error = vn_lock(tdvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + if (error != EBUSY) + goto out; + error = vn_lock(tdvp, LK_EXCLUSIVE); + if (error) + goto out; + VOP_UNLOCK1(tdvp); + goto relock; + } + tdzp = VTOZ(tdvp); + + /* + * Before using sdzp and tdzp we must ensure that they are live. + * As a porting legacy from illumos we have two things to worry + * about. One is typical for FreeBSD and it is that the vnode is + * not reclaimed (doomed). The other is that the znode is live. + * The current code can invalidate the znode without acquiring the + * corresponding vnode lock if the object represented by the znode + * and vnode is no longer valid after a rollback or receive operation. + * z_teardown_lock hidden behind ZFS_ENTER and ZFS_EXIT is the lock + * that protects the znodes from the invalidation. + */ + zfsvfs = sdzp->z_zfsvfs; + ASSERT3P(zfsvfs, ==, tdzp->z_zfsvfs); + ZFS_ENTER(zfsvfs); + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + error = SET_ERROR(EIO); + goto out; + } + + /* + * Re-resolve svp to be certain it still exists and fetch the + * correct vnode. + */ + error = zfs_dirent_lookup(sdzp, snm, &szp, ZEXISTS); + if (error != 0) { + /* Source entry invalid or not there. */ + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + if ((scnp->cn_flags & ISDOTDOT) != 0 || + (scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.')) + error = SET_ERROR(EINVAL); + goto out; + } + svp = ZTOV(szp); + + /* + * Re-resolve tvp, if it disappeared we just carry on. + */ + error = zfs_dirent_lookup(tdzp, tnm, &tzp, 0); + if (error != 0) { + ZFS_EXIT(zfsvfs); + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + vrele(svp); + if ((tcnp->cn_flags & ISDOTDOT) != 0) + error = SET_ERROR(EINVAL); + goto out; + } + if (tzp != NULL) + tvp = ZTOV(tzp); + else + tvp = NULL; + + /* + * At present the vnode locks must be acquired before z_teardown_lock, + * although it would be more logical to use the opposite order. + */ + ZFS_EXIT(zfsvfs); + + /* + * Now try acquire locks on svp and tvp. + */ + nvp = svp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + if (tvp != NULL) + vrele(tvp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + VOP_UNLOCK1(nvp); + /* + * Concurrent rename race. + * XXX ? 
+ */ + if (nvp == tdvp) { + vrele(nvp); + error = SET_ERROR(EINVAL); + goto out; + } + vrele(*svpp); + *svpp = nvp; + goto relock; + } + vrele(*svpp); + *svpp = nvp; + + if (*tvpp != NULL) + vrele(*tvpp); + *tvpp = NULL; + if (tvp != NULL) { + nvp = tvp; + error = vn_lock(nvp, LK_EXCLUSIVE | LK_NOWAIT); + if (error != 0) { + VOP_UNLOCK1(sdvp); + VOP_UNLOCK1(tdvp); + VOP_UNLOCK1(*svpp); + if (error != EBUSY) { + vrele(nvp); + goto out; + } + error = vn_lock(nvp, LK_EXCLUSIVE); + if (error != 0) { + vrele(nvp); + goto out; + } + vput(nvp); + goto relock; + } + *tvpp = nvp; + } + + return (0); + +out: + return (error); +} + +/* + * Note that we must use VRELE_ASYNC in this function as it walks + * up the directory tree and vrele may need to acquire an exclusive + * lock if a last reference to a vnode is dropped. + */ +static int +zfs_rename_check(znode_t *szp, znode_t *sdzp, znode_t *tdzp) +{ + zfsvfs_t *zfsvfs; + znode_t *zp, *zp1; + uint64_t parent; + int error; + + zfsvfs = tdzp->z_zfsvfs; + if (tdzp == szp) + return (SET_ERROR(EINVAL)); + if (tdzp == sdzp) + return (0); + if (tdzp->z_id == zfsvfs->z_root) + return (0); + zp = tdzp; + for (;;) { + ASSERT(!zp->z_unlinked); + if ((error = sa_lookup(zp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + break; + + if (parent == szp->z_id) { + error = SET_ERROR(EINVAL); + break; + } + if (parent == zfsvfs->z_root) + break; + if (parent == sdzp->z_id) + break; + + error = zfs_zget(zfsvfs, parent, &zp1); + if (error != 0) + break; + + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq( + dmu_objset_pool(zfsvfs->z_os))); + zp = zp1; + } + + if (error == ENOTDIR) + panic("checkpath: .. not a directory\n"); + if (zp != tdzp) + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_zrele_taskq(dmu_objset_pool(zfsvfs->z_os))); + return (error); +} + +/* + * Move an entry from the provided source directory to the target + * directory. Change the entry name as indicated. + * + * IN: sdvp - Source directory containing the "old entry". + * snm - Old entry name. + * tdvp - Target directory to contain the "new entry". + * tnm - New entry name. + * cr - credentials of caller. + * ct - caller context + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * sdvp,tdvp - ctime|mtime updated + */ +/*ARGSUSED*/ +static int +zfs_rename_(vnode_t *sdvp, vnode_t **svpp, struct componentname *scnp, + vnode_t *tdvp, vnode_t **tvpp, struct componentname *tcnp, + cred_t *cr, int log) +{ + zfsvfs_t *zfsvfs; + znode_t *sdzp, *tdzp, *szp, *tzp; + zilog_t *zilog = NULL; + dmu_tx_t *tx; + char *snm = scnp->cn_nameptr; + char *tnm = tcnp->cn_nameptr; + int error = 0; + bool want_seqc_end __maybe_unused = false; + + /* Reject renames across filesystems. */ + if ((*svpp)->v_mount != tdvp->v_mount || + ((*tvpp) != NULL && (*svpp)->v_mount != (*tvpp)->v_mount)) { + error = SET_ERROR(EXDEV); + goto out; + } + + if (zfsctl_is_node(tdvp)) { + error = SET_ERROR(EXDEV); + goto out; + } + + /* + * Lock all four vnodes to ensure safety and semantics of renaming. + */ + error = zfs_rename_relock(sdvp, svpp, tdvp, tvpp, scnp, tcnp); + if (error != 0) { + /* no vnodes are locked in the case of error here */ + return (error); + } + + tdzp = VTOZ(tdvp); + sdzp = VTOZ(sdvp); + zfsvfs = tdzp->z_zfsvfs; + zilog = zfsvfs->z_log; + + /* + * After we re-enter ZFS_ENTER() we will have to revalidate all + * znodes involved. 
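 * (Revalidation here means re-checking each znode's z_sa_hdl for
 * NULL: a rollback or receive performed while the vnode locks were
 * dropped can invalidate the znodes, hence the checks below.)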
+ */ + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_utf8 && u8_validate(tnm, + strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + error = SET_ERROR(EILSEQ); + goto unlockout; + } + + /* If source and target are the same file, there is nothing to do. */ + if ((*svpp) == (*tvpp)) { + error = 0; + goto unlockout; + } + + if (((*svpp)->v_type == VDIR && (*svpp)->v_mountedhere != NULL) || + ((*tvpp) != NULL && (*tvpp)->v_type == VDIR && + (*tvpp)->v_mountedhere != NULL)) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * We can not use ZFS_VERIFY_ZP() here because it could directly return + * bypassing the cleanup code in the case of an error. + */ + if (tdzp->z_sa_hdl == NULL || sdzp->z_sa_hdl == NULL) { + error = SET_ERROR(EIO); + goto unlockout; + } + + szp = VTOZ(*svpp); + tzp = *tvpp == NULL ? NULL : VTOZ(*tvpp); + if (szp->z_sa_hdl == NULL || (tzp != NULL && tzp->z_sa_hdl == NULL)) { + error = SET_ERROR(EIO); + goto unlockout; + } + + /* + * This is to prevent the creation of links into attribute space + * by renaming a linked file into/out of an attribute directory. + * See the comment in zfs_link() for why this is considered bad. + */ + if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) { + error = SET_ERROR(EINVAL); + goto unlockout; + } + + /* + * If we are using project inheritance, meaning the directory has + * ZFS_PROJINHERIT set, then its descendant directories will inherit + * not only the project ID but also the ZFS_PROJINHERIT flag. In + * that case, we only allow renames into our tree when the project + * IDs are the same. + */ + if (tdzp->z_pflags & ZFS_PROJINHERIT && + tdzp->z_projid != szp->z_projid) { + error = SET_ERROR(EXDEV); + goto unlockout; + } + + /* + * Must have write access at the source to remove the old entry + * and write access at the target to create the new entry. + * Note that if target and source are the same, this can be + * done in a single check. + */ + if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr))) + goto unlockout; + + if ((*svpp)->v_type == VDIR) { + /* + * Avoid ".", "..", and aliases of "." for obvious reasons. + */ + if ((scnp->cn_namelen == 1 && scnp->cn_nameptr[0] == '.') || + sdzp == szp || + (scnp->cn_flags | tcnp->cn_flags) & ISDOTDOT) { + error = EINVAL; + goto unlockout; + } + + /* + * Check to make sure rename is valid. + * Can't do a move like this: /usr/a/b to /usr/a/b/c/d + */ + if ((error = zfs_rename_check(szp, sdzp, tdzp))) + goto unlockout; + } + + /* + * Does target exist? + */ + if (tzp) { + /* + * Source and target must be the same type. + */ + if ((*svpp)->v_type == VDIR) { + if ((*tvpp)->v_type != VDIR) { + error = SET_ERROR(ENOTDIR); + goto unlockout; + } else { + cache_purge(tdvp); + if (sdvp != tdvp) + cache_purge(sdvp); + } + } else { + if ((*tvpp)->v_type == VDIR) { + error = SET_ERROR(EISDIR); + goto unlockout; + } + } + } + + vn_seqc_write_begin(*svpp); + vn_seqc_write_begin(sdvp); + if (*tvpp != NULL) + vn_seqc_write_begin(*tvpp); + if (tdvp != *tvpp) + vn_seqc_write_begin(tdvp); +#if __FreeBSD_version >= 1300102 + want_seqc_end = true; +#endif + vnevent_rename_src(*svpp, sdvp, scnp->cn_nameptr, ct); + if (tzp) + vnevent_rename_dest(*tvpp, tdvp, tnm, ct); + + /* + * Notify the target directory if it is not the same + * as the source directory.
+ */ + if (tdvp != sdvp) { + vnevent_rename_dest_dir(tdvp, ct); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm); + if (sdzp != tdzp) { + dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tdzp); + } + if (tzp) { + dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, tzp); + } + + zfs_sa_upgrade_txholds(tx, szp); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto unlockout; + } + + + if (tzp) /* Attempt to remove the existing target */ + error = zfs_link_destroy(tdzp, tnm, tzp, tx, 0, NULL); + + if (error == 0) { + error = zfs_link_create(tdzp, tnm, szp, tx, ZRENAMING); + if (error == 0) { + szp->z_pflags |= ZFS_AV_MODIFIED; + + error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + (void *)&szp->z_pflags, sizeof (uint64_t), tx); + ASSERT0(error); + + error = zfs_link_destroy(sdzp, snm, szp, tx, ZRENAMING, + NULL); + if (error == 0) { + zfs_log_rename(zilog, tx, TX_RENAME, sdzp, + snm, tdzp, tnm, szp); + + /* + * Update path information for the target vnode + */ + vn_renamepath(tdvp, *svpp, tnm, strlen(tnm)); + } else { + /* + * At this point, we have successfully created + * the target name, but have failed to remove + * the source name. Since the create was done + * with the ZRENAMING flag, there are + * complications; for one, the link count is + * wrong. The easiest way to deal with this + * is to remove the newly created target, and + * return the original error. This must + * succeed; fortunately, it is very unlikely to + * fail, since we just created it. + */ + VERIFY3U(zfs_link_destroy(tdzp, tnm, szp, tx, + ZRENAMING, NULL), ==, 0); + } + } + if (error == 0) { + cache_purge(*svpp); + if (*tvpp != NULL) + cache_purge(*tvpp); + cache_purge_negative(tdvp); + } + } + + dmu_tx_commit(tx); + +unlockout: /* all 4 vnodes are locked, ZFS_ENTER called */ + ZFS_EXIT(zfsvfs); + if (want_seqc_end) { + vn_seqc_write_end(*svpp); + vn_seqc_write_end(sdvp); + if (*tvpp != NULL) + vn_seqc_write_end(*tvpp); + if (tdvp != *tvpp) + vn_seqc_write_end(tdvp); + want_seqc_end = false; + } + VOP_UNLOCK1(*svpp); + VOP_UNLOCK1(sdvp); + +out: /* original two vnodes are locked */ + MPASS(!want_seqc_end); + if (error == 0 && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (*tvpp != NULL) + VOP_UNLOCK1(*tvpp); + if (tdvp != *tvpp) + VOP_UNLOCK1(tdvp); + return (error); +} + +int +zfs_rename(znode_t *sdzp, char *sname, znode_t *tdzp, char *tname, + cred_t *cr, int flags) +{ + struct componentname scn, tcn; + vnode_t *sdvp, *tdvp; + vnode_t *svp, *tvp; + int error; + svp = tvp = NULL; + + sdvp = ZTOV(sdzp); + tdvp = ZTOV(tdzp); + error = zfs_lookup_internal(sdzp, sname, &svp, &scn, DELETE); + if (sdzp->z_zfsvfs->z_replay == B_FALSE) + VOP_UNLOCK1(sdvp); + if (error != 0) + goto fail; + VOP_UNLOCK1(svp); + + vn_lock(tdvp, LK_EXCLUSIVE | LK_RETRY); + error = zfs_lookup_internal(tdzp, tname, &tvp, &tcn, RENAME); + if (error == EJUSTRETURN) + tvp = NULL; + else if (error != 0) { + VOP_UNLOCK1(tdvp); + goto fail; + } + + error = zfs_rename_(sdvp, &svp, &scn, tdvp, &tvp, &tcn, cr, 0); +fail: + if (svp != NULL) + vrele(svp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +/* + * Insert the indicated symbolic reference entry into the directory. 
+ *
+ * IN:	dvp	- Directory to contain new symbolic link.
+ *	name	- Name of the directory entry for the new symlink.
+ *	link	- Target path of the new symlink.
+ *	vap	- Attributes of new entry.
+ *	cr	- credentials of caller.
+ *	ct	- caller context
+ *	flags	- case flags
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	dvp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, const char *name, vattr_t *vap,
+    const char *link, znode_t **zpp, cred_t *cr, int flags)
+{
+	znode_t		*zp;
+	dmu_tx_t	*tx;
+	zfsvfs_t	*zfsvfs = dzp->z_zfsvfs;
+	zilog_t		*zilog;
+	uint64_t	len = strlen(link);
+	int		error;
+	zfs_acl_ids_t	acl_ids;
+	boolean_t	fuid_dirtied;
+	uint64_t	txtype = TX_SYMLINK;
+
+	ASSERT(vap->va_type == VLNK);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(dzp);
+	zilog = zfsvfs->z_log;
+
+	if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+	    NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	if (len > MAXPATHLEN) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(ENAMETOOLONG));
+	}
+
+	if ((error = zfs_acl_ids_create(dzp, 0,
+	    vap, cr, NULL, &acl_ids)) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
+	 */
+	error = zfs_dirent_lookup(dzp, name, &zp, ZNEW);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	if (zfs_acl_ids_overquota(zfsvfs, &acl_ids,
+	    0 /* projid */)) {
+		zfs_acl_ids_free(&acl_ids);
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EDQUOT));
+	}
+
+	getnewvnode_reserve_();
+	tx = dmu_tx_create(zfsvfs->z_os);
+	fuid_dirtied = zfsvfs->z_fuid_dirty;
+	dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+	dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+	dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+	    ZFS_SA_BASE_ATTR_SIZE + len);
+	dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+	if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+		dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+		    acl_ids.z_aclp->z_acl_bytes);
+	}
+	if (fuid_dirtied)
+		zfs_fuid_txhold(zfsvfs, tx);
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		zfs_acl_ids_free(&acl_ids);
+		dmu_tx_abort(tx);
+		getnewvnode_drop_reserve();
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Create a new object for the symlink.
+	 * For version 4 ZPL datasets the symlink will be an SA attribute.
+	 */
+	zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+	if (fuid_dirtied)
+		zfs_fuid_sync(zfsvfs, tx);
+
+	if (zp->z_is_sa)
+		error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+		    __DECONST(void *, link), len, tx);
+	else
+		zfs_sa_symlink(zp, __DECONST(char *, link), len, tx);
+
+	zp->z_size = len;
+	(void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx);
+	/*
+	 * Insert the new object into the directory.
+	 */
+	(void) zfs_link_create(dzp, name, zp, tx, ZNEW);
+
+	zfs_log_symlink(zilog, tx, txtype, dzp, zp,
+	    __DECONST(char *, name), __DECONST(char *, link));
+	*zpp = zp;
+
+	zfs_acl_ids_free(&acl_ids);
+
+	dmu_tx_commit(tx);
+
+	getnewvnode_drop_reserve();
+
+	if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+		zil_commit(zilog, 0);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by vp.
+ *
+ * IN:	vp	- vnode of symbolic link.
+ *	uio	- structure to contain the link path.
+ *	cr	- credentials of caller.
+ *	ct	- caller context
+ *
+ * OUT:	uio	- structure containing the link path.
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	vp - atime updated
+ */
+/* ARGSUSED */
+static int
+zfs_readlink(vnode_t *vp, uio_t *uio, cred_t *cr, caller_context_t *ct)
+{
+	znode_t		*zp = VTOZ(vp);
+	zfsvfs_t	*zfsvfs = zp->z_zfsvfs;
+	int		error;
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(zp);
+
+	if (zp->z_is_sa)
+		error = sa_lookup_uio(zp->z_sa_hdl,
+		    SA_ZPL_SYMLINK(zfsvfs), uio);
+	else
+		error = zfs_sa_readlink(zp, uio);
+
+	ZFS_ACCESSTIME_STAMP(zfsvfs, zp);
+
+	ZFS_EXIT(zfsvfs);
+	return (error);
+}
+
+/*
+ * Insert a new entry into directory tdvp referencing svp.
+ *
+ * IN:	tdvp	- Directory to contain new entry.
+ *	svp	- vnode of new entry.
+ *	name	- name of new entry.
+ *	cr	- credentials of caller.
+ *
+ * RETURN:	0 on success, error code on failure.
+ *
+ * Timestamps:
+ *	tdvp - ctime|mtime updated
+ *	svp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+    int flags)
+{
+	znode_t		*tzp;
+	zfsvfs_t	*zfsvfs = tdzp->z_zfsvfs;
+	zilog_t		*zilog;
+	dmu_tx_t	*tx;
+	int		error;
+	uint64_t	parent;
+	uid_t		owner;
+
+	ASSERT(ZTOV(tdzp)->v_type == VDIR);
+
+	ZFS_ENTER(zfsvfs);
+	ZFS_VERIFY_ZP(tdzp);
+	zilog = zfsvfs->z_log;
+
+	/*
+	 * POSIX dictates that we return EPERM here.
+	 * Better choices include ENOTSUP or EISDIR.
+	 */
+	if (ZTOV(szp)->v_type == VDIR) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	ZFS_VERIFY_ZP(szp);
+
+	/*
+	 * If project inheritance is in effect, i.e. the directory has
+	 * ZFS_PROJINHERIT set, then its descendant directories inherit
+	 * not only the project ID but also the ZFS_PROJINHERIT flag. In
+	 * that case we only allow hard link creation in our tree when the
+	 * project IDs are the same.
+	 */
+	if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+	    tdzp->z_projid != szp->z_projid) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EXDEV));
+	}
+
+	if (szp->z_pflags & (ZFS_APPENDONLY |
+	    ZFS_IMMUTABLE | ZFS_READONLY)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	/* Prevent links to .zfs/shares files */
+
+	if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+	    &parent, sizeof (uint64_t))) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+	if (parent == zfsvfs->z_shares_dir) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if (zfsvfs->z_utf8 && u8_validate(name,
+	    strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EILSEQ));
+	}
+
+	/*
+	 * We do not support links between attributes and non-attributes
+	 * because of the potential security risk of creating links
+	 * into "normal" file space in order to circumvent restrictions
+	 * imposed in attribute space.
+	 */
+	if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EINVAL));
+	}
+
+	owner = zfs_fuid_map_id(zfsvfs, szp->z_uid, cr, ZFS_OWNER);
+	if (owner != crgetuid(cr) && secpolicy_basic_link(ZTOV(szp), cr) != 0) {
+		ZFS_EXIT(zfsvfs);
+		return (SET_ERROR(EPERM));
+	}
+
+	if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+		ZFS_EXIT(zfsvfs);
+		return (error);
+	}
+
+	/*
+	 * Attempt to lock directory; fail if entry already exists.
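+	 * ZNEW directs zfs_dirent_lookup() to fail with EEXIST when the
+	 * name is already present instead of returning the existing znode.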
+ */ + error = zfs_dirent_lookup(tdzp, name, &tzp, ZNEW); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name); + zfs_sa_upgrade_txholds(tx, szp); + zfs_sa_upgrade_txholds(tx, tdzp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_create(tdzp, name, szp, tx, 0); + + if (error == 0) { + uint64_t txtype = TX_LINK; + zfs_log_link(zilog, tx, txtype, tdzp, szp, name); + } + + dmu_tx_commit(tx); + + if (error == 0) { + vnevent_link(ZTOV(szp), ct); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: ip - inode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * ip - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. + */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +static void +zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + if (zp->z_sa_hdl == NULL) { + /* + * The fs has been unmounted, or we did a + * suspend/resume and this file no longer exists. + */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_unlinked) { + /* + * Fast path to recycle a vnode of a removed file. 
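+		 * The znode is already marked unlinked, so final cleanup
+		 * happens in zfs_freebsd_reclaim(); skip the atime update
+		 * below and let the vnode be recycled right away.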
+ */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + vrecycle(vp); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == 0) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&zp->z_atime, sizeof (zp->z_atime), tx); + zp->z_atime_dirty = 0; + dmu_tx_commit(tx); + } + } + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + + +CTASSERT(sizeof (struct zfid_short) <= sizeof (struct fid)); +CTASSERT(sizeof (struct zfid_long) <= sizeof (struct fid)); + +/*ARGSUSED*/ +static int +zfs_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = (zfsvfs->z_parent != zfsvfs) ? LONG_FID_LEN : SHORT_FID_LEN; + fidp->fid_len = size; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + if (size == LONG_FID_LEN) { + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + zfid_long_t *zlfid; + + zlfid = (zfid_long_t *)fidp; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + /* XXX - this should be the generation number for the objset */ + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +static int +zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + + switch (cmd) { + case _PC_LINK_MAX: + *valp = MIN(LONG_MAX, ZFS_LINK_MAX); + return (0); + + case _PC_FILESIZEBITS: + *valp = 64; + return (0); + case _PC_MIN_HOLE_SIZE: + *valp = (int)SPA_MINBLOCKSIZE; + return (0); + case _PC_ACL_EXTENDED: + *valp = 0; + return (0); + + case _PC_ACL_NFS4: + *valp = 1; + return (0); + + case _PC_ACL_PATH_MAX: + *valp = ACL_MAX_ENTRIES; + return (0); + + default: + return (EOPNOTSUPP); + } +} + +/*ARGSUSED*/ +static int +zfs_getsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? 
B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +static int +zfs_getpages(struct vnode *vp, vm_page_t *ma, int count, int *rbehind, + int *rahead) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + objset_t *os = zp->z_zfsvfs->z_os; + zfs_locked_range_t *lr; + vm_object_t object; + off_t start, end, obj_size; + uint_t blksz; + int pgsin_b, pgsin_a; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + start = IDX_TO_OFF(ma[0]->pindex); + end = IDX_TO_OFF(ma[count - 1]->pindex + 1); + + /* + * Lock a range covering all required and optional pages. + * Note that we need to handle the case of the block size growing. + */ + for (;;) { + blksz = zp->z_blksz; + lr = zfs_rangelock_tryenter(&zp->z_rangelock, + rounddown(start, blksz), + roundup(end, blksz) - rounddown(start, blksz), RL_READER); + if (lr == NULL) { + if (rahead != NULL) { + *rahead = 0; + rahead = NULL; + } + if (rbehind != NULL) { + *rbehind = 0; + rbehind = NULL; + } + break; + } + if (blksz == zp->z_blksz) + break; + zfs_rangelock_exit(lr); + } + + object = ma[0]->object; + zfs_vmobject_wlock(object); + obj_size = object->un_pager.vnp.vnp_size; + zfs_vmobject_wunlock(object); + if (IDX_TO_OFF(ma[count - 1]->pindex) >= obj_size) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (zfs_vm_pagerret_bad); + } + + pgsin_b = 0; + if (rbehind != NULL) { + pgsin_b = OFF_TO_IDX(start - rounddown(start, blksz)); + pgsin_b = MIN(*rbehind, pgsin_b); + } + + pgsin_a = 0; + if (rahead != NULL) { + pgsin_a = OFF_TO_IDX(roundup(end, blksz) - end); + if (end + IDX_TO_OFF(pgsin_a) >= obj_size) + pgsin_a = OFF_TO_IDX(round_page(obj_size) - end); + pgsin_a = MIN(*rahead, pgsin_a); + } + + /* + * NB: we need to pass the exact byte size of the data that we expect + * to read after accounting for the file size. This is required because + * ZFS will panic if we request DMU to read beyond the end of the last + * allocated block. 
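+	 * The last page of the request covers [end - PAGE_SIZE, end), so
+	 * after clamping to the object size the number of valid bytes in
+	 * that page is MIN(end, obj_size) - (end - PAGE_SIZE), which is
+	 * the byte count passed to dmu_read_pages() below.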
+ */ + error = dmu_read_pages(os, zp->z_id, ma, count, &pgsin_b, &pgsin_a, + MIN(end, obj_size) - (end - PAGE_SIZE)); + + zfs_rangelock_exit(lr); + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); + ZFS_EXIT(zfsvfs); + + if (error != 0) + return (zfs_vm_pagerret_error); + + VM_CNT_INC(v_vnodein); + VM_CNT_ADD(v_vnodepgsin, count + pgsin_b + pgsin_a); + if (rbehind != NULL) + *rbehind = pgsin_b; + if (rahead != NULL) + *rahead = pgsin_a; + return (zfs_vm_pagerret_ok); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int *a_rbehind; + int *a_rahead; +}; +#endif + +static int +zfs_freebsd_getpages(struct vop_getpages_args *ap) +{ + + return (zfs_getpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_rbehind, + ap->a_rahead)); +} + +static int +zfs_putpages(struct vnode *vp, vm_page_t *ma, size_t len, int flags, + int *rtvals) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + dmu_tx_t *tx; + struct sf_buf *sf; + vm_object_t object; + vm_page_t m; + caddr_t va; + size_t tocopy; + size_t lo_len; + vm_ooffset_t lo_off; + vm_ooffset_t off; + uint_t blksz; + int ncount; + int pcount; + int err; + int i; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + object = vp->v_object; + pcount = btoc(len); + ncount = pcount; + + KASSERT(ma[0]->object == object, ("mismatching object")); + KASSERT(len > 0 && (len & PAGE_MASK) == 0, ("unexpected length")); + + for (i = 0; i < pcount; i++) + rtvals[i] = zfs_vm_pagerret_error; + + off = IDX_TO_OFF(ma[0]->pindex); + blksz = zp->z_blksz; + lo_off = rounddown(off, blksz); + lo_len = roundup(len + (off - lo_off), blksz); + lr = zfs_rangelock_enter(&zp->z_rangelock, lo_off, lo_len, RL_WRITER); + + zfs_vmobject_wlock(object); + if (len + off > object->un_pager.vnp.vnp_size) { + if (object->un_pager.vnp.vnp_size > off) { + int pgoff; + + len = object->un_pager.vnp.vnp_size - off; + ncount = btoc(len); + if ((pgoff = (int)len & PAGE_MASK) != 0) { + /* + * If the object is locked and the following + * conditions hold, then the page's dirty + * field cannot be concurrently changed by a + * pmap operation. + */ + m = ma[ncount - 1]; + vm_page_assert_sbusied(m); + KASSERT(!pmap_page_is_write_mapped(m), + ("zfs_putpages: page %p is not read-only", + m)); + vm_page_clear_dirty(m, pgoff, PAGE_SIZE - + pgoff); + } + } else { + len = 0; + ncount = 0; + } + if (ncount < pcount) { + for (i = ncount; i < pcount; i++) { + rtvals[i] = zfs_vm_pagerret_bad; + } + } + } + zfs_vmobject_wunlock(object); + + if (ncount == 0) + goto out; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, zp->z_uid) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, zp->z_gid) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + goto out; + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_write(tx, zp->z_id, off, len); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + goto out; + } + + if (zp->z_blksz < PAGE_SIZE) { + for (i = 0; len > 0; off += tocopy, len -= tocopy, i++) { + tocopy = len > PAGE_SIZE ? 
PAGE_SIZE : len; + va = zfs_map_page(ma[i], &sf); + dmu_write(zfsvfs->z_os, zp->z_id, off, tocopy, va, tx); + zfs_unmap_page(sf); + } + } else { + err = dmu_write_pages(zfsvfs->z_os, zp->z_id, off, len, ma, tx); + } + + if (err == 0) { + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT0(err); + /* + * XXX we should be passing a callback to undirty + * but that would make the locking messier + */ + zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, + len, 0, NULL, NULL); + + zfs_vmobject_wlock(object); + for (i = 0; i < ncount; i++) { + rtvals[i] = zfs_vm_pagerret_ok; + vm_page_undirty(ma[i]); + } + zfs_vmobject_wunlock(object); + VM_CNT_INC(v_vnodeout); + VM_CNT_ADD(v_vnodepgsout, ncount); + } + dmu_tx_commit(tx); + +out: + zfs_rangelock_exit(lr); + if ((flags & (zfs_vm_pagerput_sync | zfs_vm_pagerput_inval)) != 0 || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + return (rtvals[0]); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_putpages_args { + struct vnode *a_vp; + vm_page_t *a_m; + int a_count; + int a_sync; + int *a_rtvals; +}; +#endif + +static int +zfs_freebsd_putpages(struct vop_putpages_args *ap) +{ + + return (zfs_putpages(ap->a_vp, ap->a_m, ap->a_count, ap->a_sync, + ap->a_rtvals)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_bmap_args { + struct vnode *a_vp; + daddr_t a_bn; + struct bufobj **a_bop; + daddr_t *a_bnp; + int *a_runp; + int *a_runb; +}; +#endif + +static int +zfs_freebsd_bmap(struct vop_bmap_args *ap) +{ + + if (ap->a_bop != NULL) + *ap->a_bop = &ap->a_vp->v_bufobj; + if (ap->a_bnp != NULL) + *ap->a_bnp = ap->a_bn; + if (ap->a_runp != NULL) + *ap->a_runp = 0; + if (ap->a_runb != NULL) + *ap->a_runb = 0; + + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_open_args { + struct vnode *a_vp; + int a_mode; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_open(struct vop_open_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + int error; + + error = zfs_open(&vp, ap->a_mode, ap->a_cred); + if (error == 0) + vnode_create_vobject(vp, zp->z_size, ap->a_td); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_close_args { + struct vnode *a_vp; + int a_fflag; + struct ucred *a_cred; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_close(struct vop_close_args *ap) +{ + + return (zfs_close(ap->a_vp, ap->a_fflag, 1, 0, ap->a_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_ioctl_args { + struct vnode *a_vp; + ulong_t a_command; + caddr_t a_data; + int a_fflag; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_ioctl(struct vop_ioctl_args *ap) +{ + + return (zfs_ioctl(ap->a_vp, ap->a_command, (intptr_t)ap->a_data, + ap->a_fflag, ap->a_cred, NULL)); +} + +static int +ioflags(int ioflags) +{ + int flags = 0; + + if (ioflags & IO_APPEND) + flags |= FAPPEND; + if (ioflags & IO_NDELAY) + flags |= FNONBLOCK; + if (ioflags & IO_SYNC) + flags |= (FSYNC | FDSYNC | FRSYNC); + + return (flags); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_read_args { + struct vnode *a_vp; + struct uio *a_uio; + int a_ioflag; + struct ucred *a_cred; +}; +#endif + 
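+/*
+ * VOP_READ wrapper: translate the FreeBSD ioflag bits (IO_APPEND,
+ * IO_NDELAY, IO_SYNC) into the FAPPEND/FNONBLOCK/FSYNC-style flags
+ * expected by the common zfs_read() code; see ioflags() above.
+ */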
+static int
+zfs_freebsd_read(struct vop_read_args *ap)
+{
+
+	return (zfs_read(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+	    ap->a_cred));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_write_args {
+	struct vnode *a_vp;
+	struct uio *a_uio;
+	int a_ioflag;
+	struct ucred *a_cred;
+};
+#endif
+
+static int
+zfs_freebsd_write(struct vop_write_args *ap)
+{
+
+	return (zfs_write(ap->a_vp, ap->a_uio, ioflags(ap->a_ioflag),
+	    ap->a_cred));
+}
+
+#if __FreeBSD_version >= 1300102
+/*
+ * VOP_FPLOOKUP_VEXEC routines are subject to special circumstances, see
+ * the comment above cache_fplookup for details.
+ */
+static int
+zfs_freebsd_fplookup_vexec(struct vop_fplookup_vexec_args *v)
+{
+	vnode_t *vp;
+	znode_t *zp;
+	uint64_t pflags;
+
+	vp = v->a_vp;
+	zp = VTOZ_SMR(vp);
+	if (__predict_false(zp == NULL))
+		return (EAGAIN);
+	pflags = atomic_load_64(&zp->z_pflags);
+	if (pflags & ZFS_AV_QUARANTINED)
+		return (EAGAIN);
+	if (pflags & ZFS_XATTR)
+		return (EAGAIN);
+	if ((pflags & ZFS_NO_EXECS_DENIED) == 0)
+		return (EAGAIN);
+	return (0);
+}
+#endif
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_access_args {
+	struct vnode *a_vp;
+	accmode_t a_accmode;
+	struct ucred *a_cred;
+	struct thread *a_td;
+};
+#endif
+
+static int
+zfs_freebsd_access(struct vop_access_args *ap)
+{
+	vnode_t *vp = ap->a_vp;
+	znode_t *zp = VTOZ(vp);
+	accmode_t accmode;
+	int error = 0;
+
+	if (ap->a_accmode == VEXEC) {
+		if (zfs_fastaccesschk_execute(zp, ap->a_cred) == 0)
+			return (0);
+	}
+
+	/*
+	 * ZFS itself only knows about VREAD, VWRITE, VEXEC and VAPPEND.
+	 */
+	accmode = ap->a_accmode & (VREAD|VWRITE|VEXEC|VAPPEND);
+	if (accmode != 0)
+		error = zfs_access(ap->a_vp, accmode, 0, ap->a_cred, NULL);
+
+	/*
+	 * VADMIN has to be handled by vaccess().
+	 */
+	if (error == 0) {
+		accmode = ap->a_accmode & ~(VREAD|VWRITE|VEXEC|VAPPEND);
+		if (accmode != 0) {
+#if __FreeBSD_version >= 1300105
+			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+			    zp->z_gid, accmode, ap->a_cred);
+#else
+			error = vaccess(vp->v_type, zp->z_mode, zp->z_uid,
+			    zp->z_gid, accmode, ap->a_cred, NULL);
+#endif
+		}
+	}
+
+	/*
+	 * For VEXEC, ensure that at least one execute bit is set for
+	 * non-directories.
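+	 * Without this, a caller whose privileges satisfied the checks
+	 * above could execute a file whose mode grants execute
+	 * permission to no one at all.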
+ */ + if (error == 0 && (ap->a_accmode & VEXEC) != 0 && vp->v_type != VDIR && + (zp->z_mode & (S_IXUSR | S_IXGRP | S_IXOTH)) == 0) { + error = EACCES; + } + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_lookup(struct vop_lookup_args *ap, boolean_t cached) +{ + struct componentname *cnp = ap->a_cnp; + char nm[NAME_MAX + 1]; + + ASSERT(cnp->cn_namelen < sizeof (nm)); + strlcpy(nm, cnp->cn_nameptr, MIN(cnp->cn_namelen + 1, sizeof (nm))); + + return (zfs_lookup(ap->a_dvp, nm, ap->a_vpp, cnp, cnp->cn_nameiop, + cnp->cn_cred, cnp->cn_thread, 0, cached)); +} + +static int +zfs_freebsd_cachedlookup(struct vop_cachedlookup_args *ap) +{ + + return (zfs_freebsd_lookup((struct vop_lookup_args *)ap, B_TRUE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_lookup_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_cache_lookup(struct vop_lookup_args *ap) +{ + zfsvfs_t *zfsvfs; + + zfsvfs = ap->a_dvp->v_mount->mnt_data; + if (zfsvfs->z_use_namecache) + return (vfs_cache_lookup(ap)); + else + return (zfs_freebsd_lookup(ap, B_FALSE)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_create_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; +}; +#endif + +static int +zfs_freebsd_create(struct vop_create_args *ap) +{ + zfsvfs_t *zfsvfs; + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc, mode; + + ASSERT(cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + mode = vap->va_mode & ALLPERMS; + zfsvfs = ap->a_dvp->v_mount->mnt_data; + *ap->a_vpp = NULL; + + rc = zfs_create(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, !EXCL, mode, + &zp, cnp->cn_cred, 0 /* flag */, NULL /* vsecattr */); + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + if (zfsvfs->z_use_namecache && + rc == 0 && (cnp->cn_flags & MAKEENTRY) != 0) + cache_enter(ap->a_dvp, *ap->a_vpp, cnp); + + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_remove_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_remove(struct vop_remove_args *ap) +{ + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + return (zfs_remove_(ap->a_dvp, ap->a_vp, ap->a_cnp->cn_nameptr, + ap->a_cnp->cn_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_mkdir_args { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; +}; +#endif + +static int +zfs_freebsd_mkdir(struct vop_mkdir_args *ap) +{ + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc; + + ASSERT(ap->a_cnp->cn_flags & SAVENAME); + + vattr_init_mask(vap); + *ap->a_vpp = NULL; + + rc = zfs_mkdir(VTOZ(ap->a_dvp), ap->a_cnp->cn_nameptr, vap, &zp, + ap->a_cnp->cn_cred, 0, NULL); + + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_rmdir_args { + struct vnode *a_dvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_rmdir(struct vop_rmdir_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_rmdir_(ap->a_dvp, ap->a_vp, cnp->cn_nameptr, cnp->cn_cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_readdir_args { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; + int *a_eofflag; + int *a_ncookies; + ulong_t **a_cookies; +}; +#endif + +static int 
+zfs_freebsd_readdir(struct vop_readdir_args *ap) +{ + + return (zfs_readdir(ap->a_vp, ap->a_uio, ap->a_cred, ap->a_eofflag, + ap->a_ncookies, ap->a_cookies)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_fsync_args { + struct vnode *a_vp; + int a_waitfor; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_fsync(struct vop_fsync_args *ap) +{ + + vop_stdfsync(ap); + return (zfs_fsync(ap->a_vp, 0, ap->a_td->td_ucred, NULL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_getattr(struct vop_getattr_args *ap) +{ + vattr_t *vap = ap->a_vap; + xvattr_t xvap; + ulong_t fflags = 0; + int error; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + xvap.xva_vattr.va_mask |= AT_XVATTR; + + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + XVA_SET_REQ(&xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&xvap, XAT_APPENDONLY); + XVA_SET_REQ(&xvap, XAT_NOUNLINK); + XVA_SET_REQ(&xvap, XAT_NODUMP); + XVA_SET_REQ(&xvap, XAT_READONLY); + XVA_SET_REQ(&xvap, XAT_ARCHIVE); + XVA_SET_REQ(&xvap, XAT_SYSTEM); + XVA_SET_REQ(&xvap, XAT_HIDDEN); + XVA_SET_REQ(&xvap, XAT_REPARSE); + XVA_SET_REQ(&xvap, XAT_OFFLINE); + XVA_SET_REQ(&xvap, XAT_SPARSE); + + error = zfs_getattr(ap->a_vp, (vattr_t *)&xvap, 0, ap->a_cred); + if (error != 0) + return (error); + + /* Convert ZFS xattr into chflags. */ +#define FLAG_CHECK(fflag, xflag, xfield) do { \ + if (XVA_ISSET_RTN(&xvap, (xflag)) && (xfield) != 0) \ + fflags |= (fflag); \ +} while (0) + FLAG_CHECK(SF_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHECK(SF_APPEND, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHECK(SF_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHECK(UF_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHECK(UF_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHECK(UF_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHECK(UF_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHECK(UF_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHECK(UF_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHECK(UF_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHECK(UF_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); + +#undef FLAG_CHECK + *vap = xvap.xva_vattr; + vap->va_flags = fflags; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setattr_args { + struct vnode *a_vp; + struct vattr *a_vap; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_setattr(struct vop_setattr_args *ap) +{ + vnode_t *vp = ap->a_vp; + vattr_t *vap = ap->a_vap; + cred_t *cred = ap->a_cred; + xvattr_t xvap; + ulong_t fflags; + uint64_t zflags; + + vattr_init_mask(vap); + vap->va_mask &= ~AT_NOSET; + + xva_init(&xvap); + xvap.xva_vattr = *vap; + + zflags = VTOZ(vp)->z_pflags; + + if (vap->va_flags != VNOVAL) { + zfsvfs_t *zfsvfs = VTOZ(vp)->z_zfsvfs; + int error; + + if (zfsvfs->z_use_fuids == B_FALSE) + return (EOPNOTSUPP); + + fflags = vap->va_flags; + /* + * XXX KDM + * We need to figure out whether it makes sense to allow + * UF_REPARSE through, since we don't really have other + * facilities to handle reparse points and zfs_setattr() + * doesn't currently allow setting that attribute anyway. 
+ */ + if ((fflags & ~(SF_IMMUTABLE|SF_APPEND|SF_NOUNLINK|UF_ARCHIVE| + UF_NODUMP|UF_SYSTEM|UF_HIDDEN|UF_READONLY|UF_REPARSE| + UF_OFFLINE|UF_SPARSE)) != 0) + return (EOPNOTSUPP); + /* + * Unprivileged processes are not permitted to unset system + * flags, or modify flags if any system flags are set. + * Privileged non-jail processes may not modify system flags + * if securelevel > 0 and any existing system flags are set. + * Privileged jail processes behave like privileged non-jail + * processes if the PR_ALLOW_CHFLAGS permission bit is set; + * otherwise, they behave like unprivileged processes. + */ + if (secpolicy_fs_owner(vp->v_mount, cred) == 0 || + spl_priv_check_cred(cred, PRIV_VFS_SYSFLAGS) == 0) { + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | ZFS_NOUNLINK)) { + error = securelevel_gt(cred, 0); + if (error != 0) + return (error); + } + } else { + /* + * Callers may only modify the file flags on + * objects they have VADMIN rights for. + */ + if ((error = VOP_ACCESS(vp, VADMIN, cred, + curthread)) != 0) + return (error); + if (zflags & + (ZFS_IMMUTABLE | ZFS_APPENDONLY | + ZFS_NOUNLINK)) { + return (EPERM); + } + if (fflags & + (SF_IMMUTABLE | SF_APPEND | SF_NOUNLINK)) { + return (EPERM); + } + } + +#define FLAG_CHANGE(fflag, zflag, xflag, xfield) do { \ + if (((fflags & (fflag)) && !(zflags & (zflag))) || \ + ((zflags & (zflag)) && !(fflags & (fflag)))) { \ + XVA_SET_REQ(&xvap, (xflag)); \ + (xfield) = ((fflags & (fflag)) != 0); \ + } \ +} while (0) + /* Convert chflags into ZFS-type flags. */ + /* XXX: what about SF_SETTABLE?. */ + FLAG_CHANGE(SF_IMMUTABLE, ZFS_IMMUTABLE, XAT_IMMUTABLE, + xvap.xva_xoptattrs.xoa_immutable); + FLAG_CHANGE(SF_APPEND, ZFS_APPENDONLY, XAT_APPENDONLY, + xvap.xva_xoptattrs.xoa_appendonly); + FLAG_CHANGE(SF_NOUNLINK, ZFS_NOUNLINK, XAT_NOUNLINK, + xvap.xva_xoptattrs.xoa_nounlink); + FLAG_CHANGE(UF_ARCHIVE, ZFS_ARCHIVE, XAT_ARCHIVE, + xvap.xva_xoptattrs.xoa_archive); + FLAG_CHANGE(UF_NODUMP, ZFS_NODUMP, XAT_NODUMP, + xvap.xva_xoptattrs.xoa_nodump); + FLAG_CHANGE(UF_READONLY, ZFS_READONLY, XAT_READONLY, + xvap.xva_xoptattrs.xoa_readonly); + FLAG_CHANGE(UF_SYSTEM, ZFS_SYSTEM, XAT_SYSTEM, + xvap.xva_xoptattrs.xoa_system); + FLAG_CHANGE(UF_HIDDEN, ZFS_HIDDEN, XAT_HIDDEN, + xvap.xva_xoptattrs.xoa_hidden); + FLAG_CHANGE(UF_REPARSE, ZFS_REPARSE, XAT_REPARSE, + xvap.xva_xoptattrs.xoa_reparse); + FLAG_CHANGE(UF_OFFLINE, ZFS_OFFLINE, XAT_OFFLINE, + xvap.xva_xoptattrs.xoa_offline); + FLAG_CHANGE(UF_SPARSE, ZFS_SPARSE, XAT_SPARSE, + xvap.xva_xoptattrs.xoa_sparse); +#undef FLAG_CHANGE + } + if (vap->va_birthtime.tv_sec != VNOVAL) { + xvap.xva_vattr.va_mask |= AT_XVATTR; + XVA_SET_REQ(&xvap, XAT_CREATETIME); + } + return (zfs_setattr(VTOZ(vp), (vattr_t *)&xvap, 0, cred)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_rename_args { + struct vnode *a_fdvp; + struct vnode *a_fvp; + struct componentname *a_fcnp; + struct vnode *a_tdvp; + struct vnode *a_tvp; + struct componentname *a_tcnp; +}; +#endif + +static int +zfs_freebsd_rename(struct vop_rename_args *ap) +{ + vnode_t *fdvp = ap->a_fdvp; + vnode_t *fvp = ap->a_fvp; + vnode_t *tdvp = ap->a_tdvp; + vnode_t *tvp = ap->a_tvp; + int error; + + ASSERT(ap->a_fcnp->cn_flags & (SAVENAME|SAVESTART)); + ASSERT(ap->a_tcnp->cn_flags & (SAVENAME|SAVESTART)); + + error = zfs_rename_(fdvp, &fvp, ap->a_fcnp, tdvp, &tvp, + ap->a_tcnp, ap->a_fcnp->cn_cred, 1); + + vrele(fdvp); + vrele(fvp); + vrele(tdvp); + if (tvp != NULL) + vrele(tvp); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_symlink_args { + struct vnode 
*a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + struct vattr *a_vap; + char *a_target; +}; +#endif + +static int +zfs_freebsd_symlink(struct vop_symlink_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vattr_t *vap = ap->a_vap; + znode_t *zp = NULL; + int rc; + + ASSERT(cnp->cn_flags & SAVENAME); + + vap->va_type = VLNK; /* FreeBSD: Syscall only sets va_mode. */ + vattr_init_mask(vap); + *ap->a_vpp = NULL; + + rc = zfs_symlink(VTOZ(ap->a_dvp), cnp->cn_nameptr, vap, + ap->a_target, &zp, cnp->cn_cred, 0 /* flags */); + if (rc == 0) + *ap->a_vpp = ZTOV(zp); + return (rc); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_readlink_args { + struct vnode *a_vp; + struct uio *a_uio; + struct ucred *a_cred; +}; +#endif + +static int +zfs_freebsd_readlink(struct vop_readlink_args *ap) +{ + + return (zfs_readlink(ap->a_vp, ap->a_uio, ap->a_cred, NULL)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_link_args { + struct vnode *a_tdvp; + struct vnode *a_vp; + struct componentname *a_cnp; +}; +#endif + +static int +zfs_freebsd_link(struct vop_link_args *ap) +{ + struct componentname *cnp = ap->a_cnp; + vnode_t *vp = ap->a_vp; + vnode_t *tdvp = ap->a_tdvp; + + if (tdvp->v_mount != vp->v_mount) + return (EXDEV); + + ASSERT(cnp->cn_flags & SAVENAME); + + return (zfs_link(VTOZ(tdvp), VTOZ(vp), + cnp->cn_nameptr, cnp->cn_cred, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_inactive_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_inactive(struct vop_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + + zfs_inactive(vp, ap->a_td->td_ucred, NULL); + return (0); +} + +#if __FreeBSD_version >= 1300042 +#ifndef _SYS_SYSPROTO_H_ +struct vop_need_inactive_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_need_inactive(struct vop_need_inactive_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int need; + + if (vn_need_pageq_flush(vp)) + return (1); + + if (!rw_tryenter(&zfsvfs->z_teardown_inactive_lock, RW_READER)) + return (1); + need = (zp->z_sa_hdl == NULL || zp->z_unlinked || zp->z_atime_dirty); + rw_exit(&zfsvfs->z_teardown_inactive_lock); + + return (need); +} +#endif + +#ifndef _SYS_SYSPROTO_H_ +struct vop_reclaim_args { + struct vnode *a_vp; + struct thread *a_td; +}; +#endif + +static int +zfs_freebsd_reclaim(struct vop_reclaim_args *ap) +{ + vnode_t *vp = ap->a_vp; + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + + ASSERT(zp != NULL); + +#if __FreeBSD_version < 1300042 + /* Destroy the vm object and flush associated pages. */ + vnode_destroy_vobject(vp); +#endif + /* + * z_teardown_inactive_lock protects from a race with + * zfs_znode_dmu_fini in zfsvfs_teardown during + * force unmount. 
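+	 * Holding it as reader keeps z_sa_hdl stable while we choose
+	 * between freeing the znode and running zfs_zinactive() below.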
+ */
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER);
+	if (zp->z_sa_hdl == NULL)
+		zfs_znode_free(zp);
+	else
+		zfs_zinactive(zp);
+	rw_exit(&zfsvfs->z_teardown_inactive_lock);
+
+	vp->v_data = NULL;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_fid_args {
+	struct vnode *a_vp;
+	struct fid *a_fid;
+};
+#endif
+
+static int
+zfs_freebsd_fid(struct vop_fid_args *ap)
+{
+
+	return (zfs_fid(ap->a_vp, (void *)ap->a_fid, NULL));
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_pathconf_args {
+	struct vnode *a_vp;
+	int a_name;
+	register_t *a_retval;
+} *ap;
+#endif
+
+static int
+zfs_freebsd_pathconf(struct vop_pathconf_args *ap)
+{
+	ulong_t val;
+	int error;
+
+	error = zfs_pathconf(ap->a_vp, ap->a_name, &val,
+	    curthread->td_ucred, NULL);
+	if (error == 0) {
+		*ap->a_retval = val;
+		return (error);
+	}
+	if (error != EOPNOTSUPP)
+		return (error);
+
+	switch (ap->a_name) {
+	case _PC_NAME_MAX:
+		*ap->a_retval = NAME_MAX;
+		return (0);
+	case _PC_PIPE_BUF:
+		if (ap->a_vp->v_type == VDIR || ap->a_vp->v_type == VFIFO) {
+			*ap->a_retval = PIPE_BUF;
+			return (0);
+		}
+		return (EINVAL);
+	default:
+		return (vop_stdpathconf(ap));
+	}
+}
+
+/*
+ * Each FreeBSD extended attribute namespace maps onto a file name prefix
+ * for the underlying ZFS extended attribute:
+ *
+ * NAMESPACE	PREFIX
+ * system	freebsd:system:
+ * user	(none, can be used to access ZFS fsattr(5) attributes
+ *	created on Solaris)
+ */
+static int
+zfs_create_attrname(int attrnamespace, const char *name, char *attrname,
+    size_t size)
+{
+	const char *namespace, *prefix, *suffix;
+
+	/* We don't allow the '/' character in attribute names. */
+	if (strchr(name, '/') != NULL)
+		return (EINVAL);
+	/* We don't allow attribute names that start with the "freebsd:" prefix. */
+	if (strncmp(name, "freebsd:", 8) == 0)
+		return (EINVAL);
+
+	bzero(attrname, size);
+
+	switch (attrnamespace) {
+	case EXTATTR_NAMESPACE_USER:
+#if 0
+		prefix = "freebsd:";
+		namespace = EXTATTR_NAMESPACE_USER_STRING;
+		suffix = ":";
+#else
+		/*
+		 * This is the default namespace by which we can access all
+		 * attributes created on Solaris.
+		 */
+		prefix = namespace = suffix = "";
+#endif
+		break;
+	case EXTATTR_NAMESPACE_SYSTEM:
+		prefix = "freebsd:";
+		namespace = EXTATTR_NAMESPACE_SYSTEM_STRING;
+		suffix = ":";
+		break;
+	case EXTATTR_NAMESPACE_EMPTY:
+	default:
+		return (EINVAL);
+	}
+	if (snprintf(attrname, size, "%s%s%s%s", prefix, namespace, suffix,
+	    name) >= size) {
+		return (ENAMETOOLONG);
+	}
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct vop_getextattr {
+	IN struct vnode *a_vp;
+	IN int a_attrnamespace;
+	IN const char *a_name;
+	INOUT struct uio *a_uio;
+	OUT size_t *a_size;
+	IN struct ucred *a_cred;
+	IN struct thread *a_td;
+};
+#endif
+
+/*
+ * Vnode operation to retrieve a named extended attribute.
+ */
+static int
+zfs_getextattr(struct vop_getextattr_args *ap)
+{
+	zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs;
+	struct thread *td = ap->a_td;
+	struct nameidata nd;
+	char attrname[255];
+	struct vattr va;
+	vnode_t *xvp = NULL, *vp;
+	int error, flags;
+
+	/*
+	 * If the xattr property is off, refuse the request.
+ */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FREAD; + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0, VN_OPEN_INVFS, ap->a_cred, NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + if (ap->a_size != NULL) { + error = VOP_GETATTR(vp, &va, ap->a_cred); + if (error == 0) + *ap->a_size = (size_t)va.va_size; + } else if (ap->a_uio != NULL) + error = VOP_READ(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK1(vp); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_deleteextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to remove a named attribute. + */ +static int +zfs_deleteextattr(struct vop_deleteextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + vnode_t *xvp = NULL, *vp; + int error; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + NDINIT_ATVP(&nd, DELETE, NOFOLLOW | LOCKPARENT | LOCKLEAF, + UIO_SYSSPACE, attrname, xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + if (error != 0) { + ZFS_EXIT(zfsvfs); + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ENOENT) + error = ENOATTR; + return (error); + } + + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + NDFREE(&nd, NDF_ONLY_PNBUF); + + vput(nd.ni_dvp); + if (vp == nd.ni_dvp) + vrele(vp); + else + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + IN const char *a_name; + INOUT struct uio *a_uio; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to set a named attribute. + */ +static int +zfs_setextattr(struct vop_setextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrname[255]; + struct vattr va; + vnode_t *xvp = NULL, *vp; + int error, flags; + + /* + * If the xattr property is off, refuse the request. 
+ */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VWRITE); + if (error != 0) + return (error); + error = zfs_create_attrname(ap->a_attrnamespace, ap->a_name, attrname, + sizeof (attrname)); + if (error != 0) + return (error); + + ZFS_ENTER(zfsvfs); + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR | CREATE_XATTR_DIR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + flags = FFLAGS(O_WRONLY | O_CREAT); + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW, UIO_SYSSPACE, attrname, + xvp, td); + error = vn_open_cred(&nd, &flags, 0600, VN_OPEN_INVFS, ap->a_cred, + NULL); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + VATTR_NULL(&va); + va.va_size = 0; + error = VOP_SETATTR(vp, &va, ap->a_cred); + if (error == 0) + VOP_WRITE(vp, ap->a_uio, IO_UNIT, ap->a_cred); + + VOP_UNLOCK1(vp); + vn_close(vp, flags, ap->a_cred, td); + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_listextattr { + IN struct vnode *a_vp; + IN int a_attrnamespace; + INOUT struct uio *a_uio; + OUT size_t *a_size; + IN struct ucred *a_cred; + IN struct thread *a_td; +}; +#endif + +/* + * Vnode operation to retrieve extended attributes on a vnode. + */ +static int +zfs_listextattr(struct vop_listextattr_args *ap) +{ + zfsvfs_t *zfsvfs = VTOZ(ap->a_vp)->z_zfsvfs; + struct thread *td = ap->a_td; + struct nameidata nd; + char attrprefix[16]; + uint8_t dirbuf[sizeof (struct dirent)]; + struct dirent *dp; + struct iovec aiov; + struct uio auio, *uio = ap->a_uio; + size_t *sizep = ap->a_size; + size_t plen; + vnode_t *xvp = NULL, *vp; + int done, error, eof, pos; + + /* + * If the xattr property is off, refuse the request. + */ + if (!(zfsvfs->z_flags & ZSB_XATTR)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + error = extattr_check_cred(ap->a_vp, ap->a_attrnamespace, + ap->a_cred, ap->a_td, VREAD); + if (error != 0) + return (error); + + error = zfs_create_attrname(ap->a_attrnamespace, "", attrprefix, + sizeof (attrprefix)); + if (error != 0) + return (error); + plen = strlen(attrprefix); + + ZFS_ENTER(zfsvfs); + + if (sizep != NULL) + *sizep = 0; + + error = zfs_lookup(ap->a_vp, NULL, &xvp, NULL, 0, ap->a_cred, td, + LOOKUP_XATTR, B_FALSE); + if (error != 0) { + ZFS_EXIT(zfsvfs); + /* + * ENOATTR means that the EA directory does not yet exist, + * i.e. there are no extended attributes there. + */ + if (error == ENOATTR) + error = 0; + return (error); + } + + NDINIT_ATVP(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKSHARED, + UIO_SYSSPACE, ".", xvp, td); + error = namei(&nd); + vp = nd.ni_vp; + NDFREE(&nd, NDF_ONLY_PNBUF); + if (error != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; + auio.uio_rw = UIO_READ; + auio.uio_offset = 0; + + do { + uint8_t nlen; + + aiov.iov_base = (void *)dirbuf; + aiov.iov_len = sizeof (dirbuf); + auio.uio_resid = sizeof (dirbuf); + error = VOP_READDIR(vp, &auio, ap->a_cred, &eof, NULL, NULL); + done = sizeof (dirbuf) - auio.uio_resid; + if (error != 0) + break; + for (pos = 0; pos < done; ) { + dp = (struct dirent *)(dirbuf + pos); + pos += dp->d_reclen; + /* + * XXX: Temporarily we also accept DT_UNKNOWN, as this + * is what we get when attribute was created on Solaris. 
+ */ + if (dp->d_type != DT_REG && dp->d_type != DT_UNKNOWN) + continue; + if (plen == 0 && + strncmp(dp->d_name, "freebsd:", 8) == 0) + continue; + else if (strncmp(dp->d_name, attrprefix, plen) != 0) + continue; + nlen = dp->d_namlen - plen; + if (sizep != NULL) + *sizep += 1 + nlen; + else if (uio != NULL) { + /* + * Format of extattr name entry is one byte for + * length and the rest for name. + */ + error = uiomove(&nlen, 1, uio->uio_rw, uio); + if (error == 0) { + error = uiomove(dp->d_name + plen, nlen, + uio->uio_rw, uio); + } + if (error != 0) + break; + } + } + } while (!eof && error == 0); + + vput(vp); + ZFS_EXIT(zfsvfs); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_getacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_getacl(struct vop_getacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + vsecattr.vsa_mask = VSA_ACE | VSA_ACECNT; + if ((error = zfs_getsecattr(ap->a_vp, &vsecattr, 0, ap->a_cred, NULL))) + return (error); + + error = acl_from_aces(ap->a_aclp, vsecattr.vsa_aclentp, + vsecattr.vsa_aclcnt); + if (vsecattr.vsa_aclentp != NULL) + kmem_free(vsecattr.vsa_aclentp, vsecattr.vsa_aclentsz); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_setacl_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_setacl(struct vop_setacl_args *ap) +{ + int error; + vsecattr_t vsecattr; + int aclbsize; /* size of acl list in bytes */ + aclent_t *aaclp; + + if (ap->a_type != ACL_TYPE_NFS4) + return (EINVAL); + + if (ap->a_aclp == NULL) + return (EINVAL); + + if (ap->a_aclp->acl_cnt < 1 || ap->a_aclp->acl_cnt > MAX_ACL_ENTRIES) + return (EINVAL); + + /* + * With NFSv4 ACLs, chmod(2) may need to add additional entries, + * splitting every entry into two and appending "canonical six" + * entries at the end. Don't allow for setting an ACL that would + * cause chmod(2) to run out of ACL entries. + */ + if (ap->a_aclp->acl_cnt * 2 + 6 > ACL_MAX_ENTRIES) + return (ENOSPC); + + error = acl_nfs4_check(ap->a_aclp, ap->a_vp->v_type == VDIR); + if (error != 0) + return (error); + + vsecattr.vsa_mask = VSA_ACE; + aclbsize = ap->a_aclp->acl_cnt * sizeof (ace_t); + vsecattr.vsa_aclentp = kmem_alloc(aclbsize, KM_SLEEP); + aaclp = vsecattr.vsa_aclentp; + vsecattr.vsa_aclentsz = aclbsize; + + aces_from_acl(vsecattr.vsa_aclentp, &vsecattr.vsa_aclcnt, ap->a_aclp); + error = zfs_setsecattr(VTOZ(ap->a_vp), &vsecattr, 0, ap->a_cred); + kmem_free(aaclp, aclbsize); + + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct vop_aclcheck_args { + struct vnode *vp; + acl_type_t type; + struct acl *aclp; + struct ucred *cred; + struct thread *td; +}; +#endif + +static int +zfs_freebsd_aclcheck(struct vop_aclcheck_args *ap) +{ + + return (EOPNOTSUPP); +} + +static int +zfs_vptocnp(struct vop_vptocnp_args *ap) +{ + vnode_t *covered_vp; + vnode_t *vp = ap->a_vp; + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *zp = VTOZ(vp); + int ltype; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* + * If we are a snapshot mounted under .zfs, run the operation + * on the covered vnode. 
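+	 * Any other vnode resolves its own component name from the parent
+	 * directory below; only the root vnode of a snapshot falls
+	 * through to the covered vnode.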
+ */ + if (zp->z_id != zfsvfs->z_root || zfsvfs->z_parent == zfsvfs) { + char name[MAXNAMLEN + 1]; + znode_t *dzp; + size_t len; + + error = zfs_znode_parent_and_name(zp, &dzp, name); + if (error == 0) { + len = strlen(name); + if (*ap->a_buflen < len) + error = SET_ERROR(ENOMEM); + } + if (error == 0) { + *ap->a_buflen -= len; + bcopy(name, ap->a_buf + *ap->a_buflen, len); + *ap->a_vpp = ZTOV(dzp); + } + ZFS_EXIT(zfsvfs); + return (error); + } + ZFS_EXIT(zfsvfs); + + covered_vp = vp->v_mount->mnt_vnodecovered; +#if __FreeBSD_version >= 1300045 + enum vgetstate vs = vget_prep(covered_vp); +#else + vhold(covered_vp); +#endif + ltype = VOP_ISLOCKED(vp); + VOP_UNLOCK1(vp); +#if __FreeBSD_version >= 1300045 + error = vget_finish(covered_vp, LK_SHARED, vs); +#else + error = vget(covered_vp, LK_SHARED | LK_VNHELD, curthread); +#endif + if (error == 0) { + error = VOP_VPTOCNP(covered_vp, ap->a_vpp, ap->a_cred, + ap->a_buf, ap->a_buflen); + vput(covered_vp); + } + vn_lock(vp, ltype | LK_RETRY); + if (VN_IS_DOOMED(vp)) + error = SET_ERROR(ENOENT); + return (error); +} + +#ifdef DIAGNOSTIC +#ifndef _SYS_SYSPROTO_H_ +struct vop_lock1_args { + struct vnode *a_vp; + int a_flags; + char *file; + int line; +}; +#endif + +static int +zfs_lock(struct vop_lock1_args *ap) +{ + vnode_t *vp; + znode_t *zp; + int err; + +#if __FreeBSD_version >= 1300064 + err = vop_lock(ap); +#else + err = vop_stdlock(ap); +#endif + if (err == 0 && (ap->a_flags & LK_NOWAIT) == 0) { + vp = ap->a_vp; + zp = vp->v_data; + if (vp->v_mount != NULL && !VN_IS_DOOMED(vp) && + zp != NULL && (zp->z_pflags & ZFS_XATTR) == 0) + VERIFY(!RRM_LOCK_HELD(&zp->z_zfsvfs->z_teardown_lock)); + } + return (err); +} +#endif + +struct vop_vector zfs_vnodeops; +struct vop_vector zfs_fifoops; +struct vop_vector zfs_shareops; + +struct vop_vector zfs_vnodeops = { + .vop_default = &default_vnodeops, + .vop_inactive = zfs_freebsd_inactive, +#if __FreeBSD_version >= 1300042 + .vop_need_inactive = zfs_freebsd_need_inactive, +#endif + .vop_reclaim = zfs_freebsd_reclaim, +#if __FreeBSD_version >= 1300102 + .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, +#endif + .vop_access = zfs_freebsd_access, + .vop_allocate = VOP_EINVAL, + .vop_lookup = zfs_cache_lookup, + .vop_cachedlookup = zfs_freebsd_cachedlookup, + .vop_getattr = zfs_freebsd_getattr, + .vop_setattr = zfs_freebsd_setattr, + .vop_create = zfs_freebsd_create, + .vop_mknod = (vop_mknod_t *)zfs_freebsd_create, + .vop_mkdir = zfs_freebsd_mkdir, + .vop_readdir = zfs_freebsd_readdir, + .vop_fsync = zfs_freebsd_fsync, + .vop_open = zfs_freebsd_open, + .vop_close = zfs_freebsd_close, + .vop_rmdir = zfs_freebsd_rmdir, + .vop_ioctl = zfs_freebsd_ioctl, + .vop_link = zfs_freebsd_link, + .vop_symlink = zfs_freebsd_symlink, + .vop_readlink = zfs_freebsd_readlink, + .vop_read = zfs_freebsd_read, + .vop_write = zfs_freebsd_write, + .vop_remove = zfs_freebsd_remove, + .vop_rename = zfs_freebsd_rename, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_bmap = zfs_freebsd_bmap, + .vop_fid = zfs_freebsd_fid, + .vop_getextattr = zfs_getextattr, + .vop_deleteextattr = zfs_deleteextattr, + .vop_setextattr = zfs_setextattr, + .vop_listextattr = zfs_listextattr, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, + .vop_getpages = zfs_freebsd_getpages, + .vop_putpages = zfs_freebsd_putpages, + .vop_vptocnp = zfs_vptocnp, +#if __FreeBSD_version >= 1300064 +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#else + .vop_lock1 = vop_lock, +#endif + .vop_unlock = vop_unlock, + 
.vop_islocked = vop_islocked, +#else +#ifdef DIAGNOSTIC + .vop_lock1 = zfs_lock, +#endif +#endif +}; +VFS_VOP_VECTOR_REGISTER(zfs_vnodeops); + +struct vop_vector zfs_fifoops = { + .vop_default = &fifo_specops, + .vop_fsync = zfs_freebsd_fsync, +#if __FreeBSD_version >= 1300102 + .vop_fplookup_vexec = zfs_freebsd_fplookup_vexec, +#endif + .vop_access = zfs_freebsd_access, + .vop_getattr = zfs_freebsd_getattr, + .vop_inactive = zfs_freebsd_inactive, + .vop_read = VOP_PANIC, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_setattr = zfs_freebsd_setattr, + .vop_write = VOP_PANIC, + .vop_pathconf = zfs_freebsd_pathconf, + .vop_fid = zfs_freebsd_fid, + .vop_getacl = zfs_freebsd_getacl, + .vop_setacl = zfs_freebsd_setacl, + .vop_aclcheck = zfs_freebsd_aclcheck, +}; +VFS_VOP_VECTOR_REGISTER(zfs_fifoops); + +/* + * special share hidden files vnode operations template + */ +struct vop_vector zfs_shareops = { + .vop_default = &default_vnodeops, + .vop_access = zfs_freebsd_access, + .vop_inactive = zfs_freebsd_inactive, + .vop_reclaim = zfs_freebsd_reclaim, + .vop_fid = zfs_freebsd_fid, + .vop_pathconf = zfs_freebsd_pathconf, +}; +VFS_VOP_VECTOR_REGISTER(zfs_shareops); diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c new file mode 100644 index 000000000000..76e24b1bdf51 --- /dev/null +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -0,0 +1,2067 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2011 Martin Matuska */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* Used by fstat(1). */ +SYSCTL_INT(_debug_sizeof, OID_AUTO, znode, CTLFLAG_RD, + SYSCTL_NULL_INT_PTR, sizeof (znode_t), "sizeof(znode_t)"); + +/* + * Define ZNODE_STATS to turn on statistic gathering. By default, it is only + * turned on when DEBUG is also defined. 
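+ *
+ * (Editor's sketch with a hypothetical counter name: when ZNODE_STATS is
+ * defined, a hot path can gather cheap statistics with
+ *
+ *	static uint64_t znode_alloc_hits;
+ *	...
+ *	ZNODE_STAT_ADD(znode_alloc_hits);
+ *
+ * while the macro compiles away to nothing in non-debug builds.)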
+ */
+#ifdef ZFS_DEBUG
+#define	ZNODE_STATS
+#endif	/* ZFS_DEBUG */
+
+#ifdef ZNODE_STATS
+#define	ZNODE_STAT_ADD(stat)	((stat)++)
+#else
+#define	ZNODE_STAT_ADD(stat)	/* nothing */
+#endif	/* ZNODE_STATS */
+
+/*
+ * Functions needed for userland (i.e. libzpool) are not put under
+ * #ifdef _KERNEL; the rest of the functions have dependencies
+ * (such as VFS logic) that will not compile easily in userland.
+ */
+#ifdef _KERNEL
+/*
+ * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to
+ * be freed before it can be safely accessed.
+ */
+krwlock_t zfsvfs_lock;
+
+#if defined(_KERNEL) && !defined(KMEM_DEBUG) && \
+	__FreeBSD_version >= 1300102
+#define	_ZFS_USE_SMR
+static uma_zone_t znode_uma_zone;
+#else
+static kmem_cache_t *znode_cache = NULL;
+#endif
+
+extern struct vop_vector zfs_vnodeops;
+extern struct vop_vector zfs_fifoops;
+extern struct vop_vector zfs_shareops;
+
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+	znode_t *zp = arg;
+
+	/*
+	 * If in append mode, convert to writer and lock starting at the
+	 * current end of file.
+	 */
+	if (new->lr_type == RL_APPEND) {
+		new->lr_offset = zp->z_size;
+		new->lr_type = RL_WRITER;
+	}
+
+	/*
+	 * If we need to grow the block size then lock the whole file range.
+	 */
+	uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+	if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+	    zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+		new->lr_offset = 0;
+		new->lr_length = UINT64_MAX;
+	}
+}
+
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+	znode_t *zp = buf;
+
+	POINTER_INVALIDATE(&zp->z_zfsvfs);
+
+	list_link_init(&zp->z_link_node);
+
+	mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+	zp->z_acl_cached = NULL;
+	zp->z_vnode = NULL;
+	zp->z_moved = 0;
+	return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+	znode_t *zp = buf;
+
+	ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs));
+	ASSERT3P(zp->z_vnode, ==, NULL);
+	ASSERT(!list_link_active(&zp->z_link_node));
+	mutex_destroy(&zp->z_acl_lock);
+	zfs_rangelock_fini(&zp->z_rangelock);
+
+	ASSERT(zp->z_acl_cached == NULL);
+}
+
+
+#ifdef _ZFS_USE_SMR
+VFS_SMR_DECLARE;
+
+static int
+zfs_znode_cache_constructor_smr(void *mem, int size __unused, void *private,
+    int flags)
+{
+
+	return (zfs_znode_cache_constructor(mem, private, flags));
+}
+
+static void
+zfs_znode_cache_destructor_smr(void *mem, int size __unused, void *private)
+{
+
+	zfs_znode_cache_destructor(mem, private);
+}
+
+void
+zfs_znode_init(void)
+{
+	/*
+	 * Initialize zcache
+	 */
+	rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL);
+	ASSERT(znode_uma_zone == NULL);
+	znode_uma_zone = uma_zcreate("zfs_znode_cache",
+	    sizeof (znode_t), zfs_znode_cache_constructor_smr,
+	    zfs_znode_cache_destructor_smr, NULL, NULL, 0, 0);
+	VFS_SMR_ZONE_SET(znode_uma_zone);
+}
+
+static znode_t *
+zfs_znode_alloc_kmem(int flags)
+{
+
+	return (uma_zalloc_smr(znode_uma_zone, flags));
+}
+
+static void
+zfs_znode_free_kmem(znode_t *zp)
+{
+
+	uma_zfree_smr(znode_uma_zone, zp);
+}
+#else
+void
+zfs_znode_init(void)
+{
+	/*
+	 * Initialize zcache
+	 */
+	
rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); + ASSERT(znode_cache == NULL); + znode_cache = kmem_cache_create("zfs_znode_cache", + sizeof (znode_t), 0, zfs_znode_cache_constructor, + zfs_znode_cache_destructor, NULL, NULL, NULL, 0); +} + +static znode_t * +zfs_znode_alloc_kmem(int flags) +{ + + return (kmem_cache_alloc(znode_cache, flags)); +} + +static void +zfs_znode_free_kmem(znode_t *zp) +{ + + kmem_cache_free(znode_cache, zp); +} +#endif + +void +zfs_znode_fini(void) +{ + /* + * Cleanup zcache + */ +#ifdef _ZFS_USE_SMR + if (znode_uma_zone) { + uma_zdestroy(znode_uma_zone); + znode_uma_zone = NULL; + } +#else + if (znode_cache) { + kmem_cache_destroy(znode_cache); + znode_cache = NULL; + } +#endif + rw_destroy(&zfsvfs_lock); +} + + +static int +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + znode_t *zp; + int error; + + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); + + sharezp = zfs_znode_alloc_kmem(KM_SLEEP); + ASSERT(!POINTER_IS_VALID(sharezp->z_zfsvfs)); + sharezp->z_moved = 0; + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; + sharezp->z_is_sa = zfsvfs->z_use_sa; + + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, sharezp); + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + sa_handle_destroy(sharezp->z_sa_hdl); + zfs_znode_free_kmem(sharezp); + + return (error); +} + +/* + * define a couple of values we need available + * for both 64 and 32 bit environments. + */ +#ifndef NBITSMINOR64 +#define NBITSMINOR64 32 +#endif +#ifndef MAXMAJ64 +#define MAXMAJ64 0xffffffffUL +#endif +#ifndef MAXMIN64 +#define MAXMIN64 0xffffffffUL +#endif + +/* + * Create special expldev for ZFS private use. + * Can't use standard expldev since it doesn't do + * what we want. The standard expldev() takes a + * dev32_t in LP64 and expands it to a long dev_t. + * We need an interface that takes a dev32_t in ILP32 + * and expands it to a long dev_t. + */ +static uint64_t +zfs_expldev(dev_t dev) +{ + return (((uint64_t)major(dev) << NBITSMINOR64) | minor(dev)); +} +/* + * Special cmpldev for ZFS private use. + * Can't use standard cmpldev since it takes + * a long dev_t and compresses it to dev32_t in + * LP64. We need to do a compaction of a long dev_t + * to a dev32_t in ILP32. + */ +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (makedev((dev >> NBITSMINOR64), (dev & MAXMIN64))); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) +{ + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs) || (zfsvfs == zp->z_zfsvfs)); + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zfsvfs, zp->z_id))); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + /* + * Slap on VROOT if we are the root znode unless we are the root + * node of a snapshot mounted under .zfs. 
+ */ + if (zp->z_id == zfsvfs->z_root && zfsvfs->z_parent == zfsvfs) + ZTOV(zp)->v_flag |= VROOT; + + vn_exists(ZTOV(zp)); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(MUTEX_HELD(ZFS_OBJ_MUTEX(zp->z_zfsvfs, zp->z_id)) || + zp->z_unlinked || + RW_WRITE_HELD(&zp->z_zfsvfs->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +static void +zfs_vnode_forget(vnode_t *vp) +{ + + /* copied from insmntque_stddtr */ + vp->v_data = NULL; + vp->v_op = &dead_vnodeops; + vgone(vp); + vput(vp); +} + +/* + * Construct a new znode/vnode and initialize. + * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + vnode_t *vp; + uint64_t mode; + uint64_t parent; +#ifdef notyet + uint64_t mtime[2], ctime[2]; +#endif + uint64_t projid = ZFS_DEFAULT_PROJID; + sa_bulk_attr_t bulk[9]; + int count = 0; + int error; + + zp = zfs_znode_alloc_kmem(KM_SLEEP); + +#ifndef _ZFS_USE_SMR + KASSERT((zfsvfs->z_parent->z_vfs->mnt_kern_flag & MNTK_FPLOOKUP) == 0, + ("%s: fast path lookup enabled without smr", __func__)); +#endif + +#if __FreeBSD_version >= 1300076 + KASSERT(curthread->td_vp_reserved != NULL, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); +#else + KASSERT(curthread->td_vp_reserv > 0, + ("zfs_znode_alloc: getnewvnode without any vnodes reserved")); +#endif + error = getnewvnode("zfs", zfsvfs->z_parent->z_vfs, &zfs_vnodeops, &vp); + if (error != 0) { + zfs_znode_free_kmem(zp); + return (NULL); + } + zp->z_vnode = vp; + vp->v_data = zp; + + ASSERT(!POINTER_IS_VALID(zp->z_zfsvfs)); + zp->z_moved = 0; + + /* + * Defer setting z_zfsvfs until the znode is ready to be a candidate for + * the zfs_znode_move() callback. 
+ */ + zp->z_sa_hdl = NULL; + zp->z_unlinked = 0; + zp->z_atime_dirty = 0; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + vp = ZTOV(zp); + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &zp->z_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, 16); +#ifdef notyet + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); +#endif + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, 8); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || zp->z_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zfs_vnode_forget(vp); + zp->z_vnode = NULL; + zfs_znode_free_kmem(zp); + return (NULL); + } + + zp->z_projid = projid; + zp->z_mode = mode; + + /* Cache the xattr parent id */ + if (zp->z_pflags & ZFS_XATTR) + zp->z_xattr_parent = parent; + + vp->v_type = IFTOVT((mode_t)mode); + + switch (vp->v_type) { + case VDIR: + zp->z_zn_prefetch = B_TRUE; /* z_prefetch default is enabled */ + break; + case VFIFO: + vp->v_op = &zfs_fifoops; + break; + case VREG: + if (parent == zfsvfs->z_shares_dir) { + ASSERT(zp->z_uid == 0 && zp->z_gid == 0); + vp->v_op = &zfs_shareops; + } + break; + default: + break; + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + /* + * Everything else must be valid before assigning z_zfsvfs makes the + * znode eligible for zfs_znode_move(). + */ + zp->z_zfsvfs = zfsvfs; + mutex_exit(&zfsvfs->z_znodes_lock); + + /* + * Acquire vnode lock before making it available to the world. + */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); + VN_LOCK_AREC(vp); + if (vp->v_type != VFIFO) + VN_LOCK_ASHARE(vp); + + return (zp); +} + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_XATTR - new object is an attribute + * bonuslen - length of bonus buffer + * setaclp - File/Dir initial ACL + * fuidp - Tracks fuid allocation. 
+ * + * OUT: zpp - allocated znode + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t dzp_pflags = 0; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + dmu_buf_t *db; + timestruc_t now; + uint64_t gen, obj; + int err; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + + ASSERT(vap && ((vap->va_mask & AT_MODE) == AT_MODE)); + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + vfs_timestamp(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (vap->va_type == VDIR) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); + VERIFY(0 == sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } else { + dzp_pflags = dzp->z_pflags; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (vap->va_type == VDIR) { + size = 2; /* contents ("." and "..") */ + links = (flag & (IS_ROOT_NODE | IS_XATTR)) ? 2 : 1; + } else { + size = links = 0; + } + + if (vap->va_type == VBLK || vap->va_type == VCHR) { + rdev = zfs_expldev(vap->va_rdev); + } + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + /* + * No execs denied will be determined when zfs_mode_compute() is called. 
+ */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & AT_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & AT_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } + if (obj_type == DMU_OT_ZNODE || + (vap->va_type == VBLK || vap->va_type == VCHR)) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + 
acl_ids->z_aclp->z_acl_bytes);
+		mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags,
+		    acl_ids->z_fuid, acl_ids->z_fgid);
+	}
+
+	VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0);
+
+	if (!(flag & IS_ROOT_NODE)) {
+		*zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl);
+		ASSERT(*zpp != NULL);
+	} else {
+		/*
+		 * If we are creating the root node, the "parent" we
+		 * passed in is the znode for the root.
+		 */
+		*zpp = dzp;
+
+		(*zpp)->z_sa_hdl = sa_hdl;
+	}
+
+	(*zpp)->z_pflags = pflags;
+	(*zpp)->z_mode = mode;
+	(*zpp)->z_dnodesize = dnodesize;
+
+	if (vap->va_mask & AT_XVATTR)
+		zfs_xvattr_set(*zpp, (xvattr_t *)vap, tx);
+
+	if (obj_type == DMU_OT_ZNODE ||
+	    acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) {
+		VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx));
+	}
+	if (!(flag & IS_ROOT_NODE)) {
+		vnode_t *vp;
+
+		vp = ZTOV(*zpp);
+		vp->v_vflag |= VV_FORCEINSMQ;
+		err = insmntque(vp, zfsvfs->z_vfs);
+		vp->v_vflag &= ~VV_FORCEINSMQ;
+		KASSERT(err == 0, ("insmntque() failed: error %d", err));
+	}
+	kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+}
+
+/*
+ * Update in-core attributes. It is assumed the caller will be doing an
+ * sa_bulk_update to push the changes out.
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+	xoptattr_t *xoap;
+
+	xoap = xva_getxoptattr(xvap);
+	ASSERT(xoap);
+
+	ASSERT_VOP_IN_SEQC(ZTOV(zp));
+
+	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+		uint64_t times[2];
+		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(zp->z_zfsvfs),
+		    &times, sizeof (times), tx);
+		XVA_SET_RTN(xvap, XAT_CREATETIME);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_READONLY);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_HIDDEN);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SYSTEM);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_ARCHIVE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NOUNLINK);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_APPENDONLY);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NODUMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OPAQUE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+		zfs_sa_set_scanstamp(zp, xvap, tx);
+		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_REPARSE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OFFLINE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SPARSE);
+	}
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	znode_t *zp;
+	vnode_t *vp;
+	sa_handle_t *hdl;
+	struct thread *td;
+	int locked;
+	int err;
+
+	td = curthread;
+	getnewvnode_reserve_();
+again:
+	*zpp = NULL;
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num);
+
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		getnewvnode_drop_reserve();
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+	    doi.doi_bonus_size < sizeof (znode_phys_t)))) {
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		getnewvnode_drop_reserve();
+		return (SET_ERROR(EINVAL));
+	}
+
+	hdl = dmu_buf_get_user(db);
+	if (hdl != NULL) {
+		zp = sa_get_userdata(hdl);
+
+		/*
+		 * Since "SA" does immediate eviction we
+		 * should never find a sa handle that doesn't
+		 * know about the znode.
+		 */
+		ASSERT3P(zp, !=, NULL);
+		ASSERT3U(zp->z_id, ==, obj_num);
+		if (zp->z_unlinked) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			vp = ZTOV(zp);
+			/*
+			 * Don't let the vnode disappear after
+			 * ZFS_OBJ_HOLD_EXIT.
+			 */
+			VN_HOLD(vp);
+			*zpp = zp;
+			err = 0;
+		}
+
+		sa_buf_rele(db, NULL);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+		if (err) {
+			getnewvnode_drop_reserve();
+			return (err);
+		}
+
+		locked = VOP_ISLOCKED(vp);
+		VI_LOCK(vp);
+		if (VN_IS_DOOMED(vp) && locked != LK_EXCLUSIVE) {
+			/*
+			 * The vnode is doomed and this thread doesn't
+			 * hold the exclusive lock on it, so the vnode
+			 * must be being reclaimed by another thread.
+			 * Otherwise the doomed vnode is being reclaimed
+			 * by this thread and zfs_zget is called from
+			 * ZIL internals.
+			 */
+			VI_UNLOCK(vp);
+
+			/*
+			 * XXX vrele() locks the vnode when the last reference
+			 * is dropped.  Although in this case the vnode is
+			 * doomed / dead and so no inactivation is required,
+			 * the vnode lock is still acquired.  That could result
+			 * in a LOR with z_teardown_lock if another thread holds
+			 * the vnode's lock and tries to take z_teardown_lock.
+			 * But that is only possible if the other thread
+			 * performs a ZFS vnode operation on the vnode.  That
+			 * either should not happen if the vnode is dead or the
+			 * thread should also have a reference to the vnode and
+			 * thus our reference is not last.
+			 */
+			VN_RELE(vp);
+			goto again;
+		}
+		VI_UNLOCK(vp);
+		getnewvnode_drop_reserve();
+		return (err);
+	}
+
+	/*
+	 * Not found; create a new znode/vnode, but only if the file exists.
+	 *
+	 * There is a small window where zfs_vget() could
+	 * find this object while a file create is still in
+	 * progress.  This is checked for in zfs_znode_alloc().
+	 *
+	 * If zfs_znode_alloc() fails it will drop the hold on the
+	 * bonus buffer.
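+	 *
+	 * (Editor's sketch of that window, schematic timeline:
+	 *
+	 *	creating thread			this thread
+	 *	dmu_object_alloc()
+	 *					sa_buf_hold() succeeds
+	 *					zfs_znode_alloc():
+	 *					  sa_bulk_lookup() fails or
+	 *					  z_gen == 0 -> return NULL
+	 *	SA attributes written
+	 *
+	 * which is why a NULL return below is mapped to ENOENT.)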
+ */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + if (err == 0) { + vnode_t *vp = ZTOV(zp); + + err = insmntque(vp, zfsvfs->z_vfs); + if (err == 0) { + vp->v_hash = obj_num; + VOP_UNLOCK1(vp); + } else { + zp->z_vnode = NULL; + zfs_znode_dmu_fini(zp); + zfs_znode_free(zp); + *zpp = NULL; + } + } + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + getnewvnode_drop_reserve(); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + dmu_object_info_t doi; + dmu_buf_t *db; + vnode_t *vp; + uint64_t obj_num = zp->z_id; + uint64_t mode, size; + sa_bulk_attr_t bulk[8]; + int err; + int count = 0; + uint64_t gen; + + /* + * Remove cached pages before reloading the znode, so that they are not + * lingering after we run into any error. Ideally, we should vgone() + * the vnode in case of error, but currently we cannot do that + * because of the LOR between the vnode lock and z_teardown_lock. + * So, instead, we have to "doom" the znode in the illumos style. + */ + vp = ZTOV(zp); + vn_pages_remove(vp, 0, 0); + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + mutex_exit(&zp->z_acl_lock); + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + size = zp->z_size; + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &zp->z_links, sizeof (zp->z_links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &zp->z_atime, sizeof (zp->z_atime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &zp->z_uid, sizeof (zp->z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &zp->z_gid, sizeof (zp->z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + zp->z_mode = mode; + + if (gen != zp->z_gen) { + zfs_znode_dmu_fini(zp); + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); + return (SET_ERROR(EIO)); + } + + /* + * It is highly improbable but still quite possible that two + * objects in different datasets are created with the same + * object numbers and in transaction groups with the same + * numbers. znodes corresponding to those objects would + * have the same z_id and z_gen, but their other attributes + * may be different. + * zfs recv -F may replace one of such objects with the other. + * As a result file properties recorded in the replaced + * object's vnode may no longer match the received object's + * properties. 
At present the only cached property is the
+	 * file's type recorded in v_type.
+	 * So, handle this case by leaving the old vnode and znode
+	 * disassociated from the actual object.  A new vnode and a
+	 * znode will be created if the object is accessed
+	 * (e.g. via a look-up).  The old vnode and znode will be
+	 * recycled when the last vnode reference is dropped.
+	 */
+	if (vp->v_type != IFTOVT((mode_t)zp->z_mode)) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * If the file has zero links, then it has been unlinked on the send
+	 * side and it must be in the received unlinked set.
+	 * We call zfs_znode_dmu_fini() now to prevent any accesses to the
+	 * stale data and to prevent automatic removal of the file in
+	 * zfs_zinactive().  The file will be removed either when it is removed
+	 * on the send side and the next incremental stream is received or
+	 * when the unlinked set gets processed.
+	 */
+	zp->z_unlinked = (zp->z_links == 0);
+	if (zp->z_unlinked) {
+		zfs_znode_dmu_fini(zp);
+		ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+		return (0);
+	}
+
+	zp->z_blksz = doi.doi_data_block_size;
+	if (zp->z_size != size)
+		vnode_pager_setsize(vp, zp->z_size);
+
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num);
+
+	return (0);
+}
+
+void
+zfs_znode_delete(znode_t *zp, dmu_tx_t *tx)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	objset_t *os = zfsvfs->z_os;
+	uint64_t obj = zp->z_id;
+	uint64_t acl_obj = zfs_external_acl(zp);
+
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, obj);
+	if (acl_obj) {
+		VERIFY(!zp->z_is_sa);
+		VERIFY(0 == dmu_object_free(os, acl_obj, tx));
+	}
+	VERIFY(0 == dmu_object_free(os, obj, tx));
+	zfs_znode_dmu_fini(zp);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, obj);
+	zfs_znode_free(zp);
+}
+
+void
+zfs_zinactive(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	uint64_t z_id = zp->z_id;
+
+	ASSERT(zp->z_sa_hdl);
+
+	/*
+	 * Don't allow a zfs_zget() while we're trying to release this znode.
+	 */
+	ZFS_OBJ_HOLD_ENTER(zfsvfs, z_id);
+
+	/*
+	 * If this was the last reference to a file with no links, remove
+	 * the file from the file system unless the file system is mounted
+	 * read-only.  That can happen, for example, if the file system was
+	 * originally read-write, the file was opened, then unlinked and
+	 * the file system was made read-only before the file was finally
+	 * closed.  The file will remain in the unlinked set.
+	 */
+	if (zp->z_unlinked) {
+		ASSERT(!zfsvfs->z_issnap);
+		if ((zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) == 0) {
+			ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+			zfs_rmnode(zp);
+			return;
+		}
+	}
+
+	zfs_znode_dmu_fini(zp);
+	ZFS_OBJ_HOLD_EXIT(zfsvfs, z_id);
+	zfs_znode_free(zp);
+}
+
+void
+zfs_znode_free(znode_t *zp)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+
+	ASSERT(zp->z_sa_hdl == NULL);
+	zp->z_vnode = NULL;
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	POINTER_INVALIDATE(&zp->z_zfsvfs);
+	list_remove(&zfsvfs->z_all_znodes, zp);
+	zfsvfs->z_nr_znodes--;
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	if (zp->z_acl_cached) {
+		zfs_acl_free(zp->z_acl_cached);
+		zp->z_acl_cached = NULL;
+	}
+
+	zfs_znode_free_kmem(zp);
+}
+
+void
+zfs_tstamp_update_setup_ext(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2], boolean_t have_tx)
+{
+	timestruc_t now;
+
+	vfs_timestamp(&now);
+
+	if (have_tx) {	/* will sa_bulk_update happen really soon? */
+		zp->z_atime_dirty = 0;
+		zp->z_seq++;
+	} else {
+		zp->z_atime_dirty = 1;
+	}
+
+	if (flag & AT_ATIME) {
+		ZFS_TIME_ENCODE(&now, zp->z_atime);
+	}
+
+	if (flag & AT_MTIME) {
+		ZFS_TIME_ENCODE(&now, mtime);
+		if (zp->z_zfsvfs->z_use_fuids) {
+			zp->z_pflags |= (ZFS_ARCHIVE |
+			    ZFS_AV_MODIFIED);
+		}
+	}
+
+	if (flag & AT_CTIME) {
+		ZFS_TIME_ENCODE(&now, ctime);
+		if (zp->z_zfsvfs->z_use_fuids)
+			zp->z_pflags |= ZFS_ARCHIVE;
+	}
+}
+
+
+void
+zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2],
+    uint64_t ctime[2])
+{
+	zfs_tstamp_update_setup_ext(zp, flag, mtime, ctime, B_TRUE);
+}
+/*
+ * Grow the block size for a file.
+ *
+ *	IN:	zp	- znode of file whose block size is to be grown.
+ *		size	- requested block size
+ *		tx	- open transaction.
+ *
+ * NOTE: this function assumes that the znode is write locked.
+ */
+void
+zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx)
+{
+	int error;
+	u_longlong_t dummy;
+
+	if (size <= zp->z_blksz)
+		return;
+	/*
+	 * If the file size is already greater than the current blocksize,
+	 * we will not grow.  If there is more than one block in a file,
+	 * the blocksize cannot change.
+	 */
+	if (zp->z_blksz && zp->z_size > zp->z_blksz)
+		return;
+
+	error = dmu_object_set_blocksize(zp->z_zfsvfs->z_os, zp->z_id,
+	    size, 0, tx);
+
+	if (error == ENOTSUP)
+		return;
+	ASSERT0(error);
+
+	/* What blocksize did we actually get? */
+	dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy);
+}
+
+/*
+ * Increase the file length
+ *
+ *	IN:	zp	- znode of file to extend.
+ *		end	- new end-of-file
+ *
+ *	RETURN:	0 on success, error code on failure
+ */
+static int
+zfs_extend(znode_t *zp, uint64_t end)
+{
+	zfsvfs_t *zfsvfs = zp->z_zfsvfs;
+	dmu_tx_t *tx;
+	zfs_locked_range_t *lr;
+	uint64_t newblksz;
+	int error;
+
+	/*
+	 * We will change zp_size, lock the whole file.
+	 */
+	lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER);
+
+	/*
+	 * Nothing to do if file already at desired length.
+	 */
+	if (end <= zp->z_size) {
+		zfs_rangelock_exit(lr);
+		return (0);
+	}
+	tx = dmu_tx_create(zfsvfs->z_os);
+	dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+	zfs_sa_upgrade_txholds(tx, zp);
+	if (end > zp->z_blksz &&
+	    (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) {
+		/*
+		 * We are growing the file past the current block size.
+		 */
+		if (zp->z_blksz > zp->z_zfsvfs->z_max_blksz) {
+			/*
+			 * File's blocksize is already larger than the
+			 * "recordsize" property.  Only let it grow to
+			 * the next power of 2.
+			 */
+			ASSERT(!ISP2(zp->z_blksz));
+			newblksz = MIN(end, 1 << highbit64(zp->z_blksz));
+		} else {
+			newblksz = MIN(end, zp->z_zfsvfs->z_max_blksz);
+		}
+		dmu_tx_hold_write(tx, zp->z_id, 0, newblksz);
+	} else {
+		newblksz = 0;
+	}
+
+	error = dmu_tx_assign(tx, TXG_WAIT);
+	if (error) {
+		dmu_tx_abort(tx);
+		zfs_rangelock_exit(lr);
+		return (error);
+	}
+
+	if (newblksz)
+		zfs_grow_blocksize(zp, newblksz, tx);
+
+	zp->z_size = end;
+
+	VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zp->z_zfsvfs),
+	    &zp->z_size, sizeof (zp->z_size), tx));
+
+	vnode_pager_setsize(ZTOV(zp), end);
+
+	zfs_rangelock_exit(lr);
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * Free space in a file.
+ *
+ *	IN:	zp	- znode of file to free data in.
+ *		off	- start of section to free.
+ *		len	- length of section to free.
+ * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + if (error == 0) { + /* + * In FreeBSD we cannot free block in the middle of a file, + * but only at the end of a file, so this code path should + * never happen. + */ + vnode_pager_setsize(ZTOV(zp), off); + } + + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + vnode_t *vp = ZTOV(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + + /* + * Clear any mapped pages in the truncated region. This has to + * happen outside of the transaction to avoid the possibility of + * a deadlock with someone trying to push a page that we are + * about to invalidate. + */ + vnode_pager_setsize(vp, end); + + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. + * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. 
+ * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + else + return (error); + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + return (error); +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + return (0); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int error; + int i; + znode_t *rootzp = NULL; + zfsvfs_t *zfsvfs; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. + */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. 
Create minimal znode/vnode/zfsvfs + * to allow zfs_mknode to work. + */ + VATTR_NULL(&vattr); + vattr.va_mask = AT_MODE|AT_UID|AT_GID; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + rootzp = zfs_znode_alloc_kmem(KM_SLEEP); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); + rootzp->z_moved = 0; + rootzp->z_unlinked = 0; + rootzp->z_atime_dirty = 0; + rootzp->z_is_sa = USE_SA(version, os); + + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + rootzp->z_zfsvfs = zfsvfs; + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + POINTER_INVALIDATE(&rootzp->z_zfsvfs); + + sa_handle_destroy(rootzp->z_sa_hdl); + zfs_znode_free_kmem(rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +static void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. 
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    uint64_t *pobjp, int *is_xattrdir)
+{
+	uint64_t parent;
+	uint64_t pflags;
+	uint64_t mode;
+	uint64_t parent_mode;
+	sa_bulk_attr_t bulk[3];
+	sa_handle_t *sa_hdl;
+	dmu_buf_t *sa_db;
+	int count = 0;
+	int error;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+	    &parent, sizeof (parent));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+	    &pflags, sizeof (pflags));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &mode, sizeof (mode));
+
+	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+		return (error);
+
+	/*
+	 * When a link is removed its parent pointer is not changed and will
+	 * be invalid.  There are two cases where a link is removed but the
+	 * file stays around: when it goes to the delete queue and when there
+	 * are additional links.
+	 */
+	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+	/*
+	 * Extended attributes can be applied to files, directories, etc.
+	 * Otherwise the parent must be a directory.
+	 */
+	if (!*is_xattrdir && !S_ISDIR(parent_mode))
+		return (SET_ERROR(EINVAL));
+
+	*pobjp = parent;
+
+	return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    zfs_stat_t *sb)
+{
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &sb->zs_mode, sizeof (sb->zs_mode));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+	    &sb->zs_gen, sizeof (sb->zs_gen));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+	    &sb->zs_links, sizeof (sb->zs_links));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+	    &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+	return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+    sa_attr_type_t *sa_table, char *buf, int len)
+{
+	sa_handle_t *sa_hdl;
+	sa_handle_t *prevhdl = NULL;
+	dmu_buf_t *prevdb = NULL;
+	dmu_buf_t *sa_db = NULL;
+	char *path = buf + len - 1;
+	int error;
+
+	*path = '\0';
+	sa_hdl = hdl;
+
+	uint64_t deleteq_obj;
+	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+	error = zap_lookup_int(osp, deleteq_obj, obj);
+	if (error == 0) {
+		return (ESTALE);
+	} else if (error != ENOENT) {
+		return (error);
+	}
+	error = 0;
+
+	for (;;) {
+		uint64_t pobj;
+		char component[MAXNAMELEN + 2];
+		size_t complen;
+		int is_xattrdir;
+
+		if (prevdb)
+			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+		    &is_xattrdir)) != 0)
+			break;
+
+		if (pobj == obj) {
+			if (path[0] != '/')
+				*--path = '/';
+			break;
+		}
+
+		component[0] = '/';
+		if (is_xattrdir) {
+			(void) sprintf(component + 1, "<xattrdir>");
+		} else {
+			error = zap_value_search(osp, pobj, obj,
+			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
+			if (error != 0)
+				break;
+		}
+
+		complen = strlen(component);
+		path -= complen;
+		ASSERT(path >= buf);
+		bcopy(component, path, complen);
+		obj = pobj;
+
+		if (sa_hdl != hdl) {
+			prevhdl = sa_hdl;
+			prevdb = sa_db;
+		}
+		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+		if (error != 0) {
+			sa_hdl =
prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +#ifdef _KERNEL +int +zfs_znode_parent_and_name(znode_t *zp, znode_t **dzpp, char *buf) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + uint64_t parent; + int is_xattrdir; + int err; + + /* Extended attributes should not be visible as regular files. */ + if ((zp->z_pflags & ZFS_XATTR) != 0) + return (SET_ERROR(EINVAL)); + + err = zfs_obj_to_pobj(zfsvfs->z_os, zp->z_sa_hdl, zfsvfs->z_attr_table, + &parent, &is_xattrdir); + if (err != 0) + return (err); + ASSERT0(is_xattrdir); + + /* No name as this is a root object. */ + if (parent == zp->z_id) + return (SET_ERROR(EINVAL)); + + err = zap_value_search(zfsvfs->z_os, parent, zp->z_id, + ZFS_DIRENT_OBJ(-1ULL), buf); + if (err != 0) + return (err); + err = zfs_zget(zfsvfs, parent, dzpp); + return (err); +} +#endif /* _KERNEL */ diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c new file mode 100644 index 000000000000..d89ef80edd66 --- /dev/null +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -0,0 +1,1882 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. 
+ *
+ * Initialization Vector (IV):
+ * An initialization vector for the encryption algorithms. This is used to
+ * "tweak" the encryption algorithms so that two blocks of the same data are
+ * encrypted into different ciphertext outputs, thus obfuscating block patterns.
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is
+ * never reused with the same encryption key. This value is stored unencrypted
+ * and must simply be provided to the decryption function. We use a 96 bit IV
+ * (as recommended by NIST) for all block encryption. For non-dedup blocks we
+ * derive the IV randomly. The first 64 bits of the IV are stored in the second
+ * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of
+ * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits
+ * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count
+ * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of
+ * level 0 blocks is the number of allocated dnodes in that block. The on-disk
+ * format supports at most 2^15 slots per L0 dnode block, because the maximum
+ * block size is 16MB (2^24). In either case, for level 0 blocks this number
+ * will still be smaller than UINT32_MAX so it is safe to store the IV in the
+ * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count
+ * for the dnode code.
+ *
+ * Master key:
+ * This is the most important secret data of an encrypted dataset. It is used
+ * along with the salt to generate the actual encryption keys via HKDF. We
+ * do not use the master key to directly encrypt any data because there are
+ * theoretical limits on how much data can actually be safely encrypted with
+ * any encryption mode. The master key is stored encrypted on disk with the
+ * user's wrapping key. Its length is determined by the encryption algorithm.
+ * For details on how this is stored see the block comment in dsl_crypt.c.
+ *
+ * Salt:
+ * Used as an input to the HKDF function, along with the master key. We use a
+ * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt
+ * can be used for encrypting many blocks, so we cache the current salt and the
+ * associated derived key in zio_crypt_t so we do not need to derive it again
+ * needlessly.
+ *
+ * Encryption Key:
+ * A secret binary key, generated from an HKDF function used to encrypt and
+ * decrypt data.
+ *
+ * Message Authentication Code (MAC)
+ * The MAC is an output of authenticated encryption modes such as AES-GCM and
+ * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted
+ * data on disk and return garbage to the application. Effectively, it is a
+ * checksum that cannot be reproduced by an attacker. We store the MAC in the
+ * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated
+ * regular checksum of the ciphertext which can be used for scrubbing.
+ *
+ * OBJECT AUTHENTICATION:
+ * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because
+ * they contain some info that always needs to be readable. To prevent this
+ * data from being altered, we authenticate this data using SHA512-HMAC. This
+ * will produce a MAC (similar to the one produced via encryption) which can
+ * be used to verify the object was not modified. HMACs do not require key
+ * rotation or IVs, so we can keep up to the full 3 copies of authenticated
+ * data.
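+ *
+ * (Editor's summary of the unencrypted block pointer real estate described
+ * above; schematic, not actual blkptr_t field names:
+ *
+ *	salt		-> DVA[2], first word		(64 bits)
+ *	IV[63:0]	-> DVA[2], second word		(64 bits)
+ *	IV[95:64]	-> blk_fill, upper 32 bits
+ *	MAC		-> blk_cksum, second 128 bits
+ *
+ * leaving the first 128 bits of blk_cksum for the truncated ciphertext
+ * checksum used for scrubbing.)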
+ * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. In addition, ZIL blocks have some pieces that must be left in plaintext + * for claiming even though all of the sensitive user data still needs to be + * encrypted. The function zio_crypt_init_uios_zil() handles parsing which + * pieces of the block need to be encrypted. All data that is not encrypted is + * authenticated using the AAD mechanisms that the supported encryption modes + * provide for. In order to preserve the semantics of the ZIL for encrypted + * datasets, the ZIL is not protected at the objset level as described below. + * + * DNODE ENCRYPTION: + * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be left + * in plaintext for scrubbing and claiming, but the bonus buffers might contain + * sensitive user data. The function zio_crypt_init_uios_dnode() handles parsing + * which pieces of the block need to be encrypted. For more details about + * dnode authentication and encryption, see zio_crypt_init_uios_dnode(). + * + * OBJECT SET AUTHENTICATION: + * Up to this point, everything we have encrypted and authenticated has been + * at level 0 (or -2 for the ZIL). If we did not do any further work the + * on-disk format would be susceptible to attacks that deleted or rearranged + * the order of level 0 blocks. Ideally, the cleanest solution would be to + * maintain a tree of authentication MACs going up the bp tree. However, this + * presents a problem for raw sends. Send files do not send information about + * indirect blocks so there would be no convenient way to transfer the MACs and + * they cannot be recalculated on the receive side without the master key which + * would defeat one of the purposes of raw sends in the first place. Instead, + * for the indirect levels of the bp tree, we use a regular SHA512 of the MACs + * from the level below. We also include some portable fields from blk_prop such + * as the lsize and compression algorithm to prevent the data from being + * misinterpreted. + * + * At the objset level, we maintain 2 separate 256 bit MACs in the + * objset_phys_t. The first one is "portable" and is the logical root of the + * MAC tree maintained in the metadnode's bps. The second is "local" and is + * used as the root MAC for the user accounting objects, which are also not + * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN payload + * of the send file. The useraccounting code ensures that the useraccounting + * info is not present upon a receive, so the local MAC can simply be cleared + * out at that time. For more info about objset_phys_t authentication, see + * zio_crypt_do_objset_hmacs(). + * + * CONSIDERATIONS FOR DEDUP: + * In order for dedup to work, blocks that we want to dedup with one another + * need to use the same IV and encryption key, so that they will have the same + * ciphertext. Normally, one should never reuse an IV with the same encryption + * key or else AES-GCM and AES-CCM can both actually leak the plaintext of both + * blocks. In this case, however, since we are using the same plaintext as + * well, all that we end up with is a duplicate of the original ciphertext we + * already had.
As a result, an attacker with read access to the raw disk will + * be able to tell which blocks are the same, but this information is given away + * by dedup anyway. In order to get the same IVs and encryption keys for + * equivalent blocks of data we use an HMAC of the plaintext. We use an HMAC + * here so that a reproducible checksum of the plaintext is never available to + * the attacker. The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +/* + * Set to a nonzero value to cause zio_do_crypt_uio() to fail one in this + * many calls, to test decryption error handling code paths.
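+ * For example, setting this to 1000 (e.g. from a debugger) should make
+ * roughly one out of every thousand decryption calls return an error.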
+ */ +uint64_t zio_decrypt_fail_fraction = 0; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +static void +zio_crypt_key_destroy_early(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + bzero(&key->zk_session, sizeof (key->zk_session)); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + freebsd_crypt_freesession(&key->zk_session); + zio_crypt_key_destroy_early(key); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech __unused; + uint_t keydata_len; + zio_crypt_info_t *ci = NULL; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for the ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ret = freebsd_crypt_newsession(&key->zk_session, ci, + &key->zk_current_key); + if (ret) + goto error; + + key->zk_crypt = crypt; + key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +static int +zio_crypt_key_change_salt(zio_crypt_key_t *key) +{ + int ret = 0; + uint8_t salt[ZIO_DATA_SALT_LEN]; + crypto_mechanism_t mech __unused; + + uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen; + + /* generate a new salt */ + ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + rw_enter(&key->zk_salt_lock, RW_WRITER); + + /* someone beat us to the salt
rotation, just unlock and return */ + if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES) + goto out_unlock; + + /* derive the current key from the master key and the new salt */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len); + if (ret != 0) + goto out_unlock; + + /* assign the salt and reset the usage count */ + bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN); + key->zk_salt_count = 0; + + freebsd_crypt_freesession(&key->zk_session); + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[key->zk_crypt], &key->zk_current_key); + if (ret != 0) + goto out_unlock; + + rw_exit(&key->zk_salt_lock); + + return (0); + +out_unlock: + rw_exit(&key->zk_salt_lock); +error: + return (ret); +} + +/* See comment above zfs_key_max_salt_uses definition for details */ +int +zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt) +{ + int ret; + boolean_t salt_change; + + rw_enter(&key->zk_salt_lock, RW_READER); + + bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN); + salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >= + ZFS_CURRENT_MAX_SALT_USES); + + rw_exit(&key->zk_salt_lock); + + if (salt_change) { + ret = zio_crypt_key_change_salt(key); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +void *failed_decrypt_buf; +int failed_decrypt_size; + +/* + * This function handles all encryption and decryption in zfs. When + * encrypting it expects puio to reference the plaintext and cuio to + * reference the ciphertext. cuio must have enough space for the + * ciphertext + room for a MAC. datalen should be the length of the + * plaintext / ciphertext alone. + */ +/* + * The implementation for FreeBSD's OpenCrypto. + * + * The big difference between ICP and FOC is that FOC uses a single + * buffer for input and output. This means that (for AES-GCM, the + * only one supported right now) the source must be copied into the + * destination, and the destination must have the AAD, and the tag/MAC, + * already associated with it. (Both implementations can use a uio.) + * + * Since the auth data is part of the iovec array, all we need to know + * is the length: 0 means there's no AAD. + * + */ +static int +zio_do_crypt_uio_opencrypto(boolean_t encrypt, freebsd_crypt_session_t *sess, + uint64_t crypt, crypto_key_t *key, uint8_t *ivbuf, uint_t datalen, + uio_t *uio, uint_t auth_len) +{ + zio_crypt_info_t *ci; + int ret; + + ci = &zio_crypt_table[crypt]; + if (ci->ci_crypt_type != ZC_TYPE_GCM && + ci->ci_crypt_type != ZC_TYPE_CCM) + return (ENOTSUP); + + + ret = freebsd_crypt_uio(encrypt, sess, ci, uio, key, ivbuf, + datalen, auth_len); + if (ret != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Returning error %s\n", + __FUNCTION__, __LINE__, encrypt ? "EIO" : "ECKSUM"); +#endif + ret = SET_ERROR(encrypt ? EIO : ECKSUM); + } + + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GCM at least) + * needs to logically go in front.
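+ *
+ * As a sketch of what the code below builds (the code itself is
+ * authoritative), the single wrap uio ends up as:
+ *
+ *	iovecs[0]	AAD: guid (plus crypt and version for current keys)
+ *	iovecs[1]	wrapped master key (keydata_out)
+ *	iovecs[2]	wrapped HMAC key (hmac_keydata_out)
+ *	iovecs[3]	MAC (mac)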
+ */ + uio_t cuio; + iovec_t iovecs[4]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* + * Since we only support one buffer, we need to copy + * the plain text (source) to the cipher buffer (dest). + * We set iovecs[0] -- the authentication data -- below. + */ + bcopy((void*)key->zk_master_keydata, keydata_out, keydata_len); + bcopy((void*)key->zk_hmac_keydata, hmac_keydata_out, + SHA512_HMAC_KEYLEN); + iovecs[1].iov_base = keydata_out; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = hmac_keydata_out; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. + */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + + cuio.uio_iov = iovecs; + cuio.uio_iovcnt = 4; + cuio.uio_segflg = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio_opencrypto(B_TRUE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + int ret; + uint64_t aad[3]; + /* + * With OpenCrypto in FreeBSD, the same buffer is used for + * input and output. Also, the AAD (for AES-GCM at least) + * needs to logically go in front. + */ + uio_t cuio; + iovec_t iovecs[4]; + void *src, *dst; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + /* + * Since we only support one buffer, we need to copy + * the encrypted buffer (source) to the plain buffer + * (dest). We set iovecs[0] -- the authentication data -- + * below.
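+ * The iovec layout mirrors the one built in zio_crypt_key_wrap()
+ * above, only with the direction of the crypto operation reversed.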
+ */ + dst = key->zk_master_keydata; + src = keydata; + + bcopy(src, dst, keydata_len); + + dst = key->zk_hmac_keydata; + src = hmac_keydata; + bcopy(src, dst, SHA512_HMAC_KEYLEN); + + iovecs[1].iov_base = key->zk_master_keydata; + iovecs[1].iov_len = keydata_len; + iovecs[2].iov_base = key->zk_hmac_keydata; + iovecs[2].iov_len = SHA512_HMAC_KEYLEN; + iovecs[3].iov_base = mac; + iovecs[3].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + iovecs[0].iov_base = aad; + iovecs[0].iov_len = aad_len; + + cuio.uio_iov = iovecs; + cuio.uio_iovcnt = 4; + cuio.uio_segflg = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio_opencrypto(B_FALSE, NULL, crypt, cwkey, + iv, enc_len, &cuio, aad_len); + + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + ret = freebsd_crypt_newsession(&key->zk_session, + &zio_crypt_table[crypt], &key->zk_current_key); + if (ret != 0) + goto error; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy_early(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + crypto_mac(&key->zk_hmac_key, data, datalen, + raw_digestbuf, SHA512_DIGEST_LENGTH); + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. 
However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. + */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. 
+ * As a result, we don't need to worry about byteswapping the MAC. + */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides which fields from blk_prop are included in + * the various on-disk MACs. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + int avoidlint = SPA_MINBLOCKSIZE; + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, avoidlint); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * cannot be reinterpreted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, avoidlint); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here.
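+ *
+ * Illustratively, what ends up being authenticated for each bp is the
+ * blkptr_auth_buf_t defined above: { LE_64(masked blk_prop), MAC, pad },
+ * with the trailing pad omitted for version 0 keys (see below).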
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + crypto_mac_update(ctx, &bab, bab_len); + + return (0); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + crypto_mac_update(ctx, adnp, sizeof (tmp_dncore)); + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACs for + * protecting the integrity of their data. The portable_mac protects the + * metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC.
As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC the little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +/* ARGSUSED */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + struct hmac_ctx hash_ctx; + struct hmac_ctx *ctx = &hash_ctx; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + + /* calculate the portable MAC from the portable fields and metadnode */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + crypto_mac_final(ctx, raw_portable_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user, group and project accounting. + * If these objects are not present, the local MAC is zeroed out. + */ + if ((datalen >= OBJSET_PHYS_SIZE_V3 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE && + osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || + (datalen >= OBJSET_PHYS_SIZE_V2 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || + (datalen <= OBJSET_PHYS_SIZE_V1)) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + crypto_mac_init(ctx, &key->zk_hmac_key); + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + /* CONSTCOND */ + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + crypto_mac_update(ctx, &intval, sizeof (uint64_t)); + + /* XXX check dnode type ...
*/ + /* add in fields from the user accounting dnodes */ + if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + } + + if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + } + + if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && + datalen >= OBJSET_PHYS_SIZE_V3) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_projectused_dnode); + if (ret) + goto error; + } + + crypto_mac_final(ctx, raw_local_mac, SHA512_DIGEST_LENGTH); + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ + if (uio->uio_iov) + kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. + */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) { +#ifdef FCRYPTO_DEBUG + printf("%s(%d): Setting ECKSUM\n", __FUNCTION__, __LINE__); +#endif + return (SET_ERROR(ECKSUM)); + } + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. + */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. 
+ * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +/* + * The OpenCrypto used in FreeBSD does not use separate source and + * destination buffers; instead, the same buffer is used. Further, to + * accommodate some of the drivers, the authbuf needs to be logically before + * the data. This means that we need to copy the source to the destination, + * and set up an extra iovec_t at the beginning to handle the authbuf. + * It also means we'll only return one uio_t, which we do via the clumsy + * ifdef in the function declaration. + */ + +/* ARGSUSED */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, + uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* + * We need at least two iovecs -- one for the AAD, + * one for the MAC. + */ + bcopy(src, dst, datalen); + nr_dst = 2; + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src = 0; + nr_dst += nr_iovecs; + + /* allocate the iovec arrays */ + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(src_iovecs, nr_src * sizeof (iovec_t)); + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(dst_iovecs, nr_dst * sizeof (iovec_t)); + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. 
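+ *
+ * To sketch what the loop below builds: the zil_chain_t header (minus
+ * the embedded checksum) and each lr_t record header are appended to
+ * the AAD, each record body gets an encrypted iovec, and for TX_WRITE
+ * records the trailing blkptr_t is instead copied in plaintext and
+ * authenticated.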
+ */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + /* The first one will contain the authbuf */ + nr_iovecs = 1; + + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + ASSERT3P(dst_iovecs, !=, NULL); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if it exists. If the bp does exist we want to + * authenticate it. + */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + dst_iovecs[nr_iovecs].iov_base = (char *)dlrp + + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_base = (char *) + dlrp + sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_base = (char *)dlrp + + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + dst_iovecs[0].iov_base = aadbuf; + dst_iovecs[0].iov_len = aad_len; + + out_uio->uio_iov = dst_iovecs; + out_uio->uio_iovcnt = nr_dst; + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + out_uio->uio_iov = NULL; + out_uio->uio_iovcnt = 0; + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks.
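+ * As with ZIL blocks, only parts of the block are encrypted: bonus
+ * buffers of encrypted object types get encrypted iovecs, while the
+ * core dnode fields and block pointers are copied in plaintext and
+ * authenticated (see the AAD handling below).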
+ */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t *puio, uio_t *out_uio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + bcopy(src, dst, datalen); + nr_dst = 2; + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. + */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src = 0; + nr_dst += nr_iovecs; + + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(src_iovecs, nr_src * sizeof (iovec_t)); + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(dst_iovecs, nr_dst * sizeof (iovec_t)); + } + + nr_iovecs = 1; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t.
The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). + */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_dst); + ASSERT3P(dst_iovecs, !=, NULL); + dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + dst_iovecs[0].iov_base = aadbuf; + dst_iovecs[0].iov_len = aad_len; + out_uio->uio_iov = dst_iovecs; + out_uio->uio_iovcnt = nr_dst; + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + out_uio->uio_iov = NULL; + out_uio->uio_iovcnt = 0; + + return (ret); +} + +/* ARGSUSED */ +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *out_uio, + uint_t *enc_len) +{ + int ret; + uint_t nr_plain = 1, nr_cipher = 2; + iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; + void *src, *dst; + + cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), + KM_SLEEP); + if (!cipher_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + bzero(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + } else { + src = cipherbuf; + dst = plainbuf; + } + bcopy(src, dst, datalen); + cipher_iovecs[0].iov_base = dst; + cipher_iovecs[0].iov_len = datalen; + + *enc_len = datalen; + out_uio->uio_iov = cipher_iovecs; + out_uio->uio_iovcnt = nr_cipher; + + return (0); + +error: + if (plain_iovecs != NULL) + kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); + if (cipher_iovecs != NULL) + kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + *enc_len = 0; + out_uio->uio_iov = NULL; + out_uio->uio_iovcnt = 0; + + return (ret); +} + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. 
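+ * In every case the last iovec of the ciphertext uio is reserved for
+ * the MAC, which is attached below after the type-specific handler
+ * returns.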
+ */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + iovec_t *mac_iov; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + /* populate the uios */ + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = (void *)mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + + return (0); + +error: + return (ret); +} + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + uint_t enc_len, auth_len; + uio_t puio, cuio; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + freebsd_crypt_session_t *tmpl = NULL; + uint8_t *authbuf = NULL; + + bzero(&puio, sizeof (uio_t)); + bzero(&cuio, sizeof (uio_t)); + +#ifdef FCRYPTO_DEBUG + printf("%s(%s, %p, %p, %d, %p, %p, %u, %s, %p, %p, %p)\n", + __FUNCTION__, + encrypt ? "encrypt" : "decrypt", + key, salt, ot, iv, mac, datalen, + byteswap ? "byteswap" : "native_endian", plainbuf, + cipherbuf, no_crypt); + + printf("\tkey = {"); + for (int i = 0; i < key->zk_current_key.ck_length/8; i++) + printf("%02x ", ((uint8_t *)key->zk_current_key.ck_data)[i]); + printf("}\n"); +#endif + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + if (ret != 0) + return (ret); + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t.
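+ * A sketch of the two cases handled below: if the given salt matches
+ * zk_salt we reuse zk_current_key along with the cached session
+ * template; otherwise we derive a one-off key with hkdf_sha512() from
+ * the master key and the given salt, and run without a template.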
+ */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = &key->zk_session; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + /* perform the encryption / decryption */ + ret = zio_do_crypt_uio_opencrypto(encrypt, tmpl, key->zk_crypt, + ckey, iv, enc_len, &cuio, auth_len); + if (ret != 0) + goto error; + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (0); + +error: + if (!encrypt) { + if (failed_decrypt_buf != NULL) + kmem_free(failed_decrypt_buf, failed_decrypt_size); + failed_decrypt_buf = kmem_alloc(datalen, KM_SLEEP); + failed_decrypt_size = datalen; + bcopy(cipherbuf, failed_decrypt_buf, datalen); + } + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + return (SET_ERROR(ret)); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (SET_ERROR(ret)); +} + +#if defined(_KERNEL) && defined(HAVE_SPL) +/* BEGIN CSTYLED */ +module_param(zfs_key_max_salt_uses, ulong, 0644); +MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " + "can be used for generating encryption keys before it is rotated"); +/* END CSTYLED */ +#endif diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c new file mode 100644 index 000000000000..113733a5c11a --- /dev/null +++ b/module/os/freebsd/zfs/zvol_os.c @@ -0,0 +1,1454 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * + * Copyright (c) 2006-2010 Pawel Jakub Dawidek + * All rights reserved. + * + * Portions Copyright 2010 Robert Milkowski + * + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + */ + +/* Portions Copyright 2011 Martin Matuska */ + +/* + * ZFS volume emulation driver. + * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev/zvol// + * + * Volumes are persistent through reboot. No user command needs to be + * run before opening and using a device. + * + * On FreeBSD ZVOLs are simply GEOM providers like any other storage device + * in the system. Except when they're simply character devices (volmode=dev). + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "zfs_namecheck.h" + +#define ZVOL_DUMPSIZE "dumpsize" + +#ifdef ZVOL_LOCK_DEBUG +#define ZVOL_RW_READER RW_WRITER +#define ZVOL_RW_READ_HELD RW_WRITE_HELD +#else +#define ZVOL_RW_READER RW_READER +#define ZVOL_RW_READ_HELD RW_READ_HELD +#endif + +enum zvol_geom_state { + ZVOL_GEOM_UNINIT, + ZVOL_GEOM_STOPPED, + ZVOL_GEOM_RUNNING, +}; + +struct zvol_state_os { + int zso_volmode; +#define zso_dev _zso_state._zso_dev +#define zso_geom _zso_state._zso_geom + union { + /* volmode=dev */ + struct zvol_state_dev { + struct cdev *zsd_cdev; + uint64_t zsd_sync_cnt; + } _zso_dev; + + /* volmode=geom */ + struct zvol_state_geom { + struct g_provider *zsg_provider; + struct bio_queue_head zsg_queue; + struct mtx zsg_queue_mtx; + enum zvol_geom_state zsg_state; + } _zso_geom; + } _zso_state; +}; + +static uint32_t zvol_minors; + +SYSCTL_DECL(_vfs_zfs); +SYSCTL_NODE(_vfs_zfs, OID_AUTO, vol, CTLFLAG_RW, 0, "ZFS VOLUME"); +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, mode, CTLFLAG_RWTUN, &zvol_volmode, 0, + "Expose as GEOM providers (1), device files (2) or neither"); +static boolean_t zpool_on_zvol = B_FALSE; +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, recursive, CTLFLAG_RWTUN, &zpool_on_zvol, 0, + "Allow zpools to use zvols as vdevs (DANGEROUS)"); + +/* + * Toggle unmap functionality. + */ +boolean_t zvol_unmap_enabled = B_TRUE; + +SYSCTL_INT(_vfs_zfs_vol, OID_AUTO, unmap_enabled, CTLFLAG_RWTUN, + &zvol_unmap_enabled, 0, "Enable UNMAP functionality"); + +/* + * zvol maximum transfer in one DMU tx. 
+ */ +int zvol_maxphys = DMU_MAX_ACCESS / 2; + +static void zvol_ensure_zilog(zvol_state_t *zv); + +static d_open_t zvol_cdev_open; +static d_close_t zvol_cdev_close; +static d_ioctl_t zvol_cdev_ioctl; +static d_read_t zvol_cdev_read; +static d_write_t zvol_cdev_write; +static d_strategy_t zvol_geom_bio_strategy; + +static struct cdevsw zvol_cdevsw = { + .d_name = "zvol", + .d_version = D_VERSION, + .d_flags = D_DISK | D_TRACKCLOSE, + .d_open = zvol_cdev_open, + .d_close = zvol_cdev_close, + .d_ioctl = zvol_cdev_ioctl, + .d_read = zvol_cdev_read, + .d_write = zvol_cdev_write, + .d_strategy = zvol_geom_bio_strategy, +}; + +extern uint_t zfs_geom_probe_vdev_key; + +struct g_class zfs_zvol_class = { + .name = "ZFS::ZVOL", + .version = G_VERSION, +}; + +DECLARE_GEOM_CLASS(zfs_zvol_class, zfs_zvol); + +static int zvol_geom_open(struct g_provider *pp, int flag, int count); +static int zvol_geom_close(struct g_provider *pp, int flag, int count); +static void zvol_geom_run(zvol_state_t *zv); +static void zvol_geom_destroy(zvol_state_t *zv); +static int zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace); +static void zvol_geom_worker(void *arg); +static void zvol_geom_bio_start(struct bio *bp); +static int zvol_geom_bio_getattr(struct bio *bp); +/* static d_strategy_t zvol_geom_bio_strategy; (declared elsewhere) */ + +/* + * GEOM mode implementation + */ + +/*ARGSUSED*/ +static int +zvol_geom_open(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + int err = 0; + boolean_t drop_suspend = B_TRUE; + + if (!zpool_on_zvol && tsd_get(zfs_geom_probe_vdev_key) != NULL) { + /* + * if zfs_geom_probe_vdev_key is set, that means that zfs is + * attempting to probe geom providers while looking for a + * replacement for a missing VDEV. In this case, the + * spa_namespace_lock will not be held, but it is still illegal + * to use a zvol as a vdev. Deadlocks can result if another + * thread has spa_namespace_lock + */ + return (SET_ERROR(EOPNOTSUPP)); + } + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = pp->private; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flag & FWRITE)); + if (err) + goto out_mutex; + pp->mediasize = zv->zv_volsize; + pp->stripeoffset = 0; + pp->stripesize = zv->zv_volblocksize; + } + + /* + * Check for a bad on-disk format version now since we + * lied about owning the dataset readonly before. 
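+	 * (zvol_first_open() owns the objset read-only regardless of
+	 * FWRITE, so the writability checks must happen here instead: a
+	 * volume that is flagged read-only, or whose on-disk encryption
+	 * format is newer than this code understands, fails a write open
+	 * with EROFS below.)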
+ */ + if ((flag & FWRITE) && ((zv->zv_flags & ZVOL_RDONLY) || + dmu_objset_incompatible_encryption_version(zv->zv_objset))) { + err = EROFS; + goto out_open_count; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = EBUSY; + goto out_open_count; + } +#ifdef FEXCL + if (flag & FEXCL) { + if (zv->zv_open_count != 0) { + err = EBUSY; + goto out_open_count; + } + zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count += count; + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); +out_mutex: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(err)); +} + +/*ARGSUSED*/ +static int +zvol_geom_close(struct g_provider *pp, int flag, int count) +{ + zvol_state_t *zv; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = pp->private; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_open_count == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count > 0); + + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if ((zv->zv_open_count - count) == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. + */ + zv->zv_open_count -= count; + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static void +zvol_geom_run(zvol_state_t *zv) +{ + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + g_error_provider(pp, 0); + + kproc_kthread_add(zvol_geom_worker, zv, &system_proc, NULL, 0, 0, + "zfskern", "zvol %s", pp->name + sizeof (ZVOL_DRIVER)); +} + +static void +zvol_geom_destroy(zvol_state_t *zv) +{ + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + g_topology_assert(); + + mutex_enter(&zv->zv_state_lock); + VERIFY(zsg->zsg_state == ZVOL_GEOM_RUNNING); + mutex_exit(&zv->zv_state_lock); + zsg->zsg_provider = NULL; + pp->private = NULL; + g_wither_geom(pp->geom, ENXIO); +} + +static int +zvol_geom_access(struct g_provider *pp, int acr, int acw, int ace) +{ + int count, error, flags; + + g_topology_assert(); + + /* + * To make it easier we expect either open or close, but not both + * at the same time. 
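+	 *
+	 * The acr/acw/ace arguments are deltas against the provider's
+	 * current read/write/exclusive reference counts. For example, a
+	 * consumer calling g_access(cp, 1, 1, 0) arrives here as
+	 * (acr=1, acw=1, ace=0) and is forwarded below as
+	 * zvol_geom_open(pp, FREAD | FWRITE, 2).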
+ */ + KASSERT((acr >= 0 && acw >= 0 && ace >= 0) || + (acr <= 0 && acw <= 0 && ace <= 0), + ("Unsupported access request to %s (acr=%d, acw=%d, ace=%d).", + pp->name, acr, acw, ace)); + + if (pp->private == NULL) { + if (acr <= 0 && acw <= 0 && ace <= 0) + return (0); + return (pp->error); + } + + /* + * We don't pass FEXCL flag to zvol_geom_open()/zvol_geom_close() if + * ace != 0, because GEOM already handles that and handles it a bit + * differently. GEOM allows for multiple read/exclusive consumers and + * ZFS allows only one exclusive consumer, no matter if it is reader or + * writer. I like better the way GEOM works so I'll leave it for GEOM + * to decide what to do. + */ + + count = acr + acw + ace; + if (count == 0) + return (0); + + flags = 0; + if (acr != 0 || ace != 0) + flags |= FREAD; + if (acw != 0) + flags |= FWRITE; + + g_topology_unlock(); + if (count > 0) + error = zvol_geom_open(pp, flags, count); + else + error = zvol_geom_close(pp, flags, -count); + g_topology_lock(); + return (error); +} + +static void +zvol_geom_worker(void *arg) +{ + zvol_state_t *zv = arg; + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct bio *bp; + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM); + + thread_lock(curthread); + sched_prio(curthread, PRIBIO); + thread_unlock(curthread); + + for (;;) { + mtx_lock(&zsg->zsg_queue_mtx); + bp = bioq_takefirst(&zsg->zsg_queue); + if (bp == NULL) { + if (zsg->zsg_state == ZVOL_GEOM_STOPPED) { + zsg->zsg_state = ZVOL_GEOM_RUNNING; + wakeup(&zsg->zsg_state); + mtx_unlock(&zsg->zsg_queue_mtx); + kthread_exit(); + } + msleep(&zsg->zsg_queue, &zsg->zsg_queue_mtx, + PRIBIO | PDROP, "zvol:io", 0); + continue; + } + mtx_unlock(&zsg->zsg_queue_mtx); + zvol_geom_bio_strategy(bp); + } +} + +static void +zvol_geom_bio_start(struct bio *bp) +{ + zvol_state_t *zv = bp->bio_to->private; + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + boolean_t first; + + if (bp->bio_cmd == BIO_GETATTR) { + if (zvol_geom_bio_getattr(bp)) + g_io_deliver(bp, EOPNOTSUPP); + return; + } + + if (!THREAD_CAN_SLEEP()) { + mtx_lock(&zsg->zsg_queue_mtx); + first = (bioq_first(&zsg->zsg_queue) == NULL); + bioq_insert_tail(&zsg->zsg_queue, bp); + mtx_unlock(&zsg->zsg_queue_mtx); + if (first) + wakeup_one(&zsg->zsg_queue); + return; + } + + zvol_geom_bio_strategy(bp); +} + +static int +zvol_geom_bio_getattr(struct bio *bp) +{ + zvol_state_t *zv; + + zv = bp->bio_to->private; + ASSERT(zv != NULL); + + spa_t *spa = dmu_objset_spa(zv->zv_objset); + uint64_t refd, avail, usedobjs, availobjs; + + if (g_handleattr_int(bp, "GEOM::candelete", 1)) + return (0); + if (strcmp(bp->bio_attribute, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksavail", avail / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + if (g_handleattr_off_t(bp, "blocksused", refd / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksavail", + avail / DEV_BSIZE)) + return (0); + } else if (strcmp(bp->bio_attribute, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + if (g_handleattr_off_t(bp, "poolblocksused", refd / DEV_BSIZE)) + return (0); + } + return (1); +} + +static void 
+zvol_geom_bio_strategy(struct bio *bp) +{ + zvol_state_t *zv; + uint64_t off, volsize; + size_t resid; + char *addr; + objset_t *os; + zfs_locked_range_t *lr; + int error = 0; + boolean_t doread = B_FALSE; + boolean_t is_dumpified; + boolean_t sync; + + if (bp->bio_to) + zv = bp->bio_to->private; + else + zv = bp->bio_dev->si_drv2; + + if (zv == NULL) { + error = SET_ERROR(ENXIO); + goto out; + } + + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + + switch (bp->bio_cmd) { + case BIO_READ: + doread = B_TRUE; + break; + case BIO_WRITE: + case BIO_FLUSH: + case BIO_DELETE: + if (zv->zv_flags & ZVOL_RDONLY) { + error = SET_ERROR(EROFS); + goto resume; + } + zvol_ensure_zilog(zv); + if (bp->bio_cmd == BIO_FLUSH) + goto sync; + break; + default: + error = EOPNOTSUPP; + goto resume; + } + + off = bp->bio_offset; + volsize = zv->zv_volsize; + + os = zv->zv_objset; + ASSERT(os != NULL); + + addr = bp->bio_data; + resid = bp->bio_length; + + if (resid > 0 && (off < 0 || off >= volsize)) { + error = SET_ERROR(EIO); + goto resume; + } + + is_dumpified = B_FALSE; + sync = !doread && !is_dumpified && + zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + /* + * There must be no buffer changes when doing a dmu_sync() because + * we can't change the data whilst calculating the checksum. + */ + lr = zfs_rangelock_enter(&zv->zv_rangelock, off, resid, + doread ? RL_READER : RL_WRITER); + + if (bp->bio_cmd == BIO_DELETE) { + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, off, resid, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + off, resid); + resid = 0; + } + goto unlock; + } + while (resid != 0 && off < volsize) { + size_t size = MIN(resid, zvol_maxphys); + if (doread) { + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); + } else { + dmu_tx_t *tx = dmu_tx_create(os); + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, size); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, off, size, addr, tx); + zvol_log_write(zv, tx, off, size, sync); + dmu_tx_commit(tx); + } + } + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + off += size; + addr += size; + resid -= size; + } +unlock: + zfs_rangelock_exit(lr); + + bp->bio_completed = bp->bio_length - resid; + if (bp->bio_completed < bp->bio_length && off > volsize) + error = EINVAL; + + switch (bp->bio_cmd) { + case BIO_FLUSH: + break; + case BIO_READ: + dataset_kstats_update_read_kstats(&zv->zv_kstat, + bp->bio_completed); + break; + case BIO_WRITE: + dataset_kstats_update_write_kstats(&zv->zv_kstat, + bp->bio_completed); + break; + case BIO_DELETE: + break; + default: + break; + } + + if (sync) { +sync: + zil_commit(zv->zv_zilog, ZVOL_OBJ); + } +resume: + rw_exit(&zv->zv_suspend_lock); +out: + if (bp->bio_to) + g_io_deliver(bp, error); + else + biofinish(bp, NULL, error); +} + +/* + * Character device mode implementation + */ + +static int +zvol_cdev_read(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + /* + * uio_loffset == volsize isn't an error as + * its required for EOF processing. 
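+	 * (a read(2) starting exactly at volsize falls through the loop
+	 * below and returns 0 bytes, which is how EOF is reported to the
+	 * caller; only negative offsets, or offsets strictly beyond the
+	 * volume, fail with EIO)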
+ */ + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, + uio->uio_resid, RL_READER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio->uio_loffset) + bytes = volsize - uio->uio_loffset; + + error = dmu_read_uio_dnode(zv->zv_dn, uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_rangelock_exit(lr); + + return (error); +} + +static int +zvol_cdev_write(struct cdev *dev, struct uio *uio, int ioflag) +{ + zvol_state_t *zv; + uint64_t volsize; + zfs_locked_range_t *lr; + int error = 0; + boolean_t sync; + + zv = dev->si_drv2; + + volsize = zv->zv_volsize; + + if (uio->uio_resid > 0 && + (uio->uio_loffset < 0 || uio->uio_loffset > volsize)) + return (SET_ERROR(EIO)); + + sync = (ioflag & IO_SYNC) || + (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + + lr = zfs_rangelock_enter(&zv->zv_rangelock, uio->uio_loffset, + uio->uio_resid, RL_WRITER); + while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { + uint64_t bytes = MIN(uio->uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio->uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, uio, bytes, tx); + if (error == 0) + zvol_log_write(zv, tx, off, bytes, sync); + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + return (error); +} + +static int +zvol_cdev_open(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + int err = 0; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + err = zvol_first_open(zv, !(flags & FWRITE)); + if (err) + goto out_locked; + } + + if ((flags & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = EROFS; + goto out_opened; + } + if (zv->zv_flags & ZVOL_EXCL) { + err = EBUSY; + goto out_opened; + } +#ifdef FEXCL + if (flags & FEXCL) { + if (zv->zv_open_count != 0) { + err = EBUSY; + goto out_opened; + } + 
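+		/*
+		 * We are the sole opener; mark the volume exclusive so
+		 * that any subsequent open attempt fails with EBUSY
+		 * until zvol_cdev_close() clears the flag.
+		 */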
zv->zv_flags |= ZVOL_EXCL; + } +#endif + + zv->zv_open_count++; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt++; + if (zsd->zsd_sync_cnt == 1) + zil_async_to_sync(zv->zv_zilog, ZVOL_OBJ); + } + + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); + +out_opened: + if (zv->zv_open_count == 0) + zvol_last_close(zv); +out_locked: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(err)); +} + +static int +zvol_cdev_close(struct cdev *dev, int flags, int fmt, struct thread *td) +{ + zvol_state_t *zv; + struct zvol_state_dev *zsd; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, ZVOL_RW_READER); + zv = dev->si_drv2; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + if (zv->zv_flags & ZVOL_EXCL) { + ASSERT(zv->zv_open_count == 1); + zv->zv_flags &= ~ZVOL_EXCL; + } + + ASSERT(zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV); + + /* + * If the open count is zero, this is a spurious close. + * That indicates a bug in the kernel / DDI framework. + */ + ASSERT(zv->zv_open_count > 0); + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 1) { + if (!rw_tryenter(&zv->zv_suspend_lock, ZVOL_RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* + * You may get multiple opens, but only one close. 
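+	 * (unlike zvol_geom_close(), which is handed an explicit count of
+	 * references being dropped, the cdev path always retires exactly
+	 * one open per close)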
+ */ + zv->zv_open_count--; + if (flags & (FSYNC | FDSYNC)) { + zsd = &zv->zv_zso->zso_dev; + zsd->zsd_sync_cnt--; + } + + if (zv->zv_open_count == 0) { + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + zvol_last_close(zv); + } + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + return (0); +} + +static int +zvol_cdev_ioctl(struct cdev *dev, ulong_t cmd, caddr_t data, + int fflag, struct thread *td) +{ + zvol_state_t *zv; + zfs_locked_range_t *lr; + off_t offset, length; + int i, error; + boolean_t sync; + + zv = dev->si_drv2; + + error = 0; + KASSERT(zv->zv_open_count > 0, + ("Device with zero access count in %s", __func__)); + + i = IOCPARM_LEN(cmd); + switch (cmd) { + case DIOCGSECTORSIZE: + *(uint32_t *)data = DEV_BSIZE; + break; + case DIOCGMEDIASIZE: + *(off_t *)data = zv->zv_volsize; + break; + case DIOCGFLUSH: + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + if (zv->zv_zilog != NULL) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + break; + case DIOCGDELETE: + if (!zvol_unmap_enabled) + break; + + offset = ((off_t *)data)[0]; + length = ((off_t *)data)[1]; + if ((offset % DEV_BSIZE) != 0 || (length % DEV_BSIZE) != 0 || + offset < 0 || offset >= zv->zv_volsize || + length <= 0) { + printf("%s: offset=%jd length=%jd\n", __func__, offset, + length); + error = EINVAL; + break; + } + rw_enter(&zv->zv_suspend_lock, ZVOL_RW_READER); + zvol_ensure_zilog(zv); + lr = zfs_rangelock_enter(&zv->zv_rangelock, offset, length, + RL_WRITER); + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + sync = FALSE; + dmu_tx_abort(tx); + } else { + sync = (zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS); + zvol_log_truncate(zv, tx, offset, length, sync); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, + offset, length); + } + zfs_rangelock_exit(lr); + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + rw_exit(&zv->zv_suspend_lock); + break; + case DIOCGSTRIPESIZE: + *(off_t *)data = zv->zv_volblocksize; + break; + case DIOCGSTRIPEOFFSET: + *(off_t *)data = 0; + break; + case DIOCGATTR: { + spa_t *spa = dmu_objset_spa(zv->zv_objset); + struct diocgattr_arg *arg = (struct diocgattr_arg *)data; + uint64_t refd, avail, usedobjs, availobjs; + + if (strcmp(arg->name, "GEOM::candelete") == 0) + arg->value.i = 1; + else if (strcmp(arg->name, "blocksavail") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "blocksused") == 0) { + dmu_objset_space(zv->zv_objset, &refd, &avail, + &usedobjs, &availobjs); + arg->value.off = refd / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksavail") == 0) { + avail = metaslab_class_get_space(spa_normal_class(spa)); + avail -= metaslab_class_get_alloc( + spa_normal_class(spa)); + arg->value.off = avail / DEV_BSIZE; + } else if (strcmp(arg->name, "poolblocksused") == 0) { + refd = metaslab_class_get_alloc(spa_normal_class(spa)); + arg->value.off = refd / DEV_BSIZE; + } else + error = ENOIOCTL; + break; + } + case FIOSEEKHOLE: + case FIOSEEKDATA: { + off_t *off = (off_t *)data; + uint64_t noff; + boolean_t hole; + + hole = (cmd == FIOSEEKHOLE); + noff = *off; + error = dmu_offset_next(zv->zv_objset, ZVOL_OBJ, hole, &noff); + *off = noff; + break; + } + default: + error = ENOIOCTL; + } + + return (error); +} + +/* + * Misc. 
helpers + */ + +static void +zvol_ensure_zilog(zvol_state_t *zv) +{ + ASSERT(ZVOL_RW_READ_HELD(&zv->zv_suspend_lock)); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + if (!rw_tryupgrade(&zv->zv_suspend_lock)) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + } + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } +} + +static boolean_t +zvol_is_zvol_impl(const char *device) +{ + return (device && strncmp(device, ZVOL_DIR, strlen(ZVOL_DIR)) == 0); +} + +static void +zvol_rename_minor(zvol_state_t *zv, const char *newname) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + struct g_geom *gp; + + g_topology_lock(); + gp = pp->geom; + ASSERT(gp != NULL); + + zsg->zsg_provider = NULL; + g_wither_provider(pp, ENXIO); + + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, newname); + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = zv->zv_volsize; + pp->private = zv; + zsg->zsg_provider = pp; + g_error_provider(pp, 0); + g_topology_unlock(); + } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + dev = zsd->zsd_cdev; + if (dev != NULL) { + destroy_dev(dev); + dev = zsd->zsd_cdev = NULL; + if (zv->zv_open_count > 0) { + zv->zv_flags &= ~ZVOL_EXCL; + zv->zv_open_count = 0; + /* XXX need suspend lock but lock order */ + zvol_last_close(zv); + } + } + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + if (make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, newname) + == 0) { + dev->si_iosize_max = MAXPHYS; + zsd->zsd_cdev = dev; + } + } + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); +} + +/* + * Remove minor node for the specified volume. 
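+ * (zvol_free() is the zv_free platform op registered below: it runs with
+ * no zvol locks held, expects the open count to already be zero, and
+ * tears down whichever of the GEOM provider or character device was
+ * created for the volume, along with its locks and kstats)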
+ */ +static void +zvol_free(zvol_state_t *zv) +{ + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count == 0); + + ZFS_LOG(1, "ZVOL %s destroyed.", zv->zv_name); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + + g_topology_lock(); + zvol_geom_destroy(zv); + g_topology_unlock(); + mtx_destroy(&zsg->zsg_queue_mtx); + } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev = zsd->zsd_cdev; + + if (dev != NULL) + destroy_dev(dev); + } + + mutex_destroy(&zv->zv_state_lock); + dataset_kstats_destroy(&zv->zv_kstat); + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + zvol_minors--; +} + +/* + * Create a minor node (plus a whole lot more) for the specified volume. + */ +static int +zvol_create_minor_impl(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + uint64_t volmode, hash; + int error; + + ZFS_LOG(1, "Creating ZVOL %s...", name); + + hash = zvol_name_hash(name); + if ((zv = zvol_find_by_name_hash(name, hash, RW_NONE)) != NULL) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(EEXIST)); + } + + DROP_GIANT(); + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_VOLMODE), &volmode, NULL); + if (error != 0 || volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + /* + * zvol_alloc equivalent ... + */ + zv = kmem_zalloc(sizeof (*zv), KM_SLEEP); + zv->zv_hash = hash; + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + zv->zv_zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_zso->zso_volmode = volmode; + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp; + struct g_geom *gp; + + zsg->zsg_state = ZVOL_GEOM_UNINIT; + mtx_init(&zsg->zsg_queue_mtx, "zvol", NULL, MTX_DEF); + + g_topology_lock(); + gp = g_new_geomf(&zfs_zvol_class, "zfs::zvol::%s", name); + gp->start = zvol_geom_bio_start; + gp->access = zvol_geom_access; + pp = g_new_providerf(gp, "%s/%s", ZVOL_DRIVER, name); + /* TODO: NULL check? 
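+	 * Probably moot: g_new_providerf() allocates with M_WAITOK and
+	 * so should not return NULL, but an explicit check would make
+	 * that assumption visible.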
*/ + pp->flags |= G_PF_DIRECT_RECEIVE | G_PF_DIRECT_SEND; + pp->sectorsize = DEV_BSIZE; + pp->mediasize = 0; + pp->private = zv; + + zsg->zsg_provider = pp; + bioq_init(&zsg->zsg_queue); + } else if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_DEV) { + struct zvol_state_dev *zsd = &zv->zv_zso->zso_dev; + struct cdev *dev; + struct make_dev_args args; + + make_dev_args_init(&args); + args.mda_flags = MAKEDEV_CHECKNAME | MAKEDEV_WAITOK; + args.mda_devsw = &zvol_cdevsw; + args.mda_cr = NULL; + args.mda_uid = UID_ROOT; + args.mda_gid = GID_OPERATOR; + args.mda_mode = 0640; + args.mda_si_drv2 = zv; + error = make_dev_s(&args, &dev, "%s/%s", ZVOL_DRIVER, name); + if (error != 0) { + mutex_destroy(&zv->zv_state_lock); + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (*zv)); + dmu_objset_disown(os, B_TRUE, FTAG); + goto out_giant; + } + dev->si_iosize_max = MAXPHYS; + zsd->zsd_cdev = dev; + } + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + + if (dmu_objset_is_snapshot(os) || !spa_writeable(dmu_objset_spa(os))) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(dmu_objset_zil(os), B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); + + /* XXX do prefetch */ + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); + + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + if (error == 0) + zvol_geom_run(zv); + g_topology_unlock(); + } +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + zvol_minors++; + rw_exit(&zvol_state_lock); + } + ZFS_LOG(1, "ZVOL %s created.", name); +out_giant: + PICKUP_GIANT(); + return (error); +} + +static void +zvol_clear_private(zvol_state_t *zv) +{ + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp == NULL) /* XXX when? */ + return; + + mtx_lock(&zsg->zsg_queue_mtx); + zsg->zsg_state = ZVOL_GEOM_STOPPED; + pp->private = NULL; + wakeup_one(&zsg->zsg_queue); + while (zsg->zsg_state != ZVOL_GEOM_RUNNING) + msleep(&zsg->zsg_state, + &zsg->zsg_queue_mtx, + 0, "zvol:w", 0); + mtx_unlock(&zsg->zsg_queue_mtx); + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + } +} + +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + zv->zv_volsize = volsize; + if (zv->zv_zso->zso_volmode == ZFS_VOLMODE_GEOM) { + struct zvol_state_geom *zsg = &zv->zv_zso->zso_geom; + struct g_provider *pp = zsg->zsg_provider; + + if (pp == NULL) /* XXX when? */ + return (0); + + g_topology_lock(); + + /* + * Do not invoke resize event when initial size was zero. + * ZVOL initializes the size on first open, this is not + * real resizing. + */ + if (pp->mediasize == 0) + pp->mediasize = zv->zv_volsize; + else + g_resize_provider(pp, zv->zv_volsize); + + g_topology_unlock(); + } + return (0); +} + +static void +zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) +{ + // XXX? set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) +{ + // XXX? 
set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +const static zvol_platform_ops_t zvol_freebsd_ops = { + .zv_free = zvol_free, + .zv_rename_minor = zvol_rename_minor, + .zv_create_minor = zvol_create_minor_impl, + .zv_update_volsize = zvol_update_volsize, + .zv_clear_private = zvol_clear_private, + .zv_is_zvol = zvol_is_zvol_impl, + .zv_set_disk_ro = zvol_set_disk_ro_impl, + .zv_set_capacity = zvol_set_capacity_impl, +}; + +/* + * Public interfaces + */ + +int +zvol_busy(void) +{ + return (zvol_minors != 0); +} + +int +zvol_init(void) +{ + zvol_init_impl(); + zvol_register_ops(&zvol_freebsd_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); +} diff --git a/module/os/linux/spl/Makefile.in b/module/os/linux/spl/Makefile.in new file mode 100644 index 000000000000..b2325f91b4a7 --- /dev/null +++ b/module/os/linux/spl/Makefile.in @@ -0,0 +1,17 @@ +$(MODULE)-objs += ../os/linux/spl/spl-atomic.o +$(MODULE)-objs += ../os/linux/spl/spl-condvar.o +$(MODULE)-objs += ../os/linux/spl/spl-cred.o +$(MODULE)-objs += ../os/linux/spl/spl-err.o +$(MODULE)-objs += ../os/linux/spl/spl-generic.o +$(MODULE)-objs += ../os/linux/spl/spl-kmem.o +$(MODULE)-objs += ../os/linux/spl/spl-kmem-cache.o +$(MODULE)-objs += ../os/linux/spl/spl-kstat.o +$(MODULE)-objs += ../os/linux/spl/spl-proc.o +$(MODULE)-objs += ../os/linux/spl/spl-procfs-list.o +$(MODULE)-objs += ../os/linux/spl/spl-taskq.o +$(MODULE)-objs += ../os/linux/spl/spl-thread.o +$(MODULE)-objs += ../os/linux/spl/spl-trace.o +$(MODULE)-objs += ../os/linux/spl/spl-tsd.o +$(MODULE)-objs += ../os/linux/spl/spl-vmem.o +$(MODULE)-objs += ../os/linux/spl/spl-xdr.o +$(MODULE)-objs += ../os/linux/spl/spl-zlib.o diff --git a/module/os/linux/spl/README.md b/module/os/linux/spl/README.md new file mode 100644 index 000000000000..51166425f063 --- /dev/null +++ b/module/os/linux/spl/README.md @@ -0,0 +1,16 @@ +The Solaris Porting Layer, SPL, is a Linux kernel module which provides a +compatibility layer used by the [ZFS on Linux](https://zfsonlinux.org) project. + +# Installation + +The latest version of the SPL is maintained as part of this repository. +Only when building ZFS version 0.7.x or earlier must an external SPL release +be used. These releases can be found at: + + * Version 0.7.x: https://github.com/zfsonlinux/spl/tree/spl-0.7-release + * Version 0.6.5.x: https://github.com/zfsonlinux/spl/tree/spl-0.6.5-release + +# Release + +The SPL is released under a GPLv2 license. +For more details see the NOTICE and THIRDPARTYLICENSE files; `UCRL-CODE-235197` diff --git a/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 new file mode 100644 index 000000000000..d159169d1050 --- /dev/null +++ b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2 @@ -0,0 +1,339 @@ + GNU GENERAL PUBLIC LICENSE + Version 2, June 1991 + + Copyright (C) 1989, 1991 Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The licenses for most software are designed to take away your +freedom to share and change it. By contrast, the GNU General Public +License is intended to guarantee your freedom to share and change free +software--to make sure the software is free for all its users. This +General Public License applies to most of the Free Software +Foundation's software and to any other program whose authors commit to +using it. 
(Some other Free Software Foundation software is covered by +the GNU Lesser General Public License instead.) You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +this service if you wish), that you receive source code or can get it +if you want it, that you can change the software or use pieces of it +in new free programs; and that you know you can do these things. + + To protect your rights, we need to make restrictions that forbid +anyone to deny you these rights or to ask you to surrender the rights. +These restrictions translate to certain responsibilities for you if you +distribute copies of the software, or if you modify it. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must give the recipients all the rights that +you have. You must make sure that they, too, receive or can get the +source code. And you must show them these terms so they know their +rights. + + We protect your rights with two steps: (1) copyright the software, and +(2) offer you this license which gives you legal permission to copy, +distribute and/or modify the software. + + Also, for each author's protection and ours, we want to make certain +that everyone understands that there is no warranty for this free +software. If the software is modified by someone else and passed on, we +want its recipients to know that what they have is not the original, so +that any problems introduced by others will not reflect on the original +authors' reputations. + + Finally, any free program is threatened constantly by software +patents. We wish to avoid the danger that redistributors of a free +program will individually obtain patent licenses, in effect making the +program proprietary. To prevent this, we have made it clear that any +patent must be licensed for everyone's free use or not licensed at all. + + The precise terms and conditions for copying, distribution and +modification follow. + + GNU GENERAL PUBLIC LICENSE + TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION + + 0. This License applies to any program or other work which contains +a notice placed by the copyright holder saying it may be distributed +under the terms of this General Public License. The "Program", below, +refers to any such program or work, and a "work based on the Program" +means either the Program or any derivative work under copyright law: +that is to say, a work containing the Program or a portion of it, +either verbatim or with modifications and/or translated into another +language. (Hereinafter, translation is included without limitation in +the term "modification".) Each licensee is addressed as "you". + +Activities other than copying, distribution and modification are not +covered by this License; they are outside its scope. The act of +running the Program is not restricted, and the output from the Program +is covered only if its contents constitute a work based on the +Program (independent of having been made by running the Program). +Whether that is true depends on what the Program does. + + 1. 
You may copy and distribute verbatim copies of the Program's +source code as you receive it, in any medium, provided that you +conspicuously and appropriately publish on each copy an appropriate +copyright notice and disclaimer of warranty; keep intact all the +notices that refer to this License and to the absence of any warranty; +and give any other recipients of the Program a copy of this License +along with the Program. + +You may charge a fee for the physical act of transferring a copy, and +you may at your option offer warranty protection in exchange for a fee. + + 2. You may modify your copy or copies of the Program or any portion +of it, thus forming a work based on the Program, and copy and +distribute such modifications or work under the terms of Section 1 +above, provided that you also meet all of these conditions: + + a) You must cause the modified files to carry prominent notices + stating that you changed the files and the date of any change. + + b) You must cause any work that you distribute or publish, that in + whole or in part contains or is derived from the Program or any + part thereof, to be licensed as a whole at no charge to all third + parties under the terms of this License. + + c) If the modified program normally reads commands interactively + when run, you must cause it, when started running for such + interactive use in the most ordinary way, to print or display an + announcement including an appropriate copyright notice and a + notice that there is no warranty (or else, saying that you provide + a warranty) and that users may redistribute the program under + these conditions, and telling the user how to view a copy of this + License. (Exception: if the Program itself is interactive but + does not normally print such an announcement, your work based on + the Program is not required to print an announcement.) + +These requirements apply to the modified work as a whole. If +identifiable sections of that work are not derived from the Program, +and can be reasonably considered independent and separate works in +themselves, then this License, and its terms, do not apply to those +sections when you distribute them as separate works. But when you +distribute the same sections as part of a whole which is a work based +on the Program, the distribution of the whole must be on the terms of +this License, whose permissions for other licensees extend to the +entire whole, and thus to each and every part regardless of who wrote it. + +Thus, it is not the intent of this section to claim rights or contest +your rights to work written entirely by you; rather, the intent is to +exercise the right to control the distribution of derivative or +collective works based on the Program. + +In addition, mere aggregation of another work not based on the Program +with the Program (or with a work based on the Program) on a volume of +a storage or distribution medium does not bring the other work under +the scope of this License. + + 3. 
You may copy and distribute the Program (or a work based on it, +under Section 2) in object code or executable form under the terms of +Sections 1 and 2 above provided that you also do one of the following: + + a) Accompany it with the complete corresponding machine-readable + source code, which must be distributed under the terms of Sections + 1 and 2 above on a medium customarily used for software interchange; or, + + b) Accompany it with a written offer, valid for at least three + years, to give any third party, for a charge no more than your + cost of physically performing source distribution, a complete + machine-readable copy of the corresponding source code, to be + distributed under the terms of Sections 1 and 2 above on a medium + customarily used for software interchange; or, + + c) Accompany it with the information you received as to the offer + to distribute corresponding source code. (This alternative is + allowed only for noncommercial distribution and only if you + received the program in object code or executable form with such + an offer, in accord with Subsection b above.) + +The source code for a work means the preferred form of the work for +making modifications to it. For an executable work, complete source +code means all the source code for all modules it contains, plus any +associated interface definition files, plus the scripts used to +control compilation and installation of the executable. However, as a +special exception, the source code distributed need not include +anything that is normally distributed (in either source or binary +form) with the major components (compiler, kernel, and so on) of the +operating system on which the executable runs, unless that component +itself accompanies the executable. + +If distribution of executable or object code is made by offering +access to copy from a designated place, then offering equivalent +access to copy the source code from the same place counts as +distribution of the source code, even though third parties are not +compelled to copy the source along with the object code. + + 4. You may not copy, modify, sublicense, or distribute the Program +except as expressly provided under this License. Any attempt +otherwise to copy, modify, sublicense or distribute the Program is +void, and will automatically terminate your rights under this License. +However, parties who have received copies, or rights, from you under +this License will not have their licenses terminated so long as such +parties remain in full compliance. + + 5. You are not required to accept this License, since you have not +signed it. However, nothing else grants you permission to modify or +distribute the Program or its derivative works. These actions are +prohibited by law if you do not accept this License. Therefore, by +modifying or distributing the Program (or any work based on the +Program), you indicate your acceptance of this License to do so, and +all its terms and conditions for copying, distributing or modifying +the Program or works based on it. + + 6. Each time you redistribute the Program (or any work based on the +Program), the recipient automatically receives a license from the +original licensor to copy, distribute or modify the Program subject to +these terms and conditions. You may not impose any further +restrictions on the recipients' exercise of the rights granted herein. +You are not responsible for enforcing compliance by third parties to +this License. + + 7. 
If, as a consequence of a court judgment or allegation of patent +infringement or for any other reason (not limited to patent issues), +conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot +distribute so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you +may not distribute the Program at all. For example, if a patent +license would not permit royalty-free redistribution of the Program by +all those who receive copies directly or indirectly through you, then +the only way you could satisfy both it and this License would be to +refrain entirely from distribution of the Program. + +If any portion of this section is held invalid or unenforceable under +any particular circumstance, the balance of the section is intended to +apply and the section as a whole is intended to apply in other +circumstances. + +It is not the purpose of this section to induce you to infringe any +patents or other property right claims or to contest validity of any +such claims; this section has the sole purpose of protecting the +integrity of the free software distribution system, which is +implemented by public license practices. Many people have made +generous contributions to the wide range of software distributed +through that system in reliance on consistent application of that +system; it is up to the author/donor to decide if he or she is willing +to distribute software through any other system and a licensee cannot +impose that choice. + +This section is intended to make thoroughly clear what is believed to +be a consequence of the rest of this License. + + 8. If the distribution and/or use of the Program is restricted in +certain countries either by patents or by copyrighted interfaces, the +original copyright holder who places the Program under this License +may add an explicit geographical distribution limitation excluding +those countries, so that distribution is permitted only in or among +countries not thus excluded. In such case, this License incorporates +the limitation as if written in the body of this License. + + 9. The Free Software Foundation may publish revised and/or new versions +of the General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + +Each version is given a distinguishing version number. If the Program +specifies a version number of this License which applies to it and "any +later version", you have the option of following the terms and conditions +either of that version or of any later version published by the Free +Software Foundation. If the Program does not specify a version number of +this License, you may choose any version ever published by the Free Software +Foundation. + + 10. If you wish to incorporate parts of the Program into other free +programs whose distribution conditions are different, write to the author +to ask for permission. For software which is copyrighted by the Free +Software Foundation, write to the Free Software Foundation; we sometimes +make exceptions for this. Our decision will be guided by the two goals +of preserving the free status of all derivatives of our free software and +of promoting the sharing and reuse of software generally. + + NO WARRANTY + + 11. 
BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +REPAIR OR CORRECTION. + + 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +POSSIBILITY OF SUCH DAMAGES. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +convey the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + Copyright (C) + + This program is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License along + with this program; if not, write to the Free Software Foundation, Inc., + 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. + +Also add information on how to contact you by electronic and paper mail. + +If the program is interactive, make it output a short notice like this +when it starts in an interactive mode: + + Gnomovision version 69, Copyright (C) year name of author + Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, the commands you use may +be called something other than `show w' and `show c'; they could even be +mouse-clicks or menu items--whatever suits your program. + +You should also get your employer (if you work as a programmer) or your +school, if any, to sign a "copyright disclaimer" for the program, if +necessary. 
Here is a sample; alter the names:
+
+  Yoyodyne, Inc., hereby disclaims all copyright interest in the program
+  `Gnomovision' (which makes passes at compilers) written by James Hacker.
+
+  <signature of Ty Coon>, 1 April 1989
+  Ty Coon, President of Vice
+
+This General Public License does not permit incorporating your program into
+proprietary programs.  If your program is a subroutine library, you may
+consider it more useful to permit linking proprietary applications with the
+library.  If this is what you want to do, use the GNU Lesser General
+Public License instead of this License.
diff --git a/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
new file mode 100644
index 000000000000..78535a8ee133
--- /dev/null
+++ b/module/os/linux/spl/THIRDPARTYLICENSE.gplv2.descrip
@@ -0,0 +1 @@
+COMPATIBILITY LAYER FOR OPENZFS ON LINUX
diff --git a/module/os/linux/spl/spl-atomic.c b/module/os/linux/spl/spl-atomic.c
new file mode 100644
index 000000000000..47ed1886e157
--- /dev/null
+++ b/module/os/linux/spl/spl-atomic.c
@@ -0,0 +1,36 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Atomic Implementation.
+ */
+
+#include <sys/atomic.h>
+
+#ifdef ATOMIC_SPINLOCK
+/* Global atomic lock declarations */
+DEFINE_SPINLOCK(atomic32_lock);
+DEFINE_SPINLOCK(atomic64_lock);
+
+EXPORT_SYMBOL(atomic32_lock);
+EXPORT_SYMBOL(atomic64_lock);
+#endif /* ATOMIC_SPINLOCK */
diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c
new file mode 100644
index 000000000000..9d045e3e8a66
--- /dev/null
+++ b/module/os/linux/spl/spl-condvar.c
@@ -0,0 +1,466 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf <behlendorf1@llnl.gov>.
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see <http://zfsonlinux.org/>.
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see <http://www.gnu.org/licenses/>.
+ *
+ * Solaris Porting Layer (SPL) Condition Variables Implementation.
+ */
+
+#include <sys/condvar.h>
+#include <sys/time.h>
+#include <sys/sysmacros.h>
+#include <linux/hrtimer.h>
+#include <linux/compiler_compat.h>
+#include <linux/mod_compat.h>
+
+#include <linux/sched.h>
+
+#ifdef HAVE_SCHED_SIGNAL_HEADER
+#include <linux/sched/signal.h>
+#endif
+
+#define MAX_HRTIMEOUT_SLACK_US	1000
+unsigned int spl_schedule_hrtimeout_slack_us = 0;
+
+static int
+param_set_hrtimeout_slack(const char *buf, zfs_kernel_param_t *kp)
+{
+	unsigned long val;
+	int error;
+
+	error = kstrtoul(buf, 0, &val);
+	if (error)
+		return (error);
+
+	if (val > MAX_HRTIMEOUT_SLACK_US)
+		return (-EINVAL);
+
+	error = param_set_uint(buf, kp);
+	if (error < 0)
+		return (error);
+
+	return (0);
+}
+
+module_param_call(spl_schedule_hrtimeout_slack_us, param_set_hrtimeout_slack,
+	param_get_uint, &spl_schedule_hrtimeout_slack_us, 0644);
+MODULE_PARM_DESC(spl_schedule_hrtimeout_slack_us,
+	"schedule_hrtimeout_range() delta/slack value in us, default(0)");
+
+void
+__cv_init(kcondvar_t *cvp, char *name, kcv_type_t type, void *arg)
+{
+	ASSERT(cvp);
+	ASSERT(name == NULL);
+	ASSERT(type == CV_DEFAULT);
+	ASSERT(arg == NULL);
+
+	cvp->cv_magic = CV_MAGIC;
+	init_waitqueue_head(&cvp->cv_event);
+	init_waitqueue_head(&cvp->cv_destroy);
+	atomic_set(&cvp->cv_waiters, 0);
+	atomic_set(&cvp->cv_refs, 1);
+	cvp->cv_mutex = NULL;
+}
+EXPORT_SYMBOL(__cv_init);
+
+static int
+cv_destroy_wakeup(kcondvar_t *cvp)
+{
+	if (!atomic_read(&cvp->cv_waiters) && !atomic_read(&cvp->cv_refs)) {
+		ASSERT(cvp->cv_mutex == NULL);
+		ASSERT(!waitqueue_active(&cvp->cv_event));
+		return (1);
+	}
+
+	return (0);
+}
+
+void
+__cv_destroy(kcondvar_t *cvp)
+{
+	ASSERT(cvp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+
+	cvp->cv_magic = CV_DESTROY;
+	atomic_dec(&cvp->cv_refs);
+
+	/* Block until all waiters are woken and references dropped. */
+	while (cv_destroy_wakeup(cvp) == 0)
+		wait_event_timeout(cvp->cv_destroy, cv_destroy_wakeup(cvp), 1);
+
+	ASSERT3P(cvp->cv_mutex, ==, NULL);
+	ASSERT3S(atomic_read(&cvp->cv_refs), ==, 0);
+	ASSERT3S(atomic_read(&cvp->cv_waiters), ==, 0);
+	ASSERT3S(waitqueue_active(&cvp->cv_event), ==, 0);
+}
+EXPORT_SYMBOL(__cv_destroy);
+
+static void
+cv_wait_common(kcondvar_t *cvp, kmutex_t *mp, int state, int io)
+{
+	DEFINE_WAIT(wait);
+	kmutex_t *m;
+
+	ASSERT(cvp);
+	ASSERT(mp);
+	ASSERT(cvp->cv_magic == CV_MAGIC);
+	ASSERT(mutex_owned(mp));
+	atomic_inc(&cvp->cv_refs);
+
+	m = READ_ONCE(cvp->cv_mutex);
+	if (!m)
+		m = xchg(&cvp->cv_mutex, mp);
+	/* Ensure the same mutex is used by all callers */
+	ASSERT(m == NULL || m == mp);
+
+	prepare_to_wait_exclusive(&cvp->cv_event, &wait, state);
+	atomic_inc(&cvp->cv_waiters);
+
+	/*
+	 * The mutex is dropped only after prepare_to_wait(); this ensures
+	 * we are linked onto the waiters list and avoids the race where
+	 * 'cvp->cv_waiters > 0' but the list is empty.
+	 */
+	mutex_exit(mp);
+	if (io)
+		io_schedule();
+	else
+		schedule();
+
+	/* No more waiters; a different mutex could now be used */
+	if (atomic_dec_and_test(&cvp->cv_waiters)) {
+		/*
+		 * This is set without any lock, so it's racy. But this is
+		 * just for debug anyway, so make it best-effort.
+		 */
+		cvp->cv_mutex = NULL;
+		wake_up(&cvp->cv_destroy);
+	}
+
+	finish_wait(&cvp->cv_event, &wait);
+	atomic_dec(&cvp->cv_refs);
+
+	/*
+	 * Re-acquire the mutex only after releasing the cvp; otherwise we
+	 * could deadlock with a thread holding the mutex and calling
+	 * cv_destroy().
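+	 * (past this point the condvar may already have been freed by
+	 * its owner, so only the caller's mutex, which is still known
+	 * to be valid, may be touched)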
+ */ + mutex_enter(mp); +} + +void +__cv_wait(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 0); +} +EXPORT_SYMBOL(__cv_wait); + +void +__cv_wait_io(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_UNINTERRUPTIBLE, 1); +} +EXPORT_SYMBOL(__cv_wait_io); + +int +__cv_wait_io_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 1); + + return (signal_pending(current) ? 0 : 1); +} +EXPORT_SYMBOL(__cv_wait_io_sig); + +int +__cv_wait_sig(kcondvar_t *cvp, kmutex_t *mp) +{ + cv_wait_common(cvp, mp, TASK_INTERRUPTIBLE, 0); + + return (signal_pending(current) ? 0 : 1); +} +EXPORT_SYMBOL(__cv_wait_sig); + +#if defined(HAVE_IO_SCHEDULE_TIMEOUT) +#define spl_io_schedule_timeout(t) io_schedule_timeout(t) +#else + +struct spl_task_timer { + struct timer_list timer; + struct task_struct *task; +}; + +static void +__cv_wakeup(spl_timer_list_t t) +{ + struct timer_list *tmr = (struct timer_list *)t; + struct spl_task_timer *task_timer = from_timer(task_timer, tmr, timer); + + wake_up_process(task_timer->task); +} + +static long +spl_io_schedule_timeout(long time_left) +{ + long expire_time = jiffies + time_left; + struct spl_task_timer task_timer; + struct timer_list *timer = &task_timer.timer; + + task_timer.task = current; + + timer_setup(timer, __cv_wakeup, 0); + + timer->expires = expire_time; + add_timer(timer); + + io_schedule(); + + del_timer_sync(timer); + + time_left = expire_time - jiffies; + + return (time_left < 0 ? 0 : time_left); +} +#endif + +/* + * 'expire_time' argument is an absolute wall clock time in jiffies. + * Return value is time left (expire_time - now) or -1 if timeout occurred. + */ +static clock_t +__cv_timedwait_common(kcondvar_t *cvp, kmutex_t *mp, clock_t expire_time, + int state, int io) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + clock_t time_left; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + + /* XXX - Does not handle jiffie wrap properly */ + time_left = expire_time - jiffies; + if (time_left <= 0) + return (-1); + + atomic_inc(&cvp->cv_refs); + m = READ_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + if (io) + time_left = spl_io_schedule_timeout(time_left); + else + time_left = schedule_timeout(time_left); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + /* + * Hold mutex after we release the cvp, otherwise we could dead lock + * with a thread holding the mutex and call cv_destroy. + */ + mutex_enter(mp); + return (time_left > 0 ? 
1 : -1); +} + +int +__cv_timedwait(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_UNINTERRUPTIBLE, 0)); +} +EXPORT_SYMBOL(__cv_timedwait); + +int +__cv_timedwait_io(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + return (__cv_timedwait_common(cvp, mp, exp_time, + TASK_UNINTERRUPTIBLE, 1)); +} +EXPORT_SYMBOL(__cv_timedwait_io); + +int +__cv_timedwait_sig(kcondvar_t *cvp, kmutex_t *mp, clock_t exp_time) +{ + int rc; + + rc = __cv_timedwait_common(cvp, mp, exp_time, TASK_INTERRUPTIBLE, 0); + return (signal_pending(current) ? 0 : rc); +} +EXPORT_SYMBOL(__cv_timedwait_sig); + +/* + * 'expire_time' argument is an absolute clock time in nanoseconds. + * Return value is time left (expire_time - now) or -1 if timeout occurred. + */ +static clock_t +__cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t expire_time, + hrtime_t res, int state) +{ + DEFINE_WAIT(wait); + kmutex_t *m; + hrtime_t time_left; + ktime_t ktime_left; + u64 slack = 0; + int rc; + + ASSERT(cvp); + ASSERT(mp); + ASSERT(cvp->cv_magic == CV_MAGIC); + ASSERT(mutex_owned(mp)); + + time_left = expire_time - gethrtime(); + if (time_left <= 0) + return (-1); + + atomic_inc(&cvp->cv_refs); + m = READ_ONCE(cvp->cv_mutex); + if (!m) + m = xchg(&cvp->cv_mutex, mp); + /* Ensure the same mutex is used by all callers */ + ASSERT(m == NULL || m == mp); + + prepare_to_wait_exclusive(&cvp->cv_event, &wait, state); + atomic_inc(&cvp->cv_waiters); + + /* + * Mutex should be dropped after prepare_to_wait() this + * ensures we're linked in to the waiters list and avoids the + * race where 'cvp->cv_waiters > 0' but the list is empty. + */ + mutex_exit(mp); + + ktime_left = ktime_set(0, time_left); + slack = MIN(MAX(res, spl_schedule_hrtimeout_slack_us * NSEC_PER_USEC), + MAX_HRTIMEOUT_SLACK_US * NSEC_PER_USEC); + rc = schedule_hrtimeout_range(&ktime_left, slack, HRTIMER_MODE_REL); + + /* No more waiters a different mutex could be used */ + if (atomic_dec_and_test(&cvp->cv_waiters)) { + /* + * This is set without any lock, so it's racy. But this is + * just for debug anyway, so make it best-effort + */ + cvp->cv_mutex = NULL; + wake_up(&cvp->cv_destroy); + } + + finish_wait(&cvp->cv_event, &wait); + atomic_dec(&cvp->cv_refs); + + mutex_enter(mp); + return (rc == -EINTR ? 1 : -1); +} + +/* + * Compatibility wrapper for the cv_timedwait_hires() Illumos interface. + */ +static int +cv_timedwait_hires_common(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag, int state) +{ + if (!(flag & CALLOUT_FLAG_ABSOLUTE)) + tim += gethrtime(); + + return (__cv_timedwait_hires(cvp, mp, tim, res, state)); +} + +int +cv_timedwait_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, hrtime_t res, + int flag) +{ + return (cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_UNINTERRUPTIBLE)); +} +EXPORT_SYMBOL(cv_timedwait_hires); + +int +cv_timedwait_sig_hires(kcondvar_t *cvp, kmutex_t *mp, hrtime_t tim, + hrtime_t res, int flag) +{ + int rc; + + rc = cv_timedwait_hires_common(cvp, mp, tim, res, flag, + TASK_INTERRUPTIBLE); + return (signal_pending(current) ? 0 : rc); +} +EXPORT_SYMBOL(cv_timedwait_sig_hires); + +void +__cv_signal(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + atomic_inc(&cvp->cv_refs); + + /* + * All waiters are added with WQ_FLAG_EXCLUSIVE so only one + * waiter will be set runnable with each call to wake_up(). 
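+ * This preserves the Solaris cv_signal() contract of making at most
+ * one thread blocked in cv_wait() runnable per call.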
+ * Additionally wake_up() holds a spin_lock associated with + * the wait queue to ensure we don't race waking up processes. + */ + if (atomic_read(&cvp->cv_waiters) > 0) + wake_up(&cvp->cv_event); + + atomic_dec(&cvp->cv_refs); +} +EXPORT_SYMBOL(__cv_signal); + +void +__cv_broadcast(kcondvar_t *cvp) +{ + ASSERT(cvp); + ASSERT(cvp->cv_magic == CV_MAGIC); + atomic_inc(&cvp->cv_refs); + + /* + * Wake_up_all() will wake up all waiters even those which + * have the WQ_FLAG_EXCLUSIVE flag set. + */ + if (atomic_read(&cvp->cv_waiters) > 0) + wake_up_all(&cvp->cv_event); + + atomic_dec(&cvp->cv_refs); +} +EXPORT_SYMBOL(__cv_broadcast); diff --git a/module/os/linux/spl/spl-cred.c b/module/os/linux/spl/spl-cred.c new file mode 100644 index 000000000000..6e93a32e60d7 --- /dev/null +++ b/module/os/linux/spl/spl-cred.c @@ -0,0 +1,196 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Credential Implementation. + */ + +#include + +static int +cr_groups_search(const struct group_info *group_info, kgid_t grp) +{ + unsigned int left, right, mid; + int cmp; + + if (!group_info) + return (0); + + left = 0; + right = group_info->ngroups; + while (left < right) { + mid = (left + right) / 2; + cmp = KGID_TO_SGID(grp) - + KGID_TO_SGID(GROUP_AT(group_info, mid)); + + if (cmp > 0) + left = mid + 1; + else if (cmp < 0) + right = mid; + else + return (1); + } + return (0); +} + +/* Hold a reference on the credential */ +void +crhold(cred_t *cr) +{ + (void) get_cred((const cred_t *)cr); +} + +/* Free a reference on the credential */ +void +crfree(cred_t *cr) +{ + put_cred((const cred_t *)cr); +} + +/* Return the number of supplemental groups */ +int +crgetngroups(const cred_t *cr) +{ + struct group_info *gi; + int rc; + + gi = cr->group_info; + rc = gi->ngroups; +#ifndef HAVE_GROUP_INFO_GID + /* + * For Linux <= 4.8, + * crgetgroups will only returns gi->blocks[0], which contains only + * the first NGROUPS_PER_BLOCK groups. + */ + if (rc > NGROUPS_PER_BLOCK) { + WARN_ON_ONCE(1); + rc = NGROUPS_PER_BLOCK; + } +#endif + return (rc); +} + +/* + * Return an array of supplemental gids. The returned address is safe + * to use as long as the caller has taken a reference with crhold(). + * + * Linux 4.9 API change, group_info changed from 2d array via ->blocks to 1d + * array via ->gid. 
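+ *
+ * Illustrative use only (use_gid() is a hypothetical consumer; CRED()
+ * is the SPL macro returning the current thread's credential):
+ *
+ *	cred_t *cr = CRED();
+ *	crhold(cr);
+ *	gid_t *gids = crgetgroups(cr);
+ *	int i, n = crgetngroups(cr);
+ *	for (i = 0; i < n; i++)
+ *		use_gid(gids[i]);
+ *	crfree(cr);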
+ */ +gid_t * +crgetgroups(const cred_t *cr) +{ + struct group_info *gi; + gid_t *gids = NULL; + + gi = cr->group_info; +#ifdef HAVE_GROUP_INFO_GID + gids = KGIDP_TO_SGIDP(gi->gid); +#else + if (gi->nblocks > 0) + gids = KGIDP_TO_SGIDP(gi->blocks[0]); +#endif + return (gids); +} + +/* Check if the passed gid is available in supplied credential. */ +int +groupmember(gid_t gid, const cred_t *cr) +{ + struct group_info *gi; + int rc; + + gi = cr->group_info; + rc = cr_groups_search(gi, SGID_TO_KGID(gid)); + + return (rc); +} + +/* Return the effective user id */ +uid_t +crgetuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->euid)); +} + +/* Return the real user id */ +uid_t +crgetruid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->uid)); +} + +/* Return the saved user id */ +uid_t +crgetsuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->suid)); +} + +/* Return the filesystem user id */ +uid_t +crgetfsuid(const cred_t *cr) +{ + return (KUID_TO_SUID(cr->fsuid)); +} + +/* Return the effective group id */ +gid_t +crgetgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->egid)); +} + +/* Return the real group id */ +gid_t +crgetrgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->gid)); +} + +/* Return the saved group id */ +gid_t +crgetsgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->sgid)); +} + +/* Return the filesystem group id */ +gid_t +crgetfsgid(const cred_t *cr) +{ + return (KGID_TO_SGID(cr->fsgid)); +} + +EXPORT_SYMBOL(crhold); +EXPORT_SYMBOL(crfree); +EXPORT_SYMBOL(crgetuid); +EXPORT_SYMBOL(crgetruid); +EXPORT_SYMBOL(crgetsuid); +EXPORT_SYMBOL(crgetfsuid); +EXPORT_SYMBOL(crgetgid); +EXPORT_SYMBOL(crgetrgid); +EXPORT_SYMBOL(crgetsgid); +EXPORT_SYMBOL(crgetfsgid); +EXPORT_SYMBOL(crgetngroups); +EXPORT_SYMBOL(crgetgroups); +EXPORT_SYMBOL(groupmember); diff --git a/module/os/linux/spl/spl-err.c b/module/os/linux/spl/spl-err.c new file mode 100644 index 000000000000..3c0bb71c0629 --- /dev/null +++ b/module/os/linux/spl/spl-err.c @@ -0,0 +1,124 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Error Implementation. + */ + +#include +#include + +/* + * It is often useful to actually have the panic crash the node so you + * can then get notified of the event, get the crashdump for later + * analysis and other such goodies. + * But we would still default to the current default of not to do that. 
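+ * The setting can be changed at runtime through the standard module
+ * parameter interface, e.g. (path assumes the module is named spl):
+ *
+ *	echo 1 > /sys/module/spl/parameters/spl_panic_halt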
+ */ +/* BEGIN CSTYLED */ +unsigned int spl_panic_halt; +module_param(spl_panic_halt, uint, 0644); +MODULE_PARM_DESC(spl_panic_halt, "Cause kernel panic on assertion failures"); +/* END CSTYLED */ + +void +spl_dumpstack(void) +{ + printk("Showing stack for process %d\n", current->pid); + dump_stack(); +} +EXPORT_SYMBOL(spl_dumpstack); + +int +spl_panic(const char *file, const char *func, int line, const char *fmt, ...) +{ + const char *newfile; + char msg[MAXMSGLEN]; + va_list ap; + + newfile = strrchr(file, '/'); + if (newfile != NULL) + newfile = newfile + 1; + else + newfile = file; + + va_start(ap, fmt); + (void) vsnprintf(msg, sizeof (msg), fmt, ap); + va_end(ap); + + printk(KERN_EMERG "%s", msg); + printk(KERN_EMERG "PANIC at %s:%d:%s()\n", newfile, line, func); + if (spl_panic_halt) + panic("%s", msg); + + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + + /* Unreachable */ + return (1); +} +EXPORT_SYMBOL(spl_panic); + +void +vcmn_err(int ce, const char *fmt, va_list ap) +{ + char msg[MAXMSGLEN]; + + vsnprintf(msg, MAXMSGLEN, fmt, ap); + + switch (ce) { + case CE_IGNORE: + break; + case CE_CONT: + printk("%s", msg); + break; + case CE_NOTE: + printk(KERN_NOTICE "NOTICE: %s\n", msg); + break; + case CE_WARN: + printk(KERN_WARNING "WARNING: %s\n", msg); + break; + case CE_PANIC: + printk(KERN_EMERG "PANIC: %s\n", msg); + spl_dumpstack(); + + /* Halt the thread to facilitate further debugging */ + set_current_state(TASK_UNINTERRUPTIBLE); + while (1) + schedule(); + } +} /* vcmn_err() */ +EXPORT_SYMBOL(vcmn_err); + +void +cmn_err(int ce, const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + vcmn_err(ce, fmt, ap); + va_end(ap); +} /* cmn_err() */ +EXPORT_SYMBOL(cmn_err); diff --git a/module/os/linux/spl/spl-generic.c b/module/os/linux/spl/spl-generic.c new file mode 100644 index 000000000000..820fb86c3c7d --- /dev/null +++ b/module/os/linux/spl/spl-generic.c @@ -0,0 +1,844 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Generic Implementation. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_gitrev.h" +#include +#include +#include + +char spl_gitrev[64] = ZFS_META_GITREV; + +/* BEGIN CSTYLED */ +unsigned long spl_hostid = 0; +EXPORT_SYMBOL(spl_hostid); +/* BEGIN CSTYLED */ +module_param(spl_hostid, ulong, 0644); +MODULE_PARM_DESC(spl_hostid, "The system hostid."); +/* END CSTYLED */ + +proc_t p0; +EXPORT_SYMBOL(p0); + +/* + * Xorshift Pseudo Random Number Generator based on work by Sebastiano Vigna + * + * "Further scramblings of Marsaglia's xorshift generators" + * http://vigna.di.unimi.it/ftp/papers/xorshiftplus.pdf + * + * random_get_pseudo_bytes() is an API function on Illumos whose sole purpose + * is to provide bytes containing random numbers. It is mapped to /dev/urandom + * on Illumos, which uses a "FIPS 186-2 algorithm". No user of the SPL's + * random_get_pseudo_bytes() needs bytes that are of cryptographic quality, so + * we can implement it using a fast PRNG that we seed using Linux' actual + * equivalent to random_get_pseudo_bytes(). We do this by providing each CPU + * with an independent seed so that all calls to random_get_pseudo_bytes() are + * free of atomic instructions. + * + * A consequence of using a fast PRNG is that using random_get_pseudo_bytes() + * to generate words larger than 128 bits will paradoxically be limited to + * `2^128 - 1` possibilities. This is because we have a sequence of `2^128 - 1` + * 128-bit words and selecting the first will implicitly select the second. If + * a caller finds this behavior undesirable, random_get_bytes() should be used + * instead. + * + * XXX: Linux interrupt handlers that trigger within the critical section + * formed by `s[1] = xp[1];` and `xp[0] = s[0];` and call this function will + * see the same numbers. Nothing in the code currently calls this in an + * interrupt handler, so this is considered to be okay. If that becomes a + * problem, we could create a set of per-cpu variables for interrupt handlers + * and use them when in_interrupt() from linux/preempt_mask.h evaluates to + * true. 
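+ *
+ * Typical use is simply (illustrative):
+ *
+ *	uint8_t rnd[16];
+ *	(void) random_get_pseudo_bytes(rnd, sizeof (rnd));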
+ */ +void __percpu *spl_pseudo_entropy; + +/* + * spl_rand_next()/spl_rand_jump() are copied from the following CC-0 licensed + * file: + * + * http://xorshift.di.unimi.it/xorshift128plus.c + */ + +static inline uint64_t +spl_rand_next(uint64_t *s) +{ + uint64_t s1 = s[0]; + const uint64_t s0 = s[1]; + s[0] = s0; + s1 ^= s1 << 23; // a + s[1] = s1 ^ s0 ^ (s1 >> 18) ^ (s0 >> 5); // b, c + return (s[1] + s0); +} + +static inline void +spl_rand_jump(uint64_t *s) +{ + static const uint64_t JUMP[] = + { 0x8a5cd789635d2dff, 0x121fd2155c472f96 }; + + uint64_t s0 = 0; + uint64_t s1 = 0; + int i, b; + for (i = 0; i < sizeof (JUMP) / sizeof (*JUMP); i++) + for (b = 0; b < 64; b++) { + if (JUMP[i] & 1ULL << b) { + s0 ^= s[0]; + s1 ^= s[1]; + } + (void) spl_rand_next(s); + } + + s[0] = s0; + s[1] = s1; +} + +int +random_get_pseudo_bytes(uint8_t *ptr, size_t len) +{ + uint64_t *xp, s[2]; + + ASSERT(ptr); + + xp = get_cpu_ptr(spl_pseudo_entropy); + + s[0] = xp[0]; + s[1] = xp[1]; + + while (len) { + union { + uint64_t ui64; + uint8_t byte[sizeof (uint64_t)]; + }entropy; + int i = MIN(len, sizeof (uint64_t)); + + len -= i; + entropy.ui64 = spl_rand_next(s); + + while (i--) + *ptr++ = entropy.byte[i]; + } + + xp[0] = s[0]; + xp[1] = s[1]; + + put_cpu_ptr(spl_pseudo_entropy); + + return (0); +} + + +EXPORT_SYMBOL(random_get_pseudo_bytes); + +#if BITS_PER_LONG == 32 + +/* + * Support 64/64 => 64 division on a 32-bit platform. While the kernel + * provides a div64_u64() function for this we do not use it because the + * implementation is flawed. There are cases which return incorrect + * results as late as linux-2.6.35. Until this is fixed upstream the + * spl must provide its own implementation. + * + * This implementation is a slightly modified version of the algorithm + * proposed by the book 'Hacker's Delight'. The original source can be + * found here and is available for use without restriction. + * + * http://www.hackersdelight.org/HDcode/newCode/divDouble.c + */ + +/* + * Calculate number of leading of zeros for a 64-bit value. + */ +static int +nlz64(uint64_t x) +{ + register int n = 0; + + if (x == 0) + return (64); + + if (x <= 0x00000000FFFFFFFFULL) { n = n + 32; x = x << 32; } + if (x <= 0x0000FFFFFFFFFFFFULL) { n = n + 16; x = x << 16; } + if (x <= 0x00FFFFFFFFFFFFFFULL) { n = n + 8; x = x << 8; } + if (x <= 0x0FFFFFFFFFFFFFFFULL) { n = n + 4; x = x << 4; } + if (x <= 0x3FFFFFFFFFFFFFFFULL) { n = n + 2; x = x << 2; } + if (x <= 0x7FFFFFFFFFFFFFFFULL) { n = n + 1; } + + return (n); +} + +/* + * Newer kernels have a div_u64() function but we define our own + * to simplify portability between kernel versions. + */ +static inline uint64_t +__div_u64(uint64_t u, uint32_t v) +{ + (void) do_div(u, v); + return (u); +} + +/* + * Turn off missing prototypes warning for these functions. They are + * replacements for libgcc-provided functions and will never be called + * directly. + */ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wmissing-prototypes" + +/* + * Implementation of 64-bit unsigned division for 32-bit machines. + * + * First the procedure takes care of the case in which the divisor is a + * 32-bit quantity. There are two subcases: (1) If the left half of the + * dividend is less than the divisor, one execution of do_div() is all that + * is required (overflow is not possible). (2) Otherwise it does two + * divisions, using the grade school method. 
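+ *
+ * Worked example (illustrative numbers, base 2^32 "digits"): dividing
+ * u = 0x0000000500000003 by v = 2 splits u into u1 = 5 and u0 = 3.
+ * The first division gives q1 = 2 with remainder k = 1.  Folding the
+ * remainder into the low digit, u0 += (k << 32), the second division
+ * gives q0 = 0x80000001, so the result is (q1 << 32) + q0 =
+ * 0x280000001.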
+ */ +uint64_t +__udivdi3(uint64_t u, uint64_t v) +{ + uint64_t u0, u1, v1, q0, q1, k; + int n; + + if (v >> 32 == 0) { // If v < 2**32: + if (u >> 32 < v) { // If u/v cannot overflow, + return (__div_u64(u, v)); // just do one division. + } else { // If u/v would overflow: + u1 = u >> 32; // Break u into two halves. + u0 = u & 0xFFFFFFFF; + q1 = __div_u64(u1, v); // First quotient digit. + k = u1 - q1 * v; // First remainder, < v. + u0 += (k << 32); + q0 = __div_u64(u0, v); // Seconds quotient digit. + return ((q1 << 32) + q0); + } + } else { // If v >= 2**32: + n = nlz64(v); // 0 <= n <= 31. + v1 = (v << n) >> 32; // Normalize divisor, MSB is 1. + u1 = u >> 1; // To ensure no overflow. + q1 = __div_u64(u1, v1); // Get quotient from + q0 = (q1 << n) >> 31; // Undo normalization and + // division of u by 2. + if (q0 != 0) // Make q0 correct or + q0 = q0 - 1; // too small by 1. + if ((u - q0 * v) >= v) + q0 = q0 + 1; // Now q0 is correct. + + return (q0); + } +} +EXPORT_SYMBOL(__udivdi3); + +/* BEGIN CSTYLED */ +#ifndef abs64 +#define abs64(x) ({ uint64_t t = (x) >> 63; ((x) ^ t) - t; }) +#endif +/* END CSTYLED */ + +/* + * Implementation of 64-bit signed division for 32-bit machines. + */ +int64_t +__divdi3(int64_t u, int64_t v) +{ + int64_t q, t; + // cppcheck-suppress shiftTooManyBitsSigned + q = __udivdi3(abs64(u), abs64(v)); + // cppcheck-suppress shiftTooManyBitsSigned + t = (u ^ v) >> 63; // If u, v have different + return ((q ^ t) - t); // signs, negate q. +} +EXPORT_SYMBOL(__divdi3); + +/* + * Implementation of 64-bit unsigned modulo for 32-bit machines. + */ +uint64_t +__umoddi3(uint64_t dividend, uint64_t divisor) +{ + return (dividend - (divisor * __udivdi3(dividend, divisor))); +} +EXPORT_SYMBOL(__umoddi3); + +/* 64-bit signed modulo for 32-bit machines. */ +int64_t +__moddi3(int64_t n, int64_t d) +{ + int64_t q; + boolean_t nn = B_FALSE; + + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) + d = -d; + + q = __umoddi3(n, d); + + return (nn ? -q : q); +} +EXPORT_SYMBOL(__moddi3); + +/* + * Implementation of 64-bit unsigned division/modulo for 32-bit machines. + */ +uint64_t +__udivmoddi4(uint64_t n, uint64_t d, uint64_t *r) +{ + uint64_t q = __udivdi3(n, d); + if (r) + *r = n - d * q; + return (q); +} +EXPORT_SYMBOL(__udivmoddi4); + +/* + * Implementation of 64-bit signed division/modulo for 32-bit machines. + */ +int64_t +__divmoddi4(int64_t n, int64_t d, int64_t *r) +{ + int64_t q, rr; + boolean_t nn = B_FALSE; + boolean_t nd = B_FALSE; + if (n < 0) { + nn = B_TRUE; + n = -n; + } + if (d < 0) { + nd = B_TRUE; + d = -d; + } + + q = __udivmoddi4(n, d, (uint64_t *)&rr); + + if (nn != nd) + q = -q; + if (nn) + rr = -rr; + if (r) + *r = rr; + return (q); +} +EXPORT_SYMBOL(__divmoddi4); + +#if defined(__arm) || defined(__arm__) +/* + * Implementation of 64-bit (un)signed division for 32-bit arm machines. + * + * Run-time ABI for the ARM Architecture (page 20). A pair of (unsigned) + * long longs is returned in {{r0, r1}, {r2,r3}}, the quotient in {r0, r1}, + * and the remainder in {r2, r3}. The return type is specifically left + * set to 'void' to ensure the compiler does not overwrite these registers + * during the return. 
All results are in registers as per ABI + */ +void +__aeabi_uldivmod(uint64_t u, uint64_t v) +{ + uint64_t res; + uint64_t mod; + + res = __udivdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + /* BEGIN CSTYLED */ + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + /* END CSTYLED */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_uldivmod); + +void +__aeabi_ldivmod(int64_t u, int64_t v) +{ + int64_t res; + uint64_t mod; + + res = __divdi3(u, v); + mod = __umoddi3(u, v); + { + register uint32_t r0 asm("r0") = (res & 0xFFFFFFFF); + register uint32_t r1 asm("r1") = (res >> 32); + register uint32_t r2 asm("r2") = (mod & 0xFFFFFFFF); + register uint32_t r3 asm("r3") = (mod >> 32); + + /* BEGIN CSTYLED */ + asm volatile("" + : "+r"(r0), "+r"(r1), "+r"(r2),"+r"(r3) /* output */ + : "r"(r0), "r"(r1), "r"(r2), "r"(r3)); /* input */ + /* END CSTYLED */ + + return; /* r0; */ + } +} +EXPORT_SYMBOL(__aeabi_ldivmod); +#endif /* __arm || __arm__ */ + +#pragma GCC diagnostic pop + +#endif /* BITS_PER_LONG */ + +/* + * NOTE: The strtoxx behavior is solely based on my reading of the Solaris + * ddi_strtol(9F) man page. I have not verified the behavior of these + * functions against their Solaris counterparts. It is possible that I + * may have misinterpreted the man page or the man page is incorrect. + */ +int ddi_strtoul(const char *, char **, int, unsigned long *); +int ddi_strtol(const char *, char **, int, long *); +int ddi_strtoull(const char *, char **, int, unsigned long long *); +int ddi_strtoll(const char *, char **, int, long long *); + +#define define_ddi_strtoux(type, valtype) \ +int ddi_strtou##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + valtype last_value, value = 0; \ + char *ptr = (char *)str; \ + int flag = 1, digit; \ + \ + if (strlen(ptr) == 0) \ + return (EINVAL); \ + \ + /* Auto-detect base based on prefix */ \ + if (!base) { \ + if (str[0] == '0') { \ + if (tolower(str[1]) == 'x' && isxdigit(str[2])) { \ + base = 16; /* hex */ \ + ptr += 2; \ + } else if (str[1] >= '0' && str[1] < 8) { \ + base = 8; /* octal */ \ + ptr += 1; \ + } else { \ + return (EINVAL); \ + } \ + } else { \ + base = 10; /* decimal */ \ + } \ + } \ + \ + while (1) { \ + if (isdigit(*ptr)) \ + digit = *ptr - '0'; \ + else if (isalpha(*ptr)) \ + digit = tolower(*ptr) - 'a' + 10; \ + else \ + break; \ + \ + if (digit >= base) \ + break; \ + \ + last_value = value; \ + value = value * base + digit; \ + if (last_value > value) /* Overflow */ \ + return (ERANGE); \ + \ + flag = 1; \ + ptr++; \ + } \ + \ + if (flag) \ + *result = value; \ + \ + if (endptr) \ + *endptr = (char *)(flag ? 
ptr : str); \ + \ + return (0); \ +} \ + +#define define_ddi_strtox(type, valtype) \ +int ddi_strto##type(const char *str, char **endptr, \ + int base, valtype *result) \ +{ \ + int rc; \ + \ + if (*str == '-') { \ + rc = ddi_strtou##type(str + 1, endptr, base, result); \ + if (!rc) { \ + if (*endptr == str + 1) \ + *endptr = (char *)str; \ + else \ + *result = -*result; \ + } \ + } else { \ + rc = ddi_strtou##type(str, endptr, base, result); \ + } \ + \ + return (rc); \ +} + +define_ddi_strtoux(l, unsigned long) +define_ddi_strtox(l, long) +define_ddi_strtoux(ll, unsigned long long) +define_ddi_strtox(ll, long long) + +EXPORT_SYMBOL(ddi_strtoul); +EXPORT_SYMBOL(ddi_strtol); +EXPORT_SYMBOL(ddi_strtoll); +EXPORT_SYMBOL(ddi_strtoull); + +int +ddi_copyin(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyin(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyin); + +int +ddi_copyout(const void *from, void *to, size_t len, int flags) +{ + /* Fake ioctl() issued by kernel, 'from' is a kernel address */ + if (flags & FKIOCTL) { + memcpy(to, from, len); + return (0); + } + + return (copyout(from, to, len)); +} +EXPORT_SYMBOL(ddi_copyout); + +static ssize_t +spl_kernel_read(struct file *file, void *buf, size_t count, loff_t *pos) +{ +#if defined(HAVE_KERNEL_READ_PPOS) + return (kernel_read(file, buf, count, pos)); +#else + mm_segment_t saved_fs; + ssize_t ret; + + saved_fs = get_fs(); + set_fs(KERNEL_DS); + + ret = vfs_read(file, (void __user *)buf, count, pos); + + set_fs(saved_fs); + + return (ret); +#endif +} + +static int +spl_getattr(struct file *filp, struct kstat *stat) +{ + int rc; + + ASSERT(filp); + ASSERT(stat); + +#if defined(HAVE_4ARGS_VFS_GETATTR) + rc = vfs_getattr(&filp->f_path, stat, STATX_BASIC_STATS, + AT_STATX_SYNC_AS_STAT); +#elif defined(HAVE_2ARGS_VFS_GETATTR) + rc = vfs_getattr(&filp->f_path, stat); +#else + rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, stat); +#endif + if (rc) + return (-rc); + + return (0); +} + +/* + * Read the unique system identifier from the /etc/hostid file. + * + * The behavior of /usr/bin/hostid on Linux systems with the + * regular eglibc and coreutils is: + * + * 1. Generate the value if the /etc/hostid file does not exist + * or if the /etc/hostid file is less than four bytes in size. + * + * 2. If the /etc/hostid file is at least 4 bytes, then return + * the first four bytes [0..3] in native endian order. + * + * 3. Always ignore bytes [4..] if they exist in the file. + * + * Only the first four bytes are significant, even on systems that + * have a 64-bit word size. + * + * See: + * + * eglibc: sysdeps/unix/sysv/linux/gethostid.c + * coreutils: src/hostid.c + * + * Notes: + * + * The /etc/hostid file on Solaris is a text file that often reads: + * + * # DO NOT EDIT + * "0123456789" + * + * Directly copying this file to Linux results in a constant + * hostid of 4f442023 because the default comment constitutes + * the first four bytes of the file. 
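+ *
+ * On Linux, a correctly sized binary /etc/hostid can be generated
+ * with the bundled zgenhostid(8) script, e.g.:
+ *
+ *	zgenhostid "$(hostid)"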
+ * + */ + +char *spl_hostid_path = HW_HOSTID_PATH; +module_param(spl_hostid_path, charp, 0444); +MODULE_PARM_DESC(spl_hostid_path, "The system hostid file (/etc/hostid)"); + +static int +hostid_read(uint32_t *hostid) +{ + uint64_t size; + uint32_t value = 0; + int error; + loff_t off; + struct file *filp; + struct kstat stat; + + filp = filp_open(spl_hostid_path, 0, 0); + + if (IS_ERR(filp)) + return (ENOENT); + + error = spl_getattr(filp, &stat); + if (error) { + filp_close(filp, 0); + return (error); + } + size = stat.size; + if (size < sizeof (HW_HOSTID_MASK)) { + filp_close(filp, 0); + return (EINVAL); + } + + off = 0; + /* + * Read directly into the variable like eglibc does. + * Short reads are okay; native behavior is preserved. + */ + error = spl_kernel_read(filp, &value, sizeof (value), &off); + if (error < 0) { + filp_close(filp, 0); + return (EIO); + } + + /* Mask down to 32 bits like coreutils does. */ + *hostid = (value & HW_HOSTID_MASK); + filp_close(filp, 0); + + return (0); +} + +/* + * Return the system hostid. Preferentially use the spl_hostid module option + * when set, otherwise use the value in the /etc/hostid file. + */ +uint32_t +zone_get_hostid(void *zone) +{ + uint32_t hostid; + + ASSERT3P(zone, ==, NULL); + + if (spl_hostid != 0) + return ((uint32_t)(spl_hostid & HW_HOSTID_MASK)); + + if (hostid_read(&hostid) == 0) + return (hostid); + + return (0); +} +EXPORT_SYMBOL(zone_get_hostid); + +static int +spl_kvmem_init(void) +{ + int rc = 0; + + rc = spl_kmem_init(); + if (rc) + return (rc); + + rc = spl_vmem_init(); + if (rc) { + spl_kmem_fini(); + return (rc); + } + + return (rc); +} + +/* + * We initialize the random number generator with 128 bits of entropy from the + * system random number generator. In the improbable case that we have a zero + * seed, we fallback to the system jiffies, unless it is also zero, in which + * situation we use a preprogrammed seed. We step forward by 2^64 iterations to + * initialize each of the per-cpu seeds so that the sequences generated on each + * CPU are guaranteed to never overlap in practice. + */ +static void __init +spl_random_init(void) +{ + uint64_t s[2]; + int i = 0; + + spl_pseudo_entropy = __alloc_percpu(2 * sizeof (uint64_t), + sizeof (uint64_t)); + + get_random_bytes(s, sizeof (s)); + + if (s[0] == 0 && s[1] == 0) { + if (jiffies != 0) { + s[0] = jiffies; + s[1] = ~0 - jiffies; + } else { + (void) memcpy(s, "improbable seed", sizeof (s)); + } + printk("SPL: get_random_bytes() returned 0 " + "when generating random seed. 
Setting initial seed to " + "0x%016llx%016llx.\n", cpu_to_be64(s[0]), + cpu_to_be64(s[1])); + } + + for_each_possible_cpu(i) { + uint64_t *wordp = per_cpu_ptr(spl_pseudo_entropy, i); + + spl_rand_jump(s); + + wordp[0] = s[0]; + wordp[1] = s[1]; + } +} + +static void +spl_random_fini(void) +{ + free_percpu(spl_pseudo_entropy); +} + +static void +spl_kvmem_fini(void) +{ + spl_vmem_fini(); + spl_kmem_fini(); +} + +static int __init +spl_init(void) +{ + int rc = 0; + + bzero(&p0, sizeof (proc_t)); + spl_random_init(); + + if ((rc = spl_kvmem_init())) + goto out1; + + if ((rc = spl_tsd_init())) + goto out2; + + if ((rc = spl_taskq_init())) + goto out3; + + if ((rc = spl_kmem_cache_init())) + goto out4; + + if ((rc = spl_proc_init())) + goto out5; + + if ((rc = spl_kstat_init())) + goto out6; + + if ((rc = spl_zlib_init())) + goto out7; + + return (rc); + +out7: + spl_kstat_fini(); +out6: + spl_proc_fini(); +out5: + spl_kmem_cache_fini(); +out4: + spl_taskq_fini(); +out3: + spl_tsd_fini(); +out2: + spl_kvmem_fini(); +out1: + return (rc); +} + +static void __exit +spl_fini(void) +{ + spl_zlib_fini(); + spl_kstat_fini(); + spl_proc_fini(); + spl_kmem_cache_fini(); + spl_taskq_fini(); + spl_tsd_fini(); + spl_kvmem_fini(); + spl_random_fini(); +} + +module_init(spl_init); +module_exit(spl_fini); + +ZFS_MODULE_DESCRIPTION("Solaris Porting Layer"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE("GPL"); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c new file mode 100644 index 000000000000..15dc27624c55 --- /dev/null +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -0,0 +1,1469 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Within the scope of spl-kmem.c file the kmem_cache_* definitions + * are removed to allow access to the real Linux slab allocator. + */ +#undef kmem_cache_destroy +#undef kmem_cache_create +#undef kmem_cache_alloc +#undef kmem_cache_free + + +/* + * Linux 3.16 replaced smp_mb__{before,after}_{atomic,clear}_{dec,inc,bit}() + * with smp_mb__{before,after}_atomic() because they were redundant. This is + * only used inside our SLAB allocator, so we implement an internal wrapper + * here to give us smp_mb__{before,after}_atomic() on older kernels. 
+ */ +#ifndef smp_mb__before_atomic +#define smp_mb__before_atomic(x) smp_mb__before_clear_bit(x) +#endif + +#ifndef smp_mb__after_atomic +#define smp_mb__after_atomic(x) smp_mb__after_clear_bit(x) +#endif + +/* BEGIN CSTYLED */ + +/* + * Cache magazines are an optimization designed to minimize the cost of + * allocating memory. They do this by keeping a per-cpu cache of recently + * freed objects, which can then be reallocated without taking a lock. This + * can improve performance on highly contended caches. However, because + * objects in magazines will prevent otherwise empty slabs from being + * immediately released this may not be ideal for low memory machines. + * + * For this reason spl_kmem_cache_magazine_size can be used to set a maximum + * magazine size. When this value is set to 0 the magazine size will be + * automatically determined based on the object size. Otherwise magazines + * will be limited to 2-256 objects per magazine (i.e per cpu). Magazines + * may never be entirely disabled in this implementation. + */ +unsigned int spl_kmem_cache_magazine_size = 0; +module_param(spl_kmem_cache_magazine_size, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_magazine_size, + "Default magazine size (2-256), set automatically (0)"); + +/* + * The default behavior is to report the number of objects remaining in the + * cache. This allows the Linux VM to repeatedly reclaim objects from the + * cache when memory is low satisfy other memory allocations. Alternately, + * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache + * is reclaimed. This may increase the likelihood of out of memory events. + */ +unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; +module_param(spl_kmem_cache_reclaim, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); + +unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; +module_param(spl_kmem_cache_obj_per_slab, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); + +unsigned int spl_kmem_cache_max_size = SPL_KMEM_CACHE_MAX_SIZE; +module_param(spl_kmem_cache_max_size, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); + +/* + * For small objects the Linux slab allocator should be used to make the most + * efficient use of the memory. However, large objects are not supported by + * the Linux slab and therefore the SPL implementation is preferred. A cutoff + * of 16K was determined to be optimal for architectures using 4K pages. + */ +#if PAGE_SIZE == 4096 +unsigned int spl_kmem_cache_slab_limit = 16384; +#else +unsigned int spl_kmem_cache_slab_limit = 0; +#endif +module_param(spl_kmem_cache_slab_limit, uint, 0644); +MODULE_PARM_DESC(spl_kmem_cache_slab_limit, + "Objects less than N bytes use the Linux slab"); + +/* + * The number of threads available to allocate new slabs for caches. This + * should not need to be tuned but it is available for performance analysis. + */ +unsigned int spl_kmem_cache_kmem_threads = 4; +module_param(spl_kmem_cache_kmem_threads, uint, 0444); +MODULE_PARM_DESC(spl_kmem_cache_kmem_threads, + "Number of spl_kmem_cache threads"); +/* END CSTYLED */ + +/* + * Slab allocation interfaces + * + * While the Linux slab implementation was inspired by the Solaris + * implementation I cannot use it to emulate the Solaris APIs. I + * require two features which are not provided by the Linux slab. + * + * 1) Constructors AND destructors. 
Recent versions of the Linux + * kernel have removed support for destructors. This is a deal + * breaker for the SPL which contains particularly expensive + * initializers for mutex's, condition variables, etc. We also + * require a minimal level of cleanup for these data types unlike + * many Linux data types which do need to be explicitly destroyed. + * + * 2) Virtual address space backed slab. Callers of the Solaris slab + * expect it to work well for both small are very large allocations. + * Because of memory fragmentation the Linux slab which is backed + * by kmalloc'ed memory performs very badly when confronted with + * large numbers of large allocations. Basing the slab on the + * virtual address space removes the need for contiguous pages + * and greatly improve performance for large allocations. + * + * For these reasons, the SPL has its own slab implementation with + * the needed features. It is not as highly optimized as either the + * Solaris or Linux slabs, but it should get me most of what is + * needed until it can be optimized or obsoleted by another approach. + * + * One serious concern I do have about this method is the relatively + * small virtual address space on 32bit arches. This will seriously + * constrain the size of the slab caches and their performance. + */ + +struct list_head spl_kmem_cache_list; /* List of caches */ +struct rw_semaphore spl_kmem_cache_sem; /* Cache list lock */ +taskq_t *spl_kmem_cache_taskq; /* Task queue for aging / reclaim */ + +static void spl_cache_shrink(spl_kmem_cache_t *skc, void *obj); + +static void * +kv_alloc(spl_kmem_cache_t *skc, int size, int flags) +{ + gfp_t lflags = kmem_flags_convert(flags); + void *ptr; + + ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); + + /* Resulting allocated memory will be page aligned */ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + return (ptr); +} + +static void +kv_free(spl_kmem_cache_t *skc, void *ptr, int size) +{ + ASSERT(IS_P2ALIGNED(ptr, PAGE_SIZE)); + + /* + * The Linux direct reclaim path uses this out of band value to + * determine if forward progress is being made. Normally this is + * incremented by kmem_freepages() which is part of the various + * Linux slab implementations. However, since we are using none + * of that infrastructure we are responsible for incrementing it. + */ + if (current->reclaim_state) + current->reclaim_state->reclaimed_slab += size >> PAGE_SHIFT; + + vfree(ptr); +} + +/* + * Required space for each aligned sks. + */ +static inline uint32_t +spl_sks_size(spl_kmem_cache_t *skc) +{ + return (P2ROUNDUP_TYPED(sizeof (spl_kmem_slab_t), + skc->skc_obj_align, uint32_t)); +} + +/* + * Required space for each aligned object. + */ +static inline uint32_t +spl_obj_size(spl_kmem_cache_t *skc) +{ + uint32_t align = skc->skc_obj_align; + + return (P2ROUNDUP_TYPED(skc->skc_obj_size, align, uint32_t) + + P2ROUNDUP_TYPED(sizeof (spl_kmem_obj_t), align, uint32_t)); +} + +uint64_t +spl_kmem_cache_inuse(kmem_cache_t *cache) +{ + return (cache->skc_obj_total); +} +EXPORT_SYMBOL(spl_kmem_cache_inuse); + +uint64_t +spl_kmem_cache_entry_size(kmem_cache_t *cache) +{ + return (cache->skc_obj_size); +} +EXPORT_SYMBOL(spl_kmem_cache_entry_size); + +/* + * Lookup the spl_kmem_object_t for an object given that object. 
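+ * The spl_kmem_obj_t lives immediately after the aligned object, so
+ * it can be found with simple pointer arithmetic on the object address.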
+ */ +static inline spl_kmem_obj_t * +spl_sko_from_obj(spl_kmem_cache_t *skc, void *obj) +{ + return (obj + P2ROUNDUP_TYPED(skc->skc_obj_size, + skc->skc_obj_align, uint32_t)); +} + +/* + * It's important that we pack the spl_kmem_obj_t structure and the + * actual objects in to one large address space to minimize the number + * of calls to the allocator. It is far better to do a few large + * allocations and then subdivide it ourselves. Now which allocator + * we use requires balancing a few trade offs. + * + * For small objects we use kmem_alloc() because as long as you are + * only requesting a small number of pages (ideally just one) its cheap. + * However, when you start requesting multiple pages with kmem_alloc() + * it gets increasingly expensive since it requires contiguous pages. + * For this reason we shift to vmem_alloc() for slabs of large objects + * which removes the need for contiguous pages. We do not use + * vmem_alloc() in all cases because there is significant locking + * overhead in __get_vm_area_node(). This function takes a single + * global lock when acquiring an available virtual address range which + * serializes all vmem_alloc()'s for all slab caches. Using slightly + * different allocation functions for small and large objects should + * give us the best of both worlds. + * + * +------------------------+ + * | spl_kmem_slab_t --+-+ | + * | skc_obj_size <-+ | | + * | spl_kmem_obj_t | | + * | skc_obj_size <---+ | + * | spl_kmem_obj_t | | + * | ... v | + * +------------------------+ + */ +static spl_kmem_slab_t * +spl_slab_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + void *base; + uint32_t obj_size; + + base = kv_alloc(skc, skc->skc_slab_size, flags); + if (base == NULL) + return (NULL); + + sks = (spl_kmem_slab_t *)base; + sks->sks_magic = SKS_MAGIC; + sks->sks_objs = skc->skc_slab_objs; + sks->sks_age = jiffies; + sks->sks_cache = skc; + INIT_LIST_HEAD(&sks->sks_list); + INIT_LIST_HEAD(&sks->sks_free_list); + sks->sks_ref = 0; + obj_size = spl_obj_size(skc); + + for (int i = 0; i < sks->sks_objs; i++) { + void *obj = base + spl_sks_size(skc) + (i * obj_size); + + ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align)); + spl_kmem_obj_t *sko = spl_sko_from_obj(skc, obj); + sko->sko_addr = obj; + sko->sko_magic = SKO_MAGIC; + sko->sko_slab = sks; + INIT_LIST_HEAD(&sko->sko_list); + list_add_tail(&sko->sko_list, &sks->sks_free_list); + } + + return (sks); +} + +/* + * Remove a slab from complete or partial list, it must be called with + * the 'skc->skc_lock' held but the actual free must be performed + * outside the lock to prevent deadlocking on vmem addresses. + */ +static void +spl_slab_free(spl_kmem_slab_t *sks, + struct list_head *sks_list, struct list_head *sko_list) +{ + spl_kmem_cache_t *skc; + + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref == 0); + + skc = sks->sks_cache; + ASSERT(skc->skc_magic == SKC_MAGIC); + + /* + * Update slab/objects counters in the cache, then remove the + * slab from the skc->skc_partial_list. Finally add the slab + * and all its objects in to the private work lists where the + * destructors will be called and the memory freed to the system. + */ + skc->skc_obj_total -= sks->sks_objs; + skc->skc_slab_total--; + list_del(&sks->sks_list); + list_add(&sks->sks_list, sks_list); + list_splice_init(&sks->sks_free_list, sko_list); +} + +/* + * Reclaim empty slabs at the end of the partial list. 
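+ * Slabs with sks_ref == 0 accumulate at the tail of skc_partial_list,
+ * which is why the reverse scan below may stop at the first busy slab.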
+ */ +static void +spl_slab_reclaim(spl_kmem_cache_t *skc) +{ + spl_kmem_slab_t *sks = NULL, *m = NULL; + spl_kmem_obj_t *sko = NULL, *n = NULL; + LIST_HEAD(sks_list); + LIST_HEAD(sko_list); + + /* + * Empty slabs and objects must be moved to a private list so they + * can be safely freed outside the spin lock. All empty slabs are + * at the end of skc->skc_partial_list, therefore once a non-empty + * slab is found we can stop scanning. + */ + spin_lock(&skc->skc_lock); + list_for_each_entry_safe_reverse(sks, m, + &skc->skc_partial_list, sks_list) { + + if (sks->sks_ref > 0) + break; + + spl_slab_free(sks, &sks_list, &sko_list); + } + spin_unlock(&skc->skc_lock); + + /* + * The following two loops ensure all the object destructors are run, + * and the slabs themselves are freed. This is all done outside the + * skc->skc_lock since this allows the destructor to sleep, and + * allows us to perform a conditional reschedule when a freeing a + * large number of objects and slabs back to the system. + */ + + list_for_each_entry_safe(sko, n, &sko_list, sko_list) { + ASSERT(sko->sko_magic == SKO_MAGIC); + } + + list_for_each_entry_safe(sks, m, &sks_list, sks_list) { + ASSERT(sks->sks_magic == SKS_MAGIC); + kv_free(skc, sks, skc->skc_slab_size); + } +} + +static spl_kmem_emergency_t * +spl_emergency_search(struct rb_root *root, void *obj) +{ + struct rb_node *node = root->rb_node; + spl_kmem_emergency_t *ske; + unsigned long address = (unsigned long)obj; + + while (node) { + ske = container_of(node, spl_kmem_emergency_t, ske_node); + + if (address < ske->ske_obj) + node = node->rb_left; + else if (address > ske->ske_obj) + node = node->rb_right; + else + return (ske); + } + + return (NULL); +} + +static int +spl_emergency_insert(struct rb_root *root, spl_kmem_emergency_t *ske) +{ + struct rb_node **new = &(root->rb_node), *parent = NULL; + spl_kmem_emergency_t *ske_tmp; + unsigned long address = ske->ske_obj; + + while (*new) { + ske_tmp = container_of(*new, spl_kmem_emergency_t, ske_node); + + parent = *new; + if (address < ske_tmp->ske_obj) + new = &((*new)->rb_left); + else if (address > ske_tmp->ske_obj) + new = &((*new)->rb_right); + else + return (0); + } + + rb_link_node(&ske->ske_node, parent, new); + rb_insert_color(&ske->ske_node, root); + + return (1); +} + +/* + * Allocate a single emergency object and track it in a red black tree. + */ +static int +spl_emergency_alloc(spl_kmem_cache_t *skc, int flags, void **obj) +{ + gfp_t lflags = kmem_flags_convert(flags); + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + int empty; + + /* Last chance use a partial slab if one now exists */ + spin_lock(&skc->skc_lock); + empty = list_empty(&skc->skc_partial_list); + spin_unlock(&skc->skc_lock); + if (!empty) + return (-EEXIST); + + ske = kmalloc(sizeof (*ske), lflags); + if (ske == NULL) + return (-ENOMEM); + + ske->ske_obj = __get_free_pages(lflags, order); + if (ske->ske_obj == 0) { + kfree(ske); + return (-ENOMEM); + } + + spin_lock(&skc->skc_lock); + empty = spl_emergency_insert(&skc->skc_emergency_tree, ske); + if (likely(empty)) { + skc->skc_obj_total++; + skc->skc_obj_emergency++; + if (skc->skc_obj_emergency > skc->skc_obj_emergency_max) + skc->skc_obj_emergency_max = skc->skc_obj_emergency; + } + spin_unlock(&skc->skc_lock); + + if (unlikely(!empty)) { + free_pages(ske->ske_obj, order); + kfree(ske); + return (-EINVAL); + } + + *obj = (void *)ske->ske_obj; + + return (0); +} + +/* + * Locate the passed object in the red black tree and free it. 
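+ * A return value of -ENOENT indicates the object is not tracked in
+ * the tree and therefore was not an emergency allocation.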
+ */ +static int +spl_emergency_free(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_emergency_t *ske; + int order = get_order(skc->skc_obj_size); + + spin_lock(&skc->skc_lock); + ske = spl_emergency_search(&skc->skc_emergency_tree, obj); + if (ske) { + rb_erase(&ske->ske_node, &skc->skc_emergency_tree); + skc->skc_obj_emergency--; + skc->skc_obj_total--; + } + spin_unlock(&skc->skc_lock); + + if (ske == NULL) + return (-ENOENT); + + free_pages(ske->ske_obj, order); + kfree(ske); + + return (0); +} + +/* + * Release objects from the per-cpu magazine back to their slab. The flush + * argument contains the max number of entries to remove from the magazine. + */ +static void +spl_cache_flush(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flush) +{ + spin_lock(&skc->skc_lock); + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + int count = MIN(flush, skm->skm_avail); + for (int i = 0; i < count; i++) + spl_cache_shrink(skc, skm->skm_objs[i]); + + skm->skm_avail -= count; + memmove(skm->skm_objs, &(skm->skm_objs[count]), + sizeof (void *) * skm->skm_avail); + + spin_unlock(&skc->skc_lock); +} + +/* + * Size a slab based on the size of each aligned object plus spl_kmem_obj_t. + * When on-slab we want to target spl_kmem_cache_obj_per_slab. However, + * for very small objects we may end up with more than this so as not + * to waste space in the minimal allocation of a single page. Also for + * very large objects we may use as few as spl_kmem_cache_obj_per_slab_min, + * lower than this and we will fail. + */ +static int +spl_slab_size(spl_kmem_cache_t *skc, uint32_t *objs, uint32_t *size) +{ + uint32_t sks_size, obj_size, max_size, tgt_size, tgt_objs; + + sks_size = spl_sks_size(skc); + obj_size = spl_obj_size(skc); + max_size = (spl_kmem_cache_max_size * 1024 * 1024); + tgt_size = (spl_kmem_cache_obj_per_slab * obj_size + sks_size); + + if (tgt_size <= max_size) { + tgt_objs = (tgt_size - sks_size) / obj_size; + } else { + tgt_objs = (max_size - sks_size) / obj_size; + tgt_size = (tgt_objs * obj_size) + sks_size; + } + + if (tgt_objs == 0) + return (-ENOSPC); + + *objs = tgt_objs; + *size = tgt_size; + + return (0); +} + +/* + * Make a guess at reasonable per-cpu magazine size based on the size of + * each object and the cost of caching N of them in each magazine. Long + * term this should really adapt based on an observed usage heuristic. + */ +static int +spl_magazine_size(spl_kmem_cache_t *skc) +{ + uint32_t obj_size = spl_obj_size(skc); + int size; + + if (spl_kmem_cache_magazine_size > 0) + return (MAX(MIN(spl_kmem_cache_magazine_size, 256), 2)); + + /* Per-magazine sizes below assume a 4Kib page size */ + if (obj_size > (PAGE_SIZE * 256)) + size = 4; /* Minimum 4Mib per-magazine */ + else if (obj_size > (PAGE_SIZE * 32)) + size = 16; /* Minimum 2Mib per-magazine */ + else if (obj_size > (PAGE_SIZE)) + size = 64; /* Minimum 256Kib per-magazine */ + else if (obj_size > (PAGE_SIZE / 4)) + size = 128; /* Minimum 128Kib per-magazine */ + else + size = 256; + + return (size); +} + +/* + * Allocate a per-cpu magazine to associate with a specific core. 
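+ * The magazine is allocated with kmalloc_node() so that it is backed
+ * by memory local to the NUMA node of the given cpu.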
+ */ +static spl_kmem_magazine_t * +spl_magazine_alloc(spl_kmem_cache_t *skc, int cpu) +{ + spl_kmem_magazine_t *skm; + int size = sizeof (spl_kmem_magazine_t) + + sizeof (void *) * skc->skc_mag_size; + + skm = kmalloc_node(size, GFP_KERNEL, cpu_to_node(cpu)); + if (skm) { + skm->skm_magic = SKM_MAGIC; + skm->skm_avail = 0; + skm->skm_size = skc->skc_mag_size; + skm->skm_refill = skc->skc_mag_refill; + skm->skm_cache = skc; + skm->skm_cpu = cpu; + } + + return (skm); +} + +/* + * Free a per-cpu magazine associated with a specific core. + */ +static void +spl_magazine_free(spl_kmem_magazine_t *skm) +{ + ASSERT(skm->skm_magic == SKM_MAGIC); + ASSERT(skm->skm_avail == 0); + kfree(skm); +} + +/* + * Create all pre-cpu magazines of reasonable sizes. + */ +static int +spl_magazine_create(spl_kmem_cache_t *skc) +{ + int i = 0; + + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + + skc->skc_mag = kzalloc(sizeof (spl_kmem_magazine_t *) * + num_possible_cpus(), kmem_flags_convert(KM_SLEEP)); + skc->skc_mag_size = spl_magazine_size(skc); + skc->skc_mag_refill = (skc->skc_mag_size + 1) / 2; + + for_each_possible_cpu(i) { + skc->skc_mag[i] = spl_magazine_alloc(skc, i); + if (!skc->skc_mag[i]) { + for (i--; i >= 0; i--) + spl_magazine_free(skc->skc_mag[i]); + + kfree(skc->skc_mag); + return (-ENOMEM); + } + } + + return (0); +} + +/* + * Destroy all pre-cpu magazines. + */ +static void +spl_magazine_destroy(spl_kmem_cache_t *skc) +{ + spl_kmem_magazine_t *skm; + int i = 0; + + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + + for_each_possible_cpu(i) { + skm = skc->skc_mag[i]; + spl_cache_flush(skc, skm, skm->skm_avail); + spl_magazine_free(skm); + } + + kfree(skc->skc_mag); +} + +/* + * Create a object cache based on the following arguments: + * name cache name + * size cache object size + * align cache object alignment + * ctor cache object constructor + * dtor cache object destructor + * reclaim cache object reclaim + * priv cache private data for ctor/dtor/reclaim + * vmp unused must be NULL + * flags + * KMC_KVMEM Force kvmem backed SPL cache + * KMC_SLAB Force Linux slab backed cache + * KMC_NODEBUG Disable debugging (unsupported) + */ +spl_kmem_cache_t * +spl_kmem_cache_create(char *name, size_t size, size_t align, + spl_kmem_ctor_t ctor, spl_kmem_dtor_t dtor, void *reclaim, + void *priv, void *vmp, int flags) +{ + gfp_t lflags = kmem_flags_convert(KM_SLEEP); + spl_kmem_cache_t *skc; + int rc; + + /* + * Unsupported flags + */ + ASSERT(vmp == NULL); + ASSERT(reclaim == NULL); + + might_sleep(); + + skc = kzalloc(sizeof (*skc), lflags); + if (skc == NULL) + return (NULL); + + skc->skc_magic = SKC_MAGIC; + skc->skc_name_size = strlen(name) + 1; + skc->skc_name = (char *)kmalloc(skc->skc_name_size, lflags); + if (skc->skc_name == NULL) { + kfree(skc); + return (NULL); + } + strncpy(skc->skc_name, name, skc->skc_name_size); + + skc->skc_ctor = ctor; + skc->skc_dtor = dtor; + skc->skc_private = priv; + skc->skc_vmp = vmp; + skc->skc_linux_cache = NULL; + skc->skc_flags = flags; + skc->skc_obj_size = size; + skc->skc_obj_align = SPL_KMEM_CACHE_ALIGN; + atomic_set(&skc->skc_ref, 0); + + INIT_LIST_HEAD(&skc->skc_list); + INIT_LIST_HEAD(&skc->skc_complete_list); + INIT_LIST_HEAD(&skc->skc_partial_list); + skc->skc_emergency_tree = RB_ROOT; + spin_lock_init(&skc->skc_lock); + init_waitqueue_head(&skc->skc_waitq); + skc->skc_slab_fail = 0; + skc->skc_slab_create = 0; + skc->skc_slab_destroy = 0; + skc->skc_slab_total = 0; + skc->skc_slab_alloc = 0; + skc->skc_slab_max = 0; + skc->skc_obj_total = 0; + 
skc->skc_obj_alloc = 0; + skc->skc_obj_max = 0; + skc->skc_obj_deadlock = 0; + skc->skc_obj_emergency = 0; + skc->skc_obj_emergency_max = 0; + + rc = percpu_counter_init_common(&skc->skc_linux_alloc, 0, + GFP_KERNEL); + if (rc != 0) { + kfree(skc); + return (NULL); + } + + /* + * Verify the requested alignment restriction is sane. + */ + if (align) { + VERIFY(ISP2(align)); + VERIFY3U(align, >=, SPL_KMEM_CACHE_ALIGN); + VERIFY3U(align, <=, PAGE_SIZE); + skc->skc_obj_align = align; + } + + /* + * When no specific type of slab is requested (kmem, vmem, or + * linuxslab) then select a cache type based on the object size + * and default tunables. + */ + if (!(skc->skc_flags & (KMC_SLAB | KMC_KVMEM))) { + if (spl_kmem_cache_slab_limit && + size <= (size_t)spl_kmem_cache_slab_limit) { + /* + * Objects smaller than spl_kmem_cache_slab_limit can + * use the Linux slab for better space-efficiency. + */ + skc->skc_flags |= KMC_SLAB; + } else { + /* + * All other objects are considered large and are + * placed on kvmem backed slabs. + */ + skc->skc_flags |= KMC_KVMEM; + } + } + + /* + * Given the type of slab allocate the required resources. + */ + if (skc->skc_flags & KMC_KVMEM) { + rc = spl_slab_size(skc, + &skc->skc_slab_objs, &skc->skc_slab_size); + if (rc) + goto out; + + rc = spl_magazine_create(skc); + if (rc) + goto out; + } else { + unsigned long slabflags = 0; + + if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) { + rc = EINVAL; + goto out; + } + +#if defined(SLAB_USERCOPY) + /* + * Required for PAX-enabled kernels if the slab is to be + * used for copying between user and kernel space. + */ + slabflags |= SLAB_USERCOPY; +#endif + +#if defined(HAVE_KMEM_CACHE_CREATE_USERCOPY) + /* + * Newer grsec patchset uses kmem_cache_create_usercopy() + * instead of SLAB_USERCOPY flag + */ + skc->skc_linux_cache = kmem_cache_create_usercopy( + skc->skc_name, size, align, slabflags, 0, size, NULL); +#else + skc->skc_linux_cache = kmem_cache_create( + skc->skc_name, size, align, slabflags, NULL); +#endif + if (skc->skc_linux_cache == NULL) { + rc = ENOMEM; + goto out; + } + } + + down_write(&spl_kmem_cache_sem); + list_add_tail(&skc->skc_list, &spl_kmem_cache_list); + up_write(&spl_kmem_cache_sem); + + return (skc); +out: + kfree(skc->skc_name); + percpu_counter_destroy(&skc->skc_linux_alloc); + kfree(skc); + return (NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_create); + +/* + * Register a move callback for cache defragmentation. + * XXX: Unimplemented but harmless to stub out for now. + */ +void +spl_kmem_cache_set_move(spl_kmem_cache_t *skc, + kmem_cbrc_t (move)(void *, void *, size_t, void *)) +{ + ASSERT(move != NULL); +} +EXPORT_SYMBOL(spl_kmem_cache_set_move); + +/* + * Destroy a cache and all objects associated with the cache. + */ +void +spl_kmem_cache_destroy(spl_kmem_cache_t *skc) +{ + DECLARE_WAIT_QUEUE_HEAD(wq); + taskqid_t id; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skc->skc_flags & (KMC_KVMEM | KMC_SLAB)); + + down_write(&spl_kmem_cache_sem); + list_del_init(&skc->skc_list); + up_write(&spl_kmem_cache_sem); + + /* Cancel any and wait for any pending delayed tasks */ + VERIFY(!test_and_set_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + spin_lock(&skc->skc_lock); + id = skc->skc_taskqid; + spin_unlock(&skc->skc_lock); + + taskq_cancel_id(spl_kmem_cache_taskq, id); + + /* + * Wait until all current callers complete, this is mainly + * to catch the case where a low memory situation triggers a + * cache reaping action which races with this destroy. 
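+ *
+ * The wait below is driven by skc_ref, which each external entry
+ * point holds across its critical section using the pattern (see
+ * spl_kmem_cache_reap_now() below for a concrete instance):
+ *
+ *   atomic_inc(&skc->skc_ref);
+ *   ... reap, grow, or otherwise touch the cache ...
+ *   atomic_dec(&skc->skc_ref);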
+ */ + wait_event(wq, atomic_read(&skc->skc_ref) == 0); + + if (skc->skc_flags & KMC_KVMEM) { + spl_magazine_destroy(skc); + spl_slab_reclaim(skc); + } else { + ASSERT(skc->skc_flags & KMC_SLAB); + kmem_cache_destroy(skc->skc_linux_cache); + } + + spin_lock(&skc->skc_lock); + + /* + * Validate there are no objects in use and free all the + * spl_kmem_slab_t, spl_kmem_obj_t, and object buffers. + */ + ASSERT3U(skc->skc_slab_alloc, ==, 0); + ASSERT3U(skc->skc_obj_alloc, ==, 0); + ASSERT3U(skc->skc_slab_total, ==, 0); + ASSERT3U(skc->skc_obj_total, ==, 0); + ASSERT3U(skc->skc_obj_emergency, ==, 0); + ASSERT(list_empty(&skc->skc_complete_list)); + + ASSERT3U(percpu_counter_sum(&skc->skc_linux_alloc), ==, 0); + percpu_counter_destroy(&skc->skc_linux_alloc); + + spin_unlock(&skc->skc_lock); + + kfree(skc->skc_name); + kfree(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_destroy); + +/* + * Allocate an object from a slab attached to the cache. This is used to + * repopulate the per-cpu magazine caches in batches when they run low. + */ +static void * +spl_cache_obj(spl_kmem_cache_t *skc, spl_kmem_slab_t *sks) +{ + spl_kmem_obj_t *sko; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(sks->sks_magic == SKS_MAGIC); + + sko = list_entry(sks->sks_free_list.next, spl_kmem_obj_t, sko_list); + ASSERT(sko->sko_magic == SKO_MAGIC); + ASSERT(sko->sko_addr != NULL); + + /* Remove from sks_free_list */ + list_del_init(&sko->sko_list); + + sks->sks_age = jiffies; + sks->sks_ref++; + skc->skc_obj_alloc++; + + /* Track max obj usage statistics */ + if (skc->skc_obj_alloc > skc->skc_obj_max) + skc->skc_obj_max = skc->skc_obj_alloc; + + /* Track max slab usage statistics */ + if (sks->sks_ref == 1) { + skc->skc_slab_alloc++; + + if (skc->skc_slab_alloc > skc->skc_slab_max) + skc->skc_slab_max = skc->skc_slab_alloc; + } + + return (sko->sko_addr); +} + +/* + * Generic slab allocation function to run by the global work queues. + * It is responsible for allocating a new slab, linking it in to the list + * of partial slabs, and then waking any waiters. + */ +static int +__spl_cache_grow(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_slab_t *sks; + + fstrans_cookie_t cookie = spl_fstrans_mark(); + sks = spl_slab_alloc(skc, flags); + spl_fstrans_unmark(cookie); + + spin_lock(&skc->skc_lock); + if (sks) { + skc->skc_slab_total++; + skc->skc_obj_total += sks->sks_objs; + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + + smp_mb__before_atomic(); + clear_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + smp_mb__after_atomic(); + } + spin_unlock(&skc->skc_lock); + + return (sks == NULL ? -ENOMEM : 0); +} + +static void +spl_cache_grow_work(void *data) +{ + spl_kmem_alloc_t *ska = (spl_kmem_alloc_t *)data; + spl_kmem_cache_t *skc = ska->ska_cache; + + int error = __spl_cache_grow(skc, ska->ska_flags); + + atomic_dec(&skc->skc_ref); + smp_mb__before_atomic(); + clear_bit(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + if (error == 0) + wake_up_all(&skc->skc_waitq); + + kfree(ska); +} + +/* + * Returns non-zero when a new slab should be available. + */ +static int +spl_cache_grow_wait(spl_kmem_cache_t *skc) +{ + return (!test_bit(KMC_BIT_GROWING, &skc->skc_flags)); +} + +/* + * No available objects on any slabs, create a new slab. Note that this + * functionality is disabled for KMC_SLAB caches which are backed by the + * Linux slab. 
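+ *
+ * The return protocol, as consumed by spl_cache_refill() below, is
+ * roughly the following sketch:
+ *
+ *   rc = spl_cache_grow(skc, flags, &obj);
+ *   if (rc == 0 && obj != NULL)
+ *       return (obj);        - emergency object, use immediately
+ *   if (rc != 0)
+ *       abort the refill     - -EAGAIN and -ENOMEM are possible
+ *   otherwise recheck the (possibly grown) partial slab list.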
+ */ +static int +spl_cache_grow(spl_kmem_cache_t *skc, int flags, void **obj) +{ + int remaining, rc = 0; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT((skc->skc_flags & KMC_SLAB) == 0); + might_sleep(); + *obj = NULL; + + /* + * Before allocating a new slab wait for any reaping to complete and + * then return so the local magazine can be rechecked for new objects. + */ + if (test_bit(KMC_BIT_REAPING, &skc->skc_flags)) { + rc = spl_wait_on_bit(&skc->skc_flags, KMC_BIT_REAPING, + TASK_UNINTERRUPTIBLE); + return (rc ? rc : -EAGAIN); + } + + /* + * Note: It would be nice to reduce the overhead of context switch + * and improve NUMA locality, by trying to allocate a new slab in the + * current process context with KM_NOSLEEP flag. + * + * However, this can't be applied to vmem/kvmem due to a bug that + * spl_vmalloc() doesn't honor gfp flags in page table allocation. + */ + + /* + * This is handled by dispatching a work request to the global work + * queue. This allows us to asynchronously allocate a new slab while + * retaining the ability to safely fall back to a smaller synchronous + * allocations to ensure forward progress is always maintained. + */ + if (test_and_set_bit(KMC_BIT_GROWING, &skc->skc_flags) == 0) { + spl_kmem_alloc_t *ska; + + ska = kmalloc(sizeof (*ska), kmem_flags_convert(flags)); + if (ska == NULL) { + clear_bit_unlock(KMC_BIT_GROWING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_all(&skc->skc_waitq); + return (-ENOMEM); + } + + atomic_inc(&skc->skc_ref); + ska->ska_cache = skc; + ska->ska_flags = flags; + taskq_init_ent(&ska->ska_tqe); + taskq_dispatch_ent(spl_kmem_cache_taskq, + spl_cache_grow_work, ska, 0, &ska->ska_tqe); + } + + /* + * The goal here is to only detect the rare case where a virtual slab + * allocation has deadlocked. We must be careful to minimize the use + * of emergency objects which are more expensive to track. Therefore, + * we set a very long timeout for the asynchronous allocation and if + * the timeout is reached the cache is flagged as deadlocked. From + * this point only new emergency objects will be allocated until the + * asynchronous allocation completes and clears the deadlocked flag. + */ + if (test_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags)) { + rc = spl_emergency_alloc(skc, flags, obj); + } else { + remaining = wait_event_timeout(skc->skc_waitq, + spl_cache_grow_wait(skc), HZ / 10); + + if (!remaining) { + spin_lock(&skc->skc_lock); + if (test_bit(KMC_BIT_GROWING, &skc->skc_flags)) { + set_bit(KMC_BIT_DEADLOCKED, &skc->skc_flags); + skc->skc_obj_deadlock++; + } + spin_unlock(&skc->skc_lock); + } + + rc = -ENOMEM; + } + + return (rc); +} + +/* + * Refill a per-cpu magazine with objects from the slabs for this cache. + * Ideally the magazine can be repopulated using existing objects which have + * been released, however if we are unable to locate enough free objects new + * slabs of objects will be created. On success NULL is returned, otherwise + * the address of a single emergency object is returned for use by the caller. 
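+ *
+ * E.g. with skm_size == 128 and skm_refill == 64, an empty magazine
+ * is topped up with at most 64 objects per refill, taken from the
+ * head of skc_partial_list where the fullest slabs are kept.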
+ */ +static void * +spl_cache_refill(spl_kmem_cache_t *skc, spl_kmem_magazine_t *skm, int flags) +{ + spl_kmem_slab_t *sks; + int count = 0, rc, refill; + void *obj = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(skm->skm_magic == SKM_MAGIC); + + refill = MIN(skm->skm_refill, skm->skm_size - skm->skm_avail); + spin_lock(&skc->skc_lock); + + while (refill > 0) { + /* No slabs available we may need to grow the cache */ + if (list_empty(&skc->skc_partial_list)) { + spin_unlock(&skc->skc_lock); + + local_irq_enable(); + rc = spl_cache_grow(skc, flags, &obj); + local_irq_disable(); + + /* Emergency object for immediate use by caller */ + if (rc == 0 && obj != NULL) + return (obj); + + if (rc) + goto out; + + /* Rescheduled to different CPU skm is not local */ + if (skm != skc->skc_mag[smp_processor_id()]) + goto out; + + /* + * Potentially rescheduled to the same CPU but + * allocations may have occurred from this CPU while + * we were sleeping so recalculate max refill. + */ + refill = MIN(refill, skm->skm_size - skm->skm_avail); + + spin_lock(&skc->skc_lock); + continue; + } + + /* Grab the next available slab */ + sks = list_entry((&skc->skc_partial_list)->next, + spl_kmem_slab_t, sks_list); + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_ref < sks->sks_objs); + ASSERT(!list_empty(&sks->sks_free_list)); + + /* + * Consume as many objects as needed to refill the requested + * cache. We must also be careful not to overfill it. + */ + while (sks->sks_ref < sks->sks_objs && refill-- > 0 && + ++count) { + ASSERT(skm->skm_avail < skm->skm_size); + ASSERT(count < skm->skm_size); + skm->skm_objs[skm->skm_avail++] = + spl_cache_obj(skc, sks); + } + + /* Move slab to skc_complete_list when full */ + if (sks->sks_ref == sks->sks_objs) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_complete_list); + } + } + + spin_unlock(&skc->skc_lock); +out: + return (NULL); +} + +/* + * Release an object back to the slab from which it came. + */ +static void +spl_cache_shrink(spl_kmem_cache_t *skc, void *obj) +{ + spl_kmem_slab_t *sks = NULL; + spl_kmem_obj_t *sko = NULL; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + sko = spl_sko_from_obj(skc, obj); + ASSERT(sko->sko_magic == SKO_MAGIC); + sks = sko->sko_slab; + ASSERT(sks->sks_magic == SKS_MAGIC); + ASSERT(sks->sks_cache == skc); + list_add(&sko->sko_list, &sks->sks_free_list); + + sks->sks_age = jiffies; + sks->sks_ref--; + skc->skc_obj_alloc--; + + /* + * Move slab to skc_partial_list when no longer full. Slabs + * are added to the head to keep the partial list is quasi-full + * sorted order. Fuller at the head, emptier at the tail. + */ + if (sks->sks_ref == (sks->sks_objs - 1)) { + list_del(&sks->sks_list); + list_add(&sks->sks_list, &skc->skc_partial_list); + } + + /* + * Move empty slabs to the end of the partial list so + * they can be easily found and freed during reclamation. + */ + if (sks->sks_ref == 0) { + list_del(&sks->sks_list); + list_add_tail(&sks->sks_list, &skc->skc_partial_list); + skc->skc_slab_alloc--; + } +} + +/* + * Allocate an object from the per-cpu magazine, or if the magazine + * is empty directly allocate from a slab and repopulate the magazine. + */ +void * +spl_kmem_cache_alloc(spl_kmem_cache_t *skc, int flags) +{ + spl_kmem_magazine_t *skm; + void *obj = NULL; + + ASSERT0(flags & ~KM_PUBLIC_MASK); + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + /* + * Allocate directly from a Linux slab. 
All optimizations are left
+ * to the underlying cache; we only need to guarantee that KM_SLEEP
+ * callers will never fail.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ struct kmem_cache *slc = skc->skc_linux_cache;
+ do {
+ obj = kmem_cache_alloc(slc, kmem_flags_convert(flags));
+ } while ((obj == NULL) && !(flags & KM_NOSLEEP));
+
+ if (obj != NULL) {
+ /*
+ * Even though we leave everything up to the
+ * underlying cache we still keep track of
+ * how many objects we've allocated in it for
+ * better debuggability.
+ */
+ percpu_counter_inc(&skc->skc_linux_alloc);
+ }
+ goto ret;
+ }
+
+ local_irq_disable();
+
+restart:
+ /*
+ * Safe to update per-cpu structure without lock, but
+ * in the restart case we must be careful to reacquire
+ * the local magazine since this may have changed
+ * when we need to grow the cache.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ if (likely(skm->skm_avail)) {
+ /* Object available in CPU cache, use it */
+ obj = skm->skm_objs[--skm->skm_avail];
+ } else {
+ obj = spl_cache_refill(skc, skm, flags);
+ if ((obj == NULL) && !(flags & KM_NOSLEEP))
+ goto restart;
+
+ local_irq_enable();
+ goto ret;
+ }
+
+ local_irq_enable();
+ ASSERT(obj);
+ ASSERT(IS_P2ALIGNED(obj, skc->skc_obj_align));
+
+ret:
+ /* Pre-emptively migrate object to CPU L1 cache */
+ if (obj) {
+ if (skc->skc_ctor)
+ skc->skc_ctor(obj, skc->skc_private, flags);
+ else
+ prefetchw(obj);
+ }
+
+ return (obj);
+}
+EXPORT_SYMBOL(spl_kmem_cache_alloc);
+
+/*
+ * Free an object back to the local per-cpu magazine; there is no
+ * guarantee that this is the same magazine the object was originally
+ * allocated from. We may need to flush entries from the magazine
+ * back to the slabs to make space.
+ */
+void
+spl_kmem_cache_free(spl_kmem_cache_t *skc, void *obj)
+{
+ spl_kmem_magazine_t *skm;
+ unsigned long flags;
+ int do_reclaim = 0;
+ int do_emergency = 0;
+
+ ASSERT(skc->skc_magic == SKC_MAGIC);
+ ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags));
+
+ /*
+ * Run the destructor
+ */
+ if (skc->skc_dtor)
+ skc->skc_dtor(obj, skc->skc_private);
+
+ /*
+ * Free the object back to the underlying Linux slab.
+ */
+ if (skc->skc_flags & KMC_SLAB) {
+ kmem_cache_free(skc->skc_linux_cache, obj);
+ percpu_counter_dec(&skc->skc_linux_alloc);
+ return;
+ }
+
+ /*
+ * While a cache has outstanding emergency objects all freed objects
+ * must be checked. However, since emergency objects will never use
+ * a virtual address these objects can be safely excluded as an
+ * optimization.
+ */
+ if (!is_vmalloc_addr(obj)) {
+ spin_lock(&skc->skc_lock);
+ do_emergency = (skc->skc_obj_emergency > 0);
+ spin_unlock(&skc->skc_lock);
+
+ if (do_emergency && (spl_emergency_free(skc, obj) == 0))
+ return;
+ }
+
+ local_irq_save(flags);
+
+ /*
+ * Safe to update the per-cpu structure without a lock, but since
+ * no remote memory allocation tracking is performed it is entirely
+ * possible to allocate an object from one CPU cache and return
+ * it to another.
+ */
+ skm = skc->skc_mag[smp_processor_id()];
+ ASSERT(skm->skm_magic == SKM_MAGIC);
+
+ /*
+ * Per-CPU cache full, flush it to make space for this object;
+ * this may result in an empty slab which can be reclaimed once
+ * interrupts are re-enabled.
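+ *
+ * E.g. a full 128-entry magazine (skm_refill == 64) has its 64
+ * oldest entries pushed back to their slabs before the freed
+ * object is cached in the space just vacated.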
+ */ + if (unlikely(skm->skm_avail >= skm->skm_size)) { + spl_cache_flush(skc, skm, skm->skm_refill); + do_reclaim = 1; + } + + /* Available space in cache, use it */ + skm->skm_objs[skm->skm_avail++] = obj; + + local_irq_restore(flags); + + if (do_reclaim) + spl_slab_reclaim(skc); +} +EXPORT_SYMBOL(spl_kmem_cache_free); + +/* + * Depending on how many and which objects are released it may simply + * repopulate the local magazine which will then need to age-out. Objects + * which cannot fit in the magazine will be released back to their slabs + * which will also need to age out before being released. This is all just + * best effort and we do not want to thrash creating and destroying slabs. + */ +void +spl_kmem_cache_reap_now(spl_kmem_cache_t *skc) +{ + ASSERT(skc->skc_magic == SKC_MAGIC); + ASSERT(!test_bit(KMC_BIT_DESTROY, &skc->skc_flags)); + + if (skc->skc_flags & KMC_SLAB) + return; + + atomic_inc(&skc->skc_ref); + + /* + * Prevent concurrent cache reaping when contended. + */ + if (test_and_set_bit(KMC_BIT_REAPING, &skc->skc_flags)) + goto out; + + /* Reclaim from the magazine and free all now empty slabs. */ + unsigned long irq_flags; + local_irq_save(irq_flags); + spl_kmem_magazine_t *skm = skc->skc_mag[smp_processor_id()]; + spl_cache_flush(skc, skm, skm->skm_avail); + local_irq_restore(irq_flags); + + spl_slab_reclaim(skc); + clear_bit_unlock(KMC_BIT_REAPING, &skc->skc_flags); + smp_mb__after_atomic(); + wake_up_bit(&skc->skc_flags, KMC_BIT_REAPING); +out: + atomic_dec(&skc->skc_ref); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_now); + +/* + * This is stubbed out for code consistency with other platforms. There + * is existing logic to prevent concurrent reaping so while this is ugly + * it should do no harm. + */ +int +spl_kmem_cache_reap_active() +{ + return (0); +} +EXPORT_SYMBOL(spl_kmem_cache_reap_active); + +/* + * Reap all free slabs from all registered caches. + */ +void +spl_kmem_reap(void) +{ + spl_kmem_cache_t *skc = NULL; + + down_read(&spl_kmem_cache_sem); + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + spl_kmem_cache_reap_now(skc); + } + up_read(&spl_kmem_cache_sem); +} +EXPORT_SYMBOL(spl_kmem_reap); + +int +spl_kmem_cache_init(void) +{ + init_rwsem(&spl_kmem_cache_sem); + INIT_LIST_HEAD(&spl_kmem_cache_list); + spl_kmem_cache_taskq = taskq_create("spl_kmem_cache", + spl_kmem_cache_kmem_threads, maxclsyspri, + spl_kmem_cache_kmem_threads * 8, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + + return (0); +} + +void +spl_kmem_cache_fini(void) +{ + taskq_destroy(spl_kmem_cache_taskq); +} diff --git a/module/os/linux/spl/spl-kmem.c b/module/os/linux/spl/spl-kmem.c new file mode 100644 index 000000000000..f19421cfcc03 --- /dev/null +++ b/module/os/linux/spl/spl-kmem.c @@ -0,0 +1,618 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#include +#include +#include +#include + +/* + * As a general rule kmem_alloc() allocations should be small, preferably + * just a few pages since they must by physically contiguous. Therefore, a + * rate limited warning will be printed to the console for any kmem_alloc() + * which exceeds a reasonable threshold. + * + * The default warning threshold is set to sixteen pages but capped at 64K to + * accommodate systems using large pages. This value was selected to be small + * enough to ensure the largest allocations are quickly noticed and fixed. + * But large enough to avoid logging any warnings when a allocation size is + * larger than optimal but not a serious concern. Since this value is tunable, + * developers are encouraged to set it lower when testing so any new largish + * allocations are quickly caught. These warnings may be disabled by setting + * the threshold to zero. + */ +/* BEGIN CSTYLED */ +unsigned int spl_kmem_alloc_warn = MIN(16 * PAGE_SIZE, 64 * 1024); +module_param(spl_kmem_alloc_warn, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_warn, + "Warning threshold in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_warn); + +/* + * Large kmem_alloc() allocations will fail if they exceed KMALLOC_MAX_SIZE. + * Allocations which are marginally smaller than this limit may succeed but + * should still be avoided due to the expense of locating a contiguous range + * of free pages. Therefore, a maximum kmem size with reasonable safely + * margin of 4x is set. Kmem_alloc() allocations larger than this maximum + * will quickly fail. Vmem_alloc() allocations less than or equal to this + * value will use kmalloc(), but shift to vmalloc() when exceeding this value. + */ +unsigned int spl_kmem_alloc_max = (KMALLOC_MAX_SIZE >> 2); +module_param(spl_kmem_alloc_max, uint, 0644); +MODULE_PARM_DESC(spl_kmem_alloc_max, + "Maximum size in bytes for a kmem_alloc()"); +EXPORT_SYMBOL(spl_kmem_alloc_max); +/* END CSTYLED */ + +int +kmem_debugging(void) +{ + return (0); +} +EXPORT_SYMBOL(kmem_debugging); + +char * +kmem_vasprintf(const char *fmt, va_list ap) +{ + va_list aq; + char *ptr; + + do { + va_copy(aq, ap); + ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, aq); + va_end(aq); + } while (ptr == NULL); + + return (ptr); +} +EXPORT_SYMBOL(kmem_vasprintf); + +char * +kmem_asprintf(const char *fmt, ...) 
+{
+ va_list ap;
+ char *ptr;
+
+ do {
+ va_start(ap, fmt);
+ ptr = kvasprintf(kmem_flags_convert(KM_SLEEP), fmt, ap);
+ va_end(ap);
+ } while (ptr == NULL);
+
+ return (ptr);
+}
+EXPORT_SYMBOL(kmem_asprintf);
+
+static char *
+__strdup(const char *str, int flags)
+{
+ char *ptr;
+ int n;
+
+ n = strlen(str);
+ ptr = kmalloc(n + 1, kmem_flags_convert(flags));
+ if (ptr)
+ memcpy(ptr, str, n + 1);
+
+ return (ptr);
+}
+
+char *
+kmem_strdup(const char *str)
+{
+ return (__strdup(str, KM_SLEEP));
+}
+EXPORT_SYMBOL(kmem_strdup);
+
+void
+kmem_strfree(char *str)
+{
+ kfree(str);
+}
+EXPORT_SYMBOL(kmem_strfree);
+
+void *
+spl_kvmalloc(size_t size, gfp_t lflags)
+{
+#ifdef HAVE_KVMALLOC
+ /*
+ * GFP_KERNEL allocations can safely use kvmalloc which may
+ * improve performance by avoiding a) high latency caused by
+ * vmalloc's on-access allocation, b) performance loss due to
+ * MMU memory address mapping and c) vmalloc locking overhead.
+ * This has the side-effect that the slab statistics will
+ * incorrectly report this as a vmem allocation, but that is
+ * purely cosmetic.
+ */
+ if ((lflags & GFP_KERNEL) == GFP_KERNEL)
+ return (kvmalloc(size, lflags));
+#endif
+
+ gfp_t kmalloc_lflags = lflags;
+
+ if (size > PAGE_SIZE) {
+ /*
+ * We need to set __GFP_NOWARN here since spl_kvmalloc is not
+ * only called by spl_kmem_alloc_impl but can be called
+ * directly with custom lflags, too. In that case
+ * kmem_flags_convert does not get called, which would
+ * implicitly set __GFP_NOWARN.
+ */
+ kmalloc_lflags |= __GFP_NOWARN;
+
+ /*
+ * N.B. __GFP_RETRY_MAYFAIL is supported only for large
+ * (>32kB) allocations.
+ *
+ * We have to override __GFP_RETRY_MAYFAIL with __GFP_NORETRY
+ * for !costly requests because there is no other way to tell
+ * the allocator that we want to fail rather than retry
+ * endlessly.
+ */
+ if (!(kmalloc_lflags & __GFP_RETRY_MAYFAIL) ||
+ (size <= PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER)) {
+ kmalloc_lflags |= __GFP_NORETRY;
+ }
+ }
+
+ /*
+ * We first try kmalloc - even for big sizes - and fall back to
+ * spl_vmalloc if that fails.
+ *
+ * For non-__GFP_RECLAIM allocations we always stick to
+ * kmalloc_node, and fail when kmalloc is not successful (returns
+ * NULL).
+ * We cannot fall back to spl_vmalloc in this case because spl_vmalloc
+ * internally uses GFP_KERNEL allocations.
+ */
+ void *ptr = kmalloc_node(size, kmalloc_lflags, NUMA_NO_NODE);
+ if (ptr || size <= PAGE_SIZE ||
+ (lflags & __GFP_RECLAIM) != __GFP_RECLAIM) {
+ return (ptr);
+ }
+
+ return (spl_vmalloc(size, lflags | __GFP_HIGHMEM));
+}
+
+/*
+ * General purpose unified implementation of kmem_alloc(). It is an
+ * amalgamation of Linux and Illumos allocator design. It should never be
+ * exported to ensure that code using kmem_alloc()/kmem_zalloc() remains
+ * relatively portable. Consumers may only access this function through
+ * wrappers that enforce the common flags to ensure portability.
+ */
+inline void *
+spl_kmem_alloc_impl(size_t size, int flags, int node)
+{
+ gfp_t lflags = kmem_flags_convert(flags);
+ void *ptr;
+
+ /*
+ * Log abnormally large allocations and rate limit the console output.
+ * Allocations larger than spl_kmem_alloc_warn should be performed
+ * through the vmem_alloc()/vmem_zalloc() interfaces.
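+ *
+ * The threshold is a module parameter and may be tuned at runtime,
+ * for example (illustrative):
+ *
+ *   echo 0 > /sys/module/spl/parameters/spl_kmem_alloc_warn
+ *
+ * which disables these warnings entirely, as noted above.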
+ */ + if ((spl_kmem_alloc_warn > 0) && (size > spl_kmem_alloc_warn) && + !(flags & KM_VMEM)) { + printk(KERN_WARNING + "Large kmem_alloc(%lu, 0x%x), please file an issue at:\n" + "https://github.com/zfsonlinux/zfs/issues/new\n", + (unsigned long)size, flags); + dump_stack(); + } + + /* + * Use a loop because kmalloc_node() can fail when GFP_KERNEL is used + * unlike kmem_alloc() with KM_SLEEP on Illumos. + */ + do { + /* + * Calling kmalloc_node() when the size >= spl_kmem_alloc_max + * is unsafe. This must fail for all for kmem_alloc() and + * kmem_zalloc() callers. + * + * For vmem_alloc() and vmem_zalloc() callers it is permissible + * to use spl_vmalloc(). However, in general use of + * spl_vmalloc() is strongly discouraged because a global lock + * must be acquired. Contention on this lock can significantly + * impact performance so frequently manipulating the virtual + * address space is strongly discouraged. + */ + if (size > spl_kmem_alloc_max) { + if (flags & KM_VMEM) { + ptr = spl_vmalloc(size, lflags | __GFP_HIGHMEM); + } else { + return (NULL); + } + } else { + if (flags & KM_VMEM) { + ptr = spl_kvmalloc(size, lflags); + } else { + ptr = kmalloc_node(size, lflags, node); + } + } + + if (likely(ptr) || (flags & KM_NOSLEEP)) + return (ptr); + + /* + * Try hard to satisfy the allocation. However, when progress + * cannot be made, the allocation is allowed to fail. + */ + if ((lflags & GFP_KERNEL) == GFP_KERNEL) + lflags |= __GFP_RETRY_MAYFAIL; + + /* + * Use cond_resched() instead of congestion_wait() to avoid + * deadlocking systems where there are no block devices. + */ + cond_resched(); + } while (1); + + return (NULL); +} + +inline void +spl_kmem_free_impl(const void *buf, size_t size) +{ + if (is_vmalloc_addr(buf)) + vfree(buf); + else + kfree(buf); +} + +/* + * Memory allocation and accounting for kmem_* * style allocations. When + * DEBUG_KMEM is enabled the total memory allocated will be tracked and + * any memory leaked will be reported during module unload. + * + * ./configure --enable-debug-kmem + */ +#ifdef DEBUG_KMEM + +/* Shim layer memory accounting */ +#ifdef HAVE_ATOMIC64_T +atomic64_t kmem_alloc_used = ATOMIC64_INIT(0); +unsigned long long kmem_alloc_max = 0; +#else /* HAVE_ATOMIC64_T */ +atomic_t kmem_alloc_used = ATOMIC_INIT(0); +unsigned long long kmem_alloc_max = 0; +#endif /* HAVE_ATOMIC64_T */ + +EXPORT_SYMBOL(kmem_alloc_used); +EXPORT_SYMBOL(kmem_alloc_max); + +inline void * +spl_kmem_alloc_debug(size_t size, int flags, int node) +{ + void *ptr; + + ptr = spl_kmem_alloc_impl(size, flags, node); + if (ptr) { + kmem_alloc_used_add(size); + if (unlikely(kmem_alloc_used_read() > kmem_alloc_max)) + kmem_alloc_max = kmem_alloc_used_read(); + } + + return (ptr); +} + +inline void +spl_kmem_free_debug(const void *ptr, size_t size) +{ + kmem_alloc_used_sub(size); + spl_kmem_free_impl(ptr, size); +} + +/* + * When DEBUG_KMEM_TRACKING is enabled not only will total bytes be tracked + * but also the location of every alloc and free. When the SPL module is + * unloaded a list of all leaked addresses and where they were allocated + * will be dumped to the console. Enabling this feature has a significant + * impact on performance but it makes finding memory leaks straight forward. + * + * Not surprisingly with debugging enabled the xmem_locks are very highly + * contended particularly on xfree(). 
If we want to run with this detailed + * debugging enabled for anything other than debugging we need to minimize + * the contention by moving to a lock per xmem_table entry model. + * + * ./configure --enable-debug-kmem-tracking + */ +#ifdef DEBUG_KMEM_TRACKING + +#include +#include + +#define KMEM_HASH_BITS 10 +#define KMEM_TABLE_SIZE (1 << KMEM_HASH_BITS) + +typedef struct kmem_debug { + struct hlist_node kd_hlist; /* Hash node linkage */ + struct list_head kd_list; /* List of all allocations */ + void *kd_addr; /* Allocation pointer */ + size_t kd_size; /* Allocation size */ + const char *kd_func; /* Allocation function */ + int kd_line; /* Allocation line */ +} kmem_debug_t; + +static spinlock_t kmem_lock; +static struct hlist_head kmem_table[KMEM_TABLE_SIZE]; +static struct list_head kmem_list; + +static kmem_debug_t * +kmem_del_init(spinlock_t *lock, struct hlist_head *table, + int bits, const void *addr) +{ + struct hlist_head *head; + struct hlist_node *node = NULL; + struct kmem_debug *p; + unsigned long flags; + + spin_lock_irqsave(lock, flags); + + head = &table[hash_ptr((void *)addr, bits)]; + hlist_for_each(node, head) { + p = list_entry(node, struct kmem_debug, kd_hlist); + if (p->kd_addr == addr) { + hlist_del_init(&p->kd_hlist); + list_del_init(&p->kd_list); + spin_unlock_irqrestore(lock, flags); + return (p); + } + } + + spin_unlock_irqrestore(lock, flags); + + return (NULL); +} + +inline void * +spl_kmem_alloc_track(size_t size, int flags, + const char *func, int line, int node) +{ + void *ptr = NULL; + kmem_debug_t *dptr; + unsigned long irq_flags; + + dptr = kmalloc(sizeof (kmem_debug_t), kmem_flags_convert(flags)); + if (dptr == NULL) + return (NULL); + + dptr->kd_func = __strdup(func, flags); + if (dptr->kd_func == NULL) { + kfree(dptr); + return (NULL); + } + + ptr = spl_kmem_alloc_debug(size, flags, node); + if (ptr == NULL) { + kfree(dptr->kd_func); + kfree(dptr); + return (NULL); + } + + INIT_HLIST_NODE(&dptr->kd_hlist); + INIT_LIST_HEAD(&dptr->kd_list); + + dptr->kd_addr = ptr; + dptr->kd_size = size; + dptr->kd_line = line; + + spin_lock_irqsave(&kmem_lock, irq_flags); + hlist_add_head(&dptr->kd_hlist, + &kmem_table[hash_ptr(ptr, KMEM_HASH_BITS)]); + list_add_tail(&dptr->kd_list, &kmem_list); + spin_unlock_irqrestore(&kmem_lock, irq_flags); + + return (ptr); +} + +inline void +spl_kmem_free_track(const void *ptr, size_t size) +{ + kmem_debug_t *dptr; + + /* Ignore NULL pointer since we haven't tracked it at all */ + if (ptr == NULL) + return; + + /* Must exist in hash due to kmem_alloc() */ + dptr = kmem_del_init(&kmem_lock, kmem_table, KMEM_HASH_BITS, ptr); + ASSERT3P(dptr, !=, NULL); + ASSERT3S(dptr->kd_size, ==, size); + + kfree(dptr->kd_func); + kfree(dptr); + + spl_kmem_free_debug(ptr, size); +} +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + +/* + * Public kmem_alloc(), kmem_zalloc() and kmem_free() interfaces. 
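+ *
+ * Typical usage, shown for illustration only:
+ *
+ *   void *buf = kmem_alloc(len, KM_SLEEP);
+ *   ...
+ *   kmem_free(buf, len);
+ *
+ * Unlike kfree(), kmem_free() requires the original allocation size
+ * because the allocation is not self-describing.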
+ */ +void * +spl_kmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_alloc); + +void * +spl_kmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_ZERO; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_kmem_zalloc); + +void +spl_kmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_kmem_free); + +#if defined(DEBUG_KMEM) && defined(DEBUG_KMEM_TRACKING) +static char * +spl_sprintf_addr(kmem_debug_t *kd, char *str, int len, int min) +{ + int size = ((len - 1) < kd->kd_size) ? (len - 1) : kd->kd_size; + int i, flag = 1; + + ASSERT(str != NULL && len >= 17); + memset(str, 0, len); + + /* + * Check for a fully printable string, and while we are at + * it place the printable characters in the passed buffer. + */ + for (i = 0; i < size; i++) { + str[i] = ((char *)(kd->kd_addr))[i]; + if (isprint(str[i])) { + continue; + } else { + /* + * Minimum number of printable characters found + * to make it worthwhile to print this as ascii. + */ + if (i > min) + break; + + flag = 0; + break; + } + } + + if (!flag) { + sprintf(str, "%02x%02x%02x%02x%02x%02x%02x%02x", + *((uint8_t *)kd->kd_addr), + *((uint8_t *)kd->kd_addr + 2), + *((uint8_t *)kd->kd_addr + 4), + *((uint8_t *)kd->kd_addr + 6), + *((uint8_t *)kd->kd_addr + 8), + *((uint8_t *)kd->kd_addr + 10), + *((uint8_t *)kd->kd_addr + 12), + *((uint8_t *)kd->kd_addr + 14)); + } + + return (str); +} + +static int +spl_kmem_init_tracking(struct list_head *list, spinlock_t *lock, int size) +{ + int i; + + spin_lock_init(lock); + INIT_LIST_HEAD(list); + + for (i = 0; i < size; i++) + INIT_HLIST_HEAD(&kmem_table[i]); + + return (0); +} + +static void +spl_kmem_fini_tracking(struct list_head *list, spinlock_t *lock) +{ + unsigned long flags; + kmem_debug_t *kd = NULL; + char str[17]; + + spin_lock_irqsave(lock, flags); + if (!list_empty(list)) + printk(KERN_WARNING "%-16s %-5s %-16s %s:%s\n", "address", + "size", "data", "func", "line"); + + list_for_each_entry(kd, list, kd_list) { + printk(KERN_WARNING "%p %-5d %-16s %s:%d\n", kd->kd_addr, + (int)kd->kd_size, spl_sprintf_addr(kd, str, 17, 8), + kd->kd_func, kd->kd_line); + } + + spin_unlock_irqrestore(lock, flags); +} +#endif /* DEBUG_KMEM && DEBUG_KMEM_TRACKING */ + +int +spl_kmem_init(void) +{ + +#ifdef DEBUG_KMEM + kmem_alloc_used_set(0); + + + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_init_tracking(&kmem_list, &kmem_lock, KMEM_TABLE_SIZE); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ + + return (0); +} + +void +spl_kmem_fini(void) +{ +#ifdef DEBUG_KMEM + /* + * Display all unreclaimed memory addresses, including the + * allocation size and the first few bytes of what's located + * at that address to aid in debugging. 
Performance is not + * a serious concern here since it is module unload time. + */ + if (kmem_alloc_used_read() != 0) + printk(KERN_WARNING "kmem leaked %ld/%llu bytes\n", + (unsigned long)kmem_alloc_used_read(), kmem_alloc_max); + +#ifdef DEBUG_KMEM_TRACKING + spl_kmem_fini_tracking(&kmem_list, &kmem_lock); +#endif /* DEBUG_KMEM_TRACKING */ +#endif /* DEBUG_KMEM */ +} diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c new file mode 100644 index 000000000000..b971b4498cc6 --- /dev/null +++ b/module/os/linux/spl/spl-kstat.c @@ -0,0 +1,778 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Kstat Implementation. + */ + +#include +#include +#include +#include +#include + +static kmutex_t kstat_module_lock; +static struct list_head kstat_module_list; +static kid_t kstat_id; + +static int +kstat_resize_raw(kstat_t *ksp) +{ + if (ksp->ks_raw_bufsize == KSTAT_RAW_MAX) + return (ENOMEM); + + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + ksp->ks_raw_bufsize = MIN(ksp->ks_raw_bufsize * 2, KSTAT_RAW_MAX); + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + + return (0); +} + +void +kstat_waitq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt++; + if (wcnt != 0) { + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; + } +} +EXPORT_SYMBOL(kstat_waitq_enter); + +void +kstat_waitq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t wcnt; + + new = gethrtime(); + delta = new - kiop->wlastupdate; + kiop->wlastupdate = new; + wcnt = kiop->wcnt--; + ASSERT((int)wcnt > 0); + kiop->wlentime += delta * wcnt; + kiop->wtime += delta; +} +EXPORT_SYMBOL(kstat_waitq_exit); + +void +kstat_runq_enter(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt++; + if (rcnt != 0) { + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; + } +} +EXPORT_SYMBOL(kstat_runq_enter); + +void +kstat_runq_exit(kstat_io_t *kiop) +{ + hrtime_t new, delta; + ulong_t rcnt; + + new = gethrtime(); + delta = new - kiop->rlastupdate; + kiop->rlastupdate = new; + rcnt = kiop->rcnt--; + ASSERT((int)rcnt > 0); + kiop->rlentime += delta * rcnt; + kiop->rtime += delta; +} +EXPORT_SYMBOL(kstat_runq_exit); + +static int +kstat_seq_show_headers(struct seq_file *f) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + seq_printf(f, "%d %d 0x%02x %d %d %lld %lld\n", + ksp->ks_kid, 
ksp->ks_type, ksp->ks_flags,
+ ksp->ks_ndata, (int)ksp->ks_data_size,
+ ksp->ks_crtime, ksp->ks_snaptime);
+
+ switch (ksp->ks_type) {
+ case KSTAT_TYPE_RAW:
+restart:
+ if (ksp->ks_raw_ops.headers) {
+ rc = ksp->ks_raw_ops.headers(
+ ksp->ks_raw_buf, ksp->ks_raw_bufsize);
+ if (rc == ENOMEM && !kstat_resize_raw(ksp))
+ goto restart;
+ if (!rc)
+ seq_puts(f, ksp->ks_raw_buf);
+ } else {
+ seq_printf(f, "raw data\n");
+ }
+ break;
+ case KSTAT_TYPE_NAMED:
+ seq_printf(f, "%-31s %-4s %s\n",
+ "name", "type", "data");
+ break;
+ case KSTAT_TYPE_INTR:
+ seq_printf(f, "%-8s %-8s %-8s %-8s %-8s\n",
+ "hard", "soft", "watchdog",
+ "spurious", "multsvc");
+ break;
+ case KSTAT_TYPE_IO:
+ seq_printf(f,
+ "%-8s %-8s %-8s %-8s %-8s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s %-8s\n",
+ "nread", "nwritten", "reads", "writes",
+ "wtime", "wlentime", "wupdate",
+ "rtime", "rlentime", "rupdate",
+ "wcnt", "rcnt");
+ break;
+ case KSTAT_TYPE_TIMER:
+ seq_printf(f,
+ "%-31s %-8s "
+ "%-8s %-8s %-8s %-8s %-8s\n",
+ "name", "events", "elapsed",
+ "min", "max", "start", "stop");
+ break;
+ default:
+ PANIC("Undefined kstat type %d\n", ksp->ks_type);
+ }
+
+ return (-rc);
+}
+
+static int
+kstat_seq_show_raw(struct seq_file *f, unsigned char *p, int l)
+{
+ int i, j;
+
+ for (i = 0; ; i++) {
+ seq_printf(f, "%03x:", i);
+
+ for (j = 0; j < 16; j++) {
+ if (i * 16 + j >= l) {
+ seq_printf(f, "\n");
+ goto out;
+ }
+
+ seq_printf(f, " %02x", (unsigned char)p[i * 16 + j]);
+ }
+ seq_printf(f, "\n");
+ }
+out:
+ return (0);
+}
+
+static int
+kstat_seq_show_named(struct seq_file *f, kstat_named_t *knp)
+{
+ seq_printf(f, "%-31s %-4d ", knp->name, knp->data_type);
+
+ switch (knp->data_type) {
+ case KSTAT_DATA_CHAR:
+ knp->value.c[15] = '\0'; /* NUL terminate */
+ seq_printf(f, "%-16s", knp->value.c);
+ break;
+ /*
+ * NOTE - We need to be more careful about what tokens are
+ * used for each arch; for now this is correct for x86_64.
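+ *
+ * E.g. a KSTAT_DATA_UINT64 entry named "hits" with value 1234
+ * renders as (4 being the numeric value of KSTAT_DATA_UINT64):
+ *
+ *   hits                            4    1234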
+ */ + case KSTAT_DATA_INT32: + seq_printf(f, "%d", knp->value.i32); + break; + case KSTAT_DATA_UINT32: + seq_printf(f, "%u", knp->value.ui32); + break; + case KSTAT_DATA_INT64: + seq_printf(f, "%lld", (signed long long)knp->value.i64); + break; + case KSTAT_DATA_UINT64: + seq_printf(f, "%llu", + (unsigned long long)knp->value.ui64); + break; + case KSTAT_DATA_LONG: + seq_printf(f, "%ld", knp->value.l); + break; + case KSTAT_DATA_ULONG: + seq_printf(f, "%lu", knp->value.ul); + break; + case KSTAT_DATA_STRING: + KSTAT_NAMED_STR_PTR(knp) + [KSTAT_NAMED_STR_BUFLEN(knp)-1] = '\0'; + seq_printf(f, "%s", KSTAT_NAMED_STR_PTR(knp)); + break; + default: + PANIC("Undefined kstat data type %d\n", knp->data_type); + } + + seq_printf(f, "\n"); + + return (0); +} + +static int +kstat_seq_show_intr(struct seq_file *f, kstat_intr_t *kip) +{ + seq_printf(f, "%-8u %-8u %-8u %-8u %-8u\n", + kip->intrs[KSTAT_INTR_HARD], + kip->intrs[KSTAT_INTR_SOFT], + kip->intrs[KSTAT_INTR_WATCHDOG], + kip->intrs[KSTAT_INTR_SPURIOUS], + kip->intrs[KSTAT_INTR_MULTSVC]); + + return (0); +} + +static int +kstat_seq_show_io(struct seq_file *f, kstat_io_t *kip) +{ + /* though wlentime & friends are signed, they will never be negative */ + seq_printf(f, + "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " + "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", + kip->nread, kip->nwritten, + kip->reads, kip->writes, + kip->wtime, kip->wlentime, kip->wlastupdate, + kip->rtime, kip->rlentime, kip->rlastupdate, + kip->wcnt, kip->rcnt); + + return (0); +} + +static int +kstat_seq_show_timer(struct seq_file *f, kstat_timer_t *ktp) +{ + seq_printf(f, + "%-31s %-8llu %-8llu %-8llu %-8llu %-8llu %-8llu\n", + ktp->name, ktp->num_events, ktp->elapsed_time, + ktp->min_time, ktp->max_time, + ktp->start_time, ktp->stop_time); + + return (0); +} + +static int +kstat_seq_show(struct seq_file *f, void *p) +{ + kstat_t *ksp = (kstat_t *)f->private; + int rc = 0; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: +restart: + if (ksp->ks_raw_ops.data) { + rc = ksp->ks_raw_ops.data( + ksp->ks_raw_buf, ksp->ks_raw_bufsize, p); + if (rc == ENOMEM && !kstat_resize_raw(ksp)) + goto restart; + if (!rc) + seq_puts(f, ksp->ks_raw_buf); + } else { + ASSERT(ksp->ks_ndata == 1); + rc = kstat_seq_show_raw(f, ksp->ks_data, + ksp->ks_data_size); + } + break; + case KSTAT_TYPE_NAMED: + rc = kstat_seq_show_named(f, (kstat_named_t *)p); + break; + case KSTAT_TYPE_INTR: + rc = kstat_seq_show_intr(f, (kstat_intr_t *)p); + break; + case KSTAT_TYPE_IO: + rc = kstat_seq_show_io(f, (kstat_io_t *)p); + break; + case KSTAT_TYPE_TIMER: + rc = kstat_seq_show_timer(f, (kstat_timer_t *)p); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return (-rc); +} + +static int +kstat_default_update(kstat_t *ksp, int rw) +{ + ASSERT(ksp != NULL); + + if (rw == KSTAT_WRITE) + return (EACCES); + + return (0); +} + +static void * +kstat_seq_data_addr(kstat_t *ksp, loff_t n) +{ + void *rc = NULL; + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + if (ksp->ks_raw_ops.addr) + rc = ksp->ks_raw_ops.addr(ksp, n); + else + rc = ksp->ks_data; + break; + case KSTAT_TYPE_NAMED: + rc = ksp->ks_data + n * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + rc = ksp->ks_data + n * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + rc = ksp->ks_data + n * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + rc = ksp->ks_data + n * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + return 
(rc); +} + +static void * +kstat_seq_start(struct seq_file *f, loff_t *pos) +{ + loff_t n = *pos; + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + + if (ksp->ks_type == KSTAT_TYPE_RAW) { + ksp->ks_raw_bufsize = PAGE_SIZE; + ksp->ks_raw_buf = vmem_alloc(ksp->ks_raw_bufsize, KM_SLEEP); + } + + /* Dynamically update kstat, on error existing kstats are used */ + (void) ksp->ks_update(ksp, KSTAT_READ); + + ksp->ks_snaptime = gethrtime(); + + if (!(ksp->ks_flags & KSTAT_FLAG_NO_HEADERS) && !n && + kstat_seq_show_headers(f)) + return (NULL); + + if (n >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, n)); +} + +static void * +kstat_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + ++*pos; + if (*pos >= ksp->ks_ndata) + return (NULL); + + return (kstat_seq_data_addr(ksp, *pos)); +} + +static void +kstat_seq_stop(struct seq_file *f, void *v) +{ + kstat_t *ksp = (kstat_t *)f->private; + ASSERT(ksp->ks_magic == KS_MAGIC); + + if (ksp->ks_type == KSTAT_TYPE_RAW) + vmem_free(ksp->ks_raw_buf, ksp->ks_raw_bufsize); + + mutex_exit(ksp->ks_lock); +} + +static struct seq_operations kstat_seq_ops = { + .show = kstat_seq_show, + .start = kstat_seq_start, + .next = kstat_seq_next, + .stop = kstat_seq_stop, +}; + +static kstat_module_t * +kstat_find_module(char *name) +{ + kstat_module_t *module = NULL; + + list_for_each_entry(module, &kstat_module_list, ksm_module_list) { + if (strncmp(name, module->ksm_name, KSTAT_STRLEN) == 0) + return (module); + } + + return (NULL); +} + +static kstat_module_t * +kstat_create_module(char *name) +{ + kstat_module_t *module; + struct proc_dir_entry *pde; + + pde = proc_mkdir(name, proc_spl_kstat); + if (pde == NULL) + return (NULL); + + module = kmem_alloc(sizeof (kstat_module_t), KM_SLEEP); + module->ksm_proc = pde; + strlcpy(module->ksm_name, name, KSTAT_STRLEN+1); + INIT_LIST_HEAD(&module->ksm_kstat_list); + list_add_tail(&module->ksm_module_list, &kstat_module_list); + + return (module); + +} + +static void +kstat_delete_module(kstat_module_t *module) +{ + ASSERT(list_empty(&module->ksm_kstat_list)); + remove_proc_entry(module->ksm_name, proc_spl_kstat); + list_del(&module->ksm_module_list); + kmem_free(module, sizeof (kstat_module_t)); +} + +static int +proc_kstat_open(struct inode *inode, struct file *filp) +{ + struct seq_file *f; + int rc; + + rc = seq_open(filp, &kstat_seq_ops); + if (rc) + return (rc); + + f = filp->private_data; + f->private = PDE_DATA(inode); + + return (rc); +} + +static ssize_t +proc_kstat_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + kstat_t *ksp = f->private; + int rc; + + ASSERT(ksp->ks_magic == KS_MAGIC); + + mutex_enter(ksp->ks_lock); + rc = ksp->ks_update(ksp, KSTAT_WRITE); + mutex_exit(ksp->ks_lock); + + if (rc) + return (-rc); + + *ppos += len; + return (len); +} + +static const kstat_proc_op_t proc_kstat_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_kstat_open, + .proc_write = proc_kstat_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else + .open = proc_kstat_open, + .write = proc_kstat_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +#endif +}; + +void +__kstat_set_raw_ops(kstat_t *ksp, + int (*headers)(char *buf, size_t size), + int (*data)(char *buf, size_t size, void *data), + void *(*addr)(kstat_t 
*ksp, loff_t index)) +{ + ksp->ks_raw_ops.headers = headers; + ksp->ks_raw_ops.data = data; + ksp->ks_raw_ops.addr = addr; +} +EXPORT_SYMBOL(__kstat_set_raw_ops); + +void +kstat_proc_entry_init(kstat_proc_entry_t *kpep, const char *module, + const char *name) +{ + kpep->kpe_owner = NULL; + kpep->kpe_proc = NULL; + INIT_LIST_HEAD(&kpep->kpe_list); + strncpy(kpep->kpe_module, module, KSTAT_STRLEN); + strncpy(kpep->kpe_name, name, KSTAT_STRLEN); +} +EXPORT_SYMBOL(kstat_proc_entry_init); + +kstat_t * +__kstat_create(const char *ks_module, int ks_instance, const char *ks_name, + const char *ks_class, uchar_t ks_type, uint_t ks_ndata, + uchar_t ks_flags) +{ + kstat_t *ksp; + + ASSERT(ks_module); + ASSERT(ks_instance == 0); + ASSERT(ks_name); + + if ((ks_type == KSTAT_TYPE_INTR) || (ks_type == KSTAT_TYPE_IO)) + ASSERT(ks_ndata == 1); + + ksp = kmem_zalloc(sizeof (*ksp), KM_SLEEP); + if (ksp == NULL) + return (ksp); + + mutex_enter(&kstat_module_lock); + ksp->ks_kid = kstat_id; + kstat_id++; + mutex_exit(&kstat_module_lock); + + ksp->ks_magic = KS_MAGIC; + mutex_init(&ksp->ks_private_lock, NULL, MUTEX_DEFAULT, NULL); + ksp->ks_lock = &ksp->ks_private_lock; + + ksp->ks_crtime = gethrtime(); + ksp->ks_snaptime = ksp->ks_crtime; + ksp->ks_instance = ks_instance; + strncpy(ksp->ks_class, ks_class, KSTAT_STRLEN); + ksp->ks_type = ks_type; + ksp->ks_flags = ks_flags; + ksp->ks_update = kstat_default_update; + ksp->ks_private = NULL; + ksp->ks_raw_ops.headers = NULL; + ksp->ks_raw_ops.data = NULL; + ksp->ks_raw_ops.addr = NULL; + ksp->ks_raw_buf = NULL; + ksp->ks_raw_bufsize = 0; + kstat_proc_entry_init(&ksp->ks_proc, ks_module, ks_name); + + switch (ksp->ks_type) { + case KSTAT_TYPE_RAW: + ksp->ks_ndata = 1; + ksp->ks_data_size = ks_ndata; + break; + case KSTAT_TYPE_NAMED: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_named_t); + break; + case KSTAT_TYPE_INTR: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_intr_t); + break; + case KSTAT_TYPE_IO: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_io_t); + break; + case KSTAT_TYPE_TIMER: + ksp->ks_ndata = ks_ndata; + ksp->ks_data_size = ks_ndata * sizeof (kstat_timer_t); + break; + default: + PANIC("Undefined kstat type %d\n", ksp->ks_type); + } + + if (ksp->ks_flags & KSTAT_FLAG_VIRTUAL) { + ksp->ks_data = NULL; + } else { + ksp->ks_data = kmem_zalloc(ksp->ks_data_size, KM_SLEEP); + if (ksp->ks_data == NULL) { + kmem_free(ksp, sizeof (*ksp)); + ksp = NULL; + } + } + + return (ksp); +} +EXPORT_SYMBOL(__kstat_create); + +static int +kstat_detect_collision(kstat_proc_entry_t *kpep) +{ + kstat_module_t *module; + kstat_proc_entry_t *tmp = NULL; + char *parent; + char *cp; + + parent = kmem_asprintf("%s", kpep->kpe_module); + + if ((cp = strrchr(parent, '/')) == NULL) { + kmem_strfree(parent); + return (0); + } + + cp[0] = '\0'; + if ((module = kstat_find_module(parent)) != NULL) { + list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { + if (strncmp(tmp->kpe_name, cp+1, KSTAT_STRLEN) == 0) { + kmem_strfree(parent); + return (EEXIST); + } + } + } + + kmem_strfree(parent); + return (0); +} + +/* + * Add a file to the proc filesystem under the kstat namespace (i.e. + * /proc/spl/kstat/). The file need not necessarily be implemented as a + * kstat. 
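+ *
+ * The common kstat path into this helper is the following sketch
+ * (module and stat names are hypothetical):
+ *
+ *   ksp = kstat_create("zfs", 0, "mystats", "misc",
+ *       KSTAT_TYPE_NAMED, count, KSTAT_FLAG_VIRTUAL);
+ *   if (ksp != NULL) {
+ *       ksp->ks_data = &mystats;
+ *       kstat_install(ksp);
+ *   }
+ *
+ * which ends up here and creates /proc/spl/kstat/zfs/mystats.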
+ */ +void +kstat_proc_entry_install(kstat_proc_entry_t *kpep, mode_t mode, + const kstat_proc_op_t *proc_ops, void *data) +{ + kstat_module_t *module; + kstat_proc_entry_t *tmp = NULL; + + ASSERT(kpep); + + mutex_enter(&kstat_module_lock); + + module = kstat_find_module(kpep->kpe_module); + if (module == NULL) { + if (kstat_detect_collision(kpep) != 0) { + cmn_err(CE_WARN, "kstat_create('%s', '%s'): namespace" \ + " collision", kpep->kpe_module, kpep->kpe_name); + goto out; + } + module = kstat_create_module(kpep->kpe_module); + if (module == NULL) + goto out; + } + + /* + * Only one entry by this name per-module, on failure the module + * shouldn't be deleted because we know it has at least one entry. + */ + list_for_each_entry(tmp, &module->ksm_kstat_list, kpe_list) { + if (strncmp(tmp->kpe_name, kpep->kpe_name, KSTAT_STRLEN) == 0) + goto out; + } + + list_add_tail(&kpep->kpe_list, &module->ksm_kstat_list); + + kpep->kpe_owner = module; + kpep->kpe_proc = proc_create_data(kpep->kpe_name, mode, + module->ksm_proc, proc_ops, data); + if (kpep->kpe_proc == NULL) { + list_del_init(&kpep->kpe_list); + if (list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + } +out: + mutex_exit(&kstat_module_lock); + +} +EXPORT_SYMBOL(kstat_proc_entry_install); + +void +__kstat_install(kstat_t *ksp) +{ + ASSERT(ksp); + mode_t mode; + /* Specify permission modes for different kstats */ + if (strncmp(ksp->ks_proc.kpe_name, "dbufs", KSTAT_STRLEN) == 0) { + mode = 0600; + } else { + mode = 0644; + } + kstat_proc_entry_install( + &ksp->ks_proc, mode, &proc_kstat_operations, ksp); +} +EXPORT_SYMBOL(__kstat_install); + +void +kstat_proc_entry_delete(kstat_proc_entry_t *kpep) +{ + kstat_module_t *module = kpep->kpe_owner; + if (kpep->kpe_proc) + remove_proc_entry(kpep->kpe_name, module->ksm_proc); + + mutex_enter(&kstat_module_lock); + list_del_init(&kpep->kpe_list); + + /* + * Remove top level module directory if it wasn't empty before, but now + * is. + */ + if (kpep->kpe_proc && list_empty(&module->ksm_kstat_list)) + kstat_delete_module(module); + mutex_exit(&kstat_module_lock); + +} +EXPORT_SYMBOL(kstat_proc_entry_delete); + +void +__kstat_delete(kstat_t *ksp) +{ + kstat_proc_entry_delete(&ksp->ks_proc); + + if (!(ksp->ks_flags & KSTAT_FLAG_VIRTUAL)) + kmem_free(ksp->ks_data, ksp->ks_data_size); + + ksp->ks_lock = NULL; + mutex_destroy(&ksp->ks_private_lock); + kmem_free(ksp, sizeof (*ksp)); +} +EXPORT_SYMBOL(__kstat_delete); + +int +spl_kstat_init(void) +{ + mutex_init(&kstat_module_lock, NULL, MUTEX_DEFAULT, NULL); + INIT_LIST_HEAD(&kstat_module_list); + kstat_id = 0; + return (0); +} + +void +spl_kstat_fini(void) +{ + ASSERT(list_empty(&kstat_module_list)); + mutex_destroy(&kstat_module_lock); +} diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c new file mode 100644 index 000000000000..6936db5d6466 --- /dev/null +++ b/module/os/linux/spl/spl-proc.c @@ -0,0 +1,791 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Proc Implementation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(CONSTIFY_PLUGIN) && LINUX_VERSION_CODE >= KERNEL_VERSION(3, 8, 0) +typedef struct ctl_table __no_const spl_ctl_table; +#else +typedef struct ctl_table spl_ctl_table; +#endif + +static unsigned long table_min = 0; +static unsigned long table_max = ~0; + +static struct ctl_table_header *spl_header = NULL; +static struct proc_dir_entry *proc_spl = NULL; +static struct proc_dir_entry *proc_spl_kmem = NULL; +static struct proc_dir_entry *proc_spl_kmem_slab = NULL; +static struct proc_dir_entry *proc_spl_taskq_all = NULL; +static struct proc_dir_entry *proc_spl_taskq = NULL; +struct proc_dir_entry *proc_spl_kstat = NULL; + +static int +proc_copyin_string(char *kbuffer, int kbuffer_size, const char *ubuffer, + int ubuffer_size) +{ + int size; + + if (ubuffer_size > kbuffer_size) + return (-EOVERFLOW); + + if (copy_from_user((void *)kbuffer, (void *)ubuffer, ubuffer_size)) + return (-EFAULT); + + /* strip trailing whitespace */ + size = strnlen(kbuffer, ubuffer_size); + while (size-- >= 0) + if (!isspace(kbuffer[size])) + break; + + /* empty string */ + if (size < 0) + return (-EINVAL); + + /* no space to terminate */ + if (size == kbuffer_size) + return (-EOVERFLOW); + + kbuffer[size + 1] = 0; + return (0); +} + +static int +proc_copyout_string(char *ubuffer, int ubuffer_size, const char *kbuffer, + char *append) +{ + /* + * NB if 'append' != NULL, it's a single character to append to the + * copied out string - usually "\n", for /proc entries and + * (i.e. 
a terminating zero byte) for sysctl entries + */ + int size = MIN(strlen(kbuffer), ubuffer_size); + + if (copy_to_user(ubuffer, kbuffer, size)) + return (-EFAULT); + + if (append != NULL && size < ubuffer_size) { + if (copy_to_user(ubuffer + size, append, 1)) + return (-EFAULT); + + size++; + } + + return (size); +} + +#ifdef DEBUG_KMEM +static int +proc_domemused(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val; + spl_ctl_table dummy = *table; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { +#ifdef HAVE_ATOMIC64_T + val = atomic64_read((atomic64_t *)table->data); +#else + val = atomic_read((atomic_t *)table->data); +#endif /* HAVE_ATOMIC64_T */ + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} +#endif /* DEBUG_KMEM */ + +static int +proc_doslab(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int rc = 0; + unsigned long min = 0, max = ~0, val = 0, mask; + spl_ctl_table dummy = *table; + spl_kmem_cache_t *skc = NULL; + + dummy.data = &val; + dummy.proc_handler = &proc_dointvec; + dummy.extra1 = &min; + dummy.extra2 = &max; + + if (write) { + *ppos += *lenp; + } else { + down_read(&spl_kmem_cache_sem); + mask = (unsigned long)table->data; + + list_for_each_entry(skc, &spl_kmem_cache_list, skc_list) { + + /* Only use slabs of the correct kmem/vmem type */ + if (!(skc->skc_flags & mask)) + continue; + + /* Sum the specified field for selected slabs */ + switch (mask & (KMC_TOTAL | KMC_ALLOC | KMC_MAX)) { + case KMC_TOTAL: + val += skc->skc_slab_size * skc->skc_slab_total; + break; + case KMC_ALLOC: + val += skc->skc_obj_size * skc->skc_obj_alloc; + break; + case KMC_MAX: + val += skc->skc_obj_size * skc->skc_obj_max; + break; + } + } + + up_read(&spl_kmem_cache_sem); + rc = proc_doulongvec_minmax(&dummy, write, buffer, lenp, ppos); + } + + return (rc); +} + +static int +proc_dohostid(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos) +{ + int len, rc = 0; + char *end, str[32]; + + if (write) { + /* + * We can't use proc_doulongvec_minmax() in the write + * case here because hostid while a hex value has no + * leading 0x which confuses the helper function. 
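+	 *
+	 * Editor's note - an illustrative exchange implied by this handler
+	 * (hex string, no 0x prefix, parsed by simple_strtoul() base 16;
+	 * not part of the original diff):
+	 *
+	 *	# echo 1a2b3c4d >/proc/sys/kernel/spl/hostid
+	 *	# cat /proc/sys/kernel/spl/hostid
+	 *	1a2b3c4d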
+ */ + rc = proc_copyin_string(str, sizeof (str), buffer, *lenp); + if (rc < 0) + return (rc); + + spl_hostid = simple_strtoul(str, &end, 16); + if (str == end) + return (-EINVAL); + + } else { + len = snprintf(str, sizeof (str), "%lx", + (unsigned long) zone_get_hostid(NULL)); + if (*ppos >= len) + rc = 0; + else + rc = proc_copyout_string(buffer, + *lenp, str + *ppos, "\n"); + + if (rc >= 0) { + *lenp = rc; + *ppos += rc; + } + } + + return (rc); +} + +static void +taskq_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, "%-25s %5s %5s %5s %5s %5s %5s %12s %5s %10s\n", + "taskq", "act", "nthr", "spwn", "maxt", "pri", + "mina", "maxa", "cura", "flags"); +} + +/* indices into the lheads array below */ +#define LHEAD_PEND 0 +#define LHEAD_PRIO 1 +#define LHEAD_DELAY 2 +#define LHEAD_WAIT 3 +#define LHEAD_ACTIVE 4 +#define LHEAD_SIZE 5 + +/* BEGIN CSTYLED */ +static unsigned int spl_max_show_tasks = 512; +module_param(spl_max_show_tasks, uint, 0644); +MODULE_PARM_DESC(spl_max_show_tasks, "Max number of tasks shown in taskq proc"); +/* END CSTYLED */ + +static int +taskq_seq_show_impl(struct seq_file *f, void *p, boolean_t allflag) +{ + taskq_t *tq = p; + taskq_thread_t *tqt = NULL; + spl_wait_queue_entry_t *wq; + struct task_struct *tsk; + taskq_ent_t *tqe; + char name[100]; + struct list_head *lheads[LHEAD_SIZE], *lh; + static char *list_names[LHEAD_SIZE] = + {"pend", "prio", "delay", "wait", "active" }; + int i, j, have_lheads = 0; + unsigned long wflags, flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + spin_lock_irqsave(&tq->tq_wait_waitq.lock, wflags); + + /* get the various lists and check whether they're empty */ + lheads[LHEAD_PEND] = &tq->tq_pend_list; + lheads[LHEAD_PRIO] = &tq->tq_prio_list; + lheads[LHEAD_DELAY] = &tq->tq_delay_list; +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.head; +#else + lheads[LHEAD_WAIT] = &tq->tq_wait_waitq.task_list; +#endif + lheads[LHEAD_ACTIVE] = &tq->tq_active_list; + + for (i = 0; i < LHEAD_SIZE; ++i) { + if (list_empty(lheads[i])) + lheads[i] = NULL; + else + ++have_lheads; + } + + /* early return in non-"all" mode if lists are all empty */ + if (!allflag && !have_lheads) { + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return (0); + } + + /* unlock the waitq quickly */ + if (!lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + + /* show the base taskq contents */ + snprintf(name, sizeof (name), "%s/%d", tq->tq_name, tq->tq_instance); + seq_printf(f, "%-25s ", name); + seq_printf(f, "%5d %5d %5d %5d %5d %5d %12d %5d %10x\n", + tq->tq_nactive, tq->tq_nthreads, tq->tq_nspawn, + tq->tq_maxthreads, tq->tq_pri, tq->tq_minalloc, tq->tq_maxalloc, + tq->tq_nalloc, tq->tq_flags); + + /* show the active list */ + if (lheads[LHEAD_ACTIVE]) { + j = 0; + list_for_each_entry(tqt, &tq->tq_active_list, tqt_active_list) { + if (j == 0) + seq_printf(f, "\t%s:", + list_names[LHEAD_ACTIVE]); + else if (j == 2) { + seq_printf(f, "\n\t "); + j = 0; + } + seq_printf(f, " [%d]%pf(%ps)", + tqt->tqt_thread->pid, + tqt->tqt_task->tqent_func, + tqt->tqt_task->tqent_arg); + ++j; + } + seq_printf(f, "\n"); + } + + for (i = LHEAD_PEND; i <= LHEAD_WAIT; ++i) + if (lheads[i]) { + j = 0; + list_for_each(lh, lheads[i]) { + if (spl_max_show_tasks != 0 && + j >= spl_max_show_tasks) { + seq_printf(f, "\n\t(truncated)"); + break; + } + /* show the wait waitq list */ + if (i == LHEAD_WAIT) { +#ifdef HAVE_WAIT_QUEUE_HEAD_ENTRY + wq = 
list_entry(lh, + spl_wait_queue_entry_t, entry); +#else + wq = list_entry(lh, + spl_wait_queue_entry_t, task_list); +#endif + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 8 == 0) + seq_printf(f, "\n\t "); + + tsk = wq->private; + seq_printf(f, " %d", tsk->pid); + /* pend, prio and delay lists */ + } else { + tqe = list_entry(lh, taskq_ent_t, + tqent_list); + if (j == 0) + seq_printf(f, "\t%s:", + list_names[i]); + else if (j % 2 == 0) + seq_printf(f, "\n\t "); + + seq_printf(f, " %pf(%ps)", + tqe->tqent_func, + tqe->tqent_arg); + } + ++j; + } + seq_printf(f, "\n"); + } + if (lheads[LHEAD_WAIT]) + spin_unlock_irqrestore(&tq->tq_wait_waitq.lock, wflags); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + return (0); +} + +static int +taskq_all_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_TRUE)); +} + +static int +taskq_seq_show(struct seq_file *f, void *p) +{ + return (taskq_seq_show_impl(f, p, B_FALSE)); +} + +static void * +taskq_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&tq_list_sem); + if (!n) + taskq_seq_show_headers(f); + + p = tq_list.next; + while (n--) { + p = p->next; + if (p == &tq_list) + return (NULL); + } + + return (list_entry(p, taskq_t, tq_taskqs)); +} + +static void * +taskq_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + taskq_t *tq = p; + + ++*pos; + return ((tq->tq_taskqs.next == &tq_list) ? + NULL : list_entry(tq->tq_taskqs.next, taskq_t, tq_taskqs)); +} + +static void +slab_seq_show_headers(struct seq_file *f) +{ + seq_printf(f, + "--------------------- cache ----------" + "--------------------------------------------- " + "----- slab ------ " + "---- object ----- " + "--- emergency ---\n"); + seq_printf(f, + "name " + " flags size alloc slabsize objsize " + "total alloc max " + "total alloc max " + "dlock alloc max\n"); +} + +static int +slab_seq_show(struct seq_file *f, void *p) +{ + spl_kmem_cache_t *skc = p; + + ASSERT(skc->skc_magic == SKC_MAGIC); + + if (skc->skc_flags & KMC_SLAB) { + /* + * This cache is backed by a generic Linux kmem cache which + * has its own accounting. For these caches we only track + * the number of active allocated objects that exist within + * the underlying Linux slabs. For the overall statistics of + * the underlying Linux cache please refer to /proc/slabinfo. 
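+	 *
+	 * Editor's illustrative sketch (hypothetical cache name, not part
+	 * of this diff) - the alloc/free pair below is what moves the
+	 * skc_linux_alloc counter summed here:
+	 *
+	 *	void *obj = kmem_cache_alloc(my_slab_cache, KM_SLEEP);
+	 *	...
+	 *	kmem_cache_free(my_slab_cache, obj);
+	 *
+	 * The resulting row can be inspected with cat /proc/spl/kmem/slab.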
+ */ + spin_lock(&skc->skc_lock); + uint64_t objs_allocated = + percpu_counter_sum(&skc->skc_linux_alloc); + seq_printf(f, "%-36s ", skc->skc_name); + seq_printf(f, "0x%05lx %9s %9lu %8s %8u " + "%5s %5s %5s %5s %5lu %5s %5s %5s %5s\n", + (long unsigned)skc->skc_flags, + "-", + (long unsigned)(skc->skc_obj_size * objs_allocated), + "-", + (unsigned)skc->skc_obj_size, + "-", "-", "-", "-", + (long unsigned)objs_allocated, + "-", "-", "-", "-"); + spin_unlock(&skc->skc_lock); + return (0); + } + + spin_lock(&skc->skc_lock); + seq_printf(f, "%-36s ", skc->skc_name); + seq_printf(f, "0x%05lx %9lu %9lu %8u %8u " + "%5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu %5lu\n", + (long unsigned)skc->skc_flags, + (long unsigned)(skc->skc_slab_size * skc->skc_slab_total), + (long unsigned)(skc->skc_obj_size * skc->skc_obj_alloc), + (unsigned)skc->skc_slab_size, + (unsigned)skc->skc_obj_size, + (long unsigned)skc->skc_slab_total, + (long unsigned)skc->skc_slab_alloc, + (long unsigned)skc->skc_slab_max, + (long unsigned)skc->skc_obj_total, + (long unsigned)skc->skc_obj_alloc, + (long unsigned)skc->skc_obj_max, + (long unsigned)skc->skc_obj_deadlock, + (long unsigned)skc->skc_obj_emergency, + (long unsigned)skc->skc_obj_emergency_max); + spin_unlock(&skc->skc_lock); + return (0); +} + +static void * +slab_seq_start(struct seq_file *f, loff_t *pos) +{ + struct list_head *p; + loff_t n = *pos; + + down_read(&spl_kmem_cache_sem); + if (!n) + slab_seq_show_headers(f); + + p = spl_kmem_cache_list.next; + while (n--) { + p = p->next; + if (p == &spl_kmem_cache_list) + return (NULL); + } + + return (list_entry(p, spl_kmem_cache_t, skc_list)); +} + +static void * +slab_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + spl_kmem_cache_t *skc = p; + + ++*pos; + return ((skc->skc_list.next == &spl_kmem_cache_list) ? 
+ NULL : list_entry(skc->skc_list.next, spl_kmem_cache_t, skc_list)); +} + +static void +slab_seq_stop(struct seq_file *f, void *v) +{ + up_read(&spl_kmem_cache_sem); +} + +static struct seq_operations slab_seq_ops = { + .show = slab_seq_show, + .start = slab_seq_start, + .next = slab_seq_next, + .stop = slab_seq_stop, +}; + +static int +proc_slab_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &slab_seq_ops)); +} + +static const kstat_proc_op_t proc_slab_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_slab_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else + .open = proc_slab_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +#endif +}; + +static void +taskq_seq_stop(struct seq_file *f, void *v) +{ + up_read(&tq_list_sem); +} + +static struct seq_operations taskq_all_seq_ops = { + .show = taskq_all_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static struct seq_operations taskq_seq_ops = { + .show = taskq_seq_show, + .start = taskq_seq_start, + .next = taskq_seq_next, + .stop = taskq_seq_stop, +}; + +static int +proc_taskq_all_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_all_seq_ops)); +} + +static int +proc_taskq_open(struct inode *inode, struct file *filp) +{ + return (seq_open(filp, &taskq_seq_ops)); +} + +static const kstat_proc_op_t proc_taskq_all_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_taskq_all_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else + .open = proc_taskq_all_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +#endif +}; + +static const kstat_proc_op_t proc_taskq_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = proc_taskq_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release, +#else + .open = proc_taskq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +#endif +}; + +static struct ctl_table spl_kmem_table[] = { +#ifdef DEBUG_KMEM + { + .procname = "kmem_used", + .data = &kmem_alloc_used, +#ifdef HAVE_ATOMIC64_T + .maxlen = sizeof (atomic64_t), +#else + .maxlen = sizeof (atomic_t), +#endif /* HAVE_ATOMIC64_T */ + .mode = 0444, + .proc_handler = &proc_domemused, + }, + { + .procname = "kmem_max", + .data = &kmem_alloc_max, + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doulongvec_minmax, + }, +#endif /* DEBUG_KMEM */ + { + .procname = "slab_kvmem_total", + .data = (void *)(KMC_KVMEM | KMC_TOTAL), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kvmem_alloc", + .data = (void *)(KMC_KVMEM | KMC_ALLOC), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + { + .procname = "slab_kvmem_max", + .data = (void *)(KMC_KVMEM | KMC_MAX), + .maxlen = sizeof (unsigned long), + .extra1 = &table_min, + .extra2 = &table_max, + .mode = 0444, + .proc_handler = &proc_doslab, + }, + {}, +}; + +static struct ctl_table spl_kstat_table[] = { + {}, +}; + +static struct ctl_table spl_table[] = { + /* + * NB No .strategy entries have been provided since + * sysctl(8) prefers to go via /proc for portability. 
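+	 *
+	 * Editor's note - the tables below wire up the following sysctl
+	 * namespace (mirrored under /proc/sys):
+	 *
+	 *	kernel.spl.gitrev
+	 *	kernel.spl.hostid
+	 *	kernel.spl.kmem.slab_kvmem_{total,alloc,max}
+	 *	    (plus kmem_used and kmem_max when built with DEBUG_KMEM)
+	 *	kernel.spl.kstat	(empty placeholder table)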
+ */ + { + .procname = "gitrev", + .data = spl_gitrev, + .maxlen = sizeof (spl_gitrev), + .mode = 0444, + .proc_handler = &proc_dostring, + }, + { + .procname = "hostid", + .data = &spl_hostid, + .maxlen = sizeof (unsigned long), + .mode = 0644, + .proc_handler = &proc_dohostid, + }, + { + .procname = "kmem", + .mode = 0555, + .child = spl_kmem_table, + }, + { + .procname = "kstat", + .mode = 0555, + .child = spl_kstat_table, + }, + {}, +}; + +static struct ctl_table spl_dir[] = { + { + .procname = "spl", + .mode = 0555, + .child = spl_table, + }, + {} +}; + +static struct ctl_table spl_root[] = { + { + .procname = "kernel", + .mode = 0555, + .child = spl_dir, + }, + {} +}; + +int +spl_proc_init(void) +{ + int rc = 0; + + spl_header = register_sysctl_table(spl_root); + if (spl_header == NULL) + return (-EUNATCH); + + proc_spl = proc_mkdir("spl", NULL); + if (proc_spl == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq_all = proc_create_data("taskq-all", 0444, proc_spl, + &proc_taskq_all_operations, NULL); + if (proc_spl_taskq_all == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_taskq = proc_create_data("taskq", 0444, proc_spl, + &proc_taskq_operations, NULL); + if (proc_spl_taskq == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem = proc_mkdir("kmem", proc_spl); + if (proc_spl_kmem == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kmem_slab = proc_create_data("slab", 0444, proc_spl_kmem, + &proc_slab_operations, NULL); + if (proc_spl_kmem_slab == NULL) { + rc = -EUNATCH; + goto out; + } + + proc_spl_kstat = proc_mkdir("kstat", proc_spl); + if (proc_spl_kstat == NULL) { + rc = -EUNATCH; + goto out; + } +out: + if (rc) { + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + unregister_sysctl_table(spl_header); + } + + return (rc); +} + +void +spl_proc_fini(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + + ASSERT(spl_header != NULL); + unregister_sysctl_table(spl_header); +} diff --git a/module/os/linux/spl/spl-procfs-list.c b/module/os/linux/spl/spl-procfs-list.c new file mode 100644 index 000000000000..189d6a7c6082 --- /dev/null +++ b/module/os/linux/spl/spl-procfs-list.c @@ -0,0 +1,264 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
+ */
+
+#include
+#include
+#include
+#include
+
+/*
+ * A procfs_list is a wrapper around a linked list which implements the seq_file
+ * interface, allowing the contents of the list to be exposed through procfs.
+ * The kernel already has some utilities to help implement the seq_file
+ * interface for linked lists (seq_list_*), but they aren't appropriate for use
+ * with lists that have many entries, because seq_list_start walks the list at
+ * the start of each read syscall to find where it left off, so reading a file
+ * ends up being quadratic in the number of entries in the list.
+ *
+ * This implementation avoids this penalty by maintaining a separate cursor into
+ * the list per instance of the file that is open. It also maintains some extra
+ * information in each node of the list to prevent reads of entries that have
+ * been dropped from the list.
+ *
+ * Callers should only add elements to the list using procfs_list_add, which
+ * adds an element to the tail of the list. Other operations can be performed
+ * directly on the wrapped list using the normal list manipulation functions,
+ * but elements should only be removed from the head of the list.
+ */
+
+#define NODE_ID(procfs_list, obj) \
+	(((procfs_list_node_t *)(((char *)obj) + \
+	(procfs_list)->pl_node_offset))->pln_id)
+
+typedef struct procfs_list_cursor {
+	procfs_list_t	*procfs_list;	/* List into which this cursor points */
+	void		*cached_node;	/* Most recently accessed node */
+	loff_t		cached_pos;	/* Position of cached_node */
+} procfs_list_cursor_t;
+
+static int
+procfs_list_seq_show(struct seq_file *f, void *p)
+{
+	procfs_list_cursor_t *cursor = f->private;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+
+	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+	if (p == SEQ_START_TOKEN) {
+		if (procfs_list->pl_show_header != NULL)
+			return (procfs_list->pl_show_header(f));
+		else
+			return (0);
+	}
+	return (procfs_list->pl_show(f, p));
+}
+
+static void *
+procfs_list_next_node(procfs_list_cursor_t *cursor, loff_t *pos)
+{
+	void *next_node;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+
+	if (cursor->cached_node == SEQ_START_TOKEN)
+		next_node = list_head(&procfs_list->pl_list);
+	else
+		next_node = list_next(&procfs_list->pl_list,
+		    cursor->cached_node);
+
+	if (next_node != NULL) {
+		cursor->cached_node = next_node;
+		cursor->cached_pos = NODE_ID(procfs_list, cursor->cached_node);
+		*pos = cursor->cached_pos;
+	}
+	return (next_node);
+}
+
+static void *
+procfs_list_seq_start(struct seq_file *f, loff_t *pos)
+{
+	procfs_list_cursor_t *cursor = f->private;
+	procfs_list_t *procfs_list = cursor->procfs_list;
+
+	mutex_enter(&procfs_list->pl_lock);
+
+	if (*pos == 0) {
+		cursor->cached_node = SEQ_START_TOKEN;
+		cursor->cached_pos = 0;
+		return (SEQ_START_TOKEN);
+	}
+
+	/*
+	 * Check if our cached pointer has become stale, which happens if the
+	 * message where we left off has been dropped from the list since the
+	 * last read syscall completed.
+	 */
+	void *oldest_node = list_head(&procfs_list->pl_list);
+	if (cursor->cached_node != SEQ_START_TOKEN && (oldest_node == NULL ||
+	    NODE_ID(procfs_list, oldest_node) > cursor->cached_pos))
+		return (ERR_PTR(-EIO));
+
+	/*
+	 * If it isn't starting from the beginning of the file, the seq_file
+	 * code will either pick up at the same position it visited last or the
+	 * following one.
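+	 *
+	 * Editor's worked example: if the previous read left the cursor
+	 * on the node with id 41, the next call sees *pos == 41 (re-show
+	 * the cached node) or *pos == 42 (advance); any other value trips
+	 * the ASSERT3U below.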
+ */ + if (*pos == cursor->cached_pos) { + return (cursor->cached_node); + } else { + ASSERT3U(*pos, ==, cursor->cached_pos + 1); + return (procfs_list_next_node(cursor, pos)); + } +} + +static void * +procfs_list_seq_next(struct seq_file *f, void *p, loff_t *pos) +{ + procfs_list_cursor_t *cursor = f->private; + ASSERT(MUTEX_HELD(&cursor->procfs_list->pl_lock)); + return (procfs_list_next_node(cursor, pos)); +} + +static void +procfs_list_seq_stop(struct seq_file *f, void *p) +{ + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + mutex_exit(&procfs_list->pl_lock); +} + +static struct seq_operations procfs_list_seq_ops = { + .show = procfs_list_seq_show, + .start = procfs_list_seq_start, + .next = procfs_list_seq_next, + .stop = procfs_list_seq_stop, +}; + +static int +procfs_list_open(struct inode *inode, struct file *filp) +{ + int rc = seq_open_private(filp, &procfs_list_seq_ops, + sizeof (procfs_list_cursor_t)); + if (rc != 0) + return (rc); + + struct seq_file *f = filp->private_data; + procfs_list_cursor_t *cursor = f->private; + cursor->procfs_list = PDE_DATA(inode); + cursor->cached_node = NULL; + cursor->cached_pos = 0; + + return (0); +} + +static ssize_t +procfs_list_write(struct file *filp, const char __user *buf, size_t len, + loff_t *ppos) +{ + struct seq_file *f = filp->private_data; + procfs_list_cursor_t *cursor = f->private; + procfs_list_t *procfs_list = cursor->procfs_list; + int rc; + + if (procfs_list->pl_clear != NULL && + (rc = procfs_list->pl_clear(procfs_list)) != 0) + return (-rc); + return (len); +} + +static const kstat_proc_op_t procfs_list_operations = { +#ifdef HAVE_PROC_OPS_STRUCT + .proc_open = procfs_list_open, + .proc_write = procfs_list_write, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = seq_release_private, +#else + .open = procfs_list_open, + .write = procfs_list_write, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_private, +#endif +}; + +/* + * Initialize a procfs_list and create a file for it in the proc filesystem + * under the kstat namespace. 
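+ *
+ * Editor's illustrative sketch of a consumer (all names hypothetical,
+ * not part of this diff); the entry appears as
+ * /proc/spl/kstat/mymod/mylist:
+ *
+ *	typedef struct my_node {
+ *		procfs_list_node_t	mn_node;
+ *		int			mn_value;
+ *	} my_node_t;
+ *
+ *	static int
+ *	my_show(struct seq_file *f, void *p)
+ *	{
+ *		seq_printf(f, "%d\n", ((my_node_t *)p)->mn_value);
+ *		return (0);
+ *	}
+ *
+ *	procfs_list_install("mymod", "mylist", 0644, &my_list,
+ *	    my_show, NULL, NULL, offsetof(my_node_t, mn_node));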
+ */
+void
+procfs_list_install(const char *module,
+    const char *name,
+    mode_t mode,
+    procfs_list_t *procfs_list,
+    int (*show)(struct seq_file *f, void *p),
+    int (*show_header)(struct seq_file *f),
+    int (*clear)(procfs_list_t *procfs_list),
+    size_t procfs_list_node_off)
+{
+	mutex_init(&procfs_list->pl_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&procfs_list->pl_list,
+	    procfs_list_node_off + sizeof (procfs_list_node_t),
+	    procfs_list_node_off + offsetof(procfs_list_node_t, pln_link));
+	procfs_list->pl_next_id = 1;	/* Save id 0 for SEQ_START_TOKEN */
+	procfs_list->pl_show = show;
+	procfs_list->pl_show_header = show_header;
+	procfs_list->pl_clear = clear;
+	procfs_list->pl_node_offset = procfs_list_node_off;
+
+	kstat_proc_entry_init(&procfs_list->pl_kstat_entry, module, name);
+	kstat_proc_entry_install(&procfs_list->pl_kstat_entry, mode,
+	    &procfs_list_operations, procfs_list);
+}
+EXPORT_SYMBOL(procfs_list_install);
+
+/* Remove the proc filesystem file corresponding to the given list */
+void
+procfs_list_uninstall(procfs_list_t *procfs_list)
+{
+	kstat_proc_entry_delete(&procfs_list->pl_kstat_entry);
+}
+EXPORT_SYMBOL(procfs_list_uninstall);
+
+void
+procfs_list_destroy(procfs_list_t *procfs_list)
+{
+	ASSERT(list_is_empty(&procfs_list->pl_list));
+	list_destroy(&procfs_list->pl_list);
+	mutex_destroy(&procfs_list->pl_lock);
+}
+EXPORT_SYMBOL(procfs_list_destroy);
+
+/*
+ * Add a new node to the tail of the list. While the standard list manipulation
+ * functions can be used for all other operations, adding elements to the list
+ * should only be done using this helper so that the id of the new node is set
+ * correctly.
+ */
+void
+procfs_list_add(procfs_list_t *procfs_list, void *p)
+{
+	ASSERT(MUTEX_HELD(&procfs_list->pl_lock));
+	NODE_ID(procfs_list, p) = procfs_list->pl_next_id++;
+	list_insert_tail(&procfs_list->pl_list, p);
+}
+EXPORT_SYMBOL(procfs_list_add);
diff --git a/module/os/linux/spl/spl-taskq.c b/module/os/linux/spl/spl-taskq.c
new file mode 100644
index 000000000000..9cbf3e38137c
--- /dev/null
+++ b/module/os/linux/spl/spl-taskq.c
@@ -0,0 +1,1308 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf .
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see .
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see .
+ *
+ * Solaris Porting Layer (SPL) Task Queue Implementation.
+ */ + +#include +#include +#include +#include +#include + +int spl_taskq_thread_bind = 0; +module_param(spl_taskq_thread_bind, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_bind, "Bind taskq thread to CPU by default"); + + +int spl_taskq_thread_dynamic = 1; +module_param(spl_taskq_thread_dynamic, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_dynamic, "Allow dynamic taskq threads"); + +int spl_taskq_thread_priority = 1; +module_param(spl_taskq_thread_priority, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_priority, + "Allow non-default priority for taskq threads"); + +int spl_taskq_thread_sequential = 4; +module_param(spl_taskq_thread_sequential, int, 0644); +MODULE_PARM_DESC(spl_taskq_thread_sequential, + "Create new taskq threads after N sequential tasks"); + +/* Global system-wide dynamic task queue available for all consumers */ +taskq_t *system_taskq; +EXPORT_SYMBOL(system_taskq); +/* Global dynamic task queue for long delay */ +taskq_t *system_delay_taskq; +EXPORT_SYMBOL(system_delay_taskq); + +/* Private dedicated taskq for creating new taskq threads on demand. */ +static taskq_t *dynamic_taskq; +static taskq_thread_t *taskq_thread_create(taskq_t *); + +/* List of all taskqs */ +LIST_HEAD(tq_list); +struct rw_semaphore tq_list_sem; +static uint_t taskq_tsd; + +static int +task_km_flags(uint_t flags) +{ + if (flags & TQ_NOSLEEP) + return (KM_NOSLEEP); + + if (flags & TQ_PUSHPAGE) + return (KM_PUSHPAGE); + + return (KM_SLEEP); +} + +/* + * taskq_find_by_name - Find the largest instance number of a named taskq. + */ +static int +taskq_find_by_name(const char *name) +{ + struct list_head *tql = NULL; + taskq_t *tq; + + list_for_each_prev(tql, &tq_list) { + tq = list_entry(tql, taskq_t, tq_taskqs); + if (strcmp(name, tq->tq_name) == 0) + return (tq->tq_instance); + } + return (-1); +} + +/* + * NOTE: Must be called with tq->tq_lock held, returns a list_t which + * is not attached to the free, work, or pending taskq lists. + */ +static taskq_ent_t * +task_alloc(taskq_t *tq, uint_t flags, unsigned long *irqflags) +{ + taskq_ent_t *t; + int count = 0; + + ASSERT(tq); +retry: + /* Acquire taskq_ent_t's from free list if available */ + if (!list_empty(&tq->tq_free_list) && !(flags & TQ_NEW)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + ASSERT(!(t->tqent_flags & TQENT_FLAG_CANCEL)); + ASSERT(!timer_pending(&t->tqent_timer)); + + list_del_init(&t->tqent_list); + return (t); + } + + /* Free list is empty and memory allocations are prohibited */ + if (flags & TQ_NOALLOC) + return (NULL); + + /* Hit maximum taskq_ent_t pool size */ + if (tq->tq_nalloc >= tq->tq_maxalloc) { + if (flags & TQ_NOSLEEP) + return (NULL); + + /* + * Sleep periodically polling the free list for an available + * taskq_ent_t. Dispatching with TQ_SLEEP should always succeed + * but we cannot block forever waiting for an taskq_ent_t to + * show up in the free list, otherwise a deadlock can happen. + * + * Therefore, we need to allocate a new task even if the number + * of allocated tasks is above tq->tq_maxalloc, but we still + * end up delaying the task allocation by one second, thereby + * throttling the task dispatch rate. 
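+	 *
+	 * Editor's worked example: schedule_timeout(HZ / 100) sleeps for
+	 * 10 ms regardless of HZ, and at most 100 retries are attempted,
+	 * which yields the roughly one second of throttling described
+	 * above before the allocation proceeds anyway.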
+ */ + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + schedule_timeout(HZ / 100); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, + tq->tq_lock_class); + if (count < 100) { + count++; + goto retry; + } + } + + spin_unlock_irqrestore(&tq->tq_lock, *irqflags); + t = kmem_alloc(sizeof (taskq_ent_t), task_km_flags(flags)); + spin_lock_irqsave_nested(&tq->tq_lock, *irqflags, tq->tq_lock_class); + + if (t) { + taskq_init_ent(t); + tq->tq_nalloc++; + } + + return (t); +} + +/* + * NOTE: Must be called with tq->tq_lock held, expects the taskq_ent_t + * to already be removed from the free, work, or pending taskq lists. + */ +static void +task_free(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + ASSERT(list_empty(&t->tqent_list)); + ASSERT(!timer_pending(&t->tqent_timer)); + + kmem_free(t, sizeof (taskq_ent_t)); + tq->tq_nalloc--; +} + +/* + * NOTE: Must be called with tq->tq_lock held, either destroys the + * taskq_ent_t if too many exist or moves it to the free list for later use. + */ +static void +task_done(taskq_t *tq, taskq_ent_t *t) +{ + ASSERT(tq); + ASSERT(t); + + /* Wake tasks blocked in taskq_wait_id() */ + wake_up_all(&t->tqent_waitq); + + list_del_init(&t->tqent_list); + + if (tq->tq_nalloc <= tq->tq_minalloc) { + t->tqent_id = TASKQID_INVALID; + t->tqent_func = NULL; + t->tqent_arg = NULL; + t->tqent_flags = 0; + + list_add_tail(&t->tqent_list, &tq->tq_free_list); + } else { + task_free(tq, t); + } +} + +/* + * When a delayed task timer expires remove it from the delay list and + * add it to the priority list in order for immediate processing. + */ +static void +task_expire_impl(taskq_ent_t *t) +{ + taskq_ent_t *w; + taskq_t *tq = t->tqent_taskq; + struct list_head *l = NULL; + unsigned long flags; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + + if (t->tqent_flags & TQENT_FLAG_CANCEL) { + ASSERT(list_empty(&t->tqent_list)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + return; + } + + t->tqent_birth = jiffies; + DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); + + /* + * The priority list must be maintained in strict task id order + * from lowest to highest for lowest_id to be easily calculable. + */ + list_del(&t->tqent_list); + list_for_each_prev(l, &tq->tq_prio_list) { + w = list_entry(l, taskq_ent_t, tqent_list); + if (w->tqent_id < t->tqent_id) { + list_add(&t->tqent_list, l); + break; + } + } + if (l == &tq->tq_prio_list) + list_add(&t->tqent_list, &tq->tq_prio_list); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + wake_up(&tq->tq_work_waitq); +} + +static void +task_expire(spl_timer_list_t tl) +{ + struct timer_list *tmr = (struct timer_list *)tl; + taskq_ent_t *t = from_timer(t, tmr, tqent_timer); + task_expire_impl(t); +} + +/* + * Returns the lowest incomplete taskqid_t. The taskqid_t may + * be queued on the pending list, on the priority list, on the + * delay list, or on the work list currently being handled, but + * it is not 100% complete yet. 
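+ *
+ * Editor's worked example: with pending ids {7, 9}, priority id {5},
+ * no delayed tasks, and a worker thread executing id 4, the function
+ * below returns 4 (the head of each sorted list plus the oldest active
+ * thread are the only candidates that need checking).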
+ */ +static taskqid_t +taskq_lowest_id(taskq_t *tq) +{ + taskqid_t lowest_id = tq->tq_next_id; + taskq_ent_t *t; + taskq_thread_t *tqt; + + ASSERT(tq); + + if (!list_empty(&tq->tq_pend_list)) { + t = list_entry(tq->tq_pend_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_prio_list)) { + t = list_entry(tq->tq_prio_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_delay_list)) { + t = list_entry(tq->tq_delay_list.next, taskq_ent_t, tqent_list); + lowest_id = MIN(lowest_id, t->tqent_id); + } + + if (!list_empty(&tq->tq_active_list)) { + tqt = list_entry(tq->tq_active_list.next, taskq_thread_t, + tqt_active_list); + ASSERT(tqt->tqt_id != TASKQID_INVALID); + lowest_id = MIN(lowest_id, tqt->tqt_id); + } + + return (lowest_id); +} + +/* + * Insert a task into a list keeping the list sorted by increasing taskqid. + */ +static void +taskq_insert_in_order(taskq_t *tq, taskq_thread_t *tqt) +{ + taskq_thread_t *w; + struct list_head *l = NULL; + + ASSERT(tq); + ASSERT(tqt); + + list_for_each_prev(l, &tq->tq_active_list) { + w = list_entry(l, taskq_thread_t, tqt_active_list); + if (w->tqt_id < tqt->tqt_id) { + list_add(&tqt->tqt_active_list, l); + break; + } + } + if (l == &tq->tq_active_list) + list_add(&tqt->tqt_active_list, &tq->tq_active_list); +} + +/* + * Find and return a task from the given list if it exists. The list + * must be in lowest to highest task id order. + */ +static taskq_ent_t * +taskq_find_list(taskq_t *tq, struct list_head *lh, taskqid_t id) +{ + struct list_head *l = NULL; + taskq_ent_t *t; + + list_for_each(l, lh) { + t = list_entry(l, taskq_ent_t, tqent_list); + + if (t->tqent_id == id) + return (t); + + if (t->tqent_id > id) + break; + } + + return (NULL); +} + +/* + * Find an already dispatched task given the task id regardless of what + * state it is in. If a task is still pending it will be returned. + * If a task is executing, then -EBUSY will be returned instead. + * If the task has already been run then NULL is returned. + */ +static taskq_ent_t * +taskq_find(taskq_t *tq, taskqid_t id) +{ + taskq_thread_t *tqt; + struct list_head *l = NULL; + taskq_ent_t *t; + + t = taskq_find_list(tq, &tq->tq_delay_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_prio_list, id); + if (t) + return (t); + + t = taskq_find_list(tq, &tq->tq_pend_list, id); + if (t) + return (t); + + list_for_each(l, &tq->tq_active_list) { + tqt = list_entry(l, taskq_thread_t, tqt_active_list); + if (tqt->tqt_id == id) { + /* + * Instead of returning tqt_task, we just return a non + * NULL value to prevent misuse, since tqt_task only + * has two valid fields. + */ + return (ERR_PTR(-EBUSY)); + } + } + + return (NULL); +} + +/* + * Theory for the taskq_wait_id(), taskq_wait_outstanding(), and + * taskq_wait() functions below. + * + * Taskq waiting is accomplished by tracking the lowest outstanding task + * id and the next available task id. As tasks are dispatched they are + * added to the tail of the pending, priority, or delay lists. As worker + * threads become available the tasks are removed from the heads of these + * lists and linked to the worker threads. This ensures the lists are + * kept sorted by lowest to highest task id. + * + * Therefore the lowest outstanding task id can be quickly determined by + * checking the head item from all of these lists. This value is stored + * with the taskq as the lowest id. 
It only needs to be recalculated when
+ * either the task with the current lowest id completes or is canceled.
+ *
+ * By blocking until the lowest task id exceeds the passed task id, the
+ * taskq_wait_outstanding() function can be easily implemented. Similarly,
+ * by blocking until the lowest task id matches the next task id taskq_wait()
+ * can be implemented.
+ *
+ * Callers should be aware that when there are multiple worker threads it
+ * is possible for larger task ids to complete before smaller ones. Also
+ * when the taskq contains delay tasks with small task ids callers may
+ * block for a considerable length of time waiting for them to expire and
+ * execute.
+ */
+static int
+taskq_wait_id_check(taskq_t *tq, taskqid_t id)
+{
+	int rc;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	rc = (taskq_find(tq, id) == NULL);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (rc);
+}
+
+/*
+ * The taskq_wait_id() function blocks until the passed task id completes.
+ * This does not guarantee that all lower task ids have completed.
+ */
+void
+taskq_wait_id(taskq_t *tq, taskqid_t id)
+{
+	wait_event(tq->tq_wait_waitq, taskq_wait_id_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_id);
+
+static int
+taskq_wait_outstanding_check(taskq_t *tq, taskqid_t id)
+{
+	int rc;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	rc = (id < tq->tq_lowest_id);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (rc);
+}
+
+/*
+ * The taskq_wait_outstanding() function will block until all tasks with a
+ * lower taskqid than the passed 'id' have been completed. Note that all
+ * task ids are assigned monotonically at dispatch time. Zero may be
+ * passed for the id to indicate that all tasks dispatched up to this point,
+ * but not after, should be waited for.
+ */
+void
+taskq_wait_outstanding(taskq_t *tq, taskqid_t id)
+{
+	id = id ? id : tq->tq_next_id - 1;
+	wait_event(tq->tq_wait_waitq, taskq_wait_outstanding_check(tq, id));
+}
+EXPORT_SYMBOL(taskq_wait_outstanding);
+
+static int
+taskq_wait_check(taskq_t *tq)
+{
+	int rc;
+	unsigned long flags;
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	rc = (tq->tq_lowest_id == tq->tq_next_id);
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	return (rc);
+}
+
+/*
+ * The taskq_wait() function will block until the taskq is empty.
+ * This means that if a taskq re-dispatches work to itself taskq_wait()
+ * callers will block indefinitely.
+ */
+void
+taskq_wait(taskq_t *tq)
+{
+	wait_event(tq->tq_wait_waitq, taskq_wait_check(tq));
+}
+EXPORT_SYMBOL(taskq_wait);
+
+int
+taskq_member(taskq_t *tq, kthread_t *t)
+{
+	return (tq == (taskq_t *)tsd_get_by_thread(taskq_tsd, t));
+}
+EXPORT_SYMBOL(taskq_member);
+
+taskq_t *
+taskq_of_curthread(void)
+{
+	return (tsd_get(taskq_tsd));
+}
+EXPORT_SYMBOL(taskq_of_curthread);
+
+/*
+ * Cancel an already dispatched task given the task id. Still pending tasks
+ * will be immediately canceled, and if the task is active the function will
+ * block until it completes. Preallocated tasks which are canceled must be
+ * freed by the caller.
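+ *
+ * Editor's illustrative sketch (hypothetical caller, not part of this
+ * diff):
+ *
+ *	taskqid_t id = taskq_dispatch(tq, my_func, my_arg, TQ_SLEEP);
+ *	...
+ *	int error = taskq_cancel_id(tq, id);
+ *
+ * Here error is 0 if the task was canceled while still pending, EBUSY
+ * if it was already running (after blocking until it finished), and
+ * ENOENT if it had already completed.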
+ */
+int
+taskq_cancel_id(taskq_t *tq, taskqid_t id)
+{
+	taskq_ent_t *t;
+	int rc = ENOENT;
+	unsigned long flags;
+
+	ASSERT(tq);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class);
+	t = taskq_find(tq, id);
+	if (t && t != ERR_PTR(-EBUSY)) {
+		list_del_init(&t->tqent_list);
+		t->tqent_flags |= TQENT_FLAG_CANCEL;
+
+		/*
+		 * When canceling the lowest outstanding task id we
+		 * must recalculate the new lowest outstanding id.
+		 */
+		if (tq->tq_lowest_id == t->tqent_id) {
+			tq->tq_lowest_id = taskq_lowest_id(tq);
+			ASSERT3S(tq->tq_lowest_id, >, t->tqent_id);
+		}
+
+		/*
+		 * The task_expire() function takes the tq->tq_lock so drop
+		 * the lock before synchronously cancelling the timer.
+		 */
+		if (timer_pending(&t->tqent_timer)) {
+			spin_unlock_irqrestore(&tq->tq_lock, flags);
+			del_timer_sync(&t->tqent_timer);
+			spin_lock_irqsave_nested(&tq->tq_lock, flags,
+			    tq->tq_lock_class);
+		}
+
+		if (!(t->tqent_flags & TQENT_FLAG_PREALLOC))
+			task_done(tq, t);
+
+		rc = 0;
+	}
+	spin_unlock_irqrestore(&tq->tq_lock, flags);
+
+	if (t == ERR_PTR(-EBUSY)) {
+		taskq_wait_id(tq, id);
+		rc = EBUSY;
+	}
+
+	return (rc);
+}
+EXPORT_SYMBOL(taskq_cancel_id);
+
+static int taskq_thread_spawn(taskq_t *tq);
+
+taskqid_t
+taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
+{
+	taskq_ent_t *t;
+	taskqid_t rc = TASKQID_INVALID;
+	unsigned long irqflags;
+
+	ASSERT(tq);
+	ASSERT(func);
+
+	spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class);
+
+	/* Taskq being destroyed and all tasks drained */
+	if (!(tq->tq_flags & TASKQ_ACTIVE))
+		goto out;
+
+	/* Do not queue the task unless there is an idle thread for it */
+	ASSERT(tq->tq_nactive <= tq->tq_nthreads);
+	if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) {
+		/* Dynamic taskq may be able to spawn another thread */
+		if (!(tq->tq_flags & TASKQ_DYNAMIC) ||
+		    taskq_thread_spawn(tq) == 0)
+			goto out;
+	}
+
+	if ((t = task_alloc(tq, flags, &irqflags)) == NULL)
+		goto out;
+
+	spin_lock(&t->tqent_lock);
+
+	/* Queue to the front of the list to enforce TQ_NOQUEUE semantics */
+	if (flags & TQ_NOQUEUE)
+		list_add(&t->tqent_list, &tq->tq_prio_list);
+	/* Queue to the priority list instead of the pending list */
+	else if (flags & TQ_FRONT)
+		list_add_tail(&t->tqent_list, &tq->tq_prio_list);
+	else
+		list_add_tail(&t->tqent_list, &tq->tq_pend_list);
+
+	t->tqent_id = rc = tq->tq_next_id;
+	tq->tq_next_id++;
+	t->tqent_func = func;
+	t->tqent_arg = arg;
+	t->tqent_taskq = tq;
+	t->tqent_timer.function = NULL;
+	t->tqent_timer.expires = 0;
+
+	t->tqent_birth = jiffies;
+	DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t);
+
+	ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC));
+
+	spin_unlock(&t->tqent_lock);
+
+	wake_up(&tq->tq_work_waitq);
+out:
+	/* Spawn additional taskq threads if required.
*/ + if (!(flags & TQ_NOQUEUE) && tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch); + +taskqid_t +taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg, + uint_t flags, clock_t expire_time) +{ + taskqid_t rc = TASKQID_INVALID; + taskq_ent_t *t; + unsigned long irqflags; + + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) + goto out; + + if ((t = task_alloc(tq, flags, &irqflags)) == NULL) + goto out; + + spin_lock(&t->tqent_lock); + + /* Queue to the delay list for subsequent execution */ + list_add_tail(&t->tqent_list, &tq->tq_delay_list); + + t->tqent_id = rc = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + t->tqent_timer.function = task_expire; + t->tqent_timer.expires = (unsigned long)expire_time; + add_timer(&t->tqent_timer); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + spin_unlock(&t->tqent_lock); +out: + /* Spawn additional taskq threads if required. */ + if (tq->tq_nactive == tq->tq_nthreads) + (void) taskq_thread_spawn(tq); + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + return (rc); +} +EXPORT_SYMBOL(taskq_dispatch_delay); + +void +taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint_t flags, + taskq_ent_t *t) +{ + unsigned long irqflags; + ASSERT(tq); + ASSERT(func); + + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + /* Taskq being destroyed and all tasks drained */ + if (!(tq->tq_flags & TASKQ_ACTIVE)) { + t->tqent_id = TASKQID_INVALID; + goto out; + } + + if ((flags & TQ_NOQUEUE) && (tq->tq_nactive == tq->tq_nthreads)) { + /* Dynamic taskq may be able to spawn another thread */ + if (!(tq->tq_flags & TASKQ_DYNAMIC) || + taskq_thread_spawn(tq) == 0) + goto out2; + flags |= TQ_FRONT; + } + + spin_lock(&t->tqent_lock); + + /* + * Make sure the entry is not on some other taskq; it is important to + * ASSERT() under lock + */ + ASSERT(taskq_empty_ent(t)); + + /* + * Mark it as a prealloc'd task. This is important + * to ensure that we don't free it later. + */ + t->tqent_flags |= TQENT_FLAG_PREALLOC; + + /* Queue to the priority list instead of the pending list */ + if (flags & TQ_FRONT) + list_add_tail(&t->tqent_list, &tq->tq_prio_list); + else + list_add_tail(&t->tqent_list, &tq->tq_pend_list); + + t->tqent_id = tq->tq_next_id; + tq->tq_next_id++; + t->tqent_func = func; + t->tqent_arg = arg; + t->tqent_taskq = tq; + + t->tqent_birth = jiffies; + DTRACE_PROBE1(taskq_ent__birth, taskq_ent_t *, t); + + spin_unlock(&t->tqent_lock); + + wake_up(&tq->tq_work_waitq); +out: + /* Spawn additional taskq threads if required. 
*/
+	if (tq->tq_nactive == tq->tq_nthreads)
+		(void) taskq_thread_spawn(tq);
+out2:
+	spin_unlock_irqrestore(&tq->tq_lock, irqflags);
+}
+EXPORT_SYMBOL(taskq_dispatch_ent);
+
+int
+taskq_empty_ent(taskq_ent_t *t)
+{
+	return (list_empty(&t->tqent_list));
+}
+EXPORT_SYMBOL(taskq_empty_ent);
+
+void
+taskq_init_ent(taskq_ent_t *t)
+{
+	spin_lock_init(&t->tqent_lock);
+	init_waitqueue_head(&t->tqent_waitq);
+	timer_setup(&t->tqent_timer, NULL, 0);
+	INIT_LIST_HEAD(&t->tqent_list);
+	t->tqent_id = 0;
+	t->tqent_func = NULL;
+	t->tqent_arg = NULL;
+	t->tqent_flags = 0;
+	t->tqent_taskq = NULL;
+}
+EXPORT_SYMBOL(taskq_init_ent);
+
+/*
+ * Return the next pending task; preference is given to tasks on the
+ * priority list which were dispatched with TQ_FRONT.
+ */
+static taskq_ent_t *
+taskq_next_ent(taskq_t *tq)
+{
+	struct list_head *list;
+
+	if (!list_empty(&tq->tq_prio_list))
+		list = &tq->tq_prio_list;
+	else if (!list_empty(&tq->tq_pend_list))
+		list = &tq->tq_pend_list;
+	else
+		return (NULL);
+
+	return (list_entry(list->next, taskq_ent_t, tqent_list));
+}
+
+/*
+ * Spawns a new thread for the specified taskq.
+ */
+static void
+taskq_thread_spawn_task(void *arg)
+{
+	taskq_t *tq = (taskq_t *)arg;
+	unsigned long flags;
+
+	if (taskq_thread_create(tq) == NULL) {
+		/* restore spawning count if failed */
+		spin_lock_irqsave_nested(&tq->tq_lock, flags,
+		    tq->tq_lock_class);
+		tq->tq_nspawn--;
+		spin_unlock_irqrestore(&tq->tq_lock, flags);
+	}
+}
+
+/*
+ * Spawn additional threads for dynamic taskqs (TASKQ_DYNAMIC) when the
+ * current number of threads is insufficient to handle the pending tasks.
+ * These new threads must be created by the dedicated dynamic_taskq to
+ * avoid deadlocks between thread creation and memory reclaim. The
+ * system_taskq, which is also a dynamic taskq, cannot be safely used
+ * for this.
+ */
+static int
+taskq_thread_spawn(taskq_t *tq)
+{
+	int spawning = 0;
+
+	if (!(tq->tq_flags & TASKQ_DYNAMIC))
+		return (0);
+
+	if ((tq->tq_nthreads + tq->tq_nspawn < tq->tq_maxthreads) &&
+	    (tq->tq_flags & TASKQ_ACTIVE)) {
+		spawning = (++tq->tq_nspawn);
+		taskq_dispatch(dynamic_taskq, taskq_thread_spawn_task,
+		    tq, TQ_NOSLEEP);
+	}
+
+	return (spawning);
+}
+
+/*
+ * Threads in a dynamic taskq should only exit once it has been completely
+ * drained and no other threads are actively servicing tasks. This prevents
+ * threads from being created and destroyed more than is required.
+ *
+ * The first thread in the thread list is treated as the primary thread.
+ * There is nothing special about the primary thread, but to keep all the
+ * taskq pids from changing we opt to make it long running.
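+ *
+ * Editor's worked example: a TASKQ_DYNAMIC taskq that grew to eight
+ * threads under load shrinks back as the queue drains; per the checks
+ * below an idle thread exits only when nothing is spawning or active,
+ * no tasks are pending, and more than one thread remains, so the
+ * primary thread is never reaped.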
+ */ +static int +taskq_thread_should_stop(taskq_t *tq, taskq_thread_t *tqt) +{ + if (!(tq->tq_flags & TASKQ_DYNAMIC)) + return (0); + + if (list_first_entry(&(tq->tq_thread_list), taskq_thread_t, + tqt_thread_list) == tqt) + return (0); + + return + ((tq->tq_nspawn == 0) && /* No threads are being spawned */ + (tq->tq_nactive == 0) && /* No threads are handling tasks */ + (tq->tq_nthreads > 1) && /* More than 1 thread is running */ + (!taskq_next_ent(tq)) && /* There are no pending tasks */ + (spl_taskq_thread_dynamic)); /* Dynamic taskqs are allowed */ +} + +static int +taskq_thread(void *args) +{ + DECLARE_WAITQUEUE(wait, current); + sigset_t blocked; + taskq_thread_t *tqt = args; + taskq_t *tq; + taskq_ent_t *t; + int seq_tasks = 0; + unsigned long flags; + taskq_ent_t dup_task = {}; + + ASSERT(tqt); + ASSERT(tqt->tqt_tq); + tq = tqt->tqt_tq; + current->flags |= PF_NOFREEZE; + + (void) spl_fstrans_mark(); + + sigfillset(&blocked); + sigprocmask(SIG_BLOCK, &blocked, NULL); + flush_signals(current); + + tsd_set(taskq_tsd, tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* + * If we are dynamically spawned, decrease spawning count. Note that + * we could be created during taskq_create, in which case we shouldn't + * do the decrement. But it's fine because taskq_create will reset + * tq_nspawn later. + */ + if (tq->tq_flags & TASKQ_DYNAMIC) + tq->tq_nspawn--; + + /* Immediately exit if more threads than allowed were created. */ + if (tq->tq_nthreads >= tq->tq_maxthreads) + goto error; + + tq->tq_nthreads++; + list_add_tail(&tqt->tqt_thread_list, &tq->tq_thread_list); + wake_up(&tq->tq_wait_waitq); + set_current_state(TASK_INTERRUPTIBLE); + + while (!kthread_should_stop()) { + + if (list_empty(&tq->tq_pend_list) && + list_empty(&tq->tq_prio_list)) { + + if (taskq_thread_should_stop(tq, tqt)) { + wake_up_all(&tq->tq_wait_waitq); + break; + } + + add_wait_queue_exclusive(&tq->tq_work_waitq, &wait); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + schedule(); + seq_tasks = 0; + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + remove_wait_queue(&tq->tq_work_waitq, &wait); + } else { + __set_current_state(TASK_RUNNING); + } + + if ((t = taskq_next_ent(tq)) != NULL) { + list_del_init(&t->tqent_list); + + /* + * A TQENT_FLAG_PREALLOC task may be reused or freed + * during the task function call. Store tqent_id and + * tqent_flags here. + * + * Also use an on stack taskq_ent_t for tqt_task + * assignment in this case; we want to make sure + * to duplicate all fields, so the values are + * correct when it's accessed via DTRACE_PROBE*. + */ + tqt->tqt_id = t->tqent_id; + tqt->tqt_flags = t->tqent_flags; + + if (t->tqent_flags & TQENT_FLAG_PREALLOC) { + dup_task = *t; + t = &dup_task; + } + tqt->tqt_task = t; + + taskq_insert_in_order(tq, tqt); + tq->tq_nactive++; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + DTRACE_PROBE1(taskq_ent__start, taskq_ent_t *, t); + + /* Perform the requested task */ + t->tqent_func(t->tqent_arg); + + DTRACE_PROBE1(taskq_ent__finish, taskq_ent_t *, t); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + tq->tq_nactive--; + list_del_init(&tqt->tqt_active_list); + tqt->tqt_task = NULL; + + /* For prealloc'd tasks, we don't free anything. 
*/ + if (!(tqt->tqt_flags & TQENT_FLAG_PREALLOC)) + task_done(tq, t); + + /* + * When the current lowest outstanding taskqid is + * done calculate the new lowest outstanding id + */ + if (tq->tq_lowest_id == tqt->tqt_id) { + tq->tq_lowest_id = taskq_lowest_id(tq); + ASSERT3S(tq->tq_lowest_id, >, tqt->tqt_id); + } + + /* Spawn additional taskq threads if required. */ + if ((++seq_tasks) > spl_taskq_thread_sequential && + taskq_thread_spawn(tq)) + seq_tasks = 0; + + tqt->tqt_id = TASKQID_INVALID; + tqt->tqt_flags = 0; + wake_up_all(&tq->tq_wait_waitq); + } else { + if (taskq_thread_should_stop(tq, tqt)) + break; + } + + set_current_state(TASK_INTERRUPTIBLE); + + } + + __set_current_state(TASK_RUNNING); + tq->tq_nthreads--; + list_del_init(&tqt->tqt_thread_list); +error: + kmem_free(tqt, sizeof (taskq_thread_t)); + spin_unlock_irqrestore(&tq->tq_lock, flags); + + tsd_set(taskq_tsd, NULL); + + return (0); +} + +static taskq_thread_t * +taskq_thread_create(taskq_t *tq) +{ + static int last_used_cpu = 0; + taskq_thread_t *tqt; + + tqt = kmem_alloc(sizeof (*tqt), KM_PUSHPAGE); + INIT_LIST_HEAD(&tqt->tqt_thread_list); + INIT_LIST_HEAD(&tqt->tqt_active_list); + tqt->tqt_tq = tq; + tqt->tqt_id = TASKQID_INVALID; + + tqt->tqt_thread = spl_kthread_create(taskq_thread, tqt, + "%s", tq->tq_name); + if (tqt->tqt_thread == NULL) { + kmem_free(tqt, sizeof (taskq_thread_t)); + return (NULL); + } + + if (spl_taskq_thread_bind) { + last_used_cpu = (last_used_cpu + 1) % num_online_cpus(); + kthread_bind(tqt->tqt_thread, last_used_cpu); + } + + if (spl_taskq_thread_priority) + set_user_nice(tqt->tqt_thread, PRIO_TO_NICE(tq->tq_pri)); + + wake_up_process(tqt->tqt_thread); + + return (tqt); +} + +taskq_t * +taskq_create(const char *name, int nthreads, pri_t pri, + int minalloc, int maxalloc, uint_t flags) +{ + taskq_t *tq; + taskq_thread_t *tqt; + int count = 0, rc = 0, i; + unsigned long irqflags; + + ASSERT(name != NULL); + ASSERT(minalloc >= 0); + ASSERT(maxalloc <= INT_MAX); + ASSERT(!(flags & (TASKQ_CPR_SAFE))); /* Unsupported */ + + /* Scale the number of threads using nthreads as a percentage */ + if (flags & TASKQ_THREADS_CPU_PCT) { + ASSERT(nthreads <= 100); + ASSERT(nthreads >= 0); + nthreads = MIN(nthreads, 100); + nthreads = MAX(nthreads, 0); + nthreads = MAX((num_online_cpus() * nthreads) / 100, 1); + } + + tq = kmem_alloc(sizeof (*tq), KM_PUSHPAGE); + if (tq == NULL) + return (NULL); + + spin_lock_init(&tq->tq_lock); + INIT_LIST_HEAD(&tq->tq_thread_list); + INIT_LIST_HEAD(&tq->tq_active_list); + tq->tq_name = kmem_strdup(name); + tq->tq_nactive = 0; + tq->tq_nthreads = 0; + tq->tq_nspawn = 0; + tq->tq_maxthreads = nthreads; + tq->tq_pri = pri; + tq->tq_minalloc = minalloc; + tq->tq_maxalloc = maxalloc; + tq->tq_nalloc = 0; + tq->tq_flags = (flags | TASKQ_ACTIVE); + tq->tq_next_id = TASKQID_INITIAL; + tq->tq_lowest_id = TASKQID_INITIAL; + INIT_LIST_HEAD(&tq->tq_free_list); + INIT_LIST_HEAD(&tq->tq_pend_list); + INIT_LIST_HEAD(&tq->tq_prio_list); + INIT_LIST_HEAD(&tq->tq_delay_list); + init_waitqueue_head(&tq->tq_work_waitq); + init_waitqueue_head(&tq->tq_wait_waitq); + tq->tq_lock_class = TQ_LOCK_GENERAL; + INIT_LIST_HEAD(&tq->tq_taskqs); + + if (flags & TASKQ_PREPOPULATE) { + spin_lock_irqsave_nested(&tq->tq_lock, irqflags, + tq->tq_lock_class); + + for (i = 0; i < minalloc; i++) + task_done(tq, task_alloc(tq, TQ_PUSHPAGE | TQ_NEW, + &irqflags)); + + spin_unlock_irqrestore(&tq->tq_lock, irqflags); + } + + if ((flags & TASKQ_DYNAMIC) && spl_taskq_thread_dynamic) + nthreads = 1; + + for (i = 0; i < 
nthreads; i++) { + tqt = taskq_thread_create(tq); + if (tqt == NULL) + rc = 1; + else + count++; + } + + /* Wait for all threads to be started before potential destroy */ + wait_event(tq->tq_wait_waitq, tq->tq_nthreads == count); + /* + * taskq_thread might have touched nspawn, but we don't want them to + * because they're not dynamically spawned. So we reset it to 0 + */ + tq->tq_nspawn = 0; + + if (rc) { + taskq_destroy(tq); + tq = NULL; + } else { + down_write(&tq_list_sem); + tq->tq_instance = taskq_find_by_name(name) + 1; + list_add_tail(&tq->tq_taskqs, &tq_list); + up_write(&tq_list_sem); + } + + return (tq); +} +EXPORT_SYMBOL(taskq_create); + +void +taskq_destroy(taskq_t *tq) +{ + struct task_struct *thread; + taskq_thread_t *tqt; + taskq_ent_t *t; + unsigned long flags; + + ASSERT(tq); + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + tq->tq_flags &= ~TASKQ_ACTIVE; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + /* + * When TASKQ_ACTIVE is clear new tasks may not be added nor may + * new worker threads be spawned for dynamic taskq. + */ + if (dynamic_taskq != NULL) + taskq_wait_outstanding(dynamic_taskq, 0); + + taskq_wait(tq); + + /* remove taskq from global list used by the kstats */ + down_write(&tq_list_sem); + list_del(&tq->tq_taskqs); + up_write(&tq_list_sem); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, tq->tq_lock_class); + /* wait for spawning threads to insert themselves to the list */ + while (tq->tq_nspawn) { + spin_unlock_irqrestore(&tq->tq_lock, flags); + schedule_timeout_interruptible(1); + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + /* + * Signal each thread to exit and block until it does. Each thread + * is responsible for removing itself from the list and freeing its + * taskq_thread_t. This allows for idle threads to opt to remove + * themselves from the taskq. They can be recreated as needed. + */ + while (!list_empty(&tq->tq_thread_list)) { + tqt = list_entry(tq->tq_thread_list.next, + taskq_thread_t, tqt_thread_list); + thread = tqt->tqt_thread; + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kthread_stop(thread); + + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + } + + while (!list_empty(&tq->tq_free_list)) { + t = list_entry(tq->tq_free_list.next, taskq_ent_t, tqent_list); + + ASSERT(!(t->tqent_flags & TQENT_FLAG_PREALLOC)); + + list_del_init(&t->tqent_list); + task_free(tq, t); + } + + ASSERT0(tq->tq_nthreads); + ASSERT0(tq->tq_nalloc); + ASSERT0(tq->tq_nspawn); + ASSERT(list_empty(&tq->tq_thread_list)); + ASSERT(list_empty(&tq->tq_active_list)); + ASSERT(list_empty(&tq->tq_free_list)); + ASSERT(list_empty(&tq->tq_pend_list)); + ASSERT(list_empty(&tq->tq_prio_list)); + ASSERT(list_empty(&tq->tq_delay_list)); + + spin_unlock_irqrestore(&tq->tq_lock, flags); + + kmem_strfree(tq->tq_name); + kmem_free(tq, sizeof (taskq_t)); +} +EXPORT_SYMBOL(taskq_destroy); + + +static unsigned int spl_taskq_kick = 0; + +/* + * 2.6.36 API Change + * module_param_cb is introduced to take kernel_param_ops and + * module_param_call is marked as obsolete. Also set and get operations + * were changed to take a 'const struct kernel_param *'. 
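+ *
+ * Editor's illustrative usage (the set handler below resets the value
+ * to 0 after scanning, so each nonzero write triggers a fresh scan):
+ *
+ *	# echo 1 >/sys/module/spl/parameters/spl_taskq_kick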
+ */ +static int +#ifdef module_param_cb +param_set_taskq_kick(const char *val, const struct kernel_param *kp) +#else +param_set_taskq_kick(const char *val, struct kernel_param *kp) +#endif +{ + int ret; + taskq_t *tq = NULL; + taskq_ent_t *t; + unsigned long flags; + + ret = param_set_uint(val, kp); + if (ret < 0 || !spl_taskq_kick) + return (ret); + /* reset value */ + spl_taskq_kick = 0; + + down_read(&tq_list_sem); + list_for_each_entry(tq, &tq_list, tq_taskqs) { + spin_lock_irqsave_nested(&tq->tq_lock, flags, + tq->tq_lock_class); + /* Check if the first pending is older than 5 seconds */ + t = taskq_next_ent(tq); + if (t && time_after(jiffies, t->tqent_birth + 5*HZ)) { + (void) taskq_thread_spawn(tq); + printk(KERN_INFO "spl: Kicked taskq %s/%d\n", + tq->tq_name, tq->tq_instance); + } + spin_unlock_irqrestore(&tq->tq_lock, flags); + } + up_read(&tq_list_sem); + return (ret); +} + +#ifdef module_param_cb +static const struct kernel_param_ops param_ops_taskq_kick = { + .set = param_set_taskq_kick, + .get = param_get_uint, +}; +module_param_cb(spl_taskq_kick, ¶m_ops_taskq_kick, &spl_taskq_kick, 0644); +#else +module_param_call(spl_taskq_kick, param_set_taskq_kick, param_get_uint, + &spl_taskq_kick, 0644); +#endif +MODULE_PARM_DESC(spl_taskq_kick, + "Write nonzero to kick stuck taskqs to spawn more threads"); + +int +spl_taskq_init(void) +{ + init_rwsem(&tq_list_sem); + tsd_create(&taskq_tsd, NULL); + + system_taskq = taskq_create("spl_system_taskq", MAX(boot_ncpus, 64), + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); + if (system_taskq == NULL) + return (1); + + system_delay_taskq = taskq_create("spl_delay_taskq", MAX(boot_ncpus, 4), + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE|TASKQ_DYNAMIC); + if (system_delay_taskq == NULL) { + taskq_destroy(system_taskq); + return (1); + } + + dynamic_taskq = taskq_create("spl_dynamic_taskq", 1, + maxclsyspri, boot_ncpus, INT_MAX, TASKQ_PREPOPULATE); + if (dynamic_taskq == NULL) { + taskq_destroy(system_taskq); + taskq_destroy(system_delay_taskq); + return (1); + } + + /* + * This is used to annotate tq_lock, so + * taskq_dispatch -> taskq_thread_spawn -> taskq_dispatch + * does not trigger a lockdep warning re: possible recursive locking + */ + dynamic_taskq->tq_lock_class = TQ_LOCK_DYNAMIC; + + return (0); +} + +void +spl_taskq_fini(void) +{ + taskq_destroy(dynamic_taskq); + dynamic_taskq = NULL; + + taskq_destroy(system_delay_taskq); + system_delay_taskq = NULL; + + taskq_destroy(system_taskq); + system_taskq = NULL; + + tsd_destroy(&taskq_tsd); +} diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c new file mode 100644 index 000000000000..0352a31ea835 --- /dev/null +++ b/module/os/linux/spl/spl-thread.c @@ -0,0 +1,161 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. 
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see .
+ *
+ * Solaris Porting Layer (SPL) Thread Implementation.
+ */
+
+#include
+#include
+#include
+
+/*
+ * Thread interfaces
+ */
+typedef struct thread_priv_s {
+	unsigned long tp_magic;		/* Magic */
+	int tp_name_size;		/* Name size */
+	char *tp_name;			/* Name (without _thread suffix) */
+	void (*tp_func)(void *);	/* Registered function */
+	void *tp_args;			/* Args to be passed to function */
+	size_t tp_len;			/* Len to be passed to function */
+	int tp_state;			/* State to start thread at */
+	pri_t tp_pri;			/* Priority to start thread at */
+} thread_priv_t;
+
+static int
+thread_generic_wrapper(void *arg)
+{
+	thread_priv_t *tp = (thread_priv_t *)arg;
+	void (*func)(void *);
+	void *args;
+
+	ASSERT(tp->tp_magic == TP_MAGIC);
+	func = tp->tp_func;
+	args = tp->tp_args;
+	set_current_state(tp->tp_state);
+	set_user_nice((kthread_t *)current, PRIO_TO_NICE(tp->tp_pri));
+	kmem_free(tp->tp_name, tp->tp_name_size);
+	kmem_free(tp, sizeof (thread_priv_t));
+
+	if (func)
+		func(args);
+
+	return (0);
+}
+
+void
+__thread_exit(void)
+{
+	tsd_exit();
+	complete_and_exit(NULL, 0);
+	/* Unreachable */
+}
+EXPORT_SYMBOL(__thread_exit);
+
+/*
+ * thread_create() may block forever if it cannot create a thread or
+ * allocate memory. This is preferable to returning a NULL, which
+ * Solaris-style callers likely never check for... since it can't fail.
+ */
+kthread_t *
+__thread_create(caddr_t stk, size_t stksize, thread_func_t func,
+    const char *name, void *args, size_t len, proc_t *pp, int state, pri_t pri)
+{
+	thread_priv_t *tp;
+	struct task_struct *tsk;
+	char *p;
+
+	/* Option pp is simply ignored */
+	/* Variable stack size unsupported */
+	ASSERT(stk == NULL);
+
+	tp = kmem_alloc(sizeof (thread_priv_t), KM_PUSHPAGE);
+	if (tp == NULL)
+		return (NULL);
+
+	tp->tp_magic = TP_MAGIC;
+	tp->tp_name_size = strlen(name) + 1;
+
+	tp->tp_name = kmem_alloc(tp->tp_name_size, KM_PUSHPAGE);
+	if (tp->tp_name == NULL) {
+		kmem_free(tp, sizeof (thread_priv_t));
+		return (NULL);
+	}
+
+	strncpy(tp->tp_name, name, tp->tp_name_size);
+
+	/*
+	 * Strip the trailing "_thread" from the passed name, which will
+	 * be the func name, since the exposed API has no parameter for
+	 * passing a name.
+	 */
+	p = strstr(tp->tp_name, "_thread");
+	if (p)
+		p[0] = '\0';
+
+	tp->tp_func = func;
+	tp->tp_args = args;
+	tp->tp_len = len;
+	tp->tp_state = state;
+	tp->tp_pri = pri;
+
+	tsk = spl_kthread_create(thread_generic_wrapper, (void *)tp,
+	    "%s", tp->tp_name);
+	if (IS_ERR(tsk))
+		return (NULL);
+
+	wake_up_process(tsk);
+	return ((kthread_t *)tsk);
+}
+EXPORT_SYMBOL(__thread_create);
+
+/*
+ * spl_kthread_create - Wrapper providing pre-3.13 semantics for
+ * kthread_create() in which it is not killable and less likely
+ * to return -ENOMEM.
+ */
+struct task_struct *
+spl_kthread_create(int (*func)(void *), void *data, const char namefmt[], ...)
+{
+	struct task_struct *tsk;
+	va_list args;
+	char name[TASK_COMM_LEN];
+
+	va_start(args, namefmt);
+	vsnprintf(name, sizeof (name), namefmt, args);
+	va_end(args);
+	do {
+		tsk = kthread_create(func, data, "%s", name);
+		if (IS_ERR(tsk)) {
+			if (signal_pending(current)) {
+				clear_thread_flag(TIF_SIGPENDING);
+				continue;
+			}
+			if (PTR_ERR(tsk) == -ENOMEM)
+				continue;
+			return (NULL);
+		} else {
+			return (tsk);
+		}
+	} while (1);
+}
+EXPORT_SYMBOL(spl_kthread_create);
diff --git a/module/os/linux/spl/spl-trace.c b/module/os/linux/spl/spl-trace.c
new file mode 100644
index 000000000000..7912a381294d
--- /dev/null
+++ b/module/os/linux/spl/spl-trace.c
@@ -0,0 +1,33 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Each DTRACE_PROBE must define its trace point in one (and only one)
+ * source file, so this dummy file exists for that purpose.
+ */
+
+#include
+
+#ifdef _KERNEL
+#define CREATE_TRACE_POINTS
+#include
+#include
+#endif
diff --git a/module/os/linux/spl/spl-tsd.c b/module/os/linux/spl/spl-tsd.c
new file mode 100644
index 000000000000..b955ed65470f
--- /dev/null
+++ b/module/os/linux/spl/spl-tsd.c
@@ -0,0 +1,720 @@
+/*
+ * Copyright (C) 2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf .
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see .
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see .
+ *
+ *
+ * Solaris Porting Layer (SPL) Thread Specific Data Implementation.
+ *
+ * Thread specific data has been implemented using a hash table, which
+ * avoids the need to add a member to the task structure and allows
+ * maximum portability between kernels. This implementation has been
+ * optimized to keep the tsd_set() and tsd_get() times as small as
+ * possible.
+ *
+ * The majority of the entries in the hash table are for specific tsd
+ * entries. These entries are hashed by the product of their key and
+ * pid because by design the key and pid are guaranteed to be unique.
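+ *
+ * As a small illustrative sketch (mirroring tsd_hash_search() below),
+ * the bin holding a given key/pid pair is located as:
+ *
+ *	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+ *	bin = &table->ht_bins[hash];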
+ * This product also has the desirable property that it will be uniformly
+ * distributed over the hash bins provided neither the pid nor the key is
+ * zero. Under linux the zero pid is always the init process and thus won't
+ * be used, and this implementation is careful never to assign a zero key.
+ * By default the hash table is sized to 512 bins, which is expected to
+ * be sufficient for light to moderate usage of thread specific data.
+ *
+ * The hash table contains two additional types of entries. The first
+ * type is called a 'key' entry; it is added to the hash during
+ * tsd_create(). It is used to store the address of the destructor function
+ * and it is used as an anchor point. All tsd entries which use the same
+ * key will be linked to this entry. This is used during tsd_destroy() to
+ * quickly call the destructor function for all tsd associated with the key.
+ * The 'key' entry may be looked up with tsd_hash_search() by passing the
+ * key you wish to look up and the DTOR_PID constant as the pid.
+ *
+ * The second type of entry is called a 'pid' entry and it is added to the
+ * hash the first time a process sets a key. The 'pid' entry is also used
+ * as an anchor and all tsd for the process will be linked to it. This
+ * list is used during tsd_exit() to ensure all registered destructors
+ * are run for the process. The 'pid' entry may be looked up with
+ * tsd_hash_search() by passing the PID_KEY constant as the key, and
+ * the process pid. Note that tsd_exit() is called by thread_exit()
+ * so if you're using the Solaris thread API you should not need to call
+ * tsd_exit() directly.
+ *
+ */
+
+#include
+#include
+#include
+#include
+
+typedef struct tsd_hash_bin {
+	spinlock_t hb_lock;
+	struct hlist_head hb_head;
+} tsd_hash_bin_t;
+
+typedef struct tsd_hash_table {
+	spinlock_t ht_lock;
+	uint_t ht_bits;
+	uint_t ht_key;
+	tsd_hash_bin_t *ht_bins;
+} tsd_hash_table_t;
+
+typedef struct tsd_hash_entry {
+	uint_t he_key;
+	pid_t he_pid;
+	dtor_func_t he_dtor;
+	void *he_value;
+	struct hlist_node he_list;
+	struct list_head he_key_list;
+	struct list_head he_pid_list;
+} tsd_hash_entry_t;
+
+static tsd_hash_table_t *tsd_hash_table = NULL;
+
+
+/*
+ * tsd_hash_search - searches hash table for tsd_hash_entry
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ */
+static tsd_hash_entry_t *
+tsd_hash_search(tsd_hash_table_t *table, uint_t key, pid_t pid)
+{
+	struct hlist_node *node = NULL;
+	tsd_hash_entry_t *entry;
+	tsd_hash_bin_t *bin;
+	ulong_t hash;
+
+	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+	bin = &table->ht_bins[hash];
+	spin_lock(&bin->hb_lock);
+	hlist_for_each(node, &bin->hb_head) {
+		entry = list_entry(node, tsd_hash_entry_t, he_list);
+		if ((entry->he_key == key) && (entry->he_pid == pid)) {
+			spin_unlock(&bin->hb_lock);
+			return (entry);
+		}
+	}
+
+	spin_unlock(&bin->hb_lock);
+	return (NULL);
+}
+
+/*
+ * tsd_hash_dtor - call the destructor and free all entries on the list
+ * @work: list of hash entries
+ *
+ * For a list of entries which have all already been removed from the
+ * hash, call their registered destructor, then free the associated memory.
+ */
+static void
+tsd_hash_dtor(struct hlist_head *work)
+{
+	tsd_hash_entry_t *entry;
+
+	while (!hlist_empty(work)) {
+		entry = hlist_entry(work->first, tsd_hash_entry_t, he_list);
+		hlist_del(&entry->he_list);
+
+		if (entry->he_dtor && entry->he_pid != DTOR_PID)
+			entry->he_dtor(entry->he_value);
+
+		kmem_free(entry, sizeof (tsd_hash_entry_t));
+	}
+}
+
+/*
+ * tsd_hash_add - adds an entry to hash table
+ * @table: hash table
+ * @key: search key
+ * @pid: search pid
+ *
+ * The caller is responsible for ensuring the unique key/pid do not
+ * already exist in the hash table. This is possible because all entries
+ * are thread specific, thus a concurrent thread will never attempt to
+ * add this key/pid. Because multiple bins must be checked to add
+ * links to the dtor and pid entries, the entire table is locked.
+ */
+static int
+tsd_hash_add(tsd_hash_table_t *table, uint_t key, pid_t pid, void *value)
+{
+	tsd_hash_entry_t *entry, *dtor_entry, *pid_entry;
+	tsd_hash_bin_t *bin;
+	ulong_t hash;
+	int rc = 0;
+
+	ASSERT3P(tsd_hash_search(table, key, pid), ==, NULL);
+
+	/* New entry: allocate structure, set value, and add to hash */
+	entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE);
+	if (entry == NULL)
+		return (ENOMEM);
+
+	entry->he_key = key;
+	entry->he_pid = pid;
+	entry->he_value = value;
+	INIT_HLIST_NODE(&entry->he_list);
+	INIT_LIST_HEAD(&entry->he_key_list);
+	INIT_LIST_HEAD(&entry->he_pid_list);
+
+	spin_lock(&table->ht_lock);
+
+	/* Destructor entry must exist for all valid keys */
+	dtor_entry = tsd_hash_search(table, entry->he_key, DTOR_PID);
+	ASSERT3P(dtor_entry, !=, NULL);
+	entry->he_dtor = dtor_entry->he_dtor;
+
+	/* Process entry must exist for all valid processes */
+	pid_entry = tsd_hash_search(table, PID_KEY, entry->he_pid);
+	ASSERT3P(pid_entry, !=, NULL);
+
+	hash = hash_long((ulong_t)key * (ulong_t)pid, table->ht_bits);
+	bin = &table->ht_bins[hash];
+	spin_lock(&bin->hb_lock);
+
+	/* Add to the hash, key, and pid lists */
+	hlist_add_head(&entry->he_list, &bin->hb_head);
+	list_add(&entry->he_key_list, &dtor_entry->he_key_list);
+	list_add(&entry->he_pid_list, &pid_entry->he_pid_list);
+
+	spin_unlock(&bin->hb_lock);
+	spin_unlock(&table->ht_lock);
+
+	return (rc);
+}
+
+/*
+ * tsd_hash_add_key - adds a destructor entry to the hash table
+ * @table: hash table
+ * @keyp: search key
+ * @dtor: key destructor
+ *
+ * For every unique key there is a single entry in the hash which is used
+ * as an anchor. All other thread specific entries for this key are linked
+ * to this anchor via the 'he_key_list' list head. On return *keyp
+ * will be set to the next available key for the hash table.
+ */ +static int +tsd_hash_add_key(tsd_hash_table_t *table, uint_t *keyp, dtor_func_t dtor) +{ + tsd_hash_entry_t *tmp_entry, *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + int keys_checked = 0; + + ASSERT3P(table, !=, NULL); + + /* Allocate entry to be used as a destructor for this key */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + /* Determine next available key value */ + spin_lock(&table->ht_lock); + do { + /* Limited to TSD_KEYS_MAX concurrent unique keys */ + if (table->ht_key++ > TSD_KEYS_MAX) + table->ht_key = 1; + + /* Ensure failure when all TSD_KEYS_MAX keys are in use */ + if (keys_checked++ >= TSD_KEYS_MAX) { + spin_unlock(&table->ht_lock); + return (ENOENT); + } + + tmp_entry = tsd_hash_search(table, table->ht_key, DTOR_PID); + } while (tmp_entry); + + /* Add destructor entry in to hash table */ + entry->he_key = *keyp = table->ht_key; + entry->he_pid = DTOR_PID; + entry->he_dtor = dtor; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)*keyp * (ulong_t)DTOR_PID, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_add_pid - adds a process entry to the hash table + * @table: hash table + * @pid: search pid + * + * For every process there is a single entry in the hash which is used + * as anchor. All other thread specific entries for this process are + * linked to this anchor via the 'he_pid_list' list head. + */ +static int +tsd_hash_add_pid(tsd_hash_table_t *table, pid_t pid) +{ + tsd_hash_entry_t *entry; + tsd_hash_bin_t *bin; + ulong_t hash; + + /* Allocate entry to be used as the process reference */ + entry = kmem_alloc(sizeof (tsd_hash_entry_t), KM_PUSHPAGE); + if (entry == NULL) + return (ENOMEM); + + spin_lock(&table->ht_lock); + entry->he_key = PID_KEY; + entry->he_pid = pid; + entry->he_dtor = NULL; + entry->he_value = NULL; + INIT_HLIST_NODE(&entry->he_list); + INIT_LIST_HEAD(&entry->he_key_list); + INIT_LIST_HEAD(&entry->he_pid_list); + + hash = hash_long((ulong_t)PID_KEY * (ulong_t)pid, table->ht_bits); + bin = &table->ht_bins[hash]; + spin_lock(&bin->hb_lock); + + hlist_add_head(&entry->he_list, &bin->hb_head); + + spin_unlock(&bin->hb_lock); + spin_unlock(&table->ht_lock); + + return (0); +} + +/* + * tsd_hash_del - delete an entry from hash table, key, and pid lists + * @table: hash table + * @key: search key + * @pid: search pid + */ +static void +tsd_hash_del(tsd_hash_table_t *table, tsd_hash_entry_t *entry) +{ + hlist_del(&entry->he_list); + list_del_init(&entry->he_key_list); + list_del_init(&entry->he_pid_list); +} + +/* + * tsd_hash_table_init - allocate a hash table + * @bits: hash table size + * + * A hash table with 2^bits bins will be created, it may not be resized + * after the fact and must be free'd with tsd_hash_table_fini(). 
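+ *
+ * For example, the table created by spl_tsd_init() below is sized by
+ * TSD_HASH_TABLE_BITS_DEFAULT; if that value is 9, this gives
+ * 2^9 = 512 bins, matching the 512-bin default described in the file
+ * header.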
+ */ +static tsd_hash_table_t * +tsd_hash_table_init(uint_t bits) +{ + tsd_hash_table_t *table; + int hash, size = (1 << bits); + + table = kmem_zalloc(sizeof (tsd_hash_table_t), KM_SLEEP); + if (table == NULL) + return (NULL); + + table->ht_bins = kmem_zalloc(sizeof (tsd_hash_bin_t) * size, KM_SLEEP); + if (table->ht_bins == NULL) { + kmem_free(table, sizeof (tsd_hash_table_t)); + return (NULL); + } + + for (hash = 0; hash < size; hash++) { + spin_lock_init(&table->ht_bins[hash].hb_lock); + INIT_HLIST_HEAD(&table->ht_bins[hash].hb_head); + } + + spin_lock_init(&table->ht_lock); + table->ht_bits = bits; + table->ht_key = 1; + + return (table); +} + +/* + * tsd_hash_table_fini - free a hash table + * @table: hash table + * + * Free a hash table allocated by tsd_hash_table_init(). If the hash + * table is not empty this function will call the proper destructor for + * all remaining entries before freeing the memory used by those entries. + */ +static void +tsd_hash_table_fini(tsd_hash_table_t *table) +{ + HLIST_HEAD(work); + tsd_hash_bin_t *bin; + tsd_hash_entry_t *entry; + int size, i; + + ASSERT3P(table, !=, NULL); + spin_lock(&table->ht_lock); + for (i = 0, size = (1 << table->ht_bits); i < size; i++) { + bin = &table->ht_bins[i]; + spin_lock(&bin->hb_lock); + while (!hlist_empty(&bin->hb_head)) { + entry = hlist_entry(bin->hb_head.first, + tsd_hash_entry_t, he_list); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + } + spin_unlock(&bin->hb_lock); + } + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + kmem_free(table->ht_bins, sizeof (tsd_hash_bin_t)*(1<ht_bits)); + kmem_free(table, sizeof (tsd_hash_table_t)); +} + +/* + * tsd_remove_entry - remove a tsd entry for this thread + * @entry: entry to remove + * + * Remove the thread specific data @entry for this thread. + * If this is the last entry for this thread, also remove the PID entry. + */ +static void +tsd_remove_entry(tsd_hash_entry_t *entry) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + ASSERT3P(entry, !=, NULL); + + spin_lock(&table->ht_lock); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + /* save the possible pid_entry */ + pid_entry = list_entry(entry->he_pid_list.next, tsd_hash_entry_t, + he_pid_list); + + /* remove entry */ + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + + /* if pid_entry is indeed pid_entry, then remove it if it's empty */ + if (pid_entry->he_key == PID_KEY && + list_empty(&pid_entry->he_pid_list)) { + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + } + + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} + +/* + * tsd_set - set thread specific data + * @key: lookup key + * @value: value to set + * + * Caller must prevent racing tsd_create() or tsd_destroy(), protected + * from racing tsd_get() or tsd_set() because it is thread specific. + * This function has been optimized to be fast for the update case. 
+ * When setting the tsd initially, it will be slower due to additional
+ * required locking and potential memory allocations.
+ */
+int
+tsd_set(uint_t key, void *value)
+{
+	tsd_hash_table_t *table;
+	tsd_hash_entry_t *entry;
+	pid_t pid;
+	int rc;
+	/* mark remove if value is NULL */
+	boolean_t remove = (value == NULL);
+
+	table = tsd_hash_table;
+	pid = curthread->pid;
+	ASSERT3P(table, !=, NULL);
+
+	if ((key == 0) || (key > TSD_KEYS_MAX))
+		return (EINVAL);
+
+	/* Entry already exists in the hash table, update its value */
+	entry = tsd_hash_search(table, key, pid);
+	if (entry) {
+		entry->he_value = value;
+		/* remove the entry */
+		if (remove)
+			tsd_remove_entry(entry);
+		return (0);
+	}
+
+	/* don't create entry if value is NULL */
+	if (remove)
+		return (0);
+
+	/* Add a process entry to the hash if one does not yet exist */
+	entry = tsd_hash_search(table, PID_KEY, pid);
+	if (entry == NULL) {
+		rc = tsd_hash_add_pid(table, pid);
+		if (rc)
+			return (rc);
+	}
+
+	rc = tsd_hash_add(table, key, pid, value);
+	return (rc);
+}
+EXPORT_SYMBOL(tsd_set);
+
+/*
+ * tsd_get - get thread specific data
+ * @key: lookup key
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get(uint_t key)
+{
+	tsd_hash_entry_t *entry;
+
+	ASSERT3P(tsd_hash_table, !=, NULL);
+
+	if ((key == 0) || (key > TSD_KEYS_MAX))
+		return (NULL);
+
+	entry = tsd_hash_search(tsd_hash_table, key, curthread->pid);
+	if (entry == NULL)
+		return (NULL);
+
+	return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get);
+
+/*
+ * tsd_get_by_thread - get thread specific data for specified thread
+ * @key: lookup key
+ * @thread: thread to lookup
+ *
+ * Caller must prevent racing tsd_create() or tsd_destroy(). This
+ * implementation is designed to be fast and scalable; it does not
+ * lock the entire table, only a single hash bin.
+ */
+void *
+tsd_get_by_thread(uint_t key, kthread_t *thread)
+{
+	tsd_hash_entry_t *entry;
+
+	ASSERT3P(tsd_hash_table, !=, NULL);
+
+	if ((key == 0) || (key > TSD_KEYS_MAX))
+		return (NULL);
+
+	entry = tsd_hash_search(tsd_hash_table, key, thread->pid);
+	if (entry == NULL)
+		return (NULL);
+
+	return (entry->he_value);
+}
+EXPORT_SYMBOL(tsd_get_by_thread);
+
+/*
+ * tsd_create - create thread specific data key
+ * @keyp: lookup key address
+ * @dtor: destructor called during tsd_destroy() or tsd_exit()
+ *
+ * The provided key must be set to 0 or it is assumed to be already in use.
+ * The dtor is allowed to be NULL, in which case no additional cleanup
+ * for the data is performed during tsd_destroy() or tsd_exit().
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
+ */
+void
+tsd_create(uint_t *keyp, dtor_func_t dtor)
+{
+	ASSERT3P(keyp, !=, NULL);
+	if (*keyp)
+		return;
+
+	(void) tsd_hash_add_key(tsd_hash_table, keyp, dtor);
+}
+EXPORT_SYMBOL(tsd_create);
+
+/*
+ * tsd_destroy - destroy thread specific data
+ * @keyp: lookup key address
+ *
+ * Destroys the thread specific data on all threads which use this key.
+ *
+ * Caller must prevent racing tsd_set() or tsd_get(), this function is
+ * safe from racing tsd_create(), tsd_destroy(), and tsd_exit().
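+ *
+ * As a reference, an illustrative lifecycle of these interfaces
+ * (error handling omitted; my_key, my_dtor and my_data are
+ * hypothetical names):
+ *
+ *	static uint_t my_key = 0;
+ *
+ *	tsd_create(&my_key, my_dtor);	(once, during init)
+ *	tsd_set(my_key, my_data);	(in each thread)
+ *	data = tsd_get(my_key);		(in each thread)
+ *	tsd_destroy(&my_key);		(once, during fini; runs my_dtor
+ *					for every thread with data set)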
+ */ +void +tsd_destroy(uint_t *keyp) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *dtor_entry, *entry; + tsd_hash_bin_t *dtor_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + dtor_entry = tsd_hash_search(table, *keyp, DTOR_PID); + if (dtor_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All threads which use this key must be linked off of the + * DTOR_PID entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. + */ + while (!list_empty(&dtor_entry->he_key_list)) { + entry = list_entry(dtor_entry->he_key_list.next, + tsd_hash_entry_t, he_key_list); + ASSERT3U(dtor_entry->he_key, ==, entry->he_key); + ASSERT3P(dtor_entry->he_dtor, ==, entry->he_dtor); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)dtor_entry->he_key * + (ulong_t)dtor_entry->he_pid, table->ht_bits); + dtor_entry_bin = &table->ht_bins[hash]; + + spin_lock(&dtor_entry_bin->hb_lock); + tsd_hash_del(table, dtor_entry); + hlist_add_head(&dtor_entry->he_list, &work); + spin_unlock(&dtor_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); + *keyp = 0; +} +EXPORT_SYMBOL(tsd_destroy); + +/* + * tsd_exit - destroys all thread specific data for this thread + * + * Destroys all the thread specific data for this thread. + * + * Caller must prevent racing tsd_set() or tsd_get(), this function is + * safe from racing tsd_create(), tsd_destroy(), and tsd_exit(). + */ +void +tsd_exit(void) +{ + HLIST_HEAD(work); + tsd_hash_table_t *table; + tsd_hash_entry_t *pid_entry, *entry; + tsd_hash_bin_t *pid_entry_bin, *entry_bin; + ulong_t hash; + + table = tsd_hash_table; + ASSERT3P(table, !=, NULL); + + spin_lock(&table->ht_lock); + pid_entry = tsd_hash_search(table, PID_KEY, curthread->pid); + if (pid_entry == NULL) { + spin_unlock(&table->ht_lock); + return; + } + + /* + * All keys associated with this pid must be linked off of the + * PID_KEY entry. They are removed from the hash table and + * linked in to a private working list to be destroyed. 
+ */ + + while (!list_empty(&pid_entry->he_pid_list)) { + entry = list_entry(pid_entry->he_pid_list.next, + tsd_hash_entry_t, he_pid_list); + ASSERT3U(pid_entry->he_pid, ==, entry->he_pid); + + hash = hash_long((ulong_t)entry->he_key * + (ulong_t)entry->he_pid, table->ht_bits); + entry_bin = &table->ht_bins[hash]; + + spin_lock(&entry_bin->hb_lock); + tsd_hash_del(table, entry); + hlist_add_head(&entry->he_list, &work); + spin_unlock(&entry_bin->hb_lock); + } + + hash = hash_long((ulong_t)pid_entry->he_key * + (ulong_t)pid_entry->he_pid, table->ht_bits); + pid_entry_bin = &table->ht_bins[hash]; + + spin_lock(&pid_entry_bin->hb_lock); + tsd_hash_del(table, pid_entry); + hlist_add_head(&pid_entry->he_list, &work); + spin_unlock(&pid_entry_bin->hb_lock); + spin_unlock(&table->ht_lock); + + tsd_hash_dtor(&work); +} +EXPORT_SYMBOL(tsd_exit); + +int +spl_tsd_init(void) +{ + tsd_hash_table = tsd_hash_table_init(TSD_HASH_TABLE_BITS_DEFAULT); + if (tsd_hash_table == NULL) + return (1); + + return (0); +} + +void +spl_tsd_fini(void) +{ + tsd_hash_table_fini(tsd_hash_table); + tsd_hash_table = NULL; +} diff --git a/module/os/linux/spl/spl-vmem.c b/module/os/linux/spl/spl-vmem.c new file mode 100644 index 000000000000..32372e6f2b60 --- /dev/null +++ b/module/os/linux/spl/spl-vmem.c @@ -0,0 +1,91 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#include +#include +#include +#include +#include +#include + +/* + * Public vmem_alloc(), vmem_zalloc() and vmem_free() interfaces. 
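+ *
+ * These are typically reached through the vmem_alloc(), vmem_zalloc()
+ * and vmem_free() wrapper macros, which supply the caller's function
+ * name and line number used by the optional DEBUG_KMEM_TRACKING
+ * accounting. An illustrative use:
+ *
+ *	buf = vmem_zalloc(size, KM_SLEEP);
+ *	...
+ *	vmem_free(buf, size);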
+ */ +void * +spl_vmem_alloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= KM_VMEM; + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_alloc); + +void * +spl_vmem_zalloc(size_t size, int flags, const char *func, int line) +{ + ASSERT0(flags & ~KM_PUBLIC_MASK); + + flags |= (KM_VMEM | KM_ZERO); + +#if !defined(DEBUG_KMEM) + return (spl_kmem_alloc_impl(size, flags, NUMA_NO_NODE)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_alloc_debug(size, flags, NUMA_NO_NODE)); +#else + return (spl_kmem_alloc_track(size, flags, func, line, NUMA_NO_NODE)); +#endif +} +EXPORT_SYMBOL(spl_vmem_zalloc); + +void +spl_vmem_free(const void *buf, size_t size) +{ +#if !defined(DEBUG_KMEM) + return (spl_kmem_free_impl(buf, size)); +#elif !defined(DEBUG_KMEM_TRACKING) + return (spl_kmem_free_debug(buf, size)); +#else + return (spl_kmem_free_track(buf, size)); +#endif +} +EXPORT_SYMBOL(spl_vmem_free); + +int +spl_vmem_init(void) +{ + return (0); +} + +void +spl_vmem_fini(void) +{ +} diff --git a/module/os/linux/spl/spl-xdr.c b/module/os/linux/spl/spl-xdr.c new file mode 100644 index 000000000000..1dd31ffc1483 --- /dev/null +++ b/module/os/linux/spl/spl-xdr.c @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2008-2010 Sun Microsystems, Inc. + * Written by Ricardo Correia + * + * This file is part of the SPL, Solaris Porting Layer. + * For details, see . + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) XDR Implementation. + */ + +#include +#include +#include +#include +#include +#include + +/* + * SPL's XDR mem implementation. + * + * This is used by libnvpair to serialize/deserialize the name-value pair data + * structures into byte arrays in a well-defined and portable manner. + * + * These data structures are used by the DMU/ZFS to flexibly manipulate various + * information in memory and later serialize it/deserialize it to disk. + * Examples of usages include the pool configuration, lists of pool and dataset + * properties, etc. + * + * Reference documentation for the XDR representation and XDR operations can be + * found in RFC 1832 and xdr(3), respectively. + * + * === Implementation shortcomings === + * + * It is assumed that the following C types have the following sizes: + * + * char/unsigned char: 1 byte + * short/unsigned short: 2 bytes + * int/unsigned int: 4 bytes + * longlong_t/u_longlong_t: 8 bytes + * + * The C standard allows these types to be larger (and in the case of ints, + * shorter), so if that is the case on some compiler/architecture, the build + * will fail (on purpose). 
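+ *
+ * (These size assumptions are enforced at compile time; the encode and
+ * decode routines below contain checks such as
+ *
+ *	BUILD_BUG_ON(sizeof (unsigned) != 4);
+ *
+ * so a mismatched type size breaks the build rather than silently
+ * corrupting streams.)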
+ *
+ * If someone wants to fix the code to work properly in such environments,
+ * then:
+ *
+ * 1) Preconditions should be added to xdrmem_enc functions to make sure the
+ * caller doesn't pass arguments which exceed the expected range.
+ * 2) Functions which take signed integers should be changed to properly do
+ * sign extension.
+ * 3) For ints with less than 32 bits, well.. I suspect you'll have bigger
+ * problems than this implementation.
+ *
+ * It is also assumed that:
+ *
+ * 1) Chars have 8 bits.
+ * 2) We can always do 32-bit-aligned int memory accesses and byte-aligned
+ * memcpy, memset and memcmp.
+ * 3) Arrays passed to xdr_array() are packed and the compiler/architecture
+ * supports element-sized-aligned memory accesses.
+ * 4) Negative integers are natively stored in two's complement binary
+ * representation.
+ *
+ * No checks are done for the 4 assumptions above, though.
+ *
+ * === Caller expectations ===
+ *
+ * Existing documentation does not describe the semantics of XDR operations
+ * very well. Therefore, some assumptions about failure semantics are made
+ * and described below:
+ *
+ * 1) If any encoding operation fails (e.g., due to lack of buffer space),
+ * the stream should be considered valid only up to the encoding operation
+ * prior to the one that first failed. However, the stream size as returned
+ * by xdr_control() cannot be considered to be strictly correct (it may be
+ * bigger).
+ *
+ * Putting it another way, if there is an encoding failure it's undefined
+ * whether anything is added to the stream in that operation and therefore
+ * neither xdr_control() nor future encoding operations on the same stream can
+ * be relied upon to produce correct results.
+ *
+ * 2) If a decoding operation fails, it's undefined whether anything will be
+ * decoded into passed buffers/pointers during that operation, or what the
+ * values in those buffers will look like.
+ *
+ * Future decoding operations on the same stream will also have similar
+ * undefined behavior.
+ *
+ * 3) When the first decoding operation fails it is OK to trust the results of
+ * previous decoding operations on the same stream, as long as the caller
+ * expects a failure to be possible (e.g. due to end-of-stream).
+ *
+ * However, this is highly discouraged because the caller should know the
+ * stream size and should be coded to expect any decoding failure to be data
+ * corruption due to hardware, accidental or even malicious causes, which
+ * should be handled gracefully in all cases.
+ *
+ * In very rare situations where there are strong reasons to believe the data
+ * can be trusted to be valid and non-tampered with, then the caller may assume
+ * a decoding failure to be a bug (e.g. due to mismatched data types) and may
+ * fail non-gracefully.
+ *
+ * 4) Non-zero padding bytes will cause the decoding operation to fail.
+ *
+ * 5) Zero bytes on string types will also cause the decoding operation to
+ * fail.
+ *
+ * 6) It is assumed that either the pointer to the stream buffer given by the
+ * caller is 32-bit aligned or the architecture supports non-32-bit-aligned
+ * int memory accesses.
+ *
+ * 7) The stream buffer and encoding/decoding buffers/ptrs should not overlap.
+ *
+ * 8) If a caller passes pointers to non-kernel memory (e.g., pointers to user
+ * space or MMIO space), the computer may explode.
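+ *
+ * A minimal round trip using these ops might look like the following
+ * (illustrative only, error handling omitted; callers typically go
+ * through the xdr_*() wrappers rather than x_ops directly):
+ *
+ *	XDR xdr;
+ *	unsigned v = 42;
+ *
+ *	xdrmem_create(&xdr, buf, buflen, XDR_ENCODE);
+ *	(void) xdr.x_ops->xdr_u_int(&xdr, &v);
+ *
+ *	xdrmem_create(&xdr, buf, buflen, XDR_DECODE);
+ *	(void) xdr.x_ops->xdr_u_int(&xdr, &v);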
+ */ + +static struct xdr_ops xdrmem_encode_ops; +static struct xdr_ops xdrmem_decode_ops; + +void +xdrmem_create(XDR *xdrs, const caddr_t addr, const uint_t size, + const enum xdr_op op) +{ + switch (op) { + case XDR_ENCODE: + xdrs->x_ops = &xdrmem_encode_ops; + break; + case XDR_DECODE: + xdrs->x_ops = &xdrmem_decode_ops; + break; + default: + xdrs->x_ops = NULL; /* Let the caller know we failed */ + return; + } + + xdrs->x_op = op; + xdrs->x_addr = addr; + xdrs->x_addr_end = addr + size; + + if (xdrs->x_addr_end < xdrs->x_addr) { + xdrs->x_ops = NULL; + } +} +EXPORT_SYMBOL(xdrmem_create); + +static bool_t +xdrmem_control(XDR *xdrs, int req, void *info) +{ + struct xdr_bytesrec *rec = (struct xdr_bytesrec *)info; + + if (req != XDR_GET_BYTES_AVAIL) + return (FALSE); + + rec->xc_is_last_record = TRUE; /* always TRUE in xdrmem streams */ + rec->xc_num_avail = xdrs->x_addr_end - xdrs->x_addr; + + return (TRUE); +} + +static bool_t +xdrmem_enc_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(xdrs->x_addr, cp, cnt); + + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + memset(xdrs->x_addr, 0, pad); + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_dec_bytes(XDR *xdrs, caddr_t cp, const uint_t cnt) +{ + static uint32_t zero = 0; + uint_t size = roundup(cnt, 4); + uint_t pad; + + if (size < cnt) + return (FALSE); /* Integer overflow */ + + if (xdrs->x_addr > xdrs->x_addr_end) + return (FALSE); + + if (xdrs->x_addr_end - xdrs->x_addr < size) + return (FALSE); + + memcpy(cp, xdrs->x_addr, cnt); + xdrs->x_addr += cnt; + + pad = size - cnt; + if (pad > 0) { + /* An inverted memchr() would be useful here... */ + if (memcmp(&zero, xdrs->x_addr, pad) != 0) + return (FALSE); + + xdrs->x_addr += pad; + } + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint32(XDR *xdrs, uint32_t val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *((uint32_t *)xdrs->x_addr) = cpu_to_be32(val); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_dec_uint32(XDR *xdrs, uint32_t *val) +{ + if (xdrs->x_addr + sizeof (uint32_t) > xdrs->x_addr_end) + return (FALSE); + + *val = be32_to_cpu(*((uint32_t *)xdrs->x_addr)); + + xdrs->x_addr += sizeof (uint32_t); + + return (TRUE); +} + +static bool_t +xdrmem_enc_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + val = *((unsigned char *) cp); + + return (xdrmem_enc_uint32(xdrs, val)); +} + +static bool_t +xdrmem_dec_char(XDR *xdrs, char *cp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (char) != 1); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * If any of the 3 other bytes are non-zero then val will be greater + * than 0xff and we fail because according to the RFC, this block does + * not have a char encoded in it. 
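+ *
+ * For example, 'A' (0x41) is encoded as the 4-byte big-endian word
+ * 00 00 00 41; any non-zero bits in the top three bytes are rejected.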
+ */ + if (val > 0xff) + return (FALSE); + + *((unsigned char *) cp) = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_ushort(XDR *xdrs, unsigned short *usp) +{ + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + return (xdrmem_enc_uint32(xdrs, *usp)); +} + +static bool_t +xdrmem_dec_ushort(XDR *xdrs, unsigned short *usp) +{ + uint32_t val; + + BUILD_BUG_ON(sizeof (unsigned short) != 2); + + if (!xdrmem_dec_uint32(xdrs, &val)) + return (FALSE); + + /* + * Short ints are not in the RFC, but we assume similar logic as in + * xdrmem_dec_char(). + */ + if (val > 0xffff) + return (FALSE); + + *usp = val; + + return (TRUE); +} + +static bool_t +xdrmem_enc_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_enc_uint32(xdrs, *up)); +} + +static bool_t +xdrmem_dec_uint(XDR *xdrs, unsigned *up) +{ + BUILD_BUG_ON(sizeof (unsigned) != 4); + + return (xdrmem_dec_uint32(xdrs, (uint32_t *)up)); +} + +static bool_t +xdrmem_enc_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_enc_uint32(xdrs, *ullp >> 32)) + return (FALSE); + + return (xdrmem_enc_uint32(xdrs, *ullp & 0xffffffff)); +} + +static bool_t +xdrmem_dec_ulonglong(XDR *xdrs, u_longlong_t *ullp) +{ + uint32_t low, high; + + BUILD_BUG_ON(sizeof (u_longlong_t) != 8); + + if (!xdrmem_dec_uint32(xdrs, &high)) + return (FALSE); + if (!xdrmem_dec_uint32(xdrs, &low)) + return (FALSE); + + *ullp = ((u_longlong_t)high << 32) | low; + + return (TRUE); +} + +static bool_t +xdr_enc_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i; + caddr_t addr = *arrp; + + if (*sizep > maxsize || *sizep > UINT_MAX / elsize) + return (FALSE); + + if (!xdrmem_enc_uint(xdrs, sizep)) + return (FALSE); + + for (i = 0; i < *sizep; i++) { + if (!elproc(xdrs, addr)) + return (FALSE); + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_dec_array(XDR *xdrs, caddr_t *arrp, uint_t *sizep, const uint_t maxsize, + const uint_t elsize, const xdrproc_t elproc) +{ + uint_t i, size; + bool_t alloc = FALSE; + caddr_t addr; + + if (!xdrmem_dec_uint(xdrs, sizep)) + return (FALSE); + + size = *sizep; + + if (size > maxsize || size > UINT_MAX / elsize) + return (FALSE); + + /* + * The Solaris man page says: "If *arrp is NULL when decoding, + * xdr_array() allocates memory and *arrp points to it". + */ + if (*arrp == NULL) { + BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t)); + + *arrp = kmem_alloc(size * elsize, KM_NOSLEEP); + if (*arrp == NULL) + return (FALSE); + + alloc = TRUE; + } + + addr = *arrp; + + for (i = 0; i < size; i++) { + if (!elproc(xdrs, addr)) { + if (alloc) + kmem_free(*arrp, size * elsize); + return (FALSE); + } + addr += elsize; + } + + return (TRUE); +} + +static bool_t +xdr_enc_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + size_t slen = strlen(*sp); + uint_t len; + + if (slen > maxsize) + return (FALSE); + + len = slen; + + if (!xdrmem_enc_uint(xdrs, &len)) + return (FALSE); + + return (xdrmem_enc_bytes(xdrs, *sp, len)); +} + +static bool_t +xdr_dec_string(XDR *xdrs, char **sp, const uint_t maxsize) +{ + uint_t size; + bool_t alloc = FALSE; + + if (!xdrmem_dec_uint(xdrs, &size)) + return (FALSE); + + if (size > maxsize || size > UINT_MAX - 1) + return (FALSE); + + /* + * Solaris man page: "If *sp is NULL when decoding, xdr_string() + * allocates memory and *sp points to it". 
+ */
+	if (*sp == NULL) {
+		BUILD_BUG_ON(sizeof (uint_t) > sizeof (size_t));
+
+		*sp = kmem_alloc(size + 1, KM_NOSLEEP);
+		if (*sp == NULL)
+			return (FALSE);
+
+		alloc = TRUE;
+	}
+
+	if (!xdrmem_dec_bytes(xdrs, *sp, size))
+		goto fail;
+
+	if (memchr(*sp, 0, size) != NULL)
+		goto fail;
+
+	(*sp)[size] = '\0';
+
+	return (TRUE);
+
+fail:
+	if (alloc)
+		kmem_free(*sp, size + 1);
+
+	return (FALSE);
+}
+
+static struct xdr_ops xdrmem_encode_ops = {
+	.xdr_control = xdrmem_control,
+	.xdr_char = xdrmem_enc_char,
+	.xdr_u_short = xdrmem_enc_ushort,
+	.xdr_u_int = xdrmem_enc_uint,
+	.xdr_u_longlong_t = xdrmem_enc_ulonglong,
+	.xdr_opaque = xdrmem_enc_bytes,
+	.xdr_string = xdr_enc_string,
+	.xdr_array = xdr_enc_array
+};
+
+static struct xdr_ops xdrmem_decode_ops = {
+	.xdr_control = xdrmem_control,
+	.xdr_char = xdrmem_dec_char,
+	.xdr_u_short = xdrmem_dec_ushort,
+	.xdr_u_int = xdrmem_dec_uint,
+	.xdr_u_longlong_t = xdrmem_dec_ulonglong,
+	.xdr_opaque = xdrmem_dec_bytes,
+	.xdr_string = xdr_dec_string,
+	.xdr_array = xdr_dec_array
+};
diff --git a/module/os/linux/spl/spl-zlib.c b/module/os/linux/spl/spl-zlib.c
new file mode 100644
index 000000000000..db05e28925b5
--- /dev/null
+++ b/module/os/linux/spl/spl-zlib.c
@@ -0,0 +1,218 @@
+/*
+ * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC.
+ * Copyright (C) 2007 The Regents of the University of California.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Written by Brian Behlendorf .
+ * UCRL-CODE-235197
+ *
+ * This file is part of the SPL, Solaris Porting Layer.
+ * For details, see .
+ *
+ * The SPL is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the
+ * Free Software Foundation; either version 2 of the License, or (at your
+ * option) any later version.
+ *
+ * The SPL is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with the SPL. If not, see .
+ *
+ *
+ * z_compress_level/z_uncompress are nearly identical copies of the
+ * compress2/uncompress functions provided by the official zlib package
+ * available at http://zlib.net/. The only changes made were to slightly
+ * adapt the functions called to match the linux kernel implementation
+ * of zlib. The full zlib license follows:
+ *
+ * zlib.h -- interface of the 'zlib' general purpose compression library
+ * version 1.2.5, April 19th, 2010
+ *
+ * Copyright (C) 1995-2010 Jean-loup Gailly and Mark Adler
+ *
+ * This software is provided 'as-is', without any express or implied
+ * warranty. In no event will the authors be held liable for any damages
+ * arising from the use of this software.
+ *
+ * Permission is granted to anyone to use this software for any purpose,
+ * including commercial applications, and to alter it and redistribute it
+ * freely, subject to the following restrictions:
+ *
+ * 1. The origin of this software must not be misrepresented; you must not
+ * claim that you wrote the original software. If you use this software
+ * in a product, an acknowledgment in the product documentation would be
+ * appreciated but is not required.
+ * 2. Altered source versions must be plainly marked as such, and must not be
+ * misrepresented as being the original software.
+ * 3. This notice may not be removed or altered from any source distribution.
+ *
+ * Jean-loup Gailly
+ * Mark Adler
+ */
+
+
+#include
+#include
+#include
+#include
+
+static spl_kmem_cache_t *zlib_workspace_cache;
+
+/*
+ * A kmem_cache is used for the zlib workspaces to avoid having to vmalloc
+ * and vfree for every call. Using a kmem_cache also has the advantage
+ * that it improves the odds that the memory used will be local to this cpu.
+ * To further improve things it might be wise to create a dedicated per-cpu
+ * workspace for use. This would take some additional care because we then
+ * must disable preemption around the critical section, and verify that
+ * zlib_deflate* and zlib_inflate* never internally call schedule().
+ */
+static void *
+zlib_workspace_alloc(int flags)
+{
+	return (kmem_cache_alloc(zlib_workspace_cache, flags & ~(__GFP_FS)));
+}
+
+static void
+zlib_workspace_free(void *workspace)
+{
+	kmem_cache_free(zlib_workspace_cache, workspace);
+}
+
+/*
+ * Compresses the source buffer into the destination buffer. The level
+ * parameter has the same meaning as in deflateInit. sourceLen is the byte
+ * length of the source buffer. Upon entry, destLen is the total size of the
+ * destination buffer, which must be at least 0.1% larger than sourceLen plus
+ * 12 bytes. Upon exit, destLen is the actual size of the compressed buffer.
+ *
+ * compress2 returns Z_OK if success, Z_MEM_ERROR if there was not enough
+ * memory, Z_BUF_ERROR if there was not enough room in the output buffer,
+ * Z_STREAM_ERROR if the level parameter is invalid.
+ */
+int
+z_compress_level(void *dest, size_t *destLen, const void *source,
+    size_t sourceLen, int level)
+{
+	z_stream stream;
+	int err;
+
+	stream.next_in = (Byte *)source;
+	stream.avail_in = (uInt)sourceLen;
+	stream.next_out = dest;
+	stream.avail_out = (uInt)*destLen;
+
+	if ((size_t)stream.avail_out != *destLen)
+		return (Z_BUF_ERROR);
+
+	stream.workspace = zlib_workspace_alloc(KM_SLEEP);
+	if (!stream.workspace)
+		return (Z_MEM_ERROR);
+
+	err = zlib_deflateInit(&stream, level);
+	if (err != Z_OK) {
+		zlib_workspace_free(stream.workspace);
+		return (err);
+	}
+
+	err = zlib_deflate(&stream, Z_FINISH);
+	if (err != Z_STREAM_END) {
+		zlib_deflateEnd(&stream);
+		zlib_workspace_free(stream.workspace);
+		return (err == Z_OK ? Z_BUF_ERROR : err);
+	}
+	*destLen = stream.total_out;
+
+	err = zlib_deflateEnd(&stream);
+	zlib_workspace_free(stream.workspace);
+
+	return (err);
+}
+EXPORT_SYMBOL(z_compress_level);
+
+/*
+ * Decompresses the source buffer into the destination buffer. sourceLen is
+ * the byte length of the source buffer. Upon entry, destLen is the total
+ * size of the destination buffer, which must be large enough to hold the
+ * entire uncompressed data. (The size of the uncompressed data must have
+ * been saved previously by the compressor and transmitted to the decompressor
+ * by some mechanism outside the scope of this compression library.)
+ * Upon exit, destLen is the actual size of the uncompressed buffer.
+ * This function can be used to decompress a whole file at once if the
+ * input file is mmap'ed.
+ *
+ * uncompress returns Z_OK if success, Z_MEM_ERROR if there was not
+ * enough memory, Z_BUF_ERROR if there was not enough room in the output
+ * buffer, or Z_DATA_ERROR if the input data was corrupted.
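+ *
+ * An illustrative round trip through both functions (buffer sizing
+ * and error handling omitted):
+ *
+ *	size_t clen = sizeof (cbuf);
+ *	size_t ulen = sizeof (ubuf);
+ *
+ *	err = z_compress_level(cbuf, &clen, src, srclen,
+ *	    Z_DEFAULT_COMPRESSION);
+ *	err = z_uncompress(ubuf, &ulen, cbuf, clen);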
+ */ +int +z_uncompress(void *dest, size_t *destLen, const void *source, size_t sourceLen) +{ + z_stream stream; + int err; + + stream.next_in = (Byte *)source; + stream.avail_in = (uInt)sourceLen; + stream.next_out = dest; + stream.avail_out = (uInt)*destLen; + + if ((size_t)stream.avail_out != *destLen) + return (Z_BUF_ERROR); + + stream.workspace = zlib_workspace_alloc(KM_SLEEP); + if (!stream.workspace) + return (Z_MEM_ERROR); + + err = zlib_inflateInit(&stream); + if (err != Z_OK) { + zlib_workspace_free(stream.workspace); + return (err); + } + + err = zlib_inflate(&stream, Z_FINISH); + if (err != Z_STREAM_END) { + zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + if (err == Z_NEED_DICT || + (err == Z_BUF_ERROR && stream.avail_in == 0)) + return (Z_DATA_ERROR); + + return (err); + } + *destLen = stream.total_out; + + err = zlib_inflateEnd(&stream); + zlib_workspace_free(stream.workspace); + + return (err); +} +EXPORT_SYMBOL(z_uncompress); + +int +spl_zlib_init(void) +{ + int size; + + size = MAX(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + + zlib_workspace_cache = kmem_cache_create( + "spl_zlib_workspace_cache", + size, 0, NULL, NULL, NULL, NULL, NULL, + KMC_KVMEM); + if (!zlib_workspace_cache) + return (1); + + return (0); +} + +void +spl_zlib_fini(void) +{ + kmem_cache_destroy(zlib_workspace_cache); + zlib_workspace_cache = NULL; +} diff --git a/module/os/linux/zfs/Makefile.in b/module/os/linux/zfs/Makefile.in new file mode 100644 index 000000000000..9f493ef16de5 --- /dev/null +++ b/module/os/linux/zfs/Makefile.in @@ -0,0 +1,37 @@ +# +# Linux specific sources included from module/zfs/Makefile.in +# + +# Suppress unused-value warnings in sparc64 architecture headers +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value + +$(MODULE)-objs += ../os/linux/zfs/abd_os.o +$(MODULE)-objs += ../os/linux/zfs/arc_os.o +$(MODULE)-objs += ../os/linux/zfs/mmp_os.o +$(MODULE)-objs += ../os/linux/zfs/policy.o +$(MODULE)-objs += ../os/linux/zfs/trace.o +$(MODULE)-objs += ../os/linux/zfs/qat.o +$(MODULE)-objs += ../os/linux/zfs/qat_compress.o +$(MODULE)-objs += ../os/linux/zfs/qat_crypt.o +$(MODULE)-objs += ../os/linux/zfs/spa_misc_os.o +$(MODULE)-objs += ../os/linux/zfs/spa_stats.o +$(MODULE)-objs += ../os/linux/zfs/vdev_disk.o +$(MODULE)-objs += ../os/linux/zfs/vdev_file.o +$(MODULE)-objs += ../os/linux/zfs/zfs_acl.o +$(MODULE)-objs += ../os/linux/zfs/zfs_ctldir.o +$(MODULE)-objs += ../os/linux/zfs/zfs_debug.o +$(MODULE)-objs += ../os/linux/zfs/zfs_dir.o +$(MODULE)-objs += ../os/linux/zfs/zfs_file_os.o +$(MODULE)-objs += ../os/linux/zfs/zfs_ioctl_os.o +$(MODULE)-objs += ../os/linux/zfs/zfs_sysfs.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vfsops.o +$(MODULE)-objs += ../os/linux/zfs/zfs_vnops.o +$(MODULE)-objs += ../os/linux/zfs/zfs_znode.o +$(MODULE)-objs += ../os/linux/zfs/zio_crypt.o +$(MODULE)-objs += ../os/linux/zfs/zpl_ctldir.o +$(MODULE)-objs += ../os/linux/zfs/zpl_export.o +$(MODULE)-objs += ../os/linux/zfs/zpl_file.o +$(MODULE)-objs += ../os/linux/zfs/zpl_inode.o +$(MODULE)-objs += ../os/linux/zfs/zpl_super.o +$(MODULE)-objs += ../os/linux/zfs/zpl_xattr.o +$(MODULE)-objs += ../os/linux/zfs/zvol_os.o diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c new file mode 100644 index 000000000000..c2281449ed12 --- /dev/null +++ b/module/os/linux/zfs/abd_os.c @@ -0,0 +1,1074 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the 
"License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +/* + * See abd.c for a general overview of the arc buffered data (ABD). + * + * Linear buffers act exactly like normal buffers and are always mapped into the + * kernel's virtual memory space, while scattered ABD data chunks are allocated + * as physical pages and then mapped in only while they are actually being + * accessed through one of the abd_* library functions. Using scattered ABDs + * provides several benefits: + * + * (1) They avoid use of kmem_*, preventing performance problems where running + * kmem_reap on very large memory systems never finishes and causes + * constant TLB shootdowns. + * + * (2) Fragmentation is less of an issue since when we are at the limit of + * allocatable space, we won't have to search around for a long free + * hole in the VA space for large ARC allocations. Each chunk is mapped in + * individually, so even if we are using HIGHMEM (see next point) we + * wouldn't need to worry about finding a contiguous address range. + * + * (3) If we are not using HIGHMEM, then all physical memory is always + * mapped into the kernel's address space, so we also avoid the map / + * unmap costs on each ABD access. + * + * If we are not using HIGHMEM, scattered buffers which have only one chunk + * can be treated as linear buffers, because they are contiguous in the + * kernel's virtual address space. See abd_alloc_chunks() for details. + */ + +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#else +#define MAX_ORDER 1 +#endif + +typedef struct abd_stats { + kstat_named_t abdstat_struct_size; + kstat_named_t abdstat_linear_cnt; + kstat_named_t abdstat_linear_data_size; + kstat_named_t abdstat_scatter_cnt; + kstat_named_t abdstat_scatter_data_size; + kstat_named_t abdstat_scatter_chunk_waste; + kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_page_multi_chunk; + kstat_named_t abdstat_scatter_page_multi_zone; + kstat_named_t abdstat_scatter_page_alloc_retry; + kstat_named_t abdstat_scatter_sg_table_retry; +} abd_stats_t; + +static abd_stats_t abd_stats = { + /* Amount of memory occupied by all of the abd_t struct allocations */ + { "struct_size", KSTAT_DATA_UINT64 }, + /* + * The number of linear ABDs which are currently allocated, excluding + * ABDs which don't own their data (for instance the ones which were + * allocated through abd_get_offset() and abd_get_from_buf()). If an + * ABD takes ownership of its buf then it will become tracked. 
+	 */
+	{ "linear_cnt", KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all linear ABDs tracked by linear_cnt */
+	{ "linear_data_size", KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are currently allocated, excluding
+	 * ABDs which don't own their data (for instance the ones which were
+	 * allocated through abd_get_offset()).
+	 */
+	{ "scatter_cnt", KSTAT_DATA_UINT64 },
+	/* Amount of data stored in all scatter ABDs tracked by scatter_cnt */
+	{ "scatter_data_size", KSTAT_DATA_UINT64 },
+	/*
+	 * The amount of space wasted at the end of the last chunk across all
+	 * scatter ABDs tracked by scatter_cnt.
+	 */
+	{ "scatter_chunk_waste", KSTAT_DATA_UINT64 },
+	/*
+	 * The number of compound allocations of a given order. These
+	 * allocations are spread over all currently allocated ABDs, and
+	 * act as a measure of memory fragmentation.
+	 */
+	{ { "scatter_order_N", KSTAT_DATA_UINT64 } },
+	/*
+	 * The number of scatter ABDs which contain multiple chunks.
+	 * ABDs are preferentially allocated from the minimum number of
+	 * contiguous multi-page chunks; a single chunk is optimal.
+	 */
+	{ "scatter_page_multi_chunk", KSTAT_DATA_UINT64 },
+	/*
+	 * The number of scatter ABDs which are split across memory zones.
+	 * ABDs are preferentially allocated using pages from a single zone.
+	 */
+	{ "scatter_page_multi_zone", KSTAT_DATA_UINT64 },
+	/*
+	 * The total number of retries encountered when attempting to
+	 * allocate the pages to populate the scatter ABD.
+	 */
+	{ "scatter_page_alloc_retry", KSTAT_DATA_UINT64 },
+	/*
+	 * The total number of retries encountered when attempting to
+	 * allocate the sg table for an ABD.
+	 */
+	{ "scatter_sg_table_retry", KSTAT_DATA_UINT64 },
+};
+
+#define	abd_for_each_sg(abd, sg, n, i)	\
+	for_each_sg(ABD_SCATTER(abd).abd_sgl, sg, n, i)
+
+unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1;
+
+/*
+ * zfs_abd_scatter_min_size is the minimum allocation size to use scatter
+ * ABD's. Smaller allocations will use linear ABD's, which use
+ * zio_[data_]buf_alloc().
+ *
+ * Scatter ABD's use at least one page each, so sub-page allocations waste
+ * some space when allocated as scatter (e.g. 2KB scatter allocation wastes
+ * half of each page). Using linear ABD's for small allocations means that
+ * they will be put on slabs which contain many allocations. This can
+ * improve memory efficiency, but it also makes it much harder for ARC
+ * evictions to actually free pages, because all the buffers on one slab need
+ * to be freed in order for the slab (and underlying pages) to be freed.
+ * Typically, 512B and 1KB kmem caches have 16 buffers per slab, so it's
+ * possible for them to actually waste more memory than scatter (one page per
+ * buf = wasting 3/4 or 7/8th; one buf per slab = wasting 15/16th).
+ *
+ * Spill blocks are typically 512B and are heavily used on systems running
+ * selinux with the default dnode size and the `xattr=sa` property set.
+ *
+ * By default we use linear allocations for 512B and 1KB, and scatter
+ * allocations for larger (1.5KB and up).
+ */
+int zfs_abd_scatter_min_size = 512 * 3;
+
+/*
+ * We use a scattered SPA_MAXBLOCKSIZE-sized ABD whose pages are just a
+ * single zero'd page. This allows us to conserve memory by only using a
+ * single zero page for the scatterlist.
+ */
+abd_t *abd_zero_scatter = NULL;
+
+struct page;
+/*
+ * abd_zero_page is an allocated zero'd PAGESIZE buffer, which is
+ * assigned to set each of the pages of abd_zero_scatter.
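+ * With 4K pages that means the SPA_MAXBLOCKSIZE (16M) zero ABD carries
+ * 4096 scatterlist entries, every one pointing at this single page.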
+ */ +static struct page *abd_zero_page = NULL; + +static kmem_cache_t *abd_cache = NULL; +static kstat_t *abd_ksp; + +static size_t +abd_chunkcnt_for_bytes(size_t size) +{ + return (P2ROUNDUP(size, PAGESIZE) / PAGESIZE); +} + +abd_t * +abd_alloc_struct(size_t size) +{ + /* + * In Linux we do not use the size passed in during ABD + * allocation, so we just ignore it. + */ + abd_t *abd = kmem_cache_alloc(abd_cache, KM_PUSHPAGE); + ASSERT3P(abd, !=, NULL); + list_link_init(&abd->abd_gang_link); + mutex_init(&abd->abd_mtx, NULL, MUTEX_DEFAULT, NULL); + ABDSTAT_INCR(abdstat_struct_size, sizeof (abd_t)); + + return (abd); +} + +void +abd_free_struct(abd_t *abd) +{ + mutex_destroy(&abd->abd_mtx); + ASSERT(!list_link_active(&abd->abd_gang_link)); + kmem_cache_free(abd_cache, abd); + ABDSTAT_INCR(abdstat_struct_size, -(int)sizeof (abd_t)); +} + +#ifdef _KERNEL +/* + * Mark zfs data pages so they can be excluded from kernel crash dumps + */ +#ifdef _LP64 +#define ABD_FILE_CACHE_PAGE 0x2F5ABDF11ECAC4E + +static inline void +abd_mark_zfs_page(struct page *page) +{ + get_page(page); + SetPagePrivate(page); + set_page_private(page, ABD_FILE_CACHE_PAGE); +} + +static inline void +abd_unmark_zfs_page(struct page *page) +{ + set_page_private(page, 0UL); + ClearPagePrivate(page); + put_page(page); +} +#else +#define abd_mark_zfs_page(page) +#define abd_unmark_zfs_page(page) +#endif /* _LP64 */ + +#ifndef CONFIG_HIGHMEM + +#ifndef __GFP_RECLAIM +#define __GFP_RECLAIM __GFP_WAIT +#endif + +/* + * The goal is to minimize fragmentation by preferentially populating ABDs + * with higher order compound pages from a single zone. Allocation size is + * progressively decreased until it can be satisfied without performing + * reclaim or compaction. When necessary this function will degenerate to + * allocating individual pages and allowing reclaim to satisfy allocations. + */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct list_head pages; + struct sg_table table; + struct scatterlist *sg; + struct page *page, *tmp_page = NULL; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; + int max_order = MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + int nr_pages = abd_chunkcnt_for_bytes(size); + int chunks = 0, zones = 0; + size_t remaining_size; + int nid = NUMA_NO_NODE; + int alloc_pages = 0; + + INIT_LIST_HEAD(&pages); + + while (alloc_pages < nr_pages) { + unsigned chunk_pages; + int order; + + order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order); + chunk_pages = (1U << order); + + page = alloc_pages_node(nid, order ? 
gfp_comp : gfp, order); + if (page == NULL) { + if (order == 0) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } else { + max_order = MAX(0, order - 1); + } + continue; + } + + list_add_tail(&page->lru, &pages); + + if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid)) + zones++; + + nid = page_to_nid(page); + ABDSTAT_BUMP(abdstat_scatter_orders[order]); + chunks++; + alloc_pages += chunk_pages; + } + + ASSERT3S(alloc_pages, ==, nr_pages); + + while (sg_alloc_table(&table, chunks, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + sg = table.sgl; + remaining_size = size; + list_for_each_entry_safe(page, tmp_page, &pages, lru) { + size_t sg_size = MIN(PAGESIZE << compound_order(page), + remaining_size); + sg_set_page(sg, page, sg_size, 0); + abd_mark_zfs_page(page); + remaining_size -= sg_size; + + sg = sg_next(sg); + list_del(&page->lru); + } + + /* + * These conditions ensure that a possible transformation to a linear + * ABD would be valid. + */ + ASSERT(!PageHighMem(sg_page(table.sgl))); + ASSERT0(ABD_SCATTER(abd).abd_offset); + + if (table.nents == 1) { + /* + * Since there is only one entry, this ABD can be represented + * as a linear buffer. All single-page (4K) ABD's can be + * represented this way. Some multi-page ABD's can also be + * represented this way, if we were able to allocate a single + * "chunk" (higher-order "page" which represents a power-of-2 + * series of physically-contiguous pages). This is often the + * case for 2-page (8K) ABD's. + * + * Representing a single-entry scatter ABD as a linear ABD + * has the performance advantage of avoiding the copy (and + * allocation) in abd_borrow_buf_copy / abd_return_buf_copy. + * A performance increase of around 5% has been observed for + * ARC-cached reads (of small blocks which can take advantage + * of this). + * + * Note that this optimization is only possible because the + * pages are always mapped into the kernel's address space. + * This is not the case for highmem pages, so the + * optimization can not be made there. + */ + abd->abd_flags |= ABD_FLAG_LINEAR; + abd->abd_flags |= ABD_FLAG_LINEAR_PAGE; + abd->abd_u.abd_linear.abd_sgl = table.sgl; + ABD_LINEAR_BUF(abd) = page_address(sg_page(table.sgl)); + } else if (table.nents > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + + if (zones) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_zone); + abd->abd_flags |= ABD_FLAG_MULTI_ZONE; + } + + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = table.nents; + } +} +#else + +/* + * Allocate N individual pages to construct a scatter ABD. This function + * makes no attempt to request contiguous pages and requires the minimal + * number of kernel interfaces. It's designed for maximum compatibility. 
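+ *
+ * Every scatterlist entry here is a single order-0 page, so an ABD of
+ * size n always uses abd_chunkcnt_for_bytes(n) entries (e.g. with 4K
+ * pages a 128K ABD maps 32 discrete pages).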
+ */ +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + struct page *page; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + int nr_pages = abd_chunkcnt_for_bytes(size); + int i = 0; + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + + ASSERT3U(table.nents, ==, nr_pages); + ABD_SCATTER(abd).abd_sgl = table.sgl; + ABD_SCATTER(abd).abd_nents = nr_pages; + + abd_for_each_sg(abd, sg, nr_pages, i) { + while ((page = __page_cache_alloc(gfp)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + + ABDSTAT_BUMP(abdstat_scatter_orders[0]); + sg_set_page(sg, page, PAGESIZE, 0); + abd_mark_zfs_page(page); + } + + if (nr_pages > 1) { + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); + abd->abd_flags |= ABD_FLAG_MULTI_CHUNK; + } +} +#endif /* !CONFIG_HIGHMEM */ + +/* + * This must be called if any of the sg_table allocation functions + * are called. + */ +static void +abd_free_sg_table(abd_t *abd) +{ + struct sg_table table; + + table.sgl = ABD_SCATTER(abd).abd_sgl; + table.nents = table.orig_nents = ABD_SCATTER(abd).abd_nents; + sg_free_table(&table); +} + +void +abd_free_chunks(abd_t *abd) +{ + struct scatterlist *sg = NULL; + struct page *page; + int nr_pages = ABD_SCATTER(abd).abd_nents; + int order, i = 0; + + if (abd->abd_flags & ABD_FLAG_MULTI_ZONE) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_zone); + + if (abd->abd_flags & ABD_FLAG_MULTI_CHUNK) + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_for_each_sg(abd, sg, nr_pages, i) { + page = sg_page(sg); + abd_unmark_zfs_page(page); + order = compound_order(page); + __free_pages(page, order); + ASSERT3U(sg->length, <=, PAGE_SIZE << order); + ABDSTAT_BUMPDOWN(abdstat_scatter_orders[order]); + } + abd_free_sg_table(abd); +} + +/* + * Allocate scatter ABD of size SPA_MAXBLOCKSIZE, where each page in + * the scatterlist will be set to the zero'd out buffer abd_zero_page. 
+ */ +static void +abd_alloc_zero_scatter(void) +{ + struct scatterlist *sg = NULL; + struct sg_table table; + gfp_t gfp = __GFP_NOWARN | GFP_NOIO; + gfp_t gfp_zero_page = gfp | __GFP_ZERO; + int nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + int i = 0; + + while ((abd_zero_page = __page_cache_alloc(gfp_zero_page)) == NULL) { + ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry); + schedule_timeout_interruptible(1); + } + abd_mark_zfs_page(abd_zero_page); + + while (sg_alloc_table(&table, nr_pages, gfp)) { + ABDSTAT_BUMP(abdstat_scatter_sg_table_retry); + schedule_timeout_interruptible(1); + } + ASSERT3U(table.nents, ==, nr_pages); + + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_sgl = table.sgl; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; + zfs_refcount_create(&abd_zero_scatter->abd_children); + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, abd_zero_page, PAGESIZE, 0); + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); +} + +#else /* _KERNEL */ + +#ifndef PAGE_SHIFT +#define PAGE_SHIFT (highbit64(PAGESIZE)-1) +#endif + +#define zfs_kmap_atomic(chunk, km) ((void *)chunk) +#define zfs_kunmap_atomic(addr, km) do { (void)(addr); } while (0) +#define local_irq_save(flags) do { (void)(flags); } while (0) +#define local_irq_restore(flags) do { (void)(flags); } while (0) +#define nth_page(pg, i) \ + ((struct page *)((void *)(pg) + (i) * PAGESIZE)) + +struct scatterlist { + struct page *page; + int length; + int end; +}; + +static void +sg_init_table(struct scatterlist *sg, int nr) +{ + memset(sg, 0, nr * sizeof (struct scatterlist)); + sg[nr - 1].end = 1; +} + +/* + * This must be called if any of the sg_table allocation functions + * are called. 
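+ * (This is the userland build of this file; the scatterlist shim above
+ * emulates just enough of the Linux sg API for libzpool consumers such
+ * as ztest and zdb.)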
+ */ +static void +abd_free_sg_table(abd_t *abd) +{ + int nents = ABD_SCATTER(abd).abd_nents; + vmem_free(ABD_SCATTER(abd).abd_sgl, + nents * sizeof (struct scatterlist)); +} + +#define for_each_sg(sgl, sg, nr, i) \ + for ((i) = 0, (sg) = (sgl); (i) < (nr); (i)++, (sg) = sg_next(sg)) + +static inline void +sg_set_page(struct scatterlist *sg, struct page *page, unsigned int len, + unsigned int offset) +{ + /* currently we don't use offset */ + ASSERT(offset == 0); + sg->page = page; + sg->length = len; +} + +static inline struct page * +sg_page(struct scatterlist *sg) +{ + return (sg->page); +} + +static inline struct scatterlist * +sg_next(struct scatterlist *sg) +{ + if (sg->end) + return (NULL); + + return (sg + 1); +} + +void +abd_alloc_chunks(abd_t *abd, size_t size) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(size); + struct scatterlist *sg; + int i; + + ABD_SCATTER(abd).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages); + + abd_for_each_sg(abd, sg, nr_pages, i) { + struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + sg_set_page(sg, p, PAGESIZE, 0); + } + ABD_SCATTER(abd).abd_nents = nr_pages; +} + +void +abd_free_chunks(abd_t *abd) +{ + int i, n = ABD_SCATTER(abd).abd_nents; + struct scatterlist *sg; + + abd_for_each_sg(abd, sg, n, i) { + for (int j = 0; j < sg->length; j += PAGESIZE) { + struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT); + umem_free(p, PAGESIZE); + } + } + abd_free_sg_table(abd); +} + +static void +abd_alloc_zero_scatter(void) +{ + unsigned nr_pages = abd_chunkcnt_for_bytes(SPA_MAXBLOCKSIZE); + struct scatterlist *sg; + int i; + + abd_zero_page = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP); + memset(abd_zero_page, 0, PAGESIZE); + abd_zero_scatter = abd_alloc_struct(SPA_MAXBLOCKSIZE); + abd_zero_scatter->abd_flags = ABD_FLAG_OWNER; + abd_zero_scatter->abd_flags |= ABD_FLAG_MULTI_CHUNK | ABD_FLAG_ZEROS; + ABD_SCATTER(abd_zero_scatter).abd_offset = 0; + ABD_SCATTER(abd_zero_scatter).abd_nents = nr_pages; + abd_zero_scatter->abd_size = SPA_MAXBLOCKSIZE; + abd_zero_scatter->abd_parent = NULL; + zfs_refcount_create(&abd_zero_scatter->abd_children); + ABD_SCATTER(abd_zero_scatter).abd_sgl = vmem_alloc(nr_pages * + sizeof (struct scatterlist), KM_SLEEP); + + sg_init_table(ABD_SCATTER(abd_zero_scatter).abd_sgl, nr_pages); + + abd_for_each_sg(abd_zero_scatter, sg, nr_pages, i) { + sg_set_page(sg, abd_zero_page, PAGESIZE, 0); + } + + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, PAGESIZE); + ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk); +} + +#endif /* _KERNEL */ + +boolean_t +abd_size_alloc_linear(size_t size) +{ + return (size < zfs_abd_scatter_min_size ? 
B_TRUE : B_FALSE); +} + +void +abd_update_scatter_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + int waste = P2ROUNDUP(abd->abd_size, PAGESIZE) - abd->abd_size; + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, waste); + arc_space_consume(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } else { + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)abd->abd_size); + ABDSTAT_INCR(abdstat_scatter_chunk_waste, -waste); + arc_space_return(waste, ARC_SPACE_ABD_CHUNK_WASTE); + } +} + +void +abd_update_linear_stats(abd_t *abd, abd_stats_op_t op) +{ + ASSERT(op == ABDSTAT_INCR || op == ABDSTAT_DECR); + if (op == ABDSTAT_INCR) { + ABDSTAT_BUMP(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, abd->abd_size); + } else { + ABDSTAT_BUMPDOWN(abdstat_linear_cnt); + ABDSTAT_INCR(abdstat_linear_data_size, -(int)abd->abd_size); + } +} + +void +abd_verify_scatter(abd_t *abd) +{ + size_t n; + int i = 0; + struct scatterlist *sg = NULL; + + ASSERT3U(ABD_SCATTER(abd).abd_nents, >, 0); + ASSERT3U(ABD_SCATTER(abd).abd_offset, <, + ABD_SCATTER(abd).abd_sgl->length); + n = ABD_SCATTER(abd).abd_nents; + abd_for_each_sg(abd, sg, n, i) { + ASSERT3P(sg_page(sg), !=, NULL); + } +} + +static void +abd_free_zero_scatter(void) +{ + zfs_refcount_destroy(&abd_zero_scatter->abd_children); + ABDSTAT_BUMPDOWN(abdstat_scatter_cnt); + ABDSTAT_INCR(abdstat_scatter_data_size, -(int)PAGESIZE); + ABDSTAT_BUMPDOWN(abdstat_scatter_page_multi_chunk); + + abd_free_sg_table(abd_zero_scatter); + abd_free_struct(abd_zero_scatter); + abd_zero_scatter = NULL; + ASSERT3P(abd_zero_page, !=, NULL); +#if defined(_KERNEL) + abd_unmark_zfs_page(abd_zero_page); + __free_page(abd_zero_page); +#else + umem_free(abd_zero_page, PAGESIZE); +#endif /* _KERNEL */ +} + +void +abd_init(void) +{ + int i; + + abd_cache = kmem_cache_create("abd_t", sizeof (abd_t), + 0, NULL, NULL, NULL, NULL, NULL, 0); + + abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, + sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); + if (abd_ksp != NULL) { + for (i = 0; i < MAX_ORDER; i++) { + snprintf(abd_stats.abdstat_scatter_orders[i].name, + KSTAT_STRLEN, "scatter_order_%d", i); + abd_stats.abdstat_scatter_orders[i].data_type = + KSTAT_DATA_UINT64; + } + abd_ksp->ks_data = &abd_stats; + kstat_install(abd_ksp); + } + + abd_alloc_zero_scatter(); +} + +void +abd_fini(void) +{ + abd_free_zero_scatter(); + + if (abd_ksp != NULL) { + kstat_delete(abd_ksp); + abd_ksp = NULL; + } + + if (abd_cache) { + kmem_cache_destroy(abd_cache); + abd_cache = NULL; + } +} + +void +abd_free_linear_page(abd_t *abd) +{ + /* Transform it back into a scatter ABD for freeing */ + struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl; + abd->abd_flags &= ~ABD_FLAG_LINEAR; + abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE; + ABD_SCATTER(abd).abd_nents = 1; + ABD_SCATTER(abd).abd_offset = 0; + ABD_SCATTER(abd).abd_sgl = sg; + abd_free_chunks(abd); + + zfs_refcount_destroy(&abd->abd_children); + abd_update_scatter_stats(abd, ABDSTAT_DECR); + abd_free_struct(abd); +} + +/* + * If we're going to use this ABD for doing I/O using the block layer, the + * consumer of the ABD data doesn't care if it's scattered or not, and we don't + * plan to store this ABD in memory for a long period of time, we should + * allocate the ABD type that requires the least data copying to do the I/O. 
+ * + * On Linux the optimal thing to do would be to use abd_get_offset() and + * construct a new ABD which shares the original pages thereby eliminating + * the copy. But for the moment a new linear ABD is allocated until this + * performance optimization can be implemented. + */ +abd_t * +abd_alloc_for_io(size_t size, boolean_t is_metadata) +{ + return (abd_alloc(size, is_metadata)); +} + +abd_t * +abd_get_offset_scatter(abd_t *sabd, size_t off) +{ + abd_t *abd = NULL; + int i = 0; + struct scatterlist *sg = NULL; + + abd_verify(sabd); + ASSERT3U(off, <=, sabd->abd_size); + + size_t new_offset = ABD_SCATTER(sabd).abd_offset + off; + + abd = abd_alloc_struct(0); + + /* + * Even if this buf is filesystem metadata, we only track that + * if we own the underlying data buffer, which is not true in + * this case. Therefore, we don't ever use ABD_FLAG_META here. + */ + abd->abd_flags = 0; + + abd_for_each_sg(sabd, sg, ABD_SCATTER(sabd).abd_nents, i) { + if (new_offset < sg->length) + break; + new_offset -= sg->length; + } + + ABD_SCATTER(abd).abd_sgl = sg; + ABD_SCATTER(abd).abd_offset = new_offset; + ABD_SCATTER(abd).abd_nents = ABD_SCATTER(sabd).abd_nents - i; + + return (abd); +} + +/* + * Initialize the abd_iter. + */ +void +abd_iter_init(struct abd_iter *aiter, abd_t *abd) +{ + ASSERT(!abd_is_gang(abd)); + abd_verify(abd); + aiter->iter_abd = abd; + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; + aiter->iter_pos = 0; + if (abd_is_linear(abd)) { + aiter->iter_offset = 0; + aiter->iter_sg = NULL; + } else { + aiter->iter_offset = ABD_SCATTER(abd).abd_offset; + aiter->iter_sg = ABD_SCATTER(abd).abd_sgl; + } +} + +/* + * This is just a helper function to see if we have exhausted the + * abd_iter and reached the end. + */ +boolean_t +abd_iter_at_end(struct abd_iter *aiter) +{ + return (aiter->iter_pos == aiter->iter_abd->abd_size); +} + +/* + * Advance the iterator by a certain amount. Cannot be called when a chunk is + * in use. This can be safely called when the aiter has already exhausted, in + * which case this does nothing. + */ +void +abd_iter_advance(struct abd_iter *aiter, size_t amount) +{ + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to advance to, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + aiter->iter_pos += amount; + aiter->iter_offset += amount; + if (!abd_is_linear(aiter->iter_abd)) { + while (aiter->iter_offset >= aiter->iter_sg->length) { + aiter->iter_offset -= aiter->iter_sg->length; + aiter->iter_sg = sg_next(aiter->iter_sg); + if (aiter->iter_sg == NULL) { + ASSERT0(aiter->iter_offset); + break; + } + } + } +} + +/* + * Map the current chunk into aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. 
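+ * Scatter chunks are mapped with zfs_kmap_atomic(), so the caller must
+ * not sleep until the chunk is released again with abd_iter_unmap().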
+ */ +void +abd_iter_map(struct abd_iter *aiter) +{ + void *paddr; + size_t offset = 0; + + ASSERT3P(aiter->iter_mapaddr, ==, NULL); + ASSERT0(aiter->iter_mapsize); + + /* There's nothing left to iterate over, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (abd_is_linear(aiter->iter_abd)) { + ASSERT3U(aiter->iter_pos, ==, aiter->iter_offset); + offset = aiter->iter_offset; + aiter->iter_mapsize = aiter->iter_abd->abd_size - offset; + paddr = ABD_LINEAR_BUF(aiter->iter_abd); + } else { + offset = aiter->iter_offset; + aiter->iter_mapsize = MIN(aiter->iter_sg->length - offset, + aiter->iter_abd->abd_size - aiter->iter_pos); + + paddr = zfs_kmap_atomic(sg_page(aiter->iter_sg), + km_table[aiter->iter_km]); + } + + aiter->iter_mapaddr = (char *)paddr + offset; +} + +/* + * Unmap the current chunk from aiter. This can be safely called when the aiter + * has already exhausted, in which case this does nothing. + */ +void +abd_iter_unmap(struct abd_iter *aiter) +{ + /* There's nothing left to unmap, so do nothing */ + if (abd_iter_at_end(aiter)) + return; + + if (!abd_is_linear(aiter->iter_abd)) { + /* LINTED E_FUNC_SET_NOT_USED */ + zfs_kunmap_atomic(aiter->iter_mapaddr - aiter->iter_offset, + km_table[aiter->iter_km]); + } + + ASSERT3P(aiter->iter_mapaddr, !=, NULL); + ASSERT3U(aiter->iter_mapsize, >, 0); + + aiter->iter_mapaddr = NULL; + aiter->iter_mapsize = 0; +} + +void +abd_cache_reap_now(void) +{ +} + +#if defined(_KERNEL) +/* + * bio_nr_pages for ABD. + * @off is the offset in @abd + */ +unsigned long +abd_nr_pages_off(abd_t *abd, unsigned int size, size_t off) +{ + unsigned long pos; + + while (abd_is_gang(abd)) + abd = abd_gang_get_offset(abd, &off); + + ASSERT(!abd_is_gang(abd)); + if (abd_is_linear(abd)) + pos = (unsigned long)abd_to_buf(abd) + off; + else + pos = ABD_SCATTER(abd).abd_offset + off; + + return ((pos + size + PAGESIZE - 1) >> PAGE_SHIFT) - + (pos >> PAGE_SHIFT); +} + +static unsigned int +bio_map(struct bio *bio, void *buf_ptr, unsigned int bio_size) +{ + unsigned int offset, size, i; + struct page *page; + + offset = offset_in_page(buf_ptr); + for (i = 0; i < bio->bi_max_vecs; i++) { + size = PAGE_SIZE - offset; + + if (bio_size <= 0) + break; + + if (size > bio_size) + size = bio_size; + + if (is_vmalloc_addr(buf_ptr)) + page = vmalloc_to_page(buf_ptr); + else + page = virt_to_page(buf_ptr); + + /* + * Some network related block device uses tcp_sendpage, which + * doesn't behave well when using 0-count page, this is a + * safety net to catch them. + */ + ASSERT3S(page_count(page), >, 0); + + if (bio_add_page(bio, page, size, offset) != size) + break; + + buf_ptr += size; + bio_size -= size; + offset = 0; + } + + return (bio_size); +} + +/* + * bio_map for gang ABD. + */ +static unsigned int +abd_gang_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + ASSERT(abd_is_gang(abd)); + + for (abd_t *cabd = abd_gang_get_offset(abd, &off); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT3U(off, <, cabd->abd_size); + int size = MIN(io_size, cabd->abd_size - off); + int remainder = abd_bio_map_off(bio, cabd, size, off); + io_size -= (size - remainder); + if (io_size == 0 || remainder > 0) + return (io_size); + off = 0; + } + ASSERT0(io_size); + return (io_size); +} + +/* + * bio_map for ABD. 
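+ * Attaches the pages backing @abd to @bio in place, without copying;
+ * gang ABDs are walked child by child via abd_gang_bio_map_off().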
+ * @off is the offset in @abd + * Remaining IO size is returned + */ +unsigned int +abd_bio_map_off(struct bio *bio, abd_t *abd, + unsigned int io_size, size_t off) +{ + int i; + struct abd_iter aiter; + + ASSERT3U(io_size, <=, abd->abd_size - off); + if (abd_is_linear(abd)) + return (bio_map(bio, ((char *)abd_to_buf(abd)) + off, io_size)); + + ASSERT(!abd_is_linear(abd)); + if (abd_is_gang(abd)) + return (abd_gang_bio_map_off(bio, abd, io_size, off)); + + abd_iter_init(&aiter, abd); + abd_iter_advance(&aiter, off); + + for (i = 0; i < bio->bi_max_vecs; i++) { + struct page *pg; + size_t len, sgoff, pgoff; + struct scatterlist *sg; + + if (io_size <= 0) + break; + + sg = aiter.iter_sg; + sgoff = aiter.iter_offset; + pgoff = sgoff & (PAGESIZE - 1); + len = MIN(io_size, PAGESIZE - pgoff); + ASSERT(len > 0); + + pg = nth_page(sg_page(sg), sgoff >> PAGE_SHIFT); + if (bio_add_page(bio, pg, len, pgoff) != len) + break; + + io_size -= len; + abd_iter_advance(&aiter, len); + } + + return (io_size); +} + +/* Tunable Parameters */ +module_param(zfs_abd_scatter_enabled, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_enabled, + "Toggle whether ABD allocations must be linear."); +module_param(zfs_abd_scatter_min_size, int, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_min_size, + "Minimum size of scatter allocations."); +/* CSTYLED */ +module_param(zfs_abd_scatter_max_order, uint, 0644); +MODULE_PARM_DESC(zfs_abd_scatter_max_order, + "Maximum order allocation used for a scatter ABD."); +#endif diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c new file mode 100644 index 000000000000..92f9bae8ccd3 --- /dev/null +++ b/module/os/linux/zfs/arc_os.c @@ -0,0 +1,465 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2018, Joyent, Inc. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2014 by Saso Kiselkov. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include + +/* + * This is a limit on how many pages the ARC shrinker makes available for + * eviction in response to one page allocation attempt. Note that in + * practice, the kernel's shrinker can ask us to evict up to about 4x this + * for one allocation attempt. 
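+ * (With 4K pages the default limit of 10,000 works out to ~40MB per
+ * shrinker call, which is where the ~160MB worst case quoted below
+ * comes from.)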
+ * + * The default limit of 10,000 (in practice, 160MB per allocation attempt + * with 4K pages) limits the amount of time spent attempting to reclaim ARC + * memory to less than 100ms per allocation attempt, even with a small + * average compressed block size of ~8KB. + * + * See also the comment in arc_shrinker_count(). + * Set to 0 to disable limit. + */ +int zfs_arc_shrinker_limit = 10000; + + +/* + * Return a default max arc size based on the amount of physical memory. + */ +uint64_t +arc_default_max(uint64_t min, uint64_t allmem) +{ + /* Default to 1/2 of all memory. */ + return (MAX(allmem / 2, min)); +} + +#ifdef _KERNEL +/* + * Return maximum amount of memory that we could possibly use. Reduced + * to half of all memory in user space which is primarily used for testing. + */ +uint64_t +arc_all_memory(void) +{ +#ifdef CONFIG_HIGHMEM + return (ptob(zfs_totalram_pages - zfs_totalhigh_pages)); +#else + return (ptob(zfs_totalram_pages)); +#endif /* CONFIG_HIGHMEM */ +} + +/* + * Return the amount of memory that is considered free. In user space + * which is primarily used for testing we pretend that free memory ranges + * from 0-20% of all memory. + */ +uint64_t +arc_free_memory(void) +{ +#ifdef CONFIG_HIGHMEM + struct sysinfo si; + si_meminfo(&si); + return (ptob(si.freeram - si.freehigh)); +#else + return (ptob(nr_free_pages() + + nr_inactive_file_pages() + + nr_slab_reclaimable_pages())); +#endif /* CONFIG_HIGHMEM */ +} + +/* + * Return the amount of memory that can be consumed before reclaim will be + * needed. Positive if there is sufficient free memory, negative indicates + * the amount of memory that needs to be freed up. + */ +int64_t +arc_available_memory(void) +{ + return (arc_free_memory() - arc_sys_free); +} + +static uint64_t +arc_evictable_memory(void) +{ + int64_t asize = aggsum_value(&arc_size); + uint64_t arc_clean = + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_mru->arcs_esize[ARC_BUFC_METADATA]) + + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_DATA]) + + zfs_refcount_count(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + uint64_t arc_dirty = MAX((int64_t)asize - (int64_t)arc_clean, 0); + + /* + * Scale reported evictable memory in proportion to page cache, cap + * at specified min/max. + */ + uint64_t min = (ptob(nr_file_pages()) / 100) * zfs_arc_pc_percent; + min = MAX(arc_c_min, MIN(arc_c_max, min)); + + if (arc_dirty >= min) + return (arc_clean); + + return (MAX((int64_t)asize - (int64_t)min, 0)); +} + +/* + * The _count() function returns the number of free-able objects. + * The _scan() function returns the number of objects that were freed. + */ +static unsigned long +arc_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) +{ + /* + * __GFP_FS won't be set if we are called from ZFS code (see + * kmem_flags_convert(), which removes it). To avoid a deadlock, we + * don't allow evicting in this case. We return 0 rather than + * SHRINK_STOP so that the shrinker logic doesn't accumulate a + * deficit against us. + */ + if (!(sc->gfp_mask & __GFP_FS)) { + return (0); + } + + /* + * This code is reached in the "direct reclaim" case, where the + * kernel (outside ZFS) is trying to allocate a page, and the system + * is low on memory. + * + * The kernel's shrinker code doesn't understand how many pages the + * ARC's callback actually frees, so it may ask the ARC to shrink a + * lot for one page allocation. 
This is problematic because it may + * take a long time, thus delaying the page allocation, and because + * it may force the ARC to unnecessarily shrink very small. + * + * Therefore, we limit the amount of data that we say is evictable, + * which limits the amount that the shrinker will ask us to evict for + * one page allocation attempt. + * + * In practice, we may be asked to shrink 4x the limit to satisfy one + * page allocation, before the kernel's shrinker code gives up on us. + * When that happens, we rely on the kernel code to find the pages + * that we freed before invoking the OOM killer. This happens in + * __alloc_pages_slowpath(), which retries and finds the pages we + * freed when it calls get_page_from_freelist(). + * + * See also the comment above zfs_arc_shrinker_limit. + */ + int64_t limit = zfs_arc_shrinker_limit != 0 ? + zfs_arc_shrinker_limit : INT64_MAX; + return (MIN(limit, btop((int64_t)arc_evictable_memory()))); +} + +static unsigned long +arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + ASSERT((sc->gfp_mask & __GFP_FS) != 0); + + /* The arc is considered warm once reclaim has occurred */ + if (unlikely(arc_warm == B_FALSE)) + arc_warm = B_TRUE; + + /* + * Evict the requested number of pages by reducing arc_c and waiting + * for the requested amount of data to be evicted. + */ + arc_reduce_target_size(ptob(sc->nr_to_scan)); + arc_wait_for_eviction(ptob(sc->nr_to_scan)); + if (current->reclaim_state != NULL) + current->reclaim_state->reclaimed_slab += sc->nr_to_scan; + + /* + * We are experiencing memory pressure which the arc_evict_zthr was + * unable to keep up with. Set arc_no_grow to briefly pause arc + * growth to avoid compounding the memory pressure. + */ + arc_no_grow = B_TRUE; + + /* + * When direct reclaim is observed it usually indicates a rapid + * increase in memory pressure. This occurs because the kswapd + * threads were unable to asynchronously keep enough free memory + * available. + */ + if (current_is_kswapd()) { + ARCSTAT_BUMP(arcstat_memory_indirect_count); + } else { + ARCSTAT_BUMP(arcstat_memory_direct_count); + } + + return (sc->nr_to_scan); +} + +SPL_SHRINKER_DECLARE(arc_shrinker, + arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + uint64_t free_memory = arc_free_memory(); + + if (free_memory > arc_all_memory() * arc_lotsfree_percent / 100) + return (0); + + if (txg > spa->spa_lowmem_last_txg) { + spa->spa_lowmem_last_txg = txg; + spa->spa_lowmem_page_load = 0; + } + /* + * If we are in pageout, we know that memory is already tight, + * the arc is already going to be evicting, so we just want to + * continue to let page writes occur as quickly as possible. + */ + if (current_is_kswapd()) { + if (spa->spa_lowmem_page_load > + MAX(arc_sys_free / 4, free_memory) / 4) { + DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); + return (SET_ERROR(ERESTART)); + } + /* Note: reserve is inflated, so we deflate */ + atomic_add_64(&spa->spa_lowmem_page_load, reserve / 8); + return (0); + } else if (spa->spa_lowmem_page_load > 0 && arc_reclaim_needed()) { + /* memory is low, delay before restarting */ + ARCSTAT_INCR(arcstat_memory_throttle_count, 1); + DMU_TX_STAT_BUMP(dmu_tx_memory_reclaim); + return (SET_ERROR(EAGAIN)); + } + spa->spa_lowmem_page_load = 0; + return (0); +} + +void +arc_lowmem_init(void) +{ + uint64_t allmem = arc_all_memory(); + + /* + * Register a shrinker to support synchronous (direct) memory + * reclaim from the arc. 
This is done to prevent kswapd from + * swapping out pages when it is preferable to shrink the arc. + */ + spl_register_shrinker(&arc_shrinker); + + /* + * The ARC tries to keep at least this much memory available for the + * system. This gives the ARC time to shrink in response to memory + * pressure, before running completely out of memory and invoking the + * direct-reclaim ARC shrinker. + * + * This should be more than twice high_wmark_pages(), so that + * arc_wait_for_eviction() will wait until at least the + * high_wmark_pages() are free (see arc_evict_state_impl()). + * + * Note: Even when the system is very low on memory, the kernel's + * shrinker code may only ask for one "batch" of pages (512KB) to be + * evicted. If concurrent allocations consume these pages, there may + * still be insufficient free pages, and the OOM killer takes action. + * + * By setting arc_sys_free large enough, and having + * arc_wait_for_eviction() wait until there is at least arc_sys_free/2 + * free memory, it is much less likely that concurrent allocations can + * consume all the memory that was evicted before checking for + * OOM. + * + * It's hard to iterate the zones from a linux kernel module, which + * makes it difficult to determine the watermark dynamically. Instead + * we compute the maximum high watermark for this system, based + * on the amount of memory, assuming default parameters on Linux kernel + * 5.3. + */ + + /* + * Base wmark_low is 4 * the square root of Kbytes of RAM. + */ + long wmark = 4 * int_sqrt(allmem/1024) * 1024; + + /* + * Clamp to between 128K and 64MB. + */ + wmark = MAX(wmark, 128 * 1024); + wmark = MIN(wmark, 64 * 1024 * 1024); + + /* + * watermark_boost can increase the wmark by up to 150%. + */ + wmark += wmark * 150 / 100; + + /* + * arc_sys_free needs to be more than 2x the watermark, because + * arc_wait_for_eviction() waits for half of arc_sys_free. Bump this up + * to 3x to ensure we're above it. + */ + arc_sys_free = wmark * 3 + allmem / 32; +} + +void +arc_lowmem_fini(void) +{ + spl_unregister_shrinker(&arc_shrinker); +} + +int +param_set_arc_long(const char *buf, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_long(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + arc_tuning_update(B_TRUE); + + return (0); +} + +int +param_set_arc_int(const char *buf, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_int(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + arc_tuning_update(B_TRUE); + + return (0); +} +#else /* _KERNEL */ +int64_t +arc_available_memory(void) +{ + int64_t lowest = INT64_MAX; + + /* Every 100 calls, free a small amount */ + if (spa_get_random(100) == 0) + lowest = -1024; + + return (lowest); +} + +int +arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) +{ + return (0); +} + +uint64_t +arc_all_memory(void) +{ + return (ptob(physmem) / 2); +} + +uint64_t +arc_free_memory(void) +{ + return (spa_get_random(arc_all_memory() * 20 / 100)); +} +#endif /* _KERNEL */ + +/* + * Helper function for arc_prune_async() it is responsible for safely + * handling the execution of a registered arc_prune_func_t. + */ +static void +arc_prune_task(void *ptr) +{ + arc_prune_t *ap = (arc_prune_t *)ptr; + arc_prune_func_t *func = ap->p_pfunc; + + if (func != NULL) + func(ap->p_adjust, ap->p_private); + + zfs_refcount_remove(&ap->p_refcnt, func); +} + +/* + * Notify registered consumers they must drop holds on a portion of the ARC + * buffered they reference. 
This provides a mechanism to ensure the ARC can + * honor the arc_meta_limit and reclaim otherwise pinned ARC buffers. This + * is analogous to dnlc_reduce_cache() but more generic. + * + * This operation is performed asynchronously so it may be safely called + * in the context of the arc_reclaim_thread(). A reference is taken here + * for each registered arc_prune_t and the arc_prune_task() is responsible + * for releasing it once the registered arc_prune_func_t has completed. + */ +void +arc_prune_async(int64_t adjust) +{ + arc_prune_t *ap; + + mutex_enter(&arc_prune_mtx); + for (ap = list_head(&arc_prune_list); ap != NULL; + ap = list_next(&arc_prune_list, ap)) { + + if (zfs_refcount_count(&ap->p_refcnt) >= 2) + continue; + + zfs_refcount_add(&ap->p_refcnt, ap->p_pfunc); + ap->p_adjust = adjust; + if (taskq_dispatch(arc_prune_taskq, arc_prune_task, + ap, TQ_SLEEP) == TASKQID_INVALID) { + zfs_refcount_remove(&ap->p_refcnt, ap->p_pfunc); + continue; + } + ARCSTAT_BUMP(arcstat_prune); + } + mutex_exit(&arc_prune_mtx); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, shrinker_limit, INT, ZMOD_RW, + "Limit on number of pages that ARC shrinker can reclaim at once"); +/* END CSTYLED */ diff --git a/module/os/linux/zfs/mmp_os.c b/module/os/linux/zfs/mmp_os.c new file mode 100644 index 000000000000..ff3ef1bf6ad9 --- /dev/null +++ b/module/os/linux/zfs/mmp_os.c @@ -0,0 +1,41 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. + */ + +#include +#include + +int +param_set_multihost_interval(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return (ret); + + if (spa_mode_global != SPA_MODE_UNINIT) + mmp_signal_all_threads(); + + return (ret); +} diff --git a/module/os/linux/zfs/policy.c b/module/os/linux/zfs/policy.c new file mode 100644 index 000000000000..5267d67eea82 --- /dev/null +++ b/module/os/linux/zfs/policy.c @@ -0,0 +1,374 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2013, Joyent, Inc. All rights reserved.
+ * Copyright (C) 2016 Lawrence Livermore National Security, LLC.
+ *
+ * For Linux the vast majority of this enforcement is already handled via
+ * the standard Linux VFS permission checks. However, certain administrative
+ * commands which bypass the standard mechanisms may need to make use of
+ * this functionality.
+ */
+
+#include <sys/policy.h>
+#include <linux/security.h>
+#include <linux/vfs_compat.h>
+
+/*
+ * The passed credentials cannot be directly verified because Linux only
+ * provides an interface to check the *current* process credentials. In
+ * order to handle this, the capable() test is only run when the passed
+ * credentials match the current process credentials or the kcred. In
+ * all other cases this function must fail and return the passed err.
+ */
+static int
+priv_policy_ns(const cred_t *cr, int capability, int err,
+    struct user_namespace *ns)
+{
+	if (cr != CRED() && (cr != kcred))
+		return (err);
+
+#if defined(CONFIG_USER_NS)
+	if (!(ns ? ns_capable(ns, capability) : capable(capability)))
+#else
+	if (!capable(capability))
+#endif
+		return (err);
+
+	return (0);
+}
+
+static int
+priv_policy(const cred_t *cr, int capability, int err)
+{
+	return (priv_policy_ns(cr, capability, err, NULL));
+}
+
+static int
+priv_policy_user(const cred_t *cr, int capability, int err)
+{
+	/*
+	 * All priv_policy_user checks are preceded by kuid/kgid_has_mapping()
+	 * checks. If we cannot do them, we shouldn't be using ns_capable()
+	 * since we don't know whether the affected files are valid in our
+	 * namespace.
+	 */
+#if defined(CONFIG_USER_NS)
+	return (priv_policy_ns(cr, capability, err, cr->user_ns));
+#else
+	return (priv_policy_ns(cr, capability, err, NULL));
+#endif
+}
+
+/*
+ * Checks for operations that are either client-only or are used by
+ * both clients and servers.
+ */
+int
+secpolicy_nfs(const cred_t *cr)
+{
+	return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Catch all system configuration.
+ */
+int
+secpolicy_sys_config(const cred_t *cr, boolean_t checkonly)
+{
+	return (priv_policy(cr, CAP_SYS_ADMIN, EPERM));
+}
+
+/*
+ * Like secpolicy_vnode_access() but we get the actual wanted mode and the
+ * current mode of the file, not the missing bits.
+ *
+ * Enforced in the Linux VFS.
+ */
+int
+secpolicy_vnode_access2(const cred_t *cr, struct inode *ip, uid_t owner,
+    mode_t curmode, mode_t wantmode)
+{
+	return (0);
+}
+
+/*
+ * This is a special routine for ZFS; it is used to determine whether
+ * any of the privileges in effect allow any form of access to the
+ * file. There's no reason to audit this or any reason to record
+ * this. More work is needed to do the "KPLD" stuff.
+ */
+int
+secpolicy_vnode_any_access(const cred_t *cr, struct inode *ip, uid_t owner)
+{
+	if (crgetfsuid(cr) == owner)
+		return (0);
+
+	if (inode_owner_or_capable(ip))
+		return (0);
+
+#if defined(CONFIG_USER_NS)
+	if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner)))
+		return (EPERM);
+#endif
+
+	if (priv_policy_user(cr, CAP_DAC_OVERRIDE, EPERM) == 0)
+		return (0);
+
+	if (priv_policy_user(cr, CAP_DAC_READ_SEARCH, EPERM) == 0)
+		return (0);
+
+	return (EPERM);
+}
+
+/*
+ * Determine if subject can chown owner of a file.
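+ * Ownership short-circuits the check; otherwise CAP_FOWNER is required,
+ * and only if the owner's uid is mapped in the caller's user namespace.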
+ */ +int +secpolicy_vnode_chown(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, EPERM)); +} + +/* + * Determine if subject can change group ownership of a file. + */ +int +secpolicy_vnode_create_gid(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SETGID, EPERM)); +} + +/* + * Policy determines whether we can remove an entry from a directory, + * regardless of permission bits. + */ +int +secpolicy_vnode_remove(const cred_t *cr) +{ + return (priv_policy(cr, CAP_FOWNER, EPERM)); +} + +/* + * Determine that subject can modify the mode of a file. allzone privilege + * needed when modifying root owned object. + */ +int +secpolicy_vnode_setdac(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FOWNER, EPERM)); +} + +/* + * Are we allowed to retain the set-uid/set-gid bits when + * changing ownership or when writing to a file? + * "issuid" should be true when set-uid; only in that case + * root ownership is checked (setgid is assumed). + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setid_retain(const cred_t *cr, boolean_t issuidroot) +{ + return (priv_policy_user(cr, CAP_FSETID, EPERM)); +} + +/* + * Determine that subject can set the file setgid flag. + */ +int +secpolicy_vnode_setids_setgids(const cred_t *cr, gid_t gid) +{ +#if defined(CONFIG_USER_NS) + if (!kgid_has_mapping(cr->user_ns, SGID_TO_KGID(gid))) + return (EPERM); +#endif + if (crgetfsgid(cr) != gid && !groupmember(gid, cr)) + return (priv_policy_user(cr, CAP_FSETID, EPERM)); + + return (0); +} + +/* + * Determine if the subject can inject faults in the ZFS fault injection + * framework. Requires all privileges. + */ +int +secpolicy_zinject(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); +} + +/* + * Determine if the subject has permission to manipulate ZFS datasets + * (not pools). Equivalent to the SYS_MOUNT privilege. + */ +int +secpolicy_zfs(const cred_t *cr) +{ + return (priv_policy(cr, CAP_SYS_ADMIN, EACCES)); +} + +/* + * Equivalent to secpolicy_zfs(), but works even if the cred_t is not that of + * the current process. Takes both cred_t and proc_t so that this can work + * easily on all platforms. + * + * The has_capability() function was first exported in the 4.10 Linux kernel + * then backported to some LTS kernels. Prior to this change there was no + * mechanism to perform this check therefore EACCES is returned when the + * functionality is not present in the kernel. + */ +int +secpolicy_zfs_proc(const cred_t *cr, proc_t *proc) +{ +#if defined(HAVE_HAS_CAPABILITY) + if (!has_capability(proc, CAP_SYS_ADMIN)) + return (EACCES); + return (0); +#else + return (EACCES); +#endif +} + +void +secpolicy_setid_clear(vattr_t *vap, cred_t *cr) +{ + if ((vap->va_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + (vap->va_mode & S_ISUID) != 0 && + (vap->va_mask & AT_UID) != 0 && vap->va_uid == 0) != 0) { + vap->va_mask |= AT_MODE; + vap->va_mode &= ~(S_ISUID|S_ISGID); + } +} + +/* + * Determine that subject can set the file setid flags. 
+ */ +static int +secpolicy_vnode_setid_modify(const cred_t *cr, uid_t owner) +{ + if (crgetfsuid(cr) == owner) + return (0); + +#if defined(CONFIG_USER_NS) + if (!kuid_has_mapping(cr->user_ns, SUID_TO_KUID(owner))) + return (EPERM); +#endif + + return (priv_policy_user(cr, CAP_FSETID, EPERM)); +} + +/* + * Determine that subject can make a file a "sticky". + * + * Enforced in the Linux VFS. + */ +static int +secpolicy_vnode_stky_modify(const cred_t *cr) +{ + return (0); +} + +int +secpolicy_setid_setsticky_clear(struct inode *ip, vattr_t *vap, + const vattr_t *ovap, cred_t *cr) +{ + int error; + + if ((vap->va_mode & S_ISUID) != 0 && + (error = secpolicy_vnode_setid_modify(cr, + ovap->va_uid)) != 0) { + return (error); + } + + /* + * Check privilege if attempting to set the + * sticky bit on a non-directory. + */ + if (!S_ISDIR(ip->i_mode) && (vap->va_mode & S_ISVTX) != 0 && + secpolicy_vnode_stky_modify(cr) != 0) { + vap->va_mode &= ~S_ISVTX; + } + + /* + * Check for privilege if attempting to set the + * group-id bit. + */ + if ((vap->va_mode & S_ISGID) != 0 && + secpolicy_vnode_setids_setgids(cr, ovap->va_gid) != 0) { + vap->va_mode &= ~S_ISGID; + } + + return (0); +} + +/* + * Check privileges for setting xvattr attributes + */ +int +secpolicy_xvattr(xvattr_t *xvap, uid_t owner, cred_t *cr, mode_t type) +{ + return (secpolicy_vnode_chown(cr, owner)); +} + +/* + * Check privileges for setattr attributes. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_vnode_setattr(cred_t *cr, struct inode *ip, struct vattr *vap, + const struct vattr *ovap, int flags, + int unlocked_access(void *, int, cred_t *), void *node) +{ + return (0); +} + +/* + * Check privileges for links. + * + * Enforced in the Linux VFS. + */ +int +secpolicy_basic_link(const cred_t *cr) +{ + return (0); +} diff --git a/module/os/linux/zfs/qat.c b/module/os/linux/zfs/qat.c new file mode 100644 index 000000000000..08613b3a2042 --- /dev/null +++ b/module/os/linux/zfs/qat.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include + +qat_stats_t qat_stats = { + { "comp_requests", KSTAT_DATA_UINT64 }, + { "comp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "comp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decomp_requests", KSTAT_DATA_UINT64 }, + { "decomp_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decomp_total_out_bytes", KSTAT_DATA_UINT64 }, + { "dc_fails", KSTAT_DATA_UINT64 }, + { "encrypt_requests", KSTAT_DATA_UINT64 }, + { "encrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "encrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_requests", KSTAT_DATA_UINT64 }, + { "decrypt_total_in_bytes", KSTAT_DATA_UINT64 }, + { "decrypt_total_out_bytes", KSTAT_DATA_UINT64 }, + { "crypt_fails", KSTAT_DATA_UINT64 }, + { "cksum_requests", KSTAT_DATA_UINT64 }, + { "cksum_total_in_bytes", KSTAT_DATA_UINT64 }, + { "cksum_fails", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *qat_ksp = NULL; + +CpaStatus +qat_mem_alloc_contig(void **pp_mem_addr, Cpa32U size_bytes) +{ + *pp_mem_addr = kmalloc(size_bytes, GFP_KERNEL); + if (*pp_mem_addr == NULL) + return (CPA_STATUS_RESOURCE); + return (CPA_STATUS_SUCCESS); +} + +void +qat_mem_free_contig(void **pp_mem_addr) +{ + if (*pp_mem_addr != NULL) { + kfree(*pp_mem_addr); + *pp_mem_addr = NULL; + } +} + +int +qat_init(void) +{ + qat_ksp = kstat_create("zfs", 0, "qat", "misc", + KSTAT_TYPE_NAMED, sizeof (qat_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (qat_ksp != NULL) { + qat_ksp->ks_data = &qat_stats; + kstat_install(qat_ksp); + } + + /* + * Just set the disable flag when qat init failed, qat can be + * turned on again in post-process after zfs module is loaded, e.g.: + * echo 0 > /sys/module/zfs/parameters/zfs_qat_compress_disable + */ + if (qat_dc_init() != 0) + zfs_qat_compress_disable = 1; + + if (qat_cy_init() != 0) { + zfs_qat_checksum_disable = 1; + zfs_qat_encrypt_disable = 1; + } + + return (0); +} + +void +qat_fini(void) +{ + if (qat_ksp != NULL) { + kstat_delete(qat_ksp); + qat_ksp = NULL; + } + + qat_cy_fini(); + qat_dc_fini(); +} + +#endif diff --git a/module/os/linux/zfs/qat_compress.c b/module/os/linux/zfs/qat_compress.c new file mode 100644 index 000000000000..ad3ead3b16e3 --- /dev/null +++ b/module/os/linux/zfs/qat_compress.c @@ -0,0 +1,569 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#if defined(_KERNEL) && defined(HAVE_QAT) +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Max instances in a QAT device, each instance is a channel to submit + * jobs to QAT hardware, this is only for pre-allocating instance and + * session arrays; the actual number of instances are defined in the + * QAT driver's configuration file. + */ +#define QAT_DC_MAX_INSTANCES 48 + +/* + * ZLIB head and foot size + */ +#define ZLIB_HEAD_SZ 2 +#define ZLIB_FOOT_SZ 4 + +static CpaInstanceHandle dc_inst_handles[QAT_DC_MAX_INSTANCES]; +static CpaDcSessionHandle session_handles[QAT_DC_MAX_INSTANCES]; +static CpaBufferList **buffer_array[QAT_DC_MAX_INSTANCES]; +static Cpa16U num_inst = 0; +static Cpa32U inst_num = 0; +static boolean_t qat_dc_init_done = B_FALSE; +int zfs_qat_compress_disable = 0; + +boolean_t +qat_dc_use_accel(size_t s_len) +{ + return (!zfs_qat_compress_disable && + qat_dc_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +static void +qat_dc_callback(void *p_callback, CpaStatus status) +{ + if (p_callback != NULL) + complete((struct completion *)p_callback); +} + +static void +qat_dc_clean(void) +{ + Cpa16U buff_num = 0; + Cpa16U num_inter_buff_lists = 0; + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcStopInstance(dc_inst_handles[i]); + QAT_PHYS_CONTIG_FREE(session_handles[i]); + /* free intermediate buffers */ + if (buffer_array[i] != NULL) { + cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + CpaBufferList *buffer_inter = + buffer_array[i][buff_num]; + if (buffer_inter->pBuffers) { + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers->pData); + QAT_PHYS_CONTIG_FREE( + buffer_inter->pBuffers); + } + QAT_PHYS_CONTIG_FREE( + buffer_inter->pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(buffer_inter); + } + } + } + + num_inst = 0; + qat_dc_init_done = B_FALSE; +} + +int +qat_dc_init(void) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U sess_size = 0; + Cpa32U ctx_size = 0; + Cpa16U num_inter_buff_lists = 0; + Cpa16U buff_num = 0; + Cpa32U buff_meta_size = 0; + CpaDcSessionSetupData sd = {0}; + + if (qat_dc_init_done) + return (0); + + status = cpaDcGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT compression units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_DC_MAX_INSTANCES) + num_inst = QAT_DC_MAX_INSTANCES; + + status = cpaDcGetInstances(num_inst, &dc_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + cpaDcSetAddressTranslation(dc_inst_handles[i], + (void*)virt_to_phys); + + status = cpaDcBufferListGetMetaSize(dc_inst_handles[i], + 1, &buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = cpaDcGetNumIntermediateBuffers( + dc_inst_handles[i], &num_inter_buff_lists); + + if (status == CPA_STATUS_SUCCESS && num_inter_buff_lists != 0) + status = QAT_PHYS_CONTIG_ALLOC(&buffer_array[i], + num_inter_buff_lists * + sizeof (CpaBufferList *)); + + for (buff_num = 0; buff_num < num_inter_buff_lists; + buff_num++) { + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num], + 
sizeof (CpaBufferList)); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]-> + pPrivateMetaData, + buff_meta_size); + + if (status == CPA_STATUS_SUCCESS) + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers, + sizeof (CpaFlatBuffer)); + + if (status == CPA_STATUS_SUCCESS) { + /* + * implementation requires an intermediate + * buffer approximately twice the size of + * output buffer, which is 2x max buffer + * size here. + */ + status = QAT_PHYS_CONTIG_ALLOC( + &buffer_array[i][buff_num]->pBuffers-> + pData, 2 * QAT_MAX_BUF_SIZE); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + buffer_array[i][buff_num]->numBuffers = 1; + buffer_array[i][buff_num]->pBuffers-> + dataLenInBytes = 2 * QAT_MAX_BUF_SIZE; + } + } + + status = cpaDcStartInstance(dc_inst_handles[i], + num_inter_buff_lists, buffer_array[i]); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + sd.compLevel = CPA_DC_L1; + sd.compType = CPA_DC_DEFLATE; + sd.huffType = CPA_DC_HT_FULL_DYNAMIC; + sd.sessDirection = CPA_DC_DIR_COMBINED; + sd.sessState = CPA_DC_STATELESS; + sd.deflateWindowSize = 7; + sd.checksum = CPA_DC_ADLER32; + status = cpaDcGetSessionSize(dc_inst_handles[i], + &sd, &sess_size, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + QAT_PHYS_CONTIG_ALLOC(&session_handles[i], sess_size); + if (session_handles[i] == NULL) + goto fail; + + status = cpaDcInitSession(dc_inst_handles[i], + session_handles[i], + &sd, NULL, qat_dc_callback); + if (status != CPA_STATUS_SUCCESS) + goto fail; + } + + qat_dc_init_done = B_TRUE; + return (0); +fail: + qat_dc_clean(); + return (-1); +} + +void +qat_dc_fini(void) +{ + if (!qat_dc_init_done) + return; + + qat_dc_clean(); +} + +/* + * The "add" parameter is an additional buffer which is passed + * to QAT as a scratch buffer alongside the destination buffer + * in case the "compressed" data ends up being larger than the + * original source data. This is necessary to prevent QAT from + * generating buffer overflow warnings for incompressible data. + */ +static int +qat_compress_impl(qat_compress_dir_t dir, char *src, int src_len, + char *dst, int dst_len, char *add, int add_len, size_t *c_len) +{ + CpaInstanceHandle dc_inst_handle; + CpaDcSessionHandle session_handle; + CpaBufferList *buf_list_src = NULL; + CpaBufferList *buf_list_dst = NULL; + CpaFlatBuffer *flat_buf_src = NULL; + CpaFlatBuffer *flat_buf_dst = NULL; + Cpa8U *buffer_meta_src = NULL; + Cpa8U *buffer_meta_dst = NULL; + Cpa32U buffer_meta_size = 0; + CpaDcRqResults dc_results; + CpaStatus status = CPA_STATUS_FAIL; + Cpa32U hdr_sz = 0; + Cpa32U compressed_sz; + Cpa32U num_src_buf = (src_len >> PAGE_SHIFT) + 2; + Cpa32U num_dst_buf = (dst_len >> PAGE_SHIFT) + 2; + Cpa32U num_add_buf = (add_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left; + Cpa32U dst_pages = 0; + Cpa32U adler32 = 0; + char *data; + struct page *page; + struct page **in_pages = NULL; + struct page **out_pages = NULL; + struct page **add_pages = NULL; + Cpa32U page_off = 0; + struct completion complete; + Cpa32U page_num = 0; + Cpa16U i; + + /* + * We increment num_src_buf and num_dst_buf by 2 to allow + * us to handle non page-aligned buffer addresses and buffers + * whose sizes are not divisible by PAGE_SIZE. 
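+ *
+ * For example (assuming 4 KiB pages): a 128 KiB buffer that starts
+ * in the middle of a page touches 33 pages, while
+ * (128 KiB >> PAGE_SHIFT) is only 32; the two extra entries cover
+ * the partial head and tail pages.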
+ */ + Cpa32U src_buffer_list_mem_size = sizeof (CpaBufferList) + + (num_src_buf * sizeof (CpaFlatBuffer)); + Cpa32U dst_buffer_list_mem_size = sizeof (CpaBufferList) + + ((num_dst_buf + num_add_buf) * sizeof (CpaFlatBuffer)); + + status = QAT_PHYS_CONTIG_ALLOC(&in_pages, + num_src_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&out_pages, + num_dst_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&add_pages, + num_add_buf * sizeof (struct page *)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + dc_inst_handle = dc_inst_handles[i]; + session_handle = session_handles[i]; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_src_buf, + &buffer_meta_size); + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_src, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + cpaDcBufferListGetMetaSize(dc_inst_handle, num_dst_buf + num_add_buf, + &buffer_meta_size); + status = QAT_PHYS_CONTIG_ALLOC(&buffer_meta_dst, buffer_meta_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* build source buffer list */ + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_src, src_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_src = (CpaFlatBuffer *)(buf_list_src + 1); + + buf_list_src->pBuffers = flat_buf_src; /* always point to first one */ + + /* build destination buffer list */ + status = QAT_PHYS_CONTIG_ALLOC(&buf_list_dst, dst_buffer_list_mem_size); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + + buf_list_dst->pBuffers = flat_buf_dst; /* always point to first one */ + + buf_list_src->numBuffers = 0; + buf_list_src->pPrivateMetaData = buffer_meta_src; + bytes_left = src_len; + data = src; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + in_pages[page_num] = page; + flat_buf_src->pData = kmap(page) + page_off; + flat_buf_src->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_src->dataLenInBytes; + data += flat_buf_src->dataLenInBytes; + flat_buf_src++; + buf_list_src->numBuffers++; + page_num++; + } + + buf_list_dst->numBuffers = 0; + buf_list_dst->pPrivateMetaData = buffer_meta_dst; + bytes_left = dst_len; + data = dst; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + out_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + dst_pages++; + } + + /* map additional scratch pages into the destination buffer list */ + bytes_left = add_len; + data = add; + page_num = 0; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + page = qat_mem_to_page(data); + flat_buf_dst->pData = kmap(page) + page_off; + add_pages[page_num] = page; + flat_buf_dst->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + + bytes_left -= flat_buf_dst->dataLenInBytes; + data += flat_buf_dst->dataLenInBytes; + flat_buf_dst++; + buf_list_dst->numBuffers++; + page_num++; + } + + init_completion(&complete); + + if (dir == QAT_COMPRESS) { + QAT_STAT_BUMP(comp_requests); + 
QAT_STAT_INCR(comp_total_in_bytes, src_len); + + cpaDcGenerateHeader(session_handle, + buf_list_dst->pBuffers, &hdr_sz); + buf_list_dst->pBuffers->pData += hdr_sz; + buf_list_dst->pBuffers->dataLenInBytes -= hdr_sz; + status = cpaDcCompressData( + dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, + &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + if (status != CPA_STATUS_SUCCESS) { + goto fail; + } + + /* we now wait until the completion of the operation. */ + wait_for_completion(&complete); + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + compressed_sz = dc_results.produced; + if (compressed_sz + hdr_sz + ZLIB_FOOT_SZ > dst_len) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + flat_buf_dst = (CpaFlatBuffer *)(buf_list_dst + 1); + /* move to the last page */ + flat_buf_dst += (compressed_sz + hdr_sz) >> PAGE_SHIFT; + + /* no space for gzip footer in the last page */ + if (((compressed_sz + hdr_sz) % PAGE_SIZE) + + ZLIB_FOOT_SZ > PAGE_SIZE) { + status = CPA_STATUS_INCOMPRESSIBLE; + goto fail; + } + + /* jump to the end of the buffer and append footer */ + flat_buf_dst->pData = + (char *)((unsigned long)flat_buf_dst->pData & PAGE_MASK) + + ((compressed_sz + hdr_sz) % PAGE_SIZE); + flat_buf_dst->dataLenInBytes = ZLIB_FOOT_SZ; + + dc_results.produced = 0; + status = cpaDcGenerateFooter(session_handle, + flat_buf_dst, &dc_results); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + *c_len = compressed_sz + dc_results.produced + hdr_sz; + QAT_STAT_INCR(comp_total_out_bytes, *c_len); + } else { + ASSERT3U(dir, ==, QAT_DECOMPRESS); + QAT_STAT_BUMP(decomp_requests); + QAT_STAT_INCR(decomp_total_in_bytes, src_len); + + buf_list_src->pBuffers->pData += ZLIB_HEAD_SZ; + buf_list_src->pBuffers->dataLenInBytes -= ZLIB_HEAD_SZ; + status = cpaDcDecompressData(dc_inst_handle, session_handle, + buf_list_src, buf_list_dst, &dc_results, CPA_DC_FLUSH_FINAL, + &complete); + + if (CPA_STATUS_SUCCESS != status) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* we now wait until the completion of the operation. */ + wait_for_completion(&complete); + + if (dc_results.status != CPA_STATUS_SUCCESS) { + status = CPA_STATUS_FAIL; + goto fail; + } + + /* verify adler checksum */ + adler32 = *(Cpa32U *)(src + dc_results.consumed + ZLIB_HEAD_SZ); + if (adler32 != BSWAP_32(dc_results.checksum)) { + status = CPA_STATUS_FAIL; + goto fail; + } + *c_len = dc_results.produced; + QAT_STAT_INCR(decomp_total_out_bytes, *c_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS && status != CPA_STATUS_INCOMPRESSIBLE) + QAT_STAT_BUMP(dc_fails); + + if (in_pages) { + for (page_num = 0; + page_num < buf_list_src->numBuffers; + page_num++) { + kunmap(in_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(in_pages); + } + + if (out_pages) { + for (page_num = 0; page_num < dst_pages; page_num++) { + kunmap(out_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(out_pages); + } + + if (add_pages) { + for (page_num = 0; + page_num < buf_list_dst->numBuffers - dst_pages; + page_num++) { + kunmap(add_pages[page_num]); + } + QAT_PHYS_CONTIG_FREE(add_pages); + } + + QAT_PHYS_CONTIG_FREE(buffer_meta_src); + QAT_PHYS_CONTIG_FREE(buffer_meta_dst); + QAT_PHYS_CONTIG_FREE(buf_list_src); + QAT_PHYS_CONTIG_FREE(buf_list_dst); + + return (status); +} + +/* + * Entry point for QAT accelerated compression / decompression. 
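+ *
+ * For compression, a scratch buffer as large as the destination is
+ * allocated and passed to qat_compress_impl() as "add", giving the
+ * hardware room to spill output for incompressible input without
+ * reporting a buffer overflow.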
+ */
+int
+qat_compress(qat_compress_dir_t dir, char *src, int src_len,
+    char *dst, int dst_len, size_t *c_len)
+{
+	int ret;
+	size_t add_len = 0;
+	void *add = NULL;
+
+	if (dir == QAT_COMPRESS) {
+		add_len = dst_len;
+		add = zio_data_buf_alloc(add_len);
+	}
+
+	ret = qat_compress_impl(dir, src, src_len, dst,
+	    dst_len, add, add_len, c_len);
+
+	if (dir == QAT_COMPRESS)
+		zio_data_buf_free(add, add_len);
+
+	return (ret);
+}
+
+static int
+param_set_qat_compress(const char *val, zfs_kernel_param_t *kp)
+{
+	int ret;
+	int *pvalue = kp->arg;
+	ret = param_set_int(val, kp);
+	if (ret)
+		return (ret);
+	/*
+	 * zfs_qat_compress_disable = 0: enable QAT compression;
+	 * try to initialize the QAT instances if that has not been done.
+	 */
+	if (*pvalue == 0 && !qat_dc_init_done) {
+		ret = qat_dc_init();
+		if (ret != 0) {
+			zfs_qat_compress_disable = 1;
+			return (ret);
+		}
+	}
+	return (ret);
+}
+
+module_param_call(zfs_qat_compress_disable, param_set_qat_compress,
+    param_get_int, &zfs_qat_compress_disable, 0644);
+MODULE_PARM_DESC(zfs_qat_compress_disable, "Enable/Disable QAT compression");
+
+#endif
diff --git a/module/os/linux/zfs/qat_crypt.c b/module/os/linux/zfs/qat_crypt.c
new file mode 100644
index 000000000000..4771b2f3bec5
--- /dev/null
+++ b/module/os/linux/zfs/qat_crypt.c
@@ -0,0 +1,630 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * This file represents the QAT implementation of checksums and encryption.
+ * Internally, QAT shares the same cryptographic instances for both of these
+ * operations, so the code has been combined here. QAT data compression uses
+ * compression instances, so that code is separated into qat_compress.c.
+ */
+
+#if defined(_KERNEL) && defined(HAVE_QAT)
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+#include <linux/pagemap.h>
+#include <linux/completion.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_crypt.h>
+#include "lac/cpa_cy_im.h"
+#include "lac/cpa_cy_common.h"
+#include <sys/qat.h>
+
+/*
+ * Max instances in a QAT device; each instance is a channel to submit
+ * jobs to the QAT hardware. This is only for pre-allocating the
+ * instance and session arrays; the actual number of instances is
+ * defined in the QAT driver's configuration file.
+ */ +#define QAT_CRYPT_MAX_INSTANCES 48 + +#define MAX_PAGE_NUM 1024 + +static Cpa32U inst_num = 0; +static Cpa16U num_inst = 0; +static CpaInstanceHandle cy_inst_handles[QAT_CRYPT_MAX_INSTANCES]; +static boolean_t qat_cy_init_done = B_FALSE; +int zfs_qat_encrypt_disable = 0; +int zfs_qat_checksum_disable = 0; + +typedef struct cy_callback { + CpaBoolean verify_result; + struct completion complete; +} cy_callback_t; + +static void +symcallback(void *p_callback, CpaStatus status, const CpaCySymOp operation, + void *op_data, CpaBufferList *buf_list_dst, CpaBoolean verify) +{ + cy_callback_t *cb = p_callback; + + if (cb != NULL) { + /* indicate that the function has been called */ + cb->verify_result = verify; + complete(&cb->complete); + } +} + +boolean_t +qat_crypt_use_accel(size_t s_len) +{ + return (!zfs_qat_encrypt_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +boolean_t +qat_checksum_use_accel(size_t s_len) +{ + return (!zfs_qat_checksum_disable && + qat_cy_init_done && + s_len >= QAT_MIN_BUF_SIZE && + s_len <= QAT_MAX_BUF_SIZE); +} + +void +qat_cy_clean(void) +{ + for (Cpa16U i = 0; i < num_inst; i++) + cpaCyStopInstance(cy_inst_handles[i]); + + num_inst = 0; + qat_cy_init_done = B_FALSE; +} + +int +qat_cy_init(void) +{ + CpaStatus status = CPA_STATUS_FAIL; + + if (qat_cy_init_done) + return (0); + + status = cpaCyGetNumInstances(&num_inst); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + /* if the user has configured no QAT encryption units just return */ + if (num_inst == 0) + return (0); + + if (num_inst > QAT_CRYPT_MAX_INSTANCES) + num_inst = QAT_CRYPT_MAX_INSTANCES; + + status = cpaCyGetInstances(num_inst, &cy_inst_handles[0]); + if (status != CPA_STATUS_SUCCESS) + return (-1); + + for (Cpa16U i = 0; i < num_inst; i++) { + status = cpaCySetAddressTranslation(cy_inst_handles[i], + (void *)virt_to_phys); + if (status != CPA_STATUS_SUCCESS) + goto error; + + status = cpaCyStartInstance(cy_inst_handles[i]); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + qat_cy_init_done = B_TRUE; + return (0); + +error: + qat_cy_clean(); + return (-1); +} + +void +qat_cy_fini(void) +{ + if (!qat_cy_init_done) + return; + + qat_cy_clean(); +} + +static CpaStatus +qat_init_crypt_session_ctx(qat_encrypt_dir_t dir, CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, crypto_key_t *key, + Cpa64U crypt, Cpa32U aad_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U ciper_algorithm; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_CCM) { + return (CPA_STATUS_FAIL); + } else { + ciper_algorithm = CPA_CY_SYM_CIPHER_AES_GCM; + hash_algorithm = CPA_CY_SYM_HASH_AES_GCM; + } + + sd.cipherSetupData.cipherAlgorithm = ciper_algorithm; + sd.cipherSetupData.pCipherKey = key->ck_data; + sd.cipherSetupData.cipherKeyLenInBytes = key->ck_length / 8; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_AUTH; + sd.hashSetupData.digestResultLenInBytes = ZIO_DATA_MAC_LEN; + sd.hashSetupData.authModeSetupData.aadLenInBytes = aad_len; + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_ALGORITHM_CHAINING; + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + if (dir == QAT_ENCRYPT) { + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_ENCRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_HASH_THEN_CIPHER; + } else { + ASSERT3U(dir, 
==, QAT_DECRYPT); + sd.cipherSetupData.cipherDirection = + CPA_CY_SYM_CIPHER_DIRECTION_DECRYPT; + sd.algChainOrder = + CPA_CY_SYM_ALG_CHAIN_ORDER_CIPHER_THEN_HASH; + } + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_checksum_session_ctx(CpaInstanceHandle inst_handle, + CpaCySymSessionCtx **cy_session_ctx, Cpa64U cksum) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U ctx_size; + Cpa32U hash_algorithm; + CpaCySymSessionSetupData sd = { 0 }; + + /* + * ZFS's SHA512 checksum is actually SHA512/256, which uses + * a different IV from standard SHA512. QAT does not support + * SHA512/256, so we can only support SHA256. + */ + if (cksum == ZIO_CHECKSUM_SHA256) + hash_algorithm = CPA_CY_SYM_HASH_SHA256; + else + return (CPA_STATUS_FAIL); + + sd.sessionPriority = CPA_CY_PRIORITY_NORMAL; + sd.symOperation = CPA_CY_SYM_OP_HASH; + sd.hashSetupData.hashAlgorithm = hash_algorithm; + sd.hashSetupData.hashMode = CPA_CY_SYM_HASH_MODE_PLAIN; + sd.hashSetupData.digestResultLenInBytes = sizeof (zio_cksum_t); + sd.digestIsAppended = CPA_FALSE; + sd.verifyDigest = CPA_FALSE; + + status = cpaCySymSessionCtxGetSize(inst_handle, &sd, &ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(cy_session_ctx, ctx_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = cpaCySymInitSession(inst_handle, symcallback, &sd, + *cy_session_ctx); + if (status != CPA_STATUS_SUCCESS) { + QAT_PHYS_CONTIG_FREE(*cy_session_ctx); + return (status); + } + + return (CPA_STATUS_SUCCESS); +} + +static CpaStatus +qat_init_cy_buffer_lists(CpaInstanceHandle inst_handle, uint32_t nr_bufs, + CpaBufferList *src, CpaBufferList *dst) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa32U meta_size = 0; + + status = cpaCyBufferListGetMetaSize(inst_handle, nr_bufs, &meta_size); + if (status != CPA_STATUS_SUCCESS) + return (status); + + status = QAT_PHYS_CONTIG_ALLOC(&src->pPrivateMetaData, meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + + if (src != dst) { + status = QAT_PHYS_CONTIG_ALLOC(&dst->pPrivateMetaData, + meta_size); + if (status != CPA_STATUS_SUCCESS) + goto error; + } + + return (CPA_STATUS_SUCCESS); + +error: + QAT_PHYS_CONTIG_FREE(src->pPrivateMetaData); + if (src != dst) + QAT_PHYS_CONTIG_FREE(dst->pPrivateMetaData); + + return (status); +} + +int +qat_crypt(qat_encrypt_dir_t dir, uint8_t *src_buf, uint8_t *dst_buf, + uint8_t *aad_buf, uint32_t aad_len, uint8_t *iv_buf, uint8_t *digest_buf, + crypto_key_t *key, uint64_t crypt, uint32_t enc_len) +{ + CpaStatus status = CPA_STATUS_SUCCESS; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (enc_len >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaBufferList dst_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + CpaFlatBuffer *flat_dst_buf_array = NULL; + CpaFlatBuffer *flat_dst_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + struct 
page *out_pages[MAX_PAGE_NUM]; + Cpa32U in_page_num = 0; + Cpa32U out_page_num = 0; + Cpa32U in_page_off = 0; + Cpa32U out_page_off = 0; + + if (dir == QAT_ENCRYPT) { + QAT_STAT_BUMP(encrypt_requests); + QAT_STAT_INCR(encrypt_total_in_bytes, enc_len); + } else { + QAT_STAT_BUMP(decrypt_requests); + QAT_STAT_INCR(decrypt_total_in_bytes, enc_len); + } + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_crypt_session_ctx(dir, cy_inst_handle, + &cy_session_ctx, key, crypt, aad_len); + if (status != CPA_STATUS_SUCCESS) { + /* don't count CCM as a failure since it's not supported */ + if (zio_crypt_table[crypt].ci_crypt_type == ZC_TYPE_GCM) + QAT_STAT_BUMP(crypt_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. + */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &dst_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&flat_dst_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pDigestResult, + ZIO_DATA_MAC_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pIv, + ZIO_DATA_IV_LEN); + if (status != CPA_STATUS_SUCCESS) + goto fail; + if (aad_len > 0) { + status = QAT_PHYS_CONTIG_ALLOC(&op_data.pAdditionalAuthData, + aad_len); + if (status != CPA_STATUS_SUCCESS) + goto fail; + bcopy(aad_buf, op_data.pAdditionalAuthData, aad_len); + } + + bytes_left = enc_len; + data = src_buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + in_page_off = ((long)data & ~PAGE_MASK); + in_pages[in_page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[in_page_num]) + in_page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - in_page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + in_page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = in_page_num; + + bytes_left = enc_len; + data = dst_buf; + flat_dst_buf = flat_dst_buf_array; + while (bytes_left > 0) { + out_page_off = ((long)data & ~PAGE_MASK); + out_pages[out_page_num] = qat_mem_to_page(data); + flat_dst_buf->pData = kmap(out_pages[out_page_num]) + + out_page_off; + flat_dst_buf->dataLenInBytes = + min((long)PAGE_SIZE - out_page_off, (long)bytes_left); + data += flat_dst_buf->dataLenInBytes; + bytes_left -= flat_dst_buf->dataLenInBytes; + flat_dst_buf++; + out_page_num++; + } + dst_buffer_list.pBuffers = flat_dst_buf_array; + dst_buffer_list.numBuffers = out_page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.cryptoStartSrcOffsetInBytes = 0; + op_data.messageLenToCipherInBytes = 0; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = 0; + op_data.messageLenToCipherInBytes = enc_len; + op_data.ivLenInBytes = ZIO_DATA_IV_LEN; + bcopy(iv_buf, op_data.pIv, ZIO_DATA_IV_LEN); + /* if dir is QAT_DECRYPT, copy digest_buf to pDigestResult */ + if (dir == QAT_DECRYPT) + bcopy(digest_buf, op_data.pDigestResult, ZIO_DATA_MAC_LEN); + + cb.verify_result = CPA_FALSE; + 
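+	/*
+	 * cpaCySymPerformOp() is asynchronous: symcallback() records
+	 * the digest verification result in cb and signals cb.complete,
+	 * which is waited on below.
+	 */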
init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &dst_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + if (dir == QAT_ENCRYPT) { + /* if dir is QAT_ENCRYPT, save pDigestResult to digest_buf */ + bcopy(op_data.pDigestResult, digest_buf, ZIO_DATA_MAC_LEN); + QAT_STAT_INCR(encrypt_total_out_bytes, enc_len); + } else { + QAT_STAT_INCR(decrypt_total_out_bytes, enc_len); + } + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(crypt_fails); + + for (i = 0; i < in_page_num; i++) + kunmap(in_pages[i]); + for (i = 0; i < out_page_num; i++) + kunmap(out_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + if (aad_len > 0) + QAT_PHYS_CONTIG_FREE(op_data.pAdditionalAuthData); + QAT_PHYS_CONTIG_FREE(op_data.pIv); + QAT_PHYS_CONTIG_FREE(op_data.pDigestResult); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(dst_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + QAT_PHYS_CONTIG_FREE(flat_dst_buf_array); + + return (status); +} + +int +qat_checksum(uint64_t cksum, uint8_t *buf, uint64_t size, zio_cksum_t *zcp) +{ + CpaStatus status; + Cpa16U i; + CpaInstanceHandle cy_inst_handle; + Cpa16U nr_bufs = (size >> PAGE_SHIFT) + 2; + Cpa32U bytes_left = 0; + Cpa8S *data = NULL; + CpaCySymSessionCtx *cy_session_ctx = NULL; + cy_callback_t cb; + Cpa8U *digest_buffer = NULL; + CpaCySymOpData op_data = { 0 }; + CpaBufferList src_buffer_list = { 0 }; + CpaFlatBuffer *flat_src_buf_array = NULL; + CpaFlatBuffer *flat_src_buf = NULL; + struct page *in_pages[MAX_PAGE_NUM]; + Cpa32U page_num = 0; + Cpa32U page_off = 0; + + QAT_STAT_BUMP(cksum_requests); + QAT_STAT_INCR(cksum_total_in_bytes, size); + + i = (Cpa32U)atomic_inc_32_nv(&inst_num) % num_inst; + cy_inst_handle = cy_inst_handles[i]; + + status = qat_init_checksum_session_ctx(cy_inst_handle, + &cy_session_ctx, cksum); + if (status != CPA_STATUS_SUCCESS) { + /* don't count unsupported checksums as a failure */ + if (cksum == ZIO_CHECKSUM_SHA256 || + cksum == ZIO_CHECKSUM_SHA512) + QAT_STAT_BUMP(cksum_fails); + return (status); + } + + /* + * We increment nr_bufs by 2 to allow us to handle non + * page-aligned buffer addresses and buffers whose sizes + * are not divisible by PAGE_SIZE. 
+ */ + status = qat_init_cy_buffer_lists(cy_inst_handle, nr_bufs, + &src_buffer_list, &src_buffer_list); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + status = QAT_PHYS_CONTIG_ALLOC(&flat_src_buf_array, + nr_bufs * sizeof (CpaFlatBuffer)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + status = QAT_PHYS_CONTIG_ALLOC(&digest_buffer, + sizeof (zio_cksum_t)); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + bytes_left = size; + data = buf; + flat_src_buf = flat_src_buf_array; + while (bytes_left > 0) { + page_off = ((long)data & ~PAGE_MASK); + in_pages[page_num] = qat_mem_to_page(data); + flat_src_buf->pData = kmap(in_pages[page_num]) + page_off; + flat_src_buf->dataLenInBytes = + min((long)PAGE_SIZE - page_off, (long)bytes_left); + data += flat_src_buf->dataLenInBytes; + bytes_left -= flat_src_buf->dataLenInBytes; + flat_src_buf++; + page_num++; + } + src_buffer_list.pBuffers = flat_src_buf_array; + src_buffer_list.numBuffers = page_num; + + op_data.sessionCtx = cy_session_ctx; + op_data.packetType = CPA_CY_SYM_PACKET_TYPE_FULL; + op_data.hashStartSrcOffsetInBytes = 0; + op_data.messageLenToHashInBytes = size; + op_data.pDigestResult = digest_buffer; + + cb.verify_result = CPA_FALSE; + init_completion(&cb.complete); + status = cpaCySymPerformOp(cy_inst_handle, &cb, &op_data, + &src_buffer_list, &src_buffer_list, NULL); + if (status != CPA_STATUS_SUCCESS) + goto fail; + + /* we now wait until the completion of the operation. */ + wait_for_completion(&cb.complete); + + if (cb.verify_result == CPA_FALSE) { + status = CPA_STATUS_FAIL; + goto fail; + } + + bcopy(digest_buffer, zcp, sizeof (zio_cksum_t)); + +fail: + if (status != CPA_STATUS_SUCCESS) + QAT_STAT_BUMP(cksum_fails); + + for (i = 0; i < page_num; i++) + kunmap(in_pages[i]); + + cpaCySymRemoveSession(cy_inst_handle, cy_session_ctx); + QAT_PHYS_CONTIG_FREE(digest_buffer); + QAT_PHYS_CONTIG_FREE(src_buffer_list.pPrivateMetaData); + QAT_PHYS_CONTIG_FREE(cy_session_ctx); + QAT_PHYS_CONTIG_FREE(flat_src_buf_array); + + return (status); +} + +static int +param_set_qat_encrypt(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * zfs_qat_encrypt_disable = 0: enable qat encrypt + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_encrypt_disable = 1; + return (ret); + } + } + return (ret); +} + +static int +param_set_qat_checksum(const char *val, zfs_kernel_param_t *kp) +{ + int ret; + int *pvalue = kp->arg; + ret = param_set_int(val, kp); + if (ret) + return (ret); + /* + * set_checksum_param_ops = 0: enable qat checksum + * try to initialize qat instance if it has not been done + */ + if (*pvalue == 0 && !qat_cy_init_done) { + ret = qat_cy_init(); + if (ret != 0) { + zfs_qat_checksum_disable = 1; + return (ret); + } + } + return (ret); +} + +module_param_call(zfs_qat_encrypt_disable, param_set_qat_encrypt, + param_get_int, &zfs_qat_encrypt_disable, 0644); +MODULE_PARM_DESC(zfs_qat_encrypt_disable, "Enable/Disable QAT encryption"); + +module_param_call(zfs_qat_checksum_disable, param_set_qat_checksum, + param_get_int, &zfs_qat_checksum_disable, 0644); +MODULE_PARM_DESC(zfs_qat_checksum_disable, "Enable/Disable QAT checksumming"); + +#endif diff --git a/module/os/linux/zfs/spa_misc_os.c b/module/os/linux/zfs/spa_misc_os.c new file mode 100644 index 000000000000..5672cd6d5c5e --- /dev/null +++ b/module/os/linux/zfs/spa_misc_os.c @@ 
-0,0 +1,110 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_prop.h" + + +int +param_set_deadman_failmode(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = -param_set_deadman_failmode_common(val); + if (error == 0) + error = param_set_charp(val, kp); + + return (error); +} + +int +param_set_deadman_ziotime(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_ulong(val, kp); + if (error < 0) + return (SET_ERROR(error)); + + spa_set_deadman_ziotime(MSEC2NSEC(zfs_deadman_ziotime_ms)); + + return (0); +} + +int +param_set_deadman_synctime(const char *val, zfs_kernel_param_t *kp) +{ + int error; + + error = param_set_ulong(val, kp); + if (error < 0) + return (SET_ERROR(error)); + + spa_set_deadman_synctime(MSEC2NSEC(zfs_deadman_synctime_ms)); + + return (0); +} + +int +param_set_slop_shift(const char *buf, zfs_kernel_param_t *kp) +{ + unsigned long val; + int error; + + error = kstrtoul(buf, 0, &val); + if (error) + return (SET_ERROR(error)); + + if (val < 1 || val > 31) + return (SET_ERROR(-EINVAL)); + + error = param_set_int(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +const char * +spa_history_zone(void) +{ + return ("linux"); +} diff --git a/module/os/linux/zfs/spa_stats.c b/module/os/linux/zfs/spa_stats.c new file mode 100644 index 000000000000..86cefa6dddab --- /dev/null +++ b/module/os/linux/zfs/spa_stats.c @@ -0,0 +1,1047 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include + +/* + * Keeps stats on last N reads per spa_t, disabled by default. + */ +int zfs_read_history = 0; + +/* + * Include cache hits in history, disabled by default. + */ +int zfs_read_history_hits = 0; + +/* + * Keeps stats on the last 100 txgs by default. + */ +int zfs_txg_history = 100; + +/* + * Keeps stats on the last N MMP updates, disabled by default. + */ +int zfs_multihost_history = 0; + +/* + * ========================================================================== + * SPA Read History Routines + * ========================================================================== + */ + +/* + * Read statistics - Information exported regarding each arc_read call + */ +typedef struct spa_read_history { + hrtime_t start; /* time read completed */ + uint64_t objset; /* read from this objset */ + uint64_t object; /* read of this object number */ + uint64_t level; /* block's indirection level */ + uint64_t blkid; /* read of this block id */ + char origin[24]; /* read originated from here */ + uint32_t aflags; /* ARC flags (cached, prefetch, etc.) */ + pid_t pid; /* PID of task doing read */ + char comm[16]; /* process name of task doing read */ + procfs_list_node_t srh_node; +} spa_read_history_t; + +static int +spa_read_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-8s %-16s %-8s %-8s %-8s %-8s %-8s " + "%-24s %-8s %-16s\n", "UID", "start", "objset", "object", + "level", "blkid", "aflags", "origin", "pid", "process"); + + return (0); +} + +static int +spa_read_history_show(struct seq_file *f, void *data) +{ + spa_read_history_t *srh = (spa_read_history_t *)data; + + seq_printf(f, "%-8llu %-16llu 0x%-6llx " + "%-8lli %-8lli %-8lli 0x%-6x %-24s %-8i %-16s\n", + (u_longlong_t)srh->srh_node.pln_id, srh->start, + (longlong_t)srh->objset, (longlong_t)srh->object, + (longlong_t)srh->level, (longlong_t)srh->blkid, + srh->aflags, srh->origin, srh->pid, srh->comm); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_read_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_read_history_t *srh; + while (shl->size > size) { + srh = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(srh, !=, NULL); + kmem_free(srh, sizeof (spa_read_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); +} + +static int +spa_read_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_read_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_read_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.read_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "reads", + 0600, + &shl->procfs_list, + spa_read_history_show, + spa_read_history_show_header, + spa_read_history_clear, + offsetof(spa_read_history_t, srh_node)); + + kmem_strfree(module); +} + +static void +spa_read_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.read_history; + procfs_list_uninstall(&shl->procfs_list); + 
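+	/* the entry is uninstalled above; free whatever records remain */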
spa_read_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +void +spa_read_history_add(spa_t *spa, const zbookmark_phys_t *zb, uint32_t aflags) +{ + spa_history_list_t *shl = &spa->spa_stats.read_history; + spa_read_history_t *srh; + + ASSERT3P(spa, !=, NULL); + ASSERT3P(zb, !=, NULL); + + if (zfs_read_history == 0 && shl->size == 0) + return; + + if (zfs_read_history_hits == 0 && (aflags & ARC_FLAG_CACHED)) + return; + + srh = kmem_zalloc(sizeof (spa_read_history_t), KM_SLEEP); + strlcpy(srh->comm, getcomm(), sizeof (srh->comm)); + srh->start = gethrtime(); + srh->objset = zb->zb_objset; + srh->object = zb->zb_object; + srh->level = zb->zb_level; + srh->blkid = zb->zb_blkid; + srh->aflags = aflags; + srh->pid = getpid(); + + mutex_enter(&shl->procfs_list.pl_lock); + + procfs_list_add(&shl->procfs_list, srh); + shl->size++; + + spa_read_history_truncate(shl, zfs_read_history); + + mutex_exit(&shl->procfs_list.pl_lock); +} + +/* + * ========================================================================== + * SPA TXG History Routines + * ========================================================================== + */ + +/* + * Txg statistics - Information exported regarding each txg sync + */ + +typedef struct spa_txg_history { + uint64_t txg; /* txg id */ + txg_state_t state; /* active txg state */ + uint64_t nread; /* number of bytes read */ + uint64_t nwritten; /* number of bytes written */ + uint64_t reads; /* number of read operations */ + uint64_t writes; /* number of write operations */ + uint64_t ndirty; /* number of dirty bytes */ + hrtime_t times[TXG_STATE_COMMITTED]; /* completion times */ + procfs_list_node_t sth_node; +} spa_txg_history_t; + +static int +spa_txg_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-8s %-16s %-5s %-12s %-12s %-12s " + "%-8s %-8s %-12s %-12s %-12s %-12s\n", "txg", "birth", "state", + "ndirty", "nread", "nwritten", "reads", "writes", + "otime", "qtime", "wtime", "stime"); + return (0); +} + +static int +spa_txg_history_show(struct seq_file *f, void *data) +{ + spa_txg_history_t *sth = (spa_txg_history_t *)data; + uint64_t open = 0, quiesce = 0, wait = 0, sync = 0; + char state; + + switch (sth->state) { + case TXG_STATE_BIRTH: state = 'B'; break; + case TXG_STATE_OPEN: state = 'O'; break; + case TXG_STATE_QUIESCED: state = 'Q'; break; + case TXG_STATE_WAIT_FOR_SYNC: state = 'W'; break; + case TXG_STATE_SYNCED: state = 'S'; break; + case TXG_STATE_COMMITTED: state = 'C'; break; + default: state = '?'; break; + } + + if (sth->times[TXG_STATE_OPEN]) + open = sth->times[TXG_STATE_OPEN] - + sth->times[TXG_STATE_BIRTH]; + + if (sth->times[TXG_STATE_QUIESCED]) + quiesce = sth->times[TXG_STATE_QUIESCED] - + sth->times[TXG_STATE_OPEN]; + + if (sth->times[TXG_STATE_WAIT_FOR_SYNC]) + wait = sth->times[TXG_STATE_WAIT_FOR_SYNC] - + sth->times[TXG_STATE_QUIESCED]; + + if (sth->times[TXG_STATE_SYNCED]) + sync = sth->times[TXG_STATE_SYNCED] - + sth->times[TXG_STATE_WAIT_FOR_SYNC]; + + seq_printf(f, "%-8llu %-16llu %-5c %-12llu " + "%-12llu %-12llu %-8llu %-8llu %-12llu %-12llu %-12llu %-12llu\n", + (longlong_t)sth->txg, sth->times[TXG_STATE_BIRTH], state, + (u_longlong_t)sth->ndirty, + (u_longlong_t)sth->nread, (u_longlong_t)sth->nwritten, + (u_longlong_t)sth->reads, (u_longlong_t)sth->writes, + (u_longlong_t)open, (u_longlong_t)quiesce, (u_longlong_t)wait, + (u_longlong_t)sync); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void 
+spa_txg_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_txg_history_t *sth; + while (shl->size > size) { + sth = list_remove_head(&shl->procfs_list.pl_list); + ASSERT3P(sth, !=, NULL); + kmem_free(sth, sizeof (spa_txg_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); + +} + +static int +spa_txg_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_txg_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_txg_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "txgs", + 0644, + &shl->procfs_list, + spa_txg_history_show, + spa_txg_history_show_header, + spa_txg_history_clear, + offsetof(spa_txg_history_t, sth_node)); + + kmem_strfree(module); +} + +static void +spa_txg_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + procfs_list_uninstall(&shl->procfs_list); + spa_txg_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +/* + * Add a new txg to historical record. + */ +void +spa_txg_history_add(spa_t *spa, uint64_t txg, hrtime_t birth_time) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + + if (zfs_txg_history == 0 && shl->size == 0) + return; + + sth = kmem_zalloc(sizeof (spa_txg_history_t), KM_SLEEP); + sth->txg = txg; + sth->state = TXG_STATE_OPEN; + sth->times[TXG_STATE_BIRTH] = birth_time; + + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, sth); + shl->size++; + spa_txg_history_truncate(shl, zfs_txg_history); + mutex_exit(&shl->procfs_list.pl_lock); +} + +/* + * Set txg state completion time and increment current state. + */ +int +spa_txg_history_set(spa_t *spa, uint64_t txg, txg_state_t completed_state, + hrtime_t completed_time) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { + if (sth->txg == txg) { + sth->times[completed_state] = completed_time; + sth->state++; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * Set txg IO stats. 
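+ *
+ * The nread/nwritten/reads/writes values recorded here are deltas
+ * between the two vdev_get_stats() snapshots taken in
+ * spa_txg_history_init_io() and spa_txg_history_fini_io() below.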
+ */ +static int +spa_txg_history_set_io(spa_t *spa, uint64_t txg, uint64_t nread, + uint64_t nwritten, uint64_t reads, uint64_t writes, uint64_t ndirty) +{ + spa_history_list_t *shl = &spa->spa_stats.txg_history; + spa_txg_history_t *sth; + int error = ENOENT; + + if (zfs_txg_history == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sth = list_tail(&shl->procfs_list.pl_list); sth != NULL; + sth = list_prev(&shl->procfs_list.pl_list, sth)) { + if (sth->txg == txg) { + sth->nread = nread; + sth->nwritten = nwritten; + sth->reads = reads; + sth->writes = writes; + sth->ndirty = ndirty; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +txg_stat_t * +spa_txg_history_init_io(spa_t *spa, uint64_t txg, dsl_pool_t *dp) +{ + txg_stat_t *ts; + + if (zfs_txg_history == 0) + return (NULL); + + ts = kmem_alloc(sizeof (txg_stat_t), KM_SLEEP); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs1); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + ts->txg = txg; + ts->ndirty = dp->dp_dirty_pertxg[txg & TXG_MASK]; + + spa_txg_history_set(spa, txg, TXG_STATE_WAIT_FOR_SYNC, gethrtime()); + + return (ts); +} + +void +spa_txg_history_fini_io(spa_t *spa, txg_stat_t *ts) +{ + if (ts == NULL) + return; + + if (zfs_txg_history == 0) { + kmem_free(ts, sizeof (txg_stat_t)); + return; + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_get_stats(spa->spa_root_vdev, &ts->vs2); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + spa_txg_history_set(spa, ts->txg, TXG_STATE_SYNCED, gethrtime()); + spa_txg_history_set_io(spa, ts->txg, + ts->vs2.vs_bytes[ZIO_TYPE_READ] - ts->vs1.vs_bytes[ZIO_TYPE_READ], + ts->vs2.vs_bytes[ZIO_TYPE_WRITE] - ts->vs1.vs_bytes[ZIO_TYPE_WRITE], + ts->vs2.vs_ops[ZIO_TYPE_READ] - ts->vs1.vs_ops[ZIO_TYPE_READ], + ts->vs2.vs_ops[ZIO_TYPE_WRITE] - ts->vs1.vs_ops[ZIO_TYPE_WRITE], + ts->ndirty); + + kmem_free(ts, sizeof (txg_stat_t)); +} + +/* + * ========================================================================== + * SPA TX Assign Histogram Routines + * ========================================================================== + */ + +/* + * Tx statistics - Information exported regarding dmu_tx_assign time. + */ + +/* + * When the kstat is written zero all buckets. When the kstat is read + * count the number of trailing buckets set to zero and update ks_ndata + * such that they are not output. 
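+ *
+ * Bucket i counts assignments whose latency rounded up to 2^i ns,
+ * so the 42 buckets cover roughly 1 ns to 2^41 ns (~2,199 s); see
+ * spa_tx_assign_add_nsecs().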
+ */
+static int
+spa_tx_assign_update(kstat_t *ksp, int rw)
+{
+	spa_t *spa = ksp->ks_private;
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	int i;
+
+	if (rw == KSTAT_WRITE) {
+		for (i = 0; i < shk->count; i++)
+			((kstat_named_t *)shk->priv)[i].value.ui64 = 0;
+	}
+
+	for (i = shk->count; i > 0; i--)
+		if (((kstat_named_t *)shk->priv)[i-1].value.ui64 != 0)
+			break;
+
+	ksp->ks_ndata = i;
+	ksp->ks_data_size = i * sizeof (kstat_named_t);
+
+	return (0);
+}
+
+static void
+spa_tx_assign_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	char *name;
+	kstat_named_t *ks;
+	kstat_t *ksp;
+	int i;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	shk->count = 42; /* power of two buckets for 1ns to 2,199s */
+	shk->size = shk->count * sizeof (kstat_named_t);
+	shk->priv = kmem_alloc(shk->size, KM_SLEEP);
+
+	name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	for (i = 0; i < shk->count; i++) {
+		ks = &((kstat_named_t *)shk->priv)[i];
+		ks->data_type = KSTAT_DATA_UINT64;
+		ks->value.ui64 = 0;
+		(void) snprintf(ks->name, KSTAT_STRLEN, "%llu ns",
+		    (u_longlong_t)1 << i);
+	}
+
+	ksp = kstat_create(name, 0, "dmu_tx_assign", "misc",
+	    KSTAT_TYPE_NAMED, 0, KSTAT_FLAG_VIRTUAL);
+	shk->kstat = ksp;
+
+	if (ksp) {
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_data = shk->priv;
+		ksp->ks_ndata = shk->count;
+		ksp->ks_data_size = shk->size;
+		ksp->ks_private = spa;
+		ksp->ks_update = spa_tx_assign_update;
+		kstat_install(ksp);
+	}
+	kmem_strfree(name);
+}
+
+static void
+spa_tx_assign_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	kstat_t *ksp;
+
+	ksp = shk->kstat;
+	if (ksp)
+		kstat_delete(ksp);
+
+	kmem_free(shk->priv, shk->size);
+	mutex_destroy(&shk->lock);
+}
+
+void
+spa_tx_assign_add_nsecs(spa_t *spa, uint64_t nsecs)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.tx_assign_histogram;
+	uint64_t idx = 0;
+
+	/* bound by shk->count (the bucket count); shk->size is in bytes */
+	while (((1ULL << idx) < nsecs) && (idx < shk->count - 1))
+		idx++;
+
+	atomic_inc_64(&((kstat_named_t *)shk->priv)[idx].value.ui64);
+}
+
+/*
+ * ==========================================================================
+ * SPA IO History Routines
+ * ==========================================================================
+ */
+static int
+spa_io_history_update(kstat_t *ksp, int rw)
+{
+	if (rw == KSTAT_WRITE)
+		memset(ksp->ks_data, 0, ksp->ks_data_size);
+
+	return (0);
+}
+
+static void
+spa_io_history_init(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+	char *name;
+	kstat_t *ksp;
+
+	mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL);
+
+	name = kmem_asprintf("zfs/%s", spa_name(spa));
+
+	ksp = kstat_create(name, 0, "io", "disk", KSTAT_TYPE_IO, 1, 0);
+	shk->kstat = ksp;
+
+	if (ksp) {
+		ksp->ks_lock = &shk->lock;
+		ksp->ks_private = spa;
+		ksp->ks_update = spa_io_history_update;
+		kstat_install(ksp);
+	}
+	kmem_strfree(name);
+}
+
+static void
+spa_io_history_destroy(spa_t *spa)
+{
+	spa_history_kstat_t *shk = &spa->spa_stats.io_history;
+
+	if (shk->kstat)
+		kstat_delete(shk->kstat);
+
+	mutex_destroy(&shk->lock);
+}
+
+/*
+ * ==========================================================================
+ * SPA MMP History Routines
+ * ==========================================================================
+ */
+
+/*
+ * MMP statistics - Information exported regarding attempted MMP writes
+ * For MMP writes issued, fields used as per comments below.
+ * For MMP writes skipped, an entry represents a span of time when + * writes were skipped for same reason (error from mmp_random_leaf). + * Differences are: + * timestamp time first write skipped, if >1 skipped in a row + * mmp_delay delay value at timestamp + * vdev_guid number of writes skipped + * io_error one of enum mmp_error + * duration time span (ns) of skipped writes + */ + +typedef struct spa_mmp_history { + uint64_t mmp_node_id; /* unique # for updates */ + uint64_t txg; /* txg of last sync */ + uint64_t timestamp; /* UTC time MMP write issued */ + uint64_t mmp_delay; /* mmp_thread.mmp_delay at timestamp */ + uint64_t vdev_guid; /* unique ID of leaf vdev */ + char *vdev_path; + int vdev_label; /* vdev label */ + int io_error; /* error status of MMP write */ + hrtime_t error_start; /* hrtime of start of error period */ + hrtime_t duration; /* time from submission to completion */ + procfs_list_node_t smh_node; +} spa_mmp_history_t; + +static int +spa_mmp_history_show_header(struct seq_file *f) +{ + seq_printf(f, "%-10s %-10s %-10s %-6s %-10s %-12s %-24s " + "%-10s %s\n", "id", "txg", "timestamp", "error", "duration", + "mmp_delay", "vdev_guid", "vdev_label", "vdev_path"); + return (0); +} + +static int +spa_mmp_history_show(struct seq_file *f, void *data) +{ + spa_mmp_history_t *smh = (spa_mmp_history_t *)data; + char skip_fmt[] = "%-10llu %-10llu %10llu %#6llx %10lld %12llu %-24llu " + "%-10lld %s\n"; + char write_fmt[] = "%-10llu %-10llu %10llu %6lld %10lld %12llu %-24llu " + "%-10lld %s\n"; + + seq_printf(f, (smh->error_start ? skip_fmt : write_fmt), + (u_longlong_t)smh->mmp_node_id, (u_longlong_t)smh->txg, + (u_longlong_t)smh->timestamp, (longlong_t)smh->io_error, + (longlong_t)smh->duration, (u_longlong_t)smh->mmp_delay, + (u_longlong_t)smh->vdev_guid, (u_longlong_t)smh->vdev_label, + (smh->vdev_path ? smh->vdev_path : "-")); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_mmp_history_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_mmp_history_t *smh; + while (shl->size > size) { + smh = list_remove_head(&shl->procfs_list.pl_list); + if (smh->vdev_path) + kmem_strfree(smh->vdev_path); + kmem_free(smh, sizeof (spa_mmp_history_t)); + shl->size--; + } + + if (size == 0) + ASSERT(list_is_empty(&shl->procfs_list.pl_list)); + +} + +static int +spa_mmp_history_clear(procfs_list_t *procfs_list) +{ + spa_history_list_t *shl = procfs_list->pl_private; + mutex_enter(&procfs_list->pl_lock); + spa_mmp_history_truncate(shl, 0); + mutex_exit(&procfs_list->pl_lock); + return (0); +} + +static void +spa_mmp_history_init(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + char *module; + + shl->size = 0; + + module = kmem_asprintf("zfs/%s", spa_name(spa)); + + shl->procfs_list.pl_private = shl; + procfs_list_install(module, + "multihost", + 0644, + &shl->procfs_list, + spa_mmp_history_show, + spa_mmp_history_show_header, + spa_mmp_history_clear, + offsetof(spa_mmp_history_t, smh_node)); + + kmem_strfree(module); +} + +static void +spa_mmp_history_destroy(spa_t *spa) +{ + spa_history_list_t *shl = &spa->spa_stats.mmp_history; + procfs_list_uninstall(&shl->procfs_list); + spa_mmp_history_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); +} + +/* + * Set duration in existing "skip" record to how long we have waited for a leaf + * vdev to become available. 
+ *
+ * It is important that we start the search at the tail of the list,
+ * where new records are inserted, so this is normally an O(1)
+ * operation.
+ */
+int
+spa_mmp_history_set_skip(spa_t *spa, uint64_t mmp_node_id)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	spa_mmp_history_t *smh;
+	int error = ENOENT;
+
+	if (zfs_multihost_history == 0 && shl->size == 0)
+		return (0);
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+		if (smh->mmp_node_id == mmp_node_id) {
+			ASSERT3U(smh->io_error, !=, 0);
+			smh->duration = gethrtime() - smh->error_start;
+			smh->vdev_guid++;
+			error = 0;
+			break;
+		}
+	}
+	mutex_exit(&shl->procfs_list.pl_lock);
+
+	return (error);
+}
+
+/*
+ * Set MMP write duration and error status in existing record.
+ * See comment re: search order above spa_mmp_history_set_skip().
+ */
+int
+spa_mmp_history_set(spa_t *spa, uint64_t mmp_node_id, int io_error,
+    hrtime_t duration)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	spa_mmp_history_t *smh;
+	int error = ENOENT;
+
+	if (zfs_multihost_history == 0 && shl->size == 0)
+		return (0);
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	for (smh = list_tail(&shl->procfs_list.pl_list); smh != NULL;
+	    smh = list_prev(&shl->procfs_list.pl_list, smh)) {
+		if (smh->mmp_node_id == mmp_node_id) {
+			ASSERT(smh->io_error == 0);
+			smh->io_error = io_error;
+			smh->duration = duration;
+			error = 0;
+			break;
+		}
+	}
+	mutex_exit(&shl->procfs_list.pl_lock);
+
+	return (error);
+}
+
+/*
+ * Add a new MMP historical record.
+ * error == 0 : a write was issued.
+ * error != 0 : a write was not issued because no leaves were found.
+ */
+void
+spa_mmp_history_add(spa_t *spa, uint64_t txg, uint64_t timestamp,
+    uint64_t mmp_delay, vdev_t *vd, int label, uint64_t mmp_node_id,
+    int error)
+{
+	spa_history_list_t *shl = &spa->spa_stats.mmp_history;
+	spa_mmp_history_t *smh;
+
+	if (zfs_multihost_history == 0 && shl->size == 0)
+		return;
+
+	smh = kmem_zalloc(sizeof (spa_mmp_history_t), KM_SLEEP);
+	smh->txg = txg;
+	smh->timestamp = timestamp;
+	smh->mmp_delay = mmp_delay;
+	if (vd) {
+		smh->vdev_guid = vd->vdev_guid;
+		if (vd->vdev_path)
+			smh->vdev_path = kmem_strdup(vd->vdev_path);
+	}
+	smh->vdev_label = label;
+	smh->mmp_node_id = mmp_node_id;
+
+	if (error) {
+		smh->io_error = error;
+		smh->error_start = gethrtime();
+		smh->vdev_guid = 1;
+	}
+
+	mutex_enter(&shl->procfs_list.pl_lock);
+	procfs_list_add(&shl->procfs_list, smh);
+	shl->size++;
+	spa_mmp_history_truncate(shl, zfs_multihost_history);
+	mutex_exit(&shl->procfs_list.pl_lock);
+}
+
+static void *
+spa_state_addr(kstat_t *ksp, loff_t n)
+{
+	return (ksp->ks_private); /* return the spa_t */
+}
+
+static int
+spa_state_data(char *buf, size_t size, void *data)
+{
+	spa_t *spa = (spa_t *)data;
+	(void) snprintf(buf, size, "%s\n", spa_state_to_name(spa));
+	return (0);
+}
+
+/*
+ * Return the state of the pool in /proc/spl/kstat/zfs/<pool>/state.
+ *
+ * This is a lock-less read of the pool's state (unlike using 'zpool', which
+ * can potentially block for seconds). Because it doesn't block, it can be
+ * useful as a pool heartbeat value.
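+ *
+ * For example, with a hypothetical pool named "tank":
+ *
+ *	$ cat /proc/spl/kstat/zfs/tank/state
+ *	ONLINE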
+ */ +static void +spa_state_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.state; + char *name; + kstat_t *ksp; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + name = kmem_asprintf("zfs/%s", spa_name(spa)); + ksp = kstat_create(name, 0, "state", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + ksp->ks_lock = &shk->lock; + ksp->ks_data = NULL; + ksp->ks_private = spa; + ksp->ks_flags |= KSTAT_FLAG_NO_HEADERS; + kstat_set_raw_ops(ksp, NULL, spa_state_data, spa_state_addr); + kstat_install(ksp); + } + + kmem_strfree(name); +} + +static void +spa_health_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.state; + kstat_t *ksp = shk->kstat; + if (ksp) + kstat_delete(ksp); + + mutex_destroy(&shk->lock); +} + +static spa_iostats_t spa_iostats_template = { + { "trim_extents_written", KSTAT_DATA_UINT64 }, + { "trim_bytes_written", KSTAT_DATA_UINT64 }, + { "trim_extents_skipped", KSTAT_DATA_UINT64 }, + { "trim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "trim_extents_failed", KSTAT_DATA_UINT64 }, + { "trim_bytes_failed", KSTAT_DATA_UINT64 }, + { "autotrim_extents_written", KSTAT_DATA_UINT64 }, + { "autotrim_bytes_written", KSTAT_DATA_UINT64 }, + { "autotrim_extents_skipped", KSTAT_DATA_UINT64 }, + { "autotrim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "autotrim_extents_failed", KSTAT_DATA_UINT64 }, + { "autotrim_bytes_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_written", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_written", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_skipped", KSTAT_DATA_UINT64 }, + { "simple_trim_extents_failed", KSTAT_DATA_UINT64 }, + { "simple_trim_bytes_failed", KSTAT_DATA_UINT64 }, +}; + +#define SPA_IOSTATS_ADD(stat, val) \ + atomic_add_64(&iostats->stat.value.ui64, (val)); + +void +spa_iostats_trim_add(spa_t *spa, trim_type_t type, + uint64_t extents_written, uint64_t bytes_written, + uint64_t extents_skipped, uint64_t bytes_skipped, + uint64_t extents_failed, uint64_t bytes_failed) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + spa_iostats_t *iostats; + + if (ksp == NULL) + return; + + iostats = ksp->ks_data; + if (type == TRIM_TYPE_MANUAL) { + SPA_IOSTATS_ADD(trim_extents_written, extents_written); + SPA_IOSTATS_ADD(trim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(trim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(trim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(trim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(trim_bytes_failed, bytes_failed); + } else if (type == TRIM_TYPE_AUTO) { + SPA_IOSTATS_ADD(autotrim_extents_written, extents_written); + SPA_IOSTATS_ADD(autotrim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(autotrim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(autotrim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(autotrim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(autotrim_bytes_failed, bytes_failed); + } else { + SPA_IOSTATS_ADD(simple_trim_extents_written, extents_written); + SPA_IOSTATS_ADD(simple_trim_bytes_written, bytes_written); + SPA_IOSTATS_ADD(simple_trim_extents_skipped, extents_skipped); + SPA_IOSTATS_ADD(simple_trim_bytes_skipped, bytes_skipped); + SPA_IOSTATS_ADD(simple_trim_extents_failed, extents_failed); + SPA_IOSTATS_ADD(simple_trim_bytes_failed, bytes_failed); + } +} + +static int +spa_iostats_update(kstat_t *ksp, int rw) +{ + if (rw == KSTAT_WRITE) { + memcpy(ksp->ks_data, &spa_iostats_template, + sizeof 
(spa_iostats_t)); + } + + return (0); +} + +static void +spa_iostats_init(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + + mutex_init(&shk->lock, NULL, MUTEX_DEFAULT, NULL); + + char *name = kmem_asprintf("zfs/%s", spa_name(spa)); + kstat_t *ksp = kstat_create(name, 0, "iostats", "misc", + KSTAT_TYPE_NAMED, sizeof (spa_iostats_t) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + shk->kstat = ksp; + if (ksp) { + int size = sizeof (spa_iostats_t); + ksp->ks_lock = &shk->lock; + ksp->ks_private = spa; + ksp->ks_update = spa_iostats_update; + ksp->ks_data = kmem_alloc(size, KM_SLEEP); + memcpy(ksp->ks_data, &spa_iostats_template, size); + kstat_install(ksp); + } + + kmem_strfree(name); +} + +static void +spa_iostats_destroy(spa_t *spa) +{ + spa_history_kstat_t *shk = &spa->spa_stats.iostats; + kstat_t *ksp = shk->kstat; + if (ksp) { + kmem_free(ksp->ks_data, sizeof (spa_iostats_t)); + kstat_delete(ksp); + } + + mutex_destroy(&shk->lock); +} + +void +spa_stats_init(spa_t *spa) +{ + spa_read_history_init(spa); + spa_txg_history_init(spa); + spa_tx_assign_init(spa); + spa_io_history_init(spa); + spa_mmp_history_init(spa); + spa_state_init(spa); + spa_iostats_init(spa); +} + +void +spa_stats_destroy(spa_t *spa) +{ + spa_iostats_destroy(spa); + spa_health_destroy(spa); + spa_tx_assign_destroy(spa); + spa_txg_history_destroy(spa); + spa_read_history_destroy(spa); + spa_io_history_destroy(spa); + spa_mmp_history_destroy(spa); +} + +#if defined(_KERNEL) +/* CSTYLED */ +module_param(zfs_read_history, int, 0644); +MODULE_PARM_DESC(zfs_read_history, + "Historical statistics for the last N reads"); + +module_param(zfs_read_history_hits, int, 0644); +MODULE_PARM_DESC(zfs_read_history_hits, + "Include cache hits in read history"); + +module_param(zfs_txg_history, int, 0644); +MODULE_PARM_DESC(zfs_txg_history, + "Historical statistics for the last N txgs"); + +module_param(zfs_multihost_history, int, 0644); +MODULE_PARM_DESC(zfs_multihost_history, + "Historical statistics for last N multihost writes"); +/* END CSTYLED */ +#endif diff --git a/module/os/linux/zfs/trace.c b/module/os/linux/zfs/trace.c new file mode 100644 index 000000000000..a690822ae14c --- /dev/null +++ b/module/os/linux/zfs/trace.c @@ -0,0 +1,55 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Each DTRACE_PROBE must define its trace point in one (and only one) + * source file, so this dummy file exists for that purpose. 
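+ *
+ * As a minimal sketch of the pattern (the header name below is
+ * illustrative, not one of the headers actually included here): a trace
+ * header declares the tracepoints, and defining CREATE_TRACE_POINTS in
+ * exactly one translation unit before including it emits their
+ * definitions:
+ *
+ *	#define CREATE_TRACE_POINTS
+ *	#include <sys/trace_foo.h>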
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#define CREATE_TRACE_POINTS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c new file mode 100644 index 000000000000..5a2245436c72 --- /dev/null +++ b/module/os/linux/zfs/vdev_disk.c @@ -0,0 +1,873 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Rewritten for Linux by Brian Behlendorf . + * LLNL-CODE-403049. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +typedef struct vdev_disk { + struct block_device *vd_bdev; + krwlock_t vd_lock; +} vdev_disk_t; + +/* + * Unique identifier for the exclusive vdev holder. + */ +static void *zfs_vdev_holder = VDEV_HOLDER; + +/* + * Wait up to zfs_vdev_open_timeout_ms milliseconds before determining the + * device is missing. The missing path may be transient since the links + * can be briefly removed and recreated in response to udev events. + */ +static unsigned zfs_vdev_open_timeout_ms = 1000; + +/* + * Size of the "reserved" partition, in blocks. + */ +#define EFI_MIN_RESV_SIZE (16 * 1024) + +/* + * Virtual device vector for disks. + */ +typedef struct dio_request { + zio_t *dr_zio; /* Parent ZIO */ + atomic_t dr_ref; /* References */ + int dr_error; /* Bio error */ + int dr_bio_count; /* Count of bio's */ + struct bio *dr_bio[0]; /* Attached bio's */ +} dio_request_t; + +static fmode_t +vdev_bdev_mode(spa_mode_t spa_mode) +{ + fmode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= FMODE_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= FMODE_WRITE; + + return (mode); +} + +/* + * Returns the usable capacity (in bytes) for the partition or disk. + */ +static uint64_t +bdev_capacity(struct block_device *bdev) +{ + return (i_size_read(bdev->bd_inode)); +} + +/* + * Returns the maximum expansion capacity of the block device (in bytes). + * + * It is possible to expand a vdev when it has been created as a wholedisk + * and the containing block device has increased in capacity. Or when the + * partition containing the pool has been manually increased in size. + * + * This function is only responsible for calculating the potential expansion + * size so it can be reported by 'zpool list'. 
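+ * (For intuition, with hypothetical sizes: a wholedisk vdev whose
+ * backing disk has grown from 10 GiB to 12 GiB reports roughly 12 GiB
+ * less the alignment and EFI reservation overhead computed below, while
+ * bdev_capacity() still reports the unexpanded partition size.)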
The efi_use_whole_disk() is + * responsible for verifying the expected partition layout in the wholedisk + * case, and updating the partition table if appropriate. Once the partition + * size has been increased the additional capacity will be visible using + * bdev_capacity(). + * + * The returned maximum expansion capacity is always expected to be larger, or + * at the very least equal, to its usable capacity to prevent overestimating + * the pool expandsize. + */ +static uint64_t +bdev_max_capacity(struct block_device *bdev, uint64_t wholedisk) +{ + uint64_t psize; + int64_t available; + + if (wholedisk && bdev->bd_part != NULL && bdev != bdev->bd_contains) { + /* + * When reporting maximum expansion capacity for a wholedisk + * deduct any capacity which is expected to be lost due to + * alignment restrictions. Over reporting this value isn't + * harmful and would only result in slightly less capacity + * than expected post expansion. + * The estimated available space may be slightly smaller than + * bdev_capacity() for devices where the number of sectors is + * not a multiple of the alignment size and the partition layout + * is keeping less than PARTITION_END_ALIGNMENT bytes after the + * "reserved" EFI partition: in such cases return the device + * usable capacity. + */ + available = i_size_read(bdev->bd_contains->bd_inode) - + ((EFI_MIN_RESV_SIZE + NEW_START_BLOCK + + PARTITION_END_ALIGNMENT) << SECTOR_BITS); + psize = MAX(available, bdev_capacity(bdev)); + } else { + psize = bdev_capacity(bdev); + } + + return (psize); +} + +static void +vdev_disk_error(zio_t *zio) +{ + /* + * This function can be called in interrupt context, for instance while + * handling IRQs coming from a misbehaving disk device; use printk() + * which is safe from any context. + */ + printk(KERN_WARNING "zio pool=%s vdev=%s error=%d type=%d " + "offset=%llu size=%llu flags=%x\n", spa_name(zio->io_spa), + zio->io_vd->vdev_path, zio->io_error, zio->io_type, + (u_longlong_t)zio->io_offset, (u_longlong_t)zio->io_size, + zio->io_flags); +} + +static int +vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + struct block_device *bdev; + fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); + vdev_disk_t *vd; + + /* Must have a pathname and it must be absolute. */ + if (v->vdev_path == NULL || v->vdev_path[0] != '/') { + v->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + vdev_dbgmsg(v, "invalid vdev_path"); + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it is currently open. When expanding a + * partition force re-scanning the partition table while closed + * in order to get an accurate updated block device size. Then + * since udev may need to recreate the device links increase the + * open retry timeout before reporting the device as unavailable. 
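+ *
+ * To sketch the retry budget with the default zfs_vdev_open_timeout_ms
+ * of 1000: a plain open retries a transient ENOENT in 10ms naps for up
+ * to ~1s, and after a successful partition rescan the budget is doubled
+ * to ~2s so udev has time to recreate the device links.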
+	 */
+	vd = v->vdev_tsd;
+	if (vd) {
+		char disk_name[BDEVNAME_SIZE + 6] = "/dev/";
+		boolean_t reread_part = B_FALSE;
+
+		rw_enter(&vd->vd_lock, RW_WRITER);
+		bdev = vd->vd_bdev;
+		vd->vd_bdev = NULL;
+
+		if (bdev) {
+			if (v->vdev_expanding && bdev != bdev->bd_contains) {
+				bdevname(bdev->bd_contains, disk_name + 5);
+				reread_part = B_TRUE;
+			}
+
+			blkdev_put(bdev, mode | FMODE_EXCL);
+		}
+
+		if (reread_part) {
+			bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL,
+			    zfs_vdev_holder);
+			if (!IS_ERR(bdev)) {
+				int error = vdev_bdev_reread_part(bdev);
+				blkdev_put(bdev, mode | FMODE_EXCL);
+				if (error == 0) {
+					timeout = MSEC2NSEC(
+					    zfs_vdev_open_timeout_ms * 2);
+				}
+			}
+		}
+	} else {
+		vd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP);
+
+		rw_init(&vd->vd_lock, NULL, RW_DEFAULT, NULL);
+		rw_enter(&vd->vd_lock, RW_WRITER);
+	}
+
+	/*
+	 * Devices are always opened by the path provided at configuration
+	 * time. This means that if the provided path is a udev by-id path
+	 * then drives may be re-cabled without an issue. If the provided
+	 * path is a udev by-path path, then the physical location information
+	 * will be preserved. This can be critical for more complicated
+	 * configurations where drives are located in specific physical
+	 * locations to maximize the system's tolerance to component failure.
+	 *
+	 * Alternatively, you can provide your own udev rule to flexibly map
+	 * the drives as you see fit. It is not advised that you use the
+	 * /dev/[hd]d devices which may be reordered due to probing order.
+	 * Devices in the wrong locations will be detected by the higher
+	 * level vdev validation.
+	 *
+	 * The specified paths may be briefly removed and recreated in
+	 * response to udev events. This should be exceptionally unlikely
+	 * because the zpool command makes every effort to verify these paths
+	 * have already settled prior to reaching this point. Therefore,
+	 * an ENOENT failure at this point is highly likely to be transient
+	 * and it is reasonable to sleep and retry before giving up. In
+	 * practice delays have been observed to be on the order of 100ms.
+	 */
+	hrtime_t start = gethrtime();
+	bdev = ERR_PTR(-ENXIO);
+	while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) {
+		bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL,
+		    zfs_vdev_holder);
+		if (unlikely(PTR_ERR(bdev) == -ENOENT)) {
+			schedule_timeout(MSEC_TO_TICK(10));
+		} else if (IS_ERR(bdev)) {
+			break;
+		}
+	}
+
+	if (IS_ERR(bdev)) {
+		int error = -PTR_ERR(bdev);
+		vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error,
+		    (u_longlong_t)(gethrtime() - start),
+		    (u_longlong_t)timeout);
+		vd->vd_bdev = NULL;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+		return (SET_ERROR(error));
+	} else {
+		vd->vd_bdev = bdev;
+		v->vdev_tsd = vd;
+		rw_exit(&vd->vd_lock);
+	}
+
+	struct request_queue *q = bdev_get_queue(vd->vd_bdev);
+
+	/* Determine the physical block size */
+	int physical_block_size = bdev_physical_block_size(vd->vd_bdev);
+
+	/* Determine the logical block size */
+	int logical_block_size = bdev_logical_block_size(vd->vd_bdev);
+
+	/* Clear the nowritecache bit, causes vdev_reopen() to try again. */
+	v->vdev_nowritecache = B_FALSE;
+
+	/* Set when device reports it supports TRIM. */
+	v->vdev_has_trim = !!blk_queue_discard(q);
+
+	/* Set when device reports it supports secure TRIM.
*/ + v->vdev_has_securetrim = !!blk_queue_discard_secure(q); + + /* Inform the ZIO pipeline that we are non-rotational */ + v->vdev_nonrot = blk_queue_nonrot(q); + + /* Physical volume size in bytes for the partition */ + *psize = bdev_capacity(vd->vd_bdev); + + /* Physical volume size in bytes including possible expansion space */ + *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); + + /* Based on the minimum sector size set the block size */ + *physical_ashift = highbit64(MAX(physical_block_size, + SPA_MINBLOCKSIZE)) - 1; + + *logical_ashift = highbit64(MAX(logical_block_size, + SPA_MINBLOCKSIZE)) - 1; + + return (0); +} + +static void +vdev_disk_close(vdev_t *v) +{ + vdev_disk_t *vd = v->vdev_tsd; + + if (v->vdev_reopening || vd == NULL) + return; + + if (vd->vd_bdev != NULL) { + blkdev_put(vd->vd_bdev, + vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); + } + + rw_destroy(&vd->vd_lock); + kmem_free(vd, sizeof (vdev_disk_t)); + v->vdev_tsd = NULL; +} + +static dio_request_t * +vdev_disk_dio_alloc(int bio_count) +{ + dio_request_t *dr; + int i; + + dr = kmem_zalloc(sizeof (dio_request_t) + + sizeof (struct bio *) * bio_count, KM_SLEEP); + if (dr) { + atomic_set(&dr->dr_ref, 0); + dr->dr_bio_count = bio_count; + dr->dr_error = 0; + + for (i = 0; i < dr->dr_bio_count; i++) + dr->dr_bio[i] = NULL; + } + + return (dr); +} + +static void +vdev_disk_dio_free(dio_request_t *dr) +{ + int i; + + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + bio_put(dr->dr_bio[i]); + + kmem_free(dr, sizeof (dio_request_t) + + sizeof (struct bio *) * dr->dr_bio_count); +} + +static void +vdev_disk_dio_get(dio_request_t *dr) +{ + atomic_inc(&dr->dr_ref); +} + +static int +vdev_disk_dio_put(dio_request_t *dr) +{ + int rc = atomic_dec_return(&dr->dr_ref); + + /* + * Free the dio_request when the last reference is dropped and + * ensure zio_interpret is called only once with the correct zio + */ + if (rc == 0) { + zio_t *zio = dr->dr_zio; + int error = dr->dr_error; + + vdev_disk_dio_free(dr); + + if (zio) { + zio->io_error = error; + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + + zio_delay_interrupt(zio); + } + } + + return (rc); +} + +BIO_END_IO_PROTO(vdev_disk_physio_completion, bio, error) +{ + dio_request_t *dr = bio->bi_private; + int rc; + + if (dr->dr_error == 0) { +#ifdef HAVE_1ARG_BIO_END_IO_T + dr->dr_error = BIO_END_IO_ERROR(bio); +#else + if (error) + dr->dr_error = -(error); + else if (!test_bit(BIO_UPTODATE, &bio->bi_flags)) + dr->dr_error = EIO; +#endif + } + + /* Drop reference acquired by __vdev_disk_physio */ + rc = vdev_disk_dio_put(dr); +} + +static inline void +vdev_submit_bio_impl(struct bio *bio) +{ +#ifdef HAVE_1ARG_SUBMIT_BIO + submit_bio(bio); +#else + submit_bio(0, bio); +#endif +} + +#ifdef HAVE_BIO_SET_DEV +#if defined(CONFIG_BLK_CGROUP) && defined(HAVE_BIO_SET_DEV_GPL_ONLY) +/* + * The Linux 5.5 kernel updated percpu_ref_tryget() which is inlined by + * blkg_tryget() to use rcu_read_lock() instead of rcu_read_lock_sched(). + * As a side effect the function was converted to GPL-only. Define our + * own version when needed which uses rcu_read_lock_sched(). 
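+ *
+ * The sketch below mirrors the two cases handled by the kernel's
+ * percpu_ref_tryget(): a cheap per-CPU counter bump while the ref is
+ * still in per-CPU mode, falling back to an atomic inc-if-not-zero once
+ * the ref has been switched to atomic mode.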
+ */ +#if defined(HAVE_BLKG_TRYGET_GPL_ONLY) +static inline bool +vdev_blkg_tryget(struct blkcg_gq *blkg) +{ + struct percpu_ref *ref = &blkg->refcnt; + unsigned long __percpu *count; + bool rc; + + rcu_read_lock_sched(); + + if (__ref_is_percpu(ref, &count)) { + this_cpu_inc(*count); + rc = true; + } else { + rc = atomic_long_inc_not_zero(&ref->count); + } + + rcu_read_unlock_sched(); + + return (rc); +} +#elif defined(HAVE_BLKG_TRYGET) +#define vdev_blkg_tryget(bg) blkg_tryget(bg) +#endif +/* + * The Linux 5.0 kernel updated the bio_set_dev() macro so it calls the + * GPL-only bio_associate_blkg() symbol thus inadvertently converting + * the entire macro. Provide a minimal version which always assigns the + * request queue's root_blkg to the bio. + */ +static inline void +vdev_bio_associate_blkg(struct bio *bio) +{ + struct request_queue *q = bio->bi_disk->queue; + + ASSERT3P(q, !=, NULL); + ASSERT3P(bio->bi_blkg, ==, NULL); + + if (q->root_blkg && vdev_blkg_tryget(q->root_blkg)) + bio->bi_blkg = q->root_blkg; +} +#define bio_associate_blkg vdev_bio_associate_blkg +#endif +#else +/* + * Provide a bio_set_dev() helper macro for pre-Linux 4.14 kernels. + */ +static inline void +bio_set_dev(struct bio *bio, struct block_device *bdev) +{ + bio->bi_bdev = bdev; +} +#endif /* HAVE_BIO_SET_DEV */ + +static inline void +vdev_submit_bio(struct bio *bio) +{ + struct bio_list *bio_list = current->bio_list; + current->bio_list = NULL; + vdev_submit_bio_impl(bio); + current->bio_list = bio_list; +} + +static int +__vdev_disk_physio(struct block_device *bdev, zio_t *zio, + size_t io_size, uint64_t io_offset, int rw, int flags) +{ + dio_request_t *dr; + uint64_t abd_offset; + uint64_t bio_offset; + int bio_size, bio_count = 16; + int i = 0, error = 0; + struct blk_plug plug; + + /* + * Accessing outside the block device is never allowed. + */ + if (io_offset + io_size > bdev->bd_inode->i_size) { + vdev_dbgmsg(zio->io_vd, + "Illegal access %llu size %llu, device size %llu", + io_offset, io_size, i_size_read(bdev->bd_inode)); + return (SET_ERROR(EIO)); + } + +retry: + dr = vdev_disk_dio_alloc(bio_count); + if (dr == NULL) + return (SET_ERROR(ENOMEM)); + + if (zio && !(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bio_set_flags_failfast(bdev, &flags); + + dr->dr_zio = zio; + + /* + * When the IO size exceeds the maximum bio size for the request + * queue we are forced to break the IO in multiple bio's and wait + * for them all to complete. Ideally, all pool users will set + * their volume block size to match the maximum request size and + * the common case will be one bio per vdev IO request. + */ + + abd_offset = 0; + bio_offset = io_offset; + bio_size = io_size; + for (i = 0; i <= dr->dr_bio_count; i++) { + + /* Finished constructing bio's for given buffer */ + if (bio_size <= 0) + break; + + /* + * By default only 'bio_count' bio's per dio are allowed. + * However, if we find ourselves in a situation where more + * are needed we allocate a larger dio and warn the user. 
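+		 *
+		 * Worked example with illustrative sizes: a 1 MiB zio
+		 * against a queue that caps each bio at 128 KiB needs
+		 * 8 bio's, which fits the initial bio_count of 16; only
+		 * larger or more fragmented requests take the doubling
+		 * retry below.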
+ */ + if (dr->dr_bio_count == i) { + vdev_disk_dio_free(dr); + bio_count *= 2; + goto retry; + } + + /* bio_alloc() with __GFP_WAIT never returns NULL */ + dr->dr_bio[i] = bio_alloc(GFP_NOIO, + MIN(abd_nr_pages_off(zio->io_abd, bio_size, abd_offset), + BIO_MAX_PAGES)); + if (unlikely(dr->dr_bio[i] == NULL)) { + vdev_disk_dio_free(dr); + return (SET_ERROR(ENOMEM)); + } + + /* Matching put called by vdev_disk_physio_completion */ + vdev_disk_dio_get(dr); + + bio_set_dev(dr->dr_bio[i], bdev); + BIO_BI_SECTOR(dr->dr_bio[i]) = bio_offset >> 9; + dr->dr_bio[i]->bi_end_io = vdev_disk_physio_completion; + dr->dr_bio[i]->bi_private = dr; + bio_set_op_attrs(dr->dr_bio[i], rw, flags); + + /* Remaining size is returned to become the new size */ + bio_size = abd_bio_map_off(dr->dr_bio[i], zio->io_abd, + bio_size, abd_offset); + + /* Advance in buffer and construct another bio if needed */ + abd_offset += BIO_BI_SIZE(dr->dr_bio[i]); + bio_offset += BIO_BI_SIZE(dr->dr_bio[i]); + } + + /* Extra reference to protect dio_request during vdev_submit_bio */ + vdev_disk_dio_get(dr); + + if (dr->dr_bio_count > 1) + blk_start_plug(&plug); + + /* Submit all bio's associated with this dio */ + for (i = 0; i < dr->dr_bio_count; i++) + if (dr->dr_bio[i]) + vdev_submit_bio(dr->dr_bio[i]); + + if (dr->dr_bio_count > 1) + blk_finish_plug(&plug); + + (void) vdev_disk_dio_put(dr); + + return (error); +} + +BIO_END_IO_PROTO(vdev_disk_io_flush_completion, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + + if (zio->io_error && (zio->io_error == EOPNOTSUPP)) + zio->io_vd->vdev_nowritecache = B_TRUE; + + bio_put(bio); + ASSERT3S(zio->io_error, >=, 0); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + +static int +vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) +{ + struct request_queue *q; + struct bio *bio; + + q = bdev_get_queue(bdev); + if (!q) + return (SET_ERROR(ENXIO)); + + bio = bio_alloc(GFP_NOIO, 0); + /* bio_alloc() with __GFP_WAIT never returns NULL */ + if (unlikely(bio == NULL)) + return (SET_ERROR(ENOMEM)); + + bio->bi_end_io = vdev_disk_io_flush_completion; + bio->bi_private = zio; + bio_set_dev(bio, bdev); + bio_set_flush(bio); + vdev_submit_bio(bio); + invalidate_bdev(bdev); + + return (0); +} + +static void +vdev_disk_io_start(zio_t *zio) +{ + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + unsigned long trim_flags = 0; + int rw, error; + + /* + * If the vdev is closed, it's likely in the REMOVED or FAULTED state. + * Nothing to be done here but return failure. + */ + if (vd == NULL) { + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + rw_enter(&vd->vd_lock, RW_READER); + + /* + * If the vdev is closed, it's likely due to a failed reopen and is + * in the UNAVAIL state. Nothing to be done here but return failure. 
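+	 * (Unlike the vdev_tsd == NULL check above, this check runs with
+	 * vd_lock held: the vdev_disk_t still exists, but the backing
+	 * block device was dropped by an earlier failed open.)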
+ */ + if (vd->vd_bdev == NULL) { + rw_exit(&vd->vd_lock); + zio->io_error = ENXIO; + zio_interrupt(zio); + return; + } + + switch (zio->io_type) { + case ZIO_TYPE_IOCTL: + + if (!vdev_readable(v)) { + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + if (v->vdev_nowritecache) { + zio->io_error = SET_ERROR(ENOTSUP); + break; + } + + error = vdev_disk_io_flush(vd->vd_bdev, zio); + if (error == 0) { + rw_exit(&vd->vd_lock); + return; + } + + zio->io_error = error; + + break; + + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + rw_exit(&vd->vd_lock); + zio_execute(zio); + return; + case ZIO_TYPE_WRITE: + rw = WRITE; + break; + + case ZIO_TYPE_READ: + rw = READ; + break; + + case ZIO_TYPE_TRIM: +#if defined(BLKDEV_DISCARD_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + trim_flags |= BLKDEV_DISCARD_SECURE; +#endif + zio->io_error = -blkdev_issue_discard(vd->vd_bdev, + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, + trim_flags); + + rw_exit(&vd->vd_lock); + zio_interrupt(zio); + return; + + default: + rw_exit(&vd->vd_lock); + zio->io_error = SET_ERROR(ENOTSUP); + zio_interrupt(zio); + return; + } + + zio->io_target_timestamp = zio_handle_io_delay(zio); + error = __vdev_disk_physio(vd->vd_bdev, zio, + zio->io_size, zio->io_offset, rw, 0); + rw_exit(&vd->vd_lock); + + if (error) { + zio->io_error = error; + zio_interrupt(zio); + return; + } +} + +static void +vdev_disk_io_done(zio_t *zio) +{ + /* + * If the device returned EIO, we revalidate the media. If it is + * determined the media has changed this triggers the asynchronous + * removal of the device from the configuration. + */ + if (zio->io_error == EIO) { + vdev_t *v = zio->io_vd; + vdev_disk_t *vd = v->vdev_tsd; + + if (check_disk_change(vd->vd_bdev)) { + invalidate_bdev(vd->vd_bdev); + v->vdev_remove_wanted = B_TRUE; + spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); + } + } +} + +static void +vdev_disk_hold(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* We must have a pathname, and it must be absolute. */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') + return; + + /* + * Only prefetch path and devid info if the device has + * never been opened. + */ + if (vd->vdev_tsd != NULL) + return; + +} + +static void +vdev_disk_rele(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_WRITER)); + + /* XXX: Implement me as a vnode rele for the device */ +} + +vdev_ops_t vdev_disk_ops = { + .vdev_op_open = vdev_disk_open, + .vdev_op_close = vdev_disk_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_disk_io_start, + .vdev_op_io_done = vdev_disk_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_disk_hold, + .vdev_op_rele = vdev_disk_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +/* + * The zfs_vdev_scheduler module option has been deprecated. Setting this + * value no longer has any effect. It has not yet been entirely removed + * to allow the module to be loaded if this option is specified in the + * /etc/modprobe.d/zfs.conf file. The following warning will be logged. 
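+ *
+ * For example, a stale configuration line like the following
+ * (illustrative) will still load, and merely triggers the message
+ * below:
+ *
+ *	options zfs zfs_vdev_scheduler=noop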
+ */ +static int +param_set_vdev_scheduler(const char *val, zfs_kernel_param_t *kp) +{ + int error = param_set_charp(val, kp); + if (error == 0) { + printk(KERN_INFO "The 'zfs_vdev_scheduler' module option " + "is not supported.\n"); + } + + return (error); +} + +char *zfs_vdev_scheduler = "unused"; +module_param_call(zfs_vdev_scheduler, param_set_vdev_scheduler, + param_get_charp, &zfs_vdev_scheduler, 0644); +MODULE_PARM_DESC(zfs_vdev_scheduler, "I/O scheduler"); + +int +param_set_min_auto_ashift(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t val; + int error; + + error = kstrtoull(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val < ASHIFT_MIN || val > zfs_vdev_max_auto_ashift) + return (SET_ERROR(-EINVAL)); + + error = param_set_ulong(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} + +int +param_set_max_auto_ashift(const char *buf, zfs_kernel_param_t *kp) +{ + uint64_t val; + int error; + + error = kstrtoull(buf, 0, &val); + if (error < 0) + return (SET_ERROR(error)); + + if (val > ASHIFT_MAX || val < zfs_vdev_min_auto_ashift) + return (SET_ERROR(-EINVAL)); + + error = param_set_ulong(buf, kp); + if (error < 0) + return (SET_ERROR(error)); + + return (0); +} diff --git a/module/os/linux/zfs/vdev_file.c b/module/os/linux/zfs/vdev_file.c new file mode 100644 index 000000000000..a4e71ca40788 --- /dev/null +++ b/module/os/linux/zfs/vdev_file.c @@ -0,0 +1,348 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2016 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif +/* + * Virtual device vector for files. + */ + +static taskq_t *vdev_file_taskq; + +static void +vdev_file_hold(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static void +vdev_file_rele(vdev_t *vd) +{ + ASSERT(vd->vdev_path != NULL); +} + +static mode_t +vdev_file_open_mode(spa_mode_t spa_mode) +{ + mode_t mode = 0; + + if ((spa_mode & SPA_MODE_READ) && (spa_mode & SPA_MODE_WRITE)) { + mode = O_RDWR; + } else if (spa_mode & SPA_MODE_READ) { + mode = O_RDONLY; + } else if (spa_mode & SPA_MODE_WRITE) { + mode = O_WRONLY; + } + + return (mode | O_LARGEFILE); +} + +static int +vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_file_t *vf; + zfs_file_t *fp; + zfs_file_attr_t zfa; + int error; + + /* + * Rotational optimizations only make sense on block devices. + */ + vd->vdev_nonrot = B_TRUE; + + /* + * Allow TRIM on file based vdevs. 
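+	 * (On Linux the TRIM path below is implemented with hole
+	 * punching, i.e. zfs_file_fallocate() with FALLOC_FL_PUNCH_HOLE
+	 * and FALLOC_FL_KEEP_SIZE.)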
This may not always be supported, + * since it depends on your kernel version and underlying filesystem + * type but it is always safe to attempt. + */ + vd->vdev_has_trim = B_TRUE; + + /* + * Disable secure TRIM on file based vdevs. There is no way to + * request this behavior from the underlying filesystem. + */ + vd->vdev_has_securetrim = B_FALSE; + + /* + * We must have a pathname, and it must be absolute. + */ + if (vd->vdev_path == NULL || vd->vdev_path[0] != '/') { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); + + /* + * We always open the files from the root of the global zone, even if + * we're in a local zone. If the user has gotten to this point, the + * administrator has already decided that the pool should be available + * to local zone users, so the underlying devices should be as well. + */ + ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); + + error = zfs_file_open(vd->vdev_path, + vdev_file_open_mode(spa_mode(vd->vdev_spa)), 0, &fp); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + vf->vf_file = fp; + +#ifdef _KERNEL + /* + * Make sure it's a regular file. + */ + if (zfs_file_getattr(fp, &zfa)) { + return (SET_ERROR(ENODEV)); + } + if (!S_ISREG(zfa.zfa_mode)) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (SET_ERROR(ENODEV)); + } +#endif + +skip_open: + + error = zfs_file_getattr(vf->vf_file, &zfa); + if (error) { + vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; + return (error); + } + + *max_psize = *psize = zfa.zfa_size; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; + + return (0); +} + +static void +vdev_file_close(vdev_t *vd) +{ + vdev_file_t *vf = vd->vdev_tsd; + + if (vd->vdev_reopening || vf == NULL) + return; + + if (vf->vf_file != NULL) { + (void) zfs_file_close(vf->vf_file); + } + + vd->vdev_delayed_close = B_FALSE; + kmem_free(vf, sizeof (vdev_file_t)); + vd->vdev_tsd = NULL; +} + +static void +vdev_file_io_strategy(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + ssize_t resid; + void *buf; + loff_t off; + ssize_t size; + int err; + + off = zio->io_offset; + size = zio->io_size; + resid = 0; + + if (zio->io_type == ZIO_TYPE_READ) { + buf = abd_borrow_buf(zio->io_abd, zio->io_size); + err = zfs_file_pread(vf->vf_file, buf, size, off, &resid); + abd_return_buf_copy(zio->io_abd, buf, size); + } else { + buf = abd_borrow_buf_copy(zio->io_abd, zio->io_size); + err = zfs_file_pwrite(vf->vf_file, buf, size, off, &resid); + abd_return_buf(zio->io_abd, buf, size); + } + zio->io_error = err; + if (resid != 0 && zio->io_error == 0) + zio->io_error = SET_ERROR(ENOSPC); + + zio_delay_interrupt(zio); +} + +static void +vdev_file_io_fsync(void *arg) +{ + zio_t *zio = (zio_t *)arg; + vdev_file_t *vf = zio->io_vd->vdev_tsd; + + zio->io_error = zfs_file_fsync(vf->vf_file, O_SYNC | O_DSYNC); + + zio_interrupt(zio); +} + +static void +vdev_file_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_file_t *vf = vd->vdev_tsd; + + if (zio->io_type == ZIO_TYPE_IOCTL) { + /* XXPOLICY */ + if (!vdev_readable(vd)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return; + } + + switch (zio->io_cmd) { + 
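+		/*
+		 * DKIOCFLUSHWRITECACHE is the only ioctl handled for file
+		 * vdevs; anything else falls through to ENOTSUP in the
+		 * default case below.
+		 */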
case DKIOCFLUSHWRITECACHE: + + if (zfs_nocacheflush) + break; + + /* + * We cannot safely call vfs_fsync() when PF_FSTRANS + * is set in the current context. Filesystems like + * XFS include sanity checks to verify it is not + * already set, see xfs_vm_writepage(). Therefore + * the sync must be dispatched to a different context. + */ + if (__spl_pf_fstrans_check()) { + VERIFY3U(taskq_dispatch(vdev_file_taskq, + vdev_file_io_fsync, zio, TQ_SLEEP), !=, + TASKQID_INVALID); + return; + } + + zio->io_error = zfs_file_fsync(vf->vf_file, + O_SYNC | O_DSYNC); + break; + default: + zio->io_error = SET_ERROR(ENOTSUP); + } + + zio_execute(zio); + return; + } else if (zio->io_type == ZIO_TYPE_TRIM) { + int mode = 0; + + ASSERT3U(zio->io_size, !=, 0); +#ifdef __linux__ + mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; +#endif + zio->io_error = zfs_file_fallocate(vf->vf_file, + mode, zio->io_offset, zio->io_size); + zio_execute(zio); + return; + } + + zio->io_target_timestamp = zio_handle_io_delay(zio); + + VERIFY3U(taskq_dispatch(vdev_file_taskq, vdev_file_io_strategy, zio, + TQ_SLEEP), !=, TASKQID_INVALID); +} + +/* ARGSUSED */ +static void +vdev_file_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_file_ops = { + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +void +vdev_file_init(void) +{ + vdev_file_taskq = taskq_create("z_vdev_file", MAX(boot_ncpus, 16), + minclsyspri, boot_ncpus, INT_MAX, TASKQ_DYNAMIC); + + VERIFY(vdev_file_taskq); +} + +void +vdev_file_fini(void) +{ + taskq_destroy(vdev_file_taskq); +} + +/* + * From userland we access disks just like files. + */ +#ifndef _KERNEL + +vdev_ops_t vdev_disk_ops = { + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +#endif diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c new file mode 100644 index 000000000000..8d79878c0458 --- /dev/null +++ b/module/os/linux/zfs/zfs_acl.c @@ -0,0 +1,2932 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013 by Delphix. All rights reserved. + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define ALLOW ACE_ACCESS_ALLOWED_ACE_TYPE +#define DENY ACE_ACCESS_DENIED_ACE_TYPE +#define MAX_ACE_TYPE ACE_SYSTEM_ALARM_CALLBACK_OBJECT_ACE_TYPE +#define MIN_ACE_TYPE ALLOW + +#define OWNING_GROUP (ACE_GROUP|ACE_IDENTIFIER_GROUP) +#define EVERYONE_ALLOW_MASK (ACE_READ_ACL|ACE_READ_ATTRIBUTES | \ + ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE) +#define EVERYONE_DENY_MASK (ACE_WRITE_ACL|ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) +#define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) + +#define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ + ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ + ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ + ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) + +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) + +#define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define OKAY_MASK_BITS (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ + ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) + +#define ALL_INHERIT (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE | \ + ACE_NO_PROPAGATE_INHERIT_ACE|ACE_INHERIT_ONLY_ACE|ACE_INHERITED_ACE) + +#define RESTRICTED_CLEAR (ACE_WRITE_ACL|ACE_WRITE_OWNER) + +#define V4_ACL_WIDE_FLAGS (ZFS_ACL_AUTO_INHERIT|ZFS_ACL_DEFAULTED|\ + ZFS_ACL_PROTECTED) + +#define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ + ZFS_ACL_OBJ_ACE) + +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + +#define IDMAP_WK_CREATOR_OWNER_UID 2147483648U + +static uint16_t +zfs_ace_v0_get_type(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_v0_get_flags(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_v0_get_mask(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_v0_get_who(void *acep) +{ + return (((zfs_oldace_t *)acep)->z_fuid); +} + +static void +zfs_ace_v0_set_type(void *acep, uint16_t type) +{ + ((zfs_oldace_t *)acep)->z_type = type; +} + +static void +zfs_ace_v0_set_flags(void *acep, uint16_t flags) +{ + ((zfs_oldace_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_v0_set_mask(void *acep, uint32_t mask) +{ + ((zfs_oldace_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_v0_set_who(void *acep, uint64_t who) +{ + ((zfs_oldace_t *)acep)->z_fuid = who; +} + +/*ARGSUSED*/ +static size_t +zfs_ace_v0_size(void *acep) +{ + return (sizeof (zfs_oldace_t)); +} + +static 
size_t +zfs_ace_v0_abstract_size(void) +{ + return (sizeof (zfs_oldace_t)); +} + +static int +zfs_ace_v0_mask_off(void) +{ + return (offsetof(zfs_oldace_t, z_access_mask)); +} + +/*ARGSUSED*/ +static int +zfs_ace_v0_data(void *acep, void **datap) +{ + *datap = NULL; + return (0); +} + +static acl_ops_t zfs_acl_v0_ops = { + .ace_mask_get = zfs_ace_v0_get_mask, + .ace_mask_set = zfs_ace_v0_set_mask, + .ace_flags_get = zfs_ace_v0_get_flags, + .ace_flags_set = zfs_ace_v0_set_flags, + .ace_type_get = zfs_ace_v0_get_type, + .ace_type_set = zfs_ace_v0_set_type, + .ace_who_get = zfs_ace_v0_get_who, + .ace_who_set = zfs_ace_v0_set_who, + .ace_size = zfs_ace_v0_size, + .ace_abstract_size = zfs_ace_v0_abstract_size, + .ace_mask_off = zfs_ace_v0_mask_off, + .ace_data = zfs_ace_v0_data +}; + +static uint16_t +zfs_ace_fuid_get_type(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_type); +} + +static uint16_t +zfs_ace_fuid_get_flags(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_flags); +} + +static uint32_t +zfs_ace_fuid_get_mask(void *acep) +{ + return (((zfs_ace_hdr_t *)acep)->z_access_mask); +} + +static uint64_t +zfs_ace_fuid_get_who(void *args) +{ + uint16_t entry_type; + zfs_ace_t *acep = args; + + entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (-1); + return (((zfs_ace_t *)acep)->z_fuid); +} + +static void +zfs_ace_fuid_set_type(void *acep, uint16_t type) +{ + ((zfs_ace_hdr_t *)acep)->z_type = type; +} + +static void +zfs_ace_fuid_set_flags(void *acep, uint16_t flags) +{ + ((zfs_ace_hdr_t *)acep)->z_flags = flags; +} + +static void +zfs_ace_fuid_set_mask(void *acep, uint32_t mask) +{ + ((zfs_ace_hdr_t *)acep)->z_access_mask = mask; +} + +static void +zfs_ace_fuid_set_who(void *arg, uint64_t who) +{ + zfs_ace_t *acep = arg; + + uint16_t entry_type = acep->z_hdr.z_flags & ACE_TYPE_FLAGS; + + if (entry_type == ACE_OWNER || entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return; + acep->z_fuid = who; +} + +static size_t +zfs_ace_fuid_size(void *acep) +{ + zfs_ace_hdr_t *zacep = acep; + uint16_t entry_type; + + switch (zacep->z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + return (sizeof (zfs_object_ace_t)); + case ALLOW: + case DENY: + entry_type = + (((zfs_ace_hdr_t *)acep)->z_flags & ACE_TYPE_FLAGS); + if (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE) + return (sizeof (zfs_ace_hdr_t)); + /*FALLTHROUGH*/ + default: + return (sizeof (zfs_ace_t)); + } +} + +static size_t +zfs_ace_fuid_abstract_size(void) +{ + return (sizeof (zfs_ace_hdr_t)); +} + +static int +zfs_ace_fuid_mask_off(void) +{ + return (offsetof(zfs_ace_hdr_t, z_access_mask)); +} + +static int +zfs_ace_fuid_data(void *acep, void **datap) +{ + zfs_ace_t *zacep = acep; + zfs_object_ace_t *zobjp; + + switch (zacep->z_hdr.z_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjp = acep; + *datap = (caddr_t)zobjp + sizeof (zfs_ace_t); + return (sizeof (zfs_object_ace_t) - sizeof (zfs_ace_t)); + default: + *datap = NULL; + return (0); + } +} + +static acl_ops_t zfs_acl_fuid_ops = { + .ace_mask_get = zfs_ace_fuid_get_mask, + .ace_mask_set = zfs_ace_fuid_set_mask, + .ace_flags_get = zfs_ace_fuid_get_flags, + .ace_flags_set 
= zfs_ace_fuid_set_flags,
+	.ace_type_get = zfs_ace_fuid_get_type,
+	.ace_type_set = zfs_ace_fuid_set_type,
+	.ace_who_get = zfs_ace_fuid_get_who,
+	.ace_who_set = zfs_ace_fuid_set_who,
+	.ace_size = zfs_ace_fuid_size,
+	.ace_abstract_size = zfs_ace_fuid_abstract_size,
+	.ace_mask_off = zfs_ace_fuid_mask_off,
+	.ace_data = zfs_ace_fuid_data
+};
+
+/*
+ * The following three functions are provided for compatibility with
+ * older ZPL versions in order to determine if the file used to have
+ * an external ACL and what version of ACL previously existed on the
+ * file. Would really be nice to not need this, sigh.
+ */
+uint64_t
+zfs_external_acl(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+	int error;
+
+	if (zp->z_is_sa)
+		return (0);
+
+	/*
+	 * Need to deal with a potential
+	 * race where zfs_sa_upgrade could cause
+	 * z_is_sa to change.
+	 *
+	 * If the lookup fails then the state of z_is_sa should have
+	 * changed.
+	 */
+
+	if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+	    &acl_phys, sizeof (acl_phys))) == 0)
+		return (acl_phys.z_acl_extern_obj);
+	else {
+		/*
+		 * after upgrade the SA_ZPL_ZNODE_ACL should have been
+		 * removed
+		 */
+		VERIFY(zp->z_is_sa && error == ENOENT);
+		return (0);
+	}
+}
+
+/*
+ * Determine size of ACL in bytes
+ *
+ * This is more complicated than it should be since we have to deal
+ * with old external ACLs.
+ */
+static int
+zfs_acl_znode_info(znode_t *zp, int *aclsize, int *aclcount,
+    zfs_acl_phys_t *aclphys)
+{
+	zfsvfs_t *zfsvfs = ZTOZSB(zp);
+	uint64_t acl_count;
+	int size;
+	int error;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+	if (zp->z_is_sa) {
+		if ((error = sa_size(zp->z_sa_hdl, SA_ZPL_DACL_ACES(zfsvfs),
+		    &size)) != 0)
+			return (error);
+		*aclsize = size;
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_COUNT(zfsvfs),
+		    &acl_count, sizeof (acl_count))) != 0)
+			return (error);
+		*aclcount = acl_count;
+	} else {
+		if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs),
+		    aclphys, sizeof (*aclphys))) != 0)
+			return (error);
+
+		if (aclphys->z_acl_version == ZFS_ACL_VERSION_INITIAL) {
+			*aclsize = ZFS_ACL_SIZE(aclphys->z_acl_size);
+			*aclcount = aclphys->z_acl_size;
+		} else {
+			*aclsize = aclphys->z_acl_size;
+			*aclcount = aclphys->z_acl_count;
+		}
+	}
+	return (0);
+}
+
+int
+zfs_znode_acl_version(znode_t *zp)
+{
+	zfs_acl_phys_t acl_phys;
+
+	if (zp->z_is_sa)
+		return (ZFS_ACL_VERSION_FUID);
+	else {
+		int error;
+
+		/*
+		 * Need to deal with a potential
+		 * race where zfs_sa_upgrade could cause
+		 * z_is_sa to change.
+		 *
+		 * If the lookup fails then the state of z_is_sa should have
+		 * changed.
+		 */
+		if ((error = sa_lookup(zp->z_sa_hdl,
+		    SA_ZPL_ZNODE_ACL(ZTOZSB(zp)),
+		    &acl_phys, sizeof (acl_phys))) == 0)
+			return (acl_phys.z_acl_version);
+		else {
+			/*
+			 * After upgrade SA_ZPL_ZNODE_ACL should have
+			 * been removed.
+ */ + VERIFY(zp->z_is_sa && error == ENOENT); + return (ZFS_ACL_VERSION_FUID); + } + } +} + +static int +zfs_acl_version(int version) +{ + if (version < ZPL_VERSION_FUID) + return (ZFS_ACL_VERSION_INITIAL); + else + return (ZFS_ACL_VERSION_FUID); +} + +static int +zfs_acl_version_zp(znode_t *zp) +{ + return (zfs_acl_version(ZTOZSB(zp)->z_version)); +} + +zfs_acl_t * +zfs_acl_alloc(int vers) +{ + zfs_acl_t *aclp; + + aclp = kmem_zalloc(sizeof (zfs_acl_t), KM_SLEEP); + list_create(&aclp->z_acl, sizeof (zfs_acl_node_t), + offsetof(zfs_acl_node_t, z_next)); + aclp->z_version = vers; + if (vers == ZFS_ACL_VERSION_FUID) + aclp->z_ops = &zfs_acl_fuid_ops; + else + aclp->z_ops = &zfs_acl_v0_ops; + return (aclp); +} + +zfs_acl_node_t * +zfs_acl_node_alloc(size_t bytes) +{ + zfs_acl_node_t *aclnode; + + aclnode = kmem_zalloc(sizeof (zfs_acl_node_t), KM_SLEEP); + if (bytes) { + aclnode->z_acldata = kmem_alloc(bytes, KM_SLEEP); + aclnode->z_allocdata = aclnode->z_acldata; + aclnode->z_allocsize = bytes; + aclnode->z_size = bytes; + } + + return (aclnode); +} + +static void +zfs_acl_node_free(zfs_acl_node_t *aclnode) +{ + if (aclnode->z_allocsize) + kmem_free(aclnode->z_allocdata, aclnode->z_allocsize); + kmem_free(aclnode, sizeof (zfs_acl_node_t)); +} + +static void +zfs_acl_release_nodes(zfs_acl_t *aclp) +{ + zfs_acl_node_t *aclnode; + + while ((aclnode = list_head(&aclp->z_acl))) { + list_remove(&aclp->z_acl, aclnode); + zfs_acl_node_free(aclnode); + } + aclp->z_acl_count = 0; + aclp->z_acl_bytes = 0; +} + +void +zfs_acl_free(zfs_acl_t *aclp) +{ + zfs_acl_release_nodes(aclp); + list_destroy(&aclp->z_acl); + kmem_free(aclp, sizeof (zfs_acl_t)); +} + +static boolean_t +zfs_acl_valid_ace_type(uint_t type, uint_t flags) +{ + uint16_t entry_type; + + switch (type) { + case ALLOW: + case DENY: + case ACE_SYSTEM_AUDIT_ACE_TYPE: + case ACE_SYSTEM_ALARM_ACE_TYPE: + entry_type = flags & ACE_TYPE_FLAGS; + return (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE || entry_type == 0 || + entry_type == ACE_IDENTIFIER_GROUP); + default: + if (type >= MIN_ACE_TYPE && type <= MAX_ACE_TYPE) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +zfs_ace_valid(umode_t obj_mode, zfs_acl_t *aclp, uint16_t type, uint16_t iflags) +{ + /* + * first check type of entry + */ + + if (!zfs_acl_valid_ace_type(type, iflags)) + return (B_FALSE); + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (aclp->z_version < ZFS_ACL_VERSION_FUID) + return (B_FALSE); + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + } + + /* + * next check inheritance level flags + */ + + if (S_ISDIR(obj_mode) && + (iflags & (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if (iflags & (ACE_INHERIT_ONLY_ACE|ACE_NO_PROPAGATE_INHERIT_ACE)) { + if ((iflags & (ACE_FILE_INHERIT_ACE| + ACE_DIRECTORY_INHERIT_ACE)) == 0) { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void * +zfs_acl_next_ace(zfs_acl_t *aclp, void *start, uint64_t *who, + uint32_t *access_mask, uint16_t *iflags, uint16_t *type) +{ + zfs_acl_node_t *aclnode; + + ASSERT(aclp); + + if (start == NULL) { + aclnode = list_head(&aclp->z_acl); + if (aclnode == NULL) + return (NULL); + + aclp->z_next_ace = aclnode->z_acldata; + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + } + + aclnode = aclp->z_curr_node; + + if (aclnode == NULL) + return (NULL); + + if 
(aclnode->z_ace_idx >= aclnode->z_ace_count) { + aclnode = list_next(&aclp->z_acl, aclnode); + if (aclnode == NULL) + return (NULL); + else { + aclp->z_curr_node = aclnode; + aclnode->z_ace_idx = 0; + aclp->z_next_ace = aclnode->z_acldata; + } + } + + if (aclnode->z_ace_idx < aclnode->z_ace_count) { + void *acep = aclp->z_next_ace; + size_t ace_size; + + /* + * Make sure we don't overstep our bounds + */ + ace_size = aclp->z_ops->ace_size(acep); + + if (((caddr_t)acep + ace_size) > + ((caddr_t)aclnode->z_acldata + aclnode->z_size)) { + return (NULL); + } + + *iflags = aclp->z_ops->ace_flags_get(acep); + *type = aclp->z_ops->ace_type_get(acep); + *access_mask = aclp->z_ops->ace_mask_get(acep); + *who = aclp->z_ops->ace_who_get(acep); + aclp->z_next_ace = (caddr_t)aclp->z_next_ace + ace_size; + aclnode->z_ace_idx++; + + return ((void *)acep); + } + return (NULL); +} + +/*ARGSUSED*/ +static uint64_t +zfs_ace_walk(void *datap, uint64_t cookie, int aclcnt, + uint16_t *flags, uint16_t *type, uint32_t *mask) +{ + zfs_acl_t *aclp = datap; + zfs_ace_hdr_t *acep = (zfs_ace_hdr_t *)(uintptr_t)cookie; + uint64_t who; + + acep = zfs_acl_next_ace(aclp, acep, &who, mask, + flags, type); + return ((uint64_t)(uintptr_t)acep); +} + +/* + * Copy ACE to internal ZFS format. + * While processing the ACL each ACE will be validated for correctness. + * ACE FUIDs will be created later. + */ +static int +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, umode_t obj_mode, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, uint64_t aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) +{ + int i; + uint16_t entry_type; + zfs_ace_t *aceptr = z_acl; + ace_t *acep = datap; + zfs_object_ace_t *zobjacep; + ace_object_t *aceobjp; + + for (i = 0; i != aclcnt; i++) { + aceptr->z_hdr.z_access_mask = acep->a_access_mask; + aceptr->z_hdr.z_flags = acep->a_flags; + aceptr->z_hdr.z_type = acep->a_type; + entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; + if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE) { + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); + } + + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_hdr.z_type, + aceptr->z_hdr.z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + + switch (acep->a_type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + zobjacep = (zfs_object_ace_t *)aceptr; + aceobjp = (ace_object_t *)acep; + + bcopy(aceobjp->a_obj_type, zobjacep->z_object_type, + sizeof (aceobjp->a_obj_type)); + bcopy(aceobjp->a_inherit_obj_type, + zobjacep->z_inherit_type, + sizeof (aceobjp->a_inherit_obj_type)); + acep = (ace_t *)((caddr_t)acep + sizeof (ace_object_t)); + break; + default: + acep = (ace_t *)((caddr_t)acep + sizeof (ace_t)); + } + + aceptr = (zfs_ace_t *)((caddr_t)aceptr + + aclp->z_ops->ace_size(aceptr)); + } + + *size = (caddr_t)aceptr - (caddr_t)z_acl; + + return (0); +} + +/* + * Copy ZFS ACEs to fixed size ace_t layout + */ +static void +zfs_copy_fuid_2_ace(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, cred_t *cr, + void *datap, int filter) +{ + uint64_t who; + uint32_t access_mask; + uint16_t iflags, type; + zfs_ace_hdr_t *zacep = NULL; + ace_t *acep = datap; + ace_object_t *objacep; + zfs_object_ace_t *zobjacep; + size_t ace_size; + uint16_t entry_type; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + if (filter) { + continue; + } + zobjacep = (zfs_object_ace_t *)zacep; + objacep = (ace_object_t *)acep; + bcopy(zobjacep->z_object_type, + objacep->a_obj_type, + sizeof (zobjacep->z_object_type)); + bcopy(zobjacep->z_inherit_type, + objacep->a_inherit_obj_type, + sizeof (zobjacep->z_inherit_type)); + ace_size = sizeof (ace_object_t); + break; + default: + ace_size = sizeof (ace_t); + break; + } + + entry_type = (iflags & ACE_TYPE_FLAGS); + if ((entry_type != ACE_OWNER && + entry_type != OWNING_GROUP && + entry_type != ACE_EVERYONE)) { + acep->a_who = zfs_fuid_map_id(zfsvfs, who, + cr, (entry_type & ACE_IDENTIFIER_GROUP) ? + ZFS_ACE_GROUP : ZFS_ACE_USER); + } else { + acep->a_who = (uid_t)(int64_t)who; + } + acep->a_access_mask = access_mask; + acep->a_flags = iflags; + acep->a_type = type; + acep = (ace_t *)((caddr_t)acep + ace_size); + } +} + +static int +zfs_copy_ace_2_oldace(umode_t obj_mode, zfs_acl_t *aclp, ace_t *acep, + zfs_oldace_t *z_acl, int aclcnt, size_t *size) +{ + int i; + zfs_oldace_t *aceptr = z_acl; + + for (i = 0; i != aclcnt; i++, aceptr++) { + aceptr->z_access_mask = acep[i].a_access_mask; + aceptr->z_type = acep[i].a_type; + aceptr->z_flags = acep[i].a_flags; + aceptr->z_fuid = acep[i].a_who; + /* + * Make sure ACE is valid + */ + if (zfs_ace_valid(obj_mode, aclp, aceptr->z_type, + aceptr->z_flags) != B_TRUE) + return (SET_ERROR(EINVAL)); + } + *size = (caddr_t)aceptr - (caddr_t)z_acl; + return (0); +} + +/* + * convert old ACL format to new + */ +void +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) +{ + zfs_oldace_t *oldaclp; + int i; + uint16_t type, iflags; + uint32_t access_mask; + uint64_t who; + void *cookie = NULL; + zfs_acl_node_t *newaclnode; + + ASSERT(aclp->z_version == ZFS_ACL_VERSION_INITIAL); + /* + * First create the ACE in a contiguous piece of memory + * for zfs_copy_ace_2_fuid(). + * + * We only convert an ACL once, so this won't happen + * every time. 
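+	 *
+	 * The replacement node below is sized for the worst case, i.e.
+	 * as if every ACE were a zfs_object_ace_t; any slack is simply
+	 * unused after zfs_copy_ace_2_fuid() reports the real size.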
+ */ + oldaclp = kmem_alloc(sizeof (zfs_oldace_t) * aclp->z_acl_count, + KM_SLEEP); + i = 0; + while ((cookie = zfs_acl_next_ace(aclp, cookie, &who, + &access_mask, &iflags, &type))) { + oldaclp[i].z_flags = iflags; + oldaclp[i].z_type = type; + oldaclp[i].z_fuid = who; + oldaclp[i++].z_access_mask = access_mask; + } + + newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * + sizeof (zfs_object_ace_t)); + aclp->z_ops = &zfs_acl_fuid_ops; + VERIFY(zfs_copy_ace_2_fuid(ZTOZSB(zp), ZTOI(zp)->i_mode, + aclp, oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); + newaclnode->z_ace_count = aclp->z_acl_count; + aclp->z_version = ZFS_ACL_VERSION; + kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); + + /* + * Release all previous ACL nodes + */ + + zfs_acl_release_nodes(aclp); + + list_insert_head(&aclp->z_acl, newaclnode); + + aclp->z_acl_bytes = newaclnode->z_size; + aclp->z_acl_count = newaclnode->z_ace_count; + +} + +/* + * Convert unix access mask to v4 access mask + */ +static uint32_t +zfs_unix_to_v4(uint32_t access_mask) +{ + uint32_t new_mask = 0; + + if (access_mask & S_IXOTH) + new_mask |= ACE_EXECUTE; + if (access_mask & S_IWOTH) + new_mask |= ACE_WRITE_DATA; + if (access_mask & S_IROTH) + new_mask |= ACE_READ_DATA; + return (new_mask); +} + +static void +zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, + uint16_t access_type, uint64_t fuid, uint16_t entry_type) +{ + uint16_t type = entry_type & ACE_TYPE_FLAGS; + + aclp->z_ops->ace_mask_set(acep, access_mask); + aclp->z_ops->ace_type_set(acep, access_type); + aclp->z_ops->ace_flags_set(acep, entry_type); + if ((type != ACE_OWNER && type != OWNING_GROUP && + type != ACE_EVERYONE)) + aclp->z_ops->ace_who_set(acep, fuid); +} + +/* + * Determine mode of file based on ACL. 
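+ *
+ * Worked example (illustrative): a trivial ACL that ALLOWs read and
+ * write to owner@, read to group@, and read to everyone@ computes to
+ * 0644, with any S_ISUID/S_ISGID/S_ISVTX bits carried over from the
+ * previous mode.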
+ */ +uint64_t +zfs_mode_compute(uint64_t fmode, zfs_acl_t *aclp, + uint64_t *pflags, uint64_t fuid, uint64_t fgid) +{ + int entry_type; + mode_t mode; + mode_t seen = 0; + zfs_ace_hdr_t *acep = NULL; + uint64_t who; + uint16_t iflags, type; + uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; + + mode = (fmode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, + &access_mask, &iflags, &type))) { + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + /* + * Skip over any inherit_only ACEs + */ + if (iflags & ACE_INHERIT_ONLY_ACE) + continue; + + if (entry_type == ACE_OWNER || (entry_type == 0 && + who == fuid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRUSR))) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWUSR))) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXUSR))) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + } else if (entry_type == OWNING_GROUP || + (entry_type == ACE_IDENTIFIER_GROUP && who == fgid)) { + if ((access_mask & ACE_READ_DATA) && + (!(seen & S_IRGRP))) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if ((access_mask & ACE_WRITE_DATA) && + (!(seen & S_IWGRP))) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if ((access_mask & ACE_EXECUTE) && + (!(seen & S_IXGRP))) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + } else if (entry_type == ACE_EVERYONE) { + if ((access_mask & ACE_READ_DATA)) { + if (!(seen & S_IRUSR)) { + seen |= S_IRUSR; + if (type == ALLOW) { + mode |= S_IRUSR; + } + } + if (!(seen & S_IRGRP)) { + seen |= S_IRGRP; + if (type == ALLOW) { + mode |= S_IRGRP; + } + } + if (!(seen & S_IROTH)) { + seen |= S_IROTH; + if (type == ALLOW) { + mode |= S_IROTH; + } + } + } + if ((access_mask & ACE_WRITE_DATA)) { + if (!(seen & S_IWUSR)) { + seen |= S_IWUSR; + if (type == ALLOW) { + mode |= S_IWUSR; + } + } + if (!(seen & S_IWGRP)) { + seen |= S_IWGRP; + if (type == ALLOW) { + mode |= S_IWGRP; + } + } + if (!(seen & S_IWOTH)) { + seen |= S_IWOTH; + if (type == ALLOW) { + mode |= S_IWOTH; + } + } + } + if ((access_mask & ACE_EXECUTE)) { + if (!(seen & S_IXUSR)) { + seen |= S_IXUSR; + if (type == ALLOW) { + mode |= S_IXUSR; + } + } + if (!(seen & S_IXGRP)) { + seen |= S_IXGRP; + if (type == ALLOW) { + mode |= S_IXGRP; + } + } + if (!(seen & S_IXOTH)) { + seen |= S_IXOTH; + if (type == ALLOW) { + mode |= S_IXOTH; + } + } + } + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; + } + } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + *pflags &= ~ZFS_NO_EXECS_DENIED; + else + *pflags |= ZFS_NO_EXECS_DENIED; + + return (mode); +} + +/* + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
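+ * This way a reader that is concurrently using the cached copy is never
+ * handed an ACL that is in the middle of being rewritten; the cache is
+ * only (re)populated on read-only lookups.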
+ */
+int
+zfs_acl_node_read(struct znode *zp, boolean_t have_lock, zfs_acl_t **aclpp,
+    boolean_t will_modify)
+{
+	zfs_acl_t	*aclp;
+	int		aclsize = 0;
+	int		acl_count = 0;
+	zfs_acl_node_t	*aclnode;
+	zfs_acl_phys_t	znode_acl;
+	int		version;
+	int		error;
+	boolean_t	drop_lock = B_FALSE;
+
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	if (zp->z_acl_cached && !will_modify) {
+		*aclpp = zp->z_acl_cached;
+		return (0);
+	}
+
+	/*
+	 * Close the race where the znode could be upgraded while trying
+	 * to read the znode attributes.
+	 *
+	 * But this could only happen if the file isn't already an SA
+	 * znode.
+	 */
+	if (!zp->z_is_sa && !have_lock) {
+		mutex_enter(&zp->z_lock);
+		drop_lock = B_TRUE;
+	}
+	version = zfs_znode_acl_version(zp);
+
+	if ((error = zfs_acl_znode_info(zp, &aclsize,
+	    &acl_count, &znode_acl)) != 0) {
+		goto done;
+	}
+
+	aclp = zfs_acl_alloc(version);
+
+	aclp->z_acl_count = acl_count;
+	aclp->z_acl_bytes = aclsize;
+
+	aclnode = zfs_acl_node_alloc(aclsize);
+	aclnode->z_ace_count = aclp->z_acl_count;
+	aclnode->z_size = aclsize;
+
+	if (!zp->z_is_sa) {
+		if (znode_acl.z_acl_extern_obj) {
+			error = dmu_read(ZTOZSB(zp)->z_os,
+			    znode_acl.z_acl_extern_obj, 0, aclnode->z_size,
+			    aclnode->z_acldata, DMU_READ_PREFETCH);
+		} else {
+			bcopy(znode_acl.z_ace_data, aclnode->z_acldata,
+			    aclnode->z_size);
+		}
+	} else {
+		error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DACL_ACES(ZTOZSB(zp)),
+		    aclnode->z_acldata, aclnode->z_size);
+	}
+
+	if (error != 0) {
+		zfs_acl_free(aclp);
+		zfs_acl_node_free(aclnode);
+		/* convert checksum errors into IO errors */
+		if (error == ECKSUM)
+			error = SET_ERROR(EIO);
+		goto done;
+	}
+
+	list_insert_head(&aclp->z_acl, aclnode);
+
+	*aclpp = aclp;
+	if (!will_modify)
+		zp->z_acl_cached = aclp;
+done:
+	if (drop_lock)
+		mutex_exit(&zp->z_lock);
+	return (error);
+}
+
+/*ARGSUSED*/
+void
+zfs_acl_data_locator(void **dataptr, uint32_t *length, uint32_t buflen,
+    boolean_t start, void *userdata)
+{
+	zfs_acl_locator_cb_t *cb = (zfs_acl_locator_cb_t *)userdata;
+
+	if (start) {
+		cb->cb_acl_node = list_head(&cb->cb_aclp->z_acl);
+	} else {
+		cb->cb_acl_node = list_next(&cb->cb_aclp->z_acl,
+		    cb->cb_acl_node);
+	}
+	*dataptr = cb->cb_acl_node->z_acldata;
+	*length = cb->cb_acl_node->z_size;
+}
+
+int
+zfs_acl_chown_setattr(znode_t *zp)
+{
+	int error;
+	zfs_acl_t *aclp;
+
+	if (ZTOZSB(zp)->z_acl_type == ZFS_ACLTYPE_POSIXACL)
+		return (0);
+
+	ASSERT(MUTEX_HELD(&zp->z_lock));
+	ASSERT(MUTEX_HELD(&zp->z_acl_lock));
+
+	error = zfs_acl_node_read(zp, B_TRUE, &aclp, B_FALSE);
+	if (error == 0 && aclp->z_acl_count > 0)
+		zp->z_mode = ZTOI(zp)->i_mode =
+		    zfs_mode_compute(zp->z_mode, aclp,
+		    &zp->z_pflags, KUID_TO_SUID(ZTOI(zp)->i_uid),
+		    KGID_TO_SGID(ZTOI(zp)->i_gid));
+
+	/*
+	 * Some ZFS implementations (ZEVO) create neither a ZNODE_ACL
+	 * nor a DACL_ACES SA in which case ENOENT is returned from
+	 * zfs_acl_node_read() when the SA can't be located.
+	 * Allow chown/chgrp to succeed in these cases rather than
+	 * returning an error that makes no sense in the context of
+	 * the caller.
+ */
+	if (error == ENOENT)
+		return (0);
+
+	return (error);
+}
+
+typedef struct trivial_acl {
+	uint32_t	allow0;		/* allow mask for bits only in owner */
+	uint32_t	deny1;		/* deny mask for bits not in owner */
+	uint32_t	deny2;		/* deny mask for bits not in group */
+	uint32_t	owner;		/* allow mask matching mode */
+	uint32_t	group;		/* allow mask matching mode */
+	uint32_t	everyone;	/* allow mask matching mode */
+} trivial_acl_t;
+
+static void
+acl_trivial_access_masks(mode_t mode, boolean_t isdir, trivial_acl_t *masks)
+{
+	uint32_t read_mask = ACE_READ_DATA;
+	uint32_t write_mask = ACE_WRITE_DATA|ACE_APPEND_DATA;
+	uint32_t execute_mask = ACE_EXECUTE;
+
+	if (isdir)
+		write_mask |= ACE_DELETE_CHILD;
+
+	masks->deny1 = 0;
+
+	if (!(mode & S_IRUSR) && (mode & (S_IRGRP|S_IROTH)))
+		masks->deny1 |= read_mask;
+	if (!(mode & S_IWUSR) && (mode & (S_IWGRP|S_IWOTH)))
+		masks->deny1 |= write_mask;
+	if (!(mode & S_IXUSR) && (mode & (S_IXGRP|S_IXOTH)))
+		masks->deny1 |= execute_mask;
+
+	masks->deny2 = 0;
+	if (!(mode & S_IRGRP) && (mode & S_IROTH))
+		masks->deny2 |= read_mask;
+	if (!(mode & S_IWGRP) && (mode & S_IWOTH))
+		masks->deny2 |= write_mask;
+	if (!(mode & S_IXGRP) && (mode & S_IXOTH))
+		masks->deny2 |= execute_mask;
+
+	masks->allow0 = 0;
+	if ((mode & S_IRUSR) && (!(mode & S_IRGRP) && (mode & S_IROTH)))
+		masks->allow0 |= read_mask;
+	if ((mode & S_IWUSR) && (!(mode & S_IWGRP) && (mode & S_IWOTH)))
+		masks->allow0 |= write_mask;
+	if ((mode & S_IXUSR) && (!(mode & S_IXGRP) && (mode & S_IXOTH)))
+		masks->allow0 |= execute_mask;
+
+	masks->owner = ACE_WRITE_ATTRIBUTES|ACE_WRITE_OWNER|ACE_WRITE_ACL|
+	    ACE_WRITE_NAMED_ATTRS|ACE_READ_ACL|ACE_READ_ATTRIBUTES|
+	    ACE_READ_NAMED_ATTRS|ACE_SYNCHRONIZE;
+	if (mode & S_IRUSR)
+		masks->owner |= read_mask;
+	if (mode & S_IWUSR)
+		masks->owner |= write_mask;
+	if (mode & S_IXUSR)
+		masks->owner |= execute_mask;
+
+	masks->group = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IRGRP)
+		masks->group |= read_mask;
+	if (mode & S_IWGRP)
+		masks->group |= write_mask;
+	if (mode & S_IXGRP)
+		masks->group |= execute_mask;
+
+	masks->everyone = ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_NAMED_ATTRS|
+	    ACE_SYNCHRONIZE;
+	if (mode & S_IROTH)
+		masks->everyone |= read_mask;
+	if (mode & S_IWOTH)
+		masks->everyone |= write_mask;
+	if (mode & S_IXOTH)
+		masks->everyone |= execute_mask;
+}
+
+/*
+ * ace_trivial:
+ * determine whether an ace_t acl is trivial
+ *
+ * An ACL is trivial if it is composed of only owner, group, and
+ * everyone entries.  The ACL can't have read_acl denied, and
+ * write_owner/write_acl/write_attributes can only be set on the
+ * owner@ entry.
+ */
+static int
+ace_trivial_common(void *acep, int aclcnt,
+    uint64_t (*walk)(void *, uint64_t, int aclcnt,
+    uint16_t *, uint16_t *, uint32_t *))
+{
+	uint16_t flags;
+	uint32_t mask;
+	uint16_t type;
+	uint64_t cookie = 0;
+
+	while ((cookie = walk(acep, cookie, aclcnt, &flags, &type, &mask))) {
+		switch (flags & ACE_TYPE_FLAGS) {
+		case ACE_OWNER:
+		case ACE_GROUP|ACE_IDENTIFIER_GROUP:
+		case ACE_EVERYONE:
+			break;
+		default:
+			return (1);
+		}
+
+		if (flags & (ACE_FILE_INHERIT_ACE|
+		    ACE_DIRECTORY_INHERIT_ACE|ACE_NO_PROPAGATE_INHERIT_ACE|
+		    ACE_INHERIT_ONLY_ACE))
+			return (1);
+
+		/*
+		 * Special check for some special bits
+		 *
+		 * Don't allow anybody to deny reading basic
+		 * attributes or a file's ACL.
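+		 *
+		 * For example, a DENY entry whose mask includes
+		 * ACE_READ_ATTRIBUTES makes the ACL non-trivial even
+		 * when it sits on an owner@/group@/everyone@ entry.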
+ */ + if ((mask & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + (type == ACE_ACCESS_DENIED_ACE_TYPE)) + return (1); + + /* + * Delete permission is never set by default + */ + if (mask & ACE_DELETE) + return (1); + + /* + * Child delete permission should be accompanied by write + */ + if ((mask & ACE_DELETE_CHILD) && !(mask & ACE_WRITE_DATA)) + return (1); + + /* + * only allow owner@ to have + * write_acl/write_owner/write_attributes/write_xattr/ + */ + if (type == ACE_ACCESS_ALLOWED_ACE_TYPE && + (!(flags & ACE_OWNER) && (mask & + (ACE_WRITE_OWNER|ACE_WRITE_ACL| ACE_WRITE_ATTRIBUTES| + ACE_WRITE_NAMED_ATTRS)))) + return (1); + + } + + return (0); +} + +/* + * common code for setting ACLs. + * + * This function is called from zfs_mode_update, zfs_perm_init, and zfs_setacl. + * zfs_setacl passes a non-NULL inherit pointer (ihp) to indicate that it's + * already checked the acl and knows whether to inherit. + */ +int +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) +{ + int error; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_object_type_t otype; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t mode; + sa_bulk_attr_t bulk[5]; + uint64_t ctime[2]; + int count = 0; + zfs_acl_phys_t acl_phys; + + mode = zp->z_mode; + + mode = zfs_mode_compute(mode, aclp, &zp->z_pflags, + KUID_TO_SUID(ZTOI(zp)->i_uid), KGID_TO_SGID(ZTOI(zp)->i_gid)); + + zp->z_mode = ZTOI(zp)->i_mode = mode; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, sizeof (ctime)); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + /* + * Upgrade needed? + */ + if (!zfsvfs->z_use_fuids) { + otype = DMU_OT_OLDACL; + } else { + if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && + (zfsvfs->z_version >= ZPL_VERSION_FUID)) + zfs_acl_xform(zp, aclp, cr); + ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); + otype = DMU_OT_ACL; + } + + /* + * Arrgh, we have to handle old on disk format + * as well as newer (preferred) SA format. + */ + + if (zp->z_is_sa) { /* the easy case, just update the ACL attribute */ + locate.cb_aclp = aclp; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, aclp->z_acl_bytes); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DACL_COUNT(zfsvfs), + NULL, &aclp->z_acl_count, sizeof (uint64_t)); + } else { /* Painful legacy way */ + zfs_acl_node_t *aclnode; + uint64_t off = 0; + uint64_t aoid; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_ZNODE_ACL(zfsvfs), + &acl_phys, sizeof (acl_phys))) != 0) + return (error); + + aoid = acl_phys.z_acl_extern_obj; + + if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { + /* + * If ACL was previously external and we are now + * converting to new ACL format then release old + * ACL object and create a new one. + */ + if (aoid && + aclp->z_version != acl_phys.z_acl_version) { + error = dmu_object_free(zfsvfs->z_os, aoid, tx); + if (error) + return (error); + aoid = 0; + } + if (aoid == 0) { + aoid = dmu_object_alloc(zfsvfs->z_os, + otype, aclp->z_acl_bytes, + otype == DMU_OT_ACL ? + DMU_OT_SYSACL : DMU_OT_NONE, + otype == DMU_OT_ACL ? 
+ DN_OLD_MAX_BONUSLEN : 0, tx); + } else { + (void) dmu_object_set_blocksize(zfsvfs->z_os, + aoid, aclp->z_acl_bytes, 0, tx); + } + acl_phys.z_acl_extern_obj = aoid; + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + dmu_write(zfsvfs->z_os, aoid, off, + aclnode->z_size, aclnode->z_acldata, tx); + off += aclnode->z_size; + } + } else { + void *start = acl_phys.z_ace_data; + /* + * Migrating back embedded? + */ + if (acl_phys.z_acl_extern_obj) { + error = dmu_object_free(zfsvfs->z_os, + acl_phys.z_acl_extern_obj, tx); + if (error) + return (error); + acl_phys.z_acl_extern_obj = 0; + } + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + if (aclnode->z_ace_count == 0) + continue; + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + } + /* + * If Old version then swap count/bytes to match old + * layout of znode_acl_phys_t. + */ + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + acl_phys.z_acl_size = aclp->z_acl_count; + acl_phys.z_acl_count = aclp->z_acl_bytes; + } else { + acl_phys.z_acl_size = aclp->z_acl_bytes; + acl_phys.z_acl_count = aclp->z_acl_count; + } + acl_phys.z_acl_version = aclp->z_version; + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (acl_phys)); + } + + /* + * Replace ACL wide bits, but first clear them. + */ + zp->z_pflags &= ~ZFS_ACL_WIDE_FLAGS; + + zp->z_pflags |= aclp->z_hints; + + if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) + zp->z_pflags |= ZFS_ACL_TRIVIAL; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + return (sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); +} + +static void +zfs_acl_chmod(boolean_t isdir, uint64_t mode, boolean_t split, boolean_t trim, + zfs_acl_t *aclp) +{ + void *acep = NULL; + uint64_t who; + int new_count, new_bytes; + int ace_size; + int entry_type; + uint16_t iflags, type; + uint32_t access_mask; + zfs_acl_node_t *newnode; + size_t abstract_size = aclp->z_ops->ace_abstract_size(); + void *zacep; + trivial_acl_t masks; + + new_count = new_bytes = 0; + + acl_trivial_access_masks((mode_t)mode, isdir, &masks); + + newnode = zfs_acl_node_alloc((abstract_size * 6) + aclp->z_acl_bytes); + + zacep = newnode->z_acldata; + if (masks.allow0) { + zfs_set_ace(aclp, zacep, masks.allow0, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny1) { + zfs_set_ace(aclp, zacep, masks.deny1, DENY, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + if (masks.deny2) { + zfs_set_ace(aclp, zacep, masks.deny2, DENY, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + new_count++; + new_bytes += abstract_size; + } + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + entry_type = (iflags & ACE_TYPE_FLAGS); + /* + * ACEs used to represent the file mode may be divided + * into an equivalent pair of inherit-only and regular + * ACEs, if they are inheritable. + * Skip regular ACEs, which are replaced by the new mode. 
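+		 *
+		 * For instance, an owner@ ALLOW ACE carrying
+		 * ACE_FILE_INHERIT_ACE on a directory is kept but
+		 * demoted to inherit_only here, while the fresh
+		 * owner@/group@/everyone@ entries appended at the end
+		 * carry the new mode.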
+ */ + if (split && (entry_type == ACE_OWNER || + entry_type == OWNING_GROUP || + entry_type == ACE_EVERYONE)) { + if (!isdir || !(iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + continue; + /* + * We preserve owner@, group@, or @everyone + * permissions, if they are inheritable, by + * copying them to inherit_only ACEs. This + * prevents inheritable permissions from being + * altered along with the file mode. + */ + iflags |= ACE_INHERIT_ONLY_ACE; + } + + /* + * If this ACL has any inheritable ACEs, mark that in + * the hints (which are later masked into the pflags) + * so create knows to do inheritance. + */ + if (isdir && (iflags & + (ACE_FILE_INHERIT_ACE|ACE_DIRECTORY_INHERIT_ACE))) + aclp->z_hints |= ZFS_INHERIT_ACE; + + if ((type != ALLOW && type != DENY) || + (iflags & ACE_INHERIT_ONLY_ACE)) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + aclp->z_hints |= ZFS_ACL_OBJ_ACE; + break; + } + } else { + /* + * Limit permissions to be no greater than + * group permissions. + * The "aclinherit" and "aclmode" properties + * affect policy for create and chmod(2), + * respectively. + */ + if ((type == ALLOW) && trim) + access_mask &= masks.group; + } + zfs_set_ace(aclp, zacep, access_mask, type, who, iflags); + ace_size = aclp->z_ops->ace_size(acep); + zacep = (void *)((uintptr_t)zacep + ace_size); + new_count++; + new_bytes += ace_size; + } + zfs_set_ace(aclp, zacep, masks.owner, ALLOW, -1, ACE_OWNER); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.group, ALLOW, -1, OWNING_GROUP); + zacep = (void *)((uintptr_t)zacep + abstract_size); + zfs_set_ace(aclp, zacep, masks.everyone, ALLOW, -1, ACE_EVERYONE); + + new_count += 3; + new_bytes += abstract_size * 3; + zfs_acl_release_nodes(aclp); + aclp->z_acl_count = new_count; + aclp->z_acl_bytes = new_bytes; + newnode->z_ace_count = new_count; + newnode->z_size = new_bytes; + list_insert_tail(&aclp->z_acl, newnode); +} + +int +zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) +{ + int error = 0; + + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_DISCARD) + *aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + else + error = zfs_acl_node_read(zp, B_TRUE, aclp, B_TRUE); + + if (error == 0) { + (*aclp)->z_hints = zp->z_pflags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(S_ISDIR(ZTOI(zp)->i_mode), mode, B_TRUE, + (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_GROUPMASK), *aclp); + } + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Should ACE be inherited? 
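+ *
+ * A compact summary of the check below:
+ *
+ *	directory, ACE_DIRECTORY_INHERIT_ACE set	inherit
+ *	file, ACE_FILE_INHERIT_ACE set			inherit
+ *	directory, only ACE_FILE_INHERIT_ACE set	inherit, unless
+ *						ACE_NO_PROPAGATE_INHERIT_ACE
+ *	no inherit flags set				do not inherit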
+ */ +static int +zfs_ace_can_use(umode_t obj_mode, uint16_t acep_flags) +{ + int iflags = (acep_flags & 0xf); + + if (S_ISDIR(obj_mode) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) + return (1); + else if (iflags & ACE_FILE_INHERIT_ACE) + return (!(S_ISDIR(obj_mode) && + (iflags & ACE_NO_PROPAGATE_INHERIT_ACE))); + return (0); +} + +/* + * inherit inheritable ACEs from parent + */ +static zfs_acl_t * +zfs_acl_inherit(zfsvfs_t *zfsvfs, umode_t va_mode, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) +{ + void *pacep = NULL; + void *acep; + zfs_acl_node_t *aclnode; + zfs_acl_t *aclp = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t iflags, newflags, type; + size_t ace_size; + void *data1, *data2; + size_t data1sz, data2sz; + uint_t aclinherit; + boolean_t isdir = S_ISDIR(va_mode); + boolean_t isreg = S_ISREG(va_mode); + + *need_chmod = B_TRUE; + + aclp = zfs_acl_alloc(paclp->z_version); + aclinherit = zfsvfs->z_acl_inherit; + if (aclinherit == ZFS_ACL_DISCARD || S_ISLNK(va_mode)) + return (aclp); + + while ((pacep = zfs_acl_next_ace(paclp, pacep, &who, + &access_mask, &iflags, &type))) { + + /* + * don't inherit bogus ACEs + */ + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + /* + * Check if ACE is inheritable by this vnode + */ + if ((aclinherit == ZFS_ACL_NOALLOW && type == ALLOW) || + !zfs_ace_can_use(va_mode, iflags)) + continue; + + /* + * If owner@, group@, or everyone@ inheritable + * then zfs_acl_chmod() isn't needed. + */ + if ((aclinherit == ZFS_ACL_PASSTHROUGH || + aclinherit == ZFS_ACL_PASSTHROUGH_X) && + ((iflags & (ACE_OWNER|ACE_EVERYONE)) || + ((iflags & OWNING_GROUP) == OWNING_GROUP)) && + (isreg || (isdir && (iflags & ACE_DIRECTORY_INHERIT_ACE)))) + *need_chmod = B_FALSE; + + /* + * Strip inherited execute permission from file if + * not in mode + */ + if (aclinherit == ZFS_ACL_PASSTHROUGH_X && type == ALLOW && + !isdir && ((mode & (S_IXUSR|S_IXGRP|S_IXOTH)) == 0)) { + access_mask &= ~ACE_EXECUTE; + } + + /* + * Strip write_acl and write_owner from permissions + * when inheriting an ACE + */ + if (aclinherit == ZFS_ACL_RESTRICTED && type == ALLOW) { + access_mask &= ~RESTRICTED_CLEAR; + } + + ace_size = aclp->z_ops->ace_size(pacep); + aclnode = zfs_acl_node_alloc(ace_size); + list_insert_tail(&aclp->z_acl, aclnode); + acep = aclnode->z_acldata; + + zfs_set_ace(aclp, acep, access_mask, type, + who, iflags|ACE_INHERITED_ACE); + + /* + * Copy special opaque data if any + */ + if ((data1sz = paclp->z_ops->ace_data(pacep, &data1)) != 0) { + VERIFY((data2sz = aclp->z_ops->ace_data(acep, + &data2)) == data1sz); + bcopy(data1, data2, data2sz); + } + + aclp->z_acl_count++; + aclnode->z_ace_count++; + aclp->z_acl_bytes += aclnode->z_size; + newflags = aclp->z_ops->ace_flags_get(acep); + + /* + * If ACE is not to be inherited further, or if the vnode is + * not a directory, remove all inheritance flags + */ + if (!isdir || (iflags & ACE_NO_PROPAGATE_INHERIT_ACE)) { + newflags &= ~ALL_INHERIT; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + continue; + } + + /* + * This directory has an inheritable ACE + */ + aclp->z_hints |= ZFS_INHERIT_ACE; + + /* + * If only FILE_INHERIT is set then turn on + * inherit_only + */ + if ((iflags & (ACE_FILE_INHERIT_ACE | + ACE_DIRECTORY_INHERIT_ACE)) == ACE_FILE_INHERIT_ACE) { + newflags |= ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } else { + newflags &= ~ACE_INHERIT_ONLY_ACE; + aclp->z_ops->ace_flags_set(acep, + newflags|ACE_INHERITED_ACE); + } + } + if 
(zfsvfs->z_acl_mode == ZFS_ACL_RESTRICTED &&
+	    aclp->z_acl_count != 0) {
+		*need_chmod = B_FALSE;
+	}
+
+	return (aclp);
+}
+
+/*
+ * Create file system object initial permissions
+ * including inheritable ACEs.
+ * Also, create FUIDs for owner and group.
+ */
+int
+zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr,
+    vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids)
+{
+	int		error;
+	zfsvfs_t	*zfsvfs = ZTOZSB(dzp);
+	zfs_acl_t	*paclp;
+	gid_t		gid = vap->va_gid;
+	boolean_t	need_chmod = B_TRUE;
+	boolean_t	trim = B_FALSE;
+	boolean_t	inherited = B_FALSE;
+
+	bzero(acl_ids, sizeof (zfs_acl_ids_t));
+	acl_ids->z_mode = vap->va_mode;
+
+	if (vsecp)
+		if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_mode, vsecp,
+		    cr, &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0)
+			return (error);
+
+	acl_ids->z_fuid = vap->va_uid;
+	acl_ids->z_fgid = vap->va_gid;
+#ifdef HAVE_KSID
+	/*
+	 * Determine uid and gid.
+	 */
+	if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay ||
+	    ((flag & IS_XATTR) && (S_ISDIR(vap->va_mode)))) {
+		acl_ids->z_fuid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_uid,
+		    cr, ZFS_OWNER, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid,
+		    cr, ZFS_GROUP, &acl_ids->z_fuidp);
+		gid = vap->va_gid;
+	} else {
+		acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER,
+		    cr, &acl_ids->z_fuidp);
+		acl_ids->z_fgid = 0;
+		if (vap->va_mask & AT_GID) {
+			acl_ids->z_fgid = zfs_fuid_create(zfsvfs,
+			    (uint64_t)vap->va_gid,
+			    cr, ZFS_GROUP, &acl_ids->z_fuidp);
+			gid = vap->va_gid;
+			if (acl_ids->z_fgid != KGID_TO_SGID(ZTOI(dzp)->i_gid) &&
+			    !groupmember(vap->va_gid, cr) &&
+			    secpolicy_vnode_create_gid(cr) != 0)
+				acl_ids->z_fgid = 0;
+		}
+		if (acl_ids->z_fgid == 0) {
+			if (dzp->z_mode & S_ISGID) {
+				char		*domain;
+				uint32_t	rid;
+
+				acl_ids->z_fgid = KGID_TO_SGID(
+				    ZTOI(dzp)->i_gid);
+				gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid,
+				    cr, ZFS_GROUP);
+
+				if (zfsvfs->z_use_fuids &&
+				    IS_EPHEMERAL(acl_ids->z_fgid)) {
+					domain = zfs_fuid_idx_domain(
+					    &zfsvfs->z_fuid_idx,
+					    FUID_INDEX(acl_ids->z_fgid));
+					rid = FUID_RID(acl_ids->z_fgid);
+					zfs_fuid_node_add(&acl_ids->z_fuidp,
+					    domain, rid,
+					    FUID_INDEX(acl_ids->z_fgid),
+					    acl_ids->z_fgid, ZFS_GROUP);
+				}
+			} else {
+				acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs,
+				    ZFS_GROUP, cr, &acl_ids->z_fuidp);
+				gid = crgetgid(cr);
+			}
+		}
+	}
+#endif /* HAVE_KSID */
+
+	/*
+	 * If we're creating a directory, and the parent directory has the
+	 * set-GID bit set, set it on the new directory.
+	 * Otherwise, if the user is neither privileged nor a member of the
+	 * file's new group, clear the file's set-GID bit.
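+	 *
+	 * For example, with a parent directory mode of 2775 a new
+	 * subdirectory is created with S_ISGID set, while a new plain
+	 * file keeps a requested S_ISGID bit only if the caller passes
+	 * the secpolicy_vnode_setids_setgids() check.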
+ */ + + if (!(flag & IS_ROOT_NODE) && (dzp->z_mode & S_ISGID) && + (S_ISDIR(vap->va_mode))) { + acl_ids->z_mode |= S_ISGID; + } else { + if ((acl_ids->z_mode & S_ISGID) && + secpolicy_vnode_setids_setgids(cr, gid) != 0) + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_acl_lock); + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && + (dzp->z_pflags & ZFS_INHERIT_ACE) && + !(dzp->z_pflags & ZFS_XATTR)) { + VERIFY(0 == zfs_acl_node_read(dzp, B_TRUE, + &paclp, B_FALSE)); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_mode, paclp, acl_ids->z_mode, &need_chmod); + inherited = B_TRUE; + } else { + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + mutex_exit(&dzp->z_lock); + mutex_exit(&dzp->z_acl_lock); + + if (need_chmod) { + if (S_ISDIR(vap->va_mode)) + acl_ids->z_aclp->z_hints |= + ZFS_ACL_AUTO_INHERIT; + + if (zfsvfs->z_acl_mode == ZFS_ACL_GROUPMASK && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && + zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) + trim = B_TRUE; + zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE, + trim, acl_ids->z_aclp); + } + } + + if (inherited || vsecp) { + acl_ids->z_mode = zfs_mode_compute(acl_ids->z_mode, + acl_ids->z_aclp, &acl_ids->z_aclp->z_hints, + acl_ids->z_fuid, acl_ids->z_fgid); + if (ace_trivial_common(acl_ids->z_aclp, 0, zfs_ace_walk) == 0) + acl_ids->z_aclp->z_hints |= ZFS_ACL_TRIVIAL; + } + + return (0); +} + +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} + +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zv, zfs_acl_ids_t *acl_ids, uint64_t projid) +{ + return (zfs_id_overquota(zv, DMU_USERUSED_OBJECT, acl_ids->z_fuid) || + zfs_id_overquota(zv, DMU_GROUPUSED_OBJECT, acl_ids->z_fgid) || + (projid != ZFS_DEFAULT_PROJID && projid != ZFS_INVALID_PROJID && + zfs_id_overquota(zv, DMU_PROJECTUSED_OBJECT, projid))); +} + +/* + * Retrieve a file's ACL + */ +int +zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfs_acl_t *aclp; + ulong_t mask; + int error; + int count = 0; + int largeace = 0; + + mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT | + VSA_ACE_ACLFLAGS | VSA_ACE_ALLTYPES); + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if ((error = zfs_zaccess(zp, ACE_READ_ACL, 0, skipaclchk, cr))) + return (error); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + /* + * Scan ACL to determine number of ACEs + */ + if ((zp->z_pflags & ZFS_ACL_OBJ_ACE) && !(mask & VSA_ACE_ALLTYPES)) { + void *zacep = NULL; + uint64_t who; + uint32_t access_mask; + uint16_t type, iflags; + + while ((zacep = zfs_acl_next_ace(aclp, zacep, + &who, &access_mask, &iflags, &type))) { + switch (type) { + case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: + case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE: + case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE: + case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE: + largeace++; + continue; + default: + count++; + } + } + vsecp->vsa_aclcnt = count; + } else + count = (int)aclp->z_acl_count; + + if (mask & VSA_ACECNT) { + vsecp->vsa_aclcnt = count; + } + + if (mask & VSA_ACE) { + size_t aclsz; + + aclsz = count * sizeof (ace_t) + + sizeof (ace_object_t) * largeace; + + 
vsecp->vsa_aclentp = kmem_alloc(aclsz, KM_SLEEP); + vsecp->vsa_aclentsz = aclsz; + + if (aclp->z_version == ZFS_ACL_VERSION_FUID) + zfs_copy_fuid_2_ace(ZTOZSB(zp), aclp, cr, + vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); + else { + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); + start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); + } + } + if (mask & VSA_ACE_ACLFLAGS) { + vsecp->vsa_aclflags = 0; + if (zp->z_pflags & ZFS_ACL_DEFAULTED) + vsecp->vsa_aclflags |= ACL_DEFAULTED; + if (zp->z_pflags & ZFS_ACL_PROTECTED) + vsecp->vsa_aclflags |= ACL_PROTECTED; + if (zp->z_pflags & ZFS_ACL_AUTO_INHERIT) + vsecp->vsa_aclflags |= ACL_AUTO_INHERIT; + } + + mutex_exit(&zp->z_acl_lock); + + return (0); +} + +int +zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, umode_t obj_mode, + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) +{ + zfs_acl_t *aclp; + zfs_acl_node_t *aclnode; + int aclcnt = vsecp->vsa_aclcnt; + int error; + + if (vsecp->vsa_aclcnt > MAX_ACL_ENTRIES || vsecp->vsa_aclcnt <= 0) + return (SET_ERROR(EINVAL)); + + aclp = zfs_acl_alloc(zfs_acl_version(zfsvfs->z_version)); + + aclp->z_hints = 0; + aclnode = zfs_acl_node_alloc(aclcnt * sizeof (zfs_object_ace_t)); + if (aclp->z_version == ZFS_ACL_VERSION_INITIAL) { + if ((error = zfs_copy_ace_2_oldace(obj_mode, aclp, + (ace_t *)vsecp->vsa_aclentp, aclnode->z_acldata, + aclcnt, &aclnode->z_size)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } else { + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_mode, aclp, + vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, + &aclnode->z_size, fuidp, cr)) != 0) { + zfs_acl_free(aclp); + zfs_acl_node_free(aclnode); + return (error); + } + } + aclp->z_acl_bytes = aclnode->z_size; + aclnode->z_ace_count = aclcnt; + aclp->z_acl_count = aclcnt; + list_insert_head(&aclp->z_acl, aclnode); + + /* + * If flags are being set then add them to z_hints + */ + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) { + if (vsecp->vsa_aclflags & ACL_PROTECTED) + aclp->z_hints |= ZFS_ACL_PROTECTED; + if (vsecp->vsa_aclflags & ACL_DEFAULTED) + aclp->z_hints |= ZFS_ACL_DEFAULTED; + if (vsecp->vsa_aclflags & ACL_AUTO_INHERIT) + aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; + } + + *zaclp = aclp; + + return (0); +} + +/* + * Set a file's ACL + */ +int +zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog = zfsvfs->z_log; + ulong_t mask = vsecp->vsa_mask & (VSA_ACE | VSA_ACECNT); + dmu_tx_t *tx; + int error; + zfs_acl_t *aclp; + zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; + uint64_t acl_obj; + + if (mask == 0) + return (SET_ERROR(ENOSYS)); + + if (zp->z_pflags & ZFS_IMMUTABLE) + return (SET_ERROR(EPERM)); + + if ((error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr))) + return (error); + + error = zfs_vsec_2_aclp(zfsvfs, ZTOI(zp)->i_mode, vsecp, cr, &fuidp, + &aclp); + if (error) + return (error); + + /* + * If ACL wide flags aren't being set then preserve any + * existing flags. 
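+	 *
+	 * For example, an ACL-wide hint such as ZFS_ACL_AUTO_INHERIT
+	 * already recorded in z_pflags survives a setacl request that
+	 * supplies only VSA_ACE entries.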
+ */ + if (!(vsecp->vsa_mask & VSA_ACE_ACLFLAGS)) { + aclp->z_hints |= + (zp->z_pflags & V4_ACL_WIDE_FLAGS); + } +top: + mutex_enter(&zp->z_acl_lock); + mutex_enter(&zp->z_lock); + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + + /* + * If old version and ACL won't fit in bonus and we aren't + * upgrading then take out necessary DMU holds + */ + + if ((acl_obj = zfs_external_acl(zp)) != 0) { + if (zfsvfs->z_version >= ZPL_VERSION_FUID && + zfs_znode_acl_version(zp) <= ZFS_ACL_VERSION_INITIAL) { + dmu_tx_hold_free(tx, acl_obj, 0, + DMU_OBJECT_END); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + aclp->z_acl_bytes); + } else { + dmu_tx_hold_write(tx, acl_obj, 0, aclp->z_acl_bytes); + } + } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); + } + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_NOWAIT); + if (error) { + mutex_exit(&zp->z_acl_lock); + mutex_exit(&zp->z_lock); + + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + zfs_acl_free(aclp); + return (error); + } + + error = zfs_aclset_common(zp, aclp, cr, tx); + ASSERT(error == 0); + ASSERT(zp->z_acl_cached == NULL); + zp->z_acl_cached = aclp; + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_log_acl(zilog, tx, zp, vsecp, fuidp); + + if (fuidp) + zfs_fuid_info_free(fuidp); + dmu_tx_commit(tx); + + mutex_exit(&zp->z_lock); + mutex_exit(&zp->z_acl_lock); + + return (error); +} + +/* + * Check accesses of interest (AoI) against attributes of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. + */ +static int +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) +{ + if ((v4_mode & WRITE_MASK) && (zfs_is_readonly(ZTOZSB(zp))) && + (!Z_ISDEV(ZTOI(zp)->i_mode) || + (Z_ISDEV(ZTOI(zp)->i_mode) && (v4_mode & WRITE_MASK_ATTRS)))) { + return (SET_ERROR(EROFS)); + } + + /* + * Only check for READONLY on non-directories. + */ + if ((v4_mode & WRITE_MASK_DATA) && + ((!S_ISDIR(ZTOI(zp)->i_mode) && + (zp->z_pflags & (ZFS_READONLY | ZFS_IMMUTABLE))) || + (S_ISDIR(ZTOI(zp)->i_mode) && + (zp->z_pflags & ZFS_IMMUTABLE)))) { + return (SET_ERROR(EPERM)); + } + + if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && + (zp->z_pflags & ZFS_NOUNLINK)) { + return (SET_ERROR(EPERM)); + } + + if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && + (zp->z_pflags & ZFS_AV_QUARANTINED))) { + return (SET_ERROR(EACCES)); + } + + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. 
Returns: + * 0 if all AoI granted + * EACCES if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. + */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t gowner; + uid_t fowner; + + zfs_fuid_map_ids(zp, cr, &fowner, &gowner); + + mutex_enter(&zp->z_acl_lock); + + error = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + if (error != 0) { + mutex_exit(&zp->z_acl_lock); + return (error); + } + + ASSERT(zp->z_acl_cached); + + while ((acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, + &iflags, &type))) { + uint32_t mask_matched; + + if (!zfs_acl_valid_ace_type(type, iflags)) + continue; + + if (S_ISDIR(ZTOI(zp)->i_mode) && + (iflags & ACE_INHERIT_ONLY_ACE)) + continue; + + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + + entry_type = (iflags & ACE_TYPE_FLAGS); + + checkit = B_FALSE; + + switch (entry_type) { + case ACE_OWNER: + if (uid == fowner) + checkit = B_TRUE; + break; + case OWNING_GROUP: + who = gowner; + /*FALLTHROUGH*/ + case ACE_IDENTIFIER_GROUP: + checkit = zfs_groupmember(zfsvfs, who, cr); + break; + case ACE_EVERYONE: + checkit = B_TRUE; + break; + + /* USER Entry */ + default: + if (entry_type == 0) { + uid_t newid; + + newid = zfs_fuid_map_id(zfsvfs, who, cr, + ZFS_ACE_USER); + if (newid != IDMAP_WK_CREATOR_OWNER_UID && + uid == newid) + checkit = B_TRUE; + break; + } else { + mutex_exit(&zp->z_acl_lock); + return (SET_ERROR(EIO)); + } + } + + if (checkit) { + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } + } + *working_mode &= ~mask_matched; + } + + /* Are we done? */ + if (*working_mode == 0) + break; + } + + mutex_exit(&zp->z_acl_lock); + + /* Put the found 'denies' back on the working mode */ + if (deny_mask) { + *working_mode |= deny_mask; + return (SET_ERROR(EACCES)); + } else if (*working_mode) { + return (-1); + } + + return (0); +} + +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. 
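+ *
+ * A sketch of the intended use:
+ *
+ *	if (!zfs_has_access(zp, cr))
+ *		return (SET_ERROR(EACCES));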
+ */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(ZTOZSB(zp), + KUID_TO_SUID(ZTOI(zp)->i_uid), cr, ZFS_OWNER); + return (secpolicy_vnode_any_access(cr, ZTOI(zp), owner) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + +static int +zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, + cred_t *cr) +{ + if (*working_mode != ACE_WRITE_DATA) + return (SET_ERROR(EACCES)); + + return (zfs_zaccess_common(zp, ACE_APPEND_DATA, working_mode, + check_privs, B_FALSE, cr)); +} + +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_pflags & ZFS_AV_QUARANTINED) + return (SET_ERROR(EACCES)); + + is_attr = ((zdp->z_pflags & ZFS_XATTR) && + (S_ISDIR(ZTOI(zdp)->i_mode))); + if (is_attr) + goto slow; + + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_pflags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (KUID_TO_SUID(ZTOI(zdp)->i_uid) != 0 || + KGID_TO_SGID(ZTOI(zdp)->i_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + if (uid == KUID_TO_SUID(ZTOI(zdp)->i_uid)) { + owner = B_TRUE; + if (zdp->z_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(KGID_TO_SGID(ZTOI(zdp)->i_gid), cr)) { + groupmbr = B_TRUE; + if (zdp->z_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(ZTOZSB(zdp)); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(ZTOZSB(zdp)); + return (error); +} + +/* + * Determine whether Access should be granted/denied. + * + * The least priv subsystem is always consulted as a basic privilege + * can define any form of access. 
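+ *
+ * A typical invocation, e.g. probing write permission with no special
+ * flags and the ACL check enabled:
+ *
+ *	error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr);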
+ */ +int +zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) +{ + uint32_t working_mode; + int error; + int is_attr; + boolean_t check_privs; + znode_t *xzp; + znode_t *check_zp = zp; + mode_t needed_bits; + uid_t owner; + + is_attr = ((zp->z_pflags & ZFS_XATTR) && S_ISDIR(ZTOI(zp)->i_mode)); + + /* + * If attribute then validate against base file + */ + if (is_attr) { + if ((error = zfs_zget(ZTOZSB(zp), + zp->z_xattr_parent, &xzp)) != 0) { + return (error); + } + + check_zp = xzp; + + /* + * fixup mode to map to xattr perms + */ + + if (mode & (ACE_WRITE_DATA|ACE_APPEND_DATA)) { + mode &= ~(ACE_WRITE_DATA|ACE_APPEND_DATA); + mode |= ACE_WRITE_NAMED_ATTRS; + } + + if (mode & (ACE_READ_DATA|ACE_EXECUTE)) { + mode &= ~(ACE_READ_DATA|ACE_EXECUTE); + mode |= ACE_READ_NAMED_ATTRS; + } + } + + owner = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOI(zp)->i_uid), + cr, ZFS_OWNER); + /* + * Map the bits required to the standard inode flags + * S_IRUSR|S_IWUSR|S_IXUSR in the needed_bits. Map the bits + * mapped by working_mode (currently missing) in missing_bits. + * Call secpolicy_vnode_access2() with (needed_bits & ~checkmode), + * needed_bits. + */ + needed_bits = 0; + + working_mode = mode; + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES)) && + owner == crgetuid(cr)) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= S_IRUSR; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + needed_bits |= S_IWUSR; + if (working_mode & ACE_EXECUTE) + needed_bits |= S_IXUSR; + + if ((error = zfs_zaccess_common(check_zp, mode, &working_mode, + &check_privs, skipaclchk, cr)) == 0) { + if (is_attr) + zrele(xzp); + return (secpolicy_vnode_access2(cr, ZTOI(zp), owner, + needed_bits, needed_bits)); + } + + if (error && !check_privs) { + if (is_attr) + zrele(xzp); + return (error); + } + + if (error && (flags & V_APPEND)) { + error = zfs_zaccess_append(zp, &working_mode, &check_privs, cr); + } + + if (error && check_privs) { + mode_t checkmode = 0; + + /* + * First check for implicit owner permission on + * read_acl/read_attributes + */ + + error = 0; + ASSERT(working_mode != 0); + + if ((working_mode & (ACE_READ_ACL|ACE_READ_ATTRIBUTES) && + owner == crgetuid(cr))) + working_mode &= ~(ACE_READ_ACL|ACE_READ_ATTRIBUTES); + + if (working_mode & (ACE_READ_DATA|ACE_READ_NAMED_ATTRS| + ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= S_IRUSR; + if (working_mode & (ACE_WRITE_DATA|ACE_WRITE_NAMED_ATTRS| + ACE_APPEND_DATA|ACE_WRITE_ATTRIBUTES|ACE_SYNCHRONIZE)) + checkmode |= S_IWUSR; + if (working_mode & ACE_EXECUTE) + checkmode |= S_IXUSR; + + error = secpolicy_vnode_access2(cr, ZTOI(check_zp), owner, + needed_bits & ~checkmode, needed_bits); + + if (error == 0 && (working_mode & ACE_WRITE_OWNER)) + error = secpolicy_vnode_chown(cr, owner); + if (error == 0 && (working_mode & ACE_WRITE_ACL)) + error = secpolicy_vnode_setdac(cr, owner); + + if (error == 0 && (working_mode & + (ACE_DELETE|ACE_DELETE_CHILD))) + error = secpolicy_vnode_remove(cr); + + if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { + error = secpolicy_vnode_chown(cr, owner); + } + if (error == 0) { + /* + * See if any bits other than those already checked + * for are still present. 
If so then return EACCES + */ + if (working_mode & ~(ZFS_CHECKED_MASKS)) { + error = SET_ERROR(EACCES); + } + } + } else if (error == 0) { + error = secpolicy_vnode_access2(cr, ZTOI(zp), owner, + needed_bits, needed_bits); + } + + if (is_attr) + zrele(xzp); + + return (error); +} + +/* + * Translate traditional unix S_IRUSR/S_IWUSR/S_IXUSR mode into + * native ACL format and call zfs_zaccess() + */ +int +zfs_zaccess_rwx(znode_t *zp, mode_t mode, int flags, cred_t *cr) +{ + return (zfs_zaccess(zp, zfs_unix_to_v4(mode >> 6), flags, B_FALSE, cr)); +} + +/* + * Access function for secpolicy_vnode_setattr + */ +int +zfs_zaccess_unix(znode_t *zp, mode_t mode, cred_t *cr) +{ + int v4_mode = zfs_unix_to_v4(mode >> 6); + + return (zfs_zaccess(zp, v4_mode, 0, B_FALSE, cr)); +} + +/* See zfs_zaccess_delete() */ +int zfs_write_implies_delete_child = 1; + +/* + * Determine whether delete access should be granted. + * + * The following chart outlines how we handle delete permissions which is + * how recent versions of windows (Windows 2008) handles it. The efficiency + * comes from not having to check the parent ACL where the object itself grants + * delete: + * + * ------------------------------------------------------- + * | Parent Dir | Target Object Permissions | + * | permissions | | + * ------------------------------------------------------- + * | | ACL Allows | ACL Denies| Delete | + * | | Delete | Delete | unspecified| + * ------------------------------------------------------- + * | ACL Allows | Permit | Deny * | Permit | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL Denies | Permit | Deny | Deny | + * | DELETE_CHILD | | | | + * ------------------------------------------------------- + * | ACL specifies | | | | + * | only allow | Permit | Deny * | Permit | + * | write and | | | | + * | execute | | | | + * ------------------------------------------------------- + * | ACL denies | | | | + * | write and | Permit | Deny | Deny | + * | execute | | | | + * ------------------------------------------------------- + * ^ + * | + * Re. execute permission on the directory: if that's missing, + * the vnode lookup of the target will fail before we get here. + * + * Re [*] in the table above: NFSv4 would normally Permit delete for + * these two cells of the matrix. + * See acl.h for notes on which ACE_... flags should be checked for which + * operations. Specifically, the NFSv4 committee recommendation is in + * conflict with the Windows interpretation of DENY ACEs, where DENY ACEs + * should take precedence ahead of ALLOW ACEs. + * + * This implementation always consults the target object's ACL first. + * If a DENY ACE is present on the target object that specifies ACE_DELETE, + * delete access is denied. If an ALLOW ACE with ACE_DELETE is present on + * the target object, access is allowed. If and only if no entries with + * ACE_DELETE are present in the object's ACL, check the container's ACL + * for entries with ACE_DELETE_CHILD. + * + * A summary of the logic implemented from the table above is as follows: + * + * First check for DENY ACEs that apply. + * If either target or container has a deny, EACCES. + * + * Delete access can then be summarized as follows: + * 1: The object to be deleted grants ACE_DELETE, or + * 2: The containing directory grants ACE_DELETE_CHILD. + * In a Windows system, that would be the end of the story. + * In this system, (2) has some complications... 
+ * 2a: "sticky" bit on a directory adds restrictions, and + * 2b: existing ACEs from previous versions of ZFS may + * not carry ACE_DELETE_CHILD where they should, so we + * also allow delete when ACE_WRITE_DATA is granted. + * + * Note: 2b is technically a work-around for a prior bug, + * which hopefully can go away some day. For those who + * no longer need the work around, and for testing, this + * work-around is made conditional via the tunable: + * zfs_write_implies_delete_child + */ +int +zfs_zaccess_delete(znode_t *dzp, znode_t *zp, cred_t *cr) +{ + uint32_t wanted_dirperms; + uint32_t dzp_working_mode = 0; + uint32_t zp_working_mode = 0; + int dzp_error, zp_error; + boolean_t dzpcheck_privs; + boolean_t zpcheck_privs; + + if (zp->z_pflags & (ZFS_IMMUTABLE | ZFS_NOUNLINK)) + return (SET_ERROR(EPERM)); + + /* + * Case 1: + * If target object grants ACE_DELETE then we are done. This is + * indicated by a return value of 0. For this case we don't worry + * about the sticky bit because sticky only applies to the parent + * directory and this is the child access result. + * + * If we encounter a DENY ACE here, we're also done (EACCES). + * Note that if we hit a DENY ACE here (on the target) it should + * take precedence over a DENY ACE on the container, so that when + * we have more complete auditing support we will be able to + * report an access failure against the specific target. + * (This is part of why we're checking the target first.) + */ + zp_error = zfs_zaccess_common(zp, ACE_DELETE, &zp_working_mode, + &zpcheck_privs, B_FALSE, cr); + if (zp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!zpcheck_privs) + return (SET_ERROR(zp_error)); + return (secpolicy_vnode_remove(cr)); + + } + if (zp_error == 0) + return (0); + + /* + * Case 2: + * If the containing directory grants ACE_DELETE_CHILD, + * or we're in backward compatibility mode and the + * containing directory has ACE_WRITE_DATA, allow. + * Case 2b is handled with wanted_dirperms. + */ + wanted_dirperms = ACE_DELETE_CHILD; + if (zfs_write_implies_delete_child) + wanted_dirperms |= ACE_WRITE_DATA; + dzp_error = zfs_zaccess_common(dzp, wanted_dirperms, + &dzp_working_mode, &dzpcheck_privs, B_FALSE, cr); + if (dzp_error == EACCES) { + /* We hit a DENY ACE. */ + if (!dzpcheck_privs) + return (SET_ERROR(dzp_error)); + return (secpolicy_vnode_remove(cr)); + } + + /* + * Cases 2a, 2b (continued) + * + * Note: dzp_working_mode now contains any permissions + * that were NOT granted. Therefore, if any of the + * wanted_dirperms WERE granted, we will have: + * dzp_working_mode != wanted_dirperms + * We're really asking if ANY of those permissions + * were granted, and if so, grant delete access. + */ + if (dzp_working_mode != wanted_dirperms) + dzp_error = 0; + + /* + * dzp_error is 0 if the container granted us permissions to "modify". + * If we do not have permission via one or more ACEs, our current + * privileges may still permit us to modify the container. + * + * dzpcheck_privs is false when i.e. the FS is read-only. + * Otherwise, do privilege checks for the container. + */ + if (dzp_error != 0 && dzpcheck_privs) { + uid_t owner; + + /* + * The secpolicy call needs the requested access and + * the current access mode of the container, but it + * only knows about Unix-style modes (VEXEC, VWRITE), + * so this must condense the fine-grained ACE bits into + * Unix modes. + * + * The VEXEC flag is easy, because we know that has + * always been checked before we get here (during the + * lookup of the target vnode). 
The container has not
+		 * granted us permissions to "modify", so we do not set
+		 * the VWRITE flag in the current access mode.
+		 */
+		owner = zfs_fuid_map_id(ZTOZSB(dzp),
+		    KUID_TO_SUID(ZTOI(dzp)->i_uid), cr, ZFS_OWNER);
+		dzp_error = secpolicy_vnode_access2(cr, ZTOI(dzp),
+		    owner, S_IXUSR, S_IWUSR|S_IXUSR);
+	}
+	if (dzp_error != 0) {
+		/*
+		 * Note: We may have dzp_error = -1 here (from
+		 * zfs_zaccess_common).  Don't return that.
+		 */
+		return (SET_ERROR(EACCES));
+	}
+
+
+	/*
+	 * At this point, we know that the directory permissions allow
+	 * us to modify, but we still need to check for the additional
+	 * restrictions that apply when the "sticky bit" is set.
+	 *
+	 * Yes, zfs_sticky_remove_access() also checks this bit, but
+	 * checking it here and skipping the call below is nice when
+	 * you're watching all of this with dtrace.
+	 */
+	if ((dzp->z_mode & S_ISVTX) == 0)
+		return (0);
+
+	/*
+	 * zfs_sticky_remove_access will succeed if:
+	 * 1. The sticky bit is absent.
+	 * 2. We pass the sticky bit restrictions.
+	 * 3. We have privileges that always allow file removal.
+	 */
+	return (zfs_sticky_remove_access(dzp, zp, cr));
+}
+
+int
+zfs_zaccess_rename(znode_t *sdzp, znode_t *szp, znode_t *tdzp,
+    znode_t *tzp, cred_t *cr)
+{
+	int add_perm;
+	int error;
+
+	if (szp->z_pflags & ZFS_AV_QUARANTINED)
+		return (SET_ERROR(EACCES));
+
+	add_perm = S_ISDIR(ZTOI(szp)->i_mode) ?
+	    ACE_ADD_SUBDIRECTORY : ACE_ADD_FILE;
+
+	/*
+	 * Rename permissions are a combination of delete permission +
+	 * add file/subdir permission.
+	 */
+
+	/*
+	 * First make sure we do the delete portion.
+	 *
+	 * If that succeeds then check for add_file/add_subdir permissions.
+	 */
+
+	if ((error = zfs_zaccess_delete(sdzp, szp, cr)))
+		return (error);
+
+	/*
+	 * If we have a tzp, see if we can delete it.
+	 */
+	if (tzp) {
+		if ((error = zfs_zaccess_delete(tdzp, tzp, cr)))
+			return (error);
+	}
+
+	/*
+	 * Now check for add permissions
+	 */
+	error = zfs_zaccess(tdzp, add_perm, 0, B_FALSE, cr);
+
+	return (error);
+}
diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c
new file mode 100644
index 000000000000..c2748ce450b4
--- /dev/null
+++ b/module/os/linux/zfs/zfs_ctldir.c
@@ -0,0 +1,1241 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ *
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (C) 2011 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * LLNL-CODE-403049.
+ * Rewritten for Linux by:
+ *   Rohan Puri
+ *   Brian Behlendorf
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright (c) 2018 George Melikov.  All Rights Reserved.
+ * Copyright (c) 2019 Datto, Inc. All rights reserved.
+ */
+
+/*
+ * ZFS control directory (a.k.a. ".zfs")
+ *
+ * This directory provides a common location for all ZFS meta-objects.
+ * Currently, this is only the 'snapshot' and 'shares' directory, but this may
+ * expand in the future.  The elements are built dynamically, as the hierarchy
+ * does not actually exist on disk.
+ *
+ * For 'snapshot', we don't want to have all snapshots always mounted, because
+ * this would take up a huge amount of space in /etc/mnttab.  We have three
+ * types of objects:
+ *
+ *	ctldir ------> snapshotdir -------> snapshot
+ *					       |
+ *					       |
+ *					       V
+ *					   mounted fs
+ *
+ * The 'snapshot' node contains just enough information to lookup '..' and act
+ * as a mountpoint for the snapshot.  Whenever we lookup a specific snapshot, we
+ * perform an automount of the underlying filesystem and return the
+ * corresponding inode.
+ *
+ * All mounts are handled automatically by a user mode helper which invokes
+ * the mount procedure.  Unmounts are handled by allowing the mount
+ * point to expire so the kernel may automatically unmount it.
+ *
+ * The '.zfs', '.zfs/snapshot', and all directories created under
+ * '.zfs/snapshot' (ie: '.zfs/snapshot/<snapname>') all share the same
+ * zfsvfs_t as the head filesystem (what '.zfs' lives under).
+ *
+ * File systems mounted on top of the '.zfs/snapshot/<snapname>' paths
+ * (ie: snapshots) are complete ZFS filesystems and have their own unique
+ * zfsvfs_t.  However, the fsid reported by these mounts will be the same
+ * as that used by the parent zfsvfs_t to make NFS happy.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "zfs_namecheck.h"
+
+/*
+ * Two AVL trees are maintained which contain all currently automounted
+ * snapshots.  Every automounted snapshot maps to a single zfs_snapentry_t
+ * entry which MUST:
+ *
+ *   - be attached to both trees, and
+ *   - be unique, no duplicate entries are allowed.
+ *
+ * The zfs_snapshots_by_name tree is indexed by the full dataset name
+ * while the zfs_snapshots_by_objsetid tree is indexed by the unique
+ * objsetid.  This allows for fast lookups either by name or objsetid.
+ */
+static avl_tree_t zfs_snapshots_by_name;
+static avl_tree_t zfs_snapshots_by_objsetid;
+static krwlock_t zfs_snapshot_lock;
+
+/*
+ * Control Directory Tunables (.zfs)
+ */
+int zfs_expire_snapshot = ZFSCTL_EXPIRE_SNAPSHOT;
+int zfs_admin_snapshot = 0;
+
+typedef struct {
+	char		*se_name;	/* full snapshot name */
+	char		*se_path;	/* full mount path */
+	spa_t		*se_spa;	/* pool spa */
+	uint64_t	se_objsetid;	/* snapshot objset id */
+	struct dentry	*se_root_dentry; /* snapshot root dentry */
+	taskqid_t	se_taskqid;	/* scheduled unmount taskqid */
+	avl_node_t	se_node_name;	/* zfs_snapshots_by_name link */
+	avl_node_t	se_node_objsetid; /* zfs_snapshots_by_objsetid link */
+	zfs_refcount_t	se_refcount;	/* reference count */
+} zfs_snapentry_t;
+
+static void zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay);
+
+/*
+ * Allocate a new zfs_snapentry_t being careful to make a copy of the
+ * snapshot name and provided mount point.  No reference is taken.
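+ *
+ * A minimal usage sketch; the entry is published under the
+ * zfs_snapshot_lock write lock, and zfsctl_snapshot_add() takes the
+ * first reference:
+ *
+ *	se = zfsctl_snapshot_alloc(full_name, full_path, spa,
+ *	    objsetid, root_dentry);
+ *	rw_enter(&zfs_snapshot_lock, RW_WRITER);
+ *	zfsctl_snapshot_add(se);
+ *	rw_exit(&zfs_snapshot_lock);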
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_alloc(char *full_name, char *full_path, spa_t *spa,
+    uint64_t objsetid, struct dentry *root_dentry)
+{
+	zfs_snapentry_t *se;
+
+	se = kmem_zalloc(sizeof (zfs_snapentry_t), KM_SLEEP);
+
+	se->se_name = kmem_strdup(full_name);
+	se->se_path = kmem_strdup(full_path);
+	se->se_spa = spa;
+	se->se_objsetid = objsetid;
+	se->se_root_dentry = root_dentry;
+	se->se_taskqid = TASKQID_INVALID;
+
+	zfs_refcount_create(&se->se_refcount);
+
+	return (se);
+}
+
+/*
+ * Free a zfs_snapentry_t.  The caller must ensure there are no active
+ * references.
+ */
+static void
+zfsctl_snapshot_free(zfs_snapentry_t *se)
+{
+	zfs_refcount_destroy(&se->se_refcount);
+	kmem_strfree(se->se_name);
+	kmem_strfree(se->se_path);
+
+	kmem_free(se, sizeof (zfs_snapentry_t));
+}
+
+/*
+ * Hold a reference on the zfs_snapentry_t.
+ */
+static void
+zfsctl_snapshot_hold(zfs_snapentry_t *se)
+{
+	zfs_refcount_add(&se->se_refcount, NULL);
+}
+
+/*
+ * Release a reference on the zfs_snapentry_t.  When the number of
+ * references drops to zero the structure will be freed.
+ */
+static void
+zfsctl_snapshot_rele(zfs_snapentry_t *se)
+{
+	if (zfs_refcount_remove(&se->se_refcount, NULL) == 0)
+		zfsctl_snapshot_free(se);
+}
+
+/*
+ * Add a zfs_snapentry_t to both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  While the zfs_snapentry_t is part
+ * of the trees a reference is held.
+ */
+static void
+zfsctl_snapshot_add(zfs_snapentry_t *se)
+{
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+	zfsctl_snapshot_hold(se);
+	avl_add(&zfs_snapshots_by_name, se);
+	avl_add(&zfs_snapshots_by_objsetid, se);
+}
+
+/*
+ * Remove a zfs_snapentry_t from both the zfs_snapshots_by_name and
+ * zfs_snapshots_by_objsetid trees.  Upon removal a reference is dropped;
+ * this can result in the structure being freed if that was the last
+ * remaining reference.
+ */
+static void
+zfsctl_snapshot_remove(zfs_snapentry_t *se)
+{
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+	avl_remove(&zfs_snapshots_by_name, se);
+	avl_remove(&zfs_snapshots_by_objsetid, se);
+	zfsctl_snapshot_rele(se);
+}
+
+/*
+ * Snapshot name comparison function for the zfs_snapshots_by_name tree.
+ */
+static int
+snapentry_compare_by_name(const void *a, const void *b)
+{
+	const zfs_snapentry_t *se_a = a;
+	const zfs_snapentry_t *se_b = b;
+	int ret;
+
+	ret = strcmp(se_a->se_name, se_b->se_name);
+
+	if (ret < 0)
+		return (-1);
+	else if (ret > 0)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Comparison function for the zfs_snapshots_by_objsetid tree; entries
+ * are ordered by spa and then objsetid.
+ */
+static int
+snapentry_compare_by_objsetid(const void *a, const void *b)
+{
+	const zfs_snapentry_t *se_a = a;
+	const zfs_snapentry_t *se_b = b;
+
+	if (se_a->se_spa != se_b->se_spa)
+		return ((ulong_t)se_a->se_spa < (ulong_t)se_b->se_spa ? -1 : 1);
+
+	if (se_a->se_objsetid < se_b->se_objsetid)
+		return (-1);
+	else if (se_a->se_objsetid > se_b->se_objsetid)
+		return (1);
+	else
+		return (0);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_name.  If the snapname
+ * is found a pointer to the zfs_snapentry_t is returned and a reference
+ * taken on the structure.  The caller is responsible for dropping the
+ * reference with zfsctl_snapshot_rele().  If the snapname is not found
+ * NULL will be returned.
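+ *
+ * Sketch of the expected lookup pattern:
+ *
+ *	rw_enter(&zfs_snapshot_lock, RW_READER);
+ *	if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) {
+ *		... use se ...
+ *		zfsctl_snapshot_rele(se);
+ *	}
+ *	rw_exit(&zfs_snapshot_lock);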
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_name(char *snapname)
+{
+	zfs_snapentry_t *se, search;
+
+	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+	search.se_name = snapname;
+	se = avl_find(&zfs_snapshots_by_name, &search, NULL);
+	if (se)
+		zfsctl_snapshot_hold(se);
+
+	return (se);
+}
+
+/*
+ * Find a zfs_snapentry_t in zfs_snapshots_by_objsetid given the objset id
+ * rather than the snapname.  In all other respects it behaves the same
+ * as zfsctl_snapshot_find_by_name().
+ */
+static zfs_snapentry_t *
+zfsctl_snapshot_find_by_objsetid(spa_t *spa, uint64_t objsetid)
+{
+	zfs_snapentry_t *se, search;
+
+	ASSERT(RW_LOCK_HELD(&zfs_snapshot_lock));
+
+	search.se_spa = spa;
+	search.se_objsetid = objsetid;
+	se = avl_find(&zfs_snapshots_by_objsetid, &search, NULL);
+	if (se)
+		zfsctl_snapshot_hold(se);
+
+	return (se);
+}
+
+/*
+ * Rename a zfs_snapentry_t in the zfs_snapshots_by_name tree.  The
+ * structure is removed, renamed, and added back to its new correct
+ * location in the tree.
+ */
+static int
+zfsctl_snapshot_rename(char *old_snapname, char *new_snapname)
+{
+	zfs_snapentry_t *se;
+
+	ASSERT(RW_WRITE_HELD(&zfs_snapshot_lock));
+
+	se = zfsctl_snapshot_find_by_name(old_snapname);
+	if (se == NULL)
+		return (SET_ERROR(ENOENT));
+
+	zfsctl_snapshot_remove(se);
+	kmem_strfree(se->se_name);
+	se->se_name = kmem_strdup(new_snapname);
+	zfsctl_snapshot_add(se);
+	zfsctl_snapshot_rele(se);
+
+	return (0);
+}
+
+/*
+ * Delayed task responsible for unmounting an expired automounted snapshot.
+ */
+static void
+snapentry_expire(void *data)
+{
+	zfs_snapentry_t *se = (zfs_snapentry_t *)data;
+	spa_t *spa = se->se_spa;
+	uint64_t objsetid = se->se_objsetid;
+
+	if (zfs_expire_snapshot <= 0) {
+		zfsctl_snapshot_rele(se);
+		return;
+	}
+
+	se->se_taskqid = TASKQID_INVALID;
+	(void) zfsctl_snapshot_unmount(se->se_name, MNT_EXPIRE);
+	zfsctl_snapshot_rele(se);
+
+	/*
+	 * Reschedule the unmount if the zfs_snapentry_t wasn't removed.
+	 * This can occur when the snapshot is busy.
+	 */
+	rw_enter(&zfs_snapshot_lock, RW_READER);
+	if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) {
+		zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot);
+		zfsctl_snapshot_rele(se);
+	}
+	rw_exit(&zfs_snapshot_lock);
+}
+
+/*
+ * Cancel an automatic unmount of a snapname.  This callback is responsible
+ * for dropping the reference on the zfs_snapentry_t which was taken during
+ * dispatch.
+ */
+static void
+zfsctl_snapshot_unmount_cancel(zfs_snapentry_t *se)
+{
+	if (taskq_cancel_id(system_delay_taskq, se->se_taskqid) == 0) {
+		se->se_taskqid = TASKQID_INVALID;
+		zfsctl_snapshot_rele(se);
+	}
+}
+
+/*
+ * Dispatch the unmount task for delayed handling with a hold protecting it.
+ */
+static void
+zfsctl_snapshot_unmount_delay_impl(zfs_snapentry_t *se, int delay)
+{
+	ASSERT3S(se->se_taskqid, ==, TASKQID_INVALID);
+
+	if (delay <= 0)
+		return;
+
+	zfsctl_snapshot_hold(se);
+	se->se_taskqid = taskq_dispatch_delay(system_delay_taskq,
+	    snapentry_expire, se, TQ_SLEEP, ddi_get_lbolt() + delay * HZ);
+}
+
+/*
+ * Schedule an automatic unmount of objset id to occur in delay seconds from
+ * now.  Any previous delayed unmount will be cancelled in favor of the
+ * updated deadline.  A reference is taken by zfsctl_snapshot_find_by_objsetid()
+ * and held until the outstanding task is handled or cancelled.
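+ *
+ * The delay normally comes from the zfs_expire_snapshot module parameter
+ * declared at the end of this file, so expiry can be tuned at run time
+ * (editor's note; the value 300 below is only an illustration):
+ *
+ *	# Unmount idle automounted snapshots after 300 seconds; a value
+ *	# <= 0 disables expiry entirely (see snapentry_expire() above).
+ *	echo 300 > /sys/module/zfs/parameters/zfs_expire_snapshot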
+ */ +int +zfsctl_snapshot_unmount_delay(spa_t *spa, uint64_t objsetid, int delay) +{ + zfs_snapentry_t *se; + int error = ENOENT; + + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_objsetid(spa, objsetid)) != NULL) { + zfsctl_snapshot_unmount_cancel(se); + zfsctl_snapshot_unmount_delay_impl(se, delay); + zfsctl_snapshot_rele(se); + error = 0; + } + rw_exit(&zfs_snapshot_lock); + + return (error); +} + +/* + * Check if snapname is currently mounted. Returned non-zero when mounted + * and zero when unmounted. + */ +static boolean_t +zfsctl_snapshot_ismounted(char *snapname) +{ + zfs_snapentry_t *se; + boolean_t ismounted = B_FALSE; + + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_name(snapname)) != NULL) { + zfsctl_snapshot_rele(se); + ismounted = B_TRUE; + } + rw_exit(&zfs_snapshot_lock); + + return (ismounted); +} + +/* + * Check if the given inode is a part of the virtual .zfs directory. + */ +boolean_t +zfsctl_is_node(struct inode *ip) +{ + return (ITOZ(ip)->z_is_ctldir); +} + +/* + * Check if the given inode is a .zfs/snapshots/snapname directory. + */ +boolean_t +zfsctl_is_snapdir(struct inode *ip) +{ + return (zfsctl_is_node(ip) && (ip->i_ino <= ZFSCTL_INO_SNAPDIRS)); +} + +/* + * Allocate a new inode with the passed id and ops. + */ +static struct inode * +zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, + const struct file_operations *fops, const struct inode_operations *ops) +{ + inode_timespec_t now; + struct inode *ip; + znode_t *zp; + + ip = new_inode(zfsvfs->z_sb); + if (ip == NULL) + return (NULL); + + now = current_time(ip); + zp = ITOZ(ip); + ASSERT3P(zp->z_dirlocks, ==, NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); + zp->z_id = id; + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_zn_prefetch = B_FALSE; + zp->z_moved = B_FALSE; + zp->z_is_sa = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_TRUE; + zp->z_is_stale = B_FALSE; + zp->z_sa_hdl = NULL; + zp->z_blksz = 0; + zp->z_seq = 0; + zp->z_mapcnt = 0; + zp->z_size = 0; + zp->z_pflags = 0; + zp->z_mode = 0; + zp->z_sync_cnt = 0; + ip->i_generation = 0; + ip->i_ino = id; + ip->i_mode = (S_IFDIR | S_IRWXUGO); + ip->i_uid = SUID_TO_KUID(0); + ip->i_gid = SGID_TO_KGID(0); + ip->i_blkbits = SPA_MINBLOCKSHIFT; + ip->i_atime = now; + ip->i_mtime = now; + ip->i_ctime = now; + ip->i_fop = fops; + ip->i_op = ops; +#if defined(IOP_XATTR) + ip->i_opflags &= ~IOP_XATTR; +#endif + + if (insert_inode_locked(ip)) { + unlock_new_inode(ip); + iput(ip); + return (NULL); + } + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + mutex_exit(&zfsvfs->z_znodes_lock); + + unlock_new_inode(ip); + + return (ip); +} + +/* + * Lookup the inode with given id, it will be allocated if needed. + */ +static struct inode * +zfsctl_inode_lookup(zfsvfs_t *zfsvfs, uint64_t id, + const struct file_operations *fops, const struct inode_operations *ops) +{ + struct inode *ip = NULL; + + while (ip == NULL) { + ip = ilookup(zfsvfs->z_sb, (unsigned long)id); + if (ip) + break; + + /* May fail due to concurrent zfsctl_inode_alloc() */ + ip = zfsctl_inode_alloc(zfsvfs, id, fops, ops); + } + + return (ip); +} + +/* + * Create the '.zfs' directory. This directory is cached as part of the VFS + * structure. This results in a hold on the zfsvfs_t. The code in zfs_umount() + * therefore checks against a vfs_count of 2 instead of 1. 
This reference + * is removed when the ctldir is destroyed in the unmount. All other entities + * under the '.zfs' directory are created dynamically as needed. + * + * Because the dynamically created '.zfs' directory entries assume the use + * of 64-bit inode numbers this support must be disabled on 32-bit systems. + */ +int +zfsctl_create(zfsvfs_t *zfsvfs) +{ + ASSERT(zfsvfs->z_ctldir == NULL); + + zfsvfs->z_ctldir = zfsctl_inode_alloc(zfsvfs, ZFSCTL_INO_ROOT, + &zpl_fops_root, &zpl_ops_root); + if (zfsvfs->z_ctldir == NULL) + return (SET_ERROR(ENOENT)); + + return (0); +} + +/* + * Destroy the '.zfs' directory or remove a snapshot from zfs_snapshots_by_name. + * Only called when the filesystem is unmounted. + */ +void +zfsctl_destroy(zfsvfs_t *zfsvfs) +{ + if (zfsvfs->z_issnap) { + zfs_snapentry_t *se; + spa_t *spa = zfsvfs->z_os->os_spa; + uint64_t objsetid = dmu_objset_id(zfsvfs->z_os); + + rw_enter(&zfs_snapshot_lock, RW_WRITER); + se = zfsctl_snapshot_find_by_objsetid(spa, objsetid); + if (se != NULL) + zfsctl_snapshot_remove(se); + rw_exit(&zfs_snapshot_lock); + if (se != NULL) { + zfsctl_snapshot_unmount_cancel(se); + zfsctl_snapshot_rele(se); + } + } else if (zfsvfs->z_ctldir) { + iput(zfsvfs->z_ctldir); + zfsvfs->z_ctldir = NULL; + } +} + +/* + * Given a root znode, retrieve the associated .zfs directory. + * Add a hold to the vnode and return it. + */ +struct inode * +zfsctl_root(znode_t *zp) +{ + ASSERT(zfs_has_ctldir(zp)); + igrab(ZTOZSB(zp)->z_ctldir); + return (ZTOZSB(zp)->z_ctldir); +} + +/* + * Generate a long fid to indicate a snapdir. We encode whether snapdir is + * already mounted in gen field. We do this because nfsd lookup will not + * trigger automount. Next time the nfsd does fh_to_dentry, we will notice + * this and do automount and return ESTALE to force nfsd revalidate and follow + * mount. + */ +static int +zfsctl_snapdir_fid(struct inode *ip, fid_t *fidp) +{ + zfid_short_t *zfid = (zfid_short_t *)fidp; + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint32_t gen = 0; + uint64_t object; + uint64_t objsetid; + int i; + struct dentry *dentry; + + if (fidp->fid_len < LONG_FID_LEN) { + fidp->fid_len = LONG_FID_LEN; + return (SET_ERROR(ENOSPC)); + } + + object = ip->i_ino; + objsetid = ZFSCTL_INO_SNAPDIRS - ip->i_ino; + zfid->zf_len = LONG_FID_LEN; + + dentry = d_obtain_alias(igrab(ip)); + if (!IS_ERR(dentry)) { + gen = !!d_mountpoint(dentry); + dput(dentry); + } + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + zlfid->zf_setid[i] = (uint8_t)(objsetid >> (8 * i)); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + zlfid->zf_setgen[i] = 0; + + return (0); +} + +/* + * Generate an appropriate fid for an entry in the .zfs directory. 
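+ *
+ * (Editor's note) Snapshot directory inode numbers are assigned downward
+ * from ZFSCTL_INO_SNAPDIRS, so the fid code above and the lookup code in
+ * zfsctl_snapdir_lookup() and zfsctl_snapdir_vget() convert between the
+ * two with simple arithmetic:
+ *
+ *	ino      = ZFSCTL_INO_SNAPDIRS - objsetid;
+ *	objsetid = ZFSCTL_INO_SNAPDIRS - ino;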
+ */ +int +zfsctl_fid(struct inode *ip, fid_t *fidp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int i; + + ZFS_ENTER(zfsvfs); + + if (zfsctl_is_snapdir(ip)) { + ZFS_EXIT(zfsvfs); + return (zfsctl_snapdir_fid(ip, fidp)); + } + + if (fidp->fid_len < SHORT_FID_LEN) { + fidp->fid_len = SHORT_FID_LEN; + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOSPC)); + } + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = SHORT_FID_LEN; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* .zfs znodes always have a generation number of 0 */ + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = 0; + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Construct a full dataset name in full_name: "pool/dataset@snap_name" + */ +static int +zfsctl_snapshot_name(zfsvfs_t *zfsvfs, const char *snap_name, int len, + char *full_name) +{ + objset_t *os = zfsvfs->z_os; + + if (zfs_component_namecheck(snap_name, NULL, NULL) != 0) + return (SET_ERROR(EILSEQ)); + + dmu_objset_name(os, full_name); + if ((strlen(full_name) + 1 + strlen(snap_name)) >= len) + return (SET_ERROR(ENAMETOOLONG)); + + (void) strcat(full_name, "@"); + (void) strcat(full_name, snap_name); + + return (0); +} + +/* + * Returns full path in full_path: "/pool/dataset/.zfs/snapshot/snap_name/" + */ +static int +zfsctl_snapshot_path_objset(zfsvfs_t *zfsvfs, uint64_t objsetid, + int path_len, char *full_path) +{ + objset_t *os = zfsvfs->z_os; + fstrans_cookie_t cookie; + char *snapname; + boolean_t case_conflict; + uint64_t id, pos = 0; + int error = 0; + + if (zfsvfs->z_vfs->vfs_mntpoint == NULL) + return (SET_ERROR(ENOENT)); + + cookie = spl_fstrans_mark(); + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + while (error == 0) { + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dmu_snapshot_list_next(zfsvfs->z_os, + ZFS_MAX_DATASET_NAME_LEN, snapname, &id, &pos, + &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto out; + + if (id == objsetid) + break; + } + + snprintf(full_path, path_len, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint, snapname); +out: + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + spl_fstrans_unmark(cookie); + + return (error); +} + +/* + * Special case the handling of "..". + */ +int +zfsctl_root_lookup(struct inode *dip, char *name, struct inode **ipp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + int error = 0; + + ZFS_ENTER(zfsvfs); + + if (strcmp(name, "..") == 0) { + *ipp = dip->i_sb->s_root->d_inode; + } else if (strcmp(name, ZFS_SNAPDIR_NAME) == 0) { + *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIR, + &zpl_fops_snapdir, &zpl_ops_snapdir); + } else if (strcmp(name, ZFS_SHAREDIR_NAME) == 0) { + *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SHARES, + &zpl_fops_shares, &zpl_ops_shares); + } else { + *ipp = NULL; + } + + if (*ipp == NULL) + error = SET_ERROR(ENOENT); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Lookup entry point for the 'snapshot' directory. Try to open the + * snapshot if it exist, creating the pseudo filesystem inode as necessary. 
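+ *
+ * For example (editor's illustration; the names are hypothetical), simply
+ * touching a snapshot directory is enough to fault it in:
+ *
+ *	ls /tank/fs/.zfs/snapshot/monday
+ *
+ * where "monday" is resolved via dmu_snapshot_lookup() and, if valid, an
+ * inode numbered ZFSCTL_INO_SNAPDIRS - id is created for it.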
+ */ +int +zfsctl_snapdir_lookup(struct inode *dip, char *name, struct inode **ipp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + uint64_t id; + int error; + + ZFS_ENTER(zfsvfs); + + error = dmu_snapshot_lookup(zfsvfs->z_os, name, &id); + if (error) { + ZFS_EXIT(zfsvfs); + return (error); + } + + *ipp = zfsctl_inode_lookup(zfsvfs, ZFSCTL_INO_SNAPDIRS - id, + &simple_dir_operations, &simple_dir_inode_operations); + if (*ipp == NULL) + error = SET_ERROR(ENOENT); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Renaming a directory under '.zfs/snapshot' will automatically trigger + * a rename of the snapshot to the new given name. The rename is confined + * to the '.zfs/snapshot' directory snapshots cannot be moved elsewhere. + */ +int +zfsctl_snapdir_rename(struct inode *sdip, char *snm, + struct inode *tdip, char *tnm, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ITOZSB(sdip); + char *to, *from, *real, *fsname; + int error; + + if (!zfs_admin_snapshot) + return (SET_ERROR(EACCES)); + + ZFS_ENTER(zfsvfs); + + to = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + from = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + fsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + error = dmu_snapshot_realname(zfsvfs->z_os, snm, real, + ZFS_MAX_DATASET_NAME_LEN, NULL); + if (error == 0) { + snm = real; + } else if (error != ENOTSUP) { + goto out; + } + } + + dmu_objset_name(zfsvfs->z_os, fsname); + + error = zfsctl_snapshot_name(ITOZSB(sdip), snm, + ZFS_MAX_DATASET_NAME_LEN, from); + if (error == 0) + error = zfsctl_snapshot_name(ITOZSB(tdip), tnm, + ZFS_MAX_DATASET_NAME_LEN, to); + if (error == 0) + error = zfs_secpolicy_rename_perms(from, to, cr); + if (error != 0) + goto out; + + /* + * Cannot move snapshots out of the snapdir. + */ + if (sdip != tdip) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * No-op when names are identical. + */ + if (strcmp(snm, tnm) == 0) { + error = 0; + goto out; + } + + rw_enter(&zfs_snapshot_lock, RW_WRITER); + + error = dsl_dataset_rename_snapshot(fsname, snm, tnm, B_FALSE); + if (error == 0) + (void) zfsctl_snapshot_rename(snm, tnm); + + rw_exit(&zfs_snapshot_lock); +out: + kmem_free(from, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(to, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(fsname, ZFS_MAX_DATASET_NAME_LEN); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Removing a directory under '.zfs/snapshot' will automatically trigger + * the removal of the snapshot with the given name. 
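+ *
+ * (Editor's illustration; the names are hypothetical) The operation is
+ * rejected with EACCES unless the zfs_admin_snapshot module parameter
+ * has been enabled:
+ *
+ *	echo 1 > /sys/module/zfs/parameters/zfs_admin_snapshot
+ *	rmdir /tank/fs/.zfs/snapshot/monday	# destroys tank/fs@monday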
+ */ +int +zfsctl_snapdir_remove(struct inode *dip, char *name, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + char *snapname, *real; + int error; + + if (!zfs_admin_snapshot) + return (SET_ERROR(EACCES)); + + ZFS_ENTER(zfsvfs); + + snapname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + real = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + error = dmu_snapshot_realname(zfsvfs->z_os, name, real, + ZFS_MAX_DATASET_NAME_LEN, NULL); + if (error == 0) { + name = real; + } else if (error != ENOTSUP) { + goto out; + } + } + + error = zfsctl_snapshot_name(ITOZSB(dip), name, + ZFS_MAX_DATASET_NAME_LEN, snapname); + if (error == 0) + error = zfs_secpolicy_destroy_perms(snapname, cr); + if (error != 0) + goto out; + + error = zfsctl_snapshot_unmount(snapname, MNT_FORCE); + if ((error == 0) || (error == ENOENT)) + error = dsl_destroy_snapshot(snapname, B_FALSE); +out: + kmem_free(snapname, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(real, ZFS_MAX_DATASET_NAME_LEN); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Creating a directory under '.zfs/snapshot' will automatically trigger + * the creation of a new snapshot with the given name. + */ +int +zfsctl_snapdir_mkdir(struct inode *dip, char *dirname, vattr_t *vap, + struct inode **ipp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + char *dsname; + int error; + + if (!zfs_admin_snapshot) + return (SET_ERROR(EACCES)); + + dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + + if (zfs_component_namecheck(dirname, NULL, NULL) != 0) { + error = SET_ERROR(EILSEQ); + goto out; + } + + dmu_objset_name(zfsvfs->z_os, dsname); + + error = zfs_secpolicy_snapshot_perms(dsname, cr); + if (error != 0) + goto out; + + if (error == 0) { + error = dmu_objset_snapshot_one(dsname, dirname); + if (error != 0) + goto out; + + error = zfsctl_snapdir_lookup(dip, dirname, ipp, + 0, cr, NULL, NULL); + } +out: + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); + + return (error); +} + +/* + * Attempt to unmount a snapshot by making a call to user space. + * There is no assurance that this can or will succeed, is just a + * best effort. In the case where it does fail, perhaps because + * it's in use, the unmount will fail harmlessly. + */ +int +zfsctl_snapshot_unmount(char *snapname, int flags) +{ + char *argv[] = { "/usr/bin/env", "umount", "-t", "zfs", "-n", NULL, + NULL }; + char *envp[] = { NULL }; + zfs_snapentry_t *se; + int error; + + rw_enter(&zfs_snapshot_lock, RW_READER); + if ((se = zfsctl_snapshot_find_by_name(snapname)) == NULL) { + rw_exit(&zfs_snapshot_lock); + return (SET_ERROR(ENOENT)); + } + rw_exit(&zfs_snapshot_lock); + + if (flags & MNT_FORCE) + argv[4] = "-fn"; + argv[5] = se->se_path; + dprintf("unmount; path=%s\n", se->se_path); + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + zfsctl_snapshot_rele(se); + + + /* + * The umount system utility will return 256 on error. We must + * assume this error is because the file system is busy so it is + * converted to the more sensible EBUSY. 
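+ *
+ * (Editor's note) In other words, the helper invocation above amounts to
+ * running:
+ *
+ *	umount -t zfs -n <mountpoint>	# "-fn" when MNT_FORCE is set
+ *
+ * and any non-zero status, typically exit code 1 packed as 1 << 8 = 256
+ * by call_usermodehelper(), is collapsed to EBUSY.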
+ */ + if (error) + error = SET_ERROR(EBUSY); + + return (error); +} + +int +zfsctl_snapshot_mount(struct path *path, int flags) +{ + struct dentry *dentry = path->dentry; + struct inode *ip = dentry->d_inode; + zfsvfs_t *zfsvfs; + zfsvfs_t *snap_zfsvfs; + zfs_snapentry_t *se; + char *full_name, *full_path; + char *argv[] = { "/usr/bin/env", "mount", "-t", "zfs", "-n", NULL, NULL, + NULL }; + char *envp[] = { NULL }; + int error; + struct path spath; + + if (ip == NULL) + return (SET_ERROR(EISDIR)); + + zfsvfs = ITOZSB(ip); + ZFS_ENTER(zfsvfs); + + full_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + full_path = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + error = zfsctl_snapshot_name(zfsvfs, dname(dentry), + ZFS_MAX_DATASET_NAME_LEN, full_name); + if (error) + goto error; + + /* + * Construct a mount point path from sb of the ctldir inode and dirent + * name, instead of from d_path(), so that chroot'd process doesn't fail + * on mount.zfs(8). + */ + snprintf(full_path, MAXPATHLEN, "%s/.zfs/snapshot/%s", + zfsvfs->z_vfs->vfs_mntpoint ? zfsvfs->z_vfs->vfs_mntpoint : "", + dname(dentry)); + + /* + * Multiple concurrent automounts of a snapshot are never allowed. + * The snapshot may be manually mounted as many times as desired. + */ + if (zfsctl_snapshot_ismounted(full_name)) { + error = 0; + goto error; + } + + /* + * Attempt to mount the snapshot from user space. Normally this + * would be done using the vfs_kern_mount() function, however that + * function is marked GPL-only and cannot be used. On error we + * careful to log the real error to the console and return EISDIR + * to safely abort the automount. This should be very rare. + * + * If the user mode helper happens to return EBUSY, a concurrent + * mount is already in progress in which case the error is ignored. + * Take note that if the program was executed successfully the return + * value from call_usermodehelper() will be (exitcode << 8 + signal). + */ + dprintf("mount; name=%s path=%s\n", full_name, full_path); + argv[5] = full_name; + argv[6] = full_path; + error = call_usermodehelper(argv[0], argv, envp, UMH_WAIT_PROC); + if (error) { + if (!(error & MOUNT_BUSY << 8)) { + zfs_dbgmsg("Unable to automount %s error=%d", + full_path, error); + error = SET_ERROR(EISDIR); + } else { + /* + * EBUSY, this could mean a concurrent mount, or the + * snapshot has already been mounted at completely + * different place. We return 0 so VFS will retry. For + * the latter case the VFS will retry several times + * and return ELOOP, which is probably not a very good + * behavior. + */ + error = 0; + } + goto error; + } + + /* + * Follow down in to the mounted snapshot and set MNT_SHRINKABLE + * to identify this as an automounted filesystem. 
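+ *
+ * (Editor's note) Together with the zfs_snapentry_t added below, the
+ * MNT_SHRINKABLE flag is what distinguishes this automount from an
+ * ordinary administrative mount and lets the delayed task later expire
+ * it (see snapentry_expire() and its MNT_EXPIRE unmount).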
+ */ + spath = *path; + path_get(&spath); + if (follow_down_one(&spath)) { + snap_zfsvfs = ITOZSB(spath.dentry->d_inode); + snap_zfsvfs->z_parent = zfsvfs; + dentry = spath.dentry; + spath.mnt->mnt_flags |= MNT_SHRINKABLE; + + rw_enter(&zfs_snapshot_lock, RW_WRITER); + se = zfsctl_snapshot_alloc(full_name, full_path, + snap_zfsvfs->z_os->os_spa, dmu_objset_id(snap_zfsvfs->z_os), + dentry); + zfsctl_snapshot_add(se); + zfsctl_snapshot_unmount_delay_impl(se, zfs_expire_snapshot); + rw_exit(&zfs_snapshot_lock); + } + path_put(&spath); +error: + kmem_free(full_name, ZFS_MAX_DATASET_NAME_LEN); + kmem_free(full_path, MAXPATHLEN); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Get the snapdir inode from fid + */ +int +zfsctl_snapdir_vget(struct super_block *sb, uint64_t objsetid, int gen, + struct inode **ipp) +{ + int error; + struct path path; + char *mnt; + struct dentry *dentry; + + mnt = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + error = zfsctl_snapshot_path_objset(sb->s_fs_info, objsetid, + MAXPATHLEN, mnt); + if (error) + goto out; + + /* Trigger automount */ + error = -kern_path(mnt, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); + if (error) + goto out; + + path_put(&path); + /* + * Get the snapdir inode. Note, we don't want to use the above + * path because it contains the root of the snapshot rather + * than the snapdir. + */ + *ipp = ilookup(sb, ZFSCTL_INO_SNAPDIRS - objsetid); + if (*ipp == NULL) { + error = SET_ERROR(ENOENT); + goto out; + } + + /* check gen, see zfsctl_snapdir_fid */ + dentry = d_obtain_alias(igrab(*ipp)); + if (gen != (!IS_ERR(dentry) && d_mountpoint(dentry))) { + iput(*ipp); + *ipp = NULL; + error = SET_ERROR(ENOENT); + } + if (!IS_ERR(dentry)) + dput(dentry); +out: + kmem_free(mnt, MAXPATHLEN); + return (error); +} + +int +zfsctl_shares_lookup(struct inode *dip, char *name, struct inode **ipp, + int flags, cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ITOZSB(dip); + znode_t *zp; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = zfs_lookup(dzp, name, &zp, 0, cr, NULL, NULL); + zrele(dzp); + } + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * Initialize the various pieces we'll need to create and manipulate .zfs + * directories. Currently this is unused but available. + */ +void +zfsctl_init(void) +{ + avl_create(&zfs_snapshots_by_name, snapentry_compare_by_name, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, + se_node_name)); + avl_create(&zfs_snapshots_by_objsetid, snapentry_compare_by_objsetid, + sizeof (zfs_snapentry_t), offsetof(zfs_snapentry_t, + se_node_objsetid)); + rw_init(&zfs_snapshot_lock, NULL, RW_DEFAULT, NULL); +} + +/* + * Cleanup the various pieces we needed for .zfs directories. In particular + * ensure the expiry timer is canceled safely. 
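+ *
+ * (Editor's note) By this point each mounted snapshot should already have
+ * been torn down through zfsctl_destroy(), which removes its entry and
+ * cancels any pending delayed unmount, so only the empty trees and the
+ * lock remain to destroy.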
+ */ +void +zfsctl_fini(void) +{ + avl_destroy(&zfs_snapshots_by_name); + avl_destroy(&zfs_snapshots_by_objsetid); + rw_destroy(&zfs_snapshot_lock); +} + +module_param(zfs_admin_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_admin_snapshot, "Enable mkdir/rmdir/mv in .zfs/snapshot"); + +module_param(zfs_expire_snapshot, int, 0644); +MODULE_PARM_DESC(zfs_expire_snapshot, "Seconds to expire .zfs/snapshot"); diff --git a/module/os/linux/zfs/zfs_debug.c b/module/os/linux/zfs/zfs_debug.c new file mode 100644 index 000000000000..d98463f1b7f7 --- /dev/null +++ b/module/os/linux/zfs/zfs_debug.c @@ -0,0 +1,254 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2014 by Delphix. All rights reserved. + */ + +#include +#include + +typedef struct zfs_dbgmsg { + procfs_list_node_t zdm_node; + uint64_t zdm_timestamp; + int zdm_size; + char zdm_msg[1]; /* variable length allocation */ +} zfs_dbgmsg_t; + +procfs_list_t zfs_dbgmsgs; +int zfs_dbgmsg_size = 0; +int zfs_dbgmsg_maxsize = 4<<20; /* 4MB */ + +/* + * Internal ZFS debug messages are enabled by default. + * + * # Print debug messages + * cat /proc/spl/kstat/zfs/dbgmsg + * + * # Disable the kernel debug message log. + * echo 0 > /sys/module/zfs/parameters/zfs_dbgmsg_enable + * + * # Clear the kernel debug message log. 
+ * echo 0 >/proc/spl/kstat/zfs/dbgmsg + */ +int zfs_dbgmsg_enable = 1; + +static int +zfs_dbgmsg_show_header(struct seq_file *f) +{ + seq_printf(f, "%-12s %-8s\n", "timestamp", "message"); + return (0); +} + +static int +zfs_dbgmsg_show(struct seq_file *f, void *p) +{ + zfs_dbgmsg_t *zdm = (zfs_dbgmsg_t *)p; + seq_printf(f, "%-12llu %-s\n", + (u_longlong_t)zdm->zdm_timestamp, zdm->zdm_msg); + return (0); +} + +static void +zfs_dbgmsg_purge(int max_size) +{ + while (zfs_dbgmsg_size > max_size) { + zfs_dbgmsg_t *zdm = list_remove_head(&zfs_dbgmsgs.pl_list); + if (zdm == NULL) + return; + + int size = zdm->zdm_size; + kmem_free(zdm, size); + zfs_dbgmsg_size -= size; + } +} + +static int +zfs_dbgmsg_clear(procfs_list_t *procfs_list) +{ + mutex_enter(&zfs_dbgmsgs.pl_lock); + zfs_dbgmsg_purge(0); + mutex_exit(&zfs_dbgmsgs.pl_lock); + return (0); +} + +void +zfs_dbgmsg_init(void) +{ + procfs_list_install("zfs", + "dbgmsg", + 0600, + &zfs_dbgmsgs, + zfs_dbgmsg_show, + zfs_dbgmsg_show_header, + zfs_dbgmsg_clear, + offsetof(zfs_dbgmsg_t, zdm_node)); +} + +void +zfs_dbgmsg_fini(void) +{ + procfs_list_uninstall(&zfs_dbgmsgs); + zfs_dbgmsg_purge(0); + + /* + * TODO - decide how to make this permanent + */ +#ifdef _KERNEL + procfs_list_destroy(&zfs_dbgmsgs); +#endif +} + +void +__set_error(const char *file, const char *func, int line, int err) +{ + /* + * To enable this: + * + * $ echo 512 >/sys/module/zfs/parameters/zfs_flags + */ + if (zfs_flags & ZFS_DEBUG_SET_ERROR) + __dprintf(B_FALSE, file, func, line, "error %lu", err); +} + +void +__zfs_dbgmsg(char *buf) +{ + int size = sizeof (zfs_dbgmsg_t) + strlen(buf); + zfs_dbgmsg_t *zdm = kmem_zalloc(size, KM_SLEEP); + zdm->zdm_size = size; + zdm->zdm_timestamp = gethrestime_sec(); + strcpy(zdm->zdm_msg, buf); + + mutex_enter(&zfs_dbgmsgs.pl_lock); + procfs_list_add(&zfs_dbgmsgs, zdm); + zfs_dbgmsg_size += size; + zfs_dbgmsg_purge(MAX(zfs_dbgmsg_maxsize, 0)); + mutex_exit(&zfs_dbgmsgs.pl_lock); +} + +#ifdef _KERNEL + +void +__dprintf(boolean_t dprint, const char *file, const char *func, + int line, const char *fmt, ...) +{ + const char *newfile; + va_list adx; + size_t size; + char *buf; + char *nl; + int i; + char *prefix = (dprint) ? "dprintf: " : ""; + + size = 1024; + buf = kmem_alloc(size, KM_SLEEP); + + /* + * Get rid of annoying prefix to filename. + */ + newfile = strrchr(file, '/'); + if (newfile != NULL) { + newfile = newfile + 1; /* Get rid of leading / */ + } else { + newfile = file; + } + + i = snprintf(buf, size, "%s%s:%d:%s(): ", prefix, newfile, line, func); + + if (i < size) { + va_start(adx, fmt); + (void) vsnprintf(buf + i, size - i, fmt, adx); + va_end(adx); + } + + /* + * Get rid of trailing newline for dprintf logs. + */ + if (dprint && buf[0] != '\0') { + nl = &buf[strlen(buf) - 1]; + if (*nl == '\n') + *nl = '\0'; + } + + /* + * To get this data enable the zfs__dprintf trace point as shown: + * + * # Enable zfs__dprintf tracepoint, clear the tracepoint ring buffer + * $ echo 1 > /sys/kernel/debug/tracing/events/zfs/enable + * $ echo 0 > /sys/kernel/debug/tracing/trace + * + * # Dump the ring buffer. 
+ * $ cat /sys/kernel/debug/tracing/trace + */ + DTRACE_PROBE1(zfs__dprintf, char *, buf); + + /* + * To get this data: + * + * $ cat /proc/spl/kstat/zfs/dbgmsg + * + * To clear the buffer: + * $ echo 0 > /proc/spl/kstat/zfs/dbgmsg + */ + __zfs_dbgmsg(buf); + + kmem_free(buf, size); +} + +#else + +void +zfs_dbgmsg_print(const char *tag) +{ + ssize_t ret __attribute__((unused)); + + /* + * We use write() in this function instead of printf() + * so it is safe to call from a signal handler. + */ + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") START:\n", 9); + + mutex_enter(&zfs_dbgmsgs.pl_lock); + for (zfs_dbgmsg_t *zdm = list_head(&zfs_dbgmsgs.pl_list); zdm != NULL; + zdm = list_next(&zfs_dbgmsgs.pl_list, zdm)) { + ret = write(STDOUT_FILENO, zdm->zdm_msg, + strlen(zdm->zdm_msg)); + ret = write(STDOUT_FILENO, "\n", 1); + } + + ret = write(STDOUT_FILENO, "ZFS_DBGMSG(", 11); + ret = write(STDOUT_FILENO, tag, strlen(tag)); + ret = write(STDOUT_FILENO, ") END\n", 6); + + mutex_exit(&zfs_dbgmsgs.pl_lock); +} +#endif /* _KERNEL */ + +#ifdef _KERNEL +module_param(zfs_dbgmsg_enable, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_enable, "Enable ZFS debug message log"); + +module_param(zfs_dbgmsg_maxsize, int, 0644); +MODULE_PARM_DESC(zfs_dbgmsg_maxsize, "Maximum ZFS debug log size"); +#endif diff --git a/module/os/linux/zfs/zfs_dir.c b/module/os/linux/zfs/zfs_dir.c new file mode 100644 index 000000000000..383657208df3 --- /dev/null +++ b/module/os/linux/zfs/zfs_dir.c @@ -0,0 +1,1224 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * zfs_match_find() is used by zfs_dirent_lock() to perform zap lookups + * of names after deciding which is the appropriate lookup interface. + */ +static int +zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, matchtype_t mt, + boolean_t update, int *deflags, pathname_t *rpnp, uint64_t *zoid) +{ + boolean_t conflict = B_FALSE; + int error; + + if (zfsvfs->z_norm) { + size_t bufsz = 0; + char *buf = NULL; + + if (rpnp) { + buf = rpnp->pn_buf; + bufsz = rpnp->pn_bufsize; + } + + /* + * In the non-mixed case we only expect there would ever + * be one match, but we need to use the normalizing lookup. 
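+ *
+ * (Editor's note) Besides the object id, zap_lookup_norm() also reports
+ * via "conflict" whether other entries normalize to the same form; that
+ * result is surfaced to lookup callers as ED_CASE_CONFLICT below.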
+ */ + error = zap_lookup_norm(zfsvfs->z_os, dzp->z_id, name, 8, 1, + zoid, mt, buf, bufsz, &conflict); + } else { + error = zap_lookup(zfsvfs->z_os, dzp->z_id, name, 8, 1, zoid); + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. + * + * XXX: This should be a feature flag for compatibility + */ + if (error == EOVERFLOW) + error = 0; + + if (zfsvfs->z_norm && !error && deflags) + *deflags = conflict ? ED_CASE_CONFLICT : 0; + + *zoid = ZFS_DIRENT_OBJ(*zoid); + + return (error); +} + +/* + * Lock a directory entry. A dirlock on protects that name + * in dzp's directory zap object. As long as you hold a dirlock, you can + * assume two things: (1) dzp cannot be reaped, and (2) no other thread + * can change the zap entry for (i.e. link or unlink) this name. + * + * Input arguments: + * dzp - znode for directory + * name - name of entry to lock + * flag - ZNEW: if the entry already exists, fail with EEXIST. + * ZEXISTS: if the entry does not exist, fail with ENOENT. + * ZSHARED: allow concurrent access with other ZSHARED callers. + * ZXATTR: we want dzp's xattr directory + * ZCILOOK: On a mixed sensitivity file system, + * this lookup should be case-insensitive. + * ZCIEXACT: On a purely case-insensitive file system, + * this lookup should be case-sensitive. + * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. + * + * Output arguments: + * zpp - pointer to the znode for the entry (NULL if there isn't one) + * dlpp - pointer to the dirlock for this entry (NULL on error) + * direntflags - (case-insensitive lookup only) + * flags if multiple case-sensitive matches exist in directory + * realpnp - (case-insensitive lookup only) + * actual name matched within the directory + * + * Return value: 0 on success or errno on failure. + * + * NOTE: Always checks for, and rejects, '.' and '..'. + * NOTE: For case-insensitive file systems we take wide locks (see below), + * but return znode pointers to a single match. + */ +int +zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, + int flag, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t *dl; + boolean_t update; + matchtype_t mt = 0; + uint64_t zoid; + int error = 0; + int cmpflags; + + *zpp = NULL; + *dlpp = NULL; + + /* + * Verify that we are not trying to lock '.', '..', or '.zfs' + */ + if ((name[0] == '.' && + (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'))) || + (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0)) + return (SET_ERROR(EEXIST)); + + /* + * Case sensitivity and normalization preferences are set when + * the file system is created. These are stored in the + * zfsvfs->z_case and zfsvfs->z_norm fields. These choices + * affect what vnodes can be cached in the DNLC, how we + * perform zap lookups, and the "width" of our dirlocks. + * + * A normal dirlock locks a single name. Note that with + * normalization a name can be composed multiple ways, but + * when normalized, these names all compare equal. A wide + * dirlock locks multiple names. We need these when the file + * system is supporting mixed-mode access. It is sometimes + * necessary to lock all case permutations of file name at + * once so that simultaneous case-insensitive/case-sensitive + * behaves as rationally as possible. 
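+ *
+ * (Editor's illustration) On a case-insensitive or mixed-mode file
+ * system a wide lock on "readme" therefore also covers "README" and
+ * "ReadMe", since all of these names compare equal once case folding
+ * is applied.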
+ */ + + /* + * When matching we may need to normalize & change case according to + * FS settings. + * + * Note that a normalized match is necessary for a case insensitive + * filesystem when the lookup request is not exact because normalization + * can fold case independent of normalizing code point sequences. + * + * See the table above zfs_dropname(). + */ + if (zfsvfs->z_norm != 0) { + mt = MT_NORMALIZE; + + /* + * Determine if the match needs to honor the case specified in + * lookup, and if so keep track of that so that during + * normalization we don't fold case. + */ + if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (zfsvfs->z_case == ZFS_CASE_MIXED && !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + } + + /* + * Only look in or update the DNLC if we are looking for the + * name on a file system that does not require normalization + * or case folding. We can also look there if we happen to be + * on a non-normalizing, mixed sensitivity file system IF we + * are looking for the exact name. + * + * Maybe can add TO-UPPERed version of name to dnlc in ci-only + * case for performance improvement? + */ + update = !zfsvfs->z_norm || + (zfsvfs->z_case == ZFS_CASE_MIXED && + !(zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER) && !(flag & ZCILOOK)); + + /* + * ZRENAMING indicates we are in a situation where we should + * take narrow locks regardless of the file system's + * preferences for normalizing and case folding. This will + * prevent us deadlocking trying to grab the same wide lock + * twice if the two names happen to be case-insensitive + * matches. + */ + if (flag & ZRENAMING) + cmpflags = 0; + else + cmpflags = zfsvfs->z_norm; + + /* + * Wait until there are no locks on this name. + * + * Don't grab the lock if it is already held. However, cannot + * have both ZSHARED and ZHAVELOCK together. + */ + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + + mutex_enter(&dzp->z_lock); + for (;;) { + if (dzp->z_unlinked && !(flag & ZXATTR)) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { + if ((u8_strcmp(name, dl->dl_name, 0, cmpflags, + U8_UNICODE_LATEST, &error) == 0) || error != 0) + break; + } + if (error != 0) { + mutex_exit(&dzp->z_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); + return (SET_ERROR(ENOENT)); + } + if (dl == NULL) { + /* + * Allocate a new dirlock and add it to the list. + */ + dl = kmem_alloc(sizeof (zfs_dirlock_t), KM_SLEEP); + cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); + dl->dl_name = name; + dl->dl_sharecnt = 0; + dl->dl_namelock = 0; + dl->dl_namesize = 0; + dl->dl_dzp = dzp; + dl->dl_next = dzp->z_dirlocks; + dzp->z_dirlocks = dl; + break; + } + if ((flag & ZSHARED) && dl->dl_sharecnt != 0) + break; + cv_wait(&dl->dl_cv, &dzp->z_lock); + } + + /* + * If the z_name_lock was NOT held for this dirlock record it. + */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { + /* + * We're the second shared reference to dl. Make a copy of + * dl_name in case the first thread goes away before we do. + * Note that we initialize the new name before storing its + * pointer into dl_name, because the first thread may load + * dl->dl_name at any time. It'll either see the old value, + * which belongs to it, or the new shared copy; either is OK. 
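+ *
+ * (Editor's note) A non-zero dl_namesize is also what later tells
+ * zfs_dirent_unlock() that dl_name is a heap copy it must free rather
+ * than a caller-owned buffer.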
+ */ + dl->dl_namesize = strlen(dl->dl_name) + 1; + name = kmem_alloc(dl->dl_namesize, KM_SLEEP); + bcopy(dl->dl_name, name, dl->dl_namesize); + dl->dl_name = name; + } + + mutex_exit(&dzp->z_lock); + + /* + * We have a dirlock on the name. (Note that it is the dirlock, + * not the dzp's z_lock, that protects the name in the zap object.) + * See if there's an object by this name; if so, put a hold on it. + */ + if (flag & ZXATTR) { + error = sa_lookup(dzp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &zoid, + sizeof (zoid)); + if (error == 0) + error = (zoid == 0 ? SET_ERROR(ENOENT) : 0); + } else { + error = zfs_match_find(zfsvfs, dzp, name, mt, + update, direntflags, realpnp, &zoid); + } + if (error) { + if (error != ENOENT || (flag & ZEXISTS)) { + zfs_dirent_unlock(dl); + return (error); + } + } else { + if (flag & ZNEW) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EEXIST)); + } + error = zfs_zget(zfsvfs, zoid, zpp); + if (error) { + zfs_dirent_unlock(dl); + return (error); + } + } + + *dlpp = dl; + + return (0); +} + +/* + * Unlock this directory entry and wake anyone who was waiting for it. + */ +void +zfs_dirent_unlock(zfs_dirlock_t *dl) +{ + znode_t *dzp = dl->dl_dzp; + zfs_dirlock_t **prev_dl, *cur_dl; + + mutex_enter(&dzp->z_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + + if (dl->dl_sharecnt > 1) { + dl->dl_sharecnt--; + mutex_exit(&dzp->z_lock); + return; + } + prev_dl = &dzp->z_dirlocks; + while ((cur_dl = *prev_dl) != dl) + prev_dl = &cur_dl->dl_next; + *prev_dl = dl->dl_next; + cv_broadcast(&dl->dl_cv); + mutex_exit(&dzp->z_lock); + + if (dl->dl_namesize != 0) + kmem_free(dl->dl_name, dl->dl_namesize); + cv_destroy(&dl->dl_cv); + kmem_free(dl, sizeof (*dl)); +} + +/* + * Look up an entry in a directory. + * + * NOTE: '.' and '..' are handled as special cases because + * no directory entries are actually stored for them. If this is + * the root of a filesystem, then '.zfs' is also treated as a + * special pseudo-directory. + */ +int +zfs_dirlook(znode_t *dzp, char *name, znode_t **zpp, int flags, + int *deflg, pathname_t *rpnp) +{ + zfs_dirlock_t *dl; + znode_t *zp; + struct inode *ip; + int error = 0; + uint64_t parent; + + if (name[0] == 0 || (name[0] == '.' && name[1] == 0)) { + *zpp = dzp; + zhold(*zpp); + } else if (name[0] == '.' && name[1] == '.' && name[2] == 0) { + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + + /* + * If we are a snapshot mounted under .zfs, return + * the inode pointer for the snapshot directory. 
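+ *
+ * (Editor's illustration; names hypothetical) For a process sitting in
+ * /tank/fs/.zfs/snapshot/monday, "cd .." must resolve to the virtual
+ * /tank/fs/.zfs/snapshot directory of the parent file system even
+ * though no on-disk entry links the two.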
+ */ + if ((error = sa_lookup(dzp->z_sa_hdl, + SA_ZPL_PARENT(zfsvfs), &parent, sizeof (parent))) != 0) + return (error); + + if (parent == dzp->z_id && zfsvfs->z_parent != zfsvfs) { + error = zfsctl_root_lookup(zfsvfs->z_parent->z_ctldir, + "snapshot", &ip, 0, kcred, NULL, NULL); + *zpp = ITOZ(ip); + return (error); + } + rw_enter(&dzp->z_parent_lock, RW_READER); + error = zfs_zget(zfsvfs, parent, &zp); + if (error == 0) + *zpp = zp; + rw_exit(&dzp->z_parent_lock); + } else if (zfs_has_ctldir(dzp) && strcmp(name, ZFS_CTLDIR_NAME) == 0) { + ip = zfsctl_root(dzp); + *zpp = ITOZ(ip); + } else { + int zf; + + zf = ZEXISTS | ZSHARED; + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zf, deflg, rpnp); + if (error == 0) { + *zpp = zp; + zfs_dirent_unlock(dl); + dzp->z_zn_prefetch = B_TRUE; /* enable prefetching */ + } + rpnp = NULL; + } + + if ((flags & FIGNORECASE) && rpnp && !error) + (void) strlcpy(rpnp->pn_buf, name, rpnp->pn_bufsize); + + return (error); +} + +/* + * unlinked Set (formerly known as the "delete queue") Error Handling + * + * When dealing with the unlinked set, we dmu_tx_hold_zap(), but we + * don't specify the name of the entry that we will be manipulating. We + * also fib and say that we won't be adding any new entries to the + * unlinked set, even though we might (this is to lower the minimum file + * size that can be deleted in a full filesystem). So on the small + * chance that the nlink list is using a fat zap (ie. has more than + * 2000 entries), we *may* not pre-read a block that's needed. + * Therefore it is remotely possible for some of the assertions + * regarding the unlinked set below to fail due to i/o error. On a + * nondebug system, this will result in the space being leaked. + */ +void +zfs_unlinked_add(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + ASSERT(zp->z_unlinked); + ASSERT(ZTOI(zp)->i_nlink == 0); + + VERIFY3U(0, ==, + zap_add_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, zp->z_id, tx)); + + dataset_kstats_update_nunlinks_kstat(&zfsvfs->z_kstat, 1); +} + +/* + * Clean up any znodes that had no links when we either crashed or + * (force) umounted the file system. + */ +static void +zfs_unlinked_drain_task(void *arg) +{ + zfsvfs_t *zfsvfs = arg; + zap_cursor_t zc; + zap_attribute_t zap; + dmu_object_info_t doi; + znode_t *zp; + int error; + + ASSERT3B(zfsvfs->z_draining, ==, B_TRUE); + + /* + * Iterate over the contents of the unlinked set. + */ + for (zap_cursor_init(&zc, zfsvfs->z_os, zfsvfs->z_unlinkedobj); + zap_cursor_retrieve(&zc, &zap) == 0 && !zfsvfs->z_drain_cancel; + zap_cursor_advance(&zc)) { + + /* + * See what kind of object we have in list + */ + + error = dmu_object_info(zfsvfs->z_os, + zap.za_first_integer, &doi); + if (error != 0) + continue; + + ASSERT((doi.doi_type == DMU_OT_PLAIN_FILE_CONTENTS) || + (doi.doi_type == DMU_OT_DIRECTORY_CONTENTS)); + /* + * We need to re-mark these list entries for deletion, + * so we pull them back into core and set zp->z_unlinked. + */ + error = zfs_zget(zfsvfs, zap.za_first_integer, &zp); + + /* + * We may pick up znodes that are already marked for deletion. + * This could happen during the purge of an extended attribute + * directory. All we need to do is skip over them, since they + * are already in the system marked z_unlinked. + */ + if (error != 0) + continue; + + zp->z_unlinked = B_TRUE; + + /* + * zrele() decrements the znode's ref count and may cause + * it to be synchronously freed. 
We interrupt freeing + * of this znode by checking the return value of + * dmu_objset_zfs_unmounting() in dmu_free_long_range() + * when an unmount is requested. + */ + zrele(zp); + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + } + zap_cursor_fini(&zc); + + zfsvfs->z_draining = B_FALSE; + zfsvfs->z_drain_task = TASKQID_INVALID; +} + +/* + * Sets z_draining then tries to dispatch async unlinked drain. + * If that fails executes synchronous unlinked drain. + */ +void +zfs_unlinked_drain(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + ASSERT3B(zfsvfs->z_draining, ==, B_FALSE); + + zfsvfs->z_draining = B_TRUE; + zfsvfs->z_drain_cancel = B_FALSE; + + zfsvfs->z_drain_task = taskq_dispatch( + dsl_pool_unlinked_drain_taskq(dmu_objset_pool(zfsvfs->z_os)), + zfs_unlinked_drain_task, zfsvfs, TQ_SLEEP); + if (zfsvfs->z_drain_task == TASKQID_INVALID) { + zfs_dbgmsg("async zfs_unlinked_drain dispatch failed"); + zfs_unlinked_drain_task(zfsvfs); + } +} + +/* + * Wait for the unlinked drain taskq task to stop. This will interrupt the + * unlinked set processing if it is in progress. + */ +void +zfs_unlinked_drain_stop_wait(zfsvfs_t *zfsvfs) +{ + ASSERT3B(zfsvfs->z_unmounted, ==, B_FALSE); + + if (zfsvfs->z_draining) { + zfsvfs->z_drain_cancel = B_TRUE; + taskq_cancel_id(dsl_pool_unlinked_drain_taskq( + dmu_objset_pool(zfsvfs->z_os)), zfsvfs->z_drain_task); + zfsvfs->z_drain_task = TASKQID_INVALID; + zfsvfs->z_draining = B_FALSE; + } +} + +/* + * Delete the entire contents of a directory. Return a count + * of the number of entries that could not be deleted. If we encounter + * an error, return a count of at least one so that the directory stays + * in the unlinked set. + * + * NOTE: this function assumes that the directory is inactive, + * so there is no need to lock its entries before deletion. + * Also, it assumes the directory contents is *only* regular + * files. + */ +static int +zfs_purgedir(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + znode_t *xzp; + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zfs_dirlock_t dl; + int skipped = 0; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + error = zfs_zget(zfsvfs, + ZFS_DIRENT_OBJ(zap.za_first_integer), &xzp); + if (error) { + skipped += 1; + continue; + } + + ASSERT(S_ISREG(ZTOI(xzp)->i_mode) || + S_ISLNK(ZTOI(xzp)->i_mode)); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, zap.za_name); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + /* Is this really needed ? 
*/ + zfs_sa_upgrade_txholds(tx, xzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_zrele_async(xzp); + skipped += 1; + continue; + } + bzero(&dl, sizeof (dl)); + dl.dl_dzp = dzp; + dl.dl_name = zap.za_name; + + error = zfs_link_destroy(&dl, xzp, tx, 0, NULL); + if (error) + skipped += 1; + dmu_tx_commit(tx); + + zfs_zrele_async(xzp); + } + zap_cursor_fini(&zc); + if (error != ENOENT) + skipped += 1; + return (skipped); +} + +void +zfs_rmnode(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + znode_t *xzp = NULL; + dmu_tx_t *tx; + uint64_t acl_obj; + uint64_t xattr_obj; + uint64_t links; + int error; + + ASSERT(ZTOI(zp)->i_nlink == 0); + ASSERT(atomic_read(&ZTOI(zp)->i_count) == 0); + + /* + * If this is an attribute directory, purge its contents. + */ + if (S_ISDIR(ZTOI(zp)->i_mode) && (zp->z_pflags & ZFS_XATTR)) { + if (zfs_purgedir(zp) != 0) { + /* + * Not enough space to delete some xattrs. + * Leave it in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + + return; + } + } + + /* + * Free up all the data in the file. We don't do this for directories + * because we need truncate and remove to be in the same tx, like in + * zfs_znode_delete(). Otherwise, if we crash here we'll end up with + * an inconsistent truncated zap object in the delete queue. Note a + * truncated file is harmless since it only contains user data. + */ + if (S_ISREG(ZTOI(zp)->i_mode)) { + error = dmu_free_long_range(os, zp->z_id, 0, DMU_OBJECT_END); + if (error) { + /* + * Not enough space or we were interrupted by unmount. + * Leave the file in the unlinked set. + */ + zfs_znode_dmu_fini(zp); + return; + } + } + + /* + * If the file has extended attributes, we're going to unlink + * the xattr dir. + */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT(error == 0); + } + + acl_obj = zfs_external_acl(zp); + + /* + * Set up the final transaction. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, zp->z_id, 0, DMU_OBJECT_END); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + if (xzp) { + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, TRUE, NULL); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + if (acl_obj) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + /* + * Not enough space to delete the file. Leave it in the + * unlinked set, leaking it until the fs is remounted (at + * which point we'll call zfs_unlinked_drain() to process it). + */ + dmu_tx_abort(tx); + zfs_znode_dmu_fini(zp); + goto out; + } + + if (xzp) { + ASSERT(error == 0); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; /* mark xzp for deletion */ + clear_nlink(ZTOI(xzp)); /* no more links to it */ + links = 0; + VERIFY(0 == sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx)); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + } + + mutex_enter(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + /* + * Remove this znode from the unlinked set. If a has rollback has + * occurred while a file is open and unlinked. Then when the file + * is closed post rollback it will not exist in the rolled back + * version of the unlinked object. 
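+ *
+ * (Editor's note) This is why the zap_remove_int() call below tolerates
+ * ENOENT in addition to success.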
+ */ + error = zap_remove_int(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + zp->z_id, tx); + VERIFY(error == 0 || error == ENOENT); + + uint64_t count; + if (zap_count(os, zfsvfs->z_unlinkedobj, &count) == 0 && count == 0) { + cv_broadcast(&os->os_dsl_dataset->ds_dir->dd_activity_cv); + } + + mutex_exit(&os->os_dsl_dataset->ds_dir->dd_activity_lock); + + dataset_kstats_update_nunlinked_kstat(&zfsvfs->z_kstat, 1); + + zfs_znode_delete(zp, tx); + + dmu_tx_commit(tx); +out: + if (xzp) + zfs_zrele_async(xzp); +} + +static uint64_t +zfs_dirent(znode_t *zp, uint64_t mode) +{ + uint64_t de = zp->z_id; + + if (ZTOZSB(zp)->z_version >= ZPL_VERSION_DIRENT_TYPE) + de |= IFTODT(mode) << 60; + return (de); +} + +/* + * Link zp into dl. Can fail in the following cases : + * - if zp has been unlinked. + * - if the number of entries with the same hash (aka. colliding entries) + * exceed the capacity of a leaf-block of fatzap and splitting of the + * leaf-block does not help. + */ +int +zfs_link_create(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag) +{ + znode_t *dzp = dl->dl_dzp; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t value; + int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode); + sa_bulk_attr_t bulk[5]; + uint64_t mtime[2], ctime[2]; + uint64_t links; + int count = 0; + int error; + + mutex_enter(&zp->z_lock); + + if (!(flag & ZRENAMING)) { + if (zp->z_unlinked) { /* no new links to unlinked zp */ + ASSERT(!(flag & (ZNEW | ZEXISTS))); + mutex_exit(&zp->z_lock); + return (SET_ERROR(ENOENT)); + } + if (!(flag & ZNEW)) { + /* + * ZNEW nodes come from zfs_mknode() where the link + * count has already been initialised + */ + inc_nlink(ZTOI(zp)); + links = ZTOI(zp)->i_nlink; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), + NULL, &links, sizeof (links)); + } + } + + value = zfs_dirent(zp, zp->z_mode); + error = zap_add(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, 8, 1, + &value, tx); + + /* + * zap_add could fail to add the entry if it exceeds the capacity of the + * leaf-block and zap_leaf_split() failed to help. + * The caller of this routine is responsible for failing the transaction + * which will rollback the SA updates done above. 
+ */ + if (error != 0) { + if (!(flag & ZRENAMING) && !(flag & ZNEW)) + drop_nlink(ZTOI(zp)); + mutex_exit(&zp->z_lock); + return (error); + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &dzp->z_id, sizeof (dzp->z_id)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + + if (!(flag & ZNEW)) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime, + ctime); + } + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + mutex_exit(&zp->z_lock); + + mutex_enter(&dzp->z_lock); + dzp->z_size++; + if (zp_is_dir) + inc_nlink(ZTOI(dzp)); + links = ZTOI(dzp)->i_nlink; + count = 0; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &dzp->z_size, sizeof (dzp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &links, sizeof (links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + mtime, sizeof (mtime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + ctime, sizeof (ctime)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &dzp->z_pflags, sizeof (dzp->z_pflags)); + zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + mutex_exit(&dzp->z_lock); + + return (0); +} + +/* + * The match type in the code for this function should conform to: + * + * ------------------------------------------------------------------------ + * fs type | z_norm | lookup type | match type + * ---------|-------------|-------------|---------------------------------- + * CS !norm | 0 | 0 | 0 (exact) + * CS norm | formX | 0 | MT_NORMALIZE + * CI !norm | upper | !ZCIEXACT | MT_NORMALIZE + * CI !norm | upper | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CI norm | upper|formX | !ZCIEXACT | MT_NORMALIZE + * CI norm | upper|formX | ZCIEXACT | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM !norm | upper | ZCILOOK | MT_NORMALIZE + * CM norm | upper|formX | !ZCILOOK | MT_NORMALIZE | MT_MATCH_CASE + * CM norm | upper|formX | ZCILOOK | MT_NORMALIZE + * + * Abbreviations: + * CS = Case Sensitive, CI = Case Insensitive, CM = Case Mixed + * upper = case folding set by fs type on creation (U8_TEXTPREP_TOUPPER) + * formX = unicode normalization form set on fs creation + */ +static int +zfs_dropname(zfs_dirlock_t *dl, znode_t *zp, znode_t *dzp, dmu_tx_t *tx, + int flag) +{ + int error; + + if (ZTOZSB(zp)->z_norm) { + matchtype_t mt = MT_NORMALIZE; + + if ((ZTOZSB(zp)->z_case == ZFS_CASE_INSENSITIVE && + (flag & ZCIEXACT)) || + (ZTOZSB(zp)->z_case == ZFS_CASE_MIXED && + !(flag & ZCILOOK))) { + mt |= MT_MATCH_CASE; + } + + error = zap_remove_norm(ZTOZSB(zp)->z_os, dzp->z_id, + dl->dl_name, mt, tx); + } else { + error = zap_remove(ZTOZSB(zp)->z_os, dzp->z_id, dl->dl_name, + tx); + } + + return (error); +} + +/* + * Unlink zp from dl, and mark zp for deletion if this was the last link. Can + * fail if zp is a mount point (EBUSY) or a non-empty directory (ENOTEMPTY). + * If 'unlinkedp' is NULL, we put unlinked znodes on the unlinked list. + * If it's non-NULL, we use it to indicate whether the znode needs deletion, + * and it's the caller's job to do it. 
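+ *
+ * (Editor's sketch) Passing a NULL 'unlinkedp' lets this function do the
+ * bookkeeping itself:
+ *
+ *	error = zfs_link_destroy(dl, zp, tx, flag, NULL);
+ *
+ * after which, on success, zp has already been placed on the unlinked
+ * set via zfs_unlinked_add() if this was its last link.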
+ */
+int
+zfs_link_destroy(zfs_dirlock_t *dl, znode_t *zp, dmu_tx_t *tx, int flag,
+    boolean_t *unlinkedp)
+{
+	znode_t *dzp = dl->dl_dzp;
+	zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+	int zp_is_dir = S_ISDIR(ZTOI(zp)->i_mode);
+	boolean_t unlinked = B_FALSE;
+	sa_bulk_attr_t bulk[5];
+	uint64_t mtime[2], ctime[2];
+	uint64_t links;
+	int count = 0;
+	int error;
+
+	if (!(flag & ZRENAMING)) {
+		mutex_enter(&zp->z_lock);
+
+		if (zp_is_dir && !zfs_dirempty(zp)) {
+			mutex_exit(&zp->z_lock);
+			return (SET_ERROR(ENOTEMPTY));
+		}
+
+		/*
+		 * If we get here, we are going to try to remove the object.
+		 * First try removing the name from the directory; if that
+		 * fails, return the error.
+		 */
+		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		if (error != 0) {
+			mutex_exit(&zp->z_lock);
+			return (error);
+		}
+
+		if (ZTOI(zp)->i_nlink <= zp_is_dir) {
+			zfs_panic_recover("zfs: link count on %lu is %u, "
+			    "should be at least %u", zp->z_id,
+			    (int)ZTOI(zp)->i_nlink, zp_is_dir + 1);
+			set_nlink(ZTOI(zp), zp_is_dir + 1);
+		}
+		drop_nlink(ZTOI(zp));
+		if (ZTOI(zp)->i_nlink == zp_is_dir) {
+			zp->z_unlinked = B_TRUE;
+			clear_nlink(ZTOI(zp));
+			unlinked = B_TRUE;
+		} else {
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+			    NULL, &ctime, sizeof (ctime));
+			SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+			    NULL, &zp->z_pflags, sizeof (zp->z_pflags));
+			zfs_tstamp_update_setup(zp, STATE_CHANGED, mtime,
+			    ctime);
+		}
+		links = ZTOI(zp)->i_nlink;
+		SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+		    NULL, &links, sizeof (links));
+		error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx);
+		count = 0;
+		ASSERT(error == 0);
+		mutex_exit(&zp->z_lock);
+	} else {
+		error = zfs_dropname(dl, zp, dzp, tx, flag);
+		if (error != 0)
+			return (error);
+	}
+
+	mutex_enter(&dzp->z_lock);
+	dzp->z_size--;		/* one dirent removed */
+	if (zp_is_dir)
+		drop_nlink(ZTOI(dzp));	/* ".." link from zp */
+	links = ZTOI(dzp)->i_nlink;
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs),
+	    NULL, &links, sizeof (links));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs),
+	    NULL, &dzp->z_size, sizeof (dzp->z_size));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs),
+	    NULL, ctime, sizeof (ctime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs),
+	    NULL, mtime, sizeof (mtime));
+	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs),
+	    NULL, &dzp->z_pflags, sizeof (dzp->z_pflags));
+	zfs_tstamp_update_setup(dzp, CONTENT_MODIFIED, mtime, ctime);
+	error = sa_bulk_update(dzp->z_sa_hdl, bulk, count, tx);
+	ASSERT(error == 0);
+	mutex_exit(&dzp->z_lock);
+
+	if (unlinkedp != NULL)
+		*unlinkedp = unlinked;
+	else if (unlinked)
+		zfs_unlinked_add(zp, tx);
+
+	return (0);
+}
+
+/*
+ * Indicate whether the directory is empty. Works with or without z_lock
+ * held, but can only be considered a hint in the latter case. Returns true
+ * if only "." and ".." remain and there's no work in progress.
+ *
+ * The internal ZAP size, rather than zp->z_size, needs to be checked since
+ * some consumers (Lustre) do not strictly maintain an accurate SA_ZPL_SIZE.
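+ *
+ * Note that ZFS does not store "." and ".." as ZAP entries, which is why
+ * an empty directory corresponds to a ZAP count of zero below, not two.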
+ */ +boolean_t +zfs_dirempty(znode_t *dzp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + uint64_t count; + int error; + + if (dzp->z_dirlocks != NULL) + return (B_FALSE); + + error = zap_count(zfsvfs->z_os, dzp->z_id, &count); + if (error != 0 || count != 0) + return (B_FALSE); + + return (B_TRUE); +} + +int +zfs_make_xattrdir(znode_t *zp, vattr_t *vap, znode_t **xzpp, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + dmu_tx_t *tx; + int error; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; +#ifdef ZFS_DEBUG + uint64_t parent; +#endif + + *xzpp = NULL; + + if ((error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr))) + return (error); + + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zp->z_projid)) { + zfs_acl_ids_free(&acl_ids); + return (SET_ERROR(EDQUOT)); + } + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + return (error); + } + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + +#ifdef ZFS_DEBUG + error = sa_lookup(xzp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent)); + ASSERT(error == 0 && parent == zp->z_id); +#endif + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), &xzp->z_id, + sizeof (xzp->z_id), tx)); + + if (!zp->z_unlinked) + (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + + *xzpp = xzp; + + return (0); +} + +/* + * Return a znode for the extended attribute directory for zp. + * ** If the directory does not already exist, it is created ** + * + * IN: zp - znode to obtain attribute directory from + * cr - credentials of caller + * flags - flags from the VOP_LOOKUP call + * + * OUT: xipp - pointer to extended attribute znode + * + * RETURN: 0 on success + * error number on failure + */ +int +zfs_get_xattrdir(znode_t *zp, znode_t **xzpp, cred_t *cr, int flags) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + znode_t *xzp; + zfs_dirlock_t *dl; + vattr_t va; + int error; +top: + error = zfs_dirent_lock(&dl, zp, "", &xzp, ZXATTR, NULL, NULL); + if (error) + return (error); + + if (xzp != NULL) { + *xzpp = xzp; + zfs_dirent_unlock(dl); + return (0); + } + + if (!(flags & CREATE_XATTR_DIR)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(ENOENT)); + } + + if (zfs_is_readonly(zfsvfs)) { + zfs_dirent_unlock(dl); + return (SET_ERROR(EROFS)); + } + + /* + * The ability to 'create' files in an attribute + * directory comes from the write_xattr permission on the base file. + * + * The ability to 'search' an attribute directory requires + * read_xattr permission on the base file. + * + * Once in a directory the ability to read/write attributes + * is controlled by the permissions on the attribute file. 
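+	 *
+	 * Roughly (an illustrative mapping, not an exhaustive one): creating
+	 * an entry here requires ACE_WRITE_NAMED_ATTRS on the base file --
+	 * the check zfs_make_xattrdir() performs above -- while listing
+	 * entries requires ACE_READ_NAMED_ATTRS on the base file.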
+ */ + va.va_mask = ATTR_MODE | ATTR_UID | ATTR_GID; + va.va_mode = S_IFDIR | S_ISVTX | 0777; + zfs_fuid_map_ids(zp, cr, &va.va_uid, &va.va_gid); + + va.va_dentry = NULL; + error = zfs_make_xattrdir(zp, &va, xzpp, cr); + zfs_dirent_unlock(dl); + + if (error == ERESTART) { + /* NB: we already did dmu_tx_wait() if necessary */ + goto top; + } + + return (error); +} + +/* + * Decide whether it is okay to remove within a sticky directory. + * + * In sticky directories, write access is not sufficient; + * you can remove entries from a directory only if: + * + * you own the directory, + * you own the entry, + * you have write access to the entry, + * or you are privileged (checked in secpolicy...). + * + * The function returns 0 if remove access is granted. + */ +int +zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) +{ + uid_t uid; + uid_t downer; + uid_t fowner; + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + + if (zfsvfs->z_replay) + return (0); + + if ((zdp->z_mode & S_ISVTX) == 0) + return (0); + + downer = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zdp)->i_uid), + cr, ZFS_OWNER); + fowner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(ZTOI(zp)->i_uid), + cr, ZFS_OWNER); + + if ((uid = crgetuid(cr)) == downer || uid == fowner || + zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr) == 0) + return (0); + else + return (secpolicy_vnode_remove(cr)); +} diff --git a/module/os/linux/zfs/zfs_file_os.c b/module/os/linux/zfs/zfs_file_os.c new file mode 100644 index 000000000000..99c6ffc95940 --- /dev/null +++ b/module/os/linux/zfs/zfs_file_os.c @@ -0,0 +1,440 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +#include +#include +#include +#include +#include +#include +#include +#ifdef HAVE_FDTABLE_HEADER +#include +#endif + +/* + * Open file + * + * path - fully qualified path to file + * flags - file attributes O_READ / O_WRITE / O_EXCL + * fpp - pointer to return file pointer + * + * Returns 0 on success underlying error on failure. 
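+ *
+ * A minimal usage sketch (illustrative only; the path is an example):
+ *
+ *	zfs_file_t *fp;
+ *	int err = zfs_file_open("/etc/hostid", O_RDONLY, 0, &fp);
+ *	if (err == 0) {
+ *		// ... zfs_file_read() / zfs_file_pread() ...
+ *		zfs_file_close(fp);
+ *	}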
+ */
+int
+zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp)
+{
+	struct file *filp;
+	int saved_umask;
+
+	if (!(flags & O_CREAT) && (flags & O_WRONLY))
+		flags |= O_EXCL;
+
+	if (flags & O_CREAT)
+		saved_umask = xchg(&current->fs->umask, 0);
+
+	filp = filp_open(path, flags, mode);
+
+	if (flags & O_CREAT)
+		(void) xchg(&current->fs->umask, saved_umask);
+
+	if (IS_ERR(filp))
+		return (-PTR_ERR(filp));
+
+	*fpp = filp;
+	return (0);
+}
+
+void
+zfs_file_close(zfs_file_t *fp)
+{
+	filp_close(fp, 0);
+}
+
+static ssize_t
+zfs_file_write_impl(zfs_file_t *fp, const void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_WRITE_PPOS)
+	return (kernel_write(fp, buf, count, off));
+#else
+	mm_segment_t saved_fs;
+	ssize_t rc;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	rc = vfs_write(fp, (__force const char __user *)buf, count, off);
+
+	set_fs(saved_fs);
+
+	return (rc);
+#endif
+}
+
+/*
+ * Stateful write - use os internal file pointer to determine where to
+ * write and update on successful completion.
+ *
+ * fp - pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * resid - pointer to count of unwritten bytes (if short write)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_write(zfs_file_t *fp, const void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_pos;
+	ssize_t rc;
+
+	rc = zfs_file_write_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	fp->f_pos = off;
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * Stateless write - os internal file pointer is not updated.
+ *
+ * fp - pointer to file (pipe, socket, etc) to write to
+ * buf - buffer to write
+ * count - # of bytes to write
+ * off - file offset to write to (only valid for seekable types)
+ * resid - pointer to count of unwritten bytes
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pwrite(zfs_file_t *fp, const void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	ssize_t rc;
+
+	rc = zfs_file_write_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+static ssize_t
+zfs_file_read_impl(zfs_file_t *fp, void *buf, size_t count, loff_t *off)
+{
+#if defined(HAVE_KERNEL_READ_PPOS)
+	return (kernel_read(fp, buf, count, off));
+#else
+	mm_segment_t saved_fs;
+	ssize_t rc;
+
+	saved_fs = get_fs();
+	set_fs(KERNEL_DS);
+
+	rc = vfs_read(fp, (void __user *)buf, count, off);
+	set_fs(saved_fs);
+
+	return (rc);
+#endif
+}
+
+/*
+ * Stateful read - use os internal file pointer to determine where to
+ * read and update on successful completion.
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to read into
+ * count - # of bytes to read
+ * resid - pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_read(zfs_file_t *fp, void *buf, size_t count, ssize_t *resid)
+{
+	loff_t off = fp->f_pos;
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	fp->f_pos = off;
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
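+
+/*
+ * For illustration: a caller that must not disturb the shared f_pos, such
+ * as one reading a header at a fixed offset, would use the stateless form
+ * below, while sequential consumption of a pipe or socket must use
+ * zfs_file_read() above, which advances the position on success. Sketch:
+ *
+ *	char hdr[512];
+ *	ssize_t resid = 0;
+ *	int err = zfs_file_pread(fp, hdr, sizeof (hdr), 0, &resid);
+ */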
+
+/*
+ * Stateless read - os internal file pointer is not updated.
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * buf - buffer to read into
+ * count - # of bytes to read
+ * off - file offset to read from (only valid for seekable types)
+ * resid - pointer to count of unread bytes (if short read)
+ *
+ * Returns 0 on success errno on failure.
+ */
+int
+zfs_file_pread(zfs_file_t *fp, void *buf, size_t count, loff_t off,
+    ssize_t *resid)
+{
+	ssize_t rc;
+
+	rc = zfs_file_read_impl(fp, buf, count, &off);
+	if (rc < 0)
+		return (-rc);
+
+	if (resid) {
+		*resid = count - rc;
+	} else if (rc != count) {
+		return (EIO);
+	}
+
+	return (0);
+}
+
+/*
+ * lseek - set / get file pointer
+ *
+ * fp - pointer to file (pipe, socket, etc) to read from
+ * offp - value to seek to, returns current value plus passed offset
+ * whence - see man pages for standard lseek whence values
+ *
+ * Returns 0 on success errno on failure (ESPIPE for non seekable types)
+ */
+int
+zfs_file_seek(zfs_file_t *fp, loff_t *offp, int whence)
+{
+	loff_t rc;
+
+	if (*offp < 0 || *offp > MAXOFFSET_T)
+		return (EINVAL);
+
+	rc = vfs_llseek(fp, *offp, whence);
+	if (rc < 0)
+		return (-rc);
+
+	*offp = rc;
+
+	return (0);
+}
+
+/*
+ * Get file attributes
+ *
+ * filp - file pointer
+ * zfattr - pointer to file attr structure
+ *
+ * Currently only used for fetching size and file mode.
+ *
+ * Returns 0 on success or error code of underlying getattr call on failure.
+ */
+int
+zfs_file_getattr(zfs_file_t *filp, zfs_file_attr_t *zfattr)
+{
+	struct kstat stat;
+	int rc;
+
+#if defined(HAVE_4ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&filp->f_path, &stat, STATX_BASIC_STATS,
+	    AT_STATX_SYNC_AS_STAT);
+#elif defined(HAVE_2ARGS_VFS_GETATTR)
+	rc = vfs_getattr(&filp->f_path, &stat);
+#else
+	rc = vfs_getattr(filp->f_path.mnt, filp->f_dentry, &stat);
+#endif
+	if (rc)
+		return (-rc);
+
+	zfattr->zfa_size = stat.size;
+	zfattr->zfa_mode = stat.mode;
+
+	return (0);
+}
+
+/*
+ * Sync file to disk
+ *
+ * filp - file pointer
+ * flags - O_SYNC and or O_DSYNC
+ *
+ * Returns 0 on success or error code of underlying sync call on failure.
+ */
+int
+zfs_file_fsync(zfs_file_t *filp, int flags)
+{
+	int datasync = 0;
+	int error;
+	int fstrans;
+
+	if (flags & O_DSYNC)
+		datasync = 1;
+
+	/*
+	 * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over vfs_fsync() and then reset.
+	 */
+	fstrans = __spl_pf_fstrans_check();
+	if (fstrans)
+		current->flags &= ~(__SPL_PF_FSTRANS);
+
+	error = -vfs_fsync(filp, datasync);
+
+	if (fstrans)
+		current->flags |= __SPL_PF_FSTRANS;
+
+	return (error);
+}
+
+/*
+ * fallocate - allocate or free space on disk
+ *
+ * fp - file pointer
+ * mode (non-standard options for hole punching etc)
+ * offset - offset to start allocating or freeing from
+ * len - length to free / allocate
+ *
+ * OPTIONAL
+ */
+int
+zfs_file_fallocate(zfs_file_t *fp, int mode, loff_t offset, loff_t len)
+{
+	/*
+	 * May enter XFS which generates a warning when PF_FSTRANS is set.
+	 * To avoid this the flag is cleared over the fallocate call and
+	 * then reset.
+	 */
+	int fstrans = __spl_pf_fstrans_check();
+	if (fstrans)
+		current->flags &= ~(__SPL_PF_FSTRANS);
+
+	/*
+	 * When supported by the underlying file system preferentially
+	 * use the fallocate() callback to preallocate the space.
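+	 * (For illustration: a typical hole-punch request would pass
+	 * mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE; whether it
+	 * succeeds depends entirely on the underlying file system.)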
+ */ + int error = EOPNOTSUPP; + if (fp->f_op->fallocate) + error = fp->f_op->fallocate(fp, mode, offset, len); + + if (fstrans) + current->flags |= __SPL_PF_FSTRANS; + + return (error); +} + +/* + * Request current file pointer offset + * + * fp - pointer to file + * + * Returns current file offset. + */ +loff_t +zfs_file_off(zfs_file_t *fp) +{ + return (fp->f_pos); +} + +/* + * Request file pointer private data + * + * fp - pointer to file + * + * Returns pointer to file private data. + */ +void * +zfs_file_private(zfs_file_t *fp) +{ + return (fp->private_data); +} + +/* + * unlink file + * + * path - fully qualified file path + * + * Returns 0 on success. + * + * OPTIONAL + */ +int +zfs_file_unlink(const char *path) +{ + return (EOPNOTSUPP); +} + +/* + * Get reference to file pointer + * + * fd - input file descriptor + * fpp - pointer to file pointer + * + * Returns 0 on success EBADF on failure. + */ +int +zfs_file_get(int fd, zfs_file_t **fpp) +{ + zfs_file_t *fp; + + fp = fget(fd); + if (fp == NULL) + return (EBADF); + + *fpp = fp; + + return (0); +} + +/* + * Drop reference to file pointer + * + * fd - input file descriptor + */ +void +zfs_file_put(int fd) +{ + struct file *fp; + + if ((fp = fget(fd)) != NULL) { + fput(fp); + fput(fp); + } +} diff --git a/module/os/linux/zfs/zfs_ioctl_os.c b/module/os/linux/zfs/zfs_ioctl_os.c new file mode 100644 index 000000000000..b88e0497d000 --- /dev/null +++ b/module/os/linux/zfs/zfs_ioctl_os.c @@ -0,0 +1,329 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Portions Copyright 2012 Pawel Jakub Dawidek + * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright 2017 RackTop Systems. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +boolean_t +zfs_vfs_held(zfsvfs_t *zfsvfs) +{ + return (zfsvfs->z_sb != NULL); +} + +int +zfs_vfs_ref(zfsvfs_t **zfvp) +{ + if (*zfvp == NULL || (*zfvp)->z_sb == NULL || + !atomic_inc_not_zero(&((*zfvp)->z_sb->s_active))) { + return (SET_ERROR(ESRCH)); + } + return (0); +} + +void +zfs_vfs_rele(zfsvfs_t *zfsvfs) +{ + deactivate_super(zfsvfs->z_sb); +} + +static int +zfsdev_state_init(struct file *filp) +{ + zfsdev_state_t *zs, *zsprev = NULL; + minor_t minor; + boolean_t newzs = B_FALSE; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + minor = zfsdev_minor_alloc(); + if (minor == 0) + return (SET_ERROR(ENXIO)); + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == -1) + break; + zsprev = zs; + } + + if (!zs) { + zs = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); + newzs = B_TRUE; + } + + filp->private_data = zs; + + zfs_onexit_init((zfs_onexit_t **)&zs->zs_onexit); + zfs_zevent_init((zfs_zevent_t **)&zs->zs_zevent); + + /* + * In order to provide for lock-free concurrent read access + * to the minor list in zfsdev_get_state_impl(), new entries + * must be completely written before linking them into the + * list whereas existing entries are already linked; the last + * operation must be updating zs_minor (from -1 to the new + * value). + */ + if (newzs) { + zs->zs_minor = minor; + smp_wmb(); + zsprev->zs_next = zs; + } else { + smp_wmb(); + zs->zs_minor = minor; + } + + return (0); +} + +static int +zfsdev_state_destroy(struct file *filp) +{ + zfsdev_state_t *zs; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + ASSERT(filp->private_data != NULL); + + zs = filp->private_data; + zs->zs_minor = -1; + zfs_onexit_destroy(zs->zs_onexit); + zfs_zevent_destroy(zs->zs_zevent); + zs->zs_onexit = NULL; + zs->zs_zevent = NULL; + + return (0); +} + +static int +zfsdev_open(struct inode *ino, struct file *filp) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfsdev_state_init(filp); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static int +zfsdev_release(struct inode *ino, struct file *filp) +{ + int error; + + mutex_enter(&zfsdev_state_lock); + error = zfsdev_state_destroy(filp); + mutex_exit(&zfsdev_state_lock); + + return (-error); +} + +static long +zfsdev_ioctl(struct file *filp, unsigned cmd, unsigned long arg) +{ + uint_t vecnum; + zfs_cmd_t *zc; + int error, rc; + + vecnum = cmd - ZFS_IOC_FIRST; + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + + if (ddi_copyin((void *)(uintptr_t)arg, zc, sizeof (zfs_cmd_t), 0)) { + error = -SET_ERROR(EFAULT); + goto out; + } + error = -zfsdev_ioctl_common(vecnum, zc, 0); + rc = ddi_copyout(zc, (void *)(uintptr_t)arg, sizeof (zfs_cmd_t), 0); + if (error == 0 && rc != 0) + error = -SET_ERROR(EFAULT); +out: + kmem_free(zc, sizeof (zfs_cmd_t)); + return (error); + +} + +uint64_t +zfs_max_nvlist_src_size_os(void) +{ + if (zfs_max_nvlist_src_size != 0) + return (zfs_max_nvlist_src_size); + + return (KMALLOC_MAX_SIZE); +} + +void +zfs_ioctl_init_os(void) +{ +} + +#ifdef CONFIG_COMPAT +static long +zfsdev_compat_ioctl(struct file *filp, unsigned cmd, unsigned long arg) +{ + return (zfsdev_ioctl(filp, cmd, arg)); +} +#else +#define zfsdev_compat_ioctl NULL +#endif + +static const struct file_operations zfsdev_fops = { + .open = zfsdev_open, + .release = zfsdev_release, 
+ .unlocked_ioctl = zfsdev_ioctl, + .compat_ioctl = zfsdev_compat_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice zfs_misc = { + .minor = ZFS_DEVICE_MINOR, + .name = ZFS_DRIVER, + .fops = &zfsdev_fops, +}; + +MODULE_ALIAS_MISCDEV(ZFS_DEVICE_MINOR); +MODULE_ALIAS("devname:zfs"); + +int +zfsdev_attach(void) +{ + int error; + + error = misc_register(&zfs_misc); + if (error == -EBUSY) { + /* + * Fallback to dynamic minor allocation in the event of a + * collision with a reserved minor in linux/miscdevice.h. + * In this case the kernel modules must be manually loaded. + */ + printk(KERN_INFO "ZFS: misc_register() with static minor %d " + "failed %d, retrying with MISC_DYNAMIC_MINOR\n", + ZFS_DEVICE_MINOR, error); + + zfs_misc.minor = MISC_DYNAMIC_MINOR; + error = misc_register(&zfs_misc); + } + + if (error) + printk(KERN_INFO "ZFS: misc_register() failed %d\n", error); + + return (error); +} + +void +zfsdev_detach(void) +{ + misc_deregister(&zfs_misc); +} + +#ifdef ZFS_DEBUG +#define ZFS_DEBUG_STR " (DEBUG mode)" +#else +#define ZFS_DEBUG_STR "" +#endif + +static int __init +_init(void) +{ + int error; + + if ((error = zfs_kmod_init()) != 0) { + printk(KERN_NOTICE "ZFS: Failed to Load ZFS Filesystem v%s-%s%s" + ", rc = %d\n", ZFS_META_VERSION, ZFS_META_RELEASE, + ZFS_DEBUG_STR, error); + + return (-error); + } + + zfs_sysfs_init(); + + printk(KERN_NOTICE "ZFS: Loaded module v%s-%s%s, " + "ZFS pool version %s, ZFS filesystem version %s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR, + SPA_VERSION_STRING, ZPL_VERSION_STRING); +#ifndef CONFIG_FS_POSIX_ACL + printk(KERN_NOTICE "ZFS: Posix ACLs disabled by kernel\n"); +#endif /* CONFIG_FS_POSIX_ACL */ + + return (0); +} + +static void __exit +_fini(void) +{ + zfs_sysfs_fini(); + zfs_kmod_fini(); + + printk(KERN_NOTICE "ZFS: Unloaded module v%s-%s%s\n", + ZFS_META_VERSION, ZFS_META_RELEASE, ZFS_DEBUG_STR); +} + +#if defined(_KERNEL) +module_init(_init); +module_exit(_fini); +#endif + +ZFS_MODULE_DESCRIPTION("ZFS"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); diff --git a/module/os/linux/zfs/zfs_sysfs.c b/module/os/linux/zfs/zfs_sysfs.c new file mode 100644 index 000000000000..fb7c68987360 --- /dev/null +++ b/module/os/linux/zfs/zfs_sysfs.c @@ -0,0 +1,662 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" + +#if !defined(_KERNEL) +#error kernel builds only +#endif + +/* + * ZFS Module sysfs support + * + * This extends our sysfs '/sys/module/zfs' entry to include feature + * and property attributes. The primary consumer of this information + * is user processes, like the zfs CLI, that need to know what the + * current loaded ZFS module supports. The libzfs binary will consult + * this information when instantiating the zfs|zpool property tables + * and the pool features table. + * + * The added top-level directories are: + * /sys/module/zfs + * ├── features.kernel + * ├── features.pool + * ├── properties.dataset + * └── properties.pool + * + * The local interface for the zfs kobjects includes: + * zfs_kobj_init() + * zfs_kobj_add() + * zfs_kobj_release() + * zfs_kobj_add_attr() + * zfs_kobj_fini() + */ + +/* + * A zfs_mod_kobj_t represents a zfs kobject under '/sys/module/zfs' + */ +struct zfs_mod_kobj; +typedef struct zfs_mod_kobj zfs_mod_kobj_t; + +struct zfs_mod_kobj { + struct kobject zko_kobj; + struct kobj_type zko_kobj_type; + struct sysfs_ops zko_sysfs_ops; + size_t zko_attr_count; + struct attribute *zko_attr_list; /* allocated */ + struct attribute **zko_default_attrs; /* allocated */ + size_t zko_child_count; + zfs_mod_kobj_t *zko_children; /* allocated */ +}; + +#define ATTR_TABLE_SIZE(cnt) (sizeof (struct attribute) * (cnt)) +/* Note +1 for NULL terminator slot */ +#define DEFAULT_ATTR_SIZE(cnt) (sizeof (struct attribute *) * (cnt + 1)) +#define CHILD_TABLE_SIZE(cnt) (sizeof (zfs_mod_kobj_t) * (cnt)) + +/* + * These are the top-level kobjects under '/sys/module/zfs/' + */ +static zfs_mod_kobj_t kernel_features_kobj; +static zfs_mod_kobj_t pool_features_kobj; +static zfs_mod_kobj_t dataset_props_kobj; +static zfs_mod_kobj_t pool_props_kobj; + +/* + * The show function is used to provide the content + * of an attribute into a PAGE_SIZE buffer. 
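+ *
+ * A minimal show callback sketch (illustrative only):
+ *
+ *	static ssize_t
+ *	example_show(struct kobject *kobj, struct attribute *attr, char *buf)
+ *	{
+ *		return (snprintf(buf, PAGE_SIZE, "example\n"));
+ *	}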
+ */ +typedef ssize_t (*sysfs_show_func)(struct kobject *, struct attribute *, + char *); + +static void +zfs_kobj_fini(zfs_mod_kobj_t *zkobj) +{ + /* finalize any child kobjects */ + if (zkobj->zko_child_count != 0) { + ASSERT(zkobj->zko_children); + for (int i = 0; i < zkobj->zko_child_count; i++) + zfs_kobj_fini(&zkobj->zko_children[i]); + } + + /* kobject_put() will call zfs_kobj_release() to release memory */ + kobject_del(&zkobj->zko_kobj); + kobject_put(&zkobj->zko_kobj); +} + +static void +zfs_kobj_release(struct kobject *kobj) +{ + zfs_mod_kobj_t *zkobj = container_of(kobj, zfs_mod_kobj_t, zko_kobj); + + if (zkobj->zko_attr_list != NULL) { + ASSERT3S(zkobj->zko_attr_count, !=, 0); + kmem_free(zkobj->zko_attr_list, + ATTR_TABLE_SIZE(zkobj->zko_attr_count)); + zkobj->zko_attr_list = NULL; + } + + if (zkobj->zko_default_attrs != NULL) { + kmem_free(zkobj->zko_default_attrs, + DEFAULT_ATTR_SIZE(zkobj->zko_attr_count)); + zkobj->zko_default_attrs = NULL; + } + + if (zkobj->zko_child_count != 0) { + ASSERT(zkobj->zko_children); + + kmem_free(zkobj->zko_children, + CHILD_TABLE_SIZE(zkobj->zko_child_count)); + zkobj->zko_child_count = 0; + zkobj->zko_children = NULL; + } + + zkobj->zko_attr_count = 0; +} + +#ifndef sysfs_attr_init +#define sysfs_attr_init(attr) do {} while (0) +#endif + +static void +zfs_kobj_add_attr(zfs_mod_kobj_t *zkobj, int attr_num, const char *attr_name) +{ + VERIFY3U(attr_num, <, zkobj->zko_attr_count); + ASSERT(zkobj->zko_attr_list); + ASSERT(zkobj->zko_default_attrs); + + zkobj->zko_attr_list[attr_num].name = attr_name; + zkobj->zko_attr_list[attr_num].mode = 0444; + zkobj->zko_default_attrs[attr_num] = &zkobj->zko_attr_list[attr_num]; + sysfs_attr_init(&zkobj->zko_attr_list[attr_num]); +} + +static int +zfs_kobj_init(zfs_mod_kobj_t *zkobj, int attr_cnt, int child_cnt, + sysfs_show_func show_func) +{ + /* + * Initialize object's attributes. Count can be zero. 
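+	 * When attr_cnt is zero only the NULL-terminated default-attribute
+	 * array is allocated; pure container nodes such as features.kernel
+	 * itself are created that way.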
+ */ + if (attr_cnt > 0) { + zkobj->zko_attr_list = kmem_zalloc(ATTR_TABLE_SIZE(attr_cnt), + KM_SLEEP); + if (zkobj->zko_attr_list == NULL) + return (ENOMEM); + } + /* this will always have at least one slot for NULL termination */ + zkobj->zko_default_attrs = kmem_zalloc(DEFAULT_ATTR_SIZE(attr_cnt), + KM_SLEEP); + if (zkobj->zko_default_attrs == NULL) { + if (zkobj->zko_attr_list != NULL) { + kmem_free(zkobj->zko_attr_list, + ATTR_TABLE_SIZE(attr_cnt)); + } + return (ENOMEM); + } + zkobj->zko_attr_count = attr_cnt; + zkobj->zko_kobj_type.default_attrs = zkobj->zko_default_attrs; + + if (child_cnt > 0) { + zkobj->zko_children = kmem_zalloc(CHILD_TABLE_SIZE(child_cnt), + KM_SLEEP); + if (zkobj->zko_children == NULL) { + if (zkobj->zko_default_attrs != NULL) { + kmem_free(zkobj->zko_default_attrs, + DEFAULT_ATTR_SIZE(attr_cnt)); + } + if (zkobj->zko_attr_list != NULL) { + kmem_free(zkobj->zko_attr_list, + ATTR_TABLE_SIZE(attr_cnt)); + } + return (ENOMEM); + } + zkobj->zko_child_count = child_cnt; + } + + zkobj->zko_sysfs_ops.show = show_func; + zkobj->zko_kobj_type.sysfs_ops = &zkobj->zko_sysfs_ops; + zkobj->zko_kobj_type.release = zfs_kobj_release; + + return (0); +} + +static int +zfs_kobj_add(zfs_mod_kobj_t *zkobj, struct kobject *parent, const char *name) +{ + /* zko_default_attrs must be NULL terminated */ + ASSERT(zkobj->zko_default_attrs != NULL); + ASSERT(zkobj->zko_default_attrs[zkobj->zko_attr_count] == NULL); + + kobject_init(&zkobj->zko_kobj, &zkobj->zko_kobj_type); + return (kobject_add(&zkobj->zko_kobj, parent, name)); +} + +/* + * Each zfs property has these common attributes + */ +static const char *zprop_attrs[] = { + "type", + "readonly", + "setonce", + "visible", + "values", + "default", + "datasets" /* zfs properties only */ +}; + +#define ZFS_PROP_ATTR_COUNT ARRAY_SIZE(zprop_attrs) +#define ZPOOL_PROP_ATTR_COUNT (ZFS_PROP_ATTR_COUNT - 1) + +static const char *zprop_types[] = { + "number", + "string", + "index", +}; + +typedef struct zfs_type_map { + zfs_type_t ztm_type; + const char *ztm_name; +} zfs_type_map_t; + +static zfs_type_map_t type_map[] = { + {ZFS_TYPE_FILESYSTEM, "filesystem"}, + {ZFS_TYPE_SNAPSHOT, "snapshot"}, + {ZFS_TYPE_VOLUME, "volume"}, + {ZFS_TYPE_BOOKMARK, "bookmark"} +}; + +/* + * Show the content for a zfs property attribute + */ +static ssize_t +zprop_sysfs_show(const char *attr_name, const zprop_desc_t *property, + char *buf, size_t buflen) +{ + const char *show_str; + char number[32]; + + /* For dataset properties list the dataset types that apply */ + if (strcmp(attr_name, "datasets") == 0 && + property->pd_types != ZFS_TYPE_POOL) { + int len = 0; + + for (int i = 0; i < ARRAY_SIZE(type_map); i++) { + if (type_map[i].ztm_type & property->pd_types) { + len += snprintf(buf + len, buflen - len, "%s ", + type_map[i].ztm_name); + } + } + len += snprintf(buf + len, buflen - len, "\n"); + return (len); + } + + if (strcmp(attr_name, "type") == 0) { + show_str = zprop_types[property->pd_proptype]; + } else if (strcmp(attr_name, "readonly") == 0) { + show_str = property->pd_attr == PROP_READONLY ? "1" : "0"; + } else if (strcmp(attr_name, "setonce") == 0) { + show_str = property->pd_attr == PROP_ONETIME ? "1" : "0"; + } else if (strcmp(attr_name, "visible") == 0) { + show_str = property->pd_visible ? "1" : "0"; + } else if (strcmp(attr_name, "values") == 0) { + show_str = property->pd_values ? 
property->pd_values : ""; + } else if (strcmp(attr_name, "default") == 0) { + switch (property->pd_proptype) { + case PROP_TYPE_NUMBER: + (void) snprintf(number, sizeof (number), "%llu", + (u_longlong_t)property->pd_numdefault); + show_str = number; + break; + case PROP_TYPE_STRING: + show_str = property->pd_strdefault ? + property->pd_strdefault : ""; + break; + case PROP_TYPE_INDEX: + if (zprop_index_to_string(property->pd_propnum, + property->pd_numdefault, &show_str, + property->pd_types) != 0) { + show_str = ""; + } + break; + default: + return (0); + } + } else { + return (0); + } + + return (snprintf(buf, buflen, "%s\n", show_str)); +} + +static ssize_t +dataset_property_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + zfs_prop_t prop = zfs_name_to_prop(kobject_name(kobj)); + zprop_desc_t *prop_tbl = zfs_prop_get_table(); + ssize_t len; + + ASSERT3U(prop, <, ZFS_NUM_PROPS); + + len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE); + + return (len); +} + +static ssize_t +pool_property_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + zpool_prop_t prop = zpool_name_to_prop(kobject_name(kobj)); + zprop_desc_t *prop_tbl = zpool_prop_get_table(); + ssize_t len; + + ASSERT3U(prop, <, ZPOOL_NUM_PROPS); + + len = zprop_sysfs_show(attr->name, &prop_tbl[prop], buf, PAGE_SIZE); + + return (len); +} + +/* + * ZFS kernel feature attributes for '/sys/module/zfs/features.kernel' + * + * This list is intended for kernel features that don't have a pool feature + * association or that extend existing user kernel interfaces. + * + * A user process can easily check if the running zfs kernel module + * supports the new feature. + */ +static const char *zfs_kernel_features[] = { + /* --> Add new kernel features here */ + "com.delphix:vdev_initialize", + "org.zfsonlinux:vdev_trim", + "org.openzfs:l2arc_persistent", +}; + +#define KERNEL_FEATURE_COUNT ARRAY_SIZE(zfs_kernel_features) + +static ssize_t +kernel_feature_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + if (strcmp(attr->name, "supported") == 0) + return (snprintf(buf, PAGE_SIZE, "yes\n")); + return (0); +} + +static void +kernel_feature_to_kobj(zfs_mod_kobj_t *parent, int slot, const char *name) +{ + zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[slot]; + + ASSERT3U(slot, <, KERNEL_FEATURE_COUNT); + ASSERT(name); + + int err = zfs_kobj_init(zfs_kobj, 1, 0, kernel_feature_show); + if (err) + return; + + zfs_kobj_add_attr(zfs_kobj, 0, "supported"); + + err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); + if (err) + zfs_kobj_release(&zfs_kobj->zko_kobj); +} + +static int +zfs_kernel_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) +{ + /* + * Create a parent kobject to host kernel features. + * + * '/sys/module/zfs/features.kernel' + */ + int err = zfs_kobj_init(zfs_kobj, 0, KERNEL_FEATURE_COUNT, + kernel_feature_show); + if (err) + return (err); + err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_KERNEL_FEATURES); + if (err) { + zfs_kobj_release(&zfs_kobj->zko_kobj); + return (err); + } + + /* + * Now create a kobject for each feature. 
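+	 * (Illustrative check from userland:
+	 * 'cat /sys/module/zfs/features.kernel/com.delphix:vdev_initialize/supported'
+	 * is expected to print "yes".)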
+ * + * '/sys/module/zfs/features.kernel/' + */ + for (int f = 0; f < KERNEL_FEATURE_COUNT; f++) + kernel_feature_to_kobj(zfs_kobj, f, zfs_kernel_features[f]); + + return (0); +} + +/* + * Each pool feature has these common attributes + */ +static const char *pool_feature_attrs[] = { + "description", + "guid", + "uname", + "readonly_compatible", + "required_for_mos", + "activate_on_enable", + "per_dataset" +}; + +#define ZPOOL_FEATURE_ATTR_COUNT ARRAY_SIZE(pool_feature_attrs) + +/* + * Show the content for the given zfs pool feature attribute + */ +static ssize_t +pool_feature_show(struct kobject *kobj, struct attribute *attr, char *buf) +{ + spa_feature_t fid; + + if (zfeature_lookup_guid(kobject_name(kobj), &fid) != 0) + return (0); + + ASSERT3U(fid, <, SPA_FEATURES); + + zfeature_flags_t flags = spa_feature_table[fid].fi_flags; + const char *show_str = NULL; + + if (strcmp(attr->name, "description") == 0) { + show_str = spa_feature_table[fid].fi_desc; + } else if (strcmp(attr->name, "guid") == 0) { + show_str = spa_feature_table[fid].fi_guid; + } else if (strcmp(attr->name, "uname") == 0) { + show_str = spa_feature_table[fid].fi_uname; + } else if (strcmp(attr->name, "readonly_compatible") == 0) { + show_str = flags & ZFEATURE_FLAG_READONLY_COMPAT ? "1" : "0"; + } else if (strcmp(attr->name, "required_for_mos") == 0) { + show_str = flags & ZFEATURE_FLAG_MOS ? "1" : "0"; + } else if (strcmp(attr->name, "activate_on_enable") == 0) { + show_str = flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE ? "1" : "0"; + } else if (strcmp(attr->name, "per_dataset") == 0) { + show_str = flags & ZFEATURE_FLAG_PER_DATASET ? "1" : "0"; + } + if (show_str == NULL) + return (0); + + return (snprintf(buf, PAGE_SIZE, "%s\n", show_str)); +} + +static void +pool_feature_to_kobj(zfs_mod_kobj_t *parent, spa_feature_t fid, + const char *name) +{ + zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[fid]; + + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(name); + + int err = zfs_kobj_init(zfs_kobj, ZPOOL_FEATURE_ATTR_COUNT, 0, + pool_feature_show); + if (err) + return; + + for (int i = 0; i < ZPOOL_FEATURE_ATTR_COUNT; i++) + zfs_kobj_add_attr(zfs_kobj, i, pool_feature_attrs[i]); + + err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); + if (err) + zfs_kobj_release(&zfs_kobj->zko_kobj); +} + +static int +zfs_pool_features_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent) +{ + /* + * Create a parent kobject to host pool features. + * + * '/sys/module/zfs/features.pool' + */ + int err = zfs_kobj_init(zfs_kobj, 0, SPA_FEATURES, pool_feature_show); + if (err) + return (err); + err = zfs_kobj_add(zfs_kobj, parent, ZFS_SYSFS_POOL_FEATURES); + if (err) { + zfs_kobj_release(&zfs_kobj->zko_kobj); + return (err); + } + + /* + * Now create a kobject for each feature. 
+ * + * '/sys/module/zfs/features.pool/' + */ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) + pool_feature_to_kobj(zfs_kobj, i, spa_feature_table[i].fi_guid); + + return (0); +} + +typedef struct prop_to_kobj_arg { + zprop_desc_t *p2k_table; + zfs_mod_kobj_t *p2k_parent; + sysfs_show_func p2k_show_func; + int p2k_attr_count; +} prop_to_kobj_arg_t; + +static int +zprop_to_kobj(int prop, void *args) +{ + prop_to_kobj_arg_t *data = args; + zfs_mod_kobj_t *parent = data->p2k_parent; + zfs_mod_kobj_t *zfs_kobj = &parent->zko_children[prop]; + const char *name = data->p2k_table[prop].pd_name; + int err; + + ASSERT(name); + + err = zfs_kobj_init(zfs_kobj, data->p2k_attr_count, 0, + data->p2k_show_func); + if (err) + return (ZPROP_CONT); + + for (int i = 0; i < data->p2k_attr_count; i++) + zfs_kobj_add_attr(zfs_kobj, i, zprop_attrs[i]); + + err = zfs_kobj_add(zfs_kobj, &parent->zko_kobj, name); + if (err) + zfs_kobj_release(&zfs_kobj->zko_kobj); + + return (ZPROP_CONT); +} + +static int +zfs_sysfs_properties_init(zfs_mod_kobj_t *zfs_kobj, struct kobject *parent, + zfs_type_t type) +{ + prop_to_kobj_arg_t context; + const char *name; + int err; + + /* + * Create a parent kobject to host properties. + * + * '/sys/module/zfs/properties.' + */ + if (type == ZFS_TYPE_POOL) { + name = ZFS_SYSFS_POOL_PROPERTIES; + context.p2k_table = zpool_prop_get_table(); + context.p2k_attr_count = ZPOOL_PROP_ATTR_COUNT; + context.p2k_parent = zfs_kobj; + context.p2k_show_func = pool_property_show; + err = zfs_kobj_init(zfs_kobj, 0, ZPOOL_NUM_PROPS, + pool_property_show); + } else { + name = ZFS_SYSFS_DATASET_PROPERTIES; + context.p2k_table = zfs_prop_get_table(); + context.p2k_attr_count = ZFS_PROP_ATTR_COUNT; + context.p2k_parent = zfs_kobj; + context.p2k_show_func = dataset_property_show; + err = zfs_kobj_init(zfs_kobj, 0, ZFS_NUM_PROPS, + dataset_property_show); + } + + if (err) + return (err); + + err = zfs_kobj_add(zfs_kobj, parent, name); + if (err) { + zfs_kobj_release(&zfs_kobj->zko_kobj); + return (err); + } + + /* + * Create a kobject for each property. 
+ * + * '/sys/module/zfs/properties./' + */ + (void) zprop_iter_common(zprop_to_kobj, &context, B_TRUE, + B_FALSE, type); + + return (err); +} + +void +zfs_sysfs_init(void) +{ + struct kobject *parent; +#if defined(CONFIG_ZFS) && !defined(CONFIG_ZFS_MODULE) + parent = kobject_create_and_add("zfs", fs_kobj); +#else + parent = &(((struct module *)(THIS_MODULE))->mkobj).kobj; +#endif + int err; + + if (parent == NULL) + return; + + err = zfs_kernel_features_init(&kernel_features_kobj, parent); + if (err) + return; + + err = zfs_pool_features_init(&pool_features_kobj, parent); + if (err) { + zfs_kobj_fini(&kernel_features_kobj); + return; + } + + err = zfs_sysfs_properties_init(&pool_props_kobj, parent, + ZFS_TYPE_POOL); + if (err) { + zfs_kobj_fini(&kernel_features_kobj); + zfs_kobj_fini(&pool_features_kobj); + return; + } + + err = zfs_sysfs_properties_init(&dataset_props_kobj, parent, + ZFS_TYPE_FILESYSTEM); + if (err) { + zfs_kobj_fini(&kernel_features_kobj); + zfs_kobj_fini(&pool_features_kobj); + zfs_kobj_fini(&pool_props_kobj); + return; + } +} + +void +zfs_sysfs_fini(void) +{ + /* + * Remove top-level kobjects; each will remove any children kobjects + */ + zfs_kobj_fini(&kernel_features_kobj); + zfs_kobj_fini(&pool_features_kobj); + zfs_kobj_fini(&dataset_props_kobj); + zfs_kobj_fini(&pool_props_kobj); +} diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c new file mode 100644 index 000000000000..db831bf54704 --- /dev/null +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -0,0 +1,2165 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_comutil.h" + +enum { + TOKEN_RO, + TOKEN_RW, + TOKEN_SETUID, + TOKEN_NOSETUID, + TOKEN_EXEC, + TOKEN_NOEXEC, + TOKEN_DEVICES, + TOKEN_NODEVICES, + TOKEN_DIRXATTR, + TOKEN_SAXATTR, + TOKEN_XATTR, + TOKEN_NOXATTR, + TOKEN_ATIME, + TOKEN_NOATIME, + TOKEN_RELATIME, + TOKEN_NORELATIME, + TOKEN_NBMAND, + TOKEN_NONBMAND, + TOKEN_MNTPOINT, + TOKEN_LAST, +}; + +static const match_table_t zpl_tokens = { + { TOKEN_RO, MNTOPT_RO }, + { TOKEN_RW, MNTOPT_RW }, + { TOKEN_SETUID, MNTOPT_SETUID }, + { TOKEN_NOSETUID, MNTOPT_NOSETUID }, + { TOKEN_EXEC, MNTOPT_EXEC }, + { TOKEN_NOEXEC, MNTOPT_NOEXEC }, + { TOKEN_DEVICES, MNTOPT_DEVICES }, + { TOKEN_NODEVICES, MNTOPT_NODEVICES }, + { TOKEN_DIRXATTR, MNTOPT_DIRXATTR }, + { TOKEN_SAXATTR, MNTOPT_SAXATTR }, + { TOKEN_XATTR, MNTOPT_XATTR }, + { TOKEN_NOXATTR, MNTOPT_NOXATTR }, + { TOKEN_ATIME, MNTOPT_ATIME }, + { TOKEN_NOATIME, MNTOPT_NOATIME }, + { TOKEN_RELATIME, MNTOPT_RELATIME }, + { TOKEN_NORELATIME, MNTOPT_NORELATIME }, + { TOKEN_NBMAND, MNTOPT_NBMAND }, + { TOKEN_NONBMAND, MNTOPT_NONBMAND }, + { TOKEN_MNTPOINT, MNTOPT_MNTPOINT "=%s" }, + { TOKEN_LAST, NULL }, +}; + +static void +zfsvfs_vfs_free(vfs_t *vfsp) +{ + if (vfsp != NULL) { + if (vfsp->vfs_mntpoint != NULL) + kmem_strfree(vfsp->vfs_mntpoint); + + kmem_free(vfsp, sizeof (vfs_t)); + } +} + +static int +zfsvfs_parse_option(char *option, int token, substring_t *args, vfs_t *vfsp) +{ + switch (token) { + case TOKEN_RO: + vfsp->vfs_readonly = B_TRUE; + vfsp->vfs_do_readonly = B_TRUE; + break; + case TOKEN_RW: + vfsp->vfs_readonly = B_FALSE; + vfsp->vfs_do_readonly = B_TRUE; + break; + case TOKEN_SETUID: + vfsp->vfs_setuid = B_TRUE; + vfsp->vfs_do_setuid = B_TRUE; + break; + case TOKEN_NOSETUID: + vfsp->vfs_setuid = B_FALSE; + vfsp->vfs_do_setuid = B_TRUE; + break; + case TOKEN_EXEC: + vfsp->vfs_exec = B_TRUE; + vfsp->vfs_do_exec = B_TRUE; + break; + case TOKEN_NOEXEC: + vfsp->vfs_exec = B_FALSE; + vfsp->vfs_do_exec = B_TRUE; + break; + case TOKEN_DEVICES: + vfsp->vfs_devices = B_TRUE; + vfsp->vfs_do_devices = B_TRUE; + break; + case TOKEN_NODEVICES: + vfsp->vfs_devices = B_FALSE; + vfsp->vfs_do_devices = B_TRUE; + break; + case TOKEN_DIRXATTR: + vfsp->vfs_xattr = ZFS_XATTR_DIR; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_SAXATTR: + vfsp->vfs_xattr = ZFS_XATTR_SA; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_XATTR: + vfsp->vfs_xattr = ZFS_XATTR_DIR; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_NOXATTR: + vfsp->vfs_xattr = ZFS_XATTR_OFF; + vfsp->vfs_do_xattr = B_TRUE; + break; + case TOKEN_ATIME: + vfsp->vfs_atime = B_TRUE; + vfsp->vfs_do_atime = B_TRUE; + break; + case TOKEN_NOATIME: + vfsp->vfs_atime = B_FALSE; + vfsp->vfs_do_atime = B_TRUE; + break; + case TOKEN_RELATIME: + vfsp->vfs_relatime = B_TRUE; + vfsp->vfs_do_relatime = B_TRUE; + break; + case TOKEN_NORELATIME: + vfsp->vfs_relatime = B_FALSE; + vfsp->vfs_do_relatime = B_TRUE; + break; + case TOKEN_NBMAND: + vfsp->vfs_nbmand = B_TRUE; + vfsp->vfs_do_nbmand = B_TRUE; + break; + case TOKEN_NONBMAND: + vfsp->vfs_nbmand = B_FALSE; + vfsp->vfs_do_nbmand = B_TRUE; + break; + case TOKEN_MNTPOINT: + 
vfsp->vfs_mntpoint = match_strdup(&args[0]); + if (vfsp->vfs_mntpoint == NULL) + return (SET_ERROR(ENOMEM)); + + break; + default: + break; + } + + return (0); +} + +/* + * Parse the raw mntopts and return a vfs_t describing the options. + */ +static int +zfsvfs_parse_options(char *mntopts, vfs_t **vfsp) +{ + vfs_t *tmp_vfsp; + int error; + + tmp_vfsp = kmem_zalloc(sizeof (vfs_t), KM_SLEEP); + + if (mntopts != NULL) { + substring_t args[MAX_OPT_ARGS]; + char *tmp_mntopts, *p, *t; + int token; + + tmp_mntopts = t = kmem_strdup(mntopts); + if (tmp_mntopts == NULL) + return (SET_ERROR(ENOMEM)); + + while ((p = strsep(&t, ",")) != NULL) { + if (!*p) + continue; + + args[0].to = args[0].from = NULL; + token = match_token(p, zpl_tokens, args); + error = zfsvfs_parse_option(p, token, args, tmp_vfsp); + if (error) { + kmem_strfree(tmp_mntopts); + zfsvfs_vfs_free(tmp_vfsp); + return (error); + } + } + + kmem_strfree(tmp_mntopts); + } + + *vfsp = tmp_vfsp; + + return (0); +} + +boolean_t +zfs_is_readonly(zfsvfs_t *zfsvfs) +{ + return (!!(zfsvfs->z_sb->s_flags & SB_RDONLY)); +} + +/*ARGSUSED*/ +int +zfs_sync(struct super_block *sb, int wait, cred_t *cr) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + + /* + * Semantically, the only requirement is that the sync be initiated. + * The DMU syncs out txgs frequently, so there's nothing to do. + */ + if (!wait) + return (0); + + if (zfsvfs != NULL) { + /* + * Sync a specific filesystem. + */ + dsl_pool_t *dp; + + ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, 0); + + ZFS_EXIT(zfsvfs); + } else { + /* + * Sync all ZFS filesystems. This is what happens when you + * run sync(1M). Unlike other filesystems, ZFS honors the + * request by waiting for all pools to commit all dirty data. + */ + spa_sync_allpools(); + } + + return (0); +} + +static void +atime_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct super_block *sb = zfsvfs->z_sb; + + if (sb == NULL) + return; + /* + * Update SB_NOATIME bit in VFS super block. Since atime update is + * determined by atime_needs_update(), atime_needs_update() needs to + * return false if atime is turned off, and not unconditionally return + * false if atime is turned on. 
+ */ + if (newval) + sb->s_flags &= ~SB_NOATIME; + else + sb->s_flags |= SB_NOATIME; +} + +static void +relatime_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_relatime = newval; +} + +static void +xattr_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + if (newval == ZFS_XATTR_OFF) { + zfsvfs->z_flags &= ~ZSB_XATTR; + } else { + zfsvfs->z_flags |= ZSB_XATTR; + + if (newval == ZFS_XATTR_SA) + zfsvfs->z_xattr_sa = B_TRUE; + else + zfsvfs->z_xattr_sa = B_FALSE; + } +} + +static void +acltype_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + switch (newval) { + case ZFS_ACLTYPE_OFF: + zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; + zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; + break; + case ZFS_ACLTYPE_POSIXACL: +#ifdef CONFIG_FS_POSIX_ACL + zfsvfs->z_acl_type = ZFS_ACLTYPE_POSIXACL; + zfsvfs->z_sb->s_flags |= SB_POSIXACL; +#else + zfsvfs->z_acl_type = ZFS_ACLTYPE_OFF; + zfsvfs->z_sb->s_flags &= ~SB_POSIXACL; +#endif /* CONFIG_FS_POSIX_ACL */ + break; + default: + break; + } +} + +static void +blksz_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + ASSERT3U(newval, <=, spa_maxblocksize(dmu_objset_spa(zfsvfs->z_os))); + ASSERT3U(newval, >=, SPA_MINBLOCKSIZE); + ASSERT(ISP2(newval)); + + zfsvfs->z_max_blksz = newval; +} + +static void +readonly_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct super_block *sb = zfsvfs->z_sb; + + if (sb == NULL) + return; + + if (newval) + sb->s_flags |= SB_RDONLY; + else + sb->s_flags &= ~SB_RDONLY; +} + +static void +devices_changed_cb(void *arg, uint64_t newval) +{ +} + +static void +setuid_changed_cb(void *arg, uint64_t newval) +{ +} + +static void +exec_changed_cb(void *arg, uint64_t newval) +{ +} + +static void +nbmand_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + struct super_block *sb = zfsvfs->z_sb; + + if (sb == NULL) + return; + + if (newval == TRUE) + sb->s_flags |= SB_MANDLOCK; + else + sb->s_flags &= ~SB_MANDLOCK; +} + +static void +snapdir_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_show_ctldir = newval; +} + +static void +vscan_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_vscan = newval; +} + +static void +acl_mode_changed_cb(void *arg, uint64_t newval) +{ + zfsvfs_t *zfsvfs = arg; + + zfsvfs->z_acl_mode = newval; +} + +static void +acl_inherit_changed_cb(void *arg, uint64_t newval) +{ + ((zfsvfs_t *)arg)->z_acl_inherit = newval; +} + +static int +zfs_register_callbacks(vfs_t *vfsp) +{ + struct dsl_dataset *ds = NULL; + objset_t *os = NULL; + zfsvfs_t *zfsvfs = NULL; + int error = 0; + + ASSERT(vfsp); + zfsvfs = vfsp->vfs_data; + ASSERT(zfsvfs); + os = zfsvfs->z_os; + + /* + * The act of registering our callbacks will destroy any mount + * options we may have. In order to enable temporary overrides + * of mount options, we stash away the current values and + * restore them after we register the callbacks. + */ + if (zfs_is_readonly(zfsvfs) || !spa_writeable(dmu_objset_spa(os))) { + vfsp->vfs_do_readonly = B_TRUE; + vfsp->vfs_readonly = B_TRUE; + } + + /* + * Register property callbacks. + * + * It would probably be fine to just check for i/o error from + * the first prop_register(), but I guess I like to go + * overboard... + */ + ds = dmu_objset_ds(os); + dsl_pool_config_enter(dmu_objset_pool(os), FTAG); + error = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ATIME), atime_changed_cb, zfsvfs); + error = error ? 
error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RELATIME), relatime_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_XATTR), xattr_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), blksz_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_READONLY), readonly_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEVICES), devices_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SETUID), setuid_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_EXEC), exec_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SNAPDIR), snapdir_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLTYPE), acltype_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLMODE), acl_mode_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_ACLINHERIT), acl_inherit_changed_cb, + zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_VSCAN), vscan_changed_cb, zfsvfs); + error = error ? error : dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_NBMAND), nbmand_changed_cb, zfsvfs); + dsl_pool_config_exit(dmu_objset_pool(os), FTAG); + if (error) + goto unregister; + + /* + * Invoke our callbacks to restore temporary mount options. + */ + if (vfsp->vfs_do_readonly) + readonly_changed_cb(zfsvfs, vfsp->vfs_readonly); + if (vfsp->vfs_do_setuid) + setuid_changed_cb(zfsvfs, vfsp->vfs_setuid); + if (vfsp->vfs_do_exec) + exec_changed_cb(zfsvfs, vfsp->vfs_exec); + if (vfsp->vfs_do_devices) + devices_changed_cb(zfsvfs, vfsp->vfs_devices); + if (vfsp->vfs_do_xattr) + xattr_changed_cb(zfsvfs, vfsp->vfs_xattr); + if (vfsp->vfs_do_atime) + atime_changed_cb(zfsvfs, vfsp->vfs_atime); + if (vfsp->vfs_do_relatime) + relatime_changed_cb(zfsvfs, vfsp->vfs_relatime); + if (vfsp->vfs_do_nbmand) + nbmand_changed_cb(zfsvfs, vfsp->vfs_nbmand); + + return (0); + +unregister: + dsl_prop_unregister_all(ds, zfsvfs); + return (error); +} + +/* + * Takes a dataset, a property, a value and that value's setpoint as + * found in the ZAP. Checks if the property has been changed in the vfs. + * If so, val and setpoint will be overwritten with updated content. + * Otherwise, they are left unchanged. 
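+ *
+ * Illustrative effect: after a temporary override such as
+ * 'mount -o remount,noatime <dataset>', 'zfs get atime <dataset>' reports
+ * the overridden value with source "temporary", via the setpoint set below.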
+ */
+int
+zfs_get_temporary_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop, uint64_t *val,
+    char *setpoint)
+{
+	int error;
+	zfsvfs_t *zfvp;
+	vfs_t *vfsp;
+	objset_t *os;
+	uint64_t tmp = *val;
+
+	error = dmu_objset_from_ds(ds, &os);
+	if (error != 0)
+		return (error);
+
+	if (dmu_objset_type(os) != DMU_OST_ZFS)
+		return (EINVAL);
+
+	mutex_enter(&os->os_user_ptr_lock);
+	zfvp = dmu_objset_get_user(os);
+	mutex_exit(&os->os_user_ptr_lock);
+	if (zfvp == NULL)
+		return (ESRCH);
+
+	vfsp = zfvp->z_vfs;
+
+	switch (zfs_prop) {
+	case ZFS_PROP_ATIME:
+		if (vfsp->vfs_do_atime)
+			tmp = vfsp->vfs_atime;
+		break;
+	case ZFS_PROP_RELATIME:
+		if (vfsp->vfs_do_relatime)
+			tmp = vfsp->vfs_relatime;
+		break;
+	case ZFS_PROP_DEVICES:
+		if (vfsp->vfs_do_devices)
+			tmp = vfsp->vfs_devices;
+		break;
+	case ZFS_PROP_EXEC:
+		if (vfsp->vfs_do_exec)
+			tmp = vfsp->vfs_exec;
+		break;
+	case ZFS_PROP_SETUID:
+		if (vfsp->vfs_do_setuid)
+			tmp = vfsp->vfs_setuid;
+		break;
+	case ZFS_PROP_READONLY:
+		if (vfsp->vfs_do_readonly)
+			tmp = vfsp->vfs_readonly;
+		break;
+	case ZFS_PROP_XATTR:
+		if (vfsp->vfs_do_xattr)
+			tmp = vfsp->vfs_xattr;
+		break;
+	case ZFS_PROP_NBMAND:
+		if (vfsp->vfs_do_nbmand)
+			tmp = vfsp->vfs_nbmand;
+		break;
+	default:
+		return (ENOENT);
+	}
+
+	if (tmp != *val) {
+		(void) strcpy(setpoint, "temporary");
+		*val = tmp;
+	}
+	return (0);
+}
+
+/*
+ * Associate this zfsvfs with the given objset, which must be owned.
+ * This will cache a bunch of on-disk state from the objset in the
+ * zfsvfs.
+ */
+static int
+zfsvfs_init(zfsvfs_t *zfsvfs, objset_t *os)
+{
+	int error;
+	uint64_t val;
+
+	zfsvfs->z_max_blksz = SPA_OLD_MAXBLOCKSIZE;
+	zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE;
+	zfsvfs->z_os = os;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version);
+	if (error != 0)
+		return (error);
+	if (zfsvfs->z_version >
+	    zfs_zpl_version_map(spa_version(dmu_objset_spa(os)))) {
+		(void) printk("Can't mount a version %lld file system "
+		    "on a version %lld pool. Pool must be upgraded to mount "
+		    "this file system.\n", (u_longlong_t)zfsvfs->z_version,
+		    (u_longlong_t)spa_version(dmu_objset_spa(os)));
+		return (SET_ERROR(ENOTSUP));
+	}
+	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_norm = (int)val;
+
+	error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_utf8 = (val != 0);
+
+	error = zfs_get_zplprop(os, ZFS_PROP_CASE, &val);
+	if (error != 0)
+		return (error);
+	zfsvfs->z_case = (uint_t)val;
+
+	if ((error = zfs_get_zplprop(os, ZFS_PROP_ACLTYPE, &val)) != 0)
+		return (error);
+	zfsvfs->z_acl_type = (uint_t)val;
+
+	/*
+	 * Fold case on file systems that are always or sometimes case
+	 * insensitive.
+ */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os); + + uint64_t sa_obj = 0; + if (zfsvfs->z_use_sa) { + /* should either have both of these objects or none */ + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, + &sa_obj); + if (error != 0) + return (error); + + error = zfs_get_zplprop(os, ZFS_PROP_XATTR, &val); + if ((error == 0) && (val == ZFS_XATTR_SA)) + zfsvfs->z_xattr_sa = B_TRUE; + } + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error != 0) + return (error); + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error == ENOENT) + zfsvfs->z_userquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error == ENOENT) + zfsvfs->z_groupquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA], + 8, 1, &zfsvfs->z_projectquota_obj); + if (error == ENOENT) + zfsvfs->z_projectquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA], + 8, 1, &zfsvfs->z_userobjquota_obj); + if (error == ENOENT) + zfsvfs->z_userobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA], + 8, 1, &zfsvfs->z_groupobjquota_obj); + if (error == ENOENT) + zfsvfs->z_groupobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTOBJQUOTA], + 8, 1, &zfsvfs->z_projectobjquota_obj); + if (error == ENOENT) + zfsvfs->z_projectobjquota_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error == ENOENT) + zfsvfs->z_fuid_obj = 0; + else if (error != 0) + return (error); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error == ENOENT) + zfsvfs->z_shares_dir = 0; + else if (error != 0) + return (error); + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + if (error != 0) + return (error); + + if (zfsvfs->z_version >= ZPL_VERSION_SA) + sa_register_update_callback(os, zfs_sa_upgrade); + + return (0); +} + +int +zfsvfs_create(const char *osname, boolean_t readonly, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + int error; + boolean_t ro = (readonly || (strchr(osname, '@') != NULL)); + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + error = dmu_objset_own(osname, DMU_OST_ZFS, ro, B_TRUE, zfsvfs, &os); + if (error != 0) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + error = zfsvfs_create_impl(zfvp, zfsvfs, os); + if (error != 0) { + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + return (error); +} + + +/* + * Note: zfsvfs is assumed to be malloc'd, and will be freed by this function + * on a failure. 
Do not pass in a statically allocated zfsvfs. + */ +int +zfsvfs_create_impl(zfsvfs_t **zfvp, zfsvfs_t *zfsvfs, objset_t *os) +{ + int error; + + zfsvfs->z_vfs = NULL; + zfsvfs->z_sb = NULL; + zfsvfs->z_parent = zfsvfs; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrm_init(&zfsvfs->z_teardown_lock, B_FALSE); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + + int size = MIN(1 << (highbit64(zfs_object_mutex_size) - 1), + ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (int i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + error = zfsvfs_init(zfsvfs, os); + if (error != 0) { + *zfvp = NULL; + zfsvfs_free(zfsvfs); + return (error); + } + + zfsvfs->z_drain_task = TASKQID_INVALID; + zfsvfs->z_draining = B_FALSE; + zfsvfs->z_drain_cancel = B_TRUE; + + *zfvp = zfsvfs; + return (0); +} + +static int +zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) +{ + int error; + boolean_t readonly = zfs_is_readonly(zfsvfs); + + error = zfs_register_callbacks(zfsvfs->z_vfs); + if (error) + return (error); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + + /* + * If we are not mounting (ie: online recv), then we don't + * have to worry about replaying the log as we blocked all + * operations out since we closed the ZIL. + */ + if (mounting) { + ASSERT3P(zfsvfs->z_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zfsvfs->z_kstat, zfsvfs->z_os); + + /* + * During replay we remove the read only flag to + * allow replays to succeed. + */ + if (readonly != 0) { + readonly_changed_cb(zfsvfs, B_FALSE); + } else { + zap_stats_t zs; + if (zap_get_stats(zfsvfs->z_os, zfsvfs->z_unlinkedobj, + &zs) == 0) { + dataset_kstats_update_nunlinks_kstat( + &zfsvfs->z_kstat, zs.zs_num_entries); + dprintf_ds(zfsvfs->z_os->os_dsl_dataset, + "num_entries in unlinked set: %llu", + zs.zs_num_entries); + } + zfs_unlinked_drain(zfsvfs); + dsl_dir_t *dd = zfsvfs->z_os->os_dsl_dataset->ds_dir; + dd->dd_activity_cancelled = B_FALSE; + } + + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. 
So we would see that object N is still
+			 * allocated and in the unlinked set, and there is an
+			 * intent log record saying to allocate it.
+			 */
+			if (spa_writeable(dmu_objset_spa(zfsvfs->z_os))) {
+				if (zil_replay_disable) {
+					zil_destroy(zfsvfs->z_log, B_FALSE);
+				} else {
+					zfsvfs->z_replay = B_TRUE;
+					zil_replay(zfsvfs->z_os, zfsvfs,
+					    zfs_replay_vector);
+					zfsvfs->z_replay = B_FALSE;
+				}
+			}
+
+		/* restore readonly bit */
+		if (readonly != 0)
+			readonly_changed_cb(zfsvfs, B_TRUE);
+	}
+
+	/*
+	 * Set the objset user_ptr to track its zfsvfs.
+	 */
+	mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+	dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+	mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+
+	return (0);
+}
+
+void
+zfsvfs_free(zfsvfs_t *zfsvfs)
+{
+	int i, size = zfsvfs->z_hold_size;
+
+	zfs_fuid_destroy(zfsvfs);
+
+	mutex_destroy(&zfsvfs->z_znodes_lock);
+	mutex_destroy(&zfsvfs->z_lock);
+	list_destroy(&zfsvfs->z_all_znodes);
+	rrm_destroy(&zfsvfs->z_teardown_lock);
+	rw_destroy(&zfsvfs->z_teardown_inactive_lock);
+	rw_destroy(&zfsvfs->z_fuid_lock);
+	for (i = 0; i != size; i++) {
+		avl_destroy(&zfsvfs->z_hold_trees[i]);
+		mutex_destroy(&zfsvfs->z_hold_locks[i]);
+	}
+	vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size);
+	vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size);
+	zfsvfs_vfs_free(zfsvfs->z_vfs);
+	dataset_kstats_destroy(&zfsvfs->z_kstat);
+	kmem_free(zfsvfs, sizeof (zfsvfs_t));
+}
+
+static void
+zfs_set_fuid_feature(zfsvfs_t *zfsvfs)
+{
+	zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os);
+	zfsvfs->z_use_sa = USE_SA(zfsvfs->z_version, zfsvfs->z_os);
+}
+
+static void
+zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
+{
+	objset_t *os = zfsvfs->z_os;
+
+	if (!dmu_objset_is_snapshot(os))
+		dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
+}
+
+#ifdef HAVE_MLSLABEL
+/*
+ * Check that the hex label string is appropriate for the dataset being
+ * mounted into the global_zone proper.
+ *
+ * Return an error if the hex label string is not default or
+ * admin_low/admin_high. For admin_low labels, the corresponding
+ * dataset must be readonly.
+ */
+int
+zfs_check_global_label(const char *dsname, const char *hexsl)
+{
+	if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0)
+		return (0);
+	if (strcasecmp(hexsl, ADMIN_HIGH) == 0)
+		return (0);
+	if (strcasecmp(hexsl, ADMIN_LOW) == 0) {
+		/* must be readonly */
+		uint64_t rdonly;
+
+		if (dsl_prop_get_integer(dsname,
+		    zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL))
+			return (SET_ERROR(EACCES));
+		return (rdonly ? 0 : SET_ERROR(EACCES));
+	}
+	return (SET_ERROR(EACCES));
+}
+#endif /* HAVE_MLSLABEL */
+
+static int
+zfs_statfs_project(zfsvfs_t *zfsvfs, znode_t *zp, struct kstatfs *statp,
+    uint32_t bshift)
+{
+	char buf[20 + DMU_OBJACCT_PREFIX_LEN];
+	uint64_t offset = DMU_OBJACCT_PREFIX_LEN;
+	uint64_t quota;
+	uint64_t used;
+	int err;
+
+	strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1);
+	err = zfs_id_to_fuidstr(zfsvfs, NULL, zp->z_projid, buf + offset,
+	    sizeof (buf) - offset, B_FALSE);
+	if (err)
+		return (err);
+
+	if (zfsvfs->z_projectquota_obj == 0)
+		goto objs;
+
+	err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectquota_obj,
+	    buf + offset, 8, 1, &quota);
+	if (err == ENOENT)
+		goto objs;
+	else if (err)
+		return (err);
+
+	err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+	    buf + offset, 8, 1, &used);
+	if (unlikely(err == ENOENT)) {
+		uint32_t blksize;
+		u_longlong_t nblocks;
+
+		/*
+		 * Quota accounting is async, so the usage record may not
+		 * exist yet even though at least one object (this one)
+		 * carries the given project ID; estimate the usage from
+		 * this znode instead.
+		 */
+		sa_object_size(zp->z_sa_hdl, &blksize, &nblocks);
+		if (unlikely(zp->z_blksz == 0))
+			blksize = zfsvfs->z_max_blksz;
+
+		used = blksize * nblocks;
+	} else if (err) {
+		return (err);
+	}
+
+	statp->f_blocks = quota >> bshift;
+	statp->f_bfree = (quota > used) ? ((quota - used) >> bshift) : 0;
+	statp->f_bavail = statp->f_bfree;
+
+objs:
+	if (zfsvfs->z_projectobjquota_obj == 0)
+		return (0);
+
+	err = zap_lookup(zfsvfs->z_os, zfsvfs->z_projectobjquota_obj,
+	    buf + offset, 8, 1, &quota);
+	if (err == ENOENT)
+		return (0);
+	else if (err)
+		return (err);
+
+	err = zap_lookup(zfsvfs->z_os, DMU_PROJECTUSED_OBJECT,
+	    buf, 8, 1, &used);
+	if (unlikely(err == ENOENT)) {
+		/*
+		 * Quota accounting is async, so the usage record may not
+		 * exist yet even though at least one object with the
+		 * given project ID does.
+		 */
+		used = 1;
+	} else if (err) {
+		return (err);
+	}
+
+	statp->f_files = quota;
+	statp->f_ffree = (quota > used) ? (quota - used) : 0;
+
+	return (0);
+}
+
+int
+zfs_statvfs(struct inode *ip, struct kstatfs *statp)
+{
+	zfsvfs_t *zfsvfs = ITOZSB(ip);
+	uint64_t refdbytes, availbytes, usedobjs, availobjs;
+	int err = 0;
+
+	ZFS_ENTER(zfsvfs);
+
+	dmu_objset_space(zfsvfs->z_os,
+	    &refdbytes, &availbytes, &usedobjs, &availobjs);
+
+	uint64_t fsid = dmu_objset_fsid_guid(zfsvfs->z_os);
+	/*
+	 * The underlying storage pool actually uses multiple block
+	 * sizes. Under Solaris frsize (fragment size) is reported as
+	 * the smallest block size we support, and bsize (block size)
+	 * as the filesystem's maximum block size. Unfortunately,
+	 * under Linux the fragment size and block size are often used
+	 * interchangeably. Thus we are forced to report both of them
+	 * as the filesystem's maximum block size.
+	 */
+	statp->f_frsize = zfsvfs->z_max_blksz;
+	statp->f_bsize = zfsvfs->z_max_blksz;
+	uint32_t bshift = fls(statp->f_bsize) - 1;
+
+	/*
+	 * The following report "total" blocks of various kinds in
+	 * the file system, but reported in terms of f_bsize - the
+	 * "preferred" size.
+	 */
+
+	/* Round up so we never have a filesystem using 0 blocks. */
+	refdbytes = P2ROUNDUP(refdbytes, statp->f_bsize);
+	statp->f_blocks = (refdbytes + availbytes) >> bshift;
+	statp->f_bfree = availbytes >> bshift;
+	statp->f_bavail = statp->f_bfree; /* no root reservation */
+
+	/*
+	 * statvfs() should really be called statufs(), because it assumes
+	 * static metadata. ZFS doesn't preallocate files, so the best
+	 * we can do is report the max that could possibly fit in f_files,
+	 * and that minus the number actually used in f_ffree.
+	 * For f_ffree, report the smaller of the number of objects available
+	 * and the number of blocks (each object will take at least a block).
+	 */
+	statp->f_ffree = MIN(availobjs, availbytes >> DNODE_SHIFT);
+	statp->f_files = statp->f_ffree + usedobjs;
+	statp->f_fsid.val[0] = (uint32_t)fsid;
+	statp->f_fsid.val[1] = (uint32_t)(fsid >> 32);
+	statp->f_type = ZFS_SUPER_MAGIC;
+	statp->f_namelen = MAXNAMELEN - 1;
+
+	/*
+	 * We have all of 40 characters to stuff a string here.
+	 * Is there anything useful we could/should provide?
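+	 * (Nothing so far; the bzero() below at least keeps f_spare
+	 * deterministic. As a worked example of the arithmetic above:
+	 * with z_max_blksz = 128K, bshift = fls(131072) - 1 = 17, so
+	 * 1 GiB of refdbytes + availbytes reports f_blocks =
+	 * 2^30 >> 17 = 8192 "blocks" of 128K each.)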
+ */ + bzero(statp->f_spare, sizeof (statp->f_spare)); + + if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + dmu_objset_projectquota_present(zfsvfs->z_os)) { + znode_t *zp = ITOZ(ip); + + if (zp->z_pflags & ZFS_PROJINHERIT && zp->z_projid && + zpl_is_valid_projid(zp->z_projid)) + err = zfs_statfs_project(zfsvfs, zp, statp, bshift); + } + + ZFS_EXIT(zfsvfs); + return (err); +} + +static int +zfs_root(zfsvfs_t *zfsvfs, struct inode **ipp) +{ + znode_t *rootzp; + int error; + + ZFS_ENTER(zfsvfs); + + error = zfs_zget(zfsvfs, zfsvfs->z_root, &rootzp); + if (error == 0) + *ipp = ZTOI(rootzp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Linux kernels older than 3.1 do not support a per-filesystem shrinker. + * To accommodate this we must improvise and manually walk the list of znodes + * attempting to prune dentries in order to be able to drop the inodes. + * + * To avoid scanning the same znodes multiple times they are always rotated + * to the end of the z_all_znodes list. New znodes are inserted at the + * end of the list so we're always scanning the oldest znodes first. + */ +static int +zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) +{ + znode_t **zp_array, *zp; + int max_array = MIN(nr_to_scan, PAGE_SIZE * 8 / sizeof (znode_t *)); + int objects = 0; + int i = 0, j = 0; + + zp_array = kmem_zalloc(max_array * sizeof (znode_t *), KM_SLEEP); + + mutex_enter(&zfsvfs->z_znodes_lock); + while ((zp = list_head(&zfsvfs->z_all_znodes)) != NULL) { + + if ((i++ > nr_to_scan) || (j >= max_array)) + break; + + ASSERT(list_link_active(&zp->z_link_node)); + list_remove(&zfsvfs->z_all_znodes, zp); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + + /* Skip active znodes and .zfs entries */ + if (MUTEX_HELD(&zp->z_lock) || zp->z_is_ctldir) + continue; + + if (igrab(ZTOI(zp)) == NULL) + continue; + + zp_array[j] = zp; + j++; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + for (i = 0; i < j; i++) { + zp = zp_array[i]; + + ASSERT3P(zp, !=, NULL); + d_prune_aliases(ZTOI(zp)); + + if (atomic_read(&ZTOI(zp)->i_count) == 1) + objects++; + + zrele(zp); + } + + kmem_free(zp_array, max_array * sizeof (znode_t *)); + + return (objects); +} + +/* + * The ARC has requested that the filesystem drop entries from the dentry + * and inode caches. This can occur when the ARC needs to free meta data + * blocks but can't because they are all pinned by entries in these caches. + */ +int +zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + int error = 0; + struct shrinker *shrinker = &sb->s_shrink; + struct shrink_control sc = { + .nr_to_scan = nr_to_scan, + .gfp_mask = GFP_KERNEL, + }; + + ZFS_ENTER(zfsvfs); + +#if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ + defined(SHRINK_CONTROL_HAS_NID) && \ + defined(SHRINKER_NUMA_AWARE) + if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { + *objects = 0; + for_each_online_node(sc.nid) { + *objects += (*shrinker->scan_objects)(shrinker, &sc); + } + } else { + *objects = (*shrinker->scan_objects)(shrinker, &sc); + } + +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + *objects = (*shrinker->scan_objects)(shrinker, &sc); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + *objects = (*shrinker->shrink)(shrinker, &sc); +#elif defined(HAVE_D_PRUNE_ALIASES) +#define D_PRUNE_ALIASES_IS_DEFAULT + *objects = zfs_prune_aliases(zfsvfs, nr_to_scan); +#else +#error "No available dentry and inode cache pruning mechanism." 
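+	/*
+	 * In short: prefer the kernel's split per-superblock shrinker
+	 * (walking every online NUMA node when the shrinker is NUMA
+	 * aware), fall back to the legacy single-callback shrinker,
+	 * and use our own zfs_prune_aliases() walk only when
+	 * d_prune_aliases() is all the kernel provides.
+	 */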
+#endif
+
+#if defined(HAVE_D_PRUNE_ALIASES) && !defined(D_PRUNE_ALIASES_IS_DEFAULT)
+#undef D_PRUNE_ALIASES_IS_DEFAULT
+	/*
+	 * Fall back to zfs_prune_aliases if the kernel's per-superblock
+	 * shrinker couldn't free anything, possibly due to the inodes being
+	 * allocated in a different memcg.
+	 */
+	if (*objects == 0)
+		*objects = zfs_prune_aliases(zfsvfs, nr_to_scan);
+#endif
+
+	ZFS_EXIT(zfsvfs);
+
+	dprintf_ds(zfsvfs->z_os->os_dsl_dataset,
+	    "pruning, nr_to_scan=%lu objects=%d error=%d\n",
+	    nr_to_scan, *objects, error);
+
+	return (error);
+}
+
+/*
+ * Teardown the zfsvfs_t.
+ *
+ * Note, if 'unmounting' is FALSE, we return with the 'z_teardown_lock'
+ * and 'z_teardown_inactive_lock' held.
+ */
+static int
+zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
+{
+	znode_t *zp;
+
+	zfs_unlinked_drain_stop_wait(zfsvfs);
+
+	/*
+	 * If someone has not already unmounted this file system,
+	 * drain the zrele_taskq to ensure all active references to the
+	 * zfsvfs_t have been handled; only then can it be safely destroyed.
+	 */
+	if (zfsvfs->z_os) {
+		/*
+		 * If we're unmounting we have to wait for the list to
+		 * drain completely.
+		 *
+		 * If we're not unmounting there's no guarantee the list
+		 * will drain completely, but iputs run from the taskq
+		 * may add the parents of dir-based xattrs to the taskq
+		 * so we want to wait for these.
+		 *
+		 * We can safely read z_nr_znodes without locking because the
+		 * VFS has already blocked operations which add to the
+		 * z_all_znodes list and thus increment z_nr_znodes.
+		 */
+		int round = 0;
+		while (zfsvfs->z_nr_znodes > 0) {
+			taskq_wait_outstanding(dsl_pool_zrele_taskq(
+			    dmu_objset_pool(zfsvfs->z_os)), 0);
+			if (++round > 1 && !unmounting)
+				break;
+		}
+	}
+
+	rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
+
+	if (!unmounting) {
+		/*
+		 * We purge the parent filesystem's super block as the
+		 * parent filesystem and all of its snapshots have their
+		 * inode's super block set to the parent's filesystem's
+		 * super block. Note, 'z_parent' is self referential
+		 * for non-snapshots.
+		 */
+		shrink_dcache_sb(zfsvfs->z_parent->z_sb);
+	}
+
+	/*
+	 * Close the zil. NB: Can't close the zil while zfs_inactive
+	 * threads are blocked as zil_close can call zfs_inactive.
+	 */
+	if (zfsvfs->z_log) {
+		zil_close(zfsvfs->z_log);
+		zfsvfs->z_log = NULL;
+	}
+
+	rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_WRITER);
+
+	/*
+	 * If we are not unmounting (ie: online recv) and someone already
+	 * unmounted this file system while we were doing the switcheroo,
+	 * or a reopen of z_os failed then just bail out now.
+	 */
+	if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+		return (SET_ERROR(EIO));
+	}
+
+	/*
+	 * At this point there are no VFS ops active, and any new VFS ops
+	 * will fail with EIO since we have z_teardown_lock for writer (only
+	 * relevant for forced unmount).
+	 *
+	 * Release all holds on dbufs. We also grab an extra reference to all
+	 * the remaining inodes so that the kernel does not attempt to free
+	 * any inodes of a suspended fs. This can cause deadlocks since the
+	 * zfs_resume_fs() process may involve starting threads, which might
+	 * attempt to free unreferenced inodes to free up memory for the new
+	 * thread.
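+	 *
+	 * The matching releases happen in zfs_resume_fs(): every znode
+	 * flagged z_suspended below is handed to zfs_zrele_async() once
+	 * the objset has been re-established.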
+	 */
+	if (!unmounting) {
+		mutex_enter(&zfsvfs->z_znodes_lock);
+		for (zp = list_head(&zfsvfs->z_all_znodes); zp != NULL;
+		    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+			if (zp->z_sa_hdl)
+				zfs_znode_dmu_fini(zp);
+			if (igrab(ZTOI(zp)) != NULL)
+				zp->z_suspended = B_TRUE;
+
+		}
+		mutex_exit(&zfsvfs->z_znodes_lock);
+	}
+
+	/*
+	 * If we are unmounting, set the unmounted flag and let new VFS ops
+	 * unblock. zfs_inactive will have the unmounted behavior, and all
+	 * other VFS ops will fail with EIO.
+	 */
+	if (unmounting) {
+		zfsvfs->z_unmounted = B_TRUE;
+		rw_exit(&zfsvfs->z_teardown_inactive_lock);
+		rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
+	}
+
+	/*
+	 * z_os will be NULL if there was an error in attempting to reopen
+	 * zfsvfs, so just return as the properties had already been
+	 * unregistered and cached data had been evicted before.
+	 */
+	if (zfsvfs->z_os == NULL)
+		return (0);
+
+	/*
+	 * Unregister properties.
+	 */
+	zfs_unregister_callbacks(zfsvfs);
+
+	/*
+	 * Evict cached data. We must write out any dirty data before
+	 * disowning the dataset.
+	 */
+	objset_t *os = zfsvfs->z_os;
+	boolean_t os_dirty = B_FALSE;
+	for (int t = 0; t < TXG_SIZE; t++) {
+		if (dmu_objset_is_dirty(os, t)) {
+			os_dirty = B_TRUE;
+			break;
+		}
+	}
+	if (!zfs_is_readonly(zfsvfs) && os_dirty) {
+		txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0);
+	}
+	dmu_objset_evict_dbufs(zfsvfs->z_os);
+	dsl_dir_t *dd = os->os_dsl_dataset->ds_dir;
+	dsl_dir_cancel_waiters(dd);
+
+	return (0);
+}
+
+#if defined(HAVE_SUPER_SETUP_BDI_NAME)
+atomic_long_t zfs_bdi_seq = ATOMIC_LONG_INIT(0);
+#endif
+
+int
+zfs_domount(struct super_block *sb, zfs_mnt_t *zm, int silent)
+{
+	const char *osname = zm->mnt_osname;
+	struct inode *root_inode;
+	uint64_t recordsize;
+	int error = 0;
+	zfsvfs_t *zfsvfs = NULL;
+	vfs_t *vfs = NULL;
+
+	ASSERT(zm);
+	ASSERT(osname);
+
+	error = zfsvfs_parse_options(zm->mnt_data, &vfs);
+	if (error)
+		return (error);
+
+	error = zfsvfs_create(osname, vfs->vfs_readonly, &zfsvfs);
+	if (error) {
+		zfsvfs_vfs_free(vfs);
+		goto out;
+	}
+
+	if ((error = dsl_prop_get_integer(osname, "recordsize",
+	    &recordsize, NULL))) {
+		zfsvfs_vfs_free(vfs);
+		goto out;
+	}
+
+	vfs->vfs_data = zfsvfs;
+	zfsvfs->z_vfs = vfs;
+	zfsvfs->z_sb = sb;
+	sb->s_fs_info = zfsvfs;
+	sb->s_magic = ZFS_SUPER_MAGIC;
+	sb->s_maxbytes = MAX_LFS_FILESIZE;
+	sb->s_time_gran = 1;
+	sb->s_blocksize = recordsize;
+	sb->s_blocksize_bits = ilog2(recordsize);
+
+	error = -zpl_bdi_setup(sb, "zfs");
+	if (error)
+		goto out;
+
+	sb->s_bdi->ra_pages = 0;
+
+	/* Set callback operations for the file system. */
+	sb->s_op = &zpl_super_operations;
+	sb->s_xattr = zpl_xattr_handlers;
+	sb->s_export_op = &zpl_export_operations;
+	sb->s_d_op = &zpl_dentry_operations;
+
+	/* Set features for file system.
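+	 * (Today this is just the FUID/SA capability flags derived from
+	 * the ZPL version; see zfs_set_fuid_feature() above.)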
 */
+	zfs_set_fuid_feature(zfsvfs);
+
+	if (dmu_objset_is_snapshot(zfsvfs->z_os)) {
+		uint64_t pval;
+
+		atime_changed_cb(zfsvfs, B_FALSE);
+		readonly_changed_cb(zfsvfs, B_TRUE);
+		if ((error = dsl_prop_get_integer(osname,
+		    "xattr", &pval, NULL)))
+			goto out;
+		xattr_changed_cb(zfsvfs, pval);
+		if ((error = dsl_prop_get_integer(osname,
+		    "acltype", &pval, NULL)))
+			goto out;
+		acltype_changed_cb(zfsvfs, pval);
+		zfsvfs->z_issnap = B_TRUE;
+		zfsvfs->z_os->os_sync = ZFS_SYNC_DISABLED;
+		zfsvfs->z_snap_defer_time = jiffies;
+
+		mutex_enter(&zfsvfs->z_os->os_user_ptr_lock);
+		dmu_objset_set_user(zfsvfs->z_os, zfsvfs);
+		mutex_exit(&zfsvfs->z_os->os_user_ptr_lock);
+	} else {
+		if ((error = zfsvfs_setup(zfsvfs, B_TRUE)))
+			goto out;
+	}
+
+	/* Allocate a root inode for the filesystem. */
+	error = zfs_root(zfsvfs, &root_inode);
+	if (error) {
+		(void) zfs_umount(sb);
+		goto out;
+	}
+
+	/* Allocate a root dentry for the filesystem */
+	sb->s_root = d_make_root(root_inode);
+	if (sb->s_root == NULL) {
+		(void) zfs_umount(sb);
+		error = SET_ERROR(ENOMEM);
+		goto out;
+	}
+
+	if (!zfsvfs->z_issnap)
+		zfsctl_create(zfsvfs);
+
+	zfsvfs->z_arc_prune = arc_add_prune_callback(zpl_prune_sb, sb);
+out:
+	if (error) {
+		if (zfsvfs != NULL) {
+			dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs);
+			zfsvfs_free(zfsvfs);
+		}
+		/*
+		 * make sure we don't have dangling sb->s_fs_info which
+		 * zfs_preumount will use.
+		 */
+		sb->s_fs_info = NULL;
+	}
+
+	return (error);
+}
+
+/*
+ * Called when an unmount is requested and certain sanity checks have
+ * already passed. At this point no dentries or inodes have been reclaimed
+ * from their respective caches. We drop the extra reference on the .zfs
+ * control directory to allow everything to be reclaimed. All snapshots
+ * must already have been unmounted to reach this point.
+ */
+void
+zfs_preumount(struct super_block *sb)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+
+	/* zfsvfs is NULL when zfs_domount fails during mount */
+	if (zfsvfs) {
+		zfs_unlinked_drain_stop_wait(zfsvfs);
+		zfsctl_destroy(sb->s_fs_info);
+		/*
+		 * Wait for zrele_async before entering evict_inodes in
+		 * generic_shutdown_super. The reason we must finish before
+		 * evict_inodes is when lazytime is on, or when zfs_purgedir
+		 * calls zfs_zget, zrele would bump i_count from 0 to 1. This
+		 * would race with the i_count check in evict_inodes. This
+		 * means it could destroy the inode while we are still
+		 * using it.
+		 *
+		 * We wait for two passes. xattr directories in the first pass
+		 * may add xattr entries in zfs_purgedir, so in the second pass
+		 * we wait for them. We don't use taskq_wait here because it is
+		 * a pool wide taskq. Other mounted filesystems can constantly
+		 * do zrele_async and there's no guarantee when taskq will be
+		 * empty.
+		 */
+		taskq_wait_outstanding(dsl_pool_zrele_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), 0);
+		taskq_wait_outstanding(dsl_pool_zrele_taskq(
+		    dmu_objset_pool(zfsvfs->z_os)), 0);
+	}
+}
+
+/*
+ * Called once all other unmount-related tear down has occurred.
+ * It is our responsibility to release any remaining infrastructure.
+ */
+/*ARGSUSED*/
+int
+zfs_umount(struct super_block *sb)
+{
+	zfsvfs_t *zfsvfs = sb->s_fs_info;
+	objset_t *os;
+
+	if (zfsvfs->z_arc_prune != NULL)
+		arc_remove_prune_callback(zfsvfs->z_arc_prune);
+	VERIFY(zfsvfs_teardown(zfsvfs, B_TRUE) == 0);
+	os = zfsvfs->z_os;
+	zpl_bdi_destroy(sb);
+
+	/*
+	 * z_os will be NULL if there was an error in
+	 * attempting to reopen zfsvfs.
+	 */
+	if (os != NULL) {
+		/*
+		 * Unset the objset user_ptr.
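+		 * This is done under os_user_ptr_lock so that readers
+		 * which fetch the user pointer under the same lock, such
+		 * as zfs_get_temporary_prop() and
+		 * zfs_get_vfs_flag_unmounted() in this file, can never
+		 * observe a zfsvfs that is about to be freed.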
+ */ + mutex_enter(&os->os_user_ptr_lock); + dmu_objset_set_user(os, NULL); + mutex_exit(&os->os_user_ptr_lock); + + /* + * Finally release the objset + */ + dmu_objset_disown(os, B_TRUE, zfsvfs); + } + + zfsvfs_free(zfsvfs); + return (0); +} + +int +zfs_remount(struct super_block *sb, int *flags, zfs_mnt_t *zm) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + vfs_t *vfsp; + boolean_t issnap = dmu_objset_is_snapshot(zfsvfs->z_os); + int error; + + if ((issnap || !spa_writeable(dmu_objset_spa(zfsvfs->z_os))) && + !(*flags & SB_RDONLY)) { + *flags |= SB_RDONLY; + return (EROFS); + } + + error = zfsvfs_parse_options(zm->mnt_data, &vfsp); + if (error) + return (error); + + if (!zfs_is_readonly(zfsvfs) && (*flags & SB_RDONLY)) + txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + + zfs_unregister_callbacks(zfsvfs); + zfsvfs_vfs_free(zfsvfs->z_vfs); + + vfsp->vfs_data = zfsvfs; + zfsvfs->z_vfs = vfsp; + if (!issnap) + (void) zfs_register_callbacks(vfsp); + + return (error); +} + +int +zfs_vget(struct super_block *sb, struct inode **ipp, fid_t *fidp) +{ + zfsvfs_t *zfsvfs = sb->s_fs_info; + znode_t *zp; + uint64_t object = 0; + uint64_t fid_gen = 0; + uint64_t gen_mask; + uint64_t zp_gen; + int i, err; + + *ipp = NULL; + + if (fidp->fid_len == SHORT_FID_LEN || fidp->fid_len == LONG_FID_LEN) { + zfid_short_t *zfid = (zfid_short_t *)fidp; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + object |= ((uint64_t)zfid->zf_object[i]) << (8 * i); + + for (i = 0; i < sizeof (zfid->zf_gen); i++) + fid_gen |= ((uint64_t)zfid->zf_gen[i]) << (8 * i); + } else { + return (SET_ERROR(EINVAL)); + } + + /* LONG_FID_LEN means snapdirs */ + if (fidp->fid_len == LONG_FID_LEN) { + zfid_long_t *zlfid = (zfid_long_t *)fidp; + uint64_t objsetid = 0; + uint64_t setgen = 0; + + for (i = 0; i < sizeof (zlfid->zf_setid); i++) + objsetid |= ((uint64_t)zlfid->zf_setid[i]) << (8 * i); + + for (i = 0; i < sizeof (zlfid->zf_setgen); i++) + setgen |= ((uint64_t)zlfid->zf_setgen[i]) << (8 * i); + + if (objsetid != ZFSCTL_INO_SNAPDIRS - object) { + dprintf("snapdir fid: objsetid (%llu) != " + "ZFSCTL_INO_SNAPDIRS (%llu) - object (%llu)\n", + objsetid, ZFSCTL_INO_SNAPDIRS, object); + + return (SET_ERROR(EINVAL)); + } + + if (fid_gen > 1 || setgen != 0) { + dprintf("snapdir fid: fid_gen (%llu) and setgen " + "(%llu)\n", fid_gen, setgen); + return (SET_ERROR(EINVAL)); + } + + return (zfsctl_snapdir_vget(sb, objsetid, fid_gen, ipp)); + } + + ZFS_ENTER(zfsvfs); + /* A zero fid_gen means we are in the .zfs control directories */ + if (fid_gen == 0 && + (object == ZFSCTL_INO_ROOT || object == ZFSCTL_INO_SNAPDIR)) { + *ipp = zfsvfs->z_ctldir; + ASSERT(*ipp != NULL); + if (object == ZFSCTL_INO_SNAPDIR) { + VERIFY(zfsctl_root_lookup(*ipp, "snapshot", ipp, + 0, kcred, NULL, NULL) == 0); + } else { + igrab(*ipp); + } + ZFS_EXIT(zfsvfs); + return (0); + } + + gen_mask = -1ULL >> (64 - 8 * i); + + dprintf("getting %llu [%llu mask %llx]\n", object, fid_gen, gen_mask); + if ((err = zfs_zget(zfsvfs, object, &zp))) { + ZFS_EXIT(zfsvfs); + return (err); + } + + /* Don't export xattr stuff */ + if (zp->z_pflags & ZFS_XATTR) { + zrele(zp); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOENT)); + } + + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), &zp_gen, + sizeof (uint64_t)); + zp_gen = zp_gen & gen_mask; + if (zp_gen == 0) + zp_gen = 1; + if ((fid_gen == 0) && (zfsvfs->z_root == object)) + fid_gen = zp_gen; + if (zp->z_unlinked || zp_gen != fid_gen) { + dprintf("znode gen (%llu) != fid gen (%llu)\n", zp_gen, + fid_gen); + zrele(zp); + ZFS_EXIT(zfsvfs); + 
return (SET_ERROR(ENOENT));
+	}
+
+	*ipp = ZTOI(zp);
+	if (*ipp)
+		zfs_inode_update(ITOZ(*ipp));
+
+	ZFS_EXIT(zfsvfs);
+	return (0);
+}
+
+/*
+ * Block out VFS ops and close zfsvfs_t
+ *
+ * Note, if successful, then we return with the 'z_teardown_lock' and
+ * 'z_teardown_inactive_lock' write held. We leave ownership of the underlying
+ * dataset and objset intact so that they can be atomically handed off during
+ * a subsequent rollback or recv operation and the resume thereafter.
+ */
+int
+zfs_suspend_fs(zfsvfs_t *zfsvfs)
+{
+	int error;
+
+	if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0)
+		return (error);
+
+	return (0);
+}
+
+/*
+ * Rebuild SA and release VOPs. Note that ownership of the underlying dataset
+ * is an invariant across any of the operations that can be performed while the
+ * filesystem was suspended. Whether it succeeded or failed, the preconditions
+ * are the same: the relevant objset and associated dataset are owned by
+ * zfsvfs, held, and long held on entry.
+ */
+int
+zfs_resume_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds)
+{
+	int err, err2;
+	znode_t *zp;
+
+	ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
+	ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
+
+	/*
+	 * We already own this, so just update the objset_t, as the one we
+	 * had before may have been evicted.
+	 */
+	objset_t *os;
+	VERIFY3P(ds->ds_owner, ==, zfsvfs);
+	VERIFY(dsl_dataset_long_held(ds));
+	dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds));
+	dsl_pool_config_enter(dp, FTAG);
+	VERIFY0(dmu_objset_from_ds(ds, &os));
+	dsl_pool_config_exit(dp, FTAG);
+
+	err = zfsvfs_init(zfsvfs, os);
+	if (err != 0)
+		goto bail;
+
+	ds->ds_dir->dd_activity_cancelled = B_FALSE;
+	VERIFY(zfsvfs_setup(zfsvfs, B_FALSE) == 0);
+
+	zfs_set_fuid_feature(zfsvfs);
+	zfsvfs->z_rollback_time = jiffies;
+
+	/*
+	 * Attempt to re-establish all the active inodes with their
+	 * dbufs. If a zfs_rezget() fails, then we unhash the inode
+	 * and mark it stale. This prevents a collision if a new
+	 * inode/object is created which must use the same inode
+	 * number. The stale inode will be released when the
+	 * VFS prunes the dentry holding the remaining references
+	 * on the stale inode.
+	 */
+	mutex_enter(&zfsvfs->z_znodes_lock);
+	for (zp = list_head(&zfsvfs->z_all_znodes); zp;
+	    zp = list_next(&zfsvfs->z_all_znodes, zp)) {
+		err2 = zfs_rezget(zp);
+		if (err2) {
+			remove_inode_hash(ZTOI(zp));
+			zp->z_is_stale = B_TRUE;
+		}
+
+		/* see comment in zfs_suspend_fs() */
+		if (zp->z_suspended) {
+			zfs_zrele_async(zp);
+			zp->z_suspended = B_FALSE;
+		}
+	}
+	mutex_exit(&zfsvfs->z_znodes_lock);
+
+	if (!zfs_is_readonly(zfsvfs) && !zfsvfs->z_unmounted) {
+		/*
+		 * zfs_suspend_fs() could have interrupted freeing
+		 * of dnodes. We need to restart this freeing so
+		 * that we don't "leak" the space.
+		 */
+		zfs_unlinked_drain(zfsvfs);
+	}
+
+	/*
+	 * Most of the time zfs_suspend_fs is used for changing the contents
+	 * of the underlying dataset. ZFS rollback and receive operations
+	 * might create files for which negative dentries are present in
+	 * the cache. Since walking the dcache would require a lot of GPL-only
+	 * code duplication, it's much easier on these rather rare occasions
+	 * just to flush the whole dcache for the given dataset/filesystem.
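+	 * (A concrete case: a failed lookup leaves a negative dentry
+	 * behind, and if a rollback or receive then creates that very
+	 * name, the stale negative dentry would keep answering ENOENT
+	 * until flushed.)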
+ */ + shrink_dcache_sb(zfsvfs->z_sb); + +bail: + if (err != 0) + zfsvfs->z_unmounted = B_TRUE; + + /* release the VFS ops */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + if (err != 0) { + /* + * Since we couldn't setup the sa framework, try to force + * unmount this file system. + */ + if (zfsvfs->z_os) + (void) zfs_umount(zfsvfs->z_sb); + } + return (err); +} + +/* + * Release VOPs and unmount a suspended filesystem. + */ +int +zfs_end_fs(zfsvfs_t *zfsvfs, dsl_dataset_t *ds) +{ + ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock)); + ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); + + /* + * We already own this, so just hold and rele it to update the + * objset_t, as the one we had before may have been evicted. + */ + objset_t *os; + VERIFY3P(ds->ds_owner, ==, zfsvfs); + VERIFY(dsl_dataset_long_held(ds)); + dsl_pool_t *dp = spa_get_dsl(dsl_dataset_get_spa(ds)); + dsl_pool_config_enter(dp, FTAG); + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_config_exit(dp, FTAG); + zfsvfs->z_os = os; + + /* release the VOPs */ + rw_exit(&zfsvfs->z_teardown_inactive_lock); + rrm_exit(&zfsvfs->z_teardown_lock, FTAG); + + /* + * Try to force unmount this file system. + */ + (void) zfs_umount(zfsvfs->z_sb); + zfsvfs->z_unmounted = B_TRUE; + return (0); +} + +/* + * Automounted snapshots rely on periodic revalidation + * to defer snapshots from being automatically unmounted. + */ + +inline void +zfs_exit_fs(zfsvfs_t *zfsvfs) +{ + if (!zfsvfs->z_issnap) + return; + + if (time_after(jiffies, zfsvfs->z_snap_defer_time + + MAX(zfs_expire_snapshot * HZ / 2, HZ))) { + zfsvfs->z_snap_defer_time = jiffies; + zfsctl_snapshot_unmount_delay(zfsvfs->z_os->os_spa, + dmu_objset_id(zfsvfs->z_os), + zfs_expire_snapshot); + } +} + +int +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) +{ + int error; + objset_t *os = zfsvfs->z_os; + dmu_tx_t *tx; + + if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) + return (SET_ERROR(EINVAL)); + + if (newvers < zfsvfs->z_version) + return (SET_ERROR(EINVAL)); + + if (zfs_spa_version_map(newvers) > + spa_version(dmu_objset_spa(zfsvfs->z_os))) + return (SET_ERROR(ENOTSUP)); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + ZFS_SA_ATTRS); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + } + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); + } + + if (newvers >= ZPL_VERSION_SA && !zfsvfs->z_use_sa) { + uint64_t sa_obj; + + ASSERT3U(spa_version(dmu_objset_spa(zfsvfs->z_os)), >=, + SPA_VERSION_SA); + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + + error = zap_add(os, MASTER_NODE_OBJ, + ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT0(error); + + VERIFY(0 == sa_set_sa_object(os, sa_obj)); + sa_register_update_callback(os, zfs_sa_upgrade); + } + + spa_history_log_internal_ds(dmu_objset_ds(os), "upgrade", tx, + "from %llu to %llu", zfsvfs->z_version, newvers); + + dmu_tx_commit(tx); + + zfsvfs->z_version = newvers; + os->os_version = newvers; + + zfs_set_fuid_feature(zfsvfs); + + return (0); +} + +/* + * Read a property stored within the master node. 
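+ *
+ * A sketch of typical use (the variable name is illustrative only):
+ *
+ *	uint64_t norm;
+ *	error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm);
+ *
+ * On success, norm holds the objset's cached value when initialized,
+ * otherwise the value stored in the master node ZAP, or the documented
+ * default (0 for normalize) when the property was never set.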
+ */ +int +zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value) +{ + uint64_t *cached_copy = NULL; + + /* + * Figure out where in the objset_t the cached copy would live, if it + * is available for the requested property. + */ + if (os != NULL) { + switch (prop) { + case ZFS_PROP_VERSION: + cached_copy = &os->os_version; + break; + case ZFS_PROP_NORMALIZE: + cached_copy = &os->os_normalization; + break; + case ZFS_PROP_UTF8ONLY: + cached_copy = &os->os_utf8only; + break; + case ZFS_PROP_CASE: + cached_copy = &os->os_casesensitivity; + break; + default: + break; + } + } + if (cached_copy != NULL && *cached_copy != OBJSET_PROP_UNINITIALIZED) { + *value = *cached_copy; + return (0); + } + + /* + * If the property wasn't cached, look up the file system's value for + * the property. For the version property, we look up a slightly + * different string. + */ + const char *pname; + int error = ENOENT; + if (prop == ZFS_PROP_VERSION) + pname = ZPL_VERSION_STR; + else + pname = zfs_prop_to_name(prop); + + if (os != NULL) { + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + error = zap_lookup(os, MASTER_NODE_OBJ, pname, 8, 1, value); + } + + if (error == ENOENT) { + /* No value set, use the default value */ + switch (prop) { + case ZFS_PROP_VERSION: + *value = ZPL_VERSION; + break; + case ZFS_PROP_NORMALIZE: + case ZFS_PROP_UTF8ONLY: + *value = 0; + break; + case ZFS_PROP_CASE: + *value = ZFS_CASE_SENSITIVE; + break; + case ZFS_PROP_ACLTYPE: + *value = ZFS_ACLTYPE_OFF; + break; + default: + return (error); + } + error = 0; + } + + /* + * If one of the methods for getting the property value above worked, + * copy it into the objset_t's cache. + */ + if (error == 0 && cached_copy != NULL) { + *cached_copy = *value; + } + + return (error); +} + +/* + * Return true if the corresponding vfs's unmounted flag is set. + * Otherwise return false. + * If this function returns true we know VFS unmount has been initiated. + */ +boolean_t +zfs_get_vfs_flag_unmounted(objset_t *os) +{ + zfsvfs_t *zfvp; + boolean_t unmounted = B_FALSE; + + ASSERT(dmu_objset_type(os) == DMU_OST_ZFS); + + mutex_enter(&os->os_user_ptr_lock); + zfvp = dmu_objset_get_user(os); + if (zfvp != NULL && zfvp->z_unmounted) + unmounted = B_TRUE; + mutex_exit(&os->os_user_ptr_lock); + + return (unmounted); +} + +void +zfs_init(void) +{ + zfsctl_init(); + zfs_znode_init(); + dmu_objset_register_type(DMU_OST_ZFS, zpl_get_file_info); + register_filesystem(&zpl_fs_type); +} + +void +zfs_fini(void) +{ + /* + * we don't use outstanding because zpl_posix_acl_free might add more. + */ + taskq_wait(system_delay_taskq); + taskq_wait(system_taskq); + unregister_filesystem(&zpl_fs_type); + zfs_znode_fini(); + zfsctl_fini(); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_suspend_fs); +EXPORT_SYMBOL(zfs_resume_fs); +EXPORT_SYMBOL(zfs_set_version); +EXPORT_SYMBOL(zfsvfs_create); +EXPORT_SYMBOL(zfsvfs_free); +EXPORT_SYMBOL(zfs_is_readonly); +EXPORT_SYMBOL(zfs_domount); +EXPORT_SYMBOL(zfs_preumount); +EXPORT_SYMBOL(zfs_umount); +EXPORT_SYMBOL(zfs_remount); +EXPORT_SYMBOL(zfs_statvfs); +EXPORT_SYMBOL(zfs_vget); +EXPORT_SYMBOL(zfs_prune); +#endif diff --git a/module/os/linux/zfs/zfs_vnops.c b/module/os/linux/zfs/zfs_vnops.c new file mode 100644 index 000000000000..2d104a5001ec --- /dev/null +++ b/module/os/linux/zfs/zfs_vnops.c @@ -0,0 +1,5031 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* Portions Copyright 2007 Jeremy Teo */ +/* Portions Copyright 2010 Robert Milkowski */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Programming rules. + * + * Each vnode op performs some logical unit of work. To do this, the ZPL must + * properly lock its in-core state, create a DMU transaction, do the work, + * record this work in the intent log (ZIL), commit the DMU transaction, + * and wait for the intent log to commit if it is a synchronous operation. + * Moreover, the vnode ops must work in both normal and log replay context. + * The ordering of events is important to avoid deadlocks and references + * to freed memory. The example below illustrates the following Big Rules: + * + * (1) A check must be made in each zfs thread for a mounted file system. + * This is done avoiding races using ZFS_ENTER(zfsvfs). + * A ZFS_EXIT(zfsvfs) is needed before all returns. Any znodes + * must be checked with ZFS_VERIFY_ZP(zp). Both of these macros + * can return EIO from the calling function. + * + * (2) zrele() should always be the last thing except for zil_commit() + * (if necessary) and ZFS_EXIT(). This is for 3 reasons: + * First, if it's the last reference, the vnode/znode + * can be freed, so the zp may point to freed memory. Second, the last + * reference will call zfs_zinactive(), which may induce a lot of work -- + * pushing cached pages (which acquires range locks) and syncing out + * cached atime changes. Third, zfs_zinactive() may require a new tx, + * which could deadlock the system if you were already holding one. + * If you must call zrele() within a tx then use zfs_zrele_async(). + * + * (3) All range locks must be grabbed before calling dmu_tx_assign(), + * as they can span dmu_tx_assign() calls. + * + * (4) If ZPL locks are held, pass TXG_NOWAIT as the second argument to + * dmu_tx_assign(). This is critical because we don't want to block + * while holding locks. + * + * If no ZPL locks are held (aside from ZFS_ENTER()), use TXG_WAIT. This + * reduces lock contention and CPU usage when we must wait (note that if + * throughput is constrained by the storage, nearly every transaction + * must wait). 
+ *
+ * Note, in particular, that if a lock is sometimes acquired before
+ * the tx assigns, and sometimes after (e.g. z_lock), then failing
+ * to use a non-blocking assign can deadlock the system. The scenario:
+ *
+ * Thread A has grabbed a lock before calling dmu_tx_assign().
+ * Thread B is in an already-assigned tx, and blocks for this lock.
+ * Thread A calls dmu_tx_assign(TXG_WAIT) and blocks in txg_wait_open()
+ * forever, because the previous txg can't quiesce until B's tx commits.
+ *
+ * If dmu_tx_assign() returns ERESTART and zfsvfs->z_assign is TXG_NOWAIT,
+ * then drop all locks, call dmu_tx_wait(), and try again. On subsequent
+ * calls to dmu_tx_assign(), pass TXG_NOTHROTTLE in addition to TXG_NOWAIT,
+ * to indicate that this operation has already called dmu_tx_wait().
+ * This will ensure that we don't retry forever, waiting a short bit
+ * each time.
+ *
+ * (5)	If the operation succeeded, generate the intent log entry for it
+ *	before dropping locks. This ensures that the ordering of events
+ *	in the intent log matches the order in which they actually occurred.
+ *	During ZIL replay the zfs_log_* functions will update the sequence
+ *	number to indicate the zil transaction has replayed.
+ *
+ * (6)	At the end of each vnode op, the DMU tx must always commit,
+ *	regardless of whether there were any errors.
+ *
+ * (7)	After dropping all locks, invoke zil_commit(zilog, foid)
+ *	to ensure that synchronous semantics are provided when necessary.
+ *
+ * In general, this is how things should be ordered in each vnode op:
+ *
+ *	ZFS_ENTER(zfsvfs);		// exit if unmounted
+ * top:
+ *	zfs_dirent_lock(&dl, ...)	// lock directory entry (may igrab())
+ *	rw_enter(...);			// grab any other locks you need
+ *	tx = dmu_tx_create(...);	// get DMU tx
+ *	dmu_tx_hold_*();		// hold each object you might modify
+ *	error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ *	if (error) {
+ *		rw_exit(...);		// drop locks
+ *		zfs_dirent_unlock(dl);	// unlock directory entry
+ *		zrele(...);		// release held znodes
+ *		if (error == ERESTART) {
+ *			waited = B_TRUE;
+ *			dmu_tx_wait(tx);
+ *			dmu_tx_abort(tx);
+ *			goto top;
+ *		}
+ *		dmu_tx_abort(tx);	// abort DMU tx
+ *		ZFS_EXIT(zfsvfs);	// finished in zfs
+ *		return (error);		// really out of space
+ *	}
+ *	error = do_real_work();		// do whatever this VOP does
+ *	if (error == 0)
+ *		zfs_log_*(...);		// on success, make ZIL entry
+ *	dmu_tx_commit(tx);		// commit DMU tx -- error or not
+ *	rw_exit(...);			// drop locks
+ *	zfs_dirent_unlock(dl);		// unlock directory entry
+ *	zrele(...);			// release held znodes
+ *	zil_commit(zilog, foid);	// synchronous when necessary
+ *	ZFS_EXIT(zfsvfs);		// finished in zfs
+ *	return (error);			// done, report error
+ */
+
+/*
+ * Virus scanning is unsupported. It would be possible to add a hook
+ * here to perform the required virus scan. This could be done
+ * entirely in the kernel or potentially as an upcall to invoke a
+ * scanning utility.
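+ *
+ * As it stands, zfs_vscan() below unconditionally reports files clean;
+ * zfs_open() and zfs_close() still consult the z_vscan property and the
+ * ZFS_AV_QUARANTINED flag before invoking it.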
+ */ +static int +zfs_vscan(struct inode *ip, cred_t *cr, int async) +{ + return (0); +} + +/* ARGSUSED */ +int +zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Honor ZFS_APPENDONLY file attribute */ + if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + ((flag & O_APPEND) == 0)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* Virus scan eligible files on open */ + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) { + if (zfs_vscan(ip, cr, 0) != 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + } + + /* Keep a count of the synchronous opens in the znode */ + if (flag & O_SYNC) + atomic_inc_32(&zp->z_sync_cnt); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* ARGSUSED */ +int +zfs_close(struct inode *ip, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + /* Decrement the synchronous opens in the znode */ + if (flag & O_SYNC) + atomic_dec_32(&zp->z_sync_cnt); + + if (!zfs_has_ctldir(zp) && zfsvfs->z_vscan && S_ISREG(ip->i_mode) && + !(zp->z_pflags & ZFS_AV_QUARANTINED) && zp->z_size > 0) + VERIFY(zfs_vscan(ip, cr, 1) == 0); + + ZFS_EXIT(zfsvfs); + return (0); +} + +#if defined(SEEK_HOLE) && defined(SEEK_DATA) +/* + * Lseek support for finding holes (cmd == SEEK_HOLE) and + * data (cmd == SEEK_DATA). "off" is an in/out parameter. + */ +static int +zfs_holey_common(struct inode *ip, int cmd, loff_t *off) +{ + znode_t *zp = ITOZ(ip); + uint64_t noff = (uint64_t)*off; /* new offset */ + uint64_t file_sz; + int error; + boolean_t hole; + + file_sz = zp->z_size; + if (noff >= file_sz) { + return (SET_ERROR(ENXIO)); + } + + if (cmd == SEEK_HOLE) + hole = B_TRUE; + else + hole = B_FALSE; + + error = dmu_offset_next(ZTOZSB(zp)->z_os, zp->z_id, hole, &noff); + + if (error == ESRCH) + return (SET_ERROR(ENXIO)); + + /* file was dirty, so fall back to using generic logic */ + if (error == EBUSY) { + if (hole) + *off = file_sz; + + return (0); + } + + /* + * We could find a hole that begins after the logical end-of-file, + * because dmu_offset_next() only works on whole blocks. If the + * EOF falls mid-block, then indicate that the "virtual hole" + * at the end of the file begins at the logical EOF, rather than + * at the end of the last block. + */ + if (noff > file_sz) { + ASSERT(hole); + noff = file_sz; + } + + if (noff < *off) + return (error); + *off = noff; + return (error); +} + +int +zfs_holey(struct inode *ip, int cmd, loff_t *off) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_holey_common(ip, cmd, off); + + ZFS_EXIT(zfsvfs); + return (error); +} +#endif /* SEEK_HOLE && SEEK_DATA */ + +#if defined(_KERNEL) +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Write: If we find a memory mapped page, we write to *both* + * the page and the dmu buffer. 
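+ *
+ * The loop below advances page by page: with 4 KiB pages, an I/O
+ * starting at file offset 5000 has off = 5000 & (PAGE_SIZE-1) = 904,
+ * so the first page carries at most PAGE_SIZE - 904 = 3192 bytes and
+ * every subsequent page starts at off = 0.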
+ */ +static void +update_pages(struct inode *ip, int64_t start, int len, + objset_t *os, uint64_t oid) +{ + struct address_space *mp = ip->i_mapping; + struct page *pp; + uint64_t nbytes; + int64_t off; + void *pb; + + off = start & (PAGE_SIZE-1); + for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { + nbytes = MIN(PAGE_SIZE - off, len); + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + pb = kmap(pp); + (void) dmu_read(os, oid, start+off, nbytes, pb+off, + DMU_READ_PREFETCH); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + SetPageUptodate(pp); + ClearPageError(pp); + unlock_page(pp); + put_page(pp); + } + + len -= nbytes; + off = 0; + } +} + +/* + * When a file is memory mapped, we must keep the IO data synchronized + * between the DMU cache and the memory mapped pages. What this means: + * + * On Read: We "read" preferentially from memory mapped pages, + * else we default from the dmu buffer. + * + * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when + * the file is memory mapped. + */ +static int +mappedread(struct inode *ip, int nbytes, uio_t *uio) +{ + struct address_space *mp = ip->i_mapping; + struct page *pp; + znode_t *zp = ITOZ(ip); + int64_t start, off; + uint64_t bytes; + int len = nbytes; + int error = 0; + void *pb; + + start = uio->uio_loffset; + off = start & (PAGE_SIZE-1); + for (start &= PAGE_MASK; len > 0; start += PAGE_SIZE) { + bytes = MIN(PAGE_SIZE - off, len); + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + ASSERT(PageUptodate(pp)); + unlock_page(pp); + + pb = kmap(pp); + error = uiomove(pb + off, bytes, UIO_READ, uio); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + put_page(pp); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, bytes); + } + + len -= bytes; + off = 0; + if (error) + break; + } + return (error); +} +#endif /* _KERNEL */ + +unsigned long zfs_read_chunk_size = 1024 * 1024; /* Tunable */ +unsigned long zfs_delete_blocks = DMU_MAX_DELETEBLKCNT; + +/* + * Read bytes from specified file into supplied buffer. + * + * IN: ip - inode of file to be read from. + * uio - structure supplying read location, range info, + * and return buffer. + * ioflag - O_SYNC flags; used to provide FRSYNC semantics. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range, buffer filled. + * + * RETURN: 0 on success, error code on failure. + * + * Side Effects: + * inode - atime updated if byte count > 0 + */ +/* ARGSUSED */ +int +zfs_read(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + boolean_t frsync = B_FALSE; + + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (zp->z_pflags & ZFS_AV_QUARANTINED) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + /* + * Validate file offset + */ + if (uio->uio_loffset < (offset_t)0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Fasttrack empty reads + */ + if (uio->uio_resid == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + +#ifdef FRSYNC + /* + * If we're in FRSYNC mode, sync out this znode before reading it. + * Only do this for non-snapshots. + * + * Some platforms do not support FRSYNC and instead map it + * to O_SYNC, which results in unnecessary calls to zil_commit. 
We + * only honor FRSYNC requests on platforms which support it. + */ + frsync = !!(ioflag & FRSYNC); +#endif + if (zfsvfs->z_log && + (frsync || zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)) + zil_commit(zfsvfs->z_log, zp->z_id); + + /* + * Lock the range against changes. + */ + zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock, + uio->uio_loffset, uio->uio_resid, RL_READER); + + /* + * If we are reading past end-of-file we can skip + * to the end; but we might still need to set atime. + */ + if (uio->uio_loffset >= zp->z_size) { + error = 0; + goto out; + } + + ASSERT(uio->uio_loffset < zp->z_size); + ssize_t n = MIN(uio->uio_resid, zp->z_size - uio->uio_loffset); + ssize_t start_resid = n; + +#ifdef HAVE_UIO_ZEROCOPY + xuio_t *xuio = NULL; + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(ip)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. + */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz), 0, blksz); + } + } + } +#endif /* HAVE_UIO_ZEROCOPY */ + + while (n > 0) { + ssize_t nbytes = MIN(n, zfs_read_chunk_size - + P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); + + if (zp->z_is_mapped && !(ioflag & O_DIRECT)) { + error = mappedread(ip, nbytes, uio); + } else { + error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes); + } + + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + + n -= nbytes; + } + + int64_t nread = start_resid - n; + dataset_kstats_update_read_kstats(&zfsvfs->z_kstat, nread); + task_io_account_read(nread); +out: + zfs_rangelock_exit(lr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Write the bytes to a file. + * + * IN: ip - inode of file to be written to. + * uio - structure supplying write location, range info, + * and data buffer. + * ioflag - O_APPEND flag set if in append mode. + * O_DIRECT flag; used to bypass page cache. + * cr - credentials of caller. + * + * OUT: uio - updated offset and range. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime|mtime updated if byte count > 0 + */ + +/* ARGSUSED */ +int +zfs_write(struct inode *ip, uio_t *uio, int ioflag, cred_t *cr) +{ + int error = 0; + ssize_t start_resid = uio->uio_resid; + + /* + * Fasttrack empty write + */ + ssize_t n = start_resid; + if (n == 0) + return (0); + + rlim64_t limit = uio->uio_limit; + if (limit == RLIM64_INFINITY || limit > MAXOFFSET_T) + limit = MAXOFFSET_T; + + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + sa_bulk_attr_t bulk[4]; + int count = 0; + uint64_t mtime[2], ctime[2]; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. 
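+	 * (The read-only state can change underneath an open file
+	 * descriptor, e.g. when the readonly property flips at run time
+	 * via readonly_changed_cb(), so a check at open(2) time alone
+	 * would not be sufficient.)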
+ */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + /* + * If immutable or not appending then return EPERM + */ + if ((zp->z_pflags & (ZFS_IMMUTABLE | ZFS_READONLY)) || + ((zp->z_pflags & ZFS_APPENDONLY) && !(ioflag & O_APPEND) && + (uio->uio_loffset < zp->z_size))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + /* + * Validate file offset + */ + offset_t woff = ioflag & O_APPEND ? zp->z_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + int max_blksz = zfsvfs->z_max_blksz; + xuio_t *xuio = NULL; + + /* + * Pre-fault the pages to ensure slow (eg NFS) pages + * don't hold up txg. + * Skip this if uio contains loaned arc_buf. + */ +#ifdef HAVE_UIO_ZEROCOPY + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else +#endif + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFAULT)); + } + + /* + * If in append mode, set the io offset pointer to eof. + */ + zfs_locked_range_t *lr; + if (ioflag & O_APPEND) { + /* + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, n, RL_APPEND); + woff = lr->lr_offset; + if (lr->lr_length == UINT64_MAX) { + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_size; + } + uio->uio_loffset = woff; + } else { + /* + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, woff, n, RL_WRITER); + } + + if (woff >= limit) { + zfs_rangelock_exit(lr); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EFBIG)); + } + + if ((woff + n) > limit || woff > (limit - n)) + n = limit - woff; + + /* Will this write extend the file length? */ + int write_eof = (woff + n > zp->z_size); + + uint64_t end_size = MAX(zp->z_size, woff + n); + zilog_t *zilog = zfsvfs->z_log; +#ifdef HAVE_UIO_ZEROCOPY + int i_iov = 0; + const iovec_t *iovp = uio->uio_iov; + int iovcnt __maybe_unused = uio->uio_iovcnt; +#endif + + + /* + * Write the file in reasonable size chunks. Each chunk is written + * in a separate transaction; this keeps the intent log records small + * and allows us to do more fine-grained space accounting. + */ + while (n > 0) { + woff = uio->uio_loffset; + + if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT, + KUID_TO_SUID(ip->i_uid)) || + zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT, + KGID_TO_SGID(ip->i_gid)) || + (zp->z_projid != ZFS_DEFAULT_PROJID && + zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT, + zp->z_projid))) { + error = SET_ERROR(EDQUOT); + break; + } + + arc_buf_t *abuf = NULL; + const iovec_t *aiov = NULL; + if (xuio) { +#ifdef HAVE_UIO_ZEROCOPY + ASSERT(i_iov < iovcnt); + ASSERT3U(uio->uio_segflg, !=, UIO_BVEC); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; +#endif + } else if (n >= max_blksz && woff >= zp->z_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. 
"Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if ((error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes))) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + + /* + * Start a transaction. + */ + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + DB_DNODE_ENTER(db); + dmu_tx_hold_write_by_dnode(tx, DB_DNODE(db), woff, + MIN(n, max_blksz)); + DB_DNODE_EXIT(db); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); + break; + } + + /* + * If rangelock_enter() over-locked we grow the blocksize + * and then reduce the lock range. This will only happen + * on the first iteration since rangelock_reduce() will + * shrink down lr_length to the appropriate size. + */ + if (lr->lr_length == UINT64_MAX) { + uint64_t new_blksz; + + if (zp->z_blksz > max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + new_blksz = MIN(end_size, + 1 << highbit64(zp->z_blksz)); + } else { + new_blksz = MIN(end_size, max_blksz); + } + zfs_grow_blocksize(zp, new_blksz, tx); + zfs_rangelock_reduce(lr, woff, n); + } + + /* + * XXX - should we really limit each write to z_max_blksz? + * Perhaps we should use SPA_MAXBLOCKSIZE chunks? + */ + ssize_t nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); + + ssize_t tx_bytes; + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + uio->uio_fault_disable = B_TRUE; + error = dmu_write_uio_dbuf(sa_get_db(zp->z_sa_hdl), + uio, nbytes, tx); + uio->uio_fault_disable = B_FALSE; + if (error == EFAULT) { + dmu_tx_commit(tx); + /* + * Account for partial writes before + * continuing the loop. + * Update needs to occur before the next + * uio_prefaultpages, or prefaultpages may + * error, and we may break the loop early. + */ + if (tx_bytes != uio->uio_resid) + n -= tx_bytes - uio->uio_resid; + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + break; + } + continue; + } else if (error != 0) { + dmu_tx_commit(tx); + break; + } + tx_bytes -= uio->uio_resid; + } else { + tx_bytes = nbytes; + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). 
+ */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + /* cppcheck-suppress nullPointer */ + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + error = dmu_assign_arcbuf_by_dbuf( + sa_get_db(zp->z_sa_hdl), woff, abuf, tx); + if (error != 0) { + dmu_return_arcbuf(abuf); + dmu_tx_commit(tx); + break; + } + } + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && zp->z_is_mapped && !(ioflag & O_DIRECT)) { + update_pages(ip, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); + } + + /* + * If we made no progress, we're done. If we made even + * partial progress, update the znode and ZIL accordingly. + */ + if (tx_bytes == 0) { + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + dmu_tx_commit(tx); + ASSERT(error != 0); + break; + } + + /* + * Clear Set-UID/Set-GID bits on successful write if not + * privileged and at least one of the execute bits is set. + * + * It would be nice to do this after all writes have + * been done, but that would still expose the ISUID/ISGID + * to another app after the partial write is committed. + * + * Note: we don't call zfs_fuid_map_id() here because + * user 0 is not an ephemeral uid. + */ + mutex_enter(&zp->z_acl_lock); + uint32_t uid = KUID_TO_SUID(ip->i_uid); + if ((zp->z_mode & (S_IXUSR | (S_IXUSR >> 3) | + (S_IXUSR >> 6))) != 0 && + (zp->z_mode & (S_ISUID | S_ISGID)) != 0 && + secpolicy_vnode_setid_retain(cr, + ((zp->z_mode & S_ISUID) != 0 && uid == 0)) != 0) { + uint64_t newmode; + zp->z_mode &= ~(S_ISUID | S_ISGID); + ip->i_mode = newmode = zp->z_mode; + (void) sa_update(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), + (void *)&newmode, sizeof (uint64_t), tx); + } + mutex_exit(&zp->z_acl_lock); + + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + + /* + * Update the file size (zp_size) if it has changed; + * account for possible concurrent updates. + */ + while ((end_size = zp->z_size) < uio->uio_loffset) { + (void) atomic_cas_64(&zp->z_size, end_size, + uio->uio_loffset); + ASSERT(error == 0); + } + /* + * If we are replaying and eof is non zero then force + * the file size to the specified eof. Note, there's no + * concurrency during replay. + */ + if (zfsvfs->z_replay && zfsvfs->z_replay_eof != 0) + zp->z_size = zfsvfs->z_replay_eof; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + + zfs_log_write(zilog, tx, TX_WRITE, zp, woff, tx_bytes, ioflag, + NULL, NULL); + dmu_tx_commit(tx); + + if (error != 0) + break; + ASSERT(tx_bytes == nbytes); + n -= nbytes; + + if (!xuio && n > 0) { + if (uio_prefaultpages(MIN(n, max_blksz), uio)) { + error = EFAULT; + break; + } + } + } + + zfs_inode_update(zp); + zfs_rangelock_exit(lr); + + /* + * If we're in replay mode, or we made no progress, return error. + * Otherwise, it's at least a partial write, so it's successful. + */ + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (ioflag & (O_SYNC | O_DSYNC) || + zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, zp->z_id); + + int64_t nwritten = start_resid - uio->uio_resid; + dataset_kstats_update_write_kstats(&zfsvfs->z_kstat, nwritten); + task_io_account_write(nwritten); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Write the bytes to a file. 
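+ * (A minimal usage sketch for kernel callers, assuming a held
+ * znode zp and a kernel buffer buf of buflen bytes:
+ *
+ *	error = zfs_write_simple(zp, buf, buflen, 0, NULL);
+ *
+ * With resid == NULL, a short write is reported as EIO.)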
+ * + * IN: zp - znode of file to be written to + * data - bytes to write + * len - number of bytes to write + * pos - offset to start writing at + * + * OUT: resid - remaining bytes to write + * + * RETURN: 0 if success + * positive error code if failure + * + * Timestamps: + * zp - ctime|mtime updated if byte count > 0 + */ +int +zfs_write_simple(znode_t *zp, const void *data, size_t len, + loff_t pos, size_t *resid) +{ + ssize_t written; + int error = 0; + + written = zpl_write_common(ZTOI(zp), data, len, &pos, + UIO_SYSSPACE, 0, kcred); + if (written < 0) { + error = -written; + } else if (resid == NULL) { + if (written < len) + error = SET_ERROR(EIO); /* short write */ + } else { + *resid = len - written; + } + return (error); +} + +/* + * Drop a reference on the passed inode asynchronously. This ensures + * that the caller will never drop the last reference on an inode in + * the current context. Doing so while holding open a tx could result + * in a deadlock if iput_final() re-enters the filesystem code. + */ +void +zfs_zrele_async(znode_t *zp) +{ + struct inode *ip = ZTOI(zp); + objset_t *os = ITOZSB(ip)->z_os; + + ASSERT(atomic_read(&ip->i_count) > 0); + ASSERT(os != NULL); + + if (atomic_read(&ip->i_count) == 1) + VERIFY(taskq_dispatch(dsl_pool_zrele_taskq(dmu_objset_pool(os)), + (task_func_t *)iput, ip, TQ_SLEEP) != TASKQID_INVALID); + else + zrele(zp); +} + +/* ARGSUSED */ +static void +zfs_get_done(zgd_t *zgd, int error) +{ + znode_t *zp = zgd->zgd_private; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + + kmem_free(zgd, sizeof (zgd_t)); +} + +#ifdef ZFS_DEBUG +static int zil_fault_io = 0; +#endif + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zfs_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zfsvfs_t *zfsvfs = arg; + objset_t *os = zfsvfs->z_os; + znode_t *zp; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error = 0; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + /* + * Nothing to do if the file has been removed + */ + if (zfs_zget(zfsvfs, object, &zp) != 0) + return (SET_ERROR(ENOENT)); + if (zp->z_unlinked) { + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + zfs_zrele_async(zp); + return (SET_ERROR(ENOENT)); + } + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + zgd->zgd_private = zp; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. + */ + if (buf != NULL) { /* immediate write */ + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + /* test for truncation needs to be done while range locked */ + if (offset >= zp->z_size) { + error = SET_ERROR(ENOENT); + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } + ASSERT(error == 0 || error == ENOENT); + } else { /* indirect write */ + /* + * Have to lock the whole block to ensure when it's + * written out and its checksum is being calculated + * that no one can change the data. 
We need to re-check + * blocksize after we get the lock in case it's changed! + */ + for (;;) { + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_lr = zfs_rangelock_enter(&zp->z_rangelock, + offset, size, RL_READER); + if (zp->z_blksz == size) + break; + offset += blkoff; + zfs_rangelock_exit(zgd->zgd_lr); + } + /* test for truncation needs to be done while range locked */ + if (lr->lr_offset >= zp->z_size) + error = SET_ERROR(ENOENT); +#ifdef ZFS_DEBUG + if (zil_fault_io) { + error = SET_ERROR(EIO); + zil_fault_io = 0; + } +#endif + if (error == 0) + error = dmu_buf_hold(os, object, offset, zgd, &db, + DMU_READ_NO_PREFETCH); + + if (error == 0) { + blkptr_t *bp = &lr->lr_blkptr; + + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= size); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + /* + * TX_WRITE2 relies on the data previously + * written by the TX_WRITE that caused + * EALREADY. We zero out the BP because + * it is the old, currently-on-disk BP. + */ + zgd->zgd_bp = NULL; + BP_ZERO(bp); + error = 0; + } + } + } + + zfs_get_done(zgd, error); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_access(struct inode *ip, int mode, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (flag & V_ACE_MASK) + error = zfs_zaccess(zp, mode, flag, B_FALSE, cr); + else + error = zfs_zaccess_rwx(zp, mode, flag, cr); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Lookup an entry in a directory, or an extended attribute directory. + * If it exists, return a held inode reference for it. + * + * IN: zdp - znode of directory to search. + * nm - name of entry to lookup. + * flags - LOOKUP_XATTR set if looking for an attribute. + * cr - credentials of caller. + * direntflags - directory lookup flags + * realpnp - returned pathname. + * + * OUT: zpp - znode of located entry, NULL if not found. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * NA + */ +/* ARGSUSED */ +int +zfs_lookup(znode_t *zdp, char *nm, znode_t **zpp, int flags, + cred_t *cr, int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zdp); + int error = 0; + + /* + * Fast path lookup, however we must skip DNLC lookup + * for case folding or normalizing lookups because the + * DNLC code only stores the passed in name. This means + * creating 'a' and removing 'A' on a case insensitive + * file system would work, but DNLC still thinks 'a' + * exists and won't let you create it again on the next + * pass through fast path. + */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (!S_ISDIR(ZTOI(zdp)->i_mode)) { + return (SET_ERROR(ENOTDIR)); + } else if (zdp->z_sa_hdl == NULL) { + return (SET_ERROR(EIO)); + } + + if (nm[0] == 0 || (nm[0] == '.' 
&& nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *zpp = zdp; + zhold(*zpp); + return (0); + } + return (error); + } + } + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zdp); + + *zpp = NULL; + + if (flags & LOOKUP_XATTR) { + /* + * We don't allow recursive attributes.. + * Maybe someday we will. + */ + if (zdp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if ((error = zfs_get_xattrdir(zdp, zpp, cr, flags))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Do we have permission to get into attribute directory? + */ + + if ((error = zfs_zaccess(*zpp, ACE_EXECUTE, 0, + B_FALSE, cr))) { + zrele(*zpp); + *zpp = NULL; + } + + ZFS_EXIT(zfsvfs); + return (error); + } + + if (!S_ISDIR(ZTOI(zdp)->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTDIR)); + } + + /* + * Check accessibility of directory. + */ + + if ((error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfsvfs->z_utf8 && u8_validate(nm, strlen(nm), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + error = zfs_dirlook(zdp, nm, zpp, flags, direntflags, realpnp); + if ((error == 0) && (*zpp)) + zfs_inode_update(*zpp); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Attempt to create a new entry in a directory. If the entry + * already exists, truncate the file if permissible, else return + * an error. Return the ip of the created or trunc'd file. + * + * IN: dzp - znode of directory to put new file entry in. + * name - name of new file entry. + * vap - attributes of new file. + * excl - flag indicating exclusive or non-exclusive mode. + * mode - mode to open file with. + * cr - credentials of caller. + * flag - file flag. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created or trunc'd entry. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated if new entry created + * zp - ctime|mtime always, atime if new + */ + +/* ARGSUSED */ +int +zfs_create(znode_t *dzp, char *name, vattr_t *vap, int excl, + int mode, znode_t **zpp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + objset_t *os; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + zilog = zfsvfs->z_log; + + if (zfsvfs->z_utf8 && u8_validate(name, strlen(name), + NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *zpp = NULL; + if (*name == '\0') { + /* + * Null component name refers to the directory itself. 
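+ * (In that case dzp itself is returned with an extra hold and no
+ * directory entry lock is taken.)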
+ */ + zhold(dzp); + zp = dzp; + dl = NULL; + error = 0; + } else { + /* possible igrab(zp) */ + int zflg = 0; + + if (flag & FIGNORECASE) + zflg |= ZCILOOK; + + error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL); + if (error) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + if (strcmp(name, "..") == 0) + error = SET_ERROR(EISDIR); + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if (zp == NULL) { + uint64_t txtype; + uint64_t projid = ZFS_DEFAULT_PROJID; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + /* + * We only support the creation of regular files in + * extended attribute directories. + */ + + if ((dzp->z_pflags & ZFS_XATTR) && !S_ISREG(vap->va_mode)) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EINVAL); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); + dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + + error = dmu_tx_assign(tx, + (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + /* + * Since, we failed to add the directory entry for it, + * delete the newly created dnode. + */ + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); + if (flag & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, name, + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); + } else { + int aflags = (flag & O_APPEND) ? V_APPEND : 0; + + if (have_acl) + zfs_acl_ids_free(&acl_ids); + have_acl = B_FALSE; + + /* + * A directory entry already exists for this name. + */ + /* + * Can't truncate an existing file if in exclusive mode. + */ + if (excl) { + error = SET_ERROR(EEXIST); + goto out; + } + /* + * Can't open a directory for writing. + */ + if (S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(EISDIR); + goto out; + } + /* + * Verify requested access to file. + */ + if (mode && (error = zfs_zaccess_rwx(zp, mode, aflags, cr))) { + goto out; + } + + mutex_enter(&dzp->z_lock); + dzp->z_seq++; + mutex_exit(&dzp->z_lock); + + /* + * Truncate regular files if requested. 
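+ * (This provides O_TRUNC-style semantics: an existing regular file
+ * being re-created with a requested size of zero is truncated via
+ * zfs_freesp() once the directory entry lock has been dropped.)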
+ */ + if (S_ISREG(ZTOI(zp)->i_mode) && + (vap->va_mask & ATTR_SIZE) && (vap->va_size == 0)) { + /* we can't hold any locks when calling zfs_freesp() */ + if (dl) { + zfs_dirent_unlock(dl); + dl = NULL; + } + error = zfs_freesp(zp, 0, 0, mode, TRUE); + } + } +out: + + if (dl) + zfs_dirent_unlock(dl); + + if (error) { + if (zp) + zrele(zp); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *zpp = zp; + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* ARGSUSED */ +int +zfs_tmpfile(struct inode *dip, vattr_t *vap, int excl, + int mode, struct inode **ipp, cred_t *cr, int flag, vsecattr_t *vsecp) +{ + znode_t *zp = NULL, *dzp = ITOZ(dip); + zfsvfs_t *zfsvfs = ITOZSB(dip); + objset_t *os; + dmu_tx_t *tx; + int error; + uid_t uid; + gid_t gid; + zfs_acl_ids_t acl_ids; + uint64_t projid = ZFS_DEFAULT_PROJID; + boolean_t fuid_dirtied; + boolean_t have_acl = B_FALSE; + boolean_t waited = B_FALSE; + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + gid = crgetgid(cr); + uid = crgetuid(cr); + + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + os = zfsvfs->z_os; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + +top: + *ipp = NULL; + + /* + * Create a new file object and update the directory + * to reference it. + */ + if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) { + if (have_acl) + zfs_acl_ids_free(&acl_ids); + goto out; + } + + if (!have_acl && (error = zfs_acl_ids_create(dzp, 0, vap, + cr, vsecp, &acl_ids)) != 0) + goto out; + have_acl = B_TRUE; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) + projid = zfs_inherit_projid(dzp); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, projid)) { + zfs_acl_ids_free(&acl_ids); + error = SET_ERROR(EDQUOT); + goto out; + } + + tx = dmu_tx_create(os); + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && + acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, + 0, acl_ids.z_aclp->z_acl_bytes); + } + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + zfs_mknode(dzp, vap, tx, cr, IS_TMPFILE, &zp, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + /* Add to unlinked set */ + zp->z_unlinked = B_TRUE; + zfs_unlinked_add(zp, tx); + zfs_acl_ids_free(&acl_ids); + dmu_tx_commit(tx); +out: + + if (error) { + if (zp) + zrele(zp); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + *ipp = ZTOI(zp); + } + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove an entry from a directory. + * + * IN: dzp - znode of directory to remove entry from. + * name - name of entry to remove. + * cr - credentials of caller. + * flags - case flags. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime + * ip - ctime (if nlink > 0) + */ + +uint64_t null_xattr = 0; + +/*ARGSUSED*/ +int +zfs_remove(znode_t *dzp, char *name, cred_t *cr, int flags) +{ + znode_t *zp; + znode_t *xzp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + uint64_t acl_obj, xattr_obj; + uint64_t xattr_obj_unlinked = 0; + uint64_t obj = 0; + uint64_t links; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + boolean_t may_delete_now, delete_now = FALSE; + boolean_t unlinked, toobig = FALSE; + uint64_t txtype; + pathname_t *realnmp = NULL; + pathname_t realnm; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) { + zflg |= ZCILOOK; + pn_alloc(&realnm); + realnmp = &realnm; + } + +top: + xattr_obj = 0; + xzp = NULL; + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, realnmp))) { + if (realnmp) + pn_free(realnmp); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + /* + * Need to use rmdir for removing directories. + */ + if (S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(EPERM); + goto out; + } + + mutex_enter(&zp->z_lock); + may_delete_now = atomic_read(&ZTOI(zp)->i_count) == 1 && + !(zp->z_is_mapped); + mutex_exit(&zp->z_lock); + + /* + * We may delete the znode now, or we may put it in the unlinked set; + * it depends on whether we're the last link, and on whether there are + * other holds on the inode. So we dmu_tx_hold() the right things to + * allow for either case. + */ + obj = zp->z_id; + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + if (may_delete_now) { + toobig = zp->z_size > zp->z_blksz * zfs_delete_blocks; + /* if the file is too big, only hold_free a token amount */ + dmu_tx_hold_free(tx, zp->z_id, 0, + (toobig ? DMU_MAX_ACCESS : DMU_OBJECT_END)); + } + + /* are there any extended attributes? */ + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj, sizeof (xattr_obj)); + if (error == 0 && xattr_obj) { + error = zfs_zget(zfsvfs, xattr_obj, &xzp); + ASSERT0(error); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + dmu_tx_hold_sa(tx, xzp->z_sa_hdl, B_FALSE); + } + + mutex_enter(&zp->z_lock); + if ((acl_obj = zfs_external_acl(zp)) != 0 && may_delete_now) + dmu_tx_hold_free(tx, acl_obj, 0, DMU_OBJECT_END); + mutex_exit(&zp->z_lock); + + /* charge as an update -- would be nice not to charge at all */ + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + + /* + * Mark this transaction as typically resulting in a net free of space + */ + dmu_tx_mark_netfree(tx); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + goto top; + } + if (realnmp) + pn_free(realnmp); + dmu_tx_abort(tx); + zrele(zp); + if (xzp) + zrele(xzp); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Remove the directory entry. 
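+ * (zfs_link_destroy() reports through 'unlinked' whether the last
+ * link went away, which drives the delete-now versus unlinked-set
+ * handling below.)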
+ */ + error = zfs_link_destroy(dl, zp, tx, zflg, &unlinked); + + if (error) { + dmu_tx_commit(tx); + goto out; + } + + if (unlinked) { + /* + * Hold z_lock so that we can make sure that the ACL obj + * hasn't changed. Could have been deleted due to + * zfs_sa_upgrade(). + */ + mutex_enter(&zp->z_lock); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs), + &xattr_obj_unlinked, sizeof (xattr_obj_unlinked)); + delete_now = may_delete_now && !toobig && + atomic_read(&ZTOI(zp)->i_count) == 1 && + !(zp->z_is_mapped) && xattr_obj == xattr_obj_unlinked && + zfs_external_acl(zp) == acl_obj; + } + + if (delete_now) { + if (xattr_obj_unlinked) { + ASSERT3U(ZTOI(xzp)->i_nlink, ==, 2); + mutex_enter(&xzp->z_lock); + xzp->z_unlinked = B_TRUE; + clear_nlink(ZTOI(xzp)); + links = 0; + error = sa_update(xzp->z_sa_hdl, SA_ZPL_LINKS(zfsvfs), + &links, sizeof (links), tx); + ASSERT3U(error, ==, 0); + mutex_exit(&xzp->z_lock); + zfs_unlinked_add(xzp, tx); + + if (zp->z_is_sa) + error = sa_remove(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), tx); + else + error = sa_update(zp->z_sa_hdl, + SA_ZPL_XATTR(zfsvfs), &null_xattr, + sizeof (uint64_t), tx); + ASSERT0(error); + } + /* + * Add to the unlinked set because a new reference could be + * taken concurrently resulting in a deferred destruction. + */ + zfs_unlinked_add(zp, tx); + mutex_exit(&zp->z_lock); + } else if (unlinked) { + mutex_exit(&zp->z_lock); + zfs_unlinked_add(zp, tx); + } + + txtype = TX_REMOVE; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, obj, unlinked); + + dmu_tx_commit(tx); +out: + if (realnmp) + pn_free(realnmp); + + zfs_dirent_unlock(dl); + zfs_inode_update(dzp); + zfs_inode_update(zp); + + if (delete_now) + zrele(zp); + else + zfs_zrele_async(zp); + + if (xzp) { + zfs_inode_update(xzp); + zfs_zrele_async(xzp); + } + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Create a new directory and insert it into dzp using the name + * provided. Return a pointer to the inserted directory. + * + * IN: dzp - znode of directory to add subdir to. + * dirname - name of new directory. + * vap - attributes of new directory. + * cr - credentials of caller. + * flags - case flags. + * vsecp - ACL to be set + * + * OUT: zpp - znode of created directory. 
+ * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * dzp - ctime|mtime updated + * zpp - ctime|mtime|atime updated + */ +/*ARGSUSED*/ +int +zfs_mkdir(znode_t *dzp, char *dirname, vattr_t *vap, znode_t **zpp, + cred_t *cr, int flags, vsecattr_t *vsecp) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + uint64_t txtype; + dmu_tx_t *tx; + int error; + int zf = ZNEW; + uid_t uid; + gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; + boolean_t waited = B_FALSE; + + ASSERT(S_ISDIR(vap->va_mode)); + + /* + * If we have an ephemeral id, ACL, or XVATTR then + * make sure file system is at proper version + */ + + uid = crgetuid(cr); + if (zfsvfs->z_use_fuids == B_FALSE && + (vsecp || IS_EPHEMERAL(uid) || IS_EPHEMERAL(gid))) + return (SET_ERROR(EINVAL)); + + if (dirname == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (dzp->z_pflags & ZFS_XATTR) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (zfsvfs->z_utf8 && u8_validate(dirname, + strlen(dirname), NULL, U8_VALIDATE_ENTIRE, &error) < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EILSEQ)); + } + if (flags & FIGNORECASE) + zf |= ZCILOOK; + + if (vap->va_mask & ATTR_XVATTR) { + if ((error = secpolicy_xvattr((xvattr_t *)vap, + crgetuid(cr), cr, vap->va_mode)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + } + + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, + vsecp, &acl_ids)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* + * First make sure the new directory doesn't exist. + * + * Existence is checked first to make sure we don't return + * EACCES instead of EEXIST which can cause some applications + * to fail. + */ +top: + *zpp = NULL; + + if ((error = zfs_dirent_lock(&dl, dzp, dirname, &zp, zf, + NULL, NULL))) { + zfs_acl_ids_free(&acl_ids); + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess(dzp, ACE_ADD_SUBDIRECTORY, 0, B_FALSE, cr))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, zfs_inherit_projid(dzp))) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EDQUOT)); + } + + /* + * Add a new entry to the directory. + */ + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + acl_ids.z_aclp->z_acl_bytes); + } + + dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes + + ZFS_SA_BASE_ATTR_SIZE); + + error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + zfs_acl_ids_free(&acl_ids); + dmu_tx_abort(tx); + ZFS_EXIT(zfsvfs); + return (error); + } + + /* + * Create new node. + */ + zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids); + + /* + * Now put new name in parent dir. 
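+ * (If linking fails, the just-created znode is deleted again below
+ * so no orphan directory object is left behind.)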
+ */ + error = zfs_link_create(dl, zp, tx, ZNEW); + if (error != 0) { + zfs_znode_delete(zp, tx); + remove_inode_hash(ZTOI(zp)); + goto out; + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + *zpp = zp; + + txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); + +out: + zfs_acl_ids_free(&acl_ids); + + dmu_tx_commit(tx); + + zfs_dirent_unlock(dl); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + if (error != 0) { + zrele(zp); + } else { + zfs_inode_update(dzp); + zfs_inode_update(zp); + } + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Remove a directory subdir entry. If the current working + * directory is the same as the subdir to be removed, the + * remove will fail. + * + * IN: dzp - znode of directory to remove from. + * name - name of directory to be removed. + * cwd - inode of current working directory. + * cr - credentials of caller. + * flags - case flags + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * dzp - ctime|mtime updated + */ +/*ARGSUSED*/ +int +zfs_rmdir(znode_t *dzp, char *name, znode_t *cwd, cred_t *cr, + int flags) +{ + znode_t *zp; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + zilog_t *zilog; + zfs_dirlock_t *dl; + dmu_tx_t *tx; + int error; + int zflg = ZEXISTS; + boolean_t waited = B_FALSE; + + if (name == NULL) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(dzp); + zilog = zfsvfs->z_log; + + if (flags & FIGNORECASE) + zflg |= ZCILOOK; +top: + zp = NULL; + + /* + * Attempt to lock directory; fail if entry doesn't exist. + */ + if ((error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, + NULL, NULL))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + if ((error = zfs_zaccess_delete(dzp, zp, cr))) { + goto out; + } + + if (!S_ISDIR(ZTOI(zp)->i_mode)) { + error = SET_ERROR(ENOTDIR); + goto out; + } + + if (zp == cwd) { + error = SET_ERROR(EINVAL); + goto out; + } + + /* + * Grab a lock on the directory to make sure that no one is + * trying to add (or lookup) entries while we are removing it. + */ + rw_enter(&zp->z_name_lock, RW_WRITER); + + /* + * Grab a lock on the parent pointer to make sure we play well + * with the treewalk and directory rename code. + */ + rw_enter(&zp->z_parent_lock, RW_WRITER); + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); + zfs_sa_upgrade_txholds(tx, zp); + zfs_sa_upgrade_txholds(tx, dzp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, (waited ? 
TXG_NOTHROTTLE : 0) | TXG_NOWAIT); + if (error) { + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); + zfs_dirent_unlock(dl); + if (error == ERESTART) { + waited = B_TRUE; + dmu_tx_wait(tx); + dmu_tx_abort(tx); + zrele(zp); + goto top; + } + dmu_tx_abort(tx); + zrele(zp); + ZFS_EXIT(zfsvfs); + return (error); + } + + error = zfs_link_destroy(dl, zp, tx, zflg, NULL); + + if (error == 0) { + uint64_t txtype = TX_RMDIR; + if (flags & FIGNORECASE) + txtype |= TX_CI; + zfs_log_remove(zilog, tx, txtype, dzp, name, ZFS_NO_OBJECT, + B_FALSE); + } + + dmu_tx_commit(tx); + + rw_exit(&zp->z_parent_lock); + rw_exit(&zp->z_name_lock); +out: + zfs_dirent_unlock(dl); + + zfs_inode_update(dzp); + zfs_inode_update(zp); + zrele(zp); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/* + * Read directory entries from the given directory cursor position and emit + * name and position for each entry. + * + * IN: ip - inode of directory to read. + * ctx - directory entry context. + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - atime updated + * + * Note that the low 4 bits of the cookie returned by zap is always zero. + * This allows us to use the low range for "special" directory entries: + * We use 0 for '.', and 1 for '..'. If this is the root of the filesystem, + * we use the offset 2 for the '.zfs' directory. + */ +/* ARGSUSED */ +int +zfs_readdir(struct inode *ip, zpl_dir_context_t *ctx, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + zap_cursor_t zc; + zap_attribute_t zap; + int error; + uint8_t prefetch; + uint8_t type; + int done = 0; + uint64_t parent; + uint64_t offset; /* must be unsigned; checks for < 1 */ + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs), + &parent, sizeof (parent))) != 0) + goto out; + + /* + * Quit if directory has been removed (posix) + */ + if (zp->z_unlinked) + goto out; + + error = 0; + os = zfsvfs->z_os; + offset = ctx->pos; + prefetch = zp->z_zn_prefetch; + + /* + * Initialize the iterator cursor. + */ + if (offset <= 3) { + /* + * Start iteration from the beginning of the directory. + */ + zap_cursor_init(&zc, os, zp->z_id); + } else { + /* + * The offset is a serialized cursor. + */ + zap_cursor_init_serialized(&zc, os, zp->z_id, offset); + } + + /* + * Transform to file-system independent format + */ + while (!done) { + uint64_t objnum; + /* + * Special case `.', `..', and `.zfs'. + */ + if (offset == 0) { + (void) strcpy(zap.za_name, "."); + zap.za_normalization_conflict = 0; + objnum = zp->z_id; + type = DT_DIR; + } else if (offset == 1) { + (void) strcpy(zap.za_name, ".."); + zap.za_normalization_conflict = 0; + objnum = parent; + type = DT_DIR; + } else if (offset == 2 && zfs_show_ctldir(zp)) { + (void) strcpy(zap.za_name, ZFS_CTLDIR_NAME); + zap.za_normalization_conflict = 0; + objnum = ZFSCTL_INO_ROOT; + type = DT_DIR; + } else { + /* + * Grab next entry. + */ + if ((error = zap_cursor_retrieve(&zc, &zap))) { + if (error == ENOENT) + break; + else + goto update; + } + + /* + * Allow multiple entries provided the first entry is + * the object id. Non-zpl consumers may safely make + * use of the additional space. 
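+ * (Only za_first_integer is interpreted here, via ZFS_DIRENT_OBJ()
+ * and ZFS_DIRENT_TYPE().)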
+ * + * XXX: This should be a feature flag for compatibility + */ + if (zap.za_integer_length != 8 || + zap.za_num_integers == 0) { + cmn_err(CE_WARN, "zap_readdir: bad directory " + "entry, obj = %lld, offset = %lld, " + "length = %d, num = %lld\n", + (u_longlong_t)zp->z_id, + (u_longlong_t)offset, + zap.za_integer_length, + (u_longlong_t)zap.za_num_integers); + error = SET_ERROR(ENXIO); + goto update; + } + + objnum = ZFS_DIRENT_OBJ(zap.za_first_integer); + type = ZFS_DIRENT_TYPE(zap.za_first_integer); + } + + done = !zpl_dir_emit(ctx, zap.za_name, strlen(zap.za_name), + objnum, type); + if (done) + break; + + /* Prefetch znode */ + if (prefetch) { + dmu_prefetch(os, objnum, 0, 0, 0, + ZIO_PRIORITY_SYNC_READ); + } + + /* + * Move to the next entry, fill in the previous offset. + */ + if (offset > 2 || (offset == 2 && !zfs_show_ctldir(zp))) { + zap_cursor_advance(&zc); + offset = zap_cursor_serialize(&zc); + } else { + offset += 1; + } + ctx->pos = offset; + } + zp->z_zn_prefetch = B_FALSE; /* a lookup will re-enable pre-fetching */ + +update: + zap_cursor_fini(&zc); + if (error == ENOENT) + error = 0; +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +ulong_t zfs_fsync_sync_cnt = 4; + +int +zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + (void) tsd_set(zfs_fsyncer_key, (void *)zfs_fsync_sync_cnt); + + if (zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + } + tsd_set(zfs_fsyncer_key, NULL); + + return (0); +} + +/* + * Get the basic file attributes and place them in the provided kstat + * structure. The inode is assumed to be the authoritative source + * for most of the attributes. However, the znode currently has the + * authoritative atime, blksize, and block count. + * + * IN: ip - inode of file. + * + * OUT: sp - kstat values. + * + * RETURN: 0 (always succeeds) + */ +/* ARGSUSED */ +int +zfs_getattr_fast(struct inode *ip, struct kstat *sp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t blksize; + u_longlong_t nblocks; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + mutex_enter(&zp->z_lock); + + generic_fillattr(ip, sp); + /* + * +1 link count for root inode with visible '.zfs' directory. + */ + if ((zp->z_id == zfsvfs->z_root) && zfs_show_ctldir(zp)) + if (sp->nlink < ZFS_LINK_MAX) + sp->nlink++; + + sa_object_size(zp->z_sa_hdl, &blksize, &nblocks); + sp->blksize = blksize; + sp->blocks = nblocks; + + if (unlikely(zp->z_blksz == 0)) { + /* + * Block size hasn't been set; suggest maximal I/O transfers. + */ + sp->blksize = zfsvfs->z_max_blksz; + } + + mutex_exit(&zp->z_lock); + + /* + * Required to prevent NFS client from detecting different inode + * numbers of snapshot root dentry before and after snapshot mount. + */ + if (zfsvfs->z_issnap) { + if (ip->i_sb->s_root->d_inode == ip) + sp->ino = ZFSCTL_INO_SNAPDIRS - + dmu_objset_id(zfsvfs->z_os); + } + + ZFS_EXIT(zfsvfs); + + return (0); +} + +/* + * For the operation of changing file's user/group/project, we need to + * handle not only the main object that is assigned to the file directly, + * but also the ones that are used by the file via hidden xattr directory. + * + * Because the xattr directory may contains many EA entries, as to it may + * be impossible to change all of them via the transaction of changing the + * main object's user/group/project attributes. Then we have to change them + * via other multiple independent transactions one by one. 
It may be not good + * solution, but we have no better idea yet. + */ +static int +zfs_setattr_dir(znode_t *dzp) +{ + struct inode *dxip = ZTOI(dzp); + struct inode *xip = NULL; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + objset_t *os = zfsvfs->z_os; + zap_cursor_t zc; + zap_attribute_t zap; + zfs_dirlock_t *dl; + znode_t *zp; + dmu_tx_t *tx = NULL; + uint64_t uid, gid; + sa_bulk_attr_t bulk[4]; + int count; + int err; + + zap_cursor_init(&zc, os, dzp->z_id); + while ((err = zap_cursor_retrieve(&zc, &zap)) == 0) { + count = 0; + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + err = ENXIO; + break; + } + + err = zfs_dirent_lock(&dl, dzp, (char *)zap.za_name, &zp, + ZEXISTS, NULL, NULL); + if (err == ENOENT) + goto next; + if (err) + break; + + xip = ZTOI(zp); + if (KUID_TO_SUID(xip->i_uid) == KUID_TO_SUID(dxip->i_uid) && + KGID_TO_SGID(xip->i_gid) == KGID_TO_SGID(dxip->i_gid) && + zp->z_projid == dzp->z_projid) + goto next; + + tx = dmu_tx_create(os); + if (!(zp->z_pflags & ZFS_PROJID)) + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + else + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) + break; + + mutex_enter(&dzp->z_lock); + + if (KUID_TO_SUID(xip->i_uid) != KUID_TO_SUID(dxip->i_uid)) { + xip->i_uid = dxip->i_uid; + uid = zfs_uid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, sizeof (uid)); + } + + if (KGID_TO_SGID(xip->i_gid) != KGID_TO_SGID(dxip->i_gid)) { + xip->i_gid = dxip->i_gid; + gid = zfs_gid_read(dxip); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, sizeof (gid)); + } + + if (zp->z_projid != dzp->z_projid) { + if (!(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + SA_ADD_BULK_ATTR(bulk, count, + SA_ZPL_FLAGS(zfsvfs), NULL, &zp->z_pflags, + sizeof (zp->z_pflags)); + } + + zp->z_projid = dzp->z_projid; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PROJID(zfsvfs), + NULL, &zp->z_projid, sizeof (zp->z_projid)); + } + + mutex_exit(&dzp->z_lock); + + if (likely(count > 0)) { + err = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + } else { + dmu_tx_abort(tx); + } + tx = NULL; + if (err != 0 && err != ENOENT) + break; + +next: + if (zp) { + zrele(zp); + zp = NULL; + zfs_dirent_unlock(dl); + } + zap_cursor_advance(&zc); + } + + if (tx) + dmu_tx_abort(tx); + if (zp) { + zrele(zp); + zfs_dirent_unlock(dl); + } + zap_cursor_fini(&zc); + + return (err == ENOENT ? 0 : err); +} + +/* + * Set the file attributes to the values contained in the + * vattr structure. + * + * IN: zp - znode of file to be modified. + * vap - new attribute values. + * If ATTR_XVATTR set, then optional attrs are being set + * flags - ATTR_UTIME set if non-default time values provided. + * - ATTR_NOACLCHECK (CIFS context only). + * cr - credentials of caller. + * + * RETURN: 0 if success + * error code if failure + * + * Timestamps: + * ip - ctime updated, mtime updated if size changed. 
+ */ +/* ARGSUSED */ +int +zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr) +{ + struct inode *ip; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + zilog_t *zilog; + dmu_tx_t *tx; + vattr_t oldva; + xvattr_t *tmpxvattr; + uint_t mask = vap->va_mask; + uint_t saved_mask = 0; + int trim_mask = 0; + uint64_t new_mode; + uint64_t new_kuid = 0, new_kgid = 0, new_uid, new_gid; + uint64_t xattr_obj; + uint64_t mtime[2], ctime[2], atime[2]; + uint64_t projid = ZFS_INVALID_PROJID; + znode_t *attrzp; + int need_policy = FALSE; + int err, err2 = 0; + zfs_fuid_info_t *fuidp = NULL; + xvattr_t *xvap = (xvattr_t *)vap; /* vap may be an xvattr_t * */ + xoptattr_t *xoap; + zfs_acl_t *aclp; + boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; + boolean_t handle_eadir = B_FALSE; + sa_bulk_attr_t *bulk, *xattr_bulk; + int count = 0, xattr_count = 0, bulks = 8; + + if (mask == 0) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + ip = ZTOI(zp); + + /* + * If this is a xvattr_t, then get a pointer to the structure of + * optional attributes. If this is NULL, then we have a vattr_t. + */ + xoap = xva_getxoptattr(xvap); + if (xoap != NULL && (mask & ATTR_XVATTR)) { + if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + if (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + + projid = xoap->xoa_projid; + if (unlikely(projid == ZFS_INVALID_PROJID)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (projid == zp->z_projid && zp->z_pflags & ZFS_PROJID) + projid = ZFS_INVALID_PROJID; + else + need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT) && + (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) && + (!dmu_objset_projectquota_enabled(os) || + (!S_ISREG(ip->i_mode) && !S_ISDIR(ip->i_mode)))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENOTSUP)); + } + } + + zilog = zfsvfs->z_log; + + /* + * Make sure that if we have ephemeral uid/gid or xvattr specified + * that file system is at proper version level + */ + + if (zfsvfs->z_use_fuids == B_FALSE && + (((mask & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) || + ((mask & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) || + (mask & ATTR_XVATTR))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + if (mask & ATTR_SIZE && S_ISDIR(ip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EISDIR)); + } + + if (mask & ATTR_SIZE && !S_ISREG(ip->i_mode) && !S_ISFIFO(ip->i_mode)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + tmpxvattr = kmem_alloc(sizeof (xvattr_t), KM_SLEEP); + xva_init(tmpxvattr); + + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + xattr_bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * bulks, KM_SLEEP); + + /* + * Immutable files can only alter immutable bit and atime + */ + if ((zp->z_pflags & ZFS_IMMUTABLE) && + ((mask & (ATTR_SIZE|ATTR_UID|ATTR_GID|ATTR_MTIME|ATTR_MODE)) || + ((mask & ATTR_XVATTR) && XVA_ISSET_REQ(xvap, XAT_CREATETIME)))) { + err = SET_ERROR(EPERM); + goto out3; + } + + if ((mask & ATTR_SIZE) && (zp->z_pflags & ZFS_READONLY)) { + err = SET_ERROR(EPERM); + goto out3; + } + + /* + * Verify timestamps doesn't overflow 32 bits. + * ZFS can handle large timestamps, but 32bit syscalls can't + * handle times greater than 2039. This check should be removed + * once large timestamps are fully supported. 
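+ * (TIMESPEC_OVERFLOW() below rejects any time value outside the
+ * representable range described above.)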
+ */ + if (mask & (ATTR_ATIME | ATTR_MTIME)) { + if (((mask & ATTR_ATIME) && + TIMESPEC_OVERFLOW(&vap->va_atime)) || + ((mask & ATTR_MTIME) && + TIMESPEC_OVERFLOW(&vap->va_mtime))) { + err = SET_ERROR(EOVERFLOW); + goto out3; + } + } + +top: + attrzp = NULL; + aclp = NULL; + + /* Can this be moved to before the top label? */ + if (zfs_is_readonly(zfsvfs)) { + err = SET_ERROR(EROFS); + goto out3; + } + + /* + * First validate permissions + */ + + if (mask & ATTR_SIZE) { + err = zfs_zaccess(zp, ACE_WRITE_DATA, 0, skipaclchk, cr); + if (err) + goto out3; + + /* + * XXX - Note, we are not providing any open + * mode flags here (like FNDELAY), so we may + * block if there are locks present... this + * should be addressed in openat(). + */ + /* XXX - would it be OK to generate a log record here? */ + err = zfs_freesp(zp, vap->va_size, 0, 0, FALSE); + if (err) + goto out3; + } + + if (mask & (ATTR_ATIME|ATTR_MTIME) || + ((mask & ATTR_XVATTR) && (XVA_ISSET_REQ(xvap, XAT_HIDDEN) || + XVA_ISSET_REQ(xvap, XAT_READONLY) || + XVA_ISSET_REQ(xvap, XAT_ARCHIVE) || + XVA_ISSET_REQ(xvap, XAT_OFFLINE) || + XVA_ISSET_REQ(xvap, XAT_SPARSE) || + XVA_ISSET_REQ(xvap, XAT_CREATETIME) || + XVA_ISSET_REQ(xvap, XAT_SYSTEM)))) { + need_policy = zfs_zaccess(zp, ACE_WRITE_ATTRIBUTES, 0, + skipaclchk, cr); + } + + if (mask & (ATTR_UID|ATTR_GID)) { + int idmask = (mask & (ATTR_UID|ATTR_GID)); + int take_owner; + int take_group; + + /* + * NOTE: even if a new mode is being set, + * we may clear S_ISUID/S_ISGID bits. + */ + + if (!(mask & ATTR_MODE)) + vap->va_mode = zp->z_mode; + + /* + * Take ownership or chgrp to group we are a member of + */ + + take_owner = (mask & ATTR_UID) && (vap->va_uid == crgetuid(cr)); + take_group = (mask & ATTR_GID) && + zfs_groupmember(zfsvfs, vap->va_gid, cr); + + /* + * If both ATTR_UID and ATTR_GID are set then take_owner and + * take_group must both be set in order to allow taking + * ownership. + * + * Otherwise, send the check through secpolicy_vnode_setattr() + * + */ + + if (((idmask == (ATTR_UID|ATTR_GID)) && + take_owner && take_group) || + ((idmask == ATTR_UID) && take_owner) || + ((idmask == ATTR_GID) && take_group)) { + if (zfs_zaccess(zp, ACE_WRITE_OWNER, 0, + skipaclchk, cr) == 0) { + /* + * Remove setuid/setgid for non-privileged users + */ + (void) secpolicy_setid_clear(vap, cr); + trim_mask = (mask & (ATTR_UID|ATTR_GID)); + } else { + need_policy = TRUE; + } + } else { + need_policy = TRUE; + } + } + + mutex_enter(&zp->z_lock); + oldva.va_mode = zp->z_mode; + zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); + if (mask & ATTR_XVATTR) { + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
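+ * (tmpxvattr records every request bit cleared here so the bits
+ * can be restored once the attributes have been applied.)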
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((zp->z_pflags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(tmpxvattr, XAT_APPENDONLY); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) { + if (xoap->xoa_projinherit != + ((zp->z_pflags & ZFS_PROJINHERIT) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_PROJINHERIT); + XVA_SET_REQ(tmpxvattr, XAT_PROJINHERIT); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((zp->z_pflags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(tmpxvattr, XAT_NOUNLINK); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((zp->z_pflags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((zp->z_pflags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((zp->z_pflags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((!S_ISREG(ip->i_mode) && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((zp->z_pflags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + err = SET_ERROR(EPERM); + goto out3; + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & ATTR_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(ip, vap, + &oldva, cr); + if (err) + goto out3; + + trim_mask |= ATTR_MODE; + } else { + need_policy = TRUE; + } + } + + if (need_policy) { + /* + * If trim_mask is set then take ownership + * has been granted or write_acl is present and user + * has the ability to modify mode. In that case remove + * UID|GID and or MODE from mask so that + * secpolicy_vnode_setattr() doesn't revoke it. 
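+ * (The full mask is saved in saved_mask and restored once the
+ * policy check has passed.)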
+ */
+
+ if (trim_mask) {
+ saved_mask = vap->va_mask;
+ vap->va_mask &= ~trim_mask;
+ }
+ err = secpolicy_vnode_setattr(cr, ip, vap, &oldva, flags,
+ (int (*)(void *, int, cred_t *))zfs_zaccess_unix, zp);
+ if (err)
+ goto out3;
+
+ if (trim_mask)
+ vap->va_mask |= saved_mask;
+ }
+
+ /*
+ * secpolicy_vnode_setattr() or taking ownership may have
+ * changed va_mask.
+ */
+ mask = vap->va_mask;
+
+ if ((mask & (ATTR_UID | ATTR_GID)) || projid != ZFS_INVALID_PROJID) {
+ handle_eadir = B_TRUE;
+ err = sa_lookup(zp->z_sa_hdl, SA_ZPL_XATTR(zfsvfs),
+ &xattr_obj, sizeof (xattr_obj));
+
+ if (err == 0 && xattr_obj) {
+ err = zfs_zget(ZTOZSB(zp), xattr_obj, &attrzp);
+ if (err)
+ goto out2;
+ }
+ if (mask & ATTR_UID) {
+ new_kuid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp);
+ if (new_kuid != KUID_TO_SUID(ZTOI(zp)->i_uid) &&
+ zfs_id_overquota(zfsvfs, DMU_USERUSED_OBJECT,
+ new_kuid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ new_kgid = zfs_fuid_create(zfsvfs,
+ (uint64_t)vap->va_gid, cr, ZFS_GROUP, &fuidp);
+ if (new_kgid != KGID_TO_SGID(ZTOI(zp)->i_gid) &&
+ zfs_id_overquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ new_kgid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = SET_ERROR(EDQUOT);
+ goto out2;
+ }
+ }
+
+ if (projid != ZFS_INVALID_PROJID &&
+ zfs_id_overquota(zfsvfs, DMU_PROJECTUSED_OBJECT, projid)) {
+ if (attrzp)
+ zrele(attrzp);
+ err = EDQUOT;
+ goto out2;
+ }
+ }
+ tx = dmu_tx_create(os);
+
+ if (mask & ATTR_MODE) {
+ uint64_t pmode = zp->z_mode;
+ uint64_t acl_obj;
+ new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT);
+
+ if (ZTOZSB(zp)->z_acl_mode == ZFS_ACL_RESTRICTED &&
+ !(zp->z_pflags & ZFS_ACL_TRIVIAL)) {
+ err = EPERM;
+ goto out;
+ }
+
+ if ((err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)))
+ goto out;
+
+ mutex_enter(&zp->z_lock);
+ if (!zp->z_is_sa && ((acl_obj = zfs_external_acl(zp)) != 0)) {
+ /*
+ * Are we upgrading ACL from old V0 format
+ * to V1 format?
+ */
+ if (zfsvfs->z_version >= ZPL_VERSION_FUID &&
+ zfs_znode_acl_version(zp) ==
+ ZFS_ACL_VERSION_INITIAL) {
+ dmu_tx_hold_free(tx, acl_obj, 0,
+ DMU_OBJECT_END);
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ } else {
+ dmu_tx_hold_write(tx, acl_obj, 0,
+ aclp->z_acl_bytes);
+ }
+ } else if (!zp->z_is_sa && aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT,
+ 0, aclp->z_acl_bytes);
+ }
+ mutex_exit(&zp->z_lock);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ } else {
+ if (((mask & ATTR_XVATTR) &&
+ XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) ||
+ (projid != ZFS_INVALID_PROJID &&
+ !(zp->z_pflags & ZFS_PROJID)))
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE);
+ else
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ }
+
+ if (attrzp) {
+ dmu_tx_hold_sa(tx, attrzp->z_sa_hdl, B_FALSE);
+ }
+
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_WAIT);
+ if (err)
+ goto out;
+
+ count = 0;
+ /*
+ * Set each attribute requested.
+ * We group settings according to the locks they need to acquire.
+ *
+ * Note: you cannot set ctime directly, although it will be
+ * updated as a side-effect of calling this function.
+ */
+
+ if (projid != ZFS_INVALID_PROJID && !(zp->z_pflags & ZFS_PROJID)) {
+ /*
+ * For an existing object that was upgraded from an old
+ * system, the on-disk layout has no slot for the project
+ * ID attribute.
+ * But the quota accounting logic needs to access the related
+ * slots directly by offset, so we need to adjust such an old
+ * object's layout to place the project ID at a unified, fixed
+ * offset.
+ */
+ if (attrzp)
+ err = sa_add_projid(attrzp->z_sa_hdl, tx, projid);
+ if (err == 0)
+ err = sa_add_projid(zp->z_sa_hdl, tx, projid);
+
+ if (unlikely(err == EEXIST))
+ err = 0;
+ else if (err != 0)
+ goto out;
+ else
+ projid = ZFS_INVALID_PROJID;
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&zp->z_acl_lock);
+ mutex_enter(&zp->z_lock);
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, sizeof (zp->z_pflags));
+
+ if (attrzp) {
+ if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE))
+ mutex_enter(&attrzp->z_acl_lock);
+ mutex_enter(&attrzp->z_lock);
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_FLAGS(zfsvfs), NULL, &attrzp->z_pflags,
+ sizeof (attrzp->z_pflags));
+ if (projid != ZFS_INVALID_PROJID) {
+ attrzp->z_projid = projid;
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &attrzp->z_projid,
+ sizeof (attrzp->z_projid));
+ }
+ }
+
+ if (mask & (ATTR_UID|ATTR_GID)) {
+
+ if (mask & ATTR_UID) {
+ ZTOI(zp)->i_uid = SUID_TO_KUID(new_kuid);
+ new_uid = zfs_uid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL,
+ &new_uid, sizeof (new_uid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_UID(zfsvfs), NULL, &new_uid,
+ sizeof (new_uid));
+ ZTOI(attrzp)->i_uid = SUID_TO_KUID(new_uid);
+ }
+ }
+
+ if (mask & ATTR_GID) {
+ ZTOI(zp)->i_gid = SGID_TO_KGID(new_kgid);
+ new_gid = zfs_gid_read(ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs),
+ NULL, &new_gid, sizeof (new_gid));
+ if (attrzp) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_GID(zfsvfs), NULL, &new_gid,
+ sizeof (new_gid));
+ ZTOI(attrzp)->i_gid = SGID_TO_KGID(new_kgid);
+ }
+ }
+ if (!(mask & ATTR_MODE)) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs),
+ NULL, &new_mode, sizeof (new_mode));
+ new_mode = zp->z_mode;
+ }
+ err = zfs_acl_chown_setattr(zp);
+ ASSERT(err == 0);
+ if (attrzp) {
+ err = zfs_acl_chown_setattr(attrzp);
+ ASSERT(err == 0);
+ }
+ }
+
+ if (mask & ATTR_MODE) {
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL,
+ &new_mode, sizeof (new_mode));
+ zp->z_mode = ZTOI(zp)->i_mode = new_mode;
+ ASSERT3P(aclp, !=, NULL);
+ err = zfs_aclset_common(zp, aclp, cr, tx);
+ ASSERT0(err);
+ if (zp->z_acl_cached)
+ zfs_acl_free(zp->z_acl_cached);
+ zp->z_acl_cached = aclp;
+ aclp = NULL;
+ }
+
+ if ((mask & ATTR_ATIME) || zp->z_atime_dirty) {
+ zp->z_atime_dirty = B_FALSE;
+ ZFS_TIME_ENCODE(&ip->i_atime, atime);
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL,
+ &atime, sizeof (atime));
+ }
+
+ if (mask & (ATTR_MTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_mtime, mtime);
+ ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate(
+ vap->va_mtime, ZTOI(zp));
+
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL,
+ mtime, sizeof (mtime));
+ }
+
+ if (mask & (ATTR_CTIME | ATTR_SIZE)) {
+ ZFS_TIME_ENCODE(&vap->va_ctime, ctime);
+ ZTOI(zp)->i_ctime = zpl_inode_timestamp_truncate(vap->va_ctime,
+ ZTOI(zp));
+ SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL,
+ ctime, sizeof (ctime));
+ }
+
+ if (projid != ZFS_INVALID_PROJID) {
+ zp->z_projid = projid;
+ SA_ADD_BULK_ATTR(bulk, count,
+ SA_ZPL_PROJID(zfsvfs), NULL, &zp->z_projid,
+ sizeof (zp->z_projid));
+ }
+
+ if (attrzp && mask) {
+ SA_ADD_BULK_ATTR(xattr_bulk, xattr_count,
+ SA_ZPL_CTIME(zfsvfs), NULL, &ctime,
+ sizeof (ctime));
+
} + + /* + * Do this after setting timestamps to prevent timestamp + * update from toggling bit + */ + + if (xoap && (mask & ATTR_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. + */ + + if (XVA_ISSET_REQ(tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(tmpxvattr, XAT_PROJINHERIT)) { + XVA_SET_REQ(xvap, XAT_PROJINHERIT); + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) + ASSERT(S_ISREG(ip->i_mode)); + + zfs_xvattr_set(zp, xvap, tx); + } + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + if (mask != 0) + zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); + + mutex_exit(&zp->z_lock); + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&zp->z_acl_lock); + + if (attrzp) { + if (mask & (ATTR_UID|ATTR_GID|ATTR_MODE)) + mutex_exit(&attrzp->z_acl_lock); + mutex_exit(&attrzp->z_lock); + } +out: + if (err == 0 && xattr_count > 0) { + err2 = sa_bulk_update(attrzp->z_sa_hdl, xattr_bulk, + xattr_count, tx); + ASSERT(err2 == 0); + } + + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) { + dmu_tx_abort(tx); + if (attrzp) + zrele(attrzp); + if (err == ERESTART) + goto top; + } else { + if (count > 0) + err2 = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + dmu_tx_commit(tx); + if (attrzp) { + if (err2 == 0 && handle_eadir) + err2 = zfs_setattr_dir(attrzp); + zrele(attrzp); + } + zfs_inode_update(zp); + } + +out2: + if (os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + +out3: + kmem_free(xattr_bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * bulks); + kmem_free(tmpxvattr, sizeof (xvattr_t)); + ZFS_EXIT(zfsvfs); + return (err); +} + +typedef struct zfs_zlock { + krwlock_t *zl_rwlock; /* lock we acquired */ + znode_t *zl_znode; /* znode we held */ + struct zfs_zlock *zl_next; /* next in list */ +} zfs_zlock_t; + +/* + * Drop locks and release vnodes that were held by zfs_rename_lock(). + */ +static void +zfs_rename_unlock(zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + + while ((zl = *zlpp) != NULL) { + if (zl->zl_znode != NULL) + zfs_zrele_async(zl->zl_znode); + rw_exit(zl->zl_rwlock); + *zlpp = zl->zl_next; + kmem_free(zl, sizeof (*zl)); + } +} + +/* + * Search back through the directory tree, using the ".." entries. + * Lock each directory in the chain to prevent concurrent renames. + * Fail any attempt to move a directory into one of its own descendants. + * XXX - z_parent_lock can overlap with map or grow locks + */ +static int +zfs_rename_lock(znode_t *szp, znode_t *tdzp, znode_t *sdzp, zfs_zlock_t **zlpp) +{ + zfs_zlock_t *zl; + znode_t *zp = tdzp; + uint64_t rootid = ZTOZSB(zp)->z_root; + uint64_t oidp = zp->z_id; + krwlock_t *rwlp = &szp->z_parent_lock; + krw_t rw = RW_WRITER; + + /* + * First pass write-locks szp and compares to zp->z_id. + * Later passes read-lock zp and compare to zp->z_parent. + */ + do { + if (!rw_tryenter(rwlp, rw)) { + /* + * Another thread is renaming in this path. 
+ * Note that if we are a WRITER, we don't have any
+ * parent_locks held yet.
+ */
+ if (rw == RW_READER && zp->z_id > szp->z_id) {
+ /*
+ * Drop our locks and restart
+ */
+ zfs_rename_unlock(&zl);
+ *zlpp = NULL;
+ zp = tdzp;
+ oidp = zp->z_id;
+ rwlp = &szp->z_parent_lock;
+ rw = RW_WRITER;
+ continue;
+ } else {
+ /*
+ * Wait for other thread to drop its locks
+ */
+ rw_enter(rwlp, rw);
+ }
+ }
+
+ zl = kmem_alloc(sizeof (*zl), KM_SLEEP);
+ zl->zl_rwlock = rwlp;
+ zl->zl_znode = NULL;
+ zl->zl_next = *zlpp;
+ *zlpp = zl;
+
+ if (oidp == szp->z_id) /* We're a descendant of szp */
+ return (SET_ERROR(EINVAL));
+
+ if (oidp == rootid) /* We've hit the top */
+ return (0);
+
+ if (rw == RW_READER) { /* i.e. not the first pass */
+ int error = zfs_zget(ZTOZSB(zp), oidp, &zp);
+ if (error)
+ return (error);
+ zl->zl_znode = zp;
+ }
+ (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_PARENT(ZTOZSB(zp)),
+ &oidp, sizeof (oidp));
+ rwlp = &zp->z_parent_lock;
+ rw = RW_READER;
+
+ } while (zp->z_id != sdzp->z_id);
+
+ return (0);
+}
+
+/*
+ * Move an entry from the provided source directory to the target
+ * directory. Change the entry name as indicated.
+ *
+ * IN: sdzp - Source directory containing the "old entry".
+ * snm - Old entry name.
+ * tdzp - Target directory to contain the "new entry".
+ * tnm - New entry name.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * sdzp,tdzp - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_rename(znode_t *sdzp, char *snm, znode_t *tdzp, char *tnm,
+ cred_t *cr, int flags)
+{
+ znode_t *szp, *tzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(sdzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *sdl, *tdl;
+ dmu_tx_t *tx;
+ zfs_zlock_t *zl;
+ int cmp, serr, terr;
+ int error = 0;
+ int zflg = 0;
+ boolean_t waited = B_FALSE;
+
+ if (snm == NULL || tnm == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(sdzp);
+ zilog = zfsvfs->z_log;
+
+ ZFS_VERIFY_ZP(tdzp);
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
+ */
+ if (ZTOI(tdzp)->i_sb != ZTOI(sdzp)->i_sb ||
+ zfsctl_is_node(ZTOI(tdzp))) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(tnm,
+ strlen(tnm), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+top:
+ szp = NULL;
+ tzp = NULL;
+ zl = NULL;
+
+ /*
+ * This is to prevent the creation of links into attribute space
+ * by renaming a linked file into/out of an attribute directory.
+ * See the comment in zfs_link() for why this is considered bad.
+ */
+ if ((tdzp->z_pflags & ZFS_XATTR) != (sdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /*
+ * Lock source and target directory entries. To prevent deadlock,
+ * a lock ordering must be defined. We lock the directory with
+ * the smallest object id first, or if it's a tie, the one with
+ * the lexically first name.
+ */
+ if (sdzp->z_id < tdzp->z_id) {
+ cmp = -1;
+ } else if (sdzp->z_id > tdzp->z_id) {
+ cmp = 1;
+ } else {
+ /*
+ * First compare the two name arguments without
+ * considering any case folding.
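+ *
+ * (Illustrative example: a rename of 'a' to 'b' within a
+ * single directory compares the names themselves; 'a' sorts
+ * first, so cmp < 0 and the source entry is locked first
+ * below.)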
+ */
+ int nofold = (zfsvfs->z_norm & ~U8_TEXTPREP_TOUPPER);
+
+ cmp = u8_strcmp(snm, tnm, 0, nofold, U8_UNICODE_LATEST, &error);
+ ASSERT(error == 0 || !zfsvfs->z_utf8);
+ if (cmp == 0) {
+ /*
+ * POSIX: "If the old argument and the new argument
+ * both refer to links to the same existing file,
+ * the rename() function shall return successfully
+ * and perform no other action."
+ */
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+ /*
+ * If the file system is case-folding, then we may
+ * have some more checking to do. A case-folding file
+ * system is either supporting mixed case sensitivity
+ * access or is completely case-insensitive. Note
+ * that the file system is always case preserving.
+ *
+ * In mixed sensitivity mode case sensitive behavior
+ * is the default. FIGNORECASE must be used to
+ * explicitly request case insensitive behavior.
+ *
+ * If the source and target names provided differ only
+ * by case (e.g., a request to rename 'tim' to 'Tim'),
+ * we will treat this as a special case in the
+ * case-insensitive mode: as long as the source name
+ * is an exact match, we will allow this to proceed as
+ * a name-change request.
+ */
+ if ((zfsvfs->z_case == ZFS_CASE_INSENSITIVE ||
+ (zfsvfs->z_case == ZFS_CASE_MIXED &&
+ flags & FIGNORECASE)) &&
+ u8_strcmp(snm, tnm, 0, zfsvfs->z_norm, U8_UNICODE_LATEST,
+ &error) == 0) {
+ /*
+ * Case-preserving rename request; require exact
+ * name matches.
+ */
+ zflg |= ZCIEXACT;
+ zflg &= ~ZCILOOK;
+ }
+ }
+
+ /*
+ * If the source and destination directories are the same, we should
+ * grab the z_name_lock of that directory only once.
+ */
+ if (sdzp == tdzp) {
+ zflg |= ZHAVELOCK;
+ rw_enter(&sdzp->z_name_lock, RW_READER);
+ }
+
+ if (cmp < 0) {
+ serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp,
+ ZEXISTS | zflg, NULL, NULL);
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, ZRENAMING | zflg, NULL, NULL);
+ } else {
+ terr = zfs_dirent_lock(&tdl,
+ tdzp, tnm, &tzp, zflg, NULL, NULL);
+ serr = zfs_dirent_lock(&sdl,
+ sdzp, snm, &szp, ZEXISTS | ZRENAMING | zflg,
+ NULL, NULL);
+ }
+
+ if (serr) {
+ /*
+ * Source entry invalid or not there.
+ */
+ if (!terr) {
+ zfs_dirent_unlock(tdl);
+ if (tzp)
+ zrele(tzp);
+ }
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(snm, "..") == 0)
+ serr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (serr);
+ }
+ if (terr) {
+ zfs_dirent_unlock(sdl);
+ zrele(szp);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (strcmp(tnm, "..") == 0)
+ terr = EINVAL;
+ ZFS_EXIT(zfsvfs);
+ return (terr);
+ }
+
+ /*
+ * If we are using project inheritance, i.e. the directory has
+ * ZFS_PROJINHERIT set, then its descendant directories will inherit
+ * not only the project ID but also the ZFS_PROJINHERIT flag. In
+ * that case, we only allow renames into our tree when the project
+ * IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ error = SET_ERROR(EXDEV);
+ goto out;
+ }
+
+ /*
+ * Must have write access at the source to remove the old entry
+ * and write access at the target to create the new entry.
+ * Note that if target and source are the same, this can be
+ * done in a single check.
+ */
+
+ if ((error = zfs_zaccess_rename(sdzp, szp, tdzp, tzp, cr)))
+ goto out;
+
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ /*
+ * Check to make sure rename is valid.
+ * Can't do a move like this: /usr/a/b to /usr/a/b/c/d
+ */
+ if ((error = zfs_rename_lock(szp, tdzp, sdzp, &zl)))
+ goto out;
+ }
+
+ /*
+ * Does target exist?
+ */
+ if (tzp) {
+ /*
+ * Source and target must be the same type.
+ */
+ if (S_ISDIR(ZTOI(szp)->i_mode)) {
+ if (!S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(ENOTDIR);
+ goto out;
+ }
+ } else {
+ if (S_ISDIR(ZTOI(tzp)->i_mode)) {
+ error = SET_ERROR(EISDIR);
+ goto out;
+ }
+ }
+ /*
+ * POSIX dictates that when the source and target
+ * entries refer to the same file object, rename
+ * must do nothing and exit without error.
+ */
+ if (szp->z_id == tzp->z_id) {
+ error = 0;
+ goto out;
+ }
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_sa(tx, sdzp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, sdzp->z_id, FALSE, snm);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, tnm);
+ if (sdzp != tdzp) {
+ dmu_tx_hold_sa(tx, tdzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ }
+ if (tzp) {
+ dmu_tx_hold_sa(tx, tzp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, tzp);
+ }
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ zrele(szp);
+ if (tzp)
+ zrele(tzp);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ zrele(szp);
+ if (tzp)
+ zrele(tzp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (tzp) /* Attempt to remove the existing target */
+ error = zfs_link_destroy(tdl, tzp, tx, zflg, NULL);
+
+ if (error == 0) {
+ error = zfs_link_create(tdl, szp, tx, ZRENAMING);
+ if (error == 0) {
+ szp->z_pflags |= ZFS_AV_MODIFIED;
+ if (tdzp->z_pflags & ZFS_PROJINHERIT)
+ szp->z_pflags |= ZFS_PROJINHERIT;
+
+ error = sa_update(szp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs),
+ (void *)&szp->z_pflags, sizeof (uint64_t), tx);
+ ASSERT0(error);
+
+ error = zfs_link_destroy(sdl, szp, tx, ZRENAMING, NULL);
+ if (error == 0) {
+ zfs_log_rename(zilog, tx, TX_RENAME |
+ (flags & FIGNORECASE ? TX_CI : 0), sdzp,
+ sdl->dl_name, tdzp, tdl->dl_name, szp);
+ } else {
+ /*
+ * At this point, we have successfully created
+ * the target name, but have failed to remove
+ * the source name. Since the create was done
+ * with the ZRENAMING flag, there are
+ * complications; for one, the link count is
+ * wrong. The easiest way to deal with this
+ * is to remove the newly created target, and
+ * return the original error. This must
+ * succeed; fortunately, it is very unlikely to
+ * fail, since we just created it.
+ */
+ VERIFY3U(zfs_link_destroy(tdl, szp, tx,
+ ZRENAMING, NULL), ==, 0);
+ }
+ } else {
+ /*
+ * If we had removed the existing target, the
+ * subsequent call to zfs_link_create() to add back
+ * the same entry, but with the new dnode (szp),
+ * should not fail.
+ */
+ ASSERT(tzp == NULL);
+ }
+ }
+
+ dmu_tx_commit(tx);
+out:
+ if (zl != NULL)
+ zfs_rename_unlock(&zl);
+
+ zfs_dirent_unlock(sdl);
+ zfs_dirent_unlock(tdl);
+
+ zfs_inode_update(sdzp);
+ if (sdzp == tdzp)
+ rw_exit(&sdzp->z_name_lock);
+
+ if (sdzp != tdzp)
+ zfs_inode_update(tdzp);
+
+ zfs_inode_update(szp);
+ zrele(szp);
+ if (tzp) {
+ zfs_inode_update(tzp);
+ zrele(tzp);
+ }
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert the indicated symbolic reference entry into the directory.
+ *
+ * IN: dzp - Directory to contain new symbolic link.
+ * name - Name of directory entry in dip.
+ * vap - Attributes of new entry.
+ * link - Name for new symlink entry.
+ * cr - credentials of caller.
+ * flags - case flags
+ *
+ * OUT: zpp - Znode for new symbolic link.
+ *
+ * RETURN: 0 on success, error code on failure.
+ *
+ * Timestamps:
+ * dip - ctime|mtime updated
+ */
+/*ARGSUSED*/
+int
+zfs_symlink(znode_t *dzp, char *name, vattr_t *vap, char *link,
+ znode_t **zpp, cred_t *cr, int flags)
+{
+ znode_t *zp;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ zfsvfs_t *zfsvfs = ZTOZSB(dzp);
+ zilog_t *zilog;
+ uint64_t len = strlen(link);
+ int error;
+ int zflg = ZNEW;
+ zfs_acl_ids_t acl_ids;
+ boolean_t fuid_dirtied;
+ uint64_t txtype = TX_SYMLINK;
+ boolean_t waited = B_FALSE;
+
+ ASSERT(S_ISLNK(vap->va_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(dzp);
+ zilog = zfsvfs->z_log;
+
+ if (zfsvfs->z_utf8 && u8_validate(name, strlen(name),
+ NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zflg |= ZCILOOK;
+
+ if (len > MAXPATHLEN) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(ENAMETOOLONG));
+ }
+
+ if ((error = zfs_acl_ids_create(dzp, 0,
+ vap, cr, NULL, &acl_ids)) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+top:
+ *zpp = NULL;
+
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, dzp, name, &zp, zflg, NULL, NULL);
+ if (error) {
+ zfs_acl_ids_free(&acl_ids);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if ((error = zfs_zaccess(dzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ if (zfs_acl_ids_overquota(zfsvfs, &acl_ids, ZFS_DEFAULT_PROJID)) {
+ zfs_acl_ids_free(&acl_ids);
+ zfs_dirent_unlock(dl);
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EDQUOT));
+ }
+ tx = dmu_tx_create(zfsvfs->z_os);
+ fuid_dirtied = zfsvfs->z_fuid_dirty;
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len));
+ dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name);
+ dmu_tx_hold_sa_create(tx, acl_ids.z_aclp->z_acl_bytes +
+ ZFS_SA_BASE_ATTR_SIZE + len);
+ dmu_tx_hold_sa(tx, dzp->z_sa_hdl, B_FALSE);
+ if (!zfsvfs->z_use_sa && acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) {
+ dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0,
+ acl_ids.z_aclp->z_acl_bytes);
+ }
+ if (fuid_dirtied)
+ zfs_fuid_txhold(zfsvfs, tx);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ zfs_acl_ids_free(&acl_ids);
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ /*
+ * Create a new object for the symlink.
+ * For version 4 ZPL datasets the symlink will be an SA attribute.
+ */
+ zfs_mknode(dzp, vap, tx, cr, 0, &zp, &acl_ids);
+
+ if (fuid_dirtied)
+ zfs_fuid_sync(zfsvfs, tx);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_update(zp->z_sa_hdl, SA_ZPL_SYMLINK(zfsvfs),
+ link, len, tx);
+ else
+ zfs_sa_symlink(zp, link, len, tx);
+ mutex_exit(&zp->z_lock);
+
+ zp->z_size = len;
+ (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs),
+ &zp->z_size, sizeof (zp->z_size), tx);
+ /*
+ * Insert the new object into the directory.
+ */
+ error = zfs_link_create(dl, zp, tx, ZNEW);
+ if (error != 0) {
+ zfs_znode_delete(zp, tx);
+ remove_inode_hash(ZTOI(zp));
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link);
+
+ zfs_inode_update(dzp);
+ zfs_inode_update(zp);
+ }
+
+ zfs_acl_ids_free(&acl_ids);
+
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (error == 0) {
+ *zpp = zp;
+
+ if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+ } else {
+ zrele(zp);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Return, in the buffer contained in the provided uio structure,
+ * the symbolic path referred to by ip.
+ *
+ * IN: ip - inode of symbolic link
+ * uio - structure to contain the link path.
+ * cr - credentials of caller.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - atime updated
+ */
+/* ARGSUSED */
+int
+zfs_readlink(struct inode *ip, uio_t *uio, cred_t *cr)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ int error;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ mutex_enter(&zp->z_lock);
+ if (zp->z_is_sa)
+ error = sa_lookup_uio(zp->z_sa_hdl,
+ SA_ZPL_SYMLINK(zfsvfs), uio);
+ else
+ error = zfs_sa_readlink(zp, uio);
+ mutex_exit(&zp->z_lock);
+
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+/*
+ * Insert a new entry into directory tdzp referencing szp.
+ *
+ * IN: tdzp - Directory to contain new entry.
+ * szp - znode of new entry.
+ * name - name of new entry.
+ * cr - credentials of caller.
+ * flags - case flags.
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * tdzp - ctime|mtime updated
+ * szp - ctime updated
+ */
+/* ARGSUSED */
+int
+zfs_link(znode_t *tdzp, znode_t *szp, char *name, cred_t *cr,
+ int flags)
+{
+ struct inode *sip = ZTOI(szp);
+ znode_t *tzp;
+ zfsvfs_t *zfsvfs = ZTOZSB(tdzp);
+ zilog_t *zilog;
+ zfs_dirlock_t *dl;
+ dmu_tx_t *tx;
+ int error;
+ int zf = ZNEW;
+ uint64_t parent;
+ uid_t owner;
+ boolean_t waited = B_FALSE;
+ boolean_t is_tmpfile = 0;
+ uint64_t txg;
+#ifdef HAVE_TMPFILE
+ is_tmpfile = (sip->i_nlink == 0 && (sip->i_state & I_LINKABLE));
+#endif
+ ASSERT(S_ISDIR(ZTOI(tdzp)->i_mode));
+
+ if (name == NULL)
+ return (SET_ERROR(EINVAL));
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(tdzp);
+ zilog = zfsvfs->z_log;
+
+ /*
+ * POSIX dictates that we return EPERM here.
+ * Better choices include ENOTSUP or EISDIR.
+ */
+ if (S_ISDIR(sip->i_mode)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ ZFS_VERIFY_ZP(szp);
+
+ /*
+ * If we are using project inheritance, i.e. the directory has
+ * ZFS_PROJINHERIT set, then its descendant directories will inherit
+ * not only the project ID but also the ZFS_PROJINHERIT flag. In
+ * that case, we only allow hard link creation in our tree when the
+ * project IDs are the same.
+ */
+ if (tdzp->z_pflags & ZFS_PROJINHERIT &&
+ tdzp->z_projid != szp->z_projid) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We check i_sb because snapshots and the ctldir must have different
+ * super blocks.
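+ * (For example, an attempt to link a file that lives under
+ * .zfs/snapshot into the live file system fails with EXDEV,
+ * just as a cross-mount link would.)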
+ */
+ if (sip->i_sb != ZTOI(tdzp)->i_sb || zfsctl_is_node(sip)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /* Prevent links to .zfs/shares files */
+
+ if ((error = sa_lookup(szp->z_sa_hdl, SA_ZPL_PARENT(zfsvfs),
+ &parent, sizeof (uint64_t))) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ if (parent == zfsvfs->z_shares_dir) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if (zfsvfs->z_utf8 && u8_validate(name,
+ strlen(name), NULL, U8_VALIDATE_ENTIRE, &error) < 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EILSEQ));
+ }
+ if (flags & FIGNORECASE)
+ zf |= ZCILOOK;
+
+ /*
+ * We do not support links between attributes and non-attributes
+ * because of the potential security risk of creating links
+ * into "normal" file space in order to circumvent restrictions
+ * imposed in attribute space.
+ */
+ if ((szp->z_pflags & ZFS_XATTR) != (tdzp->z_pflags & ZFS_XATTR)) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EINVAL));
+ }
+
+ owner = zfs_fuid_map_id(zfsvfs, KUID_TO_SUID(sip->i_uid),
+ cr, ZFS_OWNER);
+ if (owner != crgetuid(cr) && secpolicy_basic_link(cr) != 0) {
+ ZFS_EXIT(zfsvfs);
+ return (SET_ERROR(EPERM));
+ }
+
+ if ((error = zfs_zaccess(tdzp, ACE_ADD_FILE, 0, B_FALSE, cr))) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+top:
+ /*
+ * Attempt to lock directory; fail if entry already exists.
+ */
+ error = zfs_dirent_lock(&dl, tdzp, name, &tzp, zf, NULL, NULL);
+ if (error) {
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_sa(tx, szp->z_sa_hdl, B_FALSE);
+ dmu_tx_hold_zap(tx, tdzp->z_id, TRUE, name);
+ if (is_tmpfile)
+ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL);
+
+ zfs_sa_upgrade_txholds(tx, szp);
+ zfs_sa_upgrade_txholds(tx, tdzp);
+ error = dmu_tx_assign(tx, (waited ? TXG_NOTHROTTLE : 0) | TXG_NOWAIT);
+ if (error) {
+ zfs_dirent_unlock(dl);
+ if (error == ERESTART) {
+ waited = B_TRUE;
+ dmu_tx_wait(tx);
+ dmu_tx_abort(tx);
+ goto top;
+ }
+ dmu_tx_abort(tx);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+ }
+ /* unmark z_unlinked so zfs_link_create will not reject */
+ if (is_tmpfile)
+ szp->z_unlinked = B_FALSE;
+ error = zfs_link_create(dl, szp, tx, 0);
+
+ if (error == 0) {
+ uint64_t txtype = TX_LINK;
+ /*
+ * A tmpfile is created to live in z_unlinkedobj, so remove
+ * it from there. Also, we don't log to the ZIL, because all
+ * previous file operations on the tmpfile are ignored by the
+ * ZIL. Instead we always wait for the txg to sync to make
+ * sure all previous operations are sync-safe.
+ */
+ if (is_tmpfile) {
+ VERIFY(zap_remove_int(zfsvfs->z_os,
+ zfsvfs->z_unlinkedobj, szp->z_id, tx) == 0);
+ } else {
+ if (flags & FIGNORECASE)
+ txtype |= TX_CI;
+ zfs_log_link(zilog, tx, txtype, tdzp, szp, name);
+ }
+ } else if (is_tmpfile) {
+ /* restore z_unlinked since linking failed */
+ szp->z_unlinked = B_TRUE;
+ }
+ txg = dmu_tx_get_txg(tx);
+ dmu_tx_commit(tx);
+
+ zfs_dirent_unlock(dl);
+
+ if (!is_tmpfile && zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS)
+ zil_commit(zilog, 0);
+
+ if (is_tmpfile && zfsvfs->z_os->os_sync != ZFS_SYNC_DISABLED)
+ txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), txg);
+
+ zfs_inode_update(tdzp);
+ zfs_inode_update(szp);
+ ZFS_EXIT(zfsvfs);
+ return (error);
+}
+
+static void
+zfs_putpage_commit_cb(void *arg)
+{
+ struct page *pp = arg;
+
+ ClearPageError(pp);
+ end_page_writeback(pp);
+}
+
+/*
+ * Push a page out to disk; once the page is on stable storage the
+ * registered commit callback will be run as notification of completion.
+ *
+ * IN: ip - page mapped for inode.
+ * pp - page to push (page is locked)
+ * wbc - writeback control data
+ *
+ * RETURN: 0 if success
+ * error code if failure
+ *
+ * Timestamps:
+ * ip - ctime|mtime updated
+ */
+/* ARGSUSED */
+int
+zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ loff_t offset;
+ loff_t pgoff;
+ unsigned int pglen;
+ dmu_tx_t *tx;
+ caddr_t va;
+ int err = 0;
+ uint64_t mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[3];
+ int cnt = 0;
+ struct address_space *mapping;
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+ ASSERT(PageLocked(pp));
+
+ pgoff = page_offset(pp); /* Page byte-offset in file */
+ offset = i_size_read(ip); /* File length in bytes */
+ pglen = MIN(PAGE_SIZE, /* Page length in bytes */
+ P2ROUNDUP(offset, PAGE_SIZE)-pgoff);
+
+ /* Page is beyond end of file */
+ if (pgoff >= offset) {
+ unlock_page(pp);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Truncate page length to end of file */
+ if (pgoff + pglen > offset)
+ pglen = offset - pgoff;
+
+#if 0
+ /*
+ * FIXME: Allow mmap writes past its quota. The correct fix
+ * is to register a page_mkwrite() handler to count the page
+ * against its quota when it is about to be dirtied.
+ */
+ if (zfs_id_overblockquota(zfsvfs, DMU_USERUSED_OBJECT,
+ KUID_TO_SUID(ip->i_uid)) ||
+ zfs_id_overblockquota(zfsvfs, DMU_GROUPUSED_OBJECT,
+ KGID_TO_SGID(ip->i_gid)) ||
+ (zp->z_projid != ZFS_DEFAULT_PROJID &&
+ zfs_id_overblockquota(zfsvfs, DMU_PROJECTUSED_OBJECT,
+ zp->z_projid))) {
+ err = EDQUOT;
+ }
+#endif
+
+ /*
+ * The ordering here is critical and must adhere to the following
+ * rules in order to avoid deadlocking in either zfs_read() or
+ * zfs_free_range() due to a lock inversion.
+ *
+ * 1) The page must be unlocked prior to acquiring the range lock.
+ * This is critical because zfs_read() calls find_lock_page()
+ * which may block on the page lock while holding the range lock.
+ *
+ * 2) Before setting or clearing write back on a page the range lock
+ * must be held in order to prevent a lock inversion with the
+ * zfs_free_range() function.
+ *
+ * This presents a problem because upon entering this function the
+ * page lock is already held. To safely acquire the range lock the
+ * page lock must be dropped. This creates a window where another
+ * process could truncate, invalidate, dirty, or write out the page.
+ *
+ * Therefore, after successfully reacquiring the range and page locks
+ * the current page state is checked. In the common case everything
+ * will be as expected and the page can be written out. However, if
+ * the page state has changed it must be handled accordingly.
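+ *
+ * (Illustrative interleaving: once the page lock is dropped
+ * below, another thread may truncate the file and invalidate
+ * this page; after the locks are reacquired, the pp->mapping
+ * and PageDirty() checks catch this and the function simply
+ * returns, having already redirtied the page for a later pass.)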
+ */
+ mapping = pp->mapping;
+ redirty_page_for_writepage(wbc, pp);
+ unlock_page(pp);
+
+ zfs_locked_range_t *lr = zfs_rangelock_enter(&zp->z_rangelock,
+ pgoff, pglen, RL_WRITER);
+ lock_page(pp);
+
+ /* Page mapping changed or it is no longer dirty; we're done */
+ if (unlikely((mapping != pp->mapping) || !PageDirty(pp))) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Another process started writeback; block if required */
+ if (PageWriteback(pp)) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ if (PageWriteback(pp))
+ wait_on_page_bit(pp, PG_writeback);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /* Clear the dirty flag now that the required locks are held */
+ if (!clear_page_dirty_for_io(pp)) {
+ unlock_page(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (0);
+ }
+
+ /*
+ * Counterpart for redirty_page_for_writepage() above. This page
+ * was in fact not skipped and should not be counted as if it were.
+ */
+ wbc->pages_skipped--;
+ set_page_writeback(pp);
+ unlock_page(pp);
+
+ tx = dmu_tx_create(zfsvfs->z_os);
+ dmu_tx_hold_write(tx, zp->z_id, pgoff, pglen);
+ dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE);
+ zfs_sa_upgrade_txholds(tx, zp);
+
+ err = dmu_tx_assign(tx, TXG_NOWAIT);
+ if (err != 0) {
+ if (err == ERESTART)
+ dmu_tx_wait(tx);
+
+ dmu_tx_abort(tx);
+ __set_page_dirty_nobuffers(pp);
+ ClearPageError(pp);
+ end_page_writeback(pp);
+ zfs_rangelock_exit(lr);
+ ZFS_EXIT(zfsvfs);
+ return (err);
+ }
+
+ va = kmap(pp);
+ ASSERT3U(pglen, <=, PAGE_SIZE);
+ dmu_write(zfsvfs->z_os, zp->z_id, pgoff, pglen, va, tx);
+ kunmap(pp);
+
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16);
+ SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_FLAGS(zfsvfs), NULL,
+ &zp->z_pflags, 8);
+
+ /* Preserve the mtime and ctime provided by the inode */
+ ZFS_TIME_ENCODE(&ip->i_mtime, mtime);
+ ZFS_TIME_ENCODE(&ip->i_ctime, ctime);
+ zp->z_atime_dirty = B_FALSE;
+ zp->z_seq++;
+
+ err = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx);
+
+ zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, pgoff, pglen, 0,
+ zfs_putpage_commit_cb, pp);
+ dmu_tx_commit(tx);
+
+ zfs_rangelock_exit(lr);
+
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ /*
+ * Note that this is rarely called under writepages(), because
+ * writepages() normally handles the entire commit for
+ * performance reasons.
+ */
+ zil_commit(zfsvfs->z_log, zp->z_id);
+ }
+
+ ZFS_EXIT(zfsvfs);
+ return (err);
+}
+
+/*
+ * Update the system attributes when the inode has been dirtied. For the
+ * moment we only update the mode, atime, mtime, and ctime.
+ */
+int
+zfs_dirty_inode(struct inode *ip, int flags)
+{
+ znode_t *zp = ITOZ(ip);
+ zfsvfs_t *zfsvfs = ITOZSB(ip);
+ dmu_tx_t *tx;
+ uint64_t mode, atime[2], mtime[2], ctime[2];
+ sa_bulk_attr_t bulk[4];
+ int error = 0;
+ int cnt = 0;
+
+ if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os))
+ return (0);
+
+ ZFS_ENTER(zfsvfs);
+ ZFS_VERIFY_ZP(zp);
+
+#ifdef I_DIRTY_TIME
+ /*
+ * This implements the lazytime semantic introduced in Linux 4.0.
+ * This flag is only passed in from update_time() when lazytime is
+ * set. (Note, I_DIRTY_SYNC will also be set if not lazytime.)
+ * Fortunately mtime and ctime are managed within ZFS itself, so we
+ * only need to dirty atime.
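+ *
+ * (Illustrative flow: a read(2) on a lazytime mount updates
+ * atime only in core; update_time() passes I_DIRTY_TIME here,
+ * so we merely set z_atime_dirty and defer the on-disk update
+ * until zfs_inactive() or a later non-lazy dirtying.)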
+ */ + if (flags == I_DIRTY_TIME) { + zp->z_atime_dirty = B_TRUE; + goto out; + } +#endif + + tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + mutex_enter(&zp->z_lock); + zp->z_atime_dirty = B_FALSE; + + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + /* Preserve the mode, mtime and ctime provided by the inode */ + ZFS_TIME_ENCODE(&ip->i_atime, atime); + ZFS_TIME_ENCODE(&ip->i_mtime, mtime); + ZFS_TIME_ENCODE(&ip->i_ctime, ctime); + mode = ip->i_mode; + + zp->z_mode = mode; + + error = sa_bulk_update(zp->z_sa_hdl, bulk, cnt, tx); + mutex_exit(&zp->z_lock); + + dmu_tx_commit(tx); +out: + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +void +zfs_inactive(struct inode *ip) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint64_t atime[2]; + int error; + int need_unlock = 0; + + /* Only read lock if we haven't already write locked, e.g. rollback */ + if (!RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)) { + need_unlock = 1; + rw_enter(&zfsvfs->z_teardown_inactive_lock, RW_READER); + } + if (zp->z_sa_hdl == NULL) { + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); + return; + } + + if (zp->z_atime_dirty && zp->z_unlinked == B_FALSE) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + ZFS_TIME_ENCODE(&ip->i_atime, atime); + mutex_enter(&zp->z_lock); + (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), + (void *)&atime, sizeof (atime), tx); + zp->z_atime_dirty = B_FALSE; + mutex_exit(&zp->z_lock); + dmu_tx_commit(tx); + } + } + + zfs_zinactive(zp); + if (need_unlock) + rw_exit(&zfsvfs->z_teardown_inactive_lock); +} + +/* + * Fill pages with data from the disk. + */ +static int +zfs_fillpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + objset_t *os; + struct page *cur_pp; + u_offset_t io_off, total; + size_t io_len; + loff_t i_size; + unsigned page_idx; + int err; + + os = zfsvfs->z_os; + io_len = nr_pages << PAGE_SHIFT; + i_size = i_size_read(ip); + io_off = page_offset(pl[0]); + + if (io_off + io_len > i_size) + io_len = i_size - io_off; + + /* + * Iterate over list of pages and read each page individually. + */ + page_idx = 0; + for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + + cur_pp = pl[page_idx++]; + va = kmap(cur_pp); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); + kunmap(cur_pp); + if (err) { + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = SET_ERROR(EIO); + return (err); + } + } + + return (0); +} + +/* + * Uses zfs_fillpage to read data from the file and fill the pages. + * + * IN: ip - inode of file to get data from. + * pl - list of pages to read + * nr_pages - number of pages to read + * + * RETURN: 0 on success, error code on failure. 
+ * + * Timestamps: + * vp - atime updated + */ +/* ARGSUSED */ +int +zfs_getpage(struct inode *ip, struct page *pl[], int nr_pages) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int err; + + if (pl == NULL) + return (0); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + err = zfs_fillpage(ip, pl, nr_pages); + + ZFS_EXIT(zfsvfs); + return (err); +} + +/* + * Check ZFS specific permissions to memory map a section of a file. + * + * IN: ip - inode of the file to mmap + * off - file offset + * addrp - start address in memory region + * len - length of memory region + * vm_flags- address flags + * + * RETURN: 0 if success + * error code if failure + */ +/*ARGSUSED*/ +int +zfs_map(struct inode *ip, offset_t off, caddr_t *addrp, size_t len, + unsigned long vm_flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((vm_flags & VM_WRITE) && (zp->z_pflags & + (ZFS_IMMUTABLE | ZFS_READONLY | ZFS_APPENDONLY))) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EPERM)); + } + + if ((vm_flags & (VM_READ | VM_EXEC)) && + (zp->z_pflags & ZFS_AV_QUARANTINED)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EACCES)); + } + + if (off < 0 || len > MAXOFFSET_T - off) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(ENXIO)); + } + + ZFS_EXIT(zfsvfs); + return (0); +} + +/* + * Free or allocate space in a file. Currently, this function only + * supports the `F_FREESP' command. However, this command is somewhat + * misnamed, as its functionality includes the ability to allocate as + * well as free space. + * + * IN: zp - znode of file to free data in. + * cmd - action to take (only F_FREESP supported). + * bfp - section of file to free/alloc. + * flag - current file open mode flags. + * offset - current file offset. + * cr - credentials of caller. + * + * RETURN: 0 on success, error code on failure. + * + * Timestamps: + * zp - ctime|mtime updated + */ +/* ARGSUSED */ +int +zfs_space(znode_t *zp, int cmd, flock64_t *bfp, int flag, + offset_t offset, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t off, len; + int error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if (cmd != F_FREESP) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Callers might not be able to detect properly that we are read-only, + * so check it explicitly here. + */ + if (zfs_is_readonly(zfsvfs)) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EROFS)); + } + + if (bfp->l_len < 0) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + /* + * Permissions aren't checked on Solaris because on this OS + * zfs_space() can only be called with an opened file handle. + * On Linux we can get here through truncate_range() which + * operates directly on inodes, so we need to check access rights. 
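+ *
+ * (Purely illustrative: a hole-punching request for 8192 bytes
+ * at offset 4096 arrives here as F_FREESP with bfp->l_start ==
+ * 4096 and bfp->l_len == 8192, while bfp->l_len == 0 means
+ * "free from l_start to end of file".)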
+ */ + if ((error = zfs_zaccess(zp, ACE_WRITE_DATA, 0, B_FALSE, cr))) { + ZFS_EXIT(zfsvfs); + return (error); + } + + off = bfp->l_start; + len = bfp->l_len; /* 0 means from off to end of file */ + + error = zfs_freesp(zp, off, len, flag, TRUE); + + ZFS_EXIT(zfsvfs); + return (error); +} + +/*ARGSUSED*/ +int +zfs_fid(struct inode *ip, fid_t *fidp) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + uint32_t gen; + uint64_t gen64; + uint64_t object = zp->z_id; + zfid_short_t *zfid; + int size, i, error; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(zfsvfs), + &gen64, sizeof (uint64_t))) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + + gen = (uint32_t)gen64; + + size = SHORT_FID_LEN; + + zfid = (zfid_short_t *)fidp; + + zfid->zf_len = size; + + for (i = 0; i < sizeof (zfid->zf_object); i++) + zfid->zf_object[i] = (uint8_t)(object >> (8 * i)); + + /* Must have a non-zero generation number to distinguish from .zfs */ + if (gen == 0) + gen = 1; + for (i = 0; i < sizeof (zfid->zf_gen); i++) + zfid->zf_gen[i] = (uint8_t)(gen >> (8 * i)); + + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +int +zfs_getsecattr(struct inode *ip, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + error = zfs_getacl(zp, vsecp, skipaclchk, cr); + ZFS_EXIT(zfsvfs); + + return (error); +} + +/*ARGSUSED*/ +int +zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + zilog_t *zilog = zfsvfs->z_log; + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + + error = zfs_setacl(zp, vsecp, skipaclchk, cr); + + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + zil_commit(zilog, 0); + + ZFS_EXIT(zfsvfs); + return (error); +} + +#ifdef HAVE_UIO_ZEROCOPY +/* + * The smallest read we may consider to loan out an arcbuf. + * This must be a power of 2. + */ +int zcr_blksz_min = (1 << 10); /* 1K */ +/* + * If set to less than the file block size, allow loaning out of an + * arcbuf for a partial block read. This must be a power of 2. + */ +int zcr_blksz_max = (1 << 17); /* 128K */ + +/*ARGSUSED*/ +static int +zfs_reqzcbuf(struct inode *ip, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ITOZSB(ip); + int max_blksz = zfsvfs->z_max_blksz; + uio_t *uio = &xuio->xu_uio; + ssize_t size = uio->uio_resid; + offset_t offset = uio->uio_loffset; + int blksz; + int fullblk, i; + arc_buf_t *abuf; + ssize_t maxsize; + int preamble, postamble; + + if (xuio->xu_type != UIOTYPE_ZEROCOPY) + return (SET_ERROR(EINVAL)); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + switch (ioflag) { + case UIO_WRITE: + /* + * Loan out an arc_buf for write if write size is bigger than + * max_blksz, and the file's block size is also max_blksz. + */ + blksz = max_blksz; + if (size < blksz || zp->z_blksz != blksz) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + /* + * Caller requests buffers for write before knowing where the + * write offset might be (e.g. NFS TCP write). 
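+ *
+ * Worked example (illustrative): with blksz = 128K, a request
+ * at a known offset = 96K for size = 320K yields preamble = 32K
+ * (up to the next block boundary), postamble = 32K and
+ * fullblk = 2, so dmu_xuio_init() below is asked for four
+ * arc_bufs in total.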
+ */ + if (offset == -1) { + preamble = 0; + } else { + preamble = P2PHASE(offset, blksz); + if (preamble) { + preamble = blksz - preamble; + size -= preamble; + } + } + + postamble = P2PHASE(size, blksz); + size -= postamble; + + fullblk = size / blksz; + (void) dmu_xuio_init(xuio, + (preamble != 0) + fullblk + (postamble != 0)); + + /* + * Have to fix iov base/len for partial buffers. They + * currently represent full arc_buf's. + */ + if (preamble) { + /* data begins in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, + blksz - preamble, preamble); + } + + for (i = 0; i < fullblk; i++) { + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, blksz); + } + + if (postamble) { + /* data ends in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(sa_get_db(zp->z_sa_hdl), + blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, postamble); + } + break; + case UIO_READ: + /* + * Loan out an arc_buf for read if the read size is larger than + * the current file block size. Block alignment is not + * considered. Partial arc_buf will be loaned out for read. + */ + blksz = zp->z_blksz; + if (blksz < zcr_blksz_min) + blksz = zcr_blksz_min; + if (blksz > zcr_blksz_max) + blksz = zcr_blksz_max; + /* avoid potential complexity of dealing with it */ + if (blksz > max_blksz) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + maxsize = zp->z_size - uio->uio_loffset; + if (size > maxsize) + size = maxsize; + + if (size < blksz) { + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + break; + default: + ZFS_EXIT(zfsvfs); + return (SET_ERROR(EINVAL)); + } + + uio->uio_extflg = UIO_XUIO; + XUIO_XUZC_RW(xuio) = ioflag; + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static int +zfs_retzcbuf(struct inode *ip, xuio_t *xuio, cred_t *cr) +{ + int i; + arc_buf_t *abuf; + int ioflag = XUIO_XUZC_RW(xuio); + + ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); + + i = dmu_xuio_cnt(xuio); + while (i-- > 0) { + abuf = dmu_xuio_arcbuf(xuio, i); + /* + * if abuf == NULL, it must be a write buffer + * that has been returned in zfs_write(). 
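+ * (That is, a NULL slot indicates the write path already
+ * consumed and returned that buffer, so only the remaining
+ * buffers are handed back to the ARC here.)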
+ */ + if (abuf) + dmu_return_arcbuf(abuf); + ASSERT(abuf || ioflag == UIO_WRITE); + } + + dmu_xuio_fini(xuio); + return (0); +} +#endif /* HAVE_UIO_ZEROCOPY */ + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_open); +EXPORT_SYMBOL(zfs_close); +EXPORT_SYMBOL(zfs_read); +EXPORT_SYMBOL(zfs_write); +EXPORT_SYMBOL(zfs_access); +EXPORT_SYMBOL(zfs_lookup); +EXPORT_SYMBOL(zfs_create); +EXPORT_SYMBOL(zfs_tmpfile); +EXPORT_SYMBOL(zfs_remove); +EXPORT_SYMBOL(zfs_mkdir); +EXPORT_SYMBOL(zfs_rmdir); +EXPORT_SYMBOL(zfs_readdir); +EXPORT_SYMBOL(zfs_fsync); +EXPORT_SYMBOL(zfs_getattr_fast); +EXPORT_SYMBOL(zfs_setattr); +EXPORT_SYMBOL(zfs_rename); +EXPORT_SYMBOL(zfs_symlink); +EXPORT_SYMBOL(zfs_readlink); +EXPORT_SYMBOL(zfs_link); +EXPORT_SYMBOL(zfs_inactive); +EXPORT_SYMBOL(zfs_space); +EXPORT_SYMBOL(zfs_fid); +EXPORT_SYMBOL(zfs_getsecattr); +EXPORT_SYMBOL(zfs_setsecattr); +EXPORT_SYMBOL(zfs_getpage); +EXPORT_SYMBOL(zfs_putpage); +EXPORT_SYMBOL(zfs_dirty_inode); +EXPORT_SYMBOL(zfs_map); + +/* BEGIN CSTYLED */ +module_param(zfs_delete_blocks, ulong, 0644); +MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); +module_param(zfs_read_chunk_size, ulong, 0644); +MODULE_PARM_DESC(zfs_read_chunk_size, "Bytes to read per chunk"); +/* END CSTYLED */ + +#endif diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c new file mode 100644 index 000000000000..a542c662cb15 --- /dev/null +++ b/module/os/linux/zfs/zfs_znode.c @@ -0,0 +1,2249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +/* Portions Copyright 2007 Jeremy Teo */ + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* + * Functions needed for userland (ie: libzpool) are not put under + * #ifdef_KERNEL; the rest of the functions have dependencies + * (such as VFS logic) that will not compile easily in userland. + */ +#ifdef _KERNEL + +static kmem_cache_t *znode_cache = NULL; +static kmem_cache_t *znode_hold_cache = NULL; +unsigned int zfs_object_mutex_size = ZFS_OBJ_MTX_SZ; + +/* + * This is used by the test suite so that it can delay znodes from being + * freed in order to inspect the unlinked set. 
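+ * (Illustrative use: a test sets this nonzero so that unlinked
+ * znodes linger in the unlinked set for inspection, then clears
+ * it again to let reclamation proceed.)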
+ */
+int zfs_unlink_suspend_progress = 0;
+
+/*
+ * This callback is invoked when acquiring a RL_WRITER or RL_APPEND lock on
+ * z_rangelock. It will modify the offset and length of the lock to reflect
+ * znode-specific information, and convert RL_APPEND to RL_WRITER. This is
+ * called with the rangelock_t's rl_lock held, which avoids races.
+ */
+static void
+zfs_rangelock_cb(zfs_locked_range_t *new, void *arg)
+{
+ znode_t *zp = arg;
+
+ /*
+ * If in append mode, convert to writer and lock starting at the
+ * current end of file.
+ */
+ if (new->lr_type == RL_APPEND) {
+ new->lr_offset = zp->z_size;
+ new->lr_type = RL_WRITER;
+ }
+
+ /*
+ * If we need to grow the block size then lock the whole file range.
+ */
+ uint64_t end_size = MAX(zp->z_size, new->lr_offset + new->lr_length);
+ if (end_size > zp->z_blksz && (!ISP2(zp->z_blksz) ||
+ zp->z_blksz < ZTOZSB(zp)->z_max_blksz)) {
+ new->lr_offset = 0;
+ new->lr_length = UINT64_MAX;
+ }
+}
+
+/*ARGSUSED*/
+static int
+zfs_znode_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_t *zp = buf;
+
+ inode_init_once(ZTOI(zp));
+ list_link_init(&zp->z_link_node);
+
+ mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL);
+ rw_init(&zp->z_name_lock, NULL, RW_NOLOCKDEP, NULL);
+ mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL);
+ rw_init(&zp->z_xattr_lock, NULL, RW_DEFAULT, NULL);
+
+ zfs_rangelock_init(&zp->z_rangelock, zfs_rangelock_cb, zp);
+
+ zp->z_dirlocks = NULL;
+ zp->z_acl_cached = NULL;
+ zp->z_xattr_cached = NULL;
+ zp->z_xattr_parent = 0;
+ zp->z_moved = B_FALSE;
+ return (0);
+}
+
+/*ARGSUSED*/
+static void
+zfs_znode_cache_destructor(void *buf, void *arg)
+{
+ znode_t *zp = buf;
+
+ ASSERT(!list_link_active(&zp->z_link_node));
+ mutex_destroy(&zp->z_lock);
+ rw_destroy(&zp->z_parent_lock);
+ rw_destroy(&zp->z_name_lock);
+ mutex_destroy(&zp->z_acl_lock);
+ rw_destroy(&zp->z_xattr_lock);
+ zfs_rangelock_fini(&zp->z_rangelock);
+
+ ASSERT(zp->z_dirlocks == NULL);
+ ASSERT(zp->z_acl_cached == NULL);
+ ASSERT(zp->z_xattr_cached == NULL);
+}
+
+static int
+zfs_znode_hold_cache_constructor(void *buf, void *arg, int kmflags)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_init(&zh->zh_lock, NULL, MUTEX_DEFAULT, NULL);
+ zfs_refcount_create(&zh->zh_refcount);
+ zh->zh_obj = ZFS_NO_OBJECT;
+
+ return (0);
+}
+
+static void
+zfs_znode_hold_cache_destructor(void *buf, void *arg)
+{
+ znode_hold_t *zh = buf;
+
+ mutex_destroy(&zh->zh_lock);
+ zfs_refcount_destroy(&zh->zh_refcount);
+}
+
+void
+zfs_znode_init(void)
+{
+ /*
+ * Initialize the zcache. The KMC_SLAB hint is used so that the
+ * cache is backed by kmalloc() on the Linux slab, which ensures
+ * that any wait_on_bit() operations on the related inode operate
+ * properly.
+ */
+ ASSERT(znode_cache == NULL);
+ znode_cache = kmem_cache_create("zfs_znode_cache",
+ sizeof (znode_t), 0, zfs_znode_cache_constructor,
+ zfs_znode_cache_destructor, NULL, NULL, NULL, KMC_SLAB);
+
+ ASSERT(znode_hold_cache == NULL);
+ znode_hold_cache = kmem_cache_create("zfs_znode_hold_cache",
+ sizeof (znode_hold_t), 0, zfs_znode_hold_cache_constructor,
+ zfs_znode_hold_cache_destructor, NULL, NULL, NULL, 0);
+}
+
+void
+zfs_znode_fini(void)
+{
+ /*
+ * Cleanup zcache
+ */
+ if (znode_cache)
+ kmem_cache_destroy(znode_cache);
+ znode_cache = NULL;
+
+ if (znode_hold_cache)
+ kmem_cache_destroy(znode_hold_cache);
+ znode_hold_cache = NULL;
+}
+
+/*
+ * The zfs_znode_hold_enter() / zfs_znode_hold_exit() functions are used to
+ * serialize access to a znode and its SA buffer while the object is being
+ * created or destroyed. This kind of locking would normally reside in the
+ * znode itself but in this case that's impossible because the znode and SA
+ * buffer may not yet exist. Therefore the locking is handled externally
+ * with an array of mutexes and AVL trees which contain per-object locks.
+ *
+ * In zfs_znode_hold_enter() a per-object lock is created as needed, inserted
+ * into the correct AVL tree and finally the per-object lock is held. In
+ * zfs_znode_hold_exit() the process is reversed. The per-object lock is
+ * released, removed from the AVL tree and destroyed if there are no waiters.
+ *
+ * This scheme has two important properties:
+ *
+ * 1) No memory allocations are performed while holding one of the
+ * z_hold_locks. This ensures evict(), which can be called from direct
+ * memory reclaim, will never block waiting on a z_hold_locks mutex which
+ * just happens to have hashed to the same index.
+ *
+ * 2) All locks used to serialize access to an object are per-object and never
+ * shared. This minimizes lock contention without creating a large number
+ * of dedicated locks.
+ *
+ * On the downside it does require znode_hold_t structures to be frequently
+ * allocated and freed. However, because these are backed by a kmem cache
+ * and very short lived this cost is minimal.
+ */
+int
+zfs_znode_hold_compare(const void *a, const void *b)
+{
+ const znode_hold_t *zh_a = (const znode_hold_t *)a;
+ const znode_hold_t *zh_b = (const znode_hold_t *)b;
+
+ return (TREE_CMP(zh_a->zh_obj, zh_b->zh_obj));
+}
+
+static boolean_t __maybe_unused
+zfs_znode_held(zfsvfs_t *zfsvfs, uint64_t obj)
+{
+ znode_hold_t *zh, search;
+ int i = ZFS_OBJ_HASH(zfsvfs, obj);
+ boolean_t held;
+
+ search.zh_obj = obj;
+
+ mutex_enter(&zfsvfs->z_hold_locks[i]);
+ zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL);
+ held = (zh && MUTEX_HELD(&zh->zh_lock)) ?
B_TRUE : B_FALSE; + mutex_exit(&zfsvfs->z_hold_locks[i]); + + return (held); +} + +static znode_hold_t * +zfs_znode_hold_enter(zfsvfs_t *zfsvfs, uint64_t obj) +{ + znode_hold_t *zh, *zh_new, search; + int i = ZFS_OBJ_HASH(zfsvfs, obj); + boolean_t found = B_FALSE; + + zh_new = kmem_cache_alloc(znode_hold_cache, KM_SLEEP); + zh_new->zh_obj = obj; + search.zh_obj = obj; + + mutex_enter(&zfsvfs->z_hold_locks[i]); + zh = avl_find(&zfsvfs->z_hold_trees[i], &search, NULL); + if (likely(zh == NULL)) { + zh = zh_new; + avl_add(&zfsvfs->z_hold_trees[i], zh); + } else { + ASSERT3U(zh->zh_obj, ==, obj); + found = B_TRUE; + } + zfs_refcount_add(&zh->zh_refcount, NULL); + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (found == B_TRUE) + kmem_cache_free(znode_hold_cache, zh_new); + + ASSERT(MUTEX_NOT_HELD(&zh->zh_lock)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_enter(&zh->zh_lock); + + return (zh); +} + +static void +zfs_znode_hold_exit(zfsvfs_t *zfsvfs, znode_hold_t *zh) +{ + int i = ZFS_OBJ_HASH(zfsvfs, zh->zh_obj); + boolean_t remove = B_FALSE; + + ASSERT(zfs_znode_held(zfsvfs, zh->zh_obj)); + ASSERT3S(zfs_refcount_count(&zh->zh_refcount), >, 0); + mutex_exit(&zh->zh_lock); + + mutex_enter(&zfsvfs->z_hold_locks[i]); + if (zfs_refcount_remove(&zh->zh_refcount, NULL) == 0) { + avl_remove(&zfsvfs->z_hold_trees[i], zh); + remove = B_TRUE; + } + mutex_exit(&zfsvfs->z_hold_locks[i]); + + if (remove == B_TRUE) + kmem_cache_free(znode_hold_cache, zh); +} + +dev_t +zfs_cmpldev(uint64_t dev) +{ + return (dev); +} + +static void +zfs_znode_sa_init(zfsvfs_t *zfsvfs, znode_t *zp, + dmu_buf_t *db, dmu_object_type_t obj_type, sa_handle_t *sa_hdl) +{ + ASSERT(zfs_znode_held(zfsvfs, zp->z_id)); + + mutex_enter(&zp->z_lock); + + ASSERT(zp->z_sa_hdl == NULL); + ASSERT(zp->z_acl_cached == NULL); + if (sa_hdl == NULL) { + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, zp, + SA_HDL_SHARED, &zp->z_sa_hdl)); + } else { + zp->z_sa_hdl = sa_hdl; + sa_set_userp(sa_hdl, zp); + } + + zp->z_is_sa = (obj_type == DMU_OT_SA) ? B_TRUE : B_FALSE; + + mutex_exit(&zp->z_lock); +} + +void +zfs_znode_dmu_fini(znode_t *zp) +{ + ASSERT(zfs_znode_held(ZTOZSB(zp), zp->z_id) || zp->z_unlinked || + RW_WRITE_HELD(&ZTOZSB(zp)->z_teardown_inactive_lock)); + + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; +} + +/* + * Called by new_inode() to allocate a new inode. + */ +int +zfs_inode_alloc(struct super_block *sb, struct inode **ip) +{ + znode_t *zp; + + zp = kmem_cache_alloc(znode_cache, KM_SLEEP); + *ip = ZTOI(zp); + + return (0); +} + +/* + * Called in multiple places when an inode should be destroyed. 
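+ * On Linux this is reached via the super_operations ->destroy_inode
+ * callback once the last reference is dropped.  A minimal sketch of the
+ * assumed zpl glue:
+ *
+ *	static void
+ *	zpl_inode_destroy(struct inode *ip)
+ *	{
+ *		zfs_inode_destroy(ip);
+ *	}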
+ */ +void +zfs_inode_destroy(struct inode *ip) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + + mutex_enter(&zfsvfs->z_znodes_lock); + if (list_link_active(&zp->z_link_node)) { + list_remove(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes--; + } + mutex_exit(&zfsvfs->z_znodes_lock); + + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + + kmem_cache_free(znode_cache, zp); +} + +static void +zfs_inode_set_ops(zfsvfs_t *zfsvfs, struct inode *ip) +{ + uint64_t rdev = 0; + + switch (ip->i_mode & S_IFMT) { + case S_IFREG: + ip->i_op = &zpl_inode_operations; + ip->i_fop = &zpl_file_operations; + ip->i_mapping->a_ops = &zpl_address_space_operations; + break; + + case S_IFDIR: + ip->i_op = &zpl_dir_inode_operations; + ip->i_fop = &zpl_dir_file_operations; + ITOZ(ip)->z_zn_prefetch = B_TRUE; + break; + + case S_IFLNK: + ip->i_op = &zpl_symlink_inode_operations; + break; + + /* + * rdev is only stored in a SA only for device files. + */ + case S_IFCHR: + case S_IFBLK: + (void) sa_lookup(ITOZ(ip)->z_sa_hdl, SA_ZPL_RDEV(zfsvfs), &rdev, + sizeof (rdev)); + /*FALLTHROUGH*/ + case S_IFIFO: + case S_IFSOCK: + init_special_inode(ip, ip->i_mode, rdev); + ip->i_op = &zpl_special_inode_operations; + break; + + default: + zfs_panic_recover("inode %llu has invalid mode: 0x%x\n", + (u_longlong_t)ip->i_ino, ip->i_mode); + + /* Assume the inode is a file and attempt to continue */ + ip->i_mode = S_IFREG | 0644; + ip->i_op = &zpl_inode_operations; + ip->i_fop = &zpl_file_operations; + ip->i_mapping->a_ops = &zpl_address_space_operations; + break; + } +} + +static void +zfs_set_inode_flags(znode_t *zp, struct inode *ip) +{ + /* + * Linux and Solaris have different sets of file attributes, so we + * restrict this conversion to the intersection of the two. + */ +#ifdef HAVE_INODE_SET_FLAGS + unsigned int flags = 0; + if (zp->z_pflags & ZFS_IMMUTABLE) + flags |= S_IMMUTABLE; + if (zp->z_pflags & ZFS_APPENDONLY) + flags |= S_APPEND; + + inode_set_flags(ip, flags, S_IMMUTABLE|S_APPEND); +#else + if (zp->z_pflags & ZFS_IMMUTABLE) + ip->i_flags |= S_IMMUTABLE; + else + ip->i_flags &= ~S_IMMUTABLE; + + if (zp->z_pflags & ZFS_APPENDONLY) + ip->i_flags |= S_APPEND; + else + ip->i_flags &= ~S_APPEND; +#endif +} + +/* + * Update the embedded inode given the znode. We should work toward + * eliminating this function as soon as possible by removing values + * which are duplicated between the znode and inode. If the generic + * inode has the correct field it should be used, and the ZFS code + * updated to access the inode. This can be done incrementally. + */ +void +zfs_inode_update(znode_t *zp) +{ + zfsvfs_t *zfsvfs; + struct inode *ip; + uint32_t blksize; + u_longlong_t i_blocks; + + ASSERT(zp != NULL); + zfsvfs = ZTOZSB(zp); + ip = ZTOI(zp); + + /* Skip .zfs control nodes which do not exist on disk. */ + if (zfsctl_is_node(ip)) + return; + + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &blksize, &i_blocks); + + spin_lock(&ip->i_lock); + ip->i_blocks = i_blocks; + i_size_write(ip, zp->z_size); + spin_unlock(&ip->i_lock); +} + + +/* + * Construct a znode+inode and initialize. 
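+ * The backing struct inode is obtained from new_inode() on the zfsvfs
+ * super block, which reaches zfs_inode_alloc() above through
+ * sb->s_op->alloc_inode; ITOZ()/ZTOI() convert between the two views of
+ * the same allocation.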
+ * + * This does not do a call to dmu_set_user() that is + * up to the caller to do, in case you don't want to + * return the znode + */ +static znode_t * +zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, + dmu_object_type_t obj_type, sa_handle_t *hdl) +{ + znode_t *zp; + struct inode *ip; + uint64_t mode; + uint64_t parent; + uint64_t tmp_gen; + uint64_t links; + uint64_t z_uid, z_gid; + uint64_t atime[2], mtime[2], ctime[2]; + uint64_t projid = ZFS_DEFAULT_PROJID; + sa_bulk_attr_t bulk[11]; + int count = 0; + + ASSERT(zfsvfs != NULL); + + ip = new_inode(zfsvfs->z_sb); + if (ip == NULL) + return (NULL); + + zp = ITOZ(ip); + ASSERT(zp->z_dirlocks == NULL); + ASSERT3P(zp->z_acl_cached, ==, NULL); + ASSERT3P(zp->z_xattr_cached, ==, NULL); + zp->z_unlinked = B_FALSE; + zp->z_atime_dirty = B_FALSE; + zp->z_moved = B_FALSE; + zp->z_is_mapped = B_FALSE; + zp->z_is_ctldir = B_FALSE; + zp->z_is_stale = B_FALSE; + zp->z_suspended = B_FALSE; + zp->z_sa_hdl = NULL; + zp->z_mapcnt = 0; + zp->z_id = db->db_object; + zp->z_blksz = blksz; + zp->z_seq = 0x7A4653; + zp->z_sync_cnt = 0; + + zfs_znode_sa_init(zfsvfs, zp, db, obj_type, hdl); + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &z_uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &z_gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count) != 0 || tmp_gen == 0 || + (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + (zp->z_pflags & ZFS_PROJID) && + sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), &projid, 8) != 0)) { + if (hdl == NULL) + sa_handle_destroy(zp->z_sa_hdl); + zp->z_sa_hdl = NULL; + goto error; + } + + zp->z_projid = projid; + zp->z_mode = ip->i_mode = mode; + ip->i_generation = (uint32_t)tmp_gen; + ip->i_blkbits = SPA_MINBLOCKSHIFT; + set_nlink(ip, (uint32_t)links); + zfs_uid_write(ip, z_uid); + zfs_gid_write(ip, z_gid); + zfs_set_inode_flags(zp, ip); + + /* Cache the xattr parent id */ + if (zp->z_pflags & ZFS_XATTR) + zp->z_xattr_parent = parent; + + ZFS_TIME_DECODE(&ip->i_atime, atime); + ZFS_TIME_DECODE(&ip->i_mtime, mtime); + ZFS_TIME_DECODE(&ip->i_ctime, ctime); + + ip->i_ino = zp->z_id; + zfs_inode_update(zp); + zfs_inode_set_ops(zfsvfs, ip); + + /* + * The only way insert_inode_locked() can fail is if the ip->i_ino + * number is already hashed for this super block. This can never + * happen because the inode numbers map 1:1 with the object numbers. + * + * The one exception is rolling back a mounted file system, but in + * this case all the active inode are unhashed during the rollback. + */ + VERIFY3S(insert_inode_locked(ip), ==, 0); + + mutex_enter(&zfsvfs->z_znodes_lock); + list_insert_tail(&zfsvfs->z_all_znodes, zp); + zfsvfs->z_nr_znodes++; + membar_producer(); + mutex_exit(&zfsvfs->z_znodes_lock); + + unlock_new_inode(ip); + return (zp); + +error: + iput(ip); + return (NULL); +} + +/* + * Safely mark an inode dirty. 
Inodes which are part of a read-only + * file system or snapshot may not be dirtied. + */ +void +zfs_mark_inode_dirty(struct inode *ip) +{ + zfsvfs_t *zfsvfs = ITOZSB(ip); + + if (zfs_is_readonly(zfsvfs) || dmu_objset_is_snapshot(zfsvfs->z_os)) + return; + + mark_inode_dirty(ip); +} + +static uint64_t empty_xattr; +static uint64_t pad[4]; +static zfs_acl_phys_t acl_phys; +/* + * Create a new DMU object to hold a zfs znode. + * + * IN: dzp - parent directory for new znode + * vap - file attributes for new znode + * tx - dmu transaction id for zap operations + * cr - credentials of caller + * flag - flags: + * IS_ROOT_NODE - new object will be root + * IS_TMPFILE - new object is of O_TMPFILE + * IS_XATTR - new object is an attribute + * acl_ids - ACL related attributes + * + * OUT: zpp - allocated znode (set to dzp if IS_ROOT_NODE) + * + */ +void +zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, + uint_t flag, znode_t **zpp, zfs_acl_ids_t *acl_ids) +{ + uint64_t crtime[2], atime[2], mtime[2], ctime[2]; + uint64_t mode, size, links, parent, pflags; + uint64_t projid = ZFS_DEFAULT_PROJID; + uint64_t rdev = 0; + zfsvfs_t *zfsvfs = ZTOZSB(dzp); + dmu_buf_t *db; + inode_timespec_t now; + uint64_t gen, obj; + int bonuslen; + int dnodesize; + sa_handle_t *sa_hdl; + dmu_object_type_t obj_type; + sa_bulk_attr_t *sa_attrs; + int cnt = 0; + zfs_acl_locator_cb_t locate = { 0 }; + znode_hold_t *zh; + + if (zfsvfs->z_replay) { + obj = vap->va_nodeid; + now = vap->va_ctime; /* see zfs_replay_create() */ + gen = vap->va_nblocks; /* ditto */ + dnodesize = vap->va_fsid; /* ditto */ + } else { + obj = 0; + gethrestime(&now); + gen = dmu_tx_get_txg(tx); + dnodesize = dmu_objset_dnodesize(zfsvfs->z_os); + } + + if (dnodesize == 0) + dnodesize = DNODE_MIN_SIZE; + + obj_type = zfsvfs->z_use_sa ? DMU_OT_SA : DMU_OT_ZNODE; + + bonuslen = (obj_type == DMU_OT_SA) ? + DN_BONUS_SIZE(dnodesize) : ZFS_OLD_ZNODE_PHYS_SIZE; + + /* + * Create a new DMU object. + */ + /* + * There's currently no mechanism for pre-reading the blocks that will + * be needed to allocate a new object, so we accept the small chance + * that there will be an i/o error and we will fail one of the + * assertions below. + */ + if (S_ISDIR(vap->va_mode)) { + if (zfsvfs->z_replay) { + VERIFY0(zap_create_claim_norm_dnsize(zfsvfs->z_os, obj, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = zap_create_norm_dnsize(zfsvfs->z_os, + zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, + obj_type, bonuslen, dnodesize, tx); + } + } else { + if (zfsvfs->z_replay) { + VERIFY0(dmu_object_claim_dnsize(zfsvfs->z_os, obj, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx)); + } else { + obj = dmu_object_alloc_dnsize(zfsvfs->z_os, + DMU_OT_PLAIN_FILE_CONTENTS, 0, + obj_type, bonuslen, dnodesize, tx); + } + } + + zh = zfs_znode_hold_enter(zfsvfs, obj); + VERIFY0(sa_buf_hold(zfsvfs->z_os, obj, NULL, &db)); + + /* + * If this is the root, fix up the half-initialized parent pointer + * to reference the just-allocated physical data area. + */ + if (flag & IS_ROOT_NODE) { + dzp->z_id = obj; + } + + /* + * If parent is an xattr, so am I. + */ + if (dzp->z_pflags & ZFS_XATTR) { + flag |= IS_XATTR; + } + + if (zfsvfs->z_use_fuids) + pflags = ZFS_ARCHIVE | ZFS_AV_MODIFIED; + else + pflags = 0; + + if (S_ISDIR(vap->va_mode)) { + size = 2; /* contents ("." and "..") */ + links = 2; + } else { + size = 0; + links = (flag & IS_TMPFILE) ? 
0 : 1; + } + + if (S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode)) + rdev = vap->va_rdev; + + parent = dzp->z_id; + mode = acl_ids->z_mode; + if (flag & IS_XATTR) + pflags |= ZFS_XATTR; + + if (S_ISREG(vap->va_mode) || S_ISDIR(vap->va_mode)) { + /* + * With ZFS_PROJID flag, we can easily know whether there is + * project ID stored on disk or not. See zfs_space_delta_cb(). + */ + if (obj_type != DMU_OT_ZNODE && + dmu_objset_projectquota_enabled(zfsvfs->z_os)) + pflags |= ZFS_PROJID; + + /* + * Inherit project ID from parent if required. + */ + projid = zfs_inherit_projid(dzp); + if (dzp->z_pflags & ZFS_PROJINHERIT) + pflags |= ZFS_PROJINHERIT; + } + + /* + * No execs denied will be determined when zfs_mode_compute() is called. + */ + pflags |= acl_ids->z_aclp->z_hints & + (ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|ZFS_ACL_AUTO_INHERIT| + ZFS_ACL_DEFAULTED|ZFS_ACL_PROTECTED); + + ZFS_TIME_ENCODE(&now, crtime); + ZFS_TIME_ENCODE(&now, ctime); + + if (vap->va_mask & ATTR_ATIME) { + ZFS_TIME_ENCODE(&vap->va_atime, atime); + } else { + ZFS_TIME_ENCODE(&now, atime); + } + + if (vap->va_mask & ATTR_MTIME) { + ZFS_TIME_ENCODE(&vap->va_mtime, mtime); + } else { + ZFS_TIME_ENCODE(&now, mtime); + } + + /* Now add in all of the "SA" attributes */ + VERIFY(0 == sa_handle_get_from_db(zfsvfs->z_os, db, NULL, SA_HDL_SHARED, + &sa_hdl)); + + /* + * Setup the array of attributes to be replaced/set on the new file + * + * order for DMU_OT_ZNODE is critical since it needs to be constructed + * in the old znode_phys_t format. Don't change this ordering + */ + sa_attrs = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + } else { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MODE(zfsvfs), + NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_SIZE(zfsvfs), + NULL, &size, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GEN(zfsvfs), + NULL, &gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), + NULL, &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), + NULL, &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ATIME(zfsvfs), + NULL, &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_MTIME(zfsvfs), + NULL, &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_CRTIME(zfsvfs), + NULL, &crtime, 16); + } + + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_XATTR(zfsvfs), NULL, + &empty_xattr, 8); + } else if (dmu_objset_projectquota_enabled(zfsvfs->z_os) && + pflags & ZFS_PROJID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PROJID(zfsvfs), + NULL, &projid, 8); + } + if (obj_type == DMU_OT_ZNODE || + 
(S_ISBLK(vap->va_mode) || S_ISCHR(vap->va_mode))) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_RDEV(zfsvfs), + NULL, &rdev, 8); + } + if (obj_type == DMU_OT_ZNODE) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_FLAGS(zfsvfs), + NULL, &pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_UID(zfsvfs), NULL, + &acl_ids->z_fuid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_GID(zfsvfs), NULL, + &acl_ids->z_fgid, 8); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_PAD(zfsvfs), NULL, pad, + sizeof (uint64_t) * 4); + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &acl_phys, sizeof (zfs_acl_phys_t)); + } else if (acl_ids->z_aclp->z_version >= ZFS_ACL_VERSION_FUID) { + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &acl_ids->z_aclp->z_acl_count, 8); + locate.cb_aclp = acl_ids->z_aclp; + SA_ADD_BULK_ATTR(sa_attrs, cnt, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, + acl_ids->z_aclp->z_acl_bytes); + mode = zfs_mode_compute(mode, acl_ids->z_aclp, &pflags, + acl_ids->z_fuid, acl_ids->z_fgid); + } + + VERIFY(sa_replace_all_by_template(sa_hdl, sa_attrs, cnt, tx) == 0); + + if (!(flag & IS_ROOT_NODE)) { + /* + * The call to zfs_znode_alloc() may fail if memory is low + * via the call path: alloc_inode() -> inode_init_always() -> + * security_inode_alloc() -> inode_alloc_security(). Since + * the existing code is written such that zfs_mknode() can + * not fail retry until sufficient memory has been reclaimed. + */ + do { + *zpp = zfs_znode_alloc(zfsvfs, db, 0, obj_type, sa_hdl); + } while (*zpp == NULL); + + VERIFY(*zpp != NULL); + VERIFY(dzp != NULL); + } else { + /* + * If we are creating the root node, the "parent" we + * passed in is the znode for the root. + */ + *zpp = dzp; + + (*zpp)->z_sa_hdl = sa_hdl; + } + + (*zpp)->z_pflags = pflags; + (*zpp)->z_mode = ZTOI(*zpp)->i_mode = mode; + (*zpp)->z_dnodesize = dnodesize; + (*zpp)->z_projid = projid; + + if (obj_type == DMU_OT_ZNODE || + acl_ids->z_aclp->z_version < ZFS_ACL_VERSION_FUID) { + VERIFY0(zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + } + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + zfs_znode_hold_exit(zfsvfs, zh); +} + +/* + * Update in-core attributes. It is assumed the caller will be doing an + * sa_bulk_update to push the changes out. 
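+ *
+ * A typical caller therefore looks roughly like this (a sketch only; the
+ * real call site in zfs_setattr() gathers many more attributes):
+ *
+ *	sa_bulk_attr_t bulk[1];
+ *	int count = 0;
+ *
+ *	zfs_xvattr_set(zp, xvap, tx);
+ *	SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL,
+ *	    &zp->z_pflags, 8);
+ *	VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx));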
+ */
+void
+zfs_xvattr_set(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx)
+{
+	xoptattr_t *xoap;
+	boolean_t update_inode = B_FALSE;
+
+	xoap = xva_getxoptattr(xvap);
+	ASSERT(xoap);
+
+	if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) {
+		uint64_t times[2];
+		ZFS_TIME_ENCODE(&xoap->xoa_createtime, times);
+		(void) sa_update(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)),
+		    &times, sizeof (times), tx);
+		XVA_SET_RTN(xvap, XAT_CREATETIME);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_READONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_READONLY, xoap->xoa_readonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_READONLY);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) {
+		ZFS_ATTR_SET(zp, ZFS_HIDDEN, xoap->xoa_hidden,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_HIDDEN);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) {
+		ZFS_ATTR_SET(zp, ZFS_SYSTEM, xoap->xoa_system,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SYSTEM);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) {
+		ZFS_ATTR_SET(zp, ZFS_ARCHIVE, xoap->xoa_archive,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_ARCHIVE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) {
+		ZFS_ATTR_SET(zp, ZFS_IMMUTABLE, xoap->xoa_immutable,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_IMMUTABLE);
+
+		update_inode = B_TRUE;
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) {
+		ZFS_ATTR_SET(zp, ZFS_NOUNLINK, xoap->xoa_nounlink,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NOUNLINK);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) {
+		ZFS_ATTR_SET(zp, ZFS_APPENDONLY, xoap->xoa_appendonly,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_APPENDONLY);
+
+		update_inode = B_TRUE;
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) {
+		ZFS_ATTR_SET(zp, ZFS_NODUMP, xoap->xoa_nodump,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_NODUMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) {
+		ZFS_ATTR_SET(zp, ZFS_OPAQUE, xoap->xoa_opaque,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OPAQUE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_QUARANTINED,
+		    xoap->xoa_av_quarantined, zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_QUARANTINED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) {
+		ZFS_ATTR_SET(zp, ZFS_AV_MODIFIED, xoap->xoa_av_modified,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_AV_MODIFIED);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) {
+		zfs_sa_set_scanstamp(zp, xvap, tx);
+		XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_REPARSE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) {
+		ZFS_ATTR_SET(zp, ZFS_OFFLINE, xoap->xoa_offline,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_OFFLINE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) {
+		ZFS_ATTR_SET(zp, ZFS_SPARSE, xoap->xoa_sparse,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_SPARSE);
+	}
+	if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) {
+		ZFS_ATTR_SET(zp, ZFS_PROJINHERIT, xoap->xoa_projinherit,
+		    zp->z_pflags, tx);
+		XVA_SET_RTN(xvap, XAT_PROJINHERIT);
+	}
+
+	if (update_inode)
+		zfs_set_inode_flags(zp, ZTOI(zp));
+}
+
+int
+zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp)
+{
+	dmu_object_info_t doi;
+	dmu_buf_t *db;
+	znode_t *zp;
+	znode_hold_t *zh;
+	int err;
+	sa_handle_t *hdl;
+
+	*zpp = NULL;
+
+again:
+	zh = zfs_znode_hold_enter(zfsvfs, obj_num);
+
+	err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db);
+	if (err) {
+		zfs_znode_hold_exit(zfsvfs, zh);
+		return (err);
+	}
+
+	dmu_object_info_from_db(db, &doi);
+	if (doi.doi_bonus_type != DMU_OT_SA &&
+	    (doi.doi_bonus_type != DMU_OT_ZNODE ||
+	    (doi.doi_bonus_type == DMU_OT_ZNODE &&
+ doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + hdl = dmu_buf_get_user(db); + if (hdl != NULL) { + zp = sa_get_userdata(hdl); + + + /* + * Since "SA" does immediate eviction we + * should never find a sa handle that doesn't + * know about the znode. + */ + + ASSERT3P(zp, !=, NULL); + + mutex_enter(&zp->z_lock); + ASSERT3U(zp->z_id, ==, obj_num); + /* + * If zp->z_unlinked is set, the znode is already marked + * for deletion and should not be discovered. Check this + * after checking igrab() due to fsetxattr() & O_TMPFILE. + * + * If igrab() returns NULL the VFS has independently + * determined the inode should be evicted and has + * called iput_final() to start the eviction process. + * The SA handle is still valid but because the VFS + * requires that the eviction succeed we must drop + * our locks and references to allow the eviction to + * complete. The zfs_zget() may then be retried. + * + * This unlikely case could be optimized by registering + * a sops->drop_inode() callback. The callback would + * need to detect the active SA hold thereby informing + * the VFS that this inode should not be evicted. + */ + if (igrab(ZTOI(zp)) == NULL) { + if (zp->z_unlinked) + err = SET_ERROR(ENOENT); + else + err = SET_ERROR(EAGAIN); + } else { + *zpp = zp; + err = 0; + } + + mutex_exit(&zp->z_lock); + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + + if (err == EAGAIN) { + /* inode might need this to finish evict */ + cond_resched(); + goto again; + } + return (err); + } + + /* + * Not found create new znode/vnode but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. This is checked for in zfs_znode_alloc() + * + * if zfs_znode_alloc() fails it will drop the hold on the + * bonus buffer. + */ + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size, + doi.doi_bonus_type, NULL); + if (zp == NULL) { + err = SET_ERROR(ENOENT); + } else { + *zpp = zp; + } + zfs_znode_hold_exit(zfsvfs, zh); + return (err); +} + +int +zfs_rezget(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_object_info_t doi; + dmu_buf_t *db; + uint64_t obj_num = zp->z_id; + uint64_t mode; + uint64_t links; + sa_bulk_attr_t bulk[10]; + int err; + int count = 0; + uint64_t gen; + uint64_t z_uid, z_gid; + uint64_t atime[2], mtime[2], ctime[2]; + uint64_t projid = ZFS_DEFAULT_PROJID; + znode_hold_t *zh; + + /* + * skip ctldir, otherwise they will always get invalidated. This will + * cause funny behaviour for the mounted snapdirs. Especially for + * Linux >= 3.18, d_invalidate will detach the mountpoint and prevent + * anyone automount it again as long as someone is still using the + * detached mount. 
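+ *
+ * zfs_rezget() itself runs when a suspended filesystem is resumed (e.g.
+ * after 'zfs rollback' or 'zfs receive' into a mounted dataset) to
+ * revalidate every cached znode against the possibly rewritten objset.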
+ */ + if (zp->z_is_ctldir) + return (0); + + zh = zfs_znode_hold_enter(zfsvfs, obj_num); + + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + + rw_enter(&zp->z_xattr_lock, RW_WRITER); + if (zp->z_xattr_cached) { + nvlist_free(zp->z_xattr_cached); + zp->z_xattr_cached = NULL; + } + rw_exit(&zp->z_xattr_lock); + + ASSERT(zp->z_sa_hdl == NULL); + err = sa_buf_hold(zfsvfs->z_os, obj_num, NULL, &db); + if (err) { + zfs_znode_hold_exit(zfsvfs, zh); + return (err); + } + + dmu_object_info_from_db(db, &doi); + if (doi.doi_bonus_type != DMU_OT_SA && + (doi.doi_bonus_type != DMU_OT_ZNODE || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t)))) { + sa_buf_rele(db, NULL); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EINVAL)); + } + + zfs_znode_sa_init(zfsvfs, zp, db, doi.doi_bonus_type, NULL); + + /* reload cached values */ + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, sizeof (gen)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, sizeof (zp->z_size)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_LINKS(zfsvfs), NULL, + &links, sizeof (links)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, sizeof (zp->z_pflags)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &z_uid, sizeof (z_uid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &z_gid, sizeof (z_gid)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, sizeof (mode)); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + + if (sa_bulk_lookup(zp->z_sa_hdl, bulk, count)) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + if (dmu_objset_projectquota_enabled(zfsvfs->z_os)) { + err = sa_lookup(zp->z_sa_hdl, SA_ZPL_PROJID(zfsvfs), + &projid, 8); + if (err != 0 && err != ENOENT) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(err)); + } + } + + zp->z_projid = projid; + zp->z_mode = ZTOI(zp)->i_mode = mode; + zfs_uid_write(ZTOI(zp), z_uid); + zfs_gid_write(ZTOI(zp), z_gid); + + ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); + ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); + ZFS_TIME_DECODE(&ZTOI(zp)->i_ctime, ctime); + + if ((uint32_t)gen != ZTOI(zp)->i_generation) { + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); + return (SET_ERROR(EIO)); + } + + set_nlink(ZTOI(zp), (uint32_t)links); + zfs_set_inode_flags(zp, ZTOI(zp)); + + zp->z_blksz = doi.doi_data_block_size; + zp->z_atime_dirty = B_FALSE; + zfs_inode_update(zp); + + /* + * If the file has zero links, then it has been unlinked on the send + * side and it must be in the received unlinked set. + * We call zfs_znode_dmu_fini() now to prevent any accesses to the + * stale data and to prevent automatic removal of the file in + * zfs_zinactive(). The file will be removed either when it is removed + * on the send side and the next incremental stream is received or + * when the unlinked set gets processed. 
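+ * (The received unlinked set is eventually drained by
+ * zfs_unlinked_drain() when the filesystem is mounted again.)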
+ */ + zp->z_unlinked = (ZTOI(zp)->i_nlink == 0); + if (zp->z_unlinked) + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); + + return (0); +} + +void +zfs_znode_delete(znode_t *zp, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + objset_t *os = zfsvfs->z_os; + uint64_t obj = zp->z_id; + uint64_t acl_obj = zfs_external_acl(zp); + znode_hold_t *zh; + + zh = zfs_znode_hold_enter(zfsvfs, obj); + if (acl_obj) { + VERIFY(!zp->z_is_sa); + VERIFY(0 == dmu_object_free(os, acl_obj, tx)); + } + VERIFY(0 == dmu_object_free(os, obj, tx)); + zfs_znode_dmu_fini(zp); + zfs_znode_hold_exit(zfsvfs, zh); +} + +void +zfs_zinactive(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + uint64_t z_id = zp->z_id; + znode_hold_t *zh; + + ASSERT(zp->z_sa_hdl); + + /* + * Don't allow a zfs_zget() while were trying to release this znode. + */ + zh = zfs_znode_hold_enter(zfsvfs, z_id); + + mutex_enter(&zp->z_lock); + + /* + * If this was the last reference to a file with no links, remove + * the file from the file system unless the file system is mounted + * read-only. That can happen, for example, if the file system was + * originally read-write, the file was opened, then unlinked and + * the file system was made read-only before the file was finally + * closed. The file will remain in the unlinked set. + */ + if (zp->z_unlinked) { + ASSERT(!zfsvfs->z_issnap); + if (!zfs_is_readonly(zfsvfs) && !zfs_unlink_suspend_progress) { + mutex_exit(&zp->z_lock); + zfs_znode_hold_exit(zfsvfs, zh); + zfs_rmnode(zp); + return; + } + } + + mutex_exit(&zp->z_lock); + zfs_znode_dmu_fini(zp); + + zfs_znode_hold_exit(zfsvfs, zh); +} + +#if defined(HAVE_INODE_TIMESPEC64_TIMES) +#define zfs_compare_timespec timespec64_compare +#else +#define zfs_compare_timespec timespec_compare +#endif + +/* + * Determine whether the znode's atime must be updated. The logic mostly + * duplicates the Linux kernel's relatime_need_update() functionality. + * This function is only called if the underlying filesystem actually has + * atime updates enabled. + */ +boolean_t +zfs_relatime_need_update(const struct inode *ip) +{ + inode_timespec_t now; + + gethrestime(&now); + /* + * In relatime mode, only update the atime if the previous atime + * is earlier than either the ctime or mtime or if at least a day + * has passed since the last update of atime. + */ + if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) + return (B_TRUE); + + if (zfs_compare_timespec(&ip->i_ctime, &ip->i_atime) >= 0) + return (B_TRUE); + + if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Prepare to update znode time stamps. + * + * IN: zp - znode requiring timestamp update + * flag - ATTR_MTIME, ATTR_CTIME flags + * + * OUT: zp - z_seq + * mtime - new mtime + * ctime - new ctime + * + * Note: We don't update atime here, because we rely on Linux VFS to do + * atime updating. + */ +void +zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], + uint64_t ctime[2]) +{ + inode_timespec_t now; + + gethrestime(&now); + + zp->z_seq++; + + if (flag & ATTR_MTIME) { + ZFS_TIME_ENCODE(&now, mtime); + ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); + if (ZTOZSB(zp)->z_use_fuids) { + zp->z_pflags |= (ZFS_ARCHIVE | + ZFS_AV_MODIFIED); + } + } + + if (flag & ATTR_CTIME) { + ZFS_TIME_ENCODE(&now, ctime); + ZFS_TIME_DECODE(&(ZTOI(zp)->i_ctime), ctime); + if (ZTOZSB(zp)->z_use_fuids) + zp->z_pflags |= ZFS_ARCHIVE; + } +} + +/* + * Grow the block size for a file. 
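+ * For example (a sketch of the common case): a file whose only block is
+ * 4K, on a dataset with recordsize=128K, being extended to offset 96K is
+ * regrown to a single 96K block here.  Once a file has more than one
+ * block its block size can no longer change.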
+ * + * IN: zp - znode of file to free data in. + * size - requested block size + * tx - open transaction. + * + * NOTE: this function assumes that the znode is write locked. + */ +void +zfs_grow_blocksize(znode_t *zp, uint64_t size, dmu_tx_t *tx) +{ + int error; + u_longlong_t dummy; + + if (size <= zp->z_blksz) + return; + /* + * If the file size is already greater than the current blocksize, + * we will not grow. If there is more than one block in a file, + * the blocksize cannot change. + */ + if (zp->z_blksz && zp->z_size > zp->z_blksz) + return; + + error = dmu_object_set_blocksize(ZTOZSB(zp)->z_os, zp->z_id, + size, 0, tx); + + if (error == ENOTSUP) + return; + ASSERT0(error); + + /* What blocksize did we actually get? */ + dmu_object_size_from_db(sa_get_db(zp->z_sa_hdl), &zp->z_blksz, &dummy); +} + +/* + * Increase the file length + * + * IN: zp - znode of file to free data in. + * end - new end-of-file + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_extend(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + uint64_t newblksz; + int error; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end <= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + if (end > zp->z_blksz && + (!ISP2(zp->z_blksz) || zp->z_blksz < zfsvfs->z_max_blksz)) { + /* + * We are growing the file past the current block size. + */ + if (zp->z_blksz > ZTOZSB(zp)->z_max_blksz) { + /* + * File's blocksize is already larger than the + * "recordsize" property. Only let it grow to + * the next power of 2. + */ + ASSERT(!ISP2(zp->z_blksz)); + newblksz = MIN(end, 1 << highbit64(zp->z_blksz)); + } else { + newblksz = MIN(end, ZTOZSB(zp)->z_max_blksz); + } + dmu_tx_hold_write(tx, zp->z_id, 0, newblksz); + } else { + newblksz = 0; + } + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + if (newblksz) + zfs_grow_blocksize(zp, newblksz, tx); + + zp->z_size = end; + + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(ZTOZSB(zp)), + &zp->z_size, sizeof (zp->z_size), tx)); + + zfs_rangelock_exit(lr); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * zfs_zero_partial_page - Modeled after update_pages() but + * with different arguments and semantics for use by zfs_freesp(). + * + * Zeroes a piece of a single page cache entry for zp at offset + * start and length len. + * + * Caller must acquire a range lock on the file for the region + * being zeroed in order that the ARC and page cache stay in sync. + */ +static void +zfs_zero_partial_page(znode_t *zp, uint64_t start, uint64_t len) +{ + struct address_space *mp = ZTOI(zp)->i_mapping; + struct page *pp; + int64_t off; + void *pb; + + ASSERT((start & PAGE_MASK) == ((start + len - 1) & PAGE_MASK)); + + off = start & (PAGE_SIZE - 1); + start &= PAGE_MASK; + + pp = find_lock_page(mp, start >> PAGE_SHIFT); + if (pp) { + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + pb = kmap(pp); + bzero(pb + off, len); + kunmap(pp); + + if (mapping_writably_mapped(mp)) + flush_dcache_page(pp); + + mark_page_accessed(pp); + SetPageUptodate(pp); + ClearPageError(pp); + unlock_page(pp); + put_page(pp); + } +} + +/* + * Free space in a file. 
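+ * If the file is mmap()ed the punched range must also be cleared from
+ * the page cache.  A worked example of the logic below, assuming 4K
+ * pages: punching off=3000, len=3000 gives first_page=1 and last_page=1,
+ * so no whole page is truncated; the head [3000,4096) is zeroed within
+ * page 0 and the tail [4096,6000) within page 1 via
+ * zfs_zero_partial_page().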
+ * + * IN: zp - znode of file to free data in. + * off - start of section to free. + * len - length of section to free. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_free_range(znode_t *zp, uint64_t off, uint64_t len) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zfs_locked_range_t *lr; + int error; + + /* + * Lock the range being freed. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, off, len, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (off >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + if (off + len > zp->z_size) + len = zp->z_size - off; + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, off, len); + + /* + * Zero partial page cache entries. This must be done under a + * range lock in order to keep the ARC and page cache in sync. + */ + if (zp->z_is_mapped) { + loff_t first_page, last_page, page_len; + loff_t first_page_offset, last_page_offset; + + /* first possible full page in hole */ + first_page = (off + PAGE_SIZE - 1) >> PAGE_SHIFT; + /* last page of hole */ + last_page = (off + len) >> PAGE_SHIFT; + + /* offset of first_page */ + first_page_offset = first_page << PAGE_SHIFT; + /* offset of last_page */ + last_page_offset = last_page << PAGE_SHIFT; + + /* truncate whole pages */ + if (last_page_offset > first_page_offset) { + truncate_inode_pages_range(ZTOI(zp)->i_mapping, + first_page_offset, last_page_offset - 1); + } + + /* truncate sub-page ranges */ + if (first_page > last_page) { + /* entire punched area within a single page */ + zfs_zero_partial_page(zp, off, len); + } else { + /* beginning of punched area at the end of a page */ + page_len = first_page_offset - off; + if (page_len > 0) + zfs_zero_partial_page(zp, off, page_len); + + /* end of punched area at the beginning of a page */ + page_len = off + len - last_page_offset; + if (page_len > 0) + zfs_zero_partial_page(zp, last_page_offset, + page_len); + } + } + zfs_rangelock_exit(lr); + + return (error); +} + +/* + * Truncate a file + * + * IN: zp - znode of file to free data in. + * end - new end-of-file. + * + * RETURN: 0 on success, error code on failure + */ +static int +zfs_trunc(znode_t *zp, uint64_t end) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_tx_t *tx; + zfs_locked_range_t *lr; + int error; + sa_bulk_attr_t bulk[2]; + int count = 0; + + /* + * We will change zp_size, lock the whole file. + */ + lr = zfs_rangelock_enter(&zp->z_rangelock, 0, UINT64_MAX, RL_WRITER); + + /* + * Nothing to do if file already at desired length. + */ + if (end >= zp->z_size) { + zfs_rangelock_exit(lr); + return (0); + } + + error = dmu_free_long_range(zfsvfs->z_os, zp->z_id, end, + DMU_OBJECT_END); + if (error) { + zfs_rangelock_exit(lr); + return (error); + } + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + zfs_rangelock_exit(lr); + return (error); + } + + zp->z_size = end; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_SIZE(zfsvfs), + NULL, &zp->z_size, sizeof (zp->z_size)); + + if (end == 0) { + zp->z_pflags &= ~ZFS_SPARSE; + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + } + VERIFY(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx) == 0); + + dmu_tx_commit(tx); + zfs_rangelock_exit(lr); + + return (0); +} + +/* + * Free space in a file + * + * IN: zp - znode of file to free data in. 
+ * off - start of range + * len - end of range (0 => EOF) + * flag - current file open mode flags. + * log - TRUE if this action should be logged + * + * RETURN: 0 on success, error code on failure + */ +int +zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) +{ + dmu_tx_t *tx; + zfsvfs_t *zfsvfs = ZTOZSB(zp); + zilog_t *zilog = zfsvfs->z_log; + uint64_t mode; + uint64_t mtime[2], ctime[2]; + sa_bulk_attr_t bulk[3]; + int count = 0; + int error; + + if ((error = sa_lookup(zp->z_sa_hdl, SA_ZPL_MODE(zfsvfs), &mode, + sizeof (mode))) != 0) + return (error); + + if (off > zp->z_size) { + error = zfs_extend(zp, off+len); + if (error == 0 && log) + goto log; + goto out; + } + + if (len == 0) { + error = zfs_trunc(zp, off); + } else { + if ((error = zfs_free_range(zp, off, len)) == 0 && + off + len > zp->z_size) + error = zfs_extend(zp, off+len); + } + if (error || !log) + goto out; +log: + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + zfs_sa_upgrade_txholds(tx, zp); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + goto out; + } + + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_FLAGS(zfsvfs), + NULL, &zp->z_pflags, 8); + zfs_tstamp_update_setup(zp, CONTENT_MODIFIED, mtime, ctime); + error = sa_bulk_update(zp->z_sa_hdl, bulk, count, tx); + ASSERT(error == 0); + + zfs_log_truncate(zilog, tx, TX_TRUNCATE, zp, off, len); + + dmu_tx_commit(tx); + + zfs_inode_update(zp); + error = 0; + +out: + /* + * Truncate the page cache - for file truncate operations, use + * the purpose-built API for truncations. For punching operations, + * the truncation is handled under a range lock in zfs_free_range. + */ + if (len == 0) + truncate_setsize(ZTOI(zp), off); + return (error); +} + +void +zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) +{ + struct super_block *sb; + zfsvfs_t *zfsvfs; + uint64_t moid, obj, sa_obj, version; + uint64_t sense = ZFS_CASE_SENSITIVE; + uint64_t norm = 0; + nvpair_t *elem; + int size; + int error; + int i; + znode_t *rootzp = NULL; + vattr_t vattr; + znode_t *zp; + zfs_acl_ids_t acl_ids; + + /* + * First attempt to create master node. + */ + /* + * In an empty objset, there are no blocks to read and thus + * there can be no i/o errors (which we assert below). + */ + moid = MASTER_NODE_OBJ; + error = zap_create_claim(os, moid, DMU_OT_MASTER_NODE, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + /* + * Set starting attributes. 
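+ * The zplprops nvlist is assembled by the create/receive ioctl paths and
+ * normally carries e.g. version, normalization, utf8only and
+ * casesensitivity; every value is expected to be a uint64 and, except
+ * for the version handled below, is written into the master node ZAP.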
+ */ + version = zfs_zpl_version_map(spa_version(dmu_objset_spa(os))); + elem = NULL; + while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { + /* For the moment we expect all zpl props to be uint64_ts */ + uint64_t val; + char *name; + + ASSERT(nvpair_type(elem) == DATA_TYPE_UINT64); + VERIFY(nvpair_value_uint64(elem, &val) == 0); + name = nvpair_name(elem); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { + if (val < version) + version = val; + } else { + error = zap_update(os, moid, name, 8, 1, &val, tx); + } + ASSERT(error == 0); + if (strcmp(name, zfs_prop_to_name(ZFS_PROP_NORMALIZE)) == 0) + norm = val; + else if (strcmp(name, zfs_prop_to_name(ZFS_PROP_CASE)) == 0) + sense = val; + } + ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); + + /* + * Create zap object used for SA attribute registration + */ + + if (version >= ZPL_VERSION_SA) { + sa_obj = zap_create(os, DMU_OT_SA_MASTER_NODE, + DMU_OT_NONE, 0, tx); + error = zap_add(os, moid, ZFS_SA_ATTRS, 8, 1, &sa_obj, tx); + ASSERT(error == 0); + } else { + sa_obj = 0; + } + /* + * Create a delete queue. + */ + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); + ASSERT(error == 0); + + /* + * Create root znode. Create minimal znode/inode/zfsvfs/sb + * to allow zfs_mknode to work. + */ + vattr.va_mask = ATTR_MODE|ATTR_UID|ATTR_GID; + vattr.va_mode = S_IFDIR|0755; + vattr.va_uid = crgetuid(cr); + vattr.va_gid = crgetgid(cr); + + rootzp = kmem_cache_alloc(znode_cache, KM_SLEEP); + rootzp->z_unlinked = B_FALSE; + rootzp->z_atime_dirty = B_FALSE; + rootzp->z_moved = B_FALSE; + rootzp->z_is_sa = USE_SA(version, os); + rootzp->z_pflags = 0; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + zfsvfs->z_os = os; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_version = version; + zfsvfs->z_use_fuids = USE_FUIDS(version, os); + zfsvfs->z_use_sa = USE_SA(version, os); + zfsvfs->z_norm = norm; + + sb = kmem_zalloc(sizeof (struct super_block), KM_SLEEP); + sb->s_fs_info = zfsvfs; + + ZTOI(rootzp)->i_sb = sb; + + error = sa_setup(os, sa_obj, zfs_attr_table, ZPL_END, + &zfsvfs->z_attr_table); + + ASSERT(error == 0); + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. 
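+ * (The U8_TEXTPREP_TOUPPER flag recorded in z_norm here is later handed
+ * to u8_strcmp()/u8_textprep_str() when directory entry names are
+ * normalized and compared.)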
+ */ + if (sense == ZFS_CASE_INSENSITIVE || sense == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + + size = MIN(1 << (highbit64(zfs_object_mutex_size)-1), ZFS_OBJ_MTX_MAX); + zfsvfs->z_hold_size = size; + zfsvfs->z_hold_trees = vmem_zalloc(sizeof (avl_tree_t) * size, + KM_SLEEP); + zfsvfs->z_hold_locks = vmem_zalloc(sizeof (kmutex_t) * size, KM_SLEEP); + for (i = 0; i != size; i++) { + avl_create(&zfsvfs->z_hold_trees[i], zfs_znode_hold_compare, + sizeof (znode_hold_t), offsetof(znode_hold_t, zh_node)); + mutex_init(&zfsvfs->z_hold_locks[i], NULL, MUTEX_DEFAULT, NULL); + } + + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, &acl_ids); + ASSERT3P(zp, ==, rootzp); + error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); + ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); + + atomic_set(&ZTOI(rootzp)->i_count, 0); + sa_handle_destroy(rootzp->z_sa_hdl); + kmem_cache_free(znode_cache, rootzp); + + for (i = 0; i != size; i++) { + avl_destroy(&zfsvfs->z_hold_trees[i]); + mutex_destroy(&zfsvfs->z_hold_locks[i]); + } + + mutex_destroy(&zfsvfs->z_znodes_lock); + + vmem_free(zfsvfs->z_hold_trees, sizeof (avl_tree_t) * size); + vmem_free(zfsvfs->z_hold_locks, sizeof (kmutex_t) * size); + kmem_free(sb, sizeof (struct super_block)); + kmem_free(zfsvfs, sizeof (zfsvfs_t)); +} +#endif /* _KERNEL */ + +static int +zfs_sa_setup(objset_t *osp, sa_attr_type_t **sa_table) +{ + uint64_t sa_obj = 0; + int error; + + error = zap_lookup(osp, MASTER_NODE_OBJ, ZFS_SA_ATTRS, 8, 1, &sa_obj); + if (error != 0 && error != ENOENT) + return (error); + + error = sa_setup(osp, sa_obj, zfs_attr_table, ZPL_END, sa_table); + return (error); +} + +static int +zfs_grab_sa_handle(objset_t *osp, uint64_t obj, sa_handle_t **hdlp, + dmu_buf_t **db, void *tag) +{ + dmu_object_info_t doi; + int error; + + if ((error = sa_buf_hold(osp, obj, tag, db)) != 0) + return (error); + + dmu_object_info_from_db(*db, &doi); + if ((doi.doi_bonus_type != DMU_OT_SA && + doi.doi_bonus_type != DMU_OT_ZNODE) || + (doi.doi_bonus_type == DMU_OT_ZNODE && + doi.doi_bonus_size < sizeof (znode_phys_t))) { + sa_buf_rele(*db, tag); + return (SET_ERROR(ENOTSUP)); + } + + error = sa_handle_get(osp, obj, NULL, SA_HDL_PRIVATE, hdlp); + if (error != 0) { + sa_buf_rele(*db, tag); + return (error); + } + + return (0); +} + +static void +zfs_release_sa_handle(sa_handle_t *hdl, dmu_buf_t *db, void *tag) +{ + sa_handle_destroy(hdl); + sa_buf_rele(db, tag); +} + +/* + * Given an object number, return its parent object number and whether + * or not the object is an extended attribute directory. 
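+ * This is the building block for zfs_obj_to_path() below, which walks
+ * parent pointers up to the root; 'zfs diff', for one, relies on it to
+ * turn object numbers back into path names.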
+ */
+static int
+zfs_obj_to_pobj(objset_t *osp, sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    uint64_t *pobjp, int *is_xattrdir)
+{
+	uint64_t parent;
+	uint64_t pflags;
+	uint64_t mode;
+	uint64_t parent_mode;
+	sa_bulk_attr_t bulk[3];
+	sa_handle_t *sa_hdl;
+	dmu_buf_t *sa_db;
+	int count = 0;
+	int error;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_PARENT], NULL,
+	    &parent, sizeof (parent));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_FLAGS], NULL,
+	    &pflags, sizeof (pflags));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &mode, sizeof (mode));
+
+	if ((error = sa_bulk_lookup(hdl, bulk, count)) != 0)
+		return (error);
+
+	/*
+	 * When a link is removed its parent pointer is not changed and will
+	 * be invalid.  There are two cases where a link is removed but the
+	 * file stays around: when it goes to the delete queue and when there
+	 * are additional links.
+	 */
+	error = zfs_grab_sa_handle(osp, parent, &sa_hdl, &sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	error = sa_lookup(sa_hdl, ZPL_MODE, &parent_mode, sizeof (parent_mode));
+	zfs_release_sa_handle(sa_hdl, sa_db, FTAG);
+	if (error != 0)
+		return (error);
+
+	*is_xattrdir = ((pflags & ZFS_XATTR) != 0) && S_ISDIR(mode);
+
+	/*
+	 * Extended attributes can be applied to files, directories, etc.
+	 * Otherwise the parent must be a directory.
+	 */
+	if (!*is_xattrdir && !S_ISDIR(parent_mode))
+		return (SET_ERROR(EINVAL));
+
+	*pobjp = parent;
+
+	return (0);
+}
+
+/*
+ * Given an object number, return some zpl level statistics
+ */
+static int
+zfs_obj_to_stats_impl(sa_handle_t *hdl, sa_attr_type_t *sa_table,
+    zfs_stat_t *sb)
+{
+	sa_bulk_attr_t bulk[4];
+	int count = 0;
+
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_MODE], NULL,
+	    &sb->zs_mode, sizeof (sb->zs_mode));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_GEN], NULL,
+	    &sb->zs_gen, sizeof (sb->zs_gen));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_LINKS], NULL,
+	    &sb->zs_links, sizeof (sb->zs_links));
+	SA_ADD_BULK_ATTR(bulk, count, sa_table[ZPL_CTIME], NULL,
+	    &sb->zs_ctime, sizeof (sb->zs_ctime));
+
+	return (sa_bulk_lookup(hdl, bulk, count));
+}
+
+static int
+zfs_obj_to_path_impl(objset_t *osp, uint64_t obj, sa_handle_t *hdl,
+    sa_attr_type_t *sa_table, char *buf, int len)
+{
+	sa_handle_t *sa_hdl;
+	sa_handle_t *prevhdl = NULL;
+	dmu_buf_t *prevdb = NULL;
+	dmu_buf_t *sa_db = NULL;
+	char *path = buf + len - 1;
+	int error;
+
+	*path = '\0';
+	sa_hdl = hdl;
+
+	uint64_t deleteq_obj;
+	VERIFY0(zap_lookup(osp, MASTER_NODE_OBJ,
+	    ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj));
+	error = zap_lookup_int(osp, deleteq_obj, obj);
+	if (error == 0) {
+		return (ESTALE);
+	} else if (error != ENOENT) {
+		return (error);
+	}
+	error = 0;
+
+	for (;;) {
+		uint64_t pobj = 0;
+		char component[MAXNAMELEN + 2];
+		size_t complen;
+		int is_xattrdir = 0;
+
+		if (prevdb)
+			zfs_release_sa_handle(prevhdl, prevdb, FTAG);
+
+		if ((error = zfs_obj_to_pobj(osp, sa_hdl, sa_table, &pobj,
+		    &is_xattrdir)) != 0)
+			break;
+
+		if (pobj == obj) {
+			if (path[0] != '/')
+				*--path = '/';
+			break;
+		}
+
+		component[0] = '/';
+		if (is_xattrdir) {
+			(void) sprintf(component + 1, "<xattrdir>");
+		} else {
+			error = zap_value_search(osp, pobj, obj,
+			    ZFS_DIRENT_OBJ(-1ULL), component + 1);
+			if (error != 0)
+				break;
+		}
+
+		complen = strlen(component);
+		path -= complen;
+		ASSERT(path >= buf);
+		bcopy(component, path, complen);
+		obj = pobj;
+
+		if (sa_hdl != hdl) {
+			prevhdl = sa_hdl;
+			prevdb = sa_db;
+		}
+		error = zfs_grab_sa_handle(osp, obj, &sa_hdl, &sa_db, FTAG);
+		if (error != 0) {
+
sa_hdl = prevhdl; + sa_db = prevdb; + break; + } + } + + if (sa_hdl != NULL && sa_hdl != hdl) { + ASSERT(sa_db != NULL); + zfs_release_sa_handle(sa_hdl, sa_db, FTAG); + } + + if (error == 0) + (void) memmove(buf, path, buf + len - path); + + return (error); +} + +int +zfs_obj_to_path(objset_t *osp, uint64_t obj, char *buf, int len) +{ + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +int +zfs_obj_to_stats(objset_t *osp, uint64_t obj, zfs_stat_t *sb, + char *buf, int len) +{ + char *path = buf + len - 1; + sa_attr_type_t *sa_table; + sa_handle_t *hdl; + dmu_buf_t *db; + int error; + + *path = '\0'; + + error = zfs_sa_setup(osp, &sa_table); + if (error != 0) + return (error); + + error = zfs_grab_sa_handle(osp, obj, &hdl, &db, FTAG); + if (error != 0) + return (error); + + error = zfs_obj_to_stats_impl(hdl, sa_table, sb); + if (error != 0) { + zfs_release_sa_handle(hdl, db, FTAG); + return (error); + } + + error = zfs_obj_to_path_impl(osp, obj, hdl, sa_table, buf, len); + + zfs_release_sa_handle(hdl, db, FTAG); + return (error); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_create_fs); +EXPORT_SYMBOL(zfs_obj_to_path); + +/* CSTYLED */ +module_param(zfs_object_mutex_size, uint, 0644); +MODULE_PARM_DESC(zfs_object_mutex_size, "Size of znode hold array"); +module_param(zfs_unlink_suspend_progress, int, 0644); +MODULE_PARM_DESC(zfs_unlink_suspend_progress, "Set to prevent async unlinks " +"(debug - leaks space into the unlinked set)"); +#endif diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c new file mode 100644 index 000000000000..96dabe55a138 --- /dev/null +++ b/module/os/linux/zfs/zio_crypt.c @@ -0,0 +1,2036 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file is responsible for handling all of the details of generating + * encryption parameters and performing encryption and authentication. + * + * BLOCK ENCRYPTION PARAMETERS: + * Encryption /Authentication Algorithm Suite (crypt): + * The encryption algorithm, mode, and key length we are going to use. We + * currently support AES in either GCM or CCM modes with 128, 192, and 256 bit + * keys. All authentication is currently done with SHA512-HMAC. + * + * Plaintext: + * The unencrypted data that we want to encrypt. + * + * Initialization Vector (IV): + * An initialization vector for the encryption algorithms. This is used to + * "tweak" the encryption algorithms so that two blocks of the same data are + * encrypted into different ciphertext outputs, thus obfuscating block patterns. 
+ * The supported encryption modes (AES-GCM and AES-CCM) require that an IV is + * never reused with the same encryption key. This value is stored unencrypted + * and must simply be provided to the decryption function. We use a 96 bit IV + * (as recommended by NIST) for all block encryption. For non-dedup blocks we + * derive the IV randomly. The first 64 bits of the IV are stored in the second + * word of DVA[2] and the remaining 32 bits are stored in the upper 32 bits of + * blk_fill. This is safe because encrypted blocks can't use the upper 32 bits + * of blk_fill. We only encrypt level 0 blocks, which normally have a fill count + * of 1. The only exception is for DMU_OT_DNODE objects, where the fill count of + * level 0 blocks is the number of allocated dnodes in that block. The on-disk + * format supports at most 2^15 slots per L0 dnode block, because the maximum + * block size is 16MB (2^24). In either case, for level 0 blocks this number + * will still be smaller than UINT32_MAX so it is safe to store the IV in the + * top 32 bits of blk_fill, while leaving the bottom 32 bits of the fill count + * for the dnode code. + * + * Master key: + * This is the most important secret data of an encrypted dataset. It is used + * along with the salt to generate that actual encryption keys via HKDF. We + * do not use the master key to directly encrypt any data because there are + * theoretical limits on how much data can actually be safely encrypted with + * any encryption mode. The master key is stored encrypted on disk with the + * user's wrapping key. Its length is determined by the encryption algorithm. + * For details on how this is stored see the block comment in dsl_crypt.c + * + * Salt: + * Used as an input to the HKDF function, along with the master key. We use a + * 64 bit salt, stored unencrypted in the first word of DVA[2]. Any given salt + * can be used for encrypting many blocks, so we cache the current salt and the + * associated derived key in zio_crypt_t so we do not need to derive it again + * needlessly. + * + * Encryption Key: + * A secret binary key, generated from an HKDF function used to encrypt and + * decrypt data. + * + * Message Authentication Code (MAC) + * The MAC is an output of authenticated encryption modes such as AES-GCM and + * AES-CCM. Its purpose is to ensure that an attacker cannot modify encrypted + * data on disk and return garbage to the application. Effectively, it is a + * checksum that can not be reproduced by an attacker. We store the MAC in the + * second 128 bits of blk_cksum, leaving the first 128 bits for a truncated + * regular checksum of the ciphertext which can be used for scrubbing. + * + * OBJECT AUTHENTICATION: + * Some object types, such as DMU_OT_MASTER_NODE cannot be encrypted because + * they contain some info that always needs to be readable. To prevent this + * data from being altered, we authenticate this data using SHA512-HMAC. This + * will produce a MAC (similar to the one produced via encryption) which can + * be used to verify the object was not modified. HMACs do not require key + * rotation or IVs, so we can keep up to the full 3 copies of authenticated + * data. + * + * ZIL ENCRYPTION: + * ZIL blocks have their bp written to disk ahead of the associated data, so we + * cannot store the MAC there as we normally do. For these blocks the MAC is + * stored in the embedded checksum within the zil_chain_t header. The salt and + * IV are generated for the block on bp allocation instead of at encryption + * time. 
+ * In addition, ZIL blocks have some pieces that must be left in plaintext
+ * for claiming even though all of the sensitive user data still needs to be
+ * encrypted. The function zio_crypt_init_uios_zil() handles parsing which
+ * pieces of the block need to be encrypted. All data that is not encrypted
+ * is authenticated using the AAD mechanisms that the supported encryption
+ * modes provide for. In order to preserve the semantics of the ZIL for
+ * encrypted datasets, the ZIL is not protected at the objset level as
+ * described below.
+ *
+ * DNODE ENCRYPTION:
+ * Similarly to ZIL blocks, the core part of each dnode_phys_t needs to be
+ * left in plaintext for scrubbing and claiming, but the bonus buffers might
+ * contain sensitive user data. The function zio_crypt_init_uios_dnode()
+ * handles parsing which pieces of the block need to be encrypted. For more
+ * details about dnode authentication and encryption, see
+ * zio_crypt_init_uios_dnode().
+ *
+ * OBJECT SET AUTHENTICATION:
+ * Up to this point, everything we have encrypted and authenticated has been
+ * at level 0 (or -2 for the ZIL). If we did not do any further work the
+ * on-disk format would be susceptible to attacks that deleted or rearranged
+ * the order of level 0 blocks. Ideally, the cleanest solution would be to
+ * maintain a tree of authentication MACs going up the bp tree. However, this
+ * presents a problem for raw sends. Send files do not send information about
+ * indirect blocks so there would be no convenient way to transfer the MACs
+ * and they cannot be recalculated on the receive side without the master
+ * key, which would defeat one of the purposes of raw sends in the first
+ * place. Instead, for the indirect levels of the bp tree, we use a regular
+ * SHA512 of the MACs from the level below. We also include some portable
+ * fields from blk_prop such as the lsize and compression algorithm to
+ * prevent the data from being misinterpreted.
+ *
+ * At the objset level, we maintain 2 separate 256 bit MACs in the
+ * objset_phys_t. The first one is "portable" and is the logical root of the
+ * MAC tree maintained in the metadnode's bps. The second is "local" and is
+ * used as the root MAC for the user accounting objects, which are also not
+ * transferred via "zfs send". The portable MAC is sent in the DRR_BEGIN
+ * payload of the send file. The useraccounting code ensures that the
+ * useraccounting info is not present upon a receive, so the local MAC can
+ * simply be cleared out at that time. For more info about objset_phys_t
+ * authentication, see zio_crypt_do_objset_hmacs().
+ *
+ * CONSIDERATIONS FOR DEDUP:
+ * In order for dedup to work, blocks that we want to dedup with one another
+ * need to use the same IV and encryption key, so that they will have the
+ * same ciphertext. Normally, one should never reuse an IV with the same
+ * encryption key or else AES-GCM and AES-CCM can both actually leak the
+ * plaintext of both blocks. In this case, however, since we are using the
+ * same plaintext as well, all that we end up with is a duplicate of the
+ * original ciphertext we already had. As a result, an attacker with read
+ * access to the raw disk will be able to tell which blocks are the same,
+ * but this information is given away by dedup anyway. In order to get the
+ * same IVs and encryption keys for equivalent blocks of data we use an HMAC
+ * of the plaintext. We use an HMAC here so that a reproducible checksum of
+ * the plaintext is never available to the attacker.
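As a rough illustration of this dedup scheme, the sketch below carves the salt and IV out of an HMAC digest. The digest bytes are fabricated stand-ins (a real SHA512-HMAC is out of scope here); only the offsets matter, and they match the 64-bit salt and 96-bit IV sizes used throughout this file (ZIO_DATA_SALT_LEN / ZIO_DATA_IV_LEN):

```c
/*
 * Sketch: derive a dedup salt and IV by splitting an HMAC digest.
 * The digest contents below are placeholders.
 */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define	SALT_LEN	8	/* 64 bit salt */
#define	IV_LEN		12	/* 96 bit IV */
#define	DIGEST_LEN	64	/* SHA512-HMAC output size */

int
main(void)
{
	uint8_t digest[DIGEST_LEN];	/* pretend HMAC(plaintext) */
	uint8_t salt[SALT_LEN], iv[IV_LEN];

	for (int i = 0; i < DIGEST_LEN; i++)
		digest[i] = (uint8_t)i;	/* stand-in digest bytes */

	/* first 64 bits -> salt, next 96 bits -> IV */
	memcpy(salt, digest, SALT_LEN);
	memcpy(iv, digest + SALT_LEN, IV_LEN);

	printf("salt[0]=%u iv[0]=%u\n", salt[0], iv[0]);
	return (0);
}
```

Because the mapping from plaintext to (salt, IV) is deterministic, two identical plaintext blocks written under the same keys produce identical ciphertext, which is exactly what dedup needs.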
The HMAC key is kept alongside the master key, encrypted on + * disk. The first 64 bits of the HMAC are used in place of the random salt, and + * the next 96 bits are used as the IV. As a result of this mechanism, dedup + * will only work within a clone family since encrypted dedup requires use of + * the same master and HMAC keys. + */ + +/* + * After encrypting many blocks with the same key we may start to run up + * against the theoretical limits of how much data can securely be encrypted + * with a single key using the supported encryption modes. The most obvious + * limitation is that our risk of generating 2 equivalent 96 bit IVs increases + * the more IVs we generate (which both GCM and CCM modes strictly forbid). + * This risk actually grows surprisingly quickly over time according to the + * Birthday Problem. With a total IV space of 2^(96 bits), and assuming we have + * generated n IVs with a cryptographically secure RNG, the approximate + * probability p(n) of a collision is given as: + * + * p(n) ~= e^(-n*(n-1)/(2*(2^96))) + * + * [http://www.math.cornell.edu/~mec/2008-2009/TianyiZheng/Birthday.html] + * + * Assuming that we want to ensure that p(n) never goes over 1 / 1 trillion + * we must not write more than 398,065,730 blocks with the same encryption key. + * Therefore, we rotate our keys after 400,000,000 blocks have been written by + * generating a new random 64 bit salt for our HKDF encryption key generation + * function. + */ +#define ZFS_KEY_MAX_SALT_USES_DEFAULT 400000000 +#define ZFS_CURRENT_MAX_SALT_USES \ + (MIN(zfs_key_max_salt_uses, ZFS_KEY_MAX_SALT_USES_DEFAULT)) +unsigned long zfs_key_max_salt_uses = ZFS_KEY_MAX_SALT_USES_DEFAULT; + +typedef struct blkptr_auth_buf { + uint64_t bab_prop; /* blk_prop - portable mask */ + uint8_t bab_mac[ZIO_DATA_MAC_LEN]; /* MAC from blk_cksum */ + uint64_t bab_pad; /* reserved for future use */ +} blkptr_auth_buf_t; + +zio_crypt_info_t zio_crypt_table[ZIO_CRYPT_FUNCTIONS] = { + {"", ZC_TYPE_NONE, 0, "inherit"}, + {"", ZC_TYPE_NONE, 0, "on"}, + {"", ZC_TYPE_NONE, 0, "off"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 16, "aes-128-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 24, "aes-192-ccm"}, + {SUN_CKM_AES_CCM, ZC_TYPE_CCM, 32, "aes-256-ccm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 16, "aes-128-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 24, "aes-192-gcm"}, + {SUN_CKM_AES_GCM, ZC_TYPE_GCM, 32, "aes-256-gcm"} +}; + +void +zio_crypt_key_destroy(zio_crypt_key_t *key) +{ + rw_destroy(&key->zk_salt_lock); + + /* free crypto templates */ + crypto_destroy_ctx_template(key->zk_current_tmpl); + crypto_destroy_ctx_template(key->zk_hmac_tmpl); + + /* zero out sensitive data */ + bzero(key, sizeof (zio_crypt_key_t)); +} + +int +zio_crypt_key_init(uint64_t crypt, zio_crypt_key_t *key) +{ + int ret; + crypto_mechanism_t mech; + uint_t keydata_len; + + ASSERT(key != NULL); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + bzero(key, sizeof (zio_crypt_key_t)); + + /* fill keydata buffers and salt with random data */ + ret = random_get_bytes((uint8_t *)&key->zk_guid, sizeof (uint64_t)); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_master_keydata, keydata_len); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_hmac_keydata, SHA512_HMAC_KEYLEN); + if (ret != 0) + goto error; + + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + 
+	    key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata,
+	    keydata_len);
+	if (ret != 0)
+		goto error;
+
+	/* initialize keys for the ICP */
+	key->zk_current_key.ck_format = CRYPTO_KEY_RAW;
+	key->zk_current_key.ck_data = key->zk_current_keydata;
+	key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len);
+
+	key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW;
+	/*
+	 * Point ck_data at the HMAC key material itself, matching
+	 * zio_crypt_key_unwrap(); pointing it at the enclosing
+	 * crypto_key_t would feed the wrong bytes to the HMAC.
+	 */
+	key->zk_hmac_key.ck_data = key->zk_hmac_keydata;
+	key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN);
+
+	/*
+	 * Initialize the crypto templates. It's ok if this fails because
+	 * this is just an optimization.
+	 */
+	mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname);
+	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+	    &key->zk_current_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_current_tmpl = NULL;
+
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key,
+	    &key->zk_hmac_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_hmac_tmpl = NULL;
+
+	key->zk_crypt = crypt;
+	key->zk_version = ZIO_CRYPT_KEY_CURRENT_VERSION;
+	key->zk_salt_count = 0;
+	rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL);
+
+	return (0);
+
+error:
+	zio_crypt_key_destroy(key);
+	return (ret);
+}
+
+static int
+zio_crypt_key_change_salt(zio_crypt_key_t *key)
+{
+	int ret = 0;
+	uint8_t salt[ZIO_DATA_SALT_LEN];
+	crypto_mechanism_t mech;
+	uint_t keydata_len = zio_crypt_table[key->zk_crypt].ci_keylen;
+
+	/* generate a new salt */
+	ret = random_get_bytes(salt, ZIO_DATA_SALT_LEN);
+	if (ret != 0)
+		goto error;
+
+	rw_enter(&key->zk_salt_lock, RW_WRITER);
+
+	/* someone beat us to the salt rotation, just unlock and return */
+	if (key->zk_salt_count < ZFS_CURRENT_MAX_SALT_USES)
+		goto out_unlock;
+
+	/* derive the current key from the master key and the new salt */
+	ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0,
+	    salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, keydata_len);
+	if (ret != 0)
+		goto out_unlock;
+
+	/* assign the salt and reset the usage count */
+	bcopy(salt, key->zk_salt, ZIO_DATA_SALT_LEN);
+	key->zk_salt_count = 0;
+
+	/* destroy the old context template and create the new one */
+	crypto_destroy_ctx_template(key->zk_current_tmpl);
+	mech.cm_type =
+	    crypto_mech2id(zio_crypt_table[key->zk_crypt].ci_mechname);
+	ret = crypto_create_ctx_template(&mech, &key->zk_current_key,
+	    &key->zk_current_tmpl, KM_SLEEP);
+	if (ret != CRYPTO_SUCCESS)
+		key->zk_current_tmpl = NULL;
+
+	rw_exit(&key->zk_salt_lock);
+
+	return (0);
+
+out_unlock:
+	rw_exit(&key->zk_salt_lock);
+error:
+	return (ret);
+}
+
+/* See comment above zfs_key_max_salt_uses definition for details */
+int
+zio_crypt_key_get_salt(zio_crypt_key_t *key, uint8_t *salt)
+{
+	int ret;
+	boolean_t salt_change;
+
+	rw_enter(&key->zk_salt_lock, RW_READER);
+
+	bcopy(key->zk_salt, salt, ZIO_DATA_SALT_LEN);
+	salt_change = (atomic_inc_64_nv(&key->zk_salt_count) >=
+	    ZFS_CURRENT_MAX_SALT_USES);
+
+	rw_exit(&key->zk_salt_lock);
+
+	if (salt_change) {
+		ret = zio_crypt_key_change_salt(key);
+		if (ret != 0)
+			goto error;
+	}
+
+	return (0);
+
+error:
+	return (ret);
+}
+
+/*
+ * This function handles all encryption and decryption in zfs. When
+ * encrypting it expects puio to reference the plaintext and cuio to
+ * reference the ciphertext. cuio must have enough space for the
+ * ciphertext + room for a MAC. datalen should be the length of the
+ * plaintext / ciphertext alone.
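The rotation threshold enforced by zio_crypt_key_get_salt() comes from the birthday-bound estimate in the comment above zfs_key_max_salt_uses. A quick userspace check of that arithmetic, assuming the 1-in-10^12 collision budget the comment targets (compile with -lm):

```c
/*
 * Check the salt-rotation bound: p(n) ~= 1 - e^(-n*(n-1)/2^97)
 * should stay near 1e-12 for n around 400,000,000 generated IVs.
 */
#include <math.h>
#include <stdio.h>

static long double
collision_probability(long double n)
{
	/* 2^97 == 2 * (2^96 possible IVs), the birthday denominator */
	const long double denom = ldexpl(1.0L, 97);

	/* -expm1l() keeps precision when the probability is tiny */
	return (-expm1l(-(n * (n - 1.0L)) / denom));
}

int
main(void)
{
	/* prints ~1.000e-12 and ~1.010e-12 respectively */
	printf("p(398065730) = %.3Le\n", collision_probability(398065730.0L));
	printf("p(400000000) = %.3Le\n", collision_probability(400000000.0L));
	return (0);
}
```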
+ */ +static int +zio_do_crypt_uio(boolean_t encrypt, uint64_t crypt, crypto_key_t *key, + crypto_ctx_template_t tmpl, uint8_t *ivbuf, uint_t datalen, + uio_t *puio, uio_t *cuio, uint8_t *authbuf, uint_t auth_len) +{ + int ret; + crypto_data_t plaindata, cipherdata; + CK_AES_CCM_PARAMS ccmp; + CK_AES_GCM_PARAMS gcmp; + crypto_mechanism_t mech; + zio_crypt_info_t crypt_info; + uint_t plain_full_len, maclen; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(key->ck_format, ==, CRYPTO_KEY_RAW); + + /* lookup the encryption info */ + crypt_info = zio_crypt_table[crypt]; + + /* the mac will always be the last iovec_t in the cipher uio */ + maclen = cuio->uio_iov[cuio->uio_iovcnt - 1].iov_len; + + ASSERT(maclen <= ZIO_DATA_MAC_LEN); + + /* setup encryption mechanism (same as crypt) */ + mech.cm_type = crypto_mech2id(crypt_info.ci_mechname); + + /* + * Strangely, the ICP requires that plain_full_len must include + * the MAC length when decrypting, even though the UIO does not + * need to have the extra space allocated. + */ + if (encrypt) { + plain_full_len = datalen; + } else { + plain_full_len = datalen + maclen; + } + + /* + * setup encryption params (currently only AES CCM and AES GCM + * are supported) + */ + if (crypt_info.ci_crypt_type == ZC_TYPE_CCM) { + ccmp.ulNonceSize = ZIO_DATA_IV_LEN; + ccmp.ulAuthDataSize = auth_len; + ccmp.authData = authbuf; + ccmp.ulMACSize = maclen; + ccmp.nonce = ivbuf; + ccmp.ulDataSize = plain_full_len; + + mech.cm_param = (char *)(&ccmp); + mech.cm_param_len = sizeof (CK_AES_CCM_PARAMS); + } else { + gcmp.ulIvLen = ZIO_DATA_IV_LEN; + gcmp.ulIvBits = CRYPTO_BYTES2BITS(ZIO_DATA_IV_LEN); + gcmp.ulAADLen = auth_len; + gcmp.pAAD = authbuf; + gcmp.ulTagBits = CRYPTO_BYTES2BITS(maclen); + gcmp.pIv = ivbuf; + + mech.cm_param = (char *)(&gcmp); + mech.cm_param_len = sizeof (CK_AES_GCM_PARAMS); + } + + /* populate the cipher and plain data structs. 
*/ + plaindata.cd_format = CRYPTO_DATA_UIO; + plaindata.cd_offset = 0; + plaindata.cd_uio = puio; + plaindata.cd_miscdata = NULL; + plaindata.cd_length = plain_full_len; + + cipherdata.cd_format = CRYPTO_DATA_UIO; + cipherdata.cd_offset = 0; + cipherdata.cd_uio = cuio; + cipherdata.cd_miscdata = NULL; + cipherdata.cd_length = datalen + maclen; + + /* perform the actual encryption */ + if (encrypt) { + ret = crypto_encrypt(&mech, &plaindata, key, tmpl, &cipherdata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + } else { + ret = crypto_decrypt(&mech, &cipherdata, key, tmpl, &plaindata, + NULL); + if (ret != CRYPTO_SUCCESS) { + ASSERT3U(ret, ==, CRYPTO_INVALID_MAC); + ret = SET_ERROR(ECKSUM); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_wrap(crypto_key_t *cwkey, zio_crypt_key_t *key, uint8_t *iv, + uint8_t *mac, uint8_t *keydata_out, uint8_t *hmac_keydata_out) +{ + int ret; + uio_t puio, cuio; + uint64_t aad[3]; + iovec_t plain_iovecs[2], cipher_iovecs[3]; + uint64_t crypt = key->zk_crypt; + uint_t enc_len, keydata_len, aad_len; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* generate iv for wrapping the master and hmac key */ + ret = random_get_pseudo_bytes(iv, WRAPPING_IV_LEN); + if (ret != 0) + goto error; + + /* initialize uio_ts */ + plain_iovecs[0].iov_base = key->zk_master_keydata; + plain_iovecs[0].iov_len = keydata_len; + plain_iovecs[1].iov_base = key->zk_hmac_keydata; + plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + + cipher_iovecs[0].iov_base = keydata_out; + cipher_iovecs[0].iov_len = keydata_len; + cipher_iovecs[1].iov_base = hmac_keydata_out; + cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + cipher_iovecs[2].iov_base = mac; + cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; + + /* + * Although we don't support writing to the old format, we do + * support rewrapping the key so that the user can move and + * quarantine datasets on the old format. 
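The scatter/gather layout used by the key-wrap path above is easy to model in userspace. This sketch uses the POSIX struct iovec as a stand-in for the kernel's iovec_t, and the key sizes are illustrative; the shape is two plaintext iovecs (master key, HMAC key) against three ciphertext iovecs (two wrapped keys plus the trailing MAC):

```c
/* Sketch of the key-wrap plaintext/ciphertext iovec layout. */
#include <sys/uio.h>
#include <stdint.h>
#include <stdio.h>

#define	KEYLEN		32	/* e.g. an aes-256 master key */
#define	HMAC_KEYLEN	64	/* SHA512-HMAC key */
#define	MAC_LEN		16	/* wrapping MAC */

int
main(void)
{
	uint8_t master[KEYLEN], hmac[HMAC_KEYLEN];
	uint8_t wmaster[KEYLEN], whmac[HMAC_KEYLEN], mac[MAC_LEN];
	struct iovec plain[2] = {
		{ .iov_base = master, .iov_len = sizeof (master) },
		{ .iov_base = hmac, .iov_len = sizeof (hmac) },
	};
	struct iovec cipher[3] = {
		{ .iov_base = wmaster, .iov_len = sizeof (wmaster) },
		{ .iov_base = whmac, .iov_len = sizeof (whmac) },
		{ .iov_base = mac, .iov_len = sizeof (mac) },
	};
	size_t enc_len = plain[0].iov_len + plain[1].iov_len;

	/* enc_len covers the keys only; the MAC iovec rides along */
	printf("enc_len=%zu cipher iovecs=%zu\n", enc_len,
	    sizeof (cipher) / sizeof (cipher[0]));
	return (0);
}
```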
+ */ + if (key->zk_version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(key->zk_guid); + } else { + ASSERT3U(key->zk_version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(key->zk_guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(key->zk_version); + } + + enc_len = zio_crypt_table[crypt].ci_keylen + SHA512_HMAC_KEYLEN; + puio.uio_iov = plain_iovecs; + puio.uio_iovcnt = 2; + puio.uio_segflg = UIO_SYSSPACE; + cuio.uio_iov = cipher_iovecs; + cuio.uio_iovcnt = 3; + cuio.uio_segflg = UIO_SYSSPACE; + + /* encrypt the keys and store the resulting ciphertext and mac */ + ret = zio_do_crypt_uio(B_TRUE, crypt, cwkey, NULL, iv, enc_len, + &puio, &cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + return (0); + +error: + return (ret); +} + +int +zio_crypt_key_unwrap(crypto_key_t *cwkey, uint64_t crypt, uint64_t version, + uint64_t guid, uint8_t *keydata, uint8_t *hmac_keydata, uint8_t *iv, + uint8_t *mac, zio_crypt_key_t *key) +{ + crypto_mechanism_t mech; + uio_t puio, cuio; + uint64_t aad[3]; + iovec_t plain_iovecs[2], cipher_iovecs[3]; + uint_t enc_len, keydata_len, aad_len; + int ret; + + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(cwkey->ck_format, ==, CRYPTO_KEY_RAW); + + rw_init(&key->zk_salt_lock, NULL, RW_DEFAULT, NULL); + + keydata_len = zio_crypt_table[crypt].ci_keylen; + + /* initialize uio_ts */ + plain_iovecs[0].iov_base = key->zk_master_keydata; + plain_iovecs[0].iov_len = keydata_len; + plain_iovecs[1].iov_base = key->zk_hmac_keydata; + plain_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + + cipher_iovecs[0].iov_base = keydata; + cipher_iovecs[0].iov_len = keydata_len; + cipher_iovecs[1].iov_base = hmac_keydata; + cipher_iovecs[1].iov_len = SHA512_HMAC_KEYLEN; + cipher_iovecs[2].iov_base = mac; + cipher_iovecs[2].iov_len = WRAPPING_MAC_LEN; + + if (version == 0) { + aad_len = sizeof (uint64_t); + aad[0] = LE_64(guid); + } else { + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + aad_len = sizeof (uint64_t) * 3; + aad[0] = LE_64(guid); + aad[1] = LE_64(crypt); + aad[2] = LE_64(version); + } + + enc_len = keydata_len + SHA512_HMAC_KEYLEN; + puio.uio_iov = plain_iovecs; + puio.uio_segflg = UIO_SYSSPACE; + puio.uio_iovcnt = 2; + cuio.uio_iov = cipher_iovecs; + cuio.uio_iovcnt = 3; + cuio.uio_segflg = UIO_SYSSPACE; + + /* decrypt the keys and store the result in the output buffers */ + ret = zio_do_crypt_uio(B_FALSE, crypt, cwkey, NULL, iv, enc_len, + &puio, &cuio, (uint8_t *)aad, aad_len); + if (ret != 0) + goto error; + + /* generate a fresh salt */ + ret = random_get_bytes(key->zk_salt, ZIO_DATA_SALT_LEN); + if (ret != 0) + goto error; + + /* derive the current key from the master key */ + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + key->zk_salt, ZIO_DATA_SALT_LEN, key->zk_current_keydata, + keydata_len); + if (ret != 0) + goto error; + + /* initialize keys for ICP */ + key->zk_current_key.ck_format = CRYPTO_KEY_RAW; + key->zk_current_key.ck_data = key->zk_current_keydata; + key->zk_current_key.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + key->zk_hmac_key.ck_format = CRYPTO_KEY_RAW; + key->zk_hmac_key.ck_data = key->zk_hmac_keydata; + key->zk_hmac_key.ck_length = CRYPTO_BYTES2BITS(SHA512_HMAC_KEYLEN); + + /* + * Initialize the crypto templates. It's ok if this fails because + * this is just an optimization. 
+ */ + mech.cm_type = crypto_mech2id(zio_crypt_table[crypt].ci_mechname); + ret = crypto_create_ctx_template(&mech, &key->zk_current_key, + &key->zk_current_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_current_tmpl = NULL; + + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + ret = crypto_create_ctx_template(&mech, &key->zk_hmac_key, + &key->zk_hmac_tmpl, KM_SLEEP); + if (ret != CRYPTO_SUCCESS) + key->zk_hmac_tmpl = NULL; + + key->zk_crypt = crypt; + key->zk_version = version; + key->zk_guid = guid; + key->zk_salt_count = 0; + + return (0); + +error: + zio_crypt_key_destroy(key); + return (ret); +} + +int +zio_crypt_generate_iv(uint8_t *ivbuf) +{ + int ret; + + /* randomly generate the IV */ + ret = random_get_pseudo_bytes(ivbuf, ZIO_DATA_IV_LEN); + if (ret != 0) + goto error; + + return (0); + +error: + bzero(ivbuf, ZIO_DATA_IV_LEN); + return (ret); +} + +int +zio_crypt_do_hmac(zio_crypt_key_t *key, uint8_t *data, uint_t datalen, + uint8_t *digestbuf, uint_t digestlen) +{ + int ret; + crypto_mechanism_t mech; + crypto_data_t in_data, digest_data; + uint8_t raw_digestbuf[SHA512_DIGEST_LENGTH]; + + ASSERT3U(digestlen, <=, SHA512_DIGEST_LENGTH); + + /* initialize sha512-hmac mechanism and crypto data */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + /* initialize the crypto data */ + in_data.cd_format = CRYPTO_DATA_RAW; + in_data.cd_offset = 0; + in_data.cd_length = datalen; + in_data.cd_raw.iov_base = (char *)data; + in_data.cd_raw.iov_len = in_data.cd_length; + + digest_data.cd_format = CRYPTO_DATA_RAW; + digest_data.cd_offset = 0; + digest_data.cd_length = SHA512_DIGEST_LENGTH; + digest_data.cd_raw.iov_base = (char *)raw_digestbuf; + digest_data.cd_raw.iov_len = digest_data.cd_length; + + /* generate the hmac */ + ret = crypto_mac(&mech, &in_data, &key->zk_hmac_key, key->zk_hmac_tmpl, + &digest_data, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_digestbuf, digestbuf, digestlen); + + return (0); + +error: + bzero(digestbuf, digestlen); + return (ret); +} + +int +zio_crypt_generate_iv_salt_dedup(zio_crypt_key_t *key, uint8_t *data, + uint_t datalen, uint8_t *ivbuf, uint8_t *salt) +{ + int ret; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + ret = zio_crypt_do_hmac(key, data, datalen, + digestbuf, SHA512_DIGEST_LENGTH); + if (ret != 0) + return (ret); + + bcopy(digestbuf, salt, ZIO_DATA_SALT_LEN); + bcopy(digestbuf + ZIO_DATA_SALT_LEN, ivbuf, ZIO_DATA_IV_LEN); + + return (0); +} + +/* + * The following functions are used to encode and decode encryption parameters + * into blkptr_t and zil_header_t. The ICP wants to use these parameters as + * byte strings, which normally means that these strings would not need to deal + * with byteswapping at all. However, both blkptr_t and zil_header_t may be + * byteswapped by lower layers and so we must "undo" that byteswap here upon + * decoding and encoding in a non-native byteorder. These functions require + * that the byteorder bit is correct before being called. 
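A compact userspace model of the "undo the byteswap" round trip performed by the helpers that follow. The two-word field is a mock stand-in for blk_dva[2], and the byte-swap uses a GCC/Clang-style builtin; the invariant being demonstrated is that the byte-string view of the salt survives regardless of the stored word order:

```c
/* Model of encoding/decoding byte-string params into 64-bit words. */
#include <stdint.h>
#include <string.h>
#include <assert.h>

#define	BSWAP_64(x)	__builtin_bswap64(x)

static void
encode(uint64_t *word, const uint8_t *salt, int swap)
{
	uint64_t v;

	memcpy(&v, salt, sizeof (v));
	*word = swap ? BSWAP_64(v) : v;
}

static void
decode(const uint64_t *word, uint8_t *salt, int swap)
{
	uint64_t v = swap ? BSWAP_64(*word) : *word;

	memcpy(salt, &v, sizeof (v));
}

int
main(void)
{
	uint8_t salt[8] = { 1, 2, 3, 4, 5, 6, 7, 8 }, out[8];
	uint64_t word;

	/* native and byteswapped storage both round-trip the bytes */
	for (int swap = 0; swap <= 1; swap++) {
		encode(&word, salt, swap);
		decode(&word, out, swap);
		assert(memcmp(salt, out, sizeof (salt)) == 0);
	}
	return (0);
}
```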
+ */ +void +zio_crypt_encode_params_bp(blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(salt, &bp->blk_dva[2].dva_word[0], sizeof (uint64_t)); + bcopy(iv, &bp->blk_dva[2].dva_word[1], sizeof (uint64_t)); + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, val32); + } else { + bcopy(salt, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[0] = BSWAP_64(val64); + + bcopy(iv, &val64, sizeof (uint64_t)); + bp->blk_dva[2].dva_word[1] = BSWAP_64(val64); + + bcopy(iv + sizeof (uint64_t), &val32, sizeof (uint32_t)); + BP_SET_IV2(bp, BSWAP_32(val32)); + } +} + +void +zio_crypt_decode_params_bp(const blkptr_t *bp, uint8_t *salt, uint8_t *iv) +{ + uint64_t val64; + uint32_t val32; + + ASSERT(BP_IS_PROTECTED(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_IS_AUTHENTICATED(bp)) { + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_dva[2].dva_word[0], salt, sizeof (uint64_t)); + bcopy(&bp->blk_dva[2].dva_word[1], iv, sizeof (uint64_t)); + + val32 = (uint32_t)BP_GET_IV2(bp); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } else { + val64 = BSWAP_64(bp->blk_dva[2].dva_word[0]); + bcopy(&val64, salt, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_dva[2].dva_word[1]); + bcopy(&val64, iv, sizeof (uint64_t)); + + val32 = BSWAP_32((uint32_t)BP_GET_IV2(bp)); + bcopy(&val32, iv + sizeof (uint64_t), sizeof (uint32_t)); + } +} + +void +zio_crypt_encode_mac_bp(blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_OBJSET); + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(mac, &bp->blk_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &bp->blk_cksum.zc_word[3], + sizeof (uint64_t)); + } else { + bcopy(mac, &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[2] = BSWAP_64(val64); + + bcopy(mac + sizeof (uint64_t), &val64, sizeof (uint64_t)); + bp->blk_cksum.zc_word[3] = BSWAP_64(val64); + } +} + +void +zio_crypt_decode_mac_bp(const blkptr_t *bp, uint8_t *mac) +{ + uint64_t val64; + + ASSERT(BP_USES_CRYPT(bp) || BP_IS_HOLE(bp)); + + /* for convenience, so callers don't need to check */ + if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + bzero(mac, ZIO_DATA_MAC_LEN); + return; + } + + if (!BP_SHOULD_BYTESWAP(bp)) { + bcopy(&bp->blk_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&bp->blk_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); + } else { + val64 = BSWAP_64(bp->blk_cksum.zc_word[2]); + bcopy(&val64, mac, sizeof (uint64_t)); + + val64 = BSWAP_64(bp->blk_cksum.zc_word[3]); + bcopy(&val64, mac + sizeof (uint64_t), sizeof (uint64_t)); + } +} + +void +zio_crypt_encode_mac_zil(void *data, uint8_t *mac) +{ + zil_chain_t *zilc = data; + + bcopy(mac, &zilc->zc_eck.zec_cksum.zc_word[2], sizeof (uint64_t)); + bcopy(mac + sizeof (uint64_t), &zilc->zc_eck.zec_cksum.zc_word[3], + sizeof (uint64_t)); +} + +void +zio_crypt_decode_mac_zil(const void *data, uint8_t *mac) +{ + /* + * The ZIL MAC is embedded in the block it protects, which will + * not have been byteswapped by the time this function has been called. + * As a result, we don't need to worry about byteswapping the MAC. 
+ */ + const zil_chain_t *zilc = data; + + bcopy(&zilc->zc_eck.zec_cksum.zc_word[2], mac, sizeof (uint64_t)); + bcopy(&zilc->zc_eck.zec_cksum.zc_word[3], mac + sizeof (uint64_t), + sizeof (uint64_t)); +} + +/* + * This routine takes a block of dnodes (src_abd) and copies only the bonus + * buffers to the same offsets in the dst buffer. datalen should be the size + * of both the src_abd and the dst buffer (not just the length of the bonus + * buffers). + */ +void +zio_crypt_copy_dnode_bonus(abd_t *src_abd, uint8_t *dst, uint_t datalen) +{ + uint_t i, max_dnp = datalen >> DNODE_SHIFT; + uint8_t *src; + dnode_phys_t *dnp, *sdnp, *ddnp; + + src = abd_borrow_buf_copy(src_abd, datalen); + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), + DN_MAX_BONUS_LEN(dnp)); + } + } + + abd_return_buf(src_abd, src, datalen); +} + +/* + * This function decides what fields from blk_prop are included in + * the on-disk various MAC algorithms. + */ +static void +zio_crypt_bp_zero_nonportable_blkprop(blkptr_t *bp, uint64_t version) +{ + /* + * Version 0 did not properly zero out all non-portable fields + * as it should have done. We maintain this code so that we can + * do read-only imports of pools on this version. + */ + if (version == 0) { + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + return; + } + + ASSERT3U(version, ==, ZIO_CRYPT_KEY_CURRENT_VERSION); + + /* + * The hole_birth feature might set these fields even if this bp + * is a hole. We zero them out here to guarantee that raw sends + * will function with or without the feature. + */ + if (BP_IS_HOLE(bp)) { + bp->blk_prop = 0ULL; + return; + } + + /* + * At L0 we want to verify these fields to ensure that data blocks + * can not be reinterpreted. For instance, we do not want an attacker + * to trick us into returning raw lz4 compressed data to the user + * by modifying the compression bits. At higher levels, we cannot + * enforce this policy since raw sends do not convey any information + * about indirect blocks, so these values might be different on the + * receive side. Fortunately, this does not open any new attack + * vectors, since any alterations that can be made to a higher level + * bp must still verify the correct order of the layer below it. + */ + if (BP_GET_LEVEL(bp) != 0) { + BP_SET_BYTEORDER(bp, 0); + BP_SET_COMPRESS(bp, 0); + + /* + * psize cannot be set to zero or it will trigger + * asserts, but the value doesn't really matter as + * long as it is constant. + */ + BP_SET_PSIZE(bp, SPA_MINBLOCKSIZE); + } + + BP_SET_DEDUP(bp, 0); + BP_SET_CHECKSUM(bp, 0); +} + +static void +zio_crypt_bp_auth_init(uint64_t version, boolean_t should_bswap, blkptr_t *bp, + blkptr_auth_buf_t *bab, uint_t *bab_len) +{ + blkptr_t tmpbp = *bp; + + if (should_bswap) + byteswap_uint64_array(&tmpbp, sizeof (blkptr_t)); + + ASSERT(BP_USES_CRYPT(&tmpbp) || BP_IS_HOLE(&tmpbp)); + ASSERT0(BP_IS_EMBEDDED(&tmpbp)); + + zio_crypt_decode_mac_bp(&tmpbp, bab->bab_mac); + + /* + * We always MAC blk_prop in LE to ensure portability. This + * must be done after decoding the mac, since the endianness + * will get zero'd out here. 
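The masking done by zio_crypt_bp_zero_nonportable_blkprop() above boils down to clearing a few bit ranges of a 64-bit word before it is MAC'd. The toy model below uses invented bit positions (the real layout is defined by the SPA blkptr macros); it only demonstrates the mask-then-authenticate idea:

```c
/* Toy model of masking non-portable blk_prop fields before MACing. */
#include <stdint.h>
#include <stdio.h>

#define	DEDUP_BIT	(1ULL << 62)		/* illustrative positions */
#define	CKSUM_MASK	(0xffULL << 40)
#define	PSIZE_MASK	(0xffffULL << 16)

static uint64_t
zero_nonportable(uint64_t prop)
{
	/* dedup, checksum and psize may differ on the receive side */
	return (prop & ~(DEDUP_BIT | CKSUM_MASK | PSIZE_MASK));
}

int
main(void)
{
	uint64_t prop = DEDUP_BIT | (0x2aULL << 40) | (0x1ULL << 16) |
	    0xbeef;	/* low bits stand in for portable fields */

	printf("before=%016llx after=%016llx\n",
	    (unsigned long long)prop,
	    (unsigned long long)zero_nonportable(prop));
	return (0);
}
```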
+ */ + zio_crypt_bp_zero_nonportable_blkprop(&tmpbp, version); + bab->bab_prop = LE_64(tmpbp.blk_prop); + bab->bab_pad = 0ULL; + + /* version 0 did not include the padding */ + *bab_len = sizeof (blkptr_auth_buf_t); + if (version == 0) + *bab_len -= sizeof (uint64_t); +} + +static int +zio_crypt_bp_do_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + int ret; + uint_t bab_len; + blkptr_auth_buf_t bab; + crypto_data_t cd; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + cd.cd_length = bab_len; + cd.cd_raw.iov_base = (char *)&bab; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + return (0); + +error: + return (ret); +} + +static void +zio_crypt_bp_do_indrect_checksum_updates(SHA2_CTX *ctx, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + SHA2Update(ctx, &bab, bab_len); +} + +static void +zio_crypt_bp_do_aad_updates(uint8_t **aadp, uint_t *aad_len, uint64_t version, + boolean_t should_bswap, blkptr_t *bp) +{ + uint_t bab_len; + blkptr_auth_buf_t bab; + + zio_crypt_bp_auth_init(version, should_bswap, bp, &bab, &bab_len); + bcopy(&bab, *aadp, bab_len); + *aadp += bab_len; + *aad_len += bab_len; +} + +static int +zio_crypt_do_dnode_hmac_updates(crypto_context_t ctx, uint64_t version, + boolean_t should_bswap, dnode_phys_t *dnp) +{ + int ret, i; + dnode_phys_t *adnp; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + crypto_data_t cd; + uint8_t tmp_dncore[offsetof(dnode_phys_t, dn_blkptr)]; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* authenticate the core dnode (masking out non-portable bits) */ + bcopy(dnp, tmp_dncore, sizeof (tmp_dncore)); + adnp = (dnode_phys_t *)tmp_dncore; + if (le_bswap) { + adnp->dn_datablkszsec = BSWAP_16(adnp->dn_datablkszsec); + adnp->dn_bonuslen = BSWAP_16(adnp->dn_bonuslen); + adnp->dn_maxblkid = BSWAP_64(adnp->dn_maxblkid); + adnp->dn_used = BSWAP_64(adnp->dn_used); + } + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + + cd.cd_length = sizeof (tmp_dncore); + cd.cd_raw.iov_base = (char *)adnp; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + for (i = 0; i < dnp->dn_nblkptr; i++) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, &dnp->dn_blkptr[i]); + if (ret != 0) + goto error; + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + ret = zio_crypt_bp_do_hmac_updates(ctx, version, + should_bswap, DN_SPILL_BLKPTR(dnp)); + if (ret != 0) + goto error; + } + + return (0); + +error: + return (ret); +} + +/* + * objset_phys_t blocks introduce a number of exceptions to the normal + * authentication process. objset_phys_t's contain 2 separate HMACS for + * protecting the integrity of their data. The portable_mac protects the + * metadnode. This MAC can be sent with a raw send and protects against + * reordering of data within the metadnode. The local_mac protects the user + * accounting objects which are not sent from one system to another. + * + * In addition, objset blocks are the only blocks that can be modified and + * written to disk without the key loaded under certain circumstances. 
During + * zil_claim() we need to be able to update the zil_header_t to complete + * claiming log blocks and during raw receives we need to write out the + * portable_mac from the send file. Both of these actions are possible + * because these fields are not protected by either MAC so neither one will + * need to modify the MACs without the key. However, when the modified blocks + * are written out they will be byteswapped into the host machine's native + * endianness which will modify fields protected by the MAC. As a result, MAC + * calculation for objset blocks works slightly differently from other block + * types. Where other block types MAC the data in whatever endianness is + * written to disk, objset blocks always MAC little endian version of their + * values. In the code, should_bswap is the value from BP_SHOULD_BYTESWAP() + * and le_bswap indicates whether a byteswap is needed to get this block + * into little endian format. + */ +int +zio_crypt_do_objset_hmacs(zio_crypt_key_t *key, void *data, uint_t datalen, + boolean_t should_bswap, uint8_t *portable_mac, uint8_t *local_mac) +{ + int ret; + crypto_mechanism_t mech; + crypto_context_t ctx; + crypto_data_t cd; + objset_phys_t *osp = data; + uint64_t intval; + boolean_t le_bswap = (should_bswap == ZFS_HOST_BYTEORDER); + uint8_t raw_portable_mac[SHA512_DIGEST_LENGTH]; + uint8_t raw_local_mac[SHA512_DIGEST_LENGTH]; + + /* initialize HMAC mechanism */ + mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC); + mech.cm_param = NULL; + mech.cm_param_len = 0; + + cd.cd_format = CRYPTO_DATA_RAW; + cd.cd_offset = 0; + + /* calculate the portable MAC from the portable fields and metadnode */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the os_type */ + intval = (le_bswap) ? osp->os_type : BSWAP_64(osp->os_type); + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the metadnode */ + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_meta_dnode); + if (ret) + goto error; + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_portable_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_portable_mac, portable_mac, ZIO_OBJSET_MAC_LEN); + + /* + * The local MAC protects the user, group and project accounting. + * If these objects are not present, the local MAC is zeroed out. 
+ */ + if ((datalen >= OBJSET_PHYS_SIZE_V3 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE && + osp->os_projectused_dnode.dn_type == DMU_OT_NONE) || + (datalen >= OBJSET_PHYS_SIZE_V2 && + osp->os_userused_dnode.dn_type == DMU_OT_NONE && + osp->os_groupused_dnode.dn_type == DMU_OT_NONE) || + (datalen <= OBJSET_PHYS_SIZE_V1)) { + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (0); + } + + /* calculate the local MAC from the userused and groupused dnodes */ + ret = crypto_mac_init(&mech, &key->zk_hmac_key, NULL, &ctx, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in the non-portable os_flags */ + intval = osp->os_flags; + if (should_bswap) + intval = BSWAP_64(intval); + intval &= ~OBJSET_CRYPT_PORTABLE_FLAGS_MASK; + if (!ZFS_HOST_BYTEORDER) + intval = BSWAP_64(intval); + + cd.cd_length = sizeof (uint64_t); + cd.cd_raw.iov_base = (char *)&intval; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_update(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + /* add in fields from the user accounting dnodes */ + if (osp->os_userused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_userused_dnode); + if (ret) + goto error; + } + + if (osp->os_groupused_dnode.dn_type != DMU_OT_NONE) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_groupused_dnode); + if (ret) + goto error; + } + + if (osp->os_projectused_dnode.dn_type != DMU_OT_NONE && + datalen >= OBJSET_PHYS_SIZE_V3) { + ret = zio_crypt_do_dnode_hmac_updates(ctx, key->zk_version, + should_bswap, &osp->os_projectused_dnode); + if (ret) + goto error; + } + + /* store the final digest in a temporary buffer and copy what we need */ + cd.cd_length = SHA512_DIGEST_LENGTH; + cd.cd_raw.iov_base = (char *)raw_local_mac; + cd.cd_raw.iov_len = cd.cd_length; + + ret = crypto_mac_final(ctx, &cd, NULL); + if (ret != CRYPTO_SUCCESS) { + ret = SET_ERROR(EIO); + goto error; + } + + bcopy(raw_local_mac, local_mac, ZIO_OBJSET_MAC_LEN); + + return (0); + +error: + bzero(portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(local_mac, ZIO_OBJSET_MAC_LEN); + return (ret); +} + +static void +zio_crypt_destroy_uio(uio_t *uio) +{ + if (uio->uio_iov) + kmem_free(uio->uio_iov, uio->uio_iovcnt * sizeof (iovec_t)); +} + +/* + * This function parses an uncompressed indirect block and returns a checksum + * of all the portable fields from all of the contained bps. The portable + * fields are the MAC and all of the fields from blk_prop except for the dedup, + * checksum, and psize bits. For an explanation of the purpose of this, see + * the comment block on object set authentication. 
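The checksum-of-MACs scheme described above can be modeled with a toy hash. Here FNV-1a stands in for SHA512 and the child layout is simplified, but the generate/verify split mirrors the real routine: hash each child's MAC plus its masked, little-endian blk_prop, then compare against the stored digest:

```c
/* Toy indirect-block checksum-of-MACs: generate, tamper, verify. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

typedef struct {
	uint8_t mac[16];
	uint64_t prop_le;	/* already masked + little endian */
} child_auth_t;

static uint64_t
fnv1a(const void *buf, size_t len, uint64_t h)
{
	const uint8_t *p = buf;

	while (len-- > 0)
		h = (h ^ *p++) * 0x100000001b3ULL;
	return (h);
}

static uint64_t
indirect_cksum(const child_auth_t *kids, int n)
{
	uint64_t h = 0xcbf29ce484222325ULL;

	for (int i = 0; i < n; i++)
		h = fnv1a(&kids[i], sizeof (kids[i]), h);
	return (h);
}

int
main(void)
{
	child_auth_t kids[4];
	uint64_t expected;

	memset(kids, 0, sizeof (kids));
	kids[1].mac[0] = 0xaa;
	expected = indirect_cksum(kids, 4);

	kids[2].prop_le ^= 1;	/* tamper with one child bp */
	printf("%s\n", indirect_cksum(kids, 4) == expected ?
	    "ok" : "ECKSUM");	/* prints ECKSUM */
	return (0);
}
```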
+ */ +static int +zio_crypt_do_indirect_mac_checksum_impl(boolean_t generate, void *buf, + uint_t datalen, uint64_t version, boolean_t byteswap, uint8_t *cksum) +{ + blkptr_t *bp; + int i, epb = datalen >> SPA_BLKPTRSHIFT; + SHA2_CTX ctx; + uint8_t digestbuf[SHA512_DIGEST_LENGTH]; + + /* checksum all of the MACs from the layer below */ + SHA2Init(SHA512, &ctx); + for (i = 0, bp = buf; i < epb; i++, bp++) { + zio_crypt_bp_do_indrect_checksum_updates(&ctx, version, + byteswap, bp); + } + SHA2Final(digestbuf, &ctx); + + if (generate) { + bcopy(digestbuf, cksum, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, cksum, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_crypt_do_indirect_mac_checksum(boolean_t generate, void *buf, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + + /* + * Unfortunately, callers of this function will not always have + * easy access to the on-disk format version. This info is + * normally found in the DSL Crypto Key, but the checksum-of-MACs + * is expected to be verifiable even when the key isn't loaded. + * Here, instead of doing a ZAP lookup for the version for each + * zio, we simply try both existing formats. + */ + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, buf, + datalen, ZIO_CRYPT_KEY_CURRENT_VERSION, byteswap, cksum); + if (ret == ECKSUM) { + ASSERT(!generate); + ret = zio_crypt_do_indirect_mac_checksum_impl(generate, + buf, datalen, 0, byteswap, cksum); + } + + return (ret); +} + +int +zio_crypt_do_indirect_mac_checksum_abd(boolean_t generate, abd_t *abd, + uint_t datalen, boolean_t byteswap, uint8_t *cksum) +{ + int ret; + void *buf; + + buf = abd_borrow_buf_copy(abd, datalen); + ret = zio_crypt_do_indirect_mac_checksum(generate, buf, datalen, + byteswap, cksum); + abd_return_buf(abd, buf, datalen); + + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting ZIL blocks. + * We do not check for the older ZIL chain because the encryption feature + * was not available before the newer ZIL chain was introduced. The goal + * here is to encrypt everything except the blkptr_t of a lr_write_t and + * the zil_chain_t header. Everything that is not encrypted is authenticated. + */ +static int +zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, uio_t *puio, + uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, uint_t *auth_len, + boolean_t *no_crypt) +{ + int ret; + uint64_t txtype, lr_len; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *slrp, *dlrp, *blkend, *aadp; + zil_chain_t *zilc; + lr_t *lr; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + /* cipherbuf always needs an extra iovec for the MAC */ + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + /* find the start and end record of the log block */ + zilc = (zil_chain_t *)src; + slrp = src + sizeof (zil_chain_t); + aadp = aadbuf; + blkend = src + ((byteswap) ? 
BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + + /* calculate the number of encrypted iovecs we will need */ + for (; slrp < blkend; slrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + nr_iovecs++; + if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) + nr_iovecs++; + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + /* allocate the iovec arrays */ + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + /* + * Copy the plain zil header over and authenticate everything except + * the checksum that will store our MAC. If we are writing the data + * the embedded checksum will not have been calculated yet, so we don't + * authenticate that. + */ + bcopy(src, dst, sizeof (zil_chain_t)); + bcopy(src, aadp, sizeof (zil_chain_t) - sizeof (zio_eck_t)); + aadp += sizeof (zil_chain_t) - sizeof (zio_eck_t); + aad_len += sizeof (zil_chain_t) - sizeof (zio_eck_t); + + /* loop over records again, filling in iovecs */ + nr_iovecs = 0; + slrp = src + sizeof (zil_chain_t); + dlrp = dst + sizeof (zil_chain_t); + + for (; slrp < blkend; slrp += lr_len, dlrp += lr_len) { + lr = (lr_t *)slrp; + + if (!byteswap) { + txtype = lr->lrc_txtype; + lr_len = lr->lrc_reclen; + } else { + txtype = BSWAP_64(lr->lrc_txtype); + lr_len = BSWAP_64(lr->lrc_reclen); + } + + /* copy the common lr_t */ + bcopy(slrp, dlrp, sizeof (lr_t)); + bcopy(slrp, aadp, sizeof (lr_t)); + aadp += sizeof (lr_t); + aad_len += sizeof (lr_t); + + ASSERT3P(src_iovecs, !=, NULL); + ASSERT3P(dst_iovecs, !=, NULL); + + /* + * If this is a TX_WRITE record we want to encrypt everything + * except the bp if exists. If the bp does exist we want to + * authenticate it. 
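The record walk used by zio_crypt_init_uios_zil() is a plain reclen-driven traversal. The sketch below reproduces that traversal shape with a mock record header; the TX_WRITE value and record sizes are invented, and the real lr_write_t layout is richer, but the accounting logic (encrypt the body, leave the trailing bp in plaintext) is the same:

```c
/* Sketch: walk variable-length log records and size the enc spans. */
#include <stdint.h>
#include <string.h>
#include <stdio.h>

#define	TX_WRITE	9ULL	/* stand-in txtype value */
#define	BP_SIZE		128	/* stand-in for sizeof (blkptr_t) */

typedef struct {
	uint64_t lrc_txtype;
	uint64_t lrc_reclen;
} lr_hdr_t;

int
main(void)
{
	uint8_t blk[512];
	size_t off = 0, enc_total = 0;
	lr_hdr_t hdr;

	memset(blk, 0, sizeof (blk));
	/* two mock records: a 256-byte TX_WRITE and a 128-byte other */
	hdr = (lr_hdr_t){ TX_WRITE, 256 };
	memcpy(blk, &hdr, sizeof (hdr));
	hdr = (lr_hdr_t){ 0, 128 };
	memcpy(blk + 256, &hdr, sizeof (hdr));

	while (off < 384) {
		memcpy(&hdr, blk + off, sizeof (hdr));
		size_t enc = hdr.lrc_reclen - sizeof (lr_hdr_t);

		/* TX_WRITE keeps its trailing bp in plaintext */
		if (hdr.lrc_txtype == TX_WRITE)
			enc -= BP_SIZE;
		enc_total += enc;
		off += hdr.lrc_reclen;
	}
	printf("encrypted bytes: %zu\n", enc_total);	/* 224 */
	return (0);
}
```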
+ */ + if (txtype == TX_WRITE) { + crypt_len = sizeof (lr_write_t) - + sizeof (lr_t) - sizeof (blkptr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + /* copy the bp now since it will not be encrypted */ + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + dlrp + sizeof (lr_write_t) - sizeof (blkptr_t), + sizeof (blkptr_t)); + bcopy(slrp + sizeof (lr_write_t) - sizeof (blkptr_t), + aadp, sizeof (blkptr_t)); + aadp += sizeof (blkptr_t); + aad_len += sizeof (blkptr_t); + nr_iovecs++; + total_len += crypt_len; + + if (lr_len != sizeof (lr_write_t)) { + crypt_len = lr_len - sizeof (lr_write_t); + src_iovecs[nr_iovecs].iov_base = + slrp + sizeof (lr_write_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = + dlrp + sizeof (lr_write_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } else { + crypt_len = lr_len - sizeof (lr_t); + src_iovecs[nr_iovecs].iov_base = slrp + sizeof (lr_t); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = dlrp + sizeof (lr_t); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + nr_iovecs++; + total_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + puio->uio_iov = src_iovecs; + puio->uio_iovcnt = nr_src; + cuio->uio_iov = dst_iovecs; + cuio->uio_iovcnt = nr_dst; + } else { + puio->uio_iov = dst_iovecs; + puio->uio_iovcnt = nr_dst; + cuio->uio_iov = src_iovecs; + cuio->uio_iovcnt = nr_src; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* + * Special case handling routine for encrypting / decrypting dnode blocks. + */ +static int +zio_crypt_init_uios_dnode(boolean_t encrypt, uint64_t version, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + uint_t nr_src, nr_dst, crypt_len; + uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; + uint_t i, j, max_dnp = datalen >> DNODE_SHIFT; + iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; + uint8_t *src, *dst, *aadp; + dnode_phys_t *dnp, *adnp, *sdnp, *ddnp; + uint8_t *aadbuf = zio_buf_alloc(datalen); + + if (encrypt) { + src = plainbuf; + dst = cipherbuf; + nr_src = 0; + nr_dst = 1; + } else { + src = cipherbuf; + dst = plainbuf; + nr_src = 1; + nr_dst = 0; + } + + sdnp = (dnode_phys_t *)src; + ddnp = (dnode_phys_t *)dst; + aadp = aadbuf; + + /* + * Count the number of iovecs we will need to do the encryption by + * counting the number of bonus buffers that need to be encrypted. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + /* + * This block may still be byteswapped. However, all of the + * values we use are either uint8_t's (for which byteswapping + * is a noop) or a * != 0 check, which will work regardless + * of whether or not we byteswap. 
+ */ + if (sdnp[i].dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(sdnp[i].dn_bonustype) && + sdnp[i].dn_bonuslen != 0) { + nr_iovecs++; + } + } + + nr_src += nr_iovecs; + nr_dst += nr_iovecs; + + if (nr_src != 0) { + src_iovecs = kmem_alloc(nr_src * sizeof (iovec_t), KM_SLEEP); + if (src_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + if (nr_dst != 0) { + dst_iovecs = kmem_alloc(nr_dst * sizeof (iovec_t), KM_SLEEP); + if (dst_iovecs == NULL) { + ret = SET_ERROR(ENOMEM); + goto error; + } + } + + nr_iovecs = 0; + + /* + * Iterate through the dnodes again, this time filling in the uios + * we allocated earlier. We also concatenate any data we want to + * authenticate onto aadbuf. + */ + for (i = 0; i < max_dnp; i += sdnp[i].dn_extra_slots + 1) { + dnp = &sdnp[i]; + + /* copy over the core fields and blkptrs (kept as plaintext) */ + bcopy(dnp, &ddnp[i], (uint8_t *)DN_BONUS(dnp) - (uint8_t *)dnp); + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + bcopy(DN_SPILL_BLKPTR(dnp), DN_SPILL_BLKPTR(&ddnp[i]), + sizeof (blkptr_t)); + } + + /* + * Handle authenticated data. We authenticate everything in + * the dnode that can be brought over when we do a raw send. + * This includes all of the core fields as well as the MACs + * stored in the bp checksums and all of the portable bits + * from blk_prop. We include the dnode padding here in case it + * ever gets used in the future. Some dn_flags and dn_used are + * not portable so we mask those out values out of the + * authenticated data. + */ + crypt_len = offsetof(dnode_phys_t, dn_blkptr); + bcopy(dnp, aadp, crypt_len); + adnp = (dnode_phys_t *)aadp; + adnp->dn_flags &= DNODE_CRYPT_PORTABLE_FLAGS_MASK; + adnp->dn_used = 0; + aadp += crypt_len; + aad_len += crypt_len; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, &dnp->dn_blkptr[j]); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zio_crypt_bp_do_aad_updates(&aadp, &aad_len, + version, byteswap, DN_SPILL_BLKPTR(dnp)); + } + + /* + * If this bonus buffer needs to be encrypted, we prepare an + * iovec_t. The encryption / decryption functions will fill + * this in for us with the encrypted or decrypted data. + * Otherwise we add the bonus buffer to the authenticated + * data buffer and copy it over to the destination. The + * encrypted iovec extends to DN_MAX_BONUS_LEN(dnp) so that + * we can guarantee alignment with the AES block size + * (128 bits). 
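The dnode slot walk above (stride of dn_extra_slots + 1, then a type/bonustype/bonuslen test) is easy to exercise in isolation. This mock flattens the "is the bonus type encrypted" check into a flag, so only the iteration and selection logic is real:

```c
/* Sketch: walk a dnode block by slots and count encrypted bonuses. */
#include <stdint.h>
#include <stdio.h>

typedef struct {
	uint8_t dn_type;		/* 0 == DMU_OT_NONE */
	uint8_t dn_extra_slots;
	uint8_t dn_bonus_encrypted;	/* stand-in for the type check */
	uint16_t dn_bonuslen;
} mock_dnode_t;

int
main(void)
{
	mock_dnode_t dn[8] = {
		{ 1, 1, 1, 64 },	/* 2-slot dnode, encrypted bonus */
		{ 0 },			/* slot consumed by the above */
		{ 1, 0, 0, 64 },	/* unencrypted bonus type */
		{ 1, 0, 1, 0 },		/* no bonus at all */
		{ 1, 3, 1, 192 },	/* 4-slot dnode, encrypted bonus */
	};
	int nr_iovecs = 0;

	for (int i = 0; i < 8; i += dn[i].dn_extra_slots + 1) {
		if (dn[i].dn_type != 0 && dn[i].dn_bonus_encrypted &&
		    dn[i].dn_bonuslen != 0)
			nr_iovecs++;
	}
	printf("bonus iovecs needed: %d\n", nr_iovecs);	/* 2 */
	return (0);
}
```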
+ */ + crypt_len = DN_MAX_BONUS_LEN(dnp); + if (dnp->dn_type != DMU_OT_NONE && + DMU_OT_IS_ENCRYPTED(dnp->dn_bonustype) && + dnp->dn_bonuslen != 0) { + ASSERT3U(nr_iovecs, <, nr_src); + ASSERT3U(nr_iovecs, <, nr_dst); + ASSERT3P(src_iovecs, !=, NULL); + ASSERT3P(dst_iovecs, !=, NULL); + src_iovecs[nr_iovecs].iov_base = DN_BONUS(dnp); + src_iovecs[nr_iovecs].iov_len = crypt_len; + dst_iovecs[nr_iovecs].iov_base = DN_BONUS(&ddnp[i]); + dst_iovecs[nr_iovecs].iov_len = crypt_len; + + nr_iovecs++; + total_len += crypt_len; + } else { + bcopy(DN_BONUS(dnp), DN_BONUS(&ddnp[i]), crypt_len); + bcopy(DN_BONUS(dnp), aadp, crypt_len); + aadp += crypt_len; + aad_len += crypt_len; + } + } + + *no_crypt = (nr_iovecs == 0); + *enc_len = total_len; + *authbuf = aadbuf; + *auth_len = aad_len; + + if (encrypt) { + puio->uio_iov = src_iovecs; + puio->uio_iovcnt = nr_src; + cuio->uio_iov = dst_iovecs; + cuio->uio_iovcnt = nr_dst; + } else { + puio->uio_iov = dst_iovecs; + puio->uio_iovcnt = nr_dst; + cuio->uio_iov = src_iovecs; + cuio->uio_iovcnt = nr_src; + } + + return (0); + +error: + zio_buf_free(aadbuf, datalen); + if (src_iovecs != NULL) + kmem_free(src_iovecs, nr_src * sizeof (iovec_t)); + if (dst_iovecs != NULL) + kmem_free(dst_iovecs, nr_dst * sizeof (iovec_t)); + + *enc_len = 0; + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +static int +zio_crypt_init_uios_normal(boolean_t encrypt, uint8_t *plainbuf, + uint8_t *cipherbuf, uint_t datalen, uio_t *puio, uio_t *cuio, + uint_t *enc_len) +{ + int ret; + uint_t nr_plain = 1, nr_cipher = 2; + iovec_t *plain_iovecs = NULL, *cipher_iovecs = NULL; + + /* allocate the iovecs for the plain and cipher data */ + plain_iovecs = kmem_alloc(nr_plain * sizeof (iovec_t), + KM_SLEEP); + if (!plain_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + cipher_iovecs = kmem_alloc(nr_cipher * sizeof (iovec_t), + KM_SLEEP); + if (!cipher_iovecs) { + ret = SET_ERROR(ENOMEM); + goto error; + } + + plain_iovecs[0].iov_base = plainbuf; + plain_iovecs[0].iov_len = datalen; + cipher_iovecs[0].iov_base = cipherbuf; + cipher_iovecs[0].iov_len = datalen; + + *enc_len = datalen; + puio->uio_iov = plain_iovecs; + puio->uio_iovcnt = nr_plain; + cuio->uio_iov = cipher_iovecs; + cuio->uio_iovcnt = nr_cipher; + + return (0); + +error: + if (plain_iovecs != NULL) + kmem_free(plain_iovecs, nr_plain * sizeof (iovec_t)); + if (cipher_iovecs != NULL) + kmem_free(cipher_iovecs, nr_cipher * sizeof (iovec_t)); + + *enc_len = 0; + puio->uio_iov = NULL; + puio->uio_iovcnt = 0; + cuio->uio_iov = NULL; + cuio->uio_iovcnt = 0; + return (ret); +} + +/* + * This function builds up the plaintext (puio) and ciphertext (cuio) uios so + * that they can be used for encryption and decryption by zio_do_crypt_uio(). + * Most blocks will use zio_crypt_init_uios_normal(), with ZIL and dnode blocks + * requiring special handling to parse out pieces that are to be encrypted. The + * authbuf is used by these special cases to store additional authenticated + * data (AAD) for the encryption modes. 
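One detail common to all three handlers dispatched below is the extra trailing iovec reserved on the cipher side, which zio_crypt_init_uios() points at the MAC buffer after the handler returns. A small userspace sketch of that convention, again with struct iovec standing in for iovec_t:

```c
/* Sketch: reserve a trailing cipher iovec and attach the MAC. */
#include <sys/uio.h>
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>

#define	MAC_LEN	16

int
main(void)
{
	int nr_data = 3;		/* from the per-type handler */
	int nr_cipher = nr_data + 1;	/* plus the trailing MAC slot */
	struct iovec *cuio;
	static uint8_t mac[MAC_LEN];

	cuio = calloc(nr_cipher, sizeof (*cuio));
	if (cuio == NULL)
		return (1);

	/* ... data iovecs 0..nr_data-1 get filled in here ... */

	/* the last iovec always carries the MAC */
	cuio[nr_cipher - 1].iov_base = mac;
	cuio[nr_cipher - 1].iov_len = MAC_LEN;

	printf("cipher iovecs: %d (last is %zu-byte MAC)\n",
	    nr_cipher, cuio[nr_cipher - 1].iov_len);
	free(cuio);
	return (0);
}
```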
+ */ +static int +zio_crypt_init_uios(boolean_t encrypt, uint64_t version, dmu_object_type_t ot, + uint8_t *plainbuf, uint8_t *cipherbuf, uint_t datalen, boolean_t byteswap, + uint8_t *mac, uio_t *puio, uio_t *cuio, uint_t *enc_len, uint8_t **authbuf, + uint_t *auth_len, boolean_t *no_crypt) +{ + int ret; + iovec_t *mac_iov; + + ASSERT(DMU_OT_IS_ENCRYPTED(ot) || ot == DMU_OT_NONE); + + /* route to handler */ + switch (ot) { + case DMU_OT_INTENT_LOG: + ret = zio_crypt_init_uios_zil(encrypt, plainbuf, cipherbuf, + datalen, byteswap, puio, cuio, enc_len, authbuf, auth_len, + no_crypt); + break; + case DMU_OT_DNODE: + ret = zio_crypt_init_uios_dnode(encrypt, version, plainbuf, + cipherbuf, datalen, byteswap, puio, cuio, enc_len, authbuf, + auth_len, no_crypt); + break; + default: + ret = zio_crypt_init_uios_normal(encrypt, plainbuf, cipherbuf, + datalen, puio, cuio, enc_len); + *authbuf = NULL; + *auth_len = 0; + *no_crypt = B_FALSE; + break; + } + + if (ret != 0) + goto error; + + /* populate the uios */ + puio->uio_segflg = UIO_SYSSPACE; + cuio->uio_segflg = UIO_SYSSPACE; + + mac_iov = ((iovec_t *)&cuio->uio_iov[cuio->uio_iovcnt - 1]); + mac_iov->iov_base = mac; + mac_iov->iov_len = ZIO_DATA_MAC_LEN; + + return (0); + +error: + return (ret); +} + +/* + * Primary encryption / decryption entrypoint for zio data. + */ +int +zio_do_crypt_data(boolean_t encrypt, zio_crypt_key_t *key, + dmu_object_type_t ot, boolean_t byteswap, uint8_t *salt, uint8_t *iv, + uint8_t *mac, uint_t datalen, uint8_t *plainbuf, uint8_t *cipherbuf, + boolean_t *no_crypt) +{ + int ret; + boolean_t locked = B_FALSE; + uint64_t crypt = key->zk_crypt; + uint_t keydata_len = zio_crypt_table[crypt].ci_keylen; + uint_t enc_len, auth_len; + uio_t puio, cuio; + uint8_t enc_keydata[MASTER_KEY_MAX_LEN]; + crypto_key_t tmp_ckey, *ckey = NULL; + crypto_ctx_template_t tmpl; + uint8_t *authbuf = NULL; + + /* + * If the needed key is the current one, just use it. Otherwise we + * need to generate a temporary one from the given salt + master key. + * If we are encrypting, we must return a copy of the current salt + * so that it can be stored in the blkptr_t. + */ + rw_enter(&key->zk_salt_lock, RW_READER); + locked = B_TRUE; + + if (bcmp(salt, key->zk_salt, ZIO_DATA_SALT_LEN) == 0) { + ckey = &key->zk_current_key; + tmpl = key->zk_current_tmpl; + } else { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + + ret = hkdf_sha512(key->zk_master_keydata, keydata_len, NULL, 0, + salt, ZIO_DATA_SALT_LEN, enc_keydata, keydata_len); + if (ret != 0) + goto error; + + tmp_ckey.ck_format = CRYPTO_KEY_RAW; + tmp_ckey.ck_data = enc_keydata; + tmp_ckey.ck_length = CRYPTO_BYTES2BITS(keydata_len); + + ckey = &tmp_ckey; + tmpl = NULL; + } + + /* + * Attempt to use QAT acceleration if we can. We currently don't + * do this for metadnode and ZIL blocks, since they have a much + * more involved buffer layout and the qat_crypt() function only + * works in-place. + */ + if (qat_crypt_use_accel(datalen) && + ot != DMU_OT_INTENT_LOG && ot != DMU_OT_DNODE) { + uint8_t *srcbuf, *dstbuf; + + if (encrypt) { + srcbuf = plainbuf; + dstbuf = cipherbuf; + } else { + srcbuf = cipherbuf; + dstbuf = plainbuf; + } + + ret = qat_crypt((encrypt) ? 
QAT_ENCRYPT : QAT_DECRYPT, srcbuf, + dstbuf, NULL, 0, iv, mac, ckey, key->zk_crypt, datalen); + if (ret == CPA_STATUS_SUCCESS) { + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + return (0); + } + /* If the hardware implementation fails fall back to software */ + } + + bzero(&puio, sizeof (uio_t)); + bzero(&cuio, sizeof (uio_t)); + + /* create uios for encryption */ + ret = zio_crypt_init_uios(encrypt, key->zk_version, ot, plainbuf, + cipherbuf, datalen, byteswap, mac, &puio, &cuio, &enc_len, + &authbuf, &auth_len, no_crypt); + if (ret != 0) + goto error; + + /* perform the encryption / decryption in software */ + ret = zio_do_crypt_uio(encrypt, key->zk_crypt, ckey, tmpl, iv, enc_len, + &puio, &cuio, authbuf, auth_len); + if (ret != 0) + goto error; + + if (locked) { + rw_exit(&key->zk_salt_lock); + locked = B_FALSE; + } + + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (0); + +error: + if (locked) + rw_exit(&key->zk_salt_lock); + if (authbuf != NULL) + zio_buf_free(authbuf, datalen); + if (ckey == &tmp_ckey) + bzero(enc_keydata, keydata_len); + zio_crypt_destroy_uio(&puio); + zio_crypt_destroy_uio(&cuio); + + return (ret); +} + +/* + * Simple wrapper around zio_do_crypt_data() to work with abd's instead of + * linear buffers. + */ +int +zio_do_crypt_abd(boolean_t encrypt, zio_crypt_key_t *key, dmu_object_type_t ot, + boolean_t byteswap, uint8_t *salt, uint8_t *iv, uint8_t *mac, + uint_t datalen, abd_t *pabd, abd_t *cabd, boolean_t *no_crypt) +{ + int ret; + void *ptmp, *ctmp; + + if (encrypt) { + ptmp = abd_borrow_buf_copy(pabd, datalen); + ctmp = abd_borrow_buf(cabd, datalen); + } else { + ptmp = abd_borrow_buf(pabd, datalen); + ctmp = abd_borrow_buf_copy(cabd, datalen); + } + + ret = zio_do_crypt_data(encrypt, key, ot, byteswap, salt, iv, mac, + datalen, ptmp, ctmp, no_crypt); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (0); + +error: + if (encrypt) { + abd_return_buf(pabd, ptmp, datalen); + abd_return_buf_copy(cabd, ctmp, datalen); + } else { + abd_return_buf_copy(pabd, ptmp, datalen); + abd_return_buf(cabd, ctmp, datalen); + } + + return (ret); +} + +#if defined(_KERNEL) +/* BEGIN CSTYLED */ +module_param(zfs_key_max_salt_uses, ulong, 0644); +MODULE_PARM_DESC(zfs_key_max_salt_uses, "Max number of times a salt value " + "can be used for generating encryption keys before it is rotated"); +/* END CSTYLED */ +#endif diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c new file mode 100644 index 000000000000..fa4500f6f8d1 --- /dev/null +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -0,0 +1,551 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2011 Lawrence Livermore National Security, LLC. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * LLNL-CODE-403049. + * Rewritten for Linux by: + * Rohan Puri + * Brian Behlendorf + */ + +#include +#include +#include +#include +#include + +/* + * Common open routine. Disallow any write access. + */ +/* ARGSUSED */ +static int +zpl_common_open(struct inode *ip, struct file *filp) +{ + if (filp->f_mode & FMODE_WRITE) + return (-EACCES); + + return (generic_file_open(ip, filp)); +} + +/* + * Get root directory contents. + */ +static int +zpl_root_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); + int error = 0; + + ZFS_ENTER(zfsvfs); + + if (!zpl_dir_emit_dots(filp, ctx)) + goto out; + + if (ctx->pos == 2) { + if (!zpl_dir_emit(ctx, ZFS_SNAPDIR_NAME, + strlen(ZFS_SNAPDIR_NAME), ZFSCTL_INO_SNAPDIR, DT_DIR)) + goto out; + + ctx->pos++; + } + + if (ctx->pos == 3) { + if (!zpl_dir_emit(ctx, ZFS_SHAREDIR_NAME, + strlen(ZFS_SHAREDIR_NAME), ZFSCTL_INO_SHARES, DT_DIR)) + goto out; + + ctx->pos++; + } +out: + ZFS_EXIT(zfsvfs); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_root_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_root_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +/* + * Get root directory attributes. + */ +/* ARGSUSED */ +static int +zpl_root_getattr_impl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *ip = path->dentry->d_inode; + + generic_fillattr(ip, stat); + stat->atime = current_time(ip); + + return (0); +} +ZPL_GETATTR_WRAPPER(zpl_root_getattr); + +static struct dentry * +zpl_root_lookup(struct inode *dip, struct dentry *dentry, unsigned int flags) +{ + cred_t *cr = CRED(); + struct inode *ip; + int error; + + crhold(cr); + error = -zfsctl_root_lookup(dip, dname(dentry), &ip, 0, cr, NULL, NULL); + ASSERT3S(error, <=, 0); + crfree(cr); + + if (error) { + if (error == -ENOENT) + return (d_splice_alias(NULL, dentry)); + else + return (ERR_PTR(error)); + } + + return (d_splice_alias(ip, dentry)); +} + +/* + * The '.zfs' control directory file and inode operations. + */ +const struct file_operations zpl_fops_root = { + .open = zpl_common_open, + .llseek = generic_file_llseek, + .read = generic_read_dir, +#ifdef HAVE_VFS_ITERATE_SHARED + .iterate_shared = zpl_root_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_root_iterate, +#else + .readdir = zpl_root_readdir, +#endif +}; + +const struct inode_operations zpl_ops_root = { + .lookup = zpl_root_lookup, + .getattr = zpl_root_getattr, +}; + +static struct vfsmount * +zpl_snapdir_automount(struct path *path) +{ + int error; + + error = -zfsctl_snapshot_mount(path, 0); + if (error) + return (ERR_PTR(error)); + + /* + * Rather than returning the new vfsmount for the snapshot we must + * return NULL to indicate a mount collision. This is done because + * the user space mount calls do_add_mount() which adds the vfsmount + * to the name space. 
If we returned the new mount here it would be + * added again to the vfsmount list, resulting in list corruption. + */ + return (NULL); +} + +/* + * Negative dentries must always be revalidated so newly created snapshots + * can be detected and automounted. Normal dentries should be kept because + * as of the 3.18 kernel revalidating the mountpoint dentry will result in + * the snapshot being immediately unmounted. + */ +static int +#ifdef HAVE_D_REVALIDATE_NAMEIDATA +zpl_snapdir_revalidate(struct dentry *dentry, struct nameidata *i) +#else +zpl_snapdir_revalidate(struct dentry *dentry, unsigned int flags) +#endif +{ + return (!!dentry->d_inode); +} + +dentry_operations_t zpl_dops_snapdirs = { +/* + * Auto mounting of snapshots is only supported for 2.6.37 and + * newer kernels. Prior to this kernel, the ops->follow_link() + * callback was used as a hack to trigger the mount. The + * resulting vfsmount was then explicitly grafted into the + * name space. While it might be possible to add compatibility + * code to accomplish this, it would require considerable care. + */ + .d_automount = zpl_snapdir_automount, + .d_revalidate = zpl_snapdir_revalidate, +}; + +static struct dentry * +zpl_snapdir_lookup(struct inode *dip, struct dentry *dentry, + unsigned int flags) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + struct inode *ip = NULL; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfsctl_snapdir_lookup(dip, dname(dentry), &ip, + 0, cr, NULL, NULL); + ASSERT3S(error, <=, 0); + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error && error != -ENOENT) + return (ERR_PTR(error)); + + ASSERT(error == 0 || ip == NULL); + d_clear_d_op(dentry); + d_set_d_op(dentry, &zpl_dops_snapdirs); + dentry->d_flags |= DCACHE_NEED_AUTOMOUNT; + + return (d_splice_alias(ip, dentry)); +} + +static int +zpl_snapdir_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); + fstrans_cookie_t cookie; + char snapname[MAXNAMELEN]; + boolean_t case_conflict; + uint64_t id, pos; + int error = 0; + + ZFS_ENTER(zfsvfs); + cookie = spl_fstrans_mark(); + + if (!zpl_dir_emit_dots(filp, ctx)) + goto out; + + pos = ctx->pos; + while (error == 0) { + dsl_pool_config_enter(dmu_objset_pool(zfsvfs->z_os), FTAG); + error = -dmu_snapshot_list_next(zfsvfs->z_os, MAXNAMELEN, + snapname, &id, &pos, &case_conflict); + dsl_pool_config_exit(dmu_objset_pool(zfsvfs->z_os), FTAG); + if (error) + goto out; + + if (!zpl_dir_emit(ctx, snapname, strlen(snapname), + ZFSCTL_INO_SHARES - id, DT_DIR)) + goto out; + + ctx->pos = pos; + } +out: + spl_fstrans_unmark(cookie); + ZFS_EXIT(zfsvfs); + + if (error == -ENOENT) + return (0); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_snapdir_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_snapdir_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +static int +zpl_snapdir_rename2(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry, unsigned int flags) +{ + cred_t *cr = CRED(); + int error; + + /* We probably don't want to support renameat2(2) in ctldir */ + if (flags) + return (-EINVAL); + + crhold(cr); + error = -zfsctl_snapdir_rename(sdip, dname(sdentry), + tdip, dname(tdentry), cr, 0); + ASSERT3S(error, <=, 0); + crfree(cr); + + return
(error); +} + +#ifndef HAVE_RENAME_WANTS_FLAGS +static int +zpl_snapdir_rename(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry) +{ + return (zpl_snapdir_rename2(sdip, sdentry, tdip, tdentry, 0)); +} +#endif + +static int +zpl_snapdir_rmdir(struct inode *dip, struct dentry *dentry) +{ + cred_t *cr = CRED(); + int error; + + crhold(cr); + error = -zfsctl_snapdir_remove(dip, dname(dentry), cr, 0); + ASSERT3S(error, <=, 0); + crfree(cr); + + return (error); +} + +static int +zpl_snapdir_mkdir(struct inode *dip, struct dentry *dentry, umode_t mode) +{ + cred_t *cr = CRED(); + vattr_t *vap; + struct inode *ip; + int error; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dip, mode | S_IFDIR, cr); + + error = -zfsctl_snapdir_mkdir(dip, dname(dentry), vap, &ip, cr, 0); + if (error == 0) { + d_clear_d_op(dentry); + d_set_d_op(dentry, &zpl_dops_snapdirs); + d_instantiate(dentry, ip); + } + + kmem_free(vap, sizeof (vattr_t)); + ASSERT3S(error, <=, 0); + crfree(cr); + + return (error); +} + +/* + * Get snapshot directory attributes. + */ +/* ARGSUSED */ +static int +zpl_snapdir_getattr_impl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *ip = path->dentry->d_inode; + zfsvfs_t *zfsvfs = ITOZSB(ip); + + ZFS_ENTER(zfsvfs); + generic_fillattr(ip, stat); + + stat->nlink = stat->size = 2; + stat->ctime = stat->mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); + stat->atime = current_time(ip); + ZFS_EXIT(zfsvfs); + + return (0); +} +ZPL_GETATTR_WRAPPER(zpl_snapdir_getattr); + +/* + * The '.zfs/snapshot' directory file operations. These mainly control + * generating the list of available snapshots when doing an 'ls' in the + * directory. See zpl_snapdir_readdir(). + */ +const struct file_operations zpl_fops_snapdir = { + .open = zpl_common_open, + .llseek = generic_file_llseek, + .read = generic_read_dir, +#ifdef HAVE_VFS_ITERATE_SHARED + .iterate_shared = zpl_snapdir_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_snapdir_iterate, +#else + .readdir = zpl_snapdir_readdir, +#endif + +}; + +/* + * The '.zfs/snapshot' directory inode operations. These mainly control + * creating an inode for a snapshot directory and initializing the needed + * infrastructure to automount the snapshot. See zpl_snapdir_lookup(). 
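The snapdir inode operations registered below expose snapshot administration through ordinary directory syscalls: mkdir(2) in '.zfs/snapshot' creates a snapshot, rename(2) renames one, and rmdir(2) destroys one, backed by the zfsctl_snapdir_mkdir(), zfsctl_snapdir_rename() and zfsctl_snapdir_remove() calls above. A minimal userspace sketch of that mapping follows; the '/tank/fs' mountpoint and snapshot names are hypothetical, and on Linux these operations additionally require the zfs_admin_snapshot module parameter to be enabled along with the appropriate delegated permissions.

#include <stdio.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

int
main(void)
{
	/* Roughly equivalent to "zfs snapshot tank/fs@monday". */
	if (mkdir("/tank/fs/.zfs/snapshot/monday", 0755) != 0) {
		perror("mkdir");
		return (EXIT_FAILURE);
	}

	/* Roughly equivalent to "zfs rename tank/fs@monday tank/fs@tuesday". */
	if (rename("/tank/fs/.zfs/snapshot/monday",
	    "/tank/fs/.zfs/snapshot/tuesday") != 0)
		perror("rename");

	/* Roughly equivalent to "zfs destroy tank/fs@tuesday". */
	if (rmdir("/tank/fs/.zfs/snapshot/tuesday") != 0)
		perror("rmdir");

	return (EXIT_SUCCESS);
}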
+ */ +const struct inode_operations zpl_ops_snapdir = { + .lookup = zpl_snapdir_lookup, + .getattr = zpl_snapdir_getattr, +#ifdef HAVE_RENAME_WANTS_FLAGS + .rename = zpl_snapdir_rename2, +#else + .rename = zpl_snapdir_rename, +#endif + .rmdir = zpl_snapdir_rmdir, + .mkdir = zpl_snapdir_mkdir, +}; + +static struct dentry * +zpl_shares_lookup(struct inode *dip, struct dentry *dentry, + unsigned int flags) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + struct inode *ip = NULL; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfsctl_shares_lookup(dip, dname(dentry), &ip, + 0, cr, NULL, NULL); + ASSERT3S(error, <=, 0); + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error) { + if (error == -ENOENT) + return (d_splice_alias(NULL, dentry)); + else + return (ERR_PTR(error)); + } + + return (d_splice_alias(ip, dentry)); +} + +static int +zpl_shares_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + zfsvfs_t *zfsvfs = ITOZSB(file_inode(filp)); + znode_t *dzp; + int error = 0; + + ZFS_ENTER(zfsvfs); + cookie = spl_fstrans_mark(); + + if (zfsvfs->z_shares_dir == 0) { + zpl_dir_emit_dots(filp, ctx); + goto out; + } + + error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); + if (error) + goto out; + + crhold(cr); + error = -zfs_readdir(ZTOI(dzp), ctx, cr); + crfree(cr); + + iput(ZTOI(dzp)); +out: + spl_fstrans_unmark(cookie); + ZFS_EXIT(zfsvfs); + ASSERT3S(error, <=, 0); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_shares_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_shares_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +/* ARGSUSED */ +static int +zpl_shares_getattr_impl(const struct path *path, struct kstat *stat, + u32 request_mask, unsigned int query_flags) +{ + struct inode *ip = path->dentry->d_inode; + zfsvfs_t *zfsvfs = ITOZSB(ip); + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + generic_fillattr(path->dentry->d_inode, stat); + stat->nlink = stat->size = 2; + stat->atime = current_time(ip); + ZFS_EXIT(zfsvfs); + return (0); + } + + error = -zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp); + if (error == 0) { + error = -zfs_getattr_fast(ZTOI(dzp), stat); + iput(ZTOI(dzp)); + } + + ZFS_EXIT(zfsvfs); + ASSERT3S(error, <=, 0); + + return (error); +} +ZPL_GETATTR_WRAPPER(zpl_shares_getattr); + +/* + * The '.zfs/shares' directory file operations. + */ +const struct file_operations zpl_fops_shares = { + .open = zpl_common_open, + .llseek = generic_file_llseek, + .read = generic_read_dir, +#ifdef HAVE_VFS_ITERATE_SHARED + .iterate_shared = zpl_shares_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_shares_iterate, +#else + .readdir = zpl_shares_readdir, +#endif + +}; + +/* + * The '.zfs/shares' directory inode operations. 
+ */ +const struct inode_operations zpl_ops_shares = { + .lookup = zpl_shares_lookup, + .getattr = zpl_shares_getattr, +}; diff --git a/module/os/linux/zfs/zpl_export.c b/module/os/linux/zfs/zpl_export.c new file mode 100644 index 000000000000..eaf048c38db1 --- /dev/null +++ b/module/os/linux/zfs/zpl_export.c @@ -0,0 +1,154 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011 Gunnar Beutner + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + */ + + +#include +#include +#include +#include + + +static int +#ifdef HAVE_ENCODE_FH_WITH_INODE +zpl_encode_fh(struct inode *ip, __u32 *fh, int *max_len, struct inode *parent) +{ +#else +zpl_encode_fh(struct dentry *dentry, __u32 *fh, int *max_len, int connectable) +{ + /* CSTYLED */ + struct inode *ip = dentry->d_inode; +#endif /* HAVE_ENCODE_FH_WITH_INODE */ + fstrans_cookie_t cookie; + fid_t *fid = (fid_t *)fh; + int len_bytes, rc; + + len_bytes = *max_len * sizeof (__u32); + + if (len_bytes < offsetof(fid_t, fid_data)) + return (255); + + fid->fid_len = len_bytes - offsetof(fid_t, fid_data); + cookie = spl_fstrans_mark(); + + if (zfsctl_is_node(ip)) + rc = zfsctl_fid(ip, fid); + else + rc = zfs_fid(ip, fid); + + spl_fstrans_unmark(cookie); + len_bytes = offsetof(fid_t, fid_data) + fid->fid_len; + *max_len = roundup(len_bytes, sizeof (__u32)) / sizeof (__u32); + + return (rc == 0 ? FILEID_INO32_GEN : 255); +} + +static struct dentry * +zpl_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + fid_t *fid = (fid_t *)fh; + fstrans_cookie_t cookie; + struct inode *ip; + int len_bytes, rc; + + len_bytes = fh_len * sizeof (__u32); + + if (fh_type != FILEID_INO32_GEN || + len_bytes < offsetof(fid_t, fid_data) || + len_bytes < offsetof(fid_t, fid_data) + fid->fid_len) + return (ERR_PTR(-EINVAL)); + + cookie = spl_fstrans_mark(); + rc = zfs_vget(sb, &ip, fid); + spl_fstrans_unmark(cookie); + + if (rc) { + /* + * If we see ENOENT it might mean that an NFSv4 + * client is using a cached inode value in a file + * handle and that the sought-after file has had its + * inode changed by a third party. So change the error + * to ESTALE, which will trigger a full lookup by the + * client and will find the new filename/inode pair if + * it still exists.
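zpl_encode_fh() and zpl_fh_to_dentry() are what the kernel invokes on behalf of nfsd, but the same pair is reachable from userspace through name_to_handle_at(2) and open_by_handle_at(2), which makes the ESTALE behavior described above easy to observe. A rough sketch of the round trip, assuming a hypothetical '/tank/fs/file'; open_by_handle_at() requires CAP_DAC_READ_SEARCH, and most error handling is elided:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct file_handle *fhp;
	int mount_id, mount_fd, fd;

	/* MAX_HANDLE_SZ comfortably covers the zfs fid_t payload. */
	fhp = malloc(sizeof (*fhp) + MAX_HANDLE_SZ);
	fhp->handle_bytes = MAX_HANDLE_SZ;

	/* The kernel fills the handle via the filesystem's .encode_fh hook. */
	if (name_to_handle_at(AT_FDCWD, "/tank/fs/file", fhp,
	    &mount_id, 0) != 0) {
		perror("name_to_handle_at");
		return (EXIT_FAILURE);
	}

	/*
	 * Resolution goes through .fh_to_dentry; if the object was
	 * replaced behind our back this fails with ESTALE, as the
	 * comment above explains.
	 */
	mount_fd = open("/tank/fs", O_RDONLY | O_DIRECTORY);
	fd = open_by_handle_at(mount_fd, fhp, O_RDONLY);
	if (fd < 0)
		perror("open_by_handle_at");

	free(fhp);
	return (EXIT_SUCCESS);
}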
+ */ + if (rc == ENOENT) + rc = ESTALE; + + return (ERR_PTR(-rc)); + } + + ASSERT((ip != NULL) && !IS_ERR(ip)); + + return (d_obtain_alias(ip)); +} + +static struct dentry * +zpl_get_parent(struct dentry *child) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + znode_t *zp; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_lookup(ITOZ(child->d_inode), "..", &zp, 0, cr, NULL, NULL); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + if (error) + return (ERR_PTR(error)); + + return (d_obtain_alias(ZTOI(zp))); +} + +static int +zpl_commit_metadata(struct inode *inode) +{ + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int error; + + if (zfsctl_is_node(inode)) + return (0); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(ITOZ(inode), 0, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +const struct export_operations zpl_export_operations = { + .encode_fh = zpl_encode_fh, + .fh_to_dentry = zpl_fh_to_dentry, + .get_parent = zpl_get_parent, + .commit_metadata = zpl_commit_metadata, +}; diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c new file mode 100644 index 000000000000..51e189a87272 --- /dev/null +++ b/module/os/linux/zfs/zpl_file.c @@ -0,0 +1,1079 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + */ + + +#ifdef CONFIG_COMPAT +#include +#endif +#include +#include +#include +#include +#include +#include + +/* + * When using fallocate(2) to preallocate space, inflate the requested + * capacity check by 10% to account for the required metadata blocks. 
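Put concretely: with the default zfs_fallocate_reserve_percent of 110 (set just below), a request of len bytes passes the capacity check only while roughly 1.1 * len of space remains available. A standalone sketch of that arithmetic, mirroring the comparison zpl_fallocate_common() performs later in this file; the helper name is illustrative, not part of the module:

#include <stdint.h>

/*
 * Nonzero when a mode-0 fallocate request of "len" bytes would pass the
 * capacity check: the available blocks, credited at only 100/percent of
 * their size (e.g. ~91% for percent == 110), must still cover "len".
 */
static int
fallocate_capacity_ok(uint64_t len, uint64_t f_bavail, uint64_t f_bsize,
    unsigned int percent)
{
	if (percent == 0)	/* legacy mode: emulation disabled */
		return (0);

	return (len <= f_bavail * (f_bsize * 100 / percent));
}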
+ */ +unsigned int zfs_fallocate_reserve_percent = 110; + +static int +zpl_open(struct inode *ip, struct file *filp) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + error = generic_file_open(ip, filp); + if (error) + return (error); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_open(ip, filp->f_mode, filp->f_flags, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_release(struct inode *ip, struct file *filp) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + if (ITOZ(ip)->z_atime_dirty) + zfs_mark_inode_dirty(ip); + + crhold(cr); + error = -zfs_close(ip, filp->f_flags, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_iterate(struct file *filp, zpl_dir_context_t *ctx) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_readdir(file_inode(filp), ctx, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#if !defined(HAVE_VFS_ITERATE) && !defined(HAVE_VFS_ITERATE_SHARED) +static int +zpl_readdir(struct file *filp, void *dirent, filldir_t filldir) +{ + zpl_dir_context_t ctx = + ZPL_DIR_CONTEXT_INIT(dirent, filldir, filp->f_pos); + int error; + + error = zpl_iterate(filp, &ctx); + filp->f_pos = ctx.pos; + + return (error); +} +#endif /* !HAVE_VFS_ITERATE && !HAVE_VFS_ITERATE_SHARED */ + +#if defined(HAVE_FSYNC_WITHOUT_DENTRY) +/* + * Linux 2.6.35 - 3.0 API, + * As of 2.6.35 the dentry argument to the fops->fsync() hook was deemed + * redundant. The dentry is still accessible via filp->f_path.dentry, + * and we are guaranteed that filp will never be NULL. + */ +static int +zpl_fsync(struct file *filp, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(ITOZ(inode), datasync, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_FILE_AIO_FSYNC +static int +zpl_aio_fsync(struct kiocb *kiocb, int datasync) +{ + return (zpl_fsync(kiocb->ki_filp, datasync)); +} +#endif + +#elif defined(HAVE_FSYNC_RANGE) +/* + * Linux 3.1 - 3.x API, + * As of 3.1 the responsibility to call filemap_write_and_wait_range() has + * been pushed down into the .fsync() vfs hook. Additionally, the i_mutex + * lock is no longer held by the caller; zfs does not require the lock + * to be held, so we don't acquire it.
+ */ +static int +zpl_fsync(struct file *filp, loff_t start, loff_t end, int datasync) +{ + struct inode *inode = filp->f_mapping->host; + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + error = filemap_write_and_wait_range(inode->i_mapping, start, end); + if (error) + return (error); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_fsync(ITOZ(inode), datasync, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_FILE_AIO_FSYNC +static int +zpl_aio_fsync(struct kiocb *kiocb, int datasync) +{ + return (zpl_fsync(kiocb->ki_filp, kiocb->ki_pos, -1, datasync)); +} +#endif + +#else +#error "Unsupported fops->fsync() implementation" +#endif + +static inline int +zfs_io_flags(struct kiocb *kiocb) +{ + int flags = 0; + +#if defined(IOCB_DSYNC) + if (kiocb->ki_flags & IOCB_DSYNC) + flags |= O_DSYNC; +#endif +#if defined(IOCB_SYNC) + if (kiocb->ki_flags & IOCB_SYNC) + flags |= O_SYNC; +#endif +#if defined(IOCB_APPEND) + if (kiocb->ki_flags & IOCB_APPEND) + flags |= O_APPEND; +#endif +#if defined(IOCB_DIRECT) + if (kiocb->ki_flags & IOCB_DIRECT) + flags |= O_DIRECT; +#endif + return (flags); +} + +static ssize_t +zpl_read_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, + unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, + cred_t *cr, size_t skip) +{ + ssize_t read; + uio_t uio = { { 0 }, 0 }; + int error; + fstrans_cookie_t cookie; + + uio.uio_iov = iovp; + uio.uio_iovcnt = nr_segs; + uio.uio_loffset = *ppos; + uio.uio_segflg = segment; + uio.uio_limit = MAXOFFSET_T; + uio.uio_resid = count; + uio.uio_skip = skip; + + cookie = spl_fstrans_mark(); + error = -zfs_read(ip, &uio, flags, cr); + spl_fstrans_unmark(cookie); + if (error < 0) + return (error); + + read = count - uio.uio_resid; + *ppos += read; + + return (read); +} + +inline ssize_t +zpl_read_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, + uio_seg_t segment, int flags, cred_t *cr) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + return (zpl_read_common_iovec(ip, &iov, len, 1, ppos, segment, + flags, cr, 0)); +} + +static ssize_t +zpl_iter_read_common(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + struct inode *ip = filp->f_mapping->host; + zfsvfs_t *zfsvfs = ZTOZSB(ITOZ(ip)); + ssize_t read; + unsigned int f_flags = filp->f_flags; + + f_flags |= zfs_io_flags(kiocb); + crhold(cr); + read = zpl_read_common_iovec(filp->f_mapping->host, iovp, count, + nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); + crfree(cr); + + /* + * If relatime is enabled, call file_accessed() only if + * zfs_relatime_need_update() is true. This is needed since datasets + * with an inherited "relatime" property aren't necessarily mounted + * with the MNT_RELATIME flag (e.g. after `zfs set relatime=...`), + * which is what the VFS relatime test in relatime_need_update() is + * based on.
+ */ + if (!IS_NOATIME(ip) && zfsvfs->z_relatime) { + if (zfs_relatime_need_update(ip)) + file_accessed(filp); + } else { + file_accessed(filp); + } + + return (read); +} + +#if defined(HAVE_VFS_RW_ITERATE) +static ssize_t +zpl_iter_read(struct kiocb *kiocb, struct iov_iter *to) +{ + ssize_t ret; + uio_seg_t seg = UIO_USERSPACE; + if (to->type & ITER_KVEC) + seg = UIO_SYSSPACE; + if (to->type & ITER_BVEC) + seg = UIO_BVEC; + ret = zpl_iter_read_common(kiocb, to->iov, to->nr_segs, + iov_iter_count(to), seg, to->iov_offset); + if (ret > 0) + iov_iter_advance(to, ret); + return (ret); +} +#else +static ssize_t +zpl_aio_read(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, loff_t pos) +{ + ssize_t ret; + size_t count; + + ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_WRITE); + if (ret) + return (ret); + + return (zpl_iter_read_common(kiocb, iovp, nr_segs, count, + UIO_USERSPACE, 0)); +} +#endif /* HAVE_VFS_RW_ITERATE */ + +static ssize_t +zpl_write_common_iovec(struct inode *ip, const struct iovec *iovp, size_t count, + unsigned long nr_segs, loff_t *ppos, uio_seg_t segment, int flags, + cred_t *cr, size_t skip) +{ + ssize_t wrote; + uio_t uio = { { 0 }, 0 }; + int error; + fstrans_cookie_t cookie; + + if (flags & O_APPEND) + *ppos = i_size_read(ip); + + uio.uio_iov = iovp; + uio.uio_iovcnt = nr_segs; + uio.uio_loffset = *ppos; + uio.uio_segflg = segment; + uio.uio_limit = MAXOFFSET_T; + uio.uio_resid = count; + uio.uio_skip = skip; + + cookie = spl_fstrans_mark(); + error = -zfs_write(ip, &uio, flags, cr); + spl_fstrans_unmark(cookie); + if (error < 0) + return (error); + + wrote = count - uio.uio_resid; + *ppos += wrote; + + return (wrote); +} + +inline ssize_t +zpl_write_common(struct inode *ip, const char *buf, size_t len, loff_t *ppos, + uio_seg_t segment, int flags, cred_t *cr) +{ + struct iovec iov; + + iov.iov_base = (void *)buf; + iov.iov_len = len; + + return (zpl_write_common_iovec(ip, &iov, len, 1, ppos, segment, + flags, cr, 0)); +} + +static ssize_t +zpl_iter_write_common(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, size_t count, uio_seg_t seg, size_t skip) +{ + cred_t *cr = CRED(); + struct file *filp = kiocb->ki_filp; + ssize_t wrote; + unsigned int f_flags = filp->f_flags; + + f_flags |= zfs_io_flags(kiocb); + crhold(cr); + wrote = zpl_write_common_iovec(filp->f_mapping->host, iovp, count, + nr_segs, &kiocb->ki_pos, seg, f_flags, cr, skip); + crfree(cr); + + return (wrote); +} + +#if defined(HAVE_VFS_RW_ITERATE) +static ssize_t +zpl_iter_write(struct kiocb *kiocb, struct iov_iter *from) +{ + size_t count; + ssize_t ret; + uio_seg_t seg = UIO_USERSPACE; + +#ifndef HAVE_GENERIC_WRITE_CHECKS_KIOCB + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); + + count = iov_iter_count(from); + ret = generic_write_checks(file, &kiocb->ki_pos, &count, isblk); + if (ret) + return (ret); +#else + /* + * XXX - ideally this check should be in the same lock region as the + * write operations, so that there's no TOCTTOU race when we are + * doing an append and someone else grows the file.
+ */ + ret = generic_write_checks(kiocb, from); + if (ret <= 0) + return (ret); + count = ret; +#endif + + if (from->type & ITER_KVEC) + seg = UIO_SYSSPACE; + if (from->type & ITER_BVEC) + seg = UIO_BVEC; + + ret = zpl_iter_write_common(kiocb, from->iov, from->nr_segs, + count, seg, from->iov_offset); + if (ret > 0) + iov_iter_advance(from, ret); + + return (ret); +} +#else +static ssize_t +zpl_aio_write(struct kiocb *kiocb, const struct iovec *iovp, + unsigned long nr_segs, loff_t pos) +{ + struct file *file = kiocb->ki_filp; + struct address_space *mapping = file->f_mapping; + struct inode *ip = mapping->host; + int isblk = S_ISBLK(ip->i_mode); + size_t count; + ssize_t ret; + + ret = generic_segment_checks(iovp, &nr_segs, &count, VERIFY_READ); + if (ret) + return (ret); + + ret = generic_write_checks(file, &pos, &count, isblk); + if (ret) + return (ret); + + return (zpl_iter_write_common(kiocb, iovp, nr_segs, count, + UIO_USERSPACE, 0)); +} +#endif /* HAVE_VFS_RW_ITERATE */ + +#if defined(HAVE_VFS_RW_ITERATE) +static ssize_t +zpl_direct_IO_impl(int rw, struct kiocb *kiocb, struct iov_iter *iter) +{ + if (rw == WRITE) + return (zpl_iter_write(kiocb, iter)); + else + return (zpl_iter_read(kiocb, iter)); +} +#if defined(HAVE_VFS_DIRECT_IO_ITER) +static ssize_t +zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) +{ + return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); +} +#elif defined(HAVE_VFS_DIRECT_IO_ITER_OFFSET) +static ssize_t +zpl_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + ASSERT3S(pos, ==, kiocb->ki_pos); + return (zpl_direct_IO_impl(iov_iter_rw(iter), kiocb, iter)); +} +#elif defined(HAVE_VFS_DIRECT_IO_ITER_RW_OFFSET) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t pos) +{ + ASSERT3S(pos, ==, kiocb->ki_pos); + return (zpl_direct_IO_impl(rw, kiocb, iter)); +} +#else +#error "Unknown direct IO interface" +#endif + +#else + +#if defined(HAVE_VFS_DIRECT_IO_IOVEC) +static ssize_t +zpl_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iovp, + loff_t pos, unsigned long nr_segs) +{ + if (rw == WRITE) + return (zpl_aio_write(kiocb, iovp, nr_segs, pos)); + else + return (zpl_aio_read(kiocb, iovp, nr_segs, pos)); +} +#else +#error "Unknown direct IO interface" +#endif + +#endif /* HAVE_VFS_RW_ITERATE */ + +static loff_t +zpl_llseek(struct file *filp, loff_t offset, int whence) +{ +#if defined(SEEK_HOLE) && defined(SEEK_DATA) + fstrans_cookie_t cookie; + + if (whence == SEEK_DATA || whence == SEEK_HOLE) { + struct inode *ip = filp->f_mapping->host; + loff_t maxbytes = ip->i_sb->s_maxbytes; + loff_t error; + + spl_inode_lock_shared(ip); + cookie = spl_fstrans_mark(); + error = -zfs_holey(ip, whence, &offset); + spl_fstrans_unmark(cookie); + if (error == 0) + error = lseek_execute(filp, ip, offset, maxbytes); + spl_inode_unlock_shared(ip); + + return (error); + } +#endif /* SEEK_HOLE && SEEK_DATA */ + + return (generic_file_llseek(filp, offset, whence)); +} + +/* + * It's worth taking a moment to describe how mmap is implemented + * for zfs because it differs considerably from other Linux filesystems. + * However, this issue is handled the same way under OpenSolaris. + * + * The issue is that by design zfs bypasses the Linux page cache and + * leaves all caching up to the ARC. This has been shown to work + * well for the common read(2)/write(2) case. However, mmap(2) + * is a problem because it relies on being tightly integrated with the + * page cache.
To handle this we cache mmap'ed files twice, once in + * the ARC and a second time in the page cache. The code is careful + * to keep both copies synchronized. + * + * When a file with an mmap'ed region is written to using write(2) + * both the data in the ARC and existing pages in the page cache + * are updated. For a read(2) data will be read first from the page + * cache then the ARC if needed. Neither a write(2) nor a read(2) + * will ever result in new pages being added to the page cache. + * + * New pages are added to the page cache only via .readpage() which + * is called when the vfs needs to read a page off disk to back the + * virtual memory region. These pages may be modified without + * notifying the ARC and will be written out periodically via + * .writepage(). This will occur due to either a sync or the usual + * page aging behavior. Note that because a read(2) of an mmap'ed + * file will always check the page cache first, correct data will + * still be returned even when the ARC is out of date. + * + * While this implementation ensures correct behavior it does + * have some drawbacks, the most obvious of which is that it + * increases the required memory footprint when accessing mmap'ed + * files. It also adds additional complexity to the code keeping + * both caches synchronized. + * + * Longer term it may be possible to cleanly resolve this wart by + * mapping page cache pages directly onto the ARC buffers. The + * Linux address space operations are flexible enough to allow + * selection of which pages back a particular index. The trick + * would be working out the details of which subsystem is in + * charge, the ARC, the page cache, or both. It may also prove + * helpful to move the ARC buffers to scatter-gather lists + * rather than a vmalloc'ed region. + */ +static int +zpl_mmap(struct file *filp, struct vm_area_struct *vma) +{ + struct inode *ip = filp->f_mapping->host; + znode_t *zp = ITOZ(ip); + int error; + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + error = -zfs_map(ip, vma->vm_pgoff, (caddr_t *)vma->vm_start, + (size_t)(vma->vm_end - vma->vm_start), vma->vm_flags); + spl_fstrans_unmark(cookie); + if (error) + return (error); + + error = generic_file_mmap(filp, vma); + if (error) + return (error); + + mutex_enter(&zp->z_lock); + zp->z_is_mapped = B_TRUE; + mutex_exit(&zp->z_lock); + + return (error); +} + +/* + * Populate a page with data for the Linux page cache. This function is + * only used to support mmap(2). There will be an identical copy of the + * data in the ARC which is kept up to date via .write() and .writepage(). + * + * Currently this function relies on zpl_read_common() and the O_DIRECT + * flag to read in a page. This works, but the more correct way is to + * update zfs_fillpage() to be Linux friendly and use that interface. + */ +static int +zpl_readpage(struct file *filp, struct page *pp) +{ + struct inode *ip; + struct page *pl[1]; + int error = 0; + fstrans_cookie_t cookie; + + ASSERT(PageLocked(pp)); + ip = pp->mapping->host; + pl[0] = pp; + + cookie = spl_fstrans_mark(); + error = -zfs_getpage(ip, pl, 1); + spl_fstrans_unmark(cookie); + + if (error) { + SetPageError(pp); + ClearPageUptodate(pp); + } else { + ClearPageError(pp); + SetPageUptodate(pp); + flush_dcache_page(pp); + } + + unlock_page(pp); + return (error); +} + +/* + * Populate a set of pages with data for the Linux page cache. This + * function will only be called for read ahead and never for demand + * paging.
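From userspace, the double caching described above is invisible: a MAP_SHARED mapping and ordinary read(2)/write(2) remain coherent because both caches are kept synchronized. A small sketch exercising that guarantee on a hypothetical file, with error handling reduced to asserts:

#include <assert.h>
#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tank/fs/demo", O_RDWR | O_CREAT | O_TRUNC, 0644);
	char *map;

	assert(fd >= 0);
	assert(write(fd, "old", 3) == 3);	/* data lands in the ARC */

	map = mmap(NULL, 3, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	assert(map != MAP_FAILED);

	/* write(2) updates the ARC and any existing page cache pages. */
	assert(pwrite(fd, "new", 3, 0) == 3);
	assert(memcmp(map, "new", 3) == 0);

	/* A store through the mapping reaches the ARC via .writepage(). */
	map[0] = 'N';
	assert(msync(map, 3, MS_SYNC) == 0);

	munmap(map, 3);
	close(fd);
	return (0);
}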
For simplicity, the code relies on read_cache_pages() to + * correctly lock each page for IO and call zpl_readpage(). + */ +static int +zpl_readpages(struct file *filp, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return (read_cache_pages(mapping, pages, + (filler_t *)zpl_readpage, filp)); +} + +static int +zpl_putpage(struct page *pp, struct writeback_control *wbc, void *data) +{ + struct address_space *mapping = data; + fstrans_cookie_t cookie; + + ASSERT(PageLocked(pp)); + ASSERT(!PageWriteback(pp)); + + cookie = spl_fstrans_mark(); + (void) zfs_putpage(mapping->host, pp, wbc); + spl_fstrans_unmark(cookie); + + return (0); +} + +static int +zpl_writepages(struct address_space *mapping, struct writeback_control *wbc) +{ + znode_t *zp = ITOZ(mapping->host); + zfsvfs_t *zfsvfs = ITOZSB(mapping->host); + enum writeback_sync_modes sync_mode; + int result; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) + wbc->sync_mode = WB_SYNC_ALL; + ZFS_EXIT(zfsvfs); + sync_mode = wbc->sync_mode; + + /* + * We don't want to run write_cache_pages() in SYNC mode here, because + * that would make putpage() wait for a single page to be committed to + * disk every single time, resulting in atrocious performance. Instead + * we run it once in non-SYNC mode so that the ZIL gets all the data, + * and then we commit it all in one go. + */ + wbc->sync_mode = WB_SYNC_NONE; + result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + if (sync_mode != wbc->sync_mode) { + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + if (zfsvfs->z_log != NULL) + zil_commit(zfsvfs->z_log, zp->z_id); + ZFS_EXIT(zfsvfs); + + /* + * We need to call write_cache_pages() again (we can't just + * return after the commit) because the previous call in + * non-SYNC mode does not guarantee that we got all the dirty + * pages (see the implementation of write_cache_pages() for + * details). That being said, this is a no-op in most cases. + */ + wbc->sync_mode = sync_mode; + result = write_cache_pages(mapping, wbc, zpl_putpage, mapping); + } + return (result); +} + +/* + * Write out dirty pages to the ARC, this function is only required to + * support mmap(2). Mapped pages may be dirtied by memory operations + * which never call .write(). These dirty pages are kept in sync with + * the ARC buffers via this hook. + */ +static int +zpl_writepage(struct page *pp, struct writeback_control *wbc) +{ + if (ITOZSB(pp->mapping->host)->z_os->os_sync == ZFS_SYNC_ALWAYS) + wbc->sync_mode = WB_SYNC_ALL; + + return (zpl_putpage(pp, wbc, pp->mapping)); +} + +/* + * The flag combination which matches the behavior of zfs_space() is + * FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE. The FALLOC_FL_PUNCH_HOLE + * flag was introduced in the 2.6.38 kernel. + * + * The original mode=0 (allocate space) behavior can be reasonably emulated + * by checking if enough space exists and creating a sparse file, as real + * persistent space reservation is not possible due to COW, snapshots, etc. 
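For reference, here is what the two supported cases look like from userspace. Plain mode-0 preallocation is emulated by the capacity check and zfs_freesp() call in zpl_fallocate_common() below, while hole punching maps to zfs_space(); note that Linux requires FALLOC_FL_PUNCH_HOLE to be paired with FALLOC_FL_KEEP_SIZE. The path is hypothetical:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tank/fs/data", O_RDWR | O_CREAT, 0644);

	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* mode 0: capacity check plus a sparse size extension */
	if (fallocate(fd, 0, 0, 1 << 20) != 0)
		perror("fallocate(preallocate)");

	/* free the first 4 KiB without changing the file size */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
	    0, 4096) != 0)
		perror("fallocate(punch)");

	close(fd);
	return (0);
}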
+ */ +static long +zpl_fallocate_common(struct inode *ip, int mode, loff_t offset, loff_t len) +{ + cred_t *cr = CRED(); + loff_t olen; + fstrans_cookie_t cookie; + int error = 0; + + if ((mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) != 0) + return (-EOPNOTSUPP); + + if (offset < 0 || len <= 0) + return (-EINVAL); + + spl_inode_lock(ip); + olen = i_size_read(ip); + + crhold(cr); + cookie = spl_fstrans_mark(); + if (mode & FALLOC_FL_PUNCH_HOLE) { + flock64_t bf; + + if (offset > olen) + goto out_unmark; + + if (offset + len > olen) + len = olen - offset; + bf.l_type = F_WRLCK; + bf.l_whence = SEEK_SET; + bf.l_start = offset; + bf.l_len = len; + bf.l_pid = 0; + + error = -zfs_space(ITOZ(ip), F_FREESP, &bf, O_RDWR, offset, cr); + } else if ((mode & ~FALLOC_FL_KEEP_SIZE) == 0) { + unsigned int percent = zfs_fallocate_reserve_percent; + struct kstatfs statfs; + + /* Legacy mode, disable fallocate compatibility. */ + if (percent == 0) { + error = -EOPNOTSUPP; + goto out_unmark; + } + + /* + * Use zfs_statvfs() instead of dmu_objset_space() since it + * also checks project quota limits, which are relevant here. + */ + error = zfs_statvfs(ip, &statfs); + if (error) + goto out_unmark; + + /* + * Shrink available space a bit to account for overhead/races. + * We know the product previously fit into availbytes from + * dmu_objset_space(), so the smaller product will also fit. + */ + if (len > statfs.f_bavail * (statfs.f_bsize * 100 / percent)) { + error = -ENOSPC; + goto out_unmark; + } + if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > olen) + error = zfs_freesp(ITOZ(ip), offset + len, 0, 0, FALSE); + } +out_unmark: + spl_fstrans_unmark(cookie); + spl_inode_unlock(ip); + + crfree(cr); + + return (error); +} + +static long +zpl_fallocate(struct file *filp, int mode, loff_t offset, loff_t len) +{ + return zpl_fallocate_common(file_inode(filp), + mode, offset, len); +} + +#define ZFS_FL_USER_VISIBLE (FS_FL_USER_VISIBLE | ZFS_PROJINHERIT_FL) +#define ZFS_FL_USER_MODIFIABLE (FS_FL_USER_MODIFIABLE | ZFS_PROJINHERIT_FL) + +static uint32_t +__zpl_ioctl_getflags(struct inode *ip) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + uint32_t ioctl_flags = 0; + + if (zfs_flags & ZFS_IMMUTABLE) + ioctl_flags |= FS_IMMUTABLE_FL; + + if (zfs_flags & ZFS_APPENDONLY) + ioctl_flags |= FS_APPEND_FL; + + if (zfs_flags & ZFS_NODUMP) + ioctl_flags |= FS_NODUMP_FL; + + if (zfs_flags & ZFS_PROJINHERIT) + ioctl_flags |= ZFS_PROJINHERIT_FL; + + return (ioctl_flags & ZFS_FL_USER_VISIBLE); +} + +/* + * Map zfs file z_pflags (xvattr_t) to linux file attributes. Only file + * attributes common to both Linux and Solaris are mapped. + */ +static int +zpl_ioctl_getflags(struct file *filp, void __user *arg) +{ + uint32_t flags; + int err; + + flags = __zpl_ioctl_getflags(file_inode(filp)); + err = copy_to_user(arg, &flags, sizeof (flags)); + + return (err); +} + +/* + * fchange() is a helper macro to detect if we have been asked to change a + * flag. This is ugly, but the requirement that we do this is a consequence of + * how the Linux file attribute interface was designed. Another consequence is + * that concurrent modification of files suffers from a TOCTOU race. Neither + * are things we can fix without modifying the kernel-userland interface, which + * is outside of our jurisdiction. 
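The flag words being reconciled here are the same ones chattr(1) and lsattr(1) manipulate. A minimal sketch of the userspace side, showing the read-modify-write pattern whose TOCTOU race the comment above refers to; the path is hypothetical, and setting FS_APPEND_FL or FS_IMMUTABLE_FL requires CAP_LINUX_IMMUTABLE, as enforced in __zpl_ioctl_setflags() below:

#include <fcntl.h>
#include <linux/fs.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int
main(void)
{
	unsigned int flags;
	int fd = open("/tank/fs/file", O_RDONLY);

	if (fd < 0) {
		perror("open");
		return (1);
	}

	/* Served by zpl_ioctl_getflags(). */
	if (ioctl(fd, FS_IOC_GETFLAGS, &flags) != 0) {
		perror("FS_IOC_GETFLAGS");
		close(fd);
		return (1);
	}

	/*
	 * Read-modify-write: nothing prevents another thread from
	 * changing the flags between these two calls, which is the
	 * TOCTOU race noted above.
	 */
	flags |= FS_APPEND_FL;
	if (ioctl(fd, FS_IOC_SETFLAGS, &flags) != 0)
		perror("FS_IOC_SETFLAGS");

	close(fd);
	return (0);
}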
+ */ + +#define fchange(f0, f1, b0, b1) (!((f0) & (b0)) != !((f1) & (b1))) + +static int +__zpl_ioctl_setflags(struct inode *ip, uint32_t ioctl_flags, xvattr_t *xva) +{ + uint64_t zfs_flags = ITOZ(ip)->z_pflags; + xoptattr_t *xoap; + + if (ioctl_flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | + ZFS_PROJINHERIT_FL)) + return (-EOPNOTSUPP); + + if (ioctl_flags & ~ZFS_FL_USER_MODIFIABLE) + return (-EACCES); + + if ((fchange(ioctl_flags, zfs_flags, FS_IMMUTABLE_FL, ZFS_IMMUTABLE) || + fchange(ioctl_flags, zfs_flags, FS_APPEND_FL, ZFS_APPENDONLY)) && + !capable(CAP_LINUX_IMMUTABLE)) + return (-EACCES); + + if (!inode_owner_or_capable(ip)) + return (-EACCES); + + xva_init(xva); + xoap = xva_getxoptattr(xva); + + XVA_SET_REQ(xva, XAT_IMMUTABLE); + if (ioctl_flags & FS_IMMUTABLE_FL) + xoap->xoa_immutable = B_TRUE; + + XVA_SET_REQ(xva, XAT_APPENDONLY); + if (ioctl_flags & FS_APPEND_FL) + xoap->xoa_appendonly = B_TRUE; + + XVA_SET_REQ(xva, XAT_NODUMP); + if (ioctl_flags & FS_NODUMP_FL) + xoap->xoa_nodump = B_TRUE; + + XVA_SET_REQ(xva, XAT_PROJINHERIT); + if (ioctl_flags & ZFS_PROJINHERIT_FL) + xoap->xoa_projinherit = B_TRUE; + + return (0); +} + +static int +zpl_ioctl_setflags(struct file *filp, void __user *arg) +{ + struct inode *ip = file_inode(filp); + uint32_t flags; + cred_t *cr = CRED(); + xvattr_t xva; + int err; + fstrans_cookie_t cookie; + + if (copy_from_user(&flags, arg, sizeof (flags))) + return (-EFAULT); + + err = __zpl_ioctl_setflags(ip, flags, &xva); + if (err) + return (err); + + crhold(cr); + cookie = spl_fstrans_mark(); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (err); +} + +static int +zpl_ioctl_getxattr(struct file *filp, void __user *arg) +{ + zfsxattr_t fsx = { 0 }; + struct inode *ip = file_inode(filp); + int err; + + fsx.fsx_xflags = __zpl_ioctl_getflags(ip); + fsx.fsx_projid = ITOZ(ip)->z_projid; + err = copy_to_user(arg, &fsx, sizeof (fsx)); + + return (err); +} + +static int +zpl_ioctl_setxattr(struct file *filp, void __user *arg) +{ + struct inode *ip = file_inode(filp); + zfsxattr_t fsx; + cred_t *cr = CRED(); + xvattr_t xva; + xoptattr_t *xoap; + int err; + fstrans_cookie_t cookie; + + if (copy_from_user(&fsx, arg, sizeof (fsx))) + return (-EFAULT); + + if (!zpl_is_valid_projid(fsx.fsx_projid)) + return (-EINVAL); + + err = __zpl_ioctl_setflags(ip, fsx.fsx_xflags, &xva); + if (err) + return (err); + + xoap = xva_getxoptattr(&xva); + XVA_SET_REQ(&xva, XAT_PROJID); + xoap->xoa_projid = fsx.fsx_projid; + + crhold(cr); + cookie = spl_fstrans_mark(); + err = -zfs_setattr(ITOZ(ip), (vattr_t *)&xva, 0, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (err); +} + +static long +zpl_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC_GETFLAGS: + return (zpl_ioctl_getflags(filp, (void *)arg)); + case FS_IOC_SETFLAGS: + return (zpl_ioctl_setflags(filp, (void *)arg)); + case ZFS_IOC_FSGETXATTR: + return (zpl_ioctl_getxattr(filp, (void *)arg)); + case ZFS_IOC_FSSETXATTR: + return (zpl_ioctl_setxattr(filp, (void *)arg)); + default: + return (-ENOTTY); + } +} + +#ifdef CONFIG_COMPAT +static long +zpl_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case FS_IOC32_GETFLAGS: + cmd = FS_IOC_GETFLAGS; + break; + case FS_IOC32_SETFLAGS: + cmd = FS_IOC_SETFLAGS; + break; + default: + return (-ENOTTY); + } + return (zpl_ioctl(filp, cmd, (unsigned long)compat_ptr(arg))); +} +#endif /* CONFIG_COMPAT */ + + +const struct 
address_space_operations zpl_address_space_operations = { + .readpages = zpl_readpages, + .readpage = zpl_readpage, + .writepage = zpl_writepage, + .writepages = zpl_writepages, + .direct_IO = zpl_direct_IO, +}; + +const struct file_operations zpl_file_operations = { + .open = zpl_open, + .release = zpl_release, + .llseek = zpl_llseek, +#ifdef HAVE_VFS_RW_ITERATE +#ifdef HAVE_NEW_SYNC_READ + .read = new_sync_read, + .write = new_sync_write, +#endif + .read_iter = zpl_iter_read, + .write_iter = zpl_iter_write, +#else + .read = do_sync_read, + .write = do_sync_write, + .aio_read = zpl_aio_read, + .aio_write = zpl_aio_write, +#endif + .mmap = zpl_mmap, + .fsync = zpl_fsync, +#ifdef HAVE_FILE_AIO_FSYNC + .aio_fsync = zpl_aio_fsync, +#endif + .fallocate = zpl_fallocate, + .unlocked_ioctl = zpl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = zpl_compat_ioctl, +#endif +}; + +const struct file_operations zpl_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, +#if defined(HAVE_VFS_ITERATE_SHARED) + .iterate_shared = zpl_iterate, +#elif defined(HAVE_VFS_ITERATE) + .iterate = zpl_iterate, +#else + .readdir = zpl_readdir, +#endif + .fsync = zpl_fsync, + .unlocked_ioctl = zpl_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = zpl_compat_ioctl, +#endif +}; + +/* BEGIN CSTYLED */ +module_param(zfs_fallocate_reserve_percent, uint, 0644); +MODULE_PARM_DESC(zfs_fallocate_reserve_percent, + "Percentage of length to use for the available capacity check"); +/* END CSTYLED */ diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c new file mode 100644 index 000000000000..f3b97a22074c --- /dev/null +++ b/module/os/linux/zfs/zpl_inode.c @@ -0,0 +1,747 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include + + +static struct dentry * +zpl_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) +{ + cred_t *cr = CRED(); + struct inode *ip; + znode_t *zp; + int error; + fstrans_cookie_t cookie; + pathname_t *ppn = NULL; + pathname_t pn; + int zfs_flags = 0; + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + + if (dlen(dentry) >= ZAP_MAXNAMELEN) + return (ERR_PTR(-ENAMETOOLONG)); + + crhold(cr); + cookie = spl_fstrans_mark(); + + /* If we are a case insensitive fs, we need the real name */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { + zfs_flags = FIGNORECASE; + pn_alloc(&pn); + ppn = &pn; + } + + error = -zfs_lookup(ITOZ(dir), dname(dentry), &zp, + zfs_flags, cr, NULL, ppn); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + crfree(cr); + + spin_lock(&dentry->d_lock); + dentry->d_time = jiffies; + spin_unlock(&dentry->d_lock); + + if (error) { + /* + * If we have a case insensitive fs, we do not want to + * insert negative entries, so return NULL for ENOENT. + * Fall through if the error is not ENOENT. Also free memory. + */ + if (ppn) { + pn_free(ppn); + if (error == -ENOENT) + return (NULL); + } + + if (error == -ENOENT) + return (d_splice_alias(NULL, dentry)); + else + return (ERR_PTR(error)); + } + ip = ZTOI(zp); + + /* + * If we are case insensitive, call the correct function + * to install the name. + */ + if (ppn) { + struct dentry *new_dentry; + struct qstr ci_name; + + if (strcmp(dname(dentry), pn.pn_buf) == 0) { + new_dentry = d_splice_alias(ip, dentry); + } else { + ci_name.name = pn.pn_buf; + ci_name.len = strlen(pn.pn_buf); + new_dentry = d_add_ci(dentry, ip, &ci_name); + } + pn_free(ppn); + return (new_dentry); + } else { + return (d_splice_alias(ip, dentry)); + } +} + +void +zpl_vap_init(vattr_t *vap, struct inode *dir, umode_t mode, cred_t *cr) +{ + vap->va_mask = ATTR_MODE; + vap->va_mode = mode; + vap->va_uid = crgetfsuid(cr); + + if (dir && dir->i_mode & S_ISGID) { + vap->va_gid = KGID_TO_SGID(dir->i_gid); + if (S_ISDIR(mode)) + vap->va_mode |= S_ISGID; + } else { + vap->va_gid = crgetfsgid(cr); + } +} + +static int +zpl_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool flag) +{ + cred_t *cr = CRED(); + znode_t *zp; + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, + mode, &zp, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ZTOI(zp)); + + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ZTOI(zp), dir); + + if (error) + (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + cred_t *cr = CRED(); + znode_t *zp; + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + /* + * We currently expect Linux to supply rdev=0 for all sockets + * and fifos, but we want to know if this behavior ever changes.
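Seen from userspace, the expectation is simply that the dev argument to mknod(2) is zero for FIFOs and sockets, while character and block specials carry a real device number in rdev. A short sketch; the paths are hypothetical, and creating device nodes requires CAP_MKNOD:

#include <stdio.h>
#include <sys/stat.h>
#include <sys/sysmacros.h>

int
main(void)
{
	/* FIFO: the VFS hands zpl_mknod() rdev == 0. */
	if (mknod("/tank/fs/fifo", S_IFIFO | 0600, 0) != 0)
		perror("mknod(fifo)");

	/* Character device: rdev carries the device number (here 1,3). */
	if (mknod("/tank/fs/null", S_IFCHR | 0600, makedev(1, 3)) != 0)
		perror("mknod(chrdev)");

	return (0);
}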
+ */ + if (S_ISSOCK(mode) || S_ISFIFO(mode)) + ASSERT(rdev == 0); + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode, cr); + vap->va_rdev = rdev; + + cookie = spl_fstrans_mark(); + error = -zfs_create(ITOZ(dir), dname(dentry), vap, 0, + mode, &zp, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ZTOI(zp)); + + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ZTOI(zp), dir); + + if (error) + (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifdef HAVE_TMPFILE +static int +zpl_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + cred_t *cr = CRED(); + struct inode *ip; + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + /* + * The VFS does not apply the umask, therefore it is applied here + * when POSIX ACLs are not enabled. + */ + if (!IS_POSIXACL(dir)) + mode &= ~current_umask(); + zpl_vap_init(vap, dir, mode, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_tmpfile(dir, vap, 0, mode, &ip, cr, 0, NULL); + if (error == 0) { + /* d_tmpfile will do drop_nlink, so we should set it first */ + set_nlink(ip, 1); + d_tmpfile(dentry, ip); + + error = zpl_xattr_security_init(ip, dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ip, dir); + /* + * We don't need to handle the error here; the file is + * already on the unlinked set. + */ + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} +#endif + +static int +zpl_unlink(struct inode *dir, struct dentry *dentry) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + + /* + * For a CI FS we must invalidate the dentry to prevent the + * creation of negative entries. + */ + if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) + d_invalidate(dentry); + + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + cred_t *cr = CRED(); + vattr_t *vap; + znode_t *zp; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, mode | S_IFDIR, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_mkdir(ITOZ(dir), dname(dentry), vap, &zp, cr, 0, NULL); + if (error == 0) { + d_instantiate(dentry, ZTOI(zp)); + + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); + if (error == 0) + error = zpl_init_acl(ZTOI(zp), dir); + + if (error) + (void) zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_rmdir(struct inode *dir, struct dentry *dentry) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_rmdir(ITOZ(dir), dname(dentry), NULL, cr, 0); + + /* + * For a CI FS we must invalidate the dentry to prevent the + * creation of negative entries.
+ */ + if (error == 0 && zfsvfs->z_case == ZFS_CASE_INSENSITIVE) + d_invalidate(dentry); + + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_getattr_impl(const struct path *path, struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + int error; + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + + /* + * XXX request_mask and query_flags currently ignored. + */ + + error = -zfs_getattr_fast(path->dentry->d_inode, stat); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + return (error); +} +ZPL_GETATTR_WRAPPER(zpl_getattr); + +static int +zpl_setattr(struct dentry *dentry, struct iattr *ia) +{ + struct inode *ip = dentry->d_inode; + cred_t *cr = CRED(); + vattr_t *vap; + int error; + fstrans_cookie_t cookie; + + error = setattr_prepare(dentry, ia); + if (error) + return (error); + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + vap->va_mask = ia->ia_valid & ATTR_IATTR_MASK; + vap->va_mode = ia->ia_mode; + vap->va_uid = KUID_TO_SUID(ia->ia_uid); + vap->va_gid = KGID_TO_SGID(ia->ia_gid); + vap->va_size = ia->ia_size; + vap->va_atime = ia->ia_atime; + vap->va_mtime = ia->ia_mtime; + vap->va_ctime = ia->ia_ctime; + + if (vap->va_mask & ATTR_ATIME) + ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); + + cookie = spl_fstrans_mark(); + error = -zfs_setattr(ITOZ(ip), vap, 0, cr); + if (!error && (ia->ia_valid & ATTR_MODE)) + error = zpl_chmod_acl(ip); + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_rename2(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry, unsigned int flags) +{ + cred_t *cr = CRED(); + int error; + fstrans_cookie_t cookie; + + /* We don't have renameat2(2) support */ + if (flags) + return (-EINVAL); + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_rename(ITOZ(sdip), dname(sdentry), ITOZ(tdip), + dname(tdentry), cr, 0); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#ifndef HAVE_RENAME_WANTS_FLAGS +static int +zpl_rename(struct inode *sdip, struct dentry *sdentry, + struct inode *tdip, struct dentry *tdentry) +{ + return (zpl_rename2(sdip, sdentry, tdip, tdentry, 0)); +} +#endif + +static int +zpl_symlink(struct inode *dir, struct dentry *dentry, const char *name) +{ + cred_t *cr = CRED(); + vattr_t *vap; + znode_t *zp; + int error; + fstrans_cookie_t cookie; + + crhold(cr); + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + zpl_vap_init(vap, dir, S_IFLNK | S_IRWXUGO, cr); + + cookie = spl_fstrans_mark(); + error = -zfs_symlink(ITOZ(dir), dname(dentry), vap, + (char *)name, &zp, cr, 0); + if (error == 0) { + d_instantiate(dentry, ZTOI(zp)); + + error = zpl_xattr_security_init(ZTOI(zp), dir, &dentry->d_name); + if (error) + (void) zfs_remove(ITOZ(dir), dname(dentry), cr, 0); + } + + spl_fstrans_unmark(cookie); + kmem_free(vap, sizeof (vattr_t)); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +#if defined(HAVE_PUT_LINK_COOKIE) +static void +zpl_put_link(struct inode *unused, void *cookie) +{ + kmem_free(cookie, MAXPATHLEN); +} +#elif defined(HAVE_PUT_LINK_NAMEIDATA) +static void +zpl_put_link(struct dentry *dentry, struct nameidata *nd, void *ptr) +{ + const char *link = nd_get_link(nd); + + if (!IS_ERR(link)) + kmem_free(link, MAXPATHLEN); +} +#elif defined(HAVE_PUT_LINK_DELAYED) +static void +zpl_put_link(void *ptr) +{ + kmem_free(ptr, 
MAXPATHLEN); +} +#endif + +static int +zpl_get_link_common(struct dentry *dentry, struct inode *ip, char **link) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + struct iovec iov; + uio_t uio = { { 0 }, 0 }; + int error; + + crhold(cr); + *link = NULL; + iov.iov_len = MAXPATHLEN; + iov.iov_base = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_segflg = UIO_SYSSPACE; + uio.uio_resid = (MAXPATHLEN - 1); + + cookie = spl_fstrans_mark(); + error = -zfs_readlink(ip, &uio, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + + if (error) + kmem_free(iov.iov_base, MAXPATHLEN); + else + *link = iov.iov_base; + + return (error); +} + +#if defined(HAVE_GET_LINK_DELAYED) +static const char * +zpl_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *done) +{ + char *link = NULL; + int error; + + if (!dentry) + return (ERR_PTR(-ECHILD)); + + error = zpl_get_link_common(dentry, inode, &link); + if (error) + return (ERR_PTR(error)); + + set_delayed_call(done, zpl_put_link, link); + + return (link); +} +#elif defined(HAVE_GET_LINK_COOKIE) +static const char * +zpl_get_link(struct dentry *dentry, struct inode *inode, void **cookie) +{ + char *link = NULL; + int error; + + if (!dentry) + return (ERR_PTR(-ECHILD)); + + error = zpl_get_link_common(dentry, inode, &link); + if (error) + return (ERR_PTR(error)); + + return (*cookie = link); +} +#elif defined(HAVE_FOLLOW_LINK_COOKIE) +static const char * +zpl_follow_link(struct dentry *dentry, void **cookie) +{ + char *link = NULL; + int error; + + error = zpl_get_link_common(dentry, dentry->d_inode, &link); + if (error) + return (ERR_PTR(error)); + + return (*cookie = link); +} +#elif defined(HAVE_FOLLOW_LINK_NAMEIDATA) +static void * +zpl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + char *link = NULL; + int error; + + error = zpl_get_link_common(dentry, dentry->d_inode, &link); + if (error) + nd_set_link(nd, ERR_PTR(error)); + else + nd_set_link(nd, link); + + return (NULL); +} +#endif + +static int +zpl_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) +{ + cred_t *cr = CRED(); + struct inode *ip = old_dentry->d_inode; + int error; + fstrans_cookie_t cookie; + + if (ip->i_nlink >= ZFS_LINK_MAX) + return (-EMLINK); + + crhold(cr); + ip->i_ctime = current_time(ip); + igrab(ip); /* Use ihold() if available */ + + cookie = spl_fstrans_mark(); + error = -zfs_link(ITOZ(dir), ITOZ(ip), dname(dentry), cr, 0); + if (error) { + iput(ip); + goto out; + } + + d_instantiate(dentry, ip); +out: + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +#ifdef HAVE_D_REVALIDATE_NAMEIDATA +zpl_revalidate(struct dentry *dentry, struct nameidata *nd) +{ + unsigned int flags = (nd ? nd->flags : 0); +#else +zpl_revalidate(struct dentry *dentry, unsigned int flags) +{ +#endif /* HAVE_D_REVALIDATE_NAMEIDATA */ + /* CSTYLED */ + zfsvfs_t *zfsvfs = dentry->d_sb->s_fs_info; + int error; + + if (flags & LOOKUP_RCU) + return (-ECHILD); + + /* + * After a rollback negative dentries created before the rollback + * time must be invalidated. Otherwise they can obscure files which + * are only present in the rolled back dataset. 
+ */ + if (dentry->d_inode == NULL) { + spin_lock(&dentry->d_lock); + error = time_before(dentry->d_time, zfsvfs->z_rollback_time); + spin_unlock(&dentry->d_lock); + + if (error) + return (0); + } + + /* + * The dentry may reference a stale inode if a mounted file system + * was rolled back to a point in time where the object didn't exist. + */ + if (dentry->d_inode && ITOZ(dentry->d_inode)->z_is_stale) + return (0); + + return (1); +} + +const struct inode_operations zpl_inode_operations = { + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif /* HAVE_SET_ACL */ + .get_acl = zpl_get_acl, +#endif /* CONFIG_FS_POSIX_ACL */ +}; + +const struct inode_operations zpl_dir_inode_operations = { + .create = zpl_create, + .lookup = zpl_lookup, + .link = zpl_link, + .unlink = zpl_unlink, + .symlink = zpl_symlink, + .mkdir = zpl_mkdir, + .rmdir = zpl_rmdir, + .mknod = zpl_mknod, +#ifdef HAVE_RENAME_WANTS_FLAGS + .rename = zpl_rename2, +#else + .rename = zpl_rename, +#endif +#ifdef HAVE_TMPFILE + .tmpfile = zpl_tmpfile, +#endif + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif /* HAVE_SET_ACL */ + .get_acl = zpl_get_acl, +#endif /* CONFIG_FS_POSIX_ACL */ +}; + +const struct inode_operations zpl_symlink_inode_operations = { +#ifdef HAVE_GENERIC_READLINK + .readlink = generic_readlink, +#endif +#if defined(HAVE_GET_LINK_DELAYED) || defined(HAVE_GET_LINK_COOKIE) + .get_link = zpl_get_link, +#elif defined(HAVE_FOLLOW_LINK_COOKIE) || defined(HAVE_FOLLOW_LINK_NAMEIDATA) + .follow_link = zpl_follow_link, +#endif +#if defined(HAVE_PUT_LINK_COOKIE) || defined(HAVE_PUT_LINK_NAMEIDATA) + .put_link = zpl_put_link, +#endif + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +}; + +const struct inode_operations zpl_special_inode_operations = { + .setattr = zpl_setattr, + .getattr = zpl_getattr, +#ifdef HAVE_GENERIC_SETXATTR + .setxattr = generic_setxattr, + .getxattr = generic_getxattr, + .removexattr = generic_removexattr, +#endif + .listxattr = zpl_xattr_list, +#if defined(CONFIG_FS_POSIX_ACL) +#if defined(HAVE_SET_ACL) + .set_acl = zpl_set_acl, +#endif /* HAVE_SET_ACL */ + .get_acl = zpl_get_acl, +#endif /* CONFIG_FS_POSIX_ACL */ +}; + +dentry_operations_t zpl_dentry_operations = { + .d_revalidate = zpl_revalidate, +}; diff --git a/module/os/linux/zfs/zpl_super.c b/module/os/linux/zfs/zpl_super.c new file mode 100644 index 000000000000..75adff51782e --- /dev/null +++ b/module/os/linux/zfs/zpl_super.c @@ -0,0 +1,326 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + */ + + +#include +#include +#include +#include +#include + + +static struct inode * +zpl_inode_alloc(struct super_block *sb) +{ + struct inode *ip; + + VERIFY3S(zfs_inode_alloc(sb, &ip), ==, 0); + inode_set_iversion(ip, 1); + + return (ip); +} + +static void +zpl_inode_destroy(struct inode *ip) +{ + ASSERT(atomic_read(&ip->i_count) == 0); + zfs_inode_destroy(ip); +} + +/* + * Called from __mark_inode_dirty() to reflect that something in the + * inode has changed. We use it to ensure the znode system attributes + * are always strictly up to date with respect to the inode. + */ +#ifdef HAVE_DIRTY_INODE_WITH_FLAGS +static void +zpl_dirty_inode(struct inode *ip, int flags) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + zfs_dirty_inode(ip, flags); + spl_fstrans_unmark(cookie); +} +#else +static void +zpl_dirty_inode(struct inode *ip) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + zfs_dirty_inode(ip, 0); + spl_fstrans_unmark(cookie); +} +#endif /* HAVE_DIRTY_INODE_WITH_FLAGS */ + +/* + * When ->drop_inode() is called its return value indicates if the + * inode should be evicted from the inode cache. If the inode is + * unhashed and has no links the default policy is to evict it + * immediately. + * + * The ->evict_inode() callback must minimally truncate the inode pages, + * and call clear_inode(). For 2.6.35 and later kernels this will + * simply update the inode state, with the sync occurring before the + * truncate in evict(). For earlier kernels clear_inode() maps to + * end_writeback() which is responsible for completing all outstanding + * write back. In either case, once this is done it is safe to cleanup + * any remaining inode specific data via zfs_inactive(). + */ +static void +zpl_evict_inode(struct inode *ip) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + truncate_setsize(ip, 0); + clear_inode(ip); + zfs_inactive(ip); + spl_fstrans_unmark(cookie); +} + +static void +zpl_put_super(struct super_block *sb) +{ + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_umount(sb); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); +} + +static int +zpl_sync_fs(struct super_block *sb, int wait) +{ + fstrans_cookie_t cookie; + cred_t *cr = CRED(); + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + error = -zfs_sync(sb, wait, cr); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_statfs(struct dentry *dentry, struct kstatfs *statp) +{ + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_statvfs(dentry->d_inode, statp); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + /* + * If required by a 32-bit system call, dynamically scale the + * block size up to 16MiB and decrease the block counts. This + * allows for a maximum size of 64EiB to be reported.
The file + * counts must be artificially capped at 2^32-1. + */ + if (unlikely(zpl_is_32bit_api())) { + while (statp->f_blocks > UINT32_MAX && + statp->f_bsize < SPA_MAXBLOCKSIZE) { + statp->f_frsize <<= 1; + statp->f_bsize <<= 1; + + statp->f_blocks >>= 1; + statp->f_bfree >>= 1; + statp->f_bavail >>= 1; + } + + uint64_t usedobjs = statp->f_files - statp->f_ffree; + statp->f_ffree = MIN(statp->f_ffree, UINT32_MAX - usedobjs); + statp->f_files = statp->f_ffree + usedobjs; + } + + return (error); +} + +static int +zpl_remount_fs(struct super_block *sb, int *flags, char *data) +{ + zfs_mnt_t zm = { .mnt_osname = NULL, .mnt_data = data }; + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_remount(sb, flags, &zm); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +__zpl_show_options(struct seq_file *seq, zfsvfs_t *zfsvfs) +{ + seq_printf(seq, ",%s", + zfsvfs->z_flags & ZSB_XATTR ? "xattr" : "noxattr"); + +#ifdef CONFIG_FS_POSIX_ACL + switch (zfsvfs->z_acl_type) { + case ZFS_ACLTYPE_POSIXACL: + seq_puts(seq, ",posixacl"); + break; + default: + seq_puts(seq, ",noacl"); + break; + } +#endif /* CONFIG_FS_POSIX_ACL */ + + return (0); +} + +static int +zpl_show_options(struct seq_file *seq, struct dentry *root) +{ + return (__zpl_show_options(seq, root->d_sb->s_fs_info)); +} + +static int +zpl_fill_super(struct super_block *sb, void *data, int silent) +{ + zfs_mnt_t *zm = (zfs_mnt_t *)data; + fstrans_cookie_t cookie; + int error; + + cookie = spl_fstrans_mark(); + error = -zfs_domount(sb, zm, silent); + spl_fstrans_unmark(cookie); + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_test_super(struct super_block *s, void *data) +{ + zfsvfs_t *zfsvfs = s->s_fs_info; + objset_t *os = data; + + if (zfsvfs == NULL) + return (0); + + return (os == zfsvfs->z_os); +} + +static struct super_block * +zpl_mount_impl(struct file_system_type *fs_type, int flags, zfs_mnt_t *zm) +{ + struct super_block *s; + objset_t *os; + int err; + + err = dmu_objset_hold(zm->mnt_osname, FTAG, &os); + if (err) + return (ERR_PTR(-err)); + + /* + * The dsl pool lock must be released prior to calling sget(). + * It is possible sget() may block on the lock in grab_super() + * while deactivate_super() holds that same lock and waits for + * a txg sync. If the dsl_pool lock is held over sget() + * this can prevent the pool sync and cause a deadlock. + */ + dsl_pool_rele(dmu_objset_pool(os), FTAG); + s = sget(fs_type, zpl_test_super, set_anon_super, flags, os); + dsl_dataset_rele(dmu_objset_ds(os), FTAG); + + if (IS_ERR(s)) + return (ERR_CAST(s)); + + if (s->s_root == NULL) { + err = zpl_fill_super(s, zm, flags & SB_SILENT ? 
1 : 0); + if (err) { + deactivate_locked_super(s); + return (ERR_PTR(err)); + } + s->s_flags |= SB_ACTIVE; + } else if ((flags ^ s->s_flags) & SB_RDONLY) { + deactivate_locked_super(s); + return (ERR_PTR(-EBUSY)); + } + + return (s); +} + +static struct dentry * +zpl_mount(struct file_system_type *fs_type, int flags, + const char *osname, void *data) +{ + zfs_mnt_t zm = { .mnt_osname = osname, .mnt_data = data }; + + struct super_block *sb = zpl_mount_impl(fs_type, flags, &zm); + if (IS_ERR(sb)) + return (ERR_CAST(sb)); + + return (dget(sb->s_root)); +} + +static void +zpl_kill_sb(struct super_block *sb) +{ + zfs_preumount(sb); + kill_anon_super(sb); +} + +void +zpl_prune_sb(int64_t nr_to_scan, void *arg) +{ + struct super_block *sb = (struct super_block *)arg; + int objects = 0; + + (void) -zfs_prune(sb, nr_to_scan, &objects); +} + +const struct super_operations zpl_super_operations = { + .alloc_inode = zpl_inode_alloc, + .destroy_inode = zpl_inode_destroy, + .dirty_inode = zpl_dirty_inode, + .write_inode = NULL, + .evict_inode = zpl_evict_inode, + .put_super = zpl_put_super, + .sync_fs = zpl_sync_fs, + .statfs = zpl_statfs, + .remount_fs = zpl_remount_fs, + .show_options = zpl_show_options, + .show_stats = NULL, +}; + +struct file_system_type zpl_fs_type = { + .owner = THIS_MODULE, + .name = ZFS_DRIVER, + .mount = zpl_mount, + .kill_sb = zpl_kill_sb, +}; diff --git a/module/os/linux/zfs/zpl_xattr.c b/module/os/linux/zfs/zpl_xattr.c new file mode 100644 index 000000000000..fa3c036405b0 --- /dev/null +++ b/module/os/linux/zfs/zpl_xattr.c @@ -0,0 +1,1480 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2011, Lawrence Livermore National Security, LLC. + * + * Extended attributes (xattr) on Solaris are implemented as files + * which exist in a hidden xattr directory. These extended attributes + * can be accessed using the attropen() system call which opens + * the extended attribute. It can then be manipulated just like + * a standard file descriptor. This has a couple advantages such + * as practically no size limit on the file, and the extended + * attributes permissions may differ from those of the parent file. + * This interface is really quite clever, but it's also completely + * different than what is supported on Linux. It also comes with a + * steep performance penalty when accessing small xattrs because they + * are not stored with the parent file. + * + * Under Linux extended attributes are manipulated by the system + * calls getxattr(2), setxattr(2), and listxattr(2). They consider + * extended attributes to be name/value pairs where the name is a + * NULL terminated string. 
The name must also include one of the + * following namespace prefixes: + * + * user - No restrictions and is available to user applications. + * trusted - Restricted to kernel and root (CAP_SYS_ADMIN) use. + * system - Used for access control lists (system.nfs4_acl, etc.). + * security - Used by SELinux to store a file's security context. + * + * The value under Linux is limited to 65536 bytes of binary data. + * In practice, individual xattrs tend to be much smaller than this + * and are typically less than 100 bytes. Good examples of this + * are the security.selinux xattrs which are less than 100 bytes and + * exist for every file when xattr labeling is enabled. + * + * The Linux xattr implementation has been written to take advantage of + * this typical usage. When the dataset property 'xattr=sa' is set, + * then xattrs will be preferentially stored as System Attributes (SA). + * This allows tiny xattrs (~100 bytes) to be stored with the dnode and + * up to 64k of xattrs to be stored in the spill block. If additional + * xattr space is required, which is unlikely under Linux, they will + * be stored using the traditional directory approach. + * + * This optimization results in roughly a 3x performance improvement + * when accessing xattrs because it avoids the need to perform a seek + * for every xattr value. When multiple xattrs are stored per-file + * the performance improvements are even greater because all of the + * xattrs stored in the spill block will be cached. + * + * However, by default SA based xattrs are disabled in the Linux port + * to maximize compatibility with other implementations. If you do + * enable SA based xattrs then they will not be visible on platforms + * which do not support this feature. + * + * NOTE: One additional consequence of the xattr directory implementation + * is that when an extended attribute is manipulated an inode is created. + * This inode will exist in the Linux inode cache but there will be no + * associated entry in the dentry cache which references it. This is + * safe but it may result in some confusion. Enabling SA based xattrs + * largely avoids the issue except in the overflow case. + */ + +#include +#include +#include +#include +#include +#include + +typedef struct xattr_filldir { + size_t size; + size_t offset; + char *buf; + struct dentry *dentry; +} xattr_filldir_t; + +static const struct xattr_handler *zpl_xattr_handler(const char *); + +static int +zpl_xattr_permission(xattr_filldir_t *xf, const char *name, int name_len) +{ + static const struct xattr_handler *handler; + struct dentry *d = xf->dentry; + + handler = zpl_xattr_handler(name); + if (!handler) + return (0); + + if (handler->list) { +#if defined(HAVE_XATTR_LIST_SIMPLE) + if (!handler->list(d)) + return (0); +#elif defined(HAVE_XATTR_LIST_DENTRY) + if (!handler->list(d, NULL, 0, name, name_len, 0)) + return (0); +#elif defined(HAVE_XATTR_LIST_HANDLER) + if (!handler->list(handler, d, NULL, 0, name, name_len)) + return (0); +#endif + } + + return (1); +} + +/* + * Determine if a given xattr name should be visible and, if so, copy it + * into the provided buffer (xf->buf). + */ +static int +zpl_xattr_filldir(xattr_filldir_t *xf, const char *name, int name_len) +{ + /* Check permissions using the per-namespace list xattr handler. */ + if (!zpl_xattr_permission(xf, name, name_len)) + return (0); + + /* When xf->buf is NULL only calculate the required size.
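This NULL-buffer mode is what backs the standard two-call listxattr(2) pattern from userspace: query the required size first, then fill. A minimal caller-side sketch (illustrative only, not part of this change; error handling abbreviated):

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/xattr.h>

/*
 * Caller-side view of the size-only mode: listxattr(2) with a zero
 * size returns the space required, and a second call then fills the
 * buffer with consecutive NUL-terminated attribute names.
 */
int
main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : ".";
	ssize_t size, ret;
	char *buf, *name;

	size = listxattr(path, NULL, 0);	/* size query only */
	if (size <= 0)
		return (size < 0 ? 1 : 0);

	if ((buf = malloc(size)) == NULL)
		return (1);

	ret = listxattr(path, buf, size);	/* errno ERANGE if list grew */
	if (ret < 0) {
		free(buf);
		return (1);
	}

	for (name = buf; name < buf + ret; name += strlen(name) + 1)
		printf("%s\n", name);

	free(buf);
	return (0);
}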
*/ + if (xf->buf) { + if (xf->offset + name_len + 1 > xf->size) + return (-ERANGE); + + memcpy(xf->buf + xf->offset, name, name_len); + xf->buf[xf->offset + name_len] = '\0'; + } + + xf->offset += (name_len + 1); + + return (0); +} + +/* + * Read as many directory entry names as will fit in to the provided buffer, + * or when no buffer is provided calculate the required buffer size. + */ +static int +zpl_xattr_readdir(struct inode *dxip, xattr_filldir_t *xf) +{ + zap_cursor_t zc; + zap_attribute_t zap; + int error; + + zap_cursor_init(&zc, ITOZSB(dxip)->z_os, ITOZ(dxip)->z_id); + + while ((error = -zap_cursor_retrieve(&zc, &zap)) == 0) { + + if (zap.za_integer_length != 8 || zap.za_num_integers != 1) { + error = -ENXIO; + break; + } + + error = zpl_xattr_filldir(xf, zap.za_name, strlen(zap.za_name)); + if (error) + break; + + zap_cursor_advance(&zc); + } + + zap_cursor_fini(&zc); + + if (error == -ENOENT) + error = 0; + + return (error); +} + +static ssize_t +zpl_xattr_list_dir(xattr_filldir_t *xf, cred_t *cr) +{ + struct inode *ip = xf->dentry->d_inode; + struct inode *dxip = NULL; + znode_t *dxzp; + int error; + + /* Lookup the xattr directory */ + error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, + cr, NULL, NULL); + if (error) { + if (error == -ENOENT) + error = 0; + + return (error); + } + + dxip = ZTOI(dxzp); + error = zpl_xattr_readdir(dxip, xf); + iput(dxip); + + return (error); +} + +static ssize_t +zpl_xattr_list_sa(xattr_filldir_t *xf) +{ + znode_t *zp = ITOZ(xf->dentry->d_inode); + nvpair_t *nvp = NULL; + int error = 0; + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + + while ((nvp = nvlist_next_nvpair(zp->z_xattr_cached, nvp)) != NULL) { + ASSERT3U(nvpair_type(nvp), ==, DATA_TYPE_BYTE_ARRAY); + + error = zpl_xattr_filldir(xf, nvpair_name(nvp), + strlen(nvpair_name(nvp))); + if (error) + return (error); + } + + return (0); +} + +ssize_t +zpl_xattr_list(struct dentry *dentry, char *buffer, size_t buffer_size) +{ + znode_t *zp = ITOZ(dentry->d_inode); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + xattr_filldir_t xf = { buffer_size, 0, buffer, dentry }; + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int error = 0; + + crhold(cr); + cookie = spl_fstrans_mark(); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); + rw_enter(&zp->z_xattr_lock, RW_READER); + + if (zfsvfs->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_list_sa(&xf); + if (error) + goto out; + } + + error = zpl_xattr_list_dir(&xf, cr); + if (error) + goto out; + + error = xf.offset; +out: + + rw_exit(&zp->z_xattr_lock); + ZPL_EXIT(zfsvfs); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (error); +} + +static int +zpl_xattr_get_dir(struct inode *ip, const char *name, void *value, + size_t size, cred_t *cr) +{ + struct inode *xip = NULL; + znode_t *dxzp = NULL; + znode_t *xzp = NULL; + loff_t pos = 0; + int error; + + /* Lookup the xattr directory */ + error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, LOOKUP_XATTR, + cr, NULL, NULL); + if (error) + goto out; + + /* Lookup a specific xattr name in the directory */ + error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); + if (error) + goto out; + + xip = ZTOI(xzp); + if (!size) { + error = i_size_read(xip); + goto out; + } + + if (size < i_size_read(xip)) { + error = -ERANGE; + goto out; + } + + error = zpl_read_common(xip, value, size, &pos, UIO_SYSSPACE, 0, cr); +out: + if (xzp) + zrele(xzp); + + if (dxzp) + zrele(dxzp); + + 
return (error); +} + +static int +zpl_xattr_get_sa(struct inode *ip, const char *name, void *value, size_t size) +{ + znode_t *zp = ITOZ(ip); + uchar_t *nv_value; + uint_t nv_size; + int error = 0; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + error = -nvlist_lookup_byte_array(zp->z_xattr_cached, name, + &nv_value, &nv_size); + if (error) + return (error); + + if (size == 0 || value == NULL) + return (nv_size); + + if (size < nv_size) + return (-ERANGE); + + memcpy(value, nv_value, nv_size); + + return (nv_size); +} + +static int +__zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size, + cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + if (zfsvfs->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_get_sa(ip, name, value, size); + if (error != -ENOENT) + goto out; + } + + error = zpl_xattr_get_dir(ip, name, value, size, cr); +out: + if (error == -ENOENT) + error = -ENODATA; + + return (error); +} + +#define XATTR_NOENT 0x0 +#define XATTR_IN_SA 0x1 +#define XATTR_IN_DIR 0x2 +/* check where the xattr resides */ +static int +__zpl_xattr_where(struct inode *ip, const char *name, int *where, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int error; + + ASSERT(where); + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + + *where = XATTR_NOENT; + if (zfsvfs->z_use_sa && zp->z_is_sa) { + error = zpl_xattr_get_sa(ip, name, NULL, 0); + if (error >= 0) + *where |= XATTR_IN_SA; + else if (error != -ENOENT) + return (error); + } + + error = zpl_xattr_get_dir(ip, name, NULL, 0, cr); + if (error >= 0) + *where |= XATTR_IN_DIR; + else if (error != -ENOENT) + return (error); + + if (*where == (XATTR_IN_SA|XATTR_IN_DIR)) + cmn_err(CE_WARN, "ZFS: inode %p has xattr \"%s\"" + " in both SA and dir", ip, name); + if (*where == XATTR_NOENT) + error = -ENODATA; + else + error = 0; + return (error); +} + +static int +zpl_xattr_get(struct inode *ip, const char *name, void *value, size_t size) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); + rw_enter(&zp->z_xattr_lock, RW_READER); + error = __zpl_xattr_get(ip, name, value, size, cr); + rw_exit(&zp->z_xattr_lock); + ZPL_EXIT(zfsvfs); + spl_fstrans_unmark(cookie); + crfree(cr); + + return (error); +} + +static int +zpl_xattr_set_dir(struct inode *ip, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + znode_t *dxzp = NULL; + znode_t *xzp = NULL; + vattr_t *vap = NULL; + ssize_t wrote; + int lookup_flags, error; + const int xattr_mode = S_IFREG | 0644; + loff_t pos = 0; + + /* + * Lookup the xattr directory. When we're adding an entry pass + * CREATE_XATTR_DIR to ensure the xattr directory is created. + * When removing an entry this flag is not passed to avoid + * unnecessarily creating a new xattr directory. 
+ */ + lookup_flags = LOOKUP_XATTR; + if (value != NULL) + lookup_flags |= CREATE_XATTR_DIR; + + error = -zfs_lookup(ITOZ(ip), NULL, &dxzp, lookup_flags, + cr, NULL, NULL); + if (error) + goto out; + + /* Lookup a specific xattr name in the directory */ + error = -zfs_lookup(dxzp, (char *)name, &xzp, 0, cr, NULL, NULL); + if (error && (error != -ENOENT)) + goto out; + + error = 0; + + /* Remove the named xattr when value is set to NULL. */ + if (value == NULL) { + if (xzp) + error = -zfs_remove(dxzp, (char *)name, cr, 0); + + goto out; + } + + /* Lookup failed, create a new xattr. */ + if (xzp == NULL) { + vap = kmem_zalloc(sizeof (vattr_t), KM_SLEEP); + vap->va_mode = xattr_mode; + vap->va_mask = ATTR_MODE; + vap->va_uid = crgetfsuid(cr); + vap->va_gid = crgetfsgid(cr); + + error = -zfs_create(dxzp, (char *)name, vap, 0, 0644, &xzp, + cr, 0, NULL); + if (error) + goto out; + } + + ASSERT(xzp != NULL); + + error = -zfs_freesp(xzp, 0, 0, xattr_mode, TRUE); + if (error) + goto out; + + wrote = zpl_write_common(ZTOI(xzp), value, size, &pos, + UIO_SYSSPACE, 0, cr); + if (wrote < 0) + error = wrote; + +out: + + if (error == 0) { + ip->i_ctime = current_time(ip); + zfs_mark_inode_dirty(ip); + } + + if (vap) + kmem_free(vap, sizeof (vattr_t)); + + if (xzp) + zrele(xzp); + + if (dxzp) + zrele(dxzp); + + if (error == -ENOENT) + error = -ENODATA; + + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_xattr_set_sa(struct inode *ip, const char *name, const void *value, + size_t size, int flags, cred_t *cr) +{ + znode_t *zp = ITOZ(ip); + nvlist_t *nvl; + size_t sa_size; + int error = 0; + + mutex_enter(&zp->z_lock); + if (zp->z_xattr_cached == NULL) + error = -zfs_sa_get_xattr(zp); + mutex_exit(&zp->z_lock); + + if (error) + return (error); + + ASSERT(zp->z_xattr_cached); + nvl = zp->z_xattr_cached; + + if (value == NULL) { + error = -nvlist_remove(nvl, name, DATA_TYPE_BYTE_ARRAY); + if (error == -ENOENT) + error = zpl_xattr_set_dir(ip, name, NULL, 0, flags, cr); + } else { + /* Limited to 32k to keep nvpair memory allocations small */ + if (size > DXATTR_MAX_ENTRY_SIZE) + return (-EFBIG); + + /* Prevent the DXATTR SA from consuming the entire SA region */ + error = -nvlist_size(nvl, &sa_size, NV_ENCODE_XDR); + if (error) + return (error); + + if (sa_size > DXATTR_MAX_SA_SIZE) + return (-EFBIG); + + error = -nvlist_add_byte_array(nvl, name, + (uchar_t *)value, size); + } + + /* + * Update the SA for additions, modifications, and removals. On + * error drop the inconsistent cached version of the nvlist; it + * will be reconstructed from the ARC when next accessed. + */ + if (error == 0) + error = -zfs_sa_set_xattr(zp); + + if (error) { + nvlist_free(nvl); + zp->z_xattr_cached = NULL; + } + + ASSERT3S(error, <=, 0); + + return (error); +} + +static int +zpl_xattr_set(struct inode *ip, const char *name, const void *value, + size_t size, int flags) +{ + znode_t *zp = ITOZ(ip); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + cred_t *cr = CRED(); + fstrans_cookie_t cookie; + int where; + int error; + + crhold(cr); + cookie = spl_fstrans_mark(); + ZPL_ENTER(zfsvfs); + ZPL_VERIFY_ZP(zp); + rw_enter(&ITOZ(ip)->z_xattr_lock, RW_WRITER); + + /* + * Before setting the xattr check to see if it already exists. + * This is done to ensure the following optional flags are honored. + * + * XATTR_CREATE: fail if xattr already exists + * XATTR_REPLACE: fail if xattr does not exist + * + * We also want to know if it resides in sa or dir, so we can make + * sure we don't end up with duplicates in both places.
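Seen from userspace these checks are exactly the documented setxattr(2) flag semantics. A hypothetical demonstration (the path and attribute name are invented; the file is assumed to exist, with no user.demo attribute, on an xattr-enabled dataset):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/xattr.h>

/*
 * Observable behavior of the XATTR_CREATE/XATTR_REPLACE checks:
 * XATTR_REPLACE fails with ENODATA while the attribute is absent,
 * and XATTR_CREATE fails with EEXIST once it exists.
 */
int
main(void)
{
	const char *f = "/tmp/xattr-demo";	/* hypothetical test file */

	if (setxattr(f, "user.demo", "v1", 2, XATTR_REPLACE) < 0)
		printf("replace before create: %s\n", strerror(errno));

	if (setxattr(f, "user.demo", "v1", 2, XATTR_CREATE) == 0)
		printf("created user.demo\n");

	if (setxattr(f, "user.demo", "v2", 2, XATTR_CREATE) < 0)
		printf("create after create: %s\n", strerror(errno));

	return (0);
}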
+ */ + error = __zpl_xattr_where(ip, name, &where, cr); + if (error < 0) { + if (error != -ENODATA) + goto out; + if (flags & XATTR_REPLACE) + goto out; + + /* The xattr to be removed already doesn't exist */ + error = 0; + if (value == NULL) + goto out; + } else { + error = -EEXIST; + if (flags & XATTR_CREATE) + goto out; + } + + /* Preferentially store the xattr as a SA for better performance */ + if (zfsvfs->z_use_sa && zp->z_is_sa && + (zfsvfs->z_xattr_sa || (value == NULL && where & XATTR_IN_SA))) { + error = zpl_xattr_set_sa(ip, name, value, size, flags, cr); + if (error == 0) { + /* + * Successfully put into SA, we need to clear the one + * in dir. + */ + if (where & XATTR_IN_DIR) + zpl_xattr_set_dir(ip, name, NULL, 0, 0, cr); + goto out; + } + } + + error = zpl_xattr_set_dir(ip, name, value, size, flags, cr); + /* + * Successfully put into dir, we need to clear the one in SA. + */ + if (error == 0 && (where & XATTR_IN_SA)) + zpl_xattr_set_sa(ip, name, NULL, 0, 0, cr); +out: + rw_exit(&ITOZ(ip)->z_xattr_lock); + ZPL_EXIT(zfsvfs); + spl_fstrans_unmark(cookie); + crfree(cr); + ASSERT3S(error, <=, 0); + + return (error); +} + +/* + * Extended user attributes + * + * "Extended user attributes may be assigned to files and directories for + * storing arbitrary additional information such as the mime type, + * character set or encoding of a file. The access permissions for user + * attributes are defined by the file permission bits: read permission + * is required to retrieve the attribute value, and writer permission is + * required to change it. + * + * The file permission bits of regular files and directories are + * interpreted differently from the file permission bits of special + * files and symbolic links. For regular files and directories the file + * permission bits define access to the file's contents, while for + * device special files they define access to the device described by + * the special file. The file permissions of symbolic links are not + * used in access checks. These differences would allow users to + * consume filesystem resources in a way not controllable by disk quotas + * for group or world writable special files and directories. + * + * For this reason, extended user attributes are allowed only for + * regular files and directories, and access to extended user attributes + * is restricted to the owner and to users with appropriate capabilities + * for directories with the sticky bit set (see the chmod(1) manual page + * for an explanation of the sticky bit)." - xattr(7) + * + * ZFS allows extended user attributes to be disabled administratively + * by setting the 'xattr=off' property on the dataset. 
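Because the user-namespace handlers below return -EOPNOTSUPP when the ZSB_XATTR flag is clear, the property is directly visible to applications. A hypothetical probe (note that EOPNOTSUPP can also simply mean the underlying filesystem has no xattr support at all):

#include <stdio.h>
#include <string.h>
#include <errno.h>
#include <sys/xattr.h>

/* Probe whether user xattrs are enabled on the given path. */
int
main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : ".";

	if (setxattr(path, "user.probe", "1", 1, 0) == 0) {
		printf("user xattrs enabled\n");
		(void) removexattr(path, "user.probe");
	} else if (errno == EOPNOTSUPP) {
		printf("user xattrs disabled (e.g. xattr=off)\n");
	} else {
		printf("setxattr: %s\n", strerror(errno));
	}

	return (0);
}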
+ */ +static int +__zpl_xattr_user_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return (ITOZSB(ip)->z_flags & ZSB_XATTR); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_user_list); + +static int +__zpl_xattr_user_get(struct inode *ip, const char *name, + void *value, size_t size) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) + return (-EOPNOTSUPP); + + xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); + error = zpl_xattr_get(ip, xattr_name, value, size); + kmem_strfree(xattr_name); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_user_get); + +static int +__zpl_xattr_user_set(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + if (!(ITOZSB(ip)->z_flags & ZSB_XATTR)) + return (-EOPNOTSUPP); + + xattr_name = kmem_asprintf("%s%s", XATTR_USER_PREFIX, name); + error = zpl_xattr_set(ip, xattr_name, value, size, flags); + kmem_strfree(xattr_name); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_user_set); + +xattr_handler_t zpl_xattr_user_handler = +{ + .prefix = XATTR_USER_PREFIX, + .list = zpl_xattr_user_list, + .get = zpl_xattr_user_get, + .set = zpl_xattr_user_set, +}; + +/* + * Trusted extended attributes + * + * "Trusted extended attributes are visible and accessible only to + * processes that have the CAP_SYS_ADMIN capability. Attributes in this + * class are used to implement mechanisms in user space (i.e., outside + * the kernel) which keep information in extended attributes to which + * ordinary processes should not have access." 
- xattr(7) + */ +static int +__zpl_xattr_trusted_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return (capable(CAP_SYS_ADMIN)); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_trusted_list); + +static int +__zpl_xattr_trusted_get(struct inode *ip, const char *name, + void *value, size_t size) +{ + char *xattr_name; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return (-EACCES); + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); + error = zpl_xattr_get(ip, xattr_name, value, size); + kmem_strfree(xattr_name); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_trusted_get); + +static int +__zpl_xattr_trusted_set(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + char *xattr_name; + int error; + + if (!capable(CAP_SYS_ADMIN)) + return (-EACCES); + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_TRUSTED_PREFIX, name); + error = zpl_xattr_set(ip, xattr_name, value, size, flags); + kmem_strfree(xattr_name); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_trusted_set); + +xattr_handler_t zpl_xattr_trusted_handler = +{ + .prefix = XATTR_TRUSTED_PREFIX, + .list = zpl_xattr_trusted_list, + .get = zpl_xattr_trusted_get, + .set = zpl_xattr_trusted_set, +}; + +/* + * Extended security attributes + * + * "The security attribute namespace is used by kernel security modules, + * such as Security Enhanced Linux, and also to implement file + * capabilities (see capabilities(7)). Read and write access + * permissions to security attributes depend on the policy implemented + * for each security attribute by the security module. When no security + * module is loaded, all processes have read access to extended security + * attributes, and write access is limited to processes that have the + * CAP_SYS_ADMIN capability." 
- xattr(7) + */ +static int +__zpl_xattr_security_list(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + return (1); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_security_list); + +static int +__zpl_xattr_security_get(struct inode *ip, const char *name, + void *value, size_t size) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); + error = zpl_xattr_get(ip, xattr_name, value, size); + kmem_strfree(xattr_name); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_security_get); + +static int +__zpl_xattr_security_set(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + char *xattr_name; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") == 0) + return (-EINVAL); +#endif + xattr_name = kmem_asprintf("%s%s", XATTR_SECURITY_PREFIX, name); + error = zpl_xattr_set(ip, xattr_name, value, size, flags); + kmem_strfree(xattr_name); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_security_set); + +static int +zpl_xattr_security_init_impl(struct inode *ip, const struct xattr *xattrs, + void *fs_info) +{ + const struct xattr *xattr; + int error = 0; + + for (xattr = xattrs; xattr->name != NULL; xattr++) { + error = __zpl_xattr_security_set(ip, + xattr->name, xattr->value, xattr->value_len, 0); + + if (error < 0) + break; + } + + return (error); +} + +int +zpl_xattr_security_init(struct inode *ip, struct inode *dip, + const struct qstr *qstr) +{ + return security_inode_init_security(ip, dip, qstr, + &zpl_xattr_security_init_impl, NULL); +} + +/* + * Security xattr namespace handlers. + */ +xattr_handler_t zpl_xattr_security_handler = { + .prefix = XATTR_SECURITY_PREFIX, + .list = zpl_xattr_security_list, + .get = zpl_xattr_security_get, + .set = zpl_xattr_security_set, +}; + +/* + * Extended system attributes + * + * "Extended system attributes are used by the kernel to store system + * objects such as Access Control Lists. Read and write access permissions + * to system attributes depend on the policy implemented for each system + * attribute implemented by filesystems in the kernel." - xattr(7) + */ +#ifdef CONFIG_FS_POSIX_ACL +#ifndef HAVE_SET_ACL +static +#endif +int +zpl_set_acl(struct inode *ip, struct posix_acl *acl, int type) +{ + char *name, *value = NULL; + int error = 0; + size_t size = 0; + + if (S_ISLNK(ip->i_mode)) + return (-EOPNOTSUPP); + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + if (acl) { + umode_t mode = ip->i_mode; + error = posix_acl_equiv_mode(acl, &mode); + if (error < 0) { + return (error); + } else { + /* + * The mode bits will have been set by + * ->zfs_setattr()->zfs_acl_chmod_setattr() + * using the ZFS ACL conversion. If they + * differ from the Posix ACL conversion dirty + * the inode to write the Posix mode bits. + */ + if (ip->i_mode != mode) { + ip->i_mode = mode; + ip->i_ctime = current_time(ip); + zfs_mark_inode_dirty(ip); + } + + if (error == 0) + acl = NULL; + } + } + break; + + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + if (!S_ISDIR(ip->i_mode)) + return (acl ? 
-EACCES : 0); + break; + + default: + return (-EINVAL); + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmem_alloc(size, KM_SLEEP); + + error = zpl_acl_to_xattr(acl, value, size); + if (error < 0) { + kmem_free(value, size); + return (error); + } + } + + error = zpl_xattr_set(ip, name, value, size, 0); + if (value) + kmem_free(value, size); + + if (!error) { + if (acl) + zpl_set_cached_acl(ip, type, acl); + else + zpl_forget_cached_acl(ip, type); + } + + return (error); +} + +struct posix_acl * +zpl_get_acl(struct inode *ip, int type) +{ + struct posix_acl *acl; + void *value = NULL; + char *name; + int size; + + /* + * As of Linux 3.14, the kernel get_acl will check this for us. + * Also as of Linux 4.7, comparing against ACL_NOT_CACHED is wrong + * as the kernel get_acl will set it to temporary sentinel value. + */ +#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE + acl = get_cached_acl(ip, type); + if (acl != ACL_NOT_CACHED) + return (acl); +#endif + + switch (type) { + case ACL_TYPE_ACCESS: + name = XATTR_NAME_POSIX_ACL_ACCESS; + break; + case ACL_TYPE_DEFAULT: + name = XATTR_NAME_POSIX_ACL_DEFAULT; + break; + default: + return (ERR_PTR(-EINVAL)); + } + + size = zpl_xattr_get(ip, name, NULL, 0); + if (size > 0) { + value = kmem_alloc(size, KM_SLEEP); + size = zpl_xattr_get(ip, name, value, size); + } + + if (size > 0) { + acl = zpl_acl_from_xattr(value, size); + } else if (size == -ENODATA || size == -ENOSYS) { + acl = NULL; + } else { + acl = ERR_PTR(-EIO); + } + + if (size > 0) + kmem_free(value, size); + + /* As of Linux 4.7, the kernel get_acl will set this for us */ +#ifndef HAVE_KERNEL_GET_ACL_HANDLE_CACHE + if (!IS_ERR(acl)) + zpl_set_cached_acl(ip, type, acl); +#endif + + return (acl); +} + +int +zpl_init_acl(struct inode *ip, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int error = 0; + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (!S_ISLNK(ip->i_mode)) { + acl = zpl_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + if (!acl) { + ip->i_mode &= ~current_umask(); + ip->i_ctime = current_time(ip); + zfs_mark_inode_dirty(ip); + return (0); + } + } + + if (acl) { + umode_t mode; + + if (S_ISDIR(ip->i_mode)) { + error = zpl_set_acl(ip, acl, ACL_TYPE_DEFAULT); + if (error) + goto out; + } + + mode = ip->i_mode; + error = __posix_acl_create(&acl, GFP_KERNEL, &mode); + if (error >= 0) { + ip->i_mode = mode; + zfs_mark_inode_dirty(ip); + if (error > 0) + error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); + } + } +out: + zpl_posix_acl_release(acl); + + return (error); +} + +int +zpl_chmod_acl(struct inode *ip) +{ + struct posix_acl *acl; + int error; + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (S_ISLNK(ip->i_mode)) + return (-EOPNOTSUPP); + + acl = zpl_get_acl(ip, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return (PTR_ERR(acl)); + + error = __posix_acl_chmod(&acl, GFP_KERNEL, ip->i_mode); + if (!error) + error = zpl_set_acl(ip, acl, ACL_TYPE_ACCESS); + + zpl_posix_acl_release(acl); + + return (error); +} + +static int +__zpl_xattr_acl_list_access(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + char *xattr_name = XATTR_NAME_POSIX_ACL_ACCESS; + size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_ACCESS); + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (list && xattr_size <= list_size) + memcpy(list, xattr_name, xattr_size); + + return (xattr_size); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_access); + 
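Userspace can test for an ACL through the same xattr interface using the fixed name advertised above, again with the size-only convention. A sketch (per the get handlers in this file, ENODATA means no ACL is set and EOPNOTSUPP means the dataset lacks acltype=posixacl):

#include <stdio.h>
#include <errno.h>
#include <sys/xattr.h>

/* Probe for an access ACL on the given path. */
int
main(int argc, char **argv)
{
	const char *path = (argc > 1) ? argv[1] : ".";
	ssize_t size;

	size = getxattr(path, "system.posix_acl_access", NULL, 0);
	if (size >= 0)
		printf("access ACL present (%zd bytes)\n", size);
	else if (errno == ENODATA)
		printf("no access ACL beyond the mode bits\n");
	else if (errno == EOPNOTSUPP)
		printf("POSIX ACLs not enabled on this dataset\n");
	else
		perror("getxattr");

	return (0);
}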
+static int +__zpl_xattr_acl_list_default(struct inode *ip, char *list, size_t list_size, + const char *name, size_t name_len) +{ + char *xattr_name = XATTR_NAME_POSIX_ACL_DEFAULT; + size_t xattr_size = sizeof (XATTR_NAME_POSIX_ACL_DEFAULT); + + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (0); + + if (list && xattr_size <= list_size) + memcpy(list, xattr_name, xattr_size); + + return (xattr_size); +} +ZPL_XATTR_LIST_WRAPPER(zpl_xattr_acl_list_default); + +static int +__zpl_xattr_acl_get_access(struct inode *ip, const char *name, + void *buffer, size_t size) +{ + struct posix_acl *acl; + int type = ACL_TYPE_ACCESS; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + acl = zpl_get_acl(ip, type); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + if (acl == NULL) + return (-ENODATA); + + error = zpl_acl_to_xattr(acl, buffer, size); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_access); + +static int +__zpl_xattr_acl_get_default(struct inode *ip, const char *name, + void *buffer, size_t size) +{ + struct posix_acl *acl; + int type = ACL_TYPE_DEFAULT; + int error; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + acl = zpl_get_acl(ip, type); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + if (acl == NULL) + return (-ENODATA); + + error = zpl_acl_to_xattr(acl, buffer, size); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_GET_WRAPPER(zpl_xattr_acl_get_default); + +static int +__zpl_xattr_acl_set_access(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + struct posix_acl *acl; + int type = ACL_TYPE_ACCESS; + int error = 0; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + if (!inode_owner_or_capable(ip)) + return (-EPERM); + + if (value) { + acl = zpl_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + else if (acl) { + error = zpl_posix_acl_valid(ip, acl); + if (error) { + zpl_posix_acl_release(acl); + return (error); + } + } + } else { + acl = NULL; + } + + error = zpl_set_acl(ip, acl, type); + zpl_posix_acl_release(acl); + + return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_access); + +static int +__zpl_xattr_acl_set_default(struct inode *ip, const char *name, + const void *value, size_t size, int flags) +{ + struct posix_acl *acl; + int type = ACL_TYPE_DEFAULT; + int error = 0; + /* xattr_resolve_name will do this for us if this is defined */ +#ifndef HAVE_XATTR_HANDLER_NAME + if (strcmp(name, "") != 0) + return (-EINVAL); +#endif + if (ITOZSB(ip)->z_acl_type != ZFS_ACLTYPE_POSIXACL) + return (-EOPNOTSUPP); + + if (!inode_owner_or_capable(ip)) + return (-EPERM); + + if (value) { + acl = zpl_acl_from_xattr(value, size); + if (IS_ERR(acl)) + return (PTR_ERR(acl)); + else if (acl) { + error = zpl_posix_acl_valid(ip, acl); + if (error) { + zpl_posix_acl_release(acl); + return (error); + } + } + } else { + acl = NULL; + } + + error = zpl_set_acl(ip, acl, type); + zpl_posix_acl_release(acl); + + 
return (error); +} +ZPL_XATTR_SET_WRAPPER(zpl_xattr_acl_set_default); + +/* + * ACL access xattr namespace handlers. + * + * Use .name instead of .prefix when available. xattr_resolve_name will match + * whole name and reject anything that has .name only as prefix. + */ +xattr_handler_t zpl_xattr_acl_access_handler = +{ +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_ACCESS, +#else + .prefix = XATTR_NAME_POSIX_ACL_ACCESS, +#endif + .list = zpl_xattr_acl_list_access, + .get = zpl_xattr_acl_get_access, + .set = zpl_xattr_acl_set_access, +#if defined(HAVE_XATTR_LIST_SIMPLE) || \ + defined(HAVE_XATTR_LIST_DENTRY) || \ + defined(HAVE_XATTR_LIST_HANDLER) + .flags = ACL_TYPE_ACCESS, +#endif +}; + +/* + * ACL default xattr namespace handlers. + * + * Use .name instead of .prefix when available. xattr_resolve_name will match + * whole name and reject anything that has .name only as prefix. + */ +xattr_handler_t zpl_xattr_acl_default_handler = +{ +#ifdef HAVE_XATTR_HANDLER_NAME + .name = XATTR_NAME_POSIX_ACL_DEFAULT, +#else + .prefix = XATTR_NAME_POSIX_ACL_DEFAULT, +#endif + .list = zpl_xattr_acl_list_default, + .get = zpl_xattr_acl_get_default, + .set = zpl_xattr_acl_set_default, +#if defined(HAVE_XATTR_LIST_SIMPLE) || \ + defined(HAVE_XATTR_LIST_DENTRY) || \ + defined(HAVE_XATTR_LIST_HANDLER) + .flags = ACL_TYPE_DEFAULT, +#endif +}; + +#endif /* CONFIG_FS_POSIX_ACL */ + +xattr_handler_t *zpl_xattr_handlers[] = { + &zpl_xattr_security_handler, + &zpl_xattr_trusted_handler, + &zpl_xattr_user_handler, +#ifdef CONFIG_FS_POSIX_ACL + &zpl_xattr_acl_access_handler, + &zpl_xattr_acl_default_handler, +#endif /* CONFIG_FS_POSIX_ACL */ + NULL +}; + +static const struct xattr_handler * +zpl_xattr_handler(const char *name) +{ + if (strncmp(name, XATTR_USER_PREFIX, + XATTR_USER_PREFIX_LEN) == 0) + return (&zpl_xattr_user_handler); + + if (strncmp(name, XATTR_TRUSTED_PREFIX, + XATTR_TRUSTED_PREFIX_LEN) == 0) + return (&zpl_xattr_trusted_handler); + + if (strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) == 0) + return (&zpl_xattr_security_handler); + +#ifdef CONFIG_FS_POSIX_ACL + if (strncmp(name, XATTR_NAME_POSIX_ACL_ACCESS, + sizeof (XATTR_NAME_POSIX_ACL_ACCESS)) == 0) + return (&zpl_xattr_acl_access_handler); + + if (strncmp(name, XATTR_NAME_POSIX_ACL_DEFAULT, + sizeof (XATTR_NAME_POSIX_ACL_DEFAULT)) == 0) + return (&zpl_xattr_acl_default_handler); +#endif /* CONFIG_FS_POSIX_ACL */ + + return (NULL); +} + +#if !defined(HAVE_POSIX_ACL_RELEASE) || defined(HAVE_POSIX_ACL_RELEASE_GPL_ONLY) +struct acl_rel_struct { + struct acl_rel_struct *next; + struct posix_acl *acl; + clock_t time; +}; + +#define ACL_REL_GRACE (60*HZ) +#define ACL_REL_WINDOW (1*HZ) +#define ACL_REL_SCHED (ACL_REL_GRACE+ACL_REL_WINDOW) + +/* + * Lockless multi-producer single-consumer fifo list. + * Nodes are added to tail and removed from head. Tail pointer is our + * synchronization point. It always points to the next pointer of the last + * node, or head if list is empty. 
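The producer side described here can be modeled in plain C with C11 atomics standing in for the kernel xchg(); a simplified sketch (no grace period, consumer, or memory-ordering tuning; the real code below uses acl_rel_head/acl_rel_tail and taskq_dispatch_delay()):

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *next;
};

static struct node *head = NULL;
/*
 * The tail always holds the address of the last node's next pointer,
 * or &head when the list is empty, exactly as described above.
 */
static _Atomic(struct node **) tail = &head;

/*
 * Add a node; returns nonzero when the list was previously empty,
 * in which case the caller must schedule the consumer (the role
 * taskq_dispatch_delay() plays in the real code).
 */
static int
mpsc_add(struct node *n)
{
	struct node **prev;

	n->next = NULL;
	/* Atomically claim the tail slot, then publish the node. */
	prev = atomic_exchange(&tail, &n->next);
	*prev = n;

	return (prev == &head);
}

int
main(void)
{
	static struct node a, b;

	if (mpsc_add(&a))	/* first add: list was empty */
		;		/* a consumer would be scheduled here */
	(void) mpsc_add(&b);	/* second add: no scheduling needed */

	return (0);
}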
+ */ +static struct acl_rel_struct *acl_rel_head = NULL; +static struct acl_rel_struct **acl_rel_tail = &acl_rel_head; + +static void +zpl_posix_acl_free(void *arg) +{ + struct acl_rel_struct *freelist = NULL; + struct acl_rel_struct *a; + clock_t new_time; + boolean_t refire = B_FALSE; + + ASSERT3P(acl_rel_head, !=, NULL); + while (acl_rel_head) { + a = acl_rel_head; + if (ddi_get_lbolt() - a->time >= ACL_REL_GRACE) { + /* + * If a is the last node we need to reset tail, but we + * need to use cmpxchg to make sure it is still the + * last node. + */ + if (acl_rel_tail == &a->next) { + acl_rel_head = NULL; + if (cmpxchg(&acl_rel_tail, &a->next, + &acl_rel_head) == &a->next) { + ASSERT3P(a->next, ==, NULL); + a->next = freelist; + freelist = a; + break; + } + } + /* + * a is not last node, make sure next pointer is set + * by the adder and advance the head. + */ + while (READ_ONCE(a->next) == NULL) + cpu_relax(); + acl_rel_head = a->next; + a->next = freelist; + freelist = a; + } else { + /* + * a is still in grace period. We are responsible to + * reschedule the free task, since adder will only do + * so if list is empty. + */ + new_time = a->time + ACL_REL_SCHED; + refire = B_TRUE; + break; + } + } + + if (refire) + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, new_time); + + while (freelist) { + a = freelist; + freelist = a->next; + kfree(a->acl); + kmem_free(a, sizeof (struct acl_rel_struct)); + } +} + +void +zpl_posix_acl_release_impl(struct posix_acl *acl) +{ + struct acl_rel_struct *a, **prev; + + a = kmem_alloc(sizeof (struct acl_rel_struct), KM_SLEEP); + a->next = NULL; + a->acl = acl; + a->time = ddi_get_lbolt(); + /* atomically points tail to us and get the previous tail */ + prev = xchg(&acl_rel_tail, &a->next); + ASSERT3P(*prev, ==, NULL); + *prev = a; + /* if it was empty before, schedule the free task */ + if (prev == &acl_rel_head) + taskq_dispatch_delay(system_delay_taskq, zpl_posix_acl_free, + NULL, TQ_SLEEP, ddi_get_lbolt() + ACL_REL_SCHED); +} +#endif diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c new file mode 100644 index 000000000000..218e1101edf8 --- /dev/null +++ b/module/os/linux/zfs/zvol_os.c @@ -0,0 +1,1125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +unsigned int zvol_major = ZVOL_MAJOR; +unsigned int zvol_request_sync = 0; +unsigned int zvol_prefetch_bytes = (128 * 1024); +unsigned long zvol_max_discard_blocks = 16384; +unsigned int zvol_threads = 32; + +struct zvol_state_os { + struct gendisk *zvo_disk; /* generic disk */ + struct request_queue *zvo_queue; /* request queue */ + dev_t zvo_dev; /* device id */ +}; + +taskq_t *zvol_taskq; +static struct ida zvol_ida; + +typedef struct zv_request { + zvol_state_t *zv; + struct bio *bio; + taskq_ent_t ent; +} zv_request_t; + +/* + * Given a path, return TRUE if path is a ZVOL. + */ +static boolean_t +zvol_is_zvol_impl(const char *device) +{ + struct block_device *bdev; + unsigned int major; + + bdev = vdev_lookup_bdev(device); + if (IS_ERR(bdev)) + return (B_FALSE); + + major = MAJOR(bdev->bd_dev); + bdput(bdev); + + if (major == zvol_major) + return (B_TRUE); + + return (B_FALSE); +} + +static void +uio_from_bio(uio_t *uio, struct bio *bio) +{ + uio->uio_bvec = &bio->bi_io_vec[BIO_BI_IDX(bio)]; + uio->uio_iovcnt = bio->bi_vcnt - BIO_BI_IDX(bio); + uio->uio_loffset = BIO_BI_SECTOR(bio) << 9; + uio->uio_segflg = UIO_BVEC; + uio->uio_limit = MAXOFFSET_T; + uio->uio_resid = BIO_BI_SIZE(bio); + uio->uio_skip = BIO_BI_SKIP(bio); +} + +static void +zvol_write(void *arg) +{ + int error = 0; + + zv_request_t *zvr = arg; + struct bio *bio = zvr->bio; + uio_t uio = { { 0 }, 0 }; + uio_from_bio(&uio, bio); + + zvol_state_t *zv = zvr->zv; + ASSERT(zv && zv->zv_open_count > 0); + ASSERT(zv->zv_zilog != NULL); + + /* bio marked as FLUSH need to flush before write */ + if (bio_is_flush(bio)) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + /* Some requests are just for flush and nothing else. 
*/ + if (uio.uio_resid == 0) { + rw_exit(&zv->zv_suspend_lock); + BIO_END_IO(bio, 0); + kmem_free(zvr, sizeof (zv_request_t)); + return; + } + + ssize_t start_resid = uio.uio_resid; + unsigned long start_jif = jiffies; + blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE, + bio_sectors(bio), &zv->zv_zso->zvo_disk->part0); + + boolean_t sync = + bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, + uio.uio_loffset, uio.uio_resid, RL_WRITER); + + uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { + uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); + uint64_t off = uio.uio_loffset; + dmu_tx_t *tx = dmu_tx_create(zv->zv_objset); + + if (bytes > volsize - off) /* don't write past the end */ + bytes = volsize - off; + + dmu_tx_hold_write_by_dnode(tx, zv->zv_dn, off, bytes); + + /* This will only fail for ENOSPC */ + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + break; + } + error = dmu_write_uio_dnode(zv->zv_dn, &uio, bytes, tx); + if (error == 0) { + zvol_log_write(zv, tx, off, bytes, sync); + } + dmu_tx_commit(tx); + + if (error) + break; + } + zfs_rangelock_exit(lr); + + int64_t nwritten = start_resid - uio.uio_resid; + dataset_kstats_update_write_kstats(&zv->zv_kstat, nwritten); + task_io_account_write(nwritten); + + if (sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + + rw_exit(&zv->zv_suspend_lock); + blk_generic_end_io_acct(zv->zv_zso->zvo_queue, + WRITE, &zv->zv_zso->zvo_disk->part0, start_jif); + BIO_END_IO(bio, -error); + kmem_free(zvr, sizeof (zv_request_t)); +} + +static void +zvol_discard(void *arg) +{ + zv_request_t *zvr = arg; + struct bio *bio = zvr->bio; + zvol_state_t *zv = zvr->zv; + uint64_t start = BIO_BI_SECTOR(bio) << 9; + uint64_t size = BIO_BI_SIZE(bio); + uint64_t end = start + size; + boolean_t sync; + int error = 0; + dmu_tx_t *tx; + unsigned long start_jif; + + ASSERT(zv && zv->zv_open_count > 0); + ASSERT(zv->zv_zilog != NULL); + + start_jif = jiffies; + blk_generic_start_io_acct(zv->zv_zso->zvo_queue, WRITE, + bio_sectors(bio), &zv->zv_zso->zvo_disk->part0); + + sync = bio_is_fua(bio) || zv->zv_objset->os_sync == ZFS_SYNC_ALWAYS; + + if (end > zv->zv_volsize) { + error = SET_ERROR(EIO); + goto unlock; + } + + /* + * Align the request to volume block boundaries when a secure erase is + * not required. This will prevent dnode_free_range() from zeroing out + * the unaligned parts which is slow (read-modify-write) and useless + * since we are not freeing any space by doing so. 
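Concretely, with an 8 KiB volblocksize a discard of bytes [4096, 20480) is trimmed to the aligned range [8192, 16384). A standalone check of that arithmetic, with the power-of-two helpers written out the way the illumos-style sysmacros define them:

#include <stdio.h>
#include <stdint.h>

/* Power-of-two alignment helpers (illumos-style definitions). */
#define	P2ALIGN(x, align)	((x) & -(align))
#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))

int
main(void)
{
	uint64_t volblocksize = 8192;		/* example block size */
	uint64_t start = 4096, end = 20480;	/* unaligned discard */

	start = P2ROUNDUP(start, volblocksize);	/* 4096 -> 8192 */
	end = P2ALIGN(end, volblocksize);	/* 20480 -> 16384 */

	if (start >= end)
		printf("nothing to free after alignment\n");
	else
		printf("aligned discard: [%llu, %llu)\n",
		    (unsigned long long)start, (unsigned long long)end);

	return (0);
}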
+ */ + if (!bio_is_secure_erase(bio)) { + start = P2ROUNDUP(start, zv->zv_volblocksize); + end = P2ALIGN(end, zv->zv_volblocksize); + size = end - start; + } + + if (start >= end) + goto unlock; + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, + start, size, RL_WRITER); + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) { + dmu_tx_abort(tx); + } else { + zvol_log_truncate(zv, tx, start, size, B_TRUE); + dmu_tx_commit(tx); + error = dmu_free_long_range(zv->zv_objset, + ZVOL_OBJ, start, size); + } + zfs_rangelock_exit(lr); + + if (error == 0 && sync) + zil_commit(zv->zv_zilog, ZVOL_OBJ); + +unlock: + rw_exit(&zv->zv_suspend_lock); + blk_generic_end_io_acct(zv->zv_zso->zvo_queue, WRITE, + &zv->zv_zso->zvo_disk->part0, start_jif); + BIO_END_IO(bio, -error); + kmem_free(zvr, sizeof (zv_request_t)); +} + +static void +zvol_read(void *arg) +{ + int error = 0; + + zv_request_t *zvr = arg; + struct bio *bio = zvr->bio; + uio_t uio = { { 0 }, 0 }; + uio_from_bio(&uio, bio); + + zvol_state_t *zv = zvr->zv; + ASSERT(zv && zv->zv_open_count > 0); + + ssize_t start_resid = uio.uio_resid; + unsigned long start_jif = jiffies; + blk_generic_start_io_acct(zv->zv_zso->zvo_queue, READ, bio_sectors(bio), + &zv->zv_zso->zvo_disk->part0); + + zfs_locked_range_t *lr = zfs_rangelock_enter(&zv->zv_rangelock, + uio.uio_loffset, uio.uio_resid, RL_READER); + + uint64_t volsize = zv->zv_volsize; + while (uio.uio_resid > 0 && uio.uio_loffset < volsize) { + uint64_t bytes = MIN(uio.uio_resid, DMU_MAX_ACCESS >> 1); + + /* don't read past the end */ + if (bytes > volsize - uio.uio_loffset) + bytes = volsize - uio.uio_loffset; + + error = dmu_read_uio_dnode(zv->zv_dn, &uio, bytes); + if (error) { + /* convert checksum errors into IO errors */ + if (error == ECKSUM) + error = SET_ERROR(EIO); + break; + } + } + zfs_rangelock_exit(lr); + + int64_t nread = start_resid - uio.uio_resid; + dataset_kstats_update_read_kstats(&zv->zv_kstat, nread); + task_io_account_read(nread); + + rw_exit(&zv->zv_suspend_lock); + blk_generic_end_io_acct(zv->zv_zso->zvo_queue, READ, + &zv->zv_zso->zvo_disk->part0, start_jif); + BIO_END_IO(bio, -error); + kmem_free(zvr, sizeof (zv_request_t)); +} + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS +static blk_qc_t +zvol_submit_bio(struct bio *bio) +#else +static MAKE_REQUEST_FN_RET +zvol_request(struct request_queue *q, struct bio *bio) +#endif +{ +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS + struct request_queue *q = bio->bi_disk->queue; +#endif + zvol_state_t *zv = q->queuedata; + fstrans_cookie_t cookie = spl_fstrans_mark(); + uint64_t offset = BIO_BI_SECTOR(bio) << 9; + uint64_t size = BIO_BI_SIZE(bio); + int rw = bio_data_dir(bio); + zv_request_t *zvr; + + if (bio_has_data(bio) && offset + size > zv->zv_volsize) { + printk(KERN_INFO + "%s: bad access: offset=%llu, size=%lu\n", + zv->zv_zso->zvo_disk->disk_name, + (long long unsigned)offset, + (long unsigned)size); + + BIO_END_IO(bio, -SET_ERROR(EIO)); + goto out; + } + + if (rw == WRITE) { + if (unlikely(zv->zv_flags & ZVOL_RDONLY)) { + BIO_END_IO(bio, -SET_ERROR(EROFS)); + goto out; + } + + /* + * Prevents the zvol from being suspended, or the ZIL being + * concurrently opened. Will be released after the i/o + * completes. + */ + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* + * Open a ZIL if this is the first time we have written to this + * zvol. 
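+ * (The code below follows the classic double-checked pattern: recheck
+ * zv_zilog after upgrading the lock from reader to writer, because
+ * another thread may have opened the ZIL while the lock was dropped;
+ * sketched here with generic placeholders:
+ *
+ *	rw_exit(&lock); rw_enter(&lock, RW_WRITER);
+ *	if (ptr == NULL)	// recheck under the writer lock
+ *		ptr = open_it();
+ *	rw_downgrade(&lock);
+ *
+ * A single unchecked open after re-acquiring would race.)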
We protect zv->zv_zilog with zv_suspend_lock rather + * than zv_state_lock so that we don't need to acquire an + * additional lock in this path. + */ + if (zv->zv_zilog == NULL) { + rw_exit(&zv->zv_suspend_lock); + rw_enter(&zv->zv_suspend_lock, RW_WRITER); + if (zv->zv_zilog == NULL) { + zv->zv_zilog = zil_open(zv->zv_objset, + zvol_get_data); + zv->zv_flags |= ZVOL_WRITTEN_TO; + } + rw_downgrade(&zv->zv_suspend_lock); + } + + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv = zv; + zvr->bio = bio; + taskq_init_ent(&zvr->ent); + + /* + * We don't want this thread to be blocked waiting for i/o to + * complete, so we instead wait from a taskq callback. The + * i/o may be a ZIL write (via zil_commit()), or a read of an + * indirect block, or a read of a data block (if this is a + * partial-block write). We will indicate that the i/o is + * complete by calling BIO_END_IO() from the taskq callback. + * + * This design allows the calling thread to continue and + * initiate more concurrent operations by calling + * zvol_request() again. There are typically only a small + * number of threads available to call zvol_request() (e.g. + * one per iSCSI target), so keeping the latency of + * zvol_request() low is important for performance. + * + * The zvol_request_sync module parameter allows this + * behavior to be altered, for performance evaluation + * purposes. If the callback blocks, setting + * zvol_request_sync=1 will result in much worse performance. + * + * We can have up to zvol_threads concurrent i/o's being + * processed for all zvols on the system. This is typically + * a vast improvement over the zvol_request_sync=1 behavior + * of one i/o at a time per zvol. However, an even better + * design would be for zvol_request() to initiate the zio + * directly, and then be notified by the zio_done callback, + * which would call BIO_END_IO(). Unfortunately, the DMU/ZIL + * interfaces lack this functionality (they block waiting for + * the i/o to complete). + */ + if (bio_is_discard(bio) || bio_is_secure_erase(bio)) { + if (zvol_request_sync) { + zvol_discard(zvr); + } else { + taskq_dispatch_ent(zvol_taskq, + zvol_discard, zvr, 0, &zvr->ent); + } + } else { + if (zvol_request_sync) { + zvol_write(zvr); + } else { + taskq_dispatch_ent(zvol_taskq, + zvol_write, zvr, 0, &zvr->ent); + } + } + } else { + /* + * The SCST driver, and possibly others, may issue READ I/Os + * with a length of zero bytes. These empty I/Os contain no + * data and require no additional handling. + */ + if (size == 0) { + BIO_END_IO(bio, 0); + goto out; + } + + zvr = kmem_alloc(sizeof (zv_request_t), KM_SLEEP); + zvr->zv = zv; + zvr->bio = bio; + taskq_init_ent(&zvr->ent); + + rw_enter(&zv->zv_suspend_lock, RW_READER); + + /* See comment in WRITE case above. */ + if (zvol_request_sync) { + zvol_read(zvr); + } else { + taskq_dispatch_ent(zvol_taskq, + zvol_read, zvr, 0, &zvr->ent); + } + } + +out: + spl_fstrans_unmark(cookie); +#if defined(HAVE_MAKE_REQUEST_FN_RET_QC) || \ + defined(HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS) + return (BLK_QC_T_NONE); +#endif +} + +static int +zvol_open(struct block_device *bdev, fmode_t flag) +{ + zvol_state_t *zv; + int error = 0; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, RW_READER); + /* + * Obtain a copy of private_data under the zvol_state_lock to make + * sure that either the result of zvol free code path setting + * bdev->bd_disk->private_data to NULL is observed, or zvol_free() + * is not called on this zv because of the positive zv_open_count. 
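+ *
+ * (Aside on the zvol_request_sync tunable discussed above: it is a
+ * module parameter, so on a typical Linux install it can be toggled at
+ * runtime for performance experiments, e.g.
+ *
+ *	echo 1 > /sys/module/zfs/parameters/zvol_request_sync
+ *
+ * assuming ZFS is built as a single zfs.ko module.)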
+ */ + zv = bdev->bd_disk->private_data; + if (zv == NULL) { + rw_exit(&zvol_state_lock); + return (SET_ERROR(-ENXIO)); + } + + mutex_enter(&zv->zv_state_lock); + /* + * make sure zvol is not suspended during first open + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 0) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 0) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count != 0 || RW_READ_HELD(&zv->zv_suspend_lock)); + + if (zv->zv_open_count == 0) { + error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + if (error) + goto out_mutex; + } + + if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + error = -EROFS; + goto out_open_count; + } + + zv->zv_open_count++; + + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + + check_disk_change(bdev); + + return (0); + +out_open_count: + if (zv->zv_open_count == 0) + zvol_last_close(zv); + +out_mutex: + mutex_exit(&zv->zv_state_lock); + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); + if (error == -EINTR) { + error = -ERESTARTSYS; + schedule(); + } + return (SET_ERROR(error)); +} + +static void +zvol_release(struct gendisk *disk, fmode_t mode) +{ + zvol_state_t *zv; + boolean_t drop_suspend = B_TRUE; + + rw_enter(&zvol_state_lock, RW_READER); + zv = disk->private_data; + + mutex_enter(&zv->zv_state_lock); + ASSERT(zv->zv_open_count > 0); + /* + * make sure zvol is not suspended during last close + * (hold zv_suspend_lock) and respect proper lock acquisition + * ordering - zv_suspend_lock before zv_state_lock + */ + if (zv->zv_open_count == 1) { + if (!rw_tryenter(&zv->zv_suspend_lock, RW_READER)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, RW_READER); + mutex_enter(&zv->zv_state_lock); + /* check to see if zv_suspend_lock is needed */ + if (zv->zv_open_count != 1) { + rw_exit(&zv->zv_suspend_lock); + drop_suspend = B_FALSE; + } + } + } else { + drop_suspend = B_FALSE; + } + rw_exit(&zvol_state_lock); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count != 1 || RW_READ_HELD(&zv->zv_suspend_lock)); + + zv->zv_open_count--; + if (zv->zv_open_count == 0) + zvol_last_close(zv); + + mutex_exit(&zv->zv_state_lock); + + if (drop_suspend) + rw_exit(&zv->zv_suspend_lock); +} + +static int +zvol_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + int error = 0; + + ASSERT3U(zv->zv_open_count, >, 0); + + switch (cmd) { + case BLKFLSBUF: + fsync_bdev(bdev); + invalidate_bdev(bdev); + rw_enter(&zv->zv_suspend_lock, RW_READER); + + if (!(zv->zv_flags & ZVOL_RDONLY)) + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + + rw_exit(&zv->zv_suspend_lock); + break; + + case BLKZNAME: + mutex_enter(&zv->zv_state_lock); + error = copy_to_user((void *)arg, zv->zv_name, MAXNAMELEN); + mutex_exit(&zv->zv_state_lock); + break; + + default: + error = -ENOTTY; + break; + } + + return (SET_ERROR(error)); +} + +#ifdef CONFIG_COMPAT +static int +zvol_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned cmd, unsigned long arg) +{ + return 
(zvol_ioctl(bdev, mode, cmd, arg)); +} +#else +#define zvol_compat_ioctl NULL +#endif + +static unsigned int +zvol_check_events(struct gendisk *disk, unsigned int clearing) +{ + unsigned int mask = 0; + + rw_enter(&zvol_state_lock, RW_READER); + + zvol_state_t *zv = disk->private_data; + if (zv != NULL) { + mutex_enter(&zv->zv_state_lock); + mask = zv->zv_changed ? DISK_EVENT_MEDIA_CHANGE : 0; + zv->zv_changed = 0; + mutex_exit(&zv->zv_state_lock); + } + + rw_exit(&zvol_state_lock); + + return (mask); +} + +static int +zvol_revalidate_disk(struct gendisk *disk) +{ + rw_enter(&zvol_state_lock, RW_READER); + + zvol_state_t *zv = disk->private_data; + if (zv != NULL) { + mutex_enter(&zv->zv_state_lock); + set_capacity(zv->zv_zso->zvo_disk, + zv->zv_volsize >> SECTOR_BITS); + mutex_exit(&zv->zv_state_lock); + } + + rw_exit(&zvol_state_lock); + + return (0); +} + +static int +zvol_update_volsize(zvol_state_t *zv, uint64_t volsize) +{ + + revalidate_disk(zv->zv_zso->zvo_disk); + return (0); +} + +static void +zvol_clear_private(zvol_state_t *zv) +{ + /* + * Cleared while holding zvol_state_lock as a writer + * which will prevent zvol_open() from opening it. + */ + zv->zv_zso->zvo_disk->private_data = NULL; +} + +/* + * Provide a simple virtual geometry for legacy compatibility. For devices + * smaller than 1 MiB a small head and sector count is used to allow very + * tiny devices. For devices over 1 Mib a standard head and sector count + * is used to keep the cylinders count reasonable. + */ +static int +zvol_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + zvol_state_t *zv = bdev->bd_disk->private_data; + sector_t sectors; + + ASSERT3U(zv->zv_open_count, >, 0); + + sectors = get_capacity(zv->zv_zso->zvo_disk); + + if (sectors > 2048) { + geo->heads = 16; + geo->sectors = 63; + } else { + geo->heads = 2; + geo->sectors = 4; + } + + geo->start = 0; + geo->cylinders = sectors / (geo->heads * geo->sectors); + + return (0); +} + +/* + * Find a zvol_state_t given the full major+minor dev_t. If found, + * return with zv_state_lock taken, otherwise, return (NULL) without + * taking zv_state_lock. + */ +static zvol_state_t * +zvol_find_by_dev(dev_t dev) +{ + zvol_state_t *zv; + + rw_enter(&zvol_state_lock, RW_READER); + for (zv = list_head(&zvol_state_list); zv != NULL; + zv = list_next(&zvol_state_list, zv)) { + mutex_enter(&zv->zv_state_lock); + if (zv->zv_zso->zvo_dev == dev) { + rw_exit(&zvol_state_lock); + return (zv); + } + mutex_exit(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + + return (NULL); +} + +static struct kobject * +zvol_probe(dev_t dev, int *part, void *arg) +{ + zvol_state_t *zv; + struct kobject *kobj; + + zv = zvol_find_by_dev(dev); + kobj = zv ? get_disk_and_module(zv->zv_zso->zvo_disk) : NULL; + ASSERT(zv == NULL || MUTEX_HELD(&zv->zv_state_lock)); + if (zv) + mutex_exit(&zv->zv_state_lock); + + return (kobj); +} + +static struct block_device_operations zvol_ops = { + .open = zvol_open, + .release = zvol_release, + .ioctl = zvol_ioctl, + .compat_ioctl = zvol_compat_ioctl, + .check_events = zvol_check_events, + .revalidate_disk = zvol_revalidate_disk, + .getgeo = zvol_getgeo, + .owner = THIS_MODULE, +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS + .submit_bio = zvol_submit_bio, +#endif +}; + +/* + * Allocate memory for a new zvol_state_t and setup the required + * request queue and generic disk structures for the block device. 
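+ *
+ * (Worked example for zvol_getgeo() above: a 1 GiB zvol has 2097152
+ * 512-byte sectors, which is > 2048, so it reports heads = 16,
+ * sectors = 63, and cylinders = 2097152 / (16 * 63) = 2080.)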
+ */ +static zvol_state_t * +zvol_alloc(dev_t dev, const char *name) +{ + zvol_state_t *zv; + struct zvol_state_os *zso; + uint64_t volmode; + + if (dsl_prop_get_integer(name, "volmode", &volmode, NULL) != 0) + return (NULL); + + if (volmode == ZFS_VOLMODE_DEFAULT) + volmode = zvol_volmode; + + if (volmode == ZFS_VOLMODE_NONE) + return (NULL); + + zv = kmem_zalloc(sizeof (zvol_state_t), KM_SLEEP); + zso = kmem_zalloc(sizeof (struct zvol_state_os), KM_SLEEP); + zv->zv_zso = zso; + + list_link_init(&zv->zv_next); + mutex_init(&zv->zv_state_lock, NULL, MUTEX_DEFAULT, NULL); + +#ifdef HAVE_SUBMIT_BIO_IN_BLOCK_DEVICE_OPERATIONS + zso->zvo_queue = blk_alloc_queue(NUMA_NO_NODE); +#else + zso->zvo_queue = blk_generic_alloc_queue(zvol_request, NUMA_NO_NODE); +#endif + if (zso->zvo_queue == NULL) + goto out_kmem; + + blk_queue_set_write_cache(zso->zvo_queue, B_TRUE, B_TRUE); + + /* Limit read-ahead to a single page to prevent over-prefetching. */ + blk_queue_set_read_ahead(zso->zvo_queue, 1); + + /* Disable write merging in favor of the ZIO pipeline. */ + blk_queue_flag_set(QUEUE_FLAG_NOMERGES, zso->zvo_queue); + + zso->zvo_disk = alloc_disk(ZVOL_MINORS); + if (zso->zvo_disk == NULL) + goto out_queue; + + zso->zvo_queue->queuedata = zv; + zso->zvo_dev = dev; + zv->zv_open_count = 0; + strlcpy(zv->zv_name, name, MAXNAMELEN); + + zfs_rangelock_init(&zv->zv_rangelock, NULL, NULL); + rw_init(&zv->zv_suspend_lock, NULL, RW_DEFAULT, NULL); + + zso->zvo_disk->major = zvol_major; + zso->zvo_disk->events = DISK_EVENT_MEDIA_CHANGE; + + if (volmode == ZFS_VOLMODE_DEV) { + /* + * ZFS_VOLMODE_DEV disable partitioning on ZVOL devices: set + * gendisk->minors = 1 as noted in include/linux/genhd.h. + * Also disable extended partition numbers (GENHD_FL_EXT_DEVT) + * and suppresses partition scanning (GENHD_FL_NO_PART_SCAN) + * setting gendisk->flags accordingly. + */ + zso->zvo_disk->minors = 1; +#if defined(GENHD_FL_EXT_DEVT) + zso->zvo_disk->flags &= ~GENHD_FL_EXT_DEVT; +#endif +#if defined(GENHD_FL_NO_PART_SCAN) + zso->zvo_disk->flags |= GENHD_FL_NO_PART_SCAN; +#endif + } + zso->zvo_disk->first_minor = (dev & MINORMASK); + zso->zvo_disk->fops = &zvol_ops; + zso->zvo_disk->private_data = zv; + zso->zvo_disk->queue = zso->zvo_queue; + snprintf(zso->zvo_disk->disk_name, DISK_NAME_LEN, "%s%d", + ZVOL_DEV_NAME, (dev & MINORMASK)); + + return (zv); + +out_queue: + blk_cleanup_queue(zso->zvo_queue); +out_kmem: + kmem_free(zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); + return (NULL); +} + +/* + * Cleanup then free a zvol_state_t which was created by zvol_alloc(). + * At this time, the structure is not opened by anyone, is taken off + * the zvol_state_list, and has its private data set to NULL. + * The zvol_state_lock is dropped. + * + * This function may take many milliseconds to complete (e.g. we've seen + * it take over 256ms), due to the calls to "blk_cleanup_queue" and + * "del_gendisk". Thus, consumers need to be careful to account for this + * latency when calling this function. 
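+ *
+ * (Aside on the volmode handling in zvol_alloc() above: the property
+ * is set per dataset, e.g.
+ *
+ *	zfs set volmode=dev pool/vol
+ *
+ * which, per the code above, exposes a bare /dev/zdN node with
+ * minors = 1 and partition scanning suppressed.)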
+ */ +static void +zvol_free(zvol_state_t *zv) +{ + + ASSERT(!RW_LOCK_HELD(&zv->zv_suspend_lock)); + ASSERT(!MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(zv->zv_open_count == 0); + ASSERT(zv->zv_zso->zvo_disk->private_data == NULL); + + rw_destroy(&zv->zv_suspend_lock); + zfs_rangelock_fini(&zv->zv_rangelock); + + del_gendisk(zv->zv_zso->zvo_disk); + blk_cleanup_queue(zv->zv_zso->zvo_queue); + put_disk(zv->zv_zso->zvo_disk); + + ida_simple_remove(&zvol_ida, + MINOR(zv->zv_zso->zvo_dev) >> ZVOL_MINOR_BITS); + + mutex_destroy(&zv->zv_state_lock); + dataset_kstats_destroy(&zv->zv_kstat); + + kmem_free(zv->zv_zso, sizeof (struct zvol_state_os)); + kmem_free(zv, sizeof (zvol_state_t)); +} + +/* + * Create a block device minor node and setup the linkage between it + * and the specified volume. Once this function returns the block + * device is live and ready for use. + */ +static int +zvol_os_create_minor(const char *name) +{ + zvol_state_t *zv; + objset_t *os; + dmu_object_info_t *doi; + uint64_t volsize; + uint64_t len; + unsigned minor = 0; + int error = 0; + int idx; + uint64_t hash = zvol_name_hash(name); + + if (zvol_inhibit_dev) + return (0); + + idx = ida_simple_get(&zvol_ida, 0, 0, kmem_flags_convert(KM_SLEEP)); + if (idx < 0) + return (SET_ERROR(-idx)); + minor = idx << ZVOL_MINOR_BITS; + + zv = zvol_find_by_name_hash(name, hash, RW_NONE); + if (zv) { + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + mutex_exit(&zv->zv_state_lock); + ida_simple_remove(&zvol_ida, idx); + return (SET_ERROR(EEXIST)); + } + + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, B_TRUE, FTAG, &os); + if (error) + goto out_doi; + + error = dmu_object_info(os, ZVOL_OBJ, doi); + if (error) + goto out_dmu_objset_disown; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) + goto out_dmu_objset_disown; + + zv = zvol_alloc(MKDEV(zvol_major, minor), name); + if (zv == NULL) { + error = SET_ERROR(EAGAIN); + goto out_dmu_objset_disown; + } + zv->zv_hash = hash; + + if (dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; + + zv->zv_volblocksize = doi->doi_data_block_size; + zv->zv_volsize = volsize; + zv->zv_objset = os; + + set_capacity(zv->zv_zso->zvo_disk, zv->zv_volsize >> 9); + + blk_queue_max_hw_sectors(zv->zv_zso->zvo_queue, + (DMU_MAX_ACCESS / 4) >> 9); + blk_queue_max_segments(zv->zv_zso->zvo_queue, UINT16_MAX); + blk_queue_max_segment_size(zv->zv_zso->zvo_queue, UINT_MAX); + blk_queue_physical_block_size(zv->zv_zso->zvo_queue, + zv->zv_volblocksize); + blk_queue_io_opt(zv->zv_zso->zvo_queue, zv->zv_volblocksize); + blk_queue_max_discard_sectors(zv->zv_zso->zvo_queue, + (zvol_max_discard_blocks * zv->zv_volblocksize) >> 9); + blk_queue_discard_granularity(zv->zv_zso->zvo_queue, + zv->zv_volblocksize); + blk_queue_flag_set(QUEUE_FLAG_DISCARD, zv->zv_zso->zvo_queue); +#ifdef QUEUE_FLAG_NONROT + blk_queue_flag_set(QUEUE_FLAG_NONROT, zv->zv_zso->zvo_queue); +#endif +#ifdef QUEUE_FLAG_ADD_RANDOM + blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, zv->zv_zso->zvo_queue); +#endif + /* This flag was introduced in kernel version 4.12. 
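+ * (Worked example of the minor numbering in zvol_os_create_minor()
+ * above, assuming the usual ZVOL_MINOR_BITS value of 4, i.e. 16 minors
+ * per volume: ida index 3 yields minor 3 << 4 = 48, so the volume
+ * appears as /dev/zd48 and its partitions can use minors 49-63,
+ * i.e. /dev/zd48p1 through /dev/zd48p15.)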
*/ +#ifdef QUEUE_FLAG_SCSI_PASSTHROUGH + blk_queue_flag_set(QUEUE_FLAG_SCSI_PASSTHROUGH, zv->zv_zso->zvo_queue); +#endif + + if (spa_writeable(dmu_objset_spa(os))) { + if (zil_replay_disable) + zil_destroy(dmu_objset_zil(os), B_FALSE); + else + zil_replay(os, zv, zvol_replay_vector); + } + ASSERT3P(zv->zv_kstat.dk_kstats, ==, NULL); + dataset_kstats_create(&zv->zv_kstat, zv->zv_objset); + + /* + * When udev detects the addition of the device it will immediately + * invoke blkid(8) to determine the type of content on the device. + * Prefetching the blocks commonly scanned by blkid(8) will speed + * up this process. + */ + len = MIN(MAX(zvol_prefetch_bytes, 0), SPA_MAXBLOCKSIZE); + if (len > 0) { + dmu_prefetch(os, ZVOL_OBJ, 0, 0, len, ZIO_PRIORITY_SYNC_READ); + dmu_prefetch(os, ZVOL_OBJ, 0, volsize - len, len, + ZIO_PRIORITY_SYNC_READ); + } + + zv->zv_objset = NULL; +out_dmu_objset_disown: + dmu_objset_disown(os, B_TRUE, FTAG); +out_doi: + kmem_free(doi, sizeof (dmu_object_info_t)); + + /* + * Keep in mind that once add_disk() is called, the zvol is + * announced to the world, and zvol_open()/zvol_release() can + * be called at any time. Incidentally, add_disk() itself calls + * zvol_open()->zvol_first_open() and zvol_release()->zvol_last_close() + * directly as well. + */ + if (error == 0) { + rw_enter(&zvol_state_lock, RW_WRITER); + zvol_insert(zv); + rw_exit(&zvol_state_lock); + add_disk(zv->zv_zso->zvo_disk); + } else { + ida_simple_remove(&zvol_ida, idx); + } + + return (error); +} + +static void +zvol_rename_minor(zvol_state_t *zv, const char *newname) +{ + int readonly = get_disk_ro(zv->zv_zso->zvo_disk); + + ASSERT(RW_LOCK_HELD(&zvol_state_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + + /* move to new hashtable entry */ + zv->zv_hash = zvol_name_hash(zv->zv_name); + hlist_del(&zv->zv_hlink); + hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash)); + + /* + * The block device's read-only state is briefly changed causing + * a KOBJ_CHANGE uevent to be issued. This ensures udev detects + * the name change and fixes the symlinks. This does not change + * ZVOL_RDONLY in zv->zv_flags so the actual read-only state never + * changes. This would normally be done using kobject_uevent() but + * that is a GPL-only symbol which is why we need this workaround. 
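+ *
+ * (Aside on the tunables declared in this file: zvol_threads is
+ * clamped to [1, 1024] in zvol_init() below, so zvol_threads=0 becomes
+ * 1 and zvol_threads=5000 becomes 1024. On a typical Linux install,
+ * persistent values go in /etc/modprobe.d/zfs.conf, e.g.
+ *
+ *	options zfs zvol_threads=64 zvol_prefetch_bytes=1048576
+ *
+ * the file name and location here are an assumption, not mandated.)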
+ */ + set_disk_ro(zv->zv_zso->zvo_disk, !readonly); + set_disk_ro(zv->zv_zso->zvo_disk, readonly); +} + +static void +zvol_set_disk_ro_impl(zvol_state_t *zv, int flags) +{ + + set_disk_ro(zv->zv_zso->zvo_disk, flags); +} + +static void +zvol_set_capacity_impl(zvol_state_t *zv, uint64_t capacity) +{ + + set_capacity(zv->zv_zso->zvo_disk, capacity); +} + +const static zvol_platform_ops_t zvol_linux_ops = { + .zv_free = zvol_free, + .zv_rename_minor = zvol_rename_minor, + .zv_create_minor = zvol_os_create_minor, + .zv_update_volsize = zvol_update_volsize, + .zv_clear_private = zvol_clear_private, + .zv_is_zvol = zvol_is_zvol_impl, + .zv_set_disk_ro = zvol_set_disk_ro_impl, + .zv_set_capacity = zvol_set_capacity_impl, +}; + +int +zvol_init(void) +{ + int error; + int threads = MIN(MAX(zvol_threads, 1), 1024); + + error = register_blkdev(zvol_major, ZVOL_DRIVER); + if (error) { + printk(KERN_INFO "ZFS: register_blkdev() failed %d\n", error); + return (error); + } + zvol_taskq = taskq_create(ZVOL_DRIVER, threads, maxclsyspri, + threads * 2, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + if (zvol_taskq == NULL) { + unregister_blkdev(zvol_major, ZVOL_DRIVER); + return (-ENOMEM); + } + zvol_init_impl(); + blk_register_region(MKDEV(zvol_major, 0), 1UL << MINORBITS, + THIS_MODULE, zvol_probe, NULL, NULL); + + ida_init(&zvol_ida); + zvol_register_ops(&zvol_linux_ops); + return (0); +} + +void +zvol_fini(void) +{ + zvol_fini_impl(); + blk_unregister_region(MKDEV(zvol_major, 0), 1UL << MINORBITS); + unregister_blkdev(zvol_major, ZVOL_DRIVER); + taskq_destroy(zvol_taskq); + ida_destroy(&zvol_ida); +} + +/* BEGIN CSTYLED */ +module_param(zvol_inhibit_dev, uint, 0644); +MODULE_PARM_DESC(zvol_inhibit_dev, "Do not create zvol device nodes"); + +module_param(zvol_major, uint, 0444); +MODULE_PARM_DESC(zvol_major, "Major number for zvol device"); + +module_param(zvol_threads, uint, 0444); +MODULE_PARM_DESC(zvol_threads, "Max number of threads to handle I/O requests"); + +module_param(zvol_request_sync, uint, 0644); +MODULE_PARM_DESC(zvol_request_sync, "Synchronously handle bio requests"); + +module_param(zvol_max_discard_blocks, ulong, 0444); +MODULE_PARM_DESC(zvol_max_discard_blocks, "Max number of blocks to discard"); + +module_param(zvol_prefetch_bytes, uint, 0644); +MODULE_PARM_DESC(zvol_prefetch_bytes, "Prefetch N bytes at zvol start+end"); + +module_param(zvol_volmode, uint, 0644); +MODULE_PARM_DESC(zvol_volmode, "Default volmode property value"); +/* END CSTYLED */ diff --git a/module/spl/Makefile.in b/module/spl/Makefile.in new file mode 100644 index 000000000000..cedbfe92b58a --- /dev/null +++ b/module/spl/Makefile.in @@ -0,0 +1,13 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +mfdir = $(obj) +else +mfdir = $(srctree)/$(src) +endif + +MODULE := spl + +obj-$(CONFIG_ZFS) := $(MODULE).o + +include $(mfdir)/../os/linux/spl/Makefile diff --git a/module/unicode/Makefile.in b/module/unicode/Makefile.in new file mode 100644 index 000000000000..59c07c4555b7 --- /dev/null +++ b/module/unicode/Makefile.in @@ -0,0 +1,11 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +endif + +MODULE := zunicode + +obj-$(CONFIG_ZFS) := $(MODULE).o + +$(MODULE)-objs += u8_textprep.o +$(MODULE)-objs += uconv.o diff --git a/module/unicode/u8_textprep.c b/module/unicode/u8_textprep.c new file mode 100644 index 000000000000..65f555d88947 --- /dev/null +++ b/module/unicode/u8_textprep.c @@ -0,0 +1,2151 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the 
terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+
+
+
+/*
+ * UTF-8 text preparation functions (PSARC/2007/149, PSARC/2007/458).
+ *
+ * Man pages: u8_textprep_open(9F), u8_textprep_buf(9F), u8_textprep_close(9F),
+ * u8_textprep_str(9F), u8_strcmp(9F), and u8_validate(9F). See also
+ * the section 3C man pages.
+ * Interface stability: Committed.
+ */
+
+#include <sys/types.h>
+#include <sys/strings.h>
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/debug.h>
+#include <sys/kmem.h>
+#include <sys/sunddi.h>
+#include <sys/u8_textprep.h>
+#include <sys/byteorder.h>
+#include <sys/errno.h>
+#include <sys/u8_textprep_data.h>
+#include <sys/mod.h>
+
+/* The maximum possible number of bytes in a UTF-8 character. */
+#define U8_MB_CUR_MAX (4)
+
+/*
+ * The maximum number of bytes needed for a UTF-8 character to cover
+ * U+0000 - U+FFFF, i.e., the coding space of now deprecated UCS-2.
+ */
+#define U8_MAX_BYTES_UCS2 (3)
+
+/* The maximum possible number of bytes in a Stream-Safe Text. */
+#define U8_STREAM_SAFE_TEXT_MAX (128)
+
+/*
+ * The maximum number of characters in a combining/conjoining sequence and
+ * the actual upperbound limit of a combining/conjoining sequence.
+ */
+#define U8_MAX_CHARS_A_SEQ (32)
+#define U8_UPPER_LIMIT_IN_A_SEQ (31)
+
+/* The combining class value for Starter. */
+#define U8_COMBINING_CLASS_STARTER (0)
+
+/*
+ * Some Hangul-related macros follow.
+ *
+ * The first and the last of the Hangul syllables, Hangul Jamo leading
+ * consonants, vowels, and optional trailing consonants in Unicode scalar
+ * values.
+ *
+ * Note that U8_HANGUL_JAMO_T_FIRST below is 0x11A7, not the actual
+ * U+11A8. This is because the trailing consonant is optional, so we
+ * pre-subtract one.
+ *
+ * Each of the 19 modern leading consonants has a total of 588 possible
+ * syllables, since Hangul has 21 modern vowels and 27 modern trailing
+ * consonants plus 1 for the no-trailing-consonant case, i.e., 21 x 28 = 588.
+ *
+ * There is also a bunch of Hangul-related macros below. Bear in mind that
+ * U8_HANGUL_JAMO_1ST_BYTE can be used to check whether a byte could start
+ * a Hangul Jamo, but it does not guarantee that the character is one; it
+ * only makes it likely.
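+ *
+ * (Worked example of the arithmetic above: U+AC01, HANGUL SYLLABLE
+ * GAG, decomposes with u1 = 0xAC01 - 0xAC00 = 1, so
+ *
+ *	L = 0x1100 + 1 / 588 = U+1100
+ *	V = 0x1161 + (1 % 588) / 28 = U+1161
+ *	T = 1 % 28 = 1, i.e. 0x11A7 + 1 = U+11A8
+ *
+ * giving a three-Jamo sequence that is nine UTF-8 bytes long.)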
+ */ +#define U8_HANGUL_SYL_FIRST (0xAC00U) +#define U8_HANGUL_SYL_LAST (0xD7A3U) + +#define U8_HANGUL_JAMO_L_FIRST (0x1100U) +#define U8_HANGUL_JAMO_L_LAST (0x1112U) +#define U8_HANGUL_JAMO_V_FIRST (0x1161U) +#define U8_HANGUL_JAMO_V_LAST (0x1175U) +#define U8_HANGUL_JAMO_T_FIRST (0x11A7U) +#define U8_HANGUL_JAMO_T_LAST (0x11C2U) + +#define U8_HANGUL_V_COUNT (21) +#define U8_HANGUL_VT_COUNT (588) +#define U8_HANGUL_T_COUNT (28) + +#define U8_HANGUL_JAMO_1ST_BYTE (0xE1U) + +#define U8_SAVE_HANGUL_AS_UTF8(s, i, j, k, b) \ + (s)[(i)] = (uchar_t)(0xE0U | ((uint32_t)(b) & 0xF000U) >> 12); \ + (s)[(j)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x0FC0U) >> 6); \ + (s)[(k)] = (uchar_t)(0x80U | ((uint32_t)(b) & 0x003FU)); + +#define U8_HANGUL_JAMO_L(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_L_LAST) + +#define U8_HANGUL_JAMO_V(u) \ + ((u) >= U8_HANGUL_JAMO_V_FIRST && (u) <= U8_HANGUL_JAMO_V_LAST) + +#define U8_HANGUL_JAMO_T(u) \ + ((u) > U8_HANGUL_JAMO_T_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_JAMO(u) \ + ((u) >= U8_HANGUL_JAMO_L_FIRST && (u) <= U8_HANGUL_JAMO_T_LAST) + +#define U8_HANGUL_SYLLABLE(u) \ + ((u) >= U8_HANGUL_SYL_FIRST && (u) <= U8_HANGUL_SYL_LAST) + +#define U8_HANGUL_COMPOSABLE_L_V(s, u) \ + ((s) == U8_STATE_HANGUL_L && U8_HANGUL_JAMO_V((u))) + +#define U8_HANGUL_COMPOSABLE_LV_T(s, u) \ + ((s) == U8_STATE_HANGUL_LV && U8_HANGUL_JAMO_T((u))) + +/* The types of decomposition mappings. */ +#define U8_DECOMP_BOTH (0xF5U) +#define U8_DECOMP_CANONICAL (0xF6U) + +/* The indicator for 16-bit table. */ +#define U8_16BIT_TABLE_INDICATOR (0x8000U) + +/* The following are some convenience macros. */ +#define U8_PUT_3BYTES_INTO_UTF32(u, b1, b2, b3) \ + (u) = ((((uint32_t)(b1) & 0x0F) << 12) | \ + (((uint32_t)(b2) & 0x3F) << 6) | \ + ((uint32_t)(b3) & 0x3F)); + +#define U8_SIMPLE_SWAP(a, b, t) \ + (t) = (a); \ + (a) = (b); \ + (b) = (t); + +#define U8_ASCII_TOUPPER(c) \ + (((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 'A' : (c)) + +#define U8_ASCII_TOLOWER(c) \ + (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' + 'a' : (c)) + +#define U8_ISASCII(c) (((uchar_t)(c)) < 0x80U) +/* + * The following macro assumes that the two characters that are to be + * swapped are adjacent to each other and 'a' comes before 'b'. + * + * If the assumptions are not met, then, the macro will fail. + */ +#define U8_SWAP_COMB_MARKS(a, b) \ + for (k = 0; k < disp[(a)]; k++) \ + u8t[k] = u8s[start[(a)] + k]; \ + for (k = 0; k < disp[(b)]; k++) \ + u8s[start[(a)] + k] = u8s[start[(b)] + k]; \ + start[(b)] = start[(a)] + disp[(b)]; \ + for (k = 0; k < disp[(a)]; k++) \ + u8s[start[(b)] + k] = u8t[k]; \ + U8_SIMPLE_SWAP(comb_class[(a)], comb_class[(b)], tc); \ + U8_SIMPLE_SWAP(disp[(a)], disp[(b)], tc); + +/* The possible states during normalization. */ +typedef enum { + U8_STATE_START = 0, + U8_STATE_HANGUL_L = 1, + U8_STATE_HANGUL_LV = 2, + U8_STATE_HANGUL_LVT = 3, + U8_STATE_HANGUL_V = 4, + U8_STATE_HANGUL_T = 5, + U8_STATE_COMBINING_MARK = 6 +} u8_normalization_states_t; + +/* + * The three vectors at below are used to check bytes of a given UTF-8 + * character are valid and not containing any malformed byte values. + * + * We used to have a quite relaxed UTF-8 binary representation but then there + * was some security related issues and so the Unicode Consortium defined + * and announced the UTF-8 Corrigendum at Unicode 3.1 and then refined it + * one more time at the Unicode 3.2. The following three tables are based on + * that. 
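+ *
+ * For example, the tables below reject the classic malformed forms:
+ * 0xC0 0xAF (an overlong encoding of '/') fails because 0xC0 is marked
+ * illegal outright; 0xE0 0x80 ... fails because the minimum valid
+ * second byte after 0xE0 is 0xA0; 0xED 0xA0 0x80 (the surrogate
+ * U+D800) fails because the maximum valid second byte after 0xED is
+ * 0x9F; and 0xF4 0x90 ... fails because the maximum second byte after
+ * 0xF4 is 0x8F, capping the range at U+10FFFF.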
+ */ + +#define U8_ILLEGAL_NEXT_BYTE_COMMON(c) ((c) < 0x80 || (c) > 0xBF) + +#define I_ U8_ILLEGAL_CHAR +#define O_ U8_OUT_OF_RANGE_CHAR + +const int8_t u8_number_of_bytes[0x100] = { + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* 80 81 82 83 84 85 86 87 88 89 8A 8B 8C 8D 8E 8F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* 90 91 92 93 94 95 96 97 98 99 9A 9B 9C 9D 9E 9F */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* A0 A1 A2 A3 A4 A5 A6 A7 A8 A9 AA AB AC AD AE AF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* B0 B1 B2 B3 B4 B5 B6 B7 B8 B9 BA BB BC BD BE BF */ + I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, I_, + +/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ + I_, I_, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + +/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ + 4, 4, 4, 4, 4, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, O_, +}; + +#undef I_ +#undef O_ + +const uint8_t u8_valid_min_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* C8 C9 CA CB CC CD CE CF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* D8 D9 DA DB DC DD DE DF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* E8 E9 EA EB EC ED EE EF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + +const uint8_t u8_valid_max_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* 
C8 C9 CA CB CC CD CE CF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* D8 D9 DA DB DC DD DE DF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, +/* E8 E9 EA EB EC ED EE EF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, +}; + + +/* + * The u8_validate() validates on the given UTF-8 character string and + * calculate the byte length. It is quite similar to mblen(3C) except that + * this will validate against the list of characters if required and + * specific to UTF-8 and Unicode. + */ +int +u8_validate(char *u8str, size_t n, char **list, int flag, int *errnum) +{ + uchar_t *ib; + uchar_t *ibtail; + uchar_t **p; + uchar_t *s1; + uchar_t *s2; + uchar_t f; + int sz; + size_t i; + int ret_val; + boolean_t second; + boolean_t no_need_to_validate_entire; + boolean_t check_additional; + boolean_t validate_ucs2_range_only; + + if (! u8str) + return (0); + + ib = (uchar_t *)u8str; + ibtail = ib + n; + + ret_val = 0; + + no_need_to_validate_entire = ! (flag & U8_VALIDATE_ENTIRE); + check_additional = flag & U8_VALIDATE_CHECK_ADDITIONAL; + validate_ucs2_range_only = flag & U8_VALIDATE_UCS2_RANGE; + + while (ib < ibtail) { + /* + * The first byte of a UTF-8 character tells how many + * bytes will follow for the character. If the first byte + * is an illegal byte value or out of range value, we just + * return -1 with an appropriate error number. + */ + sz = u8_number_of_bytes[*ib]; + if (sz == U8_ILLEGAL_CHAR) { + *errnum = EILSEQ; + return (-1); + } + + if (sz == U8_OUT_OF_RANGE_CHAR || + (validate_ucs2_range_only && sz > U8_MAX_BYTES_UCS2)) { + *errnum = ERANGE; + return (-1); + } + + /* + * If we don't have enough bytes to check on, that's also + * an error. As you can see, we give illegal byte sequence + * checking higher priority then EINVAL cases. + */ + if ((ibtail - ib) < sz) { + *errnum = EINVAL; + return (-1); + } + + if (sz == 1) { + ib++; + ret_val++; + } else { + /* + * Check on the multi-byte UTF-8 character. For more + * details on this, see comment added for the used + * data structures at the beginning of the file. + */ + f = *ib++; + ret_val++; + second = B_TRUE; + for (i = 1; i < sz; i++) { + if (second) { + if (*ib < u8_valid_min_2nd_byte[f] || + *ib > u8_valid_max_2nd_byte[f]) { + *errnum = EILSEQ; + return (-1); + } + second = B_FALSE; + } else if (U8_ILLEGAL_NEXT_BYTE_COMMON(*ib)) { + *errnum = EILSEQ; + return (-1); + } + ib++; + ret_val++; + } + } + + if (check_additional) { + for (p = (uchar_t **)list, i = 0; p[i]; i++) { + s1 = ib - sz; + s2 = p[i]; + while (s1 < ib) { + if (*s1 != *s2 || *s2 == '\0') + break; + s1++; + s2++; + } + + if (s1 >= ib && *s2 == '\0') { + *errnum = EBADF; + return (-1); + } + } + } + + if (no_need_to_validate_entire) + break; + } + + return (ret_val); +} + +/* + * The do_case_conv() looks at the mapping tables and returns found + * bytes if any. If not found, the input bytes are returned. The function + * always terminate the return bytes with a null character assuming that + * there are plenty of room to do so. + * + * The case conversions are simple case conversions mapping a character to + * another character as specified in the Unicode data. The byte size of + * the mapped character could be different from that of the input character. 
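+ *
+ * (A minimal usage sketch for u8_validate() above; "str" is a
+ * hypothetical NUL-terminated buffer:
+ *
+ *	int err, len;
+ *	len = u8_validate(str, strlen(str), NULL, U8_VALIDATE_ENTIRE,
+ *	    &err);
+ *	if (len < 0)
+ *		return (err);	// EILSEQ, ERANGE, or EINVAL
+ *
+ * On success, len is the number of bytes validated.)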
+ * + * The return value is the byte length of the returned character excluding + * the terminating null byte. + */ +static size_t +do_case_conv(int uv, uchar_t *u8s, uchar_t *s, int sz, boolean_t is_it_toupper) +{ + size_t i; + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + /* + * At this point, the only possible values for sz are 2, 3, and 4. + * The u8s should point to a vector that is well beyond the size of + * 5 bytes. + */ + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + } else if (sz == 3) { + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + } else { + /* This is not possible but just in case as a fallback. */ + if (is_it_toupper) + *u8s = U8_ASCII_TOUPPER(*s); + else + *u8s = U8_ASCII_TOLOWER(*s); + u8s[1] = '\0'; + + return (1); + } + u8s[sz] = '\0'; + + /* + * Let's find out if we have a corresponding character. + */ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_case_common_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + if (is_it_toupper) { + b3_tbl = u8_toupper_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_toupper_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_toupper_b4_tbl[uv][b3_tbl][b4 + 1]; + + /* Either there is no match or an error at the table. */ + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_toupper_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_toupper_final_tbl[uv][b3_base + start_id]; + } else { + b3_tbl = u8_tolower_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + start_id = u8_tolower_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_tolower_b4_tbl[uv][b3_tbl][b4 + 1]; + + if (start_id >= end_id || (end_id - start_id) > U8_MB_CUR_MAX) + return ((size_t)sz); + + b3_base = u8_tolower_b3_tbl[uv][b2][b3].base; + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_tolower_final_tbl[uv][b3_base + start_id]; + } + + /* + * If i is still zero, that means there is no corresponding character. + */ + if (i == 0) + return ((size_t)sz); + + u8s[i] = '\0'; + + return (i); +} + +/* + * The do_case_compare() function compares the two input strings, s1 and s2, + * one character at a time doing case conversions if applicable and return + * the comparison result as like strcmp(). + * + * Since, in empirical sense, most of text data are 7-bit ASCII characters, + * we treat the 7-bit ASCII characters as a special case trying to yield + * faster processing time. + */ +static int +do_case_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, + size_t n2, boolean_t is_it_toupper, int *errnum) +{ + int f; + int sz1; + int sz2; + size_t j; + size_t i1; + size_t i2; + uchar_t u8s1[U8_MB_CUR_MAX + 1]; + uchar_t u8s2[U8_MB_CUR_MAX + 1]; + + i1 = i2 = 0; + while (i1 < n1 && i2 < n2) { + /* + * Find out what would be the byte length for this UTF-8 + * character at string s1 and also find out if this is + * an illegal start byte or not and if so, issue a proper + * error number and yet treat this byte as a character. 
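+ *
+ * (Worked example for do_case_conv() above: U+00E9, 'e' with acute,
+ * is the two bytes 0xC3 0xA9, so the lookup uses b3 = 0xC3 and
+ * b4 = 0xA9; the mapped uppercase character is U+00C9, 0xC3 0x89.
+ * Both happen to be two bytes here, but mappings may change length.)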
+ */ + sz1 = u8_number_of_bytes[*s1]; + if (sz1 < 0) { + *errnum = EILSEQ; + sz1 = 1; + } + + /* + * For 7-bit ASCII characters mainly, we do a quick case + * conversion right at here. + * + * If we don't have enough bytes for this character, issue + * an EINVAL error and use what are available. + * + * If we have enough bytes, find out if there is + * a corresponding uppercase character and if so, copy over + * the bytes for a comparison later. If there is no + * corresponding uppercase character, then, use what we have + * for the comparison. + */ + if (sz1 == 1) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else + u8s1[0] = U8_ASCII_TOLOWER(*s1); + s1++; + u8s1[1] = '\0'; + } else if ((i1 + sz1) > n1) { + *errnum = EINVAL; + for (j = 0; (i1 + j) < n1; ) + u8s1[j++] = *s1++; + u8s1[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s1, s1, sz1, is_it_toupper); + s1 += sz1; + } + + /* Do the same for the string s2. */ + sz2 = u8_number_of_bytes[*s2]; + if (sz2 < 0) { + *errnum = EILSEQ; + sz2 = 1; + } + + if (sz2 == 1) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else + u8s2[0] = U8_ASCII_TOLOWER(*s2); + s2++; + u8s2[1] = '\0'; + } else if ((i2 + sz2) > n2) { + *errnum = EINVAL; + for (j = 0; (i2 + j) < n2; ) + u8s2[j++] = *s2++; + u8s2[j] = '\0'; + } else { + (void) do_case_conv(uv, u8s2, s2, sz2, is_it_toupper); + s2 += sz2; + } + + /* Now compare the two characters. */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + f = strcmp((const char *)u8s1, (const char *)u8s2); + if (f != 0) + return (f); + } + + /* + * They were the same. Let's move on to the next + * characters then. + */ + i1 += sz1; + i2 += sz2; + } + + /* + * We compared until the end of either or both strings. + * + * If we reached to or went over the ends for the both, that means + * they are the same. + * + * If we reached only one of the two ends, that means the other string + * has something which then the fact can be used to determine + * the return value. + */ + if (i1 >= n1) { + if (i2 >= n2) + return (0); + return (-1); + } + return (1); +} + +/* + * The combining_class() function checks on the given bytes and find out + * the corresponding Unicode combining class value. The return value 0 means + * it is a Starter. Any illegal UTF-8 character will also be treated as + * a Starter. + */ +static uchar_t +combining_class(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b4 = 0; + + if (sz == 1 || sz > 4) + return (0); + + if (sz == 2) { + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } + + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b2 = u8_combining_class_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + b3 = u8_combining_class_b3_tbl[uv][b2][b3]; + if (b3 == U8_TBL_ELEMENT_NOT_DEF) + return (0); + + return (u8_combining_class_b4_tbl[uv][b3][b4]); +} + +/* + * The do_decomp() function finds out a matching decomposition if any + * and return. If there is no match, the input bytes are copied and returned. + * The function also checks if there is a Hangul, decomposes it if necessary + * and returns. + * + * To save time, a single byte 7-bit ASCII character should be handled by + * the caller. + * + * The function returns the number of bytes returned sans always terminating + * the null byte. 
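+ * (For example, the canonical decomposition of U+00E9 (0xC3 0xA9) is
+ * U+0065 U+0301, 'e' followed by a combining acute accent, i.e. the
+ * three bytes 0x65 0xCC 0x81.)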
It will also return a state that will tell if there was + * a Hangul character decomposed which then will be used by the caller. + */ +static size_t +do_decomp(size_t uv, uchar_t *u8s, uchar_t *s, int sz, + boolean_t canonical_decomposition, u8_normalization_states_t *state) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + size_t i; + uint32_t u1; + + if (sz == 2) { + b3 = u8s[0] = s[0]; + b4 = u8s[1] = s[1]; + u8s[2] = '\0'; + } else if (sz == 3) { + /* Convert it to a Unicode scalar value. */ + U8_PUT_3BYTES_INTO_UTF32(u1, s[0], s[1], s[2]); + + /* + * If this is a Hangul syllable, we decompose it into + * a leading consonant, a vowel, and an optional trailing + * consonant and then return. + */ + if (U8_HANGUL_SYLLABLE(u1)) { + u1 -= U8_HANGUL_SYL_FIRST; + + b1 = U8_HANGUL_JAMO_L_FIRST + u1 / U8_HANGUL_VT_COUNT; + b2 = U8_HANGUL_JAMO_V_FIRST + (u1 % U8_HANGUL_VT_COUNT) + / U8_HANGUL_T_COUNT; + b3 = u1 % U8_HANGUL_T_COUNT; + + U8_SAVE_HANGUL_AS_UTF8(u8s, 0, 1, 2, b1); + U8_SAVE_HANGUL_AS_UTF8(u8s, 3, 4, 5, b2); + if (b3) { + b3 += U8_HANGUL_JAMO_T_FIRST; + U8_SAVE_HANGUL_AS_UTF8(u8s, 6, 7, 8, b3); + + u8s[9] = '\0'; + *state = U8_STATE_HANGUL_LVT; + return (9); + } + + u8s[6] = '\0'; + *state = U8_STATE_HANGUL_LV; + return (6); + } + + b2 = u8s[0] = s[0]; + b3 = u8s[1] = s[1]; + b4 = u8s[2] = s[2]; + u8s[3] = '\0'; + + /* + * If this is a Hangul Jamo, we know there is nothing + * further that we can decompose. + */ + if (U8_HANGUL_JAMO_L(u1)) { + *state = U8_STATE_HANGUL_L; + return (3); + } + + if (U8_HANGUL_JAMO_V(u1)) { + if (*state == U8_STATE_HANGUL_L) + *state = U8_STATE_HANGUL_LV; + else + *state = U8_STATE_HANGUL_V; + return (3); + } + + if (U8_HANGUL_JAMO_T(u1)) { + if (*state == U8_STATE_HANGUL_LV) + *state = U8_STATE_HANGUL_LVT; + else + *state = U8_STATE_HANGUL_T; + return (3); + } + } else if (sz == 4) { + b1 = u8s[0] = s[0]; + b2 = u8s[1] = s[1]; + b3 = u8s[2] = s[2]; + b4 = u8s[3] = s[3]; + u8s[4] = '\0'; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. + */ + u8s[0] = s[0]; + u8s[1] = '\0'; + *state = U8_STATE_START; + return (1); + } + + /* + * At this point, this routine does not know what it would get. + * The caller should sort it out if the state isn't a Hangul one. + */ + *state = U8_STATE_START; + + /* Try to find matching decomposition mapping byte sequence. */ + b1 = u8_common_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b2 = u8_decomp_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + b3_tbl = u8_decomp_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return ((size_t)sz); + + /* + * If b3_tbl is bigger than or equal to U8_16BIT_TABLE_INDICATOR + * which is 0x8000, this means we couldn't fit the mappings into + * the cardinality of a unsigned byte. + */ + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_decomp_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_decomp_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + /* This also means there wasn't any matching decomposition. 
*/ + if (start_id >= end_id) + return ((size_t)sz); + + /* + * The final table for decomposition mappings has three types of + * byte sequences depending on whether a mapping is for compatibility + * decomposition, canonical decomposition, or both like the following: + * + * (1) Compatibility decomposition mappings: + * + * +---+---+-...-+---+ + * | B0| B1| ... | Bm| + * +---+---+-...-+---+ + * + * The first byte, B0, is always less then 0xF5 (U8_DECOMP_BOTH). + * + * (2) Canonical decomposition mappings: + * + * +---+---+---+-...-+---+ + * | T | b0| b1| ... | bn| + * +---+---+---+-...-+---+ + * + * where the first byte, T, is 0xF6 (U8_DECOMP_CANONICAL). + * + * (3) Both mappings: + * + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * | T | D | b0| b1| ... | bn| B0| B1| ... | Bm| + * +---+---+---+---+-...-+---+---+---+-...-+---+ + * + * where T is 0xF5 (U8_DECOMP_BOTH) and D is a displacement + * byte, b0 to bn are canonical mapping bytes and B0 to Bm are + * compatibility mapping bytes. + * + * Note that compatibility decomposition means doing recursive + * decompositions using both compatibility decomposition mappings and + * canonical decomposition mappings. On the other hand, canonical + * decomposition means doing recursive decompositions using only + * canonical decomposition mappings. Since the table we have has gone + * through the recursions already, we do not need to do so during + * runtime, i.e., the table has been completely flattened out + * already. + */ + + b3_base = u8_decomp_b3_tbl[uv][b2][b3].base; + + /* Get the type, T, of the byte sequence. */ + b1 = u8_decomp_final_tbl[uv][b3_base + start_id]; + + /* + * If necessary, adjust start_id, end_id, or both. Note that if + * this is compatibility decomposition mapping, there is no + * adjustment. + */ + if (canonical_decomposition) { + /* Is the mapping only for compatibility decomposition? */ + if (b1 < U8_DECOMP_BOTH) + return ((size_t)sz); + + start_id++; + + if (b1 == U8_DECOMP_BOTH) { + end_id = start_id + + u8_decomp_final_tbl[uv][b3_base + start_id]; + start_id++; + } + } else { + /* + * Unless this is a compatibility decomposition mapping, + * we adjust the start_id. + */ + if (b1 == U8_DECOMP_BOTH) { + start_id++; + start_id += u8_decomp_final_tbl[uv][b3_base + start_id]; + } else if (b1 == U8_DECOMP_CANONICAL) { + start_id++; + } + } + + for (i = 0; start_id < end_id; start_id++) + u8s[i++] = u8_decomp_final_tbl[uv][b3_base + start_id]; + u8s[i] = '\0'; + + return (i); +} + +/* + * The find_composition_start() function uses the character bytes given and + * find out the matching composition mappings if any and return the address + * to the composition mappings as explained in the do_composition(). + */ +static uchar_t * +find_composition_start(size_t uv, uchar_t *s, size_t sz) +{ + uint16_t b1 = 0; + uint16_t b2 = 0; + uint16_t b3 = 0; + uint16_t b3_tbl; + uint16_t b3_base; + uint16_t b4 = 0; + size_t start_id; + size_t end_id; + + if (sz == 1) { + b4 = s[0]; + } else if (sz == 2) { + b3 = s[0]; + b4 = s[1]; + } else if (sz == 3) { + b2 = s[0]; + b3 = s[1]; + b4 = s[2]; + } else if (sz == 4) { + b1 = s[0]; + b2 = s[1]; + b3 = s[2]; + b4 = s[3]; + } else { + /* + * This is a fallback and should not happen if the function + * was called properly. 
+ */ + return (NULL); + } + + b1 = u8_composition_b1_tbl[uv][b1]; + if (b1 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b2 = u8_composition_b2_tbl[uv][b1][b2]; + if (b2 == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + b3_tbl = u8_composition_b3_tbl[uv][b2][b3].tbl_id; + if (b3_tbl == U8_TBL_ELEMENT_NOT_DEF) + return (NULL); + + if (b3_tbl >= U8_16BIT_TABLE_INDICATOR) { + b3_tbl -= U8_16BIT_TABLE_INDICATOR; + start_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_16bit_tbl[uv][b3_tbl][b4 + 1]; + } else { + start_id = u8_composition_b4_tbl[uv][b3_tbl][b4]; + end_id = u8_composition_b4_tbl[uv][b3_tbl][b4 + 1]; + } + + if (start_id >= end_id) + return (NULL); + + b3_base = u8_composition_b3_tbl[uv][b2][b3].base; + + return ((uchar_t *)&(u8_composition_final_tbl[uv][b3_base + start_id])); +} + +/* + * The blocked() function checks on the combining class values of previous + * characters in this sequence and return whether it is blocked or not. + */ +static boolean_t +blocked(uchar_t *comb_class, size_t last) +{ + uchar_t my_comb_class; + size_t i; + + my_comb_class = comb_class[last]; + for (i = 1; i < last; i++) + if (comb_class[i] >= my_comb_class || + comb_class[i] == U8_COMBINING_CLASS_STARTER) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The do_composition() reads the character string pointed by 's' and + * do necessary canonical composition and then copy over the result back to + * the 's'. + * + * The input argument 's' cannot contain more than 32 characters. + */ +static size_t +do_composition(size_t uv, uchar_t *s, uchar_t *comb_class, uchar_t *start, + uchar_t *disp, size_t last, uchar_t **os, uchar_t *oslast) +{ + uchar_t t[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc[U8_MB_CUR_MAX] = { '\0' }; + uint8_t saved_marks[U8_MAX_CHARS_A_SEQ]; + size_t saved_marks_count; + uchar_t *p; + uchar_t *saved_p; + uchar_t *q; + size_t i; + size_t saved_i; + size_t j; + size_t k; + size_t l; + size_t C; + size_t saved_l; + size_t size; + uint32_t u1; + uint32_t u2; + boolean_t match_not_found = B_TRUE; + + /* + * This should never happen unless the callers are doing some strange + * and unexpected things. + * + * The "last" is the index pointing to the last character not last + 1. + */ + if (last >= U8_MAX_CHARS_A_SEQ) + last = U8_UPPER_LIMIT_IN_A_SEQ; + + for (i = l = 0; i <= last; i++) { + /* + * The last or any non-Starters at the beginning, we don't + * have any chance to do composition and so we just copy them + * to the temporary buffer. + */ + if (i >= last || comb_class[i] != U8_COMBINING_CLASS_STARTER) { +SAVE_THE_CHAR: + p = s + start[i]; + size = disp[i]; + for (k = 0; k < size; k++) + t[l++] = *p++; + continue; + } + + /* + * If this could be a start of Hangul Jamos, then, we try to + * conjoin them. 
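+ * (The arithmetic below is the inverse of the decomposition described
+ * earlier: for the Jamo sequence U+1100 U+1161 U+11A8,
+ *
+ *	u1 = 0xAC00 + ((0x1100 - 0x1100) * 21 + (0x1161 - 0x1161)) * 28
+ *	   = 0xAC00, then u1 += 0x11A8 - 0x11A7 = 1,
+ *
+ * yielding the precomposed syllable U+AC01.)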
+ */ + if (s[start[i]] == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, s[start[i]], + s[start[i] + 1], s[start[i] + 2]); + U8_PUT_3BYTES_INTO_UTF32(u2, s[start[i] + 3], + s[start[i] + 4], s[start[i] + 5]); + + if (U8_HANGUL_JAMO_L(u1) && U8_HANGUL_JAMO_V(u2)) { + u1 -= U8_HANGUL_JAMO_L_FIRST; + u2 -= U8_HANGUL_JAMO_V_FIRST; + u1 = U8_HANGUL_SYL_FIRST + + (u1 * U8_HANGUL_V_COUNT + u2) * + U8_HANGUL_T_COUNT; + + i += 2; + if (i <= last) { + U8_PUT_3BYTES_INTO_UTF32(u2, + s[start[i]], s[start[i] + 1], + s[start[i] + 2]); + + if (U8_HANGUL_JAMO_T(u2)) { + u1 += u2 - + U8_HANGUL_JAMO_T_FIRST; + i++; + } + } + + U8_SAVE_HANGUL_AS_UTF8(t + l, 0, 1, 2, u1); + i--; + l += 3; + continue; + } + } + + /* + * Let's then find out if this Starter has composition + * mapping. + */ + p = find_composition_start(uv, s + start[i], disp[i]); + if (p == NULL) + goto SAVE_THE_CHAR; + + /* + * We have a Starter with composition mapping and the next + * character is a non-Starter. Let's try to find out if + * we can do composition. + */ + + saved_p = p; + saved_i = i; + saved_l = l; + saved_marks_count = 0; + +TRY_THE_NEXT_MARK: + q = s + start[++i]; + size = disp[i]; + + /* + * The next for() loop compares the non-Starter pointed by + * 'q' with the possible (joinable) characters pointed by 'p'. + * + * The composition final table entry pointed by the 'p' + * looks like the following: + * + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * | C | b0| b2| ... | bn| F | B0| B1| ... | Bm| F | + * +---+---+---+-...-+---+---+---+---+-...-+---+---+ + * + * where C is the count byte indicating the number of + * mapping pairs where each pair would be look like + * (b0-bn F, B0-Bm F). The b0-bn are the bytes of the second + * character of a canonical decomposition and the B0-Bm are + * the bytes of a matching composite character. The F is + * a filler byte after each character as the separator. + */ + + match_not_found = B_TRUE; + + for (C = *p++; C > 0; C--) { + for (k = 0; k < size; p++, k++) + if (*p != q[k]) + break; + + /* Have we found it? */ + if (k >= size && *p == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++p != U8_TBL_ELEMENT_FILLER) + t[l++] = *p; + + break; + } + + /* We didn't find; skip to the next pair. */ + if (*p != U8_TBL_ELEMENT_FILLER) + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + while (*++p != U8_TBL_ELEMENT_FILLER) + ; + p++; + } + + /* + * If there was no match, we will need to save the combining + * mark for later appending. After that, if the next one + * is a non-Starter and not blocked, then, we try once + * again to do composition with the next non-Starter. + * + * If there was no match and this was a Starter, then, + * this is a new start. + * + * If there was a match and a composition done and we have + * more to check on, then, we retrieve a new composition final + * table entry for the composite and then try to do the + * composition again. + */ + + if (match_not_found) { + if (comb_class[i] == U8_COMBINING_CLASS_STARTER) { + i--; + goto SAVE_THE_CHAR; + } + + saved_marks[saved_marks_count++] = i; + } + + if (saved_l == l) { + while (i < last) { + if (blocked(comb_class, i + 1)) + saved_marks[saved_marks_count++] = ++i; + else + break; + } + if (i < last) { + p = saved_p; + goto TRY_THE_NEXT_MARK; + } + } else if (i < last) { + p = find_composition_start(uv, t + saved_l, + l - saved_l); + if (p != NULL) { + saved_p = p; + goto TRY_THE_NEXT_MARK; + } + } + + /* + * There is no more composition possible. 
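+ *
+ * (Example of the blocked() rule used in the loop above: in
+ * e + COMBINING ACUTE + COMBINING ACUTE the second acute is blocked,
+ * since an earlier mark has a combining class >= its own (230 >= 230);
+ * in e + COMBINING CEDILLA + COMBINING ACUTE (classes 202 then 230)
+ * the acute is not blocked and may still compose with the starter.)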
+ * + * If there was no composition what so ever then we copy + * over the original Starter and then append any non-Starters + * remaining at the target string sequentially after that. + */ + + if (saved_l == l) { + p = s + start[saved_i]; + size = disp[saved_i]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + + for (k = 0; k < saved_marks_count; k++) { + p = s + start[saved_marks[k]]; + size = disp[saved_marks[k]]; + for (j = 0; j < size; j++) + t[l++] = *p++; + } + } + + /* + * If the last character is a Starter and if we have a character + * (possibly another Starter) that can be turned into a composite, + * we do so and we do so until there is no more of composition + * possible. + */ + if (comb_class[last] == U8_COMBINING_CLASS_STARTER) { + p = *os; + saved_l = l - disp[last]; + + while (p < oslast) { + size = u8_number_of_bytes[*p]; + if (size <= 1 || (p + size) > oslast) + break; + + saved_p = p; + + for (i = 0; i < size; i++) + tc[i] = *p++; + + q = find_composition_start(uv, t + saved_l, + l - saved_l); + if (q == NULL) { + p = saved_p; + break; + } + + match_not_found = B_TRUE; + + for (C = *q++; C > 0; C--) { + for (k = 0; k < size; q++, k++) + if (*q != tc[k]) + break; + + if (k >= size && *q == U8_TBL_ELEMENT_FILLER) { + match_not_found = B_FALSE; + + l = saved_l; + + while (*++q != U8_TBL_ELEMENT_FILLER) { + /* + * This is practically + * impossible but we don't + * want to take any chances. + */ + if (l >= + U8_STREAM_SAFE_TEXT_MAX) { + p = saved_p; + goto SAFE_RETURN; + } + t[l++] = *q; + } + + break; + } + + if (*q != U8_TBL_ELEMENT_FILLER) + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + while (*++q != U8_TBL_ELEMENT_FILLER) + ; + q++; + } + + if (match_not_found) { + p = saved_p; + break; + } + } +SAFE_RETURN: + *os = p; + } + + /* + * Now we copy over the temporary string to the target string. + * Since composition always reduces the number of characters or + * the number of characters stay, we don't need to worry about + * the buffer overflow here. + */ + for (i = 0; i < l; i++) + s[i] = t[i]; + s[l] = '\0'; + + return (l); +} + +/* + * The collect_a_seq() function checks on the given string s, collect + * a sequence of characters at u8s, and return the sequence. While it collects + * a sequence, it also applies case conversion, canonical or compatibility + * decomposition, canonical decomposition, or some or all of them and + * in that order. + * + * The collected sequence cannot be bigger than 32 characters since if + * it is having more than 31 characters, the sequence will be terminated + * with a U+034F COMBINING GRAPHEME JOINER (CGJ) character and turned into + * a Stream-Safe Text. The collected sequence is always terminated with + * a null byte and the return value is the byte length of the sequence + * including 0. The return value does not include the terminating + * null byte. 
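+ *
+ * A minimal usage sketch (hypothetical caller, not part of this file;
+ * 'str' and 'strlast' are assumed names) that walks a string one
+ * NFC-normalized sequence at a time:
+ *
+ *	uchar_t seq[U8_STREAM_SAFE_TEXT_MAX + 1];
+ *	uchar_t *p = str;
+ *	u8_normalization_states_t st;
+ *	int err = 0;
+ *	size_t len;
+ *
+ *	while (p < strlast) {
+ *		st = U8_STATE_START;
+ *		len = collect_a_seq(U8_UNICODE_LATEST, seq, &p, strlast,
+ *		    B_FALSE, B_FALSE, B_TRUE, B_FALSE, B_TRUE,
+ *		    &err, &st);
+ *		...consume the len bytes now at seq...
+ *	}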
+ */ +static size_t +collect_a_seq(size_t uv, uchar_t *u8s, uchar_t **source, uchar_t *slast, + boolean_t is_it_toupper, boolean_t is_it_tolower, + boolean_t canonical_decomposition, boolean_t compatibility_decomposition, + boolean_t canonical_composition, + int *errnum, u8_normalization_states_t *state) +{ + uchar_t *s; + int sz; + int saved_sz; + size_t i; + size_t j; + size_t k; + size_t l; + uchar_t comb_class[U8_MAX_CHARS_A_SEQ]; + uchar_t disp[U8_MAX_CHARS_A_SEQ]; + uchar_t start[U8_MAX_CHARS_A_SEQ]; + uchar_t u8t[U8_MB_CUR_MAX] = { '\0' }; + uchar_t uts[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t tc; + size_t last; + size_t saved_last; + uint32_t u1; + + /* + * Save the source string pointer which we will return a changed + * pointer if we do processing. + */ + s = *source; + + /* + * The following is a fallback for just in case callers are not + * checking the string boundaries before the calling. + */ + if (s >= slast) { + u8s[0] = '\0'; + + return (0); + } + + /* + * As the first thing, let's collect a character and do case + * conversion if necessary. + */ + + sz = u8_number_of_bytes[*s]; + + if (sz < 0) { + *errnum = EILSEQ; + + u8s[0] = *s++; + u8s[1] = '\0'; + + *source = s; + + return (1); + } + + if (sz == 1) { + if (is_it_toupper) + u8s[0] = U8_ASCII_TOUPPER(*s); + else if (is_it_tolower) + u8s[0] = U8_ASCII_TOLOWER(*s); + else + u8s[0] = *s; + s++; + u8s[1] = '\0'; + } else if ((s + sz) > slast) { + *errnum = EINVAL; + + for (i = 0; s < slast; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + + *source = s; + + return (i); + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(uv, u8s, s, sz, is_it_toupper); + s += sz; + sz = i; + } else { + for (i = 0; i < sz; ) + u8s[i++] = *s++; + u8s[i] = '\0'; + } + } + + /* + * And then canonical/compatibility decomposition followed by + * an optional canonical composition. Please be noted that + * canonical composition is done only when a decomposition is + * done. + */ + if (canonical_decomposition || compatibility_decomposition) { + if (sz == 1) { + *state = U8_STATE_START; + + saved_sz = 1; + + comb_class[0] = 0; + start[0] = 0; + disp[0] = 1; + + last = 1; + } else { + saved_sz = do_decomp(uv, u8s, u8s, sz, + canonical_decomposition, state); + + last = 0; + + for (i = 0; i < saved_sz; ) { + sz = u8_number_of_bytes[u8s[i]]; + + comb_class[last] = combining_class(uv, + u8s + i, sz); + start[last] = i; + disp[last] = sz; + + last++; + i += sz; + } + + /* + * Decomposition yields various Hangul related + * states but not on combining marks. We need to + * find out at here by checking on the last + * character. + */ + if (*state == U8_STATE_START) { + if (comb_class[last - 1]) + *state = U8_STATE_COMBINING_MARK; + } + } + + saved_last = last; + + while (s < slast) { + sz = u8_number_of_bytes[*s]; + + /* + * If this is an illegal character, an incomplete + * character, or an 7-bit ASCII Starter character, + * then we have collected a sequence; break and let + * the next call deal with the two cases. + * + * Note that this is okay only if you are using this + * function with a fixed length string, not on + * a buffer with multiple calls of one chunk at a time. + */ + if (sz <= 1) { + break; + } else if ((s + sz) > slast) { + break; + } else { + /* + * If the previous character was a Hangul Jamo + * and this character is a Hangul Jamo that + * can be conjoined, we collect the Jamo. 
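+				 *
+				 * For example, when the state is
+				 * U8_STATE_HANGUL_LV because an L+V
+				 * pair was just collected, a following
+				 * trailing consonant Jamo such as
+				 * U+11A8 is collected into the same
+				 * sequence and the state advances to
+				 * U8_STATE_HANGUL_LVT.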
+ */ + if (*s == U8_HANGUL_JAMO_1ST_BYTE) { + U8_PUT_3BYTES_INTO_UTF32(u1, + *s, *(s + 1), *(s + 2)); + + if (U8_HANGUL_COMPOSABLE_L_V(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LV; + goto COLLECT_A_HANGUL; + } + + if (U8_HANGUL_COMPOSABLE_LV_T(*state, + u1)) { + i = 0; + *state = U8_STATE_HANGUL_LVT; + goto COLLECT_A_HANGUL; + } + } + + /* + * Regardless of whatever it was, if this is + * a Starter, we don't collect the character + * since that's a new start and we will deal + * with it at the next time. + */ + i = combining_class(uv, s, sz); + if (i == U8_COMBINING_CLASS_STARTER) + break; + + /* + * We know the current character is a combining + * mark. If the previous character wasn't + * a Starter (not Hangul) or a combining mark, + * then, we don't collect this combining mark. + */ + if (*state != U8_STATE_START && + *state != U8_STATE_COMBINING_MARK) + break; + + *state = U8_STATE_COMBINING_MARK; +COLLECT_A_HANGUL: + /* + * If we collected a Starter and combining + * marks up to 30, i.e., total 31 characters, + * then, we terminate this degenerately long + * combining sequence with a U+034F COMBINING + * GRAPHEME JOINER (CGJ) which is 0xCD 0x8F in + * UTF-8 and turn this into a Stream-Safe + * Text. This will be extremely rare but + * possible. + * + * The following will also guarantee that + * we are not writing more than 32 characters + * plus a NULL at u8s[]. + */ + if (last >= U8_UPPER_LIMIT_IN_A_SEQ) { +TURN_STREAM_SAFE: + *state = U8_STATE_START; + comb_class[last] = 0; + start[last] = saved_sz; + disp[last] = 2; + last++; + + u8s[saved_sz++] = 0xCD; + u8s[saved_sz++] = 0x8F; + + break; + } + + /* + * Some combining marks also do decompose into + * another combining mark or marks. + */ + if (*state == U8_STATE_COMBINING_MARK) { + k = last; + l = sz; + i = do_decomp(uv, uts, s, sz, + canonical_decomposition, state); + for (j = 0; j < i; ) { + sz = u8_number_of_bytes[uts[j]]; + + comb_class[last] = + combining_class(uv, + uts + j, sz); + start[last] = saved_sz + j; + disp[last] = sz; + + last++; + if (last >= + U8_UPPER_LIMIT_IN_A_SEQ) { + last = k; + goto TURN_STREAM_SAFE; + } + j += sz; + } + + *state = U8_STATE_COMBINING_MARK; + sz = i; + s += l; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = uts[i]; + } else { + comb_class[last] = i; + start[last] = saved_sz; + disp[last] = sz; + last++; + + for (i = 0; i < sz; i++) + u8s[saved_sz++] = *s++; + } + + /* + * If this is U+0345 COMBINING GREEK + * YPOGEGRAMMENI (0xCD 0x85 in UTF-8), a.k.a., + * iota subscript, and need to be converted to + * uppercase letter, convert it to U+0399 GREEK + * CAPITAL LETTER IOTA (0xCE 0x99 in UTF-8), + * i.e., convert to capital adscript form as + * specified in the Unicode standard. + * + * This is the only special case of (ambiguous) + * case conversion at combining marks and + * probably the standard will never have + * anything similar like this in future. + */ + if (is_it_toupper && sz >= 2 && + u8s[saved_sz - 2] == 0xCD && + u8s[saved_sz - 1] == 0x85) { + u8s[saved_sz - 2] = 0xCE; + u8s[saved_sz - 1] = 0x99; + } + } + } + + /* + * Let's try to ensure a canonical ordering for the collected + * combining marks. We do this only if we have collected + * at least one more non-Starter. (The decomposition mapping + * data tables have fully (and recursively) expanded and + * canonically ordered decompositions.) + * + * The U8_SWAP_COMB_MARKS() convenience macro has some + * assumptions and we are meeting the assumptions. 
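+		 *
+		 * For example, a base letter followed by U+0301
+		 * COMBINING ACUTE ACCENT (combining class 230) and then
+		 * U+0323 COMBINING DOT BELOW (combining class 220) is
+		 * reordered so that the class 220 mark comes first,
+		 * which is the canonical order.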
+ */ + last--; + if (last >= saved_last) { + for (i = 0; i < last; i++) + for (j = last; j > i; j--) + if (comb_class[j] && + comb_class[j - 1] > comb_class[j]) { + U8_SWAP_COMB_MARKS(j - 1, j); + } + } + + *source = s; + + if (! canonical_composition) { + u8s[saved_sz] = '\0'; + return (saved_sz); + } + + /* + * Now do the canonical composition. Note that we do this + * only after a canonical or compatibility decomposition to + * finish up NFC or NFKC. + */ + sz = do_composition(uv, u8s, comb_class, start, disp, last, + &s, slast); + } + + *source = s; + + return ((size_t)sz); +} + +/* + * The do_norm_compare() function does string comparison based on Unicode + * simple case mappings and Unicode Normalization definitions. + * + * It does so by collecting a sequence of character at a time and comparing + * the collected sequences from the strings. + * + * The meanings on the return values are the same as the usual strcmp(). + */ +static int +do_norm_compare(size_t uv, uchar_t *s1, uchar_t *s2, size_t n1, size_t n2, + int flag, int *errnum) +{ + int result; + size_t sz1; + size_t sz2; + uchar_t u8s1[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t u8s2[U8_STREAM_SAFE_TEXT_MAX + 1]; + uchar_t *s1last; + uchar_t *s2last; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + u8_normalization_states_t state; + + s1last = s1 + n1; + s2last = s2 + n2; + + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (s1 < s1last && s2 < s2last) { + /* + * If the current character is a 7-bit ASCII and the last + * character, or, if the current character and the next + * character are both some 7-bit ASCII characters then + * we treat the current character as a sequence. + * + * In any other cases, we need to call collect_a_seq(). + */ + + if (U8_ISASCII(*s1) && ((s1 + 1) >= s1last || + ((s1 + 1) < s1last && U8_ISASCII(*(s1 + 1))))) { + if (is_it_toupper) + u8s1[0] = U8_ASCII_TOUPPER(*s1); + else if (is_it_tolower) + u8s1[0] = U8_ASCII_TOLOWER(*s1); + else + u8s1[0] = *s1; + u8s1[1] = '\0'; + sz1 = 1; + s1++; + } else { + state = U8_STATE_START; + sz1 = collect_a_seq(uv, u8s1, &s1, s1last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errnum, &state); + } + + if (U8_ISASCII(*s2) && ((s2 + 1) >= s2last || + ((s2 + 1) < s2last && U8_ISASCII(*(s2 + 1))))) { + if (is_it_toupper) + u8s2[0] = U8_ASCII_TOUPPER(*s2); + else if (is_it_tolower) + u8s2[0] = U8_ASCII_TOLOWER(*s2); + else + u8s2[0] = *s2; + u8s2[1] = '\0'; + sz2 = 1; + s2++; + } else { + state = U8_STATE_START; + sz2 = collect_a_seq(uv, u8s2, &s2, s2last, + is_it_toupper, is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, errnum, &state); + } + + /* + * Now compare the two characters. If they are the same, + * we move on to the next character sequences. + */ + if (sz1 == 1 && sz2 == 1) { + if (*u8s1 > *u8s2) + return (1); + if (*u8s1 < *u8s2) + return (-1); + } else { + result = strcmp((const char *)u8s1, (const char *)u8s2); + if (result != 0) + return (result); + } + } + + /* + * We compared until the end of either or both strings. + * + * If we reached to or went over the ends for the both, that means + * they are the same. 
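+	 * For example, when NFC is requested, "\xc3\xa9" (U+00E9) on
+	 * one side and "e\xcc\x81" (U+0065 U+0301) on the other are
+	 * both fully consumed and compare equal, so 0 is returned.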
+ * + * If we reached only one end, that means the other string has + * something which then can be used to determine the return value. + */ + if (s1 >= s1last) { + if (s2 >= s2last) + return (0); + return (-1); + } + return (1); +} + +/* + * The u8_strcmp() function compares two UTF-8 strings quite similar to + * the strcmp(). For the comparison, however, Unicode Normalization specific + * equivalency and Unicode simple case conversion mappings based equivalency + * can be requested and checked against. + */ +int +u8_strcmp(const char *s1, const char *s2, size_t n, int flag, size_t uv, + int *errnum) +{ + int f; + size_t n1; + size_t n2; + + *errnum = 0; + + /* + * Check on the requested Unicode version, case conversion, and + * normalization flag values. + */ + + if (uv > U8_UNICODE_LATEST) { + *errnum = ERANGE; + uv = U8_UNICODE_LATEST; + } + + if (flag == 0) { + flag = U8_STRCMP_CS; + } else { + f = flag & (U8_STRCMP_CS | U8_STRCMP_CI_UPPER | + U8_STRCMP_CI_LOWER); + if (f == 0) { + flag |= U8_STRCMP_CS; + } else if (f != U8_STRCMP_CS && f != U8_STRCMP_CI_UPPER && + f != U8_STRCMP_CI_LOWER) { + *errnum = EBADF; + flag = U8_STRCMP_CS; + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_STRCMP_NFD && f != U8_STRCMP_NFC && + f != U8_STRCMP_NFKD && f != U8_STRCMP_NFKC) { + *errnum = EBADF; + flag = U8_STRCMP_CS; + } + } + + if (flag == U8_STRCMP_CS) { + return (n == 0 ? strcmp(s1, s2) : strncmp(s1, s2, n)); + } + + n1 = strlen(s1); + n2 = strlen(s2); + if (n != 0) { + if (n < n1) + n1 = n; + if (n < n2) + n2 = n; + } + + /* + * Simple case conversion can be done much faster and so we do + * them separately here. + */ + if (flag == U8_STRCMP_CI_UPPER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_TRUE, errnum)); + } else if (flag == U8_STRCMP_CI_LOWER) { + return (do_case_compare(uv, (uchar_t *)s1, (uchar_t *)s2, + n1, n2, B_FALSE, errnum)); + } + + return (do_norm_compare(uv, (uchar_t *)s1, (uchar_t *)s2, n1, n2, + flag, errnum)); +} + +size_t +u8_textprep_str(char *inarray, size_t *inlen, char *outarray, size_t *outlen, + int flag, size_t unicode_version, int *errnum) +{ + int f; + int sz; + uchar_t *ib; + uchar_t *ibtail; + uchar_t *ob; + uchar_t *obtail; + boolean_t do_not_ignore_null; + boolean_t do_not_ignore_invalid; + boolean_t is_it_toupper; + boolean_t is_it_tolower; + boolean_t canonical_decomposition; + boolean_t compatibility_decomposition; + boolean_t canonical_composition; + size_t ret_val; + size_t i; + size_t j; + uchar_t u8s[U8_STREAM_SAFE_TEXT_MAX + 1]; + u8_normalization_states_t state; + + if (unicode_version > U8_UNICODE_LATEST) { + *errnum = ERANGE; + return ((size_t)-1); + } + + f = flag & (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER); + if (f == (U8_TEXTPREP_TOUPPER | U8_TEXTPREP_TOLOWER)) { + *errnum = EBADF; + return ((size_t)-1); + } + + f = flag & (U8_CANON_DECOMP | U8_COMPAT_DECOMP | U8_CANON_COMP); + if (f && f != U8_TEXTPREP_NFD && f != U8_TEXTPREP_NFC && + f != U8_TEXTPREP_NFKD && f != U8_TEXTPREP_NFKC) { + *errnum = EBADF; + return ((size_t)-1); + } + + if (inarray == NULL || *inlen == 0) + return (0); + + if (outarray == NULL) { + *errnum = E2BIG; + return ((size_t)-1); + } + + ib = (uchar_t *)inarray; + ob = (uchar_t *)outarray; + ibtail = ib + *inlen; + obtail = ob + *outlen; + + do_not_ignore_null = !(flag & U8_TEXTPREP_IGNORE_NULL); + do_not_ignore_invalid = !(flag & U8_TEXTPREP_IGNORE_INVALID); + is_it_toupper = flag & U8_TEXTPREP_TOUPPER; + is_it_tolower = flag & U8_TEXTPREP_TOLOWER; + 
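+	/*
+	 * For reference, a typical call of this function (hypothetical
+	 * caller, not part of this file; 'in', 'out', and the error
+	 * handling are assumed names) looks like:
+	 *
+	 *	char out[256];
+	 *	size_t inlen = strlen(in), outlen = sizeof (out);
+	 *	int err;
+	 *
+	 *	if (u8_textprep_str(in, &inlen, out, &outlen,
+	 *	    U8_TEXTPREP_TOUPPER | U8_TEXTPREP_NFC,
+	 *	    U8_UNICODE_LATEST, &err) == (size_t)-1)
+	 *		...handle err...
+	 *
+	 * On return, *inlen and *outlen hold the numbers of unprocessed
+	 * input bytes and unused output bytes, respectively.
+	 */
+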
+ ret_val = 0; + + /* + * If we don't have a normalization flag set, we do the simple case + * conversion based text preparation separately below. Text + * preparation involving Normalization will be done in the false task + * block, again, separately since it will take much more time and + * resource than doing simple case conversions. + */ + if (f == 0) { + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + sz = u8_number_of_bytes[*ib]; + + if (sz < 0) { + if (do_not_ignore_invalid) { + *errnum = EILSEQ; + ret_val = (size_t)-1; + break; + } + + sz = 1; + ret_val++; + } + + if (sz == 1) { + if (ob >= obtail) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else if ((ib + sz) > ibtail) { + if (do_not_ignore_invalid) { + *errnum = EINVAL; + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < (ibtail - ib)) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + /* + * We treat the remaining incomplete character + * bytes as a character. + */ + ret_val++; + + while (ib < ibtail) + *ob++ = *ib++; + } else { + if (is_it_toupper || is_it_tolower) { + i = do_case_conv(unicode_version, u8s, + ib, sz, is_it_toupper); + + if ((obtail - ob) < i) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + ib += sz; + + for (sz = 0; sz < i; sz++) + *ob++ = u8s[sz]; + } else { + if ((obtail - ob) < sz) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < sz; i++) + *ob++ = *ib++; + } + } + } + } else { + canonical_decomposition = flag & U8_CANON_DECOMP; + compatibility_decomposition = flag & U8_COMPAT_DECOMP; + canonical_composition = flag & U8_CANON_COMP; + + while (ib < ibtail) { + if (*ib == '\0' && do_not_ignore_null) + break; + + /* + * If the current character is a 7-bit ASCII + * character and it is the last character, or, + * if the current character is a 7-bit ASCII + * character and the next character is also a 7-bit + * ASCII character, then, we copy over this + * character without going through collect_a_seq(). + * + * In any other cases, we need to look further with + * the collect_a_seq() function. 
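+			 *
+			 * For example, the two bytes 'e' 0xcc (an ASCII
+			 * letter followed by the first byte of U+0301
+			 * COMBINING ACUTE ACCENT) must go through
+			 * collect_a_seq() so that, when NFC is
+			 * requested, the pair can be composed into
+			 * U+00E9.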
+ */ + if (U8_ISASCII(*ib) && ((ib + 1) >= ibtail || + ((ib + 1) < ibtail && U8_ISASCII(*(ib + 1))))) { + if (ob >= obtail) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + if (is_it_toupper) + *ob = U8_ASCII_TOUPPER(*ib); + else if (is_it_tolower) + *ob = U8_ASCII_TOLOWER(*ib); + else + *ob = *ib; + ib++; + ob++; + } else { + *errnum = 0; + state = U8_STATE_START; + + j = collect_a_seq(unicode_version, u8s, + &ib, ibtail, + is_it_toupper, + is_it_tolower, + canonical_decomposition, + compatibility_decomposition, + canonical_composition, + errnum, &state); + + if (*errnum && do_not_ignore_invalid) { + ret_val = (size_t)-1; + break; + } + + if ((obtail - ob) < j) { + *errnum = E2BIG; + ret_val = (size_t)-1; + break; + } + + for (i = 0; i < j; i++) + *ob++ = u8s[i]; + } + } + } + + *inlen = ibtail - ib; + *outlen = obtail - ob; + + return (ret_val); +} + +#if defined(_KERNEL) +static int __init +unicode_init(void) +{ + return (0); +} + +static void __exit +unicode_fini(void) +{ +} + +module_init(unicode_init); +module_exit(unicode_fini); +#endif + +ZFS_MODULE_DESCRIPTION("Unicode implementation"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); + +EXPORT_SYMBOL(u8_validate); +EXPORT_SYMBOL(u8_strcmp); +EXPORT_SYMBOL(u8_textprep_str); diff --git a/module/unicode/uconv.c b/module/unicode/uconv.c new file mode 100644 index 000000000000..fe84979d08b2 --- /dev/null +++ b/module/unicode/uconv.c @@ -0,0 +1,863 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + + + +/* + * Unicode encoding conversion functions among UTF-8, UTF-16, and UTF-32. + * (PSARC/2005/446, PSARC/2007/038, PSARC/2007/517) + * Man pages: uconv_u16tou32(9F), uconv_u16tou8(9F), uconv_u32tou16(9F), + * uconv_u32tou8(9F), uconv_u8tou16(9F), and uconv_u8tou32(9F). See also + * the section 3C man pages. + * Interface stability: Committed + */ + +#include +#ifdef _KERNEL +#include +#include +#include +#include +#include +#else +#include +#endif /* _KERNEL */ +#include +#include + + +/* + * The max and min values of high and low surrogate pairs of UTF-16, + * UTF-16 bit shift value, bit mask, and starting value outside of BMP. + */ +#define UCONV_U16_HI_MIN (0xd800U) +#define UCONV_U16_HI_MAX (0xdbffU) +#define UCONV_U16_LO_MIN (0xdc00U) +#define UCONV_U16_LO_MAX (0xdfffU) +#define UCONV_U16_BIT_SHIFT (0x0400U) +#define UCONV_U16_BIT_MASK (0x0fffffU) +#define UCONV_U16_START (0x010000U) + +/* The maximum value of Unicode coding space and ASCII coding space. 
*/ +#define UCONV_UNICODE_MAX (0x10ffffU) +#define UCONV_ASCII_MAX (0x7fU) + +/* The mask values for input and output endians. */ +#define UCONV_IN_ENDIAN_MASKS (UCONV_IN_BIG_ENDIAN | UCONV_IN_LITTLE_ENDIAN) +#define UCONV_OUT_ENDIAN_MASKS (UCONV_OUT_BIG_ENDIAN | UCONV_OUT_LITTLE_ENDIAN) + +/* Native and reversed endian macros. */ +#ifdef _ZFS_BIG_ENDIAN +#define UCONV_IN_NAT_ENDIAN UCONV_IN_BIG_ENDIAN +#define UCONV_IN_REV_ENDIAN UCONV_IN_LITTLE_ENDIAN +#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_BIG_ENDIAN +#define UCONV_OUT_REV_ENDIAN UCONV_OUT_LITTLE_ENDIAN +#else +#define UCONV_IN_NAT_ENDIAN UCONV_IN_LITTLE_ENDIAN +#define UCONV_IN_REV_ENDIAN UCONV_IN_BIG_ENDIAN +#define UCONV_OUT_NAT_ENDIAN UCONV_OUT_LITTLE_ENDIAN +#define UCONV_OUT_REV_ENDIAN UCONV_OUT_BIG_ENDIAN +#endif /* _BIG_ENDIAN */ + +/* The Byte Order Mark (BOM) character in normal and reversed byte orderings. */ +#define UCONV_BOM_NORMAL (0xfeffU) +#define UCONV_BOM_SWAPPED (0xfffeU) +#define UCONV_BOM_SWAPPED_32 (0xfffe0000U) + +/* UTF-32 boundaries based on UTF-8 character byte lengths. */ +#define UCONV_U8_ONE_BYTE (0x7fU) +#define UCONV_U8_TWO_BYTES (0x7ffU) +#define UCONV_U8_THREE_BYTES (0xffffU) +#define UCONV_U8_FOUR_BYTES (0x10ffffU) + +/* The common minimum and maximum values at the UTF-8 character bytes. */ +#define UCONV_U8_BYTE_MIN (0x80U) +#define UCONV_U8_BYTE_MAX (0xbfU) + +/* + * The following "6" and "0x3f" came from "10xx xxxx" bit representation of + * UTF-8 character bytes. + */ +#define UCONV_U8_BIT_SHIFT 6 +#define UCONV_U8_BIT_MASK 0x3f + +/* + * The following vector shows remaining bytes in a UTF-8 character. + * Index will be the first byte of the character. + */ +static const uchar_t remaining_bytes_tbl[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + +/* C0 C1 C2 C3 C4 C5 C6 C7 C8 C9 CA CB CC CD CE CF */ + 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* D0 D1 D2 D3 D4 D5 D6 D7 D8 D9 DA DB DC DD DE DF */ + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + +/* E0 E1 E2 E3 E4 E5 E6 E7 E8 E9 EA EB EC ED EE EF */ + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + +/* F0 F1 F2 F3 F4 F5 F6 F7 F8 F9 FA FB FC FD FE FF */ + 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* + * The following is a vector of bit-masks to get used bits in + * the first byte of a UTF-8 character. Index is remaining bytes at above of + * the character. + */ +#ifdef _KERNEL +const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; +#else +static const uchar_t u8_masks_tbl[6] = { 0x00, 0x1f, 0x0f, 0x07, 0x03, 0x01 }; +#endif /* _KERNEL */ + +/* + * The following two vectors are to provide valid minimum and + * maximum values for the 2'nd byte of a multibyte UTF-8 character for + * better illegal sequence checking. The index value must be the value of + * the first byte of the UTF-8 character. 
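+ *
+ * For example, a lead byte of 0xe0 requires a second byte in
+ * [0xa0, 0xbf] (rejecting overlong encodings), 0xed requires
+ * [0x80, 0x9f] (rejecting UTF-16 surrogate code points), and 0xf4
+ * requires [0x80, 0x8f] (rejecting anything above U+10FFFF).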
+ */ +static const uchar_t valid_min_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* C8 C9 CA CB CC CD CE CF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* D8 D9 DA DB DC DD DE DF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xa0, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* E8 E9 EA EB EC ED EE EF */ + 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, + +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0x90, 0x80, 0x80, 0x80, 0x80, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0 +}; + +static const uchar_t valid_max_2nd_byte[0x100] = { + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, + +/* C0 C1 C2 C3 C4 C5 C6 C7 */ + 0, 0, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* C8 C9 CA CB CC CD CE CF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* D0 D1 D2 D3 D4 D5 D6 D7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* D8 D9 DA DB DC DD DE DF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* E0 E1 E2 E3 E4 E5 E6 E7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, + +/* E8 E9 EA EB EC ED EE EF */ + 0xbf, 0xbf, 0xbf, 0xbf, 0xbf, 0x9f, 0xbf, 0xbf, + +/* F0 F1 F2 F3 F4 F5 F6 F7 */ + 0xbf, 0xbf, 0xbf, 0xbf, 0x8f, 0, 0, 0, + + 0, 0, 0, 0, 0, 0, 0, 0 +}; + + +static int +check_endian(int flag, int *in, int *out) +{ + *in = flag & UCONV_IN_ENDIAN_MASKS; + + /* You cannot have both. */ + if (*in == UCONV_IN_ENDIAN_MASKS) + return (EBADF); + + if (*in == 0) + *in = UCONV_IN_NAT_ENDIAN; + + *out = flag & UCONV_OUT_ENDIAN_MASKS; + + /* You cannot have both. 
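+	 * UCONV_OUT_BIG_ENDIAN and UCONV_OUT_LITTLE_ENDIAN are
+	 * mutually exclusive, just like the input flags checked above.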
*/ + if (*out == UCONV_OUT_ENDIAN_MASKS) + return (EBADF); + + if (*out == 0) + *out = UCONV_OUT_NAT_ENDIAN; + + return (0); +} + +static boolean_t +check_bom16(const uint16_t *u16s, size_t u16l, int *in) +{ + if (u16l > 0) { + if (*u16s == UCONV_BOM_NORMAL) { + *in = UCONV_IN_NAT_ENDIAN; + return (B_TRUE); + } + if (*u16s == UCONV_BOM_SWAPPED) { + *in = UCONV_IN_REV_ENDIAN; + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static boolean_t +check_bom32(const uint32_t *u32s, size_t u32l, int *in) +{ + if (u32l > 0) { + if (*u32s == UCONV_BOM_NORMAL) { + *in = UCONV_IN_NAT_ENDIAN; + return (B_TRUE); + } + if (*u32s == UCONV_BOM_SWAPPED_32) { + *in = UCONV_IN_REV_ENDIAN; + return (B_TRUE); + } + } + + return (B_FALSE); +} + +int +uconv_u16tou32(const uint16_t *u16s, size_t *utf16len, + uint32_t *u32s, size_t *utf32len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u32l; + uint32_t hi; + uint32_t lo; + boolean_t do_not_ignore_null; + + /* + * Do preliminary validity checks on parameters and collect info on + * endians. + */ + if (u16s == NULL || utf16len == NULL) + return (EILSEQ); + + if (u32s == NULL || utf32len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + /* + * Initialize input and output parameter buffer indices and + * temporary variables. + */ + u16l = u32l = 0; + hi = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + /* + * Check on the BOM at the beginning of the input buffer if required + * and if there is indeed one, process it. + */ + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom16(u16s, *utf16len, &inendian)) + u16l++; + + /* + * Reset inendian and outendian so that after this point, those can be + * used as condition values. + */ + inendian &= UCONV_IN_NAT_ENDIAN; + outendian &= UCONV_OUT_NAT_ENDIAN; + + /* + * If there is something in the input buffer and if necessary and + * requested, save the BOM at the output buffer. + */ + if (*utf16len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED_32; + + /* + * Do conversion; if encounter a surrogate pair, assemble high and + * low pair values to form a UTF-32 character. If a half of a pair + * exists alone, then, either it is an illegal (EILSEQ) or + * invalid (EINVAL) value. + */ + for (; u16l < *utf16len; u16l++) { + if (u16s[u16l] == 0 && do_not_ignore_null) + break; + + lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); + + if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { + if (hi) + return (EILSEQ); + hi = lo; + continue; + } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { + if (! hi) + return (EILSEQ); + lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + + lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) + + UCONV_U16_START; + hi = 0; + } else if (hi) { + return (EILSEQ); + } + + if (u32l >= *utf32len) + return (E2BIG); + + u32s[u32l++] = (outendian) ? lo : BSWAP_32(lo); + } + + /* + * If high half didn't see low half, then, it's most likely the input + * parameter is incomplete. + */ + if (hi) + return (EINVAL); + + /* + * Save the number of consumed and saved characters. They do not + * include terminating NULL character (U+0000) at the end of + * the input buffer (even when UCONV_IGNORE_NULL isn't specified and + * the input buffer length is big enough to include the terminating + * NULL character). 
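+	 *
+	 * For illustration (hypothetical caller, assuming native byte
+	 * order): converting the surrogate pair 0xd834 0xdd1e,
+	 *
+	 *	uint16_t in[2] = { 0xd834, 0xdd1e };
+	 *	uint32_t out[1];
+	 *	size_t inlen = 2, outlen = 1;
+	 *	int err = uconv_u16tou32(in, &inlen, out, &outlen, 0);
+	 *
+	 * succeeds with out[0] == 0x1d11e (U+1D11E MUSICAL SYMBOL
+	 * G CLEF), inlen == 2, and outlen == 1.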
+ */ + *utf16len = u16l; + *utf32len = u32l; + + return (0); +} + +int +uconv_u16tou8(const uint16_t *u16s, size_t *utf16len, + uchar_t *u8s, size_t *utf8len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u8l; + uint32_t hi; + uint32_t lo; + boolean_t do_not_ignore_null; + + if (u16s == NULL || utf16len == NULL) + return (EILSEQ); + + if (u8s == NULL || utf8len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u16l = u8l = 0; + hi = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom16(u16s, *utf16len, &inendian)) + u16l++; + + inendian &= UCONV_IN_NAT_ENDIAN; + + for (; u16l < *utf16len; u16l++) { + if (u16s[u16l] == 0 && do_not_ignore_null) + break; + + lo = (uint32_t)((inendian) ? u16s[u16l] : BSWAP_16(u16s[u16l])); + + if (lo >= UCONV_U16_HI_MIN && lo <= UCONV_U16_HI_MAX) { + if (hi) + return (EILSEQ); + hi = lo; + continue; + } else if (lo >= UCONV_U16_LO_MIN && lo <= UCONV_U16_LO_MAX) { + if (! hi) + return (EILSEQ); + lo = (((hi - UCONV_U16_HI_MIN) * UCONV_U16_BIT_SHIFT + + lo - UCONV_U16_LO_MIN) & UCONV_U16_BIT_MASK) + + UCONV_U16_START; + hi = 0; + } else if (hi) { + return (EILSEQ); + } + + /* + * Now we convert a UTF-32 character into a UTF-8 character. + * Unicode coding space is between U+0000 and U+10FFFF; + * anything bigger is an illegal character. + */ + if (lo <= UCONV_U8_ONE_BYTE) { + if (u8l >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)lo; + } else if (lo <= UCONV_U8_TWO_BYTES) { + if ((u8l + 1) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); + } else if (lo <= UCONV_U8_THREE_BYTES) { + if ((u8l + 2) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); + } else if (lo <= UCONV_U8_FOUR_BYTES) { + if ((u8l + 3) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); + } else { + return (EILSEQ); + } + } + + if (hi) + return (EINVAL); + + *utf16len = u16l; + *utf8len = u8l; + + return (0); +} + +int +uconv_u32tou16(const uint32_t *u32s, size_t *utf32len, + uint16_t *u16s, size_t *utf16len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u32l; + uint32_t hi; + uint32_t lo; + boolean_t do_not_ignore_null; + + if (u32s == NULL || utf32len == NULL) + return (EILSEQ); + + if (u16s == NULL || utf16len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u16l = u32l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom32(u32s, *utf32len, &inendian)) + u32l++; + + inendian &= UCONV_IN_NAT_ENDIAN; + outendian &= UCONV_OUT_NAT_ENDIAN; + + if (*utf32len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED; + + for (; u32l < *utf32len; u32l++) { + if (u32s[u32l] == 0 && do_not_ignore_null) + break; + + hi = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); + + /* + * Anything bigger than the Unicode coding space, i.e., + * Unicode scalar value bigger than U+10FFFF, is an illegal + * character. 
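+		 * For example, an input word of 0x00110000 causes the
+		 * whole conversion to fail with EILSEQ.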
+ */ + if (hi > UCONV_UNICODE_MAX) + return (EILSEQ); + + /* + * Anything bigger than U+FFFF must be converted into + * a surrogate pair in UTF-16. + */ + if (hi >= UCONV_U16_START) { + lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + + UCONV_U16_LO_MIN; + hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + + UCONV_U16_HI_MIN; + + if ((u16l + 1) >= *utf16len) + return (E2BIG); + + if (outendian) { + u16s[u16l++] = (uint16_t)hi; + u16s[u16l++] = (uint16_t)lo; + } else { + u16s[u16l++] = BSWAP_16(((uint16_t)hi)); + u16s[u16l++] = BSWAP_16(((uint16_t)lo)); + } + } else { + if (u16l >= *utf16len) + return (E2BIG); + u16s[u16l++] = (outendian) ? (uint16_t)hi : + BSWAP_16(((uint16_t)hi)); + } + } + + *utf16len = u16l; + *utf32len = u32l; + + return (0); +} + +int +uconv_u32tou8(const uint32_t *u32s, size_t *utf32len, + uchar_t *u8s, size_t *utf8len, int flag) +{ + int inendian; + int outendian; + size_t u32l; + size_t u8l; + uint32_t lo; + boolean_t do_not_ignore_null; + + if (u32s == NULL || utf32len == NULL) + return (EILSEQ); + + if (u8s == NULL || utf8len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u32l = u8l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + if ((flag & UCONV_IN_ACCEPT_BOM) && + check_bom32(u32s, *utf32len, &inendian)) + u32l++; + + inendian &= UCONV_IN_NAT_ENDIAN; + + for (; u32l < *utf32len; u32l++) { + if (u32s[u32l] == 0 && do_not_ignore_null) + break; + + lo = (inendian) ? u32s[u32l] : BSWAP_32(u32s[u32l]); + + if (lo <= UCONV_U8_ONE_BYTE) { + if (u8l >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)lo; + } else if (lo <= UCONV_U8_TWO_BYTES) { + if ((u8l + 1) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xc0 | ((lo & 0x07c0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x003f)); + } else if (lo <= UCONV_U8_THREE_BYTES) { + if ((u8l + 2) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xe0 | ((lo & 0x0f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x00fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x0003f)); + } else if (lo <= UCONV_U8_FOUR_BYTES) { + if ((u8l + 3) >= *utf8len) + return (E2BIG); + u8s[u8l++] = (uchar_t)(0xf0 | ((lo & 0x01c0000) >> 18)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x003f000) >> 12)); + u8s[u8l++] = (uchar_t)(0x80 | ((lo & 0x0000fc0) >> 6)); + u8s[u8l++] = (uchar_t)(0x80 | (lo & 0x000003f)); + } else { + return (EILSEQ); + } + } + + *utf32len = u32l; + *utf8len = u8l; + + return (0); +} + +int +uconv_u8tou16(const uchar_t *u8s, size_t *utf8len, + uint16_t *u16s, size_t *utf16len, int flag) +{ + int inendian; + int outendian; + size_t u16l; + size_t u8l; + uint32_t hi; + uint32_t lo; + int remaining_bytes; + int first_b; + boolean_t do_not_ignore_null; + + if (u8s == NULL || utf8len == NULL) + return (EILSEQ); + + if (u16s == NULL || utf16len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u16l = u8l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + outendian &= UCONV_OUT_NAT_ENDIAN; + + if (*utf8len > 0 && *utf16len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u16s[u16l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED; + + for (; u8l < *utf8len; ) { + if (u8s[u8l] == 0 && do_not_ignore_null) + break; + + /* + * Collect a UTF-8 character and convert it to a UTF-32 + * character. In doing so, we screen out illegally formed + * UTF-8 characters and treat such as illegal characters. 
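+		 * For example, the classic overlong encoding 0xc0 0xaf
+		 * of '/' is rejected here: remaining_bytes_tbl[0xc0] is
+		 * 0, so the lead byte alone already fails.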
+ * The algorithm at below also screens out anything bigger + * than the U+10FFFF. + * + * See Unicode 3.1 UTF-8 Corrigendum and Unicode 3.2 for + * more details on the illegal values of UTF-8 character + * bytes. + */ + hi = (uint32_t)u8s[u8l++]; + + if (hi > UCONV_ASCII_MAX) { + if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) + return (EILSEQ); + + first_b = hi; + hi = hi & u8_masks_tbl[remaining_bytes]; + + for (; remaining_bytes > 0; remaining_bytes--) { + /* + * If we have no more bytes, the current + * UTF-8 character is incomplete. + */ + if (u8l >= *utf8len) + return (EINVAL); + + lo = (uint32_t)u8s[u8l++]; + + if (first_b) { + if (lo < valid_min_2nd_byte[first_b] || + lo > valid_max_2nd_byte[first_b]) + return (EILSEQ); + first_b = 0; + } else if (lo < UCONV_U8_BYTE_MIN || + lo > UCONV_U8_BYTE_MAX) { + return (EILSEQ); + } + hi = (hi << UCONV_U8_BIT_SHIFT) | + (lo & UCONV_U8_BIT_MASK); + } + } + + if (hi >= UCONV_U16_START) { + lo = ((hi - UCONV_U16_START) % UCONV_U16_BIT_SHIFT) + + UCONV_U16_LO_MIN; + hi = ((hi - UCONV_U16_START) / UCONV_U16_BIT_SHIFT) + + UCONV_U16_HI_MIN; + + if ((u16l + 1) >= *utf16len) + return (E2BIG); + + if (outendian) { + u16s[u16l++] = (uint16_t)hi; + u16s[u16l++] = (uint16_t)lo; + } else { + u16s[u16l++] = BSWAP_16(((uint16_t)hi)); + u16s[u16l++] = BSWAP_16(((uint16_t)lo)); + } + } else { + if (u16l >= *utf16len) + return (E2BIG); + + u16s[u16l++] = (outendian) ? (uint16_t)hi : + BSWAP_16(((uint16_t)hi)); + } + } + + *utf16len = u16l; + *utf8len = u8l; + + return (0); +} + +int +uconv_u8tou32(const uchar_t *u8s, size_t *utf8len, + uint32_t *u32s, size_t *utf32len, int flag) +{ + int inendian; + int outendian; + size_t u32l; + size_t u8l; + uint32_t hi; + uint32_t c; + int remaining_bytes; + int first_b; + boolean_t do_not_ignore_null; + + if (u8s == NULL || utf8len == NULL) + return (EILSEQ); + + if (u32s == NULL || utf32len == NULL) + return (E2BIG); + + if (check_endian(flag, &inendian, &outendian) != 0) + return (EBADF); + + u32l = u8l = 0; + do_not_ignore_null = ((flag & UCONV_IGNORE_NULL) == 0); + + outendian &= UCONV_OUT_NAT_ENDIAN; + + if (*utf8len > 0 && *utf32len > 0 && (flag & UCONV_OUT_EMIT_BOM)) + u32s[u32l++] = (outendian) ? UCONV_BOM_NORMAL : + UCONV_BOM_SWAPPED_32; + + for (; u8l < *utf8len; ) { + if (u8s[u8l] == 0 && do_not_ignore_null) + break; + + hi = (uint32_t)u8s[u8l++]; + + if (hi > UCONV_ASCII_MAX) { + if ((remaining_bytes = remaining_bytes_tbl[hi]) == 0) + return (EILSEQ); + + first_b = hi; + hi = hi & u8_masks_tbl[remaining_bytes]; + + for (; remaining_bytes > 0; remaining_bytes--) { + if (u8l >= *utf8len) + return (EINVAL); + + c = (uint32_t)u8s[u8l++]; + + if (first_b) { + if (c < valid_min_2nd_byte[first_b] || + c > valid_max_2nd_byte[first_b]) + return (EILSEQ); + first_b = 0; + } else if (c < UCONV_U8_BYTE_MIN || + c > UCONV_U8_BYTE_MAX) { + return (EILSEQ); + } + hi = (hi << UCONV_U8_BIT_SHIFT) | + (c & UCONV_U8_BIT_MASK); + } + } + + if (u32l >= *utf32len) + return (E2BIG); + + u32s[u32l++] = (outendian) ? 
hi : BSWAP_32(hi); + } + + *utf32len = u32l; + *utf8len = u8l; + + return (0); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(uconv_u16tou32); +EXPORT_SYMBOL(uconv_u16tou8); +EXPORT_SYMBOL(uconv_u32tou16); +EXPORT_SYMBOL(uconv_u32tou8); +EXPORT_SYMBOL(uconv_u8tou16); +EXPORT_SYMBOL(uconv_u8tou32); +#endif diff --git a/module/zcommon/Makefile.in b/module/zcommon/Makefile.in new file mode 100644 index 000000000000..b5cdf4c0c9fe --- /dev/null +++ b/module/zcommon/Makefile.in @@ -0,0 +1,29 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +endif + +MODULE := zcommon + +obj-$(CONFIG_ZFS) := $(MODULE).o + +# Suppress unused-value warnings in sparc64 architecture headers +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value + +$(MODULE)-objs += cityhash.o +$(MODULE)-objs += zfeature_common.o +$(MODULE)-objs += zfs_comutil.o +$(MODULE)-objs += zfs_deleg.o +$(MODULE)-objs += zfs_fletcher.o +$(MODULE)-objs += zfs_fletcher_superscalar.o +$(MODULE)-objs += zfs_fletcher_superscalar4.o +$(MODULE)-objs += zfs_namecheck.o +$(MODULE)-objs += zfs_prop.o +$(MODULE)-objs += zfs_uio.o +$(MODULE)-objs += zpool_prop.o +$(MODULE)-objs += zprop_common.o + +$(MODULE)-$(CONFIG_X86) += zfs_fletcher_intel.o +$(MODULE)-$(CONFIG_X86) += zfs_fletcher_sse.o +$(MODULE)-$(CONFIG_X86) += zfs_fletcher_avx512.o +$(MODULE)-$(CONFIG_ARM64) += zfs_fletcher_aarch64_neon.o diff --git a/module/zcommon/cityhash.c b/module/zcommon/cityhash.c new file mode 100644 index 000000000000..413a96df2cda --- /dev/null +++ b/module/zcommon/cityhash.c @@ -0,0 +1,67 @@ +// Copyright (c) 2011 Google, Inc. +// +// Permission is hereby granted, free of charge, to any person obtaining a copy +// of this software and associated documentation files (the "Software"), to deal +// in the Software without restriction, including without limitation the rights +// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +// copies of the Software, and to permit persons to whom the Software is +// furnished to do so, subject to the following conditions: +// +// The above copyright notice and this permission notice shall be included in +// all copies or substantial portions of the Software. +// +// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +// THE SOFTWARE. + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +#include + +#define HASH_K1 0xb492b66fbe98f273ULL +#define HASH_K2 0x9ae16a3b2f90404fULL + +/* + * Bitwise right rotate. Normally this will compile to a single + * instruction. + */ +static inline uint64_t +rotate(uint64_t val, int shift) +{ + // Avoid shifting by 64: doing so yields an undefined result. + return (shift == 0 ? 
val : (val >> shift) | (val << (64 - shift))); +} + +static inline uint64_t +cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) +{ + uint64_t a = (u ^ v) * mul; + a ^= (a >> 47); + uint64_t b = (v ^ a) * mul; + b ^= (b >> 47); + b *= mul; + return (b); +} + +uint64_t +cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) +{ + uint64_t mul = HASH_K2 + 64; + uint64_t a = w1 * HASH_K1; + uint64_t b = w2; + uint64_t c = w4 * mul; + uint64_t d = w3 * HASH_K2; + return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, + a + rotate(b + HASH_K2, 18) + c, mul)); + +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(cityhash4); +#endif diff --git a/module/zcommon/zfeature_common.c b/module/zcommon/zfeature_common.c new file mode 100644 index 000000000000..97ddacbab9e0 --- /dev/null +++ b/module/zcommon/zfeature_common.c @@ -0,0 +1,602 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +#ifndef _KERNEL +#include +#include +#include +#endif +#include +#include +#include +#include +#include +#include +#include "zfeature_common.h" + +/* + * Set to disable all feature checks while opening pools, allowing pools with + * unsupported features to be opened. Set for testing only. + */ +boolean_t zfeature_checks_disable = B_FALSE; + +zfeature_info_t spa_feature_table[SPA_FEATURES]; + +/* + * Valid characters for feature guids. This list is mainly for aesthetic + * purposes and could be expanded in the future. There are different allowed + * characters in the guids reverse dns portion (before the colon) and its + * short name (after the colon). + */ +static int +valid_char(char c, boolean_t after_colon) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + (after_colon && c == '_') || + (!after_colon && (c == '.' || c == '-'))); +} + +/* + * Every feature guid must contain exactly one colon which separates a reverse + * dns organization name from the feature's "short" name (e.g. + * "com.company:feature_name"). 
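+ *
+ * For example, "org.open-zfs:large_blocks" is a valid guid ('-' and
+ * '.' may appear before the colon, '_' after it), while "large_blocks"
+ * (no colon), "com.a:b:c" (two colons), and "Com.company:feature"
+ * (uppercase letter) are all rejected by zfeature_is_valid_guid().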
+ */ +boolean_t +zfeature_is_valid_guid(const char *name) +{ + int i; + boolean_t has_colon = B_FALSE; + + i = 0; + while (name[i] != '\0') { + char c = name[i++]; + if (c == ':') { + if (has_colon) + return (B_FALSE); + has_colon = B_TRUE; + continue; + } + if (!valid_char(c, has_colon)) + return (B_FALSE); + } + + return (has_colon); +} + +boolean_t +zfeature_is_supported(const char *guid) +{ + if (zfeature_checks_disable) + return (B_TRUE); + + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (strcmp(guid, feature->fi_guid) == 0) + return (B_TRUE); + } + return (B_FALSE); +} + +int +zfeature_lookup_guid(const char *guid, spa_feature_t *res) +{ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (!feature->fi_zfs_mod_supported) + continue; + if (strcmp(guid, feature->fi_guid) == 0) { + if (res != NULL) + *res = i; + return (0); + } + } + + return (ENOENT); +} + +int +zfeature_lookup_name(const char *name, spa_feature_t *res) +{ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t *feature = &spa_feature_table[i]; + if (!feature->fi_zfs_mod_supported) + continue; + if (strcmp(name, feature->fi_uname) == 0) { + if (res != NULL) + *res = i; + return (0); + } + } + + return (ENOENT); +} + +boolean_t +zfeature_depends_on(spa_feature_t fid, spa_feature_t check) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + + for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) { + if (feature->fi_depends[i] == check) + return (B_TRUE); + } + return (B_FALSE); +} + +static boolean_t +deps_contains_feature(const spa_feature_t *deps, const spa_feature_t feature) +{ + for (int i = 0; deps[i] != SPA_FEATURE_NONE; i++) + if (deps[i] == feature) + return (B_TRUE); + + return (B_FALSE); +} + +#if !defined(_KERNEL) && !defined(LIB_ZPOOL_BUILD) +static boolean_t +zfs_mod_supported_impl(const char *scope, const char *name, const char *sysfs) +{ + boolean_t supported = B_FALSE; + char *path; + + int len = asprintf(&path, "%s%s%s%s%s", sysfs, + scope == NULL ? "" : "/", scope == NULL ? "" : scope, + name == NULL ? "" : "/", name == NULL ? "" : name); + if (len > 0) { + struct stat64 statbuf; + supported = !!(stat64(path, &statbuf) == 0); + free(path); + } + + return (supported); +} + +boolean_t +zfs_mod_supported(const char *scope, const char *name) +{ + boolean_t supported; + + /* + * Check both the primary and alternate sysfs locations to determine + * if the required functionality is supported. + */ + supported = (zfs_mod_supported_impl(scope, name, ZFS_SYSFS_DIR) || + zfs_mod_supported_impl(scope, name, ZFS_SYSFS_ALT_DIR)); + + /* + * For backwards compatibility with kernel modules that predate + * supported feature/property checking. Report the feature/property + * as supported if the kernel module is loaded but the requested + * scope directory does not exist. + */ + if (supported == B_FALSE) { + struct stat64 statbuf; + if ((stat64(ZFS_SYSFS_DIR, &statbuf) == 0) && + !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_DIR) && + !zfs_mod_supported_impl(scope, NULL, ZFS_SYSFS_ALT_DIR)) { + supported = B_TRUE; + } + } + + return (supported); +} +#endif + +static boolean_t +zfs_mod_supported_feature(const char *name) +{ + /* + * The zfs module spa_feature_table[], whether in-kernel or in + * libzpool, always supports all the features. libzfs needs to + * query the running module, via sysfs, to determine which + * features are supported. 
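+	 * On Linux this amounts to stat(2)'ing a path such as
+	 * /sys/module/zfs/features.pool/<guid> (path shown only for
+	 * illustration; the actual components come from ZFS_SYSFS_DIR
+	 * and ZFS_SYSFS_POOL_FEATURES in zfs_mod_supported()).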
+ * + * The equivalent _can_ be done on FreeBSD by way of the sysctl + * tree, but this has not been done yet. + */ +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) + return (B_TRUE); +#else + return (zfs_mod_supported(ZFS_SYSFS_POOL_FEATURES, name)); +#endif +} + +static void +zfeature_register(spa_feature_t fid, const char *guid, const char *name, + const char *desc, zfeature_flags_t flags, zfeature_type_t type, + const spa_feature_t *deps) +{ + zfeature_info_t *feature = &spa_feature_table[fid]; + static spa_feature_t nodeps[] = { SPA_FEATURE_NONE }; + + ASSERT(name != NULL); + ASSERT(desc != NULL); + ASSERT((flags & ZFEATURE_FLAG_READONLY_COMPAT) == 0 || + (flags & ZFEATURE_FLAG_MOS) == 0); + ASSERT3U(fid, <, SPA_FEATURES); + ASSERT(zfeature_is_valid_guid(guid)); + + if (deps == NULL) + deps = nodeps; + + VERIFY(((flags & ZFEATURE_FLAG_PER_DATASET) == 0) || + (deps_contains_feature(deps, SPA_FEATURE_EXTENSIBLE_DATASET))); + + feature->fi_feature = fid; + feature->fi_guid = guid; + feature->fi_uname = name; + feature->fi_desc = desc; + feature->fi_flags = flags; + feature->fi_type = type; + feature->fi_depends = deps; + feature->fi_zfs_mod_supported = zfs_mod_supported_feature(guid); +} + +/* + * Every feature has a GUID of the form com.example:feature_name. The + * reversed DNS name ensures that the feature's GUID is unique across all ZFS + * implementations. This allows companies to independently develop and + * release features. Examples include org.delphix and org.datto. Previously, + * features developed on one implementation have used that implementation's + * domain name (e.g. org.illumos and org.zfsonlinux). Use of the org.openzfs + * domain name is recommended for new features which are developed by the + * OpenZFS community and its platforms. This domain may optionally be used by + * companies developing features for initial release through an OpenZFS + * implementation. Use of the org.openzfs domain requires reserving the + * feature name in advance with the OpenZFS project. 
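+ *
+ * For illustration, registering a hypothetical new feature follows the
+ * same pattern as the entries in zpool_feature_init() below:
+ *
+ *	zfeature_register(SPA_FEATURE_EXAMPLE,
+ *	    "org.openzfs:example", "example",
+ *	    "Example feature description.",
+ *	    ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL);
+ *
+ * (SPA_FEATURE_EXAMPLE is illustrative only; a real feature also needs
+ * its own spa_feature_t enumerator and, if it depends on other
+ * features, a deps array terminated by SPA_FEATURE_NONE.)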
+ */ +void +zpool_feature_init(void) +{ + zfeature_register(SPA_FEATURE_ASYNC_DESTROY, + "com.delphix:async_destroy", "async_destroy", + "Destroy filesystems asynchronously.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_EMPTY_BPOBJ, + "com.delphix:empty_bpobj", "empty_bpobj", + "Snapshots use less space.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_LZ4_COMPRESS, + "org.illumos:lz4_compress", "lz4_compress", + "LZ4 compression algorithm support.", + ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_MULTI_VDEV_CRASH_DUMP, + "com.joyent:multi_vdev_crash_dump", "multi_vdev_crash_dump", + "Crash dumps to multiple vdev pools.", + 0, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_SPACEMAP_HISTOGRAM, + "com.delphix:spacemap_histogram", "spacemap_histogram", + "Spacemaps maintain space histograms.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_ENABLED_TXG, + "com.delphix:enabled_txg", "enabled_txg", + "Record txg at which a feature is enabled", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t hole_birth_deps[] = { + SPA_FEATURE_ENABLED_TXG, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_HOLE_BIRTH, + "com.delphix:hole_birth", "hole_birth", + "Retain hole birth txg for more precise zfs send", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + ZFEATURE_TYPE_BOOLEAN, hole_birth_deps); + } + + zfeature_register(SPA_FEATURE_POOL_CHECKPOINT, + "com.delphix:zpool_checkpoint", "zpool_checkpoint", + "Pool state can be checkpointed, allowing rewind later.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_SPACEMAP_V2, + "com.delphix:spacemap_v2", "spacemap_v2", + "Space maps representing large segments are more efficient.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_EXTENSIBLE_DATASET, + "com.delphix:extensible_dataset", "extensible_dataset", + "Enhanced dataset functionality, used by other features.", + 0, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t bookmarks_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + + zfeature_register(SPA_FEATURE_BOOKMARKS, + "com.delphix:bookmarks", "bookmarks", + "\"zfs bookmark\" command", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + bookmarks_deps); + } + + { + static const spa_feature_t filesystem_limits_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_FS_SS_LIMIT, + "com.joyent:filesystem_limits", "filesystem_limits", + "Filesystem and snapshot limits.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + filesystem_limits_deps); + } + + zfeature_register(SPA_FEATURE_EMBEDDED_DATA, + "com.delphix:embedded_data", "embedded_data", + "Blocks which compress very well use even less space.", + ZFEATURE_FLAG_MOS | ZFEATURE_FLAG_ACTIVATE_ON_ENABLE, + ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t livelist_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LIVELIST, + "com.delphix:livelist", "livelist", + "Improved clone deletion performance.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + livelist_deps); + } + + { + static const spa_feature_t log_spacemap_deps[] = { + 
SPA_FEATURE_SPACEMAP_V2, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LOG_SPACEMAP, + "com.delphix:log_spacemap", "log_spacemap", + "Log metaslab changes on a single spacemap and " + "flush them periodically.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + log_spacemap_deps); + } + + { + static const spa_feature_t large_blocks_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_BLOCKS, + "org.open-zfs:large_blocks", "large_blocks", + "Support for blocks larger than 128KB.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + large_blocks_deps); + } + + { + static const spa_feature_t large_dnode_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_LARGE_DNODE, + "org.zfsonlinux:large_dnode", "large_dnode", + "Variable on-disk size of dnodes.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + large_dnode_deps); + } + + { + static const spa_feature_t sha512_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_SHA512, + "org.illumos:sha512", "sha512", + "SHA-512/256 hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + sha512_deps); + } + + { + static const spa_feature_t skein_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_SKEIN, + "org.illumos:skein", "skein", + "Skein hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + skein_deps); + } + +#if !defined(__FreeBSD__) + + { + static const spa_feature_t edonr_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_EDONR, + "org.illumos:edonr", "edonr", + "Edon-R hash algorithm.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + edonr_deps); + } +#endif + + { + static const spa_feature_t redact_books_deps[] = { + SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTION_BOOKMARKS, + "com.delphix:redaction_bookmarks", "redaction_bookmarks", + "Support for bookmarks which store redaction lists for zfs " + "redacted send/recv.", 0, ZFEATURE_TYPE_BOOLEAN, + redact_books_deps); + } + + { + static const spa_feature_t redact_datasets_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_REDACTED_DATASETS, + "com.delphix:redacted_datasets", "redacted_datasets", "Support for " + "redacted datasets, produced by receiving a redacted zfs send " + "stream.", ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_UINT64_ARRAY, + redact_datasets_deps); + } + + { + static const spa_feature_t bookmark_written_deps[] = { + SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_BOOKMARK_WRITTEN, + "com.delphix:bookmark_written", "bookmark_written", + "Additional accounting, enabling the written# property" + "(space written since a bookmark), and estimates of send stream " + "sizes for incrementals from bookmarks.", + 0, ZFEATURE_TYPE_BOOLEAN, bookmark_written_deps); + } + + zfeature_register(SPA_FEATURE_DEVICE_REMOVAL, + "com.delphix:device_removal", "device_removal", + "Top-level vdevs can be removed, reducing logical pool size.", + ZFEATURE_FLAG_MOS, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t obsolete_counts_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_DEVICE_REMOVAL, + SPA_FEATURE_NONE + }; + 
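+		/*
+		 * obsolete_counts tracks blocks on removed vdevs, so it
+		 * depends on device_removal as well as extensible_dataset;
+		 * both are listed in the deps array above.
+		 */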
zfeature_register(SPA_FEATURE_OBSOLETE_COUNTS, + "com.delphix:obsolete_counts", "obsolete_counts", + "Reduce memory used by removed devices when their blocks are " + "freed or remapped.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, + obsolete_counts_deps); + } + + { + static const spa_feature_t userobj_accounting_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_USEROBJ_ACCOUNTING, + "org.zfsonlinux:userobj_accounting", "userobj_accounting", + "User/Group object accounting.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, + ZFEATURE_TYPE_BOOLEAN, userobj_accounting_deps); + } + + { + static const spa_feature_t bookmark_v2_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARKS, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_BOOKMARK_V2, + "com.datto:bookmark_v2", "bookmark_v2", + "Support for larger bookmarks", + 0, ZFEATURE_TYPE_BOOLEAN, bookmark_v2_deps); + } + + { + static const spa_feature_t encryption_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_BOOKMARK_V2, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ENCRYPTION, + "com.datto:encryption", "encryption", + "Support for dataset level encryption", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, + encryption_deps); + } + + { + static const spa_feature_t project_quota_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_PROJECT_QUOTA, + "org.zfsonlinux:project_quota", "project_quota", + "space/object accounting based on project ID.", + ZFEATURE_FLAG_READONLY_COMPAT | ZFEATURE_FLAG_PER_DATASET, + ZFEATURE_TYPE_BOOLEAN, project_quota_deps); + } + + zfeature_register(SPA_FEATURE_ALLOCATION_CLASSES, + "org.zfsonlinux:allocation_classes", "allocation_classes", + "Support for separate allocation classes.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_RESILVER_DEFER, + "com.datto:resilver_defer", "resilver_defer", + "Support for deferring new resilvers when one is already running.", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + zfeature_register(SPA_FEATURE_DEVICE_REBUILD, + "org.openzfs:device_rebuild", "device_rebuild", + "Support for sequential device rebuilds", + ZFEATURE_FLAG_READONLY_COMPAT, ZFEATURE_TYPE_BOOLEAN, NULL); + + { + static const spa_feature_t zstd_deps[] = { + SPA_FEATURE_EXTENSIBLE_DATASET, + SPA_FEATURE_NONE + }; + zfeature_register(SPA_FEATURE_ZSTD_COMPRESS, + "org.freebsd:zstd_compress", "zstd_compress", + "zstd compression algorithm support.", + ZFEATURE_FLAG_PER_DATASET, ZFEATURE_TYPE_BOOLEAN, zstd_deps); + } +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfeature_lookup_guid); +EXPORT_SYMBOL(zfeature_lookup_name); +EXPORT_SYMBOL(zfeature_is_supported); +EXPORT_SYMBOL(zfeature_is_valid_guid); +EXPORT_SYMBOL(zfeature_depends_on); +EXPORT_SYMBOL(zpool_feature_init); +EXPORT_SYMBOL(spa_feature_table); +#endif diff --git a/module/zcommon/zfs_comutil.c b/module/zcommon/zfs_comutil.c new file mode 100644 index 000000000000..1cec60ac1d67 --- /dev/null +++ b/module/zcommon/zfs_comutil.c @@ -0,0 +1,263 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +/* + * This file is intended for functions that ought to be common between user + * land (libzfs) and the kernel. When many common routines need to be shared + * then a separate file should be created. + */ + +#if !defined(_KERNEL) +#include +#endif + +#include +#include +#include +#include "zfs_comutil.h" +#include + +/* + * Are there allocatable vdevs? + */ +boolean_t +zfs_allocatable_devs(nvlist_t *nv) +{ + uint64_t is_log; + uint_t c; + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + return (B_FALSE); + } + for (c = 0; c < children; c++) { + is_log = 0; + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (!is_log) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Are there special vdevs? + */ +boolean_t +zfs_special_devs(nvlist_t *nv, char *type) +{ + char *bias; + uint_t c; + nvlist_t **child; + uint_t children; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) { + return (B_FALSE); + } + for (c = 0; c < children; c++) { + if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0 || + strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0) { + if (type != NULL && strcmp(bias, type) == 0) { + return (B_TRUE); + } else if (type == NULL) { + return (B_TRUE); + } + } + } + } + return (B_FALSE); +} + +void +zpool_get_load_policy(nvlist_t *nvl, zpool_load_policy_t *zlpp) +{ + nvlist_t *policy; + nvpair_t *elem; + char *nm; + + /* Defaults */ + zlpp->zlp_rewind = ZPOOL_NO_REWIND; + zlpp->zlp_maxmeta = 0; + zlpp->zlp_maxdata = UINT64_MAX; + zlpp->zlp_txg = UINT64_MAX; + + if (nvl == NULL) + return; + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + nm = nvpair_name(elem); + if (strcmp(nm, ZPOOL_LOAD_POLICY) == 0) { + if (nvpair_value_nvlist(elem, &policy) == 0) + zpool_get_load_policy(policy, zlpp); + return; + } else if (strcmp(nm, ZPOOL_LOAD_REWIND_POLICY) == 0) { + if (nvpair_value_uint32(elem, &zlpp->zlp_rewind) == 0) + if (zlpp->zlp_rewind & ~ZPOOL_REWIND_POLICIES) + zlpp->zlp_rewind = ZPOOL_NO_REWIND; + } else if (strcmp(nm, ZPOOL_LOAD_REQUEST_TXG) == 0) { + (void) nvpair_value_uint64(elem, &zlpp->zlp_txg); + } else if (strcmp(nm, ZPOOL_LOAD_META_THRESH) == 0) { + (void) nvpair_value_uint64(elem, &zlpp->zlp_maxmeta); + } else if (strcmp(nm, ZPOOL_LOAD_DATA_THRESH) == 0) { + (void) nvpair_value_uint64(elem, &zlpp->zlp_maxdata); + } + } + if (zlpp->zlp_rewind == 0) + zlpp->zlp_rewind = ZPOOL_NO_REWIND; +} + +typedef struct zfs_version_spa_map { + int version_zpl; + int version_spa; +} zfs_version_spa_map_t; + +/* + * Keep this table in monotonically increasing version number order.
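+ *
+ * For example, with this ordering zfs_zpl_version_map(SPA_VERSION_FUID)
+ * returns ZPL_VERSION_FUID: the scan below keeps the zpl version of every
+ * entry whose spa version does not exceed the argument, so the last (and
+ * therefore largest) matching entry wins.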
+ */ +static zfs_version_spa_map_t zfs_version_table[] = { + {ZPL_VERSION_INITIAL, SPA_VERSION_INITIAL}, + {ZPL_VERSION_DIRENT_TYPE, SPA_VERSION_INITIAL}, + {ZPL_VERSION_FUID, SPA_VERSION_FUID}, + {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, + {ZPL_VERSION_SA, SPA_VERSION_SA}, + {0, 0} +}; + +/* + * Return the max zpl version for a corresponding spa version. + * -1 is returned if no mapping exists. + */ +int +zfs_zpl_version_map(int spa_version) +{ + int i; + int version = -1; + + for (i = 0; zfs_version_table[i].version_spa; i++) { + if (spa_version >= zfs_version_table[i].version_spa) + version = zfs_version_table[i].version_zpl; + } + + return (version); +} + +/* + * Return the min spa version for a corresponding zpl version. + * -1 is returned if no mapping exists. + */ +int +zfs_spa_version_map(int zpl_version) +{ + int i; + int version = -1; + + for (i = 0; zfs_version_table[i].version_zpl; i++) { + if (zfs_version_table[i].version_zpl >= zpl_version) + return (zfs_version_table[i].version_spa); + } + + return (version); +} + +/* + * This is the table of legacy internal event names; it should not be modified. + * The internal events are now stored in the history log as strings. + */ +const char *zfs_history_event_names[ZFS_NUM_LEGACY_HISTORY_EVENTS] = { + "invalid event", + "pool create", + "vdev add", + "pool remove", + "pool destroy", + "pool export", + "pool import", + "vdev attach", + "vdev replace", + "vdev detach", + "vdev online", + "vdev offline", + "vdev upgrade", + "pool clear", + "pool scrub", + "pool property set", + "create", + "clone", + "destroy", + "destroy_begin_sync", + "inherit", + "property set", + "quota set", + "permission update", + "permission remove", + "permission who remove", + "promote", + "receive", + "rename", + "reservation set", + "replay_inc_sync", + "replay_full_sync", + "rollback", + "snapshot", + "filesystem version upgrade", + "refquota set", + "refreservation set", + "pool scrub done", + "user hold", + "user release", + "pool split", +}; + +boolean_t +zfs_dataset_name_hidden(const char *name) +{ + /* + * Skip over datasets that are not visible in this zone, + * internal datasets (which have a $ in their name), and + * temporary datasets (which have a % in their name). + */ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) + return (B_TRUE); + return (B_FALSE); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_allocatable_devs); +EXPORT_SYMBOL(zfs_special_devs); +EXPORT_SYMBOL(zpool_get_load_policy); +EXPORT_SYMBOL(zfs_zpl_version_map); +EXPORT_SYMBOL(zfs_spa_version_map); +EXPORT_SYMBOL(zfs_history_event_names); +EXPORT_SYMBOL(zfs_dataset_name_hidden); +#endif diff --git a/module/zcommon/zfs_deleg.c b/module/zcommon/zfs_deleg.c new file mode 100644 index 000000000000..e1f5a353b7a4 --- /dev/null +++ b/module/zcommon/zfs_deleg.c @@ -0,0 +1,249 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License.
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2010 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2013, 2015 by Delphix. All rights reserved. + * Copyright 2016 Igor Kozhukhov + */ + +#include + +#if defined(_KERNEL) +#include +#include +#else +#include +#include +#include +#include +#endif +#include +#include +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_namecheck.h" + +zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { + {ZFS_DELEG_PERM_ALLOW}, + {ZFS_DELEG_PERM_BOOKMARK}, + {ZFS_DELEG_PERM_CLONE}, + {ZFS_DELEG_PERM_CREATE}, + {ZFS_DELEG_PERM_DESTROY}, + {ZFS_DELEG_PERM_DIFF}, + {ZFS_DELEG_PERM_MOUNT}, + {ZFS_DELEG_PERM_PROMOTE}, + {ZFS_DELEG_PERM_RECEIVE}, + {ZFS_DELEG_PERM_RENAME}, + {ZFS_DELEG_PERM_ROLLBACK}, + {ZFS_DELEG_PERM_SNAPSHOT}, + {ZFS_DELEG_PERM_SHARE}, + {ZFS_DELEG_PERM_SEND}, + {ZFS_DELEG_PERM_USERPROP}, + {ZFS_DELEG_PERM_USERQUOTA}, + {ZFS_DELEG_PERM_GROUPQUOTA}, + {ZFS_DELEG_PERM_USERUSED}, + {ZFS_DELEG_PERM_GROUPUSED}, + {ZFS_DELEG_PERM_USEROBJQUOTA}, + {ZFS_DELEG_PERM_GROUPOBJQUOTA}, + {ZFS_DELEG_PERM_USEROBJUSED}, + {ZFS_DELEG_PERM_GROUPOBJUSED}, + {ZFS_DELEG_PERM_HOLD}, + {ZFS_DELEG_PERM_RELEASE}, + {ZFS_DELEG_PERM_LOAD_KEY}, + {ZFS_DELEG_PERM_CHANGE_KEY}, + {ZFS_DELEG_PERM_PROJECTUSED}, + {ZFS_DELEG_PERM_PROJECTQUOTA}, + {ZFS_DELEG_PERM_PROJECTOBJUSED}, + {ZFS_DELEG_PERM_PROJECTOBJQUOTA}, + {NULL} +}; + +static int +zfs_valid_permission_name(const char *perm) +{ + if (zfs_deleg_canonicalize_perm(perm)) + return (0); + + return (permset_namecheck(perm, NULL, NULL)); +} + +const char * +zfs_deleg_canonicalize_perm(const char *perm) +{ + int i; + zfs_prop_t prop; + + for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { + if (strcmp(perm, zfs_deleg_perm_tab[i].z_perm) == 0) + return (perm); + } + + prop = zfs_name_to_prop(perm); + if (prop != ZPROP_INVAL && zfs_prop_delegatable(prop)) + return (zfs_prop_to_name(prop)); + return (NULL); + +} + +static int +zfs_validate_who(char *who) +{ + char *p; + + if (who[2] != ZFS_DELEG_FIELD_SEP_CHR) + return (-1); + + switch (who[0]) { + case ZFS_DELEG_USER: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP_SETS: + if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) + return (-1); + for (p = &who[3]; *p; p++) + if (!isdigit(*p)) + return (-1); + break; + + case ZFS_DELEG_NAMED_SET: + case ZFS_DELEG_NAMED_SET_SETS: + if (who[1] != ZFS_DELEG_NA) + return (-1); + return (permset_namecheck(&who[3], NULL, NULL)); + + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + if (who[1] != ZFS_DELEG_NA) + return (-1); + if (who[3] != '\0') + return (-1); + break; + + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + if (who[1] != ZFS_DELEG_LOCAL && who[1] != ZFS_DELEG_DESCENDENT) + return (-1); + if (who[3] != '\0') + return (-1); + break; + + default: + return (-1); + } + + return (0); +} + +int +zfs_deleg_verify_nvlist(nvlist_t *nvp) +{ + nvpair_t *who, *perm_name; + nvlist_t *perms; + int error; + + if (nvp == NULL) + return (-1); + + who = nvlist_next_nvpair(nvp, NULL); + if (who == NULL) + return (-1); + + do { + if 
(zfs_validate_who(nvpair_name(who))) + return (-1); + + error = nvlist_lookup_nvlist(nvp, nvpair_name(who), &perms); + + if (error && error != ENOENT) + return (-1); + if (error == ENOENT) + continue; + + perm_name = nvlist_next_nvpair(perms, NULL); + if (perm_name == NULL) { + return (-1); + } + do { + error = zfs_valid_permission_name( + nvpair_name(perm_name)); + if (error) + return (-1); + } while ((perm_name = nvlist_next_nvpair(perms, perm_name)) + != NULL); + } while ((who = nvlist_next_nvpair(nvp, who)) != NULL); + return (0); +} + +/* + * Construct the base attribute name. The base attribute names + * are the "key" to locate the jump objects which contain the actual + * permissions. The base attribute names are encoded based on + * type of entry and whether it is a local or descendent permission. + * + * Arguments: + * attr - attribute name return string, attribute is assumed to be + * ZFS_MAX_DELEG_NAME long. + * type - type of entry to construct + * inheritchr - inheritance type (local,descendent, or NA for create and + * permission set definitions + * data - is either a permission set name or a 64 bit uid/gid. + */ +void +zfs_deleg_whokey(char *attr, zfs_deleg_who_type_t type, + char inheritchr, void *data) +{ + int len = ZFS_MAX_DELEG_NAME; + uint64_t *id = data; + + switch (type) { + case ZFS_DELEG_USER: + case ZFS_DELEG_GROUP: + case ZFS_DELEG_USER_SETS: + case ZFS_DELEG_GROUP_SETS: + (void) snprintf(attr, len, "%c%c%c%lld", type, inheritchr, + ZFS_DELEG_FIELD_SEP_CHR, (longlong_t)*id); + break; + case ZFS_DELEG_NAMED_SET_SETS: + case ZFS_DELEG_NAMED_SET: + (void) snprintf(attr, len, "%c-%c%s", type, + ZFS_DELEG_FIELD_SEP_CHR, (char *)data); + break; + case ZFS_DELEG_CREATE: + case ZFS_DELEG_CREATE_SETS: + (void) snprintf(attr, len, "%c-%c", type, + ZFS_DELEG_FIELD_SEP_CHR); + break; + case ZFS_DELEG_EVERYONE: + case ZFS_DELEG_EVERYONE_SETS: + (void) snprintf(attr, len, "%c%c%c", type, inheritchr, + ZFS_DELEG_FIELD_SEP_CHR); + break; + default: + ASSERT(!"bad zfs_deleg_who_type_t"); + } +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_deleg_verify_nvlist); +EXPORT_SYMBOL(zfs_deleg_whokey); +EXPORT_SYMBOL(zfs_deleg_canonicalize_perm); +#endif diff --git a/module/zcommon/zfs_fletcher.c b/module/zcommon/zfs_fletcher.c new file mode 100644 index 000000000000..18998bceeada --- /dev/null +++ b/module/zcommon/zfs_fletcher.c @@ -0,0 +1,940 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ + +/* + * Fletcher Checksums + * ------------------ + * + * ZFS's 2nd and 4th order Fletcher checksums are defined by the following + * recurrence relations: + * + * a = a + f + * i i-1 i-1 + * + * b = b + a + * i i-1 i + * + * c = c + b (fletcher-4 only) + * i i-1 i + * + * d = d + c (fletcher-4 only) + * i i-1 i + * + * Where + * a_0 = b_0 = c_0 = d_0 = 0 + * and + * f_0 .. f_(n-1) are the input data. + * + * Using standard techniques, these translate into the following series: + * + * __n_ __n_ + * \ | \ | + * a = > f b = > i * f + * n /___| n - i n /___| n - i + * i = 1 i = 1 + * + * + * __n_ __n_ + * \ | i*(i+1) \ | i*(i+1)*(i+2) + * c = > ------- f d = > ------------- f + * n /___| 2 n - i n /___| 6 n - i + * i = 1 i = 1 + * + * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. + * Since the additions are done mod (2^64), errors in the high bits may not + * be noticed. For this reason, fletcher-2 is deprecated. + * + * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. + * A conservative estimate of how big the buffer can get before we overflow + * can be estimated using f_i = 0xffffffff for all i: + * + * % bc + * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 + * 2264 + * quit + * % + * + * So blocks of up to 2k will not overflow. Our largest block size is + * 128k, which has 32k 4-byte words, so we can compute the largest possible + * accumulators, then divide by 2^64 to figure the max amount of overflow: + * + * % bc + * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } + * a/2^64;b/2^64;c/2^64;d/2^64 + * 0 + * 0 + * 1365 + * 11186858 + * quit + * % + * + * So a and b cannot overflow. To make sure each bit of input has some + * effect on the contents of c and d, we can look at what the factors of + * the coefficients in the equations for c_n and d_n are. The number of 2s + * in the factors determines the lowest set bit in the multiplier. Running + * through the cases for n*(n+1)/2 reveals that the highest power of 2 is + * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow + * the 64-bit accumulators, every bit of every f_i effects every accumulator, + * even for 128k blocks. + * + * If we wanted to make a stronger version of fletcher4 (fletcher4c?), + * we could do our calculations mod (2^32 - 1) by adding in the carries + * periodically, and store the number of carries in the top 32-bits. + * + * -------------------- + * Checksum Performance + * -------------------- + * + * There are two interesting components to checksum performance: cached and + * uncached performance. With cached data, fletcher-2 is about four times + * faster than fletcher-4. With uncached data, the performance difference is + * negligible, since the cost of a cache fill dominates the processing time. + * Even though fletcher-4 is slower than fletcher-2, it is still a pretty + * efficient pass over the data. + * + * In normal operation, the data which is being checksummed is in a buffer + * which has been filled either by: + * + * 1. a compression step, which will be mostly cached, or + * 2. a bcopy() or copyin(), which will be uncached (because the + * copy is cache-bypassing). + * + * For both cached and uncached data, both fletcher checksums are much faster + * than sha-256, and slower than 'off', which doesn't touch the data at all. 
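+ *
+ * As a small concrete check of the recurrences defined at the top of this
+ * comment: for the two-word input f = { 1, 2 },
+ *
+ *	i = 1:	a = 1, b = 1, c = 1, d = 1
+ *	i = 2:	a = 3, b = 4, c = 5, d = 6
+ *
+ * so the final checksum words are (a, b, c, d) = (3, 4, 5, 6), which is
+ * what the scalar implementation below produces one word at a time.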
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#define FLETCHER_MIN_SIMD_SIZE 64 + +static void fletcher_4_scalar_init(fletcher_4_ctx_t *ctx); +static void fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp); +static void fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size); +static void fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size); +static boolean_t fletcher_4_scalar_valid(void); + +static const fletcher_4_ops_t fletcher_4_scalar_ops = { + .init_native = fletcher_4_scalar_init, + .fini_native = fletcher_4_scalar_fini, + .compute_native = fletcher_4_scalar_native, + .init_byteswap = fletcher_4_scalar_init, + .fini_byteswap = fletcher_4_scalar_fini, + .compute_byteswap = fletcher_4_scalar_byteswap, + .valid = fletcher_4_scalar_valid, + .name = "scalar" +}; + +static fletcher_4_ops_t fletcher_4_fastest_impl = { + .name = "fastest", + .valid = fletcher_4_scalar_valid +}; + +static const fletcher_4_ops_t *fletcher_4_impls[] = { + &fletcher_4_scalar_ops, + &fletcher_4_superscalar_ops, + &fletcher_4_superscalar4_ops, +#if defined(HAVE_SSE2) + &fletcher_4_sse2_ops, +#endif +#if defined(HAVE_SSE2) && defined(HAVE_SSSE3) + &fletcher_4_ssse3_ops, +#endif +#if defined(HAVE_AVX) && defined(HAVE_AVX2) + &fletcher_4_avx2_ops, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) + &fletcher_4_avx512f_ops, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512BW) + &fletcher_4_avx512bw_ops, +#endif +#if defined(__aarch64__) + &fletcher_4_aarch64_neon_ops, +#endif +}; + +/* Hold all supported implementations */ +static uint32_t fletcher_4_supp_impls_cnt = 0; +static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)]; + +/* Select fletcher4 implementation */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX - 1) +#define IMPL_SCALAR (0) + +static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST; + +#define IMPL_READ(i) (*(volatile uint32_t *) &(i)) + +static struct fletcher_4_impl_selector { + const char *fis_name; + uint32_t fis_sel; +} fletcher_4_impl_selectors[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, + { "scalar", IMPL_SCALAR } +}; + +#if defined(_KERNEL) +static kstat_t *fletcher_4_kstat; + +static struct fletcher_4_kstat { + uint64_t native; + uint64_t byteswap; +} fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; +#endif + +/* Indicate that benchmark has been completed */ +static boolean_t fletcher_4_initialized = B_FALSE; + +/*ARGSUSED*/ +void +fletcher_init(zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +int +fletcher_2_incremental_native(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + a0 = zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); +} + +/*ARGSUSED*/ +void +fletcher_2_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_2_incremental_native((void *) buf, size, zcp); +} + +int +fletcher_2_incremental_byteswap(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + a0 = 
zcp->zc_word[0]; + a1 = zcp->zc_word[1]; + b0 = zcp->zc_word[2]; + b1 = zcp->zc_word[3]; + + for (; ip < ipend; ip += 2) { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); + return (0); +} + +/*ARGSUSED*/ +void +fletcher_2_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) fletcher_2_incremental_byteswap((void *) buf, size, zcp); +} + +static void +fletcher_4_scalar_init(fletcher_4_ctx_t *ctx) +{ + ZIO_SET_CHECKSUM(&ctx->scalar, 0, 0, 0, 0); +} + +static void +fletcher_4_scalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + memcpy(zcp, &ctx->scalar, sizeof (zio_cksum_t)); +} + +static void +fletcher_4_scalar_native(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = ctx->scalar.zc_word[0]; + b = ctx->scalar.zc_word[1]; + c = ctx->scalar.zc_word[2]; + d = ctx->scalar.zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); +} + +static void +fletcher_4_scalar_byteswap(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = ctx->scalar.zc_word[0]; + b = ctx->scalar.zc_word[1]; + c = ctx->scalar.zc_word[2]; + d = ctx->scalar.zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(&ctx->scalar, a, b, c, d); +} + +static boolean_t +fletcher_4_scalar_valid(void) +{ + return (B_TRUE); +} + +int +fletcher_4_impl_set(const char *val) +{ + int err = -EINVAL; + uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + size_t i, val_len; + + val_len = strlen(val); + while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ + val_len--; + + /* check mandatory implementations */ + for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) { + const char *name = fletcher_4_impl_selectors[i].fis_name; + + if (val_len == strlen(name) && + strncmp(val, name, val_len) == 0) { + impl = fletcher_4_impl_selectors[i].fis_sel; + err = 0; + break; + } + } + + if (err != 0 && fletcher_4_initialized) { + /* check all supported implementations */ + for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { + const char *name = fletcher_4_supp_impls[i]->name; + + if (val_len == strlen(name) && + strncmp(val, name, val_len) == 0) { + impl = i; + err = 0; + break; + } + } + } + + if (err == 0) { + atomic_swap_32(&fletcher_4_impl_chosen, impl); + membar_producer(); + } + + return (err); +} + +/* + * Returns the Fletcher 4 operations for checksums. When a SIMD + * implementation is not allowed in the current context, then fallback + * to the fastest generic implementation. 
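+ * The selection can be changed at run time with fletcher_4_impl_set(); for
+ * example fletcher_4_impl_set("scalar") forces the generic code, and
+ * fletcher_4_impl_set("fastest") restores the benchmarked default.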
+ */ +static inline const fletcher_4_ops_t * +fletcher_4_impl_get(void) +{ + if (!kfpu_allowed()) + return (&fletcher_4_superscalar4_ops); + + const fletcher_4_ops_t *ops = NULL; + uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(fletcher_4_initialized); + ops = &fletcher_4_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through supported implementations */ + ASSERT(fletcher_4_initialized); + ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); + static uint32_t cycle_count = 0; + uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; + ops = fletcher_4_supp_impls[idx]; + break; + default: + ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); + ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); + ops = fletcher_4_supp_impls[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + + return (ops); +} + +static inline void +fletcher_4_native_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_ctx_t ctx; + const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + + ops->init_native(&ctx); + ops->compute_native(&ctx, buf, size); + ops->fini_native(&ctx, zcp); +} + +/*ARGSUSED*/ +void +fletcher_4_native(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + + ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); + + if (size == 0 || p2size == 0) { + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); + + if (size > 0) + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, + buf, size); + } else { + fletcher_4_native_impl(buf, p2size, zcp); + + if (p2size < size) + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, + (char *)buf + p2size, size - p2size); + } +} + +void +fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); +} + +static inline void +fletcher_4_byteswap_impl(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + fletcher_4_ctx_t ctx; + const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + + ops->init_byteswap(&ctx); + ops->compute_byteswap(&ctx, buf, size); + ops->fini_byteswap(&ctx, zcp); +} + +/*ARGSUSED*/ +void +fletcher_4_byteswap(const void *buf, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + const uint64_t p2size = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + + ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); + + if (size == 0 || p2size == 0) { + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); + + if (size > 0) + fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, + buf, size); + } else { + fletcher_4_byteswap_impl(buf, p2size, zcp); + + if (p2size < size) + fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, + (char *)buf + p2size, size - p2size); + } +} + +/* Incremental Fletcher 4 */ + +#define ZFS_FLETCHER_4_INC_MAX_SIZE (8ULL << 20) + +static inline void +fletcher_4_incremental_combine(zio_cksum_t *zcp, const uint64_t size, + const zio_cksum_t *nzcp) +{ + const uint64_t c1 = size / sizeof (uint32_t); + const uint64_t c2 = c1 * (c1 + 1) / 2; + const uint64_t c3 = c2 * (c1 + 2) / 3; + + /* + * Value of 'c3' overflows on buffer sizes close to 16MiB. For that + * reason we split incremental fletcher4 computation of large buffers + * to steps of (ZFS_FLETCHER_4_INC_MAX_SIZE) size. 
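+ *
+ * The combine itself uses the fact that appending a buffer Y of c1 32-bit
+ * words to an already-summed buffer X shifts every X word c1 positions
+ * further from the end of the stream, so the sums of the concatenation are
+ *
+ *	a = a_x + a_y
+ *	b = b_x + c1 * a_x + b_y
+ *	c = c_x + c1 * b_x + c2 * a_x + c_y
+ *	d = d_x + c1 * c_x + c2 * b_x + c3 * a_x + d_y
+ *
+ * which is exactly what the additions below compute.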
+ */ + ASSERT3U(size, <=, ZFS_FLETCHER_4_INC_MAX_SIZE); + + zcp->zc_word[3] += nzcp->zc_word[3] + c1 * zcp->zc_word[2] + + c2 * zcp->zc_word[1] + c3 * zcp->zc_word[0]; + zcp->zc_word[2] += nzcp->zc_word[2] + c1 * zcp->zc_word[1] + + c2 * zcp->zc_word[0]; + zcp->zc_word[1] += nzcp->zc_word[1] + c1 * zcp->zc_word[0]; + zcp->zc_word[0] += nzcp->zc_word[0]; +} + +static inline void +fletcher_4_incremental_impl(boolean_t native, const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + while (size > 0) { + zio_cksum_t nzc; + uint64_t len = MIN(size, ZFS_FLETCHER_4_INC_MAX_SIZE); + + if (native) + fletcher_4_native(buf, len, NULL, &nzc); + else + fletcher_4_byteswap(buf, len, NULL, &nzc); + + fletcher_4_incremental_combine(zcp, len, &nzc); + + size -= len; + buf += len; + } +} + +int +fletcher_4_incremental_native(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + /* Use scalar impl to directly update cksum of small blocks */ + if (size < SPA_MINBLOCKSIZE) + fletcher_4_scalar_native((fletcher_4_ctx_t *)zcp, buf, size); + else + fletcher_4_incremental_impl(B_TRUE, buf, size, zcp); + return (0); +} + +int +fletcher_4_incremental_byteswap(void *buf, size_t size, void *data) +{ + zio_cksum_t *zcp = data; + /* Use scalar impl to directly update cksum of small blocks */ + if (size < SPA_MINBLOCKSIZE) + fletcher_4_scalar_byteswap((fletcher_4_ctx_t *)zcp, buf, size); + else + fletcher_4_incremental_impl(B_FALSE, buf, size, zcp); + return (0); +} + +#if defined(_KERNEL) +/* + * Fletcher 4 kstats + */ +static int +fletcher_4_kstat_headers(char *buf, size_t size) +{ + ssize_t off = 0; + + off += snprintf(buf + off, size, "%-17s", "implementation"); + off += snprintf(buf + off, size - off, "%-15s", "native"); + (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap"); + + return (0); +} + +static int +fletcher_4_kstat_data(char *buf, size_t size, void *data) +{ + struct fletcher_4_kstat *fastest_stat = + &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; + struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *)data; + ssize_t off = 0; + + if (curr_stat == fastest_stat) { + off += snprintf(buf + off, size - off, "%-17s", "fastest"); + off += snprintf(buf + off, size - off, "%-15s", + fletcher_4_supp_impls[fastest_stat->native]->name); + off += snprintf(buf + off, size - off, "%-15s\n", + fletcher_4_supp_impls[fastest_stat->byteswap]->name); + } else { + ptrdiff_t id = curr_stat - fletcher_4_stat_data; + + off += snprintf(buf + off, size - off, "%-17s", + fletcher_4_supp_impls[id]->name); + off += snprintf(buf + off, size - off, "%-15llu", + (u_longlong_t)curr_stat->native); + off += snprintf(buf + off, size - off, "%-15llu\n", + (u_longlong_t)curr_stat->byteswap); + } + + return (0); +} + +static void * +fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) +{ + if (n <= fletcher_4_supp_impls_cnt) + ksp->ks_private = (void *) (fletcher_4_stat_data + n); + else + ksp->ks_private = NULL; + + return (ksp->ks_private); +} +#endif + +#define FLETCHER_4_FASTEST_FN_COPY(type, src) \ +{ \ + fletcher_4_fastest_impl.init_ ## type = src->init_ ## type; \ + fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type; \ + fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \ +} + +#define FLETCHER_4_BENCH_NS (MSEC2NSEC(50)) /* 50ms */ + +typedef void fletcher_checksum_func_t(const void *, uint64_t, const void *, + zio_cksum_t *); + +#if defined(_KERNEL) +static void +fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) +{ + + struct fletcher_4_kstat *fastest_stat = 
+ &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; + hrtime_t start; + uint64_t run_bw, run_time_ns, best_run = 0; + zio_cksum_t zc; + uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); + + fletcher_checksum_func_t *fletcher_4_test = native ? + fletcher_4_native : fletcher_4_byteswap; + + for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { + struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; + uint64_t run_count = 0; + + /* temporary set an implementation */ + fletcher_4_impl_chosen = i; + + kpreempt_disable(); + start = gethrtime(); + do { + for (l = 0; l < 32; l++, run_count++) + fletcher_4_test(data, data_size, NULL, &zc); + + run_time_ns = gethrtime() - start; + } while (run_time_ns < FLETCHER_4_BENCH_NS); + kpreempt_enable(); + + run_bw = data_size * run_count * NANOSEC; + run_bw /= run_time_ns; /* B/s */ + + if (native) + stat->native = run_bw; + else + stat->byteswap = run_bw; + + if (run_bw > best_run) { + best_run = run_bw; + + if (native) { + fastest_stat->native = i; + FLETCHER_4_FASTEST_FN_COPY(native, + fletcher_4_supp_impls[i]); + } else { + fastest_stat->byteswap = i; + FLETCHER_4_FASTEST_FN_COPY(byteswap, + fletcher_4_supp_impls[i]); + } + } + } + + /* restore original selection */ + atomic_swap_32(&fletcher_4_impl_chosen, sel_save); +} +#endif /* _KERNEL */ + +/* + * Initialize and benchmark all supported implementations. + */ +static void +fletcher_4_benchmark(void) +{ + fletcher_4_ops_t *curr_impl; + int i, c; + + /* Move supported implementations into fletcher_4_supp_impls */ + for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { + curr_impl = (fletcher_4_ops_t *)fletcher_4_impls[i]; + + if (curr_impl->valid && curr_impl->valid()) + fletcher_4_supp_impls[c++] = curr_impl; + } + membar_producer(); /* complete fletcher_4_supp_impls[] init */ + fletcher_4_supp_impls_cnt = c; /* number of supported impl */ + +#if defined(_KERNEL) + static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ + char *databuf = vmem_alloc(data_size, KM_SLEEP); + + for (i = 0; i < data_size / sizeof (uint64_t); i++) + ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ + + fletcher_4_benchmark_impl(B_FALSE, databuf, data_size); + fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); + + vmem_free(databuf, data_size); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. + */ + memcpy(&fletcher_4_fastest_impl, + fletcher_4_supp_impls[fletcher_4_supp_impls_cnt - 1], + sizeof (fletcher_4_fastest_impl)); + fletcher_4_fastest_impl.name = "fastest"; + membar_producer(); +#endif /* _KERNEL */ +} + +void +fletcher_4_init(void) +{ + /* Determine the fastest available implementation. 
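+	 * In kernel context fletcher_4_benchmark() times every supported
+	 * implementation for roughly FLETCHER_4_BENCH_NS (50ms) on a 128kiB
+	 * buffer, separately for the native and byteswap paths, and installs
+	 * the winner as "fastest"; in user space the benchmark is skipped
+	 * and the last supported implementation is assumed fastest.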
*/ + fletcher_4_benchmark(); + +#if defined(_KERNEL) + /* Install kstats for all implementations */ + fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (fletcher_4_kstat != NULL) { + fletcher_4_kstat->ks_data = NULL; + fletcher_4_kstat->ks_ndata = UINT32_MAX; + kstat_set_raw_ops(fletcher_4_kstat, + fletcher_4_kstat_headers, + fletcher_4_kstat_data, + fletcher_4_kstat_addr); + kstat_install(fletcher_4_kstat); + } +#endif + + /* Finish initialization */ + fletcher_4_initialized = B_TRUE; +} + +void +fletcher_4_fini(void) +{ +#if defined(_KERNEL) + if (fletcher_4_kstat != NULL) { + kstat_delete(fletcher_4_kstat); + fletcher_4_kstat = NULL; + } +#endif +} + +/* ABD adapters */ + +static void +abd_fletcher_4_init(zio_abd_checksum_data_t *cdp) +{ + const fletcher_4_ops_t *ops = fletcher_4_impl_get(); + cdp->acd_private = (void *) ops; + + if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) + ops->init_native(cdp->acd_ctx); + else + ops->init_byteswap(cdp->acd_ctx); +} + +static void +abd_fletcher_4_fini(zio_abd_checksum_data_t *cdp) +{ + fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; + + ASSERT(ops); + + if (cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE) + ops->fini_native(cdp->acd_ctx, cdp->acd_zcp); + else + ops->fini_byteswap(cdp->acd_ctx, cdp->acd_zcp); +} + +static void +abd_fletcher_4_simd2scalar(boolean_t native, void *data, size_t size, + zio_abd_checksum_data_t *cdp) +{ + zio_cksum_t *zcp = cdp->acd_zcp; + + ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); + + abd_fletcher_4_fini(cdp); + cdp->acd_private = (void *)&fletcher_4_scalar_ops; + + if (native) + fletcher_4_incremental_native(data, size, zcp); + else + fletcher_4_incremental_byteswap(data, size, zcp); +} + +static int +abd_fletcher_4_iter(void *data, size_t size, void *private) +{ + zio_abd_checksum_data_t *cdp = (zio_abd_checksum_data_t *)private; + fletcher_4_ctx_t *ctx = cdp->acd_ctx; + fletcher_4_ops_t *ops = (fletcher_4_ops_t *)cdp->acd_private; + boolean_t native = cdp->acd_byteorder == ZIO_CHECKSUM_NATIVE; + uint64_t asize = P2ALIGN(size, FLETCHER_MIN_SIMD_SIZE); + + ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); + + if (asize > 0) { + if (native) + ops->compute_native(ctx, data, asize); + else + ops->compute_byteswap(ctx, data, asize); + + size -= asize; + data = (char *)data + asize; + } + + if (size > 0) { + ASSERT3U(size, <, FLETCHER_MIN_SIMD_SIZE); + /* At this point we have to switch to scalar impl */ + abd_fletcher_4_simd2scalar(native, data, size, cdp); + } + + return (0); +} + +zio_abd_checksum_func_t fletcher_4_abd_ops = { + .acf_init = abd_fletcher_4_init, + .acf_fini = abd_fletcher_4_fini, + .acf_iter = abd_fletcher_4_iter +}; + + +#if defined(_KERNEL) && defined(__linux__) + +static int +fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) +{ + const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); + char *fmt; + int i, cnt = 0; + + /* list fastest */ + fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, "fastest"); + + /* list all supported implementations */ + for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { + fmt = (i == impl) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, + fletcher_4_supp_impls[i]->name); + } + + return (cnt); +} + +static int +fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) +{ + return (fletcher_4_impl_set(val)); +} + +/* + * Choose a fletcher 4 implementation in ZFS. 
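+ * On Linux this is the zfs_fletcher_4_impl module parameter; writing a
+ * name such as "scalar" or "fastest" to it (typically through
+ * /sys/module/zfs/parameters/zfs_fletcher_4_impl) invokes
+ * fletcher_4_impl_set() with that value.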
+ * Users can choose "cycle" to exercise all implementations, but this is + * for testing purpose therefore it can only be set in user space. + */ +module_param_call(zfs_fletcher_4_impl, + fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); +MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); + +EXPORT_SYMBOL(fletcher_init); +EXPORT_SYMBOL(fletcher_2_incremental_native); +EXPORT_SYMBOL(fletcher_2_incremental_byteswap); +EXPORT_SYMBOL(fletcher_4_init); +EXPORT_SYMBOL(fletcher_4_fini); +EXPORT_SYMBOL(fletcher_2_native); +EXPORT_SYMBOL(fletcher_2_byteswap); +EXPORT_SYMBOL(fletcher_4_native); +EXPORT_SYMBOL(fletcher_4_native_varsize); +EXPORT_SYMBOL(fletcher_4_byteswap); +EXPORT_SYMBOL(fletcher_4_incremental_native); +EXPORT_SYMBOL(fletcher_4_incremental_byteswap); +EXPORT_SYMBOL(fletcher_4_abd_ops); +#endif diff --git a/module/zcommon/zfs_fletcher_aarch64_neon.c b/module/zcommon/zfs_fletcher_aarch64_neon.c new file mode 100644 index 000000000000..c95a71681584 --- /dev/null +++ b/module/zcommon/zfs_fletcher_aarch64_neon.c @@ -0,0 +1,215 @@ +/* + * Implement fast Fletcher4 with NEON instructions. (aarch64) + * + * Use the 128-bit NEON SIMD instructions and registers to compute + * Fletcher4 in two incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. + * This implementation is a derivative of the AVX SIMD implementation by + * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). + * + * Copyright (C) 2016 Romain Dolbeau. + * + * Authors: + * Romain Dolbeau + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#if defined(__aarch64__) + +#include +#include +#include +#include + +static void +fletcher_4_aarch64_neon_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->aarch64_neon, 4 * sizeof (zfs_fletcher_aarch64_neon_t)); +} + +static void +fletcher_4_aarch64_neon_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + A = ctx->aarch64_neon[0].v[0] + ctx->aarch64_neon[0].v[1]; + B = 2 * ctx->aarch64_neon[1].v[0] + 2 * ctx->aarch64_neon[1].v[1] - + ctx->aarch64_neon[0].v[1]; + C = 4 * ctx->aarch64_neon[2].v[0] - ctx->aarch64_neon[1].v[0] + + 4 * ctx->aarch64_neon[2].v[1] - 3 * ctx->aarch64_neon[1].v[1]; + D = 8 * ctx->aarch64_neon[3].v[0] - 4 * ctx->aarch64_neon[2].v[0] + + 8 * ctx->aarch64_neon[3].v[1] - 8 * ctx->aarch64_neon[2].v[1] + + ctx->aarch64_neon[1].v[1]; + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +#define NEON_INIT_LOOP() \ + asm("eor %[ZERO].16b,%[ZERO].16b,%[ZERO].16b\n" \ + "ld1 { %[ACC0].4s }, %[CTX0]\n" \ + "ld1 { %[ACC1].4s }, %[CTX1]\n" \ + "ld1 { %[ACC2].4s }, %[CTX2]\n" \ + "ld1 { %[ACC3].4s }, %[CTX3]\n" \ + : [ZERO] "=w" (ZERO), \ + [ACC0] "=w" (ACC0), [ACC1] "=w" (ACC1), \ + [ACC2] "=w" (ACC2), [ACC3] "=w" (ACC3) \ + : [CTX0] "Q" (ctx->aarch64_neon[0]), \ + [CTX1] "Q" (ctx->aarch64_neon[1]), \ + [CTX2] "Q" (ctx->aarch64_neon[2]), \ + [CTX3] "Q" (ctx->aarch64_neon[3])) + +#define NEON_DO_REVERSE "rev32 %[SRC].16b, %[SRC].16b\n" + +#define NEON_DONT_REVERSE "" + +#define NEON_MAIN_LOOP(REVERSE) \ + asm("ld1 { %[SRC].4s }, %[IP]\n" \ + REVERSE \ + "zip1 %[TMP1].4s, %[SRC].4s, %[ZERO].4s\n" \ + "zip2 %[TMP2].4s, %[SRC].4s, %[ZERO].4s\n" \ + "add %[ACC0].2d, %[ACC0].2d, %[TMP1].2d\n" \ + "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \ + "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \ + "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \ + "add %[ACC0].2d, %[ACC0].2d, %[TMP2].2d\n" \ + "add %[ACC1].2d, %[ACC1].2d, %[ACC0].2d\n" \ + "add %[ACC2].2d, %[ACC2].2d, %[ACC1].2d\n" \ + "add %[ACC3].2d, %[ACC3].2d, %[ACC2].2d\n" \ + : [SRC] "=&w" (SRC), \ + [TMP1] "=&w" (TMP1), [TMP2] "=&w" (TMP2), \ + [ACC0] "+w" (ACC0), [ACC1] "+w" (ACC1), \ + [ACC2] "+w" (ACC2), [ACC3] "+w" (ACC3) \ + : [ZERO] "w" (ZERO), [IP] "Q" (*ip)) + +#define NEON_FINI_LOOP() \ + asm("st1 { %[ACC0].4s },%[DST0]\n" \ + "st1 { %[ACC1].4s },%[DST1]\n" \ + "st1 { %[ACC2].4s },%[DST2]\n" \ + "st1 { %[ACC3].4s },%[DST3]\n" \ + : [DST0] "=Q" (ctx->aarch64_neon[0]), \ + [DST1] "=Q" (ctx->aarch64_neon[1]), \ + [DST2] "=Q" (ctx->aarch64_neon[2]), \ + [DST3] "=Q" (ctx->aarch64_neon[3]) \ + : [ACC0] "w" (ACC0), [ACC1] "w" (ACC1), \ + [ACC2] "w" (ACC2), [ACC3] "w" (ACC3)) + +static void +fletcher_4_aarch64_neon_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); +#if defined(_KERNEL) +register unsigned char ZERO asm("v0") __attribute__((vector_size(16))); +register unsigned char ACC0 asm("v1") __attribute__((vector_size(16))); +register unsigned char ACC1 asm("v2") __attribute__((vector_size(16))); +register unsigned char ACC2 asm("v3") __attribute__((vector_size(16))); +register unsigned char ACC3 asm("v4") __attribute__((vector_size(16))); +register unsigned char TMP1 asm("v5") __attribute__((vector_size(16))); +register unsigned char TMP2 asm("v6") __attribute__((vector_size(16))); +register unsigned char SRC asm("v7") __attribute__((vector_size(16))); +#else +unsigned char ZERO __attribute__((vector_size(16))); +unsigned char ACC0 __attribute__((vector_size(16))); +unsigned char ACC1 
__attribute__((vector_size(16))); +unsigned char ACC2 __attribute__((vector_size(16))); +unsigned char ACC3 __attribute__((vector_size(16))); +unsigned char TMP1 __attribute__((vector_size(16))); +unsigned char TMP2 __attribute__((vector_size(16))); +unsigned char SRC __attribute__((vector_size(16))); +#endif + + kfpu_begin(); + + NEON_INIT_LOOP(); + + for (; ip < ipend; ip += 2) { + NEON_MAIN_LOOP(NEON_DONT_REVERSE); + } + + NEON_FINI_LOOP(); + + kfpu_end(); +} + +static void +fletcher_4_aarch64_neon_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); +#if defined(_KERNEL) +register unsigned char ZERO asm("v0") __attribute__((vector_size(16))); +register unsigned char ACC0 asm("v1") __attribute__((vector_size(16))); +register unsigned char ACC1 asm("v2") __attribute__((vector_size(16))); +register unsigned char ACC2 asm("v3") __attribute__((vector_size(16))); +register unsigned char ACC3 asm("v4") __attribute__((vector_size(16))); +register unsigned char TMP1 asm("v5") __attribute__((vector_size(16))); +register unsigned char TMP2 asm("v6") __attribute__((vector_size(16))); +register unsigned char SRC asm("v7") __attribute__((vector_size(16))); +#else +unsigned char ZERO __attribute__((vector_size(16))); +unsigned char ACC0 __attribute__((vector_size(16))); +unsigned char ACC1 __attribute__((vector_size(16))); +unsigned char ACC2 __attribute__((vector_size(16))); +unsigned char ACC3 __attribute__((vector_size(16))); +unsigned char TMP1 __attribute__((vector_size(16))); +unsigned char TMP2 __attribute__((vector_size(16))); +unsigned char SRC __attribute__((vector_size(16))); +#endif + + kfpu_begin(); + + NEON_INIT_LOOP(); + + for (; ip < ipend; ip += 2) { + NEON_MAIN_LOOP(NEON_DO_REVERSE); + } + + NEON_FINI_LOOP(); + + kfpu_end(); +} + +static boolean_t fletcher_4_aarch64_neon_valid(void) +{ + return (kfpu_allowed()); +} + +const fletcher_4_ops_t fletcher_4_aarch64_neon_ops = { + .init_native = fletcher_4_aarch64_neon_init, + .compute_native = fletcher_4_aarch64_neon_native, + .fini_native = fletcher_4_aarch64_neon_fini, + .init_byteswap = fletcher_4_aarch64_neon_init, + .compute_byteswap = fletcher_4_aarch64_neon_byteswap, + .fini_byteswap = fletcher_4_aarch64_neon_fini, + .valid = fletcher_4_aarch64_neon_valid, + .name = "aarch64_neon" +}; + +#endif /* defined(__aarch64__) */ diff --git a/module/zcommon/zfs_fletcher_avx512.c b/module/zcommon/zfs_fletcher_avx512.c new file mode 100644 index 000000000000..300ec4c1fb69 --- /dev/null +++ b/module/zcommon/zfs_fletcher_avx512.c @@ -0,0 +1,225 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. 
+ */ + +#if defined(__x86_64) && defined(HAVE_AVX512F) + +#include +#include +#include +#include +#include +#include + +#ifdef __linux__ +#define __asm __asm__ __volatile__ +#endif + +static void +fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->avx512, 4 * sizeof (zfs_fletcher_avx512_t)); +} + +static void +fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + static const uint64_t + CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 }, + CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 }, + DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 }, + DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 }, + DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 }; + + uint64_t A, B, C, D; + uint64_t i; + + A = ctx->avx512[0].v[0]; + B = 8 * ctx->avx512[1].v[0]; + C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0]; + D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] + + DcB[0] * ctx->avx512[1].v[0]; + + for (i = 1; i < 8; i++) { + A += ctx->avx512[0].v[i]; + B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i]; + C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] + + CcA[i] * ctx->avx512[0].v[i]; + D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] + + DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i]; + } + + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +#define FLETCHER_4_AVX512_RESTORE_CTX(ctx) \ +{ \ + __asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0])); \ + __asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1])); \ + __asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2])); \ + __asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3])); \ +} + +#define FLETCHER_4_AVX512_SAVE_CTX(ctx) \ +{ \ + __asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0])); \ + __asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1])); \ + __asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2])); \ + __asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3])); \ +} + +static void +fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_AVX512_RESTORE_CTX(ctx); + + for (; ip < ipend; ip += 8) { + __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip)); + __asm("vpaddq %zmm4, %zmm0, %zmm0"); + __asm("vpaddq %zmm0, %zmm1, %zmm1"); + __asm("vpaddq %zmm1, %zmm2, %zmm2"); + __asm("vpaddq %zmm2, %zmm3, %zmm3"); + } + + FLETCHER_4_AVX512_SAVE_CTX(ctx); + + kfpu_end(); +} +STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native); + +static void +fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) +{ + static const uint64_t byteswap_mask = 0xFFULL; + const uint32_t *ip = buf; + const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_AVX512_RESTORE_CTX(ctx); + + __asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask)); + __asm("vpsllq $8, %zmm8, %zmm9"); + __asm("vpsllq $16, %zmm8, %zmm10"); + __asm("vpsllq $24, %zmm8, %zmm11"); + + for (; ip < ipend; ip += 8) { + __asm("vpmovzxdq %0, %%zmm5"::"m" (*ip)); + + __asm("vpsrlq $24, %zmm5, %zmm6"); + __asm("vpandd %zmm8, %zmm6, %zmm6"); + __asm("vpsrlq $8, %zmm5, %zmm7"); + __asm("vpandd %zmm9, %zmm7, %zmm7"); + __asm("vpord %zmm6, %zmm7, %zmm4"); + __asm("vpsllq $8, %zmm5, %zmm6"); + __asm("vpandd %zmm10, %zmm6, %zmm6"); + __asm("vpord %zmm6, %zmm4, %zmm4"); + __asm("vpsllq $24, %zmm5, %zmm5"); + __asm("vpandd %zmm11, %zmm5, %zmm5"); + __asm("vpord %zmm5, %zmm4, %zmm4"); + + __asm("vpaddq %zmm4, %zmm0, %zmm0"); + __asm("vpaddq %zmm0, %zmm1, %zmm1"); + 
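+		/*
+		 * zmm0..zmm3 carry eight 64-bit lanes of the a, b, c and d
+		 * accumulators; this vpaddq cascade is the vector form of
+		 * a += f; b += a; c += b; d += c from the scalar code.
+		 */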
__asm("vpaddq %zmm1, %zmm2, %zmm2"); + __asm("vpaddq %zmm2, %zmm3, %zmm3"); + } + + FLETCHER_4_AVX512_SAVE_CTX(ctx) + + kfpu_end(); +} +STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap); + +static boolean_t +fletcher_4_avx512f_valid(void) +{ + return (kfpu_allowed() && zfs_avx512f_available()); +} + +const fletcher_4_ops_t fletcher_4_avx512f_ops = { + .init_native = fletcher_4_avx512f_init, + .fini_native = fletcher_4_avx512f_fini, + .compute_native = fletcher_4_avx512f_native, + .init_byteswap = fletcher_4_avx512f_init, + .fini_byteswap = fletcher_4_avx512f_fini, + .compute_byteswap = fletcher_4_avx512f_byteswap, + .valid = fletcher_4_avx512f_valid, + .name = "avx512f" +}; + +#if defined(HAVE_AVX512BW) +static void +fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf, + uint64_t size) +{ + static const zfs_fletcher_avx512_t mask = { + .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B } + }; + const uint32_t *ip = buf; + const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_AVX512_RESTORE_CTX(ctx); + + __asm("vmovdqu64 %0, %%zmm5" :: "m" (mask)); + + for (; ip < ipend; ip += 8) { + __asm("vpmovzxdq %0, %%zmm4"::"m" (*ip)); + + __asm("vpshufb %zmm5, %zmm4, %zmm4"); + + __asm("vpaddq %zmm4, %zmm0, %zmm0"); + __asm("vpaddq %zmm0, %zmm1, %zmm1"); + __asm("vpaddq %zmm1, %zmm2, %zmm2"); + __asm("vpaddq %zmm2, %zmm3, %zmm3"); + } + + FLETCHER_4_AVX512_SAVE_CTX(ctx) + + kfpu_end(); +} +STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap); + +const fletcher_4_ops_t fletcher_4_avx512bw_ops = { + .init_native = fletcher_4_avx512f_init, + .fini_native = fletcher_4_avx512f_fini, + .compute_native = fletcher_4_avx512f_native, + .init_byteswap = fletcher_4_avx512f_init, + .fini_byteswap = fletcher_4_avx512f_fini, + .compute_byteswap = fletcher_4_avx512bw_byteswap, + .valid = fletcher_4_avx512f_valid, + .name = "avx512bw" +}; +#endif + +#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */ diff --git a/module/zcommon/zfs_fletcher_intel.c b/module/zcommon/zfs_fletcher_intel.c new file mode 100644 index 000000000000..5136a01eca51 --- /dev/null +++ b/module/zcommon/zfs_fletcher_intel.c @@ -0,0 +1,173 @@ +/* + * Implement fast Fletcher4 with AVX2 instructions. (x86_64) + * + * Use the 256-bit AVX2 SIMD instructions and registers to compute + * Fletcher4 in four incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. + * + * Copyright (C) 2015 Intel Corporation. + * + * Authors: + * James Guilford + * Jinshan Xiong + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(HAVE_AVX) && defined(HAVE_AVX2) + +#include +#include +#include +#include + +static void +fletcher_4_avx2_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->avx, 4 * sizeof (zfs_fletcher_avx_t)); +} + +static void +fletcher_4_avx2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + + A = ctx->avx[0].v[0] + ctx->avx[0].v[1] + + ctx->avx[0].v[2] + ctx->avx[0].v[3]; + B = 0 - ctx->avx[0].v[1] - 2 * ctx->avx[0].v[2] - 3 * ctx->avx[0].v[3] + + 4 * ctx->avx[1].v[0] + 4 * ctx->avx[1].v[1] + 4 * ctx->avx[1].v[2] + + 4 * ctx->avx[1].v[3]; + + C = ctx->avx[0].v[2] + 3 * ctx->avx[0].v[3] - 6 * ctx->avx[1].v[0] - + 10 * ctx->avx[1].v[1] - 14 * ctx->avx[1].v[2] - + 18 * ctx->avx[1].v[3] + 16 * ctx->avx[2].v[0] + + 16 * ctx->avx[2].v[1] + 16 * ctx->avx[2].v[2] + + 16 * ctx->avx[2].v[3]; + + D = 0 - ctx->avx[0].v[3] + 4 * ctx->avx[1].v[0] + + 10 * ctx->avx[1].v[1] + 20 * ctx->avx[1].v[2] + + 34 * ctx->avx[1].v[3] - 48 * ctx->avx[2].v[0] - + 64 * ctx->avx[2].v[1] - 80 * ctx->avx[2].v[2] - + 96 * ctx->avx[2].v[3] + 64 * ctx->avx[3].v[0] + + 64 * ctx->avx[3].v[1] + 64 * ctx->avx[3].v[2] + + 64 * ctx->avx[3].v[3]; + + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +#define FLETCHER_4_AVX2_RESTORE_CTX(ctx) \ +{ \ + asm volatile("vmovdqu %0, %%ymm0" :: "m" ((ctx)->avx[0])); \ + asm volatile("vmovdqu %0, %%ymm1" :: "m" ((ctx)->avx[1])); \ + asm volatile("vmovdqu %0, %%ymm2" :: "m" ((ctx)->avx[2])); \ + asm volatile("vmovdqu %0, %%ymm3" :: "m" ((ctx)->avx[3])); \ +} + +#define FLETCHER_4_AVX2_SAVE_CTX(ctx) \ +{ \ + asm volatile("vmovdqu %%ymm0, %0" : "=m" ((ctx)->avx[0])); \ + asm volatile("vmovdqu %%ymm1, %0" : "=m" ((ctx)->avx[1])); \ + asm volatile("vmovdqu %%ymm2, %0" : "=m" ((ctx)->avx[2])); \ + asm volatile("vmovdqu %%ymm3, %0" : "=m" ((ctx)->avx[3])); \ +} + + +static void +fletcher_4_avx2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_AVX2_RESTORE_CTX(ctx); + + for (; ip < ipend; ip += 2) { + asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip)); + asm volatile("vpaddq %ymm4, %ymm0, %ymm0"); + asm volatile("vpaddq %ymm0, %ymm1, %ymm1"); + asm volatile("vpaddq %ymm1, %ymm2, %ymm2"); + asm volatile("vpaddq %ymm2, %ymm3, %ymm3"); + } + + FLETCHER_4_AVX2_SAVE_CTX(ctx); + asm volatile("vzeroupper"); + + kfpu_end(); +} + +static void +fletcher_4_avx2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) +{ + static const zfs_fletcher_avx_t mask = { + .v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B, + 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B } + }; + const uint64_t *ip = buf; + const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_AVX2_RESTORE_CTX(ctx); + + asm volatile("vmovdqu %0, %%ymm5" :: "m" (mask)); + + for (; ip < ipend; ip += 2) { + asm volatile("vpmovzxdq %0, %%ymm4"::"m" (*ip)); + asm volatile("vpshufb %ymm5, %ymm4, %ymm4"); + + asm volatile("vpaddq %ymm4, %ymm0, %ymm0"); + asm volatile("vpaddq %ymm0, %ymm1, 
%ymm1"); + asm volatile("vpaddq %ymm1, %ymm2, %ymm2"); + asm volatile("vpaddq %ymm2, %ymm3, %ymm3"); + } + + FLETCHER_4_AVX2_SAVE_CTX(ctx); + asm volatile("vzeroupper"); + + kfpu_end(); +} + +static boolean_t fletcher_4_avx2_valid(void) +{ + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); +} + +const fletcher_4_ops_t fletcher_4_avx2_ops = { + .init_native = fletcher_4_avx2_init, + .fini_native = fletcher_4_avx2_fini, + .compute_native = fletcher_4_avx2_native, + .init_byteswap = fletcher_4_avx2_init, + .fini_byteswap = fletcher_4_avx2_fini, + .compute_byteswap = fletcher_4_avx2_byteswap, + .valid = fletcher_4_avx2_valid, + .name = "avx2" +}; + +#endif /* defined(HAVE_AVX) && defined(HAVE_AVX2) */ diff --git a/module/zcommon/zfs_fletcher_sse.c b/module/zcommon/zfs_fletcher_sse.c new file mode 100644 index 000000000000..15ce9b07ffbe --- /dev/null +++ b/module/zcommon/zfs_fletcher_sse.c @@ -0,0 +1,232 @@ +/* + * Implement fast Fletcher4 with SSE2,SSSE3 instructions. (x86) + * + * Use the 128-bit SSE2/SSSE3 SIMD instructions and registers to compute + * Fletcher4 in two incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. + * This implementation is a derivative of the AVX SIMD implementation by + * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). + * + * Copyright (C) 2016 Tyler J. Stachecki. + * + * Authors: + * Tyler J. Stachecki + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#if defined(HAVE_SSE2) + +#include +#include +#include +#include +#include + +static void +fletcher_4_sse2_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->sse, 4 * sizeof (zfs_fletcher_sse_t)); +} + +static void +fletcher_4_sse2_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + + /* + * The mixing matrix for checksum calculation is: + * a = a0 + a1 + * b = 2b0 + 2b1 - a1 + * c = 4c0 - b0 + 4c1 -3b1 + * d = 8d0 - 4c0 + 8d1 - 8c1 + b1; + * + * c and d are multiplied by 4 and 8, respectively, + * before spilling the vectors out to memory. 
+ */ + A = ctx->sse[0].v[0] + ctx->sse[0].v[1]; + B = 2 * ctx->sse[1].v[0] + 2 * ctx->sse[1].v[1] - ctx->sse[0].v[1]; + C = 4 * ctx->sse[2].v[0] - ctx->sse[1].v[0] + 4 * ctx->sse[2].v[1] - + 3 * ctx->sse[1].v[1]; + D = 8 * ctx->sse[3].v[0] - 4 * ctx->sse[2].v[0] + 8 * ctx->sse[3].v[1] - + 8 * ctx->sse[2].v[1] + ctx->sse[1].v[1]; + + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +#define FLETCHER_4_SSE_RESTORE_CTX(ctx) \ +{ \ + asm volatile("movdqu %0, %%xmm0" :: "m" ((ctx)->sse[0])); \ + asm volatile("movdqu %0, %%xmm1" :: "m" ((ctx)->sse[1])); \ + asm volatile("movdqu %0, %%xmm2" :: "m" ((ctx)->sse[2])); \ + asm volatile("movdqu %0, %%xmm3" :: "m" ((ctx)->sse[3])); \ +} + +#define FLETCHER_4_SSE_SAVE_CTX(ctx) \ +{ \ + asm volatile("movdqu %%xmm0, %0" : "=m" ((ctx)->sse[0])); \ + asm volatile("movdqu %%xmm1, %0" : "=m" ((ctx)->sse[1])); \ + asm volatile("movdqu %%xmm2, %0" : "=m" ((ctx)->sse[2])); \ + asm volatile("movdqu %%xmm3, %0" : "=m" ((ctx)->sse[3])); \ +} + +static void +fletcher_4_sse2_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_SSE_RESTORE_CTX(ctx); + + asm volatile("pxor %xmm4, %xmm4"); + + for (; ip < ipend; ip += 2) { + asm volatile("movdqu %0, %%xmm5" :: "m"(*ip)); + asm volatile("movdqa %xmm5, %xmm6"); + asm volatile("punpckldq %xmm4, %xmm5"); + asm volatile("punpckhdq %xmm4, %xmm6"); + asm volatile("paddq %xmm5, %xmm0"); + asm volatile("paddq %xmm0, %xmm1"); + asm volatile("paddq %xmm1, %xmm2"); + asm volatile("paddq %xmm2, %xmm3"); + asm volatile("paddq %xmm6, %xmm0"); + asm volatile("paddq %xmm0, %xmm1"); + asm volatile("paddq %xmm1, %xmm2"); + asm volatile("paddq %xmm2, %xmm3"); + } + + FLETCHER_4_SSE_SAVE_CTX(ctx); + + kfpu_end(); +} + +static void +fletcher_4_sse2_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_SSE_RESTORE_CTX(ctx); + + for (; ip < ipend; ip += 2) { + uint32_t scratch1 = BSWAP_32(ip[0]); + uint32_t scratch2 = BSWAP_32(ip[1]); + asm volatile("movd %0, %%xmm5" :: "r"(scratch1)); + asm volatile("movd %0, %%xmm6" :: "r"(scratch2)); + asm volatile("punpcklqdq %xmm6, %xmm5"); + asm volatile("paddq %xmm5, %xmm0"); + asm volatile("paddq %xmm0, %xmm1"); + asm volatile("paddq %xmm1, %xmm2"); + asm volatile("paddq %xmm2, %xmm3"); + } + + FLETCHER_4_SSE_SAVE_CTX(ctx); + + kfpu_end(); +} + +static boolean_t fletcher_4_sse2_valid(void) +{ + return (kfpu_allowed() && zfs_sse2_available()); +} + +const fletcher_4_ops_t fletcher_4_sse2_ops = { + .init_native = fletcher_4_sse2_init, + .fini_native = fletcher_4_sse2_fini, + .compute_native = fletcher_4_sse2_native, + .init_byteswap = fletcher_4_sse2_init, + .fini_byteswap = fletcher_4_sse2_fini, + .compute_byteswap = fletcher_4_sse2_byteswap, + .valid = fletcher_4_sse2_valid, + .name = "sse2" +}; + +#endif /* defined(HAVE_SSE2) */ + +#if defined(HAVE_SSE2) && defined(HAVE_SSSE3) +static void +fletcher_4_ssse3_byteswap(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size) +{ + static const zfs_fletcher_sse_t mask = { + .v = { 0x0405060700010203, 0x0C0D0E0F08090A0B } + }; + + const uint64_t *ip = buf; + const uint64_t *ipend = (uint64_t *)((uint8_t *)ip + size); + + kfpu_begin(); + + FLETCHER_4_SSE_RESTORE_CTX(ctx); + + asm volatile("movdqu %0, %%xmm7"::"m" (mask)); + asm volatile("pxor %xmm4, %xmm4"); + + for (; ip < ipend; ip += 2) { + asm 
volatile("movdqu %0, %%xmm5"::"m" (*ip)); + asm volatile("pshufb %xmm7, %xmm5"); + asm volatile("movdqa %xmm5, %xmm6"); + asm volatile("punpckldq %xmm4, %xmm5"); + asm volatile("punpckhdq %xmm4, %xmm6"); + asm volatile("paddq %xmm5, %xmm0"); + asm volatile("paddq %xmm0, %xmm1"); + asm volatile("paddq %xmm1, %xmm2"); + asm volatile("paddq %xmm2, %xmm3"); + asm volatile("paddq %xmm6, %xmm0"); + asm volatile("paddq %xmm0, %xmm1"); + asm volatile("paddq %xmm1, %xmm2"); + asm volatile("paddq %xmm2, %xmm3"); + } + + FLETCHER_4_SSE_SAVE_CTX(ctx); + + kfpu_end(); +} + +static boolean_t fletcher_4_ssse3_valid(void) +{ + return (kfpu_allowed() && zfs_sse2_available() && + zfs_ssse3_available()); +} + +const fletcher_4_ops_t fletcher_4_ssse3_ops = { + .init_native = fletcher_4_sse2_init, + .fini_native = fletcher_4_sse2_fini, + .compute_native = fletcher_4_sse2_native, + .init_byteswap = fletcher_4_sse2_init, + .fini_byteswap = fletcher_4_sse2_fini, + .compute_byteswap = fletcher_4_ssse3_byteswap, + .valid = fletcher_4_ssse3_valid, + .name = "ssse3" +}; + +#endif /* defined(HAVE_SSE2) && defined(HAVE_SSSE3) */ diff --git a/module/zcommon/zfs_fletcher_superscalar.c b/module/zcommon/zfs_fletcher_superscalar.c new file mode 100644 index 000000000000..153f5c7d75e3 --- /dev/null +++ b/module/zcommon/zfs_fletcher_superscalar.c @@ -0,0 +1,163 @@ +/* + * Implement fast Fletcher4 using superscalar pipelines. + * + * Use regular C code to compute + * Fletcher4 in two incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. + * This implementation is a derivative of the AVX SIMD implementation by + * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). + * + * Copyright (C) 2016 Romain Dolbeau. + * + * Authors: + * Romain Dolbeau + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include + +static void +fletcher_4_superscalar_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t)); +} + +static void +fletcher_4_superscalar_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1]; + B = 2 * ctx->superscalar[1].v[0] + 2 * ctx->superscalar[1].v[1] - + ctx->superscalar[0].v[1]; + C = 4 * ctx->superscalar[2].v[0] - ctx->superscalar[1].v[0] + + 4 * ctx->superscalar[2].v[1] - 3 * ctx->superscalar[1].v[1]; + D = 8 * ctx->superscalar[3].v[0] - 4 * ctx->superscalar[2].v[0] + + 8 * ctx->superscalar[3].v[1] - 8 * ctx->superscalar[2].v[1] + + ctx->superscalar[1].v[1]; + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +static void +fletcher_4_superscalar_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + + for (; ip < ipend; ip += 2) { + a += ip[0]; + a2 += ip[1]; + b += a; + b2 += a2; + c += b; + c2 += b2; + d += c; + d2 += c2; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; +} + +static void +fletcher_4_superscalar_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + + for (; ip < ipend; ip += 2) { + a += BSWAP_32(ip[0]); + a2 += BSWAP_32(ip[1]); + b += a; + b2 += a2; + c += b; + c2 += b2; + d += c; + d2 += c2; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; +} + +static boolean_t fletcher_4_superscalar_valid(void) +{ + return (B_TRUE); +} + +const fletcher_4_ops_t fletcher_4_superscalar_ops = { + .init_native = fletcher_4_superscalar_init, + .compute_native = fletcher_4_superscalar_native, + .fini_native = fletcher_4_superscalar_fini, + .init_byteswap = fletcher_4_superscalar_init, + .compute_byteswap = fletcher_4_superscalar_byteswap, + .fini_byteswap = fletcher_4_superscalar_fini, + .valid = fletcher_4_superscalar_valid, + .name = "superscalar" +}; diff --git a/module/zcommon/zfs_fletcher_superscalar4.c b/module/zcommon/zfs_fletcher_superscalar4.c new file mode 100644 index 000000000000..75e6a3baf980 --- /dev/null +++ b/module/zcommon/zfs_fletcher_superscalar4.c @@ -0,0 +1,229 @@ +/* + * Implement fast Fletcher4 using superscalar pipelines. 
+ * + * Use regular C code to compute + * Fletcher4 in four incremental 64-bit parallel accumulator streams, + * and then combine the streams to form the final four checksum words. + * This implementation is a derivative of the AVX SIMD implementation by + * James Guilford and Jinshan Xiong from Intel (see zfs_fletcher_intel.c). + * + * Copyright (C) 2016 Romain Dolbeau. + * + * Authors: + * Romain Dolbeau + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include + +static void +fletcher_4_superscalar4_init(fletcher_4_ctx_t *ctx) +{ + bzero(ctx->superscalar, 4 * sizeof (zfs_fletcher_superscalar_t)); +} + +static void +fletcher_4_superscalar4_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) +{ + uint64_t A, B, C, D; + + A = ctx->superscalar[0].v[0] + ctx->superscalar[0].v[1] + + ctx->superscalar[0].v[2] + ctx->superscalar[0].v[3]; + B = 0 - ctx->superscalar[0].v[1] - 2 * ctx->superscalar[0].v[2] - + 3 * ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] + + 4 * ctx->superscalar[1].v[1] + 4 * ctx->superscalar[1].v[2] + + 4 * ctx->superscalar[1].v[3]; + + C = ctx->superscalar[0].v[2] + 3 * ctx->superscalar[0].v[3] - + 6 * ctx->superscalar[1].v[0] - 10 * ctx->superscalar[1].v[1] - + 14 * ctx->superscalar[1].v[2] - 18 * ctx->superscalar[1].v[3] + + 16 * ctx->superscalar[2].v[0] + 16 * ctx->superscalar[2].v[1] + + 16 * ctx->superscalar[2].v[2] + 16 * ctx->superscalar[2].v[3]; + + D = 0 - ctx->superscalar[0].v[3] + 4 * ctx->superscalar[1].v[0] + + 10 * ctx->superscalar[1].v[1] + 20 * ctx->superscalar[1].v[2] + + 34 * ctx->superscalar[1].v[3] - 48 * ctx->superscalar[2].v[0] - + 64 * ctx->superscalar[2].v[1] - 80 * ctx->superscalar[2].v[2] - + 96 * ctx->superscalar[2].v[3] + 64 * ctx->superscalar[3].v[0] + + 64 * ctx->superscalar[3].v[1] + 64 * ctx->superscalar[3].v[2] + + 64 * ctx->superscalar[3].v[3]; + + ZIO_SET_CHECKSUM(zcp, A, B, C, D); +} + +static void +fletcher_4_superscalar4_native(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + uint64_t a3, b3, c3, d3; + uint64_t a4, b4, c4, d4; + + a = ctx->superscalar[0].v[0]; + b = 
ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + a3 = ctx->superscalar[0].v[2]; + b3 = ctx->superscalar[1].v[2]; + c3 = ctx->superscalar[2].v[2]; + d3 = ctx->superscalar[3].v[2]; + a4 = ctx->superscalar[0].v[3]; + b4 = ctx->superscalar[1].v[3]; + c4 = ctx->superscalar[2].v[3]; + d4 = ctx->superscalar[3].v[3]; + + for (; ip < ipend; ip += 4) { + a += ip[0]; + a2 += ip[1]; + a3 += ip[2]; + a4 += ip[3]; + b += a; + b2 += a2; + b3 += a3; + b4 += a4; + c += b; + c2 += b2; + c3 += b3; + c4 += b4; + d += c; + d2 += c2; + d3 += c3; + d4 += c4; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; + ctx->superscalar[0].v[2] = a3; + ctx->superscalar[1].v[2] = b3; + ctx->superscalar[2].v[2] = c3; + ctx->superscalar[3].v[2] = d3; + ctx->superscalar[0].v[3] = a4; + ctx->superscalar[1].v[3] = b4; + ctx->superscalar[2].v[3] = c4; + ctx->superscalar[3].v[3] = d4; +} + +static void +fletcher_4_superscalar4_byteswap(fletcher_4_ctx_t *ctx, + const void *buf, uint64_t size) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + uint64_t a2, b2, c2, d2; + uint64_t a3, b3, c3, d3; + uint64_t a4, b4, c4, d4; + + a = ctx->superscalar[0].v[0]; + b = ctx->superscalar[1].v[0]; + c = ctx->superscalar[2].v[0]; + d = ctx->superscalar[3].v[0]; + a2 = ctx->superscalar[0].v[1]; + b2 = ctx->superscalar[1].v[1]; + c2 = ctx->superscalar[2].v[1]; + d2 = ctx->superscalar[3].v[1]; + a3 = ctx->superscalar[0].v[2]; + b3 = ctx->superscalar[1].v[2]; + c3 = ctx->superscalar[2].v[2]; + d3 = ctx->superscalar[3].v[2]; + a4 = ctx->superscalar[0].v[3]; + b4 = ctx->superscalar[1].v[3]; + c4 = ctx->superscalar[2].v[3]; + d4 = ctx->superscalar[3].v[3]; + + for (; ip < ipend; ip += 4) { + a += BSWAP_32(ip[0]); + a2 += BSWAP_32(ip[1]); + a3 += BSWAP_32(ip[2]); + a4 += BSWAP_32(ip[3]); + b += a; + b2 += a2; + b3 += a3; + b4 += a4; + c += b; + c2 += b2; + c3 += b3; + c4 += b4; + d += c; + d2 += c2; + d3 += c3; + d4 += c4; + } + + ctx->superscalar[0].v[0] = a; + ctx->superscalar[1].v[0] = b; + ctx->superscalar[2].v[0] = c; + ctx->superscalar[3].v[0] = d; + ctx->superscalar[0].v[1] = a2; + ctx->superscalar[1].v[1] = b2; + ctx->superscalar[2].v[1] = c2; + ctx->superscalar[3].v[1] = d2; + ctx->superscalar[0].v[2] = a3; + ctx->superscalar[1].v[2] = b3; + ctx->superscalar[2].v[2] = c3; + ctx->superscalar[3].v[2] = d3; + ctx->superscalar[0].v[3] = a4; + ctx->superscalar[1].v[3] = b4; + ctx->superscalar[2].v[3] = c4; + ctx->superscalar[3].v[3] = d4; +} + +static boolean_t fletcher_4_superscalar4_valid(void) +{ + return (B_TRUE); +} + +const fletcher_4_ops_t fletcher_4_superscalar4_ops = { + .init_native = fletcher_4_superscalar4_init, + .compute_native = fletcher_4_superscalar4_native, + .fini_native = fletcher_4_superscalar4_fini, + .init_byteswap = fletcher_4_superscalar4_init, + .compute_byteswap = fletcher_4_superscalar4_byteswap, + .fini_byteswap = fletcher_4_superscalar4_fini, + .valid = fletcher_4_superscalar4_valid, + .name = "superscalar4" +}; diff --git a/module/zcommon/zfs_namecheck.c b/module/zcommon/zfs_namecheck.c new file mode 100644 index 000000000000..f8625042a74c --- /dev/null +++ 
b/module/zcommon/zfs_namecheck.c
@@ -0,0 +1,471 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * Copyright (c) 2013, 2016 by Delphix. All rights reserved.
+ */
+
+/*
+ * Common name validation routines for ZFS. These routines are shared by the
+ * userland code as well as the ioctl() layer to ensure that we don't
+ * inadvertently expose a hole through direct ioctl()s that never gets tested.
+ * In userland, however, we want significantly more information about _why_ the
+ * name is invalid. In the kernel, we only care whether it's valid or not.
+ * Each routine therefore takes a 'namecheck_err_t' which describes exactly why
+ * the name failed to validate.
+ */
+
+#if !defined(_KERNEL)
+#include <string.h>
+#endif
+
+#include <sys/dsl_dir.h>
+#include <sys/param.h>
+#include <sys/nvpair.h>
+#include "zfs_namecheck.h"
+#include "zfs_deleg.h"
+
+/*
+ * Deeply nested datasets can overflow the stack, so we put a limit
+ * on the amount of nesting a path can have. zfs_max_dataset_nesting
+ * can be tuned temporarily to fix existing datasets that exceed our
+ * predefined limit.
+ */
+int zfs_max_dataset_nesting = 50;
+
+static int
+valid_char(char c)
+{
+	return ((c >= 'a' && c <= 'z') ||
+	    (c >= 'A' && c <= 'Z') ||
+	    (c >= '0' && c <= '9') ||
+	    c == '-' || c == '_' || c == '.' || c == ':' || c == ' ');
+}
+
+/*
+ * Looks at a path and returns its level of nesting (depth).
+ */
+int
+get_dataset_depth(const char *path)
+{
+	const char *loc = path;
+	int nesting = 0;
+
+	/*
+	 * Keep track of nesting until you hit the end of the
+	 * path or find the snapshot/bookmark separator.
+	 */
+	for (int i = 0; loc[i] != '\0' &&
+	    loc[i] != '@' &&
+	    loc[i] != '#'; i++) {
+		if (loc[i] == '/')
+			nesting++;
+	}
+
+	return (nesting);
+}
+
+/*
+ * Snapshot names must be made up of alphanumeric characters plus the following
+ * characters:
+ *
+ * [-_.: ]
+ *
+ * Returns 0 on success, -1 on error.
+ */
+int
+zfs_component_namecheck(const char *path, namecheck_err_t *why, char *what)
+{
+	const char *loc;
+
+	if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) {
+		if (why)
+			*why = NAME_ERR_TOOLONG;
+		return (-1);
+	}
+
+	if (path[0] == '\0') {
+		if (why)
+			*why = NAME_ERR_EMPTY_COMPONENT;
+		return (-1);
+	}
+
+	for (loc = path; *loc; loc++) {
+		if (!valid_char(*loc)) {
+			if (why) {
+				*why = NAME_ERR_INVALCHAR;
+				*what = *loc;
+			}
+			return (-1);
+		}
+	}
+	return (0);
+}
+
+
+/*
+ * Permissions set name must start with the character '@' followed by the
+ * same character restrictions as snapshot names, except that the name
+ * cannot exceed 64 characters.
+ *
+ * Returns 0 on success, -1 on error.
+ */ +int +permset_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + if (strlen(path) >= ZFS_PERMSET_MAXLEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + if (path[0] != '@') { + if (why) { + *why = NAME_ERR_NO_AT; + *what = path[0]; + } + return (-1); + } + + return (zfs_component_namecheck(&path[1], why, what)); +} + +/* + * Dataset paths should not be deeper than zfs_max_dataset_nesting + * in terms of nesting. + * + * Returns 0 on success, -1 on error. + */ +int +dataset_nestcheck(const char *path) +{ + return ((get_dataset_depth(path) < zfs_max_dataset_nesting) ? 0 : -1); +} + +/* + * Entity names must be of the following form: + * + * [component/]*[component][(@|#)component]? + * + * Where each component is made up of alphanumeric characters plus the following + * characters: + * + * [-_.: %] + * + * We allow '%' here as we use that character internally to create unique + * names for temporary clones (for online recv). + * + * Returns 0 on success, -1 on error. + */ +int +entity_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + const char *end; + + EQUIV(why == NULL, what == NULL); + + /* + * Make sure the name is not too long. + */ + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + /* Explicitly check for a leading slash. */ + if (path[0] == '/') { + if (why) + *why = NAME_ERR_LEADING_SLASH; + return (-1); + } + + if (path[0] == '\0') { + if (why) + *why = NAME_ERR_EMPTY_COMPONENT; + return (-1); + } + + const char *start = path; + boolean_t found_delim = B_FALSE; + for (;;) { + /* Find the end of this component */ + end = start; + while (*end != '/' && *end != '@' && *end != '#' && + *end != '\0') + end++; + + if (*end == '\0' && end[-1] == '/') { + /* trailing slashes are not allowed */ + if (why) + *why = NAME_ERR_TRAILING_SLASH; + return (-1); + } + + /* Validate the contents of this component */ + for (const char *loc = start; loc != end; loc++) { + if (!valid_char(*loc) && *loc != '%') { + if (why) { + *why = NAME_ERR_INVALCHAR; + *what = *loc; + } + return (-1); + } + } + + if (*end == '\0' || *end == '/') { + int component_length = end - start; + /* Validate the contents of this component is not '.' */ + if (component_length == 1) { + if (start[0] == '.') { + if (why) + *why = NAME_ERR_SELF_REF; + return (-1); + } + } + + /* Validate the content of this component is not '..' */ + if (component_length == 2) { + if (start[0] == '.' 
&& start[1] == '.') { + if (why) + *why = NAME_ERR_PARENT_REF; + return (-1); + } + } + } + + /* Snapshot or bookmark delimiter found */ + if (*end == '@' || *end == '#') { + /* Multiple delimiters are not allowed */ + if (found_delim != 0) { + if (why) + *why = NAME_ERR_MULTIPLE_DELIMITERS; + return (-1); + } + + found_delim = B_TRUE; + } + + /* Zero-length components are not allowed */ + if (start == end) { + if (why) + *why = NAME_ERR_EMPTY_COMPONENT; + return (-1); + } + + /* If we've reached the end of the string, we're OK */ + if (*end == '\0') + return (0); + + /* + * If there is a '/' in a snapshot or bookmark name + * then report an error + */ + if (*end == '/' && found_delim != 0) { + if (why) + *why = NAME_ERR_TRAILING_SLASH; + return (-1); + } + + /* Update to the next component */ + start = end + 1; + } +} + +/* + * Dataset is any entity, except bookmark + */ +int +dataset_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + int ret = entity_namecheck(path, why, what); + + if (ret == 0 && strchr(path, '#') != NULL) { + if (why != NULL) { + *why = NAME_ERR_INVALCHAR; + *what = '#'; + } + return (-1); + } + + return (ret); +} + +/* + * Assert path is a valid bookmark name + */ +int +bookmark_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + int ret = entity_namecheck(path, why, what); + + if (ret == 0 && strchr(path, '#') == NULL) { + if (why != NULL) { + *why = NAME_ERR_NO_POUND; + *what = '#'; + } + return (-1); + } + + return (ret); +} + +/* + * Assert path is a valid snapshot name + */ +int +snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) +{ + int ret = entity_namecheck(path, why, what); + + if (ret == 0 && strchr(path, '@') == NULL) { + if (why != NULL) { + *why = NAME_ERR_NO_AT; + *what = '@'; + } + return (-1); + } + + return (ret); +} + +/* + * mountpoint names must be of the following form: + * + * /[component][/]*[component][/] + * + * Returns 0 on success, -1 on error. + */ +int +mountpoint_namecheck(const char *path, namecheck_err_t *why) +{ + const char *start, *end; + + /* + * Make sure none of the mountpoint component names are too long. + * If a component name is too long then the mkdir of the mountpoint + * will fail but then the mountpoint property will be set to a value + * that can never be mounted. Better to fail before setting the prop. + * Extra slashes are OK, they will be tossed by the mountpoint mkdir. + */ + + if (path == NULL || *path != '/') { + if (why) + *why = NAME_ERR_LEADING_SLASH; + return (-1); + } + + /* Skip leading slash */ + start = &path[1]; + do { + end = start; + while (*end != '/' && *end != '\0') + end++; + + if (end - start >= ZFS_MAX_DATASET_NAME_LEN) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + start = end + 1; + + } while (*end != '\0'); + + return (0); +} + +/* + * For pool names, we have the same set of valid characters as described in + * dataset names, with the additional restriction that the pool name must begin + * with a letter. The pool names 'raidz' and 'mirror' are also reserved names + * that cannot be used. + * + * Returns 0 on success, -1 on error. + */ +int +pool_namecheck(const char *pool, namecheck_err_t *why, char *what) +{ + const char *c; + + /* + * Make sure the name is not too long. + * If we're creating a pool with version >= SPA_VERSION_DSL_SCRUB (v11) + * we need to account for additional space needed by the origin ds which + * will also be snapshotted: "poolname"+"/"+"$ORIGIN"+"@"+"$ORIGIN". 
+ * Play it safe and enforce this limit even if the pool version is < 11 + * so it can be upgraded without issues. + */ + if (strlen(pool) >= (ZFS_MAX_DATASET_NAME_LEN - 2 - + strlen(ORIGIN_DIR_NAME) * 2)) { + if (why) + *why = NAME_ERR_TOOLONG; + return (-1); + } + + c = pool; + while (*c != '\0') { + if (!valid_char(*c)) { + if (why) { + *why = NAME_ERR_INVALCHAR; + *what = *c; + } + return (-1); + } + c++; + } + + if (!(*pool >= 'a' && *pool <= 'z') && + !(*pool >= 'A' && *pool <= 'Z')) { + if (why) + *why = NAME_ERR_NOLETTER; + return (-1); + } + + if (strcmp(pool, "mirror") == 0 || strcmp(pool, "raidz") == 0) { + if (why) + *why = NAME_ERR_RESERVED; + return (-1); + } + + if (pool[0] == 'c' && (pool[1] >= '0' && pool[1] <= '9')) { + if (why) + *why = NAME_ERR_DISKLIKE; + return (-1); + } + + return (0); +} + +EXPORT_SYMBOL(entity_namecheck); +EXPORT_SYMBOL(pool_namecheck); +EXPORT_SYMBOL(dataset_namecheck); +EXPORT_SYMBOL(bookmark_namecheck); +EXPORT_SYMBOL(snapshot_namecheck); +EXPORT_SYMBOL(zfs_component_namecheck); +EXPORT_SYMBOL(dataset_nestcheck); +EXPORT_SYMBOL(get_dataset_depth); +EXPORT_SYMBOL(zfs_max_dataset_nesting); + +ZFS_MODULE_PARAM(zfs, zfs_, max_dataset_nesting, INT, ZMOD_RW, + "Limit to the amount of nesting a path can have. Defaults to 50."); diff --git a/module/zcommon/zfs_prop.c b/module/zcommon/zfs_prop.c new file mode 100644 index 000000000000..837b8ae71b34 --- /dev/null +++ b/module/zcommon/zfs_prop.c @@ -0,0 +1,1052 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright 2016, Joyent, Inc. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_fletcher.h" + +#if !defined(_KERNEL) +#include +#include +#include +#endif + +static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; + +/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ +const char *zfs_userquota_prop_prefixes[] = { + "userused@", + "userquota@", + "groupused@", + "groupquota@", + "userobjused@", + "userobjquota@", + "groupobjused@", + "groupobjquota@", + "projectused@", + "projectquota@", + "projectobjused@", + "projectobjquota@" +}; + +zprop_desc_t * +zfs_prop_get_table(void) +{ + return (zfs_prop_table); +} + +void +zfs_prop_init(void) +{ + static zprop_index_t checksum_table[] = { + { "on", ZIO_CHECKSUM_ON }, + { "off", ZIO_CHECKSUM_OFF }, + { "fletcher2", ZIO_CHECKSUM_FLETCHER_2 }, + { "fletcher4", ZIO_CHECKSUM_FLETCHER_4 }, + { "sha256", ZIO_CHECKSUM_SHA256 }, + { "noparity", ZIO_CHECKSUM_NOPARITY }, + { "sha512", ZIO_CHECKSUM_SHA512 }, + { "skein", ZIO_CHECKSUM_SKEIN }, +#if !defined(__FreeBSD__) + + { "edonr", ZIO_CHECKSUM_EDONR }, +#endif + { NULL } + }; + + static zprop_index_t dedup_table[] = { + { "on", ZIO_CHECKSUM_ON }, + { "off", ZIO_CHECKSUM_OFF }, + { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, + { "sha256", ZIO_CHECKSUM_SHA256 }, + { "sha256,verify", + ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, + { "sha512", ZIO_CHECKSUM_SHA512 }, + { "sha512,verify", + ZIO_CHECKSUM_SHA512 | ZIO_CHECKSUM_VERIFY }, + { "skein", ZIO_CHECKSUM_SKEIN }, + { "skein,verify", + ZIO_CHECKSUM_SKEIN | ZIO_CHECKSUM_VERIFY }, +#if !defined(__FreeBSD__) + + { "edonr,verify", + ZIO_CHECKSUM_EDONR | ZIO_CHECKSUM_VERIFY }, +#endif + { NULL } + }; + + static zprop_index_t compress_table[] = { + { "on", ZIO_COMPRESS_ON }, + { "off", ZIO_COMPRESS_OFF }, + { "lzjb", ZIO_COMPRESS_LZJB }, + { "gzip", ZIO_COMPRESS_GZIP_6 }, /* gzip default */ + { "gzip-1", ZIO_COMPRESS_GZIP_1 }, + { "gzip-2", ZIO_COMPRESS_GZIP_2 }, + { "gzip-3", ZIO_COMPRESS_GZIP_3 }, + { "gzip-4", ZIO_COMPRESS_GZIP_4 }, + { "gzip-5", ZIO_COMPRESS_GZIP_5 }, + { "gzip-6", ZIO_COMPRESS_GZIP_6 }, + { "gzip-7", ZIO_COMPRESS_GZIP_7 }, + { "gzip-8", ZIO_COMPRESS_GZIP_8 }, + { "gzip-9", ZIO_COMPRESS_GZIP_9 }, + { "zle", ZIO_COMPRESS_ZLE }, + { "lz4", ZIO_COMPRESS_LZ4 }, + { "zstd", ZIO_COMPRESS_ZSTD }, + { "zstd-fast", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_DEFAULT) }, + + /* + * ZSTD 1-19 are synthetic. We store the compression level in a + * separate hidden property to avoid wasting a large amount of + * space in the ZIO_COMPRESS enum. + * + * The compression level is also stored within the header of the + * compressed block since we may need it for later recompression + * to avoid checksum errors (L2ARC). + * + * Note that the level here is defined as bit shifted mask on + * top of the method. 
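+	 *
+	 * For example, the "zstd-3" entry below is
+	 * ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3): the ZSTD method bits
+	 * with the level value shifted in above the method field.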
+ */ + { "zstd-1", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_1) }, + { "zstd-2", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_2) }, + { "zstd-3", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_3) }, + { "zstd-4", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_4) }, + { "zstd-5", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_5) }, + { "zstd-6", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_6) }, + { "zstd-7", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_7) }, + { "zstd-8", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_8) }, + { "zstd-9", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_9) }, + { "zstd-10", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_10) }, + { "zstd-11", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_11) }, + { "zstd-12", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_12) }, + { "zstd-13", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_13) }, + { "zstd-14", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_14) }, + { "zstd-15", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_15) }, + { "zstd-16", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_16) }, + { "zstd-17", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_17) }, + { "zstd-18", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_18) }, + { "zstd-19", ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_19) }, + + /* + * The ZSTD-Fast levels are also synthetic. + */ + { "zstd-fast-1", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1) }, + { "zstd-fast-2", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_2) }, + { "zstd-fast-3", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_3) }, + { "zstd-fast-4", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_4) }, + { "zstd-fast-5", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_5) }, + { "zstd-fast-6", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_6) }, + { "zstd-fast-7", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_7) }, + { "zstd-fast-8", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_8) }, + { "zstd-fast-9", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_9) }, + { "zstd-fast-10", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_10) }, + { "zstd-fast-20", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_20) }, + { "zstd-fast-30", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_30) }, + { "zstd-fast-40", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_40) }, + { "zstd-fast-50", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_50) }, + { "zstd-fast-60", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_60) }, + { "zstd-fast-70", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_70) }, + { "zstd-fast-80", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_80) }, + { "zstd-fast-90", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_90) }, + { "zstd-fast-100", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_100) }, + { "zstd-fast-500", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_500) }, + { "zstd-fast-1000", + ZIO_COMPLEVEL_ZSTD(ZIO_ZSTD_LEVEL_FAST_1000) }, + { NULL } + }; + + static zprop_index_t crypto_table[] = { + { "on", ZIO_CRYPT_ON }, + { "off", ZIO_CRYPT_OFF }, + { "aes-128-ccm", ZIO_CRYPT_AES_128_CCM }, + { "aes-192-ccm", ZIO_CRYPT_AES_192_CCM }, + { "aes-256-ccm", ZIO_CRYPT_AES_256_CCM }, + { "aes-128-gcm", ZIO_CRYPT_AES_128_GCM }, + { "aes-192-gcm", ZIO_CRYPT_AES_192_GCM }, + { "aes-256-gcm", ZIO_CRYPT_AES_256_GCM }, + { NULL } + }; + + static zprop_index_t keyformat_table[] = { + { "none", ZFS_KEYFORMAT_NONE }, + { "raw", ZFS_KEYFORMAT_RAW }, + { "hex", ZFS_KEYFORMAT_HEX }, + { "passphrase", ZFS_KEYFORMAT_PASSPHRASE }, + { NULL } + }; + + static zprop_index_t snapdir_table[] = { + { "hidden", ZFS_SNAPDIR_HIDDEN }, + { "visible", ZFS_SNAPDIR_VISIBLE }, + { NULL } + }; + + static zprop_index_t snapdev_table[] = { + { "hidden", ZFS_SNAPDEV_HIDDEN }, + { "visible", ZFS_SNAPDEV_VISIBLE }, + { NULL } + }; + + static zprop_index_t acl_mode_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "groupmask", ZFS_ACL_GROUPMASK }, + { "passthrough", ZFS_ACL_PASSTHROUGH 
}, + { "restricted", ZFS_ACL_RESTRICTED }, + { NULL } + }; + + static zprop_index_t acltype_table[] = { + { "off", ZFS_ACLTYPE_OFF }, + { "disabled", ZFS_ACLTYPE_OFF }, + { "noacl", ZFS_ACLTYPE_OFF }, + { "posixacl", ZFS_ACLTYPE_POSIXACL }, + { NULL } + }; + + static zprop_index_t acl_inherit_table[] = { + { "discard", ZFS_ACL_DISCARD }, + { "noallow", ZFS_ACL_NOALLOW }, + { "restricted", ZFS_ACL_RESTRICTED }, + { "passthrough", ZFS_ACL_PASSTHROUGH }, + { "secure", ZFS_ACL_RESTRICTED }, /* bkwrd compatibility */ + { "passthrough-x", ZFS_ACL_PASSTHROUGH_X }, + { NULL } + }; + + static zprop_index_t case_table[] = { + { "sensitive", ZFS_CASE_SENSITIVE }, + { "insensitive", ZFS_CASE_INSENSITIVE }, + { "mixed", ZFS_CASE_MIXED }, + { NULL } + }; + + static zprop_index_t copies_table[] = { + { "1", 1 }, + { "2", 2 }, + { "3", 3 }, + { NULL } + }; + + /* + * Use the unique flags we have to send to u8_strcmp() and/or + * u8_textprep() to represent the various normalization property + * values. + */ + static zprop_index_t normalize_table[] = { + { "none", 0 }, + { "formD", U8_TEXTPREP_NFD }, + { "formKC", U8_TEXTPREP_NFKC }, + { "formC", U8_TEXTPREP_NFC }, + { "formKD", U8_TEXTPREP_NFKD }, + { NULL } + }; + + static zprop_index_t version_table[] = { + { "1", 1 }, + { "2", 2 }, + { "3", 3 }, + { "4", 4 }, + { "5", 5 }, + { "current", ZPL_VERSION }, + { NULL } + }; + + static zprop_index_t boolean_table[] = { + { "off", 0 }, + { "on", 1 }, + { NULL } + }; + + static zprop_index_t keystatus_table[] = { + { "none", ZFS_KEYSTATUS_NONE}, + { "unavailable", ZFS_KEYSTATUS_UNAVAILABLE}, + { "available", ZFS_KEYSTATUS_AVAILABLE}, + { NULL } + }; + + static zprop_index_t logbias_table[] = { + { "latency", ZFS_LOGBIAS_LATENCY }, + { "throughput", ZFS_LOGBIAS_THROUGHPUT }, + { NULL } + }; + + static zprop_index_t canmount_table[] = { + { "off", ZFS_CANMOUNT_OFF }, + { "on", ZFS_CANMOUNT_ON }, + { "noauto", ZFS_CANMOUNT_NOAUTO }, + { NULL } + }; + + static zprop_index_t cache_table[] = { + { "none", ZFS_CACHE_NONE }, + { "metadata", ZFS_CACHE_METADATA }, + { "all", ZFS_CACHE_ALL }, + { NULL } + }; + + static zprop_index_t sync_table[] = { + { "standard", ZFS_SYNC_STANDARD }, + { "always", ZFS_SYNC_ALWAYS }, + { "disabled", ZFS_SYNC_DISABLED }, + { NULL } + }; + + static zprop_index_t xattr_table[] = { + { "off", ZFS_XATTR_OFF }, + { "on", ZFS_XATTR_DIR }, + { "sa", ZFS_XATTR_SA }, + { "dir", ZFS_XATTR_DIR }, + { NULL } + }; + + static zprop_index_t dnsize_table[] = { + { "legacy", ZFS_DNSIZE_LEGACY }, + { "auto", ZFS_DNSIZE_AUTO }, + { "1k", ZFS_DNSIZE_1K }, + { "2k", ZFS_DNSIZE_2K }, + { "4k", ZFS_DNSIZE_4K }, + { "8k", ZFS_DNSIZE_8K }, + { "16k", ZFS_DNSIZE_16K }, + { NULL } + }; + + static zprop_index_t redundant_metadata_table[] = { + { "all", ZFS_REDUNDANT_METADATA_ALL }, + { "most", ZFS_REDUNDANT_METADATA_MOST }, + { NULL } + }; + + static zprop_index_t volmode_table[] = { + { "default", ZFS_VOLMODE_DEFAULT }, + { "full", ZFS_VOLMODE_GEOM }, + { "geom", ZFS_VOLMODE_GEOM }, + { "dev", ZFS_VOLMODE_DEV }, + { "none", ZFS_VOLMODE_NONE }, + { NULL } + }; + + /* inherit index properties */ + zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata", + ZFS_REDUNDANT_METADATA_ALL, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "all | most", "REDUND_MD", + redundant_metadata_table); + zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "standard | always | disabled", "SYNC", + sync_table); + 
zprop_register_index(ZFS_PROP_CHECKSUM, "checksum", + ZIO_CHECKSUM_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_VOLUME, +#if !defined(__FreeBSD__) + "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein" + " | edonr", +#else + "on | off | fletcher2 | fletcher4 | sha256 | sha512 | skein", +#endif + "CHECKSUM", checksum_table); + zprop_register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | verify | sha256[,verify] | sha512[,verify] | " +#if !defined(__FreeBSD__) + "skein[,verify] | edonr,verify", +#else + "skein[,verify]", +#endif + "DEDUP", dedup_table); + zprop_register_index(ZFS_PROP_COMPRESSION, "compression", + ZIO_COMPRESS_DEFAULT, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | lzjb | gzip | gzip-[1-9] | zle | lz4 | " + "zstd | zstd-[1-19] | " + "zstd-fast-[1-10,20,30,40,50,60,70,80,90,100,500,1000]", + "COMPRESS", compress_table); + zprop_register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "hidden | visible", "SNAPDIR", snapdir_table); + zprop_register_index(ZFS_PROP_SNAPDEV, "snapdev", ZFS_SNAPDEV_HIDDEN, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "hidden | visible", "SNAPDEV", snapdev_table); + zprop_register_index(ZFS_PROP_ACLMODE, "aclmode", ZFS_ACL_DISCARD, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | groupmask | passthrough | restricted", "ACLMODE", + acl_mode_table); +#ifndef __FreeBSD__ + zprop_register_index(ZFS_PROP_ACLTYPE, "acltype", ZFS_ACLTYPE_OFF, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "noacl | posixacl", "ACLTYPE", acltype_table); +#endif + zprop_register_index(ZFS_PROP_ACLINHERIT, "aclinherit", + ZFS_ACL_RESTRICTED, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "discard | noallow | restricted | passthrough | passthrough-x", + "ACLINHERIT", acl_inherit_table); + zprop_register_index(ZFS_PROP_COPIES, "copies", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "1 | 2 | 3", "COPIES", copies_table); + zprop_register_index(ZFS_PROP_PRIMARYCACHE, "primarycache", + ZFS_CACHE_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "all | none | metadata", "PRIMARYCACHE", cache_table); + zprop_register_index(ZFS_PROP_SECONDARYCACHE, "secondarycache", + ZFS_CACHE_ALL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, + "all | none | metadata", "SECONDARYCACHE", cache_table); + zprop_register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "latency | throughput", "LOGBIAS", logbias_table); + zprop_register_index(ZFS_PROP_XATTR, "xattr", ZFS_XATTR_DIR, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off | dir | sa", "XATTR", xattr_table); + zprop_register_index(ZFS_PROP_DNODESIZE, "dnodesize", + ZFS_DNSIZE_LEGACY, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "legacy | auto | 1k | 2k | 4k | 8k | 16k", "DNSIZE", dnsize_table); + zprop_register_index(ZFS_PROP_VOLMODE, "volmode", + ZFS_VOLMODE_DEFAULT, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "default | full | geom | dev | none", "VOLMODE", volmode_table); + + /* inherit index (boolean) properties */ + zprop_register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "ATIME", boolean_table); + zprop_register_index(ZFS_PROP_RELATIME, "relatime", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "RELATIME", boolean_table); + zprop_register_index(ZFS_PROP_DEVICES, 
"devices", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "DEVICES", + boolean_table); + zprop_register_index(ZFS_PROP_EXEC, "exec", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "EXEC", + boolean_table); + zprop_register_index(ZFS_PROP_SETUID, "setuid", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "SETUID", + boolean_table); + zprop_register_index(ZFS_PROP_READONLY, "readonly", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off", "RDONLY", + boolean_table); +#ifdef __FreeBSD__ + zprop_register_index(ZFS_PROP_ZONED, "jailed", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "JAILED", boolean_table); +#else + zprop_register_index(ZFS_PROP_ZONED, "zoned", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "ZONED", boolean_table); +#endif + zprop_register_index(ZFS_PROP_VSCAN, "vscan", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "VSCAN", boolean_table); + zprop_register_index(ZFS_PROP_NBMAND, "nbmand", 0, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, "on | off", "NBMAND", + boolean_table); + zprop_register_index(ZFS_PROP_OVERLAY, "overlay", 1, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "on | off", "OVERLAY", boolean_table); + + /* default index properties */ + zprop_register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "1 | 2 | 3 | 4 | 5 | current", "VERSION", version_table); + zprop_register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", + "CANMOUNT", canmount_table); + + /* readonly index properties */ + zprop_register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); + zprop_register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", + boolean_table); + zprop_register_index(ZFS_PROP_KEYSTATUS, "keystatus", + ZFS_KEYSTATUS_NONE, PROP_READONLY, ZFS_TYPE_DATASET, + "none | unavailable | available", + "KEYSTATUS", keystatus_table); + + /* set once index properties */ + zprop_register_index(ZFS_PROP_NORMALIZE, "normalization", 0, + PROP_ONETIME, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "none | formC | formD | formKC | formKD", "NORMALIZATION", + normalize_table); + zprop_register_index(ZFS_PROP_CASE, "casesensitivity", + ZFS_CASE_SENSITIVE, PROP_ONETIME, ZFS_TYPE_FILESYSTEM | + ZFS_TYPE_SNAPSHOT, + "sensitive | insensitive | mixed", "CASE", case_table); + zprop_register_index(ZFS_PROP_KEYFORMAT, "keyformat", + ZFS_KEYFORMAT_NONE, PROP_ONETIME_DEFAULT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "none | raw | hex | passphrase", "KEYFORMAT", keyformat_table); + zprop_register_index(ZFS_PROP_ENCRYPTION, "encryption", + ZIO_CRYPT_DEFAULT, PROP_ONETIME, ZFS_TYPE_DATASET, + "on | off | aes-128-ccm | aes-192-ccm | aes-256-ccm | " + "aes-128-gcm | aes-192-gcm | aes-256-gcm", "ENCRYPTION", + crypto_table); + + /* set once index (boolean) properties */ + zprop_register_index(ZFS_PROP_UTF8ONLY, "utf8only", 0, PROP_ONETIME, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "on | off", "UTF8ONLY", boolean_table); + + /* string properties */ + zprop_register_string(ZFS_PROP_ORIGIN, "origin", NULL, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "ORIGIN"); + zprop_register_string(ZFS_PROP_CLONES, "clones", NULL, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "[,...]", "CLONES"); + zprop_register_string(ZFS_PROP_MOUNTPOINT, "mountpoint", "/", + PROP_INHERIT, 
ZFS_TYPE_FILESYSTEM, " | legacy | none", + "MOUNTPOINT"); + zprop_register_string(ZFS_PROP_SHARENFS, "sharenfs", "off", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | share(1M) options", + "SHARENFS"); + zprop_register_string(ZFS_PROP_TYPE, "type", NULL, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, + "filesystem | volume | snapshot | bookmark", "TYPE"); + zprop_register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", + PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "on | off | sharemgr(1M) options", "SHARESMB"); + zprop_register_string(ZFS_PROP_MLSLABEL, "mlslabel", + ZFS_MLSLABEL_DEFAULT, PROP_INHERIT, ZFS_TYPE_DATASET, + "", "MLSLABEL"); + zprop_register_string(ZFS_PROP_SELINUX_CONTEXT, "context", + "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", + "CONTEXT"); + zprop_register_string(ZFS_PROP_SELINUX_FSCONTEXT, "fscontext", + "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", + "FSCONTEXT"); + zprop_register_string(ZFS_PROP_SELINUX_DEFCONTEXT, "defcontext", + "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", + "DEFCONTEXT"); + zprop_register_string(ZFS_PROP_SELINUX_ROOTCONTEXT, "rootcontext", + "none", PROP_DEFAULT, ZFS_TYPE_DATASET, "", + "ROOTCONTEXT"); + zprop_register_string(ZFS_PROP_RECEIVE_RESUME_TOKEN, + "receive_resume_token", + NULL, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "RESUMETOK"); + zprop_register_string(ZFS_PROP_ENCRYPTION_ROOT, "encryptionroot", NULL, + PROP_READONLY, ZFS_TYPE_DATASET, "", + "ENCROOT"); + zprop_register_string(ZFS_PROP_KEYLOCATION, "keylocation", + "none", PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "prompt | ", "KEYLOCATION"); + zprop_register_string(ZFS_PROP_REDACT_SNAPS, + "redact_snaps", NULL, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "[,...]", + "RSNAPS"); + + /* readonly number properties */ + zprop_register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "", "USED"); + zprop_register_number(ZFS_PROP_AVAILABLE, "available", 0, PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "AVAIL"); + zprop_register_number(ZFS_PROP_REFERENCED, "referenced", 0, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", + "REFER"); + zprop_register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, + "<1.00x or higher if compressed>", "RATIO"); + zprop_register_number(ZFS_PROP_REFRATIO, "refcompressratio", 0, + PROP_READONLY, ZFS_TYPE_DATASET, + "<1.00x or higher if compressed>", "REFRATIO"); + zprop_register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", + ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, + ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); + zprop_register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + "USEDSNAP"); + zprop_register_number(ZFS_PROP_USEDDS, "usedbydataset", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + "USEDDS"); + zprop_register_number(ZFS_PROP_USEDCHILD, "usedbychildren", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + "USEDCHILD"); + zprop_register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, + PROP_READONLY, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); + zprop_register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "", "USERREFS"); + zprop_register_number(ZFS_PROP_WRITTEN, "written", 0, PROP_READONLY, + ZFS_TYPE_DATASET, "", "WRITTEN"); + zprop_register_number(ZFS_PROP_LOGICALUSED, "logicalused", 0, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", + 
"LUSED"); + zprop_register_number(ZFS_PROP_LOGICALREFERENCED, "logicalreferenced", + 0, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", + "LREFER"); + zprop_register_number(ZFS_PROP_FILESYSTEM_COUNT, "filesystem_count", + UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM, + "", "FSCOUNT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_COUNT, "snapshot_count", + UINT64_MAX, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "SSCOUNT"); + zprop_register_number(ZFS_PROP_GUID, "guid", 0, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "GUID"); + zprop_register_number(ZFS_PROP_CREATETXG, "createtxg", 0, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "", "CREATETXG"); + zprop_register_number(ZFS_PROP_PBKDF2_ITERS, "pbkdf2iters", + 0, PROP_ONETIME_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "", "PBKDF2ITERS"); + zprop_register_number(ZFS_PROP_OBJSETID, "objsetid", 0, + PROP_READONLY, ZFS_TYPE_DATASET, "", "OBJSETID"); + + /* default number properties */ + zprop_register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM, " | none", "QUOTA"); + zprop_register_number(ZFS_PROP_RESERVATION, "reservation", 0, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "RESERV"); + zprop_register_number(ZFS_PROP_VOLSIZE, "volsize", 0, PROP_DEFAULT, + ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "", "VOLSIZE"); + zprop_register_number(ZFS_PROP_REFQUOTA, "refquota", 0, PROP_DEFAULT, + ZFS_TYPE_FILESYSTEM, " | none", "REFQUOTA"); + zprop_register_number(ZFS_PROP_REFRESERVATION, "refreservation", 0, + PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "REFRESERV"); + zprop_register_number(ZFS_PROP_FILESYSTEM_LIMIT, "filesystem_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, + " | none", "FSLIMIT"); + zprop_register_number(ZFS_PROP_SNAPSHOT_LIMIT, "snapshot_limit", + UINT64_MAX, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + " | none", "SSLIMIT"); + + /* inherit number properties */ + zprop_register_number(ZFS_PROP_RECORDSIZE, "recordsize", + SPA_OLD_MAXBLOCKSIZE, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM, "512 to 1M, power of 2", "RECSIZE"); + zprop_register_number(ZFS_PROP_SPECIAL_SMALL_BLOCKS, + "special_small_blocks", 0, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, + "zero or 512 to 1M, power of 2", "SPECIAL_SMALL_BLOCKS"); + + /* hidden properties */ + zprop_register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); + zprop_register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "NAME"); + zprop_register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", + PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); + zprop_register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", + PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, + "STMF_SBD_LU"); + zprop_register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, + "USERACCOUNTING"); + zprop_register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); + zprop_register_hidden(ZFS_PROP_INCONSISTENT, "inconsistent", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "INCONSISTENT"); + zprop_register_hidden(ZFS_PROP_IVSET_GUID, "ivsetguid", + PROP_TYPE_NUMBER, PROP_READONLY, + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, "IVSETGUID"); + zprop_register_hidden(ZFS_PROP_PREV_SNAP, "prevsnap", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PREVSNAP"); + 
zprop_register_hidden(ZFS_PROP_PBKDF2_SALT, "pbkdf2salt", + PROP_TYPE_NUMBER, PROP_ONETIME_DEFAULT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "PBKDF2SALT"); + zprop_register_hidden(ZFS_PROP_KEY_GUID, "keyguid", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "KEYGUID"); + zprop_register_hidden(ZFS_PROP_REDACTED, "redacted", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "REDACTED"); + + /* + * Properties that are obsolete and not used. These are retained so + * that we don't have to change the values of the zfs_prop_t enum, or + * have NULL pointers in the zfs_prop_table[]. + */ +#ifdef __FreeBSD__ + zprop_register_impl(ZFS_PROP_ACLTYPE, "acltype", PROP_TYPE_INDEX, + ZFS_ACLTYPE_OFF, NULL, PROP_INHERIT, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, + "noacl | posixacl", "ACLTYPE", B_FALSE, B_FALSE, acltype_table); +#endif + zprop_register_hidden(ZFS_PROP_REMAPTXG, "remaptxg", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "REMAPTXG"); + + /* oddball properties */ + zprop_register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, + NULL, PROP_READONLY, ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK, + "<date>", "CREATION", B_FALSE, B_TRUE, NULL); +} + +boolean_t +zfs_prop_delegatable(zfs_prop_t prop) +{ + zprop_desc_t *pd = &zfs_prop_table[prop]; + + /* The mlslabel property is never delegatable. */ + if (prop == ZFS_PROP_MLSLABEL) + return (B_FALSE); + + return (pd->pd_attr != PROP_READONLY); +} + +/* + * Given a zfs dataset property name, returns the corresponding property ID. + */ +zfs_prop_t +zfs_name_to_prop(const char *propname) +{ + return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); +} + +/* + * For user property names, we allow all lowercase alphanumeric characters, plus + * a few useful punctuation characters. + */ +static int +valid_char(char c) +{ + return ((c >= 'a' && c <= 'z') || + (c >= '0' && c <= '9') || + c == '-' || c == '_' || c == '.' || c == ':'); +} + +/* + * Returns true if this is a valid user-defined property (one with a ':'). + */ +boolean_t +zfs_prop_user(const char *name) +{ + int i; + char c; + boolean_t foundsep = B_FALSE; + + for (i = 0; i < strlen(name); i++) { + c = name[i]; + if (!valid_char(c)) + return (B_FALSE); + if (c == ':') + foundsep = B_TRUE; + } + + if (!foundsep) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Returns true if this is a valid userspace-type property (one with a '@'). + * Note that after the @, any character is valid (e.g. another @, for SID + * user@domain). + */ +boolean_t +zfs_prop_userquota(const char *name) +{ + zfs_userquota_prop_t prop; + + for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { + if (strncmp(name, zfs_userquota_prop_prefixes[prop], + strlen(zfs_userquota_prop_prefixes[prop])) == 0) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + +/* + * Returns true if this is a valid written@ (or written# for bookmarks) + * property. Note that after the @, any character is valid (e.g. another @, + * for written@pool/fs@origin). + */ +boolean_t +zfs_prop_written(const char *name) +{ + static const char *prop_prefix = "written@"; + static const char *book_prefix = "written#"; + return (strncmp(name, prop_prefix, strlen(prop_prefix)) == 0 || + strncmp(name, book_prefix, strlen(book_prefix)) == 0); +} + +/* + * Tables of index types, plus functions to convert between the user view + * (strings) and internal representation (uint64_t).
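+ *
+ * For example (editor's illustration, not part of the original change):
+ * given the canmount_table registered above, a lookup such as
+ *
+ *	uint64_t idx;
+ *	(void) zfs_prop_string_to_index(ZFS_PROP_CANMOUNT, "noauto", &idx);
+ *
+ * stores the numeric value ZFS_CANMOUNT_NOAUTO in idx, and
+ * zfs_prop_index_to_string() performs the reverse mapping.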
+ */ +int +zfs_prop_string_to_index(zfs_prop_t prop, const char *string, uint64_t *index) +{ + return (zprop_string_to_index(prop, string, index, ZFS_TYPE_DATASET)); +} + +int +zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) +{ + return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); +} + +uint64_t +zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) +{ + return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); +} + +/* + * Returns TRUE if the property applies to any of the given dataset types. + */ +boolean_t +zfs_prop_valid_for_type(int prop, zfs_type_t types, boolean_t headcheck) +{ + return (zprop_valid_for_type(prop, types, headcheck)); +} + +zprop_type_t +zfs_prop_get_type(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_proptype); +} + +/* + * Returns TRUE if the property is readonly. + */ +boolean_t +zfs_prop_readonly(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_attr == PROP_READONLY || + zfs_prop_table[prop].pd_attr == PROP_ONETIME || + zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT); +} + +/* + * Returns TRUE if the property is visible (not hidden). + */ +boolean_t +zfs_prop_visible(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_visible && + zfs_prop_table[prop].pd_zfs_mod_supported); +} + +/* + * Returns TRUE if the property is only allowed to be set once. + */ +boolean_t +zfs_prop_setonce(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_attr == PROP_ONETIME || + zfs_prop_table[prop].pd_attr == PROP_ONETIME_DEFAULT); +} + +const char * +zfs_prop_default_string(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_strdefault); +} + +uint64_t +zfs_prop_default_numeric(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_numdefault); +} + +/* + * Given a dataset property ID, returns the corresponding name. + * Assuming the zfs dataset property ID is valid. + */ +const char * +zfs_prop_to_name(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_name); +} + +/* + * Returns TRUE if the property is inheritable. + */ +boolean_t +zfs_prop_inheritable(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_attr == PROP_INHERIT || + zfs_prop_table[prop].pd_attr == PROP_ONETIME); +} + +/* + * Returns TRUE if property is one of the encryption properties that requires + * a loaded encryption key to modify. + */ +boolean_t +zfs_prop_encryption_key_param(zfs_prop_t prop) +{ + /* + * keylocation does not count as an encryption property. It can be + * changed at will without needing the master keys. + */ + return (prop == ZFS_PROP_PBKDF2_SALT || prop == ZFS_PROP_PBKDF2_ITERS || + prop == ZFS_PROP_KEYFORMAT); +} + +/* + * Helper function used by both kernelspace and userspace to check the + * keylocation property. If encrypted is set, the keylocation must be valid + * for an encrypted dataset. + */ +boolean_t +zfs_prop_valid_keylocation(const char *str, boolean_t encrypted) +{ + if (strcmp("none", str) == 0) + return (!encrypted); + else if (strcmp("prompt", str) == 0) + return (B_TRUE); + else if (strlen(str) > 8 && strncmp("file:///", str, 8) == 0) + return (B_TRUE); + + return (B_FALSE); +} + + +#ifndef _KERNEL +#include + +/* + * Returns a string describing the set of acceptable values for the given + * zfs property, or NULL if it cannot be set. + */ +const char * +zfs_prop_values(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_values); +} + +/* + * Returns TRUE if this property is a string type. 
Note that index types + * (compression, checksum) are treated as strings in userland, even though they + * are stored numerically on disk. + */ +int +zfs_prop_is_string(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_proptype == PROP_TYPE_STRING || + zfs_prop_table[prop].pd_proptype == PROP_TYPE_INDEX); +} + +/* + * Returns the column header for the given property. Used only in + * 'zfs list -o', but centralized here with the other property information. + */ +const char * +zfs_prop_column_name(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_colname); +} + +/* + * Returns whether the given property should be displayed right-justified for + * 'zfs list'. + */ +boolean_t +zfs_prop_align_right(zfs_prop_t prop) +{ + return (zfs_prop_table[prop].pd_rightalign); +} + +#endif + +#if defined(_KERNEL) + +#include + +#if defined(HAVE_KERNEL_FPU_INTERNAL) +union fpregs_state **zfs_kfpu_fpregs; +EXPORT_SYMBOL(zfs_kfpu_fpregs); +#endif /* HAVE_KERNEL_FPU_INTERNAL */ + +static int __init +zcommon_init(void) +{ + int error = kfpu_init(); + if (error) + return (error); + + fletcher_4_init(); + + return (0); +} + +static void __exit +zcommon_fini(void) +{ + fletcher_4_fini(); + kfpu_fini(); +} + +module_init(zcommon_init); +module_exit(zcommon_fini); + +#endif + +ZFS_MODULE_DESCRIPTION("Generic ZFS support"); +ZFS_MODULE_AUTHOR(ZFS_META_AUTHOR); +ZFS_MODULE_LICENSE(ZFS_META_LICENSE); +ZFS_MODULE_VERSION(ZFS_META_VERSION "-" ZFS_META_RELEASE); + +/* zfs dataset property functions */ +EXPORT_SYMBOL(zfs_userquota_prop_prefixes); +EXPORT_SYMBOL(zfs_prop_init); +EXPORT_SYMBOL(zfs_prop_get_type); +EXPORT_SYMBOL(zfs_prop_get_table); +EXPORT_SYMBOL(zfs_prop_delegatable); +EXPORT_SYMBOL(zfs_prop_visible); + +/* Dataset property functions shared between libzfs and kernel. */ +EXPORT_SYMBOL(zfs_prop_default_string); +EXPORT_SYMBOL(zfs_prop_default_numeric); +EXPORT_SYMBOL(zfs_prop_readonly); +EXPORT_SYMBOL(zfs_prop_inheritable); +EXPORT_SYMBOL(zfs_prop_encryption_key_param); +EXPORT_SYMBOL(zfs_prop_valid_keylocation); +EXPORT_SYMBOL(zfs_prop_setonce); +EXPORT_SYMBOL(zfs_prop_to_name); +EXPORT_SYMBOL(zfs_name_to_prop); +EXPORT_SYMBOL(zfs_prop_user); +EXPORT_SYMBOL(zfs_prop_userquota); +EXPORT_SYMBOL(zfs_prop_index_to_string); +EXPORT_SYMBOL(zfs_prop_string_to_index); +EXPORT_SYMBOL(zfs_prop_valid_for_type); +EXPORT_SYMBOL(zfs_prop_written); diff --git a/module/zcommon/zfs_uio.c b/module/zcommon/zfs_uio.c new file mode 100644 index 000000000000..d586e0a1220a --- /dev/null +++ b/module/zcommon/zfs_uio.c @@ -0,0 +1,287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ +/* + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + */ + +/* + * The uio support from OpenSolaris has been added as a short-term + * workaround. The hope is to adopt a native Linux type and drop the + * use of uio's entirely. Under Linux they only add overhead and + * when possible we want to use native APIs for the ZPL layer. + */ +#ifdef _KERNEL + +#include +#include +#include +#include +#include +#include + +/* + * Move "n" bytes at byte address "p"; "rw" indicates the direction + * of the move, and the I/O parameters are provided in "uio", which is + * updated to reflect the data which was moved. Returns 0 on success or + * a non-zero errno on failure. + */ +static int +uiomove_iov(void *p, size_t n, enum uio_rw rw, struct uio *uio) +{ + const struct iovec *iov = uio->uio_iov; + size_t skip = uio->uio_skip; + ulong_t cnt; + + while (n && uio->uio_resid) { + cnt = MIN(iov->iov_len - skip, n); + switch (uio->uio_segflg) { + case UIO_USERSPACE: + case UIO_USERISPACE: + /* + * p = kernel data pointer + * iov->iov_base = user data pointer + */ + if (rw == UIO_READ) { + if (copy_to_user(iov->iov_base+skip, p, cnt)) + return (EFAULT); + } else { + unsigned long b_left = 0; + if (uio->uio_fault_disable) { + if (!zfs_access_ok(VERIFY_READ, + (iov->iov_base + skip), cnt)) { + return (EFAULT); + } + pagefault_disable(); + b_left = + __copy_from_user_inatomic(p, + (iov->iov_base + skip), cnt); + pagefault_enable(); + } else { + b_left = + copy_from_user(p, + (iov->iov_base + skip), cnt); + } + if (b_left > 0) { + unsigned long c_bytes = + cnt - b_left; + uio->uio_skip += c_bytes; + ASSERT3U(uio->uio_skip, <, + iov->iov_len); + uio->uio_resid -= c_bytes; + uio->uio_loffset += c_bytes; + return (EFAULT); + } + } + break; + case UIO_SYSSPACE: + if (rw == UIO_READ) + bcopy(p, iov->iov_base + skip, cnt); + else + bcopy(iov->iov_base + skip, p, cnt); + break; + default: + ASSERT(0); + } + skip += cnt; + if (skip == iov->iov_len) { + skip = 0; + uio->uio_iov = (++iov); + uio->uio_iovcnt--; + } + uio->uio_skip = skip; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + p = (caddr_t)p + cnt; + n -= cnt; + } + return (0); +} + +static int +uiomove_bvec(void *p, size_t n, enum uio_rw rw, struct uio *uio) +{ + const struct bio_vec *bv = uio->uio_bvec; + size_t skip = uio->uio_skip; + ulong_t cnt; + + while (n && uio->uio_resid) { + void *paddr; + cnt = MIN(bv->bv_len - skip, n); + + paddr = zfs_kmap_atomic(bv->bv_page, KM_USER1); + if (rw == UIO_READ) + bcopy(p, paddr + bv->bv_offset + skip, cnt); + else + bcopy(paddr + bv->bv_offset + skip, p, cnt); + zfs_kunmap_atomic(paddr, KM_USER1); + + skip += cnt; + if (skip == bv->bv_len) { + skip = 0; + uio->uio_bvec = (++bv); + uio->uio_iovcnt--; + } + uio->uio_skip = skip; + uio->uio_resid -= cnt; + uio->uio_loffset += cnt; + p = (caddr_t)p + cnt; + n -= cnt; + } + return (0); +} + +int +uiomove(void *p, size_t n, enum uio_rw rw, struct uio *uio) +{ + if (uio->uio_segflg != UIO_BVEC) + return (uiomove_iov(p, n, rw, uio)); + else + return (uiomove_bvec(p, n, rw, uio)); +} +EXPORT_SYMBOL(uiomove); + +#define fuword8(uptr, vptr) get_user((*vptr), (uptr)) + +/* +
* Fault in the pages of the first n bytes specified by the uio structure. + * 1 byte in each page is touched and the uio struct is unmodified. Any + * error terminates the pre-faulting early, as this is only a best-effort + * attempt to get the pages resident. + */ +int +uio_prefaultpages(ssize_t n, struct uio *uio) +{ + const struct iovec *iov; + ulong_t cnt, incr; + caddr_t p; + uint8_t tmp; + int iovcnt; + size_t skip; + + /* no need to fault in kernel pages */ + switch (uio->uio_segflg) { + case UIO_SYSSPACE: + case UIO_BVEC: + return (0); + case UIO_USERSPACE: + case UIO_USERISPACE: + break; + default: + ASSERT(0); + } + + iov = uio->uio_iov; + iovcnt = uio->uio_iovcnt; + skip = uio->uio_skip; + + for (; n > 0 && iovcnt > 0; iov++, iovcnt--, skip = 0) { + cnt = MIN(iov->iov_len - skip, n); + /* empty iov */ + if (cnt == 0) + continue; + n -= cnt; + /* + * touch each page in this segment. + */ + p = iov->iov_base + skip; + while (cnt) { + if (fuword8((uint8_t *)p, &tmp)) + return (EFAULT); + incr = MIN(cnt, PAGESIZE); + p += incr; + cnt -= incr; + } + /* + * touch the last byte in case it straddles a page. + */ + p--; + if (fuword8((uint8_t *)p, &tmp)) + return (EFAULT); + } + + return (0); +} +EXPORT_SYMBOL(uio_prefaultpages); + +/* + * Same as uiomove(), but doesn't modify the uio structure. + * Returns in cbytes how many bytes were copied. + */ +int +uiocopy(void *p, size_t n, enum uio_rw rw, struct uio *uio, size_t *cbytes) +{ + struct uio uio_copy; + int ret; + + bcopy(uio, &uio_copy, sizeof (struct uio)); + ret = uiomove(p, n, rw, &uio_copy); + *cbytes = uio->uio_resid - uio_copy.uio_resid; + return (ret); +} +EXPORT_SYMBOL(uiocopy); + +/* + * Drop the next n chars out of *uiop. + */ +void +uioskip(uio_t *uiop, size_t n) +{ + if (n > uiop->uio_resid) + return; + + uiop->uio_skip += n; + if (uiop->uio_segflg != UIO_BVEC) { + while (uiop->uio_iovcnt && + uiop->uio_skip >= uiop->uio_iov->iov_len) { + uiop->uio_skip -= uiop->uio_iov->iov_len; + uiop->uio_iov++; + uiop->uio_iovcnt--; + } + } else { + while (uiop->uio_iovcnt && + uiop->uio_skip >= uiop->uio_bvec->bv_len) { + uiop->uio_skip -= uiop->uio_bvec->bv_len; + uiop->uio_bvec++; + uiop->uio_iovcnt--; + } + } + uiop->uio_loffset += n; + uiop->uio_resid -= n; +} +EXPORT_SYMBOL(uioskip); +#endif /* _KERNEL */ diff --git a/module/zcommon/zpool_prop.c b/module/zcommon/zpool_prop.c new file mode 100644 index 000000000000..815fad6df0e0 --- /dev/null +++ b/module/zcommon/zpool_prop.c @@ -0,0 +1,275 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved.
+ */ + +#include +#include +#include +#include +#include + +#include "zfs_prop.h" + +#if !defined(_KERNEL) +#include +#include +#include +#endif + +static zprop_desc_t zpool_prop_table[ZPOOL_NUM_PROPS]; + +zprop_desc_t * +zpool_prop_get_table(void) +{ + return (zpool_prop_table); +} + +void +zpool_prop_init(void) +{ + static zprop_index_t boolean_table[] = { + { "off", 0}, + { "on", 1}, + { NULL } + }; + + static zprop_index_t failuremode_table[] = { + { "wait", ZIO_FAILURE_MODE_WAIT }, + { "continue", ZIO_FAILURE_MODE_CONTINUE }, + { "panic", ZIO_FAILURE_MODE_PANIC }, + { NULL } + }; + + /* string properties */ + zprop_register_string(ZPOOL_PROP_ALTROOT, "altroot", NULL, PROP_DEFAULT, + ZFS_TYPE_POOL, "<path>", "ALTROOT"); + zprop_register_string(ZPOOL_PROP_BOOTFS, "bootfs", NULL, PROP_DEFAULT, + ZFS_TYPE_POOL, "<filesystem>", "BOOTFS"); + zprop_register_string(ZPOOL_PROP_CACHEFILE, "cachefile", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "<file> | none", "CACHEFILE"); + zprop_register_string(ZPOOL_PROP_COMMENT, "comment", NULL, + PROP_DEFAULT, ZFS_TYPE_POOL, "<comment-string>", "COMMENT"); + + /* readonly number properties */ + zprop_register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "SIZE"); + zprop_register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "FREE"); + zprop_register_number(ZPOOL_PROP_FREEING, "freeing", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "FREEING"); + zprop_register_number(ZPOOL_PROP_CHECKPOINT, "checkpoint", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", "CKPOINT"); + zprop_register_number(ZPOOL_PROP_LEAKED, "leaked", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "LEAKED"); + zprop_register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC"); + zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ"); + zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG"); + zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<size>", "CAP"); + zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<guid>", "GUID"); + zprop_register_number(ZPOOL_PROP_LOAD_GUID, "load_guid", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<load_guid>", "LOAD_GUID"); + zprop_register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<state>", "HEALTH"); + zprop_register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, + PROP_READONLY, ZFS_TYPE_POOL, "<1.00x or higher if deduped>", + "DEDUP"); + + /* default number properties */ + zprop_register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, + PROP_DEFAULT, ZFS_TYPE_POOL, "<version>", "VERSION"); + zprop_register_number(ZPOOL_PROP_ASHIFT, "ashift", 0, PROP_DEFAULT, + ZFS_TYPE_POOL, "<ashift, 9-16, or 0=default>", "ASHIFT"); + + /* default index (boolean) properties */ + zprop_register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "DELEGATION", + boolean_table); + zprop_register_index(ZPOOL_PROP_AUTOREPLACE, "autoreplace", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); + zprop_register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", + boolean_table); + zprop_register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); + zprop_register_index(ZPOOL_PROP_READONLY, "readonly", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "RDONLY", boolean_table); + zprop_register_index(ZPOOL_PROP_MULTIHOST, "multihost", 0, + PROP_DEFAULT,
ZFS_TYPE_POOL, "on | off", "MULTIHOST", + boolean_table); + + /* default index properties */ + zprop_register_index(ZPOOL_PROP_FAILUREMODE, "failmode", + ZIO_FAILURE_MODE_WAIT, PROP_DEFAULT, ZFS_TYPE_POOL, + "wait | continue | panic", "FAILMODE", failuremode_table); + zprop_register_index(ZPOOL_PROP_AUTOTRIM, "autotrim", + SPA_AUTOTRIM_DEFAULT, PROP_DEFAULT, ZFS_TYPE_POOL, + "on | off", "AUTOTRIM", boolean_table); + + /* hidden properties */ + zprop_register_hidden(ZPOOL_PROP_NAME, "name", PROP_TYPE_STRING, + PROP_READONLY, ZFS_TYPE_POOL, "NAME"); + zprop_register_hidden(ZPOOL_PROP_MAXBLOCKSIZE, "maxblocksize", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXBLOCKSIZE"); + zprop_register_hidden(ZPOOL_PROP_TNAME, "tname", PROP_TYPE_STRING, + PROP_ONETIME, ZFS_TYPE_POOL, "TNAME"); + zprop_register_hidden(ZPOOL_PROP_MAXDNODESIZE, "maxdnodesize", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_POOL, "MAXDNODESIZE"); + zprop_register_hidden(ZPOOL_PROP_DEDUPDITTO, "dedupditto", + PROP_TYPE_NUMBER, PROP_DEFAULT, ZFS_TYPE_POOL, "DEDUPDITTO"); +} + +/* + * Given a property name and its type, returns the corresponding property ID. + */ +zpool_prop_t +zpool_name_to_prop(const char *propname) +{ + return (zprop_name_to_prop(propname, ZFS_TYPE_POOL)); +} + +/* + * Given a pool property ID, returns the corresponding name. + * Assuming the pool property ID is valid. + */ +const char * +zpool_prop_to_name(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_name); +} + +zprop_type_t +zpool_prop_get_type(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_proptype); +} + +boolean_t +zpool_prop_readonly(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_attr == PROP_READONLY); +} + +boolean_t +zpool_prop_setonce(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_attr == PROP_ONETIME); +} + +const char * +zpool_prop_default_string(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_strdefault); +} + +uint64_t +zpool_prop_default_numeric(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_numdefault); +} + +/* + * Returns true if this is a valid feature@ property. + */ +boolean_t +zpool_prop_feature(const char *name) +{ + static const char *prefix = "feature@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +/* + * Returns true if this is a valid unsupported@ property. 
+ */ +boolean_t +zpool_prop_unsupported(const char *name) +{ + static const char *prefix = "unsupported@"; + return (strncmp(name, prefix, strlen(prefix)) == 0); +} + +int +zpool_prop_string_to_index(zpool_prop_t prop, const char *string, + uint64_t *index) +{ + return (zprop_string_to_index(prop, string, index, ZFS_TYPE_POOL)); +} + +int +zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, + const char **string) +{ + return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); +} + +uint64_t +zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) +{ + return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); +} + +#ifndef _KERNEL +#include + +const char * +zpool_prop_values(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_values); +} + +const char * +zpool_prop_column_name(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_colname); +} + +boolean_t +zpool_prop_align_right(zpool_prop_t prop) +{ + return (zpool_prop_table[prop].pd_rightalign); +} +#endif + +#if defined(_KERNEL) +/* zpool property functions */ +EXPORT_SYMBOL(zpool_prop_init); +EXPORT_SYMBOL(zpool_prop_get_type); +EXPORT_SYMBOL(zpool_prop_get_table); + +/* Pool property functions shared between libzfs and kernel. */ +EXPORT_SYMBOL(zpool_name_to_prop); +EXPORT_SYMBOL(zpool_prop_to_name); +EXPORT_SYMBOL(zpool_prop_default_string); +EXPORT_SYMBOL(zpool_prop_default_numeric); +EXPORT_SYMBOL(zpool_prop_readonly); +EXPORT_SYMBOL(zpool_prop_feature); +EXPORT_SYMBOL(zpool_prop_unsupported); +EXPORT_SYMBOL(zpool_prop_index_to_string); +EXPORT_SYMBOL(zpool_prop_string_to_index); +#endif diff --git a/module/zcommon/zprop_common.c b/module/zcommon/zprop_common.c new file mode 100644 index 000000000000..faab9d9a74fd --- /dev/null +++ b/module/zcommon/zprop_common.c @@ -0,0 +1,480 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * Common routines used by zfs and zpool property management. 
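+ * Both the dataset and pool tables share the zprop_desc_t layout; the
+ * helpers below select the right table from the zfs_type_t they are given
+ * (ZFS_TYPE_POOL picks the zpool table, anything else the zfs one).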
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" +#include "zfs_deleg.h" + +#if !defined(_KERNEL) +#include +#include +#include +#include +#endif + +static zprop_desc_t * +zprop_get_proptable(zfs_type_t type) +{ + if (type == ZFS_TYPE_POOL) + return (zpool_prop_get_table()); + else + return (zfs_prop_get_table()); +} + +static int +zprop_get_numprops(zfs_type_t type) +{ + if (type == ZFS_TYPE_POOL) + return (ZPOOL_NUM_PROPS); + else + return (ZFS_NUM_PROPS); +} + +static boolean_t +zfs_mod_supported_prop(const char *name, zfs_type_t type) +{ +/* + * The zfs module spa_feature_table[], whether in-kernel or in libzpool, + * always supports all the properties. libzfs needs to query the running + * module, via sysfs, to determine which properties are supported. + * + * The equivalent _can_ be done on FreeBSD by way of the sysctl + * tree, but this has not been done yet. + */ +#if defined(_KERNEL) || defined(LIB_ZPOOL_BUILD) || defined(__FreeBSD__) + return (B_TRUE); +#else + return (zfs_mod_supported(type == ZFS_TYPE_POOL ? + ZFS_SYSFS_POOL_PROPERTIES : ZFS_SYSFS_DATASET_PROPERTIES, name)); +#endif +} + +void +zprop_register_impl(int prop, const char *name, zprop_type_t type, + uint64_t numdefault, const char *strdefault, zprop_attr_t attr, + int objset_types, const char *values, const char *colname, + boolean_t rightalign, boolean_t visible, const zprop_index_t *idx_tbl) +{ + zprop_desc_t *prop_tbl = zprop_get_proptable(objset_types); + zprop_desc_t *pd; + + pd = &prop_tbl[prop]; + + ASSERT(pd->pd_name == NULL || pd->pd_name == name); + ASSERT(name != NULL); + ASSERT(colname != NULL); + + pd->pd_name = name; + pd->pd_propnum = prop; + pd->pd_proptype = type; + pd->pd_numdefault = numdefault; + pd->pd_strdefault = strdefault; + pd->pd_attr = attr; + pd->pd_types = objset_types; + pd->pd_values = values; + pd->pd_colname = colname; + pd->pd_rightalign = rightalign; + pd->pd_visible = visible; + pd->pd_zfs_mod_supported = zfs_mod_supported_prop(name, objset_types); + pd->pd_table = idx_tbl; + pd->pd_table_size = 0; + while (idx_tbl && (idx_tbl++)->pi_name != NULL) + pd->pd_table_size++; +} + +void +zprop_register_string(int prop, const char *name, const char *def, + zprop_attr_t attr, int objset_types, const char *values, + const char *colname) +{ + zprop_register_impl(prop, name, PROP_TYPE_STRING, 0, def, attr, + objset_types, values, colname, B_FALSE, B_TRUE, NULL); + +} + +void +zprop_register_number(int prop, const char *name, uint64_t def, + zprop_attr_t attr, int objset_types, const char *values, + const char *colname) +{ + zprop_register_impl(prop, name, PROP_TYPE_NUMBER, def, NULL, attr, + objset_types, values, colname, B_TRUE, B_TRUE, NULL); +} + +void +zprop_register_index(int prop, const char *name, uint64_t def, + zprop_attr_t attr, int objset_types, const char *values, + const char *colname, const zprop_index_t *idx_tbl) +{ + zprop_register_impl(prop, name, PROP_TYPE_INDEX, def, NULL, attr, + objset_types, values, colname, B_FALSE, B_TRUE, idx_tbl); +} + +void +zprop_register_hidden(int prop, const char *name, zprop_type_t type, + zprop_attr_t attr, int objset_types, const char *colname) +{ + zprop_register_impl(prop, name, type, 0, NULL, attr, + objset_types, NULL, colname, + type == PROP_TYPE_NUMBER, B_FALSE, NULL); +} + + +/* + * A comparison function we can use to order indexes into property tables. 
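+ * Read-only properties sort ahead of settable ones; within each group the
+ * ordering falls back to strcmp() on the property name.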
+ */ +static int +zprop_compare(const void *arg1, const void *arg2) +{ + const zprop_desc_t *p1 = *((zprop_desc_t **)arg1); + const zprop_desc_t *p2 = *((zprop_desc_t **)arg2); + boolean_t p1ro, p2ro; + + p1ro = (p1->pd_attr == PROP_READONLY); + p2ro = (p2->pd_attr == PROP_READONLY); + + if (p1ro == p2ro) + return (strcmp(p1->pd_name, p2->pd_name)); + + return (p1ro ? -1 : 1); +} + +/* + * Iterate over all properties in the given property table, calling back + * into the specified function for each property. We will continue to + * iterate until we either reach the end or the callback function returns + * something other than ZPROP_CONT. + */ +int +zprop_iter_common(zprop_func func, void *cb, boolean_t show_all, + boolean_t ordered, zfs_type_t type) +{ + int i, num_props, size, prop; + zprop_desc_t *prop_tbl; + zprop_desc_t **order; + + prop_tbl = zprop_get_proptable(type); + num_props = zprop_get_numprops(type); + size = num_props * sizeof (zprop_desc_t *); + +#if defined(_KERNEL) + order = kmem_alloc(size, KM_SLEEP); +#else + if ((order = malloc(size)) == NULL) + return (ZPROP_CONT); +#endif + + for (int j = 0; j < num_props; j++) + order[j] = &prop_tbl[j]; + + if (ordered) { + qsort((void *)order, num_props, sizeof (zprop_desc_t *), + zprop_compare); + } + + prop = ZPROP_CONT; + for (i = 0; i < num_props; i++) { + if ((order[i]->pd_visible || show_all) && + order[i]->pd_zfs_mod_supported && + (func(order[i]->pd_propnum, cb) != ZPROP_CONT)) { + prop = order[i]->pd_propnum; + break; + } + } + +#if defined(_KERNEL) + kmem_free(order, size); +#else + free(order); +#endif + return (prop); +} + +static boolean_t +propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) +{ + const char *propname = prop_entry->pd_name; +#ifndef _KERNEL + const char *colname = prop_entry->pd_colname; + int c; +#endif + + if (len == strlen(propname) && + strncmp(p, propname, len) == 0) + return (B_TRUE); + +#ifndef _KERNEL + if (colname == NULL || len != strlen(colname)) + return (B_FALSE); + + for (c = 0; c < len; c++) + if (p[c] != tolower(colname[c])) + break; + + return (colname[c] == '\0'); +#else + return (B_FALSE); +#endif +} + +typedef struct name_to_prop_cb { + const char *propname; + zprop_desc_t *prop_tbl; +} name_to_prop_cb_t; + +static int +zprop_name_to_prop_cb(int prop, void *cb_data) +{ + name_to_prop_cb_t *data = cb_data; + + if (propname_match(data->propname, strlen(data->propname), + &data->prop_tbl[prop])) + return (prop); + + return (ZPROP_CONT); +} + +int +zprop_name_to_prop(const char *propname, zfs_type_t type) +{ + int prop; + name_to_prop_cb_t cb_data; + + cb_data.propname = propname; + cb_data.prop_tbl = zprop_get_proptable(type); + + prop = zprop_iter_common(zprop_name_to_prop_cb, &cb_data, + B_TRUE, B_FALSE, type); + + return (prop == ZPROP_CONT ? 
ZPROP_INVAL : prop); +} + +int +zprop_string_to_index(int prop, const char *string, uint64_t *index, + zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + int i; + + if (prop == ZPROP_INVAL || prop == ZPROP_CONT) + return (-1); + + ASSERT(prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) + return (-1); + + for (i = 0; idx_tbl[i].pi_name != NULL; i++) { + if (strcmp(string, idx_tbl[i].pi_name) == 0) { + *index = idx_tbl[i].pi_value; + return (0); + } + } + + return (-1); +} + +int +zprop_index_to_string(int prop, uint64_t index, const char **string, + zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + int i; + + if (prop == ZPROP_INVAL || prop == ZPROP_CONT) + return (-1); + + ASSERT(prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + if ((idx_tbl = prop_tbl[prop].pd_table) == NULL) + return (-1); + + for (i = 0; idx_tbl[i].pi_name != NULL; i++) { + if (idx_tbl[i].pi_value == index) { + *string = idx_tbl[i].pi_name; + return (0); + } + } + + return (-1); +} + +/* + * Return a random valid property value. Used by ztest. + */ +uint64_t +zprop_random_value(int prop, uint64_t seed, zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + + ASSERT((uint_t)prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + idx_tbl = prop_tbl[prop].pd_table; + + if (idx_tbl == NULL) + return (seed); + + return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value); +} + +const char * +zprop_values(int prop, zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + + ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); + ASSERT(prop < zprop_get_numprops(type)); + + prop_tbl = zprop_get_proptable(type); + + return (prop_tbl[prop].pd_values); +} + +/* + * Returns TRUE if the property applies to any of the given dataset types. + * + * If headcheck is set, the check is being made against the head dataset + * type of a snapshot which requires to return B_TRUE when the property + * is only valid for snapshots. + */ +boolean_t +zprop_valid_for_type(int prop, zfs_type_t type, boolean_t headcheck) +{ + zprop_desc_t *prop_tbl; + + if (prop == ZPROP_INVAL || prop == ZPROP_CONT) + return (B_FALSE); + + ASSERT(prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + if (headcheck && prop_tbl[prop].pd_types == ZFS_TYPE_SNAPSHOT) + return (B_TRUE); + return ((prop_tbl[prop].pd_types & type) != 0); +} + +#ifndef _KERNEL + +/* + * Determines the minimum width for the column, and indicates whether it's fixed + * or not. Only string columns are non-fixed. + */ +size_t +zprop_width(int prop, boolean_t *fixed, zfs_type_t type) +{ + zprop_desc_t *prop_tbl, *pd; + const zprop_index_t *idx; + size_t ret; + int i; + + ASSERT(prop != ZPROP_INVAL && prop != ZPROP_CONT); + ASSERT(prop < zprop_get_numprops(type)); + + prop_tbl = zprop_get_proptable(type); + pd = &prop_tbl[prop]; + + *fixed = B_TRUE; + + /* + * Start with the width of the column name. + */ + ret = strlen(pd->pd_colname); + + /* + * For fixed-width values, make sure the width is large enough to hold + * any possible value. + */ + switch (pd->pd_proptype) { + case PROP_TYPE_NUMBER: + /* + * The maximum length of a human-readable number is 5 characters + * ("20.4M", for example). + */ + if (ret < 5) + ret = 5; + /* + * 'creation' is handled specially because it's a number + * internally, but displayed as a date string. 
+ */ + if (prop == ZFS_PROP_CREATION) + *fixed = B_FALSE; + /* + * 'health' is handled specially because it's a number + * internally, but displayed as a fixed 8 character string. + */ + if (prop == ZPOOL_PROP_HEALTH) + ret = 8; + break; + case PROP_TYPE_INDEX: + idx = prop_tbl[prop].pd_table; + for (i = 0; idx[i].pi_name != NULL; i++) { + if (strlen(idx[i].pi_name) > ret) + ret = strlen(idx[i].pi_name); + } + break; + + case PROP_TYPE_STRING: + *fixed = B_FALSE; + break; + } + + return (ret); +} + +#endif + +#if defined(_KERNEL) +/* Common routines to initialize property tables */ +EXPORT_SYMBOL(zprop_register_impl); +EXPORT_SYMBOL(zprop_register_string); +EXPORT_SYMBOL(zprop_register_number); +EXPORT_SYMBOL(zprop_register_index); +EXPORT_SYMBOL(zprop_register_hidden); + +/* Common routines for zfs and zpool property management */ +EXPORT_SYMBOL(zprop_iter_common); +EXPORT_SYMBOL(zprop_name_to_prop); +EXPORT_SYMBOL(zprop_string_to_index); +EXPORT_SYMBOL(zprop_index_to_string); +EXPORT_SYMBOL(zprop_random_value); +EXPORT_SYMBOL(zprop_values); +EXPORT_SYMBOL(zprop_valid_for_type); +#endif diff --git a/module/zfs/Makefile.in b/module/zfs/Makefile.in new file mode 100644 index 000000000000..9ddcd6c339d4 --- /dev/null +++ b/module/zfs/Makefile.in @@ -0,0 +1,153 @@ +ifneq ($(KBUILD_EXTMOD),) +src = @abs_srcdir@ +obj = @abs_builddir@ +mfdir = $(obj) +else +mfdir = $(srctree)/$(src) +endif + +MODULE := zfs + +obj-$(CONFIG_ZFS) := $(MODULE).o + +# Suppress unused-value warnings in sparc64 architecture headers +ccflags-$(CONFIG_SPARC64) += -Wno-unused-value + +$(MODULE)-objs += abd.o +$(MODULE)-objs += aggsum.o +$(MODULE)-objs += arc.o +$(MODULE)-objs += blkptr.o +$(MODULE)-objs += bplist.o +$(MODULE)-objs += bpobj.o +$(MODULE)-objs += bptree.o +$(MODULE)-objs += btree.o +$(MODULE)-objs += bqueue.o +$(MODULE)-objs += dataset_kstats.o +$(MODULE)-objs += dbuf.o +$(MODULE)-objs += dbuf_stats.o +$(MODULE)-objs += ddt.o +$(MODULE)-objs += ddt_zap.o +$(MODULE)-objs += dmu.o +$(MODULE)-objs += dmu_diff.o +$(MODULE)-objs += dmu_object.o +$(MODULE)-objs += dmu_objset.o +$(MODULE)-objs += dmu_recv.o +$(MODULE)-objs += dmu_redact.o +$(MODULE)-objs += dmu_send.o +$(MODULE)-objs += dmu_traverse.o +$(MODULE)-objs += dmu_tx.o +$(MODULE)-objs += dmu_zfetch.o +$(MODULE)-objs += dnode.o +$(MODULE)-objs += dnode_sync.o +$(MODULE)-objs += dsl_bookmark.o +$(MODULE)-objs += dsl_crypt.o +$(MODULE)-objs += dsl_dataset.o +$(MODULE)-objs += dsl_deadlist.o +$(MODULE)-objs += dsl_deleg.o +$(MODULE)-objs += dsl_destroy.o +$(MODULE)-objs += dsl_dir.o +$(MODULE)-objs += dsl_pool.o +$(MODULE)-objs += dsl_prop.o +$(MODULE)-objs += dsl_scan.o +$(MODULE)-objs += dsl_synctask.o +$(MODULE)-objs += dsl_userhold.o +$(MODULE)-objs += edonr_zfs.o +$(MODULE)-objs += fm.o +$(MODULE)-objs += gzip.o +$(MODULE)-objs += hkdf.o +$(MODULE)-objs += lz4.o +$(MODULE)-objs += lzjb.o +$(MODULE)-objs += metaslab.o +$(MODULE)-objs += mmp.o +$(MODULE)-objs += multilist.o +$(MODULE)-objs += objlist.o +$(MODULE)-objs += pathname.o +$(MODULE)-objs += range_tree.o +$(MODULE)-objs += refcount.o +$(MODULE)-objs += rrwlock.o +$(MODULE)-objs += sa.o +$(MODULE)-objs += sha256.o +$(MODULE)-objs += skein_zfs.o +$(MODULE)-objs += spa.o +$(MODULE)-objs += spa_boot.o +$(MODULE)-objs += spa_checkpoint.o +$(MODULE)-objs += spa_config.o +$(MODULE)-objs += spa_errlog.o +$(MODULE)-objs += spa_history.o +$(MODULE)-objs += spa_log_spacemap.o +$(MODULE)-objs += spa_misc.o +$(MODULE)-objs += space_map.o +$(MODULE)-objs += space_reftree.o +$(MODULE)-objs += txg.o 
+$(MODULE)-objs += uberblock.o +$(MODULE)-objs += unique.o +$(MODULE)-objs += vdev.o +$(MODULE)-objs += vdev_cache.o +$(MODULE)-objs += vdev_indirect.o +$(MODULE)-objs += vdev_indirect_births.o +$(MODULE)-objs += vdev_indirect_mapping.o +$(MODULE)-objs += vdev_initialize.o +$(MODULE)-objs += vdev_label.o +$(MODULE)-objs += vdev_mirror.o +$(MODULE)-objs += vdev_missing.o +$(MODULE)-objs += vdev_queue.o +$(MODULE)-objs += vdev_raidz.o +$(MODULE)-objs += vdev_raidz_math.o +$(MODULE)-objs += vdev_raidz_math_scalar.o +$(MODULE)-objs += vdev_rebuild.o +$(MODULE)-objs += vdev_removal.o +$(MODULE)-objs += vdev_root.o +$(MODULE)-objs += vdev_trim.o +$(MODULE)-objs += zap.o +$(MODULE)-objs += zap_leaf.o +$(MODULE)-objs += zap_micro.o +$(MODULE)-objs += zcp.o +$(MODULE)-objs += zcp_get.o +$(MODULE)-objs += zcp_global.o +$(MODULE)-objs += zcp_iter.o +$(MODULE)-objs += zcp_set.o +$(MODULE)-objs += zcp_synctask.o +$(MODULE)-objs += zfeature.o +$(MODULE)-objs += zfs_byteswap.o +$(MODULE)-objs += zfs_fm.o +$(MODULE)-objs += zfs_fuid.o +$(MODULE)-objs += zfs_ioctl.o +$(MODULE)-objs += zfs_log.o +$(MODULE)-objs += zfs_onexit.o +$(MODULE)-objs += zfs_quota.o +$(MODULE)-objs += zfs_ratelimit.o +$(MODULE)-objs += zfs_replay.o +$(MODULE)-objs += zfs_rlock.o +$(MODULE)-objs += zfs_sa.o +$(MODULE)-objs += zil.o +$(MODULE)-objs += zio.o +$(MODULE)-objs += zio_checksum.o +$(MODULE)-objs += zio_compress.o +$(MODULE)-objs += zio_inject.o +$(MODULE)-objs += zle.o +$(MODULE)-objs += zrlock.o +$(MODULE)-objs += zthr.o +$(MODULE)-objs += zvol.o + +# Suppress incorrect warnings from versions of objtool which are not +# aware of x86 EVEX prefix instructions used for AVX512. +OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512bw.o := y +OBJECT_FILES_NON_STANDARD_vdev_raidz_math_avx512f.o := y + +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_sse2.o +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_ssse3.o +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx2.o +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512f.o +$(MODULE)-$(CONFIG_X86) += vdev_raidz_math_avx512bw.o + +$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neon.o +$(MODULE)-$(CONFIG_ARM64) += vdev_raidz_math_aarch64_neonx2.o + +$(MODULE)-$(CONFIG_PPC) += vdev_raidz_math_powerpc_altivec.o +$(MODULE)-$(CONFIG_PPC64) += vdev_raidz_math_powerpc_altivec.o + +ifeq ($(CONFIG_ALTIVEC),y) +$(obj)/vdev_raidz_math_powerpc_altivec.o: c_flags += -maltivec +endif + +include $(mfdir)/../os/linux/zfs/Makefile diff --git a/module/zfs/THIRDPARTYLICENSE.cityhash b/module/zfs/THIRDPARTYLICENSE.cityhash new file mode 100644 index 000000000000..e558b2a50358 --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.cityhash @@ -0,0 +1,19 @@ +Copyright (c) 2011 Google, Inc. + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/module/zfs/THIRDPARTYLICENSE.cityhash.descrip b/module/zfs/THIRDPARTYLICENSE.cityhash.descrip new file mode 100644 index 000000000000..f98cb76dfc91 --- /dev/null +++ b/module/zfs/THIRDPARTYLICENSE.cityhash.descrip @@ -0,0 +1 @@ +CITYHASH CHECKSUM FUNCTIONALITY IN ZFS diff --git a/module/zfs/abd.c b/module/zfs/abd.c new file mode 100644 index 000000000000..6018a42ca0d8 --- /dev/null +++ b/module/zfs/abd.c @@ -0,0 +1,1213 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2014 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 by Delphix. All rights reserved. + */ + +/* + * ARC buffer data (ABD). + * + * ABDs are an abstract data structure for the ARC which can use two + * different ways of storing the underlying data: + * + * (a) Linear buffer. In this case, all the data in the ABD is stored in one + * contiguous buffer in memory (from a zio_[data_]buf_* kmem cache). + * + * +-------------------+ + * | ABD (linear) | + * | abd_flags = ... | + * | abd_size = ... | +--------------------------------+ + * | abd_buf ------------->| raw buffer of size abd_size | + * +-------------------+ +--------------------------------+ + * no abd_chunks + * + * (b) Scattered buffer. In this case, the data in the ABD is split into + * equal-sized chunks (from the abd_chunk_cache kmem_cache), with pointers + * to the chunks recorded in an array at the end of the ABD structure. + * + * +-------------------+ + * | ABD (scattered) | + * | abd_flags = ... | + * | abd_size = ... | + * | abd_offset = 0 | +-----------+ + * | abd_chunks[0] ----------------------------->| chunk 0 | + * | abd_chunks[1] ---------------------+ +-----------+ + * | ... | | +-----------+ + * | abd_chunks[N-1] ---------+ +------->| chunk 1 | + * +-------------------+ | +-----------+ + * | ... + * | +-----------+ + * +----------------->| chunk N-1 | + * +-----------+ + * + * In addition to directly allocating a linear or scattered ABD, it is also + * possible to create an ABD by requesting the "sub-ABD" starting at an offset + * within an existing ABD. In linear buffers this is simple (set abd_buf of + * the new ABD to the starting point within the original raw buffer), but + * scattered ABDs are a little more complex. The new ABD makes a copy of the + * relevant abd_chunks pointers (but not the underlying data). 
However, to + * provide arbitrary rather than only chunk-aligned starting offsets, it also + * tracks an abd_offset field which represents the starting point of the data + * within the first chunk in abd_chunks. For both linear and scattered ABDs, + * creating an offset ABD marks the original ABD as the offset's parent, and the + * original ABD's abd_children refcount is incremented. This data allows us to + * ensure the root ABD isn't deleted before its children. + * + * Most consumers should never need to know what type of ABD they're using -- + * the ABD public API ensures that it's possible to transparently switch from + * using a linear ABD to a scattered one when doing so would be beneficial. + * + * If you need to use the data within an ABD directly, if you know it's linear + * (because you allocated it) you can use abd_to_buf() to access the underlying + * raw buffer. Otherwise, you should use one of the abd_borrow_buf* functions + * which will allocate a raw buffer if necessary. Use the abd_return_buf* + * functions to return any raw buffers that are no longer necessary when you're + * done using them. + * + * There are a variety of ABD APIs that implement basic buffer operations: + * compare, copy, read, write, and fill with zeroes. If you need a custom + * function which progressively accesses the whole ABD, use the abd_iterate_* + * functions. + * + * As an additional feature, linear and scatter ABD's can be stitched together + * by using the gang ABD type (abd_alloc_gang_abd()). This allows for + * multiple ABDs to be viewed as a singular ABD. + * + * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to + * B_FALSE. + */ + +#include +#include +#include +#include +#include + +/* see block comment above for description */ +int zfs_abd_scatter_enabled = B_TRUE; + +boolean_t +abd_is_linear(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE); +} + +boolean_t +abd_is_linear_page(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ? + B_TRUE : B_FALSE); +} + +boolean_t +abd_is_gang(abd_t *abd) +{ + return ((abd->abd_flags & ABD_FLAG_GANG) != 0 ? B_TRUE : + B_FALSE); +} + +void +abd_verify(abd_t *abd) +{ + ASSERT3U(abd->abd_size, >, 0); + ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR | + ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE | + ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE | ABD_FLAG_GANG | + ABD_FLAG_GANG_FREE | ABD_FLAG_ZEROS)); + IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER)); + IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) { + ASSERT3P(ABD_LINEAR_BUF(abd), !=, NULL); + } else if (abd_is_gang(abd)) { + uint_t child_sizes = 0; + for (abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + cabd != NULL; + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) { + ASSERT(list_link_active(&cabd->abd_gang_link)); + child_sizes += cabd->abd_size; + abd_verify(cabd); + } + ASSERT3U(abd->abd_size, ==, child_sizes); + } else { + abd_verify_scatter(abd); + } +} + +uint_t +abd_get_size(abd_t *abd) +{ + abd_verify(abd); + return (abd->abd_size); +} + +/* + * Allocate an ABD, along with its own underlying data buffers. Use this if you + * don't care whether the ABD is linear or not. 
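+ *
+ * A minimal usage sketch (editor's illustration, not part of the original
+ * change), pairing abd_alloc() with the borrow/return helpers defined
+ * later in this file:
+ *
+ *	abd_t *abd = abd_alloc(4096, B_FALSE);
+ *	void *buf = abd_borrow_buf_copy(abd, 4096);
+ *	... fill or inspect buf as a plain 4K buffer ...
+ *	abd_return_buf_copy(abd, buf, 4096);
+ *	abd_free(abd);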
+ */ +abd_t * +abd_alloc(size_t size, boolean_t is_metadata) +{ + if (!zfs_abd_scatter_enabled || abd_size_alloc_linear(size)) + return (abd_alloc_linear(size, is_metadata)); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd_t *abd = abd_alloc_struct(size); + abd->abd_flags = ABD_FLAG_OWNER; + abd->abd_u.abd_scatter.abd_offset = 0; + abd_alloc_chunks(abd, size); + + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + abd_update_scatter_stats(abd, ABDSTAT_INCR); + + return (abd); +} + +static void +abd_free_scatter(abd_t *abd) +{ + abd_free_chunks(abd); + + zfs_refcount_destroy(&abd->abd_children); + abd_update_scatter_stats(abd, ABDSTAT_DECR); + abd_free_struct(abd); +} + +static void +abd_put_gang_abd(abd_t *abd) +{ + ASSERT(abd_is_gang(abd)); + abd_t *cabd; + + while ((cabd = list_remove_head(&ABD_GANG(abd).abd_gang_chain)) + != NULL) { + ASSERT0(cabd->abd_flags & ABD_FLAG_GANG_FREE); + abd->abd_size -= cabd->abd_size; + abd_put(cabd); + } + ASSERT0(abd->abd_size); + list_destroy(&ABD_GANG(abd).abd_gang_chain); +} + +/* + * Free an ABD allocated from abd_get_offset() or abd_get_from_buf(). Will not + * free the underlying scatterlist or buffer. + */ +void +abd_put(abd_t *abd) +{ + if (abd == NULL) + return; + + abd_verify(abd); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + + if (abd->abd_parent != NULL) { + (void) zfs_refcount_remove_many(&abd->abd_parent->abd_children, + abd->abd_size, abd); + } + + if (abd_is_gang(abd)) + abd_put_gang_abd(abd); + + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Allocate an ABD that must be linear, along with its own underlying data + * buffer. Only use this when it would be very annoying to write your ABD + * consumer with a scattered ABD. + */ +abd_t * +abd_alloc_linear(size_t size, boolean_t is_metadata) +{ + abd_t *abd = abd_alloc_struct(0); + + VERIFY3U(size, <=, SPA_MAXBLOCKSIZE); + + abd->abd_flags = ABD_FLAG_LINEAR | ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + abd->abd_size = size; + abd->abd_parent = NULL; + zfs_refcount_create(&abd->abd_children); + + if (is_metadata) { + ABD_LINEAR_BUF(abd) = zio_buf_alloc(size); + } else { + ABD_LINEAR_BUF(abd) = zio_data_buf_alloc(size); + } + + abd_update_linear_stats(abd, ABDSTAT_INCR); + + return (abd); +} + +static void +abd_free_linear(abd_t *abd) +{ + if (abd_is_linear_page(abd)) { + abd_free_linear_page(abd); + return; + } + if (abd->abd_flags & ABD_FLAG_META) { + zio_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); + } else { + zio_data_buf_free(ABD_LINEAR_BUF(abd), abd->abd_size); + } + + zfs_refcount_destroy(&abd->abd_children); + abd_update_linear_stats(abd, ABDSTAT_DECR); + + abd_free_struct(abd); +} + +static void +abd_free_gang_abd(abd_t *abd) +{ + ASSERT(abd_is_gang(abd)); + abd_t *cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + + while (cabd != NULL) { + /* + * We must acquire the child ABDs mutex to ensure that if it + * is being added to another gang ABD we will set the link + * as inactive when removing it from this gang ABD and before + * adding it to the other gang ABD. 
+ */ + mutex_enter(&cabd->abd_mtx); + ASSERT(list_link_active(&cabd->abd_gang_link)); + list_remove(&ABD_GANG(abd).abd_gang_chain, cabd); + mutex_exit(&cabd->abd_mtx); + abd->abd_size -= cabd->abd_size; + if (cabd->abd_flags & ABD_FLAG_GANG_FREE) { + if (cabd->abd_flags & ABD_FLAG_OWNER) + abd_free(cabd); + else + abd_put(cabd); + } + cabd = list_head(&ABD_GANG(abd).abd_gang_chain); + } + ASSERT0(abd->abd_size); + list_destroy(&ABD_GANG(abd).abd_gang_chain); + zfs_refcount_destroy(&abd->abd_children); + abd_free_struct(abd); +} + +/* + * Free an ABD. Only use this on ABDs allocated with abd_alloc(), + * abd_alloc_linear(), or abd_alloc_gang_abd(). + */ +void +abd_free(abd_t *abd) +{ + if (abd == NULL) + return; + + abd_verify(abd); + ASSERT3P(abd->abd_parent, ==, NULL); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + if (abd_is_linear(abd)) + abd_free_linear(abd); + else if (abd_is_gang(abd)) + abd_free_gang_abd(abd); + else + abd_free_scatter(abd); +} + +/* + * Allocate an ABD of the same format (same metadata flag, same scatterize + * setting) as another ABD. + */ +abd_t * +abd_alloc_sametype(abd_t *sabd, size_t size) +{ + boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0; + if (abd_is_linear(sabd) && + !abd_is_linear_page(sabd)) { + return (abd_alloc_linear(size, is_metadata)); + } else { + return (abd_alloc(size, is_metadata)); + } +} + + +/* + * Create a gang ABD that will be the head of a list of ABDs. This is used + * to "chain" scatter/gather lists together when constructing aggregated + * I/Os. To free this abd, abd_free() must be called. + */ +abd_t * +abd_alloc_gang_abd(void) +{ + abd_t *abd; + + abd = abd_alloc_struct(0); + abd->abd_flags = ABD_FLAG_GANG | ABD_FLAG_OWNER; + abd->abd_size = 0; + abd->abd_parent = NULL; + list_create(&ABD_GANG(abd).abd_gang_chain, + sizeof (abd_t), offsetof(abd_t, abd_gang_link)); + zfs_refcount_create(&abd->abd_children); + return (abd); +} + +/* + * Add a child gang ABD to a parent gang ABD's chained list. + */ +static void +abd_gang_add_gang(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) +{ + ASSERT(abd_is_gang(pabd)); + ASSERT(abd_is_gang(cabd)); + + if (free_on_free) { + /* + * If the parent is responsible for freeing the child gang + * ABD we will just splice the child's children ABD list to + * the parent's list and immediately free the child gang ABD + * struct. The parent gang ABD's children from the child gang + * will retain all the free_on_free settings after being + * added to the parent's list. + */ + pabd->abd_size += cabd->abd_size; + list_move_tail(&ABD_GANG(pabd).abd_gang_chain, + &ABD_GANG(cabd).abd_gang_chain); + ASSERT(list_is_empty(&ABD_GANG(cabd).abd_gang_chain)); + abd_verify(pabd); + abd_free_struct(cabd); + } else { + for (abd_t *child = list_head(&ABD_GANG(cabd).abd_gang_chain); + child != NULL; + child = list_next(&ABD_GANG(cabd).abd_gang_chain, child)) { + /* + * We always pass B_FALSE for free_on_free as it is the + * original child gang ABD's responsibility to determine + * if any of its child ABDs should be freed on the call + * to abd_free(). + */ + abd_gang_add(pabd, child, B_FALSE); + } + abd_verify(pabd); + } +} + +/* + * Add a child ABD to a gang ABD's chained list. + */ +void +abd_gang_add(abd_t *pabd, abd_t *cabd, boolean_t free_on_free) +{ + ASSERT(abd_is_gang(pabd)); + abd_t *child_abd = NULL; + + /* + * If the child being added is a gang ABD, we will add the + * child's ABDs to the parent gang ABD. This allows us to account + * for the offset correctly in the parent gang ABD.
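+ *
+ * For example (editor's illustration): if cabd is itself a gang ABD
+ * chaining two 4K children, adding it grows pabd->abd_size by 8K,
+ * exactly as if the two children had been added one at a time.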
+ */
+ if (abd_is_gang(cabd)) {
+ ASSERT(!list_link_active(&cabd->abd_gang_link));
+ ASSERT(!list_is_empty(&ABD_GANG(cabd).abd_gang_chain));
+ return (abd_gang_add_gang(pabd, cabd, free_on_free));
+ }
+ ASSERT(!abd_is_gang(cabd));
+
+ /*
+ * In order to verify that an ABD is not already part of
+ * another gang ABD, we must lock the child ABD's abd_mtx
+ * to check its abd_gang_link status. We unlock the abd_mtx
+ * only after it has been added to a gang ABD, which
+ * will update the abd_gang_link's status. See the comment below
+ * for how an ABD can be in multiple gang ABDs simultaneously.
+ */
+ mutex_enter(&cabd->abd_mtx);
+ if (list_link_active(&cabd->abd_gang_link)) {
+ /*
+ * If the child ABD is already part of another
+ * gang ABD then we must allocate a new
+ * ABD to use a separate link. We mark the newly
+ * allocated ABD with ABD_FLAG_GANG_FREE, before
+ * adding it to the gang ABD's list, to make the
+ * gang ABD aware that it is responsible for calling
+ * abd_put(). We use abd_get_offset() in order
+ * to just allocate a new ABD but avoid copying the
+ * data over into the newly allocated ABD.
+ *
+ * An ABD may become part of multiple gang ABDs. For
+ * example, when writing ditto blocks, the same ABD
+ * is used to write 2 or 3 locations with 2 or 3
+ * zio_t's. Each of the zios may be aggregated with
+ * different adjacent zios. zio aggregation uses gang
+ * zios, so the single ABD can become part of multiple
+ * gang zios.
+ *
+ * The ASSERT below is to make sure that if
+ * free_on_free is passed as B_TRUE, the ABD cannot
+ * be in multiple gang ABDs. The gang ABD
+ * cannot be responsible for cleaning up the child
+ * ABD memory allocation if the ABD can be in
+ * multiple gang ABDs at one time.
+ */
+ ASSERT3B(free_on_free, ==, B_FALSE);
+ child_abd = abd_get_offset(cabd, 0);
+ child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
+ } else {
+ child_abd = cabd;
+ if (free_on_free)
+ child_abd->abd_flags |= ABD_FLAG_GANG_FREE;
+ }
+ ASSERT3P(child_abd, !=, NULL);
+
+ list_insert_tail(&ABD_GANG(pabd).abd_gang_chain, child_abd);
+ mutex_exit(&cabd->abd_mtx);
+ pabd->abd_size += child_abd->abd_size;
+}
+
+/*
+ * Locate the ABD for the supplied offset in the gang ABD.
+ * The supplied offset is updated to be relative to the returned ABD.
+ */
+abd_t *
+abd_gang_get_offset(abd_t *abd, size_t *off)
+{
+ abd_t *cabd;
+
+ ASSERT(abd_is_gang(abd));
+ ASSERT3U(*off, <, abd->abd_size);
+ for (cabd = list_head(&ABD_GANG(abd).abd_gang_chain); cabd != NULL;
+ cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd)) {
+ if (*off >= cabd->abd_size)
+ *off -= cabd->abd_size;
+ else
+ return (cabd);
+ }
+ VERIFY3P(cabd, !=, NULL);
+ return (cabd);
+}
+
+/*
+ * Allocate a new ABD to point to offset off of sabd. It shares the underlying
+ * buffer data with sabd. Use abd_put() to free. sabd must not be freed while
+ * any derived ABDs exist.
+ */
+static abd_t *
+abd_get_offset_impl(abd_t *sabd, size_t off, size_t size)
+{
+ abd_t *abd = NULL;
+
+ abd_verify(sabd);
+ ASSERT3U(off, <=, sabd->abd_size);
+
+ if (abd_is_linear(sabd)) {
+ abd = abd_alloc_struct(0);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that
+ * if we own the underlying data buffer, which is not true in
+ * this case. Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+
+ ABD_LINEAR_BUF(abd) = (char *)ABD_LINEAR_BUF(sabd) + off;
+ } else if (abd_is_gang(sabd)) {
+ size_t left = size;
+ abd = abd_alloc_gang_abd();
+ abd->abd_flags &= ~ABD_FLAG_OWNER;
+ for (abd_t *cabd = abd_gang_get_offset(sabd, &off);
+ cabd != NULL && left > 0;
+ cabd = list_next(&ABD_GANG(sabd).abd_gang_chain, cabd)) {
+ int csize = MIN(left, cabd->abd_size - off);
+
+ abd_t *nabd = abd_get_offset_impl(cabd, off, csize);
+ abd_gang_add(abd, nabd, B_FALSE);
+ left -= csize;
+ off = 0;
+ }
+ ASSERT3U(left, ==, 0);
+ } else {
+ abd = abd_get_offset_scatter(sabd, off);
+ }
+
+ abd->abd_size = size;
+ abd->abd_parent = sabd;
+ zfs_refcount_create(&abd->abd_children);
+ (void) zfs_refcount_add_many(&sabd->abd_children, abd->abd_size, abd);
+ return (abd);
+}
+
+abd_t *
+abd_get_offset(abd_t *sabd, size_t off)
+{
+ size_t size = sabd->abd_size > off ? sabd->abd_size - off : 0;
+ VERIFY3U(size, >, 0);
+ return (abd_get_offset_impl(sabd, off, size));
+}
+
+abd_t *
+abd_get_offset_size(abd_t *sabd, size_t off, size_t size)
+{
+ ASSERT3U(off + size, <=, sabd->abd_size);
+ return (abd_get_offset_impl(sabd, off, size));
+}
+
+/*
+ * Return a scatter ABD of the given size that reads as zeros (it is a
+ * view into the shared abd_zero_scatter buffer). In order to free the
+ * returned ABD abd_put() must be called.
+ */
+abd_t *
+abd_get_zeros(size_t size)
+{
+ ASSERT3P(abd_zero_scatter, !=, NULL);
+ ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+ return (abd_get_offset_size(abd_zero_scatter, 0, size));
+}
+
+/*
+ * Allocate a linear ABD structure for buf. You must free this with abd_put()
+ * since the resulting ABD doesn't own its own buffer.
+ */
+abd_t *
+abd_get_from_buf(void *buf, size_t size)
+{
+ abd_t *abd = abd_alloc_struct(0);
+
+ VERIFY3U(size, <=, SPA_MAXBLOCKSIZE);
+
+ /*
+ * Even if this buf is filesystem metadata, we only track that if we
+ * own the underlying data buffer, which is not true in this case.
+ * Therefore, we don't ever use ABD_FLAG_META here.
+ */
+ abd->abd_flags = ABD_FLAG_LINEAR;
+ abd->abd_size = size;
+ abd->abd_parent = NULL;
+ zfs_refcount_create(&abd->abd_children);
+
+ ABD_LINEAR_BUF(abd) = buf;
+
+ return (abd);
+}
+
+/*
+ * Get the raw buffer associated with a linear ABD.
+ */
+void *
+abd_to_buf(abd_t *abd)
+{
+ ASSERT(abd_is_linear(abd));
+ abd_verify(abd);
+ return (ABD_LINEAR_BUF(abd));
+}
+
+/*
+ * Borrow a raw buffer from an ABD without copying the contents of the ABD
+ * into the buffer. If the ABD is scattered, this will allocate a raw buffer
+ * whose contents are undefined. To copy over the existing data in the ABD, use
+ * abd_borrow_buf_copy() instead.
+ */
+void *
+abd_borrow_buf(abd_t *abd, size_t n)
+{
+ void *buf;
+ abd_verify(abd);
+ ASSERT3U(abd->abd_size, >=, n);
+ if (abd_is_linear(abd)) {
+ buf = abd_to_buf(abd);
+ } else {
+ buf = zio_buf_alloc(n);
+ }
+ (void) zfs_refcount_add_many(&abd->abd_children, n, buf);
+ return (buf);
+}
+
+void *
+abd_borrow_buf_copy(abd_t *abd, size_t n)
+{
+ void *buf = abd_borrow_buf(abd, n);
+ if (!abd_is_linear(abd)) {
+ abd_copy_to_buf(buf, abd, n);
+ }
+ return (buf);
+}
+
+/*
+ * Return a borrowed raw buffer to an ABD. If the ABD is scattered, this will
+ * not change the contents of the ABD and will ASSERT that you didn't modify
+ * the buffer since it was borrowed. If you want any changes you made to buf to
+ * be copied back to abd, use abd_return_buf_copy() instead.
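+ *
+ * (Editorial illustration, not upstream text -- the usual borrow
+ * pattern for a caller that needs a flat buffer it will modify:
+ *
+ *	void *tmp = abd_borrow_buf_copy(abd, abd->abd_size);
+ *	... modify tmp ...
+ *	abd_return_buf_copy(abd, tmp, abd->abd_size);
+ *
+ * A read-only caller would pair abd_borrow_buf_copy() with plain
+ * abd_return_buf(), which avoids the copy back into the ABD.)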
+ */ +void +abd_return_buf(abd_t *abd, void *buf, size_t n) +{ + abd_verify(abd); + ASSERT3U(abd->abd_size, >=, n); + if (abd_is_linear(abd)) { + ASSERT3P(buf, ==, abd_to_buf(abd)); + } else { + ASSERT0(abd_cmp_buf(abd, buf, n)); + zio_buf_free(buf, n); + } + (void) zfs_refcount_remove_many(&abd->abd_children, n, buf); +} + +void +abd_return_buf_copy(abd_t *abd, void *buf, size_t n) +{ + if (!abd_is_linear(abd)) { + abd_copy_from_buf(abd, buf, n); + } + abd_return_buf(abd, buf, n); +} + +void +abd_release_ownership_of_buf(abd_t *abd) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(abd->abd_flags & ABD_FLAG_OWNER); + + /* + * abd_free() needs to handle LINEAR_PAGE ABD's specially. + * Since that flag does not survive the + * abd_release_ownership_of_buf() -> abd_get_from_buf() -> + * abd_take_ownership_of_buf() sequence, we don't allow releasing + * these "linear but not zio_[data_]buf_alloc()'ed" ABD's. + */ + ASSERT(!abd_is_linear_page(abd)); + + abd_verify(abd); + + abd->abd_flags &= ~ABD_FLAG_OWNER; + /* Disable this flag since we no longer own the data buffer */ + abd->abd_flags &= ~ABD_FLAG_META; + + abd_update_linear_stats(abd, ABDSTAT_DECR); +} + + +/* + * Give this ABD ownership of the buffer that it's storing. Can only be used on + * linear ABDs which were allocated via abd_get_from_buf(), or ones allocated + * with abd_alloc_linear() which subsequently released ownership of their buf + * with abd_release_ownership_of_buf(). + */ +void +abd_take_ownership_of_buf(abd_t *abd, boolean_t is_metadata) +{ + ASSERT(abd_is_linear(abd)); + ASSERT(!(abd->abd_flags & ABD_FLAG_OWNER)); + abd_verify(abd); + + abd->abd_flags |= ABD_FLAG_OWNER; + if (is_metadata) { + abd->abd_flags |= ABD_FLAG_META; + } + + abd_update_linear_stats(abd, ABDSTAT_INCR); +} + +/* + * Initializes an abd_iter based on whether the abd is a gang ABD + * or just a single ABD. + */ +static inline abd_t * +abd_init_abd_iter(abd_t *abd, struct abd_iter *aiter, size_t off) +{ + abd_t *cabd = NULL; + + if (abd_is_gang(abd)) { + cabd = abd_gang_get_offset(abd, &off); + if (cabd) { + abd_iter_init(aiter, cabd); + abd_iter_advance(aiter, off); + } + } else { + abd_iter_init(aiter, abd); + abd_iter_advance(aiter, off); + } + return (cabd); +} + +/* + * Advances an abd_iter. We have to be careful with gang ABD as + * advancing could mean that we are at the end of a particular ABD and + * must grab the ABD in the gang ABD's list. 
+ */ +static inline abd_t * +abd_advance_abd_iter(abd_t *abd, abd_t *cabd, struct abd_iter *aiter, + size_t len) +{ + abd_iter_advance(aiter, len); + if (abd_is_gang(abd) && abd_iter_at_end(aiter)) { + ASSERT3P(cabd, !=, NULL); + cabd = list_next(&ABD_GANG(abd).abd_gang_chain, cabd); + if (cabd) { + abd_iter_init(aiter, cabd); + abd_iter_advance(aiter, 0); + } + } + return (cabd); +} + +int +abd_iterate_func(abd_t *abd, size_t off, size_t size, + abd_iter_func_t *func, void *private) +{ + int ret = 0; + struct abd_iter aiter; + boolean_t abd_multi; + abd_t *c_abd; + + abd_verify(abd); + ASSERT3U(off + size, <=, abd->abd_size); + + abd_multi = abd_is_gang(abd); + c_abd = abd_init_abd_iter(abd, &aiter, off); + + while (size > 0) { + /* If we are at the end of the gang ABD we are done */ + if (abd_multi && !c_abd) + break; + + abd_iter_map(&aiter); + + size_t len = MIN(aiter.iter_mapsize, size); + ASSERT3U(len, >, 0); + + ret = func(aiter.iter_mapaddr, len, private); + + abd_iter_unmap(&aiter); + + if (ret != 0) + break; + + size -= len; + c_abd = abd_advance_abd_iter(abd, c_abd, &aiter, len); + } + + return (ret); +} + +struct buf_arg { + void *arg_buf; +}; + +static int +abd_copy_to_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(ba_ptr->arg_buf, buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy abd to buf. (off is the offset in abd.) + */ +void +abd_copy_to_buf_off(void *buf, abd_t *abd, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_to_buf_off_cb, + &ba_ptr); +} + +static int +abd_cmp_buf_off_cb(void *buf, size_t size, void *private) +{ + int ret; + struct buf_arg *ba_ptr = private; + + ret = memcmp(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (ret); +} + +/* + * Compare the contents of abd to buf. (off is the offset in abd.) + */ +int +abd_cmp_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + return (abd_iterate_func(abd, off, size, abd_cmp_buf_off_cb, &ba_ptr)); +} + +static int +abd_copy_from_buf_off_cb(void *buf, size_t size, void *private) +{ + struct buf_arg *ba_ptr = private; + + (void) memcpy(buf, ba_ptr->arg_buf, size); + ba_ptr->arg_buf = (char *)ba_ptr->arg_buf + size; + + return (0); +} + +/* + * Copy from buf to abd. (off is the offset in abd.) + */ +void +abd_copy_from_buf_off(abd_t *abd, const void *buf, size_t off, size_t size) +{ + struct buf_arg ba_ptr = { (void *) buf }; + + (void) abd_iterate_func(abd, off, size, abd_copy_from_buf_off_cb, + &ba_ptr); +} + +/*ARGSUSED*/ +static int +abd_zero_off_cb(void *buf, size_t size, void *private) +{ + (void) memset(buf, 0, size); + return (0); +} + +/* + * Zero out the abd from a particular offset to the end. + */ +void +abd_zero_off(abd_t *abd, size_t off, size_t size) +{ + (void) abd_iterate_func(abd, off, size, abd_zero_off_cb, NULL); +} + +/* + * Iterate over two ABDs and call func incrementally on the two ABDs' data in + * equal-sized chunks (passed to func as raw buffers). func could be called many + * times during this iteration. 
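+ *
+ * (Editorial illustration, not upstream text -- a hypothetical
+ * two-buffer callback in the style of the ones below, XORing the
+ * source into the destination:
+ *
+ *	static int
+ *	xor_cb(void *dbuf, void *sbuf, size_t size, void *private)
+ *	{
+ *		uint8_t *d = dbuf, *s = sbuf;
+ *
+ *		while (size-- > 0)
+ *			*d++ ^= *s++;
+ *		return (0);	returning nonzero stops the iteration
+ *	}
+ *
+ *	(void) abd_iterate_func2(dabd, sabd, 0, 0, size, xor_cb, NULL);)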
+ */ +int +abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, + size_t size, abd_iter_func2_t *func, void *private) +{ + int ret = 0; + struct abd_iter daiter, saiter; + boolean_t dabd_is_gang_abd, sabd_is_gang_abd; + abd_t *c_dabd, *c_sabd; + + abd_verify(dabd); + abd_verify(sabd); + + ASSERT3U(doff + size, <=, dabd->abd_size); + ASSERT3U(soff + size, <=, sabd->abd_size); + + dabd_is_gang_abd = abd_is_gang(dabd); + sabd_is_gang_abd = abd_is_gang(sabd); + c_dabd = abd_init_abd_iter(dabd, &daiter, doff); + c_sabd = abd_init_abd_iter(sabd, &saiter, soff); + + while (size > 0) { + /* if we are at the end of the gang ABD we are done */ + if ((dabd_is_gang_abd && !c_dabd) || + (sabd_is_gang_abd && !c_sabd)) + break; + + abd_iter_map(&daiter); + abd_iter_map(&saiter); + + size_t dlen = MIN(daiter.iter_mapsize, size); + size_t slen = MIN(saiter.iter_mapsize, size); + size_t len = MIN(dlen, slen); + ASSERT(dlen > 0 || slen > 0); + + ret = func(daiter.iter_mapaddr, saiter.iter_mapaddr, len, + private); + + abd_iter_unmap(&saiter); + abd_iter_unmap(&daiter); + + if (ret != 0) + break; + + size -= len; + c_dabd = + abd_advance_abd_iter(dabd, c_dabd, &daiter, len); + c_sabd = + abd_advance_abd_iter(sabd, c_sabd, &saiter, len); + } + + return (ret); +} + +/*ARGSUSED*/ +static int +abd_copy_off_cb(void *dbuf, void *sbuf, size_t size, void *private) +{ + (void) memcpy(dbuf, sbuf, size); + return (0); +} + +/* + * Copy from sabd to dabd starting from soff and doff. + */ +void +abd_copy_off(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, size_t size) +{ + (void) abd_iterate_func2(dabd, sabd, doff, soff, size, + abd_copy_off_cb, NULL); +} + +/*ARGSUSED*/ +static int +abd_cmp_cb(void *bufa, void *bufb, size_t size, void *private) +{ + return (memcmp(bufa, bufb, size)); +} + +/* + * Compares the contents of two ABDs. + */ +int +abd_cmp(abd_t *dabd, abd_t *sabd) +{ + ASSERT3U(dabd->abd_size, ==, sabd->abd_size); + return (abd_iterate_func2(dabd, sabd, 0, 0, dabd->abd_size, + abd_cmp_cb, NULL)); +} + +/* + * Iterate over code ABDs and a data ABD and call @func_raidz_gen. + * + * @cabds parity ABDs, must have equal size + * @dabd data ABD. Can be NULL (in this case @dsize = 0) + * @func_raidz_gen should be implemented so that its behaviour + * is the same when taking linear and when taking scatter + */ +void +abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, + ssize_t csize, ssize_t dsize, const unsigned parity, + void (*func_raidz_gen)(void **, const void *, size_t, size_t)) +{ + int i; + ssize_t len, dlen; + struct abd_iter caiters[3]; + struct abd_iter daiter = {0}; + void *caddrs[3]; + unsigned long flags __maybe_unused = 0; + abd_t *c_cabds[3]; + abd_t *c_dabd = NULL; + boolean_t cabds_is_gang_abd[3]; + boolean_t dabd_is_gang_abd = B_FALSE; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); + } + + if (dabd) { + dabd_is_gang_abd = abd_is_gang(dabd); + c_dabd = abd_init_abd_iter(dabd, &daiter, 0); + } + + ASSERT3S(dsize, >=, 0); + + abd_enter_critical(flags); + while (csize > 0) { + /* if we are at the end of the gang ABD we are done */ + if (dabd_is_gang_abd && !c_dabd) + break; + + for (i = 0; i < parity; i++) { + /* + * If we are at the end of the gang ABD we are + * done. 
+ */ + if (cabds_is_gang_abd[i] && !c_cabds[i]) + break; + abd_iter_map(&caiters[i]); + caddrs[i] = caiters[i].iter_mapaddr; + } + + len = csize; + + if (dabd && dsize > 0) + abd_iter_map(&daiter); + + switch (parity) { + case 3: + len = MIN(caiters[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(caiters[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(caiters[0].iter_mapsize, len); + } + + /* must be progressive */ + ASSERT3S(len, >, 0); + + if (dabd && dsize > 0) { + /* this needs precise iter.length */ + len = MIN(daiter.iter_mapsize, len); + dlen = len; + } else + dlen = 0; + + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). + */ + ASSERT3U(((uint64_t)len & 511ULL), ==, 0); + + func_raidz_gen(caddrs, daiter.iter_mapaddr, len, dlen); + + for (i = parity-1; i >= 0; i--) { + abd_iter_unmap(&caiters[i]); + c_cabds[i] = + abd_advance_abd_iter(cabds[i], c_cabds[i], + &caiters[i], len); + } + + if (dabd && dsize > 0) { + abd_iter_unmap(&daiter); + c_dabd = + abd_advance_abd_iter(dabd, c_dabd, &daiter, + dlen); + dsize -= dlen; + } + + csize -= len; + + ASSERT3S(dsize, >=, 0); + ASSERT3S(csize, >=, 0); + } + abd_exit_critical(flags); +} + +/* + * Iterate over code ABDs and data reconstruction target ABDs and call + * @func_raidz_rec. Function maps at most 6 pages atomically. + * + * @cabds parity ABDs, must have equal size + * @tabds rec target ABDs, at most 3 + * @tsize size of data target columns + * @func_raidz_rec expects syndrome data in target columns. Function + * reconstructs data and overwrites target columns. + */ +void +abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, + ssize_t tsize, const unsigned parity, + void (*func_raidz_rec)(void **t, const size_t tsize, void **c, + const unsigned *mul), + const unsigned *mul) +{ + int i; + ssize_t len; + struct abd_iter citers[3]; + struct abd_iter xiters[3]; + void *caddrs[3], *xaddrs[3]; + unsigned long flags __maybe_unused = 0; + boolean_t cabds_is_gang_abd[3]; + boolean_t tabds_is_gang_abd[3]; + abd_t *c_cabds[3]; + abd_t *c_tabds[3]; + + ASSERT3U(parity, <=, 3); + + for (i = 0; i < parity; i++) { + cabds_is_gang_abd[i] = abd_is_gang(cabds[i]); + tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + c_cabds[i] = + abd_init_abd_iter(cabds[i], &citers[i], 0); + c_tabds[i] = + abd_init_abd_iter(tabds[i], &xiters[i], 0); + } + + abd_enter_critical(flags); + while (tsize > 0) { + + for (i = 0; i < parity; i++) { + /* + * If we are at the end of the gang ABD we + * are done. + */ + if (cabds_is_gang_abd[i] && !c_cabds[i]) + break; + if (tabds_is_gang_abd[i] && !c_tabds[i]) + break; + abd_iter_map(&citers[i]); + abd_iter_map(&xiters[i]); + caddrs[i] = citers[i].iter_mapaddr; + xaddrs[i] = xiters[i].iter_mapaddr; + } + + len = tsize; + switch (parity) { + case 3: + len = MIN(xiters[2].iter_mapsize, len); + len = MIN(citers[2].iter_mapsize, len); + /* falls through */ + case 2: + len = MIN(xiters[1].iter_mapsize, len); + len = MIN(citers[1].iter_mapsize, len); + /* falls through */ + case 1: + len = MIN(xiters[0].iter_mapsize, len); + len = MIN(citers[0].iter_mapsize, len); + } + /* must be progressive */ + ASSERT3S(len, >, 0); + /* + * The iterated function likely will not do well if each + * segment except the last one is not multiple of 512 (raidz). 
+ */
+ ASSERT3U(((uint64_t)len & 511ULL), ==, 0);
+
+ func_raidz_rec(xaddrs, len, caddrs, mul);
+
+ for (i = parity-1; i >= 0; i--) {
+ abd_iter_unmap(&xiters[i]);
+ abd_iter_unmap(&citers[i]);
+ c_tabds[i] =
+ abd_advance_abd_iter(tabds[i], c_tabds[i],
+ &xiters[i], len);
+ c_cabds[i] =
+ abd_advance_abd_iter(cabds[i], c_cabds[i],
+ &citers[i], len);
+ }
+
+ tsize -= len;
+ ASSERT3S(tsize, >=, 0);
+ }
+ abd_exit_critical(flags);
+}
diff --git a/module/zfs/aggsum.c b/module/zfs/aggsum.c
new file mode 100644
index 000000000000..a2fec27744e1
--- /dev/null
+++ b/module/zfs/aggsum.c
@@ -0,0 +1,237 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/aggsum.h>
+
+/*
+ * Aggregate-sum counters are a form of fanned-out counter, used when atomic
+ * instructions on a single field cause enough CPU cache line contention to
+ * slow system performance. Due to their increased overhead and the expense
+ * involved with precisely reading from them, they should only be used in cases
+ * where the write rate (increment/decrement) is much higher than the read rate
+ * (get value).
+ *
+ * Aggregate sum counters are composed of two basic parts, the core and the
+ * buckets. The core counter contains a lock for the entire counter, as well
+ * as the current upper and lower bounds on the value of the counter. The
+ * aggsum_bucket structure contains a per-bucket lock to protect the contents of
+ * the bucket, the current amount that this bucket has changed from the global
+ * counter (called the delta), and the amount of increment and decrement we have
+ * "borrowed" from the core counter.
+ *
+ * The basic operation of an aggsum is simple. Threads that wish to modify the
+ * counter will modify one bucket's counter (determined by their current CPU, to
+ * help minimize lock and cache contention). If the bucket already has
+ * sufficient capacity borrowed from the core structure to handle their request,
+ * they simply modify the delta and return. If the bucket does not, we clear
+ * the bucket's current state (to prevent the borrowed amounts from getting too
+ * large), and borrow more from the core counter. Borrowing is done by adding to
+ * the upper bound (or subtracting from the lower bound) of the core counter,
+ * and setting the borrow value for the bucket to the amount added (or
+ * subtracted). Clearing the bucket is the opposite; we add the current delta
+ * to both the lower and upper bounds of the core counter, subtract the borrowed
+ * increment from the upper bound, and add the borrowed decrement to the
+ * lower bound. Note that only borrowing and clearing require access to the
+ * core counter; since all other operations access CPU-local resources,
+ * performance can be much higher than a traditional counter.
+ *
+ * Threads that wish to read from the counter have a slightly more challenging
+ * task. It is fast to determine the upper and lower bounds of the aggsum; this
+ * does not require grabbing any locks.
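+ * (Editorial illustration, not upstream text: e.g.
+ *	int64_t lo = aggsum_lower_bound(as);
+ *	int64_t hi = aggsum_upper_bound(as);
+ * are two plain loads of the core's bounds, taken without the lock.)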
This suffices for cases where an + * approximation of the aggsum's value is acceptable. However, if one needs to + * know whether some specific value is above or below the current value in the + * aggsum, they invoke aggsum_compare(). This function operates by repeatedly + * comparing the target value to the upper and lower bounds of the aggsum, and + * then clearing a bucket. This proceeds until the target is outside of the + * upper and lower bounds and we return a response, or the last bucket has been + * cleared and we know that the target is equal to the aggsum's value. Finally, + * the most expensive operation is determining the precise value of the aggsum. + * To do this, we clear every bucket and then return the upper bound (which must + * be equal to the lower bound). What makes aggsum_compare() and aggsum_value() + * expensive is clearing buckets. This involves grabbing the global lock + * (serializing against themselves and borrow operations), grabbing a bucket's + * lock (preventing threads on those CPUs from modifying their delta), and + * zeroing out the borrowed value (forcing that thread to borrow on its next + * request, which will also be expensive). This is what makes aggsums well + * suited for write-many read-rarely operations. + */ + +/* + * We will borrow aggsum_borrow_multiplier times the current request, so we will + * have to get the as_lock approximately every aggsum_borrow_multiplier calls to + * aggsum_delta(). + */ +static uint_t aggsum_borrow_multiplier = 10; + +void +aggsum_init(aggsum_t *as, uint64_t value) +{ + bzero(as, sizeof (*as)); + as->as_lower_bound = as->as_upper_bound = value; + mutex_init(&as->as_lock, NULL, MUTEX_DEFAULT, NULL); + as->as_numbuckets = boot_ncpus; + as->as_buckets = kmem_zalloc(boot_ncpus * sizeof (aggsum_bucket_t), + KM_SLEEP); + for (int i = 0; i < as->as_numbuckets; i++) { + mutex_init(&as->as_buckets[i].asc_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +void +aggsum_fini(aggsum_t *as) +{ + for (int i = 0; i < as->as_numbuckets; i++) + mutex_destroy(&as->as_buckets[i].asc_lock); + kmem_free(as->as_buckets, as->as_numbuckets * sizeof (aggsum_bucket_t)); + mutex_destroy(&as->as_lock); +} + +int64_t +aggsum_lower_bound(aggsum_t *as) +{ + return (as->as_lower_bound); +} + +int64_t +aggsum_upper_bound(aggsum_t *as) +{ + return (as->as_upper_bound); +} + +static void +aggsum_flush_bucket(aggsum_t *as, struct aggsum_bucket *asb) +{ + ASSERT(MUTEX_HELD(&as->as_lock)); + ASSERT(MUTEX_HELD(&asb->asc_lock)); + + /* + * We use atomic instructions for this because we read the upper and + * lower bounds without the lock, so we need stores to be atomic. 
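+ *
+ * (Editorial worked example, not upstream text, with
+ * aggsum_borrow_multiplier == 10 and an aggsum that starts exact at
+ * 0, bounds [0, 0]:
+ *	aggsum_add(+1): slow path, borrows 10; the bucket ends with
+ *	    borrowed = 10, delta = 0, and the bounds widen to [-9, +11].
+ *	aggsum_add(+2): fast path; bucket delta = +2, bounds unchanged.
+ *	flush: lower += delta + borrowed = +12 and upper += delta -
+ *	    borrowed = -8, restoring exact bounds [3, 3].)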
+ */ + atomic_add_64((volatile uint64_t *)&as->as_lower_bound, + asb->asc_delta + asb->asc_borrowed); + atomic_add_64((volatile uint64_t *)&as->as_upper_bound, + asb->asc_delta - asb->asc_borrowed); + asb->asc_delta = 0; + asb->asc_borrowed = 0; +} + +uint64_t +aggsum_value(aggsum_t *as) +{ + int64_t rv; + + mutex_enter(&as->as_lock); + if (as->as_lower_bound == as->as_upper_bound) { + rv = as->as_lower_bound; + for (int i = 0; i < as->as_numbuckets; i++) { + ASSERT0(as->as_buckets[i].asc_delta); + ASSERT0(as->as_buckets[i].asc_borrowed); + } + mutex_exit(&as->as_lock); + return (rv); + } + for (int i = 0; i < as->as_numbuckets; i++) { + struct aggsum_bucket *asb = &as->as_buckets[i]; + mutex_enter(&asb->asc_lock); + aggsum_flush_bucket(as, asb); + mutex_exit(&asb->asc_lock); + } + VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); + rv = as->as_lower_bound; + mutex_exit(&as->as_lock); + + return (rv); +} + +void +aggsum_add(aggsum_t *as, int64_t delta) +{ + struct aggsum_bucket *asb; + int64_t borrow; + + kpreempt_disable(); + asb = &as->as_buckets[CPU_SEQID % as->as_numbuckets]; + kpreempt_enable(); + + /* Try fast path if we already borrowed enough before. */ + mutex_enter(&asb->asc_lock); + if (asb->asc_delta + delta <= (int64_t)asb->asc_borrowed && + asb->asc_delta + delta >= -(int64_t)asb->asc_borrowed) { + asb->asc_delta += delta; + mutex_exit(&asb->asc_lock); + return; + } + mutex_exit(&asb->asc_lock); + + /* + * We haven't borrowed enough. Take the global lock and borrow + * considering what is requested now and what we borrowed before. + */ + borrow = (delta < 0 ? -delta : delta) * aggsum_borrow_multiplier; + mutex_enter(&as->as_lock); + mutex_enter(&asb->asc_lock); + delta += asb->asc_delta; + asb->asc_delta = 0; + if (borrow >= asb->asc_borrowed) + borrow -= asb->asc_borrowed; + else + borrow = (borrow - (int64_t)asb->asc_borrowed) / 4; + asb->asc_borrowed += borrow; + atomic_add_64((volatile uint64_t *)&as->as_lower_bound, + delta - borrow); + atomic_add_64((volatile uint64_t *)&as->as_upper_bound, + delta + borrow); + mutex_exit(&asb->asc_lock); + mutex_exit(&as->as_lock); +} + +/* + * Compare the aggsum value to target efficiently. Returns -1 if the value + * represented by the aggsum is less than target, 1 if it's greater, and 0 if + * they are equal. + */ +int +aggsum_compare(aggsum_t *as, uint64_t target) +{ + if (as->as_upper_bound < target) + return (-1); + if (as->as_lower_bound > target) + return (1); + mutex_enter(&as->as_lock); + for (int i = 0; i < as->as_numbuckets; i++) { + struct aggsum_bucket *asb = &as->as_buckets[i]; + mutex_enter(&asb->asc_lock); + aggsum_flush_bucket(as, asb); + mutex_exit(&asb->asc_lock); + if (as->as_upper_bound < target) { + mutex_exit(&as->as_lock); + return (-1); + } + if (as->as_lower_bound > target) { + mutex_exit(&as->as_lock); + return (1); + } + } + VERIFY3U(as->as_lower_bound, ==, as->as_upper_bound); + ASSERT3U(as->as_lower_bound, ==, target); + mutex_exit(&as->as_lock); + return (0); +} diff --git a/module/zfs/arc.c b/module/zfs/arc.c new file mode 100644 index 000000000000..bd1a993dca92 --- /dev/null +++ b/module/zfs/arc.c @@ -0,0 +1,10565 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018, Joyent, Inc.
+ * Copyright (c) 2011, 2019, Delphix. All rights reserved.
+ * Copyright (c) 2014, Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2019, loli10K . All rights reserved.
+ * Copyright (c) 2020, George Amanakis. All rights reserved.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+/*
+ * DVA-based Adjustable Replacement Cache
+ *
+ * While much of the theory of operation used here is
+ * based on the self-tuning, low overhead replacement cache
+ * presented by Megiddo and Modha at FAST 2003, there are some
+ * significant differences:
+ *
+ * 1. The Megiddo and Modha model assumes any page is evictable.
+ * Pages in its cache cannot be "locked" into memory. This makes
+ * the eviction algorithm simple: evict the last page in the list.
+ * This also makes the performance characteristics easy to reason
+ * about. Our cache is not so simple. At any given moment, some
+ * subset of the blocks in the cache are un-evictable because we
+ * have handed out a reference to them. Blocks are only evictable
+ * when there are no external references active. This makes
+ * eviction far more problematic: we choose to evict the evictable
+ * blocks that are the "lowest" in the list.
+ *
+ * There are times when it is not possible to evict the requested
+ * space. In these circumstances we are unable to adjust the cache
+ * size. To prevent the cache from growing unbounded at these times we
+ * implement a "cache throttle" that slows the flow of new data
+ * into the cache until we can make space available.
+ *
+ * 2. The Megiddo and Modha model assumes a fixed cache size.
+ * Pages are evicted when the cache is full and there is a cache
+ * miss. Our model has a variable sized cache. It grows with
+ * high use, but also tries to react to memory pressure from the
+ * operating system: decreasing its size when system memory is
+ * tight.
+ *
+ * 3. The Megiddo and Modha model assumes a fixed page size. All
+ * elements of the cache are therefore exactly the same size. So
+ * when adjusting the cache size following a cache miss, it's simply
+ * a matter of choosing a single page to evict. In our model, we
+ * have variable sized cache blocks (ranging from 512 bytes to
+ * 128K bytes). We therefore choose a set of blocks to evict to make
+ * space for a cache miss that approximates as closely as possible
+ * the space used by the new block.
+ *
+ * See also: "ARC: A Self-Tuning, Low Overhead Replacement Cache"
+ * by N. Megiddo & D. Modha, FAST 2003
+ */
+
+/*
+ * The locking model:
+ *
+ * A new reference to a cache buffer can be obtained in two
+ * ways: 1) via a hash table lookup using the DVA as a key,
+ * or 2) via one of the ARC lists. The arc_read() interface
+ * uses method 1, while the internal ARC algorithms for
+ * adjusting the cache use method 2. We therefore provide two
+ * types of locks: 1) the hash table lock array, and 2) the
+ * ARC list locks.
+ *
+ * Buffers do not have their own mutexes, rather they rely on the
+ * hash table mutexes for the bulk of their protection (i.e. most
+ * fields in the arc_buf_hdr_t are protected by these mutexes).
+ *
+ * buf_hash_find() returns the appropriate mutex (held) when it
+ * locates the requested buffer in the hash table. It returns
+ * NULL for the mutex if the buffer was not in the table.
+ *
+ * buf_hash_remove() expects the appropriate hash mutex to be
+ * already held before it is invoked.
+ *
+ * Each ARC state also has a mutex which is used to protect the
+ * buffer list associated with the state. When attempting to
+ * obtain a hash table lock while holding an ARC list lock you
+ * must use: mutex_tryenter() to avoid deadlock. Also note that
+ * the active state mutex must be held before the ghost state mutex.
+ *
+ * It is also possible to register a callback which is run when the
+ * arc_meta_limit is reached and no buffers can be safely evicted. In
+ * this case the arc user should drop a reference on some arc buffers so
+ * they can be reclaimed and the arc_meta_limit honored. For example,
+ * when using the ZPL each dentry holds a reference on a znode. These
+ * dentries must be pruned before the arc buffer holding the znode can
+ * be safely evicted.
+ *
+ * Note that the majority of the performance stats are manipulated
+ * with atomic operations.
+ *
+ * The L2ARC uses the l2ad_mtx on each vdev for the following:
+ *
+ * - L2ARC buflist creation
+ * - L2ARC buflist eviction
+ * - L2ARC write completion, which walks L2ARC buflists
+ * - ARC header destruction, as it removes from L2ARC buflists
+ * - ARC header release, as it removes from L2ARC buflists
+ */
+
+/*
+ * ARC operation:
+ *
+ * Every block that is in the ARC is tracked by an arc_buf_hdr_t structure.
+ * This structure can point either to a block that is still in the cache or to
+ * one that is only accessible in an L2 ARC device, or it can provide
+ * information about a block that was recently evicted. If a block is
+ * only accessible in the L2ARC, then the arc_buf_hdr_t only has enough
+ * information to retrieve it from the L2ARC device. This information is
+ * stored in the l2arc_buf_hdr_t sub-structure of the arc_buf_hdr_t. A block
+ * that is in this state cannot access the data directly.
+ *
+ * Blocks that are actively being referenced or have not been evicted
+ * are cached in the L1ARC. The L1ARC (l1arc_buf_hdr_t) is a structure within
+ * the arc_buf_hdr_t that will point to the data block in memory. A block can
+ * only be read by a consumer if it has an l1arc_buf_hdr_t. The L1ARC
+ * caches data in two ways -- in a list of ARC buffers (arc_buf_t) and
+ * also in the arc_buf_hdr_t's private physical data block pointer (b_pabd).
+ *
+ * The L1ARC's data pointer may or may not be uncompressed. The ARC has the
+ * ability to store the physical data (b_pabd) associated with the DVA of the
+ * arc_buf_hdr_t. Since the b_pabd is a copy of the on-disk physical block,
+ * it will match its on-disk compression characteristics. This behavior can be
+ * disabled by setting 'zfs_compressed_arc_enabled' to B_FALSE. When the
+ * compressed ARC functionality is disabled, the b_pabd will point to an
+ * uncompressed version of the on-disk data.
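+ *
+ * (Editorial note, not upstream text: on Linux builds this is
+ * typically exposed as a module parameter, e.g.
+ *
+ *	echo 0 > /sys/module/zfs/parameters/zfs_compressed_arc_enabled
+ *
+ * though the exact interface is platform-dependent.)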
+ * + * Data in the L1ARC is not accessed by consumers of the ARC directly. Each + * arc_buf_hdr_t can have multiple ARC buffers (arc_buf_t) which reference it. + * Each ARC buffer (arc_buf_t) is being actively accessed by a specific ARC + * consumer. The ARC will provide references to this data and will keep it + * cached until it is no longer in use. The ARC caches only the L1ARC's physical + * data block and will evict any arc_buf_t that is no longer referenced. The + * amount of memory consumed by the arc_buf_ts' data buffers can be seen via the + * "overhead_size" kstat. + * + * Depending on the consumer, an arc_buf_t can be requested in uncompressed or + * compressed form. The typical case is that consumers will want uncompressed + * data, and when that happens a new data buffer is allocated where the data is + * decompressed for them to use. Currently the only consumer who wants + * compressed arc_buf_t's is "zfs send", when it streams data exactly as it + * exists on disk. When this happens, the arc_buf_t's data buffer is shared + * with the arc_buf_hdr_t. + * + * Here is a diagram showing an arc_buf_hdr_t referenced by two arc_buf_t's. The + * first one is owned by a compressed send consumer (and therefore references + * the same compressed data buffer as the arc_buf_hdr_t) and the second could be + * used by any other consumer (and has its own uncompressed copy of the data + * buffer). + * + * arc_buf_hdr_t + * +-----------+ + * | fields | + * | common to | + * | L1- and | + * | L2ARC | + * +-----------+ + * | l2arc_buf_hdr_t + * | | + * +-----------+ + * | l1arc_buf_hdr_t + * | | arc_buf_t + * | b_buf +------------>+-----------+ arc_buf_t + * | b_pabd +-+ |b_next +---->+-----------+ + * +-----------+ | |-----------| |b_next +-->NULL + * | |b_comp = T | +-----------+ + * | |b_data +-+ |b_comp = F | + * | +-----------+ | |b_data +-+ + * +->+------+ | +-----------+ | + * compressed | | | | + * data | |<--------------+ | uncompressed + * +------+ compressed, | data + * shared +-->+------+ + * data | | + * | | + * +------+ + * + * When a consumer reads a block, the ARC must first look to see if the + * arc_buf_hdr_t is cached. If the hdr is cached then the ARC allocates a new + * arc_buf_t and either copies uncompressed data into a new data buffer from an + * existing uncompressed arc_buf_t, decompresses the hdr's b_pabd buffer into a + * new data buffer, or shares the hdr's b_pabd buffer, depending on whether the + * hdr is compressed and the desired compression characteristics of the + * arc_buf_t consumer. If the arc_buf_t ends up sharing data with the + * arc_buf_hdr_t and both of them are uncompressed then the arc_buf_t must be + * the last buffer in the hdr's b_buf list, however a shared compressed buf can + * be anywhere in the hdr's list. 
+ * + * The diagram below shows an example of an uncompressed ARC hdr that is + * sharing its data with an arc_buf_t (note that the shared uncompressed buf is + * the last element in the buf list): + * + * arc_buf_hdr_t + * +-----------+ + * | | + * | | + * | | + * +-----------+ + * l2arc_buf_hdr_t| | + * | | + * +-----------+ + * l1arc_buf_hdr_t| | + * | | arc_buf_t (shared) + * | b_buf +------------>+---------+ arc_buf_t + * | | |b_next +---->+---------+ + * | b_pabd +-+ |---------| |b_next +-->NULL + * +-----------+ | | | +---------+ + * | |b_data +-+ | | + * | +---------+ | |b_data +-+ + * +->+------+ | +---------+ | + * | | | | + * uncompressed | | | | + * data +------+ | | + * ^ +->+------+ | + * | uncompressed | | | + * | data | | | + * | +------+ | + * +---------------------------------+ + * + * Writing to the ARC requires that the ARC first discard the hdr's b_pabd + * since the physical block is about to be rewritten. The new data contents + * will be contained in the arc_buf_t. As the I/O pipeline performs the write, + * it may compress the data before writing it to disk. The ARC will be called + * with the transformed data and will bcopy the transformed on-disk block into + * a newly allocated b_pabd. Writes are always done into buffers which have + * either been loaned (and hence are new and don't have other readers) or + * buffers which have been released (and hence have their own hdr, if there + * were originally other readers of the buf's original hdr). This ensures that + * the ARC only needs to update a single buf and its hdr after a write occurs. + * + * When the L2ARC is in use, it will also take advantage of the b_pabd. The + * L2ARC will always write the contents of b_pabd to the L2ARC. This means + * that when compressed ARC is enabled that the L2ARC blocks are identical + * to the on-disk block in the main data pool. This provides a significant + * advantage since the ARC can leverage the bp's checksum when reading from the + * L2ARC to determine if the contents are valid. However, if the compressed + * ARC is disabled, then the L2ARC's block must be transformed to look + * like the physical block in the main data pool before comparing the + * checksum and determining its validity. + * + * The L1ARC has a slightly different system for storing encrypted data. + * Raw (encrypted + possibly compressed) data has a few subtle differences from + * data that is just compressed. The biggest difference is that it is not + * possible to decrypt encrypted data (or vice-versa) if the keys aren't loaded. + * The other difference is that encryption cannot be treated as a suggestion. + * If a caller would prefer compressed data, but they actually wind up with + * uncompressed data the worst thing that could happen is there might be a + * performance hit. If the caller requests encrypted data, however, we must be + * sure they actually get it or else secret information could be leaked. Raw + * data is stored in hdr->b_crypt_hdr.b_rabd. An encrypted header, therefore, + * may have both an encrypted version and a decrypted version of its data at + * once. When a caller needs a raw arc_buf_t, it is allocated and the data is + * copied out of this header. To avoid complications with b_pabd, raw buffers + * cannot be shared. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifndef _KERNEL +/* set with ZFS_DEBUG=watch, to enable watchpoints on frozen buffers */ +boolean_t arc_watch = B_FALSE; +#endif + +/* + * This thread's job is to keep enough free memory in the system, by + * calling arc_kmem_reap_soon() plus arc_reduce_target_size(), which improves + * arc_available_memory(). + */ +static zthr_t *arc_reap_zthr; + +/* + * This thread's job is to keep arc_size under arc_c, by calling + * arc_evict(), which improves arc_is_overflowing(). + */ +static zthr_t *arc_evict_zthr; + +static kmutex_t arc_evict_lock; +static boolean_t arc_evict_needed = B_FALSE; + +/* + * Count of bytes evicted since boot. + */ +static uint64_t arc_evict_count; + +/* + * List of arc_evict_waiter_t's, representing threads waiting for the + * arc_evict_count to reach specific values. + */ +static list_t arc_evict_waiters; + +/* + * When arc_is_overflowing(), arc_get_data_impl() waits for this percent of + * the requested amount of data to be evicted. For example, by default for + * every 2KB that's evicted, 1KB of it may be "reused" by a new allocation. + * Since this is above 100%, it ensures that progress is made towards getting + * arc_size under arc_c. Since this is finite, it ensures that allocations + * can still happen, even during the potentially long time that arc_size is + * more than arc_c. + */ +int zfs_arc_eviction_pct = 200; + +/* + * The number of headers to evict in arc_evict_state_impl() before + * dropping the sublist lock and evicting from another sublist. A lower + * value means we're more likely to evict the "correct" header (i.e. the + * oldest header in the arc state), but comes with higher overhead + * (i.e. more invocations of arc_evict_state_impl()). + */ +int zfs_arc_evict_batch_limit = 10; + +/* number of seconds before growing cache again */ +int arc_grow_retry = 5; + +/* + * Minimum time between calls to arc_kmem_reap_soon(). + */ +int arc_kmem_cache_reap_retry_ms = 1000; + +/* shift of arc_c for calculating overflow limit in arc_get_data_impl */ +int zfs_arc_overflow_shift = 8; + +/* shift of arc_c for calculating both min and max arc_p */ +int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +int arc_shrink_shift = 7; + +/* percent of pagecache to reclaim arc to */ +#ifdef _KERNEL +uint_t zfs_arc_pc_percent = 0; +#endif + +/* + * log2(fraction of ARC which must be free to allow growing). + * I.e. If there is less than arc_c >> arc_no_grow_shift free memory, + * when reading a new block into the ARC, we will evict an equal-sized block + * from the ARC. + * + * This must be less than arc_shrink_shift, so that when we shrink the ARC, + * we will still not allow it to grow. + */ +int arc_no_grow_shift = 5; + + +/* + * minimum lifespan of a prefetch block in clock ticks + * (initialized in arc_init()) + */ +static int arc_min_prefetch_ms; +static int arc_min_prescient_prefetch_ms; + +/* + * If this percent of memory is free, don't throttle. + */ +int arc_lotsfree_percent = 10; + +/* + * The arc has filled available memory and has now warmed up. + */ +boolean_t arc_warm; + +/* + * These tunables are for performance analysis. 
+ */ +unsigned long zfs_arc_max = 0; +unsigned long zfs_arc_min = 0; +unsigned long zfs_arc_meta_limit = 0; +unsigned long zfs_arc_meta_min = 0; +unsigned long zfs_arc_dnode_limit = 0; +unsigned long zfs_arc_dnode_reduce_percent = 10; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; +int zfs_arc_average_blocksize = 8 * 1024; /* 8KB */ + +/* + * ARC dirty data constraints for arc_tempreserve_space() throttle. + */ +unsigned long zfs_arc_dirty_limit_percent = 50; /* total dirty data limit */ +unsigned long zfs_arc_anon_limit_percent = 25; /* anon block dirty limit */ +unsigned long zfs_arc_pool_dirty_percent = 20; /* each pool's anon allowance */ + +/* + * Enable or disable compressed arc buffers. + */ +int zfs_compressed_arc_enabled = B_TRUE; + +/* + * ARC will evict meta buffers that exceed arc_meta_limit. This + * tunable make arc_meta_limit adjustable for different workloads. + */ +unsigned long zfs_arc_meta_limit_percent = 75; + +/* + * Percentage that can be consumed by dnodes of ARC meta buffers. + */ +unsigned long zfs_arc_dnode_limit_percent = 10; + +/* + * These tunables are Linux specific + */ +unsigned long zfs_arc_sys_free = 0; +int zfs_arc_min_prefetch_ms = 0; +int zfs_arc_min_prescient_prefetch_ms = 0; +int zfs_arc_p_dampener_disable = 1; +int zfs_arc_meta_prune = 10000; +int zfs_arc_meta_strategy = ARC_STRATEGY_META_BALANCED; +int zfs_arc_meta_adjust_restarts = 4096; +int zfs_arc_lotsfree_percent = 10; + +/* The 6 states: */ +arc_state_t ARC_anon; +arc_state_t ARC_mru; +arc_state_t ARC_mru_ghost; +arc_state_t ARC_mfu; +arc_state_t ARC_mfu_ghost; +arc_state_t ARC_l2c_only; + +arc_stats_t arc_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "demand_data_hits", KSTAT_DATA_UINT64 }, + { "demand_data_misses", KSTAT_DATA_UINT64 }, + { "demand_metadata_hits", KSTAT_DATA_UINT64 }, + { "demand_metadata_misses", KSTAT_DATA_UINT64 }, + { "prefetch_data_hits", KSTAT_DATA_UINT64 }, + { "prefetch_data_misses", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_hits", KSTAT_DATA_UINT64 }, + { "prefetch_metadata_misses", KSTAT_DATA_UINT64 }, + { "mru_hits", KSTAT_DATA_UINT64 }, + { "mru_ghost_hits", KSTAT_DATA_UINT64 }, + { "mfu_hits", KSTAT_DATA_UINT64 }, + { "mfu_ghost_hits", KSTAT_DATA_UINT64 }, + { "deleted", KSTAT_DATA_UINT64 }, + { "mutex_miss", KSTAT_DATA_UINT64 }, + { "access_skip", KSTAT_DATA_UINT64 }, + { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_not_enough", KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, + { "evict_l2_skip", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "p", KSTAT_DATA_UINT64 }, + { "c", KSTAT_DATA_UINT64 }, + { "c_min", KSTAT_DATA_UINT64 }, + { "c_max", KSTAT_DATA_UINT64 }, + { "size", KSTAT_DATA_UINT64 }, + { "compressed_size", KSTAT_DATA_UINT64 }, + { "uncompressed_size", KSTAT_DATA_UINT64 }, + { "overhead_size", KSTAT_DATA_UINT64 }, + { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "metadata_size", KSTAT_DATA_UINT64 }, + { "dbuf_size", KSTAT_DATA_UINT64 }, + { "dnode_size", KSTAT_DATA_UINT64 }, + { "bonus_size", KSTAT_DATA_UINT64 }, +#if defined(COMPAT_FREEBSD11) + { "other_size", KSTAT_DATA_UINT64 }, +#endif + { "anon_size", KSTAT_DATA_UINT64 }, + { 
"anon_evictable_data", KSTAT_DATA_UINT64 }, + { "anon_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mru_size", KSTAT_DATA_UINT64 }, + { "mru_evictable_data", KSTAT_DATA_UINT64 }, + { "mru_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mru_ghost_size", KSTAT_DATA_UINT64 }, + { "mru_ghost_evictable_data", KSTAT_DATA_UINT64 }, + { "mru_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mfu_size", KSTAT_DATA_UINT64 }, + { "mfu_evictable_data", KSTAT_DATA_UINT64 }, + { "mfu_evictable_metadata", KSTAT_DATA_UINT64 }, + { "mfu_ghost_size", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evictable_data", KSTAT_DATA_UINT64 }, + { "mfu_ghost_evictable_metadata", KSTAT_DATA_UINT64 }, + { "l2_hits", KSTAT_DATA_UINT64 }, + { "l2_misses", KSTAT_DATA_UINT64 }, + { "l2_feeds", KSTAT_DATA_UINT64 }, + { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, + { "l2_writes_sent", KSTAT_DATA_UINT64 }, + { "l2_writes_done", KSTAT_DATA_UINT64 }, + { "l2_writes_error", KSTAT_DATA_UINT64 }, + { "l2_writes_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_lock_retry", KSTAT_DATA_UINT64 }, + { "l2_evict_reading", KSTAT_DATA_UINT64 }, + { "l2_evict_l1cached", KSTAT_DATA_UINT64 }, + { "l2_free_on_write", KSTAT_DATA_UINT64 }, + { "l2_abort_lowmem", KSTAT_DATA_UINT64 }, + { "l2_cksum_bad", KSTAT_DATA_UINT64 }, + { "l2_io_error", KSTAT_DATA_UINT64 }, + { "l2_size", KSTAT_DATA_UINT64 }, + { "l2_asize", KSTAT_DATA_UINT64 }, + { "l2_hdr_size", KSTAT_DATA_UINT64 }, + { "l2_log_blk_writes", KSTAT_DATA_UINT64 }, + { "l2_log_blk_avg_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_asize", KSTAT_DATA_UINT64 }, + { "l2_log_blk_count", KSTAT_DATA_UINT64 }, + { "l2_data_to_meta_ratio", KSTAT_DATA_UINT64 }, + { "l2_rebuild_success", KSTAT_DATA_UINT64 }, + { "l2_rebuild_unsupported", KSTAT_DATA_UINT64 }, + { "l2_rebuild_io_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_dh_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_cksum_lb_errors", KSTAT_DATA_UINT64 }, + { "l2_rebuild_lowmem", KSTAT_DATA_UINT64 }, + { "l2_rebuild_size", KSTAT_DATA_UINT64 }, + { "l2_rebuild_asize", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs", KSTAT_DATA_UINT64 }, + { "l2_rebuild_bufs_precached", KSTAT_DATA_UINT64 }, + { "l2_rebuild_log_blks", KSTAT_DATA_UINT64 }, + { "memory_throttle_count", KSTAT_DATA_UINT64 }, + { "memory_direct_count", KSTAT_DATA_UINT64 }, + { "memory_indirect_count", KSTAT_DATA_UINT64 }, + { "memory_all_bytes", KSTAT_DATA_UINT64 }, + { "memory_free_bytes", KSTAT_DATA_UINT64 }, + { "memory_available_bytes", KSTAT_DATA_INT64 }, + { "arc_no_grow", KSTAT_DATA_UINT64 }, + { "arc_tempreserve", KSTAT_DATA_UINT64 }, + { "arc_loaned_bytes", KSTAT_DATA_UINT64 }, + { "arc_prune", KSTAT_DATA_UINT64 }, + { "arc_meta_used", KSTAT_DATA_UINT64 }, + { "arc_meta_limit", KSTAT_DATA_UINT64 }, + { "arc_dnode_limit", KSTAT_DATA_UINT64 }, + { "arc_meta_max", KSTAT_DATA_UINT64 }, + { "arc_meta_min", KSTAT_DATA_UINT64 }, + { "async_upgrade_sync", KSTAT_DATA_UINT64 }, + { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, + { "arc_need_free", KSTAT_DATA_UINT64 }, + { "arc_sys_free", KSTAT_DATA_UINT64 }, + { "arc_raw_size", KSTAT_DATA_UINT64 }, + { "cached_only_in_progress", KSTAT_DATA_UINT64 }, + { "abd_chunk_waste_size", KSTAT_DATA_UINT64 }, +}; + +#define ARCSTAT_MAX(stat, val) { \ + uint64_t m; \ + while ((val) > (m = arc_stats.stat.value.ui64) && \ + (m != atomic_cas_64(&arc_stats.stat.value.ui64, m, (val)))) \ + continue; \ +} + +#define 
ARCSTAT_MAXSTAT(stat) \ + ARCSTAT_MAX(stat##_max, arc_stats.stat.value.ui64) + +/* + * We define a macro to allow ARC hits/misses to be easily broken down by + * two separate conditions, giving a total of four different subtypes for + * each of hits and misses (so eight statistics total). + */ +#define ARCSTAT_CONDSTAT(cond1, stat1, notstat1, cond2, stat2, notstat2, stat) \ + if (cond1) { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##stat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##stat1##_##notstat2##_##stat); \ + } \ + } else { \ + if (cond2) { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##stat2##_##stat); \ + } else { \ + ARCSTAT_BUMP(arcstat_##notstat1##_##notstat2##_##stat);\ + } \ + } + +/* + * This macro allows us to use kstats as floating averages. Each time we + * update this kstat, we first factor it and the update value by + * ARCSTAT_AVG_FACTOR to shrink the new value's contribution to the overall + * average. This macro assumes that integer loads and stores are atomic, but + * is not safe for multiple writers updating the kstat in parallel (only the + * last writer's update will remain). + */ +#define ARCSTAT_F_AVG_FACTOR 3 +#define ARCSTAT_F_AVG(stat, value) \ + do { \ + uint64_t x = ARCSTAT(stat); \ + x = x - x / ARCSTAT_F_AVG_FACTOR + \ + (value) / ARCSTAT_F_AVG_FACTOR; \ + ARCSTAT(stat) = x; \ + _NOTE(CONSTCOND) \ + } while (0) + +kstat_t *arc_ksp; +static arc_state_t *arc_anon; +static arc_state_t *arc_mru_ghost; +static arc_state_t *arc_mfu_ghost; +static arc_state_t *arc_l2c_only; + +arc_state_t *arc_mru; +arc_state_t *arc_mfu; + +/* + * There are several ARC variables that are critical to export as kstats -- + * but we don't want to have to grovel around in the kstat whenever we wish to + * manipulate them. For these variables, we therefore define them to be in + * terms of the statistic variable. This assures that we are not introducing + * the possibility of inconsistency by having shadow copies of the variables, + * while still allowing the code to be readable. + */ +#define arc_tempreserve ARCSTAT(arcstat_tempreserve) +#define arc_loaned_bytes ARCSTAT(arcstat_loaned_bytes) +#define arc_meta_limit ARCSTAT(arcstat_meta_limit) /* max size for metadata */ +/* max size for dnodes */ +#define arc_dnode_size_limit ARCSTAT(arcstat_dnode_limit) +#define arc_meta_min ARCSTAT(arcstat_meta_min) /* min size for metadata */ +#define arc_meta_max ARCSTAT(arcstat_meta_max) /* max size of metadata */ +#define arc_need_free ARCSTAT(arcstat_need_free) /* waiting to be evicted */ + +/* size of all b_rabd's in entire arc */ +#define arc_raw_size ARCSTAT(arcstat_raw_size) +/* compressed size of entire arc */ +#define arc_compressed_size ARCSTAT(arcstat_compressed_size) +/* uncompressed size of entire arc */ +#define arc_uncompressed_size ARCSTAT(arcstat_uncompressed_size) +/* number of bytes in the arc from arc_buf_t's */ +#define arc_overhead_size ARCSTAT(arcstat_overhead_size) + +/* + * There are also some ARC variables that we want to export, but that are + * updated so often that having the canonical representation be the statistic + * variable causes a performance bottleneck. We want to use aggsum_t's for these + * instead, but still be able to export the kstat in the same way as before. + * The solution is to always use the aggsum version, except in the kstat update + * callback. 
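+ *
+ * (Editorial sketch, not upstream text -- the intended split,
+ * assuming the arc_size aggsum declared below and an arcstat_size
+ * kstat field:
+ *
+ *	hot path:	aggsum_add(&arc_size, space);
+ *	kstat update:	ARCSTAT(arcstat_size) = aggsum_value(&arc_size);
+ *
+ * so the expensive, bucket-flushing aggsum_value() runs only when
+ * the kstat is actually sampled.)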
+ */ +aggsum_t arc_size; +aggsum_t arc_meta_used; +aggsum_t astat_data_size; +aggsum_t astat_metadata_size; +aggsum_t astat_dbuf_size; +aggsum_t astat_dnode_size; +aggsum_t astat_bonus_size; +aggsum_t astat_hdr_size; +aggsum_t astat_l2_hdr_size; +aggsum_t astat_abd_chunk_waste_size; + +hrtime_t arc_growtime; +list_t arc_prune_list; +kmutex_t arc_prune_mtx; +taskq_t *arc_prune_taskq; + +#define GHOST_STATE(state) \ + ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ + (state) == arc_l2c_only) + +#define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_FLAG_IN_HASH_TABLE) +#define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) +#define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_PRESCIENT_PREFETCH(hdr) \ + ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) +#define HDR_COMPRESSION_ENABLED(hdr) \ + ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) + +#define HDR_L2CACHE(hdr) ((hdr)->b_flags & ARC_FLAG_L2CACHE) +#define HDR_L2_READING(hdr) \ + (((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) && \ + ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR)) +#define HDR_L2_WRITING(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITING) +#define HDR_L2_EVICTED(hdr) ((hdr)->b_flags & ARC_FLAG_L2_EVICTED) +#define HDR_L2_WRITE_HEAD(hdr) ((hdr)->b_flags & ARC_FLAG_L2_WRITE_HEAD) +#define HDR_PROTECTED(hdr) ((hdr)->b_flags & ARC_FLAG_PROTECTED) +#define HDR_NOAUTH(hdr) ((hdr)->b_flags & ARC_FLAG_NOAUTH) +#define HDR_SHARED_DATA(hdr) ((hdr)->b_flags & ARC_FLAG_SHARED_DATA) + +#define HDR_ISTYPE_METADATA(hdr) \ + ((hdr)->b_flags & ARC_FLAG_BUFC_METADATA) +#define HDR_ISTYPE_DATA(hdr) (!HDR_ISTYPE_METADATA(hdr)) + +#define HDR_HAS_L1HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L1HDR) +#define HDR_HAS_L2HDR(hdr) ((hdr)->b_flags & ARC_FLAG_HAS_L2HDR) +#define HDR_HAS_RABD(hdr) \ + (HDR_HAS_L1HDR(hdr) && HDR_PROTECTED(hdr) && \ + (hdr)->b_crypt_hdr.b_rabd != NULL) +#define HDR_ENCRYPTED(hdr) \ + (HDR_PROTECTED(hdr) && DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) +#define HDR_AUTHENTICATED(hdr) \ + (HDR_PROTECTED(hdr) && !DMU_OT_IS_ENCRYPTED((hdr)->b_crypt_hdr.b_ot)) + +/* For storing compression mode in b_flags */ +#define HDR_COMPRESS_OFFSET (highbit64(ARC_FLAG_COMPRESS_0) - 1) + +#define HDR_GET_COMPRESS(hdr) ((enum zio_compress)BF32_GET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS)) +#define HDR_SET_COMPRESS(hdr, cmp) BF32_SET((hdr)->b_flags, \ + HDR_COMPRESS_OFFSET, SPA_COMPRESSBITS, (cmp)); + +#define ARC_BUF_LAST(buf) ((buf)->b_next == NULL) +#define ARC_BUF_SHARED(buf) ((buf)->b_flags & ARC_BUF_FLAG_SHARED) +#define ARC_BUF_COMPRESSED(buf) ((buf)->b_flags & ARC_BUF_FLAG_COMPRESSED) +#define ARC_BUF_ENCRYPTED(buf) ((buf)->b_flags & ARC_BUF_FLAG_ENCRYPTED) + +/* + * Other sizes + */ + +#define HDR_FULL_CRYPT_SIZE ((int64_t)sizeof (arc_buf_hdr_t)) +#define HDR_FULL_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_crypt_hdr)) +#define HDR_L2ONLY_SIZE ((int64_t)offsetof(arc_buf_hdr_t, b_l1hdr)) + +/* + * Hash table routines + */ + +#define HT_LOCK_ALIGN 64 +#define HT_LOCK_PAD (P2NPHASE(sizeof (kmutex_t), (HT_LOCK_ALIGN))) + +struct ht_lock { + kmutex_t ht_lock; +#ifdef _KERNEL + unsigned char pad[HT_LOCK_PAD]; +#endif +}; + +#define BUF_LOCKS 8192 +typedef struct buf_hash_table { + uint64_t ht_mask; + arc_buf_hdr_t **ht_table; + struct ht_lock ht_locks[BUF_LOCKS]; +} buf_hash_table_t; + +static buf_hash_table_t buf_hash_table; + +#define BUF_HASH_INDEX(spa, dva, birth) \ + (buf_hash(spa, dva, birth) & buf_hash_table.ht_mask) +#define 
BUF_HASH_LOCK_NTRY(idx) (buf_hash_table.ht_locks[idx & (BUF_LOCKS-1)]) +#define BUF_HASH_LOCK(idx) (&(BUF_HASH_LOCK_NTRY(idx).ht_lock)) +#define HDR_LOCK(hdr) \ + (BUF_HASH_LOCK(BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth))) + +uint64_t zfs_crc64_table[256]; + +/* + * Level 2 ARC + */ + +#define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ +#define L2ARC_HEADROOM 2 /* num of writes */ + +/* + * If we discover during ARC scan any buffers to be compressed, we boost + * our headroom for the next scanning cycle by this percentage multiple. + */ +#define L2ARC_HEADROOM_BOOST 200 +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ + +/* + * We can feed L2ARC from two states of ARC buffers, mru and mfu, + * and each of these states has two types: data and metadata. + */ +#define L2ARC_FEED_TYPES 4 + +#define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) +#define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) + +/* L2ARC Performance Tunables */ +unsigned long l2arc_write_max = L2ARC_WRITE_SIZE; /* def max write size */ +unsigned long l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra warmup write */ +unsigned long l2arc_headroom = L2ARC_HEADROOM; /* # of dev writes */ +unsigned long l2arc_headroom_boost = L2ARC_HEADROOM_BOOST; +unsigned long l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +unsigned long l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval msecs */ +int l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +int l2arc_feed_again = B_TRUE; /* turbo warmup */ +int l2arc_norw = B_FALSE; /* no reads during writes */ + +/* + * L2ARC Internals + */ +static list_t L2ARC_dev_list; /* device list */ +static list_t *l2arc_dev_list; /* device list pointer */ +static kmutex_t l2arc_dev_mtx; /* device list mutex */ +static l2arc_dev_t *l2arc_dev_last; /* last device used */ +static list_t L2ARC_free_on_write; /* free after write buf list */ +static list_t *l2arc_free_on_write; /* free after write list ptr */ +static kmutex_t l2arc_free_on_write_mtx; /* mutex for list */ +static uint64_t l2arc_ndev; /* number of devices */ + +typedef struct l2arc_read_callback { + arc_buf_hdr_t *l2rcb_hdr; /* read header */ + blkptr_t l2rcb_bp; /* original blkptr */ + zbookmark_phys_t l2rcb_zb; /* original bookmark */ + int l2rcb_flags; /* original flags */ + abd_t *l2rcb_abd; /* temporary buffer */ +} l2arc_read_callback_t; + +typedef struct l2arc_data_free { + /* protected by l2arc_free_on_write_mtx */ + abd_t *l2df_abd; + size_t l2df_size; + arc_buf_contents_t l2df_type; + list_node_t l2df_list_node; +} l2arc_data_free_t; + +typedef enum arc_fill_flags { + ARC_FILL_LOCKED = 1 << 0, /* hdr lock is held */ + ARC_FILL_COMPRESSED = 1 << 1, /* fill with compressed data */ + ARC_FILL_ENCRYPTED = 1 << 2, /* fill with encrypted data */ + ARC_FILL_NOAUTH = 1 << 3, /* don't attempt to authenticate */ + ARC_FILL_IN_PLACE = 1 << 4 /* fill in place (special case) */ +} arc_fill_flags_t; + +static kmutex_t l2arc_feed_thr_lock; +static kcondvar_t l2arc_feed_thr_cv; +static uint8_t l2arc_thread_exit; + +static kmutex_t l2arc_rebuild_thr_lock; +static kcondvar_t l2arc_rebuild_thr_cv; + +enum arc_hdr_alloc_flags { + ARC_HDR_ALLOC_RDATA = 0x1, + ARC_HDR_DO_ADAPT = 0x2, +}; + + +static abd_t *arc_get_data_abd(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static void *arc_get_data_buf(arc_buf_hdr_t *, uint64_t, void *); +static void arc_get_data_impl(arc_buf_hdr_t *, uint64_t, void *, boolean_t); +static void 
arc_free_data_abd(arc_buf_hdr_t *, abd_t *, uint64_t, void *); +static void arc_free_data_buf(arc_buf_hdr_t *, void *, uint64_t, void *); +static void arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag); +static void arc_hdr_free_abd(arc_buf_hdr_t *, boolean_t); +static void arc_hdr_alloc_abd(arc_buf_hdr_t *, int); +static void arc_access(arc_buf_hdr_t *, kmutex_t *); +static void arc_buf_watch(arc_buf_t *); + +static arc_buf_contents_t arc_buf_type(arc_buf_hdr_t *); +static uint32_t arc_bufc_to_flags(arc_buf_contents_t); +static inline void arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); +static inline void arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags); + +static boolean_t l2arc_write_eligible(uint64_t, arc_buf_hdr_t *); +static void l2arc_read_done(zio_t *); +static void l2arc_do_free_on_write(void); + +/* + * L2ARC TRIM + * l2arc_trim_ahead : A ZFS module parameter that controls how much ahead of + * the current write size (l2arc_write_max) we should TRIM if we + * have filled the device. It is defined as a percentage of the + * write size. If set to 100 we trim twice the space required to + * accommodate upcoming writes. A minimum of 64MB will be trimmed. + * It also enables TRIM of the whole L2ARC device upon creation or + * addition to an existing pool or if the header of the device is + * invalid upon importing a pool or onlining a cache device. The + * default is 0, which disables TRIM on L2ARC altogether as it can + * put significant stress on the underlying storage devices. This + * will vary depending on how well the specific device handles + * these commands. + */ +unsigned long l2arc_trim_ahead = 0; + +/* + * Performance tuning of L2ARC persistence: + * + * l2arc_rebuild_enabled : A ZFS module parameter that controls whether adding + * an L2ARC device (either at pool import or later) will attempt + * to rebuild L2ARC buffer contents. + * l2arc_rebuild_blocks_min_l2size : A ZFS module parameter that controls + * whether log blocks are written to the L2ARC device. If the L2ARC + * device is less than 1GB, the amount of data l2arc_evict() + * evicts is significant compared to the amount of restored L2ARC + * data. In this case do not write log blocks in L2ARC in order + * not to waste space. + */ +int l2arc_rebuild_enabled = B_TRUE; +unsigned long l2arc_rebuild_blocks_min_l2size = 1024 * 1024 * 1024; + +/* L2ARC persistence rebuild control routines. */ +void l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen); +static void l2arc_dev_rebuild_thread(void *arg); +static int l2arc_rebuild(l2arc_dev_t *dev); + +/* L2ARC persistence read I/O routines. */ +static int l2arc_dev_hdr_read(l2arc_dev_t *dev); +static int l2arc_log_blk_read(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *this_lp, const l2arc_log_blkptr_t *next_lp, + l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb, + zio_t *this_io, zio_t **next_io); +static zio_t *l2arc_log_blk_fetch(vdev_t *vd, + const l2arc_log_blkptr_t *lp, l2arc_log_blk_phys_t *lb); +static void l2arc_log_blk_fetch_abort(zio_t *zio); + +/* L2ARC persistence block restoration routines. */ +static void l2arc_log_blk_restore(l2arc_dev_t *dev, + const l2arc_log_blk_phys_t *lb, uint64_t lb_asize, uint64_t lb_daddr); +static void l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, + l2arc_dev_t *dev); + +/* L2ARC persistence write I/O routines. */ +static void l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, + l2arc_write_callback_t *cb); + +/* L2ARC persistence auxiliary routines. 
*/ +boolean_t l2arc_log_blkptr_valid(l2arc_dev_t *dev, + const l2arc_log_blkptr_t *lbp); +static boolean_t l2arc_log_blk_insert(l2arc_dev_t *dev, + const arc_buf_hdr_t *ab); +boolean_t l2arc_range_check_overlap(uint64_t bottom, + uint64_t top, uint64_t check); +static void l2arc_blk_fetch_done(zio_t *zio); +static inline uint64_t + l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev); + +/* + * We use Cityhash for this. It's fast, and has good hash properties without + * requiring any large static buffers. + */ +static uint64_t +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) +{ + return (cityhash4(spa, dva->dva_word[0], dva->dva_word[1], birth)); +} + +#define HDR_EMPTY(hdr) \ + ((hdr)->b_dva.dva_word[0] == 0 && \ + (hdr)->b_dva.dva_word[1] == 0) + +#define HDR_EMPTY_OR_LOCKED(hdr) \ + (HDR_EMPTY(hdr) || MUTEX_HELD(HDR_LOCK(hdr))) + +#define HDR_EQUAL(spa, dva, birth, hdr) \ + ((hdr)->b_dva.dva_word[0] == (dva)->dva_word[0]) && \ + ((hdr)->b_dva.dva_word[1] == (dva)->dva_word[1]) && \ + ((hdr)->b_birth == birth) && ((hdr)->b_spa == spa) + +static void +buf_discard_identity(arc_buf_hdr_t *hdr) +{ + hdr->b_dva.dva_word[0] = 0; + hdr->b_dva.dva_word[1] = 0; + hdr->b_birth = 0; +} + +static arc_buf_hdr_t * +buf_hash_find(uint64_t spa, const blkptr_t *bp, kmutex_t **lockp) +{ + const dva_t *dva = BP_IDENTITY(bp); + uint64_t birth = BP_PHYSICAL_BIRTH(bp); + uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *hdr; + + mutex_enter(hash_lock); + for (hdr = buf_hash_table.ht_table[idx]; hdr != NULL; + hdr = hdr->b_hash_next) { + if (HDR_EQUAL(spa, dva, birth, hdr)) { + *lockp = hash_lock; + return (hdr); + } + } + mutex_exit(hash_lock); + *lockp = NULL; + return (NULL); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to hdr in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. + * If lockp == NULL, the caller is assumed to already hold the hash lock. 
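+ * + * A sketch of the typical calling pattern (hypothetical caller): + * + *	kmutex_t *hash_lock = NULL; + *	arc_buf_hdr_t *exists = buf_hash_insert(hdr, &hash_lock); + *	if (exists != NULL) + *		(another thread won the insert race; use exists instead) + *	mutex_exit(hash_lock);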
+ */ +static arc_buf_hdr_t * +buf_hash_insert(arc_buf_hdr_t *hdr, kmutex_t **lockp) +{ + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); + kmutex_t *hash_lock = BUF_HASH_LOCK(idx); + arc_buf_hdr_t *fhdr; + uint32_t i; + + ASSERT(!DVA_IS_EMPTY(&hdr->b_dva)); + ASSERT(hdr->b_birth != 0); + ASSERT(!HDR_IN_HASH_TABLE(hdr)); + + if (lockp != NULL) { + *lockp = hash_lock; + mutex_enter(hash_lock); + } else { + ASSERT(MUTEX_HELD(hash_lock)); + } + + for (fhdr = buf_hash_table.ht_table[idx], i = 0; fhdr != NULL; + fhdr = fhdr->b_hash_next, i++) { + if (HDR_EQUAL(hdr->b_spa, &hdr->b_dva, hdr->b_birth, fhdr)) + return (fhdr); + } + + hdr->b_hash_next = buf_hash_table.ht_table[idx]; + buf_hash_table.ht_table[idx] = hdr; + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + + /* collect some hash table performance data */ + if (i > 0) { + ARCSTAT_BUMP(arcstat_hash_collisions); + if (i == 1) + ARCSTAT_BUMP(arcstat_hash_chains); + + ARCSTAT_MAX(arcstat_hash_chain_max, i); + } + + ARCSTAT_BUMP(arcstat_hash_elements); + ARCSTAT_MAXSTAT(arcstat_hash_elements); + + return (NULL); +} + +static void +buf_hash_remove(arc_buf_hdr_t *hdr) +{ + arc_buf_hdr_t *fhdr, **hdrp; + uint64_t idx = BUF_HASH_INDEX(hdr->b_spa, &hdr->b_dva, hdr->b_birth); + + ASSERT(MUTEX_HELD(BUF_HASH_LOCK(idx))); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + + hdrp = &buf_hash_table.ht_table[idx]; + while ((fhdr = *hdrp) != hdr) { + ASSERT3P(fhdr, !=, NULL); + hdrp = &fhdr->b_hash_next; + } + *hdrp = hdr->b_hash_next; + hdr->b_hash_next = NULL; + arc_hdr_clear_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + + /* collect some hash table performance data */ + ARCSTAT_BUMPDOWN(arcstat_hash_elements); + + if (buf_hash_table.ht_table[idx] && + buf_hash_table.ht_table[idx]->b_hash_next == NULL) + ARCSTAT_BUMPDOWN(arcstat_hash_chains); +} + +/* + * Global data structures and functions for the buf kmem cache. + */ + +static kmem_cache_t *hdr_full_cache; +static kmem_cache_t *hdr_full_crypt_cache; +static kmem_cache_t *hdr_l2only_cache; +static kmem_cache_t *buf_cache; + +static void +buf_fini(void) +{ + int i; + +#if defined(_KERNEL) + /* + * Large allocations which do not require contiguous pages + * should be using vmem_free() in the linux kernel + */ + vmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); +#else + kmem_free(buf_hash_table.ht_table, + (buf_hash_table.ht_mask + 1) * sizeof (void *)); +#endif + for (i = 0; i < BUF_LOCKS; i++) + mutex_destroy(&buf_hash_table.ht_locks[i].ht_lock); + kmem_cache_destroy(hdr_full_cache); + kmem_cache_destroy(hdr_full_crypt_cache); + kmem_cache_destroy(hdr_l2only_cache); + kmem_cache_destroy(buf_cache); +} + +/* + * Constructor callback - called when the cache is empty + * and a new buf is requested. 
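+ * These constructors, with the matching destructors below, are + * registered with their caches via kmem_cache_create() in buf_init().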
+ */ +/* ARGSUSED */ +static int +hdr_full_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_FULL_SIZE); + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + cv_init(&hdr->b_l1hdr.b_cv, NULL, CV_DEFAULT, NULL); + zfs_refcount_create(&hdr->b_l1hdr.b_refcnt); + mutex_init(&hdr->b_l1hdr.b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + list_link_init(&hdr->b_l1hdr.b_arc_node); + list_link_init(&hdr->b_l2hdr.b_l2node); + multilist_link_init(&hdr->b_l1hdr.b_arc_node); + arc_space_consume(HDR_FULL_SIZE, ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_full_crypt_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + hdr_full_cons(vbuf, unused, kmflag); + bzero(&hdr->b_crypt_hdr, sizeof (hdr->b_crypt_hdr)); + arc_space_consume(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +hdr_l2only_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_hdr_t *hdr = vbuf; + + bzero(hdr, HDR_L2ONLY_SIZE); + arc_space_consume(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); + + return (0); +} + +/* ARGSUSED */ +static int +buf_cons(void *vbuf, void *unused, int kmflag) +{ + arc_buf_t *buf = vbuf; + + bzero(buf, sizeof (arc_buf_t)); + mutex_init(&buf->b_evict_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + + return (0); +} + +/* + * Destructor callback - called when a cached buf is + * no longer required. + */ +/* ARGSUSED */ +static void +hdr_full_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + ASSERT(HDR_EMPTY(hdr)); + cv_destroy(&hdr->b_l1hdr.b_cv); + zfs_refcount_destroy(&hdr->b_l1hdr.b_refcnt); + mutex_destroy(&hdr->b_l1hdr.b_freeze_lock); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + arc_space_return(HDR_FULL_SIZE, ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_full_crypt_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr = vbuf; + + hdr_full_dest(vbuf, unused); + arc_space_return(sizeof (hdr->b_crypt_hdr), ARC_SPACE_HDRS); +} + +/* ARGSUSED */ +static void +hdr_l2only_dest(void *vbuf, void *unused) +{ + arc_buf_hdr_t *hdr __maybe_unused = vbuf; + + ASSERT(HDR_EMPTY(hdr)); + arc_space_return(HDR_L2ONLY_SIZE, ARC_SPACE_L2HDRS); +} + +/* ARGSUSED */ +static void +buf_dest(void *vbuf, void *unused) +{ + arc_buf_t *buf = vbuf; + + mutex_destroy(&buf->b_evict_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); +} + +static void +buf_init(void) +{ + uint64_t *ct = NULL; + uint64_t hsize = 1ULL << 12; + int i, j; + + /* + * The hash table is big enough to fill all of physical memory + * with an average block size of zfs_arc_average_blocksize (default 8K). + * By default, the table will take up + * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). 
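+ * + * For example, with 16GB of physical memory and the default 8K average + * block size, the loop below doubles hsize until hsize * 8K >= 16GB, + * giving 2^21 buckets and a 16MB table of 8-byte pointers.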
+ */ + while (hsize * zfs_arc_average_blocksize < arc_all_memory()) + hsize <<= 1; +retry: + buf_hash_table.ht_mask = hsize - 1; +#if defined(_KERNEL) + /* + * Large allocations which do not require contiguous pages + * should be using vmem_alloc() in the linux kernel + */ + buf_hash_table.ht_table = + vmem_zalloc(hsize * sizeof (void*), KM_SLEEP); +#else + buf_hash_table.ht_table = + kmem_zalloc(hsize * sizeof (void*), KM_NOSLEEP); +#endif + if (buf_hash_table.ht_table == NULL) { + ASSERT(hsize > (1ULL << 8)); + hsize >>= 1; + goto retry; + } + + hdr_full_cache = kmem_cache_create("arc_buf_hdr_t_full", HDR_FULL_SIZE, + 0, hdr_full_cons, hdr_full_dest, NULL, NULL, NULL, 0); + hdr_full_crypt_cache = kmem_cache_create("arc_buf_hdr_t_full_crypt", + HDR_FULL_CRYPT_SIZE, 0, hdr_full_crypt_cons, hdr_full_crypt_dest, + NULL, NULL, NULL, 0); + hdr_l2only_cache = kmem_cache_create("arc_buf_hdr_t_l2only", + HDR_L2ONLY_SIZE, 0, hdr_l2only_cons, hdr_l2only_dest, NULL, + NULL, NULL, 0); + buf_cache = kmem_cache_create("arc_buf_t", sizeof (arc_buf_t), + 0, buf_cons, buf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < 256; i++) + for (ct = zfs_crc64_table + i, *ct = i, j = 8; j > 0; j--) + *ct = (*ct >> 1) ^ (-(*ct & 1) & ZFS_CRC64_POLY); + + for (i = 0; i < BUF_LOCKS; i++) { + mutex_init(&buf_hash_table.ht_locks[i].ht_lock, + NULL, MUTEX_DEFAULT, NULL); + } +} + +#define ARC_MINTIME (hz>>4) /* 62 ms */ + +/* + * This is the size that the buf occupies in memory. If the buf is compressed, + * it will correspond to the compressed size. You should use this method of + * getting the buf size unless you explicitly need the logical size. + */ +uint64_t +arc_buf_size(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_PSIZE(buf->b_hdr) : HDR_GET_LSIZE(buf->b_hdr)); +} + +uint64_t +arc_buf_lsize(arc_buf_t *buf) +{ + return (HDR_GET_LSIZE(buf->b_hdr)); +} + +/* + * This function will return B_TRUE if the buffer is encrypted in memory. + * This buffer can be decrypted by calling arc_untransform(). + */ +boolean_t +arc_is_encrypted(arc_buf_t *buf) +{ + return (ARC_BUF_ENCRYPTED(buf) != 0); +} + +/* + * Returns B_TRUE if the buffer represents data that has not had its MAC + * verified yet. + */ +boolean_t +arc_is_unauthenticated(arc_buf_t *buf) +{ + return (HDR_NOAUTH(buf->b_hdr) != 0); +} + +void +arc_get_raw_params(arc_buf_t *buf, boolean_t *byteorder, uint8_t *salt, + uint8_t *iv, uint8_t *mac) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(HDR_PROTECTED(hdr)); + + bcopy(hdr->b_crypt_hdr.b_salt, salt, ZIO_DATA_SALT_LEN); + bcopy(hdr->b_crypt_hdr.b_iv, iv, ZIO_DATA_IV_LEN); + bcopy(hdr->b_crypt_hdr.b_mac, mac, ZIO_DATA_MAC_LEN); + *byteorder = (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? + ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; +} + +/* + * Indicates how this buffer is compressed in memory. If it is not compressed + * the value will be ZIO_COMPRESS_OFF. It can be made normally readable with + * arc_untransform() as long as it is also unencrypted. + */ +enum zio_compress +arc_get_compression(arc_buf_t *buf) +{ + return (ARC_BUF_COMPRESSED(buf) ? + HDR_GET_COMPRESS(buf->b_hdr) : ZIO_COMPRESS_OFF); +} + +/* + * Return the compression algorithm used to store this data in the ARC. If ARC + * compression is enabled or this is an encrypted block, this will be the same + * as what's used to store it on-disk. Otherwise, this will be ZIO_COMPRESS_OFF. + */ +static inline enum zio_compress +arc_hdr_get_compress(arc_buf_hdr_t *hdr) +{ + return (HDR_COMPRESSION_ENABLED(hdr) ? 
+ HDR_GET_COMPRESS(hdr) : ZIO_COMPRESS_OFF); +} + +uint8_t +arc_get_complevel(arc_buf_t *buf) +{ + return (buf->b_hdr->b_complevel); +} + +static inline boolean_t +arc_buf_is_shared(arc_buf_t *buf) +{ + boolean_t shared = (buf->b_data != NULL && + buf->b_hdr->b_l1hdr.b_pabd != NULL && + abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) && + buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd)); + IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr)); + IMPLY(shared, ARC_BUF_SHARED(buf)); + IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf)); + + /* + * It would be nice to assert arc_can_share() too, but the "hdr isn't + * already being shared" requirement prevents us from doing that. + */ + + return (shared); +} + +/* + * Free the checksum associated with this header. If there is no checksum, this + * is a no-op. + */ +static inline void +arc_cksum_free(arc_buf_hdr_t *hdr) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL) { + kmem_free(hdr->b_l1hdr.b_freeze_cksum, sizeof (zio_cksum_t)); + hdr->b_l1hdr.b_freeze_cksum = NULL; + } + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} + +/* + * Return true iff at least one of the bufs on hdr is not compressed. + * Encrypted buffers count as compressed. + */ +static boolean_t +arc_hdr_has_uncompressed_buf(arc_buf_hdr_t *hdr) +{ + ASSERT(hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY_OR_LOCKED(hdr)); + + for (arc_buf_t *b = hdr->b_l1hdr.b_buf; b != NULL; b = b->b_next) { + if (!ARC_BUF_COMPRESSED(b)) { + return (B_TRUE); + } + } + return (B_FALSE); +} + + +/* + * If we've turned on the ZFS_DEBUG_MODIFY flag, verify that the buf's data + * matches the checksum that is stored in the hdr. If there is no checksum, + * or if the buf is compressed, this is a no-op. + */ +static void +arc_cksum_verify(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + zio_cksum_t zc; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + if (ARC_BUF_COMPRESSED(buf)) + return; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&hdr->b_l1hdr.b_freeze_lock); + + if (hdr->b_l1hdr.b_freeze_cksum == NULL || HDR_IO_ERROR(hdr)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } + + fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, &zc); + if (!ZIO_CHECKSUM_EQUAL(*hdr->b_l1hdr.b_freeze_cksum, zc)) + panic("buffer modified while frozen!"); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); +} + +/* + * This function makes the assumption that data stored in the L2ARC + * will be transformed exactly as it is in the main pool. Because of + * this we can verify the checksum against the reading process's bp. + */ +static boolean_t +arc_cksum_is_equal(arc_buf_hdr_t *hdr, zio_t *zio) +{ + ASSERT(!BP_IS_EMBEDDED(zio->io_bp)); + VERIFY3U(BP_GET_PSIZE(zio->io_bp), ==, HDR_GET_PSIZE(hdr)); + + /* + * Block pointers always store the checksum for the logical data. + * If the block pointer has the gang bit set, then the checksum + * it represents is for the reconstituted data and not for an + * individual gang member. The zio pipeline, however, must be able to + * determine the checksum of each of the gang constituents so it + * treats the checksum comparison differently than what we need + * for l2arc blocks. This prevents us from using the + * zio_checksum_error() interface directly. Instead we must call the + * zio_checksum_error_impl() so that we can ensure the checksum is + * generated using the correct checksum algorithm and accounts for the + * logical I/O size and not just a gang fragment. 
+ */ + return (zio_checksum_error_impl(zio->io_spa, zio->io_bp, + BP_GET_CHECKSUM(zio->io_bp), zio->io_abd, zio->io_size, + zio->io_offset, NULL) == 0); +} + +/* + * Given a buf full of data, if ZFS_DEBUG_MODIFY is enabled this computes a + * checksum and attaches it to the buf's hdr so that we can ensure that the buf + * isn't modified later on. If buf is compressed or there is already a checksum + * on the hdr, this is a no-op (we only checksum uncompressed bufs). + */ +static void +arc_cksum_compute(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + ASSERT(HDR_HAS_L1HDR(hdr)); + + mutex_enter(&buf->b_hdr->b_l1hdr.b_freeze_lock); + if (hdr->b_l1hdr.b_freeze_cksum != NULL || ARC_BUF_COMPRESSED(buf)) { + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + return; + } + + ASSERT(!ARC_BUF_ENCRYPTED(buf)); + ASSERT(!ARC_BUF_COMPRESSED(buf)); + hdr->b_l1hdr.b_freeze_cksum = kmem_alloc(sizeof (zio_cksum_t), + KM_SLEEP); + fletcher_2_native(buf->b_data, arc_buf_size(buf), NULL, + hdr->b_l1hdr.b_freeze_cksum); + mutex_exit(&hdr->b_l1hdr.b_freeze_lock); + arc_buf_watch(buf); +} + +#ifndef _KERNEL +void +arc_buf_sigsegv(int sig, siginfo_t *si, void *unused) +{ + panic("Got SIGSEGV at address: 0x%lx\n", (long)si->si_addr); +} +#endif + +/* ARGSUSED */ +static void +arc_buf_unwatch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) { + ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), + PROT_READ | PROT_WRITE)); + } +#endif +} + +/* ARGSUSED */ +static void +arc_buf_watch(arc_buf_t *buf) +{ +#ifndef _KERNEL + if (arc_watch) + ASSERT0(mprotect(buf->b_data, arc_buf_size(buf), + PROT_READ)); +#endif +} + +static arc_buf_contents_t +arc_buf_type(arc_buf_hdr_t *hdr) +{ + arc_buf_contents_t type; + if (HDR_ISTYPE_METADATA(hdr)) { + type = ARC_BUFC_METADATA; + } else { + type = ARC_BUFC_DATA; + } + VERIFY3U(hdr->b_type, ==, type); + return (type); +} + +boolean_t +arc_is_metadata(arc_buf_t *buf) +{ + return (HDR_ISTYPE_METADATA(buf->b_hdr) != 0); +} + +static uint32_t +arc_bufc_to_flags(arc_buf_contents_t type) +{ + switch (type) { + case ARC_BUFC_DATA: + /* metadata field is 0 if buffer contains normal data */ + return (0); + case ARC_BUFC_METADATA: + return (ARC_FLAG_BUFC_METADATA); + default: + break; + } + panic("undefined ARC buffer type!"); + return ((uint32_t)-1); +} + +void +arc_buf_thaw(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + + arc_cksum_verify(buf); + + /* + * Compressed buffers do not manipulate the b_freeze_cksum. + */ + if (ARC_BUF_COMPRESSED(buf)) + return; + + ASSERT(HDR_HAS_L1HDR(hdr)); + arc_cksum_free(hdr); + arc_buf_unwatch(buf); +} + +void +arc_buf_freeze(arc_buf_t *buf) +{ + if (!(zfs_flags & ZFS_DEBUG_MODIFY)) + return; + + if (ARC_BUF_COMPRESSED(buf)) + return; + + ASSERT(HDR_HAS_L1HDR(buf->b_hdr)); + arc_cksum_compute(buf); +} + +/* + * The arc_buf_hdr_t's b_flags should never be modified directly. Instead, + * the following functions should be used to ensure that the flags are + * updated in a thread-safe way. When manipulating the flags either + * the hash_lock must be held or the hdr must be undiscoverable. This + * ensures that we're not racing with any other threads when updating + * the flags. 
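+ * + * A sketch of the expected pattern (hypothetical caller): + * + *	mutex_enter(hash_lock);		(or the hdr is undiscoverable) + *	arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + *	mutex_exit(hash_lock);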
+ */ +static inline void +arc_hdr_set_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + hdr->b_flags |= flags; +} + +static inline void +arc_hdr_clear_flags(arc_buf_hdr_t *hdr, arc_flags_t flags) +{ + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + hdr->b_flags &= ~flags; +} + +/* + * Setting the compression bits in the arc_buf_hdr_t's b_flags is + * done in a special way since we have to clear and set bits + * at the same time. Consumers that wish to set the compression bits + * must use this function to ensure that the flags are updated in + * a thread-safe manner. + */ +static void +arc_hdr_set_compress(arc_buf_hdr_t *hdr, enum zio_compress cmp) +{ + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + + /* + * Holes and embedded blocks will always have a psize = 0 so + * we ignore the compression of the blkptr and mark them + * as uncompressed. + */ + if (!zfs_compressed_arc_enabled || HDR_GET_PSIZE(hdr) == 0) { + arc_hdr_clear_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + ASSERT(!HDR_COMPRESSION_ENABLED(hdr)); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_COMPRESSED_ARC); + ASSERT(HDR_COMPRESSION_ENABLED(hdr)); + } + + HDR_SET_COMPRESS(hdr, cmp); + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, cmp); +} + +/* + * Looks for another buf on the same hdr which has the data decompressed, copies + * from it, and returns true. If no such buf exists, returns false. + */ +static boolean_t +arc_buf_try_copy_decompressed_data(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + boolean_t copied = B_FALSE; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(buf->b_data, !=, NULL); + ASSERT(!ARC_BUF_COMPRESSED(buf)); + + for (arc_buf_t *from = hdr->b_l1hdr.b_buf; from != NULL; + from = from->b_next) { + /* can't use our own data buffer */ + if (from == buf) { + continue; + } + + if (!ARC_BUF_COMPRESSED(from)) { + bcopy(from->b_data, buf->b_data, arc_buf_size(buf)); + copied = B_TRUE; + break; + } + } + + /* + * There were no decompressed bufs, so there should not be a + * checksum on the hdr either. + */ + if (zfs_flags & ZFS_DEBUG_MODIFY) + EQUIV(!copied, hdr->b_l1hdr.b_freeze_cksum == NULL); + + return (copied); +} + +/* + * Allocates an ARC buf header that's in an evicted & L2-cached state. + * This is used during l2arc reconstruction to make empty ARC buffers + * which circumvent the regular disk->arc->l2arc path and instead come + * into being in the reverse order, i.e. l2arc->arc. + */ +static arc_buf_hdr_t * +arc_buf_alloc_l2only(size_t size, arc_buf_contents_t type, l2arc_dev_t *dev, + dva_t dva, uint64_t daddr, int32_t psize, uint64_t birth, + enum zio_compress compress, uint8_t complevel, boolean_t protected, + boolean_t prefetch) +{ + arc_buf_hdr_t *hdr; + + ASSERT(size != 0); + hdr = kmem_cache_alloc(hdr_l2only_cache, KM_SLEEP); + hdr->b_birth = birth; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L2HDR); + HDR_SET_LSIZE(hdr, size); + HDR_SET_PSIZE(hdr, psize); + arc_hdr_set_compress(hdr, compress); + hdr->b_complevel = complevel; + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + if (prefetch) + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + hdr->b_spa = spa_load_guid(dev->l2ad_vdev->vdev_spa); + + hdr->b_dva = dva; + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_daddr = daddr; + + return (hdr); +} + +/* + * Return the size of the block, b_pabd, that is stored in the arc_buf_hdr_t. 
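+ * This is HDR_GET_PSIZE() when the hdr holds compressed data and + * HDR_GET_LSIZE() otherwise, mirroring the checks in the body below.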
+ */ +static uint64_t +arc_hdr_size(arc_buf_hdr_t *hdr) +{ + uint64_t size; + + if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && + HDR_GET_PSIZE(hdr) > 0) { + size = HDR_GET_PSIZE(hdr); + } else { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, 0); + size = HDR_GET_LSIZE(hdr); + } + return (size); +} + +static int +arc_hdr_authenticate(arc_buf_hdr_t *hdr, spa_t *spa, uint64_t dsobj) +{ + int ret; + uint64_t csize; + uint64_t lsize = HDR_GET_LSIZE(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + void *tmpbuf = NULL; + abd_t *abd = hdr->b_l1hdr.b_pabd; + + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + ASSERT(HDR_AUTHENTICATED(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + /* + * The MAC is calculated on the compressed data that is stored on disk. + * However, if compressed arc is disabled we will only have the + * decompressed data available to us now. Compress it into a temporary + * abd so we can verify the MAC. The performance overhead of this will + * be relatively low, since most objects in an encrypted objset will + * be encrypted (instead of authenticated) anyway. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + tmpbuf = zio_buf_alloc(lsize); + abd = abd_get_from_buf(tmpbuf, lsize); + abd_take_ownership_of_buf(abd, B_TRUE); + csize = zio_compress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmpbuf, lsize, hdr->b_complevel); + ASSERT3U(csize, <=, psize); + abd_zero_off(abd, csize, psize - csize); + } + + /* + * Authentication is best effort. We authenticate whenever the key is + * available. If we succeed we clear ARC_FLAG_NOAUTH. + */ + if (hdr->b_crypt_hdr.b_ot == DMU_OT_OBJSET) { + ASSERT3U(HDR_GET_COMPRESS(hdr), ==, ZIO_COMPRESS_OFF); + ASSERT3U(lsize, ==, psize); + ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, dsobj, abd, + psize, hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + } else { + ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, abd, psize, + hdr->b_crypt_hdr.b_mac); + } + + if (ret == 0) + arc_hdr_clear_flags(hdr, ARC_FLAG_NOAUTH); + else if (ret != ENOENT) + goto error; + + if (tmpbuf != NULL) + abd_free(abd); + + return (0); + +error: + if (tmpbuf != NULL) + abd_free(abd); + + return (ret); +} + +/* + * This function will take a header that only has raw encrypted data in + * b_crypt_hdr.b_rabd and decrypt it into a new buffer which is stored in + * b_l1hdr.b_pabd. If designated in the header flags, this function will + * also decompress the data. + */ +static int +arc_hdr_decrypt(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb) +{ + int ret; + abd_t *cabd = NULL; + void *tmp = NULL; + boolean_t no_crypt = B_FALSE; + boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + ASSERT(HDR_ENCRYPTED(hdr)); + + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + + ret = spa_do_crypt_abd(B_FALSE, spa, zb, hdr->b_crypt_hdr.b_ot, + B_FALSE, bswap, hdr->b_crypt_hdr.b_salt, hdr->b_crypt_hdr.b_iv, + hdr->b_crypt_hdr.b_mac, HDR_GET_PSIZE(hdr), hdr->b_l1hdr.b_pabd, + hdr->b_crypt_hdr.b_rabd, &no_crypt); + if (ret != 0) + goto error; + + if (no_crypt) { + abd_copy(hdr->b_l1hdr.b_pabd, hdr->b_crypt_hdr.b_rabd, + HDR_GET_PSIZE(hdr)); + } + + /* + * If this header has disabled arc compression but the b_pabd is + * compressed after decrypting it, we need to decompress the newly + * decrypted data. 
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + /* + * We want to make sure that we are correctly honoring the + * zfs_abd_scatter_enabled setting, so we allocate an abd here + * and then loan a buffer from it, rather than allocating a + * linear buffer and wrapping it in an abd later. + */ + cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, B_TRUE); + tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); + + ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr), &hdr->b_complevel); + if (ret != 0) { + abd_return_buf(cabd, tmp, arc_hdr_size(hdr)); + goto error; + } + + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = cabd; + } + + return (0); + +error: + arc_hdr_free_abd(hdr, B_FALSE); + if (cabd != NULL) + arc_free_data_buf(hdr, cabd, arc_hdr_size(hdr), hdr); + + return (ret); +} + +/* + * This function is called during arc_buf_fill() to prepare the header's + * abd plaintext pointer for use. This involves authenticating protected + * data and decrypting encrypted data into the plaintext abd. + */ +static int +arc_fill_hdr_crypt(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, spa_t *spa, + const zbookmark_phys_t *zb, boolean_t noauth) +{ + int ret; + + ASSERT(HDR_PROTECTED(hdr)); + + if (hash_lock != NULL) + mutex_enter(hash_lock); + + if (HDR_NOAUTH(hdr) && !noauth) { + /* + * The caller requested authenticated data but our data has + * not been authenticated yet. Verify the MAC now if we can. + */ + ret = arc_hdr_authenticate(hdr, spa, zb->zb_objset); + if (ret != 0) + goto error; + } else if (HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd == NULL) { + /* + * If we only have the encrypted version of the data, but the + * unencrypted version was requested, we take this opportunity + * to store the decrypted version in the header for future use. + */ + ret = arc_hdr_decrypt(hdr, spa, zb); + if (ret != 0) + goto error; + } + + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + return (0); + +error: + if (hash_lock != NULL) + mutex_exit(hash_lock); + + return (ret); +} + +/* + * This function is used by the dbuf code to decrypt bonus buffers in place. + * The dbuf code itself doesn't have any locking for decrypting a shared dnode + * block, so we use the hash lock here to protect against concurrent calls to + * arc_buf_fill(). + */ +static void +arc_buf_untransform_in_place(arc_buf_t *buf, kmutex_t *hash_lock) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(HDR_ENCRYPTED(hdr)); + ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + + zio_crypt_copy_dnode_bonus(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED; + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + hdr->b_crypt_hdr.b_ebufcnt -= 1; +} + +/* + * Given a buf that has a data buffer attached to it, this function will + * efficiently fill the buf with data of the specified compression setting from + * the hdr and update the hdr's b_freeze_cksum if necessary. If the buf and hdr + * are already sharing a data buf, no copy is performed. + * + * If the buf is marked as compressed but uncompressed data was requested, this + * will allocate a new data buffer for the buf, remove that flag, and fill the + * buf with uncompressed data. 
You can't request a compressed buf on a hdr with + * uncompressed data, and (since we haven't added support for it yet) if you + * want compressed data your buf must already be marked as compressed and have + * the correct-sized data buffer. + */ +static int +arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, + arc_fill_flags_t flags) +{ + int error = 0; + arc_buf_hdr_t *hdr = buf->b_hdr; + boolean_t hdr_compressed = + (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF); + boolean_t compressed = (flags & ARC_FILL_COMPRESSED) != 0; + boolean_t encrypted = (flags & ARC_FILL_ENCRYPTED) != 0; + dmu_object_byteswap_t bswap = hdr->b_l1hdr.b_byteswap; + kmutex_t *hash_lock = (flags & ARC_FILL_LOCKED) ? NULL : HDR_LOCK(hdr); + + ASSERT3P(buf->b_data, !=, NULL); + IMPLY(compressed, hdr_compressed || ARC_BUF_ENCRYPTED(buf)); + IMPLY(compressed, ARC_BUF_COMPRESSED(buf)); + IMPLY(encrypted, HDR_ENCRYPTED(hdr)); + IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf)); + IMPLY(encrypted, ARC_BUF_COMPRESSED(buf)); + IMPLY(encrypted, !ARC_BUF_SHARED(buf)); + + /* + * If the caller wanted encrypted data we just need to copy it from + * b_rabd and potentially byteswap it. We won't be able to do any + * further transforms on it. + */ + if (encrypted) { + ASSERT(HDR_HAS_RABD(hdr)); + abd_copy_to_buf(buf->b_data, hdr->b_crypt_hdr.b_rabd, + HDR_GET_PSIZE(hdr)); + goto byteswap; + } + + /* + * Adjust encrypted and authenticated headers to accommodate + * the request if needed. Dnode blocks (ARC_FILL_IN_PLACE) are + * allowed to fail decryption due to keys not being loaded + * without being marked as an IO error. + */ + if (HDR_PROTECTED(hdr)) { + error = arc_fill_hdr_crypt(hdr, hash_lock, spa, + zb, !!(flags & ARC_FILL_NOAUTH)); + if (error == EACCES && (flags & ARC_FILL_IN_PLACE) != 0) { + return (error); + } else if (error != 0) { + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hash_lock != NULL) + mutex_exit(hash_lock); + return (error); + } + } + + /* + * There is a special case here for dnode blocks which are + * decrypting their bonus buffers. These blocks may request to + * be decrypted in-place. This is necessary because there may + * be many dnodes pointing into this buffer and there is + * currently no method to synchronize replacing the backing + * b_data buffer and updating all of the pointers. Here we use + * the hash lock to ensure there are no races. If the need + * arises for other types to be decrypted in-place, they must + * add handling here as well. + */ + if ((flags & ARC_FILL_IN_PLACE) != 0) { + ASSERT(!hdr_compressed); + ASSERT(!compressed); + ASSERT(!encrypted); + + if (HDR_ENCRYPTED(hdr) && ARC_BUF_ENCRYPTED(buf)) { + ASSERT3U(hdr->b_crypt_hdr.b_ot, ==, DMU_OT_DNODE); + + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_buf_untransform_in_place(buf, hash_lock); + if (hash_lock != NULL) + mutex_exit(hash_lock); + + /* Compute the hdr's checksum if necessary */ + arc_cksum_compute(buf); + } + + return (0); + } + + if (hdr_compressed == compressed) { + if (!arc_buf_is_shared(buf)) { + abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd, + arc_buf_size(buf)); + } + } else { + ASSERT(hdr_compressed); + ASSERT(!compressed); + ASSERT3U(HDR_GET_LSIZE(hdr), !=, HDR_GET_PSIZE(hdr)); + + /* + * If the buf is sharing its data with the hdr, unlink it and + * allocate a new data buffer for the buf. 
+ */ + if (arc_buf_is_shared(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + + /* We need to give the buf its own b_data */ + buf->b_flags &= ~ARC_BUF_FLAG_SHARED; + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + + /* Previously overhead was 0; just add new overhead */ + ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr)); + } else if (ARC_BUF_COMPRESSED(buf)) { + /* We need to reallocate the buf's b_data */ + arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr), + buf); + buf->b_data = + arc_get_data_buf(hdr, HDR_GET_LSIZE(hdr), buf); + + /* We increased the size of b_data; update overhead */ + ARCSTAT_INCR(arcstat_overhead_size, + HDR_GET_LSIZE(hdr) - HDR_GET_PSIZE(hdr)); + } + + /* + * Regardless of the buf's previous compression settings, it + * should not be compressed at the end of this function. + */ + buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED; + + /* + * Try copying the data from another buf which already has a + * decompressed version. If that's not possible, it's time to + * bite the bullet and decompress the data from the hdr. + */ + if (arc_buf_try_copy_decompressed_data(buf)) { + /* Skip byteswapping and checksumming (already done) */ + return (0); + } else { + error = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, buf->b_data, + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr), + &hdr->b_complevel); + + /* + * Absent hardware errors or software bugs, this should + * be impossible, but log it anyway so we can debug it. + */ + if (error != 0) { + zfs_dbgmsg( + "hdr %px, compress %d, psize %d, lsize %d", + hdr, arc_hdr_get_compress(hdr), + HDR_GET_PSIZE(hdr), HDR_GET_LSIZE(hdr)); + if (hash_lock != NULL) + mutex_enter(hash_lock); + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hash_lock != NULL) + mutex_exit(hash_lock); + return (SET_ERROR(EIO)); + } + } + } + +byteswap: + /* Byteswap the buf's data if necessary */ + if (bswap != DMU_BSWAP_NUMFUNCS) { + ASSERT(!HDR_SHARED_DATA(hdr)); + ASSERT3U(bswap, <, DMU_BSWAP_NUMFUNCS); + dmu_ot_byteswap[bswap].ob_func(buf->b_data, HDR_GET_LSIZE(hdr)); + } + + /* Compute the hdr's checksum if necessary */ + arc_cksum_compute(buf); + + return (0); +} + +/* + * If this function is being called to decrypt an encrypted buffer or verify an + * authenticated one, the key must be loaded and a mapping must be made + * available in the keystore via spa_keystore_create_mapping() or one of its + * callers. + */ +int +arc_untransform(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb, + boolean_t in_place) +{ + int ret; + arc_fill_flags_t flags = 0; + + if (in_place) + flags |= ARC_FILL_IN_PLACE; + + ret = arc_buf_fill(buf, spa, zb, flags); + if (ret == ECKSUM) { + /* + * Convert authentication and decryption errors to EIO + * (and generate an ereport) before leaving the ARC. + */ + ret = SET_ERROR(EIO); + spa_log_error(spa, zb); + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, zb, NULL, 0, 0); + } + + return (ret); +} + +/* + * Increment the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. 
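+ * Ghost state headers carry no buffers or data, so only their logical + * size (HDR_GET_LSIZE()) is counted, as the GHOST_STATE() branch below + * shows.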
+ */ +static void +arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + (void) zfs_refcount_add_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) zfs_refcount_add_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_add_many(&state->arcs_esize[type], + HDR_GET_PSIZE(hdr), hdr); + } + + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) + continue; + (void) zfs_refcount_add_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); + } +} + +/* + * Decrement the amount of evictable space in the arc_state_t's refcount. + * We account for the space used by the hdr and the arc buf individually + * so that we can add and remove them from the refcount individually. + */ +static void +arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state) +{ + arc_buf_contents_t type = arc_buf_type(hdr); + + ASSERT(HDR_HAS_L1HDR(hdr)); + + if (GHOST_STATE(state)) { + ASSERT0(hdr->b_l1hdr.b_bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + (void) zfs_refcount_remove_many(&state->arcs_esize[type], + HDR_GET_LSIZE(hdr), hdr); + return; + } + + ASSERT(!GHOST_STATE(state)); + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) zfs_refcount_remove_many(&state->arcs_esize[type], + arc_hdr_size(hdr), hdr); + } + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_remove_many(&state->arcs_esize[type], + HDR_GET_PSIZE(hdr), hdr); + } + + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + if (arc_buf_is_shared(buf)) + continue; + (void) zfs_refcount_remove_many(&state->arcs_esize[type], + arc_buf_size(buf), buf); + } +} + +/* + * Add a reference to this hdr indicating that someone is actively + * referencing that memory. When the refcount transitions from 0 to 1, + * we remove it from the respective arc_state_t list to indicate that + * it is not evictable. + */ +static void +add_reference(arc_buf_hdr_t *hdr, void *tag) +{ + arc_state_t *state; + + ASSERT(HDR_HAS_L1HDR(hdr)); + if (!HDR_EMPTY(hdr) && !MUTEX_HELD(HDR_LOCK(hdr))) { + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + } + + state = hdr->b_l1hdr.b_state; + + if ((zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag) == 1) && + (state != arc_anon)) { + /* We don't use the L2-only state list. */ + if (state != arc_l2c_only) { + multilist_remove(state->arcs_list[arc_buf_type(hdr)], + hdr); + arc_evictable_space_decrement(hdr, state); + } + /* remove the prefetch flag if we get a reference */ + arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + } +} + +/* + * Remove a reference from this hdr. When the reference transitions from + * 1 to 0 and we're not anonymous, then we add this hdr to the arc_state_t's + * list making it eligible for eviction. 
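+ * + * A sketch of the pairing with add_reference() (hypothetical caller): + * + *	add_reference(hdr, tag);	(hdr becomes unevictable) + *	... use the buffer ... + *	(void) remove_reference(hdr, hash_lock, tag);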
+ */ +static int +remove_reference(arc_buf_hdr_t *hdr, kmutex_t *hash_lock, void *tag) +{ + int cnt; + arc_state_t *state = hdr->b_l1hdr.b_state; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(state == arc_anon || MUTEX_HELD(hash_lock)); + ASSERT(!GHOST_STATE(state)); + + /* + * arc_l2c_only counts as a ghost state so we don't need to explicitly + * check to prevent usage of the arc_l2c_only list. + */ + if (((cnt = zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag)) == 0) && + (state != arc_anon)) { + multilist_insert(state->arcs_list[arc_buf_type(hdr)], hdr); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + arc_evictable_space_increment(hdr, state); + } + return (cnt); +} + +/* + * Returns detailed information about a specific arc buffer. When the + * state_index argument is set the function will calculate the arc header + * list position for its arc state. Since this requires a linear traversal, + * callers are strongly encouraged not to do this. However, it can be helpful + * for targeted analysis so the functionality is provided. + */ +void +arc_buf_info(arc_buf_t *ab, arc_buf_info_t *abi, int state_index) +{ + arc_buf_hdr_t *hdr = ab->b_hdr; + l1arc_buf_hdr_t *l1hdr = NULL; + l2arc_buf_hdr_t *l2hdr = NULL; + arc_state_t *state = NULL; + + memset(abi, 0, sizeof (arc_buf_info_t)); + + if (hdr == NULL) + return; + + abi->abi_flags = hdr->b_flags; + + if (HDR_HAS_L1HDR(hdr)) { + l1hdr = &hdr->b_l1hdr; + state = l1hdr->b_state; + } + if (HDR_HAS_L2HDR(hdr)) + l2hdr = &hdr->b_l2hdr; + + if (l1hdr) { + abi->abi_bufcnt = l1hdr->b_bufcnt; + abi->abi_access = l1hdr->b_arc_access; + abi->abi_mru_hits = l1hdr->b_mru_hits; + abi->abi_mru_ghost_hits = l1hdr->b_mru_ghost_hits; + abi->abi_mfu_hits = l1hdr->b_mfu_hits; + abi->abi_mfu_ghost_hits = l1hdr->b_mfu_ghost_hits; + abi->abi_holds = zfs_refcount_count(&l1hdr->b_refcnt); + } + + if (l2hdr) { + abi->abi_l2arc_dattr = l2hdr->b_daddr; + abi->abi_l2arc_hits = l2hdr->b_hits; + } + + abi->abi_state_type = state ? state->arcs_state : ARC_STATE_ANON; + abi->abi_state_contents = arc_buf_type(hdr); + abi->abi_size = arc_hdr_size(hdr); +} + +/* + * Move the supplied buffer to the indicated state. The hash lock + * for the buffer must be held by the caller. + */ +static void +arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr, + kmutex_t *hash_lock) +{ + arc_state_t *old_state; + int64_t refcnt; + uint32_t bufcnt; + boolean_t update_old, update_new; + arc_buf_contents_t buftype = arc_buf_type(hdr); + + /* + * We almost always have an L1 hdr here, since we call arc_hdr_realloc() + * in arc_read() when bringing a buffer out of the L2ARC. However, the + * L1 hdr doesn't always exist when we change state to arc_anon before + * destroying a header, in which case reallocating to add the L1 hdr is + * pointless. + */ + if (HDR_HAS_L1HDR(hdr)) { + old_state = hdr->b_l1hdr.b_state; + refcnt = zfs_refcount_count(&hdr->b_l1hdr.b_refcnt); + bufcnt = hdr->b_l1hdr.b_bufcnt; + update_old = (bufcnt > 0 || hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + } else { + old_state = arc_l2c_only; + refcnt = 0; + bufcnt = 0; + update_old = B_FALSE; + } + update_new = update_old; + + ASSERT(MUTEX_HELD(hash_lock)); + ASSERT3P(new_state, !=, old_state); + ASSERT(!GHOST_STATE(new_state) || bufcnt == 0); + ASSERT(old_state != arc_anon || bufcnt <= 1); + + /* + * If this buffer is evictable, transfer it from the + * old state list to the new state list. 
+ */ + if (refcnt == 0) { + if (old_state != arc_anon && old_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_remove(old_state->arcs_list[buftype], hdr); + + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_old = B_TRUE; + } + arc_evictable_space_decrement(hdr, old_state); + } + if (new_state != arc_anon && new_state != arc_l2c_only) { + /* + * An L1 header always exists here, since if we're + * moving to some L1-cached state (i.e. not l2c_only or + * anonymous), we realloc the header to add an L1hdr + * beforehand. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + multilist_insert(new_state->arcs_list[buftype], hdr); + + if (GHOST_STATE(new_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + update_new = B_TRUE; + } + arc_evictable_space_increment(hdr, new_state); + } + } + + ASSERT(!HDR_EMPTY(hdr)); + if (new_state == arc_anon && HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + + /* adjust state sizes (ignore arc_l2c_only) */ + + if (update_new && new_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(new_state)) { + ASSERT0(bufcnt); + + /* + * When moving a header to a ghost state, we first + * remove all arc buffers. Thus, we'll have a + * bufcnt of zero, and no arc buffer to use for + * the reference. As a result, we use the arc + * header pointer for the reference. + */ + (void) zfs_refcount_add_many(&new_state->arcs_size, + HDR_GET_LSIZE(hdr), hdr); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + } else { + uint32_t buffers = 0; + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. + */ + if (arc_buf_is_shared(buf)) + continue; + + (void) zfs_refcount_add_many( + &new_state->arcs_size, + arc_buf_size(buf), buf); + } + ASSERT3U(bufcnt, ==, buffers); + + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) zfs_refcount_add_many( + &new_state->arcs_size, + arc_hdr_size(hdr), hdr); + } + + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_add_many( + &new_state->arcs_size, + HDR_GET_PSIZE(hdr), hdr); + } + } + } + + if (update_old && old_state != arc_l2c_only) { + ASSERT(HDR_HAS_L1HDR(hdr)); + if (GHOST_STATE(old_state)) { + ASSERT0(bufcnt); + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + + /* + * When moving a header off of a ghost state, + * the header will not contain any arc buffers. + * We use the arc header pointer for the reference + * which is exactly what we did when we put the + * header on the ghost state. + */ + + (void) zfs_refcount_remove_many(&old_state->arcs_size, + HDR_GET_LSIZE(hdr), hdr); + } else { + uint32_t buffers = 0; + + /* + * Each individual buffer holds a unique reference, + * thus we must remove each of these references one + * at a time. + */ + for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL; + buf = buf->b_next) { + ASSERT3U(bufcnt, !=, 0); + buffers++; + + /* + * When the arc_buf_t is sharing the data + * block with the hdr, the owner of the + * reference belongs to the hdr. Only + * add to the refcount if the arc_buf_t is + * not shared. 
+ */ + if (arc_buf_is_shared(buf)) + continue; + + (void) zfs_refcount_remove_many( + &old_state->arcs_size, arc_buf_size(buf), + buf); + } + ASSERT3U(bufcnt, ==, buffers); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + + if (hdr->b_l1hdr.b_pabd != NULL) { + (void) zfs_refcount_remove_many( + &old_state->arcs_size, arc_hdr_size(hdr), + hdr); + } + + if (HDR_HAS_RABD(hdr)) { + (void) zfs_refcount_remove_many( + &old_state->arcs_size, HDR_GET_PSIZE(hdr), + hdr); + } + } + } + + if (HDR_HAS_L1HDR(hdr)) + hdr->b_l1hdr.b_state = new_state; + + /* + * L2 headers should never be on the L2 state list since they don't + * have L1 headers allocated. + */ + ASSERT(multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_DATA]) && + multilist_is_empty(arc_l2c_only->arcs_list[ARC_BUFC_METADATA])); +} + +void +arc_space_consume(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + default: + break; + case ARC_SPACE_DATA: + aggsum_add(&astat_data_size, space); + break; + case ARC_SPACE_META: + aggsum_add(&astat_metadata_size, space); + break; + case ARC_SPACE_BONUS: + aggsum_add(&astat_bonus_size, space); + break; + case ARC_SPACE_DNODE: + aggsum_add(&astat_dnode_size, space); + break; + case ARC_SPACE_DBUF: + aggsum_add(&astat_dbuf_size, space); + break; + case ARC_SPACE_HDRS: + aggsum_add(&astat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + aggsum_add(&astat_l2_hdr_size, space); + break; + case ARC_SPACE_ABD_CHUNK_WASTE: + /* + * Note: this includes space wasted by all scatter ABD's, not + * just those allocated by the ARC. But the vast majority of + * scatter ABD's come from the ARC, because other users are + * very short-lived. + */ + aggsum_add(&astat_abd_chunk_waste_size, space); + break; + } + + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) + aggsum_add(&arc_meta_used, space); + + aggsum_add(&arc_size, space); +} + +void +arc_space_return(uint64_t space, arc_space_type_t type) +{ + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + default: + break; + case ARC_SPACE_DATA: + aggsum_add(&astat_data_size, -space); + break; + case ARC_SPACE_META: + aggsum_add(&astat_metadata_size, -space); + break; + case ARC_SPACE_BONUS: + aggsum_add(&astat_bonus_size, -space); + break; + case ARC_SPACE_DNODE: + aggsum_add(&astat_dnode_size, -space); + break; + case ARC_SPACE_DBUF: + aggsum_add(&astat_dbuf_size, -space); + break; + case ARC_SPACE_HDRS: + aggsum_add(&astat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + aggsum_add(&astat_l2_hdr_size, -space); + break; + case ARC_SPACE_ABD_CHUNK_WASTE: + aggsum_add(&astat_abd_chunk_waste_size, -space); + break; + } + + if (type != ARC_SPACE_DATA && type != ARC_SPACE_ABD_CHUNK_WASTE) { + ASSERT(aggsum_compare(&arc_meta_used, space) >= 0); + /* + * We use the upper bound here rather than the precise value + * because the arc_meta_max value doesn't need to be + * precise. It's only consumed by humans via arcstats. + */ + if (arc_meta_max < aggsum_upper_bound(&arc_meta_used)) + arc_meta_max = aggsum_upper_bound(&arc_meta_used); + aggsum_add(&arc_meta_used, -space); + } + + ASSERT(aggsum_compare(&arc_size, space) >= 0); + aggsum_add(&arc_size, -space); +} + +/* + * Given a hdr and a buf, returns whether that buf can share its b_data buffer + * with the hdr's b_pabd. + */ +static boolean_t +arc_can_share(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + /* + * The criteria for sharing a hdr's data are: + * 1. the buffer is not encrypted + * 2. 
the hdr's compression matches the buf's compression + * 3. the hdr doesn't need to be byteswapped + * 4. the hdr isn't already being shared + * 5. the buf is either compressed or it is the last buf in the hdr list + * + * Criterion #5 maintains the invariant that shared uncompressed + * bufs must be the final buf in the hdr's b_buf list. Reading this, you + * might ask, "if a compressed buf is allocated first, won't that be the + * last thing in the list?", but in that case it's impossible to create + * a shared uncompressed buf anyway (because the hdr must be compressed + * to have the compressed buf). You might also think that #3 is + * sufficient to make this guarantee; however, it's possible + * (specifically in the rare L2ARC write race mentioned in + * arc_buf_alloc_impl()) there will be an existing uncompressed buf that + * is shareable, but wasn't at the time of its allocation. Rather than + * allow a new shared uncompressed buf to be created and then shuffle + * the list around to make it the last element, this simply disallows + * sharing if the new buf isn't the first to be added. + */ + ASSERT3P(buf->b_hdr, ==, hdr); + boolean_t hdr_compressed = + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF; + boolean_t buf_compressed = ARC_BUF_COMPRESSED(buf) != 0; + return (!ARC_BUF_ENCRYPTED(buf) && + buf_compressed == hdr_compressed && + hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS && + !HDR_SHARED_DATA(hdr) && + (ARC_BUF_LAST(buf) || ARC_BUF_COMPRESSED(buf))); +} + +/* + * Allocate a buf for this hdr. If you care about the data that's in the hdr, + * or if you want a compressed buffer, pass those flags in. Returns 0 if the + * copy was made successfully, or an error code otherwise. + */ +static int +arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb, + void *tag, boolean_t encrypted, boolean_t compressed, boolean_t noauth, + boolean_t fill, arc_buf_t **ret) +{ + arc_buf_t *buf; + arc_fill_flags_t flags = ARC_FILL_LOCKED; + + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3U(HDR_GET_LSIZE(hdr), >, 0); + VERIFY(hdr->b_type == ARC_BUFC_DATA || + hdr->b_type == ARC_BUFC_METADATA); + ASSERT3P(ret, !=, NULL); + ASSERT3P(*ret, ==, NULL); + IMPLY(encrypted, compressed); + + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + + buf = *ret = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); + buf->b_hdr = hdr; + buf->b_data = NULL; + buf->b_next = hdr->b_l1hdr.b_buf; + buf->b_flags = 0; + + add_reference(hdr, tag); + + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + + /* + * Only honor requests for compressed bufs if the hdr is actually + * compressed. This must be overridden if the buffer is encrypted since + * encrypted buffers cannot be decompressed. + */ + if (encrypted) { + buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + buf->b_flags |= ARC_BUF_FLAG_ENCRYPTED; + flags |= ARC_FILL_COMPRESSED | ARC_FILL_ENCRYPTED; + } else if (compressed && + arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { + buf->b_flags |= ARC_BUF_FLAG_COMPRESSED; + flags |= ARC_FILL_COMPRESSED; + } + + if (noauth) { + ASSERT0(encrypted); + flags |= ARC_FILL_NOAUTH; + } + + /* + * If the hdr's data can be shared then we share the data buffer and + * set the appropriate bit in the hdr's b_flags to indicate the hdr is + * sharing its b_pabd with the arc_buf_t. 
+ * buffer to store the buf's data.
+ *
+ * There are two additional restrictions here because we're sharing
+ * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
+ * actively involved in an L2ARC write, because if this buf is used by
+ * an arc_write() then the hdr's data buffer will be released when the
+ * write completes, even though the L2ARC write might still be using it.
+ * Second, the hdr's ABD must be linear so that the buf's user doesn't
+ * need to be ABD-aware. It must be allocated via
+ * zio_[data_]buf_alloc(), not as a page, because we need to be able
+ * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+ * page" buffers because the ABD code needs to handle freeing them
+ * specially.
+ */
+ boolean_t can_share = arc_can_share(hdr, buf) &&
+ !HDR_L2_WRITING(hdr) &&
+ hdr->b_l1hdr.b_pabd != NULL &&
+ abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+ !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
+
+ /* Set up b_data and sharing */
+ if (can_share) {
+ buf->b_data = abd_to_buf(hdr->b_l1hdr.b_pabd);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ } else {
+ buf->b_data =
+ arc_get_data_buf(hdr, arc_buf_size(buf), buf);
+ ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf));
+ }
+ VERIFY3P(buf->b_data, !=, NULL);
+
+ hdr->b_l1hdr.b_buf = buf;
+ hdr->b_l1hdr.b_bufcnt += 1;
+ if (encrypted)
+ hdr->b_crypt_hdr.b_ebufcnt += 1;
+
+ /*
+ * If the user wants the data from the hdr, we need to either copy or
+ * decompress the data.
+ */
+ if (fill) {
+ ASSERT3P(zb, !=, NULL);
+ return (arc_buf_fill(buf, spa, zb, flags));
+ }
+
+ return (0);
+}
+
+static char *arc_onloan_tag = "onloan";
+
+static inline void
+arc_loaned_bytes_update(int64_t delta)
+{
+ atomic_add_64(&arc_loaned_bytes, delta);
+
+ /* assert that it did not wrap around */
+ ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+}
+
+/*
+ * Loan out an anonymous arc buffer. Loaned buffers are not counted as in
+ * flight data by arc_tempreserve_space() until they are "returned". Loaned
+ * buffers must be returned to the arc before they can be used by the DMU or
+ * freed.
+ */
+arc_buf_t *
+arc_loan_buf(spa_t *spa, boolean_t is_metadata, int size)
+{
+ arc_buf_t *buf = arc_alloc_buf(spa, arc_onloan_tag,
+ is_metadata ? ARC_BUFC_METADATA : ARC_BUFC_DATA, size);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_loan_compressed_buf(spa_t *spa, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_t *buf = arc_alloc_compressed_buf(spa, arc_onloan_tag,
+ psize, lsize, compression_type, complevel);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+
+ return (buf);
+}
+
+arc_buf_t *
+arc_loan_raw_buf(spa_t *spa, uint64_t dsobj, boolean_t byteorder,
+ const uint8_t *salt, const uint8_t *iv, const uint8_t *mac,
+ dmu_object_type_t ot, uint64_t psize, uint64_t lsize,
+ enum zio_compress compression_type, uint8_t complevel)
+{
+ arc_buf_t *buf = arc_alloc_raw_buf(spa, arc_onloan_tag, dsobj,
+ byteorder, salt, iv, mac, ot, psize, lsize, compression_type,
+ complevel);
+
+ atomic_add_64(&arc_loaned_bytes, psize);
+ return (buf);
+}
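A sketch of the loan lifecycle these functions implement, assuming a typical caller (the helper and its arguments are hypothetical; arc_loan_buf(), arc_return_buf(), and FTAG are real names from this codebase):

/* Sketch: borrow an anonymous buffer, fill it, and return the loan. */
static void
example_fill_loaned(spa_t *spa, const void *src, int len)
{
	arc_buf_t *abuf = arc_loan_buf(spa, B_FALSE, len);

	bcopy(src, abuf->b_data, len);	/* fill while on loan */
	arc_return_buf(abuf, FTAG);	/* ref moves from arc_onloan_tag to FTAG */
}

Note that arc_loan_raw_buf() above accounts for the loan by psize directly with atomic_add_64(), rather than going through arc_loaned_bytes_update().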
+/*
+ * Return a loaned arc buffer to the arc.
+ */
+void
+arc_return_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+
+ arc_loaned_bytes_update(-arc_buf_size(buf));
+}
+
+/* Detach an arc_buf from a dbuf (tag) */
+void
+arc_loan_inuse_buf(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ ASSERT3P(buf->b_data, !=, NULL);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ (void) zfs_refcount_add(&hdr->b_l1hdr.b_refcnt, arc_onloan_tag);
+ (void) zfs_refcount_remove(&hdr->b_l1hdr.b_refcnt, tag);
+
+ arc_loaned_bytes_update(arc_buf_size(buf));
+}
+
+static void
+l2arc_free_abd_on_write(abd_t *abd, size_t size, arc_buf_contents_t type)
+{
+ l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
+
+ df->l2df_abd = abd;
+ df->l2df_size = size;
+ df->l2df_type = type;
+ mutex_enter(&l2arc_free_on_write_mtx);
+ list_insert_head(l2arc_free_on_write, df);
+ mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+static void
+arc_hdr_free_on_write(arc_buf_hdr_t *hdr, boolean_t free_rdata)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, hdr);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, hdr);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+
+ if (free_rdata) {
+ l2arc_free_abd_on_write(hdr->b_crypt_hdr.b_rabd, size, type);
+ } else {
+ l2arc_free_abd_on_write(hdr->b_l1hdr.b_pabd, size, type);
+ }
+}
+
+/*
+ * Share the arc_buf_t's data with the hdr. Whenever we are sharing the
+ * data buffer, we transfer the refcount ownership to the hdr and update
+ * the appropriate kstats.
+ */
+static void
+arc_share_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf)
+{
+ ASSERT(arc_can_share(hdr, buf));
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!ARC_BUF_ENCRYPTED(buf));
+ ASSERT(HDR_EMPTY_OR_LOCKED(hdr));
+
+ /*
+ * Start sharing the data buffer. We transfer the
+ * refcount ownership to the hdr since it always owns
+ * the refcount whenever an arc_buf_t is shared.
+ */
+ zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size,
+ arc_hdr_size(hdr), buf, hdr);
+ hdr->b_l1hdr.b_pabd = abd_get_from_buf(buf->b_data, arc_buf_size(buf));
+ abd_take_ownership_of_buf(hdr->b_l1hdr.b_pabd,
+ HDR_ISTYPE_METADATA(hdr));
+ arc_hdr_set_flags(hdr, ARC_FLAG_SHARED_DATA);
+ buf->b_flags |= ARC_BUF_FLAG_SHARED;
+
+ /*
+ * Since we've transferred ownership to the hdr we need
+ * to increment its compressed and uncompressed kstats and
+ * decrement the overhead size.
+ */ + ARCSTAT_INCR(arcstat_compressed_size, arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, -arc_buf_size(buf)); +} + +static void +arc_unshare_buf(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + ASSERT(arc_buf_is_shared(buf)); + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + + /* + * We are no longer sharing this buffer so we need + * to transfer its ownership to the rightful owner. + */ + zfs_refcount_transfer_ownership_many(&hdr->b_l1hdr.b_state->arcs_size, + arc_hdr_size(hdr), hdr, buf); + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + abd_release_ownership_of_buf(hdr->b_l1hdr.b_pabd); + abd_put(hdr->b_l1hdr.b_pabd); + hdr->b_l1hdr.b_pabd = NULL; + buf->b_flags &= ~ARC_BUF_FLAG_SHARED; + + /* + * Since the buffer is no longer shared between + * the arc buf and the hdr, count it as overhead. + */ + ARCSTAT_INCR(arcstat_compressed_size, -arc_hdr_size(hdr)); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); + ARCSTAT_INCR(arcstat_overhead_size, arc_buf_size(buf)); +} + +/* + * Remove an arc_buf_t from the hdr's buf list and return the last + * arc_buf_t on the list. If no buffers remain on the list then return + * NULL. + */ +static arc_buf_t * +arc_buf_remove(arc_buf_hdr_t *hdr, arc_buf_t *buf) +{ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + + arc_buf_t **bufp = &hdr->b_l1hdr.b_buf; + arc_buf_t *lastbuf = NULL; + + /* + * Remove the buf from the hdr list and locate the last + * remaining buffer on the list. + */ + while (*bufp != NULL) { + if (*bufp == buf) + *bufp = buf->b_next; + + /* + * If we've removed a buffer in the middle of + * the list then update the lastbuf and update + * bufp. + */ + if (*bufp != NULL) { + lastbuf = *bufp; + bufp = &(*bufp)->b_next; + } + } + buf->b_next = NULL; + ASSERT3P(lastbuf, !=, buf); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, lastbuf != NULL); + IMPLY(hdr->b_l1hdr.b_bufcnt > 0, hdr->b_l1hdr.b_buf != NULL); + IMPLY(lastbuf != NULL, ARC_BUF_LAST(lastbuf)); + + return (lastbuf); +} + +/* + * Free up buf->b_data and pull the arc_buf_t off of the arc_buf_hdr_t's + * list and free it. + */ +static void +arc_buf_destroy_impl(arc_buf_t *buf) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + /* + * Free up the data associated with the buf but only if we're not + * sharing this with the hdr. If we are sharing it with the hdr, the + * hdr is responsible for doing the free. + */ + if (buf->b_data != NULL) { + /* + * We're about to change the hdr's b_flags. We must either + * hold the hash_lock or be undiscoverable. + */ + ASSERT(HDR_EMPTY_OR_LOCKED(hdr)); + + arc_cksum_verify(buf); + arc_buf_unwatch(buf); + + if (arc_buf_is_shared(buf)) { + arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA); + } else { + uint64_t size = arc_buf_size(buf); + arc_free_data_buf(hdr, buf->b_data, size, buf); + ARCSTAT_INCR(arcstat_overhead_size, -size); + } + buf->b_data = NULL; + + ASSERT(hdr->b_l1hdr.b_bufcnt > 0); + hdr->b_l1hdr.b_bufcnt -= 1; + + if (ARC_BUF_ENCRYPTED(buf)) { + hdr->b_crypt_hdr.b_ebufcnt -= 1; + + /* + * If we have no more encrypted buffers and we've + * already gotten a copy of the decrypted data we can + * free b_rabd to save some space. 
+ */
+ if (hdr->b_crypt_hdr.b_ebufcnt == 0 &&
+ HDR_HAS_RABD(hdr) && hdr->b_l1hdr.b_pabd != NULL &&
+ !HDR_IO_IN_PROGRESS(hdr)) {
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+ }
+ }
+
+ arc_buf_t *lastbuf = arc_buf_remove(hdr, buf);
+
+ if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
+ /*
+ * If the current arc_buf_t is sharing its data buffer with the
+ * hdr, then reassign the hdr's b_pabd to share it with the new
+ * buffer at the end of the list. The shared buffer is always
+ * the last one on the hdr's buffer list.
+ *
+ * There is an equivalent case for compressed bufs, but since
+ * they aren't guaranteed to be the last buf in the list and
+ * that is an exceedingly rare case, we just allow that space to
+ * be wasted temporarily. We must also be careful not to share
+ * encrypted buffers, since they cannot be shared.
+ */
+ if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
+ /* Only one buf can be shared at once */
+ VERIFY(!arc_buf_is_shared(lastbuf));
+ /* hdr is uncompressed so can't have compressed buf */
+ VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
+
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ /*
+ * We must set up a new shared block between the
+ * last buffer and the hdr. The data would have
+ * been allocated by the arc buf so we need to transfer
+ * ownership to the hdr since it's now being shared.
+ */
+ arc_share_buf(hdr, lastbuf);
+ }
+ } else if (HDR_SHARED_DATA(hdr)) {
+ /*
+ * Uncompressed shared buffers are always at the end
+ * of the list. Compressed buffers don't have the
+ * same requirements. This makes it hard to
+ * simply assert that the lastbuf is shared so
+ * we rely on the hdr's compression flags to determine
+ * if we have a compressed, shared buffer.
+ */
+ ASSERT3P(lastbuf, !=, NULL);
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ }
+
+ /*
+ * Free the checksum if we're removing the last uncompressed buf from
+ * this hdr.
+ */
+ if (!arc_hdr_has_uncompressed_buf(hdr)) {
+ arc_cksum_free(hdr);
+ }
+
+ /* clean up the buf */
+ buf->b_hdr = NULL;
+ kmem_cache_free(buf_cache, buf);
+}
+
+static void
+arc_hdr_alloc_abd(arc_buf_hdr_t *hdr, int alloc_flags)
+{
+ uint64_t size;
+ boolean_t alloc_rdata = ((alloc_flags & ARC_HDR_ALLOC_RDATA) != 0);
+ boolean_t do_adapt = ((alloc_flags & ARC_HDR_DO_ADAPT) != 0);
+
+ ASSERT3U(HDR_GET_LSIZE(hdr), >, 0);
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr) || alloc_rdata);
+ IMPLY(alloc_rdata, HDR_PROTECTED(hdr));
+
+ if (alloc_rdata) {
+ size = HDR_GET_PSIZE(hdr);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, ==, NULL);
+ hdr->b_crypt_hdr.b_rabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
+ ASSERT3P(hdr->b_crypt_hdr.b_rabd, !=, NULL);
+ ARCSTAT_INCR(arcstat_raw_size, size);
+ } else {
+ size = arc_hdr_size(hdr);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ hdr->b_l1hdr.b_pabd = arc_get_data_abd(hdr, size, hdr,
+ do_adapt);
+ ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+ }
+
+ ARCSTAT_INCR(arcstat_compressed_size, size);
+ ARCSTAT_INCR(arcstat_uncompressed_size, HDR_GET_LSIZE(hdr));
+}
+
+static void
+arc_hdr_free_abd(arc_buf_hdr_t *hdr, boolean_t free_rdata)
+{
+ uint64_t size = (free_rdata) ? HDR_GET_PSIZE(hdr) : arc_hdr_size(hdr);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ IMPLY(free_rdata, HDR_HAS_RABD(hdr));
+
+ /*
+ * If the hdr is currently being written to the l2arc then
+ * we defer freeing the data by adding it to the l2arc_free_on_write
+ * list.
The l2arc will free the data once it's finished + * writing it to the l2arc device. + */ + if (HDR_L2_WRITING(hdr)) { + arc_hdr_free_on_write(hdr, free_rdata); + ARCSTAT_BUMP(arcstat_l2_free_on_write); + } else if (free_rdata) { + arc_free_data_abd(hdr, hdr->b_crypt_hdr.b_rabd, size, hdr); + } else { + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, size, hdr); + } + + if (free_rdata) { + hdr->b_crypt_hdr.b_rabd = NULL; + ARCSTAT_INCR(arcstat_raw_size, -size); + } else { + hdr->b_l1hdr.b_pabd = NULL; + } + + if (hdr->b_l1hdr.b_pabd == NULL && !HDR_HAS_RABD(hdr)) + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + + ARCSTAT_INCR(arcstat_compressed_size, -size); + ARCSTAT_INCR(arcstat_uncompressed_size, -HDR_GET_LSIZE(hdr)); +} + +static arc_buf_hdr_t * +arc_hdr_alloc(uint64_t spa, int32_t psize, int32_t lsize, + boolean_t protected, enum zio_compress compression_type, uint8_t complevel, + arc_buf_contents_t type, boolean_t alloc_rdata) +{ + arc_buf_hdr_t *hdr; + int flags = ARC_HDR_DO_ADAPT; + + VERIFY(type == ARC_BUFC_DATA || type == ARC_BUFC_METADATA); + if (protected) { + hdr = kmem_cache_alloc(hdr_full_crypt_cache, KM_PUSHPAGE); + } else { + hdr = kmem_cache_alloc(hdr_full_cache, KM_PUSHPAGE); + } + flags |= alloc_rdata ? ARC_HDR_ALLOC_RDATA : 0; + + ASSERT(HDR_EMPTY(hdr)); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + HDR_SET_PSIZE(hdr, psize); + HDR_SET_LSIZE(hdr, lsize); + hdr->b_spa = spa; + hdr->b_type = type; + hdr->b_flags = 0; + arc_hdr_set_flags(hdr, arc_bufc_to_flags(type) | ARC_FLAG_HAS_L1HDR); + arc_hdr_set_compress(hdr, compression_type); + hdr->b_complevel = complevel; + if (protected) + arc_hdr_set_flags(hdr, ARC_FLAG_PROTECTED); + + hdr->b_l1hdr.b_state = arc_anon; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_buf = NULL; + + /* + * Allocate the hdr's buffer. This will contain either + * the compressed or uncompressed data depending on the block + * it references and compressed arc enablement. + */ + arc_hdr_alloc_abd(hdr, flags); + ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + + return (hdr); +} + +/* + * Transition between the two allocation states for the arc_buf_hdr struct. + * The arc_buf_hdr struct can be allocated with (hdr_full_cache) or without + * (hdr_l2only_cache) the fields necessary for the L1 cache - the smaller + * version is used when a cache buffer is only in the L2ARC in order to reduce + * memory usage. + */ +static arc_buf_hdr_t * +arc_hdr_realloc(arc_buf_hdr_t *hdr, kmem_cache_t *old, kmem_cache_t *new) +{ + ASSERT(HDR_HAS_L2HDR(hdr)); + + arc_buf_hdr_t *nhdr; + l2arc_dev_t *dev = hdr->b_l2hdr.b_dev; + + ASSERT((old == hdr_full_cache && new == hdr_l2only_cache) || + (old == hdr_l2only_cache && new == hdr_full_cache)); + + /* + * if the caller wanted a new full header and the header is to be + * encrypted we will actually allocate the header from the full crypt + * cache instead. The same applies to freeing from the old cache. 
+ */
+ if (HDR_PROTECTED(hdr) && new == hdr_full_cache)
+ new = hdr_full_crypt_cache;
+ if (HDR_PROTECTED(hdr) && old == hdr_full_cache)
+ old = hdr_full_crypt_cache;
+
+ nhdr = kmem_cache_alloc(new, KM_PUSHPAGE);
+
+ ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+ buf_hash_remove(hdr);
+
+ bcopy(hdr, nhdr, HDR_L2ONLY_SIZE);
+
+ if (new == hdr_full_cache || new == hdr_full_crypt_cache) {
+ arc_hdr_set_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ /*
+ * arc_access and arc_change_state need to be aware that a
+ * header has just come out of L2ARC, so we set its state to
+ * l2c_only even though it's about to change.
+ */
+ nhdr->b_l1hdr.b_state = arc_l2c_only;
+
+ /* Verify previous threads set these to NULL before freeing */
+ ASSERT3P(nhdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ } else {
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(hdr->b_l1hdr.b_bufcnt);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ /*
+ * If we've reached here, we must have been called from
+ * arc_evict_hdr(); as such, we should have already been
+ * removed from any ghost list we were previously on
+ * (which protects us from racing with arc_evict_state),
+ * thus no locking is needed during this check.
+ */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ /*
+ * A buffer must not be moved into the arc_l2c_only
+ * state if it's not finished being written out to the
+ * l2arc device. Otherwise, the b_l1hdr.b_pabd field
+ * might try to be accessed, even though it was removed.
+ */
+ VERIFY(!HDR_L2_WRITING(hdr));
+ VERIFY3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+
+ arc_hdr_clear_flags(nhdr, ARC_FLAG_HAS_L1HDR);
+ }
+ /*
+ * The header has been reallocated so we need to re-insert it into any
+ * lists it was on.
+ */
+ (void) buf_hash_insert(nhdr, NULL);
+
+ ASSERT(list_link_active(&hdr->b_l2hdr.b_l2node));
+
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * We must place the realloc'ed header back into the list at
+ * the same spot. Otherwise, if it's placed earlier in the list,
+ * l2arc_write_buffers() could find it during the function's
+ * write phase, and try to write it out to the l2arc.
+ */
+ list_insert_after(&dev->l2ad_buflist, hdr, nhdr);
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ mutex_exit(&dev->l2ad_mtx);
+
+ /*
+ * Since we're using the pointer address as the tag when
+ * incrementing and decrementing the l2ad_alloc refcount, we
+ * must remove the old pointer (that we're about to destroy) and
+ * add the new pointer to the refcount. Otherwise we'd remove
+ * the wrong pointer address when calling arc_hdr_destroy() later.
+ */
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+ arc_hdr_size(hdr), hdr);
+ (void) zfs_refcount_add_many(&dev->l2ad_alloc,
+ arc_hdr_size(nhdr), nhdr);
+
+ buf_discard_identity(hdr);
+ kmem_cache_free(old, hdr);
+
+ return (nhdr);
+}
+
+/*
+ * This function allows an L1 header to be reallocated as a crypt
+ * header and vice versa. If we are going to a crypt header, the
+ * new fields will be zeroed out.
+ */
+static arc_buf_hdr_t *
+arc_hdr_realloc_crypt(arc_buf_hdr_t *hdr, boolean_t need_crypt)
+{
+ arc_buf_hdr_t *nhdr;
+ arc_buf_t *buf;
+ kmem_cache_t *ncache, *ocache;
+ unsigned nsize, osize;
+
+ /*
+ * This function requires that hdr is in the arc_anon state.
+ * Therefore it won't have any L2ARC data for us to worry
+ * about copying.
+ */ + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT(!HDR_HAS_L2HDR(hdr)); + ASSERT3U(!!HDR_PROTECTED(hdr), !=, need_crypt); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node)); + ASSERT(!list_link_active(&hdr->b_l2hdr.b_l2node)); + ASSERT3P(hdr->b_hash_next, ==, NULL); + + if (need_crypt) { + ncache = hdr_full_crypt_cache; + nsize = sizeof (hdr->b_crypt_hdr); + ocache = hdr_full_cache; + osize = HDR_FULL_SIZE; + } else { + ncache = hdr_full_cache; + nsize = HDR_FULL_SIZE; + ocache = hdr_full_crypt_cache; + osize = sizeof (hdr->b_crypt_hdr); + } + + nhdr = kmem_cache_alloc(ncache, KM_PUSHPAGE); + + /* + * Copy all members that aren't locks or condvars to the new header. + * No lists are pointing to us (as we asserted above), so we don't + * need to worry about the list nodes. + */ + nhdr->b_dva = hdr->b_dva; + nhdr->b_birth = hdr->b_birth; + nhdr->b_type = hdr->b_type; + nhdr->b_flags = hdr->b_flags; + nhdr->b_psize = hdr->b_psize; + nhdr->b_lsize = hdr->b_lsize; + nhdr->b_spa = hdr->b_spa; + nhdr->b_l1hdr.b_freeze_cksum = hdr->b_l1hdr.b_freeze_cksum; + nhdr->b_l1hdr.b_bufcnt = hdr->b_l1hdr.b_bufcnt; + nhdr->b_l1hdr.b_byteswap = hdr->b_l1hdr.b_byteswap; + nhdr->b_l1hdr.b_state = hdr->b_l1hdr.b_state; + nhdr->b_l1hdr.b_arc_access = hdr->b_l1hdr.b_arc_access; + nhdr->b_l1hdr.b_mru_hits = hdr->b_l1hdr.b_mru_hits; + nhdr->b_l1hdr.b_mru_ghost_hits = hdr->b_l1hdr.b_mru_ghost_hits; + nhdr->b_l1hdr.b_mfu_hits = hdr->b_l1hdr.b_mfu_hits; + nhdr->b_l1hdr.b_mfu_ghost_hits = hdr->b_l1hdr.b_mfu_ghost_hits; + nhdr->b_l1hdr.b_l2_hits = hdr->b_l1hdr.b_l2_hits; + nhdr->b_l1hdr.b_acb = hdr->b_l1hdr.b_acb; + nhdr->b_l1hdr.b_pabd = hdr->b_l1hdr.b_pabd; + + /* + * This zfs_refcount_add() exists only to ensure that the individual + * arc buffers always point to a header that is referenced, avoiding + * a small race condition that could trigger ASSERTs. 
+ */ + (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, FTAG); + nhdr->b_l1hdr.b_buf = hdr->b_l1hdr.b_buf; + for (buf = nhdr->b_l1hdr.b_buf; buf != NULL; buf = buf->b_next) { + mutex_enter(&buf->b_evict_lock); + buf->b_hdr = nhdr; + mutex_exit(&buf->b_evict_lock); + } + + zfs_refcount_transfer(&nhdr->b_l1hdr.b_refcnt, &hdr->b_l1hdr.b_refcnt); + (void) zfs_refcount_remove(&nhdr->b_l1hdr.b_refcnt, FTAG); + ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); + + if (need_crypt) { + arc_hdr_set_flags(nhdr, ARC_FLAG_PROTECTED); + } else { + arc_hdr_clear_flags(nhdr, ARC_FLAG_PROTECTED); + } + + /* unset all members of the original hdr */ + bzero(&hdr->b_dva, sizeof (dva_t)); + hdr->b_birth = 0; + hdr->b_type = ARC_BUFC_INVALID; + hdr->b_flags = 0; + hdr->b_psize = 0; + hdr->b_lsize = 0; + hdr->b_spa = 0; + hdr->b_l1hdr.b_freeze_cksum = NULL; + hdr->b_l1hdr.b_buf = NULL; + hdr->b_l1hdr.b_bufcnt = 0; + hdr->b_l1hdr.b_byteswap = 0; + hdr->b_l1hdr.b_state = NULL; + hdr->b_l1hdr.b_arc_access = 0; + hdr->b_l1hdr.b_mru_hits = 0; + hdr->b_l1hdr.b_mru_ghost_hits = 0; + hdr->b_l1hdr.b_mfu_hits = 0; + hdr->b_l1hdr.b_mfu_ghost_hits = 0; + hdr->b_l1hdr.b_l2_hits = 0; + hdr->b_l1hdr.b_acb = NULL; + hdr->b_l1hdr.b_pabd = NULL; + + if (ocache == hdr_full_crypt_cache) { + ASSERT(!HDR_HAS_RABD(hdr)); + hdr->b_crypt_hdr.b_ot = DMU_OT_NONE; + hdr->b_crypt_hdr.b_ebufcnt = 0; + hdr->b_crypt_hdr.b_dsobj = 0; + bzero(hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + bzero(hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + bzero(hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + } + + buf_discard_identity(hdr); + kmem_cache_free(ocache, hdr); + + return (nhdr); +} + +/* + * This function is used by the send / receive code to convert a newly + * allocated arc_buf_t to one that is suitable for a raw encrypted write. It + * is also used to allow the root objset block to be updated without altering + * its embedded MACs. Both block types will always be uncompressed so we do not + * have to worry about compression type or psize. + */ +void +arc_convert_to_raw(arc_buf_t *buf, uint64_t dsobj, boolean_t byteorder, + dmu_object_type_t ot, const uint8_t *salt, const uint8_t *iv, + const uint8_t *mac) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(ot == DMU_OT_DNODE || ot == DMU_OT_OBJSET); + ASSERT(HDR_HAS_L1HDR(hdr)); + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + + buf->b_flags |= (ARC_BUF_FLAG_COMPRESSED | ARC_BUF_FLAG_ENCRYPTED); + if (!HDR_PROTECTED(hdr)) + hdr = arc_hdr_realloc_crypt(hdr, B_TRUE); + hdr->b_crypt_hdr.b_dsobj = dsobj; + hdr->b_crypt_hdr.b_ot = ot; + hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? + DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); + if (!arc_hdr_has_uncompressed_buf(hdr)) + arc_cksum_free(hdr); + + if (salt != NULL) + bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + if (iv != NULL) + bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + if (mac != NULL) + bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); +} + +/* + * Allocate a new arc_buf_hdr_t and arc_buf_t and return the buf to the caller. + * The buf is returned thawed since we expect the consumer to modify it. 
+ */ +arc_buf_t * +arc_alloc_buf(spa_t *spa, void *tag, arc_buf_contents_t type, int32_t size) +{ + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), size, size, + B_FALSE, ZIO_COMPRESS_OFF, 0, type, B_FALSE); + + arc_buf_t *buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, B_FALSE, + B_FALSE, B_FALSE, &buf)); + arc_buf_thaw(buf); + + return (buf); +} + +/* + * Allocate a compressed buf in the same manner as arc_alloc_buf. Don't use this + * for bufs containing metadata. + */ +arc_buf_t * +arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type, uint8_t complevel) +{ + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT3U(compression_type, >, ZIO_COMPRESS_OFF); + ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); + + arc_buf_hdr_t *hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + B_FALSE, compression_type, complevel, ARC_BUFC_DATA, B_FALSE); + + arc_buf_t *buf = NULL; + VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_FALSE, + B_TRUE, B_FALSE, B_FALSE, &buf)); + arc_buf_thaw(buf); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + + if (!arc_buf_is_shared(buf)) { + /* + * To ensure that the hdr has the correct data in it if we call + * arc_untransform() on this buf before it's been written to + * disk, it's easiest if we just set up sharing between the + * buf and the hdr. + */ + arc_hdr_free_abd(hdr, B_FALSE); + arc_share_buf(hdr, buf); + } + + return (buf); +} + +arc_buf_t * +arc_alloc_raw_buf(spa_t *spa, void *tag, uint64_t dsobj, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, + dmu_object_type_t ot, uint64_t psize, uint64_t lsize, + enum zio_compress compression_type, uint8_t complevel) +{ + arc_buf_hdr_t *hdr; + arc_buf_t *buf; + arc_buf_contents_t type = DMU_OT_IS_METADATA(ot) ? + ARC_BUFC_METADATA : ARC_BUFC_DATA; + + ASSERT3U(lsize, >, 0); + ASSERT3U(lsize, >=, psize); + ASSERT3U(compression_type, >=, ZIO_COMPRESS_OFF); + ASSERT3U(compression_type, <, ZIO_COMPRESS_FUNCTIONS); + + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, B_TRUE, + compression_type, complevel, type, B_TRUE); + + hdr->b_crypt_hdr.b_dsobj = dsobj; + hdr->b_crypt_hdr.b_ot = ot; + hdr->b_l1hdr.b_byteswap = (byteorder == ZFS_HOST_BYTEORDER) ? + DMU_BSWAP_NUMFUNCS : DMU_OT_BYTESWAP(ot); + bcopy(salt, hdr->b_crypt_hdr.b_salt, ZIO_DATA_SALT_LEN); + bcopy(iv, hdr->b_crypt_hdr.b_iv, ZIO_DATA_IV_LEN); + bcopy(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN); + + /* + * This buffer will be considered encrypted even if the ot is not an + * encrypted type. It will become authenticated instead in + * arc_write_ready(). 
+ */
+ buf = NULL;
+ VERIFY0(arc_buf_alloc_impl(hdr, spa, NULL, tag, B_TRUE, B_TRUE,
+ B_FALSE, B_FALSE, &buf));
+ arc_buf_thaw(buf);
+ ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL);
+
+ return (buf);
+}
+
+static void
+arc_hdr_l2hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ l2arc_buf_hdr_t *l2hdr = &hdr->b_l2hdr;
+ l2arc_dev_t *dev = l2hdr->b_dev;
+ uint64_t psize = HDR_GET_PSIZE(hdr);
+ uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, psize);
+
+ ASSERT(MUTEX_HELD(&dev->l2ad_mtx));
+ ASSERT(HDR_HAS_L2HDR(hdr));
+
+ list_remove(&dev->l2ad_buflist, hdr);
+
+ ARCSTAT_INCR(arcstat_l2_psize, -psize);
+ ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
+
+ vdev_space_update(dev->l2ad_vdev, -asize, 0, 0);
+
+ (void) zfs_refcount_remove_many(&dev->l2ad_alloc, arc_hdr_size(hdr),
+ hdr);
+ arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+}
+
+static void
+arc_hdr_destroy(arc_buf_hdr_t *hdr)
+{
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_buf == NULL ||
+ hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon);
+ }
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ l2arc_dev_t *dev = hdr->b_l2hdr.b_dev;
+ boolean_t buflist_held = MUTEX_HELD(&dev->l2ad_mtx);
+
+ if (!buflist_held)
+ mutex_enter(&dev->l2ad_mtx);
+
+ /*
+ * Even though we checked this conditional above, we
+ * need to check this again now that we have the
+ * l2ad_mtx. This is because we could be racing with
+ * another thread calling l2arc_evict() which might have
+ * destroyed this header's L2 portion as we were waiting
+ * to acquire the l2ad_mtx. If that happens, we don't
+ * want to re-destroy the header's L2 portion.
+ */
+ if (HDR_HAS_L2HDR(hdr))
+ arc_hdr_l2hdr_destroy(hdr);
+
+ if (!buflist_held)
+ mutex_exit(&dev->l2ad_mtx);
+ }
+
+ /*
+ * The header's identity can only be safely discarded once it is no
+ * longer discoverable. This requires removing it from the hash table
+ * and the l2arc header list. After this point the hash lock cannot
+ * be used to protect the header.
+ */
+ if (!HDR_EMPTY(hdr))
+ buf_discard_identity(hdr);
+
+ if (HDR_HAS_L1HDR(hdr)) {
+ arc_cksum_free(hdr);
+
+ while (hdr->b_l1hdr.b_buf != NULL)
+ arc_buf_destroy_impl(hdr->b_l1hdr.b_buf);
+
+ if (hdr->b_l1hdr.b_pabd != NULL)
+ arc_hdr_free_abd(hdr, B_FALSE);
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+
+ ASSERT3P(hdr->b_hash_next, ==, NULL);
+ if (HDR_HAS_L1HDR(hdr)) {
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+
+ if (!HDR_PROTECTED(hdr)) {
+ kmem_cache_free(hdr_full_cache, hdr);
+ } else {
+ kmem_cache_free(hdr_full_crypt_cache, hdr);
+ }
+ } else {
+ kmem_cache_free(hdr_l2only_cache, hdr);
+ }
+}
+
+void
+arc_buf_destroy(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ VERIFY0(remove_reference(hdr, NULL, tag));
+ arc_hdr_destroy(hdr);
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ ASSERT3P(hdr, ==, buf->b_hdr);
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+ ASSERT3P(hash_lock, ==, HDR_LOCK(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_state, !=, arc_anon);
+ ASSERT3P(buf->b_data, !=, NULL);
+
+ (void) remove_reference(hdr, hash_lock, tag);
+ arc_buf_destroy_impl(buf);
+ mutex_exit(hash_lock);
+}
+
+/*
+ * Evict the arc_buf_hdr that is provided as a parameter. The resultant
+ * state of the header is dependent on its state prior to entering this
+ * function. The following transitions are possible:
+ *
+ * - arc_mru -> arc_mru_ghost
+ * - arc_mfu -> arc_mfu_ghost
+ * - arc_mru_ghost -> arc_l2c_only
+ * - arc_mru_ghost -> deleted
+ * - arc_mfu_ghost -> arc_l2c_only
+ * - arc_mfu_ghost -> deleted
+ */
+static int64_t
+arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ arc_state_t *evicted_state, *state;
+ int64_t bytes_evicted = 0;
+ int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ?
+ arc_min_prescient_prefetch_ms : arc_min_prefetch_ms;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ state = hdr->b_l1hdr.b_state;
+ if (GHOST_STATE(state)) {
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL);
+
+ /*
+ * l2arc_write_buffers() relies on a header's L1 portion
+ * (i.e. its b_pabd field) during its write phase.
+ * Thus, we cannot push a header onto the arc_l2c_only
+ * state (removing its L1 piece) until the header is
+ * done being written to the l2arc.
+ */
+ if (HDR_HAS_L2HDR(hdr) && HDR_L2_WRITING(hdr)) {
+ ARCSTAT_BUMP(arcstat_evict_l2_skip);
+ return (bytes_evicted);
+ }
+
+ ARCSTAT_BUMP(arcstat_deleted);
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+
+ DTRACE_PROBE1(arc__delete, arc_buf_hdr_t *, hdr);
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ASSERT(hdr->b_l1hdr.b_pabd == NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ /*
+ * This buffer is cached on the 2nd Level ARC;
+ * don't destroy the header.
+ */
+ arc_change_state(arc_l2c_only, hdr, hash_lock);
+ /*
+ * dropping from L1+L2 cached to L2-only,
+ * realloc to remove the L1 header.
+ */
+ hdr = arc_hdr_realloc(hdr, hdr_full_cache,
+ hdr_l2only_cache);
+ } else {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ }
+ return (bytes_evicted);
+ }
+
+ ASSERT(state == arc_mru || state == arc_mfu);
+ evicted_state = (state == arc_mru) ? arc_mru_ghost : arc_mfu_ghost;
+
+ /* prefetch buffers have a minimum lifespan */
+ if (HDR_IO_IN_PROGRESS(hdr) ||
+ ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) &&
+ ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access <
+ MSEC_TO_TICK(min_lifetime))) {
+ ARCSTAT_BUMP(arcstat_evict_skip);
+ return (bytes_evicted);
+ }
+
+ ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt));
+ while (hdr->b_l1hdr.b_buf) {
+ arc_buf_t *buf = hdr->b_l1hdr.b_buf;
+ if (!mutex_tryenter(&buf->b_evict_lock)) {
+ ARCSTAT_BUMP(arcstat_mutex_miss);
+ break;
+ }
+ if (buf->b_data != NULL)
+ bytes_evicted += HDR_GET_LSIZE(hdr);
+ mutex_exit(&buf->b_evict_lock);
+ arc_buf_destroy_impl(buf);
+ }
+
+ if (HDR_HAS_L2HDR(hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_cached, HDR_GET_LSIZE(hdr));
+ } else {
+ if (l2arc_write_eligible(hdr->b_spa, hdr)) {
+ ARCSTAT_INCR(arcstat_evict_l2_eligible,
+ HDR_GET_LSIZE(hdr));
+ } else {
+ ARCSTAT_INCR(arcstat_evict_l2_ineligible,
+ HDR_GET_LSIZE(hdr));
+ }
+ }
+
+ if (hdr->b_l1hdr.b_bufcnt == 0) {
+ arc_cksum_free(hdr);
+
+ bytes_evicted += arc_hdr_size(hdr);
+
+ /*
+ * If this hdr is being evicted and has a compressed
+ * buffer then we discard it here before we change states.
+ * This ensures that the accounting is updated correctly
+ * in arc_free_data_impl().
+ */ + if (hdr->b_l1hdr.b_pabd != NULL) + arc_hdr_free_abd(hdr, B_FALSE); + + if (HDR_HAS_RABD(hdr)) + arc_hdr_free_abd(hdr, B_TRUE); + + arc_change_state(evicted_state, hdr, hash_lock); + ASSERT(HDR_IN_HASH_TABLE(hdr)); + arc_hdr_set_flags(hdr, ARC_FLAG_IN_HASH_TABLE); + DTRACE_PROBE1(arc__evict, arc_buf_hdr_t *, hdr); + } + + return (bytes_evicted); +} + +static void +arc_set_need_free(void) +{ + ASSERT(MUTEX_HELD(&arc_evict_lock)); + int64_t remaining = arc_free_memory() - arc_sys_free / 2; + arc_evict_waiter_t *aw = list_tail(&arc_evict_waiters); + if (aw == NULL) { + arc_need_free = MAX(-remaining, 0); + } else { + arc_need_free = + MAX(-remaining, (int64_t)(aw->aew_count - arc_evict_count)); + } +} + +static uint64_t +arc_evict_state_impl(multilist_t *ml, int idx, arc_buf_hdr_t *marker, + uint64_t spa, int64_t bytes) +{ + multilist_sublist_t *mls; + uint64_t bytes_evicted = 0; + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + int evict_count = 0; + + ASSERT3P(marker, !=, NULL); + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + mls = multilist_sublist_lock(ml, idx); + + for (hdr = multilist_sublist_prev(mls, marker); hdr != NULL; + hdr = multilist_sublist_prev(mls, marker)) { + if ((bytes != ARC_EVICT_ALL && bytes_evicted >= bytes) || + (evict_count >= zfs_arc_evict_batch_limit)) + break; + + /* + * To keep our iteration location, move the marker + * forward. Since we're not holding hdr's hash lock, we + * must be very careful and not remove 'hdr' from the + * sublist. Otherwise, other consumers might mistake the + * 'hdr' as not being on a sublist when they call the + * multilist_link_active() function (they all rely on + * the hash lock protecting concurrent insertions and + * removals). multilist_sublist_move_forward() was + * specifically implemented to ensure this is the case + * (only 'marker' will be removed and re-inserted). + */ + multilist_sublist_move_forward(mls, marker); + + /* + * The only case where the b_spa field should ever be + * zero, is the marker headers inserted by + * arc_evict_state(). It's possible for multiple threads + * to be calling arc_evict_state() concurrently (e.g. + * dsl_pool_close() and zio_inject_fault()), so we must + * skip any markers we see from these other threads. + */ + if (hdr->b_spa == 0) + continue; + + /* we're only interested in evicting buffers of a certain spa */ + if (spa != 0 && hdr->b_spa != spa) { + ARCSTAT_BUMP(arcstat_evict_skip); + continue; + } + + hash_lock = HDR_LOCK(hdr); + + /* + * We aren't calling this function from any code path + * that would already be holding a hash lock, so we're + * asserting on this assumption to be defensive in case + * this ever changes. Without this check, it would be + * possible to incorrectly increment arcstat_mutex_miss + * below (e.g. if the code changed such that we called + * this function with a hash lock held). + */ + ASSERT(!MUTEX_HELD(hash_lock)); + + if (mutex_tryenter(hash_lock)) { + uint64_t evicted = arc_evict_hdr(hdr, hash_lock); + mutex_exit(hash_lock); + + bytes_evicted += evicted; + + /* + * If evicted is zero, arc_evict_hdr() must have + * decided to skip this header, don't increment + * evict_count in this case. + */ + if (evicted != 0) + evict_count++; + + } else { + ARCSTAT_BUMP(arcstat_mutex_miss); + } + } + + multilist_sublist_unlock(mls); + + /* + * Increment the count of evicted bytes, and wake up any threads that + * are waiting for the count to reach this value. 
Since the list is + * ordered by ascending aew_count, we pop off the beginning of the + * list until we reach the end, or a waiter that's past the current + * "count". Doing this outside the loop reduces the number of times + * we need to acquire the global arc_evict_lock. + * + * Only wake when there's sufficient free memory in the system + * (specifically, arc_sys_free/2, which by default is a bit more than + * 1/64th of RAM). See the comments in arc_wait_for_eviction(). + */ + mutex_enter(&arc_evict_lock); + arc_evict_count += bytes_evicted; + + if ((int64_t)(arc_free_memory() - arc_sys_free / 2) > 0) { + arc_evict_waiter_t *aw; + while ((aw = list_head(&arc_evict_waiters)) != NULL && + aw->aew_count <= arc_evict_count) { + list_remove(&arc_evict_waiters, aw); + cv_broadcast(&aw->aew_cv); + } + } + arc_set_need_free(); + mutex_exit(&arc_evict_lock); + + /* + * If the ARC size is reduced from arc_c_max to arc_c_min (especially + * if the average cached block is small), eviction can be on-CPU for + * many seconds. To ensure that other threads that may be bound to + * this CPU are able to make progress, make a voluntary preemption + * call here. + */ + cond_resched(); + + return (bytes_evicted); +} + +/* + * Evict buffers from the given arc state, until we've removed the + * specified number of bytes. Move the removed buffers to the + * appropriate evict state. + * + * This function makes a "best effort". It skips over any buffers + * it can't get a hash_lock on, and so, may not catch all candidates. + * It may also return without evicting as much space as requested. + * + * If bytes is specified using the special value ARC_EVICT_ALL, this + * will evict all available (i.e. unlocked and evictable) buffers from + * the given arc state; which is used by arc_flush(). + */ +static uint64_t +arc_evict_state(arc_state_t *state, uint64_t spa, int64_t bytes, + arc_buf_contents_t type) +{ + uint64_t total_evicted = 0; + multilist_t *ml = state->arcs_list[type]; + int num_sublists; + arc_buf_hdr_t **markers; + + IMPLY(bytes < 0, bytes == ARC_EVICT_ALL); + + num_sublists = multilist_get_num_sublists(ml); + + /* + * If we've tried to evict from each sublist, made some + * progress, but still have not hit the target number of bytes + * to evict, we want to keep trying. The markers allow us to + * pick up where we left off for each individual sublist, rather + * than starting from the tail each time. + */ + markers = kmem_zalloc(sizeof (*markers) * num_sublists, KM_SLEEP); + for (int i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls; + + markers[i] = kmem_cache_alloc(hdr_full_cache, KM_SLEEP); + + /* + * A b_spa of 0 is used to indicate that this header is + * a marker. This fact is used in arc_evict_type() and + * arc_evict_state_impl(). + */ + markers[i]->b_spa = 0; + + mls = multilist_sublist_lock(ml, i); + multilist_sublist_insert_tail(mls, markers[i]); + multilist_sublist_unlock(mls); + } + + /* + * While we haven't hit our target number of bytes to evict, or + * we're evicting all available buffers. + */ + while (total_evicted < bytes || bytes == ARC_EVICT_ALL) { + int sublist_idx = multilist_get_random_index(ml); + uint64_t scan_evicted = 0; + + /* + * Try to reduce pinned dnodes with a floor of arc_dnode_limit. + * Request that 10% of the LRUs be scanned by the superblock + * shrinker. 
+ */ + if (type == ARC_BUFC_DATA && aggsum_compare(&astat_dnode_size, + arc_dnode_size_limit) > 0) { + arc_prune_async((aggsum_upper_bound(&astat_dnode_size) - + arc_dnode_size_limit) / sizeof (dnode_t) / + zfs_arc_dnode_reduce_percent); + } + + /* + * Start eviction using a randomly selected sublist, + * this is to try and evenly balance eviction across all + * sublists. Always starting at the same sublist + * (e.g. index 0) would cause evictions to favor certain + * sublists over others. + */ + for (int i = 0; i < num_sublists; i++) { + uint64_t bytes_remaining; + uint64_t bytes_evicted; + + if (bytes == ARC_EVICT_ALL) + bytes_remaining = ARC_EVICT_ALL; + else if (total_evicted < bytes) + bytes_remaining = bytes - total_evicted; + else + break; + + bytes_evicted = arc_evict_state_impl(ml, sublist_idx, + markers[sublist_idx], spa, bytes_remaining); + + scan_evicted += bytes_evicted; + total_evicted += bytes_evicted; + + /* we've reached the end, wrap to the beginning */ + if (++sublist_idx >= num_sublists) + sublist_idx = 0; + } + + /* + * If we didn't evict anything during this scan, we have + * no reason to believe we'll evict more during another + * scan, so break the loop. + */ + if (scan_evicted == 0) { + /* This isn't possible, let's make that obvious */ + ASSERT3S(bytes, !=, 0); + + /* + * When bytes is ARC_EVICT_ALL, the only way to + * break the loop is when scan_evicted is zero. + * In that case, we actually have evicted enough, + * so we don't want to increment the kstat. + */ + if (bytes != ARC_EVICT_ALL) { + ASSERT3S(total_evicted, <, bytes); + ARCSTAT_BUMP(arcstat_evict_not_enough); + } + + break; + } + } + + for (int i = 0; i < num_sublists; i++) { + multilist_sublist_t *mls = multilist_sublist_lock(ml, i); + multilist_sublist_remove(mls, markers[i]); + multilist_sublist_unlock(mls); + + kmem_cache_free(hdr_full_cache, markers[i]); + } + kmem_free(markers, sizeof (*markers) * num_sublists); + + return (total_evicted); +} + +/* + * Flush all "evictable" data of the given type from the arc state + * specified. This will not evict any "active" buffers (i.e. referenced). + * + * When 'retry' is set to B_FALSE, the function will make a single pass + * over the state and evict any buffers that it can. Since it doesn't + * continually retry the eviction, it might end up leaving some buffers + * in the ARC due to lock misses. + * + * When 'retry' is set to B_TRUE, the function will continually retry the + * eviction until *all* evictable buffers have been removed from the + * state. As a result, if concurrent insertions into the state are + * allowed (e.g. if the ARC isn't shutting down), this function might + * wind up in an infinite loop, continually trying to evict buffers. + */ +static uint64_t +arc_flush_state(arc_state_t *state, uint64_t spa, arc_buf_contents_t type, + boolean_t retry) +{ + uint64_t evicted = 0; + + while (zfs_refcount_count(&state->arcs_esize[type]) != 0) { + evicted += arc_evict_state(state, spa, ARC_EVICT_ALL, type); + + if (!retry) + break; + } + + return (evicted); +} + +/* + * Evict the specified number of bytes from the state specified, + * restricting eviction to the spa and type given. This function + * prevents us from trying to evict more from a state's list than + * is "evictable", and to skip evicting altogether when passed a + * negative value for "bytes". In contrast, arc_evict_state() will + * evict everything it can, when passed a negative value for "bytes". 
+ */
+static uint64_t
+arc_evict_impl(arc_state_t *state, uint64_t spa, int64_t bytes,
+ arc_buf_contents_t type)
+{
+ int64_t delta;
+
+ if (bytes > 0 && zfs_refcount_count(&state->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&state->arcs_esize[type]),
+ bytes);
+ return (arc_evict_state(state, spa, delta, type));
+ }
+
+ return (0);
+}
+
+/*
+ * The goal of this function is to evict enough meta data buffers from the
+ * ARC in order to enforce the arc_meta_limit. Achieving this is slightly
+ * more complicated than it appears because it is common for data buffers
+ * to have holds on meta data buffers. In addition, dnode meta data buffers
+ * will be held by the dnodes in the block preventing them from being freed.
+ * This means we can't simply traverse the ARC and expect to always find
+ * enough unheld meta data buffers to release.
+ *
+ * Therefore, this function has been updated to make alternating passes
+ * over the ARC releasing data buffers and then newly unheld meta data
+ * buffers. This ensures forward progress is maintained and meta_used
+ * will decrease. Normally this is sufficient, but if required the ARC
+ * will call the registered prune callbacks causing dentries and inodes to
+ * be dropped from the VFS cache. This will make dnode meta data buffers
+ * available for reclaim.
+ */
+static uint64_t
+arc_evict_meta_balanced(uint64_t meta_used)
+{
+ int64_t delta, prune = 0, adjustmnt;
+ uint64_t total_evicted = 0;
+ arc_buf_contents_t type = ARC_BUFC_DATA;
+ int restarts = MAX(zfs_arc_meta_adjust_restarts, 0);
+
+restart:
+ /*
+ * This differs slightly from the way we evict from the mru in
+ * arc_evict because we don't have a "target" value (i.e. no
+ * "meta" arc_p). As a result, I think we can completely
+ * cannibalize the metadata in the MRU before we evict the
+ * metadata from the MFU. I think we probably need to implement a
+ * "metadata arc_p" value to do this properly.
+ */
+ adjustmnt = meta_used - arc_meta_limit;
+
+ if (adjustmnt > 0 &&
+ zfs_refcount_count(&arc_mru->arcs_esize[type]) > 0) {
+ delta = MIN(zfs_refcount_count(&arc_mru->arcs_esize[type]),
+ adjustmnt);
+ total_evicted += arc_evict_impl(arc_mru, 0, delta, type);
+ adjustmnt -= delta;
+ }
+
+ /*
+ * We can't afford to recalculate adjustmnt here. If we do,
+ * new metadata buffers can sneak into the MRU or ANON lists,
+ * thus penalizing the MFU metadata. Although the fudge factor is
+ * small, it has been empirically shown to be significant for
+ * certain workloads (e.g. creating many empty directories). As
+ * such, we use the original calculation for adjustmnt, and
+ * simply decrement the amount of data evicted from the MRU.
+ */ + + if (adjustmnt > 0 && + zfs_refcount_count(&arc_mfu->arcs_esize[type]) > 0) { + delta = MIN(zfs_refcount_count(&arc_mfu->arcs_esize[type]), + adjustmnt); + total_evicted += arc_evict_impl(arc_mfu, 0, delta, type); + } + + adjustmnt = meta_used - arc_meta_limit; + + if (adjustmnt > 0 && + zfs_refcount_count(&arc_mru_ghost->arcs_esize[type]) > 0) { + delta = MIN(adjustmnt, + zfs_refcount_count(&arc_mru_ghost->arcs_esize[type])); + total_evicted += arc_evict_impl(arc_mru_ghost, 0, delta, type); + adjustmnt -= delta; + } + + if (adjustmnt > 0 && + zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type]) > 0) { + delta = MIN(adjustmnt, + zfs_refcount_count(&arc_mfu_ghost->arcs_esize[type])); + total_evicted += arc_evict_impl(arc_mfu_ghost, 0, delta, type); + } + + /* + * If after attempting to make the requested adjustment to the ARC + * the meta limit is still being exceeded then request that the + * higher layers drop some cached objects which have holds on ARC + * meta buffers. Requests to the upper layers will be made with + * increasingly large scan sizes until the ARC is below the limit. + */ + if (meta_used > arc_meta_limit) { + if (type == ARC_BUFC_DATA) { + type = ARC_BUFC_METADATA; + } else { + type = ARC_BUFC_DATA; + + if (zfs_arc_meta_prune) { + prune += zfs_arc_meta_prune; + arc_prune_async(prune); + } + } + + if (restarts > 0) { + restarts--; + goto restart; + } + } + return (total_evicted); +} + +/* + * Evict metadata buffers from the cache, such that arc_meta_used is + * capped by the arc_meta_limit tunable. + */ +static uint64_t +arc_evict_meta_only(uint64_t meta_used) +{ + uint64_t total_evicted = 0; + int64_t target; + + /* + * If we're over the meta limit, we want to evict enough + * metadata to get back under the meta limit. We don't want to + * evict so much that we drop the MRU below arc_p, though. If + * we're over the meta limit more than we're over arc_p, we + * evict some from the MRU here, and some from the MFU below. + */ + target = MIN((int64_t)(meta_used - arc_meta_limit), + (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) + + zfs_refcount_count(&arc_mru->arcs_size) - arc_p)); + + total_evicted += arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA); + + /* + * Similar to the above, we want to evict enough bytes to get us + * below the meta limit, but not so much as to drop us below the + * space allotted to the MFU (which is defined as arc_c - arc_p). + */ + target = MIN((int64_t)(meta_used - arc_meta_limit), + (int64_t)(zfs_refcount_count(&arc_mfu->arcs_size) - + (arc_c - arc_p))); + + total_evicted += arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA); + + return (total_evicted); +} + +static uint64_t +arc_evict_meta(uint64_t meta_used) +{ + if (zfs_arc_meta_strategy == ARC_STRATEGY_META_ONLY) + return (arc_evict_meta_only(meta_used)); + else + return (arc_evict_meta_balanced(meta_used)); +} + +/* + * Return the type of the oldest buffer in the given arc state + * + * This function will select a random sublist of type ARC_BUFC_DATA and + * a random sublist of type ARC_BUFC_METADATA. The tail of each sublist + * is compared, and the type which contains the "older" buffer will be + * returned. 
+ */
+static arc_buf_contents_t
+arc_evict_type(arc_state_t *state)
+{
+ multilist_t *data_ml = state->arcs_list[ARC_BUFC_DATA];
+ multilist_t *meta_ml = state->arcs_list[ARC_BUFC_METADATA];
+ int data_idx = multilist_get_random_index(data_ml);
+ int meta_idx = multilist_get_random_index(meta_ml);
+ multilist_sublist_t *data_mls;
+ multilist_sublist_t *meta_mls;
+ arc_buf_contents_t type;
+ arc_buf_hdr_t *data_hdr;
+ arc_buf_hdr_t *meta_hdr;
+
+ /*
+ * We keep the sublist lock until we're finished, to prevent
+ * the headers from being destroyed via arc_evict_state().
+ */
+ data_mls = multilist_sublist_lock(data_ml, data_idx);
+ meta_mls = multilist_sublist_lock(meta_ml, meta_idx);
+
+ /*
+ * These two loops are to ensure we skip any markers that
+ * might be at the tail of the lists due to arc_evict_state().
+ */
+
+ for (data_hdr = multilist_sublist_tail(data_mls); data_hdr != NULL;
+ data_hdr = multilist_sublist_prev(data_mls, data_hdr)) {
+ if (data_hdr->b_spa != 0)
+ break;
+ }
+
+ for (meta_hdr = multilist_sublist_tail(meta_mls); meta_hdr != NULL;
+ meta_hdr = multilist_sublist_prev(meta_mls, meta_hdr)) {
+ if (meta_hdr->b_spa != 0)
+ break;
+ }
+
+ if (data_hdr == NULL && meta_hdr == NULL) {
+ type = ARC_BUFC_DATA;
+ } else if (data_hdr == NULL) {
+ ASSERT3P(meta_hdr, !=, NULL);
+ type = ARC_BUFC_METADATA;
+ } else if (meta_hdr == NULL) {
+ ASSERT3P(data_hdr, !=, NULL);
+ type = ARC_BUFC_DATA;
+ } else {
+ ASSERT3P(data_hdr, !=, NULL);
+ ASSERT3P(meta_hdr, !=, NULL);
+
+ /* The headers can't be on the sublist without an L1 header */
+ ASSERT(HDR_HAS_L1HDR(data_hdr));
+ ASSERT(HDR_HAS_L1HDR(meta_hdr));
+
+ if (data_hdr->b_l1hdr.b_arc_access <
+ meta_hdr->b_l1hdr.b_arc_access) {
+ type = ARC_BUFC_DATA;
+ } else {
+ type = ARC_BUFC_METADATA;
+ }
+ }
+
+ multilist_sublist_unlock(meta_mls);
+ multilist_sublist_unlock(data_mls);
+
+ return (type);
+}
+
+/*
+ * Evict buffers from the cache, such that arc_size is capped by arc_c.
+ */
+static uint64_t
+arc_evict(void)
+{
+ uint64_t total_evicted = 0;
+ uint64_t bytes;
+ int64_t target;
+ uint64_t asize = aggsum_value(&arc_size);
+ uint64_t ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * If we're over arc_meta_limit, we want to correct that before
+ * potentially evicting data buffers below.
+ */
+ total_evicted += arc_evict_meta(ameta);
+
+ /*
+ * Adjust MRU size
+ *
+ * If we're over the target cache size, we want to evict enough
+ * from the list to get back to our target size. We don't want
+ * to evict too much from the MRU, such that it drops below
+ * arc_p. So, if we're over our target cache size more than
+ * the MRU is over arc_p, we'll evict enough to get back to
+ * arc_p here, and then evict more from the MFU below.
+ */
+ target = MIN((int64_t)(asize - arc_c),
+ (int64_t)(zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) + ameta - arc_p));
+
+ /*
+ * If we're below arc_meta_min, always prefer to evict data.
+ * Otherwise, try to satisfy the requested number of bytes to
+ * evict from the type which contains older buffers, in an
+ * effort to keep newer buffers in the cache regardless of their
+ * type. If we cannot satisfy the number of bytes from this
+ * type, spill over into the next type.
+ */
+ if (arc_evict_type(arc_mru) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_evict_impl(arc_mru, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Re-sum ARC stats after the first round of evictions.
+ */
+ asize = aggsum_value(&arc_size);
+ ameta = aggsum_value(&arc_meta_used);
+
+ /*
+ * Adjust MFU size
+ *
+ * Now that we've tried to evict enough from the MRU to get its
+ * size back to arc_p, if we're still above the target cache
+ * size, we evict the rest from the MFU.
+ */
+ target = asize - arc_c;
+
+ if (arc_evict_type(arc_mfu) == ARC_BUFC_METADATA &&
+ ameta > arc_meta_min) {
+ bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * metadata, we try to get the rest from data.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ } else {
+ bytes = arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ /*
+ * If we couldn't evict our target number of bytes from
+ * data, we try to get the rest from metadata.
+ */
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu, 0, target, ARC_BUFC_METADATA);
+ }
+
+ /*
+ * Adjust ghost lists
+ *
+ * In addition to the above, the ARC also defines target values
+ * for the ghost lists. The sum of the mru list and mru ghost
+ * list should never exceed the target size of the cache, and
+ * the sum of the mru list, mfu list, mru ghost list, and mfu
+ * ghost list should never exceed twice the target size of the
+ * cache. The following logic enforces these limits on the ghost
+ * caches, and evicts from them as needed.
+ */
+ target = zfs_refcount_count(&arc_mru->arcs_size) +
+ zfs_refcount_count(&arc_mru_ghost->arcs_size) - arc_c;
+
+ bytes = arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mru_ghost, 0, target, ARC_BUFC_METADATA);
+
+ /*
+ * We assume the sum of the mru list and mfu list is less than
+ * or equal to arc_c (we enforced this above), which means we
+ * can use the simpler of the two equations below:
+ *
+ * mru + mfu + mru ghost + mfu ghost <= 2 * arc_c
+ * mru ghost + mfu ghost <= arc_c
+ */
+ target = zfs_refcount_count(&arc_mru_ghost->arcs_size) +
+ zfs_refcount_count(&arc_mfu_ghost->arcs_size) - arc_c;
+
+ bytes = arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_DATA);
+ total_evicted += bytes;
+
+ target -= bytes;
+
+ total_evicted +=
+ arc_evict_impl(arc_mfu_ghost, 0, target, ARC_BUFC_METADATA);
+
+ return (total_evicted);
+}
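To make the two-pass target arithmetic above concrete, here is a worked example with assumed numbers (illustrative only, not taken from the import):

/*
 * Assume arc_c = 8 GiB, asize = 9 GiB, arc_p = 3 GiB, and
 * anon + mru + ameta = 4 GiB. The MRU pass targets
 *	MIN(asize - arc_c, anon + mru + ameta - arc_p)
 *	    = MIN(1 GiB, 1 GiB) = 1 GiB.
 * If that pass frees only 256 MiB, asize is re-summed and the MFU
 * pass targets the remaining (asize - arc_c). The ghost lists are
 * then trimmed so that mru + mru_ghost <= arc_c and
 * mru_ghost + mfu_ghost <= arc_c.
 */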
+void
+arc_flush(spa_t *spa, boolean_t retry)
+{
+ uint64_t guid = 0;
+
+ /*
+ * If retry is B_TRUE, a spa must not be specified since we have
+ * no good way to determine if all of a spa's buffers have been
+ * evicted from an arc state.
+ */
+ ASSERT(!retry || spa == 0);
+
+ if (spa != NULL)
+ guid = spa_load_guid(spa);
+
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mru_ghost, guid, ARC_BUFC_METADATA, retry);
+
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_DATA, retry);
+ (void) arc_flush_state(arc_mfu_ghost, guid, ARC_BUFC_METADATA, retry);
+}
+
+void
+arc_reduce_target_size(int64_t to_free)
+{
+ uint64_t asize = aggsum_value(&arc_size);
+
+ /*
+ * All callers want the ARC to actually evict (at least) this much
+ * memory. Therefore we reduce from the lower of the current size and
+ * the target size. This way, even if arc_c is much higher than
+ * arc_size (as can be the case after many calls to arc_freed()), we
+ * will immediately have arc_c < arc_size and therefore the
+ * arc_evict_zthr will evict.
+ */
+ uint64_t c = MIN(arc_c, asize);
+
+ if (c > to_free && c - to_free > arc_c_min) {
+ arc_c = c - to_free;
+ atomic_add_64(&arc_p, -(arc_p >> arc_shrink_shift));
+ if (arc_p > arc_c)
+ arc_p = (arc_c >> 1);
+ ASSERT(arc_c >= arc_c_min);
+ ASSERT((int64_t)arc_p >= 0);
+ } else {
+ arc_c = arc_c_min;
+ }
+
+ if (asize > arc_c) {
+ /* See comment in arc_evict_cb_check() on why lock+flag */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = B_TRUE;
+ mutex_exit(&arc_evict_lock);
+ zthr_wakeup(arc_evict_zthr);
+ }
+}
+
+/*
+ * Determine if the system is under memory pressure and is asking
+ * to reclaim memory. A return value of B_TRUE indicates that the system
+ * is under memory pressure and that the arc should adjust accordingly.
+ */
+boolean_t
+arc_reclaim_needed(void)
+{
+ return (arc_available_memory() < 0);
+}
+
+void
+arc_kmem_reap_soon(void)
+{
+ size_t i;
+ kmem_cache_t *prev_cache = NULL;
+ kmem_cache_t *prev_data_cache = NULL;
+ extern kmem_cache_t *zio_buf_cache[];
+ extern kmem_cache_t *zio_data_buf_cache[];
+
+#ifdef _KERNEL
+ if ((aggsum_compare(&arc_meta_used, arc_meta_limit) >= 0) &&
+ zfs_arc_meta_prune) {
+ /*
+ * We are exceeding our meta-data cache limit.
+ * Prune some entries to release holds on meta-data.
+ */
+ arc_prune_async(zfs_arc_meta_prune);
+ }
+#if defined(_ILP32)
+ /*
+ * Reclaim unused memory from all kmem caches.
+ */
+ kmem_reap();
+#endif
+#endif
+
+ for (i = 0; i < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; i++) {
+#if defined(_ILP32)
+ /* reach upper limit of cache size on 32-bit */
+ if (zio_buf_cache[i] == NULL)
+ break;
+#endif
+ if (zio_buf_cache[i] != prev_cache) {
+ prev_cache = zio_buf_cache[i];
+ kmem_cache_reap_now(zio_buf_cache[i]);
+ }
+ if (zio_data_buf_cache[i] != prev_data_cache) {
+ prev_data_cache = zio_data_buf_cache[i];
+ kmem_cache_reap_now(zio_data_buf_cache[i]);
+ }
+ }
+ kmem_cache_reap_now(buf_cache);
+ kmem_cache_reap_now(hdr_full_cache);
+ kmem_cache_reap_now(hdr_l2only_cache);
+ kmem_cache_reap_now(zfs_btree_leaf_cache);
+ abd_cache_reap_now();
+}
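For orientation, a sketch of how the check/work pairs below are assumed to be wired to their zthrs (the actual creation happens in arc_init(), elsewhere in this file; treat the exact calls as an assumption):

/* Assumed wiring: a zthr runs the work function when its check passes. */
arc_evict_zthr = zthr_create(arc_evict_cb_check, arc_evict_cb, NULL);
arc_reap_zthr = zthr_create_timer(arc_reap_cb_check, arc_reap_cb,
    NULL, SEC2NSEC(1));

The check functions are deliberately cheap; the zthr framework calls them under its own zthr_lock to decide whether the work function should run.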
+ */
+ arc_tuning_update(B_FALSE);
+
+ /*
+ * This is necessary in order to keep the kstat information
+ * up to date for tools that display kstat data such as the
+ * mdb ::arc dcmd and the Linux crash utility. These tools
+ * typically do not call kstat's update function, but simply
+ * dump out stats from the most recent update. Without
+ * this call, these commands may show stale stats for the
+ * anon, mru, mru_ghost, mfu, and mfu_ghost lists. Even
+ * with this change, the data might be up to 1 second
+ * out of date (the arc_evict_zthr has a maximum sleep
+ * time of 1 second); but that should suffice. The
+ * arc_state_t structures can be queried directly if more
+ * accurate information is needed.
+ */
+ if (arc_ksp != NULL)
+ arc_ksp->ks_update(arc_ksp, KSTAT_READ);
+
+ /*
+ * We have to rely on arc_wait_for_eviction() to tell us when to
+ * evict, rather than checking if we are overflowing here, so that we
+ * are sure to not leave arc_wait_for_eviction() waiting on aew_cv.
+ * If we have become "not overflowing" since arc_wait_for_eviction()
+ * checked, we need to wake it up. We could broadcast the CV here,
+ * but arc_wait_for_eviction() may have not yet gone to sleep. We
+ * would need to use a mutex to ensure that this function doesn't
+ * broadcast until arc_wait_for_eviction() has gone to sleep (e.g.
+ * the arc_evict_lock). However, the lock ordering of such a lock
+ * would necessarily be incorrect with respect to the zthr_lock,
+ * which is held before this function is called, and is held by
+ * arc_wait_for_eviction() when it calls zthr_wakeup().
+ */
+ return (arc_evict_needed);
+}
+
+/*
+ * Keep arc_size under arc_c by running arc_evict which evicts data
+ * from the ARC.
+ */
+/* ARGSUSED */
+static void
+arc_evict_cb(void *arg, zthr_t *zthr)
+{
+ uint64_t evicted = 0;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ /* Evict from cache */
+ evicted = arc_evict();
+
+ /*
+ * If evicted is zero, we couldn't evict anything
+ * via arc_evict(). This could be due to hash lock
+ * collisions, but more likely due to the majority of
+ * arc buffers being unevictable. Therefore, even if
+ * arc_size is above arc_c, another pass is unlikely to
+ * be helpful and could potentially cause us to enter an
+ * infinite loop. Additionally, zthr_iscancelled() is
+ * checked here so that if the arc is shutting down, the
+ * broadcast will wake any remaining arc evict waiters.
+ */
+ mutex_enter(&arc_evict_lock);
+ arc_evict_needed = !zthr_iscancelled(arc_evict_zthr) &&
+ evicted > 0 && aggsum_compare(&arc_size, arc_c) > 0;
+ if (!arc_evict_needed) {
+ /*
+ * We're either no longer overflowing, or we
+ * can't evict anything more, so we should wake
+ * arc_get_data_impl() sooner.
+ */
+ arc_evict_waiter_t *aw;
+ while ((aw = list_remove_head(&arc_evict_waiters)) != NULL) {
+ cv_broadcast(&aw->aew_cv);
+ }
+ arc_set_need_free();
+ }
+ mutex_exit(&arc_evict_lock);
+ spl_fstrans_unmark(cookie);
+}
+
+/* ARGSUSED */
+static boolean_t
+arc_reap_cb_check(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory = arc_available_memory();
+
+ /*
+ * If a kmem reap is already active, don't schedule more. We must
+ * check for this because kmem_cache_reap_soon() won't actually
+ * block on the cache being reaped (this is to prevent callers from
+ * becoming implicitly blocked by a system-wide kmem reap -- which,
+ * on a system with many, many full magazines, can take minutes).
+ */
+ if (!kmem_cache_reap_active() && free_memory < 0) {
+
+ arc_no_grow = B_TRUE;
+ arc_warm = B_TRUE;
+ /*
+ * Wait at least zfs_arc_grow_retry (default 5) seconds
+ * before considering growing.
+ */
+ arc_growtime = gethrtime() + SEC2NSEC(arc_grow_retry);
+ return (B_TRUE);
+ } else if (free_memory < arc_c >> arc_no_grow_shift) {
+ arc_no_grow = B_TRUE;
+ } else if (gethrtime() >= arc_growtime) {
+ arc_no_grow = B_FALSE;
+ }
+
+ return (B_FALSE);
+}
+
+/*
+ * Keep enough free memory in the system by reaping the ARC's kmem
+ * caches. To cause more slabs to be reapable, we may reduce the
+ * target size of the cache (arc_c), causing the arc_evict_cb()
+ * to free more buffers.
+ */
+/* ARGSUSED */
+static void
+arc_reap_cb(void *arg, zthr_t *zthr)
+{
+ int64_t free_memory;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ /*
+ * Kick off asynchronous kmem_reap()'s of all our caches.
+ */
+ arc_kmem_reap_soon();
+
+ /*
+ * Wait at least arc_kmem_cache_reap_retry_ms between
+ * arc_kmem_reap_soon() calls. Without this check it is possible to
+ * end up in a situation where we spend lots of time reaping
+ * caches, while we're near arc_c_min. Waiting here also gives the
+ * subsequent free memory check a chance of finding that the
+ * asynchronous reap has already freed enough memory, and we don't
+ * need to call arc_reduce_target_size().
+ */
+ delay((hz * arc_kmem_cache_reap_retry_ms + 999) / 1000);
+
+ /*
+ * Reduce the target size as needed to maintain the amount of free
+ * memory in the system at a fraction of the arc_size (1/128th by
+ * default). If oversubscribed (free_memory < 0) then reduce the
+ * target arc_size by the deficit amount plus the fractional
+ * amount. If free memory is positive but less than the fractional
+ * amount, reduce by what is needed to hit the fractional amount.
+ */
+ free_memory = arc_available_memory();
+
+ int64_t to_free =
+ (arc_c >> arc_shrink_shift) - free_memory;
+ if (to_free > 0) {
+ arc_reduce_target_size(to_free);
+ }
+ spl_fstrans_unmark(cookie);
+}
+
+#ifdef _KERNEL
+/*
+ * Determine the amount of memory eligible for eviction contained in the
+ * ARC. All clean data reported by the ghost lists can always be safely
+ * evicted. Due to arc_c_min, the same does not hold for all clean data
+ * contained by the regular mru and mfu lists.
+ *
+ * In the case of the regular mru and mfu lists, we need to report as
+ * much clean data as possible, such that evicting that same reported
+ * data will not bring arc_size below arc_c_min. Thus, in certain
+ * circumstances, the total amount of clean data in the mru and mfu
+ * lists might not actually be evictable.
+ *
+ * The following two distinct cases are accounted for:
+ *
+ * 1. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is greater than or equal to arc_c_min.
+ * (i.e. amount of dirty data >= arc_c_min)
+ *
+ * This is the easy case; all clean data contained by the mru and mfu
+ * lists is evictable. Evicting all clean data can only drop arc_size
+ * to the amount of dirty data, which is greater than arc_c_min.
+ *
+ * 2. The sum of the amount of dirty data contained by both the mru and
+ * mfu lists, plus the ARC's other accounting (e.g. the anon list),
+ * is less than arc_c_min.
+ * (i.e. arc_c_min > amount of dirty data)
+ *
+ * 2.1. arc_size is greater than or equal to arc_c_min.
+ * (i.e.
arc_size >= arc_c_min > amount of dirty data) + * + * In this case, not all clean data from the regular mru and mfu + * lists is actually evictable; we must leave enough clean data + * to keep arc_size above arc_c_min. Thus, the maximum amount of + * evictable data from the two lists combined, is exactly the + * difference between arc_size and arc_c_min. + * + * 2.2. arc_size is less than arc_c_min + * (i.e. arc_c_min > arc_size > amount of dirty data) + * + * In this case, none of the data contained in the mru and mfu + * lists is evictable, even if it's clean. Since arc_size is + * already below arc_c_min, evicting any more would only + * increase this negative difference. + */ + +#endif /* _KERNEL */ + +/* + * Adapt arc info given the number of bytes we are trying to add and + * the state that we are coming from. This function is only called + * when we are adding new content to the cache. + */ +static void +arc_adapt(int bytes, arc_state_t *state) +{ + int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); + int64_t mrug_size = zfs_refcount_count(&arc_mru_ghost->arcs_size); + int64_t mfug_size = zfs_refcount_count(&arc_mfu_ghost->arcs_size); + + if (state == arc_l2c_only) + return; + + ASSERT(bytes > 0); + /* + * Adapt the target size of the MRU list: + * - if we just hit in the MRU ghost list, then increase + * the target size of the MRU list. + * - if we just hit in the MFU ghost list, then increase + * the target size of the MFU list by decreasing the + * target size of the MRU list. + */ + if (state == arc_mru_ghost) { + mult = (mrug_size >= mfug_size) ? 1 : (mfug_size / mrug_size); + if (!zfs_arc_p_dampener_disable) + mult = MIN(mult, 10); /* avoid wild arc_p adjustment */ + + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); + } else if (state == arc_mfu_ghost) { + uint64_t delta; + + mult = (mfug_size >= mrug_size) ? 1 : (mrug_size / mfug_size); + if (!zfs_arc_p_dampener_disable) + mult = MIN(mult, 10); + + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); + } + ASSERT((int64_t)arc_p >= 0); + + /* + * Wake reap thread if we do not have any available memory + */ + if (arc_reclaim_needed()) { + zthr_wakeup(arc_reap_zthr); + return; + } + + if (arc_no_grow) + return; + + if (arc_c >= arc_c_max) + return; + + /* + * If we're within (2 * maxblocksize) bytes of the target + * cache size, increment the target cache size + */ + ASSERT3U(arc_c, >=, 2ULL << SPA_MAXBLOCKSHIFT); + if (aggsum_upper_bound(&arc_size) >= + arc_c - (2ULL << SPA_MAXBLOCKSHIFT)) { + atomic_add_64(&arc_c, (int64_t)bytes); + if (arc_c > arc_c_max) + arc_c = arc_c_max; + else if (state == arc_anon) + atomic_add_64(&arc_p, (int64_t)bytes); + if (arc_p > arc_c) + arc_p = arc_c; + } + ASSERT((int64_t)arc_p >= 0); +} + +/* + * Check if arc_size has grown past our upper threshold, determined by + * zfs_arc_overflow_shift. + */ +boolean_t +arc_is_overflowing(void) +{ + /* Always allow at least one block of overflow */ + int64_t overflow = MAX(SPA_MAXBLOCKSIZE, + arc_c >> zfs_arc_overflow_shift); + + /* + * We just compare the lower bound here for performance reasons. Our + * primary goals are to make sure that the arc never grows without + * bound, and that it can reach its maximum size. This check + * accomplishes both goals. The maximum amount we could run over by is + * 2 * aggsum_borrow_multiplier * NUM_CPUS * the average size of a block + * in the ARC. In practice, that's in the tens of MB, which is low + * enough to be safe. 
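+ *
+ * As a worked example, assuming the default zfs_arc_overflow_shift
+ * of 8 and arc_c = 1 GiB: overflow = MAX(16 MiB, 1 GiB >> 8) =
+ * MAX(16 MiB, 4 MiB) = 16 MiB, so the ARC is treated as overflowing
+ * once its lower-bound size reaches arc_c + 16 MiB.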
+ */
+ return (aggsum_lower_bound(&arc_size) >= (int64_t)arc_c + overflow);
+}
+
+static abd_t *
+arc_get_data_abd(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, do_adapt);
+ if (type == ARC_BUFC_METADATA) {
+ return (abd_alloc(size, B_TRUE));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (abd_alloc(size, B_FALSE));
+ }
+}
+
+static void *
+arc_get_data_buf(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_get_data_impl(hdr, size, tag, B_TRUE);
+ if (type == ARC_BUFC_METADATA) {
+ return (zio_buf_alloc(size));
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ return (zio_data_buf_alloc(size));
+ }
+}
+
+/*
+ * Wait for the specified amount of data (in bytes) to be evicted from the
+ * ARC, and for there to be sufficient free memory in the system. Waiting for
+ * eviction ensures that the memory used by the ARC decreases. Waiting for
+ * free memory ensures that the system won't run out of free pages, regardless
+ * of ARC behavior and settings. See arc_lowmem_init().
+ */
+void
+arc_wait_for_eviction(uint64_t amount)
+{
+ mutex_enter(&arc_evict_lock);
+ if (arc_is_overflowing()) {
+ arc_evict_needed = B_TRUE;
+ zthr_wakeup(arc_evict_zthr);
+
+ if (amount != 0) {
+ arc_evict_waiter_t aw;
+ list_link_init(&aw.aew_node);
+ cv_init(&aw.aew_cv, NULL, CV_DEFAULT, NULL);
+
+ arc_evict_waiter_t *last =
+ list_tail(&arc_evict_waiters);
+ if (last != NULL) {
+ ASSERT3U(last->aew_count, >, arc_evict_count);
+ aw.aew_count = last->aew_count + amount;
+ } else {
+ aw.aew_count = arc_evict_count + amount;
+ }
+
+ list_insert_tail(&arc_evict_waiters, &aw);
+
+ arc_set_need_free();
+
+ DTRACE_PROBE3(arc__wait__for__eviction,
+ uint64_t, amount,
+ uint64_t, arc_evict_count,
+ uint64_t, aw.aew_count);
+
+ /*
+ * We will be woken up either when arc_evict_count
+ * reaches aew_count, or when the ARC is no longer
+ * overflowing and eviction completes.
+ */
+ cv_wait(&aw.aew_cv, &arc_evict_lock);
+
+ /*
+ * In case of "false" wakeup, we will still be on the
+ * list.
+ */
+ if (list_link_active(&aw.aew_node))
+ list_remove(&arc_evict_waiters, &aw);
+
+ cv_destroy(&aw.aew_cv);
+ }
+ }
+ mutex_exit(&arc_evict_lock);
+}
+
+/*
+ * Allocate a block and return it to the caller. If we are hitting the
+ * hard limit for the cache size, we must sleep, waiting for the eviction
+ * thread to catch up. If we're past the target size but below the hard
+ * limit, we'll only signal the reclaim thread and continue on.
+ */
+static void
+arc_get_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag,
+ boolean_t do_adapt)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ if (do_adapt)
+ arc_adapt(size, state);
+
+ /*
+ * If arc_size is currently overflowing, we must be adding data
+ * faster than we are evicting. To ensure we don't compound the
+ * problem by adding more data and forcing arc_size to grow even
+ * further past its target size, we wait for the eviction thread to
+ * make some progress. We also wait for there to be sufficient free
+ * memory in the system, as measured by arc_free_memory().
+ *
+ * Specifically, we wait for zfs_arc_eviction_pct percent of the
+ * requested size to be evicted. This should be more than 100%, to
+ * ensure that progress is also made towards getting arc_size
+ * under arc_c. See the comment above zfs_arc_eviction_pct.
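+ *
+ * As an illustration, assuming the default zfs_arc_eviction_pct of
+ * 200: a thread allocating a 128 KiB buffer while the ARC is
+ * overflowing waits until 256 KiB have been evicted before it may
+ * proceed.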
+ *
+ * We do the overflowing check without holding the arc_evict_lock to
+ * reduce lock contention in this hot path. Note that
+ * arc_wait_for_eviction() will acquire the lock and check again to
+ * ensure we are truly overflowing before blocking.
+ */
+ if (arc_is_overflowing()) {
+ arc_wait_for_eviction(size *
+ zfs_arc_eviction_pct / 100);
+ }
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_consume(size, ARC_SPACE_META);
+ } else {
+ arc_space_consume(size, ARC_SPACE_DATA);
+ }
+
+ /*
+ * Update the state size. Note that ghost states have a
+ * "ghost size" and so don't need to be updated.
+ */
+ if (!GHOST_STATE(state)) {
+
+ (void) zfs_refcount_add_many(&state->arcs_size, size, tag);
+
+ /*
+ * If this is reached via arc_read, the link is
+ * protected by the hash lock. If reached via
+ * arc_buf_alloc, the header should not be accessed by
+ * any other thread. And, if reached via arc_read_done,
+ * the hash lock will protect it if it's found in the
+ * hash table; otherwise no other thread should be
+ * trying to [add|remove]_reference it.
+ */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ (void) zfs_refcount_add_many(&state->arcs_esize[type],
+ size, tag);
+ }
+
+ /*
+ * If we are growing the cache, and we are adding anonymous
+ * data, and we have outgrown arc_p, update arc_p
+ */
+ if (aggsum_upper_bound(&arc_size) < arc_c &&
+ hdr->b_l1hdr.b_state == arc_anon &&
+ (zfs_refcount_count(&arc_anon->arcs_size) +
+ zfs_refcount_count(&arc_mru->arcs_size) > arc_p))
+ arc_p = MIN(arc_c, arc_p + size);
+ }
+}
+
+static void
+arc_free_data_abd(arc_buf_hdr_t *hdr, abd_t *abd, uint64_t size, void *tag)
+{
+ arc_free_data_impl(hdr, size, tag);
+ abd_free(abd);
+}
+
+static void
+arc_free_data_buf(arc_buf_hdr_t *hdr, void *buf, uint64_t size, void *tag)
+{
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ arc_free_data_impl(hdr, size, tag);
+ if (type == ARC_BUFC_METADATA) {
+ zio_buf_free(buf, size);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ zio_data_buf_free(buf, size);
+ }
+}
+
+/*
+ * Free the arc data buffer.
+ */
+static void
+arc_free_data_impl(arc_buf_hdr_t *hdr, uint64_t size, void *tag)
+{
+ arc_state_t *state = hdr->b_l1hdr.b_state;
+ arc_buf_contents_t type = arc_buf_type(hdr);
+
+ /* protected by hash lock, if in the hash table */
+ if (multilist_link_active(&hdr->b_l1hdr.b_arc_node)) {
+ ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt));
+ ASSERT(state != arc_anon && state != arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_esize[type],
+ size, tag);
+ }
+ (void) zfs_refcount_remove_many(&state->arcs_size, size, tag);
+
+ VERIFY3U(hdr->b_type, ==, type);
+ if (type == ARC_BUFC_METADATA) {
+ arc_space_return(size, ARC_SPACE_META);
+ } else {
+ ASSERT(type == ARC_BUFC_DATA);
+ arc_space_return(size, ARC_SPACE_DATA);
+ }
+}
+
+/*
+ * This routine is called whenever a buffer is accessed.
+ * NOTE: the caller must hold the hash lock; it is not dropped here.
+ */
+static void
+arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock)
+{
+ clock_t now;
+
+ ASSERT(MUTEX_HELD(hash_lock));
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ /*
+ * This buffer is not in the cache, and does not
+ * appear in our "ghost" list. Add the new buffer
+ * to the MRU state.
+ */
+
+ ASSERT0(hdr->b_l1hdr.b_arc_access);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mru, hdr, hash_lock);
+
+ } else if (hdr->b_l1hdr.b_state == arc_mru) {
+ now = ddi_get_lbolt();
+
+ /*
+ * If this buffer is here because of a prefetch, then either:
+ * - clear the flag if this is a "referencing" read
+ * (any subsequent access will bump this into the MFU state).
+ * or
+ * - move the buffer to the head of the list if this is
+ * another prefetch (to make it less likely to be evicted).
+ */
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) {
+ /* link protected by hash lock */
+ ASSERT(multilist_link_active(
+ &hdr->b_l1hdr.b_arc_node));
+ } else {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ }
+ hdr->b_l1hdr.b_arc_access = now;
+ return;
+ }
+
+ /*
+ * This buffer has been "accessed" only once so far,
+ * but it is still in the cache. Move it to the MFU
+ * state.
+ */
+ if (ddi_time_after(now, hdr->b_l1hdr.b_arc_access +
+ ARC_MINTIME)) {
+ /*
+ * More than ARC_MINTIME has passed since we
+ * instantiated this buffer. Move it to the
+ * most frequently used state.
+ */
+ hdr->b_l1hdr.b_arc_access = now;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(arc_mfu, hdr, hash_lock);
+ }
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_hits);
+ ARCSTAT_BUMP(arcstat_mru_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mru_ghost) {
+ arc_state_t *new_state;
+ /*
+ * This buffer has been "accessed" recently, but
+ * was evicted from the cache. Move it to the
+ * MFU state.
+ */
+
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ new_state = arc_mru;
+ if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) {
+ arc_hdr_clear_flags(hdr,
+ ARC_FLAG_PREFETCH |
+ ARC_FLAG_PRESCIENT_PREFETCH);
+ }
+ DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr);
+ } else {
+ new_state = arc_mfu;
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mru_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mru_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_mfu) {
+ /*
+ * This buffer has been accessed more than once and is
+ * still in the cache. Keep it in the MFU state.
+ *
+ * NOTE: an add_reference() that occurred when we did
+ * the arc_read() will have kicked this off the list.
+ * If it was a prefetch, we will explicitly move it to
+ * the head of the list now.
+ */
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_hits);
+ ARCSTAT_BUMP(arcstat_mfu_hits);
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) {
+ arc_state_t *new_state = arc_mfu;
+ /*
+ * This buffer has been accessed more than once but has
+ * been evicted from the cache. Move it back to the
+ * MFU state.
+ */
+
+ if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) {
+ /*
+ * This is a prefetch access...
+ * move this block back to the MRU state.
+ */
+ new_state = arc_mru;
+ }
+
+ hdr->b_l1hdr.b_arc_access = ddi_get_lbolt();
+ DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr);
+ arc_change_state(new_state, hdr, hash_lock);
+
+ atomic_inc_32(&hdr->b_l1hdr.b_mfu_ghost_hits);
+ ARCSTAT_BUMP(arcstat_mfu_ghost_hits);
+ } else if (hdr->b_l1hdr.b_state == arc_l2c_only) {
+ /*
+ * This buffer is on the 2nd Level ARC.
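+ * Only its L2 header was resident; the access will bring
+ * the data back in from the L2 device, so file it under
+ * the MFU state.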
+ */ + + hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); + DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, hdr); + arc_change_state(arc_mfu, hdr, hash_lock); + } else { + cmn_err(CE_PANIC, "invalid arc state 0x%p", + hdr->b_l1hdr.b_state); + } +} + +/* + * This routine is called by dbuf_hold() to update the arc_access() state + * which otherwise would be skipped for entries in the dbuf cache. + */ +void +arc_buf_access(arc_buf_t *buf) +{ + mutex_enter(&buf->b_evict_lock); + arc_buf_hdr_t *hdr = buf->b_hdr; + + /* + * Avoid taking the hash_lock when possible as an optimization. + * The header must be checked again under the hash_lock in order + * to handle the case where it is concurrently being released. + */ + if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { + mutex_exit(&buf->b_evict_lock); + return; + } + + kmutex_t *hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + + if (hdr->b_l1hdr.b_state == arc_anon || HDR_EMPTY(hdr)) { + mutex_exit(hash_lock); + mutex_exit(&buf->b_evict_lock); + ARCSTAT_BUMP(arcstat_access_skip); + return; + } + + mutex_exit(&buf->b_evict_lock); + + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr) && !HDR_PRESCIENT_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); +} + +/* a generic arc_read_done_func_t which you can use */ +/* ARGSUSED */ +void +arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) +{ + if (buf == NULL) + return; + + bcopy(buf->b_data, arg, arc_buf_size(buf)); + arc_buf_destroy(buf, arg); +} + +/* a generic arc_read_done_func_t */ +/* ARGSUSED */ +void +arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) +{ + arc_buf_t **bufp = arg; + + if (buf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + *bufp = NULL; + } else { + ASSERT(zio == NULL || zio->io_error == 0); + *bufp = buf; + ASSERT(buf->b_data != NULL); + } +} + +static void +arc_hdr_verify(arc_buf_hdr_t *hdr, blkptr_t *bp) +{ + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) { + ASSERT3U(HDR_GET_PSIZE(hdr), ==, 0); + ASSERT3U(arc_hdr_get_compress(hdr), ==, ZIO_COMPRESS_OFF); + } else { + if (HDR_COMPRESSION_ENABLED(hdr)) { + ASSERT3U(arc_hdr_get_compress(hdr), ==, + BP_GET_COMPRESS(bp)); + } + ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp)); + ASSERT3U(HDR_GET_PSIZE(hdr), ==, BP_GET_PSIZE(bp)); + ASSERT3U(!!HDR_PROTECTED(hdr), ==, BP_IS_PROTECTED(bp)); + } +} + +static void +arc_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + arc_buf_hdr_t *hdr = zio->io_private; + kmutex_t *hash_lock = NULL; + arc_callback_t *callback_list; + arc_callback_t *acb; + boolean_t freeable = B_FALSE; + + /* + * The hdr was inserted into hash-table and removed from lists + * prior to starting I/O. We should find this header, since + * it's in the hash table, and it should be legit since it's + * not possible to evict it during the I/O. The only possible + * reason for it not to be found is if we were freed during the + * read. 
+ */ + if (HDR_IN_HASH_TABLE(hdr)) { + arc_buf_hdr_t *found; + + ASSERT3U(hdr->b_birth, ==, BP_PHYSICAL_BIRTH(zio->io_bp)); + ASSERT3U(hdr->b_dva.dva_word[0], ==, + BP_IDENTITY(zio->io_bp)->dva_word[0]); + ASSERT3U(hdr->b_dva.dva_word[1], ==, + BP_IDENTITY(zio->io_bp)->dva_word[1]); + + found = buf_hash_find(hdr->b_spa, zio->io_bp, &hash_lock); + + ASSERT((found == hdr && + DVA_EQUAL(&hdr->b_dva, BP_IDENTITY(zio->io_bp))) || + (found == hdr && HDR_L2_READING(hdr))); + ASSERT3P(hash_lock, !=, NULL); + } + + if (BP_IS_PROTECTED(bp)) { + hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp); + hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset; + zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv); + + if (BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) { + void *tmpbuf; + + tmpbuf = abd_borrow_buf_copy(zio->io_abd, + sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmpbuf, + hdr->b_crypt_hdr.b_mac); + abd_return_buf(zio->io_abd, tmpbuf, + sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac); + } + } + + if (zio->io_error == 0) { + /* byteswap if necessary */ + if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_GET_LEVEL(zio->io_bp) > 0) { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64; + } else { + hdr->b_l1hdr.b_byteswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(zio->io_bp)); + } + } else { + hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS; + } + if (!HDR_L2_READING(hdr)) { + hdr->b_complevel = zio->io_prop.zp_complevel; + } + } + + arc_hdr_clear_flags(hdr, ARC_FLAG_L2_EVICTED); + if (l2arc_noprefetch && HDR_PREFETCH(hdr)) + arc_hdr_clear_flags(hdr, ARC_FLAG_L2CACHE); + + callback_list = hdr->b_l1hdr.b_acb; + ASSERT3P(callback_list, !=, NULL); + + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + + /* + * If a read request has a callback (i.e. acb_done is not NULL), then we + * make a buf containing the data according to the parameters which were + * passed in. The implementation of arc_buf_alloc_impl() ensures that we + * aren't needlessly decompressing the data multiple times. + */ + int callback_cnt = 0; + for (acb = callback_list; acb != NULL; acb = acb->acb_next) { + if (!acb->acb_done) + continue; + + callback_cnt++; + + if (zio->io_error != 0) + continue; + + int error = arc_buf_alloc_impl(hdr, zio->io_spa, + &acb->acb_zb, acb->acb_private, acb->acb_encrypted, + acb->acb_compressed, acb->acb_noauth, B_TRUE, + &acb->acb_buf); + + /* + * Assert non-speculative zios didn't fail because an + * encryption key wasn't loaded + */ + ASSERT((zio->io_flags & ZIO_FLAG_SPECULATIVE) || + error != EACCES); + + /* + * If we failed to decrypt, report an error now (as the zio + * layer would have done if it had done the transforms). + */ + if (error == ECKSUM) { + ASSERT(BP_IS_PROTECTED(bp)); + error = SET_ERROR(EIO); + if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { + spa_log_error(zio->io_spa, &acb->acb_zb); + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + zio->io_spa, NULL, &acb->acb_zb, zio, 0, 0); + } + } + + if (error != 0) { + /* + * Decompression or decryption failed. Set + * io_error so that when we call acb_done + * (below), we will indicate that the read + * failed. 
Note that in the unusual case + * where one callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. + */ + zio->io_error = error; + } + } + + /* + * If there are multiple callbacks, we must have the hash lock, + * because the only way for multiple threads to find this hdr is + * in the hash table. This ensures that if there are multiple + * callbacks, the hdr is not anonymous. If it were anonymous, + * we couldn't use arc_buf_destroy() in the error case below. + */ + ASSERT(callback_cnt < 2 || hash_lock != NULL); + + hdr->b_l1hdr.b_acb = NULL; + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + if (callback_cnt == 0) + ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr)); + + ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || + callback_list != NULL); + + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + } else { + arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); + if (hdr->b_l1hdr.b_state != arc_anon) + arc_change_state(arc_anon, hdr, hash_lock); + if (HDR_IN_HASH_TABLE(hdr)) + buf_hash_remove(hdr); + freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); + } + + /* + * Broadcast before we drop the hash_lock to avoid the possibility + * that the hdr (and hence the cv) might be freed before we get to + * the cv_broadcast(). + */ + cv_broadcast(&hdr->b_l1hdr.b_cv); + + if (hash_lock != NULL) { + mutex_exit(hash_lock); + } else { + /* + * This block was freed while we waited for the read to + * complete. It has been removed from the hash table and + * moved to the anonymous state (so that it won't show up + * in the cache). + */ + ASSERT3P(hdr->b_l1hdr.b_state, ==, arc_anon); + freeable = zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt); + } + + /* execute each callback and free its structure */ + while ((acb = callback_list) != NULL) { + if (acb->acb_done != NULL) { + if (zio->io_error != 0 && acb->acb_buf != NULL) { + /* + * If arc_buf_alloc_impl() fails during + * decompression, the buf will still be + * allocated, and needs to be freed here. + */ + arc_buf_destroy(acb->acb_buf, + acb->acb_private); + acb->acb_buf = NULL; + } + acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, + acb->acb_buf, acb->acb_private); + } + + if (acb->acb_zio_dummy != NULL) { + acb->acb_zio_dummy->io_error = zio->io_error; + zio_nowait(acb->acb_zio_dummy); + } + + callback_list = acb->acb_next; + kmem_free(acb, sizeof (arc_callback_t)); + } + + if (freeable) + arc_hdr_destroy(hdr); +} + +/* + * "Read" the block at the specified DVA (in bp) via the + * cache. If the block is found in the cache, invoke the provided + * callback immediately and return. Note that the `zio' parameter + * in the callback will be NULL in this case, since no IO was + * required. If the block is not in the cache pass the read request + * on to the spa with a substitute callback function, so that the + * requested block will be added to the cache. + * + * If a read request arrives for a block that has a read in-progress, + * either wait for the in-progress read to complete (and return the + * results); or, if this is a read with a "done" func, add a record + * to the read to invoke the "done" func when the read completes, + * and return; or just return. + * + * arc_read_done() will invoke all the requested "done" functions + * for readers of this block. 
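+ *
+ * A minimal synchronous use looks roughly like this (a sketch
+ * only; spa, bp and zb are assumed to be in hand, and error
+ * handling is elided):
+ *
+ * arc_flags_t aflags = ARC_FLAG_WAIT;
+ * arc_buf_t *abuf = NULL;
+ * int err = arc_read(NULL, spa, bp, arc_getbuf_func, &abuf,
+ * ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, zb);
+ * if (err == 0) {
+ * ... use abuf->b_data ...
+ * arc_buf_destroy(abuf, &abuf);
+ * }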
+ */
+int
+arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp,
+ arc_read_done_func_t *done, void *private, zio_priority_t priority,
+ int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb)
+{
+ arc_buf_hdr_t *hdr = NULL;
+ kmutex_t *hash_lock = NULL;
+ zio_t *rzio;
+ uint64_t guid = spa_load_guid(spa);
+ boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW_COMPRESS) != 0;
+ boolean_t encrypted_read = BP_IS_ENCRYPTED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t noauth_read = BP_IS_AUTHENTICATED(bp) &&
+ (zio_flags & ZIO_FLAG_RAW_ENCRYPT) != 0;
+ boolean_t embedded_bp = !!BP_IS_EMBEDDED(bp);
+ int rc = 0;
+
+ ASSERT(!embedded_bp ||
+ BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA);
+ ASSERT(!BP_IS_HOLE(bp));
+ ASSERT(!BP_IS_REDACTED(bp));
+
+ /*
+ * Normally SPL_FSTRANS will already be set since kernel threads which
+ * expect to call the DMU interfaces will set it when created. System
+ * calls are similarly handled by setting/cleaning the bit in the
+ * registered callback (module/os/.../zfs/zpl_*).
+ *
+ * External consumers such as Lustre which call the exported DMU
+ * interfaces may not have set SPL_FSTRANS. To avoid a deadlock
+ * on the hash_lock, always set and clear the bit.
+ */
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+top:
+ if (!embedded_bp) {
+ /*
+ * Embedded BP's have no DVA and require no I/O to "read";
+ * they are backed by an anonymous arc buf allocated below,
+ * so only do the hash lookup for ordinary blocks.
+ */
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ }
+
+ /*
+ * Determine if we have an L1 cache hit or a cache miss. For simplicity
+ * we maintain encrypted data separately from compressed / uncompressed
+ * data. If the user is requesting raw encrypted data and we don't have
+ * that in the header we will read from disk to guarantee that we can
+ * get it even if the encryption keys aren't loaded.
+ */
+ if (hdr != NULL && HDR_HAS_L1HDR(hdr) && (HDR_HAS_RABD(hdr) ||
+ (hdr->b_l1hdr.b_pabd != NULL && !encrypted_read))) {
+ arc_buf_t *buf = NULL;
+ *arc_flags |= ARC_FLAG_CACHED;
+
+ if (HDR_IO_IN_PROGRESS(hdr)) {
+ zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head;
+
+ if (*arc_flags & ARC_FLAG_CACHED_ONLY) {
+ mutex_exit(hash_lock);
+ ARCSTAT_BUMP(arcstat_cached_only_in_progress);
+ rc = SET_ERROR(ENOENT);
+ goto out;
+ }
+
+ ASSERT3P(head_zio, !=, NULL);
+ if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) &&
+ priority == ZIO_PRIORITY_SYNC_READ) {
+ /*
+ * This is a sync read that needs to wait for
+ * an in-flight async read. Request that the
+ * zio have its priority upgraded.
+ */ + zio_change_priority(head_zio, priority); + DTRACE_PROBE1(arc__async__upgrade__sync, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_async_upgrade_sync); + } + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); + } + + if (*arc_flags & ARC_FLAG_WAIT) { + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); + + if (done) { + arc_callback_t *acb = NULL; + + acb = kmem_zalloc(sizeof (arc_callback_t), + KM_SLEEP); + acb->acb_done = done; + acb->acb_private = private; + acb->acb_compressed = compressed_read; + acb->acb_encrypted = encrypted_read; + acb->acb_noauth = noauth_read; + acb->acb_zb = *zb; + if (pio != NULL) + acb->acb_zio_dummy = zio_null(pio, + spa, NULL, NULL, NULL, zio_flags); + + ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_zio_head = head_zio; + acb->acb_next = hdr->b_l1hdr.b_acb; + hdr->b_l1hdr.b_acb = acb; + mutex_exit(hash_lock); + goto out; + } + mutex_exit(hash_lock); + goto out; + } + + ASSERT(hdr->b_l1hdr.b_state == arc_mru || + hdr->b_l1hdr.b_state == arc_mfu); + + if (done) { + if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { + /* + * This is a demand read which does not have to + * wait for i/o because we did a predictive + * prefetch i/o for it, which has completed. + */ + DTRACE_PROBE1( + arc__demand__hit__predictive__prefetch, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP( + arcstat_demand_hit_predictive_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREDICTIVE_PREFETCH); + } + + if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + ARCSTAT_BUMP( + arcstat_demand_hit_prescient_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PRESCIENT_PREFETCH); + } + + ASSERT(!embedded_bp || !BP_IS_HOLE(bp)); + + /* Get a buf with the desired data in it. */ + rc = arc_buf_alloc_impl(hdr, spa, zb, private, + encrypted_read, compressed_read, noauth_read, + B_TRUE, &buf); + if (rc == ECKSUM) { + /* + * Convert authentication and decryption errors + * to EIO (and generate an ereport if needed) + * before leaving the ARC. + */ + rc = SET_ERROR(EIO); + if ((zio_flags & ZIO_FLAG_SPECULATIVE) == 0) { + spa_log_error(spa, zb); + zfs_ereport_post( + FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, zb, NULL, 0, 0); + } + } + if (rc != 0) { + (void) remove_reference(hdr, hash_lock, + private); + arc_buf_destroy_impl(buf); + buf = NULL; + } + + /* assert any errors weren't due to unloaded keys */ + ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || + rc != EACCES); + } else if (*arc_flags & ARC_FLAG_PREFETCH && + zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { + arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + } + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); + arc_access(hdr, hash_lock); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + mutex_exit(hash_lock); + ARCSTAT_BUMP(arcstat_hits); + ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr), + demand, prefetch, !HDR_ISTYPE_METADATA(hdr), + data, metadata, hits); + + if (done) + done(NULL, zb, bp, buf, private); + } else { + uint64_t lsize = BP_GET_LSIZE(bp); + uint64_t psize = BP_GET_PSIZE(bp); + arc_callback_t *acb; + vdev_t *vd = NULL; + uint64_t addr = 0; + boolean_t devw = B_FALSE; + uint64_t size; + abd_t *hdr_abd; + int alloc_flags = encrypted_read ? 
ARC_HDR_ALLOC_RDATA : 0; + + if (*arc_flags & ARC_FLAG_CACHED_ONLY) { + rc = SET_ERROR(ENOENT); + if (hash_lock != NULL) + mutex_exit(hash_lock); + goto out; + } + + /* + * Gracefully handle a damaged logical block size as a + * checksum error. + */ + if (lsize > spa_maxblocksize(spa)) { + rc = SET_ERROR(ECKSUM); + if (hash_lock != NULL) + mutex_exit(hash_lock); + goto out; + } + + if (hdr == NULL) { + /* + * This block is not in the cache or it has + * embedded data. + */ + arc_buf_hdr_t *exists = NULL; + arc_buf_contents_t type = BP_GET_BUFC_TYPE(bp); + hdr = arc_hdr_alloc(spa_load_guid(spa), psize, lsize, + BP_IS_PROTECTED(bp), BP_GET_COMPRESS(bp), 0, type, + encrypted_read); + + if (!embedded_bp) { + hdr->b_dva = *BP_IDENTITY(bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); + exists = buf_hash_insert(hdr, &hash_lock); + } + if (exists != NULL) { + /* somebody beat us to the hash insert */ + mutex_exit(hash_lock); + buf_discard_identity(hdr); + arc_hdr_destroy(hdr); + goto top; /* restart the IO request */ + } + } else { + /* + * This block is in the ghost cache or encrypted data + * was requested and we didn't have it. If it was + * L2-only (and thus didn't have an L1 hdr), + * we realloc the header to add an L1 hdr. + */ + if (!HDR_HAS_L1HDR(hdr)) { + hdr = arc_hdr_realloc(hdr, hdr_l2only_cache, + hdr_full_cache); + } + + if (GHOST_STATE(hdr->b_l1hdr.b_state)) { + ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL); + ASSERT(!HDR_HAS_RABD(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT0(zfs_refcount_count( + &hdr->b_l1hdr.b_refcnt)); + ASSERT3P(hdr->b_l1hdr.b_buf, ==, NULL); + ASSERT3P(hdr->b_l1hdr.b_freeze_cksum, ==, NULL); + } else if (HDR_IO_IN_PROGRESS(hdr)) { + /* + * If this header already had an IO in progress + * and we are performing another IO to fetch + * encrypted data we must wait until the first + * IO completes so as not to confuse + * arc_read_done(). This should be very rare + * and so the performance impact shouldn't + * matter. + */ + cv_wait(&hdr->b_l1hdr.b_cv, hash_lock); + mutex_exit(hash_lock); + goto top; + } + + /* + * This is a delicate dance that we play here. + * This hdr might be in the ghost list so we access + * it to move it out of the ghost list before we + * initiate the read. If it's a prefetch then + * it won't have a callback so we'll remove the + * reference that arc_buf_alloc_impl() created. We + * do this after we've called arc_access() to + * avoid hitting an assert in remove_reference(). + */ + arc_adapt(arc_hdr_size(hdr), hdr->b_l1hdr.b_state); + arc_access(hdr, hash_lock); + arc_hdr_alloc_abd(hdr, alloc_flags); + } + + if (encrypted_read) { + ASSERT(HDR_HAS_RABD(hdr)); + size = HDR_GET_PSIZE(hdr); + hdr_abd = hdr->b_crypt_hdr.b_rabd; + zio_flags |= ZIO_FLAG_RAW; + } else { + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + size = arc_hdr_size(hdr); + hdr_abd = hdr->b_l1hdr.b_pabd; + + if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF) { + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } + + /* + * For authenticated bp's, we do not ask the ZIO layer + * to authenticate them since this will cause the entire + * IO to fail if the key isn't loaded. Instead, we + * defer authentication until arc_buf_fill(), which will + * verify the data when the key is available. 
+ */
+ if (BP_IS_AUTHENTICATED(bp))
+ zio_flags |= ZIO_FLAG_RAW_ENCRYPT;
+ }
+
+ if (*arc_flags & ARC_FLAG_PREFETCH &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH);
+ if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH);
+ if (*arc_flags & ARC_FLAG_L2CACHE)
+ arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE);
+ if (BP_IS_AUTHENTICATED(bp))
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ if (BP_GET_LEVEL(bp) > 0)
+ arc_hdr_set_flags(hdr, ARC_FLAG_INDIRECT);
+ if (*arc_flags & ARC_FLAG_PREDICTIVE_PREFETCH)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH);
+ ASSERT(!GHOST_STATE(hdr->b_l1hdr.b_state));
+
+ acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP);
+ acb->acb_done = done;
+ acb->acb_private = private;
+ acb->acb_compressed = compressed_read;
+ acb->acb_encrypted = encrypted_read;
+ acb->acb_noauth = noauth_read;
+ acb->acb_zb = *zb;
+
+ ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL);
+ hdr->b_l1hdr.b_acb = acb;
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (HDR_HAS_L2HDR(hdr) &&
+ (vd = hdr->b_l2hdr.b_dev->l2ad_vdev) != NULL) {
+ devw = hdr->b_l2hdr.b_dev->l2ad_writing;
+ addr = hdr->b_l2hdr.b_daddr;
+ /*
+ * Lock out L2ARC device removal.
+ */
+ if (vdev_is_dead(vd) ||
+ !spa_config_tryenter(spa, SCL_L2ARC, vd, RW_READER))
+ vd = NULL;
+ }
+
+ /*
+ * We count both async reads and scrub IOs as asynchronous so
+ * that both can be upgraded in the event of a cache hit while
+ * the read IO is still in-flight.
+ */
+ if (priority == ZIO_PRIORITY_ASYNC_READ ||
+ priority == ZIO_PRIORITY_SCRUB)
+ arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+ else
+ arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ);
+
+ /*
+ * At this point, we have a level 1 cache miss or a blkptr
+ * with embedded data. Try again in L2ARC if possible.
+ */
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, lsize);
+
+ /*
+ * Skip ARC stat bump for block pointers with embedded
+ * data. The data are read from the blkptr itself via
+ * decode_embedded_bp_compressed().
+ */
+ if (!embedded_bp) {
+ DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr,
+ blkptr_t *, bp, uint64_t, lsize,
+ zbookmark_phys_t *, zb);
+ ARCSTAT_BUMP(arcstat_misses);
+ ARCSTAT_CONDSTAT(!HDR_PREFETCH(hdr),
+ demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data,
+ metadata, misses);
+ }
+
+ if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) {
+ /*
+ * Read from the L2ARC if the following are true:
+ * 1. The L2ARC vdev was previously cached.
+ * 2. This buffer still has L2ARC metadata.
+ * 3. This buffer isn't currently writing to the L2ARC.
+ * 4. The L2ARC entry wasn't evicted, which may
+ * also have invalidated the vdev.
+ * 5. This isn't a prefetch while l2arc_noprefetch
+ * is set.
+ */
+ if (HDR_HAS_L2HDR(hdr) &&
+ !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) &&
+ !(l2arc_noprefetch && HDR_PREFETCH(hdr))) {
+ l2arc_read_callback_t *cb;
+ abd_t *abd;
+ uint64_t asize;
+
+ DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr);
+ ARCSTAT_BUMP(arcstat_l2_hits);
+ atomic_inc_32(&hdr->b_l2hdr.b_hits);
+
+ cb = kmem_zalloc(sizeof (l2arc_read_callback_t),
+ KM_SLEEP);
+ cb->l2rcb_hdr = hdr;
+ cb->l2rcb_bp = *bp;
+ cb->l2rcb_zb = *zb;
+ cb->l2rcb_flags = zio_flags;
+
+ /*
+ * When Compressed ARC is disabled, but the
+ * L2ARC block is compressed, arc_hdr_size()
+ * will have returned LSIZE rather than PSIZE.
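+ * In that case the device read must be issued for the
+ * (smaller) on-device PSIZE rather than the in-core
+ * LSIZE.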
+ */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr) && + HDR_GET_PSIZE(hdr) != 0) { + size = HDR_GET_PSIZE(hdr); + } + + asize = vdev_psize_to_asize(vd, size); + if (asize != size) { + abd = abd_alloc_for_io(asize, + HDR_ISTYPE_METADATA(hdr)); + cb->l2rcb_abd = abd; + } else { + abd = hdr_abd; + } + + ASSERT(addr >= VDEV_LABEL_START_SIZE && + addr + asize <= vd->vdev_psize - + VDEV_LABEL_END_SIZE); + + /* + * l2arc read. The SCL_L2ARC lock will be + * released by l2arc_read_done(). + * Issue a null zio if the underlying buffer + * was squashed to zero size by compression. + */ + ASSERT3U(arc_hdr_get_compress(hdr), !=, + ZIO_COMPRESS_EMPTY); + rzio = zio_read_phys(pio, vd, addr, + asize, abd, + ZIO_CHECKSUM_OFF, + l2arc_read_done, cb, priority, + zio_flags | ZIO_FLAG_DONT_CACHE | + ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY, B_FALSE); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, + zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, + HDR_GET_PSIZE(hdr)); + + if (*arc_flags & ARC_FLAG_NOWAIT) { + zio_nowait(rzio); + goto out; + } + + ASSERT(*arc_flags & ARC_FLAG_WAIT); + if (zio_wait(rzio) == 0) + goto out; + + /* l2arc read error; goto zio_read() */ + if (hash_lock != NULL) + mutex_enter(hash_lock); + } else { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + if (HDR_L2_WRITING(hdr)) + ARCSTAT_BUMP(arcstat_l2_rw_clash); + spa_config_exit(spa, SCL_L2ARC, vd); + } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + /* + * Skip ARC stat bump for block pointers with + * embedded data. The data are read from the blkptr + * itself via decode_embedded_bp_compressed(). + */ + if (l2arc_ndev != 0 && !embedded_bp) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } + } + + rzio = zio_read(pio, spa, bp, hdr_abd, size, + arc_read_done, hdr, priority, zio_flags, zb); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + + if (*arc_flags & ARC_FLAG_WAIT) { + rc = zio_wait(rzio); + goto out; + } + + ASSERT(*arc_flags & ARC_FLAG_NOWAIT); + zio_nowait(rzio); + } + +out: + /* embedded bps don't actually go to disk */ + if (!embedded_bp) + spa_read_history_add(spa, zb, *arc_flags); + spl_fstrans_unmark(cookie); + return (rc); +} + +arc_prune_t * +arc_add_prune_callback(arc_prune_func_t *func, void *private) +{ + arc_prune_t *p; + + p = kmem_alloc(sizeof (*p), KM_SLEEP); + p->p_pfunc = func; + p->p_private = private; + list_link_init(&p->p_node); + zfs_refcount_create(&p->p_refcnt); + + mutex_enter(&arc_prune_mtx); + zfs_refcount_add(&p->p_refcnt, &arc_prune_list); + list_insert_head(&arc_prune_list, p); + mutex_exit(&arc_prune_mtx); + + return (p); +} + +void +arc_remove_prune_callback(arc_prune_t *p) +{ + boolean_t wait = B_FALSE; + mutex_enter(&arc_prune_mtx); + list_remove(&arc_prune_list, p); + if (zfs_refcount_remove(&p->p_refcnt, &arc_prune_list) > 0) + wait = B_TRUE; + mutex_exit(&arc_prune_mtx); + + /* wait for arc_prune_task to finish */ + if (wait) + taskq_wait_outstanding(arc_prune_taskq, 0); + ASSERT0(zfs_refcount_count(&p->p_refcnt)); + zfs_refcount_destroy(&p->p_refcnt); + kmem_free(p, sizeof (*p)); +} + +/* + * Notify the arc that a block was freed, and thus will never be used again. 
+ */
+void
+arc_freed(spa_t *spa, const blkptr_t *bp)
+{
+ arc_buf_hdr_t *hdr;
+ kmutex_t *hash_lock;
+ uint64_t guid = spa_load_guid(spa);
+
+ ASSERT(!BP_IS_EMBEDDED(bp));
+
+ hdr = buf_hash_find(guid, bp, &hash_lock);
+ if (hdr == NULL)
+ return;
+
+ /*
+ * We might be trying to free a block that is still doing I/O
+ * (i.e. prefetch) or has a reference (i.e. a dedup-ed,
+ * dmu_sync-ed block). If this block is being prefetched, then it
+ * would still have the ARC_FLAG_IO_IN_PROGRESS flag set on the hdr
+ * until the I/O completes. A block may also have a reference if it is
+ * part of a dedup-ed, dmu_synced write. The dmu_sync() function would
+ * have written the new block to its final resting place on disk but
+ * without the dedup flag set. This would have left the hdr in the MRU
+ * state and discoverable. When the txg finally syncs it detects that
+ * the block was overridden in open context and issues an override I/O.
+ * Since this is a dedup block, the override I/O will determine if the
+ * block is already in the DDT. If so, then it will replace the io_bp
+ * with the bp from the DDT and allow the I/O to finish. When the I/O
+ * reaches the done callback, dbuf_write_override_done, it will
+ * check to see if the io_bp and io_bp_override are identical.
+ * If they are not, then it indicates that the bp was replaced with
+ * the bp in the DDT and the override bp is freed. This allows
+ * us to arrive here with a reference on a block that is being
+ * freed. So if we have an I/O in progress, or a reference to
+ * this hdr, then we don't destroy the hdr.
+ */
+ if (!HDR_HAS_L1HDR(hdr) || (!HDR_IO_IN_PROGRESS(hdr) &&
+ zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt))) {
+ arc_change_state(arc_anon, hdr, hash_lock);
+ arc_hdr_destroy(hdr);
+ mutex_exit(hash_lock);
+ } else {
+ mutex_exit(hash_lock);
+ }
+}
+
+/*
+ * Release this buffer from the cache, making it an anonymous buffer. This
+ * must be done after a read and prior to modifying the buffer contents.
+ * If the buffer has more than one reference, we must make
+ * a new hdr for the buffer.
+ */
+void
+arc_release(arc_buf_t *buf, void *tag)
+{
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+
+ /*
+ * It would be nice to assert that if it's DMU metadata (level >
+ * 0 || it's the dnode file), then it must be syncing context.
+ * But we don't know that information at this level.
+ */
+
+ mutex_enter(&buf->b_evict_lock);
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+
+ /*
+ * We don't grab the hash lock prior to this check, because if
+ * the buffer's header is in the arc_anon state, it won't be
+ * linked into the hash table.
+ */
+ if (hdr->b_l1hdr.b_state == arc_anon) {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ ASSERT(!HDR_IN_HASH_TABLE(hdr));
+ ASSERT(!HDR_HAS_L2HDR(hdr));
+ ASSERT(HDR_EMPTY(hdr));
+
+ ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1);
+ ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), ==, 1);
+ ASSERT(!list_link_active(&hdr->b_l1hdr.b_arc_node));
+
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ /*
+ * If the buf is being overridden then it may already
+ * have a hdr that is not empty.
+ */
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+
+ return;
+ }
+
+ kmutex_t *hash_lock = HDR_LOCK(hdr);
+ mutex_enter(hash_lock);
+
+ /*
+ * This assignment is only valid as long as the hash_lock is
+ * held; we must be careful not to reference state or the
+ * b_state field after dropping the lock.
+ */ + arc_state_t *state = hdr->b_l1hdr.b_state; + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + ASSERT3P(state, !=, arc_anon); + + /* this buffer is not on any list */ + ASSERT3S(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt), >, 0); + + if (HDR_HAS_L2HDR(hdr)) { + mutex_enter(&hdr->b_l2hdr.b_dev->l2ad_mtx); + + /* + * We have to recheck this conditional again now that + * we're holding the l2ad_mtx to prevent a race with + * another thread which might be concurrently calling + * l2arc_evict(). In that case, l2arc_evict() might have + * destroyed the header's L2 portion as we were waiting + * to acquire the l2ad_mtx. + */ + if (HDR_HAS_L2HDR(hdr)) + arc_hdr_l2hdr_destroy(hdr); + + mutex_exit(&hdr->b_l2hdr.b_dev->l2ad_mtx); + } + + /* + * Do we have more than one buf? + */ + if (hdr->b_l1hdr.b_bufcnt > 1) { + arc_buf_hdr_t *nhdr; + uint64_t spa = hdr->b_spa; + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t lsize = HDR_GET_LSIZE(hdr); + boolean_t protected = HDR_PROTECTED(hdr); + enum zio_compress compress = arc_hdr_get_compress(hdr); + arc_buf_contents_t type = arc_buf_type(hdr); + VERIFY3U(hdr->b_type, ==, type); + + ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL); + (void) remove_reference(hdr, hash_lock, tag); + + if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + ASSERT(ARC_BUF_LAST(buf)); + } + + /* + * Pull the data off of this hdr and attach it to + * a new anonymous hdr. Also find the last buffer + * in the hdr's buffer list. + */ + arc_buf_t *lastbuf = arc_buf_remove(hdr, buf); + ASSERT3P(lastbuf, !=, NULL); + + /* + * If the current arc_buf_t and the hdr are sharing their data + * buffer, then we must stop sharing that block. + */ + if (arc_buf_is_shared(buf)) { + ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf); + VERIFY(!arc_buf_is_shared(lastbuf)); + + /* + * First, sever the block sharing relationship between + * buf and the arc_buf_hdr_t. + */ + arc_unshare_buf(hdr, buf); + + /* + * Now we need to recreate the hdr's b_pabd. Since we + * have lastbuf handy, we try to share with it, but if + * we can't then we allocate a new b_pabd and copy the + * data from buf into it. + */ + if (arc_can_share(hdr, lastbuf)) { + arc_share_buf(hdr, lastbuf); + } else { + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, + buf->b_data, psize); + } + VERIFY3P(lastbuf->b_data, !=, NULL); + } else if (HDR_SHARED_DATA(hdr)) { + /* + * Uncompressed shared buffers are always at the end + * of the list. Compressed buffers don't have the + * same requirements. This makes it hard to + * simply assert that the lastbuf is shared so + * we rely on the hdr's compression flags to determine + * if we have a compressed, shared buffer. 
+ */
+ ASSERT(arc_buf_is_shared(lastbuf) ||
+ arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
+ ASSERT(!ARC_BUF_SHARED(buf));
+ }
+
+ ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
+ ASSERT3P(state, !=, arc_l2c_only);
+
+ (void) zfs_refcount_remove_many(&state->arcs_size,
+ arc_buf_size(buf), buf);
+
+ if (zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)) {
+ ASSERT3P(state, !=, arc_l2c_only);
+ (void) zfs_refcount_remove_many(
+ &state->arcs_esize[type],
+ arc_buf_size(buf), buf);
+ }
+
+ hdr->b_l1hdr.b_bufcnt -= 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ hdr->b_crypt_hdr.b_ebufcnt -= 1;
+
+ arc_cksum_verify(buf);
+ arc_buf_unwatch(buf);
+
+ /* if this is the last uncompressed buf, free the checksum */
+ if (!arc_hdr_has_uncompressed_buf(hdr))
+ arc_cksum_free(hdr);
+
+ mutex_exit(hash_lock);
+
+ /*
+ * Allocate a new hdr. The new hdr will contain a b_pabd
+ * buffer which will be freed in arc_write().
+ */
+ nhdr = arc_hdr_alloc(spa, psize, lsize, protected,
+ compress, hdr->b_complevel, type, HDR_HAS_RABD(hdr));
+ ASSERT3P(nhdr->b_l1hdr.b_buf, ==, NULL);
+ ASSERT0(nhdr->b_l1hdr.b_bufcnt);
+ ASSERT0(zfs_refcount_count(&nhdr->b_l1hdr.b_refcnt));
+ VERIFY3U(nhdr->b_type, ==, type);
+ ASSERT(!HDR_SHARED_DATA(nhdr));
+
+ nhdr->b_l1hdr.b_buf = buf;
+ nhdr->b_l1hdr.b_bufcnt = 1;
+ if (ARC_BUF_ENCRYPTED(buf))
+ nhdr->b_crypt_hdr.b_ebufcnt = 1;
+ nhdr->b_l1hdr.b_mru_hits = 0;
+ nhdr->b_l1hdr.b_mru_ghost_hits = 0;
+ nhdr->b_l1hdr.b_mfu_hits = 0;
+ nhdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ nhdr->b_l1hdr.b_l2_hits = 0;
+ (void) zfs_refcount_add(&nhdr->b_l1hdr.b_refcnt, tag);
+ buf->b_hdr = nhdr;
+
+ mutex_exit(&buf->b_evict_lock);
+ (void) zfs_refcount_add_many(&arc_anon->arcs_size,
+ arc_buf_size(buf), buf);
+ } else {
+ mutex_exit(&buf->b_evict_lock);
+ ASSERT(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 1);
+ /* protected by hash lock, or hdr is on arc_anon */
+ ASSERT(!multilist_link_active(&hdr->b_l1hdr.b_arc_node));
+ ASSERT(!HDR_IO_IN_PROGRESS(hdr));
+ hdr->b_l1hdr.b_mru_hits = 0;
+ hdr->b_l1hdr.b_mru_ghost_hits = 0;
+ hdr->b_l1hdr.b_mfu_hits = 0;
+ hdr->b_l1hdr.b_mfu_ghost_hits = 0;
+ hdr->b_l1hdr.b_l2_hits = 0;
+ arc_change_state(arc_anon, hdr, hash_lock);
+ hdr->b_l1hdr.b_arc_access = 0;
+
+ mutex_exit(hash_lock);
+ buf_discard_identity(hdr);
+ arc_buf_thaw(buf);
+ }
+}
+
+int
+arc_released(arc_buf_t *buf)
+{
+ int released;
+
+ mutex_enter(&buf->b_evict_lock);
+ released = (buf->b_data != NULL &&
+ buf->b_hdr->b_l1hdr.b_state == arc_anon);
+ mutex_exit(&buf->b_evict_lock);
+ return (released);
+}
+
+#ifdef ZFS_DEBUG
+int
+arc_referenced(arc_buf_t *buf)
+{
+ int referenced;
+
+ mutex_enter(&buf->b_evict_lock);
+ referenced = (zfs_refcount_count(&buf->b_hdr->b_l1hdr.b_refcnt));
+ mutex_exit(&buf->b_evict_lock);
+ return (referenced);
+}
+#endif
+
+static void
+arc_write_ready(zio_t *zio)
+{
+ arc_write_callback_t *callback = zio->io_private;
+ arc_buf_t *buf = callback->awcb_buf;
+ arc_buf_hdr_t *hdr = buf->b_hdr;
+ blkptr_t *bp = zio->io_bp;
+ uint64_t psize = BP_IS_HOLE(bp) ? 0 : BP_GET_PSIZE(bp);
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ ASSERT(HDR_HAS_L1HDR(hdr));
+ ASSERT(!zfs_refcount_is_zero(&buf->b_hdr->b_l1hdr.b_refcnt));
+ ASSERT(hdr->b_l1hdr.b_bufcnt > 0);
+
+ /*
+ * If we're reexecuting this zio because the pool suspended, then
+ * clean up any state that was previously set the first time the
+ * callback was invoked.
+ */
+ if (zio->io_flags & ZIO_FLAG_REEXECUTED) {
+ arc_cksum_free(hdr);
+ arc_buf_unwatch(buf);
+ if (hdr->b_l1hdr.b_pabd != NULL) {
+ if (arc_buf_is_shared(buf)) {
+ arc_unshare_buf(hdr, buf);
+ } else {
+ arc_hdr_free_abd(hdr, B_FALSE);
+ }
+ }
+
+ if (HDR_HAS_RABD(hdr))
+ arc_hdr_free_abd(hdr, B_TRUE);
+ }
+ ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+ ASSERT(!HDR_HAS_RABD(hdr));
+ ASSERT(!HDR_SHARED_DATA(hdr));
+ ASSERT(!arc_buf_is_shared(buf));
+
+ callback->awcb_ready(zio, buf, callback->awcb_private);
+
+ if (HDR_IO_IN_PROGRESS(hdr))
+ ASSERT(zio->io_flags & ZIO_FLAG_REEXECUTED);
+
+ arc_hdr_set_flags(hdr, ARC_FLAG_IO_IN_PROGRESS);
+
+ if (BP_IS_PROTECTED(bp) != !!HDR_PROTECTED(hdr))
+ hdr = arc_hdr_realloc_crypt(hdr, BP_IS_PROTECTED(bp));
+
+ if (BP_IS_PROTECTED(bp)) {
+ /* ZIL blocks are written through zio_rewrite */
+ ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+ ASSERT(HDR_PROTECTED(hdr));
+
+ if (BP_SHOULD_BYTESWAP(bp)) {
+ if (BP_GET_LEVEL(bp) > 0) {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_UINT64;
+ } else {
+ hdr->b_l1hdr.b_byteswap =
+ DMU_OT_BYTESWAP(BP_GET_TYPE(bp));
+ }
+ } else {
+ hdr->b_l1hdr.b_byteswap = DMU_BSWAP_NUMFUNCS;
+ }
+
+ hdr->b_crypt_hdr.b_ot = BP_GET_TYPE(bp);
+ hdr->b_crypt_hdr.b_dsobj = zio->io_bookmark.zb_objset;
+ zio_crypt_decode_params_bp(bp, hdr->b_crypt_hdr.b_salt,
+ hdr->b_crypt_hdr.b_iv);
+ zio_crypt_decode_mac_bp(bp, hdr->b_crypt_hdr.b_mac);
+ }
+
+ /*
+ * If this block was written for raw encryption but the zio layer
+ * ended up only authenticating it, adjust the buffer flags now.
+ */
+ if (BP_IS_AUTHENTICATED(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ arc_hdr_set_flags(hdr, ARC_FLAG_NOAUTH);
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ if (BP_GET_COMPRESS(bp) == ZIO_COMPRESS_OFF)
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ } else if (BP_IS_HOLE(bp) && ARC_BUF_ENCRYPTED(buf)) {
+ buf->b_flags &= ~ARC_BUF_FLAG_ENCRYPTED;
+ buf->b_flags &= ~ARC_BUF_FLAG_COMPRESSED;
+ }
+
+ /* this must be done after the buffer flags are adjusted */
+ arc_cksum_compute(buf);
+
+ enum zio_compress compress;
+ if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) {
+ compress = ZIO_COMPRESS_OFF;
+ } else {
+ ASSERT3U(HDR_GET_LSIZE(hdr), ==, BP_GET_LSIZE(bp));
+ compress = BP_GET_COMPRESS(bp);
+ }
+ HDR_SET_PSIZE(hdr, psize);
+ arc_hdr_set_compress(hdr, compress);
+ hdr->b_complevel = zio->io_prop.zp_complevel;
+
+ if (zio->io_error != 0 || psize == 0)
+ goto out;
+
+ /*
+ * Fill the hdr with data. If the buffer is encrypted we have no choice
+ * but to copy the data into b_rabd. If the hdr is compressed, the data
+ * we want is available from the zio, otherwise we can take it from
+ * the buf.
+ *
+ * We might be able to share the buf's data with the hdr here. However,
+ * doing so would cause the ARC to be full of linear ABDs if we write a
+ * lot of shareable data. As a compromise, we check whether scattered
+ * ABDs are allowed, and assume that if they are then the user wants
+ * the ARC to be primarily filled with them regardless of the data being
+ * written. Therefore, if they're allowed then we allocate one and copy
+ * the data into it; otherwise, we share the data directly if we can.
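+ *
+ * (Note: zfs_abd_scatter_enabled defaults to B_TRUE, so in
+ * practice writes usually copy into a freshly allocated scatter
+ * ABD rather than sharing the caller's buffer.)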
+ */ + if (ARC_BUF_ENCRYPTED(buf)) { + ASSERT3U(psize, >, 0); + ASSERT(ARC_BUF_COMPRESSED(buf)); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); + } else if (zfs_abd_scatter_enabled || !arc_can_share(hdr, buf)) { + /* + * Ideally, we would always copy the io_abd into b_pabd, but the + * user may have disabled compressed ARC, thus we must check the + * hdr's compression setting rather than the io_bp's. + */ + if (BP_IS_ENCRYPTED(bp)) { + ASSERT3U(psize, >, 0); + arc_hdr_alloc_abd(hdr, + ARC_HDR_DO_ADAPT|ARC_HDR_ALLOC_RDATA); + abd_copy(hdr->b_crypt_hdr.b_rabd, zio->io_abd, psize); + } else if (arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF && + !ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(psize, >, 0); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + abd_copy(hdr->b_l1hdr.b_pabd, zio->io_abd, psize); + } else { + ASSERT3U(zio->io_orig_size, ==, arc_hdr_size(hdr)); + arc_hdr_alloc_abd(hdr, ARC_HDR_DO_ADAPT); + abd_copy_from_buf(hdr->b_l1hdr.b_pabd, buf->b_data, + arc_buf_size(buf)); + } + } else { + ASSERT3P(buf->b_data, ==, abd_to_buf(zio->io_orig_abd)); + ASSERT3U(zio->io_orig_size, ==, arc_buf_size(buf)); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, ==, 1); + + arc_share_buf(hdr, buf); + } + +out: + arc_hdr_verify(hdr, bp); + spl_fstrans_unmark(cookie); +} + +static void +arc_write_children_ready(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + + callback->awcb_children_ready(zio, buf, callback->awcb_private); +} + +/* + * The SPA calls this callback for each physical write that happens on behalf + * of a logical write. See the comment in dbuf_write_physdone() for details. + */ +static void +arc_write_physdone(zio_t *zio) +{ + arc_write_callback_t *cb = zio->io_private; + if (cb->awcb_physdone != NULL) + cb->awcb_physdone(zio, cb->awcb_buf, cb->awcb_private); +} + +static void +arc_write_done(zio_t *zio) +{ + arc_write_callback_t *callback = zio->io_private; + arc_buf_t *buf = callback->awcb_buf; + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + + if (zio->io_error == 0) { + arc_hdr_verify(hdr, zio->io_bp); + + if (BP_IS_HOLE(zio->io_bp) || BP_IS_EMBEDDED(zio->io_bp)) { + buf_discard_identity(hdr); + } else { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + } + } else { + ASSERT(HDR_EMPTY(hdr)); + } + + /* + * If the block to be written was all-zero or compressed enough to be + * embedded in the BP, no write was performed so there will be no + * dva/birth/checksum. The buffer must therefore remain anonymous + * (and uncached). + */ + if (!HDR_EMPTY(hdr)) { + arc_buf_hdr_t *exists; + kmutex_t *hash_lock; + + ASSERT3U(zio->io_error, ==, 0); + + arc_cksum_verify(buf); + + exists = buf_hash_insert(hdr, &hash_lock); + if (exists != NULL) { + /* + * This can only happen if we overwrite for + * sync-to-convergence, because we remove + * buffers from the hash table when we arc_free(). 
+ */ + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(zfs_refcount_is_zero( + &exists->b_l1hdr.b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + arc_hdr_destroy(exists); + mutex_exit(hash_lock); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else if (zio->io_flags & ZIO_FLAG_NOPWRITE) { + /* nopwrite */ + ASSERT(zio->io_prop.zp_nopwrite); + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad nopwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + } else { + /* Dedup */ + ASSERT(hdr->b_l1hdr.b_bufcnt == 1); + ASSERT(hdr->b_l1hdr.b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } + } + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + /* if it's not anon, we are doing a scrub */ + if (exists == NULL && hdr->b_l1hdr.b_state == arc_anon) + arc_access(hdr, hash_lock); + mutex_exit(hash_lock); + } else { + arc_hdr_clear_flags(hdr, ARC_FLAG_IO_IN_PROGRESS); + } + + ASSERT(!zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); + + abd_put(zio->io_abd); + kmem_free(callback, sizeof (arc_write_callback_t)); +} + +zio_t * +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, + const zio_prop_t *zp, arc_write_done_func_t *ready, + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, + int zio_flags, const zbookmark_phys_t *zb) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + arc_write_callback_t *callback; + zio_t *zio; + zio_prop_t localprop = *zp; + + ASSERT3P(ready, !=, NULL); + ASSERT3P(done, !=, NULL); + ASSERT(!HDR_IO_ERROR(hdr)); + ASSERT(!HDR_IO_IN_PROGRESS(hdr)); + ASSERT3P(hdr->b_l1hdr.b_acb, ==, NULL); + ASSERT3U(hdr->b_l1hdr.b_bufcnt, >, 0); + if (l2arc) + arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); + + if (ARC_BUF_ENCRYPTED(buf)) { + ASSERT(ARC_BUF_COMPRESSED(buf)); + localprop.zp_encrypt = B_TRUE; + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_complevel = hdr->b_complevel; + localprop.zp_byteorder = + (hdr->b_l1hdr.b_byteswap == DMU_BSWAP_NUMFUNCS) ? + ZFS_HOST_BYTEORDER : !ZFS_HOST_BYTEORDER; + bcopy(hdr->b_crypt_hdr.b_salt, localprop.zp_salt, + ZIO_DATA_SALT_LEN); + bcopy(hdr->b_crypt_hdr.b_iv, localprop.zp_iv, + ZIO_DATA_IV_LEN); + bcopy(hdr->b_crypt_hdr.b_mac, localprop.zp_mac, + ZIO_DATA_MAC_LEN); + if (DMU_OT_IS_ENCRYPTED(localprop.zp_type)) { + localprop.zp_nopwrite = B_FALSE; + localprop.zp_copies = + MIN(localprop.zp_copies, SPA_DVAS_PER_BP - 1); + } + zio_flags |= ZIO_FLAG_RAW; + } else if (ARC_BUF_COMPRESSED(buf)) { + ASSERT3U(HDR_GET_LSIZE(hdr), !=, arc_buf_size(buf)); + localprop.zp_compress = HDR_GET_COMPRESS(hdr); + localprop.zp_complevel = hdr->b_complevel; + zio_flags |= ZIO_FLAG_RAW_COMPRESS; + } + callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); + callback->awcb_ready = ready; + callback->awcb_children_ready = children_ready; + callback->awcb_physdone = physdone; + callback->awcb_done = done; + callback->awcb_private = private; + callback->awcb_buf = buf; + + /* + * The hdr's b_pabd is now stale, free it now. A new data block + * will be allocated when the zio pipeline calls arc_write_ready(). 
+ */
+	if (hdr->b_l1hdr.b_pabd != NULL) {
+		/*
+		 * If the buf is currently sharing the data block with
+		 * the hdr then we need to break that relationship here.
+		 * The hdr will remain with a NULL data pointer and the
+		 * buf will take sole ownership of the block.
+		 */
+		if (arc_buf_is_shared(buf)) {
+			arc_unshare_buf(hdr, buf);
+		} else {
+			arc_hdr_free_abd(hdr, B_FALSE);
+		}
+		VERIFY3P(buf->b_data, !=, NULL);
+	}
+
+	if (HDR_HAS_RABD(hdr))
+		arc_hdr_free_abd(hdr, B_TRUE);
+
+	if (!(zio_flags & ZIO_FLAG_RAW))
+		arc_hdr_set_compress(hdr, ZIO_COMPRESS_OFF);
+
+	ASSERT(!arc_buf_is_shared(buf));
+	ASSERT3P(hdr->b_l1hdr.b_pabd, ==, NULL);
+
+	zio = zio_write(pio, spa, txg, bp,
+	    abd_get_from_buf(buf->b_data, HDR_GET_LSIZE(hdr)),
+	    HDR_GET_LSIZE(hdr), arc_buf_size(buf), &localprop, arc_write_ready,
+	    (children_ready != NULL) ? arc_write_children_ready : NULL,
+	    arc_write_physdone, arc_write_done, callback,
+	    priority, zio_flags, zb);
+
+	return (zio);
+}
+
+void
+arc_tempreserve_clear(uint64_t reserve)
+{
+	atomic_add_64(&arc_tempreserve, -reserve);
+	ASSERT((int64_t)arc_tempreserve >= 0);
+}
+
+int
+arc_tempreserve_space(spa_t *spa, uint64_t reserve, uint64_t txg)
+{
+	int error;
+	uint64_t anon_size;
+
+	if (!arc_no_grow &&
+	    reserve > arc_c/4 &&
+	    reserve * 4 > (2ULL << SPA_MAXBLOCKSHIFT))
+		arc_c = MIN(arc_c_max, reserve * 4);
+
+	/*
+	 * Throttle when the calculated memory footprint for the TXG
+	 * exceeds the target ARC size.
+	 */
+	if (reserve > arc_c) {
+		DMU_TX_STAT_BUMP(dmu_tx_memory_reserve);
+		return (SET_ERROR(ERESTART));
+	}
+
+	/*
+	 * Don't count loaned bufs as in-flight dirty data to prevent long
+	 * network delays from blocking transactions that are ready to be
+	 * assigned to a txg.
+	 */
+
+	/* assert that it has not wrapped around */
+	ASSERT3S(atomic_add_64_nv(&arc_loaned_bytes, 0), >=, 0);
+
+	anon_size = MAX((int64_t)(zfs_refcount_count(&arc_anon->arcs_size) -
+	    arc_loaned_bytes), 0);
+
+	/*
+	 * Writes will, almost always, require additional memory allocations
+	 * in order to compress/encrypt/etc the data. We therefore need to
+	 * make sure that there is sufficient available memory for this.
+	 */
+	error = arc_memory_throttle(spa, reserve, txg);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Throttle writes when the amount of dirty data in the cache
+	 * gets too large. We try to keep the cache less than half full
+	 * of dirty blocks so that our sync times don't grow too large.
+	 *
+	 * In the case of one pool being built on another pool, we want
+	 * to make sure we don't end up throttling the lower (backing)
+	 * pool when the upper pool is the majority contributor to dirty
+	 * data. To ensure we make forward progress during throttling, we
+	 * also check the current pool's net dirty data and only throttle
+	 * if it exceeds zfs_arc_pool_dirty_percent of the anonymous dirty
+	 * data in the cache.
+	 *
+	 * Note: if two requests come in concurrently, we might let them
+	 * both succeed, when one of them should fail. Not a huge deal.
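+	 *
+	 * As a worked example, assuming the default tunables (50 for
+	 * zfs_arc_dirty_limit_percent, 25 for zfs_arc_anon_limit_percent
+	 * and 20 for zfs_arc_pool_dirty_percent; those defaults are
+	 * assumed here, not restated by this function) and an arc_c of
+	 * 8 GiB, we throttle only if all three of the following hold:
+	 *
+	 *	total_dirty    > 8 GiB * 50 / 100 = 4 GiB
+	 *	anon_size      > 8 GiB * 25 / 100 = 2 GiB
+	 *	spa_dirty_anon > anon_size * 20 / 100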
+ */ + uint64_t total_dirty = reserve + arc_tempreserve + anon_size; + uint64_t spa_dirty_anon = spa_dirty_data(spa); + + if (total_dirty > arc_c * zfs_arc_dirty_limit_percent / 100 && + anon_size > arc_c * zfs_arc_anon_limit_percent / 100 && + spa_dirty_anon > anon_size * zfs_arc_pool_dirty_percent / 100) { +#ifdef ZFS_DEBUG + uint64_t meta_esize = zfs_refcount_count( + &arc_anon->arcs_esize[ARC_BUFC_METADATA]); + uint64_t data_esize = + zfs_refcount_count(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " + "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", + arc_tempreserve >> 10, meta_esize >> 10, + data_esize >> 10, reserve >> 10, arc_c >> 10); +#endif + DMU_TX_STAT_BUMP(dmu_tx_dirty_throttle); + return (SET_ERROR(ERESTART)); + } + atomic_add_64(&arc_tempreserve, reserve); + return (0); +} + +static void +arc_kstat_update_state(arc_state_t *state, kstat_named_t *size, + kstat_named_t *evict_data, kstat_named_t *evict_metadata) +{ + size->value.ui64 = zfs_refcount_count(&state->arcs_size); + evict_data->value.ui64 = + zfs_refcount_count(&state->arcs_esize[ARC_BUFC_DATA]); + evict_metadata->value.ui64 = + zfs_refcount_count(&state->arcs_esize[ARC_BUFC_METADATA]); +} + +static int +arc_kstat_update(kstat_t *ksp, int rw) +{ + arc_stats_t *as = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + return (SET_ERROR(EACCES)); + } else { + arc_kstat_update_state(arc_anon, + &as->arcstat_anon_size, + &as->arcstat_anon_evictable_data, + &as->arcstat_anon_evictable_metadata); + arc_kstat_update_state(arc_mru, + &as->arcstat_mru_size, + &as->arcstat_mru_evictable_data, + &as->arcstat_mru_evictable_metadata); + arc_kstat_update_state(arc_mru_ghost, + &as->arcstat_mru_ghost_size, + &as->arcstat_mru_ghost_evictable_data, + &as->arcstat_mru_ghost_evictable_metadata); + arc_kstat_update_state(arc_mfu, + &as->arcstat_mfu_size, + &as->arcstat_mfu_evictable_data, + &as->arcstat_mfu_evictable_metadata); + arc_kstat_update_state(arc_mfu_ghost, + &as->arcstat_mfu_ghost_size, + &as->arcstat_mfu_ghost_evictable_data, + &as->arcstat_mfu_ghost_evictable_metadata); + + ARCSTAT(arcstat_size) = aggsum_value(&arc_size); + ARCSTAT(arcstat_meta_used) = aggsum_value(&arc_meta_used); + ARCSTAT(arcstat_data_size) = aggsum_value(&astat_data_size); + ARCSTAT(arcstat_metadata_size) = + aggsum_value(&astat_metadata_size); + ARCSTAT(arcstat_hdr_size) = aggsum_value(&astat_hdr_size); + ARCSTAT(arcstat_l2_hdr_size) = aggsum_value(&astat_l2_hdr_size); + ARCSTAT(arcstat_dbuf_size) = aggsum_value(&astat_dbuf_size); +#if defined(COMPAT_FREEBSD11) + ARCSTAT(arcstat_other_size) = aggsum_value(&astat_bonus_size) + + aggsum_value(&astat_dnode_size) + + aggsum_value(&astat_dbuf_size); +#endif + ARCSTAT(arcstat_dnode_size) = aggsum_value(&astat_dnode_size); + ARCSTAT(arcstat_bonus_size) = aggsum_value(&astat_bonus_size); + ARCSTAT(arcstat_abd_chunk_waste_size) = + aggsum_value(&astat_abd_chunk_waste_size); + + as->arcstat_memory_all_bytes.value.ui64 = + arc_all_memory(); + as->arcstat_memory_free_bytes.value.ui64 = + arc_free_memory(); + as->arcstat_memory_available_bytes.value.i64 = + arc_available_memory(); + } + + return (0); +} + +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the ARC eviction + * code is laid out; arc_evict_state() assumes ARC buffers are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. 
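+ *
+ * For example, with 8 sublists a given header always maps to sublist
+ *
+ *	buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) % 8
+ *
+ * both at insertion and at removal, so no per-header sublist index ever
+ * needs to be stored.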
+ */
+static unsigned int
+arc_state_multilist_index_func(multilist_t *ml, void *obj)
+{
+	arc_buf_hdr_t *hdr = obj;
+
+	/*
+	 * We rely on b_dva to generate evenly distributed index
+	 * numbers using buf_hash below. So, as an added precaution,
+	 * let's make sure we never add empty buffers to the arc lists.
+	 */
+	ASSERT(!HDR_EMPTY(hdr));
+
+	/*
+	 * The assumption here is that the hash value for a given
+	 * arc_buf_hdr_t will remain constant throughout its lifetime
+	 * (i.e. its b_spa, b_dva, and b_birth fields don't change).
+	 * Thus, we don't need to store the header's sublist index
+	 * on insertion, as this index can be recalculated on removal.
+	 *
+	 * Also, the low order bits of the hash value are thought to be
+	 * distributed evenly. Otherwise, in the case that the multilist
+	 * has a power of two number of sublists, each sublist's usage
+	 * would not be evenly distributed.
+	 */
+	return (buf_hash(hdr->b_spa, &hdr->b_dva, hdr->b_birth) %
+	    multilist_get_num_sublists(ml));
+}
+
+#define	WARN_IF_TUNING_IGNORED(tuning, value, do_warn) do {	\
+	if ((do_warn) && (tuning) && ((tuning) != (value))) {	\
+		cmn_err(CE_WARN,				\
+		    "ignoring tunable %s (using %llu instead)",	\
+		    (#tuning), (value));			\
+	}							\
+} while (0)
+
+/*
+ * Called during module initialization and periodically thereafter to
+ * apply reasonable changes to the exposed performance tunings. Can also be
+ * called explicitly by param_set_arc_*() functions when ARC tunables are
+ * updated manually. Non-zero zfs_* values which differ from the currently set
+ * values will be applied.
+ */
+void
+arc_tuning_update(boolean_t verbose)
+{
+	uint64_t allmem = arc_all_memory();
+	unsigned long limit;
+
+	/* Valid range: 32M - <arc_c_max> */
+	if ((zfs_arc_min) && (zfs_arc_min != arc_c_min) &&
+	    (zfs_arc_min >= 2ULL << SPA_MAXBLOCKSHIFT) &&
+	    (zfs_arc_min <= arc_c_max)) {
+		arc_c_min = zfs_arc_min;
+		arc_c = MAX(arc_c, arc_c_min);
+	}
+	WARN_IF_TUNING_IGNORED(zfs_arc_min, arc_c_min, verbose);
+
+	/* Valid range: 64M - <all physical memory> */
+	if ((zfs_arc_max) && (zfs_arc_max != arc_c_max) &&
+	    (zfs_arc_max >= 64 << 20) && (zfs_arc_max < allmem) &&
+	    (zfs_arc_max > arc_c_min)) {
+		arc_c_max = zfs_arc_max;
+		arc_c = MIN(arc_c, arc_c_max);
+		arc_p = (arc_c >> 1);
+		if (arc_meta_limit > arc_c_max)
+			arc_meta_limit = arc_c_max;
+		if (arc_dnode_size_limit > arc_meta_limit)
+			arc_dnode_size_limit = arc_meta_limit;
+	}
+	WARN_IF_TUNING_IGNORED(zfs_arc_max, arc_c_max, verbose);
+
+	/* Valid range: 16M - <arc_c_max> */
+	if ((zfs_arc_meta_min) && (zfs_arc_meta_min != arc_meta_min) &&
+	    (zfs_arc_meta_min >= 1ULL << SPA_MAXBLOCKSHIFT) &&
+	    (zfs_arc_meta_min <= arc_c_max)) {
+		arc_meta_min = zfs_arc_meta_min;
+		if (arc_meta_limit < arc_meta_min)
+			arc_meta_limit = arc_meta_min;
+		if (arc_dnode_size_limit < arc_meta_min)
+			arc_dnode_size_limit = arc_meta_min;
+	}
+	WARN_IF_TUNING_IGNORED(zfs_arc_meta_min, arc_meta_min, verbose);
+
+	/* Valid range: <arc_meta_min> - <arc_c_max> */
+	limit = zfs_arc_meta_limit ? zfs_arc_meta_limit :
+	    MIN(zfs_arc_meta_limit_percent, 100) * arc_c_max / 100;
+	if ((limit != arc_meta_limit) &&
+	    (limit >= arc_meta_min) &&
+	    (limit <= arc_c_max))
+		arc_meta_limit = limit;
+	WARN_IF_TUNING_IGNORED(zfs_arc_meta_limit, arc_meta_limit, verbose);
+
+	/* Valid range: <arc_meta_min> - <arc_meta_limit> */
+	limit = zfs_arc_dnode_limit ? zfs_arc_dnode_limit :
+	    MIN(zfs_arc_dnode_limit_percent, 100) * arc_meta_limit / 100;
+	if ((limit != arc_dnode_size_limit) &&
+	    (limit >= arc_meta_min) &&
+	    (limit <= arc_meta_limit))
+		arc_dnode_size_limit = limit;
+	WARN_IF_TUNING_IGNORED(zfs_arc_dnode_limit, arc_dnode_size_limit,
+	    verbose);
+
+	/* Valid range: 1 - N */
+	if (zfs_arc_grow_retry)
+		arc_grow_retry = zfs_arc_grow_retry;
+
+	/* Valid range: 1 - N */
+	if (zfs_arc_shrink_shift) {
+		arc_shrink_shift = zfs_arc_shrink_shift;
+		arc_no_grow_shift = MIN(arc_no_grow_shift, arc_shrink_shift - 1);
+	}
+
+	/* Valid range: 1 - N */
+	if (zfs_arc_p_min_shift)
+		arc_p_min_shift = zfs_arc_p_min_shift;
+
+	/* Valid range: 1 - N ms */
+	if (zfs_arc_min_prefetch_ms)
+		arc_min_prefetch_ms = zfs_arc_min_prefetch_ms;
+
+	/* Valid range: 1 - N ms */
+	if (zfs_arc_min_prescient_prefetch_ms) {
+		arc_min_prescient_prefetch_ms =
+		    zfs_arc_min_prescient_prefetch_ms;
+	}
+
+	/* Valid range: 0 - 100 */
+	if ((zfs_arc_lotsfree_percent >= 0) &&
+	    (zfs_arc_lotsfree_percent <= 100))
+		arc_lotsfree_percent = zfs_arc_lotsfree_percent;
+	WARN_IF_TUNING_IGNORED(zfs_arc_lotsfree_percent, arc_lotsfree_percent,
+	    verbose);
+
+	/* Valid range: 0 - <all physical memory> */
+	if ((zfs_arc_sys_free) && (zfs_arc_sys_free != arc_sys_free))
+		arc_sys_free = MIN(MAX(zfs_arc_sys_free, 0), allmem);
+	WARN_IF_TUNING_IGNORED(zfs_arc_sys_free, arc_sys_free, verbose);
+}
+
+static void
+arc_state_init(void)
+{
+	arc_anon = &ARC_anon;
+	arc_mru = &ARC_mru;
+	arc_mru_ghost = &ARC_mru_ghost;
+	arc_mfu = &ARC_mfu;
+	arc_mfu_ghost = &ARC_mfu_ghost;
+	arc_l2c_only = &ARC_l2c_only;
+
+	arc_mru->arcs_list[ARC_BUFC_METADATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mru->arcs_list[ARC_BUFC_DATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mru_ghost->arcs_list[ARC_BUFC_METADATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mru_ghost->arcs_list[ARC_BUFC_DATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mfu->arcs_list[ARC_BUFC_METADATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mfu->arcs_list[ARC_BUFC_DATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_mfu_ghost->arcs_list[ARC_BUFC_DATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_l2c_only->arcs_list[ARC_BUFC_METADATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+	arc_l2c_only->arcs_list[ARC_BUFC_DATA] =
+	    multilist_create(sizeof (arc_buf_hdr_t),
+	    offsetof(arc_buf_hdr_t, b_l1hdr.b_arc_node),
+	    arc_state_multilist_index_func);
+
+	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_METADATA]);
+	zfs_refcount_create(&arc_anon->arcs_esize[ARC_BUFC_DATA]);
+	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_METADATA]);
+	zfs_refcount_create(&arc_mru->arcs_esize[ARC_BUFC_DATA]);
+
zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_create(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + zfs_refcount_create(&arc_anon->arcs_size); + zfs_refcount_create(&arc_mru->arcs_size); + zfs_refcount_create(&arc_mru_ghost->arcs_size); + zfs_refcount_create(&arc_mfu->arcs_size); + zfs_refcount_create(&arc_mfu_ghost->arcs_size); + zfs_refcount_create(&arc_l2c_only->arcs_size); + + aggsum_init(&arc_meta_used, 0); + aggsum_init(&arc_size, 0); + aggsum_init(&astat_data_size, 0); + aggsum_init(&astat_metadata_size, 0); + aggsum_init(&astat_hdr_size, 0); + aggsum_init(&astat_l2_hdr_size, 0); + aggsum_init(&astat_bonus_size, 0); + aggsum_init(&astat_dnode_size, 0); + aggsum_init(&astat_dbuf_size, 0); + aggsum_init(&astat_abd_chunk_waste_size, 0); + + arc_anon->arcs_state = ARC_STATE_ANON; + arc_mru->arcs_state = ARC_STATE_MRU; + arc_mru_ghost->arcs_state = ARC_STATE_MRU_GHOST; + arc_mfu->arcs_state = ARC_STATE_MFU; + arc_mfu_ghost->arcs_state = ARC_STATE_MFU_GHOST; + arc_l2c_only->arcs_state = ARC_STATE_L2C_ONLY; +} + +static void +arc_state_fini(void) +{ + zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_anon->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mru_ghost->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_esize[ARC_BUFC_DATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_METADATA]); + zfs_refcount_destroy(&arc_l2c_only->arcs_esize[ARC_BUFC_DATA]); + + zfs_refcount_destroy(&arc_anon->arcs_size); + zfs_refcount_destroy(&arc_mru->arcs_size); + zfs_refcount_destroy(&arc_mru_ghost->arcs_size); + zfs_refcount_destroy(&arc_mfu->arcs_size); + zfs_refcount_destroy(&arc_mfu_ghost->arcs_size); + zfs_refcount_destroy(&arc_l2c_only->arcs_size); + + multilist_destroy(arc_mru->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_mru->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_mru_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_mfu->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_mfu_ghost->arcs_list[ARC_BUFC_DATA]); + multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_METADATA]); + multilist_destroy(arc_l2c_only->arcs_list[ARC_BUFC_DATA]); + + aggsum_fini(&arc_meta_used); + aggsum_fini(&arc_size); + aggsum_fini(&astat_data_size); + aggsum_fini(&astat_metadata_size); + aggsum_fini(&astat_hdr_size); + aggsum_fini(&astat_l2_hdr_size); + aggsum_fini(&astat_bonus_size); + aggsum_fini(&astat_dnode_size); + aggsum_fini(&astat_dbuf_size); + aggsum_fini(&astat_abd_chunk_waste_size); +} + +uint64_t 
+arc_target_bytes(void)
+{
+	return (arc_c);
+}
+
+void
+arc_init(void)
+{
+	uint64_t percent, allmem = arc_all_memory();
+	mutex_init(&arc_evict_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_create(&arc_evict_waiters, sizeof (arc_evict_waiter_t),
+	    offsetof(arc_evict_waiter_t, aew_node));
+
+	arc_min_prefetch_ms = 1000;
+	arc_min_prescient_prefetch_ms = 6000;
+
+#if defined(_KERNEL)
+	arc_lowmem_init();
+#endif
+
+	/* Set min cache to 1/32 of all memory, or 32MB, whichever is more. */
+	arc_c_min = MAX(allmem / 32, 2ULL << SPA_MAXBLOCKSHIFT);
+
+	/* How to set default max varies by platform. */
+	arc_c_max = arc_default_max(arc_c_min, allmem);
+
+#ifndef _KERNEL
+	/*
+	 * In userland, there's only the memory pressure that we artificially
+	 * create (see arc_available_memory()). Don't let arc_c get too
+	 * small, because it can cause transactions to be larger than
+	 * arc_c, causing arc_tempreserve_space() to fail.
+	 */
+	arc_c_min = MAX(arc_c_max / 2, 2ULL << SPA_MAXBLOCKSHIFT);
+#endif
+
+	arc_c = arc_c_min;
+	arc_p = (arc_c >> 1);
+
+	/* Set min to 1/2 of arc_c_min */
+	arc_meta_min = 1ULL << SPA_MAXBLOCKSHIFT;
+	/* Initialize maximum observed usage to zero */
+	arc_meta_max = 0;
+	/*
+	 * Set arc_meta_limit to a percent of arc_c_max with a floor of
+	 * arc_meta_min, and a ceiling of arc_c_max.
+	 */
+	percent = MIN(zfs_arc_meta_limit_percent, 100);
+	arc_meta_limit = MAX(arc_meta_min, (percent * arc_c_max) / 100);
+	percent = MIN(zfs_arc_dnode_limit_percent, 100);
+	arc_dnode_size_limit = (percent * arc_meta_limit) / 100;
+
+	/* Apply user specified tunings */
+	arc_tuning_update(B_TRUE);
+
+	/* If kmem_flags are set, let's try to use less memory */
+	if (kmem_debugging())
+		arc_c = arc_c / 2;
+	if (arc_c < arc_c_min)
+		arc_c = arc_c_min;
+
+	arc_state_init();
+
+	buf_init();
+
+	list_create(&arc_prune_list, sizeof (arc_prune_t),
+	    offsetof(arc_prune_t, p_node));
+	mutex_init(&arc_prune_mtx, NULL, MUTEX_DEFAULT, NULL);
+
+	arc_prune_taskq = taskq_create("arc_prune", boot_ncpus, defclsyspri,
+	    boot_ncpus, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC);
+
+	arc_ksp = kstat_create("zfs", 0, "arcstats", "misc", KSTAT_TYPE_NAMED,
+	    sizeof (arc_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+
+	if (arc_ksp != NULL) {
+		arc_ksp->ks_data = &arc_stats;
+		arc_ksp->ks_update = arc_kstat_update;
+		kstat_install(arc_ksp);
+	}
+
+	arc_evict_zthr = zthr_create_timer("arc_evict",
+	    arc_evict_cb_check, arc_evict_cb, NULL, SEC2NSEC(1));
+	arc_reap_zthr = zthr_create_timer("arc_reap",
+	    arc_reap_cb_check, arc_reap_cb, NULL, SEC2NSEC(1));
+
+	arc_warm = B_FALSE;
+
+	/*
+	 * Calculate maximum amount of dirty data per pool.
+	 *
+	 * If it has been set by a module parameter, take that.
+	 * Otherwise, use a percentage of physical memory defined by
+	 * zfs_dirty_data_max_percent (default 10%) with a cap at
+	 * zfs_dirty_data_max_max (default 4G or 25% of physical memory).
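+	 *
+	 * As a worked example on a 64-bit system with 16 GiB of memory
+	 * and the defaults noted above (10% of memory, capped at 4G or
+	 * 25% of memory):
+	 *
+	 *	zfs_dirty_data_max_max = MIN(4 GiB, 16 GiB * 25 / 100) = 4 GiB
+	 *	zfs_dirty_data_max = MIN(16 GiB * 10 / 100, 4 GiB) = 1.6 GiB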
+ */
+#ifdef __LP64__
+	if (zfs_dirty_data_max_max == 0)
+		zfs_dirty_data_max_max = MIN(4ULL * 1024 * 1024 * 1024,
+		    allmem * zfs_dirty_data_max_max_percent / 100);
+#else
+	if (zfs_dirty_data_max_max == 0)
+		zfs_dirty_data_max_max = MIN(1ULL * 1024 * 1024 * 1024,
+		    allmem * zfs_dirty_data_max_max_percent / 100);
+#endif
+
+	if (zfs_dirty_data_max == 0) {
+		zfs_dirty_data_max = allmem *
+		    zfs_dirty_data_max_percent / 100;
+		zfs_dirty_data_max = MIN(zfs_dirty_data_max,
+		    zfs_dirty_data_max_max);
+	}
+}
+
+void
+arc_fini(void)
+{
+	arc_prune_t *p;
+
+#ifdef _KERNEL
+	arc_lowmem_fini();
+#endif /* _KERNEL */
+
+	/* Use B_TRUE to ensure *all* buffers are evicted */
+	arc_flush(NULL, B_TRUE);
+
+	if (arc_ksp != NULL) {
+		kstat_delete(arc_ksp);
+		arc_ksp = NULL;
+	}
+
+	taskq_wait(arc_prune_taskq);
+	taskq_destroy(arc_prune_taskq);
+
+	mutex_enter(&arc_prune_mtx);
+	while ((p = list_head(&arc_prune_list)) != NULL) {
+		list_remove(&arc_prune_list, p);
+		zfs_refcount_remove(&p->p_refcnt, &arc_prune_list);
+		zfs_refcount_destroy(&p->p_refcnt);
+		kmem_free(p, sizeof (*p));
+	}
+	mutex_exit(&arc_prune_mtx);
+
+	list_destroy(&arc_prune_list);
+	mutex_destroy(&arc_prune_mtx);
+
+	(void) zthr_cancel(arc_evict_zthr);
+	(void) zthr_cancel(arc_reap_zthr);
+
+	mutex_destroy(&arc_evict_lock);
+	list_destroy(&arc_evict_waiters);
+
+	/*
+	 * Free any buffers that were tagged for destruction. This needs
+	 * to occur before arc_state_fini() runs and destroys the aggsum
+	 * values which are updated when freeing scatter ABDs.
+	 */
+	l2arc_do_free_on_write();
+
+	/*
+	 * buf_fini() must precede arc_state_fini() because buf_fini() may
+	 * trigger the release of kmem magazines, which can callback to
+	 * arc_space_return() which accesses aggsums freed in arc_state_fini().
+	 */
+	buf_fini();
+	arc_state_fini();
+
+	/*
+	 * We destroy the zthrs after all the ARC state has been
+	 * torn down to avoid the case of them receiving any
+	 * wakeup() signals after they are destroyed.
+	 */
+	zthr_destroy(arc_evict_zthr);
+	zthr_destroy(arc_reap_zthr);
+
+	ASSERT0(arc_loaned_bytes);
+}
+
+/*
+ * Level 2 ARC
+ *
+ * The level 2 ARC (L2ARC) is a cache layer in-between main memory and disk.
+ * It uses dedicated storage devices to hold cached data, which are populated
+ * using large infrequent writes. The main role of this cache is to boost
+ * the performance of random read workloads. The intended L2ARC devices
+ * include short-stroked disks, solid-state disks, and other media with
+ * substantially faster read latency than disk.
+ *
+ *                 +-----------------------+
+ *                 |         ARC           |
+ *                 +-----------------------+
+ *                    |         ^     ^
+ *                    |         |     |
+ *      l2arc_feed_thread()    arc_read()
+ *                    |         |     |
+ *                    |  l2arc read   |
+ *                    V         |     |
+ *               +---------------+    |
+ *               |     L2ARC     |    |
+ *               +---------------+    |
+ *                   |    ^           |
+ *          l2arc_write() |           |
+ *                   |    |           |
+ *                   V    |           |
+ *                 +-------+      +-------+
+ *                 | vdev  |      | vdev  |
+ *                 | cache |      | cache |
+ *                 +-------+      +-------+
+ *                 +=========+     .-----.
+ *                 :  L2ARC  :    |-_____-|
+ *                 : devices :    | Disks |
+ *                 +=========+    `-_____-'
+ *
+ * Read requests are satisfied from the following sources, in order:
+ *
+ *	1) ARC
+ *	2) vdev cache of L2ARC devices
+ *	3) L2ARC devices
+ *	4) vdev cache of disks
+ *	5) disks
+ *
+ * Some L2ARC device types exhibit extremely slow write performance.
+ * To accommodate this there are some significant differences between
+ * the L2ARC and traditional cache design:
+ *
+ * 1. There is no eviction path from the ARC to the L2ARC. Evictions from
+ *    the ARC behave as usual, freeing buffers and placing headers on ghost
+ *    lists. The ARC does not send buffers to the L2ARC during eviction as
+ *    this would add inflated write latencies for all ARC memory pressure.
+ *
+ * 2. The L2ARC attempts to cache data from the ARC before it is evicted.
+ *    It does this by periodically scanning buffers from the eviction-end of
+ *    the MFU and MRU ARC lists, copying them to the L2ARC devices if they are
+ *    not already there. It scans until a headroom of buffers is satisfied,
+ *    which itself is a buffer for ARC eviction. If a compressible buffer is
+ *    found during scanning and selected for writing to an L2ARC device, we
+ *    temporarily boost scanning headroom during the next scan cycle to make
+ *    sure we adapt to compression effects (which might significantly reduce
+ *    the data volume we write to L2ARC). The thread that does this is
+ *    l2arc_feed_thread(), illustrated below; example sizes are included to
+ *    provide a better sense of ratio than this diagram:
+ *
+ *            head -->                        tail
+ *             +---------------------+----------+
+ *     ARC_mfu |:::::#:::::::::::::::|o#o###o###|-->.  # already on L2ARC
+ *             +---------------------+----------+  |   o L2ARC eligible
+ *     ARC_mru |:#:::::::::::::::::::|#o#ooo####|-->|   : ARC buffer
+ *             +---------------------+----------+  |
+ *                  15.9 Gbytes      ^ 32 Mbytes   |
+ *                                headroom         |
+ *                                          l2arc_feed_thread()
+ *                                                 |
+ *                     l2arc write hand <--[oooo]--'
+ *                             |          8 Mbyte
+ *                             |         write max
+ *                             V
+ *               +==============================+
+ *     L2ARC dev |####|#|###|###|    |####| ... |
+ *               +==============================+
+ *                            32 Gbytes
+ *
+ * 3. If an ARC buffer is copied to the L2ARC but then hit instead of
+ *    evicted, then the L2ARC has cached a buffer much sooner than it probably
+ *    needed to, potentially wasting L2ARC device bandwidth and storage. It is
+ *    safe to say that this is an uncommon case, since buffers at the end of
+ *    the ARC lists have moved there due to inactivity.
+ *
+ * 4. If the ARC evicts faster than the L2ARC can maintain a headroom,
+ *    then the L2ARC simply misses copying some buffers. This serves as a
+ *    pressure valve to prevent heavy read workloads from both stalling the ARC
+ *    with waits and clogging the L2ARC with writes. This also helps prevent
+ *    the potential for the L2ARC to churn if it attempts to cache content too
+ *    quickly, such as during backups of the entire pool.
+ *
+ * 5. After system boot and before the ARC has filled main memory, there are
+ *    no evictions from the ARC and so the tails of the ARC_mfu and ARC_mru
+ *    lists can remain mostly static. Instead of searching from the tail of
+ *    these lists as pictured, the l2arc_feed_thread() will search from the
+ *    list heads for eligible buffers, greatly increasing its chance of
+ *    finding them.
+ *
+ *    The L2ARC device write speed is also boosted during this time so that
+ *    the L2ARC warms up faster. Since there have been no ARC evictions yet,
+ *    there are no L2ARC reads, and no fear of degrading read performance
+ *    through increased writes.
+ *
+ * 6. Writes to the L2ARC devices are grouped and sent in-sequence, so that
+ *    the vdev queue can aggregate them into larger and fewer writes. Each
+ *    device is written to in a rotor fashion, sweeping writes through
+ *    available space then repeating.
+ *
+ * 7. The L2ARC does not store dirty content. It never needs to flush
+ *    write buffers back to disk-based storage.
+ *
+ * 8. If an ARC buffer is written (and dirtied) which also exists in the
+ *    L2ARC, the now stale L2ARC buffer is immediately dropped.
+ *
+ * The performance of the L2ARC can be tweaked by a number of tunables, which
+ * may be necessary for different workloads:
+ *
+ *	l2arc_write_max		max write bytes per interval
+ *	l2arc_write_boost	extra write bytes during device warmup
+ *	l2arc_noprefetch	skip caching prefetched buffers
+ *	l2arc_headroom		number of max device writes to precache
+ *	l2arc_headroom_boost	when we find compressed buffers during ARC
+ *				scanning, we multiply headroom by this
+ *				percentage factor for the next scan cycle,
+ *				since more compressed buffers are likely to
+ *				be present
+ *	l2arc_feed_secs		seconds between L2ARC writing
+ *
+ * Tunables may be removed or added as future performance improvements are
+ * integrated, and also may become zpool properties.
+ *
+ * There are three key functions that control how the L2ARC warms up:
+ *
+ *	l2arc_write_eligible()	check if a buffer is eligible to cache
+ *	l2arc_write_size()	calculate how much to write
+ *	l2arc_write_interval()	calculate sleep delay between writes
+ *
+ * These three functions determine what to write, how much, and how quickly
+ * to send writes.
+ *
+ * L2ARC persistence:
+ *
+ * When writing buffers to L2ARC, we periodically add some metadata to
+ * make sure we can pick them up after reboot, thus dramatically reducing
+ * the impact that any downtime has on the performance of storage systems
+ * with large caches.
+ *
+ * The implementation works fairly simply by integrating the following two
+ * modifications:
+ *
+ * *) When writing to the L2ARC, we occasionally write a "l2arc log block",
+ *    which is an additional piece of metadata which describes what's been
+ *    written. This allows us to rebuild the arc_buf_hdr_t structures of the
+ *    main ARC buffers. There are two linked lists of log blocks headed by
+ *    dh_start_lbps[2]. We alternate which chain we append to, so they are
+ *    time-wise and offset-wise interleaved, but that is an optimization
+ *    rather than for correctness. The log block also includes a pointer to
+ *    the previous block in its chain.
+ *
+ * *) We reserve SPA_MINBLOCKSIZE of space at the start of each L2ARC device
+ *    for our header bookkeeping purposes. This contains a device header,
+ *    which contains our top-level reference structures. We update it each
+ *    time we write a new log block, so that we're able to locate it in the
+ *    L2ARC device. If this write results in an inconsistent device header
+ *    (e.g. due to power failure), we detect this by verifying the header's
+ *    checksum and simply fail to reconstruct the L2ARC after reboot.
+ *
+ * Implementation diagram:
+ *
+ * +=== L2ARC device (not to scale) ======================================+
+ * |       ___two newest log block pointers__.__________                  |
+ * |      /                                   \dh_start_lbps[1]           |
+ * |     /                                     \         \dh_start_lbps[0]|
+ * |.___/__.                                    V         V               |
+ * ||L2 dev|....|lb |bufs |lb |bufs |lb |bufs |lb |bufs |lb |---(empty)---|
+ * ||   hdr|      ^         /^       /^        /         /                |
+ * |+------+  ...--\-------/  \-----/--\------/         /                 |
+ * |                \--------------/    \--------------/                  |
+ * +======================================================================+
+ *
+ * As can be seen on the diagram, rather than using a simple linked list,
+ * we use a pair of linked lists with alternating elements. This is a
+ * performance enhancement: with a simple linked list we would only discover
+ * the address of the next log block once the current block had been
+ * completely read in. Obviously, this hurts performance, because we'd be
+ * keeping the device's I/O queue only one operation deep, thus incurring a
+ * large amount of I/O round-trip latency. Having two lists allows us to
+ * fetch two log blocks ahead of where we are currently rebuilding L2ARC
+ * buffers.
+ *
+ * On-device data structures:
+ *
+ * L2ARC device header:	l2arc_dev_hdr_phys_t
+ * L2ARC log block:	l2arc_log_blk_phys_t
+ *
+ * L2ARC reconstruction:
+ *
+ * When writing data, we simply write in the standard rotary fashion,
+ * evicting buffers as we go and simply writing new data over them (writing
+ * a new log block every now and then). This obviously means that once we
+ * loop around the end of the device, we will start cutting into an already
+ * committed log block (and its referenced data buffers), like so:
+ *
+ *    current write head__       __old tail
+ *                        \     /
+ *                        V    V
+ * <--|bufs |lb |bufs |lb |    |bufs |lb |bufs |lb |-->
+ *                         ^    ^^^^^^^^^___________________________________
+ *                         |                                               \
+ *            <<nextwrite>> may overwrite this blk and/or its bufs --'
+ *
+ * When importing the pool, we detect this situation and use it to stop
+ * our scanning process (see l2arc_rebuild).
+ *
+ * There is one significant caveat to consider when rebuilding ARC contents
+ * from an L2ARC device: what about invalidated buffers? Given the above
+ * construction, we cannot update blocks which we've already written to amend
+ * them to remove buffers which were invalidated. Thus, during reconstruction,
+ * we might be populating the cache with buffers for data that's not on the
+ * main pool anymore, or may have been overwritten!
+ *
+ * As it turns out, this isn't a problem. Every arc_read request includes
+ * both the DVA and, crucially, the birth TXG of the BP the caller is
+ * looking for. So even if the cache were populated by completely rotten
+ * blocks for data that had been long deleted and/or overwritten, we'll
+ * never actually return bad data from the cache, since the DVA together
+ * with the birth TXG uniquely identifies a block in space and time - once
+ * created, a block is immutable on disk. The worst thing we have done is
+ * waste some time and memory at l2arc rebuild to reconstruct outdated ARC
+ * entries that will get dropped from the l2arc as it is being updated
+ * with new blocks.
+ *
+ * L2ARC buffers that have been evicted by l2arc_evict() ahead of the write
+ * hand are not restored. This is done by saving the offset (in bytes)
+ * l2arc_evict() has evicted to in the L2ARC device header and taking it
+ * into account when restoring buffers.
+ */
+
+static boolean_t
+l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *hdr)
+{
+	/*
+	 * A buffer is *not* eligible for the L2ARC if it:
+	 * 1. belongs to a different spa.
+	 * 2. is already cached on the L2ARC.
+	 * 3. has an I/O in progress (it may be an incomplete read).
+	 * 4. is flagged not eligible (zfs property).
+	 */
+	if (hdr->b_spa != spa_guid || HDR_HAS_L2HDR(hdr) ||
+	    HDR_IO_IN_PROGRESS(hdr) || !HDR_L2CACHE(hdr))
+		return (B_FALSE);
+
+	return (B_TRUE);
+}
+
+static uint64_t
+l2arc_write_size(l2arc_dev_t *dev)
+{
+	uint64_t size, dev_size, tsize;
+
+	/*
+	 * Make sure our globals have meaningful values in case the user
+	 * altered them.
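+	 *
+	 * As an illustration, assuming the defaults of 8 MiB for both
+	 * l2arc_write_max and l2arc_write_boost (the defaults are assumed
+	 * here, not defined by this function): while arc_warm is B_FALSE
+	 * each feed cycle targets 8 + 8 = 16 MiB, dropping back to 8 MiB
+	 * once the ARC has warmed up.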
+ */ + size = l2arc_write_max; + if (size == 0) { + cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " + "be greater than zero, resetting it to the default (%d)", + L2ARC_WRITE_SIZE); + size = l2arc_write_max = L2ARC_WRITE_SIZE; + } + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + + /* + * Make sure the write size does not exceed the size of the cache + * device. This is important in l2arc_evict(), otherwise infinite + * iteration can occur. + */ + dev_size = dev->l2ad_end - dev->l2ad_start; + tsize = size + l2arc_log_blk_overhead(size, dev); + if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) + tsize += MAX(64 * 1024 * 1024, + (tsize * l2arc_trim_ahead) / 100); + + if (tsize >= dev_size) { + cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " + "plus the overhead of log blocks (persistent L2ARC, " + "%llu bytes) exceeds the size of the cache device " + "(guid %llu), resetting them to the default (%d)", + l2arc_log_blk_overhead(size, dev), + dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; + + if (arc_warm == B_FALSE) + size += l2arc_write_boost; + } + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + +/* + * Cycle through L2ARC devices. This is how L2ARC load balances. + * If a device is returned, this also returns holding the spa config lock. + */ +static l2arc_dev_t * +l2arc_dev_get_next(void) +{ + l2arc_dev_t *first, *next = NULL; + + /* + * Lock out the removal of spas (spa_namespace_lock), then removal + * of cache devices (l2arc_dev_mtx). Once a device has been selected, + * both locks will be dropped and a spa config lock held instead. + */ + mutex_enter(&spa_namespace_lock); + mutex_enter(&l2arc_dev_mtx); + + /* if there are no vdevs, there is nothing to do */ + if (l2arc_ndev == 0) + goto out; + + first = NULL; + next = l2arc_dev_last; + do { + /* loop around the list looking for a non-faulted vdev */ + if (next == NULL) { + next = list_head(l2arc_dev_list); + } else { + next = list_next(l2arc_dev_list, next); + if (next == NULL) + next = list_head(l2arc_dev_list); + } + + /* if we have come back to the start, bail out */ + if (first == NULL) + first = next; + else if (next == first) + break; + + } while (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all); + + /* if we were unable to find any usable vdevs, return NULL */ + if (vdev_is_dead(next->l2ad_vdev) || next->l2ad_rebuild || + next->l2ad_trim_all) + next = NULL; + + l2arc_dev_last = next; + +out: + mutex_exit(&l2arc_dev_mtx); + + /* + * Grab the config lock to prevent the 'next' device from being + * removed while we are writing to it. + */ + if (next != NULL) + spa_config_enter(next->l2ad_spa, SCL_L2ARC, next, RW_READER); + mutex_exit(&spa_namespace_lock); + + return (next); +} + +/* + * Free buffers that were tagged for destruction. 
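+ *
+ * As a hedged sketch of the producer side (modeled on the
+ * l2arc_free_abd_on_write() helper defined earlier in this file), entries
+ * are queued roughly like this:
+ *
+ *	l2arc_data_free_t *df = kmem_alloc(sizeof (*df), KM_SLEEP);
+ *	df->l2df_abd = abd;
+ *	df->l2df_size = size;
+ *	df->l2df_type = type;
+ *	mutex_enter(&l2arc_free_on_write_mtx);
+ *	list_insert_head(l2arc_free_on_write, df);
+ *	mutex_exit(&l2arc_free_on_write_mtx);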
+ */
+static void
+l2arc_do_free_on_write(void)
+{
+	list_t *buflist;
+	l2arc_data_free_t *df, *df_prev;
+
+	mutex_enter(&l2arc_free_on_write_mtx);
+	buflist = l2arc_free_on_write;
+
+	for (df = list_tail(buflist); df; df = df_prev) {
+		df_prev = list_prev(buflist, df);
+		ASSERT3P(df->l2df_abd, !=, NULL);
+		abd_free(df->l2df_abd);
+		list_remove(buflist, df);
+		kmem_free(df, sizeof (l2arc_data_free_t));
+	}
+
+	mutex_exit(&l2arc_free_on_write_mtx);
+}
+
+/*
+ * A write to a cache device has completed. Update all headers to allow
+ * reads from these buffers to begin.
+ */
+static void
+l2arc_write_done(zio_t *zio)
+{
+	l2arc_write_callback_t *cb;
+	l2arc_lb_abd_buf_t *abd_buf;
+	l2arc_lb_ptr_buf_t *lb_ptr_buf;
+	l2arc_dev_t *dev;
+	l2arc_dev_hdr_phys_t *l2dhdr;
+	list_t *buflist;
+	arc_buf_hdr_t *head, *hdr, *hdr_prev;
+	kmutex_t *hash_lock;
+	int64_t bytes_dropped = 0;
+
+	cb = zio->io_private;
+	ASSERT3P(cb, !=, NULL);
+	dev = cb->l2wcb_dev;
+	l2dhdr = dev->l2ad_dev_hdr;
+	ASSERT3P(dev, !=, NULL);
+	head = cb->l2wcb_head;
+	ASSERT3P(head, !=, NULL);
+	buflist = &dev->l2ad_buflist;
+	ASSERT3P(buflist, !=, NULL);
+	DTRACE_PROBE2(l2arc__iodone, zio_t *, zio,
+	    l2arc_write_callback_t *, cb);
+
+	if (zio->io_error != 0)
+		ARCSTAT_BUMP(arcstat_l2_writes_error);
+
+	/*
+	 * All writes completed, or an error was hit.
+	 */
+top:
+	mutex_enter(&dev->l2ad_mtx);
+	for (hdr = list_prev(buflist, head); hdr; hdr = hdr_prev) {
+		hdr_prev = list_prev(buflist, hdr);
+
+		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * the hash lock and l2ad_mtx are taken).
+		 */
+		if (!mutex_tryenter(hash_lock)) {
+			/*
+			 * Missed the hash lock. We must retry so we
+			 * don't leave the ARC_FLAG_L2_WRITING bit set.
+			 */
+			ARCSTAT_BUMP(arcstat_l2_writes_lock_retry);
+
+			/*
+			 * We don't want to rescan the headers we've
+			 * already marked as having been written out, so
+			 * we reinsert the head node so we can pick up
+			 * where we left off.
+			 */
+			list_remove(buflist, head);
+			list_insert_after(buflist, hdr, head);
+
+			mutex_exit(&dev->l2ad_mtx);
+
+			/*
+			 * We wait for the hash lock to become available
+			 * to try and prevent busy waiting, and increase
+			 * the chance we'll be able to acquire the lock
+			 * the next time around.
+			 */
+			mutex_enter(hash_lock);
+			mutex_exit(hash_lock);
+			goto top;
+		}
+
+		/*
+		 * We could not have been moved into the arc_l2c_only
+		 * state while in-flight due to our ARC_FLAG_L2_WRITING
+		 * bit being set. Let's just ensure that's being enforced.
+		 */
+		ASSERT(HDR_HAS_L1HDR(hdr));
+
+		/*
+		 * Skipped - drop L2ARC entry and mark the header as no
+		 * longer L2 eligible.
+		 */
+		if (zio->io_error != 0) {
+			/*
+			 * Error - drop L2ARC entry.
+			 */
+			list_remove(buflist, hdr);
+			arc_hdr_clear_flags(hdr, ARC_FLAG_HAS_L2HDR);
+
+			uint64_t psize = HDR_GET_PSIZE(hdr);
+			ARCSTAT_INCR(arcstat_l2_psize, -psize);
+			ARCSTAT_INCR(arcstat_l2_lsize, -HDR_GET_LSIZE(hdr));
+
+			bytes_dropped +=
+			    vdev_psize_to_asize(dev->l2ad_vdev, psize);
+			(void) zfs_refcount_remove_many(&dev->l2ad_alloc,
+			    arc_hdr_size(hdr), hdr);
+		}
+
+		/*
+		 * Allow ARC to begin reads and ghost list evictions to
+		 * this L2ARC entry.
+		 */
+		arc_hdr_clear_flags(hdr, ARC_FLAG_L2_WRITING);
+
+		mutex_exit(hash_lock);
+	}
+
+	/*
+	 * Free the allocated abd buffers for writing the log blocks.
+	 * If the zio failed, reclaim the allocated space and remove the
+	 * pointers to these log blocks from the log block pointer list
+	 * of the L2ARC device.
+ */
+	while ((abd_buf = list_remove_tail(&cb->l2wcb_abd_list)) != NULL) {
+		abd_free(abd_buf->abd);
+		zio_buf_free(abd_buf, sizeof (*abd_buf));
+		if (zio->io_error != 0) {
+			lb_ptr_buf = list_remove_head(&dev->l2ad_lbptr_list);
+			/*
+			 * L2BLK_GET_PSIZE returns aligned size for log
+			 * blocks.
+			 */
+			uint64_t asize =
+			    L2BLK_GET_PSIZE((lb_ptr_buf->lb_ptr)->lbp_prop);
+			bytes_dropped += asize;
+			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+			    lb_ptr_buf);
+			zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+			kmem_free(lb_ptr_buf->lb_ptr,
+			    sizeof (l2arc_log_blkptr_t));
+			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+		}
+	}
+	list_destroy(&cb->l2wcb_abd_list);
+
+	if (zio->io_error != 0) {
+		/*
+		 * Restore the lbps array in the header to its previous state.
+		 * If the list of log block pointers is empty, zero out the
+		 * log block pointers in the device header.
+		 */
+		lb_ptr_buf = list_head(&dev->l2ad_lbptr_list);
+		for (int i = 0; i < 2; i++) {
+			if (lb_ptr_buf == NULL) {
+				/*
+				 * If the list is empty, zero out the device
+				 * header. Otherwise zero out the second log
+				 * block pointer in the header.
+				 */
+				if (i == 0) {
+					bzero(l2dhdr, dev->l2ad_dev_hdr_asize);
+				} else {
+					bzero(&l2dhdr->dh_start_lbps[i],
+					    sizeof (l2arc_log_blkptr_t));
+				}
+				break;
+			}
+			bcopy(lb_ptr_buf->lb_ptr, &l2dhdr->dh_start_lbps[i],
+			    sizeof (l2arc_log_blkptr_t));
+			lb_ptr_buf = list_next(&dev->l2ad_lbptr_list,
+			    lb_ptr_buf);
+		}
+	}
+
+	atomic_inc_64(&l2arc_writes_done);
+	list_remove(buflist, head);
+	ASSERT(!HDR_HAS_L1HDR(head));
+	kmem_cache_free(hdr_l2only_cache, head);
+	mutex_exit(&dev->l2ad_mtx);
+
+	ASSERT(dev->l2ad_vdev != NULL);
+	vdev_space_update(dev->l2ad_vdev, -bytes_dropped, 0, 0);
+
+	l2arc_do_free_on_write();
+
+	kmem_free(cb, sizeof (l2arc_write_callback_t));
+}
+
+static int
+l2arc_untransform(zio_t *zio, l2arc_read_callback_t *cb)
+{
+	int ret;
+	spa_t *spa = zio->io_spa;
+	arc_buf_hdr_t *hdr = cb->l2rcb_hdr;
+	blkptr_t *bp = zio->io_bp;
+	uint8_t salt[ZIO_DATA_SALT_LEN];
+	uint8_t iv[ZIO_DATA_IV_LEN];
+	uint8_t mac[ZIO_DATA_MAC_LEN];
+	boolean_t no_crypt = B_FALSE;
+
+	/*
+	 * ZIL data is never written to the L2ARC, so we don't need
+	 * special handling for its unique MAC storage.
+	 */
+	ASSERT3U(BP_GET_TYPE(bp), !=, DMU_OT_INTENT_LOG);
+	ASSERT(MUTEX_HELD(HDR_LOCK(hdr)));
+	ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
+
+	/*
+	 * If the data was encrypted, decrypt it now. Note that
+	 * we must check the bp here and not the hdr, since the
+	 * hdr does not have its encryption parameters updated
+	 * until arc_read_done().
+	 */
+	if (BP_IS_ENCRYPTED(bp)) {
+		abd_t *eabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr,
+		    B_TRUE);
+
+		zio_crypt_decode_params_bp(bp, salt, iv);
+		zio_crypt_decode_mac_bp(bp, mac);
+
+		ret = spa_do_crypt_abd(B_FALSE, spa, &cb->l2rcb_zb,
+		    BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp),
+		    salt, iv, mac, HDR_GET_PSIZE(hdr), eabd,
+		    hdr->b_l1hdr.b_pabd, &no_crypt);
+		if (ret != 0) {
+			arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr);
+			goto error;
+		}
+
+		/*
+		 * If we actually performed decryption, replace b_pabd
+		 * with the decrypted data. Otherwise we can just throw
+		 * our decryption buffer away.
+ */ + if (!no_crypt) { + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = eabd; + zio->io_abd = eabd; + } else { + arc_free_data_abd(hdr, eabd, arc_hdr_size(hdr), hdr); + } + } + + /* + * If the L2ARC block was compressed, but ARC compression + * is disabled we decompress the data into a new buffer and + * replace the existing data. + */ + if (HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) { + abd_t *cabd = arc_get_data_abd(hdr, arc_hdr_size(hdr), hdr, + B_TRUE); + void *tmp = abd_borrow_buf(cabd, arc_hdr_size(hdr)); + + ret = zio_decompress_data(HDR_GET_COMPRESS(hdr), + hdr->b_l1hdr.b_pabd, tmp, HDR_GET_PSIZE(hdr), + HDR_GET_LSIZE(hdr), &hdr->b_complevel); + if (ret != 0) { + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, cabd, arc_hdr_size(hdr), hdr); + goto error; + } + + abd_return_buf_copy(cabd, tmp, arc_hdr_size(hdr)); + arc_free_data_abd(hdr, hdr->b_l1hdr.b_pabd, + arc_hdr_size(hdr), hdr); + hdr->b_l1hdr.b_pabd = cabd; + zio->io_abd = cabd; + zio->io_size = HDR_GET_LSIZE(hdr); + } + + return (0); + +error: + return (ret); +} + + +/* + * A read to a cache device completed. Validate buffer contents before + * handing over to the regular ARC routines. + */ +static void +l2arc_read_done(zio_t *zio) +{ + int tfm_error = 0; + l2arc_read_callback_t *cb = zio->io_private; + arc_buf_hdr_t *hdr; + kmutex_t *hash_lock; + boolean_t valid_cksum; + boolean_t using_rdata = (BP_IS_ENCRYPTED(&cb->l2rcb_bp) && + (cb->l2rcb_flags & ZIO_FLAG_RAW_ENCRYPT)); + + ASSERT3P(zio->io_vd, !=, NULL); + ASSERT(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE); + + spa_config_exit(zio->io_spa, SCL_L2ARC, zio->io_vd); + + ASSERT3P(cb, !=, NULL); + hdr = cb->l2rcb_hdr; + ASSERT3P(hdr, !=, NULL); + + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); + ASSERT3P(hash_lock, ==, HDR_LOCK(hdr)); + + /* + * If the data was read into a temporary buffer, + * move it and free the buffer. + */ + if (cb->l2rcb_abd != NULL) { + ASSERT3U(arc_hdr_size(hdr), <, zio->io_size); + if (zio->io_error == 0) { + if (using_rdata) { + abd_copy(hdr->b_crypt_hdr.b_rabd, + cb->l2rcb_abd, arc_hdr_size(hdr)); + } else { + abd_copy(hdr->b_l1hdr.b_pabd, + cb->l2rcb_abd, arc_hdr_size(hdr)); + } + } + + /* + * The following must be done regardless of whether + * there was an error: + * - free the temporary buffer + * - point zio to the real ARC buffer + * - set zio size accordingly + * These are required because zio is either re-used for + * an I/O of the block in the case of the error + * or the zio is passed to arc_read_done() and it + * needs real data. + */ + abd_free(cb->l2rcb_abd); + zio->io_size = zio->io_orig_size = arc_hdr_size(hdr); + + if (using_rdata) { + ASSERT(HDR_HAS_RABD(hdr)); + zio->io_abd = zio->io_orig_abd = + hdr->b_crypt_hdr.b_rabd; + } else { + ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL); + zio->io_abd = zio->io_orig_abd = hdr->b_l1hdr.b_pabd; + } + } + + ASSERT3P(zio->io_abd, !=, NULL); + + /* + * Check this survived the L2ARC journey. + */ + ASSERT(zio->io_abd == hdr->b_l1hdr.b_pabd || + (HDR_HAS_RABD(hdr) && zio->io_abd == hdr->b_crypt_hdr.b_rabd)); + zio->io_bp_copy = cb->l2rcb_bp; /* XXX fix in L2ARC 2.0 */ + zio->io_bp = &zio->io_bp_copy; /* XXX fix in L2ARC 2.0 */ + zio->io_prop.zp_complevel = hdr->b_complevel; + + valid_cksum = arc_cksum_is_equal(hdr, zio); + + /* + * b_rabd will always match the data as it exists on disk if it is + * being used. 
Therefore if we are reading into b_rabd we do not + * attempt to untransform the data. + */ + if (valid_cksum && !using_rdata) + tfm_error = l2arc_untransform(zio, cb); + + if (valid_cksum && tfm_error == 0 && zio->io_error == 0 && + !HDR_L2_EVICTED(hdr)) { + mutex_exit(hash_lock); + zio->io_private = hdr; + arc_read_done(zio); + } else { + /* + * Buffer didn't survive caching. Increment stats and + * reissue to the original storage device. + */ + if (zio->io_error != 0) { + ARCSTAT_BUMP(arcstat_l2_io_error); + } else { + zio->io_error = SET_ERROR(EIO); + } + if (!valid_cksum || tfm_error != 0) + ARCSTAT_BUMP(arcstat_l2_cksum_bad); + + /* + * If there's no waiter, issue an async i/o to the primary + * storage now. If there *is* a waiter, the caller must + * issue the i/o in a context where it's OK to block. + */ + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + void *abd = (using_rdata) ? + hdr->b_crypt_hdr.b_rabd : hdr->b_l1hdr.b_pabd; + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio = zio_read(pio, zio->io_spa, zio->io_bp, + abd, zio->io_size, arc_read_done, + hdr, zio->io_priority, cb->l2rcb_flags, + &cb->l2rcb_zb); + + /* + * Original ZIO will be freed, so we need to update + * ARC header with the new ZIO pointer to be used + * by zio_change_priority() in arc_read(). + */ + for (struct arc_callback *acb = hdr->b_l1hdr.b_acb; + acb != NULL; acb = acb->acb_next) + acb->acb_zio_head = zio; + + mutex_exit(hash_lock); + zio_nowait(zio); + } else { + mutex_exit(hash_lock); + } + } + + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * This is the list priority from which the L2ARC will search for pages to + * cache. This is used within loops (0..3) to cycle through lists in the + * desired order. This order can have a significant effect on cache + * performance. + * + * Currently the metadata lists are hit first, MFU then MRU, followed by + * the data lists. This function returns a locked list, and also returns + * the lock pointer. + */ +static multilist_sublist_t * +l2arc_sublist_lock(int list_num) +{ + multilist_t *ml = NULL; + unsigned int idx; + + ASSERT(list_num >= 0 && list_num < L2ARC_FEED_TYPES); + + switch (list_num) { + case 0: + ml = arc_mfu->arcs_list[ARC_BUFC_METADATA]; + break; + case 1: + ml = arc_mru->arcs_list[ARC_BUFC_METADATA]; + break; + case 2: + ml = arc_mfu->arcs_list[ARC_BUFC_DATA]; + break; + case 3: + ml = arc_mru->arcs_list[ARC_BUFC_DATA]; + break; + default: + return (NULL); + } + + /* + * Return a randomly-selected sublist. This is acceptable + * because the caller feeds only a little bit of data for each + * call (8MB). Subsequent calls will result in different + * sublists being selected. + */ + idx = multilist_get_random_index(ml); + return (multilist_sublist_lock(ml, idx)); +} + +/* + * Calculates the maximum overhead of L2ARC metadata log blocks for a given + * L2ARC write size. l2arc_evict and l2arc_write_size need to include this + * overhead in processing to make sure there is enough headroom available + * when writing buffers. 
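+ *
+ * As a worked example (assuming 1022 log entries per log block, the
+ * usual on-disk maximum, and 512-byte minimum blocks): an 8 MiB write
+ * spans 8 MiB >> 9 = 16384 minimum-sized blocks, which need
+ * ceil(16384 / 1022) = 17 log blocks, so the overhead is 17 times the
+ * allocated size of one l2arc_log_blk_phys_t.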
+ */
+static inline uint64_t
+l2arc_log_blk_overhead(uint64_t write_sz, l2arc_dev_t *dev)
+{
+	if (dev->l2ad_log_entries == 0) {
+		return (0);
+	} else {
+		uint64_t log_entries = write_sz >> SPA_MINBLOCKSHIFT;
+
+		uint64_t log_blocks = (log_entries +
+		    dev->l2ad_log_entries - 1) /
+		    dev->l2ad_log_entries;
+
+		return (vdev_psize_to_asize(dev->l2ad_vdev,
+		    sizeof (l2arc_log_blk_phys_t)) * log_blocks);
+	}
+}
+
+/*
+ * Evict buffers from the device write hand to the distance specified in
+ * bytes. This distance may span populated buffers; it may span nothing.
+ * This is clearing a region on the L2ARC device ready for writing.
+ * If the 'all' boolean is set, every buffer is evicted.
+ */
+static void
+l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all)
+{
+	list_t *buflist;
+	arc_buf_hdr_t *hdr, *hdr_prev;
+	kmutex_t *hash_lock;
+	uint64_t taddr;
+	l2arc_lb_ptr_buf_t *lb_ptr_buf, *lb_ptr_buf_prev;
+	vdev_t *vd = dev->l2ad_vdev;
+	boolean_t rerun;
+
+	buflist = &dev->l2ad_buflist;
+
+	/*
+	 * We need to add in the worst case scenario of log block overhead.
+	 */
+	distance += l2arc_log_blk_overhead(distance, dev);
+	if (vd->vdev_has_trim && l2arc_trim_ahead > 0) {
+		/*
+		 * Trim ahead of the write size 64MB or (l2arc_trim_ahead/100)
+		 * times the write size, whichever is greater.
+		 */
+		distance += MAX(64 * 1024 * 1024,
+		    (distance * l2arc_trim_ahead) / 100);
+	}
+
+top:
+	rerun = B_FALSE;
+	if (dev->l2ad_hand >= (dev->l2ad_end - distance)) {
+		/*
+		 * When there is no space to accommodate upcoming writes,
+		 * evict to the end. Then bump the write and evict hands
+		 * to the start and iterate. This iteration does not
+		 * happen indefinitely as we make sure in
+		 * l2arc_write_size() that when the write hand is reset,
+		 * the write size does not exceed the end of the device.
+		 */
+		rerun = B_TRUE;
+		taddr = dev->l2ad_end;
+	} else {
+		taddr = dev->l2ad_hand + distance;
+	}
+	DTRACE_PROBE4(l2arc__evict, l2arc_dev_t *, dev, list_t *, buflist,
+	    uint64_t, taddr, boolean_t, all);
+
+	if (!all) {
+		/*
+		 * This check has to be placed after deciding whether to
+		 * iterate (rerun).
+		 */
+		if (dev->l2ad_first) {
+			/*
+			 * This is the first sweep through the device. There is
+			 * nothing to evict. We have already trimmed the
+			 * whole device.
+			 */
+			goto out;
+		} else {
+			/*
+			 * Trim the space to be evicted.
+			 */
+			if (vd->vdev_has_trim && dev->l2ad_evict < taddr &&
+			    l2arc_trim_ahead > 0) {
+				/*
+				 * We have to drop the spa_config lock because
+				 * vdev_trim_range() will acquire it.
+				 * l2ad_evict already accounts for the label
+				 * size. To prevent vdev_trim_ranges() from
+				 * adding it again, we subtract it from
+				 * l2ad_evict.
+				 */
+				spa_config_exit(dev->l2ad_spa, SCL_L2ARC, dev);
+				vdev_trim_simple(vd,
+				    dev->l2ad_evict - VDEV_LABEL_START_SIZE,
+				    taddr - dev->l2ad_evict);
+				spa_config_enter(dev->l2ad_spa, SCL_L2ARC, dev,
+				    RW_READER);
+			}
+
+			/*
+			 * When rebuilding L2ARC we retrieve the evict hand
+			 * from the header of the device. Of note, l2arc_evict()
+			 * does not actually delete buffers from the cache
+			 * device, but trimming may do so depending on the
+			 * hardware implementation. Thus keeping track of the
+			 * evict hand is useful.
+			 */
+			dev->l2ad_evict = MAX(dev->l2ad_evict, taddr);
+		}
+	}
+
+retry:
+	mutex_enter(&dev->l2ad_mtx);
+	/*
+	 * We have to account for evicted log blocks. Run vdev_space_update()
+	 * on log blocks whose offset (in bytes) is before the evicted offset
+	 * (in bytes) by searching in the list of pointers to log blocks
+	 * present in the L2ARC device.
+ */
+	for (lb_ptr_buf = list_tail(&dev->l2ad_lbptr_list); lb_ptr_buf;
+	    lb_ptr_buf = lb_ptr_buf_prev) {
+
+		lb_ptr_buf_prev = list_prev(&dev->l2ad_lbptr_list, lb_ptr_buf);
+
+		/* L2BLK_GET_PSIZE returns aligned size for log blocks */
+		uint64_t asize = L2BLK_GET_PSIZE(
+		    (lb_ptr_buf->lb_ptr)->lbp_prop);
+
+		/*
+		 * We don't worry about log blocks left behind (i.e.
+		 * lbp_payload_start < l2ad_hand) because l2arc_write_buffers()
+		 * will never write more than l2arc_evict() evicts.
+		 */
+		if (!all && l2arc_log_blkptr_valid(dev, lb_ptr_buf->lb_ptr)) {
+			break;
+		} else {
+			vdev_space_update(vd, -asize, 0, 0);
+			ARCSTAT_INCR(arcstat_l2_log_blk_asize, -asize);
+			ARCSTAT_BUMPDOWN(arcstat_l2_log_blk_count);
+			zfs_refcount_remove_many(&dev->l2ad_lb_asize, asize,
+			    lb_ptr_buf);
+			zfs_refcount_remove(&dev->l2ad_lb_count, lb_ptr_buf);
+			list_remove(&dev->l2ad_lbptr_list, lb_ptr_buf);
+			kmem_free(lb_ptr_buf->lb_ptr,
+			    sizeof (l2arc_log_blkptr_t));
+			kmem_free(lb_ptr_buf, sizeof (l2arc_lb_ptr_buf_t));
+		}
+	}
+
+	for (hdr = list_tail(buflist); hdr; hdr = hdr_prev) {
+		hdr_prev = list_prev(buflist, hdr);
+
+		ASSERT(!HDR_EMPTY(hdr));
+		hash_lock = HDR_LOCK(hdr);
+
+		/*
+		 * We cannot use mutex_enter or else we can deadlock
+		 * with l2arc_write_buffers (due to swapping the order
+		 * the hash lock and l2ad_mtx are taken).
+		 */
+		if (!mutex_tryenter(hash_lock)) {
+			/*
+			 * Missed the hash lock. Retry.
+			 */
+			ARCSTAT_BUMP(arcstat_l2_evict_lock_retry);
+			mutex_exit(&dev->l2ad_mtx);
+			mutex_enter(hash_lock);
+			mutex_exit(hash_lock);
+			goto retry;
+		}
+
+		/*
+		 * A header can't be on this list if it doesn't have an
+		 * L2 header.
+		 */
+		ASSERT(HDR_HAS_L2HDR(hdr));
+
+		/* Ensure this header has finished being written. */
+		ASSERT(!HDR_L2_WRITING(hdr));
+		ASSERT(!HDR_L2_WRITE_HEAD(hdr));
+
+		if (!all && (hdr->b_l2hdr.b_daddr >= dev->l2ad_evict ||
+		    hdr->b_l2hdr.b_daddr < dev->l2ad_hand)) {
+			/*
+			 * We've evicted to the target address,
+			 * or the end of the device.
+			 */
+			mutex_exit(hash_lock);
+			break;
+		}
+
+		if (!HDR_HAS_L1HDR(hdr)) {
+			ASSERT(!HDR_L2_READING(hdr));
+			/*
+			 * This doesn't exist in the ARC. Destroy.
+			 * arc_hdr_destroy() will call list_remove()
+			 * and decrement arcstat_l2_lsize.
+			 */
+			arc_change_state(arc_anon, hdr, hash_lock);
+			arc_hdr_destroy(hdr);
+		} else {
+			ASSERT(hdr->b_l1hdr.b_state != arc_l2c_only);
+			ARCSTAT_BUMP(arcstat_l2_evict_l1cached);
+			/*
+			 * Invalidate issued or about to be issued
+			 * reads, since we may be about to write
+			 * over this location.
+			 */
+			if (HDR_L2_READING(hdr)) {
+				ARCSTAT_BUMP(arcstat_l2_evict_reading);
+				arc_hdr_set_flags(hdr, ARC_FLAG_L2_EVICTED);
+			}
+
+			arc_hdr_l2hdr_destroy(hdr);
+		}
+		mutex_exit(hash_lock);
+	}
+	mutex_exit(&dev->l2ad_mtx);
+
+out:
+	/*
+	 * We need to check whether we are evicting all buffers, otherwise
+	 * we may iterate unnecessarily.
+	 */
+	if (!all && rerun) {
+		/*
+		 * Bump device hand to the device start if it is approaching
+		 * the end. l2arc_evict() has already evicted ahead for this
+		 * case.
+		 */
+		dev->l2ad_hand = dev->l2ad_start;
+		dev->l2ad_evict = dev->l2ad_start;
+		dev->l2ad_first = B_FALSE;
+		goto top;
+	}
+
+	ASSERT3U(dev->l2ad_hand + distance, <, dev->l2ad_end);
+	if (!dev->l2ad_first)
+		ASSERT3U(dev->l2ad_hand, <, dev->l2ad_evict);
+}
+
+/*
+ * Handle any abd transforms that might be required for writing to the L2ARC.
+ * If successful, this function will always return an abd with the data
+ * transformed as it is on disk in a new abd of asize bytes.
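+ * The transforms are applied in the order the data appears on disk:
+ * compression first, then encryption. The caller is responsible for
+ * freeing the returned abd; l2arc_write_buffers() does so by queueing
+ * it on the l2arc_free_on_write list.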
+ */ +static int +l2arc_apply_transforms(spa_t *spa, arc_buf_hdr_t *hdr, uint64_t asize, + abd_t **abd_out) +{ + int ret; + void *tmp = NULL; + abd_t *cabd = NULL, *eabd = NULL, *to_write = hdr->b_l1hdr.b_pabd; + enum zio_compress compress = HDR_GET_COMPRESS(hdr); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t size = arc_hdr_size(hdr); + boolean_t ismd = HDR_ISTYPE_METADATA(hdr); + boolean_t bswap = (hdr->b_l1hdr.b_byteswap != DMU_BSWAP_NUMFUNCS); + dsl_crypto_key_t *dck = NULL; + uint8_t mac[ZIO_DATA_MAC_LEN] = { 0 }; + boolean_t no_crypt = B_FALSE; + + ASSERT((HDR_GET_COMPRESS(hdr) != ZIO_COMPRESS_OFF && + !HDR_COMPRESSION_ENABLED(hdr)) || + HDR_ENCRYPTED(hdr) || HDR_SHARED_DATA(hdr) || psize != asize); + ASSERT3U(psize, <=, asize); + + /* + * If this data simply needs its own buffer, we simply allocate it + * and copy the data. This may be done to eliminate a dependency on a + * shared buffer or to reallocate the buffer to match asize. + */ + if (HDR_HAS_RABD(hdr) && asize != psize) { + ASSERT3U(asize, >=, psize); + to_write = abd_alloc_for_io(asize, ismd); + abd_copy(to_write, hdr->b_crypt_hdr.b_rabd, psize); + if (psize != asize) + abd_zero_off(to_write, psize, asize - psize); + goto out; + } + + if ((compress == ZIO_COMPRESS_OFF || HDR_COMPRESSION_ENABLED(hdr)) && + !HDR_ENCRYPTED(hdr)) { + ASSERT3U(size, ==, psize); + to_write = abd_alloc_for_io(asize, ismd); + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + if (size != asize) + abd_zero_off(to_write, size, asize - size); + goto out; + } + + if (compress != ZIO_COMPRESS_OFF && !HDR_COMPRESSION_ENABLED(hdr)) { + cabd = abd_alloc_for_io(asize, ismd); + tmp = abd_borrow_buf(cabd, asize); + + psize = zio_compress_data(compress, to_write, tmp, size, + hdr->b_complevel); + + if (psize >= size) { + abd_return_buf(cabd, tmp, asize); + HDR_SET_COMPRESS(hdr, ZIO_COMPRESS_OFF); + to_write = cabd; + abd_copy(to_write, hdr->b_l1hdr.b_pabd, size); + if (size != asize) + abd_zero_off(to_write, size, asize - size); + goto encrypt; + } + ASSERT3U(psize, <=, HDR_GET_PSIZE(hdr)); + if (psize < asize) + bzero((char *)tmp + psize, asize - psize); + psize = HDR_GET_PSIZE(hdr); + abd_return_buf_copy(cabd, tmp, asize); + to_write = cabd; + } + +encrypt: + if (HDR_ENCRYPTED(hdr)) { + eabd = abd_alloc_for_io(asize, ismd); + + /* + * If the dataset was disowned before the buffer + * made it to this point, the key to re-encrypt + * it won't be available. In this case we simply + * won't write the buffer to the L2ARC. 
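+		 * The resulting error is simply propagated to the caller;
+		 * l2arc_write_buffers() responds by skipping this header
+		 * (clearing ARC_FLAG_L2_WRITING and moving on) rather than
+		 * failing the whole write.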
+ */ + ret = spa_keystore_lookup_key(spa, hdr->b_crypt_hdr.b_dsobj, + FTAG, &dck); + if (ret != 0) + goto error; + + ret = zio_do_crypt_abd(B_TRUE, &dck->dck_key, + hdr->b_crypt_hdr.b_ot, bswap, hdr->b_crypt_hdr.b_salt, + hdr->b_crypt_hdr.b_iv, mac, psize, to_write, eabd, + &no_crypt); + if (ret != 0) + goto error; + + if (no_crypt) + abd_copy(eabd, to_write, psize); + + if (psize != asize) + abd_zero_off(eabd, psize, asize - psize); + + /* assert that the MAC we got here matches the one we saved */ + ASSERT0(bcmp(mac, hdr->b_crypt_hdr.b_mac, ZIO_DATA_MAC_LEN)); + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + if (to_write == cabd) + abd_free(cabd); + + to_write = eabd; + } + +out: + ASSERT3P(to_write, !=, hdr->b_l1hdr.b_pabd); + *abd_out = to_write; + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + if (cabd != NULL) + abd_free(cabd); + if (eabd != NULL) + abd_free(eabd); + + *abd_out = NULL; + return (ret); +} + +static void +l2arc_blk_fetch_done(zio_t *zio) +{ + l2arc_read_callback_t *cb; + + cb = zio->io_private; + if (cb->l2rcb_abd != NULL) + abd_put(cb->l2rcb_abd); + kmem_free(cb, sizeof (l2arc_read_callback_t)); +} + +/* + * Find and write ARC buffers to the L2ARC device. + * + * An ARC_FLAG_L2_WRITING flag is set so that the L2ARC buffers are not valid + * for reading until they have completed writing. + * The headroom_boost is an in-out parameter used to maintain headroom boost + * state between calls to this function. + * + * Returns the number of bytes actually written (which may be smaller than + * the delta by which the device hand has changed due to alignment and the + * writing of log blocks). + */ +static uint64_t +l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) +{ + arc_buf_hdr_t *hdr, *hdr_prev, *head; + uint64_t write_asize, write_psize, write_lsize, headroom; + boolean_t full; + l2arc_write_callback_t *cb = NULL; + zio_t *pio, *wzio; + uint64_t guid = spa_load_guid(spa); + + ASSERT3P(dev->l2ad_vdev, !=, NULL); + + pio = NULL; + write_lsize = write_asize = write_psize = 0; + full = B_FALSE; + head = kmem_cache_alloc(hdr_l2only_cache, KM_PUSHPAGE); + arc_hdr_set_flags(head, ARC_FLAG_L2_WRITE_HEAD | ARC_FLAG_HAS_L2HDR); + + /* + * Copy buffers for L2ARC writing. + */ + for (int try = 0; try < L2ARC_FEED_TYPES; try++) { + multilist_sublist_t *mls = l2arc_sublist_lock(try); + uint64_t passed_sz = 0; + + VERIFY3P(mls, !=, NULL); + + /* + * L2ARC fast warmup. + * + * Until the ARC is warm and starts to evict, read from the + * head of the ARC lists rather than the tail. + */ + if (arc_warm == B_FALSE) + hdr = multilist_sublist_head(mls); + else + hdr = multilist_sublist_tail(mls); + + headroom = target_sz * l2arc_headroom; + if (zfs_compressed_arc_enabled) + headroom = (headroom * l2arc_headroom_boost) / 100; + + for (; hdr; hdr = hdr_prev) { + kmutex_t *hash_lock; + abd_t *to_write = NULL; + + if (arc_warm == B_FALSE) + hdr_prev = multilist_sublist_next(mls, hdr); + else + hdr_prev = multilist_sublist_prev(mls, hdr); + + hash_lock = HDR_LOCK(hdr); + if (!mutex_tryenter(hash_lock)) { + /* + * Skip this buffer rather than waiting. + */ + continue; + } + + passed_sz += HDR_GET_LSIZE(hdr); + if (l2arc_headroom != 0 && passed_sz > headroom) { + /* + * Searched too far. 
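+				 * passed_sz has exceeded the headroom, i.e.
+				 * the amount of each list we are willing to
+				 * scan per feed cycle (a multiple of the
+				 * target write size). Anything deeper in the
+				 * list is left for a later iteration.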
+ */ + mutex_exit(hash_lock); + break; + } + + if (!l2arc_write_eligible(guid, hdr)) { + mutex_exit(hash_lock); + continue; + } + + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. + */ + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT3U(arc_hdr_size(hdr), >, 0); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + uint64_t psize = HDR_GET_PSIZE(hdr); + uint64_t asize = vdev_psize_to_asize(dev->l2ad_vdev, + psize); + + if ((write_asize + asize) > target_sz) { + full = B_TRUE; + mutex_exit(hash_lock); + break; + } + + /* + * We rely on the L1 portion of the header below, so + * it's invalid for this header to have been evicted out + * of the ghost cache, prior to being written out. The + * ARC_FLAG_L2_WRITING bit ensures this won't happen. + */ + arc_hdr_set_flags(hdr, ARC_FLAG_L2_WRITING); + ASSERT(HDR_HAS_L1HDR(hdr)); + + ASSERT3U(HDR_GET_PSIZE(hdr), >, 0); + ASSERT(hdr->b_l1hdr.b_pabd != NULL || + HDR_HAS_RABD(hdr)); + ASSERT3U(arc_hdr_size(hdr), >, 0); + + /* + * If this header has b_rabd, we can use this since it + * must always match the data exactly as it exists on + * disk. Otherwise, the L2ARC can normally use the + * hdr's data, but if we're sharing data between the + * hdr and one of its bufs, L2ARC needs its own copy of + * the data so that the ZIO below can't race with the + * buf consumer. To ensure that this copy will be + * available for the lifetime of the ZIO and be cleaned + * up afterwards, we add it to the l2arc_free_on_write + * queue. If we need to apply any transforms to the + * data (compression, encryption) we will also need the + * extra buffer. + */ + if (HDR_HAS_RABD(hdr) && psize == asize) { + to_write = hdr->b_crypt_hdr.b_rabd; + } else if ((HDR_COMPRESSION_ENABLED(hdr) || + HDR_GET_COMPRESS(hdr) == ZIO_COMPRESS_OFF) && + !HDR_ENCRYPTED(hdr) && !HDR_SHARED_DATA(hdr) && + psize == asize) { + to_write = hdr->b_l1hdr.b_pabd; + } else { + int ret; + arc_buf_contents_t type = arc_buf_type(hdr); + + ret = l2arc_apply_transforms(spa, hdr, asize, + &to_write); + if (ret != 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_L2_WRITING); + mutex_exit(hash_lock); + continue; + } + + l2arc_free_abd_on_write(to_write, asize, type); + } + + if (pio == NULL) { + /* + * Insert a dummy header on the buflist so + * l2arc_write_done() can find where the + * write buffers begin without searching. + */ + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, head); + mutex_exit(&dev->l2ad_mtx); + + cb = kmem_alloc( + sizeof (l2arc_write_callback_t), KM_SLEEP); + cb->l2wcb_dev = dev; + cb->l2wcb_head = head; + /* + * Create a list to save allocated abd buffers + * for l2arc_log_blk_commit(). 
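+				 * The buffers on this list are later freed
+				 * in l2arc_write_done() once the root zio
+				 * has completed.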
+ */ + list_create(&cb->l2wcb_abd_list, + sizeof (l2arc_lb_abd_buf_t), + offsetof(l2arc_lb_abd_buf_t, node)); + pio = zio_root(spa, l2arc_write_done, cb, + ZIO_FLAG_CANFAIL); + } + + hdr->b_l2hdr.b_dev = dev; + hdr->b_l2hdr.b_hits = 0; + + hdr->b_l2hdr.b_daddr = dev->l2ad_hand; + arc_hdr_set_flags(hdr, ARC_FLAG_HAS_L2HDR); + + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_buflist, hdr); + mutex_exit(&dev->l2ad_mtx); + + (void) zfs_refcount_add_many(&dev->l2ad_alloc, + arc_hdr_size(hdr), hdr); + + wzio = zio_write_phys(pio, dev->l2ad_vdev, + hdr->b_l2hdr.b_daddr, asize, to_write, + ZIO_CHECKSUM_OFF, NULL, hdr, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_CANFAIL, B_FALSE); + + write_lsize += HDR_GET_LSIZE(hdr); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, + zio_t *, wzio); + + write_psize += psize; + write_asize += asize; + dev->l2ad_hand += asize; + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + mutex_exit(hash_lock); + + /* + * Append buf info to current log and commit if full. + * arcstat_l2_{size,asize} kstats are updated + * internally. + */ + if (l2arc_log_blk_insert(dev, hdr)) + l2arc_log_blk_commit(dev, pio, cb); + + zio_nowait(wzio); + } + + multilist_sublist_unlock(mls); + + if (full == B_TRUE) + break; + } + + /* No buffers selected for writing? */ + if (pio == NULL) { + ASSERT0(write_lsize); + ASSERT(!HDR_HAS_L1HDR(head)); + kmem_cache_free(hdr_l2only_cache, head); + + /* + * Although we did not write any buffers l2ad_evict may + * have advanced. + */ + l2arc_dev_hdr_update(dev); + + return (0); + } + + if (!dev->l2ad_first) + ASSERT3U(dev->l2ad_hand, <=, dev->l2ad_evict); + + ASSERT3U(write_asize, <=, target_sz); + ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_psize); + ARCSTAT_INCR(arcstat_l2_lsize, write_lsize); + ARCSTAT_INCR(arcstat_l2_psize, write_psize); + + dev->l2ad_writing = B_TRUE; + (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + /* + * Update the device header after the zio completes as + * l2arc_write_done() may have updated the memory holding the log block + * pointers in the device header. + */ + l2arc_dev_hdr_update(dev); + + return (write_asize); +} + +/* + * This thread feeds the L2ARC at regular intervals. This is the beating + * heart of the L2ARC. + */ +/* ARGSUSED */ +static void +l2arc_feed_thread(void *unused) +{ + callb_cpr_t cpr; + l2arc_dev_t *dev; + spa_t *spa; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); + fstrans_cookie_t cookie; + + CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); + + mutex_enter(&l2arc_feed_thr_lock); + + cookie = spl_fstrans_mark(); + while (l2arc_thread_exit == 0) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_sig(&l2arc_feed_thr_cv, + &l2arc_feed_thr_lock, next); + CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; + + /* + * Quick check for L2ARC devices. + */ + mutex_enter(&l2arc_dev_mtx); + if (l2arc_ndev == 0) { + mutex_exit(&l2arc_dev_mtx); + continue; + } + mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); + + /* + * This selects the next l2arc device to write to, and in + * doing so the next spa to feed from: dev->l2ad_spa. This + * will return NULL if there are now no l2arc devices or if + * they are all faulted. + * + * If a device is returned, its spa's config lock is also + * held to prevent device removal. l2arc_dev_get_next() + * will grab and release l2arc_dev_mtx. 
+ */ + if ((dev = l2arc_dev_get_next()) == NULL) + continue; + + spa = dev->l2ad_spa; + ASSERT3P(spa, !=, NULL); + + /* + * If the pool is read-only then force the feed thread to + * sleep a little longer. + */ + if (!spa_writeable(spa)) { + next = ddi_get_lbolt() + 5 * l2arc_feed_secs * hz; + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + /* + * Avoid contributing to memory pressure. + */ + if (arc_reclaim_needed()) { + ARCSTAT_BUMP(arcstat_l2_abort_lowmem); + spa_config_exit(spa, SCL_L2ARC, dev); + continue; + } + + ARCSTAT_BUMP(arcstat_l2_feeds); + + size = l2arc_write_size(dev); + + /* + * Evict L2ARC buffers that will be overwritten. + */ + l2arc_evict(dev, size, B_FALSE); + + /* + * Write ARC buffers. + */ + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); + spa_config_exit(spa, SCL_L2ARC, dev); + } + spl_fstrans_unmark(cookie); + + l2arc_thread_exit = 0; + cv_broadcast(&l2arc_feed_thr_cv); + CALLB_CPR_EXIT(&cpr); /* drops l2arc_feed_thr_lock */ + thread_exit(); +} + +boolean_t +l2arc_vdev_present(vdev_t *vd) +{ + return (l2arc_vdev_get(vd) != NULL); +} + +/* + * Returns the l2arc_dev_t associated with a particular vdev_t or NULL if + * the vdev_t isn't an L2ARC device. + */ +l2arc_dev_t * +l2arc_vdev_get(vdev_t *vd) +{ + l2arc_dev_t *dev; + + mutex_enter(&l2arc_dev_mtx); + for (dev = list_head(l2arc_dev_list); dev != NULL; + dev = list_next(l2arc_dev_list, dev)) { + if (dev->l2ad_vdev == vd) + break; + } + mutex_exit(&l2arc_dev_mtx); + + return (dev); +} + +/* + * Add a vdev for use by the L2ARC. By this point the spa has already + * validated the vdev and opened it. + */ +void +l2arc_add_vdev(spa_t *spa, vdev_t *vd) +{ + l2arc_dev_t *adddev; + uint64_t l2dhdr_asize; + + ASSERT(!l2arc_vdev_present(vd)); + + vdev_ashift_optimize(vd); + + /* + * Create a new l2arc device entry. + */ + adddev = vmem_zalloc(sizeof (l2arc_dev_t), KM_SLEEP); + adddev->l2ad_spa = spa; + adddev->l2ad_vdev = vd; + /* leave extra size for an l2arc device header */ + l2dhdr_asize = adddev->l2ad_dev_hdr_asize = + MAX(sizeof (*adddev->l2ad_dev_hdr), 1 << vd->vdev_ashift); + adddev->l2ad_start = VDEV_LABEL_START_SIZE + l2dhdr_asize; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); + ASSERT3U(adddev->l2ad_start, <, adddev->l2ad_end); + adddev->l2ad_hand = adddev->l2ad_start; + adddev->l2ad_evict = adddev->l2ad_start; + adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; + adddev->l2ad_trim_all = B_FALSE; + list_link_init(&adddev->l2ad_node); + adddev->l2ad_dev_hdr = kmem_zalloc(l2dhdr_asize, KM_SLEEP); + + mutex_init(&adddev->l2ad_mtx, NULL, MUTEX_DEFAULT, NULL); + /* + * This is a list of all ARC buffers that are still valid on the + * device. + */ + list_create(&adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), + offsetof(arc_buf_hdr_t, b_l2hdr.b_l2node)); + + /* + * This is a list of pointers to log blocks that are still present + * on the device. 
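+	 * It is populated by l2arc_log_blk_commit() and, at import time,
+	 * by l2arc_rebuild(); l2arc_evict() consumes it to account for the
+	 * space of evicted log blocks.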
+ */ + list_create(&adddev->l2ad_lbptr_list, sizeof (l2arc_lb_ptr_buf_t), + offsetof(l2arc_lb_ptr_buf_t, node)); + + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); + zfs_refcount_create(&adddev->l2ad_alloc); + zfs_refcount_create(&adddev->l2ad_lb_asize); + zfs_refcount_create(&adddev->l2ad_lb_count); + + /* + * Add device to global list + */ + mutex_enter(&l2arc_dev_mtx); + list_insert_head(l2arc_dev_list, adddev); + atomic_inc_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Decide if vdev is eligible for L2ARC rebuild + */ + l2arc_rebuild_vdev(adddev->l2ad_vdev, B_FALSE); +} + +void +l2arc_rebuild_vdev(vdev_t *vd, boolean_t reopen) +{ + l2arc_dev_t *dev = NULL; + l2arc_dev_hdr_phys_t *l2dhdr; + uint64_t l2dhdr_asize; + spa_t *spa; + int err; + boolean_t l2dhdr_valid = B_TRUE; + + dev = l2arc_vdev_get(vd); + ASSERT3P(dev, !=, NULL); + spa = dev->l2ad_spa; + l2dhdr = dev->l2ad_dev_hdr; + l2dhdr_asize = dev->l2ad_dev_hdr_asize; + + /* + * The L2ARC has to hold at least the payload of one log block for + * them to be restored (persistent L2ARC). The payload of a log block + * depends on the amount of its log entries. We always write log blocks + * with 1022 entries. How many of them are committed or restored depends + * on the size of the L2ARC device. Thus the maximum payload of + * one log block is 1022 * SPA_MAXBLOCKSIZE = 16GB. If the L2ARC device + * is less than that, we reduce the amount of committed and restored + * log entries per block so as to enable persistence. + */ + if (dev->l2ad_end < l2arc_rebuild_blocks_min_l2size) { + dev->l2ad_log_entries = 0; + } else { + dev->l2ad_log_entries = MIN((dev->l2ad_end - + dev->l2ad_start) >> SPA_MAXBLOCKSHIFT, + L2ARC_LOG_BLK_MAX_ENTRIES); + } + + /* + * Read the device header, if an error is returned do not rebuild L2ARC. + */ + if ((err = l2arc_dev_hdr_read(dev)) != 0) + l2dhdr_valid = B_FALSE; + + if (l2dhdr_valid && dev->l2ad_log_entries > 0) { + /* + * If we are onlining a cache device (vdev_reopen) that was + * still present (l2arc_vdev_present()) and rebuild is enabled, + * we should evict all ARC buffers and pointers to log blocks + * and reclaim their space before restoring its contents to + * L2ARC. + */ + if (reopen) { + if (!l2arc_rebuild_enabled) { + return; + } else { + l2arc_evict(dev, 0, B_TRUE); + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; + } + } + /* + * Just mark the device as pending for a rebuild. We won't + * be starting a rebuild in line here as it would block pool + * import. Instead spa_load_impl will hand that off to an + * async task which will call l2arc_spa_rebuild_start. + */ + dev->l2ad_rebuild = B_TRUE; + } else if (spa_writeable(spa)) { + /* + * In this case TRIM the whole device if l2arc_trim_ahead > 0, + * otherwise create a new header. We zero out the memory holding + * the header to reset dh_start_lbps. If we TRIM the whole + * device the new header will be written by + * vdev_trim_l2arc_thread() at the end of the TRIM to update the + * trim_state in the header too. When reading the header, if + * trim_state is not VDEV_TRIM_COMPLETE and l2arc_trim_ahead > 0 + * we opt to TRIM the whole device again. + */ + if (l2arc_trim_ahead > 0) { + dev->l2ad_trim_all = B_TRUE; + } else { + bzero(l2dhdr, l2dhdr_asize); + l2arc_dev_hdr_update(dev); + } + } +} + +/* + * Remove a vdev from the L2ARC. 
+ */ +void +l2arc_remove_vdev(vdev_t *vd) +{ + l2arc_dev_t *remdev = NULL; + + /* + * Find the device by vdev + */ + remdev = l2arc_vdev_get(vd); + ASSERT3P(remdev, !=, NULL); + + /* + * Cancel any ongoing or scheduled rebuild. + */ + mutex_enter(&l2arc_rebuild_thr_lock); + if (remdev->l2ad_rebuild_began == B_TRUE) { + remdev->l2ad_rebuild_cancel = B_TRUE; + while (remdev->l2ad_rebuild == B_TRUE) + cv_wait(&l2arc_rebuild_thr_cv, &l2arc_rebuild_thr_lock); + } + mutex_exit(&l2arc_rebuild_thr_lock); + + /* + * Remove device from global list + */ + mutex_enter(&l2arc_dev_mtx); + list_remove(l2arc_dev_list, remdev); + l2arc_dev_last = NULL; /* may have been invalidated */ + atomic_dec_64(&l2arc_ndev); + mutex_exit(&l2arc_dev_mtx); + + /* + * Clear all buflists and ARC references. L2ARC device flush. + */ + l2arc_evict(remdev, 0, B_TRUE); + list_destroy(&remdev->l2ad_buflist); + ASSERT(list_is_empty(&remdev->l2ad_lbptr_list)); + list_destroy(&remdev->l2ad_lbptr_list); + mutex_destroy(&remdev->l2ad_mtx); + zfs_refcount_destroy(&remdev->l2ad_alloc); + zfs_refcount_destroy(&remdev->l2ad_lb_asize); + zfs_refcount_destroy(&remdev->l2ad_lb_count); + kmem_free(remdev->l2ad_dev_hdr, remdev->l2ad_dev_hdr_asize); + vmem_free(remdev, sizeof (l2arc_dev_t)); +} + +void +l2arc_init(void) +{ + l2arc_thread_exit = 0; + l2arc_ndev = 0; + l2arc_writes_sent = 0; + l2arc_writes_done = 0; + + mutex_init(&l2arc_feed_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_feed_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_rebuild_thr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&l2arc_rebuild_thr_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&l2arc_dev_mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&l2arc_free_on_write_mtx, NULL, MUTEX_DEFAULT, NULL); + + l2arc_dev_list = &L2ARC_dev_list; + l2arc_free_on_write = &L2ARC_free_on_write; + list_create(l2arc_dev_list, sizeof (l2arc_dev_t), + offsetof(l2arc_dev_t, l2ad_node)); + list_create(l2arc_free_on_write, sizeof (l2arc_data_free_t), + offsetof(l2arc_data_free_t, l2df_list_node)); +} + +void +l2arc_fini(void) +{ + mutex_destroy(&l2arc_feed_thr_lock); + cv_destroy(&l2arc_feed_thr_cv); + mutex_destroy(&l2arc_rebuild_thr_lock); + cv_destroy(&l2arc_rebuild_thr_cv); + mutex_destroy(&l2arc_dev_mtx); + mutex_destroy(&l2arc_free_on_write_mtx); + + list_destroy(l2arc_dev_list); + list_destroy(l2arc_free_on_write); +} + +void +l2arc_start(void) +{ + if (!(spa_mode_global & SPA_MODE_WRITE)) + return; + + (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, + TS_RUN, defclsyspri); +} + +void +l2arc_stop(void) +{ + if (!(spa_mode_global & SPA_MODE_WRITE)) + return; + + mutex_enter(&l2arc_feed_thr_lock); + cv_signal(&l2arc_feed_thr_cv); /* kick thread out of startup */ + l2arc_thread_exit = 1; + while (l2arc_thread_exit != 0) + cv_wait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock); + mutex_exit(&l2arc_feed_thr_lock); +} + +/* + * Punches out rebuild threads for the L2ARC devices in a spa. This should + * be called after pool import from the spa async thread, since starting + * these threads directly from spa_import() will make them part of the + * "zpool import" context and delay process exit (and thus pool import). + */ +void +l2arc_spa_rebuild_start(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off rebuild threads. 
+ */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + l2arc_dev_t *dev = + l2arc_vdev_get(spa->spa_l2cache.sav_vdevs[i]); + if (dev == NULL) { + /* Don't attempt a rebuild if the vdev is UNAVAIL */ + continue; + } + mutex_enter(&l2arc_rebuild_thr_lock); + if (dev->l2ad_rebuild && !dev->l2ad_rebuild_cancel) { + dev->l2ad_rebuild_began = B_TRUE; + (void) thread_create(NULL, 0, l2arc_dev_rebuild_thread, + dev, 0, &p0, TS_RUN, minclsyspri); + } + mutex_exit(&l2arc_rebuild_thr_lock); + } +} + +/* + * Main entry point for L2ARC rebuilding. + */ +static void +l2arc_dev_rebuild_thread(void *arg) +{ + l2arc_dev_t *dev = arg; + + VERIFY(!dev->l2ad_rebuild_cancel); + VERIFY(dev->l2ad_rebuild); + (void) l2arc_rebuild(dev); + mutex_enter(&l2arc_rebuild_thr_lock); + dev->l2ad_rebuild_began = B_FALSE; + dev->l2ad_rebuild = B_FALSE; + mutex_exit(&l2arc_rebuild_thr_lock); + + thread_exit(); +} + +/* + * This function implements the actual L2ARC metadata rebuild. It: + * starts reading the log block chain and restores each block's contents + * to memory (reconstructing arc_buf_hdr_t's). + * + * Operation stops under any of the following conditions: + * + * 1) We reach the end of the log block chain. + * 2) We encounter *any* error condition (cksum errors, io errors) + */ +static int +l2arc_rebuild(l2arc_dev_t *dev) +{ + vdev_t *vd = dev->l2ad_vdev; + spa_t *spa = vd->vdev_spa; + int err = 0; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + l2arc_log_blk_phys_t *this_lb, *next_lb; + zio_t *this_io = NULL, *next_io = NULL; + l2arc_log_blkptr_t lbps[2]; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + boolean_t lock_held; + + this_lb = vmem_zalloc(sizeof (*this_lb), KM_SLEEP); + next_lb = vmem_zalloc(sizeof (*next_lb), KM_SLEEP); + + /* + * We prevent device removal while issuing reads to the device, + * then during the rebuilding phases we drop this lock again so + * that a spa_unload or device remove can be initiated - this is + * safe, because the spa will signal us to stop before removing + * our device and wait for us to stop. + */ + spa_config_enter(spa, SCL_L2ARC, vd, RW_READER); + lock_held = B_TRUE; + + /* + * Retrieve the persistent L2ARC device state. + * L2BLK_GET_PSIZE returns aligned size for log blocks. + */ + dev->l2ad_evict = MAX(l2dhdr->dh_evict, dev->l2ad_start); + dev->l2ad_hand = MAX(l2dhdr->dh_start_lbps[0].lbp_daddr + + L2BLK_GET_PSIZE((&l2dhdr->dh_start_lbps[0])->lbp_prop), + dev->l2ad_start); + dev->l2ad_first = !!(l2dhdr->dh_flags & L2ARC_DEV_HDR_EVICT_FIRST); + + vd->vdev_trim_action_time = l2dhdr->dh_trim_action_time; + vd->vdev_trim_state = l2dhdr->dh_trim_state; + + /* + * In case the zfs module parameter l2arc_rebuild_enabled is false + * we do not start the rebuild process. + */ + if (!l2arc_rebuild_enabled) + goto out; + + /* Prepare the rebuild process */ + bcopy(l2dhdr->dh_start_lbps, lbps, sizeof (lbps)); + + /* Start the rebuild process */ + for (;;) { + if (!l2arc_log_blkptr_valid(dev, &lbps[0])) + break; + + if ((err = l2arc_log_blk_read(dev, &lbps[0], &lbps[1], + this_lb, next_lb, this_io, &next_io)) != 0) + goto out; + + /* + * Our memory pressure valve. If the system is running low + * on memory, rather than swamping memory with new ARC buf + * hdrs, we opt not to rebuild the L2ARC. At this point, + * however, we have already set up our L2ARC dev to chain in + * new metadata log blocks, so the user may choose to offline/ + * online the L2ARC dev at a later time (or re-import the pool) + * to reconstruct it (when there's less memory pressure). 
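+		 * Aborting here is safe: the headers restored so far remain
+		 * in ARC, only the remainder of the log block chain is left
+		 * unrestored.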
+ */
+		if (arc_reclaim_needed()) {
+			ARCSTAT_BUMP(arcstat_l2_rebuild_abort_lowmem);
+			cmn_err(CE_NOTE, "System running low on memory, "
+			    "aborting L2ARC rebuild.");
+			err = SET_ERROR(ENOMEM);
+			goto out;
+		}
+
+		spa_config_exit(spa, SCL_L2ARC, vd);
+		lock_held = B_FALSE;
+
+		/*
+		 * Now that we know that the next_lb checks out alright, we
+		 * can start reconstruction from this log block.
+		 * L2BLK_GET_PSIZE returns aligned size for log blocks.
+		 */
+		uint64_t asize = L2BLK_GET_PSIZE((&lbps[0])->lbp_prop);
+		l2arc_log_blk_restore(dev, this_lb, asize, lbps[0].lbp_daddr);
+
+		/*
+		 * Log block restored; include its pointer in the list of
+		 * pointers to log blocks present in the L2ARC device.
+		 */
+		lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP);
+		lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t),
+		    KM_SLEEP);
+		bcopy(&lbps[0], lb_ptr_buf->lb_ptr,
+		    sizeof (l2arc_log_blkptr_t));
+		mutex_enter(&dev->l2ad_mtx);
+		list_insert_tail(&dev->l2ad_lbptr_list, lb_ptr_buf);
+		ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize);
+		ARCSTAT_BUMP(arcstat_l2_log_blk_count);
+		zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf);
+		zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf);
+		mutex_exit(&dev->l2ad_mtx);
+		vdev_space_update(vd, asize, 0, 0);
+
+		/*
+		 * Protection against loops of log blocks:
+		 *
+		 *                  l2ad_hand              l2ad_evict
+		 *                      V                      V
+		 * l2ad_start |=======================================| l2ad_end
+		 *             -----|||----|||---|||----|||
+		 *                  (3)    (2)   (1)    (0)
+		 *             ---|||---|||----|||---|||
+		 *                (7)   (6)    (5)   (4)
+		 *
+		 * In this situation the pointer of log block (4) passes
+		 * l2arc_log_blkptr_valid() but the log block should not be
+		 * restored as it is overwritten by the payload of log block
+		 * (0). Only log blocks (0)-(3) should be restored. We check
+		 * whether l2ad_evict lies in between the payload starting
+		 * offset of the next log block (lbps[1].lbp_payload_start)
+		 * and the payload starting offset of the present log block
+		 * (lbps[0].lbp_payload_start). If true and this isn't the
+		 * first pass, we are looping from the beginning and we should
+		 * stop.
+		 */
+		if (l2arc_range_check_overlap(lbps[1].lbp_payload_start,
+		    lbps[0].lbp_payload_start, dev->l2ad_evict) &&
+		    !dev->l2ad_first)
+			goto out;
+
+		for (;;) {
+			mutex_enter(&l2arc_rebuild_thr_lock);
+			if (dev->l2ad_rebuild_cancel) {
+				dev->l2ad_rebuild = B_FALSE;
+				cv_signal(&l2arc_rebuild_thr_cv);
+				mutex_exit(&l2arc_rebuild_thr_lock);
+				err = SET_ERROR(ECANCELED);
+				goto out;
+			}
+			mutex_exit(&l2arc_rebuild_thr_lock);
+			if (spa_config_tryenter(spa, SCL_L2ARC, vd,
+			    RW_READER)) {
+				lock_held = B_TRUE;
+				break;
+			}
+			/*
+			 * L2ARC config lock held by somebody as writer,
+			 * possibly due to them trying to remove us. They'll
+			 * likely want us to shut down, so after a little
+			 * delay, we check l2ad_rebuild_cancel and retry
+			 * the lock again.
+			 */
+			delay(1);
+		}
+
+		/*
+		 * Continue with the next log block.
+ */ + lbps[0] = lbps[1]; + lbps[1] = this_lb->lb_prev_lbp; + PTR_SWAP(this_lb, next_lb); + this_io = next_io; + next_io = NULL; + } + + if (this_io != NULL) + l2arc_log_blk_fetch_abort(this_io); +out: + if (next_io != NULL) + l2arc_log_blk_fetch_abort(next_io); + vmem_free(this_lb, sizeof (*this_lb)); + vmem_free(next_lb, sizeof (*next_lb)); + + if (!l2arc_rebuild_enabled) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "disabled"); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) > 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_success); + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "successful, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err == 0 && zfs_refcount_count(&dev->l2ad_lb_count) == 0) { + /* + * No error but also nothing restored, meaning the lbps array + * in the device header points to invalid/non-present log + * blocks. Reset the header. + */ + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "no valid log blocks"); + bzero(l2dhdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + } else if (err == ECANCELED) { + /* + * In case the rebuild was canceled do not log to spa history + * log as the pool may be in the process of being removed. + */ + zfs_dbgmsg("L2ARC rebuild aborted, restored %llu blocks", + zfs_refcount_count(&dev->l2ad_lb_count)); + } else if (err != 0) { + spa_history_log_internal(spa, "L2ARC rebuild", NULL, + "aborted, restored %llu blocks", + (u_longlong_t)zfs_refcount_count(&dev->l2ad_lb_count)); + } + + if (lock_held) + spa_config_exit(spa, SCL_L2ARC, vd); + + return (err); +} + +/* + * Attempts to read the device header on the provided L2ARC device and writes + * it to `hdr'. On success, this function returns 0, otherwise the appropriate + * error code is returned. + */ +static int +l2arc_dev_hdr_read(l2arc_dev_t *dev) +{ + int err; + uint64_t guid; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + + guid = spa_guid(dev->l2ad_vdev->vdev_spa); + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_read_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, + ZIO_CHECKSUM_LABEL, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_SPECULATIVE, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_dh_errors); + zfs_dbgmsg("L2ARC IO error (%d) while reading device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + return (err); + } + + if (l2dhdr->dh_magic == BSWAP_64(L2ARC_DEV_HDR_MAGIC)) + byteswap_uint64_array(l2dhdr, sizeof (*l2dhdr)); + + if (l2dhdr->dh_magic != L2ARC_DEV_HDR_MAGIC || + l2dhdr->dh_spa_guid != guid || + l2dhdr->dh_vdev_guid != dev->l2ad_vdev->vdev_guid || + l2dhdr->dh_version != L2ARC_PERSISTENT_VERSION || + l2dhdr->dh_log_entries != dev->l2ad_log_entries || + l2dhdr->dh_end != dev->l2ad_end || + !l2arc_range_check_overlap(dev->l2ad_start, dev->l2ad_end, + l2dhdr->dh_evict) || + (l2dhdr->dh_trim_state != VDEV_TRIM_COMPLETE && + l2arc_trim_ahead > 0)) { + /* + * Attempt to rebuild a device containing no actual dev hdr + * or containing a header from some other pool or from another + * version of persistent L2ARC. + */ + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_unsupported); + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +/* + * Reads L2ARC log blocks from storage and validates their contents. 
+ *
+ * This function implements a simple fetcher to make sure that while
+ * we're processing one buffer the L2ARC is already fetching the next
+ * one in the chain.
+ *
+ * The arguments this_lbp and next_lbp point to the current and next log block
+ * address in the block chain. Similarly, this_lb and next_lb hold the
+ * l2arc_log_blk_phys_t's of the current and next L2ARC blk.
+ *
+ * The `this_io' and `next_io' arguments are used for block fetching.
+ * When issuing the first blk IO during rebuild, you should pass NULL for
+ * `this_io'. This function will then issue a sync IO to read the block and
+ * also issue an async IO to fetch the next block in the block chain. The
+ * fetched IO is returned in `next_io'. On subsequent calls to this
+ * function, pass the value returned in `next_io' from the previous call
+ * as `this_io' and a fresh `next_io' pointer to hold the next fetch IO.
+ * Prior to the call, you should initialize your `next_io' pointer to be
+ * NULL. If no fetch IO was issued, the pointer is left set at NULL.
+ *
+ * On success, this function returns 0, otherwise it returns an appropriate
+ * error code. On error the fetching IO is aborted and cleared before
+ * returning from this function. Therefore, if we return `success', the
+ * caller can assume that we have taken care of cleanup of fetch IOs.
+ */
+static int
+l2arc_log_blk_read(l2arc_dev_t *dev,
+    const l2arc_log_blkptr_t *this_lbp, const l2arc_log_blkptr_t *next_lbp,
+    l2arc_log_blk_phys_t *this_lb, l2arc_log_blk_phys_t *next_lb,
+    zio_t *this_io, zio_t **next_io)
+{
+	int err = 0;
+	zio_cksum_t cksum;
+	abd_t *abd = NULL;
+	uint64_t asize;
+
+	ASSERT(this_lbp != NULL && next_lbp != NULL);
+	ASSERT(this_lb != NULL && next_lb != NULL);
+	ASSERT(next_io != NULL && *next_io == NULL);
+	ASSERT(l2arc_log_blkptr_valid(dev, this_lbp));
+
+	/*
+	 * Check to see if we have issued the IO for this log block in a
+	 * previous run. If not, this is the first call, so issue it now.
+	 */
+	if (this_io == NULL) {
+		this_io = l2arc_log_blk_fetch(dev->l2ad_vdev, this_lbp,
+		    this_lb);
+	}
+
+	/*
+	 * Peek to see if we can start issuing the next IO immediately.
+	 */
+	if (l2arc_log_blkptr_valid(dev, next_lbp)) {
+		/*
+		 * Start issuing IO for the next log block early - this
+		 * should help keep the L2ARC device busy while we
+		 * decompress and restore this log block.
+		 */
+		*next_io = l2arc_log_blk_fetch(dev->l2ad_vdev, next_lbp,
+		    next_lb);
+	}
+
+	/* Wait for the IO to read this log block to complete */
+	if ((err = zio_wait(this_io)) != 0) {
+		ARCSTAT_BUMP(arcstat_l2_rebuild_abort_io_errors);
+		zfs_dbgmsg("L2ARC IO error (%d) while reading log block, "
+		    "offset: %llu, vdev guid: %llu", err, this_lbp->lbp_daddr,
+		    dev->l2ad_vdev->vdev_guid);
+		goto cleanup;
+	}
+
+	/*
+	 * Make sure the buffer checks out.
+	 * L2BLK_GET_PSIZE returns aligned size for log blocks.
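+	 * The expected checksum is the fletcher-4 digest that
+	 * l2arc_log_blk_commit() stored in the log block pointer when the
+	 * block was written.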
+ */ + asize = L2BLK_GET_PSIZE((this_lbp)->lbp_prop); + fletcher_4_native(this_lb, asize, NULL, &cksum); + if (!ZIO_CHECKSUM_EQUAL(cksum, this_lbp->lbp_cksum)) { + ARCSTAT_BUMP(arcstat_l2_rebuild_abort_cksum_lb_errors); + zfs_dbgmsg("L2ARC log block cksum failed, offset: %llu, " + "vdev guid: %llu, l2ad_hand: %llu, l2ad_evict: %llu", + this_lbp->lbp_daddr, dev->l2ad_vdev->vdev_guid, + dev->l2ad_hand, dev->l2ad_evict); + err = SET_ERROR(ECKSUM); + goto cleanup; + } + + /* Now we can take our time decoding this buffer */ + switch (L2BLK_GET_COMPRESS((this_lbp)->lbp_prop)) { + case ZIO_COMPRESS_OFF: + break; + case ZIO_COMPRESS_LZ4: + abd = abd_alloc_for_io(asize, B_TRUE); + abd_copy_from_buf_off(abd, this_lb, 0, asize); + if ((err = zio_decompress_data( + L2BLK_GET_COMPRESS((this_lbp)->lbp_prop), + abd, this_lb, asize, sizeof (*this_lb), NULL)) != 0) { + err = SET_ERROR(EINVAL); + goto cleanup; + } + break; + default: + err = SET_ERROR(EINVAL); + goto cleanup; + } + if (this_lb->lb_magic == BSWAP_64(L2ARC_LOG_BLK_MAGIC)) + byteswap_uint64_array(this_lb, sizeof (*this_lb)); + if (this_lb->lb_magic != L2ARC_LOG_BLK_MAGIC) { + err = SET_ERROR(EINVAL); + goto cleanup; + } +cleanup: + /* Abort an in-flight fetch I/O in case of error */ + if (err != 0 && *next_io != NULL) { + l2arc_log_blk_fetch_abort(*next_io); + *next_io = NULL; + } + if (abd != NULL) + abd_free(abd); + return (err); +} + +/* + * Restores the payload of a log block to ARC. This creates empty ARC hdr + * entries which only contain an l2arc hdr, essentially restoring the + * buffers to their L2ARC evicted state. This function also updates space + * usage on the L2ARC vdev to make sure it tracks restored buffers. + */ +static void +l2arc_log_blk_restore(l2arc_dev_t *dev, const l2arc_log_blk_phys_t *lb, + uint64_t lb_asize, uint64_t lb_daddr) +{ + uint64_t size = 0, asize = 0; + uint64_t log_entries = dev->l2ad_log_entries; + + for (int i = log_entries - 1; i >= 0; i--) { + /* + * Restore goes in the reverse temporal direction to preserve + * correct temporal ordering of buffers in the l2ad_buflist. + * l2arc_hdr_restore also does a list_insert_tail instead of + * list_insert_head on the l2ad_buflist: + * + * LIST l2ad_buflist LIST + * HEAD <------ (time) ------ TAIL + * direction +-----+-----+-----+-----+-----+ direction + * of l2arc <== | buf | buf | buf | buf | buf | ===> of rebuild + * fill +-----+-----+-----+-----+-----+ + * ^ ^ + * | | + * | | + * l2arc_feed_thread l2arc_rebuild + * will place new bufs here restores bufs here + * + * During l2arc_rebuild() the device is not used by + * l2arc_feed_thread() as dev->l2ad_rebuild is set to true. + */ + size += L2BLK_GET_LSIZE((&lb->lb_entries[i])->le_prop); + asize += vdev_psize_to_asize(dev->l2ad_vdev, + L2BLK_GET_PSIZE((&lb->lb_entries[i])->le_prop)); + l2arc_hdr_restore(&lb->lb_entries[i], dev); + } + + /* + * Record rebuild stats: + * size Logical size of restored buffers in the L2ARC + * asize Aligned size of restored buffers in the L2ARC + */ + ARCSTAT_INCR(arcstat_l2_rebuild_size, size); + ARCSTAT_INCR(arcstat_l2_rebuild_asize, asize); + ARCSTAT_INCR(arcstat_l2_rebuild_bufs, log_entries); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, lb_asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, asize / lb_asize); + ARCSTAT_BUMP(arcstat_l2_rebuild_log_blks); +} + +/* + * Restores a single ARC buf hdr from a log entry. The ARC buffer is put + * into a state indicating that it has been evicted to L2ARC. 
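+ * If an identical buffer is already present in ARC (a hash collision with
+ * a precached buffer), the freshly restored header is discarded and the
+ * existing header is instead tagged with the L2ARC metadata.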
+ */
+static void
+l2arc_hdr_restore(const l2arc_log_ent_phys_t *le, l2arc_dev_t *dev)
+{
+	arc_buf_hdr_t *hdr, *exists;
+	kmutex_t *hash_lock;
+	arc_buf_contents_t type = L2BLK_GET_TYPE((le)->le_prop);
+	uint64_t asize;
+
+	/*
+	 * Do all the allocation before grabbing any locks, this lets us
+	 * sleep if memory is full and we don't have to deal with failed
+	 * allocations.
+	 */
+	hdr = arc_buf_alloc_l2only(L2BLK_GET_LSIZE((le)->le_prop), type,
+	    dev, le->le_dva, le->le_daddr,
+	    L2BLK_GET_PSIZE((le)->le_prop), le->le_birth,
+	    L2BLK_GET_COMPRESS((le)->le_prop), le->le_complevel,
+	    L2BLK_GET_PROTECTED((le)->le_prop),
+	    L2BLK_GET_PREFETCH((le)->le_prop));
+	asize = vdev_psize_to_asize(dev->l2ad_vdev,
+	    L2BLK_GET_PSIZE((le)->le_prop));
+
+	/*
+	 * vdev_space_update() has to be called before arc_hdr_destroy() to
+	 * avoid underflow since the latter also calls the former.
+	 */
+	vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+
+	ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(hdr));
+	ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(hdr));
+
+	mutex_enter(&dev->l2ad_mtx);
+	list_insert_tail(&dev->l2ad_buflist, hdr);
+	(void) zfs_refcount_add_many(&dev->l2ad_alloc, arc_hdr_size(hdr), hdr);
+	mutex_exit(&dev->l2ad_mtx);
+
+	exists = buf_hash_insert(hdr, &hash_lock);
+	if (exists) {
+		/* Buffer was already cached, no need to restore it. */
+		arc_hdr_destroy(hdr);
+		/*
+		 * If the buffer is already cached, check whether it has
+		 * L2ARC metadata. If not, fill it in and update the flag.
+		 * This is important in case of onlining a cache device, since
+		 * we previously evicted all L2ARC metadata from ARC.
+		 */
+		if (!HDR_HAS_L2HDR(exists)) {
+			arc_hdr_set_flags(exists, ARC_FLAG_HAS_L2HDR);
+			exists->b_l2hdr.b_dev = dev;
+			exists->b_l2hdr.b_daddr = le->le_daddr;
+			mutex_enter(&dev->l2ad_mtx);
+			list_insert_tail(&dev->l2ad_buflist, exists);
+			(void) zfs_refcount_add_many(&dev->l2ad_alloc,
+			    arc_hdr_size(exists), exists);
+			mutex_exit(&dev->l2ad_mtx);
+			vdev_space_update(dev->l2ad_vdev, asize, 0, 0);
+			ARCSTAT_INCR(arcstat_l2_lsize, HDR_GET_LSIZE(exists));
+			ARCSTAT_INCR(arcstat_l2_psize, HDR_GET_PSIZE(exists));
+		}
+		ARCSTAT_BUMP(arcstat_l2_rebuild_bufs_precached);
+	}
+
+	mutex_exit(hash_lock);
+}
+
+/*
+ * Starts an asynchronous read IO to read a log block. This is used in log
+ * block reconstruction to start reading the next block before we are done
+ * decoding and reconstructing the current block, to keep the l2arc device
+ * nice and hot with read IO to process.
+ * The returned zio will contain newly allocated memory buffers for the IO
+ * data which should then be freed by the caller once the zio is no longer
+ * needed (i.e. due to it having completed). If you wish to abort this
+ * zio, you should do so using l2arc_log_blk_fetch_abort, which takes
+ * care of disposing of the allocated buffers correctly.
+ */ +static zio_t * +l2arc_log_blk_fetch(vdev_t *vd, const l2arc_log_blkptr_t *lbp, + l2arc_log_blk_phys_t *lb) +{ + uint32_t asize; + zio_t *pio; + l2arc_read_callback_t *cb; + + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + ASSERT(asize <= sizeof (l2arc_log_blk_phys_t)); + + cb = kmem_zalloc(sizeof (l2arc_read_callback_t), KM_SLEEP); + cb->l2rcb_abd = abd_get_from_buf(lb, asize); + pio = zio_root(vd->vdev_spa, l2arc_blk_fetch_done, cb, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_DONT_RETRY); + (void) zio_nowait(zio_read_phys(pio, vd, lbp->lbp_daddr, asize, + cb->l2rcb_abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_DONT_CACHE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE)); + + return (pio); +} + +/* + * Aborts a zio returned from l2arc_log_blk_fetch and frees the data + * buffers allocated for it. + */ +static void +l2arc_log_blk_fetch_abort(zio_t *zio) +{ + (void) zio_wait(zio); +} + +/* + * Creates a zio to update the device header on an l2arc device. + */ +void +l2arc_dev_hdr_update(l2arc_dev_t *dev) +{ + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + const uint64_t l2dhdr_asize = dev->l2ad_dev_hdr_asize; + abd_t *abd; + int err; + + VERIFY(spa_config_held(dev->l2ad_spa, SCL_STATE_ALL, RW_READER)); + + l2dhdr->dh_magic = L2ARC_DEV_HDR_MAGIC; + l2dhdr->dh_version = L2ARC_PERSISTENT_VERSION; + l2dhdr->dh_spa_guid = spa_guid(dev->l2ad_vdev->vdev_spa); + l2dhdr->dh_vdev_guid = dev->l2ad_vdev->vdev_guid; + l2dhdr->dh_log_entries = dev->l2ad_log_entries; + l2dhdr->dh_evict = dev->l2ad_evict; + l2dhdr->dh_start = dev->l2ad_start; + l2dhdr->dh_end = dev->l2ad_end; + l2dhdr->dh_lb_asize = zfs_refcount_count(&dev->l2ad_lb_asize); + l2dhdr->dh_lb_count = zfs_refcount_count(&dev->l2ad_lb_count); + l2dhdr->dh_flags = 0; + l2dhdr->dh_trim_action_time = dev->l2ad_vdev->vdev_trim_action_time; + l2dhdr->dh_trim_state = dev->l2ad_vdev->vdev_trim_state; + if (dev->l2ad_first) + l2dhdr->dh_flags |= L2ARC_DEV_HDR_EVICT_FIRST; + + abd = abd_get_from_buf(l2dhdr, l2dhdr_asize); + + err = zio_wait(zio_write_phys(NULL, dev->l2ad_vdev, + VDEV_LABEL_START_SIZE, l2dhdr_asize, abd, ZIO_CHECKSUM_LABEL, NULL, + NULL, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE)); + + abd_put(abd); + + if (err != 0) { + zfs_dbgmsg("L2ARC IO error (%d) while writing device header, " + "vdev guid: %llu", err, dev->l2ad_vdev->vdev_guid); + } +} + +/* + * Commits a log block to the L2ARC device. This routine is invoked from + * l2arc_write_buffers when the log block fills up. + * This function allocates some memory to temporarily hold the serialized + * buffer to be written. This is then released in l2arc_write_done. 
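+ * Like a regular data buffer write, committing a log block advances the
+ * device write hand (l2ad_hand) by the aligned size of the block.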
+ */ +static void +l2arc_log_blk_commit(l2arc_dev_t *dev, zio_t *pio, l2arc_write_callback_t *cb) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_dev_hdr_phys_t *l2dhdr = dev->l2ad_dev_hdr; + uint64_t psize, asize; + zio_t *wzio; + l2arc_lb_abd_buf_t *abd_buf; + uint8_t *tmpbuf; + l2arc_lb_ptr_buf_t *lb_ptr_buf; + + VERIFY3S(dev->l2ad_log_ent_idx, ==, dev->l2ad_log_entries); + + tmpbuf = zio_buf_alloc(sizeof (*lb)); + abd_buf = zio_buf_alloc(sizeof (*abd_buf)); + abd_buf->abd = abd_get_from_buf(lb, sizeof (*lb)); + lb_ptr_buf = kmem_zalloc(sizeof (l2arc_lb_ptr_buf_t), KM_SLEEP); + lb_ptr_buf->lb_ptr = kmem_zalloc(sizeof (l2arc_log_blkptr_t), KM_SLEEP); + + /* link the buffer into the block chain */ + lb->lb_prev_lbp = l2dhdr->dh_start_lbps[1]; + lb->lb_magic = L2ARC_LOG_BLK_MAGIC; + + /* + * l2arc_log_blk_commit() may be called multiple times during a single + * l2arc_write_buffers() call. Save the allocated abd buffers in a list + * so we can free them in l2arc_write_done() later on. + */ + list_insert_tail(&cb->l2wcb_abd_list, abd_buf); + + /* try to compress the buffer */ + psize = zio_compress_data(ZIO_COMPRESS_LZ4, + abd_buf->abd, tmpbuf, sizeof (*lb), 0); + + /* a log block is never entirely zero */ + ASSERT(psize != 0); + asize = vdev_psize_to_asize(dev->l2ad_vdev, psize); + ASSERT(asize <= sizeof (*lb)); + + /* + * Update the start log block pointer in the device header to point + * to the log block we're about to write. + */ + l2dhdr->dh_start_lbps[1] = l2dhdr->dh_start_lbps[0]; + l2dhdr->dh_start_lbps[0].lbp_daddr = dev->l2ad_hand; + l2dhdr->dh_start_lbps[0].lbp_payload_asize = + dev->l2ad_log_blk_payload_asize; + l2dhdr->dh_start_lbps[0].lbp_payload_start = + dev->l2ad_log_blk_payload_start; + _NOTE(CONSTCOND) + L2BLK_SET_LSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, sizeof (*lb)); + L2BLK_SET_PSIZE( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, asize); + L2BLK_SET_CHECKSUM( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_CHECKSUM_FLETCHER_4); + if (asize < sizeof (*lb)) { + /* compression succeeded */ + bzero(tmpbuf + psize, asize - psize); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_LZ4); + } else { + /* compression failed */ + bcopy(lb, tmpbuf, sizeof (*lb)); + L2BLK_SET_COMPRESS( + (&l2dhdr->dh_start_lbps[0])->lbp_prop, + ZIO_COMPRESS_OFF); + } + + /* checksum what we're about to write */ + fletcher_4_native(tmpbuf, asize, NULL, + &l2dhdr->dh_start_lbps[0].lbp_cksum); + + abd_put(abd_buf->abd); + + /* perform the write itself */ + abd_buf->abd = abd_get_from_buf(tmpbuf, sizeof (*lb)); + abd_take_ownership_of_buf(abd_buf->abd, B_TRUE); + wzio = zio_write_phys(pio, dev->l2ad_vdev, dev->l2ad_hand, + asize, abd_buf->abd, ZIO_CHECKSUM_OFF, NULL, NULL, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_CANFAIL, B_FALSE); + DTRACE_PROBE2(l2arc__write, vdev_t *, dev->l2ad_vdev, zio_t *, wzio); + (void) zio_nowait(wzio); + + dev->l2ad_hand += asize; + /* + * Include the committed log block's pointer in the list of pointers + * to log blocks present in the L2ARC device. 
+ */ + bcopy(&l2dhdr->dh_start_lbps[0], lb_ptr_buf->lb_ptr, + sizeof (l2arc_log_blkptr_t)); + mutex_enter(&dev->l2ad_mtx); + list_insert_head(&dev->l2ad_lbptr_list, lb_ptr_buf); + ARCSTAT_INCR(arcstat_l2_log_blk_asize, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_count); + zfs_refcount_add_many(&dev->l2ad_lb_asize, asize, lb_ptr_buf); + zfs_refcount_add(&dev->l2ad_lb_count, lb_ptr_buf); + mutex_exit(&dev->l2ad_mtx); + vdev_space_update(dev->l2ad_vdev, asize, 0, 0); + + /* bump the kstats */ + ARCSTAT_INCR(arcstat_l2_write_bytes, asize); + ARCSTAT_BUMP(arcstat_l2_log_blk_writes); + ARCSTAT_F_AVG(arcstat_l2_log_blk_avg_asize, asize); + ARCSTAT_F_AVG(arcstat_l2_data_to_meta_ratio, + dev->l2ad_log_blk_payload_asize / asize); + + /* start a new log block */ + dev->l2ad_log_ent_idx = 0; + dev->l2ad_log_blk_payload_asize = 0; + dev->l2ad_log_blk_payload_start = 0; +} + +/* + * Validates an L2ARC log block address to make sure that it can be read + * from the provided L2ARC device. + */ +boolean_t +l2arc_log_blkptr_valid(l2arc_dev_t *dev, const l2arc_log_blkptr_t *lbp) +{ + /* L2BLK_GET_PSIZE returns aligned size for log blocks */ + uint64_t asize = L2BLK_GET_PSIZE((lbp)->lbp_prop); + uint64_t end = lbp->lbp_daddr + asize - 1; + uint64_t start = lbp->lbp_payload_start; + boolean_t evicted = B_FALSE; + + /* + * A log block is valid if all of the following conditions are true: + * - it fits entirely (including its payload) between l2ad_start and + * l2ad_end + * - it has a valid size + * - neither the log block itself nor part of its payload was evicted + * by l2arc_evict(): + * + * l2ad_hand l2ad_evict + * | | lbp_daddr + * | start | | end + * | | | | | + * V V V V V + * l2ad_start ============================================ l2ad_end + * --------------------------|||| + * ^ ^ + * | log block + * payload + */ + + evicted = + l2arc_range_check_overlap(start, end, dev->l2ad_hand) || + l2arc_range_check_overlap(start, end, dev->l2ad_evict) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, start) || + l2arc_range_check_overlap(dev->l2ad_hand, dev->l2ad_evict, end); + + return (start >= dev->l2ad_start && end <= dev->l2ad_end && + asize > 0 && asize <= sizeof (l2arc_log_blk_phys_t) && + (!evicted || dev->l2ad_first)); +} + +/* + * Inserts ARC buffer header `hdr' into the current L2ARC log block on + * the device. The buffer being inserted must be present in L2ARC. + * Returns B_TRUE if the L2ARC log block is full and needs to be committed + * to L2ARC, or B_FALSE if it still has room for more ARC buffers. 
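+ * For example, with the usual 1022 entries per log block, the 1022nd
+ * insert returns B_TRUE, at which point l2arc_write_buffers() calls
+ * l2arc_log_blk_commit() to write the block out and start a new one.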
+ */ +static boolean_t +l2arc_log_blk_insert(l2arc_dev_t *dev, const arc_buf_hdr_t *hdr) +{ + l2arc_log_blk_phys_t *lb = &dev->l2ad_log_blk; + l2arc_log_ent_phys_t *le; + + if (dev->l2ad_log_entries == 0) + return (B_FALSE); + + int index = dev->l2ad_log_ent_idx++; + + ASSERT3S(index, <, dev->l2ad_log_entries); + ASSERT(HDR_HAS_L2HDR(hdr)); + + le = &lb->lb_entries[index]; + bzero(le, sizeof (*le)); + le->le_dva = hdr->b_dva; + le->le_birth = hdr->b_birth; + le->le_daddr = hdr->b_l2hdr.b_daddr; + if (index == 0) + dev->l2ad_log_blk_payload_start = le->le_daddr; + L2BLK_SET_LSIZE((le)->le_prop, HDR_GET_LSIZE(hdr)); + L2BLK_SET_PSIZE((le)->le_prop, HDR_GET_PSIZE(hdr)); + L2BLK_SET_COMPRESS((le)->le_prop, HDR_GET_COMPRESS(hdr)); + le->le_complevel = hdr->b_complevel; + L2BLK_SET_TYPE((le)->le_prop, hdr->b_type); + L2BLK_SET_PROTECTED((le)->le_prop, !!(HDR_PROTECTED(hdr))); + L2BLK_SET_PREFETCH((le)->le_prop, !!(HDR_PREFETCH(hdr))); + + dev->l2ad_log_blk_payload_asize += vdev_psize_to_asize(dev->l2ad_vdev, + HDR_GET_PSIZE(hdr)); + + return (dev->l2ad_log_ent_idx == dev->l2ad_log_entries); +} + +/* + * Checks whether a given L2ARC device address sits in a time-sequential + * range. The trick here is that the L2ARC is a rotary buffer, so we can't + * just do a range comparison, we need to handle the situation in which the + * range wraps around the end of the L2ARC device. Arguments: + * bottom -- Lower end of the range to check (written to earlier). + * top -- Upper end of the range to check (written to later). + * check -- The address for which we want to determine if it sits in + * between the top and bottom. + * + * The 3-way conditional below represents the following cases: + * + * bottom < top : Sequentially ordered case: + * --------+-------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |---------------============--------------| + * + * bottom > top: Looped-around case: + * --------+------------------+ + * | (overlap here?) | + * L2ARC dev V V + * |===============---------------===========| + * ^ ^ + * | (or here?) | + * +---------------+--------- + * + * top == bottom : Just a single address comparison. 
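+ *
+ * For example, in the looped-around case with bottom == 100 and
+ * top == 50, check == 25 overlaps (it is at or below top) while
+ * check == 75 does not (it lies in the hole between top and bottom).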
+ */ +boolean_t +l2arc_range_check_overlap(uint64_t bottom, uint64_t top, uint64_t check) +{ + if (bottom < top) + return (bottom <= check && check <= top); + else if (bottom > top) + return (check <= top || bottom <= check); + else + return (check == top); +} + +EXPORT_SYMBOL(arc_buf_size); +EXPORT_SYMBOL(arc_write); +EXPORT_SYMBOL(arc_read); +EXPORT_SYMBOL(arc_buf_info); +EXPORT_SYMBOL(arc_getbuf_func); +EXPORT_SYMBOL(arc_add_prune_callback); +EXPORT_SYMBOL(arc_remove_prune_callback); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min, param_set_arc_long, + param_get_long, ZMOD_RW, "Min arc size"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, max, param_set_arc_long, + param_get_long, ZMOD_RW, "Max arc size"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit, param_set_arc_long, + param_get_long, ZMOD_RW, "Metadata limit for arc size"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_limit_percent, + param_set_arc_long, param_get_long, ZMOD_RW, + "Percent of arc size for arc meta limit"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, meta_min, param_set_arc_long, + param_get_long, ZMOD_RW, "Min arc metadata"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_prune, INT, ZMOD_RW, + "Meta objects to scan for prune"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_adjust_restarts, INT, ZMOD_RW, + "Limit number of restarts in arc_evict_meta"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, meta_strategy, INT, ZMOD_RW, + "Meta reclaim strategy"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, grow_retry, param_set_arc_int, + param_get_int, ZMOD_RW, "Seconds before growing arc size"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, p_dampener_disable, INT, ZMOD_RW, + "Disable arc_p adapt dampener"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, shrink_shift, param_set_arc_int, + param_get_int, ZMOD_RW, "log2(fraction of arc to reclaim)"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, pc_percent, UINT, ZMOD_RW, + "Percent of pagecache to reclaim arc to"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, p_min_shift, param_set_arc_int, + param_get_int, ZMOD_RW, "arc_c shift to calc min/max arc_p"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, average_blocksize, INT, ZMOD_RD, + "Target average block size"); + +ZFS_MODULE_PARAM(zfs, zfs_, compressed_arc_enabled, INT, ZMOD_RW, + "Disable compressed arc buffers"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prefetch_ms, param_set_arc_int, + param_get_int, ZMOD_RW, "Min life of prefetch block in ms"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, min_prescient_prefetch_ms, + param_set_arc_int, param_get_int, ZMOD_RW, + "Min life of prescient prefetched block in ms"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_max, ULONG, ZMOD_RW, + "Max write bytes per interval"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, write_boost, ULONG, ZMOD_RW, + "Extra write bytes during device warmup"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom, ULONG, ZMOD_RW, + "Number of max device writes to precache"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, headroom_boost, ULONG, ZMOD_RW, + "Compressed l2arc_headroom multiplier"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, trim_ahead, ULONG, ZMOD_RW, + "TRIM ahead L2ARC write size multiplier"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_secs, ULONG, ZMOD_RW, + "Seconds between L2ARC writing"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_min_ms, ULONG, ZMOD_RW, + "Min feed interval in milliseconds"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, noprefetch, INT, ZMOD_RW, + "Skip caching prefetched buffers"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, feed_again, INT, 
ZMOD_RW, + "Turbo L2ARC warmup"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, norw, INT, ZMOD_RW, + "No reads during writes"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_enabled, INT, ZMOD_RW, + "Rebuild the L2ARC when importing a pool"); + +ZFS_MODULE_PARAM(zfs_l2arc, l2arc_, rebuild_blocks_min_l2size, ULONG, ZMOD_RW, + "Min size in bytes to write rebuild log blocks in L2ARC"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, lotsfree_percent, param_set_arc_int, + param_get_int, ZMOD_RW, "System free memory I/O throttle in bytes"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, sys_free, param_set_arc_long, + param_get_long, ZMOD_RW, "System free memory target size in bytes"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit, param_set_arc_long, + param_get_long, ZMOD_RW, "Minimum bytes of dnodes in arc"); + +ZFS_MODULE_PARAM_CALL(zfs_arc, zfs_arc_, dnode_limit_percent, + param_set_arc_long, param_get_long, ZMOD_RW, + "Percent of ARC meta buffers for dnodes"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, dnode_reduce_percent, ULONG, ZMOD_RW, + "Percentage of excess dnodes to try to unpin"); + +ZFS_MODULE_PARAM(zfs_arc, zfs_arc_, eviction_pct, INT, ZMOD_RW, + "When full, ARC allocation waits for eviction of this % of alloc size"); +/* END CSTYLED */ diff --git a/module/zfs/blkptr.c b/module/zfs/blkptr.c new file mode 100644 index 000000000000..aa09ded8dba3 --- /dev/null +++ b/module/zfs/blkptr.c @@ -0,0 +1,153 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include + +/* + * Embedded-data Block Pointers + * + * Normally, block pointers point (via their DVAs) to a block which holds data. + * If the data that we need to store is very small, this is an inefficient + * use of space, because a block must be at minimum 1 sector (typically 512 + * bytes or 4KB). Additionally, reading these small blocks tends to generate + * more random reads. + * + * Embedded-data Block Pointers allow small pieces of data (the "payload", + * up to 112 bytes) to be stored in the block pointer itself, instead of + * being pointed to. The "Pointer" part of this name is a bit of a + * misnomer, as nothing is pointed to. + * + * BP_EMBEDDED_TYPE_DATA block pointers allow highly-compressible data to + * be embedded in the block pointer. The logic for this is handled in + * the SPA, by the zio pipeline. Therefore most code outside the zio + * pipeline doesn't need special-cases to handle these block pointers. + * + * See spa.h for details on the exact layout of embedded block pointers. 
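+ *
+ * As a hypothetical round trip through the helpers below (the 4-byte
+ * payload is made up; with ZIO_COMPRESS_OFF the uncompressed and
+ * compressed sizes must match):
+ *
+ * blkptr_t bp;
+ * uint8_t data[4] = { 1, 2, 3, 4 }, out[4];
+ * encode_embedded_bp_compressed(&bp, data, ZIO_COMPRESS_OFF, 4, 4);
+ * decode_embedded_bp_compressed(&bp, out); out[] now equals data[]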
+ */ + +void +encode_embedded_bp_compressed(blkptr_t *bp, void *data, + enum zio_compress comp, int uncompressed_size, int compressed_size) +{ + uint64_t *bp64 = (uint64_t *)bp; + uint64_t w = 0; + uint8_t *data8 = data; + + ASSERT3U(compressed_size, <=, BPE_PAYLOAD_SIZE); + ASSERT(uncompressed_size == compressed_size || + comp != ZIO_COMPRESS_OFF); + ASSERT3U(comp, >=, ZIO_COMPRESS_OFF); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + + bzero(bp, sizeof (*bp)); + BP_SET_EMBEDDED(bp, B_TRUE); + BP_SET_COMPRESS(bp, comp); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + BPE_SET_LSIZE(bp, uncompressed_size); + BPE_SET_PSIZE(bp, compressed_size); + + /* + * Encode the byte array into the words of the block pointer. + * First byte goes into low bits of first word (little endian). + */ + for (int i = 0; i < compressed_size; i++) { + BF64_SET(w, (i % sizeof (w)) * NBBY, NBBY, data8[i]); + if (i % sizeof (w) == sizeof (w) - 1) { + /* we've reached the end of a word */ + ASSERT3P(bp64, <, bp + 1); + *bp64 = w; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + w = 0; + } + } + /* write last partial word */ + if (bp64 < (uint64_t *)(bp + 1)) + *bp64 = w; +} + +/* + * buf must be at least BPE_GET_PSIZE(bp) bytes long (which will never be + * more than BPE_PAYLOAD_SIZE bytes). + */ +void +decode_embedded_bp_compressed(const blkptr_t *bp, void *buf) +{ + int psize; + uint8_t *buf8 = buf; + uint64_t w = 0; + const uint64_t *bp64 = (const uint64_t *)bp; + + ASSERT(BP_IS_EMBEDDED(bp)); + + psize = BPE_GET_PSIZE(bp); + + /* + * Decode the words of the block pointer into the byte array. + * Low bits of first word are the first byte (little endian). + */ + for (int i = 0; i < psize; i++) { + if (i % sizeof (w) == 0) { + /* beginning of a word */ + ASSERT3P(bp64, <, bp + 1); + w = *bp64; + bp64++; + if (!BPE_IS_PAYLOADWORD(bp, bp64)) + bp64++; + } + buf8[i] = BF64_GET(w, (i % sizeof (w)) * NBBY, NBBY); + } +} + +/* + * Fill in the buffer with the (decompressed) payload of the embedded + * blkptr_t. Takes into account compression and byteorder (the payload is + * treated as a stream of bytes). + * Return 0 on success, or ENOSPC if it won't fit in the buffer. + */ +int +decode_embedded_bp(const blkptr_t *bp, void *buf, int buflen) +{ + int lsize, psize; + + ASSERT(BP_IS_EMBEDDED(bp)); + + lsize = BPE_GET_LSIZE(bp); + psize = BPE_GET_PSIZE(bp); + + if (lsize > buflen) + return (SET_ERROR(ENOSPC)); + ASSERT3U(lsize, ==, buflen); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + uint8_t dstbuf[BPE_PAYLOAD_SIZE]; + decode_embedded_bp_compressed(bp, dstbuf); + VERIFY0(zio_decompress_data_buf(BP_GET_COMPRESS(bp), + dstbuf, buf, psize, buflen, NULL)); + } else { + ASSERT3U(lsize, ==, psize); + decode_embedded_bp_compressed(bp, buf); + } + + return (0); +} diff --git a/module/zfs/bplist.c b/module/zfs/bplist.c new file mode 100644 index 000000000000..47ea364ef26f --- /dev/null +++ b/module/zfs/bplist.c @@ -0,0 +1,91 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +#include +#include + + +void +bplist_create(bplist_t *bpl) +{ + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&bpl->bpl_list, sizeof (bplist_entry_t), + offsetof(bplist_entry_t, bpe_node)); +} + +void +bplist_destroy(bplist_t *bpl) +{ + list_destroy(&bpl->bpl_list); + mutex_destroy(&bpl->bpl_lock); +} + +void +bplist_append(bplist_t *bpl, const blkptr_t *bp) +{ + bplist_entry_t *bpe = kmem_alloc(sizeof (*bpe), KM_SLEEP); + + mutex_enter(&bpl->bpl_lock); + bpe->bpe_blk = *bp; + list_insert_tail(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); +} + +/* + * To aid debugging, we keep the most recently removed entry. This way if + * we are in the callback, we can easily locate the entry. + */ +static bplist_entry_t *bplist_iterate_last_removed; + +void +bplist_iterate(bplist_t *bpl, bplist_itor_t *func, void *arg, dmu_tx_t *tx) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); + mutex_exit(&bpl->bpl_lock); + func(arg, &bpe->bpe_blk, tx); + kmem_free(bpe, sizeof (*bpe)); + mutex_enter(&bpl->bpl_lock); + } + mutex_exit(&bpl->bpl_lock); +} + +void +bplist_clear(bplist_t *bpl) +{ + bplist_entry_t *bpe; + + mutex_enter(&bpl->bpl_lock); + while ((bpe = list_head(&bpl->bpl_list))) { + bplist_iterate_last_removed = bpe; + list_remove(&bpl->bpl_list, bpe); + kmem_free(bpe, sizeof (*bpe)); + } + mutex_exit(&bpl->bpl_lock); +} diff --git a/module/zfs/bpobj.c b/module/zfs/bpobj.c new file mode 100644 index 000000000000..e75ba5cccde6 --- /dev/null +++ b/module/zfs/bpobj.c @@ -0,0 +1,943 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2017 Datto Inc. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Return an empty bpobj, preferably the empty dummy one (dp_empty_bpobj). 
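+ *
+ * When SPA_FEATURE_EMPTY_BPOBJ is enabled, every empty bpobj shares the
+ * single dp_empty_bpobj object: callers just bump the feature refcount,
+ * and bpobj_decr_empty() drops it again rather than freeing a real object.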
+ */ +uint64_t +bpobj_alloc_empty(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + dsl_pool_t *dp = dmu_objset_pool(os); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_EMPTY_BPOBJ)) { + if (!spa_feature_is_active(spa, SPA_FEATURE_EMPTY_BPOBJ)) { + ASSERT0(dp->dp_empty_bpobj); + dp->dp_empty_bpobj = + bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY(zap_add(os, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj, tx) == 0); + } + spa_feature_incr(spa, SPA_FEATURE_EMPTY_BPOBJ, tx); + ASSERT(dp->dp_empty_bpobj != 0); + return (dp->dp_empty_bpobj); + } else { + return (bpobj_alloc(os, blocksize, tx)); + } +} + +void +bpobj_decr_empty(objset_t *os, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_objset_pool(os); + + spa_feature_decr(dmu_objset_spa(os), SPA_FEATURE_EMPTY_BPOBJ, tx); + if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_EMPTY_BPOBJ)) { + VERIFY3U(0, ==, zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, tx)); + VERIFY3U(0, ==, dmu_object_free(os, dp->dp_empty_bpobj, tx)); + dp->dp_empty_bpobj = 0; + } +} + +uint64_t +bpobj_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + int size; + + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_BPOBJ_ACCOUNT) + size = BPOBJ_SIZE_V0; + else if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + size = BPOBJ_SIZE_V1; + else if (!spa_feature_is_active(dmu_objset_spa(os), + SPA_FEATURE_LIVELIST)) + size = BPOBJ_SIZE_V2; + else + size = sizeof (bpobj_phys_t); + + return (dmu_object_alloc(os, DMU_OT_BPOBJ, blocksize, + DMU_OT_BPOBJ_HDR, size, tx)); +} + +void +bpobj_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + int64_t i; + bpobj_t bpo; + dmu_object_info_t doi; + int epb; + dmu_buf_t *dbuf = NULL; + + ASSERT(obj != dmu_objset_pool(os)->dp_empty_bpobj); + VERIFY3U(0, ==, bpobj_open(&bpo, os, obj)); + + mutex_enter(&bpo.bpo_lock); + + if (!bpo.bpo_havesubobj || bpo.bpo_phys->bpo_subobjs == 0) + goto out; + + VERIFY3U(0, ==, dmu_object_info(os, bpo.bpo_phys->bpo_subobjs, &doi)); + epb = doi.doi_data_block_size / sizeof (uint64_t); + + for (i = bpo.bpo_phys->bpo_num_subobjs - 1; i >= 0; i--) { + uint64_t *objarray; + uint64_t offset, blkoff; + + offset = i * sizeof (uint64_t); + blkoff = P2PHASE(i, epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + VERIFY3U(0, ==, dmu_buf_hold(os, + bpo.bpo_phys->bpo_subobjs, offset, FTAG, &dbuf, 0)); + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + objarray = dbuf->db_data; + bpobj_free(os, objarray[blkoff], tx); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + VERIFY3U(0, ==, dmu_object_free(os, bpo.bpo_phys->bpo_subobjs, tx)); + +out: + mutex_exit(&bpo.bpo_lock); + bpobj_close(&bpo); + + VERIFY3U(0, ==, dmu_object_free(os, obj, tx)); +} + +int +bpobj_open(bpobj_t *bpo, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + int err; + + err = dmu_object_info(os, object, &doi); + if (err) + return (err); + + bzero(bpo, sizeof (*bpo)); + mutex_init(&bpo->bpo_lock, NULL, MUTEX_DEFAULT, NULL); + + ASSERT(bpo->bpo_dbuf == NULL); + ASSERT(bpo->bpo_phys == NULL); + ASSERT(object != 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_BPOBJ_HDR); + + err = dmu_bonus_hold(os, object, bpo, &bpo->bpo_dbuf); + if (err) + return (err); + + bpo->bpo_os = os; + bpo->bpo_object = object; + bpo->bpo_epb = doi.doi_data_block_size >> 
SPA_BLKPTRSHIFT;
+ bpo->bpo_havecomp = (doi.doi_bonus_size > BPOBJ_SIZE_V0);
+ bpo->bpo_havesubobj = (doi.doi_bonus_size > BPOBJ_SIZE_V1);
+ bpo->bpo_havefreed = (doi.doi_bonus_size > BPOBJ_SIZE_V2);
+ bpo->bpo_phys = bpo->bpo_dbuf->db_data;
+ return (0);
+}
+
+boolean_t
+bpobj_is_open(const bpobj_t *bpo)
+{
+ return (bpo->bpo_object != 0);
+}
+
+void
+bpobj_close(bpobj_t *bpo)
+{
+ /* Lame workaround for closing a bpobj that was never opened. */
+ if (bpo->bpo_object == 0)
+ return;
+
+ dmu_buf_rele(bpo->bpo_dbuf, bpo);
+ if (bpo->bpo_cached_dbuf != NULL)
+ dmu_buf_rele(bpo->bpo_cached_dbuf, bpo);
+ bpo->bpo_dbuf = NULL;
+ bpo->bpo_phys = NULL;
+ bpo->bpo_cached_dbuf = NULL;
+ bpo->bpo_object = 0;
+
+ mutex_destroy(&bpo->bpo_lock);
+}
+
+static boolean_t
+bpobj_is_empty_impl(bpobj_t *bpo)
+{
+ ASSERT(MUTEX_HELD(&bpo->bpo_lock));
+ return (bpo->bpo_phys->bpo_num_blkptrs == 0 &&
+ (!bpo->bpo_havesubobj || bpo->bpo_phys->bpo_num_subobjs == 0));
+}
+
+boolean_t
+bpobj_is_empty(bpobj_t *bpo)
+{
+ mutex_enter(&bpo->bpo_lock);
+ boolean_t is_empty = bpobj_is_empty_impl(bpo);
+ mutex_exit(&bpo->bpo_lock);
+ return (is_empty);
+}
+
+/*
+ * A recursive iteration of the bpobjs would be nice here but we run the risk
+ * of overflowing function stack space. Instead, find each subobj and add it
+ * to the head of our list so it can be scanned for subobjs. Like a
+ * recursive implementation, the "deepest" subobjs will be freed first.
+ * When a subobj is found to have no additional subobjs, free it.
+ */
+typedef struct bpobj_info {
+ bpobj_t *bpi_bpo;
+ /*
+ * This object is a subobj of bpi_parent,
+ * at bpi_index in its subobj array.
+ */
+ struct bpobj_info *bpi_parent;
+ uint64_t bpi_index;
+ /* How many of our subobjs are left to process. */
+ uint64_t bpi_unprocessed_subobjs;
+ /* True after having visited this bpo's directly referenced BPs. */
+ boolean_t bpi_visited;
+ list_node_t bpi_node;
+} bpobj_info_t;
+
+static bpobj_info_t *
+bpi_alloc(bpobj_t *bpo, bpobj_info_t *parent, uint64_t index)
+{
+ bpobj_info_t *bpi = kmem_zalloc(sizeof (bpobj_info_t), KM_SLEEP);
+ bpi->bpi_bpo = bpo;
+ bpi->bpi_parent = parent;
+ bpi->bpi_index = index;
+ if (bpo->bpo_havesubobj && bpo->bpo_phys->bpo_subobjs != 0) {
+ bpi->bpi_unprocessed_subobjs = bpo->bpo_phys->bpo_num_subobjs;
+ }
+ return (bpi);
+}
+
+/*
+ * Update bpobj and all of its parents with new space accounting.
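+ * For example, if 8K worth of blkptrs is freed out of a deeply nested
+ * subobj, each ancestor's bpo_bytes (and bpo_comp/bpo_uncomp, where kept)
+ * shrinks by the same amount as the loop below walks the bpi_parent chain.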
+ */ +static void +propagate_space_reduction(bpobj_info_t *bpi, int64_t freed, + int64_t comp_freed, int64_t uncomp_freed, dmu_tx_t *tx) +{ + + for (; bpi != NULL; bpi = bpi->bpi_parent) { + bpobj_t *p = bpi->bpi_bpo; + ASSERT(dmu_buf_is_dirty(p->bpo_dbuf, tx)); + p->bpo_phys->bpo_bytes -= freed; + ASSERT3S(p->bpo_phys->bpo_bytes, >=, 0); + if (p->bpo_havecomp) { + p->bpo_phys->bpo_comp -= comp_freed; + p->bpo_phys->bpo_uncomp -= uncomp_freed; + } + } +} + +static int +bpobj_iterate_blkptrs(bpobj_info_t *bpi, bpobj_itor_t func, void *arg, + int64_t start, dmu_tx_t *tx, boolean_t free) +{ + int err = 0; + int64_t freed = 0, comp_freed = 0, uncomp_freed = 0; + dmu_buf_t *dbuf = NULL; + bpobj_t *bpo = bpi->bpi_bpo; + + for (int64_t i = bpo->bpo_phys->bpo_num_blkptrs - 1; i >= start; i--) { + uint64_t offset = i * sizeof (blkptr_t); + uint64_t blkoff = P2PHASE(i, bpo->bpo_epb); + + if (dbuf == NULL || dbuf->db_offset > offset) { + if (dbuf) + dmu_buf_rele(dbuf, FTAG); + err = dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, FTAG, &dbuf, 0); + if (err) + break; + } + + ASSERT3U(offset, >=, dbuf->db_offset); + ASSERT3U(offset, <, dbuf->db_offset + dbuf->db_size); + + blkptr_t *bparray = dbuf->db_data; + blkptr_t *bp = &bparray[blkoff]; + + boolean_t bp_freed = BP_GET_FREE(bp); + err = func(arg, bp, bp_freed, tx); + if (err) + break; + + if (free) { + int sign = bp_freed ? -1 : +1; + spa_t *spa = dmu_objset_spa(bpo->bpo_os); + freed += sign * bp_get_dsize_sync(spa, bp); + comp_freed += sign * BP_GET_PSIZE(bp); + uncomp_freed += sign * BP_GET_UCSIZE(bp); + ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, tx)); + bpo->bpo_phys->bpo_num_blkptrs--; + ASSERT3S(bpo->bpo_phys->bpo_num_blkptrs, >=, 0); + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed--; + ASSERT3S(bpo->bpo_phys->bpo_num_freed, >=, 0); + } + } + } + if (free) { + propagate_space_reduction(bpi, freed, comp_freed, + uncomp_freed, tx); + VERIFY0(dmu_free_range(bpo->bpo_os, + bpo->bpo_object, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), + DMU_OBJECT_END, tx)); + } + if (dbuf) { + dmu_buf_rele(dbuf, FTAG); + dbuf = NULL; + } + return (err); +} + +/* + * Given an initial bpo, start by freeing the BPs that are directly referenced + * by that bpo. If the bpo has subobjs, read in its last subobj and push the + * subobj to our stack. By popping items off our stack, eventually we will + * encounter a bpo that has no subobjs. We can free its bpobj_info_t, and if + * requested also free the now-empty bpo from disk and decrement + * its parent's subobj count. We continue popping each subobj from our stack, + * visiting its last subobj until they too have no more subobjs, and so on. + */ +static int +bpobj_iterate_impl(bpobj_t *initial_bpo, bpobj_itor_t func, void *arg, + dmu_tx_t *tx, boolean_t free, uint64_t *bpobj_size) +{ + list_t stack; + bpobj_info_t *bpi; + int err = 0; + + /* + * Create a "stack" for us to work with without worrying about + * stack overflows. Initialize it with the initial_bpo. 
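+ * The "stack" is simply a list_t of bpobj_info_t entries:
+ * list_insert_head() pushes and list_head()/list_remove_head() pop,
+ * giving the LIFO order described above.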
+ */ + list_create(&stack, sizeof (bpobj_info_t), + offsetof(bpobj_info_t, bpi_node)); + mutex_enter(&initial_bpo->bpo_lock); + + if (bpobj_size != NULL) + *bpobj_size = initial_bpo->bpo_phys->bpo_num_blkptrs; + + list_insert_head(&stack, bpi_alloc(initial_bpo, NULL, 0)); + + while ((bpi = list_head(&stack)) != NULL) { + bpobj_t *bpo = bpi->bpi_bpo; + + ASSERT3P(bpo, !=, NULL); + ASSERT(MUTEX_HELD(&bpo->bpo_lock)); + ASSERT(bpobj_is_open(bpo)); + + if (free) + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + if (bpi->bpi_visited == B_FALSE) { + err = bpobj_iterate_blkptrs(bpi, func, arg, 0, tx, + free); + bpi->bpi_visited = B_TRUE; + if (err != 0) + break; + } + /* + * We've finished with this bpo's directly-referenced BP's and + * it has no more unprocessed subobjs. We can free its + * bpobj_info_t (unless it is the topmost, initial_bpo). + * If we are freeing from disk, we can also do that. + */ + if (bpi->bpi_unprocessed_subobjs == 0) { + /* + * If there are no entries, there should + * be no bytes. + */ + if (bpobj_is_empty_impl(bpo)) { + ASSERT0(bpo->bpo_phys->bpo_bytes); + ASSERT0(bpo->bpo_phys->bpo_comp); + ASSERT0(bpo->bpo_phys->bpo_uncomp); + } + + /* The initial_bpo has no parent and is not closed. */ + if (bpi->bpi_parent != NULL) { + if (free) { + bpobj_t *p = bpi->bpi_parent->bpi_bpo; + + ASSERT0(bpo->bpo_phys->bpo_num_blkptrs); + ASSERT3U(p->bpo_phys->bpo_num_subobjs, + >, 0); + ASSERT3U(bpi->bpi_index, ==, + p->bpo_phys->bpo_num_subobjs - 1); + ASSERT(dmu_buf_is_dirty(bpo->bpo_dbuf, + tx)); + + p->bpo_phys->bpo_num_subobjs--; + + VERIFY0(dmu_free_range(p->bpo_os, + p->bpo_phys->bpo_subobjs, + bpi->bpi_index * sizeof (uint64_t), + sizeof (uint64_t), tx)); + + /* eliminate the empty subobj list */ + if (bpo->bpo_havesubobj && + bpo->bpo_phys->bpo_subobjs != 0) { + ASSERT0(bpo->bpo_phys-> + bpo_num_subobjs); + err = dmu_object_free( + bpo->bpo_os, + bpo->bpo_phys->bpo_subobjs, + tx); + if (err) + break; + bpo->bpo_phys->bpo_subobjs = 0; + } + err = dmu_object_free(p->bpo_os, + bpo->bpo_object, tx); + if (err) + break; + } + + mutex_exit(&bpo->bpo_lock); + bpobj_close(bpo); + kmem_free(bpo, sizeof (bpobj_t)); + } else { + mutex_exit(&bpo->bpo_lock); + } + + /* + * Finished processing this bpo. Unlock, and free + * our "stack" info. + */ + list_remove_head(&stack); + kmem_free(bpi, sizeof (bpobj_info_t)); + } else { + /* + * We have unprocessed subobjs. Process the next one. + */ + ASSERT(bpo->bpo_havecomp); + ASSERT3P(bpobj_size, ==, NULL); + + /* Add the last subobj to stack. */ + int64_t i = bpi->bpi_unprocessed_subobjs - 1; + uint64_t offset = i * sizeof (uint64_t); + + uint64_t obj_from_sublist; + err = dmu_read(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + offset, sizeof (uint64_t), &obj_from_sublist, + DMU_READ_PREFETCH); + if (err) + break; + bpobj_t *sublist = kmem_alloc(sizeof (bpobj_t), + KM_SLEEP); + + err = bpobj_open(sublist, bpo->bpo_os, + obj_from_sublist); + if (err) + break; + + list_insert_head(&stack, bpi_alloc(sublist, bpi, i)); + mutex_enter(&sublist->bpo_lock); + bpi->bpi_unprocessed_subobjs--; + } + } + /* + * Cleanup anything left on the "stack" after we left the loop. + * Every bpo on the stack is locked so we must remember to undo + * that now (in LIFO order). 
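+ * (This loop only has entries to process on error paths; a clean run
+ * drains the stack in the main loop, as the ASSERT on err below notes.)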
+ */ + while ((bpi = list_remove_head(&stack)) != NULL) { + bpobj_t *bpo = bpi->bpi_bpo; + ASSERT(err != 0); + ASSERT3P(bpo, !=, NULL); + + mutex_exit(&bpo->bpo_lock); + + /* do not free the initial_bpo */ + if (bpi->bpi_parent != NULL) { + bpobj_close(bpi->bpi_bpo); + kmem_free(bpi->bpi_bpo, sizeof (bpobj_t)); + } + kmem_free(bpi, sizeof (bpobj_info_t)); + } + + list_destroy(&stack); + + return (err); +} + +/* + * Iterate and remove the entries. If func returns nonzero, iteration + * will stop and that entry will not be removed. + */ +int +bpobj_iterate(bpobj_t *bpo, bpobj_itor_t func, void *arg, dmu_tx_t *tx) +{ + return (bpobj_iterate_impl(bpo, func, arg, tx, B_TRUE, NULL)); +} + +/* + * Iterate the entries. If func returns nonzero, iteration will stop. + * + * If there are no subobjs: + * + * *bpobj_size can be used to return the number of block pointers in the + * bpobj. Note that this may be different from the number of block pointers + * that are iterated over, if iteration is terminated early (e.g. by the func + * returning nonzero). + * + * If there are concurrent (or subsequent) modifications to the bpobj then the + * returned *bpobj_size can be passed as "start" to + * livelist_bpobj_iterate_from_nofree() to iterate the newly added entries. + */ +int +bpobj_iterate_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + uint64_t *bpobj_size) +{ + return (bpobj_iterate_impl(bpo, func, arg, NULL, B_FALSE, bpobj_size)); +} + +/* + * Iterate over the blkptrs in the bpobj beginning at index start. If func + * returns nonzero, iteration will stop. This is a livelist specific function + * since it assumes that there are no subobjs present. + */ +int +livelist_bpobj_iterate_from_nofree(bpobj_t *bpo, bpobj_itor_t func, void *arg, + int64_t start) +{ + if (bpo->bpo_havesubobj) + VERIFY0(bpo->bpo_phys->bpo_subobjs); + bpobj_info_t *bpi = bpi_alloc(bpo, NULL, 0); + int err = bpobj_iterate_blkptrs(bpi, func, arg, start, NULL, B_FALSE); + kmem_free(bpi, sizeof (bpobj_info_t)); + return (err); +} + +/* + * Logically add subobj's contents to the parent bpobj. + * + * In the most general case, this is accomplished in constant time by adding + * a reference to subobj. This case is used when enqueuing a large subobj: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * + * +--------------+ +--------------+ + * | sub-bpobj |----------------------> | subsubobj | + * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ + * | bp | bp | bp | bp | ... | bp | | obj | obj | ... | obj | + * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ + * + * Result: sub-bpobj added to parent's subobj list. + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+-----+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | + * +----+----+----+----+----+ +-----+-----+-----+--|--+ + * | + * /-----------------------------------------------------/ + * v + * +--------------+ +--------------+ + * | sub-bpobj |----------------------> | subsubobj | + * +----+----+----+----+---------+----+ +-----+-----+--+--------+-----+ + * | bp | bp | bp | bp | ... | bp | | obj | obj | ... 
| obj | + * +----+----+----+----+---------+----+ +-----+-----+-----------+-----+ + * + * + * In a common case, the subobj is small: its bp's and its list of subobj's + * are each stored in a single block. In this case we copy the subobj's + * contents to the parent: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * ^ ^ + * +--------------+ | +--------------+ | + * | sub-bpobj |---------^------------> | subsubobj | ^ + * +----+----+----+ | +-----+-----+--+ | + * | BP | BP |-->-->-->-->-/ | OBJ | OBJ |-->-/ + * +----+----+ +-----+-----+ + * + * Result: subobj destroyed, contents copied to parent: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+----+----+ +-----+-----+--+--+-----+-----+ + * | bp | bp | bp | bp | bp | BP | BP | | obj | obj | obj | OBJ | OBJ | + * +----+----+----+----+----+----+----+ +-----+-----+-----+-----+-----+ + * + * + * If the subobj has many BP's but few subobj's, we can copy the sub-subobj's + * but retain the sub-bpobj: + * +--------------+ +--------------+ + * | bpobj |----------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | + * +----+----+----+----+----+ +-----+-----+-----+ + * ^ + * +--------------+ +--------------+ | + * | sub-bpobj |----------------------> | subsubobj | ^ + * +----+----+----+----+---------+----+ +-----+-----+--+ | + * | bp | bp | bp | bp | ... | bp | | OBJ | OBJ |-->-/ + * +----+----+----+----+---------+----+ +-----+-----+ + * + * Result: sub-sub-bpobjs and subobj added to parent's subobj list. + * +--------------+ +--------------+ + * | bpobj |-------------------->| subobj list | + * +----+----+----+----+----+ +-----+-----+--+--+-----+-----+------+ + * | bp | bp | bp | bp | bp | | obj | obj | obj | OBJ | OBJ | OBJ* | + * +----+----+----+----+----+ +-----+-----+-----+-----+-----+--|---+ + * | + * /--------------------------------------------------------------/ + * v + * +--------------+ + * | sub-bpobj | + * +----+----+----+----+---------+----+ + * | bp | bp | bp | bp | ... | bp | + * +----+----+----+----+---------+----+ + */ +void +bpobj_enqueue_subobj(bpobj_t *bpo, uint64_t subobj, dmu_tx_t *tx) +{ + bpobj_t subbpo; + uint64_t used, comp, uncomp, subsubobjs; + boolean_t copy_subsub = B_TRUE; + boolean_t copy_bps = B_TRUE; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(subobj != 0); + ASSERT(bpo->bpo_havesubobj); + ASSERT(bpo->bpo_havecomp); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (subobj == dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj) { + bpobj_decr_empty(bpo->bpo_os, tx); + return; + } + + VERIFY3U(0, ==, bpobj_open(&subbpo, bpo->bpo_os, subobj)); + VERIFY3U(0, ==, bpobj_space(&subbpo, &used, &comp, &uncomp)); + + if (bpobj_is_empty(&subbpo)) { + /* No point in having an empty subobj. */ + bpobj_close(&subbpo); + bpobj_free(bpo->bpo_os, subobj, tx); + return; + } + + mutex_enter(&bpo->bpo_lock); + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + + dmu_object_info_t doi; + + if (bpo->bpo_phys->bpo_subobjs != 0) { + ASSERT0(dmu_object_info(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_BPOBJ_SUBOBJ); + } + + /* + * If subobj has only one block of subobjs, then move subobj's + * subobjs to bpo's subobj list directly. 
This reduces recursion in + * bpobj_iterate due to nested subobjs. + */ + subsubobjs = subbpo.bpo_phys->bpo_subobjs; + if (subsubobjs != 0) { + VERIFY0(dmu_object_info(bpo->bpo_os, subsubobjs, &doi)); + if (doi.doi_max_offset > doi.doi_data_block_size) { + copy_subsub = B_FALSE; + } + } + + /* + * If, in addition to having only one block of subobj's, subobj has + * only one block of bp's, then move subobj's bp's to bpo's bp list + * directly. This reduces recursion in bpobj_iterate due to nested + * subobjs. + */ + VERIFY3U(0, ==, dmu_object_info(bpo->bpo_os, subobj, &doi)); + if (doi.doi_max_offset > doi.doi_data_block_size || !copy_subsub) { + copy_bps = B_FALSE; + } + + if (copy_subsub && subsubobjs != 0) { + dmu_buf_t *subdb; + uint64_t numsubsub = subbpo.bpo_phys->bpo_num_subobjs; + + VERIFY0(dmu_buf_hold(bpo->bpo_os, subsubobjs, + 0, FTAG, &subdb, 0)); + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. + */ + VERIFY3U(subdb->db_size, >=, + numsubsub * sizeof (subobj)); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = + dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + numsubsub * sizeof (subobj), subdb->db_data, tx); + dmu_buf_rele(subdb, FTAG); + bpo->bpo_phys->bpo_num_subobjs += numsubsub; + + dmu_buf_will_dirty(subbpo.bpo_dbuf, tx); + subbpo.bpo_phys->bpo_subobjs = 0; + VERIFY0(dmu_object_free(bpo->bpo_os, subsubobjs, tx)); + } + + if (copy_bps) { + dmu_buf_t *bps; + uint64_t numbps = subbpo.bpo_phys->bpo_num_blkptrs; + + ASSERT(copy_subsub); + VERIFY0(dmu_buf_hold(bpo->bpo_os, subobj, + 0, FTAG, &bps, 0)); + + /* + * Make sure that we are not asking dmu_write() + * to write more data than we have in our buffer. + */ + VERIFY3U(bps->db_size, >=, numbps * sizeof (blkptr_t)); + dmu_write(bpo->bpo_os, bpo->bpo_object, + bpo->bpo_phys->bpo_num_blkptrs * sizeof (blkptr_t), + numbps * sizeof (blkptr_t), + bps->db_data, tx); + dmu_buf_rele(bps, FTAG); + bpo->bpo_phys->bpo_num_blkptrs += numbps; + + bpobj_close(&subbpo); + VERIFY0(dmu_object_free(bpo->bpo_os, subobj, tx)); + } else { + bpobj_close(&subbpo); + if (bpo->bpo_phys->bpo_subobjs == 0) { + bpo->bpo_phys->bpo_subobjs = + dmu_object_alloc(bpo->bpo_os, + DMU_OT_BPOBJ_SUBOBJ, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + } + + dmu_write(bpo->bpo_os, bpo->bpo_phys->bpo_subobjs, + bpo->bpo_phys->bpo_num_subobjs * sizeof (subobj), + sizeof (subobj), &subobj, tx); + bpo->bpo_phys->bpo_num_subobjs++; + } + + bpo->bpo_phys->bpo_bytes += used; + bpo->bpo_phys->bpo_comp += comp; + bpo->bpo_phys->bpo_uncomp += uncomp; + mutex_exit(&bpo->bpo_lock); + +} + +void +bpobj_enqueue(bpobj_t *bpo, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + blkptr_t stored_bp = *bp; + uint64_t offset; + int blkoff; + blkptr_t *bparray; + + ASSERT(bpobj_is_open(bpo)); + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(bpo->bpo_object != dmu_objset_pool(bpo->bpo_os)->dp_empty_bpobj); + + if (BP_IS_EMBEDDED(bp)) { + /* + * The bpobj will compress better without the payload. + * + * Note that we store EMBEDDED bp's because they have an + * uncompressed size, which must be accounted for. 
An + * alternative would be to add their size to bpo_uncomp + * without storing the bp, but that would create additional + * complications: bpo_uncomp would be inconsistent with the + * set of BP's stored, and bpobj_iterate() wouldn't visit + * all the space accounted for in the bpobj. + */ + bzero(&stored_bp, sizeof (stored_bp)); + stored_bp.blk_prop = bp->blk_prop; + stored_bp.blk_birth = bp->blk_birth; + } else if (!BP_GET_DEDUP(bp)) { + /* The bpobj will compress better without the checksum */ + bzero(&stored_bp.blk_cksum, sizeof (stored_bp.blk_cksum)); + } + + stored_bp.blk_fill = 0; + BP_SET_FREE(&stored_bp, bp_freed); + + mutex_enter(&bpo->bpo_lock); + + offset = bpo->bpo_phys->bpo_num_blkptrs * sizeof (stored_bp); + blkoff = P2PHASE(bpo->bpo_phys->bpo_num_blkptrs, bpo->bpo_epb); + + if (bpo->bpo_cached_dbuf == NULL || + offset < bpo->bpo_cached_dbuf->db_offset || + offset >= bpo->bpo_cached_dbuf->db_offset + + bpo->bpo_cached_dbuf->db_size) { + if (bpo->bpo_cached_dbuf) + dmu_buf_rele(bpo->bpo_cached_dbuf, bpo); + VERIFY3U(0, ==, dmu_buf_hold(bpo->bpo_os, bpo->bpo_object, + offset, bpo, &bpo->bpo_cached_dbuf, 0)); + } + + dmu_buf_will_dirty(bpo->bpo_cached_dbuf, tx); + bparray = bpo->bpo_cached_dbuf->db_data; + bparray[blkoff] = stored_bp; + + dmu_buf_will_dirty(bpo->bpo_dbuf, tx); + bpo->bpo_phys->bpo_num_blkptrs++; + int sign = bp_freed ? -1 : +1; + bpo->bpo_phys->bpo_bytes += sign * + bp_get_dsize_sync(dmu_objset_spa(bpo->bpo_os), bp); + if (bpo->bpo_havecomp) { + bpo->bpo_phys->bpo_comp += sign * BP_GET_PSIZE(bp); + bpo->bpo_phys->bpo_uncomp += sign * BP_GET_UCSIZE(bp); + } + if (bp_freed) { + ASSERT(bpo->bpo_havefreed); + bpo->bpo_phys->bpo_num_freed++; + } + mutex_exit(&bpo->bpo_lock); +} + +struct space_range_arg { + spa_t *spa; + uint64_t mintxg; + uint64_t maxtxg; + uint64_t used; + uint64_t comp; + uint64_t uncomp; +}; + +/* ARGSUSED */ +static int +space_range_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) +{ + struct space_range_arg *sra = arg; + + if (bp->blk_birth > sra->mintxg && bp->blk_birth <= sra->maxtxg) { + if (dsl_pool_sync_context(spa_get_dsl(sra->spa))) + sra->used += bp_get_dsize_sync(sra->spa, bp); + else + sra->used += bp_get_dsize(sra->spa, bp); + sra->comp += BP_GET_PSIZE(bp); + sra->uncomp += BP_GET_UCSIZE(bp); + } + return (0); +} + +int +bpobj_space(bpobj_t *bpo, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + ASSERT(bpobj_is_open(bpo)); + mutex_enter(&bpo->bpo_lock); + + *usedp = bpo->bpo_phys->bpo_bytes; + if (bpo->bpo_havecomp) { + *compp = bpo->bpo_phys->bpo_comp; + *uncompp = bpo->bpo_phys->bpo_uncomp; + mutex_exit(&bpo->bpo_lock); + return (0); + } else { + mutex_exit(&bpo->bpo_lock); + return (bpobj_space_range(bpo, 0, UINT64_MAX, + usedp, compp, uncompp)); + } +} + +/* + * Return the amount of space in the bpobj which is: + * mintxg < blk_birth <= maxtxg + */ +int +bpobj_space_range(bpobj_t *bpo, uint64_t mintxg, uint64_t maxtxg, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + struct space_range_arg sra = { 0 }; + int err; + + ASSERT(bpobj_is_open(bpo)); + + /* + * As an optimization, if they want the whole txg range, just + * get bpo_bytes rather than iterating over the bps. 
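+ * E.g. a caller passing mintxg=0, maxtxg=UINT64_MAX to a bpobj that keeps
+ * comp/uncomp accounting gets the constant-time bpobj_space() path below.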
+ */ + if (mintxg < TXG_INITIAL && maxtxg == UINT64_MAX && bpo->bpo_havecomp) + return (bpobj_space(bpo, usedp, compp, uncompp)); + + sra.spa = dmu_objset_spa(bpo->bpo_os); + sra.mintxg = mintxg; + sra.maxtxg = maxtxg; + + err = bpobj_iterate_nofree(bpo, space_range_cb, &sra, NULL); + *usedp = sra.used; + *compp = sra.comp; + *uncompp = sra.uncomp; + return (err); +} + +/* + * A bpobj_itor_t to append blkptrs to a bplist. Note that while blkptrs in a + * bpobj are designated as free or allocated that information is not preserved + * in bplists. + */ +/* ARGSUSED */ +int +bplist_append_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + bplist_t *bpl = arg; + bplist_append(bpl, bp); + return (0); +} diff --git a/module/zfs/bptree.c b/module/zfs/bptree.c new file mode 100644 index 000000000000..1827a3c4e326 --- /dev/null +++ b/module/zfs/bptree.c @@ -0,0 +1,303 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * A bptree is a queue of root block pointers from destroyed datasets. When a + * dataset is destroyed its root block pointer is put on the end of the pool's + * bptree queue so the dataset's blocks can be freed asynchronously by + * dsl_scan_sync. This allows the delete operation to finish without traversing + * all the dataset's blocks. + * + * Note that while bt_begin and bt_end are only ever incremented in this code, + * they are effectively reset to 0 every time the entire bptree is freed because + * the bptree's object is destroyed and re-created. + */ + +struct bptree_args { + bptree_phys_t *ba_phys; /* data in bonus buffer, dirtied if freeing */ + boolean_t ba_free; /* true if freeing during traversal */ + + bptree_itor_t *ba_func; /* function to call for each blockpointer */ + void *ba_arg; /* caller supplied argument to ba_func */ + dmu_tx_t *ba_tx; /* caller supplied tx, NULL if not freeing */ +} bptree_args_t; + +uint64_t +bptree_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t obj; + dmu_buf_t *db; + bptree_phys_t *bt; + + obj = dmu_object_alloc(os, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OTN_UINT64_METADATA, + sizeof (bptree_phys_t), tx); + + /* + * Bonus buffer contents are already initialized to 0, but for + * readability we make it explicit. 
+ */ + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + bt = db->db_data; + bt->bt_begin = 0; + bt->bt_end = 0; + bt->bt_bytes = 0; + bt->bt_comp = 0; + bt->bt_uncomp = 0; + dmu_buf_rele(db, FTAG); + + return (obj); +} + +int +bptree_free(objset_t *os, uint64_t obj, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + ASSERT3U(bt->bt_begin, ==, bt->bt_end); + ASSERT0(bt->bt_bytes); + ASSERT0(bt->bt_comp); + ASSERT0(bt->bt_uncomp); + dmu_buf_rele(db, FTAG); + + return (dmu_object_free(os, obj, tx)); +} + +boolean_t +bptree_is_empty(objset_t *os, uint64_t obj) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + boolean_t rv; + + VERIFY0(dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + rv = (bt->bt_begin == bt->bt_end); + dmu_buf_rele(db, FTAG); + return (rv); +} + +void +bptree_add(objset_t *os, uint64_t obj, blkptr_t *bp, uint64_t birth_txg, + uint64_t bytes, uint64_t comp, uint64_t uncomp, dmu_tx_t *tx) +{ + dmu_buf_t *db; + bptree_phys_t *bt; + bptree_entry_phys_t *bte; + + /* + * bptree objects are in the pool mos, therefore they can only be + * modified in syncing context. Furthermore, this is only modified + * by the sync thread, so no locking is necessary. + */ + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, obj, FTAG, &db)); + bt = db->db_data; + + bte = kmem_zalloc(sizeof (*bte), KM_SLEEP); + bte->be_birth_txg = birth_txg; + bte->be_bp = *bp; + dmu_write(os, obj, bt->bt_end * sizeof (*bte), sizeof (*bte), bte, tx); + kmem_free(bte, sizeof (*bte)); + + dmu_buf_will_dirty(db, tx); + bt->bt_end++; + bt->bt_bytes += bytes; + bt->bt_comp += comp; + bt->bt_uncomp += uncomp; + dmu_buf_rele(db, FTAG); +} + +/* ARGSUSED */ +static int +bptree_visit_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + int err; + struct bptree_args *ba = arg; + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_REDACTED(bp)) + return (0); + + err = ba->ba_func(ba->ba_arg, bp, ba->ba_tx); + if (err == 0 && ba->ba_free) { + ba->ba_phys->bt_bytes -= bp_get_dsize_sync(spa, bp); + ba->ba_phys->bt_comp -= BP_GET_PSIZE(bp); + ba->ba_phys->bt_uncomp -= BP_GET_UCSIZE(bp); + } + return (err); +} + +/* + * If "free" is set: + * - It is assumed that "func" will be freeing the block pointers. + * - If "func" returns nonzero, the bookmark will be remembered and + * iteration will be restarted from this point on next invocation. + * - If an i/o error is encountered (e.g. "func" returns EIO or ECKSUM), + * bptree_iterate will remember the bookmark, continue traversing + * any additional entries, and return 0. + * + * If "free" is not set, traversal will stop and return an error if + * an i/o error is encountered. + * + * In either case, if zfs_free_leak_on_eio is set, i/o errors will be + * ignored and traversal will continue (i.e. TRAVERSE_HARD will be passed to + * traverse_dataset_destroyed()). 
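+ *
+ * In short, when "free" is set the traversal always makes forward
+ * progress: an entry that hits an i/o error is either resumed later via
+ * the saved bookmark or neutralized by setting be_birth_txg to
+ * UINT64_MAX, and the error is swallowed so later entries are still
+ * visited.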
+ */ +int +bptree_iterate(objset_t *os, uint64_t obj, boolean_t free, bptree_itor_t func, + void *arg, dmu_tx_t *tx) +{ + boolean_t ioerr = B_FALSE; + int err; + uint64_t i; + dmu_buf_t *db; + struct bptree_args ba; + + ASSERT(!free || dmu_tx_is_syncing(tx)); + + err = dmu_bonus_hold(os, obj, FTAG, &db); + if (err != 0) + return (err); + + if (free) + dmu_buf_will_dirty(db, tx); + + ba.ba_phys = db->db_data; + ba.ba_free = free; + ba.ba_func = func; + ba.ba_arg = arg; + ba.ba_tx = tx; + + err = 0; + for (i = ba.ba_phys->bt_begin; i < ba.ba_phys->bt_end; i++) { + bptree_entry_phys_t bte; + int flags = TRAVERSE_PREFETCH_METADATA | TRAVERSE_POST | + TRAVERSE_NO_DECRYPT; + + err = dmu_read(os, obj, i * sizeof (bte), sizeof (bte), + &bte, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + if (zfs_free_leak_on_eio) + flags |= TRAVERSE_HARD; + zfs_dbgmsg("bptree index %lld: traversing from min_txg=%lld " + "bookmark %lld/%lld/%lld/%lld", + (longlong_t)i, + (longlong_t)bte.be_birth_txg, + (longlong_t)bte.be_zb.zb_objset, + (longlong_t)bte.be_zb.zb_object, + (longlong_t)bte.be_zb.zb_level, + (longlong_t)bte.be_zb.zb_blkid); + err = traverse_dataset_destroyed(os->os_spa, &bte.be_bp, + bte.be_birth_txg, &bte.be_zb, flags, + bptree_visit_cb, &ba); + if (free) { + /* + * The callback has freed the visited block pointers. + * Record our traversal progress on disk, either by + * updating this record's bookmark, or by logically + * removing this record by advancing bt_begin. + */ + if (err != 0) { + /* save bookmark for future resume */ + ASSERT3U(bte.be_zb.zb_objset, ==, + ZB_DESTROYED_OBJSET); + ASSERT0(bte.be_zb.zb_level); + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + if (err == EIO || err == ECKSUM || + err == ENXIO) { + /* + * Skip the rest of this tree and + * continue on to the next entry. + */ + err = 0; + ioerr = B_TRUE; + } else { + break; + } + } else if (ioerr) { + /* + * This entry is finished, but there were + * i/o errors on previous entries, so we + * can't adjust bt_begin. Set this entry's + * be_birth_txg such that it will be + * treated as a no-op in future traversals. + */ + bte.be_birth_txg = UINT64_MAX; + dmu_write(os, obj, i * sizeof (bte), + sizeof (bte), &bte, tx); + } + + if (!ioerr) { + ba.ba_phys->bt_begin++; + (void) dmu_free_range(os, obj, + i * sizeof (bte), sizeof (bte), tx); + } + } else if (err != 0) { + break; + } + } + + ASSERT(!free || err != 0 || ioerr || + ba.ba_phys->bt_begin == ba.ba_phys->bt_end); + + /* if all blocks are free there should be no used space */ + if (ba.ba_phys->bt_begin == ba.ba_phys->bt_end) { + if (zfs_free_leak_on_eio) { + ba.ba_phys->bt_bytes = 0; + ba.ba_phys->bt_comp = 0; + ba.ba_phys->bt_uncomp = 0; + } + + ASSERT0(ba.ba_phys->bt_bytes); + ASSERT0(ba.ba_phys->bt_comp); + ASSERT0(ba.ba_phys->bt_uncomp); + } + + dmu_buf_rele(db, FTAG); + + return (err); +} diff --git a/module/zfs/bqueue.c b/module/zfs/bqueue.c new file mode 100644 index 000000000000..22539efc4e23 --- /dev/null +++ b/module/zfs/bqueue.c @@ -0,0 +1,155 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. 
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2014, 2018 by Delphix. All rights reserved.
+ */
+
+#include
+#include
+
+static inline bqueue_node_t *
+obj2node(bqueue_t *q, void *data)
+{
+ return ((bqueue_node_t *)((char *)data + q->bq_node_offset));
+}
+
+/*
+ * Initialize a blocking queue. The maximum capacity of the queue is set to
+ * size. Types that are stored in a bqueue must contain a bqueue_node_t,
+ * and node_offset must be its offset from the start of the struct.
+ * fill_fraction is a performance tuning value; when the queue is full, any
+ * threads attempting to enqueue records will block. They will block until
+ * they're signaled, which will occur when the queue is at least 1/fill_fraction
+ * empty. Similar behavior occurs on dequeue; if the queue is empty, threads
+ * block. They will be signaled when the queue is 1/fill_fraction full, or
+ * when bqueue_flush is called. As a result, you must call bqueue_flush when
+ * you enqueue your final record on a thread, in case the dequeuing threads are
+ * currently blocked and that enqueue does not cause them to be awoken.
+ * Alternatively, this behavior can be disabled (causing signaling to happen
+ * immediately) by setting fill_fraction to any value larger than size.
+ * Return 0 on success, or -1 on failure.
+ */
+int
+bqueue_init(bqueue_t *q, uint64_t fill_fraction, uint64_t size,
+ size_t node_offset)
+{
+ if (fill_fraction == 0) {
+ return (-1);
+ }
+ list_create(&q->bq_list, node_offset + sizeof (bqueue_node_t),
+ node_offset + offsetof(bqueue_node_t, bqn_node));
+ cv_init(&q->bq_add_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&q->bq_pop_cv, NULL, CV_DEFAULT, NULL);
+ mutex_init(&q->bq_lock, NULL, MUTEX_DEFAULT, NULL);
+ q->bq_node_offset = node_offset;
+ q->bq_size = 0;
+ q->bq_maxsize = size;
+ q->bq_fill_fraction = fill_fraction;
+ return (0);
+}
+
+/*
+ * Destroy a blocking queue. This function asserts that there are no
+ * elements in the queue, and no one is blocked on the condition
+ * variables.
+ */
+void
+bqueue_destroy(bqueue_t *q)
+{
+ mutex_enter(&q->bq_lock);
+ ASSERT0(q->bq_size);
+ cv_destroy(&q->bq_add_cv);
+ cv_destroy(&q->bq_pop_cv);
+ list_destroy(&q->bq_list);
+ mutex_exit(&q->bq_lock);
+ mutex_destroy(&q->bq_lock);
+}
+
+static void
+bqueue_enqueue_impl(bqueue_t *q, void *data, uint64_t item_size,
+ boolean_t flush)
+{
+ ASSERT3U(item_size, >, 0);
+ ASSERT3U(item_size, <=, q->bq_maxsize);
+ mutex_enter(&q->bq_lock);
+ obj2node(q, data)->bqn_size = item_size;
+ while (q->bq_size + item_size > q->bq_maxsize) {
+ cv_wait_sig(&q->bq_add_cv, &q->bq_lock);
+ }
+ q->bq_size += item_size;
+ list_insert_tail(&q->bq_list, data);
+ if (q->bq_size >= q->bq_maxsize / q->bq_fill_fraction)
+ cv_signal(&q->bq_pop_cv);
+ if (flush)
+ cv_broadcast(&q->bq_pop_cv);
+ mutex_exit(&q->bq_lock);
+}
+
+/*
+ * Add data to q, consuming size units of capacity. If there is insufficient
+ * capacity to consume size units, block until capacity exists. Asserts size is
+ * > 0.
+ */
+void
+bqueue_enqueue(bqueue_t *q, void *data, uint64_t item_size)
+{
+ bqueue_enqueue_impl(q, data, item_size, B_FALSE);
+}
+
+/*
+ * Enqueue an entry, and then flush the queue. This forces the popping threads
+ * to wake up, even if we're below the fill fraction. We have this in a single
+ * function, rather than having a separate call, because it prevents a race
+ * between the enqueuing thread and the dequeuing thread, where the
+ * enqueuing thread wakes up the dequeuing thread and that thread then
+ * destroys the condvar before the enqueuing thread is done with it.
+ */
+void
+bqueue_enqueue_flush(bqueue_t *q, void *data, uint64_t item_size)
+{
+ bqueue_enqueue_impl(q, data, item_size, B_TRUE);
+}
+
+/*
+ * Take the first element off of q. If there are no elements on the queue, wait
+ * until one is put there. Return the removed element.
+ */
+void *
+bqueue_dequeue(bqueue_t *q)
+{
+ void *ret = NULL;
+ uint64_t item_size;
+ mutex_enter(&q->bq_lock);
+ while (q->bq_size == 0) {
+ cv_wait_sig(&q->bq_pop_cv, &q->bq_lock);
+ }
+ ret = list_remove_head(&q->bq_list);
+ ASSERT3P(ret, !=, NULL);
+ item_size = obj2node(q, ret)->bqn_size;
+ q->bq_size -= item_size;
+ if (q->bq_size <= q->bq_maxsize - (q->bq_maxsize / q->bq_fill_fraction))
+ cv_signal(&q->bq_add_cv);
+ mutex_exit(&q->bq_lock);
+ return (ret);
+}
+
+/*
+ * Returns true if the space used is 0.
+ */
+boolean_t
+bqueue_empty(bqueue_t *q)
+{
+ return (q->bq_size == 0);
+}
diff --git a/module/zfs/btree.c b/module/zfs/btree.c
new file mode 100644
index 000000000000..57b9dbbb2b50
--- /dev/null
+++ b/module/zfs/btree.c
@@ -0,0 +1,2124 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2019 by Delphix. All rights reserved.
+ */
+
+#include
+#include
+#include
+
+kmem_cache_t *zfs_btree_leaf_cache;
+
+/*
+ * Control the extent of the verification that occurs when zfs_btree_verify is
+ * called. Primarily used for debugging when extending the btree logic and
+ * functionality. As the intensity is increased, new verification steps are
+ * added. These steps are cumulative; intensity = 3 includes the intensity = 1
+ * and intensity = 2 steps as well.
+ *
+ * Intensity 1: Verify that the tree's height is consistent throughout.
+ * Intensity 2: Verify that a core node's children's parent pointers point
+ * to the core node.
+ * Intensity 3: Verify that the total number of elements in the tree matches the
+ * sum of the number of elements in each node. Also verifies that each node's
+ * count obeys the invariants (less than or equal to maximum value, greater than
+ * or equal to half the maximum minus one).
+ * Intensity 4: Verify that each element compares less than the element
+ * immediately after it and greater than the one immediately before it using the
+ * comparator function. For core nodes, also checks that each element is greater
+ * than the last element in the first of the two nodes it separates, and less
+ * than the first element in the second of the two nodes.
+ * Intensity 5: Verifies, if ZFS_DEBUG is defined, that all unused memory inside
+ * of each node is poisoned appropriately. Note that poisoning always occurs if
+ * ZFS_DEBUG is set, so it is safe to set the intensity to 5 during normal
+ * operation.
+ *
+ * Intensity 4 and 5 are particularly expensive to perform; the previous levels
+ * are a few memory operations per node, while these levels require multiple
+ * operations per element. In addition, when creating large btrees, these
+ * operations are called at every step, resulting in extremely slow operation
+ * (while the asymptotic complexity of the other steps is the same, the
+ * importance of the constant factors cannot be denied).
+ */
+int zfs_btree_verify_intensity = 0;
+
+/*
+ * A convenience function to silence warnings from memmove's return value and
+ * change argument order to src, dest.
+ */
+static void
+bmov(const void *src, void *dest, size_t size)
+{
+ (void) memmove(dest, src, size);
+}
+
+#ifdef _ILP32
+#define BTREE_POISON 0xabadb10c
+#else
+#define BTREE_POISON 0xabadb10cdeadbeef
+#endif
+
+static void
+zfs_btree_poison_node(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + hdr->bth_count * size, 0x0f,
+ BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t) -
+ hdr->bth_count * size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) {
+ node->btc_children[i] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ }
+ (void) memset(node->btc_elems + hdr->bth_count * size, 0x0f,
+ (BTREE_CORE_ELEMS - hdr->bth_count) * size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_poison_node_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ ASSERT3U(offset, >=, hdr->bth_count);
+ if (!hdr->bth_core) {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ (void) memset(leaf->btl_elems + offset * size, 0x0f, size);
+ } else {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ node->btc_children[offset + 1] =
+ (zfs_btree_hdr_t *)BTREE_POISON;
+ (void) memset(node->btc_elems + offset * size, 0x0f, size);
+ }
+#endif
+}
+
+static inline void
+zfs_btree_verify_poison_at(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+ uint64_t offset)
+{
+#ifdef ZFS_DEBUG
+ size_t size = tree->bt_elem_size;
+ uint8_t eval = 0x0f;
+ if (hdr->bth_core) {
+ zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+ zfs_btree_hdr_t *cval = (zfs_btree_hdr_t *)BTREE_POISON;
+ VERIFY3P(node->btc_children[offset + 1], ==, cval);
+ for (int i = 0; i < size; i++)
+ VERIFY3U(node->btc_elems[offset * size + i], ==, eval);
+ } else {
+ zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+ for (int i = 0; i < size; i++)
+ VERIFY3U(leaf->btl_elems[offset * size + i], ==, eval);
+ }
+#endif
+}
+
+void
+zfs_btree_init(void)
+{
+ zfs_btree_leaf_cache = kmem_cache_create("zfs_btree_leaf_cache",
+ BTREE_LEAF_SIZE, 0, NULL, NULL, NULL, NULL,
+ NULL, 0);
+}
+
+void
+zfs_btree_fini(void)
+{
+ kmem_cache_destroy(zfs_btree_leaf_cache);
+}
+
+void
+zfs_btree_create(zfs_btree_t *tree, int (*compar) (const void *, const void *),
+ size_t size)
+{
+ /*
+ * We need a minimum of 4 elements so that when we split a node we
+ * always have at least two elements in each node. This simplifies the
+ * logic in zfs_btree_bulk_finish, since it means the last leaf will
+ * always have a left sibling to share with (unless it's the root).
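+ * (Hence the ASSERT below: four elements plus the header must fit in a
+ * single BTREE_LEAF_SIZE leaf.)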
+ */ + ASSERT3U(size, <=, (BTREE_LEAF_SIZE - sizeof (zfs_btree_hdr_t)) / 4); + + bzero(tree, sizeof (*tree)); + tree->bt_compar = compar; + tree->bt_elem_size = size; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +/* + * Find value in the array of elements provided. Uses a simple binary search. + */ +static void * +zfs_btree_find_in_buf(zfs_btree_t *tree, uint8_t *buf, uint64_t nelems, + const void *value, zfs_btree_index_t *where) +{ + uint64_t max = nelems; + uint64_t min = 0; + while (max > min) { + uint64_t idx = (min + max) / 2; + uint8_t *cur = buf + idx * tree->bt_elem_size; + int comp = tree->bt_compar(cur, value); + if (comp == -1) { + min = idx + 1; + } else if (comp == 1) { + max = idx; + } else { + ASSERT0(comp); + where->bti_offset = idx; + where->bti_before = B_FALSE; + return (cur); + } + } + + where->bti_offset = max; + where->bti_before = B_TRUE; + return (NULL); +} + +/* + * Find the given value in the tree. where may be passed as null to use as a + * membership test or if the btree is being used as a map. + */ +void * +zfs_btree_find(zfs_btree_t *tree, const void *value, zfs_btree_index_t *where) +{ + if (tree->bt_height == -1) { + if (where != NULL) { + where->bti_node = NULL; + where->bti_offset = 0; + } + ASSERT0(tree->bt_num_elems); + return (NULL); + } + + /* + * If we're in bulk-insert mode, we check the last spot in the tree + * and the last leaf in the tree before doing the normal search, + * because for most workloads the vast majority of finds in + * bulk-insert mode are to insert new elements. + */ + zfs_btree_index_t idx; + if (tree->bt_bulk != NULL) { + zfs_btree_leaf_t *last_leaf = tree->bt_bulk; + int compar = tree->bt_compar(last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * tree->bt_elem_size), + value); + if (compar < 0) { + /* + * If what they're looking for is after the last + * element, it's not in the tree. + */ + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count; + where->bti_before = B_TRUE; + } + return (NULL); + } else if (compar == 0) { + if (where != NULL) { + where->bti_node = (zfs_btree_hdr_t *)last_leaf; + where->bti_offset = + last_leaf->btl_hdr.bth_count - 1; + where->bti_before = B_FALSE; + } + return (last_leaf->btl_elems + + ((last_leaf->btl_hdr.bth_count - 1) * + tree->bt_elem_size)); + } + if (tree->bt_compar(last_leaf->btl_elems, value) <= 0) { + /* + * If what they're looking for is after the first + * element in the last leaf, it's in the last leaf or + * it's not in the tree. + */ + void *d = zfs_btree_find_in_buf(tree, + last_leaf->btl_elems, last_leaf->btl_hdr.bth_count, + value, &idx); + + if (where != NULL) { + idx.bti_node = (zfs_btree_hdr_t *)last_leaf; + *where = idx; + } + return (d); + } + } + + zfs_btree_core_t *node = NULL; + uint64_t child = 0; + uint64_t depth = 0; + + /* + * Iterate down the tree, finding which child the value should be in + * by comparing with the separators. 
+	 */
+	for (node = (zfs_btree_core_t *)tree->bt_root; depth < tree->bt_height;
+	    node = (zfs_btree_core_t *)node->btc_children[child], depth++) {
+		ASSERT3P(node, !=, NULL);
+		void *d = zfs_btree_find_in_buf(tree, node->btc_elems,
+		    node->btc_hdr.bth_count, value, &idx);
+		EQUIV(d != NULL, !idx.bti_before);
+		if (d != NULL) {
+			if (where != NULL) {
+				idx.bti_node = (zfs_btree_hdr_t *)node;
+				*where = idx;
+			}
+			return (d);
+		}
+		ASSERT(idx.bti_before);
+		child = idx.bti_offset;
+	}
+
+	/*
+	 * The value is in this leaf, or it would be if it were in the
+	 * tree. Find its proper location and return it.
+	 */
+	zfs_btree_leaf_t *leaf = (depth == 0 ?
+	    (zfs_btree_leaf_t *)tree->bt_root : (zfs_btree_leaf_t *)node);
+	void *d = zfs_btree_find_in_buf(tree, leaf->btl_elems,
+	    leaf->btl_hdr.bth_count, value, &idx);
+
+	if (where != NULL) {
+		idx.bti_node = (zfs_btree_hdr_t *)leaf;
+		*where = idx;
+	}
+
+	return (d);
+}
+
+/*
+ * To explain the following functions, it is useful to understand the four
+ * kinds of shifts used in btree operation. First, a shift is a movement of
+ * elements within a node. It is used to create gaps for inserting new
+ * elements and children, or cover gaps created when things are removed. A
+ * shift has two fundamental properties, each of which can be one of two
+ * values, making four types of shifts. There is the direction of the shift
+ * (left or right) and the shape of the shift (parallelogram or isosceles
+ * trapezoid (shortened to trapezoid hereafter)). The shape distinction only
+ * applies to shifts of core nodes.
+ *
+ * The names derive from the following imagining of the layout of a node:
+ *
+ * Elements:    * * * * * * * ... * * *
+ * Children: * * * * * * * * ... * * *
+ *
+ * This layout follows from the fact that the elements act as separators
+ * between pairs of children, and that children root subtrees "below" the
+ * current node. A left and right shift are fairly self-explanatory; a left
+ * shift moves things to the left, while a right shift moves things to the
+ * right. A parallelogram shift is a shift with the same number of elements
+ * and children being moved, while a trapezoid shift is a shift that moves
+ * one more child than it does elements. An example follows:
+ *
+ * A parallelogram shift could contain the following:
+ *      _______________
+ *      \*   *   *   * \ *   *   *   ...  *   *   *
+ *     * \ *   *   *   *\  *   *   *  ...  *   *   *
+ *      ---------------
+ * A trapezoid shift could contain the following:
+ *        ___________
+ *      * / *   *   * \ *   *   *   ...  *   *   *
+ *     * / *   *   *   *\  *   *   *  ...  *   *   *
+ *      ---------------
+ *
+ * Note that a parallelogram shift is always shaped like a "left-leaning"
+ * parallelogram, where the starting index of the children being moved is
+ * always one higher than the starting index of the elements being moved. No
+ * "right-leaning" parallelogram shifts are needed (shifts where the starting
+ * element index and starting child index being moved are the same) to achieve
+ * any btree operations, so we ignore them.
+ */
+
+enum bt_shift_shape {
+	BSS_TRAPEZOID,
+	BSS_PARALLELOGRAM
+};
+
+enum bt_shift_direction {
+	BSD_LEFT,
+	BSD_RIGHT
+};
+
+/*
+ * Shift elements and children in the provided core node by off spots. The
+ * first element moved is idx, and count elements are moved. The shape of the
+ * shift is determined by shape. The direction is determined by dir.
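+ *
+ * As a concrete illustration (the numbers are hypothetical): a right
+ * trapezoid shift with idx = 2, count = 3 and off = 1 moves elements
+ * 2..4 into slots 3..5 and children 2..5 into slots 3..6, while a
+ * parallelogram shift with the same arguments moves children 3..5 into
+ * slots 4..6 instead.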
+ */
+static inline void
+bt_shift_core(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+    uint64_t count, uint64_t off, enum bt_shift_shape shape,
+    enum bt_shift_direction dir)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(node->btc_hdr.bth_core);
+
+	uint8_t *e_start = node->btc_elems + idx * size;
+	int sign = (dir == BSD_LEFT ? -1 : +1);
+	uint8_t *e_out = e_start + sign * off * size;
+	uint64_t e_count = count;
+	bmov(e_start, e_out, e_count * size);
+
+	zfs_btree_hdr_t **c_start = node->btc_children + idx +
+	    (shape == BSS_TRAPEZOID ? 0 : 1);
+	zfs_btree_hdr_t **c_out = (dir == BSD_LEFT ? c_start - off :
+	    c_start + off);
+	uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+	bmov(c_start, c_out, c_count * sizeof (*c_start));
+}
+
+/*
+ * Shift elements and children in the provided core node left by one spot.
+ * The first element moved is idx, and count elements are moved. The shape
+ * of the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_left(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+    uint64_t count, enum bt_shift_shape shape)
+{
+	bt_shift_core(tree, node, idx, count, 1, shape, BSD_LEFT);
+}
+
+/*
+ * Shift elements and children in the provided core node right by one spot.
+ * The first element moved is idx, and count elements are moved. The shape
+ * of the shift is determined by shape.
+ */
+static inline void
+bt_shift_core_right(zfs_btree_t *tree, zfs_btree_core_t *node, uint64_t idx,
+    uint64_t count, enum bt_shift_shape shape)
+{
+	bt_shift_core(tree, node, idx, count, 1, shape, BSD_RIGHT);
+}
+
+/*
+ * Shift elements in the provided leaf node by off spots (leaf nodes have no
+ * children). The first element moved is idx, and count elements are moved.
+ * The direction is determined by dir.
+ */
+static inline void
+bt_shift_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *node, uint64_t idx,
+    uint64_t count, uint64_t off, enum bt_shift_direction dir)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(!node->btl_hdr.bth_core);
+
+	uint8_t *start = node->btl_elems + idx * size;
+	int sign = (dir == BSD_LEFT ? -1 : +1);
+	uint8_t *out = start + sign * off * size;
+	bmov(start, out, count * size);
+}
+
+static inline void
+bt_shift_leaf_right(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+    uint64_t count)
+{
+	bt_shift_leaf(tree, leaf, idx, count, 1, BSD_RIGHT);
+}
+
+static inline void
+bt_shift_leaf_left(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, uint64_t idx,
+    uint64_t count)
+{
+	bt_shift_leaf(tree, leaf, idx, count, 1, BSD_LEFT);
+}
+
+/*
+ * Move children and elements from one core node to another. The shape
+ * parameter behaves the same as it does in the shift logic.
+ */
+static inline void
+bt_transfer_core(zfs_btree_t *tree, zfs_btree_core_t *source, uint64_t sidx,
+    uint64_t count, zfs_btree_core_t *dest, uint64_t didx,
+    enum bt_shift_shape shape)
+{
+	size_t size = tree->bt_elem_size;
+	ASSERT(source->btc_hdr.bth_core);
+	ASSERT(dest->btc_hdr.bth_core);
+
+	bmov(source->btc_elems + sidx * size, dest->btc_elems + didx * size,
+	    count * size);
+
+	uint64_t c_count = count + (shape == BSS_TRAPEZOID ? 1 : 0);
+	bmov(source->btc_children + sidx + (shape == BSS_TRAPEZOID ? 0 : 1),
+	    dest->btc_children + didx + (shape == BSS_TRAPEZOID ?
0 : 1), + c_count * sizeof (*source->btc_children)); +} + +static inline void +bt_transfer_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *source, uint64_t sidx, + uint64_t count, zfs_btree_leaf_t *dest, uint64_t didx) +{ + size_t size = tree->bt_elem_size; + ASSERT(!source->btl_hdr.bth_core); + ASSERT(!dest->btl_hdr.bth_core); + + bmov(source->btl_elems + sidx * size, dest->btl_elems + didx * size, + count * size); +} + +/* + * Find the first element in the subtree rooted at hdr, return its value and + * put its location in where if non-null. + */ +static void * +zfs_btree_first_helper(zfs_btree_hdr_t *hdr, zfs_btree_index_t *where) +{ + zfs_btree_hdr_t *node; + + for (node = hdr; node->bth_core; node = + ((zfs_btree_core_t *)node)->btc_children[0]) + ; + + ASSERT(!node->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node; + if (where != NULL) { + where->bti_node = node; + where->bti_offset = 0; + where->bti_before = B_FALSE; + } + return (&leaf->btl_elems[0]); +} + +/* Insert an element and a child into a core node at the given offset. */ +static void +zfs_btree_insert_core_impl(zfs_btree_t *tree, zfs_btree_core_t *parent, + uint64_t offset, zfs_btree_hdr_t *new_node, void *buf) +{ + uint64_t size = tree->bt_elem_size; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + ASSERT3P(par_hdr, ==, new_node->bth_parent); + ASSERT3U(par_hdr->bth_count, <, BTREE_CORE_ELEMS); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, par_hdr, + par_hdr->bth_count); + } + /* Shift existing elements and children */ + uint64_t count = par_hdr->bth_count - offset; + bt_shift_core_right(tree, parent, offset, count, + BSS_PARALLELOGRAM); + + /* Insert new values */ + parent->btc_children[offset + 1] = new_node; + bmov(buf, parent->btc_elems + offset * size, size); + par_hdr->bth_count++; +} + +/* + * Insert new_node into the parent of old_node directly after old_node, with + * buf as the dividing element between the two. + */ +static void +zfs_btree_insert_into_parent(zfs_btree_t *tree, zfs_btree_hdr_t *old_node, + zfs_btree_hdr_t *new_node, void *buf) +{ + ASSERT3P(old_node->bth_parent, ==, new_node->bth_parent); + uint64_t size = tree->bt_elem_size; + zfs_btree_core_t *parent = old_node->bth_parent; + zfs_btree_hdr_t *par_hdr = &parent->btc_hdr; + + /* + * If this is the root node we were splitting, we create a new root + * and increase the height of the tree. + */ + if (parent == NULL) { + ASSERT3P(old_node, ==, tree->bt_root); + tree->bt_num_nodes++; + zfs_btree_core_t *new_root = + kmem_alloc(sizeof (zfs_btree_core_t) + BTREE_CORE_ELEMS * + size, KM_SLEEP); + zfs_btree_hdr_t *new_root_hdr = &new_root->btc_hdr; + new_root_hdr->bth_parent = NULL; + new_root_hdr->bth_core = B_TRUE; + new_root_hdr->bth_count = 1; + + old_node->bth_parent = new_node->bth_parent = new_root; + new_root->btc_children[0] = old_node; + new_root->btc_children[1] = new_node; + bmov(buf, new_root->btc_elems, size); + + tree->bt_height++; + tree->bt_root = new_root_hdr; + zfs_btree_poison_node(tree, new_root_hdr); + return; + } + + /* + * Since we have the new separator, binary search for where to put + * new_node. 
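+	 * The search is expected to miss: buf was just lifted out of a
+	 * child, so it cannot also appear in the parent. We only need the
+	 * insertion offset that the search reports.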
+ */ + zfs_btree_index_t idx; + ASSERT(par_hdr->bth_core); + VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems, + par_hdr->bth_count, buf, &idx), ==, NULL); + ASSERT(idx.bti_before); + uint64_t offset = idx.bti_offset; + ASSERT3U(offset, <=, par_hdr->bth_count); + ASSERT3P(parent->btc_children[offset], ==, old_node); + + /* + * If the parent isn't full, shift things to accommodate our insertions + * and return. + */ + if (par_hdr->bth_count != BTREE_CORE_ELEMS) { + zfs_btree_insert_core_impl(tree, parent, offset, new_node, buf); + return; + } + + /* + * We need to split this core node into two. Currently there are + * BTREE_CORE_ELEMS + 1 child nodes, and we are adding one for + * BTREE_CORE_ELEMS + 2. Some of the children will be part of the + * current node, and the others will be moved to the new core node. + * There are BTREE_CORE_ELEMS + 1 elements including the new one. One + * will be used as the new separator in our parent, and the others + * will be split among the two core nodes. + * + * Usually we will split the node in half evenly, with + * BTREE_CORE_ELEMS/2 elements in each node. If we're bulk loading, we + * instead move only about a quarter of the elements (and children) to + * the new node. Since the average state after a long time is a 3/4 + * full node, shortcutting directly to that state improves efficiency. + * + * We do this in two stages: first we split into two nodes, and then we + * reuse our existing logic to insert the new element and child. + */ + uint64_t move_count = MAX((BTREE_CORE_ELEMS / (tree->bt_bulk == NULL ? + 2 : 4)) - 1, 2); + uint64_t keep_count = BTREE_CORE_ELEMS - move_count - 1; + ASSERT3U(BTREE_CORE_ELEMS - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_core_t *new_parent = kmem_alloc(sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * size, KM_SLEEP); + zfs_btree_hdr_t *new_par_hdr = &new_parent->btc_hdr; + new_par_hdr->bth_parent = par_hdr->bth_parent; + new_par_hdr->bth_core = B_TRUE; + new_par_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_par_hdr); + + par_hdr->bth_count = keep_count; + + bt_transfer_core(tree, parent, keep_count + 1, move_count, new_parent, + 0, BSS_TRAPEZOID); + + /* Store the new separator in a buffer. */ + uint8_t *tmp_buf = kmem_alloc(size, KM_SLEEP); + bmov(parent->btc_elems + keep_count * size, tmp_buf, + size); + zfs_btree_poison_node(tree, par_hdr); + + if (offset < keep_count) { + /* Insert the new node into the left half */ + zfs_btree_insert_core_impl(tree, parent, offset, new_node, + buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else if (offset > keep_count) { + /* Insert the new node into the right half */ + new_node->bth_parent = new_parent; + zfs_btree_insert_core_impl(tree, new_parent, + offset - keep_count - 1, new_node, buf); + + /* + * Move the new separator to the existing buffer. + */ + bmov(tmp_buf, buf, size); + } else { + /* + * Move the new separator into the right half, and replace it + * with buf. We also need to shift back the elements in the + * right half to accommodate new_node. 
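+		 * After the shift, new_node becomes the first child of
+		 * new_parent and the saved separator becomes its first
+		 * element.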
+ */ + bt_shift_core_right(tree, new_parent, 0, move_count, + BSS_TRAPEZOID); + new_parent->btc_children[0] = new_node; + bmov(tmp_buf, new_parent->btc_elems, size); + new_par_hdr->bth_count++; + } + kmem_free(tmp_buf, size); + zfs_btree_poison_node(tree, par_hdr); + + for (int i = 0; i <= new_parent->btc_hdr.bth_count; i++) + new_parent->btc_children[i]->bth_parent = new_parent; + + for (int i = 0; i <= parent->btc_hdr.bth_count; i++) + ASSERT3P(parent->btc_children[i]->bth_parent, ==, parent); + + /* + * Now that the node is split, we need to insert the new node into its + * parent. This may cause further splitting. + */ + zfs_btree_insert_into_parent(tree, &parent->btc_hdr, + &new_parent->btc_hdr, buf); +} + +/* Insert an element into a leaf node at the given offset. */ +static void +zfs_btree_insert_leaf_impl(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + uint64_t idx, const void *value) +{ + uint64_t size = tree->bt_elem_size; + uint8_t *start = leaf->btl_elems + (idx * size); + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + uint64_t capacity __maybe_unused = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + uint64_t count = leaf->btl_hdr.bth_count - idx; + ASSERT3U(leaf->btl_hdr.bth_count, <, capacity); + + if (zfs_btree_verify_intensity >= 5) { + zfs_btree_verify_poison_at(tree, &leaf->btl_hdr, + leaf->btl_hdr.bth_count); + } + + bt_shift_leaf_right(tree, leaf, idx, count); + bmov(value, start, size); + hdr->bth_count++; +} + +/* Helper function for inserting a new value into leaf at the given index. */ +static void +zfs_btree_insert_into_leaf(zfs_btree_t *tree, zfs_btree_leaf_t *leaf, + const void *value, uint64_t idx) +{ + uint64_t size = tree->bt_elem_size; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + /* + * If the leaf isn't full, shift the elements after idx and insert + * value. + */ + if (leaf->btl_hdr.bth_count != capacity) { + zfs_btree_insert_leaf_impl(tree, leaf, idx, value); + return; + } + + /* + * Otherwise, we split the leaf node into two nodes. If we're not bulk + * inserting, each is of size (capacity / 2). If we are bulk + * inserting, we move a quarter of the elements to the new node so + * inserts into the old node don't cause immediate splitting but the + * tree stays relatively dense. Since the average state after a long + * time is a 3/4 full node, shortcutting directly to that state + * improves efficiency. At the end of the bulk insertion process + * we'll need to go through and fix up any nodes (the last leaf and + * its ancestors, potentially) that are below the minimum. + * + * In either case, we're left with one extra element. The leftover + * element will become the new dividing element between the two nodes. + */ + uint64_t move_count = MAX(capacity / (tree->bt_bulk == NULL ? 2 : 4) - + 1, 2); + uint64_t keep_count = capacity - move_count - 1; + ASSERT3U(capacity - move_count, >=, 2); + tree->bt_num_nodes++; + zfs_btree_leaf_t *new_leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + zfs_btree_hdr_t *new_hdr = &new_leaf->btl_hdr; + new_hdr->bth_parent = leaf->btl_hdr.bth_parent; + new_hdr->bth_core = B_FALSE; + new_hdr->bth_count = move_count; + zfs_btree_poison_node(tree, new_hdr); + + leaf->btl_hdr.bth_count = keep_count; + + if (tree->bt_bulk != NULL && leaf == tree->bt_bulk) + tree->bt_bulk = new_leaf; + + /* Copy the back part to the new leaf. 
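+	 * (For example, assuming a leaf capacity of 512 elements, a
+	 * bulk-insert split keeps keep_count = 384 elements here and moves
+	 * move_count = 127 to the new leaf; the one element left over
+	 * becomes the new separator.)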
+	 */
+	bt_transfer_leaf(tree, leaf, keep_count + 1, move_count, new_leaf,
+	    0);
+
+	/* We store the new separator in a buffer we control for simplicity. */
+	uint8_t *buf = kmem_alloc(size, KM_SLEEP);
+	bmov(leaf->btl_elems + (keep_count * size), buf, size);
+	zfs_btree_poison_node(tree, &leaf->btl_hdr);
+
+	if (idx < keep_count) {
+		/* Insert into the existing leaf. */
+		zfs_btree_insert_leaf_impl(tree, leaf, idx, value);
+	} else if (idx > keep_count) {
+		/* Insert into the new leaf. */
+		zfs_btree_insert_leaf_impl(tree, new_leaf, idx - keep_count -
+		    1, value);
+	} else {
+		/*
+		 * Shift the elements in the new leaf to make room for the
+		 * separator, and use the new value as the new separator.
+		 */
+		bt_shift_leaf_right(tree, new_leaf, 0, move_count);
+		bmov(buf, new_leaf->btl_elems, size);
+		bmov(value, buf, size);
+		new_hdr->bth_count++;
+	}
+
+	/*
+	 * Now that the node is split, we need to insert the new node into its
+	 * parent. This may cause further splitting, but only of core nodes.
+	 */
+	zfs_btree_insert_into_parent(tree, &leaf->btl_hdr, &new_leaf->btl_hdr,
+	    buf);
+	kmem_free(buf, size);
+}
+
+static uint64_t
+zfs_btree_find_parent_idx(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	void *buf;
+	if (hdr->bth_core) {
+		buf = ((zfs_btree_core_t *)hdr)->btc_elems;
+	} else {
+		buf = ((zfs_btree_leaf_t *)hdr)->btl_elems;
+	}
+	zfs_btree_index_t idx;
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	VERIFY3P(zfs_btree_find_in_buf(tree, parent->btc_elems,
+	    parent->btc_hdr.bth_count, buf, &idx), ==, NULL);
+	ASSERT(idx.bti_before);
+	ASSERT3U(idx.bti_offset, <=, parent->btc_hdr.bth_count);
+	ASSERT3P(parent->btc_children[idx.bti_offset], ==, hdr);
+	return (idx.bti_offset);
+}
+
+/*
+ * Take the b-tree out of bulk insert mode. During bulk-insert mode, some
+ * nodes may violate the invariant that non-root nodes must be at least half
+ * full. All nodes violating this invariant should be the last node in their
+ * particular level. To correct the invariant, we take values from their left
+ * neighbor until they are half full. They must have a left neighbor at their
+ * level because the last node at a level is not the first node unless it's
+ * the root.
+ */
+static void
+zfs_btree_bulk_finish(zfs_btree_t *tree)
+{
+	ASSERT3P(tree->bt_bulk, !=, NULL);
+	ASSERT3P(tree->bt_root, !=, NULL);
+	zfs_btree_leaf_t *leaf = tree->bt_bulk;
+	zfs_btree_hdr_t *hdr = &leaf->btl_hdr;
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t size = tree->bt_elem_size;
+	uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE -
+	    sizeof (zfs_btree_hdr_t)) / size, 2);
+
+	/*
+	 * The invariant doesn't apply to the root node; if that's the only
+	 * node in the tree, we're done.
+	 */
+	if (parent == NULL) {
+		tree->bt_bulk = NULL;
+		return;
+	}
+
+	/* First, take elements to rebalance the leaf node. */
+	if (hdr->bth_count < capacity / 2) {
+		/*
+		 * First, find the left neighbor. The simplest way to do this
+		 * is to call zfs_btree_prev twice; the first time finds some
+		 * ancestor of this node, and the second time finds the left
+		 * neighbor. The ancestor found is the lowest common ancestor
+		 * of leaf and the neighbor.
+ */ + zfs_btree_index_t idx = { + .bti_node = hdr, + .bti_offset = 0 + }; + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(idx.bti_node->bth_core); + zfs_btree_core_t *common = (zfs_btree_core_t *)idx.bti_node; + uint64_t common_idx = idx.bti_offset; + + VERIFY3P(zfs_btree_prev(tree, &idx, &idx), !=, NULL); + ASSERT(!idx.bti_node->bth_core); + zfs_btree_leaf_t *l_neighbor = (zfs_btree_leaf_t *)idx.bti_node; + zfs_btree_hdr_t *l_hdr = idx.bti_node; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btl_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + leaf->btl_hdr.bth_count + i); + } + } + + /* First, shift elements in leaf back. */ + bt_shift_leaf(tree, leaf, 0, hdr->bth_count, move_count, + BSD_RIGHT); + + /* Next, move the separator from the common ancestor to leaf. */ + uint8_t *separator = common->btc_elems + (common_idx * size); + uint8_t *out = leaf->btl_elems + ((move_count - 1) * size); + bmov(separator, out, size); + move_count--; + + /* + * Now we move elements from the tail of the left neighbor to + * fill the remaining spots in leaf. + */ + bt_transfer_leaf(tree, l_neighbor, l_hdr->bth_count - + move_count, move_count, leaf, 0); + + /* + * Finally, move the new last element in the left neighbor to + * the separator. + */ + bmov(l_neighbor->btl_elems + (l_hdr->bth_count - + move_count - 1) * size, separator, size); + + /* Adjust the node's counts, and we're done. */ + l_hdr->bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_hdr->bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + zfs_btree_poison_node(tree, l_hdr); + } + + /* + * Now we have to rebalance any ancestors of leaf that may also + * violate the invariant. + */ + capacity = BTREE_CORE_ELEMS; + while (parent->btc_hdr.bth_parent != NULL) { + zfs_btree_core_t *cur = parent; + zfs_btree_hdr_t *hdr = &cur->btc_hdr; + parent = hdr->bth_parent; + /* + * If the invariant isn't violated, move on to the next + * ancestor. + */ + if (hdr->bth_count >= capacity / 2) + continue; + + /* + * Because the smallest number of nodes we can move when + * splitting is 2, we never need to worry about not having a + * left sibling (a sibling is a neighbor with the same parent). + */ + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + ASSERT3U(parent_idx, >, 0); + zfs_btree_core_t *l_neighbor = + (zfs_btree_core_t *)parent->btc_children[parent_idx - 1]; + uint64_t move_count = (capacity / 2) - hdr->bth_count; + ASSERT3U(l_neighbor->btc_hdr.bth_count - move_count, >=, + capacity / 2); + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < move_count; i++) { + zfs_btree_verify_poison_at(tree, hdr, + hdr->bth_count + i); + } + } + /* First, shift things in the right node back. */ + bt_shift_core(tree, cur, 0, hdr->bth_count, move_count, + BSS_TRAPEZOID, BSD_RIGHT); + + /* Next, move the separator to the right node. */ + uint8_t *separator = parent->btc_elems + ((parent_idx - 1) * + size); + uint8_t *e_out = cur->btc_elems + ((move_count - 1) * size); + bmov(separator, e_out, size); + + /* + * Now, move elements and children from the left node to the + * right. We move one more child than elements. 
+ */ + move_count--; + uint64_t move_idx = l_neighbor->btc_hdr.bth_count - move_count; + bt_transfer_core(tree, l_neighbor, move_idx, move_count, cur, 0, + BSS_TRAPEZOID); + + /* + * Finally, move the last element in the left node to the + * separator's position. + */ + move_idx--; + bmov(l_neighbor->btc_elems + move_idx * size, separator, size); + + l_neighbor->btc_hdr.bth_count -= move_count + 1; + hdr->bth_count += move_count + 1; + + ASSERT3U(l_neighbor->btc_hdr.bth_count, >=, capacity / 2); + ASSERT3U(hdr->bth_count, >=, capacity / 2); + + zfs_btree_poison_node(tree, &l_neighbor->btc_hdr); + + for (int i = 0; i <= hdr->bth_count; i++) + cur->btc_children[i]->bth_parent = cur; + } + + tree->bt_bulk = NULL; +} + +/* + * Insert value into tree at the location specified by where. + */ +void +zfs_btree_add_idx(zfs_btree_t *tree, const void *value, + const zfs_btree_index_t *where) +{ + zfs_btree_index_t idx = {0}; + + /* If we're not inserting in the last leaf, end bulk insert mode. */ + if (tree->bt_bulk != NULL) { + if (where->bti_node != &tree->bt_bulk->btl_hdr) { + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, value, &idx), ==, NULL); + where = &idx; + } + } + + tree->bt_num_elems++; + /* + * If this is the first element in the tree, create a leaf root node + * and add the value to it. + */ + if (where->bti_node == NULL) { + ASSERT3U(tree->bt_num_elems, ==, 1); + ASSERT3S(tree->bt_height, ==, -1); + ASSERT3P(tree->bt_root, ==, NULL); + ASSERT0(where->bti_offset); + + tree->bt_num_nodes++; + zfs_btree_leaf_t *leaf = kmem_cache_alloc(zfs_btree_leaf_cache, + KM_SLEEP); + tree->bt_root = &leaf->btl_hdr; + tree->bt_height++; + + zfs_btree_hdr_t *hdr = &leaf->btl_hdr; + hdr->bth_parent = NULL; + hdr->bth_core = B_FALSE; + hdr->bth_count = 0; + zfs_btree_poison_node(tree, hdr); + + zfs_btree_insert_into_leaf(tree, leaf, value, 0); + tree->bt_bulk = leaf; + } else if (!where->bti_node->bth_core) { + /* + * If we're inserting into a leaf, go directly to the helper + * function. + */ + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)where->bti_node, value, + where->bti_offset); + } else { + /* + * If we're inserting into a core node, we can't just shift + * the existing element in that slot in the same node without + * breaking our ordering invariants. Instead we place the new + * value in the node at that spot and then insert the old + * separator into the first slot in the subtree to the right. + */ + ASSERT(where->bti_node->bth_core); + zfs_btree_core_t *node = (zfs_btree_core_t *)where->bti_node; + + /* + * We can ignore bti_before, because either way the value + * should end up in bti_offset. + */ + uint64_t off = where->bti_offset; + zfs_btree_hdr_t *subtree = node->btc_children[off + 1]; + size_t size = tree->bt_elem_size; + uint8_t *buf = kmem_alloc(size, KM_SLEEP); + bmov(node->btc_elems + off * size, buf, size); + bmov(value, node->btc_elems + off * size, size); + + /* + * Find the first slot in the subtree to the right, insert + * there. + */ + zfs_btree_index_t new_idx; + VERIFY3P(zfs_btree_first_helper(subtree, &new_idx), !=, NULL); + ASSERT0(new_idx.bti_offset); + ASSERT(!new_idx.bti_node->bth_core); + zfs_btree_insert_into_leaf(tree, + (zfs_btree_leaf_t *)new_idx.bti_node, buf, 0); + kmem_free(buf, size); + } + zfs_btree_verify(tree); +} + +/* + * Return the first element in the tree, and put its location in where if + * non-null. 
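+ *
+ * example (my_elem_t and process() are hypothetical; iteration order
+ * follows bt_compar):
+ *
+ *	zfs_btree_index_t where;
+ *	my_elem_t *elem;
+ *
+ *	for (elem = zfs_btree_first(tree, &where); elem != NULL;
+ *	    elem = zfs_btree_next(tree, &where, &where))
+ *		process(elem);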
+ */
+void *
+zfs_btree_first(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_first_helper(tree->bt_root, where));
+}
+
+/*
+ * Find the last element in the subtree rooted at hdr, return its value and
+ * put its location in where if non-null.
+ */
+static void *
+zfs_btree_last_helper(zfs_btree_t *btree, zfs_btree_hdr_t *hdr,
+    zfs_btree_index_t *where)
+{
+	zfs_btree_hdr_t *node;
+
+	for (node = hdr; node->bth_core; node =
+	    ((zfs_btree_core_t *)node)->btc_children[node->bth_count])
+		;
+
+	zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)node;
+	if (where != NULL) {
+		where->bti_node = node;
+		where->bti_offset = node->bth_count - 1;
+		where->bti_before = B_FALSE;
+	}
+	return (leaf->btl_elems + (node->bth_count - 1) * btree->bt_elem_size);
+}
+
+/*
+ * Return the last element in the tree, and put its location in where if
+ * non-null.
+ */
+void *
+zfs_btree_last(zfs_btree_t *tree, zfs_btree_index_t *where)
+{
+	if (tree->bt_height == -1) {
+		ASSERT0(tree->bt_num_elems);
+		return (NULL);
+	}
+	return (zfs_btree_last_helper(tree, tree->bt_root, where));
+}
+
+/*
+ * This function contains the logic to find the next node in the tree. A
+ * helper function is used because there are multiple internal consumers of
+ * this logic. The done_func is used by zfs_btree_destroy_nodes to clean up
+ * each node after we've finished with it.
+ */
+static void *
+zfs_btree_next_helper(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx,
+    void (*done_func)(zfs_btree_t *, zfs_btree_hdr_t *))
+{
+	if (idx->bti_node == NULL) {
+		ASSERT3S(tree->bt_height, ==, -1);
+		return (NULL);
+	}
+
+	uint64_t offset = idx->bti_offset;
+	if (!idx->bti_node->bth_core) {
+		/*
+		 * When finding the next element of an element in a leaf,
+		 * there are two cases. If the element isn't the last one in
+		 * the leaf, we just return the next element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we
+		 * find one where our ancestor isn't the last child of its
+		 * parent. Once we do, the next element is the separator
+		 * after our ancestor in its parent.
+		 */
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		uint64_t new_off = offset + (idx->bti_before ? 0 : 1);
+		if (leaf->btl_hdr.bth_count > new_off) {
+			out_idx->bti_node = &leaf->btl_hdr;
+			out_idx->bti_offset = new_off;
+			out_idx->bti_before = B_FALSE;
+			return (leaf->btl_elems + new_off * tree->bt_elem_size);
+		}
+
+		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+		    node != NULL; node = node->btc_hdr.bth_parent) {
+			zfs_btree_hdr_t *hdr = &node->btc_hdr;
+			ASSERT(hdr->bth_core);
+			uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+			if (done_func != NULL)
+				done_func(tree, prev);
+			if (i == hdr->bth_count) {
+				prev = hdr;
+				continue;
+			}
+			out_idx->bti_node = hdr;
+			out_idx->bti_offset = i;
+			out_idx->bti_before = B_FALSE;
+			return (node->btc_elems + i * tree->bt_elem_size);
+		}
+		if (done_func != NULL)
+			done_func(tree, prev);
+		/*
+		 * We've traversed all the way up and been at the end of the
+		 * node every time, so this was the last element in the tree.
+		 */
+		return (NULL);
+	}
+
+	/* If we were before an element in a core node, return that element. */
+	ASSERT(idx->bti_node->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+	if (idx->bti_before) {
+		out_idx->bti_before = B_FALSE;
+		return (node->btc_elems + offset * tree->bt_elem_size);
+	}
+
+	/*
+	 * The next element from one in a core node is the first element in
+	 * the subtree just to the right of the separator.
+	 */
+	zfs_btree_hdr_t *child = node->btc_children[offset + 1];
+	return (zfs_btree_first_helper(child, out_idx));
+}
+
+/*
+ * Return the next valued node in the tree. The same address can be safely
+ * passed for idx and out_idx.
+ */
+void *
+zfs_btree_next(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx)
+{
+	return (zfs_btree_next_helper(tree, idx, out_idx, NULL));
+}
+
+/*
+ * Return the previous valued node in the tree. The same address can be
+ * safely passed for idx and out_idx.
+ */
+void *
+zfs_btree_prev(zfs_btree_t *tree, const zfs_btree_index_t *idx,
+    zfs_btree_index_t *out_idx)
+{
+	if (idx->bti_node == NULL) {
+		ASSERT3S(tree->bt_height, ==, -1);
+		return (NULL);
+	}
+
+	uint64_t offset = idx->bti_offset;
+	if (!idx->bti_node->bth_core) {
+		/*
+		 * When finding the previous element of an element in a leaf,
+		 * there are two cases. If the element isn't the first one in
+		 * the leaf, we just return the previous element in the leaf.
+		 * Otherwise, we need to traverse up our parents until we
+		 * find one where our previous ancestor isn't the first
+		 * child. Once we do, the previous element is the separator
+		 * after our previous ancestor.
+		 */
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		if (offset != 0) {
+			out_idx->bti_node = &leaf->btl_hdr;
+			out_idx->bti_offset = offset - 1;
+			out_idx->bti_before = B_FALSE;
+			return (leaf->btl_elems + (offset - 1) *
+			    tree->bt_elem_size);
+		}
+		zfs_btree_hdr_t *prev = &leaf->btl_hdr;
+		for (zfs_btree_core_t *node = leaf->btl_hdr.bth_parent;
+		    node != NULL; node = node->btc_hdr.bth_parent) {
+			zfs_btree_hdr_t *hdr = &node->btc_hdr;
+			ASSERT(hdr->bth_core);
+			uint64_t i = zfs_btree_find_parent_idx(tree, prev);
+			if (i == 0) {
+				prev = hdr;
+				continue;
+			}
+			out_idx->bti_node = hdr;
+			out_idx->bti_offset = i - 1;
+			out_idx->bti_before = B_FALSE;
+			return (node->btc_elems + (i - 1) * tree->bt_elem_size);
+		}
+		/*
+		 * We've traversed all the way up and been at the start of the
+		 * node every time, so this was the first element in the tree.
+		 */
+		return (NULL);
+	}
+
+	/*
+	 * The previous element from one in a core node is the last element in
+	 * the subtree just to the left of the separator.
+	 */
+	ASSERT(idx->bti_node->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+	zfs_btree_hdr_t *child = node->btc_children[offset];
+	return (zfs_btree_last_helper(tree, child, out_idx));
+}
+
+/*
+ * Get the value at the provided index in the tree.
+ *
+ * Note that the value returned from this function can be mutated, but only
+ * if it will not change the ordering of the element with respect to any other
+ * elements that could be in the tree.
+ */
+void *
+zfs_btree_get(zfs_btree_t *tree, zfs_btree_index_t *idx)
+{
+	ASSERT(!idx->bti_before);
+	if (!idx->bti_node->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)idx->bti_node;
+		return (leaf->btl_elems + idx->bti_offset * tree->bt_elem_size);
+	}
+	ASSERT(idx->bti_node->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)idx->bti_node;
+	return (node->btc_elems + idx->bti_offset * tree->bt_elem_size);
+}
+
+/* Add the given value to the tree.
Must not already be in the tree. */ +void +zfs_btree_add(zfs_btree_t *tree, const void *node) +{ + zfs_btree_index_t where = {0}; + VERIFY3P(zfs_btree_find(tree, node, &where), ==, NULL); + zfs_btree_add_idx(tree, node, &where); +} + +/* Helper function to free a tree node. */ +static void +zfs_btree_node_destroy(zfs_btree_t *tree, zfs_btree_hdr_t *node) +{ + tree->bt_num_nodes--; + if (!node->bth_core) { + kmem_cache_free(zfs_btree_leaf_cache, node); + } else { + kmem_free(node, sizeof (zfs_btree_core_t) + + BTREE_CORE_ELEMS * tree->bt_elem_size); + } +} + +/* + * Remove the rm_hdr and the separator to its left from the parent node. The + * buffer that rm_hdr was stored in may already be freed, so its contents + * cannot be accessed. + */ +static void +zfs_btree_remove_from_node(zfs_btree_t *tree, zfs_btree_core_t *node, + zfs_btree_hdr_t *rm_hdr) +{ + size_t size = tree->bt_elem_size; + uint64_t min_count = (BTREE_CORE_ELEMS / 2) - 1; + zfs_btree_hdr_t *hdr = &node->btc_hdr; + /* + * If the node is the root node and rm_hdr is one of two children, + * promote the other child to the root. + */ + if (hdr->bth_parent == NULL && hdr->bth_count <= 1) { + ASSERT3U(hdr->bth_count, ==, 1); + ASSERT3P(tree->bt_root, ==, node); + ASSERT3P(node->btc_children[1], ==, rm_hdr); + tree->bt_root = node->btc_children[0]; + node->btc_children[0]->bth_parent = NULL; + zfs_btree_node_destroy(tree, hdr); + tree->bt_height--; + return; + } + + uint64_t idx; + for (idx = 0; idx <= hdr->bth_count; idx++) { + if (node->btc_children[idx] == rm_hdr) + break; + } + ASSERT3U(idx, <=, hdr->bth_count); + + /* + * If the node is the root or it has more than the minimum number of + * children, just remove the child and separator, and return. + */ + if (hdr->bth_parent == NULL || + hdr->bth_count > min_count) { + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. + */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + hdr->bth_count--; + zfs_btree_poison_node_at(tree, hdr, hdr->bth_count); + return; + } + + ASSERT3U(hdr->bth_count, ==, min_count); + + /* + * Now we try to take a node from a neighbor. We check left, then + * right. If the neighbor exists and has more than the minimum number + * of elements, we move the separator between us and them to our + * node, move their closest element (last for left, first for right) + * to the separator, and move their closest child to our node. Along + * the way we need to collapse the gap made by idx, and (for our right + * neighbor) the gap made by removing their first element and child. + * + * Note: this logic currently doesn't support taking from a neighbor + * that isn't a sibling (i.e. a neighbor with a different + * parent). This isn't critical functionality, but may be worth + * implementing in the future for completeness' sake. + */ + zfs_btree_core_t *parent = hdr->bth_parent; + uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr); + + zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL : + parent->btc_children[parent_idx - 1]); + if (l_hdr != NULL && l_hdr->bth_count > min_count) { + /* We can take a node from the left neighbor. */ + ASSERT(l_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)l_hdr; + + /* + * Start by shifting the elements and children in the current + * node to the right by one spot. + */ + bt_shift_core_right(tree, node, 0, idx - 1, BSS_TRAPEZOID); + + /* + * Move the separator between node and neighbor to the first + * element slot in the current node. 
+ */ + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, node->btc_elems, size); + + /* Move the last child of neighbor to our first child slot. */ + zfs_btree_hdr_t **take_child = neighbor->btc_children + + l_hdr->bth_count; + bmov(take_child, node->btc_children, sizeof (*take_child)); + node->btc_children[0]->bth_parent = node; + + /* Move the last element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems + + (l_hdr->bth_count - 1) * size; + bmov(take_elem, separator, size); + l_hdr->bth_count--; + zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count); + return; + } + + zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ? + NULL : parent->btc_children[parent_idx + 1]); + if (r_hdr != NULL && r_hdr->bth_count > min_count) { + /* We can take a node from the right neighbor. */ + ASSERT(r_hdr->bth_core); + zfs_btree_core_t *neighbor = (zfs_btree_core_t *)r_hdr; + + /* + * Shift elements in node left by one spot to overwrite rm_hdr + * and the separator before it. + */ + bt_shift_core_left(tree, node, idx, hdr->bth_count - idx, + BSS_PARALLELOGRAM); + + /* + * Move the separator between node and neighbor to the last + * element spot in node. + */ + uint8_t *separator = parent->btc_elems + parent_idx * size; + bmov(separator, node->btc_elems + (hdr->bth_count - 1) * size, + size); + + /* + * Move the first child of neighbor to the last child spot in + * node. + */ + zfs_btree_hdr_t **take_child = neighbor->btc_children; + bmov(take_child, node->btc_children + hdr->bth_count, + sizeof (*take_child)); + node->btc_children[hdr->bth_count]->bth_parent = node; + + /* Move the first element of neighbor to the separator spot. */ + uint8_t *take_elem = neighbor->btc_elems; + bmov(take_elem, separator, size); + r_hdr->bth_count--; + + /* + * Shift the elements and children of neighbor to cover the + * stolen elements. + */ + bt_shift_core_left(tree, neighbor, 1, r_hdr->bth_count, + BSS_TRAPEZOID); + zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count); + return; + } + + /* + * In this case, neither of our neighbors can spare an element, so we + * need to merge with one of them. We prefer the left one, + * arbitrarily. Move the separator into the leftmost merging node + * (which may be us or the left neighbor), and then move the right + * merging node's elements. Once that's done, we go back and delete + * the element we're removing. Finally, go into the parent and delete + * the right merging node and the separator. This may cause further + * merging. + */ + zfs_btree_hdr_t *new_rm_hdr, *keep_hdr; + uint64_t new_idx = idx; + if (l_hdr != NULL) { + keep_hdr = l_hdr; + new_rm_hdr = hdr; + new_idx += keep_hdr->bth_count + 1; + } else { + ASSERT3P(r_hdr, !=, NULL); + keep_hdr = hdr; + new_rm_hdr = r_hdr; + parent_idx++; + } + + ASSERT(keep_hdr->bth_core); + ASSERT(new_rm_hdr->bth_core); + + zfs_btree_core_t *keep = (zfs_btree_core_t *)keep_hdr; + zfs_btree_core_t *rm = (zfs_btree_core_t *)new_rm_hdr; + + if (zfs_btree_verify_intensity >= 5) { + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) { + zfs_btree_verify_poison_at(tree, keep_hdr, + keep_hdr->bth_count + i); + } + } + + /* Move the separator into the left node. */ + uint8_t *e_out = keep->btc_elems + keep_hdr->bth_count * size; + uint8_t *separator = parent->btc_elems + (parent_idx - 1) * + size; + bmov(separator, e_out, size); + keep_hdr->bth_count++; + + /* Move all our elements and children into the left node. 
*/ + bt_transfer_core(tree, rm, 0, new_rm_hdr->bth_count, keep, + keep_hdr->bth_count, BSS_TRAPEZOID); + + uint64_t old_count = keep_hdr->bth_count; + + /* Update bookkeeping */ + keep_hdr->bth_count += new_rm_hdr->bth_count; + ASSERT3U(keep_hdr->bth_count, ==, (min_count * 2) + 1); + + /* + * Shift the element and children to the right of rm_hdr to + * the left by one spot. + */ + ASSERT3P(keep->btc_children[new_idx], ==, rm_hdr); + bt_shift_core_left(tree, keep, new_idx, keep_hdr->bth_count - new_idx, + BSS_PARALLELOGRAM); + keep_hdr->bth_count--; + + /* Reparent all our children to point to the left node. */ + zfs_btree_hdr_t **new_start = keep->btc_children + + old_count - 1; + for (int i = 0; i < new_rm_hdr->bth_count + 1; i++) + new_start[i]->bth_parent = keep; + for (int i = 0; i <= keep_hdr->bth_count; i++) { + ASSERT3P(keep->btc_children[i]->bth_parent, ==, keep); + ASSERT3P(keep->btc_children[i], !=, rm_hdr); + } + zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count); + + new_rm_hdr->bth_count = 0; + zfs_btree_node_destroy(tree, new_rm_hdr); + zfs_btree_remove_from_node(tree, parent, new_rm_hdr); +} + +/* Remove the element at the specific location. */ +void +zfs_btree_remove_idx(zfs_btree_t *tree, zfs_btree_index_t *where) +{ + size_t size = tree->bt_elem_size; + zfs_btree_hdr_t *hdr = where->bti_node; + uint64_t idx = where->bti_offset; + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / size, 2); + + ASSERT(!where->bti_before); + if (tree->bt_bulk != NULL) { + /* + * Leave bulk insert mode. Note that our index would be + * invalid after we correct the tree, so we copy the value + * we're planning to remove and find it again after + * bulk_finish. + */ + uint8_t *value = zfs_btree_get(tree, where); + uint8_t *tmp = kmem_alloc(size, KM_SLEEP); + bmov(value, tmp, size); + zfs_btree_bulk_finish(tree); + VERIFY3P(zfs_btree_find(tree, tmp, where), !=, NULL); + kmem_free(tmp, size); + hdr = where->bti_node; + idx = where->bti_offset; + } + + tree->bt_num_elems--; + /* + * If the element happens to be in a core node, we move a leaf node's + * element into its place and then remove the leaf node element. This + * makes the rebalance logic not need to be recursive both upwards and + * downwards. + */ + if (hdr->bth_core) { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + zfs_btree_hdr_t *left_subtree = node->btc_children[idx]; + void *new_value = zfs_btree_last_helper(tree, left_subtree, + where); + ASSERT3P(new_value, !=, NULL); + + bmov(new_value, node->btc_elems + idx * size, size); + + hdr = where->bti_node; + idx = where->bti_offset; + ASSERT(!where->bti_before); + } + + /* + * First, we'll update the leaf's metadata. Then, we shift any + * elements after the idx to the left. After that, we rebalance if + * needed. + */ + ASSERT(!hdr->bth_core); + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + ASSERT3U(hdr->bth_count, >, 0); + + uint64_t min_count = (capacity / 2) - 1; + + /* + * If we're over the minimum size or this is the root, just overwrite + * the value and return. 
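+	 * (With a leaf capacity of 2n elements, min_count is n - 1, so any
+	 * non-root leaf still holds at least n - 1 elements after this
+	 * fast path.)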
+	 */
+	if (hdr->bth_count > min_count || hdr->bth_parent == NULL) {
+		hdr->bth_count--;
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx);
+		if (hdr->bth_parent == NULL) {
+			ASSERT0(tree->bt_height);
+			if (hdr->bth_count == 0) {
+				tree->bt_root = NULL;
+				tree->bt_height--;
+				zfs_btree_node_destroy(tree, &leaf->btl_hdr);
+			}
+		}
+		if (tree->bt_root != NULL)
+			zfs_btree_poison_node_at(tree, hdr, hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+	ASSERT3U(hdr->bth_count, ==, min_count);
+
+	/*
+	 * Now we try to take a node from a sibling. We check left, then
+	 * right. If they exist and have more than the minimum number of
+	 * elements, we move the separator between us and them to our node
+	 * and move their closest element (last for left, first for right) to
+	 * the separator. Along the way we need to collapse the gap made by
+	 * idx, and (for our right neighbor) the gap made by removing their
+	 * first element.
+	 *
+	 * Note: this logic currently doesn't support taking from a neighbor
+	 * that isn't a sibling. This isn't critical functionality, but may be
+	 * worth implementing in the future for completeness' sake.
+	 */
+	zfs_btree_core_t *parent = hdr->bth_parent;
+	uint64_t parent_idx = zfs_btree_find_parent_idx(tree, hdr);
+
+	zfs_btree_hdr_t *l_hdr = (parent_idx == 0 ? NULL :
+	    parent->btc_children[parent_idx - 1]);
+	if (l_hdr != NULL && l_hdr->bth_count > min_count) {
+		/* We can take a node from the left neighbor. */
+		ASSERT(!l_hdr->bth_core);
+
+		/*
+		 * Move our elements back by one spot to make room for the
+		 * stolen element and overwrite the element being removed.
+		 */
+		bt_shift_leaf_right(tree, leaf, 0, idx);
+		uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+		    size;
+		uint8_t *take_elem = ((zfs_btree_leaf_t *)l_hdr)->btl_elems +
+		    (l_hdr->bth_count - 1) * size;
+		/* Move the separator to our first spot. */
+		bmov(separator, leaf->btl_elems, size);
+
+		/* Move our neighbor's last element to the separator. */
+		bmov(take_elem, separator, size);
+
+		/* Update the bookkeeping. */
+		l_hdr->bth_count--;
+		zfs_btree_poison_node_at(tree, l_hdr, l_hdr->bth_count);
+
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	zfs_btree_hdr_t *r_hdr = (parent_idx == parent->btc_hdr.bth_count ?
+	    NULL : parent->btc_children[parent_idx + 1]);
+	if (r_hdr != NULL && r_hdr->bth_count > min_count) {
+		/* We can take a node from the right neighbor. */
+		ASSERT(!r_hdr->bth_core);
+		zfs_btree_leaf_t *neighbor = (zfs_btree_leaf_t *)r_hdr;
+
+		/*
+		 * Move our elements after the element being removed forwards
+		 * by one spot to make room for the stolen element and
+		 * overwrite the element being removed.
+		 */
+		bt_shift_leaf_left(tree, leaf, idx + 1, hdr->bth_count - idx -
+		    1);
+
+		uint8_t *separator = parent->btc_elems + parent_idx * size;
+		uint8_t *take_elem = ((zfs_btree_leaf_t *)r_hdr)->btl_elems;
+		/* Move the separator between us to our last spot. */
+		bmov(separator, leaf->btl_elems + (hdr->bth_count - 1) * size,
+		    size);
+
+		/* Move our neighbor's first element to the separator. */
+		bmov(take_elem, separator, size);
+
+		/* Update the bookkeeping. */
+		r_hdr->bth_count--;
+
+		/*
+		 * Move our neighbor's elements forwards to overwrite the
+		 * stolen element.
+		 */
+		bt_shift_leaf_left(tree, neighbor, 1, r_hdr->bth_count);
+		zfs_btree_poison_node_at(tree, r_hdr, r_hdr->bth_count);
+		zfs_btree_verify(tree);
+		return;
+	}
+
+	/*
+	 * In this case, neither of our neighbors can spare an element, so we
+	 * need to merge with one of them. We prefer the left one,
+	 * arbitrarily.
+	 * Move the separator into the leftmost merging node (which may be
+	 * us or the left neighbor), and then move the right merging node's
+	 * elements. Once that's done, we go back and delete the element
+	 * we're removing. Finally, go into the parent and delete the right
+	 * merging node and the separator. This may cause further merging.
+	 */
+	zfs_btree_hdr_t *rm_hdr, *keep_hdr;
+	uint64_t new_idx = idx;
+	if (l_hdr != NULL) {
+		keep_hdr = l_hdr;
+		rm_hdr = hdr;
+		new_idx += keep_hdr->bth_count + 1;
+	} else {
+		ASSERT3P(r_hdr, !=, NULL);
+		keep_hdr = hdr;
+		rm_hdr = r_hdr;
+		parent_idx++;
+	}
+
+	ASSERT(!keep_hdr->bth_core);
+	ASSERT(!rm_hdr->bth_core);
+	ASSERT3U(keep_hdr->bth_count, ==, min_count);
+	ASSERT3U(rm_hdr->bth_count, ==, min_count);
+
+	zfs_btree_leaf_t *keep = (zfs_btree_leaf_t *)keep_hdr;
+	zfs_btree_leaf_t *rm = (zfs_btree_leaf_t *)rm_hdr;
+
+	if (zfs_btree_verify_intensity >= 5) {
+		for (int i = 0; i < rm_hdr->bth_count + 1; i++) {
+			zfs_btree_verify_poison_at(tree, keep_hdr,
+			    keep_hdr->bth_count + i);
+		}
+	}
+	/*
+	 * Move the separator into the first open spot in the left
+	 * neighbor.
+	 */
+	uint8_t *out = keep->btl_elems + keep_hdr->bth_count * size;
+	uint8_t *separator = parent->btc_elems + (parent_idx - 1) *
+	    size;
+	bmov(separator, out, size);
+	keep_hdr->bth_count++;
+
+	/* Move our elements to the left neighbor. */
+	bt_transfer_leaf(tree, rm, 0, rm_hdr->bth_count, keep,
+	    keep_hdr->bth_count);
+
+	/* Update the bookkeeping. */
+	keep_hdr->bth_count += rm_hdr->bth_count;
+	ASSERT3U(keep_hdr->bth_count, ==, min_count * 2 + 1);
+
+	/* Remove the value from the node */
+	keep_hdr->bth_count--;
+	bt_shift_leaf_left(tree, keep, new_idx + 1, keep_hdr->bth_count -
+	    new_idx);
+	zfs_btree_poison_node_at(tree, keep_hdr, keep_hdr->bth_count);
+
+	rm_hdr->bth_count = 0;
+	zfs_btree_node_destroy(tree, rm_hdr);
+	/* Remove the emptied node from the parent. */
+	zfs_btree_remove_from_node(tree, parent, rm_hdr);
+	zfs_btree_verify(tree);
+}
+
+/* Remove the given value from the tree. */
+void
+zfs_btree_remove(zfs_btree_t *tree, const void *value)
+{
+	zfs_btree_index_t where = {0};
+	VERIFY3P(zfs_btree_find(tree, value, &where), !=, NULL);
+	zfs_btree_remove_idx(tree, &where);
+}
+
+/* Return the number of elements in the tree. */
+ulong_t
+zfs_btree_numnodes(zfs_btree_t *tree)
+{
+	return (tree->bt_num_elems);
+}
+
+/*
+ * This function is used to visit all the elements in the tree before
+ * destroying the tree. This allows the calling code to perform any cleanup it
+ * needs to do. This is more efficient than just removing the first element
+ * over and over, because it avoids all rebalancing. Once the destroy_nodes()
+ * function has been called, no other btree operations are valid until it
+ * returns NULL, at which point the only valid operation is
+ * zfs_btree_destroy().
+ * + * example: + * + * zfs_btree_index_t *cookie = NULL; + * my_data_t *node; + * + * while ((node = zfs_btree_destroy_nodes(tree, &cookie)) != NULL) + * free(node->ptr); + * zfs_btree_destroy(tree); + * + */ +void * +zfs_btree_destroy_nodes(zfs_btree_t *tree, zfs_btree_index_t **cookie) +{ + if (*cookie == NULL) { + if (tree->bt_height == -1) + return (NULL); + *cookie = kmem_alloc(sizeof (**cookie), KM_SLEEP); + return (zfs_btree_first(tree, *cookie)); + } + + void *rval = zfs_btree_next_helper(tree, *cookie, *cookie, + zfs_btree_node_destroy); + if (rval == NULL) { + tree->bt_root = NULL; + tree->bt_height = -1; + tree->bt_num_elems = 0; + kmem_free(*cookie, sizeof (**cookie)); + tree->bt_bulk = NULL; + } + return (rval); +} + +static void +zfs_btree_clear_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (hdr->bth_core) { + zfs_btree_core_t *btc = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_clear_helper(tree, btc->btc_children[i]); + } + } + + zfs_btree_node_destroy(tree, hdr); +} + +void +zfs_btree_clear(zfs_btree_t *tree) +{ + if (tree->bt_root == NULL) { + ASSERT0(tree->bt_num_elems); + return; + } + + zfs_btree_clear_helper(tree, tree->bt_root); + tree->bt_num_elems = 0; + tree->bt_root = NULL; + tree->bt_num_nodes = 0; + tree->bt_height = -1; + tree->bt_bulk = NULL; +} + +void +zfs_btree_destroy(zfs_btree_t *tree) +{ + ASSERT0(tree->bt_num_elems); + ASSERT3P(tree->bt_root, ==, NULL); +} + +/* Verify that every child of this node has the correct parent pointer. */ +static void +zfs_btree_verify_pointers_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) + return; + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + for (int i = 0; i <= hdr->bth_count; i++) { + VERIFY3P(node->btc_children[i]->bth_parent, ==, hdr); + zfs_btree_verify_pointers_helper(tree, node->btc_children[i]); + } +} + +/* Verify that every node has the correct parent pointer. */ +static void +zfs_btree_verify_pointers(zfs_btree_t *tree) +{ + if (tree->bt_height == -1) { + VERIFY3P(tree->bt_root, ==, NULL); + return; + } + VERIFY3P(tree->bt_root->bth_parent, ==, NULL); + zfs_btree_verify_pointers_helper(tree, tree->bt_root); +} + +/* + * Verify that all the current node and its children satisfy the count + * invariants, and return the total count in the subtree rooted in this node. + */ +static uint64_t +zfs_btree_verify_counts_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + if (!hdr->bth_core) { + if (tree->bt_root != hdr && hdr != &tree->bt_bulk->btl_hdr) { + uint64_t capacity = P2ALIGN((BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t)) / tree->bt_elem_size, 2); + VERIFY3U(hdr->bth_count, >=, (capacity / 2) - 1); + } + + return (hdr->bth_count); + } else { + + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint64_t ret = hdr->bth_count; + if (tree->bt_root != hdr && tree->bt_bulk == NULL) + VERIFY3P(hdr->bth_count, >=, BTREE_CORE_ELEMS / 2 - 1); + for (int i = 0; i <= hdr->bth_count; i++) { + ret += zfs_btree_verify_counts_helper(tree, + node->btc_children[i]); + } + + return (ret); + } +} + +/* + * Verify that all nodes satisfy the invariants and that the total number of + * elements is correct. + */ +static void +zfs_btree_verify_counts(zfs_btree_t *tree) +{ + EQUIV(tree->bt_num_elems == 0, tree->bt_height == -1); + if (tree->bt_height == -1) { + return; + } + VERIFY3P(zfs_btree_verify_counts_helper(tree, tree->bt_root), ==, + tree->bt_num_elems); +} + +/* + * Check that the subtree rooted at this node has a uniform height. 
Returns the number of nodes under this node, to help verify bt_num_nodes.
+ */
+static uint64_t
+zfs_btree_verify_height_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr,
+    int64_t height)
+{
+	if (!hdr->bth_core) {
+		VERIFY0(height);
+		return (1);
+	}
+
+	VERIFY(hdr->bth_core);
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	uint64_t ret = 1;
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		ret += zfs_btree_verify_height_helper(tree,
+		    node->btc_children[i], height - 1);
+	}
+	return (ret);
+}
+
+/*
+ * Check that the tree rooted at this node has a uniform height, and that the
+ * bt_height in the tree is correct.
+ */
+static void
+zfs_btree_verify_height(zfs_btree_t *tree)
+{
+	EQUIV(tree->bt_height == -1, tree->bt_root == NULL);
+	if (tree->bt_height == -1) {
+		return;
+	}
+
+	VERIFY3U(zfs_btree_verify_height_helper(tree, tree->bt_root,
+	    tree->bt_height), ==, tree->bt_num_nodes);
+}
+
+/*
+ * Check that the elements in this node are sorted, and that if this is a core
+ * node, the separators are properly between the subtrees they separate and
+ * that the children also satisfy this requirement.
+ */
+static void
+zfs_btree_verify_order_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr)
+{
+	size_t size = tree->bt_elem_size;
+	if (!hdr->bth_core) {
+		zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr;
+		for (int i = 1; i < hdr->bth_count; i++) {
+			VERIFY3S(tree->bt_compar(leaf->btl_elems + (i - 1) *
+			    size, leaf->btl_elems + i * size), ==, -1);
+		}
+		return;
+	}
+
+	zfs_btree_core_t *node = (zfs_btree_core_t *)hdr;
+	for (int i = 1; i < hdr->bth_count; i++) {
+		VERIFY3S(tree->bt_compar(node->btc_elems + (i - 1) * size,
+		    node->btc_elems + i * size), ==, -1);
+	}
+	for (int i = 0; i < hdr->bth_count; i++) {
+		uint8_t *left_child_last = NULL;
+		zfs_btree_hdr_t *left_child_hdr = node->btc_children[i];
+		if (left_child_hdr->bth_core) {
+			zfs_btree_core_t *left_child =
+			    (zfs_btree_core_t *)left_child_hdr;
+			left_child_last = left_child->btc_elems +
+			    (left_child_hdr->bth_count - 1) * size;
+		} else {
+			zfs_btree_leaf_t *left_child =
+			    (zfs_btree_leaf_t *)left_child_hdr;
+			left_child_last = left_child->btl_elems +
+			    (left_child_hdr->bth_count - 1) * size;
+		}
+		if (tree->bt_compar(node->btc_elems + i * size,
+		    left_child_last) != 1) {
+			panic("btree: compar returned %d (expected 1) at "
+			    "%px %d: compar(%px, %px)", tree->bt_compar(
+			    node->btc_elems + i * size, left_child_last),
+			    (void *)node, i, (void *)(node->btc_elems + i *
+			    size), (void *)left_child_last);
+		}
+
+		uint8_t *right_child_first = NULL;
+		zfs_btree_hdr_t *right_child_hdr = node->btc_children[i + 1];
+		if (right_child_hdr->bth_core) {
+			zfs_btree_core_t *right_child =
+			    (zfs_btree_core_t *)right_child_hdr;
+			right_child_first = right_child->btc_elems;
+		} else {
+			zfs_btree_leaf_t *right_child =
+			    (zfs_btree_leaf_t *)right_child_hdr;
+			right_child_first = right_child->btl_elems;
+		}
+		if (tree->bt_compar(node->btc_elems + i * size,
+		    right_child_first) != -1) {
+			panic("btree: compar returned %d (expected -1) at "
+			    "%px %d: compar(%px, %px)", tree->bt_compar(
+			    node->btc_elems + i * size, right_child_first),
+			    (void *)node, i, (void *)(node->btc_elems + i *
+			    size), (void *)right_child_first);
+		}
+	}
+	for (int i = 0; i <= hdr->bth_count; i++) {
+		zfs_btree_verify_order_helper(tree, node->btc_children[i]);
+	}
+}
+
+/* Check that all elements in the tree are in sorted order.
*/ +static void +zfs_btree_verify_order(zfs_btree_t *tree) +{ + EQUIV(tree->bt_height == -1, tree->bt_root == NULL); + if (tree->bt_height == -1) { + return; + } + + zfs_btree_verify_order_helper(tree, tree->bt_root); +} + +#ifdef ZFS_DEBUG +/* Check that all unused memory is poisoned correctly. */ +static void +zfs_btree_verify_poison_helper(zfs_btree_t *tree, zfs_btree_hdr_t *hdr) +{ + size_t size = tree->bt_elem_size; + if (!hdr->bth_core) { + zfs_btree_leaf_t *leaf = (zfs_btree_leaf_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_LEAF_SIZE - + sizeof (zfs_btree_hdr_t); i++) { + VERIFY3U(leaf->btl_elems[i], ==, val); + } + } else { + zfs_btree_core_t *node = (zfs_btree_core_t *)hdr; + uint8_t val = 0x0f; + for (int i = hdr->bth_count * size; i < BTREE_CORE_ELEMS * size; + i++) { + VERIFY3U(node->btc_elems[i], ==, val); + } + + for (int i = hdr->bth_count + 1; i <= BTREE_CORE_ELEMS; i++) { + VERIFY3P(node->btc_children[i], ==, + (zfs_btree_hdr_t *)BTREE_POISON); + } + + for (int i = 0; i <= hdr->bth_count; i++) { + zfs_btree_verify_poison_helper(tree, + node->btc_children[i]); + } + } +} +#endif + +/* Check that unused memory in the tree is still poisoned. */ +static void +zfs_btree_verify_poison(zfs_btree_t *tree) +{ +#ifdef ZFS_DEBUG + if (tree->bt_height == -1) + return; + zfs_btree_verify_poison_helper(tree, tree->bt_root); +#endif +} + +void +zfs_btree_verify(zfs_btree_t *tree) +{ + if (zfs_btree_verify_intensity == 0) + return; + zfs_btree_verify_height(tree); + if (zfs_btree_verify_intensity == 1) + return; + zfs_btree_verify_pointers(tree); + if (zfs_btree_verify_intensity == 2) + return; + zfs_btree_verify_counts(tree); + if (zfs_btree_verify_intensity == 3) + return; + zfs_btree_verify_order(tree); + + if (zfs_btree_verify_intensity == 4) + return; + zfs_btree_verify_poison(tree); +} diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c new file mode 100644 index 000000000000..e46a0926d557 --- /dev/null +++ b/module/zfs/dataset_kstats.c @@ -0,0 +1,215 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + * Copyright (c) 2018 Datto Inc. 
+ */ + +#include +#include +#include +#include + +static dataset_kstat_values_t empty_dataset_kstats = { + { "dataset_name", KSTAT_DATA_STRING }, + { "writes", KSTAT_DATA_UINT64 }, + { "nwritten", KSTAT_DATA_UINT64 }, + { "reads", KSTAT_DATA_UINT64 }, + { "nread", KSTAT_DATA_UINT64 }, + { "nunlinks", KSTAT_DATA_UINT64 }, + { "nunlinked", KSTAT_DATA_UINT64 }, +}; + +static int +dataset_kstats_update(kstat_t *ksp, int rw) +{ + dataset_kstats_t *dk = ksp->ks_private; + ASSERT3P(dk->dk_kstats->ks_data, ==, ksp->ks_data); + + if (rw == KSTAT_WRITE) + return (EACCES); + + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + dkv->dkv_writes.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_writes); + dkv->dkv_nwritten.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_nwritten); + dkv->dkv_reads.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_reads); + dkv->dkv_nread.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_nread); + dkv->dkv_nunlinks.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_nunlinks); + dkv->dkv_nunlinked.value.ui64 = + aggsum_value(&dk->dk_aggsums.das_nunlinked); + + return (0); +} + +void +dataset_kstats_create(dataset_kstats_t *dk, objset_t *objset) +{ + /* + * There should not be anything wrong with having kstats for + * snapshots. Since we are not sure how useful they would be + * though nor how much their memory overhead would matter in + * a filesystem with many snapshots, we skip them for now. + */ + if (dmu_objset_is_snapshot(objset)) + return; + + /* + * At the time of this writing, KSTAT_STRLEN is 255 in Linux, + * and the spa_name can theoretically be up to 256 characters. + * In reality though the spa_name can be 240 characters max + * [see origin directory name check in pool_namecheck()]. Thus, + * the naming scheme for the module name below should not cause + * any truncations. In the event that a truncation does happen + * though, due to some future change, we silently skip creating + * the kstat and log the event. 
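+ *
+ * As a concrete illustration of the scheme (the pool name and objset id
+ * here are hypothetical): a pool "tank" containing an objset with id
+ * 0x36 yields kstat_module_name "zfs/tank" and kstat_name "objset-0x36",
+ * which on Linux is typically surfaced as:
+ *
+ *	cat /proc/spl/kstat/zfs/tank/objset-0x36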
+ */ + char kstat_module_name[KSTAT_STRLEN]; + int n = snprintf(kstat_module_name, sizeof (kstat_module_name), + "zfs/%s", spa_name(dmu_objset_spa(objset))); + if (n < 0) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + " snprintf() for kstat module name returned %d", + (unsigned long long)dmu_objset_id(objset), n); + return; + } else if (n >= KSTAT_STRLEN) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + "kstat module name length (%d) exceeds limit (%d)", + (unsigned long long)dmu_objset_id(objset), + n, KSTAT_STRLEN); + return; + } + + char kstat_name[KSTAT_STRLEN]; + n = snprintf(kstat_name, sizeof (kstat_name), "objset-0x%llx", + (unsigned long long)dmu_objset_id(objset)); + if (n < 0) { + zfs_dbgmsg("failed to create dataset kstat for objset %lld: " + " snprintf() for kstat name returned %d", + (unsigned long long)dmu_objset_id(objset), n); + return; + } + ASSERT3U(n, <, KSTAT_STRLEN); + + kstat_t *kstat = kstat_create(kstat_module_name, 0, kstat_name, + "dataset", KSTAT_TYPE_NAMED, + sizeof (empty_dataset_kstats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (kstat == NULL) + return; + + dataset_kstat_values_t *dk_kstats = + kmem_alloc(sizeof (empty_dataset_kstats), KM_SLEEP); + bcopy(&empty_dataset_kstats, dk_kstats, + sizeof (empty_dataset_kstats)); + + char *ds_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dsl_dataset_name(objset->os_dsl_dataset, ds_name); + KSTAT_NAMED_STR_PTR(&dk_kstats->dkv_ds_name) = ds_name; + KSTAT_NAMED_STR_BUFLEN(&dk_kstats->dkv_ds_name) = + ZFS_MAX_DATASET_NAME_LEN; + + kstat->ks_data = dk_kstats; + kstat->ks_update = dataset_kstats_update; + kstat->ks_private = dk; + kstat->ks_data_size += ZFS_MAX_DATASET_NAME_LEN; + + kstat_install(kstat); + dk->dk_kstats = kstat; + + aggsum_init(&dk->dk_aggsums.das_writes, 0); + aggsum_init(&dk->dk_aggsums.das_nwritten, 0); + aggsum_init(&dk->dk_aggsums.das_reads, 0); + aggsum_init(&dk->dk_aggsums.das_nread, 0); + aggsum_init(&dk->dk_aggsums.das_nunlinks, 0); + aggsum_init(&dk->dk_aggsums.das_nunlinked, 0); +} + +void +dataset_kstats_destroy(dataset_kstats_t *dk) +{ + if (dk->dk_kstats == NULL) + return; + + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + kmem_free(KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name), + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); + kmem_free(dkv, sizeof (empty_dataset_kstats)); + + kstat_delete(dk->dk_kstats); + dk->dk_kstats = NULL; + + aggsum_fini(&dk->dk_aggsums.das_writes); + aggsum_fini(&dk->dk_aggsums.das_nwritten); + aggsum_fini(&dk->dk_aggsums.das_reads); + aggsum_fini(&dk->dk_aggsums.das_nread); + aggsum_fini(&dk->dk_aggsums.das_nunlinks); + aggsum_fini(&dk->dk_aggsums.das_nunlinked); +} + +void +dataset_kstats_update_write_kstats(dataset_kstats_t *dk, + int64_t nwritten) +{ + ASSERT3S(nwritten, >=, 0); + + if (dk->dk_kstats == NULL) + return; + + aggsum_add(&dk->dk_aggsums.das_writes, 1); + aggsum_add(&dk->dk_aggsums.das_nwritten, nwritten); +} + +void +dataset_kstats_update_read_kstats(dataset_kstats_t *dk, + int64_t nread) +{ + ASSERT3S(nread, >=, 0); + + if (dk->dk_kstats == NULL) + return; + + aggsum_add(&dk->dk_aggsums.das_reads, 1); + aggsum_add(&dk->dk_aggsums.das_nread, nread); +} + +void +dataset_kstats_update_nunlinks_kstat(dataset_kstats_t *dk, int64_t delta) +{ + if (dk->dk_kstats == NULL) + return; + + aggsum_add(&dk->dk_aggsums.das_nunlinks, delta); +} + +void +dataset_kstats_update_nunlinked_kstat(dataset_kstats_t *dk, int64_t delta) +{ + if (dk->dk_kstats == NULL) + return; + + 
aggsum_add(&dk->dk_aggsums.das_nunlinked, delta); +} diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c new file mode 100644 index 000000000000..2de1f4e4c267 --- /dev/null +++ b/module/zfs/dbuf.c @@ -0,0 +1,4741 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +kstat_t *dbuf_ksp; + +typedef struct dbuf_stats { + /* + * Various statistics about the size of the dbuf cache. + */ + kstat_named_t cache_count; + kstat_named_t cache_size_bytes; + kstat_named_t cache_size_bytes_max; + /* + * Statistics regarding the bounds on the dbuf cache size. + */ + kstat_named_t cache_target_bytes; + kstat_named_t cache_lowater_bytes; + kstat_named_t cache_hiwater_bytes; + /* + * Total number of dbuf cache evictions that have occurred. + */ + kstat_named_t cache_total_evicts; + /* + * The distribution of dbuf levels in the dbuf cache and + * the total size of all dbufs at each level. + */ + kstat_named_t cache_levels[DN_MAX_LEVELS]; + kstat_named_t cache_levels_bytes[DN_MAX_LEVELS]; + /* + * Statistics about the dbuf hash table. + */ + kstat_named_t hash_hits; + kstat_named_t hash_misses; + kstat_named_t hash_collisions; + kstat_named_t hash_elements; + kstat_named_t hash_elements_max; + /* + * Number of sublists containing more than one dbuf in the dbuf + * hash table. Keep track of the longest hash chain. + */ + kstat_named_t hash_chains; + kstat_named_t hash_chain_max; + /* + * Number of times a dbuf_create() discovers that a dbuf was + * already created and in the dbuf hash table. + */ + kstat_named_t hash_insert_race; + /* + * Statistics about the size of the metadata dbuf cache. + */ + kstat_named_t metadata_cache_count; + kstat_named_t metadata_cache_size_bytes; + kstat_named_t metadata_cache_size_bytes_max; + /* + * For diagnostic purposes, this is incremented whenever we can't add + * something to the metadata cache because it's full, and instead put + * the data in the regular dbuf cache. 
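+ *
+ * This counter, like the rest of dbuf_stats below, is exported through
+ * the "dbufstats" kstat created in dbuf_init(); e.g. on Linux it can
+ * typically be inspected with (path assumed from the kstat module and
+ * name used at creation):
+ *
+ *	cat /proc/spl/kstat/zfs/dbufstats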
+ */ + kstat_named_t metadata_cache_overflow; +} dbuf_stats_t; + +dbuf_stats_t dbuf_stats = { + { "cache_count", KSTAT_DATA_UINT64 }, + { "cache_size_bytes", KSTAT_DATA_UINT64 }, + { "cache_size_bytes_max", KSTAT_DATA_UINT64 }, + { "cache_target_bytes", KSTAT_DATA_UINT64 }, + { "cache_lowater_bytes", KSTAT_DATA_UINT64 }, + { "cache_hiwater_bytes", KSTAT_DATA_UINT64 }, + { "cache_total_evicts", KSTAT_DATA_UINT64 }, + { { "cache_levels_N", KSTAT_DATA_UINT64 } }, + { { "cache_levels_bytes_N", KSTAT_DATA_UINT64 } }, + { "hash_hits", KSTAT_DATA_UINT64 }, + { "hash_misses", KSTAT_DATA_UINT64 }, + { "hash_collisions", KSTAT_DATA_UINT64 }, + { "hash_elements", KSTAT_DATA_UINT64 }, + { "hash_elements_max", KSTAT_DATA_UINT64 }, + { "hash_chains", KSTAT_DATA_UINT64 }, + { "hash_chain_max", KSTAT_DATA_UINT64 }, + { "hash_insert_race", KSTAT_DATA_UINT64 }, + { "metadata_cache_count", KSTAT_DATA_UINT64 }, + { "metadata_cache_size_bytes", KSTAT_DATA_UINT64 }, + { "metadata_cache_size_bytes_max", KSTAT_DATA_UINT64 }, + { "metadata_cache_overflow", KSTAT_DATA_UINT64 } +}; + +#define DBUF_STAT_INCR(stat, val) \ + atomic_add_64(&dbuf_stats.stat.value.ui64, (val)); +#define DBUF_STAT_DECR(stat, val) \ + DBUF_STAT_INCR(stat, -(val)); +#define DBUF_STAT_BUMP(stat) \ + DBUF_STAT_INCR(stat, 1); +#define DBUF_STAT_BUMPDOWN(stat) \ + DBUF_STAT_INCR(stat, -1); +#define DBUF_STAT_MAX(stat, v) { \ + uint64_t _m; \ + while ((v) > (_m = dbuf_stats.stat.value.ui64) && \ + (_m != atomic_cas_64(&dbuf_stats.stat.value.ui64, _m, (v))))\ + continue; \ +} + +static boolean_t dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); +static void dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr); +static int dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags); + +extern inline void dmu_buf_init_user(dmu_buf_user_t *dbu, + dmu_buf_evict_func_t *evict_func_sync, + dmu_buf_evict_func_t *evict_func_async, + dmu_buf_t **clear_on_evict_dbufp); + +/* + * Global data structures and functions for the dbuf cache. + */ +static kmem_cache_t *dbuf_kmem_cache; +static taskq_t *dbu_evict_taskq; + +static kthread_t *dbuf_cache_evict_thread; +static kmutex_t dbuf_evict_lock; +static kcondvar_t dbuf_evict_cv; +static boolean_t dbuf_evict_thread_exit; + +/* + * There are two dbuf caches; each dbuf can only be in one of them at a time. + * + * 1. Cache of metadata dbufs, to help make read-heavy administrative commands + * from /sbin/zfs run faster. The "metadata cache" specifically stores dbufs + * that represent the metadata that describes filesystems/snapshots/ + * bookmarks/properties/etc. We only evict from this cache when we export a + * pool, to short-circuit as much I/O as possible for all administrative + * commands that need the metadata. There is no eviction policy for this + * cache, because we try to only include types in it which would occupy a + * very small amount of space per object but create a large impact on the + * performance of these commands. Instead, after it reaches a maximum size + * (which should only happen on very small memory systems with a very large + * number of filesystem objects), we stop taking new dbufs into the + * metadata cache, instead putting them in the normal dbuf cache. + * + * 2. LRU cache of dbufs. The dbuf cache maintains a list of dbufs that + * are not currently held but have been recently released. These dbufs + * are not eligible for arc eviction until they are aged out of the cache. 
+ * Dbufs that are aged out of the cache will be immediately destroyed and + * become eligible for arc eviction. + * + * Dbufs are added to these caches once the last hold is released. If a dbuf is + * later accessed and still exists in the dbuf cache, then it will be removed + * from the cache and later re-added to the head of the cache. + * + * If a given dbuf meets the requirements for the metadata cache, it will go + * there, otherwise it will be considered for the generic LRU dbuf cache. The + * caches and the refcounts tracking their sizes are stored in an array indexed + * by those caches' matching enum values (from dbuf_cached_state_t). + */ +typedef struct dbuf_cache { + multilist_t *cache; + zfs_refcount_t size; +} dbuf_cache_t; +dbuf_cache_t dbuf_caches[DB_CACHE_MAX]; + +/* Size limits for the caches */ +unsigned long dbuf_cache_max_bytes = ULONG_MAX; +unsigned long dbuf_metadata_cache_max_bytes = ULONG_MAX; + +/* Set the default sizes of the caches to log2 fraction of arc size */ +int dbuf_cache_shift = 5; +int dbuf_metadata_cache_shift = 6; + +static unsigned long dbuf_cache_target_bytes(void); +static unsigned long dbuf_metadata_cache_target_bytes(void); + +/* + * The LRU dbuf cache uses a three-stage eviction policy: + * - A low water marker designates when the dbuf eviction thread + * should stop evicting from the dbuf cache. + * - When we reach the maximum size (aka mid water mark), we + * signal the eviction thread to run. + * - The high water mark indicates when the eviction thread + * is unable to keep up with the incoming load and eviction must + * happen in the context of the calling thread. + * + * The dbuf cache: + * (max size) + * low water mid water hi water + * +----------------------------------------+----------+----------+ + * | | | | + * | | | | + * | | | | + * | | | | + * +----------------------------------------+----------+----------+ + * stop signal evict + * evicting eviction directly + * thread + * + * The high and low water marks indicate the operating range for the eviction + * thread. The low water mark is, by default, 90% of the total size of the + * cache and the high water mark is at 110% (both of these percentages can be + * changed by setting dbuf_cache_lowater_pct and dbuf_cache_hiwater_pct, + * respectively). The eviction thread will try to ensure that the cache remains + * within this range by waking up every second and checking if the cache is + * above the low water mark. The thread can also be woken up by callers adding + * elements into the cache if the cache is larger than the mid water (i.e max + * cache size). Once the eviction thread is woken up and eviction is required, + * it will continue evicting buffers until it's able to reduce the cache size + * to the low water mark. If the cache size continues to grow and hits the high + * water mark, then callers adding elements to the cache will begin to evict + * directly from the cache until the cache is no longer above the high water + * mark. + */ + +/* + * The percentage above and below the maximum cache size. 
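+ *
+ * For example, with a cache target of 1000 MiB and the default values
+ * declared below, dbuf_cache_hiwater_bytes() evaluates to
+ * 1000 + (1000 * 10) / 100 = 1100 MiB, and dbuf_cache_lowater_bytes()
+ * to 1000 - (1000 * 10) / 100 = 900 MiB, giving the 110%/90% operating
+ * range described above.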
+ */ +uint_t dbuf_cache_hiwater_pct = 10; +uint_t dbuf_cache_lowater_pct = 10; + +/* ARGSUSED */ +static int +dbuf_cons(void *vdb, void *unused, int kmflag) +{ + dmu_buf_impl_t *db = vdb; + bzero(db, sizeof (dmu_buf_impl_t)); + + mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL); + rw_init(&db->db_rwlock, NULL, RW_DEFAULT, NULL); + cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL); + multilist_link_init(&db->db_cache_link); + zfs_refcount_create(&db->db_holds); + + return (0); +} + +/* ARGSUSED */ +static void +dbuf_dest(void *vdb, void *unused) +{ + dmu_buf_impl_t *db = vdb; + mutex_destroy(&db->db_mtx); + rw_destroy(&db->db_rwlock); + cv_destroy(&db->db_changed); + ASSERT(!multilist_link_active(&db->db_cache_link)); + zfs_refcount_destroy(&db->db_holds); +} + +/* + * dbuf hash table routines + */ +static dbuf_hash_table_t dbuf_hash_table; + +static uint64_t dbuf_hash_count; + +/* + * We use Cityhash for this. It's fast, and has good hash properties without + * requiring any large static buffers. + */ +static uint64_t +dbuf_hash(void *os, uint64_t obj, uint8_t lvl, uint64_t blkid) +{ + return (cityhash4((uintptr_t)os, obj, (uint64_t)lvl, blkid)); +} + +#define DTRACE_SET_STATE(db, why) \ + DTRACE_PROBE2(dbuf__state_change, dmu_buf_impl_t *, db, \ + const char *, why) + +#define DBUF_EQUAL(dbuf, os, obj, level, blkid) \ + ((dbuf)->db.db_object == (obj) && \ + (dbuf)->db_objset == (os) && \ + (dbuf)->db_level == (level) && \ + (dbuf)->db_blkid == (blkid)) + +dmu_buf_impl_t * +dbuf_find(objset_t *os, uint64_t obj, uint8_t level, uint64_t blkid) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv; + uint64_t idx; + dmu_buf_impl_t *db; + + hv = dbuf_hash(os, obj, level, blkid); + idx = hv & h->hash_table_mask; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (db = h->hash_table[idx]; db != NULL; db = db->db_hash_next) { + if (DBUF_EQUAL(db, os, obj, level, blkid)) { + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (db); + } + mutex_exit(&db->db_mtx); + } + } + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (NULL); +} + +static dmu_buf_impl_t * +dbuf_find_bonus(objset_t *os, uint64_t object) +{ + dnode_t *dn; + dmu_buf_impl_t *db = NULL; + + if (dnode_hold(os, object, FTAG, &dn) == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_bonus != NULL) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + } + return (db); +} + +/* + * Insert an entry into the hash table. If there is already an element + * equal to elem in the hash table, then the already existing element + * will be returned and the new element will not be inserted. + * Otherwise returns NULL. 
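+ *
+ * A typical caller therefore treats a non-NULL return as having lost an
+ * insertion race (counted in hash_insert_race above). A sketch of the
+ * pattern, with cleanup of the losing dbuf elided:
+ *
+ *	dmu_buf_impl_t *odb = dbuf_hash_insert(db);
+ *	if (odb != NULL)
+ *		db = odb;	(someone inserted first; use theirs)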
+ */ +static dmu_buf_impl_t * +dbuf_hash_insert(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + objset_t *os = db->db_objset; + uint64_t obj = db->db.db_object; + int level = db->db_level; + uint64_t blkid, hv, idx; + dmu_buf_impl_t *dbf; + uint32_t i; + + blkid = db->db_blkid; + hv = dbuf_hash(os, obj, level, blkid); + idx = hv & h->hash_table_mask; + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + for (dbf = h->hash_table[idx], i = 0; dbf != NULL; + dbf = dbf->db_hash_next, i++) { + if (DBUF_EQUAL(dbf, os, obj, level, blkid)) { + mutex_enter(&dbf->db_mtx); + if (dbf->db_state != DB_EVICTING) { + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + return (dbf); + } + mutex_exit(&dbf->db_mtx); + } + } + + if (i > 0) { + DBUF_STAT_BUMP(hash_collisions); + if (i == 1) + DBUF_STAT_BUMP(hash_chains); + + DBUF_STAT_MAX(hash_chain_max, i); + } + + mutex_enter(&db->db_mtx); + db->db_hash_next = h->hash_table[idx]; + h->hash_table[idx] = db; + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_inc_64(&dbuf_hash_count); + DBUF_STAT_MAX(hash_elements_max, dbuf_hash_count); + + return (NULL); +} + +/* + * This returns whether this dbuf should be stored in the metadata cache, which + * is based on whether it's from one of the dnode types that store data related + * to traversing dataset hierarchies. + */ +static boolean_t +dbuf_include_in_metadata_cache(dmu_buf_impl_t *db) +{ + DB_DNODE_ENTER(db); + dmu_object_type_t type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + /* Check if this dbuf is one of the types we care about */ + if (DMU_OT_IS_METADATA_CACHED(type)) { + /* If we hit this, then we set something up wrong in dmu_ot */ + ASSERT(DMU_OT_IS_METADATA(type)); + + /* + * Sanity check for small-memory systems: don't allocate too + * much memory for this purpose. + */ + if (zfs_refcount_count( + &dbuf_caches[DB_DBUF_METADATA_CACHE].size) > + dbuf_metadata_cache_target_bytes()) { + DBUF_STAT_BUMP(metadata_cache_overflow); + return (B_FALSE); + } + + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Remove an entry from the hash table. It must be in the EVICTING state. + */ +static void +dbuf_hash_remove(dmu_buf_impl_t *db) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + uint64_t hv, idx; + dmu_buf_impl_t *dbf, **dbp; + + hv = dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid); + idx = hv & h->hash_table_mask; + + /* + * We mustn't hold db_mtx to maintain lock ordering: + * DBUF_HASH_MUTEX > db_mtx. + */ + ASSERT(zfs_refcount_is_zero(&db->db_holds)); + ASSERT(db->db_state == DB_EVICTING); + ASSERT(!MUTEX_HELD(&db->db_mtx)); + + mutex_enter(DBUF_HASH_MUTEX(h, idx)); + dbp = &h->hash_table[idx]; + while ((dbf = *dbp) != db) { + dbp = &dbf->db_hash_next; + ASSERT(dbf != NULL); + } + *dbp = db->db_hash_next; + db->db_hash_next = NULL; + if (h->hash_table[idx] && + h->hash_table[idx]->db_hash_next == NULL) + DBUF_STAT_BUMPDOWN(hash_chains); + mutex_exit(DBUF_HASH_MUTEX(h, idx)); + atomic_dec_64(&dbuf_hash_count); +} + +typedef enum { + DBVU_EVICTING, + DBVU_NOT_EVICTING +} dbvu_verify_type_t; + +static void +dbuf_verify_user(dmu_buf_impl_t *db, dbvu_verify_type_t verify_type) +{ +#ifdef ZFS_DEBUG + int64_t holds; + + if (db->db_user == NULL) + return; + + /* Only data blocks support the attachment of user data. */ + ASSERT(db->db_level == 0); + + /* Clients must resolve a dbuf before attaching user data. 
*/ + ASSERT(db->db.db_data != NULL); + ASSERT3U(db->db_state, ==, DB_CACHED); + + holds = zfs_refcount_count(&db->db_holds); + if (verify_type == DBVU_EVICTING) { + /* + * Immediate eviction occurs when holds == dirtycnt. + * For normal eviction buffers, holds is zero on + * eviction, except when dbuf_fix_old_data() calls + * dbuf_clear_data(). However, the hold count can grow + * during eviction even though db_mtx is held (see + * dmu_bonus_hold() for an example), so we can only + * test the generic invariant that holds >= dirtycnt. + */ + ASSERT3U(holds, >=, db->db_dirtycnt); + } else { + if (db->db_user_immediate_evict == TRUE) + ASSERT3U(holds, >=, db->db_dirtycnt); + else + ASSERT3U(holds, >, 0); + } +#endif +} + +static void +dbuf_evict_user(dmu_buf_impl_t *db) +{ + dmu_buf_user_t *dbu = db->db_user; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (dbu == NULL) + return; + + dbuf_verify_user(db, DBVU_EVICTING); + db->db_user = NULL; + +#ifdef ZFS_DEBUG + if (dbu->dbu_clear_on_evict_dbufp != NULL) + *dbu->dbu_clear_on_evict_dbufp = NULL; +#endif + + /* + * There are two eviction callbacks - one that we call synchronously + * and one that we invoke via a taskq. The async one is useful for + * avoiding lock order reversals and limiting stack depth. + * + * Note that if we have a sync callback but no async callback, + * it's likely that the sync callback will free the structure + * containing the dbu. In that case we need to take care to not + * dereference dbu after calling the sync evict func. + */ + boolean_t has_async = (dbu->dbu_evict_func_async != NULL); + + if (dbu->dbu_evict_func_sync != NULL) + dbu->dbu_evict_func_sync(dbu); + + if (has_async) { + taskq_dispatch_ent(dbu_evict_taskq, dbu->dbu_evict_func_async, + dbu, 0, &dbu->dbu_tqent); + } +} + +boolean_t +dbuf_is_metadata(dmu_buf_impl_t *db) +{ + /* + * Consider indirect blocks and spill blocks to be metadata. + */ + if (db->db_level > 0 || db->db_blkid == DMU_SPILL_BLKID) { + return (B_TRUE); + } else { + boolean_t is_metadata; + + DB_DNODE_ENTER(db); + is_metadata = DMU_OT_IS_METADATA(DB_DNODE(db)->dn_type); + DB_DNODE_EXIT(db); + + return (is_metadata); + } +} + + +/* + * This function *must* return indices evenly distributed between all + * sublists of the multilist. This is needed due to how the dbuf eviction + * code is laid out; dbuf_evict_thread() assumes dbufs are evenly + * distributed between all sublists and uses this assumption when + * deciding which sublist to evict from and how much to evict from it. + */ +static unsigned int +dbuf_cache_multilist_index_func(multilist_t *ml, void *obj) +{ + dmu_buf_impl_t *db = obj; + + /* + * The assumption here is that the hash value for a given + * dmu_buf_impl_t will remain constant throughout its lifetime + * (i.e. its objset, object, level and blkid fields don't change). + * Thus, we don't need to store the dbuf's sublist index + * on insertion, as this index can be recalculated on removal. + * + * Also, the low order bits of the hash value are thought to be + * distributed evenly. Otherwise, in the case that the multilist + * has a power of two number of sublists, each sublist's usage + * would not be evenly distributed. + */ + return (dbuf_hash(db->db_objset, db->db.db_object, + db->db_level, db->db_blkid) % + multilist_get_num_sublists(ml)); +} + +/* + * The target size of the dbuf cache can grow with the ARC target, + * unless limited by the tunable dbuf_cache_max_bytes. 
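+ *
+ * With the default dbuf_cache_shift of 5 this works out to 1/32 of the
+ * ARC target: e.g. an ARC target of 4 GiB gives
+ * MIN(dbuf_cache_max_bytes, 4 GiB >> 5) = 128 MiB when the tunable is
+ * left at its ULONG_MAX default. The metadata cache below uses
+ * dbuf_metadata_cache_shift = 6, i.e. 1/64 of the ARC target.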
+ */ +static inline unsigned long +dbuf_cache_target_bytes(void) +{ + return (MIN(dbuf_cache_max_bytes, + arc_target_bytes() >> dbuf_cache_shift)); +} + +/* + * The target size of the dbuf metadata cache can grow with the ARC target, + * unless limited by the tunable dbuf_metadata_cache_max_bytes. + */ +static inline unsigned long +dbuf_metadata_cache_target_bytes(void) +{ + return (MIN(dbuf_metadata_cache_max_bytes, + arc_target_bytes() >> dbuf_metadata_cache_shift)); +} + +static inline uint64_t +dbuf_cache_hiwater_bytes(void) +{ + uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); + return (dbuf_cache_target + + (dbuf_cache_target * dbuf_cache_hiwater_pct) / 100); +} + +static inline uint64_t +dbuf_cache_lowater_bytes(void) +{ + uint64_t dbuf_cache_target = dbuf_cache_target_bytes(); + return (dbuf_cache_target - + (dbuf_cache_target * dbuf_cache_lowater_pct) / 100); +} + +static inline boolean_t +dbuf_cache_above_lowater(void) +{ + return (zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size) > + dbuf_cache_lowater_bytes()); +} + +/* + * Evict the oldest eligible dbuf from the dbuf cache. + */ +static void +dbuf_evict_one(void) +{ + int idx = multilist_get_random_index(dbuf_caches[DB_DBUF_CACHE].cache); + multilist_sublist_t *mls = multilist_sublist_lock( + dbuf_caches[DB_DBUF_CACHE].cache, idx); + + ASSERT(!MUTEX_HELD(&dbuf_evict_lock)); + + dmu_buf_impl_t *db = multilist_sublist_tail(mls); + while (db != NULL && mutex_tryenter(&db->db_mtx) == 0) { + db = multilist_sublist_prev(mls, db); + } + + DTRACE_PROBE2(dbuf__evict__one, dmu_buf_impl_t *, db, + multilist_sublist_t *, mls); + + if (db != NULL) { + multilist_sublist_remove(mls, db); + multilist_sublist_unlock(mls); + (void) zfs_refcount_remove_many( + &dbuf_caches[DB_DBUF_CACHE].size, db->db.db_size, db); + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); + ASSERT3U(db->db_caching_status, ==, DB_DBUF_CACHE); + db->db_caching_status = DB_NO_CACHE; + dbuf_destroy(db); + DBUF_STAT_BUMP(cache_total_evicts); + } else { + multilist_sublist_unlock(mls); + } +} + +/* + * The dbuf evict thread is responsible for aging out dbufs from the + * cache. Once the cache has reached its maximum size, dbufs are removed + * and destroyed. The eviction thread will continue running until the size + * of the dbuf cache is at or below the maximum size. Once the dbuf is aged + * out of the cache it is destroyed and becomes eligible for arc eviction. + */ +/* ARGSUSED */ +static void +dbuf_evict_thread(void *unused) +{ + callb_cpr_t cpr; + + CALLB_CPR_INIT(&cpr, &dbuf_evict_lock, callb_generic_cpr, FTAG); + + mutex_enter(&dbuf_evict_lock); + while (!dbuf_evict_thread_exit) { + while (!dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_sig_hires(&dbuf_evict_cv, + &dbuf_evict_lock, SEC2NSEC(1), MSEC2NSEC(1), 0); + CALLB_CPR_SAFE_END(&cpr, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + /* + * Keep evicting as long as we're above the low water mark + * for the cache. We do this without holding the locks to + * minimize lock contention. 
+ */ + while (dbuf_cache_above_lowater() && !dbuf_evict_thread_exit) { + dbuf_evict_one(); + } + + mutex_enter(&dbuf_evict_lock); + } + + dbuf_evict_thread_exit = B_FALSE; + cv_broadcast(&dbuf_evict_cv); + CALLB_CPR_EXIT(&cpr); /* drops dbuf_evict_lock */ + thread_exit(); +} + +/* + * Wake up the dbuf eviction thread if the dbuf cache is at its max size. + * If the dbuf cache is at its high water mark, then evict a dbuf from the + * dbuf cache using the caller's context. + */ +static void +dbuf_evict_notify(uint64_t size) +{ + /* + * We check if we should evict without holding the dbuf_evict_lock, + * because it's OK to occasionally make the wrong decision here, + * and grabbing the lock results in massive lock contention. + */ + if (size > dbuf_cache_target_bytes()) { + if (size > dbuf_cache_hiwater_bytes()) + dbuf_evict_one(); + cv_signal(&dbuf_evict_cv); + } +} + +static int +dbuf_kstat_update(kstat_t *ksp, int rw) +{ + dbuf_stats_t *ds = ksp->ks_data; + + if (rw == KSTAT_WRITE) { + return (SET_ERROR(EACCES)); + } else { + ds->metadata_cache_size_bytes.value.ui64 = zfs_refcount_count( + &dbuf_caches[DB_DBUF_METADATA_CACHE].size); + ds->cache_size_bytes.value.ui64 = + zfs_refcount_count(&dbuf_caches[DB_DBUF_CACHE].size); + ds->cache_target_bytes.value.ui64 = dbuf_cache_target_bytes(); + ds->cache_hiwater_bytes.value.ui64 = dbuf_cache_hiwater_bytes(); + ds->cache_lowater_bytes.value.ui64 = dbuf_cache_lowater_bytes(); + ds->hash_elements.value.ui64 = dbuf_hash_count; + } + + return (0); +} + +void +dbuf_init(void) +{ + uint64_t hsize = 1ULL << 16; + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + /* + * The hash table is big enough to fill all of physical memory + * with an average block size of zfs_arc_average_blocksize (default 8K). + * By default, the table will take up + * totalmem * sizeof(void*) / 8K (1MB per GB with 8-byte pointers). + */ + while (hsize * zfs_arc_average_blocksize < physmem * PAGESIZE) + hsize <<= 1; + +retry: + h->hash_table_mask = hsize - 1; +#if defined(_KERNEL) + /* + * Large allocations which do not require contiguous pages + * should be using vmem_alloc() in the linux kernel + */ + h->hash_table = vmem_zalloc(hsize * sizeof (void *), KM_SLEEP); +#else + h->hash_table = kmem_zalloc(hsize * sizeof (void *), KM_NOSLEEP); +#endif + if (h->hash_table == NULL) { + /* XXX - we should really return an error instead of assert */ + ASSERT(hsize > (1ULL << 10)); + hsize >>= 1; + goto retry; + } + + dbuf_kmem_cache = kmem_cache_create("dmu_buf_impl_t", + sizeof (dmu_buf_impl_t), + 0, dbuf_cons, dbuf_dest, NULL, NULL, NULL, 0); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_init(&h->hash_mutexes[i], NULL, MUTEX_DEFAULT, NULL); + + dbuf_stats_init(h); + + /* + * All entries are queued via taskq_dispatch_ent(), so min/maxalloc + * configuration is not required. 
+ */ + dbu_evict_taskq = taskq_create("dbu_evict", 1, defclsyspri, 0, 0, 0); + + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + dbuf_caches[dcs].cache = + multilist_create(sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_cache_link), + dbuf_cache_multilist_index_func); + zfs_refcount_create(&dbuf_caches[dcs].size); + } + + dbuf_evict_thread_exit = B_FALSE; + mutex_init(&dbuf_evict_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dbuf_evict_cv, NULL, CV_DEFAULT, NULL); + dbuf_cache_evict_thread = thread_create(NULL, 0, dbuf_evict_thread, + NULL, 0, &p0, TS_RUN, minclsyspri); + + dbuf_ksp = kstat_create("zfs", 0, "dbufstats", "misc", + KSTAT_TYPE_NAMED, sizeof (dbuf_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (dbuf_ksp != NULL) { + for (i = 0; i < DN_MAX_LEVELS; i++) { + snprintf(dbuf_stats.cache_levels[i].name, + KSTAT_STRLEN, "cache_level_%d", i); + dbuf_stats.cache_levels[i].data_type = + KSTAT_DATA_UINT64; + snprintf(dbuf_stats.cache_levels_bytes[i].name, + KSTAT_STRLEN, "cache_level_%d_bytes", i); + dbuf_stats.cache_levels_bytes[i].data_type = + KSTAT_DATA_UINT64; + } + dbuf_ksp->ks_data = &dbuf_stats; + dbuf_ksp->ks_update = dbuf_kstat_update; + kstat_install(dbuf_ksp); + } +} + +void +dbuf_fini(void) +{ + dbuf_hash_table_t *h = &dbuf_hash_table; + int i; + + dbuf_stats_destroy(); + + for (i = 0; i < DBUF_MUTEXES; i++) + mutex_destroy(&h->hash_mutexes[i]); +#if defined(_KERNEL) + /* + * Large allocations which do not require contiguous pages + * should be using vmem_free() in the linux kernel + */ + vmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); +#else + kmem_free(h->hash_table, (h->hash_table_mask + 1) * sizeof (void *)); +#endif + kmem_cache_destroy(dbuf_kmem_cache); + taskq_destroy(dbu_evict_taskq); + + mutex_enter(&dbuf_evict_lock); + dbuf_evict_thread_exit = B_TRUE; + while (dbuf_evict_thread_exit) { + cv_signal(&dbuf_evict_cv); + cv_wait(&dbuf_evict_cv, &dbuf_evict_lock); + } + mutex_exit(&dbuf_evict_lock); + + mutex_destroy(&dbuf_evict_lock); + cv_destroy(&dbuf_evict_cv); + + for (dbuf_cached_state_t dcs = 0; dcs < DB_CACHE_MAX; dcs++) { + zfs_refcount_destroy(&dbuf_caches[dcs].size); + multilist_destroy(dbuf_caches[dcs].cache); + } + + if (dbuf_ksp != NULL) { + kstat_delete(dbuf_ksp); + dbuf_ksp = NULL; + } +} + +/* + * Other stuff. 
+ */ + +#ifdef ZFS_DEBUG +static void +dbuf_verify(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dbuf_dirty_record_t *dr; + uint32_t txg_prev; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!(zfs_flags & ZFS_DEBUG_DBUF_VERIFY)) + return; + + ASSERT(db->db_objset != NULL); + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn == NULL) { + ASSERT(db->db_parent == NULL); + ASSERT(db->db_blkptr == NULL); + } else { + ASSERT3U(db->db.db_object, ==, dn->dn_object); + ASSERT3P(db->db_objset, ==, dn->dn_objset); + ASSERT3U(db->db_level, <, dn->dn_nlevels); + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID || + !avl_is_empty(&dn->dn_dbufs)); + } + if (db->db_blkid == DMU_BONUS_BLKID) { + ASSERT(dn != NULL); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + ASSERT3U(db->db.db_offset, ==, DMU_BONUS_BLKID); + } else if (db->db_blkid == DMU_SPILL_BLKID) { + ASSERT(dn != NULL); + ASSERT0(db->db.db_offset); + } else { + ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); + } + + if ((dr = list_head(&db->db_dirty_records)) != NULL) { + ASSERT(dr->dr_dbuf == db); + txg_prev = dr->dr_txg; + for (dr = list_next(&db->db_dirty_records, dr); dr != NULL; + dr = list_next(&db->db_dirty_records, dr)) { + ASSERT(dr->dr_dbuf == db); + ASSERT(txg_prev > dr->dr_txg); + txg_prev = dr->dr_txg; + } + } + + /* + * We can't assert that db_size matches dn_datablksz because it + * can be momentarily different when another thread is doing + * dnode_set_blksz(). + */ + if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { + dr = db->db_data_pending; + /* + * It should only be modified in syncing context, so + * make sure we only have one copy of the data. + */ + ASSERT(dr == NULL || dr->dt.dl.dr_data == db->db_buf); + } + + /* verify db->db_blkptr */ + if (db->db_blkptr) { + if (db->db_parent == dn->dn_dbuf) { + /* db is pointed to by the dnode */ + /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) + ASSERT(db->db_parent == NULL); + else + ASSERT(db->db_parent != NULL); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + } else { + /* db is pointed to by an indirect block */ + int epb __maybe_unused = db->db_parent->db.db_size >> + SPA_BLKPTRSHIFT; + ASSERT3U(db->db_parent->db_level, ==, db->db_level+1); + ASSERT3U(db->db_parent->db.db_object, ==, + db->db.db_object); + /* + * dnode_grow_indblksz() can make this fail if we don't + * have the parent's rwlock. XXX indblksz no longer + * grows. safe to do this now? + */ + if (RW_LOCK_HELD(&db->db_parent->db_rwlock)) { + ASSERT3P(db->db_blkptr, ==, + ((blkptr_t *)db->db_parent->db.db_data + + db->db_blkid % epb)); + } + } + } + if ((db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr)) && + (db->db_buf == NULL || db->db_buf->b_data) && + db->db.db_data && db->db_blkid != DMU_BONUS_BLKID && + db->db_state != DB_FILL && !dn->dn_free_txg) { + /* + * If the blkptr isn't set but they have nonzero data, + * it had better be dirty, otherwise we'll lose that + * data when we evict this buffer. + * + * There is an exception to this rule for indirect blocks; in + * this case, if the indirect block is a hole, we fill in a few + * fields on each of the child blocks (importantly, birth time) + * to prevent hole birth times from being lost when you + * partially fill in a hole. 
+ */ + if (db->db_dirtycnt == 0) { + if (db->db_level == 0) { + uint64_t *buf = db->db.db_data; + int i; + + for (i = 0; i < db->db.db_size >> 3; i++) { + ASSERT(buf[i] == 0); + } + } else { + blkptr_t *bps = db->db.db_data; + ASSERT3U(1 << DB_DNODE(db)->dn_indblkshift, ==, + db->db.db_size); + /* + * We want to verify that all the blkptrs in the + * indirect block are holes, but we may have + * automatically set up a few fields for them. + * We iterate through each blkptr and verify + * they only have those fields set. + */ + for (int i = 0; + i < db->db.db_size / sizeof (blkptr_t); + i++) { + blkptr_t *bp = &bps[i]; + ASSERT(ZIO_CHECKSUM_IS_ZERO( + &bp->blk_cksum)); + ASSERT( + DVA_IS_EMPTY(&bp->blk_dva[0]) && + DVA_IS_EMPTY(&bp->blk_dva[1]) && + DVA_IS_EMPTY(&bp->blk_dva[2])); + ASSERT0(bp->blk_fill); + ASSERT0(bp->blk_pad[0]); + ASSERT0(bp->blk_pad[1]); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(BP_IS_HOLE(bp)); + ASSERT0(bp->blk_phys_birth); + } + } + } + } + DB_DNODE_EXIT(db); +} +#endif + +static void +dbuf_clear_data(dmu_buf_impl_t *db) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + dbuf_evict_user(db); + ASSERT3P(db->db_buf, ==, NULL); + db->db.db_data = NULL; + if (db->db_state != DB_NOFILL) { + db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "clear data"); + } +} + +static void +dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(buf != NULL); + + db->db_buf = buf; + ASSERT(buf->b_data != NULL); + db->db.db_data = buf->b_data; +} + +static arc_buf_t * +dbuf_alloc_arcbuf_from_arcbuf(dmu_buf_impl_t *db, arc_buf_t *data) +{ + objset_t *os = db->db_objset; + spa_t *spa = os->os_spa; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + enum zio_compress compress_type; + uint8_t complevel; + int psize, lsize; + + psize = arc_buf_size(data); + lsize = arc_buf_lsize(data); + compress_type = arc_get_compression(data); + complevel = arc_get_complevel(data); + + if (arc_is_encrypted(data)) { + boolean_t byteorder; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + dnode_t *dn = DB_DNODE(db); + + arc_get_raw_params(data, &byteorder, salt, iv, mac); + data = arc_alloc_raw_buf(spa, db, dmu_objset_id(os), + byteorder, salt, iv, mac, dn->dn_type, psize, lsize, + compress_type, complevel); + } else if (compress_type != ZIO_COMPRESS_OFF) { + ASSERT3U(type, ==, ARC_BUFC_DATA); + data = arc_alloc_compressed_buf(spa, db, + psize, lsize, compress_type, complevel); + } else { + data = arc_alloc_buf(spa, db, type, psize); + } + return (data); +} + +static arc_buf_t * +dbuf_alloc_arcbuf(dmu_buf_impl_t *db) +{ + spa_t *spa = db->db_objset->os_spa; + + return (arc_alloc_buf(spa, db, DBUF_GET_BUFC_TYPE(db), db->db.db_size)); +} + +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || zfs_refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + spa_t *spa = db->db_objset->os_spa; + + mutex_exit(&db->db_mtx); + abuf = arc_loan_buf(spa, B_FALSE, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + db->db_buf = NULL; + dbuf_clear_data(db); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + +/* + * Calculate which level n block references the data at the level 0 offset + * provided. 
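+ *
+ * As a worked example (hypothetical geometry): with 128K data blocks
+ * (datablkshift = 17) and 128K indirect blocks (indblkshift = 17,
+ * SPA_BLKPTRSHIFT = 7), each indirect block holds 2^10 = 1024 block
+ * pointers, so for offset = 2^30 (1 GiB):
+ *
+ *	level 0: 2^30 >> 17		= blkid 8192
+ *	level 1: 2^30 >> (17 + 10)	= blkid 8
+ *	level 2: 2^30 >> (17 + 20)	= blkid 0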
+ */ +uint64_t +dbuf_whichblock(const dnode_t *dn, const int64_t level, const uint64_t offset) +{ + if (dn->dn_datablkshift != 0 && dn->dn_indblkshift != 0) { + /* + * The level n blkid is equal to the level 0 blkid divided by + * the number of level 0s in a level n block. + * + * The level 0 blkid is offset >> datablkshift = + * offset / 2^datablkshift. + * + * The number of level 0s in a level n is the number of block + * pointers in an indirect block, raised to the power of level. + * This is 2^(indblkshift - SPA_BLKPTRSHIFT)^level = + * 2^(level*(indblkshift - SPA_BLKPTRSHIFT)). + * + * Thus, the level n blkid is: offset / + * ((2^datablkshift)*(2^(level*(indblkshift-SPA_BLKPTRSHIFT)))) + * = offset / 2^(datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + * = offset >> (datablkshift + level * + * (indblkshift - SPA_BLKPTRSHIFT)) + */ + + const unsigned exp = dn->dn_datablkshift + + level * (dn->dn_indblkshift - SPA_BLKPTRSHIFT); + + if (exp >= 8 * sizeof (offset)) { + /* This only happens on the highest indirection level */ + ASSERT3U(level, ==, dn->dn_nlevels - 1); + return (0); + } + + ASSERT3U(exp, <, 8 * sizeof (offset)); + + return (offset >> exp); + } else { + ASSERT3U(offset, <, dn->dn_datablksz); + return (0); + } +} + +/* + * This function is used to lock the parent of the provided dbuf. This should be + * used when modifying or reading db_blkptr. + */ +db_lock_type_t +dmu_buf_lock_parent(dmu_buf_impl_t *db, krw_t rw, void *tag) +{ + enum db_lock_type ret = DLT_NONE; + if (db->db_parent != NULL) { + rw_enter(&db->db_parent->db_rwlock, rw); + ret = DLT_PARENT; + } else if (dmu_objset_ds(db->db_objset) != NULL) { + rrw_enter(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, rw, + tag); + ret = DLT_OBJSET; + } + /* + * We only return a DLT_NONE lock when it's the top-most indirect block + * of the meta-dnode of the MOS. + */ + return (ret); +} + +/* + * We need to pass the lock type in because it's possible that the block will + * move from being the topmost indirect block in a dnode (and thus, have no + * parent) to not the top-most via an indirection increase. This would cause a + * panic if we didn't pass the lock type in. 
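+ *
+ * The expected pairing, as used by dbuf_read() further below, is to
+ * capture the returned lock type and hand it back unchanged:
+ *
+ *	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG);
+ *	...read or modify db->db_blkptr...
+ *	dmu_buf_unlock_parent(db, dblt, FTAG);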
+ */ +void +dmu_buf_unlock_parent(dmu_buf_impl_t *db, db_lock_type_t type, void *tag) +{ + if (type == DLT_PARENT) + rw_exit(&db->db_parent->db_rwlock); + else if (type == DLT_OBJSET) + rrw_exit(&dmu_objset_ds(db->db_objset)->ds_bp_rwlock, tag); +} + +static void +dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + + mutex_enter(&db->db_mtx); + ASSERT3U(db->db_state, ==, DB_READ); + /* + * All reads are synchronous, so we must have a hold on the dbuf + */ + ASSERT(zfs_refcount_count(&db->db_holds) > 0); + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + if (buf == NULL) { + /* i/o error */ + ASSERT(zio == NULL || zio->io_error != 0); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT3P(db->db_buf, ==, NULL); + db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "i/o error"); + } else if (db->db_level == 0 && db->db_freed_in_flight) { + /* freed in flight */ + ASSERT(zio == NULL || zio->io_error == 0); + arc_release(buf, db); + bzero(buf->b_data, db->db.db_size); + arc_buf_freeze(buf); + db->db_freed_in_flight = FALSE; + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "freed in flight"); + } else { + /* success */ + ASSERT(zio == NULL || zio->io_error == 0); + dbuf_set_data(db, buf); + db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "successful read"); + } + cv_broadcast(&db->db_changed); + dbuf_rele_and_unlock(db, NULL, B_FALSE); +} + +/* + * Shortcut for performing reads on bonus dbufs. Returns + * an error if we fail to verify the dnode associated with + * a decrypted block. Otherwise success. + */ +static int +dbuf_read_bonus(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +{ + int bonuslen, max_bonuslen, err; + + err = dbuf_read_verify_dnode_crypt(db, flags); + if (err) + return (err); + + bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); + max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(DB_DNODE_HELD(db)); + ASSERT3U(bonuslen, <=, db->db.db_size); + db->db.db_data = kmem_alloc(max_bonuslen, KM_SLEEP); + arc_space_consume(max_bonuslen, ARC_SPACE_BONUS); + if (bonuslen < max_bonuslen) + bzero(db->db.db_data, max_bonuslen); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); + db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "bonus buffer filled"); + return (0); +} + +static void +dbuf_handle_indirect_hole(dmu_buf_impl_t *db, dnode_t *dn) +{ + blkptr_t *bps = db->db.db_data; + uint32_t indbs = 1ULL << dn->dn_indblkshift; + int n_bps = indbs >> SPA_BLKPTRSHIFT; + + for (int i = 0; i < n_bps; i++) { + blkptr_t *bp = &bps[i]; + + ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==, indbs); + BP_SET_LSIZE(bp, BP_GET_LEVEL(db->db_blkptr) == 1 ? + dn->dn_datablksz : BP_GET_LSIZE(db->db_blkptr)); + BP_SET_TYPE(bp, BP_GET_TYPE(db->db_blkptr)); + BP_SET_LEVEL(bp, BP_GET_LEVEL(db->db_blkptr) - 1); + BP_SET_BIRTH(bp, db->db_blkptr->blk_birth, 0); + } +} + +/* + * Handle reads on dbufs that are holes, if necessary. This function + * requires that the dbuf's mutex is held. Returns success (0) if action + * was taken, ENOENT if no action was taken. 
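+ *
+ * Callers use the ENOENT return to fall through to issuing a real read;
+ * dbuf_read_impl() below does, in sketch form:
+ *
+ *	err = dbuf_read_hole(db, dn, flags);
+ *	if (err == 0)
+ *		goto early_unlock;	(the hole satisfied the read)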
+ */ +static int +dbuf_read_hole(dmu_buf_impl_t *db, dnode_t *dn, uint32_t flags) +{ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + int is_hole = db->db_blkptr == NULL || BP_IS_HOLE(db->db_blkptr); + /* + * For level 0 blocks only, if the above check fails: + * Recheck BP_IS_HOLE() after dnode_block_freed() in case dnode_sync() + * processes the delete record and clears the bp while we are waiting + * for the dn_mtx (resulting in a "no" from block_freed). + */ + if (!is_hole && db->db_level == 0) { + is_hole = dnode_block_freed(dn, db->db_blkid) || + BP_IS_HOLE(db->db_blkptr); + } + + if (is_hole) { + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); + bzero(db->db.db_data, db->db.db_size); + + if (db->db_blkptr != NULL && db->db_level > 0 && + BP_IS_HOLE(db->db_blkptr) && + db->db_blkptr->blk_birth != 0) { + dbuf_handle_indirect_hole(db, dn); + } + db->db_state = DB_CACHED; + DTRACE_SET_STATE(db, "hole read satisfied"); + return (0); + } + return (ENOENT); +} + +/* + * This function ensures that, when doing a decrypting read of a block, + * we make sure we have decrypted the dnode associated with it. We must do + * this so that we ensure we are fully authenticating the checksum-of-MACs + * tree from the root of the objset down to this block. Indirect blocks are + * always verified against their secure checksum-of-MACs assuming that the + * dnode containing them is correct. Now that we are doing a decrypting read, + * we can be sure that the key is loaded and verify that assumption. This is + * especially important considering that we always read encrypted dnode + * blocks as raw data (without verifying their MACs) to start, and + * decrypt / authenticate them when we need to read an encrypted bonus buffer. + */ +static int +dbuf_read_verify_dnode_crypt(dmu_buf_impl_t *db, uint32_t flags) +{ + int err = 0; + objset_t *os = db->db_objset; + arc_buf_t *dnode_abuf; + dnode_t *dn; + zbookmark_phys_t zb; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (!os->os_encrypted || os->os_raw_receive || + (flags & DB_RF_NO_DECRYPT) != 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dnode_abuf = (dn->dn_dbuf != NULL) ? dn->dn_dbuf->db_buf : NULL; + + if (dnode_abuf == NULL || !arc_is_encrypted(dnode_abuf)) { + DB_DNODE_EXIT(db); + return (0); + } + + SET_BOOKMARK(&zb, dmu_objset_id(os), + DMU_META_DNODE_OBJECT, 0, dn->dn_dbuf->db_blkid); + err = arc_untransform(dnode_abuf, os->os_spa, &zb, B_TRUE); + + /* + * An error code of EACCES tells us that the key is still not + * available. This is ok if we are only reading authenticated + * (and therefore non-encrypted) blocks. + */ + if (err == EACCES && ((db->db_blkid != DMU_BONUS_BLKID && + !DMU_OT_IS_ENCRYPTED(dn->dn_type)) || + (db->db_blkid == DMU_BONUS_BLKID && + !DMU_OT_IS_ENCRYPTED(dn->dn_bonustype)))) + err = 0; + + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Drops db_mtx and the parent lock specified by dblt and tag before + * returning. 
+ */ +static int +dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, + db_lock_type_t dblt, void *tag) +{ + dnode_t *dn; + zbookmark_phys_t zb; + uint32_t aflags = ARC_FLAG_NOWAIT; + int err, zio_flags; + boolean_t bonus_read; + + err = zio_flags = 0; + bonus_read = B_FALSE; + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db_state == DB_UNCACHED); + ASSERT(db->db_buf == NULL); + ASSERT(db->db_parent == NULL || + RW_LOCK_HELD(&db->db_parent->db_rwlock)); + + if (db->db_blkid == DMU_BONUS_BLKID) { + err = dbuf_read_bonus(db, dn, flags); + goto early_unlock; + } + + err = dbuf_read_hole(db, dn, flags); + if (err == 0) + goto early_unlock; + + /* + * Any attempt to read a redacted block should result in an error. This + * will never happen under normal conditions, but can be useful for + * debugging purposes. + */ + if (BP_IS_REDACTED(db->db_blkptr)) { + ASSERT(dsl_dataset_feature_is_active( + db->db_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + err = SET_ERROR(EIO); + goto early_unlock; + } + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + + /* + * All bps of an encrypted os should have the encryption bit set. + * If this is not true it indicates tampering and we report an error. + */ + if (db->db_objset->os_encrypted && !BP_USES_CRYPT(db->db_blkptr)) { + spa_log_error(db->db_objset->os_spa, &zb); + zfs_panic_recover("unencrypted block in encrypted " + "object set %llu", dmu_objset_id(db->db_objset)); + err = SET_ERROR(EIO); + goto early_unlock; + } + + err = dbuf_read_verify_dnode_crypt(db, flags); + if (err != 0) + goto early_unlock; + + DB_DNODE_EXIT(db); + + db->db_state = DB_READ; + DTRACE_SET_STATE(db, "read issued"); + mutex_exit(&db->db_mtx); + + if (DBUF_IS_L2CACHEABLE(db)) + aflags |= ARC_FLAG_L2CACHE; + + dbuf_add_ref(db, NULL); + + zio_flags = (flags & DB_RF_CANFAIL) ? + ZIO_FLAG_CANFAIL : ZIO_FLAG_MUSTSUCCEED; + + if ((flags & DB_RF_NO_DECRYPT) && BP_IS_PROTECTED(db->db_blkptr)) + zio_flags |= ZIO_FLAG_RAW; + /* + * The zio layer will copy the provided blkptr later, but we need to + * do this now so that we can release the parent's rwlock. We have to + * do that now so that if dbuf_read_done is called synchronously (on + * an l1 cache hit) we don't acquire the db_mtx while holding the + * parent's rwlock, which would be a lock ordering violation. + */ + blkptr_t bp = *db->db_blkptr; + dmu_buf_unlock_parent(db, dblt, tag); + (void) arc_read(zio, db->db_objset->os_spa, &bp, + dbuf_read_done, db, ZIO_PRIORITY_SYNC_READ, zio_flags, + &aflags, &zb); + return (err); +early_unlock: + DB_DNODE_EXIT(db); + mutex_exit(&db->db_mtx); + dmu_buf_unlock_parent(db, dblt, tag); + return (err); +} + +/* + * This is our just-in-time copy function. It makes a copy of buffers that + * have been modified in a previous transaction group before we access them in + * the current active group. + * + * This function is used in three places: when we are dirtying a buffer for the + * first time in a txg, when we are freeing a range in a dnode that includes + * this buffer, and when we are accessing a buffer which was received compressed + * and later referenced in a WRITE_BYREF record. + * + * Note that when we are called from dbuf_free_range() we do not put a hold on + * the buffer, we just traverse the active dbuf list for the dnode. 
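+ *
+ * Sketch of the common case: a buffer dirtied in txg N is still syncing
+ * when it is dirtied again in txg N+1. If anything besides the dirty
+ * record still references the data (holds > dirtycnt), the contents are
+ * copied so that txg N syncs the old data; otherwise db_data is simply
+ * cleared and re-established on demand. Bonus buffers always take the
+ * kmem_alloc() copy path below since they are not backed by arc bufs.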
+ */ +static void +dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) +{ + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(db->db.db_data != NULL); + ASSERT(db->db_level == 0); + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT); + + if (dr == NULL || + (dr->dt.dl.dr_data != + ((db->db_blkid == DMU_BONUS_BLKID) ? db->db.db_data : db->db_buf))) + return; + + /* + * If the last dirty record for this dbuf has not yet synced + * and it's referencing the dbuf data, either: + * reset the reference to point to a new copy, + * or (if there are no active holders) + * just null out the current db_data pointer. + */ + ASSERT3U(dr->dr_txg, >=, txg - 2); + if (db->db_blkid == DMU_BONUS_BLKID) { + dnode_t *dn = DB_DNODE(db); + int bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + dr->dt.dl.dr_data = kmem_alloc(bonuslen, KM_SLEEP); + arc_space_consume(bonuslen, ARC_SPACE_BONUS); + bcopy(db->db.db_data, dr->dt.dl.dr_data, bonuslen); + } else if (zfs_refcount_count(&db->db_holds) > db->db_dirtycnt) { + arc_buf_t *buf = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf); + dr->dt.dl.dr_data = buf; + bcopy(db->db.db_data, buf->b_data, arc_buf_size(buf)); + } else { + db->db_buf = NULL; + dbuf_clear_data(db); + } +} + +int +dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags) +{ + int err = 0; + boolean_t prefetch; + dnode_t *dn; + + /* + * We don't have to hold the mutex to check db_state because it + * can't be freed while we have a hold on the buffer. + */ + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + + if (db->db_state == DB_NOFILL) + return (SET_ERROR(EIO)); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + prefetch = db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + (flags & DB_RF_NOPREFETCH) == 0 && dn != NULL && + DBUF_IS_CACHEABLE(db); + + mutex_enter(&db->db_mtx); + if (db->db_state == DB_CACHED) { + spa_t *spa = dn->dn_objset->os_spa; + + /* + * Ensure that this block's dnode has been decrypted if + * the caller has requested decrypted data. + */ + err = dbuf_read_verify_dnode_crypt(db, flags); + + /* + * If the arc buf is compressed or encrypted and the caller + * requested uncompressed data, we need to untransform it + * before returning. We also call arc_untransform() on any + * unauthenticated blocks, which will verify their MAC if + * the key is now available. 
+ */ + if (err == 0 && db->db_buf != NULL && + (flags & DB_RF_NO_DECRYPT) == 0 && + (arc_is_encrypted(db->db_buf) || + arc_is_unauthenticated(db->db_buf) || + arc_get_compression(db->db_buf) != ZIO_COMPRESS_OFF)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + dbuf_fix_old_data(db, spa_syncing_txg(spa)); + err = arc_untransform(db->db_buf, spa, &zb, B_FALSE); + dbuf_set_data(db, db->db_buf); + } + mutex_exit(&db->db_mtx); + if (err == 0 && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); + DBUF_STAT_BUMP(hash_hits); + } else if (db->db_state == DB_UNCACHED) { + spa_t *spa = dn->dn_objset->os_spa; + boolean_t need_wait = B_FALSE; + + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + + if (zio == NULL && + db->db_blkptr != NULL && !BP_IS_HOLE(db->db_blkptr)) { + zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL); + need_wait = B_TRUE; + } + err = dbuf_read_impl(db, zio, flags, dblt, FTAG); + /* + * dbuf_read_impl has dropped db_mtx and our parent's rwlock + * for us + */ + if (!err && prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } + + DB_DNODE_EXIT(db); + DBUF_STAT_BUMP(hash_misses); + + /* + * If we created a zio_root we must execute it to avoid + * leaking it, even if it isn't attached to any work due + * to an error in dbuf_read_impl(). + */ + if (need_wait) { + if (err == 0) + err = zio_wait(zio); + else + VERIFY0(zio_wait(zio)); + } + } else { + /* + * Another reader came in while the dbuf was in flight + * between UNCACHED and CACHED. Either a writer will finish + * writing the buffer (sending the dbuf to CACHED) or the + * first reader's request will reach the read_done callback + * and send the dbuf to CACHED. Otherwise, a failure + * occurred and the dbuf went to UNCACHED. + */ + mutex_exit(&db->db_mtx); + if (prefetch) { + dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE, + flags & DB_RF_HAVESTRUCT); + } + DB_DNODE_EXIT(db); + DBUF_STAT_BUMP(hash_misses); + + /* Skip the wait per the caller's request. 
*/ + if ((flags & DB_RF_NEVERWAIT) == 0) { + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) { + ASSERT(db->db_state == DB_READ || + (flags & DB_RF_HAVESTRUCT) == 0); + DTRACE_PROBE2(blocked__read, dmu_buf_impl_t *, + db, zio_t *, zio); + cv_wait(&db->db_changed, &db->db_mtx); + } + if (db->db_state == DB_UNCACHED) + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + } + } + + return (err); +} + +static void +dbuf_noread(dmu_buf_impl_t *db) +{ + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) { + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + dbuf_set_data(db, dbuf_alloc_arcbuf(db)); + db->db_state = DB_FILL; + DTRACE_SET_STATE(db, "assigning filled buffer"); + } else if (db->db_state == DB_NOFILL) { + dbuf_clear_data(db); + } else { + ASSERT3U(db->db_state, ==, DB_CACHED); + } + mutex_exit(&db->db_mtx); +} + +void +dbuf_unoverride(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; + uint64_t txg = dr->dr_txg; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + /* + * This assert is valid because dmu_sync() expects to be called by + * a zilog's get_data while holding a range lock. This call only + * comes from dbuf_dirty() callers who must also hold a range lock. + */ + ASSERT(dr->dt.dl.dr_override_state != DR_IN_DMU_SYNC); + ASSERT(db->db_level == 0); + + if (db->db_blkid == DMU_BONUS_BLKID || + dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) + return; + + ASSERT(db->db_data_pending != dr); + + /* free this block */ + if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) + zio_free(db->db_objset->os_spa, txg, bp); + + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + dr->dt.dl.dr_nopwrite = B_FALSE; + dr->dt.dl.dr_has_raw_params = B_FALSE; + + /* + * Release the already-written buffer, so we leave it in + * a consistent dirty state. Note that all callers are + * modifying the buffer, so they will immediately do + * another (redundant) arc_release(). Therefore, leave + * the buf thawed to save the effort of freezing & + * immediately re-thawing it. + */ + arc_release(dr->dt.dl.dr_data, db); +} + +/* + * Evict (if it's unreferenced) or clear (if it's referenced) any level-0 + * data blocks in the free range, so that any future readers will find + * empty blocks. 
+ */ +void +dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db_search; + dmu_buf_impl_t *db, *db_next; + uint64_t txg = tx->tx_txg; + avl_index_t where; + dbuf_dirty_record_t *dr; + + if (end_blkid > dn->dn_maxblkid && + !(start_blkid == DMU_SPILL_BLKID || end_blkid == DMU_SPILL_BLKID)) + end_blkid = dn->dn_maxblkid; + dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid); + + db_search = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + db_search->db_level = 0; + db_search->db_blkid = start_blkid; + db_search->db_state = DB_SEARCH; + + mutex_enter(&dn->dn_dbufs_mtx); + db = avl_find(&dn->dn_dbufs, db_search, &where); + ASSERT3P(db, ==, NULL); + + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + for (; db != NULL; db = db_next) { + db_next = AVL_NEXT(&dn->dn_dbufs, db); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + if (db->db_level != 0 || db->db_blkid > end_blkid) { + break; + } + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* found a level 0 buffer in the range */ + mutex_enter(&db->db_mtx); + if (dbuf_undirty(db, tx)) { + /* mutex has been dropped and dbuf destroyed */ + continue; + } + + if (db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL || + db->db_state == DB_EVICTING) { + ASSERT(db->db.db_data == NULL); + mutex_exit(&db->db_mtx); + continue; + } + if (db->db_state == DB_READ || db->db_state == DB_FILL) { + /* will be handled in dbuf_read_done or dbuf_rele */ + db->db_freed_in_flight = TRUE; + mutex_exit(&db->db_mtx); + continue; + } + if (zfs_refcount_count(&db->db_holds) == 0) { + ASSERT(db->db_buf); + dbuf_destroy(db); + continue; + } + /* The dbuf is referenced */ + + dr = list_head(&db->db_dirty_records); + if (dr != NULL) { + if (dr->dr_txg == txg) { + /* + * This buffer is "in-use", re-adjust the file + * size to reflect that this buffer may + * contain new data when we sync. + */ + if (db->db_blkid != DMU_SPILL_BLKID && + db->db_blkid > dn->dn_maxblkid) + dn->dn_maxblkid = db->db_blkid; + dbuf_unoverride(dr); + } else { + /* + * This dbuf is not dirty in the open context. + * Either uncache it (if it's not referenced in + * the open context) or reset its contents to + * empty.
+ */ + dbuf_fix_old_data(db, txg); + } + } + /* clear the contents if it's cached */ + if (db->db_state == DB_CACHED) { + ASSERT(db->db.db_data != NULL); + arc_release(db->db_buf, db); + rw_enter(&db->db_rwlock, RW_WRITER); + bzero(db->db.db_data, db->db.db_size); + rw_exit(&db->db_rwlock); + arc_buf_freeze(db->db_buf); + } + + mutex_exit(&db->db_mtx); + } + + kmem_free(db_search, sizeof (dmu_buf_impl_t)); + mutex_exit(&dn->dn_dbufs_mtx); +} + +void +dbuf_new_size(dmu_buf_impl_t *db, int size, dmu_tx_t *tx) +{ + arc_buf_t *buf, *old_buf; + dbuf_dirty_record_t *dr; + int osize = db->db.db_size; + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + dnode_t *dn; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + /* + * XXX we should be doing a dbuf_read, checking the return + * value and returning that up to our callers + */ + dmu_buf_will_dirty(&db->db, tx); + + /* create the data buffer for the new block */ + buf = arc_alloc_buf(dn->dn_objset->os_spa, db, type, size); + + /* copy old block data to the new block */ + old_buf = db->db_buf; + bcopy(old_buf->b_data, buf->b_data, MIN(osize, size)); + /* zero the remainder */ + if (size > osize) + bzero((uint8_t *)buf->b_data + osize, size - osize); + + mutex_enter(&db->db_mtx); + dbuf_set_data(db, buf); + arc_buf_destroy(old_buf, db); + db->db.db_size = size; + + dr = list_head(&db->db_dirty_records); + /* dirty record added by dmu_buf_will_dirty() */ + VERIFY(dr != NULL); + if (db->db_level == 0) + dr->dt.dl.dr_data = buf; + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + ASSERT3U(dr->dr_accounted, ==, osize); + dr->dr_accounted = size; + mutex_exit(&db->db_mtx); + + dmu_objset_willuse_space(dn->dn_objset, size - osize, tx); + DB_DNODE_EXIT(db); +} + +void +dbuf_release_bp(dmu_buf_impl_t *db) +{ + objset_t *os __maybe_unused = db->db_objset; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(arc_released(os->os_phys_buf) || + list_link_active(&os->os_dsl_dataset->ds_synced_link)); + ASSERT(db->db_parent == NULL || arc_released(db->db_parent->db_buf)); + + (void) arc_release(db->db_buf, db); +} + +/* + * We already have a dirty record for this TXG, and we are being + * dirtied again. + */ +static void +dbuf_redirty(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID) { + /* + * If this buffer has already been written out, + * we now need to reset its state. + */ + dbuf_unoverride(dr); + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) { + /* Already released on initial dirty, so just thaw. */ + ASSERT(arc_released(db->db_buf)); + arc_buf_thaw(db->db_buf); + } + } +} + +dbuf_dirty_record_t * +dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + objset_t *os; + dbuf_dirty_record_t *dr, *dr_next, *dr_head; + int txgoff = tx->tx_txg & TXG_MASK; + boolean_t drop_struct_rwlock = B_FALSE; + + ASSERT(tx->tx_txg != 0); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + DMU_TX_DIRTY_BUF(tx, db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + /* + * Shouldn't dirty a regular buffer in syncing context. Private + * objects may be dirtied in syncing context, but only if they + * were already pre-dirtied in open context.
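+ *
+ * For illustration (editorial sketch, not from the original change),
+ * the usual open-context route into dbuf_dirty() is
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, offset, size);
+ *	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ *	dmu_buf_will_dirty(&db->db, tx);	(-> dbuf_dirty(db, tx))
+ *	dmu_tx_commit(tx);
+ *
+ * "Pre-dirtied in open context" above refers to exactly this sequence
+ * having run before any syncing-context re-dirty.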
+ */ +#ifdef ZFS_DEBUG + if (dn->dn_objset->os_dsl_dataset != NULL) { + rrw_enter(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, + RW_READER, FTAG); + } + ASSERT(!dmu_tx_is_syncing(tx) || + BP_IS_HOLE(dn->dn_objset->os_rootbp) || + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_exit(&dn->dn_objset->os_dsl_dataset->ds_bp_rwlock, FTAG); +#endif + /* + * We make this assert for private objects as well, but after we + * check if we're already dirty. They are allowed to re-dirty + * in syncing context. + */ + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + mutex_enter(&db->db_mtx); + /* + * XXX make this true for indirects too? The problem is that + * transactions created with dmu_tx_create_assigned() from + * syncing context don't bother holding ahead. + */ + ASSERT(db->db_level != 0 || + db->db_state == DB_CACHED || db->db_state == DB_FILL || + db->db_state == DB_NOFILL); + + mutex_enter(&dn->dn_mtx); + dnode_set_dirtyctx(dn, tx, db); + if (tx->tx_txg > dn->dn_dirty_txg) + dn->dn_dirty_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + if (db->db_blkid == DMU_SPILL_BLKID) + dn->dn_have_spill = B_TRUE; + + /* + * If this buffer is already dirty, we're done. + */ + dr_head = list_head(&db->db_dirty_records); + ASSERT(dr_head == NULL || dr_head->dr_txg <= tx->tx_txg || + db->db.db_object == DMU_META_DNODE_OBJECT); + dr_next = dbuf_find_dirty_lte(db, tx->tx_txg); + if (dr_next && dr_next->dr_txg == tx->tx_txg) { + DB_DNODE_EXIT(db); + + dbuf_redirty(dr_next); + mutex_exit(&db->db_mtx); + return (dr_next); + } + + /* + * Only valid if not already dirty. + */ + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + (dmu_tx_is_syncing(tx) ? DN_DIRTY_SYNC : DN_DIRTY_OPEN)); + + ASSERT3U(dn->dn_nlevels, >, db->db_level); + + /* + * We should only be dirtying in syncing context if it's the + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. + */ + os = dn->dn_objset; + VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(os->os_spa)); +#ifdef ZFS_DEBUG + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_READER, FTAG); + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); + if (dn->dn_objset->os_dsl_dataset != NULL) + rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); +#endif + ASSERT(db->db.db_size != 0); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + if (db->db_blkid != DMU_BONUS_BLKID) { + dmu_objset_willuse_space(os, db->db.db_size, tx); + } + + /* + * If this buffer is dirty in an old transaction group we need + * to make a copy of it so that the changes we make in this + * transaction group won't leak out when we sync the older txg. 
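+ *
+ * Concrete example (editorial note): suppose this dbuf was dirtied in
+ * txg 8, txg 8 is now syncing, and a writer dirties it again in txg 9.
+ * dbuf_fix_old_data() gives txg 8's dirty record a private copy of the
+ * data, so txg 9's modifications cannot bleed into the block image
+ * that txg 8 is writing out.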
+ */ + dr = kmem_zalloc(sizeof (dbuf_dirty_record_t), KM_SLEEP); + list_link_init(&dr->dr_dirty_node); + list_link_init(&dr->dr_dbuf_node); + if (db->db_level == 0) { + void *data_old = db->db_buf; + + if (db->db_state != DB_NOFILL) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db.db_data; + } else if (db->db.db_object != DMU_META_DNODE_OBJECT) { + /* + * Release the data buffer from the cache so + * that we can modify it without impacting + * possible other users of this cached data + * block. Note that indirect blocks and + * private objects are not released until the + * syncing state (since they are only modified + * then). + */ + arc_release(db->db_buf, db); + dbuf_fix_old_data(db, tx->tx_txg); + data_old = db->db_buf; + } + ASSERT(data_old != NULL); + } + dr->dt.dl.dr_data = data_old; + } else { + mutex_init(&dr->dt.di.dr_mtx, NULL, MUTEX_NOLOCKDEP, NULL); + list_create(&dr->dt.di.dr_children, + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + if (db->db_blkid != DMU_BONUS_BLKID) + dr->dr_accounted = db->db.db_size; + dr->dr_dbuf = db; + dr->dr_txg = tx->tx_txg; + list_insert_before(&db->db_dirty_records, dr_next, dr); + + /* + * We could have been freed_in_flight between the dbuf_noread + * and dbuf_dirty. We win, as though the dbuf_noread() had + * happened after the free. + */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_free_ranges[txgoff] != NULL) { + range_tree_clear(dn->dn_free_ranges[txgoff], + db->db_blkid, 1); + } + mutex_exit(&dn->dn_mtx); + db->db_freed_in_flight = FALSE; + } + + /* + * This buffer is now part of this txg + */ + dbuf_add_ref(db, (void *)(uintptr_t)tx->tx_txg); + db->db_dirtycnt += 1; + ASSERT3U(db->db_dirtycnt, <=, 3); + + mutex_exit(&db->db_mtx); + + if (db->db_blkid == DMU_BONUS_BLKID || + db->db_blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); + } + + if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) { + rw_enter(&dn->dn_struct_rwlock, RW_READER); + drop_struct_rwlock = B_TRUE; + } + + /* + * If we are overwriting a dedup BP, then unless it is snapshotted, + * when we get to syncing context we will need to decrement its + * refcount in the DDT. Prefetch the relevant DDT block so that + * syncing context won't have to wait for the i/o. + */ + if (db->db_blkptr != NULL) { + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + ddt_prefetch(os->os_spa, db->db_blkptr); + dmu_buf_unlock_parent(db, dblt, FTAG); + } + + /* + * We need to hold the dn_struct_rwlock to make this assertion, + * because it protects dn_phys / dn_next_nlevels from changing. 
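+ *
+ * (Editorial note: the assertion also accepts a level increase that
+ * was recorded in one of the two previous open txgs, which is why the
+ * dn_next_nlevels slots for txg-1 and txg-2 are consulted below.)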
+ */ + ASSERT((dn->dn_phys->dn_nlevels == 0 && db->db_level == 0) || + dn->dn_phys->dn_nlevels > db->db_level || + dn->dn_next_nlevels[txgoff] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-1) & TXG_MASK] > db->db_level || + dn->dn_next_nlevels[(tx->tx_txg-2) & TXG_MASK] > db->db_level); + + + if (db->db_level == 0) { + ASSERT(!db->db_objset->os_raw_receive || + dn->dn_maxblkid >= db->db_blkid); + dnode_new_blkid(dn, db->db_blkid, tx, + drop_struct_rwlock, B_FALSE); + ASSERT(dn->dn_maxblkid >= db->db_blkid); + } + + if (db->db_level+1 < dn->dn_nlevels) { + dmu_buf_impl_t *parent = db->db_parent; + dbuf_dirty_record_t *di; + int parent_held = FALSE; + + if (db->db_parent == NULL || db->db_parent == dn->dn_dbuf) { + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, FTAG); + ASSERT(parent != NULL); + parent_held = TRUE; + } + if (drop_struct_rwlock) + rw_exit(&dn->dn_struct_rwlock); + ASSERT3U(db->db_level + 1, ==, parent->db_level); + di = dbuf_dirty(parent, tx); + if (parent_held) + dbuf_rele(parent, FTAG); + + mutex_enter(&db->db_mtx); + /* + * Since we've dropped the mutex, it's possible that + * dbuf_undirty() might have changed this out from under us. + */ + if (list_head(&db->db_dirty_records) == dr || + dn->dn_object == DMU_META_DNODE_OBJECT) { + mutex_enter(&di->dt.di.dr_mtx); + ASSERT3U(di->dr_txg, ==, tx->tx_txg); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&di->dt.di.dr_children, dr); + mutex_exit(&di->dt.di.dr_mtx); + dr->dr_parent = di; + } + mutex_exit(&db->db_mtx); + } else { + ASSERT(db->db_level + 1 == dn->dn_nlevels); + ASSERT(db->db_blkid < dn->dn_nblkptr); + ASSERT(db->db_parent == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + ASSERT(!list_link_active(&dr->dr_dirty_node)); + list_insert_tail(&dn->dn_dirty_records[txgoff], dr); + mutex_exit(&dn->dn_mtx); + if (drop_struct_rwlock) + rw_exit(&dn->dn_struct_rwlock); + } + + dnode_setdirty(dn, tx); + DB_DNODE_EXIT(db); + return (dr); +} + +static void +dbuf_undirty_bonus(dbuf_dirty_record_t *dr) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + + if (dr->dt.dl.dr_data != db->db.db_data) { + struct dnode *dn = DB_DNODE(db); + int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots); + + kmem_free(dr->dt.dl.dr_data, max_bonuslen); + arc_space_return(max_bonuslen, ARC_SPACE_BONUS); + } + db->db_data_pending = NULL; + ASSERT(list_next(&db->db_dirty_records, dr) == NULL); + list_remove(&db->db_dirty_records, dr); + if (dr->dr_dbuf->db_level != 0) { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + ASSERT3U(db->db_dirtycnt, >, 0); + db->db_dirtycnt -= 1; +} + +/* + * Undirty a buffer in the transaction group referenced by the given + * transaction. Return whether this evicted the dbuf. + */ +static boolean_t +dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dnode_t *dn; + uint64_t txg = tx->tx_txg; + dbuf_dirty_record_t *dr; + + ASSERT(txg != 0); + + /* + * Due to our use of dn_nlevels below, this can only be called + * in open context, unless we are operating on the MOS. + * From syncing context, dn_nlevels may be different from the + * dn_nlevels used when dbuf was dirtied. 
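+ *
+ * (Editorial note: the common open-context caller is dbuf_free_range()
+ * above, undoing a not-yet-synced dirty buffer that falls inside a
+ * freed range so the dbuf can be destroyed or zeroed immediately.)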
+ */ + ASSERT(db->db_objset == + dmu_objset_pool(db->db_objset)->dp_meta_objset || + txg != spa_syncing_txg(dmu_objset_spa(db->db_objset))); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); + + /* + * If this buffer is not dirty, we're done. + */ + dr = dbuf_find_dirty_eq(db, txg); + if (dr == NULL) + return (B_FALSE); + ASSERT(dr->dr_dbuf == db); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); + + ASSERT(db->db.db_size != 0); + + dsl_pool_undirty_space(dmu_objset_pool(dn->dn_objset), + dr->dr_accounted, txg); + + list_remove(&db->db_dirty_records, dr); + + /* + * Note that there are three places in dbuf_dirty() + * where this dirty record may be put on a list. + * Make sure to do a list_remove corresponding to + * every one of those list_insert calls. + */ + if (dr->dr_parent) { + mutex_enter(&dr->dr_parent->dt.di.dr_mtx); + list_remove(&dr->dr_parent->dt.di.dr_children, dr); + mutex_exit(&dr->dr_parent->dt.di.dr_mtx); + } else if (db->db_blkid == DMU_SPILL_BLKID || + db->db_level + 1 == dn->dn_nlevels) { + ASSERT(db->db_blkptr == NULL || db->db_parent == dn->dn_dbuf); + mutex_enter(&dn->dn_mtx); + list_remove(&dn->dn_dirty_records[txg & TXG_MASK], dr); + mutex_exit(&dn->dn_mtx); + } + DB_DNODE_EXIT(db); + + if (db->db_state != DB_NOFILL) { + dbuf_unoverride(dr); + + ASSERT(db->db_buf != NULL); + ASSERT(dr->dt.dl.dr_data != NULL); + if (dr->dt.dl.dr_data != db->db_buf) + arc_buf_destroy(dr->dt.dl.dr_data, db); + } + + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + + ASSERT(db->db_dirtycnt > 0); + db->db_dirtycnt -= 1; + + if (zfs_refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { + ASSERT(db->db_state == DB_NOFILL || arc_released(db->db_buf)); + dbuf_destroy(db); + return (B_TRUE); + } + + return (B_FALSE); +} + +static void +dmu_buf_will_dirty_impl(dmu_buf_t *db_fake, int flags, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(tx->tx_txg != 0); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + + /* + * Quick check for dirtiness. For already dirty blocks, this + * reduces runtime of this function by >90%, and overall performance + * by 50% for some workloads (e.g. file deletion with indirect blocks + * cached). + */ + mutex_enter(&db->db_mtx); + + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = dbuf_find_dirty_eq(db, tx->tx_txg); + /* + * It's possible that it is already dirty but not cached, + * because there are some calls to dbuf_dirty() that don't + * go through dmu_buf_will_dirty(). + */ + if (dr != NULL) { + /* This dbuf is already dirty and cached. 
*/ + dbuf_redirty(dr); + mutex_exit(&db->db_mtx); + return; + } + } + mutex_exit(&db->db_mtx); + + DB_DNODE_ENTER(db); + if (RW_WRITE_HELD(&DB_DNODE(db)->dn_struct_rwlock)) + flags |= DB_RF_HAVESTRUCT; + DB_DNODE_EXIT(db); + (void) dbuf_read(db, NULL, flags); + (void) dbuf_dirty(db, tx); +} + +void +dmu_buf_will_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_will_dirty_impl(db_fake, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH, tx); +} + +boolean_t +dmu_buf_is_dirty(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; + + mutex_enter(&db->db_mtx); + dr = dbuf_find_dirty_eq(db, tx->tx_txg); + mutex_exit(&db->db_mtx); + return (dr != NULL); +} + +void +dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_state = DB_NOFILL; + DTRACE_SET_STATE(db, "allocating NOFILL buffer"); + dmu_buf_will_fill(db_fake, tx); +} + +void +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(tx->tx_txg != 0); + ASSERT(db->db_level == 0); + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + + ASSERT(db->db.db_object != DMU_META_DNODE_OBJECT || + dmu_tx_private_ok(tx)); + + dbuf_noread(db); + (void) dbuf_dirty(db, tx); +} + +/* + * This function is effectively the same as dmu_buf_will_dirty(), but + * indicates the caller expects raw encrypted data in the db, and provides + * the crypt params (byteorder, salt, iv, mac) which should be stored in the + * blkptr_t when this dbuf is written. This is only used for blocks of + * dnodes, during raw receive. + */ +void +dmu_buf_set_crypt_params(dmu_buf_t *db_fake, boolean_t byteorder, + const uint8_t *salt, const uint8_t *iv, const uint8_t *mac, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dbuf_dirty_record_t *dr; + + /* + * dr_has_raw_params is only processed for blocks of dnodes + * (see dbuf_sync_dnode_leaf_crypt()). + */ + ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); + ASSERT3U(db->db_level, ==, 0); + ASSERT(db->db_objset->os_raw_receive); + + dmu_buf_will_dirty_impl(db_fake, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_NO_DECRYPT, tx); + + dr = dbuf_find_dirty_eq(db, tx->tx_txg); + + ASSERT3P(dr, !=, NULL); + + dr->dt.dl.dr_has_raw_params = B_TRUE; + dr->dt.dl.dr_byteorder = byteorder; + bcopy(salt, dr->dt.dl.dr_salt, ZIO_DATA_SALT_LEN); + bcopy(iv, dr->dt.dl.dr_iv, ZIO_DATA_IV_LEN); + bcopy(mac, dr->dt.dl.dr_mac, ZIO_DATA_MAC_LEN); +} + +static void +dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) +{ + struct dirty_leaf *dl; + dbuf_dirty_record_t *dr; + + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; + dl->dr_overridden_by = *bp; + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = dr->dr_txg; +} + +/* ARGSUSED */ +void +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dbuf_states_t old_state; + mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + + old_state = db->db_state; + db->db_state = DB_CACHED; + if (old_state == DB_FILL) { + if (db->db_level == 0 && db->db_freed_in_flight) { + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + /* we were freed while filling */ + /* XXX dbuf_undirty? 
*/ + bzero(db->db.db_data, db->db.db_size); + db->db_freed_in_flight = FALSE; + DTRACE_SET_STATE(db, + "fill done handling freed in flight"); + } else { + DTRACE_SET_STATE(db, "fill done"); + } + cv_broadcast(&db->db_changed); + } + mutex_exit(&db->db_mtx); +} + +void +dmu_buf_write_embedded(dmu_buf_t *dbuf, void *data, + bp_embedded_type_t etype, enum zio_compress comp, + int uncompressed_size, int compressed_size, int byteorder, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + struct dirty_leaf *dl; + dmu_object_type_t type; + dbuf_dirty_record_t *dr; + + if (etype == BP_EMBEDDED_TYPE_DATA) { + ASSERT(spa_feature_is_active(dmu_objset_spa(db->db_objset), + SPA_FEATURE_EMBEDDED_DATA)); + } + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + + dmu_buf_will_not_fill(dbuf, tx); + + dr = list_head(&db->db_dirty_records); + ASSERT3U(dr->dr_txg, ==, tx->tx_txg); + dl = &dr->dt.dl; + encode_embedded_bp_compressed(&dl->dr_overridden_by, + data, comp, uncompressed_size, compressed_size); + BPE_SET_ETYPE(&dl->dr_overridden_by, etype); + BP_SET_TYPE(&dl->dr_overridden_by, type); + BP_SET_LEVEL(&dl->dr_overridden_by, 0); + BP_SET_BYTEORDER(&dl->dr_overridden_by, byteorder); + + dl->dr_override_state = DR_OVERRIDDEN; + dl->dr_overridden_by.blk_birth = dr->dr_txg; +} + +void +dmu_buf_redact(dmu_buf_t *dbuf, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + dmu_object_type_t type; + ASSERT(dsl_dataset_feature_is_active(db->db_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + + DB_DNODE_ENTER(db); + type = DB_DNODE(db)->dn_type; + DB_DNODE_EXIT(db); + + ASSERT0(db->db_level); + dmu_buf_will_not_fill(dbuf, tx); + + blkptr_t bp = { { { {0} } } }; + BP_SET_TYPE(&bp, type); + BP_SET_LEVEL(&bp, 0); + BP_SET_BIRTH(&bp, tx->tx_txg, 0); + BP_SET_REDACTED(&bp); + BPE_SET_LSIZE(&bp, dbuf->db_size); + + dbuf_override_impl(db, &bp, tx); +} + +/* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. + */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!zfs_refcount_is_zero(&db->db_holds)); + ASSERT(db->db_blkid != DMU_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT3U(dbuf_is_metadata(db), ==, arc_is_metadata(buf)); + ASSERT(buf != NULL); + ASSERT3U(arc_buf_lsize(buf), ==, db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + /* + * In practice, we will never have a case where we have an + * encrypted arc buffer while additional holds exist on the + * dbuf. We don't handle this here so we simply assert that + * fact instead. 
+ */ + ASSERT(!arc_is_encrypted(buf)); + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + arc_buf_destroy(buf, db); + xuio_stat_wbuf_copied(); + return; + } + + xuio_stat_wbuf_nocopy(); + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = list_head(&db->db_dirty_records); + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + arc_buf_destroy(db->db_buf, db); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + arc_buf_destroy(db->db_buf, db); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + DTRACE_SET_STATE(db, "filling assigned arcbuf"); + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dmu_buf_fill_done(&db->db, tx); +} + +void +dbuf_destroy(dmu_buf_impl_t *db) +{ + dnode_t *dn; + dmu_buf_impl_t *parent = db->db_parent; + dmu_buf_impl_t *dndb; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(zfs_refcount_is_zero(&db->db_holds)); + + if (db->db_buf != NULL) { + arc_buf_destroy(db->db_buf, db); + db->db_buf = NULL; + } + + if (db->db_blkid == DMU_BONUS_BLKID) { + int slots = DB_DNODE(db)->dn_num_slots; + int bonuslen = DN_SLOTS_TO_BONUSLEN(slots); + if (db->db.db_data != NULL) { + kmem_free(db->db.db_data, bonuslen); + arc_space_return(bonuslen, ARC_SPACE_BONUS); + db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "buffer cleared"); + } + } + + dbuf_clear_data(db); + + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, + db->db.db_size, db); + + if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); + } + db->db_caching_status = DB_NO_CACHE; + } + + ASSERT(db->db_state == DB_UNCACHED || db->db_state == DB_NOFILL); + ASSERT(db->db_data_pending == NULL); + ASSERT(list_is_empty(&db->db_dirty_records)); + + db->db_state = DB_EVICTING; + DTRACE_SET_STATE(db, "buffer eviction started"); + db->db_blkptr = NULL; + + /* + * Now that db_state is DB_EVICTING, nobody else can find this via + * the hash table. We can now drop db_mtx, which allows us to + * acquire the dn_dbufs_mtx. + */ + mutex_exit(&db->db_mtx); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dndb = dn->dn_dbuf; + if (db->db_blkid != DMU_BONUS_BLKID) { + boolean_t needlock = !MUTEX_HELD(&dn->dn_dbufs_mtx); + if (needlock) + mutex_enter_nested(&dn->dn_dbufs_mtx, + NESTED_SINGLE); + avl_remove(&dn->dn_dbufs, db); + membar_producer(); + DB_DNODE_EXIT(db); + if (needlock) + mutex_exit(&dn->dn_dbufs_mtx); + /* + * Decrementing the dbuf count means that the hold corresponding + * to the removed dbuf is no longer discounted in dnode_move(), + * so the dnode cannot be moved until after we release the hold. + * The membar_producer() ensures visibility of the decremented + * value in dnode_move(), since DB_DNODE_EXIT doesn't actually + * release any lock. 
+ */ + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, db, B_TRUE); + db->db_dnode_handle = NULL; + + dbuf_hash_remove(db); + } else { + DB_DNODE_EXIT(db); + } + + ASSERT(zfs_refcount_is_zero(&db->db_holds)); + + db->db_parent = NULL; + + ASSERT(db->db_buf == NULL); + ASSERT(db->db.db_data == NULL); + ASSERT(db->db_hash_next == NULL); + ASSERT(db->db_blkptr == NULL); + ASSERT(db->db_data_pending == NULL); + ASSERT3U(db->db_caching_status, ==, DB_NO_CACHE); + ASSERT(!multilist_link_active(&db->db_cache_link)); + + kmem_cache_free(dbuf_kmem_cache, db); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); + + /* + * If this dbuf is referenced from an indirect dbuf, + * decrement the ref count on the indirect dbuf. + */ + if (parent && parent != dndb) { + mutex_enter(&parent->db_mtx); + dbuf_rele_and_unlock(parent, db, B_TRUE); + } +} + +/* + * Note: While bpp will always be updated if the function returns success, + * parentp will not be updated if the dnode does not have dn_dbuf filled in; + * this happens when the dnode is the meta-dnode, or {user|group|project}used + * object. + */ +__attribute__((always_inline)) +static inline int +dbuf_findbp(dnode_t *dn, int level, uint64_t blkid, int fail_sparse, + dmu_buf_impl_t **parentp, blkptr_t **bpp) +{ + *parentp = NULL; + *bpp = NULL; + + ASSERT(blkid != DMU_BONUS_BLKID); + + if (blkid == DMU_SPILL_BLKID) { + mutex_enter(&dn->dn_mtx); + if (dn->dn_have_spill && + (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + *bpp = DN_SPILL_BLKPTR(dn->dn_phys); + else + *bpp = NULL; + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + mutex_exit(&dn->dn_mtx); + return (0); + } + + int nlevels = + (dn->dn_phys->dn_nlevels == 0) ? 1 : dn->dn_phys->dn_nlevels; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT3U(level * epbs, <, 64); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + /* + * This assertion shouldn't trip as long as the max indirect block size + * is less than 1M. The reason for this is that up to that point, + * the number of levels required to address an entire object with blocks + * of size SPA_MINBLOCKSIZE satisfies nlevels * epbs + 1 <= 64. In + * other words, if N * epbs + 1 > 64, then if (N-1) * epbs + 1 > 55 + * (i.e. we can address the entire object), objects will all use at most + * N-1 levels and the assertion won't overflow. However, once epbs is + * 13, 4 * 13 + 1 = 53, but 5 * 13 + 1 = 66. Then, 4 levels will not be + * enough to address an entire object, so objects will have 5 levels, + * but then this assertion will overflow. + * + * All this is to say that if we ever increase DN_MAX_INDBLKSHIFT, we + * need to redo this logic to handle overflows. 
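+ *
+ * (Editorial worked example: epbs counts blkptr_t entries per indirect
+ * block, and a blkptr_t is 128 bytes (SPA_BLKPTRSHIFT == 7), so
+ * epbs == 13 corresponds to an indirect block size of 2^(13+7) = 1M.
+ * That is why 1M is the threshold cited above.)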
+ */ + ASSERT(level >= nlevels || + ((nlevels - level - 1) * epbs) + + highbit64(dn->dn_phys->dn_nblkptr) <= 64); + if (level >= nlevels || + blkid >= ((uint64_t)dn->dn_phys->dn_nblkptr << + ((nlevels - level - 1) * epbs)) || + (fail_sparse && + blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs)))) { + /* the buffer has no parent yet */ + return (SET_ERROR(ENOENT)); + } else if (level < nlevels-1) { + /* this block is referenced from an indirect block */ + int err; + + err = dbuf_hold_impl(dn, level + 1, + blkid >> epbs, fail_sparse, FALSE, NULL, parentp); + + if (err) + return (err); + err = dbuf_read(*parentp, NULL, + (DB_RF_HAVESTRUCT | DB_RF_NOPREFETCH | DB_RF_CANFAIL)); + if (err) { + dbuf_rele(*parentp, NULL); + *parentp = NULL; + return (err); + } + rw_enter(&(*parentp)->db_rwlock, RW_READER); + *bpp = ((blkptr_t *)(*parentp)->db.db_data) + + (blkid & ((1ULL << epbs) - 1)); + if (blkid > (dn->dn_phys->dn_maxblkid >> (level * epbs))) + ASSERT(BP_IS_HOLE(*bpp)); + rw_exit(&(*parentp)->db_rwlock); + return (0); + } else { + /* the block is referenced from the dnode */ + ASSERT3U(level, ==, nlevels-1); + ASSERT(dn->dn_phys->dn_nblkptr == 0 || + blkid < dn->dn_phys->dn_nblkptr); + if (dn->dn_dbuf) { + dbuf_add_ref(dn->dn_dbuf, NULL); + *parentp = dn->dn_dbuf; + } + *bpp = &dn->dn_phys->dn_blkptr[blkid]; + return (0); + } +} + +static dmu_buf_impl_t * +dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, + dmu_buf_impl_t *parent, blkptr_t *blkptr) +{ + objset_t *os = dn->dn_objset; + dmu_buf_impl_t *db, *odb; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT(dn->dn_type != DMU_OT_NONE); + + db = kmem_cache_alloc(dbuf_kmem_cache, KM_SLEEP); + + list_create(&db->db_dirty_records, sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dbuf_node)); + + db->db_objset = os; + db->db.db_object = dn->dn_object; + db->db_level = level; + db->db_blkid = blkid; + db->db_dirtycnt = 0; + db->db_dnode_handle = dn->dn_handle; + db->db_parent = parent; + db->db_blkptr = blkptr; + + db->db_user = NULL; + db->db_user_immediate_evict = FALSE; + db->db_freed_in_flight = FALSE; + db->db_pending_evict = FALSE; + + if (blkid == DMU_BONUS_BLKID) { + ASSERT3P(parent, ==, dn->dn_dbuf); + db->db.db_size = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT3U(db->db.db_size, >=, dn->dn_bonuslen); + db->db.db_offset = DMU_BONUS_BLKID; + db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "bonus buffer created"); + db->db_caching_status = DB_NO_CACHE; + /* the bonus dbuf is not placed in the hash table */ + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); + return (db); + } else if (blkid == DMU_SPILL_BLKID) { + db->db.db_size = (blkptr != NULL) ? + BP_GET_LSIZE(blkptr) : SPA_MINBLOCKSIZE; + db->db.db_offset = 0; + } else { + int blocksize = + db->db_level ? 1 << dn->dn_indblkshift : dn->dn_datablksz; + db->db.db_size = blocksize; + db->db.db_offset = db->db_blkid * blocksize; + } + + /* + * Hold the dn_dbufs_mtx while we get the new dbuf + * in the hash table *and* added to the dbufs list. + * This prevents a possible deadlock with someone + * trying to look up this dbuf before it's added to the + * dn_dbufs list. 
+ */ + mutex_enter(&dn->dn_dbufs_mtx); + db->db_state = DB_EVICTING; /* not worth logging this state change */ + if ((odb = dbuf_hash_insert(db)) != NULL) { + /* someone else inserted it first */ + kmem_cache_free(dbuf_kmem_cache, db); + mutex_exit(&dn->dn_dbufs_mtx); + DBUF_STAT_BUMP(hash_insert_race); + return (odb); + } + avl_add(&dn->dn_dbufs, db); + + db->db_state = DB_UNCACHED; + DTRACE_SET_STATE(db, "regular buffer created"); + db->db_caching_status = DB_NO_CACHE; + mutex_exit(&dn->dn_dbufs_mtx); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_DBUF); + + if (parent && parent != dn->dn_dbuf) + dbuf_add_ref(parent, db); + + ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || + zfs_refcount_count(&dn->dn_holds) > 0); + (void) zfs_refcount_add(&dn->dn_holds, db); + + dprintf_dbuf(db, "db=%p\n", db); + + return (db); +} + +/* + * This function returns a block pointer and information about the object, + * given a dnode and a block. This is a publicly accessible version of + * dbuf_findbp that only returns some information, rather than the + * dbuf. Note that the dnode passed in must be held, and the dn_struct_rwlock + * should be locked as (at least) a reader. + */ +int +dbuf_dnode_findbp(dnode_t *dn, uint64_t level, uint64_t blkid, + blkptr_t *bp, uint16_t *datablkszsec, uint8_t *indblkshift) +{ + dmu_buf_impl_t *dbp = NULL; + blkptr_t *bp2; + int err = 0; + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + err = dbuf_findbp(dn, level, blkid, B_FALSE, &dbp, &bp2); + if (err == 0) { + *bp = *bp2; + if (dbp != NULL) + dbuf_rele(dbp, NULL); + if (datablkszsec != NULL) + *datablkszsec = dn->dn_phys->dn_datablkszsec; + if (indblkshift != NULL) + *indblkshift = dn->dn_phys->dn_indblkshift; + } + + return (err); +} + +typedef struct dbuf_prefetch_arg { + spa_t *dpa_spa; /* The spa to issue the prefetch in. */ + zbookmark_phys_t dpa_zb; /* The target block to prefetch. */ + int dpa_epbs; /* Entries (blkptr_t's) Per Block Shift. */ + int dpa_curlevel; /* The current level that we're reading */ + dnode_t *dpa_dnode; /* The dnode associated with the prefetch */ + zio_priority_t dpa_prio; /* The priority I/Os should be issued at. */ + zio_t *dpa_zio; /* The parent zio_t for all prefetches. */ + arc_flags_t dpa_aflags; /* Flags to pass to the final prefetch. */ +} dbuf_prefetch_arg_t; + +/* + * Actually issue the prefetch read for the block given. + */ +static void +dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) +{ + ASSERT(!BP_IS_REDACTED(bp) || + dsl_dataset_feature_is_active( + dpa->dpa_dnode->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) + return; + + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + arc_flags_t aflags = + dpa->dpa_aflags | ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + + /* dnodes are always read as raw and then converted later */ + if (BP_GET_TYPE(bp) == DMU_OT_DNODE && BP_IS_PROTECTED(bp) && + dpa->dpa_curlevel == 0) + zio_flags |= ZIO_FLAG_RAW; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + ASSERT3U(dpa->dpa_curlevel, ==, dpa->dpa_zb.zb_level); + ASSERT(dpa->dpa_zio != NULL); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, bp, NULL, NULL, + dpa->dpa_prio, zio_flags, &aflags, &dpa->dpa_zb); +} + +/* + * Called when an indirect block above our prefetch target is read in. This + * will either read in the next indirect block down the tree or issue the actual + * prefetch if the next block down is our target. 
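+ *
+ * Illustrative walk (editorial note): with epbs = 10, prefetching L0
+ * blkid 3000 of a 3-level object first reads the L2 indirect; this
+ * callback then reads the L1 indirect at blkid 2 (3000 >> 10), and the
+ * next callback finds the target bp at offset 952 (3000 & 1023) and
+ * hands it to dbuf_issue_final_prefetch().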
+ */ +static void +dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) +{ + dbuf_prefetch_arg_t *dpa = private; + + ASSERT3S(dpa->dpa_zb.zb_level, <, dpa->dpa_curlevel); + ASSERT3S(dpa->dpa_curlevel, >, 0); + + if (abuf == NULL) { + ASSERT(zio == NULL || zio->io_error != 0); + kmem_free(dpa, sizeof (*dpa)); + return; + } + ASSERT(zio == NULL || zio->io_error == 0); + + /* + * The dpa_dnode is only valid if we are called with a NULL + * zio. This indicates that the arc_read() returned without + * first calling zio_read() to issue a physical read. Once + * a physical read is made the dpa_dnode must be invalidated + * as the locks guarding it may have been dropped. If the + * dpa_dnode is still valid, then we want to add it to the dbuf + * cache. To do so, we must hold the dbuf associated with the block + * we just prefetched, read its contents so that we associate it + * with an arc_buf_t, and then release it. + */ + if (zio != NULL) { + ASSERT3S(BP_GET_LEVEL(zio->io_bp), ==, dpa->dpa_curlevel); + if (zio->io_flags & ZIO_FLAG_RAW_COMPRESS) { + ASSERT3U(BP_GET_PSIZE(zio->io_bp), ==, zio->io_size); + } else { + ASSERT3U(BP_GET_LSIZE(zio->io_bp), ==, zio->io_size); + } + ASSERT3P(zio->io_spa, ==, dpa->dpa_spa); + + dpa->dpa_dnode = NULL; + } else if (dpa->dpa_dnode != NULL) { + uint64_t curblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - + dpa->dpa_zb.zb_level)); + dmu_buf_impl_t *db = dbuf_hold_level(dpa->dpa_dnode, + dpa->dpa_curlevel, curblkid, FTAG); + if (db == NULL) { + kmem_free(dpa, sizeof (*dpa)); + arc_buf_destroy(abuf, private); + return; + } + + (void) dbuf_read(db, NULL, + DB_RF_MUST_SUCCEED | DB_RF_NOPREFETCH | DB_RF_HAVESTRUCT); + dbuf_rele(db, FTAG); + } + + dpa->dpa_curlevel--; + uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> + (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); + blkptr_t *bp = ((blkptr_t *)abuf->b_data) + + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + + ASSERT(!BP_IS_REDACTED(bp) || + dsl_dataset_feature_is_active( + dpa->dpa_dnode->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { + kmem_free(dpa, sizeof (*dpa)); + } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { + ASSERT3U(nextblkid, ==, dpa->dpa_zb.zb_blkid); + dbuf_issue_final_prefetch(dpa, bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + /* flag if L2ARC eligible, l2arc_noprefetch then decides */ + if (dpa->dpa_aflags & ARC_FLAG_L2CACHE) + iter_aflags |= ARC_FLAG_L2CACHE; + + ASSERT3U(dpa->dpa_curlevel, ==, BP_GET_LEVEL(bp)); + + SET_BOOKMARK(&zb, dpa->dpa_zb.zb_objset, + dpa->dpa_zb.zb_object, dpa->dpa_curlevel, nextblkid); + + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + bp, dbuf_prefetch_indirect_done, dpa, dpa->dpa_prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + + arc_buf_destroy(abuf, private); +} + +/* + * Issue prefetch reads for the given block on the given level. If the indirect + * blocks above that block are not in memory, we will read them in + * asynchronously. As a result, this call never blocks waiting for a read to + * complete. Note that the prefetch might fail if the dataset is encrypted and + * the encryption key is unmapped before the IO completes. 
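+ *
+ * Typical use (editorial sketch, not from the original change): the
+ * zfetch code issues speculative level-0 reads roughly as
+ *
+ *	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+ *	dbuf_prefetch(dn, 0, blkid, ZIO_PRIORITY_ASYNC_READ,
+ *	    ARC_FLAG_PREDICTIVE_PREFETCH);
+ *	rw_exit(&dn->dn_struct_rwlock);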
+ */ +void +dbuf_prefetch(dnode_t *dn, int64_t level, uint64_t blkid, zio_priority_t prio, + arc_flags_t aflags) +{ + blkptr_t bp; + int epbs, nlevels, curlevel; + uint64_t curblkid; + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + if (blkid > dn->dn_maxblkid) + return; + + if (level == 0 && dnode_block_freed(dn, blkid)) + return; + + /* + * This dnode hasn't been written to disk yet, so there's nothing to + * prefetch. + */ + nlevels = dn->dn_phys->dn_nlevels; + if (level >= nlevels || dn->dn_phys->dn_nblkptr == 0) + return; + + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + if (dn->dn_phys->dn_maxblkid < blkid << (epbs * level)) + return; + + dmu_buf_impl_t *db = dbuf_find(dn->dn_objset, dn->dn_object, + level, blkid); + if (db != NULL) { + mutex_exit(&db->db_mtx); + /* + * This dbuf already exists. It is either CACHED, or + * (we assume) about to be read or filled. + */ + return; + } + + /* + * Find the closest ancestor (indirect block) of the target block + * that is present in the cache. In this indirect block, we will + * find the bp that is at curlevel, curblkid. + */ + curlevel = level; + curblkid = blkid; + while (curlevel < nlevels - 1) { + int parent_level = curlevel + 1; + uint64_t parent_blkid = curblkid >> epbs; + dmu_buf_impl_t *db; + + if (dbuf_hold_impl(dn, parent_level, parent_blkid, + FALSE, TRUE, FTAG, &db) == 0) { + blkptr_t *bpp = db->db_buf->b_data; + bp = bpp[P2PHASE(curblkid, 1 << epbs)]; + dbuf_rele(db, FTAG); + break; + } + + curlevel = parent_level; + curblkid = parent_blkid; + } + + if (curlevel == nlevels - 1) { + /* No cached indirect blocks found. */ + ASSERT3U(curblkid, <, dn->dn_phys->dn_nblkptr); + bp = dn->dn_phys->dn_blkptr[curblkid]; + } + ASSERT(!BP_IS_REDACTED(&bp) || + dsl_dataset_feature_is_active(dn->dn_objset->os_dsl_dataset, + SPA_FEATURE_REDACTED_DATASETS)); + if (BP_IS_HOLE(&bp) || BP_IS_REDACTED(&bp)) + return; + + ASSERT3U(curlevel, ==, BP_GET_LEVEL(&bp)); + + zio_t *pio = zio_root(dmu_objset_spa(dn->dn_objset), NULL, NULL, + ZIO_FLAG_CANFAIL); + + dbuf_prefetch_arg_t *dpa = kmem_zalloc(sizeof (*dpa), KM_SLEEP); + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + SET_BOOKMARK(&dpa->dpa_zb, ds != NULL ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, level, blkid); + dpa->dpa_curlevel = curlevel; + dpa->dpa_prio = prio; + dpa->dpa_aflags = aflags; + dpa->dpa_spa = dn->dn_objset->os_spa; + dpa->dpa_dnode = dn; + dpa->dpa_epbs = epbs; + dpa->dpa_zio = pio; + + /* flag if L2ARC eligible, l2arc_noprefetch then decides */ + if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + dpa->dpa_aflags |= ARC_FLAG_L2CACHE; + + /* + * If we have the indirect just above us, no need to do the asynchronous + * prefetch chain; we'll just run the last step ourselves. If we're at + * a higher level, though, we want to issue the prefetches for all the + * indirect blocks asynchronously, so we can go on with whatever we were + * doing. + */ + if (curlevel == level) { + ASSERT3U(curblkid, ==, blkid); + dbuf_issue_final_prefetch(dpa, &bp); + kmem_free(dpa, sizeof (*dpa)); + } else { + arc_flags_t iter_aflags = ARC_FLAG_NOWAIT; + zbookmark_phys_t zb; + + /* flag if L2ARC eligible, l2arc_noprefetch then decides */ + if (DNODE_LEVEL_IS_L2CACHEABLE(dn, level)) + iter_aflags |= ARC_FLAG_L2CACHE; + + SET_BOOKMARK(&zb, ds != NULL ? 
ds->ds_object : DMU_META_OBJSET, + dn->dn_object, curlevel, curblkid); + (void) arc_read(dpa->dpa_zio, dpa->dpa_spa, + &bp, dbuf_prefetch_indirect_done, dpa, prio, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &iter_aflags, &zb); + } + /* + * We use pio here instead of dpa_zio since it's possible that + * dpa may have already been freed. + */ + zio_nowait(pio); +} + +/* + * Helper function for dbuf_hold_impl() to copy a buffer. Handles + * the case of encrypted, compressed and uncompressed buffers by + * allocating the new buffer, respectively, with arc_alloc_raw_buf(), + * arc_alloc_compressed_buf() or arc_alloc_buf(). + * + * NOTE: Declared noinline to avoid stack bloat in dbuf_hold_impl(). + */ +noinline static void +dbuf_hold_copy(dnode_t *dn, dmu_buf_impl_t *db) +{ + dbuf_dirty_record_t *dr = db->db_data_pending; + arc_buf_t *newdata, *data = dr->dt.dl.dr_data; + + newdata = dbuf_alloc_arcbuf_from_arcbuf(db, data); + dbuf_set_data(db, newdata); + rw_enter(&db->db_rwlock, RW_WRITER); + bcopy(data->b_data, db->db.db_data, arc_buf_size(data)); + rw_exit(&db->db_rwlock); +} + +/* + * Returns with db_holds incremented, and db_mtx not held. + * Note: dn_struct_rwlock must be held. + */ +int +dbuf_hold_impl(dnode_t *dn, uint8_t level, uint64_t blkid, + boolean_t fail_sparse, boolean_t fail_uncached, + void *tag, dmu_buf_impl_t **dbp) +{ + dmu_buf_impl_t *db, *parent = NULL; + + /* If the pool has been created, verify the tx_sync_lock is not held */ + spa_t *spa = dn->dn_objset->os_spa; + dsl_pool_t *dp = spa->spa_dsl_pool; + if (dp != NULL) { + ASSERT(!MUTEX_HELD(&dp->dp_tx.tx_sync_lock)); + } + + ASSERT(blkid != DMU_BONUS_BLKID); + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + ASSERT3U(dn->dn_nlevels, >, level); + + *dbp = NULL; + + /* dbuf_find() returns with db_mtx held */ + db = dbuf_find(dn->dn_objset, dn->dn_object, level, blkid); + + if (db == NULL) { + blkptr_t *bp = NULL; + int err; + + if (fail_uncached) + return (SET_ERROR(ENOENT)); + + ASSERT3P(parent, ==, NULL); + err = dbuf_findbp(dn, level, blkid, fail_sparse, &parent, &bp); + if (fail_sparse) { + if (err == 0 && bp && BP_IS_HOLE(bp)) + err = SET_ERROR(ENOENT); + if (err) { + if (parent) + dbuf_rele(parent, NULL); + return (err); + } + } + if (err && err != ENOENT) + return (err); + db = dbuf_create(dn, level, blkid, parent, bp); + } + + if (fail_uncached && db->db_state != DB_CACHED) { + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); + } + + if (db->db_buf != NULL) { + arc_buf_access(db->db_buf); + ASSERT3P(db->db.db_data, ==, db->db_buf->b_data); + } + + ASSERT(db->db_buf == NULL || arc_referenced(db->db_buf)); + + /* + * If this buffer is currently syncing out, and we are + * still referencing it from db_data, we need to make a copy + * of it in case we decide we want to dirty it again in this txg.
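+ *
+ * (Editorial note: the syncing txg's dirty record keeps the original
+ * arc buf -- whose contents and checksum must stay stable while the
+ * write is in flight -- and dbuf_hold_copy() gives this hold a private
+ * copy that is safe to redirty.)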
+ */ + if (db->db_level == 0 && db->db_blkid != DMU_BONUS_BLKID && + dn->dn_object != DMU_META_DNODE_OBJECT && + db->db_state == DB_CACHED && db->db_data_pending) { + dbuf_dirty_record_t *dr = db->db_data_pending; + if (dr->dt.dl.dr_data == db->db_buf) + dbuf_hold_copy(dn, db); + } + + if (multilist_link_active(&db->db_cache_link)) { + ASSERT(zfs_refcount_is_zero(&db->db_holds)); + ASSERT(db->db_caching_status == DB_DBUF_CACHE || + db->db_caching_status == DB_DBUF_METADATA_CACHE); + + multilist_remove(dbuf_caches[db->db_caching_status].cache, db); + (void) zfs_refcount_remove_many( + &dbuf_caches[db->db_caching_status].size, + db->db.db_size, db); + + if (db->db_caching_status == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMPDOWN(metadata_cache_count); + } else { + DBUF_STAT_BUMPDOWN(cache_levels[db->db_level]); + DBUF_STAT_BUMPDOWN(cache_count); + DBUF_STAT_DECR(cache_levels_bytes[db->db_level], + db->db.db_size); + } + db->db_caching_status = DB_NO_CACHE; + } + (void) zfs_refcount_add(&db->db_holds, tag); + DBUF_VERIFY(db); + mutex_exit(&db->db_mtx); + + /* NOTE: we can't rele the parent until after we drop the db_mtx */ + if (parent) + dbuf_rele(parent, NULL); + + ASSERT3P(DB_DNODE(db), ==, dn); + ASSERT3U(db->db_blkid, ==, blkid); + ASSERT3U(db->db_level, ==, level); + *dbp = db; + + return (0); +} + +dmu_buf_impl_t * +dbuf_hold(dnode_t *dn, uint64_t blkid, void *tag) +{ + return (dbuf_hold_level(dn, 0, blkid, tag)); +} + +dmu_buf_impl_t * +dbuf_hold_level(dnode_t *dn, int level, uint64_t blkid, void *tag) +{ + dmu_buf_impl_t *db; + int err = dbuf_hold_impl(dn, level, blkid, FALSE, FALSE, tag, &db); + return (err ? NULL : db); +} + +void +dbuf_create_bonus(dnode_t *dn) +{ + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + ASSERT(dn->dn_bonus == NULL); + dn->dn_bonus = dbuf_create(dn, 0, DMU_BONUS_BLKID, dn->dn_dbuf, NULL); +} + +int +dbuf_spill_set_blksz(dmu_buf_t *db_fake, uint64_t blksz, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + if (db->db_blkid != DMU_SPILL_BLKID) + return (SET_ERROR(ENOTSUP)); + if (blksz == 0) + blksz = SPA_MINBLOCKSIZE; + ASSERT3U(blksz, <=, spa_maxblocksize(dmu_objset_spa(db->db_objset))); + blksz = P2ROUNDUP(blksz, SPA_MINBLOCKSIZE); + + dbuf_new_size(db, blksz, tx); + + return (0); +} + +void +dbuf_rm_spill(dnode_t *dn, dmu_tx_t *tx) +{ + dbuf_free_range(dn, DMU_SPILL_BLKID, DMU_SPILL_BLKID, tx); +} + +#pragma weak dmu_buf_add_ref = dbuf_add_ref +void +dbuf_add_ref(dmu_buf_impl_t *db, void *tag) +{ + int64_t holds = zfs_refcount_add(&db->db_holds, tag); + VERIFY3S(holds, >, 1); +} + +#pragma weak dmu_buf_try_add_ref = dbuf_try_add_ref +boolean_t +dbuf_try_add_ref(dmu_buf_t *db_fake, objset_t *os, uint64_t obj, uint64_t blkid, + void *tag) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dmu_buf_impl_t *found_db; + boolean_t result = B_FALSE; + + if (blkid == DMU_BONUS_BLKID) + found_db = dbuf_find_bonus(os, obj); + else + found_db = dbuf_find(os, obj, 0, blkid); + + if (found_db != NULL) { + if (db == found_db && dbuf_refcount(db) > db->db_dirtycnt) { + (void) zfs_refcount_add(&db->db_holds, tag); + result = B_TRUE; + } + mutex_exit(&found_db->db_mtx); + } + return (result); +} + +/* + * If you call dbuf_rele() you had better not be referencing the dnode handle + * unless you have some other direct or indirect hold on the dnode. (An indirect + * hold is a hold on one of the dnode's dbufs, including the bonus buffer.) 
+ * Without that, the dbuf_rele() could lead to a dnode_rele() followed by the + * dnode's parent dbuf evicting its dnode handles. + */ +void +dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag, B_FALSE); +} + +void +dmu_buf_rele(dmu_buf_t *db, void *tag) +{ + dbuf_rele((dmu_buf_impl_t *)db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. The 'evicting' + * argument should be set if we are already in the dbuf-evicting code + * path, in which case we don't want to recursively evict. This allows us to + * avoid deeply nested stacks that would have a call flow similar to this: + * + * dbuf_rele()-->dbuf_rele_and_unlock()-->dbuf_evict_notify() + * ^ | + * | | + * +-----dbuf_destroy()<--dbuf_evict_one()<--------+ + * + */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag, boolean_t evicting) +{ + int64_t holds; + uint64_t size; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + DBUF_VERIFY(db); + + /* + * Remove the reference to the dbuf before removing its hold on the + * dnode so we can guarantee in dnode_move() that a referenced bonus + * buffer has a corresponding dnode hold. + */ + holds = zfs_refcount_remove(&db->db_holds, tag); + ASSERT(holds >= 0); + + /* + * We can't freeze indirects if there is a possibility that they + * may be modified in the current syncing context. + */ + if (db->db_buf != NULL && + holds == (db->db_level == 0 ? db->db_dirtycnt : 0)) { + arc_buf_freeze(db->db_buf); + } + + if (holds == db->db_dirtycnt && + db->db_level == 0 && db->db_user_immediate_evict) + dbuf_evict_user(db); + + if (holds == 0) { + if (db->db_blkid == DMU_BONUS_BLKID) { + dnode_t *dn; + boolean_t evict_dbuf = db->db_pending_evict; + + /* + * If the dnode moves here, we cannot cross this + * barrier until the move completes. + */ + DB_DNODE_ENTER(db); + + dn = DB_DNODE(db); + atomic_dec_32(&dn->dn_dbufs_count); + + /* + * Decrementing the dbuf count means that the bonus + * buffer's dnode hold is no longer discounted in + * dnode_move(). The dnode cannot move until after + * the dnode_rele() below. + */ + DB_DNODE_EXIT(db); + + /* + * Do not reference db after its lock is dropped. + * Another thread may evict it. + */ + mutex_exit(&db->db_mtx); + + if (evict_dbuf) + dnode_evict_bonus(dn); + + dnode_rele(dn, db); + } else if (db->db_buf == NULL) { + /* + * This is a special case: we never associated this + * dbuf with any data allocated from the ARC. + */ + ASSERT(db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); + dbuf_destroy(db); + } else if (arc_released(db->db_buf)) { + /* + * This dbuf has anonymous data associated with it. + */ + dbuf_destroy(db); + } else { + boolean_t do_arc_evict = B_FALSE; + blkptr_t bp; + spa_t *spa = dmu_objset_spa(db->db_objset); + + if (!DBUF_IS_CACHEABLE(db) && + db->db_blkptr != NULL && + !BP_IS_HOLE(db->db_blkptr) && + !BP_IS_EMBEDDED(db->db_blkptr)) { + do_arc_evict = B_TRUE; + bp = *db->db_blkptr; + } + + if (!DBUF_IS_CACHEABLE(db) || + db->db_pending_evict) { + dbuf_destroy(db); + } else if (!multilist_link_active(&db->db_cache_link)) { + ASSERT3U(db->db_caching_status, ==, + DB_NO_CACHE); + + dbuf_cached_state_t dcs = + dbuf_include_in_metadata_cache(db) ? 
+ DB_DBUF_METADATA_CACHE : DB_DBUF_CACHE; + db->db_caching_status = dcs; + + multilist_insert(dbuf_caches[dcs].cache, db); + size = zfs_refcount_add_many( + &dbuf_caches[dcs].size, + db->db.db_size, db); + + if (dcs == DB_DBUF_METADATA_CACHE) { + DBUF_STAT_BUMP(metadata_cache_count); + DBUF_STAT_MAX( + metadata_cache_size_bytes_max, + size); + } else { + DBUF_STAT_BUMP( + cache_levels[db->db_level]); + DBUF_STAT_BUMP(cache_count); + DBUF_STAT_INCR( + cache_levels_bytes[db->db_level], + db->db.db_size); + DBUF_STAT_MAX(cache_size_bytes_max, + size); + } + mutex_exit(&db->db_mtx); + + if (dcs == DB_DBUF_CACHE && !evicting) + dbuf_evict_notify(size); + } + + if (do_arc_evict) + arc_freed(spa, &bp); + } + } else { + mutex_exit(&db->db_mtx); + } + +} + +#pragma weak dmu_buf_refcount = dbuf_refcount +uint64_t +dbuf_refcount(dmu_buf_impl_t *db) +{ + return (zfs_refcount_count(&db->db_holds)); +} + +uint64_t +dmu_buf_user_refcount(dmu_buf_t *db_fake) +{ + uint64_t holds; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + ASSERT3U(zfs_refcount_count(&db->db_holds), >=, db->db_dirtycnt); + holds = zfs_refcount_count(&db->db_holds) - db->db_dirtycnt; + mutex_exit(&db->db_mtx); + + return (holds); +} + +void * +dmu_buf_replace_user(dmu_buf_t *db_fake, dmu_buf_user_t *old_user, + dmu_buf_user_t *new_user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + mutex_enter(&db->db_mtx); + dbuf_verify_user(db, DBVU_NOT_EVICTING); + if (db->db_user == old_user) + db->db_user = new_user; + else + old_user = db->db_user; + dbuf_verify_user(db, DBVU_NOT_EVICTING); + mutex_exit(&db->db_mtx); + + return (old_user); +} + +void * +dmu_buf_set_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, NULL, user)); +} + +void * +dmu_buf_set_user_ie(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + db->db_user_immediate_evict = TRUE; + return (dmu_buf_set_user(db_fake, user)); +} + +void * +dmu_buf_remove_user(dmu_buf_t *db_fake, dmu_buf_user_t *user) +{ + return (dmu_buf_replace_user(db_fake, user, NULL)); +} + +void * +dmu_buf_get_user(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + dbuf_verify_user(db, DBVU_NOT_EVICTING); + return (db->db_user); +} + +void +dmu_buf_user_evict_wait() +{ + taskq_wait(dbu_evict_taskq); +} + +blkptr_t * +dmu_buf_get_blkptr(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_blkptr); +} + +objset_t * +dmu_buf_get_objset(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + return (dbi->db_objset); +} + +dnode_t * +dmu_buf_dnode_enter(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_ENTER(dbi); + return (DB_DNODE(dbi)); +} + +void +dmu_buf_dnode_exit(dmu_buf_t *db) +{ + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + DB_DNODE_EXIT(dbi); +} + +static void +dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) +{ + /* ASSERT(dmu_tx_is_syncing(tx)) */ + ASSERT(MUTEX_HELD(&db->db_mtx)); + + if (db->db_blkptr != NULL) + return; + + if (db->db_blkid == DMU_SPILL_BLKID) { + db->db_blkptr = DN_SPILL_BLKPTR(dn->dn_phys); + BP_ZERO(db->db_blkptr); + return; + } + if (db->db_level == dn->dn_phys->dn_nlevels-1) { + /* + * This buffer was allocated at a time when there were + * no available blkptrs from the dnode, or it was + * inappropriate to hook it in (i.e., nlevels mismatch).
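+ *
+ * (Editorial example: if the dnode grew from one to two levels after
+ * this dbuf was created, syncing context hooks the dbuf up here, under
+ * its new indirect parent, before the write is issued.)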
*/ + ASSERT(db->db_blkid < dn->dn_phys->dn_nblkptr); + ASSERT(db->db_parent == NULL); + db->db_parent = dn->dn_dbuf; + db->db_blkptr = &dn->dn_phys->dn_blkptr[db->db_blkid]; + DBUF_VERIFY(db); + } else { + dmu_buf_impl_t *parent = db->db_parent; + int epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + + ASSERT(dn->dn_phys->dn_nlevels > 1); + if (parent == NULL) { + mutex_exit(&db->db_mtx); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + parent = dbuf_hold_level(dn, db->db_level + 1, + db->db_blkid >> epbs, db); + rw_exit(&dn->dn_struct_rwlock); + mutex_enter(&db->db_mtx); + db->db_parent = parent; + } + db->db_blkptr = (blkptr_t *)parent->db.db_data + + (db->db_blkid & ((1ULL << epbs) - 1)); + DBUF_VERIFY(db); + } +} + +static void +dbuf_sync_bonus(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + void *data = dr->dt.dl.dr_data; + + ASSERT0(db->db_level); + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT(DB_DNODE_HELD(db)); + ASSERT(db->db_blkid == DMU_BONUS_BLKID); + ASSERT(data != NULL); + + dnode_t *dn = DB_DNODE(db); + ASSERT3U(DN_MAX_BONUS_LEN(dn->dn_phys), <=, + DN_SLOTS_TO_BONUSLEN(dn->dn_phys->dn_extra_slots + 1)); + bcopy(data, DN_BONUS(dn->dn_phys), DN_MAX_BONUS_LEN(dn->dn_phys)); + DB_DNODE_EXIT(db); + + dbuf_sync_leaf_verify_bonus_dnode(dr); + + dbuf_undirty_bonus(dr); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE); +} + +/* + * When syncing out a block of dnodes, adjust the block to deal with + * encryption. Normally, we make sure the block is decrypted before writing + * it. If we have crypt params, then we are writing a raw (encrypted) block, + * from a raw receive. In this case, set the ARC buf's crypt params so + * that the BP will be filled with the correct byteorder, salt, iv, and mac. + */ +static void +dbuf_prepare_encrypted_dnode_leaf(dbuf_dirty_record_t *dr) +{ + int err; + dmu_buf_impl_t *db = dr->dr_dbuf; + + ASSERT(MUTEX_HELD(&db->db_mtx)); + ASSERT3U(db->db.db_object, ==, DMU_META_DNODE_OBJECT); + ASSERT3U(db->db_level, ==, 0); + + if (!db->db_objset->os_raw_receive && arc_is_encrypted(db->db_buf)) { + zbookmark_phys_t zb; + + /* + * Unfortunately, there is currently no mechanism for + * syncing context to handle decryption errors. An error + * here is only possible if an attacker maliciously + * changed a dnode block and updated the associated + * checksums going up the block tree. + */ + SET_BOOKMARK(&zb, dmu_objset_id(db->db_objset), + db->db.db_object, db->db_level, db->db_blkid); + err = arc_untransform(db->db_buf, db->db_objset->os_spa, + &zb, B_TRUE); + if (err) + panic("Invalid dnode block MAC"); + } else if (dr->dt.dl.dr_has_raw_params) { + (void) arc_release(dr->dt.dl.dr_data, db); + arc_convert_to_raw(dr->dt.dl.dr_data, + dmu_objset_id(db->db_objset), + dr->dt.dl.dr_byteorder, DMU_OT_DNODE, + dr->dt.dl.dr_salt, dr->dt.dl.dr_iv, dr->dt.dl.dr_mac); + } +} + +/* + * dbuf_sync_indirect() is called recursively from dbuf_sync_list() so it + * is critical that we not allow the compiler to inline this function into + * dbuf_sync_list() thereby drastically bloating the stack usage. + */ +noinline static void +dbuf_sync_indirect(dbuf_dirty_record_t *dr, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + zio_t *zio; + + ASSERT(dmu_tx_is_syncing(tx)); + + dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr); + + mutex_enter(&db->db_mtx); + + ASSERT(db->db_level > 0); + DBUF_VERIFY(db); + + /* Read the block if it hasn't been read yet.
+	if (db->db_buf == NULL) {
+		mutex_exit(&db->db_mtx);
+		(void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED);
+		mutex_enter(&db->db_mtx);
+	}
+	ASSERT3U(db->db_state, ==, DB_CACHED);
+	ASSERT(db->db_buf != NULL);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	/* Indirect block size must match what the dnode thinks it is. */
+	ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+	dbuf_check_blkptr(dn, db);
+	DB_DNODE_EXIT(db);
+
+	/* Provide the pending dirty record to child dbufs */
+	db->db_data_pending = dr;
+
+	mutex_exit(&db->db_mtx);
+
+	dbuf_write(dr, db->db_buf, tx);
+
+	zio = dr->dr_zio;
+	mutex_enter(&dr->dt.di.dr_mtx);
+	dbuf_sync_list(&dr->dt.di.dr_children, db->db_level - 1, tx);
+	ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+	mutex_exit(&dr->dt.di.dr_mtx);
+	zio_nowait(zio);
+}
+
+/*
+ * Verify that the size of the data in our bonus buffer does not exceed
+ * its recorded size.
+ *
+ * The purpose of this verification is to catch any cases in development
+ * where the size of a phys structure (i.e., space_map_phys_t) grows and,
+ * due to incorrect feature management, older pools expect to read more
+ * data even though they didn't actually write it to begin with.
+ *
+ * For example, this would catch an error in the feature logic where we
+ * open an older pool and we expect to write the space map histogram of
+ * a space map with size SPACE_MAP_SIZE_V0.
+ */
+static void
+dbuf_sync_leaf_verify_bonus_dnode(dbuf_dirty_record_t *dr)
+{
+#ifdef ZFS_DEBUG
+	dnode_t *dn = DB_DNODE(dr->dr_dbuf);
+
+	/*
+	 * Encrypted bonus buffers can have data past their bonuslen.
+	 * Skip the verification of these blocks.
+	 */
+	if (DMU_OT_IS_ENCRYPTED(dn->dn_bonustype))
+		return;
+
+	uint16_t bonuslen = dn->dn_phys->dn_bonuslen;
+	uint16_t maxbonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+	ASSERT3U(bonuslen, <=, maxbonuslen);
+
+	arc_buf_t *datap = dr->dt.dl.dr_data;
+	char *datap_end = ((char *)datap) + bonuslen;
+	char *datap_max = ((char *)datap) + maxbonuslen;
+
+	/* ensure that everything is zero after our data */
+	for (; datap_end < datap_max; datap_end++)
+		ASSERT(*datap_end == 0);
+#endif
+}
+
+/*
+ * dbuf_sync_leaf() is called recursively from dbuf_sync_list() so it is
+ * critical that we not allow the compiler to inline this function into
+ * dbuf_sync_list() thereby drastically bloating the stack usage.
+ */
+noinline static void
+dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx)
+{
+	arc_buf_t **datap = &dr->dt.dl.dr_data;
+	dmu_buf_impl_t *db = dr->dr_dbuf;
+	dnode_t *dn;
+	objset_t *os;
+	uint64_t txg = tx->tx_txg;
+
+	ASSERT(dmu_tx_is_syncing(tx));
+
+	dprintf_dbuf_bp(db, db->db_blkptr, "blkptr=%p", db->db_blkptr);
+
+	mutex_enter(&db->db_mtx);
+	/*
+	 * To be synced, we must be dirtied. But we
+	 * might have been freed after the dirty.
+	 */
+	if (db->db_state == DB_UNCACHED) {
+		/* This buffer has been freed since it was dirtied */
+		ASSERT(db->db.db_data == NULL);
+	} else if (db->db_state == DB_FILL) {
+		/* This buffer was freed and is now being re-filled */
+		ASSERT(db->db.db_data != dr->dt.dl.dr_data);
+	} else {
+		ASSERT(db->db_state == DB_CACHED || db->db_state == DB_NOFILL);
+	}
+	DBUF_VERIFY(db);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		mutex_enter(&dn->dn_mtx);
+		if (!(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) {
+			/*
+			 * In the previous transaction group, the bonus buffer
+			 * was entirely used to store the attributes for the
+			 * dnode which overrode the dn_spill field. However,
+			 * when adding more attributes to the file a spill
+			 * block was required to hold the extra attributes.
+			 *
+			 * Make sure to clear the garbage left in the dn_spill
+			 * field from the previous attributes in the bonus
+			 * buffer. Otherwise, after writing out the spill
+			 * block to the newly allocated dva, it will free
+			 * the old block pointed to by the invalid dn_spill.
+			 */
+			db->db_blkptr = NULL;
+		}
+		dn->dn_phys->dn_flags |= DNODE_FLAG_SPILL_BLKPTR;
+		mutex_exit(&dn->dn_mtx);
+	}
+
+	/*
+	 * If this is a bonus buffer, simply copy the bonus data into the
+	 * dnode. It will be written out when the dnode is synced (and it
+	 * will be synced, since it must have been dirty for dbuf_sync to
+	 * be called).
+	 */
+	if (db->db_blkid == DMU_BONUS_BLKID) {
+		ASSERT(dr->dr_dbuf == db);
+		dbuf_sync_bonus(dr, tx);
+		return;
+	}
+
+	os = dn->dn_objset;
+
+	/*
+	 * This function may have dropped the db_mtx lock allowing a dmu_sync
+	 * operation to sneak in. As a result, we need to ensure that we
+	 * don't check the dr_override_state until we have returned from
+	 * dbuf_check_blkptr.
+	 */
+	dbuf_check_blkptr(dn, db);
+
+	/*
+	 * If this buffer is in the middle of an immediate write,
+	 * wait for the synchronous IO to complete.
+	 */
+	while (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) {
+		ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT);
+		cv_wait(&db->db_changed, &db->db_mtx);
+		ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN);
+	}
+
+	/*
+	 * If this is a dnode block, ensure it is appropriately encrypted
+	 * or decrypted, depending on what we are writing to it this txg.
+	 */
+	if (os->os_encrypted && dn->dn_object == DMU_META_DNODE_OBJECT)
+		dbuf_prepare_encrypted_dnode_leaf(dr);
+
+	if (db->db_state != DB_NOFILL &&
+	    dn->dn_object != DMU_META_DNODE_OBJECT &&
+	    zfs_refcount_count(&db->db_holds) > 1 &&
+	    dr->dt.dl.dr_override_state != DR_OVERRIDDEN &&
+	    *datap == db->db_buf) {
+		/*
+		 * If this buffer is currently "in use" (i.e., there
+		 * are active holds and db_data still references it),
+		 * then make a copy before we start the write so that
+		 * any modifications from the open txg will not leak
+		 * into this write.
+		 *
+		 * NOTE: this copy does not need to be made for
+		 * objects only modified in the syncing context (e.g.
+		 * DNODE blocks).
+		 */
+		*datap = dbuf_alloc_arcbuf_from_arcbuf(db, db->db_buf);
+		bcopy(db->db.db_data, (*datap)->b_data, arc_buf_size(*datap));
+	}
+	db->db_data_pending = dr;
+
+	mutex_exit(&db->db_mtx);
+
+	dbuf_write(dr, *datap, tx);
+
+	ASSERT(!list_link_active(&dr->dr_dirty_node));
+	if (dn->dn_object == DMU_META_DNODE_OBJECT) {
+		list_insert_tail(&dn->dn_dirty_records[txg & TXG_MASK], dr);
+		DB_DNODE_EXIT(db);
+	} else {
+		/*
+		 * Although zio_nowait() does not "wait for an IO", it does
+		 * initiate the IO. If this is an empty write it seems plausible
+		 * that the IO could actually be completed before the nowait
+		 * returns. We need to DB_DNODE_EXIT() first in case
+		 * zio_nowait() invalidates the dbuf.
+		 */
+		DB_DNODE_EXIT(db);
+		zio_nowait(dr->dr_zio);
+	}
+}
+
+void
+dbuf_sync_list(list_t *list, int level, dmu_tx_t *tx)
+{
+	dbuf_dirty_record_t *dr;
+
+	while ((dr = list_head(list))) {
+		if (dr->dr_zio != NULL) {
+			/*
+			 * If we find an already initialized zio then we
+			 * are processing the meta-dnode, and we have finished.
+			 * The dbufs for all dnodes are put back on the list
+			 * during processing, so that we can zio_wait()
+			 * these IOs after initiating all child IOs.
+			 */
+			ASSERT3U(dr->dr_dbuf->db.db_object, ==,
+			    DMU_META_DNODE_OBJECT);
+			break;
+		}
+		if (dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID &&
+		    dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) {
+			VERIFY3U(dr->dr_dbuf->db_level, ==, level);
+		}
+		list_remove(list, dr);
+		if (dr->dr_dbuf->db_level > 0)
+			dbuf_sync_indirect(dr, tx);
+		else
+			dbuf_sync_leaf(dr, tx);
+	}
+}
+
+/* ARGSUSED */
+static void
+dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
+	blkptr_t *bp = zio->io_bp;
+	blkptr_t *bp_orig = &zio->io_bp_orig;
+	spa_t *spa = zio->io_spa;
+	int64_t delta;
+	uint64_t fill = 0;
+	int i;
+
+	ASSERT3P(db->db_blkptr, !=, NULL);
+	ASSERT3P(&db->db_data_pending->dr_bp_copy, ==, bp);
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig);
+	dnode_diduse_space(dn, delta - zio->io_prev_space_delta);
+	zio->io_prev_space_delta = delta;
+
+	if (bp->blk_birth != 0) {
+		ASSERT((db->db_blkid != DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_type) ||
+		    (db->db_blkid == DMU_SPILL_BLKID &&
+		    BP_GET_TYPE(bp) == dn->dn_bonustype) ||
+		    BP_IS_EMBEDDED(bp));
+		ASSERT(BP_GET_LEVEL(bp) == db->db_level);
+	}
+
+	mutex_enter(&db->db_mtx);
+
+#ifdef ZFS_DEBUG
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+		ASSERT(!(BP_IS_HOLE(bp)) &&
+		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+	}
+#endif
+
+	if (db->db_level == 0) {
+		mutex_enter(&dn->dn_mtx);
+		if (db->db_blkid > dn->dn_phys->dn_maxblkid &&
+		    db->db_blkid != DMU_SPILL_BLKID) {
+			ASSERT0(db->db_objset->os_raw_receive);
+			dn->dn_phys->dn_maxblkid = db->db_blkid;
+		}
+		mutex_exit(&dn->dn_mtx);
+
+		if (dn->dn_type == DMU_OT_DNODE) {
+			i = 0;
+			while (i < db->db.db_size) {
+				dnode_phys_t *dnp =
+				    (void *)(((char *)db->db.db_data) + i);
+
+				i += DNODE_MIN_SIZE;
+				if (dnp->dn_type != DMU_OT_NONE) {
+					fill++;
+					i += dnp->dn_extra_slots *
+					    DNODE_MIN_SIZE;
+				}
+			}
+		} else {
+			if (BP_IS_HOLE(bp)) {
+				fill = 0;
+			} else {
+				fill = 1;
+			}
+		}
+	} else {
+		blkptr_t *ibp = db->db.db_data;
+		ASSERT3U(db->db.db_size, ==, 1<<dn->dn_phys->dn_indblkshift);
+		for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) {
+			if (BP_IS_HOLE(ibp))
+				continue;
+			fill += BP_GET_FILL(ibp);
+		}
+	}
+	DB_DNODE_EXIT(db);
+
+	if (!BP_IS_EMBEDDED(bp))
+		BP_SET_FILL(bp, fill);
+
+	mutex_exit(&db->db_mtx);
+
+	db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_WRITER, FTAG);
+	*db->db_blkptr = *bp;
+	dmu_buf_unlock_parent(db, dblt, FTAG);
+}
+
+/* ARGSUSED */
+/*
+ * This function gets called just prior to running through the compression
+ * stage of the zio pipeline. If we're an indirect block comprised of only
+ * holes, then we want this indirect to be compressed away to a hole. In
+ * order to do that we must zero out any information about the holes that
+ * this indirect points to before we try to compress it.
+ */
+static void
+dbuf_write_children_ready(zio_t *zio, arc_buf_t *buf, void *vdb)
+{
+	dmu_buf_impl_t *db = vdb;
+	dnode_t *dn;
+	blkptr_t *bp;
+	unsigned int epbs, i;
+
+	ASSERT3U(db->db_level, >, 0);
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ASSERT3U(epbs, <, 31);
+
+	/* Determine if all our children are holes */
+	for (i = 0, bp = db->db.db_data; i < 1ULL << epbs; i++, bp++) {
+		if (!BP_IS_HOLE(bp))
+			break;
+	}
+
+	/*
+	 * If all the children are holes, then zero them all out so that
+	 * we may get compressed away.
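+	 *
+	 * (Even a hole BP can carry nonzero contents, such as a hole
+	 * birth time, so without this zeroing an all-holes indirect
+	 * would not reliably compress down to a hole.)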
+ */ + if (i == 1ULL << epbs) { + /* + * We only found holes. Grab the rwlock to prevent + * anybody from reading the blocks we're about to + * zero out. + */ + rw_enter(&db->db_rwlock, RW_WRITER); + bzero(db->db.db_data, db->db.db_size); + rw_exit(&db->db_rwlock); + } + DB_DNODE_EXIT(db); +} + +/* + * The SPA will call this callback several times for each zio - once + * for every physical child i/o (zio->io_phys_children times). This + * allows the DMU to monitor the progress of each logical i/o. For example, + * there may be 2 copies of an indirect block, or many fragments of a RAID-Z + * block. There may be a long delay before all copies/fragments are completed, + * so this callback allows us to retire dirty space gradually, as the physical + * i/os complete. + */ +/* ARGSUSED */ +static void +dbuf_write_physdone(zio_t *zio, arc_buf_t *buf, void *arg) +{ + dmu_buf_impl_t *db = arg; + objset_t *os = db->db_objset; + dsl_pool_t *dp = dmu_objset_pool(os); + dbuf_dirty_record_t *dr; + int delta = 0; + + dr = db->db_data_pending; + ASSERT3U(dr->dr_txg, ==, zio->io_txg); + + /* + * The callback will be called io_phys_children times. Retire one + * portion of our dirty space each time we are called. Any rounding + * error will be cleaned up by dbuf_write_done(). + */ + delta = dr->dr_accounted / zio->io_phys_children; + dsl_pool_undirty_space(dp, delta, zio->io_txg); +} + +/* ARGSUSED */ +static void +dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) +{ + dmu_buf_impl_t *db = vdb; + blkptr_t *bp_orig = &zio->io_bp_orig; + blkptr_t *bp = db->db_blkptr; + objset_t *os = db->db_objset; + dmu_tx_t *tx = os->os_synctx; + dbuf_dirty_record_t *dr; + + ASSERT0(zio->io_error); + ASSERT(db->db_blkptr == bp); + + /* + * For nopwrites and rewrites we ensure that the bp matches our + * original and bypass all the accounting. 
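+	 *
+	 * (A nopwrite means the checksum of the data matched the block
+	 * already on disk, so no new block was born; a rewrite reuses
+	 * the existing BP in place. Either way no block was allocated
+	 * or freed, so there is nothing to account for.)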
+	 */
+	if (zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE)) {
+		ASSERT(BP_EQUAL(bp, bp_orig));
+	} else {
+		dsl_dataset_t *ds = os->os_dsl_dataset;
+		(void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE);
+		dsl_dataset_block_born(ds, bp, tx);
+	}
+
+	mutex_enter(&db->db_mtx);
+
+	DBUF_VERIFY(db);
+
+	dr = db->db_data_pending;
+	ASSERT(!list_link_active(&dr->dr_dirty_node));
+	ASSERT(dr->dr_dbuf == db);
+	ASSERT(list_next(&db->db_dirty_records, dr) == NULL);
+	list_remove(&db->db_dirty_records, dr);
+
+#ifdef ZFS_DEBUG
+	if (db->db_blkid == DMU_SPILL_BLKID) {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		ASSERT(dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR);
+		ASSERT(!(BP_IS_HOLE(db->db_blkptr)) &&
+		    db->db_blkptr == DN_SPILL_BLKPTR(dn->dn_phys));
+		DB_DNODE_EXIT(db);
+	}
+#endif
+
+	if (db->db_level == 0) {
+		ASSERT(db->db_blkid != DMU_BONUS_BLKID);
+		ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN);
+		if (db->db_state != DB_NOFILL) {
+			if (dr->dt.dl.dr_data != db->db_buf)
+				arc_buf_destroy(dr->dt.dl.dr_data, db);
+		}
+	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		ASSERT(list_head(&dr->dt.di.dr_children) == NULL);
+		ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift);
+		if (!BP_IS_HOLE(db->db_blkptr)) {
+			int epbs __maybe_unused = dn->dn_phys->dn_indblkshift -
+			    SPA_BLKPTRSHIFT;
+			ASSERT3U(db->db_blkid, <=,
+			    dn->dn_phys->dn_maxblkid >> (db->db_level * epbs));
+			ASSERT3U(BP_GET_LSIZE(db->db_blkptr), ==,
+			    db->db.db_size);
+		}
+		DB_DNODE_EXIT(db);
+		mutex_destroy(&dr->dt.di.dr_mtx);
+		list_destroy(&dr->dt.di.dr_children);
+	}
+
+	cv_broadcast(&db->db_changed);
+	ASSERT(db->db_dirtycnt > 0);
+	db->db_dirtycnt -= 1;
+	db->db_data_pending = NULL;
+	dbuf_rele_and_unlock(db, (void *)(uintptr_t)tx->tx_txg, B_FALSE);
+
+	/*
+	 * If we didn't do a physical write in this ZIO and we
+	 * still ended up here, it means that the space of the
+	 * dbuf that we just released (and undirtied) above hasn't
+	 * been marked as undirtied in the pool's accounting.
+	 *
+	 * Thus, we undirty that space in the pool's view of the
+	 * world here. For physical writes this type of update
+	 * happens in dbuf_write_physdone().
+	 *
+	 * If we did a physical write, clean up any rounding errors
+	 * that came up due to writing multiple copies of a block
+	 * on disk [see dbuf_write_physdone()].
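+	 *
+	 * For example (illustrative numbers): with dr_accounted == 1000
+	 * and io_phys_children == 3, each dbuf_write_physdone() call
+	 * retired 1000 / 3 == 333 bytes, and the 1000 % 3 == 1 byte
+	 * remainder is undirtied below.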
+ */ + if (zio->io_phys_children == 0) { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted, zio->io_txg); + } else { + dsl_pool_undirty_space(dmu_objset_pool(os), + dr->dr_accounted % zio->io_phys_children, zio->io_txg); + } + + kmem_free(dr, sizeof (dbuf_dirty_record_t)); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } + mutex_exit(&db->db_mtx); + + dbuf_write_done(zio, NULL, db); + + if (zio->io_abd != NULL) + abd_put(zio->io_abd); +} + +typedef struct dbuf_remap_impl_callback_arg { + objset_t *drica_os; + uint64_t drica_blk_birth; + dmu_tx_t *drica_tx; +} dbuf_remap_impl_callback_arg_t; + +static void +dbuf_remap_impl_callback(uint64_t vdev, uint64_t offset, uint64_t size, + void *arg) +{ + dbuf_remap_impl_callback_arg_t *drica = arg; + objset_t *os = drica->drica_os; + spa_t *spa = dmu_objset_spa(os); + dmu_tx_t *tx = drica->drica_tx; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (os == spa_meta_objset(spa)) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + dsl_dataset_block_remapped(dmu_objset_ds(os), vdev, offset, + size, drica->drica_blk_birth, tx); + } +} + +static void +dbuf_remap_impl(dnode_t *dn, blkptr_t *bp, krwlock_t *rw, dmu_tx_t *tx) +{ + blkptr_t bp_copy = *bp; + spa_t *spa = dmu_objset_spa(dn->dn_objset); + dbuf_remap_impl_callback_arg_t drica; + + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + drica.drica_os = dn->dn_objset; + drica.drica_blk_birth = bp->blk_birth; + drica.drica_tx = tx; + if (spa_remap_blkptr(spa, &bp_copy, dbuf_remap_impl_callback, + &drica)) { + /* + * If the blkptr being remapped is tracked by a livelist, + * then we need to make sure the livelist reflects the update. + * First, cancel out the old blkptr by appending a 'FREE' + * entry. Next, add an 'ALLOC' to track the new version. This + * way we avoid trying to free an inaccurate blkptr at delete. + * Note that embedded blkptrs are not tracked in livelists. + */ + if (dn->dn_objset != spa_meta_objset(spa)) { + dsl_dataset_t *ds = dmu_objset_ds(dn->dn_objset); + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg) { + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, + bp); + bplist_append(&ds->ds_dir->dd_pending_allocs, + &bp_copy); + } + } + + /* + * The db_rwlock prevents dbuf_read_impl() from + * dereferencing the BP while we are changing it. To + * avoid lock contention, only grab it when we are actually + * changing the BP. + */ + if (rw != NULL) + rw_enter(rw, RW_WRITER); + *bp = bp_copy; + if (rw != NULL) + rw_exit(rw); + } +} + +/* + * Remap any existing BP's to concrete vdevs, if possible. 
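+ *
+ * (This is part of device removal: a BP that still points into a
+ * removed, indirect vdev is rewritten via spa_remap_blkptr() to the
+ * concrete location recorded in that vdev's indirect mapping.)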
+ */ +static void +dbuf_remap(dnode_t *dn, dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(db->db_objset); + ASSERT(dsl_pool_sync_context(spa_get_dsl(spa))); + + if (!spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return; + + if (db->db_level > 0) { + blkptr_t *bp = db->db.db_data; + for (int i = 0; i < db->db.db_size >> SPA_BLKPTRSHIFT; i++) { + dbuf_remap_impl(dn, &bp[i], &db->db_rwlock, tx); + } + } else if (db->db.db_object == DMU_META_DNODE_OBJECT) { + dnode_phys_t *dnp = db->db.db_data; + ASSERT3U(db->db_dnode_handle->dnh_dnode->dn_type, ==, + DMU_OT_DNODE); + for (int i = 0; i < db->db.db_size >> DNODE_SHIFT; + i += dnp[i].dn_extra_slots + 1) { + for (int j = 0; j < dnp[i].dn_nblkptr; j++) { + krwlock_t *lock = (dn->dn_dbuf == NULL ? NULL : + &dn->dn_dbuf->db_rwlock); + dbuf_remap_impl(dn, &dnp[i].dn_blkptr[j], lock, + tx); + } + } + } +} + + +/* Issue I/O to commit a dirty buffer to disk. */ +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn; + objset_t *os; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_phys_t zb; + zio_prop_t zp; + zio_t *pio; /* parent I/O */ + int wp_flag = 0; + + ASSERT(dmu_tx_is_syncing(tx)); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + os = dn->dn_objset; + + if (db->db_state != DB_NOFILL) { + if (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + dbuf_release_bp(db); + } + dbuf_remap(dn, db, tx); + } + } + + if (parent != dn->dn_dbuf) { + /* Our parent is an indirect block. */ + /* We have a dirty parent that has been scheduled for write. */ + ASSERT(parent && parent->db_data_pending); + /* Our parent's buffer is one level closer to the dnode. */ + ASSERT(db->db_level == parent->db_level-1); + /* + * We're about to modify our parent's db_data by modifying + * our block pointer, so the parent must be released. + */ + ASSERT(arc_released(parent->db_buf)); + pio = parent->db_data_pending->dr_zio; + } else { + /* Our parent is the dnode itself. */ + ASSERT((db->db_level == dn->dn_phys->dn_nlevels-1 && + db->db_blkid != DMU_SPILL_BLKID) || + (db->db_blkid == DMU_SPILL_BLKID && db->db_level == 0)); + if (db->db_blkid != DMU_SPILL_BLKID) + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + pio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(pio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + if (db->db_blkid == DMU_SPILL_BLKID) + wp_flag = WP_SPILL; + wp_flag |= (db->db_state == DB_NOFILL) ? WP_NOFILL : 0; + + dmu_write_policy(os, dn, db->db_level, wp_flag, &zp); + DB_DNODE_EXIT(db); + + /* + * We copy the blkptr now (rather than when we instantiate the dirty + * record), because its value can change between open context and + * syncing context. We do not need to hold dn_struct_rwlock to read + * db_blkptr because we are in syncing context. 
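+	 *
+	 * (The zio writes through dr_bp_copy rather than db_blkptr
+	 * itself; dbuf_write_ready() later copies the finished BP back
+	 * into db_blkptr under the parent's rwlock.)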
+ */ + dr->dr_bp_copy = *db->db_blkptr; + + if (db->db_level == 0 && + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * The BP for this block has been provided by open context + * (by dmu_sync() or dmu_buf_write_embedded()). + */ + abd_t *contents = (data != NULL) ? + abd_get_from_buf(data->b_data, arc_buf_size(data)) : NULL; + + dr->dr_zio = zio_write(pio, os->os_spa, txg, &dr->dr_bp_copy, + contents, db->db.db_size, db->db.db_size, &zp, + dbuf_write_override_ready, NULL, NULL, + dbuf_write_override_done, + dr, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies, dr->dt.dl.dr_nopwrite); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF || + zp.zp_checksum == ZIO_CHECKSUM_NOPARITY); + dr->dr_zio = zio_write(pio, os->os_spa, txg, + &dr->dr_bp_copy, NULL, db->db.db_size, db->db.db_size, &zp, + dbuf_write_nofill_ready, NULL, NULL, + dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + + /* + * For indirect blocks, we want to setup the children + * ready callback so that we can properly handle an indirect + * block that only contains holes. + */ + arc_write_done_func_t *children_ready_cb = NULL; + if (db->db_level != 0) + children_ready_cb = dbuf_write_children_ready; + + dr->dr_zio = arc_write(pio, os->os_spa, txg, + &dr->dr_bp_copy, data, DBUF_IS_L2CACHEABLE(db), + &zp, dbuf_write_ready, + children_ready_cb, dbuf_write_physdone, + dbuf_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED, &zb); + } +} + +EXPORT_SYMBOL(dbuf_find); +EXPORT_SYMBOL(dbuf_is_metadata); +EXPORT_SYMBOL(dbuf_destroy); +EXPORT_SYMBOL(dbuf_loan_arcbuf); +EXPORT_SYMBOL(dbuf_whichblock); +EXPORT_SYMBOL(dbuf_read); +EXPORT_SYMBOL(dbuf_unoverride); +EXPORT_SYMBOL(dbuf_free_range); +EXPORT_SYMBOL(dbuf_new_size); +EXPORT_SYMBOL(dbuf_release_bp); +EXPORT_SYMBOL(dbuf_dirty); +EXPORT_SYMBOL(dmu_buf_set_crypt_params); +EXPORT_SYMBOL(dmu_buf_will_dirty); +EXPORT_SYMBOL(dmu_buf_is_dirty); +EXPORT_SYMBOL(dmu_buf_will_not_fill); +EXPORT_SYMBOL(dmu_buf_will_fill); +EXPORT_SYMBOL(dmu_buf_fill_done); +EXPORT_SYMBOL(dmu_buf_rele); +EXPORT_SYMBOL(dbuf_assign_arcbuf); +EXPORT_SYMBOL(dbuf_prefetch); +EXPORT_SYMBOL(dbuf_hold_impl); +EXPORT_SYMBOL(dbuf_hold); +EXPORT_SYMBOL(dbuf_hold_level); +EXPORT_SYMBOL(dbuf_create_bonus); +EXPORT_SYMBOL(dbuf_spill_set_blksz); +EXPORT_SYMBOL(dbuf_rm_spill); +EXPORT_SYMBOL(dbuf_add_ref); +EXPORT_SYMBOL(dbuf_rele); +EXPORT_SYMBOL(dbuf_rele_and_unlock); +EXPORT_SYMBOL(dbuf_refcount); +EXPORT_SYMBOL(dbuf_sync_list); +EXPORT_SYMBOL(dmu_buf_set_user); +EXPORT_SYMBOL(dmu_buf_set_user_ie); +EXPORT_SYMBOL(dmu_buf_get_user); +EXPORT_SYMBOL(dmu_buf_get_blkptr); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, max_bytes, ULONG, ZMOD_RW, + "Maximum size in bytes of the dbuf cache."); + +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, hiwater_pct, UINT, ZMOD_RW, + "Percentage over dbuf_cache_max_bytes when dbufs must be evicted " + "directly."); + +ZFS_MODULE_PARAM(zfs_dbuf_cache, dbuf_cache_, lowater_pct, UINT, ZMOD_RW, + "Percentage below dbuf_cache_max_bytes when the evict thread stops " + "evicting dbufs."); + +ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_max_bytes, ULONG, ZMOD_RW, + "Maximum size in bytes of the dbuf metadata cache."); + 
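+/*
+ * Editorial sketch, not part of the imported file: how the percentage
+ * tunables above combine with dbuf_cache_max_bytes. The helper names
+ * are hypothetical, but the arithmetic mirrors the watermark
+ * descriptions given in the parameter strings.
+ */
+static inline uint64_t
+dbuf_cache_hiwater_bytes_example(uint64_t max_bytes, uint_t hiwater_pct)
+{
+	/* Above this threshold, dbufs must be evicted directly. */
+	return (max_bytes + max_bytes * hiwater_pct / 100);
+}
+
+static inline uint64_t
+dbuf_cache_lowater_bytes_example(uint64_t max_bytes, uint_t lowater_pct)
+{
+	/* Below this threshold, the evict thread stops evicting. */
+	return (max_bytes - max_bytes * lowater_pct / 100);
+}
+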
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, cache_shift, INT, ZMOD_RW,
+	"Set the size of the dbuf cache to a log2 fraction of arc size.");
+
+ZFS_MODULE_PARAM(zfs_dbuf, dbuf_, metadata_cache_shift, INT, ZMOD_RW,
+	"Set the size of the dbuf metadata cache to a log2 fraction of arc "
+	"size.");
+/* END CSTYLED */
diff --git a/module/zfs/dbuf_stats.c b/module/zfs/dbuf_stats.c
new file mode 100644
index 000000000000..a2f3c580ee6c
--- /dev/null
+++ b/module/zfs/dbuf_stats.c
@@ -0,0 +1,231 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_objset.h>
+
+/*
+ * Calculate the index of the arc header for the state, disabled by default.
+ */
+int zfs_dbuf_state_index = 0;
+
+/*
+ * ==========================================================================
+ * Dbuf Hash Read Routines
+ * ==========================================================================
+ */
+typedef struct dbuf_stats_t {
+	kmutex_t		lock;
+	kstat_t			*kstat;
+	dbuf_hash_table_t	*hash;
+	int			idx;
+} dbuf_stats_t;
+
+static dbuf_stats_t dbuf_stats_hash_table;
+
+static int
+dbuf_stats_hash_table_headers(char *buf, size_t size)
+{
+	(void) snprintf(buf, size,
+	    "%-96s | %-119s | %s\n"
+	    "%-16s %-8s %-8s %-8s %-8s %-10s %-8s %-5s %-5s %-7s %3s | "
+	    "%-5s %-5s %-9s %-6s %-8s %-12s "
+	    "%-6s %-6s %-6s %-6s %-6s %-8s %-8s %-8s %-6s | "
+	    "%-6s %-6s %-8s %-8s %-6s %-6s %-6s %-8s %-8s\n",
+	    "dbuf", "arcbuf", "dnode", "pool", "objset", "object", "level",
+	    "blkid", "offset", "dbsize", "meta", "state", "dbholds", "dbc",
+	    "list", "atype", "flags", "count", "asize", "access",
+	    "mru", "gmru", "mfu", "gmfu", "l2", "l2_dattr", "l2_asize",
+	    "l2_comp", "aholds", "dtype", "btype", "data_bs", "meta_bs",
+	    "bsize", "lvls", "dholds", "blocks", "dsize");
+
+	return (0);
+}
+
+static int
+__dbuf_stats_hash_table_data(char *buf, size_t size, dmu_buf_impl_t *db)
+{
+	arc_buf_info_t abi = { 0 };
+	dmu_object_info_t doi = { 0 };
+	dnode_t *dn = DB_DNODE(db);
+	size_t nwritten;
+
+	if (db->db_buf)
+		arc_buf_info(db->db_buf, &abi, zfs_dbuf_state_index);
+
+	__dmu_object_info_from_dnode(dn, &doi);
+
+	nwritten = snprintf(buf, size,
+	    "%-16s %-8llu %-8lld %-8lld %-8lld %-10llu %-8llu %-5d %-5d "
+	    "%-7lu %-3d | %-5d %-5d 0x%-7x %-6lu %-8llu %-12llu "
+	    "%-6lu %-6lu %-6lu %-6lu %-6lu %-8llu %-8llu %-8d %-6lu | "
+	    "%-6d %-6d %-8lu %-8lu %-6llu %-6lu %-6lu %-8llu %-8llu\n",
+	    /* dmu_buf_impl_t */
+	    spa_name(dn->dn_objset->os_spa),
+	    (u_longlong_t)dmu_objset_id(db->db_objset),
+	    (longlong_t)db->db.db_object,
+	    (longlong_t)db->db_level,
+	    (longlong_t)db->db_blkid,
+	    (u_longlong_t)db->db.db_offset,
+	    (u_longlong_t)db->db.db_size,
+	    !!dbuf_is_metadata(db),
+	    db->db_state,
+	    (ulong_t)zfs_refcount_count(&db->db_holds),
+	    multilist_link_active(&db->db_cache_link),
+	    /* arc_buf_info_t */
+	    abi.abi_state_type,
+	    abi.abi_state_contents,
+	    abi.abi_flags,
+	    (ulong_t)abi.abi_bufcnt,
+	    (u_longlong_t)abi.abi_size,
+	    (u_longlong_t)abi.abi_access,
+	    (ulong_t)abi.abi_mru_hits,
+	    (ulong_t)abi.abi_mru_ghost_hits,
+	    (ulong_t)abi.abi_mfu_hits,
+	    (ulong_t)abi.abi_mfu_ghost_hits,
+	    (ulong_t)abi.abi_l2arc_hits,
+	    (u_longlong_t)abi.abi_l2arc_dattr,
+	    (u_longlong_t)abi.abi_l2arc_asize,
+	    abi.abi_l2arc_compress,
+	    (ulong_t)abi.abi_holds,
+	    /* dmu_object_info_t */
+	    doi.doi_type,
+	    doi.doi_bonus_type,
+	    (ulong_t)doi.doi_data_block_size,
+	    (ulong_t)doi.doi_metadata_block_size,
+	    (u_longlong_t)doi.doi_bonus_size,
+	    (ulong_t)doi.doi_indirection,
+	    (ulong_t)zfs_refcount_count(&dn->dn_holds),
+	    (u_longlong_t)doi.doi_fill_count,
+	    (u_longlong_t)doi.doi_max_offset);
+
+	if (nwritten >= size)
+		return (size);
+
+	return (nwritten + 1);
+}
+
+static int
+dbuf_stats_hash_table_data(char *buf, size_t size, void *data)
+{
+	dbuf_stats_t *dsh = (dbuf_stats_t *)data;
+	dbuf_hash_table_t *h = dsh->hash;
+	dmu_buf_impl_t *db;
+	int length, error = 0;
+
+	ASSERT3S(dsh->idx, >=, 0);
+	ASSERT3S(dsh->idx, <=, h->hash_table_mask);
+	memset(buf, 0, size);
+
+	mutex_enter(DBUF_HASH_MUTEX(h, dsh->idx));
+	for (db = h->hash_table[dsh->idx]; db != NULL; db = db->db_hash_next) {
+		/*
+		 * Returning ENOMEM will cause the data and header functions
+		 * to be called with a larger scratch buffer.
+		 */
+		if (size < 512) {
+			error = SET_ERROR(ENOMEM);
+			break;
+		}
+
+		mutex_enter(&db->db_mtx);
+
+		if (db->db_state != DB_EVICTING) {
+			length = __dbuf_stats_hash_table_data(buf, size, db);
+			buf += length;
+			size -= length;
+		}
+
+		mutex_exit(&db->db_mtx);
+	}
+	mutex_exit(DBUF_HASH_MUTEX(h, dsh->idx));
+
+	return (error);
+}
+
+static void *
+dbuf_stats_hash_table_addr(kstat_t *ksp, loff_t n)
+{
+	dbuf_stats_t *dsh = ksp->ks_private;
+
+	ASSERT(MUTEX_HELD(&dsh->lock));
+
+	if (n <= dsh->hash->hash_table_mask) {
+		dsh->idx = n;
+		return (dsh);
+	}
+
+	return (NULL);
+}
+
+static void
+dbuf_stats_hash_table_init(dbuf_hash_table_t *hash)
+{
+	dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+	kstat_t *ksp;
+
+	mutex_init(&dsh->lock, NULL, MUTEX_DEFAULT, NULL);
+	dsh->hash = hash;
+
+	ksp = kstat_create("zfs", 0, "dbufs", "misc",
+	    KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
+	dsh->kstat = ksp;
+
+	if (ksp) {
+		ksp->ks_lock = &dsh->lock;
+		ksp->ks_ndata = UINT32_MAX;
+		ksp->ks_private = dsh;
+		kstat_set_raw_ops(ksp, dbuf_stats_hash_table_headers,
+		    dbuf_stats_hash_table_data, dbuf_stats_hash_table_addr);
+		kstat_install(ksp);
+	}
+}
+
+static void
+dbuf_stats_hash_table_destroy(void)
+{
+	dbuf_stats_t *dsh = &dbuf_stats_hash_table;
+	kstat_t *ksp;
+
+	ksp = dsh->kstat;
+	if (ksp)
+		kstat_delete(ksp);
+
+	mutex_destroy(&dsh->lock);
+}
+
+void
+dbuf_stats_init(dbuf_hash_table_t *hash)
+{
+	dbuf_stats_hash_table_init(hash);
+}
+
+void
+dbuf_stats_destroy(void)
+{
+	dbuf_stats_hash_table_destroy();
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs, zfs_, dbuf_state_index, INT, ZMOD_RW,
+	"Calculate arc header index");
+/* END CSTYLED */
diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c
new file mode 100644
index 000000000000..b94a9f54ece3
--- /dev/null
+++ b/module/zfs/ddt.c
@@ -0,0 +1,1187 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/dsl_pool.h>
+#include <sys/zio_checksum.h>
+#include <sys/zio_compress.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+
+static kmem_cache_t *ddt_cache;
+static kmem_cache_t *ddt_entry_cache;
+
+/*
+ * Enable/disable prefetching of dedup-ed blocks which are going to be freed.
+ */
+int zfs_dedup_prefetch = 0;
+
+static const ddt_ops_t *ddt_ops[DDT_TYPES] = {
+	&ddt_zap_ops,
+};
+
+static const char *ddt_class_name[DDT_CLASSES] = {
+	"ditto",
+	"duplicate",
+	"unique",
+};
+
+static void
+ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+	objset_t *os = ddt->ddt_os;
+	uint64_t *objectp = &ddt->ddt_object[type][class];
+	boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_flags &
+	    ZCHECKSUM_FLAG_DEDUP;
+	char name[DDT_NAMELEN];
+
+	ddt_object_name(ddt, type, class, name);
+
+	ASSERT(*objectp == 0);
+	VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0);
+	ASSERT(*objectp != 0);
+
+	VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name,
+	    sizeof (uint64_t), 1, objectp, tx) == 0);
+
+	VERIFY(zap_add(os, spa->spa_ddt_stat_object, name,
+	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+	    &ddt->ddt_histogram[type][class], tx) == 0);
+}
+
+static void
+ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class,
+    dmu_tx_t *tx)
+{
+	spa_t *spa = ddt->ddt_spa;
+	objset_t *os = ddt->ddt_os;
+	uint64_t *objectp = &ddt->ddt_object[type][class];
+	uint64_t count;
+	char name[DDT_NAMELEN];
+
+	ddt_object_name(ddt, type, class, name);
+
+	ASSERT(*objectp != 0);
+	ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class]));
+	VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0);
+	VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0);
+	VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0);
+	VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0);
+	bzero(&ddt->ddt_object_stats[type][class], sizeof (ddt_object_t));
+
+	*objectp = 0;
+}
+
+static int
+ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	ddt_object_t *ddo = &ddt->ddt_object_stats[type][class];
+	dmu_object_info_t doi;
+	uint64_t count;
+	char name[DDT_NAMELEN];
+	int error;
+
+	ddt_object_name(ddt, type, class, name);
+
+	error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name,
+	    sizeof (uint64_t), 1, &ddt->ddt_object[type][class]);
+	if (error != 0)
+		return (error);
+
+	error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name,
+	    sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t),
+	    &ddt->ddt_histogram[type][class]);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Seed the cached statistics.
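+	 * (ddo_dspace is kept in bytes: doi_physical_blocks_512 counts
+	 * 512-byte blocks, hence the << 9 below. ddo_mspace estimates
+	 * in-core size as the fill count times the data block size.)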
+ */ + error = ddt_object_info(ddt, type, class, &doi); + if (error) + return (error); + + error = ddt_object_count(ddt, type, class, &count); + if (error) + return (error); + + ddo->ddo_count = count; + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; + + return (0); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; + dmu_object_info_t doi; + uint64_t count; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); + + /* + * Cache DDT statistics; this is the only time they'll change. + */ + VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); + VERIFY(ddt_object_count(ddt, type, class, &count) == 0); + + ddo->ddo_count = count; + ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; + ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static void +ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return; + + ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, + ddt->ddt_object[type][class], dde); +} + +int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +int +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *count) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class], count)); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = 
ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +/* + * The bp created via this function may be used for repairs and scrub, but it + * will be missing the salt / IV required to do a full decrypting read. + */ +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CRYPT(bp, DDK_GET_CRYPT(ddk)); + BP_SET_FILL(bp, 1); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_DEDUP); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 1); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + ASSERT(BP_IS_ENCRYPTED(bp) || !BP_USES_CRYPT(bp)); + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); + DDK_SET_CRYPT(ddk, BP_USES_CRYPT(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + if (ddp) { + ASSERT(ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; + } +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + + /* + * We clear the dedup bit so that zio_free() will actually free the + * space, rather than just decrementing the refcount in the DDT. 
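+	 *
+	 * (With the dedup bit still set, zio_free() would route back
+	 * through the DDT and merely drop a reference rather than
+	 * releasing the space itself, which is the opposite of what
+	 * this function is for.)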
+ */ + BP_SET_DEDUP(&blk, 0); + + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < DDE_GET_NDVAS(dde); d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. 
*/ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total; + + if (spa->spa_dedup_dspace != ~0ULL) + return (spa->spa_dedup_dspace); + + bzero(&dds_total, sizeof (ddt_stat_t)); + + /* Calculate and cache the stats */ + ddt_get_dedup_stats(spa, &dds_total); + spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; + return (spa->spa_dedup_dspace); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = cpfunc; + /* CONSTCOND */ + if (ZFS_HOST_BYTEORDER) + *version |= DDT_COMPRESS_BYTEORDER_MASK; + + return (c_len + 1); +} + +void +ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +void +ddt_init(void) +{ + ddt_cache = kmem_cache_create("ddt_cache", + sizeof (ddt_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + ddt_entry_cache = kmem_cache_create("ddt_entry_cache", + sizeof (ddt_entry_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +ddt_fini(void) +{ + kmem_cache_destroy(ddt_entry_cache); + kmem_cache_destroy(ddt_cache); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); + bzero(dde, sizeof (ddt_entry_t)); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_abd != 
NULL) + abd_free(dde->dde_repair_abd); + + cv_destroy(&dde->dde_cv); + kmem_cache_free(ddt_entry_cache, dde); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) { + ASSERT0(error); + break; + } + } + if (error != ENOENT) + break; + } + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +void +ddt_prefetch(spa_t *spa, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) + return; + + /* + * We only remove the DDT once all tables are empty and only + * prefetch dedup blocks when there are entries in the DDT. + * Thus no locking is required as the DDT can't disappear on us. 
+ */ + ddt = ddt_select(spa, bp); + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + ddt_object_prefetch(ddt, type, class, &dde); + } + } +} + +/* + * Opaque struct used for ddt_key comparison + */ +#define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) + +typedef struct ddt_key_cmp { + uint16_t u16[DDT_KEY_CMP_LEN]; +} ddt_key_cmp_t; + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key; + const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key; + int32_t cmp = 0; + + for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { + cmp = (int32_t)k1->u16[i] - (int32_t)k2->u16[i]; + if (likely(cmp)) + break; + } + + return (TREE_ISIGN(cmp)); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_cache_alloc(ddt_cache, KM_SLEEP); + bzero(ddt, sizeof (ddt_t)); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_cache_free(ddt_cache, ddt); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + + /* + * Seed the cached histograms. 
+ */ + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + spa->spa_dedup_dspace = ~0ULL; + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t *dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); + + ddt_key_fill(&(dde->dde_key), bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class <= max_class; class++) { + if (ddt_object_lookup(ddt, type, class, dde) == 0) { + kmem_cache_free(ddt_entry_cache, dde); + return (B_TRUE); + } + } + } + + kmem_cache_free(ddt_entry_cache, dde); + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_abd != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_abd, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + 
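+/*
+ * Editorial sketch, not part of the imported file: ddt_stat_add()
+ * above folds addition and subtraction into one loop with the
+ * expression (*s ^ neg) - neg. With neg == 0 the expression is just
+ * *s; with neg == -1ULL it is ~*s + 1, i.e. the two's-complement
+ * negation. The hypothetical helper below shows the idiom in
+ * isolation.
+ */
+static inline uint64_t
+ddt_cond_negate_example(uint64_t v, uint64_t neg)
+{
+	/* v ^ 0 == v; v ^ ~0ULL == ~v, and ~v - (-1) == ~v + 1 == -v. */
+	return ((v ^ neg) - neg);
+}
+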
+static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + /* + * Note, we no longer create DDT-DITTO blocks, but we + * don't want to leak any written by older software. + */ + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + /* We do not create new DDT-DITTO blocks. */ + ASSERT0(dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth); + if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + /* + * If the class changes, the order that we scan this bp + * changes. If it decreases, we could miss it, so + * scan it right now. (This covers both class changing + * while we are doing ddt_walk(), and when we are + * traversing.) 
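+		 *
+		 * (ddt_walk() iterates classes in ascending order in its
+		 * outermost loop, so an entry whose class number just
+		 * decreased may now sort behind the scan's bookmark and
+		 * would otherwise never be visited.)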
+ */ + if (nclass < oclass) { + dsl_scan_ddt_entry(dp->dp_scan, + ddt->ddt_checksum, dde, tx); + } + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, tx); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + uint64_t add, count = 0; + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (ddt_object_exists(ddt, type, class)) { + ddt_object_sync(ddt, type, class, tx); + VERIFY(ddt_object_count(ddt, type, class, + &add) == 0); + count += add; + } + } + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (count == 0 && ddt_object_exists(ddt, type, class)) + ddt_object_destroy(ddt, type, class, tx); + } + } + + bcopy(ddt->ddt_histogram, &ddt->ddt_histogram_cache, + sizeof (ddt->ddt_histogram)); + spa->spa_dedup_dspace = ~0ULL; +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dmu_tx_t *tx; + zio_t *rio; + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + + /* + * This function may cause an immediate scan of ddt blocks (see + * the comment above dsl_scan_ddt() for details). We set the + * scan's root zio here so that we can wait for any scan IOs in + * addition to the regular ddt IOs. + */ + ASSERT3P(scn->scn_zio_root, ==, NULL); + scn->scn_zio_root = rio; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + scn->scn_zio_root = NULL; + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + dde->dde_type = ddb->ddb_type; + dde->dde_class = ddb->ddb_class; + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (SET_ERROR(ENOENT)); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_dedup, zfs_dedup_, prefetch, INT, ZMOD_RW, + "Enable prefetching dedup-ed blks"); +/* END CSTYLED */ diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c new file mode 100644 index 000000000000..c5c9eda0b2d0 --- /dev/null +++ b/module/zfs/ddt_zap.c @@ -0,0 +1,168 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/ddt.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+int ddt_zap_leaf_blockshift = 12;
+int ddt_zap_indirect_blockshift = 12;
+
+static int
+ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash)
+{
+	zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY;
+
+	if (prehash)
+		flags |= ZAP_FLAG_PRE_HASHED_KEY;
+
+	*objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP,
+	    ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift,
+	    DMU_OT_NONE, 0, tx);
+
+	return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0);
+}
+
+static int
+ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx)
+{
+	return (zap_destroy(os, object, tx));
+}
+
+static int
+ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+	uchar_t *cbuf;
+	uint64_t one, csize;
+	int error;
+
+	cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP);
+
+	error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, &one, &csize);
+	if (error)
+		goto out;
+
+	ASSERT(one == 1);
+	ASSERT(csize <= (sizeof (dde->dde_phys) + 1));
+
+	error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, 1, csize, cbuf);
+	if (error)
+		goto out;
+
+	ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys));
+out:
+	kmem_free(cbuf, sizeof (dde->dde_phys) + 1);
+
+	return (error);
+}
+
+static void
+ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde)
+{
+	(void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS);
+}
+
+static int
+ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	uchar_t cbuf[sizeof (dde->dde_phys) + 1];
+	uint64_t csize;
+
+	csize = ddt_compress(dde->dde_phys, cbuf,
+	    sizeof (dde->dde_phys), sizeof (cbuf));
+
+	return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, 1, csize, cbuf, tx));
+}
+
+static int
+ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx)
+{
+	return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key,
+	    DDT_KEY_WORDS, tx));
+}
+
+static int
+ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	int error;
+
+	if (*walk == 0) {
+		/*
+		 * We don't want to prefetch the entire ZAP object, because
+		 * it can be enormous.  Also the primary use of DDT iteration
+		 * is for scrubbing, in which case we will be issuing many
+		 * scrub I/Os for each ZAP block that we read in, so
+		 * reading the ZAP is unlikely to be the bottleneck.
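+		 *
+		 * (The *walk cookie is a serialized ZAP cursor, so a
+		 * DDT iteration can be suspended and resumed across
+		 * calls; note the zap_cursor_serialize() call below.)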
+ */ + zap_cursor_init_noprefetch(&zc, os, object); + } else { + zap_cursor_init_serialized(&zc, os, object, *walk); + } + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static int +ddt_zap_count(objset_t *os, uint64_t object, uint64_t *count) +{ + return (zap_count(os, object, count)); +} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_prefetch, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c new file mode 100644 index 000000000000..06d6df618748 --- /dev/null +++ b/module/zfs/dmu.c @@ -0,0 +1,2482 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2016, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2015 by Chunwei Chen. All rights reserved. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#endif + +/* + * Enable/disable nopwrite feature. + */ +int zfs_nopwrite_enabled = 1; + +/* + * Tunable to control percentage of dirtied L1 blocks from frees allowed into + * one TXG. After this threshold is crossed, additional dirty blocks from frees + * will wait until the next TXG. + * A value of zero will disable this throttle. + */ +unsigned long zfs_per_txg_dirty_frees_percent = 5; + +/* + * Enable/disable forcing txg sync when dirty in dmu_offset_next. + */ +int zfs_dmu_offset_next_sync = 0; + +/* + * Limit the amount we can prefetch with one call to this amount. This + * helps to limit the amount of memory that can be used by prefetching. + * Larger objects should be prefetched a bit at a time. 
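+ *
+ * (Worked example: with SPA_MAXBLOCKSIZE at 16 MiB, the default below
+ * caps a single dmu_prefetch() call at 8 * 16 MiB = 128 MiB.)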
+ */ +int dmu_prefetch_max = 8 * SPA_MAXBLOCKSIZE; + +const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { + {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "unallocated" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "object directory" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "object array" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "packed nvlist" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "packed nvlist size" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj header" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map header" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA space map" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, TRUE, "ZIL intent log" }, + {DMU_BSWAP_DNODE, TRUE, FALSE, TRUE, "DMU dnode" }, + {DMU_BSWAP_OBJSET, TRUE, TRUE, FALSE, "DMU objset" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL directory" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL directory child map"}, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset snap map" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL props" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL dataset" }, + {DMU_BSWAP_ZNODE, TRUE, FALSE, FALSE, "ZFS znode" }, + {DMU_BSWAP_OLDACL, TRUE, FALSE, TRUE, "ZFS V0 ACL" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "ZFS plain file" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS directory" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "ZFS master node" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS delete queue" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "zvol object" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "zvol prop" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "other uint8[]" }, + {DMU_BSWAP_UINT64, FALSE, FALSE, TRUE, "other uint64[]" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "other ZAP" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "persistent error log" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, FALSE, "SPA history" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "SPA history offsets" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "Pool properties" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL permissions" }, + {DMU_BSWAP_ACL, TRUE, FALSE, TRUE, "ZFS ACL" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "ZFS SYSACL" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "FUID table" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "FUID table size" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dataset next clones"}, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan work queue" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project used" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "ZFS user/group/project quota"}, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "snapshot refcount tags"}, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT ZAP algorithm" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "DDT statistics" }, + {DMU_BSWAP_UINT8, TRUE, FALSE, TRUE, "System attributes" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA master node" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr registration" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, TRUE, "SA attr layouts" }, + {DMU_BSWAP_ZAP, TRUE, FALSE, FALSE, "scan translations" }, + {DMU_BSWAP_UINT8, FALSE, FALSE, TRUE, "deduplicated block" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL deadlist map" }, + {DMU_BSWAP_UINT64, TRUE, TRUE, FALSE, "DSL deadlist map hdr" }, + {DMU_BSWAP_ZAP, TRUE, TRUE, FALSE, "DSL dir clones" }, + {DMU_BSWAP_UINT64, TRUE, FALSE, FALSE, "bpobj subobj" } +}; + +const dmu_object_byteswap_info_t dmu_ot_byteswap[DMU_BSWAP_NUMFUNCS] = { + { byteswap_uint8_array, "uint8" }, + { byteswap_uint16_array, "uint16" }, + { byteswap_uint32_array, "uint32" }, + { byteswap_uint64_array, "uint64" }, + 
{ zap_byteswap, "zap" }, + { dnode_buf_byteswap, "dnode" }, + { dmu_objset_byteswap, "objset" }, + { zfs_znode_byteswap, "znode" }, + { zfs_oldacl_byteswap, "oldacl" }, + { zfs_acl_byteswap, "acl" } +}; + +static int +dmu_buf_hold_noread_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + uint64_t blkid; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (0); +} +int +dmu_buf_hold_noread(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp) +{ + dnode_t *dn; + uint64_t blkid; + dmu_buf_impl_t *db; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); + db = dbuf_hold(dn, blkid, tag); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + + if (db == NULL) { + *dbp = NULL; + return (SET_ERROR(EIO)); + } + + *dbp = &db->db; + return (err); +} + +int +dmu_buf_hold_by_dnode(dnode_t *dn, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; + + err = dmu_buf_hold_noread_by_dnode(dn, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { + dbuf_rele(db, tag); + *dbp = NULL; + } + } + + return (err); +} + +int +dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, + void *tag, dmu_buf_t **dbp, int flags) +{ + int err; + int db_flags = DB_RF_CANFAIL; + + if (flags & DMU_READ_NO_PREFETCH) + db_flags |= DB_RF_NOPREFETCH; + if (flags & DMU_READ_NO_DECRYPT) + db_flags |= DB_RF_NO_DECRYPT; + + err = dmu_buf_hold_noread(os, object, offset, tag, dbp); + if (err == 0) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)(*dbp); + err = dbuf_read(db, NULL, db_flags); + if (err != 0) { + dbuf_rele(db, tag); + *dbp = NULL; + } + } + + return (err); +} + +int +dmu_bonus_max(void) +{ + return (DN_OLD_MAX_BONUSLEN); +} + +int +dmu_set_bonus(dmu_buf_t *db_fake, int newsize, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (dn->dn_bonus != db) { + error = SET_ERROR(EINVAL); + } else if (newsize < 0 || newsize > db_fake->db_size) { + error = SET_ERROR(EINVAL); + } else { + dnode_setbonuslen(dn, newsize, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +int +dmu_set_bonustype(dmu_buf_t *db_fake, dmu_object_type_t type, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int error; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + if (!DMU_OT_IS_VALID(type)) { + error = SET_ERROR(EINVAL); + } else if (dn->dn_bonus != db) { + error = SET_ERROR(EINVAL); + } else { + dnode_setbonus_type(dn, type, tx); + error = 0; + } + + DB_DNODE_EXIT(db); + return (error); +} + +dmu_object_type_t +dmu_get_bonustype(dmu_buf_t *db_fake) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + dmu_object_type_t type; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + type = dn->dn_bonustype; + DB_DNODE_EXIT(db); + + return (type); +} + +int +dmu_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; 
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error != 0)
+		return (error);
+	dbuf_rm_spill(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dnode_rm_spill(dn, tx);
+	rw_exit(&dn->dn_struct_rwlock);
+	dnode_rele(dn, FTAG);
+	return (error);
+}
+
+/*
+ * Lookup and hold the bonus buffer for the provided dnode.  If the dnode
+ * has not yet been allocated a bonus dbuf, a new one will be allocated.
+ * Returns ENOENT, EIO, or 0.
+ */
+int
+dmu_bonus_hold_by_dnode(dnode_t *dn, void *tag, dmu_buf_t **dbp,
+    uint32_t flags)
+{
+	dmu_buf_impl_t *db;
+	int error;
+	uint32_t db_flags = DB_RF_MUST_SUCCEED;
+
+	if (flags & DMU_READ_NO_PREFETCH)
+		db_flags |= DB_RF_NOPREFETCH;
+	if (flags & DMU_READ_NO_DECRYPT)
+		db_flags |= DB_RF_NO_DECRYPT;
+
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (dn->dn_bonus == NULL) {
+		rw_exit(&dn->dn_struct_rwlock);
+		rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+		if (dn->dn_bonus == NULL)
+			dbuf_create_bonus(dn);
+	}
+	db = dn->dn_bonus;
+
+	/* as long as the bonus buf is held, the dnode will be held */
+	if (zfs_refcount_add(&db->db_holds, tag) == 1) {
+		VERIFY(dnode_add_ref(dn, db));
+		atomic_inc_32(&dn->dn_dbufs_count);
+	}
+
+	/*
+	 * Wait to drop dn_struct_rwlock until after adding the bonus dbuf's
+	 * hold and incrementing the dbuf count to ensure that dnode_move() sees
+	 * a dnode hold for every dbuf.
+	 */
+	rw_exit(&dn->dn_struct_rwlock);
+
+	error = dbuf_read(db, NULL, db_flags);
+	if (error) {
+		dnode_evict_bonus(dn);
+		dbuf_rele(db, tag);
+		*dbp = NULL;
+		return (error);
+	}
+
+	*dbp = &db->db;
+	return (0);
+}
+
+int
+dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp)
+{
+	dnode_t *dn;
+	int error;
+
+	error = dnode_hold(os, object, FTAG, &dn);
+	if (error)
+		return (error);
+
+	error = dmu_bonus_hold_by_dnode(dn, tag, dbp, DMU_READ_NO_PREFETCH);
+	dnode_rele(dn, FTAG);
+
+	return (error);
+}
+
+/*
+ * Returns ENOENT, EIO, or 0.
+ *
+ * This interface will allocate a blank spill dbuf when a spill blk
+ * doesn't already exist on the dnode.
+ *
+ * If you only want to find an already existing spill db, then
+ * dmu_spill_hold_existing() should be used.
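+ *
+ * Illustrative call pattern (sketch only; "bonus_db" is a held bonus
+ * dbuf, cf. the SA code in sa.c, which is the main consumer):
+ *
+ *	dmu_buf_t *spill_db;
+ *	if (dmu_spill_hold_existing(bonus_db, FTAG, &spill_db) == 0) {
+ *		... use attributes that spilled out of the bonus buffer ...
+ *		dmu_buf_rele(spill_db, FTAG);
+ *	}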
+ */
+int
+dmu_spill_hold_by_dnode(dnode_t *dn, uint32_t flags, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = NULL;
+	int err;
+
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+	db = dbuf_hold(dn, DMU_SPILL_BLKID, tag);
+
+	if ((flags & DB_RF_HAVESTRUCT) == 0)
+		rw_exit(&dn->dn_struct_rwlock);
+
+	if (db == NULL) {
+		*dbp = NULL;
+		return (SET_ERROR(EIO));
+	}
+	err = dbuf_read(db, NULL, flags);
+	if (err == 0)
+		*dbp = &db->db;
+	else {
+		dbuf_rele(db, tag);
+		*dbp = NULL;
+	}
+	return (err);
+}
+
+int
+dmu_spill_hold_existing(dmu_buf_t *bonus, void *tag, dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+
+	if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_SA) {
+		err = SET_ERROR(EINVAL);
+	} else {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+
+		if (!dn->dn_have_spill) {
+			err = SET_ERROR(ENOENT);
+		} else {
+			err = dmu_spill_hold_by_dnode(dn,
+			    DB_RF_HAVESTRUCT | DB_RF_CANFAIL, tag, dbp);
+		}
+
+		rw_exit(&dn->dn_struct_rwlock);
+	}
+
+	DB_DNODE_EXIT(db);
+	return (err);
+}
+
+int
+dmu_spill_hold_by_bonus(dmu_buf_t *bonus, uint32_t flags, void *tag,
+    dmu_buf_t **dbp)
+{
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)bonus;
+	dnode_t *dn;
+	int err;
+	uint32_t db_flags = DB_RF_CANFAIL;
+
+	if (flags & DMU_READ_NO_DECRYPT)
+		db_flags |= DB_RF_NO_DECRYPT;
+
+	DB_DNODE_ENTER(db);
+	dn = DB_DNODE(db);
+	err = dmu_spill_hold_by_dnode(dn, db_flags, tag, dbp);
+	DB_DNODE_EXIT(db);
+
+	return (err);
+}
+
+/*
+ * Note: longer-term, we should modify all of the dmu_buf_*() interfaces
+ * to take a held dnode rather than <os, object> -- the lookup is wasteful,
+ * and can induce severe lock contention when writing to several files
+ * whose dnodes are in the same block.
+ */
+int
+dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
+    boolean_t read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags)
+{
+	dmu_buf_t **dbp;
+	uint64_t blkid, nblks, i;
+	uint32_t dbuf_flags;
+	int err;
+	zio_t *zio;
+
+	ASSERT(length <= DMU_MAX_ACCESS);
+
+	/*
+	 * Note: We directly notify the prefetch code of this read, so that
+	 * we can tell it about the multi-block read.  dbuf_read() only knows
+	 * about the one block it is accessing.
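+	 *
+	 * Calling pattern in sketch form (cf. dmu_read_impl() below):
+	 *
+	 *	dmu_buf_t **dbp;
+	 *	int numbufs;
+	 *	err = dmu_buf_hold_array_by_dnode(dn, offset, len, TRUE,
+	 *	    FTAG, &numbufs, &dbp, DMU_READ_PREFETCH);
+	 *	if (err == 0) {
+	 *		... copy to or from dbp[0 .. numbufs - 1] ...
+	 *		dmu_buf_rele_array(dbp, numbufs, FTAG);
+	 *	}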
+ */ + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT | + DB_RF_NOPREFETCH; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + if (dn->dn_datablkshift) { + int blkshift = dn->dn_datablkshift; + nblks = (P2ROUNDUP(offset + length, 1ULL << blkshift) - + P2ALIGN(offset, 1ULL << blkshift)) >> blkshift; + } else { + if (offset + length > dn->dn_datablksz) { + zfs_panic_recover("zfs: accessing past end of object " + "%llx/%llx (size=%u access=%llu+%llu)", + (longlong_t)dn->dn_objset-> + os_dsl_dataset->ds_object, + (longlong_t)dn->dn_object, dn->dn_datablksz, + (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); + return (SET_ERROR(EIO)); + } + nblks = 1; + } + dbp = kmem_zalloc(sizeof (dmu_buf_t *) * nblks, KM_SLEEP); + + zio = zio_root(dn->dn_objset->os_spa, NULL, NULL, ZIO_FLAG_CANFAIL); + blkid = dbuf_whichblock(dn, 0, offset); + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = dbuf_hold(dn, blkid + i, tag); + if (db == NULL) { + rw_exit(&dn->dn_struct_rwlock); + dmu_buf_rele_array(dbp, nblks, tag); + zio_nowait(zio); + return (SET_ERROR(EIO)); + } + + /* initiate async i/o */ + if (read) + (void) dbuf_read(db, zio, dbuf_flags); + dbp[i] = &db->db; + } + + if ((flags & DMU_READ_NO_PREFETCH) == 0 && + DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) { + dmu_zfetch(&dn->dn_zfetch, blkid, nblks, + read && DNODE_IS_CACHEABLE(dn), B_TRUE); + } + rw_exit(&dn->dn_struct_rwlock); + + /* wait for async i/o */ + err = zio_wait(zio); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + + /* wait for other io to complete */ + if (read) { + for (i = 0; i < nblks; i++) { + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbp[i]; + mutex_enter(&db->db_mtx); + while (db->db_state == DB_READ || + db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + if (db->db_state == DB_UNCACHED) + err = SET_ERROR(EIO); + mutex_exit(&db->db_mtx); + if (err) { + dmu_buf_rele_array(dbp, nblks, tag); + return (err); + } + } + } + + *numbufsp = nblks; + *dbpp = dbp; + return (0); +} + +static int +dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, + uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_buf_hold_array_by_bonus(dmu_buf_t *db_fake, uint64_t offset, + uint64_t length, boolean_t read, void *tag, int *numbufsp, + dmu_buf_t ***dbpp) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + int err; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, + numbufsp, dbpp, DMU_READ_PREFETCH); + DB_DNODE_EXIT(db); + + return (err); +} + +void +dmu_buf_rele_array(dmu_buf_t **dbp_fake, int numbufs, void *tag) +{ + int i; + dmu_buf_impl_t **dbp = (dmu_buf_impl_t **)dbp_fake; + + if (numbufs == 0) + return; + + for (i = 0; i < numbufs; i++) { + if (dbp[i]) + dbuf_rele(dbp[i], tag); + } + + kmem_free(dbp, sizeof (dmu_buf_t *) * numbufs); +} + +/* + * Issue prefetch i/os for the given blocks. If level is greater than 0, the + * indirect blocks prefetched will be those that point to the blocks containing + * the data starting at offset, and continuing to offset + len. 
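+ *
+ * (Illustrative call: to prefetch the first megabyte of an object's
+ * level-0 data blocks one might issue
+ *
+ *	dmu_prefetch(os, object, 0, 0, 1 << 20, ZIO_PRIORITY_ASYNC_READ);
+ *
+ * subject to the dmu_prefetch_max clamp applied below.)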
+ *
+ * Note that if the indirect blocks above the blocks being prefetched are not
+ * in cache, they will be asynchronously read in.
+ */
+void
+dmu_prefetch(objset_t *os, uint64_t object, int64_t level, uint64_t offset,
+    uint64_t len, zio_priority_t pri)
+{
+	dnode_t *dn;
+	uint64_t blkid;
+	int nblks, err;
+
+	if (len == 0) { /* they're interested in the bonus buffer */
+		dn = DMU_META_DNODE(os);
+
+		if (object == 0 || object >= DN_MAX_OBJECT)
+			return;
+
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		blkid = dbuf_whichblock(dn, level,
+		    object * sizeof (dnode_phys_t));
+		dbuf_prefetch(dn, level, blkid, pri, 0);
+		rw_exit(&dn->dn_struct_rwlock);
+		return;
+	}
+
+	/*
+	 * See comment before the definition of dmu_prefetch_max.
+	 */
+	len = MIN(len, dmu_prefetch_max);
+
+	/*
+	 * XXX - Note, if the dnode for the requested object is not
+	 * already cached, we will do a *synchronous* read in the
+	 * dnode_hold() call.  The same is true for any indirects.
+	 */
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err != 0)
+		return;
+
+	/*
+	 * offset + len - 1 is the last byte we want to prefetch for, and offset
+	 * is the first.  Then dbuf_whichblock(dn, level, offset + len - 1) is
+	 * the last block we want to prefetch, and dbuf_whichblock(dn, level,
+	 * offset) is the first.  Then the number we need to prefetch is the
+	 * last - first + 1.
+	 */
+	rw_enter(&dn->dn_struct_rwlock, RW_READER);
+	if (level > 0 || dn->dn_datablkshift != 0) {
+		nblks = dbuf_whichblock(dn, level, offset + len - 1) -
+		    dbuf_whichblock(dn, level, offset) + 1;
+	} else {
+		nblks = (offset < dn->dn_datablksz);
+	}
+
+	if (nblks != 0) {
+		blkid = dbuf_whichblock(dn, level, offset);
+		for (int i = 0; i < nblks; i++)
+			dbuf_prefetch(dn, level, blkid + i, pri, 0);
+	}
+	rw_exit(&dn->dn_struct_rwlock);
+
+	dnode_rele(dn, FTAG);
+}
+
+/*
+ * Get the next "chunk" of file data to free.  We traverse the file from
+ * the end so that the file gets shorter over time (if we crash in the
+ * middle, this will leave us in a better state).  We find allocated file
+ * data by simply searching the allocated level 1 indirects.
+ *
+ * On input, *start should be the first offset that does not need to be
+ * freed (e.g. "offset + length").  On return, *start will be the first
+ * offset that should be freed and l1blks is set to the number of level 1
+ * indirect blocks found within the chunk.
+ */
+static int
+get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t minimum, uint64_t *l1blks)
+{
+	uint64_t blks;
+	uint64_t maxblks = DMU_MAX_ACCESS >> (dn->dn_indblkshift + 1);
+	/* bytes of data covered by a level-1 indirect block */
+	uint64_t iblkrange = (uint64_t)dn->dn_datablksz *
+	    EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT);
+
+	ASSERT3U(minimum, <=, *start);
+
+	/*
+	 * Check if we can free the entire range assuming that all of the
+	 * L1 blocks in this range have data.  If we can, we use this
+	 * worst case value as an estimate so we can avoid having to look
+	 * at the object's actual data.
+	 */
+	uint64_t total_l1blks =
+	    (roundup(*start, iblkrange) - (minimum / iblkrange * iblkrange)) /
+	    iblkrange;
+	if (total_l1blks <= maxblks) {
+		*l1blks = total_l1blks;
+		*start = minimum;
+		return (0);
+	}
+	ASSERT(ISP2(iblkrange));
+
+	for (blks = 0; *start > minimum && blks < maxblks; blks++) {
+		int err;
+
+		/*
+		 * dnode_next_offset(BACKWARDS) will find an allocated L1
+		 * indirect block at or before the input offset.  We must
+		 * decrement *start so that it is at the end of the region
+		 * to search.
+ */ + (*start)--; + + err = dnode_next_offset(dn, + DNODE_FIND_BACKWARDS, start, 2, 1, 0); + + /* if there are no indirect blocks before start, we are done */ + if (err == ESRCH) { + *start = minimum; + break; + } else if (err != 0) { + *l1blks = blks; + return (err); + } + + /* set start to the beginning of this L1 indirect */ + *start = P2ALIGN(*start, iblkrange); + } + if (*start < minimum) + *start = minimum; + *l1blks = blks; + + return (0); +} + +/* + * If this objset is of type OST_ZFS return true if vfs's unmounted flag is set, + * otherwise return false. + * Used below in dmu_free_long_range_impl() to enable abort when unmounting + */ +/*ARGSUSED*/ +static boolean_t +dmu_objset_zfs_unmounting(objset_t *os) +{ +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) + return (zfs_get_vfs_flag_unmounted(os)); +#endif + return (B_FALSE); +} + +static int +dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, + uint64_t length) +{ + uint64_t object_size; + int err; + uint64_t dirty_frees_threshold; + dsl_pool_t *dp = dmu_objset_pool(os); + + if (dn == NULL) + return (SET_ERROR(EINVAL)); + + object_size = (dn->dn_maxblkid + 1) * dn->dn_datablksz; + if (offset >= object_size) + return (0); + + if (zfs_per_txg_dirty_frees_percent <= 100) + dirty_frees_threshold = + zfs_per_txg_dirty_frees_percent * zfs_dirty_data_max / 100; + else + dirty_frees_threshold = zfs_dirty_data_max / 20; + + if (length == DMU_OBJECT_END || offset + length > object_size) + length = object_size - offset; + + while (length != 0) { + uint64_t chunk_end, chunk_begin, chunk_len; + uint64_t l1blks; + dmu_tx_t *tx; + + if (dmu_objset_zfs_unmounting(dn->dn_objset)) + return (SET_ERROR(EINTR)); + + chunk_end = chunk_begin = offset + length; + + /* move chunk_begin backwards to the beginning of this chunk */ + err = get_next_chunk(dn, &chunk_begin, offset, &l1blks); + if (err) + return (err); + ASSERT3U(chunk_begin, >=, offset); + ASSERT3U(chunk_begin, <=, chunk_end); + + chunk_len = chunk_end - chunk_begin; + + tx = dmu_tx_create(os); + dmu_tx_hold_free(tx, dn->dn_object, chunk_begin, chunk_len); + + /* + * Mark this transaction as typically resulting in a net + * reduction in space used. + */ + dmu_tx_mark_netfree(tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&dp->dp_lock); + uint64_t long_free_dirty = + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK]; + mutex_exit(&dp->dp_lock); + + /* + * To avoid filling up a TXG with just frees, wait for + * the next TXG to open before freeing more chunks if + * we have reached the threshold of frees. + */ + if (dirty_frees_threshold != 0 && + long_free_dirty >= dirty_frees_threshold) { + DMU_TX_STAT_BUMP(dmu_tx_dirty_frees_delay); + dmu_tx_commit(tx); + txg_wait_open(dp, 0, B_TRUE); + continue; + } + + /* + * In order to prevent unnecessary write throttling, for each + * TXG, we track the cumulative size of L1 blocks being dirtied + * in dnode_free_range() below. We compare this number to a + * tunable threshold, past which we prevent new L1 dirty freeing + * blocks from being added into the open TXG. See + * dmu_free_long_range_impl() for details. The threshold + * prevents write throttle activation due to dirty freeing L1 + * blocks taking up a large percentage of zfs_dirty_data_max. 
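+		 *
+		 * Worked example: with zfs_dirty_data_max at 4 GiB and
+		 * the default zfs_per_txg_dirty_frees_percent of 5, at
+		 * most ~205 MiB of L1 indirects may be dirtied by frees
+		 * in one txg before further freeing waits for the next.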
+		 */
+		mutex_enter(&dp->dp_lock);
+		dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] +=
+		    l1blks << dn->dn_indblkshift;
+		mutex_exit(&dp->dp_lock);
+		DTRACE_PROBE3(free__long__range,
+		    uint64_t, long_free_dirty, uint64_t, chunk_len,
+		    uint64_t, txg);
+		dnode_free_range(dn, chunk_begin, chunk_len, tx);
+
+		dmu_tx_commit(tx);
+
+		length -= chunk_len;
+	}
+	return (0);
+}
+
+int
+dmu_free_long_range(objset_t *os, uint64_t object,
+    uint64_t offset, uint64_t length)
+{
+	dnode_t *dn;
+	int err;
+
+	err = dnode_hold(os, object, FTAG, &dn);
+	if (err != 0)
+		return (err);
+	err = dmu_free_long_range_impl(os, dn, offset, length);
+
+	/*
+	 * It is important to zero out the maxblkid when freeing the entire
+	 * file, so that (a) subsequent calls to dmu_free_long_range_impl()
+	 * will take the fast path, and (b) dnode_reallocate() can verify
+	 * that the entire file has been freed.
+	 */
+	if (err == 0 && offset == 0 && length == DMU_OBJECT_END)
+		dn->dn_maxblkid = 0;
+
+	dnode_rele(dn, FTAG);
+	return (err);
+}
+
+int
+dmu_free_long_object(objset_t *os, uint64_t object)
+{
+	dmu_tx_t *tx;
+	int err;
+
+	err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END);
+	if (err != 0)
+		return (err);
+
+	tx = dmu_tx_create(os);
+	dmu_tx_hold_bonus(tx, object);
+	dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END);
+	dmu_tx_mark_netfree(tx);
+	err = dmu_tx_assign(tx, TXG_WAIT);
+	if (err == 0) {
+		err = dmu_object_free(os, object, tx);
+		dmu_tx_commit(tx);
+	} else {
+		dmu_tx_abort(tx);
+	}
+
+	return (err);
+}
+
+int
+dmu_free_range(objset_t *os, uint64_t object, uint64_t offset,
+    uint64_t size, dmu_tx_t *tx)
+{
+	dnode_t *dn;
+	int err = dnode_hold(os, object, FTAG, &dn);
+	if (err)
+		return (err);
+	ASSERT(offset < UINT64_MAX);
+	ASSERT(size == DMU_OBJECT_END || size <= UINT64_MAX - offset);
+	dnode_free_range(dn, offset, size, tx);
+	dnode_rele(dn, FTAG);
+	return (0);
+}
+
+static int
+dmu_read_impl(dnode_t *dn, uint64_t offset, uint64_t size,
+    void *buf, uint32_t flags)
+{
+	dmu_buf_t **dbp;
+	int numbufs, err = 0;
+
+	/*
+	 * Deal with odd block sizes, where there can't be data past the first
+	 * block.  If we ever do the tail block optimization, we will need to
+	 * handle that here as well.
+	 */
+	if (dn->dn_maxblkid == 0) {
+		uint64_t newsz = offset > dn->dn_datablksz ? 0 :
+		    MIN(size, dn->dn_datablksz - offset);
+		bzero((char *)buf + newsz, size - newsz);
+		size = newsz;
+	}
+
+	while (size > 0) {
+		uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2);
+		int i;
+
+		/*
+		 * NB: we could do this block-at-a-time, but it's nice
+		 * to be reading in parallel.
+ */ + err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, + TRUE, FTAG, &numbufs, &dbp, flags); + if (err) + break; + + for (i = 0; i < numbufs; i++) { + uint64_t tocpy; + int64_t bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = MIN(db->db_size - bufoff, size); + + (void) memcpy(buf, (char *)db->db_data + bufoff, tocpy); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + return (err); +} + +int +dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + void *buf, uint32_t flags) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) + return (err); + + err = dmu_read_impl(dn, offset, size, buf, flags); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_read_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, void *buf, + uint32_t flags) +{ + return (dmu_read_impl(dn, offset, size, buf, flags)); +} + +static void +dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + int i; + + for (i = 0; i < numbufs; i++) { + uint64_t tocpy; + int64_t bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = offset - db->db_offset; + tocpy = MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); + + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + offset += tocpy; + size -= tocpy; + buf = (char *)buf + tocpy; + } +} + +void +dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +/* + * Note: Lustre is an external consumer of this interface. 
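+ *
+ * A minimal caller (sketch only) holds the dnode once and writes through
+ * it, avoiding a per-call <os, object> lookup:
+ *
+ *	dmu_tx_t *tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write_by_dnode(tx, dn, off, len);
+ *	if (dmu_tx_assign(tx, TXG_WAIT) != 0)
+ *		dmu_tx_abort(tx);
+ *	else {
+ *		dmu_write_by_dnode(dn, off, len, buf, tx);
+ *		dmu_tx_commit(tx);
+ *	}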
+ */ +void +dmu_write_by_dnode(dnode_t *dn, uint64_t offset, uint64_t size, + const void *buf, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + + if (size == 0) + return; + + VERIFY0(dmu_buf_hold_array_by_dnode(dn, offset, size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH)); + dmu_write_impl(dbp, numbufs, offset, size, buf, tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs, i; + + if (size == 0) + return; + + VERIFY(0 == dmu_buf_hold_array(os, object, offset, size, + FALSE, FTAG, &numbufs, &dbp)); + + for (i = 0; i < numbufs; i++) { + dmu_buf_t *db = dbp[i]; + + dmu_buf_will_not_fill(db, tx); + } + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +void +dmu_write_embedded(objset_t *os, uint64_t object, uint64_t offset, + void *data, uint8_t etype, uint8_t comp, int uncompressed_size, + int compressed_size, int byteorder, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + ASSERT3U(etype, <, NUM_BP_EMBEDDED_TYPES); + ASSERT3U(comp, <, ZIO_COMPRESS_FUNCTIONS); + VERIFY0(dmu_buf_hold_noread(os, object, offset, + FTAG, &db)); + + dmu_buf_write_embedded(db, + data, (bp_embedded_type_t)etype, (enum zio_compress)comp, + uncompressed_size, compressed_size, byteorder, tx); + + dmu_buf_rele(db, FTAG); +} + +void +dmu_redact(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, + dmu_tx_t *tx) +{ + int numbufs, i; + dmu_buf_t **dbp; + + VERIFY0(dmu_buf_hold_array(os, object, offset, size, FALSE, FTAG, + &numbufs, &dbp)); + for (i = 0; i < numbufs; i++) + dmu_buf_redact(dbp[i], tx); + dmu_buf_rele_array(dbp, numbufs, FTAG); +} + +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + +#ifdef HAVE_UIO_ZEROCOPY +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = (iovec_t *)uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, 
-nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. + */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_lsize(abuf)); + iov = (iovec_t *)uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} +#endif /* HAVE_UIO_ZEROCOPY */ + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied(void) +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy(void) +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + +#ifdef _KERNEL +int +dmu_read_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size) +{ + dmu_buf_t **dbp; + int numbufs, i, err; +#ifdef HAVE_UIO_ZEROCOPY + xuio_t *xuio = NULL; +#endif + + /* + * NB: we could do this block-at-a-time, but it's nice + * to be reading in parallel. + */ + err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, + TRUE, FTAG, &numbufs, &dbp, 0); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + uint64_t tocpy; + int64_t bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio_offset(uio) - db->db_offset; + tocpy = MIN(db->db_size - bufoff, size); + +#ifdef HAVE_UIO_ZEROCOPY + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) + uio_advance(uio, tocpy); + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else +#endif +#ifdef __FreeBSD__ + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, + tocpy, uio); +#else + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); +#endif + if (err) + break; + + size -= tocpy; + } + dmu_buf_rele_array(dbp, numbufs, FTAG); + + return (err); +} + +/* + * Read 'size' bytes into the uio buffer. + * From object zdb->db_object. + * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_read_uio(), + * because we don't have to find the dnode_t for the object. 
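+ *
+ * (For instance, the ZPL read path passes the znode's SA-handle dbuf,
+ * roughly:  error = dmu_read_uio_dbuf(sa_get_db(zp->z_sa_hdl), uio, n);
+ * cf. zfs_read().)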
+ */ +int +dmu_read_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_read_uio_dnode(dn, uio, size); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Read 'size' bytes into the uio buffer. + * From the specified object + * Starting at offset uio->uio_loffset. + */ +int +dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_read_uio_dnode(dn, uio, size); + + dnode_rele(dn, FTAG); + + return (err); +} + +int +dmu_write_uio_dnode(dnode_t *dn, uio_t *uio, uint64_t size, dmu_tx_t *tx) +{ + dmu_buf_t **dbp; + int numbufs; + int err = 0; + int i; + + err = dmu_buf_hold_array_by_dnode(dn, uio_offset(uio), size, + FALSE, FTAG, &numbufs, &dbp, DMU_READ_PREFETCH); + if (err) + return (err); + + for (i = 0; i < numbufs; i++) { + uint64_t tocpy; + int64_t bufoff; + dmu_buf_t *db = dbp[i]; + + ASSERT(size > 0); + + bufoff = uio_offset(uio) - db->db_offset; + tocpy = MIN(db->db_size - bufoff, size); + + ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); + + if (tocpy == db->db_size) + dmu_buf_will_fill(db, tx); + else + dmu_buf_will_dirty(db, tx); + + /* + * XXX uiomove could block forever (eg.nfs-backed + * pages). There needs to be a uiolockdown() function + * to lock the pages in memory, so that uiomove won't + * block. + */ +#ifdef __FreeBSD__ + err = vn_io_fault_uiomove((char *)db->db_data + bufoff, + tocpy, uio); +#else + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_WRITE, uio); +#endif + if (tocpy == db->db_size) + dmu_buf_fill_done(db, tx); + + if (err) + break; + + size -= tocpy; + } + + dmu_buf_rele_array(dbp, numbufs, FTAG); + return (err); +} + +/* + * Write 'size' bytes from the uio buffer. + * To object zdb->db_object. + * Starting at offset uio->uio_loffset. + * + * If the caller already has a dbuf in the target object + * (e.g. its bonus buffer), this routine is faster than dmu_write_uio(), + * because we don't have to find the dnode_t for the object. + */ +int +dmu_write_uio_dbuf(dmu_buf_t *zdb, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zdb; + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + err = dmu_write_uio_dnode(dn, uio, size, tx); + DB_DNODE_EXIT(db); + + return (err); +} + +/* + * Write 'size' bytes from the uio buffer. + * To the specified object. + * Starting at offset uio->uio_loffset. + */ +int +dmu_write_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + if (size == 0) + return (0); + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + err = dmu_write_uio_dnode(dn, uio, size, tx); + + dnode_rele(dn, FTAG); + + return (err); +} +#endif /* _KERNEL */ + +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)handle; + + return (arc_loan_buf(db->db_objset->os_spa, B_FALSE, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + arc_buf_destroy(buf, FTAG); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. 
+ * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +int +dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + objset_t *os = dn->dn_objset; + uint64_t object = dn->dn_object; + uint32_t blksz = (uint32_t)arc_buf_lsize(buf); + uint64_t blkid; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, 0, offset); + db = dbuf_hold(dn, blkid, FTAG); + if (db == NULL) + return (SET_ERROR(EIO)); + rw_exit(&dn->dn_struct_rwlock); + + /* + * We can only assign if the offset is aligned, the arc buf is the + * same size as the dbuf, and the dbuf is not metadata. + */ + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + /* compressed bufs must always be assignable to their dbuf */ + ASSERT3U(arc_get_compression(buf), ==, ZIO_COMPRESS_OFF); + ASSERT(!(buf->b_flags & ARC_BUF_FLAG_COMPRESSED)); + + dbuf_rele(db, FTAG); + dmu_write(os, object, offset, blksz, buf->b_data, tx); + dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); + } + + return (0); +} + +int +dmu_assign_arcbuf_by_dbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + int err; + dmu_buf_impl_t *dbuf = (dmu_buf_impl_t *)handle; + + DB_DNODE_ENTER(dbuf); + err = dmu_assign_arcbuf_by_dnode(DB_DNODE(dbuf), offset, buf, tx); + DB_DNODE_EXIT(dbuf); + + return (err); +} + +typedef struct { + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; +} dmu_sync_arg_t; + +/* ARGSUSED */ +static void +dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; + blkptr_t *bp = zio->io_bp; + + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. + */ + BP_SET_LSIZE(bp, db->db_size); + } else if (!BP_IS_EMBEDDED(bp)) { + ASSERT(BP_GET_LEVEL(bp) == 0); + BP_SET_FILL(bp, 1); + } + } +} + +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + +/* ARGSUSED */ +static void +dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) +{ + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; + dmu_buf_impl_t *db = dr->dr_dbuf; + zgd_t *zgd = dsa->dsa_zgd; + + /* + * Record the vdev(s) backing this blkptr so they can be flushed after + * the writes for the lwb have completed. + */ + if (zio->io_error == 0) { + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + } + + mutex_enter(&db->db_mtx); + ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); + if (zio->io_error == 0) { + dr->dt.dl.dr_nopwrite = !!(zio->io_flags & ZIO_FLAG_NOPWRITE); + if (dr->dt.dl.dr_nopwrite) { + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + uint8_t chksum = BP_GET_CHECKSUM(bp_orig); + + ASSERT(BP_EQUAL(bp, bp_orig)); + VERIFY(BP_EQUAL(bp, db->db_blkptr)); + ASSERT(zio->io_prop.zp_compress != ZIO_COMPRESS_OFF); + VERIFY(zio_checksum_table[chksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); + } + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + + /* + * Old style holes are filled with all zeros, whereas + * new-style holes maintain their lsize, type, level, + * and birth time (see zio_write_compress). 
While we + * need to reset the BP_SET_LSIZE() call that happened + * in dmu_sync_ready for old style holes, we do *not* + * want to wipe out the information contained in new + * style holes. Thus, only zero out the block pointer if + * it's an old style hole. + */ + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by) && + dr->dt.dl.dr_overridden_by.blk_birth == 0) + BP_ZERO(&dr->dt.dl.dr_overridden_by); + } else { + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + } + cv_broadcast(&db->db_changed); + mutex_exit(&db->db_mtx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static void +dmu_sync_late_arrival_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *dsa = zio->io_private; + zgd_t *zgd = dsa->dsa_zgd; + + if (zio->io_error == 0) { + /* + * Record the vdev(s) backing this blkptr so they can be + * flushed after the writes for the lwb have completed. + */ + zil_lwb_add_block(zgd->zgd_lwb, zgd->zgd_bp); + + if (!BP_IS_HOLE(bp)) { + blkptr_t *bp_orig __maybe_unused = &zio->io_bp_orig; + ASSERT(!(zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(BP_IS_HOLE(bp_orig) || !BP_EQUAL(bp, bp_orig)); + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } + } + + dmu_tx_commit(dsa->dsa_tx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + abd_put(zio->io_abd); + kmem_free(dsa, sizeof (*dsa)); +} + +static int +dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, + zio_prop_t *zp, zbookmark_phys_t *zb) +{ + dmu_sync_arg_t *dsa; + dmu_tx_t *tx; + + tx = dmu_tx_create(os); + dmu_tx_hold_space(tx, zgd->zgd_db->db_size); + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + dmu_tx_abort(tx); + /* Make zl_get_data do txg_waited_synced() */ + return (SET_ERROR(EIO)); + } + + /* + * In order to prevent the zgd's lwb from being free'd prior to + * dmu_sync_late_arrival_done() being called, we have to ensure + * the lwb's "max txg" takes this tx's txg into account. + */ + zil_lwb_add_txg(zgd->zgd_lwb, dmu_tx_get_txg(tx)); + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = NULL; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = tx; + + /* + * Since we are currently syncing this txg, it's nontrivial to + * determine what BP to nopwrite against, so we disable nopwrite. + * + * When syncing, the db_blkptr is initially the BP of the previous + * txg. We can not nopwrite against it because it will be changed + * (this is similar to the non-late-arrival case where the dbuf is + * dirty in a future txg). + * + * Then dbuf_write_ready() sets bp_blkptr to the location we will write. + * We can not nopwrite against it because although the BP will not + * (typically) be changed, the data has not yet been persisted to this + * location. + * + * Finally, when dbuf_write_done() is called, it is theoretically + * possible to always nopwrite, because the data that was written in + * this txg is the same data that we are trying to write. However we + * would need to check that this dbuf is not dirty in any future + * txg's (as we do in the normal dmu_sync() path). For simplicity, we + * don't nopwrite in this case. 
+ */ + zp->zp_nopwrite = B_FALSE; + + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + abd_get_from_buf(zgd->zgd_db->db_data, zgd->zgd_db->db_size), + zgd->zgd_db->db_size, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, NULL, NULL, dmu_sync_late_arrival_done, + dsa, ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + + return (0); +} + +/* + * Intent log support: sync the block associated with db to disk. + * N.B. and XXX: the caller is responsible for making sure that the + * data isn't changing while dmu_sync() is writing it. + * + * Return values: + * + * EEXIST: this txg has already been synced, so there's nothing to do. + * The caller should not log the write. + * + * ENOENT: the block was dbuf_free_range()'d, so there's nothing to do. + * The caller should not log the write. + * + * EALREADY: this block is already in the process of being synced. + * The caller should track its progress (somehow). + * + * EIO: could not do the I/O. + * The caller should do a txg_wait_synced(). + * + * 0: the I/O has been initiated. + * The caller should log this blkptr in the done callback. + * It is possible that the I/O will fail, in which case + * the error will be reported to the done callback and + * propagated to pio from zio_done(). + */ +int +dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + dbuf_dirty_record_t *dr, *dr_next; + dmu_sync_arg_t *dsa; + zbookmark_phys_t zb; + zio_prop_t zp; + dnode_t *dn; + + ASSERT(pio != NULL); + ASSERT(txg != 0); + + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + dmu_write_policy(os, dn, db->db_level, WP_DMU_SYNC, &zp); + DB_DNODE_EXIT(db); + + /* + * If we're frozen (running ziltest), we always need to generate a bp. + */ + if (txg > spa_freeze_txg(os->os_spa)) + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + + /* + * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() + * and us. If we determine that this txg is not yet syncing, + * but it begins to sync a moment later, that's OK because the + * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. + */ + mutex_enter(&db->db_mtx); + + if (txg <= spa_last_synced_txg(os->os_spa)) { + /* + * This txg has already synced. There's nothing to do. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(EEXIST)); + } + + if (txg <= spa_syncing_txg(os->os_spa)) { + /* + * This txg is currently syncing, so we can't mess with + * the dirty record anymore; just write a new log block. + */ + mutex_exit(&db->db_mtx); + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); + } + + dr = dbuf_find_dirty_eq(db, txg); + + if (dr == NULL) { + /* + * There's no dr for this dbuf, so it must have been freed. + * There's no need to log writes to freed blocks, so we're done. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(ENOENT)); + } + + dr_next = list_next(&db->db_dirty_records, dr); + ASSERT(dr_next == NULL || dr_next->dr_txg < txg); + + if (db->db_blkptr != NULL) { + /* + * We need to fill in zgd_bp with the current blkptr so that + * the nopwrite code can check if we're writing the same + * data that's already on disk. We can only nopwrite if we + * are sure that after making the copy, db_blkptr will not + * change until our i/o completes. 
We ensure this by + * holding the db_mtx, and only allowing nopwrite if the + * block is not already dirty (see below). This is verified + * by dmu_sync_done(), which VERIFYs that the db_blkptr has + * not changed. + */ + *zgd->zgd_bp = *db->db_blkptr; + } + + /* + * Assume the on-disk data is X, the current syncing data (in + * txg - 1) is Y, and the current in-memory data is Z (currently + * in dmu_sync). + * + * We usually want to perform a nopwrite if X and Z are the + * same. However, if Y is different (i.e. the BP is going to + * change before this write takes effect), then a nopwrite will + * be incorrect - we would override with X, which could have + * been freed when Y was written. + * + * (Note that this is not a concern when we are nop-writing from + * syncing context, because X and Y must be identical, because + * all previous txgs have been synced.) + * + * Therefore, we disable nopwrite if the current BP could change + * before this TXG. There are two ways it could change: by + * being dirty (dr_next is non-NULL), or by being freed + * (dnode_block_freed()). This behavior is verified by + * zio_done(), which VERIFYs that the override BP is identical + * to the on-disk BP. + */ + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dr_next != NULL || dnode_block_freed(dn, db->db_blkid)) + zp.zp_nopwrite = B_FALSE; + DB_DNODE_EXIT(db); + + ASSERT(dr->dr_txg == txg); + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + /* + * We have already issued a sync write for this buffer, + * or this buffer has already been synced. It could not + * have been dirtied since, or we would have cleared the state. + */ + mutex_exit(&db->db_mtx); + return (SET_ERROR(EALREADY)); + } + + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); + dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; + mutex_exit(&db->db_mtx); + + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = NULL; + + zio_nowait(arc_write(pio, os->os_spa, txg, + zgd->zgd_bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), + &zp, dmu_sync_ready, NULL, NULL, dmu_sync_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); + + return (0); +} + +int +dmu_object_set_nlevels(objset_t *os, uint64_t object, int nlevels, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_nlevels(dn, nlevels, tx); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + err = dnode_set_blksz(dn, size, ibs, tx); + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_object_set_maxblkid(objset_t *os, uint64_t object, uint64_t maxblkid, + dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_new_blkid(dn, maxblkid, tx, B_FALSE, B_TRUE); + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + return (0); +} + +void +dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* + * Send streams include each object's checksum function. This + * check ensures that the receiving system can understand the + * checksum function transmitted. 
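+ * + * As a usage sketch (the objset, object number, and open + * transaction here stand in for whatever a hypothetical caller + * already holds): + * + *	dmu_object_set_checksum(os, object, + *	    (uint8_t)ZIO_CHECKSUM_FLETCHER_4, tx); + * + * ZIO_CHECKSUM_FLETCHER_4 is below the ZIO_CHECKSUM_LEGACY_FUNCTIONS + * cutoff, so it satisfies the ASSERT below.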
+ */ + ASSERT3U(checksum, <, ZIO_CHECKSUM_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + ASSERT3U(checksum, <, ZIO_CHECKSUM_FUNCTIONS); + dn->dn_checksum = checksum; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +void +dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, + dmu_tx_t *tx) +{ + dnode_t *dn; + + /* + * Send streams include each object's compression function. This + * check ensures that the receiving system can understand the + * compression function transmitted. + */ + ASSERT3U(compress, <, ZIO_COMPRESS_LEGACY_FUNCTIONS); + + VERIFY0(dnode_hold(os, object, FTAG, &dn)); + dn->dn_compress = compress; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); +} + +/* + * When the "redundant_metadata" property is set to "most", only indirect + * blocks of this level and higher will have an additional ditto block. + */ +int zfs_redundant_metadata_most_ditto_level = 2; + +void +dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) +{ + dmu_object_type_t type = dn ? dn->dn_type : DMU_OT_OBJSET; + boolean_t ismd = (level > 0 || DMU_OT_IS_METADATA(type) || + (wp & WP_SPILL)); + enum zio_checksum checksum = os->os_checksum; + enum zio_compress compress = os->os_compress; + uint8_t complevel = os->os_complevel; + enum zio_checksum dedup_checksum = os->os_dedup_checksum; + boolean_t dedup = B_FALSE; + boolean_t nopwrite = B_FALSE; + boolean_t dedup_verify = os->os_dedup_verify; + boolean_t encrypt = B_FALSE; + int copies = os->os_copies; + + /* + * We maintain different write policies for each of the following + * types of data: + * 1. metadata + * 2. preallocated blocks (i.e. level-0 blocks of a dump device) + * 3. all other level 0 blocks + */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zio_compress_select(os->os_spa, + ZIO_COMPRESS_ON, ZIO_COMPRESS_ON); + + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (!(zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_METADATA) || + (zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_EMBEDDED)) + checksum = ZIO_CHECKSUM_FLETCHER_4; + + if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL || + (os->os_redundant_metadata == + ZFS_REDUNDANT_METADATA_MOST && + (level >= zfs_redundant_metadata_most_ditto_level || + DMU_OT_IS_METADATA(type) || (wp & WP_SPILL)))) + copies++; + } else if (wp & WP_NOFILL) { + ASSERT(level == 0); + + /* + * If we're writing preallocated blocks, we aren't actually + * writing them so don't set any policy properties. These + * blocks are currently only used by an external subsystem + * outside of zfs (i.e. dump) and not written by the zio + * pipeline. + */ + compress = ZIO_COMPRESS_OFF; + checksum = ZIO_CHECKSUM_OFF; + } else { + compress = zio_compress_select(os->os_spa, dn->dn_compress, + compress); + complevel = zio_complevel_select(os->os_spa, compress, + complevel, complevel); + + checksum = (dedup_checksum == ZIO_CHECKSUM_OFF) ? + zio_checksum_select(dn->dn_checksum, checksum) : + dedup_checksum; + + /* + * Determine dedup setting. If we are in dmu_sync(), + * we won't actually dedup now because that's all + * done in syncing context; but we do want to use the + * dedup checksum. 
If the checksum is not strong + * enough to ensure unique signatures, force + * dedup_verify. + */ + if (dedup_checksum != ZIO_CHECKSUM_OFF) { + dedup = (wp & WP_DMU_SYNC) ? B_FALSE : B_TRUE; + if (!(zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP)) + dedup_verify = B_TRUE; + } + + /* + * Enable nopwrite if we have a secure enough checksum + * algorithm (see comment in zio_nop_write) and + * compression is enabled. We don't enable nopwrite if + * dedup is enabled as the two features are mutually + * exclusive. + */ + nopwrite = (!dedup && (zio_checksum_table[checksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE) && + compress != ZIO_COMPRESS_OFF && zfs_nopwrite_enabled); + } + + /* + * All objects in an encrypted objset are protected from modification + * via a MAC. Encrypted objects store their IV and salt in the last DVA + * in the bp, so we cannot use all copies. Encrypted objects are also + * not subject to nopwrite since writing the same data will still + * result in a new ciphertext. Only encrypted blocks can be dedup'd + * to avoid ambiguity in the dedup code since the DDT does not store + * object types. + */ + if (os->os_encrypted && (wp & WP_NOFILL) == 0) { + encrypt = B_TRUE; + + if (DMU_OT_IS_ENCRYPTED(type)) { + copies = MIN(copies, SPA_DVAS_PER_BP - 1); + nopwrite = B_FALSE; + } else { + dedup = B_FALSE; + } + + if (level <= 0 && + (type == DMU_OT_DNODE || type == DMU_OT_OBJSET)) { + compress = ZIO_COMPRESS_EMPTY; + } + } + + zp->zp_compress = compress; + zp->zp_complevel = complevel; + zp->zp_checksum = checksum; + zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type; + zp->zp_level = level; + zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa)); + zp->zp_dedup = dedup; + zp->zp_dedup_verify = dedup && dedup_verify; + zp->zp_nopwrite = nopwrite; + zp->zp_encrypt = encrypt; + zp->zp_byteorder = ZFS_HOST_BYTEORDER; + bzero(zp->zp_salt, ZIO_DATA_SALT_LEN); + bzero(zp->zp_iv, ZIO_DATA_IV_LEN); + bzero(zp->zp_mac, ZIO_DATA_MAC_LEN); + zp->zp_zpl_smallblk = DMU_OT_IS_FILE(zp->zp_type) ? + os->os_zpl_special_smallblock : 0; + + ASSERT3U(zp->zp_compress, !=, ZIO_COMPRESS_INHERIT); +} + +/* + * This function is only called from zfs_holey_common() for zpl_llseek() + * in order to determine the location of holes. In order to accurately + * report holes, all dirty data must be synced to disk. This causes extremely + * poor performance when seeking for holes in a dirty file. As a compromise, + * only provide hole data when the dnode is clean. When a dnode is dirty, + * report the dnode as having no holes, which is always a safe thing to do. + */ +int +dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) +{ + dnode_t *dn; + int i, err; + boolean_t clean = B_TRUE; + + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + + /* + * Check if the dnode is dirty + */ + for (i = 0; i < TXG_SIZE; i++) { + if (multilist_link_active(&dn->dn_dirty_link[i])) { + clean = B_FALSE; + break; + } + } + + /* + * If the compatibility option is on, sync any current changes before + * we go trundling through the block pointers. + */ + if (!clean && zfs_dmu_offset_next_sync) { + clean = B_TRUE; + dnode_rele(dn, FTAG); + txg_wait_synced(dmu_objset_pool(os), 0); + err = dnode_hold(os, object, FTAG, &dn); + if (err) + return (err); + } + + if (clean) + err = dnode_next_offset(dn, + (hole ?
DNODE_FIND_HOLE : 0), off, 1, 1, 0); + else + err = SET_ERROR(EBUSY); + + dnode_rele(dn, FTAG); + + return (err); +} + +void +__dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + dnode_phys_t *dnp = dn->dn_phys; + + doi->doi_data_block_size = dn->dn_datablksz; + doi->doi_metadata_block_size = dn->dn_indblkshift ? + 1ULL << dn->dn_indblkshift : 0; + doi->doi_type = dn->dn_type; + doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_bonus_size = dn->dn_bonuslen; + doi->doi_dnodesize = dn->dn_num_slots << DNODE_SHIFT; + doi->doi_indirection = dn->dn_nlevels; + doi->doi_checksum = dn->dn_checksum; + doi->doi_compress = dn->dn_compress; + doi->doi_nblkptr = dn->dn_nblkptr; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dn->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += BP_GET_FILL(&dnp->dn_blkptr[i]); +} + +void +dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) +{ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + mutex_enter(&dn->dn_mtx); + + __dmu_object_info_from_dnode(dn, doi); + + mutex_exit(&dn->dn_mtx); + rw_exit(&dn->dn_struct_rwlock); +} + +/* + * Get information on a DMU object. + * If doi is NULL, just indicates whether the object exists. + */ +int +dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) +{ + dnode_t *dn; + int err = dnode_hold(os, object, FTAG, &dn); + + if (err) + return (err); + + if (doi != NULL) + dmu_object_info_from_dnode(dn, doi); + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * As above, but faster; can be used when you have a held dbuf in hand. + */ +void +dmu_object_info_from_db(dmu_buf_t *db_fake, dmu_object_info_t *doi) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + + DB_DNODE_ENTER(db); + dmu_object_info_from_dnode(DB_DNODE(db), doi); + DB_DNODE_EXIT(db); +} + +/* + * Faster still when you only care about the size. 
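+ * + * For example (a sketch; "db" stands for any held dbuf): + * + *	uint32_t blksize; + *	u_longlong_t nblk512; + *	dmu_object_size_from_db(db, &blksize, &nblk512); + * + * On return nblk512 counts 512-byte blocks used, including the + * slots consumed by the dnode itself (see below).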
+ */ +void +dmu_object_size_from_db(dmu_buf_t *db_fake, uint32_t *blksize, + u_longlong_t *nblk512) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + + *blksize = dn->dn_datablksz; + /* add in number of slots used for the dnode itself */ + *nblk512 = ((DN_USED_BYTES(dn->dn_phys) + SPA_MINBLOCKSIZE/2) >> + SPA_MINBLOCKSHIFT) + dn->dn_num_slots; + DB_DNODE_EXIT(db); +} + +void +dmu_object_dnsize_from_db(dmu_buf_t *db_fake, int *dnsize) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + *dnsize = dn->dn_num_slots << DNODE_SHIFT; + DB_DNODE_EXIT(db); +} + +void +byteswap_uint64_array(void *vbuf, size_t size) +{ + uint64_t *buf = vbuf; + size_t count = size >> 3; + int i; + + ASSERT((size & 7) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_64(buf[i]); +} + +void +byteswap_uint32_array(void *vbuf, size_t size) +{ + uint32_t *buf = vbuf; + size_t count = size >> 2; + int i; + + ASSERT((size & 3) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_32(buf[i]); +} + +void +byteswap_uint16_array(void *vbuf, size_t size) +{ + uint16_t *buf = vbuf; + size_t count = size >> 1; + int i; + + ASSERT((size & 1) == 0); + + for (i = 0; i < count; i++) + buf[i] = BSWAP_16(buf[i]); +} + +/* ARGSUSED */ +void +byteswap_uint8_array(void *vbuf, size_t size) +{ +} + +void +dmu_init(void) +{ + abd_init(); + zfs_dbgmsg_init(); + sa_cache_init(); + xuio_stat_init(); + dmu_objset_init(); + dnode_init(); + zfetch_init(); + dmu_tx_init(); + l2arc_init(); + arc_init(); + dbuf_init(); +} + +void +dmu_fini(void) +{ + arc_fini(); /* arc depends on l2arc, so arc must go first */ + l2arc_fini(); + dmu_tx_fini(); + zfetch_fini(); + dbuf_fini(); + dnode_fini(); + dmu_objset_fini(); + xuio_stat_fini(); + sa_cache_fini(); + zfs_dbgmsg_fini(); + abd_fini(); +} + +EXPORT_SYMBOL(dmu_bonus_hold); +EXPORT_SYMBOL(dmu_bonus_hold_by_dnode); +EXPORT_SYMBOL(dmu_buf_hold_array_by_bonus); +EXPORT_SYMBOL(dmu_buf_rele_array); +EXPORT_SYMBOL(dmu_prefetch); +EXPORT_SYMBOL(dmu_free_range); +EXPORT_SYMBOL(dmu_free_long_range); +EXPORT_SYMBOL(dmu_free_long_object); +EXPORT_SYMBOL(dmu_read); +EXPORT_SYMBOL(dmu_read_by_dnode); +EXPORT_SYMBOL(dmu_write); +EXPORT_SYMBOL(dmu_write_by_dnode); +EXPORT_SYMBOL(dmu_prealloc); +EXPORT_SYMBOL(dmu_object_info); +EXPORT_SYMBOL(dmu_object_info_from_dnode); +EXPORT_SYMBOL(dmu_object_info_from_db); +EXPORT_SYMBOL(dmu_object_size_from_db); +EXPORT_SYMBOL(dmu_object_dnsize_from_db); +EXPORT_SYMBOL(dmu_object_set_nlevels); +EXPORT_SYMBOL(dmu_object_set_blocksize); +EXPORT_SYMBOL(dmu_object_set_maxblkid); +EXPORT_SYMBOL(dmu_object_set_checksum); +EXPORT_SYMBOL(dmu_object_set_compress); +EXPORT_SYMBOL(dmu_write_policy); +EXPORT_SYMBOL(dmu_sync); +EXPORT_SYMBOL(dmu_request_arcbuf); +EXPORT_SYMBOL(dmu_return_arcbuf); +EXPORT_SYMBOL(dmu_assign_arcbuf_by_dnode); +EXPORT_SYMBOL(dmu_assign_arcbuf_by_dbuf); +EXPORT_SYMBOL(dmu_buf_hold); +EXPORT_SYMBOL(dmu_ot); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, nopwrite_enabled, INT, ZMOD_RW, + "Enable NOP writes"); + +ZFS_MODULE_PARAM(zfs, zfs_, per_txg_dirty_frees_percent, ULONG, ZMOD_RW, + "Percentage of dirtied blocks from frees in one TXG"); + +ZFS_MODULE_PARAM(zfs, zfs_, dmu_offset_next_sync, INT, ZMOD_RW, + "Enable forcing txg sync to find holes"); + +ZFS_MODULE_PARAM(zfs, , dmu_prefetch_max, INT, ZMOD_RW, + "Limit one prefetch call to this size"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_diff.c b/module/zfs/dmu_diff.c new 
file mode 100644 index 000000000000..a573a2e1bd41 --- /dev/null +++ b/module/zfs/dmu_diff.c @@ -0,0 +1,240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2019, loli10K . All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +typedef struct dmu_diffarg { + zfs_file_t *da_fp; /* file to which we are reporting */ + offset_t *da_offp; + int da_err; /* error that stopped diff search */ + dmu_diff_record_t da_ddr; +} dmu_diffarg_t; + +static int +write_record(dmu_diffarg_t *da) +{ + zfs_file_t *fp; + ssize_t resid; + + if (da->da_ddr.ddr_type == DDR_NONE) { + da->da_err = 0; + return (0); + } + + fp = da->da_fp; + da->da_err = zfs_file_write(fp, (caddr_t)&da->da_ddr, + sizeof (da->da_ddr), &resid); + *da->da_offp += sizeof (da->da_ddr); + return (da->da_err); +} + +static int +report_free_dnode_range(dmu_diffarg_t *da, uint64_t first, uint64_t last) +{ + ASSERT(first <= last); + if (da->da_ddr.ddr_type != DDR_FREE || + first != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_FREE; + da->da_ddr.ddr_first = first; + da->da_ddr.ddr_last = last; + return (0); + } + da->da_ddr.ddr_last = last; + return (0); +} + +static int +report_dnode(dmu_diffarg_t *da, uint64_t object, dnode_phys_t *dnp) +{ + ASSERT(dnp != NULL); + if (dnp->dn_type == DMU_OT_NONE) + return (report_free_dnode_range(da, object, object)); + + if (da->da_ddr.ddr_type != DDR_INUSE || + object != da->da_ddr.ddr_last + 1) { + if (write_record(da) != 0) + return (da->da_err); + da->da_ddr.ddr_type = DDR_INUSE; + da->da_ddr.ddr_first = da->da_ddr.ddr_last = object; + return (0); + } + da->da_ddr.ddr_last = object; + return (0); +} + +#define DBP_SPAN(dnp, level) \ + (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ + (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) + +/* ARGSUSED */ +static int +diff_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + dmu_diffarg_t *da = arg; + int err = 0; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + + if (zb->zb_level == ZB_DNODE_LEVEL || + zb->zb_object != DMU_META_DNODE_OBJECT) + return (0); + + if (BP_IS_HOLE(bp)) { + uint64_t span = DBP_SPAN(dnp, zb->zb_level); + uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; + + err = report_free_dnode_range(da, dnobj, + dnobj + (span >> DNODE_SHIFT) - 1); + if (err) + return (err); + } else if 
(zb->zb_level == 0) { + dnode_phys_t *blk; + arc_buf_t *abuf; + arc_flags_t aflags = ARC_FLAG_WAIT; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + int zio_flags = ZIO_FLAG_CANFAIL; + int i; + + if (BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + + if (arc_read(NULL, spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &aflags, zb) != 0) + return (SET_ERROR(EIO)); + + blk = abuf->b_data; + for (i = 0; i < epb; i += blk[i].dn_extra_slots + 1) { + uint64_t dnobj = (zb->zb_blkid << + (DNODE_BLOCK_SHIFT - DNODE_SHIFT)) + i; + err = report_dnode(da, dnobj, blk+i); + if (err) + break; + } + arc_buf_destroy(abuf, &abuf); + if (err) + return (err); + /* Don't care about the data blocks */ + return (TRAVERSE_VISIT_NO_CHILDREN); + } + return (0); +} + +int +dmu_diff(const char *tosnap_name, const char *fromsnap_name, + zfs_file_t *fp, offset_t *offp) +{ + dmu_diffarg_t da; + dsl_dataset_t *fromsnap; + dsl_dataset_t *tosnap; + dsl_pool_t *dp; + int error; + uint64_t fromtxg; + + if (strchr(tosnap_name, '@') == NULL || + strchr(fromsnap_name, '@') == NULL) + return (SET_ERROR(EINVAL)); + + error = dsl_pool_hold(tosnap_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, tosnap_name, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = dsl_dataset_hold(dp, fromsnap_name, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(EXDEV)); + } + + fromtxg = dsl_dataset_phys(fromsnap)->ds_creation_txg; + dsl_dataset_rele(fromsnap, FTAG); + + dsl_dataset_long_hold(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + + da.da_fp = fp; + da.da_offp = offp; + da.da_ddr.ddr_type = DDR_NONE; + da.da_ddr.ddr_first = da.da_ddr.ddr_last = 0; + da.da_err = 0; + + /* + * Since zfs diff only looks at dnodes which are stored in plaintext + * (other than bonus buffers), we don't technically need to decrypt + * the dataset to perform this operation. However, the command line + * utility will still fail if the keys are not loaded: the dataset + * cannot be mounted without them, and the ZFS_IOC_OBJ_TO_STATS + * ioctl it issues will fail. + */ + error = traverse_dataset(tosnap, fromtxg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_NO_DECRYPT, + diff_cb, &da); + + if (error != 0) { + da.da_err = error; + } else { + /* write_record() sets the da.da_err we return as a side effect */ + (void) write_record(&da); + } + + dsl_dataset_long_rele(tosnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + + return (da.da_err); +} diff --git a/module/zfs/dmu_object.c b/module/zfs/dmu_object.c new file mode 100644 index 000000000000..453a2842ce6e --- /dev/null +++ b/module/zfs/dmu_object.c @@ -0,0 +1,525 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Each of the concurrent object allocators will grab + * 2^dmu_object_alloc_chunk_shift dnode slots at a time. The default is to + * grab 128 slots, which is 4 blocks worth. This was experimentally + * determined to be the lowest value that eliminates the measurable effect + * of lock contention from this code path. + */ +int dmu_object_alloc_chunk_shift = 7; + +static uint64_t +dmu_object_alloc_impl(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) +{ + uint64_t object; + uint64_t L1_dnode_count = DNODES_PER_BLOCK << + (DMU_META_DNODE(os)->dn_indblkshift - SPA_BLKPTRSHIFT); + dnode_t *dn = NULL; + int dn_slots = dnodesize >> DNODE_SHIFT; + boolean_t restarted = B_FALSE; + uint64_t *cpuobj = NULL; + int dnodes_per_chunk = 1 << dmu_object_alloc_chunk_shift; + int error; + + kpreempt_disable(); + cpuobj = &os->os_obj_next_percpu[CPU_SEQID % + os->os_obj_next_percpu_len]; + kpreempt_enable(); + + if (dn_slots == 0) { + dn_slots = DNODE_MIN_SLOTS; + } else { + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); + } + + /* + * The "chunk" of dnodes that is assigned to a CPU-specific + * allocator needs to be at least one block's worth, to avoid + * lock contention on the dbuf. It can be at most one L1 block's + * worth, so that the "rescan after polishing off an L1's worth" + * logic below will be sure to kick in. + */ + if (dnodes_per_chunk < DNODES_PER_BLOCK) + dnodes_per_chunk = DNODES_PER_BLOCK; + if (dnodes_per_chunk > L1_dnode_count) + dnodes_per_chunk = L1_dnode_count; + + /* + * The caller requested the dnode be returned as a performance + * optimization in order to avoid releasing the hold only to + * immediately reacquire it. Since the caller is responsible + * for releasing the hold, they must provide the tag. + */ + if (allocated_dnode != NULL) { + ASSERT3P(tag, !=, NULL); + } else { + ASSERT3P(tag, ==, NULL); + tag = FTAG; + } + + object = *cpuobj; + for (;;) { + /* + * If we finished a chunk of dnodes, get a new one from + * the global allocator. + */ + if ((P2PHASE(object, dnodes_per_chunk) == 0) || + (P2PHASE(object + dn_slots - 1, dnodes_per_chunk) < + dn_slots)) { + DNODE_STAT_BUMP(dnode_alloc_next_chunk); + mutex_enter(&os->os_obj_lock); + ASSERT0(P2PHASE(os->os_obj_next_chunk, + dnodes_per_chunk)); + object = os->os_obj_next_chunk; + + /* + * Each time we polish off an L1 bp worth of dnodes + * (2^12 objects), move to another L1 bp that's + * still reasonably sparse (at most 1/4 full). Look + * from the beginning at most once per txg. If we + * still can't allocate from that L1 block, search + * for an empty L0 block, which will quickly skip + * to the end of the metadnode if no nearby L0 + * blocks are empty.
This fallback avoids a + * pathology where full dnode blocks containing + * large dnodes appear sparse because they have a + * low blk_fill, leading to many failed allocation + * attempts. In the long term, a better mechanism to + * search for sparse metadnode regions, such as + * spacemaps, could be implemented. + * + * os_rescan_dnodes is set during txg sync if enough + * objects have been freed since the previous + * rescan to justify backfilling again. + * + * Note that dmu_traverse depends on the behavior + * that we use multiple blocks of the dnode object + * before going back to reuse objects. Any change + * to this algorithm should preserve that property + * or find another solution to the issues described + * in traverse_visitbp. + */ + if (P2PHASE(object, L1_dnode_count) == 0) { + uint64_t offset; + uint64_t blkfill; + int minlvl; + if (os->os_rescan_dnodes) { + offset = 0; + os->os_rescan_dnodes = B_FALSE; + } else { + offset = object << DNODE_SHIFT; + } + blkfill = restarted ? 1 : DNODES_PER_BLOCK >> 2; + minlvl = restarted ? 1 : 2; + restarted = B_TRUE; + error = dnode_next_offset(DMU_META_DNODE(os), + DNODE_FIND_HOLE, &offset, minlvl, + blkfill, 0); + if (error == 0) { + object = offset >> DNODE_SHIFT; + } + } + /* + * Note: if "restarted", we may find an L0 that + * is not suitably aligned. + */ + os->os_obj_next_chunk = + P2ALIGN(object, dnodes_per_chunk) + + dnodes_per_chunk; + (void) atomic_swap_64(cpuobj, object); + mutex_exit(&os->os_obj_lock); + } + + /* + * The value of (*cpuobj) before adding dn_slots is the object + * ID assigned to us. The value afterwards is the object ID + * assigned to whoever wants to do an allocation next. + */ + object = atomic_add_64_nv(cpuobj, dn_slots) - dn_slots; + + /* + * XXX We should check for an i/o error here and return + * up to our caller. Actually we should pre-read it in + * dmu_tx_assign(), but there is currently no mechanism + * to do so. + */ + error = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, + dn_slots, tag, &dn); + if (error == 0) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + /* + * Another thread could have allocated it; check + * again now that we have the struct lock. + */ + if (dn->dn_type == DMU_OT_NONE) { + dnode_allocate(dn, ot, blocksize, + indirect_blockshift, bonustype, + bonuslen, dn_slots, tx); + rw_exit(&dn->dn_struct_rwlock); + dmu_tx_add_new_object(tx, dn); + + /* + * Caller requested the allocated dnode be + * returned and is responsible for the hold. + */ + if (allocated_dnode != NULL) + *allocated_dnode = dn; + else + dnode_rele(dn, tag); + + return (object); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, tag); + DNODE_STAT_BUMP(dnode_alloc_race); + } + + /* + * Skip to the next known valid starting point on error. This + * is the start of the next block of dnodes.
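+ * + * For example, with 512-byte dnodes in 16K dnode blocks + * (DNODES_PER_BLOCK == 32; a sketch of the arithmetic), a + * failed hold at object 70 skips to P2ROUNDUP(70 + 1, 32) == 96, + * the first object of the next dnode block.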
+ */ + if (dmu_object_next(os, &object, B_TRUE, 0) != 0) { + object = P2ROUNDUP(object + 1, DNODES_PER_BLOCK); + DNODE_STAT_BUMP(dnode_alloc_next_block); + } + (void) atomic_swap_64(cpuobj, object); + } +} + +uint64_t +dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, 0, NULL, NULL, tx); +} + +uint64_t +dmu_object_alloc_ibs(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + dmu_tx_t *tx) +{ + return dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, 0, NULL, NULL, tx); +} + +uint64_t +dmu_object_alloc_dnsize(objset_t *os, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (dmu_object_alloc_impl(os, ot, blocksize, 0, bonustype, + bonuslen, dnodesize, NULL, NULL, tx)); +} + +/* + * Allocate a new object and return a pointer to the newly allocated dnode + * via the allocated_dnode argument. The returned dnode will be held and + * the caller is responsible for releasing the hold by calling dnode_rele(). + */ +uint64_t +dmu_object_alloc_hold(objset_t *os, dmu_object_type_t ot, int blocksize, + int indirect_blockshift, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) +{ + return (dmu_object_alloc_impl(os, ot, blocksize, indirect_blockshift, + bonustype, bonuslen, dnodesize, allocated_dnode, tag, tx)); +} + +int +dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_claim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, 0, tx)); +} + +int +dmu_object_claim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ + dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; + int err; + + if (dn_slots == 0) + dn_slots = DNODE_MIN_SLOTS; + ASSERT3S(dn_slots, >=, DNODE_MIN_SLOTS); + ASSERT3S(dn_slots, <=, DNODE_MAX_SLOTS); + + if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + return (SET_ERROR(EBADF)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, dn_slots, + FTAG, &dn); + if (err) + return (err); + + dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, dn_slots, tx); + dmu_tx_add_new_object(tx, dn); + + dnode_rele(dn, FTAG); + + return (0); +} + +int +dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (dmu_object_reclaim_dnsize(os, object, ot, blocksize, bonustype, + bonuslen, DNODE_MIN_SIZE, B_FALSE, tx)); +} + +int +dmu_object_reclaim_dnsize(objset_t *os, uint64_t object, dmu_object_type_t ot, + int blocksize, dmu_object_type_t bonustype, int bonuslen, int dnodesize, + boolean_t keep_spill, dmu_tx_t *tx) +{ + dnode_t *dn; + int dn_slots = dnodesize >> DNODE_SHIFT; + int err; + + if (dn_slots == 0) + dn_slots = DNODE_MIN_SLOTS; + + if (object == DMU_META_DNODE_OBJECT) + return (SET_ERROR(EBADF)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, + FTAG, &dn); + if (err) + return (err); + + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, dn_slots, + keep_spill, tx); + + dnode_rele(dn, FTAG); + return (err); +} + +int 
+dmu_object_rm_spill(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, + FTAG, &dn); + if (err) + return (err); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } + rw_exit(&dn->dn_struct_rwlock); + + dnode_rele(dn, FTAG); + return (err); +} + +int +dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + int err; + + ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, + FTAG, &dn); + if (err) + return (err); + + ASSERT(dn->dn_type != DMU_OT_NONE); + /* + * If we don't create this free range, we'll leak indirect blocks when + * we get to freeing the dnode in syncing context. + */ + dnode_free_range(dn, 0, DMU_OBJECT_END, tx); + dnode_free(dn, tx); + dnode_rele(dn, FTAG); + + return (0); +} + +/* + * Return (in *objectp) the next object which is allocated (or a hole) + * after *object, taking into account only objects that may have been modified + * after the specified txg. + */ +int +dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) +{ + uint64_t offset; + uint64_t start_obj; + struct dsl_dataset *ds = os->os_dsl_dataset; + int error; + + if (*objectp == 0) { + start_obj = 1; + } else if (ds && dsl_dataset_feature_is_active(ds, + SPA_FEATURE_LARGE_DNODE)) { + uint64_t i = *objectp + 1; + uint64_t last_obj = *objectp | (DNODES_PER_BLOCK - 1); + dmu_object_info_t doi; + + /* + * Scan through the remaining meta dnode block. The contents + * of each slot in the block are known so it can be quickly + * checked. If the block is exhausted without a match then + * hand off to dnode_next_offset() for further scanning. + */ + while (i <= last_obj) { + error = dmu_object_info(os, i, &doi); + if (error == ENOENT) { + if (hole) { + *objectp = i; + return (0); + } else { + i++; + } + } else if (error == EEXIST) { + i++; + } else if (error == 0) { + if (hole) { + i += doi.doi_dnodesize >> DNODE_SHIFT; + } else { + *objectp = i; + return (0); + } + } else { + return (error); + } + } + + start_obj = i; + } else { + start_obj = *objectp + 1; + } + + offset = start_obj << DNODE_SHIFT; + + error = dnode_next_offset(DMU_META_DNODE(os), + (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); + + *objectp = offset >> DNODE_SHIFT; + + return (error); +} + +/* + * Turn this object from old_type into DMU_OTN_ZAP_METADATA, and bump the + * refcount on SPA_FEATURE_EXTENSIBLE_DATASET. + * + * Only for use from syncing context, on MOS objects. + */ +void +dmu_object_zapify(objset_t *mos, uint64_t object, dmu_object_type_t old_type, + dmu_tx_t *tx) +{ + dnode_t *dn; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dnode_hold(mos, object, FTAG, &dn)); + if (dn->dn_type == DMU_OTN_ZAP_METADATA) { + dnode_rele(dn, FTAG); + return; + } + ASSERT3U(dn->dn_type, ==, old_type); + ASSERT0(dn->dn_maxblkid); + + /* + * We must initialize the ZAP data before changing the type, + * so that concurrent calls to *_is_zapified() can determine if + * the object has been completely zapified by checking the type. 
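+ * + * A reader can therefore test for zapification from the type + * alone, e.g. (a sketch; "object" is any MOS object number): + * + *	dmu_object_info_t doi; + *	VERIFY0(dmu_object_info(mos, object, &doi)); + *	boolean_t zapified = (doi.doi_type == DMU_OTN_ZAP_METADATA);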
+ */ + mzap_create_impl(dn, 0, 0, tx); + + dn->dn_next_type[tx->tx_txg & TXG_MASK] = dn->dn_type = + DMU_OTN_ZAP_METADATA; + dnode_setdirty(dn, tx); + dnode_rele(dn, FTAG); + + spa_feature_incr(dmu_objset_spa(mos), + SPA_FEATURE_EXTENSIBLE_DATASET, tx); +} + +void +dmu_object_free_zapified(objset_t *mos, uint64_t object, dmu_tx_t *tx) +{ + dnode_t *dn; + dmu_object_type_t t; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(dnode_hold(mos, object, FTAG, &dn)); + t = dn->dn_type; + dnode_rele(dn, FTAG); + + if (t == DMU_OTN_ZAP_METADATA) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_EXTENSIBLE_DATASET, tx); + } + VERIFY0(dmu_object_free(mos, object, tx)); +} + +EXPORT_SYMBOL(dmu_object_alloc); +EXPORT_SYMBOL(dmu_object_alloc_ibs); +EXPORT_SYMBOL(dmu_object_alloc_dnsize); +EXPORT_SYMBOL(dmu_object_alloc_hold); +EXPORT_SYMBOL(dmu_object_claim); +EXPORT_SYMBOL(dmu_object_claim_dnsize); +EXPORT_SYMBOL(dmu_object_reclaim); +EXPORT_SYMBOL(dmu_object_reclaim_dnsize); +EXPORT_SYMBOL(dmu_object_rm_spill); +EXPORT_SYMBOL(dmu_object_free); +EXPORT_SYMBOL(dmu_object_next); +EXPORT_SYMBOL(dmu_object_zapify); +EXPORT_SYMBOL(dmu_object_free_zapified); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, , dmu_object_alloc_chunk_shift, INT, ZMOD_RW, + "CPU-specific allocator grabs 2^N objects at once"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c new file mode 100644 index 000000000000..b1590d7dba91 --- /dev/null +++ b/module/zfs/dmu_objset.c @@ -0,0 +1,2992 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + * Copyright (c) 2013, Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2015, STRATO AG, Inc. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" + +/* + * Needed to close a window in dnode_move() that allows the objset to be freed + * before it can be safely accessed. 
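+ * + * The lock is only ever used as a barrier; dmu_objset_evict_done() + * below takes it and immediately drops it as a reader, + * + *	rw_enter(&os_lock, RW_READER); + *	rw_exit(&os_lock); + * + * which cannot complete while a dnode_move() writer holds the lock.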
+ */ +krwlock_t os_lock; + +/* + * Tunable to override the maximum number of threads for the parallelization + * of dmu_objset_find_dp, needed to speed up the import of pools with many + * datasets. + * Default is 4 times the number of leaf vdevs. + */ +int dmu_find_threads = 0; + +/* + * Backfill lower metadnode objects after this many have been freed. + * Backfilling negatively impacts object creation rates, so only do it + * if there are enough holes to fill. + */ +int dmu_rescan_dnode_threshold = 1 << DN_MAX_INDBLKSHIFT; + +static char *upgrade_tag = "upgrade_tag"; + +static void dmu_objset_find_dp_cb(void *arg); + +static void dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb); +static void dmu_objset_upgrade_stop(objset_t *os); + +void +dmu_objset_init(void) +{ + rw_init(&os_lock, NULL, RW_DEFAULT, NULL); +} + +void +dmu_objset_fini(void) +{ + rw_destroy(&os_lock); +} + +spa_t * +dmu_objset_spa(objset_t *os) +{ + return (os->os_spa); +} + +zilog_t * +dmu_objset_zil(objset_t *os) +{ + return (os->os_zil); +} + +dsl_pool_t * +dmu_objset_pool(objset_t *os) +{ + dsl_dataset_t *ds; + + if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) + return (ds->ds_dir->dd_pool); + else + return (spa_get_dsl(os->os_spa)); +} + +dsl_dataset_t * +dmu_objset_ds(objset_t *os) +{ + return (os->os_dsl_dataset); +} + +dmu_objset_type_t +dmu_objset_type(objset_t *os) +{ + return (os->os_phys->os_type); +} + +void +dmu_objset_name(objset_t *os, char *buf) +{ + dsl_dataset_name(os->os_dsl_dataset, buf); +} + +uint64_t +dmu_objset_id(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + + return (ds ? ds->ds_object : 0); +} + +uint64_t +dmu_objset_dnodesize(objset_t *os) +{ + return (os->os_dnodesize); +} + +zfs_sync_type_t +dmu_objset_syncprop(objset_t *os) +{ + return (os->os_sync); +} + +zfs_logbias_op_t +dmu_objset_logbias(objset_t *os) +{ + return (os->os_logbias); +} + +static void +checksum_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); +} + +static void +compression_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval != ZIO_COMPRESS_INHERIT); + + os->os_compress = zio_compress_select(os->os_spa, + ZIO_COMPRESS_ALGO(newval), ZIO_COMPRESS_ON); + os->os_complevel = zio_complevel_select(os->os_spa, os->os_compress, + ZIO_COMPRESS_LEVEL(newval), ZIO_COMPLEVEL_DEFAULT); +} + +static void +copies_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval > 0); + ASSERT(newval <= spa_max_replication(os->os_spa)); + + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); + + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); +} + +static void +primary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now.
+ */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + os->os_primary_cache = newval; +} + +static void +secondary_cache_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || + newval == ZFS_CACHE_METADATA); + + os->os_secondary_cache = newval; +} + +static void +sync_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_SYNC_STANDARD || newval == ZFS_SYNC_ALWAYS || + newval == ZFS_SYNC_DISABLED); + + os->os_sync = newval; + if (os->os_zil) + zil_set_sync(os->os_zil, newval); +} + +static void +redundant_metadata_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL || + newval == ZFS_REDUNDANT_METADATA_MOST); + + os->os_redundant_metadata = newval; +} + +static void +dnodesize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + switch (newval) { + case ZFS_DNSIZE_LEGACY: + os->os_dnodesize = DNODE_MIN_SIZE; + break; + case ZFS_DNSIZE_AUTO: + /* + * Choose a dnode size that will work well for most + * workloads if the user specified "auto". Future code + * improvements could dynamically select a dnode size + * based on observed workload patterns. + */ + os->os_dnodesize = DNODE_MIN_SIZE * 2; + break; + case ZFS_DNSIZE_1K: + case ZFS_DNSIZE_2K: + case ZFS_DNSIZE_4K: + case ZFS_DNSIZE_8K: + case ZFS_DNSIZE_16K: + os->os_dnodesize = newval; + break; + } +} + +static void +smallblk_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + /* + * Inheritance and range checking should have been done by now. + */ + ASSERT(newval <= SPA_OLD_MAXBLOCKSIZE); + ASSERT(ISP2(newval)); + + os->os_zpl_special_smallblock = newval; +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); +} + +static void +recordsize_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + os->os_recordsize = newval; +} + +void +dmu_objset_byteswap(void *buf, size_t size) +{ + objset_phys_t *osp = buf; + + ASSERT(size == OBJSET_PHYS_SIZE_V1 || size == OBJSET_PHYS_SIZE_V2 || + size == sizeof (objset_phys_t)); + dnode_byteswap(&osp->os_meta_dnode); + byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); + osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size >= OBJSET_PHYS_SIZE_V2) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + if (size >= sizeof (objset_phys_t)) + dnode_byteswap(&osp->os_projectused_dnode); + } +} + +/* + * The hash is a CRC-based hash of the objset_t pointer and the object number. + */ +static uint64_t +dnode_hash(const objset_t *os, uint64_t obj) +{ + uintptr_t osv = (uintptr_t)os; + uint64_t crc = -1ULL; + + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + /* + * The low 6 bits of the pointer don't have much entropy, because + * the objset_t is larger than 2^6 bytes long. 
+ */ + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 6)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; + + crc ^= (osv>>14) ^ (obj>>24); + + return (crc); +} + +static unsigned int +dnode_multilist_index_func(multilist_t *ml, void *obj) +{ + dnode_t *dn = obj; + return (dnode_hash(dn->dn_objset, dn->dn_object) % + multilist_get_num_sublists(ml)); +} + +/* + * Instantiates the objset_t in-memory structure corresponding to the + * objset_phys_t that's pointed to by the specified blkptr_t. + */ +int +dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + objset_t **osp) +{ + objset_t *os; + int i, err; + + ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); + ASSERT(!BP_IS_REDACTED(bp)); + + /* + * We need the pool config lock to get properties. + */ + ASSERT(ds == NULL || dsl_pool_config_held(ds->ds_dir->dd_pool)); + + /* + * The $ORIGIN dataset (if it exists) doesn't have an associated + * objset, so there's no reason to open it. The $ORIGIN dataset + * will not exist on pools older than SPA_VERSION_ORIGIN. + */ + if (ds != NULL && spa_get_dsl(spa) != NULL && + spa_get_dsl(spa)->dp_origin_snap != NULL) { + ASSERT3P(ds->ds_dir, !=, + spa_get_dsl(spa)->dp_origin_snap->ds_dir); + } + + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { + arc_flags_t aflags = ARC_FLAG_WAIT; + zbookmark_phys_t zb; + int size; + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) + aflags |= ARC_FLAG_L2CACHE; + + if (ds != NULL && ds->ds_dir->dd_crypto_obj != 0) { + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + ASSERT(BP_IS_AUTHENTICATED(bp)); + zio_flags |= ZIO_FLAG_RAW; + } + + dprintf_bp(os->os_rootbp, "reading %s", ""); + err = arc_read(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + if (err != 0) { + kmem_free(os, sizeof (objset_t)); + /* convert checksum errors into IO errors */ + if (err == ECKSUM) + err = SET_ERROR(EIO); + return (err); + } + + if (spa_version(spa) < SPA_VERSION_USERSPACE) + size = OBJSET_PHYS_SIZE_V1; + else if (!spa_feature_is_enabled(spa, + SPA_FEATURE_PROJECT_QUOTA)) + size = OBJSET_PHYS_SIZE_V2; + else + size = sizeof (objset_phys_t); + + /* Increase the blocksize if we are permitted. */ + if (arc_buf_size(os->os_phys_buf) < size) { + arc_buf_t *buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, size); + bzero(buf->b_data, size); + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); + os->os_phys_buf = buf; + } + + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; + } else { + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? + sizeof (objset_phys_t) : OBJSET_PHYS_SIZE_V1; + os->os_phys_buf = arc_alloc_buf(spa, &os->os_phys_buf, + ARC_BUFC_METADATA, size); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); + } + /* + * These properties will be filled in by the logic in zfs_get_zplprop() + * when they are queried for the first time. 
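+ * + * A consumer is expected to check for the sentinel before trusting + * the cached value, along these lines (a sketch, not a quote of + * zfs_get_zplprop()): + * + *	if (os->os_version == OBJSET_PROP_UNINITIALIZED) + *		... look the property up and cache it in os_version ...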
+ */ + os->os_version = OBJSET_PROP_UNINITIALIZED; + os->os_normalization = OBJSET_PROP_UNINITIALIZED; + os->os_utf8only = OBJSET_PROP_UNINITIALIZED; + os->os_casesensitivity = OBJSET_PROP_UNINITIALIZED; + + /* + * Note: the changed_cb will be called once before the register + * func returns, thus changing the checksum/compression from the + * default (fletcher2/off). Snapshots don't need to know about + * checksum/compression/copies. + */ + if (ds != NULL) { + os->os_encrypted = (ds->ds_dir->dd_crypto_obj != 0); + + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE), + primary_cache_changed_cb, os); + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE), + secondary_cache_changed_cb, os); + } + if (!ds->ds_is_snapshot) { + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_CHECKSUM), + checksum_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COMPRESSION), + compression_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_COPIES), + copies_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DEDUP), + dedup_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_LOGBIAS), + logbias_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_SYNC), + sync_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_REDUNDANT_METADATA), + redundant_metadata_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), + recordsize_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name(ZFS_PROP_DNODESIZE), + dnodesize_changed_cb, os); + } + if (err == 0) { + err = dsl_prop_register(ds, + zfs_prop_to_name( + ZFS_PROP_SPECIAL_SMALL_BLOCKS), + smallblk_changed_cb, os); + } + } + if (err != 0) { + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); + kmem_free(os, sizeof (objset_t)); + return (err); + } + } else { + /* It's the meta-objset. 
*/ + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_ON; + os->os_complevel = ZIO_COMPLEVEL_DEFAULT; + os->os_encrypted = B_FALSE; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = B_FALSE; + os->os_logbias = ZFS_LOGBIAS_LATENCY; + os->os_sync = ZFS_SYNC_STANDARD; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; + os->os_dnodesize = DNODE_MIN_SIZE; + } + + if (ds == NULL || !ds->ds_is_snapshot) + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); + + for (i = 0; i < TXG_SIZE; i++) { + os->os_dirty_dnodes[i] = multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[i]), + dnode_multilist_index_func); + } + list_create(&os->os_dnodes, sizeof (dnode_t), + offsetof(dnode_t, dn_link)); + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + list_link_init(&os->os_evicting_node); + + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_userused_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + os->os_obj_next_percpu_len = boot_ncpus; + os->os_obj_next_percpu = kmem_zalloc(os->os_obj_next_percpu_len * + sizeof (os->os_obj_next_percpu[0]), KM_SLEEP); + + dnode_special_open(os, &os->os_phys->os_meta_dnode, + DMU_META_DNODE_OBJECT, &os->os_meta_dnode); + if (OBJSET_BUF_HAS_USERUSED(os->os_phys_buf)) { + dnode_special_open(os, &os->os_phys->os_userused_dnode, + DMU_USERUSED_OBJECT, &os->os_userused_dnode); + dnode_special_open(os, &os->os_phys->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, &os->os_groupused_dnode); + if (OBJSET_BUF_HAS_PROJECTUSED(os->os_phys_buf)) + dnode_special_open(os, + &os->os_phys->os_projectused_dnode, + DMU_PROJECTUSED_OBJECT, &os->os_projectused_dnode); + } + + mutex_init(&os->os_upgrade_lock, NULL, MUTEX_DEFAULT, NULL); + + *osp = os; + return (0); +} + +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) +{ + int err = 0; + + /* + * We need the pool_config lock to manipulate the dsl_dataset_t. + * Even if the dataset is long-held, we need the pool_config lock + * to open the objset, as it needs to get properties. + */ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + + mutex_enter(&ds->ds_opening_lock); + if (ds->ds_objset == NULL) { + objset_t *os; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), + ds, dsl_dataset_get_blkptr(ds), &os); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + if (err == 0) { + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); + } + } + *osp = ds->ds_objset; + mutex_exit(&ds->ds_opening_lock); + return (err); +} + +/* + * Holds the pool while the objset is held. Therefore only one objset + * can be held at a time. + */ +int +dmu_objset_hold_flags(const char *name, boolean_t decrypt, void *tag, + objset_t **osp) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + ds_hold_flags_t flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : 0; + + err = dsl_pool_hold(name, tag, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold_flags(dp, name, flags, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + err = dmu_objset_from_ds(ds, osp); + if (err != 0) { + dsl_dataset_rele(ds, tag); + dsl_pool_rele(dp, tag); + } + + return (err); +} + +int +dmu_objset_hold(const char *name, void *tag, objset_t **osp) +{ + return (dmu_objset_hold_flags(name, B_FALSE, tag, osp)); +} + +static int +dmu_objset_own_impl(dsl_dataset_t *ds, dmu_objset_type_t type, + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) +{ + int err; + + err = dmu_objset_from_ds(ds, osp); + if (err != 0) { + return (err); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + return (SET_ERROR(EINVAL)); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + return (SET_ERROR(EROFS)); + } else if (!readonly && decrypt && + dsl_dir_incompatible_encryption_version(ds->ds_dir)) { + return (SET_ERROR(EROFS)); + } + + /* if we are decrypting, we can now check MACs in os->os_phys_buf */ + if (decrypt && arc_is_unauthenticated((*osp)->os_phys_buf)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = arc_untransform((*osp)->os_phys_buf, (*osp)->os_spa, + &zb, B_FALSE); + if (err != 0) + return (err); + + ASSERT0(arc_is_unauthenticated((*osp)->os_phys_buf)); + } + + return (0); +} + +/* + * dsl_pool must not be held when this is called. + * Upon successful return, there will be a longhold on the dataset, + * and the dsl_pool will not be held. + */ +int +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + + err = dsl_pool_hold(name, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_own(dp, name, flags, tag, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); + if (err != 0) { + dsl_dataset_disown(ds, flags, tag); + dsl_pool_rele(dp, FTAG); + return (err); + } + + /* + * User accounting requires the dataset to be decrypted and rw. + * We also don't begin user accounting during claiming to help + * speed up pool import times and to keep this txg reserved + * completely for recovery work. + */ + if ((dmu_objset_userobjspace_upgradable(*osp) || + dmu_objset_projectquota_upgradable(*osp)) && + !readonly && !dp->dp_spa->spa_claiming && + (ds->ds_dir->dd_crypto_obj == 0 || decrypt)) + dmu_objset_id_quota_upgrade(*osp); + + dsl_pool_rele(dp, FTAG); + return (0); +} + +int +dmu_objset_own_obj(dsl_pool_t *dp, uint64_t obj, dmu_objset_type_t type, + boolean_t readonly, boolean_t decrypt, void *tag, objset_t **osp) +{ + dsl_dataset_t *ds; + int err; + ds_hold_flags_t flags = (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0; + + err = dsl_dataset_own_obj(dp, obj, flags, tag, &ds); + if (err != 0) + return (err); + + err = dmu_objset_own_impl(ds, type, readonly, decrypt, tag, osp); + if (err != 0) { + dsl_dataset_disown(ds, flags, tag); + return (err); + } + + return (0); +} + +void +dmu_objset_rele_flags(objset_t *os, boolean_t decrypt, void *tag) +{ + ds_hold_flags_t flags = (decrypt) ? 
DS_HOLD_FLAG_DECRYPT : 0; + + dsl_pool_t *dp = dmu_objset_pool(os); + dsl_dataset_rele_flags(os->os_dsl_dataset, flags, tag); + dsl_pool_rele(dp, tag); +} + +void +dmu_objset_rele(objset_t *os, void *tag) +{ + dmu_objset_rele_flags(os, B_FALSE, tag); +} + +/* + * When we are called, os MUST refer to an objset associated with a dataset + * that is owned by 'tag'; that is, is held and long held by 'tag' and ds_owner + * == tag. We will then release and reacquire ownership of the dataset while + * holding the pool config_rwlock so that no intervening namespace or ownership + * changes can occur. + * + * This exists solely to accommodate zfs_ioc_userspace_upgrade()'s desire to + * release the hold on its dataset and acquire a new one on the dataset of the + * same name so that it can be partially torn down and reconstructed. + */ +void +dmu_objset_refresh_ownership(dsl_dataset_t *ds, dsl_dataset_t **newds, + boolean_t decrypt, void *tag) +{ + dsl_pool_t *dp; + char name[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY3P(ds, !=, NULL); + VERIFY3P(ds->ds_owner, ==, tag); + VERIFY(dsl_dataset_long_held(ds)); + + dsl_dataset_name(ds, name); + dp = ds->ds_dir->dd_pool; + dsl_pool_config_enter(dp, FTAG); + dsl_dataset_disown(ds, decrypt, tag); + VERIFY0(dsl_dataset_own(dp, name, + (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag, newds)); + dsl_pool_config_exit(dp, FTAG); +} + +void +dmu_objset_disown(objset_t *os, boolean_t decrypt, void *tag) +{ + /* + * Stop upgrading thread + */ + dmu_objset_upgrade_stop(os); + dsl_dataset_disown(os->os_dsl_dataset, + (decrypt) ? DS_HOLD_FLAG_DECRYPT : 0, tag); +} + +void +dmu_objset_evict_dbufs(objset_t *os) +{ + dnode_t *dn_marker; + dnode_t *dn; + + dn_marker = kmem_alloc(sizeof (dnode_t), KM_SLEEP); + + mutex_enter(&os->os_lock); + dn = list_head(&os->os_dnodes); + while (dn != NULL) { + /* + * Skip dnodes without holds. We have to do this dance + * because dnode_add_ref() only works if there is already a + * hold. If the dnode has no holds, then it has no dbufs. + */ + if (dnode_add_ref(dn, FTAG)) { + list_insert_after(&os->os_dnodes, dn, dn_marker); + mutex_exit(&os->os_lock); + + dnode_evict_dbufs(dn); + dnode_rele(dn, FTAG); + + mutex_enter(&os->os_lock); + dn = list_next(&os->os_dnodes, dn_marker); + list_remove(&os->os_dnodes, dn_marker); + } else { + dn = list_next(&os->os_dnodes, dn); + } + } + mutex_exit(&os->os_lock); + + kmem_free(dn_marker, sizeof (dnode_t)); + + if (DMU_USERUSED_DNODE(os) != NULL) { + if (DMU_PROJECTUSED_DNODE(os) != NULL) + dnode_evict_dbufs(DMU_PROJECTUSED_DNODE(os)); + dnode_evict_dbufs(DMU_GROUPUSED_DNODE(os)); + dnode_evict_dbufs(DMU_USERUSED_DNODE(os)); + } + dnode_evict_dbufs(DMU_META_DNODE(os)); +} + +/* + * Objset eviction processing is split into two pieces. + * The first marks the objset as evicting, evicts any dbufs that + * have a refcount of zero, and then queues up the objset for the + * second phase of eviction. Once os->os_dnodes has been cleared by + * dnode_buf_pageout()->dnode_destroy(), the second phase is executed. + * The second phase closes the special dnodes, dequeues the objset from + * the list of those undergoing eviction, and finally frees the objset. + * + * NOTE: Due to asynchronous eviction processing (invocation of + * dnode_buf_pageout()), it is possible for the meta dnode for the + * objset to have no holds even though os->os_dnodes is not empty.
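+ * + * A rough sketch of the resulting control flow (illustrative only, + * based on the two functions below): + * + * dmu_objset_evict(os): + * evict refcount-zero dbufs and register os as evicting; + * if os_dnodes is already empty, call dmu_objset_evict_done(os) + * directly. + * dnode_buf_pageout() -> dnode_destroy(): (asynchronous) + * when the last dnode is removed from os_dnodes, call + * dmu_objset_evict_done(os).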
+ */ +void +dmu_objset_evict(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); + + if (ds) + dsl_prop_unregister_all(ds, os); + + if (os->os_sa) + sa_tear_down(os); + + dmu_objset_evict_dbufs(os); + + mutex_enter(&os->os_lock); + spa_evicting_os_register(os->os_spa, os); + if (list_is_empty(&os->os_dnodes)) { + mutex_exit(&os->os_lock); + dmu_objset_evict_done(os); + } else { + mutex_exit(&os->os_lock); + } + + +} + +void +dmu_objset_evict_done(objset_t *os) +{ + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); + + dnode_special_close(&os->os_meta_dnode); + if (DMU_USERUSED_DNODE(os)) { + if (DMU_PROJECTUSED_DNODE(os)) + dnode_special_close(&os->os_projectused_dnode); + dnode_special_close(&os->os_userused_dnode); + dnode_special_close(&os->os_groupused_dnode); + } + zil_free(os->os_zil); + + arc_buf_destroy(os->os_phys_buf, &os->os_phys_buf); + + /* + * This is a barrier to prevent the objset from going away in + * dnode_move() until we can safely ensure that the objset is still in + * use. We consider the objset valid before the barrier and invalid + * after the barrier. + */ + rw_enter(&os_lock, RW_READER); + rw_exit(&os_lock); + + kmem_free(os->os_obj_next_percpu, + os->os_obj_next_percpu_len * sizeof (os->os_obj_next_percpu[0])); + + mutex_destroy(&os->os_lock); + mutex_destroy(&os->os_userused_lock); + mutex_destroy(&os->os_obj_lock); + mutex_destroy(&os->os_user_ptr_lock); + mutex_destroy(&os->os_upgrade_lock); + for (int i = 0; i < TXG_SIZE; i++) { + multilist_destroy(os->os_dirty_dnodes[i]); + } + spa_evicting_os_deregister(os->os_spa, os); + kmem_free(os, sizeof (objset_t)); +} + +inode_timespec_t +dmu_objset_snap_cmtime(objset_t *os) +{ + return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); +} + +objset_t * +dmu_objset_create_impl_dnstats(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, int levels, int blksz, int ibs, dmu_tx_t *tx) +{ + objset_t *os; + dnode_t *mdn; + + ASSERT(dmu_tx_is_syncing(tx)); + + if (blksz == 0) + blksz = DNODE_BLOCK_SIZE; + if (ibs == 0) + ibs = DN_MAX_INDBLKSHIFT; + + if (ds != NULL) + VERIFY0(dmu_objset_from_ds(ds, &os)); + else + VERIFY0(dmu_objset_open_impl(spa, NULL, bp, &os)); + + mdn = DMU_META_DNODE(os); + + dnode_allocate(mdn, DMU_OT_DNODE, blksz, ibs, DMU_OT_NONE, 0, + DNODE_MIN_SLOTS, tx); + + /* + * We don't want to have to increase the meta-dnode's nlevels + * later, because then we could do it in quiescing context while + * we are also accessing it in open context. + * + * This precaution is not necessary for the MOS (ds == NULL), + * because the MOS is only updated in syncing context. + * This is most fortunate: the MOS is the only objset that + * needs to be synced multiple times as spa_sync() iterates + * to convergence, so minimizing its dn_nlevels matters. + */ + if (ds != NULL) { + if (levels == 0) { + levels = 1; + + /* + * Determine the number of levels necessary for the + * meta-dnode to contain DN_MAX_OBJECT dnodes. Note + * that in order to ensure that we do not overflow + * 64 bits, there has to be a nlevels that gives us a + * number of blocks > DN_MAX_OBJECT but < 2^64. + * Therefore, (mdn->dn_indblkshift - SPA_BLKPTRSHIFT) + * (10) must be less than (64 - log2(DN_MAX_OBJECT)) + * (16). 
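+ * + * As a worked example (assuming the common case of a 16K meta-dnode + * block, 128K indirect blocks and three block pointers): each + * additional level contributes mdn->dn_indblkshift - SPA_BLKPTRSHIFT = + * 17 - 7 = 10 bits of object number, starting from + * 3 << (14 - 9) = 96 addressable dnodes at nlevels = 1, so the loop + * below settles on nlevels = 6 to cover DN_MAX_OBJECT (2^48) dnodes.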
+ */ + while ((uint64_t)mdn->dn_nblkptr << + (mdn->dn_datablkshift - DNODE_SHIFT + (levels - 1) * + (mdn->dn_indblkshift - SPA_BLKPTRSHIFT)) < + DN_MAX_OBJECT) + levels++; + } + + mdn->dn_next_nlevels[tx->tx_txg & TXG_MASK] = + mdn->dn_nlevels = levels; + } + + ASSERT(type != DMU_OST_NONE); + ASSERT(type != DMU_OST_ANY); + ASSERT(type < DMU_OST_NUMTYPES); + os->os_phys->os_type = type; + + /* + * Enable user accounting if it is enabled and this is not an + * encrypted receive. + */ + if (dmu_objset_userused_enabled(os) && + (!os->os_encrypted || !dmu_objset_is_receiving(os))) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + if (dmu_objset_userobjused_enabled(os)) { + ds->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; + os->os_phys->os_flags |= + OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + } + if (dmu_objset_projectquota_enabled(os)) { + ds->ds_feature_activation[ + SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; + os->os_phys->os_flags |= + OBJSET_FLAG_PROJECTQUOTA_COMPLETE; + } + os->os_flags = os->os_phys->os_flags; + } + + dsl_dataset_dirty(ds, tx); + + return (os); +} + +/* called from dsl for meta-objset */ +objset_t * +dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, + dmu_objset_type_t type, dmu_tx_t *tx) +{ + return (dmu_objset_create_impl_dnstats(spa, ds, bp, type, 0, 0, 0, tx)); +} + +typedef struct dmu_objset_create_arg { + const char *doca_name; + cred_t *doca_cred; + proc_t *doca_proc; + void (*doca_userfunc)(objset_t *os, void *arg, + cred_t *cr, dmu_tx_t *tx); + void *doca_userarg; + dmu_objset_type_t doca_type; + uint64_t doca_flags; + dsl_crypto_params_t *doca_dcp; +} dmu_objset_create_arg_t; + +/*ARGSUSED*/ +static int +dmu_objset_create_check(void *arg, dmu_tx_t *tx) +{ + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + dsl_dataset_t *parentds; + objset_t *parentos; + const char *tail; + int error; + + if (strchr(doca->doca_name, '@') != NULL) + return (SET_ERROR(EINVAL)); + + if (strlen(doca->doca_name) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + if (dataset_nestcheck(doca->doca_name) != 0) + return (SET_ERROR(ENAMETOOLONG)); + + error = dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EEXIST)); + } + + error = dmu_objset_create_crypt_check(pdd, doca->doca_dcp, NULL); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (error); + } + + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred, doca->doca_proc); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (error); + } + + /* can't create below anything but filesystems (eg. 
no ZVOLs) */ + error = dsl_dataset_hold_obj(pdd->dd_pool, + dsl_dir_phys(pdd)->dd_head_dataset_obj, FTAG, &parentds); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (error); + } + error = dmu_objset_from_ds(parentds, &parentos); + if (error != 0) { + dsl_dataset_rele(parentds, FTAG); + dsl_dir_rele(pdd, FTAG); + return (error); + } + if (dmu_objset_type(parentos) != DMU_OST_ZFS) { + dsl_dataset_rele(parentds, FTAG); + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); + } + dsl_dataset_rele(parentds, FTAG); + dsl_dir_rele(pdd, FTAG); + + return (error); +} + +static void +dmu_objset_create_sync(void *arg, dmu_tx_t *tx) +{ + dmu_objset_create_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *ds; + uint64_t obj; + blkptr_t *bp; + objset_t *os; + zio_t *rzio; + + VERIFY0(dsl_dir_hold(dp, doca->doca_name, FTAG, &pdd, &tail)); + + obj = dsl_dataset_create_sync(pdd, tail, NULL, doca->doca_flags, + doca->doca_cred, doca->doca_dcp, tx); + + VERIFY0(dsl_dataset_hold_obj_flags(pdd->dd_pool, obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bp = dsl_dataset_get_blkptr(ds); + os = dmu_objset_create_impl(spa, ds, bp, doca->doca_type, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + if (doca->doca_userfunc != NULL) { + doca->doca_userfunc(os, doca->doca_userarg, + doca->doca_cred, tx); + } + + /* + * The doca_userfunc() may write out some data that needs to be + * encrypted if the dataset is encrypted (specifically the root + * directory). This data must be written out before the encryption + * key mapping is removed by dsl_dataset_rele_flags(). Force the + * I/O to occur immediately by invoking the relevant sections of + * dsl_pool_sync(). 
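+ * + * In effect, the block below is a miniature dsl_pool_sync(): sync the + * dataset, run the user accounting updates, sync once more to push out + * anything the accounting dirtied, and finish with + * dsl_dataset_sync_done().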
+ */ + if (os->os_encrypted) { + dsl_dataset_t *tmpds = NULL; + boolean_t need_sync_done = B_FALSE; + + mutex_enter(&ds->ds_lock); + ds->ds_owner = FTAG; + mutex_exit(&ds->ds_lock); + + rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, + tx->tx_txg); + if (tmpds != NULL) { + dsl_dataset_sync(ds, rzio, tx); + need_sync_done = B_TRUE; + } + VERIFY0(zio_wait(rzio)); + + dmu_objset_do_userquota_updates(os, tx); + taskq_wait(dp->dp_sync_taskq); + if (txg_list_member(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(spa, ds->ds_key_mapping, ds); + } + + rzio = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + tmpds = txg_list_remove_this(&dp->dp_dirty_datasets, ds, + tx->tx_txg); + if (tmpds != NULL) { + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, rzio, tx); + } + VERIFY0(zio_wait(rzio)); + + if (need_sync_done) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(spa, ds->ds_key_mapping, ds); + dsl_dataset_sync_done(ds, tx); + } + + mutex_enter(&ds->ds_lock); + ds->ds_owner = NULL; + mutex_exit(&ds->ds_lock); + } + + spa_history_log_internal_ds(ds, "create", tx, " "); + + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + dsl_dir_rele(pdd, FTAG); +} + +int +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, + dsl_crypto_params_t *dcp, dmu_objset_create_sync_func_t func, void *arg) +{ + dmu_objset_create_arg_t doca; + dsl_crypto_params_t tmp_dcp = { 0 }; + + doca.doca_name = name; + doca.doca_cred = CRED(); + doca.doca_proc = curproc; + doca.doca_flags = flags; + doca.doca_userfunc = func; + doca.doca_userarg = arg; + doca.doca_type = type; + + /* + * Some callers (mostly for testing) do not provide a dcp on their + * own but various code inside the sync task will require it to be + * allocated. Rather than adding NULL checks throughout this code + * or adding dummy dcp's to all of the callers we simply create a + * dummy one here and use that. This zero dcp will have the same + * effect as asking for inheritance of all encryption params. + */ + doca.doca_dcp = (dcp != NULL) ? dcp : &tmp_dcp; + + int rv = dsl_sync_task(name, + dmu_objset_create_check, dmu_objset_create_sync, &doca, + 6, ZFS_SPACE_CHECK_NORMAL); + + if (rv == 0) + zvol_create_minor(name); + return (rv); +} + +typedef struct dmu_objset_clone_arg { + const char *doca_clone; + const char *doca_origin; + cred_t *doca_cred; + proc_t *doca_proc; +} dmu_objset_clone_arg_t; + +/*ARGSUSED*/ +static int +dmu_objset_clone_check(void *arg, dmu_tx_t *tx) +{ + dmu_objset_clone_arg_t *doca = arg; + dsl_dir_t *pdd; + const char *tail; + int error; + dsl_dataset_t *origin; + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (strchr(doca->doca_clone, '@') != NULL) + return (SET_ERROR(EINVAL)); + + if (strlen(doca->doca_clone) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + error = dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail); + if (error != 0) + return (error); + if (tail == NULL) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EEXIST)); + } + + error = dsl_fs_ss_limit_check(pdd, 1, ZFS_PROP_FILESYSTEM_LIMIT, NULL, + doca->doca_cred, doca->doca_proc); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EDQUOT)); + } + + error = dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin); + if (error != 0) { + dsl_dir_rele(pdd, FTAG); + return (error); + } + + /* You can only clone snapshots, not the head datasets. 
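+ * (For example, "pool/fs@snap" is a valid clone origin, while the head + * dataset "pool/fs" itself is not.)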
*/ + if (!origin->ds_is_snapshot) { + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); + return (SET_ERROR(EINVAL)); + } + + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); + + return (0); +} + +static void +dmu_objset_clone_sync(void *arg, dmu_tx_t *tx) +{ + dmu_objset_clone_arg_t *doca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *pdd; + const char *tail; + dsl_dataset_t *origin, *ds; + uint64_t obj; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY0(dsl_dir_hold(dp, doca->doca_clone, FTAG, &pdd, &tail)); + VERIFY0(dsl_dataset_hold(dp, doca->doca_origin, FTAG, &origin)); + + obj = dsl_dataset_create_sync(pdd, tail, origin, 0, + doca->doca_cred, NULL, tx); + + VERIFY0(dsl_dataset_hold_obj(pdd->dd_pool, obj, FTAG, &ds)); + dsl_dataset_name(origin, namebuf); + spa_history_log_internal_ds(ds, "clone", tx, + "origin=%s (%llu)", namebuf, (u_longlong_t)origin->ds_object); + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(pdd, FTAG); +} + +int +dmu_objset_clone(const char *clone, const char *origin) +{ + dmu_objset_clone_arg_t doca; + + doca.doca_clone = clone; + doca.doca_origin = origin; + doca.doca_cred = CRED(); + doca.doca_proc = curproc; + + int rv = dsl_sync_task(clone, + dmu_objset_clone_check, dmu_objset_clone_sync, &doca, + 6, ZFS_SPACE_CHECK_NORMAL); + + if (rv == 0) + zvol_create_minor(clone); + + return (rv); +} + +int +dmu_objset_snapshot_one(const char *fsname, const char *snapname) +{ + int err; + char *longsnap = kmem_asprintf("%s@%s", fsname, snapname); + nvlist_t *snaps = fnvlist_alloc(); + + fnvlist_add_boolean(snaps, longsnap); + kmem_strfree(longsnap); + err = dsl_dataset_snapshot(snaps, NULL, NULL); + fnvlist_free(snaps); + return (err); +} + +static void +dmu_objset_upgrade_task_cb(void *data) +{ + objset_t *os = data; + + mutex_enter(&os->os_upgrade_lock); + os->os_upgrade_status = EINTR; + if (!os->os_upgrade_exit) { + mutex_exit(&os->os_upgrade_lock); + + os->os_upgrade_status = os->os_upgrade_cb(os); + mutex_enter(&os->os_upgrade_lock); + } + os->os_upgrade_exit = B_TRUE; + os->os_upgrade_id = 0; + mutex_exit(&os->os_upgrade_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); +} + +static void +dmu_objset_upgrade(objset_t *os, dmu_objset_upgrade_cb_t cb) +{ + if (os->os_upgrade_id != 0) + return; + + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + dsl_dataset_long_hold(dmu_objset_ds(os), upgrade_tag); + + mutex_enter(&os->os_upgrade_lock); + if (os->os_upgrade_id == 0 && os->os_upgrade_status == 0) { + os->os_upgrade_exit = B_FALSE; + os->os_upgrade_cb = cb; + os->os_upgrade_id = taskq_dispatch( + os->os_spa->spa_upgrade_taskq, + dmu_objset_upgrade_task_cb, os, TQ_SLEEP); + if (os->os_upgrade_id == TASKQID_INVALID) { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); + os->os_upgrade_status = ENOMEM; + } + } + mutex_exit(&os->os_upgrade_lock); +} + +static void +dmu_objset_upgrade_stop(objset_t *os) +{ + mutex_enter(&os->os_upgrade_lock); + os->os_upgrade_exit = B_TRUE; + if (os->os_upgrade_id != 0) { + taskqid_t id = os->os_upgrade_id; + + os->os_upgrade_id = 0; + mutex_exit(&os->os_upgrade_lock); + + if ((taskq_cancel_id(os->os_spa->spa_upgrade_taskq, id)) == 0) { + dsl_dataset_long_rele(dmu_objset_ds(os), upgrade_tag); + } + txg_wait_synced(os->os_spa->spa_dsl_pool, 0); + } else { + mutex_exit(&os->os_upgrade_lock); + } +} + +static void +dmu_objset_sync_dnodes(multilist_sublist_t *list, dmu_tx_t *tx) +{ + dnode_t *dn; + + while ((dn = multilist_sublist_head(list)) != NULL) { + 
ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(dn->dn_dbuf->db_data_pending); + /* + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it outside dnode_sync(). + */ + dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; + ASSERT(dn->dn_zio); + + ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); + multilist_sublist_remove(list, dn); + + /* + * If we are not doing useraccounting (os_synced_dnodes == NULL) + * we are done with this dnode for this txg. Unset dn_dirty_txg + * if later txgs aren't dirtying it so that future holders do + * not get a stale value. Otherwise, we will do this in + * userquota_updates_task() when processing has completely + * finished for this txg. + */ + multilist_t *newlist = dn->dn_objset->os_synced_dnodes; + if (newlist != NULL) { + (void) dnode_add_ref(dn, newlist); + multilist_insert(newlist, dn); + } else { + mutex_enter(&dn->dn_mtx); + if (dn->dn_dirty_txg == tx->tx_txg) + dn->dn_dirty_txg = 0; + mutex_exit(&dn->dn_mtx); + } + + dnode_sync(dn, tx); + } +} + +/* ARGSUSED */ +static void +dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + objset_t *os = arg; + dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; + uint64_t fill = 0; + + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_OBJSET); + ASSERT0(BP_GET_LEVEL(bp)); + + /* + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group/project accounting objects). + */ + for (int i = 0; i < dnp->dn_nblkptr; i++) + fill += BP_GET_FILL(&dnp->dn_blkptr[i]); + + BP_SET_FILL(bp, fill); + + if (os->os_dsl_dataset != NULL) + rrw_enter(&os->os_dsl_dataset->ds_bp_rwlock, RW_WRITER, FTAG); + *os->os_rootbp = *bp; + if (os->os_dsl_dataset != NULL) + rrw_exit(&os->os_dsl_dataset->ds_bp_rwlock, FTAG); +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } + kmem_free(bp, sizeof (*bp)); +} + +typedef struct sync_dnodes_arg { + multilist_t *sda_list; + int sda_sublist_idx; + multilist_t *sda_newlist; + dmu_tx_t *sda_tx; +} sync_dnodes_arg_t; + +static void +sync_dnodes_task(void *arg) +{ + sync_dnodes_arg_t *sda = arg; + + multilist_sublist_t *ms = + multilist_sublist_lock(sda->sda_list, sda->sda_sublist_idx); + + dmu_objset_sync_dnodes(ms, sda->sda_tx); + + multilist_sublist_unlock(ms); + + kmem_free(sda, sizeof (*sda)); +} + + +/* called from dsl */ +void +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) +{ + int txgoff; + zbookmark_phys_t zb; + zio_prop_t zp; + zio_t *zio; + list_t *list; + dbuf_dirty_record_t *dr; + int num_sublists; + multilist_t *ml; + blkptr_t *blkptr_copy = kmem_alloc(sizeof (*os->os_rootbp), KM_SLEEP); + *blkptr_copy = *os->os_rootbp; + + dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); + + ASSERT(dmu_tx_is_syncing(tx)); + /* XXX the write_done callback should really give us the tx... */ + os->os_synctx = tx; + + if (os->os_dsl_dataset == NULL) { + /* + * This is the MOS. 
If we have upgraded, + * spa_max_replication() could change, so reset + * os_copies here. + */ + os->os_copies = spa_max_replication(os->os_spa); + } + + /* + * Create the root block IO + */ + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + arc_release(os->os_phys_buf, &os->os_phys_buf); + + dmu_write_policy(os, NULL, 0, 0, &zp); + + /* + * If we are either claiming the ZIL or doing a raw receive, write + * out the os_phys_buf raw. Neither of these actions will affect the + * MAC at this point. + */ + if (os->os_raw_receive || + os->os_next_write_raw[tx->tx_txg & TXG_MASK]) { + ASSERT(os->os_encrypted); + arc_convert_to_raw(os->os_phys_buf, + os->os_dsl_dataset->ds_object, ZFS_HOST_BYTEORDER, + DMU_OT_OBJSET, NULL, NULL, NULL); + } + + zio = arc_write(pio, os->os_spa, tx->tx_txg, + blkptr_copy, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), + &zp, dmu_objset_write_ready, NULL, NULL, dmu_objset_write_done, + os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + + /* + * Sync special dnodes - the parent IO for the sync is the root block + */ + DMU_META_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_META_DNODE(os), tx); + + os->os_phys->os_flags = os->os_flags; + + if (DMU_USERUSED_DNODE(os) && + DMU_USERUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_USERUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_USERUSED_DNODE(os), tx); + DMU_GROUPUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_GROUPUSED_DNODE(os), tx); + } + + if (DMU_PROJECTUSED_DNODE(os) && + DMU_PROJECTUSED_DNODE(os)->dn_type != DMU_OT_NONE) { + DMU_PROJECTUSED_DNODE(os)->dn_zio = zio; + dnode_sync(DMU_PROJECTUSED_DNODE(os), tx); + } + + txgoff = tx->tx_txg & TXG_MASK; + + if (dmu_objset_userused_enabled(os) && + (!os->os_encrypted || !dmu_objset_is_receiving(os))) { + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. But it may already + * exist because we call dsl_dataset_sync() twice per txg. + */ + if (os->os_synced_dnodes == NULL) { + os->os_synced_dnodes = + multilist_create(sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff]), + dnode_multilist_index_func); + } else { + ASSERT3U(os->os_synced_dnodes->ml_offset, ==, + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + } + + ml = os->os_dirty_dnodes[txgoff]; + num_sublists = multilist_get_num_sublists(ml); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(ml, i)) + continue; + sync_dnodes_arg_t *sda = kmem_alloc(sizeof (*sda), KM_SLEEP); + sda->sda_list = ml; + sda->sda_sublist_idx = i; + sda->sda_tx = tx; + (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, + sync_dnodes_task, sda, 0); + /* callback frees sda */ + } + taskq_wait(dmu_objset_pool(os)->dp_sync_taskq); + + list = &DMU_META_DNODE(os)->dn_dirty_records[txgoff]; + while ((dr = list_head(list)) != NULL) { + ASSERT0(dr->dr_dbuf->db_level); + list_remove(list, dr); + zio_nowait(dr->dr_zio); + } + + /* Enable dnode backfill if enough objects have been freed. */ + if (os->os_freed_dnodes >= dmu_rescan_dnode_threshold) { + os->os_rescan_dnodes = B_TRUE; + os->os_freed_dnodes = 0; + } + + /* + * Free intent log blocks up to this tx.
+ */ + zil_sync(os->os_zil, tx); + os->os_phys->os_zil_header = os->os_zil_header; + zio_nowait(zio); +} + +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!multilist_is_empty(os->os_dirty_dnodes[txg & TXG_MASK])); +} + +static file_info_cb_t *file_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, file_info_cb_t *cb) +{ + file_cbs[ost] = cb; +} + +int +dmu_get_file_info(objset_t *os, dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zfi) +{ + file_info_cb_t *cb = file_cbs[os->os_phys->os_type]; + if (cb == NULL) + return (EINVAL); + return (cb(bonustype, data, zfi)); +} + +boolean_t +dmu_objset_userused_enabled(objset_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + file_cbs[os->os_phys->os_type] != NULL && + DMU_USERUSED_DNODE(os) != NULL); +} + +boolean_t +dmu_objset_userobjused_enabled(objset_t *os) +{ + return (dmu_objset_userused_enabled(os) && + spa_feature_is_enabled(os->os_spa, SPA_FEATURE_USEROBJ_ACCOUNTING)); +} + +boolean_t +dmu_objset_projectquota_enabled(objset_t *os) +{ + return (file_cbs[os->os_phys->os_type] != NULL && + DMU_PROJECTUSED_DNODE(os) != NULL && + spa_feature_is_enabled(os->os_spa, SPA_FEATURE_PROJECT_QUOTA)); +} + +typedef struct userquota_node { + /* must be in the first field, see userquota_update_cache() */ + char uqn_id[20 + DMU_OBJACCT_PREFIX_LEN]; + int64_t uqn_delta; + avl_node_t uqn_node; +} userquota_node_t; + +typedef struct userquota_cache { + avl_tree_t uqc_user_deltas; + avl_tree_t uqc_group_deltas; + avl_tree_t uqc_project_deltas; +} userquota_cache_t; + +static int +userquota_compare(const void *l, const void *r) +{ + const userquota_node_t *luqn = l; + const userquota_node_t *ruqn = r; + int rv; + + /* + * NB: can only access uqn_id because userquota_update_cache() doesn't + * pass in an entire userquota_node_t. + */ + rv = strcmp(luqn->uqn_id, ruqn->uqn_id); + + return (TREE_ISIGN(rv)); +} + +static void +do_userquota_cacheflush(objset_t *os, userquota_cache_t *cache, dmu_tx_t *tx) +{ + void *cookie; + userquota_node_t *uqn; + + ASSERT(dmu_tx_is_syncing(tx)); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_user_deltas, + &cookie)) != NULL) { + /* + * os_userused_lock protects against concurrent calls to + * zap_increment(). It's needed because zap_increment() + * is not thread-safe (i.e. not atomic).
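+ * Batching the deltas in the AVL caches above means the ZAP (and + * this lock) is touched only once per unique ID, rather than once + * per synced dnode.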
+ */ + mutex_enter(&os->os_userused_lock); + VERIFY0(zap_increment(os, DMU_USERUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + mutex_exit(&os->os_userused_lock); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_user_deltas); + + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_group_deltas, + &cookie)) != NULL) { + mutex_enter(&os->os_userused_lock); + VERIFY0(zap_increment(os, DMU_GROUPUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + mutex_exit(&os->os_userused_lock); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_group_deltas); + + if (dmu_objset_projectquota_enabled(os)) { + cookie = NULL; + while ((uqn = avl_destroy_nodes(&cache->uqc_project_deltas, + &cookie)) != NULL) { + mutex_enter(&os->os_userused_lock); + VERIFY0(zap_increment(os, DMU_PROJECTUSED_OBJECT, + uqn->uqn_id, uqn->uqn_delta, tx)); + mutex_exit(&os->os_userused_lock); + kmem_free(uqn, sizeof (*uqn)); + } + avl_destroy(&cache->uqc_project_deltas); + } +} + +static void +userquota_update_cache(avl_tree_t *avl, const char *id, int64_t delta) +{ + userquota_node_t *uqn; + avl_index_t idx; + + ASSERT(strlen(id) < sizeof (uqn->uqn_id)); + /* + * Use id directly for searching because uqn_id is the first field of + * userquota_node_t and fields after uqn_id won't be accessed in + * avl_find(). + */ + uqn = avl_find(avl, (const void *)id, &idx); + if (uqn == NULL) { + uqn = kmem_zalloc(sizeof (*uqn), KM_SLEEP); + strlcpy(uqn->uqn_id, id, sizeof (uqn->uqn_id)); + avl_insert(avl, uqn, idx); + } + uqn->uqn_delta += delta; +} + +static void +do_userquota_update(objset_t *os, userquota_cache_t *cache, uint64_t used, + uint64_t flags, uint64_t user, uint64_t group, uint64_t project, + boolean_t subtract) +{ + if (flags & DNODE_FLAG_USERUSED_ACCOUNTED) { + int64_t delta = DNODE_MIN_SIZE + used; + char name[20]; + + if (subtract) + delta = -delta; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)user); + userquota_update_cache(&cache->uqc_user_deltas, name, delta); + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)group); + userquota_update_cache(&cache->uqc_group_deltas, name, delta); + + if (dmu_objset_projectquota_enabled(os)) { + (void) snprintf(name, sizeof (name), "%llx", + (longlong_t)project); + userquota_update_cache(&cache->uqc_project_deltas, + name, delta); + } + } +} + +static void +do_userobjquota_update(objset_t *os, userquota_cache_t *cache, uint64_t flags, + uint64_t user, uint64_t group, uint64_t project, boolean_t subtract) +{ + if (flags & DNODE_FLAG_USEROBJUSED_ACCOUNTED) { + char name[20 + DMU_OBJACCT_PREFIX_LEN]; + int delta = subtract ? 
-1 : 1; + + (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", + (longlong_t)user); + userquota_update_cache(&cache->uqc_user_deltas, name, delta); + + (void) snprintf(name, sizeof (name), DMU_OBJACCT_PREFIX "%llx", + (longlong_t)group); + userquota_update_cache(&cache->uqc_group_deltas, name, delta); + + if (dmu_objset_projectquota_enabled(os)) { + (void) snprintf(name, sizeof (name), + DMU_OBJACCT_PREFIX "%llx", (longlong_t)project); + userquota_update_cache(&cache->uqc_project_deltas, + name, delta); + } + } +} + +typedef struct userquota_updates_arg { + objset_t *uua_os; + int uua_sublist_idx; + dmu_tx_t *uua_tx; +} userquota_updates_arg_t; + +static void +userquota_updates_task(void *arg) +{ + userquota_updates_arg_t *uua = arg; + objset_t *os = uua->uua_os; + dmu_tx_t *tx = uua->uua_tx; + dnode_t *dn; + userquota_cache_t cache = { { 0 } }; + + multilist_sublist_t *list = + multilist_sublist_lock(os->os_synced_dnodes, uua->uua_sublist_idx); + + ASSERT(multilist_sublist_head(list) == NULL || + dmu_objset_userused_enabled(os)); + avl_create(&cache.uqc_user_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + avl_create(&cache.uqc_group_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, uqn_node)); + if (dmu_objset_projectquota_enabled(os)) + avl_create(&cache.uqc_project_deltas, userquota_compare, + sizeof (userquota_node_t), offsetof(userquota_node_t, + uqn_node)); + + while ((dn = multilist_sublist_head(list)) != NULL) { + int flags; + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + flags = dn->dn_id_flags; + ASSERT(flags); + if (flags & DN_ID_OLD_EXIST) { + do_userquota_update(os, &cache, dn->dn_oldused, + dn->dn_oldflags, dn->dn_olduid, dn->dn_oldgid, + dn->dn_oldprojid, B_TRUE); + do_userobjquota_update(os, &cache, dn->dn_oldflags, + dn->dn_olduid, dn->dn_oldgid, + dn->dn_oldprojid, B_TRUE); + } + if (flags & DN_ID_NEW_EXIST) { + do_userquota_update(os, &cache, + DN_USED_BYTES(dn->dn_phys), dn->dn_phys->dn_flags, + dn->dn_newuid, dn->dn_newgid, + dn->dn_newprojid, B_FALSE); + do_userobjquota_update(os, &cache, + dn->dn_phys->dn_flags, dn->dn_newuid, dn->dn_newgid, + dn->dn_newprojid, B_FALSE); + } + + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + if (dn->dn_id_flags & DN_ID_NEW_EXIST) { + dn->dn_olduid = dn->dn_newuid; + dn->dn_oldgid = dn->dn_newgid; + dn->dn_oldprojid = dn->dn_newprojid; + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (dn->dn_bonuslen == 0) + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + else + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + dn->dn_id_flags &= ~(DN_ID_NEW_EXIST); + if (dn->dn_dirty_txg == spa_syncing_txg(os->os_spa)) + dn->dn_dirty_txg = 0; + mutex_exit(&dn->dn_mtx); + + multilist_sublist_remove(list, dn); + dnode_rele(dn, os->os_synced_dnodes); + } + do_userquota_cacheflush(os, &cache, tx); + multilist_sublist_unlock(list); + kmem_free(uua, sizeof (*uua)); +} + +void +dmu_objset_do_userquota_updates(objset_t *os, dmu_tx_t *tx) +{ + int num_sublists; + + if (!dmu_objset_userused_enabled(os)) + return; + + /* + * If this is a raw receive just return and handle accounting + * later when we have the keys loaded. We also don't do user + * accounting during claiming since the datasets are not owned + * for the duration of claiming and this txg should only be + * used for recovery. 
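+ * The deferred accounting is completed later by + * dmu_objset_id_quota_upgrade() once the dataset is owned writable + * with its keys available (see dmu_objset_own()).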
+ */ + if (os->os_encrypted && dmu_objset_is_receiving(os)) + return; + + if (tx->tx_txg <= os->os_spa->spa_claim_max_txg) + return; + + /* Allocate the user/group/project used objects if necessary. */ + if (DMU_USERUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY0(zap_create_claim(os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY0(zap_create_claim(os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + if (dmu_objset_projectquota_enabled(os) && + DMU_PROJECTUSED_DNODE(os)->dn_type == DMU_OT_NONE) { + VERIFY0(zap_create_claim(os, DMU_PROJECTUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + num_sublists = multilist_get_num_sublists(os->os_synced_dnodes); + for (int i = 0; i < num_sublists; i++) { + if (multilist_sublist_is_empty_idx(os->os_synced_dnodes, i)) + continue; + userquota_updates_arg_t *uua = + kmem_alloc(sizeof (*uua), KM_SLEEP); + uua->uua_os = os; + uua->uua_sublist_idx = i; + uua->uua_tx = tx; + /* note: caller does taskq_wait() */ + (void) taskq_dispatch(dmu_objset_pool(os)->dp_sync_taskq, + userquota_updates_task, uua, 0); + /* callback frees uua */ + } +} + +/* + * Returns a pointer to data to find uid/gid from + * + * If a dirty record for transaction group that is syncing can't + * be found then NULL is returned. In the NULL case it is assumed + * the uid/gid aren't changing. + */ +static void * +dmu_objset_userquota_find_data(dmu_buf_impl_t *db, dmu_tx_t *tx) +{ + dbuf_dirty_record_t *dr; + void *data; + + if (db->db_dirtycnt == 0) + return (db->db.db_data); /* Nothing is changing */ + + dr = dbuf_find_dirty_eq(db, tx->tx_txg); + + if (dr == NULL) { + data = NULL; + } else { + dnode_t *dn; + + DB_DNODE_ENTER(dr->dr_dbuf); + dn = DB_DNODE(dr->dr_dbuf); + + if (dn->dn_bonuslen == 0 && + dr->dr_dbuf->db_blkid == DMU_SPILL_BLKID) + data = dr->dt.dl.dr_data->b_data; + else + data = dr->dt.dl.dr_data; + + DB_DNODE_EXIT(dr->dr_dbuf); + } + + return (data); +} + +void +dmu_objset_userquota_get_ids(dnode_t *dn, boolean_t before, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + void *data = NULL; + dmu_buf_impl_t *db = NULL; + int flags = dn->dn_id_flags; + int error; + boolean_t have_spill = B_FALSE; + + if (!dmu_objset_userused_enabled(dn->dn_objset)) + return; + + /* + * Raw receives introduce a problem with user accounting. Raw + * receives cannot update the user accounting info because the + * user ids and the sizes are encrypted. To guarantee that we + * never end up with bad user accounting, we simply disable it + * during raw receives. We also disable this for normal receives + * so that an incremental raw receive may be done on top of an + * existing non-raw receive. + */ + if (os->os_encrypted && dmu_objset_is_receiving(os)) + return; + + if (before && (flags & (DN_ID_CHKED_BONUS|DN_ID_OLD_EXIST| + DN_ID_CHKED_SPILL))) + return; + + if (before && dn->dn_bonuslen != 0) + data = DN_BONUS(dn->dn_phys); + else if (!before && dn->dn_bonuslen != 0) { + if (dn->dn_bonus) { + db = dn->dn_bonus; + mutex_enter(&db->db_mtx); + data = dmu_objset_userquota_find_data(db, tx); + } else { + data = DN_BONUS(dn->dn_phys); + } + } else if (dn->dn_bonuslen == 0 && dn->dn_bonustype == DMU_OT_SA) { + int rf = 0; + + if (RW_WRITE_HELD(&dn->dn_struct_rwlock)) + rf |= DB_RF_HAVESTRUCT; + error = dmu_spill_hold_by_dnode(dn, + rf | DB_RF_MUST_SUCCEED, + FTAG, (dmu_buf_t **)&db); + ASSERT(error == 0); + mutex_enter(&db->db_mtx); + data = (before) ? 
db->db.db_data : + dmu_objset_userquota_find_data(db, tx); + have_spill = B_TRUE; + } else { + mutex_enter(&dn->dn_mtx); + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + mutex_exit(&dn->dn_mtx); + return; + } + + /* + * Must always call the callback in case the object + * type has changed and that type isn't an object type to track + */ + zfs_file_info_t zfi; + error = file_cbs[os->os_phys->os_type](dn->dn_bonustype, data, &zfi); + + if (before) { + ASSERT(data); + dn->dn_olduid = zfi.zfi_user; + dn->dn_oldgid = zfi.zfi_group; + dn->dn_oldprojid = zfi.zfi_project; + } else if (data) { + dn->dn_newuid = zfi.zfi_user; + dn->dn_newgid = zfi.zfi_group; + dn->dn_newprojid = zfi.zfi_project; + } + + /* + * Preserve existing uid/gid when the callback can't determine + * what the new uid/gid are and the callback returned EEXIST. + * The EEXIST error tells us to just use the existing uid/gid. + * If we don't know what the old values are then just assign + * them to 0, since that is a new file being created. + */ + if (!before && data == NULL && error == EEXIST) { + if (flags & DN_ID_OLD_EXIST) { + dn->dn_newuid = dn->dn_olduid; + dn->dn_newgid = dn->dn_oldgid; + dn->dn_newprojid = dn->dn_oldprojid; + } else { + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_newprojid = ZFS_DEFAULT_PROJID; + } + error = 0; + } + + if (db) + mutex_exit(&db->db_mtx); + + mutex_enter(&dn->dn_mtx); + if (error == 0 && before) + dn->dn_id_flags |= DN_ID_OLD_EXIST; + if (error == 0 && !before) + dn->dn_id_flags |= DN_ID_NEW_EXIST; + + if (have_spill) { + dn->dn_id_flags |= DN_ID_CHKED_SPILL; + } else { + dn->dn_id_flags |= DN_ID_CHKED_BONUS; + } + mutex_exit(&dn->dn_mtx); + if (have_spill) + dmu_buf_rele((dmu_buf_t *)db, FTAG); +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +boolean_t +dmu_objset_userobjspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); +} + +boolean_t +dmu_objset_projectquota_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_PROJECTQUOTA_COMPLETE); +} + +static int +dmu_objset_space_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. 
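+ * + * Each iteration of the loop below therefore reduces to: hold the + * object's bonus buffer, assign a tx, mark the buffer dirty, and + * commit; the dirtied dnode then picks up its accounting in the + * next sync.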
+ */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + mutex_enter(&os->os_upgrade_lock); + if (os->os_upgrade_exit) + err = SET_ERROR(EINTR); + mutex_exit(&os->os_upgrade_lock); + if (err != 0) + return (err); + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (SET_ERROR(EINTR)); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr != 0) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr != 0) { + dmu_buf_rele(db, FTAG); + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + return (0); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (dmu_objset_is_snapshot(os)) + return (SET_ERROR(EINVAL)); + if (!dmu_objset_userused_enabled(os)) + return (SET_ERROR(ENOTSUP)); + + err = dmu_objset_space_upgrade(os); + if (err) + return (err); + + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +static int +dmu_objset_id_quota_upgrade_cb(objset_t *os) +{ + int err = 0; + + if (dmu_objset_userobjspace_present(os) && + dmu_objset_projectquota_present(os)) + return (0); + if (dmu_objset_is_snapshot(os)) + return (SET_ERROR(EINVAL)); + if (!dmu_objset_userobjused_enabled(os)) + return (SET_ERROR(ENOTSUP)); + if (!dmu_objset_projectquota_enabled(os) && + dmu_objset_userobjspace_present(os)) + return (SET_ERROR(ENOTSUP)); + + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_USEROBJ_ACCOUNTING] = (void *)B_TRUE; + if (dmu_objset_projectquota_enabled(os)) + dmu_objset_ds(os)->ds_feature_activation[ + SPA_FEATURE_PROJECT_QUOTA] = (void *)B_TRUE; + + err = dmu_objset_space_upgrade(os); + if (err) + return (err); + + os->os_flags |= OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE; + if (dmu_objset_projectquota_enabled(os)) + os->os_flags |= OBJSET_FLAG_PROJECTQUOTA_COMPLETE; + + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + +void +dmu_objset_id_quota_upgrade(objset_t *os) +{ + dmu_objset_upgrade(os, dmu_objset_id_quota_upgrade_cb); +} + +boolean_t +dmu_objset_userobjspace_upgradable(objset_t *os) +{ + return (dmu_objset_type(os) == DMU_OST_ZFS && + !dmu_objset_is_snapshot(os) && + dmu_objset_userobjused_enabled(os) && + !dmu_objset_userobjspace_present(os) && + spa_writeable(dmu_objset_spa(os))); +} + +boolean_t +dmu_objset_projectquota_upgradable(objset_t *os) +{ + return (dmu_objset_type(os) == DMU_OST_ZFS && + !dmu_objset_is_snapshot(os) && + dmu_objset_projectquota_enabled(os) && + !dmu_objset_projectquota_present(os) && + spa_writeable(dmu_objset_spa(os))); +} + +void +dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, + usedobjsp, availobjsp); +} + +uint64_t +dmu_objset_fsid_guid(objset_t *os) +{ + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); +} + +void +dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) +{ + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); +} + +void +dmu_objset_stats(objset_t *os, nvlist_t *nv) +{ + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); + + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, 
+ os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); +} + +int +dmu_objset_is_snapshot(objset_t *os) +{ + if (os->os_dsl_dataset != NULL) + return (os->os_dsl_dataset->ds_is_snapshot); + else + return (B_FALSE); +} + +int +dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, + boolean_t *conflict) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + uint64_t ignored; + + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) + return (SET_ERROR(ENOENT)); + + return (zap_lookup_norm(ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, name, 8, 1, &ignored, + MT_NORMALIZE, real, maxlen, conflict)); +} + +int +dmu_snapshot_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + zap_cursor_t cursor; + zap_attribute_t attr; + + ASSERT(dsl_pool_config_held(dmu_objset_pool(os))); + + if (dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0) + return (SET_ERROR(ENOENT)); + + zap_cursor_init_serialized(&cursor, + ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENOENT)); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENAMETOOLONG)); + } + + (void) strlcpy(name, attr.za_name, namelen); + if (idp) + *idp = attr.za_first_integer; + if (case_conflict) + *case_conflict = attr.za_normalization_conflict; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +int +dmu_snapshot_lookup(objset_t *os, const char *name, uint64_t *value) +{ + return (dsl_dataset_snap_lookup(os->os_dsl_dataset, name, value)); +} + +int +dmu_dir_list_next(objset_t *os, int namelen, char *name, + uint64_t *idp, uint64_t *offp) +{ + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; + zap_cursor_t cursor; + zap_attribute_t attr; + + /* there is no next dir on a snapshot! 
*/ + if (os->os_dsl_dataset->ds_object != + dsl_dir_phys(dd)->dd_head_dataset_obj) + return (SET_ERROR(ENOENT)); + + zap_cursor_init_serialized(&cursor, + dd->dd_pool->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj, *offp); + + if (zap_cursor_retrieve(&cursor, &attr) != 0) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENOENT)); + } + + if (strlen(attr.za_name) + 1 > namelen) { + zap_cursor_fini(&cursor); + return (SET_ERROR(ENAMETOOLONG)); + } + + (void) strlcpy(name, attr.za_name, namelen); + if (idp) + *idp = attr.za_first_integer; + zap_cursor_advance(&cursor); + *offp = zap_cursor_serialize(&cursor); + zap_cursor_fini(&cursor); + + return (0); +} + +typedef struct dmu_objset_find_ctx { + taskq_t *dc_tq; + dsl_pool_t *dc_dp; + uint64_t dc_ddobj; + char *dc_ddname; /* last component of ddobj's name */ + int (*dc_func)(dsl_pool_t *, dsl_dataset_t *, void *); + void *dc_arg; + int dc_flags; + kmutex_t *dc_error_lock; + int *dc_error; +} dmu_objset_find_ctx_t; + +static void +dmu_objset_find_dp_impl(dmu_objset_find_ctx_t *dcp) +{ + dsl_pool_t *dp = dcp->dc_dp; + dsl_dir_t *dd; + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + uint64_t thisobj; + int err = 0; + + /* don't process if there already was an error */ + if (*dcp->dc_error != 0) + goto out; + + /* + * Note: passing the name (dc_ddname) here is optional, but it + * improves performance because we don't need to call + * zap_value_search() to determine the name. + */ + err = dsl_dir_hold_obj(dp, dcp->dc_ddobj, dcp->dc_ddname, FTAG, &dd); + if (err != 0) + goto out; + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_rele(dd, FTAG); + goto out; + } + + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* + * Iterate over all children. + */ + if (dcp->dc_flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + dmu_objset_find_ctx_t *child_dcp = + kmem_alloc(sizeof (*child_dcp), KM_SLEEP); + *child_dcp = *dcp; + child_dcp->dc_ddobj = attr->za_first_integer; + child_dcp->dc_ddname = spa_strdup(attr->za_name); + if (dcp->dc_tq != NULL) + (void) taskq_dispatch(dcp->dc_tq, + dmu_objset_find_dp_cb, child_dcp, TQ_SLEEP); + else + dmu_objset_find_dp_impl(child_dcp); + } + zap_cursor_fini(&zc); + } + + /* + * Iterate over all snapshots. + */ + if (dcp->dc_flags & DS_FIND_SNAPSHOTS) { + dsl_dataset_t *ds; + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + if (err == 0) { + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + err = dsl_dataset_hold_obj(dp, + attr->za_first_integer, FTAG, &ds); + if (err != 0) + break; + err = dcp->dc_func(dp, ds, dcp->dc_arg); + dsl_dataset_rele(ds, FTAG); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + } + } + + kmem_free(attr, sizeof (zap_attribute_t)); + + if (err != 0) { + dsl_dir_rele(dd, FTAG); + goto out; + } + + /* + * Apply to self. 
+ */ + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + /* + * Note: we hold the dir while calling dsl_dataset_hold_obj() so + * that the dir will remain cached, and we won't have to re-instantiate + * it (which could be expensive due to finding its name via + * zap_value_search()). + */ + dsl_dir_rele(dd, FTAG); + if (err != 0) + goto out; + err = dcp->dc_func(dp, ds, dcp->dc_arg); + dsl_dataset_rele(ds, FTAG); + +out: + if (err != 0) { + mutex_enter(dcp->dc_error_lock); + /* only keep first error */ + if (*dcp->dc_error == 0) + *dcp->dc_error = err; + mutex_exit(dcp->dc_error_lock); + } + + if (dcp->dc_ddname != NULL) + spa_strfree(dcp->dc_ddname); + kmem_free(dcp, sizeof (*dcp)); +} + +static void +dmu_objset_find_dp_cb(void *arg) +{ + dmu_objset_find_ctx_t *dcp = arg; + dsl_pool_t *dp = dcp->dc_dp; + + /* + * We need to get a pool_config_lock here, as there are several + * assert(pool_config_held) down the stack. Getting a lock via + * dsl_pool_config_enter is risky, as it might be stalled by a + * pending writer. This would deadlock, as the write lock can + * only be granted when our parent thread gives up the lock. + * The _prio interface gives us priority over a pending writer. + */ + dsl_pool_config_enter_prio(dp, FTAG); + + dmu_objset_find_dp_impl(dcp); + + dsl_pool_config_exit(dp, FTAG); +} + +/* + * Find objsets under and including ddobj, call func(ds) on each. + * The order for the enumeration is completely undefined. + * func is called with dsl_pool_config held. + */ +int +dmu_objset_find_dp(dsl_pool_t *dp, uint64_t ddobj, + int func(dsl_pool_t *, dsl_dataset_t *, void *), void *arg, int flags) +{ + int error = 0; + taskq_t *tq = NULL; + int ntasks; + dmu_objset_find_ctx_t *dcp; + kmutex_t err_lock; + + mutex_init(&err_lock, NULL, MUTEX_DEFAULT, NULL); + dcp = kmem_alloc(sizeof (*dcp), KM_SLEEP); + dcp->dc_tq = NULL; + dcp->dc_dp = dp; + dcp->dc_ddobj = ddobj; + dcp->dc_ddname = NULL; + dcp->dc_func = func; + dcp->dc_arg = arg; + dcp->dc_flags = flags; + dcp->dc_error_lock = &err_lock; + dcp->dc_error = &error; + + if ((flags & DS_FIND_SERIALIZE) || dsl_pool_config_held_writer(dp)) { + /* + * In case a write lock is held we can't make use of + * parallelism, as down the stack of the worker threads + * the lock is asserted via dsl_pool_config_held. + * In case of a read lock this is solved by getting a read + * lock in each worker thread, which isn't possible in case + * of a writer lock. So we fall back to the synchronous path + * here. + * In the future it might be possible to get some magic into + * dsl_pool_config_held in a way that it returns true for + * the worker threads so that a single lock held from this + * thread suffices. For now, stay single threaded. + */ + dmu_objset_find_dp_impl(dcp); + mutex_destroy(&err_lock); + + return (error); + } + + ntasks = dmu_find_threads; + if (ntasks == 0) + ntasks = vdev_count_leaves(dp->dp_spa) * 4; + tq = taskq_create("dmu_objset_find", ntasks, maxclsyspri, ntasks, + INT_MAX, 0); + if (tq == NULL) { + kmem_free(dcp, sizeof (*dcp)); + mutex_destroy(&err_lock); + + return (SET_ERROR(ENOMEM)); + } + dcp->dc_tq = tq; + + /* dcp will be freed by task */ + (void) taskq_dispatch(tq, dmu_objset_find_dp_cb, dcp, TQ_SLEEP); + + /* + * PORTING: this code relies on the property of taskq_wait to wait + * until no more tasks are queued and no more tasks are active. 
As + * we always queue new tasks from within other tasks, task_wait + * reliably waits for the full recursion to finish, even though we + * enqueue new tasks after taskq_wait has been called. + * On platforms other than illumos, taskq_wait may not have this + * property. + */ + taskq_wait(tq); + taskq_destroy(tq); + mutex_destroy(&err_lock); + + return (error); +} + +/* + * Find all objsets under name, and for each, call 'func(child_name, arg)'. + * The dp_config_rwlock must not be held when this is called, and it + * will not be held when the callback is called. + * Therefore this function should only be used when the pool is not changing + * (e.g. in syncing context), or the callback can deal with the possible races. + */ +static int +dmu_objset_find_impl(spa_t *spa, const char *name, + int func(const char *, void *), void *arg, int flags) +{ + dsl_dir_t *dd; + dsl_pool_t *dp = spa_get_dsl(spa); + dsl_dataset_t *ds; + zap_cursor_t zc; + zap_attribute_t *attr; + char *child; + uint64_t thisobj; + int err; + + dsl_pool_config_enter(dp, FTAG); + + err = dsl_dir_hold(dp, name, FTAG, &dd, NULL); + if (err != 0) { + dsl_pool_config_exit(dp, FTAG); + return (err); + } + + /* Don't visit hidden ($MOS & $ORIGIN) objsets. */ + if (dd->dd_myname[0] == '$') { + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); + return (0); + } + + thisobj = dsl_dir_phys(dd)->dd_head_dataset_obj; + attr = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* + * Iterate over all children. + */ + if (flags & DS_FIND_CHILDREN) { + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + child = kmem_asprintf("%s/%s", name, attr->za_name); + dsl_pool_config_exit(dp, FTAG); + err = dmu_objset_find_impl(spa, child, + func, arg, flags); + dsl_pool_config_enter(dp, FTAG); + kmem_strfree(child); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + + if (err != 0) { + dsl_dir_rele(dd, FTAG); + dsl_pool_config_exit(dp, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + return (err); + } + } + + /* + * Iterate over all snapshots. + */ + if (flags & DS_FIND_SNAPSHOTS) { + err = dsl_dataset_hold_obj(dp, thisobj, FTAG, &ds); + + if (err == 0) { + uint64_t snapobj; + + snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + dsl_dataset_rele(ds, FTAG); + + for (zap_cursor_init(&zc, dp->dp_meta_objset, snapobj); + zap_cursor_retrieve(&zc, attr) == 0; + (void) zap_cursor_advance(&zc)) { + ASSERT3U(attr->za_integer_length, ==, + sizeof (uint64_t)); + ASSERT3U(attr->za_num_integers, ==, 1); + + child = kmem_asprintf("%s@%s", + name, attr->za_name); + dsl_pool_config_exit(dp, FTAG); + err = func(child, arg); + dsl_pool_config_enter(dp, FTAG); + kmem_strfree(child); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + } + } + + dsl_dir_rele(dd, FTAG); + kmem_free(attr, sizeof (zap_attribute_t)); + dsl_pool_config_exit(dp, FTAG); + + if (err != 0) + return (err); + + /* Apply to self. */ + return (func(name, arg)); +} + +/* + * See comment above dmu_objset_find_impl(). 
+ */ +int +dmu_objset_find(const char *name, int func(const char *, void *), void *arg, + int flags) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + error = dmu_objset_find_impl(spa, name, func, arg, flags); + spa_close(spa, FTAG); + return (error); +} + +boolean_t +dmu_objset_incompatible_encryption_version(objset_t *os) +{ + return (dsl_dir_incompatible_encryption_version( + os->os_dsl_dataset->ds_dir)); +} + +void +dmu_objset_set_user(objset_t *os, void *user_ptr) +{ + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; +} + +void * +dmu_objset_get_user(objset_t *os) +{ + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); +} + +/* + * Determine name of filesystem, given name of snapshot. + * buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes + */ +int +dmu_fsname(const char *snapname, char *buf) +{ + char *atp = strchr(snapname, '@'); + if (atp == NULL) + return (SET_ERROR(EINVAL)); + if (atp - snapname >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strlcpy(buf, snapname, atp - snapname + 1); + return (0); +} + +/* + * Call when we think we're going to write/free space in open context + * to track the amount of dirty data in the open txg, which is also the + * amount of memory that can not be evicted until this txg syncs. + * + * Note that there are two conditions where this can be called from + * syncing context: + * + * [1] When we just created the dataset, in which case we go on with + * updating any accounting of dirty data as usual. + * [2] When we are dirtying MOS data, in which case we only update the + * pool's accounting of dirty data. + */ +void +dmu_objset_willuse_space(objset_t *os, int64_t space, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int64_t aspace = spa_get_worst_case_asize(os->os_spa, space); + + if (ds != NULL) { + dsl_dir_willuse_space(ds->ds_dir, aspace, tx); + } + + dsl_pool_dirty_space(dmu_tx_pool(tx), space, tx); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(dmu_objset_zil); +EXPORT_SYMBOL(dmu_objset_pool); +EXPORT_SYMBOL(dmu_objset_ds); +EXPORT_SYMBOL(dmu_objset_type); +EXPORT_SYMBOL(dmu_objset_name); +EXPORT_SYMBOL(dmu_objset_hold); +EXPORT_SYMBOL(dmu_objset_hold_flags); +EXPORT_SYMBOL(dmu_objset_own); +EXPORT_SYMBOL(dmu_objset_rele); +EXPORT_SYMBOL(dmu_objset_rele_flags); +EXPORT_SYMBOL(dmu_objset_disown); +EXPORT_SYMBOL(dmu_objset_from_ds); +EXPORT_SYMBOL(dmu_objset_create); +EXPORT_SYMBOL(dmu_objset_clone); +EXPORT_SYMBOL(dmu_objset_stats); +EXPORT_SYMBOL(dmu_objset_fast_stat); +EXPORT_SYMBOL(dmu_objset_spa); +EXPORT_SYMBOL(dmu_objset_space); +EXPORT_SYMBOL(dmu_objset_fsid_guid); +EXPORT_SYMBOL(dmu_objset_find); +EXPORT_SYMBOL(dmu_objset_byteswap); +EXPORT_SYMBOL(dmu_objset_evict_dbufs); +EXPORT_SYMBOL(dmu_objset_snap_cmtime); +EXPORT_SYMBOL(dmu_objset_dnodesize); + +EXPORT_SYMBOL(dmu_objset_sync); +EXPORT_SYMBOL(dmu_objset_is_dirty); +EXPORT_SYMBOL(dmu_objset_create_impl_dnstats); +EXPORT_SYMBOL(dmu_objset_create_impl); +EXPORT_SYMBOL(dmu_objset_open_impl); +EXPORT_SYMBOL(dmu_objset_evict); +EXPORT_SYMBOL(dmu_objset_register_type); +EXPORT_SYMBOL(dmu_objset_do_userquota_updates); +EXPORT_SYMBOL(dmu_objset_userquota_get_ids); +EXPORT_SYMBOL(dmu_objset_userused_enabled); +EXPORT_SYMBOL(dmu_objset_userspace_upgrade); +EXPORT_SYMBOL(dmu_objset_userspace_present); +EXPORT_SYMBOL(dmu_objset_userobjused_enabled); +EXPORT_SYMBOL(dmu_objset_userobjspace_upgradable); +EXPORT_SYMBOL(dmu_objset_userobjspace_present); 
+EXPORT_SYMBOL(dmu_objset_projectquota_enabled); +EXPORT_SYMBOL(dmu_objset_projectquota_present); +EXPORT_SYMBOL(dmu_objset_projectquota_upgradable); +EXPORT_SYMBOL(dmu_objset_id_quota_upgrade); +#endif diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c new file mode 100644 index 000000000000..2eee19a28e34 --- /dev/null +++ b/module/zfs/dmu_recv.c @@ -0,0 +1,3386 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif +#include + +int zfs_recv_queue_length = SPA_MAXBLOCKSIZE; +int zfs_recv_queue_ff = 20; +int zfs_recv_write_batch_size = 1024 * 1024; + +static char *dmu_recv_tag = "dmu_recv_tag"; +const char *recv_clone_name = "%recv"; + +static int receive_read_payload_and_next_header(dmu_recv_cookie_t *ra, int len, + void *buf); + +struct receive_record_arg { + dmu_replay_record_t header; + void *payload; /* Pointer to a buffer containing the payload */ + /* + * If the record is a write, pointer to the arc_buf_t containing the + * payload. + */ + arc_buf_t *arc_buf; + int payload_size; + uint64_t bytes_read; /* bytes read from stream when record created */ + boolean_t eos_marker; /* Marks the end of the stream */ + bqueue_node_t node; +}; + +struct receive_writer_arg { + objset_t *os; + boolean_t byteswap; + bqueue_t q; + + /* + * These three args are used to signal to the main thread that we're + * done. 
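+ * The main thread produces receive_record_arg entries onto the
+ * bqueue "q" below; the writer thread consumes and applies them,
+ * then sets "done" and signals "cv" under "mutex" so the main
+ * thread can pick up "err".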
+ */ + kmutex_t mutex; + kcondvar_t cv; + boolean_t done; + + int err; + boolean_t resumable; + boolean_t raw; /* DMU_BACKUP_FEATURE_RAW set */ + boolean_t spill; /* DRR_FLAG_SPILL_BLOCK set */ + boolean_t full; /* this is a full send stream */ + uint64_t last_object; + uint64_t last_offset; + uint64_t max_object; /* highest object ID referenced in stream */ + uint64_t bytes_read; /* bytes read when current record created */ + + list_t write_batch; + + /* Encryption parameters for the last received DRR_OBJECT_RANGE */ + boolean_t or_crypt_params_present; + uint64_t or_firstobj; + uint64_t or_numslots; + uint8_t or_salt[ZIO_DATA_SALT_LEN]; + uint8_t or_iv[ZIO_DATA_IV_LEN]; + uint8_t or_mac[ZIO_DATA_MAC_LEN]; + boolean_t or_byteorder; +}; + +typedef struct dmu_recv_begin_arg { + const char *drba_origin; + dmu_recv_cookie_t *drba_cookie; + cred_t *drba_cred; + proc_t *drba_proc; + dsl_crypto_params_t *drba_dcp; +} dmu_recv_begin_arg_t; + +static void +byteswap_record(dmu_replay_record_t *drr) +{ +#define DO64(X) (drr->drr_u.X = BSWAP_64(drr->drr_u.X)) +#define DO32(X) (drr->drr_u.X = BSWAP_32(drr->drr_u.X)) + drr->drr_type = BSWAP_32(drr->drr_type); + drr->drr_payloadlen = BSWAP_32(drr->drr_payloadlen); + + switch (drr->drr_type) { + case DRR_BEGIN: + DO64(drr_begin.drr_magic); + DO64(drr_begin.drr_versioninfo); + DO64(drr_begin.drr_creation_time); + DO32(drr_begin.drr_type); + DO32(drr_begin.drr_flags); + DO64(drr_begin.drr_toguid); + DO64(drr_begin.drr_fromguid); + break; + case DRR_OBJECT: + DO64(drr_object.drr_object); + DO32(drr_object.drr_type); + DO32(drr_object.drr_bonustype); + DO32(drr_object.drr_blksz); + DO32(drr_object.drr_bonuslen); + DO32(drr_object.drr_raw_bonuslen); + DO64(drr_object.drr_toguid); + DO64(drr_object.drr_maxblkid); + break; + case DRR_FREEOBJECTS: + DO64(drr_freeobjects.drr_firstobj); + DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); + break; + case DRR_WRITE: + DO64(drr_write.drr_object); + DO32(drr_write.drr_type); + DO64(drr_write.drr_offset); + DO64(drr_write.drr_logical_size); + DO64(drr_write.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write.drr_key.ddk_cksum); + DO64(drr_write.drr_key.ddk_prop); + DO64(drr_write.drr_compressed_size); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_write_byref. 
+ drr_key.ddk_cksum); + DO64(drr_write_byref.drr_key.ddk_prop); + break; + case DRR_WRITE_EMBEDDED: + DO64(drr_write_embedded.drr_object); + DO64(drr_write_embedded.drr_offset); + DO64(drr_write_embedded.drr_length); + DO64(drr_write_embedded.drr_toguid); + DO32(drr_write_embedded.drr_lsize); + DO32(drr_write_embedded.drr_psize); + break; + case DRR_FREE: + DO64(drr_free.drr_object); + DO64(drr_free.drr_offset); + DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); + break; + case DRR_SPILL: + DO64(drr_spill.drr_object); + DO64(drr_spill.drr_length); + DO64(drr_spill.drr_toguid); + DO64(drr_spill.drr_compressed_size); + DO32(drr_spill.drr_type); + break; + case DRR_OBJECT_RANGE: + DO64(drr_object_range.drr_firstobj); + DO64(drr_object_range.drr_numslots); + DO64(drr_object_range.drr_toguid); + break; + case DRR_REDACT: + DO64(drr_redact.drr_object); + DO64(drr_redact.drr_offset); + DO64(drr_redact.drr_length); + DO64(drr_redact.drr_toguid); + break; + case DRR_END: + DO64(drr_end.drr_toguid); + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_end.drr_checksum); + break; + default: + break; + } + + if (drr->drr_type != DRR_BEGIN) { + ZIO_CHECKSUM_BSWAP(&drr->drr_u.drr_checksum.drr_checksum); + } + +#undef DO64 +#undef DO32 +} + +static boolean_t +redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Check that the new stream we're trying to receive is redacted with respect to + * a subset of the snapshots that the origin was redacted with respect to. For + * the reasons behind this, see the man page on redacted zfs sends and receives. + */ +static boolean_t +compatible_redact_snaps(uint64_t *origin_snaps, uint64_t origin_num_snaps, + uint64_t *redact_snaps, uint64_t num_redact_snaps) +{ + /* + * Short circuit the comparison; if we are redacted with respect to + * more snapshots than the origin, we can't be redacted with respect + * to a subset. + */ + if (num_redact_snaps > origin_num_snaps) { + return (B_FALSE); + } + + for (int i = 0; i < num_redact_snaps; i++) { + if (!redact_snaps_contains(origin_snaps, origin_num_snaps, + redact_snaps[i])) { + return (B_FALSE); + } + } + return (B_TRUE); +} + +static boolean_t +redact_check(dmu_recv_begin_arg_t *drba, dsl_dataset_t *origin) +{ + uint64_t *origin_snaps; + uint64_t origin_num_snaps; + dmu_recv_cookie_t *drc = drba->drba_cookie; + struct drr_begin *drrb = drc->drc_drrb; + int featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + int err = 0; + boolean_t ret = B_TRUE; + uint64_t *redact_snaps; + uint_t numredactsnaps; + + /* + * If this is a full send stream, we're safe no matter what. + */ + if (drrb->drr_fromguid == 0) + return (ret); + + VERIFY(dsl_dataset_get_uint64_array_feature(origin, + SPA_FEATURE_REDACTED_DATASETS, &origin_num_snaps, &origin_snaps)); + + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, &numredactsnaps) == + 0) { + /* + * If the send stream was sent from the redaction bookmark or + * the redacted version of the dataset, then we're safe. Verify + * that this is from a compatible redaction bookmark or + * redacted dataset.
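+ * For example, an origin redacted with respect to snapshots
+ * {A, B, C} can accept a stream redacted with respect to the
+ * subset {A, C}, but not one redacted with respect to {A, D}.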
+ */ + if (!compatible_redact_snaps(origin_snaps, origin_num_snaps, + redact_snaps, numredactsnaps)) { + err = EINVAL; + } + } else if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { + /* + * If the stream is redacted, it must be redacted with respect + * to a subset of what the origin is redacted with respect to. + * See case number 2 in the zfs man page section on redacted zfs + * send. + */ + err = nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps); + + if (err != 0 || !compatible_redact_snaps(origin_snaps, + origin_num_snaps, redact_snaps, numredactsnaps)) { + err = EINVAL; + } + } else if (!redact_snaps_contains(origin_snaps, origin_num_snaps, + drrb->drr_toguid)) { + /* + * If the stream isn't redacted but the origin is, this must be + * one of the snapshots the origin is redacted with respect to. + * See case number 1 in the zfs man page section on redacted zfs + * send. + */ + err = EINVAL; + } + + if (err != 0) + ret = B_FALSE; + return (ret); +} + +/* + * If we previously received a stream with --large-block, we don't support + * receiving an incremental on top of it without --large-block. This avoids + * forcing a read-modify-write or trying to re-aggregate a string of WRITE + * records. + */ +static int +recv_check_large_blocks(dsl_dataset_t *ds, uint64_t featureflags) +{ + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_LARGE_BLOCKS) && + !(featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ZFS_ERR_STREAM_LARGE_BLOCK_MISMATCH)); + return (0); +} + +static int +recv_begin_check_existing_impl(dmu_recv_begin_arg_t *drba, dsl_dataset_t *ds, + uint64_t fromguid, uint64_t featureflags) +{ + uint64_t val; + uint64_t children; + int error; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + boolean_t encrypted = ds->ds_dir->dd_crypto_obj != 0; + boolean_t raw = (featureflags & DMU_BACKUP_FEATURE_RAW) != 0; + boolean_t embed = (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) != 0; + + /* Temporary clone name must not exist. */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, recv_clone_name, + 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? SET_ERROR(EBUSY) : error); + + /* Resume state must not be set. */ + if (dsl_dataset_has_resume_receive_state(ds)) + return (SET_ERROR(EBUSY)); + + /* New snapshot name must not exist. */ + error = zap_lookup(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, + drba->drba_cookie->drc_tosnap, 8, 1, &val); + if (error != ENOENT) + return (error == 0 ? SET_ERROR(EEXIST) : error); + + /* Must not have children if receiving a ZVOL. */ + error = zap_count(dp->dp_meta_objset, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &children); + if (error != 0) + return (error); + if (drba->drba_cookie->drc_drrb->drr_type != DMU_OST_ZFS && + children > 0) + return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); + + /* + * Check snapshot limit before receiving. We'll recheck again at the + * end, but might as well abort before receiving if we're already over + * the limit. + * + * Note that we do not check the file system limit with + * dsl_dir_fscount_check because the temporary %clones don't count + * against that limit. 
+ */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, ZFS_PROP_SNAPSHOT_LIMIT, + NULL, drba->drba_cred, drba->drba_proc); + if (error != 0) + return (error); + + if (fromguid != 0) { + dsl_dataset_t *snap; + uint64_t obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + + /* Can't perform a raw receive on top of a non-raw receive */ + if (!encrypted && raw) + return (SET_ERROR(EINVAL)); + + /* Encryption is incompatible with embedded data */ + if (encrypted && embed) + return (SET_ERROR(EINVAL)); + + /* Find snapshot in this dir that matches fromguid. */ + while (obj != 0) { + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + return (SET_ERROR(ENODEV)); + if (snap->ds_dir != ds->ds_dir) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ENODEV)); + } + if (dsl_dataset_phys(snap)->ds_guid == fromguid) + break; + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (SET_ERROR(ENODEV)); + + if (drba->drba_cookie->drc_force) { + drba->drba_cookie->drc_fromsnapobj = obj; + } else { + /* + * If we are not forcing, there must be no + * changes since fromsnap. Raw sends have an + * additional constraint that requires that + * no "noop" snapshots exist between fromsnap + * and tosnap for the IVset checking code to + * work properly. + */ + if (dsl_dataset_modified_since_snap(ds, snap) || + (raw && + dsl_dataset_phys(ds)->ds_prev_snap_obj != + snap->ds_object)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(ETXTBSY)); + } + drba->drba_cookie->drc_fromsnapobj = + ds->ds_prev->ds_object; + } + + if (dsl_dataset_feature_is_active(snap, + SPA_FEATURE_REDACTED_DATASETS) && !redact_check(drba, + snap)) { + dsl_dataset_rele(snap, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = recv_check_large_blocks(snap, featureflags); + if (error != 0) { + dsl_dataset_rele(snap, FTAG); + return (error); + } + + dsl_dataset_rele(snap, FTAG); + } else { + /* if full, then must be forced */ + if (!drba->drba_cookie->drc_force) + return (SET_ERROR(EEXIST)); + + /* + * We don't support using zfs recv -F to blow away + * encrypted filesystems. This would require the + * dsl dir to point to the old encryption key and + * the new one at the same time during the receive. + */ + if ((!encrypted && raw) || encrypted) + return (SET_ERROR(EINVAL)); + + /* + * Perform the same encryption checks we would if + * we were creating a new dataset from scratch. + */ + if (!raw) { + boolean_t will_encrypt; + + error = dmu_objset_create_crypt_check( + ds->ds_dir->dd_parent, drba->drba_dcp, + &will_encrypt); + if (error != 0) + return (error); + + if (will_encrypt && embed) + return (SET_ERROR(EINVAL)); + } + } + + return (0); +} + +/* + * Check that any feature flags used in the data stream we're receiving are + * supported by the pool we are receiving into. + * + * Note that some of the features we explicitly check here have additional + * (implicit) features they depend on, but those dependencies are enforced + * through the zfeature_register() calls declaring the features that we + * explicitly check. + */ +static int +recv_begin_check_feature_flags_impl(uint64_t featureflags, spa_t *spa) +{ + /* + * Check if there are any unsupported feature flags. 
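+ * For example, a stream created with "zfs send -L" from a dataset
+ * using blocks larger than 128K carries
+ * DMU_BACKUP_FEATURE_LARGE_BLOCKS and is rejected below with
+ * ENOTSUP unless the large_blocks feature is enabled on the
+ * receiving pool.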
+ */ + if (!DMU_STREAM_SUPPORTED(featureflags)) { + return (SET_ERROR(ZFS_ERR_UNKNOWN_SEND_STREAM_FEATURE)); + } + + /* Verify pool version supports SA if SA_SPILL feature set */ + if ((featureflags & DMU_BACKUP_FEATURE_SA_SPILL) && + spa_version(spa) < SPA_VERSION_SA) + return (SET_ERROR(ENOTSUP)); + + /* + * LZ4 compressed, ZSTD compressed, embedded, mooched, large blocks, + * and large_dnodes in the stream can only be used if those pool + * features are enabled because we don't attempt to decompress / + * un-embed / un-mooch / split up the blocks / dnodes during the + * receive process. + */ + if ((featureflags & DMU_BACKUP_FEATURE_LZ4) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LZ4_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_ZSTD) && + !spa_feature_is_enabled(spa, SPA_FEATURE_ZSTD_COMPRESS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) && + !spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + if ((featureflags & DMU_BACKUP_FEATURE_LARGE_DNODE) && + !spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (SET_ERROR(ENOTSUP)); + + /* + * Receiving redacted streams requires that redacted datasets are + * enabled. + */ + if ((featureflags & DMU_BACKUP_FEATURE_REDACTED) && + !spa_feature_is_enabled(spa, SPA_FEATURE_REDACTED_DATASETS)) + return (SET_ERROR(ENOTSUP)); + + return (0); +} + +static int +dmu_recv_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drba->drba_cookie->drc_drrb; + uint64_t fromguid = drrb->drr_fromguid; + int flags = drrb->drr_flags; + ds_hold_flags_t dsflags = 0; + int error; + uint64_t featureflags = drba->drba_cookie->drc_featureflags; + dsl_dataset_t *ds; + const char *tofs = drba->drba_cookie->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(!(featureflags & DMU_BACKUP_FEATURE_RESUMING)); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES || + ((flags & DRR_FLAG_CLONE) && drba->drba_origin == NULL)) + return (SET_ERROR(EINVAL)); + + error = recv_begin_check_feature_flags_impl(featureflags, dp->dp_spa); + if (error != 0) + return (error); + + /* Resumable receives require extensible datasets */ + if (drba->drba_cookie->drc_resumable && + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_EXTENSIBLE_DATASET)) + return (SET_ERROR(ENOTSUP)); + + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + /* raw receives require the encryption feature */ + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) + return (SET_ERROR(ENOTSUP)); + + /* embedded data is incompatible with encryption and raw recv */ + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (SET_ERROR(EINVAL)); + + /* raw receives require spill block allocation flag */ + if (!(flags & DRR_FLAG_SPILL_BLOCK)) + return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); + if (error == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE || drba->drba_origin) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + 
+ error = recv_begin_check_existing_impl(drba, ds, fromguid, + featureflags); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + } else if (error == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char buf[ZFS_MAX_DATASET_NAME_LEN]; + objset_t *os; + + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. + */ + if (fromguid != 0 && !((flags & DRR_FLAG_CLONE) || + drba->drba_origin)) + return (SET_ERROR(ENOENT)); + + /* + * If we're receiving a full send as a clone, and it doesn't + * contain all the necessary free records and freeobject + * records, reject it. + */ + if (fromguid == 0 && drba->drba_origin != NULL && + !(flags & DRR_FLAG_FREERECORDS)) + return (SET_ERROR(EINVAL)); + + /* Open the parent of tofs */ + ASSERT3U(strlen(tofs), <, sizeof (buf)); + (void) strlcpy(buf, tofs, strrchr(tofs, '/') - tofs + 1); + error = dsl_dataset_hold(dp, buf, FTAG, &ds); + if (error != 0) + return (error); + + if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0 && + drba->drba_origin == NULL) { + boolean_t will_encrypt; + + /* + * Check that we aren't breaking any encryption rules + * and that we have all the parameters we need to + * create an encrypted dataset if necessary. If we are + * making an encrypted dataset the stream can't have + * embedded data. + */ + error = dmu_objset_create_crypt_check(ds->ds_dir, + drba->drba_dcp, &will_encrypt); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (will_encrypt && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + } + + /* + * Check filesystem and snapshot limits before receiving. We'll + * recheck snapshot limits again at the end (we create the + * filesystems and increment those counts during begin_sync). + */ + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_FILESYSTEM_LIMIT, NULL, + drba->drba_cred, drba->drba_proc); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + error = dsl_fs_ss_limit_check(ds->ds_dir, 1, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + drba->drba_cred, drba->drba_proc); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + /* can't recv below anything but filesystems (eg. no ZVOLs) */ + error = dmu_objset_from_ds(ds, &os); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); + } + + if (drba->drba_origin != NULL) { + dsl_dataset_t *origin; + error = dsl_dataset_hold_flags(dp, drba->drba_origin, + dsflags, FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + if (!origin->ds_is_snapshot) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + if (dsl_dataset_phys(origin)->ds_guid != fromguid && + fromguid != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENODEV)); + } + + if (origin->ds_dir->dd_crypto_obj != 0 && + (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * If the origin is redacted we need to verify that this + * send stream can safely be received on top of the + * origin. 
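+ * (See redact_check() above for the combinations that are
+ * considered safe.)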
+ */ + if (dsl_dataset_feature_is_active(origin, + SPA_FEATURE_REDACTED_DATASETS)) { + if (!redact_check(drba, origin)) { + dsl_dataset_rele_flags(origin, dsflags, + FTAG); + dsl_dataset_rele_flags(ds, dsflags, + FTAG); + return (SET_ERROR(EINVAL)); + } + } + + error = recv_check_large_blocks(ds, featureflags); + if (error != 0) { + dsl_dataset_rele_flags(origin, dsflags, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + dsl_dataset_rele_flags(origin, dsflags, FTAG); + } + + dsl_dataset_rele(ds, FTAG); + error = 0; + } + return (error); +} + +static void +dmu_recv_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dmu_recv_cookie_t *drc = drba->drba_cookie; + struct drr_begin *drrb = drc->drc_drrb; + const char *tofs = drc->drc_tofs; + uint64_t featureflags = drc->drc_featureflags; + dsl_dataset_t *ds, *newds; + objset_t *os; + uint64_t dsobj; + ds_hold_flags_t dsflags = 0; + int error; + uint64_t crflags = 0; + dsl_crypto_params_t dummy_dcp = { 0 }; + dsl_crypto_params_t *dcp = drba->drba_dcp; + + if (drrb->drr_flags & DRR_FLAG_CI_DATA) + crflags |= DS_FLAG_CI_DATASET; + + if ((featureflags & DMU_BACKUP_FEATURE_RAW) == 0) + dsflags |= DS_HOLD_FLAG_DECRYPT; + + /* + * Raw, non-incremental recvs always use a dummy dcp with + * the raw cmd set. Raw incremental recvs do not use a dcp + * since the encryption parameters are already set in stone. + */ + if (dcp == NULL && drrb->drr_fromguid == 0 && + drba->drba_origin == NULL) { + ASSERT3P(dcp, ==, NULL); + dcp = &dummy_dcp; + + if (featureflags & DMU_BACKUP_FEATURE_RAW) + dcp->cp_cmd = DCP_CMD_RAW_RECV; + } + + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); + if (error == 0) { + /* create temporary clone */ + dsl_dataset_t *snap = NULL; + + if (drba->drba_cookie->drc_fromsnapobj != 0) { + VERIFY0(dsl_dataset_hold_obj(dp, + drba->drba_cookie->drc_fromsnapobj, FTAG, &snap)); + ASSERT3P(dcp, ==, NULL); + } + dsobj = dsl_dataset_create_sync(ds->ds_dir, recv_clone_name, + snap, crflags, drba->drba_cred, dcp, tx); + if (drba->drba_cookie->drc_fromsnapobj != 0) + dsl_dataset_rele(snap, FTAG); + dsl_dataset_rele_flags(ds, dsflags, FTAG); + } else { + dsl_dir_t *dd; + const char *tail; + dsl_dataset_t *origin = NULL; + + VERIFY0(dsl_dir_hold(dp, tofs, FTAG, &dd, &tail)); + + if (drba->drba_origin != NULL) { + VERIFY0(dsl_dataset_hold(dp, drba->drba_origin, + FTAG, &origin)); + ASSERT3P(dcp, ==, NULL); + } + + /* Create new dataset. */ + dsobj = dsl_dataset_create_sync(dd, strrchr(tofs, '/') + 1, + origin, crflags, drba->drba_cred, dcp, tx); + if (origin != NULL) + dsl_dataset_rele(origin, FTAG); + dsl_dir_rele(dd, FTAG); + drc->drc_newfs = B_TRUE; + } + VERIFY0(dsl_dataset_own_obj_force(dp, dsobj, dsflags, dmu_recv_tag, + &newds)); + if (dsl_dataset_feature_is_active(newds, + SPA_FEATURE_REDACTED_DATASETS)) { + /* + * If the origin dataset is redacted, the child will be redacted + * when we create it. We clear the new dataset's + * redaction info; if it should be redacted, we'll fill + * in its information later. 
+ */ + dsl_dataset_deactivate_feature(newds, + SPA_FEATURE_REDACTED_DATASETS, tx); + } + VERIFY0(dmu_objset_from_ds(newds, &os)); + + if (drc->drc_resumable) { + dsl_dataset_zapify(newds, tx); + if (drrb->drr_fromguid != 0) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_FROMGUID, + 8, 1, &drrb->drr_fromguid, tx)); + } + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TOGUID, + 8, 1, &drrb->drr_toguid, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_TONAME, + 1, strlen(drrb->drr_toname) + 1, drrb->drr_toname, tx)); + uint64_t one = 1; + uint64_t zero = 0; + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OBJECT, + 8, 1, &one, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_OFFSET, + 8, 1, &zero, tx)); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_BYTES, + 8, 1, &zero, tx)); + if (featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_LARGEBLOCK, + 8, 1, &one, tx)); + } + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_EMBEDOK, + 8, 1, &one, tx)); + } + if (featureflags & DMU_BACKUP_FEATURE_COMPRESSED) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_COMPRESSOK, + 8, 1, &one, tx)); + } + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + VERIFY0(zap_add(mos, dsobj, DS_FIELD_RESUME_RAWOK, + 8, 1, &one, tx)); + } + + uint64_t *redact_snaps; + uint_t numredactsnaps; + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_FROM_SNAPS, &redact_snaps, + &numredactsnaps) == 0) { + VERIFY0(zap_add(mos, dsobj, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, + sizeof (*redact_snaps), numredactsnaps, + redact_snaps, tx)); + } + } + + /* + * Usually the os->os_encrypted value is tied to the presence of a + * DSL Crypto Key object in the dd. However, that will not be received + * until dmu_recv_stream(), so we set the value manually for now. + */ + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + os->os_encrypted = B_TRUE; + drba->drba_cookie->drc_raw = B_TRUE; + } + + if (featureflags & DMU_BACKUP_FEATURE_REDACTED) { + uint64_t *redact_snaps; + uint_t numredactsnaps; + VERIFY0(nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &redact_snaps, &numredactsnaps)); + dsl_dataset_activate_redaction(newds, redact_snaps, + numredactsnaps, tx); + } + + dmu_buf_will_dirty(newds->ds_dbuf, tx); + dsl_dataset_phys(newds)->ds_flags |= DS_FLAG_INCONSISTENT; + + /* + * If we actually created a non-clone, we need to create the objset + * in our new dataset. If this is a raw send we postpone this until + * dmu_recv_stream() so that we can allocate the metadnode with the + * properties from the DRR_BEGIN payload. 
+ */ + rrw_enter(&newds->ds_bp_rwlock, RW_READER, FTAG); + if (BP_IS_HOLE(dsl_dataset_get_blkptr(newds)) && + (featureflags & DMU_BACKUP_FEATURE_RAW) == 0) { + (void) dmu_objset_create_impl(dp->dp_spa, + newds, dsl_dataset_get_blkptr(newds), drrb->drr_type, tx); + } + rrw_exit(&newds->ds_bp_rwlock, FTAG); + + drba->drba_cookie->drc_ds = newds; + drba->drba_cookie->drc_os = os; + + spa_history_log_internal_ds(newds, "receive", tx, " "); +} + +static int +dmu_recv_resume_begin_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dmu_recv_cookie_t *drc = drba->drba_cookie; + dsl_pool_t *dp = dmu_tx_pool(tx); + struct drr_begin *drrb = drc->drc_drrb; + int error; + ds_hold_flags_t dsflags = 0; + dsl_dataset_t *ds; + const char *tofs = drc->drc_tofs; + + /* already checked */ + ASSERT3U(drrb->drr_magic, ==, DMU_BACKUP_MAGIC); + ASSERT(drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING); + + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM || + drrb->drr_type >= DMU_OST_NUMTYPES) + return (SET_ERROR(EINVAL)); + + /* + * This is mostly a sanity check since we should have already done these + * checks during a previous attempt to receive the data. + */ + error = recv_begin_check_feature_flags_impl(drc->drc_featureflags, + dp->dp_spa); + if (error != 0) + return (error); + + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", + tofs, recv_clone_name); + + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { + /* raw receives require spill block allocation flag */ + if (!(drrb->drr_flags & DRR_FLAG_SPILL_BLOCK)) + return (SET_ERROR(ZFS_ERR_SPILL_BLOCK_FLAG_MISSING)); + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + if (dsl_dataset_hold_flags(dp, recvname, dsflags, FTAG, &ds) != 0) { + /* %recv does not exist; continue in tofs */ + error = dsl_dataset_hold_flags(dp, tofs, dsflags, FTAG, &ds); + if (error != 0) + return (error); + } + + /* check that ds is marked inconsistent */ + if (!DS_IS_INCONSISTENT(ds)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* check that there is resuming data, and that the toguid matches */ + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + uint64_t val; + error = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val); + if (error != 0 || drrb->drr_toguid != val) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Check if the receive is still running. If so, it will be owned. + * Note that nothing else can own the dataset (e.g. after the receive + * fails) because it will be marked inconsistent. + */ + if (dsl_dataset_has_owner(ds)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EBUSY)); + } + + /* There should not be any snapshots of this fs yet. */ + if (ds->ds_prev != NULL && ds->ds_prev->ds_dir == ds->ds_dir) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + /* + * Note: resume point will be checked when we process the first WRITE + * record. 
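+ * (The resume point is the (object, offset) pair recorded by
+ * save_resume_state() and stored in the DS_FIELD_RESUME_OBJECT and
+ * DS_FIELD_RESUME_OFFSET entries set up in dmu_recv_begin_sync().)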
+ */ + + /* check that the origin matches */ + val = 0; + (void) zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val); + if (drrb->drr_fromguid != val) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + if (ds->ds_prev != NULL && drrb->drr_fromguid != 0) + drc->drc_fromsnapobj = ds->ds_prev->ds_object; + + /* + * If we're resuming, and the send is redacted, then the original send + * must have been redacted, and must have been redacted with respect to + * the same snapshots. + */ + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_REDACTED) { + uint64_t num_ds_redact_snaps; + uint64_t *ds_redact_snaps; + + uint_t num_stream_redact_snaps; + uint64_t *stream_redact_snaps; + + if (nvlist_lookup_uint64_array(drc->drc_begin_nvl, + BEGINNV_REDACT_SNAPS, &stream_redact_snaps, + &num_stream_redact_snaps) != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + if (!dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_ds_redact_snaps, + &ds_redact_snaps)) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + + for (int i = 0; i < num_ds_redact_snaps; i++) { + if (!redact_snaps_contains(ds_redact_snaps, + num_ds_redact_snaps, stream_redact_snaps[i])) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (SET_ERROR(EINVAL)); + } + } + } + + error = recv_check_large_blocks(ds, drc->drc_featureflags); + if (error != 0) { + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (error); + } + + dsl_dataset_rele_flags(ds, dsflags, FTAG); + return (0); +} + +static void +dmu_recv_resume_begin_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_begin_arg_t *drba = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + const char *tofs = drba->drba_cookie->drc_tofs; + uint64_t featureflags = drba->drba_cookie->drc_featureflags; + dsl_dataset_t *ds; + ds_hold_flags_t dsflags = 0; + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + (void) snprintf(recvname, sizeof (recvname), "%s/%s", tofs, + recv_clone_name); + + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + drba->drba_cookie->drc_raw = B_TRUE; + } else { + dsflags |= DS_HOLD_FLAG_DECRYPT; + } + + if (dsl_dataset_own_force(dp, recvname, dsflags, dmu_recv_tag, &ds) + != 0) { + /* %recv does not exist; continue in tofs */ + VERIFY0(dsl_dataset_own_force(dp, tofs, dsflags, dmu_recv_tag, + &ds)); + drba->drba_cookie->drc_newfs = B_TRUE; + } + + ASSERT(DS_IS_INCONSISTENT(ds)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT(!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) || + drba->drba_cookie->drc_raw); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + drba->drba_cookie->drc_ds = ds; + VERIFY0(dmu_objset_from_ds(ds, &drba->drba_cookie->drc_os)); + drba->drba_cookie->drc_should_save = B_TRUE; + + spa_history_log_internal_ds(ds, "resume receive", tx, " "); +} + +/* + * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() + * succeeds; otherwise we will leak the holds on the datasets. 
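+ *
+ * Sketch of the expected calling pattern (arguments abbreviated,
+ * error handling elided; dmu_recv_stream() is followed by
+ * dmu_recv_end() on the success path):
+ *
+ *     dmu_recv_cookie_t drc;
+ *     err = dmu_recv_begin(tofs, tosnap, drr_begin, force, resumable,
+ *         localprops, hidden_args, origin, &drc, fp, voffp);
+ *     if (err == 0) {
+ *             err = dmu_recv_stream(&drc, ...);
+ *             ...
+ *     }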
+ */ +int +dmu_recv_begin(char *tofs, char *tosnap, dmu_replay_record_t *drr_begin, + boolean_t force, boolean_t resumable, nvlist_t *localprops, + nvlist_t *hidden_args, char *origin, dmu_recv_cookie_t *drc, + zfs_file_t *fp, offset_t *voffp) +{ + dmu_recv_begin_arg_t drba = { 0 }; + int err; + + bzero(drc, sizeof (dmu_recv_cookie_t)); + drc->drc_drr_begin = drr_begin; + drc->drc_drrb = &drr_begin->drr_u.drr_begin; + drc->drc_tosnap = tosnap; + drc->drc_tofs = tofs; + drc->drc_force = force; + drc->drc_resumable = resumable; + drc->drc_cred = CRED(); + drc->drc_proc = curproc; + drc->drc_clone = (origin != NULL); + + if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) { + drc->drc_byteswap = B_TRUE; + (void) fletcher_4_incremental_byteswap(drr_begin, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + byteswap_record(drr_begin); + } else if (drc->drc_drrb->drr_magic == DMU_BACKUP_MAGIC) { + (void) fletcher_4_incremental_native(drr_begin, + sizeof (dmu_replay_record_t), &drc->drc_cksum); + } else { + return (SET_ERROR(EINVAL)); + } + + drc->drc_fp = fp; + drc->drc_voff = *voffp; + drc->drc_featureflags = + DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + + uint32_t payloadlen = drc->drc_drr_begin->drr_payloadlen; + void *payload = NULL; + if (payloadlen != 0) + payload = kmem_alloc(payloadlen, KM_SLEEP); + + err = receive_read_payload_and_next_header(drc, payloadlen, + payload); + if (err != 0) { + kmem_free(payload, payloadlen); + return (err); + } + if (payloadlen != 0) { + err = nvlist_unpack(payload, payloadlen, &drc->drc_begin_nvl, + KM_SLEEP); + kmem_free(payload, payloadlen); + if (err != 0) { + kmem_free(drc->drc_next_rrd, + sizeof (*drc->drc_next_rrd)); + return (err); + } + } + + if (drc->drc_drrb->drr_flags & DRR_FLAG_SPILL_BLOCK) + drc->drc_spill = B_TRUE; + + drba.drba_origin = origin; + drba.drba_cookie = drc; + drba.drba_cred = CRED(); + drba.drba_proc = curproc; + + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = dsl_sync_task(tofs, + dmu_recv_resume_begin_check, dmu_recv_resume_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL); + } else { + + /* + * For non-raw, non-incremental, non-resuming receives the + * user can specify encryption parameters on the command line + * with "zfs recv -o". For these receives we create a dcp and + * pass it to the sync task. Creating the dcp will implicitly + * remove the encryption params from the localprops nvlist, + * which avoids errors when trying to set these normally + * read-only properties. Any other kind of receive that + * attempts to set these properties will fail as a result. + */ + if ((DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo) & + DMU_BACKUP_FEATURE_RAW) == 0 && + origin == NULL && drc->drc_drrb->drr_fromguid == 0) { + err = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, + localprops, hidden_args, &drba.drba_dcp); + } + + if (err == 0) { + err = dsl_sync_task(tofs, + dmu_recv_begin_check, dmu_recv_begin_sync, + &drba, 5, ZFS_SPACE_CHECK_NORMAL); + dsl_crypto_params_free(drba.drba_dcp, !!err); + } + } + + if (err != 0) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + nvlist_free(drc->drc_begin_nvl); + } + return (err); +} + +static int +receive_read(dmu_recv_cookie_t *drc, int len, void *buf) +{ + int done = 0; + + /* + * The code doesn't rely on this (lengths being multiples of 8). See + * comment in dump_bytes. 
+ */ + ASSERT(len % 8 == 0 || + (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) != 0); + + while (done < len) { + ssize_t resid; + zfs_file_t *fp = drc->drc_fp; + int err = zfs_file_read(fp, (char *)buf + done, + len - done, &resid); + if (resid == len - done) { + /* + * Note: ECKSUM or ZFS_ERR_STREAM_TRUNCATED indicates + * that the receive was interrupted and can + * potentially be resumed. + */ + err = SET_ERROR(ZFS_ERR_STREAM_TRUNCATED); + } + drc->drc_voff += len - done - resid; + done = len - resid; + if (err != 0) + return (err); + } + + drc->drc_bytes_read += len; + + ASSERT3U(done, ==, len); + return (0); +} + +static inline uint8_t +deduce_nblkptr(dmu_object_type_t bonus_type, uint64_t bonus_size) +{ + if (bonus_type == DMU_OT_SA) { + return (1); + } else { + return (1 + + ((DN_OLD_MAX_BONUSLEN - + MIN(DN_OLD_MAX_BONUSLEN, bonus_size)) >> SPA_BLKPTRSHIFT)); + } +} + +static void +save_resume_state(struct receive_writer_arg *rwa, + uint64_t object, uint64_t offset, dmu_tx_t *tx) +{ + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (!rwa->resumable) + return; + + /* + * We use ds_resume_bytes[] != 0 to indicate that we need to + * update this on disk, so it must not be 0. + */ + ASSERT(rwa->bytes_read != 0); + + /* + * We only resume from write records, which have a valid + * (non-meta-dnode) object number. + */ + ASSERT(object != 0); + + /* + * For resuming to work correctly, we must receive records in order, + * sorted by object,offset. This is checked by the callers, but + * assert it here for good measure. + */ + ASSERT3U(object, >=, rwa->os->os_dsl_dataset->ds_resume_object[txgoff]); + ASSERT(object != rwa->os->os_dsl_dataset->ds_resume_object[txgoff] || + offset >= rwa->os->os_dsl_dataset->ds_resume_offset[txgoff]); + ASSERT3U(rwa->bytes_read, >=, + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff]); + + rwa->os->os_dsl_dataset->ds_resume_object[txgoff] = object; + rwa->os->os_dsl_dataset->ds_resume_offset[txgoff] = offset; + rwa->os->os_dsl_dataset->ds_resume_bytes[txgoff] = rwa->bytes_read; +} + +static int +receive_object_is_same_generation(objset_t *os, uint64_t object, + dmu_object_type_t old_bonus_type, dmu_object_type_t new_bonus_type, + const void *new_bonus, boolean_t *samegenp) +{ + zfs_file_info_t zoi; + int err; + + dmu_buf_t *old_bonus_dbuf; + err = dmu_bonus_hold(os, object, FTAG, &old_bonus_dbuf); + if (err != 0) + return (err); + err = dmu_get_file_info(os, old_bonus_type, old_bonus_dbuf->db_data, + &zoi); + dmu_buf_rele(old_bonus_dbuf, FTAG); + if (err != 0) + return (err); + uint64_t old_gen = zoi.zfi_generation; + + err = dmu_get_file_info(os, new_bonus_type, new_bonus, &zoi); + if (err != 0) + return (err); + uint64_t new_gen = zoi.zfi_generation; + + *samegenp = (old_gen == new_gen); + return (0); +} + +static int +receive_handle_existing_object(const struct receive_writer_arg *rwa, + const struct drr_object *drro, const dmu_object_info_t *doi, + const void *bonus_data, + uint64_t *object_to_hold, uint32_t *new_blksz) +{ + uint32_t indblksz = drro->drr_indblkshift ? + 1ULL << drro->drr_indblkshift : 0; + int nblkptr = deduce_nblkptr(drro->drr_bonustype, + drro->drr_bonuslen); + uint8_t dn_slots = drro->drr_dn_slots != 0 ? 
+ drro->drr_dn_slots : DNODE_MIN_SLOTS; + boolean_t do_free_range = B_FALSE; + int err; + + *object_to_hold = drro->drr_object; + + /* nblkptr should be bounded by the bonus size and type */ + if (rwa->raw && nblkptr != drro->drr_nblkptr) + return (SET_ERROR(EINVAL)); + + /* + * After the previous send stream, the sending system may + * have freed this object, and then happened to re-allocate + * this object number in a later txg. In this case, we are + * receiving a different logical file, and the block size may + * appear to be different. i.e. we may have a different + * block size for this object than what the send stream says. + * In this case we need to remove the object's contents, + * so that its structure can be changed and then its contents + * entirely replaced by subsequent WRITE records. + * + * If this is a -L (--large-block) incremental stream, and + * the previous stream was not -L, the block size may appear + * to increase. i.e. we may have a smaller block size for + * this object than what the send stream says. In this case + * we need to keep the object's contents and block size + * intact, so that we don't lose parts of the object's + * contents that are not changed by this incremental send + * stream. + * + * We can distinguish between the two above cases by using + * the ZPL's generation number (see + * receive_object_is_same_generation()). However, we only + * want to rely on the generation number when absolutely + * necessary, because with raw receives, the generation is + * encrypted. We also want to minimize dependence on the + * ZPL, so that other types of datasets can also be received + * (e.g. ZVOLs, although note that ZVOLS currently do not + * reallocate their objects or change their structure). + * Therefore, we check a number of different cases where we + * know it is safe to discard the object's contents, before + * using the ZPL's generation number to make the above + * distinction. + */ + if (drro->drr_blksz != doi->doi_data_block_size) { + if (rwa->raw) { + /* + * RAW streams always have large blocks, so + * we are sure that the data is not needed + * due to changing --large-block to be on. + * Which is fortunate since the bonus buffer + * (which contains the ZPL generation) is + * encrypted, and the key might not be + * loaded. + */ + do_free_range = B_TRUE; + } else if (rwa->full) { + /* + * This is a full send stream, so it always + * replaces what we have. Even if the + * generation numbers happen to match, this + * can not actually be the same logical file. + * This is relevant when receiving a full + * send as a clone. + */ + do_free_range = B_TRUE; + } else if (drro->drr_type != + DMU_OT_PLAIN_FILE_CONTENTS || + doi->doi_type != DMU_OT_PLAIN_FILE_CONTENTS) { + /* + * PLAIN_FILE_CONTENTS are the only type of + * objects that have ever been stored with + * large blocks, so we don't need the special + * logic below. ZAP blocks can shrink (when + * there's only one block), so we don't want + * to hit the error below about block size + * only increasing. + */ + do_free_range = B_TRUE; + } else if (doi->doi_max_offset <= + doi->doi_data_block_size) { + /* + * There is only one block. We can free it, + * because its contents will be replaced by a + * WRITE record. This can not be the no-L -> + * -L case, because the no-L case would have + * resulted in multiple blocks. If we + * supported -L -> no-L, it would not be safe + * to free the file's contents. Fortunately, + * that is not allowed (see + * recv_check_large_blocks()). 
+ */ + do_free_range = B_TRUE; + } else { + boolean_t is_same_gen; + err = receive_object_is_same_generation(rwa->os, + drro->drr_object, doi->doi_bonus_type, + drro->drr_bonustype, bonus_data, &is_same_gen); + if (err != 0) + return (SET_ERROR(EINVAL)); + + if (is_same_gen) { + /* + * This is the same logical file, and + * the block size must be increasing. + * It could only decrease if + * --large-block was changed to be + * off, which is checked in + * recv_check_large_blocks(). + */ + if (drro->drr_blksz <= + doi->doi_data_block_size) + return (SET_ERROR(EINVAL)); + /* + * We keep the existing blocksize and + * contents. + */ + *new_blksz = + doi->doi_data_block_size; + } else { + do_free_range = B_TRUE; + } + } + } + + /* nblkptr can only decrease if the object was reallocated */ + if (nblkptr < doi->doi_nblkptr) + do_free_range = B_TRUE; + + /* number of slots can only change on reallocation */ + if (dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) + do_free_range = B_TRUE; + + /* + * For raw sends we also check a few other fields to + * ensure we are preserving the objset structure exactly + * as it was on the receive side: + * - A changed indirect block size + * - A smaller nlevels + */ + if (rwa->raw) { + if (indblksz != doi->doi_metadata_block_size) + do_free_range = B_TRUE; + if (drro->drr_nlevels < doi->doi_indirection) + do_free_range = B_TRUE; + } + + if (do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + 0, DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The dmu does not currently support decreasing nlevels + * or changing the number of dnode slots on an object. For + * non-raw sends, this does not matter and the new object + * can just use the previous one's nlevels. For raw sends, + * however, the structure of the received dnode (including + * nlevels and dnode slots) must match that of the send + * side. Therefore, instead of using dmu_object_reclaim(), + * we must free the object completely and call + * dmu_object_claim_dnsize() instead. + */ + if ((rwa->raw && drro->drr_nlevels < doi->doi_indirection) || + dn_slots != doi->doi_dnodesize >> DNODE_SHIFT) { + err = dmu_free_long_object(rwa->os, drro->drr_object); + if (err != 0) + return (SET_ERROR(EINVAL)); + + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + *object_to_hold = DMU_NEW_OBJECT; + } + + /* + * For raw receives, free everything beyond the new incoming + * maxblkid. Normally this would be done with a DRR_FREE + * record that would come after this DRR_OBJECT record is + * processed. However, for raw receives we manually set the + * maxblkid from the drr_maxblkid and so we must first free + * everything above that blkid to ensure the DMU is always + * consistent with itself. We will never free the first block + * of the object here because a maxblkid of 0 could indicate + * an object with a single block or one with no blocks. This + * free may be skipped when dmu_free_long_range() was called + * above since it covers the entire object's contents. + */ + if (rwa->raw && *object_to_hold != DMU_NEW_OBJECT && !do_free_range) { + err = dmu_free_long_range(rwa->os, drro->drr_object, + (drro->drr_maxblkid + 1) * doi->doi_data_block_size, + DMU_OBJECT_END); + if (err != 0) + return (SET_ERROR(EINVAL)); + } + return (0); +} + +noinline static int +receive_object(struct receive_writer_arg *rwa, struct drr_object *drro, + void *data) +{ + dmu_object_info_t doi; + dmu_tx_t *tx; + int err; + uint32_t new_blksz = drro->drr_blksz; + uint8_t dn_slots = drro->drr_dn_slots != 0 ? 
+ drro->drr_dn_slots : DNODE_MIN_SLOTS; + + if (drro->drr_type == DMU_OT_NONE || + !DMU_OT_IS_VALID(drro->drr_type) || + !DMU_OT_IS_VALID(drro->drr_bonustype) || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || + P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || + drro->drr_blksz < SPA_MINBLOCKSIZE || + drro->drr_blksz > spa_maxblocksize(dmu_objset_spa(rwa->os)) || + drro->drr_bonuslen > + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(rwa->os))) || + dn_slots > + (spa_maxdnodesize(dmu_objset_spa(rwa->os)) >> DNODE_SHIFT)) { + return (SET_ERROR(EINVAL)); + } + + if (rwa->raw) { + /* + * We should have received a DRR_OBJECT_RANGE record + * containing this block and stored it in rwa. + */ + if (drro->drr_object < rwa->or_firstobj || + drro->drr_object >= rwa->or_firstobj + rwa->or_numslots || + drro->drr_raw_bonuslen < drro->drr_bonuslen || + drro->drr_indblkshift > SPA_MAXBLOCKSHIFT || + drro->drr_nlevels > DN_MAX_LEVELS || + drro->drr_nblkptr > DN_MAX_NBLKPTR || + DN_SLOTS_TO_BONUSLEN(dn_slots) < + drro->drr_raw_bonuslen) + return (SET_ERROR(EINVAL)); + } else { + /* + * The DRR_OBJECT_SPILL flag is valid when the DRR_BEGIN + * record indicates this by setting DRR_FLAG_SPILL_BLOCK. + */ + if (((drro->drr_flags & ~(DRR_OBJECT_SPILL))) || + (!rwa->spill && DRR_OBJECT_HAS_SPILL(drro->drr_flags))) { + return (SET_ERROR(EINVAL)); + } + + if (drro->drr_raw_bonuslen != 0 || drro->drr_nblkptr != 0 || + drro->drr_indblkshift != 0 || drro->drr_nlevels != 0) { + return (SET_ERROR(EINVAL)); + } + } + + err = dmu_object_info(rwa->os, drro->drr_object, &doi); + + if (err != 0 && err != ENOENT && err != EEXIST) + return (SET_ERROR(EINVAL)); + + if (drro->drr_object > rwa->max_object) + rwa->max_object = drro->drr_object; + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + * Raw receives will also check that the indirect structure of the + * dnode hasn't changed. + */ + uint64_t object_to_hold; + if (err == 0) { + err = receive_handle_existing_object(rwa, drro, &doi, data, + &object_to_hold, &new_blksz); + } else if (err == EEXIST) { + /* + * The object requested is currently an interior slot of a + * multi-slot dnode. This will be resolved when the next txg + * is synced out, since the send stream will have told us + * to free this slot when we freed the associated dnode + * earlier in the stream. + */ + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + + if (dmu_object_info(rwa->os, drro->drr_object, NULL) != ENOENT) + return (SET_ERROR(EINVAL)); + + /* object was freed and we are about to allocate a new one */ + object_to_hold = DMU_NEW_OBJECT; + } else { + /* object is free and we are about to allocate a new one */ + object_to_hold = DMU_NEW_OBJECT; + } + + /* + * If this is a multi-slot dnode there is a chance that this + * object will expand into a slot that is already used by + * another object from the previous snapshot. We must free + * these objects before we attempt to allocate the new dnode. 
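+ * For example, if this stream allocates object 128 as a 2-slot
+ * dnode, slot 129 is consumed as well; if an object 129 left over
+ * from the previous snapshot still exists, it is freed below (and
+ * the txg synced) before the new dnode is claimed.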
+ */ + if (dn_slots > 1) { + boolean_t need_sync = B_FALSE; + + for (uint64_t slot = drro->drr_object + 1; + slot < drro->drr_object + dn_slots; + slot++) { + dmu_object_info_t slot_doi; + + err = dmu_object_info(rwa->os, slot, &slot_doi); + if (err == ENOENT || err == EEXIST) + continue; + else if (err != 0) + return (err); + + err = dmu_free_long_object(rwa->os, slot); + if (err != 0) + return (err); + + need_sync = B_TRUE; + } + + if (need_sync) + txg_wait_synced(dmu_objset_pool(rwa->os), 0); + } + + tx = dmu_tx_create(rwa->os); + dmu_tx_hold_bonus(tx, object_to_hold); + dmu_tx_hold_write(tx, object_to_hold, 0, 0); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + if (object_to_hold == DMU_NEW_OBJECT) { + /* Currently free, wants to be allocated */ + err = dmu_object_claim_dnsize(rwa->os, drro->drr_object, + drro->drr_type, new_blksz, + drro->drr_bonustype, drro->drr_bonuslen, + dn_slots << DNODE_SHIFT, tx); + } else if (drro->drr_type != doi.doi_type || + new_blksz != doi.doi_data_block_size || + drro->drr_bonustype != doi.doi_bonus_type || + drro->drr_bonuslen != doi.doi_bonus_size) { + /* Currently allocated, but with different properties */ + err = dmu_object_reclaim_dnsize(rwa->os, drro->drr_object, + drro->drr_type, new_blksz, + drro->drr_bonustype, drro->drr_bonuslen, + dn_slots << DNODE_SHIFT, rwa->spill ? + DRR_OBJECT_HAS_SPILL(drro->drr_flags) : B_FALSE, tx); + } else if (rwa->spill && !DRR_OBJECT_HAS_SPILL(drro->drr_flags)) { + /* + * Currently allocated, the existing version of this object + * may reference a spill block that is no longer allocated + * at the source and needs to be freed. + */ + err = dmu_object_rm_spill(rwa->os, drro->drr_object, tx); + } + + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + if (rwa->or_crypt_params_present) { + /* + * Set the crypt params for the buffer associated with this + * range of dnodes. This causes the blkptr_t to have the + * same crypt params (byteorder, salt, iv, mac) as on the + * sending side. + * + * Since we are committing this tx now, it is possible for + * the dnode block to end up on-disk with the incorrect MAC, + * if subsequent objects in this block are received in a + * different txg. However, since the dataset is marked as + * inconsistent, no code paths will do a non-raw read (or + * decrypt the block / verify the MAC). The receive code and + * scrub code can safely do raw reads and verify the + * checksum. They don't need to verify the MAC. + */ + dmu_buf_t *db = NULL; + uint64_t offset = rwa->or_firstobj * DNODE_MIN_SIZE; + + err = dmu_buf_hold_by_dnode(DMU_META_DNODE(rwa->os), + offset, FTAG, &db, DMU_READ_PREFETCH | DMU_READ_NO_DECRYPT); + if (err != 0) { + dmu_tx_commit(tx); + return (SET_ERROR(EINVAL)); + } + + dmu_buf_set_crypt_params(db, rwa->or_byteorder, + rwa->or_salt, rwa->or_iv, rwa->or_mac, tx); + + dmu_buf_rele(db, FTAG); + + rwa->or_crypt_params_present = B_FALSE; + } + + dmu_object_set_checksum(rwa->os, drro->drr_object, + drro->drr_checksumtype, tx); + dmu_object_set_compress(rwa->os, drro->drr_object, + drro->drr_compress, tx); + + /* handle more restrictive dnode structuring for raw recvs */ + if (rwa->raw) { + /* + * Set the indirect block size, block shift, nlevels. + * This will not fail because we ensured all of the + * blocks were freed earlier if this is a new object. + * For non-new objects block size and indirect block + * shift cannot change and nlevels can only increase. 
+ */ + ASSERT3U(new_blksz, ==, drro->drr_blksz); + VERIFY0(dmu_object_set_blocksize(rwa->os, drro->drr_object, + drro->drr_blksz, drro->drr_indblkshift, tx)); + VERIFY0(dmu_object_set_nlevels(rwa->os, drro->drr_object, + drro->drr_nlevels, tx)); + + /* + * Set the maxblkid. This will always succeed because + * we freed all blocks beyond the new maxblkid above. + */ + VERIFY0(dmu_object_set_maxblkid(rwa->os, drro->drr_object, + drro->drr_maxblkid, tx)); + } + + if (data != NULL) { + dmu_buf_t *db; + dnode_t *dn; + uint32_t flags = DMU_READ_NO_PREFETCH; + + if (rwa->raw) + flags |= DMU_READ_NO_DECRYPT; + + VERIFY0(dnode_hold(rwa->os, drro->drr_object, FTAG, &dn)); + VERIFY0(dmu_bonus_hold_by_dnode(dn, FTAG, &db, flags)); + + dmu_buf_will_dirty(db, tx); + + ASSERT3U(db->db_size, >=, drro->drr_bonuslen); + bcopy(data, db->db_data, DRR_OBJECT_PAYLOAD_SIZE(drro)); + + /* + * Raw bonus buffers have their byteorder determined by the + * DRR_OBJECT_RANGE record. + */ + if (rwa->byteswap && !rwa->raw) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drro->drr_bonustype); + dmu_ot_byteswap[byteswap].ob_func(db->db_data, + DRR_OBJECT_PAYLOAD_SIZE(drro)); + } + dmu_buf_rele(db, FTAG); + dnode_rele(dn, FTAG); + } + dmu_tx_commit(tx); + + return (0); +} + +/* ARGSUSED */ +noinline static int +receive_freeobjects(struct receive_writer_arg *rwa, + struct drr_freeobjects *drrfo) +{ + uint64_t obj; + int next_err = 0; + + if (drrfo->drr_firstobj + drrfo->drr_numobjs < drrfo->drr_firstobj) + return (SET_ERROR(EINVAL)); + + for (obj = drrfo->drr_firstobj == 0 ? 1 : drrfo->drr_firstobj; + obj < drrfo->drr_firstobj + drrfo->drr_numobjs && + obj < DN_MAX_OBJECT && next_err == 0; + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0)) { + dmu_object_info_t doi; + int err; + + err = dmu_object_info(rwa->os, obj, &doi); + if (err == ENOENT) + continue; + else if (err != 0) + return (err); + + err = dmu_free_long_object(rwa->os, obj); + + if (err != 0) + return (err); + } + if (next_err != ESRCH) + return (next_err); + return (0); +} + +/* + * Note: if this fails, the caller will clean up any records left on the + * rwa->write_batch list. 
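+ *
+ * The batch accumulated by receive_process_write_record() holds
+ * consecutive DRR_WRITE records for a single object; they are
+ * assigned to a single dmu_tx here and committed together.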
+ */ +static int +flush_write_batch_impl(struct receive_writer_arg *rwa) +{ + dnode_t *dn; + int err; + + if (dnode_hold(rwa->os, rwa->last_object, FTAG, &dn) != 0) + return (SET_ERROR(EINVAL)); + + struct receive_record_arg *last_rrd = list_tail(&rwa->write_batch); + struct drr_write *last_drrw = &last_rrd->header.drr_u.drr_write; + + struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); + struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; + + ASSERT3U(rwa->last_object, ==, last_drrw->drr_object); + ASSERT3U(rwa->last_offset, ==, last_drrw->drr_offset); + + dmu_tx_t *tx = dmu_tx_create(rwa->os); + dmu_tx_hold_write_by_dnode(tx, dn, first_drrw->drr_offset, + last_drrw->drr_offset - first_drrw->drr_offset + + last_drrw->drr_logical_size); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + dnode_rele(dn, FTAG); + return (err); + } + + struct receive_record_arg *rrd; + while ((rrd = list_head(&rwa->write_batch)) != NULL) { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + arc_buf_t *abuf = rrd->arc_buf; + + ASSERT3U(drrw->drr_object, ==, rwa->last_object); + + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrw->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_WRITE_PAYLOAD_SIZE(drrw)); + } + + /* + * If we are receiving an incremental large-block stream into + * a dataset that previously did a non-large-block receive, + * the WRITE record may be larger than the object's block + * size. dmu_assign_arcbuf_by_dnode() handles this as long + * as the arcbuf is not compressed, so decompress it here if + * necessary. + */ + if (drrw->drr_logical_size != dn->dn_datablksz && + arc_get_compression(abuf) != ZIO_COMPRESS_OFF) { + ASSERT3U(drrw->drr_logical_size, >, dn->dn_datablksz); + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(rwa->os), + .zb_object = rwa->last_object, + .zb_level = 0, + .zb_blkid = + drrw->drr_offset >> dn->dn_datablkshift, + }; + + /* + * The size of loaned arc bufs is counted in + * arc_loaned_bytes. When we untransform + * (decompress) the buf, its size increases. To + * ensure that arc_loaned_bytes remains accurate, we + * need to return (un-loan) the buf (with its + * compressed size) and then re-loan it (with its + * new, uncompressed size). + */ + arc_return_buf(abuf, FTAG); + VERIFY0(arc_untransform(abuf, dmu_objset_spa(rwa->os), + &zb, B_FALSE)); + arc_loan_inuse_buf(abuf, FTAG); + } + + err = dmu_assign_arcbuf_by_dnode(dn, + drrw->drr_offset, abuf, tx); + if (err != 0) { + /* + * This rrd is left on the list, so the caller will + * free it (and the arc_buf). + */ + break; + } + + /* + * Note: If the receive fails, we want the resume stream to + * start with the same record that we last successfully + * received (as opposed to the next record), so that we can + * verify that we are resuming from the correct location. 
+ */ + save_resume_state(rwa, drrw->drr_object, drrw->drr_offset, tx); + + list_remove(&rwa->write_batch, rrd); + kmem_free(rrd, sizeof (*rrd)); + } + + dmu_tx_commit(tx); + dnode_rele(dn, FTAG); + return (err); +} + +noinline static int +flush_write_batch(struct receive_writer_arg *rwa) +{ + if (list_is_empty(&rwa->write_batch)) + return (0); + int err = rwa->err; + if (err == 0) + err = flush_write_batch_impl(rwa); + if (err != 0) { + struct receive_record_arg *rrd; + while ((rrd = list_remove_head(&rwa->write_batch)) != NULL) { + dmu_return_arcbuf(rrd->arc_buf); + kmem_free(rrd, sizeof (*rrd)); + } + } + ASSERT(list_is_empty(&rwa->write_batch)); + return (err); +} + +noinline static int +receive_process_write_record(struct receive_writer_arg *rwa, + struct receive_record_arg *rrd) +{ + int err = 0; + + ASSERT3U(rrd->header.drr_type, ==, DRR_WRITE); + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + + if (drrw->drr_offset + drrw->drr_logical_size < drrw->drr_offset || + !DMU_OT_IS_VALID(drrw->drr_type)) + return (SET_ERROR(EINVAL)); + + /* + * For resuming to work, records must be in increasing order + * by (object, offset). + */ + if (drrw->drr_object < rwa->last_object || + (drrw->drr_object == rwa->last_object && + drrw->drr_offset < rwa->last_offset)) { + return (SET_ERROR(EINVAL)); + } + + struct receive_record_arg *first_rrd = list_head(&rwa->write_batch); + struct drr_write *first_drrw = &first_rrd->header.drr_u.drr_write; + uint64_t batch_size = + MIN(zfs_recv_write_batch_size, DMU_MAX_ACCESS / 2); + if (first_rrd != NULL && + (drrw->drr_object != first_drrw->drr_object || + drrw->drr_offset >= first_drrw->drr_offset + batch_size)) { + err = flush_write_batch(rwa); + if (err != 0) + return (err); + } + + rwa->last_object = drrw->drr_object; + rwa->last_offset = drrw->drr_offset; + + if (rwa->last_object > rwa->max_object) + rwa->max_object = rwa->last_object; + + list_insert_tail(&rwa->write_batch, rrd); + /* + * Return EAGAIN to indicate that we will use this rrd again, + * so the caller should not free it + */ + return (EAGAIN); +} + +static int +receive_write_embedded(struct receive_writer_arg *rwa, + struct drr_write_embedded *drrwe, void *data) +{ + dmu_tx_t *tx; + int err; + + if (drrwe->drr_offset + drrwe->drr_length < drrwe->drr_offset) + return (SET_ERROR(EINVAL)); + + if (drrwe->drr_psize > BPE_PAYLOAD_SIZE) + return (SET_ERROR(EINVAL)); + + if (drrwe->drr_etype >= NUM_BP_EMBEDDED_TYPES) + return (SET_ERROR(EINVAL)); + if (drrwe->drr_compression >= ZIO_COMPRESS_FUNCTIONS) + return (SET_ERROR(EINVAL)); + if (rwa->raw) + return (SET_ERROR(EINVAL)); + + if (drrwe->drr_object > rwa->max_object) + rwa->max_object = drrwe->drr_object; + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_write(tx, drrwe->drr_object, + drrwe->drr_offset, drrwe->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_tx_abort(tx); + return (err); + } + + dmu_write_embedded(rwa->os, drrwe->drr_object, + drrwe->drr_offset, data, drrwe->drr_etype, + drrwe->drr_compression, drrwe->drr_lsize, drrwe->drr_psize, + rwa->byteswap ^ ZFS_HOST_BYTEORDER, tx); + + /* See comment in restore_write. 
*/ + save_resume_state(rwa, drrwe->drr_object, drrwe->drr_offset, tx); + dmu_tx_commit(tx); + return (0); +} + +static int +receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, + arc_buf_t *abuf) +{ + dmu_tx_t *tx; + dmu_buf_t *db, *db_spill; + int err; + + if (drrs->drr_length < SPA_MINBLOCKSIZE || + drrs->drr_length > spa_maxblocksize(dmu_objset_spa(rwa->os))) + return (SET_ERROR(EINVAL)); + + /* + * This is an unmodified spill block which was added to the stream + * to resolve an issue with incorrectly removing spill blocks. It + * should be ignored by current versions of the code which support + * the DRR_FLAG_SPILL_BLOCK flag. + */ + if (rwa->spill && DRR_SPILL_IS_UNMODIFIED(drrs->drr_flags)) { + dmu_return_arcbuf(abuf); + return (0); + } + + if (rwa->raw) { + if (!DMU_OT_IS_VALID(drrs->drr_type) || + drrs->drr_compressiontype >= ZIO_COMPRESS_FUNCTIONS || + drrs->drr_compressed_size == 0) + return (SET_ERROR(EINVAL)); + } + + if (dmu_object_info(rwa->os, drrs->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + if (drrs->drr_object > rwa->max_object) + rwa->max_object = drrs->drr_object; + + VERIFY0(dmu_bonus_hold(rwa->os, drrs->drr_object, FTAG, &db)); + if ((err = dmu_spill_hold_by_bonus(db, DMU_READ_NO_DECRYPT, FTAG, + &db_spill)) != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + + tx = dmu_tx_create(rwa->os); + + dmu_tx_hold_spill(tx, db->db_object); + + err = dmu_tx_assign(tx, TXG_WAIT); + if (err != 0) { + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + dmu_tx_abort(tx); + return (err); + } + + /* + * Spill blocks may both grow and shrink. When a change in size + * occurs any existing dbuf must be updated to match the logical + * size of the provided arc_buf_t. + */ + if (db_spill->db_size != drrs->drr_length) { + dmu_buf_will_fill(db_spill, tx); + VERIFY(0 == dbuf_spill_set_blksz(db_spill, + drrs->drr_length, tx)); + } + + if (rwa->byteswap && !arc_is_encrypted(abuf) && + arc_get_compression(abuf) == ZIO_COMPRESS_OFF) { + dmu_object_byteswap_t byteswap = + DMU_OT_BYTESWAP(drrs->drr_type); + dmu_ot_byteswap[byteswap].ob_func(abuf->b_data, + DRR_SPILL_PAYLOAD_SIZE(drrs)); + } + + dbuf_assign_arcbuf((dmu_buf_impl_t *)db_spill, abuf, tx); + + dmu_buf_rele(db, FTAG); + dmu_buf_rele(db_spill, FTAG); + + dmu_tx_commit(tx); + return (0); +} + +/* ARGSUSED */ +noinline static int +receive_free(struct receive_writer_arg *rwa, struct drr_free *drrf) +{ + int err; + + if (drrf->drr_length != -1ULL && + drrf->drr_offset + drrf->drr_length < drrf->drr_offset) + return (SET_ERROR(EINVAL)); + + if (dmu_object_info(rwa->os, drrf->drr_object, NULL) != 0) + return (SET_ERROR(EINVAL)); + + if (drrf->drr_object > rwa->max_object) + rwa->max_object = drrf->drr_object; + + err = dmu_free_long_range(rwa->os, drrf->drr_object, + drrf->drr_offset, drrf->drr_length); + + return (err); +} + +static int +receive_object_range(struct receive_writer_arg *rwa, + struct drr_object_range *drror) +{ + /* + * By default, we assume this block is in our native format + * (ZFS_HOST_BYTEORDER). We then take into account whether + * the send stream is byteswapped (rwa->byteswap). Finally, + * we need to byteswap again if this particular block was + * in non-native format on the send side. 
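+ *
+ * The XOR chain below is just a parity computation: starting from the
+ * host's native byte order, each "swapped" indication (a byteswapped
+ * stream, a raw block written byteswapped on the send side) flips the
+ * result once.  For example, a byteswapped stream whose block was also
+ * stored byteswapped by the sender flips twice, so the block arrives
+ * in the receiver's native order.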
+ */ + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ rwa->byteswap ^ + !!DRR_IS_RAW_BYTESWAPPED(drror->drr_flags); + + /* + * Since dnode block sizes are constant, we should not need to worry + * about making sure that the dnode block size is the same on the + * sending and receiving sides for the time being. For non-raw sends, + * this does not matter (and in fact we do not send a DRR_OBJECT_RANGE + * record at all). Raw sends require this record type because the + * encryption parameters are used to protect an entire block of bonus + * buffers. If the size of dnode blocks ever becomes variable, + * handling will need to be added to ensure that dnode block sizes + * match on the sending and receiving side. + */ + if (drror->drr_numslots != DNODES_PER_BLOCK || + P2PHASE(drror->drr_firstobj, DNODES_PER_BLOCK) != 0 || + !rwa->raw) + return (SET_ERROR(EINVAL)); + + if (drror->drr_firstobj > rwa->max_object) + rwa->max_object = drror->drr_firstobj; + + /* + * The DRR_OBJECT_RANGE handling must be deferred to receive_object() + * so that the block of dnodes is not written out when it's empty, + * and converted to a HOLE BP. + */ + rwa->or_crypt_params_present = B_TRUE; + rwa->or_firstobj = drror->drr_firstobj; + rwa->or_numslots = drror->drr_numslots; + bcopy(drror->drr_salt, rwa->or_salt, ZIO_DATA_SALT_LEN); + bcopy(drror->drr_iv, rwa->or_iv, ZIO_DATA_IV_LEN); + bcopy(drror->drr_mac, rwa->or_mac, ZIO_DATA_MAC_LEN); + rwa->or_byteorder = byteorder; + + return (0); +} + +/* + * Until we have the ability to redact large ranges of data efficiently, we + * process these records as frees. + */ +/* ARGSUSED */ +noinline static int +receive_redact(struct receive_writer_arg *rwa, struct drr_redact *drrr) +{ + struct drr_free drrf = {0}; + drrf.drr_length = drrr->drr_length; + drrf.drr_object = drrr->drr_object; + drrf.drr_offset = drrr->drr_offset; + drrf.drr_toguid = drrr->drr_toguid; + return (receive_free(rwa, &drrf)); +} + +/* used to destroy the drc_ds on error */ +static void +dmu_recv_cleanup_ds(dmu_recv_cookie_t *drc) +{ + dsl_dataset_t *ds = drc->drc_ds; + ds_hold_flags_t dsflags = (drc->drc_raw) ? 0 : DS_HOLD_FLAG_DECRYPT; + + /* + * Wait for the txg sync before cleaning up the receive. For + * resumable receives, this ensures that our resume state has + * been written out to disk. For raw receives, this ensures + * that the user accounting code will not attempt to do anything + * after we stopped receiving the dataset. + */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + ds->ds_objset->os_raw_receive = B_FALSE; + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + if (drc->drc_resumable && drc->drc_should_save && + !BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dataset_disown(ds, dsflags, dmu_recv_tag); + } else { + char name[ZFS_MAX_DATASET_NAME_LEN]; + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dataset_name(ds, name); + dsl_dataset_disown(ds, dsflags, dmu_recv_tag); + (void) dsl_destroy_head(name); + } +} + +static void +receive_cksum(dmu_recv_cookie_t *drc, int len, void *buf) +{ + if (drc->drc_byteswap) { + (void) fletcher_4_incremental_byteswap(buf, len, + &drc->drc_cksum); + } else { + (void) fletcher_4_incremental_native(buf, len, &drc->drc_cksum); + } +} + +/* + * Read the payload into a buffer of size len, and update the current record's + * payload field. + * Allocate drc->drc_next_rrd and read the next record's header into + * drc->drc_next_rrd->header. + * Verify checksum of payload and next record. 
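+ *
+ * A note on the chaining: each stored checksum covers the entire
+ * stream from its beginning up to, but not including, the checksum
+ * field itself.  Once a header's checksum has been verified, its bytes
+ * are folded back into the running checksum, so the next record's
+ * checksum covers it as well.  Roughly:
+ *
+ *   |<-------- cksum N covers -------->|
+ *   [ ... | header N (cksum N) | payload | header N+1 (cksum N+1) | ...
+ *   |<------------------- cksum N+1 covers ------------------>|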
+ */ +static int +receive_read_payload_and_next_header(dmu_recv_cookie_t *drc, int len, void *buf) +{ + int err; + + if (len != 0) { + ASSERT3U(len, <=, SPA_MAXBLOCKSIZE); + err = receive_read(drc, len, buf); + if (err != 0) + return (err); + receive_cksum(drc, len, buf); + + /* note: rrd is NULL when reading the begin record's payload */ + if (drc->drc_rrd != NULL) { + drc->drc_rrd->payload = buf; + drc->drc_rrd->payload_size = len; + drc->drc_rrd->bytes_read = drc->drc_bytes_read; + } + } else { + ASSERT3P(buf, ==, NULL); + } + + drc->drc_prev_cksum = drc->drc_cksum; + + drc->drc_next_rrd = kmem_zalloc(sizeof (*drc->drc_next_rrd), KM_SLEEP); + err = receive_read(drc, sizeof (drc->drc_next_rrd->header), + &drc->drc_next_rrd->header); + drc->drc_next_rrd->bytes_read = drc->drc_bytes_read; + + if (err != 0) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + drc->drc_next_rrd = NULL; + return (err); + } + if (drc->drc_next_rrd->header.drr_type == DRR_BEGIN) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + drc->drc_next_rrd = NULL; + return (SET_ERROR(EINVAL)); + } + + /* + * Note: checksum is of everything up to but not including the + * checksum itself. + */ + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + receive_cksum(drc, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &drc->drc_next_rrd->header); + + zio_cksum_t cksum_orig = + drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; + zio_cksum_t *cksump = + &drc->drc_next_rrd->header.drr_u.drr_checksum.drr_checksum; + + if (drc->drc_byteswap) + byteswap_record(&drc->drc_next_rrd->header); + + if ((!ZIO_CHECKSUM_IS_ZERO(cksump)) && + !ZIO_CHECKSUM_EQUAL(drc->drc_cksum, *cksump)) { + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + drc->drc_next_rrd = NULL; + return (SET_ERROR(ECKSUM)); + } + + receive_cksum(drc, sizeof (cksum_orig), &cksum_orig); + + return (0); +} + +/* + * Issue the prefetch reads for any necessary indirect blocks. + * + * We use the object ignore list to tell us whether or not to issue prefetches + * for a given object. We do this for both correctness (in case the blocksize + * of an object has changed) and performance (if the object doesn't exist, don't + * needlessly try to issue prefetches). We also trim the list as we go through + * the stream to prevent it from growing to an unbounded size. + * + * The object numbers within will always be in sorted order, and any write + * records we see will also be in sorted order, but they're not sorted with + * respect to each other (i.e. we can get several object records before + * receiving each object's write records). As a result, once we've reached a + * given object number, we can safely remove any reference to lower object + * numbers in the ignore list. In practice, we receive up to 32 object records + * before receiving write records, so the list can have up to 32 nodes in it. + */ +/* ARGSUSED */ +static void +receive_read_prefetch(dmu_recv_cookie_t *drc, uint64_t object, uint64_t offset, + uint64_t length) +{ + if (!objlist_exists(drc->drc_ignore_objlist, object)) { + dmu_prefetch(drc->drc_os, object, 1, offset, length, + ZIO_PRIORITY_SYNC_READ); + } +} + +/* + * Read records off the stream, issuing any necessary prefetches. 
+ */ +static int +receive_read_record(dmu_recv_cookie_t *drc) +{ + int err; + + switch (drc->drc_rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = + &drc->drc_rrd->header.drr_u.drr_object; + uint32_t size = DRR_OBJECT_PAYLOAD_SIZE(drro); + void *buf = NULL; + dmu_object_info_t doi; + + if (size != 0) + buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(drc, size, buf); + if (err != 0) { + kmem_free(buf, size); + return (err); + } + err = dmu_object_info(drc->drc_os, drro->drr_object, &doi); + /* + * See receive_read_prefetch for an explanation why we're + * storing this object in the ignore_obj_list. + */ + if (err == ENOENT || err == EEXIST || + (err == 0 && doi.doi_data_block_size != drro->drr_blksz)) { + objlist_insert(drc->drc_ignore_objlist, + drro->drr_object); + err = 0; + } + return (err); + } + case DRR_FREEOBJECTS: + { + err = receive_read_payload_and_next_header(drc, 0, NULL); + return (err); + } + case DRR_WRITE: + { + struct drr_write *drrw = &drc->drc_rrd->header.drr_u.drr_write; + arc_buf_t *abuf; + boolean_t is_meta = DMU_OT_IS_METADATA(drrw->drr_type); + + if (drc->drc_raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrw->drr_flags) ^ + drc->drc_byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os), + drrw->drr_object, byteorder, drrw->drr_salt, + drrw->drr_iv, drrw->drr_mac, drrw->drr_type, + drrw->drr_compressed_size, drrw->drr_logical_size, + drrw->drr_compressiontype, 0); + } else if (DRR_WRITE_COMPRESSED(drrw)) { + ASSERT3U(drrw->drr_compressed_size, >, 0); + ASSERT3U(drrw->drr_logical_size, >=, + drrw->drr_compressed_size); + ASSERT(!is_meta); + abuf = arc_loan_compressed_buf( + dmu_objset_spa(drc->drc_os), + drrw->drr_compressed_size, drrw->drr_logical_size, + drrw->drr_compressiontype, 0); + } else { + abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), + is_meta, drrw->drr_logical_size); + } + + err = receive_read_payload_and_next_header(drc, + DRR_WRITE_PAYLOAD_SIZE(drrw), abuf->b_data); + if (err != 0) { + dmu_return_arcbuf(abuf); + return (err); + } + drc->drc_rrd->arc_buf = abuf; + receive_read_prefetch(drc, drrw->drr_object, drrw->drr_offset, + drrw->drr_logical_size); + return (err); + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwb = + &drc->drc_rrd->header.drr_u.drr_write_byref; + err = receive_read_payload_and_next_header(drc, 0, NULL); + receive_read_prefetch(drc, drrwb->drr_object, drrwb->drr_offset, + drrwb->drr_length); + return (err); + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &drc->drc_rrd->header.drr_u.drr_write_embedded; + uint32_t size = P2ROUNDUP(drrwe->drr_psize, 8); + void *buf = kmem_zalloc(size, KM_SLEEP); + + err = receive_read_payload_and_next_header(drc, size, buf); + if (err != 0) { + kmem_free(buf, size); + return (err); + } + + receive_read_prefetch(drc, drrwe->drr_object, drrwe->drr_offset, + drrwe->drr_length); + return (err); + } + case DRR_FREE: + case DRR_REDACT: + { + /* + * It might be beneficial to prefetch indirect blocks here, but + * we don't really have the data to decide for sure. 
+ */ + err = receive_read_payload_and_next_header(drc, 0, NULL); + return (err); + } + case DRR_END: + { + struct drr_end *drre = &drc->drc_rrd->header.drr_u.drr_end; + if (!ZIO_CHECKSUM_EQUAL(drc->drc_prev_cksum, + drre->drr_checksum)) + return (SET_ERROR(ECKSUM)); + return (0); + } + case DRR_SPILL: + { + struct drr_spill *drrs = &drc->drc_rrd->header.drr_u.drr_spill; + arc_buf_t *abuf; + /* DRR_SPILL records are either raw or uncompressed */ + if (drc->drc_raw) { + boolean_t byteorder = ZFS_HOST_BYTEORDER ^ + !!DRR_IS_RAW_BYTESWAPPED(drrs->drr_flags) ^ + drc->drc_byteswap; + + abuf = arc_loan_raw_buf(dmu_objset_spa(drc->drc_os), + drrs->drr_object, byteorder, drrs->drr_salt, + drrs->drr_iv, drrs->drr_mac, drrs->drr_type, + drrs->drr_compressed_size, drrs->drr_length, + drrs->drr_compressiontype, 0); + } else { + abuf = arc_loan_buf(dmu_objset_spa(drc->drc_os), + DMU_OT_IS_METADATA(drrs->drr_type), + drrs->drr_length); + } + err = receive_read_payload_and_next_header(drc, + DRR_SPILL_PAYLOAD_SIZE(drrs), abuf->b_data); + if (err != 0) + dmu_return_arcbuf(abuf); + else + drc->drc_rrd->arc_buf = abuf; + return (err); + } + case DRR_OBJECT_RANGE: + { + err = receive_read_payload_and_next_header(drc, 0, NULL); + return (err); + + } + default: + return (SET_ERROR(EINVAL)); + } +} + + + +static void +dprintf_drr(struct receive_record_arg *rrd, int err) +{ +#ifdef ZFS_DEBUG + switch (rrd->header.drr_type) { + case DRR_OBJECT: + { + struct drr_object *drro = &rrd->header.drr_u.drr_object; + dprintf("drr_type = OBJECT obj = %llu type = %u " + "bonustype = %u blksz = %u bonuslen = %u cksumtype = %u " + "compress = %u dn_slots = %u err = %d\n", + drro->drr_object, drro->drr_type, drro->drr_bonustype, + drro->drr_blksz, drro->drr_bonuslen, + drro->drr_checksumtype, drro->drr_compress, + drro->drr_dn_slots, err); + break; + } + case DRR_FREEOBJECTS: + { + struct drr_freeobjects *drrfo = + &rrd->header.drr_u.drr_freeobjects; + dprintf("drr_type = FREEOBJECTS firstobj = %llu " + "numobjs = %llu err = %d\n", + drrfo->drr_firstobj, drrfo->drr_numobjs, err); + break; + } + case DRR_WRITE: + { + struct drr_write *drrw = &rrd->header.drr_u.drr_write; + dprintf("drr_type = WRITE obj = %llu type = %u offset = %llu " + "lsize = %llu cksumtype = %u flags = %u " + "compress = %u psize = %llu err = %d\n", + drrw->drr_object, drrw->drr_type, drrw->drr_offset, + drrw->drr_logical_size, drrw->drr_checksumtype, + drrw->drr_flags, drrw->drr_compressiontype, + drrw->drr_compressed_size, err); + break; + } + case DRR_WRITE_BYREF: + { + struct drr_write_byref *drrwbr = + &rrd->header.drr_u.drr_write_byref; + dprintf("drr_type = WRITE_BYREF obj = %llu offset = %llu " + "length = %llu toguid = %llx refguid = %llx " + "refobject = %llu refoffset = %llu cksumtype = %u " + "flags = %u err = %d\n", + drrwbr->drr_object, drrwbr->drr_offset, + drrwbr->drr_length, drrwbr->drr_toguid, + drrwbr->drr_refguid, drrwbr->drr_refobject, + drrwbr->drr_refoffset, drrwbr->drr_checksumtype, + drrwbr->drr_flags, err); + break; + } + case DRR_WRITE_EMBEDDED: + { + struct drr_write_embedded *drrwe = + &rrd->header.drr_u.drr_write_embedded; + dprintf("drr_type = WRITE_EMBEDDED obj = %llu offset = %llu " + "length = %llu compress = %u etype = %u lsize = %u " + "psize = %u err = %d\n", + drrwe->drr_object, drrwe->drr_offset, drrwe->drr_length, + drrwe->drr_compression, drrwe->drr_etype, + drrwe->drr_lsize, drrwe->drr_psize, err); + break; + } + case DRR_FREE: + { + struct drr_free *drrf = &rrd->header.drr_u.drr_free; + dprintf("drr_type = FREE obj 
= %llu offset = %llu "
+		    "length = %lld err = %d\n",
+		    drrf->drr_object, drrf->drr_offset, drrf->drr_length,
+		    err);
+		break;
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+		dprintf("drr_type = SPILL obj = %llu length = %llu "
+		    "err = %d\n", drrs->drr_object, drrs->drr_length, err);
+		break;
+	}
+	case DRR_OBJECT_RANGE:
+	{
+		struct drr_object_range *drror =
+		    &rrd->header.drr_u.drr_object_range;
+		dprintf("drr_type = OBJECT_RANGE firstobj = %llu "
+		    "numslots = %llu flags = %u err = %d\n",
+		    drror->drr_firstobj, drror->drr_numslots,
+		    drror->drr_flags, err);
+		break;
+	}
+	default:
+		return;
+	}
+#endif
+}
+
+/*
+ * Commit the records to the pool.
+ */
+static int
+receive_process_record(struct receive_writer_arg *rwa,
+    struct receive_record_arg *rrd)
+{
+	int err;
+
+	/* Processing in order, therefore bytes_read should be increasing. */
+	ASSERT3U(rrd->bytes_read, >=, rwa->bytes_read);
+	rwa->bytes_read = rrd->bytes_read;
+
+	if (rrd->header.drr_type != DRR_WRITE) {
+		err = flush_write_batch(rwa);
+		if (err != 0) {
+			if (rrd->arc_buf != NULL) {
+				dmu_return_arcbuf(rrd->arc_buf);
+				rrd->arc_buf = NULL;
+				rrd->payload = NULL;
+			} else if (rrd->payload != NULL) {
+				kmem_free(rrd->payload, rrd->payload_size);
+				rrd->payload = NULL;
+			}
+
+			return (err);
+		}
+	}
+
+	switch (rrd->header.drr_type) {
+	case DRR_OBJECT:
+	{
+		struct drr_object *drro = &rrd->header.drr_u.drr_object;
+		err = receive_object(rwa, drro, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		break;
+	}
+	case DRR_FREEOBJECTS:
+	{
+		struct drr_freeobjects *drrfo =
+		    &rrd->header.drr_u.drr_freeobjects;
+		err = receive_freeobjects(rwa, drrfo);
+		break;
+	}
+	case DRR_WRITE:
+	{
+		err = receive_process_write_record(rwa, rrd);
+		if (err != EAGAIN) {
+			/*
+			 * On success, receive_process_write_record() returns
+			 * EAGAIN to indicate that we do not want to free
+			 * the rrd or arc_buf.
+			 */
+			ASSERT(err != 0);
+			dmu_return_arcbuf(rrd->arc_buf);
+			rrd->arc_buf = NULL;
+		}
+		break;
+	}
+	case DRR_WRITE_EMBEDDED:
+	{
+		struct drr_write_embedded *drrwe =
+		    &rrd->header.drr_u.drr_write_embedded;
+		err = receive_write_embedded(rwa, drrwe, rrd->payload);
+		kmem_free(rrd->payload, rrd->payload_size);
+		rrd->payload = NULL;
+		break;
+	}
+	case DRR_FREE:
+	{
+		struct drr_free *drrf = &rrd->header.drr_u.drr_free;
+		err = receive_free(rwa, drrf);
+		break;
+	}
+	case DRR_SPILL:
+	{
+		struct drr_spill *drrs = &rrd->header.drr_u.drr_spill;
+		err = receive_spill(rwa, drrs, rrd->arc_buf);
+		if (err != 0)
+			dmu_return_arcbuf(rrd->arc_buf);
+		rrd->arc_buf = NULL;
+		rrd->payload = NULL;
+		break;
+	}
+	case DRR_OBJECT_RANGE:
+	{
+		struct drr_object_range *drror =
+		    &rrd->header.drr_u.drr_object_range;
+		err = receive_object_range(rwa, drror);
+		break;
+	}
+	case DRR_REDACT:
+	{
+		struct drr_redact *drrr = &rrd->header.drr_u.drr_redact;
+		err = receive_redact(rwa, drrr);
+		break;
+	}
+	default:
+		err = (SET_ERROR(EINVAL));
+	}
+
+	if (err != 0)
+		dprintf_drr(rrd, err);
+
+	return (err);
+}
+
+/*
+ * dmu_recv_stream's worker thread; pull records off the queue, and then call
+ * receive_process_record().  When we're done, signal the main thread and exit.
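+ *
+ * A simplified sketch of the two-thread handoff (error handling and
+ * payload bookkeeping omitted):
+ *
+ *   reader (dmu_recv_stream)            writer (this thread)
+ *   ------------------------            --------------------
+ *   while (no error)                    loop:
+ *     rrd = read next record              rrd = bqueue_dequeue(&rwa->q)
+ *     bqueue_enqueue(&rwa->q, rrd)        if (rrd->eos_marker) break
+ *   enqueue eos marker                    receive_process_record(rwa, rrd)
+ *   cv_wait() until rwa->done           rwa->done = B_TRUE; cv_signal()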
+ */
+static void
+receive_writer_thread(void *arg)
+{
+	struct receive_writer_arg *rwa = arg;
+	struct receive_record_arg *rrd;
+	fstrans_cookie_t cookie = spl_fstrans_mark();
+
+	for (rrd = bqueue_dequeue(&rwa->q); !rrd->eos_marker;
+	    rrd = bqueue_dequeue(&rwa->q)) {
+		/*
+		 * If there's an error, the main thread will stop putting things
+		 * on the queue, but we need to clear everything in it before we
+		 * can exit.
+		 */
+		int err = 0;
+		if (rwa->err == 0) {
+			err = receive_process_record(rwa, rrd);
+		} else if (rrd->arc_buf != NULL) {
+			dmu_return_arcbuf(rrd->arc_buf);
+			rrd->arc_buf = NULL;
+			rrd->payload = NULL;
+		} else if (rrd->payload != NULL) {
+			kmem_free(rrd->payload, rrd->payload_size);
+			rrd->payload = NULL;
+		}
+		/*
+		 * EAGAIN indicates that this record has been saved (on
+		 * rwa->write_batch), and will be used again, so we don't
+		 * free it.
+		 */
+		if (err != EAGAIN) {
+			if (rwa->err == 0)
+				rwa->err = err;
+			kmem_free(rrd, sizeof (*rrd));
+		}
+	}
+	kmem_free(rrd, sizeof (*rrd));
+
+	int err = flush_write_batch(rwa);
+	if (rwa->err == 0)
+		rwa->err = err;
+
+	mutex_enter(&rwa->mutex);
+	rwa->done = B_TRUE;
+	cv_signal(&rwa->cv);
+	mutex_exit(&rwa->mutex);
+	spl_fstrans_unmark(cookie);
+	thread_exit();
+}
+
+static int
+resume_check(dmu_recv_cookie_t *drc, nvlist_t *begin_nvl)
+{
+	uint64_t val;
+	objset_t *mos = dmu_objset_pool(drc->drc_os)->dp_meta_objset;
+	uint64_t dsobj = dmu_objset_id(drc->drc_os);
+	uint64_t resume_obj, resume_off;
+
+	if (nvlist_lookup_uint64(begin_nvl,
+	    "resume_object", &resume_obj) != 0 ||
+	    nvlist_lookup_uint64(begin_nvl,
+	    "resume_offset", &resume_off) != 0) {
+		return (SET_ERROR(EINVAL));
+	}
+	VERIFY0(zap_lookup(mos, dsobj,
+	    DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val));
+	if (resume_obj != val)
+		return (SET_ERROR(EINVAL));
+	VERIFY0(zap_lookup(mos, dsobj,
+	    DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val));
+	if (resume_off != val)
+		return (SET_ERROR(EINVAL));
+
+	return (0);
+}
+
+/*
+ * Read in the stream's records, one by one, and apply them to the pool.  There
+ * are two threads involved; the thread that calls this function will spin up a
+ * worker thread, read the records off the stream one by one, and issue
+ * prefetches for any necessary indirect blocks.  It will then push the records
+ * onto an internal blocking queue.  The worker thread will pull the records off
+ * the queue, and actually write the data into the DMU.  This way, the worker
+ * thread doesn't have to wait for reads to complete, since everything it needs
+ * (the indirect blocks) will be prefetched.
+ *
+ * NB: callers *must* call dmu_recv_end() if this succeeds.
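+ *
+ * The expected calling sequence is therefore roughly the following
+ * (sketch only; the dmu_recv_begin() arguments are elided here):
+ *
+ *	dmu_recv_cookie_t drc;
+ *	offset_t off;
+ *
+ *	err = dmu_recv_begin(..., &drc, ...);
+ *	if (err == 0)
+ *		err = dmu_recv_stream(&drc, &off);
+ *	if (err == 0)
+ *		err = dmu_recv_end(&drc, owner);
+ *
+ * On failure, dmu_recv_stream() and dmu_recv_end() clean up (or, for
+ * resumable receives, preserve) the receive state themselves.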
+ */ +int +dmu_recv_stream(dmu_recv_cookie_t *drc, offset_t *voffp) +{ + int err = 0; + struct receive_writer_arg *rwa = kmem_zalloc(sizeof (*rwa), KM_SLEEP); + + if (dsl_dataset_is_zapified(drc->drc_ds)) { + uint64_t bytes; + (void) zap_lookup(drc->drc_ds->ds_dir->dd_pool->dp_meta_objset, + drc->drc_ds->ds_object, DS_FIELD_RESUME_BYTES, + sizeof (bytes), 1, &bytes); + drc->drc_bytes_read += bytes; + } + + drc->drc_ignore_objlist = objlist_create(); + + /* these were verified in dmu_recv_begin */ + ASSERT3U(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo), ==, + DMU_SUBSTREAM); + ASSERT3U(drc->drc_drrb->drr_type, <, DMU_OST_NUMTYPES); + + ASSERT(dsl_dataset_phys(drc->drc_ds)->ds_flags & DS_FLAG_INCONSISTENT); + ASSERT0(drc->drc_os->os_encrypted && + (drc->drc_featureflags & DMU_BACKUP_FEATURE_EMBED_DATA)); + + /* handle DSL encryption key payload */ + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RAW) { + nvlist_t *keynvl = NULL; + + ASSERT(drc->drc_os->os_encrypted); + ASSERT(drc->drc_raw); + + err = nvlist_lookup_nvlist(drc->drc_begin_nvl, "crypt_keydata", + &keynvl); + if (err != 0) + goto out; + + /* + * If this is a new dataset we set the key immediately. + * Otherwise we don't want to change the key until we + * are sure the rest of the receive succeeded so we stash + * the keynvl away until then. + */ + err = dsl_crypto_recv_raw(spa_name(drc->drc_os->os_spa), + drc->drc_ds->ds_object, drc->drc_fromsnapobj, + drc->drc_drrb->drr_type, keynvl, drc->drc_newfs); + if (err != 0) + goto out; + + /* see comment in dmu_recv_end_sync() */ + drc->drc_ivset_guid = 0; + (void) nvlist_lookup_uint64(keynvl, "to_ivset_guid", + &drc->drc_ivset_guid); + + if (!drc->drc_newfs) + drc->drc_keynvl = fnvlist_dup(keynvl); + } + + if (drc->drc_featureflags & DMU_BACKUP_FEATURE_RESUMING) { + err = resume_check(drc, drc->drc_begin_nvl); + if (err != 0) + goto out; + } + + /* + * If we failed before this point we will clean up any new resume + * state that was created. Now that we've gotten past the initial + * checks we are ok to retain that resume state. + */ + drc->drc_should_save = B_TRUE; + + (void) bqueue_init(&rwa->q, zfs_recv_queue_ff, + MAX(zfs_recv_queue_length, 2 * zfs_max_recordsize), + offsetof(struct receive_record_arg, node)); + cv_init(&rwa->cv, NULL, CV_DEFAULT, NULL); + mutex_init(&rwa->mutex, NULL, MUTEX_DEFAULT, NULL); + rwa->os = drc->drc_os; + rwa->byteswap = drc->drc_byteswap; + rwa->resumable = drc->drc_resumable; + rwa->raw = drc->drc_raw; + rwa->spill = drc->drc_spill; + rwa->full = (drc->drc_drr_begin->drr_u.drr_begin.drr_fromguid == 0); + rwa->os->os_raw_receive = drc->drc_raw; + list_create(&rwa->write_batch, sizeof (struct receive_record_arg), + offsetof(struct receive_record_arg, node.bqn_node)); + + (void) thread_create(NULL, 0, receive_writer_thread, rwa, 0, curproc, + TS_RUN, minclsyspri); + /* + * We're reading rwa->err without locks, which is safe since we are the + * only reader, and the worker thread is the only writer. It's ok if we + * miss a write for an iteration or two of the loop, since the writer + * thread will keep freeing records we send it until we send it an eos + * marker. + * + * We can leave this loop in 3 ways: First, if rwa->err is + * non-zero. In that case, the writer thread will free the rrd we just + * pushed. Second, if we're interrupted; in that case, either it's the + * first loop and drc->drc_rrd was never allocated, or it's later, and + * drc->drc_rrd has been handed off to the writer thread who will free + * it. 
Finally, if receive_read_record fails or we're at the end of the + * stream, then we free drc->drc_rrd and exit. + */ + while (rwa->err == 0) { + if (issig(JUSTLOOKING) && issig(FORREAL)) { + err = SET_ERROR(EINTR); + break; + } + + ASSERT3P(drc->drc_rrd, ==, NULL); + drc->drc_rrd = drc->drc_next_rrd; + drc->drc_next_rrd = NULL; + /* Allocates and loads header into drc->drc_next_rrd */ + err = receive_read_record(drc); + + if (drc->drc_rrd->header.drr_type == DRR_END || err != 0) { + kmem_free(drc->drc_rrd, sizeof (*drc->drc_rrd)); + drc->drc_rrd = NULL; + break; + } + + bqueue_enqueue(&rwa->q, drc->drc_rrd, + sizeof (struct receive_record_arg) + + drc->drc_rrd->payload_size); + drc->drc_rrd = NULL; + } + + ASSERT3P(drc->drc_rrd, ==, NULL); + drc->drc_rrd = kmem_zalloc(sizeof (*drc->drc_rrd), KM_SLEEP); + drc->drc_rrd->eos_marker = B_TRUE; + bqueue_enqueue_flush(&rwa->q, drc->drc_rrd, 1); + + mutex_enter(&rwa->mutex); + while (!rwa->done) { + /* + * We need to use cv_wait_sig() so that any process that may + * be sleeping here can still fork. + */ + (void) cv_wait_sig(&rwa->cv, &rwa->mutex); + } + mutex_exit(&rwa->mutex); + + /* + * If we are receiving a full stream as a clone, all object IDs which + * are greater than the maximum ID referenced in the stream are + * by definition unused and must be freed. + */ + if (drc->drc_clone && drc->drc_drrb->drr_fromguid == 0) { + uint64_t obj = rwa->max_object + 1; + int free_err = 0; + int next_err = 0; + + while (next_err == 0) { + free_err = dmu_free_long_object(rwa->os, obj); + if (free_err != 0 && free_err != ENOENT) + break; + + next_err = dmu_object_next(rwa->os, &obj, FALSE, 0); + } + + if (err == 0) { + if (free_err != 0 && free_err != ENOENT) + err = free_err; + else if (next_err != ESRCH) + err = next_err; + } + } + + cv_destroy(&rwa->cv); + mutex_destroy(&rwa->mutex); + bqueue_destroy(&rwa->q); + list_destroy(&rwa->write_batch); + if (err == 0) + err = rwa->err; + +out: + /* + * If we hit an error before we started the receive_writer_thread + * we need to clean up the next_rrd we create by processing the + * DRR_BEGIN record. + */ + if (drc->drc_next_rrd != NULL) + kmem_free(drc->drc_next_rrd, sizeof (*drc->drc_next_rrd)); + + /* + * The objset will be invalidated by dmu_recv_end() when we do + * dsl_dataset_clone_swap_sync_impl(). + */ + drc->drc_os = NULL; + + kmem_free(rwa, sizeof (*rwa)); + nvlist_free(drc->drc_begin_nvl); + + if (err != 0) { + /* + * Clean up references. If receive is not resumable, + * destroy what we created, so we don't leave it in + * the inconsistent state. + */ + dmu_recv_cleanup_ds(drc); + nvlist_free(drc->drc_keynvl); + } + + objlist_destroy(drc->drc_ignore_objlist); + drc->drc_ignore_objlist = NULL; + *voffp = drc->drc_voff; + return (err); +} + +static int +dmu_recv_end_check(void *arg, dmu_tx_t *tx) +{ + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int error; + + ASSERT3P(drc->drc_ds->ds_owner, ==, dmu_recv_tag); + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + error = dsl_dataset_hold(dp, drc->drc_tofs, FTAG, &origin_head); + if (error != 0) + return (error); + if (drc->drc_force) { + /* + * We will destroy any snapshots in tofs (i.e. before + * origin_head) that are after the origin (which is + * the snap before drc_ds, because drc_ds can not + * have any snaps of its own). 
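+ *
+ * For example, if origin_head's snapshot chain is
+ * origin <- S1 <- S2 (S2 newest), the loop below starts at S2
+ * (origin_head's ds_prev_snap_obj), checks that S2 and then S1 may
+ * be destroyed, and stops when it reaches the origin (drc_ds's
+ * ds_prev_snap_obj).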
+ */ + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { + dsl_dataset_t *snap; + error = dsl_dataset_hold_obj(dp, obj, FTAG, + &snap); + if (error != 0) + break; + if (snap->ds_dir != origin_head->ds_dir) + error = SET_ERROR(EINVAL); + if (error == 0) { + error = dsl_destroy_snapshot_check_impl( + snap, B_FALSE); + } + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + if (error != 0) + break; + } + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + } + if (drc->drc_keynvl != NULL) { + error = dsl_crypto_recv_raw_key_check(drc->drc_ds, + drc->drc_keynvl, tx); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + } + + error = dsl_dataset_clone_swap_check_impl(drc->drc_ds, + origin_head, drc->drc_force, drc->drc_owner, tx); + if (error != 0) { + dsl_dataset_rele(origin_head, FTAG); + return (error); + } + error = dsl_dataset_snapshot_check_impl(origin_head, + drc->drc_tosnap, tx, B_TRUE, 1, + drc->drc_cred, drc->drc_proc); + dsl_dataset_rele(origin_head, FTAG); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(drc->drc_ds, 1); + } else { + error = dsl_dataset_snapshot_check_impl(drc->drc_ds, + drc->drc_tosnap, tx, B_TRUE, 1, + drc->drc_cred, drc->drc_proc); + } + return (error); +} + +static void +dmu_recv_end_sync(void *arg, dmu_tx_t *tx) +{ + dmu_recv_cookie_t *drc = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + boolean_t encrypted = drc->drc_ds->ds_dir->dd_crypto_obj != 0; + uint64_t newsnapobj; + + spa_history_log_internal_ds(drc->drc_ds, "finish receiving", + tx, "snap=%s", drc->drc_tosnap); + drc->drc_ds->ds_objset->os_raw_receive = B_FALSE; + + if (!drc->drc_newfs) { + dsl_dataset_t *origin_head; + + VERIFY0(dsl_dataset_hold(dp, drc->drc_tofs, FTAG, + &origin_head)); + + if (drc->drc_force) { + /* + * Destroy any snapshots of drc_tofs (origin_head) + * after the origin (the snap before drc_ds). + */ + uint64_t obj; + + obj = dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + while (obj != + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj) { + dsl_dataset_t *snap; + VERIFY0(dsl_dataset_hold_obj(dp, obj, FTAG, + &snap)); + ASSERT3P(snap->ds_dir, ==, origin_head->ds_dir); + obj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + dsl_destroy_snapshot_sync_impl(snap, + B_FALSE, tx); + dsl_dataset_rele(snap, FTAG); + } + } + if (drc->drc_keynvl != NULL) { + dsl_crypto_recv_raw_key_sync(drc->drc_ds, + drc->drc_keynvl, tx); + nvlist_free(drc->drc_keynvl); + drc->drc_keynvl = NULL; + } + + VERIFY3P(drc->drc_ds->ds_prev, ==, + origin_head->ds_prev); + + dsl_dataset_clone_swap_sync_impl(drc->drc_ds, + origin_head, tx); + /* + * The objset was evicted by dsl_dataset_clone_swap_sync_impl, + * so drc_os is no longer valid. 
+ */ + drc->drc_os = NULL; + + dsl_dataset_snapshot_sync_impl(origin_head, + drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(origin_head->ds_prev->ds_dbuf, tx); + dsl_dataset_phys(origin_head->ds_prev)->ds_creation_time = + drc->drc_drrb->drr_creation_time; + dsl_dataset_phys(origin_head->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(origin_head->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + dsl_dataset_phys(origin_head)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + newsnapobj = + dsl_dataset_phys(origin_head)->ds_prev_snap_obj; + + dsl_dataset_rele(origin_head, FTAG); + dsl_destroy_head_sync_impl(drc->drc_ds, tx); + + if (drc->drc_owner != NULL) + VERIFY3P(origin_head->ds_owner, ==, drc->drc_owner); + } else { + dsl_dataset_t *ds = drc->drc_ds; + + dsl_dataset_snapshot_sync_impl(ds, drc->drc_tosnap, tx); + + /* set snapshot's creation time and guid */ + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + dsl_dataset_phys(ds->ds_prev)->ds_creation_time = + drc->drc_drrb->drr_creation_time; + dsl_dataset_phys(ds->ds_prev)->ds_guid = + drc->drc_drrb->drr_toguid; + dsl_dataset_phys(ds->ds_prev)->ds_flags &= + ~DS_FLAG_INCONSISTENT; + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags &= ~DS_FLAG_INCONSISTENT; + if (dsl_dataset_has_resume_receive_state(ds)) { + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, tx); + (void) zap_remove(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, tx); + } + newsnapobj = + dsl_dataset_phys(drc->drc_ds)->ds_prev_snap_obj; + } + + /* + * If this is a raw receive, the crypt_keydata nvlist will include + * a to_ivset_guid for us to set on the new snapshot. This value + * will override the value generated by the snapshot code. However, + * this value may not be present, because older implementations of + * the raw send code did not include this value, and we are still + * allowed to receive them if the zfs_disable_ivset_guid_check + * tunable is set, in which case we will leave the newly-generated + * value. + */ + if (drc->drc_raw && drc->drc_ivset_guid != 0) { + dmu_object_zapify(dp->dp_meta_objset, newsnapobj, + DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_update(dp->dp_meta_objset, newsnapobj, + DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, + &drc->drc_ivset_guid, tx)); + } + + /* + * Release the hold from dmu_recv_begin. This must be done before + * we return to open context, so that when we free the dataset's dnode + * we can evict its bonus buffer. Since the dataset may be destroyed + * at this point (and therefore won't have a valid pointer to the spa) + * we release the key mapping manually here while we do have a valid + * pointer, if it exists. 
+ */
+	if (!drc->drc_raw && encrypted) {
+		(void) spa_keystore_remove_mapping(dmu_tx_pool(tx)->dp_spa,
+		    drc->drc_ds->ds_object, drc->drc_ds);
+	}
+	dsl_dataset_disown(drc->drc_ds, 0, dmu_recv_tag);
+	drc->drc_ds = NULL;
+}
+
+static int dmu_recv_end_modified_blocks = 3;
+
+static int
+dmu_recv_existing_end(dmu_recv_cookie_t *drc)
+{
+#ifdef _KERNEL
+	/*
+	 * We will be destroying the ds; make sure its origin is unmounted if
+	 * necessary.
+	 */
+	char name[ZFS_MAX_DATASET_NAME_LEN];
+	dsl_dataset_name(drc->drc_ds, name);
+	zfs_destroy_unmount_origin(name);
+#endif
+
+	return (dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+static int
+dmu_recv_new_end(dmu_recv_cookie_t *drc)
+{
+	return (dsl_sync_task(drc->drc_tofs,
+	    dmu_recv_end_check, dmu_recv_end_sync, drc,
+	    dmu_recv_end_modified_blocks, ZFS_SPACE_CHECK_NORMAL));
+}
+
+int
+dmu_recv_end(dmu_recv_cookie_t *drc, void *owner)
+{
+	int error;
+
+	drc->drc_owner = owner;
+
+	if (drc->drc_newfs)
+		error = dmu_recv_new_end(drc);
+	else
+		error = dmu_recv_existing_end(drc);
+
+	if (error != 0) {
+		dmu_recv_cleanup_ds(drc);
+		nvlist_free(drc->drc_keynvl);
+	} else {
+		if (drc->drc_newfs) {
+			zvol_create_minor(drc->drc_tofs);
+		}
+		char *snapname = kmem_asprintf("%s@%s",
+		    drc->drc_tofs, drc->drc_tosnap);
+		zvol_create_minor(snapname);
+		kmem_strfree(snapname);
+	}
+	return (error);
+}
+
+/*
+ * Return TRUE if this objset is currently being received into.
+ */
+boolean_t
+dmu_objset_is_receiving(objset_t *os)
+{
+	return (os->os_dsl_dataset != NULL &&
+	    os->os_dsl_dataset->ds_owner == dmu_recv_tag);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_length, INT, ZMOD_RW,
+	"Maximum receive queue length");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, queue_ff, INT, ZMOD_RW,
+	"Receive queue fill fraction");
+
+ZFS_MODULE_PARAM(zfs_recv, zfs_recv_, write_batch_size, INT, ZMOD_RW,
+	"Maximum amount of writes to batch into one transaction");
+/* END CSTYLED */
diff --git a/module/zfs/dmu_redact.c b/module/zfs/dmu_redact.c
new file mode 100644
index 000000000000..df10d8d6faae
--- /dev/null
+++ b/module/zfs/dmu_redact.c
@@ -0,0 +1,1186 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2017, 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/txg.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_redact.h>
+#include <sys/bqueue.h>
+#include <sys/objlist.h>
+#include <sys/dmu_tx.h>
+#ifdef _KERNEL
+#include <sys/zfs_vfsops.h>
+#include <sys/zap.h>
+#include <sys/zfs_znode.h>
+#endif
+
+/*
+ * This controls the number of entries in the buffer the redaction_list_update
+ * synctask uses to buffer writes to the redaction list.
+ */
+int redact_sync_bufsize = 1024;
+
+/*
+ * Controls how often to update the redaction list when creating a redaction
+ * list.
+ */ +uint64_t redaction_list_update_interval_ns = 1000 * 1000 * 1000ULL; /* NS */ + +/* + * This tunable controls the length of the queues that zfs redact worker threads + * use to communicate. If the dmu_redact_snap thread is blocking on these + * queues, this variable may need to be increased. If there is a significant + * slowdown at the start of a redact operation as these threads consume all the + * available IO resources, or the queues are consuming too much memory, this + * variable may need to be decreased. + */ +int zfs_redact_queue_length = 1024 * 1024; + +/* + * These tunables control the fill fraction of the queues by zfs redact. The + * fill fraction controls the frequency with which threads have to be + * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these + * should be tuned down. If the queues empty before the signalled thread can + * catch up, then these should be tuned up. + */ +uint64_t zfs_redact_queue_ff = 20; + +struct redact_record { + bqueue_node_t ln; + boolean_t eos_marker; /* Marks the end of the stream */ + uint64_t start_object; + uint64_t start_blkid; + uint64_t end_object; + uint64_t end_blkid; + uint8_t indblkshift; + uint32_t datablksz; +}; + +struct redact_thread_arg { + bqueue_t q; + objset_t *os; /* Objset to traverse */ + dsl_dataset_t *ds; /* Dataset to traverse */ + struct redact_record *current_record; + int error_code; + boolean_t cancel; + zbookmark_phys_t resume; + objlist_t *deleted_objs; + uint64_t *num_blocks_visited; + uint64_t ignore_object; /* ignore further callbacks on this */ + uint64_t txg; /* txg to traverse since */ +}; + +/* + * The redaction node is a wrapper around the redaction record that is used + * by the redaction merging thread to sort the records and determine overlaps. + * + * It contains two nodes; one sorts the records by their start_zb, and the other + * sorts the records by their end_zb. + */ +struct redact_node { + avl_node_t avl_node_start; + avl_node_t avl_node_end; + struct redact_record *record; + struct redact_thread_arg *rt_arg; + uint32_t thread_num; +}; + +struct merge_data { + list_t md_redact_block_pending; + redact_block_phys_t md_coalesce_block; + uint64_t md_last_time; + redact_block_phys_t md_furthest[TXG_SIZE]; + /* Lists of struct redact_block_list_node. */ + list_t md_blocks[TXG_SIZE]; + boolean_t md_synctask_txg[TXG_SIZE]; + uint64_t md_latest_synctask_txg; + redaction_list_t *md_redaction_list; +}; + +/* + * A wrapper around struct redact_block so it can be stored in a list_t. + */ +struct redact_block_list_node { + redact_block_phys_t block; + list_node_t node; +}; + +/* + * We've found a new redaction candidate. In order to improve performance, we + * coalesce these blocks when they're adjacent to each other. This function + * handles that. If the new candidate block range is immediately after the + * range we're building, coalesce it into the range we're building. Otherwise, + * put the record we're building on the queue, and update the build pointer to + * point to the new record. 
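+ *
+ * For example, if *build covers blocks [10, 20] of object 5 and the
+ * new candidate covers [21, 30] of the same object, *build simply
+ * grows to [10, 30].  If the new candidate instead covers [40, 50],
+ * the [10, 20] record is enqueued and *build now points at the
+ * [40, 50] record.  Ranges also coalesce across an object boundary,
+ * when the current range runs to blkid UINT64_MAX and the next one
+ * starts at blkid 0 of the following object.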
+ */ +static void +record_merge_enqueue(bqueue_t *q, struct redact_record **build, + struct redact_record *new) +{ + if (new->eos_marker) { + if (*build != NULL) + bqueue_enqueue(q, *build, sizeof (*build)); + bqueue_enqueue_flush(q, new, sizeof (*new)); + return; + } + if (*build == NULL) { + *build = new; + return; + } + struct redact_record *curbuild = *build; + if ((curbuild->end_object == new->start_object && + curbuild->end_blkid + 1 == new->start_blkid && + curbuild->end_blkid != UINT64_MAX) || + (curbuild->end_object + 1 == new->start_object && + curbuild->end_blkid == UINT64_MAX && new->start_blkid == 0)) { + curbuild->end_object = new->end_object; + curbuild->end_blkid = new->end_blkid; + kmem_free(new, sizeof (*new)); + } else { + bqueue_enqueue(q, curbuild, sizeof (*curbuild)); + *build = new; + } +} +#ifdef _KERNEL +struct objnode { + avl_node_t node; + uint64_t obj; +}; + +static int +objnode_compare(const void *o1, const void *o2) +{ + const struct objnode *obj1 = o1; + const struct objnode *obj2 = o2; + if (obj1->obj < obj2->obj) + return (-1); + if (obj1->obj > obj2->obj) + return (1); + return (0); +} + + +static objlist_t * +zfs_get_deleteq(objset_t *os) +{ + objlist_t *deleteq_objlist = objlist_create(); + uint64_t deleteq_obj; + zap_cursor_t zc; + zap_attribute_t za; + dmu_object_info_t doi; + + ASSERT3U(os->os_phys->os_type, ==, DMU_OST_ZFS); + VERIFY0(dmu_object_info(os, MASTER_NODE_OBJ, &doi)); + ASSERT3U(doi.doi_type, ==, DMU_OT_MASTER_NODE); + + VERIFY0(zap_lookup(os, MASTER_NODE_OBJ, + ZFS_UNLINKED_SET, sizeof (uint64_t), 1, &deleteq_obj)); + + /* + * In order to insert objects into the objlist, they must be in sorted + * order. We don't know what order we'll get them out of the ZAP in, so + * we insert them into and remove them from an avl_tree_t to sort them. + */ + avl_tree_t at; + avl_create(&at, objnode_compare, sizeof (struct objnode), + offsetof(struct objnode, node)); + + for (zap_cursor_init(&zc, os, deleteq_obj); + zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc)) { + struct objnode *obj = kmem_zalloc(sizeof (*obj), KM_SLEEP); + obj->obj = za.za_first_integer; + avl_add(&at, obj); + } + zap_cursor_fini(&zc); + + struct objnode *next, *found = avl_first(&at); + while (found != NULL) { + next = AVL_NEXT(&at, found); + objlist_insert(deleteq_objlist, found->obj); + found = next; + } + + void *cookie = NULL; + while ((found = avl_destroy_nodes(&at, &cookie)) != NULL) + kmem_free(found, sizeof (*found)); + avl_destroy(&at); + return (deleteq_objlist); +} +#endif + +/* + * This is the callback function to traverse_dataset for the redaction threads + * for dmu_redact_snap. This thread is responsible for creating redaction + * records for all the data that is modified by the snapshots we're redacting + * with respect to. Redaction records represent ranges of data that have been + * modified by one of the redaction snapshots, and are stored in the + * redact_record struct. We need to create redaction records for three + * cases: + * + * First, if there's a normal write, we need to create a redaction record for + * that block. + * + * Second, if there's a hole, we need to create a redaction record that covers + * the whole range of the hole. If the hole is in the meta-dnode, it must cover + * every block in all of the objects in the hole. + * + * Third, if there is a deleted object, we need to create a redaction record for + * all of the blocks in that object. 
+ */ +/*ARGSUSED*/ +static int +redact_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct redact_thread_arg *rta = arg; + struct redact_record *record; + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= rta->resume.zb_object); + + if (rta->cancel) + return (SET_ERROR(EINTR)); + + if (rta->ignore_object == zb->zb_object) + return (0); + + /* + * If we're visiting a dnode, we need to handle the case where the + * object has been deleted. + */ + if (zb->zb_level == ZB_DNODE_LEVEL) { + ASSERT3U(zb->zb_level, ==, ZB_DNODE_LEVEL); + + if (zb->zb_object == 0) + return (0); + + /* + * If the object has been deleted, redact all of the blocks in + * it. + */ + if (dnp->dn_type == DMU_OT_NONE || + objlist_exists(rta->deleted_objs, zb->zb_object)) { + rta->ignore_object = zb->zb_object; + record = kmem_zalloc(sizeof (struct redact_record), + KM_SLEEP); + + record->eos_marker = B_FALSE; + record->start_object = record->end_object = + zb->zb_object; + record->start_blkid = 0; + record->end_blkid = UINT64_MAX; + record_merge_enqueue(&rta->q, + &rta->current_record, record); + } + return (0); + } else if (zb->zb_level < 0) { + return (0); + } else if (zb->zb_level > 0 && !BP_IS_HOLE(bp)) { + /* + * If this is an indirect block, but not a hole, it doesn't + * provide any useful information for redaction, so ignore it. + */ + return (0); + } + + /* + * At this point, there are two options left for the type of block we're + * looking at. Either this is a hole (which could be in the dnode or + * the meta-dnode), or it's a level 0 block of some sort. If it's a + * hole, we create a redaction record that covers the whole range. If + * the hole is in a dnode, we need to redact all the blocks in that + * hole. If the hole is in the meta-dnode, we instead need to redact + * all blocks in every object covered by that hole. If it's a level 0 + * block, we only need to redact that single block. 
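+ *
+ * As a worked example of the hole math below: with 128K indirect
+ * blocks (indblkshift = 17), each indirect block holds 1024 128-byte
+ * block pointers, so a level-1 hole at zb_blkid 2 spans L0 blocks
+ * [2048, 3071].  A hole in the meta-dnode is additionally converted
+ * from a range of dnode-block ids into a range of object numbers,
+ * using the number of dnodes that fit in one dnode block.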
+ */ + record = kmem_zalloc(sizeof (struct redact_record), KM_SLEEP); + record->eos_marker = B_FALSE; + + record->start_object = record->end_object = zb->zb_object; + if (BP_IS_HOLE(bp)) { + record->start_blkid = zb->zb_blkid * + bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); + + record->end_blkid = ((zb->zb_blkid + 1) * + bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level)) - 1; + + if (zb->zb_object == DMU_META_DNODE_OBJECT) { + record->start_object = record->start_blkid * + ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / + sizeof (dnode_phys_t)); + record->start_blkid = 0; + record->end_object = ((record->end_blkid + + 1) * ((SPA_MINBLOCKSIZE * dnp->dn_datablkszsec) / + sizeof (dnode_phys_t))) - 1; + record->end_blkid = UINT64_MAX; + } + } else if (zb->zb_level != 0 || + zb->zb_object == DMU_META_DNODE_OBJECT) { + kmem_free(record, sizeof (*record)); + return (0); + } else { + record->start_blkid = record->end_blkid = zb->zb_blkid; + } + record->indblkshift = dnp->dn_indblkshift; + record->datablksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + record_merge_enqueue(&rta->q, &rta->current_record, record); + + return (0); +} + +static void +redact_traverse_thread(void *arg) +{ + struct redact_thread_arg *rt_arg = arg; + int err; + struct redact_record *data; +#ifdef _KERNEL + if (rt_arg->os->os_phys->os_type == DMU_OST_ZFS) + rt_arg->deleted_objs = zfs_get_deleteq(rt_arg->os); + else + rt_arg->deleted_objs = objlist_create(); +#else + rt_arg->deleted_objs = objlist_create(); +#endif + + err = traverse_dataset_resume(rt_arg->ds, rt_arg->txg, + &rt_arg->resume, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + redact_cb, rt_arg); + + if (err != EINTR) + rt_arg->error_code = err; + objlist_destroy(rt_arg->deleted_objs); + data = kmem_zalloc(sizeof (*data), KM_SLEEP); + data->eos_marker = B_TRUE; + record_merge_enqueue(&rt_arg->q, &rt_arg->current_record, data); + thread_exit(); +} + +static inline void +create_zbookmark_from_obj_off(zbookmark_phys_t *zb, uint64_t object, + uint64_t blkid) +{ + zb->zb_object = object; + zb->zb_level = 0; + zb->zb_blkid = blkid; +} + +/* + * This is a utility function that can do the comparison for the start or ends + * of the ranges in a redact_record. + */ +static int +redact_range_compare(uint64_t obj1, uint64_t off1, uint32_t dbss1, + uint64_t obj2, uint64_t off2, uint32_t dbss2) +{ + zbookmark_phys_t z1, z2; + create_zbookmark_from_obj_off(&z1, obj1, off1); + create_zbookmark_from_obj_off(&z2, obj2, off2); + + return (zbookmark_compare(dbss1 >> SPA_MINBLOCKSHIFT, 0, + dbss2 >> SPA_MINBLOCKSHIFT, 0, &z1, &z2)); +} + +/* + * Compare two redaction records by their range's start location. Also makes + * eos records always compare last. We use the thread number in the redact_node + * to ensure that records do not compare equal (which is not allowed in our avl + * trees). + */ +static int +redact_node_compare_start(const void *arg1, const void *arg2) +{ + const struct redact_node *rn1 = arg1; + const struct redact_node *rn2 = arg2; + const struct redact_record *rr1 = rn1->record; + const struct redact_record *rr2 = rn2->record; + if (rr1->eos_marker) + return (1); + if (rr2->eos_marker) + return (-1); + + int cmp = redact_range_compare(rr1->start_object, rr1->start_blkid, + rr1->datablksz, rr2->start_object, rr2->start_blkid, + rr2->datablksz); + if (cmp == 0) + cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); + return (cmp); +} + +/* + * Compare two redaction records by their range's end location. Also makes + * eos records always compare last. 
We use the thread number in the redact_node + * to ensure that records do not compare equal (which is not allowed in our avl + * trees). + */ +static int +redact_node_compare_end(const void *arg1, const void *arg2) +{ + const struct redact_node *rn1 = arg1; + const struct redact_node *rn2 = arg2; + const struct redact_record *srr1 = rn1->record; + const struct redact_record *srr2 = rn2->record; + if (srr1->eos_marker) + return (1); + if (srr2->eos_marker) + return (-1); + + int cmp = redact_range_compare(srr1->end_object, srr1->end_blkid, + srr1->datablksz, srr2->end_object, srr2->end_blkid, + srr2->datablksz); + if (cmp == 0) + cmp = (rn1->thread_num < rn2->thread_num ? -1 : 1); + return (cmp); +} + +/* + * Utility function that compares two redaction records to determine if any part + * of the "from" record is before any part of the "to" record. Also causes End + * of Stream redaction records to compare after all others, so that the + * redaction merging logic can stay simple. + */ +static boolean_t +redact_record_before(const struct redact_record *from, + const struct redact_record *to) +{ + if (from->eos_marker == B_TRUE) + return (B_FALSE); + else if (to->eos_marker == B_TRUE) + return (B_TRUE); + return (redact_range_compare(from->start_object, from->start_blkid, + from->datablksz, to->end_object, to->end_blkid, + to->datablksz) <= 0); +} + +/* + * Pop a new redaction record off the queue, check that the records are in the + * right order, and free the old data. + */ +static struct redact_record * +get_next_redact_record(bqueue_t *bq, struct redact_record *prev) +{ + struct redact_record *next = bqueue_dequeue(bq); + ASSERT(redact_record_before(prev, next)); + kmem_free(prev, sizeof (*prev)); + return (next); +} + +/* + * Remove the given redaction node from both trees, pull a new redaction record + * off the queue, free the old redaction record, update the redaction node, and + * reinsert the node into the trees. + */ +static int +update_avl_trees(avl_tree_t *start_tree, avl_tree_t *end_tree, + struct redact_node *redact_node) +{ + avl_remove(start_tree, redact_node); + avl_remove(end_tree, redact_node); + redact_node->record = get_next_redact_record(&redact_node->rt_arg->q, + redact_node->record); + avl_add(end_tree, redact_node); + avl_add(start_tree, redact_node); + return (redact_node->rt_arg->error_code); +} + +/* + * Synctask for updating redaction lists. We first take this txg's list of + * redacted blocks and append those to the redaction list. We then update the + * redaction list's bonus buffer. We store the furthest blocks we visited and + * the list of snapshots that we're redacting with respect to. We need these so + * that redacted sends and receives can be correctly resumed. 
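+ *
+ * Appends to the list are buffered: up to redact_sync_bufsize entries
+ * are staged and then written with dmu_write() at byte offset
+ * rlp_num_entries * sizeof (redact_block_phys_t), so the on-disk
+ * redaction list is simply a flat, append-only array of
+ * redact_block_phys_t entries.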
+ */ +static void +redaction_list_update_sync(void *arg, dmu_tx_t *tx) +{ + struct merge_data *md = arg; + uint64_t txg = dmu_tx_get_txg(tx); + list_t *list = &md->md_blocks[txg & TXG_MASK]; + redact_block_phys_t *furthest_visited = + &md->md_furthest[txg & TXG_MASK]; + objset_t *mos = tx->tx_pool->dp_meta_objset; + redaction_list_t *rl = md->md_redaction_list; + int bufsize = redact_sync_bufsize; + redact_block_phys_t *buf = kmem_alloc(bufsize * sizeof (*buf), + KM_SLEEP); + int index = 0; + + dmu_buf_will_dirty(rl->rl_dbuf, tx); + + for (struct redact_block_list_node *rbln = list_remove_head(list); + rbln != NULL; rbln = list_remove_head(list)) { + ASSERT3U(rbln->block.rbp_object, <=, + furthest_visited->rbp_object); + ASSERT(rbln->block.rbp_object < furthest_visited->rbp_object || + rbln->block.rbp_blkid <= furthest_visited->rbp_blkid); + buf[index] = rbln->block; + index++; + if (index == bufsize) { + dmu_write(mos, rl->rl_object, + rl->rl_phys->rlp_num_entries * sizeof (*buf), + bufsize * sizeof (*buf), buf, tx); + rl->rl_phys->rlp_num_entries += bufsize; + index = 0; + } + kmem_free(rbln, sizeof (*rbln)); + } + if (index > 0) { + dmu_write(mos, rl->rl_object, rl->rl_phys->rlp_num_entries * + sizeof (*buf), index * sizeof (*buf), buf, tx); + rl->rl_phys->rlp_num_entries += index; + } + kmem_free(buf, bufsize * sizeof (*buf)); + + md->md_synctask_txg[txg & TXG_MASK] = B_FALSE; + rl->rl_phys->rlp_last_object = furthest_visited->rbp_object; + rl->rl_phys->rlp_last_blkid = furthest_visited->rbp_blkid; +} + +static void +commit_rl_updates(objset_t *os, struct merge_data *md, uint64_t object, + uint64_t blkid) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(os->os_spa)->dp_mos_dir); + dmu_tx_hold_space(tx, sizeof (struct redact_block_list_node)); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + if (!md->md_synctask_txg[txg & TXG_MASK]) { + dsl_sync_task_nowait(dmu_tx_pool(tx), + redaction_list_update_sync, md, 5, ZFS_SPACE_CHECK_NONE, + tx); + md->md_synctask_txg[txg & TXG_MASK] = B_TRUE; + md->md_latest_synctask_txg = txg; + } + md->md_furthest[txg & TXG_MASK].rbp_object = object; + md->md_furthest[txg & TXG_MASK].rbp_blkid = blkid; + list_move_tail(&md->md_blocks[txg & TXG_MASK], + &md->md_redact_block_pending); + dmu_tx_commit(tx); + md->md_last_time = gethrtime(); +} + +/* + * We want to store the list of blocks that we're redacting in the bookmark's + * redaction list. However, this list is stored in the MOS, which means it can + * only be written to in syncing context. To get around this, we create a + * synctask that will write to the mos for us. We tell it what to write by + * a linked list for each current transaction group; every time we decide to + * redact a block, we append it to the transaction group that is currently in + * open context. We also update some progress information that the synctask + * will store to enable resumable redacted sends. 
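+ *
+ * In outline (a summary of the code above and below, not a separate
+ * mechanism):
+ *
+ *	1. update_redaction_list() (open context) appends each newly redacted
+ *	   block to md_redact_block_pending, coalescing adjacent blocks.
+ *	2. commit_rl_updates() (open context) moves that pending list onto
+ *	   md_blocks[txg & TXG_MASK] and registers
+ *	   redaction_list_update_sync() for that txg.
+ *	3. redaction_list_update_sync() (syncing context) drains the txg's
+ *	   list and dmu_write()s the entries to the redaction list object in
+ *	   the MOS.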
+ */
+static void
+update_redaction_list(struct merge_data *md, objset_t *os,
+    uint64_t object, uint64_t blkid, uint64_t endblkid, uint32_t blksz)
+{
+	boolean_t enqueue = B_FALSE;
+	redact_block_phys_t cur = {0};
+	uint64_t count = endblkid - blkid + 1;
+	while (count > REDACT_BLOCK_MAX_COUNT) {
+		update_redaction_list(md, os, object, blkid,
+		    blkid + REDACT_BLOCK_MAX_COUNT - 1, blksz);
+		blkid += REDACT_BLOCK_MAX_COUNT;
+		count -= REDACT_BLOCK_MAX_COUNT;
+	}
+	redact_block_phys_t *coalesce = &md->md_coalesce_block;
+	boolean_t new;
+	if (coalesce->rbp_size_count == 0) {
+		new = B_TRUE;
+		enqueue = B_FALSE;
+	} else {
+		uint64_t old_count = redact_block_get_count(coalesce);
+		if (coalesce->rbp_object == object &&
+		    coalesce->rbp_blkid + old_count == blkid &&
+		    old_count + count <= REDACT_BLOCK_MAX_COUNT) {
+			ASSERT3U(redact_block_get_size(coalesce), ==, blksz);
+			redact_block_set_count(coalesce, old_count + count);
+			new = B_FALSE;
+			enqueue = B_FALSE;
+		} else {
+			new = B_TRUE;
+			enqueue = B_TRUE;
+		}
+	}
+
+	if (new) {
+		cur = *coalesce;
+		coalesce->rbp_blkid = blkid;
+		coalesce->rbp_object = object;
+
+		redact_block_set_count(coalesce, count);
+		redact_block_set_size(coalesce, blksz);
+	}
+
+	if (enqueue && redact_block_get_size(&cur) != 0) {
+		struct redact_block_list_node *rbln =
+		    kmem_alloc(sizeof (struct redact_block_list_node),
+		    KM_SLEEP);
+		rbln->block = cur;
+		list_insert_tail(&md->md_redact_block_pending, rbln);
+	}
+
+	if (gethrtime() > md->md_last_time +
+	    redaction_list_update_interval_ns) {
+		commit_rl_updates(os, md, object, blkid);
+	}
+}
+
+/*
+ * This thread merges all the redaction records provided by the worker threads,
+ * and determines which blocks are redacted by all the snapshots. The algorithm
+ * for doing so is similar to performing a merge in mergesort with n sub-lists
+ * instead of 2, with some added complexity due to the fact that the entries
+ * are ranges, not just single blocks. This algorithm relies on the fact that
+ * the queues are sorted, which is ensured by the fact that traverse_dataset
+ * traverses the dataset in a consistent order. We pull one entry off the front
+ * of the queues of each secure dataset traversal thread. Then we repeat the
+ * following: each record represents a range of blocks modified by one of the
+ * redaction snapshots, and each block in that range may need to be redacted in
+ * the send stream. Find the record with the latest start of its range, and the
+ * record with the earliest end of its range. If the last start is before the
+ * first end, then we know that the blocks in the range [last_start, first_end]
+ * are covered by all of the ranges at the front of the queues, which means
+ * every thread redacts that whole range. For example, let's say the ranges on
+ * each queue look like this:
+ *
+ *	Block Id   1  2  3  4  5  6  7  8  9 10 11
+ *	Thread 1 |    [====================]
+ *	Thread 2 |       [========]
+ *	Thread 3 |             [=================]
+ *
+ * Thread 3 has the last start (5), and thread 2 has the first end (6). All
+ * three threads modified the range [5,6], so that data should not be sent over
+ * the wire. After we've determined whether or not to redact anything, we take
+ * the record with the first end. We discard that record, and pull a new one
+ * off the front of the queue it came from. In the above example, we would
+ * discard Thread 2's record, and pull a new one. Let's say the next record we
+ * pulled from Thread 2 covered range [10,11].
The new layout would look like
+ * this:
+ *
+ *	Block Id   1  2  3  4  5  6  7  8  9 10 11
+ *	Thread 1 |    [====================]
+ *	Thread 2 |                          [==]
+ *	Thread 3 |             [=================]
+ *
+ * When we compare the last start (10, from Thread 2) and the first end (9,
+ * from Thread 1), we see that the last start is greater than the first end.
+ * Therefore, we do not redact anything from these records. We'll iterate by
+ * replacing the record from Thread 1.
+ *
+ * We iterate by replacing the record with the lowest end because we know
+ * that the record with the lowest end has helped us as much as it can. All
+ * the ranges before it that we will ever redact have been redacted. In
+ * addition, by replacing the one with the lowest end, we guarantee we catch
+ * all ranges that need to be redacted. For example, if in the case above we
+ * had replaced the record from Thread 1 instead, we might have ended up with
+ * the following:
+ *
+ *	Block Id   1  2  3  4  5  6  7  8  9 10 11 12
+ *	Thread 1 |                                 [==]
+ *	Thread 2 |       [========]
+ *	Thread 3 |             [=================]
+ *
+ * If the next record from Thread 2 had been [8,10], for example, we should
+ * have redacted part of that range, but because we updated Thread 1's record,
+ * we missed it.
+ *
+ * We implement this algorithm by using two trees. The first sorts the
+ * redaction records by their start_zb, and the second sorts them by their
+ * end_zb. We use these to find the record with the last start and the record
+ * with the first end. We create a record with that start and end, and send it
+ * on. The overall runtime of this implementation is O(n log m), where n is
+ * the total number of redaction records from all the different redaction
+ * snapshots, and m is the number of redaction snapshots.
+ *
+ * If we redact with respect to zero snapshots, we create a redaction record
+ * with the start object and blkid set to 1 and 0 respectively (object 0 can
+ * never be redacted), and the end object and blkid set to UINT64_MAX. This
+ * will result in us redacting every block.
+ */
+static int
+perform_thread_merge(bqueue_t *q, uint32_t num_threads,
+    struct redact_thread_arg *thread_args, boolean_t *cancel)
+{
+	struct redact_node *redact_nodes = NULL;
+	avl_tree_t start_tree, end_tree;
+	struct redact_record *record;
+	struct redact_record *current_record = NULL;
+	int err = 0;
+	struct merge_data md = { {0} };
+	list_create(&md.md_redact_block_pending,
+	    sizeof (struct redact_block_list_node),
+	    offsetof(struct redact_block_list_node, node));
+
+	/*
+	 * If we're redacting with respect to zero snapshots, then no data is
+	 * permitted to be sent. We enqueue a record that redacts all blocks,
+	 * and an eos marker.
+	 */
+	if (num_threads == 0) {
+		record = kmem_zalloc(sizeof (struct redact_record),
+		    KM_SLEEP);
+		/* We can't redact object 0, so don't try. */
+		record->start_object = 1;
+		record->start_blkid = 0;
+		record->end_object = record->end_blkid = UINT64_MAX;
+		bqueue_enqueue(q, record, sizeof (*record));
+		return (0);
+	}
+	if (num_threads > 0) {
+		redact_nodes = kmem_zalloc(num_threads *
+		    sizeof (*redact_nodes), KM_SLEEP);
+	}
+
+	avl_create(&start_tree, redact_node_compare_start,
+	    sizeof (struct redact_node),
+	    offsetof(struct redact_node, avl_node_start));
+	avl_create(&end_tree, redact_node_compare_end,
+	    sizeof (struct redact_node),
+	    offsetof(struct redact_node, avl_node_end));
+
+	for (int i = 0; i < num_threads; i++) {
+		struct redact_node *node = &redact_nodes[i];
+		struct redact_thread_arg *targ = &thread_args[i];
+		node->record = bqueue_dequeue(&targ->q);
+		node->rt_arg = targ;
+		node->thread_num = i;
+		avl_add(&start_tree, node);
+		avl_add(&end_tree, node);
+	}
+
+	/*
+	 * Once the first record in the end tree has returned EOS, every record
+	 * must be an EOS record, so we should stop.
+	 */
+	while (err == 0 && !((struct redact_node *)avl_first(&end_tree))->
+	    record->eos_marker) {
+		if (*cancel) {
+			err = EINTR;
+			break;
+		}
+		struct redact_node *last_start = avl_last(&start_tree);
+		struct redact_node *first_end = avl_first(&end_tree);
+
+		/*
+		 * If the last start record is before the first end record,
+		 * then we have blocks that are redacted by all threads.
+		 * Therefore, we should redact them. Copy the record, and send
+		 * it to the main thread.
+		 */
+		if (redact_record_before(last_start->record,
+		    first_end->record)) {
+			record = kmem_zalloc(sizeof (struct redact_record),
+			    KM_SLEEP);
+			*record = *first_end->record;
+			record->start_object = last_start->record->start_object;
+			record->start_blkid = last_start->record->start_blkid;
+			record_merge_enqueue(q, &current_record,
+			    record);
+		}
+		err = update_avl_trees(&start_tree, &end_tree, first_end);
+	}
+
+	/*
+	 * We're done; if we were cancelled, we need to cancel our workers and
+	 * clear out their queues. Either way, we need to remove every thread's
+	 * redact_node struct from the avl trees.
+	 */
+	for (int i = 0; i < num_threads; i++) {
+		if (err != 0) {
+			thread_args[i].cancel = B_TRUE;
+			while (!redact_nodes[i].record->eos_marker) {
+				(void) update_avl_trees(&start_tree, &end_tree,
+				    &redact_nodes[i]);
+			}
+		}
+		avl_remove(&start_tree, &redact_nodes[i]);
+		avl_remove(&end_tree, &redact_nodes[i]);
+		kmem_free(redact_nodes[i].record,
+		    sizeof (struct redact_record));
+	}
+
+	avl_destroy(&start_tree);
+	avl_destroy(&end_tree);
+	kmem_free(redact_nodes, num_threads * sizeof (*redact_nodes));
+	if (current_record != NULL)
+		bqueue_enqueue(q, current_record, sizeof (*current_record));
+	return (err);
+}
+
+struct redact_merge_thread_arg {
+	bqueue_t q;
+	spa_t *spa;
+	int numsnaps;
+	struct redact_thread_arg *thr_args;
+	boolean_t cancel;
+	int error_code;
+};
+
+static void
+redact_merge_thread(void *arg)
+{
+	struct redact_merge_thread_arg *rmta = arg;
+	rmta->error_code = perform_thread_merge(&rmta->q,
+	    rmta->numsnaps, rmta->thr_args, &rmta->cancel);
+	struct redact_record *rec = kmem_zalloc(sizeof (*rec), KM_SLEEP);
+	rec->eos_marker = B_TRUE;
+	bqueue_enqueue_flush(&rmta->q, rec, 1);
+	thread_exit();
+}
+
+/*
+ * Find the next object in or after the redaction range passed in, and hold
+ * its dnode with the provided tag. Also update *object to contain the new
+ * object number.
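+ *
+ * For example, with a record covering objects [10, 20] and *object == 4,
+ * this holds the first allocated, non-metadata object >= 10 and stores its
+ * number in *object; if no such object exists, the ESRCH from
+ * dmu_object_next() is passed through, which the caller treats as "past
+ * the end".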
+ */
+static int
+hold_next_object(objset_t *os, struct redact_record *rec, void *tag,
+    uint64_t *object, dnode_t **dn)
+{
+	int err = 0;
+	if (*dn != NULL)
+		dnode_rele(*dn, tag);
+	*dn = NULL;
+	if (*object < rec->start_object) {
+		*object = rec->start_object - 1;
+	}
+	err = dmu_object_next(os, object, B_FALSE, 0);
+	if (err != 0)
+		return (err);
+
+	err = dnode_hold(os, *object, tag, dn);
+	while (err == 0 && (*object < rec->start_object ||
+	    DMU_OT_IS_METADATA((*dn)->dn_type))) {
+		dnode_rele(*dn, tag);
+		*dn = NULL;
+		err = dmu_object_next(os, object, B_FALSE, 0);
+		if (err != 0)
+			break;
+		err = dnode_hold(os, *object, tag, dn);
+	}
+	return (err);
+}
+
+static int
+perform_redaction(objset_t *os, redaction_list_t *rl,
+    struct redact_merge_thread_arg *rmta)
+{
+	int err = 0;
+	bqueue_t *q = &rmta->q;
+	struct redact_record *rec = NULL;
+	struct merge_data md = { {0} };
+
+	list_create(&md.md_redact_block_pending,
+	    sizeof (struct redact_block_list_node),
+	    offsetof(struct redact_block_list_node, node));
+	md.md_redaction_list = rl;
+
+	for (int i = 0; i < TXG_SIZE; i++) {
+		list_create(&md.md_blocks[i],
+		    sizeof (struct redact_block_list_node),
+		    offsetof(struct redact_block_list_node, node));
+	}
+	dnode_t *dn = NULL;
+	uint64_t prev_obj = 0;
+	for (rec = bqueue_dequeue(q); !rec->eos_marker && err == 0;
+	    rec = get_next_redact_record(q, rec)) {
+		ASSERT3U(rec->start_object, !=, 0);
+		uint64_t object;
+		if (prev_obj != rec->start_object) {
+			object = rec->start_object - 1;
+			err = hold_next_object(os, rec, FTAG, &object, &dn);
+		} else {
+			object = prev_obj;
+		}
+		while (err == 0 && object <= rec->end_object) {
+			if (issig(JUSTLOOKING) && issig(FORREAL)) {
+				err = EINTR;
+				break;
+			}
+			/*
+			 * Part of the current object is contained somewhere in
+			 * the range covered by rec.
+			 */
+			uint64_t startblkid;
+			uint64_t endblkid;
+			uint64_t maxblkid = dn->dn_phys->dn_maxblkid;
+
+			if (rec->start_object < object)
+				startblkid = 0;
+			else if (rec->start_blkid > maxblkid)
+				break;
+			else
+				startblkid = rec->start_blkid;
+
+			if (rec->end_object > object || rec->end_blkid >
+			    maxblkid) {
+				endblkid = maxblkid;
+			} else {
+				endblkid = rec->end_blkid;
+			}
+			update_redaction_list(&md, os, object, startblkid,
+			    endblkid, dn->dn_datablksz);
+
+			if (object == rec->end_object)
+				break;
+			err = hold_next_object(os, rec, FTAG, &object, &dn);
+		}
+		if (err == ESRCH)
+			err = 0;
+		if (dn != NULL)
+			prev_obj = object;
+	}
+	if (err == 0 && dn != NULL)
+		dnode_rele(dn, FTAG);
+
+	if (err == ESRCH)
+		err = 0;
+	rmta->cancel = B_TRUE;
+	while (!rec->eos_marker)
+		rec = get_next_redact_record(q, rec);
+	kmem_free(rec, sizeof (*rec));
+
+	/*
+	 * There may be a block that's being coalesced; sync that out before
+	 * we return.
+	 */
+	if (err == 0 && md.md_coalesce_block.rbp_size_count != 0) {
+		struct redact_block_list_node *rbln =
+		    kmem_alloc(sizeof (struct redact_block_list_node),
+		    KM_SLEEP);
+		rbln->block = md.md_coalesce_block;
+		list_insert_tail(&md.md_redact_block_pending, rbln);
+	}
+	commit_rl_updates(os, &md, UINT64_MAX, UINT64_MAX);
+
+	/*
+	 * Wait for all the redaction info to sync out before we return, so
+	 * that anyone who attempts to resume this redaction will have all
+	 * the data they need.
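+	 * Concretely, this is the txg_wait_synced() below on
+	 * md_latest_synctask_txg, the last txg for which commit_rl_updates()
+	 * registered a synctask; once it returns, every queued
+	 * redaction_list_update_sync() has run.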
+ */ + dsl_pool_t *dp = spa_get_dsl(os->os_spa); + if (md.md_latest_synctask_txg != 0) + txg_wait_synced(dp, md.md_latest_synctask_txg); + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&md.md_blocks[i]); + return (err); +} + +static boolean_t +redact_snaps_contains(uint64_t *snaps, uint64_t num_snaps, uint64_t guid) +{ + for (int i = 0; i < num_snaps; i++) { + if (snaps[i] == guid) + return (B_TRUE); + } + return (B_FALSE); +} + +int +dmu_redact_snap(const char *snapname, nvlist_t *redactnvl, + const char *redactbook) +{ + int err = 0; + dsl_pool_t *dp = NULL; + dsl_dataset_t *ds = NULL; + int numsnaps = 0; + objset_t *os; + struct redact_thread_arg *args = NULL; + redaction_list_t *new_rl = NULL; + + if ((err = dsl_pool_hold(snapname, FTAG, &dp)) != 0) + return (err); + + if ((err = dsl_dataset_hold_flags(dp, snapname, DS_HOLD_FLAG_DECRYPT, + FTAG, &ds)) != 0) { + goto out; + } + dsl_dataset_long_hold(ds, FTAG); + if (!ds->ds_is_snapshot || dmu_objset_from_ds(ds, &os) != 0) { + err = EINVAL; + goto out; + } + if (dsl_dataset_feature_is_active(ds, SPA_FEATURE_REDACTED_DATASETS)) { + err = EALREADY; + goto out; + } + + numsnaps = fnvlist_num_pairs(redactnvl); + if (numsnaps > 0) + args = kmem_zalloc(numsnaps * sizeof (*args), KM_SLEEP); + + nvpair_t *pair = NULL; + for (int i = 0; i < numsnaps; i++) { + pair = nvlist_next_nvpair(redactnvl, pair); + const char *name = nvpair_name(pair); + struct redact_thread_arg *rta = &args[i]; + err = dsl_dataset_hold_flags(dp, name, DS_HOLD_FLAG_DECRYPT, + FTAG, &rta->ds); + if (err != 0) + break; + /* + * We want to do the long hold before we can get any other + * errors, because the cleanup code will release the long + * hold if rta->ds is filled in. + */ + dsl_dataset_long_hold(rta->ds, FTAG); + + err = dmu_objset_from_ds(rta->ds, &rta->os); + if (err != 0) + break; + if (!dsl_dataset_is_before(rta->ds, ds, 0)) { + err = EINVAL; + break; + } + if (dsl_dataset_feature_is_active(rta->ds, + SPA_FEATURE_REDACTED_DATASETS)) { + err = EALREADY; + break; + + } + } + VERIFY3P(nvlist_next_nvpair(redactnvl, pair), ==, NULL); + if (err != 0) + goto out; + + boolean_t resuming = B_FALSE; + char newredactbook[ZFS_MAX_DATASET_NAME_LEN]; + zfs_bookmark_phys_t bookmark; + + (void) strlcpy(newredactbook, snapname, ZFS_MAX_DATASET_NAME_LEN); + char *c = strchr(newredactbook, '@'); + ASSERT3P(c, !=, NULL); + int n = snprintf(c, ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook), + "#%s", redactbook); + if (n >= ZFS_MAX_DATASET_NAME_LEN - (c - newredactbook)) { + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(ENAMETOOLONG)); + } + err = dsl_bookmark_lookup(dp, newredactbook, NULL, &bookmark); + if (err == 0) { + resuming = B_TRUE; + if (bookmark.zbm_redaction_obj == 0) { + err = EEXIST; + goto out; + } + err = dsl_redaction_list_hold_obj(dp, + bookmark.zbm_redaction_obj, FTAG, &new_rl); + if (err != 0) { + err = EIO; + goto out; + } + dsl_redaction_list_long_hold(dp, new_rl, FTAG); + if (new_rl->rl_phys->rlp_num_snaps != numsnaps) { + err = ESRCH; + goto out; + } + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + if (!redact_snaps_contains(new_rl->rl_phys->rlp_snaps, + new_rl->rl_phys->rlp_num_snaps, + dsl_dataset_phys(rta->ds)->ds_guid)) { + err = ESRCH; + goto out; + } + } + if (new_rl->rl_phys->rlp_last_blkid == UINT64_MAX && + new_rl->rl_phys->rlp_last_object == UINT64_MAX) { + err = EEXIST; + goto out; + } + dsl_pool_rele(dp, FTAG); + dp = NULL; + } else { + uint64_t *guids = NULL; + if (numsnaps > 0) { + guids = kmem_zalloc(numsnaps 
* sizeof (uint64_t), + KM_SLEEP); + } + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + guids[i] = dsl_dataset_phys(rta->ds)->ds_guid; + } + + dsl_pool_rele(dp, FTAG); + dp = NULL; + err = dsl_bookmark_create_redacted(newredactbook, snapname, + numsnaps, guids, FTAG, &new_rl); + kmem_free(guids, numsnaps * sizeof (uint64_t)); + if (err != 0) { + goto out; + } + } + + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + (void) bqueue_init(&rta->q, zfs_redact_queue_ff, + zfs_redact_queue_length, + offsetof(struct redact_record, ln)); + if (resuming) { + rta->resume.zb_blkid = + new_rl->rl_phys->rlp_last_blkid; + rta->resume.zb_object = + new_rl->rl_phys->rlp_last_object; + } + rta->txg = dsl_dataset_phys(ds)->ds_creation_txg; + (void) thread_create(NULL, 0, redact_traverse_thread, rta, + 0, curproc, TS_RUN, minclsyspri); + } + struct redact_merge_thread_arg rmta = { { {0} } }; + (void) bqueue_init(&rmta.q, zfs_redact_queue_ff, + zfs_redact_queue_length, offsetof(struct redact_record, ln)); + rmta.numsnaps = numsnaps; + rmta.spa = os->os_spa; + rmta.thr_args = args; + (void) thread_create(NULL, 0, redact_merge_thread, &rmta, 0, curproc, + TS_RUN, minclsyspri); + err = perform_redaction(os, new_rl, &rmta); +out: + if (new_rl != NULL) { + dsl_redaction_list_long_rele(new_rl, FTAG); + dsl_redaction_list_rele(new_rl, FTAG); + } + for (int i = 0; i < numsnaps; i++) { + struct redact_thread_arg *rta = &args[i]; + /* + * rta->ds may be NULL if we got an error while filling + * it in. + */ + if (rta->ds != NULL) { + dsl_dataset_long_rele(rta->ds, FTAG); + dsl_dataset_rele_flags(rta->ds, + DS_HOLD_FLAG_DECRYPT, FTAG); + } + } + + if (args != NULL) + kmem_free(args, numsnaps * sizeof (*args)); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + if (ds != NULL) { + dsl_dataset_long_rele(ds, FTAG); + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + } + return (SET_ERROR(err)); + +} diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c new file mode 100644 index 000000000000..33e99c2e02ab --- /dev/null +++ b/module/zfs/dmu_send.c @@ -0,0 +1,3091 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright 2014 HybridCluster. All rights reserved. + * Copyright 2016 RackTop Systems. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2019, Klara Inc. 
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef _KERNEL
+#include
+#endif
+
+/* Set this tunable to TRUE to replace corrupt data with 0x2f5baddb10c */
+int zfs_send_corrupt_data = B_FALSE;
+/*
+ * This tunable controls the amount of data (measured in bytes) that will be
+ * prefetched by zfs send. If the main thread is blocking on reads that haven't
+ * completed, this variable might need to be increased. If instead the main
+ * thread is issuing new reads because the prefetches have fallen out of the
+ * cache, this may need to be decreased.
+ */
+int zfs_send_queue_length = SPA_MAXBLOCKSIZE;
+/*
+ * This tunable controls the length of the queues that zfs send worker threads
+ * use to communicate. If the send_main_thread is blocking on these queues,
+ * this variable may need to be increased. If there is a significant slowdown
+ * at the start of a send as these threads consume all the available IO
+ * resources, this variable may need to be decreased.
+ */
+int zfs_send_no_prefetch_queue_length = 1024 * 1024;
+/*
+ * These tunables control the fill fraction of the queues used by zfs send.
+ * The fill fraction controls the frequency with which threads have to be
+ * cv_signaled. If a lot of cpu time is being spent on cv_signal, then these
+ * should be tuned down. If the queues empty before the signalled thread can
+ * catch up, then these should be tuned up.
+ */
+int zfs_send_queue_ff = 20;
+int zfs_send_no_prefetch_queue_ff = 20;
+
+/*
+ * Use this to override the recordsize calculation for fast zfs send estimates.
+ */
+int zfs_override_estimate_recordsize = 0;
+
+/* Set this tunable to FALSE to disable setting of DRR_FLAG_FREERECORDS */
+int zfs_send_set_freerecords_bit = B_TRUE;
+
+/* Set this tunable to FALSE to disable sending unmodified spill blocks.
*/ +int zfs_send_unmodified_spill_blocks = B_TRUE; + +static inline boolean_t +overflow_multiply(uint64_t a, uint64_t b, uint64_t *c) +{ + uint64_t temp = a * b; + if (b != 0 && temp / b != a) + return (B_FALSE); + *c = temp; + return (B_TRUE); +} + +struct send_thread_arg { + bqueue_t q; + objset_t *os; /* Objset to traverse */ + uint64_t fromtxg; /* Traverse from this txg */ + int flags; /* flags to pass to traverse_dataset */ + int error_code; + boolean_t cancel; + zbookmark_phys_t resume; + uint64_t *num_blocks_visited; +}; + +struct redact_list_thread_arg { + boolean_t cancel; + bqueue_t q; + zbookmark_phys_t resume; + redaction_list_t *rl; + boolean_t mark_redact; + int error_code; + uint64_t *num_blocks_visited; +}; + +struct send_merge_thread_arg { + bqueue_t q; + objset_t *os; + struct redact_list_thread_arg *from_arg; + struct send_thread_arg *to_arg; + struct redact_list_thread_arg *redact_arg; + int error; + boolean_t cancel; +}; + +struct send_range { + boolean_t eos_marker; /* Marks the end of the stream */ + uint64_t object; + uint64_t start_blkid; + uint64_t end_blkid; + bqueue_node_t ln; + enum type {DATA, HOLE, OBJECT, OBJECT_RANGE, REDACT, + PREVIOUSLY_REDACTED} type; + union { + struct srd { + dmu_object_type_t obj_type; + uint32_t datablksz; // logical size + uint32_t datasz; // payload size + blkptr_t bp; + arc_buf_t *abuf; + abd_t *abd; + kmutex_t lock; + kcondvar_t cv; + boolean_t io_outstanding; + int io_err; + } data; + struct srh { + uint32_t datablksz; + } hole; + struct sro { + /* + * This is a pointer because embedding it in the + * struct causes these structures to be massively larger + * for all range types; this makes the code much less + * memory efficient. + */ + dnode_phys_t *dnp; + blkptr_t bp; + } object; + struct srr { + uint32_t datablksz; + } redact; + struct sror { + blkptr_t bp; + } object_range; + } sru; +}; + +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free(), + * dump_freeobjects(), and dump_redact() can be aggregated into a single + * DRR_FREE, DRR_FREEOBJECTS, or DRR_REDACT replay record. 
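+ *
+ * For example, freeing a 256K range one 128K block at a time results in
+ * dump_free(dscp, obj, 0, 128K) followed by dump_free(dscp, obj, 128K, 128K);
+ * the second call extends the pending record rather than emitting a new one,
+ * because it satisfies the adjacency test in dump_free() below (simplified
+ * here; the real code also guards against offset overflow):
+ *
+ *	if (drrf->drr_object == object &&
+ *	    drrf->drr_offset + drrf->drr_length == offset)
+ *		drrf->drr_length += length;
+ *
+ * A call of a different type (e.g. dump_freeobjects()) first pushes out any
+ * pending record, since only records of the same type can be aggregated.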
+ */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS, + PENDING_REDACT +} dmu_pendop_t; + +typedef struct dmu_send_cookie { + dmu_replay_record_t *dsc_drr; + dmu_send_outparams_t *dsc_dso; + offset_t *dsc_off; + objset_t *dsc_os; + zio_cksum_t dsc_zc; + uint64_t dsc_toguid; + uint64_t dsc_fromtxg; + int dsc_err; + dmu_pendop_t dsc_pending_op; + uint64_t dsc_featureflags; + uint64_t dsc_last_data_object; + uint64_t dsc_last_data_offset; + uint64_t dsc_resume_object; + uint64_t dsc_resume_offset; + boolean_t dsc_sent_begin; + boolean_t dsc_sent_end; +} dmu_send_cookie_t; + +static int do_dump(dmu_send_cookie_t *dscp, struct send_range *range); + +static void +range_free(struct send_range *range) +{ + if (range->type == OBJECT) { + size_t size = sizeof (dnode_phys_t) * + (range->sru.object.dnp->dn_extra_slots + 1); + kmem_free(range->sru.object.dnp, size); + } else if (range->type == DATA) { + mutex_enter(&range->sru.data.lock); + while (range->sru.data.io_outstanding) + cv_wait(&range->sru.data.cv, &range->sru.data.lock); + if (range->sru.data.abd != NULL) + abd_free(range->sru.data.abd); + if (range->sru.data.abuf != NULL) { + arc_buf_destroy(range->sru.data.abuf, + &range->sru.data.abuf); + } + mutex_exit(&range->sru.data.lock); + + cv_destroy(&range->sru.data.cv); + mutex_destroy(&range->sru.data.lock); + } + kmem_free(range, sizeof (*range)); +} + +/* + * For all record types except BEGIN, fill in the checksum (overlaid in + * drr_u.drr_checksum.drr_checksum). The checksum verifies everything + * up to the start of the checksum itself. + */ +static int +dump_record(dmu_send_cookie_t *dscp, void *payload, int payload_len) +{ + dmu_send_outparams_t *dso = dscp->dsc_dso; + ASSERT3U(offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + ==, sizeof (dmu_replay_record_t) - sizeof (zio_cksum_t)); + (void) fletcher_4_incremental_native(dscp->dsc_drr, + offsetof(dmu_replay_record_t, drr_u.drr_checksum.drr_checksum), + &dscp->dsc_zc); + if (dscp->dsc_drr->drr_type == DRR_BEGIN) { + dscp->dsc_sent_begin = B_TRUE; + } else { + ASSERT(ZIO_CHECKSUM_IS_ZERO(&dscp->dsc_drr->drr_u. + drr_checksum.drr_checksum)); + dscp->dsc_drr->drr_u.drr_checksum.drr_checksum = dscp->dsc_zc; + } + if (dscp->dsc_drr->drr_type == DRR_END) { + dscp->dsc_sent_end = B_TRUE; + } + (void) fletcher_4_incremental_native(&dscp->dsc_drr-> + drr_u.drr_checksum.drr_checksum, + sizeof (zio_cksum_t), &dscp->dsc_zc); + *dscp->dsc_off += sizeof (dmu_replay_record_t); + dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, dscp->dsc_drr, + sizeof (dmu_replay_record_t), dso->dso_arg); + if (dscp->dsc_err != 0) + return (SET_ERROR(EINTR)); + if (payload_len != 0) { + *dscp->dsc_off += payload_len; + /* + * payload is null when dso_dryrun == B_TRUE (i.e. when we're + * doing a send size calculation) + */ + if (payload != NULL) { + (void) fletcher_4_incremental_native( + payload, payload_len, &dscp->dsc_zc); + } + + /* + * The code does not rely on this (len being a multiple of 8). + * We keep this assertion because of the corresponding assertion + * in receive_read(). Keeping this assertion ensures that we do + * not inadvertently break backwards compatibility (causing the + * assertion in receive_read() to trigger on old software). + * + * Raw sends cannot be received on old software, and so can + * bypass this assertion. 
+ */ + + ASSERT((payload_len % 8 == 0) || + (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)); + + dscp->dsc_err = dso->dso_outfunc(dscp->dsc_os, payload, + payload_len, dso->dso_arg); + if (dscp->dsc_err != 0) + return (SET_ERROR(EINTR)); + } + return (0); +} + +/* + * Fill in the drr_free struct, or perform aggregation if the previous record is + * also a free record, and the two are adjacent. + * + * Note that we send free records even for a full send, because we want to be + * able to receive a full send as a clone, which requires a list of all the free + * and freeobject records that were generated on the source. + */ +static int +dump_free(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, + uint64_t length) +{ + struct drr_free *drrf = &(dscp->dsc_drr->drr_u.drr_free); + + /* + * When we receive a free record, dbuf_free_range() assumes + * that the receiving system doesn't have any dbufs in the range + * being freed. This is always true because there is a one-record + * constraint: we only send one WRITE record for any given + * object,offset. We know that the one-record constraint is + * true because we always send data in increasing order by + * object,offset. + * + * If the increasing-order constraint ever changes, we should find + * another way to assert that the one-record constraint is still + * satisfied. + */ + ASSERT(object > dscp->dsc_last_data_object || + (object == dscp->dsc_last_data_object && + offset > dscp->dsc_last_data_offset)); + + /* + * If there is a pending op, but it's not PENDING_FREE, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_FREE records can only be aggregated with + * other DRR_FREE records. DRR_FREEOBJECTS records can only be + * aggregated with other DRR_FREEOBJECTS records). + */ + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_FREE) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + if (dscp->dsc_pending_op == PENDING_FREE) { + /* + * Check to see whether this free block can be aggregated + * with pending one. + */ + if (drrf->drr_object == object && drrf->drr_offset + + drrf->drr_length == offset) { + if (offset + length < offset || length == UINT64_MAX) + drrf->drr_length = UINT64_MAX; + else + drrf->drr_length += length; + return (0); + } else { + /* not a continuation. Push out pending record */ + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + } + /* create a FREE record and make it pending */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_FREE; + drrf->drr_object = object; + drrf->drr_offset = offset; + if (offset + length < offset) + drrf->drr_length = DMU_OBJECT_END; + else + drrf->drr_length = length; + drrf->drr_toguid = dscp->dsc_toguid; + if (length == DMU_OBJECT_END) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + } else { + dscp->dsc_pending_op = PENDING_FREE; + } + + return (0); +} + +/* + * Fill in the drr_redact struct, or perform aggregation if the previous record + * is also a redaction record, and the two are adjacent. 
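+ *
+ * For example, with an 8K data block size, redaction ranges covering blocks
+ * [0,10) and then [10,16) of the same object arrive as
+ * dump_redact(dscp, obj, 0, 80K) and dump_redact(dscp, obj, 80K, 48K), and
+ * leave the stream as a single DRR_REDACT record with drr_offset 0 and
+ * drr_length 128K.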
+ */ +static int +dump_redact(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, + uint64_t length) +{ + struct drr_redact *drrr = &dscp->dsc_drr->drr_u.drr_redact; + + /* + * If there is a pending op, but it's not PENDING_REDACT, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_REDACT records can only be aggregated with + * other DRR_REDACT records). + */ + if (dscp->dsc_pending_op != PENDING_NONE && + dscp->dsc_pending_op != PENDING_REDACT) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + if (dscp->dsc_pending_op == PENDING_REDACT) { + /* + * Check to see whether this redacted block can be aggregated + * with pending one. + */ + if (drrr->drr_object == object && drrr->drr_offset + + drrr->drr_length == offset) { + drrr->drr_length += length; + return (0); + } else { + /* not a continuation. Push out pending record */ + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + } + /* create a REDACT record and make it pending */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_REDACT; + drrr->drr_object = object; + drrr->drr_offset = offset; + drrr->drr_length = length; + drrr->drr_toguid = dscp->dsc_toguid; + dscp->dsc_pending_op = PENDING_REDACT; + + return (0); +} + +static int +dmu_dump_write(dmu_send_cookie_t *dscp, dmu_object_type_t type, uint64_t object, + uint64_t offset, int lsize, int psize, const blkptr_t *bp, void *data) +{ + uint64_t payload_size; + boolean_t raw = (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); + struct drr_write *drrw = &(dscp->dsc_drr->drr_u.drr_write); + + /* + * We send data in increasing object, offset order. + * See comment in dump_free() for details. + */ + ASSERT(object > dscp->dsc_last_data_object || + (object == dscp->dsc_last_data_object && + offset > dscp->dsc_last_data_offset)); + dscp->dsc_last_data_object = object; + dscp->dsc_last_data_offset = offset + lsize - 1; + + /* + * If there is any kind of pending aggregation (currently either + * a grouping of free objects or free blocks), push it out to + * the stream, since aggregation can't be done across operations + * of different types. + */ + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + /* write a WRITE record */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_WRITE; + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_toguid = dscp->dsc_toguid; + drrw->drr_logical_size = lsize; + + /* only set the compression fields if the buf is compressed or raw */ + if (raw || lsize != psize) { + ASSERT(raw || dscp->dsc_featureflags & + DMU_BACKUP_FEATURE_COMPRESSED); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT3S(psize, >, 0); + + if (raw) { + ASSERT(BP_IS_PROTECTED(bp)); + + /* + * This is a raw protected block so we need to pass + * along everything the receiving side will need to + * interpret this block, including the byteswap, salt, + * IV, and MAC. 
+ */ + if (BP_SHOULD_BYTESWAP(bp)) + drrw->drr_flags |= DRR_RAW_BYTESWAP; + zio_crypt_decode_params_bp(bp, drrw->drr_salt, + drrw->drr_iv); + zio_crypt_decode_mac_bp(bp, drrw->drr_mac); + } else { + /* this is a compressed block */ + ASSERT(dscp->dsc_featureflags & + DMU_BACKUP_FEATURE_COMPRESSED); + ASSERT(!BP_SHOULD_BYTESWAP(bp)); + ASSERT(!DMU_OT_IS_METADATA(BP_GET_TYPE(bp))); + ASSERT3U(BP_GET_COMPRESS(bp), !=, ZIO_COMPRESS_OFF); + ASSERT3S(lsize, >=, psize); + } + + /* set fields common to compressed and raw sends */ + drrw->drr_compressiontype = BP_GET_COMPRESS(bp); + drrw->drr_compressed_size = psize; + payload_size = drrw->drr_compressed_size; + } else { + payload_size = drrw->drr_logical_size; + } + + if (bp == NULL || BP_IS_EMBEDDED(bp) || (BP_IS_PROTECTED(bp) && !raw)) { + /* + * There's no pre-computed checksum for partial-block writes, + * embedded BP's, or encrypted BP's that are being sent as + * plaintext, so (like fletcher4-checksummed blocks) userland + * will have to compute a dedup-capable checksum itself. + */ + drrw->drr_checksumtype = ZIO_CHECKSUM_OFF; + } else { + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_flags & + ZCHECKSUM_FLAG_DEDUP) + drrw->drr_flags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + DDK_SET_CRYPT(&drrw->drr_key, BP_IS_PROTECTED(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + } + + if (dump_record(dscp, data, payload_size) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static int +dump_write_embedded(dmu_send_cookie_t *dscp, uint64_t object, uint64_t offset, + int blksz, const blkptr_t *bp) +{ + char buf[BPE_PAYLOAD_SIZE]; + struct drr_write_embedded *drrw = + &(dscp->dsc_drr->drr_u.drr_write_embedded); + + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + ASSERT(BP_IS_EMBEDDED(bp)); + + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_WRITE_EMBEDDED; + drrw->drr_object = object; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = dscp->dsc_toguid; + drrw->drr_compression = BP_GET_COMPRESS(bp); + drrw->drr_etype = BPE_GET_ETYPE(bp); + drrw->drr_lsize = BPE_GET_LSIZE(bp); + drrw->drr_psize = BPE_GET_PSIZE(bp); + + decode_embedded_bp_compressed(bp, buf); + + if (dump_record(dscp, buf, P2ROUNDUP(drrw->drr_psize, 8)) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static int +dump_spill(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object, + void *data) +{ + struct drr_spill *drrs = &(dscp->dsc_drr->drr_u.drr_spill); + uint64_t blksz = BP_GET_LSIZE(bp); + uint64_t payload_size = blksz; + + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + /* write a SPILL record */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_SPILL; + drrs->drr_object = object; + drrs->drr_length = blksz; + drrs->drr_toguid = dscp->dsc_toguid; + + /* See comment in dump_dnode() for full details */ + if (zfs_send_unmodified_spill_blocks && + (bp->blk_birth <= dscp->dsc_fromtxg)) { + drrs->drr_flags |= DRR_SPILL_UNMODIFIED; + } + + /* handle raw send fields */ + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_PROTECTED(bp)); + + if 
(BP_SHOULD_BYTESWAP(bp))
+			drrs->drr_flags |= DRR_RAW_BYTESWAP;
+		drrs->drr_compressiontype = BP_GET_COMPRESS(bp);
+		drrs->drr_compressed_size = BP_GET_PSIZE(bp);
+		zio_crypt_decode_params_bp(bp, drrs->drr_salt, drrs->drr_iv);
+		zio_crypt_decode_mac_bp(bp, drrs->drr_mac);
+		payload_size = drrs->drr_compressed_size;
+	}
+
+	if (dump_record(dscp, data, payload_size) != 0)
+		return (SET_ERROR(EINTR));
+	return (0);
+}
+
+static int
+dump_freeobjects(dmu_send_cookie_t *dscp, uint64_t firstobj, uint64_t numobjs)
+{
+	struct drr_freeobjects *drrfo = &(dscp->dsc_drr->drr_u.drr_freeobjects);
+	uint64_t maxobj = DNODES_PER_BLOCK *
+	    (DMU_META_DNODE(dscp->dsc_os)->dn_maxblkid + 1);
+
+	/*
+	 * ZoL < 0.7 does not handle large FREEOBJECTS records correctly,
+	 * leading to zfs recv never completing. To avoid this issue, don't
+	 * send FREEOBJECTS records for object IDs which cannot exist on the
+	 * receiving side.
+	 */
+	if (maxobj > 0) {
+		if (maxobj < firstobj)
+			return (0);
+
+		if (maxobj < firstobj + numobjs)
+			numobjs = maxobj - firstobj;
+	}
+
+	/*
+	 * If there is a pending op, but it's not PENDING_FREEOBJECTS,
+	 * push it out, since free block aggregation can only be done for
+	 * blocks of the same type (i.e., DRR_FREE records can only be
+	 * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records
+	 * can only be aggregated with other DRR_FREEOBJECTS records).
+	 */
+	if (dscp->dsc_pending_op != PENDING_NONE &&
+	    dscp->dsc_pending_op != PENDING_FREEOBJECTS) {
+		if (dump_record(dscp, NULL, 0) != 0)
+			return (SET_ERROR(EINTR));
+		dscp->dsc_pending_op = PENDING_NONE;
+	}
+	if (numobjs == 0)
+		numobjs = UINT64_MAX - firstobj;
+
+	if (dscp->dsc_pending_op == PENDING_FREEOBJECTS) {
+		/*
+		 * See whether this free object array can be aggregated
+		 * with pending one
+		 */
+		if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) {
+			drrfo->drr_numobjs += numobjs;
+			return (0);
+		} else {
+			/* can't be aggregated. Push out pending record */
+			if (dump_record(dscp, NULL, 0) != 0)
+				return (SET_ERROR(EINTR));
+			dscp->dsc_pending_op = PENDING_NONE;
+		}
+	}
+
+	/* write a FREEOBJECTS record */
+	bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t));
+	dscp->dsc_drr->drr_type = DRR_FREEOBJECTS;
+	drrfo->drr_firstobj = firstobj;
+	drrfo->drr_numobjs = numobjs;
+	drrfo->drr_toguid = dscp->dsc_toguid;
+
+	dscp->dsc_pending_op = PENDING_FREEOBJECTS;
+
+	return (0);
+}
+
+static int
+dump_dnode(dmu_send_cookie_t *dscp, const blkptr_t *bp, uint64_t object,
+    dnode_phys_t *dnp)
+{
+	struct drr_object *drro = &(dscp->dsc_drr->drr_u.drr_object);
+	int bonuslen;
+
+	if (object < dscp->dsc_resume_object) {
+		/*
+		 * Note: when resuming, we will visit all the dnodes in
+		 * the block of dnodes that we are resuming from. In
+		 * this case it's unnecessary to send the dnodes prior to
+		 * the one we are resuming from. We should be at most one
+		 * block's worth of dnodes behind the resume point.
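+		 * For illustration: with 512-byte dnodes there are 32 dnodes
+		 * per 16K metadnode block (DNODE_BLOCK_SHIFT 14, DNODE_SHIFT
+		 * 9), so a resume from object 130 revisits objects 128-159,
+		 * and the check below silently skips the already-sent
+		 * objects 128 and 129.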
+ */ + ASSERT3U(dscp->dsc_resume_object - object, <, + 1 << (DNODE_BLOCK_SHIFT - DNODE_SHIFT)); + return (0); + } + + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) + return (dump_freeobjects(dscp, object, 1)); + + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + /* write an OBJECT record */ + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_OBJECT; + drro->drr_object = object; + drro->drr_type = dnp->dn_type; + drro->drr_bonustype = dnp->dn_bonustype; + drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_dn_slots = dnp->dn_extra_slots + 1; + drro->drr_checksumtype = dnp->dn_checksum; + drro->drr_compress = dnp->dn_compress; + drro->drr_toguid = dscp->dsc_toguid; + + if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS) && + drro->drr_blksz > SPA_OLD_MAXBLOCKSIZE) + drro->drr_blksz = SPA_OLD_MAXBLOCKSIZE; + + bonuslen = P2ROUNDUP(dnp->dn_bonuslen, 8); + + if ((dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { + ASSERT(BP_IS_ENCRYPTED(bp)); + + if (BP_SHOULD_BYTESWAP(bp)) + drro->drr_flags |= DRR_RAW_BYTESWAP; + + /* needed for reconstructing dnp on recv side */ + drro->drr_maxblkid = dnp->dn_maxblkid; + drro->drr_indblkshift = dnp->dn_indblkshift; + drro->drr_nlevels = dnp->dn_nlevels; + drro->drr_nblkptr = dnp->dn_nblkptr; + + /* + * Since we encrypt the entire bonus area, the (raw) part + * beyond the bonuslen is actually nonzero, so we need + * to send it. + */ + if (bonuslen != 0) { + drro->drr_raw_bonuslen = DN_MAX_BONUS_LEN(dnp); + bonuslen = drro->drr_raw_bonuslen; + } + } + + /* + * DRR_OBJECT_SPILL is set for every dnode which references a + * spill block. This allows the receiving pool to definitively + * determine when a spill block should be kept or freed. + */ + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + drro->drr_flags |= DRR_OBJECT_SPILL; + + if (dump_record(dscp, DN_BONUS(dnp), bonuslen) != 0) + return (SET_ERROR(EINTR)); + + /* Free anything past the end of the file. */ + if (dump_free(dscp, object, (dnp->dn_maxblkid + 1) * + (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT), DMU_OBJECT_END) != 0) + return (SET_ERROR(EINTR)); + + /* + * Send DRR_SPILL records for unmodified spill blocks. This is useful + * because changing certain attributes of the object (e.g. blocksize) + * can cause old versions of ZFS to incorrectly remove a spill block. + * Including these records in the stream forces an up to date version + * to always be written ensuring they're never lost. Current versions + * of the code which understand the DRR_FLAG_SPILL_BLOCK feature can + * ignore these unmodified spill blocks. 
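+	 * ("Unmodified" means blk_birth <= dsc_fromtxg; dump_spill() flags
+	 * such blocks with DRR_SPILL_UNMODIFIED, which is how an up-to-date
+	 * receiver recognizes and skips them.)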
+ */ + if (zfs_send_unmodified_spill_blocks && + (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) && + (DN_SPILL_BLKPTR(dnp)->blk_birth <= dscp->dsc_fromtxg)) { + struct send_range record; + blkptr_t *bp = DN_SPILL_BLKPTR(dnp); + + bzero(&record, sizeof (struct send_range)); + record.type = DATA; + record.object = object; + record.eos_marker = B_FALSE; + record.start_blkid = DMU_SPILL_BLKID; + record.end_blkid = record.start_blkid + 1; + record.sru.data.bp = *bp; + record.sru.data.obj_type = dnp->dn_type; + record.sru.data.datablksz = BP_GET_LSIZE(bp); + + if (do_dump(dscp, &record) != 0) + return (SET_ERROR(EINTR)); + } + + if (dscp->dsc_err != 0) + return (SET_ERROR(EINTR)); + + return (0); +} + +static int +dump_object_range(dmu_send_cookie_t *dscp, const blkptr_t *bp, + uint64_t firstobj, uint64_t numslots) +{ + struct drr_object_range *drror = + &(dscp->dsc_drr->drr_u.drr_object_range); + + /* we only use this record type for raw sends */ + ASSERT(BP_IS_PROTECTED(bp)); + ASSERT(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW); + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_DNODE); + ASSERT0(BP_GET_LEVEL(bp)); + + if (dscp->dsc_pending_op != PENDING_NONE) { + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + dscp->dsc_pending_op = PENDING_NONE; + } + + bzero(dscp->dsc_drr, sizeof (dmu_replay_record_t)); + dscp->dsc_drr->drr_type = DRR_OBJECT_RANGE; + drror->drr_firstobj = firstobj; + drror->drr_numslots = numslots; + drror->drr_toguid = dscp->dsc_toguid; + if (BP_SHOULD_BYTESWAP(bp)) + drror->drr_flags |= DRR_RAW_BYTESWAP; + zio_crypt_decode_params_bp(bp, drror->drr_salt, drror->drr_iv); + zio_crypt_decode_mac_bp(bp, drror->drr_mac); + + if (dump_record(dscp, NULL, 0) != 0) + return (SET_ERROR(EINTR)); + return (0); +} + +static boolean_t +send_do_embed(const blkptr_t *bp, uint64_t featureflags) +{ + if (!BP_IS_EMBEDDED(bp)) + return (B_FALSE); + + /* + * Compression function must be legacy, or explicitly enabled. + */ + if ((BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_LEGACY_FUNCTIONS && + !(featureflags & DMU_BACKUP_FEATURE_LZ4))) + return (B_FALSE); + + /* + * If we have not set the ZSTD feature flag, we can't send ZSTD + * compressed embedded blocks, as the receiver may not support them. + */ + if ((BP_GET_COMPRESS(bp) == ZIO_COMPRESS_ZSTD && + !(featureflags & DMU_BACKUP_FEATURE_ZSTD))) + return (B_FALSE); + + /* + * Embed type must be explicitly enabled. + */ + switch (BPE_GET_ETYPE(bp)) { + case BP_EMBEDDED_TYPE_DATA: + if (featureflags & DMU_BACKUP_FEATURE_EMBED_DATA) + return (B_TRUE); + break; + default: + return (B_FALSE); + } + return (B_FALSE); +} + +/* + * This function actually handles figuring out what kind of record needs to be + * dumped, and calling the appropriate helper function. In most cases, + * the data has already been read by send_reader_thread(). 
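+ *
+ * In summary, the mapping from range type to replay record is:
+ *
+ *	OBJECT		-> DRR_OBJECT (dump_dnode(), which may also emit
+ *			   DRR_FREE, DRR_FREEOBJECTS, and DRR_SPILL records)
+ *	OBJECT_RANGE	-> DRR_OBJECT_RANGE (raw sends only)
+ *	REDACT		-> DRR_REDACT
+ *	DATA		-> DRR_SPILL, DRR_WRITE_EMBEDDED, or DRR_WRITE
+ *	HOLE		-> DRR_FREEOBJECTS (for the meta-dnode) or DRR_FREE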
+ */ +static int +do_dump(dmu_send_cookie_t *dscp, struct send_range *range) +{ + int err = 0; + switch (range->type) { + case OBJECT: + err = dump_dnode(dscp, &range->sru.object.bp, range->object, + range->sru.object.dnp); + return (err); + case OBJECT_RANGE: { + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (!(dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW)) { + return (0); + } + uint64_t epb = BP_GET_LSIZE(&range->sru.object_range.bp) >> + DNODE_SHIFT; + uint64_t firstobj = range->start_blkid * epb; + err = dump_object_range(dscp, &range->sru.object_range.bp, + firstobj, epb); + break; + } + case REDACT: { + struct srr *srrp = &range->sru.redact; + err = dump_redact(dscp, range->object, range->start_blkid * + srrp->datablksz, (range->end_blkid - range->start_blkid) * + srrp->datablksz); + return (err); + } + case DATA: { + struct srd *srdp = &range->sru.data; + blkptr_t *bp = &srdp->bp; + spa_t *spa = + dmu_objset_spa(dscp->dsc_os); + + ASSERT3U(srdp->datablksz, ==, BP_GET_LSIZE(bp)); + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + if (BP_GET_TYPE(bp) == DMU_OT_SA) { + arc_flags_t aflags = ARC_FLAG_WAIT; + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (dscp->dsc_featureflags & DMU_BACKUP_FEATURE_RAW) { + ASSERT(BP_IS_PROTECTED(bp)); + zioflags |= ZIO_FLAG_RAW; + } + + zbookmark_phys_t zb; + ASSERT3U(range->start_blkid, ==, DMU_SPILL_BLKID); + zb.zb_objset = dmu_objset_id(dscp->dsc_os); + zb.zb_object = range->object; + zb.zb_level = 0; + zb.zb_blkid = range->start_blkid; + + arc_buf_t *abuf = NULL; + if (!dscp->dsc_dso->dso_dryrun && arc_read(NULL, spa, + bp, arc_getbuf_func, &abuf, ZIO_PRIORITY_ASYNC_READ, + zioflags, &aflags, &zb) != 0) + return (SET_ERROR(EIO)); + + err = dump_spill(dscp, bp, zb.zb_object, + (abuf == NULL ? NULL : abuf->b_data)); + if (abuf != NULL) + arc_buf_destroy(abuf, &abuf); + return (err); + } + if (send_do_embed(bp, dscp->dsc_featureflags)) { + err = dump_write_embedded(dscp, range->object, + range->start_blkid * srdp->datablksz, + srdp->datablksz, bp); + return (err); + } + ASSERT(range->object > dscp->dsc_resume_object || + (range->object == dscp->dsc_resume_object && + range->start_blkid * srdp->datablksz >= + dscp->dsc_resume_offset)); + /* it's a level-0 block of a regular object */ + + mutex_enter(&srdp->lock); + while (srdp->io_outstanding) + cv_wait(&srdp->cv, &srdp->lock); + err = srdp->io_err; + mutex_exit(&srdp->lock); + + if (err != 0) { + if (zfs_send_corrupt_data && + !dscp->dsc_dso->dso_dryrun) { + /* + * Send a block filled with 0x"zfs badd bloc" + */ + srdp->abuf = arc_alloc_buf(spa, &srdp->abuf, + ARC_BUFC_DATA, srdp->datablksz); + uint64_t *ptr; + for (ptr = srdp->abuf->b_data; + (char *)ptr < (char *)srdp->abuf->b_data + + srdp->datablksz; ptr++) + *ptr = 0x2f5baddb10cULL; + } else { + return (SET_ERROR(EIO)); + } + } + + ASSERT(dscp->dsc_dso->dso_dryrun || + srdp->abuf != NULL || srdp->abd != NULL); + + uint64_t offset = range->start_blkid * srdp->datablksz; + + char *data = NULL; + if (srdp->abd != NULL) { + data = abd_to_buf(srdp->abd); + ASSERT3P(srdp->abuf, ==, NULL); + } else if (srdp->abuf != NULL) { + data = srdp->abuf->b_data; + } + + /* + * If we have large blocks stored on disk but the send flags + * don't allow us to send large blocks, we split the data from + * the arc buf into chunks. 
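+		 * For example, a 1M block sent without
+		 * DMU_BACKUP_FEATURE_LARGE_BLOCKS becomes eight 128K
+		 * (SPA_OLD_MAXBLOCKSIZE) DRR_WRITE records. The chunks are
+		 * sent uncompressed (bp is passed as NULL below), since the
+		 * original block's compression applied to the block as a
+		 * whole and cannot be carried over to the pieces.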
+ */ + if (srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && + !(dscp->dsc_featureflags & + DMU_BACKUP_FEATURE_LARGE_BLOCKS)) { + while (srdp->datablksz > 0 && err == 0) { + int n = MIN(srdp->datablksz, + SPA_OLD_MAXBLOCKSIZE); + err = dmu_dump_write(dscp, srdp->obj_type, + range->object, offset, n, n, NULL, data); + offset += n; + /* + * When doing dry run, data==NULL is used as a + * sentinel value by + * dmu_dump_write()->dump_record(). + */ + if (data != NULL) + data += n; + srdp->datablksz -= n; + } + } else { + err = dmu_dump_write(dscp, srdp->obj_type, + range->object, offset, + srdp->datablksz, srdp->datasz, bp, data); + } + return (err); + } + case HOLE: { + struct srh *srhp = &range->sru.hole; + if (range->object == DMU_META_DNODE_OBJECT) { + uint32_t span = srhp->datablksz >> DNODE_SHIFT; + uint64_t first_obj = range->start_blkid * span; + uint64_t numobj = range->end_blkid * span - first_obj; + return (dump_freeobjects(dscp, first_obj, numobj)); + } + uint64_t offset = 0; + + /* + * If this multiply overflows, we don't need to send this block. + * Even if it has a birth time, it can never not be a hole, so + * we don't need to send records for it. + */ + if (!overflow_multiply(range->start_blkid, srhp->datablksz, + &offset)) { + return (0); + } + uint64_t len = 0; + + if (!overflow_multiply(range->end_blkid, srhp->datablksz, &len)) + len = UINT64_MAX; + len = len - offset; + return (dump_free(dscp, range->object, offset, len)); + } + default: + panic("Invalid range type in do_dump: %d", range->type); + } + return (err); +} + +static struct send_range * +range_alloc(enum type type, uint64_t object, uint64_t start_blkid, + uint64_t end_blkid, boolean_t eos) +{ + struct send_range *range = kmem_alloc(sizeof (*range), KM_SLEEP); + range->type = type; + range->object = object; + range->start_blkid = start_blkid; + range->end_blkid = end_blkid; + range->eos_marker = eos; + if (type == DATA) { + range->sru.data.abd = NULL; + range->sru.data.abuf = NULL; + mutex_init(&range->sru.data.lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&range->sru.data.cv, NULL, CV_DEFAULT, NULL); + range->sru.data.io_outstanding = 0; + range->sru.data.io_err = 0; + } + return (range); +} + +/* + * This is the callback function to traverse_dataset that acts as a worker + * thread for dmu_send_impl. + */ +/*ARGSUSED*/ +static int +send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const struct dnode_phys *dnp, void *arg) +{ + struct send_thread_arg *sta = arg; + struct send_range *record; + + ASSERT(zb->zb_object == DMU_META_DNODE_OBJECT || + zb->zb_object >= sta->resume.zb_object); + + /* + * All bps of an encrypted os should have the encryption bit set. + * If this is not true it indicates tampering and we report an error. 
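+	 * (Holes are exempt from this check because they carry no payload
+	 * to encrypt.)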
+ */ + if (sta->os->os_encrypted && + !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { + spa_log_error(spa, zb); + zfs_panic_recover("unencrypted block in encrypted " + "object set %llu", dmu_objset_id(sta->os)); + return (SET_ERROR(EIO)); + } + + if (sta->cancel) + return (SET_ERROR(EINTR)); + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) + return (0); + atomic_inc_64(sta->num_blocks_visited); + + if (zb->zb_level == ZB_DNODE_LEVEL) { + if (zb->zb_object == DMU_META_DNODE_OBJECT) + return (0); + record = range_alloc(OBJECT, zb->zb_object, 0, 0, B_FALSE); + record->sru.object.bp = *bp; + size_t size = sizeof (*dnp) * (dnp->dn_extra_slots + 1); + record->sru.object.dnp = kmem_alloc(size, KM_SLEEP); + bcopy(dnp, record->sru.object.dnp, size); + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); + } + if (zb->zb_level == 0 && zb->zb_object == DMU_META_DNODE_OBJECT && + !BP_IS_HOLE(bp)) { + record = range_alloc(OBJECT_RANGE, 0, zb->zb_blkid, + zb->zb_blkid + 1, B_FALSE); + record->sru.object_range.bp = *bp; + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); + } + if (zb->zb_level < 0 || (zb->zb_level > 0 && !BP_IS_HOLE(bp))) + return (0); + if (zb->zb_object == DMU_META_DNODE_OBJECT && !BP_IS_HOLE(bp)) + return (0); + + uint64_t span = bp_span_in_blocks(dnp->dn_indblkshift, zb->zb_level); + uint64_t start; + + /* + * If this multiply overflows, we don't need to send this block. + * Even if it has a birth time, it can never not be a hole, so + * we don't need to send records for it. + */ + if (!overflow_multiply(span, zb->zb_blkid, &start) || (!(zb->zb_blkid == + DMU_SPILL_BLKID || DMU_OT_IS_METADATA(dnp->dn_type)) && + span * zb->zb_blkid > dnp->dn_maxblkid)) { + ASSERT(BP_IS_HOLE(bp)); + return (0); + } + + if (zb->zb_blkid == DMU_SPILL_BLKID) + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); + + enum type record_type = DATA; + if (BP_IS_HOLE(bp)) + record_type = HOLE; + else if (BP_IS_REDACTED(bp)) + record_type = REDACT; + else + record_type = DATA; + + record = range_alloc(record_type, zb->zb_object, start, + (start + span < start ? 0 : start + span), B_FALSE); + + uint64_t datablksz = (zb->zb_blkid == DMU_SPILL_BLKID ? + BP_GET_LSIZE(bp) : dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + + if (BP_IS_HOLE(bp)) { + record->sru.hole.datablksz = datablksz; + } else if (BP_IS_REDACTED(bp)) { + record->sru.redact.datablksz = datablksz; + } else { + record->sru.data.datablksz = datablksz; + record->sru.data.obj_type = dnp->dn_type; + record->sru.data.bp = *bp; + } + + bqueue_enqueue(&sta->q, record, sizeof (*record)); + return (0); +} + +struct redact_list_cb_arg { + uint64_t *num_blocks_visited; + bqueue_t *q; + boolean_t *cancel; + boolean_t mark_redact; +}; + +static int +redact_list_cb(redact_block_phys_t *rb, void *arg) +{ + struct redact_list_cb_arg *rlcap = arg; + + atomic_inc_64(rlcap->num_blocks_visited); + if (*rlcap->cancel) + return (-1); + + struct send_range *data = range_alloc(REDACT, rb->rbp_object, + rb->rbp_blkid, rb->rbp_blkid + redact_block_get_count(rb), B_FALSE); + ASSERT3U(data->end_blkid, >, rb->rbp_blkid); + if (rlcap->mark_redact) { + data->type = REDACT; + data->sru.redact.datablksz = redact_block_get_size(rb); + } else { + data->type = PREVIOUSLY_REDACTED; + } + bqueue_enqueue(rlcap->q, data, sizeof (*data)); + + return (0); +} + +/* + * This function kicks off the traverse_dataset. 
It also handles setting the + * error code of the thread in case something goes wrong, and pushes the End of + * Stream record when the traverse_dataset call has finished. + */ +static void +send_traverse_thread(void *arg) +{ + struct send_thread_arg *st_arg = arg; + int err = 0; + struct send_range *data; + fstrans_cookie_t cookie = spl_fstrans_mark(); + + err = traverse_dataset_resume(st_arg->os->os_dsl_dataset, + st_arg->fromtxg, &st_arg->resume, + st_arg->flags, send_cb, st_arg); + + if (err != EINTR) + st_arg->error_code = err; + data = range_alloc(DATA, 0, 0, 0, B_TRUE); + bqueue_enqueue_flush(&st_arg->q, data, sizeof (*data)); + spl_fstrans_unmark(cookie); + thread_exit(); +} + +/* + * Utility function that causes End of Stream records to compare after all + * others, so that other threads' comparison logic can stay simple. + */ +static int __attribute__((unused)) +send_range_after(const struct send_range *from, const struct send_range *to) +{ + if (from->eos_marker == B_TRUE) + return (1); + if (to->eos_marker == B_TRUE) + return (-1); + + uint64_t from_obj = from->object; + uint64_t from_end_obj = from->object + 1; + uint64_t to_obj = to->object; + uint64_t to_end_obj = to->object + 1; + if (from_obj == 0) { + ASSERT(from->type == HOLE || from->type == OBJECT_RANGE); + from_obj = from->start_blkid << DNODES_PER_BLOCK_SHIFT; + from_end_obj = from->end_blkid << DNODES_PER_BLOCK_SHIFT; + } + if (to_obj == 0) { + ASSERT(to->type == HOLE || to->type == OBJECT_RANGE); + to_obj = to->start_blkid << DNODES_PER_BLOCK_SHIFT; + to_end_obj = to->end_blkid << DNODES_PER_BLOCK_SHIFT; + } + + if (from_end_obj <= to_obj) + return (-1); + if (from_obj >= to_end_obj) + return (1); + int64_t cmp = TREE_CMP(to->type == OBJECT_RANGE, from->type == + OBJECT_RANGE); + if (unlikely(cmp)) + return (cmp); + cmp = TREE_CMP(to->type == OBJECT, from->type == OBJECT); + if (unlikely(cmp)) + return (cmp); + if (from->end_blkid <= to->start_blkid) + return (-1); + if (from->start_blkid >= to->end_blkid) + return (1); + return (0); +} + +/* + * Pop the new data off the queue, check that the records we receive are in + * the right order, but do not free the old data. This is used so that the + * records can be sent on to the main thread without copying the data. + */ +static struct send_range * +get_next_range_nofree(bqueue_t *bq, struct send_range *prev) +{ + struct send_range *next = bqueue_dequeue(bq); + ASSERT3S(send_range_after(prev, next), ==, -1); + return (next); +} + +/* + * Pop the new data off the queue, check that the records we receive are in + * the right order, and free the old data.
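+ *
+ * As an illustration of how these helpers are consumed, the final
+ * consumer (dmu_send_impl(), later in this file) drains its input
+ * queue with this pattern; a simplified sketch of that loop:
+ *
+ *	range = bqueue_dequeue(&q);
+ *	while (err == 0 && !range->eos_marker) {
+ *		err = do_dump(&dsc, range);
+ *		range = get_next_range(&q, range);
+ *	}
+ *	range_free(range);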
+ */ +static struct send_range * +get_next_range(bqueue_t *bq, struct send_range *prev) +{ + struct send_range *next = get_next_range_nofree(bq, prev); + range_free(prev); + return (next); +} + +static void +redact_list_thread(void *arg) +{ + struct redact_list_thread_arg *rlt_arg = arg; + struct send_range *record; + fstrans_cookie_t cookie = spl_fstrans_mark(); + if (rlt_arg->rl != NULL) { + struct redact_list_cb_arg rlcba = {0}; + rlcba.cancel = &rlt_arg->cancel; + rlcba.q = &rlt_arg->q; + rlcba.num_blocks_visited = rlt_arg->num_blocks_visited; + rlcba.mark_redact = rlt_arg->mark_redact; + int err = dsl_redaction_list_traverse(rlt_arg->rl, + &rlt_arg->resume, redact_list_cb, &rlcba); + if (err != EINTR) + rlt_arg->error_code = err; + } + record = range_alloc(DATA, 0, 0, 0, B_TRUE); + bqueue_enqueue_flush(&rlt_arg->q, record, sizeof (*record)); + spl_fstrans_unmark(cookie); + + thread_exit(); +} + +/* + * Compare the start point of the two provided ranges. End of stream ranges + * compare last, objects compare before any data or hole inside that object and + * multi-object holes that start at the same object. + */ +static int +send_range_start_compare(struct send_range *r1, struct send_range *r2) +{ + uint64_t r1_objequiv = r1->object; + uint64_t r1_l0equiv = r1->start_blkid; + uint64_t r2_objequiv = r2->object; + uint64_t r2_l0equiv = r2->start_blkid; + int64_t cmp = TREE_CMP(r1->eos_marker, r2->eos_marker); + if (unlikely(cmp)) + return (cmp); + if (r1->object == 0) { + r1_objequiv = r1->start_blkid * DNODES_PER_BLOCK; + r1_l0equiv = 0; + } + if (r2->object == 0) { + r2_objequiv = r2->start_blkid * DNODES_PER_BLOCK; + r2_l0equiv = 0; + } + + cmp = TREE_CMP(r1_objequiv, r2_objequiv); + if (likely(cmp)) + return (cmp); + cmp = TREE_CMP(r2->type == OBJECT_RANGE, r1->type == OBJECT_RANGE); + if (unlikely(cmp)) + return (cmp); + cmp = TREE_CMP(r2->type == OBJECT, r1->type == OBJECT); + if (unlikely(cmp)) + return (cmp); + + return (TREE_CMP(r1_l0equiv, r2_l0equiv)); +} + +enum q_idx { + REDACT_IDX = 0, + TO_IDX, + FROM_IDX, + NUM_THREADS +}; + +/* + * This function returns the next range the send_merge_thread should operate on. + * The inputs are two arrays; the first one stores the range at the front of the + * queues stored in the second one. The ranges are sorted in descending + * priority order; the metadata from earlier ranges overrules metadata from + * later ranges. out_mask is used to return which threads the ranges came from; + * bit i is set if ranges[i] started at the same place as the returned range. + * + * This code is not hardcoded to compare a specific number of threads; it could + * be used with any number, just by changing the q_idx enum. + * + * The "next range" is the one with the earliest start; if two starts are equal, + * the highest-priority range is the next to operate on. If a higher-priority + * range starts in the middle of the first range, then the first range will be + * truncated to end where the higher-priority range starts, and we will operate + * on that one next time. In this way, we make sure that each block covered by + * some range gets covered by a returned range, and each block covered is + * returned using the metadata of the highest-priority range it appears in. + * + * For example, if the three ranges at the front of the queues were [2,4), + * [3,5), and [1,3), then the ranges returned would be [1,2) with the metadata + * from the third range, [2,4) with the metadata from the first range, and then + * [4,5) with the metadata from the second. 
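+ *
+ * In more detail, one possible call-by-call trace of that example
+ * follows (a sketch; "first" through "third" refer to the priority
+ * order above):
+ *
+ *	call 1: [1,3) starts earliest, but [2,4) starts inside it, so
+ *	    only [1,2) is returned (third range's metadata); the
+ *	    remainder [2,3) stays queued.
+ *	call 2: [2,4) and the leftover [2,3) tie; the higher-priority
+ *	    [2,4) wins, but [3,5) starts inside it, so [2,3) is
+ *	    returned with the first range's metadata, the third
+ *	    queue's remainder is consumed, and [3,4) stays queued.
+ *	call 3: [3,4) is returned (first range's metadata), and [3,5)
+ *	    is trimmed to [4,5).
+ *	call 4: [4,5) is returned with the second range's metadata.
+ *
+ * The region [2,4) can thus come back in two pieces, both carrying
+ * the first range's metadata, which matches the summary above.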
+ */ +static struct send_range * +find_next_range(struct send_range **ranges, bqueue_t **qs, uint64_t *out_mask) +{ + int idx = 0; // index of the range with the earliest start + int i; + uint64_t bmask = 0; + for (i = 1; i < NUM_THREADS; i++) { + if (send_range_start_compare(ranges[i], ranges[idx]) < 0) + idx = i; + } + if (ranges[idx]->eos_marker) { + struct send_range *ret = range_alloc(DATA, 0, 0, 0, B_TRUE); + *out_mask = 0; + return (ret); + } + /* + * Find all the ranges that start at that same point. + */ + for (i = 0; i < NUM_THREADS; i++) { + if (send_range_start_compare(ranges[i], ranges[idx]) == 0) + bmask |= 1 << i; + } + *out_mask = bmask; + /* + * OBJECT_RANGE records only come from the TO thread, and should always + * be treated as overlapping with nothing and sent on immediately. They + * are only used in raw sends, and are never redacted. + */ + if (ranges[idx]->type == OBJECT_RANGE) { + ASSERT3U(idx, ==, TO_IDX); + ASSERT3U(*out_mask, ==, 1 << TO_IDX); + struct send_range *ret = ranges[idx]; + ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); + return (ret); + } + /* + * Find the first start or end point after the start of the first range. + */ + uint64_t first_change = ranges[idx]->end_blkid; + for (i = 0; i < NUM_THREADS; i++) { + if (i == idx || ranges[i]->eos_marker || + ranges[i]->object > ranges[idx]->object || + ranges[i]->object == DMU_META_DNODE_OBJECT) + continue; + ASSERT3U(ranges[i]->object, ==, ranges[idx]->object); + if (first_change > ranges[i]->start_blkid && + (bmask & (1 << i)) == 0) + first_change = ranges[i]->start_blkid; + else if (first_change > ranges[i]->end_blkid) + first_change = ranges[i]->end_blkid; + } + /* + * Update all ranges to no longer overlap with the range we're + * returning. All such ranges must start at the same place as the range + * being returned, and end at or after first_change. Thus we update + * their start to first_change. If that makes them size 0, then free + * them and pull a new range from that thread. + */ + for (i = 0; i < NUM_THREADS; i++) { + if (i == idx || (bmask & (1 << i)) == 0) + continue; + ASSERT3U(first_change, >, ranges[i]->start_blkid); + ranges[i]->start_blkid = first_change; + ASSERT3U(ranges[i]->start_blkid, <=, ranges[i]->end_blkid); + if (ranges[i]->start_blkid == ranges[i]->end_blkid) + ranges[i] = get_next_range(qs[i], ranges[i]); + } + /* + * Short-circuit the simple case; if the range doesn't overlap with + * anything else, or it only overlaps with things that start at the same + * place and are longer, send it on. + */ + if (first_change == ranges[idx]->end_blkid) { + struct send_range *ret = ranges[idx]; + ranges[idx] = get_next_range_nofree(qs[idx], ranges[idx]); + return (ret); + } + + /* + * Otherwise, return a truncated copy of ranges[idx] and move the start + * of ranges[idx] back to first_change. + */ + struct send_range *ret = kmem_alloc(sizeof (*ret), KM_SLEEP); + *ret = *ranges[idx]; + ret->end_blkid = first_change; + ranges[idx]->start_blkid = first_change; + return (ret); +} + +#define FROM_AND_REDACT_BITS ((1 << REDACT_IDX) | (1 << FROM_IDX)) + +/* + * Merge the results from the from thread and the to thread, and then hand the + * records off to send_prefetch_thread to prefetch them. If this is not a + * send from a redaction bookmark, the from thread will push an end of stream + * record and stop, and we'll just send everything that was changed in the + * to_ds since the ancestor's creation txg. 
If it is, then since + * traverse_dataset has a canonical order, we can compare each change as + * they're pulled off the queues. That will give us a stream that is + * appropriately sorted, and covers all records. In addition, we pull the + * data from the redact_list_thread and use that to determine which blocks + * should be redacted. + */ +static void +send_merge_thread(void *arg) +{ + struct send_merge_thread_arg *smt_arg = arg; + struct send_range *front_ranges[NUM_THREADS]; + bqueue_t *queues[NUM_THREADS]; + int err = 0; + fstrans_cookie_t cookie = spl_fstrans_mark(); + + if (smt_arg->redact_arg == NULL) { + front_ranges[REDACT_IDX] = + kmem_zalloc(sizeof (struct send_range), KM_SLEEP); + front_ranges[REDACT_IDX]->eos_marker = B_TRUE; + front_ranges[REDACT_IDX]->type = REDACT; + queues[REDACT_IDX] = NULL; + } else { + front_ranges[REDACT_IDX] = + bqueue_dequeue(&smt_arg->redact_arg->q); + queues[REDACT_IDX] = &smt_arg->redact_arg->q; + } + front_ranges[TO_IDX] = bqueue_dequeue(&smt_arg->to_arg->q); + queues[TO_IDX] = &smt_arg->to_arg->q; + front_ranges[FROM_IDX] = bqueue_dequeue(&smt_arg->from_arg->q); + queues[FROM_IDX] = &smt_arg->from_arg->q; + uint64_t mask = 0; + struct send_range *range; + for (range = find_next_range(front_ranges, queues, &mask); + !range->eos_marker && err == 0 && !smt_arg->cancel; + range = find_next_range(front_ranges, queues, &mask)) { + /* + * If the range in question was in both the from redact bookmark + * and the bookmark we're using to redact, then don't send it. + * It's already redacted on the receiving system, so a redaction + * record would be redundant. + */ + if ((mask & FROM_AND_REDACT_BITS) == FROM_AND_REDACT_BITS) { + ASSERT3U(range->type, ==, REDACT); + range_free(range); + continue; + } + bqueue_enqueue(&smt_arg->q, range, sizeof (*range)); + + if (smt_arg->to_arg->error_code != 0) { + err = smt_arg->to_arg->error_code; + } else if (smt_arg->from_arg->error_code != 0) { + err = smt_arg->from_arg->error_code; + } else if (smt_arg->redact_arg != NULL && + smt_arg->redact_arg->error_code != 0) { + err = smt_arg->redact_arg->error_code; + } + } + if (smt_arg->cancel && err == 0) + err = SET_ERROR(EINTR); + smt_arg->error = err; + if (smt_arg->error != 0) { + smt_arg->to_arg->cancel = B_TRUE; + smt_arg->from_arg->cancel = B_TRUE; + if (smt_arg->redact_arg != NULL) + smt_arg->redact_arg->cancel = B_TRUE; + } + for (int i = 0; i < NUM_THREADS; i++) { + while (!front_ranges[i]->eos_marker) { + front_ranges[i] = get_next_range(queues[i], + front_ranges[i]); + } + range_free(front_ranges[i]); + } + if (range == NULL) + range = kmem_zalloc(sizeof (*range), KM_SLEEP); + range->eos_marker = B_TRUE; + bqueue_enqueue_flush(&smt_arg->q, range, 1); + spl_fstrans_unmark(cookie); + thread_exit(); +} + +struct send_reader_thread_arg { + struct send_merge_thread_arg *smta; + bqueue_t q; + boolean_t cancel; + boolean_t issue_reads; + uint64_t featureflags; + int error; +}; + +static void +dmu_send_read_done(zio_t *zio) +{ + struct send_range *range = zio->io_private; + + mutex_enter(&range->sru.data.lock); + if (zio->io_error != 0) { + abd_free(range->sru.data.abd); + range->sru.data.abd = NULL; + range->sru.data.io_err = zio->io_error; + } + + ASSERT(range->sru.data.io_outstanding); + range->sru.data.io_outstanding = B_FALSE; + cv_broadcast(&range->sru.data.cv); + mutex_exit(&range->sru.data.lock); +} + +static void +issue_data_read(struct send_reader_thread_arg *srta, struct send_range *range) +{ + struct srd *srdp = &range->sru.data; + blkptr_t *bp = 
&srdp->bp; + objset_t *os = srta->smta->os; + + ASSERT3U(range->type, ==, DATA); + ASSERT3U(range->start_blkid + 1, ==, range->end_blkid); + /* + * If we have large blocks stored on disk but + * the send flags don't allow us to send large + * blocks, we split the data from the arc buf + * into chunks. + */ + boolean_t split_large_blocks = + srdp->datablksz > SPA_OLD_MAXBLOCKSIZE && + !(srta->featureflags & DMU_BACKUP_FEATURE_LARGE_BLOCKS); + /* + * We should only request compressed data from the ARC if all + * the following are true: + * - stream compression was requested + * - we aren't splitting large blocks into smaller chunks + * - the data won't need to be byteswapped before sending + * - this isn't an embedded block + * - this isn't metadata (if receiving on a different endian + * system it can be byteswapped more easily) + */ + boolean_t request_compressed = + (srta->featureflags & DMU_BACKUP_FEATURE_COMPRESSED) && + !split_large_blocks && !BP_SHOULD_BYTESWAP(bp) && + !BP_IS_EMBEDDED(bp) && !DMU_OT_IS_METADATA(BP_GET_TYPE(bp)); + + enum zio_flag zioflags = ZIO_FLAG_CANFAIL; + + if (srta->featureflags & DMU_BACKUP_FEATURE_RAW) + zioflags |= ZIO_FLAG_RAW; + else if (request_compressed) + zioflags |= ZIO_FLAG_RAW_COMPRESS; + + srdp->datasz = (zioflags & ZIO_FLAG_RAW_COMPRESS) ? + BP_GET_PSIZE(bp) : BP_GET_LSIZE(bp); + + if (!srta->issue_reads) + return; + if (BP_IS_REDACTED(bp)) + return; + if (send_do_embed(bp, srta->featureflags)) + return; + + zbookmark_phys_t zb = { + .zb_objset = dmu_objset_id(os), + .zb_object = range->object, + .zb_level = 0, + .zb_blkid = range->start_blkid, + }; + + arc_flags_t aflags = ARC_FLAG_CACHED_ONLY; + + int arc_err = arc_read(NULL, os->os_spa, bp, + arc_getbuf_func, &srdp->abuf, ZIO_PRIORITY_ASYNC_READ, + zioflags, &aflags, &zb); + /* + * If the data is not already cached in the ARC, we read directly + * from zio. This avoids the performance overhead of adding a new + * entry to the ARC, and we also avoid polluting the ARC cache with + * data that is not likely to be used in the future. + */ + if (arc_err != 0) { + srdp->abd = abd_alloc_linear(srdp->datasz, B_FALSE); + srdp->io_outstanding = B_TRUE; + zio_nowait(zio_read(NULL, os->os_spa, bp, srdp->abd, + srdp->datasz, dmu_send_read_done, range, + ZIO_PRIORITY_ASYNC_READ, zioflags, &zb)); + } +} + +/* + * Create a new record with the given values. + */ +static void +enqueue_range(struct send_reader_thread_arg *srta, bqueue_t *q, dnode_t *dn, + uint64_t blkid, uint64_t count, const blkptr_t *bp, uint32_t datablksz) +{ + enum type range_type = (bp == NULL || BP_IS_HOLE(bp) ? HOLE : + (BP_IS_REDACTED(bp) ? REDACT : DATA)); + + struct send_range *range = range_alloc(range_type, dn->dn_object, + blkid, blkid + count, B_FALSE); + + if (blkid == DMU_SPILL_BLKID) + ASSERT3U(BP_GET_TYPE(bp), ==, DMU_OT_SA); + + switch (range_type) { + case HOLE: + range->sru.hole.datablksz = datablksz; + break; + case DATA: + ASSERT3U(count, ==, 1); + range->sru.data.datablksz = datablksz; + range->sru.data.obj_type = dn->dn_type; + range->sru.data.bp = *bp; + issue_data_read(srta, range); + break; + case REDACT: + range->sru.redact.datablksz = datablksz; + break; + default: + break; + } + bqueue_enqueue(q, range, datablksz); +} + +/* + * This thread is responsible for two things: First, it retrieves the correct + * blkptr in the to ds if we need to send the data because of something from + * the from thread. 
As a result of this, we're the first ones to discover that + * some indirect blocks can be discarded because they're not holes. Second, + * it issues prefetches for the data we need to send. + */ +static void +send_reader_thread(void *arg) +{ + struct send_reader_thread_arg *srta = arg; + struct send_merge_thread_arg *smta = srta->smta; + bqueue_t *inq = &smta->q; + bqueue_t *outq = &srta->q; + objset_t *os = smta->os; + fstrans_cookie_t cookie = spl_fstrans_mark(); + struct send_range *range = bqueue_dequeue(inq); + int err = 0; + + /* + * If the record we're analyzing is from a redaction bookmark from the + * fromds, then we need to know whether or not it exists in the tods so + * we know whether to create records for it or not. If it does, we need + * the datablksz so we can generate an appropriate record for it. + * Finally, if it isn't redacted, we need the blkptr so that we can send + * a WRITE record containing the actual data. + */ + uint64_t last_obj = UINT64_MAX; + uint64_t last_obj_exists = B_TRUE; + while (!range->eos_marker && !srta->cancel && smta->error == 0 && + err == 0) { + switch (range->type) { + case DATA: + issue_data_read(srta, range); + bqueue_enqueue(outq, range, range->sru.data.datablksz); + range = get_next_range_nofree(inq, range); + break; + case HOLE: + case OBJECT: + case OBJECT_RANGE: + case REDACT: // Redacted blocks must exist + bqueue_enqueue(outq, range, sizeof (*range)); + range = get_next_range_nofree(inq, range); + break; + case PREVIOUSLY_REDACTED: { + /* + * This entry came from the "from bookmark" when + * sending from a bookmark that has a redaction + * list. We need to check if this object/blkid + * exists in the target ("to") dataset, and if + * not then we drop this entry. We also need + * to fill in the block pointer so that we know + * what to prefetch. + * + * To accomplish the above, we first cache whether or + * not the last object we examined exists. If it + * doesn't, we can drop this record. If it does, we hold + * the dnode and use it to call dbuf_dnode_findbp. We do + * this instead of dbuf_bookmark_findbp because we will + * often operate on large ranges, and holding the dnode + * once is more efficient. + */ + boolean_t object_exists = B_TRUE; + /* + * If the data is redacted, we only care if it exists, + * so that we don't send records for objects that have + * been deleted. + */ + dnode_t *dn; + if (range->object == last_obj && !last_obj_exists) { + /* + * If we're still examining the same object as + * previously, and it doesn't exist, we don't + * need to call dbuf_bookmark_findbp. + */ + object_exists = B_FALSE; + } else { + err = dnode_hold(os, range->object, FTAG, &dn); + if (err == ENOENT) { + object_exists = B_FALSE; + err = 0; + } + last_obj = range->object; + last_obj_exists = object_exists; + } + + if (err != 0) { + break; + } else if (!object_exists) { + /* + * The block was modified, but doesn't + * exist in the to dataset; if it was + * deleted in the to dataset, then we'll + * visit the hole bp for it at some point. + */ + range = get_next_range(inq, range); + continue; + } + uint64_t file_max = + (dn->dn_maxblkid < range->end_blkid ? + dn->dn_maxblkid : range->end_blkid); + /* + * The object exists, so we need to try to find the + * blkptr for each block in the range we're processing. 
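+ * As a worked example with hypothetical numbers: if blkid is 5 and
+ * dnode_next_offset() reports that the next non-hole data starts at
+ * block 64, then nblks = (offset / datablksz) - blkid = 59, one HOLE
+ * range covering [5, 64) is enqueued via enqueue_range(..., NULL,
+ * datablksz), and the per-block blkptr lookups resume at blkid 64
+ * instead of probing each of the 59 hole blocks individually.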
+ */ + rw_enter(&dn->dn_struct_rwlock, RW_READER); + for (uint64_t blkid = range->start_blkid; + blkid < file_max; blkid++) { + blkptr_t bp; + uint32_t datablksz = + dn->dn_phys->dn_datablkszsec << + SPA_MINBLOCKSHIFT; + uint64_t offset = blkid * datablksz; + /* + * This call finds the next non-hole block in + * the object. This is to prevent a + * performance problem where we're unredacting + * a large hole. Using dnode_next_offset to + * skip over the large hole avoids iterating + * over every block in it. + */ + err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, + &offset, 1, 1, 0); + if (err == ESRCH) { + offset = UINT64_MAX; + err = 0; + } else if (err != 0) { + break; + } + if (offset != blkid * datablksz) { + /* + * if there is a hole from here + * (blkid) to offset + */ + offset = MIN(offset, file_max * + datablksz); + uint64_t nblks = (offset / datablksz) - + blkid; + enqueue_range(srta, outq, dn, blkid, + nblks, NULL, datablksz); + blkid += nblks; + } + if (blkid >= file_max) + break; + err = dbuf_dnode_findbp(dn, 0, blkid, &bp, + NULL, NULL); + if (err != 0) + break; + ASSERT(!BP_IS_HOLE(&bp)); + enqueue_range(srta, outq, dn, blkid, 1, &bp, + datablksz); + } + rw_exit(&dn->dn_struct_rwlock); + dnode_rele(dn, FTAG); + range = get_next_range(inq, range); + } + } + } + if (srta->cancel || err != 0) { + smta->cancel = B_TRUE; + srta->error = err; + } else if (smta->error != 0) { + srta->error = smta->error; + } + while (!range->eos_marker) + range = get_next_range(inq, range); + + bqueue_enqueue_flush(outq, range, 1); + spl_fstrans_unmark(cookie); + thread_exit(); +} + +#define NUM_SNAPS_NOT_REDACTED UINT64_MAX + +struct dmu_send_params { + /* Pool args */ + void *tag; // Tag that dp was held with, will be used to release dp. + dsl_pool_t *dp; + /* To snapshot args */ + const char *tosnap; + dsl_dataset_t *to_ds; + /* From snapshot args */ + zfs_bookmark_phys_t ancestor_zb; + uint64_t *fromredactsnaps; + /* NUM_SNAPS_NOT_REDACTED if not sending from redaction bookmark */ + uint64_t numfromredactsnaps; + /* Stream params */ + boolean_t is_clone; + boolean_t embedok; + boolean_t large_block_ok; + boolean_t compressok; + boolean_t rawok; + boolean_t savedok; + uint64_t resumeobj; + uint64_t resumeoff; + uint64_t saved_guid; + zfs_bookmark_phys_t *redactbook; + /* Stream output params */ + dmu_send_outparams_t *dso; + + /* Stream progress params */ + offset_t *off; + int outfd; + char saved_toname[MAXNAMELEN]; +}; + +static int +setup_featureflags(struct dmu_send_params *dspp, objset_t *os, + uint64_t *featureflags) +{ + dsl_dataset_t *to_ds = dspp->to_ds; + dsl_pool_t *dp = dspp->dp; +#ifdef _KERNEL + if (dmu_objset_type(os) == DMU_OST_ZFS) { + uint64_t version; + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &version) != 0) + return (SET_ERROR(EINVAL)); + + if (version >= ZPL_VERSION_SA) + *featureflags |= DMU_BACKUP_FEATURE_SA_SPILL; + } +#endif + + /* raw sends imply large_block_ok */ + if ((dspp->rawok || dspp->large_block_ok) && + dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_BLOCKS)) { + *featureflags |= DMU_BACKUP_FEATURE_LARGE_BLOCKS; + } + + /* encrypted datasets will not have embedded blocks */ + if ((dspp->embedok || dspp->rawok) && !os->os_encrypted && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMBEDDED_DATA)) { + *featureflags |= DMU_BACKUP_FEATURE_EMBED_DATA; + } + + /* raw send implies compressok */ + if (dspp->compressok || dspp->rawok) + *featureflags |= DMU_BACKUP_FEATURE_COMPRESSED; + + if (dspp->rawok && os->os_encrypted) + *featureflags |= 
DMU_BACKUP_FEATURE_RAW; + + if ((*featureflags & + (DMU_BACKUP_FEATURE_EMBED_DATA | DMU_BACKUP_FEATURE_COMPRESSED | + DMU_BACKUP_FEATURE_RAW)) != 0 && + spa_feature_is_active(dp->dp_spa, SPA_FEATURE_LZ4_COMPRESS)) { + *featureflags |= DMU_BACKUP_FEATURE_LZ4; + } + + /* + * We specifically do not include DMU_BACKUP_FEATURE_EMBED_DATA here to + * allow sending ZSTD compressed datasets to a receiver that does not + * support ZSTD + */ + if ((*featureflags & + (DMU_BACKUP_FEATURE_COMPRESSED | DMU_BACKUP_FEATURE_RAW)) != 0 && + dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_ZSTD_COMPRESS)) { + *featureflags |= DMU_BACKUP_FEATURE_ZSTD; + } + + if (dspp->resumeobj != 0 || dspp->resumeoff != 0) { + *featureflags |= DMU_BACKUP_FEATURE_RESUMING; + } + + if (dspp->redactbook != NULL) { + *featureflags |= DMU_BACKUP_FEATURE_REDACTED; + } + + if (dsl_dataset_feature_is_active(to_ds, SPA_FEATURE_LARGE_DNODE)) { + *featureflags |= DMU_BACKUP_FEATURE_LARGE_DNODE; + } + return (0); +} + +static dmu_replay_record_t * +create_begin_record(struct dmu_send_params *dspp, objset_t *os, + uint64_t featureflags) +{ + dmu_replay_record_t *drr = kmem_zalloc(sizeof (dmu_replay_record_t), + KM_SLEEP); + drr->drr_type = DRR_BEGIN; + + struct drr_begin *drrb = &drr->drr_u.drr_begin; + dsl_dataset_t *to_ds = dspp->to_ds; + + drrb->drr_magic = DMU_BACKUP_MAGIC; + drrb->drr_creation_time = dsl_dataset_phys(to_ds)->ds_creation_time; + drrb->drr_type = dmu_objset_type(os); + drrb->drr_toguid = dsl_dataset_phys(to_ds)->ds_guid; + drrb->drr_fromguid = dspp->ancestor_zb.zbm_guid; + + DMU_SET_STREAM_HDRTYPE(drrb->drr_versioninfo, DMU_SUBSTREAM); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, featureflags); + + if (dspp->is_clone) + drrb->drr_flags |= DRR_FLAG_CLONE; + if (dsl_dataset_phys(dspp->to_ds)->ds_flags & DS_FLAG_CI_DATASET) + drrb->drr_flags |= DRR_FLAG_CI_DATA; + if (zfs_send_set_freerecords_bit) + drrb->drr_flags |= DRR_FLAG_FREERECORDS; + drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_SPILL_BLOCK; + + if (dspp->savedok) { + drrb->drr_toguid = dspp->saved_guid; + strlcpy(drrb->drr_toname, dspp->saved_toname, + sizeof (drrb->drr_toname)); + } else { + dsl_dataset_name(to_ds, drrb->drr_toname); + if (!to_ds->ds_is_snapshot) { + (void) strlcat(drrb->drr_toname, "@--head--", + sizeof (drrb->drr_toname)); + } + } + return (drr); +} + +static void +setup_to_thread(struct send_thread_arg *to_arg, objset_t *to_os, + dmu_sendstatus_t *dssp, uint64_t fromtxg, boolean_t rawok) +{ + VERIFY0(bqueue_init(&to_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + to_arg->error_code = 0; + to_arg->cancel = B_FALSE; + to_arg->os = to_os; + to_arg->fromtxg = fromtxg; + to_arg->flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA; + if (rawok) + to_arg->flags |= TRAVERSE_NO_DECRYPT; + to_arg->num_blocks_visited = &dssp->dss_blocks; + (void) thread_create(NULL, 0, send_traverse_thread, to_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static void +setup_from_thread(struct redact_list_thread_arg *from_arg, + redaction_list_t *from_rl, dmu_sendstatus_t *dssp) +{ + VERIFY0(bqueue_init(&from_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + from_arg->error_code = 0; + from_arg->cancel = B_FALSE; + from_arg->rl = from_rl; + from_arg->mark_redact = B_FALSE; + from_arg->num_blocks_visited = &dssp->dss_blocks; + /* + * If from_rl is NULL, redact_list_thread just returns
success and + * enqueues an eos marker. + */ + (void) thread_create(NULL, 0, redact_list_thread, from_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static void +setup_redact_list_thread(struct redact_list_thread_arg *rlt_arg, + struct dmu_send_params *dspp, redaction_list_t *rl, dmu_sendstatus_t *dssp) +{ + if (dspp->redactbook == NULL) + return; + + rlt_arg->cancel = B_FALSE; + VERIFY0(bqueue_init(&rlt_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + rlt_arg->error_code = 0; + rlt_arg->mark_redact = B_TRUE; + rlt_arg->rl = rl; + rlt_arg->num_blocks_visited = &dssp->dss_blocks; + + (void) thread_create(NULL, 0, redact_list_thread, rlt_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static void +setup_merge_thread(struct send_merge_thread_arg *smt_arg, + struct dmu_send_params *dspp, struct redact_list_thread_arg *from_arg, + struct send_thread_arg *to_arg, struct redact_list_thread_arg *rlt_arg, + objset_t *os) +{ + VERIFY0(bqueue_init(&smt_arg->q, zfs_send_no_prefetch_queue_ff, + MAX(zfs_send_no_prefetch_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + smt_arg->cancel = B_FALSE; + smt_arg->error = 0; + smt_arg->from_arg = from_arg; + smt_arg->to_arg = to_arg; + if (dspp->redactbook != NULL) + smt_arg->redact_arg = rlt_arg; + + smt_arg->os = os; + (void) thread_create(NULL, 0, send_merge_thread, smt_arg, 0, curproc, + TS_RUN, minclsyspri); +} + +static void +setup_reader_thread(struct send_reader_thread_arg *srt_arg, + struct dmu_send_params *dspp, struct send_merge_thread_arg *smt_arg, + uint64_t featureflags) +{ + VERIFY0(bqueue_init(&srt_arg->q, zfs_send_queue_ff, + MAX(zfs_send_queue_length, 2 * zfs_max_recordsize), + offsetof(struct send_range, ln))); + srt_arg->smta = smt_arg; + srt_arg->issue_reads = !dspp->dso->dso_dryrun; + srt_arg->featureflags = featureflags; + (void) thread_create(NULL, 0, send_reader_thread, srt_arg, 0, + curproc, TS_RUN, minclsyspri); +} + +static int +setup_resume_points(struct dmu_send_params *dspp, + struct send_thread_arg *to_arg, struct redact_list_thread_arg *from_arg, + struct redact_list_thread_arg *rlt_arg, + struct send_merge_thread_arg *smt_arg, boolean_t resuming, objset_t *os, + redaction_list_t *redact_rl, nvlist_t *nvl) +{ + dsl_dataset_t *to_ds = dspp->to_ds; + int err = 0; + + uint64_t obj = 0; + uint64_t blkid = 0; + if (resuming) { + obj = dspp->resumeobj; + dmu_object_info_t to_doi; + err = dmu_object_info(os, obj, &to_doi); + if (err != 0) + return (err); + + blkid = dspp->resumeoff / to_doi.doi_data_block_size; + } + /* + * If we're resuming a redacted send, we can skip to the appropriate + * point in the redaction bookmark by binary searching through it. + */ + if (redact_rl != NULL) { + SET_BOOKMARK(&rlt_arg->resume, to_ds->ds_object, obj, 0, blkid); + } + + SET_BOOKMARK(&to_arg->resume, to_ds->ds_object, obj, 0, blkid); + if (nvlist_exists(nvl, BEGINNV_REDACT_FROM_SNAPS)) { + uint64_t objset = dspp->ancestor_zb.zbm_redaction_obj; + /* + * Note: If the resume point is in an object whose + * blocksize is different in the from vs to snapshots, + * we will have divided by the "wrong" blocksize. + * However, in this case fromsnap's send_cb() will + * detect that the blocksize has changed and therefore + * ignore this object. + * + * If we're resuming a send from a redaction bookmark, + * we still cannot accidentally suggest blocks behind + * the to_ds. 
In addition, we know that any blocks in + * the object in the to_ds will have to be sent, since + * the size changed. Therefore, we can't cause any harm + * this way either. + */ + SET_BOOKMARK(&from_arg->resume, objset, obj, 0, blkid); + } + if (resuming) { + fnvlist_add_uint64(nvl, BEGINNV_RESUME_OBJECT, dspp->resumeobj); + fnvlist_add_uint64(nvl, BEGINNV_RESUME_OFFSET, dspp->resumeoff); + } + return (0); +} + +static dmu_sendstatus_t * +setup_send_progress(struct dmu_send_params *dspp) +{ + dmu_sendstatus_t *dssp = kmem_zalloc(sizeof (*dssp), KM_SLEEP); + dssp->dss_outfd = dspp->outfd; + dssp->dss_off = dspp->off; + dssp->dss_proc = curproc; + mutex_enter(&dspp->to_ds->ds_sendstream_lock); + list_insert_head(&dspp->to_ds->ds_sendstreams, dssp); + mutex_exit(&dspp->to_ds->ds_sendstream_lock); + return (dssp); +} + +/* + * Actually do the bulk of the work in a zfs send. + * + * The idea is that we want to do a send from ancestor_zb to to_ds. We also + * want to not send any data that has been modified by all the datasets in + * redactsnaparr, and store the list of blocks that are redacted in this way in + * a bookmark named redactbook, created on the to_ds. We do this by creating + * several worker threads, whose function is described below. + * + * There are three cases. + * The first case is a redacted zfs send. In this case there are 5 threads. + * The first thread is the to_ds traversal thread: it calls dataset_traverse on + * the to_ds and finds all the blocks that have changed since ancestor_zb (if + * it's a full send, that's all blocks in the dataset). It then sends those + * blocks on to the send merge thread. The redact list thread takes the data + * from the redaction bookmark and sends those blocks on to the send merge + * thread. The send merge thread takes the data from the to_ds traversal + * thread, and combines it with the redaction records from the redact list + * thread. If a block appears in both the to_ds's data and the redaction data, + * the send merge thread will mark it as redacted and send it on to the prefetch + * thread. Otherwise, the send merge thread will send the block on to the + * prefetch thread unchanged. The prefetch thread will issue prefetch reads for + * any data that isn't redacted, and then send the data on to the main thread. + * The main thread behaves the same as in a normal send case, issuing demand + * reads for data blocks and sending out records over the network + * + * The graphic below diagrams the flow of data in the case of a redacted zfs + * send. Each box represents a thread, and each line represents the flow of + * data. + * + * Records from the | + * redaction bookmark | + * +--------------------+ | +---------------------------+ + * | | v | Send Merge Thread | + * | Redact List Thread +----------> Apply redaction marks to | + * | | | records as specified by | + * +--------------------+ | redaction ranges | + * +----^---------------+------+ + * | | Merged data + * | | + * | +------------v--------+ + * | | Prefetch Thread | + * +--------------------+ | | Issues prefetch | + * | to_ds Traversal | | | reads of data blocks| + * | Thread (finds +---------------+ +------------+--------+ + * | candidate blocks) | Blocks modified | Prefetched data + * +--------------------+ by to_ds since | + * ancestor_zb +------------v----+ + * | Main Thread | File Descriptor + * | Sends data over +->(to zfs receive) + * | wire | + * +-----------------+ + * + * The second case is an incremental send from a redaction bookmark. 
The to_ds + * traversal thread and the main thread behave the same as in the redacted + * send case. The new thread is the from bookmark traversal thread. It + * iterates over the redaction list in the redaction bookmark, and enqueues + * records for each block that was redacted in the original send. The send + * merge thread now has to merge the data from the two threads. For details + * about that process, see the header comment of send_merge_thread(). Any data + * it decides to send on will be prefetched by the prefetch thread. Note that + * you can perform a redacted send from a redaction bookmark; in that case, + * the data flow behaves very similarly to the flow in the redacted send case, + * except with the addition of the bookmark traversal thread iterating over the + * redaction bookmark. The send_merge_thread also has to take on the + * responsibility of merging the redact list thread's records, the bookmark + * traversal thread's records, and the to_ds records. + * + * +---------------------+ + * | | + * | Redact List Thread +--------------+ + * | | | + * +---------------------+ | + * Blocks in redaction list | Ranges modified by every secure snap + * of from bookmark | (or EOS if not redacted) + * | + * +---------------------+ | +----v----------------------+ + * | bookmark Traversal | v | Send Merge Thread | + * | Thread (finds +---------> Merges bookmark, rlt, and | + * | candidate blocks) | | to_ds send records | + * +---------------------+ +----^---------------+------+ + * | | Merged data + * | +------------v--------+ + * | | Prefetch Thread | + * +--------------------+ | | Issues prefetch | + * | to_ds Traversal | | | reads of data blocks| + * | Thread (finds +---------------+ +------------+--------+ + * | candidate blocks) | Blocks modified | Prefetched data + * +--------------------+ by to_ds since +------------v----+ + * ancestor_zb | Main Thread | File Descriptor + * | Sends data over +->(to zfs receive) + * | wire | + * +-----------------+ + * + * The final case is a simple zfs full or incremental send. The to_ds traversal + * thread behaves the same as always. The redact list thread is never started. + * The send merge thread takes all the blocks that the to_ds traversal thread + * sends it, prefetches the data, and sends the blocks on to the main thread. + * The main thread sends the data over the wire. + * + * To keep performance acceptable, we want to prefetch the data in the worker + * threads. While the to_ds thread could simply use the TRAVERSE_PREFETCH + * feature built into traverse_dataset, the combining and deletion of records + * due to redaction and sends from redaction bookmarks mean that we could + * issue many unnecessary prefetches. As a result, we only prefetch data + * after we've determined that the record is not going to be redacted. To + * prevent the prefetching from getting too far ahead of the main thread, the + * blocking queues that are used for communication are capped not by the + * number of entries in the queue, but by the sum of the size of the + * prefetches associated with them. The limit on the amount of data that the + * thread can prefetch beyond what the main thread has reached is controlled + * by the global variable zfs_send_queue_length. In addition, to prevent poor + * performance in the beginning of a send, we also limit the distance ahead + * that the traversal threads can be. That distance is controlled by the + * zfs_send_no_prefetch_queue_length tunable. + * + * Note: Releases dp using the specified tag.
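+ *
+ * As a concrete illustration of the byte-based capping described
+ * above, each worker queue is initialized with this pattern (taken
+ * from the setup_*_thread() functions above; with the default 1M
+ * zfs_max_recordsize the capacity floor works out to 2M):
+ *
+ *	VERIFY0(bqueue_init(&q, zfs_send_no_prefetch_queue_ff,
+ *	    MAX(zfs_send_no_prefetch_queue_length,
+ *	    2 * zfs_max_recordsize),
+ *	    offsetof(struct send_range, ln)));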
+ */ +static int +dmu_send_impl(struct dmu_send_params *dspp) +{ + objset_t *os; + dmu_replay_record_t *drr; + dmu_sendstatus_t *dssp; + dmu_send_cookie_t dsc = {0}; + int err; + uint64_t fromtxg = dspp->ancestor_zb.zbm_creation_txg; + uint64_t featureflags = 0; + struct redact_list_thread_arg *from_arg; + struct send_thread_arg *to_arg; + struct redact_list_thread_arg *rlt_arg; + struct send_merge_thread_arg *smt_arg; + struct send_reader_thread_arg *srt_arg; + struct send_range *range; + redaction_list_t *from_rl = NULL; + redaction_list_t *redact_rl = NULL; + boolean_t resuming = (dspp->resumeobj != 0 || dspp->resumeoff != 0); + boolean_t book_resuming = resuming; + + dsl_dataset_t *to_ds = dspp->to_ds; + zfs_bookmark_phys_t *ancestor_zb = &dspp->ancestor_zb; + dsl_pool_t *dp = dspp->dp; + void *tag = dspp->tag; + + err = dmu_objset_from_ds(to_ds, &os); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + /* + * If this is a non-raw send of an encrypted ds, we can ensure that + * the objset_phys_t is authenticated. This is safe because this is + * either a snapshot or we have owned the dataset, ensuring that + * it can't be modified. + */ + if (!dspp->rawok && os->os_encrypted && + arc_is_unauthenticated(os->os_phys_buf)) { + zbookmark_phys_t zb; + + SET_BOOKMARK(&zb, to_ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + err = arc_untransform(os->os_phys_buf, os->os_spa, + &zb, B_FALSE); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + ASSERT0(arc_is_unauthenticated(os->os_phys_buf)); + } + + if ((err = setup_featureflags(dspp, os, &featureflags)) != 0) { + dsl_pool_rele(dp, tag); + return (err); + } + + /* + * If we're doing a redacted send, hold the bookmark's redaction list. + */ + if (dspp->redactbook != NULL) { + err = dsl_redaction_list_hold_obj(dp, + dspp->redactbook->zbm_redaction_obj, FTAG, + &redact_rl); + if (err != 0) { + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); + } + dsl_redaction_list_long_hold(dp, redact_rl, FTAG); + } + + /* + * If we're sending from a redaction bookmark, hold the redaction list + * so that we can consider sending the redacted blocks. 
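+ * The hold pattern is the same as for the redaction bookmark above: a
+ * regular hold paired with a long hold, released in reverse order on
+ * the way out of this function (a sketch, using the calls from this
+ * file):
+ *
+ *	err = dsl_redaction_list_hold_obj(dp, obj, FTAG, &rl);
+ *	dsl_redaction_list_long_hold(dp, rl, FTAG);
+ *	...
+ *	dsl_redaction_list_long_rele(rl, FTAG);
+ *	dsl_redaction_list_rele(rl, FTAG);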
+ */ + if (ancestor_zb->zbm_redaction_obj != 0) { + err = dsl_redaction_list_hold_obj(dp, + ancestor_zb->zbm_redaction_obj, FTAG, &from_rl); + if (err != 0) { + if (redact_rl != NULL) { + dsl_redaction_list_long_rele(redact_rl, FTAG); + dsl_redaction_list_rele(redact_rl, FTAG); + } + dsl_pool_rele(dp, tag); + return (SET_ERROR(EINVAL)); + } + dsl_redaction_list_long_hold(dp, from_rl, FTAG); + } + + dsl_dataset_long_hold(to_ds, FTAG); + + from_arg = kmem_zalloc(sizeof (*from_arg), KM_SLEEP); + to_arg = kmem_zalloc(sizeof (*to_arg), KM_SLEEP); + rlt_arg = kmem_zalloc(sizeof (*rlt_arg), KM_SLEEP); + smt_arg = kmem_zalloc(sizeof (*smt_arg), KM_SLEEP); + srt_arg = kmem_zalloc(sizeof (*srt_arg), KM_SLEEP); + + drr = create_begin_record(dspp, os, featureflags); + dssp = setup_send_progress(dspp); + + dsc.dsc_drr = drr; + dsc.dsc_dso = dspp->dso; + dsc.dsc_os = os; + dsc.dsc_off = dspp->off; + dsc.dsc_toguid = dsl_dataset_phys(to_ds)->ds_guid; + dsc.dsc_fromtxg = fromtxg; + dsc.dsc_pending_op = PENDING_NONE; + dsc.dsc_featureflags = featureflags; + dsc.dsc_resume_object = dspp->resumeobj; + dsc.dsc_resume_offset = dspp->resumeoff; + + dsl_pool_rele(dp, tag); + + void *payload = NULL; + size_t payload_len = 0; + nvlist_t *nvl = fnvlist_alloc(); + + /* + * If we're doing a redacted send, we include the snapshots we're + * redacted with respect to so that the target system knows what send + * streams can be correctly received on top of this dataset. If we're + * instead sending a redacted dataset, we include the snapshots that the + * dataset was created with respect to. + */ + if (dspp->redactbook != NULL) { + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, + redact_rl->rl_phys->rlp_snaps, + redact_rl->rl_phys->rlp_num_snaps); + } else if (dsl_dataset_feature_is_active(to_ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t *tods_guids; + uint64_t length; + VERIFY(dsl_dataset_get_uint64_array_feature(to_ds, + SPA_FEATURE_REDACTED_DATASETS, &length, &tods_guids)); + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_SNAPS, tods_guids, + length); + } + + /* + * If we're sending from a redaction bookmark, then we should retrieve + * the guids of that bookmark so we can send them over the wire. + */ + if (from_rl != NULL) { + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, + from_rl->rl_phys->rlp_snaps, + from_rl->rl_phys->rlp_num_snaps); + } + + /* + * If the snapshot we're sending from is redacted, include the redaction + * list in the stream. + */ + if (dspp->numfromredactsnaps != NUM_SNAPS_NOT_REDACTED) { + ASSERT3P(from_rl, ==, NULL); + fnvlist_add_uint64_array(nvl, BEGINNV_REDACT_FROM_SNAPS, + dspp->fromredactsnaps, (uint_t)dspp->numfromredactsnaps); + if (dspp->numfromredactsnaps > 0) { + kmem_free(dspp->fromredactsnaps, + dspp->numfromredactsnaps * sizeof (uint64_t)); + dspp->fromredactsnaps = NULL; + } + } + + if (resuming || book_resuming) { + err = setup_resume_points(dspp, to_arg, from_arg, + rlt_arg, smt_arg, resuming, os, redact_rl, nvl); + if (err != 0) + goto out; + } + + if (featureflags & DMU_BACKUP_FEATURE_RAW) { + uint64_t ivset_guid = (ancestor_zb != NULL) ? 
+ ancestor_zb->zbm_ivset_guid : 0; + nvlist_t *keynvl = NULL; + ASSERT(os->os_encrypted); + + err = dsl_crypto_populate_key_nvlist(os, ivset_guid, + &keynvl); + if (err != 0) { + fnvlist_free(nvl); + goto out; + } + + fnvlist_add_nvlist(nvl, "crypt_keydata", keynvl); + fnvlist_free(keynvl); + } + + if (!nvlist_empty(nvl)) { + payload = fnvlist_pack(nvl, &payload_len); + drr->drr_payloadlen = payload_len; + } + + fnvlist_free(nvl); + err = dump_record(&dsc, payload, payload_len); + fnvlist_pack_free(payload, payload_len); + if (err != 0) { + err = dsc.dsc_err; + goto out; + } + + setup_to_thread(to_arg, os, dssp, fromtxg, dspp->rawok); + setup_from_thread(from_arg, from_rl, dssp); + setup_redact_list_thread(rlt_arg, dspp, redact_rl, dssp); + setup_merge_thread(smt_arg, dspp, from_arg, to_arg, rlt_arg, os); + setup_reader_thread(srt_arg, dspp, smt_arg, featureflags); + + range = bqueue_dequeue(&srt_arg->q); + while (err == 0 && !range->eos_marker) { + err = do_dump(&dsc, range); + range = get_next_range(&srt_arg->q, range); + if (issig(JUSTLOOKING) && issig(FORREAL)) + err = SET_ERROR(EINTR); + } + + /* + * If we hit an error or are interrupted, cancel our worker threads and + * clear the queue of any pending records. The threads will pass the + * cancel up the tree of worker threads, and each one will clean up any + * pending records before exiting. + */ + if (err != 0) { + srt_arg->cancel = B_TRUE; + while (!range->eos_marker) { + range = get_next_range(&srt_arg->q, range); + } + } + range_free(range); + + bqueue_destroy(&srt_arg->q); + bqueue_destroy(&smt_arg->q); + if (dspp->redactbook != NULL) + bqueue_destroy(&rlt_arg->q); + bqueue_destroy(&to_arg->q); + bqueue_destroy(&from_arg->q); + + if (err == 0 && srt_arg->error != 0) + err = srt_arg->error; + + if (err != 0) + goto out; + + if (dsc.dsc_pending_op != PENDING_NONE) + if (dump_record(&dsc, NULL, 0) != 0) + err = SET_ERROR(EINTR); + + if (err != 0) { + if (err == EINTR && dsc.dsc_err != 0) + err = dsc.dsc_err; + goto out; + } + + /* + * Send the DRR_END record if this is not a saved stream. + * Otherwise, the omitted DRR_END record will signal to + * the receive side that the stream is incomplete. 
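+ * (For context, a saved stream is what "zfs send --saved" produces
+ * for a partially received dataset; the stream that eventually
+ * completes the receive, e.g. a resumed send via "zfs send -t
+ * <token>" from the source, does terminate in a DRR_END.)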
+ */ + if (!dspp->savedok) { + bzero(drr, sizeof (dmu_replay_record_t)); + drr->drr_type = DRR_END; + drr->drr_u.drr_end.drr_checksum = dsc.dsc_zc; + drr->drr_u.drr_end.drr_toguid = dsc.dsc_toguid; + + if (dump_record(&dsc, NULL, 0) != 0) + err = dsc.dsc_err; + } +out: + mutex_enter(&to_ds->ds_sendstream_lock); + list_remove(&to_ds->ds_sendstreams, dssp); + mutex_exit(&to_ds->ds_sendstream_lock); + + VERIFY(err != 0 || (dsc.dsc_sent_begin && + (dsc.dsc_sent_end || dspp->savedok))); + + kmem_free(drr, sizeof (dmu_replay_record_t)); + kmem_free(dssp, sizeof (dmu_sendstatus_t)); + kmem_free(from_arg, sizeof (*from_arg)); + kmem_free(to_arg, sizeof (*to_arg)); + kmem_free(rlt_arg, sizeof (*rlt_arg)); + kmem_free(smt_arg, sizeof (*smt_arg)); + kmem_free(srt_arg, sizeof (*srt_arg)); + + dsl_dataset_long_rele(to_ds, FTAG); + if (from_rl != NULL) { + dsl_redaction_list_long_rele(from_rl, FTAG); + dsl_redaction_list_rele(from_rl, FTAG); + } + if (redact_rl != NULL) { + dsl_redaction_list_long_rele(redact_rl, FTAG); + dsl_redaction_list_rele(redact_rl, FTAG); + } + + return (err); +} + +int +dmu_send_obj(const char *pool, uint64_t tosnap, uint64_t fromsnap, + boolean_t embedok, boolean_t large_block_ok, boolean_t compressok, + boolean_t rawok, boolean_t savedok, int outfd, offset_t *off, + dmu_send_outparams_t *dsop) +{ + int err; + dsl_dataset_t *fromds; + ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + struct dmu_send_params dspp = {0}; + dspp.embedok = embedok; + dspp.large_block_ok = large_block_ok; + dspp.compressok = compressok; + dspp.outfd = outfd; + dspp.off = off; + dspp.dso = dsop; + dspp.tag = FTAG; + dspp.rawok = rawok; + dspp.savedok = savedok; + + err = dsl_pool_hold(pool, FTAG, &dspp.dp); + if (err != 0) + return (err); + + err = dsl_dataset_hold_obj_flags(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + return (err); + } + + if (fromsnap != 0) { + err = dsl_dataset_hold_obj_flags(dspp.dp, fromsnap, dsflags, + FTAG, &fromds); + if (err != 0) { + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); + dsl_pool_rele(dspp.dp, FTAG); + return (err); + } + dspp.ancestor_zb.zbm_guid = dsl_dataset_phys(fromds)->ds_guid; + dspp.ancestor_zb.zbm_creation_txg = + dsl_dataset_phys(fromds)->ds_creation_txg; + dspp.ancestor_zb.zbm_creation_time = + dsl_dataset_phys(fromds)->ds_creation_time; + + if (dsl_dataset_is_zapified(fromds)) { + (void) zap_lookup(dspp.dp->dp_meta_objset, + fromds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, + &dspp.ancestor_zb.zbm_ivset_guid); + } + + /* See dmu_send for the reasons behind this. 
*/ + uint64_t *fromredact; + + if (!dsl_dataset_get_uint64_array_feature(fromds, + SPA_FEATURE_REDACTED_DATASETS, + &dspp.numfromredactsnaps, + &fromredact)) { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + } else if (dspp.numfromredactsnaps > 0) { + uint64_t size = dspp.numfromredactsnaps * + sizeof (uint64_t); + dspp.fromredactsnaps = kmem_zalloc(size, KM_SLEEP); + bcopy(fromredact, dspp.fromredactsnaps, size); + } + + if (!dsl_dataset_is_before(dspp.to_ds, fromds, 0)) { + err = SET_ERROR(EXDEV); + } else { + dspp.is_clone = (dspp.to_ds->ds_dir != + fromds->ds_dir); + dsl_dataset_rele(fromds, FTAG); + err = dmu_send_impl(&dspp); + } + } else { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dmu_send_impl(&dspp); + } + dsl_dataset_rele(dspp.to_ds, FTAG); + return (err); +} + +int +dmu_send(const char *tosnap, const char *fromsnap, boolean_t embedok, + boolean_t large_block_ok, boolean_t compressok, boolean_t rawok, + boolean_t savedok, uint64_t resumeobj, uint64_t resumeoff, + const char *redactbook, int outfd, offset_t *off, + dmu_send_outparams_t *dsop) +{ + int err = 0; + ds_hold_flags_t dsflags = (rawok) ? 0 : DS_HOLD_FLAG_DECRYPT; + boolean_t owned = B_FALSE; + dsl_dataset_t *fromds = NULL; + zfs_bookmark_phys_t book = {0}; + struct dmu_send_params dspp = {0}; + + dspp.tosnap = tosnap; + dspp.embedok = embedok; + dspp.large_block_ok = large_block_ok; + dspp.compressok = compressok; + dspp.outfd = outfd; + dspp.off = off; + dspp.dso = dsop; + dspp.tag = FTAG; + dspp.resumeobj = resumeobj; + dspp.resumeoff = resumeoff; + dspp.rawok = rawok; + dspp.savedok = savedok; + + if (fromsnap != NULL && strpbrk(fromsnap, "@#") == NULL) + return (SET_ERROR(EINVAL)); + + err = dsl_pool_hold(tosnap, FTAG, &dspp.dp); + if (err != 0) + return (err); + + if (strchr(tosnap, '@') == NULL && spa_writeable(dspp.dp->dp_spa)) { + /* + * We are sending a filesystem or volume. Ensure + * that it doesn't change by owning the dataset. + */ + + if (savedok) { + /* + * We are looking for the dataset that represents the + * partially received send stream. If this stream was + * received as a new snapshot of an existing dataset, + * this will be saved in a hidden clone named + * "//%recv". Otherwise, the stream + * will be saved in the live dataset itself. In + * either case we need to use dsl_dataset_own_force() + * because the stream is marked as inconsistent, + * which would normally make it unavailable to be + * owned. 
+ */ + char *name = kmem_asprintf("%s/%s", tosnap, + recv_clone_name); + err = dsl_dataset_own_force(dspp.dp, name, dsflags, + FTAG, &dspp.to_ds); + if (err == ENOENT) { + err = dsl_dataset_own_force(dspp.dp, tosnap, + dsflags, FTAG, &dspp.to_ds); + } + + if (err == 0) { + err = zap_lookup(dspp.dp->dp_meta_objset, + dspp.to_ds->ds_object, + DS_FIELD_RESUME_TOGUID, 8, 1, + &dspp.saved_guid); + } + + if (err == 0) { + err = zap_lookup(dspp.dp->dp_meta_objset, + dspp.to_ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, + sizeof (dspp.saved_toname), + dspp.saved_toname); + } + if (err != 0) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + + kmem_strfree(name); + } else { + err = dsl_dataset_own(dspp.dp, tosnap, dsflags, + FTAG, &dspp.to_ds); + } + owned = B_TRUE; + } else { + err = dsl_dataset_hold_flags(dspp.dp, tosnap, dsflags, FTAG, + &dspp.to_ds); + } + + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + return (err); + } + + if (redactbook != NULL) { + char path[ZFS_MAX_DATASET_NAME_LEN]; + (void) strlcpy(path, tosnap, sizeof (path)); + char *at = strchr(path, '@'); + if (at == NULL) { + err = EINVAL; + } else { + (void) snprintf(at, sizeof (path) - (at - path), "#%s", + redactbook); + err = dsl_bookmark_lookup(dspp.dp, path, + NULL, &book); + dspp.redactbook = &book; + } + } + + if (err != 0) { + dsl_pool_rele(dspp.dp, FTAG); + if (owned) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + else + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); + return (err); + } + + if (fromsnap != NULL) { + zfs_bookmark_phys_t *zb = &dspp.ancestor_zb; + int fsnamelen; + if (strpbrk(tosnap, "@#") != NULL) + fsnamelen = strpbrk(tosnap, "@#") - tosnap; + else + fsnamelen = strlen(tosnap); + + /* + * If the fromsnap is in a different filesystem, then + * mark the send stream as a clone. + */ + if (strncmp(tosnap, fromsnap, fsnamelen) != 0 || + (fromsnap[fsnamelen] != '@' && + fromsnap[fsnamelen] != '#')) { + dspp.is_clone = B_TRUE; + } + + if (strchr(fromsnap, '@') != NULL) { + err = dsl_dataset_hold(dspp.dp, fromsnap, FTAG, + &fromds); + + if (err != 0) { + ASSERT3P(fromds, ==, NULL); + } else { + /* + * We need to make a deep copy of the redact + * snapshots of the from snapshot, because the + * array will be freed when we evict from_ds. 
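+ * A sketch of the hazard being avoided ("count" and "arr" are
+ * hypothetical locals, shown for illustration):
+ *
+ *	uint64_t count, *arr;
+ *	(void) dsl_dataset_get_uint64_array_feature(fromds,
+ *	    SPA_FEATURE_REDACTED_DATASETS, &count, &arr);
+ *	dsl_dataset_rele(fromds, FTAG);
+ *	...arr may now point into freed memory; hence the bcopy()
+ *	into memory we own below.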
+ */ + uint64_t *fromredact; + if (!dsl_dataset_get_uint64_array_feature( + fromds, SPA_FEATURE_REDACTED_DATASETS, + &dspp.numfromredactsnaps, + &fromredact)) { + dspp.numfromredactsnaps = + NUM_SNAPS_NOT_REDACTED; + } else if (dspp.numfromredactsnaps > 0) { + uint64_t size = + dspp.numfromredactsnaps * + sizeof (uint64_t); + dspp.fromredactsnaps = kmem_zalloc(size, + KM_SLEEP); + bcopy(fromredact, dspp.fromredactsnaps, + size); + } + if (!dsl_dataset_is_before(dspp.to_ds, fromds, + 0)) { + err = SET_ERROR(EXDEV); + } else { + zb->zbm_creation_txg = + dsl_dataset_phys(fromds)-> + ds_creation_txg; + zb->zbm_creation_time = + dsl_dataset_phys(fromds)-> + ds_creation_time; + zb->zbm_guid = + dsl_dataset_phys(fromds)->ds_guid; + zb->zbm_redaction_obj = 0; + + if (dsl_dataset_is_zapified(fromds)) { + (void) zap_lookup( + dspp.dp->dp_meta_objset, + fromds->ds_object, + DS_FIELD_IVSET_GUID, 8, 1, + &zb->zbm_ivset_guid); + } + } + dsl_dataset_rele(fromds, FTAG); + } + } else { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dsl_bookmark_lookup(dspp.dp, fromsnap, dspp.to_ds, + zb); + if (err == EXDEV && zb->zbm_redaction_obj != 0 && + zb->zbm_guid == + dsl_dataset_phys(dspp.to_ds)->ds_guid) + err = 0; + } + + if (err == 0) { + /* dmu_send_impl will call dsl_pool_rele for us. */ + err = dmu_send_impl(&dspp); + } else { + dsl_pool_rele(dspp.dp, FTAG); + } + } else { + dspp.numfromredactsnaps = NUM_SNAPS_NOT_REDACTED; + err = dmu_send_impl(&dspp); + } + if (owned) + dsl_dataset_disown(dspp.to_ds, dsflags, FTAG); + else + dsl_dataset_rele_flags(dspp.to_ds, dsflags, FTAG); + return (err); +} + +static int +dmu_adjust_send_estimate_for_indirects(dsl_dataset_t *ds, uint64_t uncompressed, + uint64_t compressed, boolean_t stream_compressed, uint64_t *sizep) +{ + int err = 0; + uint64_t size; + /* + * Assume that space (both on-disk and in-stream) is dominated by + * data. We will adjust for indirect blocks and the copies property, + * but ignore per-object space used (eg, dnodes and DRR_OBJECT records). + */ + + uint64_t recordsize; + uint64_t record_count; + objset_t *os; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + /* Assume all (uncompressed) blocks are recordsize. */ + if (zfs_override_estimate_recordsize != 0) { + recordsize = zfs_override_estimate_recordsize; + } else if (os->os_phys->os_type == DMU_OST_ZVOL) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &recordsize); + } else { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RECORDSIZE), &recordsize); + } + if (err != 0) + return (err); + record_count = uncompressed / recordsize; + + /* + * If we're estimating a send size for a compressed stream, use the + * compressed data size to estimate the stream size. Otherwise, use the + * uncompressed data size. + */ + size = stream_compressed ? compressed : uncompressed; + + /* + * Subtract out approximate space used by indirect blocks. + * Assume most space is used by data blocks (non-indirect, non-dnode). + * Assume no ditto blocks or internal fragmentation. + * + * Therefore, space used by indirect blocks is sizeof(blkptr_t) per + * block. + */ + size -= record_count * sizeof (blkptr_t); + + /* Add in the space for the record associated with each block. 
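+ * (Worked example with hypothetical numbers: for 1 GiB of
+ * uncompressed data at a 128 KiB recordsize, record_count is 8192,
+ * so the estimate drops 8192 * sizeof (blkptr_t) bytes of indirect
+ * overhead above and adds 8192 * sizeof (dmu_replay_record_t) bytes
+ * here.)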
*/ + size += record_count * sizeof (dmu_replay_record_t); + + *sizep = size; + + return (0); +} + +int +dmu_send_estimate_fast(dsl_dataset_t *origds, dsl_dataset_t *fromds, + zfs_bookmark_phys_t *frombook, boolean_t stream_compressed, + boolean_t saved, uint64_t *sizep) +{ + int err; + dsl_dataset_t *ds = origds; + uint64_t uncomp, comp; + + ASSERT(dsl_pool_config_held(origds->ds_dir->dd_pool)); + ASSERT(fromds == NULL || frombook == NULL); + + /* + * If this is a saved send we may actually be sending + * from the %recv clone used for resuming. + */ + if (saved) { + objset_t *mos = origds->ds_dir->dd_pool->dp_meta_objset; + uint64_t guid; + char dsname[ZFS_MAX_DATASET_NAME_LEN + 6]; + + dsl_dataset_name(origds, dsname); + (void) strcat(dsname, "/"); + (void) strcat(dsname, recv_clone_name); + + err = dsl_dataset_hold(origds->ds_dir->dd_pool, + dsname, FTAG, &ds); + if (err != ENOENT && err != 0) { + return (err); + } else if (err == ENOENT) { + ds = origds; + } + + /* check that this dataset has partially received data */ + err = zap_lookup(mos, ds->ds_object, + DS_FIELD_RESUME_TOGUID, 8, 1, &guid); + if (err != 0) { + err = SET_ERROR(err == ENOENT ? EINVAL : err); + goto out; + } + + err = zap_lookup(mos, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (dsname), dsname); + if (err != 0) { + err = SET_ERROR(err == ENOENT ? EINVAL : err); + goto out; + } + } + + /* tosnap must be a snapshot or the target of a saved send */ + if (!ds->ds_is_snapshot && ds == origds) + return (SET_ERROR(EINVAL)); + + if (fromds != NULL) { + uint64_t used; + if (!fromds->ds_is_snapshot) { + err = SET_ERROR(EINVAL); + goto out; + } + + if (!dsl_dataset_is_before(ds, fromds, 0)) { + err = SET_ERROR(EXDEV); + goto out; + } + + err = dsl_dataset_space_written(fromds, ds, &used, &comp, + &uncomp); + if (err != 0) + goto out; + } else if (frombook != NULL) { + uint64_t used; + err = dsl_dataset_space_written_bookmark(frombook, ds, &used, + &comp, &uncomp); + if (err != 0) + goto out; + } else { + uncomp = dsl_dataset_phys(ds)->ds_uncompressed_bytes; + comp = dsl_dataset_phys(ds)->ds_compressed_bytes; + } + + err = dmu_adjust_send_estimate_for_indirects(ds, uncomp, comp, + stream_compressed, sizep); + /* + * Add the size of the BEGIN and END records to the estimate. 
+ */ + *sizep += 2 * sizeof (dmu_replay_record_t); + +out: + if (ds != origds) + dsl_dataset_rele(ds, FTAG); + return (err); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_send, zfs_send_, corrupt_data, INT, ZMOD_RW, + "Allow sending corrupt data"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_length, INT, ZMOD_RW, + "Maximum send queue length"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, unmodified_spill_blocks, INT, ZMOD_RW, + "Send unmodified spill blocks"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_length, INT, ZMOD_RW, + "Maximum send queue length for non-prefetch queues"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, queue_ff, INT, ZMOD_RW, + "Send queue fill fraction"); + +ZFS_MODULE_PARAM(zfs_send, zfs_send_, no_prefetch_queue_ff, INT, ZMOD_RW, + "Send queue fill fraction for non-prefetch queues"); + +ZFS_MODULE_PARAM(zfs_send, zfs_, override_estimate_recordsize, INT, ZMOD_RW, + "Override block size estimate with fixed size"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_traverse.c b/module/zfs/dmu_traverse.c new file mode 100644 index 000000000000..83830fe39279 --- /dev/null +++ b/module/zfs/dmu_traverse.c @@ -0,0 +1,786 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. 
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/dnode.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/zio.h>
+#include <sys/dmu_impl.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/callb.h>
+#include <sys/zfeature.h>
+
+int32_t zfs_pd_bytes_max = 50 * 1024 * 1024;	/* 50MB */
+int32_t send_holes_without_birth_time = 1;
+
+typedef struct prefetch_data {
+	kmutex_t pd_mtx;
+	kcondvar_t pd_cv;
+	int32_t pd_bytes_fetched;
+	int pd_flags;
+	boolean_t pd_cancel;
+	boolean_t pd_exited;
+	zbookmark_phys_t pd_resume;
+} prefetch_data_t;
+
+typedef struct traverse_data {
+	spa_t *td_spa;
+	uint64_t td_objset;
+	blkptr_t *td_rootbp;
+	uint64_t td_min_txg;
+	zbookmark_phys_t *td_resume;
+	int td_flags;
+	prefetch_data_t *td_pfd;
+	boolean_t td_paused;
+	uint64_t td_hole_birth_enabled_txg;
+	blkptr_cb_t *td_func;
+	void *td_arg;
+	boolean_t td_realloc_possible;
+} traverse_data_t;
+
+static int traverse_dnode(traverse_data_t *td, const blkptr_t *bp,
+    const dnode_phys_t *dnp, uint64_t objset, uint64_t object);
+static void prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *,
+    uint64_t objset, uint64_t object);
+
+static int
+traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg)
+{
+	traverse_data_t *td = arg;
+	zbookmark_phys_t zb;
+
+	if (BP_IS_HOLE(bp))
+		return (0);
+
+	if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(td->td_spa))
+		return (-1);
+
+	SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL,
+	    bp->blk_cksum.zc_word[ZIL_ZC_SEQ]);
+
+	(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg);
+
+	return (0);
+}
+
+static int
+traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg)
+{
+	traverse_data_t *td = arg;
+
+	if (lrc->lrc_txtype == TX_WRITE) {
+		lr_write_t *lr = (lr_write_t *)lrc;
+		blkptr_t *bp = &lr->lr_blkptr;
+		zbookmark_phys_t zb;
+
+		if (BP_IS_HOLE(bp))
+			return (0);
+
+		if (claim_txg == 0 || bp->blk_birth < claim_txg)
+			return (0);
+
+		SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid,
+		    ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp));
+
+		(void) td->td_func(td->td_spa, zilog, bp, &zb, NULL,
+		    td->td_arg);
+	}
+	return (0);
+}
+
+static void
+traverse_zil(traverse_data_t *td, zil_header_t *zh)
+{
+	uint64_t claim_txg = zh->zh_claim_txg;
+
+	/*
+	 * We only want to visit blocks that have been claimed but not yet
+	 * replayed; plus blocks that are already stable in read-only mode.
+	 */
+	if (claim_txg == 0 && spa_writeable(td->td_spa))
+		return;
+
+	zilog_t *zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh);
+	(void) zil_parse(zilog, traverse_zil_block, traverse_zil_record, td,
+	    claim_txg, !(td->td_flags & TRAVERSE_NO_DECRYPT));
+	zil_free(zilog);
+}
+
+typedef enum resume_skip {
+	RESUME_SKIP_ALL,
+	RESUME_SKIP_NONE,
+	RESUME_SKIP_CHILDREN
+} resume_skip_t;
+
+/*
+ * Returns RESUME_SKIP_ALL if td indicates that we are resuming a traversal and
+ * the block indicated by zb does not need to be visited at all. Returns
+ * RESUME_SKIP_CHILDREN if we are resuming a post traversal and we reach the
+ * resume point. This indicates that this block should be visited but not its
+ * children (since they must have been visited in a previous traversal).
+ * Otherwise returns RESUME_SKIP_NONE.
+ */
+static resume_skip_t
+resume_skip_check(traverse_data_t *td, const dnode_phys_t *dnp,
+    const zbookmark_phys_t *zb)
+{
+	if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) {
+		/*
+		 * If we already visited this bp & everything below,
+		 * don't bother doing it again.
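+		 * For instance (illustrative bookmark only): when resuming
+		 * at <objset 5, object 93, level 0, blkid 1024>, any subtree
+		 * that sorts entirely before that point is skipped wholesale.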
+ */ + if (zbookmark_subtree_completed(dnp, zb, td->td_resume)) + return (RESUME_SKIP_ALL); + + /* + * If we found the block we're trying to resume from, zero + * the bookmark out to indicate that we have resumed. + */ + if (bcmp(zb, td->td_resume, sizeof (*zb)) == 0) { + bzero(td->td_resume, sizeof (*zb)); + if (td->td_flags & TRAVERSE_POST) + return (RESUME_SKIP_CHILDREN); + } + } + return (RESUME_SKIP_NONE); +} + +static void +traverse_prefetch_metadata(traverse_data_t *td, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + + if (!(td->td_flags & TRAVERSE_PREFETCH_METADATA)) + return; + /* + * If we are in the process of resuming, don't prefetch, because + * some children will not be needed (and in fact may have already + * been freed). + */ + if (td->td_resume != NULL && !ZB_IS_ZERO(td->td_resume)) + return; + if (BP_IS_HOLE(bp) || bp->blk_birth <= td->td_min_txg) + return; + if (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE) + return; + ASSERT(!BP_IS_REDACTED(bp)); + + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + + (void) arc_read(NULL, td->td_spa, bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); +} + +static boolean_t +prefetch_needed(prefetch_data_t *pfd, const blkptr_t *bp) +{ + ASSERT(pfd->pd_flags & TRAVERSE_PREFETCH_DATA); + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG || BP_IS_REDACTED(bp)) + return (B_FALSE); + return (B_TRUE); +} + +static int +traverse_visitbp(traverse_data_t *td, const dnode_phys_t *dnp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + int err = 0; + arc_buf_t *buf = NULL; + prefetch_data_t *pd = td->td_pfd; + + switch (resume_skip_check(td, dnp, zb)) { + case RESUME_SKIP_ALL: + return (0); + case RESUME_SKIP_CHILDREN: + goto post; + case RESUME_SKIP_NONE: + break; + default: + ASSERT(0); + } + + if (bp->blk_birth == 0) { + /* + * Since this block has a birth time of 0 it must be one of + * two things: a hole created before the + * SPA_FEATURE_HOLE_BIRTH feature was enabled, or a hole + * which has always been a hole in an object. + * + * If a file is written sparsely, then the unwritten parts of + * the file were "always holes" -- that is, they have been + * holes since this object was allocated. However, we (and + * our callers) can not necessarily tell when an object was + * allocated. Therefore, if it's possible that this object + * was freed and then its object number reused, we need to + * visit all the holes with birth==0. + * + * If it isn't possible that the object number was reused, + * then if SPA_FEATURE_HOLE_BIRTH was enabled before we wrote + * all the blocks we will visit as part of this traversal, + * then this hole must have always existed, so we can skip + * it. We visit blocks born after (exclusive) td_min_txg. + * + * Note that the meta-dnode cannot be reallocated. 
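+		 *
+		 * Concretely (hypothetical txgs): if SPA_FEATURE_HOLE_BIRTH
+		 * became active at txg 100 and td_min_txg is 5000, any hole
+		 * punched after activation carries a real birth txg, so a
+		 * birth==0 hole must predate the incremental base and can
+		 * safely be skipped.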
+ */ + if (!send_holes_without_birth_time && + (!td->td_realloc_possible || + zb->zb_object == DMU_META_DNODE_OBJECT) && + td->td_hole_birth_enabled_txg <= td->td_min_txg) + return (0); + } else if (bp->blk_birth <= td->td_min_txg) { + return (0); + } + + if (pd != NULL && !pd->pd_exited && prefetch_needed(pd, bp)) { + uint64_t size = BP_GET_LSIZE(bp); + mutex_enter(&pd->pd_mtx); + ASSERT(pd->pd_bytes_fetched >= 0); + while (pd->pd_bytes_fetched < size && !pd->pd_exited) + cv_wait_sig(&pd->pd_cv, &pd->pd_mtx); + pd->pd_bytes_fetched -= size; + cv_broadcast(&pd->pd_cv); + mutex_exit(&pd->pd_mtx); + } + + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + if (err != 0) + goto post; + return (0); + } + + if (td->td_flags & TRAVERSE_PRE) { + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + goto post; + } + + if (BP_GET_LEVEL(bp) > 0) { + uint32_t flags = ARC_FLAG_WAIT; + int32_t i; + int32_t epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + zbookmark_phys_t *czb; + + ASSERT(!BP_IS_PROTECTED(bp)); + + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, &flags, zb); + if (err != 0) + goto post; + + czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + for (i = 0; i < epb; i++) { + SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + traverse_prefetch_metadata(td, + &((blkptr_t *)buf->b_data)[i], czb); + } + + /* recursively visitbp() blocks below this */ + for (i = 0; i < epb; i++) { + SET_BOOKMARK(czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + err = traverse_visitbp(td, dnp, + &((blkptr_t *)buf->b_data)[i], czb); + if (err != 0) + break; + } + + kmem_free(czb, sizeof (zbookmark_phys_t)); + + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + uint32_t flags = ARC_FLAG_WAIT; + uint32_t zio_flags = ZIO_FLAG_CANFAIL; + int32_t i; + int32_t epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + dnode_phys_t *child_dnp; + + /* + * dnode blocks might have their bonus buffers encrypted, so + * we must be careful to honor TRAVERSE_NO_DECRYPT + */ + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err != 0) + goto post; + + child_dnp = buf->b_data; + + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { + prefetch_dnode_metadata(td, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); + } + + /* recursively visitbp() blocks below this */ + for (i = 0; i < epb; i += child_dnp[i].dn_extra_slots + 1) { + err = traverse_dnode(td, bp, &child_dnp[i], + zb->zb_objset, zb->zb_blkid * epb + i); + if (err != 0) + break; + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + uint32_t zio_flags = ZIO_FLAG_CANFAIL; + arc_flags_t flags = ARC_FLAG_WAIT; + objset_phys_t *osp; + + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + + err = arc_read(NULL, td->td_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + if (err != 0) + goto post; + + osp = buf->b_data; + prefetch_dnode_metadata(td, &osp->os_meta_dnode, zb->zb_objset, + DMU_META_DNODE_OBJECT); + /* + * See the block comment above for the goal of this variable. 
+ * If the maxblkid of the meta-dnode is 0, then we know that + * we've never had more than DNODES_PER_BLOCK objects in the + * dataset, which means we can't have reused any object ids. + */ + if (osp->os_meta_dnode.dn_maxblkid == 0) + td->td_realloc_possible = B_FALSE; + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) + prefetch_dnode_metadata(td, + &osp->os_projectused_dnode, + zb->zb_objset, DMU_PROJECTUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_groupused_dnode, + zb->zb_objset, DMU_GROUPUSED_OBJECT); + prefetch_dnode_metadata(td, &osp->os_userused_dnode, + zb->zb_objset, DMU_USERUSED_OBJECT); + } + + err = traverse_dnode(td, bp, &osp->os_meta_dnode, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if (err == 0 && OBJSET_BUF_HAS_USERUSED(buf)) { + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) + err = traverse_dnode(td, bp, + &osp->os_projectused_dnode, zb->zb_objset, + DMU_PROJECTUSED_OBJECT); + if (err == 0) + err = traverse_dnode(td, bp, + &osp->os_groupused_dnode, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + if (err == 0) + err = traverse_dnode(td, bp, + &osp->os_userused_dnode, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + } + + if (buf) + arc_buf_destroy(buf, &buf); + +post: + if (err == 0 && (td->td_flags & TRAVERSE_POST)) + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); + + if ((td->td_flags & TRAVERSE_HARD) && (err == EIO || err == ECKSUM)) { + /* + * Ignore this disk error as requested by the HARD flag, + * and continue traversal. + */ + err = 0; + } + + /* + * If we are stopping here, set td_resume. + */ + if (td->td_resume != NULL && err != 0 && !td->td_paused) { + td->td_resume->zb_objset = zb->zb_objset; + td->td_resume->zb_object = zb->zb_object; + td->td_resume->zb_level = 0; + /* + * If we have stopped on an indirect block (e.g. due to + * i/o error), we have not visited anything below it. + * Set the bookmark to the first level-0 block that we need + * to visit. This way, the resuming code does not need to + * deal with resuming from indirect blocks. + * + * Note, if zb_level <= 0, dnp may be NULL, so we don't want + * to dereference it. 
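+		 *
+		 * E.g. (made-up numbers): pausing on a level-2 indirect at
+		 * blkid 3 with dn_indblkshift = 17 (epbs = 10) records a
+		 * resume point of level-0 blkid 3 << 20 = 3145728.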
+ */ + td->td_resume->zb_blkid = zb->zb_blkid; + if (zb->zb_level > 0) { + td->td_resume->zb_blkid <<= zb->zb_level * + (dnp->dn_indblkshift - SPA_BLKPTRSHIFT); + } + td->td_paused = B_TRUE; + } + + return (err); +} + +static void +prefetch_dnode_metadata(traverse_data_t *td, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int j; + zbookmark_phys_t czb; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + traverse_prefetch_metadata(td, &dnp->dn_blkptr[j], &czb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + traverse_prefetch_metadata(td, DN_SPILL_BLKPTR(dnp), &czb); + } +} + +static int +traverse_dnode(traverse_data_t *td, const blkptr_t *bp, const dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int j, err = 0; + zbookmark_phys_t czb; + + if (object != DMU_META_DNODE_OBJECT && td->td_resume != NULL && + object < td->td_resume->zb_object) + return (0); + + if (td->td_flags & TRAVERSE_PRE) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, &dnp->dn_blkptr[j], &czb); + if (err != 0) + break; + } + + if (err == 0 && (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) { + SET_BOOKMARK(&czb, objset, object, 0, DMU_SPILL_BLKID); + err = traverse_visitbp(td, dnp, DN_SPILL_BLKPTR(dnp), &czb); + } + + if (err == 0 && (td->td_flags & TRAVERSE_POST)) { + SET_BOOKMARK(&czb, objset, object, ZB_DNODE_LEVEL, + ZB_DNODE_BLKID); + err = td->td_func(td->td_spa, NULL, bp, &czb, dnp, + td->td_arg); + if (err == TRAVERSE_VISIT_NO_CHILDREN) + return (0); + if (err != 0) + return (err); + } + return (err); +} + +/* ARGSUSED */ +static int +traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + prefetch_data_t *pfd = arg; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; + + ASSERT(pfd->pd_bytes_fetched >= 0); + if (zb->zb_level == ZB_DNODE_LEVEL) + return (0); + if (pfd->pd_cancel) + return (SET_ERROR(EINTR)); + + if (!prefetch_needed(pfd, bp)) + return (0); + + mutex_enter(&pfd->pd_mtx); + while (!pfd->pd_cancel && pfd->pd_bytes_fetched >= zfs_pd_bytes_max) + cv_wait_sig(&pfd->pd_cv, &pfd->pd_mtx); + pfd->pd_bytes_fetched += BP_GET_LSIZE(bp); + cv_broadcast(&pfd->pd_cv); + mutex_exit(&pfd->pd_mtx); + + if ((pfd->pd_flags & TRAVERSE_NO_DECRYPT) && BP_IS_PROTECTED(bp)) + zio_flags |= ZIO_FLAG_RAW; + + (void) arc_read(NULL, spa, bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, + zio_flags, &aflags, zb); + + return (0); +} + +static void +traverse_prefetch_thread(void *arg) +{ + traverse_data_t *td_main = arg; + traverse_data_t td = *td_main; + zbookmark_phys_t czb; + fstrans_cookie_t cookie = spl_fstrans_mark(); + + td.td_func = traverse_prefetcher; + td.td_arg = td_main->td_pfd; + td.td_pfd = NULL; + td.td_resume = &td_main->td_pfd->pd_resume; + + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + (void) traverse_visitbp(&td, NULL, td.td_rootbp, &czb); + + mutex_enter(&td_main->td_pfd->pd_mtx); + td_main->td_pfd->pd_exited = B_TRUE; + 
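/* wake the traversal thread, which may be blocked on pd_cv */ +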
cv_broadcast(&td_main->td_pfd->pd_cv); + mutex_exit(&td_main->td_pfd->pd_mtx); + spl_fstrans_unmark(cookie); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). + */ +static int +traverse_impl(spa_t *spa, dsl_dataset_t *ds, uint64_t objset, blkptr_t *rootbp, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + traverse_data_t *td; + prefetch_data_t *pd; + zbookmark_phys_t *czb; + int err; + + ASSERT(ds == NULL || objset == ds->ds_object); + ASSERT(!(flags & TRAVERSE_PRE) || !(flags & TRAVERSE_POST)); + + td = kmem_alloc(sizeof (traverse_data_t), KM_SLEEP); + pd = kmem_zalloc(sizeof (prefetch_data_t), KM_SLEEP); + czb = kmem_alloc(sizeof (zbookmark_phys_t), KM_SLEEP); + + td->td_spa = spa; + td->td_objset = objset; + td->td_rootbp = rootbp; + td->td_min_txg = txg_start; + td->td_resume = resume; + td->td_func = func; + td->td_arg = arg; + td->td_pfd = pd; + td->td_flags = flags; + td->td_paused = B_FALSE; + td->td_realloc_possible = (txg_start == 0 ? B_FALSE : B_TRUE); + + if (spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + VERIFY(spa_feature_enabled_txg(spa, + SPA_FEATURE_HOLE_BIRTH, &td->td_hole_birth_enabled_txg)); + } else { + td->td_hole_birth_enabled_txg = UINT64_MAX; + } + + pd->pd_flags = flags; + if (resume != NULL) + pd->pd_resume = *resume; + mutex_init(&pd->pd_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&pd->pd_cv, NULL, CV_DEFAULT, NULL); + + SET_BOOKMARK(czb, td->td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + /* See comment on ZIL traversal in dsl_scan_visitds. */ + if (ds != NULL && !ds->ds_is_snapshot && !BP_IS_HOLE(rootbp)) { + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + uint32_t flags = ARC_FLAG_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; + ASSERT(!BP_IS_REDACTED(rootbp)); + + if ((td->td_flags & TRAVERSE_NO_DECRYPT) && + BP_IS_PROTECTED(rootbp)) + zio_flags |= ZIO_FLAG_RAW; + + err = arc_read(NULL, td->td_spa, rootbp, arc_getbuf_func, + &buf, ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, czb); + if (err != 0) { + /* + * If both TRAVERSE_HARD and TRAVERSE_PRE are set, + * continue to visitbp so that td_func can be called + * in pre stage, and err will reset to zero. + */ + if (!(td->td_flags & TRAVERSE_HARD) || + !(td->td_flags & TRAVERSE_PRE)) + goto out; + } else { + osp = buf->b_data; + traverse_zil(td, &osp->os_zil_header); + arc_buf_destroy(buf, &buf); + } + } + + if (!(flags & TRAVERSE_PREFETCH_DATA) || + taskq_dispatch(spa->spa_prefetch_taskq, traverse_prefetch_thread, + td, TQ_NOQUEUE) == TASKQID_INVALID) + pd->pd_exited = B_TRUE; + + err = traverse_visitbp(td, NULL, rootbp, czb); + + mutex_enter(&pd->pd_mtx); + pd->pd_cancel = B_TRUE; + cv_broadcast(&pd->pd_cv); + while (!pd->pd_exited) + cv_wait_sig(&pd->pd_cv, &pd->pd_mtx); + mutex_exit(&pd->pd_mtx); +out: + mutex_destroy(&pd->pd_mtx); + cv_destroy(&pd->pd_cv); + + kmem_free(czb, sizeof (zbookmark_phys_t)); + kmem_free(pd, sizeof (struct prefetch_data)); + kmem_free(td, sizeof (struct traverse_data)); + + return (err); +} + +/* + * NB: dataset must not be changing on-disk (eg, is a snapshot or we are + * in syncing context). 
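+ *
+ * A minimal caller sketch (hypothetical callback, illustration only):
+ *
+ *	static int
+ *	count_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
+ *	    const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg)
+ *	{
+ *		if (bp != NULL && !BP_IS_HOLE(bp))
+ *			(*(uint64_t *)arg)++;
+ *		return (0);
+ *	}
+ *
+ *	uint64_t n = 0;
+ *	(void) traverse_dataset(ds, 0, TRAVERSE_PRE, count_cb, &n);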
+ */ +int +traverse_dataset_resume(dsl_dataset_t *ds, uint64_t txg_start, + zbookmark_phys_t *resume, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_impl(ds->ds_dir->dd_pool->dp_spa, ds, ds->ds_object, + &dsl_dataset_phys(ds)->ds_bp, txg_start, resume, flags, func, arg)); +} + +int +traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, + int flags, blkptr_cb_t func, void *arg) +{ + return (traverse_dataset_resume(ds, txg_start, NULL, flags, func, arg)); +} + +int +traverse_dataset_destroyed(spa_t *spa, blkptr_t *blkptr, + uint64_t txg_start, zbookmark_phys_t *resume, int flags, + blkptr_cb_t func, void *arg) +{ + return (traverse_impl(spa, NULL, ZB_DESTROYED_OBJSET, + blkptr, txg_start, resume, flags, func, arg)); +} + +/* + * NB: pool must not be changing on-disk (eg, from zdb or sync context). + */ +int +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) +{ + int err; + dsl_pool_t *dp = spa_get_dsl(spa); + objset_t *mos = dp->dp_meta_objset; + boolean_t hard = (flags & TRAVERSE_HARD); + + /* visit the MOS */ + err = traverse_impl(spa, NULL, 0, spa_get_rootblkptr(spa), + txg_start, NULL, flags, func, arg); + if (err != 0) + return (err); + + /* visit each dataset */ + for (uint64_t obj = 1; err == 0; + err = dmu_object_next(mos, &obj, B_FALSE, txg_start)) { + dmu_object_info_t doi; + + err = dmu_object_info(mos, obj, &doi); + if (err != 0) { + if (hard) + continue; + break; + } + + if (doi.doi_bonus_type == DMU_OT_DSL_DATASET) { + dsl_dataset_t *ds; + uint64_t txg = txg_start; + + dsl_pool_config_enter(dp, FTAG); + err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + dsl_pool_config_exit(dp, FTAG); + if (err != 0) { + if (hard) + continue; + break; + } + if (dsl_dataset_phys(ds)->ds_prev_snap_txg > txg) + txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + err = traverse_dataset(ds, txg, flags, func, arg); + dsl_dataset_rele(ds, FTAG); + if (err != 0) + break; + } + } + if (err == ESRCH) + err = 0; + return (err); +} + +EXPORT_SYMBOL(traverse_dataset); +EXPORT_SYMBOL(traverse_pool); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, pd_bytes_max, INT, ZMOD_RW, + "Max number of bytes to prefetch"); + +#if defined(_KERNEL) +module_param_named(ignore_hole_birth, send_holes_without_birth_time, int, 0644); +MODULE_PARM_DESC(ignore_hole_birth, + "Alias for send_holes_without_birth_time"); +#endif + +ZFS_MODULE_PARAM(zfs, , send_holes_without_birth_time, INT, ZMOD_RW, + "Ignore hole_birth txg for zfs send"); +/* END CSTYLED */ diff --git a/module/zfs/dmu_tx.c b/module/zfs/dmu_tx.c new file mode 100644 index 000000000000..09ef2be94944 --- /dev/null +++ b/module/zfs/dmu_tx.c @@ -0,0 +1,1404 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2012, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dbuf.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_pool.h>
+#include <sys/zap_impl.h>
+#include <sys/spa.h>
+#include <sys/sa.h>
+#include <sys/sa_impl.h>
+#include <sys/zfs_context.h>
+#include <sys/trace_zfs.h>
+
+typedef void (*dmu_tx_hold_func_t)(dmu_tx_t *tx, struct dnode *dn,
+    uint64_t arg1, uint64_t arg2);
+
+dmu_tx_stats_t dmu_tx_stats = {
+	{ "dmu_tx_assigned", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_delay", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_error", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_suspended", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_group", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_reserve", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_memory_reclaim", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_throttle", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_delay", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_over_max", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_dirty_frees_delay", KSTAT_DATA_UINT64 },
+	{ "dmu_tx_quota", KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dmu_tx_ksp;
+
+dmu_tx_t *
+dmu_tx_create_dd(dsl_dir_t *dd)
+{
+	dmu_tx_t *tx = kmem_zalloc(sizeof (dmu_tx_t), KM_SLEEP);
+	tx->tx_dir = dd;
+	if (dd != NULL)
+		tx->tx_pool = dd->dd_pool;
+	list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t),
+	    offsetof(dmu_tx_hold_t, txh_node));
+	list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t),
+	    offsetof(dmu_tx_callback_t, dcb_node));
+	tx->tx_start = gethrtime();
+	return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create(objset_t *os)
+{
+	dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir);
+	tx->tx_objset = os;
+	return (tx);
+}
+
+dmu_tx_t *
+dmu_tx_create_assigned(struct dsl_pool *dp, uint64_t txg)
+{
+	dmu_tx_t *tx = dmu_tx_create_dd(NULL);
+
+	TXG_VERIFY(dp->dp_spa, txg);
+	tx->tx_pool = dp;
+	tx->tx_txg = txg;
+	tx->tx_anyobj = TRUE;
+
+	return (tx);
+}
+
+int
+dmu_tx_is_syncing(dmu_tx_t *tx)
+{
+	return (tx->tx_anyobj);
+}
+
+int
+dmu_tx_private_ok(dmu_tx_t *tx)
+{
+	return (tx->tx_anyobj);
+}
+
+static dmu_tx_hold_t *
+dmu_tx_hold_dnode_impl(dmu_tx_t *tx, dnode_t *dn, enum dmu_tx_hold_type type,
+    uint64_t arg1, uint64_t arg2)
+{
+	dmu_tx_hold_t *txh;
+
+	if (dn != NULL) {
+		(void) zfs_refcount_add(&dn->dn_holds, tx);
+		if (tx->tx_txg != 0) {
+			mutex_enter(&dn->dn_mtx);
+			/*
+			 * dn->dn_assigned_txg == tx->tx_txg doesn't pose a
+			 * problem, but there's no way for it to happen (for
+			 * now, at least).
+ */ + ASSERT(dn->dn_assigned_txg == 0); + dn->dn_assigned_txg = tx->tx_txg; + (void) zfs_refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + } + + txh = kmem_zalloc(sizeof (dmu_tx_hold_t), KM_SLEEP); + txh->txh_tx = tx; + txh->txh_dnode = dn; + zfs_refcount_create(&txh->txh_space_towrite); + zfs_refcount_create(&txh->txh_memory_tohold); + txh->txh_type = type; + txh->txh_arg1 = arg1; + txh->txh_arg2 = arg2; + list_insert_tail(&tx->tx_holds, txh); + + return (txh); +} + +static dmu_tx_hold_t * +dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, + enum dmu_tx_hold_type type, uint64_t arg1, uint64_t arg2) +{ + dnode_t *dn = NULL; + dmu_tx_hold_t *txh; + int err; + + if (object != DMU_NEW_OBJECT) { + err = dnode_hold(os, object, FTAG, &dn); + if (err != 0) { + tx->tx_err = err; + return (NULL); + } + } + txh = dmu_tx_hold_dnode_impl(tx, dn, type, arg1, arg2); + if (dn != NULL) + dnode_rele(dn, FTAG); + return (txh); +} + +void +dmu_tx_add_new_object(dmu_tx_t *tx, dnode_t *dn) +{ + /* + * If we're syncing, they can manipulate any object anyhow, and + * the hold on the dnode_t can cause problems. + */ + if (!dmu_tx_is_syncing(tx)) + (void) dmu_tx_hold_dnode_impl(tx, dn, THT_NEWOBJECT, 0, 0); +} + +/* + * This function reads specified data from disk. The specified data will + * be needed to perform the transaction -- i.e, it will be read after + * we do dmu_tx_assign(). There are two reasons that we read the data now + * (before dmu_tx_assign()): + * + * 1. Reading it now has potentially better performance. The transaction + * has not yet been assigned, so the TXG is not held open, and also the + * caller typically has less locks held when calling dmu_tx_hold_*() than + * after the transaction has been assigned. This reduces the lock (and txg) + * hold times, thus reducing lock contention. + * + * 2. It is easier for callers (primarily the ZPL) to handle i/o errors + * that are detected before they start making changes to the DMU state + * (i.e. now). Once the transaction has been assigned, and some DMU + * state has been changed, it can be difficult to recover from an i/o + * error (e.g. to undo the changes already made in memory at the DMU + * layer). Typically code to do so does not exist in the caller -- it + * assumes that the data has already been cached and thus i/o errors are + * not possible. + * + * It has been observed that the i/o initiated here can be a performance + * problem, and it appears to be optional, because we don't look at the + * data which is read. However, removing this read would only serve to + * move the work elsewhere (after the dmu_tx_assign()), where it may + * have a greater impact on performance (in addition to the impact on + * fault tolerance noted above). 
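+ *
+ * For reference, the canonical consumer pattern looks roughly like this
+ * (sketch only; a real caller handles ERESTART and errors more fully):
+ *
+ *	tx = dmu_tx_create(os);
+ *	dmu_tx_hold_write(tx, object, off, len);    <-- reads happen here
+ *	err = dmu_tx_assign(tx, TXG_WAIT);
+ *	if (err != 0) {
+ *		dmu_tx_abort(tx);
+ *		return (err);
+ *	}
+ *	dmu_write(os, object, off, len, buf, tx);
+ *	dmu_tx_commit(tx);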
+ */ +static int +dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) +{ + int err; + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, level, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); + if (db == NULL) + return (SET_ERROR(EIO)); + err = dbuf_read(db, zio, DB_RF_CANFAIL | DB_RF_NOPREFETCH); + dbuf_rele(db, FTAG); + return (err); +} + +/* ARGSUSED */ +static void +dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) +{ + dnode_t *dn = txh->txh_dnode; + int err = 0; + + if (len == 0) + return; + + (void) zfs_refcount_add_many(&txh->txh_space_towrite, len, FTAG); + + if (zfs_refcount_count(&txh->txh_space_towrite) > 2 * DMU_MAX_ACCESS) + err = SET_ERROR(EFBIG); + + if (dn == NULL) + return; + + /* + * For i/o error checking, read the blocks that will be needed + * to perform the write: the first and last level-0 blocks (if + * they are not aligned, i.e. if they are partial-block writes), + * and all the level-1 blocks. + */ + if (dn->dn_maxblkid == 0) { + if (off < dn->dn_datablksz && + (off > 0 || len < dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } else { + zio_t *zio = zio_root(dn->dn_objset->os_spa, + NULL, NULL, ZIO_FLAG_CANFAIL); + + /* first level-0 block */ + uint64_t start = off >> dn->dn_datablkshift; + if (P2PHASE(off, dn->dn_datablksz) || len < dn->dn_datablksz) { + err = dmu_tx_check_ioerr(zio, dn, 0, start); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + /* last level-0 block */ + uint64_t end = (off + len - 1) >> dn->dn_datablkshift; + if (end != start && end <= dn->dn_maxblkid && + P2PHASE(off + len, dn->dn_datablksz)) { + err = dmu_tx_check_ioerr(zio, dn, 0, end); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + + /* level-1 blocks */ + if (dn->dn_nlevels > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (uint64_t i = (start >> shft) + 1; + i < end >> shft; i++) { + err = dmu_tx_check_ioerr(zio, dn, 1, i); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } + } + + err = zio_wait(zio); + if (err != 0) { + txh->txh_tx->tx_err = err; + } + } +} + +static void +dmu_tx_count_dnode(dmu_tx_hold_t *txh) +{ + (void) zfs_refcount_add_many(&txh->txh_space_towrite, + DNODE_MIN_SIZE, FTAG); +} + +void +dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_WRITE, off, len); + if (txh != NULL) { + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +void +dmu_tx_hold_write_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off, int len) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT3U(len, <=, DMU_MAX_ACCESS); + ASSERT(len == 0 || UINT64_MAX - off >= len - 1); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_WRITE, off, len); + if (txh != NULL) { + dmu_tx_count_write(txh, off, len); + dmu_tx_count_dnode(txh); + } +} + +/* + * This function marks the transaction as being a "net free". The end + * result is that refquotas will be disabled for this transaction, and + * this transaction will be able to use half of the pool space overhead + * (see dsl_pool_adjustedsize()). Therefore this function should only + * be called for transactions that we expect will not cause a net increase + * in the amount of space used (but it's OK if that is occasionally not true). 
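+ *
+ * The ZPL's remove path is a typical caller: an unlink frees more than
+ * it dirties, so marking the tx netfree keeps a dataset that is at its
+ * refquota from failing deletes with EDQUOT.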
+ */
+void
+dmu_tx_mark_netfree(dmu_tx_t *tx)
+{
+	tx->tx_netfree = B_TRUE;
+}
+
+static void
+dmu_tx_hold_free_impl(dmu_tx_hold_t *txh, uint64_t off, uint64_t len)
+{
+	dmu_tx_t *tx = txh->txh_tx;
+	dnode_t *dn = txh->txh_dnode;
+	int err;
+
+	ASSERT(tx->tx_txg == 0);
+
+	dmu_tx_count_dnode(txh);
+
+	if (off >= (dn->dn_maxblkid + 1) * dn->dn_datablksz)
+		return;
+	if (len == DMU_OBJECT_END)
+		len = (dn->dn_maxblkid + 1) * dn->dn_datablksz - off;
+
+	/*
+	 * For i/o error checking, we read the first and last level-0
+	 * blocks if they are not aligned, and all the level-1 blocks.
+	 *
+	 * Note: dbuf_free_range() assumes that we have not instantiated
+	 * any level-0 dbufs that will be completely freed. Therefore we must
+	 * exercise care to not read or count the first and last blocks
+	 * if they are blocksize-aligned.
+	 */
+	if (dn->dn_datablkshift == 0) {
+		if (off != 0 || len < dn->dn_datablksz)
+			dmu_tx_count_write(txh, 0, dn->dn_datablksz);
+	} else {
+		/* first block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off, 1);
+		/* last block will be modified if it is not aligned */
+		if (!IS_P2ALIGNED(off + len, 1 << dn->dn_datablkshift))
+			dmu_tx_count_write(txh, off + len, 1);
+	}
+
+	/*
+	 * Check level-1 blocks.
+	 */
+	if (dn->dn_nlevels > 1) {
+		int shift = dn->dn_datablkshift + dn->dn_indblkshift -
+		    SPA_BLKPTRSHIFT;
+		uint64_t start = off >> shift;
+		uint64_t end = (off + len) >> shift;
+
+		ASSERT(dn->dn_indblkshift != 0);
+
+		/*
+		 * dnode_reallocate() can result in an object with indirect
+		 * blocks having an odd data block size. In this case,
+		 * just check the single block.
+		 */
+		if (dn->dn_datablkshift == 0)
+			start = end = 0;
+
+		zio_t *zio = zio_root(tx->tx_pool->dp_spa,
+		    NULL, NULL, ZIO_FLAG_CANFAIL);
+		for (uint64_t i = start; i <= end; i++) {
+			uint64_t ibyte = i << shift;
+			err = dnode_next_offset(dn, 0, &ibyte, 2, 1, 0);
+			i = ibyte >> shift;
+			if (err == ESRCH || i > end)
+				break;
+			if (err != 0) {
+				tx->tx_err = err;
+				(void) zio_wait(zio);
+				return;
+			}
+
+			(void) zfs_refcount_add_many(&txh->txh_memory_tohold,
+			    1 << dn->dn_indblkshift, FTAG);
+
+			err = dmu_tx_check_ioerr(zio, dn, 1, i);
+			if (err != 0) {
+				tx->tx_err = err;
+				(void) zio_wait(zio);
+				return;
+			}
+		}
+		err = zio_wait(zio);
+		if (err != 0) {
+			tx->tx_err = err;
+			return;
+		}
+	}
+}
+
+void
+dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len)
+{
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset,
+	    object, THT_FREE, off, len);
+	if (txh != NULL)
+		(void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+void
+dmu_tx_hold_free_by_dnode(dmu_tx_t *tx, dnode_t *dn, uint64_t off,
+    uint64_t len)
+{
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_dnode_impl(tx, dn, THT_FREE, off, len);
+	if (txh != NULL)
+		(void) dmu_tx_hold_free_impl(txh, off, len);
+}
+
+static void
+dmu_tx_hold_zap_impl(dmu_tx_hold_t *txh, const char *name)
+{
+	dmu_tx_t *tx = txh->txh_tx;
+	dnode_t *dn = txh->txh_dnode;
+	int err;
+
+	ASSERT(tx->tx_txg == 0);
+
+	dmu_tx_count_dnode(txh);
+
+	/*
+	 * Modifying an almost-full microzap is around the worst case (128KB)
+	 *
+	 * If it is a fat zap, the worst case would be 7*16KB=112KB:
+	 * - 3 blocks overwritten: target leaf, ptrtbl block, header block
+	 * - 4 new blocks written if adding:
+	 *    - 2 blocks for possibly split leaves,
+	 *    - 2 grown ptrtbl blocks
+	 */
+	(void) zfs_refcount_add_many(&txh->txh_space_towrite,
+	    MZAP_MAX_BLKSZ, FTAG);
+
+	if (dn == NULL)
+
return; + + ASSERT3U(DMU_OT_BYTESWAP(dn->dn_type), ==, DMU_BSWAP_ZAP); + + if (dn->dn_maxblkid == 0 || name == NULL) { + /* + * This is a microzap (only one block), or we don't know + * the name. Check the first block for i/o errors. + */ + err = dmu_tx_check_ioerr(NULL, dn, 0, 0); + if (err != 0) { + tx->tx_err = err; + } + } else { + /* + * Access the name so that we'll check for i/o errors to + * the leaf blocks, etc. We ignore ENOENT, as this name + * may not yet exist. + */ + err = zap_lookup_by_dnode(dn, name, 8, 0, NULL); + if (err == EIO || err == ECKSUM || err == ENXIO) { + tx->tx_err = err; + } + } +} + +void +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_ZAP, add, (uintptr_t)name); + if (txh != NULL) + dmu_tx_hold_zap_impl(txh, name); +} + +void +dmu_tx_hold_zap_by_dnode(dmu_tx_t *tx, dnode_t *dn, int add, const char *name) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + ASSERT(dn != NULL); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_ZAP, add, (uintptr_t)name); + if (txh != NULL) + dmu_tx_hold_zap_impl(txh, name); +} + +void +dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + object, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_bonus_by_dnode(dmu_tx_t *tx, dnode_t *dn) +{ + dmu_tx_hold_t *txh; + + ASSERT0(tx->tx_txg); + + txh = dmu_tx_hold_dnode_impl(tx, dn, THT_BONUS, 0, 0); + if (txh) + dmu_tx_count_dnode(txh); +} + +void +dmu_tx_hold_space(dmu_tx_t *tx, uint64_t space) +{ + dmu_tx_hold_t *txh; + + ASSERT(tx->tx_txg == 0); + + txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, + DMU_NEW_OBJECT, THT_SPACE, space, 0); + if (txh) { + (void) zfs_refcount_add_many( + &txh->txh_space_towrite, space, FTAG); + } +} + +#ifdef ZFS_DEBUG +void +dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) +{ + boolean_t match_object = B_FALSE; + boolean_t match_offset = B_FALSE; + + DB_DNODE_ENTER(db); + dnode_t *dn = DB_DNODE(db); + ASSERT(tx->tx_txg != 0); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); + ASSERT3U(dn->dn_object, ==, db->db.db_object); + + if (tx->tx_anyobj) { + DB_DNODE_EXIT(db); + return; + } + + /* XXX No checking on the meta dnode for now */ + if (db->db.db_object == DMU_META_DNODE_OBJECT) { + DB_DNODE_EXIT(db); + return; + } + + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; + txh = list_next(&tx->tx_holds, txh)) { + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + if (txh->txh_dnode == dn && txh->txh_type != THT_NEWOBJECT) + match_object = TRUE; + if (txh->txh_dnode == NULL || txh->txh_dnode == dn) { + int datablkshift = dn->dn_datablkshift ? + dn->dn_datablkshift : SPA_MAXBLOCKSHIFT; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + int shift = datablkshift + epbs * db->db_level; + uint64_t beginblk = shift >= 64 ? 0 : + (txh->txh_arg1 >> shift); + uint64_t endblk = shift >= 64 ? 0 : + ((txh->txh_arg1 + txh->txh_arg2 - 1) >> shift); + uint64_t blkid = db->db_blkid; + + /* XXX txh_arg2 better not be zero... */ + + dprintf("found txh type %x beginblk=%llx endblk=%llx\n", + txh->txh_type, beginblk, endblk); + + switch (txh->txh_type) { + case THT_WRITE: + if (blkid >= beginblk && blkid <= endblk) + match_offset = TRUE; + /* + * We will let this hold work for the bonus + * or spill buffer so that we don't need to + * hold it when creating a new object. 
+ */
+			if (blkid == DMU_BONUS_BLKID ||
+			    blkid == DMU_SPILL_BLKID)
+				match_offset = TRUE;
+			/*
+			 * They might have to increase nlevels,
+			 * thus dirtying the new TLIBs. Or they
+			 * might have to change the block size,
+			 * thus dirtying the new lvl=0 blk=0.
+			 */
+			if (blkid == 0)
+				match_offset = TRUE;
+			break;
+		case THT_FREE:
+			/*
+			 * We will dirty all the level 1 blocks in
+			 * the free range and perhaps the first and
+			 * last level 0 block.
+			 */
+			if (blkid >= beginblk && (blkid <= endblk ||
+			    txh->txh_arg2 == DMU_OBJECT_END))
+				match_offset = TRUE;
+			break;
+		case THT_SPILL:
+			if (blkid == DMU_SPILL_BLKID)
+				match_offset = TRUE;
+			break;
+		case THT_BONUS:
+			if (blkid == DMU_BONUS_BLKID)
+				match_offset = TRUE;
+			break;
+		case THT_ZAP:
+			match_offset = TRUE;
+			break;
+		case THT_NEWOBJECT:
+			match_object = TRUE;
+			break;
+		default:
+			cmn_err(CE_PANIC, "bad txh_type %d",
+			    txh->txh_type);
+		}
+		}
+		if (match_object && match_offset) {
+			DB_DNODE_EXIT(db);
+			return;
+		}
+	}
+	DB_DNODE_EXIT(db);
+	panic("dirtying dbuf obj=%llx lvl=%u blkid=%llx but not tx_held\n",
+	    (u_longlong_t)db->db.db_object, db->db_level,
+	    (u_longlong_t)db->db_blkid);
+}
+#endif
+
+/*
+ * If we can't do 10 iops, something is wrong. Let us go ahead
+ * and hit zfs_dirty_data_max.
+ */
+hrtime_t zfs_delay_max_ns = 100 * MICROSEC; /* 100 milliseconds */
+int zfs_delay_resolution_ns = 100 * 1000; /* 100 microseconds */
+
+/*
+ * We delay transactions when we've determined that the backend storage
+ * isn't able to accommodate the rate of incoming writes.
+ *
+ * If there is already a transaction waiting, we delay relative to when
+ * that transaction finishes waiting. This way the calculated min_time
+ * is independent of the number of threads concurrently executing
+ * transactions.
+ *
+ * If we are the only waiter, wait relative to when the transaction
+ * started, rather than the current time. This credits the transaction for
+ * "time already served", e.g. reading indirect blocks.
+ *
+ * The minimum time for a transaction to take is calculated as:
+ *     min_time = scale * (dirty - min) / (max - dirty)
+ *     min_time is then capped at zfs_delay_max_ns.
+ *
+ * The delay has two degrees of freedom that can be adjusted via tunables.
+ * The percentage of dirty data at which we start to delay is defined by
+ * zfs_delay_min_dirty_percent. This should typically be at or above
+ * zfs_vdev_async_write_active_max_dirty_percent so that we only start to
+ * delay after writing at full speed has failed to keep up with the incoming
+ * write rate. The scale of the curve is defined by zfs_delay_scale. Roughly
+ * speaking, this variable determines the amount of delay at the midpoint of
+ * the curve.
+ *
+ * delay
+ *  10ms +-------------------------------------------------------------*+
+ *       |                                                             *|
+ *   9ms +                                                             *+
+ *       |                                                             *|
+ *   8ms +                                                             *+
+ *       |                                                            * |
+ *   7ms +                                                            * +
+ *       |                                                            * |
+ *   6ms +                                                            * +
+ *       |                                                            * |
+ *   5ms +                                                           *  +
+ *       |                                                           *  |
+ *   4ms +                                                           *  +
+ *       |                                                           *  |
+ *   3ms +                                                          *   +
+ *       |                                                          *   |
+ *   2ms +       (midpoint)                                        *    +
+ *       |          |                                             **    |
+ *   1ms +          v                                           ***     +
+ *       |       zfs_delay_scale ---------->                ********    |
+ *     0 +-------------------------------------*********----------------+
+ *       0%                    <- zfs_dirty_data_max ->               100%
+ *
+ * Note that since the delay is added to the outstanding time remaining on the
+ * most recent transaction, the delay is effectively the inverse of IOPS.
+ * Here the midpoint of 500us translates to 2000 IOPS.
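+ *
+ * A worked instance (hypothetical tuning values): with zfs_dirty_data_max
+ * = 4 GiB and zfs_delay_min_dirty_percent = 60, delay begins at 2.4 GiB of
+ * dirty data; at 3.5 GiB dirty with zfs_delay_scale = 500000,
+ *
+ *	min_tx_time = 500000 * (3.5G - 2.4G) / (4G - 3.5G) ~= 1.1 ms,
+ *
+ * i.e. on the order of 900 delayed transactions per second.
+ *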
The shape of the curve + * was chosen such that small changes in the amount of accumulated dirty data + * in the first 3/4 of the curve yield relatively small differences in the + * amount of delay. + * + * The effects can be easier to understand when the amount of delay is + * represented on a log scale: + * + * delay + * 100ms +-------------------------------------------------------------++ + * + + + * | | + * + *+ + * 10ms + *+ + * + ** + + * | (midpoint) ** | + * + | ** + + * 1ms + v **** + + * + zfs_delay_scale ----------> ***** + + * | **** | + * + **** + + * 100us + ** + + * + * + + * | * | + * + * + + * 10us + * + + * + + + * | | + * + + + * +--------------------------------------------------------------+ + * 0% <- zfs_dirty_data_max -> 100% + * + * Note here that only as the amount of dirty data approaches its limit does + * the delay start to increase rapidly. The goal of a properly tuned system + * should be to keep the amount of dirty data out of that range by first + * ensuring that the appropriate limits are set for the I/O scheduler to reach + * optimal throughput on the backend storage, and then by changing the value + * of zfs_delay_scale to increase the steepness of the curve. + */ +static void +dmu_tx_delay(dmu_tx_t *tx, uint64_t dirty) +{ + dsl_pool_t *dp = tx->tx_pool; + uint64_t delay_min_bytes = + zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + hrtime_t wakeup, min_tx_time, now; + + if (dirty <= delay_min_bytes) + return; + + /* + * The caller has already waited until we are under the max. + * We make them pass us the amount of dirty data so we don't + * have to handle the case of it being >= the max, which could + * cause a divide-by-zero if it's == the max. + */ + ASSERT3U(dirty, <, zfs_dirty_data_max); + + now = gethrtime(); + min_tx_time = zfs_delay_scale * + (dirty - delay_min_bytes) / (zfs_dirty_data_max - dirty); + min_tx_time = MIN(min_tx_time, zfs_delay_max_ns); + if (now > tx->tx_start + min_tx_time) + return; + + DTRACE_PROBE3(delay__mintime, dmu_tx_t *, tx, uint64_t, dirty, + uint64_t, min_tx_time); + + mutex_enter(&dp->dp_lock); + wakeup = MAX(tx->tx_start + min_tx_time, + dp->dp_last_wakeup + min_tx_time); + dp->dp_last_wakeup = wakeup; + mutex_exit(&dp->dp_lock); + + zfs_sleep_until(wakeup); +} + +/* + * This routine attempts to assign the transaction to a transaction group. + * To do so, we must determine if there is sufficient free space on disk. + * + * If this is a "netfree" transaction (i.e. we called dmu_tx_mark_netfree() + * on it), then it is assumed that there is sufficient free space, + * unless there's insufficient slop space in the pool (see the comment + * above spa_slop_shift in spa_misc.c). + * + * If it is not a "netfree" transaction, then if the data already on disk + * is over the allowed usage (e.g. quota), this will fail with EDQUOT or + * ENOSPC. Otherwise, if the current rough estimate of pending changes, + * plus the rough estimate of this transaction's changes, may exceed the + * allowed usage, then this will fail with ERESTART, which will cause the + * caller to wait for the pending changes to be written to disk (by waiting + * for the next TXG to open), and then check the space usage again. + * + * The rough estimate of pending changes is comprised of the sum of: + * + * - this transaction's holds' txh_space_towrite + * + * - dd_tempreserved[], which is the sum of in-flight transactions' + * holds' txh_space_towrite (i.e. those transactions that have called + * dmu_tx_assign() but not yet called dmu_tx_commit()). 
+ * + * - dd_space_towrite[], which is the amount of dirtied dbufs. + * + * Note that all of these values are inflated by spa_get_worst_case_asize(), + * which means that we may get ERESTART well before we are actually in danger + * of running out of space, but this also mitigates any small inaccuracies + * in the rough estimate (e.g. txh_space_towrite doesn't take into account + * indirect blocks, and dd_space_towrite[] doesn't take into account changes + * to the MOS). + * + * Note that due to this algorithm, it is possible to exceed the allowed + * usage by one transaction. Also, as we approach the allowed usage, + * we will allow a very limited amount of changes into each TXG, thus + * decreasing performance. + */ +static int +dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + spa_t *spa = tx->tx_pool->dp_spa; + + ASSERT0(tx->tx_txg); + + if (tx->tx_err) { + DMU_TX_STAT_BUMP(dmu_tx_error); + return (tx->tx_err); + } + + if (spa_suspended(spa)) { + DMU_TX_STAT_BUMP(dmu_tx_suspended); + + /* + * If the user has indicated a blocking failure mode + * then return ERESTART which will block in dmu_tx_wait(). + * Otherwise, return EIO so that an error can get + * propagated back to the VOP calls. + * + * Note that we always honor the txg_how flag regardless + * of the failuremode setting. + */ + if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE && + !(txg_how & TXG_WAIT)) + return (SET_ERROR(EIO)); + + return (SET_ERROR(ERESTART)); + } + + if (!tx->tx_dirty_delayed && + dsl_pool_need_dirty_delay(tx->tx_pool)) { + tx->tx_wait_dirty = B_TRUE; + DMU_TX_STAT_BUMP(dmu_tx_dirty_delay); + return (SET_ERROR(ERESTART)); + } + + tx->tx_txg = txg_hold_open(tx->tx_pool, &tx->tx_txgh); + tx->tx_needassign_txh = NULL; + + /* + * NB: No error returns are allowed after txg_hold_open, but + * before processing the dnode holds, due to the + * dmu_tx_unassign() logic. + */ + + uint64_t towrite = 0; + uint64_t tohold = 0; + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + if (dn != NULL) { + /* + * This thread can't hold the dn_struct_rwlock + * while assigning the tx, because this can lead to + * deadlock. Specifically, if this dnode is already + * assigned to an earlier txg, this thread may need + * to wait for that txg to sync (the ERESTART case + * below). The other thread that has assigned this + * dnode to an earlier txg prevents this txg from + * syncing until its tx can complete (calling + * dmu_tx_commit()), but it may need to acquire the + * dn_struct_rwlock to do so (e.g. via + * dmu_buf_hold*()). + * + * Note that this thread can't hold the lock for + * read either, but the rwlock doesn't record + * enough information to make that assertion. 
+ */ + ASSERT(!RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + mutex_enter(&dn->dn_mtx); + if (dn->dn_assigned_txg == tx->tx_txg - 1) { + mutex_exit(&dn->dn_mtx); + tx->tx_needassign_txh = txh; + DMU_TX_STAT_BUMP(dmu_tx_group); + return (SET_ERROR(ERESTART)); + } + if (dn->dn_assigned_txg == 0) + dn->dn_assigned_txg = tx->tx_txg; + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + (void) zfs_refcount_add(&dn->dn_tx_holds, tx); + mutex_exit(&dn->dn_mtx); + } + towrite += zfs_refcount_count(&txh->txh_space_towrite); + tohold += zfs_refcount_count(&txh->txh_memory_tohold); + } + + /* needed allocation: worst-case estimate of write space */ + uint64_t asize = spa_get_worst_case_asize(tx->tx_pool->dp_spa, towrite); + /* calculate memory footprint estimate */ + uint64_t memory = towrite + tohold; + + if (tx->tx_dir != NULL && asize != 0) { + int err = dsl_dir_tempreserve_space(tx->tx_dir, memory, + asize, tx->tx_netfree, &tx->tx_tempreserve_cookie, tx); + if (err != 0) + return (err); + } + + DMU_TX_STAT_BUMP(dmu_tx_assigned); + + return (0); +} + +static void +dmu_tx_unassign(dmu_tx_t *tx) +{ + if (tx->tx_txg == 0) + return; + + txg_rele_to_quiesce(&tx->tx_txgh); + + /* + * Walk the transaction's hold list, removing the hold on the + * associated dnode, and notifying waiters if the refcount drops to 0. + */ + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); + txh && txh != tx->tx_needassign_txh; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + txg_rele_to_sync(&tx->tx_txgh); + + tx->tx_lasttried_txg = tx->tx_txg; + tx->tx_txg = 0; +} + +/* + * Assign tx to a transaction group; txg_how is a bitmask: + * + * If TXG_WAIT is set and the currently open txg is full, this function + * will wait until there's a new txg. This should be used when no locks + * are being held. With this bit set, this function will only fail if + * we're truly out of space (or over quota). + * + * If TXG_WAIT is *not* set and we can't assign into the currently open + * txg without blocking, this function will return immediately with + * ERESTART. This should be used whenever locks are being held. On an + * ERESTART error, the caller should drop all locks, call dmu_tx_wait(), + * and try again. + * + * If TXG_NOTHROTTLE is set, this indicates that this tx should not be + * delayed due on the ZFS Write Throttle (see comments in dsl_pool.c for + * details on the throttle). This is used by the VFS operations, after + * they have already called dmu_tx_wait() (though most likely on a + * different tx). + */ +int +dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how) +{ + int err; + + ASSERT(tx->tx_txg == 0); + ASSERT0(txg_how & ~(TXG_WAIT | TXG_NOTHROTTLE)); + ASSERT(!dsl_pool_sync_context(tx->tx_pool)); + + /* If we might wait, we must not hold the config lock. 
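(Waiting can require a txg to sync, and syncing context takes this lock as writer, so sleeping while holding it risks deadlock.)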
*/
+	IMPLY((txg_how & TXG_WAIT), !dsl_pool_config_held(tx->tx_pool));
+
+	if ((txg_how & TXG_NOTHROTTLE))
+		tx->tx_dirty_delayed = B_TRUE;
+
+	while ((err = dmu_tx_try_assign(tx, txg_how)) != 0) {
+		dmu_tx_unassign(tx);
+
+		if (err != ERESTART || !(txg_how & TXG_WAIT))
+			return (err);
+
+		dmu_tx_wait(tx);
+	}
+
+	txg_rele_to_quiesce(&tx->tx_txgh);
+
+	return (0);
+}
+
+void
+dmu_tx_wait(dmu_tx_t *tx)
+{
+	spa_t *spa = tx->tx_pool->dp_spa;
+	dsl_pool_t *dp = tx->tx_pool;
+	hrtime_t before;
+
+	ASSERT(tx->tx_txg == 0);
+	ASSERT(!dsl_pool_config_held(tx->tx_pool));
+
+	before = gethrtime();
+
+	if (tx->tx_wait_dirty) {
+		uint64_t dirty;
+
+		/*
+		 * dmu_tx_try_assign() has determined that we need to wait
+		 * because we've consumed much or all of the dirty buffer
+		 * space.
+		 */
+		mutex_enter(&dp->dp_lock);
+		if (dp->dp_dirty_total >= zfs_dirty_data_max)
+			DMU_TX_STAT_BUMP(dmu_tx_dirty_over_max);
+		while (dp->dp_dirty_total >= zfs_dirty_data_max)
+			cv_wait(&dp->dp_spaceavail_cv, &dp->dp_lock);
+		dirty = dp->dp_dirty_total;
+		mutex_exit(&dp->dp_lock);
+
+		dmu_tx_delay(tx, dirty);
+
+		tx->tx_wait_dirty = B_FALSE;
+
+		/*
+		 * Note: setting tx_dirty_delayed only has effect if the
+		 * caller used TXG_WAIT. Otherwise they are going to
+		 * destroy this tx and try again. The common case,
+		 * zfs_write(), uses TXG_WAIT.
+		 */
+		tx->tx_dirty_delayed = B_TRUE;
+	} else if (spa_suspended(spa) || tx->tx_lasttried_txg == 0) {
+		/*
+		 * If the pool is suspended we need to wait until it
+		 * is resumed. Note that it's possible that the pool
+		 * has become active after this thread has tried to
+		 * obtain a tx. If that's the case then tx_lasttried_txg
+		 * would not have been set.
+		 */
+		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+	} else if (tx->tx_needassign_txh) {
+		dnode_t *dn = tx->tx_needassign_txh->txh_dnode;
+
+		mutex_enter(&dn->dn_mtx);
+		while (dn->dn_assigned_txg == tx->tx_lasttried_txg - 1)
+			cv_wait(&dn->dn_notxholds, &dn->dn_mtx);
+		mutex_exit(&dn->dn_mtx);
+		tx->tx_needassign_txh = NULL;
+	} else {
+		/*
+		 * If we have a lot of dirty data just wait until we sync
+		 * out a TXG at which point we'll hopefully have synced
+		 * a portion of the changes.
+		 */
+		txg_wait_synced(dp, spa_last_synced_txg(spa) + 1);
+	}
+
+	spa_tx_assign_add_nsecs(spa, gethrtime() - before);
+}
+
+static void
+dmu_tx_destroy(dmu_tx_t *tx)
+{
+	dmu_tx_hold_t *txh;
+
+	while ((txh = list_head(&tx->tx_holds)) != NULL) {
+		dnode_t *dn = txh->txh_dnode;
+
+		list_remove(&tx->tx_holds, txh);
+		zfs_refcount_destroy_many(&txh->txh_space_towrite,
+		    zfs_refcount_count(&txh->txh_space_towrite));
+		zfs_refcount_destroy_many(&txh->txh_memory_tohold,
+		    zfs_refcount_count(&txh->txh_memory_tohold));
+		kmem_free(txh, sizeof (dmu_tx_hold_t));
+		if (dn != NULL)
+			dnode_rele(dn, tx);
+	}
+
+	list_destroy(&tx->tx_callbacks);
+	list_destroy(&tx->tx_holds);
+	kmem_free(tx, sizeof (dmu_tx_t));
+}
+
+void
+dmu_tx_commit(dmu_tx_t *tx)
+{
+	ASSERT(tx->tx_txg != 0);
+
+	/*
+	 * Go through the transaction's hold list and remove holds on
+	 * associated dnodes, notifying waiters if no holds remain.
+ */ + for (dmu_tx_hold_t *txh = list_head(&tx->tx_holds); txh != NULL; + txh = list_next(&tx->tx_holds, txh)) { + dnode_t *dn = txh->txh_dnode; + + if (dn == NULL) + continue; + + mutex_enter(&dn->dn_mtx); + ASSERT3U(dn->dn_assigned_txg, ==, tx->tx_txg); + + if (zfs_refcount_remove(&dn->dn_tx_holds, tx) == 0) { + dn->dn_assigned_txg = 0; + cv_broadcast(&dn->dn_notxholds); + } + mutex_exit(&dn->dn_mtx); + } + + if (tx->tx_tempreserve_cookie) + dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + + if (tx->tx_anyobj == FALSE) + txg_rele_to_sync(&tx->tx_txgh); + + dmu_tx_destroy(tx); +} + +void +dmu_tx_abort(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg == 0); + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, SET_ERROR(ECANCELED)); + + dmu_tx_destroy(tx); +} + +uint64_t +dmu_tx_get_txg(dmu_tx_t *tx) +{ + ASSERT(tx->tx_txg != 0); + return (tx->tx_txg); +} + +dsl_pool_t * +dmu_tx_pool(dmu_tx_t *tx) +{ + ASSERT(tx->tx_pool != NULL); + return (tx->tx_pool); +} + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. + */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while ((dcb = list_tail(cb_list)) != NULL) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} + +/* + * Interface to hold a bunch of attributes. + * used for creating new files. + * attrsize is the total size of all attributes + * to be added during object creation + * + * For updating/adding a single attribute dmu_tx_hold_sa() should be used. + */ + +/* + * hold necessary attribute name for attribute registration. + * should be a very rare case where this is needed. If it does + * happen it would only happen on the first write to the file system. 
+ */
+static void
+dmu_tx_sa_registration_hold(sa_os_t *sa, dmu_tx_t *tx)
+{
+	if (!sa->sa_need_attr_registration)
+		return;
+
+	for (int i = 0; i != sa->sa_num_attrs; i++) {
+		if (!sa->sa_attr_table[i].sa_registered) {
+			if (sa->sa_reg_attr_obj)
+				dmu_tx_hold_zap(tx, sa->sa_reg_attr_obj,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+			else
+				dmu_tx_hold_zap(tx, DMU_NEW_OBJECT,
+				    B_TRUE, sa->sa_attr_table[i].sa_name);
+		}
+	}
+}
+
+void
+dmu_tx_hold_spill(dmu_tx_t *tx, uint64_t object)
+{
+	dmu_tx_hold_t *txh;
+
+	txh = dmu_tx_hold_object_impl(tx, tx->tx_objset, object,
+	    THT_SPILL, 0, 0);
+	if (txh != NULL)
+		(void) zfs_refcount_add_many(&txh->txh_space_towrite,
+		    SPA_OLD_MAXBLOCKSIZE, FTAG);
+}
+
+void
+dmu_tx_hold_sa_create(dmu_tx_t *tx, int attrsize)
+{
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_layout_attr_obj) {
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+	} else {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (attrsize <= DN_OLD_MAX_BONUSLEN && !sa->sa_force_spill)
+		return;
+
+	(void) dmu_tx_hold_object_impl(tx, tx->tx_objset, DMU_NEW_OBJECT,
+	    THT_SPILL, 0, 0);
+}
+
+/*
+ * Hold an SA attribute:
+ *
+ *	dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+ *
+ * may_grow indicates that this update may grow or rewrite the SA
+ * layout, in which case the layout ZAP (and, if needed, the spill
+ * block) is held as well.
+ */
+void
+dmu_tx_hold_sa(dmu_tx_t *tx, sa_handle_t *hdl, boolean_t may_grow)
+{
+	uint64_t object;
+	sa_os_t *sa = tx->tx_objset->os_sa;
+
+	ASSERT(hdl != NULL);
+
+	object = sa_handle_object(hdl);
+
+	dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus;
+	DB_DNODE_ENTER(db);
+	dmu_tx_hold_bonus_by_dnode(tx, DB_DNODE(db));
+	DB_DNODE_EXIT(db);
+
+	if (tx->tx_objset->os_sa->sa_master_obj == 0)
+		return;
+
+	if (tx->tx_objset->os_sa->sa_reg_attr_obj == 0 ||
+	    tx->tx_objset->os_sa->sa_layout_attr_obj == 0) {
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_LAYOUTS);
+		dmu_tx_hold_zap(tx, sa->sa_master_obj, B_TRUE, SA_REGISTRY);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+		dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL);
+	}
+
+	dmu_tx_sa_registration_hold(sa, tx);
+
+	if (may_grow && tx->tx_objset->os_sa->sa_layout_attr_obj)
+		dmu_tx_hold_zap(tx, sa->sa_layout_attr_obj, B_TRUE, NULL);
+
+	if (sa->sa_force_spill || may_grow || hdl->sa_spill) {
+		ASSERT(tx->tx_txg == 0);
+		dmu_tx_hold_spill(tx, object);
+	} else {
+		dnode_t *dn;
+
+		DB_DNODE_ENTER(db);
+		dn = DB_DNODE(db);
+		if (dn->dn_have_spill) {
+			ASSERT(tx->tx_txg == 0);
+			dmu_tx_hold_spill(tx, object);
+		}
+		DB_DNODE_EXIT(db);
+	}
+}
+
+void
+dmu_tx_init(void)
+{
+	dmu_tx_ksp = kstat_create("zfs", 0, "dmu_tx", "misc",
+	    KSTAT_TYPE_NAMED, sizeof (dmu_tx_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+
+	if (dmu_tx_ksp != NULL) {
+		dmu_tx_ksp->ks_data = &dmu_tx_stats;
+		kstat_install(dmu_tx_ksp);
+	}
+}
+
+void
+dmu_tx_fini(void)
+{
+	if (dmu_tx_ksp != NULL) {
+		kstat_delete(dmu_tx_ksp);
+		dmu_tx_ksp = NULL;
+	}
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(dmu_tx_create);
+EXPORT_SYMBOL(dmu_tx_hold_write);
+EXPORT_SYMBOL(dmu_tx_hold_write_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_free);
+EXPORT_SYMBOL(dmu_tx_hold_free_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_zap);
+EXPORT_SYMBOL(dmu_tx_hold_zap_by_dnode);
+EXPORT_SYMBOL(dmu_tx_hold_bonus);
+EXPORT_SYMBOL(dmu_tx_hold_bonus_by_dnode);
+EXPORT_SYMBOL(dmu_tx_abort);
+EXPORT_SYMBOL(dmu_tx_assign);
+EXPORT_SYMBOL(dmu_tx_wait);
+EXPORT_SYMBOL(dmu_tx_commit);
+EXPORT_SYMBOL(dmu_tx_mark_netfree);
+EXPORT_SYMBOL(dmu_tx_get_txg);
+EXPORT_SYMBOL(dmu_tx_callback_register);
+EXPORT_SYMBOL(dmu_tx_do_callbacks);
+EXPORT_SYMBOL(dmu_tx_hold_spill);
+EXPORT_SYMBOL(dmu_tx_hold_sa_create);
+EXPORT_SYMBOL(dmu_tx_hold_sa);
+#endif
diff --git a/module/zfs/dmu_zfetch.c b/module/zfs/dmu_zfetch.c
new file mode 100644
index 000000000000..5935b5f995be
--- /dev/null
+++ b/module/zfs/dmu_zfetch.c
@@ -0,0 +1,384 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dnode.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/dmu.h>
+#include <sys/dbuf.h>
+#include <sys/kstat.h>
+
+/*
+ * This tunable disables predictive prefetch.  Note that it leaves "prescient"
+ * prefetch (e.g. prefetch for zfs send) intact.  Unlike predictive prefetch,
+ * prescient prefetch never issues i/os that end up not being needed,
+ * so it can't hurt performance.
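+ *
+ * On Linux builds this is surfaced as a module parameter (see the
+ * ZFS_MODULE_PARAM declarations at the bottom of this file), so it can
+ * typically be toggled at run time; a sketch, assuming the usual sysfs
+ * layout (the exact path can vary by platform):
+ *
+ *	# echo 1 > /sys/module/zfs/parameters/zfs_prefetch_disable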
+ */ + +int zfs_prefetch_disable = B_FALSE; + +/* max # of streams per zfetch */ +unsigned int zfetch_max_streams = 8; +/* min time before stream reclaim */ +unsigned int zfetch_min_sec_reap = 2; +/* max bytes to prefetch per stream (default 8MB) */ +unsigned int zfetch_max_distance = 8 * 1024 * 1024; +/* max bytes to prefetch indirects for per stream (default 64MB) */ +unsigned int zfetch_max_idistance = 64 * 1024 * 1024; +/* max number of bytes in an array_read in which we allow prefetching (1MB) */ +unsigned long zfetch_array_rd_sz = 1024 * 1024; + +typedef struct zfetch_stats { + kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_misses; + kstat_named_t zfetchstat_max_streams; +} zfetch_stats_t; + +static zfetch_stats_t zfetch_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "max_streams", KSTAT_DATA_UINT64 }, +}; + +#define ZFETCHSTAT_BUMP(stat) \ + atomic_inc_64(&zfetch_stats.stat.value.ui64); + +kstat_t *zfetch_ksp; + +void +zfetch_init(void) +{ + zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", + KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zfetch_ksp != NULL) { + zfetch_ksp->ks_data = &zfetch_stats; + kstat_install(zfetch_ksp); + } +} + +void +zfetch_fini(void) +{ + if (zfetch_ksp != NULL) { + kstat_delete(zfetch_ksp); + zfetch_ksp = NULL; + } +} + +/* + * This takes a pointer to a zfetch structure and a dnode. It performs the + * necessary setup for the zfetch structure, grokking data from the + * associated dnode. + */ +void +dmu_zfetch_init(zfetch_t *zf, dnode_t *dno) +{ + if (zf == NULL) + return; + + zf->zf_dnode = dno; + + list_create(&zf->zf_stream, sizeof (zstream_t), + offsetof(zstream_t, zs_node)); + + mutex_init(&zf->zf_lock, NULL, MUTEX_DEFAULT, NULL); +} + +static void +dmu_zfetch_stream_remove(zfetch_t *zf, zstream_t *zs) +{ + ASSERT(MUTEX_HELD(&zf->zf_lock)); + list_remove(&zf->zf_stream, zs); + mutex_destroy(&zs->zs_lock); + kmem_free(zs, sizeof (*zs)); +} + +/* + * Clean-up state associated with a zfetch structure (e.g. destroy the + * streams). This doesn't free the zfetch_t itself, that's left to the caller. + */ +void +dmu_zfetch_fini(zfetch_t *zf) +{ + zstream_t *zs; + + mutex_enter(&zf->zf_lock); + while ((zs = list_head(&zf->zf_stream)) != NULL) + dmu_zfetch_stream_remove(zf, zs); + mutex_exit(&zf->zf_lock); + list_destroy(&zf->zf_stream); + mutex_destroy(&zf->zf_lock); + + zf->zf_dnode = NULL; +} + +/* + * If there aren't too many streams already, create a new stream. + * The "blkid" argument is the next block that we expect this stream to access. + * While we're here, clean up old streams (which haven't been + * accessed for at least zfetch_min_sec_reap seconds). + */ +static void +dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid) +{ + zstream_t *zs_next; + int numstreams = 0; + + ASSERT(MUTEX_HELD(&zf->zf_lock)); + + /* + * Clean up old streams. + */ + for (zstream_t *zs = list_head(&zf->zf_stream); + zs != NULL; zs = zs_next) { + zs_next = list_next(&zf->zf_stream, zs); + if (((gethrtime() - zs->zs_atime) / NANOSEC) > + zfetch_min_sec_reap) + dmu_zfetch_stream_remove(zf, zs); + else + numstreams++; + } + + /* + * The maximum number of streams is normally zfetch_max_streams, + * but for small files we lower it such that it's at least possible + * for all the streams to be non-overlapping. + * + * If we are already at the maximum number of streams for this file, + * even after removing old streams, then don't create this stream. 
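+	 *
+	 * As a worked example of the cap computed below (illustrative
+	 * numbers only): with zfetch_max_streams = 8, zfetch_max_distance
+	 * = 8MB, a 128KB block size and dn_maxblkid = 63 (an ~8MB file),
+	 * max_streams = MAX(1, MIN(8, 63 * 131072 / 8388608)) = 1, so a
+	 * small file is limited to a single stream.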
+	 */
+	uint32_t max_streams = MAX(1, MIN(zfetch_max_streams,
+	    zf->zf_dnode->dn_maxblkid * zf->zf_dnode->dn_datablksz /
+	    zfetch_max_distance));
+	if (numstreams >= max_streams) {
+		ZFETCHSTAT_BUMP(zfetchstat_max_streams);
+		return;
+	}
+
+	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
+	zs->zs_blkid = blkid;
+	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid = blkid;
+	zs->zs_atime = gethrtime();
+	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
+
+	list_insert_head(&zf->zf_stream, zs);
+}
+
+/*
+ * This is the predictive prefetch entry point.  It associates the dnode
+ * access specified by the blkid and nblks arguments with a prefetch
+ * stream, predicts further accesses based on that stream's statistics,
+ * and initiates speculative prefetch.
+ * The fetch_data argument specifies whether actual data blocks should be
+ * fetched:
+ *   FALSE -- prefetch only indirect blocks for predicted data blocks;
+ *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
+ */
+void
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data,
+    boolean_t have_lock)
+{
+	zstream_t *zs;
+	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_ahead_blks, max_blks;
+	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+	uint64_t end_of_access_blkid = blkid + nblks;
+	spa_t *spa = zf->zf_dnode->dn_objset->os_spa;
+
+	if (zfs_prefetch_disable)
+		return;
+	/*
+	 * If we haven't yet loaded the indirect vdevs' mappings, we
+	 * can only read from blocks that we carefully ensure are on
+	 * concrete vdevs (or previously-loaded indirect vdevs).  So we
+	 * can't allow the predictive prefetcher to attempt reads of other
+	 * blocks (e.g. of the MOS's dnode object).
+	 */
+	if (!spa_indirect_vdevs_loaded(spa))
+		return;
+
+	/*
+	 * As a fast path for small (single-block) files, ignore access
+	 * to the first block.
+	 */
+	if (blkid == 0)
+		return;
+
+	if (!have_lock)
+		rw_enter(&zf->zf_dnode->dn_struct_rwlock, RW_READER);
+	mutex_enter(&zf->zf_lock);
+
+	/*
+	 * Find matching prefetch stream.  Depending on whether the accesses
+	 * are block-aligned, first block of the new access may either follow
+	 * the last block of the previous access, or be equal to it.
+	 */
+	for (zs = list_head(&zf->zf_stream); zs != NULL;
+	    zs = list_next(&zf->zf_stream, zs)) {
+		if (blkid == zs->zs_blkid || blkid + 1 == zs->zs_blkid) {
+			mutex_enter(&zs->zs_lock);
+			/*
+			 * zs_blkid could have changed before we
+			 * acquired zs_lock; re-check it here.
+			 */
+			if (blkid == zs->zs_blkid) {
+				break;
+			} else if (blkid + 1 == zs->zs_blkid) {
+				blkid++;
+				nblks--;
+				if (nblks == 0) {
+					/* Already prefetched this before. */
+					mutex_exit(&zs->zs_lock);
+					mutex_exit(&zf->zf_lock);
+					if (!have_lock) {
+						rw_exit(&zf->zf_dnode->
+						    dn_struct_rwlock);
+					}
+					return;
+				}
+				break;
+			}
+			mutex_exit(&zs->zs_lock);
+		}
+	}
+
+	if (zs == NULL) {
+		/*
+		 * This access is not part of any existing stream.  Create
+		 * a new stream for it.
+		 */
+		ZFETCHSTAT_BUMP(zfetchstat_misses);
+
+		dmu_zfetch_stream_create(zf, end_of_access_blkid);
+		mutex_exit(&zf->zf_lock);
+		if (!have_lock)
+			rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+		return;
+	}
+
+	/*
+	 * This access was to a block that we issued a prefetch for on
+	 * behalf of this stream.  Issue further prefetches for this stream.
+	 *
+	 * Normally, we start prefetching where we stopped
+	 * prefetching last (zs_pf_blkid).  But when we get our first
+	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
+	 * want to prefetch the block we just accessed.
+	 * In this case, start just after the block we just accessed.
+	 */
+	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
+
+	/*
+	 * Double our amount of prefetched data, but don't let the
+	 * prefetch get further ahead than zfetch_max_distance.
+	 */
+	if (fetch_data) {
+		max_dist_blks =
+		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+		/*
+		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
+		 * want to now be double that, so read that amount again,
+		 * plus the amount we are catching up by (i.e. the amount
+		 * read just now).
+		 */
+		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+		pf_nblks = MIN(pf_ahead_blks, max_blks);
+	} else {
+		pf_nblks = 0;
+	}
+
+	zs->zs_pf_blkid = pf_start + pf_nblks;
+
+	/*
+	 * Do the same for indirects, starting from where we stopped last,
+	 * or where we will stop reading data blocks (and the indirects
+	 * that point to them).
+	 */
+	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+	/*
+	 * We want to double our distance ahead of the data prefetch
+	 * (or reader, if we are not prefetching data).  Previously, we
+	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
+	 * that amount again, plus the amount we are catching up by
+	 * (i.e. the amount read now + the amount of data prefetched now).
+	 */
+	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+	ipf_nblks = MIN(pf_ahead_blks, max_blks);
+	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+	zs->zs_atime = gethrtime();
+	zs->zs_blkid = end_of_access_blkid;
+	mutex_exit(&zs->zs_lock);
+	mutex_exit(&zf->zf_lock);
+
+	/*
+	 * dbuf_prefetch() is asynchronous (even when it needs to read
+	 * indirect blocks), but we still prefer to drop our locks before
+	 * calling it to reduce the time we hold them.
+	 */
+
+	for (int i = 0; i < pf_nblks; i++) {
+		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+	}
+	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+		dbuf_prefetch(zf->zf_dnode, 1, iblk,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+	}
+	if (!have_lock)
+		rw_exit(&zf->zf_dnode->dn_struct_rwlock);
+	ZFETCHSTAT_BUMP(zfetchstat_hits);
+}
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_prefetch, zfs_prefetch_, disable, INT, ZMOD_RW,
+	"Disable all ZFS prefetching");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_streams, UINT, ZMOD_RW,
+	"Max number of streams per zfetch");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, min_sec_reap, UINT, ZMOD_RW,
+	"Min time before stream reclaim");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, max_distance, UINT, ZMOD_RW,
+	"Max bytes to prefetch per stream (default 8MB)");
+
+ZFS_MODULE_PARAM(zfs_prefetch, zfetch_, array_rd_sz, ULONG, ZMOD_RW,
+	"Number of bytes in an array_read");
+/* END CSTYLED */
diff --git a/module/zfs/dnode.c b/module/zfs/dnode.c
new file mode 100644
index 000000000000..00536f2774e7
--- /dev/null
+++ b/module/zfs/dnode.c
@@ -0,0 +1,2575 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dbuf.h>
+#include <sys/dnode.h>
+#include <sys/dmu.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_dataset.h>
+#include <sys/spa.h>
+#include <sys/zio.h>
+#include <sys/dmu_zfetch.h>
+#include <sys/range_tree.h>
+#include <sys/trace_zfs.h>
+#include <sys/zfs_project.h>
+
+dnode_stats_t dnode_stats = {
+	{ "dnode_hold_dbuf_hold",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_dbuf_read",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_alloc_hits",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_alloc_misses",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_alloc_interior",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_alloc_lock_retry",	KSTAT_DATA_UINT64 },
+	{ "dnode_hold_alloc_lock_misses",	KSTAT_DATA_UINT64 },
+	{ "dnode_hold_alloc_type_none",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_free_hits",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_free_misses",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_free_lock_misses",	KSTAT_DATA_UINT64 },
+	{ "dnode_hold_free_lock_retry",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_free_overflow",		KSTAT_DATA_UINT64 },
+	{ "dnode_hold_free_refcount",		KSTAT_DATA_UINT64 },
+	{ "dnode_free_interior_lock_retry",	KSTAT_DATA_UINT64 },
+	{ "dnode_allocate",			KSTAT_DATA_UINT64 },
+	{ "dnode_reallocate",			KSTAT_DATA_UINT64 },
+	{ "dnode_buf_evict",			KSTAT_DATA_UINT64 },
+	{ "dnode_alloc_next_chunk",		KSTAT_DATA_UINT64 },
+	{ "dnode_alloc_race",			KSTAT_DATA_UINT64 },
+	{ "dnode_alloc_next_block",		KSTAT_DATA_UINT64 },
+	{ "dnode_move_invalid",			KSTAT_DATA_UINT64 },
+	{ "dnode_move_recheck1",		KSTAT_DATA_UINT64 },
+	{ "dnode_move_recheck2",		KSTAT_DATA_UINT64 },
+	{ "dnode_move_special",			KSTAT_DATA_UINT64 },
+	{ "dnode_move_handle",			KSTAT_DATA_UINT64 },
+	{ "dnode_move_rwlock",			KSTAT_DATA_UINT64 },
+	{ "dnode_move_active",			KSTAT_DATA_UINT64 },
+};
+
+static kstat_t *dnode_ksp;
+static kmem_cache_t *dnode_cache;
+
+static dnode_phys_t dnode_phys_zero __maybe_unused;
+
+int zfs_default_bs = SPA_MINBLOCKSHIFT;
+int zfs_default_ibs = DN_MAX_INDBLKSHIFT;
+
+#ifdef	_KERNEL
+static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);
+#endif /* _KERNEL */
+
+static int
+dbuf_compare(const void *x1, const void *x2)
+{
+	const dmu_buf_impl_t *d1 = x1;
+	const dmu_buf_impl_t *d2 = x2;
+
+	int cmp = TREE_CMP(d1->db_level, d2->db_level);
+	if (likely(cmp))
+		return (cmp);
+
+	cmp = TREE_CMP(d1->db_blkid, d2->db_blkid);
+	if (likely(cmp))
+		return (cmp);
+
+	if (d1->db_state == DB_SEARCH) {
+		ASSERT3S(d2->db_state, !=, DB_SEARCH);
+		return (-1);
+	} else if (d2->db_state == DB_SEARCH) {
+		ASSERT3S(d1->db_state, !=, DB_SEARCH);
+		return (1);
+	}
+
+	return (TREE_PCMP(d1, d2));
+}
+
+/* ARGSUSED */
+static int
+dnode_cons(void *arg, void *unused, int kmflag)
+{
+	dnode_t *dn = arg;
+	int i;
+
+	rw_init(&dn->dn_struct_rwlock, NULL, RW_NOLOCKDEP, NULL);
+	mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL);
+	mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT,
NULL); + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + cv_init(&dn->dn_nodnholds, NULL, CV_DEFAULT, NULL); + + /* + * Every dbuf has a reference, and dropping a tracked reference is + * O(number of references), so don't track dn_holds. + */ + zfs_refcount_create_untracked(&dn->dn_holds); + zfs_refcount_create(&dn->dn_tx_holds); + list_link_init(&dn->dn_link); + + bzero(&dn->dn_next_nblkptr[0], sizeof (dn->dn_next_nblkptr)); + bzero(&dn->dn_next_nlevels[0], sizeof (dn->dn_next_nlevels)); + bzero(&dn->dn_next_indblkshift[0], sizeof (dn->dn_next_indblkshift)); + bzero(&dn->dn_next_bonustype[0], sizeof (dn->dn_next_bonustype)); + bzero(&dn->dn_rm_spillblk[0], sizeof (dn->dn_rm_spillblk)); + bzero(&dn->dn_next_bonuslen[0], sizeof (dn->dn_next_bonuslen)); + bzero(&dn->dn_next_blksz[0], sizeof (dn->dn_next_blksz)); + bzero(&dn->dn_next_maxblkid[0], sizeof (dn->dn_next_maxblkid)); + + for (i = 0; i < TXG_SIZE; i++) { + multilist_link_init(&dn->dn_dirty_link[i]); + dn->dn_free_ranges[i] = NULL; + list_create(&dn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + } + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirty_txg = 0; + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + dn->dn_bonus = NULL; + dn->dn_have_spill = B_FALSE; + dn->dn_zio = NULL; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_oldprojid = ZFS_DEFAULT_PROJID; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_newprojid = ZFS_DEFAULT_PROJID; + dn->dn_id_flags = 0; + + dn->dn_dbufs_count = 0; + avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + + dn->dn_moved = 0; + return (0); +} + +/* ARGSUSED */ +static void +dnode_dest(void *arg, void *unused) +{ + int i; + dnode_t *dn = arg; + + rw_destroy(&dn->dn_struct_rwlock); + mutex_destroy(&dn->dn_mtx); + mutex_destroy(&dn->dn_dbufs_mtx); + cv_destroy(&dn->dn_notxholds); + cv_destroy(&dn->dn_nodnholds); + zfs_refcount_destroy(&dn->dn_holds); + zfs_refcount_destroy(&dn->dn_tx_holds); + ASSERT(!list_link_active(&dn->dn_link)); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(dn->dn_free_ranges[i], ==, NULL); + list_destroy(&dn->dn_dirty_records[i]); + ASSERT0(dn->dn_next_nblkptr[i]); + ASSERT0(dn->dn_next_nlevels[i]); + ASSERT0(dn->dn_next_indblkshift[i]); + ASSERT0(dn->dn_next_bonustype[i]); + ASSERT0(dn->dn_rm_spillblk[i]); + ASSERT0(dn->dn_next_bonuslen[i]); + ASSERT0(dn->dn_next_blksz[i]); + ASSERT0(dn->dn_next_maxblkid[i]); + } + + ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_free_txg); + ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirty_txg); + ASSERT0(dn->dn_dirtyctx); + ASSERT3P(dn->dn_dirtyctx_firstset, ==, NULL); + ASSERT3P(dn->dn_bonus, ==, NULL); + ASSERT(!dn->dn_have_spill); + ASSERT3P(dn->dn_zio, ==, NULL); + ASSERT0(dn->dn_oldused); + ASSERT0(dn->dn_oldflags); + ASSERT0(dn->dn_olduid); + ASSERT0(dn->dn_oldgid); + ASSERT0(dn->dn_oldprojid); + ASSERT0(dn->dn_newuid); + ASSERT0(dn->dn_newgid); + ASSERT0(dn->dn_newprojid); + ASSERT0(dn->dn_id_flags); + + ASSERT0(dn->dn_dbufs_count); + avl_destroy(&dn->dn_dbufs); +} + +void +dnode_init(void) +{ + ASSERT(dnode_cache == NULL); + dnode_cache = kmem_cache_create("dnode_t", sizeof (dnode_t), + 0, dnode_cons, dnode_dest, NULL, NULL, NULL, 0); + kmem_cache_set_move(dnode_cache, dnode_move); + + dnode_ksp = kstat_create("zfs", 0, "dnodestats", "misc", + 
+	    KSTAT_TYPE_NAMED, sizeof (dnode_stats) / sizeof (kstat_named_t),
+	    KSTAT_FLAG_VIRTUAL);
+	if (dnode_ksp != NULL) {
+		dnode_ksp->ks_data = &dnode_stats;
+		kstat_install(dnode_ksp);
+	}
+}
+
+void
+dnode_fini(void)
+{
+	if (dnode_ksp != NULL) {
+		kstat_delete(dnode_ksp);
+		dnode_ksp = NULL;
+	}
+
+	kmem_cache_destroy(dnode_cache);
+	dnode_cache = NULL;
+}
+
+
+#ifdef ZFS_DEBUG
+void
+dnode_verify(dnode_t *dn)
+{
+	int drop_struct_lock = FALSE;
+
+	ASSERT(dn->dn_phys);
+	ASSERT(dn->dn_objset);
+	ASSERT(dn->dn_handle->dnh_dnode == dn);
+
+	ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type));
+
+	if (!(zfs_flags & ZFS_DEBUG_DNODE_VERIFY))
+		return;
+
+	if (!RW_WRITE_HELD(&dn->dn_struct_rwlock)) {
+		rw_enter(&dn->dn_struct_rwlock, RW_READER);
+		drop_struct_lock = TRUE;
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE || dn->dn_allocated_txg != 0) {
+		int i;
+		int max_bonuslen = DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots);
+		ASSERT3U(dn->dn_indblkshift, <=, SPA_MAXBLOCKSHIFT);
+		if (dn->dn_datablkshift) {
+			ASSERT3U(dn->dn_datablkshift, >=, SPA_MINBLOCKSHIFT);
+			ASSERT3U(dn->dn_datablkshift, <=, SPA_MAXBLOCKSHIFT);
+			ASSERT3U(1<<dn->dn_datablkshift, ==, dn->dn_datablksz);
+		}
+		ASSERT3U(dn->dn_nlevels, <=, 30);
+		ASSERT(DMU_OT_IS_VALID(dn->dn_type));
+		ASSERT3U(dn->dn_nblkptr, >=, 1);
+		ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR);
+		ASSERT3U(dn->dn_bonuslen, <=, max_bonuslen);
+		ASSERT3U(dn->dn_datablksz, ==,
+		    dn->dn_datablkszsec << SPA_MINBLOCKSHIFT);
+		ASSERT3U(ISP2(dn->dn_datablksz), ==, dn->dn_datablkshift != 0);
+		ASSERT3U((dn->dn_nblkptr - 1) * sizeof (blkptr_t) +
+		    dn->dn_bonuslen, <=, max_bonuslen);
+		for (i = 0; i < TXG_SIZE; i++) {
+			ASSERT3U(dn->dn_next_nlevels[i], <=, dn->dn_nlevels);
+		}
+	}
+	if (dn->dn_phys->dn_type != DMU_OT_NONE)
+		ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels);
+	ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL);
+	if (dn->dn_dbuf != NULL) {
+		ASSERT3P(dn->dn_phys, ==,
+		    (dnode_phys_t *)dn->dn_dbuf->db.db_data +
+		    (dn->dn_object % (dn->dn_dbuf->db.db_size >> DNODE_SHIFT)));
+	}
+	if (drop_struct_lock)
+		rw_exit(&dn->dn_struct_rwlock);
+}
+#endif
+
+void
+dnode_byteswap(dnode_phys_t *dnp)
+{
+	uint64_t *buf64 = (void*)&dnp->dn_blkptr;
+	int i;
+
+	if (dnp->dn_type == DMU_OT_NONE) {
+		bzero(dnp, sizeof (dnode_phys_t));
+		return;
+	}
+
+	dnp->dn_datablkszsec = BSWAP_16(dnp->dn_datablkszsec);
+	dnp->dn_bonuslen = BSWAP_16(dnp->dn_bonuslen);
+	dnp->dn_extra_slots = BSWAP_8(dnp->dn_extra_slots);
+	dnp->dn_maxblkid = BSWAP_64(dnp->dn_maxblkid);
+	dnp->dn_used = BSWAP_64(dnp->dn_used);
+
+	/*
+	 * dn_nblkptr is only one byte, so it's OK to read it in either
+	 * byte order.  We can't read dn_bonuslen.
+	 */
+	ASSERT(dnp->dn_indblkshift <= SPA_MAXBLOCKSHIFT);
+	ASSERT(dnp->dn_nblkptr <= DN_MAX_NBLKPTR);
+	for (i = 0; i < dnp->dn_nblkptr * sizeof (blkptr_t)/8; i++)
+		buf64[i] = BSWAP_64(buf64[i]);
+
+	/*
+	 * OK to check dn_bonuslen for zero, because it won't matter if
+	 * we have the wrong byte order.  This is necessary because the
+	 * dnode dnode (the meta dnode) is smaller than a regular dnode.
+	 */
+	if (dnp->dn_bonuslen != 0) {
+		/*
+		 * Note that the bonus length calculated here may be
+		 * longer than the actual bonus buffer.  This is because
+		 * we always put the bonus buffer after the last block
+		 * pointer (instead of packing it against the end of the
+		 * dnode buffer).
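+		 *
+		 * As a concrete (illustrative) example: with a classic
+		 * 512-byte dnode (dn_extra_slots == 0) and dn_nblkptr == 1,
+		 * off is 0 and len works out to DN_SLOTS_TO_BONUSLEN(1),
+		 * i.e. 320 bytes, so the entire region after the single
+		 * block pointer is byteswapped even when dn_bonuslen is
+		 * smaller than that.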
+		 */
+		int off = (dnp->dn_nblkptr-1) * sizeof (blkptr_t);
+		int slots = dnp->dn_extra_slots + 1;
+		size_t len = DN_SLOTS_TO_BONUSLEN(slots) - off;
+		dmu_object_byteswap_t byteswap;
+		ASSERT(DMU_OT_IS_VALID(dnp->dn_bonustype));
+		byteswap = DMU_OT_BYTESWAP(dnp->dn_bonustype);
+		dmu_ot_byteswap[byteswap].ob_func(dnp->dn_bonus + off, len);
+	}
+
+	/* Swap SPILL block if we have one */
+	if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)
+		byteswap_uint64_array(DN_SPILL_BLKPTR(dnp), sizeof (blkptr_t));
+}
+
+void
+dnode_buf_byteswap(void *vbuf, size_t size)
+{
+	int i = 0;
+
+	ASSERT3U(sizeof (dnode_phys_t), ==, (1<<DNODE_SHIFT));
+	ASSERT((size & (sizeof (dnode_phys_t)-1)) == 0);
+
+	while (i < size) {
+		dnode_phys_t *dnp = (void *)(((char *)vbuf) + i);
+		dnode_byteswap(dnp);
+
+		i += DNODE_MIN_SIZE;
+		if (dnp->dn_type != DMU_OT_NONE)
+			i += dnp->dn_extra_slots * DNODE_MIN_SIZE;
+	}
+}
+
+void
+dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx)
+{
+	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	ASSERT3U(newsize, <=, DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) -
+	    (dn->dn_nblkptr-1) * sizeof (blkptr_t));
+
+	if (newsize < dn->dn_bonuslen) {
+		/* clear any data after the end of the new size */
+		size_t diff = dn->dn_bonuslen - newsize;
+		char *data_end = ((char *)dn->dn_bonus->db.db_data) + newsize;
+		bzero(data_end, diff);
+	}
+
+	dn->dn_bonuslen = newsize;
+	if (newsize == 0)
+		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = DN_ZERO_BONUSLEN;
+	else
+		dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_setbonus_type(dnode_t *dn, dmu_object_type_t newtype, dmu_tx_t *tx)
+{
+	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+	dnode_setdirty(dn, tx);
+	rw_enter(&dn->dn_struct_rwlock, RW_WRITER);
+	dn->dn_bonustype = newtype;
+	dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype;
+	rw_exit(&dn->dn_struct_rwlock);
+}
+
+void
+dnode_rm_spill(dnode_t *dn, dmu_tx_t *tx)
+{
+	ASSERT3U(zfs_refcount_count(&dn->dn_holds), >=, 1);
+	ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock));
+	dnode_setdirty(dn, tx);
+	dn->dn_rm_spillblk[tx->tx_txg & TXG_MASK] = DN_KILL_SPILLBLK;
+	dn->dn_have_spill = B_FALSE;
+}
+
+static void
+dnode_setdblksz(dnode_t *dn, int size)
+{
+	ASSERT0(P2PHASE(size, SPA_MINBLOCKSIZE));
+	ASSERT3U(size, <=, SPA_MAXBLOCKSIZE);
+	ASSERT3U(size, >=, SPA_MINBLOCKSIZE);
+	ASSERT3U(size >> SPA_MINBLOCKSHIFT, <,
+	    1<<(sizeof (dn->dn_phys->dn_datablkszsec) * 8));
+	dn->dn_datablksz = size;
+	dn->dn_datablkszsec = size >> SPA_MINBLOCKSHIFT;
+	dn->dn_datablkshift = ISP2(size) ? highbit64(size - 1) : 0;
+}
+
+static dnode_t *
+dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db,
+    uint64_t object, dnode_handle_t *dnh)
+{
+	dnode_t *dn;
+
+	dn = kmem_cache_alloc(dnode_cache, KM_SLEEP);
+	dn->dn_moved = 0;
+
+	/*
+	 * Defer setting dn_objset until the dnode is ready to be a candidate
+	 * for the dnode_move() callback.
+ */ + dn->dn_object = object; + dn->dn_dbuf = db; + dn->dn_handle = dnh; + dn->dn_phys = dnp; + + if (dnp->dn_datablkszsec) { + dnode_setdblksz(dn, dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + } else { + dn->dn_datablksz = 0; + dn->dn_datablkszsec = 0; + dn->dn_datablkshift = 0; + } + dn->dn_indblkshift = dnp->dn_indblkshift; + dn->dn_nlevels = dnp->dn_nlevels; + dn->dn_type = dnp->dn_type; + dn->dn_nblkptr = dnp->dn_nblkptr; + dn->dn_checksum = dnp->dn_checksum; + dn->dn_compress = dnp->dn_compress; + dn->dn_bonustype = dnp->dn_bonustype; + dn->dn_bonuslen = dnp->dn_bonuslen; + dn->dn_num_slots = dnp->dn_extra_slots + 1; + dn->dn_maxblkid = dnp->dn_maxblkid; + dn->dn_have_spill = ((dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) != 0); + dn->dn_id_flags = 0; + + dmu_zfetch_init(&dn->dn_zfetch, dn); + + ASSERT(DMU_OT_IS_VALID(dn->dn_phys->dn_type)); + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + ASSERT(!DN_SLOT_IS_PTR(dnh->dnh_dnode)); + + mutex_enter(&os->os_lock); + + /* + * Exclude special dnodes from os_dnodes so an empty os_dnodes + * signifies that the special dnodes have no references from + * their children (the entries in os_dnodes). This allows + * dnode_destroy() to easily determine if the last child has + * been removed and then complete eviction of the objset. + */ + if (!DMU_OBJECT_IS_SPECIAL(object)) + list_insert_head(&os->os_dnodes, dn); + membar_producer(); + + /* + * Everything else must be valid before assigning dn_objset + * makes the dnode eligible for dnode_move(). + */ + dn->dn_objset = os; + + dnh->dnh_dnode = dn; + mutex_exit(&os->os_lock); + + arc_space_consume(sizeof (dnode_t), ARC_SPACE_DNODE); + + return (dn); +} + +/* + * Caller must be holding the dnode handle, which is released upon return. + */ +static void +dnode_destroy(dnode_t *dn) +{ + objset_t *os = dn->dn_objset; + boolean_t complete_os_eviction = B_FALSE; + + ASSERT((dn->dn_id_flags & DN_ID_NEW_EXIST) == 0); + + mutex_enter(&os->os_lock); + POINTER_INVALIDATE(&dn->dn_objset); + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + list_remove(&os->os_dnodes, dn); + complete_os_eviction = + list_is_empty(&os->os_dnodes) && + list_link_active(&os->os_evicting_node); + } + mutex_exit(&os->os_lock); + + /* the dnode can no longer move, so we can release the handle */ + if (!zrl_is_locked(&dn->dn_handle->dnh_zrlock)) + zrl_remove(&dn->dn_handle->dnh_zrlock); + + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_assigned_txg = 0; + dn->dn_dirty_txg = 0; + + dn->dn_dirtyctx = 0; + dn->dn_dirtyctx_firstset = NULL; + if (dn->dn_bonus != NULL) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_destroy(dn->dn_bonus); + dn->dn_bonus = NULL; + } + dn->dn_zio = NULL; + + dn->dn_have_spill = B_FALSE; + dn->dn_oldused = 0; + dn->dn_oldflags = 0; + dn->dn_olduid = 0; + dn->dn_oldgid = 0; + dn->dn_oldprojid = ZFS_DEFAULT_PROJID; + dn->dn_newuid = 0; + dn->dn_newgid = 0; + dn->dn_newprojid = ZFS_DEFAULT_PROJID; + dn->dn_id_flags = 0; + + dmu_zfetch_fini(&dn->dn_zfetch); + kmem_cache_free(dnode_cache, dn); + arc_space_return(sizeof (dnode_t), ARC_SPACE_DNODE); + + if (complete_os_eviction) + dmu_objset_evict_done(os); +} + +void +dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, dmu_tx_t *tx) +{ + int i; + + ASSERT3U(dn_slots, >, 0); + ASSERT3U(dn_slots << DNODE_SHIFT, <=, + spa_maxdnodesize(dmu_objset_spa(dn->dn_objset))); + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + if (blocksize == 0) + blocksize = 1 << 
zfs_default_bs; + else + blocksize = P2ROUNDUP(blocksize, SPA_MINBLOCKSIZE); + + if (ibs == 0) + ibs = zfs_default_ibs; + + ibs = MIN(MAX(ibs, DN_MIN_INDBLKSHIFT), DN_MAX_INDBLKSHIFT); + + dprintf("os=%p obj=%llu txg=%llu blocksize=%d ibs=%d dn_slots=%d\n", + dn->dn_objset, dn->dn_object, tx->tx_txg, blocksize, ibs, dn_slots); + DNODE_STAT_BUMP(dnode_allocate); + + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT(bcmp(dn->dn_phys, &dnode_phys_zero, sizeof (dnode_phys_t)) == 0); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE); + ASSERT(ot != DMU_OT_NONE); + ASSERT(DMU_OT_IS_VALID(ot)); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype == DMU_OT_SA && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0)); + ASSERT(DMU_OT_IS_VALID(bonustype)); + ASSERT3U(bonuslen, <=, DN_SLOTS_TO_BONUSLEN(dn_slots)); + ASSERT(dn->dn_type == DMU_OT_NONE); + ASSERT0(dn->dn_maxblkid); + ASSERT0(dn->dn_allocated_txg); + ASSERT0(dn->dn_assigned_txg); + ASSERT0(dn->dn_dirty_txg); + ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds)); + ASSERT3U(zfs_refcount_count(&dn->dn_holds), <=, 1); + ASSERT(avl_is_empty(&dn->dn_dbufs)); + + for (i = 0; i < TXG_SIZE; i++) { + ASSERT0(dn->dn_next_nblkptr[i]); + ASSERT0(dn->dn_next_nlevels[i]); + ASSERT0(dn->dn_next_indblkshift[i]); + ASSERT0(dn->dn_next_bonuslen[i]); + ASSERT0(dn->dn_next_bonustype[i]); + ASSERT0(dn->dn_rm_spillblk[i]); + ASSERT0(dn->dn_next_blksz[i]); + ASSERT0(dn->dn_next_maxblkid[i]); + ASSERT(!multilist_link_active(&dn->dn_dirty_link[i])); + ASSERT3P(list_head(&dn->dn_dirty_records[i]), ==, NULL); + ASSERT3P(dn->dn_free_ranges[i], ==, NULL); + } + + dn->dn_type = ot; + dnode_setdblksz(dn, blocksize); + dn->dn_indblkshift = ibs; + dn->dn_nlevels = 1; + dn->dn_num_slots = dn_slots; + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + dn->dn_nblkptr = 1; + else { + dn->dn_nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); + } + + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + dn->dn_dirtyctx = 0; + + dn->dn_free_txg = 0; + dn->dn_dirtyctx_firstset = NULL; + + dn->dn_allocated_txg = tx->tx_txg; + dn->dn_id_flags = 0; + + dnode_setdirty(dn, tx); + dn->dn_next_indblkshift[tx->tx_txg & TXG_MASK] = ibs; + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = dn->dn_bonuslen; + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = dn->dn_bonustype; + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = dn->dn_datablksz; +} + +void +dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, + dmu_object_type_t bonustype, int bonuslen, int dn_slots, + boolean_t keep_spill, dmu_tx_t *tx) +{ + int nblkptr; + + ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); + ASSERT3U(blocksize, <=, + spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + ASSERT0(blocksize % SPA_MINBLOCKSIZE); + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); + ASSERT(tx->tx_txg != 0); + ASSERT((bonustype == DMU_OT_NONE && bonuslen == 0) || + (bonustype != DMU_OT_NONE && bonuslen != 0) || + (bonustype == DMU_OT_SA && bonuslen == 0)); + ASSERT(DMU_OT_IS_VALID(bonustype)); + ASSERT3U(bonuslen, <=, + DN_BONUS_SIZE(spa_maxdnodesize(dmu_objset_spa(dn->dn_objset)))); + ASSERT3U(bonuslen, <=, DN_BONUS_SIZE(dn_slots << DNODE_SHIFT)); + + dnode_free_interior_slots(dn); + DNODE_STAT_BUMP(dnode_reallocate); + + /* clean up any unreferenced dbufs */ + dnode_evict_dbufs(dn); + + dn->dn_id_flags = 0; + + rw_enter(&dn->dn_struct_rwlock, 
RW_WRITER); + dnode_setdirty(dn, tx); + if (dn->dn_datablksz != blocksize) { + /* change blocksize */ + ASSERT0(dn->dn_maxblkid); + ASSERT(BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + dnode_block_freed(dn, 0)); + + dnode_setdblksz(dn, blocksize); + dn->dn_next_blksz[tx->tx_txg & TXG_MASK] = blocksize; + } + if (dn->dn_bonuslen != bonuslen) + dn->dn_next_bonuslen[tx->tx_txg & TXG_MASK] = bonuslen; + + if (bonustype == DMU_OT_SA) /* Maximize bonus space for SA */ + nblkptr = 1; + else + nblkptr = MIN(DN_MAX_NBLKPTR, + 1 + ((DN_SLOTS_TO_BONUSLEN(dn_slots) - bonuslen) >> + SPA_BLKPTRSHIFT)); + if (dn->dn_bonustype != bonustype) + dn->dn_next_bonustype[tx->tx_txg & TXG_MASK] = bonustype; + if (dn->dn_nblkptr != nblkptr) + dn->dn_next_nblkptr[tx->tx_txg & TXG_MASK] = nblkptr; + if (dn->dn_phys->dn_flags & DNODE_FLAG_SPILL_BLKPTR && !keep_spill) { + dbuf_rm_spill(dn, tx); + dnode_rm_spill(dn, tx); + } + + rw_exit(&dn->dn_struct_rwlock); + + /* change type */ + dn->dn_type = ot; + + /* change bonus size and type */ + mutex_enter(&dn->dn_mtx); + dn->dn_bonustype = bonustype; + dn->dn_bonuslen = bonuslen; + dn->dn_num_slots = dn_slots; + dn->dn_nblkptr = nblkptr; + dn->dn_checksum = ZIO_CHECKSUM_INHERIT; + dn->dn_compress = ZIO_COMPRESS_INHERIT; + ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); + + /* fix up the bonus db_size */ + if (dn->dn_bonus) { + dn->dn_bonus->db.db_size = + DN_SLOTS_TO_BONUSLEN(dn->dn_num_slots) - + (dn->dn_nblkptr-1) * sizeof (blkptr_t); + ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); + } + + dn->dn_allocated_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); +} + +#ifdef _KERNEL +static void +dnode_move_impl(dnode_t *odn, dnode_t *ndn) +{ + int i; + + ASSERT(!RW_LOCK_HELD(&odn->dn_struct_rwlock)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_mtx)); + ASSERT(MUTEX_NOT_HELD(&odn->dn_dbufs_mtx)); + ASSERT(!MUTEX_HELD(&odn->dn_zfetch.zf_lock)); + + /* Copy fields. 
*/ + ndn->dn_objset = odn->dn_objset; + ndn->dn_object = odn->dn_object; + ndn->dn_dbuf = odn->dn_dbuf; + ndn->dn_handle = odn->dn_handle; + ndn->dn_phys = odn->dn_phys; + ndn->dn_type = odn->dn_type; + ndn->dn_bonuslen = odn->dn_bonuslen; + ndn->dn_bonustype = odn->dn_bonustype; + ndn->dn_nblkptr = odn->dn_nblkptr; + ndn->dn_checksum = odn->dn_checksum; + ndn->dn_compress = odn->dn_compress; + ndn->dn_nlevels = odn->dn_nlevels; + ndn->dn_indblkshift = odn->dn_indblkshift; + ndn->dn_datablkshift = odn->dn_datablkshift; + ndn->dn_datablkszsec = odn->dn_datablkszsec; + ndn->dn_datablksz = odn->dn_datablksz; + ndn->dn_maxblkid = odn->dn_maxblkid; + ndn->dn_num_slots = odn->dn_num_slots; + bcopy(&odn->dn_next_type[0], &ndn->dn_next_type[0], + sizeof (odn->dn_next_type)); + bcopy(&odn->dn_next_nblkptr[0], &ndn->dn_next_nblkptr[0], + sizeof (odn->dn_next_nblkptr)); + bcopy(&odn->dn_next_nlevels[0], &ndn->dn_next_nlevels[0], + sizeof (odn->dn_next_nlevels)); + bcopy(&odn->dn_next_indblkshift[0], &ndn->dn_next_indblkshift[0], + sizeof (odn->dn_next_indblkshift)); + bcopy(&odn->dn_next_bonustype[0], &ndn->dn_next_bonustype[0], + sizeof (odn->dn_next_bonustype)); + bcopy(&odn->dn_rm_spillblk[0], &ndn->dn_rm_spillblk[0], + sizeof (odn->dn_rm_spillblk)); + bcopy(&odn->dn_next_bonuslen[0], &ndn->dn_next_bonuslen[0], + sizeof (odn->dn_next_bonuslen)); + bcopy(&odn->dn_next_blksz[0], &ndn->dn_next_blksz[0], + sizeof (odn->dn_next_blksz)); + bcopy(&odn->dn_next_maxblkid[0], &ndn->dn_next_maxblkid[0], + sizeof (odn->dn_next_maxblkid)); + for (i = 0; i < TXG_SIZE; i++) { + list_move_tail(&ndn->dn_dirty_records[i], + &odn->dn_dirty_records[i]); + } + bcopy(&odn->dn_free_ranges[0], &ndn->dn_free_ranges[0], + sizeof (odn->dn_free_ranges)); + ndn->dn_allocated_txg = odn->dn_allocated_txg; + ndn->dn_free_txg = odn->dn_free_txg; + ndn->dn_assigned_txg = odn->dn_assigned_txg; + ndn->dn_dirty_txg = odn->dn_dirty_txg; + ndn->dn_dirtyctx = odn->dn_dirtyctx; + ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset; + ASSERT(zfs_refcount_count(&odn->dn_tx_holds) == 0); + zfs_refcount_transfer(&ndn->dn_holds, &odn->dn_holds); + ASSERT(avl_is_empty(&ndn->dn_dbufs)); + avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs); + ndn->dn_dbufs_count = odn->dn_dbufs_count; + ndn->dn_bonus = odn->dn_bonus; + ndn->dn_have_spill = odn->dn_have_spill; + ndn->dn_zio = odn->dn_zio; + ndn->dn_oldused = odn->dn_oldused; + ndn->dn_oldflags = odn->dn_oldflags; + ndn->dn_olduid = odn->dn_olduid; + ndn->dn_oldgid = odn->dn_oldgid; + ndn->dn_oldprojid = odn->dn_oldprojid; + ndn->dn_newuid = odn->dn_newuid; + ndn->dn_newgid = odn->dn_newgid; + ndn->dn_newprojid = odn->dn_newprojid; + ndn->dn_id_flags = odn->dn_id_flags; + dmu_zfetch_init(&ndn->dn_zfetch, NULL); + list_move_tail(&ndn->dn_zfetch.zf_stream, &odn->dn_zfetch.zf_stream); + ndn->dn_zfetch.zf_dnode = odn->dn_zfetch.zf_dnode; + + /* + * Update back pointers. Updating the handle fixes the back pointer of + * every descendant dbuf as well as the bonus dbuf. + */ + ASSERT(ndn->dn_handle->dnh_dnode == odn); + ndn->dn_handle->dnh_dnode = ndn; + if (ndn->dn_zfetch.zf_dnode == odn) { + ndn->dn_zfetch.zf_dnode = ndn; + } + + /* + * Invalidate the original dnode by clearing all of its back pointers. 
+ */ + odn->dn_dbuf = NULL; + odn->dn_handle = NULL; + avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t), + offsetof(dmu_buf_impl_t, db_link)); + odn->dn_dbufs_count = 0; + odn->dn_bonus = NULL; + dmu_zfetch_fini(&odn->dn_zfetch); + + /* + * Set the low bit of the objset pointer to ensure that dnode_move() + * recognizes the dnode as invalid in any subsequent callback. + */ + POINTER_INVALIDATE(&odn->dn_objset); + + /* + * Satisfy the destructor. + */ + for (i = 0; i < TXG_SIZE; i++) { + list_create(&odn->dn_dirty_records[i], + sizeof (dbuf_dirty_record_t), + offsetof(dbuf_dirty_record_t, dr_dirty_node)); + odn->dn_free_ranges[i] = NULL; + odn->dn_next_nlevels[i] = 0; + odn->dn_next_indblkshift[i] = 0; + odn->dn_next_bonustype[i] = 0; + odn->dn_rm_spillblk[i] = 0; + odn->dn_next_bonuslen[i] = 0; + odn->dn_next_blksz[i] = 0; + } + odn->dn_allocated_txg = 0; + odn->dn_free_txg = 0; + odn->dn_assigned_txg = 0; + odn->dn_dirty_txg = 0; + odn->dn_dirtyctx = 0; + odn->dn_dirtyctx_firstset = NULL; + odn->dn_have_spill = B_FALSE; + odn->dn_zio = NULL; + odn->dn_oldused = 0; + odn->dn_oldflags = 0; + odn->dn_olduid = 0; + odn->dn_oldgid = 0; + odn->dn_oldprojid = ZFS_DEFAULT_PROJID; + odn->dn_newuid = 0; + odn->dn_newgid = 0; + odn->dn_newprojid = ZFS_DEFAULT_PROJID; + odn->dn_id_flags = 0; + + /* + * Mark the dnode. + */ + ndn->dn_moved = 1; + odn->dn_moved = (uint8_t)-1; +} + +/*ARGSUSED*/ +static kmem_cbrc_t +dnode_move(void *buf, void *newbuf, size_t size, void *arg) +{ + dnode_t *odn = buf, *ndn = newbuf; + objset_t *os; + int64_t refcount; + uint32_t dbufs; + + /* + * The dnode is on the objset's list of known dnodes if the objset + * pointer is valid. We set the low bit of the objset pointer when + * freeing the dnode to invalidate it, and the memory patterns written + * by kmem (baddcafe and deadbeef) set at least one of the two low bits. + * A newly created dnode sets the objset pointer last of all to indicate + * that the dnode is known and in a valid state to be moved by this + * function. + */ + os = odn->dn_objset; + if (!POINTER_IS_VALID(os)) { + DNODE_STAT_BUMP(dnode_move_invalid); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * Ensure that the objset does not go away during the move. + */ + rw_enter(&os_lock, RW_WRITER); + if (os != odn->dn_objset) { + rw_exit(&os_lock); + DNODE_STAT_BUMP(dnode_move_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the dnode is still valid, then so is the objset. We know that no + * valid objset can be freed while we hold os_lock, so we can safely + * ensure that the objset remains in use. + */ + mutex_enter(&os->os_lock); + + /* + * Recheck the objset pointer in case the dnode was removed just before + * acquiring the lock. + */ + if (os != odn->dn_objset) { + mutex_exit(&os->os_lock); + rw_exit(&os_lock); + DNODE_STAT_BUMP(dnode_move_recheck2); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * At this point we know that as long as we hold os->os_lock, the dnode + * cannot be freed and fields within the dnode can be safely accessed. + * The objset listing this dnode cannot go away as long as this dnode is + * on its list. + */ + rw_exit(&os_lock); + if (DMU_OBJECT_IS_SPECIAL(odn->dn_object)) { + mutex_exit(&os->os_lock); + DNODE_STAT_BUMP(dnode_move_special); + return (KMEM_CBRC_NO); + } + ASSERT(odn->dn_dbuf != NULL); /* only "special" dnodes have no parent */ + + /* + * Lock the dnode handle to prevent the dnode from obtaining any new + * holds. 
This also prevents the descendant dbufs and the bonus dbuf + * from accessing the dnode, so that we can discount their holds. The + * handle is safe to access because we know that while the dnode cannot + * go away, neither can its handle. Once we hold dnh_zrlock, we can + * safely move any dnode referenced only by dbufs. + */ + if (!zrl_tryenter(&odn->dn_handle->dnh_zrlock)) { + mutex_exit(&os->os_lock); + DNODE_STAT_BUMP(dnode_move_handle); + return (KMEM_CBRC_LATER); + } + + /* + * Ensure a consistent view of the dnode's holds and the dnode's dbufs. + * We need to guarantee that there is a hold for every dbuf in order to + * determine whether the dnode is actively referenced. Falsely matching + * a dbuf to an active hold would lead to an unsafe move. It's possible + * that a thread already having an active dnode hold is about to add a + * dbuf, and we can't compare hold and dbuf counts while the add is in + * progress. + */ + if (!rw_tryenter(&odn->dn_struct_rwlock, RW_WRITER)) { + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_BUMP(dnode_move_rwlock); + return (KMEM_CBRC_LATER); + } + + /* + * A dbuf may be removed (evicted) without an active dnode hold. In that + * case, the dbuf count is decremented under the handle lock before the + * dbuf's hold is released. This order ensures that if we count the hold + * after the dbuf is removed but before its hold is released, we will + * treat the unmatched hold as active and exit safely. If we count the + * hold before the dbuf is removed, the hold is discounted, and the + * removal is blocked until the move completes. + */ + refcount = zfs_refcount_count(&odn->dn_holds); + ASSERT(refcount >= 0); + dbufs = DN_DBUFS_COUNT(odn); + + /* We can't have more dbufs than dnode holds. */ + ASSERT3U(dbufs, <=, refcount); + DTRACE_PROBE3(dnode__move, dnode_t *, odn, int64_t, refcount, + uint32_t, dbufs); + + if (refcount > dbufs) { + rw_exit(&odn->dn_struct_rwlock); + zrl_exit(&odn->dn_handle->dnh_zrlock); + mutex_exit(&os->os_lock); + DNODE_STAT_BUMP(dnode_move_active); + return (KMEM_CBRC_LATER); + } + + rw_exit(&odn->dn_struct_rwlock); + + /* + * At this point we know that anyone with a hold on the dnode is not + * actively referencing it. The dnode is known and in a valid state to + * move. We're holding the locks needed to execute the critical section. + */ + dnode_move_impl(odn, ndn); + + list_link_replace(&odn->dn_link, &ndn->dn_link); + /* If the dnode was safe to move, the refcount cannot have changed. 
*/ + ASSERT(refcount == zfs_refcount_count(&ndn->dn_holds)); + ASSERT(dbufs == DN_DBUFS_COUNT(ndn)); + zrl_exit(&ndn->dn_handle->dnh_zrlock); /* handle has moved */ + mutex_exit(&os->os_lock); + + return (KMEM_CBRC_YES); +} +#endif /* _KERNEL */ + +static void +dnode_slots_hold(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + zrl_add(&dnh->dnh_zrlock); + } +} + +static void +dnode_slots_rele(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (zrl_is_locked(&dnh->dnh_zrlock)) + zrl_exit(&dnh->dnh_zrlock); + else + zrl_remove(&dnh->dnh_zrlock); + } +} + +static int +dnode_slots_tryenter(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + if (!zrl_tryenter(&dnh->dnh_zrlock)) { + for (int j = idx; j < i; j++) { + dnh = &children->dnc_children[j]; + zrl_exit(&dnh->dnh_zrlock); + } + + return (0); + } + } + + return (1); +} + +static void +dnode_set_slots(dnode_children_t *children, int idx, int slots, void *ptr) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnh->dnh_dnode = ptr; + } +} + +static boolean_t +dnode_check_slots_free(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + /* + * If all dnode slots are either already free or + * evictable return B_TRUE. + */ + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + dnode_t *dn = dnh->dnh_dnode; + + if (dn == DN_SLOT_FREE) { + continue; + } else if (DN_SLOT_IS_PTR(dn)) { + mutex_enter(&dn->dn_mtx); + boolean_t can_free = (dn->dn_type == DMU_OT_NONE && + zfs_refcount_is_zero(&dn->dn_holds) && + !DNODE_IS_DIRTY(dn)); + mutex_exit(&dn->dn_mtx); + + if (!can_free) + return (B_FALSE); + else + continue; + } else { + return (B_FALSE); + } + } + + return (B_TRUE); +} + +static void +dnode_reclaim_slots(dnode_children_t *children, int idx, int slots) +{ + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + for (int i = idx; i < idx + slots; i++) { + dnode_handle_t *dnh = &children->dnc_children[i]; + + ASSERT(zrl_is_locked(&dnh->dnh_zrlock)); + + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + ASSERT3S(dnh->dnh_dnode->dn_type, ==, DMU_OT_NONE); + dnode_destroy(dnh->dnh_dnode); + dnh->dnh_dnode = DN_SLOT_FREE; + } + } +} + +void +dnode_free_interior_slots(dnode_t *dn) +{ + dnode_children_t *children = dmu_buf_get_user(&dn->dn_dbuf->db); + int epb = dn->dn_dbuf->db.db_size >> DNODE_SHIFT; + int idx = (dn->dn_object & (epb - 1)) + 1; + int slots = dn->dn_num_slots - 1; + + if (slots == 0) + return; + + ASSERT3S(idx + slots, <=, DNODES_PER_BLOCK); + + while (!dnode_slots_tryenter(children, idx, slots)) { + DNODE_STAT_BUMP(dnode_free_interior_lock_retry); + cond_resched(); + } + + dnode_set_slots(children, idx, slots, DN_SLOT_FREE); + dnode_slots_rele(children, idx, slots); +} + +void +dnode_special_close(dnode_handle_t *dnh) +{ + dnode_t *dn = dnh->dnh_dnode; + + /* + * Ensure dnode_rele_and_unlock() has released dn_mtx, after final + * zfs_refcount_remove() + */ + mutex_enter(&dn->dn_mtx); + if (zfs_refcount_count(&dn->dn_holds) 
> 0)
+		cv_wait(&dn->dn_nodnholds, &dn->dn_mtx);
+	mutex_exit(&dn->dn_mtx);
+	ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 0);
+
+	ASSERT(dn->dn_dbuf == NULL ||
+	    dmu_buf_get_user(&dn->dn_dbuf->db) == NULL);
+	zrl_add(&dnh->dnh_zrlock);
+	dnode_destroy(dn);	/* implicit zrl_remove() */
+	zrl_destroy(&dnh->dnh_zrlock);
+	dnh->dnh_dnode = NULL;
+}
+
+void
+dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object,
+    dnode_handle_t *dnh)
+{
+	dnode_t *dn;
+
+	zrl_init(&dnh->dnh_zrlock);
+	zrl_tryenter(&dnh->dnh_zrlock);
+
+	dn = dnode_create(os, dnp, NULL, object, dnh);
+	DNODE_VERIFY(dn);
+
+	zrl_exit(&dnh->dnh_zrlock);
+}
+
+static void
+dnode_buf_evict_async(void *dbu)
+{
+	dnode_children_t *dnc = dbu;
+
+	DNODE_STAT_BUMP(dnode_buf_evict);
+
+	for (int i = 0; i < dnc->dnc_count; i++) {
+		dnode_handle_t *dnh = &dnc->dnc_children[i];
+		dnode_t *dn;
+
+		/*
+		 * The dnode handle lock guards against the dnode moving to
+		 * another valid address, so there is no need here to guard
+		 * against changes to or from NULL.
+		 */
+		if (!DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+			zrl_destroy(&dnh->dnh_zrlock);
+			dnh->dnh_dnode = DN_SLOT_UNINIT;
+			continue;
+		}
+
+		zrl_add(&dnh->dnh_zrlock);
+		dn = dnh->dnh_dnode;
+		/*
+		 * If there are holds on this dnode, then there should
+		 * be holds on the dnode's containing dbuf as well; thus
+		 * it wouldn't be eligible for eviction and this function
+		 * would not have been called.
+		 */
+		ASSERT(zfs_refcount_is_zero(&dn->dn_holds));
+		ASSERT(zfs_refcount_is_zero(&dn->dn_tx_holds));
+
+		dnode_destroy(dn);	/* implicit zrl_remove() for first slot */
+		zrl_destroy(&dnh->dnh_zrlock);
+		dnh->dnh_dnode = DN_SLOT_UNINIT;
+	}
+	kmem_free(dnc, sizeof (dnode_children_t) +
+	    dnc->dnc_count * sizeof (dnode_handle_t));
+}
+
+/*
+ * When the DNODE_MUST_BE_FREE flag is set, the "slots" parameter is used
+ * to ensure the hole at the specified object offset is large enough to
+ * hold the dnode being created.  The slots parameter is also used to ensure
+ * a dnode does not span multiple dnode blocks.  In both of these cases, if
+ * a failure occurs, ENOSPC is returned.  Keep in mind, these failure cases
+ * are only possible when using DNODE_MUST_BE_FREE.
+ *
+ * If the DNODE_MUST_BE_ALLOCATED flag is set, "slots" must be 0.
+ * dnode_hold_impl() will check if the requested dnode is already consumed
+ * as an extra dnode slot by a large dnode, in which case it returns
+ * ENOENT.
+ *
+ * If the DNODE_DRY_RUN flag is set, we don't actually hold the dnode, just
+ * return whether the hold would succeed or not.  tag and dnp should be set
+ * to NULL in this case.
+ *
+ * errors:
+ * EINVAL - Invalid object number or flags.
+ * ENOSPC - Hole too small to fulfill "slots" request (DNODE_MUST_BE_FREE)
+ * EEXIST - Refers to an allocated dnode (DNODE_MUST_BE_FREE)
+ *        - Refers to a freeing dnode (DNODE_MUST_BE_FREE)
+ *        - Refers to an interior dnode slot (DNODE_MUST_BE_ALLOCATED)
+ * ENOENT - The requested dnode is not allocated (DNODE_MUST_BE_ALLOCATED)
+ *        - The requested dnode is being freed (DNODE_MUST_BE_ALLOCATED)
+ * EIO    - I/O error when reading the meta dnode dbuf.
+ *
+ * Note that with DNODE_MUST_BE_FREE the hold succeeds even for free dnodes.
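+ *
+ * For example (an illustrative sketch, mirroring what dnode_try_claim()
+ * further below does), probing whether a 2-slot dnode could be created
+ * at 'object' without actually taking a hold:
+ *
+ *	error = dnode_hold_impl(os, object,
+ *	    DNODE_MUST_BE_FREE | DNODE_DRY_RUN, 2, NULL, NULL);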
+ */ +int +dnode_hold_impl(objset_t *os, uint64_t object, int flag, int slots, + void *tag, dnode_t **dnp) +{ + int epb, idx, err; + int drop_struct_lock = FALSE; + int type; + uint64_t blk; + dnode_t *mdn, *dn; + dmu_buf_impl_t *db; + dnode_children_t *dnc; + dnode_phys_t *dn_block; + dnode_handle_t *dnh; + + ASSERT(!(flag & DNODE_MUST_BE_ALLOCATED) || (slots == 0)); + ASSERT(!(flag & DNODE_MUST_BE_FREE) || (slots > 0)); + IMPLY(flag & DNODE_DRY_RUN, (tag == NULL) && (dnp == NULL)); + + /* + * If you are holding the spa config lock as writer, you shouldn't + * be asking the DMU to do *anything* unless it's the root pool + * which may require us to read from the root filesystem while + * holding some (not all) of the locks as writer. + */ + ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0 || + (spa_is_root(os->os_spa) && + spa_config_held(os->os_spa, SCL_STATE, RW_WRITER))); + + ASSERT((flag & DNODE_MUST_BE_ALLOCATED) || (flag & DNODE_MUST_BE_FREE)); + + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT || + object == DMU_PROJECTUSED_OBJECT) { + if (object == DMU_USERUSED_OBJECT) + dn = DMU_USERUSED_DNODE(os); + else if (object == DMU_GROUPUSED_OBJECT) + dn = DMU_GROUPUSED_DNODE(os); + else + dn = DMU_PROJECTUSED_DNODE(os); + if (dn == NULL) + return (SET_ERROR(ENOENT)); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (SET_ERROR(ENOENT)); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (SET_ERROR(EEXIST)); + DNODE_VERIFY(dn); + /* Don't actually hold if dry run, just return 0 */ + if (!(flag & DNODE_DRY_RUN)) { + (void) zfs_refcount_add(&dn->dn_holds, tag); + *dnp = dn; + } + return (0); + } + + if (object == 0 || object >= DN_MAX_OBJECT) + return (SET_ERROR(EINVAL)); + + mdn = DMU_META_DNODE(os); + ASSERT(mdn->dn_object == DMU_META_DNODE_OBJECT); + + DNODE_VERIFY(mdn); + + if (!RW_WRITE_HELD(&mdn->dn_struct_rwlock)) { + rw_enter(&mdn->dn_struct_rwlock, RW_READER); + drop_struct_lock = TRUE; + } + + blk = dbuf_whichblock(mdn, 0, object * sizeof (dnode_phys_t)); + db = dbuf_hold(mdn, blk, FTAG); + if (drop_struct_lock) + rw_exit(&mdn->dn_struct_rwlock); + if (db == NULL) { + DNODE_STAT_BUMP(dnode_hold_dbuf_hold); + return (SET_ERROR(EIO)); + } + + /* + * We do not need to decrypt to read the dnode so it doesn't matter + * if we get the encrypted or decrypted version. 
+	 */
+	err = dbuf_read(db, NULL, DB_RF_CANFAIL | DB_RF_NO_DECRYPT);
+	if (err) {
+		DNODE_STAT_BUMP(dnode_hold_dbuf_read);
+		dbuf_rele(db, FTAG);
+		return (err);
+	}
+
+	ASSERT3U(db->db.db_size, >=, 1<<DNODE_SHIFT);
+
+	epb = db->db.db_size >> DNODE_SHIFT;
+
+	idx = object & (epb - 1);
+	dn_block = (dnode_phys_t *)db->db.db_data;
+
+	ASSERT(DB_DNODE(db)->dn_type == DMU_OT_DNODE);
+	dnc = dmu_buf_get_user(&db->db);
+	dnh = NULL;
+	if (dnc == NULL) {
+		dnode_children_t *winner;
+		int skip = 0;
+
+		dnc = kmem_zalloc(sizeof (dnode_children_t) +
+		    epb * sizeof (dnode_handle_t), KM_SLEEP);
+		dnc->dnc_count = epb;
+		dnh = &dnc->dnc_children[0];
+
+		/* Initialize dnode slot status from dnode_phys_t */
+		for (int i = 0; i < epb; i++) {
+			zrl_init(&dnh[i].dnh_zrlock);
+
+			if (skip) {
+				skip--;
+				continue;
+			}
+
+			if (dn_block[i].dn_type != DMU_OT_NONE) {
+				int interior = dn_block[i].dn_extra_slots;
+
+				dnode_set_slots(dnc, i, 1, DN_SLOT_ALLOCATED);
+				dnode_set_slots(dnc, i + 1, interior,
+				    DN_SLOT_INTERIOR);
+				skip = interior;
+			} else {
+				dnh[i].dnh_dnode = DN_SLOT_FREE;
+				skip = 0;
+			}
+		}
+
+		dmu_buf_init_user(&dnc->dnc_dbu, NULL,
+		    dnode_buf_evict_async, NULL);
+		winner = dmu_buf_set_user(&db->db, &dnc->dnc_dbu);
+		if (winner != NULL) {
+
+			for (int i = 0; i < epb; i++)
+				zrl_destroy(&dnh[i].dnh_zrlock);
+
+			kmem_free(dnc, sizeof (dnode_children_t) +
+			    epb * sizeof (dnode_handle_t));
+			dnc = winner;
+		}
+	}
+
+	ASSERT(dnc->dnc_count == epb);
+
+	if (flag & DNODE_MUST_BE_ALLOCATED) {
+		slots = 1;
+
+		dnode_slots_hold(dnc, idx, slots);
+		dnh = &dnc->dnc_children[idx];
+
+		if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) {
+			dn = dnh->dnh_dnode;
+		} else if (dnh->dnh_dnode == DN_SLOT_INTERIOR) {
+			DNODE_STAT_BUMP(dnode_hold_alloc_interior);
+			dnode_slots_rele(dnc, idx, slots);
+			dbuf_rele(db, FTAG);
+			return (SET_ERROR(EEXIST));
+		} else if (dnh->dnh_dnode != DN_SLOT_ALLOCATED) {
+			DNODE_STAT_BUMP(dnode_hold_alloc_misses);
+			dnode_slots_rele(dnc, idx, slots);
+			dbuf_rele(db, FTAG);
+			return (SET_ERROR(ENOENT));
+		} else {
+			dnode_slots_rele(dnc, idx, slots);
+			while (!dnode_slots_tryenter(dnc, idx, slots)) {
+				DNODE_STAT_BUMP(dnode_hold_alloc_lock_retry);
+				cond_resched();
+			}
+
+			/*
+			 * Someone else won the race and called dnode_create()
+			 * after we checked DN_SLOT_IS_PTR() above but before
+			 * we acquired the lock.
+ */ + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + DNODE_STAT_BUMP(dnode_hold_alloc_lock_misses); + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg != 0) { + DNODE_STAT_BUMP(dnode_hold_alloc_type_none); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOENT)); + } + + /* Don't actually hold if dry run, just return 0 */ + if (flag & DNODE_DRY_RUN) { + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (0); + } + + DNODE_STAT_BUMP(dnode_hold_alloc_hits); + } else if (flag & DNODE_MUST_BE_FREE) { + + if (idx + slots - 1 >= DNODES_PER_BLOCK) { + DNODE_STAT_BUMP(dnode_hold_free_overflow); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dnode_slots_hold(dnc, idx, slots); + + if (!dnode_check_slots_free(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dnode_slots_rele(dnc, idx, slots); + while (!dnode_slots_tryenter(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_retry); + cond_resched(); + } + + if (!dnode_check_slots_free(dnc, idx, slots)) { + DNODE_STAT_BUMP(dnode_hold_free_lock_misses); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(ENOSPC)); + } + + /* + * Allocated but otherwise free dnodes which would + * be in the interior of a multi-slot dnode need + * to be freed. Single-slot dnodes can be safely + * re-purposed as a performance optimization. + */ + if (slots > 1) + dnode_reclaim_slots(dnc, idx + 1, slots - 1); + + dnh = &dnc->dnc_children[idx]; + if (DN_SLOT_IS_PTR(dnh->dnh_dnode)) { + dn = dnh->dnh_dnode; + } else { + dn = dnode_create(os, dn_block + idx, db, + object, dnh); + } + + mutex_enter(&dn->dn_mtx); + if (!zfs_refcount_is_zero(&dn->dn_holds) || dn->dn_free_txg) { + DNODE_STAT_BUMP(dnode_hold_free_refcount); + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (SET_ERROR(EEXIST)); + } + + /* Don't actually hold if dry run, just return 0 */ + if (flag & DNODE_DRY_RUN) { + mutex_exit(&dn->dn_mtx); + dnode_slots_rele(dnc, idx, slots); + dbuf_rele(db, FTAG); + return (0); + } + + dnode_set_slots(dnc, idx + 1, slots - 1, DN_SLOT_INTERIOR); + DNODE_STAT_BUMP(dnode_hold_free_hits); + } else { + dbuf_rele(db, FTAG); + return (SET_ERROR(EINVAL)); + } + + ASSERT0(dn->dn_free_txg); + + if (zfs_refcount_add(&dn->dn_holds, tag) == 1) + dbuf_add_ref(db, dnh); + + mutex_exit(&dn->dn_mtx); + + /* Now we can rely on the hold to prevent the dnode from moving. */ + dnode_slots_rele(dnc, idx, slots); + + DNODE_VERIFY(dn); + ASSERT3P(dnp, !=, NULL); + ASSERT3P(dn->dn_dbuf, ==, db); + ASSERT3U(dn->dn_object, ==, object); + dbuf_rele(db, FTAG); + + *dnp = dn; + return (0); +} + +/* + * Return a held dnode in *dnp if the object is allocated, or an error if not. + */ +int +dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, 0, tag, + dnp)); +} + +/* + * Can only add a reference if there is already at least one + * reference on the dnode. Returns FALSE if unable to add a + * new reference.
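+ * + * Minimal usage sketch (editorial illustration, assuming the + * caller already has some other hold on dn): + * + * if (dnode_add_ref(dn, FTAG)) { + * ... dn cannot go away here ... + * dnode_rele(dn, FTAG); + * }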
+ */ +boolean_t +dnode_add_ref(dnode_t *dn, void *tag) +{ + mutex_enter(&dn->dn_mtx); + if (zfs_refcount_is_zero(&dn->dn_holds)) { + mutex_exit(&dn->dn_mtx); + return (FALSE); + } + VERIFY(1 < zfs_refcount_add(&dn->dn_holds, tag)); + mutex_exit(&dn->dn_mtx); + return (TRUE); +} + +void +dnode_rele(dnode_t *dn, void *tag) +{ + mutex_enter(&dn->dn_mtx); + dnode_rele_and_unlock(dn, tag, B_FALSE); +} + +void +dnode_rele_and_unlock(dnode_t *dn, void *tag, boolean_t evicting) +{ + uint64_t refs; + /* Get while the hold prevents the dnode from moving. */ + dmu_buf_impl_t *db = dn->dn_dbuf; + dnode_handle_t *dnh = dn->dn_handle; + + refs = zfs_refcount_remove(&dn->dn_holds, tag); + if (refs == 0) + cv_broadcast(&dn->dn_nodnholds); + mutex_exit(&dn->dn_mtx); + /* dnode could get destroyed at this point, so don't use it anymore */ + + /* + * It's unsafe to release the last hold on a dnode by dnode_rele() or + * indirectly by dbuf_rele() while relying on the dnode handle to + * prevent the dnode from moving, since releasing the last hold could + * result in the dnode's parent dbuf evicting its dnode handles. For + * that reason anyone calling dnode_rele() or dbuf_rele() without some + * other direct or indirect hold on the dnode must first drop the dnode + * handle. + */ + ASSERT(refs > 0 || dnh->dnh_zrlock.zr_owner != curthread); + + /* NOTE: the DNODE_DNODE does not have a dn_dbuf */ + if (refs == 0 && db != NULL) { + /* + * Another thread could add a hold to the dnode handle in + * dnode_hold_impl() while holding the parent dbuf. Since the + * hold on the parent dbuf prevents the handle from being + * destroyed, the hold on the handle is OK. We can't yet assert + * that the handle has zero references, but that will be + * asserted anyway when the handle gets destroyed. + */ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, dnh, evicting); + } +} + +/* + * Test whether we can create a dnode at the specified location. + */ +int +dnode_try_claim(objset_t *os, uint64_t object, int slots) +{ + return (dnode_hold_impl(os, object, DNODE_MUST_BE_FREE | DNODE_DRY_RUN, + slots, NULL, NULL)); +} + +void +dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + uint64_t txg = tx->tx_txg; + + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); + return; + } + + DNODE_VERIFY(dn); + +#ifdef ZFS_DEBUG + mutex_enter(&dn->dn_mtx); + ASSERT(dn->dn_phys->dn_type || dn->dn_allocated_txg); + ASSERT(dn->dn_free_txg == 0 || dn->dn_free_txg >= txg); + mutex_exit(&dn->dn_mtx); +#endif + + /* + * Determine old uid/gid when necessary + */ + dmu_objset_userquota_get_ids(dn, B_TRUE, tx); + + multilist_t *dirtylist = os->os_dirty_dnodes[txg & TXG_MASK]; + multilist_sublist_t *mls = multilist_sublist_lock_obj(dirtylist, dn); + + /* + * If we are already marked dirty, we're done. + */ + if (multilist_link_active(&dn->dn_dirty_link[txg & TXG_MASK])) { + multilist_sublist_unlock(mls); + return; + } + + ASSERT(!zfs_refcount_is_zero(&dn->dn_holds) || + !avl_is_empty(&dn->dn_dbufs)); + ASSERT(dn->dn_datablksz != 0); + ASSERT0(dn->dn_next_bonuslen[txg & TXG_MASK]); + ASSERT0(dn->dn_next_blksz[txg & TXG_MASK]); + ASSERT0(dn->dn_next_bonustype[txg & TXG_MASK]); + + dprintf_ds(os->os_dsl_dataset, "obj=%llu txg=%llu\n", + dn->dn_object, txg); + + multilist_sublist_insert_head(mls, dn); + + multilist_sublist_unlock(mls); + + /* + * The dnode maintains a hold on its containing dbuf as + * long as there are holds on it. 
Each instantiated child + * dbuf maintains a hold on the dnode. When the last child + * drops its hold, the dnode will drop its hold on the + * containing dbuf. We add a "dirty hold" here so that the + * dnode will hang around after we finish processing its + * children. + */ + VERIFY(dnode_add_ref(dn, (void *)(uintptr_t)tx->tx_txg)); + + (void) dbuf_dirty(dn->dn_dbuf, tx); + + dsl_dataset_dirty(os->os_dsl_dataset, tx); +} + +void +dnode_free(dnode_t *dn, dmu_tx_t *tx) +{ + mutex_enter(&dn->dn_mtx); + if (dn->dn_type == DMU_OT_NONE || dn->dn_free_txg) { + mutex_exit(&dn->dn_mtx); + return; + } + dn->dn_free_txg = tx->tx_txg; + mutex_exit(&dn->dn_mtx); + + dnode_setdirty(dn, tx); +} + +/* + * Try to change the block size for the indicated dnode. This can only + * succeed if there are no blocks allocated or dirty beyond first block + */ +int +dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int err; + + ASSERT3U(size, <=, spa_maxblocksize(dmu_objset_spa(dn->dn_objset))); + if (size == 0) + size = SPA_MINBLOCKSIZE; + else + size = P2ROUNDUP(size, SPA_MINBLOCKSIZE); + + if (ibs == dn->dn_indblkshift) + ibs = 0; + + if (size >> SPA_MINBLOCKSHIFT == dn->dn_datablkszsec && ibs == 0) + return (0); + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* Check for any allocated blocks beyond the first */ + if (dn->dn_maxblkid != 0) + goto fail; + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; + db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID && + db->db_blkid != DMU_SPILL_BLKID) { + mutex_exit(&dn->dn_dbufs_mtx); + goto fail; + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + if (ibs && dn->dn_nlevels != 1) + goto fail; + + /* resize the old block */ + err = dbuf_hold_impl(dn, 0, 0, TRUE, FALSE, FTAG, &db); + if (err == 0) { + dbuf_new_size(db, size, tx); + } else if (err != ENOENT) { + goto fail; + } + + dnode_setdblksz(dn, size); + dnode_setdirty(dn, tx); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = size; + if (ibs) { + dn->dn_indblkshift = ibs; + dn->dn_next_indblkshift[tx->tx_txg&TXG_MASK] = ibs; + } + /* release after we have fixed the blocksize in the dnode */ + if (db) + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); + return (0); + +fail: + rw_exit(&dn->dn_struct_rwlock); + return (SET_ERROR(ENOTSUP)); +} + +static void +dnode_set_nlevels_impl(dnode_t *dn, int new_nlevels, dmu_tx_t *tx) +{ + uint64_t txgoff = tx->tx_txg & TXG_MASK; + int old_nlevels = dn->dn_nlevels; + dmu_buf_impl_t *db; + list_t *list; + dbuf_dirty_record_t *new, *dr, *dr_next; + + ASSERT(RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + dn->dn_nlevels = new_nlevels; + + ASSERT3U(new_nlevels, >, dn->dn_next_nlevels[txgoff]); + dn->dn_next_nlevels[txgoff] = new_nlevels; + + /* dirty the left indirects */ + db = dbuf_hold_level(dn, old_nlevels, 0, FTAG); + ASSERT(db != NULL); + new = dbuf_dirty(db, tx); + dbuf_rele(db, FTAG); + + /* transfer the dirty records to the new indirect */ + mutex_enter(&dn->dn_mtx); + mutex_enter(&new->dt.di.dr_mtx); + list = &dn->dn_dirty_records[txgoff]; + for (dr = list_head(list); dr; dr = dr_next) { + dr_next = list_next(&dn->dn_dirty_records[txgoff], dr); + if (dr->dr_dbuf->db_level != new_nlevels-1 && + dr->dr_dbuf->db_blkid != DMU_BONUS_BLKID && + dr->dr_dbuf->db_blkid != DMU_SPILL_BLKID) { + ASSERT(dr->dr_dbuf->db_level == old_nlevels-1); + list_remove(&dn->dn_dirty_records[txgoff], dr); + list_insert_tail(&new->dt.di.dr_children, dr); + dr->dr_parent = new; + } + } + 
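+ /* + * Editorial note: at this point every old-top-level + * (old_nlevels - 1) dirty record, other than bonus and spill, + * hangs off the new indirect's dirty record and will be synced + * below the new root. + */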
mutex_exit(&new->dt.di.dr_mtx); + mutex_exit(&dn->dn_mtx); +} + +int +dnode_set_nlevels(dnode_t *dn, int nlevels, dmu_tx_t *tx) +{ + int ret = 0; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + if (dn->dn_nlevels == nlevels) { + ret = 0; + goto out; + } else if (nlevels < dn->dn_nlevels) { + ret = SET_ERROR(EINVAL); + goto out; + } + + dnode_set_nlevels_impl(dn, nlevels, tx); + +out: + rw_exit(&dn->dn_struct_rwlock); + return (ret); +} + +/* read-holding callers must not rely on the lock being continuously held */ +void +dnode_new_blkid(dnode_t *dn, uint64_t blkid, dmu_tx_t *tx, boolean_t have_read, + boolean_t force) +{ + int epbs, new_nlevels; + uint64_t sz; + + ASSERT(blkid != DMU_BONUS_BLKID); + + ASSERT(have_read ? + RW_READ_HELD(&dn->dn_struct_rwlock) : + RW_WRITE_HELD(&dn->dn_struct_rwlock)); + + /* + * if we have a read-lock, check to see if we need to do any work + * before upgrading to a write-lock. + */ + if (have_read) { + if (blkid <= dn->dn_maxblkid) + return; + + if (!rw_tryupgrade(&dn->dn_struct_rwlock)) { + rw_exit(&dn->dn_struct_rwlock); + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + } + } + + /* + * Raw sends (indicated by the force flag) require that we take the + * given blkid even if the value is lower than the current value. + */ + if (!force && blkid <= dn->dn_maxblkid) + goto out; + + /* + * We use the (otherwise unused) top bit of dn_next_maxblkid[txgoff] + * to indicate that this field is set. This allows us to set the + * maxblkid to 0 on an existing object in dnode_sync(). + */ + dn->dn_maxblkid = blkid; + dn->dn_next_maxblkid[tx->tx_txg & TXG_MASK] = + blkid | DMU_NEXT_MAXBLKID_SET; + + /* + * Compute the number of levels necessary to support the new maxblkid. + * Raw sends will ensure nlevels is set correctly for us. + */ + new_nlevels = 1; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (sz = dn->dn_nblkptr; + sz <= blkid && sz >= dn->dn_nblkptr; sz <<= epbs) + new_nlevels++; + + ASSERT3U(new_nlevels, <=, DN_MAX_LEVELS); + + if (!force) { + if (new_nlevels > dn->dn_nlevels) + dnode_set_nlevels_impl(dn, new_nlevels, tx); + } else { + ASSERT3U(dn->dn_nlevels, >=, new_nlevels); + } + +out: + if (have_read) + rw_downgrade(&dn->dn_struct_rwlock); +} + +static void +dnode_dirty_l1(dnode_t *dn, uint64_t l1blkid, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dbuf_hold_level(dn, 1, l1blkid, FTAG); + if (db != NULL) { + dmu_buf_will_dirty(&db->db, tx); + dbuf_rele(db, FTAG); + } +} + +/* + * Dirty all the in-core level-1 dbufs in the range specified by start_blkid + * and end_blkid. + */ +static void +dnode_dirty_l1range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid, + dmu_tx_t *tx) +{ + dmu_buf_impl_t db_search; + dmu_buf_impl_t *db; + avl_index_t where; + + mutex_enter(&dn->dn_dbufs_mtx); + + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + for (;;) { + + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + + if (db == NULL || db->db_level != 1 || + db->db_blkid >= end_blkid) { + break; + } + + /* + * Setup the next blkid we want to search for. + */ + db_search.db_blkid = db->db_blkid + 1; + ASSERT3U(db->db_blkid, >=, start_blkid); + + /* + * If the dbuf transitions to DB_EVICTING while we're trying + * to dirty it, then we will be unable to discover it in + * the dbuf hash table. This will result in a call to + * dbuf_create() which needs to acquire the dn_dbufs_mtx + * lock. 
To avoid a deadlock, we drop the lock before + * dirtying the level-1 dbuf. + */ + mutex_exit(&dn->dn_dbufs_mtx); + dnode_dirty_l1(dn, db->db_blkid, tx); + mutex_enter(&dn->dn_dbufs_mtx); + } + +#ifdef ZFS_DEBUG + /* + * Walk all the in-core level-1 dbufs and verify they have been dirtied. + */ + db_search.db_level = 1; + db_search.db_blkid = start_blkid + 1; + db_search.db_state = DB_SEARCH; + db = avl_find(&dn->dn_dbufs, &db_search, &where); + if (db == NULL) + db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER); + for (; db != NULL; db = AVL_NEXT(&dn->dn_dbufs, db)) { + if (db->db_level != 1 || db->db_blkid >= end_blkid) + break; + if (db->db_state != DB_EVICTING) + ASSERT(db->db_dirtycnt > 0); + } +#endif + mutex_exit(&dn->dn_dbufs_mtx); +} + +void +dnode_set_dirtyctx(dnode_t *dn, dmu_tx_t *tx, void *tag) +{ + /* + * Don't set dirtyctx to SYNC if we're just modifying this as we + * initialize the objset. + */ + if (dn->dn_dirtyctx == DN_UNDIRTIED) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + + if (ds != NULL) { + rrw_enter(&ds->ds_bp_rwlock, RW_READER, tag); + } + if (!BP_IS_HOLE(dn->dn_objset->os_rootbp)) { + if (dmu_tx_is_syncing(tx)) + dn->dn_dirtyctx = DN_DIRTY_SYNC; + else + dn->dn_dirtyctx = DN_DIRTY_OPEN; + dn->dn_dirtyctx_firstset = tag; + } + if (ds != NULL) { + rrw_exit(&ds->ds_bp_rwlock, tag); + } + } +} + +void +dnode_free_range(dnode_t *dn, uint64_t off, uint64_t len, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + uint64_t blkoff, blkid, nblks; + int blksz, blkshift, head, tail; + int trunc = FALSE; + int epbs; + + blksz = dn->dn_datablksz; + blkshift = dn->dn_datablkshift; + epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + if (len == DMU_OBJECT_END) { + len = UINT64_MAX - off; + trunc = TRUE; + } + + /* + * First, block align the region to free: + */ + if (ISP2(blksz)) { + head = P2NPHASE(off, blksz); + blkoff = P2PHASE(off, blksz); + if ((off >> blkshift) > dn->dn_maxblkid) + return; + } else { + ASSERT(dn->dn_maxblkid == 0); + if (off == 0 && len >= blksz) { + /* + * Freeing the whole block; fast-track this request. + */ + blkid = 0; + nblks = 1; + if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + dnode_dirty_l1(dn, 0, tx); + rw_exit(&dn->dn_struct_rwlock); + } + goto done; + } else if (off >= blksz) { + /* Freeing past end-of-data */ + return; + } else { + /* Freeing part of the block. 
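+ * E.g. (illustrative numbers, editorial): a single + * 6K block (blksz = 6144) with off = 1024 yields + * head = 5120, so bytes [1K, 6K) of the block are + * zeroed below (head is first capped to len).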
*/ + head = blksz - off; + ASSERT3U(head, >, 0); + } + blkoff = off; + } + /* zero out any partial block data at the start of the range */ + if (head) { + int res; + ASSERT3U(blkoff + head, ==, blksz); + if (len < head) + head = len; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off), + TRUE, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { + caddr_t data; + boolean_t dirty; + + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, + FTAG); + /* don't dirty if it isn't on disk and isn't dirty */ + dirty = !list_is_empty(&db->db_dirty_records) || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, dblt, FTAG); + if (dirty) { + dmu_buf_will_dirty(&db->db, tx); + data = db->db.db_data; + bzero(data + blkoff, head); + } + dbuf_rele(db, FTAG); + } + off += head; + len -= head; + } + + /* If the range was less than one block, we're done */ + if (len == 0) + return; + + /* If the remaining range is past end of file, we're done */ + if ((off >> blkshift) > dn->dn_maxblkid) + return; + + ASSERT(ISP2(blksz)); + if (trunc) + tail = 0; + else + tail = P2PHASE(len, blksz); + + ASSERT0(P2PHASE(off, blksz)); + /* zero out any partial block data at the end of the range */ + if (tail) { + int res; + if (len < tail) + tail = len; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + res = dbuf_hold_impl(dn, 0, dbuf_whichblock(dn, 0, off+len), + TRUE, FALSE, FTAG, &db); + rw_exit(&dn->dn_struct_rwlock); + if (res == 0) { + boolean_t dirty; + /* don't dirty if not on disk and not dirty */ + db_lock_type_t type = dmu_buf_lock_parent(db, RW_READER, + FTAG); + dirty = !list_is_empty(&db->db_dirty_records) || + (db->db_blkptr && !BP_IS_HOLE(db->db_blkptr)); + dmu_buf_unlock_parent(db, type, FTAG); + if (dirty) { + dmu_buf_will_dirty(&db->db, tx); + bzero(db->db.db_data, tail); + } + dbuf_rele(db, FTAG); + } + len -= tail; + } + + /* If the range did not include a full block, we are done */ + if (len == 0) + return; + + ASSERT(IS_P2ALIGNED(off, blksz)); + ASSERT(trunc || IS_P2ALIGNED(len, blksz)); + blkid = off >> blkshift; + nblks = len >> blkshift; + if (trunc) + nblks += 1; + + /* + * Dirty all the indirect blocks in this range. Note that only + * the first and last indirect blocks can actually be written + * (if they were partially freed) -- they must be dirtied, even if + * they do not exist on disk yet. The interior blocks will + * be freed by free_children(), so they will not actually be written. + * Even though these interior blocks will not be written, we + * dirty them for two reasons: + * + * - It ensures that the indirect blocks remain in memory until + * syncing context. (They have already been prefetched by + * dmu_tx_hold_free(), so we don't have to worry about reading + * them serially here.) + * + * - The dirty space accounting will put pressure on the txg sync + * mechanism to begin syncing, and to delay transactions if there + * is a large amount of freeing. Even though these indirect + * blocks will not be written, we could need to write the same + * amount of space if we copy the freed BPs into deadlists. 
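+ * + * Illustrative example (editorial, assumed geometry): with 1024 + * block pointers per indirect (epbs = 10), freeing L0 blocks 100 + * through 5000 dirties L1 blocks 0 and 4 here, dirties any in-core + * L1s between them via dnode_dirty_l1range(), and then uses + * dnode_next_offset() to find and dirty the remaining on-disk + * non-hole L1s in the interior.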
+ */ + if (dn->dn_nlevels > 1) { + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + uint64_t first, last; + + first = blkid >> epbs; + dnode_dirty_l1(dn, first, tx); + if (trunc) + last = dn->dn_maxblkid >> epbs; + else + last = (blkid + nblks - 1) >> epbs; + if (last != first) + dnode_dirty_l1(dn, last, tx); + + dnode_dirty_l1range(dn, first, last, tx); + + int shift = dn->dn_datablkshift + dn->dn_indblkshift - + SPA_BLKPTRSHIFT; + for (uint64_t i = first + 1; i < last; i++) { + /* + * Set i to the blockid of the next non-hole + * level-1 indirect block at or after i. Note + * that dnode_next_offset() operates in terms of + * level-0-equivalent bytes. + */ + uint64_t ibyte = i << shift; + int err = dnode_next_offset(dn, DNODE_FIND_HAVELOCK, + &ibyte, 2, 1, 0); + i = ibyte >> shift; + if (i >= last) + break; + + /* + * Normally we should not see an error, either + * from dnode_next_offset() or dbuf_hold_level() + * (except for ESRCH from dnode_next_offset). + * If there is an i/o error, then when we read + * this block in syncing context, it will use + * ZIO_FLAG_MUSTSUCCEED, and thus hang/panic according + * to the "failmode" property. dnode_next_offset() + * doesn't have a flag to indicate MUSTSUCCEED. + */ + if (err != 0) + break; + + dnode_dirty_l1(dn, i, tx); + } + rw_exit(&dn->dn_struct_rwlock); + } + +done: + /* + * Add this range to the dnode range list. + * We will finish up this free operation in the syncing phase. + */ + mutex_enter(&dn->dn_mtx); + { + int txgoff = tx->tx_txg & TXG_MASK; + if (dn->dn_free_ranges[txgoff] == NULL) { + dn->dn_free_ranges[txgoff] = range_tree_create(NULL, + RANGE_SEG64, NULL, 0, 0); + } + range_tree_clear(dn->dn_free_ranges[txgoff], blkid, nblks); + range_tree_add(dn->dn_free_ranges[txgoff], blkid, nblks); + } + dprintf_dnode(dn, "blkid=%llu nblks=%llu txg=%llu\n", + blkid, nblks, tx->tx_txg); + mutex_exit(&dn->dn_mtx); + + dbuf_free_range(dn, blkid, blkid + nblks - 1, tx); + dnode_setdirty(dn, tx); +} + +static boolean_t +dnode_spill_freed(dnode_t *dn) +{ + int i; + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_rm_spillblk[i] == DN_KILL_SPILLBLK) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* return TRUE if this blkid was freed in a recent txg, or FALSE if it wasn't */ +uint64_t +dnode_block_freed(dnode_t *dn, uint64_t blkid) +{ + void *dp = spa_get_dsl(dn->dn_objset->os_spa); + int i; + + if (blkid == DMU_BONUS_BLKID) + return (FALSE); + + /* + * If we're in the process of opening the pool, dp will not be + * set yet, but there shouldn't be anything dirty. 
+ */ + if (dp == NULL) + return (FALSE); + + if (dn->dn_free_txg) + return (TRUE); + + if (blkid == DMU_SPILL_BLKID) + return (dnode_spill_freed(dn)); + + mutex_enter(&dn->dn_mtx); + for (i = 0; i < TXG_SIZE; i++) { + if (dn->dn_free_ranges[i] != NULL && + range_tree_contains(dn->dn_free_ranges[i], blkid, 1)) + break; + } + mutex_exit(&dn->dn_mtx); + return (i < TXG_SIZE); +} + +/* call from syncing context when we actually write/free space for this dnode */ +void +dnode_diduse_space(dnode_t *dn, int64_t delta) +{ + uint64_t space; + dprintf_dnode(dn, "dn=%p dnp=%p used=%llu delta=%lld\n", + dn, dn->dn_phys, + (u_longlong_t)dn->dn_phys->dn_used, + (longlong_t)delta); + + mutex_enter(&dn->dn_mtx); + space = DN_USED_BYTES(dn->dn_phys); + if (delta > 0) { + ASSERT3U(space + delta, >=, space); /* no overflow */ + } else { + ASSERT3U(space, >=, -delta); /* no underflow */ + } + space += delta; + if (spa_version(dn->dn_objset->os_spa) < SPA_VERSION_DNODE_BYTES) { + ASSERT((dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) == 0); + ASSERT0(P2PHASE(space, 1<<DEV_BSHIFT)); + dn->dn_phys->dn_used = space >> DEV_BSHIFT; + } else { + dn->dn_phys->dn_used = space; + dn->dn_phys->dn_flags |= DNODE_FLAG_USED_BYTES; + } + mutex_exit(&dn->dn_mtx); +} + +/* + * Scans a block at the indicated "level" looking for a hole or data, + * depending on 'flags'. + * + * If level > 0, then we are scanning an indirect block looking at its + * pointers. If level == 0, then we are looking at a block of dnodes. + * + * If we don't find what we are looking for in the block, we return ESRCH. + * Otherwise, return with *offset pointing to the beginning (if searching + * forwards) or end (if searching backwards) of the range covered by the + * block pointer we matched on (or dnode). + * + * The basic search algorithm used below by dnode_next_offset() is to + * use this function to search up the block tree (widen the search) until + * we find something (i.e., we don't return ESRCH) and then search back + * down the tree (narrow the search) until we reach our original search + * level. + */ +static int +dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, + int lvl, uint64_t blkfill, uint64_t txg) +{ + dmu_buf_impl_t *db = NULL; + void *data = NULL; + uint64_t epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + uint64_t epb = 1ULL << epbs; + uint64_t minfill, maxfill; + boolean_t hole; + int i, inc, error, span; + + ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); + + hole = ((flags & DNODE_FIND_HOLE) != 0); + inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; + ASSERT(txg == 0 || !hole); + + if (lvl == dn->dn_phys->dn_nlevels) { + error = 0; + epb = dn->dn_phys->dn_nblkptr; + data = dn->dn_phys->dn_blkptr; + } else { + uint64_t blkid = dbuf_whichblock(dn, lvl, *offset); + error = dbuf_hold_impl(dn, lvl, blkid, TRUE, FALSE, FTAG, &db); + if (error) { + if (error != ENOENT) + return (error); + if (hole) + return (0); + /* + * This can only happen when we are searching up + * the block tree for data. We don't really need to + * adjust the offset, as we will just end up looking + * at the pointer to this block in its parent, and it's + * going to be unallocated, so we will skip over it.
+ */ + return (SET_ERROR(ESRCH)); + } + error = dbuf_read(db, NULL, + DB_RF_CANFAIL | DB_RF_HAVESTRUCT | DB_RF_NO_DECRYPT); + if (error) { + dbuf_rele(db, FTAG); + return (error); + } + data = db->db.db_data; + rw_enter(&db->db_rwlock, RW_READER); + } + + if (db != NULL && txg != 0 && (db->db_blkptr == NULL || + db->db_blkptr->blk_birth <= txg || + BP_IS_HOLE(db->db_blkptr))) { + /* + * This can only happen when we are searching up the tree + * and these conditions mean that we need to keep climbing. + */ + error = SET_ERROR(ESRCH); + } else if (lvl == 0) { + dnode_phys_t *dnp = data; + + ASSERT(dn->dn_type == DMU_OT_DNODE); + ASSERT(!(flags & DNODE_FIND_BACKWARDS)); + + for (i = (*offset >> DNODE_SHIFT) & (blkfill - 1); + i < blkfill; i += dnp[i].dn_extra_slots + 1) { + if ((dnp[i].dn_type == DMU_OT_NONE) == hole) + break; + } + + if (i == blkfill) + error = SET_ERROR(ESRCH); + + *offset = (*offset & ~(DNODE_BLOCK_SIZE - 1)) + + (i << DNODE_SHIFT); + } else { + blkptr_t *bp = data; + uint64_t start = *offset; + span = (lvl - 1) * epbs + dn->dn_datablkshift; + minfill = 0; + maxfill = blkfill << ((lvl - 1) * epbs); + + if (hole) + maxfill--; + else + minfill++; + + if (span >= 8 * sizeof (*offset)) { + /* This only happens on the highest indirection level */ + ASSERT3U((lvl - 1), ==, dn->dn_phys->dn_nlevels - 1); + *offset = 0; + } else { + *offset = *offset >> span; + } + + for (i = BF64_GET(*offset, 0, epbs); + i >= 0 && i < epb; i += inc) { + if (BP_GET_FILL(&bp[i]) >= minfill && + BP_GET_FILL(&bp[i]) <= maxfill && + (hole || bp[i].blk_birth > txg)) + break; + if (inc > 0 || *offset > 0) + *offset += inc; + } + + if (span >= 8 * sizeof (*offset)) { + *offset = start; + } else { + *offset = *offset << span; + } + + if (inc < 0) { + /* traversing backwards; position offset at the end */ + ASSERT3U(*offset, <=, start); + *offset = MIN(*offset + (1ULL << span) - 1, start); + } else if (*offset < start) { + *offset = start; + } + if (i < 0 || i >= epb) + error = SET_ERROR(ESRCH); + } + + if (db != NULL) { + rw_exit(&db->db_rwlock); + dbuf_rele(db, FTAG); + } + + return (error); +} + +/* + * Find the next hole, data, or sparse region at or after *offset. + * The value 'blkfill' tells us how many items we expect to find + * in an L0 data block; this value is 1 for normal objects, + * DNODES_PER_BLOCK for the meta dnode, and some fraction of + * DNODES_PER_BLOCK when searching for sparse regions thereof. + * + * Examples: + * + * dnode_next_offset(dn, flags, offset, 1, 1, 0); + * Finds the next/previous hole/data in a file. + * Used in dmu_offset_next(). + * + * dnode_next_offset(mdn, flags, offset, 0, DNODES_PER_BLOCK, txg); + * Finds the next free/allocated dnode in an objset's meta-dnode. + * Only finds objects that have new contents since txg (i.e. + * bonus buffer changes and content removal are ignored). + * Used in dmu_object_next(). + * + * dnode_next_offset(mdn, DNODE_FIND_HOLE, offset, 2, DNODES_PER_BLOCK >> 2, 0); + * Finds the next L2 meta-dnode bp that's at most 1/4 full. + * Used in dmu_object_alloc().
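+ * + * In each case, on success *offset is updated in place, and ESRCH + * means nothing matching was found at or after (or, with + * DNODE_FIND_BACKWARDS, at or before) the starting *offset.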
+ */ +int +dnode_next_offset(dnode_t *dn, int flags, uint64_t *offset, + int minlvl, uint64_t blkfill, uint64_t txg) +{ + uint64_t initial_offset = *offset; + int lvl, maxlvl; + int error = 0; + + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_enter(&dn->dn_struct_rwlock, RW_READER); + + if (dn->dn_phys->dn_nlevels == 0) { + error = SET_ERROR(ESRCH); + goto out; + } + + if (dn->dn_datablkshift == 0) { + if (*offset < dn->dn_datablksz) { + if (flags & DNODE_FIND_HOLE) + *offset = dn->dn_datablksz; + } else { + error = SET_ERROR(ESRCH); + } + goto out; + } + + maxlvl = dn->dn_phys->dn_nlevels; + + for (lvl = minlvl; lvl <= maxlvl; lvl++) { + error = dnode_next_offset_level(dn, + flags, offset, lvl, blkfill, txg); + if (error != ESRCH) + break; + } + + while (error == 0 && --lvl >= minlvl) { + error = dnode_next_offset_level(dn, + flags, offset, lvl, blkfill, txg); + } + + /* + * There's always a "virtual hole" at the end of the object, even + * if all BP's which physically exist are non-holes. + */ + if ((flags & DNODE_FIND_HOLE) && error == ESRCH && txg == 0 && + minlvl == 1 && blkfill == 1 && !(flags & DNODE_FIND_BACKWARDS)) { + error = 0; + } + + if (error == 0 && (flags & DNODE_FIND_BACKWARDS ? + initial_offset < *offset : initial_offset > *offset)) + error = SET_ERROR(ESRCH); +out: + if (!(flags & DNODE_FIND_HAVELOCK)) + rw_exit(&dn->dn_struct_rwlock); + + return (error); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(dnode_hold); +EXPORT_SYMBOL(dnode_rele); +EXPORT_SYMBOL(dnode_set_nlevels); +EXPORT_SYMBOL(dnode_set_blksz); +EXPORT_SYMBOL(dnode_free_range); +EXPORT_SYMBOL(dnode_evict_dbufs); +EXPORT_SYMBOL(dnode_evict_bonus); +#endif diff --git a/module/zfs/dnode_sync.c b/module/zfs/dnode_sync.c new file mode 100644 index 000000000000..eafea3403c71 --- /dev/null +++ b/module/zfs/dnode_sync.c @@ -0,0 +1,846 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ */ + +#include <sys/zfs_context.h> +#include <sys/dbuf.h> +#include <sys/dnode.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dmu_objset.h> +#include <sys/dmu_recv.h> +#include <sys/dsl_dataset.h> +#include <sys/spa.h> +#include <sys/range_tree.h> +#include <sys/zfeature.h> + +static void +dnode_increase_indirection(dnode_t *dn, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db; + int txgoff = tx->tx_txg & TXG_MASK; + int nblkptr = dn->dn_phys->dn_nblkptr; + int old_toplvl = dn->dn_phys->dn_nlevels - 1; + int new_level = dn->dn_next_nlevels[txgoff]; + int i; + + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + + /* this dnode can't be paged out because it's dirty */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(new_level > 1 && dn->dn_phys->dn_nlevels > 0); + + db = dbuf_hold_level(dn, dn->dn_phys->dn_nlevels, 0, FTAG); + ASSERT(db != NULL); + + dn->dn_phys->dn_nlevels = new_level; + dprintf("os=%p obj=%llu, increase to %d\n", dn->dn_objset, + dn->dn_object, dn->dn_phys->dn_nlevels); + + /* + * Lock ordering requires that we hold the children's db_mutexes (by + * calling dbuf_find()) before holding the parent's db_rwlock. The lock + * order is imposed by dbuf_read's steps of "grab the lock to protect + * db_parent, get db_parent, hold db_parent's db_rwlock". + */ + dmu_buf_impl_t *children[DN_MAX_NBLKPTR]; + ASSERT3U(nblkptr, <=, DN_MAX_NBLKPTR); + for (i = 0; i < nblkptr; i++) { + children[i] = + dbuf_find(dn->dn_objset, dn->dn_object, old_toplvl, i); + } + + /* transfer dnode's block pointers to new indirect block */ + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED|DB_RF_HAVESTRUCT); + if (dn->dn_dbuf != NULL) + rw_enter(&dn->dn_dbuf->db_rwlock, RW_WRITER); + rw_enter(&db->db_rwlock, RW_WRITER); + ASSERT(db->db.db_data); + ASSERT(arc_released(db->db_buf)); + ASSERT3U(sizeof (blkptr_t) * nblkptr, <=, db->db.db_size); + bcopy(dn->dn_phys->dn_blkptr, db->db.db_data, + sizeof (blkptr_t) * nblkptr); + arc_buf_freeze(db->db_buf); + + /* set dbuf's parent pointers to new indirect buf */ + for (i = 0; i < nblkptr; i++) { + dmu_buf_impl_t *child = children[i]; + + if (child == NULL) + continue; +#ifdef ZFS_DEBUG + DB_DNODE_ENTER(child); + ASSERT3P(DB_DNODE(child), ==, dn); + DB_DNODE_EXIT(child); +#endif /* ZFS_DEBUG */ + if (child->db_parent && child->db_parent != dn->dn_dbuf) { + ASSERT(child->db_parent->db_level == db->db_level); + ASSERT(child->db_blkptr != + &dn->dn_phys->dn_blkptr[child->db_blkid]); + mutex_exit(&child->db_mtx); + continue; + } + ASSERT(child->db_parent == NULL || + child->db_parent == dn->dn_dbuf); + + child->db_parent = db; + dbuf_add_ref(db, child); + if (db->db.db_data) + child->db_blkptr = (blkptr_t *)db->db.db_data + i; + else + child->db_blkptr = NULL; + dprintf_dbuf_bp(child, child->db_blkptr, + "changed db_blkptr to new indirect %s", ""); + + mutex_exit(&child->db_mtx); + } + + bzero(dn->dn_phys->dn_blkptr, sizeof (blkptr_t) * nblkptr); + + rw_exit(&db->db_rwlock); + if (dn->dn_dbuf != NULL) + rw_exit(&dn->dn_dbuf->db_rwlock); + + dbuf_rele(db, FTAG); + + rw_exit(&dn->dn_struct_rwlock); +} + +static void +free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + uint64_t bytesfreed = 0; + + dprintf("ds=%p obj=%llx num=%d\n", ds, dn->dn_object, num); + + for (int i = 0; i < num; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + + bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); + ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); + + /* + * Save some useful information on the holes being + * punched, including logical size, type, and indirection + * level.
Retaining birth time enables detection of when + * holes are punched for reducing the number of free + * records transmitted during a zfs send. + */ + + uint64_t lsize = BP_GET_LSIZE(bp); + dmu_object_type_t type = BP_GET_TYPE(bp); + uint64_t lvl = BP_GET_LEVEL(bp); + + bzero(bp, sizeof (blkptr_t)); + + if (spa_feature_is_active(dn->dn_objset->os_spa, + SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, type); + BP_SET_LEVEL(bp, lvl); + BP_SET_BIRTH(bp, dmu_tx_get_txg(tx), 0); + } + } + dnode_diduse_space(dn, -bytesfreed); +} + +#ifdef ZFS_DEBUG +static void +free_verify(dmu_buf_impl_t *db, uint64_t start, uint64_t end, dmu_tx_t *tx) +{ + int off, num; + int i, err, epbs; + uint64_t txg = tx->tx_txg; + dnode_t *dn; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + off = start - (db->db_blkid * 1<<epbs); + num = end - start + 1; + + ASSERT3U(off, >=, 0); + ASSERT3U(num, >=, 0); + ASSERT3U(db->db_level, >, 0); + ASSERT3U(db->db.db_size, ==, 1 << dn->dn_phys->dn_indblkshift); + ASSERT3U(off+num, <=, db->db.db_size >> SPA_BLKPTRSHIFT); + ASSERT(db->db_blkptr != NULL); + + for (i = off; i < off+num; i++) { + uint64_t *buf; + dmu_buf_impl_t *child; + dbuf_dirty_record_t *dr; + int j; + + ASSERT(db->db_level == 1); + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + err = dbuf_hold_impl(dn, db->db_level - 1, + (db->db_blkid << epbs) + i, TRUE, FALSE, FTAG, &child); + rw_exit(&dn->dn_struct_rwlock); + if (err == ENOENT) + continue; + ASSERT(err == 0); + ASSERT(child->db_level == 0); + dr = dbuf_find_dirty_eq(child, txg); + + /* data_old better be zeroed */ + if (dr) { + buf = dr->dt.dl.dr_data->b_data; + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); + } + } + } + + /* + * db_data better be zeroed unless it's dirty in a + * future txg. + */ + mutex_enter(&child->db_mtx); + buf = child->db.db_data; + if (buf != NULL && child->db_state != DB_FILL && + list_is_empty(&child->db_dirty_records)) { + for (j = 0; j < child->db.db_size >> 3; j++) { + if (buf[j] != 0) { + panic("freed data not zero: " + "child=%p i=%d off=%d num=%d\n", + (void *)child, i, off, num); + } + } + } + mutex_exit(&child->db_mtx); + + dbuf_rele(child, FTAG); + } + DB_DNODE_EXIT(db); +} +#endif + +/* + * We don't usually free the indirect blocks here. If in one txg we have a + * free_range and a write to the same indirect block, it's important that we + * preserve the hole's birth times. Therefore, we don't free any indirect + * blocks in free_children(). If an indirect block happens to turn into all + * holes, it will be freed by dbuf_write_children_ready, which happens at a + * point in the syncing process where we know for certain the contents of the + * indirect block. + * + * However, if we're freeing a dnode, its space accounting must go to zero + * before we actually try to free the dnode, or we will trip an assertion. In + * addition, we know the case described above cannot occur, because the dnode is + * being freed. Therefore, we free the indirect blocks immediately in that + * case.
+ */ +static void +free_children(dmu_buf_impl_t *db, uint64_t blkid, uint64_t nblks, + boolean_t free_indirects, dmu_tx_t *tx) +{ + dnode_t *dn; + blkptr_t *bp; + dmu_buf_impl_t *subdb; + uint64_t start, end, dbstart, dbend; + unsigned int epbs, shift, i; + + /* + * There is a small possibility that this block will not be cached: + * 1 - if level > 1 and there are no children with level <= 1 + * 2 - if this block was evicted since we read it from + * dmu_tx_hold_free(). + */ + if (db->db_state != DB_CACHED) + (void) dbuf_read(db, NULL, DB_RF_MUST_SUCCEED); + + /* + * If we modify this indirect block, and we are not freeing the + * dnode (!free_indirects), then this indirect block needs to get + * written to disk by dbuf_write(). If it is dirty, we know it will + * be written (otherwise, we would have incorrect on-disk state + * because the space would be freed but still referenced by the BP + * in this indirect block). Therefore we VERIFY that it is + * dirty. + * + * Our VERIFY covers some cases that do not actually have to be + * dirty, but the open-context code happens to dirty. E.g. if the + * blocks we are freeing are all holes, because in that case, we + * are only freeing part of this indirect block, so it is an + * ancestor of the first or last block to be freed. The first and + * last L1 indirect blocks are always dirtied by dnode_free_range(). + */ + db_lock_type_t dblt = dmu_buf_lock_parent(db, RW_READER, FTAG); + VERIFY(BP_GET_FILL(db->db_blkptr) == 0 || db->db_dirtycnt > 0); + dmu_buf_unlock_parent(db, dblt, FTAG); + + dbuf_release_bp(db); + bp = db->db.db_data; + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + epbs = dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT; + ASSERT3U(epbs, <, 31); + shift = (db->db_level - 1) * epbs; + dbstart = db->db_blkid << epbs; + start = blkid >> shift; + if (dbstart < start) { + bp += start - dbstart; + } else { + start = dbstart; + } + dbend = ((db->db_blkid + 1) << epbs) - 1; + end = (blkid + nblks - 1) >> shift; + if (dbend <= end) + end = dbend; + + ASSERT3U(start, <=, end); + + if (db->db_level == 1) { + FREE_VERIFY(db, start, end, tx); + rw_enter(&db->db_rwlock, RW_WRITER); + free_blocks(dn, bp, end - start + 1, tx); + rw_exit(&db->db_rwlock); + } else { + for (uint64_t id = start; id <= end; id++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, db->db_level - 1, + id, TRUE, FALSE, FTAG, &subdb)); + rw_exit(&dn->dn_struct_rwlock); + ASSERT3P(bp, ==, subdb->db_blkptr); + + free_children(subdb, blkid, nblks, free_indirects, tx); + dbuf_rele(subdb, FTAG); + } + } + + if (free_indirects) { + rw_enter(&db->db_rwlock, RW_WRITER); + for (i = 0, bp = db->db.db_data; i < 1 << epbs; i++, bp++) + ASSERT(BP_IS_HOLE(bp)); + bzero(db->db.db_data, db->db.db_size); + free_blocks(dn, db->db_blkptr, 1, tx); + rw_exit(&db->db_rwlock); + } + + DB_DNODE_EXIT(db); + arc_buf_freeze(db->db_buf); +} + +/* + * Traverse the indicated range of the provided file + * and "free" all the blocks contained there. 
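+ * blkid and nblks are counted in units of the dnode's level-0 data + * blocks, matching the ranges accumulated in dn_free_ranges by + * dnode_free_range().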
+ */ +static void +dnode_sync_free_range_impl(dnode_t *dn, uint64_t blkid, uint64_t nblks, + boolean_t free_indirects, dmu_tx_t *tx) +{ + blkptr_t *bp = dn->dn_phys->dn_blkptr; + int dnlevel = dn->dn_phys->dn_nlevels; + boolean_t trunc = B_FALSE; + + if (blkid > dn->dn_phys->dn_maxblkid) + return; + + ASSERT(dn->dn_phys->dn_maxblkid < UINT64_MAX); + if (blkid + nblks > dn->dn_phys->dn_maxblkid) { + nblks = dn->dn_phys->dn_maxblkid - blkid + 1; + trunc = B_TRUE; + } + + /* There are no indirect blocks in the object */ + if (dnlevel == 1) { + if (blkid >= dn->dn_phys->dn_nblkptr) { + /* this range was never made persistent */ + return; + } + ASSERT3U(blkid + nblks, <=, dn->dn_phys->dn_nblkptr); + free_blocks(dn, bp + blkid, nblks, tx); + } else { + int shift = (dnlevel - 1) * + (dn->dn_phys->dn_indblkshift - SPA_BLKPTRSHIFT); + int start = blkid >> shift; + int end = (blkid + nblks - 1) >> shift; + dmu_buf_impl_t *db; + + ASSERT(start < dn->dn_phys->dn_nblkptr); + bp += start; + for (int i = start; i <= end; i++, bp++) { + if (BP_IS_HOLE(bp)) + continue; + rw_enter(&dn->dn_struct_rwlock, RW_READER); + VERIFY0(dbuf_hold_impl(dn, dnlevel - 1, i, + TRUE, FALSE, FTAG, &db)); + rw_exit(&dn->dn_struct_rwlock); + free_children(db, blkid, nblks, free_indirects, tx); + dbuf_rele(db, FTAG); + } + } + + /* + * Do not truncate the maxblkid if we are performing a raw + * receive. The raw receive sets the maxblkid manually and + * must not be overridden. Usually, the last DRR_FREE record + * will be at the maxblkid, because the source system sets + * the maxblkid when truncating. However, if the last block + * was freed by overwriting with zeros and being compressed + * away to a hole, the source system will generate a DRR_FREE + * record while leaving the maxblkid after the end of that + * record. In this case we need to leave the maxblkid as + * indicated in the DRR_OBJECT record, so that it matches the + * source system, ensuring that the cryptographic hashes will + * match. + */ + if (trunc && !dn->dn_objset->os_raw_receive) { + uint64_t off __maybe_unused; + dn->dn_phys->dn_maxblkid = blkid == 0 ? 0 : blkid - 1; + + off = (dn->dn_phys->dn_maxblkid + 1) * + (dn->dn_phys->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(off < dn->dn_phys->dn_maxblkid || + dn->dn_phys->dn_maxblkid == 0 || + dnode_next_offset(dn, 0, &off, 1, 1, 0) != 0); + } +} + +typedef struct dnode_sync_free_range_arg { + dnode_t *dsfra_dnode; + dmu_tx_t *dsfra_tx; + boolean_t dsfra_free_indirects; +} dnode_sync_free_range_arg_t; + +static void +dnode_sync_free_range(void *arg, uint64_t blkid, uint64_t nblks) +{ + dnode_sync_free_range_arg_t *dsfra = arg; + dnode_t *dn = dsfra->dsfra_dnode; + + mutex_exit(&dn->dn_mtx); + dnode_sync_free_range_impl(dn, blkid, nblks, + dsfra->dsfra_free_indirects, dsfra->dsfra_tx); + mutex_enter(&dn->dn_mtx); +} + +/* + * Try to kick all the dnode's dbufs out of the cache... 
+ */ +void +dnode_evict_dbufs(dnode_t *dn) +{ + dmu_buf_impl_t *db_marker; + dmu_buf_impl_t *db, *db_next; + + db_marker = kmem_alloc(sizeof (dmu_buf_impl_t), KM_SLEEP); + + mutex_enter(&dn->dn_dbufs_mtx); + for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) { + +#ifdef ZFS_DEBUG + DB_DNODE_ENTER(db); + ASSERT3P(DB_DNODE(db), ==, dn); + DB_DNODE_EXIT(db); +#endif /* DEBUG */ + + mutex_enter(&db->db_mtx); + if (db->db_state != DB_EVICTING && + zfs_refcount_is_zero(&db->db_holds)) { + db_marker->db_level = db->db_level; + db_marker->db_blkid = db->db_blkid; + db_marker->db_state = DB_SEARCH; + avl_insert_here(&dn->dn_dbufs, db_marker, db, + AVL_BEFORE); + + /* + * We need to use the "marker" dbuf rather than + * simply getting the next dbuf, because + * dbuf_destroy() may actually remove multiple dbufs. + * It can call itself recursively on the parent dbuf, + * which may also be removed from dn_dbufs. The code + * flow would look like: + * + * dbuf_destroy(): + * dnode_rele_and_unlock(parent_dbuf, evicting=TRUE): + * if (!cacheable || pending_evict) + * dbuf_destroy() + */ + dbuf_destroy(db); + + db_next = AVL_NEXT(&dn->dn_dbufs, db_marker); + avl_remove(&dn->dn_dbufs, db_marker); + } else { + db->db_pending_evict = TRUE; + mutex_exit(&db->db_mtx); + db_next = AVL_NEXT(&dn->dn_dbufs, db); + } + } + mutex_exit(&dn->dn_dbufs_mtx); + + kmem_free(db_marker, sizeof (dmu_buf_impl_t)); + + dnode_evict_bonus(dn); +} + +void +dnode_evict_bonus(dnode_t *dn) +{ + rw_enter(&dn->dn_struct_rwlock, RW_WRITER); + if (dn->dn_bonus != NULL) { + if (zfs_refcount_is_zero(&dn->dn_bonus->db_holds)) { + mutex_enter(&dn->dn_bonus->db_mtx); + dbuf_destroy(dn->dn_bonus); + dn->dn_bonus = NULL; + } else { + dn->dn_bonus->db_pending_evict = TRUE; + } + } + rw_exit(&dn->dn_struct_rwlock); +} + +static void +dnode_undirty_dbufs(list_t *list) +{ + dbuf_dirty_record_t *dr; + + while ((dr = list_head(list))) { + dmu_buf_impl_t *db = dr->dr_dbuf; + uint64_t txg = dr->dr_txg; + + if (db->db_level != 0) + dnode_undirty_dbufs(&dr->dt.di.dr_children); + + mutex_enter(&db->db_mtx); + /* XXX - use dbuf_undirty()? */ + list_remove(list, dr); + ASSERT(list_head(&db->db_dirty_records) == dr); + list_remove_head(&db->db_dirty_records); + ASSERT(list_is_empty(&db->db_dirty_records)); + db->db_dirtycnt -= 1; + if (db->db_level == 0) { + ASSERT(db->db_blkid == DMU_BONUS_BLKID || + dr->dt.dl.dr_data == db->db_buf); + dbuf_unoverride(dr); + } else { + mutex_destroy(&dr->dt.di.dr_mtx); + list_destroy(&dr->dt.di.dr_children); + } + kmem_free(dr, sizeof (dbuf_dirty_record_t)); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg, B_FALSE); + } +} + +static void +dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) +{ + int txgoff = tx->tx_txg & TXG_MASK; + + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * Our contents should have been freed in dnode_sync() by the + * free range record inserted by the caller of dnode_free(). + */ + ASSERT0(DN_USED_BYTES(dn->dn_phys)); + ASSERT(BP_IS_HOLE(dn->dn_phys->dn_blkptr)); + + dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]); + dnode_evict_dbufs(dn); + + /* + * XXX - It would be nice to assert this, but we may still + * have residual holds from async evictions from the arc... + * + * zfs_obj_to_path() also depends on this being + * commented out. 
+ * + * ASSERT3U(zfs_refcount_count(&dn->dn_holds), ==, 1); + */ + + /* Undirty next bits */ + dn->dn_next_nlevels[txgoff] = 0; + dn->dn_next_indblkshift[txgoff] = 0; + dn->dn_next_blksz[txgoff] = 0; + dn->dn_next_maxblkid[txgoff] = 0; + + /* ASSERT(blkptrs are zero); */ + ASSERT(dn->dn_phys->dn_type != DMU_OT_NONE); + ASSERT(dn->dn_type != DMU_OT_NONE); + + ASSERT(dn->dn_free_txg > 0); + if (dn->dn_allocated_txg != dn->dn_free_txg) + dmu_buf_will_dirty(&dn->dn_dbuf->db, tx); + bzero(dn->dn_phys, sizeof (dnode_phys_t) * dn->dn_num_slots); + dnode_free_interior_slots(dn); + + mutex_enter(&dn->dn_mtx); + dn->dn_type = DMU_OT_NONE; + dn->dn_maxblkid = 0; + dn->dn_allocated_txg = 0; + dn->dn_free_txg = 0; + dn->dn_have_spill = B_FALSE; + dn->dn_num_slots = 1; + mutex_exit(&dn->dn_mtx); + + ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); + + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + /* + * Now that we've released our hold, the dnode may + * be evicted, so we mustn't access it. + */ +} + +/* + * Write out the dnode's dirty buffers. + */ +void +dnode_sync(dnode_t *dn, dmu_tx_t *tx) +{ + objset_t *os = dn->dn_objset; + dnode_phys_t *dnp = dn->dn_phys; + int txgoff = tx->tx_txg & TXG_MASK; + list_t *list = &dn->dn_dirty_records[txgoff]; + static const dnode_phys_t zerodn __maybe_unused = { 0 }; + boolean_t kill_spill = B_FALSE; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_MIN_SIZE) == 0); + DNODE_VERIFY(dn); + + ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + + /* + * Do user accounting if it is enabled and this is not + * an encrypted receive. + */ + if (dmu_objset_userused_enabled(os) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object) && + (!os->os_encrypted || !dmu_objset_is_receiving(os))) { + mutex_enter(&dn->dn_mtx); + dn->dn_oldused = DN_USED_BYTES(dn->dn_phys); + dn->dn_oldflags = dn->dn_phys->dn_flags; + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + if (dmu_objset_userobjused_enabled(dn->dn_objset)) + dn->dn_phys->dn_flags |= + DNODE_FLAG_USEROBJUSED_ACCOUNTED; + mutex_exit(&dn->dn_mtx); + dmu_objset_userquota_get_ids(dn, B_FALSE, tx); + } else { + /* Once we account for it, we should always account for it */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USEROBJUSED_ACCOUNTED)); + } + + mutex_enter(&dn->dn_mtx); + if (dn->dn_allocated_txg == tx->tx_txg) { + /* The dnode is newly allocated or reallocated */ + if (dnp->dn_type == DMU_OT_NONE) { + /* this is a first alloc, not a realloc */ + dnp->dn_nlevels = 1; + dnp->dn_nblkptr = dn->dn_nblkptr; + } + + dnp->dn_type = dn->dn_type; + dnp->dn_bonustype = dn->dn_bonustype; + dnp->dn_bonuslen = dn->dn_bonuslen; + } + + dnp->dn_extra_slots = dn->dn_num_slots - 1; + + ASSERT(dnp->dn_nlevels > 1 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_IS_EMBEDDED(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == + dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); + ASSERT(dnp->dn_nlevels < 2 || + BP_IS_HOLE(&dnp->dn_blkptr[0]) || + BP_GET_LSIZE(&dnp->dn_blkptr[0]) == 1 << dnp->dn_indblkshift); + + if (dn->dn_next_type[txgoff] != 0) { + dnp->dn_type = dn->dn_type; + dn->dn_next_type[txgoff] = 0; + } + + if (dn->dn_next_blksz[txgoff] != 0) { + ASSERT(P2PHASE(dn->dn_next_blksz[txgoff], + SPA_MINBLOCKSIZE) == 0); + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[0]) || + dn->dn_maxblkid == 0 || list_head(list) != NULL || + dn->dn_next_blksz[txgoff] >> 
SPA_MINBLOCKSHIFT == + dnp->dn_datablkszsec || + !range_tree_is_empty(dn->dn_free_ranges[txgoff])); + dnp->dn_datablkszsec = + dn->dn_next_blksz[txgoff] >> SPA_MINBLOCKSHIFT; + dn->dn_next_blksz[txgoff] = 0; + } + + if (dn->dn_next_bonuslen[txgoff] != 0) { + if (dn->dn_next_bonuslen[txgoff] == DN_ZERO_BONUSLEN) + dnp->dn_bonuslen = 0; + else + dnp->dn_bonuslen = dn->dn_next_bonuslen[txgoff]; + ASSERT(dnp->dn_bonuslen <= + DN_SLOTS_TO_BONUSLEN(dnp->dn_extra_slots + 1)); + dn->dn_next_bonuslen[txgoff] = 0; + } + + if (dn->dn_next_bonustype[txgoff] != 0) { + ASSERT(DMU_OT_IS_VALID(dn->dn_next_bonustype[txgoff])); + dnp->dn_bonustype = dn->dn_next_bonustype[txgoff]; + dn->dn_next_bonustype[txgoff] = 0; + } + + boolean_t freeing_dnode = dn->dn_free_txg > 0 && + dn->dn_free_txg <= tx->tx_txg; + + /* + * Remove the spill block if we have been explicitly asked to + * remove it, or if the object is being removed. + */ + if (dn->dn_rm_spillblk[txgoff] || freeing_dnode) { + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) + kill_spill = B_TRUE; + dn->dn_rm_spillblk[txgoff] = 0; + } + + if (dn->dn_next_indblkshift[txgoff] != 0) { + ASSERT(dnp->dn_nlevels == 1); + dnp->dn_indblkshift = dn->dn_next_indblkshift[txgoff]; + dn->dn_next_indblkshift[txgoff] = 0; + } + + /* + * Just take the live (open-context) values for checksum and compress. + * Strictly speaking it's a future leak, but nothing bad happens if we + * start using the new checksum or compress algorithm a little early. + */ + dnp->dn_checksum = dn->dn_checksum; + dnp->dn_compress = dn->dn_compress; + + mutex_exit(&dn->dn_mtx); + + if (kill_spill) { + free_blocks(dn, DN_SPILL_BLKPTR(dn->dn_phys), 1, tx); + mutex_enter(&dn->dn_mtx); + dnp->dn_flags &= ~DNODE_FLAG_SPILL_BLKPTR; + mutex_exit(&dn->dn_mtx); + } + + /* process all the "freed" ranges in the file */ + if (dn->dn_free_ranges[txgoff] != NULL) { + dnode_sync_free_range_arg_t dsfra; + dsfra.dsfra_dnode = dn; + dsfra.dsfra_tx = tx; + dsfra.dsfra_free_indirects = freeing_dnode; + if (freeing_dnode) { + ASSERT(range_tree_contains(dn->dn_free_ranges[txgoff], + 0, dn->dn_maxblkid + 1)); + } + mutex_enter(&dn->dn_mtx); + range_tree_vacate(dn->dn_free_ranges[txgoff], + dnode_sync_free_range, &dsfra); + range_tree_destroy(dn->dn_free_ranges[txgoff]); + dn->dn_free_ranges[txgoff] = NULL; + mutex_exit(&dn->dn_mtx); + } + + if (freeing_dnode) { + dn->dn_objset->os_freed_dnodes++; + dnode_sync_free(dn, tx); + return; + } + + if (dn->dn_num_slots > DNODE_MIN_SLOTS) { + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; + mutex_enter(&ds->ds_lock); + ds->ds_feature_activation[SPA_FEATURE_LARGE_DNODE] = + (void *)B_TRUE; + mutex_exit(&ds->ds_lock); + } + + if (dn->dn_next_nlevels[txgoff]) { + dnode_increase_indirection(dn, tx); + dn->dn_next_nlevels[txgoff] = 0; + } + + /* + * This must be done after dnode_sync_free_range() + * and dnode_increase_indirection(). See dnode_new_blkid() + * for an explanation of the high bit being set. 
+ */ + if (dn->dn_next_maxblkid[txgoff]) { + mutex_enter(&dn->dn_mtx); + dnp->dn_maxblkid = + dn->dn_next_maxblkid[txgoff] & ~DMU_NEXT_MAXBLKID_SET; + dn->dn_next_maxblkid[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + + if (dn->dn_next_nblkptr[txgoff]) { + /* this should only happen on a realloc */ + ASSERT(dn->dn_allocated_txg == tx->tx_txg); + if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); +#ifdef ZFS_DEBUG + } else { + int i; + ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); + /* the blkptrs we are losing better be unallocated */ + for (i = 0; i < dnp->dn_nblkptr; i++) { + if (i >= dn->dn_next_nblkptr[txgoff]) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); + } +#endif + } + mutex_enter(&dn->dn_mtx); + dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; + dn->dn_next_nblkptr[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + + dbuf_sync_list(list, dn->dn_phys->dn_nlevels - 1, tx); + + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + ASSERT3P(list_head(list), ==, NULL); + dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); + } + + /* + * Although we have dropped our reference to the dnode, it + * can't be evicted until it's written, and we haven't yet + * initiated the IO for the dnode's dbuf. + */ +} diff --git a/module/zfs/dsl_bookmark.c b/module/zfs/dsl_bookmark.c new file mode 100644 index 000000000000..16bf2c4414a8 --- /dev/null +++ b/module/zfs/dsl_bookmark.c @@ -0,0 +1,1742 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + * Copyright 2019, 2020 by Christian Schwarz. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_prop.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_destroy.h> +#include <sys/dmu_impl.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> +#include <sys/zap.h> +#include <sys/zfeature.h> +#include <sys/spa.h> +#include <sys/dsl_bookmark.h> +#include <zfs_namecheck.h> +#include <sys/dmu_send.h> + +static int +dsl_bookmark_hold_ds(dsl_pool_t *dp, const char *fullname, + dsl_dataset_t **dsp, void *tag, char **shortnamep) +{ + char buf[ZFS_MAX_DATASET_NAME_LEN]; + char *hashp; + + if (strlen(fullname) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + hashp = strchr(fullname, '#'); + if (hashp == NULL) + return (SET_ERROR(EINVAL)); + + *shortnamep = hashp + 1; + if (zfs_component_namecheck(*shortnamep, NULL, NULL)) + return (SET_ERROR(EINVAL)); + (void) strlcpy(buf, fullname, hashp - fullname + 1); + return (dsl_dataset_hold(dp, buf, tag, dsp)); +} + +/* + * When reading BOOKMARK_V1 bookmarks, the BOOKMARK_V2 fields are guaranteed + * to be zeroed. + * + * Returns ESRCH if the bookmark is not found. + * Note, we need to use the ZAP rather than the AVL to look up bookmarks + * by name, because only the ZAP honors the casesensitivity setting.
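+ * + * Bookmark names have the form "pool/fs#bookmark"; the shortname + * used for the ZAP lookup is the component after the '#', as split + * off by dsl_bookmark_hold_ds() above.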
+ */ +int +dsl_bookmark_lookup_impl(dsl_dataset_t *ds, const char *shortname, + zfs_bookmark_phys_t *bmark_phys) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; + matchtype_t mt = 0; + int err; + + if (bmark_zapobj == 0) + return (SET_ERROR(ESRCH)); + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + /* + * Zero out the bookmark in case the one stored on disk + * is in an older, shorter format. + */ + bzero(bmark_phys, sizeof (*bmark_phys)); + + err = zap_lookup_norm(mos, bmark_zapobj, shortname, sizeof (uint64_t), + sizeof (*bmark_phys) / sizeof (uint64_t), bmark_phys, mt, NULL, 0, + NULL); + + return (err == ENOENT ? SET_ERROR(ESRCH) : err); +} + +/* + * If later_ds is non-NULL, this will return EXDEV if the specified bookmark + * does not represent an earlier point in later_ds's timeline. However, + * bmp will still be filled in if we return EXDEV. + * + * Returns ENOENT if the dataset containing the bookmark does not exist. + * Returns ESRCH if the dataset exists but the bookmark was not found in it. + */ +int +dsl_bookmark_lookup(dsl_pool_t *dp, const char *fullname, + dsl_dataset_t *later_ds, zfs_bookmark_phys_t *bmp) +{ + char *shortname; + dsl_dataset_t *ds; + int error; + + error = dsl_bookmark_hold_ds(dp, fullname, &ds, FTAG, &shortname); + if (error != 0) + return (error); + + error = dsl_bookmark_lookup_impl(ds, shortname, bmp); + if (error == 0 && later_ds != NULL) { + if (!dsl_dataset_is_before(later_ds, ds, bmp->zbm_creation_txg)) + error = SET_ERROR(EXDEV); + } + dsl_dataset_rele(ds, FTAG); + return (error); +} + +/* + * Validates that + * - bmark is a full dataset path of a bookmark (bookmark_namecheck) + * - source is a full path of a snapshot or bookmark + * ({bookmark,snapshot}_namecheck) + * + * Returns 0 if valid, -1 otherwise. + */ +static int +dsl_bookmark_create_nvl_validate_pair(const char *bmark, const char *source) +{ + if (bookmark_namecheck(bmark, NULL, NULL) != 0) + return (-1); + + int is_bmark, is_snap; + is_bmark = bookmark_namecheck(source, NULL, NULL) == 0; + is_snap = snapshot_namecheck(source, NULL, NULL) == 0; + if (!is_bmark && !is_snap) + return (-1); + + return (0); +} + +/* + * Check that the given nvlist corresponds to the following schema: + * { newbookmark -> source, ... } + * where + * - each pair passes dsl_bookmark_create_nvl_validate_pair + * - all newbookmarks are in the same pool + * - all newbookmarks have unique names + * + * Note that this function only validates the above schema. Callers must ensure + * that the bookmarks can be created, e.g. that sources exist. + * + * Returns 0 if the nvlist adheres to the above schema. + * Returns -1 if it doesn't.
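+ * + * Example of a conforming nvlist (names are illustrative): + * { "rpool/fs#b1" -> "rpool/fs@snap1", + * "rpool/fs#b2" -> "rpool/fs#b1" }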
+ */ +int +dsl_bookmark_create_nvl_validate(nvlist_t *bmarks) +{ + char *first; + size_t first_len; + + first = NULL; + for (nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(bmarks, pair)) { + + char *bmark = nvpair_name(pair); + char *source; + + /* list structure: values must be snapshots XOR bookmarks */ + if (nvpair_value_string(pair, &source) != 0) + return (-1); + if (dsl_bookmark_create_nvl_validate_pair(bmark, source) != 0) + return (-1); + + /* same pool check */ + if (first == NULL) { + char *cp = strpbrk(bmark, "/#"); + if (cp == NULL) + return (-1); + first = bmark; + first_len = cp - bmark; + } + if (strncmp(first, bmark, first_len) != 0) + return (-1); + switch (*(bmark + first_len)) { + case '/': /* fallthrough */ + case '#': + break; + default: + return (-1); + } + + /* unique newbookmark names; todo: O(n^2) */ + for (nvpair_t *pair2 = nvlist_next_nvpair(bmarks, pair); + pair2 != NULL; pair2 = nvlist_next_nvpair(bmarks, pair2)) { + if (strcmp(nvpair_name(pair), nvpair_name(pair2)) == 0) + return (-1); + } + + } + return (0); +} + +/* + * expects that newbm and source have been validated using + * dsl_bookmark_create_nvl_validate_pair + */ +static int +dsl_bookmark_create_check_impl(dsl_pool_t *dp, + const char *newbm, const char *source) +{ + ASSERT0(dsl_bookmark_create_nvl_validate_pair(newbm, source)); + /* defer source namecheck until we know it's a snapshot or bookmark */ + + int error; + dsl_dataset_t *newbm_ds; + char *newbm_short; + zfs_bookmark_phys_t bmark_phys; + + error = dsl_bookmark_hold_ds(dp, newbm, &newbm_ds, FTAG, &newbm_short); + if (error != 0) + return (error); + + /* Verify that the new bookmark does not already exist */ + error = dsl_bookmark_lookup_impl(newbm_ds, newbm_short, &bmark_phys); + switch (error) { + case ESRCH: + /* happy path: new bmark doesn't exist, proceed after switch */ + error = 0; + break; + case 0: + error = SET_ERROR(EEXIST); + goto eholdnewbmds; + default: + /* dsl_bookmark_lookup_impl already did SET_ERROR */ + goto eholdnewbmds; + } + + /* error is retval of the following if-cascade */ + if (strchr(source, '@') != NULL) { + dsl_dataset_t *source_snap_ds; + ASSERT3S(snapshot_namecheck(source, NULL, NULL), ==, 0); + error = dsl_dataset_hold(dp, source, FTAG, &source_snap_ds); + if (error == 0) { + VERIFY(source_snap_ds->ds_is_snapshot); + /* + * Verify that source snapshot is an earlier point in + * newbm_ds's timeline (source may be newbm_ds's origin) + */ + if (!dsl_dataset_is_before(newbm_ds, source_snap_ds, 0)) + error = SET_ERROR( + ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR); + dsl_dataset_rele(source_snap_ds, FTAG); + } + } else if (strchr(source, '#') != NULL) { + zfs_bookmark_phys_t source_phys; + ASSERT3S(bookmark_namecheck(source, NULL, NULL), ==, 0); + /* + * Source must exist and be an earlier point in newbm_ds's + * timeline (newbm_ds's origin may be a snap of source's ds) + */ + error = dsl_bookmark_lookup(dp, source, newbm_ds, &source_phys); + switch (error) { + case 0: + break; /* happy path */ + case EXDEV: + error = SET_ERROR(ZFS_ERR_BOOKMARK_SOURCE_NOT_ANCESTOR); + break; + default: + /* dsl_bookmark_lookup already did SET_ERROR */ + break; + } + } else { + /* + * dsl_bookmark_create_nvl_validate validates that source is + * either snapshot or bookmark + */ + panic("unreachable code: %s", source); + } + +eholdnewbmds: + dsl_dataset_rele(newbm_ds, FTAG); + return (error); +} + +int +dsl_bookmark_create_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca =
arg; + int rv = 0; + int schema_err = 0; + ASSERT3P(dbca, !=, NULL); + ASSERT3P(dbca->dbca_bmarks, !=, NULL); + /* dbca->dbca_errors is allowed to be NULL */ + + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + + if (dsl_bookmark_create_nvl_validate(dbca->dbca_bmarks) != 0) + rv = schema_err = SET_ERROR(EINVAL); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + char *new = nvpair_name(pair); + + int error = schema_err; + if (error == 0) { + char *source = fnvpair_value_string(pair); + error = dsl_bookmark_create_check_impl(dp, new, source); + if (error != 0) + error = SET_ERROR(error); + } + + if (error != 0) { + rv = error; + if (dbca->dbca_errors != NULL) + fnvlist_add_int32(dbca->dbca_errors, + new, error); + } + } + + return (rv); +} + +static dsl_bookmark_node_t * +dsl_bookmark_node_alloc(char *shortname) +{ + dsl_bookmark_node_t *dbn = kmem_alloc(sizeof (*dbn), KM_SLEEP); + dbn->dbn_name = spa_strdup(shortname); + dbn->dbn_dirty = B_FALSE; + mutex_init(&dbn->dbn_lock, NULL, MUTEX_DEFAULT, NULL); + return (dbn); +} + +/* + * Set the fields in the zfs_bookmark_phys_t based on the specified snapshot. + */ +static void +dsl_bookmark_set_phys(zfs_bookmark_phys_t *zbm, dsl_dataset_t *snap) +{ + spa_t *spa = dsl_dataset_get_spa(snap); + objset_t *mos = spa_get_dsl(spa)->dp_meta_objset; + dsl_dataset_phys_t *dsp = dsl_dataset_phys(snap); + zbm->zbm_guid = dsp->ds_guid; + zbm->zbm_creation_txg = dsp->ds_creation_txg; + zbm->zbm_creation_time = dsp->ds_creation_time; + zbm->zbm_redaction_obj = 0; + + /* + * If the dataset is encrypted create a larger bookmark to + * accommodate the IVset guid. The IVset guid was added + * after the encryption feature to prevent a problem with + * raw sends. If we encounter an encrypted dataset without + * an IVset guid we fall back to a normal bookmark. + */ + if (snap->ds_dir->dd_crypto_obj != 0 && + spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { + (void) zap_lookup(mos, snap->ds_object, + DS_FIELD_IVSET_GUID, sizeof (uint64_t), 1, + &zbm->zbm_ivset_guid); + } + + if (spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_WRITTEN)) { + zbm->zbm_flags = ZBM_FLAG_SNAPSHOT_EXISTS | ZBM_FLAG_HAS_FBN; + zbm->zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm->zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm->zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; + + dsl_dataset_t *nextds; + VERIFY0(dsl_dataset_hold_obj(snap->ds_dir->dd_pool, + dsp->ds_next_snap_obj, FTAG, &nextds)); + dsl_deadlist_space(&nextds->ds_deadlist, + &zbm->zbm_referenced_freed_before_next_snap, + &zbm->zbm_compressed_freed_before_next_snap, + &zbm->zbm_uncompressed_freed_before_next_snap); + dsl_dataset_rele(nextds, FTAG); + } else { + bzero(&zbm->zbm_flags, + sizeof (zfs_bookmark_phys_t) - + offsetof(zfs_bookmark_phys_t, zbm_flags)); + } +} + +/* + * Add dsl_bookmark_node_t `dbn` to the given dataset and increment appropriate + * SPA feature counters. 
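+ * + * Concretely: SPA_FEATURE_BOOKMARKS is incremented when the dataset gains + * its first bookmark (i.e. when the bookmark ZAP is created below), and + * SPA_FEATURE_BOOKMARK_V2 is incremented for each bookmark that must be + * written with the larger v2 on-disk size.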
+ */ +void +dsl_bookmark_node_add(dsl_dataset_t *hds, dsl_bookmark_node_t *dbn, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + if (hds->ds_bookmarks_obj == 0) { + hds->ds_bookmarks_obj = zap_create_norm(mos, + U8_TEXTPREP_TOUPPER, DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, + tx); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + + dsl_dataset_zapify(hds, tx); + VERIFY0(zap_add(mos, hds->ds_object, + DS_FIELD_BOOKMARK_NAMES, + sizeof (hds->ds_bookmarks_obj), 1, + &hds->ds_bookmarks_obj, tx)); + } + + avl_add(&hds->ds_bookmarks, dbn); + + /* + * To maintain backwards compatibility with software that doesn't + * understand SPA_FEATURE_BOOKMARK_V2, we need to use the smallest + * possible bookmark size. + */ + uint64_t bookmark_phys_size = BOOKMARK_PHYS_SIZE_V1; + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2) && + (dbn->dbn_phys.zbm_ivset_guid != 0 || dbn->dbn_phys.zbm_flags & + ZBM_FLAG_HAS_FBN || dbn->dbn_phys.zbm_redaction_obj != 0)) { + bookmark_phys_size = BOOKMARK_PHYS_SIZE_V2; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2, tx); + } + + __attribute__((unused)) zfs_bookmark_phys_t zero_phys = { 0 }; + ASSERT0(bcmp(((char *)&dbn->dbn_phys) + bookmark_phys_size, + &zero_phys, sizeof (zfs_bookmark_phys_t) - bookmark_phys_size)); + + VERIFY0(zap_add(mos, hds->ds_bookmarks_obj, dbn->dbn_name, + sizeof (uint64_t), bookmark_phys_size / sizeof (uint64_t), + &dbn->dbn_phys, tx)); +} + +/* + * If redaction_list is non-null, we create a redacted bookmark and redaction + * list, and store the object number of the redaction list in redact_obj. + */ +static void +dsl_bookmark_create_sync_impl_snap(const char *bookmark, const char *snapshot, + dmu_tx_t *tx, uint64_t num_redact_snaps, uint64_t *redact_snaps, void *tag, + redaction_list_t **redaction_list) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *snapds, *bmark_fs; + char *shortname; + boolean_t bookmark_redacted; + uint64_t *dsredactsnaps; + uint64_t dsnumsnaps; + + VERIFY0(dsl_dataset_hold(dp, snapshot, FTAG, &snapds)); + VERIFY0(dsl_bookmark_hold_ds(dp, bookmark, &bmark_fs, FTAG, + &shortname)); + + dsl_bookmark_node_t *dbn = dsl_bookmark_node_alloc(shortname); + dsl_bookmark_set_phys(&dbn->dbn_phys, snapds); + + bookmark_redacted = dsl_dataset_get_uint64_array_feature(snapds, + SPA_FEATURE_REDACTED_DATASETS, &dsnumsnaps, &dsredactsnaps); + if (redaction_list != NULL || bookmark_redacted) { + redaction_list_t *local_rl; + if (bookmark_redacted) { + redact_snaps = dsredactsnaps; + num_redact_snaps = dsnumsnaps; + } + dbn->dbn_phys.zbm_redaction_obj = dmu_object_alloc(mos, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (redaction_list_phys_t) + + num_redact_snaps * sizeof (uint64_t), tx); + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + + VERIFY0(dsl_redaction_list_hold_obj(dp, + dbn->dbn_phys.zbm_redaction_obj, tag, &local_rl)); + dsl_redaction_list_long_hold(dp, local_rl, tag); + + ASSERT3U((local_rl)->rl_dbuf->db_size, >=, + sizeof (redaction_list_phys_t) + num_redact_snaps * + sizeof (uint64_t)); + dmu_buf_will_dirty(local_rl->rl_dbuf, tx); + bcopy(redact_snaps, local_rl->rl_phys->rlp_snaps, + sizeof (uint64_t) * num_redact_snaps); + local_rl->rl_phys->rlp_num_snaps = num_redact_snaps; + if (bookmark_redacted) { + ASSERT3P(redaction_list, ==, NULL); + local_rl->rl_phys->rlp_last_blkid = UINT64_MAX; + local_rl->rl_phys->rlp_last_object = UINT64_MAX; + 
dsl_redaction_list_long_rele(local_rl, tag); + dsl_redaction_list_rele(local_rl, tag); + } else { + *redaction_list = local_rl; + } + } + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + + dsl_bookmark_node_add(bmark_fs, dbn, tx); + + spa_history_log_internal_ds(bmark_fs, "bookmark", tx, + "name=%s creation_txg=%llu target_snap=%llu redact_obj=%llu", + shortname, (longlong_t)dbn->dbn_phys.zbm_creation_txg, + (longlong_t)snapds->ds_object, + (longlong_t)dbn->dbn_phys.zbm_redaction_obj); + + dsl_dataset_rele(bmark_fs, FTAG); + dsl_dataset_rele(snapds, FTAG); +} + + +static void +dsl_bookmark_create_sync_impl_book( + const char *new_name, const char *source_name, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *bmark_fs_source, *bmark_fs_new; + char *source_shortname, *new_shortname; + zfs_bookmark_phys_t source_phys; + + VERIFY0(dsl_bookmark_hold_ds(dp, source_name, &bmark_fs_source, FTAG, + &source_shortname)); + VERIFY0(dsl_bookmark_hold_ds(dp, new_name, &bmark_fs_new, FTAG, + &new_shortname)); + + /* + * create a copy of the source bookmark by copying most of its members + * + * Caveat: bookmarking a redaction bookmark yields a normal bookmark + * ----------------------------------------------------------------- + * Reasoning: + * - The zbm_redaction_obj would be referred to by both source and new + * bookmark, but would be destroyed once either source or new is + * destroyed, resulting in use-after-free of the referred object. + * - User expectation when issuing the `zfs bookmark` command is that + * a normal bookmark of the source is created + * + * Design Alternatives For Full Redaction Bookmark Copying: + * - reference-count the redaction object => would require on-disk + * format change for existing redaction objects + * - Copy the redaction object => cannot be done in syncing context + * because the redaction object might be too large + */ + + VERIFY0(dsl_bookmark_lookup_impl(bmark_fs_source, source_shortname, + &source_phys)); + dsl_bookmark_node_t *new_dbn = dsl_bookmark_node_alloc(new_shortname); + + memcpy(&new_dbn->dbn_phys, &source_phys, sizeof (source_phys)); + new_dbn->dbn_phys.zbm_redaction_obj = 0; + + /* update feature counters */ + if (new_dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_incr(dp->dp_spa, + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + /* no need for redaction bookmark counter; nulled zbm_redaction_obj */ + /* dsl_bookmark_node_add bumps bookmarks and v2-bookmarks counter */ + + /* + * write new bookmark + * + * Note that dsl_bookmark_lookup_impl guarantees that, if source is a + * v1 bookmark, the v2-only fields are zeroed. + * And dsl_bookmark_node_add writes back a v1-sized bookmark if + * v2 bookmarks are disabled and/or v2-only fields are zeroed.
+ * => bookmark copying works on pre-bookmark-v2 pools + */ + dsl_bookmark_node_add(bmark_fs_new, new_dbn, tx); + + spa_history_log_internal_ds(bmark_fs_source, "bookmark", tx, + "name=%s creation_txg=%llu source_guid=%llu", + new_shortname, (longlong_t)new_dbn->dbn_phys.zbm_creation_txg, + (longlong_t)source_phys.zbm_guid); + + dsl_dataset_rele(bmark_fs_source, FTAG); + dsl_dataset_rele(bmark_fs_new, FTAG); +} + +void +dsl_bookmark_create_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_arg_t *dbca = arg; + + ASSERT(spa_feature_is_enabled(dmu_tx_pool(tx)->dp_spa, + SPA_FEATURE_BOOKMARKS)); + + for (nvpair_t *pair = nvlist_next_nvpair(dbca->dbca_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbca->dbca_bmarks, pair)) { + + char *new = nvpair_name(pair); + char *source = fnvpair_value_string(pair); + + if (strchr(source, '@') != NULL) { + dsl_bookmark_create_sync_impl_snap(new, source, tx, + 0, NULL, NULL, NULL); + } else if (strchr(source, '#') != NULL) { + dsl_bookmark_create_sync_impl_book(new, source, tx); + } else { + panic("unreachable code"); + } + + } +} + +/* + * The bookmarks must all be in the same pool. + */ +int +dsl_bookmark_create(nvlist_t *bmarks, nvlist_t *errors) +{ + nvpair_t *pair; + dsl_bookmark_create_arg_t dbca; + + pair = nvlist_next_nvpair(bmarks, NULL); + if (pair == NULL) + return (0); + + dbca.dbca_bmarks = bmarks; + dbca.dbca_errors = errors; + + return (dsl_sync_task(nvpair_name(pair), dsl_bookmark_create_check, + dsl_bookmark_create_sync, &dbca, + fnvlist_num_pairs(bmarks), ZFS_SPACE_CHECK_NORMAL)); +} + +static int +dsl_bookmark_create_redacted_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + if (!spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_REDACTION_BOOKMARKS)) + return (SET_ERROR(ENOTSUP)); + /* + * If the list of redact snaps will not fit in the bonus buffer with + * the furthest reached object and offset, fail. + */ + if (dbcra->dbcra_numsnaps > (dmu_bonus_max() - + sizeof (redaction_list_phys_t)) / sizeof (uint64_t)) + return (SET_ERROR(E2BIG)); + + if (dsl_bookmark_create_nvl_validate_pair( + dbcra->dbcra_bmark, dbcra->dbcra_snap) != 0) + return (SET_ERROR(EINVAL)); + + rv = dsl_bookmark_create_check_impl(dp, + dbcra->dbcra_bmark, dbcra->dbcra_snap); + return (rv); +} + +static void +dsl_bookmark_create_redacted_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_create_redacted_arg_t *dbcra = arg; + dsl_bookmark_create_sync_impl_snap(dbcra->dbcra_bmark, + dbcra->dbcra_snap, tx, dbcra->dbcra_numsnaps, dbcra->dbcra_snaps, + dbcra->dbcra_tag, dbcra->dbcra_rl); +} + +int +dsl_bookmark_create_redacted(const char *bookmark, const char *snapshot, + uint64_t numsnaps, uint64_t *snapguids, void *tag, redaction_list_t **rl) +{ + dsl_bookmark_create_redacted_arg_t dbcra; + + dbcra.dbcra_bmark = bookmark; + dbcra.dbcra_snap = snapshot; + dbcra.dbcra_rl = rl; + dbcra.dbcra_numsnaps = numsnaps; + dbcra.dbcra_snaps = snapguids; + dbcra.dbcra_tag = tag; + + return (dsl_sync_task(bookmark, dsl_bookmark_create_redacted_check, + dsl_bookmark_create_redacted_sync, &dbcra, 5, + ZFS_SPACE_CHECK_NORMAL)); +} + +/* + * Retrieve the list of properties given in the 'props' nvlist for a bookmark. + * If 'props' is NULL, retrieves all properties. 
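+ * + * For example, a caller interested only in creation information could pass + * a 'props' nvlist holding just the keys "guid" and "createtxg"; + * 'out_props' would then come back with only those two values.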
+ */ +static void +dsl_bookmark_fetch_props(dsl_pool_t *dp, zfs_bookmark_phys_t *bmark_phys, + nvlist_t *props, nvlist_t *out_props) +{ + ASSERT3P(dp, !=, NULL); + ASSERT3P(bmark_phys, !=, NULL); + ASSERT3P(out_props, !=, NULL); + ASSERT(RRW_LOCK_HELD(&dp->dp_config_rwlock)); + + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_GUID, bmark_phys->zbm_guid); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATETXG))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATETXG, bmark_phys->zbm_creation_txg); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_CREATION))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_CREATION, bmark_phys->zbm_creation_time); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_IVSET_GUID))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_IVSET_GUID, bmark_phys->zbm_ivset_guid); + } + if (bmark_phys->zbm_flags & ZBM_FLAG_HAS_FBN) { + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFERENCED))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_REFERENCED, + bmark_phys->zbm_referenced_bytes_refd); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_LOGICALREFERENCED))) { + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_LOGICALREFERENCED, + bmark_phys->zbm_uncompressed_bytes_refd); + } + if (props == NULL || nvlist_exists(props, + zfs_prop_to_name(ZFS_PROP_REFRATIO))) { + uint64_t ratio = + bmark_phys->zbm_compressed_bytes_refd == 0 ? 100 : + bmark_phys->zbm_uncompressed_bytes_refd * 100 / + bmark_phys->zbm_compressed_bytes_refd; + dsl_prop_nvlist_add_uint64(out_props, + ZFS_PROP_REFRATIO, ratio); + } + } + + if ((props == NULL || nvlist_exists(props, "redact_snaps") || + nvlist_exists(props, "redact_complete")) && + bmark_phys->zbm_redaction_obj != 0) { + redaction_list_t *rl; + int err = dsl_redaction_list_hold_obj(dp, + bmark_phys->zbm_redaction_obj, FTAG, &rl); + if (err == 0) { + if (nvlist_exists(props, "redact_snaps")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_uint64_array(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_snaps, + rl->rl_phys->rlp_num_snaps); + fnvlist_add_nvlist(out_props, "redact_snaps", + nvl); + nvlist_free(nvl); + } + if (nvlist_exists(props, "redact_complete")) { + nvlist_t *nvl; + nvl = fnvlist_alloc(); + fnvlist_add_boolean_value(nvl, ZPROP_VALUE, + rl->rl_phys->rlp_last_blkid == UINT64_MAX && + rl->rl_phys->rlp_last_object == UINT64_MAX); + fnvlist_add_nvlist(out_props, "redact_complete", + nvl); + nvlist_free(nvl); + } + dsl_redaction_list_rele(rl, FTAG); + } + } +} + +int +dsl_get_bookmarks_impl(dsl_dataset_t *ds, nvlist_t *props, nvlist_t *outnvl) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + + if (dsl_dataset_is_snapshot(ds)) + return (SET_ERROR(EINVAL)); + + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + nvlist_t *out_props = fnvlist_alloc(); + + dsl_bookmark_fetch_props(dp, &dbn->dbn_phys, props, out_props); + + fnvlist_add_nvlist(outnvl, dbn->dbn_name, out_props); + fnvlist_free(out_props); + } + return (0); +} + +/* + * Comparison func for ds_bookmarks AVL tree. We sort the bookmarks by + * their TXG, then by their FBN-ness. The "FBN-ness" component ensures + * that all bookmarks at the same TXG that HAS_FBN are adjacent, which + * dsl_bookmark_destroy_sync_impl() depends on. 
Note that there may be + * multiple bookmarks at the same TXG (with the same FBN-ness). In this + * case we differentiate them by an arbitrary metric (in this case, + * their names). + */ +static int +dsl_bookmark_compare(const void *l, const void *r) +{ + const dsl_bookmark_node_t *ldbn = l; + const dsl_bookmark_node_t *rdbn = r; + + int64_t cmp = TREE_CMP(ldbn->dbn_phys.zbm_creation_txg, + rdbn->dbn_phys.zbm_creation_txg); + if (likely(cmp)) + return (cmp); + cmp = TREE_CMP((ldbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN), + (rdbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + if (likely(cmp)) + return (cmp); + cmp = strcmp(ldbn->dbn_name, rdbn->dbn_name); + return (TREE_ISIGN(cmp)); +} + +/* + * Cache this (head) dataset's bookmarks in the ds_bookmarks AVL tree. + */ +int +dsl_bookmark_init_ds(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + ASSERT(!ds->ds_is_snapshot); + + avl_create(&ds->ds_bookmarks, dsl_bookmark_compare, + sizeof (dsl_bookmark_node_t), + offsetof(dsl_bookmark_node_t, dbn_node)); + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + int zaperr = zap_lookup(mos, ds->ds_object, DS_FIELD_BOOKMARK_NAMES, + sizeof (ds->ds_bookmarks_obj), 1, &ds->ds_bookmarks_obj); + if (zaperr == ENOENT) + return (0); + if (zaperr != 0) + return (zaperr); + + if (ds->ds_bookmarks_obj == 0) + return (0); + + int err = 0; + zap_cursor_t zc; + zap_attribute_t attr; + + for (zap_cursor_init(&zc, mos, ds->ds_bookmarks_obj); + (err = zap_cursor_retrieve(&zc, &attr)) == 0; + zap_cursor_advance(&zc)) { + dsl_bookmark_node_t *dbn = + dsl_bookmark_node_alloc(attr.za_name); + + err = dsl_bookmark_lookup_impl(ds, + dbn->dbn_name, &dbn->dbn_phys); + ASSERT3U(err, !=, ENOENT); + if (err != 0) { + kmem_free(dbn, sizeof (*dbn)); + break; + } + avl_add(&ds->ds_bookmarks, dbn); + } + zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; + return (err); +} + +void +dsl_bookmark_fini_ds(dsl_dataset_t *ds) +{ + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + if (ds->ds_is_snapshot) + return; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != NULL) { + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); +} + +/* + * Retrieve the bookmarks that exist in the specified dataset, and the + * requested properties of each bookmark. + * + * The "props" nvlist specifies which properties are requested. + * See lzc_get_bookmarks() for the list of valid properties. + */ +int +dsl_get_bookmarks(const char *dsname, nvlist_t *props, nvlist_t *outnvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_get_bookmarks_impl(ds, props, outnvl); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + +/* + * Retrieve all properties for a single bookmark in the given dataset. 
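+ * + * Unlike dsl_get_bookmarks() above, this takes no property filter: + * passing a NULL 'props' to dsl_bookmark_fetch_props() below fetches + * every property.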
+ */ +int +dsl_get_bookmark_props(const char *dsname, const char *bmname, nvlist_t *props) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + zfs_bookmark_phys_t bmark_phys = { 0 }; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + err = dsl_bookmark_lookup_impl(ds, bmname, &bmark_phys); + if (err != 0) + goto out; + + dsl_bookmark_fetch_props(dp, &bmark_phys, NULL, props); +out: + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (err); +} + +typedef struct dsl_bookmark_destroy_arg { + nvlist_t *dbda_bmarks; + nvlist_t *dbda_success; + nvlist_t *dbda_errors; +} dsl_bookmark_destroy_arg_t; + +static void +dsl_bookmark_destroy_sync_impl(dsl_dataset_t *ds, const char *name, + dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t bmark_zapobj = ds->ds_bookmarks_obj; + matchtype_t mt = 0; + uint64_t int_size, num_ints; + /* + * 'search' must be zeroed so that dbn_flags (which is used in + * dsl_bookmark_compare()) will be zeroed even if the on-disk + * (in ZAP) bookmark is shorter than offsetof(dbn_flags). + */ + dsl_bookmark_node_t search = { 0 }; + char realname[ZFS_MAX_DATASET_NAME_LEN]; + + /* + * Find the real name of this bookmark, which may be different + * from the given name if the dataset is case-insensitive. Then + * use the real name to find the node in the ds_bookmarks AVL tree. + */ + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + VERIFY0(zap_length(mos, bmark_zapobj, name, &int_size, &num_ints)); + + ASSERT3U(int_size, ==, sizeof (uint64_t)); + + if (num_ints * int_size > BOOKMARK_PHYS_SIZE_V1) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_V2, tx); + } + VERIFY0(zap_lookup_norm(mos, bmark_zapobj, name, sizeof (uint64_t), + num_ints, &search.dbn_phys, mt, realname, sizeof (realname), NULL)); + + search.dbn_name = realname; + dsl_bookmark_node_t *dbn = avl_find(&ds->ds_bookmarks, &search, NULL); + ASSERT(dbn != NULL); + + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + /* + * If this bookmark HAS_FBN, and it is before the most + * recent snapshot, then its TXG is a key in the head's + * deadlist (and all clones' heads' deadlists). If this is + * the last thing keeping the key (i.e. there are no more + * bookmarks with HAS_FBN at this TXG, and there is no + * snapshot at this TXG), then remove the key. + * + * Note that this algorithm depends on ds_bookmarks being + * sorted such that all bookmarks at the same TXG with + * HAS_FBN are adjacent (with no non-HAS_FBN bookmarks + * at the same TXG in between them). If this were not + * the case, we would need to examine *all* bookmarks + * at this TXG, rather than just the adjacent ones. 
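+ * + * As a sketch: with two HAS_FBN bookmarks at txg 10 (both older than the + * most recent snapshot) and no snapshot at txg 10, destroying one of them + * keeps the deadlist key alive, because its AVL neighbor is another + * HAS_FBN bookmark at the same TXG; destroying the remaining one then + * removes the key.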
+ */ + + dsl_bookmark_node_t *dbn_prev = + AVL_PREV(&ds->ds_bookmarks, dbn); + dsl_bookmark_node_t *dbn_next = + AVL_NEXT(&ds->ds_bookmarks, dbn); + + boolean_t more_bookmarks_at_this_txg = + (dbn_prev != NULL && dbn_prev->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_prev->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) || + (dbn_next != NULL && dbn_next->dbn_phys.zbm_creation_txg == + dbn->dbn_phys.zbm_creation_txg && + (dbn_next->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)); + + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS) && + !more_bookmarks_at_this_txg && + dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_prev_snap_txg) { + dsl_dir_remove_clones_key(ds->ds_dir, + dbn->dbn_phys.zbm_creation_txg, tx); + dsl_deadlist_remove_key(&ds->ds_deadlist, + dbn->dbn_phys.zbm_creation_txg, tx); + } + + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } + + avl_remove(&ds->ds_bookmarks, dbn); + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + + VERIFY0(zap_remove_norm(mos, bmark_zapobj, name, mt, tx)); +} + +static int +dsl_bookmark_destroy_check(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_destroy_arg_t *dbda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + int rv = 0; + + ASSERT(nvlist_empty(dbda->dbda_success)); + ASSERT(nvlist_empty(dbda->dbda_errors)); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARKS)) + return (0); + + for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_bmarks, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_bmarks, pair)) { + const char *fullname = nvpair_name(pair); + dsl_dataset_t *ds; + zfs_bookmark_phys_t bm; + int error; + char *shortname; + + error = dsl_bookmark_hold_ds(dp, fullname, &ds, + FTAG, &shortname); + if (error == ENOENT) { + /* ignore it; the bookmark is "already destroyed" */ + continue; + } + if (error == 0) { + error = dsl_bookmark_lookup_impl(ds, shortname, &bm); + dsl_dataset_rele(ds, FTAG); + if (error == ESRCH) { + /* + * ignore it; the bookmark is + * "already destroyed" + */ + continue; + } + if (error == 0 && bm.zbm_redaction_obj != 0) { + redaction_list_t *rl = NULL; + error = dsl_redaction_list_hold_obj(tx->tx_pool, + bm.zbm_redaction_obj, FTAG, &rl); + if (error == ENOENT) { + error = 0; + } else if (error == 0 && + dsl_redaction_list_long_held(rl)) { + error = SET_ERROR(EBUSY); + } + if (rl != NULL) { + dsl_redaction_list_rele(rl, FTAG); + } + } + } + if (error == 0) { + if (dmu_tx_is_syncing(tx)) { + fnvlist_add_boolean(dbda->dbda_success, + fullname); + } + } else { + fnvlist_add_int32(dbda->dbda_errors, fullname, error); + rv = error; + } + } + return (rv); +} + +static void +dsl_bookmark_destroy_sync(void *arg, dmu_tx_t *tx) +{ + dsl_bookmark_destroy_arg_t *dbda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + for (nvpair_t *pair = nvlist_next_nvpair(dbda->dbda_success, NULL); + pair != NULL; pair = nvlist_next_nvpair(dbda->dbda_success, pair)) { + dsl_dataset_t *ds; + char *shortname; + uint64_t zap_cnt; + + VERIFY0(dsl_bookmark_hold_ds(dp, nvpair_name(pair), + &ds, FTAG, &shortname)); + dsl_bookmark_destroy_sync_impl(ds, shortname, tx); + + /* + * If all of this dataset's bookmarks have been destroyed, + * free the zap object and decrement the feature's use count. 
+ */ + VERIFY0(zap_count(mos, ds->ds_bookmarks_obj, &zap_cnt)); + if (zap_cnt == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); + ds->ds_bookmarks_obj = 0; + spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + VERIFY0(zap_remove(mos, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, tx)); + } + + spa_history_log_internal_ds(ds, "remove bookmark", tx, + "name=%s", shortname); + + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The bookmarks must all be in the same pool. + */ +int +dsl_bookmark_destroy(nvlist_t *bmarks, nvlist_t *errors) +{ + int rv; + dsl_bookmark_destroy_arg_t dbda; + nvpair_t *pair = nvlist_next_nvpair(bmarks, NULL); + if (pair == NULL) + return (0); + + dbda.dbda_bmarks = bmarks; + dbda.dbda_errors = errors; + dbda.dbda_success = fnvlist_alloc(); + + rv = dsl_sync_task(nvpair_name(pair), dsl_bookmark_destroy_check, + dsl_bookmark_destroy_sync, &dbda, fnvlist_num_pairs(bmarks), + ZFS_SPACE_CHECK_RESERVED); + fnvlist_free(dbda.dbda_success); + return (rv); +} + +/* Return B_TRUE if there are any long holds on this redaction list. */ +boolean_t +dsl_redaction_list_long_held(redaction_list_t *rl) +{ + return (!zfs_refcount_is_zero(&rl->rl_longholds)); +} + +void +dsl_redaction_list_long_hold(dsl_pool_t *dp, redaction_list_t *rl, void *tag) +{ + ASSERT(dsl_pool_config_held(dp)); + (void) zfs_refcount_add(&rl->rl_longholds, tag); +} + +void +dsl_redaction_list_long_rele(redaction_list_t *rl, void *tag) +{ + (void) zfs_refcount_remove(&rl->rl_longholds, tag); +} + +/* ARGSUSED */ +static void +redaction_list_evict_sync(void *rlu) +{ + redaction_list_t *rl = rlu; + zfs_refcount_destroy(&rl->rl_longholds); + + kmem_free(rl, sizeof (redaction_list_t)); +} + +void +dsl_redaction_list_rele(redaction_list_t *rl, void *tag) +{ + dmu_buf_rele(rl->rl_dbuf, tag); +} + +int +dsl_redaction_list_hold_obj(dsl_pool_t *dp, uint64_t rlobj, void *tag, + redaction_list_t **rlp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + redaction_list_t *rl; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(mos, rlobj, tag, &dbuf); + if (err != 0) + return (err); + + rl = dmu_buf_get_user(dbuf); + if (rl == NULL) { + redaction_list_t *winner = NULL; + + rl = kmem_zalloc(sizeof (redaction_list_t), KM_SLEEP); + rl->rl_dbuf = dbuf; + rl->rl_object = rlobj; + rl->rl_phys = dbuf->db_data; + rl->rl_mos = dp->dp_meta_objset; + zfs_refcount_create(&rl->rl_longholds); + dmu_buf_init_user(&rl->rl_dbu, redaction_list_evict_sync, NULL, + &rl->rl_dbuf); + if ((winner = dmu_buf_set_user_ie(dbuf, &rl->rl_dbu)) != NULL) { + kmem_free(rl, sizeof (*rl)); + rl = winner; + } + } + *rlp = rl; + return (0); +} + +/* + * Snapshot ds is being destroyed. + * + * Adjust the "freed_before_next" of any bookmarks between this snap + * and the previous snapshot, because their "next snapshot" is changing. + * + * If there are any bookmarks with HAS_FBN at this snapshot, remove + * their HAS_SNAP flag (note: there can be at most one snapshot of + * each filesystem at a given txg), and return B_TRUE. In this case + * the caller cannot remove the key in the deadlist at this TXG, because + * the HAS_FBN bookmarks require the key be there. + * + * Returns B_FALSE if there are no bookmarks with HAS_FBN at this + * snapshot's TXG. In this case the caller can remove the key in the + * deadlist at this TXG.
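+ * + * The adjustment below amounts to, for each affected bookmark B: + * B.zbm_*_freed_before_next_snap += + * dsl_deadlist_space_range(next's deadlist, 0, B.zbm_creation_txg) + * i.e. blocks that were live at B's creation and sit on the next + * snapshot's deadlist now count as freed before B's next snapshot.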
+ */ +boolean_t +dsl_bookmark_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + dsl_dataset_t *head, *next; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &head)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &next)); + + /* + * Find the first bookmark that HAS_FBN at or after the + * previous snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at or after the previous + * snapshot, and before this (being deleted) snapshot. Adjust + * their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg < + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) + continue; + /* + * Increase our FBN by the amount of space that was live + * (referenced) at the time of this bookmark (i.e. + * birth <= zbm_creation_txg), and killed between this + * (being deleted) snapshot and the next snapshot (i.e. + * on the next snapshot's deadlist). (Space killed before + * this is already counted in our FBN.) + */ + uint64_t referenced, compressed, uncompressed; + dsl_deadlist_space_range(&next->ds_deadlist, + 0, dbn->dbn_phys.zbm_creation_txg, + &referenced, &compressed, &uncompressed); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + referenced; + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + compressed; + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + uncompressed; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } + dsl_dataset_rele(next, FTAG); + + /* + * There may be several bookmarks at this txg (the TXG of the + * snapshot being deleted). We need to clear the SNAPSHOT_EXISTS + * flag on all of them, and return TRUE if there is at least 1 + * bookmark here with HAS_FBN (thus preventing the deadlist + * key from being removed). + */ + boolean_t rv = B_FALSE; + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(ds)->ds_creation_txg; + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + if (!(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + ASSERT(!(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS)); + continue; + } + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_SNAPSHOT_EXISTS); + dbn->dbn_phys.zbm_flags &= ~ZBM_FLAG_SNAPSHOT_EXISTS; + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + rv = B_TRUE; + } + dsl_dataset_rele(head, FTAG); + return (rv); +} + +/* + * A snapshot is being created of this (head) dataset. + * + * We don't keep keys in the deadlist for the most recent snapshot, or any + * bookmarks at or after it, because there can't be any blocks on the + * deadlist in this range. Now that the most recent snapshot is after + * all bookmarks, we need to add these keys.
Note that the caller always + * adds a key at the previous snapshot, so we only add keys for bookmarks + * after that. + */ +void +dsl_bookmark_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t last_key_added = UINT64_MAX; + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg > + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + uint64_t creation_txg = dbn->dbn_phys.zbm_creation_txg; + ASSERT3U(creation_txg, <=, last_key_added); + /* + * Note, there may be multiple bookmarks at this TXG, + * and we only want to add the key for this TXG once. + * The ds_bookmarks AVL is sorted by TXG, so we will visit + * these bookmarks in sequence. + */ + if ((dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) && + creation_txg != last_key_added) { + dsl_deadlist_add_key(&ds->ds_deadlist, + creation_txg, tx); + last_key_added = creation_txg; + } + } +} + +/* + * The next snapshot of the origin dataset has changed, due to + * promote or clone swap. If there are any bookmarks at this dataset, + * we need to update their zbm_*_freed_before_next_snap to reflect this. + * The head dataset has the relevant bookmarks in ds_bookmarks. + */ +void +dsl_bookmark_next_changed(dsl_dataset_t *head, dsl_dataset_t *origin, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + /* + * Find the first bookmark that HAS_FBN at the origin snapshot. + */ + dsl_bookmark_node_t search = { 0 }; + avl_index_t idx; + search.dbn_phys.zbm_creation_txg = + dsl_dataset_phys(origin)->ds_creation_txg; + search.dbn_phys.zbm_flags = ZBM_FLAG_HAS_FBN; + /* + * The empty-string name can't be in the AVL, and it compares + * before any entries with this TXG. + */ + search.dbn_name = ""; + VERIFY3P(avl_find(&head->ds_bookmarks, &search, &idx), ==, NULL); + dsl_bookmark_node_t *dbn = + avl_nearest(&head->ds_bookmarks, idx, AVL_AFTER); + + /* + * Iterate over all bookmarks that are at the origin txg. + * Adjust their FBN based on their new next snapshot. + */ + for (; dbn != NULL && dbn->dbn_phys.zbm_creation_txg == + dsl_dataset_phys(origin)->ds_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + dbn = AVL_NEXT(&head->ds_bookmarks, dbn)) { + + /* + * Bookmark is at the origin, therefore its + * "next dataset" is changing, so we need + * to reset its FBN by recomputing it in + * dsl_bookmark_set_phys(). + */ + ASSERT3U(dbn->dbn_phys.zbm_guid, ==, + dsl_dataset_phys(origin)->ds_guid); + ASSERT3U(dbn->dbn_phys.zbm_referenced_bytes_refd, ==, + dsl_dataset_phys(origin)->ds_referenced_bytes); + ASSERT(dbn->dbn_phys.zbm_flags & + ZBM_FLAG_SNAPSHOT_EXISTS); + /* + * Save and restore the zbm_redaction_obj, which + * is zeroed by dsl_bookmark_set_phys(). + */ + uint64_t redaction_obj = + dbn->dbn_phys.zbm_redaction_obj; + dsl_bookmark_set_phys(&dbn->dbn_phys, origin); + dbn->dbn_phys.zbm_redaction_obj = redaction_obj; + + VERIFY0(zap_update(dp->dp_meta_objset, head->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + } +} + +/* + * This block is no longer referenced by this (head) dataset. + * + * Adjust the FBN of any bookmarks that reference this block, whose "next" + * is the head dataset. + */ +/* ARGSUSED */ +void +dsl_bookmark_block_killed(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) +{ + /* + * Iterate over bookmarks whose "next" is the head dataset. 
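+ * + * We walk the AVL backwards from the newest bookmark and stop at the + * first one older than ds_prev_snap_txg, since only bookmarks at or + * after the most recent snapshot can have the head as their "next".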
+ */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + /* + * If the block was live (referenced) at the time of this + * bookmark, add its space to the bookmark's FBN. + */ + if (bp->blk_birth <= dbn->dbn_phys.zbm_creation_txg && + (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN)) { + mutex_enter(&dbn->dbn_lock); + dbn->dbn_phys.zbm_referenced_freed_before_next_snap += + bp_get_dsize_sync(dsl_dataset_get_spa(ds), bp); + dbn->dbn_phys.zbm_compressed_freed_before_next_snap += + BP_GET_PSIZE(bp); + dbn->dbn_phys.zbm_uncompressed_freed_before_next_snap += + BP_GET_UCSIZE(bp); + /* + * Changing the ZAP object here would be too + * expensive. Also, we may be called from the zio + * interrupt thread, which can't block on i/o. + * Therefore, we mark this bookmark as dirty and + * modify the ZAP once per txg, in + * dsl_bookmark_sync_done(). + */ + dbn->dbn_dirty = B_TRUE; + mutex_exit(&dbn->dbn_lock); + } + } +} + +void +dsl_bookmark_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + + if (dsl_dataset_is_snapshot(ds)) + return; + + /* + * We only dirty bookmarks that are at or after the most recent + * snapshot. We can't create snapshots between + * dsl_bookmark_block_killed() and dsl_bookmark_sync_done(), so we + * don't need to look at any bookmarks before ds_prev_snap_txg. + */ + for (dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg >= + dsl_dataset_phys(ds)->ds_prev_snap_txg; + dbn = AVL_PREV(&ds->ds_bookmarks, dbn)) { + if (dbn->dbn_dirty) { + /* + * We only dirty nodes with HAS_FBN, therefore + * we can always use the current bookmark struct size. + */ + ASSERT(dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN); + VERIFY0(zap_update(dp->dp_meta_objset, + ds->ds_bookmarks_obj, + dbn->dbn_name, sizeof (uint64_t), + sizeof (zfs_bookmark_phys_t) / sizeof (uint64_t), + &dbn->dbn_phys, tx)); + dbn->dbn_dirty = B_FALSE; + } + } +#ifdef ZFS_DEBUG + for (dsl_bookmark_node_t *dbn = avl_first(&ds->ds_bookmarks); + dbn != NULL; dbn = AVL_NEXT(&ds->ds_bookmarks, dbn)) { + ASSERT(!dbn->dbn_dirty); + } +#endif +} + +/* + * Return the TXG of the most recent bookmark (or 0 if there are no bookmarks). + */ +uint64_t +dsl_bookmark_latest_txg(dsl_dataset_t *ds) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + dsl_bookmark_node_t *dbn = avl_last(&ds->ds_bookmarks); + if (dbn == NULL) + return (0); + return (dbn->dbn_phys.zbm_creation_txg); +} + +static inline unsigned int +redact_block_buf_num_entries(unsigned int size) +{ + return (size / sizeof (redact_block_phys_t)); +} + +/* + * This function calculates the offset of the last entry in the array of + * redact_block_phys_t. If we're reading the redaction list into buffers of + * size bufsize, then for all but the last buffer, the last valid entry in the + * array will be the last entry in the array. However, for the last buffer, any + * amount of it may be filled. Thus, we check to see if we're looking at the + * last buffer in the redaction list, and if so, we return the total number of + * entries modulo the number of entries per buffer. Otherwise, we return the + * number of entries per buffer minus one. 
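+ * + * For example (with hypothetical numbers): given 1000 total entries and + * 128 entries per buffer, buffers 0 through 6 report index 127, while + * the last buffer, number (1000 - 1) / 128 == 7, reports + * (1000 - 1) % 128 == 103.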
+ */ +static inline unsigned int +last_entry(redaction_list_t *rl, unsigned int bufsize, uint64_t bufid) +{ + if (bufid == (rl->rl_phys->rlp_num_entries - 1) / + redact_block_buf_num_entries(bufsize)) { + return ((rl->rl_phys->rlp_num_entries - 1) % + redact_block_buf_num_entries(bufsize)); + } + return (redact_block_buf_num_entries(bufsize) - 1); +} + +/* + * Compare the redact_block_phys_t to the bookmark. If the last block in the + * redact_block_phys_t is before the bookmark, return -1. If the first block in + * the redact_block_phys_t is after the bookmark, return 1. Otherwise, the + * bookmark is inside the range of the redact_block_phys_t, and we return 0. + */ +static int +redact_block_zb_compare(redact_block_phys_t *first, + zbookmark_phys_t *second) +{ + /* + * If the block_phys is for a previous object, or the last block in the + * block_phys is strictly before the block in the bookmark, the + * block_phys is earlier. + */ + if (first->rbp_object < second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid + (redact_block_get_count(first) - 1) < + second->zb_blkid)) { + return (-1); + } + + /* + * If the bookmark is for a previous object, or the block in the + * bookmark is strictly before the first block in the block_phys, the + * bookmark is earlier. + */ + if (first->rbp_object > second->zb_object || + (first->rbp_object == second->zb_object && + first->rbp_blkid > second->zb_blkid)) { + return (1); + } + + return (0); +} + +/* + * Traverse the redaction list in the provided object, and call the callback for + * each entry we find. Don't call the callback for any records before resume. + */ +int +dsl_redaction_list_traverse(redaction_list_t *rl, zbookmark_phys_t *resume, + rl_traverse_callback_t cb, void *arg) +{ + objset_t *mos = rl->rl_mos; + redact_block_phys_t *buf; + unsigned int bufsize = SPA_OLD_MAXBLOCKSIZE; + int err = 0; + + if (rl->rl_phys->rlp_last_object != UINT64_MAX || + rl->rl_phys->rlp_last_blkid != UINT64_MAX) { + /* + * When we finish a send, we update the last object and offset + * to UINT64_MAX. If a send fails partway through, the last + * object and offset will have some other value, indicating how + * far the send got. The redaction list must be complete before + * it can be traversed, so return EINVAL if the last object and + * blkid are not set to UINT64_MAX. + */ + return (SET_ERROR(EINVAL)); + } + + /* + * Binary search for the point to resume from. The goal is to minimize + * the number of disk reads we have to perform. + */ + buf = zio_data_buf_alloc(bufsize); + uint64_t maxbufid = (rl->rl_phys->rlp_num_entries - 1) / + redact_block_buf_num_entries(bufsize); + uint64_t minbufid = 0; + while (resume != NULL && maxbufid - minbufid >= 1) { + ASSERT3U(maxbufid, >, minbufid); + uint64_t midbufid = minbufid + ((maxbufid - minbufid) / 2); + err = dmu_read(mos, rl->rl_object, midbufid * bufsize, bufsize, + buf, DMU_READ_NO_PREFETCH); + if (err != 0) + break; + + int cmp0 = redact_block_zb_compare(&buf[0], resume); + int cmpn = redact_block_zb_compare( + &buf[last_entry(rl, bufsize, maxbufid)], resume); + + /* + * If the first block is before or equal to the resume point, + * and the last one is equal or after, then the resume point is + * in this buf, and we should start here. 
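+ * + * Otherwise: cmp0 > 0 means even the first entry is past the resume + * point, so we search lower; cmpn < 0 means even the last entry is + * before it, so we search higher.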
+ */ + if (cmp0 <= 0 && cmpn >= 0) + break; + + if (cmp0 > 0) + maxbufid = midbufid - 1; + else if (cmpn < 0) + minbufid = midbufid + 1; + else + panic("No progress in binary search for resume point"); + } + + for (uint64_t curidx = minbufid * redact_block_buf_num_entries(bufsize); + err == 0 && curidx < rl->rl_phys->rlp_num_entries; + curidx++) { + /* + * We read in the redaction list one block at a time. Once we + * finish with all the entries in a given block, we read in a + * new one. The predictive prefetcher will take care of any + * prefetching, and this code shouldn't be the bottleneck, so we + * don't need to do manual prefetching. + */ + if (curidx % redact_block_buf_num_entries(bufsize) == 0) { + err = dmu_read(mos, rl->rl_object, curidx * + sizeof (*buf), bufsize, buf, + DMU_READ_PREFETCH); + if (err != 0) + break; + } + redact_block_phys_t *rb = &buf[curidx % + redact_block_buf_num_entries(bufsize)]; + /* + * If resume is non-null, we should either not send the data, or + * null out resume so we don't have to keep doing these + * comparisons. + */ + if (resume != NULL) { + if (redact_block_zb_compare(rb, resume) < 0) { + continue; + } else { + /* + * If the place to resume is in the middle of + * the range described by this + * redact_block_phys, then modify the + * redact_block_phys in memory so we generate + * the right records. + */ + if (resume->zb_object == rb->rbp_object && + resume->zb_blkid > rb->rbp_blkid) { + uint64_t diff = resume->zb_blkid - + rb->rbp_blkid; + rb->rbp_blkid = resume->zb_blkid; + redact_block_set_count(rb, + redact_block_get_count(rb) - diff); + } + resume = NULL; + } + } + + if (cb(rb, arg) != 0) + break; + } + + zio_data_buf_free(buf, bufsize); + return (err); +} diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c new file mode 100644 index 000000000000..33c21e86c4d7 --- /dev/null +++ b/module/zfs/dsl_crypt.c @@ -0,0 +1,2872 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Datto, Inc. All rights reserved. + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * This file's primary purpose is for managing master encryption keys in + * memory and on disk. For more info on how these keys are used, see the + * block comment in zio_crypt.c. + * + * All master keys are stored encrypted on disk in the form of the DSL + * Crypto Key ZAP object. The binary key data in this object is always + * randomly generated and is encrypted with the user's wrapping key. This + * layer of indirection allows the user to change their key without + * needing to re-encrypt the entire dataset. The ZAP also holds on to the + * (non-encrypted) encryption algorithm identifier, IV, and MAC needed to + * safely decrypt the master key. For more info on the user's key see the + * block comment in libzfs_crypto.c + * + * In-memory encryption keys are managed through the spa_keystore. 
The + * keystore consists of 3 AVL trees, which are as follows: + * + * The Wrapping Key Tree: + * The wrapping key (wkey) tree stores the user's keys that are fed into the + * kernel through 'zfs load-key' and related commands. Datasets inherit their + * parent's wkey by default, so these structures are refcounted. The wrapping + * keys remain in memory until they are explicitly unloaded (with + * "zfs unload-key"). Unloading is only possible when no datasets are using + * them (refcount=0). + * + * The DSL Crypto Key Tree: + * The DSL Crypto Keys (DCK) are the in-memory representation of decrypted + * master keys. They are used by the functions in zio_crypt.c to perform + * encryption, decryption, and authentication. Snapshots and clones of a given + * dataset will share a DSL Crypto Key, so they are also refcounted. Once the + * refcount on a key hits zero, it is immediately zeroed out and freed. + * + * The Crypto Key Mapping Tree: + * The zio layer needs to look up master keys by their dataset object id. Since + * the DSL Crypto Keys can belong to multiple datasets, we maintain a tree of + * dsl_key_mapping_t's which essentially just map the dataset object id to its + * appropriate DSL Crypto Key. The management for creating and destroying these + * mappings hooks into the code for owning and disowning datasets. Usually, + * there will only be one active dataset owner, but there are times + * (particularly during dataset creation and destruction) when this may not be + * true or the dataset may not be initialized enough to own. As a result, this + * object is also refcounted. + */ + +/* + * This tunable allows datasets to be raw received even if the stream does + * not include IVset guids or if the guids don't match. This is used as part + * of the resolution for ZPOOL_ERRATA_ZOL_8308_ENCRYPTION.
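+ * + * Leaving it at 0 (the default below) keeps the check enabled; setting + * it to a nonzero value lets affected raw streams be received despite + * missing or mismatched IVset guids.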
+ */ +int zfs_disable_ivset_guid_check = 0; + +static void +dsl_wrapping_key_hold(dsl_wrapping_key_t *wkey, void *tag) +{ + (void) zfs_refcount_add(&wkey->wk_refcnt, tag); +} + +static void +dsl_wrapping_key_rele(dsl_wrapping_key_t *wkey, void *tag) +{ + (void) zfs_refcount_remove(&wkey->wk_refcnt, tag); +} + +static void +dsl_wrapping_key_free(dsl_wrapping_key_t *wkey) +{ + ASSERT0(zfs_refcount_count(&wkey->wk_refcnt)); + + if (wkey->wk_key.ck_data) { + bzero(wkey->wk_key.ck_data, + CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); + kmem_free(wkey->wk_key.ck_data, + CRYPTO_BITS2BYTES(wkey->wk_key.ck_length)); + } + + zfs_refcount_destroy(&wkey->wk_refcnt); + kmem_free(wkey, sizeof (dsl_wrapping_key_t)); +} + +static void +dsl_wrapping_key_create(uint8_t *wkeydata, zfs_keyformat_t keyformat, + uint64_t salt, uint64_t iters, dsl_wrapping_key_t **wkey_out) +{ + dsl_wrapping_key_t *wkey; + + /* allocate the wrapping key */ + wkey = kmem_alloc(sizeof (dsl_wrapping_key_t), KM_SLEEP); + + /* allocate and initialize the underlying crypto key */ + wkey->wk_key.ck_data = kmem_alloc(WRAPPING_KEY_LEN, KM_SLEEP); + + wkey->wk_key.ck_format = CRYPTO_KEY_RAW; + wkey->wk_key.ck_length = CRYPTO_BYTES2BITS(WRAPPING_KEY_LEN); + bcopy(wkeydata, wkey->wk_key.ck_data, WRAPPING_KEY_LEN); + + /* initialize the rest of the struct */ + zfs_refcount_create(&wkey->wk_refcnt); + wkey->wk_keyformat = keyformat; + wkey->wk_salt = salt; + wkey->wk_iters = iters; + + *wkey_out = wkey; +} + +int +dsl_crypto_params_create_nvlist(dcp_cmd_t cmd, nvlist_t *props, + nvlist_t *crypto_args, dsl_crypto_params_t **dcp_out) +{ + int ret; + uint64_t crypt = ZIO_CRYPT_INHERIT; + uint64_t keyformat = ZFS_KEYFORMAT_NONE; + uint64_t salt = 0, iters = 0; + dsl_crypto_params_t *dcp = NULL; + dsl_wrapping_key_t *wkey = NULL; + uint8_t *wkeydata = NULL; + uint_t wkeydata_len = 0; + char *keylocation = NULL; + + dcp = kmem_zalloc(sizeof (dsl_crypto_params_t), KM_SLEEP); + dcp->cp_cmd = cmd; + + /* get relevant arguments from the nvlists */ + if (props != NULL) { + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_ENCRYPTION), &crypt); + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), &keyformat); + (void) nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), &keylocation); + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), &salt); + (void) nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), &iters); + + dcp->cp_crypt = crypt; + } + + if (crypto_args != NULL) { + (void) nvlist_lookup_uint8_array(crypto_args, "wkeydata", + &wkeydata, &wkeydata_len); + } + + /* check for valid command */ + if (dcp->cp_cmd >= DCP_CMD_MAX) { + ret = SET_ERROR(EINVAL); + goto error; + } else { + dcp->cp_cmd = cmd; + } + + /* check for valid crypt */ + if (dcp->cp_crypt >= ZIO_CRYPT_FUNCTIONS) { + ret = SET_ERROR(EINVAL); + goto error; + } else { + dcp->cp_crypt = crypt; + } + + /* check for valid keyformat */ + if (keyformat >= ZFS_KEYFORMAT_FORMATS) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check for a valid keylocation (of any kind) and copy it in */ + if (keylocation != NULL) { + if (!zfs_prop_valid_keylocation(keylocation, B_FALSE)) { + ret = SET_ERROR(EINVAL); + goto error; + } + + dcp->cp_keylocation = spa_strdup(keylocation); + } + + /* check wrapping key length, if given */ + if (wkeydata != NULL && wkeydata_len != WRAPPING_KEY_LEN) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* if the user asked for the default crypt, determine that now */ 
+ if (dcp->cp_crypt == ZIO_CRYPT_ON) + dcp->cp_crypt = ZIO_CRYPT_ON_VALUE; + + /* create the wrapping key from the raw data */ + if (wkeydata != NULL) { + /* create the wrapping key with the verified parameters */ + dsl_wrapping_key_create(wkeydata, keyformat, salt, + iters, &wkey); + dcp->cp_wkey = wkey; + } + + /* + * Remove the encryption properties from the nvlist since they are not + * maintained through the DSL. + */ + (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_ENCRYPTION)); + (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_KEYFORMAT)); + (void) nvlist_remove_all(props, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT)); + (void) nvlist_remove_all(props, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS)); + + *dcp_out = dcp; + + return (0); + +error: + if (wkey != NULL) + dsl_wrapping_key_free(wkey); + if (dcp != NULL) + kmem_free(dcp, sizeof (dsl_crypto_params_t)); + + *dcp_out = NULL; + return (ret); +} + +void +dsl_crypto_params_free(dsl_crypto_params_t *dcp, boolean_t unload) +{ + if (dcp == NULL) + return; + + if (dcp->cp_keylocation != NULL) + spa_strfree(dcp->cp_keylocation); + if (unload && dcp->cp_wkey != NULL) + dsl_wrapping_key_free(dcp->cp_wkey); + + kmem_free(dcp, sizeof (dsl_crypto_params_t)); +} + +static int +spa_crypto_key_compare(const void *a, const void *b) +{ + const dsl_crypto_key_t *dcka = a; + const dsl_crypto_key_t *dckb = b; + + if (dcka->dck_obj < dckb->dck_obj) + return (-1); + if (dcka->dck_obj > dckb->dck_obj) + return (1); + return (0); +} + +static int +spa_key_mapping_compare(const void *a, const void *b) +{ + const dsl_key_mapping_t *kma = a; + const dsl_key_mapping_t *kmb = b; + + if (kma->km_dsobj < kmb->km_dsobj) + return (-1); + if (kma->km_dsobj > kmb->km_dsobj) + return (1); + return (0); +} + +static int +spa_wkey_compare(const void *a, const void *b) +{ + const dsl_wrapping_key_t *wka = a; + const dsl_wrapping_key_t *wkb = b; + + if (wka->wk_ddobj < wkb->wk_ddobj) + return (-1); + if (wka->wk_ddobj > wkb->wk_ddobj) + return (1); + return (0); +} + +void +spa_keystore_init(spa_keystore_t *sk) +{ + rw_init(&sk->sk_dk_lock, NULL, RW_DEFAULT, NULL); + rw_init(&sk->sk_km_lock, NULL, RW_DEFAULT, NULL); + rw_init(&sk->sk_wkeys_lock, NULL, RW_DEFAULT, NULL); + avl_create(&sk->sk_dsl_keys, spa_crypto_key_compare, + sizeof (dsl_crypto_key_t), + offsetof(dsl_crypto_key_t, dck_avl_link)); + avl_create(&sk->sk_key_mappings, spa_key_mapping_compare, + sizeof (dsl_key_mapping_t), + offsetof(dsl_key_mapping_t, km_avl_link)); + avl_create(&sk->sk_wkeys, spa_wkey_compare, sizeof (dsl_wrapping_key_t), + offsetof(dsl_wrapping_key_t, wk_avl_link)); +} + +void +spa_keystore_fini(spa_keystore_t *sk) +{ + dsl_wrapping_key_t *wkey; + void *cookie = NULL; + + ASSERT(avl_is_empty(&sk->sk_dsl_keys)); + ASSERT(avl_is_empty(&sk->sk_key_mappings)); + + while ((wkey = avl_destroy_nodes(&sk->sk_wkeys, &cookie)) != NULL) + dsl_wrapping_key_free(wkey); + + avl_destroy(&sk->sk_wkeys); + avl_destroy(&sk->sk_key_mappings); + avl_destroy(&sk->sk_dsl_keys); + rw_destroy(&sk->sk_wkeys_lock); + rw_destroy(&sk->sk_km_lock); + rw_destroy(&sk->sk_dk_lock); +} + +static int +dsl_dir_get_encryption_root_ddobj(dsl_dir_t *dd, uint64_t *rddobj) +{ + if (dd->dd_crypto_obj == 0) + return (SET_ERROR(ENOENT)); + + return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, rddobj)); +} + +static int +dsl_dir_get_encryption_version(dsl_dir_t *dd, uint64_t *version) +{ + *version = 0; + + if (dd->dd_crypto_obj == 0) + return (SET_ERROR(ENOENT)); + + 
/* version 0 is implied by ENOENT */ + (void) zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_VERSION, 8, 1, version); + + return (0); +} + +boolean_t +dsl_dir_incompatible_encryption_version(dsl_dir_t *dd) +{ + int ret; + uint64_t version = 0; + + ret = dsl_dir_get_encryption_version(dd, &version); + if (ret != 0) + return (B_FALSE); + + return (version != ZIO_CRYPT_KEY_CURRENT_VERSION); +} + +static int +spa_keystore_wkey_hold_ddobj_impl(spa_t *spa, uint64_t ddobj, + void *tag, dsl_wrapping_key_t **wkey_out) +{ + int ret; + dsl_wrapping_key_t search_wkey; + dsl_wrapping_key_t *found_wkey; + + ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_wkeys_lock)); + + /* init the search wrapping key */ + search_wkey.wk_ddobj = ddobj; + + /* lookup the wrapping key */ + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &search_wkey, NULL); + if (!found_wkey) { + ret = SET_ERROR(ENOENT); + goto error; + } + + /* increment the refcount */ + dsl_wrapping_key_hold(found_wkey, tag); + + *wkey_out = found_wkey; + return (0); + +error: + *wkey_out = NULL; + return (ret); +} + +static int +spa_keystore_wkey_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, + dsl_wrapping_key_t **wkey_out) +{ + int ret; + dsl_wrapping_key_t *wkey; + uint64_t rddobj; + boolean_t locked = B_FALSE; + + if (!RW_WRITE_HELD(&spa->spa_keystore.sk_wkeys_lock)) { + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_READER); + locked = B_TRUE; + } + + /* get the ddobj that the keylocation property was inherited from */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0) + goto error; + + /* lookup the wkey in the avl tree */ + ret = spa_keystore_wkey_hold_ddobj_impl(spa, rddobj, tag, &wkey); + if (ret != 0) + goto error; + + /* unlock the wkey tree if we locked it */ + if (locked) + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + *wkey_out = wkey; + return (0); + +error: + if (locked) + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + *wkey_out = NULL; + return (ret); +} + +int +dsl_crypto_can_set_keylocation(const char *dsname, const char *keylocation) +{ + int ret = 0; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = NULL; + uint64_t rddobj; + + /* hold the dsl dir */ + ret = dsl_pool_hold(dsname, FTAG, &dp); + if (ret != 0) + goto out; + + ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); + if (ret != 0) { + dd = NULL; + goto out; + } + + /* if dd is not encrypted, the value may only be "none" */ + if (dd->dd_crypto_obj == 0) { + if (strcmp(keylocation, "none") != 0) { + ret = SET_ERROR(EACCES); + goto out; + } + + ret = 0; + goto out; + } + + /* check for a valid keylocation for encrypted datasets */ + if (!zfs_prop_valid_keylocation(keylocation, B_TRUE)) { + ret = SET_ERROR(EINVAL); + goto out; + } + + /* check that this is an encryption root */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0) + goto out; + + if (rddobj != dd->dd_object) { + ret = SET_ERROR(EACCES); + goto out; + } + + dsl_dir_rele(dd, FTAG); + dsl_pool_rele(dp, FTAG); + + return (0); + +out: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + + return (ret); +} + +static void +dsl_crypto_key_free(dsl_crypto_key_t *dck) +{ + ASSERT(zfs_refcount_count(&dck->dck_holds) == 0); + + /* destroy the zio_crypt_key_t */ + zio_crypt_key_destroy(&dck->dck_key); + + /* free the refcount, wrapping key, and lock */ + zfs_refcount_destroy(&dck->dck_holds); + if (dck->dck_wkey) + dsl_wrapping_key_rele(dck->dck_wkey, dck); + + /* free the key */ + kmem_free(dck, sizeof (dsl_crypto_key_t)); +} + 
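+/* + * Drop a hold on a DSL Crypto Key. Holds are taken with a tag (for + * example by dsl_crypto_key_open()) and must be released with the same + * tag; the key is freed when the last hold is dropped. + */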
+static void +dsl_crypto_key_rele(dsl_crypto_key_t *dck, void *tag) +{ + if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) + dsl_crypto_key_free(dck); +} + +static int +dsl_crypto_key_open(objset_t *mos, dsl_wrapping_key_t *wkey, + uint64_t dckobj, void *tag, dsl_crypto_key_t **dck_out) +{ + int ret; + uint64_t crypt = 0, guid = 0, version = 0; + uint8_t raw_keydata[MASTER_KEY_MAX_LEN]; + uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN]; + uint8_t iv[WRAPPING_IV_LEN]; + uint8_t mac[WRAPPING_MAC_LEN]; + dsl_crypto_key_t *dck; + + /* allocate and initialize the key */ + dck = kmem_zalloc(sizeof (dsl_crypto_key_t), KM_SLEEP); + + /* fetch all of the values we need from the ZAP */ + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, + &crypt); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &guid); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, + MASTER_KEY_MAX_LEN, raw_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, + SHA512_HMAC_KEYLEN, raw_hmac_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, + iv); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, + mac); + if (ret != 0) + goto error; + + /* the initial on-disk format for encryption did not have a version */ + (void) zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version); + + /* + * Unwrap the keys. If there is an error return EACCES to indicate + * an authentication failure. + */ + ret = zio_crypt_key_unwrap(&wkey->wk_key, crypt, version, guid, + raw_keydata, raw_hmac_keydata, iv, mac, &dck->dck_key); + if (ret != 0) { + ret = SET_ERROR(EACCES); + goto error; + } + + /* finish initializing the dsl_crypto_key_t */ + zfs_refcount_create(&dck->dck_holds); + dsl_wrapping_key_hold(wkey, dck); + dck->dck_wkey = wkey; + dck->dck_obj = dckobj; + zfs_refcount_add(&dck->dck_holds, tag); + + *dck_out = dck; + return (0); + +error: + if (dck != NULL) { + bzero(dck, sizeof (dsl_crypto_key_t)); + kmem_free(dck, sizeof (dsl_crypto_key_t)); + } + + *dck_out = NULL; + return (ret); +} + +static int +spa_keystore_dsl_key_hold_impl(spa_t *spa, uint64_t dckobj, void *tag, + dsl_crypto_key_t **dck_out) +{ + int ret; + dsl_crypto_key_t search_dck; + dsl_crypto_key_t *found_dck; + + ASSERT(RW_LOCK_HELD(&spa->spa_keystore.sk_dk_lock)); + + /* init the search key */ + search_dck.dck_obj = dckobj; + + /* find the matching key in the keystore */ + found_dck = avl_find(&spa->spa_keystore.sk_dsl_keys, &search_dck, NULL); + if (!found_dck) { + ret = SET_ERROR(ENOENT); + goto error; + } + + /* increment the refcount */ + zfs_refcount_add(&found_dck->dck_holds, tag); + + *dck_out = found_dck; + return (0); + +error: + *dck_out = NULL; + return (ret); +} + +static int +spa_keystore_dsl_key_hold_dd(spa_t *spa, dsl_dir_t *dd, void *tag, + dsl_crypto_key_t **dck_out) +{ + int ret; + avl_index_t where; + dsl_crypto_key_t *dck_io = NULL, *dck_ks = NULL; + dsl_wrapping_key_t *wkey = NULL; + uint64_t dckobj = dd->dd_crypto_obj; + + /* Lookup the key in the tree of currently loaded keys */ + rw_enter(&spa->spa_keystore.sk_dk_lock, RW_READER); + ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks); + rw_exit(&spa->spa_keystore.sk_dk_lock); + if (ret == 0) { + *dck_out = dck_ks; + return (0); + } + + /* Lookup the wrapping key from the keystore */ + ret = spa_keystore_wkey_hold_dd(spa, dd, 
FTAG, &wkey); + if (ret != 0) { + *dck_out = NULL; + return (SET_ERROR(EACCES)); + } + + /* Read the key from disk */ + ret = dsl_crypto_key_open(spa->spa_meta_objset, wkey, dckobj, + tag, &dck_io); + if (ret != 0) { + dsl_wrapping_key_rele(wkey, FTAG); + *dck_out = NULL; + return (ret); + } + + /* + * Add the key to the keystore. It may already exist if it was + * added while performing the read from disk. In this case discard + * it and return the key from the keystore. + */ + rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); + ret = spa_keystore_dsl_key_hold_impl(spa, dckobj, tag, &dck_ks); + if (ret != 0) { + avl_find(&spa->spa_keystore.sk_dsl_keys, dck_io, &where); + avl_insert(&spa->spa_keystore.sk_dsl_keys, dck_io, where); + *dck_out = dck_io; + } else { + dsl_crypto_key_free(dck_io); + *dck_out = dck_ks; + } + + /* Release the wrapping key (the dsl key now has a reference to it) */ + dsl_wrapping_key_rele(wkey, FTAG); + rw_exit(&spa->spa_keystore.sk_dk_lock); + + return (0); +} + +void +spa_keystore_dsl_key_rele(spa_t *spa, dsl_crypto_key_t *dck, void *tag) +{ + rw_enter(&spa->spa_keystore.sk_dk_lock, RW_WRITER); + + if (zfs_refcount_remove(&dck->dck_holds, tag) == 0) { + avl_remove(&spa->spa_keystore.sk_dsl_keys, dck); + dsl_crypto_key_free(dck); + } + + rw_exit(&spa->spa_keystore.sk_dk_lock); +} + +int +spa_keystore_load_wkey_impl(spa_t *spa, dsl_wrapping_key_t *wkey) +{ + int ret; + avl_index_t where; + dsl_wrapping_key_t *found_wkey; + + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + + /* insert the wrapping key into the keystore */ + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where); + if (found_wkey != NULL) { + ret = SET_ERROR(EEXIST); + goto error_unlock; + } + avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where); + + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + return (ret); +} + +int +spa_keystore_load_wkey(const char *dsname, dsl_crypto_params_t *dcp, + boolean_t noop) +{ + int ret; + dsl_dir_t *dd = NULL; + dsl_crypto_key_t *dck = NULL; + dsl_wrapping_key_t *wkey = dcp->cp_wkey; + dsl_pool_t *dp = NULL; + uint64_t rddobj, keyformat, salt, iters; + + /* + * We don't validate the wrapping key's keyformat, salt, or iters + * since they will never be needed after the DCK has been wrapped. 
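+ * Instead, the authoritative values are read back from the DSL Crypto + * Key's ZAP entries below and, after some consistency checks, copied + * into the wkey.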
+ */ + if (dcp->cp_wkey == NULL || + dcp->cp_cmd != DCP_CMD_NONE || + dcp->cp_crypt != ZIO_CRYPT_INHERIT || + dcp->cp_keylocation != NULL) + return (SET_ERROR(EINVAL)); + + ret = dsl_pool_hold(dsname, FTAG, &dp); + if (ret != 0) + goto error; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { + ret = SET_ERROR(ENOTSUP); + goto error; + } + + /* hold the dsl dir */ + ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); + if (ret != 0) { + dd = NULL; + goto error; + } + + /* confirm that dd is the encryption root */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0 || rddobj != dd->dd_object) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* initialize the wkey's ddobj */ + wkey->wk_ddobj = dd->dd_object; + + /* verify that the wkey is correct by opening its dsl key */ + ret = dsl_crypto_key_open(dp->dp_meta_objset, wkey, + dd->dd_crypto_obj, FTAG, &dck); + if (ret != 0) + goto error; + + /* initialize the wkey encryption parameters from the DSL Crypto Key */ + ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &keyformat); + if (ret != 0) + goto error; + + ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt); + if (ret != 0) + goto error; + + ret = zap_lookup(dp->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters); + if (ret != 0) + goto error; + + ASSERT3U(keyformat, <, ZFS_KEYFORMAT_FORMATS); + ASSERT3U(keyformat, !=, ZFS_KEYFORMAT_NONE); + IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, iters != 0); + IMPLY(keyformat == ZFS_KEYFORMAT_PASSPHRASE, salt != 0); + IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, iters == 0); + IMPLY(keyformat != ZFS_KEYFORMAT_PASSPHRASE, salt == 0); + + wkey->wk_keyformat = keyformat; + wkey->wk_salt = salt; + wkey->wk_iters = iters; + + /* + * At this point we have verified the wkey and confirmed that it can + * be used to decrypt a DSL Crypto Key. We can simply clean up and + * return if this is all the user wanted to do.
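+ * (The noop case below exits through the error label purely to share + * its cleanup code; ret is still zero at that point, so the caller + * sees success.)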
+ */ + if (noop) + goto error; + + /* insert the wrapping key into the keystore */ + ret = spa_keystore_load_wkey_impl(dp->dp_spa, wkey); + if (ret != 0) + goto error; + + dsl_crypto_key_rele(dck, FTAG); + dsl_dir_rele(dd, FTAG); + dsl_pool_rele(dp, FTAG); + + /* create any zvols under this ds */ + zvol_create_minors_recursive(dsname); + + return (0); + +error: + if (dck != NULL) + dsl_crypto_key_rele(dck, FTAG); + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + + return (ret); +} + +int +spa_keystore_unload_wkey_impl(spa_t *spa, uint64_t ddobj) +{ + int ret; + dsl_wrapping_key_t search_wkey; + dsl_wrapping_key_t *found_wkey; + + /* init the search wrapping key */ + search_wkey.wk_ddobj = ddobj; + + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + + /* remove the wrapping key from the keystore */ + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, + &search_wkey, NULL); + if (!found_wkey) { + ret = SET_ERROR(EACCES); + goto error_unlock; + } else if (zfs_refcount_count(&found_wkey->wk_refcnt) != 0) { + ret = SET_ERROR(EBUSY); + goto error_unlock; + } + avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey); + + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + /* free the wrapping key */ + dsl_wrapping_key_free(found_wkey); + + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + return (ret); +} + +int +spa_keystore_unload_wkey(const char *dsname) +{ + int ret = 0; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = NULL; + spa_t *spa = NULL; + + ret = spa_open(dsname, &spa, FTAG); + if (ret != 0) + return (ret); + + /* + * Wait for any outstanding txg IO to complete, releasing any + * remaining references on the wkey. + */ + if (spa_mode(spa) != SPA_MODE_READ) + txg_wait_synced(spa->spa_dsl_pool, 0); + + spa_close(spa, FTAG); + + /* hold the dsl dir */ + ret = dsl_pool_hold(dsname, FTAG, &dp); + if (ret != 0) + goto error; + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { + ret = (SET_ERROR(ENOTSUP)); + goto error; + } + + ret = dsl_dir_hold(dp, dsname, FTAG, &dd, NULL); + if (ret != 0) { + dd = NULL; + goto error; + } + + /* unload the wkey */ + ret = spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object); + if (ret != 0) + goto error; + + dsl_dir_rele(dd, FTAG); + dsl_pool_rele(dp, FTAG); + + /* remove any zvols under this ds */ + zvol_remove_minors(dp->dp_spa, dsname, B_TRUE); + + return (0); + +error: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + if (dp != NULL) + dsl_pool_rele(dp, FTAG); + + return (ret); +} + +void +key_mapping_add_ref(dsl_key_mapping_t *km, void *tag) +{ + ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); + zfs_refcount_add(&km->km_refcnt, tag); +} + +/* + * The locking here is a little tricky to ensure we don't cause unnecessary + * performance problems. We want to release a key mapping whenever someone + * decrements the refcount to 0, but freeing the mapping requires removing + * it from the spa_keystore, which requires holding sk_km_lock as a writer. + * Most of the time we don't want to hold this lock as a writer, since the + * same lock is held as a reader for each IO that needs to encrypt / decrypt + * data for any dataset and in practice we will only actually free the + * mapping after unmounting a dataset. + */ +void +key_mapping_rele(spa_t *spa, dsl_key_mapping_t *km, void *tag) +{ + ASSERT3U(zfs_refcount_count(&km->km_refcnt), >=, 1); + + if (zfs_refcount_remove(&km->km_refcnt, tag) != 0) + return; + + /* + * We think we are going to need to free the mapping. 
Add a + * reference to prevent most other releasers from thinking + * this might be their responsibility. This is inherently + * racy, so we will confirm that we are legitimately the + * last holder once we have the sk_km_lock as a writer. + */ + zfs_refcount_add(&km->km_refcnt, FTAG); + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER); + if (zfs_refcount_remove(&km->km_refcnt, FTAG) != 0) { + rw_exit(&spa->spa_keystore.sk_km_lock); + return; + } + + avl_remove(&spa->spa_keystore.sk_key_mappings, km); + rw_exit(&spa->spa_keystore.sk_km_lock); + + spa_keystore_dsl_key_rele(spa, km->km_key, km); + zfs_refcount_destroy(&km->km_refcnt); + kmem_free(km, sizeof (dsl_key_mapping_t)); +} + +int +spa_keystore_create_mapping(spa_t *spa, dsl_dataset_t *ds, void *tag, + dsl_key_mapping_t **km_out) +{ + int ret; + avl_index_t where; + dsl_key_mapping_t *km, *found_km; + boolean_t should_free = B_FALSE; + + /* Allocate and initialize the mapping */ + km = kmem_zalloc(sizeof (dsl_key_mapping_t), KM_SLEEP); + zfs_refcount_create(&km->km_refcnt); + + ret = spa_keystore_dsl_key_hold_dd(spa, ds->ds_dir, km, &km->km_key); + if (ret != 0) { + zfs_refcount_destroy(&km->km_refcnt); + kmem_free(km, sizeof (dsl_key_mapping_t)); + + if (km_out != NULL) + *km_out = NULL; + return (ret); + } + + km->km_dsobj = ds->ds_object; + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_WRITER); + + /* + * If a mapping already exists, simply increment its refcount and + * clean up the one we made. We want to allocate / free outside of + * the lock because this lock is also used by the zio layer to lookup + * key mappings. Otherwise, use the one we created. Normally, there will + * only be one active reference at a time (the objset owner), but there + * are times when there could be multiple async users. + */ + found_km = avl_find(&spa->spa_keystore.sk_key_mappings, km, &where); + if (found_km != NULL) { + should_free = B_TRUE; + zfs_refcount_add(&found_km->km_refcnt, tag); + if (km_out != NULL) + *km_out = found_km; + } else { + zfs_refcount_add(&km->km_refcnt, tag); + avl_insert(&spa->spa_keystore.sk_key_mappings, km, where); + if (km_out != NULL) + *km_out = km; + } + + rw_exit(&spa->spa_keystore.sk_km_lock); + + if (should_free) { + spa_keystore_dsl_key_rele(spa, km->km_key, km); + zfs_refcount_destroy(&km->km_refcnt); + kmem_free(km, sizeof (dsl_key_mapping_t)); + } + + return (0); +} + +int +spa_keystore_remove_mapping(spa_t *spa, uint64_t dsobj, void *tag) +{ + int ret; + dsl_key_mapping_t search_km; + dsl_key_mapping_t *found_km; + + /* init the search key mapping */ + search_km.km_dsobj = dsobj; + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER); + + /* find the matching mapping */ + found_km = avl_find(&spa->spa_keystore.sk_key_mappings, + &search_km, NULL); + if (found_km == NULL) { + ret = SET_ERROR(ENOENT); + goto error_unlock; + } + + rw_exit(&spa->spa_keystore.sk_km_lock); + + key_mapping_rele(spa, found_km, tag); + + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_km_lock); + return (ret); +} + +/* + * This function is primarily used by the zio and arc layer to lookup + * DSL Crypto Keys for encryption. Callers must release the key with + * spa_keystore_dsl_key_rele(). The function may also be called with + * dck_out == NULL and tag == NULL to simply check that a key exists + * without getting a reference to it.
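+ * + * A minimal sketch of the expected call pattern (illustrative only; any + * hold tag may be used in place of FTAG): + * + *	dsl_crypto_key_t *dck; + *	if (spa_keystore_lookup_key(spa, dsobj, FTAG, &dck) == 0) { + *		(use dck->dck_key) + *		spa_keystore_dsl_key_rele(spa, dck, FTAG); + *	} + * + * spa_crypt_get_salt() below follows exactly this pattern.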
+ */ +int +spa_keystore_lookup_key(spa_t *spa, uint64_t dsobj, void *tag, + dsl_crypto_key_t **dck_out) +{ + int ret; + dsl_key_mapping_t search_km; + dsl_key_mapping_t *found_km; + + ASSERT((tag != NULL && dck_out != NULL) || + (tag == NULL && dck_out == NULL)); + + /* init the search key mapping */ + search_km.km_dsobj = dsobj; + + rw_enter(&spa->spa_keystore.sk_km_lock, RW_READER); + + /* find the matching mapping in the tree */ + found_km = avl_find(&spa->spa_keystore.sk_key_mappings, &search_km, + NULL); + if (found_km == NULL) { + ret = SET_ERROR(ENOENT); + goto error_unlock; + } + + if (found_km && tag) + zfs_refcount_add(&found_km->km_key->dck_holds, tag); + + rw_exit(&spa->spa_keystore.sk_km_lock); + + if (dck_out != NULL) + *dck_out = found_km->km_key; + return (0); + +error_unlock: + rw_exit(&spa->spa_keystore.sk_km_lock); + + if (dck_out != NULL) + *dck_out = NULL; + return (ret); +} + +static int +dmu_objset_check_wkey_loaded(dsl_dir_t *dd) +{ + int ret; + dsl_wrapping_key_t *wkey = NULL; + + ret = spa_keystore_wkey_hold_dd(dd->dd_pool->dp_spa, dd, FTAG, + &wkey); + if (ret != 0) + return (SET_ERROR(EACCES)); + + dsl_wrapping_key_rele(wkey, FTAG); + + return (0); +} + +static zfs_keystatus_t +dsl_dataset_get_keystatus(dsl_dir_t *dd) +{ + /* check if this dd has a dsl key */ + if (dd->dd_crypto_obj == 0) + return (ZFS_KEYSTATUS_NONE); + + return (dmu_objset_check_wkey_loaded(dd) == 0 ? + ZFS_KEYSTATUS_AVAILABLE : ZFS_KEYSTATUS_UNAVAILABLE); +} + +static int +dsl_dir_get_crypt(dsl_dir_t *dd, uint64_t *crypt) +{ + if (dd->dd_crypto_obj == 0) { + *crypt = ZIO_CRYPT_OFF; + return (0); + } + + return (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, crypt)); +} + +static void +dsl_crypto_key_sync_impl(objset_t *mos, uint64_t dckobj, uint64_t crypt, + uint64_t root_ddobj, uint64_t guid, uint8_t *iv, uint8_t *mac, + uint8_t *keydata, uint8_t *hmac_keydata, uint64_t keyformat, + uint64_t salt, uint64_t iters, dmu_tx_t *tx) +{ + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, + &crypt, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, + &root_ddobj, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, + &guid, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, + iv, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, + mac, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, + MASTER_KEY_MAX_LEN, keydata, tx)); + VERIFY0(zap_update(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, + SHA512_HMAC_KEYLEN, hmac_keydata, tx)); + VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + 8, 1, &keyformat, tx)); + VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), + 8, 1, &salt, tx)); + VERIFY0(zap_update(mos, dckobj, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), + 8, 1, &iters, tx)); +} + +static void +dsl_crypto_key_sync(dsl_crypto_key_t *dck, dmu_tx_t *tx) +{ + zio_crypt_key_t *key = &dck->dck_key; + dsl_wrapping_key_t *wkey = dck->dck_wkey; + uint8_t keydata[MASTER_KEY_MAX_LEN]; + uint8_t hmac_keydata[SHA512_HMAC_KEYLEN]; + uint8_t iv[WRAPPING_IV_LEN]; + uint8_t mac[WRAPPING_MAC_LEN]; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(key->zk_crypt, <, ZIO_CRYPT_FUNCTIONS); + + /* encrypt and store the keys along with the IV and MAC */ + VERIFY0(zio_crypt_key_wrap(&dck->dck_wkey->wk_key, key, iv, mac, + keydata, hmac_keydata)); + + /* update the ZAP with the obtained values */ +
dsl_crypto_key_sync_impl(tx->tx_pool->dp_meta_objset, dck->dck_obj, + key->zk_crypt, wkey->wk_ddobj, key->zk_guid, iv, mac, keydata, + hmac_keydata, wkey->wk_keyformat, wkey->wk_salt, wkey->wk_iters, + tx); +} + +typedef struct spa_keystore_change_key_args { + const char *skcka_dsname; + dsl_crypto_params_t *skcka_cp; +} spa_keystore_change_key_args_t; + +static int +spa_keystore_change_key_check(void *arg, dmu_tx_t *tx) +{ + int ret; + dsl_dir_t *dd = NULL; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_keystore_change_key_args_t *skcka = arg; + dsl_crypto_params_t *dcp = skcka->skcka_cp; + uint64_t rddobj; + + /* check for the encryption feature */ + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_ENCRYPTION)) { + ret = SET_ERROR(ENOTSUP); + goto error; + } + + /* check for valid key change command */ + if (dcp->cp_cmd != DCP_CMD_NEW_KEY && + dcp->cp_cmd != DCP_CMD_INHERIT && + dcp->cp_cmd != DCP_CMD_FORCE_NEW_KEY && + dcp->cp_cmd != DCP_CMD_FORCE_INHERIT) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* hold the dd */ + ret = dsl_dir_hold(dp, skcka->skcka_dsname, FTAG, &dd, NULL); + if (ret != 0) { + dd = NULL; + goto error; + } + + /* verify that the dataset is encrypted */ + if (dd->dd_crypto_obj == 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* clones must always use their origin's key */ + if (dsl_dir_is_clone(dd)) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* lookup the ddobj we are inheriting the keylocation from */ + ret = dsl_dir_get_encryption_root_ddobj(dd, &rddobj); + if (ret != 0) + goto error; + + /* Handle inheritance */ + if (dcp->cp_cmd == DCP_CMD_INHERIT || + dcp->cp_cmd == DCP_CMD_FORCE_INHERIT) { + /* no other encryption params should be given */ + if (dcp->cp_crypt != ZIO_CRYPT_INHERIT || + dcp->cp_keylocation != NULL || + dcp->cp_wkey != NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that this is an encryption root */ + if (dd->dd_object != rddobj) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that the parent is encrypted */ + if (dd->dd_parent->dd_crypto_obj == 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* if we are rewrapping check that both keys are loaded */ + if (dcp->cp_cmd == DCP_CMD_INHERIT) { + ret = dmu_objset_check_wkey_loaded(dd); + if (ret != 0) + goto error; + + ret = dmu_objset_check_wkey_loaded(dd->dd_parent); + if (ret != 0) + goto error; + } + + dsl_dir_rele(dd, FTAG); + return (0); + } + + /* handle forcing an encryption root without rewrapping */ + if (dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) { + /* no other encryption params should be given */ + if (dcp->cp_crypt != ZIO_CRYPT_INHERIT || + dcp->cp_keylocation != NULL || + dcp->cp_wkey != NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that this is not an encryption root */ + if (dd->dd_object == rddobj) { + ret = SET_ERROR(EINVAL); + goto error; + } + + dsl_dir_rele(dd, FTAG); + return (0); + } + + /* crypt cannot be changed after creation */ + if (dcp->cp_crypt != ZIO_CRYPT_INHERIT) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* we are not inheriting our parent's wkey, so we need one ourselves */ + if (dcp->cp_wkey == NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check for a valid keyformat for the new wrapping key */ + if (dcp->cp_wkey->wk_keyformat >= ZFS_KEYFORMAT_FORMATS || + dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_NONE) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* + * If this dataset is not currently an encryption root, we need a new + * keylocation for this dataset's new
wrapping key. Otherwise we can + * just keep the one we already had. + */ + if (dd->dd_object != rddobj && dcp->cp_keylocation == NULL) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* check that the keylocation is valid if it is not NULL */ + if (dcp->cp_keylocation != NULL && + !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) { + ret = SET_ERROR(EINVAL); + goto error; + } + + /* passphrases require pbkdf2 salt and iters */ + if (dcp->cp_wkey->wk_keyformat == ZFS_KEYFORMAT_PASSPHRASE) { + if (dcp->cp_wkey->wk_salt == 0 || + dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) { + ret = SET_ERROR(EINVAL); + goto error; + } + } else { + if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) { + ret = SET_ERROR(EINVAL); + goto error; + } + } + + /* make sure the dd's wkey is loaded */ + ret = dmu_objset_check_wkey_loaded(dd); + if (ret != 0) + goto error; + + dsl_dir_rele(dd, FTAG); + + return (0); + +error: + if (dd != NULL) + dsl_dir_rele(dd, FTAG); + + return (ret); +} + +/* + * This function deals with the intricacies of updating wrapping + * key references and encryption roots recursively in the event + * of a call to 'zfs change-key' or 'zfs promote'. The 'skip' + * parameter should always be set to B_FALSE when called + * externally. + */ +static void +spa_keystore_change_key_sync_impl(uint64_t rddobj, uint64_t ddobj, + uint64_t new_rddobj, dsl_wrapping_key_t *wkey, boolean_t skip, + dmu_tx_t *tx) +{ + int ret; + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd = NULL; + dsl_crypto_key_t *dck = NULL; + uint64_t curr_rddobj; + + ASSERT(RW_WRITE_HELD(&dp->dp_spa->spa_keystore.sk_wkeys_lock)); + + /* hold the dd */ + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + /* ignore special dsl dirs */ + if (dd->dd_myname[0] == '$' || dd->dd_myname[0] == '%') { + dsl_dir_rele(dd, FTAG); + return; + } + + ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); + VERIFY(ret == 0 || ret == ENOENT); + + /* + * Stop recursing if this dsl dir didn't inherit from the root + * or if this dd is a clone. + */ + if (ret == ENOENT || + (!skip && (curr_rddobj != rddobj || dsl_dir_is_clone(dd)))) { + dsl_dir_rele(dd, FTAG); + return; + } + + /* + * If we don't have a wrapping key just update the dck to reflect the + * new encryption root. Otherwise rewrap the entire dck and re-sync it + * to disk. If skip is set, we don't do any of this work. + */ + if (!skip) { + if (wkey == NULL) { + VERIFY0(zap_update(dp->dp_meta_objset, + dd->dd_crypto_obj, + DSL_CRYPTO_KEY_ROOT_DDOBJ, 8, 1, + &new_rddobj, tx)); + } else { + VERIFY0(spa_keystore_dsl_key_hold_dd(dp->dp_spa, dd, + FTAG, &dck)); + dsl_wrapping_key_hold(wkey, dck); + dsl_wrapping_key_rele(dck->dck_wkey, dck); + dck->dck_wkey = wkey; + dsl_crypto_key_sync(dck, tx); + spa_keystore_dsl_key_rele(dp->dp_spa, dck, FTAG); + } + } + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Recurse into all child dsl dirs. */ + for (zap_cursor_init(zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + spa_keystore_change_key_sync_impl(rddobj, + za->za_first_integer, new_rddobj, wkey, B_FALSE, tx); + } + zap_cursor_fini(zc); + + /* + * Recurse into all dsl dirs of clones. We utilize the skip parameter + * here so that we don't attempt to process the clones directly. This + * is because the clone and its origin share the same dck, which has + * already been updated. 
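+ * Passing skip=B_TRUE leaves the clone's own key untouched while its + * children are still traversed (with skip reset to B_FALSE).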
+ */ + for (zap_cursor_init(zc, dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(dp, za->za_first_integer, + FTAG, &clone)); + spa_keystore_change_key_sync_impl(rddobj, + clone->ds_dir->dd_object, new_rddobj, wkey, B_TRUE, tx); + dsl_dataset_rele(clone, FTAG); + } + zap_cursor_fini(zc); + + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); + + dsl_dir_rele(dd, FTAG); +} + +static void +spa_keystore_change_key_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_t *ds; + avl_index_t where; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + spa_keystore_change_key_args_t *skcka = arg; + dsl_crypto_params_t *dcp = skcka->skcka_cp; + dsl_wrapping_key_t *wkey = NULL, *found_wkey; + dsl_wrapping_key_t wkey_search; + char *keylocation = dcp->cp_keylocation; + uint64_t rddobj, new_rddobj; + + /* create and initialize the wrapping key */ + VERIFY0(dsl_dataset_hold(dp, skcka->skcka_dsname, FTAG, &ds)); + ASSERT(!ds->ds_is_snapshot); + + if (dcp->cp_cmd == DCP_CMD_NEW_KEY || + dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY) { + /* + * We are changing to a new wkey. Set additional properties + * which can be sent along with this ioctl. Note that this + * command can set keylocation even if it can't normally be + * set via 'zfs set' due to a non-local keylocation. + */ + if (dcp->cp_cmd == DCP_CMD_NEW_KEY) { + wkey = dcp->cp_wkey; + wkey->wk_ddobj = ds->ds_dir->dd_object; + } else { + keylocation = "prompt"; + } + + if (keylocation != NULL) { + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, + keylocation, tx); + } + + VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj)); + new_rddobj = ds->ds_dir->dd_object; + } else { + /* + * We are inheriting the parent's wkey. Unset any local + * keylocation and grab a reference to the wkey. + */ + if (dcp->cp_cmd == DCP_CMD_INHERIT) { + VERIFY0(spa_keystore_wkey_hold_dd(spa, + ds->ds_dir->dd_parent, FTAG, &wkey)); + } + + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), ZPROP_SRC_NONE, + 0, 0, NULL, tx); + + rddobj = ds->ds_dir->dd_object; + VERIFY0(dsl_dir_get_encryption_root_ddobj(ds->ds_dir->dd_parent, + &new_rddobj)); + } + + if (wkey == NULL) { + ASSERT(dcp->cp_cmd == DCP_CMD_FORCE_INHERIT || + dcp->cp_cmd == DCP_CMD_FORCE_NEW_KEY); + } + + rw_enter(&spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + + /* recurse through all children and rewrap their keys */ + spa_keystore_change_key_sync_impl(rddobj, ds->ds_dir->dd_object, + new_rddobj, wkey, B_FALSE, tx); + + /* + * All references to the old wkey should be released now (if it + * existed). Replace the wrapping key.
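+ * Any old wkey found in the AVL tree is removed and freed below; for + * DCP_CMD_NEW_KEY the new wkey is then inserted in its place.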
+ */ + wkey_search.wk_ddobj = ds->ds_dir->dd_object; + found_wkey = avl_find(&spa->spa_keystore.sk_wkeys, &wkey_search, NULL); + if (found_wkey != NULL) { + ASSERT0(zfs_refcount_count(&found_wkey->wk_refcnt)); + avl_remove(&spa->spa_keystore.sk_wkeys, found_wkey); + dsl_wrapping_key_free(found_wkey); + } + + if (dcp->cp_cmd == DCP_CMD_NEW_KEY) { + avl_find(&spa->spa_keystore.sk_wkeys, wkey, &where); + avl_insert(&spa->spa_keystore.sk_wkeys, wkey, where); + } else if (wkey != NULL) { + dsl_wrapping_key_rele(wkey, FTAG); + } + + rw_exit(&spa->spa_keystore.sk_wkeys_lock); + + dsl_dataset_rele(ds, FTAG); +} + +int +spa_keystore_change_key(const char *dsname, dsl_crypto_params_t *dcp) +{ + spa_keystore_change_key_args_t skcka; + + /* initialize the args struct */ + skcka.skcka_dsname = dsname; + skcka.skcka_cp = dcp; + + /* + * Perform the actual work in syncing context. The blocks modified + * here could be calculated but it would require holding the pool + * lock and traversing all of the datasets that will have their keys + * changed. + */ + return (dsl_sync_task(dsname, spa_keystore_change_key_check, + spa_keystore_change_key_sync, &skcka, 15, + ZFS_SPACE_CHECK_RESERVED)); +} + +int +dsl_dir_rename_crypt_check(dsl_dir_t *dd, dsl_dir_t *newparent) +{ + int ret; + uint64_t curr_rddobj, parent_rddobj; + + if (dd->dd_crypto_obj == 0) + return (0); + + ret = dsl_dir_get_encryption_root_ddobj(dd, &curr_rddobj); + if (ret != 0) + goto error; + + /* + * if this is not an encryption root, we must make sure we are not + * moving dd to a new encryption root + */ + if (dd->dd_object != curr_rddobj) { + ret = dsl_dir_get_encryption_root_ddobj(newparent, + &parent_rddobj); + if (ret != 0) + goto error; + + if (parent_rddobj != curr_rddobj) { + ret = SET_ERROR(EACCES); + goto error; + } + } + + return (0); + +error: + return (ret); +} + +/* + * Check to make sure that a promote from targetdd to origindd will not require + * any key rewraps. + */ +int +dsl_dataset_promote_crypt_check(dsl_dir_t *target, dsl_dir_t *origin) +{ + int ret; + uint64_t rddobj, op_rddobj, tp_rddobj; + + /* If the dataset is not encrypted we don't need to check anything */ + if (origin->dd_crypto_obj == 0) + return (0); + + /* + * If we are not changing the first origin snapshot in a chain + * the encryption root won't change either. + */ + if (dsl_dir_is_clone(origin)) + return (0); + + /* + * If the origin is the encryption root we will update + * the DSL Crypto Key to point to the target instead. + */ + ret = dsl_dir_get_encryption_root_ddobj(origin, &rddobj); + if (ret != 0) + return (ret); + + if (rddobj == origin->dd_object) + return (0); + + /* + * The origin is inheriting its encryption root from its parent. + * Check that the parent of the target has the same encryption root. 
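+ * If either lookup below reports ENOENT (an unencrypted parent), the + * promote is refused with EACCES rather than attempting a key rewrap.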
+ */ + ret = dsl_dir_get_encryption_root_ddobj(origin->dd_parent, &op_rddobj); + if (ret == ENOENT) + return (SET_ERROR(EACCES)); + else if (ret != 0) + return (ret); + + ret = dsl_dir_get_encryption_root_ddobj(target->dd_parent, &tp_rddobj); + if (ret == ENOENT) + return (SET_ERROR(EACCES)); + else if (ret != 0) + return (ret); + + if (op_rddobj != tp_rddobj) + return (SET_ERROR(EACCES)); + + return (0); +} + +void +dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, + dmu_tx_t *tx) +{ + uint64_t rddobj; + dsl_pool_t *dp = target->dd_pool; + dsl_dataset_t *targetds; + dsl_dataset_t *originds; + char *keylocation; + + if (origin->dd_crypto_obj == 0) + return; + if (dsl_dir_is_clone(origin)) + return; + + VERIFY0(dsl_dir_get_encryption_root_ddobj(origin, &rddobj)); + + if (rddobj != origin->dd_object) + return; + + /* + * If the target is being promoted to the encryption root, update the + * DSL Crypto Key and keylocation to reflect that. We also need to + * update the DSL Crypto Keys of all children inheriting their + * encryption root to point to the new target. Otherwise, the check + * function ensured that the encryption root will not change. + */ + keylocation = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(target)->dd_head_dataset_obj, FTAG, &targetds)); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(origin)->dd_head_dataset_obj, FTAG, &originds)); + + VERIFY0(dsl_prop_get_dd(origin, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + 1, ZAP_MAXVALUELEN, keylocation, NULL, B_FALSE)); + dsl_prop_set_sync_impl(targetds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, keylocation, tx); + dsl_prop_set_sync_impl(originds, zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_NONE, 0, 0, NULL, tx); + + rw_enter(&dp->dp_spa->spa_keystore.sk_wkeys_lock, RW_WRITER); + spa_keystore_change_key_sync_impl(rddobj, origin->dd_object, + target->dd_object, NULL, B_FALSE, tx); + rw_exit(&dp->dp_spa->spa_keystore.sk_wkeys_lock); + + dsl_dataset_rele(targetds, FTAG); + dsl_dataset_rele(originds, FTAG); + kmem_free(keylocation, ZAP_MAXVALUELEN); +} + +int +dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, + boolean_t *will_encrypt) +{ + int ret; + uint64_t pcrypt, crypt; + dsl_crypto_params_t dummy_dcp = { 0 }; + + if (will_encrypt != NULL) + *will_encrypt = B_FALSE; + + if (dcp == NULL) + dcp = &dummy_dcp; + + if (dcp->cp_cmd != DCP_CMD_NONE) + return (SET_ERROR(EINVAL)); + + if (parentdd != NULL) { + ret = dsl_dir_get_crypt(parentdd, &pcrypt); + if (ret != 0) + return (ret); + } else { + pcrypt = ZIO_CRYPT_OFF; + } + + crypt = (dcp->cp_crypt == ZIO_CRYPT_INHERIT) ? pcrypt : dcp->cp_crypt; + + ASSERT3U(pcrypt, !=, ZIO_CRYPT_INHERIT); + ASSERT3U(crypt, !=, ZIO_CRYPT_INHERIT); + + /* check for valid dcp with no encryption (inherited or local) */ + if (crypt == ZIO_CRYPT_OFF) { + /* Must not specify encryption params */ + if (dcp->cp_wkey != NULL || + (dcp->cp_keylocation != NULL && + strcmp(dcp->cp_keylocation, "none") != 0)) + return (SET_ERROR(EINVAL)); + + return (0); + } + + if (will_encrypt != NULL) + *will_encrypt = B_TRUE; + + /* + * We will now definitely be encrypting. Check the feature flag. When + * creating the pool the caller will check this for us since we won't + * technically have the feature activated yet.
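+ * That is why both feature checks below are gated on parentdd != NULL: + * at pool creation time there is no parent dsl_dir to check against.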
+ */ + if (parentdd != NULL && + !spa_feature_is_enabled(parentdd->dd_pool->dp_spa, + SPA_FEATURE_ENCRYPTION)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + /* Check for errata #4 (encryption enabled, bookmark_v2 disabled) */ + if (parentdd != NULL && + !spa_feature_is_enabled(parentdd->dd_pool->dp_spa, + SPA_FEATURE_BOOKMARK_V2)) { + return (SET_ERROR(EOPNOTSUPP)); + } + + /* handle inheritance */ + if (dcp->cp_wkey == NULL) { + ASSERT3P(parentdd, !=, NULL); + + /* key must be fully unspecified */ + if (dcp->cp_keylocation != NULL) + return (SET_ERROR(EINVAL)); + + /* parent must have a key to inherit */ + if (pcrypt == ZIO_CRYPT_OFF) + return (SET_ERROR(EINVAL)); + + /* check for parent key */ + ret = dmu_objset_check_wkey_loaded(parentdd); + if (ret != 0) + return (ret); + + return (0); + } + + /* At this point we should have a fully specified key. Check location */ + if (dcp->cp_keylocation == NULL || + !zfs_prop_valid_keylocation(dcp->cp_keylocation, B_TRUE)) + return (SET_ERROR(EINVAL)); + + /* Must have fully specified keyformat */ + switch (dcp->cp_wkey->wk_keyformat) { + case ZFS_KEYFORMAT_HEX: + case ZFS_KEYFORMAT_RAW: + /* requires no pbkdf2 iters and salt */ + if (dcp->cp_wkey->wk_salt != 0 || dcp->cp_wkey->wk_iters != 0) + return (SET_ERROR(EINVAL)); + break; + case ZFS_KEYFORMAT_PASSPHRASE: + /* requires pbkdf2 iters and salt */ + if (dcp->cp_wkey->wk_salt == 0 || + dcp->cp_wkey->wk_iters < MIN_PBKDF2_ITERATIONS) + return (SET_ERROR(EINVAL)); + break; + case ZFS_KEYFORMAT_NONE: + default: + /* keyformat must be specified and valid */ + return (SET_ERROR(EINVAL)); + } + + return (0); +} + +void +dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, + dsl_dataset_t *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + uint64_t crypt; + dsl_wrapping_key_t *wkey; + + /* clones always use their origin's wrapping key */ + if (dsl_dir_is_clone(dd)) { + ASSERT3P(dcp, ==, NULL); + + /* + * If this is an encrypted clone we just need to clone the + * dck into dd. Zapify the dd so we can do that. + */ + if (origin->ds_dir->dd_crypto_obj != 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_zapify(dd, tx); + + dd->dd_crypto_obj = + dsl_crypto_key_clone_sync(origin->ds_dir, tx); + VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, + &dd->dd_crypto_obj, tx)); + } + + return; + } + + /* + * A NULL dcp at this point indicates this is the origin dataset + * which does not have an objset to encrypt. Raw receives will handle + * encryption separately later. In both cases we can simply return. 
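+ * (DCP_CMD_RAW_RECV marks the raw receive case.)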
+ */ + if (dcp == NULL || dcp->cp_cmd == DCP_CMD_RAW_RECV) + return; + + crypt = dcp->cp_crypt; + wkey = dcp->cp_wkey; + + /* figure out the effective crypt */ + if (crypt == ZIO_CRYPT_INHERIT && dd->dd_parent != NULL) + VERIFY0(dsl_dir_get_crypt(dd->dd_parent, &crypt)); + + /* if we aren't doing encryption just return */ + if (crypt == ZIO_CRYPT_OFF || crypt == ZIO_CRYPT_INHERIT) + return; + + /* zapify the dd so that we can add the crypto key obj to it */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_zapify(dd, tx); + + /* use the new key if given or inherit from the parent */ + if (wkey == NULL) { + VERIFY0(spa_keystore_wkey_hold_dd(dp->dp_spa, + dd->dd_parent, FTAG, &wkey)); + } else { + wkey->wk_ddobj = dd->dd_object; + } + + ASSERT3P(wkey, !=, NULL); + + /* Create or clone the DSL crypto key and activate the feature */ + dd->dd_crypto_obj = dsl_crypto_key_create_sync(crypt, wkey, tx); + VERIFY0(zap_add(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_CRYPTO_KEY_OBJ, sizeof (uint64_t), 1, &dd->dd_crypto_obj, + tx)); + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_ENCRYPTION, + (void *)B_TRUE, tx); + + /* + * If we inherited the wrapping key we release our reference now. + * Otherwise, this is a new key and we need to load it into the + * keystore. + */ + if (dcp->cp_wkey == NULL) { + dsl_wrapping_key_rele(wkey, FTAG); + } else { + VERIFY0(spa_keystore_load_wkey_impl(dp->dp_spa, wkey)); + } +} + +typedef struct dsl_crypto_recv_key_arg { + uint64_t dcrka_dsobj; + uint64_t dcrka_fromobj; + dmu_objset_type_t dcrka_ostype; + nvlist_t *dcrka_nvl; + boolean_t dcrka_do_key; +} dsl_crypto_recv_key_arg_t; + +static int +dsl_crypto_recv_raw_objset_check(dsl_dataset_t *ds, dsl_dataset_t *fromds, + dmu_objset_type_t ostype, nvlist_t *nvl, dmu_tx_t *tx) +{ + int ret; + objset_t *os; + dnode_t *mdn; + uint8_t *buf = NULL; + uint_t len; + uint64_t intval, nlevels, blksz, ibs; + uint64_t nblkptr, maxblkid; + + if (ostype != DMU_OST_ZFS && ostype != DMU_OST_ZVOL) + return (SET_ERROR(EINVAL)); + + /* raw receives also need info about the structure of the metadnode */ + ret = nvlist_lookup_uint64(nvl, "mdn_compress", &intval); + if (ret != 0 || intval >= ZIO_COMPRESS_LEGACY_FUNCTIONS) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, "mdn_checksum", &intval); + if (ret != 0 || intval >= ZIO_CHECKSUM_LEGACY_FUNCTIONS) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, "mdn_nlevels", &nlevels); + if (ret != 0 || nlevels > DN_MAX_LEVELS) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, "mdn_blksz", &blksz); + if (ret != 0 || blksz < SPA_MINBLOCKSIZE) + return (SET_ERROR(EINVAL)); + else if (blksz > spa_maxblocksize(tx->tx_pool->dp_spa)) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, "mdn_indblkshift", &ibs); + if (ret != 0 || ibs < DN_MIN_INDBLKSHIFT || ibs > DN_MAX_INDBLKSHIFT) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, "mdn_nblkptr", &nblkptr); + if (ret != 0 || nblkptr != DN_MAX_NBLKPTR) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, "mdn_maxblkid", &maxblkid); + if (ret != 0) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, "portable_mac", &buf, &len); + if (ret != 0 || len != ZIO_OBJSET_MAC_LEN) + return (SET_ERROR(EINVAL)); + + ret = dmu_objset_from_ds(ds, &os); + if (ret != 0) + return (ret); + + /* + * Useraccounting is not portable and must be done with the keys loaded. + * Therefore, whenever we do any kind of receive the useraccounting + * must not be present. 
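+ * It is regenerated locally on the next mount once the keys are + * loaded, so nothing is lost by this requirement.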
+ */ + ASSERT0(os->os_flags & OBJSET_FLAG_USERACCOUNTING_COMPLETE); + ASSERT0(os->os_flags & OBJSET_FLAG_USEROBJACCOUNTING_COMPLETE); + + mdn = DMU_META_DNODE(os); + + /* + * If we already created the objset, make sure its unchangeable + * properties match the ones received in the nvlist. + */ + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + if (!BP_IS_HOLE(dsl_dataset_get_blkptr(ds)) && + (mdn->dn_nlevels != nlevels || mdn->dn_datablksz != blksz || + mdn->dn_indblkshift != ibs || mdn->dn_nblkptr != nblkptr)) { + rrw_exit(&ds->ds_bp_rwlock, FTAG); + return (SET_ERROR(EINVAL)); + } + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + /* + * Check that the ivset guid of the fromds matches the one from the + * send stream. Older versions of the encryption code did not have + * an ivset guid on the from dataset and did not send one in the + * stream. For these streams we provide the + * zfs_disable_ivset_guid_check tunable to allow these datasets to + * be received with a generated ivset guid. + */ + if (fromds != NULL && !zfs_disable_ivset_guid_check) { + uint64_t from_ivset_guid = 0; + intval = 0; + + (void) nvlist_lookup_uint64(nvl, "from_ivset_guid", &intval); + (void) zap_lookup(tx->tx_pool->dp_meta_objset, + fromds->ds_object, DS_FIELD_IVSET_GUID, + sizeof (from_ivset_guid), 1, &from_ivset_guid); + + if (intval == 0 || from_ivset_guid == 0) + return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISSING)); + + if (intval != from_ivset_guid) + return (SET_ERROR(ZFS_ERR_FROM_IVSET_GUID_MISMATCH)); + } + + return (0); +} + +static void +dsl_crypto_recv_raw_objset_sync(dsl_dataset_t *ds, dmu_objset_type_t ostype, + nvlist_t *nvl, dmu_tx_t *tx) +{ + dsl_pool_t *dp = tx->tx_pool; + objset_t *os; + dnode_t *mdn; + zio_t *zio; + uint8_t *portable_mac; + uint_t len; + uint64_t compress, checksum, nlevels, blksz, ibs, maxblkid; + boolean_t newds = B_FALSE; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + mdn = DMU_META_DNODE(os); + + /* + * Fetch the values we need from the nvlist. "to_ivset_guid" must + * be set on the snapshot, which doesn't exist yet. The receive + * code will take care of this for us later. + */ + compress = fnvlist_lookup_uint64(nvl, "mdn_compress"); + checksum = fnvlist_lookup_uint64(nvl, "mdn_checksum"); + nlevels = fnvlist_lookup_uint64(nvl, "mdn_nlevels"); + blksz = fnvlist_lookup_uint64(nvl, "mdn_blksz"); + ibs = fnvlist_lookup_uint64(nvl, "mdn_indblkshift"); + maxblkid = fnvlist_lookup_uint64(nvl, "mdn_maxblkid"); + VERIFY0(nvlist_lookup_uint8_array(nvl, "portable_mac", &portable_mac, + &len)); + + /* if we haven't created an objset for the ds yet, do that now */ + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + if (BP_IS_HOLE(dsl_dataset_get_blkptr(ds))) { + (void) dmu_objset_create_impl_dnstats(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), ostype, nlevels, blksz, + ibs, tx); + newds = B_TRUE; + } + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + /* + * Set the portable MAC. The local MAC will always be zero since the + * incoming data will all be portable and user accounting will be + * deferred until the next mount. Afterwards, flag the os to be + * written out raw next time. 
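+ * (os_next_write_raw is tracked per txg, hence the TXG_MASK indexing + * below.)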
+ */ + arc_release(os->os_phys_buf, &os->os_phys_buf); + bcopy(portable_mac, os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN); + bzero(os->os_phys->os_local_mac, ZIO_OBJSET_MAC_LEN); + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; + + /* set metadnode compression and checksum */ + mdn->dn_compress = compress; + mdn->dn_checksum = checksum; + + rw_enter(&mdn->dn_struct_rwlock, RW_WRITER); + dnode_new_blkid(mdn, maxblkid, tx, B_FALSE, B_TRUE); + rw_exit(&mdn->dn_struct_rwlock); + + /* + * We can't normally dirty the dataset in syncing context unless + * we are creating a new dataset. In this case, we perform a + * pseudo txg sync here instead. + */ + if (newds) { + dsl_dataset_dirty(ds, tx); + } else { + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dsl_dataset_sync(ds, zio, tx); + VERIFY0(zio_wait(zio)); + + /* dsl_dataset_sync_done will drop this reference. */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + dsl_dataset_sync_done(ds, tx); + } +} + +int +dsl_crypto_recv_raw_key_check(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) +{ + int ret; + objset_t *mos = tx->tx_pool->dp_meta_objset; + uint8_t *buf = NULL; + uint_t len; + uint64_t intval, key_guid, version; + boolean_t is_passphrase = B_FALSE; + + ASSERT(dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT); + + /* + * Read and check all the encryption values from the nvlist. We need + * all of the fields of a DSL Crypto Key, as well as a fully specified + * wrapping key. + */ + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, &intval); + if (ret != 0 || intval >= ZIO_CRYPT_FUNCTIONS || + intval <= ZIO_CRYPT_OFF) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID, &intval); + if (ret != 0) + return (SET_ERROR(EINVAL)); + + /* + * If this is an incremental receive make sure the given key guid + * matches the one we already have. + */ + if (ds->ds_dir->dd_crypto_obj != 0) { + ret = zap_lookup(mos, ds->ds_dir->dd_crypto_obj, + DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid); + if (ret != 0) + return (ret); + if (intval != key_guid) + return (SET_ERROR(EACCES)); + } + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY, + &buf, &len); + if (ret != 0 || len != MASTER_KEY_MAX_LEN) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY, + &buf, &len); + if (ret != 0 || len != SHA512_HMAC_KEYLEN) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &buf, &len); + if (ret != 0 || len != WRAPPING_IV_LEN) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &buf, &len); + if (ret != 0 || len != WRAPPING_MAC_LEN) + return (SET_ERROR(EINVAL)); + + /* + * We don't support receiving old on-disk formats. The version 0 + * implementation protected several fields in an objset that were + * not always portable during a raw receive. As a result, we call + * the old version an on-disk errata #3. + */ + ret = nvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_VERSION, &version); + if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) + return (SET_ERROR(ENOTSUP)); + + ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), + &intval); + if (ret != 0 || intval >= ZFS_KEYFORMAT_FORMATS || + intval == ZFS_KEYFORMAT_NONE) + return (SET_ERROR(EINVAL)); + + is_passphrase = (intval == ZFS_KEYFORMAT_PASSPHRASE); + + /* + * for raw receives we allow any number of pbkdf2iters since there + * won't be a chance for the user to change it. 
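+ * The checks below only enforce consistency: iters and salt must be + * nonzero exactly when the keyformat is a passphrase, which is what + * the (is_passphrase == (intval == 0)) expression rejects.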
+ */ + ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), + &intval); + if (ret != 0 || (is_passphrase == (intval == 0))) + return (SET_ERROR(EINVAL)); + + ret = nvlist_lookup_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), + &intval); + if (ret != 0 || (is_passphrase == (intval == 0))) + return (SET_ERROR(EINVAL)); + + return (0); +} + +void +dsl_crypto_recv_raw_key_sync(dsl_dataset_t *ds, nvlist_t *nvl, dmu_tx_t *tx) +{ + dsl_pool_t *dp = tx->tx_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dir_t *dd = ds->ds_dir; + uint_t len; + uint64_t rddobj, one = 1; + uint8_t *keydata, *hmac_keydata, *iv, *mac; + uint64_t crypt, key_guid, keyformat, iters, salt; + uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION; + char *keylocation = "prompt"; + + /* lookup the values we need to create the DSL Crypto Key */ + crypt = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE); + key_guid = fnvlist_lookup_uint64(nvl, DSL_CRYPTO_KEY_GUID); + keyformat = fnvlist_lookup_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT)); + iters = fnvlist_lookup_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS)); + salt = fnvlist_lookup_uint64(nvl, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY, + &keydata, &len)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY, + &hmac_keydata, &len)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_IV, &iv, &len)); + VERIFY0(nvlist_lookup_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, &mac, &len)); + + /* if this is a new dataset, set up the DSL Crypto Key. */ + if (dd->dd_crypto_obj == 0) { + /* zapify the dsl dir so we can add the key object to it */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_zapify(dd, tx); + + /* create the DSL Crypto Key on disk and activate the feature */ + dd->dd_crypto_obj = zap_create(mos, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + dd->dd_crypto_obj, DSL_CRYPTO_KEY_REFCOUNT, + sizeof (uint64_t), 1, &one, tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + dd->dd_crypto_obj, DSL_CRYPTO_KEY_VERSION, + sizeof (uint64_t), 1, &version, tx)); + + dsl_dataset_activate_feature(ds->ds_object, + SPA_FEATURE_ENCRYPTION, (void *)B_TRUE, tx); + ds->ds_feature[SPA_FEATURE_ENCRYPTION] = (void *)B_TRUE; + + /* save the dd_crypto_obj on disk */ + VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_CRYPTO_KEY_OBJ, + sizeof (uint64_t), 1, &dd->dd_crypto_obj, tx)); + + /* + * Set the keylocation to prompt by default. If keylocation + * has been provided via the properties, this will be overridden + * later.
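+ * Note that rddobj is set to dd's own object in this branch, so a + * dataset created by a raw receive starts out as its own encryption + * root.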
+ */ + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_KEYLOCATION), + ZPROP_SRC_LOCAL, 1, strlen(keylocation) + 1, + keylocation, tx); + + rddobj = dd->dd_object; + } else { + VERIFY0(dsl_dir_get_encryption_root_ddobj(dd, &rddobj)); + } + + /* sync the key data to the ZAP object on disk */ + dsl_crypto_key_sync_impl(mos, dd->dd_crypto_obj, crypt, + rddobj, key_guid, iv, mac, keydata, hmac_keydata, keyformat, salt, + iters, tx); +} + +static int +dsl_crypto_recv_key_check(void *arg, dmu_tx_t *tx) +{ + int ret; + dsl_crypto_recv_key_arg_t *dcrka = arg; + dsl_dataset_t *ds = NULL, *fromds = NULL; + + ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj, + FTAG, &ds); + if (ret != 0) + goto out; + + if (dcrka->dcrka_fromobj != 0) { + ret = dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_fromobj, + FTAG, &fromds); + if (ret != 0) + goto out; + } + + ret = dsl_crypto_recv_raw_objset_check(ds, fromds, + dcrka->dcrka_ostype, dcrka->dcrka_nvl, tx); + if (ret != 0) + goto out; + + /* + * We run this check even if we won't be doing this part of + * the receive now so that we don't make the user wait until + * the receive finishes to fail. + */ + ret = dsl_crypto_recv_raw_key_check(ds, dcrka->dcrka_nvl, tx); + if (ret != 0) + goto out; + +out: + if (ds != NULL) + dsl_dataset_rele(ds, FTAG); + if (fromds != NULL) + dsl_dataset_rele(fromds, FTAG); + return (ret); +} + +static void +dsl_crypto_recv_key_sync(void *arg, dmu_tx_t *tx) +{ + dsl_crypto_recv_key_arg_t *dcrka = arg; + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold_obj(tx->tx_pool, dcrka->dcrka_dsobj, + FTAG, &ds)); + dsl_crypto_recv_raw_objset_sync(ds, dcrka->dcrka_ostype, + dcrka->dcrka_nvl, tx); + if (dcrka->dcrka_do_key) + dsl_crypto_recv_raw_key_sync(ds, dcrka->dcrka_nvl, tx); + dsl_dataset_rele(ds, FTAG); +} + +/* + * This function is used to sync an nvlist representing a DSL Crypto Key and + * the associated encryption parameters. The key will be written exactly as is + * without wrapping it. 
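+ * The work is dispatched as a dsl_sync_task so that the check and sync + * callbacks run in syncing context.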
+ */ +int +dsl_crypto_recv_raw(const char *poolname, uint64_t dsobj, uint64_t fromobj, + dmu_objset_type_t ostype, nvlist_t *nvl, boolean_t do_key) +{ + dsl_crypto_recv_key_arg_t dcrka; + + dcrka.dcrka_dsobj = dsobj; + dcrka.dcrka_fromobj = fromobj; + dcrka.dcrka_ostype = ostype; + dcrka.dcrka_nvl = nvl; + dcrka.dcrka_do_key = do_key; + + return (dsl_sync_task(poolname, dsl_crypto_recv_key_check, + dsl_crypto_recv_key_sync, &dcrka, 1, ZFS_SPACE_CHECK_NORMAL)); +} + +int +dsl_crypto_populate_key_nvlist(objset_t *os, uint64_t from_ivset_guid, + nvlist_t **nvl_out) +{ + int ret; + dsl_dataset_t *ds = os->os_dsl_dataset; + dnode_t *mdn; + uint64_t rddobj; + nvlist_t *nvl = NULL; + uint64_t dckobj = ds->ds_dir->dd_crypto_obj; + dsl_dir_t *rdd = NULL; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t crypt = 0, key_guid = 0, format = 0; + uint64_t iters = 0, salt = 0, version = 0; + uint64_t to_ivset_guid = 0; + uint8_t raw_keydata[MASTER_KEY_MAX_LEN]; + uint8_t raw_hmac_keydata[SHA512_HMAC_KEYLEN]; + uint8_t iv[WRAPPING_IV_LEN]; + uint8_t mac[WRAPPING_MAC_LEN]; + + ASSERT(dckobj != 0); + + mdn = DMU_META_DNODE(os); + + nvl = fnvlist_alloc(); + + /* lookup values from the DSL Crypto Key */ + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_CRYPTO_SUITE, 8, 1, + &crypt); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_GUID, 8, 1, &key_guid); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MASTER_KEY, 1, + MASTER_KEY_MAX_LEN, raw_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_HMAC_KEY, 1, + SHA512_HMAC_KEYLEN, raw_hmac_keydata); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_IV, 1, WRAPPING_IV_LEN, + iv); + if (ret != 0) + goto error; + + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_MAC, 1, WRAPPING_MAC_LEN, + mac); + if (ret != 0) + goto error; + + /* see zfs_disable_ivset_guid_check tunable for errata info */ + ret = zap_lookup(mos, ds->ds_object, DS_FIELD_IVSET_GUID, 8, 1, + &to_ivset_guid); + if (ret != 0) + ASSERT3U(dp->dp_spa->spa_errata, !=, 0); + + /* + * We don't support raw sends of legacy on-disk formats. See the + * comment in dsl_crypto_recv_key_check() for details. + */ + ret = zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_VERSION, 8, 1, &version); + if (ret != 0 || version != ZIO_CRYPT_KEY_CURRENT_VERSION) { + dp->dp_spa->spa_errata = ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + ret = SET_ERROR(ENOTSUP); + goto error; + } + + /* + * Lookup wrapping key properties. An early version of the code did + * not correctly add these values to the wrapping key or the DSL + * Crypto Key on disk for non-encryption roots, so to be safe we + * always take the slightly circuitous route of looking them up from + * the encryption root's key.
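+ * The encryption root is located via the key's ROOT_DDOBJ entry and + * held below while the properties are copied out.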
+ */ + ret = dsl_dir_get_encryption_root_ddobj(ds->ds_dir, &rddobj); + if (ret != 0) + goto error; + + dsl_pool_config_enter(dp, FTAG); + + ret = dsl_dir_hold_obj(dp, rddobj, NULL, FTAG, &rdd); + if (ret != 0) + goto error_unlock; + + ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &format); + if (ret != 0) + goto error_unlock; + + if (format == ZFS_KEYFORMAT_PASSPHRASE) { + ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &iters); + if (ret != 0) + goto error_unlock; + + ret = zap_lookup(dp->dp_meta_objset, rdd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &salt); + if (ret != 0) + goto error_unlock; + } + + dsl_dir_rele(rdd, FTAG); + dsl_pool_config_exit(dp, FTAG); + + fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_CRYPTO_SUITE, crypt); + fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_GUID, key_guid); + fnvlist_add_uint64(nvl, DSL_CRYPTO_KEY_VERSION, version); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MASTER_KEY, + raw_keydata, MASTER_KEY_MAX_LEN)); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_HMAC_KEY, + raw_hmac_keydata, SHA512_HMAC_KEYLEN)); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_IV, iv, + WRAPPING_IV_LEN)); + VERIFY0(nvlist_add_uint8_array(nvl, DSL_CRYPTO_KEY_MAC, mac, + WRAPPING_MAC_LEN)); + VERIFY0(nvlist_add_uint8_array(nvl, "portable_mac", + os->os_phys->os_portable_mac, ZIO_OBJSET_MAC_LEN)); + fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_KEYFORMAT), format); + fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), iters); + fnvlist_add_uint64(nvl, zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), salt); + fnvlist_add_uint64(nvl, "mdn_checksum", mdn->dn_checksum); + fnvlist_add_uint64(nvl, "mdn_compress", mdn->dn_compress); + fnvlist_add_uint64(nvl, "mdn_nlevels", mdn->dn_nlevels); + fnvlist_add_uint64(nvl, "mdn_blksz", mdn->dn_datablksz); + fnvlist_add_uint64(nvl, "mdn_indblkshift", mdn->dn_indblkshift); + fnvlist_add_uint64(nvl, "mdn_nblkptr", mdn->dn_nblkptr); + fnvlist_add_uint64(nvl, "mdn_maxblkid", mdn->dn_maxblkid); + fnvlist_add_uint64(nvl, "to_ivset_guid", to_ivset_guid); + fnvlist_add_uint64(nvl, "from_ivset_guid", from_ivset_guid); + + *nvl_out = nvl; + return (0); + +error_unlock: + dsl_pool_config_exit(dp, FTAG); +error: + if (rdd != NULL) + dsl_dir_rele(rdd, FTAG); + nvlist_free(nvl); + + *nvl_out = NULL; + return (ret); +} + +uint64_t +dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, + dmu_tx_t *tx) +{ + dsl_crypto_key_t dck; + uint64_t version = ZIO_CRYPT_KEY_CURRENT_VERSION; + uint64_t one = 1ULL; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(crypt, <, ZIO_CRYPT_FUNCTIONS); + ASSERT3U(crypt, >, ZIO_CRYPT_OFF); + + /* create the DSL Crypto Key ZAP object */ + dck.dck_obj = zap_create(tx->tx_pool->dp_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + + /* fill in the key (on the stack) and sync it to disk */ + dck.dck_wkey = wkey; + VERIFY0(zio_crypt_key_init(crypt, &dck.dck_key)); + + dsl_crypto_key_sync(&dck, tx); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj, + DSL_CRYPTO_KEY_REFCOUNT, sizeof (uint64_t), 1, &one, tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, dck.dck_obj, + DSL_CRYPTO_KEY_VERSION, sizeof (uint64_t), 1, &version, tx)); + + zio_crypt_key_destroy(&dck.dck_key); + bzero(&dck.dck_key, sizeof (zio_crypt_key_t)); + + return (dck.dck_obj); +} + +uint64_t +dsl_crypto_key_clone_sync(dsl_dir_t *origindd, dmu_tx_t *tx) +{ + objset_t *mos = 
tx->tx_pool->dp_meta_objset; + + ASSERT(dmu_tx_is_syncing(tx)); + + VERIFY0(zap_increment(mos, origindd->dd_crypto_obj, + DSL_CRYPTO_KEY_REFCOUNT, 1, tx)); + + return (origindd->dd_crypto_obj); +} + +void +dsl_crypto_key_destroy_sync(uint64_t dckobj, dmu_tx_t *tx) +{ + objset_t *mos = tx->tx_pool->dp_meta_objset; + uint64_t refcnt; + + /* Decrement the refcount, destroy if this is the last reference */ + VERIFY0(zap_lookup(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT, + sizeof (uint64_t), 1, &refcnt)); + + if (refcnt != 1) { + VERIFY0(zap_increment(mos, dckobj, DSL_CRYPTO_KEY_REFCOUNT, + -1, tx)); + } else { + VERIFY0(zap_destroy(mos, dckobj, tx)); + } +} + +void +dsl_dataset_crypt_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + uint64_t intval; + dsl_dir_t *dd = ds->ds_dir; + dsl_dir_t *enc_root; + char buf[ZFS_MAX_DATASET_NAME_LEN]; + + if (dd->dd_crypto_obj == 0) + return; + + intval = dsl_dataset_get_keystatus(dd); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYSTATUS, intval); + + if (dsl_dir_get_crypt(dd, &intval) == 0) + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_ENCRYPTION, intval); + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + DSL_CRYPTO_KEY_GUID, 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEY_GUID, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_KEYFORMAT), 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_KEYFORMAT, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_SALT), 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_SALT, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, dd->dd_crypto_obj, + zfs_prop_to_name(ZFS_PROP_PBKDF2_ITERS), 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_PBKDF2_ITERS, intval); + } + if (zap_lookup(dd->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_IVSET_GUID, 8, 1, &intval) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_IVSET_GUID, intval); + } + + if (dsl_dir_get_encryption_root_ddobj(dd, &intval) == 0) { + if (dsl_dir_hold_obj(dd->dd_pool, intval, NULL, FTAG, + &enc_root) == 0) { + dsl_dir_name(enc_root, buf); + dsl_dir_rele(enc_root, FTAG); + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_ENCRYPTION_ROOT, buf); + } + } +} + +int +spa_crypt_get_salt(spa_t *spa, uint64_t dsobj, uint8_t *salt) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) + goto error; + + ret = zio_crypt_key_get_salt(&dck->dck_key, salt); + if (ret != 0) + goto error; + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + return (ret); +} + +/* + * Objset blocks are a special case for MAC generation. These blocks have 2 + * 256-bit MACs which are embedded within the block itself, rather than a + * single 128 bit MAC. As a result, this function handles encoding and decoding + * the MACs on its own, unlike other functions in this file. 
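+ *
+ * Layout sketch (field names as used below; surrounding fields elided):
+ *
+ *	objset_phys_t {
+ *		...
+ *		uint8_t os_portable_mac[ZIO_OBJSET_MAC_LEN];	(256 bits)
+ *		uint8_t os_local_mac[ZIO_OBJSET_MAC_LEN];	(256 bits)
+ *		...
+ *	}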
+ */ +int +spa_do_crypt_objset_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, + abd_t *abd, uint_t datalen, boolean_t byteswap) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + void *buf = abd_borrow_buf_copy(abd, datalen); + objset_phys_t *osp = buf; + uint8_t portable_mac[ZIO_OBJSET_MAC_LEN]; + uint8_t local_mac[ZIO_OBJSET_MAC_LEN]; + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) + goto error; + + /* calculate both HMACs */ + ret = zio_crypt_do_objset_hmacs(&dck->dck_key, buf, datalen, + byteswap, portable_mac, local_mac); + if (ret != 0) + goto error; + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + /* if we are generating encode the HMACs in the objset_phys_t */ + if (generate) { + bcopy(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN); + bcopy(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN); + abd_return_buf_copy(abd, buf, datalen); + return (0); + } + + if (bcmp(portable_mac, osp->os_portable_mac, ZIO_OBJSET_MAC_LEN) != 0 || + bcmp(local_mac, osp->os_local_mac, ZIO_OBJSET_MAC_LEN) != 0) { + abd_return_buf(abd, buf, datalen); + return (SET_ERROR(ECKSUM)); + } + + abd_return_buf(abd, buf, datalen); + + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + abd_return_buf(abd, buf, datalen); + return (ret); +} + +int +spa_do_crypt_mac_abd(boolean_t generate, spa_t *spa, uint64_t dsobj, abd_t *abd, + uint_t datalen, uint8_t *mac) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + uint8_t *buf = abd_borrow_buf_copy(abd, datalen); + uint8_t digestbuf[ZIO_DATA_MAC_LEN]; + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, dsobj, FTAG, &dck); + if (ret != 0) + goto error; + + /* perform the hmac */ + ret = zio_crypt_do_hmac(&dck->dck_key, buf, datalen, + digestbuf, ZIO_DATA_MAC_LEN); + if (ret != 0) + goto error; + + abd_return_buf(abd, buf, datalen); + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + /* + * Truncate and fill in mac buffer if we were asked to generate a MAC. + * Otherwise verify that the MAC matched what we expected. + */ + if (generate) { + bcopy(digestbuf, mac, ZIO_DATA_MAC_LEN); + return (0); + } + + if (bcmp(digestbuf, mac, ZIO_DATA_MAC_LEN) != 0) + return (SET_ERROR(ECKSUM)); + + return (0); + +error: + if (dck != NULL) + spa_keystore_dsl_key_rele(spa, dck, FTAG); + abd_return_buf(abd, buf, datalen); + return (ret); +} + +/* + * This function serves as a multiplexer for encryption and decryption of + * all blocks (except the L2ARC). For encryption, it will populate the IV, + * salt, MAC, and cabd (the ciphertext). On decryption it will simply use + * these fields to populate pabd (the plaintext). 
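+ *
+ * Caller's-eye sketch (parameter names as declared below):
+ *
+ *	encrypt: spa_do_crypt_abd(B_TRUE, spa, zb, ot, dedup, bswap,
+ *	    salt, iv, mac, datalen, pabd, cabd, &no_crypt)
+ *	    fills salt, iv, mac and cabd from pabd;
+ *	decrypt: the same call with B_FALSE consumes salt, iv and mac
+ *	    to fill pabd from cabd.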
+ */ +int +spa_do_crypt_abd(boolean_t encrypt, spa_t *spa, const zbookmark_phys_t *zb, + dmu_object_type_t ot, boolean_t dedup, boolean_t bswap, uint8_t *salt, + uint8_t *iv, uint8_t *mac, uint_t datalen, abd_t *pabd, abd_t *cabd, + boolean_t *no_crypt) +{ + int ret; + dsl_crypto_key_t *dck = NULL; + uint8_t *plainbuf = NULL, *cipherbuf = NULL; + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); + + /* look up the key from the spa's keystore */ + ret = spa_keystore_lookup_key(spa, zb->zb_objset, FTAG, &dck); + if (ret != 0) { + ret = SET_ERROR(EACCES); + return (ret); + } + + if (encrypt) { + plainbuf = abd_borrow_buf_copy(pabd, datalen); + cipherbuf = abd_borrow_buf(cabd, datalen); + } else { + plainbuf = abd_borrow_buf(pabd, datalen); + cipherbuf = abd_borrow_buf_copy(cabd, datalen); + } + + /* + * Both encryption and decryption functions need a salt for key + * generation and an IV. When encrypting a non-dedup block, we + * generate the salt and IV randomly to be stored by the caller. Dedup + * blocks perform a (more expensive) HMAC of the plaintext to obtain + * the salt and the IV. ZIL blocks have their salt and IV generated + * at allocation time in zio_alloc_zil(). On decryption, we simply use + * the provided values. + */ + if (encrypt && ot != DMU_OT_INTENT_LOG && !dedup) { + ret = zio_crypt_key_get_salt(&dck->dck_key, salt); + if (ret != 0) + goto error; + + ret = zio_crypt_generate_iv(iv); + if (ret != 0) + goto error; + } else if (encrypt && dedup) { + ret = zio_crypt_generate_iv_salt_dedup(&dck->dck_key, + plainbuf, datalen, iv, salt); + if (ret != 0) + goto error; + } + + /* call lower level function to perform encryption / decryption */ + ret = zio_do_crypt_data(encrypt, &dck->dck_key, ot, bswap, salt, iv, + mac, datalen, plainbuf, cipherbuf, no_crypt); + + /* + * Handle injected decryption faults. Unfortunately, we cannot inject + * faults for dnode blocks because we might trigger the panic in + * dbuf_prepare_encrypted_dnode_leaf(), which exists because syncing + * context is not prepared to handle malicious decryption failures. + */ + if (zio_injection_enabled && !encrypt && ot != DMU_OT_DNODE && ret == 0) + ret = zio_handle_decrypt_injection(spa, zb, ot, ECKSUM); + if (ret != 0) + goto error; + + if (encrypt) { + abd_return_buf(pabd, plainbuf, datalen); + abd_return_buf_copy(cabd, cipherbuf, datalen); + } else { + abd_return_buf_copy(pabd, plainbuf, datalen); + abd_return_buf(cabd, cipherbuf, datalen); + } + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + return (0); + +error: + if (encrypt) { + /* zero out any state we might have changed while encrypting */ + bzero(salt, ZIO_DATA_SALT_LEN); + bzero(iv, ZIO_DATA_IV_LEN); + bzero(mac, ZIO_DATA_MAC_LEN); + abd_return_buf(pabd, plainbuf, datalen); + abd_return_buf_copy(cabd, cipherbuf, datalen); + } else { + abd_return_buf_copy(pabd, plainbuf, datalen); + abd_return_buf(cabd, cipherbuf, datalen); + } + + spa_keystore_dsl_key_rele(spa, dck, FTAG); + + return (ret); +} + +ZFS_MODULE_PARAM(zfs, zfs_, disable_ivset_guid_check, INT, ZMOD_RW, + "Set to allow raw receives without IVset guids"); diff --git a/module/zfs/dsl_dataset.c b/module/zfs/dsl_dataset.c new file mode 100644 index 000000000000..1fcd83db7988 --- /dev/null +++ b/module/zfs/dsl_dataset.c @@ -0,0 +1,5027 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright (c) 2014, Joyent, Inc. All rights reserved.
+ * Copyright (c) 2014 RackTop Systems.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright (c) 2016 Actifio, Inc. All rights reserved.
+ * Copyright 2016, OmniTI Computer Consulting, Inc. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ * Copyright (c) 2020 The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/dmu_objset.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dmu_impl.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/unique.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev.h>
+#include <sys/zfs_znode.h>
+#include <sys/zfs_onexit.h>
+#include <sys/zvol.h>
+#include <sys/dsl_scan.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/policy.h>
+#include <sys/dmu_send.h>
+#include <sys/dmu_recv.h>
+#include <sys/zio_compress.h>
+#include <zfs_fletcher.h>
+#include <sys/zio_checksum.h>
+
+/*
+ * The SPA supports block sizes up to 16MB. However, very large blocks
+ * can have an impact on i/o latency (e.g. tying up a spinning disk for
+ * ~300ms), and also potentially on the memory allocator. Therefore,
+ * we do not allow the recordsize to be set larger than zfs_max_recordsize
+ * (default 1MB). Larger blocks can be created by changing this tunable,
+ * and pools with larger blocks can always be imported and used, regardless
+ * of this setting.
+ */
+int zfs_max_recordsize = 1 * 1024 * 1024;
+int zfs_allow_redacted_dataset_mount = 0;
+
+#define SWITCH64(x, y) \
+	{ \
+		uint64_t __tmp = (x); \
+		(x) = (y); \
+		(y) = __tmp; \
+	}
+
+#define DS_REF_MAX (1ULL << 62)
+
+extern inline dsl_dataset_phys_t *dsl_dataset_phys(dsl_dataset_t *ds);
+
+static void dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds,
+	uint64_t obj, dmu_tx_t *tx);
+static void dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds,
+	dmu_tx_t *tx);
+
+static void unload_zfeature(dsl_dataset_t *ds, spa_feature_t f);
+
+extern int spa_asize_inflation;
+
+static zil_header_t zero_zil;
+
+/*
+ * Figure out how much of this delta should be propagated to the dsl_dir
+ * layer. If there's a refreservation, that space has already been
+ * partially accounted for in our ancestors.
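+ *
+ * Worked example (hypothetical numbers): with ds_reserved = 10 and
+ * ds_unique_bytes = 4, a delta of +3 yields
+ * MAX(7, 10) - MAX(4, 10) = 0, so nothing is charged to the parent;
+ * once unique bytes reach 7, a further +5 yields
+ * MAX(12, 10) - MAX(7, 10) = 2, so only the portion above the
+ * reservation propagates to the dsl_dir.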
+ */ +static int64_t +parent_delta(dsl_dataset_t *ds, int64_t delta) +{ + dsl_dataset_phys_t *ds_phys; + uint64_t old_bytes, new_bytes; + + if (ds->ds_reserved == 0) + return (delta); + + ds_phys = dsl_dataset_phys(ds); + old_bytes = MAX(ds_phys->ds_unique_bytes, ds->ds_reserved); + new_bytes = MAX(ds_phys->ds_unique_bytes + delta, ds->ds_reserved); + + ASSERT3U(ABS((int64_t)(new_bytes - old_bytes)), <=, ABS(delta)); + return (new_bytes - old_bytes); +} + +void +dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + int used = bp_get_dsize_sync(spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + int64_t delta; + spa_feature_t f; + + dprintf_bp(bp, "ds=%p", ds); + + ASSERT(dmu_tx_is_syncing(tx)); + /* It could have been compressed away to nothing */ + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) + return; + ASSERT(BP_GET_TYPE(bp) != DMU_OT_NONE); + ASSERT(DMU_OT_IS_VALID(BP_GET_TYPE(bp))); + if (ds == NULL) { + dsl_pool_mos_diduse_space(tx->tx_pool, + used, compressed, uncompressed); + return; + } + + ASSERT3U(bp->blk_birth, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_lock); + delta = parent_delta(ds, used); + dsl_dataset_phys(ds)->ds_referenced_bytes += used; + dsl_dataset_phys(ds)->ds_compressed_bytes += compressed; + dsl_dataset_phys(ds)->ds_uncompressed_bytes += uncompressed; + dsl_dataset_phys(ds)->ds_unique_bytes += used; + + if (BP_GET_LSIZE(bp) > SPA_OLD_MAXBLOCKSIZE) { + ds->ds_feature_activation[SPA_FEATURE_LARGE_BLOCKS] = + (void *)B_TRUE; + } + + + f = zio_checksum_to_feature(BP_GET_CHECKSUM(bp)); + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } + + f = zio_compress_to_feature(BP_GET_COMPRESS(bp)); + if (f != SPA_FEATURE_NONE) { + ASSERT3S(spa_feature_table[f].fi_type, ==, + ZFEATURE_TYPE_BOOLEAN); + ds->ds_feature_activation[f] = (void *)B_TRUE; + } + + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_allocs, bp); + } + + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, delta, + compressed, uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); +} + +/* + * Called when the specified segment has been remapped, and is thus no + * longer referenced in the head dataset. The vdev must be indirect. + * + * If the segment is referenced by a snapshot, put it on the remap deadlist. + * Otherwise, add this segment to the obsolete spacemap. 
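+ *
+ * Decision sketch (mirrors the code below):
+ *
+ *	birth >  ds_prev_snap_txg: only the head referenced it ->
+ *	    spa_vdev_indirect_mark_obsolete();
+ *	birth <= ds_prev_snap_txg: a snapshot still references it ->
+ *	    dsl_deadlist_insert() on the remap deadlist, via a fake bp
+ *	    carrying only the vdev/offset/asize DVA and the birth txg.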
+ */ +void +dsl_dataset_block_remapped(dsl_dataset_t *ds, uint64_t vdev, uint64_t offset, + uint64_t size, uint64_t birth, dmu_tx_t *tx) +{ + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(birth <= tx->tx_txg); + ASSERT(!ds->ds_is_snapshot); + + if (birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + spa_vdev_indirect_mark_obsolete(spa, vdev, offset, size, tx); + } else { + blkptr_t fakebp; + dva_t *dva = &fakebp.blk_dva[0]; + + ASSERT(ds != NULL); + + mutex_enter(&ds->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds)) { + dsl_dataset_create_remap_deadlist(ds, tx); + } + mutex_exit(&ds->ds_remap_deadlist_lock); + + BP_ZERO(&fakebp); + fakebp.blk_birth = birth; + DVA_SET_VDEV(dva, vdev); + DVA_SET_OFFSET(dva, offset); + DVA_SET_ASIZE(dva, size); + dsl_deadlist_insert(&ds->ds_remap_deadlist, &fakebp, B_FALSE, + tx); + } +} + +int +dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, + boolean_t async) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + int used = bp_get_dsize_sync(spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + + if (BP_IS_HOLE(bp) || BP_IS_REDACTED(bp)) + return (0); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(bp->blk_birth <= tx->tx_txg); + + if (ds == NULL) { + dsl_free(tx->tx_pool, tx->tx_txg, bp); + dsl_pool_mos_diduse_space(tx->tx_pool, + -used, -compressed, -uncompressed); + return (used); + } + ASSERT3P(tx->tx_pool, ==, ds->ds_dir->dd_pool); + + ASSERT(!ds->ds_is_snapshot); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + + /* + * Track block for livelist, but ignore embedded blocks because + * they do not need to be freed. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist) && + bp->blk_birth > ds->ds_dir->dd_origin_txg && + !(BP_IS_EMBEDDED(bp))) { + ASSERT(dsl_dir_is_clone(ds->ds_dir)); + ASSERT(spa_feature_is_enabled(spa, + SPA_FEATURE_LIVELIST)); + bplist_append(&ds->ds_dir->dd_pending_frees, bp); + } + + if (bp->blk_birth > dsl_dataset_phys(ds)->ds_prev_snap_txg) { + int64_t delta; + + dprintf_bp(bp, "freeing ds=%llu", ds->ds_object); + dsl_free(tx->tx_pool, tx->tx_txg, bp); + + mutex_enter(&ds->ds_lock); + ASSERT(dsl_dataset_phys(ds)->ds_unique_bytes >= used || + !DS_UNIQUE_IS_ACCURATE(ds)); + delta = parent_delta(ds, -used); + dsl_dataset_phys(ds)->ds_unique_bytes -= used; + mutex_exit(&ds->ds_lock); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + delta, -compressed, -uncompressed, tx); + dsl_dir_transfer_space(ds->ds_dir, -used - delta, + DD_USED_REFRSRV, DD_USED_HEAD, tx); + } else { + dprintf_bp(bp, "putting on dead list: %s", ""); + if (async) { + /* + * We are here as part of zio's write done callback, + * which means we're a zio interrupt thread. We can't + * call dsl_deadlist_insert() now because it may block + * waiting for I/O. Instead, put bp on the deferred + * queue and let dsl_pool_sync() finish the job. 
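+ *
+ * Hand-off sketch:
+ *
+ *	zio done (interrupt context):
+ *	    bplist_append(&ds->ds_pending_deadlist, bp);  never blocks
+ *	dsl_pool_sync() (syncing context):
+ *	    dsl_deadlist_insert(&ds->ds_deadlist, ...);   may block on I/O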
+ */ + bplist_append(&ds->ds_pending_deadlist, bp); + } else { + dsl_deadlist_insert(&ds->ds_deadlist, bp, B_FALSE, tx); + } + ASSERT3U(ds->ds_prev->ds_object, ==, + dsl_dataset_phys(ds)->ds_prev_snap_obj); + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_num_children > 0); + /* if (bp->blk_birth > prev prev snap txg) prev unique += bs */ + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object && bp->blk_birth > + dsl_dataset_phys(ds->ds_prev)->ds_prev_snap_txg) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + mutex_enter(&ds->ds_prev->ds_lock); + dsl_dataset_phys(ds->ds_prev)->ds_unique_bytes += used; + mutex_exit(&ds->ds_prev->ds_lock); + } + if (bp->blk_birth > ds->ds_dir->dd_origin_txg) { + dsl_dir_transfer_space(ds->ds_dir, used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + } + + dsl_bookmark_block_killed(ds, bp, tx); + + mutex_enter(&ds->ds_lock); + ASSERT3U(dsl_dataset_phys(ds)->ds_referenced_bytes, >=, used); + dsl_dataset_phys(ds)->ds_referenced_bytes -= used; + ASSERT3U(dsl_dataset_phys(ds)->ds_compressed_bytes, >=, compressed); + dsl_dataset_phys(ds)->ds_compressed_bytes -= compressed; + ASSERT3U(dsl_dataset_phys(ds)->ds_uncompressed_bytes, >=, uncompressed); + dsl_dataset_phys(ds)->ds_uncompressed_bytes -= uncompressed; + mutex_exit(&ds->ds_lock); + + return (used); +} + +struct feature_type_uint64_array_arg { + uint64_t length; + uint64_t *array; +}; + +static void +unload_zfeature(dsl_dataset_t *ds, spa_feature_t f) +{ + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: + break; + case ZFEATURE_TYPE_UINT64_ARRAY: + { + struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; + kmem_free(ftuaa->array, ftuaa->length * sizeof (uint64_t)); + kmem_free(ftuaa, sizeof (*ftuaa)); + break; + } + default: + panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); + } +} + +static int +load_zfeature(objset_t *mos, dsl_dataset_t *ds, spa_feature_t f) +{ + int err = 0; + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: + err = zap_contains(mos, ds->ds_object, + spa_feature_table[f].fi_guid); + if (err == 0) { + ds->ds_feature[f] = (void *)B_TRUE; + } else { + ASSERT3U(err, ==, ENOENT); + err = 0; + } + break; + case ZFEATURE_TYPE_UINT64_ARRAY: + { + uint64_t int_size, num_int; + uint64_t *data; + err = zap_length(mos, ds->ds_object, + spa_feature_table[f].fi_guid, &int_size, &num_int); + if (err != 0) { + ASSERT3U(err, ==, ENOENT); + err = 0; + break; + } + ASSERT3U(int_size, ==, sizeof (uint64_t)); + data = kmem_alloc(int_size * num_int, KM_SLEEP); + VERIFY0(zap_lookup(mos, ds->ds_object, + spa_feature_table[f].fi_guid, int_size, num_int, data)); + struct feature_type_uint64_array_arg *ftuaa = + kmem_alloc(sizeof (*ftuaa), KM_SLEEP); + ftuaa->length = num_int; + ftuaa->array = data; + ds->ds_feature[f] = ftuaa; + break; + } + default: + panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); + } + return (err); +} + +/* + * We have to release the fsid synchronously or we risk that a subsequent + * mount of the same dataset will fail to unique_insert the fsid. This + * failure would manifest itself as the fsid of this dataset changing + * between mounts which makes NFS clients quite unhappy. 
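+ *
+ * The matching insert is in dsl_dataset_hold_obj():
+ *
+ *	ds->ds_fsid_guid =
+ *	    unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid);
+ *
+ * so the unique_remove() below must run synchronously at eviction,
+ * before the dataset can be held (and its fsid re-inserted) again.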
+ */ +static void +dsl_dataset_evict_sync(void *dbu) +{ + dsl_dataset_t *ds = dbu; + + ASSERT(ds->ds_owner == NULL); + + unique_remove(ds->ds_fsid_guid); +} + +static void +dsl_dataset_evict_async(void *dbu) +{ + dsl_dataset_t *ds = dbu; + + ASSERT(ds->ds_owner == NULL); + + ds->ds_dbuf = NULL; + + if (ds->ds_objset != NULL) + dmu_objset_evict(ds->ds_objset); + + if (ds->ds_prev) { + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + dsl_bookmark_fini_ds(ds); + + bplist_destroy(&ds->ds_pending_deadlist); + if (dsl_deadlist_is_open(&ds->ds_deadlist)) + dsl_deadlist_close(&ds->ds_deadlist); + if (dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); + if (ds->ds_dir) + dsl_dir_async_rele(ds->ds_dir, ds); + + ASSERT(!list_link_active(&ds->ds_synced_link)); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (dsl_dataset_feature_is_active(ds, f)) + unload_zfeature(ds, f); + } + + list_destroy(&ds->ds_prop_cbs); + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_sendstream_lock); + mutex_destroy(&ds->ds_remap_deadlist_lock); + zfs_refcount_destroy(&ds->ds_longholds); + rrw_destroy(&ds->ds_bp_rwlock); + + kmem_free(ds, sizeof (dsl_dataset_t)); +} + +int +dsl_dataset_get_snapname(dsl_dataset_t *ds) +{ + dsl_dataset_phys_t *headphys; + int err; + dmu_buf_t *headdbuf; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + if (ds->ds_snapname[0]) + return (0); + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) + return (0); + + err = dmu_bonus_hold(mos, dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, + FTAG, &headdbuf); + if (err != 0) + return (err); + headphys = headdbuf->db_data; + err = zap_value_search(dp->dp_meta_objset, + headphys->ds_snapnames_zapobj, ds->ds_object, 0, ds->ds_snapname); + if (err != 0 && zfs_recover == B_TRUE) { + err = 0; + (void) snprintf(ds->ds_snapname, sizeof (ds->ds_snapname), + "SNAPOBJ=%llu-ERR=%d", + (unsigned long long)ds->ds_object, err); + } + dmu_buf_rele(headdbuf, FTAG); + return (err); +} + +int +dsl_dataset_snap_lookup(dsl_dataset_t *ds, const char *name, uint64_t *value) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + matchtype_t mt = 0; + int err; + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + err = zap_lookup_norm(mos, snapobj, name, 8, 1, + value, mt, NULL, 0, NULL); + if (err == ENOTSUP && (mt & MT_NORMALIZE)) + err = zap_lookup(mos, snapobj, name, 8, 1, value); + return (err); +} + +int +dsl_dataset_snap_remove(dsl_dataset_t *ds, const char *name, dmu_tx_t *tx, + boolean_t adj_cnt) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t snapobj = dsl_dataset_phys(ds)->ds_snapnames_zapobj; + matchtype_t mt = 0; + int err; + + dsl_dir_snap_cmtime_update(ds->ds_dir); + + if (dsl_dataset_phys(ds)->ds_flags & DS_FLAG_CI_DATASET) + mt = MT_NORMALIZE; + + err = zap_remove_norm(mos, snapobj, name, mt, tx); + if (err == ENOTSUP && (mt & MT_NORMALIZE)) + err = zap_remove(mos, snapobj, name, tx); + + if (err == 0 && adj_cnt) + dsl_fs_ss_count_adjust(ds->ds_dir, -1, + DD_FIELD_SNAPSHOT_COUNT, tx); + + return (err); +} + +boolean_t +dsl_dataset_try_add_ref(dsl_pool_t *dp, dsl_dataset_t *ds, void *tag) +{ + dmu_buf_t *dbuf = ds->ds_dbuf; + boolean_t result = B_FALSE; + + if (dbuf != NULL && dmu_buf_try_add_ref(dbuf, dp->dp_meta_objset, + ds->ds_object, DMU_BONUS_BLKID, tag)) { + + if (ds == dmu_buf_get_user(dbuf)) 
+ result = B_TRUE; + else + dmu_buf_rele(dbuf, tag); + } + + return (result); +} + +int +dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, + dsl_dataset_t **dsp) +{ + objset_t *mos = dp->dp_meta_objset; + dmu_buf_t *dbuf; + dsl_dataset_t *ds; + int err; + dmu_object_info_t doi; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(mos, dsobj, tag, &dbuf); + if (err != 0) + return (err); + + /* Make sure dsobj has the correct object type. */ + dmu_object_info_from_db(dbuf, &doi); + if (doi.doi_bonus_type != DMU_OT_DSL_DATASET) { + dmu_buf_rele(dbuf, tag); + return (SET_ERROR(EINVAL)); + } + + ds = dmu_buf_get_user(dbuf); + if (ds == NULL) { + dsl_dataset_t *winner = NULL; + + ds = kmem_zalloc(sizeof (dsl_dataset_t), KM_SLEEP); + ds->ds_dbuf = dbuf; + ds->ds_object = dsobj; + ds->ds_is_snapshot = dsl_dataset_phys(ds)->ds_num_children != 0; + list_link_init(&ds->ds_synced_link); + + err = dsl_dir_hold_obj(dp, dsl_dataset_phys(ds)->ds_dir_obj, + NULL, ds, &ds->ds_dir); + if (err != 0) { + kmem_free(ds, sizeof (dsl_dataset_t)); + dmu_buf_rele(dbuf, tag); + return (err); + } + + mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_sendstream_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_remap_deadlist_lock, + NULL, MUTEX_DEFAULT, NULL); + rrw_init(&ds->ds_bp_rwlock, B_FALSE); + zfs_refcount_create(&ds->ds_longholds); + + bplist_create(&ds->ds_pending_deadlist); + + list_create(&ds->ds_sendstreams, sizeof (dmu_sendstatus_t), + offsetof(dmu_sendstatus_t, dss_link)); + + list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_ds_node)); + + if (doi.doi_type == DMU_OTN_ZAP_METADATA) { + spa_feature_t f; + + for (f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) + continue; + err = load_zfeature(mos, ds, f); + } + } + + if (!ds->ds_is_snapshot) { + ds->ds_snapname[0] = '\0'; + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds, &ds->ds_prev); + } + err = dsl_bookmark_init_ds(ds); + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && + dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_userrefs_obj, + &ds->ds_userrefs); + } + } + + if (err == 0 && !ds->ds_is_snapshot) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + &ds->ds_reserved); + if (err == 0) { + err = dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + &ds->ds_quota); + } + } else { + ds->ds_reserved = ds->ds_quota = 0; + } + + if (err == 0 && ds->ds_dir->dd_crypto_obj != 0 && + ds->ds_is_snapshot && + zap_contains(mos, dsobj, DS_FIELD_IVSET_GUID) != 0) { + dp->dp_spa->spa_errata = + ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; + } + + dsl_deadlist_open(&ds->ds_deadlist, + mos, dsl_dataset_phys(ds)->ds_deadlist_obj); + uint64_t remap_deadlist_obj = + dsl_dataset_get_remap_deadlist_object(ds); + if (remap_deadlist_obj != 0) { + dsl_deadlist_open(&ds->ds_remap_deadlist, mos, + remap_deadlist_obj); + } + + dmu_buf_init_user(&ds->ds_dbu, dsl_dataset_evict_sync, + dsl_dataset_evict_async, &ds->ds_dbuf); + if (err == 0) + winner = dmu_buf_set_user_ie(dbuf, &ds->ds_dbu); + + if (err != 0 || winner != NULL) { + bplist_destroy(&ds->ds_pending_deadlist); + dsl_deadlist_close(&ds->ds_deadlist); + if 
(dsl_deadlist_is_open(&ds->ds_remap_deadlist)) + dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_bookmark_fini_ds(ds); + if (ds->ds_prev) + dsl_dataset_rele(ds->ds_prev, ds); + dsl_dir_rele(ds->ds_dir, ds); + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (dsl_dataset_feature_is_active(ds, f)) + unload_zfeature(ds, f); + } + + list_destroy(&ds->ds_prop_cbs); + list_destroy(&ds->ds_sendstreams); + mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_opening_lock); + mutex_destroy(&ds->ds_sendstream_lock); + mutex_destroy(&ds->ds_remap_deadlist_lock); + zfs_refcount_destroy(&ds->ds_longholds); + rrw_destroy(&ds->ds_bp_rwlock); + kmem_free(ds, sizeof (dsl_dataset_t)); + if (err != 0) { + dmu_buf_rele(dbuf, tag); + return (err); + } + ds = winner; + } else { + ds->ds_fsid_guid = + unique_insert(dsl_dataset_phys(ds)->ds_fsid_guid); + if (ds->ds_fsid_guid != + dsl_dataset_phys(ds)->ds_fsid_guid) { + zfs_dbgmsg("ds_fsid_guid changed from " + "%llx to %llx for pool %s dataset id %llu", + (long long) + dsl_dataset_phys(ds)->ds_fsid_guid, + (long long)ds->ds_fsid_guid, + spa_name(dp->dp_spa), + dsobj); + } + } + } + + ASSERT3P(ds->ds_dbuf, ==, dbuf); + ASSERT3P(dsl_dataset_phys(ds), ==, dbuf->db_data); + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0 || + spa_version(dp->dp_spa) < SPA_VERSION_ORIGIN || + dp->dp_origin_snap == NULL || ds == dp->dp_origin_snap); + *dsp = ds; + + return (0); +} + +int +dsl_dataset_create_key_mapping(dsl_dataset_t *ds) +{ + dsl_dir_t *dd = ds->ds_dir; + + if (dd->dd_crypto_obj == 0) + return (0); + + return (spa_keystore_create_mapping(dd->dd_pool->dp_spa, + ds, ds, &ds->ds_key_mapping)); +} + +int +dsl_dataset_hold_obj_flags(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + int err; + + err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); + if (err != 0) + return (err); + + ASSERT3P(*dsp, !=, NULL); + + if (flags & DS_HOLD_FLAG_DECRYPT) { + err = dsl_dataset_create_key_mapping(*dsp); + if (err != 0) + dsl_dataset_rele(*dsp, tag); + } + + return (err); +} + +int +dsl_dataset_hold_flags(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + dsl_dir_t *dd; + const char *snapname; + uint64_t obj; + int err = 0; + dsl_dataset_t *ds; + + err = dsl_dir_hold(dp, name, FTAG, &dd, &snapname); + if (err != 0) + return (err); + + ASSERT(dsl_pool_config_held(dp)); + obj = dsl_dir_phys(dd)->dd_head_dataset_obj; + if (obj != 0) + err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, &ds); + else + err = SET_ERROR(ENOENT); + + /* we may be looking for a snapshot */ + if (err == 0 && snapname != NULL) { + dsl_dataset_t *snap_ds; + + if (*snapname++ != '@') { + dsl_dataset_rele_flags(ds, flags, tag); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(ENOENT)); + } + + dprintf("looking for snapshot '%s'\n", snapname); + err = dsl_dataset_snap_lookup(ds, snapname, &obj); + if (err == 0) { + err = dsl_dataset_hold_obj_flags(dp, obj, flags, tag, + &snap_ds); + } + dsl_dataset_rele_flags(ds, flags, tag); + + if (err == 0) { + mutex_enter(&snap_ds->ds_lock); + if (snap_ds->ds_snapname[0] == 0) + (void) strlcpy(snap_ds->ds_snapname, snapname, + sizeof (snap_ds->ds_snapname)); + mutex_exit(&snap_ds->ds_lock); + ds = snap_ds; + } + } + if (err == 0) + *dsp = ds; + dsl_dir_rele(dd, FTAG); + return (err); +} + +int +dsl_dataset_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dataset_t **dsp) +{ + return (dsl_dataset_hold_flags(dp, name, 0, tag, dsp)); +} + +static int 
+dsl_dataset_own_obj_impl(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, + void *tag, boolean_t override, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_obj_flags(dp, dsobj, flags, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag, override)) { + dsl_dataset_rele_flags(*dsp, flags, tag); + *dsp = NULL; + return (SET_ERROR(EBUSY)); + } + return (0); +} + + +int +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_FALSE, dsp)); +} + +int +dsl_dataset_own_obj_force(dsl_pool_t *dp, uint64_t dsobj, + ds_hold_flags_t flags, void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_obj_impl(dp, dsobj, flags, tag, B_TRUE, dsp)); +} + +static int +dsl_dataset_own_impl(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, boolean_t override, dsl_dataset_t **dsp) +{ + int err = dsl_dataset_hold_flags(dp, name, flags, tag, dsp); + if (err != 0) + return (err); + if (!dsl_dataset_tryown(*dsp, tag, override)) { + dsl_dataset_rele_flags(*dsp, flags, tag); + return (SET_ERROR(EBUSY)); + } + return (0); +} + +int +dsl_dataset_own_force(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_impl(dp, name, flags, tag, B_TRUE, dsp)); +} + +int +dsl_dataset_own(dsl_pool_t *dp, const char *name, ds_hold_flags_t flags, + void *tag, dsl_dataset_t **dsp) +{ + return (dsl_dataset_own_impl(dp, name, flags, tag, B_FALSE, dsp)); +} + +/* + * See the comment above dsl_pool_hold() for details. In summary, a long + * hold is used to prevent destruction of a dataset while the pool hold + * is dropped, allowing other concurrent operations (e.g. spa_sync()). + * + * The dataset and pool must be held when this function is called. After it + * is called, the pool hold may be released while the dataset is still held + * and accessed. + */ +void +dsl_dataset_long_hold(dsl_dataset_t *ds, void *tag) +{ + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + (void) zfs_refcount_add(&ds->ds_longholds, tag); +} + +void +dsl_dataset_long_rele(dsl_dataset_t *ds, void *tag) +{ + (void) zfs_refcount_remove(&ds->ds_longholds, tag); +} + +/* Return B_TRUE if there are any long holds on this dataset. */ +boolean_t +dsl_dataset_long_held(dsl_dataset_t *ds) +{ + return (!zfs_refcount_is_zero(&ds->ds_longholds)); +} + +void +dsl_dataset_name(dsl_dataset_t *ds, char *name) +{ + if (ds == NULL) { + (void) strlcpy(name, "mos", ZFS_MAX_DATASET_NAME_LEN); + } else { + dsl_dir_name(ds->ds_dir, name); + VERIFY0(dsl_dataset_get_snapname(ds)); + if (ds->ds_snapname[0]) { + VERIFY3U(strlcat(name, "@", ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + /* + * We use a "recursive" mutex so that we + * can call dprintf_ds() with ds_lock held. 
+ */ + if (!MUTEX_HELD(&ds->ds_lock)) { + mutex_enter(&ds->ds_lock); + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); + mutex_exit(&ds->ds_lock); + } else { + VERIFY3U(strlcat(name, ds->ds_snapname, + ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); + } + } + } +} + +int +dsl_dataset_namelen(dsl_dataset_t *ds) +{ + VERIFY0(dsl_dataset_get_snapname(ds)); + mutex_enter(&ds->ds_lock); + int len = strlen(ds->ds_snapname); + mutex_exit(&ds->ds_lock); + /* add '@' if ds is a snap */ + if (len > 0) + len++; + len += dsl_dir_namelen(ds->ds_dir); + return (len); +} + +void +dsl_dataset_rele(dsl_dataset_t *ds, void *tag) +{ + dmu_buf_rele(ds->ds_dbuf, tag); +} + +void +dsl_dataset_remove_key_mapping(dsl_dataset_t *ds) +{ + dsl_dir_t *dd = ds->ds_dir; + + if (dd == NULL || dd->dd_crypto_obj == 0) + return; + + (void) spa_keystore_remove_mapping(dd->dd_pool->dp_spa, + ds->ds_object, ds); +} + +void +dsl_dataset_rele_flags(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +{ + if (flags & DS_HOLD_FLAG_DECRYPT) + dsl_dataset_remove_key_mapping(ds); + + dsl_dataset_rele(ds, tag); +} + +void +dsl_dataset_disown(dsl_dataset_t *ds, ds_hold_flags_t flags, void *tag) +{ + ASSERT3P(ds->ds_owner, ==, tag); + ASSERT(ds->ds_dbuf != NULL); + + mutex_enter(&ds->ds_lock); + ds->ds_owner = NULL; + mutex_exit(&ds->ds_lock); + dsl_dataset_long_rele(ds, tag); + dsl_dataset_rele_flags(ds, flags, tag); +} + +boolean_t +dsl_dataset_tryown(dsl_dataset_t *ds, void *tag, boolean_t override) +{ + boolean_t gotit = FALSE; + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + mutex_enter(&ds->ds_lock); + if (ds->ds_owner == NULL && (override || !(DS_IS_INCONSISTENT(ds) || + (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS) && + !zfs_allow_redacted_dataset_mount)))) { + ds->ds_owner = tag; + dsl_dataset_long_hold(ds, tag); + gotit = TRUE; + } + mutex_exit(&ds->ds_lock); + return (gotit); +} + +boolean_t +dsl_dataset_has_owner(dsl_dataset_t *ds) +{ + boolean_t rv; + mutex_enter(&ds->ds_lock); + rv = (ds->ds_owner != NULL); + mutex_exit(&ds->ds_lock); + return (rv); +} + +static boolean_t +zfeature_active(spa_feature_t f, void *arg) +{ + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: { + boolean_t val = (boolean_t)(uintptr_t)arg; + ASSERT(val == B_FALSE || val == B_TRUE); + return (val); + } + case ZFEATURE_TYPE_UINT64_ARRAY: + /* + * In this case, arg is a uint64_t array. The feature is active + * if the array is non-null. + */ + return (arg != NULL); + default: + panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); + return (B_FALSE); + } +} + +boolean_t +dsl_dataset_feature_is_active(dsl_dataset_t *ds, spa_feature_t f) +{ + return (zfeature_active(f, ds->ds_feature[f])); +} + +/* + * The buffers passed out by this function are references to internal buffers; + * they should not be freed by callers of this function, and they should not be + * used after the dataset has been released. 
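+ *
+ * Minimal usage sketch (SPA_FEATURE_REDACTED_DATASETS is one such
+ * uint64-array feature; "len" and "guids" are illustrative names):
+ *
+ *	uint64_t len, *guids;
+ *	if (dsl_dataset_get_uint64_array_feature(ds,
+ *	    SPA_FEATURE_REDACTED_DATASETS, &len, &guids)) {
+ *		read guids[0 .. len - 1] while ds is held; never free
+ *	}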
+ */ +boolean_t +dsl_dataset_get_uint64_array_feature(dsl_dataset_t *ds, spa_feature_t f, + uint64_t *outlength, uint64_t **outp) +{ + VERIFY(spa_feature_table[f].fi_type & ZFEATURE_TYPE_UINT64_ARRAY); + if (!dsl_dataset_feature_is_active(ds, f)) { + return (B_FALSE); + } + struct feature_type_uint64_array_arg *ftuaa = ds->ds_feature[f]; + *outp = ftuaa->array; + *outlength = ftuaa->length; + return (B_TRUE); +} + +void +dsl_dataset_activate_feature(uint64_t dsobj, spa_feature_t f, void *arg, + dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t zero = 0; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + spa_feature_incr(spa, f, tx); + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + + switch (spa_feature_table[f].fi_type) { + case ZFEATURE_TYPE_BOOLEAN: + ASSERT3S((boolean_t)(uintptr_t)arg, ==, B_TRUE); + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (zero), 1, &zero, tx)); + break; + case ZFEATURE_TYPE_UINT64_ARRAY: + { + struct feature_type_uint64_array_arg *ftuaa = arg; + VERIFY0(zap_add(mos, dsobj, spa_feature_table[f].fi_guid, + sizeof (uint64_t), ftuaa->length, ftuaa->array, tx)); + break; + } + default: + panic("Invalid zfeature type %d", spa_feature_table[f].fi_type); + } +} + +static void +dsl_dataset_deactivate_feature_impl(dsl_dataset_t *ds, spa_feature_t f, + dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = dmu_tx_pool(tx)->dp_meta_objset; + uint64_t dsobj = ds->ds_object; + + VERIFY(spa_feature_table[f].fi_flags & ZFEATURE_FLAG_PER_DATASET); + + VERIFY0(zap_remove(mos, dsobj, spa_feature_table[f].fi_guid, tx)); + spa_feature_decr(spa, f, tx); + ds->ds_feature[f] = NULL; +} + +void +dsl_dataset_deactivate_feature(dsl_dataset_t *ds, spa_feature_t f, dmu_tx_t *tx) +{ + unload_zfeature(ds, f); + dsl_dataset_deactivate_feature_impl(ds, f, tx); +} + +uint64_t +dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, + dsl_crypto_params_t *dcp, uint64_t flags, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj; + objset_t *mos = dp->dp_meta_objset; + + if (origin == NULL) + origin = dp->dp_origin_snap; + + ASSERT(origin == NULL || origin->ds_dir->dd_pool == dp); + ASSERT(origin == NULL || dsl_dataset_phys(origin)->ds_num_children > 0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_dir_phys(dd)->dd_head_dataset_obj == 0); + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); + dsphys->ds_dir_obj = dd->dd_object; + dsphys->ds_flags = flags; + dsphys->ds_fsid_guid = unique_create(); + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_snapnames_zapobj = + zap_create_norm(mos, U8_TEXTPREP_TOUPPER, DMU_OT_DSL_DS_SNAP_MAP, + DMU_OT_NONE, 0, tx); + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = tx->tx_txg == TXG_INITIAL ? 
1 : tx->tx_txg; + + if (origin == NULL) { + dsphys->ds_deadlist_obj = dsl_deadlist_alloc(mos, tx); + } else { + dsl_dataset_t *ohds; /* head of the origin snapshot */ + + dsphys->ds_prev_snap_obj = origin->ds_object; + dsphys->ds_prev_snap_txg = + dsl_dataset_phys(origin)->ds_creation_txg; + dsphys->ds_referenced_bytes = + dsl_dataset_phys(origin)->ds_referenced_bytes; + dsphys->ds_compressed_bytes = + dsl_dataset_phys(origin)->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + dsl_dataset_phys(origin)->ds_uncompressed_bytes; + rrw_enter(&origin->ds_bp_rwlock, RW_READER, FTAG); + dsphys->ds_bp = dsl_dataset_phys(origin)->ds_bp; + rrw_exit(&origin->ds_bp_rwlock, FTAG); + + /* + * Inherit flags that describe the dataset's contents + * (INCONSISTENT) or properties (Case Insensitive). + */ + dsphys->ds_flags |= dsl_dataset_phys(origin)->ds_flags & + (DS_FLAG_INCONSISTENT | DS_FLAG_CI_DATASET); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, origin->ds_feature[f])) { + dsl_dataset_activate_feature(dsobj, f, + origin->ds_feature[f], tx); + } + } + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_dataset_phys(origin)->ds_num_children++; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(origin->ds_dir)->dd_head_dataset_obj, + FTAG, &ohds)); + dsphys->ds_deadlist_obj = dsl_deadlist_clone(&ohds->ds_deadlist, + dsphys->ds_prev_snap_txg, dsphys->ds_prev_snap_obj, tx); + dsl_dataset_rele(ohds, FTAG); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_NEXT_CLONES) { + if (dsl_dataset_phys(origin)->ds_next_clones_obj == 0) { + dsl_dataset_phys(origin)->ds_next_clones_obj = + zap_create(mos, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(mos, + dsl_dataset_phys(origin)->ds_next_clones_obj, + dsobj, tx)); + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_phys(dd)->dd_origin_obj = origin->ds_object; + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, + DMU_OT_DSL_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(mos, + dsl_dir_phys(origin->ds_dir)->dd_clones, + dsobj, tx)); + } + } + + /* handle encryption */ + dsl_dataset_create_crypt_sync(dsobj, dd, origin, dcp, tx); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsphys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + dmu_buf_rele(dbuf, FTAG); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + dsl_dir_phys(dd)->dd_head_dataset_obj = dsobj; + + return (dsobj); +} + +static void +dsl_dataset_zero_zil(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + if (bcmp(&os->os_zil_header, &zero_zil, sizeof (zero_zil)) != 0) { + dsl_pool_t *dp = ds->ds_dir->dd_pool; + zio_t *zio; + + bzero(&os->os_zil_header, sizeof (os->os_zil_header)); + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; + + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dsl_dataset_sync(ds, zio, tx); + VERIFY0(zio_wait(zio)); + + /* dsl_dataset_sync_done will drop this reference. 
*/ + dmu_buf_add_ref(ds->ds_dbuf, ds); + dsl_dataset_sync_done(ds, tx); + } +} + +uint64_t +dsl_dataset_create_sync(dsl_dir_t *pdd, const char *lastname, + dsl_dataset_t *origin, uint64_t flags, cred_t *cr, + dsl_crypto_params_t *dcp, dmu_tx_t *tx) +{ + dsl_pool_t *dp = pdd->dd_pool; + uint64_t dsobj, ddobj; + dsl_dir_t *dd; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(lastname[0] != '@'); + /* + * Filesystems will eventually have their origin set to dp_origin_snap, + * but that's taken care of in dsl_dataset_create_sync_dd. When + * creating a filesystem, this function is called with origin equal to + * NULL. + */ + if (origin != NULL) + ASSERT3P(origin, !=, dp->dp_origin_snap); + + ddobj = dsl_dir_create_sync(dp, pdd, lastname, tx); + VERIFY0(dsl_dir_hold_obj(dp, ddobj, lastname, FTAG, &dd)); + + dsobj = dsl_dataset_create_sync_dd(dd, origin, dcp, + flags & ~DS_CREATE_FLAG_NODIRTY, tx); + + dsl_deleg_set_create_perms(dd, tx, cr); + + /* + * If we are creating a clone and the livelist feature is enabled, + * add the entry DD_FIELD_LIVELIST to ZAP. + */ + if (origin != NULL && + spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_LIVELIST)) { + objset_t *mos = dd->dd_pool->dp_meta_objset; + dsl_dir_zapify(dd, tx); + uint64_t obj = dsl_deadlist_alloc(mos, tx); + VERIFY0(zap_add(mos, dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_LIVELIST, tx); + } + + /* + * Since we're creating a new node we know it's a leaf, so we can + * initialize the counts if the limit feature is active. + */ + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + uint64_t cnt = 0; + objset_t *os = dd->dd_pool->dp_meta_objset; + + dsl_dir_zapify(dd, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (cnt), 1, &cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (cnt), 1, &cnt, tx)); + } + + dsl_dir_rele(dd, FTAG); + + /* + * If we are creating a clone, make sure we zero out any stale + * data from the origin snapshots zil header. + */ + if (origin != NULL && !(flags & DS_CREATE_FLAG_NODIRTY)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_zero_zil(ds, tx); + dsl_dataset_rele(ds, FTAG); + } + + return (dsobj); +} + +/* + * The unique space in the head dataset can be calculated by subtracting + * the space used in the most recent snapshot, that is still being used + * in this file system, from the space currently in use. To figure out + * the space in the most recent snapshot still in use, we need to take + * the total space used in the snapshot and subtract out the space that + * has been freed up since the snapshot was taken. 
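+ *
+ * Worked example (hypothetical sizes): the head references 100M, the
+ * most recent snapshot referenced 80M, and 30M of the blocks born
+ * before that snapshot have since been freed into the head's deadlist;
+ * then unique = 100M - (80M - 30M) = 50M.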
+ */ +void +dsl_dataset_recalc_head_uniq(dsl_dataset_t *ds) +{ + uint64_t mrs_used; + uint64_t dlused, dlcomp, dluncomp; + + ASSERT(!ds->ds_is_snapshot); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) + mrs_used = dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes; + else + mrs_used = 0; + + dsl_deadlist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp); + + ASSERT3U(dlused, <=, mrs_used); + dsl_dataset_phys(ds)->ds_unique_bytes = + dsl_dataset_phys(ds)->ds_referenced_bytes - (mrs_used - dlused); + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= + SPA_VERSION_UNIQUE_ACCURATE) + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; +} + +void +dsl_dataset_remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count __maybe_unused; + int err; + + ASSERT(dsl_dataset_phys(ds)->ds_num_children >= 2); + err = zap_remove_int(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + obj, tx); + /* + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. + */ + if (err != ENOENT) + VERIFY0(err); + ASSERT0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, dsl_dataset_phys(ds)->ds_num_children - 2); +} + + +blkptr_t * +dsl_dataset_get_blkptr(dsl_dataset_t *ds) +{ + return (&dsl_dataset_phys(ds)->ds_bp); +} + +spa_t * +dsl_dataset_get_spa(dsl_dataset_t *ds) +{ + return (ds->ds_dir->dd_pool->dp_spa); +} + +void +dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp; + + if (ds == NULL) /* this is the meta-objset */ + return; + + ASSERT(ds->ds_objset != NULL); + + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) + panic("dirtying snapshot!"); + + /* Must not dirty a dataset in the same txg where it got snapshotted. */ + ASSERT3U(tx->tx_txg, >, dsl_dataset_phys(ds)->ds_prev_snap_txg); + + dp = ds->ds_dir->dd_pool; + if (txg_list_add(&dp->dp_dirty_datasets, ds, tx->tx_txg)) { + objset_t *os = ds->ds_objset; + + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, ds); + + /* if this dataset is encrypted, grab a reference to the DCK */ + if (ds->ds_dir->dd_crypto_obj != 0 && + !os->os_raw_receive && + !os->os_next_write_raw[tx->tx_txg & TXG_MASK]) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_add_ref(ds->ds_key_mapping, ds); + } + } +} + +static int +dsl_dataset_snapshot_reserve_space(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t asize; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * If there's an fs-only reservation, any blocks that might become + * owned by the snapshot dataset must be accommodated by space + * outside of the reservation. + */ + ASSERT(ds->ds_reserved == 0 || DS_UNIQUE_IS_ACCURATE(ds)); + asize = MIN(dsl_dataset_phys(ds)->ds_unique_bytes, ds->ds_reserved); + if (asize > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (SET_ERROR(ENOSPC)); + + /* + * Propagate any reserved space for this snapshot to other + * snapshot checks in this sync group. 
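+ *
+ * Worked example (hypothetical sizes): with a 10G refreservation and
+ * 3G of unique bytes, asize = MIN(3G, 10G) = 3G; the snapshot is
+ * allowed only if those 3G fit outside the reservation, and the
+ * dsl_dir_willuse_space() call below makes that claim visible to the
+ * other checks in this sync group.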
+ */ + if (asize > 0) + dsl_dir_willuse_space(ds->ds_dir, asize, tx); + + return (0); +} + +int +dsl_dataset_snapshot_check_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx, boolean_t recv, uint64_t cnt, cred_t *cr, proc_t *proc) +{ + int error; + uint64_t value; + + ds->ds_trysnap_txg = tx->tx_txg; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * We don't allow multiple snapshots of the same txg. If there + * is already one, try again. + */ + if (dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) + return (SET_ERROR(EAGAIN)); + + /* + * Check for conflicting snapshot name. + */ + error = dsl_dataset_snap_lookup(ds, snapname, &value); + if (error == 0) + return (SET_ERROR(EEXIST)); + if (error != ENOENT) + return (error); + + /* + * We don't allow taking snapshots of inconsistent datasets, such as + * those into which we are currently receiving. However, if we are + * creating this snapshot as part of a receive, this check will be + * executed atomically with respect to the completion of the receive + * itself but prior to the clearing of DS_FLAG_INCONSISTENT; in this + * case we ignore this, knowing it will be fixed up for us shortly in + * dmu_recv_end_sync(). + */ + if (!recv && DS_IS_INCONSISTENT(ds)) + return (SET_ERROR(EBUSY)); + + /* + * Skip the check for temporary snapshots or if we have already checked + * the counts in dsl_dataset_snapshot_check. This means we really only + * check the count here when we're receiving a stream. + */ + if (cnt != 0 && cr != NULL) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, cr, proc); + if (error != 0) + return (error); + } + + error = dsl_dataset_snapshot_reserve_space(ds, tx); + if (error != 0) + return (error); + + return (0); +} + +int +dsl_dataset_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + int rv = 0; + + /* + * Pre-compute how many total new snapshots will be created for each + * level in the tree and below. This is needed for validating the + * snapshot limit when either taking a recursive snapshot or when + * taking multiple snapshots. + * + * The problem is that the counts are not actually adjusted when + * we are checking, only when we finally sync. For a single snapshot, + * this is easy, the count will increase by 1 at each node up the tree, + * but its more complicated for the recursive/multiple snapshot case. + * + * The dsl_fs_ss_limit_check function does recursively check the count + * at each level up the tree but since it is validating each snapshot + * independently we need to be sure that we are validating the complete + * count for the entire set of snapshots. We do this by rolling up the + * counts for each component of the name into an nvlist and then + * checking each of those cases with the aggregated count. + * + * This approach properly handles not only the recursive snapshot + * case (where we get all of those on the ddsa_snaps list) but also + * the sibling case (e.g. snapshot a/b and a/c so that we will also + * validate the limit on 'a' using a count of 2). + * + * We validate the snapshot names in the third loop and only report + * name errors once. 
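+ *
+ * Rollup example (hypothetical names): requesting snapshots a/b@s and
+ * a/c@s yields cnt_track = { "a/b": 1, "a/c": 1, "a": 2 }, so the
+ * limit on 'a' is validated once against the aggregate count of 2
+ * rather than twice against a count of 1.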
+ */ + if (dmu_tx_is_syncing(tx)) { + char *nm; + nvlist_t *cnt_track = NULL; + cnt_track = fnvlist_alloc(); + + nm = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* Rollup aggregated counts into the cnt_track list */ + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; + pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + char *pdelim; + uint64_t val; + + (void) strlcpy(nm, nvpair_name(pair), MAXPATHLEN); + pdelim = strchr(nm, '@'); + if (pdelim == NULL) + continue; + *pdelim = '\0'; + + do { + if (nvlist_lookup_uint64(cnt_track, nm, + &val) == 0) { + /* update existing entry */ + fnvlist_add_uint64(cnt_track, nm, + val + 1); + } else { + /* add to list */ + fnvlist_add_uint64(cnt_track, nm, 1); + } + + pdelim = strrchr(nm, '/'); + if (pdelim != NULL) + *pdelim = '\0'; + } while (pdelim != NULL); + } + + kmem_free(nm, MAXPATHLEN); + + /* Check aggregated counts at each level */ + for (pair = nvlist_next_nvpair(cnt_track, NULL); + pair != NULL; pair = nvlist_next_nvpair(cnt_track, pair)) { + int error = 0; + char *name; + uint64_t cnt = 0; + dsl_dataset_t *ds; + + name = nvpair_name(pair); + cnt = fnvpair_value_uint64(pair); + ASSERT(cnt > 0); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error == 0) { + error = dsl_fs_ss_limit_check(ds->ds_dir, cnt, + ZFS_PROP_SNAPSHOT_LIMIT, NULL, + ddsa->ddsa_cr, ddsa->ddsa_proc); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + rv = error; + /* only report one error for this check */ + break; + } + } + nvlist_free(cnt_track); + } + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + int error = 0; + dsl_dataset_t *ds; + char *name, *atp = NULL; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + + name = nvpair_name(pair); + if (strlen(name) >= ZFS_MAX_DATASET_NAME_LEN) + error = SET_ERROR(ENAMETOOLONG); + if (error == 0) { + atp = strchr(name, '@'); + if (atp == NULL) + error = SET_ERROR(EINVAL); + if (error == 0) + (void) strlcpy(dsname, name, atp - name + 1); + } + if (error == 0) + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + /* passing 0/NULL skips dsl_fs_ss_limit_check */ + error = dsl_dataset_snapshot_check_impl(ds, + atp + 1, tx, B_FALSE, 0, NULL, NULL); + dsl_dataset_rele(ds, FTAG); + } + + if (error != 0) { + if (ddsa->ddsa_errors != NULL) { + fnvlist_add_int32(ddsa->ddsa_errors, + name, error); + } + rv = error; + } + } + + return (rv); +} + +void +dsl_dataset_snapshot_sync_impl(dsl_dataset_t *ds, const char *snapname, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dmu_buf_t *dbuf; + dsl_dataset_phys_t *dsphys; + uint64_t dsobj, crtxg; + objset_t *mos = dp->dp_meta_objset; + static zil_header_t zero_zil __maybe_unused; + objset_t *os __maybe_unused; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + /* + * If we are on an old pool, the zil must not be active, in which + * case it will be zeroed. Usually zil_suspend() accomplishes this. + */ + ASSERT(spa_version(dmu_tx_pool(tx)->dp_spa) >= SPA_VERSION_FAST_SNAP || + dmu_objset_from_ds(ds, &os) != 0 || + bcmp(&os->os_phys->os_zil_header, &zero_zil, + sizeof (zero_zil)) == 0); + + /* Should not snapshot a dirty dataset. 
*/ + ASSERT(!txg_list_member(&ds->ds_dir->dd_pool->dp_dirty_datasets, + ds, tx->tx_txg)); + + dsl_fs_ss_count_adjust(ds->ds_dir, 1, DD_FIELD_SNAPSHOT_COUNT, tx); + + /* + * The origin's ds_creation_txg has to be < TXG_INITIAL + */ + if (strcmp(snapname, ORIGIN_DIR_NAME) == 0) + crtxg = 1; + else + crtxg = tx->tx_txg; + + dsobj = dmu_object_alloc(mos, DMU_OT_DSL_DATASET, 0, + DMU_OT_DSL_DATASET, sizeof (dsl_dataset_phys_t), tx); + VERIFY0(dmu_bonus_hold(mos, dsobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + dsphys = dbuf->db_data; + bzero(dsphys, sizeof (dsl_dataset_phys_t)); + dsphys->ds_dir_obj = ds->ds_dir->dd_object; + dsphys->ds_fsid_guid = unique_create(); + (void) random_get_pseudo_bytes((void*)&dsphys->ds_guid, + sizeof (dsphys->ds_guid)); + dsphys->ds_prev_snap_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsphys->ds_prev_snap_txg = dsl_dataset_phys(ds)->ds_prev_snap_txg; + dsphys->ds_next_snap_obj = ds->ds_object; + dsphys->ds_num_children = 1; + dsphys->ds_creation_time = gethrestime_sec(); + dsphys->ds_creation_txg = crtxg; + dsphys->ds_deadlist_obj = dsl_dataset_phys(ds)->ds_deadlist_obj; + dsphys->ds_referenced_bytes = dsl_dataset_phys(ds)->ds_referenced_bytes; + dsphys->ds_compressed_bytes = dsl_dataset_phys(ds)->ds_compressed_bytes; + dsphys->ds_uncompressed_bytes = + dsl_dataset_phys(ds)->ds_uncompressed_bytes; + dsphys->ds_flags = dsl_dataset_phys(ds)->ds_flags; + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + dsphys->ds_bp = dsl_dataset_phys(ds)->ds_bp; + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dmu_buf_rele(dbuf, FTAG); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, ds->ds_feature[f])) { + dsl_dataset_activate_feature(dsobj, f, + ds->ds_feature[f], tx); + } + } + + ASSERT3U(ds->ds_prev != 0, ==, + dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); + if (ds->ds_prev) { + uint64_t next_clones_obj = + dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj; + ASSERT(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object || + dsl_dataset_phys(ds->ds_prev)->ds_num_children > 1); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == + ds->ds_object) { + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + dsl_dataset_phys(ds->ds_prev)->ds_creation_txg); + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj = dsobj; + } else if (next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + dsphys->ds_next_snap_obj, tx); + VERIFY0(zap_add_int(mos, + next_clones_obj, dsobj, tx)); + } + } + + /* + * If we have a reference-reservation on this dataset, we will + * need to increase the amount of refreservation being charged + * since our unique space is going to zero. 
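+ *
+ * For example (hypothetical numbers): with a 10G refreservation and
+ * 2G of unique bytes, delta = MIN(2G, 10G) = 2G is charged to
+ * DD_USED_REFRSRV, because those 2G stop being unique to the head
+ * the moment the snapshot exists.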
+ */
+ if (ds->ds_reserved) {
+ int64_t delta;
+ ASSERT(DS_UNIQUE_IS_ACCURATE(ds));
+ delta = MIN(dsl_dataset_phys(ds)->ds_unique_bytes,
+ ds->ds_reserved);
+ dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV,
+ delta, 0, 0, tx);
+ }
+
+ dmu_buf_will_dirty(ds->ds_dbuf, tx);
+ dsl_dataset_phys(ds)->ds_deadlist_obj =
+ dsl_deadlist_clone(&ds->ds_deadlist, UINT64_MAX,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, tx);
+ dsl_deadlist_close(&ds->ds_deadlist);
+ dsl_deadlist_open(&ds->ds_deadlist, mos,
+ dsl_dataset_phys(ds)->ds_deadlist_obj);
+ dsl_deadlist_add_key(&ds->ds_deadlist,
+ dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+ dsl_bookmark_snapshotted(ds, tx);
+
+ if (dsl_dataset_remap_deadlist_exists(ds)) {
+ uint64_t remap_deadlist_obj =
+ dsl_dataset_get_remap_deadlist_object(ds);
+ /*
+ * Move the remap_deadlist to the snapshot. The head
+ * will create a new remap deadlist on demand, from
+ * dsl_dataset_block_remapped().
+ */
+ dsl_dataset_unset_remap_deadlist_object(ds, tx);
+ dsl_deadlist_close(&ds->ds_remap_deadlist);
+
+ dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx);
+ VERIFY0(zap_add(mos, dsobj, DS_FIELD_REMAP_DEADLIST,
+ sizeof (remap_deadlist_obj), 1, &remap_deadlist_obj, tx));
+ }
+
+ /*
+ * Create an ivset guid for this snapshot if the dataset is
+ * encrypted. This may be overridden by a raw receive. A
+ * previous implementation of this code did not have this
+ * field as part of the on-disk format for ZFS encryption
+ * (see errata #4). As part of the remediation for this
+ * issue, we ask the user to enable the bookmark_v2 feature
+ * which is now a dependency of the encryption feature. We
+ * use this as a heuristic to determine when the user has
+ * elected to correct any datasets created with the old code.
+ * As a result, we only do this step if the bookmark_v2
+ * feature is enabled, which limits the number of states a
+ * given pool / dataset can be in with regard to correcting
+ * the issue.
+ */ + if (ds->ds_dir->dd_crypto_obj != 0 && + spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_BOOKMARK_V2)) { + uint64_t ivset_guid = unique_create(); + + dmu_object_zapify(mos, dsobj, DMU_OT_DSL_DATASET, tx); + VERIFY0(zap_add(mos, dsobj, DS_FIELD_IVSET_GUID, + sizeof (ivset_guid), 1, &ivset_guid, tx)); + } + + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, <, tx->tx_txg); + dsl_dataset_phys(ds)->ds_prev_snap_obj = dsobj; + dsl_dataset_phys(ds)->ds_prev_snap_txg = crtxg; + dsl_dataset_phys(ds)->ds_unique_bytes = 0; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_UNIQUE_ACCURATE) + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; + + VERIFY0(zap_add(mos, dsl_dataset_phys(ds)->ds_snapnames_zapobj, + snapname, 8, 1, &dsobj, tx)); + + if (ds->ds_prev) + dsl_dataset_rele(ds->ds_prev, ds); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, ds, &ds->ds_prev)); + + dsl_scan_ds_snapshotted(ds, tx); + + dsl_dir_snap_cmtime_update(ds->ds_dir); + + spa_history_log_internal_ds(ds->ds_prev, "snapshot", tx, " "); +} + +void +dsl_dataset_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_arg_t *ddsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvpair_t *pair; + + for (pair = nvlist_next_nvpair(ddsa->ddsa_snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddsa->ddsa_snaps, pair)) { + dsl_dataset_t *ds; + char *name, *atp; + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + + name = nvpair_name(pair); + atp = strchr(name, '@'); + (void) strlcpy(dsname, name, atp - name + 1); + VERIFY0(dsl_dataset_hold(dp, dsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, atp + 1, tx); + if (ddsa->ddsa_props != NULL) { + dsl_props_set_sync_impl(ds->ds_prev, + ZPROP_SRC_LOCAL, ddsa->ddsa_props, tx); + } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The snapshots must all be in the same pool. + * All-or-nothing: if there are any failures, nothing will be modified. 
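+ *
+ * For example (hypothetical names; this is the shape of the nvlist
+ * that lzc_snapshot() builds on behalf of "zfs snapshot"), two
+ * filesystems can be snapshotted atomically with:
+ *
+ *	nvlist_t *snaps = fnvlist_alloc();
+ *	fnvlist_add_boolean(snaps, "tank/fs1@backup");
+ *	fnvlist_add_boolean(snaps, "tank/fs2@backup");
+ *	error = dsl_dataset_snapshot(snaps, NULL, NULL);
+ *	fnvlist_free(snaps);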
+ */ +int +dsl_dataset_snapshot(nvlist_t *snaps, nvlist_t *props, nvlist_t *errors) +{ + dsl_dataset_snapshot_arg_t ddsa; + nvpair_t *pair; + boolean_t needsuspend; + int error; + spa_t *spa; + char *firstname; + nvlist_t *suspended = NULL; + + pair = nvlist_next_nvpair(snaps, NULL); + if (pair == NULL) + return (0); + firstname = nvpair_name(pair); + + error = spa_open(firstname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + suspended = fnvlist_alloc(); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + char *snapname = nvpair_name(pair); + char *atp; + void *cookie; + + atp = strchr(snapname, '@'); + if (atp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + (void) strlcpy(fsname, snapname, atp - snapname + 1); + + error = zil_suspend(fsname, &cookie); + if (error != 0) + break; + fnvlist_add_uint64(suspended, fsname, + (uintptr_t)cookie); + } + } + + ddsa.ddsa_snaps = snaps; + ddsa.ddsa_props = props; + ddsa.ddsa_errors = errors; + ddsa.ddsa_cr = CRED(); + ddsa.ddsa_proc = curproc; + + if (error == 0) { + error = dsl_sync_task(firstname, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, + fnvlist_num_pairs(snaps) * 3, ZFS_SPACE_CHECK_NORMAL); + } + + if (suspended != NULL) { + for (pair = nvlist_next_nvpair(suspended, NULL); pair != NULL; + pair = nvlist_next_nvpair(suspended, pair)) { + zil_resume((void *)(uintptr_t) + fnvpair_value_uint64(pair)); + } + fnvlist_free(suspended); + } + + if (error == 0) { + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + zvol_create_minor(nvpair_name(pair)); + } + } + + return (error); +} + +typedef struct dsl_dataset_snapshot_tmp_arg { + const char *ddsta_fsname; + const char *ddsta_snapname; + minor_t ddsta_cleanup_minor; + const char *ddsta_htag; +} dsl_dataset_snapshot_tmp_arg_t; + +static int +dsl_dataset_snapshot_tmp_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds); + if (error != 0) + return (error); + + /* NULL cred means no limit check for tmp snapshot */ + error = dsl_dataset_snapshot_check_impl(ds, ddsta->ddsta_snapname, + tx, B_FALSE, 0, NULL, NULL); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + error = dsl_dataset_user_hold_check_one(NULL, ddsta->ddsta_htag, + B_TRUE, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_snapshot_tmp_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_snapshot_tmp_arg_t *ddsta = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds = NULL; + + VERIFY0(dsl_dataset_hold(dp, ddsta->ddsta_fsname, FTAG, &ds)); + + dsl_dataset_snapshot_sync_impl(ds, ddsta->ddsta_snapname, tx); + dsl_dataset_user_hold_sync_one(ds->ds_prev, ddsta->ddsta_htag, + ddsta->ddsta_cleanup_minor, gethrestime_sec(), tx); + dsl_destroy_snapshot_sync_impl(ds->ds_prev, B_TRUE, tx); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_snapshot_tmp(const char *fsname, const char *snapname, + minor_t cleanup_minor, const char *htag) +{ + 
dsl_dataset_snapshot_tmp_arg_t ddsta; + int error; + spa_t *spa; + boolean_t needsuspend; + void *cookie; + + ddsta.ddsta_fsname = fsname; + ddsta.ddsta_snapname = snapname; + ddsta.ddsta_cleanup_minor = cleanup_minor; + ddsta.ddsta_htag = htag; + + error = spa_open(fsname, &spa, FTAG); + if (error != 0) + return (error); + needsuspend = (spa_version(spa) < SPA_VERSION_FAST_SNAP); + spa_close(spa, FTAG); + + if (needsuspend) { + error = zil_suspend(fsname, &cookie); + if (error != 0) + return (error); + } + + error = dsl_sync_task(fsname, dsl_dataset_snapshot_tmp_check, + dsl_dataset_snapshot_tmp_sync, &ddsta, 3, ZFS_SPACE_CHECK_RESERVED); + + if (needsuspend) + zil_resume(cookie); + return (error); +} + +void +dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(ds->ds_objset != NULL); + ASSERT(dsl_dataset_phys(ds)->ds_next_snap_obj == 0); + + /* + * in case we had to change ds_fsid_guid when we opened it, + * sync it out now. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_fsid_guid = ds->ds_fsid_guid; + + if (ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] != 0) { + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OBJECT, 8, 1, + &ds->ds_resume_object[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_OFFSET, 8, 1, + &ds->ds_resume_offset[tx->tx_txg & TXG_MASK], tx)); + VERIFY0(zap_update(tx->tx_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_BYTES, 8, 1, + &ds->ds_resume_bytes[tx->tx_txg & TXG_MASK], tx)); + ds->ds_resume_object[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_offset[tx->tx_txg & TXG_MASK] = 0; + ds->ds_resume_bytes[tx->tx_txg & TXG_MASK] = 0; + } + + dmu_objset_sync(ds->ds_objset, zio, tx); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (zfeature_active(f, ds->ds_feature_activation[f])) { + if (zfeature_active(f, ds->ds_feature[f])) + continue; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + } +} + +/* + * Check if the percentage of blocks shared between the clone and the + * snapshot (as opposed to those that are clone only) is below a certain + * threshold + */ +static boolean_t +dsl_livelist_should_disable(dsl_dataset_t *ds) +{ + uint64_t used, referenced; + int percent_shared; + + used = dsl_dir_get_usedds(ds->ds_dir); + referenced = dsl_get_referenced(ds); + ASSERT3U(referenced, >=, 0); + ASSERT3U(used, >=, 0); + if (referenced == 0) + return (B_FALSE); + percent_shared = (100 * (referenced - used)) / referenced; + if (percent_shared <= zfs_livelist_min_percent_shared) + return (B_TRUE); + return (B_FALSE); +} + +/* + * Check if it is possible to combine two livelist entries into one. + * This is the case if the combined number of 'live' blkptrs (ALLOCs that + * don't have a matching FREE) is under the maximum sublist size. + * We check this by subtracting twice the total number of frees from the total + * number of blkptrs. FREEs are counted twice because each FREE blkptr + * will cancel out an ALLOC blkptr when the livelist is processed. 
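+ *
+ * For example (hypothetical numbers): two adjacent sublists with
+ * 100 and 80 blkptrs, of which 30 and 20 are FREEs, hold
+ * (100 + 80) - 2 * (30 + 20) = 80 live entries, so the pair may be
+ * condensed whenever 80 < zfs_livelist_max_entries.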
+ */ +static boolean_t +dsl_livelist_should_condense(dsl_deadlist_entry_t *first, + dsl_deadlist_entry_t *next) +{ + uint64_t total_free = first->dle_bpobj.bpo_phys->bpo_num_freed + + next->dle_bpobj.bpo_phys->bpo_num_freed; + uint64_t total_entries = first->dle_bpobj.bpo_phys->bpo_num_blkptrs + + next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + if ((total_entries - (2 * total_free)) < zfs_livelist_max_entries) + return (B_TRUE); + return (B_FALSE); +} + +typedef struct try_condense_arg { + spa_t *spa; + dsl_dataset_t *ds; +} try_condense_arg_t; + +/* + * Iterate over the livelist entries, searching for a pair to condense. + * A nonzero return value means stop, 0 means keep looking. + */ +static int +dsl_livelist_try_condense(void *arg, dsl_deadlist_entry_t *first) +{ + try_condense_arg_t *tca = arg; + spa_t *spa = tca->spa; + dsl_dataset_t *ds = tca->ds; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + dsl_deadlist_entry_t *next; + + /* The condense thread has not yet been created at import */ + if (spa->spa_livelist_condense_zthr == NULL) + return (1); + + /* A condense is already in progress */ + if (spa->spa_to_condense.ds != NULL) + return (1); + + next = AVL_NEXT(&ll->dl_tree, &first->dle_node); + /* The livelist has only one entry - don't condense it */ + if (next == NULL) + return (1); + + /* Next is the newest entry - don't condense it */ + if (AVL_NEXT(&ll->dl_tree, &next->dle_node) == NULL) + return (1); + + /* This pair is not ready to condense but keep looking */ + if (!dsl_livelist_should_condense(first, next)) + return (0); + + /* + * Add a ref to prevent the dataset from being evicted while + * the condense zthr or synctask are running. Ref will be + * released at the end of the condense synctask + */ + dmu_buf_add_ref(ds->ds_dbuf, spa); + + spa->spa_to_condense.ds = ds; + spa->spa_to_condense.first = first; + spa->spa_to_condense.next = next; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + zthr_wakeup(spa->spa_livelist_condense_zthr); + return (1); +} + +static void +dsl_flush_pending_livelist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_dir_t *dd = ds->ds_dir; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + dsl_deadlist_entry_t *last = dsl_deadlist_last(&dd->dd_livelist); + + /* Check if we need to add a new sub-livelist */ + if (last == NULL) { + /* The livelist is empty */ + dsl_deadlist_add_key(&dd->dd_livelist, + tx->tx_txg - 1, tx); + } else if (spa_sync_pass(spa) == 1) { + /* + * Check if the newest entry is full. If it is, make a new one. + * We only do this once per sync because we could overfill a + * sublist in one sync pass and don't want to add another entry + * for a txg that is already represented. This ensures that + * blkptrs born in the same txg are stored in the same sublist. 
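+ *
+ * For example (hypothetical numbers): with zfs_livelist_max_entries
+ * of 500000, a newest sublist holding 600000 blkptrs of which
+ * 150000 are FREEs has 600000 - 150000 = 450000 live ALLOCs, so it
+ * is not yet full; once the live count exceeds the limit, a new
+ * key is added below.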
+ */
+ bpobj_t bpobj = last->dle_bpobj;
+ uint64_t all = bpobj.bpo_phys->bpo_num_blkptrs;
+ uint64_t free = bpobj.bpo_phys->bpo_num_freed;
+ uint64_t alloc = all - free;
+ if (alloc > zfs_livelist_max_entries) {
+ dsl_deadlist_add_key(&dd->dd_livelist,
+ tx->tx_txg - 1, tx);
+ }
+ }
+
+ /* Insert each entry into the on-disk livelist */
+ bplist_iterate(&dd->dd_pending_allocs,
+ dsl_deadlist_insert_alloc_cb, &dd->dd_livelist, tx);
+ bplist_iterate(&dd->dd_pending_frees,
+ dsl_deadlist_insert_free_cb, &dd->dd_livelist, tx);
+
+ /* Attempt to condense every pair of adjacent entries */
+ try_condense_arg_t arg = {
+ .spa = spa,
+ .ds = ds
+ };
+ dsl_deadlist_iterate(&dd->dd_livelist, dsl_livelist_try_condense,
+ &arg);
+}
+
+void
+dsl_dataset_sync_done(dsl_dataset_t *ds, dmu_tx_t *tx)
+{
+ objset_t *os = ds->ds_objset;
+
+ bplist_iterate(&ds->ds_pending_deadlist,
+ dsl_deadlist_insert_alloc_cb, &ds->ds_deadlist, tx);
+
+ if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) {
+ dsl_flush_pending_livelist(ds, tx);
+ if (dsl_livelist_should_disable(ds)) {
+ dsl_dir_remove_livelist(ds->ds_dir, tx, B_TRUE);
+ }
+ }
+
+ dsl_bookmark_sync_done(ds, tx);
+
+ if (os->os_synced_dnodes != NULL) {
+ multilist_destroy(os->os_synced_dnodes);
+ os->os_synced_dnodes = NULL;
+ }
+
+ if (os->os_encrypted)
+ os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_FALSE;
+ else
+ ASSERT0(os->os_next_write_raw[tx->tx_txg & TXG_MASK]);
+
+ ASSERT(!dmu_objset_is_dirty(os, dmu_tx_get_txg(tx)));
+
+ dmu_buf_rele(ds->ds_dbuf, ds);
+}
+
+int
+get_clones_stat_impl(dsl_dataset_t *ds, nvlist_t *val)
+{
+ uint64_t count = 0;
+ objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset;
+ zap_cursor_t zc;
+ zap_attribute_t za;
+
+ ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool));
+
+ /*
+ * There may be missing entries in ds_next_clones_obj
+ * due to a bug in a previous version of the code.
+ * Only trust it if it has the right number of entries.
+ */
+ if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) {
+ VERIFY0(zap_count(mos, dsl_dataset_phys(ds)->ds_next_clones_obj,
+ &count));
+ }
+ if (count != dsl_dataset_phys(ds)->ds_num_children - 1) {
+ return (SET_ERROR(ENOENT));
+ }
+ for (zap_cursor_init(&zc, mos,
+ dsl_dataset_phys(ds)->ds_next_clones_obj);
+ zap_cursor_retrieve(&zc, &za) == 0;
+ zap_cursor_advance(&zc)) {
+ dsl_dataset_t *clone;
+ char buf[ZFS_MAX_DATASET_NAME_LEN];
+ VERIFY0(dsl_dataset_hold_obj(ds->ds_dir->dd_pool,
+ za.za_first_integer, FTAG, &clone));
+ dsl_dir_name(clone->ds_dir, buf);
+ fnvlist_add_boolean(val, buf);
+ dsl_dataset_rele(clone, FTAG);
+ }
+ zap_cursor_fini(&zc);
+ return (0);
+}
+
+void
+get_clones_stat(dsl_dataset_t *ds, nvlist_t *nv)
+{
+ nvlist_t *propval = fnvlist_alloc();
+ nvlist_t *val;
+
+ /*
+ * We use nvlist_alloc() instead of fnvlist_alloc() because the
+ * latter would allocate the list with NV_UNIQUE_NAME flag.
+ * As a result, every time a clone name is appended to the list
+ * it would be (linearly) searched for a duplicate name.
+ * We already know that all clone names must be unique and we
+ * want to avoid the quadratic complexity of double-checking that
+ * because we can have a large number of clones.
+ */
+ VERIFY0(nvlist_alloc(&val, 0, KM_SLEEP));
+
+ if (get_clones_stat_impl(ds, val) == 0) {
+ fnvlist_add_nvlist(propval, ZPROP_VALUE, val);
+ fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_CLONES),
+ propval);
+ }
+
+ nvlist_free(val);
+ nvlist_free(propval);
+}
+
+/*
+ * Returns a string that represents the receive resume stats token. It should
+ * be freed with strfree().
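+ *
+ * The token has the form
+ * <version>-<fletcher4 word>-<packed size>-<hex payload>, where the
+ * payload is the packed nvlist of resume fields, gzip-compressed
+ * and rendered as a hex string (see the kmem_asprintf() below).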
+ */ +char * +get_receive_resume_stats_impl(dsl_dataset_t *ds) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + if (dsl_dataset_has_resume_receive_state(ds)) { + char *str; + void *packed; + uint8_t *compressed; + uint64_t val; + nvlist_t *token_nv = fnvlist_alloc(); + size_t packed_size, compressed_size; + + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_FROMGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "fromguid", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OBJECT, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "object", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_OFFSET, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "offset", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_BYTES, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "bytes", val); + } + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TOGUID, sizeof (val), 1, &val) == 0) { + fnvlist_add_uint64(token_nv, "toguid", val); + } + char buf[MAXNAMELEN]; + if (zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_TONAME, 1, sizeof (buf), buf) == 0) { + fnvlist_add_string(token_nv, "toname", buf); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_LARGEBLOCK) == 0) { + fnvlist_add_boolean(token_nv, "largeblockok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_EMBEDOK) == 0) { + fnvlist_add_boolean(token_nv, "embedok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_COMPRESSOK) == 0) { + fnvlist_add_boolean(token_nv, "compressok"); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_RAWOK) == 0) { + fnvlist_add_boolean(token_nv, "rawok"); + } + if (dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)) { + uint64_t num_redact_snaps; + uint64_t *redact_snaps; + VERIFY(dsl_dataset_get_uint64_array_feature(ds, + SPA_FEATURE_REDACTED_DATASETS, &num_redact_snaps, + &redact_snaps)); + fnvlist_add_uint64_array(token_nv, "redact_snaps", + redact_snaps, num_redact_snaps); + } + if (zap_contains(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS) == 0) { + uint64_t num_redact_snaps, int_size; + uint64_t *redact_snaps; + VERIFY0(zap_length(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, &int_size, + &num_redact_snaps)); + ASSERT3U(int_size, ==, sizeof (uint64_t)); + + redact_snaps = kmem_alloc(int_size * num_redact_snaps, + KM_SLEEP); + VERIFY0(zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_RESUME_REDACT_BOOKMARK_SNAPS, int_size, + num_redact_snaps, redact_snaps)); + fnvlist_add_uint64_array(token_nv, "book_redact_snaps", + redact_snaps, num_redact_snaps); + kmem_free(redact_snaps, int_size * num_redact_snaps); + } + packed = fnvlist_pack(token_nv, &packed_size); + fnvlist_free(token_nv); + compressed = kmem_alloc(packed_size, KM_SLEEP); + + compressed_size = gzip_compress(packed, compressed, + packed_size, packed_size, 6); + + zio_cksum_t cksum; + fletcher_4_native_varsize(compressed, compressed_size, &cksum); + + size_t alloc_size = compressed_size * 2 + 1; + str = kmem_alloc(alloc_size, KM_SLEEP); + for (int i = 0; i < compressed_size; i++) { + size_t offset = i * 2; + (void) snprintf(str + offset, alloc_size - offset, + "%02x", compressed[i]); + } + str[compressed_size * 2] = '\0'; + char *propval = kmem_asprintf("%u-%llx-%llx-%s", + 
ZFS_SEND_RESUME_TOKEN_VERSION, + (longlong_t)cksum.zc_word[0], + (longlong_t)packed_size, str); + kmem_free(packed, packed_size); + kmem_free(str, alloc_size); + kmem_free(compressed, packed_size); + return (propval); + } + return (kmem_strdup("")); +} + +/* + * Returns a string that represents the receive resume stats token of the + * dataset's child. It should be freed with strfree(). + */ +char * +get_child_receive_stats(dsl_dataset_t *ds) +{ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + if (strlcat(recvname, "/", sizeof (recvname)) < + sizeof (recvname) && + strlcat(recvname, recv_clone_name, sizeof (recvname)) < + sizeof (recvname) && + dsl_dataset_hold(ds->ds_dir->dd_pool, recvname, FTAG, + &recv_ds) == 0) { + char *propval = get_receive_resume_stats_impl(recv_ds); + dsl_dataset_rele(recv_ds, FTAG); + return (propval); + } + return (kmem_strdup("")); +} + +static void +get_receive_resume_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + char *propval = get_receive_resume_stats_impl(ds); + if (strcmp(propval, "") != 0) { + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, propval); + } else { + char *childval = get_child_receive_stats(ds); + if (strcmp(childval, "") != 0) { + dsl_prop_nvlist_add_string(nv, + ZFS_PROP_RECEIVE_RESUME_TOKEN, childval); + } + kmem_strfree(childval); + } + kmem_strfree(propval); +} + +uint64_t +dsl_get_refratio(dsl_dataset_t *ds) +{ + uint64_t ratio = dsl_dataset_phys(ds)->ds_compressed_bytes == 0 ? 100 : + (dsl_dataset_phys(ds)->ds_uncompressed_bytes * 100 / + dsl_dataset_phys(ds)->ds_compressed_bytes); + return (ratio); +} + +uint64_t +dsl_get_logicalreferenced(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_uncompressed_bytes); +} + +uint64_t +dsl_get_compressratio(dsl_dataset_t *ds) +{ + if (ds->ds_is_snapshot) { + return (dsl_get_refratio(ds)); + } else { + dsl_dir_t *dd = ds->ds_dir; + mutex_enter(&dd->dd_lock); + uint64_t val = dsl_dir_get_compressratio(dd); + mutex_exit(&dd->dd_lock); + return (val); + } +} + +uint64_t +dsl_get_used(dsl_dataset_t *ds) +{ + if (ds->ds_is_snapshot) { + return (dsl_dataset_phys(ds)->ds_unique_bytes); + } else { + dsl_dir_t *dd = ds->ds_dir; + mutex_enter(&dd->dd_lock); + uint64_t val = dsl_dir_get_used(dd); + mutex_exit(&dd->dd_lock); + return (val); + } +} + +uint64_t +dsl_get_creation(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_creation_time); +} + +uint64_t +dsl_get_creationtxg(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_creation_txg); +} + +uint64_t +dsl_get_refquota(dsl_dataset_t *ds) +{ + return (ds->ds_quota); +} + +uint64_t +dsl_get_refreservation(dsl_dataset_t *ds) +{ + return (ds->ds_reserved); +} + +uint64_t +dsl_get_guid(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_guid); +} + +uint64_t +dsl_get_unique(dsl_dataset_t *ds) +{ + return (dsl_dataset_phys(ds)->ds_unique_bytes); +} + +uint64_t +dsl_get_objsetid(dsl_dataset_t *ds) +{ + return (ds->ds_object); +} + +uint64_t +dsl_get_userrefs(dsl_dataset_t *ds) +{ + return (ds->ds_userrefs); +} + +uint64_t +dsl_get_defer_destroy(dsl_dataset_t *ds) +{ + return (DS_IS_DEFER_DESTROY(ds) ? 
1 : 0);
+}
+
+uint64_t
+dsl_get_referenced(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_phys(ds)->ds_referenced_bytes);
+}
+
+uint64_t
+dsl_get_numclones(dsl_dataset_t *ds)
+{
+ ASSERT(ds->ds_is_snapshot);
+ return (dsl_dataset_phys(ds)->ds_num_children - 1);
+}
+
+uint64_t
+dsl_get_inconsistent(dsl_dataset_t *ds)
+{
+ return ((dsl_dataset_phys(ds)->ds_flags & DS_FLAG_INCONSISTENT) ?
+ 1 : 0);
+}
+
+uint64_t
+dsl_get_redacted(dsl_dataset_t *ds)
+{
+ return (dsl_dataset_feature_is_active(ds,
+ SPA_FEATURE_REDACTED_DATASETS));
+}
+
+uint64_t
+dsl_get_available(dsl_dataset_t *ds)
+{
+ uint64_t refdbytes = dsl_get_referenced(ds);
+ uint64_t availbytes = dsl_dir_space_available(ds->ds_dir,
+ NULL, 0, TRUE);
+ if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) {
+ availbytes +=
+ ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes;
+ }
+ if (ds->ds_quota != 0) {
+ /*
+ * Adjust available bytes according to refquota
+ */
+ if (refdbytes < ds->ds_quota) {
+ availbytes = MIN(availbytes,
+ ds->ds_quota - refdbytes);
+ } else {
+ availbytes = 0;
+ }
+ }
+ return (availbytes);
+}
+
+int
+dsl_get_written(dsl_dataset_t *ds, uint64_t *written)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ dsl_dataset_t *prev;
+ int err = dsl_dataset_hold_obj(dp,
+ dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev);
+ if (err == 0) {
+ uint64_t comp, uncomp;
+ err = dsl_dataset_space_written(prev, ds, written,
+ &comp, &uncomp);
+ dsl_dataset_rele(prev, FTAG);
+ }
+ return (err);
+}
+
+/*
+ * 'snap' should be a buffer of size ZFS_MAX_DATASET_NAME_LEN.
+ */
+int
+dsl_get_prev_snap(dsl_dataset_t *ds, char *snap)
+{
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+ if (ds->ds_prev != NULL && ds->ds_prev != dp->dp_origin_snap) {
+ dsl_dataset_name(ds->ds_prev, snap);
+ return (0);
+ } else {
+ return (SET_ERROR(ENOENT));
+ }
+}
+
+void
+dsl_get_redact_snaps(dsl_dataset_t *ds, nvlist_t *propval)
+{
+ uint64_t nsnaps;
+ uint64_t *snaps;
+ if (dsl_dataset_get_uint64_array_feature(ds,
+ SPA_FEATURE_REDACTED_DATASETS, &nsnaps, &snaps)) {
+ fnvlist_add_uint64_array(propval, ZPROP_VALUE, snaps,
+ nsnaps);
+ }
+}
+
+/*
+ * Returns the mountpoint property and source for the given dataset in the
+ * value and source buffers. The value buffer must be at least as large as
+ * MAXPATHLEN and the source buffer at least as large as
+ * ZFS_MAX_DATASET_NAME_LEN.
+ * Returns 0 on success and an error on failure.
+ */
+int
+dsl_get_mountpoint(dsl_dataset_t *ds, const char *dsname, char *value,
+ char *source)
+{
+ int error;
+ dsl_pool_t *dp = ds->ds_dir->dd_pool;
+
+ /* Retrieve the mountpoint value stored in the zap object */
+ error = dsl_prop_get_ds(ds, zfs_prop_to_name(ZFS_PROP_MOUNTPOINT), 1,
+ ZAP_MAXVALUELEN, value, source);
+ if (error != 0) {
+ return (error);
+ }
+
+ /*
+ * Process the dsname and source to find the full mountpoint string.
+ * Can be skipped for 'legacy' or 'none'.
+ */
+ if (value[0] == '/') {
+ char *buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+ char *root = buf;
+ const char *relpath;
+
+ /*
+ * If we inherit the mountpoint, even from a dataset
+ * with a received value, the source will be the path of
+ * the dataset we inherit from. If source is
+ * ZPROP_SOURCE_VAL_RECVD, the received value is not
+ * inherited.
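+ *
+ * For example (hypothetical names): if pool/home/user inherits
+ * mountpoint=/export from pool/home, source is "pool/home", so
+ * relpath becomes "user" and the result below is
+ * "<altroot>/export/user".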
+ */ + if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { + relpath = ""; + } else { + ASSERT0(strncmp(dsname, source, strlen(source))); + relpath = dsname + strlen(source); + if (relpath[0] == '/') + relpath++; + } + + spa_altroot(dp->dp_spa, root, ZAP_MAXVALUELEN); + + /* + * Special case an alternate root of '/'. This will + * avoid having multiple leading slashes in the + * mountpoint path. + */ + if (strcmp(root, "/") == 0) + root++; + + /* + * If the mountpoint is '/' then skip over this + * if we are obtaining either an alternate root or + * an inherited mountpoint. + */ + char *mnt = value; + if (value[1] == '\0' && (root[0] != '\0' || + relpath[0] != '\0')) + mnt = value + 1; + + if (relpath[0] == '\0') { + (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s", + root, mnt); + } else { + (void) snprintf(value, ZAP_MAXVALUELEN, "%s%s%s%s", + root, mnt, relpath[0] == '@' ? "" : "/", + relpath); + } + kmem_free(buf, ZAP_MAXVALUELEN); + } + + return (0); +} + +void +dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRATIO, + dsl_get_refratio(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALREFERENCED, + dsl_get_logicalreferenced(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_COMPRESSRATIO, + dsl_get_compressratio(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USED, + dsl_get_used(ds)); + + if (ds->ds_is_snapshot) { + get_clones_stat(ds, nv); + } else { + char buf[ZFS_MAX_DATASET_NAME_LEN]; + if (dsl_get_prev_snap(ds, buf) == 0) + dsl_prop_nvlist_add_string(nv, ZFS_PROP_PREV_SNAP, + buf); + dsl_dir_stats(ds->ds_dir, nv); + } + + nvlist_t *propval = fnvlist_alloc(); + dsl_get_redact_snaps(ds, propval); + fnvlist_add_nvlist(nv, zfs_prop_to_name(ZFS_PROP_REDACT_SNAPS), + propval); + nvlist_free(propval); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_AVAILABLE, + dsl_get_available(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFERENCED, + dsl_get_referenced(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATION, + dsl_get_creation(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_CREATETXG, + dsl_get_creationtxg(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFQUOTA, + dsl_get_refquota(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_REFRESERVATION, + dsl_get_refreservation(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, + dsl_get_guid(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, + dsl_get_unique(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, + dsl_get_objsetid(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, + dsl_get_userrefs(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + dsl_get_defer_destroy(ds)); + dsl_dataset_crypt_stats(ds, nv); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + uint64_t written; + if (dsl_get_written(ds, &written) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_WRITTEN, + written); + } + } + + if (!dsl_dataset_is_snapshot(ds)) { + /* + * A failed "newfs" (e.g. full) resumable receive leaves + * the stats set on this dataset. Check here for the prop. + */ + get_receive_resume_stats(ds, nv); + + /* + * A failed incremental resumable receive leaves the + * stats set on our child named "%recv". Check the child + * for the prop. 
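+ *
+ * For example (hypothetical name): an interrupted resumable
+ * receive into tank/fs leaves a hidden tank/fs/%recv clone, and
+ * it is that child's token that gets reported on tank/fs here.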
+ */ + /* 6 extra bytes for /%recv */ + char recvname[ZFS_MAX_DATASET_NAME_LEN + 6]; + dsl_dataset_t *recv_ds; + dsl_dataset_name(ds, recvname); + if (strlcat(recvname, "/", sizeof (recvname)) < + sizeof (recvname) && + strlcat(recvname, recv_clone_name, sizeof (recvname)) < + sizeof (recvname) && + dsl_dataset_hold(dp, recvname, FTAG, &recv_ds) == 0) { + get_receive_resume_stats(recv_ds, nv); + dsl_dataset_rele(recv_ds, FTAG); + } + } +} + +void +dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) +{ + dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; + ASSERT(dsl_pool_config_held(dp)); + + stat->dds_creation_txg = dsl_get_creationtxg(ds); + stat->dds_inconsistent = dsl_get_inconsistent(ds); + stat->dds_guid = dsl_get_guid(ds); + stat->dds_redacted = dsl_get_redacted(ds); + stat->dds_origin[0] = '\0'; + if (ds->ds_is_snapshot) { + stat->dds_is_snapshot = B_TRUE; + stat->dds_num_clones = dsl_get_numclones(ds); + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; + + if (dsl_dir_is_clone(ds->ds_dir)) { + dsl_dir_get_origin(ds->ds_dir, stat->dds_origin); + } + } +} + +uint64_t +dsl_dataset_fsid_guid(dsl_dataset_t *ds) +{ + return (ds->ds_fsid_guid); +} + +void +dsl_dataset_space(dsl_dataset_t *ds, + uint64_t *refdbytesp, uint64_t *availbytesp, + uint64_t *usedobjsp, uint64_t *availobjsp) +{ + *refdbytesp = dsl_dataset_phys(ds)->ds_referenced_bytes; + *availbytesp = dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE); + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) + *availbytesp += + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes; + if (ds->ds_quota != 0) { + /* + * Adjust available bytes according to refquota + */ + if (*refdbytesp < ds->ds_quota) + *availbytesp = MIN(*availbytesp, + ds->ds_quota - *refdbytesp); + else + *availbytesp = 0; + } + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + *usedobjsp = BP_GET_FILL(&dsl_dataset_phys(ds)->ds_bp); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + *availobjsp = DN_MAX_OBJECT - *usedobjsp; +} + +boolean_t +dsl_dataset_modified_since_snap(dsl_dataset_t *ds, dsl_dataset_t *snap) +{ + dsl_pool_t *dp __maybe_unused = ds->ds_dir->dd_pool; + uint64_t birth; + + ASSERT(dsl_pool_config_held(dp)); + if (snap == NULL) + return (B_FALSE); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + birth = dsl_dataset_get_blkptr(ds)->blk_birth; + rrw_exit(&ds->ds_bp_rwlock, FTAG); + if (birth > dsl_dataset_phys(snap)->ds_creation_txg) { + objset_t *os, *os_snap; + /* + * It may be that only the ZIL differs, because it was + * reset in the head. Don't count that as being + * modified. + */ + if (dmu_objset_from_ds(ds, &os) != 0) + return (B_TRUE); + if (dmu_objset_from_ds(snap, &os_snap) != 0) + return (B_TRUE); + return (bcmp(&os->os_phys->os_meta_dnode, + &os_snap->os_phys->os_meta_dnode, + sizeof (os->os_phys->os_meta_dnode)) != 0); + } + return (B_FALSE); +} + +typedef struct dsl_dataset_rename_snapshot_arg { + const char *ddrsa_fsname; + const char *ddrsa_oldsnapname; + const char *ddrsa_newsnapname; + boolean_t ddrsa_recursive; + dmu_tx_t *ddrsa_tx; +} dsl_dataset_rename_snapshot_arg_t; + +/* ARGSUSED */ +static int +dsl_dataset_rename_snapshot_check_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + int error; + uint64_t val; + + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + if (error != 0) { + /* ignore nonexistent snapshots */ + return (error == ENOENT ? 
0 : error); + } + + /* new name should not exist */ + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_newsnapname, &val); + if (error == 0) + error = SET_ERROR(EEXIST); + else if (error == ENOENT) + error = 0; + + /* dataset name + 1 for the "@" + the new snapshot name must fit */ + if (dsl_dir_namelen(hds->ds_dir) + 1 + + strlen(ddrsa->ddrsa_newsnapname) >= ZFS_MAX_DATASET_NAME_LEN) + error = SET_ERROR(ENAMETOOLONG); + + return (error); +} + +static int +dsl_dataset_rename_snapshot_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + int error; + + error = dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds); + if (error != 0) + return (error); + + if (ddrsa->ddrsa_recursive) { + error = dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_check_impl, ddrsa, + DS_FIND_CHILDREN); + } else { + error = dsl_dataset_rename_snapshot_check_impl(dp, hds, ddrsa); + } + dsl_dataset_rele(hds, FTAG); + return (error); +} + +static int +dsl_dataset_rename_snapshot_sync_impl(dsl_pool_t *dp, + dsl_dataset_t *hds, void *arg) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_dataset_t *ds; + uint64_t val; + dmu_tx_t *tx = ddrsa->ddrsa_tx; + int error; + + error = dsl_dataset_snap_lookup(hds, ddrsa->ddrsa_oldsnapname, &val); + ASSERT(error == 0 || error == ENOENT); + if (error == ENOENT) { + /* ignore nonexistent snapshots */ + return (0); + } + + VERIFY0(dsl_dataset_hold_obj(dp, val, FTAG, &ds)); + + /* log before we change the name */ + spa_history_log_internal_ds(ds, "rename", tx, + "-> @%s", ddrsa->ddrsa_newsnapname); + + VERIFY0(dsl_dataset_snap_remove(hds, ddrsa->ddrsa_oldsnapname, tx, + B_FALSE)); + mutex_enter(&ds->ds_lock); + (void) strlcpy(ds->ds_snapname, ddrsa->ddrsa_newsnapname, + sizeof (ds->ds_snapname)); + mutex_exit(&ds->ds_lock); + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, + ds->ds_snapname, 8, 1, &ds->ds_object, tx)); + zvol_rename_minors(dp->dp_spa, ddrsa->ddrsa_oldsnapname, + ddrsa->ddrsa_newsnapname, B_TRUE); + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_rename_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rename_snapshot_arg_t *ddrsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds = NULL; + + VERIFY0(dsl_dataset_hold(dp, ddrsa->ddrsa_fsname, FTAG, &hds)); + ddrsa->ddrsa_tx = tx; + if (ddrsa->ddrsa_recursive) { + VERIFY0(dmu_objset_find_dp(dp, hds->ds_dir->dd_object, + dsl_dataset_rename_snapshot_sync_impl, ddrsa, + DS_FIND_CHILDREN)); + } else { + VERIFY0(dsl_dataset_rename_snapshot_sync_impl(dp, hds, ddrsa)); + } + dsl_dataset_rele(hds, FTAG); +} + +int +dsl_dataset_rename_snapshot(const char *fsname, + const char *oldsnapname, const char *newsnapname, boolean_t recursive) +{ + dsl_dataset_rename_snapshot_arg_t ddrsa; + + ddrsa.ddrsa_fsname = fsname; + ddrsa.ddrsa_oldsnapname = oldsnapname; + ddrsa.ddrsa_newsnapname = newsnapname; + ddrsa.ddrsa_recursive = recursive; + + return (dsl_sync_task(fsname, dsl_dataset_rename_snapshot_check, + dsl_dataset_rename_snapshot_sync, &ddrsa, + 1, ZFS_SPACE_CHECK_RESERVED)); +} + +/* + * If we're doing an ownership handoff, we need to make sure that there is + * only one long hold on the dataset. We're not allowed to change anything here + * so we don't permanently release the long hold or regular hold here. We want + * to do this only when syncing to avoid the dataset unexpectedly going away + * when we release the long hold. 
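+ *
+ * For example: with an owner passed in (one long hold) and no
+ * activity waiters, holds = 1 - 1 = 0 == dd_activity_waiters, and
+ * the handoff may proceed; any other long hold makes the counts
+ * differ and the check below returns EBUSY.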
+ */
+static int
+dsl_dataset_handoff_check(dsl_dataset_t *ds, void *owner, dmu_tx_t *tx)
+{
+ boolean_t held = B_FALSE;
+
+ if (!dmu_tx_is_syncing(tx))
+ return (0);
+
+ dsl_dir_t *dd = ds->ds_dir;
+ mutex_enter(&dd->dd_activity_lock);
+ uint64_t holds = zfs_refcount_count(&ds->ds_longholds) -
+ (owner != NULL ? 1 : 0);
+ /*
+ * The value of dd_activity_waiters can change as soon as we drop the
+ * lock, but we're fine with that; new waiters coming in or old
+ * waiters leaving doesn't cause problems, since we're going to cancel
+ * waiters later anyway. The goal of this check is to verify that no
+ * non-waiters have long-holds, and all new long-holds will be
+ * prevented because we're holding the pool config as writer.
+ */
+ if (holds != dd->dd_activity_waiters)
+ held = B_TRUE;
+ mutex_exit(&dd->dd_activity_lock);
+
+ if (held)
+ return (SET_ERROR(EBUSY));
+
+ return (0);
+}
+
+int
+dsl_dataset_rollback_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_rollback_arg_t *ddra = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *ds;
+ int64_t unused_refres_delta;
+ int error;
+
+ error = dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds);
+ if (error != 0)
+ return (error);
+
+ /* must not be a snapshot */
+ if (ds->ds_is_snapshot) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EINVAL));
+ }
+
+ /* must have a most recent snapshot */
+ if (dsl_dataset_phys(ds)->ds_prev_snap_txg < TXG_INITIAL) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(ESRCH));
+ }
+
+ /*
+ * No rollback to a snapshot created in the current txg, because
+ * the rollback may dirty the dataset and create blocks that are
+ * not reachable from the rootbp while having a birth txg that
+ * falls into the snapshot's range.
+ */
+ if (dmu_tx_is_syncing(tx) &&
+ dsl_dataset_phys(ds)->ds_prev_snap_txg >= tx->tx_txg) {
+ dsl_dataset_rele(ds, FTAG);
+ return (SET_ERROR(EAGAIN));
+ }
+
+ /*
+ * If the expected target snapshot is specified, then check that
+ * the latest snapshot is it.
+ */
+ if (ddra->ddra_tosnap != NULL) {
+ dsl_dataset_t *snapds;
+
+ /* Check if the target snapshot exists at all. */
+ error = dsl_dataset_hold(dp, ddra->ddra_tosnap, FTAG, &snapds);
+ if (error != 0) {
+ /*
+ * ESRCH is used to signal that the target snapshot does
+ * not exist, while ENOENT is used to report that
+ * the rolled back dataset does not exist.
+ * ESRCH is also used to cover other cases where the
+ * target snapshot is not related to the dataset being
+ * rolled back such as being in a different pool.
+ */
+ if (error == ENOENT || error == EXDEV)
+ error = SET_ERROR(ESRCH);
+ dsl_dataset_rele(ds, FTAG);
+ return (error);
+ }
+ ASSERT(snapds->ds_is_snapshot);
+
+ /* Check if the snapshot is the latest snapshot indeed. */
+ if (snapds != ds->ds_prev) {
+ /*
+ * Distinguish between the case where the only problem
+ * is intervening snapshots (EEXIST) vs the snapshot
+ * not being a valid target for rollback (ESRCH).
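+ *
+ * For example (hypothetical names): rolling tank/fs back to
+ * tank/fs@old while tank/fs@new exists returns EEXIST (the
+ * intervening snapshot must be destroyed first), whereas
+ * naming a snapshot of an unrelated dataset returns ESRCH.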
+ */ + if (snapds->ds_dir == ds->ds_dir || + (dsl_dir_is_clone(ds->ds_dir) && + dsl_dir_phys(ds->ds_dir)->dd_origin_obj == + snapds->ds_object)) { + error = SET_ERROR(EEXIST); + } else { + error = SET_ERROR(ESRCH); + } + dsl_dataset_rele(snapds, FTAG); + dsl_dataset_rele(ds, FTAG); + return (error); + } + dsl_dataset_rele(snapds, FTAG); + } + + /* must not have any bookmarks after the most recent snapshot */ + if (dsl_bookmark_latest_txg(ds) > + dsl_dataset_phys(ds)->ds_prev_snap_txg) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EEXIST)); + } + + error = dsl_dataset_handoff_check(ds, ddra->ddra_owner, tx); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + /* + * Check if the snap we are rolling back to uses more than + * the refquota. + */ + if (ds->ds_quota != 0 && + dsl_dataset_phys(ds->ds_prev)->ds_referenced_bytes > ds->ds_quota) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EDQUOT)); + } + + /* + * When we do the clone swap, we will temporarily use more space + * due to the refreservation (the head will no longer have any + * unique space, so the entire amount of the refreservation will need + * to be free). We will immediately destroy the clone, freeing + * this space, but the freeing happens over many txg's. + */ + unused_refres_delta = (int64_t)MIN(ds->ds_reserved, + dsl_dataset_phys(ds)->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_dataset_rollback_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_rollback_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds, *clone; + uint64_t cloneobj; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY0(dsl_dataset_hold(dp, ddra->ddra_fsname, FTAG, &ds)); + + dsl_dataset_name(ds->ds_prev, namebuf); + fnvlist_add_string(ddra->ddra_result, "target", namebuf); + + cloneobj = dsl_dataset_create_sync(ds->ds_dir, "%rollback", + ds->ds_prev, DS_CREATE_FLAG_NODIRTY, kcred, NULL, tx); + + VERIFY0(dsl_dataset_hold_obj(dp, cloneobj, FTAG, &clone)); + + dsl_dataset_clone_swap_sync_impl(clone, ds, tx); + dsl_dataset_zero_zil(ds, tx); + + dsl_destroy_head_sync_impl(clone, tx); + + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(ds, FTAG); +} + +/* + * Rolls back the given filesystem or volume to the most recent snapshot. + * The name of the most recent snapshot will be returned under key "target" + * in the result nvlist. + * + * If owner != NULL: + * - The existing dataset MUST be owned by the specified owner at entry + * - Upon return, dataset will still be held by the same owner, whether we + * succeed or not. + * + * This mode is required any time the existing filesystem is mounted. See + * notes above zfs_suspend_fs() for further details. 
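+ *
+ * For example (hypothetical names), an unmounted filesystem can be
+ * rolled back to its most recent snapshot (here assumed to be
+ * @yesterday) with:
+ *
+ *	nvlist_t *result = fnvlist_alloc();
+ *	error = dsl_dataset_rollback("tank/fs", "tank/fs@yesterday",
+ *	    NULL, result);
+ *	fnvlist_free(result);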
+ */
+int
+dsl_dataset_rollback(const char *fsname, const char *tosnap, void *owner,
+ nvlist_t *result)
+{
+ dsl_dataset_rollback_arg_t ddra;
+
+ ddra.ddra_fsname = fsname;
+ ddra.ddra_tosnap = tosnap;
+ ddra.ddra_owner = owner;
+ ddra.ddra_result = result;
+
+ return (dsl_sync_task(fsname, dsl_dataset_rollback_check,
+ dsl_dataset_rollback_sync, &ddra,
+ 1, ZFS_SPACE_CHECK_RESERVED));
+}
+
+struct promotenode {
+ list_node_t link;
+ dsl_dataset_t *ds;
+};
+
+static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep);
+static int promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp,
+ void *tag);
+static void promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag);
+
+int
+dsl_dataset_promote_check(void *arg, dmu_tx_t *tx)
+{
+ dsl_dataset_promote_arg_t *ddpa = arg;
+ dsl_pool_t *dp = dmu_tx_pool(tx);
+ dsl_dataset_t *hds;
+ struct promotenode *snap;
+ dsl_dataset_t *origin_ds, *origin_head;
+ int err;
+ uint64_t unused;
+ uint64_t ss_mv_cnt;
+ size_t max_snap_len;
+ boolean_t conflicting_snaps;
+
+ err = promote_hold(ddpa, dp, FTAG);
+ if (err != 0)
+ return (err);
+
+ hds = ddpa->ddpa_clone;
+ max_snap_len = MAXNAMELEN - strlen(ddpa->ddpa_clonename) - 1;
+
+ if (dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE) {
+ promote_rele(ddpa, FTAG);
+ return (SET_ERROR(EXDEV));
+ }
+
+ snap = list_head(&ddpa->shared_snaps);
+ if (snap == NULL) {
+ err = SET_ERROR(ENOENT);
+ goto out;
+ }
+ origin_head = snap->ds;
+ origin_ds = snap->ds;
+
+ /*
+ * Encrypted clones share a DSL Crypto Key with their origin's dsl dir.
+ * When doing a promote we must make sure the encryption root for
+ * both the target and the target's origin does not change to avoid
+ * needing to rewrap encryption keys.
+ */
+ err = dsl_dataset_promote_crypt_check(hds->ds_dir, origin_ds->ds_dir);
+ if (err != 0)
+ goto out;
+
+ /*
+ * Compute and check the amount of space to transfer. Since this is
+ * so expensive, don't do the preliminary check.
+ */
+ if (!dmu_tx_is_syncing(tx)) {
+ promote_rele(ddpa, FTAG);
+ return (0);
+ }
+
+ /* compute origin's new unique space */
+ snap = list_tail(&ddpa->clone_snaps);
+ ASSERT(snap != NULL);
+ ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==,
+ origin_ds->ds_object);
+ dsl_deadlist_space_range(&snap->ds->ds_deadlist,
+ dsl_dataset_phys(origin_ds)->ds_prev_snap_txg, UINT64_MAX,
+ &ddpa->unique, &unused, &unused);
+
+ /*
+ * Walk the snapshots that we are moving
+ *
+ * Compute space to transfer. Consider the incremental changes
+ * to used by each snapshot:
+ * (my used) = (prev's used) + (blocks born) - (blocks killed)
+ * So each snapshot gave birth to:
+ * (blocks born) = (my used) - (prev's used) + (blocks killed)
+ * So a sequence would look like:
+ * (uN - u(N-1) + kN) + ... + (u1 - u0 + k1) + (u0 - 0 + k0)
+ * Which simplifies to:
+ * uN + kN + kN-1 + ... + k1 + k0
+ * Note however, if we stop before we reach the ORIGIN we get:
+ * uN + kN + kN-1 + ... + kM - uM-1
+ */
+ conflicting_snaps = B_FALSE;
+ ss_mv_cnt = 0;
+ ddpa->used = dsl_dataset_phys(origin_ds)->ds_referenced_bytes;
+ ddpa->comp = dsl_dataset_phys(origin_ds)->ds_compressed_bytes;
+ ddpa->uncomp = dsl_dataset_phys(origin_ds)->ds_uncompressed_bytes;
+ for (snap = list_head(&ddpa->shared_snaps); snap;
+ snap = list_next(&ddpa->shared_snaps, snap)) {
+ uint64_t val, dlused, dlcomp, dluncomp;
+ dsl_dataset_t *ds = snap->ds;
+
+ ss_mv_cnt++;
+
+ /*
+ * If there are long holds, we won't be able to evict
+ * the objset.
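+ * Eviction matters because the promote sync must evict each
+ * moved snapshot's objset so that its property callbacks are
+ * unregistered from the old dsl_dir (see
+ * dsl_dataset_promote_sync() below).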
+ */ + if (dsl_dataset_long_held(ds)) { + err = SET_ERROR(EBUSY); + goto out; + } + + /* Check that the snapshot name does not conflict */ + VERIFY0(dsl_dataset_get_snapname(ds)); + if (strlen(ds->ds_snapname) >= max_snap_len) { + err = SET_ERROR(ENAMETOOLONG); + goto out; + } + err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); + if (err == 0) { + fnvlist_add_boolean(ddpa->err_ds, + snap->ds->ds_snapname); + conflicting_snaps = B_TRUE; + } else if (err != ENOENT) { + goto out; + } + + /* The very first snapshot does not have a deadlist */ + if (dsl_dataset_phys(ds)->ds_prev_snap_obj == 0) + continue; + + dsl_deadlist_space(&ds->ds_deadlist, + &dlused, &dlcomp, &dluncomp); + ddpa->used += dlused; + ddpa->comp += dlcomp; + ddpa->uncomp += dluncomp; + } + + /* + * Check that bookmarks that are being transferred don't have + * name conflicts. + */ + for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= + dsl_dataset_phys(origin_ds)->ds_creation_txg; + dbn = AVL_NEXT(&origin_head->ds_bookmarks, dbn)) { + if (strlen(dbn->dbn_name) >= max_snap_len) { + err = SET_ERROR(ENAMETOOLONG); + goto out; + } + zfs_bookmark_phys_t bm; + err = dsl_bookmark_lookup_impl(ddpa->ddpa_clone, + dbn->dbn_name, &bm); + + if (err == 0) { + fnvlist_add_boolean(ddpa->err_ds, dbn->dbn_name); + conflicting_snaps = B_TRUE; + } else if (err == ESRCH) { + err = 0; + } else if (err != 0) { + goto out; + } + } + + /* + * In order to return the full list of conflicting snapshots, we check + * whether there was a conflict after traversing all of them. + */ + if (conflicting_snaps) { + err = SET_ERROR(EEXIST); + goto out; + } + + /* + * If we are a clone of a clone then we never reached ORIGIN, + * so we need to subtract out the clone origin's used space. + */ + if (ddpa->origin_origin) { + ddpa->used -= + dsl_dataset_phys(ddpa->origin_origin)->ds_referenced_bytes; + ddpa->comp -= + dsl_dataset_phys(ddpa->origin_origin)->ds_compressed_bytes; + ddpa->uncomp -= + dsl_dataset_phys(ddpa->origin_origin)-> + ds_uncompressed_bytes; + } + + /* Check that there is enough space and limit headroom here */ + err = dsl_dir_transfer_possible(origin_ds->ds_dir, hds->ds_dir, + 0, ss_mv_cnt, ddpa->used, ddpa->cr, ddpa->proc); + if (err != 0) + goto out; + + /* + * Compute the amounts of space that will be used by snapshots + * after the promotion (for both origin and clone). For each, + * it is the amount of space that will be on all of their + * deadlists (that was not born before their new origin). + */ + if (dsl_dir_phys(hds->ds_dir)->dd_flags & DD_FLAG_USED_BREAKDOWN) { + uint64_t space; + + /* + * Note, typically this will not be a clone of a clone, + * so dd_origin_txg will be < TXG_INITIAL, so + * these snaplist_space() -> dsl_deadlist_space_range() + * calls will be fast because they do not have to + * iterate over all bps. 
+ */ + snap = list_head(&ddpa->origin_snaps); + if (snap == NULL) { + err = SET_ERROR(ENOENT); + goto out; + } + err = snaplist_space(&ddpa->shared_snaps, + snap->ds->ds_dir->dd_origin_txg, &ddpa->cloneusedsnap); + if (err != 0) + goto out; + + err = snaplist_space(&ddpa->clone_snaps, + snap->ds->ds_dir->dd_origin_txg, &space); + if (err != 0) + goto out; + ddpa->cloneusedsnap += space; + } + if (dsl_dir_phys(origin_ds->ds_dir)->dd_flags & + DD_FLAG_USED_BREAKDOWN) { + err = snaplist_space(&ddpa->origin_snaps, + dsl_dataset_phys(origin_ds)->ds_creation_txg, + &ddpa->originusedsnap); + if (err != 0) + goto out; + } + +out: + promote_rele(ddpa, FTAG); + return (err); +} + +void +dsl_dataset_promote_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_promote_arg_t *ddpa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *hds; + struct promotenode *snap; + dsl_dataset_t *origin_ds; + dsl_dataset_t *origin_head; + dsl_dir_t *dd; + dsl_dir_t *odd = NULL; + uint64_t oldnext_obj; + int64_t delta; + + ASSERT(nvlist_empty(ddpa->err_ds)); + + VERIFY0(promote_hold(ddpa, dp, FTAG)); + hds = ddpa->ddpa_clone; + + ASSERT0(dsl_dataset_phys(hds)->ds_flags & DS_FLAG_NOPROMOTE); + + snap = list_head(&ddpa->shared_snaps); + origin_ds = snap->ds; + dd = hds->ds_dir; + + snap = list_head(&ddpa->origin_snaps); + origin_head = snap->ds; + + /* + * We need to explicitly open odd, since origin_ds's dd will be + * changing. + */ + VERIFY0(dsl_dir_hold_obj(dp, origin_ds->ds_dir->dd_object, + NULL, FTAG, &odd)); + + dsl_dataset_promote_crypt_sync(hds->ds_dir, odd, tx); + + /* change origin's next snap */ + dmu_buf_will_dirty(origin_ds->ds_dbuf, tx); + oldnext_obj = dsl_dataset_phys(origin_ds)->ds_next_snap_obj; + snap = list_tail(&ddpa->clone_snaps); + ASSERT3U(dsl_dataset_phys(snap->ds)->ds_prev_snap_obj, ==, + origin_ds->ds_object); + dsl_dataset_phys(origin_ds)->ds_next_snap_obj = snap->ds->ds_object; + + /* change the origin's next clone */ + if (dsl_dataset_phys(origin_ds)->ds_next_clones_obj) { + dsl_dataset_remove_from_next_clones(origin_ds, + snap->ds->ds_object, tx); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dataset_phys(origin_ds)->ds_next_clones_obj, + oldnext_obj, tx)); + } + + /* change origin */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + ASSERT3U(dsl_dir_phys(dd)->dd_origin_obj, ==, origin_ds->ds_object); + dsl_dir_phys(dd)->dd_origin_obj = dsl_dir_phys(odd)->dd_origin_obj; + dd->dd_origin_txg = origin_head->ds_dir->dd_origin_txg; + dmu_buf_will_dirty(odd->dd_dbuf, tx); + dsl_dir_phys(odd)->dd_origin_obj = origin_ds->ds_object; + origin_head->ds_dir->dd_origin_txg = + dsl_dataset_phys(origin_ds)->ds_creation_txg; + + /* change dd_clone entries */ + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(odd)->dd_clones, hds->ds_object, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, + hds->ds_object, tx)); + + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(ddpa->origin_origin->ds_dir)->dd_clones, + origin_head->ds_object, tx)); + if (dsl_dir_phys(dd)->dd_clones == 0) { + dsl_dir_phys(dd)->dd_clones = + zap_create(dp->dp_meta_objset, DMU_OT_DSL_CLONES, + DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones, origin_head->ds_object, tx)); + } + + /* + * Move bookmarks to this dir. 
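+ * Only bookmarks created at or before the origin snapshot
+ * (zbm_creation_txg <= the origin's ds_creation_txg) move; newer
+ * bookmarks stay with origin_head.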
+ */ + dsl_bookmark_node_t *dbn_next; + for (dsl_bookmark_node_t *dbn = avl_first(&origin_head->ds_bookmarks); + dbn != NULL && dbn->dbn_phys.zbm_creation_txg <= + dsl_dataset_phys(origin_ds)->ds_creation_txg; + dbn = dbn_next) { + dbn_next = AVL_NEXT(&origin_head->ds_bookmarks, dbn); + + avl_remove(&origin_head->ds_bookmarks, dbn); + VERIFY0(zap_remove(dp->dp_meta_objset, + origin_head->ds_bookmarks_obj, dbn->dbn_name, tx)); + + dsl_bookmark_node_add(hds, dbn, tx); + } + + dsl_bookmark_next_changed(hds, origin_ds, tx); + + /* move snapshots to this dir */ + for (snap = list_head(&ddpa->shared_snaps); snap; + snap = list_next(&ddpa->shared_snaps, snap)) { + dsl_dataset_t *ds = snap->ds; + + /* + * Property callbacks are registered to a particular + * dsl_dir. Since ours is changing, evict the objset + * so that they will be unregistered from the old dsl_dir. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* move snap name entry */ + VERIFY0(dsl_dataset_get_snapname(ds)); + VERIFY0(dsl_dataset_snap_remove(origin_head, + ds->ds_snapname, tx, B_TRUE)); + VERIFY0(zap_add(dp->dp_meta_objset, + dsl_dataset_phys(hds)->ds_snapnames_zapobj, ds->ds_snapname, + 8, 1, &ds->ds_object, tx)); + dsl_fs_ss_count_adjust(hds->ds_dir, 1, + DD_FIELD_SNAPSHOT_COUNT, tx); + + /* change containing dsl_dir */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ASSERT3U(dsl_dataset_phys(ds)->ds_dir_obj, ==, odd->dd_object); + dsl_dataset_phys(ds)->ds_dir_obj = dd->dd_object; + ASSERT3P(ds->ds_dir, ==, odd); + dsl_dir_rele(ds->ds_dir, ds); + VERIFY0(dsl_dir_hold_obj(dp, dd->dd_object, + NULL, ds, &ds->ds_dir)); + + /* move any clone references */ + if (dsl_dataset_phys(ds)->ds_next_clones_obj && + spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + dsl_dataset_t *cnds; + uint64_t o; + + if (za.za_first_integer == oldnext_obj) { + /* + * We've already moved the + * origin's reference. + */ + continue; + } + + VERIFY0(dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &cnds)); + o = dsl_dir_phys(cnds->ds_dir)-> + dd_head_dataset_obj; + + VERIFY0(zap_remove_int(dp->dp_meta_objset, + dsl_dir_phys(odd)->dd_clones, o, tx)); + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_clones, o, tx)); + dsl_dataset_rele(cnds, FTAG); + } + zap_cursor_fini(&zc); + } + + ASSERT(!dsl_prop_hascb(ds)); + } + + /* + * Change space accounting. + * Note, pa->*usedsnap and dd_used_breakdown[SNAP] will either + * both be valid, or both be 0 (resulting in delta == 0). This + * is true for each of {clone,origin} independently. + */ + + delta = ddpa->cloneusedsnap - + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, >=, 0); + ASSERT3U(ddpa->used, >=, delta); + dsl_dir_diduse_space(dd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + ddpa->used - delta, ddpa->comp, ddpa->uncomp, tx); + + delta = ddpa->originusedsnap - + dsl_dir_phys(odd)->dd_used_breakdown[DD_USED_SNAP]; + ASSERT3S(delta, <=, 0); + ASSERT3U(ddpa->used, >=, -delta); + dsl_dir_diduse_space(odd, DD_USED_SNAP, delta, 0, 0, tx); + dsl_dir_diduse_space(odd, DD_USED_HEAD, + -ddpa->used - delta, -ddpa->comp, -ddpa->uncomp, tx); + + dsl_dataset_phys(origin_ds)->ds_unique_bytes = ddpa->unique; + + /* + * Since livelists are specific to a clone's origin txg, they + * are no longer accurate. 
Destroy the livelist from the clone being + * promoted. If the origin dataset is a clone, destroy its livelist + * as well. + */ + dsl_dir_remove_livelist(dd, tx, B_TRUE); + dsl_dir_remove_livelist(odd, tx, B_TRUE); + + /* log history record */ + spa_history_log_internal_ds(hds, "promote", tx, " "); + + dsl_dir_rele(odd, FTAG); + promote_rele(ddpa, FTAG); +} + +/* + * Make a list of dsl_dataset_t's for the snapshots between first_obj + * (exclusive) and last_obj (inclusive). The list will be in reverse + * order (last_obj will be the list_head()). If first_obj == 0, do all + * snapshots back to this dataset's origin. + */ +static int +snaplist_make(dsl_pool_t *dp, + uint64_t first_obj, uint64_t last_obj, list_t *l, void *tag) +{ + uint64_t obj = last_obj; + + list_create(l, sizeof (struct promotenode), + offsetof(struct promotenode, link)); + + while (obj != first_obj) { + dsl_dataset_t *ds; + struct promotenode *snap; + int err; + + err = dsl_dataset_hold_obj(dp, obj, tag, &ds); + ASSERT(err != ENOENT); + if (err != 0) + return (err); + + if (first_obj == 0) + first_obj = dsl_dir_phys(ds->ds_dir)->dd_origin_obj; + + snap = kmem_alloc(sizeof (*snap), KM_SLEEP); + snap->ds = ds; + list_insert_tail(l, snap); + obj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + } + + return (0); +} + +static int +snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep) +{ + struct promotenode *snap; + + *spacep = 0; + for (snap = list_head(l); snap; snap = list_next(l, snap)) { + uint64_t used, comp, uncomp; + dsl_deadlist_space_range(&snap->ds->ds_deadlist, + mintxg, UINT64_MAX, &used, &comp, &uncomp); + *spacep += used; + } + return (0); +} + +static void +snaplist_destroy(list_t *l, void *tag) +{ + struct promotenode *snap; + + if (l == NULL || !list_link_active(&l->list_head)) + return; + + while ((snap = list_tail(l)) != NULL) { + list_remove(l, snap); + dsl_dataset_rele(snap->ds, tag); + kmem_free(snap, sizeof (*snap)); + } + list_destroy(l); +} + +static int +promote_hold(dsl_dataset_promote_arg_t *ddpa, dsl_pool_t *dp, void *tag) +{ + int error; + dsl_dir_t *dd; + struct promotenode *snap; + + error = dsl_dataset_hold(dp, ddpa->ddpa_clonename, tag, + &ddpa->ddpa_clone); + if (error != 0) + return (error); + dd = ddpa->ddpa_clone->ds_dir; + + if (ddpa->ddpa_clone->ds_is_snapshot || + !dsl_dir_is_clone(dd)) { + dsl_dataset_rele(ddpa->ddpa_clone, tag); + return (SET_ERROR(EINVAL)); + } + + error = snaplist_make(dp, 0, dsl_dir_phys(dd)->dd_origin_obj, + &ddpa->shared_snaps, tag); + if (error != 0) + goto out; + + error = snaplist_make(dp, 0, ddpa->ddpa_clone->ds_object, + &ddpa->clone_snaps, tag); + if (error != 0) + goto out; + + snap = list_head(&ddpa->shared_snaps); + ASSERT3U(snap->ds->ds_object, ==, dsl_dir_phys(dd)->dd_origin_obj); + error = snaplist_make(dp, dsl_dir_phys(dd)->dd_origin_obj, + dsl_dir_phys(snap->ds->ds_dir)->dd_head_dataset_obj, + &ddpa->origin_snaps, tag); + if (error != 0) + goto out; + + if (dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj != 0) { + error = dsl_dataset_hold_obj(dp, + dsl_dir_phys(snap->ds->ds_dir)->dd_origin_obj, + tag, &ddpa->origin_origin); + if (error != 0) + goto out; + } +out: + if (error != 0) + promote_rele(ddpa, tag); + return (error); +} + +static void +promote_rele(dsl_dataset_promote_arg_t *ddpa, void *tag) +{ + snaplist_destroy(&ddpa->shared_snaps, tag); + snaplist_destroy(&ddpa->clone_snaps, tag); + snaplist_destroy(&ddpa->origin_snaps, tag); + if (ddpa->origin_origin != NULL) + dsl_dataset_rele(ddpa->origin_origin, tag); + 
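/* Finally drop the clone hold taken in promote_hold(). */ +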
dsl_dataset_rele(ddpa->ddpa_clone, tag); +} + +/* + * Promote a clone. + * + * If it fails due to a conflicting snapshot name, "conflsnap" will be filled + * in with the name. (It must be at least ZFS_MAX_DATASET_NAME_LEN bytes long.) + */ +int +dsl_dataset_promote(const char *name, char *conflsnap) +{ + dsl_dataset_promote_arg_t ddpa = { 0 }; + uint64_t numsnaps; + int error; + nvpair_t *snap_pair; + objset_t *os; + + /* + * We will modify space proportional to the number of + * snapshots. Compute numsnaps. + */ + error = dmu_objset_hold(name, FTAG, &os); + if (error != 0) + return (error); + error = zap_count(dmu_objset_pool(os)->dp_meta_objset, + dsl_dataset_phys(dmu_objset_ds(os))->ds_snapnames_zapobj, + &numsnaps); + dmu_objset_rele(os, FTAG); + if (error != 0) + return (error); + + ddpa.ddpa_clonename = name; + ddpa.err_ds = fnvlist_alloc(); + ddpa.cr = CRED(); + ddpa.proc = curproc; + + error = dsl_sync_task(name, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, + 2 + numsnaps, ZFS_SPACE_CHECK_RESERVED); + + /* + * Return the first conflicting snapshot found. + */ + snap_pair = nvlist_next_nvpair(ddpa.err_ds, NULL); + if (snap_pair != NULL && conflsnap != NULL) + (void) strlcpy(conflsnap, nvpair_name(snap_pair), + ZFS_MAX_DATASET_NAME_LEN); + + fnvlist_free(ddpa.err_ds); + return (error); +} + +int +dsl_dataset_clone_swap_check_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, boolean_t force, void *owner, dmu_tx_t *tx) +{ + /* + * "slack" factor for received datasets with refquota set on them. + * See the bottom of this function for details on its use. + */ + uint64_t refquota_slack = (uint64_t)DMU_MAX_ACCESS * + spa_asize_inflation; + int64_t unused_refres_delta; + + /* they should both be heads */ + if (clone->ds_is_snapshot || + origin_head->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + /* if we are not forcing, the branch point should be just before them */ + if (!force && clone->ds_prev != origin_head->ds_prev) + return (SET_ERROR(EINVAL)); + + /* clone should be the clone (unless they are unrelated) */ + if (clone->ds_prev != NULL && + clone->ds_prev != clone->ds_dir->dd_pool->dp_origin_snap && + origin_head->ds_dir != clone->ds_prev->ds_dir) + return (SET_ERROR(EINVAL)); + + /* the clone should be a child of the origin */ + if (clone->ds_dir->dd_parent != origin_head->ds_dir) + return (SET_ERROR(EINVAL)); + + /* origin_head shouldn't be modified unless 'force' */ + if (!force && + dsl_dataset_modified_since_snap(origin_head, origin_head->ds_prev)) + return (SET_ERROR(ETXTBSY)); + + /* origin_head should have no long holds (e.g. is not mounted) */ + if (dsl_dataset_handoff_check(origin_head, owner, tx)) + return (SET_ERROR(EBUSY)); + + /* check amount of any unconsumed refreservation */ + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(origin_head)->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(clone)->ds_unique_bytes); + + if (unused_refres_delta > 0 && + unused_refres_delta > + dsl_dir_space_available(origin_head->ds_dir, NULL, 0, TRUE)) + return (SET_ERROR(ENOSPC)); + + /* + * The clone can't be too much over the head's refquota. + * + * To ensure that the entire refquota can be used, we allow one + * transaction to exceed the refquota. Therefore, this check + * needs to also allow for the space referenced to be more than the + * refquota. The maximum amount of space that one transaction can use + * on disk is DMU_MAX_ACCESS * spa_asize_inflation. 
Allowing this + * overage ensures that we are able to receive a filesystem that + * exceeds the refquota on the source system. + * + * So that overage is the refquota_slack we use below. + */ + if (origin_head->ds_quota != 0 && + dsl_dataset_phys(clone)->ds_referenced_bytes > + origin_head->ds_quota + refquota_slack) + return (SET_ERROR(EDQUOT)); + + return (0); +} + +static void +dsl_dataset_swap_remap_deadlists(dsl_dataset_t *clone, + dsl_dataset_t *origin, dmu_tx_t *tx) +{ + uint64_t clone_remap_dl_obj, origin_remap_dl_obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + + ASSERT(dsl_pool_sync_context(dp)); + + clone_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(clone); + origin_remap_dl_obj = dsl_dataset_get_remap_deadlist_object(origin); + + if (clone_remap_dl_obj != 0) { + dsl_deadlist_close(&clone->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(clone, tx); + } + if (origin_remap_dl_obj != 0) { + dsl_deadlist_close(&origin->ds_remap_deadlist); + dsl_dataset_unset_remap_deadlist_object(origin, tx); + } + + if (clone_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(origin, + clone_remap_dl_obj, tx); + dsl_deadlist_open(&origin->ds_remap_deadlist, + dp->dp_meta_objset, clone_remap_dl_obj); + } + if (origin_remap_dl_obj != 0) { + dsl_dataset_set_remap_deadlist_object(clone, + origin_remap_dl_obj, tx); + dsl_deadlist_open(&clone->ds_remap_deadlist, + dp->dp_meta_objset, origin_remap_dl_obj); + } +} + +void +dsl_dataset_clone_swap_sync_impl(dsl_dataset_t *clone, + dsl_dataset_t *origin_head, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + int64_t unused_refres_delta; + + ASSERT(clone->ds_reserved == 0); + /* + * NOTE: On DEBUG kernels there could be a race between this and + * the check function if spa_asize_inflation is adjusted... + */ + ASSERT(origin_head->ds_quota == 0 || + dsl_dataset_phys(clone)->ds_unique_bytes <= origin_head->ds_quota + + DMU_MAX_ACCESS * spa_asize_inflation); + ASSERT3P(clone->ds_prev, ==, origin_head->ds_prev); + + dsl_dir_cancel_waiters(origin_head->ds_dir); + + /* + * Swap per-dataset feature flags. 
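+ * A flag active on either dataset is deactivated first and then
+ * re-activated on the other side, which keeps the per-dataset
+ * feature refcounts consistent even if both had it active.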
+ */ + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (!(spa_feature_table[f].fi_flags & + ZFEATURE_FLAG_PER_DATASET)) { + ASSERT(!dsl_dataset_feature_is_active(clone, f)); + ASSERT(!dsl_dataset_feature_is_active(origin_head, f)); + continue; + } + + boolean_t clone_inuse = dsl_dataset_feature_is_active(clone, f); + void *clone_feature = clone->ds_feature[f]; + boolean_t origin_head_inuse = + dsl_dataset_feature_is_active(origin_head, f); + void *origin_head_feature = origin_head->ds_feature[f]; + + if (clone_inuse) + dsl_dataset_deactivate_feature_impl(clone, f, tx); + if (origin_head_inuse) + dsl_dataset_deactivate_feature_impl(origin_head, f, tx); + + if (clone_inuse) { + dsl_dataset_activate_feature(origin_head->ds_object, f, + clone_feature, tx); + origin_head->ds_feature[f] = clone_feature; + } + if (origin_head_inuse) { + dsl_dataset_activate_feature(clone->ds_object, f, + origin_head_feature, tx); + clone->ds_feature[f] = origin_head_feature; + } + } + + dmu_buf_will_dirty(clone->ds_dbuf, tx); + dmu_buf_will_dirty(origin_head->ds_dbuf, tx); + + if (clone->ds_objset != NULL) { + dmu_objset_evict(clone->ds_objset); + clone->ds_objset = NULL; + } + + if (origin_head->ds_objset != NULL) { + dmu_objset_evict(origin_head->ds_objset); + origin_head->ds_objset = NULL; + } + + unused_refres_delta = + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(origin_head)->ds_unique_bytes) - + (int64_t)MIN(origin_head->ds_reserved, + dsl_dataset_phys(clone)->ds_unique_bytes); + + /* + * Reset origin's unique bytes. + */ + { + dsl_dataset_t *origin = clone->ds_prev; + uint64_t comp, uncomp; + + dmu_buf_will_dirty(origin->ds_dbuf, tx); + dsl_deadlist_space_range(&clone->ds_deadlist, + dsl_dataset_phys(origin)->ds_prev_snap_txg, UINT64_MAX, + &dsl_dataset_phys(origin)->ds_unique_bytes, &comp, &uncomp); + } + + /* swap blkptrs */ + { + rrw_enter(&clone->ds_bp_rwlock, RW_WRITER, FTAG); + rrw_enter(&origin_head->ds_bp_rwlock, RW_WRITER, FTAG); + blkptr_t tmp; + tmp = dsl_dataset_phys(origin_head)->ds_bp; + dsl_dataset_phys(origin_head)->ds_bp = + dsl_dataset_phys(clone)->ds_bp; + dsl_dataset_phys(clone)->ds_bp = tmp; + rrw_exit(&origin_head->ds_bp_rwlock, FTAG); + rrw_exit(&clone->ds_bp_rwlock, FTAG); + } + + /* set dd_*_bytes */ + { + int64_t dused, dcomp, duncomp; + uint64_t cdl_used, cdl_comp, cdl_uncomp; + uint64_t odl_used, odl_comp, odl_uncomp; + + ASSERT3U(dsl_dir_phys(clone->ds_dir)-> + dd_used_breakdown[DD_USED_SNAP], ==, 0); + + dsl_deadlist_space(&clone->ds_deadlist, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space(&origin_head->ds_deadlist, + &odl_used, &odl_comp, &odl_uncomp); + + dused = dsl_dataset_phys(clone)->ds_referenced_bytes + + cdl_used - + (dsl_dataset_phys(origin_head)->ds_referenced_bytes + + odl_used); + dcomp = dsl_dataset_phys(clone)->ds_compressed_bytes + + cdl_comp - + (dsl_dataset_phys(origin_head)->ds_compressed_bytes + + odl_comp); + duncomp = dsl_dataset_phys(clone)->ds_uncompressed_bytes + + cdl_uncomp - + (dsl_dataset_phys(origin_head)->ds_uncompressed_bytes + + odl_uncomp); + + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_HEAD, + dused, dcomp, duncomp, tx); + dsl_dir_diduse_space(clone->ds_dir, DD_USED_HEAD, + -dused, -dcomp, -duncomp, tx); + + /* + * The difference in the space used by snapshots is the + * difference in snapshot space due to the head's + * deadlist (since that's the only thing that's + * changing that affects the snapused). 
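+ * Hence each deadlist is measured only above dd_origin_txg, and the
+ * resulting delta is shifted between DD_USED_HEAD and DD_USED_SNAP.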
+ */ + dsl_deadlist_space_range(&clone->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &cdl_used, &cdl_comp, &cdl_uncomp); + dsl_deadlist_space_range(&origin_head->ds_deadlist, + origin_head->ds_dir->dd_origin_txg, UINT64_MAX, + &odl_used, &odl_comp, &odl_uncomp); + dsl_dir_transfer_space(origin_head->ds_dir, cdl_used - odl_used, + DD_USED_HEAD, DD_USED_SNAP, tx); + } + + /* swap ds_*_bytes */ + SWITCH64(dsl_dataset_phys(origin_head)->ds_referenced_bytes, + dsl_dataset_phys(clone)->ds_referenced_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_compressed_bytes, + dsl_dataset_phys(clone)->ds_compressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_uncompressed_bytes, + dsl_dataset_phys(clone)->ds_uncompressed_bytes); + SWITCH64(dsl_dataset_phys(origin_head)->ds_unique_bytes, + dsl_dataset_phys(clone)->ds_unique_bytes); + + /* apply any parent delta for change in unconsumed refreservation */ + dsl_dir_diduse_space(origin_head->ds_dir, DD_USED_REFRSRV, + unused_refres_delta, 0, 0, tx); + + /* + * Swap deadlists. + */ + dsl_deadlist_close(&clone->ds_deadlist); + dsl_deadlist_close(&origin_head->ds_deadlist); + SWITCH64(dsl_dataset_phys(origin_head)->ds_deadlist_obj, + dsl_dataset_phys(clone)->ds_deadlist_obj); + dsl_deadlist_open(&clone->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(clone)->ds_deadlist_obj); + dsl_deadlist_open(&origin_head->ds_deadlist, dp->dp_meta_objset, + dsl_dataset_phys(origin_head)->ds_deadlist_obj); + dsl_dataset_swap_remap_deadlists(clone, origin_head, tx); + + /* + * If there is a bookmark at the origin, its "next dataset" is + * changing, so we need to reset its FBN. + */ + dsl_bookmark_next_changed(origin_head, origin_head->ds_prev, tx); + + dsl_scan_ds_clone_swapped(origin_head, clone, tx); + + /* + * Destroy any livelists associated with the clone or the origin, + * since after the swap the corresponding livelists are no longer + * valid. + */ + dsl_dir_remove_livelist(clone->ds_dir, tx, B_TRUE); + dsl_dir_remove_livelist(origin_head->ds_dir, tx, B_TRUE); + + spa_history_log_internal_ds(clone, "clone swap", tx, + "parent=%s", origin_head->ds_dir->dd_myname); +} + +/* + * Given a pool name and a dataset object number in that pool, + * return the name of that dataset. + */ +int +dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + + error = dsl_pool_hold(pname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); + if (error == 0) { + dsl_dataset_name(ds, buf); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); + + return (error); +} + +int +dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, + uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv) +{ + int error = 0; + + ASSERT3S(asize, >, 0); + + /* + * *ref_rsrv is the portion of asize that will come from any + * unconsumed refreservation space. + */ + *ref_rsrv = 0; + + mutex_enter(&ds->ds_lock); + /* + * Make a space adjustment for reserved bytes. 
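+ * If the refreservation exceeds the unique bytes, the unconsumed
+ * portion (ds_reserved - ds_unique_bytes) has already been set
+ * aside, so subtract it from *used rather than count it twice.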
+ */ + if (ds->ds_reserved > dsl_dataset_phys(ds)->ds_unique_bytes) { + ASSERT3U(*used, >=, + ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *used -= + (ds->ds_reserved - dsl_dataset_phys(ds)->ds_unique_bytes); + *ref_rsrv = + asize - MIN(asize, parent_delta(ds, asize + inflight)); + } + + if (!check_quota || ds->ds_quota == 0) { + mutex_exit(&ds->ds_lock); + return (0); + } + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (dsl_dataset_phys(ds)->ds_referenced_bytes + inflight >= + ds->ds_quota) { + if (inflight > 0 || + dsl_dataset_phys(ds)->ds_referenced_bytes < ds->ds_quota) + error = SET_ERROR(ERESTART); + else + error = SET_ERROR(EDQUOT); + } + mutex_exit(&ds->ds_lock); + + return (error); +} + +typedef struct dsl_dataset_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dataset_set_qr_arg_t; + + +/* ARGSUSED */ +static int +dsl_dataset_set_refquota_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t newval; + + if (spa_version(dp->dp_spa) < SPA_VERSION_REFQUOTA) + return (SET_ERROR(ENOTSUP)); + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + if (newval < dsl_dataset_phys(ds)->ds_referenced_bytes || + newval < ds->ds_reserved) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dataset_set_refquota_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds = NULL; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFQUOTA), &newval)); + + if (ds->ds_quota != newval) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = newval; + } + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_refquota(const char *dsname, zprop_source_t source, + uint64_t refquota) +{ + dsl_dataset_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refquota; + + return (dsl_sync_task(dsname, dsl_dataset_set_refquota_check, + dsl_dataset_set_refquota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +static int +dsl_dataset_set_refreservation_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t newval, unique; + + if (spa_version(dp->dp_spa) < SPA_VERSION_REFRESERVATION) + return (SET_ERROR(ENOTSUP)); + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + if (ds->ds_is_snapshot) { + dsl_dataset_rele(ds, 
FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. + */ + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + mutex_enter(&ds->ds_lock); + if (!DS_UNIQUE_IS_ACCURATE(ds)) + dsl_dataset_recalc_head_uniq(ds); + unique = dsl_dataset_phys(ds)->ds_unique_bytes; + mutex_exit(&ds->ds_lock); + + if (MAX(unique, newval) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, newval) - + MAX(unique, ds->ds_reserved); + + if (delta > + dsl_dir_space_available(ds->ds_dir, NULL, 0, B_TRUE) || + (ds->ds_quota > 0 && newval > ds->ds_quota)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOSPC)); + } + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_dataset_set_refreservation_sync_impl(dsl_dataset_t *ds, + zprop_source_t source, uint64_t value, dmu_tx_t *tx) +{ + uint64_t newval; + uint64_t unique; + int64_t delta; + + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), + source, sizeof (value), 1, &value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &newval)); + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + mutex_enter(&ds->ds_lock); + ASSERT(DS_UNIQUE_IS_ACCURATE(ds)); + unique = dsl_dataset_phys(ds)->ds_unique_bytes; + delta = MAX(0, (int64_t)(newval - unique)) - + MAX(0, (int64_t)(ds->ds_reserved - unique)); + ds->ds_reserved = newval; + mutex_exit(&ds->ds_lock); + + dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); + mutex_exit(&ds->ds_dir->dd_lock); +} + +static void +dsl_dataset_set_refreservation_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds = NULL; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + dsl_dataset_set_refreservation_sync_impl(ds, + ddsqra->ddsqra_source, ddsqra->ddsqra_value, tx); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_refreservation(const char *dsname, zprop_source_t source, + uint64_t refreservation) +{ + dsl_dataset_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = dsname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = refreservation; + + return (dsl_sync_task(dsname, dsl_dataset_set_refreservation_check, + dsl_dataset_set_refreservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +typedef struct dsl_dataset_set_compression_arg { + const char *ddsca_name; + zprop_source_t ddsca_source; + uint64_t ddsca_value; +} dsl_dataset_set_compression_arg_t; + +/* ARGSUSED */ +static int +dsl_dataset_set_compression_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_compression_arg_t *ddsca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + + uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); + spa_feature_t f = zio_compress_to_feature(compval); + + if (f == SPA_FEATURE_NONE) + return (SET_ERROR(EINVAL)); + + if (!spa_feature_is_enabled(dp->dp_spa, f)) + return (SET_ERROR(ENOTSUP)); + + return (0); +} + +static void +dsl_dataset_set_compression_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_set_compression_arg_t *ddsca = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds = NULL; + + uint64_t compval = ZIO_COMPRESS_ALGO(ddsca->ddsca_value); + spa_feature_t f = 
zio_compress_to_feature(compval); + ASSERT3S(spa_feature_table[f].fi_type, ==, ZFEATURE_TYPE_BOOLEAN); + + VERIFY0(dsl_dataset_hold(dp, ddsca->ddsca_name, FTAG, &ds)); + if (zfeature_active(f, ds->ds_feature[f]) != B_TRUE) { + ds->ds_feature_activation[f] = (void *)B_TRUE; + dsl_dataset_activate_feature(ds->ds_object, f, + ds->ds_feature_activation[f], tx); + ds->ds_feature[f] = ds->ds_feature_activation[f]; + } + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dataset_set_compression(const char *dsname, zprop_source_t source, + uint64_t compression) +{ + dsl_dataset_set_compression_arg_t ddsca; + + /* + * The sync task is only required for zstd in order to activate + * the feature flag when the property is first set. + */ + if (ZIO_COMPRESS_ALGO(compression) != ZIO_COMPRESS_ZSTD) + return (0); + + ddsca.ddsca_name = dsname; + ddsca.ddsca_source = source; + ddsca.ddsca_value = compression; + + return (dsl_sync_task(dsname, dsl_dataset_set_compression_check, + dsl_dataset_set_compression_sync, &ddsca, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +/* + * Return (in *usedp) the amount of space referenced by "new" that was not + * referenced at the time the bookmark corresponds to. "New" may be a + * snapshot or a head. The bookmark must be before new, in + * new's filesystem (or its origin) -- caller verifies this. + * + * The written space is calculated by considering two components: First, we + * ignore any freed space, and calculate the written as new's used space + * minus old's used space. Next, we add in the amount of space that was freed + * between the two time points, thus reducing new's used space relative to + * old's. Specifically, this is the space that was born before + * zbm_creation_txg, and freed before new (ie. on new's deadlist or a + * previous deadlist). + * + * space freed [---------------------] + * snapshots ---O-------O--------O-------O------ + * bookmark new + * + * Note, the bookmark's zbm_*_bytes_refd must be valid, but if the HAS_FBN + * flag is not set, we will calculate the freed_before_next based on the + * next snapshot's deadlist, rather than using zbm_*_freed_before_next_snap. + */ +static int +dsl_dataset_space_written_impl(zfs_bookmark_phys_t *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + dsl_pool_t *dp = new->ds_dir->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + if (dsl_dataset_is_snapshot(new)) { + ASSERT3U(bmp->zbm_creation_txg, <, + dsl_dataset_phys(new)->ds_creation_txg); + } + + *usedp = 0; + *usedp += dsl_dataset_phys(new)->ds_referenced_bytes; + *usedp -= bmp->zbm_referenced_bytes_refd; + + *compp = 0; + *compp += dsl_dataset_phys(new)->ds_compressed_bytes; + *compp -= bmp->zbm_compressed_bytes_refd; + + *uncompp = 0; + *uncompp += dsl_dataset_phys(new)->ds_uncompressed_bytes; + *uncompp -= bmp->zbm_uncompressed_bytes_refd; + + dsl_dataset_t *snap = new; + + while (dsl_dataset_phys(snap)->ds_prev_snap_txg > + bmp->zbm_creation_txg) { + uint64_t used, comp, uncomp; + + dsl_deadlist_space_range(&snap->ds_deadlist, + 0, bmp->zbm_creation_txg, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + uint64_t snapobj = dsl_dataset_phys(snap)->ds_prev_snap_obj; + if (snap != new) + dsl_dataset_rele(snap, FTAG); + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &snap); + if (err != 0) + break; + } + + /* + * We might not have the FBN if we are calculating written from + * a snapshot (because we didn't know the correct "next" snapshot + * until now). 
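+ * In that case we walk back to the snapshot whose prev_snap_txg
+ * equals the bookmark's creation txg and sum its entire deadlist
+ * instead (the else branch below).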
+ */ + if (bmp->zbm_flags & ZBM_FLAG_HAS_FBN) { + *usedp += bmp->zbm_referenced_freed_before_next_snap; + *compp += bmp->zbm_compressed_freed_before_next_snap; + *uncompp += bmp->zbm_uncompressed_freed_before_next_snap; + } else { + ASSERT3U(dsl_dataset_phys(snap)->ds_prev_snap_txg, ==, + bmp->zbm_creation_txg); + uint64_t used, comp, uncomp; + dsl_deadlist_space(&snap->ds_deadlist, &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + } + if (snap != new) + dsl_dataset_rele(snap, FTAG); + return (err); +} + +/* + * Return (in *usedp) the amount of space written in new that was not + * present at the time the bookmark corresponds to. New may be a + * snapshot or the head. Old must be a bookmark before new, in + * new's filesystem (or its origin) -- caller verifies this. + */ +int +dsl_dataset_space_written_bookmark(zfs_bookmark_phys_t *bmp, + dsl_dataset_t *new, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (!(bmp->zbm_flags & ZBM_FLAG_HAS_FBN)) + return (SET_ERROR(ENOTSUP)); + return (dsl_dataset_space_written_impl(bmp, new, + usedp, compp, uncompp)); +} + +/* + * Return (in *usedp) the amount of space written in new that is not + * present in oldsnap. New may be a snapshot or the head. Old must be + * a snapshot before new, in new's filesystem (or its origin). If not then + * fail and return EINVAL. + */ +int +dsl_dataset_space_written(dsl_dataset_t *oldsnap, dsl_dataset_t *new, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + if (!dsl_dataset_is_before(new, oldsnap, 0)) + return (SET_ERROR(EINVAL)); + + zfs_bookmark_phys_t zbm = { 0 }; + dsl_dataset_phys_t *dsp = dsl_dataset_phys(oldsnap); + zbm.zbm_guid = dsp->ds_guid; + zbm.zbm_creation_txg = dsp->ds_creation_txg; + zbm.zbm_creation_time = dsp->ds_creation_time; + zbm.zbm_referenced_bytes_refd = dsp->ds_referenced_bytes; + zbm.zbm_compressed_bytes_refd = dsp->ds_compressed_bytes; + zbm.zbm_uncompressed_bytes_refd = dsp->ds_uncompressed_bytes; + + /* + * If oldsnap is the origin (or origin's origin, ...) of new, + * we can't easily calculate the effective FBN. Therefore, + * we do not set ZBM_FLAG_HAS_FBN, so that the _impl will calculate + * it relative to the correct "next": the next snapshot towards "new", + * rather than the next snapshot in oldsnap's dsl_dir. + */ + return (dsl_dataset_space_written_impl(&zbm, new, + usedp, compp, uncompp)); +} + +/* + * Return (in *usedp) the amount of space that will be reclaimed if firstsnap, + * lastsnap, and all snapshots in between are deleted. + * + * blocks that would be freed [---------------------------] + * snapshots ---O-------O--------O-------O--------O + * firstsnap lastsnap + * + * This is the set of blocks that were born after the snap before firstsnap, + * (birth > firstsnap->prev_snap_txg) and died before the snap after the + * last snap (ie, is on lastsnap->ds_next->ds_deadlist or an earlier deadlist). + * We calculate this by iterating over the relevant deadlists (from the snap + * after lastsnap, backward to the snap after firstsnap), summing up the + * space on the deadlist that was born after the snap before firstsnap. + */ +int +dsl_dataset_space_wouldfree(dsl_dataset_t *firstsnap, + dsl_dataset_t *lastsnap, + uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) +{ + int err = 0; + uint64_t snapobj; + dsl_pool_t *dp = firstsnap->ds_dir->dd_pool; + + ASSERT(firstsnap->ds_is_snapshot); + ASSERT(lastsnap->ds_is_snapshot); + + /* + * Check that the snapshots are in the same dsl_dir, and firstsnap + * is before lastsnap. 
+ */ + if (firstsnap->ds_dir != lastsnap->ds_dir || + dsl_dataset_phys(firstsnap)->ds_creation_txg > + dsl_dataset_phys(lastsnap)->ds_creation_txg) + return (SET_ERROR(EINVAL)); + + *usedp = *compp = *uncompp = 0; + + snapobj = dsl_dataset_phys(lastsnap)->ds_next_snap_obj; + while (snapobj != firstsnap->ds_object) { + dsl_dataset_t *ds; + uint64_t used, comp, uncomp; + + err = dsl_dataset_hold_obj(dp, snapobj, FTAG, &ds); + if (err != 0) + break; + + dsl_deadlist_space_range(&ds->ds_deadlist, + dsl_dataset_phys(firstsnap)->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + *usedp += used; + *compp += comp; + *uncompp += uncomp; + + snapobj = dsl_dataset_phys(ds)->ds_prev_snap_obj; + ASSERT3U(snapobj, !=, 0); + dsl_dataset_rele(ds, FTAG); + } + return (err); +} + +/* + * Return TRUE if 'earlier' is an earlier snapshot in 'later's timeline. + * For example, they could both be snapshots of the same filesystem, and + * 'earlier' is before 'later'. Or 'earlier' could be the origin of + * 'later's filesystem. Or 'earlier' could be an older snapshot in the origin's + * filesystem. Or 'earlier' could be the origin's origin. + * + * If non-zero, earlier_txg is used instead of earlier's ds_creation_txg. + */ +boolean_t +dsl_dataset_is_before(dsl_dataset_t *later, dsl_dataset_t *earlier, + uint64_t earlier_txg) +{ + dsl_pool_t *dp = later->ds_dir->dd_pool; + int error; + boolean_t ret; + + ASSERT(dsl_pool_config_held(dp)); + ASSERT(earlier->ds_is_snapshot || earlier_txg != 0); + + if (earlier_txg == 0) + earlier_txg = dsl_dataset_phys(earlier)->ds_creation_txg; + + if (later->ds_is_snapshot && + earlier_txg >= dsl_dataset_phys(later)->ds_creation_txg) + return (B_FALSE); + + if (later->ds_dir == earlier->ds_dir) + return (B_TRUE); + + /* + * We check dd_origin_obj explicitly here rather than using + * dsl_dir_is_clone() so that we will return TRUE if "earlier" + * is $ORIGIN@$ORIGIN. dsl_dataset_space_written() depends on + * this behavior. 
+ */ + if (dsl_dir_phys(later->ds_dir)->dd_origin_obj == 0) + return (B_FALSE); + + dsl_dataset_t *origin; + error = dsl_dataset_hold_obj(dp, + dsl_dir_phys(later->ds_dir)->dd_origin_obj, FTAG, &origin); + if (error != 0) + return (B_FALSE); + if (dsl_dataset_phys(origin)->ds_creation_txg == earlier_txg && + origin->ds_dir == earlier->ds_dir) { + dsl_dataset_rele(origin, FTAG); + return (B_TRUE); + } + ret = dsl_dataset_is_before(origin, earlier, earlier_txg); + dsl_dataset_rele(origin, FTAG); + return (ret); +} + +void +dsl_dataset_zapify(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + dmu_object_zapify(mos, ds->ds_object, DMU_OT_DSL_DATASET, tx); +} + +boolean_t +dsl_dataset_is_zapified(dsl_dataset_t *ds) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(ds->ds_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +boolean_t +dsl_dataset_has_resume_receive_state(dsl_dataset_t *ds) +{ + return (dsl_dataset_is_zapified(ds) && + zap_contains(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_RESUME_TOGUID) == 0); +} + +uint64_t +dsl_dataset_get_remap_deadlist_object(dsl_dataset_t *ds) +{ + uint64_t remap_deadlist_obj; + int err; + + if (!dsl_dataset_is_zapified(ds)) + return (0); + + err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (remap_deadlist_obj), 1, + &remap_deadlist_obj); + + if (err != 0) { + VERIFY3S(err, ==, ENOENT); + return (0); + } + + ASSERT(remap_deadlist_obj != 0); + return (remap_deadlist_obj); +} + +boolean_t +dsl_dataset_remap_deadlist_exists(dsl_dataset_t *ds) +{ + EQUIV(dsl_deadlist_is_open(&ds->ds_remap_deadlist), + dsl_dataset_get_remap_deadlist_object(ds) != 0); + return (dsl_deadlist_is_open(&ds->ds_remap_deadlist)); +} + +static void +dsl_dataset_set_remap_deadlist_object(dsl_dataset_t *ds, uint64_t obj, + dmu_tx_t *tx) +{ + ASSERT(obj != 0); + dsl_dataset_zapify(ds, tx); + VERIFY0(zap_add(ds->ds_dir->dd_pool->dp_meta_objset, ds->ds_object, + DS_FIELD_REMAP_DEADLIST, sizeof (obj), 1, &obj, tx)); +} + +static void +dsl_dataset_unset_remap_deadlist_object(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + VERIFY0(zap_remove(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_REMAP_DEADLIST, tx)); +} + +void +dsl_dataset_destroy_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_object; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_dataset_remap_deadlist_exists(ds)); + + remap_deadlist_object = ds->ds_remap_deadlist.dl_object; + dsl_deadlist_close(&ds->ds_remap_deadlist); + dsl_deadlist_free(spa_meta_objset(spa), remap_deadlist_object, tx); + dsl_dataset_unset_remap_deadlist_object(ds, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_dataset_create_remap_deadlist(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t remap_deadlist_obj; + spa_t *spa = ds->ds_dir->dd_pool->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(MUTEX_HELD(&ds->ds_remap_deadlist_lock)); + /* + * Currently we only create remap deadlists when there are indirect + * vdevs with referenced mappings. 
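+ * The new list is seeded by dsl_deadlist_clone(), which copies the
+ * regular deadlist's mintxg keys but gives each one an empty bpobj.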
+ */ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + remap_deadlist_obj = dsl_deadlist_clone( + &ds->ds_deadlist, UINT64_MAX, + dsl_dataset_phys(ds)->ds_prev_snap_obj, tx); + dsl_dataset_set_remap_deadlist_object(ds, + remap_deadlist_obj, tx); + dsl_deadlist_open(&ds->ds_remap_deadlist, spa_meta_objset(spa), + remap_deadlist_obj); + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_dataset_activate_redaction(dsl_dataset_t *ds, uint64_t *redact_snaps, + uint64_t num_redact_snaps, dmu_tx_t *tx) +{ + uint64_t dsobj = ds->ds_object; + struct feature_type_uint64_array_arg *ftuaa = + kmem_zalloc(sizeof (*ftuaa), KM_SLEEP); + ftuaa->length = (int64_t)num_redact_snaps; + if (num_redact_snaps > 0) { + ftuaa->array = kmem_alloc(num_redact_snaps * sizeof (uint64_t), + KM_SLEEP); + bcopy(redact_snaps, ftuaa->array, num_redact_snaps * + sizeof (uint64_t)); + } + dsl_dataset_activate_feature(dsobj, SPA_FEATURE_REDACTED_DATASETS, + ftuaa, tx); + ds->ds_feature[SPA_FEATURE_REDACTED_DATASETS] = ftuaa; +} + +/* BEGIN CSTYLED */ +#if defined(_LP64) +#define RECORDSIZE_PERM ZMOD_RW +#else +/* Limited to 1M on 32-bit platforms due to lack of virtual address space */ +#define RECORDSIZE_PERM ZMOD_RD +#endif +ZFS_MODULE_PARAM(zfs, zfs_, max_recordsize, INT, RECORDSIZE_PERM, + "Max allowed record size"); + +ZFS_MODULE_PARAM(zfs, zfs_, allow_redacted_dataset_mount, INT, ZMOD_RW, + "Allow mounting of redacted datasets"); +/* END CSTYLED */ + +EXPORT_SYMBOL(dsl_dataset_hold); +EXPORT_SYMBOL(dsl_dataset_hold_flags); +EXPORT_SYMBOL(dsl_dataset_hold_obj); +EXPORT_SYMBOL(dsl_dataset_hold_obj_flags); +EXPORT_SYMBOL(dsl_dataset_own); +EXPORT_SYMBOL(dsl_dataset_own_obj); +EXPORT_SYMBOL(dsl_dataset_name); +EXPORT_SYMBOL(dsl_dataset_rele); +EXPORT_SYMBOL(dsl_dataset_rele_flags); +EXPORT_SYMBOL(dsl_dataset_disown); +EXPORT_SYMBOL(dsl_dataset_tryown); +EXPORT_SYMBOL(dsl_dataset_create_sync); +EXPORT_SYMBOL(dsl_dataset_create_sync_dd); +EXPORT_SYMBOL(dsl_dataset_snapshot_check); +EXPORT_SYMBOL(dsl_dataset_snapshot_sync); +EXPORT_SYMBOL(dsl_dataset_promote); +EXPORT_SYMBOL(dsl_dataset_user_hold); +EXPORT_SYMBOL(dsl_dataset_user_release); +EXPORT_SYMBOL(dsl_dataset_get_holds); +EXPORT_SYMBOL(dsl_dataset_get_blkptr); +EXPORT_SYMBOL(dsl_dataset_get_spa); +EXPORT_SYMBOL(dsl_dataset_modified_since_snap); +EXPORT_SYMBOL(dsl_dataset_space_written); +EXPORT_SYMBOL(dsl_dataset_space_wouldfree); +EXPORT_SYMBOL(dsl_dataset_sync); +EXPORT_SYMBOL(dsl_dataset_block_born); +EXPORT_SYMBOL(dsl_dataset_block_kill); +EXPORT_SYMBOL(dsl_dataset_dirty); +EXPORT_SYMBOL(dsl_dataset_stats); +EXPORT_SYMBOL(dsl_dataset_fast_stat); +EXPORT_SYMBOL(dsl_dataset_space); +EXPORT_SYMBOL(dsl_dataset_fsid_guid); +EXPORT_SYMBOL(dsl_dsobj_to_dsname); +EXPORT_SYMBOL(dsl_dataset_check_quota); +EXPORT_SYMBOL(dsl_dataset_clone_swap_check_impl); +EXPORT_SYMBOL(dsl_dataset_clone_swap_sync_impl); diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c new file mode 100644 index 000000000000..bad2d56eefdd --- /dev/null +++ b/module/zfs/dsl_deadlist.c @@ -0,0 +1,1012 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2012, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ */
+
+#include <sys/dsl_dataset.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/zfs_context.h>
+#include <sys/dsl_pool.h>
+
+/*
+ * Deadlist concurrency:
+ *
+ * Deadlists can only be modified from the syncing thread.
+ *
+ * Except for dsl_deadlist_insert(), it can only be modified with the
+ * dp_config_rwlock held with RW_WRITER.
+ *
+ * The accessors (dsl_deadlist_space() and dsl_deadlist_space_range()) can
+ * be called concurrently, from open context, with the dp_config_rwlock held
+ * with RW_READER.
+ *
+ * Therefore, we only need to provide locking between dsl_deadlist_insert() and
+ * the accessors, protecting:
+ *     dl_phys->dl_used,comp,uncomp
+ *     and protecting the dl_tree from being loaded.
+ * The locking is provided by dl_lock.  Note that the bpobj_t
+ * provides its own locking, and dl_oldfmt is immutable.
+ */
+
+/*
+ * Livelist Overview
+ * ================
+ *
+ * Livelists use the same 'deadlist_t' struct as deadlists and are also used
+ * to track blkptrs over the lifetime of a dataset.  Livelists, however, belong
+ * to clones and track the blkptrs that are clone-specific (were born after
+ * the clone's creation).  The exception is embedded block pointers, which are
+ * not included in livelists because they do not need to be freed.
+ *
+ * When it comes time to delete the clone, the livelist provides a quick
+ * reference as to what needs to be freed.  For this reason, livelists also
+ * track when clone-specific blkptrs are freed before deletion to prevent
+ * double frees.  Each blkptr in a livelist is marked as a FREE or an ALLOC
+ * and the deletion algorithm iterates backwards over the livelist, matching
+ * FREE/ALLOC pairs and then freeing those ALLOCs which remain.  Livelists
+ * are also updated in the case when blkptrs are remapped: the old version
+ * of the blkptr is cancelled out with a FREE and the new version is tracked
+ * with an ALLOC.
+ *
+ * To bound the amount of memory required for deletion, livelists over a
+ * certain size are spread over multiple entries.  Entries are grouped by
+ * birth txg so we can be sure the ALLOC/FREE pair for a given blkptr will
+ * be in the same entry.  This allows us to delete livelists incrementally
+ * over multiple syncs, one entry at a time.
+ *
+ * During the lifetime of the clone, livelists can get extremely large.
+ * Their size is managed by periodic condensing (preemptively cancelling out
+ * FREE/ALLOC pairs).  Livelists are disabled when a clone is promoted or when
+ * the shared space between the clone and its origin is so small that it
+ * doesn't make sense to use livelists anymore.
+ */
+
+/*
+ * The threshold sublist size at which we create a new sub-livelist for the
+ * next txg.  However, since blkptrs of the same transaction group must be in
+ * the same sub-list, the actual sublist size may exceed this.
When picking the + * size we had to balance the fact that larger sublists mean fewer sublists + * (decreasing the cost of insertion) against the consideration that sublists + * will be loaded into memory and shouldn't take up an inordinate amount of + * space. We settled on ~500000 entries, corresponding to roughly 128M. + */ +unsigned long zfs_livelist_max_entries = 500000; + +/* + * We can approximate how much of a performance gain a livelist will give us + * based on the percentage of blocks shared between the clone and its origin. + * 0 percent shared means that the clone has completely diverged and that the + * old method is maximally effective: every read from the block tree will + * result in lots of frees. Livelists give us gains when they track blocks + * scattered across the tree, when one read in the old method might only + * result in a few frees. Once the clone has been overwritten enough, + * writes are no longer sparse and we'll no longer get much of a benefit from + * tracking them with a livelist. We chose a lower limit of 75 percent shared + * (25 percent overwritten). This means that 1/4 of all block pointers will be + * freed (e.g. each read frees 256, out of a max of 1024) so we expect livelists + * to make deletion 4x faster. Once the amount of shared space drops below this + * threshold, the clone will revert to the old deletion method. + */ +int zfs_livelist_min_percent_shared = 75; + +static int +dsl_deadlist_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_entry_t *dle1 = arg1; + const dsl_deadlist_entry_t *dle2 = arg2; + + return (TREE_CMP(dle1->dle_mintxg, dle2->dle_mintxg)); +} + +static int +dsl_deadlist_cache_compare(const void *arg1, const void *arg2) +{ + const dsl_deadlist_cache_entry_t *dlce1 = arg1; + const dsl_deadlist_cache_entry_t *dlce2 = arg2; + + return (TREE_CMP(dlce1->dlce_mintxg, dlce2->dlce_mintxg)); +} + +static void +dsl_deadlist_load_tree(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havecache) { + /* + * After loading the tree, the caller may modify the tree, + * e.g. to add or remove nodes, or to make a node no longer + * refer to the empty_bpobj. These changes would make the + * dl_cache incorrect. Therefore we discard the cache here, + * so that it can't become incorrect. + */ + dsl_deadlist_cache_entry_t *dlce; + void *cookie = NULL; + while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) + != NULL) { + kmem_free(dlce, sizeof (*dlce)); + } + avl_destroy(&dl->dl_cache); + dl->dl_havecache = B_FALSE; + } + if (dl->dl_havetree) + return; + + avl_create(&dl->dl_tree, dsl_deadlist_compare, + sizeof (dsl_deadlist_entry_t), + offsetof(dsl_deadlist_entry_t, dle_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + dsl_deadlist_entry_t *dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = zfs_strtonum(za.za_name, NULL); + + /* + * Prefetch all the bpobj's so that we do that i/o + * in parallel. Then open them all in a second pass. 
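+ * (dmu_prefetch() only issues the i/o; by the time the second loop
+ * calls bpobj_open(), the data should already be cached.)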
+ */ + dle->dle_bpobj.bpo_object = za.za_first_integer; + dmu_prefetch(dl->dl_os, dle->dle_bpobj.bpo_object, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + + avl_add(&dl->dl_tree, dle); + } + VERIFY3U(error, ==, ENOENT); + zap_cursor_fini(&zc); + + for (dsl_deadlist_entry_t *dle = avl_first(&dl->dl_tree); + dle != NULL; dle = AVL_NEXT(&dl->dl_tree, dle)) { + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, + dle->dle_bpobj.bpo_object)); + } + dl->dl_havetree = B_TRUE; +} + +/* + * Load only the non-empty bpobj's into the dl_cache. The cache is an analog + * of the dl_tree, but contains only non-empty_bpobj nodes from the ZAP. It + * is used only for gathering space statistics. The dl_cache has two + * advantages over the dl_tree: + * + * 1. Loading the dl_cache is ~5x faster than loading the dl_tree (if it's + * mostly empty_bpobj's), due to less CPU overhead to open the empty_bpobj + * many times and to inquire about its (zero) space stats many times. + * + * 2. The dl_cache uses less memory than the dl_tree. We only need to load + * the dl_tree of snapshots when deleting a snapshot, after which we free the + * dl_tree with dsl_deadlist_discard_tree + */ +static void +dsl_deadlist_load_cache(dsl_deadlist_t *dl) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + ASSERT(MUTEX_HELD(&dl->dl_lock)); + + ASSERT(!dl->dl_oldfmt); + if (dl->dl_havecache) + return; + + uint64_t empty_bpobj = dmu_objset_pool(dl->dl_os)->dp_empty_bpobj; + + avl_create(&dl->dl_cache, dsl_deadlist_cache_compare, + sizeof (dsl_deadlist_cache_entry_t), + offsetof(dsl_deadlist_cache_entry_t, dlce_node)); + for (zap_cursor_init(&zc, dl->dl_os, dl->dl_object); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if (za.za_first_integer == empty_bpobj) + continue; + dsl_deadlist_cache_entry_t *dlce = + kmem_zalloc(sizeof (*dlce), KM_SLEEP); + dlce->dlce_mintxg = zfs_strtonum(za.za_name, NULL); + + /* + * Prefetch all the bpobj's so that we do that i/o + * in parallel. Then open them all in a second pass. + */ + dlce->dlce_bpobj = za.za_first_integer; + dmu_prefetch(dl->dl_os, dlce->dlce_bpobj, + 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + avl_add(&dl->dl_cache, dlce); + } + VERIFY3U(error, ==, ENOENT); + zap_cursor_fini(&zc); + + for (dsl_deadlist_cache_entry_t *dlce = avl_first(&dl->dl_cache); + dlce != NULL; dlce = AVL_NEXT(&dl->dl_cache, dlce)) { + bpobj_t bpo; + VERIFY0(bpobj_open(&bpo, dl->dl_os, dlce->dlce_bpobj)); + + VERIFY0(bpobj_space(&bpo, + &dlce->dlce_bytes, &dlce->dlce_comp, &dlce->dlce_uncomp)); + bpobj_close(&bpo); + } + dl->dl_havecache = B_TRUE; +} + +/* + * Discard the tree to save memory. 
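+ * Each entry holds an open bpobj, so destroying the AVL tree also
+ * closes those bpobjs; dsl_deadlist_load_tree() rebuilds the tree
+ * on next use.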
+ */ +void +dsl_deadlist_discard_tree(dsl_deadlist_t *dl) +{ + mutex_enter(&dl->dl_lock); + + if (!dl->dl_havetree) { + mutex_exit(&dl->dl_lock); + return; + } + dsl_deadlist_entry_t *dle; + void *cookie = NULL; + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + + dl->dl_havetree = B_FALSE; + mutex_exit(&dl->dl_lock); +} + +void +dsl_deadlist_iterate(dsl_deadlist_t *dl, deadlist_iter_t func, void *args) +{ + dsl_deadlist_entry_t *dle; + + ASSERT(dsl_deadlist_is_open(dl)); + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + mutex_exit(&dl->dl_lock); + for (dle = avl_first(&dl->dl_tree); dle != NULL; + dle = AVL_NEXT(&dl->dl_tree, dle)) { + if (func(args, dle) != 0) + break; + } +} + +void +dsl_deadlist_open(dsl_deadlist_t *dl, objset_t *os, uint64_t object) +{ + dmu_object_info_t doi; + + ASSERT(!dsl_deadlist_is_open(dl)); + + mutex_init(&dl->dl_lock, NULL, MUTEX_DEFAULT, NULL); + dl->dl_os = os; + dl->dl_object = object; + VERIFY0(dmu_bonus_hold(os, object, dl, &dl->dl_dbuf)); + dmu_object_info_from_db(dl->dl_dbuf, &doi); + if (doi.doi_type == DMU_OT_BPOBJ) { + dmu_buf_rele(dl->dl_dbuf, dl); + dl->dl_dbuf = NULL; + dl->dl_oldfmt = B_TRUE; + VERIFY0(bpobj_open(&dl->dl_bpobj, os, object)); + return; + } + + dl->dl_oldfmt = B_FALSE; + dl->dl_phys = dl->dl_dbuf->db_data; + dl->dl_havetree = B_FALSE; + dl->dl_havecache = B_FALSE; +} + +boolean_t +dsl_deadlist_is_open(dsl_deadlist_t *dl) +{ + return (dl->dl_os != NULL); +} + +void +dsl_deadlist_close(dsl_deadlist_t *dl) +{ + ASSERT(dsl_deadlist_is_open(dl)); + mutex_destroy(&dl->dl_lock); + + if (dl->dl_oldfmt) { + dl->dl_oldfmt = B_FALSE; + bpobj_close(&dl->dl_bpobj); + dl->dl_os = NULL; + dl->dl_object = 0; + return; + } + + if (dl->dl_havetree) { + dsl_deadlist_entry_t *dle; + void *cookie = NULL; + while ((dle = avl_destroy_nodes(&dl->dl_tree, &cookie)) + != NULL) { + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + } + avl_destroy(&dl->dl_tree); + } + if (dl->dl_havecache) { + dsl_deadlist_cache_entry_t *dlce; + void *cookie = NULL; + while ((dlce = avl_destroy_nodes(&dl->dl_cache, &cookie)) + != NULL) { + kmem_free(dlce, sizeof (*dlce)); + } + avl_destroy(&dl->dl_cache); + } + dmu_buf_rele(dl->dl_dbuf, dl); + dl->dl_dbuf = NULL; + dl->dl_phys = NULL; + dl->dl_os = NULL; + dl->dl_object = 0; +} + +uint64_t +dsl_deadlist_alloc(objset_t *os, dmu_tx_t *tx) +{ + if (spa_version(dmu_objset_spa(os)) < SPA_VERSION_DEADLISTS) + return (bpobj_alloc(os, SPA_OLD_MAXBLOCKSIZE, tx)); + return (zap_create(os, DMU_OT_DEADLIST, DMU_OT_DEADLIST_HDR, + sizeof (dsl_deadlist_phys_t), tx)); +} + +void +dsl_deadlist_free(objset_t *os, uint64_t dlobj, dmu_tx_t *tx) +{ + dmu_object_info_t doi; + zap_cursor_t zc; + zap_attribute_t za; + int error; + + VERIFY0(dmu_object_info(os, dlobj, &doi)); + if (doi.doi_type == DMU_OT_BPOBJ) { + bpobj_free(os, dlobj, tx); + return; + } + + for (zap_cursor_init(&zc, os, dlobj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t obj = za.za_first_integer; + if (obj == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, obj, tx); + } + VERIFY3U(error, ==, ENOENT); + zap_cursor_fini(&zc); + VERIFY0(dmu_object_free(os, dlobj, tx)); +} + +static void +dle_enqueue(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx) +{ + ASSERT(MUTEX_HELD(&dl->dl_lock)); + if 
(dle->dle_bpobj.bpo_object == + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + uint64_t obj = bpobj_alloc(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } + bpobj_enqueue(&dle->dle_bpobj, bp, bp_freed, tx); +} + +static void +dle_enqueue_subobj(dsl_deadlist_t *dl, dsl_deadlist_entry_t *dle, + uint64_t obj, dmu_tx_t *tx) +{ + ASSERT(MUTEX_HELD(&dl->dl_lock)); + if (dle->dle_bpobj.bpo_object != + dmu_objset_pool(dl->dl_os)->dp_empty_bpobj) { + bpobj_enqueue_subobj(&dle->dle_bpobj, obj, tx); + } else { + bpobj_close(&dle->dle_bpobj); + bpobj_decr_empty(dl->dl_os, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + VERIFY0(zap_update_int_key(dl->dl_os, dl->dl_object, + dle->dle_mintxg, obj, tx)); + } +} + +void +dsl_deadlist_insert(dsl_deadlist_t *dl, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + avl_index_t where; + + if (dl->dl_oldfmt) { + bpobj_enqueue(&dl->dl_bpobj, bp, bp_freed, tx); + return; + } + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dmu_buf_will_dirty(dl->dl_dbuf, tx); + + int sign = bp_freed ? -1 : +1; + dl->dl_phys->dl_used += + sign * bp_get_dsize_sync(dmu_objset_spa(dl->dl_os), bp); + dl->dl_phys->dl_comp += sign * BP_GET_PSIZE(bp); + dl->dl_phys->dl_uncomp += sign * BP_GET_UCSIZE(bp); + + dle_tofind.dle_mintxg = bp->blk_birth; + dle = avl_find(&dl->dl_tree, &dle_tofind, &where); + if (dle == NULL) + dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE); + else + dle = AVL_PREV(&dl->dl_tree, dle); + + if (dle == NULL) { + zfs_panic_recover("blkptr at %p has invalid BLK_BIRTH %llu", + bp, (longlong_t)bp->blk_birth); + dle = avl_first(&dl->dl_tree); + } + + ASSERT3P(dle, !=, NULL); + dle_enqueue(dl, dle, bp, bp_freed, tx); + mutex_exit(&dl->dl_lock); +} + +int +dsl_deadlist_insert_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_FALSE, tx); + return (0); +} + +int +dsl_deadlist_insert_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_deadlist_t *dl = arg; + dsl_deadlist_insert(dl, bp, B_TRUE, tx); + return (0); +} + +/* + * Insert new key in deadlist, which must be > all current entries. + * mintxg is not inclusive. + */ +void +dsl_deadlist_add_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t obj; + dsl_deadlist_entry_t *dle; + + if (dl->dl_oldfmt) + return; + + dle = kmem_alloc(sizeof (*dle), KM_SLEEP); + dle->dle_mintxg = mintxg; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, dl->dl_os, obj)); + avl_add(&dl->dl_tree, dle); + + VERIFY0(zap_add_int_key(dl->dl_os, dl->dl_object, + mintxg, obj, tx)); + mutex_exit(&dl->dl_lock); +} + +/* + * Remove this key, merging its entries into the previous key. 
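+ * The key's bpobj is appended to the previous entry as a subobj via
+ * dle_enqueue_subobj() before the key itself is deleted from the ZAP.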
+ */ +void +dsl_deadlist_remove_key(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle, *dle_prev; + + if (dl->dl_oldfmt) + return; + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + ASSERT3P(dle, !=, NULL); + dle_prev = AVL_PREV(&dl->dl_tree, dle); + + dle_enqueue_subobj(dl, dle_prev, dle->dle_bpobj.bpo_object, tx); + + avl_remove(&dl->dl_tree, dle); + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + + VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object, mintxg, tx)); + mutex_exit(&dl->dl_lock); +} + +/* + * Remove a deadlist entry and all of its contents by removing the entry from + * the deadlist's avl tree, freeing the entry's bpobj and adjusting the + * deadlist's space accounting accordingly. + */ +void +dsl_deadlist_remove_entry(dsl_deadlist_t *dl, uint64_t mintxg, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + dsl_deadlist_entry_t dle_tofind; + dsl_deadlist_entry_t *dle; + objset_t *os = dl->dl_os; + + if (dl->dl_oldfmt) + return; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + + dle_tofind.dle_mintxg = mintxg; + dle = avl_find(&dl->dl_tree, &dle_tofind, NULL); + VERIFY3P(dle, !=, NULL); + + avl_remove(&dl->dl_tree, dle); + VERIFY0(zap_remove_int(os, dl->dl_object, mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) { + bpobj_decr_empty(os, tx); + } else { + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + } + bpobj_close(&dle->dle_bpobj); + kmem_free(dle, sizeof (*dle)); + mutex_exit(&dl->dl_lock); +} + +/* + * Clear out the contents of a deadlist_entry by freeing its bpobj, + * replacing it with an empty bpobj and adjusting the deadlist's + * space accounting + */ +void +dsl_deadlist_clear_entry(dsl_deadlist_entry_t *dle, dsl_deadlist_t *dl, + dmu_tx_t *tx) +{ + uint64_t new_obj, used, comp, uncomp; + objset_t *os = dl->dl_os; + + mutex_enter(&dl->dl_lock); + VERIFY0(zap_remove_int(os, dl->dl_object, dle->dle_mintxg, tx)); + VERIFY0(bpobj_space(&dle->dle_bpobj, &used, &comp, &uncomp)); + dmu_buf_will_dirty(dl->dl_dbuf, tx); + dl->dl_phys->dl_used -= used; + dl->dl_phys->dl_comp -= comp; + dl->dl_phys->dl_uncomp -= uncomp; + if (dle->dle_bpobj.bpo_object == dmu_objset_pool(os)->dp_empty_bpobj) + bpobj_decr_empty(os, tx); + else + bpobj_free(os, dle->dle_bpobj.bpo_object, tx); + bpobj_close(&dle->dle_bpobj); + new_obj = bpobj_alloc_empty(os, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dle->dle_bpobj, os, new_obj)); + VERIFY0(zap_add_int_key(os, dl->dl_object, dle->dle_mintxg, + new_obj, tx)); + ASSERT(bpobj_is_empty(&dle->dle_bpobj)); + mutex_exit(&dl->dl_lock); +} + +/* + * Return the first entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_first(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle = avl_first(&dl->dl_tree); + mutex_exit(&dl->dl_lock); + + return (dle); +} + +/* + * Return the last entry in deadlist's avl tree + */ +dsl_deadlist_entry_t * +dsl_deadlist_last(dsl_deadlist_t *dl) +{ + dsl_deadlist_entry_t *dle; + + mutex_enter(&dl->dl_lock); + dsl_deadlist_load_tree(dl); + dle = avl_last(&dl->dl_tree); + mutex_exit(&dl->dl_lock); + + return (dle); +} + 
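+/*
+ * Illustrative sketch (hypothetical callback, not used in this file) of
+ * the dsl_deadlist_iterate() protocol defined earlier in this file: the
+ * callback is invoked on each entry in mintxg order and returns nonzero
+ * to stop the walk early.  This one just counts entries into *arg.
+ */
+static int
+example_count_entries_cb(void *arg, dsl_deadlist_entry_t *dle)
+{
+	uint64_t *countp = arg;
+
+	(void) dle;	/* entry contents not needed for a simple count */
+	(*countp)++;
+	return (0);	/* a nonzero return would terminate the walk */
+}
+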
+/*
+ * Walk ds's snapshots to regenerate the ZAP & AVL.
+ */
+static void
+dsl_deadlist_regenerate(objset_t *os, uint64_t dlobj,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_t dl = { 0 };
+	dsl_pool_t *dp = dmu_objset_pool(os);
+
+	dsl_deadlist_open(&dl, os, dlobj);
+	if (dl.dl_oldfmt) {
+		dsl_deadlist_close(&dl);
+		return;
+	}
+
+	while (mrs_obj != 0) {
+		dsl_dataset_t *ds;
+		VERIFY0(dsl_dataset_hold_obj(dp, mrs_obj, FTAG, &ds));
+		dsl_deadlist_add_key(&dl,
+		    dsl_dataset_phys(ds)->ds_prev_snap_txg, tx);
+		mrs_obj = dsl_dataset_phys(ds)->ds_prev_snap_obj;
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dsl_deadlist_close(&dl);
+}
+
+uint64_t
+dsl_deadlist_clone(dsl_deadlist_t *dl, uint64_t maxtxg,
+    uint64_t mrs_obj, dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t *dle;
+	uint64_t newobj;
+
+	newobj = dsl_deadlist_alloc(dl->dl_os, tx);
+
+	if (dl->dl_oldfmt) {
+		dsl_deadlist_regenerate(dl->dl_os, newobj, mrs_obj, tx);
+		return (newobj);
+	}
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_tree(dl);
+
+	for (dle = avl_first(&dl->dl_tree); dle;
+	    dle = AVL_NEXT(&dl->dl_tree, dle)) {
+		uint64_t obj;
+
+		if (dle->dle_mintxg >= maxtxg)
+			break;
+
+		obj = bpobj_alloc_empty(dl->dl_os, SPA_OLD_MAXBLOCKSIZE, tx);
+		VERIFY0(zap_add_int_key(dl->dl_os, newobj,
+		    dle->dle_mintxg, obj, tx));
+	}
+	mutex_exit(&dl->dl_lock);
+	return (newobj);
+}
+
+void
+dsl_deadlist_space(dsl_deadlist_t *dl,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	ASSERT(dsl_deadlist_is_open(dl));
+	if (dl->dl_oldfmt) {
+		VERIFY0(bpobj_space(&dl->dl_bpobj,
+		    usedp, compp, uncompp));
+		return;
+	}
+
+	mutex_enter(&dl->dl_lock);
+	*usedp = dl->dl_phys->dl_used;
+	*compp = dl->dl_phys->dl_comp;
+	*uncompp = dl->dl_phys->dl_uncomp;
+	mutex_exit(&dl->dl_lock);
+}
+
+/*
+ * Return space used in the range (mintxg, maxtxg].
+ * Includes maxtxg, does not include mintxg.
+ * mintxg and maxtxg must both be keys in the deadlist (unless maxtxg is
+ * UINT64_MAX).
+ */
+void
+dsl_deadlist_space_range(dsl_deadlist_t *dl, uint64_t mintxg, uint64_t maxtxg,
+    uint64_t *usedp, uint64_t *compp, uint64_t *uncompp)
+{
+	dsl_deadlist_cache_entry_t *dlce;
+	dsl_deadlist_cache_entry_t dlce_tofind;
+	avl_index_t where;
+
+	if (dl->dl_oldfmt) {
+		VERIFY0(bpobj_space_range(&dl->dl_bpobj,
+		    mintxg, maxtxg, usedp, compp, uncompp));
+		return;
+	}
+
+	*usedp = *compp = *uncompp = 0;
+
+	mutex_enter(&dl->dl_lock);
+	dsl_deadlist_load_cache(dl);
+	dlce_tofind.dlce_mintxg = mintxg;
+	dlce = avl_find(&dl->dl_cache, &dlce_tofind, &where);
+
+	/*
+	 * If this mintxg doesn't exist, it may be an empty_bpobj which
+	 * is omitted from the sparse tree.  Start at the next non-empty
+	 * entry.
+	 */
+	if (dlce == NULL)
+		dlce = avl_nearest(&dl->dl_cache, where, AVL_AFTER);
+
+	for (; dlce && dlce->dlce_mintxg < maxtxg;
+	    dlce = AVL_NEXT(&dl->dl_cache, dlce)) {
+		*usedp += dlce->dlce_bytes;
+		*compp += dlce->dlce_comp;
+		*uncompp += dlce->dlce_uncomp;
+	}
+
+	mutex_exit(&dl->dl_lock);
+}
+
+static void
+dsl_deadlist_insert_bpobj(dsl_deadlist_t *dl, uint64_t obj, uint64_t birth,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+	uint64_t used, comp, uncomp;
+	bpobj_t bpo;
+
+	ASSERT(MUTEX_HELD(&dl->dl_lock));
+
+	VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
+	VERIFY0(bpobj_space(&bpo, &used, &comp, &uncomp));
+	bpobj_close(&bpo);
+
+	dsl_deadlist_load_tree(dl);
+
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	dl->dl_phys->dl_used += used;
+	dl->dl_phys->dl_comp += comp;
+	dl->dl_phys->dl_uncomp += uncomp;
+
+	dle_tofind.dle_mintxg = birth;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_BEFORE);
+	dle_enqueue_subobj(dl, dle, obj, tx);
+}
+
+static int
+dsl_deadlist_insert_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_t *dl = arg;
+	dsl_deadlist_insert(dl, bp, bp_freed, tx);
+	return (0);
+}
+
+/*
+ * Merge the deadlist pointed to by 'obj' into dl.  obj will be left as
+ * an empty deadlist.
+ */
+void
+dsl_deadlist_merge(dsl_deadlist_t *dl, uint64_t obj, dmu_tx_t *tx)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	dmu_buf_t *bonus;
+	dsl_deadlist_phys_t *dlp;
+	dmu_object_info_t doi;
+	int error;
+
+	VERIFY0(dmu_object_info(dl->dl_os, obj, &doi));
+	if (doi.doi_type == DMU_OT_BPOBJ) {
+		bpobj_t bpo;
+		VERIFY0(bpobj_open(&bpo, dl->dl_os, obj));
+		VERIFY0(bpobj_iterate(&bpo, dsl_deadlist_insert_cb, dl, tx));
+		bpobj_close(&bpo);
+		return;
+	}
+
+	mutex_enter(&dl->dl_lock);
+	for (zap_cursor_init(&zc, dl->dl_os, obj);
+	    (error = zap_cursor_retrieve(&zc, &za)) == 0;
+	    zap_cursor_advance(&zc)) {
+		uint64_t mintxg = zfs_strtonum(za.za_name, NULL);
+		dsl_deadlist_insert_bpobj(dl, za.za_first_integer, mintxg, tx);
+		VERIFY0(zap_remove_int(dl->dl_os, obj, mintxg, tx));
+	}
+	VERIFY3U(error, ==, ENOENT);
+	zap_cursor_fini(&zc);
+
+	VERIFY0(dmu_bonus_hold(dl->dl_os, obj, FTAG, &bonus));
+	dlp = bonus->db_data;
+	dmu_buf_will_dirty(bonus, tx);
+	bzero(dlp, sizeof (*dlp));
+	dmu_buf_rele(bonus, FTAG);
+	mutex_exit(&dl->dl_lock);
+}
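+
+/*
+ * Illustrative sketch (hypothetical helper, not used in this file) of
+ * how deadlist keys bucket block pointers: a block belongs to the entry
+ * with the greatest mintxg strictly below its birth txg, which is why
+ * dsl_deadlist_insert() and dsl_deadlist_insert_bpobj() above fall back
+ * to AVL_BEFORE/AVL_PREV after avl_find().  With keys {0, 10, 20}, a
+ * block born in txg 15 lands in the entry keyed 10.  Assumes a sorted,
+ * nonempty key array whose first key is below any valid birth txg.
+ */
+static uint64_t
+example_deadlist_bucket_key(const uint64_t *keys, int nkeys, uint64_t birth)
+{
+	uint64_t key = keys[0];
+
+	/* Pick the greatest key < birth, mirroring the AVL lookup. */
+	for (int i = 0; i < nkeys; i++) {
+		if (keys[i] < birth)
+			key = keys[i];
+	}
+	return (key);
+}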
+
+/*
+ * Remove entries on dl that are born > mintxg, and put them on the bpobj.
+ */
+void
+dsl_deadlist_move_bpobj(dsl_deadlist_t *dl, bpobj_t *bpo, uint64_t mintxg,
+    dmu_tx_t *tx)
+{
+	dsl_deadlist_entry_t dle_tofind;
+	dsl_deadlist_entry_t *dle;
+	avl_index_t where;
+
+	ASSERT(!dl->dl_oldfmt);
+
+	mutex_enter(&dl->dl_lock);
+	dmu_buf_will_dirty(dl->dl_dbuf, tx);
+	dsl_deadlist_load_tree(dl);
+
+	dle_tofind.dle_mintxg = mintxg;
+	dle = avl_find(&dl->dl_tree, &dle_tofind, &where);
+	if (dle == NULL)
+		dle = avl_nearest(&dl->dl_tree, where, AVL_AFTER);
+	while (dle) {
+		uint64_t used, comp, uncomp;
+		dsl_deadlist_entry_t *dle_next;
+
+		bpobj_enqueue_subobj(bpo, dle->dle_bpobj.bpo_object, tx);
+
+		VERIFY0(bpobj_space(&dle->dle_bpobj,
+		    &used, &comp, &uncomp));
+		ASSERT3U(dl->dl_phys->dl_used, >=, used);
+		ASSERT3U(dl->dl_phys->dl_comp, >=, comp);
+		ASSERT3U(dl->dl_phys->dl_uncomp, >=, uncomp);
+		dl->dl_phys->dl_used -= used;
+		dl->dl_phys->dl_comp -= comp;
+		dl->dl_phys->dl_uncomp -= uncomp;
+
+		VERIFY0(zap_remove_int(dl->dl_os, dl->dl_object,
+		    dle->dle_mintxg, tx));
+
+		dle_next = AVL_NEXT(&dl->dl_tree, dle);
+		avl_remove(&dl->dl_tree, dle);
+		bpobj_close(&dle->dle_bpobj);
+		kmem_free(dle, sizeof (*dle));
+		dle = dle_next;
+	}
+	mutex_exit(&dl->dl_lock);
+}
+
+typedef struct livelist_entry {
+	const blkptr_t *le_bp;
+	avl_node_t le_node;
+} livelist_entry_t;
+
+static int
+livelist_compare(const void *larg, const void *rarg)
+{
+	const blkptr_t *l = ((livelist_entry_t *)larg)->le_bp;
+	const blkptr_t *r = ((livelist_entry_t *)rarg)->le_bp;
+
+	/* Sort them according to dva[0] */
+	uint64_t l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]);
+	uint64_t r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]);
+
+	if (l_dva0_vdev != r_dva0_vdev)
+		return (TREE_CMP(l_dva0_vdev, r_dva0_vdev));
+
+	/* if vdevs are equal, sort by offsets. */
+	uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]);
+	uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]);
+	if (l_dva0_offset == r_dva0_offset)
+		ASSERT3U(l->blk_birth, ==, r->blk_birth);
+	return (TREE_CMP(l_dva0_offset, r_dva0_offset));
+}
+
+struct livelist_iter_arg {
+	avl_tree_t *avl;
+	bplist_t *to_free;
+	zthr_t *t;
+};
+
+/*
+ * Expects an AVL tree which is incrementally filled with FREE blkptrs
+ * and used to match up ALLOC/FREE pairs.  ALLOC'd blkptrs without a
+ * corresponding FREE are stored in the supplied bplist.
+ */
+static int
+dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed,
+    dmu_tx_t *tx)
+{
+	struct livelist_iter_arg *lia = arg;
+	avl_tree_t *avl = lia->avl;
+	bplist_t *to_free = lia->to_free;
+	zthr_t *t = lia->t;
+	ASSERT(tx == NULL);
+
+	if ((t != NULL) && (zthr_has_waiters(t) || zthr_iscancelled(t)))
+		return (SET_ERROR(EINTR));
+	if (bp_freed) {
+		livelist_entry_t *node = kmem_alloc(sizeof (livelist_entry_t),
+		    KM_SLEEP);
+		blkptr_t *temp_bp = kmem_alloc(sizeof (blkptr_t), KM_SLEEP);
+		*temp_bp = *bp;
+		node->le_bp = temp_bp;
+		avl_add(avl, node);
+	} else {
+		livelist_entry_t node;
+		node.le_bp = bp;
+		livelist_entry_t *found = avl_find(avl, &node, NULL);
+		if (found != NULL) {
+			avl_remove(avl, found);
+			kmem_free((blkptr_t *)found->le_bp, sizeof (blkptr_t));
+			kmem_free(found, sizeof (livelist_entry_t));
+		} else {
+			bplist_append(to_free, bp);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Accepts a bpobj and a bplist.
Will insert into the bplist the blkptrs + * which have an ALLOC entry but no matching FREE + */ +int +dsl_process_sub_livelist(bpobj_t *bpobj, bplist_t *to_free, zthr_t *t, + uint64_t *size) +{ + avl_tree_t avl; + avl_create(&avl, livelist_compare, sizeof (livelist_entry_t), + offsetof(livelist_entry_t, le_node)); + + /* process the sublist */ + struct livelist_iter_arg arg = { + .avl = &avl, + .to_free = to_free, + .t = t + }; + int err = bpobj_iterate_nofree(bpobj, dsl_livelist_iterate, &arg, size); + + avl_destroy(&avl); + return (err); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, max_entries, ULONG, ZMOD_RW, + "Size to start the next sub-livelist in a livelist"); + +ZFS_MODULE_PARAM(zfs_livelist, zfs_livelist_, min_percent_shared, INT, ZMOD_RW, + "Threshold at which livelist is disabled"); +/* END CSTYLED */ diff --git a/module/zfs/dsl_deleg.c b/module/zfs/dsl_deleg.c new file mode 100644 index 000000000000..cf8a3c9bbdfb --- /dev/null +++ b/module/zfs/dsl_deleg.c @@ -0,0 +1,774 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2015 by Delphix. All rights reserved. + */ + +/* + * DSL permissions are stored in a two level zap attribute + * mechanism. The first level identifies the "class" of + * entry. The class is identified by the first 2 letters of + * the attribute. The second letter "l" or "d" identifies whether + * it is a local or descendent permission. The first letter + * identifies the type of entry. + * + * ul$ identifies permissions granted locally for this userid. + * ud$ identifies permissions granted on descendent datasets for + * this userid. + * Ul$ identifies permission sets granted locally for this userid. + * Ud$ identifies permission sets granted on descendent datasets for + * this userid. + * gl$ identifies permissions granted locally for this groupid. + * gd$ identifies permissions granted on descendent datasets for + * this groupid. + * Gl$ identifies permission sets granted locally for this groupid. + * Gd$ identifies permission sets granted on descendent datasets for + * this groupid. + * el$ identifies permissions granted locally for everyone. + * ed$ identifies permissions granted on descendent datasets + * for everyone. + * El$ identifies permission sets granted locally for everyone. + * Ed$ identifies permission sets granted to descendent datasets for + * everyone. + * c-$ identifies permission to create at dataset creation time. + * C-$ identifies permission sets to grant locally at dataset creation + * time. 
+ * s-$@ permissions defined in specified set @
+ * S-$@ Sets defined in named set @
+ *
+ * Each of the above entities points to another zap attribute that contains one
+ * attribute for each allowed permission, such as create, destroy,...
+ * All of the "upper" case class types will specify permission set names
+ * rather than permissions.
+ *
+ * Basically it looks something like this:
+ * ul$12 -> ZAP OBJ -> permissions...
+ *
+ * The ZAP OBJ is referred to as the jump object.
+ */
+
+#include <sys/dmu.h>
+#include <sys/dmu_objset.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_dir.h>
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_deleg.h>
+#include <sys/spa.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/cred.h>
+#include <sys/sysmacros.h>
+
+#include "zfs_deleg.h"
+
+/*
+ * Validate that user is allowed to delegate specified permissions.
+ *
+ * In order to delegate "create" you must have "create"
+ * and "allow".
+ */
+int
+dsl_deleg_can_allow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+	nvpair_t *whopair = NULL;
+	int error;
+
+	if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+		return (error);
+
+	while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+		nvlist_t *perms;
+		nvpair_t *permpair = NULL;
+
+		VERIFY(nvpair_value_nvlist(whopair, &perms) == 0);
+
+		while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+			const char *perm = nvpair_name(permpair);
+
+			if (strcmp(perm, ZFS_DELEG_PERM_ALLOW) == 0)
+				return (SET_ERROR(EPERM));
+
+			if ((error = dsl_deleg_access(ddname, perm, cr)) != 0)
+				return (error);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Validate that user is allowed to unallow specified permissions.  They
+ * must have the 'allow' permission, and even then can only unallow
+ * perms for their uid.
+ */
+int
+dsl_deleg_can_unallow(char *ddname, nvlist_t *nvp, cred_t *cr)
+{
+	nvpair_t *whopair = NULL;
+	int error;
+	char idstr[32];
+
+	if ((error = dsl_deleg_access(ddname, ZFS_DELEG_PERM_ALLOW, cr)) != 0)
+		return (error);
+
+	(void) snprintf(idstr, sizeof (idstr), "%lld",
+	    (longlong_t)crgetuid(cr));
+
+	while ((whopair = nvlist_next_nvpair(nvp, whopair))) {
+		zfs_deleg_who_type_t type = nvpair_name(whopair)[0];
+
+		if (type != ZFS_DELEG_USER &&
+		    type != ZFS_DELEG_USER_SETS)
+			return (SET_ERROR(EPERM));
+
+		if (strcmp(idstr, &nvpair_name(whopair)[3]) != 0)
+			return (SET_ERROR(EPERM));
+	}
+	return (0);
+}
+
+typedef struct dsl_deleg_arg {
+	const char *dda_name;
+	nvlist_t *dda_nvlist;
+} dsl_deleg_arg_t;
+
+static void
+dsl_deleg_set_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	nvpair_t *whopair = NULL;
+	uint64_t zapobj;
+
+	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+	if (zapobj == 0) {
+		dmu_buf_will_dirty(dd->dd_dbuf, tx);
+		zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos,
+		    DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx);
+	}
+
+	while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
+		const char *whokey = nvpair_name(whopair);
+		nvlist_t *perms;
+		nvpair_t *permpair = NULL;
+		uint64_t jumpobj;
+
+		perms = fnvpair_value_nvlist(whopair);
+
+		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0) {
+			jumpobj = zap_create_link(mos, DMU_OT_DSL_PERMS,
+			    zapobj, whokey, tx);
+		}
+
+		while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+			const char *perm = nvpair_name(permpair);
+			uint64_t n = 0;
+
+			VERIFY(zap_update(mos, jumpobj,
+			    perm, 8, 1, &n, tx) == 0);
+			spa_history_log_internal_dd(dd, "permission update", tx,
+			    "%s %s", whokey, perm);
+		}
+	}
+	dsl_dir_rele(dd, FTAG);
+}
+
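+/*
+ * Hypothetical sketch (not called anywhere in this file) of the whokey
+ * strings the sync functions above and below consume.  Per the class
+ * table at the top of this file, and the nvpair_name(whopair)[0] and
+ * [3] accesses above, a whokey is a class character, a scope character
+ * ('l' for local, 'd' for descendent), a '$', and then the identifier;
+ * a local grant to uid 12 is "ul$12".  zfs_deleg_whokey() is the real
+ * encoder.
+ */
+static void
+example_local_user_whokey(char *buf, size_t buflen, uint64_t uid)
+{
+	/* 'u' == ZFS_DELEG_USER, 'l' == ZFS_DELEG_LOCAL */
+	(void) snprintf(buf, buflen, "%c%c$%llu",
+	    ZFS_DELEG_USER, ZFS_DELEG_LOCAL, (u_longlong_t)uid);
+}
+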
+static void
+dsl_deleg_unset_sync(void *arg, dmu_tx_t *tx)
+{
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	objset_t *mos = dp->dp_meta_objset;
+	nvpair_t *whopair = NULL;
+	uint64_t zapobj;
+
+	VERIFY0(dsl_dir_hold(dp, dda->dda_name, FTAG, &dd, NULL));
+	zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj;
+	if (zapobj == 0) {
+		dsl_dir_rele(dd, FTAG);
+		return;
+	}
+
+	while ((whopair = nvlist_next_nvpair(dda->dda_nvlist, whopair))) {
+		const char *whokey = nvpair_name(whopair);
+		nvlist_t *perms;
+		nvpair_t *permpair = NULL;
+		uint64_t jumpobj;
+
+		if (nvpair_value_nvlist(whopair, &perms) != 0) {
+			if (zap_lookup(mos, zapobj, whokey, 8,
+			    1, &jumpobj) == 0) {
+				(void) zap_remove(mos, zapobj, whokey, tx);
+				VERIFY(0 == zap_destroy(mos, jumpobj, tx));
+			}
+			spa_history_log_internal_dd(dd, "permission who remove",
+			    tx, "%s", whokey);
+			continue;
+		}
+
+		if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) != 0)
+			continue;
+
+		while ((permpair = nvlist_next_nvpair(perms, permpair))) {
+			const char *perm = nvpair_name(permpair);
+			uint64_t n = 0;
+
+			(void) zap_remove(mos, jumpobj, perm, tx);
+			if (zap_count(mos, jumpobj, &n) == 0 && n == 0) {
+				(void) zap_remove(mos, zapobj,
+				    whokey, tx);
+				VERIFY(0 == zap_destroy(mos,
+				    jumpobj, tx));
+			}
+			spa_history_log_internal_dd(dd, "permission remove", tx,
+			    "%s %s", whokey, perm);
+		}
+	}
+	dsl_dir_rele(dd, FTAG);
+}
+
+static int
+dsl_deleg_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_deleg_arg_t *dda = arg;
+	dsl_dir_t *dd;
+	int error;
+
+	if (spa_version(dmu_tx_pool(tx)->dp_spa) <
+	    SPA_VERSION_DELEGATED_PERMS) {
+		return (SET_ERROR(ENOTSUP));
+	}
+
+	error = dsl_dir_hold(dmu_tx_pool(tx), dda->dda_name, FTAG, &dd, NULL);
+	if (error == 0)
+		dsl_dir_rele(dd, FTAG);
+	return (error);
+}
+
+int
+dsl_deleg_set(const char *ddname, nvlist_t *nvp, boolean_t unset)
+{
+	dsl_deleg_arg_t dda;
+
+	/* nvp must already have been verified to be valid */
+
+	dda.dda_name = ddname;
+	dda.dda_nvlist = nvp;
+
+	return (dsl_sync_task(ddname, dsl_deleg_check,
+	    unset ? dsl_deleg_unset_sync : dsl_deleg_set_sync,
+	    &dda, fnvlist_num_pairs(nvp), ZFS_SPACE_CHECK_RESERVED));
+}
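+
+/*
+ * Hypothetical caller-side sketch (names and ids are illustrative) of
+ * the nvlist shape dsl_deleg_set() expects: whokeys mapping to nvlists
+ * of permission names.  This grants uid 1001 "create" and "mount"
+ * locally on ddname.  nvp must already have been validated.
+ */
+static int
+example_allow_local_uid(const char *ddname)
+{
+	nvlist_t *perms = fnvlist_alloc();
+	nvlist_t *nvp = fnvlist_alloc();
+	int error;
+
+	fnvlist_add_boolean(perms, "create");
+	fnvlist_add_boolean(perms, "mount");
+	fnvlist_add_nvlist(nvp, "ul$1001", perms);
+
+	error = dsl_deleg_set(ddname, nvp, B_FALSE);
+
+	fnvlist_free(nvp);
+	fnvlist_free(perms);
+	return (error);
+}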
+
+/*
+ * Find all 'allow' permissions from a given point and then continue
+ * traversing up to the root.
+ *
+ * This function constructs an nvlist of nvlists: each setpoint maps to
+ * an nvlist of whokeys, and each whokey maps to an nvlist of the
+ * individual user/group/everyone/create permissions.
+ *
+ * The nvlist will look like this:
+ *
+ * { source fsname -> { whokeys { permissions,...}, ...}}
+ *
+ * The fsname nvpairs will be arranged in a bottom up order.  For example,
+ * if we have the following structure a/b/c then the nvpairs for the fsnames
+ * will be ordered a/b/c, a/b, a.
+ */
+int
+dsl_deleg_get(const char *ddname, nvlist_t **nvp)
+{
+	dsl_dir_t *dd, *startdd;
+	dsl_pool_t *dp;
+	int error;
+	objset_t *mos;
+	zap_cursor_t *basezc, *zc;
+	zap_attribute_t *baseza, *za;
+	char *source;
+
+	error = dsl_pool_hold(ddname, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dir_hold(dp, ddname, FTAG, &startdd, NULL);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	dp = startdd->dd_pool;
+	mos = dp->dp_meta_objset;
+
+	zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+	za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+	basezc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP);
+	baseza = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP);
+	source = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP);
+	VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+	for (dd = startdd; dd != NULL; dd = dd->dd_parent) {
+		nvlist_t *sp_nvp;
+		uint64_t n;
+
+		if (dsl_dir_phys(dd)->dd_deleg_zapobj == 0 ||
+		    zap_count(mos,
+		    dsl_dir_phys(dd)->dd_deleg_zapobj, &n) != 0 || n == 0)
+			continue;
+
+		sp_nvp = fnvlist_alloc();
+		for (zap_cursor_init(basezc, mos,
+		    dsl_dir_phys(dd)->dd_deleg_zapobj);
+		    zap_cursor_retrieve(basezc, baseza) == 0;
+		    zap_cursor_advance(basezc)) {
+			nvlist_t *perms_nvp;
+
+			ASSERT(baseza->za_integer_length == 8);
+			ASSERT(baseza->za_num_integers == 1);
+
+			perms_nvp = fnvlist_alloc();
+			for (zap_cursor_init(zc, mos, baseza->za_first_integer);
+			    zap_cursor_retrieve(zc, za) == 0;
+			    zap_cursor_advance(zc)) {
+				fnvlist_add_boolean(perms_nvp, za->za_name);
+			}
+			zap_cursor_fini(zc);
+			fnvlist_add_nvlist(sp_nvp, baseza->za_name, perms_nvp);
+			fnvlist_free(perms_nvp);
+		}
+
+		zap_cursor_fini(basezc);
+
+		dsl_dir_name(dd, source);
+		fnvlist_add_nvlist(*nvp, source, sp_nvp);
+		nvlist_free(sp_nvp);
+	}
+
+	kmem_free(source, ZFS_MAX_DATASET_NAME_LEN);
+	kmem_free(baseza, sizeof (zap_attribute_t));
+	kmem_free(basezc, sizeof (zap_cursor_t));
+	kmem_free(za, sizeof (zap_attribute_t));
+	kmem_free(zc, sizeof (zap_cursor_t));
+
+	dsl_dir_rele(startdd, FTAG);
+	dsl_pool_rele(dp, FTAG);
+	return (0);
+}
+
+/*
+ * Routines for dsl_deleg_access() -- access checking.
+ */
+typedef struct perm_set {
+	avl_node_t p_node;
+	boolean_t p_matched;
+	char p_setname[ZFS_MAX_DELEG_NAME];
+} perm_set_t;
+
+static int
+perm_set_compare(const void *arg1, const void *arg2)
+{
+	const perm_set_t *node1 = (const perm_set_t *)arg1;
+	const perm_set_t *node2 = (const perm_set_t *)arg2;
+	int val;
+
+	val = strcmp(node1->p_setname, node2->p_setname);
+
+	return (TREE_ISIGN(val));
+}
+
+/*
+ * Determine whether a specified permission exists.
+ *
+ * First the base attribute has to be retrieved, i.e. ul$12.
+ * Once the base object has been retrieved the actual permission
+ * is looked up in the zap object the base object points to.
+ *
+ * Return 0 if permission exists, ENOENT if there is no whokey, EPERM if
+ * there is no perm in that jumpobj.
+ */
+static int
+dsl_check_access(objset_t *mos, uint64_t zapobj,
+    char type, char checkflag, void *valp, const char *perm)
+{
+	int error;
+	uint64_t jumpobj, zero;
+	char whokey[ZFS_MAX_DELEG_NAME];
+
+	zfs_deleg_whokey(whokey, type, checkflag, valp);
+	error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+	if (error == 0) {
+		error = zap_lookup(mos, jumpobj, perm, 8, 1, &zero);
+		if (error == ENOENT)
+			error = SET_ERROR(EPERM);
+	}
+	return (error);
+}
+
+/*
+ * Check a specified user/group for a requested permission.
+ */
+static int
+dsl_check_user_access(objset_t *mos, uint64_t zapobj, const char *perm,
+    int checkflag, cred_t *cr)
+{
+	const gid_t *gids;
+	int ngids;
+	int i;
+	uint64_t id;
+
+	/* check for user */
+	id = crgetuid(cr);
+	if (dsl_check_access(mos, zapobj,
+	    ZFS_DELEG_USER, checkflag, &id, perm) == 0)
+		return (0);
+
+	/* check for the user's primary group */
+	id = crgetgid(cr);
+	if (dsl_check_access(mos, zapobj,
+	    ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+		return (0);
+
+	/* check for everyone entry */
+	id = -1;
+	if (dsl_check_access(mos, zapobj,
+	    ZFS_DELEG_EVERYONE, checkflag, &id, perm) == 0)
+		return (0);
+
+	/* check each supplemental group the user is a member of */
+	ngids = crgetngroups(cr);
+	gids = crgetgroups(cr);
+	for (i = 0; i != ngids; i++) {
+		id = gids[i];
+		if (dsl_check_access(mos, zapobj,
+		    ZFS_DELEG_GROUP, checkflag, &id, perm) == 0)
+			return (0);
+	}
+
+	return (SET_ERROR(EPERM));
+}
+
+/*
+ * Iterate over the sets specified in the specified zapobj
+ * and load them into the permsets avl tree.
+ */
+static int
+dsl_load_sets(objset_t *mos, uint64_t zapobj,
+    char type, char checkflag, void *valp, avl_tree_t *avl)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	perm_set_t *permnode;
+	avl_index_t idx;
+	uint64_t jumpobj;
+	int error;
+	char whokey[ZFS_MAX_DELEG_NAME];
+
+	zfs_deleg_whokey(whokey, type, checkflag, valp);
+
+	error = zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj);
+	if (error != 0)
+		return (error);
+
+	for (zap_cursor_init(&zc, mos, jumpobj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+		permnode = kmem_alloc(sizeof (perm_set_t), KM_SLEEP);
+		(void) strlcpy(permnode->p_setname, za.za_name,
+		    sizeof (permnode->p_setname));
+		permnode->p_matched = B_FALSE;
+
+		if (avl_find(avl, permnode, &idx) == NULL) {
+			avl_insert(avl, permnode, idx);
+		} else {
+			kmem_free(permnode, sizeof (perm_set_t));
+		}
+	}
+	zap_cursor_fini(&zc);
+	return (0);
+}
+
+/*
+ * Load all the permission sets the user (based on cred) belongs to.
+ */
+static void
+dsl_load_user_sets(objset_t *mos, uint64_t zapobj, avl_tree_t *avl,
+    char checkflag, cred_t *cr)
+{
+	const gid_t *gids;
+	int ngids, i;
+	uint64_t id;
+
+	id = crgetuid(cr);
+	(void) dsl_load_sets(mos, zapobj,
+	    ZFS_DELEG_USER_SETS, checkflag, &id, avl);
+
+	id = crgetgid(cr);
+	(void) dsl_load_sets(mos, zapobj,
+	    ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+
+	(void) dsl_load_sets(mos, zapobj,
+	    ZFS_DELEG_EVERYONE_SETS, checkflag, NULL, avl);
+
+	ngids = crgetngroups(cr);
+	gids = crgetgroups(cr);
+	for (i = 0; i != ngids; i++) {
+		id = gids[i];
+		(void) dsl_load_sets(mos, zapobj,
+		    ZFS_DELEG_GROUP_SETS, checkflag, &id, avl);
+	}
+}
+
+/*
+ * Check if user has requested permission.
+ */ +int +dsl_deleg_access_impl(dsl_dataset_t *ds, const char *perm, cred_t *cr) +{ + dsl_dir_t *dd; + dsl_pool_t *dp; + void *cookie; + int error; + char checkflag; + objset_t *mos; + avl_tree_t permsets; + perm_set_t *setnode; + + dp = ds->ds_dir->dd_pool; + mos = dp->dp_meta_objset; + + if (dsl_delegation_on(mos) == B_FALSE) + return (SET_ERROR(ECANCELED)); + + if (spa_version(dmu_objset_spa(dp->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return (SET_ERROR(EPERM)); + + if (ds->ds_is_snapshot) { + /* + * Snapshots are treated as descendents only, + * local permissions do not apply. + */ + checkflag = ZFS_DELEG_DESCENDENT; + } else { + checkflag = ZFS_DELEG_LOCAL; + } + + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), + offsetof(perm_set_t, p_node)); + + ASSERT(dsl_pool_config_held(dp)); + for (dd = ds->ds_dir; dd != NULL; dd = dd->dd_parent, + checkflag = ZFS_DELEG_DESCENDENT) { + uint64_t zapobj; + boolean_t expanded; + + /* + * If not in global zone then make sure + * the zoned property is set + */ + if (!INGLOBALZONE(curproc)) { + uint64_t zoned; + + if (dsl_prop_get_dd(dd, + zfs_prop_to_name(ZFS_PROP_ZONED), + 8, 1, &zoned, NULL, B_FALSE) != 0) + break; + if (!zoned) + break; + } + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + + if (zapobj == 0) + continue; + + dsl_load_user_sets(mos, zapobj, &permsets, checkflag, cr); +again: + expanded = B_FALSE; + for (setnode = avl_first(&permsets); setnode; + setnode = AVL_NEXT(&permsets, setnode)) { + if (setnode->p_matched == B_TRUE) + continue; + + /* See if this set directly grants this permission */ + error = dsl_check_access(mos, zapobj, + ZFS_DELEG_NAMED_SET, 0, setnode->p_setname, perm); + if (error == 0) + goto success; + if (error == EPERM) + setnode->p_matched = B_TRUE; + + /* See if this set includes other sets */ + error = dsl_load_sets(mos, zapobj, + ZFS_DELEG_NAMED_SET_SETS, 0, + setnode->p_setname, &permsets); + if (error == 0) + setnode->p_matched = expanded = B_TRUE; + } + /* + * If we expanded any sets, that will define more sets, + * which we need to check. + */ + if (expanded) + goto again; + + error = dsl_check_user_access(mos, zapobj, perm, checkflag, cr); + if (error == 0) + goto success; + } + error = SET_ERROR(EPERM); +success: + + cookie = NULL; + while ((setnode = avl_destroy_nodes(&permsets, &cookie)) != NULL) + kmem_free(setnode, sizeof (perm_set_t)); + + return (error); +} + +int +dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int error; + + error = dsl_pool_hold(dsname, FTAG, &dp); + if (error != 0) + return (error); + error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + error = dsl_deleg_access_impl(ds, perm, cr); + dsl_dataset_rele(ds, FTAG); + } + dsl_pool_rele(dp, FTAG); + + return (error); +} + +/* + * Other routines. + */ + +static void +copy_create_perms(dsl_dir_t *dd, uint64_t pzapobj, + boolean_t dosets, uint64_t uid, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + uint64_t jumpobj, pjumpobj; + uint64_t zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + zap_cursor_t zc; + zap_attribute_t za; + char whokey[ZFS_MAX_DELEG_NAME]; + + zfs_deleg_whokey(whokey, + dosets ? 
ZFS_DELEG_CREATE_SETS : ZFS_DELEG_CREATE, + ZFS_DELEG_LOCAL, NULL); + if (zap_lookup(mos, pzapobj, whokey, 8, 1, &pjumpobj) != 0) + return; + + if (zapobj == 0) { + dmu_buf_will_dirty(dd->dd_dbuf, tx); + zapobj = dsl_dir_phys(dd)->dd_deleg_zapobj = zap_create(mos, + DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + } + + zfs_deleg_whokey(whokey, + dosets ? ZFS_DELEG_USER_SETS : ZFS_DELEG_USER, + ZFS_DELEG_LOCAL, &uid); + if (zap_lookup(mos, zapobj, whokey, 8, 1, &jumpobj) == ENOENT) { + jumpobj = zap_create(mos, DMU_OT_DSL_PERMS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(mos, zapobj, whokey, 8, 1, &jumpobj, tx) == 0); + } + + for (zap_cursor_init(&zc, mos, pjumpobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zero = 0; + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + + VERIFY(zap_update(mos, jumpobj, za.za_name, + 8, 1, &zero, tx) == 0); + } + zap_cursor_fini(&zc); +} + +/* + * set all create time permission on new dataset. + */ +void +dsl_deleg_set_create_perms(dsl_dir_t *sdd, dmu_tx_t *tx, cred_t *cr) +{ + dsl_dir_t *dd; + uint64_t uid = crgetuid(cr); + + if (spa_version(dmu_objset_spa(sdd->dd_pool->dp_meta_objset)) < + SPA_VERSION_DELEGATED_PERMS) + return; + + for (dd = sdd->dd_parent; dd != NULL; dd = dd->dd_parent) { + uint64_t pzapobj = dsl_dir_phys(dd)->dd_deleg_zapobj; + + if (pzapobj == 0) + continue; + + copy_create_perms(sdd, pzapobj, B_FALSE, uid, tx); + copy_create_perms(sdd, pzapobj, B_TRUE, uid, tx); + } +} + +int +dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (zapobj == 0) + return (0); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == 8 && za.za_num_integers == 1); + VERIFY(0 == zap_destroy(mos, za.za_first_integer, tx)); + } + zap_cursor_fini(&zc); + VERIFY(0 == zap_destroy(mos, zapobj, tx)); + return (0); +} + +boolean_t +dsl_delegation_on(objset_t *os) +{ + return (!!spa_delegation(os->os_spa)); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(dsl_deleg_get); +EXPORT_SYMBOL(dsl_deleg_set); +#endif diff --git a/module/zfs/dsl_destroy.c b/module/zfs/dsl_destroy.c new file mode 100644 index 000000000000..26fdf96341b9 --- /dev/null +++ b/module/zfs/dsl_destroy.c @@ -0,0 +1,1286 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2013 by Joyent, Inc. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. 
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dsl_userhold.h>
+#include <sys/dsl_dataset.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_destroy.h>
+#include <sys/dsl_bookmark.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_dir.h>
+#include <sys/dmu_traverse.h>
+#include <sys/dsl_scan.h>
+#include <sys/dmu_objset.h>
+#include <sys/zap.h>
+#include <sys/zfeature.h>
+#include <sys/zfs_ioctl.h>
+#include <sys/dsl_deleg.h>
+#include <sys/dmu_impl.h>
+#include <sys/zvol.h>
+#include <sys/zcp.h>
+#include <sys/dsl_deadlist.h>
+#include <sys/zthr.h>
+#include <sys/spa_impl.h>
+
+int
+dsl_destroy_snapshot_check_impl(dsl_dataset_t *ds, boolean_t defer)
+{
+	if (!ds->ds_is_snapshot)
+		return (SET_ERROR(EINVAL));
+
+	if (dsl_dataset_long_held(ds))
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Only allow deferred destroy on pools that support it.
+	 * NOTE: deferred destroy is only supported on snapshots.
+	 */
+	if (defer) {
+		if (spa_version(ds->ds_dir->dd_pool->dp_spa) <
+		    SPA_VERSION_USERREFS)
+			return (SET_ERROR(ENOTSUP));
+		return (0);
+	}
+
+	/*
+	 * If this snapshot has an elevated user reference count,
+	 * we can't destroy it yet.
+	 */
+	if (ds->ds_userrefs > 0)
+		return (SET_ERROR(EBUSY));
+
+	/*
+	 * Can't delete a branch point.
+	 */
+	if (dsl_dataset_phys(ds)->ds_num_children > 1)
+		return (SET_ERROR(EEXIST));
+
+	return (0);
+}
+
+int
+dsl_destroy_snapshot_check(void *arg, dmu_tx_t *tx)
+{
+	dsl_destroy_snapshot_arg_t *ddsa = arg;
+	const char *dsname = ddsa->ddsa_name;
+	boolean_t defer = ddsa->ddsa_defer;
+
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	int error = 0;
+	dsl_dataset_t *ds;
+
+	error = dsl_dataset_hold(dp, dsname, FTAG, &ds);
+
+	/*
+	 * If the snapshot does not exist, silently ignore it, and
+	 * dsl_destroy_snapshot_sync() will be a no-op
+	 * (it's "already destroyed").
+	 */
+	if (error == ENOENT)
+		return (0);
+
+	if (error == 0) {
+		error = dsl_destroy_snapshot_check_impl(ds, defer);
+		dsl_dataset_rele(ds, FTAG);
+	}
+
+	return (error);
+}
+
+struct process_old_arg {
+	dsl_dataset_t *ds;
+	dsl_dataset_t *ds_prev;
+	boolean_t after_branch_point;
+	zio_t *pio;
+	uint64_t used, comp, uncomp;
+};
+
+static int
+process_old_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, dmu_tx_t *tx)
+{
+	struct process_old_arg *poa = arg;
+	dsl_pool_t *dp = poa->ds->ds_dir->dd_pool;
+
+	ASSERT(!BP_IS_HOLE(bp));
+
+	if (bp->blk_birth <= dsl_dataset_phys(poa->ds)->ds_prev_snap_txg) {
+		dsl_deadlist_insert(&poa->ds->ds_deadlist, bp, bp_freed, tx);
+		if (poa->ds_prev && !poa->after_branch_point &&
+		    bp->blk_birth >
+		    dsl_dataset_phys(poa->ds_prev)->ds_prev_snap_txg) {
+			dsl_dataset_phys(poa->ds_prev)->ds_unique_bytes +=
+			    bp_get_dsize_sync(dp->dp_spa, bp);
+		}
+	} else {
+		poa->used += bp_get_dsize_sync(dp->dp_spa, bp);
+		poa->comp += BP_GET_PSIZE(bp);
+		poa->uncomp += BP_GET_UCSIZE(bp);
+		dsl_free_sync(poa->pio, dp, tx->tx_txg, bp);
+	}
+	return (0);
+}
+
+static void
+process_old_deadlist(dsl_dataset_t *ds, dsl_dataset_t *ds_prev,
+    dsl_dataset_t *ds_next, boolean_t after_branch_point, dmu_tx_t *tx)
+{
+	struct process_old_arg poa = { 0 };
+	dsl_pool_t *dp = ds->ds_dir->dd_pool;
+	objset_t *mos = dp->dp_meta_objset;
+	uint64_t deadlist_obj;
+
+	ASSERT(ds->ds_deadlist.dl_oldfmt);
+	ASSERT(ds_next->ds_deadlist.dl_oldfmt);
+
+	poa.ds = ds;
+	poa.ds_prev = ds_prev;
+	poa.after_branch_point = after_branch_point;
+	poa.pio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED);
+	VERIFY0(bpobj_iterate(&ds_next->ds_deadlist.dl_bpobj,
+	    process_old_cb, &poa, tx));
+	VERIFY0(zio_wait(poa.pio));
+	ASSERT3U(poa.used, ==, dsl_dataset_phys(ds)->ds_unique_bytes);
+
+	/* change snapused */
+	dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP,
+	    -poa.used, -poa.comp, -poa.uncomp, tx);
+
+	/* swap next's deadlist to our deadlist */
+	dsl_deadlist_close(&ds->ds_deadlist);
+	dsl_deadlist_close(&ds_next->ds_deadlist);
+	deadlist_obj =
dsl_dataset_phys(ds)->ds_deadlist_obj; + dsl_dataset_phys(ds)->ds_deadlist_obj = + dsl_dataset_phys(ds_next)->ds_deadlist_obj; + dsl_dataset_phys(ds_next)->ds_deadlist_obj = deadlist_obj; + dsl_deadlist_open(&ds->ds_deadlist, mos, + dsl_dataset_phys(ds)->ds_deadlist_obj); + dsl_deadlist_open(&ds_next->ds_deadlist, mos, + dsl_dataset_phys(ds_next)->ds_deadlist_obj); +} + +typedef struct remaining_clones_key { + dsl_dataset_t *rck_clone; + list_node_t rck_node; +} remaining_clones_key_t; + +static remaining_clones_key_t * +rck_alloc(dsl_dataset_t *clone) +{ + remaining_clones_key_t *rck = kmem_alloc(sizeof (*rck), KM_SLEEP); + rck->rck_clone = clone; + return (rck); +} + +static void +dsl_dir_remove_clones_key_impl(dsl_dir_t *dd, uint64_t mintxg, dmu_tx_t *tx, + list_t *stack, void *tag) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + + /* + * If it is the old version, dd_clones doesn't exist so we can't + * find the clones, but dsl_deadlist_remove_key() is a no-op so it + * doesn't matter. + */ + if (dsl_dir_phys(dd)->dd_clones == 0) + return; + + zap_cursor_t *zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + zap_attribute_t *za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + for (zap_cursor_init(zc, mos, dsl_dir_phys(dd)->dd_clones); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + dsl_dataset_t *clone; + + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + za->za_first_integer, tag, &clone)); + + if (clone->ds_dir->dd_origin_txg > mintxg) { + dsl_deadlist_remove_key(&clone->ds_deadlist, + mintxg, tx); + + if (dsl_dataset_remap_deadlist_exists(clone)) { + dsl_deadlist_remove_key( + &clone->ds_remap_deadlist, mintxg, tx); + } + + list_insert_head(stack, rck_alloc(clone)); + } else { + dsl_dataset_rele(clone, tag); + } + } + zap_cursor_fini(zc); + + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); +} + +void +dsl_dir_remove_clones_key(dsl_dir_t *top_dd, uint64_t mintxg, dmu_tx_t *tx) +{ + list_t stack; + + list_create(&stack, sizeof (remaining_clones_key_t), + offsetof(remaining_clones_key_t, rck_node)); + + dsl_dir_remove_clones_key_impl(top_dd, mintxg, tx, &stack, FTAG); + for (remaining_clones_key_t *rck = list_remove_head(&stack); + rck != NULL; rck = list_remove_head(&stack)) { + dsl_dataset_t *clone = rck->rck_clone; + dsl_dir_t *clone_dir = clone->ds_dir; + + kmem_free(rck, sizeof (*rck)); + + dsl_dir_remove_clones_key_impl(clone_dir, mintxg, tx, + &stack, FTAG); + dsl_dataset_rele(clone, FTAG); + } + + list_destroy(&stack); +} + +static void +dsl_destroy_snapshot_handle_remaps(dsl_dataset_t *ds, dsl_dataset_t *ds_next, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + + /* Move blocks to be obsoleted to pool's obsolete list. */ + if (dsl_dataset_remap_deadlist_exists(ds_next)) { + if (!bpobj_is_open(&dp->dp_obsolete_bpobj)) + dsl_pool_create_obsolete_bpobj(dp, tx); + + dsl_deadlist_move_bpobj(&ds_next->ds_remap_deadlist, + &dp->dp_obsolete_bpobj, + dsl_dataset_phys(ds)->ds_prev_snap_txg, tx); + } + + /* Merge our deadlist into next's and free it. 
*/ + if (dsl_dataset_remap_deadlist_exists(ds)) { + uint64_t remap_deadlist_object = + dsl_dataset_get_remap_deadlist_object(ds); + ASSERT(remap_deadlist_object != 0); + + mutex_enter(&ds_next->ds_remap_deadlist_lock); + if (!dsl_dataset_remap_deadlist_exists(ds_next)) + dsl_dataset_create_remap_deadlist(ds_next, tx); + mutex_exit(&ds_next->ds_remap_deadlist_lock); + + dsl_deadlist_merge(&ds_next->ds_remap_deadlist, + remap_deadlist_object, tx); + dsl_dataset_destroy_remap_deadlist(ds, tx); + } +} + +void +dsl_destroy_snapshot_sync_impl(dsl_dataset_t *ds, boolean_t defer, dmu_tx_t *tx) +{ + int after_branch_point = FALSE; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + dsl_dataset_t *ds_prev = NULL; + uint64_t obj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + ASSERT(zfs_refcount_is_zero(&ds->ds_longholds)); + + if (defer && + (ds->ds_userrefs > 0 || + dsl_dataset_phys(ds)->ds_num_children > 1)) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_DEFER_DESTROY; + spa_history_log_internal_ds(ds, "defer_destroy", tx, " "); + return; + } + + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + + /* We need to log before removing it from the namespace. */ + spa_history_log_internal_ds(ds, "destroy", tx, " "); + + dsl_scan_ds_destroyed(ds, tx); + + obj = ds->ds_object; + + boolean_t book_exists = dsl_bookmark_ds_destroyed(ds, tx); + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (dsl_dataset_feature_is_active(ds, f)) + dsl_dataset_deactivate_feature(ds, f, tx); + } + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + ASSERT3P(ds->ds_prev, ==, NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &ds_prev)); + after_branch_point = + (dsl_dataset_phys(ds_prev)->ds_next_snap_obj != obj); + + dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); + if (after_branch_point && + dsl_dataset_phys(ds_prev)->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds_prev, obj, tx); + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { + VERIFY0(zap_add_int(mos, + dsl_dataset_phys(ds_prev)-> + ds_next_clones_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + tx)); + } + } + if (!after_branch_point) { + dsl_dataset_phys(ds_prev)->ds_next_snap_obj = + dsl_dataset_phys(ds)->ds_next_snap_obj; + } + } + + dsl_dataset_t *ds_next; + uint64_t old_unique; + uint64_t used = 0, comp = 0, uncomp = 0; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_next_snap_obj, FTAG, &ds_next)); + ASSERT3U(dsl_dataset_phys(ds_next)->ds_prev_snap_obj, ==, obj); + + old_unique = dsl_dataset_phys(ds_next)->ds_unique_bytes; + + dmu_buf_will_dirty(ds_next->ds_dbuf, tx); + dsl_dataset_phys(ds_next)->ds_prev_snap_obj = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + dsl_dataset_phys(ds_next)->ds_prev_snap_txg = + dsl_dataset_phys(ds)->ds_prev_snap_txg; + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_txg, ==, + ds_prev ? dsl_dataset_phys(ds_prev)->ds_creation_txg : 0); + + if (ds_next->ds_deadlist.dl_oldfmt) { + process_old_deadlist(ds, ds_prev, ds_next, + after_branch_point, tx); + } else { + /* Adjust prev's unique space. 
*/ + if (ds_prev && !after_branch_point) { + dsl_deadlist_space_range(&ds_next->ds_deadlist, + dsl_dataset_phys(ds_prev)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + &used, &comp, &uncomp); + dsl_dataset_phys(ds_prev)->ds_unique_bytes += used; + } + + /* Adjust snapused. */ + dsl_deadlist_space_range(&ds_next->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, UINT64_MAX, + &used, &comp, &uncomp); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_SNAP, + -used, -comp, -uncomp, tx); + + /* Move blocks to be freed to pool's free list. */ + dsl_deadlist_move_bpobj(&ds_next->ds_deadlist, + &dp->dp_free_bpobj, dsl_dataset_phys(ds)->ds_prev_snap_txg, + tx); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, + DD_USED_HEAD, used, comp, uncomp, tx); + + /* Merge our deadlist into next's and free it. */ + dsl_deadlist_merge(&ds_next->ds_deadlist, + dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + + /* + * We are done with the deadlist tree (generated/used + * by dsl_deadlist_move_bpobj() and dsl_deadlist_merge()). + * Discard it to save memory. + */ + dsl_deadlist_discard_tree(&ds_next->ds_deadlist); + } + + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + + dsl_destroy_snapshot_handle_remaps(ds, ds_next, tx); + + if (!book_exists) { + /* Collapse range in clone heads */ + dsl_dir_remove_clones_key(ds->ds_dir, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } + + if (ds_next->ds_is_snapshot) { + dsl_dataset_t *ds_nextnext; + + /* + * Update next's unique to include blocks which + * were previously shared by only this snapshot + * and it. Those blocks will be born after the + * prev snap and before this snap, and will have + * died after the next snap and before the one + * after that (ie. be on the snap after next's + * deadlist). + */ + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds_next)->ds_next_snap_obj, + FTAG, &ds_nextnext)); + dsl_deadlist_space_range(&ds_nextnext->ds_deadlist, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + dsl_dataset_phys(ds)->ds_creation_txg, + &used, &comp, &uncomp); + dsl_dataset_phys(ds_next)->ds_unique_bytes += used; + dsl_dataset_rele(ds_nextnext, FTAG); + ASSERT3P(ds_next->ds_prev, ==, NULL); + + /* Collapse range in this head. */ + dsl_dataset_t *hds; + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, + FTAG, &hds)); + if (!book_exists) { + /* Collapse range in this head. */ + dsl_deadlist_remove_key(&hds->ds_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } + if (dsl_dataset_remap_deadlist_exists(hds)) { + dsl_deadlist_remove_key(&hds->ds_remap_deadlist, + dsl_dataset_phys(ds)->ds_creation_txg, tx); + } + dsl_dataset_rele(hds, FTAG); + + } else { + ASSERT3P(ds_next->ds_prev, ==, ds); + dsl_dataset_rele(ds_next->ds_prev, ds_next); + ds_next->ds_prev = NULL; + if (ds_prev) { + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds_next, &ds_next->ds_prev)); + } + + dsl_dataset_recalc_head_uniq(ds_next); + + /* + * Reduce the amount of our unconsumed refreservation + * being charged to our parent by the amount of + * new unique data we have gained. 
+ */ + if (old_unique < ds_next->ds_reserved) { + int64_t mrsdelta; + uint64_t new_unique = + dsl_dataset_phys(ds_next)->ds_unique_bytes; + + ASSERT(old_unique <= new_unique); + mrsdelta = MIN(new_unique - old_unique, + ds_next->ds_reserved - old_unique); + dsl_dir_diduse_space(ds->ds_dir, + DD_USED_REFRSRV, -mrsdelta, 0, 0, tx); + } + } + dsl_dataset_rele(ds_next, FTAG); + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* remove from snapshot namespace */ + dsl_dataset_t *ds_head; + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj == 0); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj, FTAG, &ds_head)); + VERIFY0(dsl_dataset_get_snapname(ds)); +#ifdef ZFS_DEBUG + { + uint64_t val; + int err; + + err = dsl_dataset_snap_lookup(ds_head, + ds->ds_snapname, &val); + ASSERT0(err); + ASSERT3U(val, ==, obj); + } +#endif + VERIFY0(dsl_dataset_snap_remove(ds_head, ds->ds_snapname, tx, B_TRUE)); + dsl_dataset_rele(ds_head, FTAG); + + if (ds_prev != NULL) + dsl_dataset_rele(ds_prev, FTAG); + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + uint64_t count __maybe_unused; + ASSERT0(zap_count(mos, + dsl_dataset_phys(ds)->ds_next_clones_obj, &count) && + count == 0); + VERIFY0(dmu_object_free(mos, + dsl_dataset_phys(ds)->ds_next_clones_obj, tx)); + } + if (dsl_dataset_phys(ds)->ds_props_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_props_obj, + tx)); + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) + VERIFY0(zap_destroy(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + tx)); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + dmu_object_free_zapified(mos, obj, tx); +} + +void +dsl_destroy_snapshot_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_snapshot_arg_t *ddsa = arg; + const char *dsname = ddsa->ddsa_name; + boolean_t defer = ddsa->ddsa_defer; + + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == ENOENT) + return; + ASSERT0(error); + dsl_destroy_snapshot_sync_impl(ds, defer, tx); + zvol_remove_minors(dp->dp_spa, dsname, B_TRUE); + dsl_dataset_rele(ds, FTAG); +} + +/* + * The semantics of this function are described in the comment above + * lzc_destroy_snaps(). To summarize: + * + * The snapshots must all be in the same pool. + * + * Snapshots that don't exist will be silently ignored (considered to be + * "already deleted"). + * + * On success, all snaps will be destroyed and this will return 0. + * On failure, no snaps will be destroyed, the errlist will be filled in, + * and this will return an errno. + */ +int +dsl_destroy_snapshots_nvl(nvlist_t *snaps, boolean_t defer, + nvlist_t *errlist) +{ + if (nvlist_next_nvpair(snaps, NULL) == NULL) + return (0); + + /* + * lzc_destroy_snaps() is documented to take an nvlist whose + * values "don't matter". We need to convert that nvlist to + * one that we know can be converted to LUA. We also don't + * care about any duplicate entries because the nvlist will + * be converted to a LUA table which should take care of this. 
+ */ + nvlist_t *snaps_normalized; + VERIFY0(nvlist_alloc(&snaps_normalized, 0, KM_SLEEP)); + for (nvpair_t *pair = nvlist_next_nvpair(snaps, NULL); + pair != NULL; pair = nvlist_next_nvpair(snaps, pair)) { + fnvlist_add_boolean_value(snaps_normalized, + nvpair_name(pair), B_TRUE); + } + + nvlist_t *arg; + VERIFY0(nvlist_alloc(&arg, 0, KM_SLEEP)); + fnvlist_add_nvlist(arg, "snaps", snaps_normalized); + fnvlist_free(snaps_normalized); + fnvlist_add_boolean_value(arg, "defer", defer); + + nvlist_t *wrapper; + VERIFY0(nvlist_alloc(&wrapper, 0, KM_SLEEP)); + fnvlist_add_nvlist(wrapper, ZCP_ARG_ARGLIST, arg); + fnvlist_free(arg); + + const char *program = + "arg = ...\n" + "snaps = arg['snaps']\n" + "defer = arg['defer']\n" + "errors = { }\n" + "has_errors = false\n" + "for snap, v in pairs(snaps) do\n" + " errno = zfs.check.destroy{snap, defer=defer}\n" + " zfs.debug('snap: ' .. snap .. ' errno: ' .. errno)\n" + " if errno == ENOENT then\n" + " snaps[snap] = nil\n" + " elseif errno ~= 0 then\n" + " errors[snap] = errno\n" + " has_errors = true\n" + " end\n" + "end\n" + "if has_errors then\n" + " return errors\n" + "end\n" + "for snap, v in pairs(snaps) do\n" + " errno = zfs.sync.destroy{snap, defer=defer}\n" + " assert(errno == 0)\n" + "end\n" + "return { }\n"; + + nvlist_t *result = fnvlist_alloc(); + int error = zcp_eval(nvpair_name(nvlist_next_nvpair(snaps, NULL)), + program, + B_TRUE, + 0, + zfs_lua_max_memlimit, + nvlist_next_nvpair(wrapper, NULL), result); + if (error != 0) { + char *errorstr = NULL; + (void) nvlist_lookup_string(result, ZCP_RET_ERROR, &errorstr); + if (errorstr != NULL) { + zfs_dbgmsg(errorstr); + } + fnvlist_free(wrapper); + fnvlist_free(result); + return (error); + } + fnvlist_free(wrapper); + + /* + * lzc_destroy_snaps() is documented to fill the errlist with + * int32 values, so we need to convert the int64 values that are + * returned from LUA. + */ + int rv = 0; + nvlist_t *errlist_raw = fnvlist_lookup_nvlist(result, ZCP_RET_RETURN); + for (nvpair_t *pair = nvlist_next_nvpair(errlist_raw, NULL); + pair != NULL; pair = nvlist_next_nvpair(errlist_raw, pair)) { + int32_t val = (int32_t)fnvpair_value_int64(pair); + if (rv == 0) + rv = val; + fnvlist_add_int32(errlist, nvpair_name(pair), val); + } + fnvlist_free(result); + return (rv); +} + +int +dsl_destroy_snapshot(const char *name, boolean_t defer) +{ + int error; + nvlist_t *nvl = fnvlist_alloc(); + nvlist_t *errlist = fnvlist_alloc(); + + fnvlist_add_boolean(nvl, name); + error = dsl_destroy_snapshots_nvl(nvl, defer, errlist); + fnvlist_free(errlist); + fnvlist_free(nvl); + return (error); +} + +struct killarg { + dsl_dataset_t *ds; + dmu_tx_t *tx; +}; + +/* ARGSUSED */ +static int +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; + + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp)) + return (0); + + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); + /* + * It's a block in the intent log. It has no + * accounting, so just free it. 
+ */ + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); + } else { + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, + dsl_dataset_phys(ka->ds)->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); + } + + return (0); +} + +static void +old_synchronous_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + struct killarg ka; + + spa_history_log_internal_ds(ds, "destroy", tx, + "(synchronous, mintxg=%llu)", + (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg); + + /* + * Free everything that we point to (that's born after + * the previous snapshot, if we are a clone) + * + * NB: this should be very quick, because we already + * freed all the objects in open context. + */ + ka.ds = ds; + ka.tx = tx; + VERIFY0(traverse_dataset(ds, + dsl_dataset_phys(ds)->ds_prev_snap_txg, TRAVERSE_POST | + TRAVERSE_NO_DECRYPT, kill_blkptr, &ka)); + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == 0); +} + +int +dsl_destroy_head_check_impl(dsl_dataset_t *ds, int expected_holds) +{ + int error; + uint64_t count; + objset_t *mos; + + ASSERT(!ds->ds_is_snapshot); + if (ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (zfs_refcount_count(&ds->ds_longholds) != expected_holds) + return (SET_ERROR(EBUSY)); + + ASSERT0(ds->ds_dir->dd_activity_waiters); + + mos = ds->ds_dir->dd_pool->dp_meta_objset; + + /* + * Can't delete a head dataset if there are snapshots of it. + * (Except if the only snapshots are from the branch we cloned + * from.) + */ + if (ds->ds_prev != NULL && + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj == ds->ds_object) + return (SET_ERROR(EBUSY)); + + /* + * Can't delete if there are children of this fs. + */ + error = zap_count(mos, + dsl_dir_phys(ds->ds_dir)->dd_child_dir_zapobj, &count); + if (error != 0) + return (error); + if (count != 0) + return (SET_ERROR(EEXIST)); + + if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev) && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0) { + /* We need to remove the origin snapshot as well. */ + if (!zfs_refcount_is_zero(&ds->ds_prev->ds_longholds)) + return (SET_ERROR(EBUSY)); + } + return (0); +} + +int +dsl_destroy_head_check(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + + error = dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_destroy_head_check_impl(ds, 0); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_destroy_sync(uint64_t ddobj, dmu_tx_t *tx) +{ + dsl_dir_t *dd; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + dd_used_t t; + + ASSERT(RRW_WRITE_HELD(&dmu_tx_pool(tx)->dp_config_rwlock)); + + VERIFY0(dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd)); + + ASSERT0(dsl_dir_phys(dd)->dd_head_dataset_obj); + + /* Decrement the filesystem count for all parent filesystems. */ + if (dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, -1, + DD_FIELD_FILESYSTEM_COUNT, tx); + + /* + * Remove our reservation. The impl() routine avoids setting the + * actual property, which would require the (already destroyed) ds. 
+ */ + dsl_dir_set_reservation_sync_impl(dd, 0, tx); + + ASSERT0(dsl_dir_phys(dd)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dd)->dd_reserved); + for (t = 0; t < DD_USED_NUM; t++) + ASSERT0(dsl_dir_phys(dd)->dd_used_breakdown[t]); + + if (dd->dd_crypto_obj != 0) { + dsl_crypto_key_destroy_sync(dd->dd_crypto_obj, tx); + (void) spa_keystore_unload_wkey_impl(dp->dp_spa, dd->dd_object); + } + + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_child_dir_zapobj, tx)); + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_props_zapobj, tx)); + if (dsl_dir_phys(dd)->dd_clones != 0) + VERIFY0(zap_destroy(mos, dsl_dir_phys(dd)->dd_clones, tx)); + VERIFY0(dsl_deleg_destroy(mos, dsl_dir_phys(dd)->dd_deleg_zapobj, tx)); + VERIFY0(zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, + dd->dd_myname, tx)); + + dsl_dir_rele(dd, FTAG); + dmu_object_free_zapified(mos, ddobj, tx); +} + +static void +dsl_clone_destroy_assert(dsl_dir_t *dd) +{ + uint64_t used, comp, uncomp; + + ASSERT(dsl_dir_is_clone(dd)); + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + + ASSERT3U(dsl_dir_phys(dd)->dd_used_bytes, ==, used); + ASSERT3U(dsl_dir_phys(dd)->dd_compressed_bytes, ==, comp); + /* + * Greater than because we do not track embedded block pointers in + * the livelist + */ + ASSERT3U(dsl_dir_phys(dd)->dd_uncompressed_bytes, >=, uncomp); + + ASSERT(list_is_empty(&dd->dd_pending_allocs.bpl_list)); + ASSERT(list_is_empty(&dd->dd_pending_frees.bpl_list)); +} + +/* + * Start the delete process for a clone. Free its zil, verify the space usage + * and queue the blkptrs for deletion by adding the livelist to the pool-wide + * delete queue. + */ +static void +dsl_async_clone_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t zap_obj, to_delete, used, comp, uncomp; + objset_t *os; + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + VERIFY0(dmu_objset_from_ds(ds, &os)); + + uint64_t mintxg = 0; + dsl_deadlist_entry_t *dle = dsl_deadlist_first(&dd->dd_livelist); + if (dle != NULL) + mintxg = dle->dle_mintxg; + + spa_history_log_internal_ds(ds, "destroy", tx, + "(livelist, mintxg=%llu)", (long long)mintxg); + + /* Check that the clone is in a correct state to be deleted */ + dsl_clone_destroy_assert(dd); + + /* Destroy the zil */ + zil_destroy_sync(dmu_objset_zil(os), tx); + + VERIFY0(zap_lookup(mos, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &to_delete)); + /* Initialize deleted_clones entry to track livelists to cleanup */ + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, &zap_obj); + if (error == ENOENT) { + zap_obj = zap_create(mos, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, sizeof (uint64_t), 1, + &(zap_obj), tx)); + spa->spa_livelists_to_delete = zap_obj; + } else if (error != 0) { + zfs_panic_recover("zfs: error %d was returned while looking " + "up DMU_POOL_DELETED_CLONES in the zap", error); + return; + } + VERIFY0(zap_add_int(mos, zap_obj, to_delete, tx)); + + /* Clone is no longer using space, now tracked by dp_free_dir */ + dsl_deadlist_space(&dd->dd_livelist, &used, &comp, &uncomp); + dsl_dir_diduse_space(dd, DD_USED_HEAD, + -used, -comp, -dsl_dir_phys(dd)->dd_uncompressed_bytes, + tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); + dsl_dir_remove_livelist(dd, tx, B_FALSE); + zthr_wakeup(spa->spa_livelist_delete_zthr); +} 
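+/*
+ * The entries queued above are consumed asynchronously by the pool's
+ * livelist-delete zthr. For reference, a minimal sketch of walking the
+ * queue (hypothetical debugging snippet, not part of the shipped code;
+ * assumes objset_t *mos and spa_t *spa in scope):
+ *
+ *	zap_cursor_t zc;
+ *	zap_attribute_t za;
+ *	for (zap_cursor_init(&zc, mos, spa->spa_livelists_to_delete);
+ *	    zap_cursor_retrieve(&zc, &za) == 0; zap_cursor_advance(&zc))
+ *		zfs_dbgmsg("livelist %llu awaiting deletion",
+ *		    (u_longlong_t)za.za_first_integer);
+ *	zap_cursor_fini(&zc);
+ */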
+ +/* + * Move the bptree into the pool's list of trees to clean up, update space + * accounting information and destroy the zil. + */ +static void +dsl_async_dataset_destroy(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + uint64_t used, comp, uncomp; + objset_t *os; + + VERIFY0(dmu_objset_from_ds(ds, &os)); + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + + spa_history_log_internal_ds(ds, "destroy", tx, + "(bptree, mintxg=%llu)", + (long long)dsl_dataset_phys(ds)->ds_prev_snap_txg); + + zil_destroy_sync(dmu_objset_zil(os), tx); + + if (!spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_scan_t *scn = dp->dp_scan; + spa_feature_incr(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY, + tx); + dp->dp_bptree_obj = bptree_alloc(mos, tx); + VERIFY0(zap_add(mos, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj, tx)); + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = B_TRUE; + } + + used = dsl_dir_phys(ds->ds_dir)->dd_used_bytes; + comp = dsl_dir_phys(ds->ds_dir)->dd_compressed_bytes; + uncomp = dsl_dir_phys(ds->ds_dir)->dd_uncompressed_bytes; + + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || + dsl_dataset_phys(ds)->ds_unique_bytes == used); + + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + bptree_add(mos, dp->dp_bptree_obj, + &dsl_dataset_phys(ds)->ds_bp, + dsl_dataset_phys(ds)->ds_prev_snap_txg, + used, comp, uncomp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + dsl_dir_diduse_space(ds->ds_dir, DD_USED_HEAD, + -used, -comp, -uncomp, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + used, comp, uncomp, tx); +} + +void +dsl_destroy_head_sync_impl(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + uint64_t obj, ddobj, prevobj = 0; + boolean_t rmorigin; + + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + ASSERT(ds->ds_prev == NULL || + dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj != ds->ds_object); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT3U(dsl_dataset_phys(ds)->ds_bp.blk_birth, <=, tx->tx_txg); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + dsl_dir_cancel_waiters(ds->ds_dir); + + rmorigin = (dsl_dir_is_clone(ds->ds_dir) && + DS_IS_DEFER_DESTROY(ds->ds_prev) && + dsl_dataset_phys(ds->ds_prev)->ds_num_children == 2 && + ds->ds_prev->ds_userrefs == 0); + + /* Remove our reservation. */ + if (ds->ds_reserved != 0) { + dsl_dataset_set_refreservation_sync_impl(ds, + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + 0, tx); + ASSERT0(ds->ds_reserved); + } + + obj = ds->ds_object; + + for (spa_feature_t f = 0; f < SPA_FEATURES; f++) { + if (dsl_dataset_feature_is_active(ds, f)) + dsl_dataset_deactivate_feature(ds, f, tx); + } + + dsl_scan_ds_destroyed(ds, tx); + + if (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + /* This is a clone */ + ASSERT(ds->ds_prev != NULL); + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_next_snap_obj, !=, + obj); + ASSERT0(dsl_dataset_phys(ds)->ds_next_snap_obj); + + dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); + if (dsl_dataset_phys(ds->ds_prev)->ds_next_clones_obj != 0) { + dsl_dataset_remove_from_next_clones(ds->ds_prev, + obj, tx); + } + + ASSERT3U(dsl_dataset_phys(ds->ds_prev)->ds_num_children, >, 1); + dsl_dataset_phys(ds->ds_prev)->ds_num_children--; + } + + /* + * Destroy the deadlist. Unless it's a clone, the + * deadlist should be empty since the dataset has no snapshots. 
+ * (If it's a clone, it's safe to ignore the deadlist contents + * since they are still referenced by the origin snapshot.) + */ + dsl_deadlist_close(&ds->ds_deadlist); + dsl_deadlist_free(mos, dsl_dataset_phys(ds)->ds_deadlist_obj, tx); + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_deadlist_obj = 0; + + if (dsl_dataset_remap_deadlist_exists(ds)) + dsl_dataset_destroy_remap_deadlist(ds, tx); + + /* + * Each destroy is responsible for both destroying (enqueuing + * to be destroyed) the blkptrs comprising the dataset as well as + * those belonging to the zil. + */ + if (dsl_deadlist_is_open(&ds->ds_dir->dd_livelist)) { + dsl_async_clone_destroy(ds, tx); + } else if (spa_feature_is_enabled(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY)) { + dsl_async_dataset_destroy(ds, tx); + } else { + old_synchronous_dataset_destroy(ds, tx); + } + + if (ds->ds_prev != NULL) { + if (spa_version(dp->dp_spa) >= SPA_VERSION_DIR_CLONES) { + VERIFY0(zap_remove_int(mos, + dsl_dir_phys(ds->ds_prev->ds_dir)->dd_clones, + ds->ds_object, tx)); + } + prevobj = ds->ds_prev->ds_object; + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = NULL; + } + + /* + * This must be done after the dsl_traverse(), because it will + * re-open the objset. + */ + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* Erase the link in the dir */ + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + dsl_dir_phys(ds->ds_dir)->dd_head_dataset_obj = 0; + ddobj = ds->ds_dir->dd_object; + ASSERT(dsl_dataset_phys(ds)->ds_snapnames_zapobj != 0); + VERIFY0(zap_destroy(mos, + dsl_dataset_phys(ds)->ds_snapnames_zapobj, tx)); + + if (ds->ds_bookmarks_obj != 0) { + void *cookie = NULL; + dsl_bookmark_node_t *dbn; + + while ((dbn = avl_destroy_nodes(&ds->ds_bookmarks, &cookie)) != + NULL) { + if (dbn->dbn_phys.zbm_redaction_obj != 0) { + VERIFY0(dmu_object_free(mos, + dbn->dbn_phys.zbm_redaction_obj, tx)); + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_REDACTION_BOOKMARKS, tx); + } + if (dbn->dbn_phys.zbm_flags & ZBM_FLAG_HAS_FBN) { + spa_feature_decr(dmu_objset_spa(mos), + SPA_FEATURE_BOOKMARK_WRITTEN, tx); + } + spa_strfree(dbn->dbn_name); + mutex_destroy(&dbn->dbn_lock); + kmem_free(dbn, sizeof (*dbn)); + } + avl_destroy(&ds->ds_bookmarks); + VERIFY0(zap_destroy(mos, ds->ds_bookmarks_obj, tx)); + spa_feature_decr(dp->dp_spa, SPA_FEATURE_BOOKMARKS, tx); + } + + spa_prop_clear_bootfs(dp->dp_spa, ds->ds_object, tx); + + ASSERT0(dsl_dataset_phys(ds)->ds_next_clones_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_props_obj); + ASSERT0(dsl_dataset_phys(ds)->ds_userrefs_obj); + dsl_dir_rele(ds->ds_dir, ds); + ds->ds_dir = NULL; + dmu_object_free_zapified(mos, obj, tx); + + dsl_dir_destroy_sync(ddobj, tx); + + if (rmorigin) { + dsl_dataset_t *prev; + VERIFY0(dsl_dataset_hold_obj(dp, prevobj, FTAG, &prev)); + dsl_destroy_snapshot_sync_impl(prev, B_FALSE, tx); + dsl_dataset_rele(prev, FTAG); + } +} + +void +dsl_destroy_head_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + dsl_destroy_head_sync_impl(ds, tx); + zvol_remove_minors(dp->dp_spa, ddha->ddha_name, B_TRUE); + dsl_dataset_rele(ds, FTAG); +} + +static void +dsl_destroy_head_begin_sync(void *arg, dmu_tx_t *tx) +{ + dsl_destroy_head_arg_t *ddha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, ddha->ddha_name, FTAG, &ds)); + + /* Mark it as inconsistent on-disk, in case 
we crash */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_flags |= DS_FLAG_INCONSISTENT; + + spa_history_log_internal_ds(ds, "destroy begin", tx, " "); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_destroy_head(const char *name) +{ + dsl_destroy_head_arg_t ddha; + int error; + spa_t *spa; + boolean_t isenabled; + +#ifdef _KERNEL + zfs_destroy_unmount_origin(name); +#endif + + error = spa_open(name, &spa, FTAG); + if (error != 0) + return (error); + isenabled = spa_feature_is_enabled(spa, SPA_FEATURE_ASYNC_DESTROY); + spa_close(spa, FTAG); + + ddha.ddha_name = name; + + if (!isenabled) { + objset_t *os; + + error = dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_begin_sync, &ddha, + 0, ZFS_SPACE_CHECK_DESTROY); + if (error != 0) + return (error); + + /* + * Head deletion is processed in one txg on old pools; + * remove the objects from open context so that the txg sync + * is not too long. This optimization can only work for + * encrypted datasets if the wrapping key is loaded. + */ + error = dmu_objset_own(name, DMU_OST_ANY, B_FALSE, B_TRUE, + FTAG, &os); + if (error == 0) { + uint64_t prev_snap_txg = + dsl_dataset_phys(dmu_objset_ds(os))-> + ds_prev_snap_txg; + for (uint64_t obj = 0; error == 0; + error = dmu_object_next(os, &obj, FALSE, + prev_snap_txg)) + (void) dmu_free_long_object(os, obj); + /* sync out all frees */ + txg_wait_synced(dmu_objset_pool(os), 0); + dmu_objset_disown(os, B_TRUE, FTAG); + } + } + + return (dsl_sync_task(name, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, 0, ZFS_SPACE_CHECK_DESTROY)); +} + +/* + * Note, this function is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. + */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + objset_t *os; + + if (dmu_objset_hold(dsname, FTAG, &os) == 0) { + boolean_t need_destroy = DS_IS_INCONSISTENT(dmu_objset_ds(os)); + + /* + * If the dataset is inconsistent because a resumable receive + * has failed, then do not destroy it. + */ + if (dsl_dataset_has_resume_receive_state(dmu_objset_ds(os))) + need_destroy = B_FALSE; + + dmu_objset_rele(os, FTAG); + if (need_destroy) + (void) dsl_destroy_head(dsname); + } + return (0); +} + + +#if defined(_KERNEL) +EXPORT_SYMBOL(dsl_destroy_head); +EXPORT_SYMBOL(dsl_destroy_head_sync_impl); +EXPORT_SYMBOL(dsl_dataset_user_hold_check_one); +EXPORT_SYMBOL(dsl_destroy_snapshot_sync_impl); +EXPORT_SYMBOL(dsl_destroy_inconsistent); +EXPORT_SYMBOL(dsl_dataset_user_release_tmp); +EXPORT_SYMBOL(dsl_destroy_head_check_impl); +#endif diff --git a/module/zfs/dsl_dir.c b/module/zfs/dsl_dir.c new file mode 100644 index 000000000000..73b7943b8f99 --- /dev/null +++ b/module/zfs/dsl_dir.c @@ -0,0 +1,2414 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska. All rights reserved. + * Copyright (c) 2014 Joyent, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2018, loli10K . All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#ifdef _KERNEL +#include +#endif + +/* + * Filesystem and Snapshot Limits + * ------------------------------ + * + * These limits are used to restrict the number of filesystems and/or snapshots + * that can be created at a given level in the tree or below. A typical + * use-case is with a delegated dataset where the administrator wants to ensure + * that a user within the zone is not creating too many additional filesystems + * or snapshots, even though they're not exceeding their space quota. + * + * The filesystem and snapshot counts are stored as extensible properties. This + * capability is controlled by a feature flag and must be enabled to be used. + * Once enabled, the feature is not active until the first limit is set. At + * that point, future operations to create/destroy filesystems or snapshots + * will validate and update the counts. + * + * Because the count properties will not exist before the feature is active, + * the counts are updated when a limit is first set on an uninitialized + * dsl_dir node in the tree (The filesystem/snapshot count on a node includes + * all of the nested filesystems/snapshots. Thus, a new leaf node has a + * filesystem count of 0 and a snapshot count of 0. Non-existent filesystem and + * snapshot count properties on a node indicate uninitialized counts on that + * node.) When first setting a limit on an uninitialized node, the code starts + * at the filesystem with the new limit and descends into all sub-filesystems + * to add the count properties. + * + * In practice this is lightweight since a limit is typically set when the + * filesystem is created and thus has no children. Once valid, changing the + * limit value won't require a re-traversal since the counts are already valid. + * When recursively fixing the counts, if a node with a limit is encountered + * during the descent, the counts are known to be valid and there is no need to + * descend into that filesystem's children. The counts on filesystems above the + * one with the new limit will still be uninitialized, unless a limit is + * eventually set on one of those filesystems. The counts are always recursively + * updated when a limit is set on a dataset, unless there is already a limit. + * When a new limit value is set on a filesystem with an existing limit, it is + * possible for the new limit to be less than the current count at that level + * since a user who can change the limit is also allowed to exceed the limit. 
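+ *
+ * For example, setting the first limit on pool/a when leaf filesystems
+ * pool/a/b and pool/a/c already exist initializes filesystem counts of 0 on
+ * b and c and 2 on a; a limit later set on pool/a/b triggers no re-traversal,
+ * because b's counts are already known to be valid.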
+ *
+ * Once the feature is active, whenever a filesystem or snapshot is
+ * created, the code recurses up the tree, validating the new count against
+ * the limit at each initialized level. In practice, most levels will not
+ * have a limit set. If there is a limit at any initialized level up the
+ * tree, the check must pass or the creation will fail. Likewise, when a
+ * filesystem or snapshot is destroyed, the counts are recursively adjusted
+ * all the way up the initialized nodes in the tree. Renaming a filesystem
+ * to a different point in the tree will first validate, then update the
+ * counts on each branch up to the common ancestor. A receive will also
+ * validate the counts and then update them.
+ *
+ * An exception to the above behavior is that the limit is not enforced if
+ * the user has permission to modify the limit. This is primarily so that
+ * recursive snapshots in the global zone always work. We want to prevent a
+ * denial-of-service in which a lower-level delegated dataset could max out
+ * its limit and thus block recursive snapshots from being taken in the
+ * global zone. Because of this, it is possible for the snapshot count to be
+ * over the limit and snapshots taken in the global zone could cause a
+ * lower-level dataset to hit or exceed its limit. The administrator taking
+ * the global zone recursive snapshot should be aware of this side-effect
+ * and behave accordingly. For consistency, the filesystem limit is also not
+ * enforced if the user can modify the limit.
+ *
+ * The filesystem and snapshot limits are validated by dsl_fs_ss_limit_check()
+ * and updated by dsl_fs_ss_count_adjust(). A new limit value is set up in
+ * dsl_dir_activate_fs_ss_limit() and the counts are adjusted, if necessary, by
+ * dsl_dir_init_fs_ss_count().
+ *
+ * There is a special case when we receive a filesystem that already exists.
+ * In this case a temporary clone name of %X is created (see dmu_recv_begin).
+ * We never update the filesystem counts for temporary clones.
+ *
+ * Likewise, we do not update the snapshot counts for temporary snapshots,
+ * such as those created by zfs diff.
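+ *
+ * As a concrete sketch of enforcement: with filesystem_limit=2 in effect on
+ * pool/a and two filesystems already beneath it, a third create fails with
+ * EDQUOT unless the requesting user has permission to change the limit (see
+ * dsl_enforce_ds_ss_limits() below).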
+ */ + +extern inline dsl_dir_phys_t *dsl_dir_phys(dsl_dir_t *dd); + +static uint64_t dsl_dir_space_towrite(dsl_dir_t *dd); + +typedef struct ddulrt_arg { + dsl_dir_t *ddulrta_dd; + uint64_t ddlrta_txg; +} ddulrt_arg_t; + +static void +dsl_dir_evict_async(void *dbu) +{ + dsl_dir_t *dd = dbu; + int t; + dsl_pool_t *dp __maybe_unused = dd->dd_pool; + + dd->dd_dbuf = NULL; + + for (t = 0; t < TXG_SIZE; t++) { + ASSERT(!txg_list_member(&dp->dp_dirty_dirs, dd, t)); + ASSERT(dd->dd_tempreserved[t] == 0); + ASSERT(dd->dd_space_towrite[t] == 0); + } + + if (dd->dd_parent) + dsl_dir_async_rele(dd->dd_parent, dd); + + spa_async_close(dd->dd_pool->dp_spa, dd); + + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); + + dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); +} + +int +dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj, + const char *tail, void *tag, dsl_dir_t **ddp) +{ + dmu_buf_t *dbuf; + dsl_dir_t *dd; + dmu_object_info_t doi; + int err; + + ASSERT(dsl_pool_config_held(dp)); + + err = dmu_bonus_hold(dp->dp_meta_objset, ddobj, tag, &dbuf); + if (err != 0) + return (err); + dd = dmu_buf_get_user(dbuf); + + dmu_object_info_from_db(dbuf, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_DSL_DIR); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (dsl_dir_phys_t)); + + if (dd == NULL) { + dsl_dir_t *winner; + + dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); + dd->dd_object = ddobj; + dd->dd_dbuf = dbuf; + dd->dd_pool = dp; + + mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&dd->dd_activity_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dd->dd_activity_cv, NULL, CV_DEFAULT, NULL); + dsl_prop_init(dd); + + if (dsl_dir_is_zapified(dd)) { + err = zap_lookup(dp->dp_meta_objset, + ddobj, DD_FIELD_CRYPTO_KEY_OBJ, + sizeof (uint64_t), 1, &dd->dd_crypto_obj); + if (err == 0) { + /* check for on-disk format errata */ + if (dsl_dir_incompatible_encryption_version( + dd)) { + dp->dp_spa->spa_errata = + ZPOOL_ERRATA_ZOL_6845_ENCRYPTION; + } + } else if (err != ENOENT) { + goto errout; + } + } + + dsl_dir_snap_cmtime_update(dd); + + if (dsl_dir_phys(dd)->dd_parent_obj) { + err = dsl_dir_hold_obj(dp, + dsl_dir_phys(dd)->dd_parent_obj, NULL, dd, + &dd->dd_parent); + if (err != 0) + goto errout; + if (tail) { +#ifdef ZFS_DEBUG + uint64_t foundobj; + + err = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, tail, + sizeof (foundobj), 1, &foundobj); + ASSERT(err || foundobj == ddobj); +#endif + (void) strlcpy(dd->dd_myname, tail, + sizeof (dd->dd_myname)); + } else { + err = zap_value_search(dp->dp_meta_objset, + dsl_dir_phys(dd->dd_parent)-> + dd_child_dir_zapobj, + ddobj, 0, dd->dd_myname); + } + if (err != 0) + goto errout; + } else { + (void) strlcpy(dd->dd_myname, spa_name(dp->dp_spa), + sizeof (dd->dd_myname)); + } + + if (dsl_dir_is_clone(dd)) { + dmu_buf_t *origin_bonus; + dsl_dataset_phys_t *origin_phys; + + /* + * We can't open the origin dataset, because + * that would require opening this dsl_dir. + * Just look at its phys directly instead. 
+ */ + err = dmu_bonus_hold(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, + &origin_bonus); + if (err != 0) + goto errout; + origin_phys = origin_bonus->db_data; + dd->dd_origin_txg = + origin_phys->ds_creation_txg; + dmu_buf_rele(origin_bonus, FTAG); + if (dsl_dir_is_zapified(dd)) { + uint64_t obj; + err = zap_lookup(dp->dp_meta_objset, + dd->dd_object, DD_FIELD_LIVELIST, + sizeof (uint64_t), 1, &obj); + if (err == 0) + dsl_dir_livelist_open(dd, obj); + else if (err != ENOENT) + goto errout; + } + } + + dmu_buf_init_user(&dd->dd_dbu, NULL, dsl_dir_evict_async, + &dd->dd_dbuf); + winner = dmu_buf_set_user_ie(dbuf, &dd->dd_dbu); + if (winner != NULL) { + if (dd->dd_parent) + dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); + dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dd = winner; + } else { + spa_open_ref(dp->dp_spa, dd); + } + } + + /* + * The dsl_dir_t has both open-to-close and instantiate-to-evict + * holds on the spa. We need the open-to-close holds because + * otherwise the spa_refcnt wouldn't change when we open a + * dir which the spa also has open, so we could incorrectly + * think it was OK to unload/export/destroy the pool. We need + * the instantiate-to-evict hold because the dsl_dir_t has a + * pointer to the dd_pool, which has a pointer to the spa_t. + */ + spa_open_ref(dp->dp_spa, tag); + ASSERT3P(dd->dd_pool, ==, dp); + ASSERT3U(dd->dd_object, ==, ddobj); + ASSERT3P(dd->dd_dbuf, ==, dbuf); + *ddp = dd; + return (0); + +errout: + if (dd->dd_parent) + dsl_dir_rele(dd->dd_parent, dd); + if (dsl_deadlist_is_open(&dd->dd_livelist)) + dsl_dir_livelist_close(dd); + dsl_prop_fini(dd); + cv_destroy(&dd->dd_activity_cv); + mutex_destroy(&dd->dd_activity_lock); + mutex_destroy(&dd->dd_lock); + kmem_free(dd, sizeof (dsl_dir_t)); + dmu_buf_rele(dbuf, tag); + return (err); +} + +void +dsl_dir_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* + * Remove a reference to the given dsl dir that is being asynchronously + * released. Async releases occur from a taskq performing eviction of + * dsl datasets and dirs. This process is identical to a normal release + * with the exception of using the async API for releasing the reference on + * the spa. 
+ */ +void +dsl_dir_async_rele(dsl_dir_t *dd, void *tag) +{ + dprintf_dd(dd, "%s\n", ""); + spa_async_close(dd->dd_pool->dp_spa, tag); + dmu_buf_rele(dd->dd_dbuf, tag); +} + +/* buf must be at least ZFS_MAX_DATASET_NAME_LEN bytes */ +void +dsl_dir_name(dsl_dir_t *dd, char *buf) +{ + if (dd->dd_parent) { + dsl_dir_name(dd->dd_parent, buf); + VERIFY3U(strlcat(buf, "/", ZFS_MAX_DATASET_NAME_LEN), <, + ZFS_MAX_DATASET_NAME_LEN); + } else { + buf[0] = '\0'; + } + if (!MUTEX_HELD(&dd->dd_lock)) { + /* + * recursive mutex so that we can use + * dprintf_dd() with dd_lock held + */ + mutex_enter(&dd->dd_lock); + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + mutex_exit(&dd->dd_lock); + } else { + VERIFY3U(strlcat(buf, dd->dd_myname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + } +} + +/* Calculate name length, avoiding all the strcat calls of dsl_dir_name */ +int +dsl_dir_namelen(dsl_dir_t *dd) +{ + int result = 0; + + if (dd->dd_parent) { + /* parent's name + 1 for the "/" */ + result = dsl_dir_namelen(dd->dd_parent) + 1; + } + + if (!MUTEX_HELD(&dd->dd_lock)) { + /* see dsl_dir_name */ + mutex_enter(&dd->dd_lock); + result += strlen(dd->dd_myname); + mutex_exit(&dd->dd_lock); + } else { + result += strlen(dd->dd_myname); + } + + return (result); +} + +static int +getcomponent(const char *path, char *component, const char **nextp) +{ + char *p; + + if ((path == NULL) || (path[0] == '\0')) + return (SET_ERROR(ENOENT)); + /* This would be a good place to reserve some namespace... */ + p = strpbrk(path, "/@"); + if (p && (p[1] == '/' || p[1] == '@')) { + /* two separators in a row */ + return (SET_ERROR(EINVAL)); + } + if (p == NULL || p == path) { + /* + * if the first thing is an @ or /, it had better be an + * @ and it had better not have any more ats or slashes, + * and it had better have something after the @. + */ + if (p != NULL && + (p[0] != '@' || strpbrk(path+1, "/@") || p[1] == '\0')) + return (SET_ERROR(EINVAL)); + if (strlen(path) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strlcpy(component, path, ZFS_MAX_DATASET_NAME_LEN); + p = NULL; + } else if (p[0] == '/') { + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strncpy(component, path, p - path); + component[p - path] = '\0'; + p++; + } else if (p[0] == '@') { + /* + * if the next separator is an @, there better not be + * any more slashes. + */ + if (strchr(path, '/')) + return (SET_ERROR(EINVAL)); + if (p - path >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + (void) strncpy(component, path, p - path); + component[p - path] = '\0'; + } else { + panic("invalid p=%p", (void *)p); + } + *nextp = p; + return (0); +} + +/* + * Return the dsl_dir_t, and possibly the last component which couldn't + * be found in *tail. The name must be in the specified dsl_pool_t. This + * thread must hold the dp_config_rwlock for the pool. Returns NULL if the + * path is bogus, or if tail==NULL and we couldn't parse the whole name. + * (*tail)[0] == '@' means that the last component is a snapshot. + */ +int +dsl_dir_hold(dsl_pool_t *dp, const char *name, void *tag, + dsl_dir_t **ddp, const char **tailp) +{ + char *buf; + const char *spaname, *next, *nextnext = NULL; + int err; + dsl_dir_t *dd; + uint64_t ddobj; + + buf = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + err = getcomponent(name, buf, &next); + if (err != 0) + goto error; + + /* Make sure the name is in the specified pool. 
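+	 * (For example, asking for "tank/foo" while dp belongs to the pool
+	 * "dozer" fails with EXDEV.)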
*/ + spaname = spa_name(dp->dp_spa); + if (strcmp(buf, spaname) != 0) { + err = SET_ERROR(EXDEV); + goto error; + } + + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, NULL, tag, &dd); + if (err != 0) { + goto error; + } + + while (next != NULL) { + dsl_dir_t *child_dd; + err = getcomponent(next, buf, &nextnext); + if (err != 0) + break; + ASSERT(next[0] != '\0'); + if (next[0] == '@') + break; + dprintf("looking up %s in obj%lld\n", + buf, dsl_dir_phys(dd)->dd_child_dir_zapobj); + + err = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(dd)->dd_child_dir_zapobj, + buf, sizeof (ddobj), 1, &ddobj); + if (err != 0) { + if (err == ENOENT) + err = 0; + break; + } + + err = dsl_dir_hold_obj(dp, ddobj, buf, tag, &child_dd); + if (err != 0) + break; + dsl_dir_rele(dd, tag); + dd = child_dd; + next = nextnext; + } + + if (err != 0) { + dsl_dir_rele(dd, tag); + goto error; + } + + /* + * It's an error if there's more than one component left, or + * tailp==NULL and there's any component left. + */ + if (next != NULL && + (tailp == NULL || (nextnext && nextnext[0] != '\0'))) { + /* bad path name */ + dsl_dir_rele(dd, tag); + dprintf("next=%p (%s) tail=%p\n", next, next?next:"", tailp); + err = SET_ERROR(ENOENT); + } + if (tailp != NULL) + *tailp = next; + if (err == 0) + *ddp = dd; +error: + kmem_free(buf, ZFS_MAX_DATASET_NAME_LEN); + return (err); +} + +/* + * If the counts are already initialized for this filesystem and its + * descendants then do nothing, otherwise initialize the counts. + * + * The counts on this filesystem, and those below, may be uninitialized due to + * either the use of a pre-existing pool which did not support the + * filesystem/snapshot limit feature, or one in which the feature had not yet + * been enabled. + * + * Recursively descend the filesystem tree and update the filesystem/snapshot + * counts on each filesystem below, then update the cumulative count on the + * current filesystem. If the filesystem already has a count set on it, + * then we know that its counts, and the counts on the filesystems below it, + * are already correct, so we don't have to update this filesystem. + */ +static void +dsl_dir_init_fs_ss_count(dsl_dir_t *dd, dmu_tx_t *tx) +{ + uint64_t my_fs_cnt = 0; + uint64_t my_ss_cnt = 0; + dsl_pool_t *dp = dd->dd_pool; + objset_t *os = dp->dp_meta_objset; + zap_cursor_t *zc; + zap_attribute_t *za; + dsl_dataset_t *ds; + + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)); + ASSERT(dsl_pool_config_held(dp)); + ASSERT(dmu_tx_is_syncing(tx)); + + dsl_dir_zapify(dd, tx); + + /* + * If the filesystem count has already been initialized then we + * don't need to recurse down any further. + */ + if (zap_contains(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + + /* Iterate my child dirs */ + for (zap_cursor_init(zc, os, dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(zc, za) == 0; zap_cursor_advance(zc)) { + dsl_dir_t *chld_dd; + uint64_t count; + + VERIFY0(dsl_dir_hold_obj(dp, za->za_first_integer, NULL, FTAG, + &chld_dd)); + + /* + * Ignore hidden ($FREE, $MOS & $ORIGIN) objsets and + * temporary datasets. 
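+		 * (Hidden objsets begin with '$' and temporary ones with
+		 * '%'; both are skipped by the name check below.)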
+ */ + if (chld_dd->dd_myname[0] == '$' || + chld_dd->dd_myname[0] == '%') { + dsl_dir_rele(chld_dd, FTAG); + continue; + } + + my_fs_cnt++; /* count this child */ + + dsl_dir_init_fs_ss_count(chld_dd, tx); + + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (count), 1, &count)); + my_fs_cnt += count; + VERIFY0(zap_lookup(os, chld_dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (count), 1, &count)); + my_ss_cnt += count; + + dsl_dir_rele(chld_dd, FTAG); + } + zap_cursor_fini(zc); + /* Count my snapshots (we counted children's snapshots above) */ + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds)); + + for (zap_cursor_init(zc, os, dsl_dataset_phys(ds)->ds_snapnames_zapobj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + /* Don't count temporary snapshots */ + if (za->za_name[0] != '%') + my_ss_cnt++; + } + zap_cursor_fini(zc); + + dsl_dataset_rele(ds, FTAG); + + kmem_free(zc, sizeof (zap_cursor_t)); + kmem_free(za, sizeof (zap_attribute_t)); + + /* we're in a sync task, update counts */ + dmu_buf_will_dirty(dd->dd_dbuf, tx); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (my_fs_cnt), 1, &my_fs_cnt, tx)); + VERIFY0(zap_add(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (my_ss_cnt), 1, &my_ss_cnt, tx)); +} + +static int +dsl_dir_actv_fs_ss_limit_check(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + int error; + + error = dsl_dataset_hold(dp, ddname, FTAG, &ds); + if (error != 0) + return (error); + + if (!spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + dd = ds->ds_dir; + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_FS_SS_LIMIT) && + dsl_dir_is_zapified(dd) && + zap_contains(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT) == 0) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(EALREADY)); + } + + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_dir_actv_fs_ss_limit_sync(void *arg, dmu_tx_t *tx) +{ + char *ddname = (char *)arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + spa_t *spa; + + VERIFY0(dsl_dataset_hold(dp, ddname, FTAG, &ds)); + + spa = dsl_dataset_get_spa(ds); + + if (!spa_feature_is_active(spa, SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Since the feature was not active and we're now setting a + * limit, increment the feature-active counter so that the + * feature becomes active for the first time. + * + * We are already in a sync task so we can update the MOS. + */ + spa_feature_incr(spa, SPA_FEATURE_FS_SS_LIMIT, tx); + } + + /* + * Since we are now setting a non-UINT64_MAX limit on the filesystem, + * we need to ensure the counts are correct. Descend down the tree from + * this point and update all of the counts to be accurate. + */ + dsl_dir_init_fs_ss_count(ds->ds_dir, tx); + + dsl_dataset_rele(ds, FTAG); +} + +/* + * Make sure the feature is enabled and activate it if necessary. + * Since we're setting a limit, ensure the on-disk counts are valid. + * This is only called by the ioctl path when setting a limit value. + * + * We do not need to validate the new limit, since users who can change the + * limit are also allowed to exceed the limit. 
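+ *
+ * For example (sketch, error handling elided), the property-set path does
+ * roughly:
+ *
+ *	if (dsl_dir_activate_fs_ss_limit("pool/fs") == 0)
+ *		... go on to store the new limit value itself ...
+ *
+ * EALREADY from the check callback is mapped to success below, since it only
+ * means the counts were already initialized.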
+ */ +int +dsl_dir_activate_fs_ss_limit(const char *ddname) +{ + int error; + + error = dsl_sync_task(ddname, dsl_dir_actv_fs_ss_limit_check, + dsl_dir_actv_fs_ss_limit_sync, (void *)ddname, 0, + ZFS_SPACE_CHECK_RESERVED); + + if (error == EALREADY) + error = 0; + + return (error); +} + +/* + * Used to determine if the filesystem_limit or snapshot_limit should be + * enforced. We allow the limit to be exceeded if the user has permission to + * write the property value. We pass in the creds that we got in the open + * context since we will always be the GZ root in syncing context. We also have + * to handle the case where we are allowed to change the limit on the current + * dataset, but there may be another limit in the tree above. + * + * We can never modify these two properties within a non-global zone. In + * addition, the other checks are modeled on zfs_secpolicy_write_perms. We + * can't use that function since we are already holding the dp_config_rwlock. + * In addition, we already have the dd and dealing with snapshots is simplified + * in this code. + */ + +typedef enum { + ENFORCE_ALWAYS, + ENFORCE_NEVER, + ENFORCE_ABOVE +} enforce_res_t; + +static enforce_res_t +dsl_enforce_ds_ss_limits(dsl_dir_t *dd, zfs_prop_t prop, + cred_t *cr, proc_t *proc) +{ + enforce_res_t enforce = ENFORCE_ALWAYS; + uint64_t obj; + dsl_dataset_t *ds; + uint64_t zoned; + const char *zonedstr; + + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + +#ifdef _KERNEL + if (crgetzoneid(cr) != GLOBAL_ZONEID) + return (ENFORCE_ALWAYS); + + /* + * We are checking the saved credentials of the user process, which is + * not the current process. Note that we can't use secpolicy_zfs(), + * because it only works if the cred is that of the current process (on + * Linux). + */ + if (secpolicy_zfs_proc(cr, proc) == 0) + return (ENFORCE_NEVER); +#endif + + if ((obj = dsl_dir_phys(dd)->dd_head_dataset_obj) == 0) + return (ENFORCE_ALWAYS); + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (dsl_dataset_hold_obj(dd->dd_pool, obj, FTAG, &ds) != 0) + return (ENFORCE_ALWAYS); + + zonedstr = zfs_prop_to_name(ZFS_PROP_ZONED); + if (dsl_prop_get_ds(ds, zonedstr, 8, 1, &zoned, NULL) || zoned) { + /* Only root can access zoned fs's from the GZ */ + enforce = ENFORCE_ALWAYS; + } else { + if (dsl_deleg_access_impl(ds, zfs_prop_to_name(prop), cr) == 0) + enforce = ENFORCE_ABOVE; + } + + dsl_dataset_rele(ds, FTAG); + return (enforce); +} + +/* + * Check if adding additional child filesystem(s) would exceed any filesystem + * limits or adding additional snapshot(s) would exceed any snapshot limits. + * The prop argument indicates which limit to check. + * + * Note that all filesystem limits up to the root (or the highest + * initialized) filesystem or the given ancestor must be satisfied. + */ +int +dsl_fs_ss_limit_check(dsl_dir_t *dd, uint64_t delta, zfs_prop_t prop, + dsl_dir_t *ancestor, cred_t *cr, proc_t *proc) +{ + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t limit, count; + char *count_prop; + enforce_res_t enforce; + int err = 0; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(prop == ZFS_PROP_FILESYSTEM_LIMIT || + prop == ZFS_PROP_SNAPSHOT_LIMIT); + + /* + * If we're allowed to change the limit, don't enforce the limit + * e.g. this can happen if a snapshot is taken by an administrative + * user in the global zone (i.e. a recursive snapshot by root). 
+ * However, we must handle the case of delegated permissions where we + * are allowed to change the limit on the current dataset, but there + * is another limit in the tree above. + */ + enforce = dsl_enforce_ds_ss_limits(dd, prop, cr, proc); + if (enforce == ENFORCE_NEVER) + return (0); + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment + * is 0. + */ + if (delta == 0) + return (0); + + if (prop == ZFS_PROP_SNAPSHOT_LIMIT) { + /* + * We don't enforce the limit for temporary snapshots. This is + * indicated by a NULL cred_t argument. + */ + if (cr == NULL) + return (0); + + count_prop = DD_FIELD_SNAPSHOT_COUNT; + } else { + count_prop = DD_FIELD_FILESYSTEM_COUNT; + } + + /* + * If an ancestor has been provided, stop checking the limit once we + * hit that dir. We need this during rename so that we don't overcount + * the check once we recurse up to the common ancestor. + */ + if (ancestor == dd) + return (0); + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know there is no limit here (or above). The counts are + * not valid on this node and we know we won't touch this node's counts. + */ + if (!dsl_dir_is_zapified(dd)) + return (0); + err = zap_lookup(os, dd->dd_object, + count_prop, sizeof (count), 1, &count); + if (err == ENOENT) + return (0); + if (err != 0) + return (err); + + err = dsl_prop_get_dd(dd, zfs_prop_to_name(prop), 8, 1, &limit, NULL, + B_FALSE); + if (err != 0) + return (err); + + /* Is there a limit which we've hit? */ + if (enforce == ENFORCE_ALWAYS && (count + delta) > limit) + return (SET_ERROR(EDQUOT)); + + if (dd->dd_parent != NULL) + err = dsl_fs_ss_limit_check(dd->dd_parent, delta, prop, + ancestor, cr, proc); + + return (err); +} + +/* + * Adjust the filesystem or snapshot count for the specified dsl_dir_t and all + * parents. When a new filesystem/snapshot is created, increment the count on + * all parents, and when a filesystem/snapshot is destroyed, decrement the + * count. + */ +void +dsl_fs_ss_count_adjust(dsl_dir_t *dd, int64_t delta, const char *prop, + dmu_tx_t *tx) +{ + int err; + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t count; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0 || + strcmp(prop, DD_FIELD_SNAPSHOT_COUNT) == 0); + + /* + * When we receive an incremental stream into a filesystem that already + * exists, a temporary clone is created. We don't count this temporary + * clone, whose name begins with a '%'. We also ignore hidden ($FREE, + * $MOS & $ORIGIN) objsets. + */ + if ((dd->dd_myname[0] == '%' || dd->dd_myname[0] == '$') && + strcmp(prop, DD_FIELD_FILESYSTEM_COUNT) == 0) + return; + + /* + * e.g. if renaming a dataset with no snapshots, count adjustment is 0 + */ + if (delta == 0) + return; + + /* + * If we hit an uninitialized node while recursing up the tree, we can + * stop since we know the counts are not valid on this node and we + * know we shouldn't touch this node's counts. An uninitialized count + * on the node indicates that either the feature has not yet been + * activated or there are no limits on this part of the tree. + */ + if (!dsl_dir_is_zapified(dd) || (err = zap_lookup(os, dd->dd_object, + prop, sizeof (count), 1, &count)) == ENOENT) + return; + VERIFY0(err); + + count += delta; + /* Use a signed verify to make sure we're not neg. 
*/ + VERIFY3S(count, >=, 0); + + VERIFY0(zap_update(os, dd->dd_object, prop, sizeof (count), 1, &count, + tx)); + + /* Roll up this additional count into our ancestors */ + if (dd->dd_parent != NULL) + dsl_fs_ss_count_adjust(dd->dd_parent, delta, prop, tx); +} + +uint64_t +dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, + dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t ddobj; + dsl_dir_phys_t *ddphys; + dmu_buf_t *dbuf; + + ddobj = dmu_object_alloc(mos, DMU_OT_DSL_DIR, 0, + DMU_OT_DSL_DIR, sizeof (dsl_dir_phys_t), tx); + if (pds) { + VERIFY0(zap_add(mos, dsl_dir_phys(pds)->dd_child_dir_zapobj, + name, sizeof (uint64_t), 1, &ddobj, tx)); + } else { + /* it's the root dir */ + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, &ddobj, tx)); + } + VERIFY0(dmu_bonus_hold(mos, ddobj, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + ddphys = dbuf->db_data; + + ddphys->dd_creation_time = gethrestime_sec(); + if (pds) { + ddphys->dd_parent_obj = pds->dd_object; + + /* update the filesystem counts */ + dsl_fs_ss_count_adjust(pds, 1, DD_FIELD_FILESYSTEM_COUNT, tx); + } + ddphys->dd_props_zapobj = zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + ddphys->dd_child_dir_zapobj = zap_create(mos, + DMU_OT_DSL_DIR_CHILD_MAP, DMU_OT_NONE, 0, tx); + if (spa_version(dp->dp_spa) >= SPA_VERSION_USED_BREAKDOWN) + ddphys->dd_flags |= DD_FLAG_USED_BREAKDOWN; + + dmu_buf_rele(dbuf, FTAG); + + return (ddobj); +} + +boolean_t +dsl_dir_is_clone(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_origin_obj && + (dd->dd_pool->dp_origin_snap == NULL || + dsl_dir_phys(dd)->dd_origin_obj != + dd->dd_pool->dp_origin_snap->ds_object)); +} + +uint64_t +dsl_dir_get_used(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_bytes); +} + +uint64_t +dsl_dir_get_compressed(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_compressed_bytes); +} + +uint64_t +dsl_dir_get_quota(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_quota); +} + +uint64_t +dsl_dir_get_reservation(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_reserved); +} + +uint64_t +dsl_dir_get_compressratio(dsl_dir_t *dd) +{ + /* a fixed point number, 100x the ratio */ + return (dsl_dir_phys(dd)->dd_compressed_bytes == 0 ? 
100 : + (dsl_dir_phys(dd)->dd_uncompressed_bytes * 100 / + dsl_dir_phys(dd)->dd_compressed_bytes)); +} + +uint64_t +dsl_dir_get_logicalused(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_uncompressed_bytes); +} + +uint64_t +dsl_dir_get_usedsnap(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_SNAP]); +} + +uint64_t +dsl_dir_get_usedds(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_HEAD]); +} + +uint64_t +dsl_dir_get_usedrefreserv(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_REFRSRV]); +} + +uint64_t +dsl_dir_get_usedchild(dsl_dir_t *dd) +{ + return (dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD] + + dsl_dir_phys(dd)->dd_used_breakdown[DD_USED_CHILD_RSRV]); +} + +void +dsl_dir_get_origin(dsl_dir_t *dd, char *buf) +{ + dsl_dataset_t *ds; + VERIFY0(dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &ds)); + + dsl_dataset_name(ds, buf); + + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_get_filesystem_count(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_FILESYSTEM_COUNT, + sizeof (*count), 1, count)); + } else { + return (SET_ERROR(ENOENT)); + } +} + +int +dsl_dir_get_snapshot_count(dsl_dir_t *dd, uint64_t *count) +{ + if (dsl_dir_is_zapified(dd)) { + objset_t *os = dd->dd_pool->dp_meta_objset; + return (zap_lookup(os, dd->dd_object, DD_FIELD_SNAPSHOT_COUNT, + sizeof (*count), 1, count)); + } else { + return (SET_ERROR(ENOENT)); + } +} + +void +dsl_dir_stats(dsl_dir_t *dd, nvlist_t *nv) +{ + mutex_enter(&dd->dd_lock); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_QUOTA, + dsl_dir_get_quota(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RESERVATION, + dsl_dir_get_reservation(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_LOGICALUSED, + dsl_dir_get_logicalused(dd)); + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDSNAP, + dsl_dir_get_usedsnap(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDDS, + dsl_dir_get_usedds(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDREFRESERV, + dsl_dir_get_usedrefreserv(dd)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USEDCHILD, + dsl_dir_get_usedchild(dd)); + } + mutex_exit(&dd->dd_lock); + + uint64_t count; + if (dsl_dir_get_filesystem_count(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_FILESYSTEM_COUNT, + count); + } + if (dsl_dir_get_snapshot_count(dd, &count) == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_SNAPSHOT_COUNT, + count); + } + + if (dsl_dir_is_clone(dd)) { + char buf[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dir_get_origin(dd, buf); + dsl_prop_nvlist_add_string(nv, ZFS_PROP_ORIGIN, buf); + } + +} + +void +dsl_dir_dirty(dsl_dir_t *dd, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dd->dd_pool; + + ASSERT(dsl_dir_phys(dd)); + + if (txg_list_add(&dp->dp_dirty_dirs, dd, tx->tx_txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(dd->dd_dbuf, dd); + } +} + +static int64_t +parent_delta(dsl_dir_t *dd, uint64_t used, int64_t delta) +{ + uint64_t old_accounted = MAX(used, dsl_dir_phys(dd)->dd_reserved); + uint64_t new_accounted = + MAX(used + delta, dsl_dir_phys(dd)->dd_reserved); + return (new_accounted - old_accounted); +} + +void +dsl_dir_sync(dsl_dir_t *dd, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + mutex_enter(&dd->dd_lock); + ASSERT0(dd->dd_tempreserved[tx->tx_txg & TXG_MASK]); + dprintf_dd(dd, "txg=%llu towrite=%lluK\n", tx->tx_txg, + 
dd->dd_space_towrite[tx->tx_txg & TXG_MASK] / 1024); + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] = 0; + mutex_exit(&dd->dd_lock); + + /* release the hold from dsl_dir_dirty */ + dmu_buf_rele(dd->dd_dbuf, dd); +} + +static uint64_t +dsl_dir_space_towrite(dsl_dir_t *dd) +{ + uint64_t space = 0; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + for (int i = 0; i < TXG_SIZE; i++) { + space += dd->dd_space_towrite[i & TXG_MASK]; + ASSERT3U(dd->dd_space_towrite[i & TXG_MASK], >=, 0); + } + return (space); +} + +/* + * How much space would dd have available if ancestor had delta applied + * to it? If ondiskonly is set, we're only interested in what's + * on-disk, not estimated pending changes. + */ +uint64_t +dsl_dir_space_available(dsl_dir_t *dd, + dsl_dir_t *ancestor, int64_t delta, int ondiskonly) +{ + uint64_t parentspace, myspace, quota, used; + + /* + * If there are no restrictions otherwise, assume we have + * unlimited space available. + */ + quota = UINT64_MAX; + parentspace = UINT64_MAX; + + if (dd->dd_parent != NULL) { + parentspace = dsl_dir_space_available(dd->dd_parent, + ancestor, delta, ondiskonly); + } + + mutex_enter(&dd->dd_lock); + if (dsl_dir_phys(dd)->dd_quota != 0) + quota = dsl_dir_phys(dd)->dd_quota; + used = dsl_dir_phys(dd)->dd_used_bytes; + if (!ondiskonly) + used += dsl_dir_space_towrite(dd); + + if (dd->dd_parent == NULL) { + uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL); + quota = MIN(quota, poolsize); + } + + if (dsl_dir_phys(dd)->dd_reserved > used && parentspace != UINT64_MAX) { + /* + * We have some space reserved, in addition to what our + * parent gave us. + */ + parentspace += dsl_dir_phys(dd)->dd_reserved - used; + } + + if (dd == ancestor) { + ASSERT(delta <= 0); + ASSERT(used >= -delta); + used += delta; + if (parentspace != UINT64_MAX) + parentspace -= delta; + } + + if (used > quota) { + /* over quota */ + myspace = 0; + } else { + /* + * the lesser of the space provided by our parent and + * the space left in our quota + */ + myspace = MIN(parentspace, quota - used); + } + + mutex_exit(&dd->dd_lock); + + return (myspace); +} + +struct tempreserve { + list_node_t tr_node; + dsl_dir_t *tr_ds; + uint64_t tr_size; +}; + +static int +dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, + boolean_t ignorequota, list_t *tr_list, + dmu_tx_t *tx, boolean_t first) +{ + uint64_t txg; + uint64_t quota; + struct tempreserve *tr; + int retval; + uint64_t ref_rsrv; + +top_of_function: + txg = tx->tx_txg; + retval = EDQUOT; + ref_rsrv = 0; + + ASSERT3U(txg, !=, 0); + ASSERT3S(asize, >, 0); + + mutex_enter(&dd->dd_lock); + + /* + * Check against the dsl_dir's quota. We don't add in the delta + * when checking for over-quota because they get one free hit. + */ + uint64_t est_inflight = dsl_dir_space_towrite(dd); + for (int i = 0; i < TXG_SIZE; i++) + est_inflight += dd->dd_tempreserved[i]; + uint64_t used_on_disk = dsl_dir_phys(dd)->dd_used_bytes; + + /* + * On the first iteration, fetch the dataset's used-on-disk and + * refreservation values. Also, if checkrefquota is set, test if + * allocating this space would exceed the dataset's refquota. 
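+	 * (In this code the refquota test is requested as !netfree: a
+	 * transaction that is a net free of space skips it.)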
+ */ + if (first && tx->tx_objset) { + int error; + dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; + + error = dsl_dataset_check_quota(ds, !netfree, + asize, est_inflight, &used_on_disk, &ref_rsrv); + if (error != 0) { + mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); + return (error); + } + } + + /* + * If this transaction will result in a net free of space, + * we want to let it through. + */ + if (ignorequota || netfree || dsl_dir_phys(dd)->dd_quota == 0) + quota = UINT64_MAX; + else + quota = dsl_dir_phys(dd)->dd_quota; + + /* + * Adjust the quota against the actual pool size at the root + * minus any outstanding deferred frees. + * To ensure that it's possible to remove files from a full + * pool without inducing transient overcommits, we throttle + * netfree transactions against a quota that is slightly larger, + * but still within the pool's allocation slop. In cases where + * we're very close to full, this will allow a steady trickle of + * removes to get through. + */ + uint64_t deferred = 0; + if (dd->dd_parent == NULL) { + uint64_t avail = dsl_pool_unreserved_space(dd->dd_pool, + (netfree) ? + ZFS_SPACE_CHECK_RESERVED : ZFS_SPACE_CHECK_NORMAL); + + if (avail < quota) { + quota = avail; + retval = SET_ERROR(ENOSPC); + } + } + + /* + * If they are requesting more space, and our current estimate + * is over quota, they get to try again unless the actual + * on-disk is over quota and there are no pending changes (which + * may free up space for us). + */ + if (used_on_disk + est_inflight >= quota) { + if (est_inflight > 0 || used_on_disk < quota || + (retval == ENOSPC && used_on_disk < quota + deferred)) + retval = ERESTART; + dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " + "quota=%lluK tr=%lluK err=%d\n", + used_on_disk>>10, est_inflight>>10, + quota>>10, asize>>10, retval); + mutex_exit(&dd->dd_lock); + DMU_TX_STAT_BUMP(dmu_tx_quota); + return (SET_ERROR(retval)); + } + + /* We need to up our estimated delta before dropping dd_lock */ + dd->dd_tempreserved[txg & TXG_MASK] += asize; + + uint64_t parent_rsrv = parent_delta(dd, used_on_disk + est_inflight, + asize - ref_rsrv); + mutex_exit(&dd->dd_lock); + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_ds = dd; + tr->tr_size = asize; + list_insert_tail(tr_list, tr); + + /* see if it's OK with our parent */ + if (dd->dd_parent != NULL && parent_rsrv != 0) { + /* + * Recurse on our parent without recursion. This has been + * observed to be potentially large stack usage even within + * the test suite. Largest seen stack was 7632 bytes on linux. + */ + + dd = dd->dd_parent; + asize = parent_rsrv; + ignorequota = (dsl_dir_phys(dd)->dd_head_dataset_obj == 0); + first = B_FALSE; + goto top_of_function; + + } else { + return (0); + } +} + +/* + * Reserve space in this dsl_dir, to be used in this tx's txg. + * After the space has been dirtied (and dsl_dir_willuse_space() + * has been called), the reservation should be canceled, using + * dsl_dir_tempreserve_clear(). 
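+ *
+ * A typical caller pattern (sketch; error handling elided):
+ *
+ *	void *cookie;
+ *	if (dsl_dir_tempreserve_space(dd, lsize, asize, netfree,
+ *	    &cookie, tx) == 0) {
+ *		... dirty the data ...
+ *		dsl_dir_tempreserve_clear(cookie, tx);
+ *	}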
+ */ +int +dsl_dir_tempreserve_space(dsl_dir_t *dd, uint64_t lsize, uint64_t asize, + boolean_t netfree, void **tr_cookiep, dmu_tx_t *tx) +{ + int err; + list_t *tr_list; + + if (asize == 0) { + *tr_cookiep = NULL; + return (0); + } + + tr_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(tr_list, sizeof (struct tempreserve), + offsetof(struct tempreserve, tr_node)); + ASSERT3S(asize, >, 0); + + err = arc_tempreserve_space(dd->dd_pool->dp_spa, lsize, tx->tx_txg); + if (err == 0) { + struct tempreserve *tr; + + tr = kmem_zalloc(sizeof (struct tempreserve), KM_SLEEP); + tr->tr_size = lsize; + list_insert_tail(tr_list, tr); + } else { + if (err == EAGAIN) { + /* + * If arc_memory_throttle() detected that pageout + * is running and we are low on memory, we delay new + * non-pageout transactions to give pageout an + * advantage. + * + * It is unfortunate to be delaying while the caller's + * locks are held. + */ + txg_delay(dd->dd_pool, tx->tx_txg, + MSEC2NSEC(10), MSEC2NSEC(10)); + err = SET_ERROR(ERESTART); + } + } + + if (err == 0) { + err = dsl_dir_tempreserve_impl(dd, asize, netfree, + B_FALSE, tr_list, tx, B_TRUE); + } + + if (err != 0) + dsl_dir_tempreserve_clear(tr_list, tx); + else + *tr_cookiep = tr_list; + + return (err); +} + +/* + * Clear a temporary reservation that we previously made with + * dsl_dir_tempreserve_space(). + */ +void +dsl_dir_tempreserve_clear(void *tr_cookie, dmu_tx_t *tx) +{ + int txgidx = tx->tx_txg & TXG_MASK; + list_t *tr_list = tr_cookie; + struct tempreserve *tr; + + ASSERT3U(tx->tx_txg, !=, 0); + + if (tr_cookie == NULL) + return; + + while ((tr = list_head(tr_list)) != NULL) { + if (tr->tr_ds) { + mutex_enter(&tr->tr_ds->dd_lock); + ASSERT3U(tr->tr_ds->dd_tempreserved[txgidx], >=, + tr->tr_size); + tr->tr_ds->dd_tempreserved[txgidx] -= tr->tr_size; + mutex_exit(&tr->tr_ds->dd_lock); + } else { + arc_tempreserve_clear(tr->tr_size); + } + list_remove(tr_list, tr); + kmem_free(tr, sizeof (struct tempreserve)); + } + + kmem_free(tr_list, sizeof (list_t)); +} + +/* + * This should be called from open context when we think we're going to write + * or free space, for example when dirtying data. Be conservative; it's okay + * to write less space or free more, but we don't want to write more or free + * less than the amount specified. + * + * NOTE: The behavior of this function is identical to the Illumos / FreeBSD + * version however it has been adjusted to use an iterative rather than + * recursive algorithm to minimize stack usage. 
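+ *
+ * For example, dirtying an 8K block in open context might be announced as
+ * dsl_dir_willuse_space(dd, 8192, tx); the charge is then rolled up through
+ * parent_delta() to every ancestor in the loop below.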
+ */ +void +dsl_dir_willuse_space(dsl_dir_t *dd, int64_t space, dmu_tx_t *tx) +{ + int64_t parent_space; + uint64_t est_used; + + do { + mutex_enter(&dd->dd_lock); + if (space > 0) + dd->dd_space_towrite[tx->tx_txg & TXG_MASK] += space; + + est_used = dsl_dir_space_towrite(dd) + + dsl_dir_phys(dd)->dd_used_bytes; + parent_space = parent_delta(dd, est_used, space); + mutex_exit(&dd->dd_lock); + + /* Make sure that we clean up dd_space_to* */ + dsl_dir_dirty(dd, tx); + + dd = dd->dd_parent; + space = parent_space; + } while (space && dd); +} + +/* call from syncing context when we actually write/free space for this dd */ +void +dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, + int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx) +{ + int64_t accounted_delta; + + /* + * dsl_dataset_set_refreservation_sync_impl() calls this with + * dd_lock held, so that it can atomically update + * ds->ds_reserved and the dsl_dir accounting, so that + * dsl_dataset_check_quota() can see dataset and dir accounting + * consistently. + */ + boolean_t needlock = !MUTEX_HELD(&dd->dd_lock); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(type < DD_USED_NUM); + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + if (needlock) + mutex_enter(&dd->dd_lock); + accounted_delta = + parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, used); + ASSERT(used >= 0 || dsl_dir_phys(dd)->dd_used_bytes >= -used); + ASSERT(compressed >= 0 || + dsl_dir_phys(dd)->dd_compressed_bytes >= -compressed); + ASSERT(uncompressed >= 0 || + dsl_dir_phys(dd)->dd_uncompressed_bytes >= -uncompressed); + dsl_dir_phys(dd)->dd_used_bytes += used; + dsl_dir_phys(dd)->dd_uncompressed_bytes += uncompressed; + dsl_dir_phys(dd)->dd_compressed_bytes += compressed; + + if (dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN) { + ASSERT(used > 0 || + dsl_dir_phys(dd)->dd_used_breakdown[type] >= -used); + dsl_dir_phys(dd)->dd_used_breakdown[type] += used; +#ifdef ZFS_DEBUG + { + dd_used_t t; + uint64_t u = 0; + for (t = 0; t < DD_USED_NUM; t++) + u += dsl_dir_phys(dd)->dd_used_breakdown[t]; + ASSERT3U(u, ==, dsl_dir_phys(dd)->dd_used_bytes); + } +#endif + } + if (needlock) + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent != NULL) { + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + accounted_delta, compressed, uncompressed, tx); + dsl_dir_transfer_space(dd->dd_parent, + used - accounted_delta, + DD_USED_CHILD_RSRV, DD_USED_CHILD, tx); + } +} + +void +dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, + dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(oldtype < DD_USED_NUM); + ASSERT(newtype < DD_USED_NUM); + + if (delta == 0 || + !(dsl_dir_phys(dd)->dd_flags & DD_FLAG_USED_BREAKDOWN)) + return; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + mutex_enter(&dd->dd_lock); + ASSERT(delta > 0 ? 
+ dsl_dir_phys(dd)->dd_used_breakdown[oldtype] >= delta : + dsl_dir_phys(dd)->dd_used_breakdown[newtype] >= -delta); + ASSERT(dsl_dir_phys(dd)->dd_used_bytes >= ABS(delta)); + dsl_dir_phys(dd)->dd_used_breakdown[oldtype] -= delta; + dsl_dir_phys(dd)->dd_used_breakdown[newtype] += delta; + mutex_exit(&dd->dd_lock); +} + +typedef struct dsl_dir_set_qr_arg { + const char *ddsqra_name; + zprop_source_t ddsqra_source; + uint64_t ddsqra_value; +} dsl_dir_set_qr_arg_t; + +static int +dsl_dir_set_quota_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + int error; + uint64_t towrite, newval; + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + + error = dsl_prop_predict(ds->ds_dir, "quota", + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + if (newval == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + mutex_enter(&ds->ds_dir->dd_lock); + /* + * If we are doing the preliminary check in open context, and + * there are pending changes, then don't fail it, since the + * pending changes could under-estimate the amount of space to be + * freed up. + */ + towrite = dsl_dir_space_towrite(ds->ds_dir); + if ((dmu_tx_is_syncing(tx) || towrite == 0) && + (newval < dsl_dir_phys(ds->ds_dir)->dd_reserved || + newval < dsl_dir_phys(ds->ds_dir)->dd_used_bytes + towrite)) { + error = SET_ERROR(ENOSPC); + } + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); + return (error); +} + +static void +dsl_dir_set_quota_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_QUOTA), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_QUOTA), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_QUOTA), (longlong_t)newval); + } + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + mutex_enter(&ds->ds_dir->dd_lock); + dsl_dir_phys(ds->ds_dir)->dd_quota = newval; + mutex_exit(&ds->ds_dir->dd_lock); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) +{ + dsl_dir_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = quota; + + return (dsl_sync_task(ddname, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +static int +dsl_dir_set_reservation_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + dsl_dir_t *dd; + uint64_t newval, used, avail; + int error; + + error = dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds); + if (error != 0) + return (error); + dd = ds->ds_dir; + + /* + * If we are doing the preliminary check in open context, the + * space estimates may be inaccurate. 
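+ * + * (For instance, a destroy pending in this txg may be about to free + * space that dd_used_bytes does not yet reflect, so a reservation that + * looks unsatisfiable here might actually be fine; the authoritative + * check runs again in syncing context.)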
+ */ + if (!dmu_tx_is_syncing(tx)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + error = dsl_prop_predict(ds->ds_dir, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, ddsqra->ddsqra_value, &newval); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + return (error); + } + + mutex_enter(&dd->dd_lock); + used = dsl_dir_phys(dd)->dd_used_bytes; + mutex_exit(&dd->dd_lock); + + if (dd->dd_parent) { + avail = dsl_dir_space_available(dd->dd_parent, + NULL, 0, FALSE); + } else { + avail = dsl_pool_adjustedsize(dd->dd_pool, + ZFS_SPACE_CHECK_NORMAL) - used; + } + + if (MAX(used, newval) > MAX(used, dsl_dir_phys(dd)->dd_reserved)) { + uint64_t delta = MAX(used, newval) - + MAX(used, dsl_dir_phys(dd)->dd_reserved); + + if (delta > avail || + (dsl_dir_phys(dd)->dd_quota > 0 && + newval > dsl_dir_phys(dd)->dd_quota)) + error = SET_ERROR(ENOSPC); + } + + dsl_dataset_rele(ds, FTAG); + return (error); +} + +void +dsl_dir_set_reservation_sync_impl(dsl_dir_t *dd, uint64_t value, dmu_tx_t *tx) +{ + uint64_t used; + int64_t delta; + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + mutex_enter(&dd->dd_lock); + used = dsl_dir_phys(dd)->dd_used_bytes; + delta = MAX(used, value) - MAX(used, dsl_dir_phys(dd)->dd_reserved); + dsl_dir_phys(dd)->dd_reserved = value; + + if (dd->dd_parent != NULL) { + /* Roll up this additional usage into our ancestors */ + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + delta, 0, 0, tx); + } + mutex_exit(&dd->dd_lock); +} + +static void +dsl_dir_set_reservation_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dir_set_qr_arg_t *ddsqra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t newval; + + VERIFY0(dsl_dataset_hold(dp, ddsqra->ddsqra_name, FTAG, &ds)); + + if (spa_version(dp->dp_spa) >= SPA_VERSION_RECVD_PROPS) { + dsl_prop_set_sync_impl(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), + ddsqra->ddsqra_source, sizeof (ddsqra->ddsqra_value), 1, + &ddsqra->ddsqra_value, tx); + + VERIFY0(dsl_prop_get_int_ds(ds, + zfs_prop_to_name(ZFS_PROP_RESERVATION), &newval)); + } else { + newval = ddsqra->ddsqra_value; + spa_history_log_internal_ds(ds, "set", tx, "%s=%lld", + zfs_prop_to_name(ZFS_PROP_RESERVATION), + (longlong_t)newval); + } + + dsl_dir_set_reservation_sync_impl(ds->ds_dir, newval, tx); + dsl_dataset_rele(ds, FTAG); +} + +int +dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation) +{ + dsl_dir_set_qr_arg_t ddsqra; + + ddsqra.ddsqra_name = ddname; + ddsqra.ddsqra_source = source; + ddsqra.ddsqra_value = reservation; + + return (dsl_sync_task(ddname, dsl_dir_set_reservation_check, + dsl_dir_set_reservation_sync, &ddsqra, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +static dsl_dir_t * +closest_common_ancestor(dsl_dir_t *ds1, dsl_dir_t *ds2) +{ + for (; ds1; ds1 = ds1->dd_parent) { + dsl_dir_t *dd; + for (dd = ds2; dd; dd = dd->dd_parent) { + if (ds1 == dd) + return (dd); + } + } + return (NULL); +} + +/* + * If delta is applied to dd, how much of that delta would be applied to + * ancestor? Syncing context only. 
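+ * + * Worked example (hypothetical numbers): suppose dd has dd_reserved = + * 10M and dd_used_bytes = 12M. Freeing 4M (delta = -4M) lowers the + * charge seen by dd's parent from MAX(12M, 10M) = 12M to MAX(8M, 10M) + * = 10M, so only -2M of the delta is applied to the ancestor; the + * reservation absorbs the rest. would_change() repeats this + * computation at every level between dd and the given ancestor.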
+ */ +static int64_t +would_change(dsl_dir_t *dd, int64_t delta, dsl_dir_t *ancestor) +{ + if (dd == ancestor) + return (delta); + + mutex_enter(&dd->dd_lock); + delta = parent_delta(dd, dsl_dir_phys(dd)->dd_used_bytes, delta); + mutex_exit(&dd->dd_lock); + return (would_change(dd->dd_parent, delta, ancestor)); +} + +typedef struct dsl_dir_rename_arg { + const char *ddra_oldname; + const char *ddra_newname; + cred_t *ddra_cred; + proc_t *ddra_proc; +} dsl_dir_rename_arg_t; + +typedef struct dsl_valid_rename_arg { + int char_delta; + int nest_delta; +} dsl_valid_rename_arg_t; + +/* ARGSUSED */ +static int +dsl_valid_rename(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dsl_valid_rename_arg_t *dvra = arg; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + + dsl_dataset_name(ds, namebuf); + + ASSERT3U(strnlen(namebuf, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + int namelen = strlen(namebuf) + dvra->char_delta; + int depth = get_dataset_depth(namebuf) + dvra->nest_delta; + + if (namelen >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + if (dvra->nest_delta > 0 && depth >= zfs_max_dataset_nesting) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +dsl_dir_rename_check(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + dsl_valid_rename_arg_t dvra; + dsl_dataset_t *parentds; + objset_t *parentos; + const char *mynewname; + int error; + + /* target dir should exist */ + error = dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL); + if (error != 0) + return (error); + + /* new parent should exist */ + error = dsl_dir_hold(dp, ddra->ddra_newname, FTAG, + &newparent, &mynewname); + if (error != 0) { + dsl_dir_rele(dd, FTAG); + return (error); + } + + /* can't rename to different pool */ + if (dd->dd_pool != newparent->dd_pool) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EXDEV)); + } + + /* new name should not already exist */ + if (mynewname == NULL) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EEXIST)); + } + + /* can't rename below anything but filesystems (e.g.
no ZVOLs) */ + error = dsl_dataset_hold_obj(newparent->dd_pool, + dsl_dir_phys(newparent)->dd_head_dataset_obj, FTAG, &parentds); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + error = dmu_objset_from_ds(parentds, &parentos); + if (error != 0) { + dsl_dataset_rele(parentds, FTAG); + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + if (dmu_objset_type(parentos) != DMU_OST_ZFS) { + dsl_dataset_rele(parentds, FTAG); + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(ZFS_ERR_WRONG_PARENT)); + } + dsl_dataset_rele(parentds, FTAG); + + ASSERT3U(strnlen(ddra->ddra_newname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + ASSERT3U(strnlen(ddra->ddra_oldname, ZFS_MAX_DATASET_NAME_LEN), + <, ZFS_MAX_DATASET_NAME_LEN); + dvra.char_delta = strlen(ddra->ddra_newname) + - strlen(ddra->ddra_oldname); + dvra.nest_delta = get_dataset_depth(ddra->ddra_newname) + - get_dataset_depth(ddra->ddra_oldname); + + /* if the name length is growing, validate child name lengths */ + if (dvra.char_delta > 0 || dvra.nest_delta > 0) { + error = dmu_objset_find_dp(dp, dd->dd_object, dsl_valid_rename, + &dvra, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } + + if (dmu_tx_is_syncing(tx)) { + if (spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + /* + * Although this is the check function and we don't + * normally make on-disk changes in check functions, + * we need to do that here. + * + * Ensure this portion of the tree's counts have been + * initialized in case the new parent has limits set. + */ + dsl_dir_init_fs_ss_count(dd, tx); + } + } + + if (newparent != dd->dd_parent) { + /* is there enough space? 
*/ + uint64_t myspace = + MAX(dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_reserved); + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + if (dsl_dir_is_zapified(dd)) { + int err; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt); + if (err != ENOENT && err != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (err); + } + + /* + * have to add 1 for the filesystem itself that we're + * moving + */ + fs_cnt++; + + err = zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt); + if (err != ENOENT && err != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (err); + } + } + + /* check for encryption errors */ + error = dsl_dir_rename_crypt_check(dd, newparent); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EACCES)); + } + + /* no rename into our descendant */ + if (closest_common_ancestor(dd, newparent) == dd) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (SET_ERROR(EINVAL)); + } + + error = dsl_dir_transfer_possible(dd->dd_parent, + newparent, fs_cnt, ss_cnt, myspace, + ddra->ddra_cred, ddra->ddra_proc); + if (error != 0) { + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (error); + } + } + + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); + return (0); +} + +static void +dsl_dir_rename_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dir_rename_arg_t *ddra = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd, *newparent; + const char *mynewname; + objset_t *mos = dp->dp_meta_objset; + + VERIFY0(dsl_dir_hold(dp, ddra->ddra_oldname, FTAG, &dd, NULL)); + VERIFY0(dsl_dir_hold(dp, ddra->ddra_newname, FTAG, &newparent, + &mynewname)); + + /* Log this before we change the name. */ + spa_history_log_internal_dd(dd, "rename", tx, + "-> %s", ddra->ddra_newname); + + if (newparent != dd->dd_parent) { + objset_t *os = dd->dd_pool->dp_meta_objset; + uint64_t fs_cnt = 0; + uint64_t ss_cnt = 0; + + /* + * We already made sure the dd counts were initialized in the + * check function. 
+ */ + if (spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_FS_SS_LIMIT)) { + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_FILESYSTEM_COUNT, sizeof (fs_cnt), 1, + &fs_cnt)); + /* add 1 for the filesystem itself that we're moving */ + fs_cnt++; + + VERIFY0(zap_lookup(os, dd->dd_object, + DD_FIELD_SNAPSHOT_COUNT, sizeof (ss_cnt), 1, + &ss_cnt)); + } + + dsl_fs_ss_count_adjust(dd->dd_parent, -fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, fs_cnt, + DD_FIELD_FILESYSTEM_COUNT, tx); + + dsl_fs_ss_count_adjust(dd->dd_parent, -ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + dsl_fs_ss_count_adjust(newparent, ss_cnt, + DD_FIELD_SNAPSHOT_COUNT, tx); + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD, + -dsl_dir_phys(dd)->dd_used_bytes, + -dsl_dir_phys(dd)->dd_compressed_bytes, + -dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(newparent, DD_USED_CHILD, + dsl_dir_phys(dd)->dd_used_bytes, + dsl_dir_phys(dd)->dd_compressed_bytes, + dsl_dir_phys(dd)->dd_uncompressed_bytes, tx); + + if (dsl_dir_phys(dd)->dd_reserved > + dsl_dir_phys(dd)->dd_used_bytes) { + uint64_t unused_rsrv = dsl_dir_phys(dd)->dd_reserved - + dsl_dir_phys(dd)->dd_used_bytes; + + dsl_dir_diduse_space(dd->dd_parent, DD_USED_CHILD_RSRV, + -unused_rsrv, 0, 0, tx); + dsl_dir_diduse_space(newparent, DD_USED_CHILD_RSRV, + unused_rsrv, 0, 0, tx); + } + } + + dmu_buf_will_dirty(dd->dd_dbuf, tx); + + /* remove from old parent zapobj */ + VERIFY0(zap_remove(mos, + dsl_dir_phys(dd->dd_parent)->dd_child_dir_zapobj, + dd->dd_myname, tx)); + + (void) strlcpy(dd->dd_myname, mynewname, + sizeof (dd->dd_myname)); + dsl_dir_rele(dd->dd_parent, dd); + dsl_dir_phys(dd)->dd_parent_obj = newparent->dd_object; + VERIFY0(dsl_dir_hold_obj(dp, + newparent->dd_object, NULL, dd, &dd->dd_parent)); + + /* add to new parent zapobj */ + VERIFY0(zap_add(mos, dsl_dir_phys(newparent)->dd_child_dir_zapobj, + dd->dd_myname, 8, 1, &dd->dd_object, tx)); + + zvol_rename_minors(dp->dp_spa, ddra->ddra_oldname, + ddra->ddra_newname, B_TRUE); + + dsl_prop_notify_all(dd); + + dsl_dir_rele(newparent, FTAG); + dsl_dir_rele(dd, FTAG); +} + +int +dsl_dir_rename(const char *oldname, const char *newname) +{ + dsl_dir_rename_arg_t ddra; + + ddra.ddra_oldname = oldname; + ddra.ddra_newname = newname; + ddra.ddra_cred = CRED(); + ddra.ddra_proc = curproc; + + return (dsl_sync_task(oldname, + dsl_dir_rename_check, dsl_dir_rename_sync, &ddra, + 3, ZFS_SPACE_CHECK_RESERVED)); +} + +int +dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, + uint64_t fs_cnt, uint64_t ss_cnt, uint64_t space, + cred_t *cr, proc_t *proc) +{ + dsl_dir_t *ancestor; + int64_t adelta; + uint64_t avail; + int err; + + ancestor = closest_common_ancestor(sdd, tdd); + adelta = would_change(sdd, -space, ancestor); + avail = dsl_dir_space_available(tdd, ancestor, adelta, FALSE); + if (avail < space) + return (SET_ERROR(ENOSPC)); + + err = dsl_fs_ss_limit_check(tdd, fs_cnt, ZFS_PROP_FILESYSTEM_LIMIT, + ancestor, cr, proc); + if (err != 0) + return (err); + err = dsl_fs_ss_limit_check(tdd, ss_cnt, ZFS_PROP_SNAPSHOT_LIMIT, + ancestor, cr, proc); + if (err != 0) + return (err); + + return (0); +} + +inode_timespec_t +dsl_dir_snap_cmtime(dsl_dir_t *dd) +{ + inode_timespec_t t; + + mutex_enter(&dd->dd_lock); + t = dd->dd_snap_cmtime; + mutex_exit(&dd->dd_lock); + + return (t); +} + +void +dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +{ + inode_timespec_t t; + + gethrestime(&t); + mutex_enter(&dd->dd_lock); + dd->dd_snap_cmtime = t; + mutex_exit(&dd->dd_lock); +} + +void 
+dsl_dir_zapify(dsl_dir_t *dd, dmu_tx_t *tx) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + dmu_object_zapify(mos, dd->dd_object, DMU_OT_DSL_DIR, tx); +} + +boolean_t +dsl_dir_is_zapified(dsl_dir_t *dd) +{ + dmu_object_info_t doi; + + dmu_object_info_from_db(dd->dd_dbuf, &doi); + return (doi.doi_type == DMU_OTN_ZAP_METADATA); +} + +void +dsl_dir_livelist_open(dsl_dir_t *dd, uint64_t obj) +{ + objset_t *mos = dd->dd_pool->dp_meta_objset; + ASSERT(spa_feature_is_active(dd->dd_pool->dp_spa, + SPA_FEATURE_LIVELIST)); + dsl_deadlist_open(&dd->dd_livelist, mos, obj); + bplist_create(&dd->dd_pending_allocs); + bplist_create(&dd->dd_pending_frees); +} + +void +dsl_dir_livelist_close(dsl_dir_t *dd) +{ + dsl_deadlist_close(&dd->dd_livelist); + bplist_destroy(&dd->dd_pending_allocs); + bplist_destroy(&dd->dd_pending_frees); +} + +void +dsl_dir_remove_livelist(dsl_dir_t *dd, dmu_tx_t *tx, boolean_t total) +{ + uint64_t obj; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + livelist_condense_entry_t to_condense = spa->spa_to_condense; + + if (!dsl_deadlist_is_open(&dd->dd_livelist)) + return; + + /* + * If the livelist being removed is set to be condensed, stop the + * condense zthr and indicate the cancellation in the spa_to_condense + * struct in case the condense no-wait synctask has already started + */ + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL && + (to_condense.ds != NULL) && (to_condense.ds->ds_dir == dd)) { + /* + * We use zthr_wait_cycle_done instead of zthr_cancel + * because we don't want to destroy the zthr, just have + * it skip its current task. + */ + spa->spa_to_condense.cancelled = B_TRUE; + zthr_wait_cycle_done(ll_condense_thread); + /* + * If we've returned from zthr_wait_cycle_done without + * clearing the to_condense data structure it's either + * because the no-wait synctask has started (which is + * indicated by 'syncing' field of to_condense) and we + * can expect it to clear to_condense on its own. + * Otherwise, we returned before the zthr ran. The + * checkfunc will now fail as cancelled == B_TRUE so we + * can safely NULL out ds, allowing a different dir's + * livelist to be condensed. + * + * We can be sure that the to_condense struct will not + * be repopulated at this stage because both this + * function and dsl_livelist_try_condense execute in + * syncing context. 
+ */ + if ((spa->spa_to_condense.ds != NULL) && + !spa->spa_to_condense.syncing) { + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, + spa); + spa->spa_to_condense.ds = NULL; + } + } + + dsl_dir_livelist_close(dd); + VERIFY0(zap_lookup(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, sizeof (uint64_t), 1, &obj)); + VERIFY0(zap_remove(dp->dp_meta_objset, dd->dd_object, + DD_FIELD_LIVELIST, tx)); + if (total) { + dsl_deadlist_free(dp->dp_meta_objset, obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + } +} + +static int +dsl_dir_activity_in_progress(dsl_dir_t *dd, dsl_dataset_t *ds, + zfs_wait_activity_t activity, boolean_t *in_progress) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&dd->dd_activity_lock)); + + switch (activity) { + case ZFS_WAIT_DELETEQ: { +#ifdef _KERNEL + objset_t *os; + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + break; + + mutex_enter(&os->os_user_ptr_lock); + void *user = dmu_objset_get_user(os); + mutex_exit(&os->os_user_ptr_lock); + if (dmu_objset_type(os) != DMU_OST_ZFS || + user == NULL || zfs_get_vfs_flag_unmounted(os)) { + *in_progress = B_FALSE; + return (0); + } + + uint64_t readonly = B_FALSE; + error = zfs_get_temporary_prop(ds, ZFS_PROP_READONLY, &readonly, + NULL); + + if (error != 0) + break; + + if (readonly || !spa_writeable(dd->dd_pool->dp_spa)) { + *in_progress = B_FALSE; + return (0); + } + + uint64_t count, unlinked_obj; + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &unlinked_obj); + if (error != 0) { + dsl_dataset_rele(ds, FTAG); + break; + } + error = zap_count(os, unlinked_obj, &count); + + if (error == 0) + *in_progress = (count != 0); + break; +#else + /* + * The delete queue is ZPL specific, and libzpool doesn't have + * it. It doesn't make sense to wait for it. + */ + *in_progress = B_FALSE; + break; +#endif + } + default: + panic("unrecognized value for activity %d", activity); + } + + return (error); +} + +int +dsl_dir_wait(dsl_dir_t *dd, dsl_dataset_t *ds, zfs_wait_activity_t activity, + boolean_t *waited) +{ + int error = 0; + boolean_t in_progress; + dsl_pool_t *dp = dd->dd_pool; + for (;;) { + dsl_pool_config_enter(dp, FTAG); + error = dsl_dir_activity_in_progress(dd, ds, activity, + &in_progress); + dsl_pool_config_exit(dp, FTAG); + if (error != 0 || !in_progress) + break; + + *waited = B_TRUE; + + if (cv_wait_sig(&dd->dd_activity_cv, &dd->dd_activity_lock) == + 0 || dd->dd_activity_cancelled) { + error = SET_ERROR(EINTR); + break; + } + } + return (error); +} + +void +dsl_dir_cancel_waiters(dsl_dir_t *dd) +{ + mutex_enter(&dd->dd_activity_lock); + dd->dd_activity_cancelled = B_TRUE; + cv_broadcast(&dd->dd_activity_cv); + while (dd->dd_activity_waiters > 0) + cv_wait(&dd->dd_activity_cv, &dd->dd_activity_lock); + mutex_exit(&dd->dd_activity_lock); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(dsl_dir_set_quota); +EXPORT_SYMBOL(dsl_dir_set_reservation); +#endif diff --git a/module/zfs/dsl_pool.c b/module/zfs/dsl_pool.c new file mode 100644 index 000000000000..3a2028625e8b --- /dev/null +++ b/module/zfs/dsl_pool.c @@ -0,0 +1,1384 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS Write Throttle + * ------------------ + * + * ZFS must limit the rate of incoming writes to the rate at which it is able + * to sync data modifications to the backend storage. Throttling by too much + * creates an artificial limit; throttling by too little can only be sustained + * for short periods and would lead to highly lumpy performance. On a per-pool + * basis, ZFS tracks the amount of modified (dirty) data. As operations change + * data, the amount of dirty data increases; as ZFS syncs out data, the amount + * of dirty data decreases. When the amount of dirty data exceeds a + * predetermined threshold further modifications are blocked until the amount + * of dirty data decreases (as data is synced out). + * + * The limit on dirty data is tunable, and should be adjusted according to + * both the IO capacity and available memory of the system. The larger the + * window, the more ZFS is able to aggregate and amortize metadata (and data) + * changes. However, memory is a limited resource, and allowing for more dirty + * data comes at the cost of keeping other useful data in memory (for example + * ZFS data cached by the ARC). + * + * Implementation + * + * As buffers are modified dsl_pool_willuse_space() increments both the per- + * txg (dp_dirty_pertxg[]) and poolwide (dp_dirty_total) accounting of + * dirty space used; dsl_pool_dirty_space() decrements those values as data + * is synced out from dsl_pool_sync(). While only the poolwide value is + * relevant, the per-txg value is useful for debugging. The tunable + * zfs_dirty_data_max determines the dirty space limit. Once that value is + * exceeded, new writes are halted until space frees up. + * + * The zfs_dirty_data_sync_percent tunable dictates the threshold at which we + * ensure that there is a txg syncing (see the comment in txg.c for a full + * description of transaction group stages). + * + * The IO scheduler uses both the dirty space limit and current amount of + * dirty data as inputs. Those values affect the number of concurrent IOs ZFS + * issues. See the comment in vdev_queue.c for details of the IO scheduler. + * + * The delay is also calculated based on the amount of dirty data. See the + * comment above dmu_tx_delay() for details. + */ + +/* + * zfs_dirty_data_max will be set to zfs_dirty_data_max_percent% of all memory, + * capped at zfs_dirty_data_max_max. It can also be overridden with a module + * parameter. 
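+ * + * Worked example (hypothetical machine, default percentages below): + * with 32 GiB of memory, zfs_dirty_data_max_percent = 10 yields an + * initial zfs_dirty_data_max of roughly 3.2 GiB, comfortably below the + * zfs_dirty_data_max_max ceiling computed at module load, so the limit + * stays at about 3.2 GiB unless overridden with the module parameter.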
+ */ +unsigned long zfs_dirty_data_max = 0; +unsigned long zfs_dirty_data_max_max = 0; +int zfs_dirty_data_max_percent = 10; +int zfs_dirty_data_max_max_percent = 25; + +/* + * If there's at least this much dirty data (as a percentage of + * zfs_dirty_data_max), push out a txg. This should be less than + * zfs_vdev_async_write_active_min_dirty_percent. + */ +int zfs_dirty_data_sync_percent = 20; + +/* + * Once there is this amount of dirty data, the dmu_tx_delay() will kick in + * and delay each transaction. + * This value should be >= zfs_vdev_async_write_active_max_dirty_percent. + */ +int zfs_delay_min_dirty_percent = 60; + +/* + * This controls how quickly the delay approaches infinity. + * Larger values cause it to delay more for a given amount of dirty data. + * Therefore larger values will cause there to be less dirty data for a + * given throughput. + * + * For the smoothest delay, this value should be about 1 billion divided + * by the maximum number of operations per second. This will smoothly + * handle between 10x and 1/10th this number. + * + * Note: zfs_delay_scale * zfs_dirty_data_max must be < 2^64, due to the + * multiply in dmu_tx_delay(). + */ +unsigned long zfs_delay_scale = 1000 * 1000 * 1000 / 2000; + +/* + * This determines the number of threads used by the dp_sync_taskq. + */ +int zfs_sync_taskq_batch_pct = 75; + +/* + * These tunables determine the behavior of how zil_itxg_clean() is + * called via zil_clean() in the context of spa_sync(). When an itxg + * list needs to be cleaned, TQ_NOSLEEP will be used when dispatching. + * If the dispatch fails, the call to zil_itxg_clean() will occur + * synchronously in the context of spa_sync(), which can negatively + * impact the performance of spa_sync() (e.g. in the case of the itxg + * list having a large number of itxs that needs to be cleaned). + * + * Thus, these tunables can be used to manipulate the behavior of the + * taskq used by zil_clean(); they determine the number of taskq entries + * that are pre-populated when the taskq is first created (via the + * "zfs_zil_clean_taskq_minalloc" tunable) and the maximum number of + * taskq entries that are cached after an on-demand allocation (via the + * "zfs_zil_clean_taskq_maxalloc"). + * + * The idea being, we want to try reasonably hard to ensure there will + * already be a taskq entry pre-allocated by the time that it is needed + * by zil_clean(). This way, we can avoid the possibility of an + * on-demand allocation of a new taskq entry from failing, which would + * result in zil_itxg_clean() being called synchronously from zil_clean() + * (which can adversely affect performance of spa_sync()). + * + * Additionally, the number of threads used by the taskq can be + * configured via the "zfs_zil_clean_taskq_nthr_pct" tunable. 
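+ * + * For example, with the defaults below (minalloc = 1024, maxalloc = + * 1024 * 1024), the taskq starts with 1024 entries pre-allocated, and + * entries allocated on demand beyond that are cached for reuse, up to + * the maxalloc bound, rather than freed immediately; nthr_pct = 100 + * sizes the thread count relative to the number of CPUs (see the + * TASKQ_THREADS_CPU_PCT flag at the taskq_create() call site).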
+ */ +int zfs_zil_clean_taskq_nthr_pct = 100; +int zfs_zil_clean_taskq_minalloc = 1024; +int zfs_zil_clean_taskq_maxalloc = 1024 * 1024; + +int +dsl_pool_open_special_dir(dsl_pool_t *dp, const char *name, dsl_dir_t **ddp) +{ + uint64_t obj; + int err; + + err = zap_lookup(dp->dp_meta_objset, + dsl_dir_phys(dp->dp_root_dir)->dd_child_dir_zapobj, + name, sizeof (obj), 1, &obj); + if (err) + return (err); + + return (dsl_dir_hold_obj(dp, obj, name, dp, ddp)); +} + +static dsl_pool_t * +dsl_pool_open_impl(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp; + blkptr_t *bp = spa_get_rootblkptr(spa); + + dp = kmem_zalloc(sizeof (dsl_pool_t), KM_SLEEP); + dp->dp_spa = spa; + dp->dp_meta_rootbp = *bp; + rrw_init(&dp->dp_config_rwlock, B_TRUE); + txg_init(dp, txg); + mmp_init(spa); + + txg_list_create(&dp->dp_dirty_datasets, spa, + offsetof(dsl_dataset_t, ds_dirty_link)); + txg_list_create(&dp->dp_dirty_zilogs, spa, + offsetof(zilog_t, zl_dirty_link)); + txg_list_create(&dp->dp_dirty_dirs, spa, + offsetof(dsl_dir_t, dd_dirty_link)); + txg_list_create(&dp->dp_sync_tasks, spa, + offsetof(dsl_sync_task_t, dst_node)); + txg_list_create(&dp->dp_early_sync_tasks, spa, + offsetof(dsl_sync_task_t, dst_node)); + + dp->dp_sync_taskq = taskq_create("dp_sync_taskq", + zfs_sync_taskq_batch_pct, minclsyspri, 1, INT_MAX, + TASKQ_THREADS_CPU_PCT); + + dp->dp_zil_clean_taskq = taskq_create("dp_zil_clean_taskq", + zfs_zil_clean_taskq_nthr_pct, minclsyspri, + zfs_zil_clean_taskq_minalloc, + zfs_zil_clean_taskq_maxalloc, + TASKQ_PREPOPULATE | TASKQ_THREADS_CPU_PCT); + + mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dp->dp_spaceavail_cv, NULL, CV_DEFAULT, NULL); + + dp->dp_zrele_taskq = taskq_create("z_zrele", boot_ncpus, defclsyspri, + boot_ncpus * 8, INT_MAX, TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + dp->dp_unlinked_drain_taskq = taskq_create("z_unlinked_drain", + boot_ncpus, defclsyspri, boot_ncpus, INT_MAX, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + + return (dp); +} + +int +dsl_pool_init(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + + /* + * Initialize the caller's dsl_pool_t structure before we actually open + * the meta objset. This is done because a self-healing write zio may + * be issued as part of dmu_objset_open_impl() and the spa needs its + * dsl_pool_t initialized in order to handle the write. 
+ */ + *dpp = dp; + + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); + if (err != 0) { + dsl_pool_close(dp); + *dpp = NULL; + } + + return (err); +} + +int +dsl_pool_open(dsl_pool_t *dp) +{ + int err; + dsl_dir_t *dd; + dsl_dataset_t *ds; + uint64_t obj; + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, + &dp->dp_root_dir_obj); + if (err) + goto out; + + err = dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir); + if (err) + goto out; + + err = dsl_pool_open_special_dir(dp, MOS_DIR_NAME, &dp->dp_mos_dir); + if (err) + goto out; + + if (spa_version(dp->dp_spa) >= SPA_VERSION_ORIGIN) { + err = dsl_pool_open_special_dir(dp, ORIGIN_DIR_NAME, &dd); + if (err) + goto out; + err = dsl_dataset_hold_obj(dp, + dsl_dir_phys(dd)->dd_head_dataset_obj, FTAG, &ds); + if (err == 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); + dsl_dataset_rele(ds, FTAG); + } + dsl_dir_rele(dd, dp); + if (err) + goto out; + } + + if (spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + err = dsl_pool_open_special_dir(dp, FREE_DIR_NAME, + &dp->dp_free_dir); + if (err) + goto out; + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err) + goto out; + VERIFY0(bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj); + if (err == 0) { + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, + dp->dp_meta_objset, obj)); + } else if (err == ENOENT) { + /* + * We might not have created the remap bpobj yet. + */ + err = 0; + } else { + goto out; + } + } + + /* + * Note: errors ignored, because these special dirs, used for + * space accounting, are only created on demand. + */ + (void) dsl_pool_open_special_dir(dp, LEAK_DIR_NAME, + &dp->dp_leak_dir); + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, sizeof (uint64_t), 1, + &dp->dp_bptree_obj); + if (err != 0) + goto out; + } + + if (spa_feature_is_active(dp->dp_spa, SPA_FEATURE_EMPTY_BPOBJ)) { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_EMPTY_BPOBJ, sizeof (uint64_t), 1, + &dp->dp_empty_bpobj); + if (err != 0) + goto out; + } + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, + &dp->dp_tmp_userrefs_obj); + if (err == ENOENT) + err = 0; + if (err) + goto out; + + err = dsl_scan_init(dp, dp->dp_tx.tx_open_txg); + +out: + rrw_exit(&dp->dp_config_rwlock, FTAG); + return (err); +} + +void +dsl_pool_close(dsl_pool_t *dp) +{ + /* + * Drop our references from dsl_pool_open(). + * + * Since we held the origin_snap from "syncing" context (which + * includes pool-opening context), it actually only got a "ref" + * and not a hold, so just drop that here.
+ */ + if (dp->dp_origin_snap != NULL) + dsl_dataset_rele(dp->dp_origin_snap, dp); + if (dp->dp_mos_dir != NULL) + dsl_dir_rele(dp->dp_mos_dir, dp); + if (dp->dp_free_dir != NULL) + dsl_dir_rele(dp->dp_free_dir, dp); + if (dp->dp_leak_dir != NULL) + dsl_dir_rele(dp->dp_leak_dir, dp); + if (dp->dp_root_dir != NULL) + dsl_dir_rele(dp->dp_root_dir, dp); + + bpobj_close(&dp->dp_free_bpobj); + bpobj_close(&dp->dp_obsolete_bpobj); + + /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ + if (dp->dp_meta_objset != NULL) + dmu_objset_evict(dp->dp_meta_objset); + + txg_list_destroy(&dp->dp_dirty_datasets); + txg_list_destroy(&dp->dp_dirty_zilogs); + txg_list_destroy(&dp->dp_sync_tasks); + txg_list_destroy(&dp->dp_early_sync_tasks); + txg_list_destroy(&dp->dp_dirty_dirs); + + taskq_destroy(dp->dp_zil_clean_taskq); + taskq_destroy(dp->dp_sync_taskq); + + /* + * We can't set retry to TRUE since we're explicitly specifying + * a spa to flush. This is good enough; any missed buffers for + * this spa won't cause trouble, and they'll eventually fall + * out of the ARC just like any other unused buffer. + */ + arc_flush(dp->dp_spa, FALSE); + + mmp_fini(dp->dp_spa); + txg_fini(dp); + dsl_scan_fini(dp); + dmu_buf_user_evict_wait(); + + rrw_destroy(&dp->dp_config_rwlock); + mutex_destroy(&dp->dp_lock); + cv_destroy(&dp->dp_spaceavail_cv); + taskq_destroy(dp->dp_unlinked_drain_taskq); + taskq_destroy(dp->dp_zrele_taskq); + if (dp->dp_blkstats != NULL) { + mutex_destroy(&dp->dp_blkstats->zab_lock); + vmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + } + kmem_free(dp, sizeof (dsl_pool_t)); +} + +void +dsl_pool_create_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t obj; + /* + * Currently, we only create the obsolete_bpobj where there are + * indirect vdevs with referenced mappings. 
+ */ + ASSERT(spa_feature_is_active(dp->dp_spa, SPA_FEATURE_DEVICE_REMOVAL)); + /* create and open the obsolete_bpobj */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY0(bpobj_open(&dp->dp_obsolete_bpobj, dp->dp_meta_objset, obj)); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + spa_feature_incr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); +} + +void +dsl_pool_destroy_obsolete_bpobj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + spa_feature_decr(dp->dp_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ, tx)); + bpobj_free(dp->dp_meta_objset, + dp->dp_obsolete_bpobj.bpo_object, tx); + bpobj_close(&dp->dp_obsolete_bpobj); +} + +dsl_pool_t * +dsl_pool_create(spa_t *spa, nvlist_t *zplprops, dsl_crypto_params_t *dcp, + uint64_t txg) +{ + int err; + dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); +#ifdef _KERNEL + objset_t *os; +#else + objset_t *os __attribute__((unused)); +#endif + dsl_dataset_t *ds; + uint64_t obj; + + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + + /* create and open the MOS (meta-objset) */ + dp->dp_meta_objset = dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); + spa->spa_meta_objset = dp->dp_meta_objset; + + /* create the pool directory */ + err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_OT_OBJECT_DIRECTORY, DMU_OT_NONE, 0, tx); + ASSERT0(err); + + /* Initialize scan structures */ + VERIFY0(dsl_scan_init(dp, txg)); + + /* create and open the root dir */ + dp->dp_root_dir_obj = dsl_dir_create_sync(dp, NULL, NULL, tx); + VERIFY0(dsl_dir_hold_obj(dp, dp->dp_root_dir_obj, + NULL, dp, &dp->dp_root_dir)); + + /* create and open the meta-objset dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, MOS_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + MOS_DIR_NAME, &dp->dp_mos_dir)); + + if (spa_version(spa) >= SPA_VERSION_DEADLISTS) { + /* create and open the free dir */ + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + FREE_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* create and open the free_bplist */ + obj = bpobj_alloc(dp->dp_meta_objset, SPA_OLD_MAXBLOCKSIZE, tx); + VERIFY(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx) == 0); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, + dp->dp_meta_objset, obj)); + } + + if (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) + dsl_pool_create_origin(dp, tx); + + /* + * Some features may be needed when creating the root dataset, so we + * create the feature objects here. 
+ */ + if (spa_version(spa) >= SPA_VERSION_FEATURES) + spa_feature_create_zap_objects(spa, tx); + + if (dcp != NULL && dcp->cp_crypt != ZIO_CRYPT_OFF && + dcp->cp_crypt != ZIO_CRYPT_INHERIT) + spa_feature_enable(spa, SPA_FEATURE_ENCRYPTION, tx); + + /* create the root dataset */ + obj = dsl_dataset_create_sync_dd(dp->dp_root_dir, NULL, dcp, 0, tx); + + /* create the root objset */ + VERIFY0(dsl_dataset_hold_obj_flags(dp, obj, + DS_HOLD_FLAG_DECRYPT, FTAG, &ds)); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + os = dmu_objset_create_impl(dp->dp_spa, ds, + dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); +#ifdef _KERNEL + zfs_create_fs(os, kcred, zplprops, tx); +#endif + dsl_dataset_rele_flags(ds, DS_HOLD_FLAG_DECRYPT, FTAG); + + dmu_tx_commit(tx); + + rrw_exit(&dp->dp_config_rwlock, FTAG); + + return (dp); +} + +/* + * Account for the meta-objset space in its placeholder dsl_dir. + */ +void +dsl_pool_mos_diduse_space(dsl_pool_t *dp, + int64_t used, int64_t comp, int64_t uncomp) +{ + ASSERT3U(comp, ==, uncomp); /* it's all metadata */ + mutex_enter(&dp->dp_lock); + dp->dp_mos_used_delta += used; + dp->dp_mos_compressed_delta += comp; + dp->dp_mos_uncompressed_delta += uncomp; + mutex_exit(&dp->dp_lock); +} + +static void +dsl_pool_sync_mos(dsl_pool_t *dp, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + dmu_objset_sync(dp->dp_meta_objset, zio, tx); + VERIFY0(zio_wait(zio)); + dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); + spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp); +} + +static void +dsl_pool_dirty_delta(dsl_pool_t *dp, int64_t delta) +{ + ASSERT(MUTEX_HELD(&dp->dp_lock)); + + if (delta < 0) + ASSERT3U(-delta, <=, dp->dp_dirty_total); + + dp->dp_dirty_total += delta; + + /* + * Note: we signal even when increasing dp_dirty_total. + * This ensures forward progress -- each thread wakes the next waiter. + */ + if (dp->dp_dirty_total < zfs_dirty_data_max) + cv_signal(&dp->dp_spaceavail_cv); +} + +#ifdef ZFS_DEBUG +static boolean_t +dsl_early_sync_task_verify(dsl_pool_t *dp, uint64_t txg) +{ + spa_t *spa = dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + txg_list_t *tl = &vd->vdev_ms_list; + metaslab_t *ms; + + for (ms = txg_list_head(tl, TXG_CLEAN(txg)); ms; + ms = txg_list_next(tl, ms, TXG_CLEAN(txg))) { + VERIFY(range_tree_is_empty(ms->ms_freeing)); + VERIFY(range_tree_is_empty(ms->ms_checkpointing)); + } + } + + return (B_TRUE); +} +#endif + +void +dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) +{ + zio_t *zio; + dmu_tx_t *tx; + dsl_dir_t *dd; + dsl_dataset_t *ds; + objset_t *mos = dp->dp_meta_objset; + list_t synced_datasets; + + list_create(&synced_datasets, sizeof (dsl_dataset_t), + offsetof(dsl_dataset_t, ds_synced_link)); + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Run all early sync tasks before writing out any dirty blocks. + * For more info on early sync tasks see block comment in + * dsl_early_sync_task(). + */ + if (!txg_list_empty(&dp->dp_early_sync_tasks, txg)) { + dsl_sync_task_t *dst; + + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = + txg_list_remove(&dp->dp_early_sync_tasks, txg)) != NULL) { + ASSERT(dsl_early_sync_task_verify(dp, txg)); + dsl_sync_task_sync(dst, tx); + } + ASSERT(dsl_early_sync_task_verify(dp, txg)); + } + + /* + * Write out all dirty blocks of dirty datasets. 
+ */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { + /* + * We must not sync any non-MOS datasets twice, because + * we may have taken a snapshot of them. However, we + * may sync newly-created datasets on pass 2. + */ + ASSERT(!list_link_active(&ds->ds_synced_link)); + list_insert_tail(&synced_datasets, ds); + dsl_dataset_sync(ds, zio, tx); + } + VERIFY0(zio_wait(zio)); + + /* + * Update the long range free counter after + * we're done syncing user data + */ + mutex_enter(&dp->dp_lock); + ASSERT(spa_sync_pass(dp->dp_spa) == 1 || + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] == 0); + dp->dp_long_free_dirty_pertxg[txg & TXG_MASK] = 0; + mutex_exit(&dp->dp_lock); + + /* + * After the data blocks have been written (ensured by the zio_wait() + * above), update the user/group/project space accounting. This happens + * in tasks dispatched to dp_sync_taskq, so wait for them before + * continuing. + */ + for (ds = list_head(&synced_datasets); ds != NULL; + ds = list_next(&synced_datasets, ds)) { + dmu_objset_do_userquota_updates(ds->ds_objset, tx); + } + taskq_wait(dp->dp_sync_taskq); + + /* + * Sync the datasets again to push out the changes due to + * userspace updates. This must be done before we process the + * sync tasks, so that any snapshots will have the correct + * user accounting information (and we won't get confused + * about which blocks are part of the snapshot). + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while ((ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) != NULL) { + objset_t *os = ds->ds_objset; + + ASSERT(list_link_active(&ds->ds_synced_link)); + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + + /* + * Release any key mappings created by calls to + * dsl_dataset_dirty() from the userquota accounting + * code paths. + */ + if (os->os_encrypted && !os->os_raw_receive && + !os->os_next_write_raw[txg & TXG_MASK]) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); + } + } + VERIFY0(zio_wait(zio)); + + /* + * Now that the datasets have been completely synced, we can + * clean up our in-memory structures accumulated while syncing: + * + * - move dead blocks from the pending deadlist and livelists + * to the on-disk versions + * - release hold from dsl_dataset_dirty() + * - release key mapping hold from dsl_dataset_dirty() + */ + while ((ds = list_remove_head(&synced_datasets)) != NULL) { + objset_t *os = ds->ds_objset; + + if (os->os_encrypted && !os->os_raw_receive && + !os->os_next_write_raw[txg & TXG_MASK]) { + ASSERT3P(ds->ds_key_mapping, !=, NULL); + key_mapping_rele(dp->dp_spa, ds->ds_key_mapping, ds); + } + + dsl_dataset_sync_done(ds, tx); + } + + while ((dd = txg_list_remove(&dp->dp_dirty_dirs, txg)) != NULL) { + dsl_dir_sync(dd, tx); + } + + /* + * The MOS's space is accounted for in the pool/$MOS + * (dp_mos_dir). We can't modify the mos while we're syncing + * it, so we remember the deltas and apply them here. 
+ */ + if (dp->dp_mos_used_delta != 0 || dp->dp_mos_compressed_delta != 0 || + dp->dp_mos_uncompressed_delta != 0) { + dsl_dir_diduse_space(dp->dp_mos_dir, DD_USED_HEAD, + dp->dp_mos_used_delta, + dp->dp_mos_compressed_delta, + dp->dp_mos_uncompressed_delta, tx); + dp->dp_mos_used_delta = 0; + dp->dp_mos_compressed_delta = 0; + dp->dp_mos_uncompressed_delta = 0; + } + + if (dmu_objset_is_dirty(mos, txg)) { + dsl_pool_sync_mos(dp, tx); + } + + /* + * We have written all of the accounted dirty data, so our + * dp_space_towrite should now be zero. However, some seldom-used + * code paths do not adhere to this (e.g. dbuf_undirty()). Shore up + * the accounting of any dirtied space now. + * + * Note that, besides any dirty data from datasets, the amount of + * dirty data in the MOS is also accounted by the pool. Therefore, + * we want to do this cleanup after dsl_pool_sync_mos() so we don't + * attempt to update the accounting for the same dirty data twice. + * (i.e. at this point we only update the accounting for the space + * that we know that we "leaked"). + */ + dsl_pool_undirty_space(dp, dp->dp_dirty_pertxg[txg & TXG_MASK], txg); + + /* + * If we modify a dataset in the same txg that we want to destroy it, + * its dsl_dir's dd_dbuf will be dirty, and thus have a hold on it. + * dsl_dir_destroy_check() will fail if there are unexpected holds. + * Therefore, we want to sync the MOS (thus syncing the dd_dbuf + * and clearing the hold on it) before we process the sync_tasks. + * The MOS data dirtied by the sync_tasks will be synced on the next + * pass. + */ + if (!txg_list_empty(&dp->dp_sync_tasks, txg)) { + dsl_sync_task_t *dst; + /* + * No more sync tasks should have been added while we + * were syncing. + */ + ASSERT3U(spa_sync_pass(dp->dp_spa), ==, 1); + while ((dst = txg_list_remove(&dp->dp_sync_tasks, txg)) != NULL) + dsl_sync_task_sync(dst, tx); + } + + dmu_tx_commit(tx); + + DTRACE_PROBE2(dsl_pool_sync__done, dsl_pool_t *dp, dp, uint64_t, txg); +} + +void +dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) +{ + zilog_t *zilog; + + while ((zilog = txg_list_head(&dp->dp_dirty_zilogs, txg))) { + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + /* + * We don't remove the zilog from the dp_dirty_zilogs + * list until after we've cleaned it. This ensures that + * callers of zilog_is_dirty() receive an accurate + * answer when they are racing with the spa sync thread. + */ + zil_clean(zilog, txg); + (void) txg_list_remove_this(&dp->dp_dirty_zilogs, zilog, txg); + ASSERT(!dmu_objset_is_dirty(zilog->zl_os, txg)); + dmu_buf_rele(ds->ds_dbuf, zilog); + } + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); +} + +/* + * TRUE if the current thread is the tx_sync_thread or if we + * are being called from SPA context during pool initialization. + */ +int +dsl_pool_sync_context(dsl_pool_t *dp) +{ + return (curthread == dp->dp_tx.tx_sync_thread || + spa_is_initializing(dp->dp_spa) || + taskq_member(dp->dp_sync_taskq, curthread)); +} + +/* + * This function returns the amount of allocatable space in the pool + * minus whatever space is currently reserved by ZFS for specific + * purposes. Specifically: + * + * 1] Any reserved SLOP space + * 2] Any space used by the checkpoint + * 3] Any space used for deferred frees + * + * The latter 2 are especially important because they are needed to + * rectify the SPA's and DMU's different understanding of how much space + * is used. 
Now the DMU is aware of that extra space tracked by the SPA + * without having to maintain a separate special dir (e.g. similar to + * $MOS, $FREEING, and $LEAKED). + * + * Note: By deferred frees here, we mean the frees that were deferred + * in spa_sync() after sync pass 1 (spa_deferred_bpobj), and not the + * segments placed in ms_defer trees during metaslab_sync_done(). + */ +uint64_t +dsl_pool_adjustedsize(dsl_pool_t *dp, zfs_space_check_t slop_policy) +{ + spa_t *spa = dp->dp_spa; + uint64_t space, resv, adjustedsize; + uint64_t spa_deferred_frees = + spa->spa_deferred_bpobj.bpo_phys->bpo_bytes; + + space = spa_get_dspace(spa) + - spa_get_checkpoint_space(spa) - spa_deferred_frees; + resv = spa_get_slop_space(spa); + + switch (slop_policy) { + case ZFS_SPACE_CHECK_NORMAL: + break; + case ZFS_SPACE_CHECK_RESERVED: + resv >>= 1; + break; + case ZFS_SPACE_CHECK_EXTRA_RESERVED: + resv >>= 2; + break; + case ZFS_SPACE_CHECK_NONE: + resv = 0; + break; + default: + panic("invalid slop policy value: %d", slop_policy); + break; + } + adjustedsize = (space >= resv) ? (space - resv) : 0; + + return (adjustedsize); +} + +uint64_t +dsl_pool_unreserved_space(dsl_pool_t *dp, zfs_space_check_t slop_policy) +{ + uint64_t poolsize = dsl_pool_adjustedsize(dp, slop_policy); + uint64_t deferred = + metaslab_class_get_deferred(spa_normal_class(dp->dp_spa)); + uint64_t quota = (poolsize >= deferred) ? (poolsize - deferred) : 0; + return (quota); +} + +boolean_t +dsl_pool_need_dirty_delay(dsl_pool_t *dp) +{ + uint64_t delay_min_bytes = + zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100; + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + uint64_t dirty; + + mutex_enter(&dp->dp_lock); + dirty = dp->dp_dirty_total; + mutex_exit(&dp->dp_lock); + if (dirty > dirty_min_bytes) + txg_kick(dp); + return (dirty > delay_min_bytes); +} + +void +dsl_pool_dirty_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx) +{ + if (space > 0) { + mutex_enter(&dp->dp_lock); + dp->dp_dirty_pertxg[tx->tx_txg & TXG_MASK] += space; + dsl_pool_dirty_delta(dp, space); + mutex_exit(&dp->dp_lock); + } +} + +void +dsl_pool_undirty_space(dsl_pool_t *dp, int64_t space, uint64_t txg) +{ + ASSERT3S(space, >=, 0); + if (space == 0) + return; + + mutex_enter(&dp->dp_lock); + if (dp->dp_dirty_pertxg[txg & TXG_MASK] < space) { + /* XXX writing something we didn't dirty? */ + space = dp->dp_dirty_pertxg[txg & TXG_MASK]; + } + ASSERT3U(dp->dp_dirty_pertxg[txg & TXG_MASK], >=, space); + dp->dp_dirty_pertxg[txg & TXG_MASK] -= space; + ASSERT3U(dp->dp_dirty_total, >=, space); + dsl_pool_dirty_delta(dp, -space); + mutex_exit(&dp->dp_lock); +} + +/* ARGSUSED */ +static int +upgrade_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + dmu_tx_t *tx = arg; + dsl_dataset_t *ds, *prev = NULL; + int err; + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) + break; + dsl_dataset_rele(ds, FTAG); + ds = prev; + prev = NULL; + } + + if (prev == NULL) { + prev = dp->dp_origin_snap; + + /* + * The $ORIGIN can't have any data, or the accounting + * will be wrong.
+ */ + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + ASSERT0(dsl_dataset_phys(prev)->ds_bp.blk_birth); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + /* The origin doesn't get attached to itself */ + if (ds->ds_object == prev->ds_object) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_prev_snap_obj = prev->ds_object; + dsl_dataset_phys(ds)->ds_prev_snap_txg = + dsl_dataset_phys(prev)->ds_creation_txg; + + dmu_buf_will_dirty(ds->ds_dir->dd_dbuf, tx); + dsl_dir_phys(ds->ds_dir)->dd_origin_obj = prev->ds_object; + + dmu_buf_will_dirty(prev->ds_dbuf, tx); + dsl_dataset_phys(prev)->ds_num_children++; + + if (dsl_dataset_phys(ds)->ds_next_snap_obj == 0) { + ASSERT(ds->ds_prev == NULL); + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, + ds, &ds->ds_prev)); + } + } + + ASSERT3U(dsl_dir_phys(ds->ds_dir)->dd_origin_obj, ==, prev->ds_object); + ASSERT3U(dsl_dataset_phys(ds)->ds_prev_snap_obj, ==, prev->ds_object); + + if (dsl_dataset_phys(prev)->ds_next_clones_obj == 0) { + dmu_buf_will_dirty(prev->ds_dbuf, tx); + dsl_dataset_phys(prev)->ds_next_clones_obj = + zap_create(dp->dp_meta_objset, + DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); + } + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dataset_phys(prev)->ds_next_clones_obj, ds->ds_object, tx)); + + dsl_dataset_rele(ds, FTAG); + if (prev != dp->dp_origin_snap) + dsl_dataset_rele(prev, FTAG); + return (0); +} + +void +dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap != NULL); + + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, upgrade_clones_cb, + tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); +} + +/* ARGSUSED */ +static int +upgrade_dir_clones_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dmu_tx_t *tx = arg; + objset_t *mos = dp->dp_meta_objset; + + if (dsl_dir_phys(ds->ds_dir)->dd_origin_obj != 0) { + dsl_dataset_t *origin; + + VERIFY0(dsl_dataset_hold_obj(dp, + dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &origin)); + + if (dsl_dir_phys(origin->ds_dir)->dd_clones == 0) { + dmu_buf_will_dirty(origin->ds_dir->dd_dbuf, tx); + dsl_dir_phys(origin->ds_dir)->dd_clones = + zap_create(mos, DMU_OT_DSL_CLONES, DMU_OT_NONE, + 0, tx); + } + + VERIFY0(zap_add_int(dp->dp_meta_objset, + dsl_dir_phys(origin->ds_dir)->dd_clones, + ds->ds_object, tx)); + + dsl_dataset_rele(origin, FTAG); + } + return (0); +} + +void +dsl_pool_upgrade_dir_clones(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t obj; + + ASSERT(dmu_tx_is_syncing(tx)); + + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, FREE_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + FREE_DIR_NAME, &dp->dp_free_dir)); + + /* + * We can't use bpobj_alloc(), because spa_version() still + * returns the old version, and we need a new-version bpobj with + * subobj support. So call dmu_object_alloc() directly. 
+ */ + obj = dmu_object_alloc(dp->dp_meta_objset, DMU_OT_BPOBJ, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_BPOBJ_HDR, sizeof (bpobj_phys_t), tx); + VERIFY0(zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FREE_BPOBJ, sizeof (uint64_t), 1, &obj, tx)); + VERIFY0(bpobj_open(&dp->dp_free_bpobj, dp->dp_meta_objset, obj)); + + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + upgrade_dir_clones_cb, tx, DS_FIND_CHILDREN | DS_FIND_SERIALIZE)); +} + +void +dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) +{ + uint64_t dsobj; + dsl_dataset_t *ds; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dp->dp_origin_snap == NULL); + ASSERT(rrw_held(&dp->dp_config_rwlock, RW_WRITER)); + + /* create the origin dir, ds, & snap-ds */ + dsobj = dsl_dataset_create_sync(dp->dp_root_dir, ORIGIN_DIR_NAME, + NULL, 0, kcred, NULL, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + dsl_dataset_snapshot_sync_impl(ds, ORIGIN_DIR_NAME, tx); + VERIFY0(dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, + dp, &dp->dp_origin_snap)); + dsl_dataset_rele(ds, FTAG); +} + +taskq_t * +dsl_pool_zrele_taskq(dsl_pool_t *dp) +{ + return (dp->dp_zrele_taskq); +} + +taskq_t * +dsl_pool_unlinked_drain_taskq(dsl_pool_t *dp) +{ + return (dp->dp_unlinked_drain_taskq); +} + +/* + * Walk through the pool-wide zap object of temporary snapshot user holds + * and release them. + */ +void +dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) +{ + zap_attribute_t za; + zap_cursor_t zc; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + nvlist_t *holds; + + if (zapobj == 0) + return; + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + + holds = fnvlist_alloc(); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + char *htag; + nvlist_t *tags; + + htag = strchr(za.za_name, '-'); + *htag = '\0'; + ++htag; + if (nvlist_lookup_nvlist(holds, za.za_name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(holds, za.za_name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } + } + dsl_dataset_user_release_tmp(dp, holds); + fnvlist_free(holds); + zap_cursor_fini(&zc); +} + +/* + * Create the pool-wide zap object for storing temporary snapshot holds. + */ +static void +dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + + ASSERT(dp->dp_tmp_userrefs_obj == 0); + ASSERT(dmu_tx_is_syncing(tx)); + + dp->dp_tmp_userrefs_obj = zap_create_link(mos, DMU_OT_USERREFS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, tx); +} + +static int +dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t now, dmu_tx_t *tx, boolean_t holding) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + char *name; + int error; + + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * If the pool was created prior to SPA_VERSION_USERREFS, the + * zap object for temporary holds might not exist yet. 
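+ * + * Entries in this zap object are keyed by "<dsobj>-<tag>" (see the + * kmem_asprintf() below); e.g. a hypothetical hold tagged "recv" on + * dataset object 0x1234 would be stored under the name "1234-recv", + * with the hold's creation time as the 8-byte value.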
+ */ + if (zapobj == 0) { + if (holding) { + dsl_pool_user_hold_create_obj(dp, tx); + zapobj = dp->dp_tmp_userrefs_obj; + } else { + return (SET_ERROR(ENOENT)); + } + } + + name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); + if (holding) + error = zap_add(mos, zapobj, name, 8, 1, &now, tx); + else + error = zap_remove(mos, zapobj, name, tx); + kmem_strfree(name); + + return (error); +} + +/* + * Add a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + uint64_t now, dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); +} + +/* + * Release a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, 0, + tx, B_FALSE)); +} + +/* + * DSL Pool Configuration Lock + * + * The dp_config_rwlock protects against changes to DSL state (e.g. dataset + * creation / destruction / rename / property setting). It must be held for + * read to hold a dataset or dsl_dir. I.e. you must call + * dsl_pool_config_enter() or dsl_pool_hold() before calling + * dsl_{dataset,dir}_hold{_obj}. In most circumstances, the dp_config_rwlock + * must be held continuously until all datasets and dsl_dirs are released. + * + * The only exception to this rule is that if a "long hold" is placed on + * a dataset, then the dp_config_rwlock may be dropped while the dataset + * is still held. The long hold will prevent the dataset from being + * destroyed -- the destroy will fail with EBUSY. A long hold can be + * obtained by calling dsl_dataset_long_hold(), or by "owning" a dataset + * (by calling dsl_{dataset,objset}_{try}own{_obj}). + * + * Legitimate long-holders (including owners) should be long-running, cancelable + * tasks that should cause "zfs destroy" to fail. This includes DMU + * consumers (i.e. a ZPL filesystem being mounted or ZVOL being open), + * "zfs send", and "zfs diff". There are several other long-holders whose + * uses are suboptimal (e.g. "zfs promote", and zil_suspend()). + * + * The usual formula for long-holding would be: + * dsl_pool_hold() + * dsl_dataset_hold() + * ... perform checks ... + * dsl_dataset_long_hold() + * dsl_pool_rele() + * ... perform long-running task ... + * dsl_dataset_long_rele() + * dsl_dataset_rele() + * + * Note that when the long hold is released, the dataset is still held but + * the pool is not held. The dataset may change arbitrarily during this time + * (e.g. it could be destroyed). Therefore you shouldn't do anything to the + * dataset except release it. + * + * User-initiated operations (e.g. ioctls, zfs_ioc_*()) are either read-only + * or modifying operations. + * + * Modifying operations should generally use dsl_sync_task(). The synctask + * infrastructure enforces proper locking strategy with respect to the + * dp_config_rwlock. See the comment above dsl_sync_task() for details. + * + * Read-only operations will manually hold the pool, then the dataset, obtain + * information from the dataset, then release the pool and dataset. + * dmu_objset_{hold,rele}() are convenience routines that also do the pool + * hold/rele. 
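The "usual formula" above, written out as code; a sketch with error handling elided and FTAG as the hold tag:

static void
long_hold_example(const char *name)
{
	dsl_pool_t *dp;
	dsl_dataset_t *ds;

	VERIFY0(dsl_pool_hold(name, FTAG, &dp));
	VERIFY0(dsl_dataset_hold(dp, name, FTAG, &ds));
	/* ... perform checks while the config lock is held ... */
	dsl_dataset_long_hold(ds, FTAG);
	dsl_pool_rele(dp, FTAG);	/* "zfs destroy" now fails with EBUSY */

	/* ... long-running, cancelable task ... */

	dsl_dataset_long_rele(ds, FTAG);
	dsl_dataset_rele(ds, FTAG);	/* the only safe thing left to do */
}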
+ */ + +int +dsl_pool_hold(const char *name, void *tag, dsl_pool_t **dp) +{ + spa_t *spa; + int error; + + error = spa_open(name, &spa, tag); + if (error == 0) { + *dp = spa_get_dsl(spa); + dsl_pool_config_enter(*dp, tag); + } + return (error); +} + +void +dsl_pool_rele(dsl_pool_t *dp, void *tag) +{ + dsl_pool_config_exit(dp, tag); + spa_close(dp->dp_spa, tag); +} + +void +dsl_pool_config_enter(dsl_pool_t *dp, void *tag) +{ + /* + * We use a "reentrant" reader-writer lock, but not reentrantly. + * + * The rrwlock can (with the track_all flag) track all reading threads, + * which is very useful for debugging which code path failed to release + * the lock, and for verifying that the *current* thread does hold + * the lock. + * + * (Unlike a rwlock, which knows that N threads hold it for + * read, but not *which* threads, so rw_held(RW_READER) returns TRUE + * if any thread holds it for read, even if this thread doesn't). + */ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter(&dp->dp_config_rwlock, RW_READER, tag); +} + +void +dsl_pool_config_enter_prio(dsl_pool_t *dp, void *tag) +{ + ASSERT(!rrw_held(&dp->dp_config_rwlock, RW_READER)); + rrw_enter_read_prio(&dp->dp_config_rwlock, tag); +} + +void +dsl_pool_config_exit(dsl_pool_t *dp, void *tag) +{ + rrw_exit(&dp->dp_config_rwlock, tag); +} + +boolean_t +dsl_pool_config_held(dsl_pool_t *dp) +{ + return (RRW_LOCK_HELD(&dp->dp_config_rwlock)); +} + +boolean_t +dsl_pool_config_held_writer(dsl_pool_t *dp) +{ + return (RRW_WRITE_HELD(&dp->dp_config_rwlock)); +} + +EXPORT_SYMBOL(dsl_pool_config_enter); +EXPORT_SYMBOL(dsl_pool_config_exit); + +/* BEGIN CSTYLED */ +/* zfs_dirty_data_max_percent only applied at module load in arc_init(). */ +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_percent, INT, ZMOD_RD, + "Max percent of RAM allowed to be dirty"); + +/* zfs_dirty_data_max_max_percent only applied at module load in arc_init(). */ +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max_percent, INT, ZMOD_RD, + "zfs_dirty_data_max upper bound as % of RAM"); + +ZFS_MODULE_PARAM(zfs, zfs_, delay_min_dirty_percent, INT, ZMOD_RW, + "Transaction delay threshold"); + +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max, ULONG, ZMOD_RW, + "Determines the dirty space limit"); + +/* zfs_dirty_data_max_max only applied at module load in arc_init(). */ +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_max_max, ULONG, ZMOD_RD, + "zfs_dirty_data_max upper bound in bytes"); + +ZFS_MODULE_PARAM(zfs, zfs_, dirty_data_sync_percent, INT, ZMOD_RW, + "Dirty data txg sync threshold as a percentage of zfs_dirty_data_max"); + +ZFS_MODULE_PARAM(zfs, zfs_, delay_scale, ULONG, ZMOD_RW, + "How quickly delay approaches infinity"); + +ZFS_MODULE_PARAM(zfs, zfs_, sync_taskq_batch_pct, INT, ZMOD_RW, + "Max percent of CPUs that are used to sync dirty data"); + +ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_nthr_pct, INT, ZMOD_RW, + "Max percent of CPUs that are used per dp_sync_taskq"); + +ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_minalloc, INT, ZMOD_RW, + "Number of taskq entries that are pre-populated"); + +ZFS_MODULE_PARAM(zfs_zil, zfs_zil_, clean_taskq_maxalloc, INT, ZMOD_RW, + "Max number of taskq entries that are cached"); +/* END CSTYLED */ diff --git a/module/zfs/dsl_prop.c b/module/zfs/dsl_prop.c new file mode 100644 index 000000000000..f6ff9ae47192 --- /dev/null +++ b/module/zfs/dsl_prop.c @@ -0,0 +1,1287 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). 
+ * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2013 Martin Matuska. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_prop.h" + +#define ZPROP_INHERIT_SUFFIX "$inherit" +#define ZPROP_RECVD_SUFFIX "$recvd" + +static int +dodefault(zfs_prop_t prop, int intsz, int numints, void *buf) +{ + /* + * The setonce properties are read-only, BUT they still + * have a default value that can be used as the initial + * value. + */ + if (prop == ZPROP_INVAL || + (zfs_prop_readonly(prop) && !zfs_prop_setonce(prop))) + return (SET_ERROR(ENOENT)); + + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + if (intsz != 1) + return (SET_ERROR(EOVERFLOW)); + (void) strncpy(buf, zfs_prop_default_string(prop), + numints); + } else { + if (intsz != 8 || numints < 1) + return (SET_ERROR(EOVERFLOW)); + + *(uint64_t *)buf = zfs_prop_default_numeric(prop); + } + + return (0); +} + +int +dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, + int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) +{ + int err; + dsl_dir_t *target = dd; + objset_t *mos = dd->dd_pool->dp_meta_objset; + zfs_prop_t prop; + boolean_t inheritable; + boolean_t inheriting = B_FALSE; + char *inheritstr; + char *recvdstr; + + ASSERT(dsl_pool_config_held(dd->dd_pool)); + + if (setpoint) + setpoint[0] = '\0'; + + prop = zfs_name_to_prop(propname); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + /* + * Note: dd may become NULL, therefore we shouldn't dereference it + * after this loop. + */ + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != target || snapshot) { + if (!inheritable) { + err = SET_ERROR(ENOENT); + break; + } + inheriting = B_TRUE; + } + + /* Check for a local value. */ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dir_name(dd, setpoint); + break; + } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. + */ + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, + inheritstr); + if (err != 0 && err != ENOENT) + break; + + if (err == ENOENT) { + /* Check for a received value. 
*/ + err = zap_lookup(mos, dsl_dir_phys(dd)->dd_props_zapobj, + recvdstr, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) { + if (inheriting) { + dsl_dir_name(dd, setpoint); + } else { + (void) strlcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD, + MAXNAMELEN); + } + } + break; + } + } + + /* + * If we found an explicit inheritance entry, err is zero even + * though we haven't yet found the value, so reinitializing err + * at the end of the loop (instead of at the beginning) ensures + * that err has a valid post-loop value. + */ + err = SET_ERROR(ENOENT); + } + + if (err == ENOENT) + err = dodefault(prop, intsz, numints, buf); + + kmem_strfree(inheritstr); + kmem_strfree(recvdstr); + + return (err); +} + +int +dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t inheritable; + uint64_t zapobj; + + ASSERT(dsl_pool_config_held(ds->ds_dir->dd_pool)); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + zapobj = dsl_dataset_phys(ds)->ds_props_obj; + + if (zapobj != 0) { + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int err; + + ASSERT(ds->ds_is_snapshot); + + /* Check for a local value. */ + err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + dsl_dataset_name(ds, setpoint); + return (err); + } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. + */ + if (inheritable) { + char *inheritstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, zapobj, inheritstr); + kmem_strfree(inheritstr); + if (err != 0 && err != ENOENT) + return (err); + } + + if (err == ENOENT) { + /* Check for a received value. */ + char *recvdstr = kmem_asprintf("%s%s", propname, + ZPROP_RECVD_SUFFIX); + err = zap_lookup(mos, zapobj, recvdstr, + intsz, numints, buf); + kmem_strfree(recvdstr); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + (void) strlcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD, + MAXNAMELEN); + return (err); + } + } + } + + return (dsl_prop_get_dd(ds->ds_dir, propname, + intsz, numints, buf, setpoint, ds->ds_is_snapshot)); +} + +static dsl_prop_record_t * +dsl_prop_record_find(dsl_dir_t *dd, const char *propname) +{ + dsl_prop_record_t *pr = NULL; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + for (pr = list_head(&dd->dd_props); + pr != NULL; pr = list_next(&dd->dd_props, pr)) { + if (strcmp(pr->pr_propname, propname) == 0) + break; + } + + return (pr); +} + +static dsl_prop_record_t * +dsl_prop_record_create(dsl_dir_t *dd, const char *propname) +{ + dsl_prop_record_t *pr; + + ASSERT(MUTEX_HELD(&dd->dd_lock)); + + pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP); + pr->pr_propname = spa_strdup(propname); + list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t), + offsetof(dsl_prop_cb_record_t, cbr_pr_node)); + list_insert_head(&dd->dd_props, pr); + + return (pr); +} + +void +dsl_prop_init(dsl_dir_t *dd) +{ + list_create(&dd->dd_props, sizeof (dsl_prop_record_t), + offsetof(dsl_prop_record_t, pr_node)); +} + +void +dsl_prop_fini(dsl_dir_t *dd) +{ + dsl_prop_record_t *pr; + + while ((pr = list_remove_head(&dd->dd_props)) != NULL) { + list_destroy(&pr->pr_cbs); + spa_strfree((char *)pr->pr_propname); + kmem_free(pr, sizeof (dsl_prop_record_t)); + } + list_destroy(&dd->dd_props); +} + +/* + * Register interest in the named property. 
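Distilled, the search order implemented by dsl_prop_get_dd() and dsl_prop_get_ds() above is: local value, then an explicit "$inherit" marker (which skips any received value and continues the walk), then a "$recvd" value, then the parent dsl_dir, and finally the compiled-in default via dodefault(). A standalone toy of one level of that decision, with a fabricated property table:

#include <stdio.h>
#include <string.h>

struct kv { const char *key; const char *val; };

static const char *
lookup(const struct kv *tbl, const char *key)
{
	for (; tbl->key != NULL; tbl++)
		if (strcmp(tbl->key, key) == 0)
			return (tbl->val);
	return (NULL);
}

int
main(void)
{
	/* This level explicitly inherits over a received value. */
	static const struct kv props[] = {
		{ "compression$inherit", "" },
		{ "compression$recvd", "lz4" },
		{ NULL, NULL }
	};
	const char *v;

	if ((v = lookup(props, "compression")) != NULL)
		printf("local: %s\n", v);
	else if (lookup(props, "compression$inherit") != NULL)
		printf("explicit inherit: skip $recvd, walk to parent\n");
	else if ((v = lookup(props, "compression$recvd")) != NULL)
		printf("received: %s\n", v);
	else
		printf("unset here: walk to parent\n");
	return (0);
}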
We'll call the callback + * once to notify it of the current property value, and again each time + * the property changes, until this callback is unregistered. + * + * Return 0 on success, errno if the prop is not an integer value. + */ +int +dsl_prop_register(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + uint64_t value; + dsl_prop_record_t *pr; + dsl_prop_cb_record_t *cbr; + int err; + dsl_pool_t *dp __maybe_unused = dd->dd_pool; + + ASSERT(dsl_pool_config_held(dp)); + + err = dsl_prop_get_int_ds(ds, propname, &value); + if (err != 0) + return (err); + + cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP); + cbr->cbr_ds = ds; + cbr->cbr_func = callback; + cbr->cbr_arg = cbarg; + + mutex_enter(&dd->dd_lock); + pr = dsl_prop_record_find(dd, propname); + if (pr == NULL) + pr = dsl_prop_record_create(dd, propname); + cbr->cbr_pr = pr; + list_insert_head(&pr->pr_cbs, cbr); + list_insert_head(&ds->ds_prop_cbs, cbr); + mutex_exit(&dd->dd_lock); + + cbr->cbr_func(cbr->cbr_arg, value); + return (0); +} + +int +dsl_prop_get(const char *dsname, const char *propname, + int intsz, int numints, void *buf, char *setpoint) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + + error = dsl_prop_get_ds(dmu_objset_ds(os), propname, + intsz, numints, buf, setpoint); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * Get the current property value. It may have changed by the time this + * function returns, so it is NOT safe to follow up with + * dsl_prop_register() and assume that the value has not changed in + * between. + * + * Return 0 on success, ENOENT if ddname is invalid. + */ +int +dsl_prop_get_integer(const char *ddname, const char *propname, + uint64_t *valuep, char *setpoint) +{ + return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); +} + +int +dsl_prop_get_int_ds(dsl_dataset_t *ds, const char *propname, + uint64_t *valuep) +{ + return (dsl_prop_get_ds(ds, propname, 8, 1, valuep, NULL)); +} + +/* + * Predict the effective value of the given special property if it were set with + * the given value and source. This is not a general purpose function. It exists + * only to handle the special requirements of the quota and reservation + * properties. The fact that these properties are non-inheritable greatly + * simplifies the prediction logic. + * + * Returns 0 on success, a positive error code on failure, or -1 if called with + * a property not handled by this function. + */ +int +dsl_prop_predict(dsl_dir_t *dd, const char *propname, + zprop_source_t source, uint64_t value, uint64_t *newvalp) +{ + zfs_prop_t prop = zfs_name_to_prop(propname); + objset_t *mos; + uint64_t zapobj; + uint64_t version; + char *recvdstr; + int err = 0; + + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_REFRESERVATION: + break; + default: + return (-1); + } + + mos = dd->dd_pool->dp_meta_objset; + zapobj = dsl_dir_phys(dd)->dd_props_zapobj; + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + version = spa_version(dd->dd_pool->dp_spa); + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + switch ((int)source) { + case ZPROP_SRC_NONE: + /* Revert to the received value, if any. 
*/ + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, newvalp); + if (err == ENOENT) + *newvalp = 0; + break; + case ZPROP_SRC_LOCAL: + *newvalp = value; + break; + case ZPROP_SRC_RECEIVED: + /* + * If there's no local setting, then the new received value will + * be the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); + if (err == ENOENT) + *newvalp = value; + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * We're clearing the received value, so the local setting (if + * it exists) remains the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, newvalp); + if (err == ENOENT) + *newvalp = 0; + break; + default: + panic("unexpected property source: %d", source); + } + + kmem_strfree(recvdstr); + + if (err == ENOENT) + return (0); + + return (err); +} + +/* + * Unregister this callback. Return 0 on success, ENOENT if ddname is + * invalid, or ENOMSG if no matching callback registered. + * + * NOTE: This function is no longer used internally but has been preserved + * to prevent breaking external consumers (Lustre, etc). + */ +int +dsl_prop_unregister(dsl_dataset_t *ds, const char *propname, + dsl_prop_changed_cb_t *callback, void *cbarg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (cbr = list_head(&ds->ds_prop_cbs); + cbr; cbr = list_next(&ds->ds_prop_cbs, cbr)) { + if (cbr->cbr_ds == ds && + cbr->cbr_func == callback && + cbr->cbr_arg == cbarg && + strcmp(cbr->cbr_pr->pr_propname, propname) == 0) + break; + } + + if (cbr == NULL) { + mutex_exit(&dd->dd_lock); + return (SET_ERROR(ENOMSG)); + } + + list_remove(&ds->ds_prop_cbs, cbr); + list_remove(&cbr->cbr_pr->pr_cbs, cbr); + mutex_exit(&dd->dd_lock); + kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); + + return (0); +} + +/* + * Unregister all callbacks that are registered with the + * given callback argument. + */ +void +dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg) +{ + dsl_prop_cb_record_t *cbr, *next_cbr; + + dsl_dir_t *dd = ds->ds_dir; + + mutex_enter(&dd->dd_lock); + next_cbr = list_head(&ds->ds_prop_cbs); + while (next_cbr != NULL) { + cbr = next_cbr; + next_cbr = list_next(&ds->ds_prop_cbs, cbr); + if (cbr->cbr_arg == cbarg) { + list_remove(&ds->ds_prop_cbs, cbr); + list_remove(&cbr->cbr_pr->pr_cbs, cbr); + kmem_free(cbr, sizeof (dsl_prop_cb_record_t)); + } + } + mutex_exit(&dd->dd_lock); +} + +boolean_t +dsl_prop_hascb(dsl_dataset_t *ds) +{ + return (!list_is_empty(&ds->ds_prop_cbs)); +} + +/* ARGSUSED */ +static int +dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_record_t *pr; + dsl_prop_cb_record_t *cbr; + + mutex_enter(&dd->dd_lock); + for (pr = list_head(&dd->dd_props); + pr; pr = list_next(&dd->dd_props, pr)) { + for (cbr = list_head(&pr->pr_cbs); cbr; + cbr = list_next(&pr->pr_cbs, cbr)) { + uint64_t value; + + /* + * Callback entries do not have holds on their + * datasets so that datasets with registered + * callbacks are still eligible for eviction. + * Unlike operations to update properties on a + * single dataset, we are performing a recursive + * descent of related head datasets. The caller + * of this function only has a dataset hold on + * the passed in head dataset, not the snapshots + * associated with this dataset. Without a hold, + * the dataset pointer within callback records + * for snapshots can be invalidated by eviction + * at any time. 
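A hypothetical caller of dsl_prop_predict() above, asking what the effective quota would become before committing a local setting (for the quota-class properties it handles, a local value always wins, so the prediction is simply the proposed value):

static void
predict_quota_example(dsl_dataset_t *ds)
{
	uint64_t newval;

	/* What would "zfs set quota=10G" leave in effect? */
	if (dsl_prop_predict(ds->ds_dir, zfs_prop_to_name(ZFS_PROP_QUOTA),
	    ZPROP_SRC_LOCAL, 10ULL << 30, &newval) == 0)
		ASSERT3U(newval, ==, 10ULL << 30);
}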
+ * + * Use dsl_dataset_try_add_ref() to verify + * that the dataset for a snapshot has not + * begun eviction processing and to prevent + * eviction from occurring for the duration of + * the callback. If the hold attempt fails, + * this object is already being evicted and the + * callback can be safely ignored. + */ + if (ds != cbr->cbr_ds && + !dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) + continue; + + if (dsl_prop_get_ds(cbr->cbr_ds, + cbr->cbr_pr->pr_propname, sizeof (value), 1, + &value, NULL) == 0) + cbr->cbr_func(cbr->cbr_arg, value); + + if (ds != cbr->cbr_ds) + dsl_dataset_rele(cbr->cbr_ds, FTAG); + } + } + mutex_exit(&dd->dd_lock); + + return (0); +} + +/* + * Update all property values for ddobj & its descendants. This is used + * when renaming the dir. + */ +void +dsl_prop_notify_all(dsl_dir_t *dd) +{ + dsl_pool_t *dp = dd->dd_pool; + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + (void) dmu_objset_find_dp(dp, dd->dd_object, dsl_prop_notify_all_cb, + NULL, DS_FIND_CHILDREN); +} + +static void +dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, + const char *propname, uint64_t value, int first) +{ + dsl_dir_t *dd; + dsl_prop_record_t *pr; + dsl_prop_cb_record_t *cbr; + objset_t *mos = dp->dp_meta_objset; + zap_cursor_t zc; + zap_attribute_t *za; + int err; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + err = dsl_dir_hold_obj(dp, ddobj, NULL, FTAG, &dd); + if (err) + return; + + if (!first) { + /* + * If the prop is set here, then this change is not + * being inherited here or below; stop the recursion. + */ + err = zap_contains(mos, dsl_dir_phys(dd)->dd_props_zapobj, + propname); + if (err == 0) { + dsl_dir_rele(dd, FTAG); + return; + } + ASSERT3U(err, ==, ENOENT); + } + + mutex_enter(&dd->dd_lock); + pr = dsl_prop_record_find(dd, propname); + if (pr != NULL) { + for (cbr = list_head(&pr->pr_cbs); cbr; + cbr = list_next(&pr->pr_cbs, cbr)) { + uint64_t propobj; + + /* + * cbr->cbr_ds may be invalidated due to eviction, + * requiring the use of dsl_dataset_try_add_ref(). + * See comment block in dsl_prop_notify_all_cb() + * for details. + */ + if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG)) + continue; + + propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj; + + /* + * If the property is not set on this ds, then it is + * inherited here; call the callback. 
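In miniature, the descent that dsl_prop_changed_notify() performs looks like the sketch below (toy types; the real code walks dd_child_dir_zapobj with a ZAP cursor): a subtree is pruned as soon as a child sets the property locally, because nothing at or beneath that child inherits the change.

struct dir {
	int has_local_value;
	struct dir **children;
	int nchildren;
};

static void
notify(struct dir *d, unsigned long long value, int first)
{
	if (!first && d->has_local_value)
		return;	/* change is not inherited here or below */
	/* ... invoke callbacks registered at this level ... */
	for (int i = 0; i < d->nchildren; i++)
		notify(d->children[i], value, 0);
}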
+ */ + if (propobj == 0 || + zap_contains(mos, propobj, propname) != 0) + cbr->cbr_func(cbr->cbr_arg, value); + + dsl_dataset_rele(cbr->cbr_ds, FTAG); + } + } + mutex_exit(&dd->dd_lock); + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, mos, + dsl_dir_phys(dd)->dd_child_dir_zapobj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + dsl_prop_changed_notify(dp, za->za_first_integer, + propname, value, FALSE); + } + kmem_free(za, sizeof (zap_attribute_t)); + zap_cursor_fini(&zc); + dsl_dir_rele(dd, FTAG); +} + +void +dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname, + zprop_source_t source, int intsz, int numints, const void *value, + dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj, intval, dummy, count; + int isint; + char valbuf[32]; + const char *valstr = NULL; + char *inheritstr; + char *recvdstr; + char *tbuf = NULL; + int err; + uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); + + isint = (dodefault(zfs_name_to_prop(propname), 8, 1, &intval) == 0); + + if (ds->ds_is_snapshot) { + ASSERT(version >= SPA_VERSION_SNAP_PROPS); + if (dsl_dataset_phys(ds)->ds_props_obj == 0 && + (source & ZPROP_SRC_NONE) == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_props_obj = + zap_create(mos, + DMU_OT_DSL_PROPS, DMU_OT_NONE, 0, tx); + } + zapobj = dsl_dataset_phys(ds)->ds_props_obj; + } else { + zapobj = dsl_dir_phys(ds->ds_dir)->dd_props_zapobj; + } + + /* If we are removing objects from a non-existent ZAP just return */ + if (zapobj == 0) + return; + + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + switch ((int)source) { + case ZPROP_SRC_NONE: + /* + * revert to received value, if any (inherit -S) + * - remove propname + * - remove propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + case ZPROP_SRC_LOCAL: + /* + * remove propname$inherit + * set propname -> value + */ + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + VERIFY0(zap_update(mos, zapobj, propname, + intsz, numints, value, tx)); + break; + case ZPROP_SRC_INHERITED: + /* + * explicitly inherit + * - remove propname + * - set propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + if (version >= SPA_VERSION_RECVD_PROPS && + dsl_prop_get_int_ds(ds, ZPROP_HAS_RECVD, &dummy) == 0) { + dummy = 0; + VERIFY0(zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx)); + } + break; + case ZPROP_SRC_RECEIVED: + /* + * set propname$recvd -> value + */ + err = zap_update(mos, zapobj, recvdstr, + intsz, numints, value, tx); + ASSERT(err == 0); + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): + /* + * clear local and received settings + * - remove propname + * - remove propname$inherit + * - remove propname$recvd + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + /* FALLTHRU */ + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * remove propname$recvd + */ + err = 
zap_remove(mos, zapobj, recvdstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); + } + + kmem_strfree(inheritstr); + kmem_strfree(recvdstr); + + /* + * If we are left with an empty snap zap we can destroy it. + * This will prevent unnecessary calls to zap_lookup() in + * the "zfs list" and "zfs get" code paths. + */ + if (ds->ds_is_snapshot && + zap_count(mos, zapobj, &count) == 0 && count == 0) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_dataset_phys(ds)->ds_props_obj = 0; + zap_destroy(mos, zapobj, tx); + } + + if (isint) { + VERIFY0(dsl_prop_get_int_ds(ds, propname, &intval)); + + if (ds->ds_is_snapshot) { + dsl_prop_cb_record_t *cbr; + /* + * It's a snapshot; nothing can inherit this + * property, so just look for callbacks on this + * ds here. + */ + mutex_enter(&ds->ds_dir->dd_lock); + for (cbr = list_head(&ds->ds_prop_cbs); cbr; + cbr = list_next(&ds->ds_prop_cbs, cbr)) { + if (strcmp(cbr->cbr_pr->pr_propname, + propname) == 0) + cbr->cbr_func(cbr->cbr_arg, intval); + } + mutex_exit(&ds->ds_dir->dd_lock); + } else { + dsl_prop_changed_notify(ds->ds_dir->dd_pool, + ds->ds_dir->dd_object, propname, intval, TRUE); + } + + (void) snprintf(valbuf, sizeof (valbuf), + "%lld", (longlong_t)intval); + valstr = valbuf; + } else { + if (source == ZPROP_SRC_LOCAL) { + valstr = value; + } else { + tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + if (dsl_prop_get_ds(ds, propname, 1, + ZAP_MAXVALUELEN, tbuf, NULL) == 0) + valstr = tbuf; + } + } + + spa_history_log_internal_ds(ds, (source == ZPROP_SRC_NONE || + source == ZPROP_SRC_INHERITED) ? "inherit" : "set", tx, + "%s=%s", propname, (valstr == NULL ? "" : valstr)); + + if (tbuf != NULL) + kmem_free(tbuf, ZAP_MAXVALUELEN); +} + +int +dsl_prop_set_int(const char *dsname, const char *propname, + zprop_source_t source, uint64_t value) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; + + fnvlist_add_uint64(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +int +dsl_prop_set_string(const char *dsname, const char *propname, + zprop_source_t source, const char *value) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; + + fnvlist_add_string(nvl, propname, value); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +int +dsl_prop_inherit(const char *dsname, const char *propname, + zprop_source_t source) +{ + nvlist_t *nvl = fnvlist_alloc(); + int error; + + fnvlist_add_boolean(nvl, propname); + error = dsl_props_set(dsname, source, nvl); + fnvlist_free(nvl); + return (error); +} + +int +dsl_props_set_check(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + uint64_t version; + nvpair_t *elem = NULL; + int err; + + err = dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds); + if (err != 0) + return (err); + + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + while ((elem = nvlist_next_nvpair(dpsa->dpsa_props, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENAMETOOLONG)); + } + if (nvpair_type(elem) == DATA_TYPE_STRING) { + char *valstr = fnvpair_value_string(elem); + if (strlen(valstr) >= (version < + SPA_VERSION_STMF_PROP ? 
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(E2BIG)); + } + } + } + + if (ds->ds_is_snapshot && version < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (SET_ERROR(ENOTSUP)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +void +dsl_props_set_sync_impl(dsl_dataset_t *ds, zprop_source_t source, + nvlist_t *props, dmu_tx_t *tx) +{ + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; + const char *name = nvpair_name(pair); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * This usually happens when we reuse the nvlist_t data + * returned by the counterpart dsl_prop_get_all_impl(). + * For instance we do this to restore the original + * received properties when an error occurs in the + * zfs_ioc_recv() codepath. + */ + nvlist_t *attrs = fnvpair_value_nvlist(pair); + pair = fnvlist_lookup_nvpair(attrs, ZPROP_VALUE); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + const char *value = fnvpair_value_string(pair); + dsl_prop_set_sync_impl(ds, name, + source, 1, strlen(value) + 1, value, tx); + } else if (nvpair_type(pair) == DATA_TYPE_UINT64) { + uint64_t intval = fnvpair_value_uint64(pair); + dsl_prop_set_sync_impl(ds, name, + source, sizeof (intval), 1, &intval, tx); + } else if (nvpair_type(pair) == DATA_TYPE_BOOLEAN) { + dsl_prop_set_sync_impl(ds, name, + source, 0, 0, NULL, tx); + } else { + panic("invalid nvpair type"); + } + } +} + +void +dsl_props_set_sync(void *arg, dmu_tx_t *tx) +{ + dsl_props_set_arg_t *dpsa = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, dpsa->dpsa_dsname, FTAG, &ds)); + dsl_props_set_sync_impl(ds, dpsa->dpsa_source, dpsa->dpsa_props, tx); + dsl_dataset_rele(ds, FTAG); +} + +/* + * All-or-nothing; if any prop can't be set, nothing will be modified. + */ +int +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) +{ + dsl_props_set_arg_t dpsa; + int nblks = 0; + + dpsa.dpsa_dsname = dsname; + dpsa.dpsa_source = source; + dpsa.dpsa_props = props; + + /* + * If the source includes NONE, then we will only be removing entries + * from the ZAP object. In that case don't check for ENOSPC. + */ + if ((source & ZPROP_SRC_NONE) == 0) + nblks = 2 * fnvlist_num_pairs(props); + + return (dsl_sync_task(dsname, dsl_props_set_check, dsl_props_set_sync, + &dpsa, nblks, ZFS_SPACE_CHECK_RESERVED)); +} + +typedef enum dsl_prop_getflags { + DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ + DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ + DSL_PROP_GET_LOCAL = 0x4, /* local properties */ + DSL_PROP_GET_RECEIVED = 0x8, /* received properties */ +} dsl_prop_getflags_t; + +static int +dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, + const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err = 0; + + for (zap_cursor_init(&zc, mos, propobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop; + char buf[ZAP_MAXNAMELEN]; + char *valstr; + const char *suffix; + const char *propname; + const char *source; + + suffix = strchr(za.za_name, '$'); + + if (suffix == NULL) { + /* + * Skip local properties if we only want received + * properties. + */ + if (flags & DSL_PROP_GET_RECEIVED) + continue; + + propname = za.za_name; + source = setpoint; + } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { + /* Skip explicitly inherited entries. 
*/ + continue; + } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { + if (flags & DSL_PROP_GET_LOCAL) + continue; + + (void) strncpy(buf, za.za_name, (suffix - za.za_name)); + buf[suffix - za.za_name] = '\0'; + propname = buf; + + if (!(flags & DSL_PROP_GET_RECEIVED)) { + /* Skip if locally overridden. */ + err = zap_contains(mos, propobj, propname); + if (err == 0) + continue; + if (err != ENOENT) + break; + + /* Skip if explicitly inherited. */ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, propobj, valstr); + kmem_strfree(valstr); + if (err == 0) + continue; + if (err != ENOENT) + break; + } + + source = ((flags & DSL_PROP_GET_INHERITING) ? + setpoint : ZPROP_SOURCE_VAL_RECVD); + } else { + /* + * For backward compatibility, skip suffixes we don't + * recognize. + */ + continue; + } + + prop = zfs_name_to_prop(propname); + + /* Skip non-inheritable properties. */ + if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop)) + continue; + + /* Skip properties not valid for this type. */ + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT, B_FALSE)) + continue; + + /* Skip properties already defined. */ + if (nvlist_exists(nv, propname)) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + if (err == ENOENT) + err = 0; + return (err); +} + +/* + * Iterate over all properties for this dataset and return them in an nvlist. 
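The suffix handling above classifies every raw ZAP name into one of four cases; a standalone userland toy of just that classification:

#include <stdio.h>
#include <string.h>

int
main(void)
{
	const char *names[] = { "compression", "quota$inherit",
	    "atime$recvd", "mystery$future" };

	for (int i = 0; i < 4; i++) {
		const char *suffix = strchr(names[i], '$');

		if (suffix == NULL)
			printf("%s: local value\n", names[i]);
		else if (strcmp(suffix, "$inherit") == 0)
			printf("%s: inherit marker, no value\n", names[i]);
		else if (strcmp(suffix, "$recvd") == 0)
			printf("%s: received value\n", names[i]);
		else
			printf("%s: unrecognized suffix, skipped\n",
			    names[i]);
	}
	return (0);
}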
+ */ +static int +dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, + dsl_prop_getflags_t flags) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err = 0; + char setpoint[ZFS_MAX_DATASET_NAME_LEN]; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (ds->ds_is_snapshot) + flags |= DSL_PROP_GET_SNAPSHOT; + + ASSERT(dsl_pool_config_held(dp)); + + if (dsl_dataset_phys(ds)->ds_props_obj != 0) { + ASSERT(flags & DSL_PROP_GET_SNAPSHOT); + dsl_dataset_name(ds, setpoint); + err = dsl_prop_get_all_impl(mos, + dsl_dataset_phys(ds)->ds_props_obj, setpoint, flags, *nvp); + if (err) + goto out; + } + + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { + if (flags & (DSL_PROP_GET_LOCAL | + DSL_PROP_GET_RECEIVED)) + break; + flags |= DSL_PROP_GET_INHERITING; + } + dsl_dir_name(dd, setpoint); + err = dsl_prop_get_all_impl(mos, + dsl_dir_phys(dd)->dd_props_zapobj, setpoint, flags, *nvp); + if (err) + break; + } + +out: + if (err) { + nvlist_free(*nvp); + *nvp = NULL; + } + return (err); +} + +boolean_t +dsl_prop_get_hasrecvd(const char *dsname) +{ + uint64_t dummy; + + return (0 == + dsl_prop_get_integer(dsname, ZPROP_HAS_RECVD, &dummy, NULL)); +} + +static int +dsl_prop_set_hasrecvd_impl(const char *dsname, zprop_source_t source) +{ + uint64_t version; + spa_t *spa; + int error = 0; + + VERIFY0(spa_open(dsname, &spa, FTAG)); + version = spa_version(spa); + spa_close(spa, FTAG); + + if (version >= SPA_VERSION_RECVD_PROPS) + error = dsl_prop_set_int(dsname, ZPROP_HAS_RECVD, source, 0); + return (error); +} + +/* + * Call after successfully receiving properties to ensure that only the first + * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. + */ +int +dsl_prop_set_hasrecvd(const char *dsname) +{ + int error = 0; + if (!dsl_prop_get_hasrecvd(dsname)) + error = dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_LOCAL); + return (error); +} + +void +dsl_prop_unset_hasrecvd(const char *dsname) +{ + VERIFY0(dsl_prop_set_hasrecvd_impl(dsname, ZPROP_SRC_NONE)); +} + +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +{ + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); +} + +int +dsl_prop_get_received(const char *dsname, nvlist_t **nvp) +{ + objset_t *os; + int error; + + /* + * Received properties are not distinguishable from local properties + * until the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(dsname) ? + DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + error = dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags); + dmu_objset_rele(os, FTAG); + return (error); +} + +void +dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) +{ + nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + uint64_t default_value; + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + return; + } + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + /* Indicate the default source if we can. 
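A hypothetical caller of dsl_prop_nvlist_add_uint64(), showing the nested { property -> { value, source } } shape it builds:

static void
nvlist_shape_example(void)
{
	nvlist_t *nv = fnvlist_alloc();

	dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_RECORDSIZE, 131072);
	/*
	 * nv now holds: recordsize -> { value: 131072, source: "" }.
	 * Because 131072 matches the property's default, the helper
	 * tags the entry with an empty source string, per the
	 * dodefault() comparison just below.
	 */
	fnvlist_free(nv);
}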
*/ + if (dodefault(prop, 8, 1, &default_value) == 0 && + value == default_value) { + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); + } + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); +} + +void +dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) +{ + nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + return; + } + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(dsl_prop_register); +EXPORT_SYMBOL(dsl_prop_unregister); +EXPORT_SYMBOL(dsl_prop_unregister_all); +EXPORT_SYMBOL(dsl_prop_get); +EXPORT_SYMBOL(dsl_prop_get_integer); +EXPORT_SYMBOL(dsl_prop_get_all); +EXPORT_SYMBOL(dsl_prop_get_received); +EXPORT_SYMBOL(dsl_prop_get_ds); +EXPORT_SYMBOL(dsl_prop_get_int_ds); +EXPORT_SYMBOL(dsl_prop_get_dd); +EXPORT_SYMBOL(dsl_props_set); +EXPORT_SYMBOL(dsl_prop_set_int); +EXPORT_SYMBOL(dsl_prop_set_string); +EXPORT_SYMBOL(dsl_prop_inherit); +EXPORT_SYMBOL(dsl_prop_predict); +EXPORT_SYMBOL(dsl_prop_nvlist_add_uint64); +EXPORT_SYMBOL(dsl_prop_nvlist_add_string); +#endif diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c new file mode 100644 index 000000000000..712af664e90f --- /dev/null +++ b/module/zfs/dsl_scan.c @@ -0,0 +1,4426 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2008, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2016 Gary Mills + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. + * Copyright 2019 Joyent, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif + +/* + * Grand theory statement on scan queue sorting + * + * Scanning is implemented by recursively traversing all indirection levels + * in an object and reading all blocks referenced from said objects. This + * results in us approximately traversing the object from lowest logical + * offset to the highest. For best performance, we would want the logical + * blocks to be physically contiguous. However, this is frequently not the + * case with pools given the allocation patterns of copy-on-write filesystems. 
+ * So instead, we put the I/Os into a reordering queue and issue them in a + * way that will most benefit physical disks (LBA-order). + * + * Queue management: + * + * Ideally, we would want to scan all metadata and queue up all block I/O + * prior to starting to issue it, because that allows us to do an optimal + * sorting job. This can however consume large amounts of memory. Therefore + * we continuously monitor the size of the queues and constrain them to 5% + * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this + * limit, we clear out a few of the largest extents at the head of the queues + * to make room for more scanning. Hopefully, these extents will be fairly + * large and contiguous, allowing us to approach sequential I/O throughput + * even without a fully sorted tree. + * + * Metadata scanning takes place in dsl_scan_visit(), which is called from + * dsl_scan_sync() every spa_sync(). If we have either fully scanned all + * metadata on the pool, or we need to make room in memory because our + * queues are too large, dsl_scan_visit() is postponed and + * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies + * that metadata scanning and queued I/O issuing are mutually exclusive. This + * allows us to provide maximum sequential I/O throughput for the majority of + * I/O's issued since sequential I/O performance is significantly negatively + * impacted if it is interleaved with random I/O. + * + * Implementation Notes + * + * One side effect of the queued scanning algorithm is that the scanning code + * needs to be notified whenever a block is freed. This is needed to allow + * the scanning code to remove these I/Os from the issuing queue. Additionally, + * we do not attempt to queue gang blocks to be issued sequentially since this + * is very hard to do and would have an extremely limited performance benefit. + * Instead, we simply issue gang I/Os as soon as we find them using the legacy + * algorithm. + * + * Backwards compatibility + * + * This new algorithm is backwards compatible with the legacy on-disk data + * structures (and therefore does not require a new feature flag). + * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan + * will stop scanning metadata (in logical order) and wait for all outstanding + * sorted I/O to complete. Once this is done, we write out a checkpoint + * bookmark, indicating that we have scanned everything logically before it. + * If the pool is imported on a machine without the new sorting algorithm, + * the scan simply resumes from the last checkpoint using the legacy algorithm. + */ + +typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, + const zbookmark_phys_t *); + +static scan_cb_t dsl_scan_scrub_cb; + +static int scan_ds_queue_compare(const void *a, const void *b); +static int scan_prefetch_queue_compare(const void *a, const void *b); +static void scan_ds_queue_clear(dsl_scan_t *scn); +static void scan_ds_prefetch_queue_clear(dsl_scan_t *scn); +static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, + uint64_t *txg); +static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); +static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); +static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); +static uint64_t dsl_scan_count_leaves(vdev_t *vd); + +extern int zfs_vdev_async_write_active_min_dirty_percent; + +/* + * By default zfs will check to ensure it is not over the hard memory + * limit before each txg. 
If finer-grained control of this is needed + * this value can be set to 1 to enable checking before scanning each + * block. + */ +int zfs_scan_strict_mem_lim = B_FALSE; + +/* + * Maximum number of parallelly executed bytes per leaf vdev. We attempt + * to strike a balance here between keeping the vdev queues full of I/Os + * at all times and not overflowing the queues to cause long latency, + * which would cause long txg sync times. No matter what, we will not + * overload the drives with I/O, since that is protected by + * zfs_vdev_scrub_max_active. + */ +unsigned long zfs_scan_vdev_limit = 4 << 20; + +int zfs_scan_issue_strategy = 0; +int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ +unsigned long zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ + +/* + * fill_weight is non-tunable at runtime, so we copy it at module init from + * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would + * break queue sorting. + */ +int zfs_scan_fill_weight = 3; +static uint64_t fill_weight; + +/* See dsl_scan_should_clear() for details on the memory limit tunables */ +uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ +int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ +int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ + +int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ +int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +int zfs_scan_checkpoint_intval = 7200; /* in seconds */ +int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ +int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +/* max number of blocks to free in a single TXG */ +unsigned long zfs_async_block_max_blocks = ULONG_MAX; +/* max number of dedup blocks to free in a single TXG */ +unsigned long zfs_max_async_dedup_frees = 100000; + +int zfs_resilver_disable_defer = 0; /* set to disable resilver deferring */ + +/* + * We wait a few txgs after importing a pool to begin scanning so that + * the import / mounting code isn't held up by scrub / resilver IO. + * Unfortunately, it is a bit difficult to determine exactly how long + * this will take since userspace will trigger fs mounts asynchronously + * and the kernel will create zvol minors asynchronously. As a result, + * the value provided here is a bit arbitrary, but represents a + * reasonable estimate of how many txgs it will take to finish fully + * importing a pool + */ +#define SCAN_IMPORT_WAIT_TXGS 5 + +#define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ + ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ + (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) + +/* + * Enable/disable the processing of the free_bpobj object. + */ +int zfs_free_bpobj_enabled = 1; + +/* the order has to match pool_scan_type */ +static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { + NULL, + dsl_scan_scrub_cb, /* POOL_SCAN_SCRUB */ + dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ +}; + +/* In core node for the scn->scn_queue. 
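For orientation, the limit tunables above combine roughly as sketched here; this is inferred from their descriptions, and dsl_scan_should_clear() holds the authoritative computation (which works in pages rather than bytes):

static uint64_t
mem_lim_sketch(uint64_t physmem_bytes, uint64_t *soft)
{
	/* Hard limit: 1/zfs_scan_mem_lim_fact of RAM, floored. */
	uint64_t hard = MAX(physmem_bytes / zfs_scan_mem_lim_fact,
	    zfs_scan_mem_lim_min);

	/* Soft limit: a fraction of the hard limit, capped. */
	*soft = MIN(hard / zfs_scan_mem_lim_soft_fact,
	    zfs_scan_mem_lim_soft_max);
	return (hard);
}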
Represents a dataset to be scanned */ +typedef struct { + uint64_t sds_dsobj; + uint64_t sds_txg; + avl_node_t sds_node; +} scan_ds_t; + +/* + * This controls what conditions are placed on dsl_scan_sync_state(): + * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. + * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * write out the scn_phys_cached version. + * See dsl_scan_sync_state for details. + */ +typedef enum { + SYNC_OPTIONAL, + SYNC_MANDATORY, + SYNC_CACHED +} state_sync_type_t; + +/* + * This struct represents the minimum information needed to reconstruct a + * zio for sequential scanning. This is useful because many of these will + * accumulate in the sequential IO queues before being issued, so saving + * memory matters here. + */ +typedef struct scan_io { + /* fields from blkptr_t */ + uint64_t sio_blk_prop; + uint64_t sio_phys_birth; + uint64_t sio_birth; + zio_cksum_t sio_cksum; + uint32_t sio_nr_dvas; + + /* fields from zio_t */ + uint32_t sio_flags; + zbookmark_phys_t sio_zb; + + /* members for queue sorting */ + union { + avl_node_t sio_addr_node; /* link into issuing queue */ + list_node_t sio_list_node; /* link for issuing to disk */ + } sio_nodes; + + /* + * There may be up to SPA_DVAS_PER_BP DVAs here from the bp, + * depending on how many were in the original bp. Only the + * first DVA is really used for sorting and issuing purposes. + * The other DVAs (if provided) simply exist so that the zio + * layer can find additional copies to repair from in the + * event of an error. This array must go at the end of the + * struct to allow this for the variable number of elements. + */ + dva_t sio_dva[0]; +} scan_io_t; + +#define SIO_SET_OFFSET(sio, x) DVA_SET_OFFSET(&(sio)->sio_dva[0], x) +#define SIO_SET_ASIZE(sio, x) DVA_SET_ASIZE(&(sio)->sio_dva[0], x) +#define SIO_GET_OFFSET(sio) DVA_GET_OFFSET(&(sio)->sio_dva[0]) +#define SIO_GET_ASIZE(sio) DVA_GET_ASIZE(&(sio)->sio_dva[0]) +#define SIO_GET_END_OFFSET(sio) \ + (SIO_GET_OFFSET(sio) + SIO_GET_ASIZE(sio)) +#define SIO_GET_MUSED(sio) \ + (sizeof (scan_io_t) + ((sio)->sio_nr_dvas * sizeof (dva_t))) + +struct dsl_scan_io_queue { + dsl_scan_t *q_scn; /* associated dsl_scan_t */ + vdev_t *q_vd; /* top-level vdev that this queue represents */ + + /* trees used for sorting I/Os and extents of I/Os */ + range_tree_t *q_exts_by_addr; + zfs_btree_t q_exts_by_size; + avl_tree_t q_sios_by_addr; + uint64_t q_sio_memused; + + /* members for zio rate limiting */ + uint64_t q_maxinflight_bytes; + uint64_t q_inflight_bytes; + kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ + + /* per txg statistics */ + uint64_t q_total_seg_size_this_txg; + uint64_t q_segs_this_txg; + uint64_t q_total_zio_size_this_txg; + uint64_t q_zios_this_txg; +}; + +/* private data for dsl_scan_prefetch_cb() */ +typedef struct scan_prefetch_ctx { + zfs_refcount_t spc_refcnt; /* refcount for memory management */ + dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ + boolean_t spc_root; /* is this prefetch for an objset? 
*/ + uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ + uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ +} scan_prefetch_ctx_t; + +/* private data for dsl_scan_prefetch() */ +typedef struct scan_prefetch_issue_ctx { + avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ + scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ + blkptr_t spic_bp; /* bp to prefetch */ + zbookmark_phys_t spic_zb; /* bookmark to prefetch */ +} scan_prefetch_issue_ctx_t; + +static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); +static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, + scan_io_t *sio); + +static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); +static void scan_io_queues_destroy(dsl_scan_t *scn); + +static kmem_cache_t *sio_cache[SPA_DVAS_PER_BP]; + +/* sio->sio_nr_dvas must be set so we know which cache to free from */ +static void +sio_free(scan_io_t *sio) +{ + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + kmem_cache_free(sio_cache[sio->sio_nr_dvas - 1], sio); +} + +/* It is up to the caller to set sio->sio_nr_dvas for freeing */ +static scan_io_t * +sio_alloc(unsigned short nr_dvas) +{ + ASSERT3U(nr_dvas, >, 0); + ASSERT3U(nr_dvas, <=, SPA_DVAS_PER_BP); + + return (kmem_cache_alloc(sio_cache[nr_dvas - 1], KM_SLEEP)); +} + +void +scan_init(void) +{ + /* + * This is used in ext_size_compare() to weight segments + * based on how sparse they are. This cannot be changed + * mid-scan and the tree comparison functions don't currently + * have a mechanism for passing additional context to the + * compare functions. Thus we store this value globally and + * we only allow it to be set at module initialization time + */ + fill_weight = zfs_scan_fill_weight; + + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + char name[36]; + + (void) snprintf(name, sizeof (name), "sio_cache_%d", i); + sio_cache[i] = kmem_cache_create(name, + (sizeof (scan_io_t) + ((i + 1) * sizeof (dva_t))), + 0, NULL, NULL, NULL, NULL, NULL, 0); + } +} + +void +scan_fini(void) +{ + for (int i = 0; i < SPA_DVAS_PER_BP; i++) { + kmem_cache_destroy(sio_cache[i]); + } +} + +static inline boolean_t +dsl_scan_is_running(const dsl_scan_t *scn) +{ + return (scn->scn_phys.scn_state == DSS_SCANNING); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dsl_scan_is_running(dp->dp_scan) && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +static inline void +sio2bp(const scan_io_t *sio, blkptr_t *bp) +{ + bzero(bp, sizeof (*bp)); + bp->blk_prop = sio->sio_blk_prop; + bp->blk_phys_birth = sio->sio_phys_birth; + bp->blk_birth = sio->sio_birth; + bp->blk_fill = 1; /* we always only work with data pointers */ + bp->blk_cksum = sio->sio_cksum; + + ASSERT3U(sio->sio_nr_dvas, >, 0); + ASSERT3U(sio->sio_nr_dvas, <=, SPA_DVAS_PER_BP); + + bcopy(sio->sio_dva, bp->blk_dva, sio->sio_nr_dvas * sizeof (dva_t)); +} + +static inline void +bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) +{ + sio->sio_blk_prop = bp->blk_prop; + sio->sio_phys_birth = bp->blk_phys_birth; + sio->sio_birth = bp->blk_birth; + sio->sio_cksum = bp->blk_cksum; + sio->sio_nr_dvas = BP_GET_NDVAS(bp); + + /* + * Copy the DVAs to the sio. We need all copies of the block so + * that the self healing code can use the alternate copies if the + * first is corrupted. We want the DVA at index dva_i to be first + * in the sio since this is the primary one that we want to issue. 
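The rotation performed just below is worth seeing with concrete numbers; a standalone demo of the same index arithmetic:

#include <stdio.h>

int
main(void)
{
	int nr_dvas = 3, dva_i = 1;

	/* Same modular copy loop as bp2sio() below. */
	for (int i = 0, j = dva_i; i < nr_dvas; i++, j++)
		printf("sio_dva[%d] = blk_dva[%d]\n", i, j % nr_dvas);
	/* Prints 0<-1, 1<-2, 2<-0: DVA 1 leads, the rest follow. */
	return (0);
}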
+ */ + for (int i = 0, j = dva_i; i < sio->sio_nr_dvas; i++, j++) { + sio->sio_dva[i] = bp->blk_dva[j % sio->sio_nr_dvas]; + } +} + +int +dsl_scan_init(dsl_pool_t *dp, uint64_t txg) +{ + int err; + dsl_scan_t *scn; + spa_t *spa = dp->dp_spa; + uint64_t f; + + scn = dp->dp_scan = kmem_zalloc(sizeof (dsl_scan_t), KM_SLEEP); + scn->scn_dp = dp; + + /* + * It's possible that we're resuming a scan after a reboot so + * make sure that the scan_async_destroying flag is initialized + * appropriately. + */ + ASSERT(!scn->scn_async_destroying); + scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_ASYNC_DESTROY); + + /* + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB). Limits for the issuing + * phase are done per top-level vdev and are handled separately. + */ + scn->scn_maxinflight_bytes = MAX(zfs_scan_vdev_limit * + dsl_scan_count_leaves(spa->spa_root_vdev), 1ULL << 20); + + avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), + offsetof(scan_ds_t, sds_node)); + avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, + sizeof (scan_prefetch_issue_ctx_t), + offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); + + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_func", sizeof (uint64_t), 1, &f); + if (err == 0) { + /* + * There was an old-style scrub in progress. Restart a + * new-style scrub from the beginning. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("old-style scrub was in progress; " + "restarting new-style scrub in txg %llu", + (longlong_t)scn->scn_restart_txg); + + /* + * Load the queue obj from the old location so that it + * can be freed by dsl_scan_done(). + */ + (void) zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + "scrub_queue", sizeof (uint64_t), 1, + &scn->scn_phys.scn_queue_obj); + } else { + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys); + /* + * Detect if the pool contains the signature of #2094. If it + * does properly update the scn->scn_phys structure and notify + * the administrator by setting an errata for the pool. + */ + if (err == EOVERFLOW) { + uint64_t zaptmp[SCAN_PHYS_NUMINTS + 1]; + VERIFY3S(SCAN_PHYS_NUMINTS, ==, 24); + VERIFY3S(offsetof(dsl_scan_phys_t, scn_flags), ==, + (23 * sizeof (uint64_t))); + + err = zap_lookup(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCAN, + sizeof (uint64_t), SCAN_PHYS_NUMINTS + 1, &zaptmp); + if (err == 0) { + uint64_t overflow = zaptmp[SCAN_PHYS_NUMINTS]; + + if (overflow & ~DSL_SCAN_FLAGS_MASK || + scn->scn_async_destroying) { + spa->spa_errata = + ZPOOL_ERRATA_ZOL_2094_ASYNC_DESTROY; + return (EOVERFLOW); + } + + bcopy(zaptmp, &scn->scn_phys, + SCAN_PHYS_NUMINTS * sizeof (uint64_t)); + scn->scn_phys.scn_flags = overflow; + + /* Required scrub already in progress. */ + if (scn->scn_phys.scn_state == DSS_FINISHED || + scn->scn_phys.scn_state == DSS_CANCELED) + spa->spa_errata = + ZPOOL_ERRATA_ZOL_2094_SCRUB; + } + } + + if (err == ENOENT) + return (0); + else if (err) + return (err); + + /* + * We might be restarting after a reboot, so jump the issued + * counter to how far we've scanned. We know we're consistent + * up to here. 
+ */ + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + + if (dsl_scan_is_running(scn) && + spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { + /* + * A new-type scrub was in progress on an old + * pool, and the pool was accessed by old + * software. Restart from the beginning, since + * the old software may have changed the pool in + * the meantime. + */ + scn->scn_restart_txg = txg; + zfs_dbgmsg("new-style scrub was modified " + "by old software; restarting in txg %llu", + (longlong_t)scn->scn_restart_txg); + } else if (dsl_scan_resilvering(dp)) { + /* + * If a resilver is in progress and there are already + * errors, restart it instead of finishing this scan and + * then restarting it. If there haven't been any errors + * then remember that the incore DTL is valid. + */ + if (scn->scn_phys.scn_errors > 0) { + scn->scn_restart_txg = txg; + zfs_dbgmsg("resilver can't excise DTL_MISSING " + "when finished; restarting in txg %llu", + (u_longlong_t)scn->scn_restart_txg); + } else { + /* it's safe to excise DTL when finished */ + spa->spa_scrub_started = B_TRUE; + } + } + } + + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + + /* reload the queue into the in-core state */ + if (scn->scn_phys.scn_queue_obj != 0) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + za.za_first_integer); + } + zap_cursor_fini(&zc); + } + + spa_scan_stat_init(spa); + return (0); +} + +void +dsl_scan_fini(dsl_pool_t *dp) +{ + if (dp->dp_scan != NULL) { + dsl_scan_t *scn = dp->dp_scan; + + if (scn->scn_taskq != NULL) + taskq_destroy(scn->scn_taskq); + + scan_ds_queue_clear(scn); + avl_destroy(&scn->scn_queue); + scan_ds_prefetch_queue_clear(scn); + avl_destroy(&scn->scn_prefetch_queue); + + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); + dp->dp_scan = NULL; + } +} + +static boolean_t +dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +{ + return (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg); +} + +boolean_t +dsl_scan_resilver_scheduled(dsl_pool_t *dp) +{ + return ((dp->dp_scan && dp->dp_scan->scn_restart_txg != 0) || + (spa_async_tasks(dp->dp_spa) & SPA_ASYNC_RESILVER)); +} + +boolean_t +dsl_scan_scrubbing(const dsl_pool_t *dp) +{ + dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; + + return (scn_phys->scn_state == DSS_SCANNING && + scn_phys->scn_func == POOL_SCAN_SCRUB); +} + +boolean_t +dsl_scan_is_paused_scrub(const dsl_scan_t *scn) +{ + return (dsl_scan_scrubbing(scn->scn_dp) && + scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); +} + +/* + * Writes out a persistent dsl_scan_phys_t record to the pool directory. + * Because we can be running in the block sorting algorithm, we do not always + * want to write out the record, only when it is "safe" to do so. This safety + * condition is achieved by making sure that the sorting queues are empty + * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * is inconsistent with how much actual scanning progress has been made. The + * kind of sync to be performed is specified by the sync_type argument. If the + * sync is optional, we only sync if the queues are empty. If the sync is + * mandatory, we do a hard ASSERT to make sure that the queues are empty. The + * third possible state is a "cached" sync. 
This is done in response to: + * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been + * destroyed, so we wouldn't be able to restart scanning from it. + * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been + * superseded by a newer snapshot. + * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been + * swapped with its clone. + * In all cases, a cached sync simply rewrites the last record we've written, + * just slightly modified. For the modifications that are performed to the + * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, + * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. + */ +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) +{ + int i; + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); + if (scn->scn_bytes_pending == 0) { + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; + + if (q == NULL) + continue; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); + ASSERT3P(zfs_btree_first(&q->q_exts_by_size, NULL), ==, + NULL); + ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + if (scn->scn_phys.scn_queue_obj != 0) + scan_ds_queue_sync(scn, tx); + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, + sizeof (scn->scn_phys)); + + if (scn->scn_checkpointing) + zfs_dbgmsg("finish scan checkpoint"); + + scn->scn_checkpointing = B_FALSE; + scn->scn_last_checkpoint = ddi_get_lbolt(); + } else if (sync_type == SYNC_CACHED) { + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys_cached, tx)); + } +} + +/* ARGSUSED */ +static int +dsl_scan_setup_check(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + + if (dsl_scan_is_running(scn) || vdev_rebuild_active(rvd)) + return (SET_ERROR(EBUSY)); + + return (0); +} + +static void +dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + pool_scan_func_t *funcp = arg; + dmu_object_type_t ot = 0; + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + + ASSERT(!dsl_scan_is_running(scn)); + ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); + bzero(&scn->scn_phys, sizeof (scn->scn_phys)); + scn->scn_phys.scn_func = *funcp; + scn->scn_phys.scn_state = DSS_SCANNING; + scn->scn_phys.scn_min_txg = 0; + scn->scn_phys.scn_max_txg = tx->tx_txg; + scn->scn_phys.scn_ddt_class_max = DDT_CLASSES - 1; /* the entire DDT */ + scn->scn_phys.scn_start_time = gethrestime_sec(); + scn->scn_phys.scn_errors = 0; + scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_issued_before_pass = 0; + scn->scn_restart_txg = 0; + scn->scn_done_txg = 0; + scn->scn_last_checkpoint = 0; + scn->scn_checkpointing = B_FALSE; + spa_scan_stat_init(spa); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + scn->scn_phys.scn_ddt_class_max = zfs_scrub_ddt_class_max; + + /* rewrite all disk labels */ + vdev_config_dirty(spa->spa_root_vdev); + + if (vdev_resilver_needed(spa->spa_root_vdev, + &scn->scn_phys.scn_min_txg, 
&scn->scn_phys.scn_max_txg)) { + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, + ESC_ZFS_RESILVER_START); + nvlist_free(aux); + } else { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_START); + } + + spa->spa_scrub_started = B_TRUE; + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. + */ + if (scn->scn_phys.scn_min_txg > TXG_INITIAL) + scn->scn_phys.scn_ddt_class_max = DDT_CLASS_DITTO; + + /* + * When starting a resilver clear any existing rebuild state. + * This is required to prevent stale rebuild status from + * being reported when a rebuild is run, then a resilver and + * finally a scrub. In which case only the scrub status + * should be reported by 'zpool status'. + */ + if (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) { + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + vdev_rebuild_clear_sync( + (void *)(uintptr_t)vd->vdev_id, tx); + } + } + } + + /* back to the generic stuff */ + + if (dp->dp_blkstats == NULL) { + dp->dp_blkstats = + vmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + mutex_init(&dp->dp_blkstats->zab_lock, NULL, + MUTEX_DEFAULT, NULL); + } + bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); + + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) + ot = DMU_OT_ZAP_OTHER; + + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, + ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); + + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + + spa_history_log_internal(spa, "scan setup", tx, + "func=%u mintxg=%llu maxtxg=%llu", + *funcp, (u_longlong_t)scn->scn_phys.scn_min_txg, + (u_longlong_t)scn->scn_phys.scn_max_txg); +} + +/* + * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. + * Can also be called to resume a paused scrub. + */ +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. 
+ */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (func == POOL_SCAN_RESILVER) { + dsl_scan_restart_resilver(spa->spa_dsl_pool, 0); + return (0); + } + + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { + /* got scrub start cmd, resume paused scrub */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); + return (SET_ERROR(ECANCELED)); + } + + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +/* ARGSUSED */ +static void +dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) +{ + static const char *old_names[] = { + "scrub_bookmark", + "scrub_ddt_bookmark", + "scrub_ddt_class_max", + "scrub_queue", + "scrub_min_txg", + "scrub_max_txg", + "scrub_func", + "scrub_errors", + NULL + }; + + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + int i; + + /* Remove any remnants of an old-style scrub. */ + for (i = 0; old_names[i]; i++) { + (void) zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, old_names[i], tx); + } + + if (scn->scn_phys.scn_queue_obj != 0) { + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = 0; + } + scan_ds_queue_clear(scn); + scan_ds_prefetch_queue_clear(scn); + + scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; + + /* + * If we were "restarted" from a stopped state, don't bother + * with anything else. + */ + if (!dsl_scan_is_running(scn)) { + ASSERT(!scn->scn_is_sorted); + return; + } + + if (scn->scn_is_sorted) { + scan_io_queues_destroy(scn); + scn->scn_is_sorted = B_FALSE; + + if (scn->scn_taskq != NULL) { + taskq_destroy(scn->scn_taskq); + scn->scn_taskq = NULL; + } + } + + scn->scn_phys.scn_state = complete ? DSS_FINISHED : DSS_CANCELED; + + spa_notify_waiters(spa); + + if (dsl_scan_restarting(scn, tx)) + spa_history_log_internal(spa, "scan aborted, restarting", tx, + "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + else if (!complete) + spa_history_log_internal(spa, "scan cancelled", tx, + "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + else + spa_history_log_internal(spa, "scan done", tx, + "errors=%llu", (u_longlong_t)spa_get_errlog_size(spa)); + + if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { + spa->spa_scrub_active = B_FALSE; + + /* + * If the scrub/resilver completed, update all DTLs to + * reflect this. Whether it succeeded or not, vacate + * all temporary scrub DTLs. + * + * As the scrub does not currently support traversing + * data that have been freed but are part of a checkpoint, + * we don't mark the scrub as done in the DTLs as faults + * may still exist in those vdevs. 
+ */ + if (complete && + !spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + scn->scn_phys.scn_max_txg, B_TRUE, B_FALSE); + + if (scn->scn_phys.scn_min_txg) { + nvlist_t *aux = fnvlist_alloc(); + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, + "healing"); + spa_event_notify(spa, NULL, aux, + ESC_ZFS_RESILVER_FINISH); + nvlist_free(aux); + } else { + spa_event_notify(spa, NULL, NULL, + ESC_ZFS_SCRUB_FINISH); + } + } else { + vdev_dtl_reassess(spa->spa_root_vdev, tx->tx_txg, + 0, B_TRUE, B_FALSE); + } + spa_errlog_rotate(spa); + + /* + * Don't clear flag until after vdev_dtl_reassess to ensure that + * DTL_MISSING will get updated when possible. + */ + spa->spa_scrub_started = B_FALSE; + + /* + * We may have finished replacing a device. + * Let the async thread assess this and handle the detach. + */ + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + + /* + * Clear any resilver_deferred flags in the config. + * If there are drives that need resilvering, kick + * off an asynchronous request to start resilver. + * vdev_clear_resilver_deferred() may update the config + * before the resilver can restart. In the event of + * a crash during this period, the spa loading code + * will find the drives that need to be resilvered + * and start the resilver then. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER) && + vdev_clear_resilver_deferred(spa->spa_root_vdev, tx)) { + spa_history_log_internal(spa, + "starting deferred resilver", tx, "errors=%llu", + (u_longlong_t)spa_get_errlog_size(spa)); + spa_async_request(spa, SPA_ASYNC_RESILVER); + } + } + + scn->scn_phys.scn_end_time = gethrestime_sec(); + + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_2094_SCRUB) + spa->spa_errata = 0; + + ASSERT(!dsl_scan_is_running(scn)); +} + +/* ARGSUSED */ +static int +dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + if (!dsl_scan_is_running(scn)) + return (SET_ERROR(ENOENT)); + return (0); +} + +/* ARGSUSED */ +static void +dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; + + dsl_scan_done(scn, B_FALSE, tx); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); + spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); +} + +int +dsl_scan_cancel(dsl_pool_t *dp) +{ + return (dsl_sync_task(spa_name(dp->dp_spa), dsl_scan_cancel_check, + dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); +} + +static int +dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* can't pause a scrub when there is no in-progress scrub */ + if (!dsl_scan_scrubbing(dp)) + return (SET_ERROR(ENOENT)); + + /* can't pause a paused scrub */ + if (dsl_scan_is_paused_scrub(scn)) + return (SET_ERROR(EBUSY)); + } else if (*cmd != POOL_SCRUB_NORMAL) { + return (SET_ERROR(ENOTSUP)); + } + + return (0); +} + +static void +dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) +{ + pool_scrub_cmd_t *cmd = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + if (*cmd == POOL_SCRUB_PAUSE) { + /* record the time at which the scrub was paused */ + spa->spa_scan_pass_scrub_pause = gethrestime_sec(); + scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; + scn->scn_phys_cached.scn_flags |= DSF_SCRUB_PAUSED; + dsl_scan_sync_state(scn, tx, SYNC_CACHED); + spa_event_notify(spa, NULL, NULL,
ESC_ZFS_SCRUB_PAUSED); + spa_notify_waiters(spa); + } else { + ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); + if (dsl_scan_is_paused_scrub(scn)) { + /* + * We need to keep track of how much time we spend + * paused per pass so that we can adjust the scrub rate + * shown in the output of 'zpool status' + */ + spa->spa_scan_pass_scrub_spent_paused += + gethrestime_sec() - spa->spa_scan_pass_scrub_pause; + spa->spa_scan_pass_scrub_pause = 0; + scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; + scn->scn_phys_cached.scn_flags &= ~DSF_SCRUB_PAUSED; + dsl_scan_sync_state(scn, tx, SYNC_CACHED); + } + } +} + +/* + * Set scrub pause/resume state if it makes sense to do so + */ +int +dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) +{ + return (dsl_sync_task(spa_name(dp->dp_spa), + dsl_scrub_pause_resume_check, dsl_scrub_pause_resume_sync, &cmd, 3, + ZFS_SPACE_CHECK_RESERVED)); +} + + +/* start a new scan, or restart an existing one. */ +void +dsl_scan_restart_resilver(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", (longlong_t)txg); +} + +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) +{ + zio_free(dp->dp_spa, txg, bp); +} + +void +dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) +{ + ASSERT(dsl_pool_sync_context(dp)); + zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); +} + +static int +scan_ds_queue_compare(const void *a, const void *b) +{ + const scan_ds_t *sds_a = a, *sds_b = b; + + if (sds_a->sds_dsobj < sds_b->sds_dsobj) + return (-1); + if (sds_a->sds_dsobj == sds_b->sds_dsobj) + return (0); + return (1); +} + +static void +scan_ds_queue_clear(dsl_scan_t *scn) +{ + void *cookie = NULL; + scan_ds_t *sds; + while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { + kmem_free(sds, sizeof (*sds)); + } +} + +static boolean_t +scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + sds = avl_find(&scn->scn_queue, &srch, NULL); + if (sds != NULL && txg != NULL) + *txg = sds->sds_txg; + return (sds != NULL); +} + +static void +scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) +{ + scan_ds_t *sds; + avl_index_t where; + + sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); + sds->sds_dsobj = dsobj; + sds->sds_txg = txg; + + VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); + avl_insert(&scn->scn_queue, sds, where); +} + +static void +scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + + sds = avl_find(&scn->scn_queue, &srch, NULL); + VERIFY(sds != NULL); + avl_remove(&scn->scn_queue, sds); + kmem_free(sds, sizeof (*sds)); +} + +static void +scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? 
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; + + ASSERT0(scn->scn_bytes_pending); + ASSERT(scn->scn_phys.scn_queue_obj != 0); + + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, + DMU_OT_NONE, 0, tx); + for (scan_ds_t *sds = avl_first(&scn->scn_queue); + sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { + VERIFY0(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, sds->sds_dsobj, + sds->sds_txg, tx)); + } +} + +/* + * Computes the memory limit state that we're currently in. A sorted scan + * needs quite a bit of memory to hold the sorting queue, so we need to + * reasonably constrain the size so it doesn't impact overall system + * performance. We compute two limits: + * 1) Hard memory limit: if the amount of memory used by the sorting + * queues on a pool gets above this value, we stop the metadata + * scanning portion and start issuing the queued up and sorted + * I/Os to reduce memory usage. + * This limit is calculated as a fraction of physmem (by default 5%). + * We constrain the lower bound of the hard limit to an absolute + * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain + * the upper bound to 5% of the total pool size - no chance we'll + * ever need that much memory, but just to keep the value in check. + * 2) Soft memory limit: once we hit the hard memory limit, we start + * issuing I/O to reduce queue memory usage, but we don't want to + * completely empty out the queues, since we might be able to find I/Os + * that will fill in the gaps of our non-sequential IOs at some point + * in the future. So we stop the issuing of I/Os once the amount of + * memory used drops below the soft limit (at which point we stop issuing + * I/O and start scanning metadata again). + * + * This limit is calculated by subtracting a fraction of the hard + * limit from the hard limit. By default this fraction is 5%, so + * the soft limit is 95% of the hard limit. We cap the size of the + * difference between the hard and soft limits at an absolute + * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is + * sufficient to not cause too frequent switching between the + * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's + * worth of queues is about 1.2 GiB of on-pool data, so scanning + * that should take at least a decent fraction of a second). 
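+ * + * As a rough worked example, assuming the default tunables described + * above: on a system with 16 GiB of physmem the hard limit is + * 16 GiB / 20, about 819 MiB (unless 5% of the pool's allocated + * space is smaller). The hard/soft gap is then + * MIN(819 MiB / 20, 128 MiB), about 41 MiB, so issuing starts once + * queue memory reaches ~819 MiB and stops again below ~778 MiB.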
+ */ +static boolean_t +dsl_scan_should_clear(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + uint64_t alloc, mlim_hard, mlim_soft, mused; + + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + + mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, + zfs_scan_mem_lim_min); + mlim_hard = MIN(mlim_hard, alloc / 20); + mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, + zfs_scan_mem_lim_soft_max); + mused = 0; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + dsl_scan_io_queue_t *queue; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + queue = tvd->vdev_scan_io_queue; + if (queue != NULL) { + /* # extents in exts_by_size = # in exts_by_addr */ + mused += zfs_btree_numnodes(&queue->q_exts_by_size) * + sizeof (range_seg_gap_t) + queue->q_sio_memused; + } + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } + + dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); + + if (mused == 0) + ASSERT0(scn->scn_bytes_pending); + + /* + * If we are above our hard limit, we need to clear out memory. + * If we are below our soft limit, we need to accumulate sequential IOs. + * Otherwise, we should keep doing whatever we are currently doing. + */ + if (mused >= mlim_hard) + return (B_TRUE); + else if (mused < mlim_soft) + return (B_FALSE); + else + return (scn->scn_clearing); +} + +static boolean_t +dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) +{ + /* we never skip user/group accounting objects */ + if (zb && (int64_t)zb->zb_object < 0) + return (B_FALSE); + + if (scn->scn_suspending) + return (B_TRUE); /* we're already suspending */ + + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) + return (B_FALSE); /* we're resuming */ + + /* We only know how to resume from level-0 and objset blocks. */ + if (zb && (zb->zb_level != 0 && zb->zb_level != ZB_ROOT_LEVEL)) + return (B_FALSE); + + /* + * We suspend if: + * - we have scanned for at least the minimum time (default 1 sec + * for scrub, 3 sec for resilver), and either we have sufficient + * dirty data that we are starting to write more quickly + * (default 30%), someone is explicitly waiting for this txg + * to complete, or we have used up all of the time in the txg + * timeout (default 5 sec). + * or + * - the spa is shutting down because this pool is being exported + * or the machine is rebooting. + * or + * - the scan queue has reached its memory use limit + */ + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa) || + (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { + if (zb && zb->zb_level == ZB_ROOT_LEVEL) { + dprintf("suspending at first available bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + zb->zb_objset, 0, 0, 0); + } else if (zb != NULL) { + dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + scn->scn_phys.scn_bookmark = *zb; + } else { +#ifdef ZFS_DEBUG + dsl_scan_phys_t *scnp = &scn->scn_phys; + dprintf("suspending at DDT bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); +#endif + } + scn->scn_suspending = B_TRUE; + return (B_TRUE); + } + return (B_FALSE); +} + +typedef struct zil_scan_arg { + dsl_pool_t *zsa_dp; + zil_header_t *zsa_zh; +} zil_scan_arg_t; + +/* ARGSUSED */ +static int +dsl_scan_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + zbookmark_phys_t zb; + + ASSERT(!BP_IS_REDACTED(bp)); + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * One block ("stubby") may have been allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * scrub there's nothing to do to it).
+ */ + if (claim_txg == 0 && bp->blk_birth >= spa_min_claim_txg(dp->dp_spa)) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + return (0); +} + +/* ARGSUSED */ +static int +dsl_scan_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) +{ + if (lrc->lrc_txtype == TX_WRITE) { + zil_scan_arg_t *zsa = arg; + dsl_pool_t *dp = zsa->zsa_dp; + dsl_scan_t *scn = dp->dp_scan; + zil_header_t *zh = zsa->zsa_zh; + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + zbookmark_phys_t zb; + + ASSERT(!BP_IS_REDACTED(bp)); + if (BP_IS_HOLE(bp) || + bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + return (0); + + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ + if (claim_txg == 0 || bp->blk_birth < claim_txg) + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + VERIFY(0 == scan_funcs[scn->scn_phys.scn_func](dp, bp, &zb)); + } + return (0); +} + +static void +dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) +{ + uint64_t claim_txg = zh->zh_claim_txg; + zil_scan_arg_t zsa = { dp, zh }; + zilog_t *zilog; + + ASSERT(spa_writeable(dp->dp_spa)); + + /* + * We only want to visit blocks that have been claimed but not yet + * replayed (or, in read-only mode, blocks that *would* be claimed). + */ + if (claim_txg == 0) + return; + + zilog = zil_alloc(dp->dp_meta_objset, zh); + + (void) zil_parse(zilog, dsl_scan_zil_block, dsl_scan_zil_record, &zsa, + claim_txg, B_FALSE); + + zil_free(zilog); +} + +/* + * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea + * here is to sort the AVL tree by the order each block will be needed. 
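+ * Roughly speaking, entries sort by objset, then object, then + * size-normalized blkid, so avl_first() always yields the block that + * the traversal thread will need soonest.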
+ */ +static int +scan_prefetch_queue_compare(const void *a, const void *b) +{ + const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; + const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; + const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; + + return (zbookmark_compare(spc_a->spc_datablkszsec, + spc_a->spc_indblkshift, spc_b->spc_datablkszsec, + spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); +} + +static void +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) +{ + if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { + zfs_refcount_destroy(&spc->spc_refcnt); + kmem_free(spc, sizeof (scan_prefetch_ctx_t)); + } +} + +static scan_prefetch_ctx_t * +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +{ + scan_prefetch_ctx_t *spc; + + spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); + zfs_refcount_create(&spc->spc_refcnt); + zfs_refcount_add(&spc->spc_refcnt, tag); + spc->spc_scn = scn; + if (dnp != NULL) { + spc->spc_datablkszsec = dnp->dn_datablkszsec; + spc->spc_indblkshift = dnp->dn_indblkshift; + spc->spc_root = B_FALSE; + } else { + spc->spc_datablkszsec = 0; + spc->spc_indblkshift = 0; + spc->spc_root = B_TRUE; + } + + return (spc); +} + +static void +scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) +{ + zfs_refcount_add(&spc->spc_refcnt, tag); +} + +static void +scan_ds_prefetch_queue_clear(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + void *cookie = NULL; + scan_prefetch_issue_ctx_t *spic = NULL; + + mutex_enter(&spa->spa_scrub_lock); + while ((spic = avl_destroy_nodes(&scn->scn_prefetch_queue, + &cookie)) != NULL) { + scan_prefetch_ctx_rele(spic->spic_spc, scn); + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + mutex_exit(&spa->spa_scrub_lock); +} + +static boolean_t +dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, + const zbookmark_phys_t *zb) +{ + zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; + dnode_phys_t tmp_dnp; + dnode_phys_t *dnp = (spc->spc_root) ? NULL : &tmp_dnp; + + if (zb->zb_objset != last_zb->zb_objset) + return (B_TRUE); + if ((int64_t)zb->zb_object < 0) + return (B_FALSE); + + tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; + tmp_dnp.dn_indblkshift = spc->spc_indblkshift; + + if (zbookmark_subtree_completed(dnp, zb, last_zb)) + return (B_TRUE); + + return (B_FALSE); +} + +static void +dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) +{ + avl_index_t idx; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + scan_prefetch_issue_ctx_t *spic; + + if (zfs_no_scrub_prefetch || BP_IS_REDACTED(bp)) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET)) + return; + + if (dsl_scan_check_prefetch_resume(spc, zb)) + return; + + scan_prefetch_ctx_add_ref(spc, scn); + spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); + spic->spic_spc = spc; + spic->spic_bp = *bp; + spic->spic_zb = *zb; + + /* + * Add the IO to the queue of blocks to prefetch. This allows us to + * prioritize blocks that we will need first for the main traversal + * thread. 
+ */ + mutex_enter(&spa->spa_scrub_lock); + if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { + /* this block is already queued for prefetch */ + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + scan_prefetch_ctx_rele(spc, scn); + mutex_exit(&spa->spa_scrub_lock); + return; + } + + avl_insert(&scn->scn_prefetch_queue, spic, idx); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +static void +dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int i; + zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; + + if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) + return; + + SET_BOOKMARK(&zb, objset, object, 0, 0); + + spc = scan_prefetch_ctx_create(scn, dnp, FTAG); + + for (i = 0; i < dnp->dn_nblkptr; i++) { + zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); + zb.zb_blkid = i; + dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zb.zb_level = 0; + zb.zb_blkid = DMU_SPILL_BLKID; + dsl_scan_prefetch(spc, DN_SPILL_BLKPTR(dnp), &zb); + } + + scan_prefetch_ctx_rele(spc, FTAG); +} + +static void +dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *private) +{ + scan_prefetch_ctx_t *spc = private; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + + /* broadcast that the IO has completed for rate limiting purposes */ + mutex_enter(&spa->spa_scrub_lock); + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + + /* if there was an error or we are done prefetching, just cleanup */ + if (buf == NULL || scn->scn_prefetch_stop) + goto out; + + if (BP_GET_LEVEL(bp) > 0) { + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + zbookmark_phys_t czb; + + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, zb->zb_blkid * epb + i); + dsl_scan_prefetch(spc, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + dnode_phys_t *cdnp; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { + dsl_scan_prefetch_dnode(scn, cdnp, + zb->zb_objset, zb->zb_blkid * epb + i); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + objset_phys_t *osp = buf->b_data; + + dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, + zb->zb_objset, DMU_META_DNODE_OBJECT); + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_groupused_dnode, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + dsl_scan_prefetch_dnode(scn, + &osp->os_userused_dnode, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + } + +out: + if (buf != NULL) + arc_buf_destroy(buf, private); + scan_prefetch_ctx_rele(spc, scn); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch_thread(void *arg) +{ + dsl_scan_t *scn = arg; + spa_t *spa = scn->scn_dp->dp_spa; + scan_prefetch_issue_ctx_t *spic; + + /* loop until we are told to stop */ + while (!scn->scn_prefetch_stop) { + arc_flags_t flags = ARC_FLAG_NOWAIT | + ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + + mutex_enter(&spa->spa_scrub_lock); + + /* + * Wait until we have an IO to issue and are not above our + * maximum in flight limit. 
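+ * The limit is scn_maxinflight_bytes, computed in dsl_scan_init() + * from zfs_scan_vdev_limit scaled by the leaf vdev count; + * dsl_scan_prefetch_cb() decrements spa_scrub_inflight and + * broadcasts spa_scrub_io_cv as each prefetch read completes.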
+ */ + while (!scn->scn_prefetch_stop && + (avl_numnodes(&scn->scn_prefetch_queue) == 0 || + spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + } + + /* recheck if we should stop since we waited for the cv */ + if (scn->scn_prefetch_stop) { + mutex_exit(&spa->spa_scrub_lock); + break; + } + + /* remove the prefetch IO from the tree */ + spic = avl_first(&scn->scn_prefetch_queue); + spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); + avl_remove(&scn->scn_prefetch_queue, spic); + + mutex_exit(&spa->spa_scrub_lock); + + if (BP_IS_PROTECTED(&spic->spic_bp)) { + ASSERT(BP_GET_TYPE(&spic->spic_bp) == DMU_OT_DNODE || + BP_GET_TYPE(&spic->spic_bp) == DMU_OT_OBJSET); + ASSERT3U(BP_GET_LEVEL(&spic->spic_bp), ==, 0); + zio_flags |= ZIO_FLAG_RAW; + } + + /* issue the prefetch asynchronously */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, + &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + + ASSERT(scn->scn_prefetch_stop); + + /* free any prefetches we didn't get to complete */ + mutex_enter(&spa->spa_scrub_lock); + while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { + avl_remove(&scn->scn_prefetch_queue, spic); + scan_prefetch_ctx_rele(spic->spic_spc, scn); + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); + mutex_exit(&spa->spa_scrub_lock); +} + +static boolean_t +dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, + const zbookmark_phys_t *zb) +{ + /* + * We never skip over user/group accounting objects (obj<0) + */ + if (!ZB_IS_ZERO(&scn->scn_phys.scn_bookmark) && + (int64_t)zb->zb_object >= 0) { + /* + * If we already visited this bp & everything below (in + * a prior txg sync), don't bother doing it again. + */ + if (zbookmark_subtree_completed(dnp, zb, + &scn->scn_phys.scn_bookmark)) + return (B_TRUE); + + /* + * If we found the block we're trying to resume from, or + * we went past it to a different object, zero it out to + * indicate that it's OK to start checking for suspending + * again. + */ + if (bcmp(zb, &scn->scn_phys.scn_bookmark, sizeof (*zb)) == 0 || + zb->zb_object > scn->scn_phys.scn_bookmark.zb_object) { + dprintf("resuming at %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + bzero(&scn->scn_phys.scn_bookmark, sizeof (*zb)); + } + } + return (B_FALSE); +} + +static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx); +inline __attribute__((always_inline)) static void dsl_scan_visitdnode( + dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); + +/* + * Return nonzero on i/o error.
+ */ +inline __attribute__((always_inline)) static int +dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, const blkptr_t *bp, + const zbookmark_phys_t *zb, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + int err; + + ASSERT(!BP_IS_REDACTED(bp)); + + if (BP_GET_LEVEL(bp) > 0) { + arc_flags_t flags = ARC_FLAG_WAIT; + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + arc_buf_t *buf; + + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, + zb->zb_blkid * epb + i); + dsl_scan_visitbp(cbp, &czb, dnp, + ds, scn, ostype, tx); + } + arc_buf_destroy(buf, &buf); + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + arc_flags_t flags = ARC_FLAG_WAIT; + dnode_phys_t *cdnp; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + arc_buf_t *buf; + + if (BP_IS_PROTECTED(bp)) { + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + zio_flags |= ZIO_FLAG_RAW; + } + + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { + dsl_scan_visitdnode(scn, ds, ostype, + cdnp, zb->zb_blkid * epb + i, tx); + } + + arc_buf_destroy(buf, &buf); + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + arc_flags_t flags = ARC_FLAG_WAIT; + objset_phys_t *osp; + arc_buf_t *buf; + + err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); + if (err) { + scn->scn_phys.scn_errors++; + return (err); + } + + osp = buf->b_data; + + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_meta_dnode, DMU_META_DNODE_OBJECT, tx); + + if (OBJSET_BUF_HAS_USERUSED(buf)) { + /* + * We also always visit user/group/project accounting + * objects, and never skip them, even if we are + * suspending. This is necessary so that the + * space deltas from this txg get integrated. + */ + if (OBJSET_BUF_HAS_PROJECTUSED(buf)) + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_projectused_dnode, + DMU_PROJECTUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_groupused_dnode, + DMU_GROUPUSED_OBJECT, tx); + dsl_scan_visitdnode(scn, ds, osp->os_type, + &osp->os_userused_dnode, + DMU_USERUSED_OBJECT, tx); + } + arc_buf_destroy(buf, &buf); + } + + return (0); +} + +inline __attribute__((always_inline)) static void +dsl_scan_visitdnode(dsl_scan_t *scn, dsl_dataset_t *ds, + dmu_objset_type_t ostype, dnode_phys_t *dnp, + uint64_t object, dmu_tx_t *tx) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_phys_t czb; + + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + dnp->dn_nlevels - 1, j); + dsl_scan_visitbp(&dnp->dn_blkptr[j], + &czb, dnp, ds, scn, ostype, tx); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zbookmark_phys_t czb; + SET_BOOKMARK(&czb, ds ? ds->ds_object : 0, object, + 0, DMU_SPILL_BLKID); + dsl_scan_visitbp(DN_SPILL_BLKPTR(dnp), + &czb, dnp, ds, scn, ostype, tx); + } +} + +/* + * The arguments are in this order because mdb can only print the + * first 5; we want them to be useful. 
+ */ +static void +dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + blkptr_t *bp_toread = NULL; + + if (dsl_scan_check_suspend(scn, zb)) + return; + + if (dsl_scan_check_resume(scn, dnp, zb)) + return; + + scn->scn_visited_this_txg++; + + /* + * This debugging is commented out to conserve stack space. This + * function is called recursively and the debugging adds several + * bytes to the stack for each call. It can be commented back in + * if required to debug an issue in dsl_scan_visitbp(). + * + * dprintf_bp(bp, + * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", + * ds, ds ? ds->ds_object : 0, + * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + * bp); + */ + + if (BP_IS_HOLE(bp)) { + scn->scn_holes_this_txg++; + return; + } + + if (BP_IS_REDACTED(bp)) { + ASSERT(dsl_dataset_feature_is_active(ds, + SPA_FEATURE_REDACTED_DATASETS)); + return; + } + + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + scn->scn_lt_min_this_txg++; + return; + } + + bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + *bp_toread = *bp; + + if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) + goto out; + + /* + * If dsl_scan_ddt() has already visited this block, it will have + * already done any translations or scrubbing, so don't call the + * callback again. + */ + if (ddt_class_contains(dp->dp_spa, + scn->scn_phys.scn_ddt_class_max, bp)) { + scn->scn_ddt_contained_this_txg++; + goto out; + } + + /* + * If this block is from the future (after cur_max_txg), then we + * are doing this on behalf of a deleted snapshot, and we will + * revisit the future block on the next pass of this dataset. + * Don't scan it now unless we need to because something + * under it was modified. + */ + if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + scn->scn_gt_max_this_txg++; + goto out; + } + + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + +out: + kmem_free(bp_toread, sizeof (blkptr_t)); +} + +static void +dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, + dmu_tx_t *tx) +{ + zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { + SET_BOOKMARK(&scn->scn_prefetch_bookmark, + zb.zb_objset, 0, 0, 0); + } else { + scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; + } + + scn->scn_objsets_visited_this_txg++; + + spc = scan_prefetch_ctx_create(scn, NULL, FTAG); + dsl_scan_prefetch(spc, bp, &zb); + scan_prefetch_ctx_rele(spc, FTAG); + + dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); + + dprintf_ds(ds, "finished scan%s", ""); +} + +static void +ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) +{ + if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { + if (ds->ds_is_snapshot) { + /* + * Note: + * - scn_cur_{min,max}_txg stays the same. + * - Setting the flag is not really necessary if + * scn_cur_max_txg == scn_max_txg, because there + * is nothing after this snapshot that we care + * about. However, we set it anyway and then + * ignore it when we retraverse it in + * dsl_scan_visitds(). 
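+ * + * For example, if the scan was partway through snapshot S when S + * was destroyed, the bookmark simply moves on to S's next + * snapshot, which shares the blocks that have already been + * visited.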
+ */ + scn_phys->scn_bookmark.zb_objset = + dsl_dataset_phys(ds)->ds_next_snap_obj; + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)-> + ds_next_snap_obj); + scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; + } else { + SET_BOOKMARK(&scn_phys->scn_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); + zfs_dbgmsg("destroying ds %llu; currently traversing; " + "reset bookmark to -1,0,0,0", + (u_longlong_t)ds->ds_object); + } + } +} + +/* + * Invoked when a dataset is destroyed. We need to make sure that: + * + * 1) If it is the dataset that was currently being scanned, we write + * a new dsl_scan_phys_t, marking the objset reference in it + * as destroyed. + * 2) Remove it from the work queue, if it was present. + * + * If the dataset was actually a snapshot, instead of marking the dataset + * as destroyed, we instead substitute the next snapshot in line. + */ +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_destroyed_scn_phys(ds, &scn->scn_phys); + ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + if (ds->ds_is_snapshot) + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { + ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + if (ds->ds_is_snapshot) { + /* + * We keep the same mintxg; it could be > + * ds_creation_txg if the previous snapshot was + * deleted too. + */ + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_next_snap_obj, + mintxg, tx) == 0); + zfs_dbgmsg("destroying ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)-> + ds_next_snap_obj); + } else { + zfs_dbgmsg("destroying ds %llu; in queue; removing", + (u_longlong_t)ds->ds_object); + } + } + + /* + * dsl_scan_sync() should be called after this, and should sync + * out our changed state, but just to be safe, do it here. + */ + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +static void +ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) +{ + if (scn_bookmark->zb_objset == ds->ds_object) { + scn_bookmark->zb_objset = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } +} + +/* + * Called when a dataset is snapshotted. If we were currently traversing + * this snapshot, we reset our bookmark to point at the newly created + * snapshot. We also modify our work queue to remove the old snapshot and + * replace it with the new one.
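+ * (The blocks visited so far now belong to the newly created + * snapshot, so both the bookmark and any queue entry follow it.)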
+ */ +void +dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); + + ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); + ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { + VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); + VERIFY(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg, tx) == 0); + zfs_dbgmsg("snapshotting ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +static void +ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, + zbookmark_phys_t *scn_bookmark) +{ + if (scn_bookmark->zb_objset == ds1->ds_object) { + scn_bookmark->zb_objset = ds2->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (scn_bookmark->zb_objset == ds2->ds_object) { + scn_bookmark->zb_objset = ds1->ds_object; + zfs_dbgmsg("clone_swap ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } +} + +/* + * Called when an origin dataset and its clone are swapped. If we were + * currently traversing the dataset, we need to switch to traversing the + * newly promoted clone. + */ +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg1, mintxg2; + boolean_t ds1_queued, ds2_queued; + + if (!dsl_scan_is_running(scn)) + return; + + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); + + /* + * Handle the in-memory scan queue. + */ + ds1_queued = scan_ds_queue_contains(scn, ds1->ds_object, &mintxg1); + ds2_queued = scan_ds_queue_contains(scn, ds2->ds_object, &mintxg2); + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * The swapping code below would not handle this case correctly, + * since we can't insert ds2 if it is already there. That's + * because scan_ds_queue_insert() prohibits a duplicate insert + * and panics. + */ + } else if (ds1_queued) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg1); + } else if (ds2_queued) { + scan_ds_queue_remove(scn, ds2->ds_object); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg2); + } + + /* + * Handle the on-disk scan queue. 
+ * The on-disk state is an out-of-date version of the in-memory state, + * so the in-memory and on-disk values for ds1_queued and ds2_queued may + * be different. Therefore we need to apply the swap logic to the + * on-disk state independently of the in-memory state. + */ + ds1_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg1) == 0; + ds2_queued = zap_lookup_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg2) == 0; + + /* Sanity checking. */ + if (ds1_queued) { + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg1, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + if (ds2_queued) { + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); + ASSERT3U(mintxg2, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); + } + + if (ds1_queued && ds2_queued) { + /* + * If both are queued, we don't need to do anything. + * Alternatively, we could check for EEXIST from + * zap_add_int_key() and back out to the original state, but + * that would be more work than checking for this case upfront. + */ + } else if (ds1_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, tx)); + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, mintxg1, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds1->ds_object, + (u_longlong_t)ds2->ds_object); + } else if (ds2_queued) { + VERIFY3S(0, ==, zap_remove_int(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds2->ds_object, tx)); + VERIFY3S(0, ==, zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, ds1->ds_object, mintxg2, tx)); + zfs_dbgmsg("clone_swap ds %llu; in queue; " + "replacing with %llu", + (u_longlong_t)ds2->ds_object, + (u_longlong_t)ds1->ds_object); + } + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +/* ARGSUSED */ +static int +enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + uint64_t originobj = *(uint64_t *)arg; + dsl_dataset_t *ds; + int err; + dsl_scan_t *scn = dp->dp_scan; + + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) + return (0); + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + + dsl_dataset_rele(ds, FTAG); + if (err) + return (err); + ds = prev; + } + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +static void +dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + dsl_dataset_t *ds; + + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + + if (scn->scn_phys.scn_cur_min_txg >= + scn->scn_phys.scn_max_txg) { + /* + * This can happen if this snapshot was created after the + * scan started, and we already completed a previous snapshot + * that was created after the scan started. This snapshot + * only references blocks with: + * + * birth < our ds_creation_txg + * cur_min_txg is no less than ds_creation_txg. + * We have already visited these blocks. + * or + * birth > scn_max_txg + * The scan requested not to visit these blocks. + * + * Subsequent snapshots (and clones) can reference our + * blocks, or blocks with even higher birth times. 
+ * Therefore we do not need to visit them either, + * so we do not add them to the work queue. + * + * Note that checking for cur_min_txg >= cur_max_txg + * is not sufficient, because in that case we may need to + * visit subsequent snapshots. This happens when min_txg > 0, + * which raises cur_min_txg. In this case we will visit + * this dataset but skip all of its blocks, because the + * rootbp's birth time is < cur_min_txg. Then we will + * add the next snapshots/clones to the work queue. + */ + char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " + "cur_min_txg (%llu) >= max_txg (%llu)", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_max_txg); + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); + + goto out; + } + + /* + * Only the ZIL in the head (non-snapshot) is valid. Even though + * snapshots can have ZIL block pointers (which may be the same + * BP as in the head), they must be ignored. In addition, $ORIGIN + * doesn't have an objset (i.e. its ds_bp is a hole) so we don't + * need to look for a ZIL in it either. So we traverse the ZIL here, + * rather than in dsl_scan_recurse(), because the regular snapshot + * block-sharing rules don't apply to it. + */ + if (!dsl_dataset_is_snapshot(ds) && + (dp->dp_origin_snap == NULL || + ds->ds_dir != dp->dp_origin_snap->ds_dir)) { + objset_t *os; + if (dmu_objset_from_ds(ds, &os) != 0) { + goto out; + } + dsl_scan_zil(dp, &os->os_zil_header); + } + + /* + * Iterate over the bps in this ds. + */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + rrw_enter(&ds->ds_bp_rwlock, RW_READER, FTAG); + dsl_scan_visit_rootbp(scn, ds, &dsl_dataset_phys(ds)->ds_bp, tx); + rrw_exit(&ds->ds_bp_rwlock, FTAG); + + char *dsname = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, KM_SLEEP); + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("scanned dataset %llu (%s) with min=%llu max=%llu; " + "suspending=%u", + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_cur_max_txg, + (int)scn->scn_suspending); + kmem_free(dsname, ZFS_MAX_DATASET_NAME_LEN); + + if (scn->scn_suspending) + goto out; + + /* + * We've finished this pass over this dataset. + */ + + /* + * If we did not completely visit this dataset, do another pass. + */ + if (scn->scn_phys.scn_flags & DSF_VISIT_DS_AGAIN) { + zfs_dbgmsg("incomplete pass; visiting again"); + scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; + scan_ds_queue_insert(scn, ds->ds_object, + scn->scn_phys.scn_cur_max_txg); + goto out; + } + + /* + * Add descendant datasets to work queue. + */ + if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_next_snap_obj, + dsl_dataset_phys(ds)->ds_creation_txg); + } + if (dsl_dataset_phys(ds)->ds_num_children > 1) { + boolean_t usenext = B_FALSE; + if (dsl_dataset_phys(ds)->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct.
+ */ + int err = zap_count(dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj, &count); + if (err == 0 && + count == dsl_dataset_phys(ds)->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + dsl_dataset_phys(ds)->ds_creation_txg); + } + zap_cursor_fini(&zc); + } else { + VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + enqueue_clones_cb, &ds->ds_object, + DS_FIND_CHILDREN)); + } + } + +out: + dsl_dataset_rele(ds, FTAG); +} + +/* ARGSUSED */ +static int +enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) +{ + dsl_dataset_t *ds; + int err; + dsl_scan_t *scn = dp->dp_scan; + + err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); + if (err) + return (err); + + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != 0) { + dsl_dataset_t *prev; + err = dsl_dataset_hold_obj(dp, + dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + /* + * If this is a clone, we don't need to worry about it for now. + */ + if (dsl_dataset_phys(prev)->ds_next_snap_obj != ds->ds_object) { + dsl_dataset_rele(ds, FTAG); + dsl_dataset_rele(prev, FTAG); + return (0); + } + dsl_dataset_rele(ds, FTAG); + ds = prev; + } + + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_phys_t zb = { 0 }; + int p; + + if (!dsl_scan_is_running(scn)) + return; + + /* + * This function is special because it is the only thing + * that can add scan_io_t's to the vdev scan queues from + * outside dsl_scan_sync(). For the most part this is ok + * as long as it is called from within syncing context. + * However, dsl_scan_sync() expects that no new sio's will + * be added between when all the work for a scan is done + * and the next txg when the scan is actually marked as + * completed. This check ensures we do not issue new sio's + * during this period. + */ + if (scn->scn_done_txg != 0) + return; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 
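The backwards walks in enqueue_clones_cb() and enqueue_cb() above can be modeled with a plain linked list: follow prev-snapshot links from a head dataset until reaching the snapshot just after the chain's origin, which is where scanning must begin. A toy sketch (hypothetical names, not part of this patch):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_ds {
	uint64_t obj;
	const struct toy_ds *prev;	/* ds_prev_snap_obj analogue */
};

static const struct toy_ds *
walk_to_scan_start(const struct toy_ds *head, uint64_t origin_obj)
{
	while (head->prev != NULL && head->prev->obj != origin_obj)
		head = head->prev;
	return (head);
}

int
main(void)
{
	struct toy_ds origin = { 1, NULL };
	struct toy_ds snap2 = { 2, &origin };
	struct toy_ds snap3 = { 3, &snap2 };
	struct toy_ds head = { 4, &snap3 };

	/* Stops at snapshot 2, the first one past the origin. */
	printf("start scanning at ds %llu\n", (unsigned long long)
	    walk_to_scan_start(&head, 1)->obj);
	return (0);
}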
+ *
+ * There would be nothing more to say if a block's refcnt couldn't change
+ * during a scrub, but of course it can, so we must account for changes
+ * in a block's replication class.
+ *
+ * Here's an example of what can occur:
+ *
+ * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1
+ * when visited during the top-down scrub phase, it will be scrubbed twice.
+ * This negates our scrub optimization, but is otherwise harmless.
+ *
+ * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1
+ * on each visit during the top-down scrub phase, it will never be scrubbed.
+ * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's
+ * reference class transitions to a higher level (i.e. DDT_CLASS_UNIQUE to
+ * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1
+ * while a scrub is in progress, it scrubs the block right then.
+ */
+static void
+dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	ddt_bookmark_t *ddb = &scn->scn_phys.scn_ddt_bookmark;
+	ddt_entry_t dde;
+	int error;
+	uint64_t n = 0;
+
+	bzero(&dde, sizeof (ddt_entry_t));
+
+	while ((error = ddt_walk(scn->scn_dp->dp_spa, ddb, &dde)) == 0) {
+		ddt_t *ddt;
+
+		if (ddb->ddb_class > scn->scn_phys.scn_ddt_class_max)
+			break;
+		dprintf("visiting ddb=%llu/%llu/%llu/%llx\n",
+		    (longlong_t)ddb->ddb_class,
+		    (longlong_t)ddb->ddb_type,
+		    (longlong_t)ddb->ddb_checksum,
+		    (longlong_t)ddb->ddb_cursor);
+
+		/* There should be no pending changes to the dedup table */
+		ddt = scn->scn_dp->dp_spa->spa_ddt[ddb->ddb_checksum];
+		ASSERT(avl_first(&ddt->ddt_tree) == NULL);
+
+		dsl_scan_ddt_entry(scn, ddb->ddb_checksum, &dde, tx);
+		n++;
+
+		if (dsl_scan_check_suspend(scn, NULL))
+			break;
+	}
+
+	zfs_dbgmsg("scanned %llu ddt entries with class_max = %u; "
+	    "suspending=%u", (longlong_t)n,
+	    (int)scn->scn_phys.scn_ddt_class_max, (int)scn->scn_suspending);
+
+	ASSERT(error == 0 || error == ENOENT);
+	ASSERT(error != ENOENT ||
+	    ddb->ddb_class > scn->scn_phys.scn_ddt_class_max);
+}
+
+static uint64_t
+dsl_scan_ds_maxtxg(dsl_dataset_t *ds)
+{
+	uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg;
+	if (ds->ds_is_snapshot)
+		return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg));
+	return (smt);
+}
+
+static void
+dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx)
+{
+	scan_ds_t *sds;
+	dsl_pool_t *dp = scn->scn_dp;
+
+	if (scn->scn_phys.scn_ddt_bookmark.ddb_class <=
+	    scn->scn_phys.scn_ddt_class_max) {
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_ddt(scn, tx);
+		if (scn->scn_suspending)
+			return;
+	}
+
+	if (scn->scn_phys.scn_bookmark.zb_objset == DMU_META_OBJSET) {
+		/* First do the MOS & ORIGIN */
+
+		scn->scn_phys.scn_cur_min_txg = scn->scn_phys.scn_min_txg;
+		scn->scn_phys.scn_cur_max_txg = scn->scn_phys.scn_max_txg;
+		dsl_scan_visit_rootbp(scn, NULL,
+		    &dp->dp_meta_rootbp, tx);
+		spa_set_rootblkptr(dp->dp_spa, &dp->dp_meta_rootbp);
+		if (scn->scn_suspending)
+			return;
+
+		if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) {
+			VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj,
+			    enqueue_cb, NULL, DS_FIND_CHILDREN));
+		} else {
+			dsl_scan_visitds(scn,
+			    dp->dp_origin_snap->ds_object, tx);
+		}
+		ASSERT(!scn->scn_suspending);
+	} else if (scn->scn_phys.scn_bookmark.zb_objset !=
+	    ZB_DESTROYED_OBJSET) {
+		uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset;
+		/*
+		 * If we were suspended, continue from here.
Note if the + * ds we were suspended on was deleted, the zb_objset may + * be -1, so we will skip this and find a new objset + * below. + */ + dsl_scan_visitds(scn, dsobj, tx); + if (scn->scn_suspending) + return; + } + + /* + * In case we suspended right at the end of the ds, zero the + * bookmark so we don't think that we're still trying to resume. + */ + bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); + + /* + * Keep pulling things out of the dataset avl queue. Updates to the + * persistent zap-object-as-queue happen only at checkpoints. + */ + while ((sds = avl_first(&scn->scn_queue)) != NULL) { + dsl_dataset_t *ds; + uint64_t dsobj = sds->sds_dsobj; + uint64_t txg = sds->sds_txg; + + /* dequeue and free the ds from the queue */ + scan_ds_queue_remove(scn, dsobj); + sds = NULL; + + /* set up min / max txg */ + VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); + if (txg != 0) { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, txg); + } else { + scn->scn_phys.scn_cur_min_txg = + MAX(scn->scn_phys.scn_min_txg, + dsl_dataset_phys(ds)->ds_prev_snap_txg); + } + scn->scn_phys.scn_cur_max_txg = dsl_scan_ds_maxtxg(ds); + dsl_dataset_rele(ds, FTAG); + + dsl_scan_visitds(scn, dsobj, tx); + if (scn->scn_suspending) + return; + } + + /* No more objsets to fetch, we're done */ + scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; + ASSERT0(scn->scn_suspending); +} + +static uint64_t +dsl_scan_count_leaves(vdev_t *vd) +{ + uint64_t i, leaves = 0; + + /* we only count leaves that belong to the main pool and are readable */ + if (vd->vdev_islog || vd->vdev_isspare || + vd->vdev_isl2cache || !vdev_readable(vd)) + return (0); + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (i = 0; i < vd->vdev_children; i++) { + leaves += dsl_scan_count_leaves(vd->vdev_child[i]); + } + + return (leaves); +} + +static void +scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) +{ + int i; + uint64_t cur_size = 0; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); + } + + q->q_total_zio_size_this_txg += cur_size; + q->q_zios_this_txg++; +} + +static void +scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, + uint64_t end) +{ + q->q_total_seg_size_this_txg += end - start; + q->q_segs_this_txg++; +} + +static boolean_t +scan_io_queue_check_suspend(dsl_scan_t *scn) +{ + /* See comment in dsl_scan_check_suspend() */ + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + return ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +/* + * Given a list of scan_io_t's in io_list, this issues the I/Os out to + * disk. This consumes the io_list and frees the scan_io_t's. This is + * called when emptying queues, either when we're up against the memory + * limit or when we have finished scanning. Returns B_TRUE if we stopped + * processing the list before we finished. Any sios that were not issued + * will remain in the io_list. 
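The predicate in scan_io_queue_check_suspend() above composes a minimum time slice with pool-pressure signals. A self-contained sketch; the constant values below are illustrative placeholders, not the module's actual defaults:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
should_suspend(uint64_t scan_ms, uint64_t sync_sec, int dirty_pct,
    bool sync_waiting, bool shutting_down)
{
	const uint64_t min_ms = 3000;		/* cf. zfs_*_min_time_ms */
	const int dirty_hi_pct = 30;		/* cf. ..._min_dirty_percent */
	const uint64_t txg_timeout_sec = 5;	/* cf. zfs_txg_timeout */

	return ((scan_ms > min_ms &&
	    (dirty_pct >= dirty_hi_pct || sync_waiting ||
	    sync_sec >= txg_timeout_sec)) || shutting_down);
}

int
main(void)
{
	/* 4s spent scanning and the sync thread is waiting: suspend. */
	printf("%d\n", should_suspend(4000, 1, 10, true, false));	/* 1 */
	/* 1s spent scanning on an idle pool: keep going. */
	printf("%d\n", should_suspend(1000, 1, 10, false, false));	/* 0 */
	return (0);
}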
+ */ +static boolean_t +scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + int64_t bytes_issued = 0; + boolean_t suspended = B_FALSE; + + while ((sio = list_head(io_list)) != NULL) { + blkptr_t bp; + + if (scan_io_queue_check_suspend(scn)) { + suspended = B_TRUE; + break; + } + + sio2bp(sio, &bp); + bytes_issued += SIO_GET_ASIZE(sio); + scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, + &sio->sio_zb, queue); + (void) list_remove_head(io_list); + scan_io_queues_update_zio_stats(queue, &bp); + sio_free(sio); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); + + return (suspended); +} + +/* + * This function removes sios from an IO queue which reside within a given + * range_seg_t and inserts them (in offset order) into a list. Note that + * we only ever return a maximum of 32 sios at once. If there are more sios + * to process within this segment that did not make it onto the list we + * return B_TRUE and otherwise B_FALSE. + */ +static boolean_t +scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) +{ + scan_io_t *srch_sio, *sio, *next_sio; + avl_index_t idx; + uint_t num_sios = 0; + int64_t bytes_issued = 0; + + ASSERT(rs != NULL); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + srch_sio = sio_alloc(1); + srch_sio->sio_nr_dvas = 1; + SIO_SET_OFFSET(srch_sio, rs_get_start(rs, queue->q_exts_by_addr)); + + /* + * The exact start of the extent might not contain any matching zios, + * so if that's the case, examine the next one in the tree. + */ + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + + if (sio == NULL) + sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); + + while (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr) && num_sios <= 32) { + ASSERT3U(SIO_GET_OFFSET(sio), >=, rs_get_start(rs, + queue->q_exts_by_addr)); + ASSERT3U(SIO_GET_END_OFFSET(sio), <=, rs_get_end(rs, + queue->q_exts_by_addr)); + + next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); + avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + + bytes_issued += SIO_GET_ASIZE(sio); + num_sios++; + list_insert_tail(list, sio); + sio = next_sio; + } + + /* + * We limit the number of sios we process at once to 32 to avoid + * biting off more than we can chew. If we didn't take everything + * in the segment we update it to reflect the work we were able to + * complete. Otherwise, we remove it from the range tree entirely. + */ + if (sio != NULL && SIO_GET_OFFSET(sio) < rs_get_end(rs, + queue->q_exts_by_addr)) { + range_tree_adjust_fill(queue->q_exts_by_addr, rs, + -bytes_issued); + range_tree_resize_segment(queue->q_exts_by_addr, rs, + SIO_GET_OFFSET(sio), rs_get_end(rs, + queue->q_exts_by_addr) - SIO_GET_OFFSET(sio)); + + return (B_TRUE); + } else { + uint64_t rstart = rs_get_start(rs, queue->q_exts_by_addr); + uint64_t rend = rs_get_end(rs, queue->q_exts_by_addr); + range_tree_remove(queue->q_exts_by_addr, rstart, rend - rstart); + return (B_FALSE); + } +} + +/* + * This is called from the queue emptying thread and selects the next + * extent from which we are to issue I/Os. The behavior of this function + * depends on the state of the scan, the current memory consumption and + * whether or not we are performing a scan shutdown. 
+ * 1) We select extents in an elevator algorithm (LBA-order) if the scan + * needs to perform a checkpoint + * 2) We select the largest available extent if we are up against the + * memory limit. + * 3) Otherwise we don't select any extents. + */ +static range_seg_t * +scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + range_tree_t *rt = queue->q_exts_by_addr; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + ASSERT(scn->scn_is_sorted); + + /* handle tunable overrides */ + if (scn->scn_checkpointing || scn->scn_clearing) { + if (zfs_scan_issue_strategy == 1) { + return (range_tree_first(rt)); + } else if (zfs_scan_issue_strategy == 2) { + /* + * We need to get the original entry in the by_addr + * tree so we can modify it. + */ + range_seg_t *size_rs = + zfs_btree_first(&queue->q_exts_by_size, NULL); + if (size_rs == NULL) + return (NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, + size); + ASSERT3P(addr_rs, !=, NULL); + ASSERT3U(rs_get_start(size_rs, rt), ==, + rs_get_start(addr_rs, rt)); + ASSERT3U(rs_get_end(size_rs, rt), ==, + rs_get_end(addr_rs, rt)); + return (addr_rs); + } + } + + /* + * During normal clearing, we want to issue our largest segments + * first, keeping IO as sequential as possible, and leaving the + * smaller extents for later with the hope that they might eventually + * grow to larger sequential segments. However, when the scan is + * checkpointing, no new extents will be added to the sorting queue, + * so the way we are sorted now is as good as it will ever get. + * In this case, we instead switch to issuing extents in LBA order. + */ + if (scn->scn_checkpointing) { + return (range_tree_first(rt)); + } else if (scn->scn_clearing) { + /* + * We need to get the original entry in the by_addr + * tree so we can modify it. 
+ */ + range_seg_t *size_rs = zfs_btree_first(&queue->q_exts_by_size, + NULL); + if (size_rs == NULL) + return (NULL); + uint64_t start = rs_get_start(size_rs, rt); + uint64_t size = rs_get_end(size_rs, rt) - start; + range_seg_t *addr_rs = range_tree_find(rt, start, size); + ASSERT3P(addr_rs, !=, NULL); + ASSERT3U(rs_get_start(size_rs, rt), ==, rs_get_start(addr_rs, + rt)); + ASSERT3U(rs_get_end(size_rs, rt), ==, rs_get_end(addr_rs, rt)); + return (addr_rs); + } else { + return (NULL); + } +} + +static void +scan_io_queues_run_one(void *arg) +{ + dsl_scan_io_queue_t *queue = arg; + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; + boolean_t suspended = B_FALSE; + range_seg_t *rs = NULL; + scan_io_t *sio = NULL; + list_t sio_list; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); + + ASSERT(queue->q_scn->scn_is_sorted); + + list_create(&sio_list, sizeof (scan_io_t), + offsetof(scan_io_t, sio_nodes.sio_list_node)); + mutex_enter(q_lock); + + /* calculate maximum in-flight bytes for this txg (min 1MB) */ + queue->q_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + /* reset per-queue scan statistics for this txg */ + queue->q_total_seg_size_this_txg = 0; + queue->q_segs_this_txg = 0; + queue->q_total_zio_size_this_txg = 0; + queue->q_zios_this_txg = 0; + + /* loop until we run out of time or sios */ + while ((rs = scan_io_queue_fetch_ext(queue)) != NULL) { + uint64_t seg_start = 0, seg_end = 0; + boolean_t more_left = B_TRUE; + + ASSERT(list_is_empty(&sio_list)); + + /* loop while we still have sios left to process in this rs */ + while (more_left) { + scan_io_t *first_sio, *last_sio; + + /* + * We have selected which extent needs to be + * processed next. Gather up the corresponding sios. + */ + more_left = scan_io_queue_gather(queue, rs, &sio_list); + ASSERT(!list_is_empty(&sio_list)); + first_sio = list_head(&sio_list); + last_sio = list_tail(&sio_list); + + seg_end = SIO_GET_END_OFFSET(last_sio); + if (seg_start == 0) + seg_start = SIO_GET_OFFSET(first_sio); + + /* + * Issuing sios can take a long time so drop the + * queue lock. The sio queue won't be updated by + * other threads since we're in syncing context so + * we can be sure that our trees will remain exactly + * as we left them. + */ + mutex_exit(q_lock); + suspended = scan_io_queue_issue(queue, &sio_list); + mutex_enter(q_lock); + + if (suspended) + break; + } + + /* update statistics for debugging purposes */ + scan_io_queues_update_seg_stats(queue, seg_start, seg_end); + + if (suspended) + break; + } + + /* + * If we were suspended in the middle of processing, + * requeue any unfinished sios and exit. + */ + while ((sio = list_head(&sio_list)) != NULL) { + list_remove(&sio_list, sio); + scan_io_queue_insert_impl(queue, sio); + } + + mutex_exit(q_lock); + list_destroy(&sio_list); +} + +/* + * Performs an emptying run on all scan queues in the pool. This just + * punches out one thread per top-level vdev, each of which processes + * only that vdev's scan queue. We can parallelize the I/O here because + * we know that each queue's I/Os only affect its own top-level vdev. + * + * This function waits for the queue runs to complete, and must be + * called from dsl_scan_sync (or in general, syncing context). 
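The selection policy in scan_io_queue_fetch_ext() above reduces to: take extents in LBA order while checkpointing, take the highest-scoring (here, simply the largest) extent while clearing, and otherwise take nothing. A toy model in which a linear search stands in for the btree and range-tree lookups (illustrative, not part of this patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct toy_ext { uint64_t start, end; };

static const struct toy_ext *
fetch_ext(const struct toy_ext *exts, size_t n, bool checkpointing,
    bool clearing)
{
	const struct toy_ext *pick = NULL;

	if (n == 0 || (!checkpointing && !clearing))
		return (NULL);

	for (size_t i = 0; i < n; i++) {
		if (pick == NULL)
			pick = &exts[i];
		else if (checkpointing && exts[i].start < pick->start)
			pick = &exts[i];	/* lowest LBA first */
		else if (!checkpointing && exts[i].end - exts[i].start >
		    pick->end - pick->start)
			pick = &exts[i];	/* largest extent first */
	}
	return (pick);
}

int
main(void)
{
	struct toy_ext exts[] = { { 100, 110 }, { 10, 20 }, { 40, 90 } };

	printf("checkpointing: start=%llu\n", (unsigned long long)
	    fetch_ext(exts, 3, true, true)->start);	/* 10 */
	printf("clearing:      start=%llu\n", (unsigned long long)
	    fetch_ext(exts, 3, false, true)->start);	/* 40 */
	return (0);
}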
+ */ +static void +scan_io_queues_run(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(scn->scn_is_sorted); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (scn->scn_bytes_pending == 0) + return; + + if (scn->scn_taskq == NULL) { + int nthreads = spa->spa_root_vdev->vdev_children; + + /* + * We need to make this taskq *always* execute as many + * threads in parallel as we have top-level vdevs and no + * less, otherwise strange serialization of the calls to + * scan_io_queues_run_one can occur during spa_sync runs + * and that significantly impacts performance. + */ + scn->scn_taskq = taskq_create("dsl_scan_iss", nthreads, + minclsyspri, nthreads, nthreads, TASKQ_PREPOPULATE); + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + if (vd->vdev_scan_io_queue != NULL) { + VERIFY(taskq_dispatch(scn->scn_taskq, + scan_io_queues_run_one, vd->vdev_scan_io_queue, + TQ_SLEEP) != TASKQID_INVALID); + } + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* + * Wait for the queues to finish issuing their IOs for this run + * before we return. There may still be IOs in flight at this + * point. + */ + taskq_wait(scn->scn_taskq); +} + +static boolean_t +dsl_scan_async_block_should_pause(dsl_scan_t *scn) +{ + uint64_t elapsed_nanosecs; + + if (zfs_recover) + return (B_FALSE); + + if (zfs_async_block_max_blocks != 0 && + scn->scn_visited_this_txg >= zfs_async_block_max_blocks) { + return (B_TRUE); + } + + if (zfs_max_async_dedup_frees != 0 && + scn->scn_dedup_frees_this_txg >= zfs_max_async_dedup_frees) { + return (B_TRUE); + } + + elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + return (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (NSEC2MSEC(elapsed_nanosecs) > scn->scn_async_block_min_time_ms && + txg_sync_waiting(scn->scn_dp)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +static int +dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = arg; + + if (!scn->scn_is_bptree || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_OBJSET)) { + if (dsl_scan_async_block_should_pause(scn)) + return (SET_ERROR(ERESTART)); + } + + zio_nowait(zio_free_sync(scn->scn_zio_root, scn->scn_dp->dp_spa, + dmu_tx_get_txg(tx), bp, 0)); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(scn->scn_dp->dp_spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + scn->scn_visited_this_txg++; + if (BP_GET_DEDUP(bp)) + scn->scn_dedup_frees_this_txg++; + return (0); +} + +static void +dsl_scan_update_stats(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t i; + uint64_t seg_size_total = 0, zio_size_total = 0; + uint64_t seg_count_total = 0, zio_count_total = 0; + + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; + + if (queue == NULL) + continue; + + seg_size_total += queue->q_total_seg_size_this_txg; + zio_size_total += queue->q_total_zio_size_this_txg; + seg_count_total += queue->q_segs_this_txg; + zio_count_total += queue->q_zios_this_txg; + } + + if (seg_count_total == 0 || zio_count_total == 0) { + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_zios_this_txg = 0; + return; + } + + scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; + scn->scn_avg_zio_size_this_txg = 
zio_size_total / zio_count_total; + scn->scn_segs_this_txg = seg_count_total; + scn->scn_zios_this_txg = zio_count_total; +} + +static int +bpobj_dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (dsl_scan_free_block_cb(arg, bp, tx)); +} + +static int +dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + dsl_scan_t *scn = arg; + const dva_t *dva = &bp->blk_dva[0]; + + if (dsl_scan_async_block_should_pause(scn)) + return (SET_ERROR(ERESTART)); + + spa_vdev_indirect_mark_obsolete(scn->scn_dp->dp_spa, + DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), + DVA_GET_ASIZE(dva), tx); + scn->scn_visited_this_txg++; + return (0); +} + +boolean_t +dsl_scan_active(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t used = 0, comp, uncomp; + boolean_t clones_left; + + if (spa->spa_load_state != SPA_LOAD_NONE) + return (B_FALSE); + if (spa_shutting_down(spa)) + return (B_FALSE); + if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || + (scn->scn_async_destroying && !scn->scn_async_stalled)) + return (B_TRUE); + + if (spa_version(scn->scn_dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + (void) bpobj_space(&scn->scn_dp->dp_free_bpobj, + &used, &comp, &uncomp); + } + clones_left = spa_livelist_delete_check(spa); + return ((used != 0) || (clones_left)); +} + +static boolean_t +dsl_scan_check_deferred(vdev_t *vd) +{ + boolean_t need_resilver = B_FALSE; + + for (int c = 0; c < vd->vdev_children; c++) { + need_resilver |= + dsl_scan_check_deferred(vd->vdev_child[c]); + } + + if (!vdev_is_concrete(vd) || vd->vdev_aux || + !vd->vdev_ops->vdev_op_leaf) + return (need_resilver); + + if (!vd->vdev_resilver_deferred) + need_resilver = B_TRUE; + + return (need_resilver); +} + +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + vdev_t *vd; + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + return (B_TRUE); + } + + /* + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + /* + * Check if the top-level vdev must resilver this offset. + * When the offset does not intersect with a dirty leaf DTL + * then it may be possible to skip the resilver IO. The psize + * is provided instead of asize to simplify the check for RAIDZ. + */ + if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + return (B_FALSE); + + /* + * Check that this top-level vdev has a device under it which + * is resilvering and is not deferred. 
+ */ + if (!dsl_scan_check_deferred(vd)) + return (B_FALSE); + + return (B_TRUE); +} + +static int +dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + int err = 0; + + if (spa_suspend_async_destroy(spa)) + return (0); + + if (zfs_free_bpobj_enabled && + spa_version(spa) >= SPA_VERSION_DEADLISTS) { + scn->scn_is_bptree = B_FALSE; + scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; + scn->scn_zio_root = zio_root(spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bpobj_iterate(&dp->dp_free_bpobj, + bpobj_dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; + + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + } + + if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { + ASSERT(scn->scn_async_destroying); + scn->scn_is_bptree = B_TRUE; + scn->scn_zio_root = zio_root(spa, NULL, + NULL, ZIO_FLAG_MUSTSUCCEED); + err = bptree_iterate(dp->dp_meta_objset, + dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; + + if (err == EIO || err == ECKSUM) { + err = 0; + } else if (err != 0 && err != ERESTART) { + zfs_panic_recover("error %u from " + "traverse_dataset_destroyed()", err); + } + + if (bptree_is_empty(dp->dp_meta_objset, dp->dp_bptree_obj)) { + /* finished; deactivate async destroy feature */ + spa_feature_decr(spa, SPA_FEATURE_ASYNC_DESTROY, tx); + ASSERT(!spa_feature_is_active(spa, + SPA_FEATURE_ASYNC_DESTROY)); + VERIFY0(zap_remove(dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_BPTREE_OBJ, tx)); + VERIFY0(bptree_free(dp->dp_meta_objset, + dp->dp_bptree_obj, tx)); + dp->dp_bptree_obj = 0; + scn->scn_async_destroying = B_FALSE; + scn->scn_async_stalled = B_FALSE; + } else { + /* + * If we didn't make progress, mark the async + * destroy as stalled, so that we will not initiate + * a spa_sync() on its behalf. Note that we only + * check this if we are not finished, because if the + * bptree had no blocks for us to visit, we can + * finish without "making progress". + */ + scn->scn_async_stalled = + (scn->scn_visited_this_txg == 0); + } + } + if (scn->scn_visited_this_txg) { + zfs_dbgmsg("freed %llu blocks in %llums from " + "free_bpobj/bptree txg %llu; err=%u", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t) + NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), + (longlong_t)tx->tx_txg, err); + scn->scn_visited_this_txg = 0; + scn->scn_dedup_frees_this_txg = 0; + + /* + * Write out changes to the DDT that may be required as a + * result of the blocks freed. This ensures that the DDT + * is clean when a scrub/resilver runs. + */ + ddt_sync(spa, tx->tx_txg); + } + if (err != 0) + return (err); + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && + zfs_free_leak_on_eio && + (dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes != 0 || + dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes != 0 || + dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes != 0)) { + /* + * We have finished background destroying, but there is still + * some space left in the dp_free_dir. Transfer this leaked + * space to the dp_leak_dir. 
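The free callbacks above follow a resumable-iteration convention: do a bounded amount of work per txg, return ERESTART once the time or block budget is spent, and let a later txg continue from the saved cursor. A toy simulation of that flow (the TOY_* names are stand-ins; userland code has no kernel ERESTART):

#include <stdio.h>

#define	TOY_ERESTART	85	/* stand-in for the kernel's ERESTART */
#define	TOY_BUDGET	3	/* "blocks" we may free per txg */

static int toy_budget;

static int
toy_free_block_cb(int blk)
{
	if (toy_budget == 0)
		return (TOY_ERESTART);	/* out of budget; resume next txg */
	toy_budget--;
	printf("freed block %d\n", blk);
	return (0);
}

int
main(void)
{
	const int nblocks = 7;
	int txgs = 0, next = 0;

	while (next < nblocks) {	/* one iteration per simulated txg */
		txgs++;
		toy_budget = TOY_BUDGET;
		while (next < nblocks && toy_free_block_cb(next) == 0)
			next++;
	}
	printf("done in %d txgs\n", txgs);	/* 7 blocks, budget 3: 3 txgs */
	return (0);
}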
+ */ + if (dp->dp_leak_dir == NULL) { + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + (void) dsl_dir_create_sync(dp, dp->dp_root_dir, + LEAK_DIR_NAME, tx); + VERIFY0(dsl_pool_open_special_dir(dp, + LEAK_DIR_NAME, &dp->dp_leak_dir)); + rrw_exit(&dp->dp_config_rwlock, FTAG); + } + dsl_dir_diduse_space(dp->dp_leak_dir, DD_USED_HEAD, + dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, + dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, + dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); + dsl_dir_diduse_space(dp->dp_free_dir, DD_USED_HEAD, + -dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes, + -dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes, + -dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes, tx); + } + + if (dp->dp_free_dir != NULL && !scn->scn_async_destroying && + !spa_livelist_delete_check(spa)) { + /* finished; verify that space accounting went to zero */ + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_used_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_compressed_bytes); + ASSERT0(dsl_dir_phys(dp->dp_free_dir)->dd_uncompressed_bytes); + } + + spa_notify_waiters(spa); + + EQUIV(bpobj_is_open(&dp->dp_obsolete_bpobj), + 0 == zap_contains(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_OBSOLETE_BPOBJ)); + if (err == 0 && bpobj_is_open(&dp->dp_obsolete_bpobj)) { + ASSERT(spa_feature_is_active(dp->dp_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + + scn->scn_is_bptree = B_FALSE; + scn->scn_async_block_min_time_ms = zfs_obsolete_min_time_ms; + err = bpobj_iterate(&dp->dp_obsolete_bpobj, + dsl_scan_obsolete_block_cb, scn, tx); + if (err != 0 && err != ERESTART) + zfs_panic_recover("error %u from bpobj_iterate()", err); + + if (bpobj_is_empty(&dp->dp_obsolete_bpobj)) + dsl_pool_destroy_obsolete_bpobj(dp, tx); + } + return (0); +} + +/* + * This is the primary entry point for scans that is called from syncing + * context. Scans must happen entirely during syncing context so that we + * can guarantee that blocks we are currently scanning will not change out + * from under us. While a scan is active, this function controls how quickly + * transaction groups proceed, instead of the normal handling provided by + * txg_sync_thread(). + */ +void +dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) +{ + int err = 0; + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + state_sync_type_t sync_type = SYNC_OPTIONAL; + + if (spa->spa_resilver_deferred && + !spa_feature_is_active(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) + spa_feature_incr(spa, SPA_FEATURE_RESILVER_DEFER, tx); + + /* + * Check for scn_restart_txg before checking spa_load_state, so + * that we can restart an old-style scan while the pool is being + * imported (see dsl_scan_init). We also restart scans if there + * is a deferred resilver and the user has manually disabled + * deferred resilvers via the tunable. + */ + if (dsl_scan_restarting(scn, tx) || + (spa->spa_resilver_deferred && zfs_resilver_disable_defer)) { + pool_scan_func_t func = POOL_SCAN_SCRUB; + dsl_scan_done(scn, B_FALSE, tx); + if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) + func = POOL_SCAN_RESILVER; + zfs_dbgmsg("restarting scan func=%u txg=%llu", + func, (longlong_t)tx->tx_txg); + dsl_scan_setup_sync(&func, tx); + } + + /* + * Only process scans in sync pass 1. + */ + if (spa_sync_pass(spa) > 1) + return; + + /* + * If the spa is shutting down, then stop scanning. This will + * ensure that the scan does not dirty any new data during the + * shutdown phase. 
+ */ + if (spa_shutting_down(spa)) + return; + + /* + * If the scan is inactive due to a stalled async destroy, try again. + */ + if (!scn->scn_async_stalled && !dsl_scan_active(scn)) + return; + + /* reset scan statistics */ + scn->scn_visited_this_txg = 0; + scn->scn_dedup_frees_this_txg = 0; + scn->scn_holes_this_txg = 0; + scn->scn_lt_min_this_txg = 0; + scn->scn_gt_max_this_txg = 0; + scn->scn_ddt_contained_this_txg = 0; + scn->scn_objsets_visited_this_txg = 0; + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_zios_this_txg = 0; + scn->scn_suspending = B_FALSE; + scn->scn_sync_start_time = gethrtime(); + spa->spa_scrub_active = B_TRUE; + + /* + * First process the async destroys. If we suspend, don't do + * any scrubbing or resilvering. This ensures that there are no + * async destroys while we are scanning, so the scan code doesn't + * have to worry about traversing it. It is also faster to free the + * blocks than to scrub them. + */ + err = dsl_process_async_destroys(dp, tx); + if (err != 0) + return; + + if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) + return; + + /* + * Wait a few txgs after importing to begin scanning so that + * we can get the pool imported quickly. + */ + if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) + return; + + /* + * zfs_scan_suspend_progress can be set to disable scan progress. + * We don't want to spin the txg_sync thread, so we add a delay + * here to simulate the time spent doing a scan. This is mostly + * useful for testing and debugging. + */ + if (zfs_scan_suspend_progress) { + uint64_t scan_time_ns = gethrtime() - scn->scn_sync_start_time; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + while (zfs_scan_suspend_progress && + !txg_sync_waiting(scn->scn_dp) && + !spa_shutting_down(scn->scn_dp->dp_spa) && + NSEC2MSEC(scan_time_ns) < mintime) { + delay(hz); + scan_time_ns = gethrtime() - scn->scn_sync_start_time; + } + return; + } + + /* + * It is possible to switch from unsorted to sorted at any time, + * but afterwards the scan will remain sorted unless reloaded from + * a checkpoint after a reboot. + */ + if (!zfs_scan_legacy) { + scn->scn_is_sorted = B_TRUE; + if (scn->scn_last_checkpoint == 0) + scn->scn_last_checkpoint = ddi_get_lbolt(); + } + + /* + * For sorted scans, determine what kind of work we will be doing + * this txg based on our memory limitations and whether or not we + * need to perform a checkpoint. + */ + if (scn->scn_is_sorted) { + /* + * If we are over our checkpoint interval, set scn_clearing + * so that we can begin checkpointing immediately. The + * checkpoint allows us to save a consistent bookmark + * representing how much data we have scrubbed so far. + * Otherwise, use the memory limit to determine if we should + * scan for metadata or start issue scrub IOs. We accumulate + * metadata until we hit our hard memory limit at which point + * we issue scrub IOs until we are at our soft memory limit. 
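The gather/issue switching described above is classic hysteresis between a hard and a soft limit. A minimal sketch, assuming the limits are plain byte counts (dsl_scan_should_clear() actually derives them from the RAM-fraction tunables):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
update_clearing(bool clearing, uint64_t mem_used, uint64_t hard,
    uint64_t soft)
{
	if (!clearing && mem_used >= hard)
		return (true);		/* begin issuing scrub IOs */
	if (clearing && mem_used < soft)
		return (false);		/* resume gathering metadata */
	return (clearing);
}

int
main(void)
{
	bool clearing = false;

	clearing = update_clearing(clearing, 900, 1000, 500);	/* gather */
	clearing = update_clearing(clearing, 1100, 1000, 500);	/* issue */
	clearing = update_clearing(clearing, 700, 1000, 500);
	printf("clearing=%d\n", clearing);	/* 1: still above soft limit */
	return (0);
}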
+ */ + if (scn->scn_checkpointing || + ddi_get_lbolt() - scn->scn_last_checkpoint > + SEC_TO_TICK(zfs_scan_checkpoint_intval)) { + if (!scn->scn_checkpointing) + zfs_dbgmsg("begin scan checkpoint"); + + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } else { + boolean_t should_clear = dsl_scan_should_clear(scn); + if (should_clear && !scn->scn_clearing) { + zfs_dbgmsg("begin scan clearing"); + scn->scn_clearing = B_TRUE; + } else if (!should_clear && scn->scn_clearing) { + zfs_dbgmsg("finish scan clearing"); + scn->scn_clearing = B_FALSE; + } + } + } else { + ASSERT0(scn->scn_checkpointing); + ASSERT0(scn->scn_clearing); + } + + if (!scn->scn_clearing && scn->scn_done_txg == 0) { + /* Need to scan metadata for more blocks to scrub */ + dsl_scan_phys_t *scnp = &scn->scn_phys; + taskqid_t prefetch_tqid; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); + + /* + * Recalculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB). Limits for the issuing + * phase are done per top-level vdev and are handled separately. + */ + scn->scn_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + if (scnp->scn_ddt_bookmark.ddb_class <= + scnp->scn_ddt_class_max) { + ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); + } else { + zfs_dbgmsg("doing scan sync txg %llu; " + "bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_bookmark.zb_objset, + (longlong_t)scnp->scn_bookmark.zb_object, + (longlong_t)scnp->scn_bookmark.zb_level, + (longlong_t)scnp->scn_bookmark.zb_blkid); + } + + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + + scn->scn_prefetch_stop = B_FALSE; + prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, + dsl_scan_prefetch_thread, scn, TQ_SLEEP); + ASSERT(prefetch_tqid != TASKQID_INVALID); + + dsl_pool_config_enter(dp, FTAG); + dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); + + mutex_enter(&dp->dp_spa->spa_scrub_lock); + scn->scn_prefetch_stop = B_TRUE; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&dp->dp_spa->spa_scrub_lock); + + taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("scan visited %llu blocks in %llums " + "(%llu os's, %llu holes, %llu < mintxg, " + "%llu in ddt, %llu > maxtxg)", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_objsets_visited_this_txg, + (longlong_t)scn->scn_holes_this_txg, + (longlong_t)scn->scn_lt_min_this_txg, + (longlong_t)scn->scn_ddt_contained_this_txg, + (longlong_t)scn->scn_gt_max_this_txg); + + if (!scn->scn_suspending) { + ASSERT0(avl_numnodes(&scn->scn_queue)); + scn->scn_done_txg = tx->tx_txg + 1; + if (scn->scn_is_sorted) { + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } + zfs_dbgmsg("scan complete txg %llu", + (longlong_t)tx->tx_txg); + } + } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + ASSERT(scn->scn_clearing); + + /* need to issue scrubbing IOs from per-vdev queues */ + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + scan_io_queues_run(scn); + 
(void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + /* calculate and dprintf the current memory usage */ + (void) dsl_scan_should_clear(scn); + dsl_scan_update_stats(scn); + + zfs_dbgmsg("scan issued %llu blocks (%llu segs) in %llums " + "(avg_block_size = %llu, avg_seg_size = %llu)", + (longlong_t)scn->scn_zios_this_txg, + (longlong_t)scn->scn_segs_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_avg_zio_size_this_txg, + (longlong_t)scn->scn_avg_seg_size_this_txg); + } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { + /* Finished with everything. Mark the scrub as complete */ + zfs_dbgmsg("scan issuing complete txg %llu", + (longlong_t)tx->tx_txg); + ASSERT3U(scn->scn_done_txg, !=, 0); + ASSERT0(spa->spa_scrub_inflight); + ASSERT0(scn->scn_bytes_pending); + dsl_scan_done(scn, B_TRUE, tx); + sync_type = SYNC_MANDATORY; + } + + dsl_scan_sync_state(scn, tx, sync_type); +} + +static void +count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) +{ + int i; + + /* + * Don't count embedded bp's, since we already did the work of + * scanning these when we scanned the containing block. + */ + if (BP_IS_EMBEDDED(bp)) + return; + + /* + * Update the spa's stats on how many bytes we have issued. + * Sequential scrubs create a zio for each DVA of the bp. Each + * of these will include all DVAs for repair purposes, but the + * zio code will only try the first one unless there is an issue. + * Therefore, we should only count the first DVA for these IOs. + */ + if (scn->scn_is_sorted) { + atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[0])); + } else { + spa_t *spa = scn->scn_dp->dp_spa; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } + } + + /* + * If we resume after a reboot, zab will be NULL; don't record + * incomplete stats in that case. + */ + if (zab == NULL) + return; + + mutex_enter(&zab->zab_lock); + + for (i = 0; i < 4; i++) { + int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; + int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; + + if (t & DMU_OT_NEWTYPE) + t = DMU_OT_OTHER; + zfs_blkstat_t *zb = &zab->zab_type[l][t]; + int equal; + + zb->zb_count++; + zb->zb_asize += BP_GET_ASIZE(bp); + zb->zb_lsize += BP_GET_LSIZE(bp); + zb->zb_psize += BP_GET_PSIZE(bp); + zb->zb_gangs += BP_COUNT_GANG(bp); + + switch (BP_GET_NDVAS(bp)) { + case 2: + if (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + zb->zb_ditto_2_of_2_samevdev++; + break; + case 3: + equal = (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[1])) + + (DVA_GET_VDEV(&bp->blk_dva[0]) == + DVA_GET_VDEV(&bp->blk_dva[2])) + + (DVA_GET_VDEV(&bp->blk_dva[1]) == + DVA_GET_VDEV(&bp->blk_dva[2])); + if (equal == 1) + zb->zb_ditto_2_of_3_samevdev++; + else if (equal == 3) + zb->zb_ditto_3_of_3_samevdev++; + break; + } + } + + mutex_exit(&zab->zab_lock); +} + +static void +scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) +{ + avl_index_t idx; + int64_t asize = SIO_GET_ASIZE(sio); + dsl_scan_t *scn = queue->q_scn; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { + /* block is already scheduled for reading */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + sio_free(sio); + return; + } + avl_insert(&queue->q_sios_by_addr, sio, idx); + queue->q_sio_memused += SIO_GET_MUSED(sio); + range_tree_add(queue->q_exts_by_addr, SIO_GET_OFFSET(sio), asize); +} + +/* + * Given all the info we got from our metadata scanning process, we + * construct a scan_io_t and insert it into the scan sorting queue. The + * I/O must already be suitable for us to process. This is controlled + * by dsl_scan_enqueue(). + */ +static void +scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, + int zio_flags, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio = sio_alloc(BP_GET_NDVAS(bp)); + + ASSERT0(BP_IS_GANG(bp)); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + bp2sio(bp, sio, dva_i); + sio->sio_flags = zio_flags; + sio->sio_zb = *zb; + + /* + * Increment the bytes pending counter now so that we can't + * get an integer underflow in case the worker processes the + * zio before we get to incrementing this counter. + */ + atomic_add_64(&scn->scn_bytes_pending, SIO_GET_ASIZE(sio)); + + scan_io_queue_insert_impl(queue, sio); +} + +/* + * Given a set of I/O parameters as discovered by the metadata traversal + * process, attempts to place the I/O into the sorted queues (if allowed), + * or immediately executes the I/O. + */ +static void +dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb) +{ + spa_t *spa = dp->dp_spa; + + ASSERT(!BP_IS_EMBEDDED(bp)); + + /* + * Gang blocks are hard to issue sequentially, so we just issue them + * here immediately instead of queuing them. 
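The increment-before-insert ordering in scan_io_queue_insert() above is what keeps scn_bytes_pending from going negative. A sketch of the same ordering, using C11 atomics in place of the kernel's atomic_add_64():

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

static _Atomic int64_t bytes_pending;

static void
enqueue_io(int64_t asize)
{
	atomic_fetch_add(&bytes_pending, asize);	/* 1: account first */
	/* 2: only now publish the io where the issuing side can see it */
}

static void
issue_io(int64_t asize)
{
	/* Runs only on published ios, so the matching add already landed. */
	atomic_fetch_sub(&bytes_pending, asize);
}

int
main(void)
{
	enqueue_io(4096);
	issue_io(4096);
	printf("pending=%lld\n", (long long)bytes_pending);	/* 0 */
	return (0);
}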
+ */ + if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { + scan_exec_io(dp, bp, zio_flags, zb, NULL); + return; + } + + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + dva_t dva; + vdev_t *vdev; + + dva = bp->blk_dva[i]; + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); + ASSERT(vdev != NULL); + + mutex_enter(&vdev->vdev_scan_io_queue_lock); + if (vdev->vdev_scan_io_queue == NULL) + vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); + ASSERT(dp->dp_scan != NULL); + scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, + i, zio_flags, zb); + mutex_exit(&vdev->vdev_scan_io_queue_lock); + } +} + +static int +dsl_scan_scrub_cb(dsl_pool_t *dp, + const blkptr_t *bp, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = dp->dp_scan; + spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + size_t psize = BP_GET_PSIZE(bp); + boolean_t needs_io = B_FALSE; + int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; + + + if (phys_birth <= scn->scn_phys.scn_min_txg || + phys_birth >= scn->scn_phys.scn_max_txg) { + count_block(scn, dp->dp_blkstats, bp); + return (0); + } + + /* Embedded BP's have phys_birth==0, so we reject them above. */ + ASSERT(!BP_IS_EMBEDDED(bp)); + + ASSERT(DSL_SCAN_IS_SCRUB_RESILVER(scn)); + if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { + zio_flags |= ZIO_FLAG_SCRUB; + needs_io = B_TRUE; + } else { + ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); + zio_flags |= ZIO_FLAG_RESILVER; + needs_io = B_FALSE; + } + + /* If it's an intent log block, failure is expected. */ + if (zb->zb_level == ZB_ZIL_LEVEL) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { + const dva_t *dva = &bp->blk_dva[d]; + + /* + * Keep track of how much data we've examined so that + * zpool(1M) status can make useful progress reports. + */ + scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); + + /* if it's a resilver, this may not be in the target range */ + if (!needs_io) + needs_io = dsl_scan_need_resilver(spa, dva, psize, + phys_birth); + } + + if (needs_io && !zfs_no_scrub_io) { + dsl_scan_enqueue(dp, bp, zio_flags, zb); + } else { + count_block(scn, dp->dp_blkstats, bp); + } + + /* do not relocate this block */ + return (0); +} + +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + dsl_scan_io_queue_t *queue = zio->io_private; + + abd_free(zio->io_abd); + + if (queue == NULL) { + mutex_enter(&spa->spa_scrub_lock); + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + } else { + mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); + ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); + queue->q_inflight_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&queue->q_zio_cv); + mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); + } + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + } +} + +/* + * Given a scanning zio's information, executes the zio. The zio need + * not necessarily be only sortable, this function simply executes the + * zio, no matter what it is. The optional queue argument allows the + * caller to specify that they want per top level vdev IO rate limiting + * instead of the legacy global limiting. 
+ */
+static void
+scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags,
+    const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue)
+{
+	spa_t *spa = dp->dp_spa;
+	dsl_scan_t *scn = dp->dp_scan;
+	size_t size = BP_GET_PSIZE(bp);
+	abd_t *data = abd_alloc_for_io(size, B_FALSE);
+
+	ASSERT3U(scn->scn_maxinflight_bytes, >, 0);
+
+	if (queue == NULL) {
+		mutex_enter(&spa->spa_scrub_lock);
+		while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)
+			cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock);
+		spa->spa_scrub_inflight += BP_GET_PSIZE(bp);
+		mutex_exit(&spa->spa_scrub_lock);
+	} else {
+		kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock;
+
+		mutex_enter(q_lock);
+		while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes)
+			cv_wait(&queue->q_zio_cv, q_lock);
+		queue->q_inflight_bytes += BP_GET_PSIZE(bp);
+		mutex_exit(q_lock);
+	}
+
+	count_block(scn, dp->dp_blkstats, bp);
+	zio_nowait(zio_read(scn->scn_zio_root, spa, bp, data, size,
+	    dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb));
+}
+
+/*
+ * This is the primary extent sorting algorithm. We balance two parameters:
+ * 1) how many bytes of I/O are in an extent
+ * 2) how well the extent is filled with I/O (as a fraction of its total size)
+ * Since we allow extents to have gaps between their constituent I/Os, it's
+ * possible to have a fairly large extent that contains the same amount of
+ * I/O bytes as a much smaller extent, which just packs the I/O more tightly.
+ * The algorithm sorts based on a score calculated from the extent's size,
+ * the relative fill volume (in %) and a "fill weight" parameter that controls
+ * the split between whether we prefer larger extents or more well populated
+ * extents:
+ *
+ * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT)
+ *
+ * Example:
+ * 1) assume extsz = 64 MiB
+ * 2) assume fill = 32 MiB (extent is half full)
+ * 3) assume fill_weight = 3
+ * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100
+ *    SCORE = 32M + (50 * 3 * 32M) / 100
+ *    SCORE = 32M + (4800M / 100)
+ *    SCORE = 32M + 48M
+ *             ^     ^
+ *             |     +--- final total relative fill-based score
+ *             +--------- final total fill-based score
+ *    SCORE = 80M
+ *
+ * As can be seen, at fill_weight=3, the algorithm is slightly biased towards
+ * extents that are more completely filled (in a 3:2 ratio) vs just larger.
+ * Note that as an optimization, we replace multiplication and division by
+ * 100 with bitshifting by 7 (which effectively multiplies and divides by 128).
+ */
+static int
+ext_size_compare(const void *x, const void *y)
+{
+	const range_seg_gap_t *rsa = x, *rsb = y;
+
+	uint64_t sa = rsa->rs_end - rsa->rs_start;
+	uint64_t sb = rsb->rs_end - rsb->rs_start;
+	uint64_t score_a, score_b;
+
+	score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) *
+	    fill_weight * rsa->rs_fill) >> 7);
+	score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) *
+	    fill_weight * rsb->rs_fill) >> 7);
+
+	if (score_a > score_b)
+		return (-1);
+	if (score_a == score_b) {
+		if (rsa->rs_start < rsb->rs_start)
+			return (-1);
+		if (rsa->rs_start == rsb->rs_start)
+			return (0);
+		return (1);
+	}
+	return (1);
+}
+
+/*
+ * Comparator for the q_sios_by_addr tree. Sorting is simply performed
+ * based on LBA-order (from lowest to highest).
+ */
+static int
+sio_addr_compare(const void *x, const void *y)
+{
+	const scan_io_t *a = x, *b = y;
+
+	return (TREE_CMP(SIO_GET_OFFSET(a), SIO_GET_OFFSET(b)));
+}
+
+/* IO queues are created on demand when they are needed.
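The worked example in the comment above can be verified directly. This standalone snippet (not part of the patch) computes the score both in the percent form of the example and in the shift-by-7 form that ext_size_compare() uses; both print 80 MiB for the half-full 64 MiB extent:

#include <stdint.h>
#include <stdio.h>

#define	MiB	(1024ULL * 1024)

int
main(void)
{
	uint64_t extsz = 64 * MiB, fill = 32 * MiB, fill_weight = 3;

	/* Percent form, exactly as in the comment's example. */
	uint64_t pct_score = fill +
	    ((fill * 100 / extsz) * fill_weight * fill) / 100;

	/* Optimized form: multiply and divide by 128 via shifts. */
	uint64_t shift_score = fill +
	    ((((fill << 7) / extsz) * fill_weight * fill) >> 7);

	printf("pct:   %llu MiB\n", (unsigned long long)(pct_score / MiB));
	printf("shift: %llu MiB\n", (unsigned long long)(shift_score / MiB));
	return (0);
}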
*/ +static dsl_scan_io_queue_t * +scan_io_queue_create(vdev_t *vd) +{ + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); + + q->q_scn = scn; + q->q_vd = vd; + q->q_sio_memused = 0; + cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); + q->q_exts_by_addr = range_tree_create_impl(&rt_btree_ops, RANGE_SEG_GAP, + &q->q_exts_by_size, 0, 0, ext_size_compare, zfs_scan_max_ext_gap); + avl_create(&q->q_sios_by_addr, sio_addr_compare, + sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); + + return (q); +} + +/* + * Destroys a scan queue and all segments and scan_io_t's contained in it. + * No further execution of I/O occurs, anything pending in the queue is + * simply freed without being executed. + */ +void +dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + void *cookie = NULL; + int64_t bytes_dequeued = 0; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != + NULL) { + ASSERT(range_tree_contains(queue->q_exts_by_addr, + SIO_GET_OFFSET(sio), SIO_GET_ASIZE(sio))); + bytes_dequeued += SIO_GET_ASIZE(sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + sio_free(sio); + } + + ASSERT0(queue->q_sio_memused); + atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); + range_tree_vacate(queue->q_exts_by_addr, NULL, queue); + range_tree_destroy(queue->q_exts_by_addr); + avl_destroy(&queue->q_sios_by_addr); + cv_destroy(&queue->q_zio_cv); + + kmem_free(queue, sizeof (*queue)); +} + +/* + * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is + * called on behalf of vdev_top_transfer when creating or destroying + * a mirror vdev due to zpool attach/detach. + */ +void +dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) +{ + mutex_enter(&svd->vdev_scan_io_queue_lock); + mutex_enter(&tvd->vdev_scan_io_queue_lock); + + VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); + tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; + svd->vdev_scan_io_queue = NULL; + if (tvd->vdev_scan_io_queue != NULL) + tvd->vdev_scan_io_queue->q_vd = tvd; + + mutex_exit(&tvd->vdev_scan_io_queue_lock); + mutex_exit(&svd->vdev_scan_io_queue_lock); +} + +static void +scan_io_queues_destroy(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + if (tvd->vdev_scan_io_queue != NULL) + dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); + tvd->vdev_scan_io_queue = NULL; + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } +} + +static void +dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + vdev_t *vdev; + kmutex_t *q_lock; + dsl_scan_io_queue_t *queue; + scan_io_t *srch_sio, *sio; + avl_index_t idx; + uint64_t start, size; + + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); + ASSERT(vdev != NULL); + q_lock = &vdev->vdev_scan_io_queue_lock; + queue = vdev->vdev_scan_io_queue; + + mutex_enter(q_lock); + if (queue == NULL) { + mutex_exit(q_lock); + return; + } + + srch_sio = sio_alloc(BP_GET_NDVAS(bp)); + bp2sio(bp, srch_sio, dva_i); + start = SIO_GET_OFFSET(srch_sio); + size = SIO_GET_ASIZE(srch_sio); + + /* + * We can find the zio in two states: + * 1) Cold, just sitting in the queue of zio's to be issued at + * some point in the future. 
In this case, all we do is + * remove the zio from the q_sios_by_addr tree, decrement + * its data volume from the containing range_seg_t and + * resort the q_exts_by_size tree to reflect that the + * range_seg_t has lost some of its 'fill'. We don't shorten + * the range_seg_t - this is usually rare enough not to be + * worth the extra hassle of trying keep track of precise + * extent boundaries. + * 2) Hot, where the zio is currently in-flight in + * dsl_scan_issue_ios. In this case, we can't simply + * reach in and stop the in-flight zio's, so we instead + * block the caller. Eventually, dsl_scan_issue_ios will + * be done with issuing the zio's it gathered and will + * signal us. + */ + sio = avl_find(&queue->q_sios_by_addr, srch_sio, &idx); + sio_free(srch_sio); + + if (sio != NULL) { + int64_t asize = SIO_GET_ASIZE(sio); + blkptr_t tmpbp; + + /* Got it while it was cold in the queue */ + ASSERT3U(start, ==, SIO_GET_OFFSET(sio)); + ASSERT3U(size, ==, asize); + avl_remove(&queue->q_sios_by_addr, sio); + queue->q_sio_memused -= SIO_GET_MUSED(sio); + + ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); + range_tree_remove_fill(queue->q_exts_by_addr, start, size); + + /* + * We only update scn_bytes_pending in the cold path, + * otherwise it will already have been accounted for as + * part of the zio's execution. + */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + + /* count the block as though we issued it */ + sio2bp(sio, &tmpbp); + count_block(scn, dp->dp_blkstats, &tmpbp); + + sio_free(sio); + } + mutex_exit(q_lock); +} + +/* + * Callback invoked when a zio_free() zio is executing. This needs to be + * intercepted to prevent the zio from deallocating a particular portion + * of disk space and it then getting reallocated and written to, while we + * still have it queued up for processing. + */ +void +dsl_scan_freed(spa_t *spa, const blkptr_t *bp) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(scn != NULL); + if (!dsl_scan_is_running(scn)) + return; + + for (int i = 0; i < BP_GET_NDVAS(bp); i++) + dsl_scan_freed_dva(spa, bp, i); +} + +/* + * Check if a vdev needs resilvering (non-empty DTL), if so, and resilver has + * not started, start it. Otherwise, only restart if max txg in DTL range is + * greater than the max txg in the current scan. If the DTL max is less than + * the scan max, then the vdev has not missed any new data since the resilver + * started, so a restart is not needed. 
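The restart decision described above has three outcomes: do nothing, start a resilver, or restart (deferring if the feature allows). A compact sketch of that decision table, with DTL state reduced to a flag plus a max txg (toy names, not part of this patch):

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

enum toy_action { TOY_NONE, TOY_START, TOY_RESTART };

static enum toy_action
assess_vdev(bool dtl_nonempty, uint64_t dtl_max, bool resilvering,
    uint64_t scan_max_txg)
{
	if (!dtl_nonempty)
		return (TOY_NONE);	/* nothing is missing */
	if (!resilvering)
		return (TOY_START);	/* no resilver running yet */
	if (dtl_max <= scan_max_txg)
		return (TOY_NONE);	/* current scan already covers it */
	return (TOY_RESTART);		/* newer data was missed */
}

int
main(void)
{
	/* DTL reaches txg 90, scan covers up to 100: no restart. */
	printf("%d\n", assess_vdev(true, 90, true, 100));	/* 0 */
	/* DTL reaches txg 120, beyond the scan max of 100: restart. */
	printf("%d\n", assess_vdev(true, 120, true, 100));	/* 2 */
	return (0);
}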
+ */ +void +dsl_scan_assess_vdev(dsl_pool_t *dp, vdev_t *vd) +{ + uint64_t min, max; + + if (!vdev_resilver_needed(vd, &min, &max)) + return; + + if (!dsl_scan_resilvering(dp)) { + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); + return; + } + + if (max <= dp->dp_scan->scn_phys.scn_max_txg) + return; + + /* restart is needed, check if it can be deferred */ + if (spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER)) + vdev_defer_resilver(vd); + else + spa_async_request(dp->dp_spa, SPA_ASYNC_RESILVER); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, scan_vdev_limit, ULONG, ZMOD_RW, + "Max bytes in flight per leaf vdev for scrubs and resilvers"); + +ZFS_MODULE_PARAM(zfs, zfs_, scrub_min_time_ms, INT, ZMOD_RW, + "Min millisecs to scrub per txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, obsolete_min_time_ms, INT, ZMOD_RW, + "Min millisecs to obsolete per txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, free_min_time_ms, INT, ZMOD_RW, + "Min millisecs to free per txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, resilver_min_time_ms, INT, ZMOD_RW, + "Min millisecs to resilver per txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_suspend_progress, INT, ZMOD_RW, + "Set to prevent scans from progressing"); + +ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_io, INT, ZMOD_RW, + "Set to disable scrub I/O"); + +ZFS_MODULE_PARAM(zfs, zfs_, no_scrub_prefetch, INT, ZMOD_RW, + "Set to disable scrub prefetching"); + +ZFS_MODULE_PARAM(zfs, zfs_, async_block_max_blocks, ULONG, ZMOD_RW, + "Max number of blocks freed in one txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_async_dedup_frees, ULONG, ZMOD_RW, + "Max number of dedup blocks freed in one txg"); + +ZFS_MODULE_PARAM(zfs, zfs_, free_bpobj_enabled, INT, ZMOD_RW, + "Enable processing of the free_bpobj"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_fact, INT, ZMOD_RW, + "Fraction of RAM for scan hard limit"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_issue_strategy, INT, ZMOD_RW, + "IO issuing strategy during scrubbing. " + "0 = default, 1 = LBA, 2 = size"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_legacy, INT, ZMOD_RW, + "Scrub using legacy non-sequential method"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_checkpoint_intval, INT, ZMOD_RW, + "Scan progress on-disk checkpointing interval"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_max_ext_gap, ULONG, ZMOD_RW, + "Max gap in bytes between sequential scrub / resilver I/Os"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_mem_lim_soft_fact, INT, ZMOD_RW, + "Fraction of hard limit used as soft limit"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_strict_mem_lim, INT, ZMOD_RW, + "Tunable to attempt to reduce lock contention"); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_fill_weight, INT, ZMOD_RW, + "Tunable to adjust bias towards more filled segments during scans"); + +ZFS_MODULE_PARAM(zfs, zfs_, resilver_disable_defer, INT, ZMOD_RW, + "Process all resilvers immediately"); +/* END CSTYLED */ diff --git a/module/zfs/dsl_synctask.c b/module/zfs/dsl_synctask.c new file mode 100644 index 000000000000..2d6ca8549eb9 --- /dev/null +++ b/module/zfs/dsl_synctask.c @@ -0,0 +1,261 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + */ + +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/dsl_synctask.h> +#include <sys/metaslab.h> + +#define DST_AVG_BLKSHIFT 14 + +/* ARGSUSED */ +static int +dsl_null_checkfunc(void *arg, dmu_tx_t *tx) +{ + return (0); +} + +static int +dsl_sync_task_common(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, boolean_t early) +{ + spa_t *spa; + dmu_tx_t *tx; + int err; + dsl_sync_task_t dst = { { { NULL } } }; + dsl_pool_t *dp; + + err = spa_open(pool, &spa, FTAG); + if (err != 0) + return (err); + dp = spa_get_dsl(spa); + +top: + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + dst.dst_pool = dp; + dst.dst_txg = dmu_tx_get_txg(tx); + dst.dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst.dst_space_check = space_check; + dst.dst_checkfunc = checkfunc != NULL ? checkfunc : dsl_null_checkfunc; + dst.dst_syncfunc = syncfunc; + dst.dst_arg = arg; + dst.dst_error = 0; + dst.dst_nowaiter = B_FALSE; + + dsl_pool_config_enter(dp, FTAG); + err = dst.dst_checkfunc(arg, tx); + dsl_pool_config_exit(dp, FTAG); + + if (err != 0) { + dmu_tx_commit(tx); + spa_close(spa, FTAG); + return (err); + } + + txg_list_t *task_list = (early) ? + &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; + VERIFY(txg_list_add_tail(task_list, &dst, dst.dst_txg)); + + dmu_tx_commit(tx); + + if (sigfunc != NULL && txg_wait_synced_sig(dp, dst.dst_txg)) { + /* current contract is to call func once */ + sigfunc(arg, tx); + sigfunc = NULL; /* in case we're performing an EAGAIN retry */ + } + txg_wait_synced(dp, dst.dst_txg); + + if (dst.dst_error == EAGAIN) { + txg_wait_synced(dp, dst.dst_txg + TXG_DEFER_SIZE); + goto top; + } + + spa_close(spa, FTAG); + return (dst.dst_error); +} + +/* + * Called from open context to perform a callback in syncing context. Waits + * for the operation to complete. + * + * The checkfunc will be called from open context as a preliminary check + * which can quickly fail. If it succeeds, it will be called again from + * syncing context. The checkfunc should generally be designed to work + * properly in either context, but if necessary it can check + * dmu_tx_is_syncing(tx). + * + * The synctask infrastructure enforces proper locking strategy with respect + * to the dp_config_rwlock -- the lock will always be held when the callbacks + * are called. It will be held for read during the open-context (preliminary) + * call to the checkfunc, and then held for write from syncing context during + * the calls to the check and sync funcs. + * + * A dataset or pool name can be passed as the first argument. Typically, + * the check func will hold, check the return value of the hold, and then + * release the dataset. The sync func will VERIFY0(hold()) the dataset. + * This is safe because no changes can be made between the check and sync funcs, + * and the sync func will only be called if the check func successfully opened + * the dataset.
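+ * + * Illustrative caller sketch (editor's addition, not part of this import; + * my_feature_check/my_feature_sync and the argument values are hypothetical): + * + * static int + * my_feature_check(void *arg, dmu_tx_t *tx) + * { + * dsl_dataset_t *ds; + * int err = dsl_dataset_hold(dmu_tx_pool(tx), arg, FTAG, &ds); + * if (err == 0) + * dsl_dataset_rele(ds, FTAG); + * return (err); + * } + * + * static void + * my_feature_sync(void *arg, dmu_tx_t *tx) + * { + * dsl_dataset_t *ds; + * VERIFY0(dsl_dataset_hold(dmu_tx_pool(tx), arg, FTAG, &ds)); + * ... modify on-disk state under tx ... + * dsl_dataset_rele(ds, FTAG); + * } + * + * err = dsl_sync_task("tank", my_feature_check, my_feature_sync, + * snapname, 1, ZFS_SPACE_CHECK_NORMAL);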
+ */ +int +dsl_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, + blocks_modified, space_check, B_FALSE)); +} + +/* + * An early synctask works exactly like a standard synctask with one important + * difference in the way it is handled during syncing context. Standard + * synctasks run after we've written out all the dirty blocks of dirty + * datasets. Early synctasks are executed before writing out any dirty data, + * and thus before standard synctasks. + * + * For that reason, early synctasks can affect the process of writing dirty + * changes to disk for the txg that they run and should be used with caution. + * In addition, early synctasks should not dirty any metaslabs as this would + * invalidate the precondition/invariant for subsequent early synctasks. + * [see dsl_pool_sync() and dsl_early_sync_task_verify()] + */ +int +dsl_early_sync_task(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, NULL, arg, + blocks_modified, space_check, B_TRUE)); +} + +/* + * A standard synctask that can be interrupted by a signal. The sigfunc + * is called once if a signal occurred while waiting for the task to sync. + */ +int +dsl_sync_task_sig(const char *pool, dsl_checkfunc_t *checkfunc, + dsl_syncfunc_t *syncfunc, dsl_sigfunc_t *sigfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check) +{ + return (dsl_sync_task_common(pool, checkfunc, syncfunc, sigfunc, arg, + blocks_modified, space_check, B_FALSE)); +} + +static void +dsl_sync_task_nowait_common(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx, + boolean_t early) +{ + dsl_sync_task_t *dst = kmem_zalloc(sizeof (*dst), KM_SLEEP); + + dst->dst_pool = dp; + dst->dst_txg = dmu_tx_get_txg(tx); + dst->dst_space = blocks_modified << DST_AVG_BLKSHIFT; + dst->dst_space_check = space_check; + dst->dst_checkfunc = dsl_null_checkfunc; + dst->dst_syncfunc = syncfunc; + dst->dst_arg = arg; + dst->dst_error = 0; + dst->dst_nowaiter = B_TRUE; + + txg_list_t *task_list = (early) ? + &dp->dp_early_sync_tasks : &dp->dp_sync_tasks; + VERIFY(txg_list_add_tail(task_list, dst, dst->dst_txg)); +} + +void +dsl_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +{ + dsl_sync_task_nowait_common(dp, syncfunc, arg, + blocks_modified, space_check, tx, B_FALSE); +} + +void +dsl_early_sync_task_nowait(dsl_pool_t *dp, dsl_syncfunc_t *syncfunc, void *arg, + int blocks_modified, zfs_space_check_t space_check, dmu_tx_t *tx) +{ + dsl_sync_task_nowait_common(dp, syncfunc, arg, + blocks_modified, space_check, tx, B_TRUE); +} + +/* + * Called in syncing context to execute the synctask. + */ +void +dsl_sync_task_sync(dsl_sync_task_t *dst, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dst->dst_pool; + + ASSERT0(dst->dst_error); + + /* + * Check for sufficient space. + * + * When the sync task was created, the caller specified the + * type of space checking required. See the comment in + * zfs_space_check_t for details on the semantics of each + * type of space checking.
+ * + * We just check against what's on-disk; we don't want any + * in-flight accounting to get in our way, because open context + * may have already used up various in-core limits + * (arc_tempreserve, dsl_pool_tempreserve). + */ + if (dst->dst_space_check != ZFS_SPACE_CHECK_NONE) { + uint64_t quota = dsl_pool_unreserved_space(dp, + dst->dst_space_check); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + + /* MOS space is triple-dittoed, so we multiply by 3. */ + if (used + dst->dst_space * 3 > quota) { + dst->dst_error = SET_ERROR(ENOSPC); + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); + return; + } + } + + /* + * Check for errors by calling checkfunc. + */ + rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG); + dst->dst_error = dst->dst_checkfunc(dst->dst_arg, tx); + if (dst->dst_error == 0) + dst->dst_syncfunc(dst->dst_arg, tx); + rrw_exit(&dp->dp_config_rwlock, FTAG); + if (dst->dst_nowaiter) + kmem_free(dst, sizeof (*dst)); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(dsl_sync_task); +EXPORT_SYMBOL(dsl_sync_task_nowait); +#endif diff --git a/module/zfs/dsl_userhold.c b/module/zfs/dsl_userhold.c new file mode 100644 index 000000000000..75d153194a00 --- /dev/null +++ b/module/zfs/dsl_userhold.c @@ -0,0 +1,691 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2017 by Delphix. All rights reserved. + * Copyright (c) 2013 Steven Hartland. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/dsl_userhold.h> +#include <sys/dsl_dataset.h> +#include <sys/dsl_destroy.h> +#include <sys/dsl_synctask.h> +#include <sys/dmu_tx.h> +#include <sys/zfs_onexit.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_dir.h> +#include <sys/zfs_ioctl.h> +#include <sys/zap.h> + +typedef struct dsl_dataset_user_hold_arg { + nvlist_t *dduha_holds; + nvlist_t *dduha_chkholds; + nvlist_t *dduha_errlist; + minor_t dduha_minor; +} dsl_dataset_user_hold_arg_t; + +/* + * If you add new checks here, you may need to add additional checks to the + * "temporary" case in snapshot_check() in dmu_objset.c.
+ */ +int +dsl_dataset_user_hold_check_one(dsl_dataset_t *ds, const char *htag, + boolean_t temphold, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + objset_t *mos = dp->dp_meta_objset; + int error = 0; + + ASSERT(dsl_pool_config_held(dp)); + + if (strlen(htag) > MAXNAMELEN) + return (SET_ERROR(E2BIG)); + /* Tempholds have a more restricted length */ + if (temphold && strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + return (SET_ERROR(E2BIG)); + + /* tags must be unique (if ds already exists) */ + if (ds != NULL && dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + uint64_t value; + + error = zap_lookup(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + htag, 8, 1, &value); + if (error == 0) + error = SET_ERROR(EEXIST); + else if (error == ENOENT) + error = 0; + } + + return (error); +} + +static int +dsl_dataset_user_hold_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvlist_t *tmp_holds; + + if (spa_version(dp->dp_spa) < SPA_VERSION_USERREFS) + return (SET_ERROR(ENOTSUP)); + + if (!dmu_tx_is_syncing(tx)) + return (0); + + /* + * Ensure the list has no duplicates by copying name/values from + * non-unique dduha_holds to unique tmp_holds, and comparing counts. + */ + tmp_holds = fnvlist_alloc(); + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + size_t len = strlen(nvpair_name(pair)) + + strlen(fnvpair_value_string(pair)); + char *nameval = kmem_zalloc(len + 2, KM_SLEEP); + (void) strlcpy(nameval, nvpair_name(pair), len + 2); + (void) strlcat(nameval, "@", len + 2); + (void) strlcat(nameval, fnvpair_value_string(pair), len + 2); + fnvlist_add_string(tmp_holds, nameval, ""); + kmem_free(nameval, len + 2); + } + size_t tmp_count = fnvlist_num_pairs(tmp_holds); + fnvlist_free(tmp_holds); + if (tmp_count != fnvlist_num_pairs(dduha->dduha_holds)) + return (SET_ERROR(EEXIST)); + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(dduha->dduha_holds, pair)) { + dsl_dataset_t *ds; + int error = 0; + char *htag, *name; + + /* must be a snapshot */ + name = nvpair_name(pair); + if (strchr(name, '@') == NULL) + error = SET_ERROR(EINVAL); + + if (error == 0) + error = nvpair_value_string(pair, &htag); + + if (error == 0) + error = dsl_dataset_hold(dp, name, FTAG, &ds); + + if (error == 0) { + error = dsl_dataset_user_hold_check_one(ds, htag, + dduha->dduha_minor != 0, tx); + dsl_dataset_rele(ds, FTAG); + } + + if (error == 0) { + fnvlist_add_string(dduha->dduha_chkholds, name, htag); + } else { + /* + * We register ENOENT errors so they can be correctly + * reported if needed, such as when all holds fail. + */ + fnvlist_add_int32(dduha->dduha_errlist, name, error); + if (error != ENOENT) + return (error); + } + } + + return (0); +} + + +static void +dsl_dataset_user_hold_sync_one_impl(nvlist_t *tmpholds, dsl_dataset_t *ds, + const char *htag, minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; + } + ds->ds_userrefs++; + + VERIFY0(zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (minor != 0) { + char name[MAXNAMELEN]; + nvlist_t *tags; + + VERIFY0(dsl_pool_user_hold(dp, ds->ds_object, + htag, now, tx)); + (void) snprintf(name, sizeof (name), "%llx", + (u_longlong_t)ds->ds_object); + + if (nvlist_lookup_nvlist(tmpholds, name, &tags) != 0) { + tags = fnvlist_alloc(); + fnvlist_add_boolean(tags, htag); + fnvlist_add_nvlist(tmpholds, name, tags); + fnvlist_free(tags); + } else { + fnvlist_add_boolean(tags, htag); + } + } + + spa_history_log_internal_ds(ds, "hold", tx, + "tag=%s temp=%d refs=%llu", + htag, minor != 0, (u_longlong_t)ds->ds_userrefs); +} + +typedef struct zfs_hold_cleanup_arg { + char zhca_spaname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t zhca_spa_load_guid; + nvlist_t *zhca_holds; +} zfs_hold_cleanup_arg_t; + +static void +dsl_dataset_user_release_onexit(void *arg) +{ + zfs_hold_cleanup_arg_t *ca = arg; + spa_t *spa; + int error; + + error = spa_open(ca->zhca_spaname, &spa, FTAG); + if (error != 0) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded", + ca->zhca_spaname); + return; + } + if (spa_load_guid(spa) != ca->zhca_spa_load_guid) { + zfs_dbgmsg("couldn't release holds on pool=%s " + "because pool is no longer loaded (guid doesn't match)", + ca->zhca_spaname); + spa_close(spa, FTAG); + return; + } + + (void) dsl_dataset_user_release_tmp(spa_get_dsl(spa), ca->zhca_holds); + fnvlist_free(ca->zhca_holds); + kmem_free(ca, sizeof (zfs_hold_cleanup_arg_t)); + spa_close(spa, FTAG); +} + +static void +dsl_onexit_hold_cleanup(spa_t *spa, nvlist_t *holds, minor_t minor) +{ + zfs_hold_cleanup_arg_t *ca; + + if (minor == 0 || nvlist_empty(holds)) { + fnvlist_free(holds); + return; + } + + ASSERT(spa != NULL); + ca = kmem_alloc(sizeof (*ca), KM_SLEEP); + + (void) strlcpy(ca->zhca_spaname, spa_name(spa), + sizeof (ca->zhca_spaname)); + ca->zhca_spa_load_guid = spa_load_guid(spa); + ca->zhca_holds = holds; + VERIFY0(zfs_onexit_add_cb(minor, + dsl_dataset_user_release_onexit, ca, NULL)); +} + +void +dsl_dataset_user_hold_sync_one(dsl_dataset_t *ds, const char *htag, + minor_t minor, uint64_t now, dmu_tx_t *tx) +{ + nvlist_t *tmpholds; + + if (minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, htag, minor, now, tx); + dsl_onexit_hold_cleanup(dsl_dataset_get_spa(ds), tmpholds, minor); +} + +static void +dsl_dataset_user_hold_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_hold_arg_t *dduha = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + nvlist_t *tmpholds; + uint64_t now = gethrestime_sec(); + + if (dduha->dduha_minor != 0) + tmpholds = fnvlist_alloc(); + else + tmpholds = NULL; + for (nvpair_t *pair = nvlist_next_nvpair(dduha->dduha_chkholds, NULL); + pair != NULL; + pair = nvlist_next_nvpair(dduha->dduha_chkholds, pair)) { + dsl_dataset_t *ds; + + VERIFY0(dsl_dataset_hold(dp, nvpair_name(pair), FTAG, &ds)); + dsl_dataset_user_hold_sync_one_impl(tmpholds, ds, + fnvpair_value_string(pair), dduha->dduha_minor, now, tx); + dsl_dataset_rele(ds, FTAG); + } + dsl_onexit_hold_cleanup(dp->dp_spa, tmpholds, dduha->dduha_minor); +} + +/* + * The full semantics of this function are described in the comment above + * lzc_hold(). 
+ * + * To summarize: + * holds is nvl of snapname -> holdname + * errlist will be filled in with snapname -> error + * + * The snapshots must all be in the same pool. + * + * Holds for snapshots that don't exist will be skipped. + * + * If none of the snapshots for requested holds exist then ENOENT will be + * returned. + * + * If cleanup_minor is not 0, the holds will be temporary and will be cleaned + * up when the process exits. + * + * On success, all holds for snapshots that existed will be created, and 0 + * will be returned. + * + * On failure no holds will be created, the errlist will be filled in, + * and an errno will be returned. + * + * In all cases the errlist will contain entries for holds where the snapshot + * didn't exist. + */ +int +dsl_dataset_user_hold(nvlist_t *holds, minor_t cleanup_minor, nvlist_t *errlist) +{ + dsl_dataset_user_hold_arg_t dduha; + nvpair_t *pair; + int ret; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + dduha.dduha_holds = holds; + /* chkholds can have non-unique name */ + VERIFY(0 == nvlist_alloc(&dduha.dduha_chkholds, 0, KM_SLEEP)); + dduha.dduha_errlist = errlist; + dduha.dduha_minor = cleanup_minor; + + ret = dsl_sync_task(nvpair_name(pair), dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, &dduha, + fnvlist_num_pairs(holds), ZFS_SPACE_CHECK_RESERVED); + fnvlist_free(dduha.dduha_chkholds); + + return (ret); +} + +typedef int (dsl_holdfunc_t)(dsl_pool_t *dp, const char *name, void *tag, + dsl_dataset_t **dsp); + +typedef struct dsl_dataset_user_release_arg { + dsl_holdfunc_t *ddura_holdfunc; + nvlist_t *ddura_holds; + nvlist_t *ddura_todelete; + nvlist_t *ddura_errlist; + nvlist_t *ddura_chkholds; +} dsl_dataset_user_release_arg_t; + +/* Place a dataset hold on the snapshot identified by passed dsobj string */ +static int +dsl_dataset_hold_obj_string(dsl_pool_t *dp, const char *dsobj, void *tag, + dsl_dataset_t **dsp) +{ + return (dsl_dataset_hold_obj(dp, zfs_strtonum(dsobj, NULL), tag, dsp)); +} + +static int +dsl_dataset_user_release_check_one(dsl_dataset_user_release_arg_t *ddura, + dsl_dataset_t *ds, nvlist_t *holds, const char *snapname) +{ + uint64_t zapobj; + nvlist_t *holds_found; + objset_t *mos; + int numholds; + + if (!ds->ds_is_snapshot) + return (SET_ERROR(EINVAL)); + + if (nvlist_empty(holds)) + return (0); + + numholds = 0; + mos = ds->ds_dir->dd_pool->dp_meta_objset; + zapobj = dsl_dataset_phys(ds)->ds_userrefs_obj; + VERIFY0(nvlist_alloc(&holds_found, NV_UNIQUE_NAME, KM_SLEEP)); + + for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + uint64_t tmp; + int error; + const char *holdname = nvpair_name(pair); + + if (zapobj != 0) + error = zap_lookup(mos, zapobj, holdname, 8, 1, &tmp); + else + error = SET_ERROR(ENOENT); + + /* + * Non-existent holds are put on the errlist, but don't + * cause an overall failure.
+ */ + if (error == ENOENT) { + if (ddura->ddura_errlist != NULL) { + char *errtag = kmem_asprintf("%s#%s", + snapname, holdname); + fnvlist_add_int32(ddura->ddura_errlist, errtag, + ENOENT); + kmem_strfree(errtag); + } + continue; + } + + if (error != 0) { + fnvlist_free(holds_found); + return (error); + } + + fnvlist_add_boolean(holds_found, holdname); + numholds++; + } + + if (DS_IS_DEFER_DESTROY(ds) && + dsl_dataset_phys(ds)->ds_num_children == 1 && + ds->ds_userrefs == numholds) { + /* we need to destroy the snapshot as well */ + if (dsl_dataset_long_held(ds)) { + fnvlist_free(holds_found); + return (SET_ERROR(EBUSY)); + } + fnvlist_add_boolean(ddura->ddura_todelete, snapname); + } + + if (numholds != 0) { + fnvlist_add_nvlist(ddura->ddura_chkholds, snapname, + holds_found); + } + fnvlist_free(holds_found); + + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura; + dsl_holdfunc_t *holdfunc; + dsl_pool_t *dp; + + if (!dmu_tx_is_syncing(tx)) + return (0); + + dp = dmu_tx_pool(tx); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + ddura = arg; + holdfunc = ddura->ddura_holdfunc; + + for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_holds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_holds, pair)) { + int error; + dsl_dataset_t *ds; + nvlist_t *holds; + const char *snapname = nvpair_name(pair); + + error = nvpair_value_nvlist(pair, &holds); + if (error != 0) + error = (SET_ERROR(EINVAL)); + else + error = holdfunc(dp, snapname, FTAG, &ds); + if (error == 0) { + error = dsl_dataset_user_release_check_one(ddura, ds, + holds, snapname); + dsl_dataset_rele(ds, FTAG); + } + if (error != 0) { + if (ddura->ddura_errlist != NULL) { + fnvlist_add_int32(ddura->ddura_errlist, + snapname, error); + } + /* + * Non-existent snapshots are put on the errlist, + * but don't cause an overall failure. + */ + if (error != ENOENT) + return (error); + } + } + + return (0); +} + +static void +dsl_dataset_user_release_sync_one(dsl_dataset_t *ds, nvlist_t *holds, + dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + + for (nvpair_t *pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + int error; + const char *holdname = nvpair_name(pair); + + /* Remove temporary hold if one exists. 
*/ + error = dsl_pool_user_release(dp, ds->ds_object, holdname, tx); + VERIFY(error == 0 || error == ENOENT); + + VERIFY0(zap_remove(mos, dsl_dataset_phys(ds)->ds_userrefs_obj, + holdname, tx)); + ds->ds_userrefs--; + + spa_history_log_internal_ds(ds, "release", tx, + "tag=%s refs=%lld", holdname, (longlong_t)ds->ds_userrefs); + } +} + +static void +dsl_dataset_user_release_sync(void *arg, dmu_tx_t *tx) +{ + dsl_dataset_user_release_arg_t *ddura = arg; + dsl_holdfunc_t *holdfunc = ddura->ddura_holdfunc; + dsl_pool_t *dp = dmu_tx_pool(tx); + + ASSERT(RRW_WRITE_HELD(&dp->dp_config_rwlock)); + + for (nvpair_t *pair = nvlist_next_nvpair(ddura->ddura_chkholds, NULL); + pair != NULL; pair = nvlist_next_nvpair(ddura->ddura_chkholds, + pair)) { + dsl_dataset_t *ds; + const char *name = nvpair_name(pair); + + VERIFY0(holdfunc(dp, name, FTAG, &ds)); + + dsl_dataset_user_release_sync_one(ds, + fnvpair_value_nvlist(pair), tx); + if (nvlist_exists(ddura->ddura_todelete, name)) { + ASSERT(ds->ds_userrefs == 0 && + dsl_dataset_phys(ds)->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)); + dsl_destroy_snapshot_sync_impl(ds, B_FALSE, tx); + } + dsl_dataset_rele(ds, FTAG); + } +} + +/* + * The full semantics of this function are described in the comment above + * lzc_release(). + * + * To summarize: + * Releases holds specified in the nvl holds. + * + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + * + * If tmpdp is not NULL the names for holds should be the dsobj's of snapshots, + * otherwise they should be the names of snapshots. + * + * As a release may cause snapshots to be destroyed this tries to ensure they + * aren't mounted. + * + * Releases of non-existent holds are skipped. + * + * At least one hold must have been released for this function to succeed + * and return 0. + */ +static int +dsl_dataset_user_release_impl(nvlist_t *holds, nvlist_t *errlist, + dsl_pool_t *tmpdp) +{ + dsl_dataset_user_release_arg_t ddura; + nvpair_t *pair; + char *pool; + int error; + + pair = nvlist_next_nvpair(holds, NULL); + if (pair == NULL) + return (0); + + /* + * The release may cause snapshots to be destroyed; make sure they + * are not mounted. + */ + if (tmpdp != NULL) { + /* Temporary holds are specified by dsobj string. */ + ddura.ddura_holdfunc = dsl_dataset_hold_obj_string; + pool = spa_name(tmpdp->dp_spa); +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + dsl_dataset_t *ds; + + dsl_pool_config_enter(tmpdp, FTAG); + error = dsl_dataset_hold_obj_string(tmpdp, + nvpair_name(pair), FTAG, &ds); + if (error == 0) { + char name[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds, name); + dsl_pool_config_exit(tmpdp, FTAG); + dsl_dataset_rele(ds, FTAG); + (void) zfs_unmount_snap(name); + } else { + dsl_pool_config_exit(tmpdp, FTAG); + } + } +#endif + } else { + /* Non-temporary holds are specified by name.
*/ + ddura.ddura_holdfunc = dsl_dataset_hold; + pool = nvpair_name(pair); +#ifdef _KERNEL + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + (void) zfs_unmount_snap(nvpair_name(pair)); + } +#endif + } + + ddura.ddura_holds = holds; + ddura.ddura_errlist = errlist; + VERIFY0(nvlist_alloc(&ddura.ddura_todelete, NV_UNIQUE_NAME, + KM_SLEEP)); + VERIFY0(nvlist_alloc(&ddura.ddura_chkholds, NV_UNIQUE_NAME, + KM_SLEEP)); + + error = dsl_sync_task(pool, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, &ddura, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + fnvlist_free(ddura.ddura_todelete); + fnvlist_free(ddura.ddura_chkholds); + + return (error); +} + +/* + * holds is nvl of snapname -> { holdname, ... } + * errlist will be filled in with snapname -> error + */ +int +dsl_dataset_user_release(nvlist_t *holds, nvlist_t *errlist) +{ + return (dsl_dataset_user_release_impl(holds, errlist, NULL)); +} + +/* + * holds is nvl of snapdsobj -> { holdname, ... } + */ +void +dsl_dataset_user_release_tmp(struct dsl_pool *dp, nvlist_t *holds) +{ + ASSERT(dp != NULL); + (void) dsl_dataset_user_release_impl(holds, NULL, dp); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t *nvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + int err; + + err = dsl_pool_hold(dsname, FTAG, &dp); + if (err != 0) + return (err); + err = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (err != 0) { + dsl_pool_rele(dp, FTAG); + return (err); + } + + if (dsl_dataset_phys(ds)->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + dsl_dataset_phys(ds)->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + fnvlist_add_uint64(nvl, za->za_name, + za->za_first_integer); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (0); +} diff --git a/module/zfs/edonr_zfs.c b/module/zfs/edonr_zfs.c new file mode 100644 index 000000000000..aa00e1c9417e --- /dev/null +++ b/module/zfs/edonr_zfs.c @@ -0,0 +1,115 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
+ */ +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/edonr.h> +#include <sys/abd.h> +#include <sys/spa.h> + +#define EDONR_MODE 512 +#define EDONR_BLOCK_SIZE EdonR512_BLOCK_SIZE + +static int +edonr_incremental(void *buf, size_t size, void *arg) +{ + EdonRState *ctx = arg; + EdonRUpdate(ctx, buf, size * 8); + return (0); +} + +/* + * Native zio_checksum interface for the Edon-R hash function. + */ +/*ARGSUSED*/ +void +abd_checksum_edonr_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + uint8_t digest[EDONR_MODE / 8]; + EdonRState ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + (void) abd_iterate_func(abd, 0, size, edonr_incremental, &ctx); + EdonRFinal(&ctx, digest); + bcopy(digest, zcp->zc_word, sizeof (zcp->zc_word)); +} + +/* + * Byteswapped zio_checksum interface for the Edon-R hash function. + */ +void +abd_checksum_edonr_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + abd_checksum_edonr_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +void * +abd_checksum_edonr_tmpl_init(const zio_cksum_salt_t *salt) +{ + EdonRState *ctx; + uint8_t salt_block[EDONR_BLOCK_SIZE]; + + /* + * Edon-R needs all but the last hash invocation to be on full-size + * blocks, but the salt is too small. Rather than simply padding it + * with zeros, we expand the salt into a new salt block of proper + * size by double-hashing it (the new salt block will be composed of + * H(salt) || H(H(salt))). + */ + CTASSERT(EDONR_BLOCK_SIZE == 2 * (EDONR_MODE / 8)); + EdonRHash(EDONR_MODE, salt->zcs_bytes, sizeof (salt->zcs_bytes) * 8, + salt_block); + EdonRHash(EDONR_MODE, salt_block, EDONR_MODE, salt_block + + EDONR_MODE / 8); + + /* + * Feed the new salt block into the hash function - this will serve + * as our MAC key. + */ + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + EdonRInit(ctx, EDONR_MODE); + EdonRUpdate(ctx, salt_block, sizeof (salt_block) * 8); + return (ctx); +} + +void +abd_checksum_edonr_tmpl_free(void *ctx_template) +{ + EdonRState *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/module/zfs/fm.c b/module/zfs/fm.c new file mode 100644 index 000000000000..c00e08b8d02a --- /dev/null +++ b/module/zfs/fm.c @@ -0,0 +1,1674 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
+ */ + +/* + * Fault Management Architecture (FMA) Resource and Protocol Support + * + * The routines contained herein provide services to support kernel subsystems + * in publishing fault management telemetry (see PSARC 2002/412 and 2003/089). + * + * Name-Value Pair Lists + * + * The embodiment of an FMA protocol element (event, fmri or authority) is a + * name-value pair list (nvlist_t). FMA-specific nvlist constructor and + * destructor functions, fm_nvlist_create() and fm_nvlist_destroy(), are used + * to create an nvpair list using custom allocators. Callers may choose to + * allocate either from the kernel memory allocator, or from a preallocated + * buffer, useful in constrained contexts like high-level interrupt routines. + * + * Protocol Event and FMRI Construction + * + * Convenience routines are provided to construct nvlist events according to + * the FMA Event Protocol and Naming Schema specification for ereports and + * FMRIs for the dev, cpu, hc, mem, legacy hc and de schemes. + * + * ENA Manipulation + * + * Routines to generate ENA formats 0, 1 and 2 are available as well as + * routines to increment formats 1 and 2. Individual fields within the + * ENA are extractable via fm_ena_time_get(), fm_ena_id_get(), + * fm_ena_format_get() and fm_ena_gen_get(). + */ + +#include <sys/types.h> +#include <sys/time.h> +#include <sys/list.h> +#include <sys/nvpair.h> +#include <sys/cmn_err.h> +#include <sys/sysmacros.h> +#include <sys/sunddi.h> +#include <sys/systeminfo.h> +#include <sys/fm/util.h> +#include <sys/fm/protocol.h> +#include <sys/kstat.h> +#include <sys/zfs_context.h> +#ifdef _KERNEL +#include <sys/atomic.h> +#include <sys/condvar.h> +#include <sys/console.h> +#include <sys/time.h> +#include <sys/zfs_ioctl.h> + +int zfs_zevent_len_max = 0; +int zfs_zevent_cols = 80; +int zfs_zevent_console = 0; + +static int zevent_len_cur = 0; +static int zevent_waiters = 0; +static int zevent_flags = 0; + +/* Num events rate limited since the last time zfs_zevent_next() was called */ +static uint64_t ratelimit_dropped = 0; + +/* + * The EID (Event IDentifier) is used to uniquely tag a zevent when it is + * posted. The posted EIDs are monotonically increasing but not persistent. + * They will be reset to the initial value (1) each time the kernel module is + * loaded. + */ +static uint64_t zevent_eid = 0; + +static kmutex_t zevent_lock; +static list_t zevent_list; +static kcondvar_t zevent_cv; +#endif /* _KERNEL */ + + +/* + * Common fault management kstats to record event generation failures + */ + +struct erpt_kstat { + kstat_named_t erpt_dropped; /* num erpts dropped on post */ + kstat_named_t erpt_set_failed; /* num erpt set failures */ + kstat_named_t fmri_set_failed; /* num fmri set failures */ + kstat_named_t payload_set_failed; /* num payload set failures */ +}; + +static struct erpt_kstat erpt_kstat_data = { + { "erpt-dropped", KSTAT_DATA_UINT64 }, + { "erpt-set-failed", KSTAT_DATA_UINT64 }, + { "fmri-set-failed", KSTAT_DATA_UINT64 }, + { "payload-set-failed", KSTAT_DATA_UINT64 } +}; + +kstat_t *fm_ksp; + +#ifdef _KERNEL + +/* + * Formatting utility function for fm_nvprintr. We attempt to wrap chunks of + * output so they aren't split across console lines, and return the end column. + */ +/*PRINTFLIKE4*/ +static int +fm_printf(int depth, int c, int cols, const char *format, ...)
+{ + va_list ap; + int width; + char c1; + + va_start(ap, format); + width = vsnprintf(&c1, sizeof (c1), format, ap); + va_end(ap); + + if (c + width >= cols) { + console_printf("\n"); + c = 0; + if (format[0] != ' ' && depth > 0) { + console_printf(" "); + c++; + } + } + + va_start(ap, format); + console_vprintf(format, ap); + va_end(ap); + + return ((c + width) % cols); +} + +/* + * Recursively print an nvlist in the specified column width and return the + * column we end up in. This function is called recursively by fm_nvprint(), + * below. We generically format the entire nvpair using hexadecimal + * integers and strings, and elide any integer arrays. Arrays are basically + * used for cache dumps right now, so we suppress them so as not to overwhelm + * the amount of console output we produce at panic time. This can be further + * enhanced as FMA technology grows based upon the needs of consumers. All + * FMA telemetry is logged using the dump device transport, so the console + * output serves only as a fallback in case this procedure is unsuccessful. + */ +static int +fm_nvprintr(nvlist_t *nvl, int d, int c, int cols) +{ + nvpair_t *nvp; + + for (nvp = nvlist_next_nvpair(nvl, NULL); + nvp != NULL; nvp = nvlist_next_nvpair(nvl, nvp)) { + + data_type_t type = nvpair_type(nvp); + const char *name = nvpair_name(nvp); + + boolean_t b; + uint8_t i8; + uint16_t i16; + uint32_t i32; + uint64_t i64; + char *str; + nvlist_t *cnv; + + if (strcmp(name, FM_CLASS) == 0) + continue; /* already printed by caller */ + + c = fm_printf(d, c, cols, " %s=", name); + + switch (type) { + case DATA_TYPE_BOOLEAN: + c = fm_printf(d + 1, c, cols, " 1"); + break; + + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(nvp, &b); + c = fm_printf(d + 1, c, cols, b ? "1" : "0"); + break; + + case DATA_TYPE_BYTE: + (void) nvpair_value_byte(nvp, &i8); + c = fm_printf(d + 1, c, cols, "0x%x", i8); + break; + + case DATA_TYPE_INT8: + (void) nvpair_value_int8(nvp, (void *)&i8); + c = fm_printf(d + 1, c, cols, "0x%x", i8); + break; + + case DATA_TYPE_UINT8: + (void) nvpair_value_uint8(nvp, &i8); + c = fm_printf(d + 1, c, cols, "0x%x", i8); + break; + + case DATA_TYPE_INT16: + (void) nvpair_value_int16(nvp, (void *)&i16); + c = fm_printf(d + 1, c, cols, "0x%x", i16); + break; + + case DATA_TYPE_UINT16: + (void) nvpair_value_uint16(nvp, &i16); + c = fm_printf(d + 1, c, cols, "0x%x", i16); + break; + + case DATA_TYPE_INT32: + (void) nvpair_value_int32(nvp, (void *)&i32); + c = fm_printf(d + 1, c, cols, "0x%x", i32); + break; + + case DATA_TYPE_UINT32: + (void) nvpair_value_uint32(nvp, &i32); + c = fm_printf(d + 1, c, cols, "0x%x", i32); + break; + + case DATA_TYPE_INT64: + (void) nvpair_value_int64(nvp, (void *)&i64); + c = fm_printf(d + 1, c, cols, "0x%llx", + (u_longlong_t)i64); + break; + + case DATA_TYPE_UINT64: + (void) nvpair_value_uint64(nvp, &i64); + c = fm_printf(d + 1, c, cols, "0x%llx", + (u_longlong_t)i64); + break; + + case DATA_TYPE_HRTIME: + (void) nvpair_value_hrtime(nvp, (void *)&i64); + c = fm_printf(d + 1, c, cols, "0x%llx", + (u_longlong_t)i64); + break; + + case DATA_TYPE_STRING: + (void) nvpair_value_string(nvp, &str); + c = fm_printf(d + 1, c, cols, "\"%s\"", + str ? 
str : ""); + break; + + case DATA_TYPE_NVLIST: + c = fm_printf(d + 1, c, cols, "["); + (void) nvpair_value_nvlist(nvp, &cnv); + c = fm_nvprintr(cnv, d + 1, c, cols); + c = fm_printf(d + 1, c, cols, " ]"); + break; + + case DATA_TYPE_NVLIST_ARRAY: { + nvlist_t **val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "["); + (void) nvpair_value_nvlist_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) { + c = fm_nvprintr(val[i], d + 1, c, cols); + } + c = fm_printf(d + 1, c, cols, " ]"); + } + break; + + case DATA_TYPE_INT8_ARRAY: { + int8_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT8_ARRAY: { + uint8_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint8_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_INT16_ARRAY: { + int16_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT16_ARRAY: { + uint16_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint16_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_INT32_ARRAY: { + int32_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT32_ARRAY: { + uint32_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint32_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_INT64_ARRAY: { + int64_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_int64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_UINT64_ARRAY: { + uint64_t *val; + uint_t i, nelem; + + c = fm_printf(d + 1, c, cols, "[ "); + (void) nvpair_value_uint64_array(nvp, &val, &nelem); + for (i = 0; i < nelem; i++) + c = fm_printf(d + 1, c, cols, "0x%llx ", + (u_longlong_t)val[i]); + + c = fm_printf(d + 1, c, cols, "]"); + break; + } + + case DATA_TYPE_STRING_ARRAY: + case DATA_TYPE_BOOLEAN_ARRAY: + case DATA_TYPE_BYTE_ARRAY: + c = fm_printf(d + 1, c, cols, "[...]"); + break; + + case DATA_TYPE_UNKNOWN: + case DATA_TYPE_DONTCARE: + c = fm_printf(d + 1, c, cols, ""); + break; + } + } + + return (c); +} + +void +fm_nvprint(nvlist_t *nvl) +{ + char *class; + int c = 0; + + console_printf("\n"); + + if (nvlist_lookup_string(nvl, FM_CLASS, &class) == 0) + c = fm_printf(0, c, zfs_zevent_cols, "%s", class); + + if (fm_nvprintr(nvl, 0, c, 
zfs_zevent_cols) != 0) + console_printf("\n"); + + console_printf("\n"); +} + +static zevent_t * +zfs_zevent_alloc(void) +{ + zevent_t *ev; + + ev = kmem_zalloc(sizeof (zevent_t), KM_SLEEP); + + list_create(&ev->ev_ze_list, sizeof (zfs_zevent_t), + offsetof(zfs_zevent_t, ze_node)); + list_link_init(&ev->ev_node); + + return (ev); +} + +static void +zfs_zevent_free(zevent_t *ev) +{ + /* Run provided cleanup callback */ + ev->ev_cb(ev->ev_nvl, ev->ev_detector); + + list_destroy(&ev->ev_ze_list); + kmem_free(ev, sizeof (zevent_t)); +} + +static void +zfs_zevent_drain(zevent_t *ev) +{ + zfs_zevent_t *ze; + + ASSERT(MUTEX_HELD(&zevent_lock)); + list_remove(&zevent_list, ev); + + /* Remove references to this event in all private file data */ + while ((ze = list_head(&ev->ev_ze_list)) != NULL) { + list_remove(&ev->ev_ze_list, ze); + ze->ze_zevent = NULL; + ze->ze_dropped++; + } + + zfs_zevent_free(ev); +} + +void +zfs_zevent_drain_all(int *count) +{ + zevent_t *ev; + + mutex_enter(&zevent_lock); + while ((ev = list_head(&zevent_list)) != NULL) + zfs_zevent_drain(ev); + + *count = zevent_len_cur; + zevent_len_cur = 0; + mutex_exit(&zevent_lock); +} + +/* + * New zevents are inserted at the head. If the maximum queue + * length is exceeded a zevent will be drained from the tail. + * As part of this any user space processes which currently have + * a reference to this zevent_t in their private data will have + * this reference set to NULL. + */ +static void +zfs_zevent_insert(zevent_t *ev) +{ + ASSERT(MUTEX_HELD(&zevent_lock)); + list_insert_head(&zevent_list, ev); + + if (zevent_len_cur >= zfs_zevent_len_max) + zfs_zevent_drain(list_tail(&zevent_list)); + else + zevent_len_cur++; +} + +/* + * Post a zevent. The cb will be called when nvl and detector are no longer + * needed, i.e.: + * - An error happened and a zevent can't be posted. In this case, cb is called + * before zfs_zevent_post() returns. + * - The event is being drained and freed. 
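+ * + * Illustrative callback sketch (editor's addition; a typical caller frees + * both nvlists with fm_nvlist_destroy()): + * + * static void + * my_zevent_cb(nvlist_t *nvl, nvlist_t *detector) + * { + * if (nvl != NULL) + * fm_nvlist_destroy(nvl, FM_NVA_FREE); + * if (detector != NULL) + * fm_nvlist_destroy(detector, FM_NVA_FREE); + * } + * + * (void) zfs_zevent_post(nvl, detector, my_zevent_cb);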
+ */ +int +zfs_zevent_post(nvlist_t *nvl, nvlist_t *detector, zevent_cb_t *cb) +{ + inode_timespec_t tv; + int64_t tv_array[2]; + uint64_t eid; + size_t nvl_size = 0; + zevent_t *ev; + int error; + + ASSERT(cb != NULL); + + gethrestime(&tv); + tv_array[0] = tv.tv_sec; + tv_array[1] = tv.tv_nsec; + + error = nvlist_add_int64_array(nvl, FM_EREPORT_TIME, tv_array, 2); + if (error) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + goto out; + } + + eid = atomic_inc_64_nv(&zevent_eid); + error = nvlist_add_uint64(nvl, FM_EREPORT_EID, eid); + if (error) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + goto out; + } + + error = nvlist_size(nvl, &nvl_size, NV_ENCODE_NATIVE); + if (error) { + atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); + goto out; + } + + if (nvl_size > ERPT_DATA_SZ || nvl_size == 0) { + atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); + error = EOVERFLOW; + goto out; + } + + if (zfs_zevent_console) + fm_nvprint(nvl); + + ev = zfs_zevent_alloc(); + if (ev == NULL) { + atomic_inc_64(&erpt_kstat_data.erpt_dropped.value.ui64); + error = ENOMEM; + goto out; + } + + ev->ev_nvl = nvl; + ev->ev_detector = detector; + ev->ev_cb = cb; + ev->ev_eid = eid; + + mutex_enter(&zevent_lock); + zfs_zevent_insert(ev); + cv_broadcast(&zevent_cv); + mutex_exit(&zevent_lock); + +out: + if (error) + cb(nvl, detector); + + return (error); +} + +static int +zfs_zevent_minor_to_state(minor_t minor, zfs_zevent_t **ze) +{ + *ze = zfsdev_get_state(minor, ZST_ZEVENT); + if (*ze == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +int +zfs_zevent_fd_hold(int fd, minor_t *minorp, zfs_zevent_t **ze) +{ + int error; + + error = zfsdev_getminor(fd, minorp); + if (error == 0) + error = zfs_zevent_minor_to_state(*minorp, ze); + + if (error) + zfs_zevent_fd_rele(fd); + + return (error); +} + +void +zfs_zevent_fd_rele(int fd) +{ + zfs_file_put(fd); +} + +/* + * Get the next zevent in the stream and place a copy in 'event'. This + * may fail with ENOMEM if the encoded nvlist size exceeds the passed + * 'event_size'. In this case the stream pointer is not advanced + * and 'event_size' is set to the minimum required buffer size. + */ +int +zfs_zevent_next(zfs_zevent_t *ze, nvlist_t **event, uint64_t *event_size, + uint64_t *dropped) +{ + zevent_t *ev; + size_t size; + int error = 0; + + mutex_enter(&zevent_lock); + if (ze->ze_zevent == NULL) { + /* New streams start at the beginning/tail */ + ev = list_tail(&zevent_list); + if (ev == NULL) { + error = ENOENT; + goto out; + } + } else { + /* + * Existing streams continue with the next element and remove + * ourselves from the wait queue for the previous element + */ + ev = list_prev(&zevent_list, ze->ze_zevent); + if (ev == NULL) { + error = ENOENT; + goto out; + } + } + + VERIFY(nvlist_size(ev->ev_nvl, &size, NV_ENCODE_NATIVE) == 0); + if (size > *event_size) { + *event_size = size; + error = ENOMEM; + goto out; + } + + if (ze->ze_zevent) + list_remove(&ze->ze_zevent->ev_ze_list, ze); + + ze->ze_zevent = ev; + list_insert_head(&ev->ev_ze_list, ze); + (void) nvlist_dup(ev->ev_nvl, event, KM_SLEEP); + *dropped = ze->ze_dropped; + +#ifdef _KERNEL + /* Include events dropped due to rate limiting */ + *dropped += ratelimit_dropped; + ratelimit_dropped = 0; +#endif + ze->ze_dropped = 0; +out: + mutex_exit(&zevent_lock); + + return (error); +} + +/* + * Wait in an interruptible state for any new events.
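+ * Returns 0 once a new event is available, EINTR if interrupted by a signal, + * or ESHUTDOWN if the module is being unloaded (editor's summary, derived + * from the implementation below).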
+ */ +int +zfs_zevent_wait(zfs_zevent_t *ze) +{ + int error = EAGAIN; + + mutex_enter(&zevent_lock); + zevent_waiters++; + + while (error == EAGAIN) { + if (zevent_flags & ZEVENT_SHUTDOWN) { + error = SET_ERROR(ESHUTDOWN); + break; + } + + error = cv_wait_sig(&zevent_cv, &zevent_lock); + if (signal_pending(current)) { + error = SET_ERROR(EINTR); + break; + } else if (!list_is_empty(&zevent_list)) { + error = 0; + continue; + } else { + error = EAGAIN; + } + } + + zevent_waiters--; + mutex_exit(&zevent_lock); + + return (error); +} + +/* + * The caller may seek to a specific EID by passing that EID. If the EID + * is still available in the posted list of events the cursor is positioned + * there. Otherwise ENOENT is returned and the cursor is not moved. + * + * There are two reserved EIDs which may be passed and will never fail. + * ZEVENT_SEEK_START positions the cursor at the start of the list, and + * ZEVENT_SEEK_END positions the cursor at the end of the list. + */ +int +zfs_zevent_seek(zfs_zevent_t *ze, uint64_t eid) +{ + zevent_t *ev; + int error = 0; + + mutex_enter(&zevent_lock); + + if (eid == ZEVENT_SEEK_START) { + if (ze->ze_zevent) + list_remove(&ze->ze_zevent->ev_ze_list, ze); + + ze->ze_zevent = NULL; + goto out; + } + + if (eid == ZEVENT_SEEK_END) { + if (ze->ze_zevent) + list_remove(&ze->ze_zevent->ev_ze_list, ze); + + ev = list_head(&zevent_list); + if (ev) { + ze->ze_zevent = ev; + list_insert_head(&ev->ev_ze_list, ze); + } else { + ze->ze_zevent = NULL; + } + + goto out; + } + + for (ev = list_tail(&zevent_list); ev != NULL; + ev = list_prev(&zevent_list, ev)) { + if (ev->ev_eid == eid) { + if (ze->ze_zevent) + list_remove(&ze->ze_zevent->ev_ze_list, ze); + + ze->ze_zevent = ev; + list_insert_head(&ev->ev_ze_list, ze); + break; + } + } + + if (ev == NULL) + error = ENOENT; + +out: + mutex_exit(&zevent_lock); + + return (error); +} + +void +zfs_zevent_init(zfs_zevent_t **zep) +{ + zfs_zevent_t *ze; + + ze = *zep = kmem_zalloc(sizeof (zfs_zevent_t), KM_SLEEP); + list_link_init(&ze->ze_node); +} + +void +zfs_zevent_destroy(zfs_zevent_t *ze) +{ + mutex_enter(&zevent_lock); + if (ze->ze_zevent) + list_remove(&ze->ze_zevent->ev_ze_list, ze); + mutex_exit(&zevent_lock); + + kmem_free(ze, sizeof (zfs_zevent_t)); +} +#endif /* _KERNEL */ + +/* + * Wrappers for FM nvlist allocators + */ +/* ARGSUSED */ +static void * +i_fm_alloc(nv_alloc_t *nva, size_t size) +{ + return (kmem_zalloc(size, KM_SLEEP)); +} + +/* ARGSUSED */ +static void +i_fm_free(nv_alloc_t *nva, void *buf, size_t size) +{ + kmem_free(buf, size); +} + +const nv_alloc_ops_t fm_mem_alloc_ops = { + .nv_ao_init = NULL, + .nv_ao_fini = NULL, + .nv_ao_alloc = i_fm_alloc, + .nv_ao_free = i_fm_free, + .nv_ao_reset = NULL +}; + +/* + * Create and initialize a new nv_alloc_t for a fixed buffer, buf. A pointer + * to the newly allocated nv_alloc_t structure is returned upon success or NULL + * is returned to indicate that the nv_alloc structure could not be created. + */ +nv_alloc_t * +fm_nva_xcreate(char *buf, size_t bufsz) +{ + nv_alloc_t *nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); + + if (bufsz == 0 || nv_alloc_init(nvhdl, nv_fixed_ops, buf, bufsz) != 0) { + kmem_free(nvhdl, sizeof (nv_alloc_t)); + return (NULL); + } + + return (nvhdl); +} + +/* + * Destroy a previously allocated nv_alloc structure. The fixed buffer + * associated with nva must be freed by the caller. + */ +void +fm_nva_xdestroy(nv_alloc_t *nva) +{ + nv_alloc_fini(nva); + kmem_free(nva, sizeof (nv_alloc_t)); +} + +/* + * Create a new nv list. 
A pointer to a new nv list structure is returned + * upon success or NULL is returned to indicate that the structure could + * not be created. The newly created nv list is managed by the + * operations installed in nva. If nva is NULL, the default FMA nva + * operations are installed and used. + * + * When called from the kernel and nva == NULL, this function must be called + * from passive kernel context with no locks held that can prevent a + * sleeping memory allocation from occurring. Otherwise, this function may + * be called from other kernel contexts as long as a valid nva created via + * fm_nva_xcreate() is supplied. + */ +nvlist_t * +fm_nvlist_create(nv_alloc_t *nva) +{ + int hdl_alloced = 0; + nvlist_t *nvl; + nv_alloc_t *nvhdl; + + if (nva == NULL) { + nvhdl = kmem_zalloc(sizeof (nv_alloc_t), KM_SLEEP); + + if (nv_alloc_init(nvhdl, &fm_mem_alloc_ops, NULL, 0) != 0) { + kmem_free(nvhdl, sizeof (nv_alloc_t)); + return (NULL); + } + hdl_alloced = 1; + } else { + nvhdl = nva; + } + + if (nvlist_xalloc(&nvl, NV_UNIQUE_NAME, nvhdl) != 0) { + if (hdl_alloced) { + nv_alloc_fini(nvhdl); + kmem_free(nvhdl, sizeof (nv_alloc_t)); + } + return (NULL); + } + + return (nvl); +} + +/* + * Destroy a previously allocated nvlist structure. flag indicates whether + * or not the associated nva structure should be freed (FM_NVA_FREE) or + * retained (FM_NVA_RETAIN). Retaining the nv alloc structure allows + * it to be re-used for future nvlist creation operations. + */ +void +fm_nvlist_destroy(nvlist_t *nvl, int flag) +{ + nv_alloc_t *nva = nvlist_lookup_nv_alloc(nvl); + + nvlist_free(nvl); + + if (nva != NULL) { + if (flag == FM_NVA_FREE) + fm_nva_xdestroy(nva); + } +} + +int +i_fm_payload_set(nvlist_t *payload, const char *name, va_list ap) +{ + int nelem, ret = 0; + data_type_t type; + + while (ret == 0 && name != NULL) { + type = va_arg(ap, data_type_t); + switch (type) { + case DATA_TYPE_BYTE: + ret = nvlist_add_byte(payload, name, + va_arg(ap, uint_t)); + break; + case DATA_TYPE_BYTE_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_byte_array(payload, name, + va_arg(ap, uchar_t *), nelem); + break; + case DATA_TYPE_BOOLEAN_VALUE: + ret = nvlist_add_boolean_value(payload, name, + va_arg(ap, boolean_t)); + break; + case DATA_TYPE_BOOLEAN_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_boolean_array(payload, name, + va_arg(ap, boolean_t *), nelem); + break; + case DATA_TYPE_INT8: + ret = nvlist_add_int8(payload, name, + va_arg(ap, int)); + break; + case DATA_TYPE_INT8_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int8_array(payload, name, + va_arg(ap, int8_t *), nelem); + break; + case DATA_TYPE_UINT8: + ret = nvlist_add_uint8(payload, name, + va_arg(ap, uint_t)); + break; + case DATA_TYPE_UINT8_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint8_array(payload, name, + va_arg(ap, uint8_t *), nelem); + break; + case DATA_TYPE_INT16: + ret = nvlist_add_int16(payload, name, + va_arg(ap, int)); + break; + case DATA_TYPE_INT16_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int16_array(payload, name, + va_arg(ap, int16_t *), nelem); + break; + case DATA_TYPE_UINT16: + ret = nvlist_add_uint16(payload, name, + va_arg(ap, uint_t)); + break; + case DATA_TYPE_UINT16_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint16_array(payload, name, + va_arg(ap, uint16_t *), nelem); + break; + case DATA_TYPE_INT32: + ret = nvlist_add_int32(payload, name, + va_arg(ap, int32_t)); + break; + case DATA_TYPE_INT32_ARRAY: + nelem = va_arg(ap, int); + ret =
nvlist_add_int32_array(payload, name, + va_arg(ap, int32_t *), nelem); + break; + case DATA_TYPE_UINT32: + ret = nvlist_add_uint32(payload, name, + va_arg(ap, uint32_t)); + break; + case DATA_TYPE_UINT32_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint32_array(payload, name, + va_arg(ap, uint32_t *), nelem); + break; + case DATA_TYPE_INT64: + ret = nvlist_add_int64(payload, name, + va_arg(ap, int64_t)); + break; + case DATA_TYPE_INT64_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_int64_array(payload, name, + va_arg(ap, int64_t *), nelem); + break; + case DATA_TYPE_UINT64: + ret = nvlist_add_uint64(payload, name, + va_arg(ap, uint64_t)); + break; + case DATA_TYPE_UINT64_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_uint64_array(payload, name, + va_arg(ap, uint64_t *), nelem); + break; + case DATA_TYPE_STRING: + ret = nvlist_add_string(payload, name, + va_arg(ap, char *)); + break; + case DATA_TYPE_STRING_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_string_array(payload, name, + va_arg(ap, char **), nelem); + break; + case DATA_TYPE_NVLIST: + ret = nvlist_add_nvlist(payload, name, + va_arg(ap, nvlist_t *)); + break; + case DATA_TYPE_NVLIST_ARRAY: + nelem = va_arg(ap, int); + ret = nvlist_add_nvlist_array(payload, name, + va_arg(ap, nvlist_t **), nelem); + break; + default: + ret = EINVAL; + } + + name = va_arg(ap, char *); + } + return (ret); +} + +void +fm_payload_set(nvlist_t *payload, ...) +{ + int ret; + const char *name; + va_list ap; + + va_start(ap, payload); + name = va_arg(ap, char *); + ret = i_fm_payload_set(payload, name, ap); + va_end(ap); + + if (ret) + atomic_inc_64(&erpt_kstat_data.payload_set_failed.value.ui64); +} + +/* + * Set-up and validate the members of an ereport event according to: + * + * Member name Type Value + * ==================================================== + * class string ereport + * version uint8_t 0 + * ena uint64_t + * detector nvlist_t + * ereport-payload nvlist_t + * + * We don't actually add a 'version' member to the payload. Really, + * the version quoted to us by our caller is that of the category 1 + * "ereport" event class (and we require FM_EREPORT_VERS0) but + * the payload version of the actual leaf class event under construction + * may be something else. Callers should supply a version in the varargs, + * or (better) we could take two version arguments - one for the + * ereport category 1 classification (expect FM_EREPORT_VERS0) and one + * for the leaf class. + */ +void +fm_ereport_set(nvlist_t *ereport, int version, const char *erpt_class, + uint64_t ena, const nvlist_t *detector, ...) 
+{ + char ereport_class[FM_MAX_CLASS]; + const char *name; + va_list ap; + int ret; + + if (version != FM_EREPORT_VERS0) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + return; + } + + (void) snprintf(ereport_class, FM_MAX_CLASS, "%s.%s", + FM_EREPORT_CLASS, erpt_class); + if (nvlist_add_string(ereport, FM_CLASS, ereport_class) != 0) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint64(ereport, FM_EREPORT_ENA, ena)) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + } + + if (nvlist_add_nvlist(ereport, FM_EREPORT_DETECTOR, + (nvlist_t *)detector) != 0) { + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); + } + + va_start(ap, detector); + name = va_arg(ap, const char *); + ret = i_fm_payload_set(ereport, name, ap); + va_end(ap); + + if (ret) + atomic_inc_64(&erpt_kstat_data.erpt_set_failed.value.ui64); +} + +/* + * Set-up and validate the members of an hc fmri according to; + * + * Member name Type Value + * =================================================== + * version uint8_t 0 + * auth nvlist_t + * hc-name string + * hc-id string + * + * Note that auth and hc-id are optional members. + */ + +#define HC_MAXPAIRS 20 +#define HC_MAXNAMELEN 50 + +static int +fm_fmri_hc_set_common(nvlist_t *fmri, int version, const nvlist_t *auth) +{ + if (version != FM_HC_SCHEME_VERSION) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return (0); + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0 || + nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_HC) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return (0); + } + + if (auth != NULL && nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, + (nvlist_t *)auth) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return (0); + } + + return (1); +} + +void +fm_fmri_hc_set(nvlist_t *fmri, int version, const nvlist_t *auth, + nvlist_t *snvl, int npairs, ...) +{ + nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); + nvlist_t *pairs[HC_MAXPAIRS]; + va_list ap; + int i; + + if (!fm_fmri_hc_set_common(fmri, version, auth)) + return; + + npairs = MIN(npairs, HC_MAXPAIRS); + + va_start(ap, npairs); + for (i = 0; i < npairs; i++) { + const char *name = va_arg(ap, const char *); + uint32_t id = va_arg(ap, uint32_t); + char idstr[11]; + + (void) snprintf(idstr, sizeof (idstr), "%u", id); + + pairs[i] = fm_nvlist_create(nva); + if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || + nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } + va_end(ap); + + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, npairs) != 0) + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + + for (i = 0; i < npairs; i++) + fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); + + if (snvl != NULL) { + if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } +} + +void +fm_fmri_hc_create(nvlist_t *fmri, int version, const nvlist_t *auth, + nvlist_t *snvl, nvlist_t *bboard, int npairs, ...) 
+{ + nv_alloc_t *nva = nvlist_lookup_nv_alloc(fmri); + nvlist_t *pairs[HC_MAXPAIRS]; + nvlist_t **hcl; + uint_t n; + int i, j; + va_list ap; + char *hcname, *hcid; + + if (!fm_fmri_hc_set_common(fmri, version, auth)) + return; + + /* + * copy the bboard nvpairs to the pairs array + */ + if (nvlist_lookup_nvlist_array(bboard, FM_FMRI_HC_LIST, &hcl, &n) + != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + for (i = 0; i < n; i++) { + if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_NAME, + &hcname) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + if (nvlist_lookup_string(hcl[i], FM_FMRI_HC_ID, &hcid) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + pairs[i] = fm_nvlist_create(nva); + if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, hcname) != 0 || + nvlist_add_string(pairs[i], FM_FMRI_HC_ID, hcid) != 0) { + for (j = 0; j <= i; j++) { + if (pairs[j] != NULL) + fm_nvlist_destroy(pairs[j], + FM_NVA_RETAIN); + } + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + } + + /* + * create the pairs from passed in pairs + */ + npairs = MIN(npairs, HC_MAXPAIRS); + + va_start(ap, npairs); + for (i = n; i < npairs + n; i++) { + const char *name = va_arg(ap, const char *); + uint32_t id = va_arg(ap, uint32_t); + char idstr[11]; + (void) snprintf(idstr, sizeof (idstr), "%u", id); + pairs[i] = fm_nvlist_create(nva); + if (nvlist_add_string(pairs[i], FM_FMRI_HC_NAME, name) != 0 || + nvlist_add_string(pairs[i], FM_FMRI_HC_ID, idstr) != 0) { + for (j = 0; j <= i; j++) { + if (pairs[j] != NULL) + fm_nvlist_destroy(pairs[j], + FM_NVA_RETAIN); + } + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + } + va_end(ap); + + /* + * Create the fmri hc list + */ + if (nvlist_add_nvlist_array(fmri, FM_FMRI_HC_LIST, pairs, + npairs + n) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + for (i = 0; i < npairs + n; i++) { + fm_nvlist_destroy(pairs[i], FM_NVA_RETAIN); + } + + if (snvl != NULL) { + if (nvlist_add_nvlist(fmri, FM_FMRI_HC_SPECIFIC, snvl) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + } +} + +/* + * Set-up and validate the members of an dev fmri according to: + * + * Member name Type Value + * ==================================================== + * version uint8_t 0 + * auth nvlist_t + * devpath string + * [devid] string + * [target-port-l0id] string + * + * Note that auth and devid are optional members. 
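+ *
+ * For example (a sketch; the device path and devid values are invented):
+ *
+ *	fm_fmri_dev_set(fmri, DEV_SCHEME_VERSION0, NULL,
+ *	    "/pci@0,0/pci8086,2922@1f,2/disk@1,0",
+ *	    "id1,sd@SATA_EXAMPLE_SERIAL", NULL);
+ *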
+ */ +void +fm_fmri_dev_set(nvlist_t *fmri_dev, int version, const nvlist_t *auth, + const char *devpath, const char *devid, const char *tpl0) +{ + int err = 0; + + if (version != DEV_SCHEME_VERSION0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + err |= nvlist_add_uint8(fmri_dev, FM_VERSION, version); + err |= nvlist_add_string(fmri_dev, FM_FMRI_SCHEME, FM_FMRI_SCHEME_DEV); + + if (auth != NULL) { + err |= nvlist_add_nvlist(fmri_dev, FM_FMRI_AUTHORITY, + (nvlist_t *)auth); + } + + err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_PATH, devpath); + + if (devid != NULL) + err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_ID, devid); + + if (tpl0 != NULL) + err |= nvlist_add_string(fmri_dev, FM_FMRI_DEV_TGTPTLUN0, tpl0); + + if (err) + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + +} + +/* + * Set-up and validate the members of an cpu fmri according to: + * + * Member name Type Value + * ==================================================== + * version uint8_t 0 + * auth nvlist_t + * cpuid uint32_t + * cpumask uint8_t + * serial uint64_t + * + * Note that auth, cpumask, serial are optional members. + * + */ +void +fm_fmri_cpu_set(nvlist_t *fmri_cpu, int version, const nvlist_t *auth, + uint32_t cpu_id, uint8_t *cpu_maskp, const char *serial_idp) +{ + uint64_t *failedp = &erpt_kstat_data.fmri_set_failed.value.ui64; + + if (version < CPU_SCHEME_VERSION1) { + atomic_inc_64(failedp); + return; + } + + if (nvlist_add_uint8(fmri_cpu, FM_VERSION, version) != 0) { + atomic_inc_64(failedp); + return; + } + + if (nvlist_add_string(fmri_cpu, FM_FMRI_SCHEME, + FM_FMRI_SCHEME_CPU) != 0) { + atomic_inc_64(failedp); + return; + } + + if (auth != NULL && nvlist_add_nvlist(fmri_cpu, FM_FMRI_AUTHORITY, + (nvlist_t *)auth) != 0) + atomic_inc_64(failedp); + + if (nvlist_add_uint32(fmri_cpu, FM_FMRI_CPU_ID, cpu_id) != 0) + atomic_inc_64(failedp); + + if (cpu_maskp != NULL && nvlist_add_uint8(fmri_cpu, FM_FMRI_CPU_MASK, + *cpu_maskp) != 0) + atomic_inc_64(failedp); + + if (serial_idp == NULL || nvlist_add_string(fmri_cpu, + FM_FMRI_CPU_SERIAL_ID, (char *)serial_idp) != 0) + atomic_inc_64(failedp); +} + +/* + * Set-up and validate the members of a mem according to: + * + * Member name Type Value + * ==================================================== + * version uint8_t 0 + * auth nvlist_t [optional] + * unum string + * serial string [optional*] + * offset uint64_t [optional] + * + * * serial is required if offset is present + */ +void +fm_fmri_mem_set(nvlist_t *fmri, int version, const nvlist_t *auth, + const char *unum, const char *serial, uint64_t offset) +{ + if (version != MEM_SCHEME_VERSION0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (!serial && (offset != (uint64_t)-1)) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_MEM) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (auth != NULL) { + if (nvlist_add_nvlist(fmri, FM_FMRI_AUTHORITY, + (nvlist_t *)auth) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } + + if (nvlist_add_string(fmri, FM_FMRI_MEM_UNUM, unum) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + } + + if (serial != NULL) { + if (nvlist_add_string_array(fmri, FM_FMRI_MEM_SERIAL_ID, + (char 
**)&serial, 1) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + if (offset != (uint64_t)-1 && nvlist_add_uint64(fmri, + FM_FMRI_MEM_OFFSET, offset) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } +} + +void +fm_fmri_zfs_set(nvlist_t *fmri, int version, uint64_t pool_guid, + uint64_t vdev_guid) +{ + if (version != ZFS_SCHEME_VERSION0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint8(fmri, FM_VERSION, version) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_string(fmri, FM_FMRI_SCHEME, FM_FMRI_SCHEME_ZFS) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + return; + } + + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_POOL, pool_guid) != 0) { + atomic_inc_64(&erpt_kstat_data.fmri_set_failed.value.ui64); + } + + if (vdev_guid != 0) { + if (nvlist_add_uint64(fmri, FM_FMRI_ZFS_VDEV, vdev_guid) != 0) { + atomic_inc_64( + &erpt_kstat_data.fmri_set_failed.value.ui64); + } + } +} + +uint64_t +fm_ena_increment(uint64_t ena) +{ + uint64_t new_ena; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + new_ena = ena + (1 << ENA_FMT1_GEN_SHFT); + break; + case FM_ENA_FMT2: + new_ena = ena + (1 << ENA_FMT2_GEN_SHFT); + break; + default: + new_ena = 0; + } + + return (new_ena); +} + +uint64_t +fm_ena_generate_cpu(uint64_t timestamp, processorid_t cpuid, uchar_t format) +{ + uint64_t ena = 0; + + switch (format) { + case FM_ENA_FMT1: + if (timestamp) { + ena = (uint64_t)((format & ENA_FORMAT_MASK) | + ((cpuid << ENA_FMT1_CPUID_SHFT) & + ENA_FMT1_CPUID_MASK) | + ((timestamp << ENA_FMT1_TIME_SHFT) & + ENA_FMT1_TIME_MASK)); + } else { + ena = (uint64_t)((format & ENA_FORMAT_MASK) | + ((cpuid << ENA_FMT1_CPUID_SHFT) & + ENA_FMT1_CPUID_MASK) | + ((gethrtime() << ENA_FMT1_TIME_SHFT) & + ENA_FMT1_TIME_MASK)); + } + break; + case FM_ENA_FMT2: + ena = (uint64_t)((format & ENA_FORMAT_MASK) | + ((timestamp << ENA_FMT2_TIME_SHFT) & ENA_FMT2_TIME_MASK)); + break; + default: + break; + } + + return (ena); +} + +uint64_t +fm_ena_generate(uint64_t timestamp, uchar_t format) +{ + uint64_t ena; + + kpreempt_disable(); + ena = fm_ena_generate_cpu(timestamp, getcpuid(), format); + kpreempt_enable(); + + return (ena); +} + +uint64_t +fm_ena_generation_get(uint64_t ena) +{ + uint64_t gen; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + gen = (ena & ENA_FMT1_GEN_MASK) >> ENA_FMT1_GEN_SHFT; + break; + case FM_ENA_FMT2: + gen = (ena & ENA_FMT2_GEN_MASK) >> ENA_FMT2_GEN_SHFT; + break; + default: + gen = 0; + break; + } + + return (gen); +} + +uchar_t +fm_ena_format_get(uint64_t ena) +{ + + return (ENA_FORMAT(ena)); +} + +uint64_t +fm_ena_id_get(uint64_t ena) +{ + uint64_t id; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + id = (ena & ENA_FMT1_ID_MASK) >> ENA_FMT1_ID_SHFT; + break; + case FM_ENA_FMT2: + id = (ena & ENA_FMT2_ID_MASK) >> ENA_FMT2_ID_SHFT; + break; + default: + id = 0; + } + + return (id); +} + +uint64_t +fm_ena_time_get(uint64_t ena) +{ + uint64_t time; + + switch (ENA_FORMAT(ena)) { + case FM_ENA_FMT1: + time = (ena & ENA_FMT1_TIME_MASK) >> ENA_FMT1_TIME_SHFT; + break; + case FM_ENA_FMT2: + time = (ena & ENA_FMT2_TIME_MASK) >> ENA_FMT2_TIME_SHFT; + break; + default: + time = 0; + } + + return (time); +} + +#ifdef _KERNEL +/* + * Helper function to increment ereport dropped count. 
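+ *
+ * A hypothetical caller-side sketch (the real rate-limit check lives in
+ * zfs_ereport.c, not here) of accounting one rate-limited event:
+ *
+ *	if (zfs_ratelimit(&vd->vdev_delay_rl) == 0) {
+ *		fm_erpt_dropped_increment();
+ *		return;		(dropped; surfaced via the fm kstat)
+ *	}
+ *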
Used by the event + * rate limiting code to give feedback to the user about how many events were + * rate limited by including them in the 'dropped' count. + */ +void +fm_erpt_dropped_increment(void) +{ + atomic_inc_64(&ratelimit_dropped); +} + +void +fm_init(void) +{ + zevent_len_cur = 0; + zevent_flags = 0; + + if (zfs_zevent_len_max == 0) + zfs_zevent_len_max = ERPT_MAX_ERRS * MAX(max_ncpus, 4); + + /* Initialize zevent allocation and generation kstats */ + fm_ksp = kstat_create("zfs", 0, "fm", "misc", KSTAT_TYPE_NAMED, + sizeof (struct erpt_kstat) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (fm_ksp != NULL) { + fm_ksp->ks_data = &erpt_kstat_data; + kstat_install(fm_ksp); + } else { + cmn_err(CE_NOTE, "failed to create fm/misc kstat\n"); + } + + mutex_init(&zevent_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zevent_list, sizeof (zevent_t), + offsetof(zevent_t, ev_node)); + cv_init(&zevent_cv, NULL, CV_DEFAULT, NULL); +} + +void +fm_fini(void) +{ + int count; + + zfs_zevent_drain_all(&count); + + mutex_enter(&zevent_lock); + cv_broadcast(&zevent_cv); + + zevent_flags |= ZEVENT_SHUTDOWN; + while (zevent_waiters > 0) { + mutex_exit(&zevent_lock); + schedule(); + mutex_enter(&zevent_lock); + } + mutex_exit(&zevent_lock); + + cv_destroy(&zevent_cv); + list_destroy(&zevent_list); + mutex_destroy(&zevent_lock); + + if (fm_ksp != NULL) { + kstat_delete(fm_ksp); + fm_ksp = NULL; + } +} +#endif /* _KERNEL */ + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, len_max, INT, ZMOD_RW, + "Max event queue length"); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, cols, INT, ZMOD_RW, + "Max event column width"); + +ZFS_MODULE_PARAM(zfs_zevent, zfs_zevent_, console, INT, ZMOD_RW, + "Log events to the console"); diff --git a/module/zfs/gzip.c b/module/zfs/gzip.c new file mode 100644 index 000000000000..e2c6e59969d6 --- /dev/null +++ b/module/zfs/gzip.c @@ -0,0 +1,106 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+
+
+#include <sys/debug.h>
+#include <sys/types.h>
+#include <sys/strings.h>
+#include <sys/qat.h>
+#include <sys/zio_compress.h>
+
+#ifdef _KERNEL
+
+#include <sys/zmod.h>
+typedef size_t zlen_t;
+#define	compress_func	z_compress_level
+#define	uncompress_func	z_uncompress
+
+#else /* _KERNEL */
+
+#include <zlib.h>
+typedef uLongf zlen_t;
+#define	compress_func	compress2
+#define	uncompress_func	uncompress
+
+#endif
+
+size_t
+gzip_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	int ret;
+	zlen_t dstlen = d_len;
+
+	ASSERT(d_len <= s_len);
+
+	/* check if hardware accelerator can be used */
+	if (qat_dc_use_accel(s_len)) {
+		ret = qat_compress(QAT_COMPRESS, s_start, s_len, d_start,
+		    d_len, &dstlen);
+		if (ret == CPA_STATUS_SUCCESS) {
+			return ((size_t)dstlen);
+		} else if (ret == CPA_STATUS_INCOMPRESSIBLE) {
+			if (d_len != s_len)
+				return (s_len);
+
+			bcopy(s_start, d_start, s_len);
+			return (s_len);
+		}
+		/* if hardware compression fails, do it again with software */
+	}
+
+	if (compress_func(d_start, &dstlen, s_start, s_len, n) != Z_OK) {
+		if (d_len != s_len)
+			return (s_len);
+
+		bcopy(s_start, d_start, s_len);
+		return (s_len);
+	}
+
+	return ((size_t)dstlen);
+}
+
+/*ARGSUSED*/
+int
+gzip_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	zlen_t dstlen = d_len;
+
+	ASSERT(d_len >= s_len);
+
+	/* check if hardware accelerator can be used */
+	if (qat_dc_use_accel(d_len)) {
+		if (qat_compress(QAT_DECOMPRESS, s_start, s_len,
+		    d_start, d_len, &dstlen) == CPA_STATUS_SUCCESS)
+			return (0);
+		/* if hardware de-compress fail, do it again with software */
+	}
+
+	if (uncompress_func(d_start, &dstlen, s_start, s_len) != Z_OK)
+		return (-1);
+
+	return (0);
+}
diff --git a/module/zfs/hkdf.c b/module/zfs/hkdf.c
new file mode 100644
index 000000000000..14265472df7d
--- /dev/null
+++ b/module/zfs/hkdf.c
@@ -0,0 +1,171 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source.  A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2017, Datto, Inc. All rights reserved.
+ */
+
+#include <sys/crypto/api.h>
+#include <sys/sha2.h>
+#include <sys/hkdf.h>
+
+static int
+hkdf_sha512_extract(uint8_t *salt, uint_t salt_len, uint8_t *key_material,
+    uint_t km_len, uint8_t *out_buf)
+{
+	int ret;
+	crypto_mechanism_t mech;
+	crypto_key_t key;
+	crypto_data_t input_cd, output_cd;
+
+	/* initialize HMAC mechanism */
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	mech.cm_param = NULL;
+	mech.cm_param_len = 0;
+
+	/* initialize the salt as a crypto key */
+	key.ck_format = CRYPTO_KEY_RAW;
+	key.ck_length = CRYPTO_BYTES2BITS(salt_len);
+	key.ck_data = salt;
+
+	/* initialize crypto data for the input and output data */
+	input_cd.cd_format = CRYPTO_DATA_RAW;
+	input_cd.cd_offset = 0;
+	input_cd.cd_length = km_len;
+	input_cd.cd_raw.iov_base = (char *)key_material;
+	input_cd.cd_raw.iov_len = input_cd.cd_length;
+
+	output_cd.cd_format = CRYPTO_DATA_RAW;
+	output_cd.cd_offset = 0;
+	output_cd.cd_length = SHA512_DIGEST_LENGTH;
+	output_cd.cd_raw.iov_base = (char *)out_buf;
+	output_cd.cd_raw.iov_len = output_cd.cd_length;
+
+	ret = crypto_mac(&mech, &input_cd, &key, NULL, &output_cd, NULL);
+	if (ret != CRYPTO_SUCCESS)
+		return (SET_ERROR(EIO));
+
+	return (0);
+}
+
+static int
+hkdf_sha512_expand(uint8_t *extract_key, uint8_t *info, uint_t info_len,
+    uint8_t *out_buf, uint_t out_len)
+{
+	int ret;
+	crypto_mechanism_t mech;
+	crypto_context_t ctx;
+	crypto_key_t key;
+	crypto_data_t T_cd, info_cd, c_cd;
+	uint_t i, T_len = 0, pos = 0;
+	uint8_t c;
+	uint_t N = (out_len + SHA512_DIGEST_LENGTH) / SHA512_DIGEST_LENGTH;
+	uint8_t T[SHA512_DIGEST_LENGTH];
+
+	if (N > 255)
+		return (SET_ERROR(EINVAL));
+
+	/* initialize HMAC mechanism */
+	mech.cm_type = crypto_mech2id(SUN_CKM_SHA512_HMAC);
+	mech.cm_param = NULL;
+	mech.cm_param_len = 0;
+
+	/* initialize the salt as a crypto key */
+	key.ck_format = CRYPTO_KEY_RAW;
+	key.ck_length = CRYPTO_BYTES2BITS(SHA512_DIGEST_LENGTH);
+	key.ck_data = extract_key;
+
+	/* initialize crypto data for the input and output data */
+	T_cd.cd_format = CRYPTO_DATA_RAW;
+	T_cd.cd_offset = 0;
+	T_cd.cd_raw.iov_base = (char *)T;
+
+	c_cd.cd_format = CRYPTO_DATA_RAW;
+	c_cd.cd_offset = 0;
+	c_cd.cd_length = 1;
+	c_cd.cd_raw.iov_base = (char *)&c;
+	c_cd.cd_raw.iov_len = c_cd.cd_length;
+
+	info_cd.cd_format = CRYPTO_DATA_RAW;
+	info_cd.cd_offset = 0;
+	info_cd.cd_length = info_len;
+	info_cd.cd_raw.iov_base = (char *)info;
+	info_cd.cd_raw.iov_len = info_cd.cd_length;
+
+	for (i = 1; i <= N; i++) {
+		c = i;
+
+		T_cd.cd_length = T_len;
+		T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+		ret = crypto_mac_init(&mech, &key, NULL, &ctx, NULL);
+		if (ret != CRYPTO_SUCCESS)
+			return (SET_ERROR(EIO));
+
+		ret = crypto_mac_update(ctx, &T_cd, NULL);
+		if (ret != CRYPTO_SUCCESS)
+			return (SET_ERROR(EIO));
+
+		ret = crypto_mac_update(ctx, &info_cd, NULL);
+		if (ret != CRYPTO_SUCCESS)
+			return (SET_ERROR(EIO));
+
+		ret = crypto_mac_update(ctx, &c_cd, NULL);
+		if (ret != CRYPTO_SUCCESS)
+			return (SET_ERROR(EIO));
+
+		T_len = SHA512_DIGEST_LENGTH;
+		T_cd.cd_length = T_len;
+		T_cd.cd_raw.iov_len = T_cd.cd_length;
+
+		ret = crypto_mac_final(ctx, &T_cd, NULL);
+		if (ret != CRYPTO_SUCCESS)
+			return (SET_ERROR(EIO));
+
+		bcopy(T, out_buf + pos,
+		    (i != N) ? SHA512_DIGEST_LENGTH : (out_len - pos));
+		pos += SHA512_DIGEST_LENGTH;
+	}
+
+	return (0);
+}
+
+/*
+ * HKDF is designed to be a relatively fast function for deriving keys from a
+ * master key + a salt. We use this function to generate new encryption keys
+ * so as to avoid hitting the cryptographic limits of the underlying
+ * encryption modes.
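+ *
+ * For example (a sketch with invented buffers), deriving a 256-bit key
+ * from a master key and an 8-byte salt via hkdf_sha512() below:
+ *
+ *	uint8_t key[32];
+ *	ret = hkdf_sha512(master, master_len, salt, 8,
+ *	    (uint8_t *)"example info", 12, key, sizeof (key));
+ *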
+ * Note that, for the sake of deriving encryption keys, the
+ * info parameter is called the "salt" everywhere else in the code.
+ */
+int
+hkdf_sha512(uint8_t *key_material, uint_t km_len, uint8_t *salt,
+    uint_t salt_len, uint8_t *info, uint_t info_len, uint8_t *output_key,
+    uint_t out_len)
+{
+	int ret;
+	uint8_t extract_key[SHA512_DIGEST_LENGTH];
+
+	ret = hkdf_sha512_extract(salt, salt_len, key_material, km_len,
+	    extract_key);
+	if (ret != 0)
+		return (ret);
+
+	ret = hkdf_sha512_expand(extract_key, info, info_len, output_key,
+	    out_len);
+	if (ret != 0)
+		return (ret);
+
+	return (0);
+}
diff --git a/module/zfs/lz4.c b/module/zfs/lz4.c
new file mode 100644
index 000000000000..4b46e694891a
--- /dev/null
+++ b/module/zfs/lz4.c
@@ -0,0 +1,1031 @@
+/*
+ * LZ4 - Fast LZ compression algorithm
+ * Header File
+ * Copyright (C) 2011-2013, Yann Collet.
+ * BSD 2-Clause License (http://www.opensource.org/licenses/bsd-license.php)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * You can contact the author at :
+ * - LZ4 homepage : http://fastcompression.blogspot.com/p/lz4.html
+ * - LZ4 source repository : http://code.google.com/p/lz4/
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+static int real_LZ4_compress(const char *source, char *dest, int isize,
+    int osize);
+static int LZ4_uncompress_unknownOutputSize(const char *source, char *dest,
+    int isize, int maxOutputSize);
+static int LZ4_compressCtx(void *ctx, const char *source, char *dest,
+    int isize, int osize);
+static int LZ4_compress64kCtx(void *ctx, const char *source, char *dest,
+    int isize, int osize);
+
+static kmem_cache_t *lz4_cache;
+
+/*ARGSUSED*/
+size_t
+lz4_compress_zfs(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
+{
+	uint32_t bufsiz;
+	char *dest = d_start;
+
+	ASSERT(d_len >= sizeof (bufsiz));
+
+	bufsiz = real_LZ4_compress(s_start, &dest[sizeof (bufsiz)], s_len,
+	    d_len - sizeof (bufsiz));
+
+	/* Signal an error if the compression routine returned zero. */
+	if (bufsiz == 0)
+		return (s_len);
+
+	/*
+	 * The exact compressed size is needed by the decompression routine,
+	 * so it is stored at the start of the buffer.
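+	 *
+	 * An illustrative view of the resulting buffer (a sketch, not
+	 * imported text):
+	 *
+	 *	byte 0        3 4                          4+bufsiz
+	 *	+-------------+----------------------------+
+	 *	| BE_32 size  |    LZ4 compressed data     |
+	 *	+-------------+----------------------------+
+	 *
+	 * lz4_decompress_zfs() below re-reads the prefix as
+	 * "bufsiz = BE_IN32(src)" before decoding the payload.
+	 *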
+	 * Note that this may be
+	 * less than the compressed block size, which is rounded up to a
+	 * multiple of 1<<ashift.
+	 */
+	*(uint32_t *)dest = BE_32(bufsiz);
+
+	return (bufsiz + sizeof (bufsiz));
+}
+
+/*ARGSUSED*/
+int
+lz4_decompress_zfs(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, int n)
+{
+	const char *src = s_start;
+	uint32_t bufsiz = BE_IN32(src);
+
+	/* invalid compressed buffer size encoded at start */
+	if (bufsiz + sizeof (bufsiz) > s_len)
+		return (1);
+
+	/*
+	 * Returns 0 on success (decompression function returned non-negative)
+	 * and non-zero on failure (decompression function returned negative).
+	 */
+	return (LZ4_uncompress_unknownOutputSize(&src[sizeof (bufsiz)],
+	    d_start, bufsiz, d_len) < 0);
+}
+
+/*
+ * LZ4 API Description:
+ *
+ * Simple Functions:
+ * real_LZ4_compress() :
+ *	isize  : is the input size. Max supported value is ~1.9GB
+ *	return : the number of bytes written in buffer dest
+ *	    or 0 if the compression fails (if LZ4_COMPRESSMIN is set).
+ *	note : destination buffer must be already allocated.
+ *	    destination buffer must be sized to handle worst cases
+ *	    situations (input data not compressible) worst case size
+ *	    evaluation is provided by function LZ4_compressBound().
+ *
+ * real_LZ4_uncompress() :
+ *	osize  : is the output size, therefore the original size
+ *	return : the number of bytes read in the source buffer.
+ *	    If the source stream is malformed, the function will stop
+ *	    decoding and return a negative result, indicating the byte
+ *	    position of the faulty instruction. This function never
+ *	    writes beyond dest + osize, and is therefore protected
+ *	    against malicious data packets.
+ *	note : destination buffer must be already allocated
+ *	note : real_LZ4_uncompress() is not used in ZFS so its code
+ *	    is not present here.
+ *
+ * Advanced Functions
+ *
+ * LZ4_compressBound() :
+ *	Provides the maximum size that LZ4 may output in a "worst case"
+ *	scenario (input data not compressible) primarily useful for memory
+ *	allocation of output buffer.
+ *
+ *	isize  : is the input size. Max supported value is ~1.9GB
+ *	return : maximum output size in a "worst case" scenario
+ *	note : this function is limited by "int" range (2^31-1)
+ *
+ * LZ4_uncompress_unknownOutputSize() :
+ *	isize  : is the input size, therefore the compressed size
+ *	maxOutputSize : is the size of the destination buffer (which must be
+ *	    already allocated)
+ *	return : the number of bytes decoded in the destination buffer
+ *	    (necessarily <= maxOutputSize). If the source stream is
+ *	    malformed, the function will stop decoding and return a
+ *	    negative result, indicating the byte position of the faulty
+ *	    instruction. This function never writes beyond dest +
+ *	    maxOutputSize, and is therefore protected against malicious
+ *	    data packets.
+ *	note  : Destination buffer must be already allocated.
+ *	    This version is slightly slower than real_LZ4_uncompress()
+ *
+ * LZ4_compressCtx() :
+ *	This function explicitly handles the CTX memory structure.
+ *
+ *	ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ *	by the caller (either on the stack or using kmem_cache_alloc). Passing
+ *	NULL isn't valid.
+ *
+ * LZ4_compress64kCtx() :
+ *	Same as LZ4_compressCtx(), but specific to small inputs (<64KB).
+ *	isize *Must* be <64KB, otherwise the output will be corrupted.
+ *
+ *	ILLUMOS CHANGES: the CTX memory structure must be explicitly allocated
+ *	by the caller (either on the stack or using kmem_cache_alloc). Passing
+ *	NULL isn't valid.
+ */
+
+/*
+ * Tuning parameters
+ */
+
+/*
+ * COMPRESSIONLEVEL: Increasing this value improves compression ratio
+ * Lowering this value reduces memory usage. Reduced memory usage
+ * typically improves speed, due to cache effect (ex: L1 32KB for Intel,
+ * L1 64KB for AMD).
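+ *
+ * As a worked example (a sketch, not imported text): hash entries are
+ * four bytes wide, so the table costs
+ *
+ *	table_bytes = sizeof (uint32_t) << COMPRESSIONLEVEL;
+ *
+ * which is 16KB at the default level of 12, small enough to stay
+ * resident in a typical L1 data cache.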
Memory usage formula : N->2^(N+2) Bytes + * (examples : 12 -> 16KB ; 17 -> 512KB) + */ +#define COMPRESSIONLEVEL 12 + +/* + * NOTCOMPRESSIBLE_CONFIRMATION: Decreasing this value will make the + * algorithm skip faster data segments considered "incompressible". + * This may decrease compression ratio dramatically, but will be + * faster on incompressible data. Increasing this value will make + * the algorithm search more before declaring a segment "incompressible". + * This could improve compression a bit, but will be slower on + * incompressible data. The default value (6) is recommended. + */ +#define NOTCOMPRESSIBLE_CONFIRMATION 6 + +/* + * BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE: This will provide a boost to + * performance for big endian cpu, but the resulting compressed stream + * will be incompatible with little-endian CPU. You can set this option + * to 1 in situations where data will stay within closed environment. + * This option is useless on Little_Endian CPU (such as x86). + */ +/* #define BIG_ENDIAN_NATIVE_BUT_INCOMPATIBLE 1 */ + +/* + * CPU Feature Detection + */ + +/* 32 or 64 bits ? */ +#if defined(_LP64) +#define LZ4_ARCH64 1 +#else +#define LZ4_ARCH64 0 +#endif + +/* + * Little Endian or Big Endian? + * Note: overwrite the below #define if you know your architecture endianness. + */ +#if defined(_ZFS_BIG_ENDIAN) +#define LZ4_BIG_ENDIAN 1 +#else +/* + * Little Endian assumed. PDP Endian and other very rare endian format + * are unsupported. + */ +#undef LZ4_BIG_ENDIAN +#endif + +/* + * Unaligned memory access is automatically enabled for "common" CPU, + * such as x86. For others CPU, the compiler will be more cautious, and + * insert extra code to ensure aligned access is respected. If you know + * your target CPU supports unaligned memory access, you may want to + * force this option manually to improve performance + */ +#if defined(__ARM_FEATURE_UNALIGNED) +#define LZ4_FORCE_UNALIGNED_ACCESS 1 +#endif + +/* + * Illumos : we can't use GCC's __builtin_ctz family of builtins in the + * kernel + * Linux : we can use GCC's __builtin_ctz family of builtins in the + * kernel + */ +#undef LZ4_FORCE_SW_BITCOUNT +#if defined(__sparc) +#define LZ4_FORCE_SW_BITCOUNT +#endif + +/* + * Compiler Options + */ +/* Disable restrict */ +#define restrict + +/* + * Linux : GCC_VERSION is defined as of 3.9-rc1, so undefine it. 
+ * torvalds/linux@3f3f8d2f48acfd8ed3b8e6b7377935da57b27b16
+ */
+#ifdef GCC_VERSION
+#undef GCC_VERSION
+#endif
+
+#define	GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+#if (GCC_VERSION >= 302) || (__INTEL_COMPILER >= 800) || defined(__clang__)
+#define	expect(expr, value)    (__builtin_expect((expr), (value)))
+#else
+#define	expect(expr, value)    (expr)
+#endif
+
+#ifndef likely
+#define	likely(expr)	expect((expr) != 0, 1)
+#endif
+
+#ifndef unlikely
+#define	unlikely(expr)	expect((expr) != 0, 0)
+#endif
+
+#define	lz4_bswap16(x) ((unsigned short int) ((((x) >> 8) & 0xffu) | \
+	(((x) & 0xffu) << 8)))
+
+/* Basic types */
+#define	BYTE	uint8_t
+#define	U16	uint16_t
+#define	U32	uint32_t
+#define	S32	int32_t
+#define	U64	uint64_t
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack(1)
+#endif
+
+typedef struct _U16_S {
+	U16 v;
+} U16_S;
+typedef struct _U32_S {
+	U32 v;
+} U32_S;
+typedef struct _U64_S {
+	U64 v;
+} U64_S;
+
+#ifndef LZ4_FORCE_UNALIGNED_ACCESS
+#pragma pack()
+#endif
+
+#define	A64(x) (((U64_S *)(x))->v)
+#define	A32(x) (((U32_S *)(x))->v)
+#define	A16(x) (((U16_S *)(x))->v)
+
+/*
+ * Constants
+ */
+#define	MINMATCH 4
+
+#define	HASH_LOG COMPRESSIONLEVEL
+#define	HASHTABLESIZE (1 << HASH_LOG)
+#define	HASH_MASK (HASHTABLESIZE - 1)
+
+#define	SKIPSTRENGTH (NOTCOMPRESSIBLE_CONFIRMATION > 2 ? \
+	NOTCOMPRESSIBLE_CONFIRMATION : 2)
+
+#define	COPYLENGTH 8
+#define	LASTLITERALS 5
+#define	MFLIMIT (COPYLENGTH + MINMATCH)
+#define	MINLENGTH (MFLIMIT + 1)
+
+#define	MAXD_LOG 16
+#define	MAX_DISTANCE ((1 << MAXD_LOG) - 1)
+
+#define	ML_BITS 4
+#define	ML_MASK ((1U<<ML_BITS)-1)
+#define	RUN_BITS (8-ML_BITS)
+#define	RUN_MASK ((1U<<RUN_BITS)-1)
+
+/*
+ * Architecture-specific macros
+ */
+#if LZ4_ARCH64
+#define	STEPSIZE 8
+#define	UARCH U64
+#define	AARCH A64
+#define	LZ4_COPYSTEP(s, d)	A64(d) = A64(s); d += 8; s += 8;
+#define	LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d)
+#define	HTYPE U32
+#define	INITBASE(base)		const BYTE* const base = ip
+#else /* !LZ4_ARCH64 */
+#define	STEPSIZE 4
+#define	UARCH U32
+#define	AARCH A32
+#define	LZ4_COPYSTEP(s, d)	A32(d) = A32(s); d += 4; s += 4;
+#define	LZ4_COPYPACKET(s, d)	LZ4_COPYSTEP(s, d); LZ4_COPYSTEP(s, d);
+#define	HTYPE const BYTE *
+#define	INITBASE(base)		const int base = 0
+#endif /* !LZ4_ARCH64 */
+
+#ifdef LZ4_BIG_ENDIAN
+#define	LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+	{ d = (s) - A16(p); }
+#define	LZ4_WRITE_LITTLEENDIAN_16(p, i)  \
+	{ U16 v = (U16)(i); v = lz4_bswap16(v); A16(p) = v; p += 2; }
+#else
+#define	LZ4_READ_LITTLEENDIAN_16(d, s, p) \
+	{ d = (s) - A16(p); }
+#define	LZ4_WRITE_LITTLEENDIAN_16(p, v)  \
+	{ A16(p) = v; p += 2; }
+#endif
+
+/* Local structures */
+struct refTables {
+	HTYPE hashTable[HASHTABLESIZE];
+};
+
+/* Macros */
+#define	LZ4_HASH_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH * 8) - \
+	HASH_LOG))
+#define	LZ4_HASH_VALUE(p) LZ4_HASH_FUNCTION(A32(p))
+#define	LZ4_WILDCOPY(s, d, e) do { LZ4_COPYPACKET(s, d) } while (d < e);
+#define	LZ4_BLINDCOPY(s, d, l) { BYTE* e = (d) + l; LZ4_WILDCOPY(s, d, e); \
+	d = e; }
+
+
+/* Private functions */
+#if LZ4_ARCH64
+
+static inline int
+LZ4_NbCommonBytes(register U64 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+	!defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_clzll(val) >> 3);
+#else
+	int r;
+	if (!(val >> 32)) {
+		r = 4;
+	} else {
+		r = 0;
+		val >>= 32;
+	}
+	if (!(val >> 16)) {
+		r += 2;
+		val >>= 8;
+	} else {
+		val >>= 24;
+	}
+	r += (!val);
+	return (r);
+#endif
+#else
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+	!defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_ctzll(val) >> 3);
+#else
+	static const int DeBruijnBytePos[64] =
+	    { 0, 0, 0, 0, 0, 1, 1, 2, 0, 3, 1, 3, 1, 4, 2, 7, 0, 2, 3, 6, 1, 5,
+		3, 5, 1, 3, 4, 4, 2, 5, 6, 7, 7, 0, 1, 2, 3, 3, 4, 6, 2, 6, 5,
+		5, 3, 4, 5, 6, 7, 1, 2, 4, 6, 4,
+		4, 5, 7, 2, 6, 5, 7, 6, 7, 7
+	};
+	return DeBruijnBytePos[((U64) ((val & -val) * 0x0218A392CDABBD3F)) >>
+	    58];
+#endif
+#endif
+}
+
+#else
+
+static inline int
+LZ4_NbCommonBytes(register U32 val)
+{
+#if defined(LZ4_BIG_ENDIAN)
+#if ((defined(__GNUC__) && (GCC_VERSION >= 304)) || defined(__clang__)) && \
+	!defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_clz(val) >> 3);
+#else
+	int r;
+	if (!(val >> 16)) {
+		r = 2;
+		val >>= 8;
+	} else {
+		r = 0;
+		val >>= 24;
+	}
+	r += (!val);
+	return (r);
+#endif
+#else
+#if defined(__GNUC__) && (GCC_VERSION >= 304) && \
+	!defined(LZ4_FORCE_SW_BITCOUNT)
+	return (__builtin_ctz(val) >> 3);
+#else
+	static const int DeBruijnBytePos[32] = {
+		0, 0, 3, 0, 3, 1, 3, 0,
+		3, 2, 2, 1, 3, 2, 0, 1,
+		3, 3, 1, 2, 2, 2, 2, 0,
+		3, 1, 2, 0, 1, 0, 1, 1
+	};
+	return DeBruijnBytePos[((U32) ((val &
-(S32) val) * 0x077CB531U)) >> + 27]; +#endif +#endif +} + +#endif + +/* Compression functions */ + +/*ARGSUSED*/ +static int +LZ4_compressCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ + struct refTables *srt = (struct refTables *)ctx; + HTYPE *HashTable = (HTYPE *) (srt->hashTable); + + const BYTE *ip = (BYTE *) source; + INITBASE(base); + const BYTE *anchor = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + ip++; + forwardH = LZ4_HASH_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (unlikely(forwardIp > mflimit)) { + goto _last_literals; + } + + forwardH = LZ4_HASH_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while ((ref < ip - MAX_DISTANCE) || (A32(ref) != A32(ip))); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + unlikely(ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (likely(ip < matchlimit - (STEPSIZE - 1))) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH_VALUE(ip)]; + HashTable[LZ4_HASH_VALUE(ip)] = ip - base; + if ((ref > ip - (MAX_DISTANCE + 1)) && (A32(ref) == A32(ip))) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - 
anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) { + *op++ = 255; + } + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + + + +/* Note : this function is valid only if isize < LZ4_64KLIMIT */ +#define LZ4_64KLIMIT ((1 << 16) + (MFLIMIT - 1)) +#define HASHLOG64K (HASH_LOG + 1) +#define HASH64KTABLESIZE (1U << HASHLOG64K) +#define LZ4_HASH64K_FUNCTION(i) (((i) * 2654435761U) >> ((MINMATCH*8) - \ + HASHLOG64K)) +#define LZ4_HASH64K_VALUE(p) LZ4_HASH64K_FUNCTION(A32(p)) + +/*ARGSUSED*/ +static int +LZ4_compress64kCtx(void *ctx, const char *source, char *dest, int isize, + int osize) +{ + struct refTables *srt = (struct refTables *)ctx; + U16 *HashTable = (U16 *) (srt->hashTable); + + const BYTE *ip = (BYTE *) source; + const BYTE *anchor = ip; + const BYTE *const base = ip; + const BYTE *const iend = ip + isize; + const BYTE *const oend = (BYTE *) dest + osize; + const BYTE *const mflimit = iend - MFLIMIT; +#define matchlimit (iend - LASTLITERALS) + + BYTE *op = (BYTE *) dest; + + int len, length; + const int skipStrength = SKIPSTRENGTH; + U32 forwardH; + + /* Init */ + if (isize < MINLENGTH) + goto _last_literals; + + /* First Byte */ + ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + + /* Main Loop */ + for (;;) { + int findMatchAttempts = (1U << skipStrength) + 3; + const BYTE *forwardIp = ip; + const BYTE *ref; + BYTE *token; + + /* Find a match */ + do { + U32 h = forwardH; + int step = findMatchAttempts++ >> skipStrength; + ip = forwardIp; + forwardIp = ip + step; + + if (forwardIp > mflimit) { + goto _last_literals; + } + + forwardH = LZ4_HASH64K_VALUE(forwardIp); + ref = base + HashTable[h]; + HashTable[h] = ip - base; + + } while (A32(ref) != A32(ip)); + + /* Catch up */ + while ((ip > anchor) && (ref > (BYTE *) source) && + (ip[-1] == ref[-1])) { + ip--; + ref--; + } + + /* Encode Literal length */ + length = ip - anchor; + token = op++; + + /* Check output limit */ + if (unlikely(op + length + (2 + 1 + LASTLITERALS) + + (length >> 8) > oend)) + return (0); + + if (length >= (int)RUN_MASK) { + *token = (RUN_MASK << ML_BITS); + len = length - RUN_MASK; + for (; len > 254; len -= 255) + *op++ = 255; + *op++ = (BYTE)len; + } else + *token = (length << ML_BITS); + + /* Copy Literals */ + LZ4_BLINDCOPY(anchor, op, length); + + _next_match: + /* Encode Offset */ + LZ4_WRITE_LITTLEENDIAN_16(op, ip - ref); + + /* Start Counting */ + ip += MINMATCH; + ref += MINMATCH; /* MinMatch verified */ + anchor = ip; + while (ip < matchlimit - (STEPSIZE - 1)) { + UARCH diff = AARCH(ref) ^ AARCH(ip); + if (!diff) { + ip += STEPSIZE; + ref += STEPSIZE; + continue; + } + ip += LZ4_NbCommonBytes(diff); + goto _endCount; + } +#if LZ4_ARCH64 + if ((ip < (matchlimit - 3)) && (A32(ref) == A32(ip))) { + ip += 4; + ref += 4; + } +#endif + if ((ip < (matchlimit - 1)) && (A16(ref) == A16(ip))) { + ip += 2; + ref += 2; + } + if ((ip < matchlimit) && (*ref == *ip)) + ip++; + _endCount: + + /* Encode MatchLength */ + len = (ip - anchor); + /* Check output limit */ + if (unlikely(op + (1 + LASTLITERALS) + (len >> 8) > oend)) + return (0); + if (len >= (int)ML_MASK) { + *token += ML_MASK; + len -= ML_MASK; + for (; len > 509; len -= 510) { + *op++ = 255; + *op++ = 255; + } + if (len > 254) { + len -= 255; + *op++ = 
255; + } + *op++ = (BYTE)len; + } else + *token += len; + + /* Test end of chunk */ + if (ip > mflimit) { + anchor = ip; + break; + } + /* Fill table */ + HashTable[LZ4_HASH64K_VALUE(ip - 2)] = ip - 2 - base; + + /* Test next position */ + ref = base + HashTable[LZ4_HASH64K_VALUE(ip)]; + HashTable[LZ4_HASH64K_VALUE(ip)] = ip - base; + if (A32(ref) == A32(ip)) { + token = op++; + *token = 0; + goto _next_match; + } + /* Prepare next loop */ + anchor = ip++; + forwardH = LZ4_HASH64K_VALUE(ip); + } + + _last_literals: + /* Encode Last Literals */ + { + int lastRun = iend - anchor; + if (op + lastRun + 1 + ((lastRun + 255 - RUN_MASK) / 255) > + oend) + return (0); + if (lastRun >= (int)RUN_MASK) { + *op++ = (RUN_MASK << ML_BITS); + lastRun -= RUN_MASK; + for (; lastRun > 254; lastRun -= 255) + *op++ = 255; + *op++ = (BYTE)lastRun; + } else + *op++ = (lastRun << ML_BITS); + (void) memcpy(op, anchor, iend - anchor); + op += iend - anchor; + } + + /* End */ + return (int)(((char *)op) - dest); +} + +static int +real_LZ4_compress(const char *source, char *dest, int isize, int osize) +{ + void *ctx; + int result; + + ASSERT(lz4_cache != NULL); + ctx = kmem_cache_alloc(lz4_cache, KM_SLEEP); + + /* + * out of kernel memory, gently fall through - this will disable + * compression in zio_compress_data + */ + if (ctx == NULL) + return (0); + + memset(ctx, 0, sizeof (struct refTables)); + + if (isize < LZ4_64KLIMIT) + result = LZ4_compress64kCtx(ctx, source, dest, isize, osize); + else + result = LZ4_compressCtx(ctx, source, dest, isize, osize); + + kmem_cache_free(lz4_cache, ctx); + return (result); +} + +/* Decompression functions */ + +/* + * Note: The decoding functions real_LZ4_uncompress() and + * LZ4_uncompress_unknownOutputSize() are safe against "buffer overflow" + * attack type. They will never write nor read outside of the provided + * output buffers. LZ4_uncompress_unknownOutputSize() also insures that + * it will never read outside of the input buffer. A corrupted input + * will produce an error result, a negative int, indicating the position + * of the error within input stream. + * + * Note[2]: real_LZ4_uncompress(), referred to above, is not used in ZFS so + * its code is not present here. + */ + +static const int dec32table[] = {0, 3, 2, 3, 0, 0, 0, 0}; +#if LZ4_ARCH64 +static const int dec64table[] = {0, 0, 0, -1, 0, 1, 2, 3}; +#endif + +static int +LZ4_uncompress_unknownOutputSize(const char *source, char *dest, int isize, + int maxOutputSize) +{ + /* Local Variables */ + const BYTE *restrict ip = (const BYTE *) source; + const BYTE *const iend = ip + isize; + const BYTE *ref; + + BYTE *op = (BYTE *) dest; + BYTE *const oend = op + maxOutputSize; + BYTE *cpy; + + /* Main Loop */ + while (ip < iend) { + unsigned token; + size_t length; + + /* get runlength */ + token = *ip++; + if ((length = (token >> ML_BITS)) == RUN_MASK) { + int s = 255; + while ((ip < iend) && (s == 255)) { + s = *ip++; + if (unlikely(length > (size_t)(length + s))) + goto _output_error; + length += s; + } + } + /* copy literals */ + cpy = op + length; + /* CORNER-CASE: cpy might overflow. */ + if (cpy < op) + goto _output_error; /* cpy was overflowed, bail! 
*/ + if ((cpy > oend - COPYLENGTH) || + (ip + length > iend - COPYLENGTH)) { + if (cpy > oend) + /* Error: writes beyond output buffer */ + goto _output_error; + if (ip + length != iend) + /* + * Error: LZ4 format requires to consume all + * input at this stage + */ + goto _output_error; + (void) memcpy(op, ip, length); + op += length; + /* Necessarily EOF, due to parsing restrictions */ + break; + } + LZ4_WILDCOPY(ip, op, cpy); + ip -= (op - cpy); + op = cpy; + + /* get offset */ + LZ4_READ_LITTLEENDIAN_16(ref, cpy, ip); + ip += 2; + if (ref < (BYTE * const) dest) + /* + * Error: offset creates reference outside of + * destination buffer + */ + goto _output_error; + + /* get matchlength */ + if ((length = (token & ML_MASK)) == ML_MASK) { + while (ip < iend) { + int s = *ip++; + if (unlikely(length > (size_t)(length + s))) + goto _output_error; + length += s; + if (s == 255) + continue; + break; + } + } + /* copy repeated sequence */ + if (unlikely(op - ref < STEPSIZE)) { +#if LZ4_ARCH64 + int dec64 = dec64table[op - ref]; +#else + const int dec64 = 0; +#endif + op[0] = ref[0]; + op[1] = ref[1]; + op[2] = ref[2]; + op[3] = ref[3]; + op += 4; + ref += 4; + ref -= dec32table[op - ref]; + A32(op) = A32(ref); + op += STEPSIZE - 4; + ref -= dec64; + } else { + LZ4_COPYSTEP(ref, op); + } + cpy = op + length - (STEPSIZE - 4); + if (cpy > oend - COPYLENGTH) { + if (cpy > oend) + /* + * Error: request to write outside of + * destination buffer + */ + goto _output_error; +#if LZ4_ARCH64 + if ((ref + COPYLENGTH) > oend) +#else + if ((ref + COPYLENGTH) > oend || + (op + COPYLENGTH) > oend) +#endif + goto _output_error; + LZ4_SECURECOPY(ref, op, (oend - COPYLENGTH)); + while (op < cpy) + *op++ = *ref++; + op = cpy; + if (op == oend) + /* + * Check EOF (should never happen, since + * last 5 bytes are supposed to be literals) + */ + goto _output_error; + continue; + } + LZ4_SECURECOPY(ref, op, cpy); + op = cpy; /* correction */ + } + + /* end of decoding */ + return (int)(((char *)op) - dest); + + /* write overflow error detected */ + _output_error: + return (-1); +} + +void +lz4_init(void) +{ + lz4_cache = kmem_cache_create("lz4_cache", + sizeof (struct refTables), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +lz4_fini(void) +{ + if (lz4_cache) { + kmem_cache_destroy(lz4_cache); + lz4_cache = NULL; + } +} diff --git a/module/zfs/lzjb.c b/module/zfs/lzjb.c new file mode 100644 index 000000000000..a478e64c5141 --- /dev/null +++ b/module/zfs/lzjb.c @@ -0,0 +1,132 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +/* + * We keep our own copy of this algorithm for 3 main reasons: + * 1. 
If we didn't, anyone modifying common/os/compress.c would
+ *    directly break our on disk format
+ * 2. Our version of lzjb does not have a number of checks that the
+ *    common/os version needs and uses
+ * 3. We initialize the lempel to ensure deterministic results,
+ *    so that identical blocks can always be deduplicated.
+ * In particular, we are adding the "feature" that compress() can
+ * take a destination buffer size and returns the compressed length, or the
+ * source length if compression would overflow the destination buffer.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+
+#define	MATCH_BITS	6
+#define	MATCH_MIN	3
+#define	MATCH_MAX	((1 << MATCH_BITS) + (MATCH_MIN - 1))
+#define	OFFSET_MASK	((1 << (16 - MATCH_BITS)) - 1)
+#define	LEMPEL_SIZE	1024
+
+/*ARGSUSED*/
+size_t
+lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	uchar_t *src = s_start;
+	uchar_t *dst = d_start;
+	uchar_t *cpy;
+	uchar_t *copymap = NULL;
+	int copymask = 1 << (NBBY - 1);
+	int mlen, offset, hash;
+	uint16_t *hp;
+	uint16_t *lempel;
+
+	lempel = kmem_zalloc(LEMPEL_SIZE * sizeof (uint16_t), KM_SLEEP);
+	while (src < (uchar_t *)s_start + s_len) {
+		if ((copymask <<= 1) == (1 << NBBY)) {
+			if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) {
+				kmem_free(lempel,
+				    LEMPEL_SIZE*sizeof (uint16_t));
+				return (s_len);
+			}
+			copymask = 1;
+			copymap = dst;
+			*dst++ = 0;
+		}
+		if (src > (uchar_t *)s_start + s_len - MATCH_MAX) {
+			*dst++ = *src++;
+			continue;
+		}
+		hash = (src[0] << 16) + (src[1] << 8) + src[2];
+		hash += hash >> 9;
+		hash += hash >> 5;
+		hp = &lempel[hash & (LEMPEL_SIZE - 1)];
+		offset = (intptr_t)(src - *hp) & OFFSET_MASK;
+		*hp = (uint16_t)(uintptr_t)src;
+		cpy = src - offset;
+		if (cpy >= (uchar_t *)s_start && cpy != src &&
+		    src[0] == cpy[0] && src[1] == cpy[1] && src[2] == cpy[2]) {
+			*copymap |= copymask;
+			for (mlen = MATCH_MIN; mlen < MATCH_MAX; mlen++)
+				if (src[mlen] != cpy[mlen])
+					break;
+			*dst++ = ((mlen - MATCH_MIN) << (NBBY - MATCH_BITS)) |
+			    (offset >> NBBY);
+			*dst++ = (uchar_t)offset;
+			src += mlen;
+		} else {
+			*dst++ = *src++;
+		}
+	}
+
+	kmem_free(lempel, LEMPEL_SIZE * sizeof (uint16_t));
+	return (dst - (uchar_t *)d_start);
+}
+
+/*ARGSUSED*/
+int
+lzjb_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+	uchar_t *src = s_start;
+	uchar_t *dst = d_start;
+	uchar_t *d_end = (uchar_t *)d_start + d_len;
+	uchar_t *cpy;
+	uchar_t copymap = 0;
+	int copymask = 1 << (NBBY - 1);
+
+	while (dst < d_end) {
+		if ((copymask <<= 1) == (1 << NBBY)) {
+			copymask = 1;
+			copymap = *src++;
+		}
+		if (copymap & copymask) {
+			int mlen = (src[0] >> (NBBY - MATCH_BITS)) + MATCH_MIN;
+			int offset = ((src[0] << NBBY) | src[1]) & OFFSET_MASK;
+			src += 2;
+			if ((cpy = dst - offset) < (uchar_t *)d_start)
+				return (-1);
+			while (--mlen >= 0 && dst < d_end)
+				*dst++ = *cpy++;
+		} else {
+			*dst++ = *src++;
+		}
+	}
+	return (0);
+}
diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c
new file mode 100644
index 000000000000..ccc247d1557a
--- /dev/null
+++ b/module/zfs/metaslab.c
@@ -0,0 +1,6236 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/space_map.h>
+#include <sys/metaslab_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/spa_impl.h>
+#include <sys/zfeature.h>
+#include <sys/vdev_indirect_mapping.h>
+#include <sys/zap.h>
+#include <sys/btree.h>
+
+#define	WITH_DF_BLOCK_ALLOCATOR
+
+#define	GANG_ALLOCATION(flags) \
+	((flags) & (METASLAB_GANG_CHILD | METASLAB_GANG_HEADER))
+
+/*
+ * Metaslab granularity, in bytes. This is roughly similar to what would be
+ * referred to as the "stripe size" in traditional RAID arrays. In normal
+ * operation, we will try to write this amount of data to a top-level vdev
+ * before moving on to the next one.
+ */
+unsigned long metaslab_aliquot = 512 << 10;
+
+/*
+ * For testing, make some blocks above a certain size be gang blocks.
+ */
+unsigned long metaslab_force_ganging = SPA_MAXBLOCKSIZE + 1;
+
+/*
+ * In pools where the log space map feature is not enabled we touch
+ * multiple metaslabs (and their respective space maps) with each
+ * transaction group. Thus, we benefit from having a small space map
+ * block size since it allows us to issue more I/O operations scattered
+ * around the disk. So a sane default for the space map block size
+ * is 8~16K.
+ */
+int zfs_metaslab_sm_blksz_no_log = (1 << 14);
+
+/*
+ * When the log space map feature is enabled, we accumulate a lot of
+ * changes per metaslab that are flushed once in a while so we benefit
+ * from a bigger block size like 128K for the metaslab space maps.
+ */
+int zfs_metaslab_sm_blksz_with_log = (1 << 17);
+
+/*
+ * The in-core space map representation is more compact than its on-disk form.
+ * The zfs_condense_pct determines how much more compact the in-core
+ * space map representation must be before we compact it on-disk.
+ * Values should be greater than or equal to 100.
+ */
+int zfs_condense_pct = 200;
+
+/*
+ * Condensing a metaslab is not guaranteed to actually reduce the amount of
+ * space used on disk. In particular, a space map uses data in increments of
+ * MAX(1 << ashift, space_map_blksz), so a metaslab might use the
+ * same number of blocks after condensing. Since the goal of condensing is to
+ * reduce the number of IOPs required to read the space map, we only want to
+ * condense when we can be sure we will reduce the number of blocks used by the
+ * space map. Unfortunately, we cannot precisely compute whether or not this is
+ * the case in metaslab_should_condense since we are holding ms_lock. Instead,
+ * we apply the following heuristic: do not condense a spacemap unless the
+ * uncondensed size consumes greater than zfs_metaslab_condense_block_threshold
+ * blocks.
+ */
+int zfs_metaslab_condense_block_threshold = 4;
+
+/*
+ * The zfs_mg_noalloc_threshold defines which metaslab groups should
+ * be eligible for allocation. The value is defined as a percentage of
+ * free space. Metaslab groups that have more free space than
+ * zfs_mg_noalloc_threshold are always eligible for allocations.
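+ *
+ * A sketch of the check (hypothetical helper; the real logic lives in
+ * metaslab_group_allocatable()):
+ *
+ *	free_pct = (mg_free_space * 100) / mg_total_space;
+ *	allocatable = (free_pct > zfs_mg_noalloc_threshold);
+ *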
+ * Once a metaslab group's free space is less than or equal to the
+ * zfs_mg_noalloc_threshold the allocator will avoid allocating to that
+ * group unless all groups in the pool have reached zfs_mg_noalloc_threshold.
+ * Once all groups in the pool reach zfs_mg_noalloc_threshold then all
+ * groups are allowed to accept allocations. Gang blocks are always
+ * eligible to allocate on any metaslab group. The default value of 0 means
+ * no metaslab group will be excluded based on this criterion.
+ */
+int zfs_mg_noalloc_threshold = 0;
+
+/*
+ * Metaslab groups are considered eligible for allocations if their
+ * fragmentation metric (measured as a percentage) is less than or
+ * equal to zfs_mg_fragmentation_threshold. If a metaslab group
+ * exceeds this threshold then it will be skipped unless all metaslab
+ * groups within the metaslab class have also crossed this threshold.
+ *
+ * This tunable was introduced to avoid edge cases where we continue
+ * allocating from very fragmented disks in our pool while other, less
+ * fragmented disks, exist. On the other hand, if all disks in the
+ * pool are uniformly approaching the threshold, the threshold can
+ * be a speed bump in performance, where we keep switching the disks
+ * that we allocate from (e.g. we allocate some segments from disk A,
+ * pushing its fragmentation above the threshold, while frees from disk
+ * B bring its fragmentation back below the threshold).
+ *
+ * Empirically, we've seen that our vdev selection for allocations is
+ * good enough that fragmentation increases uniformly across all vdevs
+ * the majority of the time. Thus we set the threshold percentage high
+ * enough to avoid hitting the speed bump on pools that are being pushed
+ * to the edge.
+ */
+int zfs_mg_fragmentation_threshold = 95;
+
+/*
+ * Allow metaslabs to keep their active state as long as their fragmentation
+ * percentage is less than or equal to zfs_metaslab_fragmentation_threshold. An
+ * active metaslab that exceeds this threshold will no longer keep its active
+ * status, allowing better metaslabs to be selected.
+ */
+int zfs_metaslab_fragmentation_threshold = 70;
+
+/*
+ * When set, we will load all metaslabs when the pool is first opened.
+ */
+int metaslab_debug_load = 0;
+
+/*
+ * When set, we will prevent metaslabs from being unloaded.
+ */
+int metaslab_debug_unload = 0;
+
+/*
+ * Minimum size which forces the dynamic allocator to change
+ * its allocation strategy. Once the space map cannot satisfy
+ * an allocation of this size then it switches to using a more
+ * aggressive strategy (i.e. search by size rather than offset).
+ */
+uint64_t metaslab_df_alloc_threshold = SPA_OLD_MAXBLOCKSIZE;
+
+/*
+ * The minimum free space, in percent, which must be available
+ * in a space map to continue allocations in a first-fit fashion.
+ * Once the space map's free space drops below this level we dynamically
+ * switch to using best-fit allocations.
+ */
+int metaslab_df_free_pct = 4;
+
+/*
+ * Maximum distance to search forward from the last offset. Without this
+ * limit, fragmented pools can see >100,000 iterations and
+ * metaslab_block_picker() becomes the performance limiting factor on
+ * high-performance storage.
+ *
+ * With the default setting of 16MB, we typically see less than 500
+ * iterations, even with very fragmented, ashift=9 pools.
+ * The maximum number of iterations possible is:
+ *	metaslab_df_max_search / (2 * (1<<ashift))
+ * With the default setting of 16MB this is 16*1024 (with ashift=9) or
+ * 2048 (with ashift=12).
+ */
+unsigned long metaslab_df_max_search = 16 * 1024 * 1024;
+
+/*
+ * Forces the metaslab_block_picker function to search for at least this many
+ * segments forwards until giving up on finding a segment that the allocation
+ * will fit into.
+ */
+uint32_t metaslab_min_search_count = 100;
+
+/*
+ * If we are not searching forward (due to metaslab_df_max_search,
+ * metaslab_df_free_pct, or metaslab_df_alloc_threshold), this tunable
+ * controls which segment is used. If it is set, we will use the largest
+ * free segment. If it is not set, we will use a segment of exactly the
+ * requested size (or larger).
+ */
+int metaslab_df_use_largest_segment = B_FALSE;
+
+/*
+ * Percentage of all cpus that can be used by the metaslab taskq.
+ */
+int metaslab_load_pct = 50;
+
+/*
+ * These tunables control how long a metaslab will remain loaded after the
+ * last allocation from it. A metaslab can't be unloaded until at least
+ * metaslab_unload_delay TXGs and metaslab_unload_delay_ms milliseconds
+ * have elapsed. However, zfs_metaslab_mem_limit may cause it to be
+ * unloaded sooner. These settings are intended to be generous -- to keep
+ * metaslabs loaded for a long time, reducing the rate of metaslab loading.
+ */
+int metaslab_unload_delay = 32;
+int metaslab_unload_delay_ms = 10 * 60 * 1000; /* ten minutes */
+
+/*
+ * Max number of metaslabs per group to preload.
+ */
+int metaslab_preload_limit = 10;
+
+/*
+ * Enable/disable preloading of metaslabs.
+ */
+int metaslab_preload_enabled = B_TRUE;
+
+/*
+ * Enable/disable fragmentation weighting on metaslabs.
+ */
+int metaslab_fragmentation_factor_enabled = B_TRUE;
+
+/*
+ * Enable/disable lba weighting (i.e. outer tracks are given preference).
+ */
+int metaslab_lba_weighting_enabled = B_TRUE;
+
+/*
+ * Enable/disable metaslab group biasing.
+ */
+int metaslab_bias_enabled = B_TRUE;
+
+/*
+ * Enable/disable remapping of indirect vdev blocks.
+ */
+boolean_t zfs_remap_blkptr_enable = B_TRUE;
+
+/*
+ * Enable/disable segment-based metaslab selection.
+ */
+int zfs_metaslab_segment_weight_enabled = B_TRUE;
+
+/*
+ * When using segment-based metaslab selection, we will continue
+ * allocating from the active metaslab until we have exhausted
+ * zfs_metaslab_switch_threshold of its buckets.
+ */
+int zfs_metaslab_switch_threshold = 2;
+
+/*
+ * Internal switch to enable/disable the metaslab allocation tracing
+ * facility.
+ */
+boolean_t metaslab_trace_enabled = B_FALSE;
+
+/*
+ * Maximum entries that the metaslab allocation tracing facility will keep
+ * in a given list when running in non-debug mode. We limit the number
+ * of entries in non-debug mode to prevent us from using up too much memory.
+ * The limit should be sufficiently large that we don't expect any allocation
+ * to ever exceed this value. In debug mode, the system will panic if this
+ * limit is ever reached, allowing for further investigation.
+ */
+uint64_t metaslab_trace_max_entries = 5000;
+
+/*
+ * Maximum number of metaslabs per group that can be disabled
+ * simultaneously.
+ */
+int max_disabled_ms = 3;
+
+/*
+ * Time (in seconds) to respect ms_max_size when the metaslab is not loaded.
+ * To avoid 64-bit overflow, don't set above UINT32_MAX.
+ */
+unsigned long zfs_metaslab_max_size_cache_sec = 3600; /* 1 hour */
+
+/*
+ * Maximum percentage of memory to use on storing loaded metaslabs. If loading
+ * a metaslab would take it over this percentage, the oldest selected metaslab
+ * is automatically unloaded.
+ */
+int zfs_metaslab_mem_limit = 25;
+
+/*
+ * Force the per-metaslab range trees to use 64-bit integers to store
+ * segments. Used for debugging purposes.
+ */
+boolean_t zfs_metaslab_force_large_segs = B_FALSE;
+
+/*
+ * By default we only store segments over a certain size in the size-sorted
+ * metaslab trees (ms_allocatable_by_size and
+ * ms_unflushed_frees_by_size). This dramatically reduces memory usage and
+ * improves load and unload times at the cost of causing us to use slightly
+ * larger segments than we would otherwise in some cases.
+ */
+uint32_t metaslab_by_size_min_shift = 14;
+
+static uint64_t metaslab_weight(metaslab_t *, boolean_t);
+static void metaslab_set_fragmentation(metaslab_t *, boolean_t);
+static void metaslab_free_impl(vdev_t *, uint64_t, uint64_t, boolean_t);
+static void metaslab_check_free_impl(vdev_t *, uint64_t, uint64_t);
+
+static void metaslab_passivate(metaslab_t *msp, uint64_t weight);
+static uint64_t metaslab_weight_from_range_tree(metaslab_t *msp);
+static void metaslab_flush_update(metaslab_t *, dmu_tx_t *);
+static unsigned int metaslab_idx_func(multilist_t *, void *);
+static void metaslab_evict(metaslab_t *, uint64_t);
+static void metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg);
+
+#ifdef _METASLAB_TRACING
+kmem_cache_t *metaslab_alloc_trace_cache;
+
+typedef struct metaslab_stats {
+	kstat_named_t metaslabstat_trace_over_limit;
+	kstat_named_t metaslabstat_df_find_under_floor;
+	kstat_named_t metaslabstat_reload_tree;
+} metaslab_stats_t;
+
+static metaslab_stats_t metaslab_stats = {
+	{ "trace_over_limit",	KSTAT_DATA_UINT64 },
+	{ "df_find_under_floor",	KSTAT_DATA_UINT64 },
+	{ "reload_tree",	KSTAT_DATA_UINT64 },
+};
+
+#define	METASLABSTAT_BUMP(stat) \
+	atomic_inc_64(&metaslab_stats.stat.value.ui64);
+
+kstat_t *metaslab_ksp;
+
+void
+metaslab_stat_init(void)
+{
+	ASSERT(metaslab_alloc_trace_cache == NULL);
+	metaslab_alloc_trace_cache = kmem_cache_create(
+	    "metaslab_alloc_trace_cache", sizeof (metaslab_alloc_trace_t),
+	    0, NULL, NULL, NULL, NULL, NULL, 0);
+	metaslab_ksp = kstat_create("zfs", 0, "metaslab_stats",
+	    "misc", KSTAT_TYPE_NAMED, sizeof (metaslab_stats) /
+	    sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (metaslab_ksp != NULL) {
+		metaslab_ksp->ks_data = &metaslab_stats;
+		kstat_install(metaslab_ksp);
+	}
+}
+
+void
+metaslab_stat_fini(void)
+{
+	if (metaslab_ksp != NULL) {
+		kstat_delete(metaslab_ksp);
+		metaslab_ksp = NULL;
+	}
+
+	kmem_cache_destroy(metaslab_alloc_trace_cache);
+	metaslab_alloc_trace_cache = NULL;
+}
+#else
+
+void
+metaslab_stat_init(void)
+{
+}
+
+void
+metaslab_stat_fini(void)
+{
+}
+#endif
+
+/*
+ * ==========================================================================
+ * Metaslab classes
+ * ==========================================================================
+ */
+metaslab_class_t *
+metaslab_class_create(spa_t *spa, metaslab_ops_t *ops)
+{
+	metaslab_class_t *mc;
+
+	mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP);
+
+	mc->mc_spa = spa;
+	mc->mc_rotor = NULL;
+	mc->mc_ops = ops;
+	mutex_init(&mc->mc_lock, NULL, MUTEX_DEFAULT, NULL);
+	mc->mc_metaslab_txg_list = multilist_create(sizeof (metaslab_t),
+	    offsetof(metaslab_t, ms_class_txg_node), metaslab_idx_func);
+	mc->mc_alloc_slots = kmem_zalloc(spa->spa_alloc_count *
+	    sizeof (zfs_refcount_t), KM_SLEEP);
+	mc->mc_alloc_max_slots = kmem_zalloc(spa->spa_alloc_count *
+	    sizeof (uint64_t), KM_SLEEP);
+	for (int i = 0; i < spa->spa_alloc_count; i++)
+		zfs_refcount_create_tracked(&mc->mc_alloc_slots[i]);
+
+	return (mc);
+}
+
+void
+metaslab_class_destroy(metaslab_class_t *mc)
+{
+	ASSERT(mc->mc_rotor == NULL);
+	ASSERT(mc->mc_alloc == 0);
+	ASSERT(mc->mc_deferred == 0);
+	ASSERT(mc->mc_space == 0);
+	ASSERT(mc->mc_dspace == 0);
+
+	for (int i = 0; i < mc->mc_spa->spa_alloc_count; i++)
+		zfs_refcount_destroy(&mc->mc_alloc_slots[i]);
+	kmem_free(mc->mc_alloc_slots, mc->mc_spa->spa_alloc_count *
+	    sizeof (zfs_refcount_t));
+	kmem_free(mc->mc_alloc_max_slots, mc->mc_spa->spa_alloc_count *
+	    sizeof (uint64_t));
+	mutex_destroy(&mc->mc_lock);
+	multilist_destroy(mc->mc_metaslab_txg_list);
+	kmem_free(mc, sizeof (metaslab_class_t));
+}
+
+int
+metaslab_class_validate(metaslab_class_t *mc)
+{
+	metaslab_group_t *mg;
+	vdev_t *vd;
+
+	/*
+	 * Must hold one of the spa_config locks.
+	 */
+	ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) ||
+	    spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER));
+
+	if ((mg = mc->mc_rotor) == NULL)
+		return (0);
+
+	do {
+		vd = mg->mg_vd;
+		ASSERT(vd->vdev_mg != NULL);
+		ASSERT3P(vd->vdev_top, ==, vd);
+		ASSERT3P(mg->mg_class, ==, mc);
+		ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops);
+	} while ((mg = mg->mg_next) != mc->mc_rotor);
+
+	return (0);
+}
+
+static void
+metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta,
+    int64_t defer_delta, int64_t space_delta, int64_t dspace_delta)
+{
+	atomic_add_64(&mc->mc_alloc, alloc_delta);
+	atomic_add_64(&mc->mc_deferred, defer_delta);
+	atomic_add_64(&mc->mc_space, space_delta);
+	atomic_add_64(&mc->mc_dspace, dspace_delta);
+}
+
+uint64_t
+metaslab_class_get_alloc(metaslab_class_t *mc)
+{
+	return (mc->mc_alloc);
+}
+
+uint64_t
+metaslab_class_get_deferred(metaslab_class_t *mc)
+{
+	return (mc->mc_deferred);
+}
+
+uint64_t
+metaslab_class_get_space(metaslab_class_t *mc)
+{
+	return (mc->mc_space);
+}
+
+uint64_t
+metaslab_class_get_dspace(metaslab_class_t *mc)
+{
+	return (spa_deflate(mc->mc_spa) ?
+	    mc->mc_dspace : mc->mc_space);
+}
+
+void
+metaslab_class_histogram_verify(metaslab_class_t *mc)
+{
+	spa_t *spa = mc->mc_spa;
+	vdev_t *rvd = spa->spa_root_vdev;
+	uint64_t *mc_hist;
+	int i;
+
+	if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0)
+		return;
+
+	mc_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE,
+	    KM_SLEEP);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		/*
+		 * Skip any holes, uninitialized top-levels, or
+		 * vdevs that are not in this metaslab class.
+		 */
+		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+			mc_hist[i] += mg->mg_histogram[i];
+	}
+
+	for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++)
+		VERIFY3U(mc_hist[i], ==, mc->mc_histogram[i]);
+
+	kmem_free(mc_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE);
+}
+
+/*
+ * Calculate the metaslab class's fragmentation metric. The metric
+ * is weighted based on the space contribution of each metaslab group.
+ * The return value will be a number between 0 and 100 (inclusive), or
+ * ZFS_FRAG_INVALID if the metric has not been set. See comment above the
+ * zfs_frag_table for more information about the metric.
+ */
+uint64_t
+metaslab_class_fragmentation(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t fragmentation = 0;
+
+	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		/*
+		 * Skip any holes, uninitialized top-levels,
+		 * or vdevs that are not in this metaslab class.
+		 */
+		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		/*
+		 * If a metaslab group does not contain a fragmentation
+		 * metric then just bail out.
+		 */
+		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
+			spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+			return (ZFS_FRAG_INVALID);
+		}
+
+		/*
+		 * Determine how much this metaslab_group is contributing
+		 * to the overall pool fragmentation metric.
+		 */
+		fragmentation += mg->mg_fragmentation *
+		    metaslab_group_get_space(mg);
+	}
+	fragmentation /= metaslab_class_get_space(mc);
+
+	ASSERT3U(fragmentation, <=, 100);
+	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+	return (fragmentation);
+}
+
+/*
+ * Calculate the amount of expandable space that is available in
+ * this metaslab class. If a device is expanded then its expandable
+ * space will be the amount of allocatable space that is currently not
+ * part of this metaslab class.
+ */
+uint64_t
+metaslab_class_expandable_space(metaslab_class_t *mc)
+{
+	vdev_t *rvd = mc->mc_spa->spa_root_vdev;
+	uint64_t space = 0;
+
+	spa_config_enter(mc->mc_spa, SCL_VDEV, FTAG, RW_READER);
+	for (int c = 0; c < rvd->vdev_children; c++) {
+		vdev_t *tvd = rvd->vdev_child[c];
+		metaslab_group_t *mg = tvd->vdev_mg;
+
+		if (!vdev_is_concrete(tvd) || tvd->vdev_ms_shift == 0 ||
+		    mg->mg_class != mc) {
+			continue;
+		}
+
+		/*
+		 * Calculate if we have enough space to add additional
+		 * metaslabs. We report the expandable space in terms
+		 * of the metaslab size since that's the unit of expansion.
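+		 *
+		 * For example (illustrative numbers, not taken from any
+		 * real pool): a vdev with vdev_max_asize = 1100GB,
+		 * vdev_asize = 1000GB, and 16GB metaslabs
+		 * (vdev_ms_shift = 34) reports
+		 * P2ALIGN(100GB, 16GB) = 96GB of expandable space;
+		 * the 4GB remainder cannot hold a full metaslab.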
+		 */
+		space += P2ALIGN(tvd->vdev_max_asize - tvd->vdev_asize,
+		    1ULL << tvd->vdev_ms_shift);
+	}
+	spa_config_exit(mc->mc_spa, SCL_VDEV, FTAG);
+	return (space);
+}
+
+void
+metaslab_class_evict_old(metaslab_class_t *mc, uint64_t txg)
+{
+	multilist_t *ml = mc->mc_metaslab_txg_list;
+	for (int i = 0; i < multilist_get_num_sublists(ml); i++) {
+		multilist_sublist_t *mls = multilist_sublist_lock(ml, i);
+		metaslab_t *msp = multilist_sublist_head(mls);
+		multilist_sublist_unlock(mls);
+		while (msp != NULL) {
+			mutex_enter(&msp->ms_lock);
+
+			/*
+			 * If the metaslab has been removed from the list
+			 * (which could happen if we were at the memory limit
+			 * and it was evicted during this loop), then we can't
+			 * proceed and we should restart the sublist.
+			 */
+			if (!multilist_link_active(&msp->ms_class_txg_node)) {
+				mutex_exit(&msp->ms_lock);
+				i--;
+				break;
+			}
+			mls = multilist_sublist_lock(ml, i);
+			metaslab_t *next_msp = multilist_sublist_next(mls, msp);
+			multilist_sublist_unlock(mls);
+			if (txg >
+			    msp->ms_selected_txg + metaslab_unload_delay &&
+			    gethrtime() > msp->ms_selected_time +
+			    (uint64_t)MSEC2NSEC(metaslab_unload_delay_ms)) {
+				metaslab_evict(msp, txg);
+			} else {
+				/*
+				 * Once we've hit a metaslab selected too
+				 * recently to evict, we're done evicting for
+				 * now.
+				 */
+				mutex_exit(&msp->ms_lock);
+				break;
+			}
+			mutex_exit(&msp->ms_lock);
+			msp = next_msp;
+		}
+	}
+}
+
+static int
+metaslab_compare(const void *x1, const void *x2)
+{
+	const metaslab_t *m1 = (const metaslab_t *)x1;
+	const metaslab_t *m2 = (const metaslab_t *)x2;
+
+	int sort1 = 0;
+	int sort2 = 0;
+	if (m1->ms_allocator != -1 && m1->ms_primary)
+		sort1 = 1;
+	else if (m1->ms_allocator != -1 && !m1->ms_primary)
+		sort1 = 2;
+	if (m2->ms_allocator != -1 && m2->ms_primary)
+		sort2 = 1;
+	else if (m2->ms_allocator != -1 && !m2->ms_primary)
+		sort2 = 2;
+
+	/*
+	 * Sort inactive metaslabs first, then primaries, then secondaries.
+	 * When selecting a metaslab to allocate from, an allocator first
+	 * tries its primary, then secondary active metaslab. If it doesn't
+	 * have active metaslabs, or can't allocate from them, it searches
+	 * for an inactive metaslab to activate. If it can't find a suitable
+	 * one, it will steal a primary or secondary metaslab from another
+	 * allocator.
+	 */
+	if (sort1 < sort2)
+		return (-1);
+	if (sort1 > sort2)
+		return (1);
+
+	int cmp = TREE_CMP(m2->ms_weight, m1->ms_weight);
+	if (likely(cmp))
+		return (cmp);
+
+	IMPLY(TREE_CMP(m1->ms_start, m2->ms_start) == 0, m1 == m2);
+
+	return (TREE_CMP(m1->ms_start, m2->ms_start));
+}
+
+/*
+ * ==========================================================================
+ * Metaslab groups
+ * ==========================================================================
+ */
+/*
+ * Update the allocatable flag and the metaslab group's capacity.
+ * The allocatable flag is set to true if the group's free capacity is
+ * greater than the zfs_mg_noalloc_threshold and its fragmentation metric
+ * (when valid) does not exceed zfs_mg_fragmentation_threshold. If a
+ * metaslab group transitions from allocatable to non-allocatable or
+ * vice versa then the metaslab group's class is updated to reflect the
+ * transition.
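+ *
+ * As a rough sketch of the predicate computed below (simplified; the
+ * real code also handles locking and per-class group-count bookkeeping):
+ *
+ *	allocatable = activation_count > 0 &&
+ *	    free_capacity > zfs_mg_noalloc_threshold &&
+ *	    (fragmentation == ZFS_FRAG_INVALID ||
+ *	    fragmentation <= zfs_mg_fragmentation_threshold)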
+ */ +static void +metaslab_group_alloc_update(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + metaslab_class_t *mc = mg->mg_class; + vdev_stat_t *vs = &vd->vdev_stat; + boolean_t was_allocatable; + boolean_t was_initialized; + + ASSERT(vd == vd->vdev_top); + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_READER), ==, + SCL_ALLOC); + + mutex_enter(&mg->mg_lock); + was_allocatable = mg->mg_allocatable; + was_initialized = mg->mg_initialized; + + mg->mg_free_capacity = ((vs->vs_space - vs->vs_alloc) * 100) / + (vs->vs_space + 1); + + mutex_enter(&mc->mc_lock); + + /* + * If the metaslab group was just added then it won't + * have any space until we finish syncing out this txg. + * At that point we will consider it initialized and available + * for allocations. We also don't consider non-activated + * metaslab groups (e.g. vdevs that are in the middle of being removed) + * to be initialized, because they can't be used for allocation. + */ + mg->mg_initialized = metaslab_group_initialized(mg); + if (!was_initialized && mg->mg_initialized) { + mc->mc_groups++; + } else if (was_initialized && !mg->mg_initialized) { + ASSERT3U(mc->mc_groups, >, 0); + mc->mc_groups--; + } + if (mg->mg_initialized) + mg->mg_no_free_space = B_FALSE; + + /* + * A metaslab group is considered allocatable if it has plenty + * of free space or is not heavily fragmented. We only take + * fragmentation into account if the metaslab group has a valid + * fragmentation metric (i.e. a value between 0 and 100). + */ + mg->mg_allocatable = (mg->mg_activation_count > 0 && + mg->mg_free_capacity > zfs_mg_noalloc_threshold && + (mg->mg_fragmentation == ZFS_FRAG_INVALID || + mg->mg_fragmentation <= zfs_mg_fragmentation_threshold)); + + /* + * The mc_alloc_groups maintains a count of the number of + * groups in this metaslab class that are still above the + * zfs_mg_noalloc_threshold. This is used by the allocating + * threads to determine if they should avoid allocations to + * a given group. The allocator will avoid allocations to a group + * if that group has reached or is below the zfs_mg_noalloc_threshold + * and there are still other groups that are above the threshold. + * When a group transitions from allocatable to non-allocatable or + * vice versa we update the metaslab class to reflect that change. + * When the mc_alloc_groups value drops to 0 that means that all + * groups have reached the zfs_mg_noalloc_threshold making all groups + * eligible for allocations. This effectively means that all devices + * are balanced again. 
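+	 *
+	 * For example, in a class with three groups where A and B have
+	 * dropped to the threshold (mc_alloc_groups == 1), allocations
+	 * concentrate on C; once C drops as well (mc_alloc_groups == 0),
+	 * all three groups become eligible again.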
+ */ + if (was_allocatable && !mg->mg_allocatable) + mc->mc_alloc_groups--; + else if (!was_allocatable && mg->mg_allocatable) + mc->mc_alloc_groups++; + mutex_exit(&mc->mc_lock); + + mutex_exit(&mg->mg_lock); +} + +int +metaslab_sort_by_flushed(const void *va, const void *vb) +{ + const metaslab_t *a = va; + const metaslab_t *b = vb; + + int cmp = TREE_CMP(a->ms_unflushed_txg, b->ms_unflushed_txg); + if (likely(cmp)) + return (cmp); + + uint64_t a_vdev_id = a->ms_group->mg_vd->vdev_id; + uint64_t b_vdev_id = b->ms_group->mg_vd->vdev_id; + cmp = TREE_CMP(a_vdev_id, b_vdev_id); + if (cmp) + return (cmp); + + return (TREE_CMP(a->ms_id, b->ms_id)); +} + +metaslab_group_t * +metaslab_group_create(metaslab_class_t *mc, vdev_t *vd, int allocators) +{ + metaslab_group_t *mg; + + mg = kmem_zalloc(sizeof (metaslab_group_t), KM_SLEEP); + mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&mg->mg_ms_disabled_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mg->mg_ms_disabled_cv, NULL, CV_DEFAULT, NULL); + avl_create(&mg->mg_metaslab_tree, metaslab_compare, + sizeof (metaslab_t), offsetof(metaslab_t, ms_group_node)); + mg->mg_vd = vd; + mg->mg_class = mc; + mg->mg_activation_count = 0; + mg->mg_initialized = B_FALSE; + mg->mg_no_free_space = B_TRUE; + mg->mg_allocators = allocators; + + mg->mg_allocator = kmem_zalloc(allocators * + sizeof (metaslab_group_allocator_t), KM_SLEEP); + for (int i = 0; i < allocators; i++) { + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + zfs_refcount_create_tracked(&mga->mga_alloc_queue_depth); + } + + mg->mg_taskq = taskq_create("metaslab_group_taskq", metaslab_load_pct, + maxclsyspri, 10, INT_MAX, TASKQ_THREADS_CPU_PCT | TASKQ_DYNAMIC); + + return (mg); +} + +void +metaslab_group_destroy(metaslab_group_t *mg) +{ + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. + */ + ASSERT(mg->mg_activation_count <= 0); + + taskq_destroy(mg->mg_taskq); + avl_destroy(&mg->mg_metaslab_tree); + mutex_destroy(&mg->mg_lock); + mutex_destroy(&mg->mg_ms_disabled_lock); + cv_destroy(&mg->mg_ms_disabled_cv); + + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + zfs_refcount_destroy(&mga->mga_alloc_queue_depth); + } + kmem_free(mg->mg_allocator, mg->mg_allocators * + sizeof (metaslab_group_allocator_t)); + + kmem_free(mg, sizeof (metaslab_group_t)); +} + +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT3U(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER), !=, 0); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + metaslab_group_alloc_update(mg); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; +} + +/* + * Passivate a metaslab group and remove it from the allocation rotor. + * Callers must hold both the SCL_ALLOC and SCL_ZIO lock prior to passivating + * a metaslab group. 
This function will momentarily drop spa_config_locks + * that are lower than the SCL_ALLOC lock (see comment below). + */ +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + spa_t *spa = mc->mc_spa; + metaslab_group_t *mgprev, *mgnext; + int locks = spa_config_held(spa, SCL_ALL, RW_WRITER); + + ASSERT3U(spa_config_held(spa, SCL_ALLOC | SCL_ZIO, RW_WRITER), ==, + (SCL_ALLOC | SCL_ZIO)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + /* + * The spa_config_lock is an array of rwlocks, ordered as + * follows (from highest to lowest): + * SCL_CONFIG > SCL_STATE > SCL_L2ARC > SCL_ALLOC > + * SCL_ZIO > SCL_FREE > SCL_VDEV + * (For more information about the spa_config_lock see spa_misc.c) + * The higher the lock, the broader its coverage. When we passivate + * a metaslab group, we must hold both the SCL_ALLOC and the SCL_ZIO + * config locks. However, the metaslab group's taskq might be trying + * to preload metaslabs so we must drop the SCL_ZIO lock and any + * lower locks to allow the I/O to complete. At a minimum, + * we continue to hold the SCL_ALLOC lock, which prevents any future + * allocations from taking place and any changes to the vdev tree. + */ + spa_config_exit(spa, locks & ~(SCL_ZIO - 1), spa); + taskq_wait_outstanding(mg->mg_taskq, 0); + spa_config_enter(spa, locks & ~(SCL_ZIO - 1), spa, RW_WRITER); + metaslab_group_alloc_update(mg); + for (int i = 0; i < mg->mg_allocators; i++) { + metaslab_group_allocator_t *mga = &mg->mg_allocator[i]; + metaslab_t *msp = mga->mga_primary; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + msp = mga->mga_secondary; + if (msp != NULL) { + mutex_enter(&msp->ms_lock); + metaslab_passivate(msp, + metaslab_weight_from_range_tree(msp)); + mutex_exit(&msp->ms_lock); + } + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; +} + +boolean_t +metaslab_group_initialized(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + vdev_stat_t *vs = &vd->vdev_stat; + + return (vs->vs_space != 0 && mg->mg_activation_count > 0); +} + +uint64_t +metaslab_group_get_space(metaslab_group_t *mg) +{ + return ((1ULL << mg->mg_vd->vdev_ms_shift) * mg->mg_vd->vdev_ms_count); +} + +void +metaslab_group_histogram_verify(metaslab_group_t *mg) +{ + uint64_t *mg_hist; + vdev_t *vd = mg->mg_vd; + uint64_t ashift = vd->vdev_ashift; + int i; + + if ((zfs_flags & ZFS_DEBUG_HISTOGRAM_VERIFY) == 0) + return; + + mg_hist = kmem_zalloc(sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE, + KM_SLEEP); + + ASSERT3U(RANGE_TREE_HISTOGRAM_SIZE, >=, + SPACE_MAP_HISTOGRAM_SIZE + ashift); + + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + /* skip if not active or not a member */ + if (msp->ms_sm == NULL || msp->ms_group != mg) + continue; + + for (i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) + mg_hist[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i ++) + VERIFY3U(mg_hist[i], ==, mg->mg_histogram[i]); + + kmem_free(mg_hist, sizeof (uint64_t) * RANGE_TREE_HISTOGRAM_SIZE); +} + +static void +metaslab_group_histogram_add(metaslab_group_t 
*mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + mg->mg_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] += + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +void +metaslab_group_histogram_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + metaslab_class_t *mc = mg->mg_class; + uint64_t ashift = mg->mg_vd->vdev_ashift; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + if (msp->ms_sm == NULL) + return; + + mutex_enter(&mg->mg_lock); + for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) { + ASSERT3U(mg->mg_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + ASSERT3U(mc->mc_histogram[i + ashift], >=, + msp->ms_sm->sm_phys->smp_histogram[i]); + + mg->mg_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + mc->mc_histogram[i + ashift] -= + msp->ms_sm->sm_phys->smp_histogram[i]; + } + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) +{ + ASSERT(msp->ms_group == NULL); + mutex_enter(&mg->mg_lock); + msp->ms_group = mg; + msp->ms_weight = 0; + avl_add(&mg->mg_metaslab_tree, msp); + mutex_exit(&mg->mg_lock); + + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_add(mg, msp); + mutex_exit(&msp->ms_lock); +} + +static void +metaslab_group_remove(metaslab_group_t *mg, metaslab_t *msp) +{ + mutex_enter(&msp->ms_lock); + metaslab_group_histogram_remove(mg, msp); + mutex_exit(&msp->ms_lock); + + mutex_enter(&mg->mg_lock); + ASSERT(msp->ms_group == mg); + avl_remove(&mg->mg_metaslab_tree, msp); + + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (multilist_link_active(&msp->ms_class_txg_node)) + multilist_sublist_remove(mls, msp); + multilist_sublist_unlock(mls); + + msp->ms_group = NULL; + mutex_exit(&mg->mg_lock); +} + +static void +metaslab_group_sort_impl(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(MUTEX_HELD(&mg->mg_lock)); + ASSERT(msp->ms_group == mg); + + avl_remove(&mg->mg_metaslab_tree, msp); + msp->ms_weight = weight; + avl_add(&mg->mg_metaslab_tree, msp); + +} + +static void +metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) +{ + /* + * Although in principle the weight can be any value, in + * practice we do not use values in the range [1, 511]. + */ + ASSERT(weight >= SPA_MINBLOCKSIZE || weight == 0); + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + mutex_enter(&mg->mg_lock); + metaslab_group_sort_impl(mg, msp, weight); + mutex_exit(&mg->mg_lock); +} + +/* + * Calculate the fragmentation for a given metaslab group. We can use + * a simple average here since all metaslabs within the group must have + * the same size. The return value will be a value between 0 and 100 + * (inclusive), or ZFS_FRAG_INVALID if less than half of the metaslab in this + * group have a fragmentation metric. 
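+ *
+ * Illustrative example: in a group with 100 metaslabs where 60 have
+ * valid fragmentation metrics averaging 24%, this returns 24; with
+ * only 40 valid metaslabs it returns ZFS_FRAG_INVALID instead.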
+ */
+uint64_t
+metaslab_group_fragmentation(metaslab_group_t *mg)
+{
+	vdev_t *vd = mg->mg_vd;
+	uint64_t fragmentation = 0;
+	uint64_t valid_ms = 0;
+
+	for (int m = 0; m < vd->vdev_ms_count; m++) {
+		metaslab_t *msp = vd->vdev_ms[m];
+
+		if (msp->ms_fragmentation == ZFS_FRAG_INVALID)
+			continue;
+		if (msp->ms_group != mg)
+			continue;
+
+		valid_ms++;
+		fragmentation += msp->ms_fragmentation;
+	}
+
+	if (valid_ms <= mg->mg_vd->vdev_ms_count / 2)
+		return (ZFS_FRAG_INVALID);
+
+	fragmentation /= valid_ms;
+	ASSERT3U(fragmentation, <=, 100);
+	return (fragmentation);
+}
+
+/*
+ * Determine if a given metaslab group should skip allocations. A metaslab
+ * group should avoid allocations if its free capacity is less than the
+ * zfs_mg_noalloc_threshold or its fragmentation metric is greater than
+ * zfs_mg_fragmentation_threshold and there is at least one metaslab group
+ * that can still handle allocations. If the allocation throttle is enabled
+ * then we skip allocations to devices that have reached their maximum
+ * allocation queue depth unless the selected metaslab group is the only
+ * eligible group remaining.
+ */
+static boolean_t
+metaslab_group_allocatable(metaslab_group_t *mg, metaslab_group_t *rotor,
+    uint64_t psize, int allocator, int d)
+{
+	spa_t *spa = mg->mg_vd->vdev_spa;
+	metaslab_class_t *mc = mg->mg_class;
+
+	/*
+	 * We can only consider skipping this metaslab group if it's
+	 * in the normal, special, or dedup metaslab class and there are
+	 * other metaslab groups to select from. Otherwise, we always
+	 * consider it eligible for allocations.
+	 */
+	if ((mc != spa_normal_class(spa) &&
+	    mc != spa_special_class(spa) &&
+	    mc != spa_dedup_class(spa)) ||
+	    mc->mc_groups <= 1)
+		return (B_TRUE);
+
+	/*
+	 * If the metaslab group's mg_allocatable flag is set (see comments
+	 * in metaslab_group_alloc_update() for more information) and
+	 * the allocation throttle is disabled then allow allocations to this
+	 * device. However, if the allocation throttle is enabled then
+	 * check if we have reached our allocation limit (mg_alloc_queue_depth)
+	 * to determine if we should allow allocations to this metaslab group.
+	 * If all metaslab groups are no longer considered allocatable
+	 * (mc_alloc_groups == 0) or we're trying to allocate the smallest
+	 * gang block size then we allow allocations on this metaslab group
+	 * regardless of the mg_allocatable or throttle settings.
+	 */
+	if (mg->mg_allocatable) {
+		metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator];
+		int64_t qdepth;
+		uint64_t qmax = mga->mga_cur_max_alloc_queue_depth;
+
+		if (!mc->mc_alloc_throttle_enabled)
+			return (B_TRUE);
+
+		/*
+		 * If this metaslab group does not have any free space, then
+		 * there is no point in looking further.
+		 */
+		if (mg->mg_no_free_space)
+			return (B_FALSE);
+
+		/*
+		 * Relax allocation throttling for ditto blocks. Due to
+		 * random imbalances in allocation it tends to push copies
+		 * to one vdev, which looks a bit better at the moment.
+		 */
+		qmax = qmax * (4 + d) / 4;
+
+		qdepth = zfs_refcount_count(&mga->mga_alloc_queue_depth);
+
+		/*
+		 * If this metaslab group is below its qmax or it's
+		 * the only allocatable metaslab group, then attempt
+		 * to allocate from it.
+		 */
+		if (qdepth < qmax || mc->mc_alloc_groups == 1)
+			return (B_TRUE);
+		ASSERT3U(mc->mc_alloc_groups, >, 1);
+
+		/*
+		 * Since this metaslab group is at or over its qmax, we
+		 * need to determine if there are metaslab groups after this
+		 * one that might be able to handle this allocation.
This is + * racy since we can't hold the locks for all metaslab + * groups at the same time when we make this check. + */ + for (metaslab_group_t *mgp = mg->mg_next; + mgp != rotor; mgp = mgp->mg_next) { + metaslab_group_allocator_t *mgap = + &mgp->mg_allocator[allocator]; + qmax = mgap->mga_cur_max_alloc_queue_depth; + qmax = qmax * (4 + d) / 4; + qdepth = + zfs_refcount_count(&mgap->mga_alloc_queue_depth); + + /* + * If there is another metaslab group that + * might be able to handle the allocation, then + * we return false so that we skip this group. + */ + if (qdepth < qmax && !mgp->mg_no_free_space) + return (B_FALSE); + } + + /* + * We didn't find another group to handle the allocation + * so we can't skip this metaslab group even though + * we are at or over our qmax. + */ + return (B_TRUE); + + } else if (mc->mc_alloc_groups == 0 || psize == SPA_MINBLOCKSIZE) { + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * ========================================================================== + * Range tree callbacks + * ========================================================================== + */ + +/* + * Comparison function for the private size-ordered tree using 32-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. + */ +static int +metaslab_rangesize32_compare(const void *x1, const void *x2) +{ + const range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(r1->rs_start, r2->rs_start)); +} + +/* + * Comparison function for the private size-ordered tree using 64-bit + * ranges. Tree is sorted by size, larger sizes at the end of the tree. + */ +static int +metaslab_rangesize64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + uint64_t rs_size1 = r1->rs_end - r1->rs_start; + uint64_t rs_size2 = r2->rs_end - r2->rs_start; + + int cmp = TREE_CMP(rs_size1, rs_size2); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(r1->rs_start, r2->rs_start)); +} +typedef struct metaslab_rt_arg { + zfs_btree_t *mra_bt; + uint32_t mra_floor_shift; +} metaslab_rt_arg_t; + +struct mssa_arg { + range_tree_t *rt; + metaslab_rt_arg_t *mra; +}; + +static void +metaslab_size_sorted_add(void *arg, uint64_t start, uint64_t size) +{ + struct mssa_arg *mssap = arg; + range_tree_t *rt = mssap->rt; + metaslab_rt_arg_t *mrap = mssap->mra; + range_seg_max_t seg = {0}; + rs_set_start(&seg, rt, start); + rs_set_end(&seg, rt, start + size); + metaslab_rt_add(rt, &seg, mrap); +} + +static void +metaslab_size_tree_full_load(range_tree_t *rt) +{ + metaslab_rt_arg_t *mrap = rt->rt_arg; +#ifdef _METASLAB_TRACING + METASLABSTAT_BUMP(metaslabstat_reload_tree); +#endif + ASSERT0(zfs_btree_numnodes(mrap->mra_bt)); + mrap->mra_floor_shift = 0; + struct mssa_arg arg = {0}; + arg.rt = rt; + arg.mra = mrap; + range_tree_walk(rt, metaslab_size_sorted_add, &arg); +} + +/* + * Create any block allocator specific components. The current allocators + * rely on using both a size-ordered range_tree_t and an array of uint64_t's. 
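+ *
+ * For example, under the size-ordered comparison used by these trees,
+ * the segments [0, 4K) and [8K, 16K) both sort before [1M, 2M), and
+ * segments of equal size fall back to ordering by start offset, as in
+ * the comparators above.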
+ */ +/* ARGSUSED */ +static void +metaslab_rt_create(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + size_t size; + int (*compare) (const void *, const void *); + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = metaslab_rangesize32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = metaslab_rangesize64_compare; + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, compare, size); + mrap->mra_floor_shift = metaslab_by_size_min_shift; +} + +/* ARGSUSED */ +static void +metaslab_rt_destroy(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + zfs_btree_destroy(size_tree); + kmem_free(mrap, sizeof (*mrap)); +} + +/* ARGSUSED */ +static void +metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < + (1 << mrap->mra_floor_shift)) + return; + + zfs_btree_add(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + + if (rs_get_end(rs, rt) - rs_get_start(rs, rt) < (1 << + mrap->mra_floor_shift)) + return; + + zfs_btree_remove(size_tree, rs); +} + +/* ARGSUSED */ +static void +metaslab_rt_vacate(range_tree_t *rt, void *arg) +{ + metaslab_rt_arg_t *mrap = arg; + zfs_btree_t *size_tree = mrap->mra_bt; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + metaslab_rt_create(rt, arg); +} + +static range_tree_ops_t metaslab_rt_ops = { + .rtop_create = metaslab_rt_create, + .rtop_destroy = metaslab_rt_destroy, + .rtop_add = metaslab_rt_add, + .rtop_remove = metaslab_rt_remove, + .rtop_vacate = metaslab_rt_vacate +}; + +/* + * ========================================================================== + * Common allocator routines + * ========================================================================== + */ + +/* + * Return the maximum contiguous segment within the metaslab. + */ +uint64_t +metaslab_largest_allocatable(metaslab_t *msp) +{ + zfs_btree_t *t = &msp->ms_allocatable_by_size; + range_seg_t *rs; + + if (t == NULL) + return (0); + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + + rs = zfs_btree_last(t, NULL); + if (rs == NULL) + return (0); + + return (rs_get_end(rs, msp->ms_allocatable) - rs_get_start(rs, + msp->ms_allocatable)); +} + +/* + * Return the maximum contiguous segment within the unflushed frees of this + * metaslab. + */ +static uint64_t +metaslab_largest_unflushed_free(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (msp->ms_unflushed_frees == NULL) + return (0); + + if (zfs_btree_numnodes(&msp->ms_unflushed_frees_by_size) == 0) + metaslab_size_tree_full_load(msp->ms_unflushed_frees); + range_seg_t *rs = zfs_btree_last(&msp->ms_unflushed_frees_by_size, + NULL); + if (rs == NULL) + return (0); + + /* + * When a range is freed from the metaslab, that range is added to + * both the unflushed frees and the deferred frees. While the block + * will eventually be usable, if the metaslab were loaded the range + * would not be added to the ms_allocatable tree until TXG_DEFER_SIZE + * txgs had passed. 
As a result, when attempting to estimate an upper + * bound for the largest currently-usable free segment in the + * metaslab, we need to not consider any ranges currently in the defer + * trees. This algorithm approximates the largest available chunk in + * the largest range in the unflushed_frees tree by taking the first + * chunk. While this may be a poor estimate, it should only remain so + * briefly and should eventually self-correct as frees are no longer + * deferred. Similar logic applies to the ms_freed tree. See + * metaslab_load() for more details. + * + * There are two primary sources of inaccuracy in this estimate. Both + * are tolerated for performance reasons. The first source is that we + * only check the largest segment for overlaps. Smaller segments may + * have more favorable overlaps with the other trees, resulting in + * larger usable chunks. Second, we only look at the first chunk in + * the largest segment; there may be other usable chunks in the + * largest segment, but we ignore them. + */ + uint64_t rstart = rs_get_start(rs, msp->ms_unflushed_frees); + uint64_t rsize = rs_get_end(rs, msp->ms_unflushed_frees) - rstart; + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_defer[t], rstart, + rsize, &start, &size); + if (found) { + if (rstart == start) + return (0); + rsize = start - rstart; + } + } + + uint64_t start = 0; + uint64_t size = 0; + boolean_t found = range_tree_find_in(msp->ms_freed, rstart, + rsize, &start, &size); + if (found) + rsize = start - rstart; + + return (rsize); +} + +static range_seg_t * +metaslab_block_find(zfs_btree_t *t, range_tree_t *rt, uint64_t start, + uint64_t size, zfs_btree_index_t *where) +{ + range_seg_t *rs; + range_seg_max_t rsearch; + + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, start + size); + + rs = zfs_btree_find(t, &rsearch, where); + if (rs == NULL) { + rs = zfs_btree_next(t, where, where); + } + + return (rs); +} + +#if defined(WITH_DF_BLOCK_ALLOCATOR) || \ + defined(WITH_CF_BLOCK_ALLOCATOR) +/* + * This is a helper function that can be used by the allocator to find a + * suitable block to allocate. This will search the specified B-tree looking + * for a block that matches the specified criteria. + */ +static uint64_t +metaslab_block_picker(range_tree_t *rt, uint64_t *cursor, uint64_t size, + uint64_t max_search) +{ + if (*cursor == 0) + *cursor = rt->rt_start; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; + range_seg_t *rs = metaslab_block_find(bt, rt, *cursor, size, &where); + uint64_t first_found; + int count_searched = 0; + + if (rs != NULL) + first_found = rs_get_start(rs, rt); + + while (rs != NULL && (rs_get_start(rs, rt) - first_found <= + max_search || count_searched < metaslab_min_search_count)) { + uint64_t offset = rs_get_start(rs, rt); + if (offset + size <= rs_get_end(rs, rt)) { + *cursor = offset + size; + return (offset); + } + rs = zfs_btree_next(bt, &where, &where); + count_searched++; + } + + *cursor = 0; + return (-1ULL); +} +#endif /* WITH_DF/CF_BLOCK_ALLOCATOR */ + +#if defined(WITH_DF_BLOCK_ALLOCATOR) +/* + * ========================================================================== + * Dynamic Fit (df) block allocator + * + * Search for a free chunk of at least this size, starting from the last + * offset (for this alignment of block) looking for up to + * metaslab_df_max_search bytes (16MB). 
+ * If a large enough free chunk is not found within 16MB, then return a
+ * free chunk of exactly the requested size (or larger).
+ *
+ * If it seems like searching from the last offset will be unproductive, skip
+ * that and just return a free chunk of exactly the requested size (or
+ * larger). This is based on metaslab_df_alloc_threshold and
+ * metaslab_df_free_pct. This mechanism is probably not very useful and may
+ * be removed in the future.
+ *
+ * The behavior when not searching can be changed to return the largest free
+ * chunk, instead of a free chunk of exactly the requested size, by setting
+ * metaslab_df_use_largest_segment.
+ * ==========================================================================
+ */
+static uint64_t
+metaslab_df_alloc(metaslab_t *msp, uint64_t size)
+{
+	/*
+	 * Find the largest power of 2 block size that evenly divides the
+	 * requested size. This is used to try to allocate blocks with similar
+	 * alignment from the same area of the metaslab (i.e. same cursor
+	 * bucket), but it does not guarantee that allocations of other sizes
+	 * will not occupy the same region.
+	 */
+	uint64_t align = size & -size;
+	uint64_t *cursor = &msp->ms_lbas[highbit64(align) - 1];
+	range_tree_t *rt = msp->ms_allocatable;
+	int free_pct = range_tree_space(rt) * 100 / msp->ms_size;
+	uint64_t offset;
+
+	ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+	/*
+	 * If we're running low on space, find a segment based on size,
+	 * rather than iterating based on offset.
+	 */
+	if (metaslab_largest_allocatable(msp) < metaslab_df_alloc_threshold ||
+	    free_pct < metaslab_df_free_pct) {
+		offset = -1;
+	} else {
+		offset = metaslab_block_picker(rt,
+		    cursor, size, metaslab_df_max_search);
+	}
+
+	if (offset == -1) {
+		range_seg_t *rs;
+		if (zfs_btree_numnodes(&msp->ms_allocatable_by_size) == 0)
+			metaslab_size_tree_full_load(msp->ms_allocatable);
+		if (metaslab_df_use_largest_segment) {
+			/* use largest free segment */
+			rs = zfs_btree_last(&msp->ms_allocatable_by_size, NULL);
+		} else {
+			zfs_btree_index_t where;
+			/* use segment of this size, or next largest */
+#ifdef _METASLAB_TRACING
+			metaslab_rt_arg_t *mrap = msp->ms_allocatable->rt_arg;
+			if (size < (1 << mrap->mra_floor_shift)) {
+				METASLABSTAT_BUMP(
+				    metaslabstat_df_find_under_floor);
+			}
+#endif
+			rs = metaslab_block_find(&msp->ms_allocatable_by_size,
+			    rt, msp->ms_start, size, &where);
+		}
+		if (rs != NULL && rs_get_start(rs, rt) + size <= rs_get_end(rs,
+		    rt)) {
+			offset = rs_get_start(rs, rt);
+			*cursor = offset + size;
+		}
+	}
+
+	return (offset);
+}
+
+static metaslab_ops_t metaslab_df_ops = {
+	metaslab_df_alloc
+};
+
+metaslab_ops_t *zfs_metaslab_ops = &metaslab_df_ops;
+#endif /* WITH_DF_BLOCK_ALLOCATOR */
+
+#if defined(WITH_CF_BLOCK_ALLOCATOR)
+/*
+ * ==========================================================================
+ * Cursor fit block allocator -
+ * Select the largest region in the metaslab, set the cursor to the beginning
+ * of the range and the cursor_end to the end of the range. As allocations
+ * are made advance the cursor. Continue allocating from the cursor until
+ * the range is exhausted and then find a new range.
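+ *
+ * An illustrative walk (assumed sizes, not from the code): with a
+ * largest free region of [1M, 5M), the cursor is set to 1M and
+ * cursor_end to 5M. Three 1M allocations return 1M, 2M, and 3M,
+ * advancing the cursor to 4M; a subsequent 2M request no longer fits
+ * and a new largest region is selected.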
+ * ========================================================================== + */ +static uint64_t +metaslab_cf_alloc(metaslab_t *msp, uint64_t size) +{ + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *t = &msp->ms_allocatable_by_size; + uint64_t *cursor = &msp->ms_lbas[0]; + uint64_t *cursor_end = &msp->ms_lbas[1]; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + ASSERT3U(*cursor_end, >=, *cursor); + + if ((*cursor + size) > *cursor_end) { + range_seg_t *rs; + + if (zfs_btree_numnodes(t) == 0) + metaslab_size_tree_full_load(msp->ms_allocatable); + rs = zfs_btree_last(t, NULL); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < + size) + return (-1ULL); + + *cursor = rs_get_start(rs, rt); + *cursor_end = rs_get_end(rs, rt); + } + + offset = *cursor; + *cursor += size; + + return (offset); +} + +static metaslab_ops_t metaslab_cf_ops = { + metaslab_cf_alloc +}; + +metaslab_ops_t *zfs_metaslab_ops = &metaslab_cf_ops; +#endif /* WITH_CF_BLOCK_ALLOCATOR */ + +#if defined(WITH_NDF_BLOCK_ALLOCATOR) +/* + * ========================================================================== + * New dynamic fit allocator - + * Select a region that is large enough to allocate 2^metaslab_ndf_clump_shift + * contiguous blocks. If no region is found then just use the largest segment + * that remains. + * ========================================================================== + */ + +/* + * Determines desired number of contiguous blocks (2^metaslab_ndf_clump_shift) + * to request from the allocator. + */ +uint64_t metaslab_ndf_clump_shift = 4; + +static uint64_t +metaslab_ndf_alloc(metaslab_t *msp, uint64_t size) +{ + zfs_btree_t *t = &msp->ms_allocatable->rt_root; + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch; + uint64_t hbit = highbit64(size); + uint64_t *cursor = &msp->ms_lbas[hbit - 1]; + uint64_t max_size = metaslab_largest_allocatable(msp); + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if (max_size < size) + return (-1ULL); + + rs_set_start(&rsearch, rt, *cursor); + rs_set_end(&rsearch, rt, *cursor + size); + + rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL || (rs_get_end(rs, rt) - rs_get_start(rs, rt)) < size) { + t = &msp->ms_allocatable_by_size; + + rs_set_start(&rsearch, rt, 0); + rs_set_end(&rsearch, rt, MIN(max_size, 1ULL << (hbit + + metaslab_ndf_clump_shift))); + + rs = zfs_btree_find(t, &rsearch, &where); + if (rs == NULL) + rs = zfs_btree_next(t, &where, &where); + ASSERT(rs != NULL); + } + + if ((rs_get_end(rs, rt) - rs_get_start(rs, rt)) >= size) { + *cursor = rs_get_start(rs, rt) + size; + return (rs_get_start(rs, rt)); + } + return (-1ULL); +} + +static metaslab_ops_t metaslab_ndf_ops = { + metaslab_ndf_alloc +}; + +metaslab_ops_t *zfs_metaslab_ops = &metaslab_ndf_ops; +#endif /* WITH_NDF_BLOCK_ALLOCATOR */ + + +/* + * ========================================================================== + * Metaslabs + * ========================================================================== + */ + +/* + * Wait for any in-progress metaslab loads to complete. + */ +static void +metaslab_load_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_loading) { + ASSERT(!msp->ms_loaded); + cv_wait(&msp->ms_load_cv, &msp->ms_lock); + } +} + +/* + * Wait for any in-progress flushing to complete. 
+ */ +static void +metaslab_flush_wait(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + while (msp->ms_flushing) + cv_wait(&msp->ms_flush_cv, &msp->ms_lock); +} + +static unsigned int +metaslab_idx_func(multilist_t *ml, void *arg) +{ + metaslab_t *msp = arg; + return (msp->ms_id % multilist_get_num_sublists(ml)); +} + +uint64_t +metaslab_allocated_space(metaslab_t *msp) +{ + return (msp->ms_allocated_space); +} + +/* + * Verify that the space accounting on disk matches the in-core range_trees. + */ +static void +metaslab_verify_space(metaslab_t *msp, uint64_t txg) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + uint64_t allocating = 0; + uint64_t sm_free_space, msp_free_space; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(!msp->ms_condensing); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can only verify the metaslab space when we're called + * from syncing context with a loaded metaslab that has an + * allocated space map. Calling this in non-syncing context + * does not provide a consistent view of the metaslab since + * we're performing allocations in the future. + */ + if (txg != spa_syncing_txg(spa) || msp->ms_sm == NULL || + !msp->ms_loaded) + return; + + /* + * Even though the smp_alloc field can get negative, + * when it comes to a metaslab's space map, that should + * never be the case. + */ + ASSERT3S(space_map_allocated(msp->ms_sm), >=, 0); + + ASSERT3U(space_map_allocated(msp->ms_sm), >=, + range_tree_space(msp->ms_unflushed_frees)); + + ASSERT3U(metaslab_allocated_space(msp), ==, + space_map_allocated(msp->ms_sm) + + range_tree_space(msp->ms_unflushed_allocs) - + range_tree_space(msp->ms_unflushed_frees)); + + sm_free_space = msp->ms_size - metaslab_allocated_space(msp); + + /* + * Account for future allocations since we would have + * already deducted that space from the ms_allocatable. + */ + for (int t = 0; t < TXG_CONCURRENT_STATES; t++) { + allocating += + range_tree_space(msp->ms_allocating[(txg + t) & TXG_MASK]); + } + ASSERT3U(allocating + msp->ms_allocated_this_txg, ==, + msp->ms_allocating_total); + + ASSERT3U(msp->ms_deferspace, ==, + range_tree_space(msp->ms_defer[0]) + + range_tree_space(msp->ms_defer[1])); + + msp_free_space = range_tree_space(msp->ms_allocatable) + allocating + + msp->ms_deferspace + range_tree_space(msp->ms_freed); + + VERIFY3U(sm_free_space, ==, msp_free_space); +} + +static void +metaslab_aux_histograms_clear(metaslab_t *msp) +{ + /* + * Auxiliary histograms are only cleared when resetting them, + * which can only happen while the metaslab is loaded. + */ + ASSERT(msp->ms_loaded); + + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) + bzero(msp->ms_deferhist[t], sizeof (msp->ms_deferhist[t])); +} + +static void +metaslab_aux_histogram_add(uint64_t *histogram, uint64_t shift, + range_tree_t *rt) +{ + /* + * This is modeled after space_map_histogram_add(), so refer to that + * function for implementation details. We want this to work like + * the space map histogram, and not the range tree histogram, as we + * are essentially constructing a delta that will be later subtracted + * from the space map histogram. 
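+	 *
+	 * Worked example (illustrative): with shift = 9 and
+	 * SPACE_MAP_HISTOGRAM_SIZE = 32, range-tree buckets 9 through 40
+	 * map one-to-one onto histogram[0..31]. Buckets past the end fold
+	 * into the last index with a size weighting: bucket 41 is counted
+	 * twice (<< 1), bucket 42 four times (<< 2), and so on.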
+ */ + int idx = 0; + for (int i = shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + ASSERT3U(i, >=, idx + shift); + histogram[idx] += rt->rt_histogram[i] << (i - idx - shift); + + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } +} + +/* + * Called at every sync pass that the metaslab gets synced. + * + * The reason is that we want our auxiliary histograms to be updated + * wherever the metaslab's space map histogram is updated. This way + * we stay consistent on which parts of the metaslab space map's + * histogram are currently not available for allocations (e.g because + * they are in the defer, freed, and freeing trees). + */ +static void +metaslab_aux_histograms_update(metaslab_t *msp) +{ + space_map_t *sm = msp->ms_sm; + ASSERT(sm != NULL); + + /* + * This is similar to the metaslab's space map histogram updates + * that take place in metaslab_sync(). The only difference is that + * we only care about segments that haven't made it into the + * ms_allocatable tree yet. + */ + if (msp->ms_loaded) { + metaslab_aux_histograms_clear(msp); + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freed); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + metaslab_aux_histogram_add(msp->ms_deferhist[t], + sm->sm_shift, msp->ms_defer[t]); + } + } + + metaslab_aux_histogram_add(msp->ms_synchist, + sm->sm_shift, msp->ms_freeing); +} + +/* + * Called every time we are done syncing (writing to) the metaslab, + * i.e. at the end of each sync pass. + * [see the comment in metaslab_impl.h for ms_synchist, ms_deferhist] + */ +static void +metaslab_aux_histograms_update_done(metaslab_t *msp, boolean_t defer_allowed) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + space_map_t *sm = msp->ms_sm; + + if (sm == NULL) { + /* + * We came here from metaslab_init() when creating/opening a + * pool, looking at a metaslab that hasn't had any allocations + * yet. + */ + return; + } + + /* + * This is similar to the actions that we take for the ms_freed + * and ms_defer trees in metaslab_sync_done(). + */ + uint64_t hist_index = spa_syncing_txg(spa) % TXG_DEFER_SIZE; + if (defer_allowed) { + bcopy(msp->ms_synchist, msp->ms_deferhist[hist_index], + sizeof (msp->ms_synchist)); + } else { + bzero(msp->ms_deferhist[hist_index], + sizeof (msp->ms_deferhist[hist_index])); + } + bzero(msp->ms_synchist, sizeof (msp->ms_synchist)); +} + +/* + * Ensure that the metaslab's weight and fragmentation are consistent + * with the contents of the histogram (either the range tree's histogram + * or the space map's depending whether the metaslab is loaded). + */ +static void +metaslab_verify_weight_and_frag(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + /* + * We can end up here from vdev_remove_complete(), in which case we + * cannot do these assertions because we hold spa config locks and + * thus we are not allowed to read from the DMU. + * + * We check if the metaslab group has been removed and if that's + * the case we return immediately as that would mean that we are + * here from the aforementioned code path. + */ + if (msp->ms_group == NULL) + return; + + /* + * Devices being removed always return a weight of 0 and leave + * fragmentation and ms_max_size as is - there is nothing for + * us to verify here. 
+ */ + vdev_t *vd = msp->ms_group->mg_vd; + if (vd->vdev_removing) + return; + + /* + * If the metaslab is dirty it probably means that we've done + * some allocations or frees that have changed our histograms + * and thus the weight. + */ + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&vd->vdev_ms_list, msp, t)) + return; + } + + /* + * This verification checks that our in-memory state is consistent + * with what's on disk. If the pool is read-only then there aren't + * any changes and we just have the initially-loaded state. + */ + if (!spa_writeable(msp->ms_group->mg_vd->vdev_spa)) + return; + + /* some extra verification for in-core tree if you can */ + if (msp->ms_loaded) { + range_tree_stat_verify(msp->ms_allocatable); + VERIFY(space_map_histogram_verify(msp->ms_sm, + msp->ms_allocatable)); + } + + uint64_t weight = msp->ms_weight; + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + boolean_t space_based = WEIGHT_IS_SPACEBASED(msp->ms_weight); + uint64_t frag = msp->ms_fragmentation; + uint64_t max_segsize = msp->ms_max_size; + + msp->ms_weight = 0; + msp->ms_fragmentation = 0; + + /* + * This function is used for verification purposes and thus should + * not introduce any side-effects/mutations on the system's state. + * + * Regardless of whether metaslab_weight() thinks this metaslab + * should be active or not, we want to ensure that the actual weight + * (and therefore the value of ms_weight) would be the same if it + * was to be recalculated at this point. + * + * In addition we set the nodirty flag so metaslab_weight() does + * not dirty the metaslab for future TXGs (e.g. when trying to + * force condensing to upgrade the metaslab spacemaps). + */ + msp->ms_weight = metaslab_weight(msp, B_TRUE) | was_active; + + VERIFY3U(max_segsize, ==, msp->ms_max_size); + + /* + * If the weight type changed then there is no point in doing + * verification. Revert fields to their original values. + */ + if ((space_based && !WEIGHT_IS_SPACEBASED(msp->ms_weight)) || + (!space_based && WEIGHT_IS_SPACEBASED(msp->ms_weight))) { + msp->ms_fragmentation = frag; + msp->ms_weight = weight; + return; + } + + VERIFY3U(msp->ms_fragmentation, ==, frag); + VERIFY3U(msp->ms_weight, ==, weight); +} + +/* + * If we're over the zfs_metaslab_mem_limit, select the loaded metaslab from + * this class that was used longest ago, and attempt to unload it. We don't + * want to spend too much time in this loop to prevent performance + * degradation, and we expect that most of the time this operation will + * succeed. Between that and the normal unloading processing during txg sync, + * we expect this to keep the metaslab memory usage under control. 
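+ *
+ * The limit check below compares allmem * zfs_metaslab_mem_limit / 100
+ * against inuse * size. With illustrative numbers -- 16GB of memory, a
+ * 25% limit, and 1.5 million 4K btree leaves in use -- that is 4GB
+ * versus roughly 6GB, so eviction proceeds until usage drops below the
+ * cap.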
+ */ +static void +metaslab_potentially_evict(metaslab_class_t *mc) +{ +#ifdef _KERNEL + uint64_t allmem = arc_all_memory(); + uint64_t inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); + uint64_t size = spl_kmem_cache_entry_size(zfs_btree_leaf_cache); + int tries = 0; + for (; allmem * zfs_metaslab_mem_limit / 100 < inuse * size && + tries < multilist_get_num_sublists(mc->mc_metaslab_txg_list) * 2; + tries++) { + unsigned int idx = multilist_get_random_index( + mc->mc_metaslab_txg_list); + multilist_sublist_t *mls = + multilist_sublist_lock(mc->mc_metaslab_txg_list, idx); + metaslab_t *msp = multilist_sublist_head(mls); + multilist_sublist_unlock(mls); + while (msp != NULL && allmem * zfs_metaslab_mem_limit / 100 < + inuse * size) { + VERIFY3P(mls, ==, multilist_sublist_lock( + mc->mc_metaslab_txg_list, idx)); + ASSERT3U(idx, ==, + metaslab_idx_func(mc->mc_metaslab_txg_list, msp)); + + if (!multilist_link_active(&msp->ms_class_txg_node)) { + multilist_sublist_unlock(mls); + break; + } + metaslab_t *next_msp = multilist_sublist_next(mls, msp); + multilist_sublist_unlock(mls); + /* + * If the metaslab is currently loading there are two + * cases. If it's the metaslab we're evicting, we + * can't continue on or we'll panic when we attempt to + * recursively lock the mutex. If it's another + * metaslab that's loading, it can be safely skipped, + * since we know it's very new and therefore not a + * good eviction candidate. We check later once the + * lock is held that the metaslab is fully loaded + * before actually unloading it. + */ + if (msp->ms_loading) { + msp = next_msp; + inuse = + spl_kmem_cache_inuse(zfs_btree_leaf_cache); + continue; + } + /* + * We can't unload metaslabs with no spacemap because + * they're not ready to be unloaded yet. We can't + * unload metaslabs with outstanding allocations + * because doing so could cause the metaslab's weight + * to decrease while it's unloaded, which violates an + * invariant that we use to prevent unnecessary + * loading. We also don't unload metaslabs that are + * currently active because they are high-weight + * metaslabs that are likely to be used in the near + * future. + */ + mutex_enter(&msp->ms_lock); + if (msp->ms_allocator == -1 && msp->ms_sm != NULL && + msp->ms_allocating_total == 0) { + metaslab_unload(msp); + } + mutex_exit(&msp->ms_lock); + msp = next_msp; + inuse = spl_kmem_cache_inuse(zfs_btree_leaf_cache); + } + } +#endif +} + +static int +metaslab_load_impl(metaslab_t *msp) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + ASSERT(msp->ms_loading); + ASSERT(!msp->ms_condensing); + + /* + * We temporarily drop the lock to unblock other operations while we + * are reading the space map. Therefore, metaslab_sync() and + * metaslab_sync_done() can run at the same time as we do. + * + * If we are using the log space maps, metaslab_sync() can't write to + * the metaslab's space map while we are loading as we only write to + * it when we are flushing the metaslab, and that can't happen while + * we are loading it. + * + * If we are not using log space maps though, metaslab_sync() can + * append to the space map while we are loading. Therefore we load + * only entries that existed when we started the load. Additionally, + * metaslab_sync_done() has to wait for the load to complete because + * there are potential races like metaslab_load() loading parts of the + * space map that are currently being appended by metaslab_sync(). 
If
+ * we didn't wait, the ms_allocatable would have entries that
+ * metaslab_sync_done() would try to re-add later.
+ *
+ * That's why before dropping the lock we remember the synced length
+ * of the metaslab and read up to that point of the space map,
+ * ignoring entries appended by metaslab_sync() that happen after we
+ * drop the lock.
+ */
+ uint64_t length = msp->ms_synced_length;
+ mutex_exit(&msp->ms_lock);
+
+ hrtime_t load_start = gethrtime();
+ metaslab_rt_arg_t *mrap;
+ if (msp->ms_allocatable->rt_arg == NULL) {
+ mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP);
+ } else {
+ mrap = msp->ms_allocatable->rt_arg;
+ msp->ms_allocatable->rt_ops = NULL;
+ msp->ms_allocatable->rt_arg = NULL;
+ }
+ mrap->mra_bt = &msp->ms_allocatable_by_size;
+ mrap->mra_floor_shift = metaslab_by_size_min_shift;
+
+ if (msp->ms_sm != NULL) {
+ error = space_map_load_length(msp->ms_sm, msp->ms_allocatable,
+ SM_FREE, length);
+
+ /* Now, populate the size-sorted tree. */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+
+ struct mssa_arg arg = {0};
+ arg.rt = msp->ms_allocatable;
+ arg.mra = mrap;
+ range_tree_walk(msp->ms_allocatable, metaslab_size_sorted_add,
+ &arg);
+ } else {
+ /*
+ * Add the size-sorted tree first, since we don't need to load
+ * the metaslab from the spacemap.
+ */
+ metaslab_rt_create(msp->ms_allocatable, mrap);
+ msp->ms_allocatable->rt_ops = &metaslab_rt_ops;
+ msp->ms_allocatable->rt_arg = mrap;
+ /*
+ * The space map has not been allocated yet, so treat
+ * all the space in the metaslab as free and add it to the
+ * ms_allocatable tree.
+ */
+ range_tree_add(msp->ms_allocatable,
+ msp->ms_start, msp->ms_size);
+
+ if (msp->ms_freed != NULL) {
+ /*
+ * If the ms_sm doesn't exist, this means that this
+ * metaslab hasn't gone through metaslab_sync() and
+ * thus has never been dirtied. So we shouldn't
+ * expect any unflushed allocs or frees from previous
+ * TXGs.
+ *
+ * Note: ms_freed, and all the other trees except for
+ * ms_allocatable, can be NULL at this point only
+ * if this is a new metaslab of a vdev that just got
+ * expanded.
+ */
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+ }
+ }
+
+ /*
+ * We need to grab the ms_sync_lock to prevent metaslab_sync() from
+ * changing the ms_sm (or log_sm) and the metaslab's range trees
+ * while we are about to use them and populate the ms_allocatable.
+ * The ms_lock is insufficient for this because metaslab_sync() doesn't
+ * hold the ms_lock while writing the ms_checkpointing tree to disk.
+ */
+ mutex_enter(&msp->ms_sync_lock);
+ mutex_enter(&msp->ms_lock);
+
+ ASSERT(!msp->ms_condensing);
+ ASSERT(!msp->ms_flushing);
+
+ if (error != 0) {
+ mutex_exit(&msp->ms_sync_lock);
+ return (error);
+ }
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ msp->ms_loaded = B_TRUE;
+
+ /*
+ * Apply all the unflushed changes to ms_allocatable right
+ * away so any manipulations we do below have a clear view
+ * of what is allocated and what is free.
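+ *
+ * As a sketch of the two walks below, the net effect is:
+ * ms_allocatable = (synced free space - ms_unflushed_allocs)
+ * + ms_unflushed_frees.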
+ */
+ range_tree_walk(msp->ms_unflushed_allocs,
+ range_tree_remove, msp->ms_allocatable);
+ range_tree_walk(msp->ms_unflushed_frees,
+ range_tree_add, msp->ms_allocatable);
+
+ ASSERT3P(msp->ms_group, !=, NULL);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ if (spa_syncing_log_sm(spa) != NULL) {
+ ASSERT(spa_feature_is_enabled(spa,
+ SPA_FEATURE_LOG_SPACEMAP));
+
+ /*
+ * If we use a log space map we add all the segments
+ * that are in ms_unflushed_frees so they are available
+ * for allocation.
+ *
+ * ms_allocatable needs to contain all free segments
+ * that are ready for allocations (thus not segments
+ * from ms_freeing, ms_freed, and the ms_defer trees).
+ * But if we grab the lock in this code path at a sync
+ * pass later than 1, then it also contains the
+ * segments of ms_freed (they were added to it earlier
+ * in this path through ms_unflushed_frees). So we
+ * need to remove all the segments that exist in
+ * ms_freed from ms_allocatable as they will be added
+ * later in metaslab_sync_done().
+ *
+ * When there's no log space map, the ms_allocatable
+ * correctly doesn't contain any segments that exist
+ * in ms_freed [see ms_synced_length].
+ */
+ range_tree_walk(msp->ms_freed,
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * If we are not using the log space map, ms_allocatable
+ * contains the segments that exist in the ms_defer trees
+ * [see ms_synced_length]. Thus we need to remove them
+ * from ms_allocatable as they will be added again in
+ * metaslab_sync_done().
+ *
+ * If we are using the log space map, ms_allocatable still
+ * contains the segments that exist in the ms_defer trees.
+ * This is not because it read them through the ms_sm, but
+ * because these segments are part of ms_unflushed_frees,
+ * which we added to ms_allocatable earlier in this
+ * code path.
+ */
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_remove, msp->ms_allocatable);
+ }
+
+ /*
+ * Call metaslab_recalculate_weight_and_sort() now that the
+ * metaslab is loaded so we get the metaslab's real weight.
+ *
+ * Unless this metaslab was created with older software and
+ * has not yet been converted to use segment-based weight, we
+ * expect the new weight to be better than or equal to the weight
+ * that the metaslab had while it was not loaded. This is
+ * because the old weight does not take into account the
+ * consolidation of adjacent segments between TXGs. [see
+ * comment for ms_synchist and ms_deferhist[] for more info]
+ */
+ uint64_t weight = msp->ms_weight;
+ uint64_t max_size = msp->ms_max_size;
+ metaslab_recalculate_weight_and_sort(msp);
+ if (!WEIGHT_IS_SPACEBASED(weight))
+ ASSERT3U(weight, <=, msp->ms_weight);
+ msp->ms_max_size = metaslab_largest_allocatable(msp);
+ ASSERT3U(max_size, <=, msp->ms_max_size);
+ hrtime_t load_end = gethrtime();
+ msp->ms_load_time = load_end;
+ zfs_dbgmsg("metaslab_load: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, smp_length %llu, "
+ "unflushed_allocs %llu, unflushed_frees %llu, "
+ "freed %llu, defer %llu + %llu, unloaded time %llu ms, "
+ "loading_time %lld ms, ms_max_size %llu, "
+ "max size error %lld, "
+ "old_weight %llx, new_weight %llx",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ space_map_length(msp->ms_sm),
+ range_tree_space(msp->ms_unflushed_allocs),
+ range_tree_space(msp->ms_unflushed_frees),
+ range_tree_space(msp->ms_freed),
+ range_tree_space(msp->ms_defer[0]),
+ range_tree_space(msp->ms_defer[1]),
+ (longlong_t)((load_start - msp->ms_unload_time) / 1000000),
+ (longlong_t)((load_end - load_start) / 1000000),
+ msp->ms_max_size, msp->ms_max_size - max_size,
+ weight, msp->ms_weight);
+
+ metaslab_verify_space(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_sync_lock);
+ return (0);
+}
+
+int
+metaslab_load(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * There may be another thread loading the same metaslab; if that's
+ * the case, just wait until the other thread is done and return.
+ */
+ metaslab_load_wait(msp);
+ if (msp->ms_loaded)
+ return (0);
+ VERIFY(!msp->ms_loading);
+ ASSERT(!msp->ms_condensing);
+
+ /*
+ * We set the loading flag BEFORE potentially dropping the lock to
+ * wait for an ongoing flush (see ms_flushing below). This way other
+ * threads know that there is already a thread that is loading this
+ * metaslab.
+ */
+ msp->ms_loading = B_TRUE;
+
+ /*
+ * Wait for any in-progress flushing to finish as we drop the ms_lock
+ * both here (during space_map_load()) and in metaslab_flush() (when
+ * we flush our changes to the ms_sm).
+ */
+ if (msp->ms_flushing)
+ metaslab_flush_wait(msp);
+
+ /*
+ * In the event that we were waiting for the metaslab to be
+ * flushed (where we temporarily dropped the ms_lock), ensure that
+ * no one else loaded the metaslab somehow.
+ */
+ ASSERT(!msp->ms_loaded);
+
+ /*
+ * If we're loading a metaslab in the normal class, consider evicting
+ * another one to keep our memory usage under the limit defined by the
+ * zfs_metaslab_mem_limit tunable.
+ */
+ if (spa_normal_class(msp->ms_group->mg_class->mc_spa) ==
+ msp->ms_group->mg_class) {
+ metaslab_potentially_evict(msp->ms_group->mg_class);
+ }
+
+ int error = metaslab_load_impl(msp);
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ msp->ms_loading = B_FALSE;
+ cv_broadcast(&msp->ms_load_cv);
+
+ return (error);
+}
+
+void
+metaslab_unload(metaslab_t *msp)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * This can happen if a metaslab is selected for eviction (in
+ * metaslab_potentially_evict) and then unloaded during spa_sync (via
+ * metaslab_class_evict_old).
+ */
+ if (!msp->ms_loaded)
+ return;
+
+ range_tree_vacate(msp->ms_allocatable, NULL, NULL);
+ msp->ms_loaded = B_FALSE;
+ msp->ms_unload_time = gethrtime();
+
+ msp->ms_activation_weight = 0;
+ msp->ms_weight &= ~METASLAB_ACTIVE_MASK;
+
+ if (msp->ms_group != NULL) {
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ multilist_sublist_unlock(mls);
+
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ zfs_dbgmsg("metaslab_unload: txg %llu, spa %s, vdev_id %llu, "
+ "ms_id %llu, weight %llx, "
+ "selected txg %llu (%llu ms ago), alloc_txg %llu, "
+ "loaded %llu ms ago, max_size %llu",
+ spa_syncing_txg(spa), spa_name(spa),
+ msp->ms_group->mg_vd->vdev_id, msp->ms_id,
+ msp->ms_weight,
+ msp->ms_selected_txg,
+ (msp->ms_unload_time - msp->ms_selected_time) / 1000 / 1000,
+ msp->ms_alloc_txg,
+ (msp->ms_unload_time - msp->ms_load_time) / 1000 / 1000,
+ msp->ms_max_size);
+ }
+
+ /*
+ * We explicitly recalculate the metaslab's weight based on its space
+ * map (as it is now not loaded). We want unloaded metaslabs to always
+ * have their weights calculated from the space map histograms, while
+ * loaded ones have it calculated from their in-core range tree
+ * [see metaslab_load()]. This way, the weight reflects the information
+ * available in-core, whether it is loaded or not.
+ *
+ * If ms_group == NULL, it means that we came here from
+ * metaslab_fini(), at which point it doesn't make sense for us to
+ * do the recalculation and the sorting.
+ */
+ if (msp->ms_group != NULL)
+ metaslab_recalculate_weight_and_sort(msp);
+}
+
+/*
+ * We want to optimize the memory use of the per-metaslab range
+ * trees. To do this, we store the segments in the range trees in
+ * units of sectors, zero-indexing from the start of the metaslab. If
+ * vdev_ms_shift - vdev_ashift is less than 32, we can store
+ * the ranges using two uint32_ts, rather than two uint64_ts.
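+ *
+ * Worked example (values assumed for illustration): with an ashift of
+ * 9 (512-byte sectors) and a vdev_ms_shift of 34 (16GB metaslabs),
+ * 34 - 9 = 25 < 32, so every offset within the metaslab fits in a
+ * 32-bit sector count and RANGE_SEG32 is used; otherwise we fall
+ * back to RANGE_SEG64.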
+ */
+range_seg_type_t
+metaslab_calculate_range_tree_type(vdev_t *vdev, metaslab_t *msp,
+ uint64_t *start, uint64_t *shift)
+{
+ if (vdev->vdev_ms_shift - vdev->vdev_ashift < 32 &&
+ !zfs_metaslab_force_large_segs) {
+ *shift = vdev->vdev_ashift;
+ *start = msp->ms_start;
+ return (RANGE_SEG32);
+ } else {
+ *shift = 0;
+ *start = 0;
+ return (RANGE_SEG64);
+ }
+}
+
+void
+metaslab_set_selected_txg(metaslab_t *msp, uint64_t txg)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ multilist_sublist_t *mls =
+ multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp);
+ if (multilist_link_active(&msp->ms_class_txg_node))
+ multilist_sublist_remove(mls, msp);
+ msp->ms_selected_txg = txg;
+ msp->ms_selected_time = gethrtime();
+ multilist_sublist_insert_tail(mls, msp);
+ multilist_sublist_unlock(mls);
+}
+
+void
+metaslab_space_update(vdev_t *vd, metaslab_class_t *mc, int64_t alloc_delta,
+ int64_t defer_delta, int64_t space_delta)
+{
+ vdev_space_update(vd, alloc_delta, defer_delta, space_delta);
+
+ ASSERT3P(vd->vdev_spa->spa_root_vdev, ==, vd->vdev_parent);
+ ASSERT(vd->vdev_ms_count != 0);
+
+ metaslab_class_space_update(mc, alloc_delta, defer_delta, space_delta,
+ vdev_deflated_space(vd, space_delta));
+}
+
+int
+metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object,
+ uint64_t txg, metaslab_t **msp)
+{
+ vdev_t *vd = mg->mg_vd;
+ spa_t *spa = vd->vdev_spa;
+ objset_t *mos = spa->spa_meta_objset;
+ metaslab_t *ms;
+ int error;
+
+ ms = kmem_zalloc(sizeof (metaslab_t), KM_SLEEP);
+ mutex_init(&ms->ms_lock, NULL, MUTEX_DEFAULT, NULL);
+ mutex_init(&ms->ms_sync_lock, NULL, MUTEX_DEFAULT, NULL);
+ cv_init(&ms->ms_load_cv, NULL, CV_DEFAULT, NULL);
+ cv_init(&ms->ms_flush_cv, NULL, CV_DEFAULT, NULL);
+ multilist_link_init(&ms->ms_class_txg_node);
+
+ ms->ms_id = id;
+ ms->ms_start = id << vd->vdev_ms_shift;
+ ms->ms_size = 1ULL << vd->vdev_ms_shift;
+ ms->ms_allocator = -1;
+ ms->ms_new = B_TRUE;
+
+ /*
+ * We only open space map objects that already exist. All others
+ * will be opened when we finally allocate an object for them.
+ *
+ * Note:
+ * When called from vdev_expand(), we can't call into the DMU as
+ * we are holding the spa_config_lock as a writer and we would
+ * deadlock [see relevant comment in vdev_metaslab_init()]. In
+ * that case, the object parameter is zero though, so we won't
+ * call into the DMU.
+ */
+ if (object != 0) {
+ error = space_map_open(&ms->ms_sm, mos, object, ms->ms_start,
+ ms->ms_size, vd->vdev_ashift);
+
+ if (error != 0) {
+ kmem_free(ms, sizeof (metaslab_t));
+ return (error);
+ }
+
+ ASSERT(ms->ms_sm != NULL);
+ ms->ms_allocated_space = space_map_allocated(ms->ms_sm);
+ }
+
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(vd, ms, &start, &shift);
+
+ /*
+ * We create the ms_allocatable here, but we don't create the
+ * other range trees until metaslab_sync_done(). This serves
+ * two purposes: it allows metaslab_sync_done() to detect the
+ * addition of new space; and for debugging, it ensures that
+ * we'd data fault on any attempt to use this metaslab before
+ * it's ready.
+ */
+ ms->ms_allocatable = range_tree_create(NULL, type, NULL, start, shift);
+
+ ms->ms_trim = range_tree_create(NULL, type, NULL, start, shift);
+
+ metaslab_group_add(mg, ms);
+ metaslab_set_fragmentation(ms, B_FALSE);
+
+ /*
+ * If we're opening an existing pool (txg == 0) or creating
+ * a new one (txg == TXG_INITIAL), all space is available now.
+ * If we're adding space to an existing pool, the new space + * does not become available until after this txg has synced. + * The metaslab's weight will also be initialized when we sync + * out this txg. This ensures that we don't attempt to allocate + * from it before we have initialized it completely. + */ + if (txg <= TXG_INITIAL) { + metaslab_sync_done(ms, 0); + metaslab_space_update(vd, mg->mg_class, + metaslab_allocated_space(ms), 0, 0); + } + + if (txg != 0) { + vdev_dirty(vd, 0, NULL, txg); + vdev_dirty(vd, VDD_METASLAB, ms, txg); + } + + *msp = ms; + + return (0); +} + +static void +metaslab_fini_flush_data(metaslab_t *msp) +{ + spa_t *spa = msp->ms_group->mg_vd->vdev_spa; + + if (metaslab_unflushed_txg(msp) == 0) { + ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), + ==, NULL); + return; + } + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + mutex_enter(&spa->spa_flushed_ms_lock); + avl_remove(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + spa_log_sm_decrement_mscount(spa, metaslab_unflushed_txg(msp)); + spa_log_summary_decrement_mscount(spa, metaslab_unflushed_txg(msp)); +} + +uint64_t +metaslab_unflushed_changes_memused(metaslab_t *ms) +{ + return ((range_tree_numsegs(ms->ms_unflushed_allocs) + + range_tree_numsegs(ms->ms_unflushed_frees)) * + ms->ms_unflushed_allocs->rt_root.bt_elem_size); +} + +void +metaslab_fini(metaslab_t *msp) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + + metaslab_fini_flush_data(msp); + + metaslab_group_remove(mg, msp); + + mutex_enter(&msp->ms_lock); + VERIFY(msp->ms_group == NULL); + metaslab_space_update(vd, mg->mg_class, + -metaslab_allocated_space(msp), 0, -msp->ms_size); + + space_map_close(msp->ms_sm); + msp->ms_sm = NULL; + + metaslab_unload(msp); + range_tree_destroy(msp->ms_allocatable); + range_tree_destroy(msp->ms_freeing); + range_tree_destroy(msp->ms_freed); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_allocs); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + range_tree_destroy(msp->ms_unflushed_frees); + + for (int t = 0; t < TXG_SIZE; t++) { + range_tree_destroy(msp->ms_allocating[t]); + } + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + range_tree_destroy(msp->ms_defer[t]); + } + ASSERT0(msp->ms_deferspace); + + range_tree_destroy(msp->ms_checkpointing); + + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!txg_list_member(&vd->vdev_ms_list, msp, t)); + + range_tree_vacate(msp->ms_trim, NULL, NULL); + range_tree_destroy(msp->ms_trim); + + mutex_exit(&msp->ms_lock); + cv_destroy(&msp->ms_load_cv); + cv_destroy(&msp->ms_flush_cv); + mutex_destroy(&msp->ms_lock); + mutex_destroy(&msp->ms_sync_lock); + ASSERT3U(msp->ms_allocator, ==, -1); + + kmem_free(msp, sizeof (metaslab_t)); +} + +#define FRAGMENTATION_TABLE_SIZE 17 + +/* + * This table defines a segment size based fragmentation metric that will + * allow each metaslab to derive its own fragmentation value. This is done + * by calculating the space in each bucket of the spacemap histogram and + * multiplying that by the fragmentation metric in this table. Doing + * this for all buckets and dividing it by the total amount of free + * space in this metaslab (i.e. 
the total free space in all buckets) gives
+ * us the fragmentation metric. This means that a high fragmentation metric
+ * equates to most of the free space being comprised of small segments.
+ * Conversely, if the metric is low, then most of the free space is in
+ * large segments. A 10% change in fragmentation equates to approximately
+ * double the number of segments.
+ *
+ * This table defines 0% fragmented space using 16MB segments. Testing has
+ * shown that segments that are greater than or equal to 16MB do not suffer
+ * from drastic performance problems. Using this value, we derive the rest
+ * of the table. Since the fragmentation value is never stored on disk, it
+ * is possible to change these calculations in the future.
+ */
+int zfs_frag_table[FRAGMENTATION_TABLE_SIZE] = {
+ 100, /* 512B */
+ 100, /* 1K */
+ 98, /* 2K */
+ 95, /* 4K */
+ 90, /* 8K */
+ 80, /* 16K */
+ 70, /* 32K */
+ 60, /* 64K */
+ 50, /* 128K */
+ 40, /* 256K */
+ 30, /* 512K */
+ 20, /* 1M */
+ 15, /* 2M */
+ 10, /* 4M */
+ 5, /* 8M */
+ 0 /* 16M */
+};
+
+/*
+ * Calculate the metaslab's fragmentation metric and set ms_fragmentation.
+ * Setting this value to ZFS_FRAG_INVALID means that the metaslab has not
+ * been upgraded and does not support this metric. Otherwise, the value
+ * should be in the range [0, 100].
+ */
+static void
+metaslab_set_fragmentation(metaslab_t *msp, boolean_t nodirty)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+ uint64_t fragmentation = 0;
+ uint64_t total = 0;
+ boolean_t feature_enabled = spa_feature_is_enabled(spa,
+ SPA_FEATURE_SPACEMAP_HISTOGRAM);
+
+ if (!feature_enabled) {
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ /*
+ * A null space map means that the entire metaslab is free
+ * and thus is not fragmented.
+ */
+ if (msp->ms_sm == NULL) {
+ msp->ms_fragmentation = 0;
+ return;
+ }
+
+ /*
+ * If this metaslab's space map has not been upgraded, flag it
+ * so that we upgrade next time we encounter it.
+ */
+ if (msp->ms_sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) {
+ uint64_t txg = spa_syncing_txg(spa);
+ vdev_t *vd = msp->ms_group->mg_vd;
+
+ /*
+ * If we've reached the final dirty txg, then we must
+ * be shutting down the pool. We don't want to dirty
+ * any data past this point so skip setting the condense
+ * flag. We can retry this action the next time the pool
+ * is imported. We also skip marking this metaslab for
+ * condensing if the caller has explicitly set nodirty.
+ */
+ if (!nodirty &&
+ spa_writeable(spa) && txg < spa_final_dirty_txg(spa)) {
+ msp->ms_condense_wanted = B_TRUE;
+ vdev_dirty(vd, VDD_METASLAB, msp, txg + 1);
+ zfs_dbgmsg("txg %llu, requesting force condense: "
+ "ms_id %llu, vdev_id %llu", txg, msp->ms_id,
+ vd->vdev_id);
+ }
+ msp->ms_fragmentation = ZFS_FRAG_INVALID;
+ return;
+ }
+
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ uint64_t space = 0;
+ uint8_t shift = msp->ms_sm->sm_shift;
+
+ int idx = MIN(shift - SPA_MINBLOCKSHIFT + i,
+ FRAGMENTATION_TABLE_SIZE - 1);
+
+ if (msp->ms_sm->sm_phys->smp_histogram[i] == 0)
+ continue;
+
+ space = msp->ms_sm->sm_phys->smp_histogram[i] << (i + shift);
+ total += space;
+
+ ASSERT3U(idx, <, FRAGMENTATION_TABLE_SIZE);
+ fragmentation += space * zfs_frag_table[idx];
+ }
+
+ if (total > 0)
+ fragmentation /= total;
+ ASSERT3U(fragmentation, <=, 100);
+
+ msp->ms_fragmentation = fragmentation;
+}
+
+/*
+ * Compute a weight -- a selection preference value -- for the given metaslab.
+ * This is based on the amount of free space, the level of fragmentation,
+ * the LBA range, and whether the metaslab is loaded.
+ */
+static uint64_t
+metaslab_space_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ vdev_t *vd = mg->mg_vd;
+ uint64_t weight, space;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The baseline weight is the metaslab's free space.
+ */
+ space = msp->ms_size - metaslab_allocated_space(msp);
+
+ if (metaslab_fragmentation_factor_enabled &&
+ msp->ms_fragmentation != ZFS_FRAG_INVALID) {
+ /*
+ * Use the fragmentation information to inversely scale
+ * down the baseline weight. We need to ensure that we
+ * don't exclude this metaslab completely when it's 100%
+ * fragmented. To avoid this we reduce the fragmented value
+ * by 1.
+ */
+ space = (space * (100 - (msp->ms_fragmentation - 1))) / 100;
+
+ /*
+ * If space < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. The fragmentation metric may have
+ * decreased the space to something smaller than
+ * SPA_MINBLOCKSIZE, so reset the space to SPA_MINBLOCKSIZE
+ * so that we can consume any remaining space.
+ */
+ if (space > 0 && space < SPA_MINBLOCKSIZE)
+ space = SPA_MINBLOCKSIZE;
+ }
+ weight = space;
+
+ /*
+ * Modern disks have uniform bit density and constant angular velocity.
+ * Therefore, the outer recording zones are faster (higher bandwidth)
+ * than the inner zones by the ratio of outer to inner track diameter,
+ * which is typically around 2:1. We account for this by assigning
+ * higher weight to lower metaslabs (multiplier ranging from 2x to 1x).
+ * In effect, this means that we'll select the metaslab with the most
+ * free bandwidth rather than simply the one with the most free space.
+ */
+ if (!vd->vdev_nonrot && metaslab_lba_weighting_enabled) {
+ weight = 2 * weight - (msp->ms_id * weight) / vd->vdev_ms_count;
+ ASSERT(weight >= space && weight <= 2 * space);
+ }
+
+ /*
+ * If this metaslab is one we're actively using, adjust its
+ * weight to make it preferable to any inactive metaslab so
+ * we'll polish it off. If the fragmentation on this metaslab
+ * has exceeded our threshold, then don't mark it active.
+ */
+ if (msp->ms_loaded && msp->ms_fragmentation != ZFS_FRAG_INVALID &&
+ msp->ms_fragmentation <= zfs_metaslab_fragmentation_threshold) {
+ weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK);
+ }
+
+ WEIGHT_SET_SPACEBASED(weight);
+ return (weight);
+}
+
+/*
+ * Return the weight of the specified metaslab, according to the segment-based
+ * weighting algorithm. The metaslab must be loaded. This function can
+ * be called within a sync pass since it relies only on the metaslab's
+ * range tree which is always accurate when the metaslab is loaded.
+ */
+static uint64_t
+metaslab_weight_from_range_tree(metaslab_t *msp)
+{
+ uint64_t weight = 0;
+ uint32_t segments = 0;
+
+ ASSERT(msp->ms_loaded);
+
+ for (int i = RANGE_TREE_HISTOGRAM_SIZE - 1; i >= SPA_MINBLOCKSHIFT;
+ i--) {
+ uint8_t shift = msp->ms_group->mg_vd->vdev_ashift;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ segments <<= 1;
+ segments += msp->ms_allocatable->rt_histogram[i];
+
+ /*
+ * The range tree provides more precision than the space map
+ * and must be downgraded so that all values fit within the
+ * space map's histogram. This allows us to compare loaded
+ * vs. unloaded metaslabs to determine which metaslab is
+ * considered "best".
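+ *
+ * For illustration (assuming SPACE_MAP_HISTOGRAM_SIZE = 32 and an
+ * ashift of 12): max_idx = 43, so counts in range tree buckets above
+ * 43 are folded down into bucket 43 by the running sum above, with
+ * each larger bucket counting as two segments of the next smaller
+ * size (the "segments <<= 1" step).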
+ */
+ if (i > max_idx)
+ continue;
+
+ if (segments != 0) {
+ WEIGHT_SET_COUNT(weight, segments);
+ WEIGHT_SET_INDEX(weight, i);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Calculate the weight based on the on-disk histogram. This should only
+ * be applied to unloaded metaslabs (i.e. no incoming allocations) in order
+ * to give results consistent with the on-disk state.
+ */
+static uint64_t
+metaslab_weight_from_spacemap(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ ASSERT(!msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(space_map_object(sm), !=, 0);
+ ASSERT3U(sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * Create a joint histogram from all the segments that have made
+ * it to the metaslab's space map histogram, that are not yet
+ * available for allocation because they are still in the freeing
+ * pipeline (e.g. freeing, freed, and defer trees). Then subtract
+ * these segments from the space map's histogram to get a more
+ * accurate weight.
+ */
+ uint64_t deferspace_histogram[SPACE_MAP_HISTOGRAM_SIZE] = {0};
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++)
+ deferspace_histogram[i] += msp->ms_synchist[i];
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ for (int i = 0; i < SPACE_MAP_HISTOGRAM_SIZE; i++) {
+ deferspace_histogram[i] += msp->ms_deferhist[t][i];
+ }
+ }
+
+ uint64_t weight = 0;
+ for (int i = SPACE_MAP_HISTOGRAM_SIZE - 1; i >= 0; i--) {
+ ASSERT3U(sm->sm_phys->smp_histogram[i], >=,
+ deferspace_histogram[i]);
+ uint64_t count =
+ sm->sm_phys->smp_histogram[i] - deferspace_histogram[i];
+ if (count != 0) {
+ WEIGHT_SET_COUNT(weight, count);
+ WEIGHT_SET_INDEX(weight, i + sm->sm_shift);
+ WEIGHT_SET_ACTIVE(weight, 0);
+ break;
+ }
+ }
+ return (weight);
+}
+
+/*
+ * Compute a segment-based weight for the specified metaslab. The weight
+ * is determined by the highest bucket in the histogram. The information
+ * for the highest bucket is encoded into the weight value.
+ */
+static uint64_t
+metaslab_segment_weight(metaslab_t *msp)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ uint64_t weight = 0;
+ uint8_t shift = mg->mg_vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ /*
+ * The metaslab is completely free.
+ */
+ if (metaslab_allocated_space(msp) == 0) {
+ int idx = highbit64(msp->ms_size) - 1;
+ int max_idx = SPACE_MAP_HISTOGRAM_SIZE + shift - 1;
+
+ if (idx < max_idx) {
+ WEIGHT_SET_COUNT(weight, 1ULL);
+ WEIGHT_SET_INDEX(weight, idx);
+ } else {
+ WEIGHT_SET_COUNT(weight, 1ULL << (idx - max_idx));
+ WEIGHT_SET_INDEX(weight, max_idx);
+ }
+ WEIGHT_SET_ACTIVE(weight, 0);
+ ASSERT(!WEIGHT_IS_SPACEBASED(weight));
+ return (weight);
+ }
+
+ ASSERT3U(msp->ms_sm->sm_dbuf->db_size, ==, sizeof (space_map_phys_t));
+
+ /*
+ * If the metaslab is fully allocated then just make the weight 0.
+ */
+ if (metaslab_allocated_space(msp) == msp->ms_size)
+ return (0);
+ /*
+ * If the metaslab is already loaded, then use the range tree to
+ * determine the weight. Otherwise, we rely on the space map information
+ * to generate the weight.
+ */
+ if (msp->ms_loaded) {
+ weight = metaslab_weight_from_range_tree(msp);
+ } else {
+ weight = metaslab_weight_from_spacemap(msp);
+ }
+
+ /*
+ * If the metaslab was active the last time we calculated its weight
+ * then keep it active. We want to consume the entire region that
+ * is associated with this weight.
+ */ + if (msp->ms_activation_weight != 0 && weight != 0) + WEIGHT_SET_ACTIVE(weight, WEIGHT_GET_ACTIVE(msp->ms_weight)); + return (weight); +} + +/* + * Determine if we should attempt to allocate from this metaslab. If the + * metaslab is loaded, then we can determine if the desired allocation + * can be satisfied by looking at the size of the maximum free segment + * on that metaslab. Otherwise, we make our decision based on the metaslab's + * weight. For segment-based weighting we can determine the maximum + * allocation based on the index encoded in its value. For space-based + * weights we rely on the entire weight (excluding the weight-type bit). + */ +static boolean_t +metaslab_should_allocate(metaslab_t *msp, uint64_t asize, boolean_t try_hard) +{ + /* + * If the metaslab is loaded, ms_max_size is definitive and we can use + * the fast check. If it's not, the ms_max_size is a lower bound (once + * set), and we should use the fast check as long as we're not in + * try_hard and it's been less than zfs_metaslab_max_size_cache_sec + * seconds since the metaslab was unloaded. + */ + if (msp->ms_loaded || + (msp->ms_max_size != 0 && !try_hard && gethrtime() < + msp->ms_unload_time + SEC2NSEC(zfs_metaslab_max_size_cache_sec))) + return (msp->ms_max_size >= asize); + + boolean_t should_allocate; + if (!WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + /* + * The metaslab segment weight indicates segments in the + * range [2^i, 2^(i+1)), where i is the index in the weight. + * Since the asize might be in the middle of the range, we + * should attempt the allocation if asize < 2^(i+1). + */ + should_allocate = (asize < + 1ULL << (WEIGHT_GET_INDEX(msp->ms_weight) + 1)); + } else { + should_allocate = (asize <= + (msp->ms_weight & ~METASLAB_WEIGHT_TYPE)); + } + + return (should_allocate); +} + +static uint64_t +metaslab_weight(metaslab_t *msp, boolean_t nodirty) +{ + vdev_t *vd = msp->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + uint64_t weight; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + metaslab_set_fragmentation(msp, nodirty); + + /* + * Update the maximum size. If the metaslab is loaded, this will + * ensure that we get an accurate maximum size if newly freed space + * has been added back into the free tree. If the metaslab is + * unloaded, we check if there's a larger free segment in the + * unflushed frees. This is a lower bound on the largest allocatable + * segment size. Coalescing of adjacent entries may reveal larger + * allocatable segments, but we aren't aware of those until loading + * the space map into a range tree. + */ + if (msp->ms_loaded) { + msp->ms_max_size = metaslab_largest_allocatable(msp); + } else { + msp->ms_max_size = MAX(msp->ms_max_size, + metaslab_largest_unflushed_free(msp)); + } + + /* + * Segment-based weighting requires space map histogram support. + */ + if (zfs_metaslab_segment_weight_enabled && + spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + (msp->ms_sm == NULL || msp->ms_sm->sm_dbuf->db_size == + sizeof (space_map_phys_t))) { + weight = metaslab_segment_weight(msp); + } else { + weight = metaslab_space_weight(msp); + } + return (weight); +} + +void +metaslab_recalculate_weight_and_sort(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* note: we preserve the mask (e.g. indication of primary, etc..) 
*/ + uint64_t was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(msp->ms_group, msp, + metaslab_weight(msp, B_FALSE) | was_active); +} + +static int +metaslab_activate_allocator(metaslab_group_t *mg, metaslab_t *msp, + int allocator, uint64_t activation_weight) +{ + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * If we're activating for the claim code, we don't want to actually + * set the metaslab up for a specific allocator. + */ + if (activation_weight == METASLAB_WEIGHT_CLAIM) { + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort(mg, msp, msp->ms_weight | + activation_weight); + return (0); + } + + metaslab_t **mspp = (activation_weight == METASLAB_WEIGHT_PRIMARY ? + &mga->mga_primary : &mga->mga_secondary); + + mutex_enter(&mg->mg_lock); + if (*mspp != NULL) { + mutex_exit(&mg->mg_lock); + return (EEXIST); + } + + *mspp = msp; + ASSERT3S(msp->ms_allocator, ==, -1); + msp->ms_allocator = allocator; + msp->ms_primary = (activation_weight == METASLAB_WEIGHT_PRIMARY); + + ASSERT0(msp->ms_activation_weight); + msp->ms_activation_weight = msp->ms_weight; + metaslab_group_sort_impl(mg, msp, + msp->ms_weight | activation_weight); + mutex_exit(&mg->mg_lock); + + return (0); +} + +static int +metaslab_activate(metaslab_t *msp, int allocator, uint64_t activation_weight) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + /* + * The current metaslab is already activated for us so there + * is nothing to do. Already activated though, doesn't mean + * that this metaslab is activated for our allocator nor our + * requested activation weight. The metaslab could have started + * as an active one for our allocator but changed allocators + * while we were waiting to grab its ms_lock or we stole it + * [see find_valid_metaslab()]. This means that there is a + * possibility of passivating a metaslab of another allocator + * or from a different activation mask, from this thread. + */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + ASSERT(msp->ms_loaded); + return (0); + } + + int error = metaslab_load(msp); + if (error != 0) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + + /* + * When entering metaslab_load() we may have dropped the + * ms_lock because we were loading this metaslab, or we + * were waiting for another thread to load it for us. In + * that scenario, we recheck the weight of the metaslab + * to see if it was activated by another thread. + * + * If the metaslab was activated for another allocator or + * it was activated with a different activation weight (e.g. + * we wanted to make it a primary but it was activated as + * secondary) we return error (EBUSY). + * + * If the metaslab was activated for the same allocator + * and requested activation mask, skip activating it. + */ + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) != 0) { + if (msp->ms_allocator != allocator) + return (EBUSY); + + if ((msp->ms_weight & activation_weight) == 0) + return (SET_ERROR(EBUSY)); + + EQUIV((activation_weight == METASLAB_WEIGHT_PRIMARY), + msp->ms_primary); + return (0); + } + + /* + * If the metaslab has literally 0 space, it will have weight 0. In + * that case, don't bother activating it. This can happen if the + * metaslab had space during find_valid_metaslab, but another thread + * loaded it and used all that space while we were waiting to grab the + * lock. 
+ */
+ if (msp->ms_weight == 0) {
+ ASSERT0(range_tree_space(msp->ms_allocatable));
+ return (SET_ERROR(ENOSPC));
+ }
+
+ if ((error = metaslab_activate_allocator(msp->ms_group, msp,
+ allocator, activation_weight)) != 0) {
+ return (error);
+ }
+
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK);
+
+ return (0);
+}
+
+static void
+metaslab_passivate_allocator(metaslab_group_t *mg, metaslab_t *msp,
+ uint64_t weight)
+{
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+
+ if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) {
+ metaslab_group_sort(mg, msp, weight);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+ ASSERT3P(msp->ms_group, ==, mg);
+ ASSERT3S(0, <=, msp->ms_allocator);
+ ASSERT3U(msp->ms_allocator, <, mg->mg_allocators);
+
+ metaslab_group_allocator_t *mga = &mg->mg_allocator[msp->ms_allocator];
+ if (msp->ms_primary) {
+ ASSERT3P(mga->mga_primary, ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_PRIMARY);
+ mga->mga_primary = NULL;
+ } else {
+ ASSERT3P(mga->mga_secondary, ==, msp);
+ ASSERT(msp->ms_weight & METASLAB_WEIGHT_SECONDARY);
+ mga->mga_secondary = NULL;
+ }
+ msp->ms_allocator = -1;
+ metaslab_group_sort_impl(mg, msp, weight);
+ mutex_exit(&mg->mg_lock);
+}
+
+static void
+metaslab_passivate(metaslab_t *msp, uint64_t weight)
+{
+ uint64_t size __maybe_unused = weight & ~METASLAB_WEIGHT_TYPE;
+
+ /*
+ * If size < SPA_MINBLOCKSIZE, then we will not allocate from
+ * this metaslab again. In that case, it had better be empty,
+ * or we would be leaving space on the table.
+ */
+ ASSERT(!WEIGHT_IS_SPACEBASED(msp->ms_weight) ||
+ size >= SPA_MINBLOCKSIZE ||
+ range_tree_space(msp->ms_allocatable) == 0);
+ ASSERT0(weight & METASLAB_ACTIVE_MASK);
+
+ ASSERT(msp->ms_activation_weight != 0);
+ msp->ms_activation_weight = 0;
+ metaslab_passivate_allocator(msp->ms_group, msp, weight);
+ ASSERT0(msp->ms_weight & METASLAB_ACTIVE_MASK);
+}
+
+/*
+ * Segment-based metaslabs are activated once and remain active until
+ * we either fail an allocation attempt (similar to space-based metaslabs)
+ * or have exhausted the free space in zfs_metaslab_switch_threshold
+ * buckets since the metaslab was activated. This function checks to see
+ * if we've exhausted the zfs_metaslab_switch_threshold buckets in the
+ * metaslab and passivates it proactively. This will allow us to select a
+ * metaslab with a larger contiguous region, if any, remaining within this
+ * metaslab group. If we're in sync pass > 1, then we continue using this
+ * metaslab so that we don't dirty more blocks and cause more sync passes.
+ */
+static void
+metaslab_segment_may_passivate(metaslab_t *msp)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ if (WEIGHT_IS_SPACEBASED(msp->ms_weight) || spa_sync_pass(spa) > 1)
+ return;
+
+ /*
+ * Since we are in the middle of a sync pass, the most accurate
+ * information that is accessible to us is the in-core range tree
+ * histogram; calculate the new weight based on that information.
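+ *
+ * Illustrative example of the check below (threshold value assumed):
+ * a metaslab activated at index 23 (8MB segments) with
+ * zfs_metaslab_switch_threshold = 2 is passivated once its largest
+ * remaining bucket drops to index 21 (2MB segments) or below.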
+ */
+ uint64_t weight = metaslab_weight_from_range_tree(msp);
+ int activation_idx = WEIGHT_GET_INDEX(msp->ms_activation_weight);
+ int current_idx = WEIGHT_GET_INDEX(weight);
+
+ if (current_idx <= activation_idx - zfs_metaslab_switch_threshold)
+ metaslab_passivate(msp, weight);
+}
+
+static void
+metaslab_preload(void *arg)
+{
+ metaslab_t *msp = arg;
+ metaslab_class_t *mc = msp->ms_group->mg_class;
+ spa_t *spa = mc->mc_spa;
+ fstrans_cookie_t cookie = spl_fstrans_mark();
+
+ ASSERT(!MUTEX_HELD(&msp->ms_group->mg_lock));
+
+ mutex_enter(&msp->ms_lock);
+ (void) metaslab_load(msp);
+ metaslab_set_selected_txg(msp, spa_syncing_txg(spa));
+ mutex_exit(&msp->ms_lock);
+ spl_fstrans_unmark(cookie);
+}
+
+static void
+metaslab_group_preload(metaslab_group_t *mg)
+{
+ spa_t *spa = mg->mg_vd->vdev_spa;
+ metaslab_t *msp;
+ avl_tree_t *t = &mg->mg_metaslab_tree;
+ int m = 0;
+
+ if (spa_shutting_down(spa) || !metaslab_preload_enabled) {
+ taskq_wait_outstanding(mg->mg_taskq, 0);
+ return;
+ }
+
+ mutex_enter(&mg->mg_lock);
+
+ /*
+ * Load the next potential metaslabs
+ */
+ for (msp = avl_first(t); msp != NULL; msp = AVL_NEXT(t, msp)) {
+ ASSERT3P(msp->ms_group, ==, mg);
+
+ /*
+ * We preload only the maximum number of metaslabs specified
+ * by metaslab_preload_limit. If a metaslab is being forced
+ * to condense then we preload it too. This will ensure
+ * that force condensing happens in the next txg.
+ */
+ if (++m > metaslab_preload_limit && !msp->ms_condense_wanted) {
+ continue;
+ }
+
+ VERIFY(taskq_dispatch(mg->mg_taskq, metaslab_preload,
+ msp, TQ_SLEEP) != TASKQID_INVALID);
+ }
+ mutex_exit(&mg->mg_lock);
+}
+
+/*
+ * Determine if the space map's on-disk footprint is past our tolerance for
+ * inefficiency. We would like to use the following criteria to make our
+ * decision:
+ *
+ * 1. Do not condense if the size of the space map object would dramatically
+ * increase as a result of writing out the free space range tree.
+ *
+ * 2. Condense if the on-disk space map representation is at least
+ * zfs_condense_pct/100 times the size of the optimal representation
+ * (e.g. with zfs_condense_pct = 110 and an optimal size of 1MB,
+ * condense once the on-disk size reaches 1.1MB).
+ *
+ * 3. Do not condense if the on-disk size of the space map does not actually
+ * decrease.
+ *
+ * Unfortunately, we cannot compute the on-disk size of the space map in this
+ * context because we cannot accurately compute the effects of compression, etc.
+ * Instead, we apply the heuristic described in the block comment for
+ * zfs_metaslab_condense_block_threshold - we only condense if the space used
+ * is greater than a threshold number of blocks.
+ */
+static boolean_t
+metaslab_should_condense(metaslab_t *msp)
+{
+ space_map_t *sm = msp->ms_sm;
+ vdev_t *vd = msp->ms_group->mg_vd;
+ uint64_t vdev_blocksize = 1 << vd->vdev_ashift;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+ ASSERT(sm != NULL);
+ ASSERT3U(spa_sync_pass(vd->vdev_spa), ==, 1);
+
+ /*
+ * We always condense metaslabs that are empty and metaslabs for
+ * which a condense request has been made.
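+ *
+ * As a sketch of the block threshold below (values assumed for
+ * illustration): if record_size were 8K and
+ * zfs_metaslab_condense_block_threshold were 4, a space map shorter
+ * than 32K would never be condensed, regardless of how far it is
+ * from its optimal size.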
+ */
+ if (range_tree_numsegs(msp->ms_allocatable) == 0 ||
+ msp->ms_condense_wanted)
+ return (B_TRUE);
+
+ uint64_t record_size = MAX(sm->sm_blksz, vdev_blocksize);
+ uint64_t object_size = space_map_length(sm);
+ uint64_t optimal_size = space_map_estimate_optimal_size(sm,
+ msp->ms_allocatable, SM_NO_VDEVID);
+
+ return (object_size >= (optimal_size * zfs_condense_pct / 100) &&
+ object_size > zfs_metaslab_condense_block_threshold * record_size);
+}
+
+/*
+ * Condense the on-disk space map representation to its minimized form.
+ * The minimized form consists of a small number of allocations followed
+ * by the entries of the free range tree (ms_allocatable). The condensed
+ * spacemap contains all the entries of previous TXGs (including those in
+ * the pool-wide log spacemaps; thus this is effectively a superset of
+ * metaslab_flush()), but this TXG's entries still need to be written.
+ */
+static void
+metaslab_condense(metaslab_t *msp, dmu_tx_t *tx)
+{
+ range_tree_t *condense_tree;
+ space_map_t *sm = msp->ms_sm;
+ uint64_t txg = dmu_tx_get_txg(tx);
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT(msp->ms_loaded);
+ ASSERT(msp->ms_sm != NULL);
+
+ /*
+ * In order to condense the space map, we need to change it so it
+ * only describes which segments are currently allocated and free.
+ *
+ * All the current free space resides in the ms_allocatable, all
+ * the ms_defer trees, and all the ms_allocating trees. We ignore
+ * ms_freed because it is empty because we're in sync pass 1. We
+ * ignore ms_freeing because these changes are not yet reflected
+ * in the spacemap (they will be written later this txg).
+ *
+ * So to truncate the space map to represent all the entries of
+ * previous TXGs we do the following:
+ *
+ * 1] We create a range tree (condense tree) that is 100% empty.
+ * 2] We add to it all segments found in the ms_defer trees
+ * as those segments are marked as free in the original space
+ * map. We do the same with the ms_allocating trees for the same
+ * reason. Adding these segments should be a relatively
+ * inexpensive operation since we expect these trees to have a
+ * small number of nodes.
+ * 3] We vacate any unflushed allocs, since they are not frees we
+ * need to add to the condense tree. Then we vacate any
+ * unflushed frees as they should already be part of ms_allocatable.
+ * 4] At this point, we would ideally like to add all segments
+ * in the ms_allocatable tree to the condense tree. This way
+ * we would write all the entries of the condense tree as the
+ * condensed space map, which would only contain freed
+ * segments with everything else assumed to be allocated.
+ *
+ * Doing so can be prohibitively expensive as ms_allocatable can
+ * be large, and therefore computationally expensive to add to
+ * the condense_tree. Instead we first sync out an entry marking
+ * everything as allocated, then the condense_tree and then the
+ * ms_allocatable, in the condensed space map. While this is not
+ * optimal, it is typically close to optimal and more importantly
+ * much cheaper to compute.
+ *
+ * 5] Finally, as both of the unflushed trees were written to our
+ * new and condensed metaslab space map, we basically flushed
+ * all the unflushed changes to disk, thus we call
+ * metaslab_flush_update().
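+ *
+ * The resulting on-disk layout is therefore (sketch): one ALLOC
+ * record covering the entire metaslab, followed by FREE records for
+ * ms_allocatable and then for the condense tree (see the three
+ * space_map_write() calls below).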
+ */
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_freed)); /* since it is pass 1 */
+
+ zfs_dbgmsg("condensing: txg %llu, msp[%llu] %px, vdev id %llu, "
+ "spa %s, smp size %llu, segments %lu, forcing condense=%s", txg,
+ msp->ms_id, msp, msp->ms_group->mg_vd->vdev_id,
+ spa->spa_name, space_map_length(msp->ms_sm),
+ range_tree_numsegs(msp->ms_allocatable),
+ msp->ms_condense_wanted ? "TRUE" : "FALSE");
+
+ msp->ms_condense_wanted = B_FALSE;
+
+ range_seg_type_t type;
+ uint64_t shift, start;
+ type = metaslab_calculate_range_tree_type(msp->ms_group->mg_vd, msp,
+ &start, &shift);
+
+ condense_tree = range_tree_create(NULL, type, NULL, start, shift);
+
+ for (int t = 0; t < TXG_DEFER_SIZE; t++) {
+ range_tree_walk(msp->ms_defer[t],
+ range_tree_add, condense_tree);
+ }
+
+ for (int t = 0; t < TXG_CONCURRENT_STATES; t++) {
+ range_tree_walk(msp->ms_allocating[(txg + t) & TXG_MASK],
+ range_tree_add, condense_tree);
+ }
+
+ ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=,
+ metaslab_unflushed_changes_memused(msp));
+ spa->spa_unflushed_stats.sus_memused -=
+ metaslab_unflushed_changes_memused(msp);
+ range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL);
+ range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL);
+
+ /*
+ * We're about to drop the metaslab's lock thus allowing other
+ * consumers to change its content. Set the metaslab's ms_condensing
+ * flag to ensure that allocations on this metaslab do not occur
+ * while we're in the middle of committing it to disk. This is only
+ * critical for ms_allocatable as all other range trees use per TXG
+ * views of their content.
+ */
+ msp->ms_condensing = B_TRUE;
+
+ mutex_exit(&msp->ms_lock);
+ uint64_t object = space_map_object(msp->ms_sm);
+ space_map_truncate(sm,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log : zfs_metaslab_sm_blksz_no_log, tx);
+
+ /*
+ * space_map_truncate() may have reallocated the spacemap object.
+ * If so, update the vdev_ms_array.
+ */
+ if (space_map_object(msp->ms_sm) != object) {
+ object = space_map_object(msp->ms_sm);
+ dmu_write(spa->spa_meta_objset,
+ msp->ms_group->mg_vd->vdev_ms_array, sizeof (uint64_t) *
+ msp->ms_id, sizeof (uint64_t), &object, tx);
+ }
+
+ /*
+ * Note:
+ * When the log space map feature is enabled, each space map will
+ * always have ALLOCS followed by FREES for each sync pass. This is
+ * typically true even when the log space map feature is disabled,
+ * except for the case where a metaslab goes through metaslab_sync()
+ * and gets condensed. In that case the metaslab's space map will have
+ * ALLOCS followed by FREES (due to condensing) followed by ALLOCS
+ * followed by FREES (due to space_map_write() in metaslab_sync()) for
+ * sync pass 1.
+ */
+ range_tree_t *tmp_tree = range_tree_create(NULL, type, NULL, start,
+ shift);
+ range_tree_add(tmp_tree, msp->ms_start, msp->ms_size);
+ space_map_write(sm, tmp_tree, SM_ALLOC, SM_NO_VDEVID, tx);
+ space_map_write(sm, msp->ms_allocatable, SM_FREE, SM_NO_VDEVID, tx);
+ space_map_write(sm, condense_tree, SM_FREE, SM_NO_VDEVID, tx);
+
+ range_tree_vacate(condense_tree, NULL, NULL);
+ range_tree_destroy(condense_tree);
+ range_tree_vacate(tmp_tree, NULL, NULL);
+ range_tree_destroy(tmp_tree);
+ mutex_enter(&msp->ms_lock);
+
+ msp->ms_condensing = B_FALSE;
+ metaslab_flush_update(msp, tx);
+}
+
+/*
+ * Called when the metaslab has been flushed (its own spacemap now reflects
+ * all the contents of the pool-wide spacemap log). Updates the metaslab's
+ * metadata and any pool-wide related log space map data (e.g. summary,
+ * obsolete logs, etc.) to reflect that.
+ */
+static void
+metaslab_flush_update(metaslab_t *msp, dmu_tx_t *tx)
+{
+ metaslab_group_t *mg = msp->ms_group;
+ spa_t *spa = mg->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs));
+ ASSERT(range_tree_is_empty(msp->ms_unflushed_frees));
+
+ /*
+ * Just because a metaslab got flushed, that doesn't mean that
+ * it will pass through metaslab_sync_done(). Thus, make sure to
+ * update ms_synced_length here in case it doesn't.
+ */
+ msp->ms_synced_length = space_map_length(msp->ms_sm);
+
+ /*
+ * We may end up here from metaslab_condense() without the
+ * feature being active. In that case this is a no-op.
+ */
+ if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+ return;
+
+ ASSERT(spa_syncing_log_sm(spa) != NULL);
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT3P(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL), ==, msp);
+
+ VERIFY3U(tx->tx_txg, <=, spa_final_dirty_txg(spa));
+
+ /* update metaslab's position in our flushing tree */
+ uint64_t ms_prev_flushed_txg = metaslab_unflushed_txg(msp);
+ mutex_enter(&spa->spa_flushed_ms_lock);
+ avl_remove(&spa->spa_metaslabs_by_flushed, msp);
+ metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx);
+ avl_add(&spa->spa_metaslabs_by_flushed, msp);
+ mutex_exit(&spa->spa_flushed_ms_lock);
+
+ /* update metaslab counts of spa_log_sm_t nodes */
+ spa_log_sm_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_sm_increment_current_mscount(spa);
+
+ /* cleanup obsolete logs if any */
+ uint64_t log_blocks_before = spa_log_sm_nblocks(spa);
+ spa_cleanup_old_sm_logs(spa, tx);
+ uint64_t log_blocks_after = spa_log_sm_nblocks(spa);
+ VERIFY3U(log_blocks_after, <=, log_blocks_before);
+
+ /* update log space map summary */
+ uint64_t blocks_gone = log_blocks_before - log_blocks_after;
+ spa_log_summary_add_flushed_metaslab(spa);
+ spa_log_summary_decrement_mscount(spa, ms_prev_flushed_txg);
+ spa_log_summary_decrement_blkcount(spa, blocks_gone);
+}
+
+boolean_t
+metaslab_flush(metaslab_t *msp, dmu_tx_t *tx)
+{
+ spa_t *spa = msp->ms_group->mg_vd->vdev_spa;
+
+ ASSERT(MUTEX_HELD(&msp->ms_lock));
+ ASSERT3U(spa_sync_pass(spa), ==, 1);
+ ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+ ASSERT(msp->ms_sm != NULL);
+ ASSERT(metaslab_unflushed_txg(msp) != 0);
+ ASSERT(avl_find(&spa->spa_metaslabs_by_flushed, msp, NULL) != NULL);
+
+ /*
+ * There is nothing wrong with flushing the same metaslab twice, as
+ * this codepath should work in that case. However, the current
+ * flushing scheme makes sure to avoid this situation as we would be
+ * making all these calls without having anything meaningful to write
+ * to disk. We assert this behavior here.
+ */
+ ASSERT3U(metaslab_unflushed_txg(msp), <, dmu_tx_get_txg(tx));
+
+ /*
+ * We cannot flush while loading, because then we would
+ * not load the ms_unflushed_{allocs,frees}.
+ */
+ if (msp->ms_loading)
+ return (B_FALSE);
+
+ metaslab_verify_space(msp, dmu_tx_get_txg(tx));
+ metaslab_verify_weight_and_frag(msp);
+
+ /*
+ * Metaslab condensing is effectively flushing. Therefore if the
+ * metaslab can be condensed we can just condense it instead of
+ * flushing it.
+ *
+ * Note that metaslab_condense() does call metaslab_flush_update()
+ * so we can just return immediately after condensing.
We also + * don't need to care about setting ms_flushing or broadcasting + * ms_flush_cv, even if we temporarily drop the ms_lock in + * metaslab_condense(), as the metaslab is already loaded. + */ + if (msp->ms_loaded && metaslab_should_condense(msp)) { + metaslab_group_t *mg = msp->ms_group; + + /* + * For all histogram operations below refer to the + * comments of metaslab_sync() where we follow a + * similar procedure. + */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + metaslab_condense(msp, tx); + + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + ASSERT(range_tree_is_empty(msp->ms_freed)); + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + + /* + * Since we recreated the histogram (and potentially + * the ms_sm too while condensing) ensure that the + * weight is updated too because we are not guaranteed + * that this metaslab is dirty and will go through + * metaslab_sync_done(). + */ + metaslab_recalculate_weight_and_sort(msp); + return (B_TRUE); + } + + msp->ms_flushing = B_TRUE; + uint64_t sm_len_before = space_map_length(msp->ms_sm); + + mutex_exit(&msp->ms_lock); + space_map_write(msp->ms_sm, msp->ms_unflushed_allocs, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_unflushed_frees, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + uint64_t sm_len_after = space_map_length(msp->ms_sm); + if (zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) { + zfs_dbgmsg("flushing: txg %llu, spa %s, vdev_id %llu, " + "ms_id %llu, unflushed_allocs %llu, unflushed_frees %llu, " + "appended %llu bytes", dmu_tx_get_txg(tx), spa_name(spa), + msp->ms_group->mg_vd->vdev_id, msp->ms_id, + range_tree_space(msp->ms_unflushed_allocs), + range_tree_space(msp->ms_unflushed_frees), + (sm_len_after - sm_len_before)); + } + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_vacate(msp->ms_unflushed_allocs, NULL, NULL); + range_tree_vacate(msp->ms_unflushed_frees, NULL, NULL); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + metaslab_flush_update(msp, tx); + + metaslab_verify_space(msp, dmu_tx_get_txg(tx)); + metaslab_verify_weight_and_frag(msp); + + msp->ms_flushing = B_FALSE; + cv_broadcast(&msp->ms_flush_cv); + return (B_TRUE); +} + +/* + * Write a metaslab to disk in the context of the specified transaction group. + */ +void +metaslab_sync(metaslab_t *msp, uint64_t txg) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + range_tree_t *alloctree = msp->ms_allocating[txg & TXG_MASK]; + dmu_tx_t *tx; + + ASSERT(!vd->vdev_ishole); + + /* + * This metaslab has just been added so there's no work to do now. 
+ */
+ if (msp->ms_freeing == NULL) {
+ ASSERT3P(alloctree, ==, NULL);
+ return;
+ }
+
+ ASSERT3P(alloctree, !=, NULL);
+ ASSERT3P(msp->ms_freeing, !=, NULL);
+ ASSERT3P(msp->ms_freed, !=, NULL);
+ ASSERT3P(msp->ms_checkpointing, !=, NULL);
+ ASSERT3P(msp->ms_trim, !=, NULL);
+
+ /*
+ * Normally, we don't want to process a metaslab if there are no
+ * allocations or frees to perform. However, if the metaslab is being
+ * forced to condense, it's loaded, and we're not beyond the final
+ * dirty txg, we need to let it through. Not condensing beyond the
+ * final dirty txg prevents an issue where metaslabs that need to be
+ * condensed but were loaded for other reasons could cause a panic
+ * here. By only checking the txg in that branch of the conditional,
+ * we preserve the utility of the VERIFY statements in all other
+ * cases.
+ */
+ if (range_tree_is_empty(alloctree) &&
+ range_tree_is_empty(msp->ms_freeing) &&
+ range_tree_is_empty(msp->ms_checkpointing) &&
+ !(msp->ms_loaded && msp->ms_condense_wanted &&
+ txg <= spa_final_dirty_txg(spa)))
+ return;
+
+ VERIFY3U(txg, <=, spa_final_dirty_txg(spa));
+
+ /*
+ * The only state that can actually be changing concurrently
+ * with metaslab_sync() is the metaslab's ms_allocatable. No
+ * other thread can be modifying this txg's alloc, freeing,
+ * freed, or space_map_phys_t. We drop ms_lock whenever we
+ * could call into the DMU, because the DMU can call down to
+ * us (e.g. via zio_free()) at any time.
+ *
+ * The spa_vdev_remove_thread() can be reading metaslab state
+ * concurrently, and it is locked out by the ms_sync_lock.
+ * Note that the ms_lock is insufficient for this, because it
+ * is dropped by space_map_write().
+ */
+ tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg);
+
+ /*
+ * Generate a log space map if one doesn't exist already.
+ */
+ spa_generate_syncing_log_sm(spa, tx);
+
+ if (msp->ms_sm == NULL) {
+ uint64_t new_object = space_map_alloc(mos,
+ spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP) ?
+ zfs_metaslab_sm_blksz_with_log : + zfs_metaslab_sm_blksz_no_log, tx); + VERIFY3U(new_object, !=, 0); + + dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * + msp->ms_id, sizeof (uint64_t), &new_object, tx); + + VERIFY0(space_map_open(&msp->ms_sm, mos, new_object, + msp->ms_start, msp->ms_size, vd->vdev_ashift)); + ASSERT(msp->ms_sm != NULL); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + ASSERT0(metaslab_allocated_space(msp)); + } + + if (metaslab_unflushed_txg(msp) == 0 && + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT(spa_syncing_log_sm(spa) != NULL); + + metaslab_set_unflushed_txg(msp, spa_syncing_txg(spa), tx); + spa_log_sm_increment_current_mscount(spa); + spa_log_summary_add_flushed_metaslab(spa); + + ASSERT(msp->ms_sm != NULL); + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, msp); + mutex_exit(&spa->spa_flushed_ms_lock); + + ASSERT(range_tree_is_empty(msp->ms_unflushed_allocs)); + ASSERT(range_tree_is_empty(msp->ms_unflushed_frees)); + } + + if (!range_tree_is_empty(msp->ms_checkpointing) && + vd->vdev_checkpoint_sm == NULL) { + ASSERT(spa_has_checkpoint(spa)); + + uint64_t new_object = space_map_alloc(mos, + zfs_vdev_standard_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_checkpoint_sm, + mos, new_object, 0, vd->vdev_asize, vd->vdev_ashift)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * We save the space map object as an entry in vdev_top_zap + * so it can be retrieved when the pool is reopened after an + * export or through zdb. + */ + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, + sizeof (new_object), 1, &new_object, tx)); + } + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * Note: metaslab_condense() clears the space map's histogram. + * Therefore we must verify and remove this histogram before + * condensing. + */ + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + metaslab_group_histogram_remove(mg, msp); + + if (spa->spa_sync_pass == 1 && msp->ms_loaded && + metaslab_should_condense(msp)) + metaslab_condense(msp, tx); + + /* + * We'll be going to disk to sync our space accounting, thus we + * drop the ms_lock during that time so allocations coming from + * open-context (ZIL) for future TXGs do not block. 
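+ *
+ * A sketch of the log space map bookkeeping below: this txg's
+ * allocs and frees are appended to the pool-wide log space map and
+ * folded into ms_unflushed_{allocs,frees} with
+ * range_tree_remove_xor_add(), so e.g. an allocation first cancels
+ * any pending unflushed free of the same range and only the
+ * remainder is recorded in ms_unflushed_allocs.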
+ */ + mutex_exit(&msp->ms_lock); + space_map_t *log_sm = spa_syncing_log_sm(spa); + if (log_sm != NULL) { + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(log_sm, alloctree, SM_ALLOC, + vd->vdev_id, tx); + space_map_write(log_sm, msp->ms_freeing, SM_FREE, + vd->vdev_id, tx); + mutex_enter(&msp->ms_lock); + + ASSERT3U(spa->spa_unflushed_stats.sus_memused, >=, + metaslab_unflushed_changes_memused(msp)); + spa->spa_unflushed_stats.sus_memused -= + metaslab_unflushed_changes_memused(msp); + range_tree_remove_xor_add(alloctree, + msp->ms_unflushed_frees, msp->ms_unflushed_allocs); + range_tree_remove_xor_add(msp->ms_freeing, + msp->ms_unflushed_allocs, msp->ms_unflushed_frees); + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(msp); + } else { + ASSERT(!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)); + + space_map_write(msp->ms_sm, alloctree, SM_ALLOC, + SM_NO_VDEVID, tx); + space_map_write(msp->ms_sm, msp->ms_freeing, SM_FREE, + SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + } + + msp->ms_allocated_space += range_tree_space(alloctree); + ASSERT3U(msp->ms_allocated_space, >=, + range_tree_space(msp->ms_freeing)); + msp->ms_allocated_space -= range_tree_space(msp->ms_freeing); + + if (!range_tree_is_empty(msp->ms_checkpointing)) { + ASSERT(spa_has_checkpoint(spa)); + ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL); + + /* + * Since we are doing writes to disk and the ms_checkpointing + * tree won't be changing during that time, we drop the + * ms_lock while writing to the checkpoint space map, for the + * same reason mentioned above. + */ + mutex_exit(&msp->ms_lock); + space_map_write(vd->vdev_checkpoint_sm, + msp->ms_checkpointing, SM_FREE, SM_NO_VDEVID, tx); + mutex_enter(&msp->ms_lock); + + spa->spa_checkpoint_info.sci_dspace += + range_tree_space(msp->ms_checkpointing); + vd->vdev_stat.vs_checkpoint_space += + range_tree_space(msp->ms_checkpointing); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, ==, + -space_map_allocated(vd->vdev_checkpoint_sm)); + + range_tree_vacate(msp->ms_checkpointing, NULL, NULL); + } + + if (msp->ms_loaded) { + /* + * When the space map is loaded, we have an accurate + * histogram in the range tree. This gives us an opportunity + * to bring the space map's histogram up-to-date so we clear + * it first before updating it. + */ + space_map_histogram_clear(msp->ms_sm); + space_map_histogram_add(msp->ms_sm, msp->ms_allocatable, tx); + + /* + * Since we've cleared the histogram we need to add back + * any free space that has already been processed, plus + * any deferred space. This allows the on-disk histogram + * to accurately reflect all free space even if some space + * is not yet available for allocation (i.e. deferred). + */ + space_map_histogram_add(msp->ms_sm, msp->ms_freed, tx); + + /* + * Add back any deferred free space that has not been + * added back into the in-core free tree yet. This will + * ensure that we don't end up with a space map histogram + * that is completely empty unless the metaslab is fully + * allocated. + */ + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + space_map_histogram_add(msp->ms_sm, + msp->ms_defer[t], tx); + } + } + + /* + * Always add the free space from this sync pass to the space + * map histogram. We want to make sure that the on-disk histogram + * accounts for all free space. If the space map is not loaded, + * then we will lose some accuracy but will correct it the next + * time we load the space map. 
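+ * (The inaccuracy is bounded: the next time the metaslab is loaded, + * the ms_loaded branch above clears the on-disk histogram and rebuilds + * it from the accurate in-core range trees.)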
+ */ + space_map_histogram_add(msp->ms_sm, msp->ms_freeing, tx); + metaslab_aux_histograms_update(msp); + + metaslab_group_histogram_add(mg, msp); + metaslab_group_histogram_verify(mg); + metaslab_class_histogram_verify(mg->mg_class); + + /* + * For sync pass 1, we avoid traversing this txg's free range tree + * and instead will just swap the pointers for freeing and freed. + * We can safely do this since the freed_tree is guaranteed to be + * empty on the initial pass. + * + * Keep in mind that even if we are currently using a log spacemap + * we want current frees to end up in the ms_allocatable (but not + * get appended to the ms_sm) so their ranges can be reused as usual. + */ + if (spa_sync_pass(spa) == 1) { + range_tree_swap(&msp->ms_freeing, &msp->ms_freed); + ASSERT0(msp->ms_allocated_this_txg); + } else { + range_tree_vacate(msp->ms_freeing, + range_tree_add, msp->ms_freed); + } + msp->ms_allocated_this_txg += range_tree_space(alloctree); + range_tree_vacate(alloctree, NULL, NULL); + + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_allocating[TXG_CLEAN(txg) + & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + + mutex_exit(&msp->ms_lock); + + /* + * Verify that the space map object ID has been recorded in the + * vdev_ms_array. + */ + uint64_t object; + VERIFY0(dmu_read(mos, vd->vdev_ms_array, + msp->ms_id * sizeof (uint64_t), sizeof (uint64_t), &object, 0)); + VERIFY3U(object, ==, space_map_object(msp->ms_sm)); + + mutex_exit(&msp->ms_sync_lock); + dmu_tx_commit(tx); +} + +static void +metaslab_evict(metaslab_t *msp, uint64_t txg) +{ + if (!msp->ms_loaded || msp->ms_disabled != 0) + return; + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) { + VERIFY0(range_tree_space( + msp->ms_allocating[(txg + t) & TXG_MASK])); + } + if (msp->ms_allocator != -1) + metaslab_passivate(msp, msp->ms_weight & ~METASLAB_ACTIVE_MASK); + + if (!metaslab_debug_unload) + metaslab_unload(msp); +} + +/* + * Called after a transaction group has completely synced to mark + * all of the metaslab's free space as usable. + */ +void +metaslab_sync_done(metaslab_t *msp, uint64_t txg) +{ + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + spa_t *spa = vd->vdev_spa; + range_tree_t **defer_tree; + int64_t alloc_delta, defer_delta; + boolean_t defer_allowed = B_TRUE; + + ASSERT(!vd->vdev_ishole); + + mutex_enter(&msp->ms_lock); + + /* + * If this metaslab is just becoming available, initialize its + * range trees and add its capacity to the vdev. 
+ */ + if (msp->ms_freed == NULL) { + range_seg_type_t type; + uint64_t shift, start; + type = metaslab_calculate_range_tree_type(vd, msp, &start, + &shift); + + for (int t = 0; t < TXG_SIZE; t++) { + ASSERT(msp->ms_allocating[t] == NULL); + + msp->ms_allocating[t] = range_tree_create(NULL, type, + NULL, start, shift); + } + + ASSERT3P(msp->ms_freeing, ==, NULL); + msp->ms_freeing = range_tree_create(NULL, type, NULL, start, + shift); + + ASSERT3P(msp->ms_freed, ==, NULL); + msp->ms_freed = range_tree_create(NULL, type, NULL, start, + shift); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) { + ASSERT3P(msp->ms_defer[t], ==, NULL); + msp->ms_defer[t] = range_tree_create(NULL, type, NULL, + start, shift); + } + + ASSERT3P(msp->ms_checkpointing, ==, NULL); + msp->ms_checkpointing = range_tree_create(NULL, type, NULL, + start, shift); + + ASSERT3P(msp->ms_unflushed_allocs, ==, NULL); + msp->ms_unflushed_allocs = range_tree_create(NULL, type, NULL, + start, shift); + + metaslab_rt_arg_t *mrap = kmem_zalloc(sizeof (*mrap), KM_SLEEP); + mrap->mra_bt = &msp->ms_unflushed_frees_by_size; + mrap->mra_floor_shift = metaslab_by_size_min_shift; + ASSERT3P(msp->ms_unflushed_frees, ==, NULL); + msp->ms_unflushed_frees = range_tree_create(&metaslab_rt_ops, + type, mrap, start, shift); + + metaslab_space_update(vd, mg->mg_class, 0, 0, msp->ms_size); + } + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + + defer_tree = &msp->ms_defer[txg % TXG_DEFER_SIZE]; + + uint64_t free_space = metaslab_class_get_space(spa_normal_class(spa)) - + metaslab_class_get_alloc(spa_normal_class(spa)); + if (free_space <= spa_get_slop_space(spa) || vd->vdev_removing) { + defer_allowed = B_FALSE; + } + + defer_delta = 0; + alloc_delta = msp->ms_allocated_this_txg - + range_tree_space(msp->ms_freed); + + if (defer_allowed) { + defer_delta = range_tree_space(msp->ms_freed) - + range_tree_space(*defer_tree); + } else { + defer_delta -= range_tree_space(*defer_tree); + } + metaslab_space_update(vd, mg->mg_class, alloc_delta + defer_delta, + defer_delta, 0); + + if (spa_syncing_log_sm(spa) == NULL) { + /* + * If there's a metaslab_load() in progress and we don't have + * a log space map, it means that we probably wrote to the + * metaslab's space map. If this is the case, we need to + * make sure that we wait for the load to complete so that we + * have a consistent view of the in-core side of the metaslab. + */ + metaslab_load_wait(msp); + } else { + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + } + + /* + * When auto-trimming is enabled, free ranges which are added to + * ms_allocatable are also added to ms_trim. The ms_trim tree is + * periodically consumed by the vdev_autotrim_thread() which issues + * trims for all ranges and then vacates the tree. The ms_trim tree + * can be discarded at any time with the sole consequence of recent + * frees not being trimmed. + */ + if (spa_get_autotrim(spa) == SPA_AUTOTRIM_ON) { + range_tree_walk(*defer_tree, range_tree_add, msp->ms_trim); + if (!defer_allowed) { + range_tree_walk(msp->ms_freed, range_tree_add, + msp->ms_trim); + } + } else { + range_tree_vacate(msp->ms_trim, NULL, NULL); + } + + /* + * Move the frees from the defer_tree back to the free + * range tree (if it's loaded). Swap the freed_tree and + * the defer_tree -- this is safe to do because we've + * just emptied out the defer_tree. + */ + range_tree_vacate(*defer_tree, + msp->ms_loaded ? range_tree_add : NULL, msp->ms_allocatable); + if (defer_allowed) { + range_tree_swap(&msp->ms_freed, defer_tree); + } else { + range_tree_vacate(msp->ms_freed, + msp->ms_loaded ? range_tree_add : NULL, + msp->ms_allocatable); + } + + msp->ms_synced_length = space_map_length(msp->ms_sm); + + msp->ms_deferspace += defer_delta; + ASSERT3S(msp->ms_deferspace, >=, 0); + ASSERT3S(msp->ms_deferspace, <=, msp->ms_size); + if (msp->ms_deferspace != 0) { + /* + * Keep syncing this metaslab until all deferred frees + * are back in circulation. + */ + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + } + metaslab_aux_histograms_update_done(msp, defer_allowed); + + if (msp->ms_new) { + msp->ms_new = B_FALSE; + mutex_enter(&mg->mg_lock); + mg->mg_ms_ready++; + mutex_exit(&mg->mg_lock); + } + + /* + * Re-sort metaslab within its group now that we've adjusted + * its allocatable space. + */ + metaslab_recalculate_weight_and_sort(msp); + + ASSERT0(range_tree_space(msp->ms_allocating[txg & TXG_MASK])); + ASSERT0(range_tree_space(msp->ms_freeing)); + ASSERT0(range_tree_space(msp->ms_freed)); + ASSERT0(range_tree_space(msp->ms_checkpointing)); + msp->ms_allocating_total -= msp->ms_allocated_this_txg; + msp->ms_allocated_this_txg = 0; + mutex_exit(&msp->ms_lock); +} + +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_class->mc_spa; + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + metaslab_group_alloc_update(mg); + mg->mg_fragmentation = metaslab_group_fragmentation(mg); + + /* + * Preload the next potential metaslabs but only on active + * metaslab groups. We can get into a state where the metaslab + * is no longer active since we dirty metaslabs as we remove a + * device, thus potentially making the metaslab group eligible + * for preloading. + */ + if (mg->mg_activation_count > 0) { + metaslab_group_preload(mg); + } + spa_config_exit(spa, SCL_ALLOC, FTAG); +} + +/* + * When writing a ditto block (i.e. more than one DVA for a given BP) on + * the same vdev as an existing DVA of this BP, then try to allocate it + * on a different metaslab than existing DVAs (i.e. a unique metaslab). + */ +static boolean_t +metaslab_is_unique(metaslab_t *msp, dva_t *dva) +{ + uint64_t dva_ms_id; + + if (DVA_GET_ASIZE(dva) == 0) + return (B_TRUE); + + if (msp->ms_group->mg_vd->vdev_id != DVA_GET_VDEV(dva)) + return (B_TRUE); + + dva_ms_id = DVA_GET_OFFSET(dva) >> msp->ms_group->mg_vd->vdev_ms_shift; + + return (msp->ms_id != dva_ms_id); +} + +/* + * ========================================================================== + * Metaslab allocation tracing facility + * ========================================================================== + */ +#ifdef _METASLAB_TRACING + +/* + * Add an allocation trace element to the allocation tracing list. + */ +static void +metaslab_trace_add(zio_alloc_list_t *zal, metaslab_group_t *mg, + metaslab_t *msp, uint64_t psize, uint32_t dva_id, uint64_t offset, + int allocator) +{ + metaslab_alloc_trace_t *mat; + + if (!metaslab_trace_enabled) + return; + + /* + * When the tracing list reaches its maximum we remove + * the second element in the list before adding a new one. + * By removing the second element we preserve the original + * entry as a clue to what allocation steps have already been + * performed.
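+ * For instance, with a limit of four entries, adding E to the list + * [A, B, C, D] yields [A, C, D, E]: the original first step A is + * always retained while the middle of the history is dropped.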
+ */ + if (zal->zal_size == metaslab_trace_max_entries) { + metaslab_alloc_trace_t *mat_next; +#ifdef ZFS_DEBUG + panic("too many entries in allocation list"); +#endif + METASLABSTAT_BUMP(metaslabstat_trace_over_limit); + zal->zal_size--; + mat_next = list_next(&zal->zal_list, list_head(&zal->zal_list)); + list_remove(&zal->zal_list, mat_next); + kmem_cache_free(metaslab_alloc_trace_cache, mat_next); + } + + mat = kmem_cache_alloc(metaslab_alloc_trace_cache, KM_SLEEP); + list_link_init(&mat->mat_list_node); + mat->mat_mg = mg; + mat->mat_msp = msp; + mat->mat_size = psize; + mat->mat_dva_id = dva_id; + mat->mat_offset = offset; + mat->mat_weight = 0; + mat->mat_allocator = allocator; + + if (msp != NULL) + mat->mat_weight = msp->ms_weight; + + /* + * The list is part of the zio so locking is not required. Only + * a single thread will perform allocations for a given zio. + */ + list_insert_tail(&zal->zal_list, mat); + zal->zal_size++; + + ASSERT3U(zal->zal_size, <=, metaslab_trace_max_entries); +} + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ + list_create(&zal->zal_list, sizeof (metaslab_alloc_trace_t), + offsetof(metaslab_alloc_trace_t, mat_list_node)); + zal->zal_size = 0; +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ + metaslab_alloc_trace_t *mat; + + while ((mat = list_remove_head(&zal->zal_list)) != NULL) + kmem_cache_free(metaslab_alloc_trace_cache, mat); + list_destroy(&zal->zal_list); + zal->zal_size = 0; +} +#else + +#define metaslab_trace_add(zal, mg, msp, psize, id, off, alloc) + +void +metaslab_trace_init(zio_alloc_list_t *zal) +{ +} + +void +metaslab_trace_fini(zio_alloc_list_t *zal) +{ +} + +#endif /* _METASLAB_TRACING */ + +/* + * ========================================================================== + * Metaslab block operations + * ========================================================================== + */ + +static void +metaslab_group_alloc_increment(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + (flags & METASLAB_DONT_THROTTLE)) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + (void) zfs_refcount_add(&mga->mga_alloc_queue_depth, tag); +} + +static void +metaslab_group_increment_qdepth(metaslab_group_t *mg, int allocator) +{ + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + uint64_t max = mg->mg_max_alloc_queue_depth; + uint64_t cur = mga->mga_cur_max_alloc_queue_depth; + while (cur < max) { + if (atomic_cas_64(&mga->mga_cur_max_alloc_queue_depth, + cur, cur + 1) == cur) { + atomic_inc_64( + &mg->mg_class->mc_alloc_max_slots[allocator]); + return; + } + cur = mga->mga_cur_max_alloc_queue_depth; + } +} + +void +metaslab_group_alloc_decrement(spa_t *spa, uint64_t vdev, void *tag, int flags, + int allocator, boolean_t io_complete) +{ + if (!(flags & METASLAB_ASYNC_ALLOC) || + (flags & METASLAB_DONT_THROTTLE)) + return; + + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + if (!mg->mg_class->mc_alloc_throttle_enabled) + return; + + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + (void) zfs_refcount_remove(&mga->mga_alloc_queue_depth, tag); + if (io_complete) + metaslab_group_increment_qdepth(mg, allocator); +} + +void +metaslab_group_alloc_verify(spa_t *spa, const blkptr_t *bp, void *tag, + int allocator) +{ +#ifdef ZFS_DEBUG + const dva_t *dva = bp->blk_dva; + int ndvas = 
BP_GET_NDVAS(bp); + + for (int d = 0; d < ndvas; d++) { + uint64_t vdev = DVA_GET_VDEV(&dva[d]); + metaslab_group_t *mg = vdev_lookup_top(spa, vdev)->vdev_mg; + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + VERIFY(zfs_refcount_not_held(&mga->mga_alloc_queue_depth, tag)); + } +#endif +} + +static uint64_t +metaslab_block_alloc(metaslab_t *msp, uint64_t size, uint64_t txg) +{ + uint64_t start; + range_tree_t *rt = msp->ms_allocatable; + metaslab_class_t *mc = msp->ms_group->mg_class; + + ASSERT(MUTEX_HELD(&msp->ms_lock)); + VERIFY(!msp->ms_condensing); + VERIFY0(msp->ms_disabled); + + start = mc->mc_ops->msop_alloc(msp, size); + if (start != -1ULL) { + metaslab_group_t *mg = msp->ms_group; + vdev_t *vd = mg->mg_vd; + + VERIFY0(P2PHASE(start, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(rt) - size, <=, msp->ms_size); + range_tree_remove(rt, start, size); + range_tree_clear(msp->ms_trim, start, size); + + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + vdev_dirty(mg->mg_vd, VDD_METASLAB, msp, txg); + + range_tree_add(msp->ms_allocating[txg & TXG_MASK], start, size); + msp->ms_allocating_total += size; + + /* Track the last successful allocation */ + msp->ms_alloc_txg = txg; + metaslab_verify_space(msp, txg); + } + + /* + * Now that we've attempted the allocation we need to update the + * metaslab's maximum block size since it may have changed. + */ + msp->ms_max_size = metaslab_largest_allocatable(msp); + return (start); +} + +/* + * Find the metaslab with the highest weight that is less than what we've + * already tried. In the common case, this means that we will examine each + * metaslab at most once. Note that concurrent callers could reorder metaslabs + * by activation/passivation once we have dropped the mg_lock. If a metaslab is + * activated by another thread, and we fail to allocate from the metaslab we + * have selected, we may not try the newly-activated metaslab, and instead + * activate another metaslab. This is not optimal, but generally does not cause + * any problems (a possible exception being if every metaslab is completely full + * except for the newly-activated metaslab which we fail to examine). + */ +static metaslab_t * +find_valid_metaslab(metaslab_group_t *mg, uint64_t activation_weight, + dva_t *dva, int d, boolean_t want_unique, uint64_t asize, int allocator, + boolean_t try_hard, zio_alloc_list_t *zal, metaslab_t *search, + boolean_t *was_active) +{ + avl_index_t idx; + avl_tree_t *t = &mg->mg_metaslab_tree; + metaslab_t *msp = avl_find(t, search, &idx); + if (msp == NULL) + msp = avl_nearest(t, idx, AVL_AFTER); + + for (; msp != NULL; msp = AVL_NEXT(t, msp)) { + int i; + if (!metaslab_should_allocate(msp, asize, try_hard)) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL, allocator); + continue; + } + + /* + * If the selected metaslab is condensing or disabled, + * skip it. + */ + if (msp->ms_condensing || msp->ms_disabled > 0) + continue; + + *was_active = msp->ms_allocator != -1; + /* + * If we're activating as primary, this is our first allocation + * from this disk, so we don't need to check how close we are. + * If the metaslab under consideration was already active, + * we're getting desperate enough to steal another allocator's + * metaslab, so we still don't care about distances. 
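+ * In other words, only a non-primary activation of an inactive + * metaslab falls through to the DVA distance check below.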
+ */ + if (activation_weight == METASLAB_WEIGHT_PRIMARY || *was_active) + break; + + for (i = 0; i < d; i++) { + if (want_unique && + !metaslab_is_unique(msp, &dva[i])) + break; /* try another metaslab */ + } + if (i == d) + break; + } + + if (msp != NULL) { + search->ms_weight = msp->ms_weight; + search->ms_start = msp->ms_start + 1; + search->ms_allocator = msp->ms_allocator; + search->ms_primary = msp->ms_primary; + } + return (msp); +} + +static void +metaslab_active_mask_verify(metaslab_t *msp) +{ + ASSERT(MUTEX_HELD(&msp->ms_lock)); + + if ((zfs_flags & ZFS_DEBUG_METASLAB_VERIFY) == 0) + return; + + if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) + return; + + if (msp->ms_weight & METASLAB_WEIGHT_PRIMARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_SECONDARY) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_CLAIM); + VERIFY3S(msp->ms_allocator, !=, -1); + VERIFY(!msp->ms_primary); + return; + } + + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM) { + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_PRIMARY); + VERIFY0(msp->ms_weight & METASLAB_WEIGHT_SECONDARY); + VERIFY3S(msp->ms_allocator, ==, -1); + return; + } +} + +/* ARGSUSED */ +static uint64_t +metaslab_group_alloc_normal(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) +{ + metaslab_t *msp = NULL; + uint64_t offset = -1ULL; + + uint64_t activation_weight = METASLAB_WEIGHT_PRIMARY; + for (int i = 0; i < d; i++) { + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + activation_weight = METASLAB_WEIGHT_SECONDARY; + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { + activation_weight = METASLAB_WEIGHT_CLAIM; + break; + } + } + + /* + * If we don't have enough metaslabs active to fill the entire array, we + * just use the 0th slot. + */ + if (mg->mg_ms_ready < mg->mg_allocators * 3) + allocator = 0; + metaslab_group_allocator_t *mga = &mg->mg_allocator[allocator]; + + ASSERT3U(mg->mg_vd->vdev_ms_count, >=, 2); + + metaslab_t *search = kmem_alloc(sizeof (*search), KM_SLEEP); + search->ms_weight = UINT64_MAX; + search->ms_start = 0; + /* + * At the end of the metaslab tree are the already-active metaslabs, + * first the primaries, then the secondaries. When we resume searching + * through the tree, we need to consider ms_allocator and ms_primary so + * we start in the location right after where we left off, and don't + * accidentally loop forever considering the same metaslabs. + */ + search->ms_allocator = -1; + search->ms_primary = B_TRUE; + for (;;) { + boolean_t was_active = B_FALSE; + + mutex_enter(&mg->mg_lock); + + if (activation_weight == METASLAB_WEIGHT_PRIMARY && + mga->mga_primary != NULL) { + msp = mga->mga_primary; + + /* + * Even though we don't hold the ms_lock for the + * primary metaslab, those fields should not + * change while we hold the mg_lock. Thus it is + * safe to make assertions on them. 
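+ * (ms_allocator and ms_primary are only updated with the mg_lock + * held, e.g. by the activation and passivation paths, which is what + * makes reading them here stable.)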
+ */ + ASSERT(msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); + } else if (activation_weight == METASLAB_WEIGHT_SECONDARY && + mga->mga_secondary != NULL) { + msp = mga->mga_secondary; + + /* + * See comment above about the similar assertions + * for the primary metaslab. + */ + ASSERT(!msp->ms_primary); + ASSERT3S(msp->ms_allocator, ==, allocator); + ASSERT(msp->ms_loaded); + + was_active = B_TRUE; + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); + } else { + msp = find_valid_metaslab(mg, activation_weight, dva, d, + want_unique, asize, allocator, try_hard, zal, + search, &was_active); + } + + mutex_exit(&mg->mg_lock); + if (msp == NULL) { + kmem_free(search, sizeof (*search)); + return (-1ULL); + } + mutex_enter(&msp->ms_lock); + + metaslab_active_mask_verify(msp); + + /* + * This code is disabled because of issues with + * tracepoints in non-GPL kernel modules. + */ +#if 0 + DTRACE_PROBE3(ms__activation__attempt, + metaslab_t *, msp, uint64_t, activation_weight, + boolean_t, was_active); +#endif + + /* + * Ensure that the metaslab we have selected is still + * capable of handling our request. It's possible that + * another thread may have changed the weight while we + * were blocked on the metaslab lock. We check the + * active status first to see if we need to select + * a new metaslab. + */ + if (was_active && !(msp->ms_weight & METASLAB_ACTIVE_MASK)) { + ASSERT3S(msp->ms_allocator, ==, -1); + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If the metaslab was activated for another allocator + * while we were waiting on the ms_lock above, or it's + * a primary and we're seeking a secondary (or vice versa), + * we go back and select a new metaslab. + */ + if (!was_active && (msp->ms_weight & METASLAB_ACTIVE_MASK) && + (msp->ms_allocator != -1) && + (msp->ms_allocator != allocator || ((activation_weight == + METASLAB_WEIGHT_PRIMARY) != msp->ms_primary))) { + ASSERT(msp->ms_loaded); + ASSERT((msp->ms_weight & METASLAB_WEIGHT_CLAIM) || + msp->ms_allocator != -1); + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * This metaslab was used for claiming regions allocated + * by the ZIL during pool import. Once these regions are + * claimed we don't need to keep the CLAIM bit set + * anymore. Passivate this metaslab to zero its activation + * mask. + */ + if (msp->ms_weight & METASLAB_WEIGHT_CLAIM && + activation_weight != METASLAB_WEIGHT_CLAIM) { + ASSERT(msp->ms_loaded); + ASSERT3S(msp->ms_allocator, ==, -1); + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_WEIGHT_CLAIM); + mutex_exit(&msp->ms_lock); + continue; + } + + metaslab_set_selected_txg(msp, txg); + + int activation_error = + metaslab_activate(msp, allocator, activation_weight); + metaslab_active_mask_verify(msp); + + /* + * If the metaslab was activated by another thread for + * another allocator or activation_weight (EBUSY), or it + * failed because another metaslab was assigned as primary + * for this allocator (EEXIST), we continue using this + * metaslab for our allocation, rather than going on to a + * worse metaslab (we waited for that metaslab to be loaded + * after all). + * + * If the activation failed due to an I/O error or ENOSPC we + * skip to the next metaslab.
+ */ + boolean_t activated; + if (activation_error == 0) { + activated = B_TRUE; + } else if (activation_error == EBUSY || + activation_error == EEXIST) { + activated = B_FALSE; + } else { + mutex_exit(&msp->ms_lock); + continue; + } + ASSERT(msp->ms_loaded); + + /* + * Now that we have the lock, recheck to see if we should + * continue to use this metaslab for this allocation. The + * metaslab is now loaded so metaslab_should_allocate() + * can accurately determine if the allocation attempt should + * proceed. + */ + if (!metaslab_should_allocate(msp, asize, try_hard)) { + /* Passivate this metaslab and select a new one. */ + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_TOO_SMALL, allocator); + goto next; + } + + /* + * If this metaslab is currently condensing then pick again + * as we can't manipulate this metaslab until it's committed + * to disk. If this metaslab is being initialized, we shouldn't + * allocate from it since the allocated region might be + * overwritten after allocation. + */ + if (msp->ms_condensing) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_CONDENSING, allocator); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } + mutex_exit(&msp->ms_lock); + continue; + } else if (msp->ms_disabled > 0) { + metaslab_trace_add(zal, mg, msp, asize, d, + TRACE_DISABLED, allocator); + if (activated) { + metaslab_passivate(msp, msp->ms_weight & + ~METASLAB_ACTIVE_MASK); + } + mutex_exit(&msp->ms_lock); + continue; + } + + offset = metaslab_block_alloc(msp, asize, txg); + metaslab_trace_add(zal, mg, msp, asize, d, offset, allocator); + + if (offset != -1ULL) { + /* Proactively passivate the metaslab, if needed */ + if (activated) + metaslab_segment_may_passivate(msp); + break; + } +next: + ASSERT(msp->ms_loaded); + + /* + * This code is disabled because of issues with + * tracepoints in non-GPL kernel modules. + */ +#if 0 + DTRACE_PROBE2(ms__alloc__failure, metaslab_t *, msp, + uint64_t, asize); +#endif + + /* + * We were unable to allocate from this metaslab so determine + * a new weight for this metaslab. Now that we have loaded + * the metaslab we can provide a better hint to the metaslab + * selector. + * + * For space-based metaslabs, we use the maximum block size. + * This information is only available when the metaslab + * is loaded and is more accurate than the generic free + * space weight that was calculated by metaslab_weight(). + * This information allows us to quickly compare the maximum + * available allocation in the metaslab to the allocation + * size being requested. + * + * For segment-based metaslabs, determine the new weight + * based on the highest bucket in the range tree. We + * explicitly use the loaded segment weight (i.e. the range + * tree histogram) since it contains the space that is + * currently available for allocation and is accurate + * even within a sync pass. + */ + uint64_t weight; + if (WEIGHT_IS_SPACEBASED(msp->ms_weight)) { + weight = metaslab_largest_allocatable(msp); + WEIGHT_SET_SPACEBASED(weight); + } else { + weight = metaslab_weight_from_range_tree(msp); + } + + if (activated) { + metaslab_passivate(msp, weight); + } else { + /* + * For the case where we use the metaslab that is + * active for another allocator we want to make + * sure that we retain the activation mask. + * + * Note that we could attempt to use something like + * metaslab_recalculate_weight_and_sort() that + * retains the activation mask here.
That function + * uses metaslab_weight() to set the weight though + * which is not as accurate as the calculations + * above. + */ + weight |= msp->ms_weight & METASLAB_ACTIVE_MASK; + metaslab_group_sort(mg, msp, weight); + } + metaslab_active_mask_verify(msp); + + /* + * We have just failed an allocation attempt, check + * that metaslab_should_allocate() agrees. Otherwise, + * we may end up in an infinite loop retrying the same + * metaslab. + */ + ASSERT(!metaslab_should_allocate(msp, asize, try_hard)); + + mutex_exit(&msp->ms_lock); + } + mutex_exit(&msp->ms_lock); + kmem_free(search, sizeof (*search)); + return (offset); +} + +static uint64_t +metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, + uint64_t asize, uint64_t txg, boolean_t want_unique, dva_t *dva, int d, + int allocator, boolean_t try_hard) +{ + uint64_t offset; + ASSERT(mg->mg_initialized); + + offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, + dva, d, allocator, try_hard); + + mutex_enter(&mg->mg_lock); + if (offset == -1ULL) { + mg->mg_failed_allocations++; + metaslab_trace_add(zal, mg, NULL, asize, d, + TRACE_GROUP_FAILURE, allocator); + if (asize == SPA_GANGBLOCKSIZE) { + /* + * This metaslab group was unable to allocate + * the minimum gang block size so it must be out of + * space. We must notify the allocation throttle + * to start skipping allocation attempts to this + * metaslab group until more space becomes available. + * Note: this failure cannot be caused by the + * allocation throttle since the allocation throttle + * is only responsible for skipping devices and + * not failing block allocations. + */ + mg->mg_no_free_space = B_TRUE; + } + } + mg->mg_allocations++; + mutex_exit(&mg->mg_lock); + return (offset); +} + +/* + * Allocate a block for the specified i/o. + */ +int +metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, + dva_t *dva, int d, dva_t *hintdva, uint64_t txg, int flags, + zio_alloc_list_t *zal, int allocator) +{ + metaslab_group_t *mg, *fast_mg, *rotor; + vdev_t *vd; + boolean_t try_hard = B_FALSE; + + ASSERT(!DVA_IS_VALID(&dva[d])); + + /* + * For testing, make some blocks above a certain size be gang blocks. + * This will result in more split blocks when using device removal, + * and a large number of split blocks coupled with ztest-induced + * damage can result in extremely long reconstruction times. This + * will also test spilling from special to normal. + */ + if (psize >= metaslab_force_ganging && (spa_get_random(100) < 3)) { + metaslab_trace_add(zal, NULL, NULL, psize, d, TRACE_FORCE_GANG, + allocator); + return (SET_ERROR(ENOSPC)); + } + + /* + * Start at the rotor and loop through all mgs until we find something. + * Note that there's no locking on mc_rotor or mc_aliquot because + * nothing actually breaks if we miss a few updates -- we just won't + * allocate quite as evenly. It all balances out over time. + * + * If we are doing ditto or log blocks, try to spread them across + * consecutive vdevs. If we're forced to reuse a vdev before we've + * allocated all of our ditto blocks, then try and spread them out on + * that vdev as much as possible. If it turns out to not be possible, + * gradually lower our standards until anything becomes acceptable. + * Also, allocating on consecutive vdevs (as opposed to random vdevs) + * gives us hope of containing our fault domains to something we're + * able to reason about. Otherwise, any two top-level vdev failures + * will guarantee the loss of data. 
With consecutive allocation, + * only two adjacent top-level vdev failures will result in data loss. + * + * If we are doing gang blocks (hintdva is non-NULL), try to keep + * ourselves on the same vdev as our gang block header. That + * way, we can hope for locality in vdev_cache, plus it makes our + * fault domains something tractable. + */ + if (hintdva) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); + + /* + * It's possible the vdev we're using as the hint no + * longer exists or its mg has been closed (e.g. by + * device removal). Consult the rotor when + * all else fails. + */ + if (vd != NULL && vd->vdev_mg != NULL) { + mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } + } else if (d != 0) { + vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); + mg = vd->vdev_mg->mg_next; + } else if (flags & METASLAB_FASTWRITE) { + mg = fast_mg = mc->mc_rotor; + + do { + if (fast_mg->mg_vd->vdev_pending_fastwrite < + mg->mg_vd->vdev_pending_fastwrite) + mg = fast_mg; + } while ((fast_mg = fast_mg->mg_next) != mc->mc_rotor); + + } else { + ASSERT(mc->mc_rotor != NULL); + mg = mc->mc_rotor; + } + + /* + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. + */ + if (mg->mg_class != mc || mg->mg_activation_count <= 0) + mg = mc->mc_rotor; + + rotor = mg; +top: + do { + boolean_t allocatable; + + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; + + /* + * Don't allocate from faulted devices. + */ + if (try_hard) { + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + spa_config_exit(spa, SCL_ZIO, FTAG); + } else { + allocatable = vdev_allocatable(vd); + } + + /* + * Determine if the selected metaslab group is eligible + * for allocations. If we're ganging then don't allow + * this metaslab group to skip allocations since that would + * inadvertently return ENOSPC and suspend the pool + * even though space is still available. + */ + if (allocatable && !GANG_ALLOCATION(flags) && !try_hard) { + allocatable = metaslab_group_allocatable(mg, rotor, + psize, allocator, d); + } + + if (!allocatable) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_NOT_ALLOCATABLE, allocator); + goto next; + } + + ASSERT(mg->mg_initialized); + + /* + * Avoid writing single-copy data to a failing, + * non-redundant vdev, unless we've already tried all + * other vdevs. + */ + if ((vd->vdev_stat.vs_write_errors > 0 || + vd->vdev_state < VDEV_STATE_HEALTHY) && + d == 0 && !try_hard && vd->vdev_children == 0) { + metaslab_trace_add(zal, mg, NULL, psize, d, + TRACE_VDEV_ERROR, allocator); + goto next; + } + + ASSERT(mg->mg_class == mc); + + uint64_t asize = vdev_psize_to_asize(vd, psize); + ASSERT(P2PHASE(asize, 1ULL << vd->vdev_ashift) == 0); + + /* + * If we don't need to try hard, then require that the + * block be on a different metaslab from any other DVAs + * in this BP (unique=true). If we are trying hard, then + * allow any metaslab to be used (unique=false). + */ + uint64_t offset = metaslab_group_alloc(mg, zal, asize, txg, + !try_hard, dva, d, allocator, try_hard); + + if (offset != -1ULL) { + /* + * If we've just selected this metaslab group, + * figure out whether the corresponding vdev is + * over- or under-used relative to the pool, + * and set an allocation bias to even it out. + * + * Bias is also used to compensate for unequally + * sized vdevs so that space is allocated fairly. 
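+ * (The ratio computed below reduces to this vdev's free space divided + * by the average free space per top-level vdev in the class, expressed + * as a percentage; at exactly 100% the resulting mg_bias is zero.)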
+ */ + if (mc->mc_aliquot == 0 && metaslab_bias_enabled) { + vdev_stat_t *vs = &vd->vdev_stat; + int64_t vs_free = vs->vs_space - vs->vs_alloc; + int64_t mc_free = mc->mc_space - mc->mc_alloc; + int64_t ratio; + + /* + * Calculate how much more or less we should + * try to allocate from this device during + * this iteration around the rotor. + * + * This basically introduces a zero-centered + * bias towards the devices with the most + * free space, while compensating for vdev + * size differences. + * + * Examples: + * vdev V1 = 16M/128M + * vdev V2 = 16M/128M + * ratio(V1) = 100% ratio(V2) = 100% + * + * vdev V1 = 16M/128M + * vdev V2 = 64M/128M + * ratio(V1) = 127% ratio(V2) = 72% + * + * vdev V1 = 16M/128M + * vdev V2 = 64M/512M + * ratio(V1) = 40% ratio(V2) = 160% + */ + ratio = (vs_free * mc->mc_alloc_groups * 100) / + (mc_free + 1); + mg->mg_bias = ((ratio - 100) * + (int64_t)mg->mg_aliquot) / 100; + } else if (!metaslab_bias_enabled) { + mg->mg_bias = 0; + } + + if ((flags & METASLAB_FASTWRITE) || + atomic_add_64_nv(&mc->mc_aliquot, asize) >= + mg->mg_aliquot + mg->mg_bias) { + mc->mc_rotor = mg->mg_next; + mc->mc_aliquot = 0; + } + + DVA_SET_VDEV(&dva[d], vd->vdev_id); + DVA_SET_OFFSET(&dva[d], offset); + DVA_SET_GANG(&dva[d], + ((flags & METASLAB_GANG_HEADER) ? 1 : 0)); + DVA_SET_ASIZE(&dva[d], asize); + + if (flags & METASLAB_FASTWRITE) { + atomic_add_64(&vd->vdev_pending_fastwrite, + psize); + } + + return (0); + } +next: + mc->mc_rotor = mg->mg_next; + mc->mc_aliquot = 0; + } while ((mg = mg->mg_next) != rotor); + + /* + * If we haven't tried hard, do so now. + */ + if (!try_hard) { + try_hard = B_TRUE; + goto top; + } + + bzero(&dva[d], sizeof (dva_t)); + + metaslab_trace_add(zal, rotor, NULL, psize, d, TRACE_ENOSPC, allocator); + return (SET_ERROR(ENOSPC)); +} + +void +metaslab_free_concrete(vdev_t *vd, uint64_t offset, uint64_t asize, + boolean_t checkpoint) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + asize, <=, msp->ms_start + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(asize, 1ULL << vd->vdev_ashift)); + + metaslab_check_free_impl(vd, offset, asize); + + mutex_enter(&msp->ms_lock); + if (range_tree_is_empty(msp->ms_freeing) && + range_tree_is_empty(msp->ms_checkpointing)) { + vdev_dirty(vd, VDD_METASLAB, msp, spa_syncing_txg(spa)); + } + + if (checkpoint) { + ASSERT(spa_has_checkpoint(spa)); + range_tree_add(msp->ms_checkpointing, offset, asize); + } else { + range_tree_add(msp->ms_freeing, offset, asize); + } + mutex_exit(&msp->ms_lock); +} + +/* ARGSUSED */ +void +metaslab_free_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + boolean_t *checkpoint = arg; + + ASSERT3P(checkpoint, !=, NULL); + + if (vd->vdev_ops->vdev_op_remap != NULL) + vdev_indirect_mark_obsolete(vd, offset, size); + else + metaslab_free_impl(vd, offset, size, *checkpoint); +} + +static void +metaslab_free_impl(vdev_t *vd, uint64_t offset, uint64_t size, + boolean_t checkpoint) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (spa_syncing_txg(spa) > spa_freeze_txg(spa)) + return; + + if (spa->spa_vdev_removal != NULL && + spa->spa_vdev_removal->svr_vdev_id == 
vd->vdev_id && + vdev_is_concrete(vd)) { + /* + * Note: we check if the vdev is concrete because when + * we complete the removal, we first change the vdev to be + * an indirect vdev (in open context), and then (in syncing + * context) clear spa_vdev_removal. + */ + free_from_removing_vdev(vd, offset, size); + } else if (vd->vdev_ops->vdev_op_remap != NULL) { + vdev_indirect_mark_obsolete(vd, offset, size); + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &checkpoint); + } else { + metaslab_free_concrete(vd, offset, size, checkpoint); + } +} + +typedef struct remap_blkptr_cb_arg { + blkptr_t *rbca_bp; + spa_remap_cb_t rbca_cb; + vdev_t *rbca_remap_vd; + uint64_t rbca_remap_offset; + void *rbca_cb_arg; +} remap_blkptr_cb_arg_t; + +static void +remap_blkptr_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + remap_blkptr_cb_arg_t *rbca = arg; + blkptr_t *bp = rbca->rbca_bp; + + /* We can not remap split blocks. */ + if (size != DVA_GET_ASIZE(&bp->blk_dva[0])) + return; + ASSERT0(inner_offset); + + if (rbca->rbca_cb != NULL) { + /* + * At this point we know that we are not handling split + * blocks and we invoke the callback on the previous + * vdev which must be indirect. + */ + ASSERT3P(rbca->rbca_remap_vd->vdev_ops, ==, &vdev_indirect_ops); + + rbca->rbca_cb(rbca->rbca_remap_vd->vdev_id, + rbca->rbca_remap_offset, size, rbca->rbca_cb_arg); + + /* set up remap_blkptr_cb_arg for the next call */ + rbca->rbca_remap_vd = vd; + rbca->rbca_remap_offset = offset; + } + + /* + * The phys birth time is that of dva[0]. This ensures that we know + * when each dva was written, so that resilver can determine which + * blocks need to be scrubbed (i.e. those written during the time + * the vdev was offline). It also ensures that the key used in + * the ARC hash table is unique (i.e. dva[0] + phys_birth). If + * we didn't change the phys_birth, a lookup in the ARC for a + * remapped BP could find the data that was previously stored at + * this vdev + offset. + */ + vdev_t *oldvd = vdev_lookup_top(vd->vdev_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + vdev_indirect_births_t *vib = oldvd->vdev_indirect_births; + bp->blk_phys_birth = vdev_indirect_births_physbirth(vib, + DVA_GET_OFFSET(&bp->blk_dva[0]), DVA_GET_ASIZE(&bp->blk_dva[0])); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], offset); +} + +/* + * If the block pointer contains any indirect DVAs, modify them to refer to + * concrete DVAs. Note that this will sometimes not be possible, leaving + * the indirect DVA in place. This happens if the indirect DVA spans multiple + * segments in the mapping (i.e. it is a "split block"). + * + * If the BP was remapped, calls the callback on the original dva (note the + * callback can be called multiple times if the original indirect DVA refers + * to another indirect DVA, etc). + * + * Returns TRUE if the BP was remapped. + */ +boolean_t +spa_remap_blkptr(spa_t *spa, blkptr_t *bp, spa_remap_cb_t callback, void *arg) +{ + remap_blkptr_cb_arg_t rbca; + + if (!zfs_remap_blkptr_enable) + return (B_FALSE); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) + return (B_FALSE); + + /* + * Dedup BP's can not be remapped, because ddt_phys_select() depends + * on DVA[0] being the same in the BP as in the DDT (dedup table). 
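+ * (With a remapped DVA[0], a lookup of this block in the DDT would + * compare against the stale on-disk DVA and could fail to find the + * matching entry.)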
+ */ + if (BP_GET_DEDUP(bp)) + return (B_FALSE); + + /* + * Gang blocks can not be remapped, because + * zio_checksum_gang_verifier() depends on the DVA[0] that's in + * the BP used to read the gang block header (GBH) being the same + * as the DVA[0] that we allocated for the GBH. + */ + if (BP_IS_GANG(bp)) + return (B_FALSE); + + /* + * Embedded BP's have no DVA to remap. + */ + if (BP_GET_NDVAS(bp) < 1) + return (B_FALSE); + + /* + * Note: we only remap dva[0]. If we remapped other dvas, we + * would no longer know what their phys birth txg is. + */ + dva_t *dva = &bp->blk_dva[0]; + + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops->vdev_op_remap == NULL) + return (B_FALSE); + + rbca.rbca_bp = bp; + rbca.rbca_cb = callback; + rbca.rbca_remap_vd = vd; + rbca.rbca_remap_offset = offset; + rbca.rbca_cb_arg = arg; + + /* + * remap_blkptr_cb() will be called in order for each level of + * indirection, until a concrete vdev is reached or a split block is + * encountered. old_vd and old_offset are updated within the callback + * as we go from the one indirect vdev to the next one (either concrete + * or indirect again) in that order. + */ + vd->vdev_ops->vdev_op_remap(vd, offset, size, remap_blkptr_cb, &rbca); + + /* Check if the DVA wasn't remapped because it is a split block */ + if (DVA_GET_VDEV(&rbca.rbca_bp->blk_dva[0]) == vd->vdev_id) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Undo the allocation of a DVA which happened in the given transaction group. + */ +void +metaslab_unalloc_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + metaslab_t *msp; + vdev_t *vd; + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + + ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (txg > spa_freeze_txg(spa)) + return; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL || !DVA_IS_VALID(dva) || + (offset >> vd->vdev_ms_shift) >= vd->vdev_ms_count) { + zfs_panic_recover("metaslab_free_dva(): bad DVA %llu:%llu:%llu", + (u_longlong_t)vdev, (u_longlong_t)offset, + (u_longlong_t)size); + return; + } + + ASSERT(!vd->vdev_removing); + ASSERT(vdev_is_concrete(vd)); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + range_tree_remove(msp->ms_allocating[txg & TXG_MASK], + offset, size); + msp->ms_allocating_total -= size; + + VERIFY(!msp->ms_condensing); + VERIFY3U(offset, >=, msp->ms_start); + VERIFY3U(offset + size, <=, msp->ms_start + msp->ms_size); + VERIFY3U(range_tree_space(msp->ms_allocatable) + size, <=, + msp->ms_size); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + range_tree_add(msp->ms_allocatable, offset, size); + mutex_exit(&msp->ms_lock); +} + +/* + * Free the block represented by the given DVA. 
+ */ +void +metaslab_free_dva(spa_t *spa, const dva_t *dva, boolean_t checkpoint) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd = vdev_lookup_top(spa, vdev); + + ASSERT(DVA_IS_VALID(dva)); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + if (DVA_GET_GANG(dva)) { + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + } + + metaslab_free_impl(vd, offset, size, checkpoint); +} + +/* + * Reserve some allocation slots. The reservation system must be called + * before we call into the allocator. If there aren't any available slots + * then the I/O will be throttled until an I/O completes and its slots are + * freed up. The function returns true if it was successful in placing + * the reservation. + */ +boolean_t +metaslab_class_throttle_reserve(metaslab_class_t *mc, int slots, int allocator, + zio_t *zio, int flags) +{ + uint64_t available_slots = 0; + boolean_t slot_reserved = B_FALSE; + uint64_t max = mc->mc_alloc_max_slots[allocator]; + + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + + uint64_t reserved_slots = + zfs_refcount_count(&mc->mc_alloc_slots[allocator]); + if (reserved_slots < max) + available_slots = max - reserved_slots; + + if (slots <= available_slots || GANG_ALLOCATION(flags) || + flags & METASLAB_MUST_RESERVE) { + /* + * We reserve the slots individually so that we can unreserve + * them individually when an I/O completes. + */ + for (int d = 0; d < slots; d++) { + reserved_slots = + zfs_refcount_add(&mc->mc_alloc_slots[allocator], + zio); + } + zio->io_flags |= ZIO_FLAG_IO_ALLOCATING; + slot_reserved = B_TRUE; + } + + mutex_exit(&mc->mc_lock); + return (slot_reserved); +} + +void +metaslab_class_throttle_unreserve(metaslab_class_t *mc, int slots, + int allocator, zio_t *zio) +{ + ASSERT(mc->mc_alloc_throttle_enabled); + mutex_enter(&mc->mc_lock); + for (int d = 0; d < slots; d++) { + (void) zfs_refcount_remove(&mc->mc_alloc_slots[allocator], + zio); + } + mutex_exit(&mc->mc_lock); +} + +static int +metaslab_claim_concrete(vdev_t *vd, uint64_t offset, uint64_t size, + uint64_t txg) +{ + metaslab_t *msp; + spa_t *spa = vd->vdev_spa; + int error = 0; + + if (offset >> vd->vdev_ms_shift >= vd->vdev_ms_count) + return (SET_ERROR(ENXIO)); + + ASSERT3P(vd->vdev_ms, !=, NULL); + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_loaded) { + error = metaslab_activate(msp, 0, METASLAB_WEIGHT_CLAIM); + if (error == EBUSY) { + ASSERT(msp->ms_loaded); + ASSERT(msp->ms_weight & METASLAB_ACTIVE_MASK); + error = 0; + } + } + + if (error == 0 && + !range_tree_contains(msp->ms_allocatable, offset, size)) + error = SET_ERROR(ENOENT); + + if (error || txg == 0) { /* txg == 0 indicates dry run */ + mutex_exit(&msp->ms_lock); + return (error); + } + + VERIFY(!msp->ms_condensing); + VERIFY0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + VERIFY0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + VERIFY3U(range_tree_space(msp->ms_allocatable) - size, <=, + msp->ms_size); + range_tree_remove(msp->ms_allocatable, offset, size); + range_tree_clear(msp->ms_trim, offset, size); + + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ + metaslab_class_t *mc = msp->ms_group->mg_class; + multilist_sublist_t *mls = + multilist_sublist_lock_obj(mc->mc_metaslab_txg_list, msp); + if (!multilist_link_active(&msp->ms_class_txg_node)) { + msp->ms_selected_txg = txg; + multilist_sublist_insert_head(mls, msp); + } 
+ multilist_sublist_unlock(mls); + + if (range_tree_is_empty(msp->ms_allocating[txg & TXG_MASK])) + vdev_dirty(vd, VDD_METASLAB, msp, txg); + range_tree_add(msp->ms_allocating[txg & TXG_MASK], + offset, size); + msp->ms_allocating_total += size; + } + + mutex_exit(&msp->ms_lock); + + return (0); +} + +typedef struct metaslab_claim_cb_arg_t { + uint64_t mcca_txg; + int mcca_error; +} metaslab_claim_cb_arg_t; + +/* ARGSUSED */ +static void +metaslab_claim_impl_cb(uint64_t inner_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + metaslab_claim_cb_arg_t *mcca_arg = arg; + + if (mcca_arg->mcca_error == 0) { + mcca_arg->mcca_error = metaslab_claim_concrete(vd, offset, + size, mcca_arg->mcca_txg); + } +} + +int +metaslab_claim_impl(vdev_t *vd, uint64_t offset, uint64_t size, uint64_t txg) +{ + if (vd->vdev_ops->vdev_op_remap != NULL) { + metaslab_claim_cb_arg_t arg; + + /* + * Only zdb(1M) can claim on indirect vdevs. This is used + * to detect leaks of mapped space (that are not accounted + * for in the obsolete counts, spacemap, or bpobj). + */ + ASSERT(!spa_writeable(vd->vdev_spa)); + arg.mcca_error = 0; + arg.mcca_txg = txg; + + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_claim_impl_cb, &arg); + + if (arg.mcca_error == 0) { + arg.mcca_error = metaslab_claim_concrete(vd, + offset, size, txg); + } + return (arg.mcca_error); + } else { + return (metaslab_claim_concrete(vd, offset, size, txg)); + } +} + +/* + * Intent log support: upon opening the pool after a crash, notify the SPA + * of blocks that the intent log has allocated for immediate write, but + * which are still considered free by the SPA because the last transaction + * group didn't commit yet. + */ +static int +metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) +{ + uint64_t vdev = DVA_GET_VDEV(dva); + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t size = DVA_GET_ASIZE(dva); + vdev_t *vd; + + if ((vd = vdev_lookup_top(spa, vdev)) == NULL) { + return (SET_ERROR(ENXIO)); + } + + ASSERT(DVA_IS_VALID(dva)); + + if (DVA_GET_GANG(dva)) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + return (metaslab_claim_impl(vd, offset, size, txg)); +} + +int +metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, + int ndvas, uint64_t txg, blkptr_t *hintbp, int flags, + zio_alloc_list_t *zal, zio_t *zio, int allocator) +{ + dva_t *dva = bp->blk_dva; + dva_t *hintdva = (hintbp != NULL) ? hintbp->blk_dva : NULL; + int error = 0; + + ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + if (mc->mc_rotor == NULL) { /* no vdevs in this class */ + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (SET_ERROR(ENOSPC)); + } + + ASSERT(ndvas > 0 && ndvas <= spa_max_replication(spa)); + ASSERT(BP_GET_NDVAS(bp) == 0); + ASSERT(hintbp == NULL || ndvas <= BP_GET_NDVAS(hintbp)); + ASSERT3P(zal, !=, NULL); + + for (int d = 0; d < ndvas; d++) { + error = metaslab_alloc_dva(spa, mc, psize, dva, d, hintdva, + txg, flags, zal, allocator); + if (error != 0) { + for (d--; d >= 0; d--) { + metaslab_unalloc_dva(spa, &dva[d], txg); + metaslab_group_alloc_decrement(spa, + DVA_GET_VDEV(&dva[d]), zio, flags, + allocator, B_FALSE); + bzero(&dva[d], sizeof (dva_t)); + } + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (error); + } else { + /* + * Update the metaslab group's queue depth + * based on the newly allocated dva. 
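+ * The reference taken by metaslab_group_alloc_increment() below is + * dropped by metaslab_group_alloc_decrement() when the zio completes + * (or by the unwind loop above on failure); on I/O completion the + * group's allowed queue depth may also be raised via + * metaslab_group_increment_qdepth().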
+ */ + metaslab_group_alloc_increment(spa, + DVA_GET_VDEV(&dva[d]), zio, flags, allocator); + } + + } + ASSERT(error == 0); + ASSERT(BP_GET_NDVAS(bp) == ndvas); + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + BP_SET_BIRTH(bp, txg, 0); + + return (0); +} + +void +metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); + + /* + * If we have a checkpoint for the pool we need to make sure that + * the blocks that we free that are part of the checkpoint won't be + * reused until the checkpoint is discarded or we revert to it. + * + * The checkpoint flag is passed down the metaslab_free code path + * and is set whenever we want to add a block to the checkpoint's + * accounting. That is, we "checkpoint" blocks that existed at the + * time the checkpoint was created and are therefore referenced by + * the checkpointed uberblock. + * + * Note that we don't checkpoint any blocks if the current + * syncing txg <= spa_checkpoint_txg. We want these frees to sync + * normally as they will be referenced by the checkpointed uberblock. + */ + boolean_t checkpoint = B_FALSE; + if (bp->blk_birth <= spa->spa_checkpoint_txg && + spa_syncing_txg(spa) > spa->spa_checkpoint_txg) { + /* + * At this point, if the block is part of the checkpoint + * there is no way it was created in the current txg. + */ + ASSERT(!now); + ASSERT3U(spa_syncing_txg(spa), ==, txg); + checkpoint = B_TRUE; + } + + spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) { + if (now) { + metaslab_unalloc_dva(spa, &dva[d], txg); + } else { + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + metaslab_free_dva(spa, &dva[d], checkpoint); + } + } + + spa_config_exit(spa, SCL_FREE, FTAG); +} + +int +metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + int error = 0; + + ASSERT(!BP_IS_HOLE(bp)); + + if (txg != 0) { + /* + * First do a dry run to make sure all DVAs are claimable, + * so we don't have to unwind from partial failures below.
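+ * (txg == 0 is the dry-run signal: metaslab_claim_concrete() performs + * all of its checks but returns before modifying any range trees.)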
+ */ + if ((error = metaslab_claim(spa, bp, 0)) != 0) + return (error); + } + + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); + + for (int d = 0; d < ndvas; d++) { + error = metaslab_claim_dva(spa, &dva[d], txg); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_ALLOC, FTAG); + + ASSERT(error == 0 || txg == 0); + + return (error); +} + +void +metaslab_fastwrite_mark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + atomic_add_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +void +metaslab_fastwrite_unmark(spa_t *spa, const blkptr_t *bp) +{ + const dva_t *dva = bp->blk_dva; + int ndvas = BP_GET_NDVAS(bp); + uint64_t psize = BP_GET_PSIZE(bp); + int d; + vdev_t *vd; + + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(psize > 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (d = 0; d < ndvas; d++) { + if ((vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d]))) == NULL) + continue; + ASSERT3U(vd->vdev_pending_fastwrite, >=, psize); + atomic_sub_64(&vd->vdev_pending_fastwrite, psize); + } + + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +/* ARGSUSED */ +static void +metaslab_check_free_impl_cb(uint64_t inner, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + metaslab_check_free_impl(vd, offset, size); +} + +static void +metaslab_check_free_impl(vdev_t *vd, uint64_t offset, uint64_t size) +{ + metaslab_t *msp; + spa_t *spa __maybe_unused = vd->vdev_spa; + + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + if (vd->vdev_ops->vdev_op_remap != NULL) { + vd->vdev_ops->vdev_op_remap(vd, offset, size, + metaslab_check_free_impl_cb, NULL); + return; + } + + ASSERT(vdev_is_concrete(vd)); + ASSERT3U(offset >> vd->vdev_ms_shift, <, vd->vdev_ms_count); + ASSERT3U(spa_config_held(spa, SCL_ALL, RW_READER), !=, 0); + + msp = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + + mutex_enter(&msp->ms_lock); + if (msp->ms_loaded) { + range_tree_verify_not_present(msp->ms_allocatable, + offset, size); + } + + /* + * Check all segments that currently exist in the freeing pipeline. + * + * It would intuitively make sense to also check the current allocating + * tree since metaslab_unalloc_dva() exists for extents that are + * allocated and freed in the same sync pass within the same txg. + * Unfortunately there are places (e.g. the ZIL) where we allocate a + * segment but then we free part of it within the same txg + * [see zil_sync()]. Thus, we don't call range_tree_verify() in the + * current allocating tree. 
+ */ + range_tree_verify_not_present(msp->ms_freeing, offset, size); + range_tree_verify_not_present(msp->ms_checkpointing, offset, size); + range_tree_verify_not_present(msp->ms_freed, offset, size); + for (int j = 0; j < TXG_DEFER_SIZE; j++) + range_tree_verify_not_present(msp->ms_defer[j], offset, size); + range_tree_verify_not_present(msp->ms_trim, offset, size); + mutex_exit(&msp->ms_lock); +} + +void +metaslab_check_free(spa_t *spa, const blkptr_t *bp) +{ + if ((zfs_flags & ZFS_DEBUG_ZIO_FREE) == 0) + return; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + uint64_t vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + vdev_t *vd = vdev_lookup_top(spa, vdev); + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t size = DVA_GET_ASIZE(&bp->blk_dva[i]); + + if (DVA_GET_GANG(&bp->blk_dva[i])) + size = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + + ASSERT3P(vd, !=, NULL); + + metaslab_check_free_impl(vd, offset, size); + } + spa_config_exit(spa, SCL_VDEV, FTAG); +} + +static void +metaslab_group_disable_wait(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); + while (mg->mg_disabled_updating) { + cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); + } +} + +static void +metaslab_group_disabled_increment(metaslab_group_t *mg) +{ + ASSERT(MUTEX_HELD(&mg->mg_ms_disabled_lock)); + ASSERT(mg->mg_disabled_updating); + + while (mg->mg_ms_disabled >= max_disabled_ms) { + cv_wait(&mg->mg_ms_disabled_cv, &mg->mg_ms_disabled_lock); + } + mg->mg_ms_disabled++; + ASSERT3U(mg->mg_ms_disabled, <=, max_disabled_ms); +} + +/* + * Mark the metaslab as disabled to prevent any allocations on this metaslab. + * We must also track how many metaslabs are currently disabled within a + * metaslab group and limit them to prevent allocation failures from + * occurring because all metaslabs are disabled. + */ +void +metaslab_disable(metaslab_t *msp) +{ + ASSERT(!MUTEX_HELD(&msp->ms_lock)); + metaslab_group_t *mg = msp->ms_group; + + mutex_enter(&mg->mg_ms_disabled_lock); + + /* + * To keep an accurate count of how many threads have disabled + * a specific metaslab group, we only allow one thread to mark + * the metaslab group at a time. This ensures that the value of + * ms_disabled will be accurate when we decide to mark a metaslab + * group as disabled. To do this we force all other threads + * to wait until the metaslab group's mg_disabled_updating flag is no + * longer set. + */ + metaslab_group_disable_wait(mg); + mg->mg_disabled_updating = B_TRUE; + if (msp->ms_disabled == 0) { + metaslab_group_disabled_increment(mg); + } + mutex_enter(&msp->ms_lock); + msp->ms_disabled++; + mutex_exit(&msp->ms_lock); + + mg->mg_disabled_updating = B_FALSE; + cv_broadcast(&mg->mg_ms_disabled_cv); + mutex_exit(&mg->mg_ms_disabled_lock); +} + +void +metaslab_enable(metaslab_t *msp, boolean_t sync, boolean_t unload) +{ + metaslab_group_t *mg = msp->ms_group; + spa_t *spa = mg->mg_vd->vdev_spa; + + /* + * Wait for the outstanding IO to be synced to prevent newly + * allocated blocks from being overwritten. This is used by + * initialize and TRIM, which are modifying unallocated space.
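+ *
+ * Typical usage, sketched for illustration (vdev initialize and
+ * TRIM are the in-tree callers):
+ *
+ *     metaslab_disable(msp);
+ *     <write zios into unallocated space of msp>
+ *     metaslab_enable(msp, B_TRUE, B_FALSE);
+ *
+ * where sync == B_TRUE requests the txg_wait_synced() below and
+ * unload == B_FALSE keeps the metaslab loaded.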
+ */ + if (sync) + txg_wait_synced(spa_get_dsl(spa), 0); + + mutex_enter(&mg->mg_ms_disabled_lock); + mutex_enter(&msp->ms_lock); + if (--msp->ms_disabled == 0) { + mg->mg_ms_disabled--; + cv_broadcast(&mg->mg_ms_disabled_cv); + if (unload) + metaslab_unload(msp); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&mg->mg_ms_disabled_lock); +} + +static void +metaslab_update_ondisk_flush_data(metaslab_t *ms, dmu_tx_t *tx) +{ + vdev_t *vd = ms->ms_group->mg_vd; + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + metaslab_unflushed_phys_t entry = { + .msp_unflushed_txg = metaslab_unflushed_txg(ms), + }; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + uint64_t object = 0; + int err = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object); + if (err == ENOENT) { + object = dmu_object_alloc(mos, DMU_OTN_UINT64_METADATA, + SPA_OLD_MAXBLOCKSIZE, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, + &object, tx)); + } else { + VERIFY0(err); + } + + dmu_write(spa_meta_objset(spa), object, entry_offset, entry_size, + &entry, tx); +} + +void +metaslab_set_unflushed_txg(metaslab_t *ms, uint64_t txg, dmu_tx_t *tx) +{ + spa_t *spa = ms->ms_group->mg_vd->vdev_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + ms->ms_unflushed_txg = txg; + metaslab_update_ondisk_flush_data(ms, tx); +} + +uint64_t +metaslab_unflushed_txg(metaslab_t *ms) +{ + return (ms->ms_unflushed_txg); +} + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, aliquot, ULONG, ZMOD_RW, + "Allocation granularity (a.k.a. stripe size)"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_load, INT, ZMOD_RW, + "Load all metaslabs when pool is first opened"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, debug_unload, INT, ZMOD_RW, + "Prevent metaslabs from being unloaded"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, preload_enabled, INT, ZMOD_RW, + "Preload potential metaslabs during reassessment"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay, INT, ZMOD_RW, + "Delay in txgs after metaslab was last used before unloading"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, unload_delay_ms, INT, ZMOD_RW, + "Delay in milliseconds after metaslab was last used before unloading"); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, noalloc_threshold, INT, ZMOD_RW, + "Percentage of metaslab group size that should be free to make it " + "eligible for allocation"); + +ZFS_MODULE_PARAM(zfs_mg, zfs_mg_, fragmentation_threshold, INT, ZMOD_RW, + "Percentage of metaslab group size that should be considered eligible " + "for allocations unless all metaslab groups within the metaslab class " + "have also crossed this threshold"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, fragmentation_threshold, INT, + ZMOD_RW, "Fragmentation for metaslab to allow allocation"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, fragmentation_factor_enabled, INT, ZMOD_RW, + "Use the fragmentation metric to prefer less fragmented metaslabs"); +/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, lba_weighting_enabled, INT, ZMOD_RW, + "Prefer metaslabs with lower LBAs"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, bias_enabled, INT, ZMOD_RW, + "Enable metaslab group biasing"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, segment_weight_enabled, INT, + ZMOD_RW, "Enable segment-based 
metaslab selection"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, switch_threshold, INT, ZMOD_RW, + "Segment-based metaslab selection maximum buckets before switching"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, force_ganging, ULONG, ZMOD_RW, + "Blocks larger than this size are forced to be gang blocks"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_max_search, INT, ZMOD_RW, + "Max distance (bytes) to search forward before using size tree"); + +ZFS_MODULE_PARAM(zfs_metaslab, metaslab_, df_use_largest_segment, INT, ZMOD_RW, + "When looking in size tree, use largest segment instead of exact fit"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, max_size_cache_sec, ULONG, + ZMOD_RW, "How long to trust the cached max chunk size of a metaslab"); + +ZFS_MODULE_PARAM(zfs_metaslab, zfs_metaslab_, mem_limit, INT, ZMOD_RW, + "Percentage of memory that can be used to store metaslab range trees"); diff --git a/module/zfs/mmp.c b/module/zfs/mmp.c new file mode 100644 index 000000000000..4170d7e03ebd --- /dev/null +++ b/module/zfs/mmp.c @@ -0,0 +1,728 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2017 by Lawrence Livermore National Security, LLC. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Multi-Modifier Protection (MMP) attempts to prevent a user from importing + * or opening a pool on more than one host at a time. In particular, it + * prevents "zpool import -f" on a host from succeeding while the pool is + * already imported on another host. There are many other ways in which a + * device could be used by two hosts for different purposes at the same time + * resulting in pool damage. This implementation does not attempt to detect + * those cases. + * + * MMP operates by ensuring there are frequent visible changes on disk (a + * "heartbeat") at all times. And by altering the import process to check + * for these changes and failing the import when they are detected. This + * functionality is enabled by setting the 'multihost' pool property to on. + * + * Uberblocks written by the txg_sync thread always go into the first + * (N-MMP_BLOCKS_PER_LABEL) slots, the remaining slots are reserved for MMP. + * They are used to hold uberblocks which are exactly the same as the last + * synced uberblock except that the ub_timestamp and mmp_config are frequently + * updated. Like all other uberblocks, the slot is written with an embedded + * checksum, and slots with invalid checksums are ignored. This provides the + * "heartbeat", with no risk of overwriting good uberblocks that must be + * preserved, e.g. previous txgs and associated block pointers. 
+ * + * Three optional fields are added to the uberblock structure: ub_mmp_magic, + * ub_mmp_config, and ub_mmp_delay. The ub_mmp_magic value allows zfs to tell + * whether the other ub_mmp_* fields are valid. The ub_mmp_config field tells + * the importing host the settings of zfs_multihost_interval and + * zfs_multihost_fail_intervals on the host which last had (or currently has) + * the pool imported. These determine how long a host must wait to detect + * activity in the pool before concluding the pool is not in use. The + * mmp_delay field is a decaying average of the amount of time between + * completion of successive MMP writes, in nanoseconds. It indicates whether + * MMP is enabled. + * + * During import an activity test may now be performed to determine if + * the pool is in use. The activity test is typically required if the + * ZPOOL_CONFIG_HOSTID does not match the system hostid, the pool state is + * POOL_STATE_ACTIVE, and the pool is not a root pool. + * + * The activity test finds the "best" uberblock (highest txg, timestamp, and, if + * ub_mmp_magic is valid, sequence number from ub_mmp_config). It then waits + * some time, and finds the "best" uberblock again. If any of the mentioned + * fields have different values in the newly read uberblock, the pool is in use + * by another host and the import fails. In order to assure the accuracy of the + * activity test, the default values result in an activity test duration of 20x + * the mmp write interval. + * + * The duration of the "zpool import" activity test depends on the information + * available in the "best" uberblock: + * + * 1) If uberblock was written by zfs-0.8 or newer and fail_intervals > 0: + * ub_mmp_config.fail_intervals * ub_mmp_config.multihost_interval * 2 + * + * In this case, a weak guarantee is provided. Since the host which last had + * the pool imported will suspend the pool if no mmp writes land within + * fail_intervals * multihost_interval ms, the absence of writes during that + * time means either the pool is not imported, or it is imported but the pool + * is suspended and no further writes will occur. + * + * Note that resuming the suspended pool on the remote host would invalidate + * this guarantee, and so it is not allowed. + * + * The factor of 2 provides a conservative safety factor and derives from + * MMP_IMPORT_SAFETY_FACTOR. + * + * 2) If uberblock was written by zfs-0.8 or newer and fail_intervals == 0: + * (ub_mmp_config.multihost_interval + ub_mmp_delay) * + * zfs_multihost_import_intervals + * + * In this case no guarantee can be provided. However, as long as some devices + * are healthy and connected, it is likely that at least one write will land + * within (multihost_interval + mmp_delay) because multihost_interval is + * enough time for a write to be attempted to each leaf vdev, and mmp_delay + * is enough for one to land, based on past delays. Multiplying by + * zfs_multihost_import_intervals provides a conservative safety factor. + * + * 3) If uberblock was written by zfs-0.7: + * (zfs_multihost_interval + ub_mmp_delay) * zfs_multihost_import_intervals + * + * The same logic as case #2 applies, but we do not know remote tunables. + * + * We use the local value for zfs_multihost_interval because the original MMP + * did not record this value in the uberblock. + * + * ub_mmp_delay >= (zfs_multihost_interval / leaves), so if the other host + * has a much larger zfs_multihost_interval set, ub_mmp_delay will reflect + * that.
We will have waited enough time for zfs_multihost_import_intervals + * writes to be issued and all but one to land. + * + * single device pool example delays + * + * import_delay = (1 + 1) * 20 = 40s #defaults, no I/O delay + * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay + * import_delay = (10 + 10) * 20 = 400s #10s multihost_interval, + * no I/O delay + * + * 100 device pool example delays + * + * import_delay = (1 + .01) * 20 = 20.2s #defaults, no I/O delay + * import_delay = (1 + 10) * 20 = 220s #defaults, 10s I/O delay + * import_delay = (10 + .1) * 20 = 202s #10s multihost_interval, + * no I/O delay + * + * 4) Otherwise, this uberblock was written by a pre-MMP zfs: + * zfs_multihost_import_intervals * zfs_multihost_interval + * + * In this case local tunables are used. By default this product = 10s, long + * enough for a pool with any activity at all to write at least one + * uberblock. No guarantee can be provided. + * + * Additionally, the duration is then extended by a random 25% to attempt to + * detect simultaneous imports, for example when both partner hosts are rebooted + * at the same time and automatically attempt to import the pool. + */ + +/* + * Used to control the frequency of mmp writes which are performed when the + * 'multihost' pool property is on. This is one factor used to determine the + * length of the activity check during import. + * + * On average an mmp write will be issued for each leaf vdev every + * zfs_multihost_interval milliseconds. In practice, the observed period can + * vary with the I/O load and this observed value is the ub_mmp_delay which is + * stored in the uberblock. The minimum allowed value is 100 ms. + */ +ulong_t zfs_multihost_interval = MMP_DEFAULT_INTERVAL; + +/* + * Used to control the duration of the activity test on import. Smaller values + * of zfs_multihost_import_intervals will reduce the import time but increase + * the risk of failing to detect an active pool. The total activity check time + * is never allowed to drop below one second. A value of 0 is ignored and + * treated as if it was set to 1. + */ +uint_t zfs_multihost_import_intervals = MMP_DEFAULT_IMPORT_INTERVALS; + +/* + * Controls the behavior of the pool when mmp write failures or delays are + * detected. + * + * When zfs_multihost_fail_intervals = 0, mmp write failures or delays are + * ignored. The failures will still be reported to the ZED which, depending on + * its configuration, may take action such as suspending the pool or taking a + * device offline. + * + * When zfs_multihost_fail_intervals > 0, the pool will be suspended if + * zfs_multihost_fail_intervals * zfs_multihost_interval milliseconds pass + * without a successful mmp write. This guarantees the activity test will see + * mmp writes if the pool is imported. A value of 1 is ignored and treated as + * if it was set to 2, because a single leaf vdev pool will issue a write once + * per multihost_interval and thus any variation in latency would cause the + * pool to be suspended.
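+ *
+ * Worked example (editorial, using the defaults): with
+ * zfs_multihost_interval = 1000 ms and
+ * zfs_multihost_fail_intervals = 10, the pool suspends after 10
+ * seconds without a successful mmp write, and an importer that reads
+ * these values from the uberblock waits 10 * 1000 ms * 2 = 20 seconds
+ * (case 1 above) before declaring the pool unused.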
+ */ +uint_t zfs_multihost_fail_intervals = MMP_DEFAULT_FAIL_INTERVALS; + +char *mmp_tag = "mmp_write_uberblock"; +static void mmp_thread(void *arg); + +void +mmp_init(spa_t *spa) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + + mutex_init(&mmp->mmp_thread_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&mmp->mmp_thread_cv, NULL, CV_DEFAULT, NULL); + mutex_init(&mmp->mmp_io_lock, NULL, MUTEX_DEFAULT, NULL); + mmp->mmp_kstat_id = 1; + + /* + * mmp_write_done() calculates mmp_delay based on prior mmp_delay and + * the elapsed time since the last write. For the first mmp write, + * there is no "last write", so we start with fake non-zero values. + */ + mmp->mmp_last_write = gethrtime(); + mmp->mmp_delay = MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)); +} + +void +mmp_fini(spa_t *spa) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + + mutex_destroy(&mmp->mmp_thread_lock); + cv_destroy(&mmp->mmp_thread_cv); + mutex_destroy(&mmp->mmp_io_lock); +} + +static void +mmp_thread_enter(mmp_thread_t *mmp, callb_cpr_t *cpr) +{ + CALLB_CPR_INIT(cpr, &mmp->mmp_thread_lock, callb_generic_cpr, FTAG); + mutex_enter(&mmp->mmp_thread_lock); +} + +static void +mmp_thread_exit(mmp_thread_t *mmp, kthread_t **mpp, callb_cpr_t *cpr) +{ + ASSERT(*mpp != NULL); + *mpp = NULL; + cv_broadcast(&mmp->mmp_thread_cv); + CALLB_CPR_EXIT(cpr); /* drops &mmp->mmp_thread_lock */ + thread_exit(); +} + +void +mmp_thread_start(spa_t *spa) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + + if (spa_writeable(spa)) { + mutex_enter(&mmp->mmp_thread_lock); + if (!mmp->mmp_thread) { + mmp->mmp_thread = thread_create(NULL, 0, mmp_thread, + spa, 0, &p0, TS_RUN, defclsyspri); + zfs_dbgmsg("MMP thread started pool '%s' " + "gethrtime %llu", spa_name(spa), gethrtime()); + } + mutex_exit(&mmp->mmp_thread_lock); + } +} + +void +mmp_thread_stop(spa_t *spa) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + + mutex_enter(&mmp->mmp_thread_lock); + mmp->mmp_thread_exiting = 1; + cv_broadcast(&mmp->mmp_thread_cv); + + while (mmp->mmp_thread) { + cv_wait(&mmp->mmp_thread_cv, &mmp->mmp_thread_lock); + } + mutex_exit(&mmp->mmp_thread_lock); + zfs_dbgmsg("MMP thread stopped pool '%s' gethrtime %llu", + spa_name(spa), gethrtime()); + + ASSERT(mmp->mmp_thread == NULL); + mmp->mmp_thread_exiting = 0; +} + +typedef enum mmp_vdev_state_flag { + MMP_FAIL_NOT_WRITABLE = (1 << 0), + MMP_FAIL_WRITE_PENDING = (1 << 1), +} mmp_vdev_state_flag_t; + +/* + * Find a leaf vdev to write an MMP block to. It must not have an outstanding + * mmp write (if so a new write will also likely block). If there is no usable + * leaf, a nonzero error value is returned. The error value returned is a bit + * field. + * + * MMP_FAIL_WRITE_PENDING One or more leaf vdevs are writeable, but have an + * outstanding MMP write. + * MMP_FAIL_NOT_WRITABLE One or more leaf vdevs are not writeable. 
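+ *
+ * For example (editorial note), a return value of
+ * (MMP_FAIL_NOT_WRITABLE | MMP_FAIL_WRITE_PENDING) means the walk saw
+ * at least one non-writeable leaf and at least one writeable leaf
+ * that still has an MMP write in flight, and found no leaf usable
+ * right now.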
+ */ + +static int +mmp_next_leaf(spa_t *spa) +{ + vdev_t *leaf; + vdev_t *starting_leaf; + int fail_mask = 0; + + ASSERT(MUTEX_HELD(&spa->spa_mmp.mmp_io_lock)); + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER)); + ASSERT(list_link_active(&spa->spa_leaf_list.list_head) == B_TRUE); + ASSERT(!list_is_empty(&spa->spa_leaf_list)); + + if (spa->spa_mmp.mmp_leaf_last_gen != spa->spa_leaf_list_gen) { + spa->spa_mmp.mmp_last_leaf = list_head(&spa->spa_leaf_list); + spa->spa_mmp.mmp_leaf_last_gen = spa->spa_leaf_list_gen; + } + + leaf = spa->spa_mmp.mmp_last_leaf; + if (leaf == NULL) + leaf = list_head(&spa->spa_leaf_list); + starting_leaf = leaf; + + do { + leaf = list_next(&spa->spa_leaf_list, leaf); + if (leaf == NULL) + leaf = list_head(&spa->spa_leaf_list); + + if (!vdev_writeable(leaf)) { + fail_mask |= MMP_FAIL_NOT_WRITABLE; + } else if (leaf->vdev_mmp_pending != 0) { + fail_mask |= MMP_FAIL_WRITE_PENDING; + } else { + spa->spa_mmp.mmp_last_leaf = leaf; + return (0); + } + } while (leaf != starting_leaf); + + ASSERT(fail_mask); + + return (fail_mask); +} + +/* + * MMP writes are issued on a fixed schedule, but may complete at variable, + * much longer, intervals. The mmp_delay captures long periods between + * successful writes for any reason, including disk latency, scheduling delays, + * etc. + * + * The mmp_delay is usually calculated as a decaying average, but if the latest + * delay is higher we do not average it, so that we do not hide sudden spikes + * which the importing host must wait for. + * + * If writes are occurring frequently, such as due to a high rate of txg syncs, + * the mmp_delay could become very small. Since those short delays depend on + * activity we cannot count on, we never allow mmp_delay to get lower than rate + * expected if only mmp_thread writes occur. + * + * If an mmp write was skipped or fails, and we have already waited longer than + * mmp_delay, we need to update it so the next write reflects the longer delay. + * + * Do not set mmp_delay if the multihost property is not on, so as not to + * trigger an activity check on import. + */ +static void +mmp_delay_update(spa_t *spa, boolean_t write_completed) +{ + mmp_thread_t *mts = &spa->spa_mmp; + hrtime_t delay = gethrtime() - mts->mmp_last_write; + + ASSERT(MUTEX_HELD(&mts->mmp_io_lock)); + + if (spa_multihost(spa) == B_FALSE) { + mts->mmp_delay = 0; + return; + } + + if (delay > mts->mmp_delay) + mts->mmp_delay = delay; + + if (write_completed == B_FALSE) + return; + + mts->mmp_last_write = gethrtime(); + + /* + * strictly less than, in case delay was changed above. 
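+ *
+ * (Editorial example of the decay below: with mmp_delay = 200 ms and
+ * a new completion delay of 100 ms, the average becomes
+ * (100 + 200 * 127) / 128, roughly 199.2 ms, so one fast write barely
+ * moves it, while a slower-than-average write was already latched in
+ * full above.)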
+ */ + if (delay < mts->mmp_delay) { + hrtime_t min_delay = + MSEC2NSEC(MMP_INTERVAL_OK(zfs_multihost_interval)) / + MAX(1, vdev_count_leaves(spa)); + mts->mmp_delay = MAX(((delay + mts->mmp_delay * 127) / 128), + min_delay); + } +} + +static void +mmp_write_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; + mmp_thread_t *mts = zio->io_private; + + mutex_enter(&mts->mmp_io_lock); + uint64_t mmp_kstat_id = vd->vdev_mmp_kstat_id; + hrtime_t mmp_write_duration = gethrtime() - vd->vdev_mmp_pending; + + mmp_delay_update(spa, (zio->io_error == 0)); + + vd->vdev_mmp_pending = 0; + vd->vdev_mmp_kstat_id = 0; + + mutex_exit(&mts->mmp_io_lock); + spa_config_exit(spa, SCL_STATE, mmp_tag); + + spa_mmp_history_set(spa, mmp_kstat_id, zio->io_error, + mmp_write_duration); + + abd_free(zio->io_abd); +} + +/* + * When the uberblock on-disk is updated by a spa_sync, + * creating a new "best" uberblock, update the one stored + * in the mmp thread state, used for mmp writes. + */ +void +mmp_update_uberblock(spa_t *spa, uberblock_t *ub) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + + mutex_enter(&mmp->mmp_io_lock); + mmp->mmp_ub = *ub; + mmp->mmp_seq = 1; + mmp->mmp_ub.ub_timestamp = gethrestime_sec(); + mmp_delay_update(spa, B_TRUE); + mutex_exit(&mmp->mmp_io_lock); +} + +/* + * Choose a random vdev, label, and MMP block, and write over it + * with a copy of the last-synced uberblock, whose timestamp + * has been updated to reflect that the pool is in use. + */ +static void +mmp_write_uberblock(spa_t *spa) +{ + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + mmp_thread_t *mmp = &spa->spa_mmp; + uberblock_t *ub; + vdev_t *vd = NULL; + int label, error; + uint64_t offset; + + hrtime_t lock_acquire_time = gethrtime(); + spa_config_enter(spa, SCL_STATE, mmp_tag, RW_READER); + lock_acquire_time = gethrtime() - lock_acquire_time; + if (lock_acquire_time > (MSEC2NSEC(MMP_MIN_INTERVAL) / 10)) + zfs_dbgmsg("MMP SCL_STATE acquisition pool '%s' took %llu ns " + "gethrtime %llu", spa_name(spa), lock_acquire_time, + gethrtime()); + + mutex_enter(&mmp->mmp_io_lock); + + error = mmp_next_leaf(spa); + + /* + * spa_mmp_history has two types of entries: + * Issued MMP write: records time issued, error status, etc. + * Skipped MMP write: an MMP write could not be issued because no + * suitable leaf vdev was available. See comment above struct + * spa_mmp_history for details. + */ + + if (error) { + mmp_delay_update(spa, B_FALSE); + if (mmp->mmp_skip_error == error) { + spa_mmp_history_set_skip(spa, mmp->mmp_kstat_id - 1); + } else { + mmp->mmp_skip_error = error; + spa_mmp_history_add(spa, mmp->mmp_ub.ub_txg, + gethrestime_sec(), mmp->mmp_delay, NULL, 0, + mmp->mmp_kstat_id++, error); + zfs_dbgmsg("MMP error choosing leaf pool '%s' " + "gethrtime %llu fail_mask %#x", spa_name(spa), + gethrtime(), error); + } + mutex_exit(&mmp->mmp_io_lock); + spa_config_exit(spa, SCL_STATE, mmp_tag); + return; + } + + vd = spa->spa_mmp.mmp_last_leaf; + if (mmp->mmp_skip_error != 0) { + mmp->mmp_skip_error = 0; + zfs_dbgmsg("MMP write after skipping due to unavailable " + "leaves, pool '%s' gethrtime %llu leaf %#llu", + spa_name(spa), gethrtime(), vd->vdev_guid); + } + + if (mmp->mmp_zio_root == NULL) + mmp->mmp_zio_root = zio_root(spa, NULL, NULL, + flags | ZIO_FLAG_GODFATHER); + + if (mmp->mmp_ub.ub_timestamp != gethrestime_sec()) { + /* + * Want to reset mmp_seq when timestamp advances because after + * an mmp_seq wrap new values will not be chosen by + * uberblock_compare() as the "best". 
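+ *
+ * (Editorial note: uberblock_compare() effectively orders MMP
+ * candidates by (txg, timestamp, sequence). Once the timestamp
+ * advances, a sequence restarted at 1 still compares newer; only
+ * within the same second could a wrapped sequence look older, which
+ * this reset avoids.)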
+ */ + mmp->mmp_ub.ub_timestamp = gethrestime_sec(); + mmp->mmp_seq = 1; + } + + ub = &mmp->mmp_ub; + ub->ub_mmp_magic = MMP_MAGIC; + ub->ub_mmp_delay = mmp->mmp_delay; + ub->ub_mmp_config = MMP_SEQ_SET(mmp->mmp_seq) | + MMP_INTERVAL_SET(MMP_INTERVAL_OK(zfs_multihost_interval)) | + MMP_FAIL_INT_SET(MMP_FAIL_INTVS_OK( + zfs_multihost_fail_intervals)); + vd->vdev_mmp_pending = gethrtime(); + vd->vdev_mmp_kstat_id = mmp->mmp_kstat_id; + + zio_t *zio = zio_null(mmp->mmp_zio_root, spa, NULL, NULL, NULL, flags); + abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + + mmp->mmp_seq++; + mmp->mmp_kstat_id++; + mutex_exit(&mmp->mmp_io_lock); + + offset = VDEV_UBERBLOCK_OFFSET(vd, VDEV_UBERBLOCK_COUNT(vd) - + MMP_BLOCKS_PER_LABEL + spa_get_random(MMP_BLOCKS_PER_LABEL)); + + label = spa_get_random(VDEV_LABELS); + vdev_label_write(zio, vd, label, ub_abd, offset, + VDEV_UBERBLOCK_SIZE(vd), mmp_write_done, mmp, + flags | ZIO_FLAG_DONT_PROPAGATE); + + (void) spa_mmp_history_add(spa, ub->ub_txg, ub->ub_timestamp, + ub->ub_mmp_delay, vd, label, vd->vdev_mmp_kstat_id, 0); + + zio_nowait(zio); +} + +static void +mmp_thread(void *arg) +{ + spa_t *spa = (spa_t *)arg; + mmp_thread_t *mmp = &spa->spa_mmp; + boolean_t suspended = spa_suspended(spa); + boolean_t multihost = spa_multihost(spa); + uint64_t mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( + zfs_multihost_interval)); + uint32_t mmp_fail_intervals = MMP_FAIL_INTVS_OK( + zfs_multihost_fail_intervals); + hrtime_t mmp_fail_ns = mmp_fail_intervals * mmp_interval; + boolean_t last_spa_suspended = suspended; + boolean_t last_spa_multihost = multihost; + uint64_t last_mmp_interval = mmp_interval; + uint32_t last_mmp_fail_intervals = mmp_fail_intervals; + hrtime_t last_mmp_fail_ns = mmp_fail_ns; + callb_cpr_t cpr; + int skip_wait = 0; + + mmp_thread_enter(mmp, &cpr); + + while (!mmp->mmp_thread_exiting) { + hrtime_t next_time = gethrtime() + + MSEC2NSEC(MMP_DEFAULT_INTERVAL); + int leaves = MAX(vdev_count_leaves(spa), 1); + + /* Detect changes in tunables or state */ + + last_spa_suspended = suspended; + last_spa_multihost = multihost; + suspended = spa_suspended(spa); + multihost = spa_multihost(spa); + + last_mmp_interval = mmp_interval; + last_mmp_fail_intervals = mmp_fail_intervals; + last_mmp_fail_ns = mmp_fail_ns; + mmp_interval = MSEC2NSEC(MMP_INTERVAL_OK( + zfs_multihost_interval)); + mmp_fail_intervals = MMP_FAIL_INTVS_OK( + zfs_multihost_fail_intervals); + + /* Smooth so pool is not suspended when reducing tunables */ + if (mmp_fail_intervals * mmp_interval < mmp_fail_ns) { + mmp_fail_ns = (mmp_fail_ns * 31 + + mmp_fail_intervals * mmp_interval) / 32; + } else { + mmp_fail_ns = mmp_fail_intervals * + mmp_interval; + } + + if (mmp_interval != last_mmp_interval || + mmp_fail_intervals != last_mmp_fail_intervals) { + /* + * We want other hosts to see new tunables as quickly as + * possible. Write out at higher frequency than usual. 
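+ *
+ * (Editorial example: with 4 leaves, skip_wait += 4 makes the next
+ * four iterations sleep only MSEC2NSEC(MMP_MIN_INTERVAL) / leaves,
+ * per the skip_wait handling further down, instead of
+ * mmp_interval / leaves.)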
+ */ + skip_wait += leaves; + } + + if (multihost) + next_time = gethrtime() + mmp_interval / leaves; + + if (mmp_fail_ns != last_mmp_fail_ns) { + zfs_dbgmsg("MMP interval change pool '%s' " + "gethrtime %llu last_mmp_interval %llu " + "mmp_interval %llu last_mmp_fail_intervals %u " + "mmp_fail_intervals %u mmp_fail_ns %llu " + "skip_wait %d leaves %d next_time %llu", + spa_name(spa), gethrtime(), last_mmp_interval, + mmp_interval, last_mmp_fail_intervals, + mmp_fail_intervals, mmp_fail_ns, skip_wait, leaves, + next_time); + } + + /* + * MMP off => on, or suspended => !suspended: + * No writes occurred recently. Update mmp_last_write to give + * us some time to try. + */ + if ((!last_spa_multihost && multihost) || + (last_spa_suspended && !suspended)) { + zfs_dbgmsg("MMP state change pool '%s': gethrtime %llu " + "last_spa_multihost %u multihost %u " + "last_spa_suspended %u suspended %u", + spa_name(spa), gethrtime(), last_spa_multihost, + multihost, last_spa_suspended, suspended); + mutex_enter(&mmp->mmp_io_lock); + mmp->mmp_last_write = gethrtime(); + mmp->mmp_delay = mmp_interval; + mutex_exit(&mmp->mmp_io_lock); + } + + /* + * MMP on => off: + * mmp_delay == 0 tells importing node to skip activity check. + */ + if (last_spa_multihost && !multihost) { + mutex_enter(&mmp->mmp_io_lock); + mmp->mmp_delay = 0; + mutex_exit(&mmp->mmp_io_lock); + } + + /* + * Suspend the pool if no MMP write has succeeded in over + * mmp_interval * mmp_fail_intervals nanoseconds. + */ + if (multihost && !suspended && mmp_fail_intervals && + (gethrtime() - mmp->mmp_last_write) > mmp_fail_ns) { + zfs_dbgmsg("MMP suspending pool '%s': gethrtime %llu " + "mmp_last_write %llu mmp_interval %llu " + "mmp_fail_intervals %llu mmp_fail_ns %llu", + spa_name(spa), (u_longlong_t)gethrtime(), + (u_longlong_t)mmp->mmp_last_write, + (u_longlong_t)mmp_interval, + (u_longlong_t)mmp_fail_intervals, + (u_longlong_t)mmp_fail_ns); + cmn_err(CE_WARN, "MMP writes to pool '%s' have not " + "succeeded in over %llu ms; suspending pool. " + "Hrtime %llu", + spa_name(spa), + NSEC2MSEC(gethrtime() - mmp->mmp_last_write), + gethrtime()); + zio_suspend(spa, NULL, ZIO_SUSPEND_MMP); + } + + if (multihost && !suspended) + mmp_write_uberblock(spa); + + if (skip_wait > 0) { + next_time = gethrtime() + MSEC2NSEC(MMP_MIN_INTERVAL) / + leaves; + skip_wait--; + } + + CALLB_CPR_SAFE_BEGIN(&cpr); + (void) cv_timedwait_sig_hires(&mmp->mmp_thread_cv, + &mmp->mmp_thread_lock, next_time, USEC2NSEC(100), + CALLOUT_FLAG_ABSOLUTE); + CALLB_CPR_SAFE_END(&cpr, &mmp->mmp_thread_lock); + } + + /* Outstanding writes are allowed to complete. */ + zio_wait(mmp->mmp_zio_root); + + mmp->mmp_zio_root = NULL; + mmp_thread_exit(mmp, &mmp->mmp_thread, &cpr); +} + +/* + * Signal the MMP thread to wake it when it is sleeping on + * its cv. Used when some module parameter has changed and + * we want the thread to know about it. + * Only signal if the pool is active and the mmp thread is + * running, otherwise there is no thread to wake.
+ */ +static void +mmp_signal_thread(spa_t *spa) +{ + mmp_thread_t *mmp = &spa->spa_mmp; + + mutex_enter(&mmp->mmp_thread_lock); + if (mmp->mmp_thread) + cv_broadcast(&mmp->mmp_thread_cv); + mutex_exit(&mmp->mmp_thread_lock); +} + +void +mmp_signal_all_threads(void) +{ + spa_t *spa = NULL; + + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa))) { + if (spa->spa_state == POOL_STATE_ACTIVE) + mmp_signal_thread(spa); + } + mutex_exit(&spa_namespace_lock); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM_CALL(zfs_multihost, zfs_multihost_, interval, + param_set_multihost_interval, param_get_ulong, ZMOD_RW, + "Milliseconds between mmp writes to each leaf"); +/* END CSTYLED */ + +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, fail_intervals, UINT, ZMOD_RW, + "Max allowed period without a successful mmp write"); + +ZFS_MODULE_PARAM(zfs_multihost, zfs_multihost_, import_intervals, UINT, ZMOD_RW, + "Number of zfs_multihost_interval periods to wait for activity"); diff --git a/module/zfs/multilist.c b/module/zfs/multilist.c new file mode 100644 index 000000000000..a3adfd317af6 --- /dev/null +++ b/module/zfs/multilist.c @@ -0,0 +1,431 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#include +#include +#include + +/* needed for spa_get_random() */ +#include + +/* + * This overrides the number of sublists in each multilist_t, which defaults + * to the number of CPUs in the system (see multilist_create()). + */ +int zfs_multilist_num_sublists = 0; + +/* + * Given the object contained on the list, return a pointer to the + * object's multilist_node_t structure it contains. + */ +#ifdef ZFS_DEBUG +static multilist_node_t * +multilist_d2l(multilist_t *ml, void *obj) +{ + return ((multilist_node_t *)((char *)obj + ml->ml_offset)); +} +#endif + +/* + * Initialize a new multilist using the parameters specified. + * + * - 'size' denotes the size of the structure containing the + * multilist_node_t. + * - 'offset' denotes the byte offset of the multilist_node_t within + * the structure that contains it. + * - 'num' specifies the number of internal sublists to create. + * - 'index_func' is used to determine which sublist to insert into + * when the multilist_insert() function is called; as well as which + * sublist to remove from when multilist_remove() is called. The + * requirements this function must meet are the following: + * + * - It must always return the same value when called on the same + * object (to ensure the object is removed from the list it was + * inserted into). + * + * - It must return a value in the range [0, number of sublists). + * The multilist_get_num_sublists() function may be used to + * determine the number of sublists in the multilist. + * + * Also, in order to reduce internal contention between the sublists + * during insertion and removal, this function should choose evenly + * between all available sublists when inserting. This isn't a hard + * requirement, but a general rule of thumb in order to garner the + * best multi-threaded performance out of the data structure.
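+ *
+ * Editorial sketch of a conforming index_func, for a hypothetical
+ * object type (not part of the original import):
+ *
+ *     typedef struct obj {
+ *         multilist_node_t o_node;
+ *         uint64_t o_id;
+ *     } obj_t;
+ *
+ *     static unsigned int
+ *     obj_index_func(multilist_t *ml, void *arg)
+ *     {
+ *         obj_t *o = arg;
+ *         return (o->o_id % multilist_get_num_sublists(ml));
+ *     }
+ *
+ * The result is stable per object and, for well-distributed ids,
+ * spreads objects evenly across sublists.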
+ */ +static multilist_t * +multilist_create_impl(size_t size, size_t offset, + unsigned int num, multilist_sublist_index_func_t *index_func) +{ + ASSERT3U(size, >, 0); + ASSERT3U(size, >=, offset + sizeof (multilist_node_t)); + ASSERT3U(num, >, 0); + ASSERT3P(index_func, !=, NULL); + + multilist_t *ml = kmem_alloc(sizeof (*ml), KM_SLEEP); + ml->ml_offset = offset; + ml->ml_num_sublists = num; + ml->ml_index_func = index_func; + + ml->ml_sublists = kmem_zalloc(sizeof (multilist_sublist_t) * + ml->ml_num_sublists, KM_SLEEP); + + ASSERT3P(ml->ml_sublists, !=, NULL); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + mutex_init(&mls->mls_lock, NULL, MUTEX_NOLOCKDEP, NULL); + list_create(&mls->mls_list, size, offset); + } + return (ml); +} + +/* + * Allocate a new multilist, using the default number of sublists + * (the number of CPUs, or at least 4, or the tunable + * zfs_multilist_num_sublists). + */ +multilist_t * +multilist_create(size_t size, size_t offset, + multilist_sublist_index_func_t *index_func) +{ + int num_sublists; + + if (zfs_multilist_num_sublists > 0) { + num_sublists = zfs_multilist_num_sublists; + } else { + num_sublists = MAX(boot_ncpus, 4); + } + + return (multilist_create_impl(size, offset, num_sublists, index_func)); +} + +/* + * Destroy the given multilist object, and free up any memory it holds. + */ +void +multilist_destroy(multilist_t *ml) +{ + ASSERT(multilist_is_empty(ml)); + + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + + ASSERT(list_is_empty(&mls->mls_list)); + + list_destroy(&mls->mls_list); + mutex_destroy(&mls->mls_lock); + } + + ASSERT3P(ml->ml_sublists, !=, NULL); + kmem_free(ml->ml_sublists, + sizeof (multilist_sublist_t) * ml->ml_num_sublists); + + ml->ml_num_sublists = 0; + ml->ml_offset = 0; + kmem_free(ml, sizeof (multilist_t)); +} + +/* + * Insert the given object into the multilist. + * + * This function will insert the object specified into the sublist + * determined using the function given at multilist creation time. + * + * The sublist locks are automatically acquired if not already held, to + * ensure consistency when inserting and removing from multiple threads. + */ +void +multilist_insert(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__insert, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + + /* + * Note: Callers may already hold the sublist lock by calling + * multilist_sublist_lock(). Here we rely on MUTEX_HELD() + * returning TRUE if and only if the current thread holds the + * lock. While it's a little ugly to make the lock recursive in + * this way, it works and allows the calling code to be much + * simpler -- otherwise it would have to pass around a flag + * indicating that it already has the lock. + */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(!multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_insert_head(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Remove the given object from the multilist. + * + * This function will remove the object specified from the sublist + * determined using the function given at multilist creation time. 
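+ *
+ * (Editorial usage sketch: because insert and remove both check
+ * MUTEX_HELD() first, a caller already holding the sublist lock may
+ * do
+ *
+ *     mls = multilist_sublist_lock_obj(ml, obj);
+ *     multilist_remove(ml, obj);
+ *     multilist_insert(ml, obj);
+ *     multilist_sublist_unlock(mls);
+ *
+ * provided the object's index_func value does not change in between,
+ * so both calls resolve to the locked sublist.)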
+ * + * The necessary sublist locks are automatically acquired to ensure + * consistency when inserting and removing from multiple threads. + */ +void +multilist_remove(multilist_t *ml, void *obj) +{ + unsigned int sublist_idx = ml->ml_index_func(ml, obj); + multilist_sublist_t *mls; + boolean_t need_lock; + + DTRACE_PROBE3(multilist__remove, multilist_t *, ml, + unsigned int, sublist_idx, void *, obj); + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + + mls = &ml->ml_sublists[sublist_idx]; + /* See comment in multilist_insert(). */ + need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + ASSERT(multilist_link_active(multilist_d2l(ml, obj))); + + multilist_sublist_remove(mls, obj); + + if (need_lock) + mutex_exit(&mls->mls_lock); +} + +/* + * Check to see if this multilist object is empty. + * + * This will return TRUE if it finds all of the sublists of this + * multilist to be empty, and FALSE otherwise. Each sublist lock will be + * automatically acquired as necessary. + * + * If concurrent insertions and removals are occurring, the semantics + * of this function become a little fuzzy. Instead of locking all + * sublists for the entire call time of the function, each sublist is + * only locked as it is individually checked for emptiness. Thus, it's + * possible for this function to return TRUE with non-empty sublists at + * the time the function returns. This would be due to another thread + * inserting into a given sublist, after that specific sublist was checked + * and deemed empty, but before all sublists have been checked. + */ +int +multilist_is_empty(multilist_t *ml) +{ + for (int i = 0; i < ml->ml_num_sublists; i++) { + multilist_sublist_t *mls = &ml->ml_sublists[i]; + /* See comment in multilist_insert(). */ + boolean_t need_lock = !MUTEX_HELD(&mls->mls_lock); + + if (need_lock) + mutex_enter(&mls->mls_lock); + + if (!list_is_empty(&mls->mls_list)) { + if (need_lock) + mutex_exit(&mls->mls_lock); + + return (FALSE); + } + + if (need_lock) + mutex_exit(&mls->mls_lock); + } + + return (TRUE); +} + +/* Return the number of sublists composing this multilist */ +unsigned int +multilist_get_num_sublists(multilist_t *ml) +{ + return (ml->ml_num_sublists); +} + +/* Return a randomly selected, valid sublist index for this multilist */ +unsigned int +multilist_get_random_index(multilist_t *ml) +{ + return (spa_get_random(ml->ml_num_sublists)); +} + +/* Lock and return the sublist specified at the given index */ +multilist_sublist_t * +multilist_sublist_lock(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + mutex_enter(&mls->mls_lock); + + return (mls); +} + +/* Lock and return the sublist that would be used to store the specified obj */ +multilist_sublist_t * +multilist_sublist_lock_obj(multilist_t *ml, void *obj) +{ + return (multilist_sublist_lock(ml, ml->ml_index_func(ml, obj))); +} + +void +multilist_sublist_unlock(multilist_sublist_t *mls) +{ + mutex_exit(&mls->mls_lock); +} + +/* + * We're allowing any object to be inserted into this specific sublist, + * but this can lead to trouble if multilist_remove() is called to + * remove this object. Specifically, if calling ml_index_func on this + * object returns an index for a sublist different from what is passed as + * a parameter here, any call to multilist_remove() with this newly + * inserted object is undefined!
(the call to multilist_remove() will + * remove the object from a list that it isn't contained in) + */ +void +multilist_sublist_insert_head(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_head(&mls->mls_list, obj); +} + +/* please see comment above multilist_sublist_insert_head */ +void +multilist_sublist_insert_tail(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_insert_tail(&mls->mls_list, obj); +} + +/* + * Move the object one element forward in the list. + * + * This function will move the given object forward in the list (towards + * the head) by one object. So, in essence, it will swap its position in + * the list with its "prev" pointer. If the given object is already at the + * head of the list, it cannot be moved forward any more than it already + * is, so no action is taken. + * + * NOTE: This function **must not** remove any object from the list other + * than the object given as the parameter. This is relied upon in + * arc_evict_state_impl(). + */ +void +multilist_sublist_move_forward(multilist_sublist_t *mls, void *obj) +{ + void *prev = list_prev(&mls->mls_list, obj); + + ASSERT(MUTEX_HELD(&mls->mls_lock)); + ASSERT(!list_is_empty(&mls->mls_list)); + + /* 'obj' must be at the head of the list, nothing to do */ + if (prev == NULL) + return; + + list_remove(&mls->mls_list, obj); + list_insert_before(&mls->mls_list, prev, obj); +} + +void +multilist_sublist_remove(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + list_remove(&mls->mls_list, obj); +} + +int +multilist_sublist_is_empty(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_is_empty(&mls->mls_list)); +} + +int +multilist_sublist_is_empty_idx(multilist_t *ml, unsigned int sublist_idx) +{ + multilist_sublist_t *mls; + int empty; + + ASSERT3U(sublist_idx, <, ml->ml_num_sublists); + mls = &ml->ml_sublists[sublist_idx]; + ASSERT(!MUTEX_HELD(&mls->mls_lock)); + mutex_enter(&mls->mls_lock); + empty = list_is_empty(&mls->mls_list); + mutex_exit(&mls->mls_lock); + return (empty); +} + +void * +multilist_sublist_head(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_head(&mls->mls_list)); +} + +void * +multilist_sublist_tail(multilist_sublist_t *mls) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_tail(&mls->mls_list)); +} + +void * +multilist_sublist_next(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_next(&mls->mls_list, obj)); +} + +void * +multilist_sublist_prev(multilist_sublist_t *mls, void *obj) +{ + ASSERT(MUTEX_HELD(&mls->mls_lock)); + return (list_prev(&mls->mls_list, obj)); +} + +void +multilist_link_init(multilist_node_t *link) +{ + list_link_init(link); +} + +int +multilist_link_active(multilist_node_t *link) +{ + return (list_link_active(link)); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, multilist_num_sublists, INT, ZMOD_RW, + "Number of sublists used in each multilist"); +/* END CSTYLED */ diff --git a/module/zfs/objlist.c b/module/zfs/objlist.c new file mode 100644 index 000000000000..c80bab2a77bd --- /dev/null +++ b/module/zfs/objlist.c @@ -0,0 +1,84 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. 
+ * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include + +objlist_t * +objlist_create(void) +{ + objlist_t *list = kmem_alloc(sizeof (*list), KM_SLEEP); + list_create(&list->ol_list, sizeof (objlist_node_t), + offsetof(objlist_node_t, on_node)); + list->ol_last_lookup = 0; + return (list); +} + +void +objlist_destroy(objlist_t *list) +{ + for (objlist_node_t *n = list_remove_head(&list->ol_list); + n != NULL; n = list_remove_head(&list->ol_list)) { + kmem_free(n, sizeof (*n)); + } + list_destroy(&list->ol_list); + kmem_free(list, sizeof (*list)); +} + +/* + * This function looks through the objlist to see if the specified object number + * is contained in the objlist. In the process, it will remove all object + * numbers in the list that are smaller than the specified object number. Thus, + * any lookup of an object number smaller than a previously looked up object + * number will always return false; therefore, all lookups should be done in + * ascending order. + */ +boolean_t +objlist_exists(objlist_t *list, uint64_t object) +{ + objlist_node_t *node = list_head(&list->ol_list); + ASSERT3U(object, >=, list->ol_last_lookup); + list->ol_last_lookup = object; + while (node != NULL && node->on_object < object) { + VERIFY3P(node, ==, list_remove_head(&list->ol_list)); + kmem_free(node, sizeof (*node)); + node = list_head(&list->ol_list); + } + return (node != NULL && node->on_object == object); +} + +/* + * The objlist is a list of object numbers stored in ascending order. However, + * the insertion of new object numbers does not seek out the correct location to + * store a new object number; instead, it appends it to the list for simplicity. + * Thus, any users must take care to only insert new object numbers in ascending + * order. + */ +void +objlist_insert(objlist_t *list, uint64_t object) +{ + objlist_node_t *node = kmem_zalloc(sizeof (*node), KM_SLEEP); + node->on_object = object; +#ifdef ZFS_DEBUG + objlist_node_t *last_object = list_tail(&list->ol_list); + uint64_t last_objnum = (last_object != NULL ? last_object->on_object : + 0); + ASSERT3U(node->on_object, >, last_objnum); +#endif + list_insert_tail(&list->ol_list, node); +} diff --git a/module/zfs/pathname.c b/module/zfs/pathname.c new file mode 100644 index 000000000000..4766762f37d1 --- /dev/null +++ b/module/zfs/pathname.c @@ -0,0 +1,96 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +/* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ +/* All Rights Reserved */ + +/* + * University Copyright- Copyright (c) 1982, 1986, 1988 + * The Regents of the University of California + * All Rights Reserved + * + * University Acknowledgment- Portions of this document are derived from + * software developed by the University of California, Berkeley, and its + * contributors. + */ + + +#include +#include +#include +#include + +/* + * Pathname utilities. + * + * In translating file names we copy each argument file + * name into a pathname structure where we operate on it. + * Each pathname structure can hold "pn_bufsize" characters + * including a terminating null, and operations here support + * allocating and freeing pathname structures, fetching + * strings from user space, getting the next character from + * a pathname, combining two pathnames (used in symbolic + * link processing), and peeling off the first component + * of a pathname. + */ + +/* + * Allocate contents of pathname structure. Structure is typically + * an automatic variable in calling routine for convenience. + * + * May sleep in the call to kmem_alloc() and so must not be called + * from interrupt level. + */ +void +pn_alloc(struct pathname *pnp) +{ + pn_alloc_sz(pnp, MAXPATHLEN); +} +void +pn_alloc_sz(struct pathname *pnp, size_t sz) +{ + pnp->pn_buf = kmem_alloc(sz, KM_SLEEP); + pnp->pn_bufsize = sz; +#if 0 /* unused in ZoL */ + pnp->pn_path = pnp->pn_buf; + pnp->pn_pathlen = 0; +#endif +} + +/* + * Free pathname resources. + */ +void +pn_free(struct pathname *pnp) +{ + /* pn_bufsize is usually MAXPATHLEN, but may not be */ + kmem_free(pnp->pn_buf, pnp->pn_bufsize); + pnp->pn_buf = NULL; + pnp->pn_bufsize = 0; +#if 0 /* unused in ZoL */ + pnp->pn_path = NULL; + pnp->pn_pathlen = 0; +#endif +} diff --git a/module/zfs/range_tree.c b/module/zfs/range_tree.c new file mode 100644 index 000000000000..2c0e4b860a04 --- /dev/null +++ b/module/zfs/range_tree.c @@ -0,0 +1,921 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +/* + * Range trees are tree-based data structures that can be used to + * track free space or generally any space allocation information. + * A range tree keeps track of individual segments and automatically + * provides facilities such as adjacent extent merging and extent + * splitting in response to range add/remove requests. + * + * A range tree starts out completely empty, with no segments in it. 
+ * Adding an allocation via range_tree_add to the range tree can either: + * 1) create a new extent + * 2) extend an adjacent extent + * 3) merge two adjacent extents + * Conversely, removing an allocation via range_tree_remove can: + * 1) completely remove an extent + * 2) shorten an extent (if the allocation was near one of its ends) + * 3) split an extent into two extents, in effect punching a hole + * + * A range tree is also capable of 'bridging' gaps when adding + * allocations. This is useful for cases when close proximity of + * allocations is an important detail that needs to be represented + * in the range tree. See range_tree_set_gap(). The default behavior + * is not to bridge gaps (i.e. the maximum allowed gap size is 0). + * + * In order to traverse a range tree, use either the range_tree_walk() + * or range_tree_vacate() functions. + * + * To obtain more accurate information on individual segment + * operations that the range tree performs "under the hood", you can + * specify a set of callbacks by passing a range_tree_ops_t structure + * to the range_tree_create function. Any callbacks that are non-NULL + * are then called at the appropriate times. + * + * The range tree code also supports a special variant of range trees + * that can bridge small gaps between segments. This kind of tree is used + * by the dsl scanning code to group I/Os into mostly sequential chunks to + * optimize disk performance. The code here attempts to do this with as + * little memory and computational overhead as possible. One limitation of + * this implementation is that segments of range trees with gaps can only + * support removing complete segments. + */ + +static inline void +rs_copy(range_seg_t *src, range_seg_t *dest, range_tree_t *rt) +{ + ASSERT3U(rt->rt_type, <=, RANGE_SEG_NUM_TYPES); + size_t size = 0; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + VERIFY(0); + } + bcopy(src, dest, size); +} + +void +range_tree_stat_verify(range_tree_t *rt) +{ + range_seg_t *rs; + zfs_btree_index_t where; + uint64_t hist[RANGE_TREE_HISTOGRAM_SIZE] = { 0 }; + int i; + + for (rs = zfs_btree_first(&rt->rt_root, &where); rs != NULL; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + int idx = highbit64(size) - 1; + + hist[idx]++; + ASSERT3U(hist[idx], !=, 0); + } + + for (i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + if (hist[i] != rt->rt_histogram[i]) { + zfs_dbgmsg("i=%d, hist=%px, hist=%llu, rt_hist=%llu", + i, hist, hist[i], rt->rt_histogram[i]); + } + VERIFY3U(hist[i], ==, rt->rt_histogram[i]); + } +} + +static void +range_tree_stat_incr(range_tree_t *rt, range_seg_t *rs) +{ + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + int idx = highbit64(size) - 1; + + ASSERT(size != 0); + ASSERT3U(idx, <, + sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); + + rt->rt_histogram[idx]++; + ASSERT3U(rt->rt_histogram[idx], !=, 0); +} + +static void +range_tree_stat_decr(range_tree_t *rt, range_seg_t *rs) +{ + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + int idx = highbit64(size) - 1; + + ASSERT(size != 0); + ASSERT3U(idx, <, + sizeof (rt->rt_histogram) / sizeof (*rt->rt_histogram)); + + ASSERT3U(rt->rt_histogram[idx], !=, 0); + rt->rt_histogram[idx]--; +} + +static int +range_tree_seg32_compare(const void *x1, const void *x2) +{ + const 
range_seg32_t *r1 = x1; + const range_seg32_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg64_compare(const void *x1, const void *x2) +{ + const range_seg64_t *r1 = x1; + const range_seg64_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +static int +range_tree_seg_gap_compare(const void *x1, const void *x2) +{ + const range_seg_gap_t *r1 = x1; + const range_seg_gap_t *r2 = x2; + + ASSERT3U(r1->rs_start, <=, r1->rs_end); + ASSERT3U(r2->rs_start, <=, r2->rs_end); + + return ((r1->rs_start >= r2->rs_end) - (r1->rs_end <= r2->rs_start)); +} + +range_tree_t * +range_tree_create_impl(range_tree_ops_t *ops, range_seg_type_t type, void *arg, + uint64_t start, uint64_t shift, + int (*zfs_btree_compare) (const void *, const void *), + uint64_t gap) +{ + range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); + + ASSERT3U(shift, <, 64); + ASSERT3U(type, <=, RANGE_SEG_NUM_TYPES); + size_t size; + int (*compare) (const void *, const void *); + switch (type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + compare = range_tree_seg32_compare; + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + compare = range_tree_seg64_compare; + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + compare = range_tree_seg_gap_compare; + break; + default: + panic("Invalid range seg type %d", type); + } + zfs_btree_create(&rt->rt_root, compare, size); + + rt->rt_ops = ops; + rt->rt_gap = gap; + rt->rt_arg = arg; + rt->rt_type = type; + rt->rt_start = start; + rt->rt_shift = shift; + rt->rt_btree_compare = zfs_btree_compare; + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) + rt->rt_ops->rtop_create(rt, rt->rt_arg); + + return (rt); +} + +range_tree_t * +range_tree_create(range_tree_ops_t *ops, range_seg_type_t type, + void *arg, uint64_t start, uint64_t shift) +{ + return (range_tree_create_impl(ops, type, arg, start, shift, NULL, 0)); +} + +void +range_tree_destroy(range_tree_t *rt) +{ + VERIFY0(rt->rt_space); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) + rt->rt_ops->rtop_destroy(rt, rt->rt_arg); + + zfs_btree_destroy(&rt->rt_root); + kmem_free(rt, sizeof (*rt)); +} + +void +range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) +{ + if (delta < 0 && delta * -1 >= rs_get_fill(rs, rt)) { + zfs_panic_recover("zfs: attempting to decrease fill to or " + "below 0; probable double remove in segment [%llx:%llx]", + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt)); + } + if (rs_get_fill(rs, rt) + delta > rs_get_end(rs, rt) - + rs_get_start(rs, rt)) { + zfs_panic_recover("zfs: attempting to increase fill beyond " + "max; probable double add in segment [%llx:%llx]", + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt)); + } + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + rs_set_fill(rs, rt, rs_get_fill(rs, rt) + delta); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); +} + +static void +range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) +{ + range_tree_t *rt = arg; + zfs_btree_index_t where; + range_seg_t *rs_before, *rs_after, *rs; + range_seg_max_t tmp, rsearch; + uint64_t end = 
start + size, gap = rt->rt_gap; + uint64_t bridge_size = 0; + boolean_t merge_before, merge_after; + + ASSERT3U(size, !=, 0); + ASSERT3U(fill, <=, size); + ASSERT3U(start + size, >, start); + + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); + + /* + * If this is a gap-supporting range tree, it is possible that we + * are inserting into an existing segment. In this case simply + * bump the fill count and call the remove / add callbacks. If the + * new range will extend an existing segment, we remove the + * existing one, apply the new extent to it and re-insert it using + * the normal code paths. + */ + if (rs != NULL) { + if (gap == 0) { + zfs_panic_recover("zfs: adding existent segment to " + "range tree (offset=%llx size=%llx)", + (longlong_t)start, (longlong_t)size); + return; + } + uint64_t rstart = rs_get_start(rs, rt); + uint64_t rend = rs_get_end(rs, rt); + if (rstart <= start && rend >= end) { + range_tree_adjust_fill(rt, rs, fill); + return; + } + + zfs_btree_remove(&rt->rt_root, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + range_tree_stat_decr(rt, rs); + rt->rt_space -= rend - rstart; + + fill += rs_get_fill(rs, rt); + start = MIN(start, rstart); + end = MAX(end, rend); + size = end - start; + + range_tree_add_impl(rt, start, size, fill); + return; + } + + ASSERT3P(rs, ==, NULL); + + /* + * Determine whether or not we will have to merge with our neighbors. + * If gap != 0, we might need to merge with our neighbors even if we + * aren't directly touching. + */ + zfs_btree_index_t where_before, where_after; + rs_before = zfs_btree_prev(&rt->rt_root, &where, &where_before); + rs_after = zfs_btree_next(&rt->rt_root, &where, &where_after); + + merge_before = (rs_before != NULL && rs_get_end(rs_before, rt) >= + start - gap); + merge_after = (rs_after != NULL && rs_get_start(rs_after, rt) <= end + + gap); + + if (merge_before && gap != 0) + bridge_size += start - rs_get_end(rs_before, rt); + if (merge_after && gap != 0) + bridge_size += rs_get_start(rs_after, rt) - end; + + if (merge_before && merge_after) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { + rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); + rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); + } + + range_tree_stat_decr(rt, rs_before); + range_tree_stat_decr(rt, rs_after); + + rs_copy(rs_after, &tmp, rt); + uint64_t before_start = rs_get_start_raw(rs_before, rt); + uint64_t before_fill = rs_get_fill(rs_before, rt); + uint64_t after_fill = rs_get_fill(rs_after, rt); + zfs_btree_remove_idx(&rt->rt_root, &where_before); + + /* + * We have to re-find the node because our old reference is + * invalid as soon as we do any mutating btree operations. 
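+ * (Illustrative note: zfs_btree_remove_idx() can shift the remaining + * segments within the tree's internal buffers, so the old rs_after + * pointer may no longer refer to the same bytes; the copy saved in + * 'tmp' lets us re-find the surviving segment by value.)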
+ */ + rs_after = zfs_btree_find(&rt->rt_root, &tmp, &where_after); + rs_set_start_raw(rs_after, rt, before_start); + rs_set_fill(rs_after, rt, after_fill + before_fill + fill); + rs = rs_after; + } else if (merge_before) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); + + range_tree_stat_decr(rt, rs_before); + + uint64_t before_fill = rs_get_fill(rs_before, rt); + rs_set_end(rs_before, rt, end); + rs_set_fill(rs_before, rt, before_fill + fill); + rs = rs_before; + } else if (merge_after) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); + + range_tree_stat_decr(rt, rs_after); + + uint64_t after_fill = rs_get_fill(rs_after, rt); + rs_set_start(rs_after, rt, start); + rs_set_fill(rs_after, rt, after_fill + fill); + rs = rs_after; + } else { + rs = &tmp; + + rs_set_start(rs, rt, start); + rs_set_end(rs, rt, end); + rs_set_fill(rs, rt, fill); + zfs_btree_add_idx(&rt->rt_root, rs, &where); + } + + if (gap != 0) { + ASSERT3U(rs_get_fill(rs, rt), <=, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } else { + ASSERT3U(rs_get_fill(rs, rt), ==, rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + range_tree_stat_incr(rt, rs); + rt->rt_space += size + bridge_size; +} + +void +range_tree_add(void *arg, uint64_t start, uint64_t size) +{ + range_tree_add_impl(arg, start, size, size); +} + +static void +range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, + boolean_t do_fill) +{ + zfs_btree_index_t where; + range_seg_t *rs; + range_seg_max_t rsearch, rs_tmp; + uint64_t end = start + size; + boolean_t left_over, right_over; + + VERIFY3U(size, !=, 0); + VERIFY3U(size, <=, rt->rt_space); + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); + + /* Make sure we completely overlap with someone */ + if (rs == NULL) { + zfs_panic_recover("zfs: removing nonexistent segment from " + "range tree (offset=%llx size=%llx)", + (longlong_t)start, (longlong_t)size); + return; + } + + /* + * Range trees with gap support must only remove complete segments + * from the tree. This allows us to maintain accurate fill accounting + * and to ensure that bridged sections are not leaked. If we need to + * remove less than the full segment, we can only adjust the fill count. 
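+ * + * For example (illustrative): given a bridged segment [0, 100) with + * fill 60, removing 10 via range_tree_remove_fill() just lowers the + * fill to 50; the segment itself is only removed once the requested + * size equals its remaining fill.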
+ */ + if (rt->rt_gap != 0) { + if (do_fill) { + if (rs_get_fill(rs, rt) == size) { + start = rs_get_start(rs, rt); + end = rs_get_end(rs, rt); + size = end - start; + } else { + range_tree_adjust_fill(rt, rs, -size); + return; + } + } else if (rs_get_start(rs, rt) != start || + rs_get_end(rs, rt) != end) { + zfs_panic_recover("zfs: freeing partial segment of " + "gap tree (offset=%llx size=%llx) of " + "(offset=%llx size=%llx)", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs_get_start(rs, rt), + (longlong_t)rs_get_end(rs, rt) - rs_get_start(rs, + rt)); + return; + } + } + + VERIFY3U(rs_get_start(rs, rt), <=, start); + VERIFY3U(rs_get_end(rs, rt), >=, end); + + left_over = (rs_get_start(rs, rt) != start); + right_over = (rs_get_end(rs, rt) != end); + + range_tree_stat_decr(rt, rs); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + if (left_over && right_over) { + range_seg_max_t newseg; + rs_set_start(&newseg, rt, end); + rs_set_end_raw(&newseg, rt, rs_get_end_raw(rs, rt)); + rs_set_fill(&newseg, rt, rs_get_end(rs, rt) - end); + range_tree_stat_incr(rt, &newseg); + + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + + rs_copy(rs, &rs_tmp, rt); + if (zfs_btree_next(&rt->rt_root, &where, &where) != NULL) + zfs_btree_add_idx(&rt->rt_root, &newseg, &where); + else + zfs_btree_add(&rt->rt_root, &newseg); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, &newseg, rt->rt_arg); + } else if (left_over) { + // This modifies the buffer already inside the range tree + rs_set_end(rs, rt, start); + rs_copy(rs, &rs_tmp, rt); + } else if (right_over) { + // This modifies the buffer already inside the range tree + rs_set_start(rs, rt, end); + rs_copy(rs, &rs_tmp, rt); + } else { + zfs_btree_remove_idx(&rt->rt_root, &where); + rs = NULL; + } + + if (rs != NULL) { + /* + * The fill of the leftover segment will always be equal to + * the size, since we do not support removing partial segments + * of range trees with gaps. 
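+ * (A leftover segment can only arise here for trees without gap + * support: gap trees either reject partial frees or widen the removal + * to the whole segment above, leaving nothing behind.)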
+ */ + rs_set_fill_raw(rs, rt, rs_get_end_raw(rs, rt) - + rs_get_start_raw(rs, rt)); + range_tree_stat_incr(rt, &rs_tmp); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, &rs_tmp, rt->rt_arg); + } + + rt->rt_space -= size; +} + +void +range_tree_remove(void *arg, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(arg, start, size, B_FALSE); +} + +void +range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(rt, start, size, B_TRUE); +} + +void +range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize) +{ + int64_t delta = newsize - (rs_get_end(rs, rt) - rs_get_start(rs, rt)); + + range_tree_stat_decr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + rs_set_start(rs, rt, newstart); + rs_set_end(rs, rt, newstart + newsize); + + range_tree_stat_incr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + rt->rt_space += delta; +} + +static range_seg_t * +range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_max_t rsearch; + uint64_t end = start + size; + + VERIFY(size != 0); + + rs_set_start(&rsearch, rt, start); + rs_set_end(&rsearch, rt, end); + return (zfs_btree_find(&rt->rt_root, &rsearch, NULL)); +} + +range_seg_t * +range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) +{ + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + + range_seg_t *rs = range_tree_find_impl(rt, start, size); + if (rs != NULL && rs_get_start(rs, rt) <= start && + rs_get_end(rs, rt) >= start + size) { + return (rs); + } + return (NULL); +} + +void +range_tree_verify_not_present(range_tree_t *rt, uint64_t off, uint64_t size) +{ + range_seg_t *rs = range_tree_find(rt, off, size); + if (rs != NULL) + panic("segment already in tree; rs=%p", (void *)rs); +} + +boolean_t +range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size) +{ + return (range_tree_find(rt, start, size) != NULL); +} + +/* + * Returns the first subset of the given range which overlaps with the range + * tree. Returns true if there is a segment in the range, and false if there + * isn't. + */ +boolean_t +range_tree_find_in(range_tree_t *rt, uint64_t start, uint64_t size, + uint64_t *ostart, uint64_t *osize) +{ + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + + range_seg_max_t rsearch; + rs_set_start(&rsearch, rt, start); + rs_set_end_raw(&rsearch, rt, rs_get_start_raw(&rsearch, rt) + 1); + + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_find(&rt->rt_root, &rsearch, &where); + if (rs != NULL) { + *ostart = start; + *osize = MIN(size, rs_get_end(rs, rt) - start); + return (B_TRUE); + } + + rs = zfs_btree_next(&rt->rt_root, &where, &where); + if (rs == NULL || rs_get_start(rs, rt) > start + size) + return (B_FALSE); + + *ostart = rs_get_start(rs, rt); + *osize = MIN(start + size, rs_get_end(rs, rt)) - + rs_get_start(rs, rt); + return (B_TRUE); +} + +/* + * Ensure that this range is not in the tree, regardless of whether + * it is currently in the tree. 
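+ * + * For example (illustrative): if the tree holds [0, 10) and [20, 30), + * range_tree_clear(rt, 5, 20) trims them to [0, 5) and [25, 30).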
+ */ +void +range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_seg_t *rs; + + if (size == 0) + return; + + if (rt->rt_type == RANGE_SEG64) + ASSERT3U(start + size, >, start); + + while ((rs = range_tree_find_impl(rt, start, size)) != NULL) { + uint64_t free_start = MAX(rs_get_start(rs, rt), start); + uint64_t free_end = MIN(rs_get_end(rs, rt), start + size); + range_tree_remove(rt, free_start, free_end - free_start); + } +} + +void +range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst) +{ + range_tree_t *rt; + + ASSERT0(range_tree_space(*rtdst)); + ASSERT0(zfs_btree_numnodes(&(*rtdst)->rt_root)); + + rt = *rtsrc; + *rtsrc = *rtdst; + *rtdst = rt; +} + +void +range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) +{ + if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) + rt->rt_ops->rtop_vacate(rt, rt->rt_arg); + + if (func != NULL) { + range_seg_t *rs; + zfs_btree_index_t *cookie = NULL; + + while ((rs = zfs_btree_destroy_nodes(&rt->rt_root, &cookie)) != + NULL) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } + } else { + zfs_btree_clear(&rt->rt_root); + } + + bzero(rt->rt_histogram, sizeof (rt->rt_histogram)); + rt->rt_space = 0; +} + +void +range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) +{ + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); + rs != NULL; rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + func(arg, rs_get_start(rs, rt), rs_get_end(rs, rt) - + rs_get_start(rs, rt)); + } +} + +range_seg_t * +range_tree_first(range_tree_t *rt) +{ + return (zfs_btree_first(&rt->rt_root, NULL)); +} + +uint64_t +range_tree_space(range_tree_t *rt) +{ + return (rt->rt_space); +} + +uint64_t +range_tree_numsegs(range_tree_t *rt) +{ + return ((rt == NULL) ? 0 : zfs_btree_numnodes(&rt->rt_root)); +} + +boolean_t +range_tree_is_empty(range_tree_t *rt) +{ + ASSERT(rt != NULL); + return (range_tree_space(rt) == 0); +} + +/* ARGSUSED */ +void +rt_btree_create(range_tree_t *rt, void *arg) +{ + zfs_btree_t *size_tree = arg; + + size_t size; + switch (rt->rt_type) { + case RANGE_SEG32: + size = sizeof (range_seg32_t); + break; + case RANGE_SEG64: + size = sizeof (range_seg64_t); + break; + case RANGE_SEG_GAP: + size = sizeof (range_seg_gap_t); + break; + default: + panic("Invalid range seg type %d", rt->rt_type); + } + zfs_btree_create(size_tree, rt->rt_btree_compare, size); +} + +/* ARGSUSED */ +void +rt_btree_destroy(range_tree_t *rt, void *arg) +{ + zfs_btree_t *size_tree = arg; + ASSERT0(zfs_btree_numnodes(size_tree)); + + zfs_btree_destroy(size_tree); +} + +/* ARGSUSED */ +void +rt_btree_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + + zfs_btree_add(size_tree, rs); +} + +/* ARGSUSED */ +void +rt_btree_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + zfs_btree_t *size_tree = arg; + + zfs_btree_remove(size_tree, rs); +} + +/* ARGSUSED */ +void +rt_btree_vacate(range_tree_t *rt, void *arg) +{ + zfs_btree_t *size_tree = arg; + zfs_btree_clear(size_tree); + zfs_btree_destroy(size_tree); + + rt_btree_create(rt, arg); +} + +range_tree_ops_t rt_btree_ops = { + .rtop_create = rt_btree_create, + .rtop_destroy = rt_btree_destroy, + .rtop_add = rt_btree_add, + .rtop_remove = rt_btree_remove, + .rtop_vacate = rt_btree_vacate +}; + +/* + * Remove any overlapping ranges between the given segment [start, end) + * from removefrom. Add non-overlapping leftovers to addto. 
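+ * + * For example (illustrative): with removefrom = {[10, 20)} and the + * segment [5, 25), the overlap [10, 20) is removed from removefrom and + * the leftovers [5, 10) and [20, 25) are added to addto.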
+ */ +void +range_tree_remove_xor_add_segment(uint64_t start, uint64_t end, + range_tree_t *removefrom, range_tree_t *addto) +{ + zfs_btree_index_t where; + range_seg_max_t starting_rs; + rs_set_start(&starting_rs, removefrom, start); + rs_set_end_raw(&starting_rs, removefrom, rs_get_start_raw(&starting_rs, + removefrom) + 1); + + range_seg_t *curr = zfs_btree_find(&removefrom->rt_root, + &starting_rs, &where); + + if (curr == NULL) + curr = zfs_btree_next(&removefrom->rt_root, &where, &where); + + range_seg_t *next; + for (; curr != NULL; curr = next) { + if (start == end) + return; + VERIFY3U(start, <, end); + + /* there is no overlap */ + if (end <= rs_get_start(curr, removefrom)) { + range_tree_add(addto, start, end - start); + return; + } + + uint64_t overlap_start = MAX(rs_get_start(curr, removefrom), + start); + uint64_t overlap_end = MIN(rs_get_end(curr, removefrom), + end); + uint64_t overlap_size = overlap_end - overlap_start; + ASSERT3S(overlap_size, >, 0); + range_seg_max_t rs; + rs_copy(curr, &rs, removefrom); + + range_tree_remove(removefrom, overlap_start, overlap_size); + + if (start < overlap_start) + range_tree_add(addto, start, overlap_start - start); + + start = overlap_end; + next = zfs_btree_find(&removefrom->rt_root, &rs, &where); + /* + * If we find something here, we only removed part of the + * curr segment. Either there's some left at the end + * because we've reached the end of the range we're removing, + * or there's some left at the start because we started + * partway through the range. Either way, we continue with + * the loop. If it's the former, we'll return at the start of + * the loop, and if it's the latter we'll see if there is more + * area to process. + */ + if (next != NULL) { + ASSERT(start == end || start == rs_get_end(&rs, + removefrom)); + } + + next = zfs_btree_next(&removefrom->rt_root, &where, &where); + } + VERIFY3P(curr, ==, NULL); + + if (start != end) { + VERIFY3U(start, <, end); + range_tree_add(addto, start, end - start); + } else { + VERIFY3U(start, ==, end); + } +} + +/* + * For each entry in rt, if it exists in removefrom, remove it + * from removefrom. Otherwise, add it to addto. + */ +void +range_tree_remove_xor_add(range_tree_t *rt, range_tree_t *removefrom, + range_tree_t *addto) +{ + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, &where)) { + range_tree_remove_xor_add_segment(rs_get_start(rs, rt), + rs_get_end(rs, rt), removefrom, addto); + } +} + +uint64_t +range_tree_min(range_tree_t *rt) +{ + range_seg_t *rs = zfs_btree_first(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_start(rs, rt) : 0); +} + +uint64_t +range_tree_max(range_tree_t *rt) +{ + range_seg_t *rs = zfs_btree_last(&rt->rt_root, NULL); + return (rs != NULL ? rs_get_end(rs, rt) : 0); +} + +uint64_t +range_tree_span(range_tree_t *rt) +{ + return (range_tree_max(rt) - range_tree_min(rt)); +} diff --git a/module/zfs/refcount.c b/module/zfs/refcount.c new file mode 100644 index 000000000000..39476261edfb --- /dev/null +++ b/module/zfs/refcount.c @@ -0,0 +1,327 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. 
+ * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/zfs_refcount.h> + +/* + * Reference count tracking is disabled by default. Its memory requirements + * are reasonable; however, as implemented it consumes a significant amount of + * CPU time. Until its performance is improved it should be manually enabled. + */ +int reference_tracking_enable = FALSE; +int reference_history = 3; /* tunable */ + +#ifdef ZFS_DEBUG +static kmem_cache_t *reference_cache; +static kmem_cache_t *reference_history_cache; + +void +zfs_refcount_init(void) +{ + reference_cache = kmem_cache_create("reference_cache", + sizeof (reference_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + reference_history_cache = kmem_cache_create("reference_history_cache", + sizeof (uint64_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +zfs_refcount_fini(void) +{ + kmem_cache_destroy(reference_cache); + kmem_cache_destroy(reference_history_cache); +} + +void +zfs_refcount_create(zfs_refcount_t *rc) +{ + mutex_init(&rc->rc_mtx, NULL, MUTEX_DEFAULT, NULL); + list_create(&rc->rc_list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&rc->rc_removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + rc->rc_count = 0; + rc->rc_removed_count = 0; + rc->rc_tracked = reference_tracking_enable; +} + +void +zfs_refcount_create_tracked(zfs_refcount_t *rc) +{ + zfs_refcount_create(rc); + rc->rc_tracked = B_TRUE; +} + +void +zfs_refcount_create_untracked(zfs_refcount_t *rc) +{ + zfs_refcount_create(rc); + rc->rc_tracked = B_FALSE; +} + +void +zfs_refcount_destroy_many(zfs_refcount_t *rc, uint64_t number) +{ + reference_t *ref; + + ASSERT3U(rc->rc_count, ==, number); + while ((ref = list_head(&rc->rc_list))) { + list_remove(&rc->rc_list, ref); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_list); + + while ((ref = list_head(&rc->rc_removed))) { + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, ref->ref_removed); + kmem_cache_free(reference_cache, ref); + } + list_destroy(&rc->rc_removed); + mutex_destroy(&rc->rc_mtx); +} + +void +zfs_refcount_destroy(zfs_refcount_t *rc) +{ + zfs_refcount_destroy_many(rc, 0); +} + +int +zfs_refcount_is_zero(zfs_refcount_t *rc) +{ + return (rc->rc_count == 0); +} + +int64_t +zfs_refcount_count(zfs_refcount_t *rc) +{ + return (rc->rc_count); +} + +int64_t +zfs_refcount_add_many(zfs_refcount_t *rc, uint64_t number, const void *holder) +{ + reference_t *ref = NULL; + int64_t count; + + if (rc->rc_tracked) { + ref = kmem_cache_alloc(reference_cache, KM_SLEEP); + ref->ref_holder = holder; + ref->ref_number = number; + } + mutex_enter(&rc->rc_mtx); + ASSERT3U(rc->rc_count, >=, 0); + if (rc->rc_tracked) + list_insert_head(&rc->rc_list, ref); + rc->rc_count += number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + + return (count); +} + +int64_t +zfs_refcount_add(zfs_refcount_t *rc, const void *holder) +{ + return (zfs_refcount_add_many(rc, 1, holder));
+} + +int64_t +zfs_refcount_remove_many(zfs_refcount_t *rc, uint64_t number, + const void *holder) +{ + reference_t *ref; + int64_t count; + + mutex_enter(&rc->rc_mtx); + ASSERT3U(rc->rc_count, >=, number); + + if (!rc->rc_tracked) { + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder && ref->ref_number == number) { + list_remove(&rc->rc_list, ref); + if (reference_history > 0) { + ref->ref_removed = + kmem_cache_alloc(reference_history_cache, + KM_SLEEP); + list_insert_head(&rc->rc_removed, ref); + rc->rc_removed_count++; + if (rc->rc_removed_count > reference_history) { + ref = list_tail(&rc->rc_removed); + list_remove(&rc->rc_removed, ref); + kmem_cache_free(reference_history_cache, + ref->ref_removed); + kmem_cache_free(reference_cache, ref); + rc->rc_removed_count--; + } + } else { + kmem_cache_free(reference_cache, ref); + } + rc->rc_count -= number; + count = rc->rc_count; + mutex_exit(&rc->rc_mtx); + return (count); + } + } + panic("No such hold %p on refcount %llx", holder, + (u_longlong_t)(uintptr_t)rc); + return (-1); +} + +int64_t +zfs_refcount_remove(zfs_refcount_t *rc, const void *holder) +{ + return (zfs_refcount_remove_many(rc, 1, holder)); +} + +void +zfs_refcount_transfer(zfs_refcount_t *dst, zfs_refcount_t *src) +{ + int64_t count, removed_count; + list_t list, removed; + + list_create(&list, sizeof (reference_t), + offsetof(reference_t, ref_link)); + list_create(&removed, sizeof (reference_t), + offsetof(reference_t, ref_link)); + + mutex_enter(&src->rc_mtx); + count = src->rc_count; + removed_count = src->rc_removed_count; + src->rc_count = 0; + src->rc_removed_count = 0; + list_move_tail(&list, &src->rc_list); + list_move_tail(&removed, &src->rc_removed); + mutex_exit(&src->rc_mtx); + + mutex_enter(&dst->rc_mtx); + dst->rc_count += count; + dst->rc_removed_count += removed_count; + list_move_tail(&dst->rc_list, &list); + list_move_tail(&dst->rc_removed, &removed); + mutex_exit(&dst->rc_mtx); + + list_destroy(&list); + list_destroy(&removed); +} + +void +zfs_refcount_transfer_ownership_many(zfs_refcount_t *rc, uint64_t number, + const void *current_holder, const void *new_holder) +{ + reference_t *ref; + boolean_t found = B_FALSE; + + mutex_enter(&rc->rc_mtx); + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return; + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == current_holder && + ref->ref_number == number) { + ref->ref_holder = new_holder; + found = B_TRUE; + break; + } + } + ASSERT(found); + mutex_exit(&rc->rc_mtx); +} + +void +zfs_refcount_transfer_ownership(zfs_refcount_t *rc, const void *current_holder, + const void *new_holder) +{ + return (zfs_refcount_transfer_ownership_many(rc, 1, current_holder, + new_holder)); +} + +/* + * If tracking is enabled, return true if a reference exists that matches + * the "holder" tag. If tracking is disabled, then return true if a reference + * might be held. 
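+ * + * A typical (hypothetical) caller uses this only in assertions, e.g. + *	ASSERT(zfs_refcount_held(&rc, holder));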
+ */ +boolean_t +zfs_refcount_held(zfs_refcount_t *rc, const void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (rc->rc_count > 0); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_FALSE); +} + +/* + * If tracking is enabled, return true if a reference does not exist that + * matches the "holder" tag. If tracking is disabled, always return true + * since the reference might not be held. + */ +boolean_t +zfs_refcount_not_held(zfs_refcount_t *rc, const void *holder) +{ + reference_t *ref; + + mutex_enter(&rc->rc_mtx); + + if (!rc->rc_tracked) { + mutex_exit(&rc->rc_mtx); + return (B_TRUE); + } + + for (ref = list_head(&rc->rc_list); ref; + ref = list_next(&rc->rc_list, ref)) { + if (ref->ref_holder == holder) { + mutex_exit(&rc->rc_mtx); + return (B_FALSE); + } + } + mutex_exit(&rc->rc_mtx); + return (B_TRUE); +} +#endif /* ZFS_DEBUG */ diff --git a/module/zfs/rrwlock.c b/module/zfs/rrwlock.c new file mode 100644 index 000000000000..d23fc3ad1067 --- /dev/null +++ b/module/zfs/rrwlock.c @@ -0,0 +1,396 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include + +/* + * This file contains the implementation of a re-entrant read + * reader/writer lock (aka "rrwlock"). + * + * This is a normal reader/writer lock with the additional feature + * of allowing threads who have already obtained a read lock to + * re-enter another read lock (re-entrant read) - even if there are + * waiting writers. + * + * Callers who have not obtained a read lock give waiting writers priority. + * + * The rrwlock_t lock does not allow re-entrant writers, nor does it + * allow a re-entrant mix of reads and writes (that is, it does not + * allow a caller who has already obtained a read lock to be able to + * then grab a write lock without first dropping all read locks, and + * vice versa). + * + * The rrwlock_t uses tsd (thread specific data) to keep a list of + * nodes (rrw_node_t), where each node keeps track of which specific + * lock (rrw_node_t::rn_rrl) the thread has grabbed. Since re-entering + * should be rare, a thread that grabs multiple reads on the same rrwlock_t + * will store multiple rrw_node_ts of the same 'rrn_rrl'. Nodes on the + * tsd list can represent a different rrwlock_t. This allows a thread + * to enter multiple and unique rrwlock_ts for read locks at the same time. 
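+ * + * A minimal usage sketch (the FTAG-style tag here is illustrative): + * + *	rrw_enter(&rrl, RW_READER, FTAG); + *	rrw_enter(&rrl, RW_READER, FTAG);	(re-entrant read) + *	rrw_exit(&rrl, FTAG); + *	rrw_exit(&rrl, FTAG);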
+ * + * Since using tsd exposes some overhead, the rrwlock_t only needs to + * keep tsd data when writers are waiting. If no writers are waiting, then + * a reader just bumps the anonymous read count (rr_anon_rcount) - no tsd + * is needed. Once a writer attempts to grab the lock, readers then + * keep tsd data and bump the linked readers count (rr_linked_rcount). + * + * If there are waiting writers and there are anonymous readers, then a + * reader doesn't know if it is a re-entrant lock. But since it may be one, + * we allow the read to proceed (otherwise it could deadlock). Since once + * waiting writers are active, readers no longer bump the anonymous count, + * the anonymous readers will eventually flush themselves out. At this point, + * readers will be able to tell if they are a re-entrant lock (have a + * rrw_node_t entry for the lock) or not. If they are a re-entrant lock, then + * we must let them proceed. If they are not, then the reader blocks for the + * waiting writers. Hence, we do not starve writers. + */ + +/* global key for TSD */ +uint_t rrw_tsd_key; + +typedef struct rrw_node { + struct rrw_node *rn_next; + rrwlock_t *rn_rrl; + void *rn_tag; +} rrw_node_t; + +static rrw_node_t * +rrn_find(rrwlock_t *rrl) +{ + rrw_node_t *rn; + + if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0) + return (NULL); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl) + return (rn); + } + return (NULL); +} + +/* + * Add a node to the head of the singly linked list. + */ +static void +rrn_add(rrwlock_t *rrl, void *tag) +{ + rrw_node_t *rn; + + rn = kmem_alloc(sizeof (*rn), KM_SLEEP); + rn->rn_rrl = rrl; + rn->rn_next = tsd_get(rrw_tsd_key); + rn->rn_tag = tag; + VERIFY(tsd_set(rrw_tsd_key, rn) == 0); +} + +/* + * If a node is found for 'rrl', then remove the node from this + * thread's list and return TRUE; otherwise return FALSE.
+ */ +static boolean_t +rrn_find_and_remove(rrwlock_t *rrl, void *tag) +{ + rrw_node_t *rn; + rrw_node_t *prev = NULL; + + if (zfs_refcount_count(&rrl->rr_linked_rcount) == 0) + return (B_FALSE); + + for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { + if (rn->rn_rrl == rrl && rn->rn_tag == tag) { + if (prev) + prev->rn_next = rn->rn_next; + else + VERIFY(tsd_set(rrw_tsd_key, rn->rn_next) == 0); + kmem_free(rn, sizeof (*rn)); + return (B_TRUE); + } + prev = rn; + } + return (B_FALSE); +} + +void +rrw_init(rrwlock_t *rrl, boolean_t track_all) +{ + mutex_init(&rrl->rr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&rrl->rr_cv, NULL, CV_DEFAULT, NULL); + rrl->rr_writer = NULL; + zfs_refcount_create(&rrl->rr_anon_rcount); + zfs_refcount_create(&rrl->rr_linked_rcount); + rrl->rr_writer_wanted = B_FALSE; + rrl->rr_track_all = track_all; +} + +void +rrw_destroy(rrwlock_t *rrl) +{ + mutex_destroy(&rrl->rr_lock); + cv_destroy(&rrl->rr_cv); + ASSERT(rrl->rr_writer == NULL); + zfs_refcount_destroy(&rrl->rr_anon_rcount); + zfs_refcount_destroy(&rrl->rr_linked_rcount); +} + +static void +rrw_enter_read_impl(rrwlock_t *rrl, boolean_t prio, void *tag) +{ + mutex_enter(&rrl->rr_lock); +#if !defined(ZFS_DEBUG) && defined(_KERNEL) + if (rrl->rr_writer == NULL && !rrl->rr_writer_wanted && + !rrl->rr_track_all) { + rrl->rr_anon_rcount.rc_count++; + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +#endif + ASSERT(rrl->rr_writer != curthread); + ASSERT(zfs_refcount_count(&rrl->rr_anon_rcount) >= 0); + + while (rrl->rr_writer != NULL || (rrl->rr_writer_wanted && + zfs_refcount_is_zero(&rrl->rr_anon_rcount) && !prio && + rrn_find(rrl) == NULL)) + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + + if (rrl->rr_writer_wanted || rrl->rr_track_all) { + /* may or may not be a re-entrant enter */ + rrn_add(rrl, tag); + (void) zfs_refcount_add(&rrl->rr_linked_rcount, tag); + } else { + (void) zfs_refcount_add(&rrl->rr_anon_rcount, tag); + } + ASSERT(rrl->rr_writer == NULL); + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter_read(rrwlock_t *rrl, void *tag) +{ + rrw_enter_read_impl(rrl, B_FALSE, tag); +} + +/* + * take a read lock even if there are pending write lock requests. if we want + * to take a lock reentrantly, but from different threads (that have a + * relationship to each other), the normal detection mechanism to overrule + * the pending writer does not work, so we have to give an explicit hint here. 
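+ * (one illustrative case: a thread continuing work on behalf of + * another thread that still holds the read lock)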
+ */ +void +rrw_enter_read_prio(rrwlock_t *rrl, void *tag) +{ + rrw_enter_read_impl(rrl, B_TRUE, tag); +} + + +void +rrw_enter_write(rrwlock_t *rrl) +{ + mutex_enter(&rrl->rr_lock); + ASSERT(rrl->rr_writer != curthread); + + while (zfs_refcount_count(&rrl->rr_anon_rcount) > 0 || + zfs_refcount_count(&rrl->rr_linked_rcount) > 0 || + rrl->rr_writer != NULL) { + rrl->rr_writer_wanted = B_TRUE; + cv_wait(&rrl->rr_cv, &rrl->rr_lock); + } + rrl->rr_writer_wanted = B_FALSE; + rrl->rr_writer = curthread; + mutex_exit(&rrl->rr_lock); +} + +void +rrw_enter(rrwlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrw_enter_read(rrl, tag); + else + rrw_enter_write(rrl); +} + +void +rrw_exit(rrwlock_t *rrl, void *tag) +{ + mutex_enter(&rrl->rr_lock); +#if !defined(ZFS_DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { + rrl->rr_anon_rcount.rc_count--; + if (rrl->rr_anon_rcount.rc_count == 0) + cv_broadcast(&rrl->rr_cv); + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__exitmiss); +#endif + ASSERT(!zfs_refcount_is_zero(&rrl->rr_anon_rcount) || + !zfs_refcount_is_zero(&rrl->rr_linked_rcount) || + rrl->rr_writer != NULL); + + if (rrl->rr_writer == NULL) { + int64_t count; + if (rrn_find_and_remove(rrl, tag)) { + count = zfs_refcount_remove( + &rrl->rr_linked_rcount, tag); + } else { + ASSERT(!rrl->rr_track_all); + count = zfs_refcount_remove(&rrl->rr_anon_rcount, tag); + } + if (count == 0) + cv_broadcast(&rrl->rr_cv); + } else { + ASSERT(rrl->rr_writer == curthread); + ASSERT(zfs_refcount_is_zero(&rrl->rr_anon_rcount) && + zfs_refcount_is_zero(&rrl->rr_linked_rcount)); + rrl->rr_writer = NULL; + cv_broadcast(&rrl->rr_cv); + } + mutex_exit(&rrl->rr_lock); +} + +/* + * If the lock was created with track_all, rrw_held(RW_READER) will return + * B_TRUE iff the current thread has the lock for reader. Otherwise it may + * return B_TRUE if any thread has the lock for reader. + */ +boolean_t +rrw_held(rrwlock_t *rrl, krw_t rw) +{ + boolean_t held; + + mutex_enter(&rrl->rr_lock); + if (rw == RW_WRITER) { + held = (rrl->rr_writer == curthread); + } else { + held = (!zfs_refcount_is_zero(&rrl->rr_anon_rcount) || + rrn_find(rrl) != NULL); + } + mutex_exit(&rrl->rr_lock); + + return (held); +} + +void +rrw_tsd_destroy(void *arg) +{ + rrw_node_t *rn = arg; + if (rn != NULL) { + panic("thread %p terminating with rrw lock %p held", + (void *)curthread, (void *)rn->rn_rrl); + } +} + +/* + * A reader-mostly lock implementation, tuned beyond the reader/writer locks + * above for highly parallel read acquisitions, while pessimizing writes. + * + * The idea is to split a single busy lock into an array of locks, so that + * each reader can lock only one of them for read, depending on the result + * of a simple hash function. That proportionally reduces lock congestion. + * A writer at the same time has to sequentially acquire write on all the + * locks. That makes write acquisition proportionally slower, but in places + * where it is used (filesystem unmount) performance is not critical. + * + * All the functions below are direct wrappers around the functions above.
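+ * + * Sketch of the scheme (illustrative): a reader takes only + * locks[curthread mod RRM_NUM_LOCKS] for read, while a writer acquires + * all RRM_NUM_LOCKS locks for write in sequence.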
+ */ +void +rrm_init(rrmlock_t *rrl, boolean_t track_all) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_init(&rrl->locks[i], track_all); +} + +void +rrm_destroy(rrmlock_t *rrl) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_destroy(&rrl->locks[i]); +} + +void +rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag) +{ + if (rw == RW_READER) + rrm_enter_read(rrl, tag); + else + rrm_enter_write(rrl); +} + +/* + * This maps the current thread to a specific lock. Note that the lock + * must be released by the same thread that acquired it. We do this + * mapping by taking the thread pointer mod a prime number. We examine + * only the low 32 bits of the thread pointer, because 32-bit division + * is faster than 64-bit division, and the high 32 bits have little + * entropy anyway. + */ +#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS) + +void +rrm_enter_read(rrmlock_t *rrl, void *tag) +{ + rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag); +} + +void +rrm_enter_write(rrmlock_t *rrl) +{ + int i; + + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_enter_write(&rrl->locks[i]); +} + +void +rrm_exit(rrmlock_t *rrl, void *tag) +{ + int i; + + if (rrl->locks[0].rr_writer == curthread) { + for (i = 0; i < RRM_NUM_LOCKS; i++) + rrw_exit(&rrl->locks[i], tag); + } else { + rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag); + } +} + +boolean_t +rrm_held(rrmlock_t *rrl, krw_t rw) +{ + if (rw == RW_WRITER) { + return (rrw_held(&rrl->locks[0], rw)); + } else { + return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw)); + } +} diff --git a/module/zfs/sa.c b/module/zfs/sa.c new file mode 100644 index 000000000000..977e729fe468 --- /dev/null +++ b/module/zfs/sa.c @@ -0,0 +1,2258 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#endif + +/* + * ZFS System attributes: + * + * A generic mechanism to allow for arbitrary attributes + * to be stored in a dnode. The data will be stored in the bonus buffer of + * the dnode and if necessary a special "spill" block will be used to handle + * overflow situations. The spill block will be sized to fit the data + * from 512 - 128K. When a spill block is used the BP (blkptr_t) for the + * spill block is stored at the end of the current bonus buffer. Any + * attributes that would be in the way of the blkptr_t will be relocated + * into the spill block. 
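+ * + * For example (illustrative): once spilling becomes necessary, any + * attribute that would occupy the last sizeof (blkptr_t) bytes of the + * bonus buffer is moved to the spill block, and the spill blkptr_t is + * written in its place.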
+ * + * Attribute registration: + * + * A mapping between attribute "string" names and their actual attribute + * numeric values, length, and byteswap function is stored persistently + * on a per-dataset basis. The names are only used + * during registration. All attributes are known by their unique attribute + * id value. If an attribute can have a variable size then the value + * 0 will be used to indicate this. + * + * Attribute Layout: + * + * Attribute layouts are a way to compactly store multiple attributes, but + * without taking the overhead associated with managing each attribute + * individually. Since you will typically have the same set of attributes + * stored in the same order a single table will be used to represent that + * layout. The ZPL for example will usually have only about 10 different + * layouts (regular files, device files, symlinks, + * regular files + scanstamp, files/dir with extended attributes, and then + * you have the possibility of all of those minus ACL, because it would + * be kicked out into the spill block) + * + * Layouts are simply an array of the attributes and their + * ordering i.e. [0, 1, 4, 5, 2] + * + * Each distinct layout is given a unique layout number and that is what's + * stored in the header at the beginning of the SA data buffer. + * + * A layout only covers a single dbuf (bonus or spill). If a set of + * attributes is split up between the bonus buffer and a spill buffer then + * two different layouts will be used. This allows us to byteswap the + * spill without looking at the bonus buffer and keeps the on disk format of + * the bonus and spill buffer the same. + * + * Adding a single attribute will cause the entire set of attributes to + * be rewritten and could result in a new layout number being constructed + * as part of the rewrite if no such layout exists for the new set of + * attributes. The new attribute will be appended to the end of the already + * existing attributes. + * + * Both the attribute registration and attribute layout information are + * stored in normal ZAP attributes. There should be a small number of + * known layouts and the set of attributes is assumed to typically be quite + * small. + * + * The registered attributes and layout "table" information is maintained + * in core and a special "sa_os_t" is attached to the objset_t. + * + * A special interface is provided to allow for quickly applying + * a large set of attributes at once. sa_replace_all_by_template() is + * used to set an array of attributes. This is used by the ZPL when + * creating a brand new file. The template that is passed into the function + * specifies the attribute, size for variable length attributes, location of + * data and special "data locator" function if the data isn't in a contiguous + * location. + * + * Byteswap implications: + * + * Since the SA attributes are not entirely self describing we can't do + * the normal byteswap processing. The special ZAP layout attribute and + * attribute registration attributes define the byteswap function and the + * size of the attributes, unless it is variable sized. + * The normal ZFS byteswapping infrastructure assumes you don't need + * to read any objects in order to do the necessary byteswapping. Whereas + * SA attributes can only be properly byteswapped if the dataset is opened + * and the layout/attribute ZAP attributes are available. Because of this + * the SA attributes will be byteswapped when they are first accessed by + * the SA code that will read the SA data.
+ */ + +typedef void (sa_iterfunc_t)(void *hdr, void *addr, sa_attr_type_t, + uint16_t length, int length_idx, boolean_t, void *userp); + +static int sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype); +static void sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab); +static sa_idx_tab_t *sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, + sa_hdr_phys_t *hdr); +static void sa_idx_tab_rele(objset_t *os, void *arg); +static void sa_copy_data(sa_data_locator_t *func, void *start, void *target, + int buflen); +static int sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx); + +arc_byteswap_func_t sa_bswap_table[] = { + byteswap_uint64_array, + byteswap_uint32_array, + byteswap_uint16_array, + byteswap_uint8_array, + zfs_acl_byteswap, +}; + +#ifdef HAVE_EFFICIENT_UNALIGNED_ACCESS +#define SA_COPY_DATA(f, s, t, l) \ +do { \ + if (f == NULL) { \ + if (l == 8) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + } else if (l == 16) { \ + *(uint64_t *)t = *(uint64_t *)s; \ + *(uint64_t *)((uintptr_t)t + 8) = \ + *(uint64_t *)((uintptr_t)s + 8); \ + } else { \ + bcopy(s, t, l); \ + } \ + } else { \ + sa_copy_data(f, s, t, l); \ + } \ +} while (0) +#else +#define SA_COPY_DATA(f, s, t, l) sa_copy_data(f, s, t, l) +#endif + +/* + * This table is fixed and cannot be changed. Its purpose is to + * allow the SA code to work with both old/new ZPL file systems. + * It contains the list of legacy attributes. These attributes aren't + * stored in the "attribute" registry zap objects, since older ZPL file systems + * won't have the registry. Only objsets of type ZFS_TYPE_FILESYSTEM will + * use this static table. + */ +sa_attr_reg_t sa_legacy_attrs[] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, +}; + +/* + * This is only used for objects of type DMU_OT_ZNODE + */ +sa_attr_type_t sa_legacy_zpl_layout[] = { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 +}; + +/* + * Special dummy layout used for buffers with no attributes. 
+ */ +sa_attr_type_t sa_dummy_zpl_layout[] = { 0 }; + +static int sa_legacy_attr_count = ARRAY_SIZE(sa_legacy_attrs); +static kmem_cache_t *sa_cache = NULL; + +/*ARGSUSED*/ +static int +sa_cache_constructor(void *buf, void *unused, int kmflag) +{ + sa_handle_t *hdl = buf; + + mutex_init(&hdl->sa_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/*ARGSUSED*/ +static void +sa_cache_destructor(void *buf, void *unused) +{ + sa_handle_t *hdl = buf; + mutex_destroy(&hdl->sa_lock); +} + +void +sa_cache_init(void) +{ + sa_cache = kmem_cache_create("sa_cache", + sizeof (sa_handle_t), 0, sa_cache_constructor, + sa_cache_destructor, NULL, NULL, NULL, 0); +} + +void +sa_cache_fini(void) +{ + if (sa_cache) + kmem_cache_destroy(sa_cache); +} + +static int +layout_num_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; + + return (TREE_CMP(node1->lot_num, node2->lot_num)); +} + +static int +layout_hash_compare(const void *arg1, const void *arg2) +{ + const sa_lot_t *node1 = (const sa_lot_t *)arg1; + const sa_lot_t *node2 = (const sa_lot_t *)arg2; + + int cmp = TREE_CMP(node1->lot_hash, node2->lot_hash); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(node1->lot_instance, node2->lot_instance)); +} + +static boolean_t +sa_layout_equal(sa_lot_t *tbf, sa_attr_type_t *attrs, int count) +{ + int i; + + if (count != tbf->lot_attr_count) + return (1); + + for (i = 0; i != count; i++) { + if (attrs[i] != tbf->lot_attrs[i]) + return (1); + } + return (0); +} + +#define SA_ATTR_HASH(attr) (zfs_crc64_table[(-1ULL ^ attr) & 0xFF]) + +static uint64_t +sa_layout_info_hash(sa_attr_type_t *attrs, int attr_count) +{ + int i; + uint64_t crc = -1ULL; + + for (i = 0; i != attr_count; i++) + crc ^= SA_ATTR_HASH(attrs[i]); + + return (crc); +} + +static int +sa_get_spill(sa_handle_t *hdl) +{ + int rc; + if (hdl->sa_spill == NULL) { + if ((rc = dmu_spill_hold_existing(hdl->sa_bonus, NULL, + &hdl->sa_spill)) == 0) + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } else { + rc = 0; + } + + return (rc); +} + +/* + * Main attribute lookup/update function + * returns 0 for success or non zero for failures + * + * Operates on bulk array, first failure will abort further processing + */ +static int +sa_attr_op(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + sa_data_op_t data_op, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + int i; + int error = 0; + sa_buf_type_t buftypes; + + buftypes = 0; + + ASSERT(count > 0); + for (i = 0; i != count; i++) { + ASSERT(bulk[i].sa_attr <= hdl->sa_os->os_sa->sa_num_attrs); + + bulk[i].sa_addr = NULL; + /* First check the bonus buffer */ + + if (hdl->sa_bonus_tab && TOC_ATTR_PRESENT( + hdl->sa_bonus_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_bonus_tab, + SA_GET_HDR(hdl, SA_BONUS), + bulk[i].sa_attr, bulk[i], SA_BONUS, hdl); + if (tx && !(buftypes & SA_BONUS)) { + dmu_buf_will_dirty(hdl->sa_bonus, tx); + buftypes |= SA_BONUS; + } + } + if (bulk[i].sa_addr == NULL && + ((error = sa_get_spill(hdl)) == 0)) { + if (TOC_ATTR_PRESENT( + hdl->sa_spill_tab->sa_idx_tab[bulk[i].sa_attr])) { + SA_ATTR_INFO(sa, hdl->sa_spill_tab, + SA_GET_HDR(hdl, SA_SPILL), + bulk[i].sa_attr, bulk[i], SA_SPILL, hdl); + if (tx && !(buftypes & SA_SPILL) && + bulk[i].sa_size == bulk[i].sa_length) { + dmu_buf_will_dirty(hdl->sa_spill, tx); + buftypes |= SA_SPILL; + } + } + } + if (error && error != ENOENT) { + return ((error == ECKSUM) ? 
EIO : error); + } + + switch (data_op) { + case SA_LOOKUP: + if (bulk[i].sa_addr == NULL) + return (SET_ERROR(ENOENT)); + if (bulk[i].sa_data) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_addr, bulk[i].sa_data, + bulk[i].sa_size); + } + continue; + + case SA_UPDATE: + /* existing rewrite of attr */ + if (bulk[i].sa_addr && + bulk[i].sa_size == bulk[i].sa_length) { + SA_COPY_DATA(bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_addr, + bulk[i].sa_length); + continue; + } else if (bulk[i].sa_addr) { /* attr size change */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_REPLACE, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } else { /* adding new attribute */ + error = sa_modify_attrs(hdl, bulk[i].sa_attr, + SA_ADD, bulk[i].sa_data_func, + bulk[i].sa_data, bulk[i].sa_length, tx); + } + if (error) + return (error); + break; + default: + break; + } + } + return (error); +} + +static sa_lot_t * +sa_add_layout_entry(objset_t *os, sa_attr_type_t *attrs, int attr_count, + uint64_t lot_num, uint64_t hash, boolean_t zapadd, dmu_tx_t *tx) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, *findtb; + int i; + avl_index_t loc; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + tb = kmem_zalloc(sizeof (sa_lot_t), KM_SLEEP); + tb->lot_attr_count = attr_count; + tb->lot_attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + bcopy(attrs, tb->lot_attrs, sizeof (sa_attr_type_t) * attr_count); + tb->lot_num = lot_num; + tb->lot_hash = hash; + tb->lot_instance = 0; + + if (zapadd) { + char attr_name[8]; + + if (sa->sa_layout_attr_obj == 0) { + sa->sa_layout_attr_obj = zap_create_link(os, + DMU_OT_SA_ATTR_LAYOUTS, + sa->sa_master_obj, SA_LAYOUTS, tx); + } + + (void) snprintf(attr_name, sizeof (attr_name), + "%d", (int)lot_num); + VERIFY(0 == zap_update(os, os->os_sa->sa_layout_attr_obj, + attr_name, 2, attr_count, attrs, tx)); + } + + list_create(&tb->lot_idx_tab, sizeof (sa_idx_tab_t), + offsetof(sa_idx_tab_t, sa_next)); + + for (i = 0; i != attr_count; i++) { + if (sa->sa_attr_table[tb->lot_attrs[i]].sa_length == 0) + tb->lot_var_sizes++; + } + + avl_add(&sa->sa_layout_num_tree, tb); + + /* verify we don't have a hash collision */ + if ((findtb = avl_find(&sa->sa_layout_hash_tree, tb, &loc)) != NULL) { + for (; findtb && findtb->lot_hash == hash; + findtb = AVL_NEXT(&sa->sa_layout_hash_tree, findtb)) { + if (findtb->lot_instance != tb->lot_instance) + break; + tb->lot_instance++; + } + } + avl_add(&sa->sa_layout_hash_tree, tb); + return (tb); +} + +static void +sa_find_layout(objset_t *os, uint64_t hash, sa_attr_type_t *attrs, + int count, dmu_tx_t *tx, sa_lot_t **lot) +{ + sa_lot_t *tb, tbsearch; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + boolean_t found = B_FALSE; + + mutex_enter(&sa->sa_lock); + tbsearch.lot_hash = hash; + tbsearch.lot_instance = 0; + tb = avl_find(&sa->sa_layout_hash_tree, &tbsearch, &loc); + if (tb) { + for (; tb && tb->lot_hash == hash; + tb = AVL_NEXT(&sa->sa_layout_hash_tree, tb)) { + if (sa_layout_equal(tb, attrs, count) == 0) { + found = B_TRUE; + break; + } + } + } + if (!found) { + tb = sa_add_layout_entry(os, attrs, count, + avl_numnodes(&sa->sa_layout_num_tree), hash, B_TRUE, tx); + } + mutex_exit(&sa->sa_lock); + *lot = tb; +} + +static int +sa_resize_spill(sa_handle_t *hdl, uint32_t size, dmu_tx_t *tx) +{ + int error; + uint32_t blocksize; + + if (size == 0) { + blocksize = SPA_MINBLOCKSIZE; + } else if (size > SPA_OLD_MAXBLOCKSIZE) { + ASSERT(0); + return (SET_ERROR(EFBIG)); + } else { + blocksize = P2ROUNDUP_TYPED(size, 
SPA_MINBLOCKSIZE, uint32_t); + } + + error = dbuf_spill_set_blksz(hdl->sa_spill, blocksize, tx); + ASSERT(error == 0); + return (error); +} + +static void +sa_copy_data(sa_data_locator_t *func, void *datastart, void *target, int buflen) +{ + if (func == NULL) { + bcopy(datastart, target, buflen); + } else { + boolean_t start; + int bytes; + void *dataptr; + void *saptr = target; + uint32_t length; + + start = B_TRUE; + bytes = 0; + while (bytes < buflen) { + func(&dataptr, &length, buflen, start, datastart); + bcopy(dataptr, saptr, length); + saptr = (void *)((caddr_t)saptr + length); + bytes += length; + start = B_FALSE; + } + } +} + +/* + * Determine several different values pertaining to system attribute + * buffers. + * + * Return the size of the sa_hdr_phys_t header for the buffer. Each + * variable length attribute except the first contributes two bytes to + * the header size, which is then rounded up to an 8-byte boundary. + * + * The following output parameters are also computed. + * + * index - The index of the first attribute in attr_desc that will + * spill over. Only valid if will_spill is set. + * + * total - The total number of bytes of all system attributes described + * in attr_desc. + * + * will_spill - Set when spilling is necessary. It is only set when + * the buftype is SA_BONUS. + */ +static int +sa_find_sizes(sa_os_t *sa, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_buf_t *db, sa_buf_type_t buftype, int full_space, int *index, + int *total, boolean_t *will_spill) +{ + int var_size_count = 0; + int i; + int hdrsize; + int extra_hdrsize; + + if (buftype == SA_BONUS && sa->sa_force_spill) { + *total = 0; + *index = 0; + *will_spill = B_TRUE; + return (0); + } + + *index = -1; + *total = 0; + *will_spill = B_FALSE; + + extra_hdrsize = 0; + hdrsize = (SA_BONUSTYPE_FROM_DB(db) == DMU_OT_ZNODE) ? 0 : + sizeof (sa_hdr_phys_t); + + ASSERT(IS_P2ALIGNED(full_space, 8)); + + for (i = 0; i != attr_count; i++) { + boolean_t is_var_sz, might_spill_here; + int tmp_hdrsize; + + *total = P2ROUNDUP(*total, 8); + *total += attr_desc[i].sa_length; + if (*will_spill) + continue; + + is_var_sz = (SA_REGISTERED_LEN(sa, attr_desc[i].sa_attr) == 0); + if (is_var_sz) + var_size_count++; + + /* + * Calculate what the SA header size would be if this + * attribute doesn't spill. + */ + tmp_hdrsize = hdrsize + ((is_var_sz && var_size_count > 1) ? + sizeof (uint16_t) : 0); + + /* + * Check whether this attribute spans into the space + * that would be used by the spill block pointer should + * a spill block be needed. + */ + might_spill_here = + buftype == SA_BONUS && *index == -1 && + (*total + P2ROUNDUP(tmp_hdrsize, 8)) > + (full_space - sizeof (blkptr_t)); + + if (is_var_sz && var_size_count > 1) { + if (buftype == SA_SPILL || + tmp_hdrsize + *total < full_space) { + /* + * Record the extra header size in case this + * increase needs to be reversed due to + * spill-over. + */ + hdrsize = tmp_hdrsize; + if (*index != -1 || might_spill_here) + extra_hdrsize += sizeof (uint16_t); + } else { + ASSERT(buftype == SA_BONUS); + if (*index == -1) + *index = i; + *will_spill = B_TRUE; + continue; + } + } + + /* + * Store index of where spill *could* occur. Then + * continue to count the remaining attribute sizes. The + * sum is used later for sizing bonus and spill buffer. 
+ */ + if (might_spill_here) + *index = i; + + if ((*total + P2ROUNDUP(hdrsize, 8)) > full_space && + buftype == SA_BONUS) + *will_spill = B_TRUE; + } + + if (*will_spill) + hdrsize -= extra_hdrsize; + + hdrsize = P2ROUNDUP(hdrsize, 8); + return (hdrsize); +} + +#define BUF_SPACE_NEEDED(total, header) (total + header) + +/* + * Find layout that corresponds to ordering of attributes + * If not found a new layout number is created and added to + * persistent layout tables. + */ +static int +sa_build_layouts(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, int attr_count, + dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + uint64_t hash; + sa_buf_type_t buftype; + sa_hdr_phys_t *sahdr; + void *data_start; + sa_attr_type_t *attrs, *attrs_start; + int i, lot_count; + int dnodesize; + int spill_idx; + int hdrsize; + int spillhdrsize = 0; + int used; + dmu_object_type_t bonustype; + sa_lot_t *lot; + int len_idx; + int spill_used; + int bonuslen; + boolean_t spilling; + + dmu_buf_will_dirty(hdl->sa_bonus, tx); + bonustype = SA_BONUSTYPE_FROM_DB(hdl->sa_bonus); + dmu_object_dnsize_from_db(hdl->sa_bonus, &dnodesize); + bonuslen = DN_BONUS_SIZE(dnodesize); + + /* first determine bonus header size and sum of all attributes */ + hdrsize = sa_find_sizes(sa, attr_desc, attr_count, hdl->sa_bonus, + SA_BONUS, bonuslen, &spill_idx, &used, &spilling); + + if (used > SPA_OLD_MAXBLOCKSIZE) + return (SET_ERROR(EFBIG)); + + VERIFY0(dmu_set_bonus(hdl->sa_bonus, spilling ? + MIN(bonuslen - sizeof (blkptr_t), used + hdrsize) : + used + hdrsize, tx)); + + ASSERT((bonustype == DMU_OT_ZNODE && spilling == 0) || + bonustype == DMU_OT_SA); + + /* setup and size spill buffer when needed */ + if (spilling) { + boolean_t dummy; + + if (hdl->sa_spill == NULL) { + VERIFY(dmu_spill_hold_by_bonus(hdl->sa_bonus, 0, NULL, + &hdl->sa_spill) == 0); + } + dmu_buf_will_dirty(hdl->sa_spill, tx); + + spillhdrsize = sa_find_sizes(sa, &attr_desc[spill_idx], + attr_count - spill_idx, hdl->sa_spill, SA_SPILL, + hdl->sa_spill->db_size, &i, &spill_used, &dummy); + + if (spill_used > SPA_OLD_MAXBLOCKSIZE) + return (SET_ERROR(EFBIG)); + + if (BUF_SPACE_NEEDED(spill_used, spillhdrsize) > + hdl->sa_spill->db_size) + VERIFY(0 == sa_resize_spill(hdl, + BUF_SPACE_NEEDED(spill_used, spillhdrsize), tx)); + } + + /* setup starting pointers to lay down data */ + data_start = (void *)((uintptr_t)hdl->sa_bonus->db_data + hdrsize); + sahdr = (sa_hdr_phys_t *)hdl->sa_bonus->db_data; + buftype = SA_BONUS; + + attrs_start = attrs = kmem_alloc(sizeof (sa_attr_type_t) * attr_count, + KM_SLEEP); + lot_count = 0; + + for (i = 0, len_idx = 0, hash = -1ULL; i != attr_count; i++) { + uint16_t length; + + ASSERT(IS_P2ALIGNED(data_start, 8)); + attrs[i] = attr_desc[i].sa_attr; + length = SA_REGISTERED_LEN(sa, attrs[i]); + if (length == 0) + length = attr_desc[i].sa_length; + + if (spilling && i == spill_idx) { /* switch to spill buffer */ + VERIFY(bonustype == DMU_OT_SA); + if (buftype == SA_BONUS && !sa->sa_force_spill) { + sa_find_layout(hdl->sa_os, hash, attrs_start, + lot_count, tx, &lot); + SA_SET_HDR(sahdr, lot->lot_num, hdrsize); + } + + buftype = SA_SPILL; + hash = -1ULL; + len_idx = 0; + + sahdr = (sa_hdr_phys_t *)hdl->sa_spill->db_data; + sahdr->sa_magic = SA_MAGIC; + data_start = (void *)((uintptr_t)sahdr + + spillhdrsize); + attrs_start = &attrs[i]; + lot_count = 0; + } + hash ^= SA_ATTR_HASH(attrs[i]); + attr_desc[i].sa_addr = data_start; + attr_desc[i].sa_size = length; + SA_COPY_DATA(attr_desc[i].sa_data_func, attr_desc[i].sa_data, + data_start, 
length); + if (sa->sa_attr_table[attrs[i]].sa_length == 0) { + sahdr->sa_lengths[len_idx++] = length; + } + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + length), 8); + lot_count++; + } + + sa_find_layout(hdl->sa_os, hash, attrs_start, lot_count, tx, &lot); + + /* + * Verify that old znodes always have layout number 0. + * Must be DMU_OT_SA for arbitrary layouts + */ + VERIFY((bonustype == DMU_OT_ZNODE && lot->lot_num == 0) || + (bonustype == DMU_OT_SA && lot->lot_num > 1)); + + if (bonustype == DMU_OT_SA) { + SA_SET_HDR(sahdr, lot->lot_num, + buftype == SA_BONUS ? hdrsize : spillhdrsize); + } + + kmem_free(attrs, sizeof (sa_attr_type_t) * attr_count); + if (hdl->sa_bonus_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + hdl->sa_bonus_tab = NULL; + } + if (!sa->sa_force_spill) + VERIFY(0 == sa_build_index(hdl, SA_BONUS)); + if (hdl->sa_spill) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + if (!spilling) { + /* + * remove spill block that is no longer needed. + */ + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + hdl->sa_spill_tab = NULL; + VERIFY(0 == dmu_rm_spill(hdl->sa_os, + sa_handle_object(hdl), tx)); + } else { + VERIFY(0 == sa_build_index(hdl, SA_SPILL)); + } + } + + return (0); +} + +static void +sa_free_attr_table(sa_os_t *sa) +{ + int i; + + if (sa->sa_attr_table == NULL) + return; + + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_name) + kmem_free(sa->sa_attr_table[i].sa_name, + strlen(sa->sa_attr_table[i].sa_name) + 1); + } + + kmem_free(sa->sa_attr_table, + sizeof (sa_attr_table_t) * sa->sa_num_attrs); + + sa->sa_attr_table = NULL; +} + +static int +sa_attr_table_setup(objset_t *os, sa_attr_reg_t *reg_attrs, int count) +{ + sa_os_t *sa = os->os_sa; + uint64_t sa_attr_count = 0; + uint64_t sa_reg_count = 0; + int error = 0; + uint64_t attr_value; + sa_attr_table_t *tb; + zap_cursor_t zc; + zap_attribute_t za; + int registered_count = 0; + int i; + dmu_objset_type_t ostype = dmu_objset_type(os); + + sa->sa_user_table = + kmem_zalloc(count * sizeof (sa_attr_type_t), KM_SLEEP); + sa->sa_user_table_sz = count * sizeof (sa_attr_type_t); + + if (sa->sa_reg_attr_obj != 0) { + error = zap_count(os, sa->sa_reg_attr_obj, + &sa_attr_count); + + /* + * Make sure we retrieved a count and that it isn't zero + */ + if (error || (error == 0 && sa_attr_count == 0)) { + if (error == 0) + error = SET_ERROR(EINVAL); + goto bail; + } + sa_reg_count = sa_attr_count; + } + + if (ostype == DMU_OST_ZFS && sa_attr_count == 0) + sa_attr_count += sa_legacy_attr_count; + + /* Allocate attribute numbers for attributes that aren't registered */ + for (i = 0; i != count; i++) { + boolean_t found = B_FALSE; + int j; + + if (ostype == DMU_OST_ZFS) { + for (j = 0; j != sa_legacy_attr_count; j++) { + if (strcmp(reg_attrs[i].sa_name, + sa_legacy_attrs[j].sa_name) == 0) { + sa->sa_user_table[i] = + sa_legacy_attrs[j].sa_attr; + found = B_TRUE; + } + } + } + if (found) + continue; + + if (sa->sa_reg_attr_obj) + error = zap_lookup(os, sa->sa_reg_attr_obj, + reg_attrs[i].sa_name, 8, 1, &attr_value); + else + error = SET_ERROR(ENOENT); + switch (error) { + case ENOENT: + sa->sa_user_table[i] = (sa_attr_type_t)sa_attr_count; + sa_attr_count++; + break; + case 0: + sa->sa_user_table[i] = ATTR_NUM(attr_value); + break; + default: + goto bail; + } + } + + sa->sa_num_attrs = sa_attr_count; + tb = sa->sa_attr_table = + kmem_zalloc(sizeof (sa_attr_table_t) * sa_attr_count, KM_SLEEP); + + /* + * Attribute table is constructed from requested attribute list, + * 
previously foreign registered attributes, and also the legacy + * ZPL set of attributes. + */ + + if (sa->sa_reg_attr_obj) { + for (zap_cursor_init(&zc, os, sa->sa_reg_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t value; + value = za.za_first_integer; + + registered_count++; + tb[ATTR_NUM(value)].sa_attr = ATTR_NUM(value); + tb[ATTR_NUM(value)].sa_length = ATTR_LENGTH(value); + tb[ATTR_NUM(value)].sa_byteswap = ATTR_BSWAP(value); + tb[ATTR_NUM(value)].sa_registered = B_TRUE; + + if (tb[ATTR_NUM(value)].sa_name) { + continue; + } + tb[ATTR_NUM(value)].sa_name = + kmem_zalloc(strlen(za.za_name) +1, KM_SLEEP); + (void) strlcpy(tb[ATTR_NUM(value)].sa_name, za.za_name, + strlen(za.za_name) +1); + } + zap_cursor_fini(&zc); + /* + * Make sure we processed the correct number of registered + * attributes + */ + if (registered_count != sa_reg_count) { + ASSERT(error != 0); + goto bail; + } + + } + + if (ostype == DMU_OST_ZFS) { + for (i = 0; i != sa_legacy_attr_count; i++) { + if (tb[i].sa_name) + continue; + tb[i].sa_attr = sa_legacy_attrs[i].sa_attr; + tb[i].sa_length = sa_legacy_attrs[i].sa_length; + tb[i].sa_byteswap = sa_legacy_attrs[i].sa_byteswap; + tb[i].sa_registered = B_FALSE; + tb[i].sa_name = + kmem_zalloc(strlen(sa_legacy_attrs[i].sa_name) +1, + KM_SLEEP); + (void) strlcpy(tb[i].sa_name, + sa_legacy_attrs[i].sa_name, + strlen(sa_legacy_attrs[i].sa_name) + 1); + } + } + + for (i = 0; i != count; i++) { + sa_attr_type_t attr_id; + + attr_id = sa->sa_user_table[i]; + if (tb[attr_id].sa_name) + continue; + + tb[attr_id].sa_length = reg_attrs[i].sa_length; + tb[attr_id].sa_byteswap = reg_attrs[i].sa_byteswap; + tb[attr_id].sa_attr = attr_id; + tb[attr_id].sa_name = + kmem_zalloc(strlen(reg_attrs[i].sa_name) + 1, KM_SLEEP); + (void) strlcpy(tb[attr_id].sa_name, reg_attrs[i].sa_name, + strlen(reg_attrs[i].sa_name) + 1); + } + + sa->sa_need_attr_registration = + (sa_attr_count != registered_count); + + return (0); +bail: + kmem_free(sa->sa_user_table, count * sizeof (sa_attr_type_t)); + sa->sa_user_table = NULL; + sa_free_attr_table(sa); + ASSERT(error != 0); + return (error); +} + +int +sa_setup(objset_t *os, uint64_t sa_obj, sa_attr_reg_t *reg_attrs, int count, + sa_attr_type_t **user_table) +{ + zap_cursor_t zc; + zap_attribute_t za; + sa_os_t *sa; + dmu_objset_type_t ostype = dmu_objset_type(os); + sa_attr_type_t *tb; + int error; + + mutex_enter(&os->os_user_ptr_lock); + if (os->os_sa) { + mutex_enter(&os->os_sa->sa_lock); + mutex_exit(&os->os_user_ptr_lock); + tb = os->os_sa->sa_user_table; + mutex_exit(&os->os_sa->sa_lock); + *user_table = tb; + return (0); + } + + sa = kmem_zalloc(sizeof (sa_os_t), KM_SLEEP); + mutex_init(&sa->sa_lock, NULL, MUTEX_NOLOCKDEP, NULL); + sa->sa_master_obj = sa_obj; + + os->os_sa = sa; + mutex_enter(&sa->sa_lock); + mutex_exit(&os->os_user_ptr_lock); + avl_create(&sa->sa_layout_num_tree, layout_num_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_num_node)); + avl_create(&sa->sa_layout_hash_tree, layout_hash_compare, + sizeof (sa_lot_t), offsetof(sa_lot_t, lot_hash_node)); + + if (sa_obj) { + error = zap_lookup(os, sa_obj, SA_LAYOUTS, + 8, 1, &sa->sa_layout_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + error = zap_lookup(os, sa_obj, SA_REGISTRY, + 8, 1, &sa->sa_reg_attr_obj); + if (error != 0 && error != ENOENT) + goto fail; + } + + if ((error = sa_attr_table_setup(os, reg_attrs, count)) != 0) + goto fail; + + if (sa->sa_layout_attr_obj != 0) { + uint64_t layout_count; + + error = 
zap_count(os, sa->sa_layout_attr_obj, + &layout_count); + + /* + * Layout number count should be > 0 + */ + if (error || (error == 0 && layout_count == 0)) { + if (error == 0) + error = SET_ERROR(EINVAL); + goto fail; + } + + for (zap_cursor_init(&zc, os, sa->sa_layout_attr_obj); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + sa_attr_type_t *lot_attrs; + uint64_t lot_num; + + lot_attrs = kmem_zalloc(sizeof (sa_attr_type_t) * + za.za_num_integers, KM_SLEEP); + + if ((error = (zap_lookup(os, sa->sa_layout_attr_obj, + za.za_name, 2, za.za_num_integers, + lot_attrs))) != 0) { + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + break; + } + VERIFY(ddi_strtoull(za.za_name, NULL, 10, + (unsigned long long *)&lot_num) == 0); + + (void) sa_add_layout_entry(os, lot_attrs, + za.za_num_integers, lot_num, + sa_layout_info_hash(lot_attrs, + za.za_num_integers), B_FALSE, NULL); + kmem_free(lot_attrs, sizeof (sa_attr_type_t) * + za.za_num_integers); + } + zap_cursor_fini(&zc); + + /* + * Make sure layout count matches number of entries added + * to AVL tree + */ + if (avl_numnodes(&sa->sa_layout_num_tree) != layout_count) { + ASSERT(error != 0); + goto fail; + } + } + + /* Add special layout number for old ZNODES */ + if (ostype == DMU_OST_ZFS) { + (void) sa_add_layout_entry(os, sa_legacy_zpl_layout, + sa_legacy_attr_count, 0, + sa_layout_info_hash(sa_legacy_zpl_layout, + sa_legacy_attr_count), B_FALSE, NULL); + + (void) sa_add_layout_entry(os, sa_dummy_zpl_layout, 0, 1, + 0, B_FALSE, NULL); + } + *user_table = os->os_sa->sa_user_table; + mutex_exit(&sa->sa_lock); + return (0); +fail: + os->os_sa = NULL; + sa_free_attr_table(sa); + if (sa->sa_user_table) + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + mutex_exit(&sa->sa_lock); + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); + kmem_free(sa, sizeof (sa_os_t)); + return ((error == ECKSUM) ? 
EIO : error); +} + +void +sa_tear_down(objset_t *os) +{ + sa_os_t *sa = os->os_sa; + sa_lot_t *layout; + void *cookie; + + kmem_free(sa->sa_user_table, sa->sa_user_table_sz); + + /* Free up attr table */ + + sa_free_attr_table(sa); + + cookie = NULL; + while ((layout = + avl_destroy_nodes(&sa->sa_layout_hash_tree, &cookie))) { + sa_idx_tab_t *tab; + while ((tab = list_head(&layout->lot_idx_tab))) { + ASSERT(zfs_refcount_count(&tab->sa_refcount)); + sa_idx_tab_rele(os, tab); + } + } + + cookie = NULL; + while ((layout = avl_destroy_nodes(&sa->sa_layout_num_tree, &cookie))) { + kmem_free(layout->lot_attrs, + sizeof (sa_attr_type_t) * layout->lot_attr_count); + kmem_free(layout, sizeof (sa_lot_t)); + } + + avl_destroy(&sa->sa_layout_hash_tree); + avl_destroy(&sa->sa_layout_num_tree); + mutex_destroy(&sa->sa_lock); + + kmem_free(sa, sizeof (sa_os_t)); + os->os_sa = NULL; +} + +static void +sa_build_idx_tab(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t var_length, void *userp) +{ + sa_idx_tab_t *idx_tab = userp; + + if (var_length) { + ASSERT(idx_tab->sa_variable_lengths); + idx_tab->sa_variable_lengths[length_idx] = length; + } + TOC_ATTR_ENCODE(idx_tab->sa_idx_tab[attr], length_idx, + (uint32_t)((uintptr_t)attr_addr - (uintptr_t)hdr)); +} + +static void +sa_attr_iter(objset_t *os, sa_hdr_phys_t *hdr, dmu_object_type_t type, + sa_iterfunc_t func, sa_lot_t *tab, void *userp) +{ + void *data_start; + sa_lot_t *tb = tab; + sa_lot_t search; + avl_index_t loc; + sa_os_t *sa = os->os_sa; + int i; + uint16_t *length_start = NULL; + uint8_t length_idx = 0; + + if (tab == NULL) { + search.lot_num = SA_LAYOUT_NUM(hdr, type); + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + ASSERT(tb); + } + + if (IS_SA_BONUSTYPE(type)) { + data_start = (void *)P2ROUNDUP(((uintptr_t)hdr + + offsetof(sa_hdr_phys_t, sa_lengths) + + (sizeof (uint16_t) * tb->lot_var_sizes)), 8); + length_start = hdr->sa_lengths; + } else { + data_start = hdr; + } + + for (i = 0; i != tb->lot_attr_count; i++) { + int attr_length, reg_length; + uint8_t idx_len; + + reg_length = sa->sa_attr_table[tb->lot_attrs[i]].sa_length; + if (reg_length) { + attr_length = reg_length; + idx_len = 0; + } else { + attr_length = length_start[length_idx]; + idx_len = length_idx++; + } + + func(hdr, data_start, tb->lot_attrs[i], attr_length, + idx_len, reg_length == 0 ? 
B_TRUE : B_FALSE, userp); + + data_start = (void *)P2ROUNDUP(((uintptr_t)data_start + + attr_length), 8); + } +} + +/*ARGSUSED*/ +static void +sa_byteswap_cb(void *hdr, void *attr_addr, sa_attr_type_t attr, + uint16_t length, int length_idx, boolean_t variable_length, void *userp) +{ + sa_handle_t *hdl = userp; + sa_os_t *sa = hdl->sa_os->os_sa; + + sa_bswap_table[sa->sa_attr_table[attr].sa_byteswap](attr_addr, length); +} + +static void +sa_byteswap(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys = SA_GET_HDR(hdl, buftype); + dmu_buf_impl_t *db; + int num_lengths = 1; + int i; + sa_os_t *sa __maybe_unused = hdl->sa_os->os_sa; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + if (sa_hdr_phys->sa_magic == SA_MAGIC) + return; + + db = SA_GET_DB(hdl, buftype); + + if (buftype == SA_SPILL) { + arc_release(db->db_buf, NULL); + arc_buf_thaw(db->db_buf); + } + + sa_hdr_phys->sa_magic = BSWAP_32(sa_hdr_phys->sa_magic); + sa_hdr_phys->sa_layout_info = BSWAP_16(sa_hdr_phys->sa_layout_info); + + /* + * Determine the number of variable lengths in the header. + * The standard 8-byte header has one for free; a + * 16-byte header would have 4 + 1. + */ + if (SA_HDR_SIZE(sa_hdr_phys) > 8) + num_lengths += (SA_HDR_SIZE(sa_hdr_phys) - 8) >> 1; + for (i = 0; i != num_lengths; i++) + sa_hdr_phys->sa_lengths[i] = + BSWAP_16(sa_hdr_phys->sa_lengths[i]); + + sa_attr_iter(hdl->sa_os, sa_hdr_phys, DMU_OT_SA, + sa_byteswap_cb, NULL, hdl); + + if (buftype == SA_SPILL) + arc_buf_freeze(((dmu_buf_impl_t *)hdl->sa_spill)->db_buf); +} + +static int +sa_build_index(sa_handle_t *hdl, sa_buf_type_t buftype) +{ + sa_hdr_phys_t *sa_hdr_phys; + dmu_buf_impl_t *db = SA_GET_DB(hdl, buftype); + dmu_object_type_t bonustype = SA_BONUSTYPE_FROM_DB(db); + sa_os_t *sa = hdl->sa_os->os_sa; + sa_idx_tab_t *idx_tab; + + sa_hdr_phys = SA_GET_HDR(hdl, buftype); + + mutex_enter(&sa->sa_lock); + + /* Do we need to byteswap? 
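The magic-based endianness test that follows can be sketched on its own. Illustrative only, with hypothetical ex_ names; EX_SA_MAGIC is a placeholder, not the real SA_MAGIC value.

#include <stdint.h>

#define EX_SA_MAGIC 0x2F505AU	/* placeholder; use the real SA_MAGIC */

static inline uint32_t
ex_bswap32(uint32_t x)
{
	return (((x & 0xffU) << 24) | ((x & 0xff00U) << 8) |
	    ((x >> 8) & 0xff00U) | (x >> 24));
}

/* returns 1 = native, 0 = needs byteswap, -1 = unrecognized header */
static int
ex_sa_magic_check(uint32_t magic)
{
	if (magic == EX_SA_MAGIC)
		return (1);
	if (ex_bswap32(magic) == EX_SA_MAGIC)
		return (0);
	return (-1);
}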
*/ + + /* only check if not old znode */ + if (IS_SA_BONUSTYPE(bonustype) && sa_hdr_phys->sa_magic != SA_MAGIC && + sa_hdr_phys->sa_magic != 0) { + if (BSWAP_32(sa_hdr_phys->sa_magic) != SA_MAGIC) { + mutex_exit(&sa->sa_lock); + zfs_dbgmsg("Buffer Header: %x != SA_MAGIC:%x " + "object=%#llx\n", sa_hdr_phys->sa_magic, SA_MAGIC, + db->db.db_object); + return (SET_ERROR(EIO)); + } + sa_byteswap(hdl, buftype); + } + + idx_tab = sa_find_idx_tab(hdl->sa_os, bonustype, sa_hdr_phys); + + if (buftype == SA_BONUS) + hdl->sa_bonus_tab = idx_tab; + else + hdl->sa_spill_tab = idx_tab; + + mutex_exit(&sa->sa_lock); + return (0); +} + +/*ARGSUSED*/ +static void +sa_evict_sync(void *dbu) +{ + panic("evicting sa dbuf\n"); +} + +static void +sa_idx_tab_rele(objset_t *os, void *arg) +{ + sa_os_t *sa = os->os_sa; + sa_idx_tab_t *idx_tab = arg; + + if (idx_tab == NULL) + return; + + mutex_enter(&sa->sa_lock); + if (zfs_refcount_remove(&idx_tab->sa_refcount, NULL) == 0) { + list_remove(&idx_tab->sa_layout->lot_idx_tab, idx_tab); + if (idx_tab->sa_variable_lengths) + kmem_free(idx_tab->sa_variable_lengths, + sizeof (uint16_t) * + idx_tab->sa_layout->lot_var_sizes); + zfs_refcount_destroy(&idx_tab->sa_refcount); + kmem_free(idx_tab->sa_idx_tab, + sizeof (uint32_t) * sa->sa_num_attrs); + kmem_free(idx_tab, sizeof (sa_idx_tab_t)); + } + mutex_exit(&sa->sa_lock); +} + +static void +sa_idx_tab_hold(objset_t *os, sa_idx_tab_t *idx_tab) +{ + sa_os_t *sa __maybe_unused = os->os_sa; + + ASSERT(MUTEX_HELD(&sa->sa_lock)); + (void) zfs_refcount_add(&idx_tab->sa_refcount, NULL); +} + +void +sa_spill_rele(sa_handle_t *hdl) +{ + mutex_enter(&hdl->sa_lock); + if (hdl->sa_spill) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + hdl->sa_spill_tab = NULL; + } + mutex_exit(&hdl->sa_lock); +} + +void +sa_handle_destroy(sa_handle_t *hdl) +{ + dmu_buf_t *db = hdl->sa_bonus; + + mutex_enter(&hdl->sa_lock); + (void) dmu_buf_remove_user(db, &hdl->sa_dbu); + + if (hdl->sa_bonus_tab) + sa_idx_tab_rele(hdl->sa_os, hdl->sa_bonus_tab); + + if (hdl->sa_spill_tab) + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + + dmu_buf_rele(hdl->sa_bonus, NULL); + + if (hdl->sa_spill) + dmu_buf_rele(hdl->sa_spill, NULL); + mutex_exit(&hdl->sa_lock); + + kmem_cache_free(sa_cache, hdl); +} + +int +sa_handle_get_from_db(objset_t *os, dmu_buf_t *db, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + int error = 0; + sa_handle_t *handle = NULL; +#ifdef ZFS_DEBUG + dmu_object_info_t doi; + + dmu_object_info_from_db(db, &doi); + ASSERT(doi.doi_bonus_type == DMU_OT_SA || + doi.doi_bonus_type == DMU_OT_ZNODE); +#endif + /* find handle, if it exists */ + /* if one doesn't exist then create a new one, and initialize it */ + + if (hdl_type == SA_HDL_SHARED) + handle = dmu_buf_get_user(db); + + if (handle == NULL) { + sa_handle_t *winner = NULL; + + handle = kmem_cache_alloc(sa_cache, KM_SLEEP); + handle->sa_dbu.dbu_evict_func_sync = NULL; + handle->sa_dbu.dbu_evict_func_async = NULL; + handle->sa_userp = userp; + handle->sa_bonus = db; + handle->sa_os = os; + handle->sa_spill = NULL; + handle->sa_bonus_tab = NULL; + handle->sa_spill_tab = NULL; + + error = sa_build_index(handle, SA_BONUS); + + if (hdl_type == SA_HDL_SHARED) { + dmu_buf_init_user(&handle->sa_dbu, sa_evict_sync, NULL, + NULL); + winner = dmu_buf_set_user_ie(db, &handle->sa_dbu); + } + + if (winner != NULL) { + kmem_cache_free(sa_cache, handle); + handle = winner; + } + } + *handlepp = handle; + + return (error); +} + 
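The shared-handle path above relies on a publish-or-discard race: whoever installs their handle on the dbuf first wins, and the loser frees its allocation and adopts the winner's. A standalone sketch of the same pattern using C11 atomics; ex_ names are hypothetical and this is not the DMU user-data API.

#include <stdatomic.h>
#include <stdlib.h>

typedef struct ex_handle { void *userp; } ex_handle_t;

static ex_handle_t *
ex_get_shared(_Atomic(ex_handle_t *) *slot)
{
	ex_handle_t *h = atomic_load(slot);
	ex_handle_t *expected = NULL;

	if (h != NULL)
		return (h);		/* already published */
	h = calloc(1, sizeof (*h));
	if (!atomic_compare_exchange_strong(slot, &expected, h)) {
		free(h);		/* lost the race */
		h = expected;		/* adopt the winner's handle */
	}
	return (h);
}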
+int +sa_handle_get(objset_t *objset, uint64_t objid, void *userp, + sa_handle_type_t hdl_type, sa_handle_t **handlepp) +{ + dmu_buf_t *db; + int error; + + if ((error = dmu_bonus_hold(objset, objid, NULL, &db))) + return (error); + + return (sa_handle_get_from_db(objset, db, userp, hdl_type, + handlepp)); +} + +int +sa_buf_hold(objset_t *objset, uint64_t obj_num, void *tag, dmu_buf_t **db) +{ + return (dmu_bonus_hold(objset, obj_num, tag, db)); +} + +void +sa_buf_rele(dmu_buf_t *db, void *tag) +{ + dmu_buf_rele(db, tag); +} + +static int +sa_lookup_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_attr_op(hdl, bulk, count, SA_LOOKUP, NULL)); +} + +static int +sa_lookup_locked(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, + uint32_t buflen) +{ + int error; + sa_bulk_attr_t bulk; + + VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); + + bulk.sa_attr = attr; + bulk.sa_data = buf; + bulk.sa_length = buflen; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + error = sa_lookup_impl(hdl, &bulk, 1); + return (error); +} + +int +sa_lookup(sa_handle_t *hdl, sa_attr_type_t attr, void *buf, uint32_t buflen) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_lookup_locked(hdl, attr, buf, buflen); + mutex_exit(&hdl->sa_lock); + + return (error); +} + +#ifdef _KERNEL +int +sa_lookup_uio(sa_handle_t *hdl, sa_attr_type_t attr, uio_t *uio) +{ + int error; + sa_bulk_attr_t bulk; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) == 0) { + error = uiomove((void *)bulk.sa_addr, MIN(bulk.sa_size, + uio_resid(uio)), UIO_READ, uio); + } + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * For an existing object upgraded from an old system, the on-disk layout + * has no slot for the project ID attribute, yet the quota accounting logic + * needs to access that slot by offset directly. So we adjust these old + * objects' layouts to place the project ID at a unified, fixed offset. + */ +int +sa_add_projid(sa_handle_t *hdl, dmu_tx_t *tx, uint64_t projid) +{ + znode_t *zp = sa_get_userdata(hdl); + dmu_buf_t *db = sa_get_db(hdl); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int count = 0, err = 0; + sa_bulk_attr_t *bulk, *attrs; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t uid, gid, mode, rdev, xattr = 0, parent, gen, links; + uint64_t crtime[2], mtime[2], ctime[2], atime[2]; + zfs_acl_phys_t znode_acl = { 0 }; + char scanstamp[AV_SCANSTAMP_SZ]; + + if (zp->z_acl_cached == NULL) { + zfs_acl_t *aclp; + + mutex_enter(&zp->z_acl_lock); + err = zfs_acl_node_read(zp, B_FALSE, &aclp, B_FALSE); + mutex_exit(&zp->z_acl_lock); + if (err != 0 && err != ENOENT) + return (err); + } + + bulk = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + mutex_enter(&hdl->sa_lock); + mutex_enter(&zp->z_lock); + + err = sa_lookup_locked(hdl, SA_ZPL_PROJID(zfsvfs), &projid, + sizeof (uint64_t)); + if (unlikely(err == 0)) + /* Someone else added the project ID attribute in a race. 
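The lookup just above inverts the usual error sense: success means another thread already performed the upgrade, and only ENOENT lets the caller proceed. A tiny sketch of that check, with a hypothetical ex_ name.

#include <errno.h>

static int
ex_check_absent(int lookup_err)
{
	if (lookup_err == 0)
		return (EEXIST);	/* attribute already added by a racer */
	if (lookup_err != ENOENT)
		return (lookup_err);	/* genuine lookup failure */
	return (0);			/* absent: safe to add */
}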
*/ + err = EEXIST; + if (err != ENOENT) + goto out; + + /* First do a bulk query of the attributes that aren't cached */ + if (zp->z_is_sa) { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + } else { + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, + &gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, + &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, + &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, + &xattr, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, + &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, + &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &znode_acl, 88); + } + err = sa_bulk_lookup_locked(hdl, bulk, count); + if (err != 0) + goto out; + + err = sa_lookup_locked(hdl, SA_ZPL_XATTR(zfsvfs), &xattr, 8); + if (err != 0 && err != ENOENT) + goto out; + + zp->z_projid = projid; + zp->z_pflags |= ZFS_PROJID; + links = ZTONLNK(zp); + count = 0; + err = 0; + + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GEN(zfsvfs), NULL, &gen, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, &links, 8); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, &projid, 8); + + if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + + if (zp->z_acl_cached != NULL) { + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &zp->z_acl_cached->z_acl_count, 8); + if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) + zfs_acl_xform(zp, zp->z_acl_cached, CRED()); + locate.cb_aclp = zp->z_acl_cached; + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_DACL_ACES(zfsvfs), 
+ zfs_acl_data_locator, &locate, + zp->z_acl_cached->z_acl_bytes); + } + + if (xattr) + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_XATTR(zfsvfs), NULL, + &xattr, 8); + + if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + scanstamp, AV_SCANSTAMP_SZ); + SA_ADD_BULK_ATTR(attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), NULL, + scanstamp, AV_SCANSTAMP_SZ); + zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; + } + + VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); + VERIFY(sa_replace_all_by_template_locked(hdl, attrs, count, tx) == 0); + if (znode_acl.z_acl_extern_obj) { + VERIFY(0 == dmu_object_free(zfsvfs->z_os, + znode_acl.z_acl_extern_obj, tx)); + } + + zp->z_is_sa = B_TRUE; + +out: + mutex_exit(&zp->z_lock); + mutex_exit(&hdl->sa_lock); + kmem_free(attrs, sizeof (sa_bulk_attr_t) * ZPL_END); + kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END); + return (err); +} +#endif + +static sa_idx_tab_t * +sa_find_idx_tab(objset_t *os, dmu_object_type_t bonustype, sa_hdr_phys_t *hdr) +{ + sa_idx_tab_t *idx_tab; + sa_os_t *sa = os->os_sa; + sa_lot_t *tb, search; + avl_index_t loc; + + /* + * Determine the layout number. If this is an SA node and the header + * is 0, force the index table to the dummy "1" empty layout. + * + * The layout number would only be zero for a newly created file + * that has not added any attributes yet, or with crypto enabled, + * which doesn't write any attributes to the bonus buffer. + */ + + search.lot_num = SA_LAYOUT_NUM(hdr, bonustype); + + tb = avl_find(&sa->sa_layout_num_tree, &search, &loc); + + /* Verify header size is consistent with layout information */ + ASSERT(tb); + ASSERT((IS_SA_BONUSTYPE(bonustype) && + SA_HDR_SIZE_MATCH_LAYOUT(hdr, tb)) || !IS_SA_BONUSTYPE(bonustype) || + (IS_SA_BONUSTYPE(bonustype) && hdr->sa_layout_info == 0)); + + /* + * See if any of the already existing TOC entries can be reused. 
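Since all entries on a layout's list share the same attribute ordering, the reuse test below only has to compare the variable-length arrays. A compact equivalent, with a hypothetical ex_ name.

#include <stdint.h>
#include <string.h>

/* a cached index table is reusable iff its variable lengths match */
static int
ex_idx_tab_matches(const uint16_t *hdr_lengths, const uint16_t *tab_lengths,
    int var_sizes)
{
	return (var_sizes == 0 || memcmp(hdr_lengths, tab_lengths,
	    (size_t)var_sizes * sizeof (uint16_t)) == 0);
}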
+ */ + + for (idx_tab = list_head(&tb->lot_idx_tab); idx_tab; + idx_tab = list_next(&tb->lot_idx_tab, idx_tab)) { + boolean_t valid_idx = B_TRUE; + int i; + + if (tb->lot_var_sizes != 0 && + idx_tab->sa_variable_lengths != NULL) { + for (i = 0; i != tb->lot_var_sizes; i++) { + if (hdr->sa_lengths[i] != + idx_tab->sa_variable_lengths[i]) { + valid_idx = B_FALSE; + break; + } + } + } + if (valid_idx) { + sa_idx_tab_hold(os, idx_tab); + return (idx_tab); + } + } + + /* No such luck, create a new entry */ + idx_tab = kmem_zalloc(sizeof (sa_idx_tab_t), KM_SLEEP); + idx_tab->sa_idx_tab = + kmem_zalloc(sizeof (uint32_t) * sa->sa_num_attrs, KM_SLEEP); + idx_tab->sa_layout = tb; + zfs_refcount_create(&idx_tab->sa_refcount); + if (tb->lot_var_sizes) + idx_tab->sa_variable_lengths = kmem_alloc(sizeof (uint16_t) * + tb->lot_var_sizes, KM_SLEEP); + + sa_attr_iter(os, hdr, bonustype, sa_build_idx_tab, + tb, idx_tab); + sa_idx_tab_hold(os, idx_tab); /* one hold for consumer */ + sa_idx_tab_hold(os, idx_tab); /* one for layout */ + list_insert_tail(&tb->lot_idx_tab, idx_tab); + return (idx_tab); +} + +void +sa_default_locator(void **dataptr, uint32_t *len, uint32_t total_len, + boolean_t start, void *userdata) +{ + ASSERT(start); + + *dataptr = userdata; + *len = total_len; +} + +static void +sa_attr_register_sync(sa_handle_t *hdl, dmu_tx_t *tx) +{ + uint64_t attr_value = 0; + sa_os_t *sa = hdl->sa_os->os_sa; + sa_attr_table_t *tb = sa->sa_attr_table; + int i; + + mutex_enter(&sa->sa_lock); + + if (!sa->sa_need_attr_registration || sa->sa_master_obj == 0) { + mutex_exit(&sa->sa_lock); + return; + } + + if (sa->sa_reg_attr_obj == 0) { + sa->sa_reg_attr_obj = zap_create_link(hdl->sa_os, + DMU_OT_SA_ATTR_REGISTRATION, + sa->sa_master_obj, SA_REGISTRY, tx); + } + for (i = 0; i != sa->sa_num_attrs; i++) { + if (sa->sa_attr_table[i].sa_registered) + continue; + ATTR_ENCODE(attr_value, tb[i].sa_attr, tb[i].sa_length, + tb[i].sa_byteswap); + VERIFY(0 == zap_update(hdl->sa_os, sa->sa_reg_attr_obj, + tb[i].sa_name, 8, 1, &attr_value, tx)); + tb[i].sa_registered = B_TRUE; + } + sa->sa_need_attr_registration = B_FALSE; + mutex_exit(&sa->sa_lock); +} + +/* + * Replace all attributes with the attributes specified in the template. + * If the dnode had a spill buffer then those attributes will also be + * replaced, possibly with just an empty spill block. + * + * This interface is intended to be used only for bulk adding of + * attributes for a new file. It will also be used by the ZPL + * when converting an old-format znode to native SA support. + */ +int +sa_replace_all_by_template_locked(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + return (sa_build_layouts(hdl, attr_desc, attr_count, tx)); +} + +int +sa_replace_all_by_template(sa_handle_t *hdl, sa_bulk_attr_t *attr_desc, + int attr_count, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_replace_all_by_template_locked(hdl, attr_desc, + attr_count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Add/remove a single attribute, or replace a variable-sized attribute value + * with a value of a different size, and then rewrite the entire set + * of attributes. + * Same-length attribute value replacement (including fixed-length attributes) + * is handled more efficiently by the upper layers. 
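The first thing the rewrite path has to do is size the rebuilt descriptor from the requested action. A sketch of just that arithmetic, with hypothetical ex_ names.

typedef enum { EX_SA_ADD, EX_SA_REMOVE, EX_SA_REPLACE } ex_sa_op_t;

static int
ex_new_attr_count(int bonus_attrs, int spill_attrs, ex_sa_op_t op)
{
	int n = bonus_attrs + spill_attrs;

	if (op == EX_SA_ADD)
		n++;
	else if (op == EX_SA_REMOVE)
		n--;
	return (n);	/* EX_SA_REPLACE keeps the count unchanged */
}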
+ */ +static int +sa_modify_attrs(sa_handle_t *hdl, sa_attr_type_t newattr, + sa_data_op_t action, sa_data_locator_t *locator, void *datastart, + uint16_t buflen, dmu_tx_t *tx) +{ + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)hdl->sa_bonus; + dnode_t *dn; + sa_bulk_attr_t *attr_desc; + void *old_data[2]; + int bonus_attr_count = 0; + int bonus_data_size = 0; + int spill_data_size = 0; + int spill_attr_count = 0; + int error; + uint16_t length, reg_length; + int i, j, k, length_idx; + sa_hdr_phys_t *hdr; + sa_idx_tab_t *idx_tab; + int attr_count; + int count; + + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + /* First make a copy of the old data */ + + DB_DNODE_ENTER(db); + dn = DB_DNODE(db); + if (dn->dn_bonuslen != 0) { + bonus_data_size = hdl->sa_bonus->db_size; + old_data[0] = kmem_alloc(bonus_data_size, KM_SLEEP); + bcopy(hdl->sa_bonus->db_data, old_data[0], + hdl->sa_bonus->db_size); + bonus_attr_count = hdl->sa_bonus_tab->sa_layout->lot_attr_count; + } else { + old_data[0] = NULL; + } + DB_DNODE_EXIT(db); + + /* Bring spill buffer online if it isn't currently */ + + if ((error = sa_get_spill(hdl)) == 0) { + spill_data_size = hdl->sa_spill->db_size; + old_data[1] = vmem_alloc(spill_data_size, KM_SLEEP); + bcopy(hdl->sa_spill->db_data, old_data[1], + hdl->sa_spill->db_size); + spill_attr_count = + hdl->sa_spill_tab->sa_layout->lot_attr_count; + } else if (error && error != ENOENT) { + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + return (error); + } else { + old_data[1] = NULL; + } + + /* build descriptor of all attributes */ + + attr_count = bonus_attr_count + spill_attr_count; + if (action == SA_ADD) + attr_count++; + else if (action == SA_REMOVE) + attr_count--; + + attr_desc = kmem_zalloc(sizeof (sa_bulk_attr_t) * attr_count, KM_SLEEP); + + /* + * Loop through the bonus and, if it exists, the spill buffer, and + * build up a new attribute descriptor to reset the attributes. + */ + k = j = 0; + count = bonus_attr_count; + hdr = SA_GET_HDR(hdl, SA_BONUS); + idx_tab = SA_IDX_TAB_GET(hdl, SA_BONUS); + for (; k != 2; k++) { + /* + * Iterate over each attribute in layout. Fetch the + * size of variable-length attributes needing rewrite + * from sa_lengths[]. + */ + for (i = 0, length_idx = 0; i != count; i++) { + sa_attr_type_t attr; + + attr = idx_tab->sa_layout->lot_attrs[i]; + reg_length = SA_REGISTERED_LEN(sa, attr); + if (reg_length == 0) { + length = hdr->sa_lengths[length_idx]; + length_idx++; + } else { + length = reg_length; + } + if (attr == newattr) { + /* + * There is nothing to do for SA_REMOVE, + * so it is just skipped. + */ + if (action == SA_REMOVE) + continue; + + /* + * Duplicate attributes are not allowed, so the + * action cannot be SA_ADD here. + */ + ASSERT3S(action, ==, SA_REPLACE); + + /* + * Only a variable-sized attribute can be + * replaced here, and its size must be changing. 
+ */ + ASSERT3U(reg_length, ==, 0); + ASSERT3U(length, !=, buflen); + SA_ADD_BULK_ATTR(attr_desc, j, attr, + locator, datastart, buflen); + } else { + SA_ADD_BULK_ATTR(attr_desc, j, attr, + NULL, (void *) + (TOC_OFF(idx_tab->sa_idx_tab[attr]) + + (uintptr_t)old_data[k]), length); + } + } + if (k == 0 && hdl->sa_spill) { + hdr = SA_GET_HDR(hdl, SA_SPILL); + idx_tab = SA_IDX_TAB_GET(hdl, SA_SPILL); + count = spill_attr_count; + } else { + break; + } + } + if (action == SA_ADD) { + reg_length = SA_REGISTERED_LEN(sa, newattr); + IMPLY(reg_length != 0, reg_length == buflen); + SA_ADD_BULK_ATTR(attr_desc, j, newattr, locator, + datastart, buflen); + } + ASSERT3U(j, ==, attr_count); + + error = sa_build_layouts(hdl, attr_desc, attr_count, tx); + + if (old_data[0]) + kmem_free(old_data[0], bonus_data_size); + if (old_data[1]) + vmem_free(old_data[1], spill_data_size); + kmem_free(attr_desc, sizeof (sa_bulk_attr_t) * attr_count); + + return (error); +} + +static int +sa_bulk_update_impl(sa_handle_t *hdl, sa_bulk_attr_t *bulk, int count, + dmu_tx_t *tx) +{ + int error; + sa_os_t *sa = hdl->sa_os->os_sa; + dmu_object_type_t bonustype; + dmu_buf_t *saved_spill; + + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + + bonustype = SA_BONUSTYPE_FROM_DB(SA_GET_DB(hdl, SA_BONUS)); + saved_spill = hdl->sa_spill; + + /* sync out registration table if necessary */ + if (sa->sa_need_attr_registration) + sa_attr_register_sync(hdl, tx); + + error = sa_attr_op(hdl, bulk, count, SA_UPDATE, tx); + if (error == 0 && !IS_SA_BONUSTYPE(bonustype) && sa->sa_update_cb) + sa->sa_update_cb(hdl, tx); + + /* + * If saved_spill is NULL and current sa_spill is not NULL that + * means we increased the refcount of the spill buffer through + * sa_get_spill() or dmu_spill_hold_by_dnode(). Therefore we + * must release the hold before calling dmu_tx_commit() to avoid + * making a copy of this buffer in dbuf_sync_leaf() due to the + * reference count now being greater than 1. 
+ */ + if (!saved_spill && hdl->sa_spill) { + if (hdl->sa_spill_tab) { + sa_idx_tab_rele(hdl->sa_os, hdl->sa_spill_tab); + hdl->sa_spill_tab = NULL; + } + + dmu_buf_rele(hdl->sa_spill, NULL); + hdl->sa_spill = NULL; + } + + return (error); +} + +/* + * update or add new attribute + */ +int +sa_update(sa_handle_t *hdl, sa_attr_type_t type, + void *buf, uint32_t buflen, dmu_tx_t *tx) +{ + int error; + sa_bulk_attr_t bulk; + + VERIFY3U(buflen, <=, SA_ATTR_MAX_LEN); + + bulk.sa_attr = type; + bulk.sa_data_func = NULL; + bulk.sa_length = buflen; + bulk.sa_data = buf; + + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, &bulk, 1, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +/* + * Return size of an attribute + */ + +int +sa_size(sa_handle_t *hdl, sa_attr_type_t attr, int *size) +{ + sa_bulk_attr_t bulk; + int error; + + bulk.sa_data = NULL; + bulk.sa_attr = attr; + bulk.sa_data_func = NULL; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + if ((error = sa_attr_op(hdl, &bulk, 1, SA_LOOKUP, NULL)) != 0) { + mutex_exit(&hdl->sa_lock); + return (error); + } + *size = bulk.sa_size; + + mutex_exit(&hdl->sa_lock); + return (0); +} + +int +sa_bulk_lookup_locked(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + ASSERT(hdl); + ASSERT(MUTEX_HELD(&hdl->sa_lock)); + return (sa_lookup_impl(hdl, attrs, count)); +} + +int +sa_bulk_lookup(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_lookup_locked(hdl, attrs, count); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_bulk_update(sa_handle_t *hdl, sa_bulk_attr_t *attrs, int count, dmu_tx_t *tx) +{ + int error; + + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); + error = sa_bulk_update_impl(hdl, attrs, count, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +int +sa_remove(sa_handle_t *hdl, sa_attr_type_t attr, dmu_tx_t *tx) +{ + int error; + + mutex_enter(&hdl->sa_lock); + error = sa_modify_attrs(hdl, attr, SA_REMOVE, NULL, + NULL, 0, tx); + mutex_exit(&hdl->sa_lock); + return (error); +} + +void +sa_object_info(sa_handle_t *hdl, dmu_object_info_t *doi) +{ + dmu_object_info_from_db(hdl->sa_bonus, doi); +} + +void +sa_object_size(sa_handle_t *hdl, uint32_t *blksize, u_longlong_t *nblocks) +{ + dmu_object_size_from_db(hdl->sa_bonus, + blksize, nblocks); +} + +void +sa_set_userp(sa_handle_t *hdl, void *ptr) +{ + hdl->sa_userp = ptr; +} + +dmu_buf_t * +sa_get_db(sa_handle_t *hdl) +{ + return (hdl->sa_bonus); +} + +void * +sa_get_userdata(sa_handle_t *hdl) +{ + return (hdl->sa_userp); +} + +void +sa_register_update_callback_locked(objset_t *os, sa_update_cb_t *func) +{ + ASSERT(MUTEX_HELD(&os->os_sa->sa_lock)); + os->os_sa->sa_update_cb = func; +} + +void +sa_register_update_callback(objset_t *os, sa_update_cb_t *func) +{ + + mutex_enter(&os->os_sa->sa_lock); + sa_register_update_callback_locked(os, func); + mutex_exit(&os->os_sa->sa_lock); +} + +uint64_t +sa_handle_object(sa_handle_t *hdl) +{ + return (hdl->sa_bonus->db_object); +} + +boolean_t +sa_enabled(objset_t *os) +{ + return (os->os_sa == NULL); +} + +int +sa_set_sa_object(objset_t *os, uint64_t sa_object) +{ + sa_os_t *sa = os->os_sa; + + if (sa->sa_master_obj) + return (1); + + sa->sa_master_obj = sa_object; + + return (0); +} + +int +sa_hdrsize(void *arg) +{ + sa_hdr_phys_t *hdr = arg; + + return (SA_HDR_SIZE(hdr)); +} + +void +sa_handle_lock(sa_handle_t *hdl) +{ + ASSERT(hdl); + mutex_enter(&hdl->sa_lock); +} + +void +sa_handle_unlock(sa_handle_t *hdl) +{ + ASSERT(hdl); + 
mutex_exit(&hdl->sa_lock); +} + +#ifdef _KERNEL +EXPORT_SYMBOL(sa_handle_get); +EXPORT_SYMBOL(sa_handle_get_from_db); +EXPORT_SYMBOL(sa_handle_destroy); +EXPORT_SYMBOL(sa_buf_hold); +EXPORT_SYMBOL(sa_buf_rele); +EXPORT_SYMBOL(sa_spill_rele); +EXPORT_SYMBOL(sa_lookup); +EXPORT_SYMBOL(sa_update); +EXPORT_SYMBOL(sa_remove); +EXPORT_SYMBOL(sa_bulk_lookup); +EXPORT_SYMBOL(sa_bulk_lookup_locked); +EXPORT_SYMBOL(sa_bulk_update); +EXPORT_SYMBOL(sa_size); +EXPORT_SYMBOL(sa_object_info); +EXPORT_SYMBOL(sa_object_size); +EXPORT_SYMBOL(sa_get_userdata); +EXPORT_SYMBOL(sa_set_userp); +EXPORT_SYMBOL(sa_get_db); +EXPORT_SYMBOL(sa_handle_object); +EXPORT_SYMBOL(sa_register_update_callback); +EXPORT_SYMBOL(sa_setup); +EXPORT_SYMBOL(sa_replace_all_by_template); +EXPORT_SYMBOL(sa_replace_all_by_template_locked); +EXPORT_SYMBOL(sa_enabled); +EXPORT_SYMBOL(sa_cache_init); +EXPORT_SYMBOL(sa_cache_fini); +EXPORT_SYMBOL(sa_set_sa_object); +EXPORT_SYMBOL(sa_hdrsize); +EXPORT_SYMBOL(sa_handle_lock); +EXPORT_SYMBOL(sa_handle_unlock); +EXPORT_SYMBOL(sa_lookup_uio); +EXPORT_SYMBOL(sa_add_projid); +#endif /* _KERNEL */ diff --git a/module/zfs/sha256.c b/module/zfs/sha256.c new file mode 100644 index 000000000000..d297768eada5 --- /dev/null +++ b/module/zfs/sha256.c @@ -0,0 +1,105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/sha2.h> +#include <sys/abd.h> +#include <sys/qat.h> + +static int +sha_incremental(void *buf, size_t size, void *arg) +{ + SHA2_CTX *ctx = arg; + SHA2Update(ctx, buf, size); + return (0); +} + +/*ARGSUSED*/ +void +abd_checksum_SHA256(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + int ret; + SHA2_CTX ctx; + zio_cksum_t tmp; + + if (qat_checksum_use_accel(size)) { + uint8_t *buf = abd_borrow_buf_copy(abd, size); + ret = qat_checksum(ZIO_CHECKSUM_SHA256, buf, size, &tmp); + abd_return_buf(abd, buf, size); + if (ret == CPA_STATUS_SUCCESS) + goto bswap; + + /* If the hardware implementation fails, fall back to software */ + } + + SHA2Init(SHA256, &ctx); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); + SHA2Final(&tmp, &ctx); + +bswap: + /* + * A prior implementation of this function used a private SHA256 + * implementation that always wrote things out in big-endian byte + * order, and there was no byteswap variant of it. To preserve + * on-disk compatibility we need to force that behavior. 
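BE_64() stores the checksum words in big-endian order regardless of host endianness. A portable way to express the same conversion, shown for illustration only with a hypothetical ex_ name.

#include <stdint.h>
#include <string.h>

static uint64_t
ex_be64(uint64_t x)
{
	uint8_t b[8];
	uint64_t out;
	int i;

	/* write the value most-significant byte first ... */
	for (i = 0; i < 8; i++)
		b[i] = (uint8_t)(x >> (56 - 8 * i));
	/* ... then reinterpret the bytes in native order */
	memcpy(&out, b, sizeof (out));
	return (out);
}

On a little-endian host this swaps the bytes; on a big-endian host it is the identity, which is exactly the BE_64 contract the on-disk format relies on.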
+ */ + zcp->zc_word[0] = BE_64(tmp.zc_word[0]); + zcp->zc_word[1] = BE_64(tmp.zc_word[1]); + zcp->zc_word[2] = BE_64(tmp.zc_word[2]); + zcp->zc_word[3] = BE_64(tmp.zc_word[3]); +} + +/*ARGSUSED*/ +void +abd_checksum_SHA512_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + SHA2_CTX ctx; + + SHA2Init(SHA512_256, &ctx); + (void) abd_iterate_func(abd, 0, size, sha_incremental, &ctx); + SHA2Final(zcp, &ctx); +} + +/*ARGSUSED*/ +void +abd_checksum_SHA512_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + abd_checksum_SHA512_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} diff --git a/module/zfs/skein_zfs.c b/module/zfs/skein_zfs.c new file mode 100644 index 000000000000..11b9940e027e --- /dev/null +++ b/module/zfs/skein_zfs.c @@ -0,0 +1,102 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + */ +#include <sys/zfs_context.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/abd.h> + +#include <sys/skein.h> + +static int +skein_incremental(void *buf, size_t size, void *arg) +{ + Skein_512_Ctxt_t *ctx = arg; + (void) Skein_512_Update(ctx, buf, size); + return (0); +} +/* + * Computes a native 256-bit skein MAC checksum. Please note that this + * function requires the presence of a ctx_template that should be allocated + * using abd_checksum_skein_tmpl_init. + */ +/*ARGSUSED*/ +void +abd_checksum_skein_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + Skein_512_Ctxt_t ctx; + + ASSERT(ctx_template != NULL); + bcopy(ctx_template, &ctx, sizeof (ctx)); + (void) abd_iterate_func(abd, 0, size, skein_incremental, &ctx); + (void) Skein_512_Final(&ctx, (uint8_t *)zcp); + bzero(&ctx, sizeof (ctx)); +} + +/* + * Byteswapped version of abd_checksum_skein_native. This just invokes + * the native checksum function and byteswaps the resulting checksum (since + * skein is internally endian-insensitive). + */ +void +abd_checksum_skein_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + zio_cksum_t tmp; + + abd_checksum_skein_native(abd, size, ctx_template, &tmp); + zcp->zc_word[0] = BSWAP_64(tmp.zc_word[0]); + zcp->zc_word[1] = BSWAP_64(tmp.zc_word[1]); + zcp->zc_word[2] = BSWAP_64(tmp.zc_word[2]); + zcp->zc_word[3] = BSWAP_64(tmp.zc_word[3]); +} + +/* + * Allocates a skein MAC template suitable for use in skein MAC checksum + * computations and returns a pointer to it. 
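The template trick is generic: pay for the salted initialization once, then copy the precomputed context per checksum. A self-contained sketch of the pattern, with hypothetical ex_ names and a trivial stub standing in for the real Skein keying step.

#include <stdlib.h>
#include <string.h>

typedef struct ex_mac_ctx { unsigned char state[64]; } ex_mac_ctx_t;

/* stand-in for an expensive keyed initialization */
static void
ex_mac_init_keyed(ex_mac_ctx_t *ctx, const unsigned char *key, size_t keylen)
{
	size_t i;

	memset(ctx->state, 0, sizeof (ctx->state));
	for (i = 0; i < keylen; i++)
		ctx->state[i % sizeof (ctx->state)] ^= key[i];
}

static ex_mac_ctx_t *
ex_tmpl_init(const unsigned char *salt, size_t saltlen)
{
	ex_mac_ctx_t *t = calloc(1, sizeof (*t));

	ex_mac_init_keyed(t, salt, saltlen);	/* paid once */
	return (t);
}

static void
ex_mac_begin(ex_mac_ctx_t *dst, const ex_mac_ctx_t *tmpl)
{
	memcpy(dst, tmpl, sizeof (*dst));	/* cheap per checksum */
}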
+ */ +void * +abd_checksum_skein_tmpl_init(const zio_cksum_salt_t *salt) +{ + Skein_512_Ctxt_t *ctx; + + ctx = kmem_zalloc(sizeof (*ctx), KM_SLEEP); + (void) Skein_512_InitExt(ctx, sizeof (zio_cksum_t) * 8, 0, + salt->zcs_bytes, sizeof (salt->zcs_bytes)); + return (ctx); +} + +/* + * Frees a skein context template previously allocated using + * zio_checksum_skein_tmpl_init. + */ +void +abd_checksum_skein_tmpl_free(void *ctx_template) +{ + Skein_512_Ctxt_t *ctx = ctx_template; + + bzero(ctx, sizeof (*ctx)); + kmem_free(ctx, sizeof (*ctx)); +} diff --git a/module/zfs/spa.c b/module/zfs/spa.c new file mode 100644 index 000000000000..1e3728d93cee --- /dev/null +++ b/module/zfs/spa.c @@ -0,0 +1,9754 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2018, Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright 2018 Joyent, Inc. + * Copyright (c) 2017, 2019, Datto Inc. All rights reserved. + * Copyright 2017 Joyent, Inc. + * Copyright (c) 2017, Intel Corporation. + */ + +/* + * SPA: Storage Pool Allocator + * + * This file contains all the routines used when modifying on-disk SPA state. + * This includes opening, importing, destroying, exporting a pool, and syncing a + * pool. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#endif /* _KERNEL */ + +#include "zfs_prop.h" +#include "zfs_comutil.h" + +/* + * The interval, in seconds, at which failed configuration cache file writes + * should be retried. 
+ */ +int zfs_ccw_retry_interval = 300; + +typedef enum zti_modes { + ZTI_MODE_FIXED, /* value is # of threads (min 1) */ + ZTI_MODE_BATCH, /* cpu-intensive; value is ignored */ + ZTI_MODE_NULL, /* don't create a taskq */ + ZTI_NMODES +} zti_modes_t; + +#define ZTI_P(n, q) { ZTI_MODE_FIXED, (n), (q) } +#define ZTI_PCT(n) { ZTI_MODE_ONLINE_PERCENT, (n), 1 } +#define ZTI_BATCH { ZTI_MODE_BATCH, 0, 1 } +#define ZTI_NULL { ZTI_MODE_NULL, 0, 0 } + +#define ZTI_N(n) ZTI_P(n, 1) +#define ZTI_ONE ZTI_N(1) + +typedef struct zio_taskq_info { + zti_modes_t zti_mode; + uint_t zti_value; + uint_t zti_count; +} zio_taskq_info_t; + +static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { + "iss", "iss_h", "int", "int_h" +}; + +/* + * This table defines the taskq settings for each ZFS I/O type. When + * initializing a pool, we use this table to create an appropriately sized + * taskq. Some operations are low volume and therefore have a small, static + * number of threads assigned to their taskqs using the ZTI_N(#) or ZTI_ONE + * macros. Other operations process a large amount of data; the ZTI_BATCH + * macro causes us to create a taskq oriented for throughput. Some operations + * are so high frequency and short-lived that the taskq itself can become a + * point of lock contention. The ZTI_P(#, #) macro indicates that we need an + * additional degree of parallelism specified by the number of threads per- + * taskq and the number of taskqs; when dispatching an event in this case, the + * particular taskq is chosen at random. + * + * The different taskq priorities are to handle the different contexts (issue + * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that + * need to be handled with minimum delay. + */ +const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ + { ZTI_N(8), ZTI_NULL, ZTI_P(12, 8), ZTI_NULL }, /* READ */ + { ZTI_BATCH, ZTI_N(5), ZTI_P(12, 8), ZTI_N(5) }, /* WRITE */ + { ZTI_P(12, 8), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* FREE */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* CLAIM */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* IOCTL */ + { ZTI_N(4), ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* TRIM */ +}; + +static void spa_sync_version(void *arg, dmu_tx_t *tx); +static void spa_sync_props(void *arg, dmu_tx_t *tx); +static boolean_t spa_has_active_shared_spare(spa_t *spa); +static int spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport); +static void spa_vdev_resilver_done(spa_t *spa); + +uint_t zio_taskq_batch_pct = 75; /* 1 thread per cpu in pset */ +boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +uint_t zio_taskq_basedc = 80; /* base duty cycle */ + +boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ + +/* + * Report any spa_load_verify errors found, but do not fail spa_load. + * This is used by zdb to analyze non-idle pools. + */ +boolean_t spa_load_verify_dryrun = B_FALSE; + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. + */ +#define TRYIMPORT_NAME "$import" + +/* + * For debugging purposes: print out vdev tree during pool import. + */ +int spa_load_print_vdev_tree = B_FALSE; + +/* + * A non-zero value for zfs_max_missing_tvds means that we allow importing + * pools with missing top-level vdevs. This is strictly intended for advanced + * pool recovery cases since missing data is almost inevitable. 
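For ZTI_P(n, q) entries, dispatch picks one of the q taskqs for the I/O type, as the table comment above describes. A minimal sketch of that selection, with hypothetical ex_ names; rand() is used here only for illustration, not as a claim about the kernel's actual source of randomness.

#include <stdlib.h>

typedef struct ex_taskq { int tq_id; } ex_taskq_t;

/* pick one of the per-type taskqs, spreading load across all of them */
static ex_taskq_t *
ex_taskq_select(ex_taskq_t *tqs, unsigned count)
{
	if (count == 1)
		return (&tqs[0]);
	return (&tqs[(unsigned)rand() % count]);
}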
Pools with + * missing devices can only be imported read-only for safety reasons, and their + * fail-mode will be automatically set to "continue". + * + * With 1 missing vdev we should be able to import the pool and mount all + * datasets. User data that was not modified after the missing device has been + * added should be recoverable. This means that snapshots created prior to the + * addition of that device should be completely intact. + * + * With 2 missing vdevs, some datasets may fail to mount since there are + * dataset statistics that are stored as regular metadata. Some data might be + * recoverable if those vdevs were added recently. + * + * With 3 or more missing vdevs, the pool is severely damaged and MOS entries + * may be missing entirely. Chances of data recovery are very low. Note that + * there are also risks of performing an inadvertent rewind as we might be + * missing all the vdevs with the latest uberblocks. + */ +unsigned long zfs_max_missing_tvds = 0; + +/* + * The parameters below are similar to zfs_max_missing_tvds but are only + * intended for a preliminary open of the pool with an untrusted config which + * might be incomplete or out-dated. + * + * We are more tolerant for pools opened from a cachefile since we could have + * an out-dated cachefile where a device removal was not registered. + * We could have set the limit arbitrarily high but in the case where devices + * are really missing we would want to return the proper error codes; we chose + * SPA_DVAS_PER_BP - 1 so that some copies of the MOS would still be available + * and we get a chance to retrieve the trusted config. + */ +uint64_t zfs_max_missing_tvds_cachefile = SPA_DVAS_PER_BP - 1; + +/* + * In the case where config was assembled by scanning device paths (/dev/dsks + * by default) we are less tolerant since all the existing devices should have + * been detected and we want spa_load to return the right error codes. + */ +uint64_t zfs_max_missing_tvds_scan = 0; + +/* + * Debugging aid that pauses spa_sync() towards the end. + */ +boolean_t zfs_pause_spa_sync = B_FALSE; + +/* + * Variables to indicate the livelist condense zthr func should wait at certain + * points for the livelist to be removed - used to test condense/destroy races + */ +int zfs_livelist_condense_zthr_pause = 0; +int zfs_livelist_condense_sync_pause = 0; + +/* + * Variables to track whether or not condense cancellation has been + * triggered in testing. + */ +int zfs_livelist_condense_sync_cancel = 0; +int zfs_livelist_condense_zthr_cancel = 0; + +/* + * Variable to track whether or not extra ALLOC blkptrs were added to a + * livelist entry while it was being condensed (caused by the way we track + * remapped blkptrs in dbuf_remap_impl) + */ +int zfs_livelist_condense_new_alloc = 0; + +/* + * ========================================================================== + * SPA properties routines + * ========================================================================== + */ + +/* + * Add a (source=src, propname=propval) list to an nvlist. 
+ */ +static void +spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, + uint64_t intval, zprop_source_t src) +{ + const char *propname = zpool_prop_to_name(prop); + nvlist_t *propval; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(propval, ZPROP_SOURCE, src) == 0); + + if (strval != NULL) + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, strval) == 0); + else + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, intval) == 0); + + VERIFY(nvlist_add_nvlist(nvl, propname, propval) == 0); + nvlist_free(propval); +} + +/* + * Get property values from the spa configuration. + */ +static void +spa_prop_get_config(spa_t *spa, nvlist_t **nvp) +{ + vdev_t *rvd = spa->spa_root_vdev; + dsl_pool_t *pool = spa->spa_dsl_pool; + uint64_t size, alloc, cap, version; + const zprop_source_t src = ZPROP_SRC_NONE; + spa_config_dirent_t *dp; + metaslab_class_t *mc = spa_normal_class(spa); + + ASSERT(MUTEX_HELD(&spa->spa_props_lock)); + + if (rvd != NULL) { + alloc = metaslab_class_get_alloc(mc); + alloc += metaslab_class_get_alloc(spa_special_class(spa)); + alloc += metaslab_class_get_alloc(spa_dedup_class(spa)); + + size = metaslab_class_get_space(mc); + size += metaslab_class_get_space(spa_special_class(spa)); + size += metaslab_class_get_space(spa_dedup_class(spa)); + + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + size - alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_CHECKPOINT, NULL, + spa->spa_checkpoint_info.sci_dspace, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_FRAGMENTATION, NULL, + metaslab_class_fragmentation(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_EXPANDSZ, NULL, + metaslab_class_expandable_space(mc), src); + spa_prop_add_list(*nvp, ZPOOL_PROP_READONLY, NULL, + (spa_mode(spa) == SPA_MODE_READ), src); + + cap = (size == 0) ? 0 : (alloc * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + ddt_get_pool_dedup_ratio(spa), src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + rvd->vdev_state, src); + + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + version, ZPROP_SRC_DEFAULT); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, + version, ZPROP_SRC_LOCAL); + } + spa_prop_add_list(*nvp, ZPOOL_PROP_LOAD_GUID, + NULL, spa_load_guid(spa), src); + } + + if (pool != NULL) { + /* + * The $FREE directory was introduced in SPA_VERSION_DEADLISTS, + * when opening pools before this version freedir will be NULL. 
+ */ + if (pool->dp_free_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, NULL, + dsl_dir_phys(pool->dp_free_dir)->dd_used_bytes, + src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_FREEING, + NULL, 0, src); + } + + if (pool->dp_leak_dir != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, NULL, + dsl_dir_phys(pool->dp_leak_dir)->dd_used_bytes, + src); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_LEAKED, + NULL, 0, src); + } + } + + spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); + + if (spa->spa_comment != NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_COMMENT, spa->spa_comment, + 0, ZPROP_SRC_LOCAL); + } + + if (spa->spa_root != NULL) + spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, + 0, ZPROP_SRC_LOCAL); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + MIN(zfs_max_recordsize, SPA_MAXBLOCKSIZE), ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXBLOCKSIZE, NULL, + SPA_OLD_MAXBLOCKSIZE, ZPROP_SRC_NONE); + } + + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MAX_SIZE, ZPROP_SRC_NONE); + } else { + spa_prop_add_list(*nvp, ZPOOL_PROP_MAXDNODESIZE, NULL, + DNODE_MIN_SIZE, ZPROP_SRC_NONE); + } + + if ((dp = list_head(&spa->spa_config_list)) != NULL) { + if (dp->scd_path == NULL) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + "none", 0, ZPROP_SRC_LOCAL); + } else if (strcmp(dp->scd_path, spa_config_path) != 0) { + spa_prop_add_list(*nvp, ZPOOL_PROP_CACHEFILE, + dp->scd_path, 0, ZPROP_SRC_LOCAL); + } + } +} + +/* + * Get zpool property values. + */ +int +spa_prop_get(spa_t *spa, nvlist_t **nvp) +{ + objset_t *mos = spa->spa_meta_objset; + zap_cursor_t zc; + zap_attribute_t za; + dsl_pool_t *dp; + int err; + + err = nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP); + if (err) + return (err); + + dp = spa_get_dsl(spa); + dsl_pool_config_enter(dp, FTAG); + mutex_enter(&spa->spa_props_lock); + + /* + * Get properties from the spa config. + */ + spa_prop_get_config(spa, nvp); + + /* If no pool property object, no more prop to get. */ + if (mos == NULL || spa->spa_pool_props_object == 0) + goto out; + + /* + * Get properties from the MOS pool property object. 
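In the loop below, za_integer_length distinguishes the two on-disk encodings: 8 means an array of uint64s (an integer property), 1 means a byte string. A sketch of that dispatch, with hypothetical ex_ names.

#include <stdint.h>

typedef struct ex_zap_attr {
	uint64_t za_num_integers;
	int za_integer_length;	/* 8: uint64 array, 1: byte string */
} ex_zap_attr_t;

static const char *
ex_zap_attr_kind(const ex_zap_attr_t *za)
{
	switch (za->za_integer_length) {
	case 8:
		return ("integer property");
	case 1:
		return ("string property");
	default:
		return ("unhandled");
	}
}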
+ */ + for (zap_cursor_init(&zc, mos, spa->spa_pool_props_object); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t intval = 0; + char *strval = NULL; + zprop_source_t src = ZPROP_SRC_DEFAULT; + zpool_prop_t prop; + + if ((prop = zpool_name_to_prop(za.za_name)) == ZPOOL_PROP_INVAL) + continue; + + switch (za.za_integer_length) { + case 8: + /* integer property */ + if (za.za_first_integer != + zpool_prop_default_numeric(prop)) + src = ZPROP_SRC_LOCAL; + + if (prop == ZPOOL_PROP_BOOTFS) { + dsl_dataset_t *ds = NULL; + + err = dsl_dataset_hold_obj(dp, + za.za_first_integer, FTAG, &ds); + if (err != 0) + break; + + strval = kmem_alloc(ZFS_MAX_DATASET_NAME_LEN, + KM_SLEEP); + dsl_dataset_name(ds, strval); + dsl_dataset_rele(ds, FTAG); + } else { + strval = NULL; + intval = za.za_first_integer; + } + + spa_prop_add_list(*nvp, prop, strval, intval, src); + + if (strval != NULL) + kmem_free(strval, ZFS_MAX_DATASET_NAME_LEN); + + break; + + case 1: + /* string property */ + strval = kmem_alloc(za.za_num_integers, KM_SLEEP); + err = zap_lookup(mos, spa->spa_pool_props_object, + za.za_name, 1, za.za_num_integers, strval); + if (err) { + kmem_free(strval, za.za_num_integers); + break; + } + spa_prop_add_list(*nvp, prop, strval, 0, src); + kmem_free(strval, za.za_num_integers); + break; + + default: + break; + } + } + zap_cursor_fini(&zc); +out: + mutex_exit(&spa->spa_props_lock); + dsl_pool_config_exit(dp, FTAG); + if (err && err != ENOENT) { + nvlist_free(*nvp); + *nvp = NULL; + return (err); + } + + return (0); +} + +/* + * Validate the given pool properties nvlist and modify the list + * for the property values to be set. + */ +static int +spa_prop_validate(spa_t *spa, nvlist_t *props) +{ + nvpair_t *elem; + int error = 0, reset_bootfs = 0; + uint64_t objnum = 0; + boolean_t has_feature = B_FALSE; + + elem = NULL; + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + uint64_t intval; + char *strval, *slash, *check, *fname; + const char *propname = nvpair_name(elem); + zpool_prop_t prop = zpool_name_to_prop(propname); + + switch (prop) { + case ZPOOL_PROP_INVAL: + if (!zpool_prop_feature(propname)) { + error = SET_ERROR(EINVAL); + break; + } + + /* + * Sanitize the input. 
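+ * A well-formed feature property is a DATA_TYPE_UINT64 nvpair named + * "feature@<fname>" with a value of 0 (i.e. "enabled"), where <fname> + * must be known to zfeature_lookup_name(); for example, + * "feature@async_destroy" with value 0. Anything else is rejected + * with EINVAL below.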
+ */ + if (nvpair_type(elem) != DATA_TYPE_UINT64) { + error = SET_ERROR(EINVAL); + break; + } + + if (nvpair_value_uint64(elem, &intval) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + if (intval != 0) { + error = SET_ERROR(EINVAL); + break; + } + + fname = strchr(propname, '@') + 1; + if (zfeature_lookup_name(fname, NULL) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + has_feature = B_TRUE; + break; + + case ZPOOL_PROP_VERSION: + error = nvpair_value_uint64(elem, &intval); + if (!error && + (intval < spa_version(spa) || + intval > SPA_VERSION_BEFORE_FEATURES || + has_feature)) + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_DELEGATION: + case ZPOOL_PROP_AUTOREPLACE: + case ZPOOL_PROP_LISTSNAPS: + case ZPOOL_PROP_AUTOEXPAND: + case ZPOOL_PROP_AUTOTRIM: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_MULTIHOST: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > 1) + error = SET_ERROR(EINVAL); + + if (!error) { + uint32_t hostid = zone_get_hostid(NULL); + if (hostid) + spa->spa_hostid = hostid; + else + error = SET_ERROR(ENOTSUP); + } + + break; + + case ZPOOL_PROP_BOOTFS: + /* + * If the pool version is less than SPA_VERSION_BOOTFS, + * or the pool is still being created (version == 0), + * the bootfs property cannot be set. + */ + if (spa_version(spa) < SPA_VERSION_BOOTFS) { + error = SET_ERROR(ENOTSUP); + break; + } + + /* + * Make sure the vdev config is bootable + */ + if (!vdev_is_bootable(spa->spa_root_vdev)) { + error = SET_ERROR(ENOTSUP); + break; + } + + reset_bootfs = 1; + + error = nvpair_value_string(elem, &strval); + + if (!error) { + objset_t *os; + + if (strval == NULL || strval[0] == '\0') { + objnum = zpool_prop_default_numeric( + ZPOOL_PROP_BOOTFS); + break; + } + + error = dmu_objset_hold(strval, FTAG, &os); + if (error != 0) + break; + + /* Must be ZPL. */ + if (dmu_objset_type(os) != DMU_OST_ZFS) { + error = SET_ERROR(ENOTSUP); + } else { + objnum = dmu_objset_id(os); + } + dmu_objset_rele(os, FTAG); + } + break; + + case ZPOOL_PROP_FAILUREMODE: + error = nvpair_value_uint64(elem, &intval); + if (!error && intval > ZIO_FAILURE_MODE_PANIC) + error = SET_ERROR(EINVAL); + + /* + * This is a special case which only occurs when + * the pool has completely failed. This allows + * the user to change the in-core failmode property + * without syncing it out to disk (I/Os might + * currently be blocked). We do this by returning + * EIO to the caller (spa_prop_set) to trick it + * into thinking we encountered a property validation + * error. 
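+ * spa_prop_set() treats any nonzero return from validation as fatal + * and returns before dispatching the sync task, so the only lasting + * effect is the in-core spa_failmode update performed here.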
+ */ + if (!error && spa_suspended(spa)) { + spa->spa_failmode = intval; + error = SET_ERROR(EIO); + } + break; + + case ZPOOL_PROP_CACHEFILE: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + + if (strval[0] == '\0') + break; + + if (strcmp(strval, "none") == 0) + break; + + if (strval[0] != '/') { + error = SET_ERROR(EINVAL); + break; + } + + slash = strrchr(strval, '/'); + ASSERT(slash != NULL); + + if (slash[1] == '\0' || strcmp(slash, "/.") == 0 || + strcmp(slash, "/..") == 0) + error = SET_ERROR(EINVAL); + break; + + case ZPOOL_PROP_COMMENT: + if ((error = nvpair_value_string(elem, &strval)) != 0) + break; + for (check = strval; *check != '\0'; check++) { + if (!isprint(*check)) { + error = SET_ERROR(EINVAL); + break; + } + } + if (strlen(strval) > ZPROP_MAX_COMMENT) + error = SET_ERROR(E2BIG); + break; + + default: + break; + } + + if (error) + break; + } + + (void) nvlist_remove_all(props, + zpool_prop_to_name(ZPOOL_PROP_DEDUPDITTO)); + + if (!error && reset_bootfs) { + error = nvlist_remove(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), DATA_TYPE_STRING); + + if (!error) { + error = nvlist_add_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), objnum); + } + } + + return (error); +} + +void +spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) +{ + char *cachefile; + spa_config_dirent_t *dp; + + if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &cachefile) != 0) + return; + + dp = kmem_alloc(sizeof (spa_config_dirent_t), + KM_SLEEP); + + if (cachefile[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(cachefile, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(cachefile); + + list_insert_head(&spa->spa_config_list, dp); + if (need_sync) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + +int +spa_prop_set(spa_t *spa, nvlist_t *nvp) +{ + int error; + nvpair_t *elem = NULL; + boolean_t need_sync = B_FALSE; + + if ((error = spa_prop_validate(spa, nvp)) != 0) + return (error); + + while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { + zpool_prop_t prop = zpool_name_to_prop(nvpair_name(elem)); + + if (prop == ZPOOL_PROP_CACHEFILE || + prop == ZPOOL_PROP_ALTROOT || + prop == ZPOOL_PROP_READONLY) + continue; + + if (prop == ZPOOL_PROP_VERSION || prop == ZPOOL_PROP_INVAL) { + uint64_t ver; + + if (prop == ZPOOL_PROP_VERSION) { + VERIFY(nvpair_value_uint64(elem, &ver) == 0); + } else { + ASSERT(zpool_prop_feature(nvpair_name(elem))); + ver = SPA_VERSION_FEATURES; + need_sync = B_TRUE; + } + + /* Save time if the version is already set. */ + if (ver == spa_version(spa)) + continue; + + /* + * In addition to the pool directory object, we might + * create the pool properties object, the features for + * read object, the features for write object, or the + * feature descriptions object. + */ + error = dsl_sync_task(spa->spa_name, NULL, + spa_sync_version, &ver, + 6, ZFS_SPACE_CHECK_RESERVED); + if (error) + return (error); + continue; + } + + need_sync = B_TRUE; + break; + } + + if (need_sync) { + return (dsl_sync_task(spa->spa_name, NULL, spa_sync_props, + nvp, 6, ZFS_SPACE_CHECK_RESERVED)); + } + + return (0); +} + +/* + * If the bootfs property value is dsobj, clear it. 
+ */ +void +spa_prop_clear_bootfs(spa_t *spa, uint64_t dsobj, dmu_tx_t *tx) +{ + if (spa->spa_bootfs == dsobj && spa->spa_pool_props_object != 0) { + VERIFY(zap_remove(spa->spa_meta_objset, + spa->spa_pool_props_object, + zpool_prop_to_name(ZPOOL_PROP_BOOTFS), tx) == 0); + spa->spa_bootfs = 0; + } +} + +/*ARGSUSED*/ +static int +spa_change_guid_check(void *arg, dmu_tx_t *tx) +{ + uint64_t *newguid __maybe_unused = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t vdev_state; + + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + int error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (SET_ERROR(error)); + } + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_state = rvd->vdev_state; + spa_config_exit(spa, SCL_STATE, FTAG); + + if (vdev_state != VDEV_STATE_HEALTHY) + return (SET_ERROR(ENXIO)); + + ASSERT3U(spa_guid(spa), !=, *newguid); + + return (0); +} + +static void +spa_change_guid_sync(void *arg, dmu_tx_t *tx) +{ + uint64_t *newguid = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + uint64_t oldguid; + vdev_t *rvd = spa->spa_root_vdev; + + oldguid = spa_guid(spa); + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + rvd->vdev_guid = *newguid; + rvd->vdev_guid_sum += (*newguid - oldguid); + vdev_config_dirty(rvd); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa_history_log_internal(spa, "guid change", tx, "old=%llu new=%llu", + (u_longlong_t)oldguid, (u_longlong_t)*newguid); +} + +/* + * Change the GUID for the pool. This is done so that we can later + * re-import a pool built from a clone of our own vdevs. We will modify + * the root vdev's guid, our own pool guid, and then mark all of our + * vdevs dirty. Note that we must make sure that all our vdevs are + * online when we do this, or else any vdevs that weren't present + * would be orphaned from our pool. We are also going to issue a + * sysevent to update any watchers. + */ +int +spa_change_guid(spa_t *spa) +{ + int error; + uint64_t guid; + + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + guid = spa_generate_guid(NULL); + + error = dsl_sync_task(spa->spa_name, spa_change_guid_check, + spa_change_guid_sync, &guid, 5, ZFS_SPACE_CHECK_RESERVED); + + if (error == 0) { + spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_REGUID); + } + + mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + +/* + * ========================================================================== + * SPA state manipulation (open/create/destroy/import/export) + * ========================================================================== + */ + +static int +spa_error_entry_compare(const void *a, const void *b) +{ + const spa_error_entry_t *sa = (const spa_error_entry_t *)a; + const spa_error_entry_t *sb = (const spa_error_entry_t *)b; + int ret; + + ret = memcmp(&sa->se_bookmark, &sb->se_bookmark, + sizeof (zbookmark_phys_t)); + + return (TREE_ISIGN(ret)); +} + +/* + * Utility function which retrieves copies of the current logs and + * re-initializes them in the process. 
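+ * The caller takes ownership of the trees copied into 'last' and + * 'scrub' and is responsible for draining them; fresh, empty AVL + * trees are created in their place while spa_errlist_lock is held.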
+ */ +void +spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) +{ + ASSERT(MUTEX_HELD(&spa->spa_errlist_lock)); + + bcopy(&spa->spa_errlist_last, last, sizeof (avl_tree_t)); + bcopy(&spa->spa_errlist_scrub, scrub, sizeof (avl_tree_t)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); +} + +static void +spa_taskqs_init(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + uint_t count = ztip->zti_count; + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + uint_t flags = 0; + boolean_t batch = B_FALSE; + + if (mode == ZTI_MODE_NULL) { + tqs->stqs_count = 0; + tqs->stqs_taskq = NULL; + return; + } + + ASSERT3U(count, >, 0); + + tqs->stqs_count = count; + tqs->stqs_taskq = kmem_alloc(count * sizeof (taskq_t *), KM_SLEEP); + + switch (mode) { + case ZTI_MODE_FIXED: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + flags |= TASKQ_DYNAMIC; + break; + + case ZTI_MODE_BATCH: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = MIN(zio_taskq_batch_pct, 100); + break; + + default: + panic("unrecognized mode for %s_%s taskq (%u:%u) in " + "spa_activate()", + zio_type_name[t], zio_taskq_types[q], mode, value); + break; + } + + for (uint_t i = 0; i < count; i++) { + taskq_t *tq; + char name[32]; + + (void) snprintf(name, sizeof (name), "%s_%s", + zio_type_name[t], zio_taskq_types[q]); + + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + tq = taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags); + } else { + pri_t pri = maxclsyspri; + /* + * The write issue taskq can be extremely CPU + * intensive. Run it at slightly less important + * priority than the other taskqs. Under Linux this + * means incrementing the priority value; on platforms + * like illumos it should be decremented. + */ + if (t == ZIO_TYPE_WRITE && q == ZIO_TASKQ_ISSUE) + pri++; + + tq = taskq_create_proc(name, value, pri, 50, + INT_MAX, spa->spa_proc, flags); + } + + tqs->stqs_taskq[i] = tq; + } +} + +static void +spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + + if (tqs->stqs_taskq == NULL) { + ASSERT3U(tqs->stqs_count, ==, 0); + return; + } + + for (uint_t i = 0; i < tqs->stqs_count; i++) { + ASSERT3P(tqs->stqs_taskq[i], !=, NULL); + taskq_destroy(tqs->stqs_taskq[i]); + } + + kmem_free(tqs->stqs_taskq, tqs->stqs_count * sizeof (taskq_t *)); + tqs->stqs_taskq = NULL; +} + +/* + * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. + * Note that a type may have multiple discrete taskqs to avoid lock contention + * on the taskq itself. In that case we choose a taskq at random by using + * the low bits of gethrtime().
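+ * For example, with stqs_count == 8 the selection in + * spa_taskq_dispatch_ent() below reduces to + * tqs->stqs_taskq[gethrtime() % 8], which tends to spread dispatches + * across all eight taskqs.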
+ */ +void +spa_taskq_dispatch_ent(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags, taskq_ent_t *ent) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + taskq_t *tq; + + ASSERT3P(tqs->stqs_taskq, !=, NULL); + ASSERT3U(tqs->stqs_count, !=, 0); + + if (tqs->stqs_count == 1) { + tq = tqs->stqs_taskq[0]; + } else { + tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; + } + + taskq_dispatch_ent(tq, func, arg, flags, ent); +} + +/* + * Same as spa_taskq_dispatch_ent() but block on the task until completion. + */ +void +spa_taskq_dispatch_sync(spa_t *spa, zio_type_t t, zio_taskq_type_t q, + task_func_t *func, void *arg, uint_t flags) +{ + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + taskq_t *tq; + taskqid_t id; + + ASSERT3P(tqs->stqs_taskq, !=, NULL); + ASSERT3U(tqs->stqs_count, !=, 0); + + if (tqs->stqs_count == 1) { + tq = tqs->stqs_taskq[0]; + } else { + tq = tqs->stqs_taskq[((uint64_t)gethrtime()) % tqs->stqs_count]; + } + + id = taskq_dispatch(tq, func, arg, flags); + if (id) + taskq_wait_id(tq, id); +} + +static void +spa_create_zio_taskqs(spa_t *spa) +{ + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa_taskqs_init(spa, t, q); + } + } +} + +/* + * Disabled until spa_thread() can be adapted for Linux. + */ +#undef HAVE_SPA_THREAD + +#if defined(_KERNEL) && defined(HAVE_SPA_THREAD) +static void +spa_thread(void *arg) +{ + psetid_t zio_taskq_psrset_bind = PS_NONE; + callb_cpr_t cprinfo; + + spa_t *spa = arg; + user_t *pu = PTOU(curproc); + + CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, + spa->spa_name); + + ASSERT(curproc != &p0); + (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), + "zpool-%s", spa->spa_name); + (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); + + /* bind this thread to the requested psrset */ + if (zio_taskq_psrset_bind != PS_NONE) { + pool_lock(); + mutex_enter(&cpu_lock); + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, + 0, NULL, NULL) == 0) { + curthread->t_bind_pset = zio_taskq_psrset_bind; + } else { + cmn_err(CE_WARN, + "Couldn't bind process for zfs pool \"%s\" to " + "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); + } + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + mutex_exit(&cpu_lock); + pool_unlock(); + } + + if (zio_taskq_sysdc) { + sysdc_thread_enter(curthread, 100, 0); + } + + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; + + spa_create_zio_taskqs(spa); + + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); + + spa->spa_proc_state = SPA_PROC_ACTIVE; + cv_broadcast(&spa->spa_proc_cv); + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_proc_state == SPA_PROC_ACTIVE) + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); + + ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); + spa->spa_proc_state = SPA_PROC_GONE; + spa->spa_proc = &p0; + cv_broadcast(&spa->spa_proc_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ + + mutex_enter(&curproc->p_lock); + lwp_exit(); +} +#endif + +/* + * Activate an uninitialized pool. 
+ */ +static void +spa_activate(spa_t *spa, spa_mode_t mode) +{ + ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_mode = mode; + + spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_special_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_dedup_class = metaslab_class_create(spa, zfs_metaslab_ops); + + /* Try to create a covering process */ + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_NONE); + ASSERT(spa->spa_proc == &p0); + spa->spa_did = 0; + +#ifdef HAVE_SPA_THREAD + /* Only create a process if we're going to be around a while. */ + if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { + if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, + NULL, 0) == 0) { + spa->spa_proc_state = SPA_PROC_CREATED; + while (spa->spa_proc_state == SPA_PROC_CREATED) { + cv_wait(&spa->spa_proc_cv, + &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + ASSERT(spa->spa_proc != &p0); + ASSERT(spa->spa_did != 0); + } else { +#ifdef _KERNEL + cmn_err(CE_WARN, + "Couldn't create process for zfs pool \"%s\"\n", + spa->spa_name); +#endif + } + } +#endif /* HAVE_SPA_THREAD */ + mutex_exit(&spa->spa_proc_lock); + + /* If we didn't create a process, we need to create our taskqs. */ + if (spa->spa_proc == &p0) { + spa_create_zio_taskqs(spa); + } + + for (size_t i = 0; i < TXG_SIZE; i++) { + spa->spa_txg_zio[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + } + + list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_config_dirty_node)); + list_create(&spa->spa_evicting_os_list, sizeof (objset_t), + offsetof(objset_t, os_evicting_node)); + list_create(&spa->spa_state_dirty_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_state_dirty_node)); + + txg_list_create(&spa->spa_vdev_txg_list, spa, + offsetof(struct vdev, vdev_txg_node)); + + avl_create(&spa->spa_errlist_scrub, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + avl_create(&spa->spa_errlist_last, + spa_error_entry_compare, sizeof (spa_error_entry_t), + offsetof(spa_error_entry_t, se_avl)); + + spa_keystore_init(&spa->spa_keystore); + + /* + * This taskq is used to perform zvol-minor-related tasks + * asynchronously. This has several advantages, including easy + * resolution of various deadlocks (zfsonlinux bug #3681). + * + * The taskq must be single threaded to ensure tasks are always + * processed in the order in which they were dispatched. + * + * A taskq per pool allows one to keep the pools independent. + * This way if one pool is suspended, it will not impact another. + * + * The preferred location to dispatch a zvol minor task is a sync + * task. In this context, there is easy access to the spa_t and minimal + * error handling is required because the sync task must succeed. + */ + spa->spa_zvol_taskq = taskq_create("z_zvol", 1, defclsyspri, + 1, INT_MAX, 0); + + /* + * Taskq dedicated to prefetcher threads: this is used to prevent the + * pool traverse code from monopolizing the global (and limited) + * system_taskq by inappropriately scheduling long running tasks on it. + */ + spa->spa_prefetch_taskq = taskq_create("z_prefetch", boot_ncpus, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); + + /* + * The taskq to upgrade datasets in this pool. Currently used by + * feature SPA_FEATURE_USEROBJ_ACCOUNTING/SPA_FEATURE_PROJECT_QUOTA. 
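+ * Like the prefetch taskq above, it is created below as a + * TASKQ_DYNAMIC taskq with up to boot_ncpus threads, so upgrade + * threads are only spawned while there is queued work.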
+ */ + spa->spa_upgrade_taskq = taskq_create("z_upgrade", boot_ncpus, + defclsyspri, 1, INT_MAX, TASKQ_DYNAMIC); +} + +/* + * Opposite of spa_activate(). + */ +static void +spa_deactivate(spa_t *spa) +{ + ASSERT(spa->spa_sync_on == B_FALSE); + ASSERT(spa->spa_dsl_pool == NULL); + ASSERT(spa->spa_root_vdev == NULL); + ASSERT(spa->spa_async_zio_root == NULL); + ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); + + spa_evicting_os_wait(spa); + + if (spa->spa_zvol_taskq) { + taskq_destroy(spa->spa_zvol_taskq); + spa->spa_zvol_taskq = NULL; + } + + if (spa->spa_prefetch_taskq) { + taskq_destroy(spa->spa_prefetch_taskq); + spa->spa_prefetch_taskq = NULL; + } + + if (spa->spa_upgrade_taskq) { + taskq_destroy(spa->spa_upgrade_taskq); + spa->spa_upgrade_taskq = NULL; + } + + txg_list_destroy(&spa->spa_vdev_txg_list); + + list_destroy(&spa->spa_config_dirty_list); + list_destroy(&spa->spa_evicting_os_list); + list_destroy(&spa->spa_state_dirty_list); + + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + spa_taskqs_fini(spa, t, q); + } + } + + for (size_t i = 0; i < TXG_SIZE; i++) { + ASSERT3P(spa->spa_txg_zio[i], !=, NULL); + VERIFY0(zio_wait(spa->spa_txg_zio[i])); + spa->spa_txg_zio[i] = NULL; + } + + metaslab_class_destroy(spa->spa_normal_class); + spa->spa_normal_class = NULL; + + metaslab_class_destroy(spa->spa_log_class); + spa->spa_log_class = NULL; + + metaslab_class_destroy(spa->spa_special_class); + spa->spa_special_class = NULL; + + metaslab_class_destroy(spa->spa_dedup_class); + spa->spa_dedup_class = NULL; + + /* + * If this was part of an import or the open otherwise failed, we may + * still have errors left in the queues. Empty them just in case. + */ + spa_errlog_drain(spa); + avl_destroy(&spa->spa_errlist_scrub); + avl_destroy(&spa->spa_errlist_last); + + spa_keystore_fini(&spa->spa_keystore); + + spa->spa_state = POOL_STATE_UNINITIALIZED; + + mutex_enter(&spa->spa_proc_lock); + if (spa->spa_proc_state != SPA_PROC_NONE) { + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + spa->spa_proc_state = SPA_PROC_DEACTIVATE; + cv_broadcast(&spa->spa_proc_cv); + while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { + ASSERT(spa->spa_proc != &p0); + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_GONE); + spa->spa_proc_state = SPA_PROC_NONE; + } + ASSERT(spa->spa_proc == &p0); + mutex_exit(&spa->spa_proc_lock); + + /* + * We want to make sure spa_thread() has actually exited the ZFS + * module, so that the module can't be unloaded out from underneath + * it. + */ + if (spa->spa_did != 0) { + thread_join(spa->spa_did); + spa->spa_did = 0; + } +} + +/* + * Verify a pool configuration, and construct the vdev tree appropriately. This + * will create all the necessary vdevs in the appropriate layout, with each vdev + * in the CLOSED state. This will prep the pool before open/creation/import. + * All vdev validation is done by the vdev_alloc() routine. 
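+ * The parse is recursive: the caller passes the root nvlist with a + * NULL parent and id 0, and each entry of ZPOOL_CONFIG_CHILDREN is + * parsed with its positional index as the child id. A missing + * children array on a non-leaf vdev is not an error; ENOENT simply + * ends the descent.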
+ */ +int +spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, + uint_t id, int atype) +{ + nvlist_t **child; + uint_t children; + int error; + + if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) + return (error); + + if ((*vdp)->vdev_ops->vdev_op_leaf) + return (0); + + error = nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + if (error == ENOENT) + return (0); + + if (error) { + vdev_free(*vdp); + *vdp = NULL; + return (SET_ERROR(EINVAL)); + } + + for (int c = 0; c < children; c++) { + vdev_t *vd; + if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, + atype)) != 0) { + vdev_free(*vdp); + *vdp = NULL; + return (error); + } + } + + ASSERT(*vdp != NULL); + + return (0); +} + +static boolean_t +spa_should_flush_logs_on_unload(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) + return (B_FALSE); + + if (!spa_writeable(spa)) + return (B_FALSE); + + if (!spa->spa_sync_on) + return (B_FALSE); + + if (spa_state(spa) != POOL_STATE_EXPORTED) + return (B_FALSE); + + if (zfs_keep_log_spacemaps_at_export) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Opens a transaction that will set the flag that will instruct + * spa_sync to attempt to flush all the metaslabs for that txg. + */ +static void +spa_unload_log_sm_flush_all(spa_t *spa) +{ + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + ASSERT3U(spa->spa_log_flushall_txg, ==, 0); + spa->spa_log_flushall_txg = dmu_tx_get_txg(tx); + + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), spa->spa_log_flushall_txg); +} + +static void +spa_unload_log_sm_metadata(spa_t *spa) +{ + void *cookie = NULL; + spa_log_sm_t *sls; + while ((sls = avl_destroy_nodes(&spa->spa_sm_logs_by_txg, + &cookie)) != NULL) { + VERIFY0(sls->sls_mscount); + kmem_free(sls, sizeof (spa_log_sm_t)); + } + + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_head(&spa->spa_log_summary)) { + VERIFY0(e->lse_mscount); + list_remove(&spa->spa_log_summary, e); + kmem_free(e, sizeof (log_summary_entry_t)); + } + + spa->spa_unflushed_stats.sus_nblocks = 0; + spa->spa_unflushed_stats.sus_memused = 0; + spa->spa_unflushed_stats.sus_blocklimit = 0; +} + +static void +spa_destroy_aux_threads(spa_t *spa) +{ + if (spa->spa_condense_zthr != NULL) { + zthr_destroy(spa->spa_condense_zthr); + spa->spa_condense_zthr = NULL; + } + if (spa->spa_checkpoint_discard_zthr != NULL) { + zthr_destroy(spa->spa_checkpoint_discard_zthr); + spa->spa_checkpoint_discard_zthr = NULL; + } + if (spa->spa_livelist_delete_zthr != NULL) { + zthr_destroy(spa->spa_livelist_delete_zthr); + spa->spa_livelist_delete_zthr = NULL; + } + if (spa->spa_livelist_condense_zthr != NULL) { + zthr_destroy(spa->spa_livelist_condense_zthr); + spa->spa_livelist_condense_zthr = NULL; + } +} + +/* + * Opposite of spa_load(). + */ +static void +spa_unload(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_state(spa) != POOL_STATE_UNINITIALIZED); + + spa_import_progress_remove(spa_guid(spa)); + spa_load_note(spa, "UNLOADING"); + + spa_wake_waiters(spa); + + /* + * If the log space map feature is enabled and the pool is getting + * exported (but not destroyed), we want to spend some time flushing + * as many metaslabs as we can in an attempt to destroy log space + * maps and save import time. + */ + if (spa_should_flush_logs_on_unload(spa)) + spa_unload_log_sm_flush_all(spa); + + /* + * Stop async tasks. 
+ */ + spa_async_suspend(spa); + + if (spa->spa_root_vdev) { + vdev_t *root_vdev = spa->spa_root_vdev; + vdev_initialize_stop_all(root_vdev, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(root_vdev, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); + } + + /* + * Stop syncing. + */ + if (spa->spa_sync_on) { + txg_sync_stop(spa->spa_dsl_pool); + spa->spa_sync_on = B_FALSE; + } + + /* + * This ensures that there is no async metaslab prefetching + * while we attempt to unload the spa. + */ + if (spa->spa_root_vdev != NULL) { + for (int c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vc = spa->spa_root_vdev->vdev_child[c]; + if (vc->vdev_mg != NULL) + taskq_wait(vc->vdev_mg->mg_taskq); + } + } + + if (spa->spa_mmp.mmp_thread) + mmp_thread_stop(spa); + + /* + * Wait for any outstanding async I/O to complete. + */ + if (spa->spa_async_zio_root != NULL) { + for (int i = 0; i < max_ncpus; i++) + (void) zio_wait(spa->spa_async_zio_root[i]); + kmem_free(spa->spa_async_zio_root, max_ncpus * sizeof (void *)); + spa->spa_async_zio_root = NULL; + } + + if (spa->spa_vdev_removal != NULL) { + spa_vdev_removal_destroy(spa->spa_vdev_removal); + spa->spa_vdev_removal = NULL; + } + + spa_destroy_aux_threads(spa); + + spa_condense_fini(spa); + + bpobj_close(&spa->spa_deferred_bpobj); + + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); + + /* + * Close all vdevs. + */ + if (spa->spa_root_vdev) + vdev_free(spa->spa_root_vdev); + ASSERT(spa->spa_root_vdev == NULL); + + /* + * Close the dsl pool. + */ + if (spa->spa_dsl_pool) { + dsl_pool_close(spa->spa_dsl_pool); + spa->spa_dsl_pool = NULL; + spa->spa_meta_objset = NULL; + } + + ddt_unload(spa); + spa_unload_log_sm_metadata(spa); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + + for (int i = 0; i < spa->spa_spares.sav_count; i++) + vdev_free(spa->spa_spares.sav_vdevs[i]); + if (spa->spa_spares.sav_vdevs) { + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + spa->spa_spares.sav_vdevs = NULL; + } + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + } + spa->spa_spares.sav_count = 0; + + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_clear_stats(spa->spa_l2cache.sav_vdevs[i]); + vdev_free(spa->spa_l2cache.sav_vdevs[i]); + } + if (spa->spa_l2cache.sav_vdevs) { + kmem_free(spa->spa_l2cache.sav_vdevs, + spa->spa_l2cache.sav_count * sizeof (void *)); + spa->spa_l2cache.sav_vdevs = NULL; + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + } + spa->spa_l2cache.sav_count = 0; + + spa->spa_async_suspended = 0; + + spa->spa_indirect_vdevs_loaded = B_FALSE; + + if (spa->spa_comment != NULL) { + spa_strfree(spa->spa_comment); + spa->spa_comment = NULL; + } + + spa_config_exit(spa, SCL_ALL, spa); +} + +/* + * Load (or re-load) the current list of vdevs describing the active spares for + * this pool. When this is called, we have some form of basic information in + * 'spa_spares.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + */ +void +spa_load_spares(spa_t *spa) +{ + nvlist_t **spares; + uint_t nspares; + int i; + vdev_t *vd, *tvd; + +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. 
+ * + * As spare vdevs are shared among open pools, we skip loading + * them when we load the checkpointed state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* + * First, close and free any existing spare vdevs. + */ + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + + /* Undo the call to spa_activate() below */ + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL && tvd->vdev_isspare) + spa_spare_remove(tvd); + vdev_close(vd); + vdev_free(vd); + } + + if (spa->spa_spares.sav_vdevs) + kmem_free(spa->spa_spares.sav_vdevs, + spa->spa_spares.sav_count * sizeof (void *)); + + if (spa->spa_spares.sav_config == NULL) + nspares = 0; + else + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + spa->spa_spares.sav_count = (int)nspares; + spa->spa_spares.sav_vdevs = NULL; + + if (nspares == 0) + return; + + /* + * Construct the array of vdevs, opening them to get status in the + * process. For each spare, there are potentially two different vdev_t + * structures associated with it: one in the list of spares (used only + * for basic validation purposes) and one in the active vdev + * configuration (if it's spared in). During this phase we open and + * validate each vdev on the spare list. If the vdev also exists in the + * active configuration, then we mark this vdev as an active spare. + */ + spa->spa_spares.sav_vdevs = kmem_zalloc(nspares * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) { + VERIFY(spa_config_parse(spa, &vd, spares[i], NULL, 0, + VDEV_ALLOC_SPARE) == 0); + ASSERT(vd != NULL); + + spa->spa_spares.sav_vdevs[i] = vd; + + if ((tvd = spa_lookup_by_guid(spa, vd->vdev_guid, + B_FALSE)) != NULL) { + if (!tvd->vdev_isspare) + spa_spare_add(tvd); + + /* + * We only mark the spare active if we were successfully + * able to load the vdev. Otherwise, importing a pool + * with a bad active spare would result in strange + * behavior, because multiple pools would think the spare + * is actively in use. + * + * There is a vulnerability here to an equally bizarre + * circumstance, where a dead active spare is later + * brought back to life (onlined or otherwise). Given + * the rarity of this scenario, and the extra complexity + * it adds, we ignore the possibility. + */ + if (!vdev_is_dead(tvd)) + spa_spare_activate(tvd); + } + + vd->vdev_top = vd; + vd->vdev_aux = &spa->spa_spares; + + if (vdev_open(vd) != 0) + continue; + + if (vdev_validate_aux(vd) == 0) + spa_spare_add(vd); + } + + /* + * Recompute the stashed list of spares, with status information + * this time. + */ + VERIFY(nvlist_remove(spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, + DATA_TYPE_NVLIST_ARRAY) == 0); + + spares = kmem_alloc(spa->spa_spares.sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < spa->spa_spares.sav_count; i++) + spares[i] = vdev_config_generate(spa, + spa->spa_spares.sav_vdevs[i], B_TRUE, VDEV_CONFIG_SPARE); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, spa->spa_spares.sav_count) == 0); + for (i = 0; i < spa->spa_spares.sav_count; i++) + nvlist_free(spares[i]); + kmem_free(spares, spa->spa_spares.sav_count * sizeof (void *)); +} + +/* + * Load (or re-load) the current list of vdevs describing the active l2cache for + * this pool.
When this is called, we have some form of basic information in + * 'spa_l2cache.sav_config'. We parse this into vdevs, try to open them, and + * then re-generate a more complete list including status information. + * Devices which are already active have their details maintained, and are + * not re-opened. + */ +void +spa_load_l2cache(spa_t *spa) +{ + nvlist_t **l2cache = NULL; + uint_t nl2cache; + int i, j, oldnvdevs; + uint64_t guid; + vdev_t *vd, **oldvdevs, **newvdevs; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + +#ifndef _KERNEL + /* + * zdb opens both the current state of the pool and the + * checkpointed state (if present), with a different spa_t. + * + * As L2 caches are part of the ARC which is shared among open + * pools, we skip loading them when we load the checkpointed + * state of the pool. + */ + if (!spa_writeable(spa)) + return; +#endif + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + oldvdevs = sav->sav_vdevs; + oldnvdevs = sav->sav_count; + sav->sav_vdevs = NULL; + sav->sav_count = 0; + + if (sav->sav_config == NULL) { + nl2cache = 0; + newvdevs = NULL; + goto out; + } + + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + newvdevs = kmem_alloc(nl2cache * sizeof (void *), KM_SLEEP); + + /* + * Process new nvlist of vdevs. + */ + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], ZPOOL_CONFIG_GUID, + &guid) == 0); + + newvdevs[i] = NULL; + for (j = 0; j < oldnvdevs; j++) { + vd = oldvdevs[j]; + if (vd != NULL && guid == vd->vdev_guid) { + /* + * Retain previous vdev for add/remove ops. + */ + newvdevs[i] = vd; + oldvdevs[j] = NULL; + break; + } + } + + if (newvdevs[i] == NULL) { + /* + * Create new vdev + */ + VERIFY(spa_config_parse(spa, &vd, l2cache[i], NULL, 0, + VDEV_ALLOC_L2CACHE) == 0); + ASSERT(vd != NULL); + newvdevs[i] = vd; + + /* + * Commit this vdev as an l2cache device, + * even if it fails to open. + */ + spa_l2cache_add(vd); + + vd->vdev_top = vd; + vd->vdev_aux = sav; + + spa_l2cache_activate(vd); + + if (vdev_open(vd) != 0) + continue; + + (void) vdev_validate_aux(vd); + + if (!vdev_is_dead(vd)) + l2arc_add_vdev(spa, vd); + + /* + * Upon cache device addition to a pool or pool + * creation with a cache device or if the header + * of the device is invalid we issue an async + * TRIM command for the whole device which will + * execute if l2arc_trim_ahead > 0. + */ + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); + } + } + + sav->sav_vdevs = newvdevs; + sav->sav_count = (int)nl2cache; + + /* + * Recompute the stashed list of l2cache devices, with status + * information this time. 
+ */ + VERIFY(nvlist_remove(sav->sav_config, ZPOOL_CONFIG_L2CACHE, + DATA_TYPE_NVLIST_ARRAY) == 0); + + if (sav->sav_count > 0) + l2cache = kmem_alloc(sav->sav_count * sizeof (void *), + KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + l2cache[i] = vdev_config_generate(spa, + sav->sav_vdevs[i], B_TRUE, VDEV_CONFIG_L2CACHE); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, sav->sav_count) == 0); + +out: + /* + * Purge vdevs that were dropped + */ + for (i = 0; i < oldnvdevs; i++) { + uint64_t pool; + + vd = oldvdevs[i]; + if (vd != NULL) { + ASSERT(vd->vdev_isl2cache); + + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + vdev_clear_stats(vd); + vdev_free(vd); + } + } + + if (oldvdevs) + kmem_free(oldvdevs, oldnvdevs * sizeof (void *)); + + for (i = 0; i < sav->sav_count; i++) + nvlist_free(l2cache[i]); + if (sav->sav_count) + kmem_free(l2cache, sav->sav_count * sizeof (void *)); +} + +static int +load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) +{ + dmu_buf_t *db; + char *packed = NULL; + size_t nvsize = 0; + int error; + *value = NULL; + + error = dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db); + if (error) + return (error); + + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + packed = vmem_alloc(nvsize, KM_SLEEP); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, + DMU_READ_PREFETCH); + if (error == 0) + error = nvlist_unpack(packed, nvsize, value, 0); + vmem_free(packed, nvsize); + + return (error); +} + +/* + * Concrete top-level vdevs that are not missing and are not logs. At every + * spa_sync we write new uberblocks to at least SPA_SYNC_MIN_VDEVS core tvds. + */ +static uint64_t +spa_healthy_core_tvds(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t tvds = 0; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *vd = rvd->vdev_child[i]; + if (vd->vdev_islog) + continue; + if (vdev_is_concrete(vd) && !vdev_is_dead(vd)) + tvds++; + } + + return (tvds); +} + +/* + * Checks to see if the given vdev could not be opened, in which case we post a + * sysevent to notify the autoreplace code that the device has been removed. + */ +static void +spa_check_removed(vdev_t *vd) +{ + for (uint64_t c = 0; c < vd->vdev_children; c++) + spa_check_removed(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd) && + vdev_is_concrete(vd)) { + zfs_post_autoreplace(vd->vdev_spa, vd); + spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_CHECK); + } +} + +static int +spa_check_for_missing_logs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + /* + * If we're doing a normal import, then build up any additional + * diagnostic information about missing log devices. + * We'll pass this up to the user for further processing. + */ + if (!(spa->spa_import_flags & ZFS_IMPORT_MISSING_LOG)) { + nvlist_t **child, *nv; + uint64_t idx = 0; + + child = kmem_alloc(rvd->vdev_children * sizeof (nvlist_t *), + KM_SLEEP); + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + /* + * We consider a device as missing only if it failed + * to open (i.e. offline or faulted is not considered + * as missing). 
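+ * Only log tvds in VDEV_STATE_CANT_OPEN are collected below. For a + * normal import they are reported back to the caller through + * ZPOOL_CONFIG_MISSING_DEVICES and the load fails with ENXIO; with + * ZFS_IMPORT_MISSING_LOG the ZIL is dropped instead and the log + * state is set to SPA_LOG_CLEAR.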
+ */ + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + child[idx++] = vdev_config_generate(spa, tvd, + B_FALSE, VDEV_CONFIG_MISSING); + } + } + + if (idx > 0) { + fnvlist_add_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, child, idx); + fnvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_MISSING_DEVICES, nv); + + for (uint64_t i = 0; i < idx; i++) + nvlist_free(child[i]); + } + nvlist_free(nv); + kmem_free(child, rvd->vdev_children * sizeof (char **)); + + if (idx > 0) { + spa_load_failed(spa, "some log devices are missing"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); + } + } else { + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_islog && + tvd->vdev_state == VDEV_STATE_CANT_OPEN) { + spa_set_log_state(spa, SPA_LOG_CLEAR); + spa_load_note(spa, "some log devices are " + "missing, ZIL is dropped."); + vdev_dbgmsg_print_tree(rvd, 2); + break; + } + } + } + + return (0); +} + +/* + * Check for missing log devices + */ +static boolean_t +spa_check_logs(spa_t *spa) +{ + boolean_t rv = B_FALSE; + dsl_pool_t *dp = spa_get_dsl(spa); + + switch (spa->spa_log_state) { + default: + break; + case SPA_LOG_MISSING: + /* need to recheck in case slog has been restored */ + case SPA_LOG_UNKNOWN: + rv = (dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_check_log_chain, NULL, DS_FIND_CHILDREN) != 0); + if (rv) + spa_set_log_state(spa, SPA_LOG_MISSING); + break; + } + return (rv); +} + +static boolean_t +spa_passivate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + if (!spa_has_slogs(spa)) + return (B_FALSE); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } + } + + return (slog_found); +} + +static void +spa_activate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) + metaslab_group_activate(mg); + } +} + +int +spa_reset_logs(spa_t *spa) +{ + int error; + + error = dmu_objset_find(spa_name(spa), zil_reset, + NULL, DS_FIND_CHILDREN); + if (error == 0) { + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). 
+ */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} + +static void +spa_aux_check_removed(spa_aux_vdev_t *sav) +{ + for (int i = 0; i < sav->sav_count; i++) + spa_check_removed(sav->sav_vdevs[i]); +} + +void +spa_claim_notify(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + + if (zio->io_error) + return; + + mutex_enter(&spa->spa_props_lock); /* any mutex will do */ + if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) + spa->spa_claim_max_txg = zio->io_bp->blk_birth; + mutex_exit(&spa->spa_props_lock); +} + +typedef struct spa_load_error { + uint64_t sle_meta_count; + uint64_t sle_data_count; +} spa_load_error_t; + +static void +spa_load_verify_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; + spa_t *spa = zio->io_spa; + + abd_free(zio->io_abd); + if (error) { + if ((BP_GET_LEVEL(bp) != 0 || DMU_OT_IS_METADATA(type)) && + type != DMU_OT_INTENT_LOG) + atomic_inc_64(&sle->sle_meta_count); + else + atomic_inc_64(&sle->sle_data_count); + } + + mutex_enter(&spa->spa_scrub_lock); + spa->spa_load_verify_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +/* + * Maximum number of inflight bytes is the log2 fraction of the arc size. + * By default, we set it to 1/16th of the arc. + */ +int spa_load_verify_shift = 4; +int spa_load_verify_metadata = B_TRUE; +int spa_load_verify_data = B_TRUE; + +/*ARGSUSED*/ +static int +spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) +{ + if (zb->zb_level == ZB_DNODE_LEVEL || BP_IS_HOLE(bp) || + BP_IS_EMBEDDED(bp) || BP_IS_REDACTED(bp)) + return (0); + /* + * Note: normally this routine will not be called if + * spa_load_verify_metadata is not set. However, it may be useful + * to manually set the flag after the traversal has begun. 
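+ * Both spa_load_verify_metadata and spa_load_verify_data are plain + * ints (exposed as module parameters on a live system), so an + * in-progress verification can be narrowed by clearing them while + * the traversal runs.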
+ */ + if (!spa_load_verify_metadata) + return (0); + if (!BP_IS_METADATA(bp) && !spa_load_verify_data) + return (0); + + uint64_t maxinflight_bytes = + arc_target_bytes() >> spa_load_verify_shift; + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_load_verify_bytes >= maxinflight_bytes) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + spa->spa_load_verify_bytes += size; + mutex_exit(&spa->spa_scrub_lock); + + zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + return (0); +} + +/* ARGSUSED */ +static int +verify_dataset_name_len(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + if (dsl_dataset_namelen(ds) >= ZFS_MAX_DATASET_NAME_LEN) + return (SET_ERROR(ENAMETOOLONG)); + + return (0); +} + +static int +spa_load_verify(spa_t *spa) +{ + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_load_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error = 0; + + zpool_get_load_policy(spa->spa_config, &policy); + + if (policy.zlp_rewind & ZPOOL_NEVER_REWIND) + return (0); + + dsl_pool_config_enter(spa->spa_dsl_pool, FTAG); + error = dmu_objset_find_dp(spa->spa_dsl_pool, + spa->spa_dsl_pool->dp_root_dir_obj, verify_dataset_name_len, NULL, + DS_FIND_CHILDREN); + dsl_pool_config_exit(spa->spa_dsl_pool, FTAG); + if (error != 0) + return (error); + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + if (spa_load_verify_metadata) { + if (spa->spa_extreme_rewind) { + spa_load_note(spa, "performing a complete scan of the " + "pool since extreme rewind is on. This may take " + "a very long time.\n (spa_load_verify_data=%u, " + "spa_load_verify_metadata=%u)", + spa_load_verify_data, spa_load_verify_metadata); + } + + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | + TRAVERSE_NO_DECRYPT, spa_load_verify_cb, rio); + } + + (void) zio_wait(rio); + ASSERT0(spa->spa_load_verify_bytes); + + spa->spa_load_meta_errors = sle.sle_meta_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (sle.sle_meta_count != 0 || sle.sle_data_count != 0) { + spa_load_note(spa, "spa_load_verify found %llu metadata errors " + "and %llu data errors", (u_longlong_t)sle.sle_meta_count, + (u_longlong_t)sle.sle_data_count); + } + + if (spa_load_verify_dryrun || + (!error && sle.sle_meta_count <= policy.zlp_maxmeta && + sle.sle_data_count <= policy.zlp_maxdata)) { + int64_t loss = 0; + + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_TIME, spa->spa_load_txg_ts) == 0); + VERIFY(nvlist_add_int64(spa->spa_load_info, + ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_LOAD_DATA_ERRORS, sle.sle_data_count) == 0); + } else { + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + } + + if (spa_load_verify_dryrun) + return (0); + + if (error) { + if (error != ENXIO && error != EIO) + error = SET_ERROR(EIO); + return (error); + } + + return (verify_ok ? 0 : EIO); +} + +/* + * Find a value in the pool props object. 
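+ * The zap_lookup() return value is deliberately ignored below: when + * the entry is missing, *val is left untouched, so callers are + * expected to pre-initialize it with the property's default.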
+ */ +static void +spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) +{ + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); +} + +/* + * Find a value in the pool directory object. + */ +static int +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val, boolean_t log_enoent) +{ + int error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val); + + if (error != 0 && (error != ENOENT || log_enoent)) { + spa_load_failed(spa, "couldn't get '%s' value in MOS directory " + "[error=%d]", name, error); + } + + return (error); +} + +static int +spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) +{ + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (SET_ERROR(err)); +} + +boolean_t +spa_livelist_delete_check(spa_t *spa) +{ + return (spa->spa_livelists_to_delete != 0); +} + +/* ARGSUSED */ +static boolean_t +spa_livelist_delete_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + return (spa_livelist_delete_check(spa)); +} + +static int +delete_blkptr_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + spa_t *spa = arg; + zio_free(spa, tx->tx_txg, bp); + dsl_dir_diduse_space(tx->tx_pool->dp_free_dir, DD_USED_HEAD, + -bp_get_dsize_sync(spa, bp), + -BP_GET_PSIZE(bp), -BP_GET_UCSIZE(bp), tx); + return (0); +} + +static int +dsl_get_next_livelist_obj(objset_t *os, uint64_t zap_obj, uint64_t *llp) +{ + int err; + zap_cursor_t zc; + zap_attribute_t za; + zap_cursor_init(&zc, os, zap_obj); + err = zap_cursor_retrieve(&zc, &za); + zap_cursor_fini(&zc); + if (err == 0) + *llp = za.za_first_integer; + return (err); +} + +/* + * Components of livelist deletion that must be performed in syncing + * context: freeing block pointers and updating the pool-wide data + * structures to indicate how much work is left to do + */ +typedef struct sublist_delete_arg { + spa_t *spa; + dsl_deadlist_t *ll; + uint64_t key; + bplist_t *to_free; +} sublist_delete_arg_t; + +static void +sublist_delete_sync(void *arg, dmu_tx_t *tx) +{ + sublist_delete_arg_t *sda = arg; + spa_t *spa = sda->spa; + dsl_deadlist_t *ll = sda->ll; + uint64_t key = sda->key; + bplist_t *to_free = sda->to_free; + + bplist_iterate(to_free, delete_blkptr_cb, spa, tx); + dsl_deadlist_remove_entry(ll, key, tx); +} + +typedef struct livelist_delete_arg { + spa_t *spa; + uint64_t ll_obj; + uint64_t zap_obj; +} livelist_delete_arg_t; + +static void +livelist_delete_sync(void *arg, dmu_tx_t *tx) +{ + livelist_delete_arg_t *lda = arg; + spa_t *spa = lda->spa; + uint64_t ll_obj = lda->ll_obj; + uint64_t zap_obj = lda->zap_obj; + objset_t *mos = spa->spa_meta_objset; + uint64_t count; + + /* free the livelist and decrement the feature count */ + VERIFY0(zap_remove_int(mos, zap_obj, ll_obj, tx)); + dsl_deadlist_free(mos, ll_obj, tx); + spa_feature_decr(spa, SPA_FEATURE_LIVELIST, tx); + VERIFY0(zap_count(mos, zap_obj, &count)); + if (count == 0) { + /* no more livelists to delete */ + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DELETED_CLONES, tx)); + VERIFY0(zap_destroy(mos, zap_obj, tx)); + spa->spa_livelists_to_delete = 0; + spa_notify_waiters(spa); + } +} + +/* + * Load in the value for the livelist to be removed and open it. Then, + * load its first sublist and determine which block pointers should actually + * be freed. Then, call a synctask which performs the actual frees and updates + * the pool-wide livelist data. 
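+ * Two cases are handled below: while the livelist still contains + * sublists, the first one is processed and removed via + * sublist_delete_sync(); once it is empty, livelist_delete_sync() + * removes the livelist itself from the deleted-clones ZAP and + * decrements the feature count.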
+ */ +/* ARGSUSED */ +static void +spa_livelist_delete_cb(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + uint64_t ll_obj = 0, count; + objset_t *mos = spa->spa_meta_objset; + uint64_t zap_obj = spa->spa_livelists_to_delete; + /* + * Determine the next livelist to delete. This function should only + * be called if there is at least one deleted clone. + */ + VERIFY0(dsl_get_next_livelist_obj(mos, zap_obj, &ll_obj)); + VERIFY0(zap_count(mos, ll_obj, &count)); + if (count > 0) { + dsl_deadlist_t ll = { 0 }; + dsl_deadlist_entry_t *dle; + bplist_t to_free; + dsl_deadlist_open(&ll, mos, ll_obj); + dle = dsl_deadlist_first(&ll); + ASSERT3P(dle, !=, NULL); + bplist_create(&to_free); + int err = dsl_process_sub_livelist(&dle->dle_bpobj, &to_free, + z, NULL); + if (err == 0) { + sublist_delete_arg_t sync_arg = { + .spa = spa, + .ll = &ll, + .key = dle->dle_mintxg, + .to_free = &to_free + }; + zfs_dbgmsg("deleting sublist (id %llu) from" + " livelist %llu, %d remaining", + dle->dle_bpobj.bpo_object, ll_obj, count - 1); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + sublist_delete_sync, &sync_arg, 0, + ZFS_SPACE_CHECK_DESTROY)); + } else { + VERIFY3U(err, ==, EINTR); + } + bplist_clear(&to_free); + bplist_destroy(&to_free); + dsl_deadlist_close(&ll); + } else { + livelist_delete_arg_t sync_arg = { + .spa = spa, + .ll_obj = ll_obj, + .zap_obj = zap_obj + }; + zfs_dbgmsg("deletion of livelist %llu completed", ll_obj); + VERIFY0(dsl_sync_task(spa_name(spa), NULL, livelist_delete_sync, + &sync_arg, 0, ZFS_SPACE_CHECK_DESTROY)); + } +} + +static void +spa_start_livelist_destroy_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_livelist_delete_zthr, ==, NULL); + spa->spa_livelist_delete_zthr = + zthr_create("z_livelist_destroy", + spa_livelist_delete_cb_check, spa_livelist_delete_cb, spa); +} + +typedef struct livelist_new_arg { + bplist_t *allocs; + bplist_t *frees; +} livelist_new_arg_t; + +static int +livelist_track_new_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(tx == NULL); + livelist_new_arg_t *lna = arg; + if (bp_freed) { + bplist_append(lna->frees, bp); + } else { + bplist_append(lna->allocs, bp); + zfs_livelist_condense_new_alloc++; + } + return (0); +} + +typedef struct livelist_condense_arg { + spa_t *spa; + bplist_t to_keep; + uint64_t first_size; + uint64_t next_size; +} livelist_condense_arg_t; + +static void +spa_livelist_condense_sync(void *arg, dmu_tx_t *tx) +{ + livelist_condense_arg_t *lca = arg; + spa_t *spa = lca->spa; + bplist_t new_frees; + dsl_dataset_t *ds = spa->spa_to_condense.ds; + + /* Have we been cancelled? */ + if (spa->spa_to_condense.cancelled) { + zfs_livelist_condense_sync_cancel++; + goto out; + } + + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + dsl_deadlist_t *ll = &ds->ds_dir->dd_livelist; + + /* + * It's possible that the livelist was changed while the zthr was + * running. Therefore, we need to check for new blkptrs in the two + * entries being condensed and continue to track them in the livelist. + * Because of the way we handle remapped blkptrs (see dbuf_remap_impl), + * it's possible that the newly added blkptrs are FREEs or ALLOCs so + * we need to sort them into two different bplists. 
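+ * livelist_track_new_cb() above performs exactly this split, + * appending each new entry to lna->frees or lna->allocs based on its + * bp_freed flag; the iteration below starts from the sizes recorded + * in open context so that only the newly appended blkptrs are + * examined.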
+ */ + uint64_t first_obj = first->dle_bpobj.bpo_object; + uint64_t next_obj = next->dle_bpobj.bpo_object; + uint64_t cur_first_size = first->dle_bpobj.bpo_phys->bpo_num_blkptrs; + uint64_t cur_next_size = next->dle_bpobj.bpo_phys->bpo_num_blkptrs; + + bplist_create(&new_frees); + livelist_new_arg_t new_bps = { + .allocs = &lca->to_keep, + .frees = &new_frees, + }; + + if (cur_first_size > lca->first_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&first->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->first_size)); + } + if (cur_next_size > lca->next_size) { + VERIFY0(livelist_bpobj_iterate_from_nofree(&next->dle_bpobj, + livelist_track_new_cb, &new_bps, lca->next_size)); + } + + dsl_deadlist_clear_entry(first, ll, tx); + ASSERT(bpobj_is_empty(&first->dle_bpobj)); + dsl_deadlist_remove_entry(ll, next->dle_mintxg, tx); + + bplist_iterate(&lca->to_keep, dsl_deadlist_insert_alloc_cb, ll, tx); + bplist_iterate(&new_frees, dsl_deadlist_insert_free_cb, ll, tx); + bplist_destroy(&new_frees); + + char dsname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_name(ds, dsname); + zfs_dbgmsg("txg %llu condensing livelist of %s (id %llu), bpobj %llu " + "(%llu blkptrs) and bpobj %llu (%llu blkptrs) -> bpobj %llu " + "(%llu blkptrs)", tx->tx_txg, dsname, ds->ds_object, first_obj, + cur_first_size, next_obj, cur_next_size, + first->dle_bpobj.bpo_object, + first->dle_bpobj.bpo_phys->bpo_num_blkptrs); +out: + dmu_buf_rele(ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + spa->spa_to_condense.syncing = B_FALSE; +} + +static void +spa_livelist_condense_cb(void *arg, zthr_t *t) +{ + while (zfs_livelist_condense_zthr_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + spa_t *spa = arg; + dsl_deadlist_entry_t *first = spa->spa_to_condense.first; + dsl_deadlist_entry_t *next = spa->spa_to_condense.next; + uint64_t first_size, next_size; + + livelist_condense_arg_t *lca = + kmem_alloc(sizeof (livelist_condense_arg_t), KM_SLEEP); + bplist_create(&lca->to_keep); + + /* + * Process the livelists (matching FREEs and ALLOCs) in open context + * so we have minimal work in syncing context to condense. + * + * We save bpobj sizes (first_size and next_size) to use later in + * syncing context to determine if entries were added to these sublists + * while in open context. This is possible because the clone is still + * active and open for normal writes and we want to make sure the new, + * unprocessed blockpointers are inserted into the livelist normally. + * + * Note that dsl_process_sub_livelist() both stores the size (number of + * block pointers) and iterates over them while the bpobj's lock is + * held, so the sizes returned to us are consistent with what was + * actually processed. + */ + int err = dsl_process_sub_livelist(&first->dle_bpobj, &lca->to_keep, t, + &first_size); + if (err == 0) + err = dsl_process_sub_livelist(&next->dle_bpobj, &lca->to_keep, + t, &next_size); + + if (err == 0) { + while (zfs_livelist_condense_sync_pause && + !(zthr_has_waiters(t) || zthr_iscancelled(t))) + delay(1); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + dmu_tx_mark_netfree(tx); + dmu_tx_hold_space(tx, 1); + err = dmu_tx_assign(tx, TXG_NOWAIT | TXG_NOTHROTTLE); + if (err == 0) { + /* + * Prevent the condense zthr from restarting before + * the synctask completes.
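+ * spa_livelist_condense_cb_check() keeps the zthr from running again + * while spa_to_condense.syncing is set; the flag is cleared at the + * end of spa_livelist_condense_sync().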
+ */ + spa->spa_to_condense.syncing = B_TRUE; + lca->spa = spa; + lca->first_size = first_size; + lca->next_size = next_size; + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_livelist_condense_sync, lca, 0, + ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + return; + } + } + /* + * Condensing can not continue: either it was externally stopped or + * we were unable to assign to a tx because the pool has run out of + * space. In the second case, we'll just end up trying to condense + * again in a later txg. + */ + ASSERT(err != 0); + bplist_clear(&lca->to_keep); + bplist_destroy(&lca->to_keep); + kmem_free(lca, sizeof (livelist_condense_arg_t)); + dmu_buf_rele(spa->spa_to_condense.ds->ds_dbuf, spa); + spa->spa_to_condense.ds = NULL; + if (err == EINTR) + zfs_livelist_condense_zthr_cancel++; +} + +/* ARGSUSED */ +/* + * Check that there is something to condense but that a condense is not + * already in progress and that condensing has not been cancelled. + */ +static boolean_t +spa_livelist_condense_cb_check(void *arg, zthr_t *z) +{ + spa_t *spa = arg; + if ((spa->spa_to_condense.ds != NULL) && + (spa->spa_to_condense.syncing == B_FALSE) && + (spa->spa_to_condense.cancelled == B_FALSE)) { + return (B_TRUE); + } + return (B_FALSE); +} + +static void +spa_start_livelist_condensing_thread(spa_t *spa) +{ + spa->spa_to_condense.ds = NULL; + spa->spa_to_condense.first = NULL; + spa->spa_to_condense.next = NULL; + spa->spa_to_condense.syncing = B_FALSE; + spa->spa_to_condense.cancelled = B_FALSE; + + ASSERT3P(spa->spa_livelist_condense_zthr, ==, NULL); + spa->spa_livelist_condense_zthr = + zthr_create("z_livelist_condense", + spa_livelist_condense_cb_check, + spa_livelist_condense_cb, spa); +} + +static void +spa_spawn_aux_threads(spa_t *spa) +{ + ASSERT(spa_writeable(spa)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_start_indirect_condensing_thread(spa); + spa_start_livelist_destroy_thread(spa); + spa_start_livelist_condensing_thread(spa); + + ASSERT3P(spa->spa_checkpoint_discard_zthr, ==, NULL); + spa->spa_checkpoint_discard_zthr = + zthr_create("z_checkpoint_discard", + spa_checkpoint_discard_thread_check, + spa_checkpoint_discard_thread, spa); +} + +/* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, with all the vdevs in place in + * the original pool. 
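+ * (In the ZPOOL_CONFIG_SPLIT_LIST guid array parsed below, an entry of 0
+ * is a placeholder for a hole vdev and is skipped.)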
+ */ +static void +spa_try_repair(spa_t *spa, nvlist_t *config) +{ + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. + */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; + } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. + */ + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); + } + + kmem_free(vd, gcount * sizeof (vdev_t *)); +} + +static int +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type) +{ + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + + spa->spa_load_state = state; + (void) spa_import_progress_set_state(spa_guid(spa), + spa_load_state(spa)); + + gethrestime(&spa->spa_loaded_ts); + error = spa_load_impl(spa, type, &ereport); + + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); + spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); + if (error) { + if (error != EEXIST) { + spa->spa_loaded_ts.tv_sec = 0; + spa->spa_loaded_ts.tv_nsec = 0; + } + if (error != EBADF) { + zfs_ereport_post(ereport, spa, NULL, NULL, NULL, 0, 0); + } + } + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + (void) spa_import_progress_set_state(spa_guid(spa), + spa_load_state(spa)); + + return (error); +} + +#ifdef ZFS_DEBUG +/* + * Count the number of per-vdev ZAPs associated with all of the vdevs in the + * vdev tree rooted in the given vd, and ensure that each ZAP is present in the + * spa's per-vdev ZAP list. + */ +static uint64_t +vdev_count_verify_zaps(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + uint64_t total = 0; + + if (vd->vdev_top_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_top_zap)); + } + if (vd->vdev_leaf_zap != 0) { + total++; + ASSERT0(zap_lookup_int(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, vd->vdev_leaf_zap)); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + total += vdev_count_verify_zaps(vd->vdev_child[i]); + } + + return (total); +} +#endif + +/* + * Determine whether the activity check is required. 
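+ *
+ * The check is skipped when ZFS_IMPORT_SKIP_MMP is set (as zdb does), when
+ * the uberblock shows MMP is disabled, when an earlier tryimport already
+ * sampled an identical uberblock, when the label's hostid matches our own,
+ * or when the pool was cleanly exported; each case is handled below.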
+ */ +static boolean_t +spa_activity_check_required(spa_t *spa, uberblock_t *ub, nvlist_t *label, + nvlist_t *config) +{ + uint64_t state = 0; + uint64_t hostid = 0; + uint64_t tryconfig_txg = 0; + uint64_t tryconfig_timestamp = 0; + uint16_t tryconfig_mmp_seq = 0; + nvlist_t *nvinfo; + + if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { + nvinfo = fnvlist_lookup_nvlist(config, ZPOOL_CONFIG_LOAD_INFO); + (void) nvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG, + &tryconfig_txg); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + &tryconfig_timestamp); + (void) nvlist_lookup_uint16(nvinfo, ZPOOL_CONFIG_MMP_SEQ, + &tryconfig_mmp_seq); + } + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_STATE, &state); + + /* + * Disable the MMP activity check - This is used by zdb which + * is intended to be used on potentially active pools. + */ + if (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) + return (B_FALSE); + + /* + * Skip the activity check when the MMP feature is disabled. + */ + if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay == 0) + return (B_FALSE); + + /* + * If the tryconfig_ values are nonzero, they are the results of an + * earlier tryimport. If they all match the uberblock we just found, + * then the pool has not changed and we return false so we do not test + * a second time. + */ + if (tryconfig_txg && tryconfig_txg == ub->ub_txg && + tryconfig_timestamp && tryconfig_timestamp == ub->ub_timestamp && + tryconfig_mmp_seq && tryconfig_mmp_seq == + (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)) + return (B_FALSE); + + /* + * Allow the activity check to be skipped when importing the pool + * on the same host which last imported it. Since the hostid from + * configuration may be stale use the one read from the label. + */ + if (nvlist_exists(label, ZPOOL_CONFIG_HOSTID)) + hostid = fnvlist_lookup_uint64(label, ZPOOL_CONFIG_HOSTID); + + if (hostid == spa_get_hostid(spa)) + return (B_FALSE); + + /* + * Skip the activity test when the pool was cleanly exported. + */ + if (state != POOL_STATE_ACTIVE) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Nanoseconds the activity check must watch for changes on-disk. + */ +static uint64_t +spa_activity_check_duration(spa_t *spa, uberblock_t *ub) +{ + uint64_t import_intervals = MAX(zfs_multihost_import_intervals, 1); + uint64_t multihost_interval = MSEC2NSEC( + MMP_INTERVAL_OK(zfs_multihost_interval)); + uint64_t import_delay = MAX(NANOSEC, import_intervals * + multihost_interval); + + /* + * Local tunables determine a minimum duration except for the case + * where we know when the remote host will suspend the pool if MMP + * writes do not land. + * + * See Big Theory comment at the top of mmp.c for the reasoning behind + * these cases and times. 
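+	 *
+	 * As an illustrative example: with MMP_INTERVAL(ub) of 1000ms,
+	 * MMP_FAIL_INT(ub) of 10 and a safety factor of 200%, the first case
+	 * below waits 10 * 1000ms * 200 / 100 = 20 seconds before declaring
+	 * the pool inactive.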
+ */ + + ASSERT(MMP_IMPORT_SAFETY_FACTOR >= 100); + + if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && + MMP_FAIL_INT(ub) > 0) { + + /* MMP on remote host will suspend pool after failed writes */ + import_delay = MMP_FAIL_INT(ub) * MSEC2NSEC(MMP_INTERVAL(ub)) * + MMP_IMPORT_SAFETY_FACTOR / 100; + + zfs_dbgmsg("fail_intvals>0 import_delay=%llu ub_mmp " + "mmp_fails=%llu ub_mmp mmp_interval=%llu " + "import_intervals=%u", import_delay, MMP_FAIL_INT(ub), + MMP_INTERVAL(ub), import_intervals); + + } else if (MMP_INTERVAL_VALID(ub) && MMP_FAIL_INT_VALID(ub) && + MMP_FAIL_INT(ub) == 0) { + + /* MMP on remote host will never suspend pool */ + import_delay = MAX(import_delay, (MSEC2NSEC(MMP_INTERVAL(ub)) + + ub->ub_mmp_delay) * import_intervals); + + zfs_dbgmsg("fail_intvals=0 import_delay=%llu ub_mmp " + "mmp_interval=%llu ub_mmp_delay=%llu " + "import_intervals=%u", import_delay, MMP_INTERVAL(ub), + ub->ub_mmp_delay, import_intervals); + + } else if (MMP_VALID(ub)) { + /* + * zfs-0.7 compatibility case + */ + + import_delay = MAX(import_delay, (multihost_interval + + ub->ub_mmp_delay) * import_intervals); + + zfs_dbgmsg("import_delay=%llu ub_mmp_delay=%llu " + "import_intervals=%u leaves=%u", import_delay, + ub->ub_mmp_delay, import_intervals, + vdev_count_leaves(spa)); + } else { + /* Using local tunings is the only reasonable option */ + zfs_dbgmsg("pool last imported on non-MMP aware " + "host using import_delay=%llu multihost_interval=%llu " + "import_intervals=%u", import_delay, multihost_interval, + import_intervals); + } + + return (import_delay); +} + +/* + * Perform the import activity check. If the user canceled the import or + * we detected activity then fail. + */ +static int +spa_activity_check(spa_t *spa, uberblock_t *ub, nvlist_t *config) +{ + uint64_t txg = ub->ub_txg; + uint64_t timestamp = ub->ub_timestamp; + uint64_t mmp_config = ub->ub_mmp_config; + uint16_t mmp_seq = MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0; + uint64_t import_delay; + hrtime_t import_expire; + nvlist_t *mmp_label = NULL; + vdev_t *rvd = spa->spa_root_vdev; + kcondvar_t cv; + kmutex_t mtx; + int error = 0; + + cv_init(&cv, NULL, CV_DEFAULT, NULL); + mutex_init(&mtx, NULL, MUTEX_DEFAULT, NULL); + mutex_enter(&mtx); + + /* + * If ZPOOL_CONFIG_MMP_TXG is present an activity check was performed + * during the earlier tryimport. If the txg recorded there is 0 then + * the pool is known to be active on another host. + * + * Otherwise, the pool might be in use on another host. Check for + * changes in the uberblocks on disk if necessary. + */ + if (nvlist_exists(config, ZPOOL_CONFIG_LOAD_INFO)) { + nvlist_t *nvinfo = fnvlist_lookup_nvlist(config, + ZPOOL_CONFIG_LOAD_INFO); + + if (nvlist_exists(nvinfo, ZPOOL_CONFIG_MMP_TXG) && + fnvlist_lookup_uint64(nvinfo, ZPOOL_CONFIG_MMP_TXG) == 0) { + vdev_uberblock_load(rvd, ub, &mmp_label); + error = SET_ERROR(EREMOTEIO); + goto out; + } + } + + import_delay = spa_activity_check_duration(spa, ub); + + /* Add a small random factor in case of simultaneous imports (0-25%) */ + import_delay += import_delay * spa_get_random(250) / 1000; + + import_expire = gethrtime() + import_delay; + + while (gethrtime() < import_expire) { + (void) spa_import_progress_set_mmp_check(spa_guid(spa), + NSEC2SEC(import_expire - gethrtime())); + + vdev_uberblock_load(rvd, ub, &mmp_label); + + if (txg != ub->ub_txg || timestamp != ub->ub_timestamp || + mmp_seq != (MMP_SEQ_VALID(ub) ? 
MMP_SEQ(ub) : 0)) {
+			zfs_dbgmsg("multihost activity detected "
+			    "txg %llu ub_txg %llu "
+			    "timestamp %llu ub_timestamp %llu "
+			    "mmp_config %#llx ub_mmp_config %#llx",
+			    txg, ub->ub_txg, timestamp, ub->ub_timestamp,
+			    mmp_config, ub->ub_mmp_config);
+
+			error = SET_ERROR(EREMOTEIO);
+			break;
+		}
+
+		if (mmp_label) {
+			nvlist_free(mmp_label);
+			mmp_label = NULL;
+		}
+
+		error = cv_timedwait_sig(&cv, &mtx, ddi_get_lbolt() + hz);
+		if (error != -1) {
+			error = SET_ERROR(EINTR);
+			break;
+		}
+		error = 0;
+	}
+
+out:
+	mutex_exit(&mtx);
+	mutex_destroy(&mtx);
+	cv_destroy(&cv);
+
+	/*
+	 * If the pool is determined to be active, store the status in the
+	 * spa->spa_load_info nvlist. If the remote hostname or hostid are
+	 * available from the configuration read from disk, store them as
+	 * well. This allows 'zpool import' to generate a more useful message.
+	 *
+	 * ZPOOL_CONFIG_MMP_STATE    - observed pool status (mandatory)
+	 * ZPOOL_CONFIG_MMP_HOSTNAME - hostname from the active pool
+	 * ZPOOL_CONFIG_MMP_HOSTID   - hostid from the active pool
+	 */
+	if (error == EREMOTEIO) {
+		char *hostname = "<unknown>";
+		uint64_t hostid = 0;
+
+		if (mmp_label) {
+			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTNAME)) {
+				hostname = fnvlist_lookup_string(mmp_label,
+				    ZPOOL_CONFIG_HOSTNAME);
+				fnvlist_add_string(spa->spa_load_info,
+				    ZPOOL_CONFIG_MMP_HOSTNAME, hostname);
+			}
+
+			if (nvlist_exists(mmp_label, ZPOOL_CONFIG_HOSTID)) {
+				hostid = fnvlist_lookup_uint64(mmp_label,
+				    ZPOOL_CONFIG_HOSTID);
+				fnvlist_add_uint64(spa->spa_load_info,
+				    ZPOOL_CONFIG_MMP_HOSTID, hostid);
+			}
+		}
+
+		fnvlist_add_uint64(spa->spa_load_info,
+		    ZPOOL_CONFIG_MMP_STATE, MMP_STATE_ACTIVE);
+		fnvlist_add_uint64(spa->spa_load_info,
+		    ZPOOL_CONFIG_MMP_TXG, 0);
+
+		error = spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO);
+	}
+
+	if (mmp_label)
+		nvlist_free(mmp_label);
+
+	return (error);
+}
+
+static int
+spa_verify_host(spa_t *spa, nvlist_t *mos_config)
+{
+	uint64_t hostid;
+	char *hostname;
+	uint64_t myhostid = 0;
+
+	if (!spa_is_root(spa) && nvlist_lookup_uint64(mos_config,
+	    ZPOOL_CONFIG_HOSTID, &hostid) == 0) {
+		hostname = fnvlist_lookup_string(mos_config,
+		    ZPOOL_CONFIG_HOSTNAME);
+
+		myhostid = zone_get_hostid(NULL);
+
+		if (hostid != 0 && myhostid != 0 && hostid != myhostid) {
+			cmn_err(CE_WARN, "pool '%s' could not be "
+			    "loaded as it was last accessed by "
+			    "another system (host: %s hostid: 0x%llx). "
+			    "See: http://illumos.org/msg/ZFS-8000-EY",
+			    spa_name(spa), hostname, (u_longlong_t)hostid);
+			spa_load_failed(spa, "hostid verification failed: pool "
+			    "last accessed by host: %s (hostid: 0x%llx)",
+			    hostname, (u_longlong_t)hostid);
+			return (SET_ERROR(EBADF));
+		}
+	}
+
+	return (0);
+}
+
+static int
+spa_ld_parse_config(spa_t *spa, spa_import_type_t type)
+{
+	int error = 0;
+	nvlist_t *nvtree, *nvl, *config = spa->spa_config;
+	int parse;
+	vdev_t *rvd;
+	uint64_t pool_guid;
+	char *comment;
+
+	/*
+	 * Versioning wasn't explicitly added to the label until later, so if
+	 * it's not present treat it as the initial version.
+	 */
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION,
+	    &spa->spa_ubsync.ub_version) != 0)
+		spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL;
+
+	if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) {
+		spa_load_failed(spa, "invalid config provided: '%s' missing",
+		    ZPOOL_CONFIG_POOL_GUID);
+		return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * If we are doing an import, ensure that the pool is not already
+	 * imported by checking if its pool guid already exists in the
+	 * spa namespace.
+ * + * The only case that we allow an already imported pool to be + * imported again, is when the pool is checkpointed and we want to + * look at its checkpointed state from userland tools like zdb. + */ +#ifdef _KERNEL + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { +#else + if ((spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0) && + !spa_importing_readonly_checkpoint(spa)) { +#endif + spa_load_failed(spa, "a pool with guid %llu is already open", + (u_longlong_t)pool_guid); + return (SET_ERROR(EEXIST)); + } + + spa->spa_config_guid = pool_guid; + + nvlist_free(spa->spa_load_info); + spa->spa_load_info = fnvlist_alloc(); + + ASSERT(spa->spa_comment == NULL); + if (nvlist_lookup_string(config, ZPOOL_CONFIG_COMMENT, &comment) == 0) + spa->spa_comment = spa_strdup(comment); + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) == 0) + spa->spa_config_splitting = fnvlist_dup(nvl); + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtree)) { + spa_load_failed(spa, "invalid config provided: '%s' missing", + ZPOOL_CONFIG_VDEV_TREE); + return (SET_ERROR(EINVAL)); + } + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + error = spa_config_parse(spa, &rvd, nvtree, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "unable to parse config [error=%d]", + error); + return (error); + } + + ASSERT(spa->spa_root_vdev == rvd); + ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, <=, SPA_MAXBLOCKSHIFT); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + return (0); +} + +/* + * Recursively open all vdevs in the vdev tree. This function is called twice: + * first with the untrusted config, then with the trusted config. + */ +static int +spa_ld_open_vdevs(spa_t *spa) +{ + int error = 0; + + /* + * spa_missing_tvds_allowed defines how many top-level vdevs can be + * missing/unopenable for the root vdev to be still considered openable. 
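+	 * The effective limit computed below is the larger of
+	 * zfs_max_missing_tvds and the tunable matching the config source
+	 * (cachefile- and scan-provided configs have their own settings).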
+ */ + if (spa->spa_trust_config) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_CACHEFILE) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_cachefile; + } else if (spa->spa_config_source == SPA_CONFIG_SRC_SCAN) { + spa->spa_missing_tvds_allowed = zfs_max_missing_tvds_scan; + } else { + spa->spa_missing_tvds_allowed = 0; + } + + spa->spa_missing_tvds_allowed = + MAX(zfs_max_missing_tvds, spa->spa_missing_tvds_allowed); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(spa->spa_root_vdev); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "vdev tree has %lld missing top-level " + "vdevs.", (u_longlong_t)spa->spa_missing_tvds); + if (spa->spa_trust_config && (spa->spa_mode & SPA_MODE_WRITE)) { + /* + * Although theoretically we could allow users to open + * incomplete pools in RW mode, we'd need to add a lot + * of extra logic (e.g. adjust pool space to account + * for missing vdevs). + * This limitation also prevents users from accidentally + * opening the pool in RW mode during data recovery and + * damaging it further. + */ + spa_load_note(spa, "pools with missing top-level " + "vdevs can only be opened in read-only mode."); + error = SET_ERROR(ENXIO); + } else { + spa_load_note(spa, "current settings allow for maximum " + "%lld missing top-level vdevs at this stage.", + (u_longlong_t)spa->spa_missing_tvds_allowed); + } + } + if (error != 0) { + spa_load_failed(spa, "unable to open vdev tree [error=%d]", + error); + } + if (spa->spa_missing_tvds != 0 || error != 0) + vdev_dbgmsg_print_tree(spa->spa_root_vdev, 2); + + return (error); +} + +/* + * We need to validate the vdev labels against the configuration that + * we have in hand. This function is called twice: first with an untrusted + * config, then with a trusted config. The validation is more strict when the + * config is trusted. + */ +static int +spa_ld_validate_vdevs(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "vdev_validate failed [error=%d]", error); + return (error); + } + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { + spa_load_failed(spa, "cannot open vdev tree after invalidating " + "some vdevs"); + vdev_dbgmsg_print_tree(rvd, 2); + return (SET_ERROR(ENXIO)); + } + + return (0); +} + +static void +spa_ld_select_uberblock_done(spa_t *spa, uberblock_t *ub) +{ + spa->spa_state = POOL_STATE_ACTIVE; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + spa->spa_prev_software_version = ub->ub_software_version; +} + +static int +spa_ld_select_uberblock(spa_t *spa, spa_import_type_t type) +{ + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *label; + uberblock_t *ub = &spa->spa_uberblock; + boolean_t activity_check = B_FALSE; + + /* + * If we are opening the checkpointed state of the pool by + * rewinding to it, at this point we will have written the + * checkpointed uberblock to the vdev labels, so searching + * the labels will find the right uberblock. 
However, if + * we are opening the checkpointed state read-only, we have + * not modified the labels. Therefore, we must ignore the + * labels and continue using the spa_uberblock that was set + * by spa_ld_checkpoint_rewind. + * + * Note that it would be fine to ignore the labels when + * rewinding (opening writeable) as well. However, if we + * crash just after writing the labels, we will end up + * searching the labels. Doing so in the common case means + * that this code path gets exercised normally, rather than + * just in the edge case. + */ + if (ub->ub_checkpoint_txg != 0 && + spa_importing_readonly_checkpoint(spa)) { + spa_ld_select_uberblock_done(spa, ub); + return (0); + } + + /* + * Find the best uberblock. + */ + vdev_uberblock_load(rvd, ub, &label); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) { + nvlist_free(label); + spa_load_failed(spa, "no valid uberblock found"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + } + + if (spa->spa_load_max_txg != UINT64_MAX) { + (void) spa_import_progress_set_max_txg(spa_guid(spa), + (u_longlong_t)spa->spa_load_max_txg); + } + spa_load_note(spa, "using uberblock with txg=%llu", + (u_longlong_t)ub->ub_txg); + + + /* + * For pools which have the multihost property on determine if the + * pool is truly inactive and can be safely imported. Prevent + * hosts which don't have a hostid set from importing the pool. + */ + activity_check = spa_activity_check_required(spa, ub, label, + spa->spa_config); + if (activity_check) { + if (ub->ub_mmp_magic == MMP_MAGIC && ub->ub_mmp_delay && + spa_get_hostid(spa) == 0) { + nvlist_free(label); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } + + int error = spa_activity_check(spa, ub, spa->spa_config); + if (error) { + nvlist_free(label); + return (error); + } + + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_INACTIVE); + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_TXG, ub->ub_txg); + fnvlist_add_uint16(spa->spa_load_info, + ZPOOL_CONFIG_MMP_SEQ, + (MMP_SEQ_VALID(ub) ? MMP_SEQ(ub) : 0)); + } + + /* + * If the pool has an unsupported version we can't open it. + */ + if (!SPA_VERSION_IS_SUPPORTED(ub->ub_version)) { + nvlist_free(label); + spa_load_failed(spa, "version %llu is not supported", + (u_longlong_t)ub->ub_version); + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + } + + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *features; + + /* + * If we weren't able to find what's necessary for reading the + * MOS in the label, return failure. + */ + if (label == NULL) { + spa_load_failed(spa, "label config unavailable"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) != 0) { + nvlist_free(label); + spa_load_failed(spa, "invalid label: '%s' missing", + ZPOOL_CONFIG_FEATURES_FOR_READ); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + ENXIO)); + } + + /* + * Update our in-core representation with the definitive values + * from the label. + */ + nvlist_free(spa->spa_label_features); + VERIFY(nvlist_dup(features, &spa->spa_label_features, 0) == 0); + } + + nvlist_free(label); + + /* + * Look through entries in the label nvlist's features_for_read. If + * there is a feature listed there which we don't understand then we + * cannot open a pool. 
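+	 * (Entries are keyed by feature guid, e.g.
+	 * "com.delphix:embedded_data"; zfeature_is_supported() checks each
+	 * name against the feature table compiled into this version of zfs.)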
+ */ + if (ub->ub_version >= SPA_VERSION_FEATURES) { + nvlist_t *unsup_feat; + + VERIFY(nvlist_alloc(&unsup_feat, NV_UNIQUE_NAME, KM_SLEEP) == + 0); + + for (nvpair_t *nvp = nvlist_next_nvpair(spa->spa_label_features, + NULL); nvp != NULL; + nvp = nvlist_next_nvpair(spa->spa_label_features, nvp)) { + if (!zfeature_is_supported(nvpair_name(nvp))) { + VERIFY(nvlist_add_string(unsup_feat, + nvpair_name(nvp), "") == 0); + } + } + + if (!nvlist_empty(unsup_feat)) { + VERIFY(nvlist_add_nvlist(spa->spa_load_info, + ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat) == 0); + nvlist_free(unsup_feat); + spa_load_failed(spa, "some features are unsupported"); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + nvlist_free(unsup_feat); + } + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, spa->spa_config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + } + + /* + * Initialize internal SPA structures. + */ + spa_ld_select_uberblock_done(spa, ub); + + return (0); +} + +static int +spa_ld_open_rootbp(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = dsl_pool_init(spa, spa->spa_first_txg, &spa->spa_dsl_pool); + if (error != 0) { + spa_load_failed(spa, "unable to open rootbp in dsl_pool_init " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; + + return (0); +} + +static int +spa_ld_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t reloading) +{ + vdev_t *mrvd, *rvd = spa->spa_root_vdev; + nvlist_t *nv, *mos_config, *policy; + int error = 0, copy_error; + uint64_t healthy_tvds, healthy_tvds_mos; + uint64_t mos_config_txg; + + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object, B_TRUE) + != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling a pool from a split, the config provided is + * already trusted so there is nothing to do. + */ + if (type == SPA_IMPORT_ASSEMBLE) + return (0); + + healthy_tvds = spa_healthy_core_tvds(spa); + + if (load_nvlist(spa, spa->spa_config_object, &mos_config) + != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * If we are doing an open, pool owner wasn't verified yet, thus do + * the verification here. + */ + if (spa->spa_load_state == SPA_LOAD_OPEN) { + error = spa_verify_host(spa, mos_config); + if (error != 0) { + nvlist_free(mos_config); + return (error); + } + } + + nv = fnvlist_lookup_nvlist(mos_config, ZPOOL_CONFIG_VDEV_TREE); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Build a new vdev tree from the trusted config + */ + VERIFY(spa_config_parse(spa, &mrvd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + /* + * Vdev paths in the MOS may be obsolete. If the untrusted config was + * obtained by scanning /dev/dsk, then it will have the right vdev + * paths. We update the trusted MOS config with this information. + * We first try to copy the paths with vdev_copy_path_strict, which + * succeeds only when both configs have exactly the same vdev tree. + * If that fails, we fall back to a more flexible method that has a + * best effort policy. 
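+	 * (The relaxed fallback matches vdevs up by guid wherever it can, so
+	 * path information is preserved even when the two trees no longer
+	 * have the same shape.)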
+ */ + copy_error = vdev_copy_path_strict(rvd, mrvd); + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "provided vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + spa_load_note(spa, "MOS vdev tree:"); + vdev_dbgmsg_print_tree(mrvd, 2); + } + if (copy_error != 0) { + spa_load_note(spa, "vdev_copy_path_strict failed, falling " + "back to vdev_copy_path_relaxed"); + vdev_copy_path_relaxed(rvd, mrvd); + } + + vdev_close(rvd); + vdev_free(rvd); + spa->spa_root_vdev = mrvd; + rvd = mrvd; + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * We will use spa_config if we decide to reload the spa or if spa_load + * fails and we rewind. We must thus regenerate the config using the + * MOS information with the updated paths. ZPOOL_LOAD_POLICY is used to + * pass settings on how to load the pool and is not stored in the MOS. + * We copy it over to our new, trusted config. + */ + mos_config_txg = fnvlist_lookup_uint64(mos_config, + ZPOOL_CONFIG_POOL_TXG); + nvlist_free(mos_config); + mos_config = spa_config_generate(spa, NULL, mos_config_txg, B_FALSE); + if (nvlist_lookup_nvlist(spa->spa_config, ZPOOL_LOAD_POLICY, + &policy) == 0) + fnvlist_add_nvlist(mos_config, ZPOOL_LOAD_POLICY, policy); + spa_config_set(spa, mos_config); + spa->spa_config_source = SPA_CONFIG_SRC_MOS; + + /* + * Now that we got the config from the MOS, we should be more strict + * in checking blkptrs and can make assumptions about the consistency + * of the vdev tree. spa_trust_config must be set to true before opening + * vdevs in order for them to be writeable. + */ + spa->spa_trust_config = B_TRUE; + + /* + * Open and validate the new vdev tree + */ + error = spa_ld_open_vdevs(spa); + if (error != 0) + return (error); + + error = spa_ld_validate_vdevs(spa); + if (error != 0) + return (error); + + if (copy_error != 0 || spa_load_print_vdev_tree) { + spa_load_note(spa, "final vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + } + + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT && + !spa->spa_extreme_rewind && zfs_max_missing_tvds == 0) { + /* + * Sanity check to make sure that we are indeed loading the + * latest uberblock. If we missed SPA_SYNC_MIN_VDEVS tvds + * in the config provided and they happened to be the only ones + * to have the latest uberblock, we could involuntarily perform + * an extreme rewind. + */ + healthy_tvds_mos = spa_healthy_core_tvds(spa); + if (healthy_tvds_mos - healthy_tvds >= + SPA_SYNC_MIN_VDEVS) { + spa_load_note(spa, "config provided misses too many " + "top-level vdevs compared to MOS (%lld vs %lld). ", + (u_longlong_t)healthy_tvds, + (u_longlong_t)healthy_tvds_mos); + spa_load_note(spa, "vdev tree:"); + vdev_dbgmsg_print_tree(rvd, 2); + if (reloading) { + spa_load_failed(spa, "config was already " + "provided from MOS. 
Aborting.");
+				return (spa_vdev_err(rvd,
+				    VDEV_AUX_CORRUPT_DATA, EIO));
+			}
+			spa_load_note(spa, "spa must be reloaded using MOS "
+			    "config");
+			return (SET_ERROR(EAGAIN));
+		}
+	}
+
+	error = spa_check_for_missing_logs(spa);
+	if (error != 0)
+		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO));
+
+	if (rvd->vdev_guid_sum != spa->spa_uberblock.ub_guid_sum) {
+		spa_load_failed(spa, "uberblock guid sum doesn't match MOS "
+		    "guid sum (%llu != %llu)",
+		    (u_longlong_t)spa->spa_uberblock.ub_guid_sum,
+		    (u_longlong_t)rvd->vdev_guid_sum);
+		return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM,
+		    ENXIO));
+	}
+
+	return (0);
+}
+
+static int
+spa_ld_open_indirect_vdev_metadata(spa_t *spa)
+{
+	int error = 0;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	/*
+	 * Everything that we read before spa_remove_init() must be stored
+	 * on concrete vdevs. Therefore we do this as early as possible.
+	 */
+	error = spa_remove_init(spa);
+	if (error != 0) {
+		spa_load_failed(spa, "spa_remove_init failed [error=%d]",
+		    error);
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+	}
+
+	/*
+	 * Retrieve information needed to condense indirect vdev mappings.
+	 */
+	error = spa_condense_init(spa);
+	if (error != 0) {
+		spa_load_failed(spa, "spa_condense_init failed [error=%d]",
+		    error);
+		return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error));
+	}
+
+	return (0);
+}
+
+static int
+spa_ld_check_features(spa_t *spa, boolean_t *missing_feat_writep)
+{
+	int error = 0;
+	vdev_t *rvd = spa->spa_root_vdev;
+
+	if (spa_version(spa) >= SPA_VERSION_FEATURES) {
+		boolean_t missing_feat_read = B_FALSE;
+		nvlist_t *unsup_feat, *enabled_feat;
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_READ,
+		    &spa->spa_feat_for_read_obj, B_TRUE) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURES_FOR_WRITE,
+		    &spa->spa_feat_for_write_obj, B_TRUE) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		if (spa_dir_prop(spa, DMU_POOL_FEATURE_DESCRIPTIONS,
+		    &spa->spa_feat_desc_obj, B_TRUE) != 0) {
+			return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO));
+		}
+
+		enabled_feat = fnvlist_alloc();
+		unsup_feat = fnvlist_alloc();
+
+		if (!spa_features_check(spa, B_FALSE,
+		    unsup_feat, enabled_feat))
+			missing_feat_read = B_TRUE;
+
+		if (spa_writeable(spa) ||
+		    spa->spa_load_state == SPA_LOAD_TRYIMPORT) {
+			if (!spa_features_check(spa, B_TRUE,
+			    unsup_feat, enabled_feat)) {
+				*missing_feat_writep = B_TRUE;
+			}
+		}
+
+		fnvlist_add_nvlist(spa->spa_load_info,
+		    ZPOOL_CONFIG_ENABLED_FEAT, enabled_feat);
+
+		if (!nvlist_empty(unsup_feat)) {
+			fnvlist_add_nvlist(spa->spa_load_info,
+			    ZPOOL_CONFIG_UNSUP_FEAT, unsup_feat);
+		}
+
+		fnvlist_free(enabled_feat);
+		fnvlist_free(unsup_feat);
+
+		if (!missing_feat_read) {
+			fnvlist_add_boolean(spa->spa_load_info,
+			    ZPOOL_CONFIG_CAN_RDONLY);
+		}
+
+		/*
+		 * If the state is SPA_LOAD_TRYIMPORT, our objective is
+		 * twofold: to determine whether the pool is available for
+		 * import in read-write mode and (if it is not) whether the
+		 * pool is available for import in read-only mode. If the pool
+		 * is available for import in read-write mode, it is displayed
+		 * as available in userland; if it is not available for import
+		 * in read-only mode, it is displayed as unavailable in
+		 * userland. If the pool is available for import in read-only
+		 * mode but not read-write mode, it is displayed as unavailable
+		 * in userland with a special note that the pool is actually
+		 * available for open in read-only mode.
+ * + * As a result, if the state is SPA_LOAD_TRYIMPORT and we are + * missing a feature for write, we must first determine whether + * the pool can be opened read-only before returning to + * userland in order to know whether to display the + * abovementioned note. + */ + if (missing_feat_read || (*missing_feat_writep && + spa_writeable(spa))) { + spa_load_failed(spa, "pool uses unsupported features"); + return (spa_vdev_err(rvd, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + /* + * Load refcounts for ZFS features from disk into an in-memory + * cache during SPA initialization. + */ + for (spa_feature_t i = 0; i < SPA_FEATURES; i++) { + uint64_t refcount; + + error = feature_get_refcount_from_disk(spa, + &spa_feature_table[i], &refcount); + if (error == 0) { + spa->spa_feat_refcount_cache[i] = refcount; + } else if (error == ENOTSUP) { + spa->spa_feat_refcount_cache[i] = + SPA_FEATURE_DISABLED; + } else { + spa_load_failed(spa, "error getting refcount " + "for feature %s [error=%d]", + spa_feature_table[i].fi_guid, error); + return (spa_vdev_err(rvd, + VDEV_AUX_CORRUPT_DATA, EIO)); + } + } + } + + if (spa_feature_is_active(spa, SPA_FEATURE_ENABLED_TXG)) { + if (spa_dir_prop(spa, DMU_POOL_FEATURE_ENABLED_TXG, + &spa->spa_feat_enabled_txg_obj, B_TRUE) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * Encryption was added before bookmark_v2, even though bookmark_v2 + * is now a dependency. If this pool has encryption enabled without + * bookmark_v2, trigger an errata message. + */ + if (spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && + !spa_feature_is_enabled(spa, SPA_FEATURE_BOOKMARK_V2)) { + spa->spa_errata = ZPOOL_ERRATA_ZOL_8308_ENCRYPTION; + } + + return (0); +} + +static int +spa_ld_load_special_directories(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + spa->spa_is_initializing = B_TRUE; + error = dsl_pool_open(spa->spa_dsl_pool); + spa->spa_is_initializing = B_FALSE; + if (error != 0) { + spa_load_failed(spa, "dsl_pool_open failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_get_props(spa_t *spa) +{ + int error = 0; + uint64_t obj; + vdev_t *rvd = spa->spa_root_vdev; + + /* Grab the checksum salt from the MOS. */ + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CHECKSUM_SALT, 1, + sizeof (spa->spa_cksum_salt.zcs_bytes), + spa->spa_cksum_salt.zcs_bytes); + if (error == ENOENT) { + /* Generate a new salt for subsequent use */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + } else if (error != 0) { + spa_load_failed(spa, "unable to retrieve checksum salt from " + "MOS [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPOBJ, &obj, B_TRUE) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + error = bpobj_open(&spa->spa_deferred_bpobj, spa->spa_meta_objset, obj); + if (error != 0) { + spa_load_failed(spa, "error opening deferred-frees bpobj " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + /* + * Load the bit that tells us to use the new accounting function + * (raid-z deflation). If we have an older pool, this will not + * be present. 
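+	 * For this and the other legacy directory objects read below, ENOENT
+	 * is tolerated: only a real lookup error fails the load.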
+ */ + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + error = spa_dir_prop(spa, DMU_POOL_CREATION_VERSION, + &spa->spa_creation_version, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the persistent error log. If we have an older pool, this will + * not be present. + */ + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last, + B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, + &spa->spa_errlog_scrub, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the livelist deletion field. If a livelist is queued for + * deletion, indicate that in the spa + */ + error = spa_dir_prop(spa, DMU_POOL_DELETED_CLONES, + &spa->spa_livelists_to_delete, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the history object. If we have an older pool, this + * will not be present. + */ + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * Load the per-vdev ZAP map. If we have an older pool, this will not + * be present; in this case, defer its creation to a later time to + * avoid dirtying the MOS this early / out of sync context. See + * spa_sync_config_object. + */ + + /* The sentinel is only available in the MOS config. */ + nvlist_t *mos_config; + if (load_nvlist(spa, spa->spa_config_object, &mos_config) != 0) { + spa_load_failed(spa, "unable to retrieve MOS config"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + error = spa_dir_prop(spa, DMU_POOL_VDEV_ZAP_MAP, + &spa->spa_all_vdev_zaps, B_FALSE); + + if (error == ENOENT) { + VERIFY(!nvlist_exists(mos_config, + ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); + spa->spa_avz_action = AVZ_ACTION_INITIALIZE; + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } else if (error != 0) { + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } else if (!nvlist_exists(mos_config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)) { + /* + * An older version of ZFS overwrote the sentinel value, so + * we have orphaned per-vdev ZAPs in the MOS. Defer their + * destruction to later; see spa_sync_config_object. + */ + spa->spa_avz_action = AVZ_ACTION_DESTROY; + /* + * We're assuming that no vdevs have had their ZAPs created + * before this. Better be sure of it. 
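+		 * (vdev_count_verify_zaps() returns the number of per-vdev
+		 * ZAPs in the tree, so under ZFS_DEBUG the ASSERT0 below
+		 * verifies that none exist yet.)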
+ */ + ASSERT0(vdev_count_verify_zaps(spa->spa_root_vdev)); + } + nvlist_free(mos_config); + + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object, + B_FALSE); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (error == 0) { + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_MULTIHOST, &spa->spa_multihost); + spa_prop_find(spa, ZPOOL_PROP_AUTOTRIM, &spa->spa_autotrim); + spa->spa_autoreplace = (autoreplace != 0); + } + + /* + * If we are importing a pool with missing top-level vdevs, + * we enforce that the pool doesn't panic or get suspended on + * error since the likelihood of missing data is extremely high. + */ + if (spa->spa_missing_tvds > 0 && + spa->spa_failmode != ZIO_FAILURE_MODE_CONTINUE && + spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_load_note(spa, "forcing failmode to 'continue' " + "as some top level vdevs are missing"); + spa->spa_failmode = ZIO_FAILURE_MODE_CONTINUE; + } + + return (0); +} + +static int +spa_ld_open_aux_vdevs(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * If we're assembling the pool from the split-off vdevs of + * an existing pool, we don't want to attach the spares & cache + * devices. + */ + + /* + * Load any hot spares for this pool. + */ + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object, + B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); + if (load_nvlist(spa, spa->spa_spares.sav_object, + &spa->spa_spares.sav_config) != 0) { + spa_load_failed(spa, "error loading spares nvlist"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Load any level 2 ARC devices for this pool. + */ + error = spa_dir_prop(spa, DMU_POOL_L2CACHE, + &spa->spa_l2cache.sav_object, B_FALSE); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); + if (load_nvlist(spa, spa->spa_l2cache.sav_object, + &spa->spa_l2cache.sav_config) != 0) { + spa_load_failed(spa, "error loading l2cache nvlist"); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_l2cache.sav_sync = B_TRUE; + } + + return (0); +} + +static int +spa_ld_load_vdev_metadata(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * If the 'multihost' property is set, then never allow a pool to + * be imported when the system hostid is zero. The exception to + * this rule is zdb which is always allowed to access pools. 
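+	 * (A host with no hostid can generate one with zgenhostid(8) and
+	 * then retry the import.)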
+ */ + if (spa_multihost(spa) && spa_get_hostid(spa) == 0 && + (spa->spa_import_flags & ZFS_IMPORT_SKIP_MMP) == 0) { + fnvlist_add_uint64(spa->spa_load_info, + ZPOOL_CONFIG_MMP_STATE, MMP_STATE_NO_HOSTID); + return (spa_vdev_err(rvd, VDEV_AUX_ACTIVE, EREMOTEIO)); + } + + /* + * If the 'autoreplace' property is set, then post a resource notifying + * the ZFS DE that it should not issue any faults for unopenable + * devices. We also iterate over the vdevs, and post a sysevent for any + * unopenable vdevs so that the normal autoreplace handler can take + * over. + */ + if (spa->spa_autoreplace && spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + spa_check_removed(spa->spa_root_vdev); + /* + * For the import case, this is done in spa_import(), because + * at this point we're using the spare definitions from + * the MOS config, not necessarily from the userland config. + */ + if (spa->spa_load_state != SPA_LOAD_IMPORT) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + } + + /* + * Load the vdev metadata such as metaslabs, DTLs, spacemap object, etc. + */ + error = vdev_load(rvd); + if (error != 0) { + spa_load_failed(spa, "vdev_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + error = spa_ld_log_spacemaps(spa); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data failed [error=%d]", + error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, error)); + } + + /* + * Propagate the leaf DTLs we just loaded all the way up the vdev tree. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_dtl_reassess(rvd, 0, 0, B_FALSE, B_FALSE); + spa_config_exit(spa, SCL_ALL, FTAG); + + return (0); +} + +static int +spa_ld_load_dedup_tables(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + error = ddt_load(spa); + if (error != 0) { + spa_load_failed(spa, "ddt_load failed [error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + } + + return (0); +} + +static int +spa_ld_verify_logs(spa_t *spa, spa_import_type_t type, char **ereport) +{ + vdev_t *rvd = spa->spa_root_vdev; + + if (type != SPA_IMPORT_ASSEMBLE && spa_writeable(spa)) { + boolean_t missing = spa_check_logs(spa); + if (missing) { + if (spa->spa_missing_tvds != 0) { + spa_load_note(spa, "spa_check_logs failed " + "so dropping the logs"); + } else { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + spa_load_failed(spa, "spa_check_logs failed"); + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, + ENXIO)); + } + } + } + + return (0); +} + +static int +spa_ld_verify_pool_data(spa_t *spa) +{ + int error = 0; + vdev_t *rvd = spa->spa_root_vdev; + + /* + * We've successfully opened the pool, verify that we're ready + * to start pushing transactions. + */ + if (spa->spa_load_state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error != 0) { + spa_load_failed(spa, "spa_load_verify failed " + "[error=%d]", error); + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + } + + return (0); +} + +static void +spa_ld_claim_log_blocks(spa_t *spa) +{ + dmu_tx_t *tx; + dsl_pool_t *dp = spa_get_dsl(spa); + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. + * Note: spa_claim_max_txg is updated by spa_claim_notify(), + * invoked from zil_claim_log_block()'s i/o done callback. + * Price of rollback is that we abandon the log. 
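+	 * zil_claim() runs once per dataset via the dmu_objset_find_dp()
+	 * call below, with every claim charged to the same assigned txg.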
+ */ + spa->spa_claiming = B_TRUE; + + tx = dmu_tx_create_assigned(dp, spa_first_txg(spa)); + (void) dmu_objset_find_dp(dp, dp->dp_root_dir_obj, + zil_claim, tx, DS_FIND_CHILDREN); + dmu_tx_commit(tx); + + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); +} + +static void +spa_ld_check_for_config_update(spa_t *spa, uint64_t config_cache_txg, + boolean_t update_config_cache) +{ + vdev_t *rvd = spa->spa_root_vdev; + int need_update = B_FALSE; + + /* + * If the config cache is stale, or we have uninitialized + * metaslabs (see spa_vdev_add()), then update the config. + * + * If this is a verbatim import, trust the current + * in-core spa_config and update the disk labels. + */ + if (update_config_cache || config_cache_txg != spa->spa_config_txg || + spa->spa_load_state == SPA_LOAD_IMPORT || + spa->spa_load_state == SPA_LOAD_RECOVER || + (spa->spa_import_flags & ZFS_IMPORT_VERBATIM)) + need_update = B_TRUE; + + for (int c = 0; c < rvd->vdev_children; c++) + if (rvd->vdev_child[c]->vdev_ms_array == 0) + need_update = B_TRUE; + + /* + * Update the config cache asynchronously in case we're the + * root pool, in which case the config cache isn't writable yet. + */ + if (need_update) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + +static void +spa_ld_prepare_for_reload(spa_t *spa) +{ + spa_mode_t mode = spa->spa_mode; + int async_suspended = spa->spa_async_suspended; + + spa_unload(spa); + spa_deactivate(spa); + spa_activate(spa, mode); + + /* + * We save the value of spa_async_suspended as it gets reset to 0 by + * spa_unload(). We want to restore it back to the original value before + * returning as we might be calling spa_async_resume() later. + */ + spa->spa_async_suspended = async_suspended; +} + +static int +spa_ld_read_checkpoint_txg(spa_t *spa) +{ + uberblock_t checkpoint; + int error = 0; + + ASSERT0(spa->spa_checkpoint_txg); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t), + sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint); + + if (error == ENOENT) + return (0); + + if (error != 0) + return (error); + + ASSERT3U(checkpoint.ub_txg, !=, 0); + ASSERT3U(checkpoint.ub_checkpoint_txg, !=, 0); + ASSERT3U(checkpoint.ub_timestamp, !=, 0); + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + return (0); +} + +static int +spa_ld_mos_init(spa_t *spa, spa_import_type_t type) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); + + /* + * Never trust the config that is provided unless we are assembling + * a pool following a split. + * This means don't trust blkptrs and the vdev tree in general. This + * also effectively puts the spa in read-only mode since + * spa_writeable() checks for spa_trust_config to be true. + * We will later load a trusted config from the MOS. + */ + if (type != SPA_IMPORT_ASSEMBLE) + spa->spa_trust_config = B_FALSE; + + /* + * Parse the config provided to create a vdev tree. + */ + error = spa_ld_parse_config(spa, type); + if (error != 0) + return (error); + + spa_import_progress_add(spa); + + /* + * Now that we have the vdev tree, try to open each vdev. This involves + * opening the underlying physical device, retrieving its geometry and + * probing the vdev with a dummy I/O. The state of each vdev will be set + * based on the success of those operations. 
After this we'll be ready
+	 * to read from the vdevs.
+	 */
+	error = spa_ld_open_vdevs(spa);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Read the label of each vdev and make sure that the GUIDs stored
+	 * there match the GUIDs in the config provided.
+	 * If we're assembling a new pool that's been split off from an
+	 * existing pool, the labels haven't yet been updated so we skip
+	 * validation for now.
+	 */
+	if (type != SPA_IMPORT_ASSEMBLE) {
+		error = spa_ld_validate_vdevs(spa);
+		if (error != 0)
+			return (error);
+	}
+
+	/*
+	 * Read all vdev labels to find the best uberblock (i.e. latest,
+	 * unless spa_load_max_txg is set) and store it in spa_uberblock. We
+	 * get the list of features required to read blkptrs in the MOS from
+	 * the vdev label with the best uberblock and verify that our version
+	 * of zfs supports them all.
+	 */
+	error = spa_ld_select_uberblock(spa, type);
+	if (error != 0)
+		return (error);
+
+	/*
+	 * Pass that uberblock to the dsl_pool layer which will open the root
+	 * blkptr. This blkptr points to the latest version of the MOS and will
+	 * allow us to read its contents.
+	 */
+	error = spa_ld_open_rootbp(spa);
+	if (error != 0)
+		return (error);
+
+	return (0);
+}
+
+static int
+spa_ld_checkpoint_rewind(spa_t *spa)
+{
+	uberblock_t checkpoint;
+	int error = 0;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+
+	error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ZPOOL_CHECKPOINT, sizeof (uint64_t),
+	    sizeof (uberblock_t) / sizeof (uint64_t), &checkpoint);
+
+	if (error != 0) {
+		spa_load_failed(spa, "unable to retrieve checkpointed "
+		    "uberblock from the MOS config [error=%d]", error);
+
+		if (error == ENOENT)
+			error = ZFS_ERR_NO_CHECKPOINT;
+
+		return (error);
+	}
+
+	ASSERT3U(checkpoint.ub_txg, <, spa->spa_uberblock.ub_txg);
+	ASSERT3U(checkpoint.ub_txg, ==, checkpoint.ub_checkpoint_txg);
+
+	/*
+	 * We need to update the txg and timestamp of the checkpointed
+	 * uberblock to be higher than the latest one. This ensures that
+	 * the checkpointed uberblock is selected if we were to close and
+	 * reopen the pool right after we've written it in the vdev labels.
+	 * (also see block comment in vdev_uberblock_compare)
+	 */
+	checkpoint.ub_txg = spa->spa_uberblock.ub_txg + 1;
+	checkpoint.ub_timestamp = gethrestime_sec();
+
+	/*
+	 * Set current uberblock to be the checkpointed uberblock.
+	 */
+	spa->spa_uberblock = checkpoint;
+
+	/*
+	 * If we are doing a normal rewind, then the pool is open for
+	 * writing and we sync the "updated" checkpointed uberblock to
+	 * disk. Once this is done, we've basically rewound the whole
+	 * pool and there is no way back.
+	 *
+	 * There are cases when we don't want to attempt to sync the
+	 * checkpointed uberblock to disk because we are opening a
+	 * pool as read-only. Specifically, verifying the checkpointed
+	 * state with zdb, and importing the checkpointed state to get
+	 * a "preview" of its content.
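+	 * (Both read-only paths reach this function with spa_writeable()
+	 * false, so the label write below is skipped and the rewind is
+	 * never made permanent on disk.)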
+ */ + if (spa_writeable(spa)) { + vdev_t *rvd = spa->spa_root_vdev; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + + for (int c = 0; c < children; c++) { + vdev_t *vd = rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || vd->vdev_islog || + !vdev_is_concrete(vd)) + continue; + + svd[svdcount++] = vd; + if (svdcount == SPA_SYNC_MIN_VDEVS) + break; + } + error = vdev_config_sync(svd, svdcount, spa->spa_first_txg); + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_load_failed(spa, "failed to write checkpointed " + "uberblock to the vdev labels [error=%d]", error); + return (error); + } + } + + return (0); +} + +static int +spa_ld_mos_with_trusted_config(spa_t *spa, spa_import_type_t type, + boolean_t *update_config_cache) +{ + int error; + + /* + * Parse the config for pool, open and validate vdevs, + * select an uberblock, and use that uberblock to open + * the MOS. + */ + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + + /* + * Retrieve the trusted config stored in the MOS and use it to create + * a new, exact version of the vdev tree, then reopen all vdevs. + */ + error = spa_ld_trusted_config(spa, type, B_FALSE); + if (error == EAGAIN) { + if (update_config_cache != NULL) + *update_config_cache = B_TRUE; + + /* + * Redo the loading process with the trusted config if it is + * too different from the untrusted config. + */ + spa_ld_prepare_for_reload(spa); + spa_load_note(spa, "RELOADING"); + error = spa_ld_mos_init(spa, type); + if (error != 0) + return (error); + + error = spa_ld_trusted_config(spa, type, B_TRUE); + if (error != 0) + return (error); + + } else if (error != 0) { + return (error); + } + + return (0); +} + +/* + * Load an existing storage pool, using the config provided. This config + * describes which vdevs are part of the pool and is later validated against + * partial configs present in each vdev's label and an entire copy of the + * config stored in the MOS. + */ +static int +spa_load_impl(spa_t *spa, spa_import_type_t type, char **ereport) +{ + int error = 0; + boolean_t missing_feat_write = B_FALSE; + boolean_t checkpoint_rewind = + (spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT); + boolean_t update_config_cache = B_FALSE; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa->spa_config_source != SPA_CONFIG_SRC_NONE); + + spa_load_note(spa, "LOADING"); + + error = spa_ld_mos_with_trusted_config(spa, type, &update_config_cache); + if (error != 0) + return (error); + + /* + * If we are rewinding to the checkpoint then we need to repeat + * everything we've done so far in this function but this time + * selecting the checkpointed uberblock and using that to open + * the MOS. + */ + if (checkpoint_rewind) { + /* + * If we are rewinding to the checkpoint update config cache + * anyway. + */ + update_config_cache = B_TRUE; + + /* + * Extract the checkpointed uberblock from the current MOS + * and use this as the pool's uberblock from now on. If the + * pool is imported as writeable we also write the checkpoint + * uberblock to the labels, making the rewind permanent. 
+ */ + error = spa_ld_checkpoint_rewind(spa); + if (error != 0) + return (error); + + /* + * Redo the loading process again with the + * checkpointed uberblock. + */ + spa_ld_prepare_for_reload(spa); + spa_load_note(spa, "LOADING checkpointed uberblock"); + error = spa_ld_mos_with_trusted_config(spa, type, NULL); + if (error != 0) + return (error); + } + + /* + * Retrieve the checkpoint txg if the pool has a checkpoint. + */ + error = spa_ld_read_checkpoint_txg(spa); + if (error != 0) + return (error); + + /* + * Retrieve the mapping of indirect vdevs. Those vdevs were removed + * from the pool and their contents were re-mapped to other vdevs. Note + * that everything that we read before this step must have been + * rewritten on concrete vdevs after the last device removal was + * initiated. Otherwise we could be reading from indirect vdevs before + * we have loaded their mappings. + */ + error = spa_ld_open_indirect_vdev_metadata(spa); + if (error != 0) + return (error); + + /* + * Retrieve the full list of active features from the MOS and check if + * they are all supported. + */ + error = spa_ld_check_features(spa, &missing_feat_write); + if (error != 0) + return (error); + + /* + * Load several special directories from the MOS needed by the dsl_pool + * layer. + */ + error = spa_ld_load_special_directories(spa); + if (error != 0) + return (error); + + /* + * Retrieve pool properties from the MOS. + */ + error = spa_ld_get_props(spa); + if (error != 0) + return (error); + + /* + * Retrieve the list of auxiliary devices - cache devices and spares - + * and open them. + */ + error = spa_ld_open_aux_vdevs(spa, type); + if (error != 0) + return (error); + + /* + * Load the metadata for all vdevs. Also check if unopenable devices + * should be autoreplaced. + */ + error = spa_ld_load_vdev_metadata(spa); + if (error != 0) + return (error); + + error = spa_ld_load_dedup_tables(spa); + if (error != 0) + return (error); + + /* + * Verify the logs now to make sure we don't have any unexpected errors + * when we claim log blocks later. + */ + error = spa_ld_verify_logs(spa, type, ereport); + if (error != 0) + return (error); + + if (missing_feat_write) { + ASSERT(spa->spa_load_state == SPA_LOAD_TRYIMPORT); + + /* + * At this point, we know that we can open the pool in + * read-only mode but not read-write mode. We now have enough + * information and can return to userland. + */ + return (spa_vdev_err(spa->spa_root_vdev, VDEV_AUX_UNSUP_FEAT, + ENOTSUP)); + } + + /* + * Traverse the last txgs to make sure the pool was left off in a safe + * state. When performing an extreme rewind, we verify the whole pool, + * which can take a very long time. + */ + error = spa_ld_verify_pool_data(spa); + if (error != 0) + return (error); + + /* + * Calculate the deflated space for the pool. This must be done before + * we write anything to the pool because we'd need to update the space + * accounting using the deflated sizes. + */ + spa_update_dspace(spa); + + /* + * We have now retrieved all the information we needed to open the + * pool. If we are importing the pool in read-write mode, a few + * additional steps must be performed to finish the import. + */ + if (spa_writeable(spa) && (spa->spa_load_state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { + uint64_t config_cache_txg = spa->spa_config_txg; + + ASSERT(spa->spa_load_state != SPA_LOAD_TRYIMPORT); + + /* + * In case of a checkpoint rewind, log the original txg + * of the checkpointed uberblock. 
+ */
+ if (checkpoint_rewind) {
+ spa_history_log_internal(spa, "checkpoint rewind",
+ NULL, "rewound state to txg=%llu",
+ (u_longlong_t)spa->spa_uberblock.ub_checkpoint_txg);
+ }
+
+ /*
+ * Traverse the ZIL and claim all blocks.
+ */
+ spa_ld_claim_log_blocks(spa);
+
+ /*
+ * Kick off the syncing thread.
+ */
+ spa->spa_sync_on = B_TRUE;
+ txg_sync_start(spa->spa_dsl_pool);
+ mmp_thread_start(spa);
+
+ /*
+ * Wait for all claims to sync. We sync up to the highest
+ * claimed log block birth time so that claimed log blocks
+ * don't appear to be from the future. spa_claim_max_txg
+ * will have been set for us by ZIL traversal operations
+ * performed above.
+ */
+ txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg);
+
+ /*
+ * Check if we need to request an update of the config. On the
+ * next sync, we would update the config stored in vdev labels
+ * and the cachefile (by default /etc/zfs/zpool.cache).
+ */
+ spa_ld_check_for_config_update(spa, config_cache_txg,
+ update_config_cache);
+
+ /*
+ * Check if a rebuild was in progress and if so resume it.
+ * Then check all DTLs to see if anything needs resilvering.
+ * The resilver will be deferred if a rebuild was started.
+ */
+ if (vdev_rebuild_active(spa->spa_root_vdev)) {
+ vdev_rebuild_restart(spa);
+ } else if (!dsl_scan_resilvering(spa->spa_dsl_pool) &&
+ vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) {
+ spa_async_request(spa, SPA_ASYNC_RESILVER);
+ }
+
+ /*
+ * Log the fact that we booted up (so that we can detect if
+ * we rebooted in the middle of an operation).
+ */
+ spa_history_log_version(spa, "open", NULL);
+
+ spa_restart_removal(spa);
+ spa_spawn_aux_threads(spa);
+
+ /*
+ * Delete any inconsistent datasets.
+ *
+ * Note:
+ * Since we may be issuing deletes for clones here,
+ * we make sure to do so after we've spawned all the
+ * auxiliary threads above (of which the livelist
+ * deletion zthr is a part).
+ */
+ (void) dmu_objset_find(spa_name(spa),
+ dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN);
+
+ /*
+ * Clean up any stale temporary dataset userrefs.
+ */
+ dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool);
+
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ vdev_initialize_restart(spa->spa_root_vdev);
+ vdev_trim_restart(spa->spa_root_vdev);
+ vdev_autotrim_restart(spa);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_import_progress_remove(spa_guid(spa));
+ spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD);
+
+ spa_load_note(spa, "LOADED");
+
+ return (0);
+}
+
+static int
+spa_load_retry(spa_t *spa, spa_load_state_t state)
+{
+ spa_mode_t mode = spa->spa_mode;
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+
+ spa->spa_load_max_txg = spa->spa_uberblock.ub_txg - 1;
+
+ spa_activate(spa, mode);
+ spa_async_suspend(spa);
+
+ spa_load_note(spa, "spa_load_retry: rewind, max txg: %llu",
+ (u_longlong_t)spa->spa_load_max_txg);
+
+ return (spa_load(spa, state, SPA_IMPORT_EXISTING));
+}
+
+/*
+ * If spa_load() fails, this function will try loading prior txgs. If
+ * 'state' is SPA_LOAD_RECOVER and one of these loads succeeds the pool
+ * will be rewound to that txg. If 'state' is not SPA_LOAD_RECOVER this
+ * function will not rewind the pool and will return the same error as
+ * spa_load().
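+ *
+ * Rough worked example, assuming TXG_DEFER_SIZE == 2: if the pool
+ * last synced at txg 100, a SPA_LOAD_RECOVER attempt may retry txgs
+ * 99 and 98 within the safe rewind window; only an extreme rewind
+ * (ZPOOL_EXTREME_REWIND) reaches for uberblocks older than that.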
+ */
+static int
+spa_load_best(spa_t *spa, spa_load_state_t state, uint64_t max_request,
+ int rewind_flags)
+{
+ nvlist_t *loadinfo = NULL;
+ nvlist_t *config = NULL;
+ int load_error, rewind_error;
+ uint64_t safe_rewind_txg;
+ uint64_t min_txg;
+
+ if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) {
+ spa->spa_load_max_txg = spa->spa_load_txg;
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ spa->spa_load_max_txg = max_request;
+ if (max_request != UINT64_MAX)
+ spa->spa_extreme_rewind = B_TRUE;
+ }
+
+ load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING);
+ if (load_error == 0)
+ return (0);
+ if (load_error == ZFS_ERR_NO_CHECKPOINT) {
+ /*
+ * When attempting checkpoint-rewind on a pool with no
+ * checkpoint, we should not attempt to load uberblocks
+ * from previous txgs when spa_load fails.
+ */
+ ASSERT(spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT);
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+
+ if (spa->spa_root_vdev != NULL)
+ config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg;
+ spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp;
+
+ if (rewind_flags & ZPOOL_NEVER_REWIND) {
+ nvlist_free(config);
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+
+ if (state == SPA_LOAD_RECOVER) {
+ /* Price of rolling back is discarding txgs, including log */
+ spa_set_log_state(spa, SPA_LOG_CLEAR);
+ } else {
+ /*
+ * If we aren't rolling back, save the load info from our first
+ * import attempt so that we can restore it after attempting
+ * to rewind.
+ */
+ loadinfo = spa->spa_load_info;
+ spa->spa_load_info = fnvlist_alloc();
+ }
+
+ spa->spa_load_max_txg = spa->spa_last_ubsync_txg;
+ safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE;
+ min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ?
+ TXG_INITIAL : safe_rewind_txg;
+
+ /*
+ * Continue as long as we're finding errors, we're still within
+ * the acceptable rewind range, and we're still finding uberblocks.
+ */
+ while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg &&
+ spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) {
+ if (spa->spa_load_max_txg < safe_rewind_txg)
+ spa->spa_extreme_rewind = B_TRUE;
+ rewind_error = spa_load_retry(spa, state);
+ }
+
+ spa->spa_extreme_rewind = B_FALSE;
+ spa->spa_load_max_txg = UINT64_MAX;
+
+ if (config && (rewind_error || state != SPA_LOAD_RECOVER))
+ spa_config_set(spa, config);
+ else
+ nvlist_free(config);
+
+ if (state == SPA_LOAD_RECOVER) {
+ ASSERT3P(loadinfo, ==, NULL);
+ spa_import_progress_remove(spa_guid(spa));
+ return (rewind_error);
+ } else {
+ /* Store the rewind info as part of the initial load info */
+ fnvlist_add_nvlist(loadinfo, ZPOOL_CONFIG_REWIND_INFO,
+ spa->spa_load_info);
+
+ /* Restore the initial load info */
+ fnvlist_free(spa->spa_load_info);
+ spa->spa_load_info = loadinfo;
+
+ spa_import_progress_remove(spa_guid(spa));
+ return (load_error);
+ }
+}
+
+/*
+ * Pool Open/Import
+ *
+ * The import case is identical to an open except that the configuration is sent
+ * down from userland, instead of grabbed from the configuration cache. For the
+ * case of an open, the pool configuration will exist in the
+ * POOL_STATE_UNINITIALIZED state.
+ *
+ * The stats information (gen/count/ustats) is used to gather vdev statistics at
+ * the same time as opening the pool, without having to keep around the spa_t
+ * in some ambiguous state.
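+ *
+ * (Concretely, in the code below: spa_open_common() loads from the
+ * cached config (SPA_CONFIG_SRC_CACHEFILE), while spa_import() is
+ * handed a config assembled in userland (SPA_CONFIG_SRC_TRYIMPORT).)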
+ */
+static int
+spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy,
+ nvlist_t **config)
+{
+ spa_t *spa;
+ spa_load_state_t state = SPA_LOAD_OPEN;
+ int error;
+ int locked = B_FALSE;
+ int firstopen = B_FALSE;
+
+ *spapp = NULL;
+
+ /*
+ * As disgusting as this is, we need to support recursive calls to this
+ * function because dsl_dir_open() is called during spa_load(), and ends
+ * up calling spa_open() again. The real fix is to figure out how to
+ * avoid dsl_dir_open() calling this in the first place.
+ */
+ if (MUTEX_NOT_HELD(&spa_namespace_lock)) {
+ mutex_enter(&spa_namespace_lock);
+ locked = B_TRUE;
+ }
+
+ if ((spa = spa_lookup(pool)) == NULL) {
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED) {
+ zpool_load_policy_t policy;
+
+ firstopen = B_TRUE;
+
+ zpool_get_load_policy(nvpolicy ? nvpolicy : spa->spa_config,
+ &policy);
+ if (policy.zlp_rewind & ZPOOL_DO_REWIND)
+ state = SPA_LOAD_RECOVER;
+
+ spa_activate(spa, spa_mode_global);
+
+ if (state != SPA_LOAD_RECOVER)
+ spa->spa_last_ubsync_txg = spa->spa_load_txg = 0;
+ spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE;
+
+ zfs_dbgmsg("spa_open_common: opening %s", pool);
+ error = spa_load_best(spa, state, policy.zlp_txg,
+ policy.zlp_rewind);
+
+ if (error == EBADF) {
+ /*
+ * If vdev_validate() returns failure (indicated by
+ * EBADF), one of the vdevs indicates that the pool
+ * has been exported or destroyed. If this is the
+ * case, the config cache is out of sync and
+ * we should remove the pool from the namespace.
+ */
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_write_cachefile(spa, B_TRUE, B_TRUE);
+ spa_remove(spa);
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (error) {
+ /*
+ * We can't open the pool, but we still have useful
+ * information: the state of each vdev after the
+ * attempted vdev_open(). Return this to the user.
+ */
+ if (config != NULL && spa->spa_config) {
+ VERIFY(nvlist_dup(spa->spa_config, config,
+ KM_SLEEP) == 0);
+ VERIFY(nvlist_add_nvlist(*config,
+ ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa->spa_last_open_failed = error;
+ if (locked)
+ mutex_exit(&spa_namespace_lock);
+ *spapp = NULL;
+ return (error);
+ }
+ }
+
+ spa_open_ref(spa, tag);
+
+ if (config != NULL)
+ *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE);
+
+ /*
+ * If we've recovered the pool, pass back any information we
+ * gathered while doing the load.
+ */
+ if (state == SPA_LOAD_RECOVER) {
+ VERIFY(nvlist_add_nvlist(*config, ZPOOL_CONFIG_LOAD_INFO,
+ spa->spa_load_info) == 0);
+ }
+
+ if (locked) {
+ spa->spa_last_open_failed = 0;
+ spa->spa_last_ubsync_txg = 0;
+ spa->spa_load_txg = 0;
+ mutex_exit(&spa_namespace_lock);
+ }
+
+ if (firstopen)
+ zvol_create_minors_recursive(spa_name(spa));
+
+ *spapp = spa;
+
+ return (0);
+}
+
+int
+spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy,
+ nvlist_t **config)
+{
+ return (spa_open_common(name, spapp, tag, policy, config));
+}
+
+int
+spa_open(const char *name, spa_t **spapp, void *tag)
+{
+ return (spa_open_common(name, spapp, tag, NULL, NULL));
+}
+
+/*
+ * Look up the given spa_t, incrementing the inject count in the process,
+ * preventing it from being exported or destroyed.
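+ * (This is the hold taken on behalf of the fault-injection ioctls,
+ * e.g. for zinject; spa_export_common() refuses to export a pool
+ * whose spa_inject_ref is nonzero, unless the pool is merely being
+ * reset.)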
+ */ +spa_t * +spa_inject_addref(char *name) +{ + spa_t *spa; + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(name)) == NULL) { + mutex_exit(&spa_namespace_lock); + return (NULL); + } + spa->spa_inject_ref++; + mutex_exit(&spa_namespace_lock); + + return (spa); +} + +void +spa_inject_delref(spa_t *spa) +{ + mutex_enter(&spa_namespace_lock); + spa->spa_inject_ref--; + mutex_exit(&spa_namespace_lock); +} + +/* + * Add spares device information to the nvlist. + */ +static void +spa_add_spares(spa_t *spa, nvlist_t *config) +{ + nvlist_t **spares; + uint_t i, nspares; + nvlist_t *nvroot; + uint64_t guid; + vdev_stat_t *vs; + uint_t vsc; + uint64_t pool; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (spa->spa_spares.sav_count == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + if (nspares != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0); + + /* + * Go through and find any spares which have since been + * repurposed as an active spare. If this is the case, update + * their status appropriately. + */ + for (i = 0; i < nspares; i++) { + VERIFY(nvlist_lookup_uint64(spares[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + if (spa_spare_exists(guid, &pool, NULL) && + pool != 0ULL) { + VERIFY(nvlist_lookup_uint64_array( + spares[i], ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) == 0); + vs->vs_state = VDEV_STATE_CANT_OPEN; + vs->vs_aux = VDEV_AUX_SPARED; + } + } + } +} + +/* + * Add l2cache device information to the nvlist, including vdev stats. + */ +static void +spa_add_l2cache(spa_t *spa, nvlist_t *config) +{ + nvlist_t **l2cache; + uint_t i, j, nl2cache; + nvlist_t *nvroot; + uint64_t guid; + vdev_t *vd; + vdev_stat_t *vs; + uint_t vsc; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (spa->spa_l2cache.sav_count == 0) + return; + + VERIFY(nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); + VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + if (nl2cache != 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvroot, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0); + + /* + * Update level 2 cache device stats. 
+ */ + + for (i = 0; i < nl2cache; i++) { + VERIFY(nvlist_lookup_uint64(l2cache[i], + ZPOOL_CONFIG_GUID, &guid) == 0); + + vd = NULL; + for (j = 0; j < spa->spa_l2cache.sav_count; j++) { + if (guid == + spa->spa_l2cache.sav_vdevs[j]->vdev_guid) { + vd = spa->spa_l2cache.sav_vdevs[j]; + break; + } + } + ASSERT(vd != NULL); + + VERIFY(nvlist_lookup_uint64_array(l2cache[i], + ZPOOL_CONFIG_VDEV_STATS, (uint64_t **)&vs, &vsc) + == 0); + vdev_get_stats(vd, vs); + vdev_config_generate_stats(vd, l2cache[i]); + + } + } +} + +static void +spa_feature_stats_from_disk(spa_t *spa, nvlist_t *features) +{ + zap_cursor_t zc; + zap_attribute_t za; + + if (spa->spa_feat_for_read_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_read_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY0(nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } + + if (spa->spa_feat_for_write_obj != 0) { + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_feat_for_write_obj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + ASSERT(za.za_integer_length == sizeof (uint64_t) && + za.za_num_integers == 1); + VERIFY0(nvlist_add_uint64(features, za.za_name, + za.za_first_integer)); + } + zap_cursor_fini(&zc); + } +} + +static void +spa_feature_stats_from_cache(spa_t *spa, nvlist_t *features) +{ + int i; + + for (i = 0; i < SPA_FEATURES; i++) { + zfeature_info_t feature = spa_feature_table[i]; + uint64_t refcount; + + if (feature_get_refcount(spa, &feature, &refcount) != 0) + continue; + + VERIFY0(nvlist_add_uint64(features, feature.fi_guid, refcount)); + } +} + +/* + * Store a list of pool features and their reference counts in the + * config. + * + * The first time this is called on a spa, allocate a new nvlist, fetch + * the pool features and reference counts from disk, then save the list + * in the spa. In subsequent calls on the same spa use the saved nvlist + * and refresh its values from the cached reference counts. This + * ensures we don't block here on I/O on a suspended pool so 'zpool + * clear' can resume the pool. + */ +static void +spa_add_feature_stats(spa_t *spa, nvlist_t *config) +{ + nvlist_t *features; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + mutex_enter(&spa->spa_feat_stats_lock); + features = spa->spa_feat_stats; + + if (features != NULL) { + spa_feature_stats_from_cache(spa, features); + } else { + VERIFY0(nvlist_alloc(&features, NV_UNIQUE_NAME, KM_SLEEP)); + spa->spa_feat_stats = features; + spa_feature_stats_from_disk(spa, features); + } + + VERIFY0(nvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURE_STATS, + features)); + + mutex_exit(&spa->spa_feat_stats_lock); +} + +int +spa_get_stats(const char *name, nvlist_t **config, + char *altroot, size_t buflen) +{ + int error; + spa_t *spa; + + *config = NULL; + error = spa_open_common(name, &spa, FTAG, NULL, config); + + if (spa != NULL) { + /* + * This still leaves a window of inconsistency where the spares + * or l2cache devices could change and the config would be + * self-inconsistent. 
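+ * (The base config here was generated by spa_open_common() before
+ * we re-entered SCL_CONFIG below, so an aux device change can, in
+ * principle, slip in between the two steps.)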
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+ if (*config != NULL) {
+ uint64_t loadtimes[2];
+
+ loadtimes[0] = spa->spa_loaded_ts.tv_sec;
+ loadtimes[1] = spa->spa_loaded_ts.tv_nsec;
+ VERIFY(nvlist_add_uint64_array(*config,
+ ZPOOL_CONFIG_LOADED_TIME, loadtimes, 2) == 0);
+
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_ERRCOUNT,
+ spa_get_errlog_size(spa)) == 0);
+
+ if (spa_suspended(spa)) {
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED,
+ spa->spa_failmode) == 0);
+ VERIFY(nvlist_add_uint64(*config,
+ ZPOOL_CONFIG_SUSPENDED_REASON,
+ spa->spa_suspended) == 0);
+ }
+
+ spa_add_spares(spa, *config);
+ spa_add_l2cache(spa, *config);
+ spa_add_feature_stats(spa, *config);
+ }
+ }
+
+ /*
+ * We want to get the alternate root even for faulted pools, so we cheat
+ * and call spa_lookup() directly.
+ */
+ if (altroot) {
+ if (spa == NULL) {
+ mutex_enter(&spa_namespace_lock);
+ spa = spa_lookup(name);
+ if (spa)
+ spa_altroot(spa, altroot, buflen);
+ else
+ altroot[0] = '\0';
+ spa = NULL;
+ mutex_exit(&spa_namespace_lock);
+ } else {
+ spa_altroot(spa, altroot, buflen);
+ }
+ }
+
+ if (spa != NULL) {
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ spa_close(spa, FTAG);
+ }
+
+ return (error);
+}
+
+/*
+ * Validate that the auxiliary device array is well formed. We must have an
+ * array of nvlists, each of which describes a valid leaf vdev. If this is an
+ * import (mode is VDEV_ALLOC_SPARE), then we allow corrupted spares to be
+ * specified, as long as they are well-formed.
+ */
+static int
+spa_validate_aux_devs(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode,
+ spa_aux_vdev_t *sav, const char *config, uint64_t version,
+ vdev_labeltype_t label)
+{
+ nvlist_t **dev;
+ uint_t i, ndev;
+ vdev_t *vd;
+ int error;
+
+ ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+ /*
+ * It's acceptable to have no devs specified.
+ */
+ if (nvlist_lookup_nvlist_array(nvroot, config, &dev, &ndev) != 0)
+ return (0);
+
+ if (ndev == 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * Make sure the pool is formatted with a version that supports this
+ * device type.
+ */
+ if (spa_version(spa) < version)
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Set the pending device list so we correctly handle device in-use
+ * checking.
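+ * (Roughly: with sav_pending set, the label in-use checks can tell
+ * that these devices are being added by this very operation rather
+ * than being owned by another consumer.)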
+ */ + sav->sav_pending = dev; + sav->sav_npending = ndev; + + for (i = 0; i < ndev; i++) { + if ((error = spa_config_parse(spa, &vd, dev[i], NULL, 0, + mode)) != 0) + goto out; + + if (!vd->vdev_ops->vdev_op_leaf) { + vdev_free(vd); + error = SET_ERROR(EINVAL); + goto out; + } + + vd->vdev_top = vd; + + if ((error = vdev_open(vd)) == 0 && + (error = vdev_label_init(vd, crtxg, label)) == 0) { + VERIFY(nvlist_add_uint64(dev[i], ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } + + vdev_free(vd); + + if (error && + (mode != VDEV_ALLOC_SPARE && mode != VDEV_ALLOC_L2CACHE)) + goto out; + else + error = 0; + } + +out: + sav->sav_pending = NULL; + sav->sav_npending = 0; + return (error); +} + +static int +spa_validate_aux(spa_t *spa, nvlist_t *nvroot, uint64_t crtxg, int mode) +{ + int error; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((error = spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_spares, ZPOOL_CONFIG_SPARES, SPA_VERSION_SPARES, + VDEV_LABEL_SPARE)) != 0) { + return (error); + } + + return (spa_validate_aux_devs(spa, nvroot, crtxg, mode, + &spa->spa_l2cache, ZPOOL_CONFIG_L2CACHE, SPA_VERSION_L2CACHE, + VDEV_LABEL_L2CACHE)); +} + +static void +spa_set_aux_vdevs(spa_aux_vdev_t *sav, nvlist_t **devs, int ndevs, + const char *config) +{ + int i; + + if (sav->sav_config != NULL) { + nvlist_t **olddevs; + uint_t oldndevs; + nvlist_t **newdevs; + + /* + * Generate new dev list by concatenating with the + * current dev list. + */ + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, config, + &olddevs, &oldndevs) == 0); + + newdevs = kmem_alloc(sizeof (void *) * + (ndevs + oldndevs), KM_SLEEP); + for (i = 0; i < oldndevs; i++) + VERIFY(nvlist_dup(olddevs[i], &newdevs[i], + KM_SLEEP) == 0); + for (i = 0; i < ndevs; i++) + VERIFY(nvlist_dup(devs[i], &newdevs[i + oldndevs], + KM_SLEEP) == 0); + + VERIFY(nvlist_remove(sav->sav_config, config, + DATA_TYPE_NVLIST_ARRAY) == 0); + + VERIFY(nvlist_add_nvlist_array(sav->sav_config, + config, newdevs, ndevs + oldndevs) == 0); + for (i = 0; i < oldndevs + ndevs; i++) + nvlist_free(newdevs[i]); + kmem_free(newdevs, (oldndevs + ndevs) * sizeof (void *)); + } else { + /* + * Generate a new dev list. + */ + VERIFY(nvlist_alloc(&sav->sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(sav->sav_config, config, + devs, ndevs) == 0); + } +} + +/* + * Stop and drop level 2 ARC devices + */ +void +spa_l2cache_drop(spa_t *spa) +{ + vdev_t *vd; + int i; + spa_aux_vdev_t *sav = &spa->spa_l2cache; + + for (i = 0; i < sav->sav_count; i++) { + uint64_t pool; + + vd = sav->sav_vdevs[i]; + ASSERT(vd != NULL); + + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) + l2arc_remove_vdev(vd); + } +} + +/* + * Verify encryption parameters for spa creation. If we are encrypting, we must + * have the encryption feature flag enabled. 
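+ *
+ * (For example, a plain "zpool create -O encryption=on ..." passes
+ * this check because pool creation enables feature flags by default,
+ * while the same request with features disabled via "zpool create -d"
+ * trips it and fails with ENOTSUP.)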
+ */ +static int +spa_create_check_encryption_params(dsl_crypto_params_t *dcp, + boolean_t has_encryption) +{ + if (dcp->cp_crypt != ZIO_CRYPT_OFF && + dcp->cp_crypt != ZIO_CRYPT_INHERIT && + !has_encryption) + return (SET_ERROR(ENOTSUP)); + + return (dmu_objset_create_crypt_check(NULL, dcp, NULL)); +} + +/* + * Pool Creation + */ +int +spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, + nvlist_t *zplprops, dsl_crypto_params_t *dcp) +{ + spa_t *spa; + char *altroot = NULL; + vdev_t *rvd; + dsl_pool_t *dp; + dmu_tx_t *tx; + int error = 0; + uint64_t txg = TXG_INITIAL; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + uint64_t version, obj; + boolean_t has_features; + boolean_t has_encryption; + boolean_t has_allocclass; + spa_feature_t feat; + char *feat_name; + char *poolname; + nvlist_t *nvl; + + if (props == NULL || + nvlist_lookup_string(props, "tname", &poolname) != 0) + poolname = (char *)pool; + + /* + * If this pool already exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(poolname) != NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EEXIST)); + } + + /* + * Allocate a new spa_t structure. + */ + nvl = fnvlist_alloc(); + fnvlist_add_string(nvl, ZPOOL_CONFIG_POOL_NAME, pool); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(poolname, nvl, altroot); + fnvlist_free(nvl); + spa_activate(spa, spa_mode_global); + + if (props && (error = spa_prop_validate(spa, props))) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Temporary pool names should never be written to disk. + */ + if (poolname != pool) + spa->spa_import_flags |= ZFS_IMPORT_TEMP_NAME; + + has_features = B_FALSE; + has_encryption = B_FALSE; + has_allocclass = B_FALSE; + for (nvpair_t *elem = nvlist_next_nvpair(props, NULL); + elem != NULL; elem = nvlist_next_nvpair(props, elem)) { + if (zpool_prop_feature(nvpair_name(elem))) { + has_features = B_TRUE; + + feat_name = strchr(nvpair_name(elem), '@') + 1; + VERIFY0(zfeature_lookup_name(feat_name, &feat)); + if (feat == SPA_FEATURE_ENCRYPTION) + has_encryption = B_TRUE; + if (feat == SPA_FEATURE_ALLOCATION_CLASSES) + has_allocclass = B_TRUE; + } + } + + /* verify encryption params, if they were provided */ + if (dcp != NULL) { + error = spa_create_check_encryption_params(dcp, has_encryption); + if (error != 0) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + } + if (!has_allocclass && zfs_special_devs(nvroot, NULL)) { + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (ENOTSUP); + } + + if (has_features || nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version) != 0) { + version = SPA_VERSION; + } + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + + spa->spa_first_txg = txg; + spa->spa_uberblock.ub_txg = txg - 1; + spa->spa_uberblock.ub_version = version; + spa->spa_ubsync = spa->spa_uberblock; + spa->spa_load_state = SPA_LOAD_CREATE; + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; + spa->spa_indirect_vdevs_loaded = B_TRUE; + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = kmem_alloc(max_ncpus * sizeof (void *), + KM_SLEEP); + for (int i = 0; i < max_ncpus; i++) { + spa->spa_async_zio_root[i] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | 
ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); + } + + /* + * Create the root vdev. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_ADD); + + ASSERT(error != 0 || rvd != NULL); + ASSERT(error != 0 || spa->spa_root_vdev == rvd); + + if (error == 0 && !zfs_allocatable_devs(nvroot)) + error = SET_ERROR(EINVAL); + + if (error == 0 && + (error = vdev_create(rvd, txg, B_FALSE)) == 0 && + (error = spa_validate_aux(spa, nvroot, txg, + VDEV_ALLOC_ADD)) == 0) { + /* + * instantiate the metaslab groups (this will dirty the vdevs) + * we can no longer error exit past this point + */ + for (int c = 0; error == 0 && c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + vdev_ashift_optimize(vd); + vdev_metaslab_set_size(vd); + vdev_expand(vd, txg); + } + } + + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + /* + * Get the list of spares, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + + /* + * Get the list of level 2 cache devices, if specified. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + spa->spa_is_initializing = B_TRUE; + spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, dcp, txg); + spa->spa_is_initializing = B_FALSE; + + /* + * Create DDTs (dedup tables). + */ + ddt_create(spa); + + spa_update_dspace(spa); + + tx = dmu_tx_create_assigned(dp, txg); + + /* + * Create the pool's history object. + */ + if (version >= SPA_VERSION_ZPOOL_HISTORY && !spa->spa_history) + spa_history_create_obj(spa, tx); + + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_CREATE); + spa_history_log_version(spa, "create", tx); + + /* + * Create the pool config object. + */ + spa->spa_config_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, SPA_CONFIG_BLOCKSIZE, + DMU_OT_PACKED_NVLIST_SIZE, sizeof (uint64_t), tx); + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, + sizeof (uint64_t), 1, &spa->spa_config_object, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool config"); + } + + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CREATION_VERSION, + sizeof (uint64_t), 1, &version, tx) != 0) { + cmn_err(CE_PANIC, "failed to add pool version"); + } + + /* Newly created pools with the right version are always deflated. 
*/ + if (version >= SPA_VERSION_RAIDZ_DEFLATE) { + spa->spa_deflate = TRUE; + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx) != 0) { + cmn_err(CE_PANIC, "failed to add deflate"); + } + } + + /* + * Create the deferred-free bpobj. Turn off compression + * because sync-to-convergence takes longer if the blocksize + * keeps changing. + */ + obj = bpobj_alloc(spa->spa_meta_objset, 1 << 14, tx); + dmu_object_set_compress(spa->spa_meta_objset, obj, + ZIO_COMPRESS_OFF, tx); + if (zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPOBJ, + sizeof (uint64_t), 1, &obj, tx) != 0) { + cmn_err(CE_PANIC, "failed to add bpobj"); + } + VERIFY3U(0, ==, bpobj_open(&spa->spa_deferred_bpobj, + spa->spa_meta_objset, obj)); + + /* + * Generate some random noise for salted checksums to operate on. + */ + (void) random_get_pseudo_bytes(spa->spa_cksum_salt.zcs_bytes, + sizeof (spa->spa_cksum_salt.zcs_bytes)); + + /* + * Set pool properties. + */ + spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); + spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); + spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); + spa->spa_multihost = zpool_prop_default_numeric(ZPOOL_PROP_MULTIHOST); + spa->spa_autotrim = zpool_prop_default_numeric(ZPOOL_PROP_AUTOTRIM); + + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_sync_props(props, tx); + } + + dmu_tx_commit(tx); + + spa->spa_sync_on = B_TRUE; + txg_sync_start(dp); + mmp_thread_start(spa); + txg_wait_synced(dp, txg); + + spa_spawn_aux_threads(spa); + + spa_write_cachefile(spa, B_FALSE, B_TRUE); + + /* + * Don't count references from objsets that are already closed + * and are making their way through the eviction process. + */ + spa_evicting_os_wait(spa); + spa->spa_minref = zfs_refcount_count(&spa->spa_refcount); + spa->spa_load_state = SPA_LOAD_NONE; + + mutex_exit(&spa_namespace_lock); + + return (0); +} + +/* + * Import a non-root pool into the system. + */ +int +spa_import(char *pool, nvlist_t *config, nvlist_t *props, uint64_t flags) +{ + spa_t *spa; + char *altroot = NULL; + spa_load_state_t state = SPA_LOAD_IMPORT; + zpool_load_policy_t policy; + spa_mode_t mode = spa_mode_global; + uint64_t readonly = B_FALSE; + int error; + nvlist_t *nvroot; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EEXIST)); + } + + /* + * Create and initialize the spa structure. + */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_READONLY), &readonly); + if (readonly) + mode = SPA_MODE_READ; + spa = spa_add(pool, config, altroot); + spa->spa_import_flags = flags; + + /* + * Verbatim import - Take a pool and insert it into the namespace + * as if it had been loaded at boot. 
+ */ + if (spa->spa_import_flags & ZFS_IMPORT_VERBATIM) { + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + spa_write_cachefile(spa, B_FALSE, B_TRUE); + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); + zfs_dbgmsg("spa_import: verbatim import of %s", pool); + mutex_exit(&spa_namespace_lock); + return (0); + } + + spa_activate(spa, mode); + + /* + * Don't start async tasks until we know everything is healthy. + */ + spa_async_suspend(spa); + + zpool_get_load_policy(config, &policy); + if (policy.zlp_rewind & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + + spa->spa_config_source = SPA_CONFIG_SRC_TRYIMPORT; + + if (state != SPA_LOAD_RECOVER) { + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + zfs_dbgmsg("spa_import: importing %s", pool); + } else { + zfs_dbgmsg("spa_import: importing %s, max_txg=%lld " + "(RECOVERY MODE)", pool, (longlong_t)policy.zlp_txg); + } + error = spa_load_best(spa, state, policy.zlp_txg, policy.zlp_rewind); + + /* + * Propagate anything learned while loading the pool and pass it + * back to caller (i.e. rewind info, missing devices, etc). + */ + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + /* + * Toss any existing sparelist, as it doesn't have any validity + * anymore, and conflicts with spa_has_spare(). + */ + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + spa_load_spares(spa); + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); + } + + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + if (error != 0 || (props && spa_writeable(spa) && + (error = spa_prop_set(spa, props)))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); + } + + spa_async_resume(spa); + + /* + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. + */ + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } + + /* + * Check for any removed devices. 
+ */ + if (spa->spa_autoreplace) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + + if (spa_writeable(spa)) { + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + } + + /* + * It's possible that the pool was expanded while it was exported. + * We kick off an async task to handle this for us. + */ + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); + + spa_history_log_version(spa, "import", NULL); + + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_IMPORT); + + mutex_exit(&spa_namespace_lock); + + zvol_create_minors_recursive(pool); + + return (0); +} + +nvlist_t * +spa_tryimport(nvlist_t *tryconfig) +{ + nvlist_t *config = NULL; + char *poolname, *cachefile; + spa_t *spa; + uint64_t state; + int error; + zpool_load_policy_t policy; + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) + return (NULL); + + if (nvlist_lookup_uint64(tryconfig, ZPOOL_CONFIG_POOL_STATE, &state)) + return (NULL); + + /* + * Create and initialize the spa structure. + */ + mutex_enter(&spa_namespace_lock); + spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa_activate(spa, SPA_MODE_READ); + + /* + * Rewind pool if a max txg was provided. + */ + zpool_get_load_policy(spa->spa_config, &policy); + if (policy.zlp_txg != UINT64_MAX) { + spa->spa_load_max_txg = policy.zlp_txg; + spa->spa_extreme_rewind = B_TRUE; + zfs_dbgmsg("spa_tryimport: importing %s, max_txg=%lld", + poolname, (longlong_t)policy.zlp_txg); + } else { + zfs_dbgmsg("spa_tryimport: importing %s", poolname); + } + + if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_CACHEFILE, &cachefile) + == 0) { + zfs_dbgmsg("spa_tryimport: using cachefile '%s'", cachefile); + spa->spa_config_source = SPA_CONFIG_SRC_CACHEFILE; + } else { + spa->spa_config_source = SPA_CONFIG_SRC_SCAN; + } + + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING); + + /* + * If 'tryconfig' was at least parsable, return the current config. + */ + if (spa->spa_root_vdev != NULL) { + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, + poolname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + state) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_TIMESTAMP, + spa->spa_uberblock.ub_timestamp) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_LOAD_INFO, + spa->spa_load_info) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, + spa->spa_errata) == 0); + + /* + * If the bootfs property exists on this pool then we + * copy it out so that external consumers can tell which + * pools are bootable. + */ + if ((!error || error == EEXIST) && spa->spa_bootfs) { + char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + /* + * We have to play games with the name since the + * pool was opened as TRYIMPORT_NAME. + */ + if (dsl_dsobj_to_dsname(spa_name(spa), + spa->spa_bootfs, tmpname) == 0) { + char *cp; + char *dsname; + + dsname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + cp = strchr(tmpname, '/'); + if (cp == NULL) { + (void) strlcpy(dsname, tmpname, + MAXPATHLEN); + } else { + (void) snprintf(dsname, MAXPATHLEN, + "%s/%s", poolname, ++cp); + } + VERIFY(nvlist_add_string(config, + ZPOOL_CONFIG_BOOTFS, dsname) == 0); + kmem_free(dsname, MAXPATHLEN); + } + kmem_free(tmpname, MAXPATHLEN); + } + + /* + * Add the list of hot spares and level 2 cache devices. 
+ */
+ spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+ spa_add_spares(spa, config);
+ spa_add_l2cache(spa, config);
+ spa_config_exit(spa, SCL_CONFIG, FTAG);
+ }
+
+ spa_unload(spa);
+ spa_deactivate(spa);
+ spa_remove(spa);
+ mutex_exit(&spa_namespace_lock);
+
+ return (config);
+}
+
+/*
+ * Pool export/destroy
+ *
+ * The act of destroying or exporting a pool is very simple. We make sure there
+ * is no more pending I/O and any references to the pool are gone. Then, we
+ * update the pool state and sync all the labels to disk, removing the
+ * configuration from the cache afterwards. If the 'hardforce' flag is set, then
+ * we don't sync the labels or remove the configuration cache.
+ */
+static int
+spa_export_common(char *pool, int new_state, nvlist_t **oldconfig,
+ boolean_t force, boolean_t hardforce)
+{
+ spa_t *spa;
+
+ if (oldconfig)
+ *oldconfig = NULL;
+
+ if (!(spa_mode_global & SPA_MODE_WRITE))
+ return (SET_ERROR(EROFS));
+
+ mutex_enter(&spa_namespace_lock);
+ if ((spa = spa_lookup(pool)) == NULL) {
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ENOENT));
+ }
+
+ if (spa->spa_is_exporting) {
+ /* the pool is being exported by another thread */
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(ZFS_ERR_EXPORT_IN_PROGRESS));
+ }
+ spa->spa_is_exporting = B_TRUE;
+
+ /*
+ * Put a hold on the pool, drop the namespace lock, stop async tasks,
+ * reacquire the namespace lock, and see if we can export.
+ */
+ spa_open_ref(spa, FTAG);
+ mutex_exit(&spa_namespace_lock);
+ spa_async_suspend(spa);
+ if (spa->spa_zvol_taskq) {
+ zvol_remove_minors(spa, spa_name(spa), B_TRUE);
+ taskq_wait(spa->spa_zvol_taskq);
+ }
+ mutex_enter(&spa_namespace_lock);
+ spa_close(spa, FTAG);
+
+ if (spa->spa_state == POOL_STATE_UNINITIALIZED)
+ goto export_spa;
+ /*
+ * The pool will be in core if it's openable, in which case we can
+ * modify its state. Objsets may be open only because they're dirty,
+ * so we have to force it to sync before checking spa_refcnt.
+ */
+ if (spa->spa_sync_on) {
+ txg_wait_synced(spa->spa_dsl_pool, 0);
+ spa_evicting_os_wait(spa);
+ }
+
+ /*
+ * A pool cannot be exported or destroyed if there are active
+ * references. If we are resetting a pool, allow references by
+ * fault injection handlers.
+ */
+ if (!spa_refcount_zero(spa) ||
+ (spa->spa_inject_ref != 0 &&
+ new_state != POOL_STATE_UNINITIALIZED)) {
+ spa_async_resume(spa);
+ spa->spa_is_exporting = B_FALSE;
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EBUSY));
+ }
+
+ if (spa->spa_sync_on) {
+ /*
+ * A pool cannot be exported if it has an active shared spare.
+ * This is to prevent other pools from stealing the active
+ * spare from an exported pool. If the user really wants to,
+ * such a pool can still be forcibly exported.
+ */
+ if (!force && new_state == POOL_STATE_EXPORTED &&
+ spa_has_active_shared_spare(spa)) {
+ spa_async_resume(spa);
+ spa->spa_is_exporting = B_FALSE;
+ mutex_exit(&spa_namespace_lock);
+ return (SET_ERROR(EXDEV));
+ }
+
+ /*
+ * We're about to export or destroy this pool. Make sure
+ * we stop all initialization and trim activity here before
+ * we set the spa_final_txg. This will ensure that all
+ * dirty data resulting from the initialization is
+ * committed to disk before we unload the pool.
+ */ + if (spa->spa_root_vdev != NULL) { + vdev_t *rvd = spa->spa_root_vdev; + vdev_initialize_stop_all(rvd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(rvd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_all(spa); + vdev_rebuild_stop_all(spa); + } + + /* + * We want this to be reflected on every label, + * so mark them all dirty. spa_unload() will do the + * final sync that pushes these changes out. + */ + if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa->spa_state = new_state; + spa->spa_final_txg = spa_last_synced_txg(spa) + + TXG_DEFER_SIZE + 1; + vdev_config_dirty(spa->spa_root_vdev); + spa_config_exit(spa, SCL_ALL, FTAG); + } + } + +export_spa: + if (new_state == POOL_STATE_DESTROYED) + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_DESTROY); + else if (new_state == POOL_STATE_EXPORTED) + spa_event_notify(spa, NULL, NULL, ESC_ZFS_POOL_EXPORT); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + + if (oldconfig && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); + + if (new_state != POOL_STATE_UNINITIALIZED) { + if (!hardforce) + spa_write_cachefile(spa, B_TRUE, B_TRUE); + spa_remove(spa); + } else { + /* + * If spa_remove() is not called for this spa_t and + * there is any possibility that it can be reused, + * we make sure to reset the exporting flag. + */ + spa->spa_is_exporting = B_FALSE; + } + + mutex_exit(&spa_namespace_lock); + return (0); +} + +/* + * Destroy a storage pool. + */ +int +spa_destroy(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, + B_FALSE, B_FALSE)); +} + +/* + * Export a storage pool. + */ +int +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce) +{ + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, + force, hardforce)); +} + +/* + * Similar to spa_export(), this unloads the spa_t without actually removing it + * from the namespace in any way. + */ +int +spa_reset(char *pool) +{ + return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, + B_FALSE, B_FALSE)); +} + +/* + * ========================================================================== + * Device manipulation + * ========================================================================== + */ + +/* + * Add a device to a storage pool. + */ +int +spa_vdev_add(spa_t *spa, nvlist_t *nvroot) +{ + uint64_t txg; + int error; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd, *tvd; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; + + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + if ((error = spa_config_parse(spa, &vd, nvroot, NULL, 0, + VDEV_ALLOC_ADD)) != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + spa->spa_pending_vdev = vd; /* spa_vdev_exit() will clear this */ + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, &spares, + &nspares) != 0) + nspares = 0; + + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, + &nl2cache) != 0) + nl2cache = 0; + + if (vd->vdev_children == 0 && nspares == 0 && nl2cache == 0) + return (spa_vdev_exit(spa, vd, txg, EINVAL)); + + if (vd->vdev_children != 0 && + (error = vdev_create(vd, txg, B_FALSE)) != 0) + return (spa_vdev_exit(spa, vd, txg, error)); + + /* + * We must validate the spares and l2cache devices after checking the + * children. Otherwise, vdev_inuse() will blindly overwrite the spare. 
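+ * (Sketch of the hazard: a device that appears both as a new child
+ * and as a spare in the request could get a spare label stamped on
+ * it before the child checks had a chance to reject the layout.)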
+ */
+ if ((error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) != 0)
+ return (spa_vdev_exit(spa, vd, txg, error));
+
+ /*
+ * If we are in the middle of a device removal, we can only add
+ * devices which match the existing devices in the pool.
+ * If we are in the middle of a removal, or have some indirect
+ * vdevs, we cannot add raidz toplevels.
+ */
+ if (spa->spa_vdev_removal != NULL ||
+ spa->spa_removing_phys.sr_prev_indirect_vdev != -1) {
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ if (spa->spa_vdev_removal != NULL &&
+ tvd->vdev_ashift != spa->spa_max_ashift) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /* Fail if top level vdev is raidz */
+ if (tvd->vdev_ops == &vdev_raidz_ops) {
+ return (spa_vdev_exit(spa, vd, txg, EINVAL));
+ }
+ /*
+ * Need the top level mirror to be
+ * a mirror of leaf vdevs only
+ */
+ if (tvd->vdev_ops == &vdev_mirror_ops) {
+ for (uint64_t cid = 0;
+ cid < tvd->vdev_children; cid++) {
+ vdev_t *cvd = tvd->vdev_child[cid];
+ if (!cvd->vdev_ops->vdev_op_leaf) {
+ return (spa_vdev_exit(spa, vd,
+ txg, EINVAL));
+ }
+ }
+ }
+ }
+ }
+
+ for (int c = 0; c < vd->vdev_children; c++) {
+ tvd = vd->vdev_child[c];
+ vdev_remove_child(vd, tvd);
+ tvd->vdev_id = rvd->vdev_children;
+ vdev_add_child(rvd, tvd);
+ vdev_config_dirty(tvd);
+ }
+
+ if (nspares != 0) {
+ spa_set_aux_vdevs(&spa->spa_spares, spares, nspares,
+ ZPOOL_CONFIG_SPARES);
+ spa_load_spares(spa);
+ spa->spa_spares.sav_sync = B_TRUE;
+ }
+
+ if (nl2cache != 0) {
+ spa_set_aux_vdevs(&spa->spa_l2cache, l2cache, nl2cache,
+ ZPOOL_CONFIG_L2CACHE);
+ spa_load_l2cache(spa);
+ spa->spa_l2cache.sav_sync = B_TRUE;
+ }
+
+ /*
+ * We have to be careful when adding new vdevs to an existing pool.
+ * If other threads start allocating from these vdevs before we
+ * sync the config cache, and we lose power, then upon reboot we may
+ * fail to open the pool because there are DVAs that the config cache
+ * can't translate. Therefore, we first add the vdevs without
+ * initializing metaslabs; sync the config cache (via spa_vdev_exit());
+ * and then let spa_config_update() initialize the new metaslabs.
+ *
+ * spa_load() checks for added-but-not-initialized vdevs, so that
+ * if we lose power at any point in this sequence, the remaining
+ * steps will be completed the next time we load the pool.
+ */
+ (void) spa_vdev_exit(spa, vd, txg, 0);
+
+ mutex_enter(&spa_namespace_lock);
+ spa_config_update(spa, SPA_CONFIG_UPDATE_POOL);
+ spa_event_notify(spa, NULL, NULL, ESC_ZFS_VDEV_ADD);
+ mutex_exit(&spa_namespace_lock);
+
+ return (0);
+}
+
+/*
+ * Attach a device to a mirror. The arguments are the path to any device
+ * in the mirror, and the nvroot for the new device. If the path specifies
+ * a device that is not mirrored, we automatically insert the mirror vdev.
+ *
+ * If 'replacing' is specified, the new device is intended to replace the
+ * existing device; in this case the two devices are made into their own
+ * mirror using the 'replacing' vdev, which is functionally identical to
+ * the mirror vdev (it actually reuses all the same ops) but has a few
+ * extra rules: you can't attach to it after it's been created, and upon
+ * completion of resilvering, the first disk (the one being replaced)
+ * is automatically detached.
+ *
+ * If 'rebuild' is specified, then sequential reconstruction (a.k.a. rebuild)
+ * should be performed instead of traditional healing reconstruction. From
+ * an administrator's perspective these are both resilver operations.
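+ *
+ * (In userland terms, assuming current zpool(8) syntax: "zpool
+ * attach" and "zpool replace" drive the healing path, while their
+ * -s variants request the sequential rebuild described here.)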
+ */ +int +spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing, + int rebuild) +{ + uint64_t txg, dtl_max_txg; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; + vdev_ops_t *pvops; + char *oldvdpath, *newvdpath; + int newvd_isspare; + int error; + + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + oldvd = spa_lookup_by_guid(spa, guid, B_FALSE); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + if (rebuild) { + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + if (dsl_scan_resilvering(spa_get_dsl(spa))) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_RESILVER_IN_PROGRESS)); + } else { + if (vdev_rebuild_active(rvd)) + return (spa_vdev_exit(spa, NULL, txg, + ZFS_ERR_REBUILD_IN_PROGRESS)); + } + + if (spa->spa_vdev_removal != NULL) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + if (oldvd == NULL) + return (spa_vdev_exit(spa, NULL, txg, ENODEV)); + + if (!oldvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + pvd = oldvd->vdev_parent; + + if ((error = spa_config_parse(spa, &newrootvd, nvroot, NULL, 0, + VDEV_ALLOC_ATTACH)) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + if (newrootvd->vdev_children != 1) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + newvd = newrootvd->vdev_child[0]; + + if (!newvd->vdev_ops->vdev_op_leaf) + return (spa_vdev_exit(spa, newrootvd, txg, EINVAL)); + + if ((error = vdev_create(newrootvd, txg, replacing)) != 0) + return (spa_vdev_exit(spa, newrootvd, txg, error)); + + /* + * Spares can't replace logs + */ + if (oldvd->vdev_top->vdev_islog && newvd->vdev_isspare) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + if (rebuild) { + /* + * For rebuilds, the parent vdev must support reconstruction + * using only space maps. This means the only allowable + * parents are the root vdev or a mirror vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + } + + if (!replacing) { + /* + * For attach, the only allowable parent is a mirror or the root + * vdev. + */ + if (pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_root_ops) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + pvops = &vdev_mirror_ops; + } else { + /* + * Active hot spares can only be replaced by inactive hot + * spares. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + oldvd->vdev_isspare && + !spa_has_spare(spa, newvd->vdev_guid)) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * If the source is a hot spare, and the parent isn't already a + * spare, then we want to create a new hot spare. Otherwise, we + * want to create a replacing vdev. The user is not allowed to + * attach to a spared vdev child unless the 'isspare' state is + * the same (spare replaces spare, non-spare replaces + * non-spare). 
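+ *
+ * To sketch the resulting parent choice: attaching an inactive hot
+ * spare yields a 'spare' parent vdev, a plain disk-for-disk
+ * replacement yields a 'replacing' parent, and mixed spare/non-spare
+ * combinations under a spare parent are rejected with ENOTSUP.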
+ */ + if (pvd->vdev_ops == &vdev_replacing_ops && + spa_version(spa) < SPA_VERSION_MULTI_REPLACE) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } else if (pvd->vdev_ops == &vdev_spare_ops && + newvd->vdev_isspare != oldvd->vdev_isspare) { + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + } + + if (newvd->vdev_isspare) + pvops = &vdev_spare_ops; + else + pvops = &vdev_replacing_ops; + } + + /* + * Make sure the new device is big enough. + */ + if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) + return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); + + /* + * The new device cannot have a higher alignment requirement + * than the top-level vdev. + */ + if (newvd->vdev_ashift > oldvd->vdev_top->vdev_ashift) + return (spa_vdev_exit(spa, newrootvd, txg, ENOTSUP)); + + /* + * If this is an in-place replacement, update oldvd's path and devid + * to make it distinguishable from newvd, and unopenable from now on. + */ + if (strcmp(oldvd->vdev_path, newvd->vdev_path) == 0) { + spa_strfree(oldvd->vdev_path); + oldvd->vdev_path = kmem_alloc(strlen(newvd->vdev_path) + 5, + KM_SLEEP); + (void) snprintf(oldvd->vdev_path, strlen(newvd->vdev_path) + 5, + "%s/%s", newvd->vdev_path, "old"); + if (oldvd->vdev_devid != NULL) { + spa_strfree(oldvd->vdev_devid); + oldvd->vdev_devid = NULL; + } + } + + /* + * If the parent is not a mirror, or if we're replacing, insert the new + * mirror/replacing/spare vdev above oldvd. + */ + if (pvd->vdev_ops != pvops) + pvd = vdev_add_parent(oldvd, pvops); + + ASSERT(pvd->vdev_top->vdev_parent == rvd); + ASSERT(pvd->vdev_ops == pvops); + ASSERT(oldvd->vdev_parent == pvd); + + /* + * Extract the new device from its root and add it to pvd. + */ + vdev_remove_child(newrootvd, newvd); + newvd->vdev_id = pvd->vdev_children; + newvd->vdev_crtxg = oldvd->vdev_crtxg; + vdev_add_child(pvd, newvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(pvd); + + tvd = newvd->vdev_top; + ASSERT(pvd->vdev_top == tvd); + ASSERT(tvd->vdev_parent == rvd); + + vdev_config_dirty(tvd); + + /* + * Set newvd's DTL to [TXG_INITIAL, dtl_max_txg) so that we account + * for any dmu_sync-ed blocks. It will propagate upward when + * spa_vdev_exit() calls vdev_dtl_reassess(). + */ + dtl_max_txg = txg + TXG_CONCURRENT_STATES; + + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, dtl_max_txg - TXG_INITIAL); + + if (newvd->vdev_isspare) { + spa_spare_activate(newvd); + spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_SPARE); + } + + oldvdpath = spa_strdup(oldvd->vdev_path); + newvdpath = spa_strdup(newvd->vdev_path); + newvd_isspare = newvd->vdev_isspare; + + /* + * Mark newvd's DTL dirty in this txg. + */ + vdev_dirty(tvd, VDD_DTL, newvd, txg); + + /* + * Schedule the resilver or rebuild to restart in the future. We do + * this to ensure that dmu_sync-ed blocks have been stitched into the + * respective datasets. 
+	 */
+	if (rebuild) {
+		newvd->vdev_rebuild_txg = txg;
+
+		vdev_rebuild(tvd);
+	} else {
+		newvd->vdev_resilver_txg = txg;
+
+		if (dsl_scan_resilvering(spa_get_dsl(spa)) &&
+		    spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) {
+			vdev_defer_resilver(newvd);
+		} else {
+			dsl_scan_restart_resilver(spa->spa_dsl_pool,
+			    dtl_max_txg);
+		}
+	}
+
+	if (spa->spa_bootfs)
+		spa_event_notify(spa, newvd, NULL, ESC_ZFS_BOOTFS_VDEV_ATTACH);
+
+	spa_event_notify(spa, newvd, NULL, ESC_ZFS_VDEV_ATTACH);
+
+	/*
+	 * Commit the config
+	 */
+	(void) spa_vdev_exit(spa, newrootvd, dtl_max_txg, 0);
+
+	spa_history_log_internal(spa, "vdev attach", NULL,
+	    "%s vdev=%s %s vdev=%s",
+	    replacing && newvd_isspare ? "spare in" :
+	    replacing ? "replace" : "attach", newvdpath,
+	    replacing ? "for" : "to", oldvdpath);
+
+	spa_strfree(oldvdpath);
+	spa_strfree(newvdpath);
+
+	return (0);
+}
+
+/*
+ * Detach a device from a mirror or replacing vdev.
+ *
+ * If 'replace_done' is specified, only detach if the parent
+ * is a replacing vdev.
+ */
+int
+spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done)
+{
+	uint64_t txg;
+	int error;
+	vdev_t *rvd __maybe_unused = spa->spa_root_vdev;
+	vdev_t *vd, *pvd, *cvd, *tvd;
+	boolean_t unspare = B_FALSE;
+	uint64_t unspare_guid = 0;
+	char *vdpath;
+
+	ASSERT(spa_writeable(spa));
+
+	txg = spa_vdev_detach_enter(spa, guid);
+
+	vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+
+	/*
+	 * Besides being called directly from userland through the
+	 * ioctl interface, spa_vdev_detach() can potentially be called
+	 * at the end of spa_vdev_resilver_done().
+	 *
+	 * In the regular case, when we have a checkpoint this shouldn't
+	 * happen as we never empty the DTLs of a vdev during the scrub
+	 * [see comment in dsl_scan_done()]. Thus spa_vdev_resilver_done()
+	 * should never get here when we have a checkpoint.
+	 *
+	 * That said, even in the case when we checkpoint the pool at the
+	 * exact moment that spa_vdev_resilver_done() calls this function,
+	 * everything should be fine as the resilver will return right away.
+	 */
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) {
+		error = (spa_has_checkpoint(spa)) ?
+		    ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT;
+		return (spa_vdev_exit(spa, NULL, txg, error));
+	}
+
+	if (vd == NULL)
+		return (spa_vdev_exit(spa, NULL, txg, ENODEV));
+
+	if (!vd->vdev_ops->vdev_op_leaf)
+		return (spa_vdev_exit(spa, NULL, txg, ENOTSUP));
+
+	pvd = vd->vdev_parent;
+
+	/*
+	 * If the parent/child relationship is not as expected, don't do it.
+	 * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing
+	 * vdev that's replacing B with C. The user's intent in replacing
+	 * is to go from M(A,B) to M(A,C). If the user decides to cancel
+	 * the replace by detaching C, the expected behavior is to end up
+	 * M(A,B). But suppose that right after deciding to detach C,
+	 * the replacement of B completes. We would have M(A,C), and then
+	 * ask to detach C, which would leave us with just A -- not what
+	 * the user wanted. To prevent this, we make sure that the
+	 * parent/child relationship hasn't changed -- in this example,
+	 * that C's parent is still the replacing vdev R.
+	 */
+	if (pvd->vdev_guid != pguid && pguid != 0)
+		return (spa_vdev_exit(spa, NULL, txg, EBUSY));
+
+	/*
+	 * Only 'replacing' or 'spare' vdevs can be replaced.
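+	 * That is, when replace_done is set, a detach from a plain mirror
+	 * is rejected with ENOTSUP below.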
+ */ + if (replace_done && pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + ASSERT(pvd->vdev_ops != &vdev_spare_ops || + spa_version(spa) >= SPA_VERSION_SPARES); + + /* + * Only mirror, replacing, and spare vdevs support detach. + */ + if (pvd->vdev_ops != &vdev_replacing_ops && + pvd->vdev_ops != &vdev_mirror_ops && + pvd->vdev_ops != &vdev_spare_ops) + return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + + /* + * If this device has the only valid copy of some data, + * we cannot safely detach it. + */ + if (vdev_dtl_required(vd)) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + + ASSERT(pvd->vdev_children >= 2); + + /* + * If we are detaching the second disk from a replacing vdev, then + * check to see if we changed the original vdev's path to have "/old" + * at the end in spa_vdev_attach(). If so, undo that change now. + */ + if (pvd->vdev_ops == &vdev_replacing_ops && vd->vdev_id > 0 && + vd->vdev_path != NULL) { + size_t len = strlen(vd->vdev_path); + + for (int c = 0; c < pvd->vdev_children; c++) { + cvd = pvd->vdev_child[c]; + + if (cvd == vd || cvd->vdev_path == NULL) + continue; + + if (strncmp(cvd->vdev_path, vd->vdev_path, len) == 0 && + strcmp(cvd->vdev_path + len, "/old") == 0) { + spa_strfree(cvd->vdev_path); + cvd->vdev_path = spa_strdup(vd->vdev_path); + break; + } + } + } + + /* + * If we are detaching the original disk from a spare, then it implies + * that the spare should become a real disk, and be removed from the + * active spare list for the pool. + */ + if (pvd->vdev_ops == &vdev_spare_ops && + vd->vdev_id == 0 && + pvd->vdev_child[pvd->vdev_children - 1]->vdev_isspare) + unspare = B_TRUE; + + /* + * Erase the disk labels so the disk can be used for other things. + * This must be done after all other error cases are handled, + * but before we disembowel vd (so we can still do I/O to it). + * But if we can't do it, don't treat the error as fatal -- + * it may be that the unwritability of the disk is the reason + * it's being detached! + */ + error = vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + /* + * Remove vd from its parent and compact the parent's children. + */ + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + /* + * Remember one of the remaining children so we can get tvd below. + */ + cvd = pvd->vdev_child[pvd->vdev_children - 1]; + + /* + * If we need to remove the remaining child from the list of hot spares, + * do it now, marking the vdev as no longer a spare in the process. + * We must do this before vdev_remove_parent(), because that can + * change the GUID if it creates a new toplevel GUID. For a similar + * reason, we must remove the spare now, in the same txg as the detach; + * otherwise someone could attach a new sibling, change the GUID, and + * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. + */ + if (unspare) { + ASSERT(cvd->vdev_isspare); + spa_spare_remove(cvd); + unspare_guid = cvd->vdev_guid; + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + cvd->vdev_unspare = B_TRUE; + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) { + if (pvd->vdev_ops == &vdev_spare_ops) + cvd->vdev_unspare = B_FALSE; + vdev_remove_parent(cvd); + } + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. 
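+	 * (For example, detaching one side of a two-way mirror removes
+	 * the mirror vdev itself via vdev_remove_parent() above and
+	 * promotes the surviving child.)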
+	 */
+	tvd = cvd->vdev_top;
+	ASSERT(tvd->vdev_parent == rvd);
+
+	/*
+	 * Reevaluate the parent vdev state.
+	 */
+	vdev_propagate_state(cvd);
+
+	/*
+	 * If the 'autoexpand' property is set on the pool then automatically
+	 * try to expand the size of the pool. For example if the device we
+	 * just detached was smaller than the others, it may be possible to
+	 * add metaslabs (i.e. grow the pool). We need to reopen the vdev
+	 * first so that we can obtain the updated sizes of the leaf vdevs.
+	 */
+	if (spa->spa_autoexpand) {
+		vdev_reopen(tvd);
+		vdev_expand(tvd, txg);
+	}
+
+	vdev_config_dirty(tvd);
+
+	/*
+	 * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that
+	 * vd->vdev_detached is set and free vd's DTL object in syncing
+	 * context. But first make sure we're not on any *other* txg's DTL
+	 * list, to prevent vd from being accessed after it's freed.
+	 */
+	vdpath = spa_strdup(vd->vdev_path ? vd->vdev_path : "none");
+	for (int t = 0; t < TXG_SIZE; t++)
+		(void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t);
+	vd->vdev_detached = B_TRUE;
+	vdev_dirty(tvd, VDD_DTL, vd, txg);
+
+	spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE);
+	spa_notify_waiters(spa);
+
+	/* hang on to the spa before we release the lock */
+	spa_open_ref(spa, FTAG);
+
+	error = spa_vdev_exit(spa, vd, txg, 0);
+
+	spa_history_log_internal(spa, "detach", NULL,
+	    "vdev=%s", vdpath);
+	spa_strfree(vdpath);
+
+	/*
+	 * If this was the removal of the original device in a hot spare vdev,
+	 * then we want to go through and remove the device from the hot spare
+	 * list of every other pool.
+	 */
+	if (unspare) {
+		spa_t *altspa = NULL;
+
+		mutex_enter(&spa_namespace_lock);
+		while ((altspa = spa_next(altspa)) != NULL) {
+			if (altspa->spa_state != POOL_STATE_ACTIVE ||
+			    altspa == spa)
+				continue;
+
+			spa_open_ref(altspa, FTAG);
+			mutex_exit(&spa_namespace_lock);
+			(void) spa_vdev_remove(altspa, unspare_guid, B_TRUE);
+			mutex_enter(&spa_namespace_lock);
+			spa_close(altspa, FTAG);
+		}
+		mutex_exit(&spa_namespace_lock);
+
+		/* search the rest of the vdevs for spares to remove */
+		spa_vdev_resilver_done(spa);
+	}
+
+	/* all done with the spa; OK to release */
+	mutex_enter(&spa_namespace_lock);
+	spa_close(spa, FTAG);
+	mutex_exit(&spa_namespace_lock);
+
+	return (error);
+}
+
+static int
+spa_vdev_initialize_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type,
+    list_t *vd_list)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER);
+
+	/* Look up vdev and ensure it's a leaf. */
+	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_detached) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(ENODEV));
+	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(EINVAL));
+	} else if (!vdev_writeable(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(EROFS));
+	}
+	mutex_enter(&vd->vdev_initialize_lock);
+	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+	/*
+	 * When we activate an initialize action we check to see
+	 * if the vdev_initialize_thread is NULL. We do this instead
+	 * of using the vdev_initialize_state since there might be
+	 * a previous initialization process which has completed but
+	 * whose thread has not yet exited.
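+	 * A POOL_INITIALIZE_START request therefore fails with EBUSY
+	 * below while such a thread lingers.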
+ */ + if (cmd_type == POOL_INITIALIZE_START && + (vd->vdev_initialize_thread != NULL || + vd->vdev_top->vdev_removing)) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(EBUSY)); + } else if (cmd_type == POOL_INITIALIZE_CANCEL && + (vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE && + vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED)) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(ESRCH)); + } else if (cmd_type == POOL_INITIALIZE_SUSPEND && + vd->vdev_initialize_state != VDEV_INITIALIZE_ACTIVE) { + mutex_exit(&vd->vdev_initialize_lock); + return (SET_ERROR(ESRCH)); + } + + switch (cmd_type) { + case POOL_INITIALIZE_START: + vdev_initialize(vd); + break; + case POOL_INITIALIZE_CANCEL: + vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED, vd_list); + break; + case POOL_INITIALIZE_SUSPEND: + vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, vd_list); + break; + default: + panic("invalid cmd_type %llu", (unsigned long long)cmd_type); + } + mutex_exit(&vd->vdev_initialize_lock); + + return (0); +} + +int +spa_vdev_initialize(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, + nvlist_t *vdev_errlist) +{ + int total_errors = 0; + list_t vd_list; + + list_create(&vd_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_initialize_node)); + + /* + * We hold the namespace lock through the whole function + * to prevent any changes to the pool while we're starting or + * stopping initialization. The config and state locks are held so that + * we can properly assess the vdev state before we commit to + * the initializing operation. + */ + mutex_enter(&spa_namespace_lock); + + for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL); + pair != NULL; pair = nvlist_next_nvpair(nv, pair)) { + uint64_t vdev_guid = fnvpair_value_uint64(pair); + + int error = spa_vdev_initialize_impl(spa, vdev_guid, cmd_type, + &vd_list); + if (error != 0) { + char guid_as_str[MAXNAMELEN]; + + (void) snprintf(guid_as_str, sizeof (guid_as_str), + "%llu", (unsigned long long)vdev_guid); + fnvlist_add_int64(vdev_errlist, guid_as_str, error); + total_errors++; + } + } + + /* Wait for all initialize threads to stop. */ + vdev_initialize_stop_wait(spa, &vd_list); + + /* Sync out the initializing state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + list_destroy(&vd_list); + + return (total_errors); +} + +static int +spa_vdev_trim_impl(spa_t *spa, uint64_t guid, uint64_t cmd_type, + uint64_t rate, boolean_t partial, boolean_t secure, list_t *vd_list) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + + /* Look up vdev and ensure it's a leaf. 
*/
+	vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE);
+	if (vd == NULL || vd->vdev_detached) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(ENODEV));
+	} else if (!vd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(EINVAL));
+	} else if (!vdev_writeable(vd)) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(EROFS));
+	} else if (!vd->vdev_has_trim) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(EOPNOTSUPP));
+	} else if (secure && !vd->vdev_has_securetrim) {
+		spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+		return (SET_ERROR(EOPNOTSUPP));
+	}
+	mutex_enter(&vd->vdev_trim_lock);
+	spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG);
+
+	/*
+	 * When we activate a TRIM action we check to see if the
+	 * vdev_trim_thread is NULL. We do this instead of using the
+	 * vdev_trim_state since there might be a previous TRIM process
+	 * which has completed but whose thread has not yet exited.
+	 */
+	if (cmd_type == POOL_TRIM_START &&
+	    (vd->vdev_trim_thread != NULL || vd->vdev_top->vdev_removing)) {
+		mutex_exit(&vd->vdev_trim_lock);
+		return (SET_ERROR(EBUSY));
+	} else if (cmd_type == POOL_TRIM_CANCEL &&
+	    (vd->vdev_trim_state != VDEV_TRIM_ACTIVE &&
+	    vd->vdev_trim_state != VDEV_TRIM_SUSPENDED)) {
+		mutex_exit(&vd->vdev_trim_lock);
+		return (SET_ERROR(ESRCH));
+	} else if (cmd_type == POOL_TRIM_SUSPEND &&
+	    vd->vdev_trim_state != VDEV_TRIM_ACTIVE) {
+		mutex_exit(&vd->vdev_trim_lock);
+		return (SET_ERROR(ESRCH));
+	}
+
+	switch (cmd_type) {
+	case POOL_TRIM_START:
+		vdev_trim(vd, rate, partial, secure);
+		break;
+	case POOL_TRIM_CANCEL:
+		vdev_trim_stop(vd, VDEV_TRIM_CANCELED, vd_list);
+		break;
+	case POOL_TRIM_SUSPEND:
+		vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, vd_list);
+		break;
+	default:
+		panic("invalid cmd_type %llu", (unsigned long long)cmd_type);
+	}
+	mutex_exit(&vd->vdev_trim_lock);
+
+	return (0);
+}
+
+/*
+ * Initiates a manual TRIM for the requested vdevs. This kicks off individual
+ * TRIM threads for each child vdev. These threads pass over all of the free
+ * space in the vdev's metaslabs and issue TRIM commands for that space.
+ */
+int
+spa_vdev_trim(spa_t *spa, nvlist_t *nv, uint64_t cmd_type, uint64_t rate,
+    boolean_t partial, boolean_t secure, nvlist_t *vdev_errlist)
+{
+	int total_errors = 0;
+	list_t vd_list;
+
+	list_create(&vd_list, sizeof (vdev_t),
+	    offsetof(vdev_t, vdev_trim_node));
+
+	/*
+	 * We hold the namespace lock through the whole function
+	 * to prevent any changes to the pool while we're starting or
+	 * stopping TRIM. The config and state locks are held so that
+	 * we can properly assess the vdev state before we commit to
+	 * the TRIM operation.
+	 */
+	mutex_enter(&spa_namespace_lock);
+
+	for (nvpair_t *pair = nvlist_next_nvpair(nv, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(nv, pair)) {
+		uint64_t vdev_guid = fnvpair_value_uint64(pair);
+
+		int error = spa_vdev_trim_impl(spa, vdev_guid, cmd_type,
+		    rate, partial, secure, &vd_list);
+		if (error != 0) {
+			char guid_as_str[MAXNAMELEN];
+
+			(void) snprintf(guid_as_str, sizeof (guid_as_str),
+			    "%llu", (unsigned long long)vdev_guid);
+			fnvlist_add_int64(vdev_errlist, guid_as_str, error);
+			total_errors++;
+		}
+	}
+
+	/* Wait for all TRIM threads to stop.
*/ + vdev_trim_stop_wait(spa, &vd_list); + + /* Sync out the TRIM state */ + txg_wait_synced(spa->spa_dsl_pool, 0); + mutex_exit(&spa_namespace_lock); + + list_destroy(&vd_list); + + return (total_errors); +} + +/* + * Split a set of devices from their mirrors, and create a new pool from them. + */ +int +spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) +{ + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + ASSERT(spa_writeable(spa)); + + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_reset_logs(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if (vd->vdev_islog || (vd->vdev_ops != &vdev_indirect_ops && + !vdev_is_concrete(vd))) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = SET_ERROR(EINVAL); + break; + } + } + + /* deal with indirect vdevs */ + if (spa->spa_root_vdev->vdev_child[c]->vdev_ops == + &vdev_indirect_ops) + continue; + + /* which disk is going to be split? 
*/ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = SET_ERROR(EINVAL); + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = SET_ERROR(ENODEV); + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + !vdev_is_concrete(vml[c]) || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_children != 0 || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = SET_ERROR(EINVAL); + break; + } + + if (vdev_dtl_required(vml[c]) || + vdev_resilver_needed(vml[c], NULL, NULL)) { + error = SET_ERROR(EBUSY); + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + + /* transfer per-vdev ZAPs */ + ASSERT3U(vml[c]->vdev_leaf_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_LEAF_ZAP, vml[c]->vdev_leaf_zap)); + + ASSERT3U(vml[c]->vdev_top->vdev_top_zap, !=, 0); + VERIFY0(nvlist_add_uint64(child[c], + ZPOOL_CONFIG_VDEV_TOP_ZAP, + vml[c]->vdev_parent->vdev_top_zap)); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; + } + vdev_reopen(spa->spa_root_vdev); + + /* + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. + */ + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); + + mutex_enter(&spa->spa_props_lock); + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + mutex_exit(&spa->spa_props_lock); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); + + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? 
POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + VERIFY0(nvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS)); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + + /* add the new pool to the namespace */ + newspa = spa_add(newname, config, altroot); + newspa->spa_avz_action = AVZ_ACTION_REBUILD; + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); + + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); + + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); + + /* + * Temporarily stop the initializing and TRIM activity. We set the + * state to ACTIVE so that we know to resume initializing or TRIM + * once the split has completed. + */ + list_t vd_initialize_list; + list_create(&vd_initialize_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_initialize_node)); + + list_t vd_trim_list; + list_create(&vd_trim_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_trim_node)); + + for (c = 0; c < children; c++) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { + mutex_enter(&vml[c]->vdev_initialize_lock); + vdev_initialize_stop(vml[c], + VDEV_INITIALIZE_ACTIVE, &vd_initialize_list); + mutex_exit(&vml[c]->vdev_initialize_lock); + + mutex_enter(&vml[c]->vdev_trim_lock); + vdev_trim_stop(vml[c], VDEV_TRIM_ACTIVE, &vd_trim_list); + mutex_exit(&vml[c]->vdev_trim_lock); + } + } + + vdev_initialize_stop_wait(spa, &vd_initialize_list); + vdev_trim_stop_wait(spa, &vd_trim_list); + + list_destroy(&vd_initialize_list); + list_destroy(&vd_trim_list); + + newspa->spa_config_source = SPA_CONFIG_SRC_SPLIT; + newspa->spa_is_splitting = B_TRUE; + + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE); + if (error) + goto out; + + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); + } + + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } + + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); + + spa_async_resume(newspa); + + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL && vml[c]->vdev_ops != &vdev_indirect_ops) { + vdev_t *tvd = vml[c]->vdev_top; + + /* + * Need to be sure the detachable VDEV is not + * on any *other* txg's DTL list to prevent it + * from being accessed after it's freed. 
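+			 * (This is the same precaution taken in
+			 * spa_vdev_detach() above before setting
+			 * vdev_detached.)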
+ */ + for (int t = 0; t < TXG_SIZE; t++) { + (void) txg_list_remove_this( + &tvd->vdev_dtl_list, vml[c], t); + } + + vdev_split(vml[c]); + if (error == 0) + spa_history_log_internal(spa, "detach", tx, + "vdev=%s", vml[c]->vdev_path); + + vdev_free(vml[c]); + } + } + spa->spa_avz_action = AVZ_ACTION_REBUILD; + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); + + /* split is complete; log a history record */ + spa_history_log_internal(newspa, "split", NULL, + "from pool %s", spa_name(spa)); + + newspa->spa_is_splitting = B_FALSE; + kmem_free(vml, children * sizeof (vdev_t *)); + + /* if we're not going to mount the filesystems in userland, export */ + if (exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); + + return (error); + +out: + spa_unload(newspa); + spa_deactivate(newspa); + spa_remove(newspa); + + txg = spa_vdev_config_enter(spa); + + /* re-online all offlined disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_FALSE; + } + + /* restart initializing or trimming disks as necessary */ + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + + vdev_reopen(spa->spa_root_vdev); + + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, error); + + kmem_free(vml, children * sizeof (vdev_t *)); + return (error); +} + +/* + * Find any device that's done replacing, or a vdev marked 'unspare' that's + * currently spared, so we can detach it. + */ +static vdev_t * +spa_vdev_resilver_done_hunt(vdev_t *vd) +{ + vdev_t *newvd, *oldvd; + + for (int c = 0; c < vd->vdev_children; c++) { + oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); + if (oldvd != NULL) + return (oldvd); + } + + /* + * Check for a completed replacement. We always consider the first + * vdev in the list to be the oldest vdev, and the last one to be + * the newest (see spa_vdev_attach() for how that works). In + * the case where the newest vdev is faulted, we will not automatically + * remove it after a resilver completes. This is OK as it will require + * user intervention to determine which disk the admin wishes to keep. + */ + if (vd->vdev_ops == &vdev_replacing_ops) { + ASSERT(vd->vdev_children > 1); + + newvd = vd->vdev_child[vd->vdev_children - 1]; + oldvd = vd->vdev_child[0]; + + if (vdev_dtl_empty(newvd, DTL_MISSING) && + vdev_dtl_empty(newvd, DTL_OUTAGE) && + !vdev_dtl_required(oldvd)) + return (oldvd); + } + + /* + * Check for a completed resilver with the 'unspare' flag set. + * Also potentially update faulted state. 
+ */ + if (vd->vdev_ops == &vdev_spare_ops) { + vdev_t *first = vd->vdev_child[0]; + vdev_t *last = vd->vdev_child[vd->vdev_children - 1]; + + if (last->vdev_unspare) { + oldvd = first; + newvd = last; + } else if (first->vdev_unspare) { + oldvd = last; + newvd = first; + } else { + oldvd = NULL; + } + + if (oldvd != NULL && + vdev_dtl_empty(newvd, DTL_MISSING) && + vdev_dtl_empty(newvd, DTL_OUTAGE) && + !vdev_dtl_required(oldvd)) + return (oldvd); + + vdev_propagate_state(vd); + + /* + * If there are more than two spares attached to a disk, + * and those spares are not required, then we want to + * attempt to free them up now so that they can be used + * by other pools. Once we're back down to a single + * disk+spare, we stop removing them. + */ + if (vd->vdev_children > 2) { + newvd = vd->vdev_child[1]; + + if (newvd->vdev_isspare && last->vdev_isspare && + vdev_dtl_empty(last, DTL_MISSING) && + vdev_dtl_empty(last, DTL_OUTAGE) && + !vdev_dtl_required(newvd)) + return (newvd); + } + } + + return (NULL); +} + +static void +spa_vdev_resilver_done(spa_t *spa) +{ + vdev_t *vd, *pvd, *ppvd; + uint64_t guid, sguid, pguid, ppguid; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + pvd = vd->vdev_parent; + ppvd = pvd->vdev_parent; + guid = vd->vdev_guid; + pguid = pvd->vdev_guid; + ppguid = ppvd->vdev_guid; + sguid = 0; + /* + * If we have just finished replacing a hot spared device, then + * we need to detach the parent's first child (the original hot + * spare) as well. + */ + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0 && + ppvd->vdev_children == 2) { + ASSERT(pvd->vdev_ops == &vdev_replacing_ops); + sguid = ppvd->vdev_child[1]->vdev_guid; + } + ASSERT(vd->vdev_resilver_txg == 0 || !vdev_dtl_required(vd)); + + spa_config_exit(spa, SCL_ALL, FTAG); + if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) + return; + if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) + return; + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + } + + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * If a detach was not performed above replace waiters will not have + * been notified. In which case we must do so now. + */ + spa_notify_waiters(spa); +} + +/* + * Update the stored path or FRU for this vdev. + */ +static int +spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, + boolean_t ispath) +{ + vdev_t *vd; + boolean_t sync = B_FALSE; + + ASSERT(spa_writeable(spa)); + + spa_vdev_state_enter(spa, SCL_ALL); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENOENT)); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + if (ispath) { + if (strcmp(value, vd->vdev_path) != 0) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + sync = B_TRUE; + } + } else { + if (vd->vdev_fru == NULL) { + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } else if (strcmp(value, vd->vdev_fru) != 0) { + spa_strfree(vd->vdev_fru); + vd->vdev_fru = spa_strdup(value); + sync = B_TRUE; + } + } + + return (spa_vdev_state_exit(spa, sync ? 
vd : NULL, 0)); +} + +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); +} + +int +spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) +{ + return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); +} + +/* + * ========================================================================== + * SPA Scanning + * ========================================================================== + */ +int +spa_scrub_pause_resume(spa_t *spa, pool_scrub_cmd_t cmd) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (SET_ERROR(EBUSY)); + + return (dsl_scrub_set_pause_resume(spa->spa_dsl_pool, cmd)); +} + +int +spa_scan_stop(spa_t *spa) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + if (dsl_scan_resilvering(spa->spa_dsl_pool)) + return (SET_ERROR(EBUSY)); + return (dsl_scan_cancel(spa->spa_dsl_pool)); +} + +int +spa_scan(spa_t *spa, pool_scan_func_t func) +{ + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + + if (func >= POOL_SCAN_FUNCS || func == POOL_SCAN_NONE) + return (SET_ERROR(ENOTSUP)); + + if (func == POOL_SCAN_RESILVER && + !spa_feature_is_enabled(spa, SPA_FEATURE_RESILVER_DEFER)) + return (SET_ERROR(ENOTSUP)); + + /* + * If a resilver was requested, but there is no DTL on a + * writeable leaf device, we have nothing to do. + */ + if (func == POOL_SCAN_RESILVER && + !vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) { + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + return (0); + } + + return (dsl_scan(spa->spa_dsl_pool, func)); +} + +/* + * ========================================================================== + * SPA async task processing + * ========================================================================== + */ + +static void +spa_async_remove(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_remove_wanted) { + vd->vdev_remove_wanted = B_FALSE; + vd->vdev_delayed_close = B_FALSE; + vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); + + /* + * We want to clear the stats, but we don't want to do a full + * vdev_clear() as that will cause us to throw away + * degraded/faulted state as well as attempt to reopen the + * device, all of which is a waste. + */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + + vdev_state_dirty(vd->vdev_top); + } + + for (int c = 0; c < vd->vdev_children; c++) + spa_async_remove(spa, vd->vdev_child[c]); +} + +static void +spa_async_probe(spa_t *spa, vdev_t *vd) +{ + if (vd->vdev_probe_wanted) { + vd->vdev_probe_wanted = B_FALSE; + vdev_reopen(vd); /* vdev_open() does the actual probe */ + } + + for (int c = 0; c < vd->vdev_children; c++) + spa_async_probe(spa, vd->vdev_child[c]); +} + +static void +spa_async_autoexpand(spa_t *spa, vdev_t *vd) +{ + if (!spa->spa_autoexpand) + return; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + spa_async_autoexpand(spa, cvd); + } + + if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) + return; + + spa_event_notify(vd->vdev_spa, vd, NULL, ESC_ZFS_VDEV_AUTOEXPAND); +} + +static void +spa_async_thread(void *arg) +{ + spa_t *spa = (spa_t *)arg; + dsl_pool_t *dp = spa->spa_dsl_pool; + int tasks; + + ASSERT(spa->spa_sync_on); + + mutex_enter(&spa->spa_async_lock); + tasks = spa->spa_async_tasks; + spa->spa_async_tasks = 0; + mutex_exit(&spa->spa_async_lock); + + /* + * See if the config needs to be updated. 
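+	 * We sample the space of the normal, special and dedup metaslab
+	 * classes before and after the update so that any growth can be
+	 * logged below.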
+ */ + if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + uint64_t old_space, new_space; + + mutex_enter(&spa_namespace_lock); + old_space = metaslab_class_get_space(spa_normal_class(spa)); + old_space += metaslab_class_get_space(spa_special_class(spa)); + old_space += metaslab_class_get_space(spa_dedup_class(spa)); + + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + + new_space = metaslab_class_get_space(spa_normal_class(spa)); + new_space += metaslab_class_get_space(spa_special_class(spa)); + new_space += metaslab_class_get_space(spa_dedup_class(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * If the pool grew as a result of the config update, + * then log an internal history event. + */ + if (new_space != old_space) { + spa_history_log_internal(spa, "vdev online", NULL, + "pool '%s' size: %llu(+%llu)", + spa_name(spa), (u_longlong_t)new_space, + (u_longlong_t)(new_space - old_space)); + } + } + + /* + * See if any devices need to be marked REMOVED. + */ + if (tasks & SPA_ASYNC_REMOVE) { + spa_vdev_state_enter(spa, SCL_NONE); + spa_async_remove(spa, spa->spa_root_vdev); + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) + spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); + for (int i = 0; i < spa->spa_spares.sav_count; i++) + spa_async_remove(spa, spa->spa_spares.sav_vdevs[i]); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_async_autoexpand(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + + /* + * See if any devices need to be probed. + */ + if (tasks & SPA_ASYNC_PROBE) { + spa_vdev_state_enter(spa, SCL_NONE); + spa_async_probe(spa, spa->spa_root_vdev); + (void) spa_vdev_state_exit(spa, NULL, 0); + } + + /* + * If any devices are done replacing, detach them. + */ + if (tasks & SPA_ASYNC_RESILVER_DONE) + spa_vdev_resilver_done(spa); + + /* + * If any devices are done replacing, detach them. Then if no + * top-level vdevs are rebuilding attempt to kick off a scrub. + */ + if (tasks & SPA_ASYNC_REBUILD_DONE) { + spa_vdev_resilver_done(spa); + + if (!vdev_rebuild_active(spa->spa_root_vdev)) + (void) dsl_scan(spa->spa_dsl_pool, POOL_SCAN_SCRUB); + } + + /* + * Kick off a resilver. + */ + if (tasks & SPA_ASYNC_RESILVER && + !vdev_rebuild_active(spa->spa_root_vdev) && + (!dsl_scan_resilvering(dp) || + !spa_feature_is_enabled(dp->dp_spa, SPA_FEATURE_RESILVER_DEFER))) + dsl_scan_restart_resilver(dp, 0); + + if (tasks & SPA_ASYNC_INITIALIZE_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_initialize_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + if (tasks & SPA_ASYNC_TRIM_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_trim_restart(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + if (tasks & SPA_ASYNC_AUTOTRIM_RESTART) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_autotrim_restart(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* + * Kick off L2 cache whole device TRIM. 
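+	 * vdev_trim_l2arc() below TRIMs the pool's cache devices in
+	 * their entirety.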
+ */ + if (tasks & SPA_ASYNC_L2CACHE_TRIM) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_trim_l2arc(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* + * Kick off L2 cache rebuilding. + */ + if (tasks & SPA_ASYNC_L2CACHE_REBUILD) { + mutex_enter(&spa_namespace_lock); + spa_config_enter(spa, SCL_L2ARC, FTAG, RW_READER); + l2arc_spa_rebuild_start(spa); + spa_config_exit(spa, SCL_L2ARC, FTAG); + mutex_exit(&spa_namespace_lock); + } + + /* + * Let the world know that we're done. + */ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_thread = NULL; + cv_broadcast(&spa->spa_async_cv); + mutex_exit(&spa->spa_async_lock); + thread_exit(); +} + +void +spa_async_suspend(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + spa->spa_async_suspended++; + while (spa->spa_async_thread != NULL) + cv_wait(&spa->spa_async_cv, &spa->spa_async_lock); + mutex_exit(&spa->spa_async_lock); + + spa_vdev_remove_suspend(spa); + + zthr_t *condense_thread = spa->spa_condense_zthr; + if (condense_thread != NULL) + zthr_cancel(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL) + zthr_cancel(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_cancel(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_cancel(ll_condense_thread); +} + +void +spa_async_resume(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + ASSERT(spa->spa_async_suspended != 0); + spa->spa_async_suspended--; + mutex_exit(&spa->spa_async_lock); + spa_restart_removal(spa); + + zthr_t *condense_thread = spa->spa_condense_zthr; + if (condense_thread != NULL) + zthr_resume(condense_thread); + + zthr_t *discard_thread = spa->spa_checkpoint_discard_zthr; + if (discard_thread != NULL) + zthr_resume(discard_thread); + + zthr_t *ll_delete_thread = spa->spa_livelist_delete_zthr; + if (ll_delete_thread != NULL) + zthr_resume(ll_delete_thread); + + zthr_t *ll_condense_thread = spa->spa_livelist_condense_zthr; + if (ll_condense_thread != NULL) + zthr_resume(ll_condense_thread); +} + +static boolean_t +spa_async_tasks_pending(spa_t *spa) +{ + uint_t non_config_tasks; + uint_t config_task; + boolean_t config_task_suspended; + + non_config_tasks = spa->spa_async_tasks & ~SPA_ASYNC_CONFIG_UPDATE; + config_task = spa->spa_async_tasks & SPA_ASYNC_CONFIG_UPDATE; + if (spa->spa_ccw_fail_time == 0) { + config_task_suspended = B_FALSE; + } else { + config_task_suspended = + (gethrtime() - spa->spa_ccw_fail_time) < + ((hrtime_t)zfs_ccw_retry_interval * NANOSEC); + } + + return (non_config_tasks || (config_task && !config_task_suspended)); +} + +static void +spa_async_dispatch(spa_t *spa) +{ + mutex_enter(&spa->spa_async_lock); + if (spa_async_tasks_pending(spa) && + !spa->spa_async_suspended && + spa->spa_async_thread == NULL) + spa->spa_async_thread = thread_create(NULL, 0, + spa_async_thread, spa, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&spa->spa_async_lock); +} + +void +spa_async_request(spa_t *spa, int task) +{ + zfs_dbgmsg("spa=%s async request task=%u", spa->spa_name, task); + mutex_enter(&spa->spa_async_lock); + spa->spa_async_tasks |= task; + mutex_exit(&spa->spa_async_lock); +} + +int +spa_async_tasks(spa_t *spa) +{ + return (spa->spa_async_tasks); +} + +/* + * ========================================================================== + * SPA syncing routines + * 
========================================================================== + */ + + +static int +bpobj_enqueue_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + bpobj_t *bpo = arg; + bpobj_enqueue(bpo, bp, bp_freed, tx); + return (0); +} + +int +bpobj_enqueue_alloc_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_FALSE, tx)); +} + +int +bpobj_enqueue_free_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + return (bpobj_enqueue_cb(arg, bp, B_TRUE, tx)); +} + +static int +spa_free_sync_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) +{ + zio_t *pio = arg; + + zio_nowait(zio_free_sync(pio, pio->io_spa, dmu_tx_get_txg(tx), bp, + pio->io_flags)); + return (0); +} + +static int +bpobj_spa_free_sync_cb(void *arg, const blkptr_t *bp, boolean_t bp_freed, + dmu_tx_t *tx) +{ + ASSERT(!bp_freed); + return (spa_free_sync_cb(arg, bp, tx)); +} + +/* + * Note: this simple function is not inlined to make it easier to dtrace the + * amount of time spent syncing frees. + */ +static void +spa_sync_frees(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx) +{ + zio_t *zio = zio_root(spa, NULL, NULL, 0); + bplist_iterate(bpl, spa_free_sync_cb, zio, tx); + VERIFY(zio_wait(zio) == 0); +} + +/* + * Note: this simple function is not inlined to make it easier to dtrace the + * amount of time spent syncing deferred frees. + */ +static void +spa_sync_deferred_frees(spa_t *spa, dmu_tx_t *tx) +{ + if (spa_sync_pass(spa) != 1) + return; + + /* + * Note: + * If the log space map feature is active, we stop deferring + * frees to the next TXG and therefore running this function + * would be considered a no-op as spa_deferred_bpobj should + * not have any entries. + * + * That said we run this function anyway (instead of returning + * immediately) for the edge-case scenario where we just + * activated the log space map feature in this TXG but we have + * deferred frees from the previous TXG. + */ + zio_t *zio = zio_root(spa, NULL, NULL, 0); + VERIFY3U(bpobj_iterate(&spa->spa_deferred_bpobj, + bpobj_spa_free_sync_cb, zio, tx), ==, 0); + VERIFY0(zio_wait(zio)); +} + +static void +spa_sync_nvlist(spa_t *spa, uint64_t obj, nvlist_t *nv, dmu_tx_t *tx) +{ + char *packed = NULL; + size_t bufsize; + size_t nvsize = 0; + dmu_buf_t *db; + + VERIFY(nvlist_size(nv, &nvsize, NV_ENCODE_XDR) == 0); + + /* + * Write full (SPA_CONFIG_BLOCKSIZE) blocks of configuration + * information. This avoids the dmu_buf_will_dirty() path and + * saves us a pre-read to get data we don't actually care about. + */ + bufsize = P2ROUNDUP((uint64_t)nvsize, SPA_CONFIG_BLOCKSIZE); + packed = vmem_alloc(bufsize, KM_SLEEP); + + VERIFY(nvlist_pack(nv, &packed, &nvsize, NV_ENCODE_XDR, + KM_SLEEP) == 0); + bzero(packed + nvsize, bufsize - nvsize); + + dmu_write(spa->spa_meta_objset, obj, 0, bufsize, packed, tx); + + vmem_free(packed, bufsize); + + VERIFY(0 == dmu_bonus_hold(spa->spa_meta_objset, obj, FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = nvsize; + dmu_buf_rele(db, FTAG); +} + +static void +spa_sync_aux_dev(spa_t *spa, spa_aux_vdev_t *sav, dmu_tx_t *tx, + const char *config, const char *entry) +{ + nvlist_t *nvroot; + nvlist_t **list; + int i; + + if (!sav->sav_sync) + return; + + /* + * Update the MOS nvlist describing the list of available devices. + * spa_validate_aux() will have already made sure this nvlist is + * valid and the vdevs are labeled appropriately. 
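+	 * On the first sync the packed-nvlist object is allocated below
+	 * and registered under 'entry' in the MOS directory.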
+ */ + if (sav->sav_object == 0) { + sav->sav_object = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_PACKED_NVLIST, 1 << 14, DMU_OT_PACKED_NVLIST_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, entry, sizeof (uint64_t), 1, + &sav->sav_object, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (sav->sav_count == 0) { + VERIFY(nvlist_add_nvlist_array(nvroot, config, NULL, 0) == 0); + } else { + list = kmem_alloc(sav->sav_count*sizeof (void *), KM_SLEEP); + for (i = 0; i < sav->sav_count; i++) + list[i] = vdev_config_generate(spa, sav->sav_vdevs[i], + B_FALSE, VDEV_CONFIG_L2CACHE); + VERIFY(nvlist_add_nvlist_array(nvroot, config, list, + sav->sav_count) == 0); + for (i = 0; i < sav->sav_count; i++) + nvlist_free(list[i]); + kmem_free(list, sav->sav_count * sizeof (void *)); + } + + spa_sync_nvlist(spa, sav->sav_object, nvroot, tx); + nvlist_free(nvroot); + + sav->sav_sync = B_FALSE; +} + +/* + * Rebuild spa's all-vdev ZAP from the vdev ZAPs indicated in each vdev_t. + * The all-vdev ZAP must be empty. + */ +static void +spa_avz_build(vdev_t *vd, uint64_t avz, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + + if (vd->vdev_top_zap != 0) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_top_zap, tx)); + } + if (vd->vdev_leaf_zap != 0) { + VERIFY0(zap_add_int(spa->spa_meta_objset, avz, + vd->vdev_leaf_zap, tx)); + } + for (uint64_t i = 0; i < vd->vdev_children; i++) { + spa_avz_build(vd->vdev_child[i], avz, tx); + } +} + +static void +spa_sync_config_object(spa_t *spa, dmu_tx_t *tx) +{ + nvlist_t *config; + + /* + * If the pool is being imported from a pre-per-vdev-ZAP version of ZFS, + * its config may not be dirty but we still need to build per-vdev ZAPs. + * Similarly, if the pool is being assembled (e.g. after a split), we + * need to rebuild the AVZ although the config may not be dirty. 
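+	 * (AVZ_ACTION_REBUILD is set, for instance, by
+	 * spa_vdev_split_mirror() above for both halves of the split.)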
+ */ + if (list_is_empty(&spa->spa_config_dirty_list) && + spa->spa_avz_action == AVZ_ACTION_NONE) + return; + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + ASSERT(spa->spa_avz_action == AVZ_ACTION_NONE || + spa->spa_avz_action == AVZ_ACTION_INITIALIZE || + spa->spa_all_vdev_zaps != 0); + + if (spa->spa_avz_action == AVZ_ACTION_REBUILD) { + /* Make and build the new AVZ */ + uint64_t new_avz = zap_create(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + spa_avz_build(spa->spa_root_vdev, new_avz, tx); + + /* Diff old AVZ with new one */ + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_all_vdev_zaps); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t vdzap = za.za_first_integer; + if (zap_lookup_int(spa->spa_meta_objset, new_avz, + vdzap) == ENOENT) { + /* + * ZAP is listed in old AVZ but not in new one; + * destroy it + */ + VERIFY0(zap_destroy(spa->spa_meta_objset, vdzap, + tx)); + } + } + + zap_cursor_fini(&zc); + + /* Destroy the old AVZ */ + VERIFY0(zap_destroy(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, tx)); + + /* Replace the old AVZ in the dir obj with the new one */ + VERIFY0(zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, + sizeof (new_avz), 1, &new_avz, tx)); + + spa->spa_all_vdev_zaps = new_avz; + } else if (spa->spa_avz_action == AVZ_ACTION_DESTROY) { + zap_cursor_t zc; + zap_attribute_t za; + + /* Walk through the AVZ and destroy all listed ZAPs */ + for (zap_cursor_init(&zc, spa->spa_meta_objset, + spa->spa_all_vdev_zaps); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + uint64_t zap = za.za_first_integer; + VERIFY0(zap_destroy(spa->spa_meta_objset, zap, tx)); + } + + zap_cursor_fini(&zc); + + /* Destroy and unlink the AVZ itself */ + VERIFY0(zap_destroy(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, tx)); + VERIFY0(zap_remove(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_VDEV_ZAP_MAP, tx)); + spa->spa_all_vdev_zaps = 0; + } + + if (spa->spa_all_vdev_zaps == 0) { + spa->spa_all_vdev_zaps = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_VDEV_ZAP_MAP, tx); + } + spa->spa_avz_action = AVZ_ACTION_NONE; + + /* Create ZAPs for vdevs that don't have them. */ + vdev_construct_zaps(spa->spa_root_vdev, tx); + + config = spa_config_generate(spa, spa->spa_root_vdev, + dmu_tx_get_txg(tx), B_FALSE); + + /* + * If we're upgrading the spa version then make sure that + * the config object gets updated with the correct version. + */ + if (spa->spa_ubsync.ub_version < spa->spa_uberblock.ub_version) + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa->spa_uberblock.ub_version); + + spa_config_exit(spa, SCL_STATE, FTAG); + + nvlist_free(spa->spa_config_syncing); + spa->spa_config_syncing = config; + + spa_sync_nvlist(spa, spa->spa_config_object, config, tx); +} + +static void +spa_sync_version(void *arg, dmu_tx_t *tx) +{ + uint64_t *versionp = arg; + uint64_t version = *versionp; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + /* + * Setting the version is special cased when first creating the pool. + */ + ASSERT(tx->tx_txg != TXG_INITIAL); + + ASSERT(SPA_VERSION_IS_SUPPORTED(version)); + ASSERT(version >= spa_version(spa)); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); + spa_history_log_internal(spa, "set", tx, "version=%lld", + (longlong_t)version); +} + +/* + * Set zpool properties. 
+ */ +static void +spa_sync_props(void *arg, dmu_tx_t *tx) +{ + nvlist_t *nvp = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + objset_t *mos = spa->spa_meta_objset; + nvpair_t *elem = NULL; + + mutex_enter(&spa->spa_props_lock); + + while ((elem = nvlist_next_nvpair(nvp, elem))) { + uint64_t intval; + char *strval, *fname; + zpool_prop_t prop; + const char *propname; + zprop_type_t proptype; + spa_feature_t fid; + + switch (prop = zpool_name_to_prop(nvpair_name(elem))) { + case ZPOOL_PROP_INVAL: + /* + * We checked this earlier in spa_prop_validate(). + */ + ASSERT(zpool_prop_feature(nvpair_name(elem))); + + fname = strchr(nvpair_name(elem), '@') + 1; + VERIFY0(zfeature_lookup_name(fname, &fid)); + + spa_feature_enable(spa, fid, tx); + spa_history_log_internal(spa, "set", tx, + "%s=enabled", nvpair_name(elem)); + break; + + case ZPOOL_PROP_VERSION: + intval = fnvpair_value_uint64(elem); + /* + * The version is synced separately before other + * properties and should be correct by now. + */ + ASSERT3U(spa_version(spa), >=, intval); + break; + + case ZPOOL_PROP_ALTROOT: + /* + * 'altroot' is a non-persistent property. It should + * have been set temporarily at creation or import time. + */ + ASSERT(spa->spa_root != NULL); + break; + + case ZPOOL_PROP_READONLY: + case ZPOOL_PROP_CACHEFILE: + /* + * 'readonly' and 'cachefile' are also non-persistent + * properties. + */ + break; + case ZPOOL_PROP_COMMENT: + strval = fnvpair_value_string(elem); + if (spa->spa_comment != NULL) + spa_strfree(spa->spa_comment); + spa->spa_comment = spa_strdup(strval); + /* + * We need to dirty the configuration on all the vdevs + * so that their labels get updated. It's unnecessary + * to do this for pool creation since the vdev's + * configuration has already been dirtied. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + break; + default: + /* + * Set pool property values in the poolprops mos object. 
+ */ + if (spa->spa_pool_props_object == 0) { + spa->spa_pool_props_object = + zap_create_link(mos, DMU_OT_POOL_PROPS, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_PROPS, + tx); + } + + /* normalize the property name */ + propname = zpool_prop_to_name(prop); + proptype = zpool_prop_get_type(prop); + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + ASSERT(proptype == PROP_TYPE_STRING); + strval = fnvpair_value_string(elem); + VERIFY0(zap_update(mos, + spa->spa_pool_props_object, propname, + 1, strlen(strval) + 1, strval, tx)); + spa_history_log_internal(spa, "set", tx, + "%s=%s", nvpair_name(elem), strval); + } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { + intval = fnvpair_value_uint64(elem); + + if (proptype == PROP_TYPE_INDEX) { + const char *unused; + VERIFY0(zpool_prop_index_to_string( + prop, intval, &unused)); + } + VERIFY0(zap_update(mos, + spa->spa_pool_props_object, propname, + 8, 1, &intval, tx)); + spa_history_log_internal(spa, "set", tx, + "%s=%lld", nvpair_name(elem), + (longlong_t)intval); + } else { + ASSERT(0); /* not allowed */ + } + + switch (prop) { + case ZPOOL_PROP_DELEGATION: + spa->spa_delegation = intval; + break; + case ZPOOL_PROP_BOOTFS: + spa->spa_bootfs = intval; + break; + case ZPOOL_PROP_FAILUREMODE: + spa->spa_failmode = intval; + break; + case ZPOOL_PROP_AUTOTRIM: + spa->spa_autotrim = intval; + spa_async_request(spa, + SPA_ASYNC_AUTOTRIM_RESTART); + break; + case ZPOOL_PROP_AUTOEXPAND: + spa->spa_autoexpand = intval; + if (tx->tx_txg != TXG_INITIAL) + spa_async_request(spa, + SPA_ASYNC_AUTOEXPAND); + break; + case ZPOOL_PROP_MULTIHOST: + spa->spa_multihost = intval; + break; + default: + break; + } + } + + } + + mutex_exit(&spa->spa_props_lock); +} + +/* + * Perform one-time upgrade on-disk changes. spa_version() does not + * reflect the new version this txg, so there must be no changes this + * txg to anything that the upgrade code depends on after it executes. + * Therefore this must be called after dsl_pool_sync() does the sync + * tasks. 
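+ * (See spa_sync_iterate_to_convergence() below, which calls this
+ * after dsl_pool_sync().)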
+ */
+static void
+spa_sync_upgrades(spa_t *spa, dmu_tx_t *tx)
+{
+	if (spa_sync_pass(spa) != 1)
+		return;
+
+	dsl_pool_t *dp = spa->spa_dsl_pool;
+	rrw_enter(&dp->dp_config_rwlock, RW_WRITER, FTAG);
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_ORIGIN &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_ORIGIN) {
+		dsl_pool_create_origin(dp, tx);
+
+		/* Keeping the origin open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_NEXT_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_NEXT_CLONES) {
+		dsl_pool_upgrade_clones(dp, tx);
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_DIR_CLONES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_DIR_CLONES) {
+		dsl_pool_upgrade_dir_clones(dp, tx);
+
+		/* Keeping the freedir open increases spa_minref */
+		spa->spa_minref += 3;
+	}
+
+	if (spa->spa_ubsync.ub_version < SPA_VERSION_FEATURES &&
+	    spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+		spa_feature_create_zap_objects(spa, tx);
+	}
+
+	/*
+	 * The LZ4_COMPRESS feature's behaviour was changed to
+	 * activate_on_enable when the possibility to use lz4 compression
+	 * for metadata was added. Old pools that have this feature enabled
+	 * must be upgraded to have this feature active.
+	 */
+	if (spa->spa_uberblock.ub_version >= SPA_VERSION_FEATURES) {
+		boolean_t lz4_en = spa_feature_is_enabled(spa,
+		    SPA_FEATURE_LZ4_COMPRESS);
+		boolean_t lz4_ac = spa_feature_is_active(spa,
+		    SPA_FEATURE_LZ4_COMPRESS);
+
+		if (lz4_en && !lz4_ac)
+			spa_feature_incr(spa, SPA_FEATURE_LZ4_COMPRESS, tx);
+	}
+
+	/*
+	 * If we haven't written the salt, do so now. Note that the
+	 * feature may not be activated yet, but that's fine since
+	 * the presence of this ZAP entry is backwards compatible.
+	 */
+	if (zap_contains(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_CHECKSUM_SALT) == ENOENT) {
+		VERIFY0(zap_add(spa->spa_meta_objset,
+		    DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CHECKSUM_SALT, 1,
+		    sizeof (spa->spa_cksum_salt.zcs_bytes),
+		    spa->spa_cksum_salt.zcs_bytes, tx));
+	}
+
+	rrw_exit(&dp->dp_config_rwlock, FTAG);
+}
+
+static void
+vdev_indirect_state_sync_verify(vdev_t *vd)
+{
+	vdev_indirect_mapping_t *vim __maybe_unused = vd->vdev_indirect_mapping;
+	vdev_indirect_births_t *vib __maybe_unused = vd->vdev_indirect_births;
+
+	if (vd->vdev_ops == &vdev_indirect_ops) {
+		ASSERT(vim != NULL);
+		ASSERT(vib != NULL);
+	}
+
+	uint64_t obsolete_sm_object = 0;
+	ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object));
+	if (obsolete_sm_object != 0) {
+		ASSERT(vd->vdev_obsolete_sm != NULL);
+		ASSERT(vd->vdev_removing ||
+		    vd->vdev_ops == &vdev_indirect_ops);
+		ASSERT(vdev_indirect_mapping_num_entries(vim) > 0);
+		ASSERT(vdev_indirect_mapping_bytes_mapped(vim) > 0);
+		ASSERT3U(obsolete_sm_object, ==,
+		    space_map_object(vd->vdev_obsolete_sm));
+		ASSERT3U(vdev_indirect_mapping_bytes_mapped(vim), >=,
+		    space_map_allocated(vd->vdev_obsolete_sm));
+	}
+	ASSERT(vd->vdev_obsolete_segments != NULL);
+
+	/*
+	 * Since frees / remaps to an indirect vdev can only
+	 * happen in syncing context, the obsolete segments
+	 * tree must be empty when we start syncing.
+	 */
+	ASSERT0(range_tree_space(vd->vdev_obsolete_segments));
+}
+
+/*
+ * Set the top-level vdev's max queue depth. Evaluate each top-level's
+ * async write queue depth in case it changed. The max queue depth will
+ * not change in the middle of syncing out this txg.
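+ * (The limit is computed below as zfs_vdev_async_write_max_active *
+ * zfs_vdev_queue_depth_pct / 100.)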
+ */ +static void +spa_sync_adjust_vdev_max_queue_depth(spa_t *spa) +{ + ASSERT(spa_writeable(spa)); + + vdev_t *rvd = spa->spa_root_vdev; + uint32_t max_queue_depth = zfs_vdev_async_write_max_active * + zfs_vdev_queue_depth_pct / 100; + metaslab_class_t *normal = spa_normal_class(spa); + metaslab_class_t *special = spa_special_class(spa); + metaslab_class_t *dedup = spa_dedup_class(spa); + + uint64_t slots_per_allocator = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + metaslab_group_t *mg = tvd->vdev_mg; + if (mg == NULL || !metaslab_group_initialized(mg)) + continue; + + metaslab_class_t *mc = mg->mg_class; + if (mc != normal && mc != special && mc != dedup) + continue; + + /* + * It is safe to do a lock-free check here because only async + * allocations look at mg_max_alloc_queue_depth, and async + * allocations all happen from spa_sync(). + */ + for (int i = 0; i < mg->mg_allocators; i++) { + ASSERT0(zfs_refcount_count( + &(mg->mg_allocator[i].mga_alloc_queue_depth))); + } + mg->mg_max_alloc_queue_depth = max_queue_depth; + + for (int i = 0; i < mg->mg_allocators; i++) { + mg->mg_allocator[i].mga_cur_max_alloc_queue_depth = + zfs_vdev_def_queue_depth; + } + slots_per_allocator += zfs_vdev_def_queue_depth; + } + + for (int i = 0; i < spa->spa_alloc_count; i++) { + ASSERT0(zfs_refcount_count(&normal->mc_alloc_slots[i])); + ASSERT0(zfs_refcount_count(&special->mc_alloc_slots[i])); + ASSERT0(zfs_refcount_count(&dedup->mc_alloc_slots[i])); + normal->mc_alloc_max_slots[i] = slots_per_allocator; + special->mc_alloc_max_slots[i] = slots_per_allocator; + dedup->mc_alloc_max_slots[i] = slots_per_allocator; + } + normal->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + special->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; + dedup->mc_alloc_throttle_enabled = zio_dva_throttle_enabled; +} + +static void +spa_sync_condense_indirect(spa_t *spa, dmu_tx_t *tx) +{ + ASSERT(spa_writeable(spa)); + + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + vdev_indirect_state_sync_verify(vd); + + if (vdev_indirect_should_condense(vd)) { + spa_condense_indirect_start_sync(vd, tx); + break; + } + } +} + +static void +spa_sync_iterate_to_convergence(spa_t *spa, dmu_tx_t *tx) +{ + objset_t *mos = spa->spa_meta_objset; + dsl_pool_t *dp = spa->spa_dsl_pool; + uint64_t txg = tx->tx_txg; + bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK]; + + do { + int pass = ++spa->spa_sync_pass; + + spa_sync_config_object(spa, tx); + spa_sync_aux_dev(spa, &spa->spa_spares, tx, + ZPOOL_CONFIG_SPARES, DMU_POOL_SPARES); + spa_sync_aux_dev(spa, &spa->spa_l2cache, tx, + ZPOOL_CONFIG_L2CACHE, DMU_POOL_L2CACHE); + spa_errlog_sync(spa, txg); + dsl_pool_sync(dp, txg); + + if (pass < zfs_sync_pass_deferred_free || + spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + /* + * If the log space map feature is active we don't + * care about deferred frees and the deferred bpobj + * as the log space map should effectively have the + * same results (i.e. appending only to one object). + */ + spa_sync_frees(spa, free_bpl, tx); + } else { + /* + * We can not defer frees in pass 1, because + * we sync the deferred frees later in pass 1. 
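To make the arithmetic concrete: max_queue_depth is zfs_vdev_async_write_max_active scaled by zfs_vdev_queue_depth_pct, and each eligible metaslab group contributes zfs_vdev_def_queue_depth slots per allocator. A standalone sketch using assumed default tunable values (10, 1000, and 32; illustrative, not quoted from this import):

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        uint32_t zfs_vdev_async_write_max_active = 10;  /* assumed default */
        uint32_t zfs_vdev_queue_depth_pct = 1000;       /* assumed default */
        uint32_t zfs_vdev_def_queue_depth = 32;         /* assumed default */
        int eligible_top_level_vdevs = 4;

        uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
            zfs_vdev_queue_depth_pct / 100;
        uint64_t slots_per_allocator =
            (uint64_t)eligible_top_level_vdevs * zfs_vdev_def_queue_depth;

        /* 10 * 1000 / 100 = 100; 4 * 32 = 128 */
        printf("max_queue_depth=%u slots_per_allocator=%llu\n",
            max_queue_depth, (unsigned long long)slots_per_allocator);
        return (0);
    }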
+ */ + ASSERT3U(pass, >, 1); + bplist_iterate(free_bpl, bpobj_enqueue_alloc_cb, + &spa->spa_deferred_bpobj, tx); + } + + ddt_sync(spa, txg); + dsl_scan_sync(dp, tx); + svr_sync(spa, tx); + spa_sync_upgrades(spa, tx); + + spa_flush_metaslabs(spa, tx); + + vdev_t *vd = NULL; + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + != NULL) + vdev_sync(vd, txg); + + /* + * Note: We need to check if the MOS is dirty because we could + * have marked the MOS dirty without updating the uberblock + * (e.g. if we have sync tasks but no dirty user data). We need + * to check the uberblock's rootbp because it is updated if we + * have synced out dirty data (though in this case the MOS will + * most likely also be dirty due to second order effects, we + * don't want to rely on that here). + */ + if (pass == 1 && + spa->spa_uberblock.ub_rootbp.blk_birth < txg && + !dmu_objset_is_dirty(mos, txg)) { + /* + * Nothing changed on the first pass, therefore this + * TXG is a no-op. Avoid syncing deferred frees, so + * that we can keep this TXG as a no-op. + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&dp->dp_sync_tasks, txg)); + ASSERT(txg_list_empty(&dp->dp_early_sync_tasks, txg)); + break; + } + + spa_sync_deferred_frees(spa, tx); + } while (dmu_objset_is_dirty(mos, txg)); +} + +/* + * Rewrite the vdev configuration (which includes the uberblock) to + * commit the transaction group. + * + * If there are no dirty vdevs, we sync the uberblock to a few random + * top-level vdevs that are known to be visible in the config cache + * (see spa_vdev_add() for a complete description). If there *are* dirty + * vdevs, sync the uberblock to all vdevs. + */ +static void +spa_sync_rewrite_vdev_config(spa_t *spa, dmu_tx_t *tx) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg = tx->tx_txg; + + for (;;) { + int error = 0; + + /* + * We hold SCL_STATE to prevent vdev open/close/etc. + * while we're attempting to write the vdev labels. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + + if (list_is_empty(&spa->spa_config_dirty_list)) { + vdev_t *svd[SPA_SYNC_MIN_VDEVS] = { NULL }; + int svdcount = 0; + int children = rvd->vdev_children; + int c0 = spa_get_random(children); + + for (int c = 0; c < children; c++) { + vdev_t *vd = + rvd->vdev_child[(c0 + c) % children]; + + /* Stop when revisiting the first vdev */ + if (c > 0 && svd[0] == vd) + break; + + if (vd->vdev_ms_array == 0 || + vd->vdev_islog || + !vdev_is_concrete(vd)) + continue; + + svd[svdcount++] = vd; + if (svdcount == SPA_SYNC_MIN_VDEVS) + break; + } + error = vdev_config_sync(svd, svdcount, txg); + } else { + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg); + } + + if (error == 0) + spa->spa_last_synced_guid = rvd->vdev_guid; + + spa_config_exit(spa, SCL_STATE, FTAG); + + if (error == 0) + break; + zio_suspend(spa, NULL, ZIO_SUSPEND_IOERR); + zio_resume_wait(spa); + } +} + +/* + * Sync the specified transaction group. New blocks may be dirtied as + * part of the process, so we iterate until it converges. + */ +void +spa_sync(spa_t *spa, uint64_t txg) +{ + vdev_t *vd = NULL; + + VERIFY(spa_writeable(spa)); + + /* + * Wait for i/os issued in open context that need to complete + * before this txg syncs. + */ + (void) zio_wait(spa->spa_txg_zio[txg & TXG_MASK]); + spa->spa_txg_zio[txg & TXG_MASK] = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL); + + /* + * Lock out configuration changes. 
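The control flow above is a fixed-point loop: a sync pass may dirty more MOS data, so passes repeat until one leaves nothing dirty. In miniature:

    #include <stdio.h>

    int
    main(void)
    {
        int dirty = 3;          /* pretend work that re-dirties twice */
        int pass = 0;

        do {
            pass++;
            dirty--;            /* a sync pass consumes one wave of dirt */
            printf("pass %d, dirty after pass: %d\n", pass, dirty);
        } while (dirty > 0);
        return (0);
    }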
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + spa->spa_syncing_txg = txg; + spa->spa_sync_pass = 0; + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } + + /* + * If there are any pending vdev state changes, convert them + * into config changes that go out with this transaction group. + */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + spa_config_exit(spa, SCL_STATE, FTAG); + + dsl_pool_t *dp = spa->spa_dsl_pool; + dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); + + spa->spa_sync_starttime = gethrtime(); + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + NSEC_TO_TICK(spa->spa_deadman_synctime)); + + /* + * If we are upgrading to SPA_VERSION_RAIDZ_DEFLATE this txg, + * set spa_deflate if we have no raid-z vdevs. + */ + if (spa->spa_ubsync.ub_version < SPA_VERSION_RAIDZ_DEFLATE && + spa->spa_uberblock.ub_version >= SPA_VERSION_RAIDZ_DEFLATE) { + vdev_t *rvd = spa->spa_root_vdev; + + int i; + for (i = 0; i < rvd->vdev_children; i++) { + vd = rvd->vdev_child[i]; + if (vd->vdev_deflate_ratio != SPA_MINBLOCKSIZE) + break; + } + if (i == rvd->vdev_children) { + spa->spa_deflate = TRUE; + VERIFY0(zap_add(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, + sizeof (uint64_t), 1, &spa->spa_deflate, tx)); + } + } + + spa_sync_adjust_vdev_max_queue_depth(spa); + + spa_sync_condense_indirect(spa, tx); + + spa_sync_iterate_to_convergence(spa, tx); + +#ifdef ZFS_DEBUG + if (!list_is_empty(&spa->spa_config_dirty_list)) { + /* + * Make sure that the number of ZAPs for all the vdevs matches + * the number of ZAPs in the per-vdev ZAP list. This only gets + * called if the config is dirty; otherwise there may be + * outstanding AVZ operations that weren't completed in + * spa_sync_config_object. + */ + uint64_t all_vdev_zap_entry_count; + ASSERT0(zap_count(spa->spa_meta_objset, + spa->spa_all_vdev_zaps, &all_vdev_zap_entry_count)); + ASSERT3U(vdev_count_verify_zaps(spa->spa_root_vdev), ==, + all_vdev_zap_entry_count); + } +#endif + + if (spa->spa_vdev_removal != NULL) { + ASSERT0(spa->spa_vdev_removal->svr_bytes_done[txg & TXG_MASK]); + } + + spa_sync_rewrite_vdev_config(spa, tx); + dmu_tx_commit(tx); + + taskq_cancel_id(system_delay_taskq, spa->spa_deadman_tqid); + spa->spa_deadman_tqid = 0; + + /* + * Clear the dirty config list. + */ + while ((vd = list_head(&spa->spa_config_dirty_list)) != NULL) + vdev_config_clean(vd); + + /* + * Now that the new config has synced transactionally, + * let it become visible to the config cache. 
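The selection loop walks the child array from a random starting index, wrapping modulo the child count, and stops after a fixed quota. A standalone sketch of just that walk (the quota of 3 stands in for SPA_SYNC_MIN_VDEVS by assumption, and the eligibility checks are omitted):

    #include <stdio.h>
    #include <stdlib.h>

    #define MIN_VDEVS   3       /* stand-in for SPA_SYNC_MIN_VDEVS */

    int
    main(void)
    {
        int children = 7, picked = 0;
        int c0 = rand() % children;             /* random starting child */

        for (int c = 0; c < children && picked < MIN_VDEVS; c++) {
            int idx = (c0 + c) % children;      /* wrap around the array */
            printf("picked child %d\n", idx);
            picked++;
        }
        return (0);
    }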
+ */ + if (spa->spa_config_syncing != NULL) { + spa_config_set(spa, spa->spa_config_syncing); + spa->spa_config_txg = txg; + spa->spa_config_syncing = NULL; + } + + dsl_pool_sync_done(dp, txg); + + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_enter(&spa->spa_alloc_locks[i]); + VERIFY0(avl_numnodes(&spa->spa_alloc_trees[i])); + mutex_exit(&spa->spa_alloc_locks[i]); + } + + /* + * Update usable space statistics. + */ + while ((vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) + != NULL) + vdev_sync_done(vd, txg); + + metaslab_class_evict_old(spa->spa_normal_class, txg); + metaslab_class_evict_old(spa->spa_log_class, txg); + + spa_sync_close_syncing_log_sm(spa); + + spa_update_dspace(spa); + + /* + * It had better be the case that we didn't dirty anything + * since vdev_config_sync(). + */ + ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); + ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); + ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); + + while (zfs_pause_spa_sync) + delay(1); + + spa->spa_sync_pass = 0; + + /* + * Update the last synced uberblock here. We want to do this at + * the end of spa_sync() so that consumers of spa_last_synced_txg() + * will be guaranteed that all the processing associated with + * that txg has been completed. + */ + spa->spa_ubsync = spa->spa_uberblock; + spa_config_exit(spa, SCL_CONFIG, FTAG); + + spa_handle_ignored_writes(spa); + + /* + * If any async tasks have been requested, kick them off. + */ + spa_async_dispatch(spa); +} + +/* + * Sync all pools. We don't want to hold the namespace lock across these + * operations, so we take a reference on the spa_t and drop the lock during the + * sync. + */ +void +spa_sync_allpools(void) +{ + spa_t *spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa_state(spa) != POOL_STATE_ACTIVE || + !spa_writeable(spa) || spa_suspended(spa)) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); +} + +/* + * ========================================================================== + * Miscellaneous routines + * ========================================================================== + */ + +/* + * Remove all pools in the system. + */ +void +spa_evict_all(void) +{ + spa_t *spa; + + /* + * Remove all cached state. All pools should be closed now, + * so every spa in the AVL tree should be unreferenced. + */ + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(NULL)) != NULL) { + /* + * Stop async tasks. The async thread may need to detach + * a device that's been replaced, which requires grabbing + * spa_namespace_lock, so we must drop it here. 
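The reference-then-drop-the-lock idiom used by spa_sync_allpools(), and again by spa_evict_all() below, can be rendered in userland pthread terms. The pool list and refcount here are simplified stand-ins (compile with -lpthread):

    #include <stdio.h>
    #include <pthread.h>

    static pthread_mutex_t namespace_lock = PTHREAD_MUTEX_INITIALIZER;

    struct pool { const char *name; int refs; };

    static void
    sync_all(struct pool *pools, int n)
    {
        pthread_mutex_lock(&namespace_lock);
        for (int i = 0; i < n; i++) {
            pools[i].refs++;                        /* pin the pool */
            pthread_mutex_unlock(&namespace_lock);  /* long work unlocked */
            printf("syncing %s\n", pools[i].name);
            pthread_mutex_lock(&namespace_lock);
            pools[i].refs--;                        /* unpin under lock */
        }
        pthread_mutex_unlock(&namespace_lock);
    }

    int
    main(void)
    {
        struct pool p[] = { { "tank", 0 }, { "dozer", 0 } };
        sync_all(p, 2);
        return (0);
    }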
+ */ + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + spa_async_suspend(spa); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + + if (spa->spa_state != POOL_STATE_UNINITIALIZED) { + spa_unload(spa); + spa_deactivate(spa); + } + spa_remove(spa); + } + mutex_exit(&spa_namespace_lock); +} + +vdev_t * +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) +{ + vdev_t *vd; + int i; + + if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) + return (vd); + + if (aux) { + for (i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd = spa->spa_l2cache.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } + } + + return (NULL); +} + +void +spa_upgrade(spa_t *spa, uint64_t version) +{ + ASSERT(spa_writeable(spa)); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * This should only be called for a non-faulted pool, and since a + * future version would result in an unopenable pool, this shouldn't be + * possible. + */ + ASSERT(SPA_VERSION_IS_SUPPORTED(spa->spa_uberblock.ub_version)); + ASSERT3U(version, >=, spa->spa_uberblock.ub_version); + + spa->spa_uberblock.ub_version = version; + vdev_config_dirty(spa->spa_root_vdev); + + spa_config_exit(spa, SCL_ALL, FTAG); + + txg_wait_synced(spa_get_dsl(spa), 0); +} + +boolean_t +spa_has_spare(spa_t *spa, uint64_t guid) +{ + int i; + uint64_t spareguid; + spa_aux_vdev_t *sav = &spa->spa_spares; + + for (i = 0; i < sav->sav_count; i++) + if (sav->sav_vdevs[i]->vdev_guid == guid) + return (B_TRUE); + + for (i = 0; i < sav->sav_npending; i++) { + if (nvlist_lookup_uint64(sav->sav_pending[i], ZPOOL_CONFIG_GUID, + &spareguid) == 0 && spareguid == guid) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * Check if a pool has an active shared spare device. + * Note: reference count of an active spare is 2, as a spare and as a replace + */ +static boolean_t +spa_has_active_shared_spare(spa_t *spa) +{ + int i, refcnt; + uint64_t pool; + spa_aux_vdev_t *sav = &spa->spa_spares; + + for (i = 0; i < sav->sav_count; i++) { + if (spa_spare_exists(sav->sav_vdevs[i]->vdev_guid, &pool, + &refcnt) && pool != 0ULL && pool == spa_guid(spa) && + refcnt > 2) + return (B_TRUE); + } + + return (B_FALSE); +} + +uint64_t +spa_total_metaslabs(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + uint64_t m = 0; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + if (!vdev_is_concrete(vd)) + continue; + m += vd->vdev_ms_count; + } + return (m); +} + +/* + * Notify any waiting threads that some activity has switched from being in- + * progress to not-in-progress so that the thread can wake up and determine + * whether it is finished waiting. + */ +void +spa_notify_waiters(spa_t *spa) +{ + /* + * Acquiring spa_activities_lock here prevents the cv_broadcast from + * happening between the waiting thread's check and cv_wait. + */ + mutex_enter(&spa->spa_activities_lock); + cv_broadcast(&spa->spa_activities_cv); + mutex_exit(&spa->spa_activities_lock); +} + +/* + * Notify any waiting threads that the pool is exporting, and then block until + * they are finished using the spa_t. 
+ */ +void +spa_wake_waiters(spa_t *spa) +{ + mutex_enter(&spa->spa_activities_lock); + spa->spa_waiters_cancel = B_TRUE; + cv_broadcast(&spa->spa_activities_cv); + while (spa->spa_waiters != 0) + cv_wait(&spa->spa_waiters_cv, &spa->spa_activities_lock); + spa->spa_waiters_cancel = B_FALSE; + mutex_exit(&spa->spa_activities_lock); +} + +/* Whether the vdev or any of its descendants are being initialized/trimmed. */ +static boolean_t +spa_vdev_activity_in_progress_impl(vdev_t *vd, zpool_wait_activity_t activity) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER)); + ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); + ASSERT(activity == ZPOOL_WAIT_INITIALIZE || + activity == ZPOOL_WAIT_TRIM); + + kmutex_t *lock = activity == ZPOOL_WAIT_INITIALIZE ? + &vd->vdev_initialize_lock : &vd->vdev_trim_lock; + + mutex_exit(&spa->spa_activities_lock); + mutex_enter(lock); + mutex_enter(&spa->spa_activities_lock); + + boolean_t in_progress = (activity == ZPOOL_WAIT_INITIALIZE) ? + (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) : + (vd->vdev_trim_state == VDEV_TRIM_ACTIVE); + mutex_exit(lock); + + if (in_progress) + return (B_TRUE); + + for (int i = 0; i < vd->vdev_children; i++) { + if (spa_vdev_activity_in_progress_impl(vd->vdev_child[i], + activity)) + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * If use_guid is true, this checks whether the vdev specified by guid is + * being initialized/trimmed. Otherwise, it checks whether any vdev in the pool + * is being initialized/trimmed. The caller must hold the config lock and + * spa_activities_lock. + */ +static int +spa_vdev_activity_in_progress(spa_t *spa, boolean_t use_guid, uint64_t guid, + zpool_wait_activity_t activity, boolean_t *in_progress) +{ + mutex_exit(&spa->spa_activities_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + mutex_enter(&spa->spa_activities_lock); + + vdev_t *vd; + if (use_guid) { + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd == NULL || !vd->vdev_ops->vdev_op_leaf) { + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (EINVAL); + } + } else { + vd = spa->spa_root_vdev; + } + + *in_progress = spa_vdev_activity_in_progress_impl(vd, activity); + + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + return (0); +} + +/* + * Locking for waiting threads + * --------------------------- + * + * Waiting threads need a way to check whether a given activity is in progress, + * and then, if it is, wait for it to complete. Each activity will have some + * in-memory representation of the relevant on-disk state which can be used to + * determine whether or not the activity is in progress. The in-memory state and + * the locking used to protect it will be different for each activity, and may + * not be suitable for use with a cvar (e.g., some state is protected by the + * config lock). To allow waiting threads to wait without any races, another + * lock, spa_activities_lock, is used. + * + * When the state is checked, both the activity-specific lock (if there is one) + * and spa_activities_lock are held. In some cases, the activity-specific lock + * is acquired explicitly (e.g. the config lock). In others, the locking is + * internal to some check (e.g. bpobj_is_empty). After checking, the waiting + * thread releases the activity-specific lock and, if the activity is in + * progress, then cv_waits using spa_activities_lock. 
+ * + * The waiting thread is woken when another thread, one completing some + * activity, updates the state of the activity and then calls + * spa_notify_waiters, which will cv_broadcast. This 'completing' thread only + * needs to hold its activity-specific lock when updating the state, and this + * lock can (but doesn't have to) be dropped before calling spa_notify_waiters. + * + * Because spa_notify_waiters acquires spa_activities_lock before broadcasting, + * and because it is held when the waiting thread checks the state of the + * activity, it can never be the case that the completing thread both updates + * the activity state and cv_broadcasts in between the waiting thread's check + * and cv_wait. Thus, a waiting thread can never miss a wakeup. + * + * In order to prevent deadlock, when the waiting thread does its check, in some + * cases it will temporarily drop spa_activities_lock in order to acquire the + * activity-specific lock. The order in which spa_activities_lock and the + * activity specific lock are acquired in the waiting thread is determined by + * the order in which they are acquired in the completing thread; if the + * completing thread calls spa_notify_waiters with the activity-specific lock + * held, then the waiting thread must also acquire the activity-specific lock + * first. + */ + +static int +spa_activity_in_progress(spa_t *spa, zpool_wait_activity_t activity, + boolean_t use_tag, uint64_t tag, boolean_t *in_progress) +{ + int error = 0; + + ASSERT(MUTEX_HELD(&spa->spa_activities_lock)); + + switch (activity) { + case ZPOOL_WAIT_CKPT_DISCARD: + *in_progress = + (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT) && + zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT) == + ENOENT); + break; + case ZPOOL_WAIT_FREE: + *in_progress = ((spa_version(spa) >= SPA_VERSION_DEADLISTS && + !bpobj_is_empty(&spa->spa_dsl_pool->dp_free_bpobj)) || + spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY) || + spa_livelist_delete_check(spa)); + break; + case ZPOOL_WAIT_INITIALIZE: + case ZPOOL_WAIT_TRIM: + error = spa_vdev_activity_in_progress(spa, use_tag, tag, + activity, in_progress); + break; + case ZPOOL_WAIT_REPLACE: + mutex_exit(&spa->spa_activities_lock); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + mutex_enter(&spa->spa_activities_lock); + + *in_progress = vdev_replace_in_progress(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + break; + case ZPOOL_WAIT_REMOVE: + *in_progress = (spa->spa_removing_phys.sr_state == + DSS_SCANNING); + break; + case ZPOOL_WAIT_RESILVER: + if ((*in_progress = vdev_rebuild_active(spa->spa_root_vdev))) + break; + /* fall through */ + case ZPOOL_WAIT_SCRUB: + { + boolean_t scanning, paused, is_scrub; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + + is_scrub = (scn->scn_phys.scn_func == POOL_SCAN_SCRUB); + scanning = (scn->scn_phys.scn_state == DSS_SCANNING); + paused = dsl_scan_is_paused_scrub(scn); + *in_progress = (scanning && !paused && + is_scrub == (activity == ZPOOL_WAIT_SCRUB)); + break; + } + default: + panic("unrecognized value for activity %d", activity); + } + + return (error); +} + +static int +spa_wait_common(const char *pool, zpool_wait_activity_t activity, + boolean_t use_tag, uint64_t tag, boolean_t *waited) +{ + /* + * The tag is used to distinguish between instances of an activity. + * 'initialize' and 'trim' are the only activities that we use this for. 
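A userland pthread rendering of the protocol just described: the waiter checks state and sleeps under the same lock the completing thread takes before broadcasting, so no wakeup can slip between the check and the wait. This is a sketch, not the kernel code (compile with -lpthread):

    #include <pthread.h>
    #include <stdbool.h>

    static pthread_mutex_t activities_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t activities_cv = PTHREAD_COND_INITIALIZER;
    static bool activity_in_progress = true;

    static void
    wait_for_activity(void)
    {
        pthread_mutex_lock(&activities_lock);
        while (activity_in_progress)    /* check under the lock */
            pthread_cond_wait(&activities_cv, &activities_lock);
        pthread_mutex_unlock(&activities_lock);
    }

    static void *
    completer(void *arg)
    {
        (void) arg;
        pthread_mutex_lock(&activities_lock);   /* lock before broadcast */
        activity_in_progress = false;
        pthread_cond_broadcast(&activities_cv);
        pthread_mutex_unlock(&activities_lock);
        return (NULL);
    }

    int
    main(void)
    {
        pthread_t t;
        pthread_create(&t, NULL, completer, NULL);
        wait_for_activity();
        pthread_join(t, NULL);
        return (0);
    }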
+ * The other activities can only have a single instance in progress in a + * pool at one time, making the tag unnecessary. + * + * There can be multiple devices being replaced at once, but since they + * all finish once resilvering finishes, we don't bother keeping track + * of them individually, we just wait for them all to finish. + */ + if (use_tag && activity != ZPOOL_WAIT_INITIALIZE && + activity != ZPOOL_WAIT_TRIM) + return (EINVAL); + + if (activity < 0 || activity >= ZPOOL_WAIT_NUM_ACTIVITIES) + return (EINVAL); + + spa_t *spa; + int error = spa_open(pool, &spa, FTAG); + if (error != 0) + return (error); + + /* + * Increment the spa's waiter count so that we can call spa_close and + * still ensure that the spa_t doesn't get freed before this thread is + * finished with it when the pool is exported. We want to call spa_close + * before we start waiting because otherwise the additional ref would + * prevent the pool from being exported or destroyed throughout the + * potentially long wait. + */ + mutex_enter(&spa->spa_activities_lock); + spa->spa_waiters++; + spa_close(spa, FTAG); + + *waited = B_FALSE; + for (;;) { + boolean_t in_progress; + error = spa_activity_in_progress(spa, activity, use_tag, tag, + &in_progress); + + if (error || !in_progress || spa->spa_waiters_cancel) + break; + + *waited = B_TRUE; + + if (cv_wait_sig(&spa->spa_activities_cv, + &spa->spa_activities_lock) == 0) { + error = EINTR; + break; + } + } + + spa->spa_waiters--; + cv_signal(&spa->spa_waiters_cv); + mutex_exit(&spa->spa_activities_lock); + + return (error); +} + +/* + * Wait for a particular instance of the specified activity to complete, where + * the instance is identified by 'tag'. + */ +int +spa_wait_tag(const char *pool, zpool_wait_activity_t activity, uint64_t tag, + boolean_t *waited) +{ + return (spa_wait_common(pool, activity, B_TRUE, tag, waited)); +} + +/* + * Wait for all instances of the specified activity to complete. + */ +int +spa_wait(const char *pool, zpool_wait_activity_t activity, boolean_t *waited) +{ + return (spa_wait_common(pool, activity, B_FALSE, 0, waited)); +} + +sysevent_t * +spa_event_create(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) +{ + sysevent_t *ev = NULL; +#ifdef _KERNEL + nvlist_t *resource; + + resource = zfs_event_create(spa, vd, FM_SYSEVENT_CLASS, name, hist_nvl); + if (resource) { + ev = kmem_alloc(sizeof (sysevent_t), KM_SLEEP); + ev->resource = resource; + } +#endif + return (ev); +} + +void +spa_event_post(sysevent_t *ev) +{ +#ifdef _KERNEL + if (ev) { + zfs_zevent_post(ev->resource, NULL, zfs_zevent_post_cb); + kmem_free(ev, sizeof (*ev)); + } +#endif +} + +/* + * Post a zevent corresponding to the given sysevent. The 'name' must be one + * of the event definitions in sys/sysevent/eventdefs.h. The payload will be + * filled in from the spa and (optionally) the vdev. This doesn't do anything + * in the userland libzpool, as we don't want consumers to misinterpret ztest + * or zdb as real changes.
+ */ +void +spa_event_notify(spa_t *spa, vdev_t *vd, nvlist_t *hist_nvl, const char *name) +{ + spa_event_post(spa_event_create(spa, vd, hist_nvl, name)); +} + +/* state manipulation functions */ +EXPORT_SYMBOL(spa_open); +EXPORT_SYMBOL(spa_open_rewind); +EXPORT_SYMBOL(spa_get_stats); +EXPORT_SYMBOL(spa_create); +EXPORT_SYMBOL(spa_import); +EXPORT_SYMBOL(spa_tryimport); +EXPORT_SYMBOL(spa_destroy); +EXPORT_SYMBOL(spa_export); +EXPORT_SYMBOL(spa_reset); +EXPORT_SYMBOL(spa_async_request); +EXPORT_SYMBOL(spa_async_suspend); +EXPORT_SYMBOL(spa_async_resume); +EXPORT_SYMBOL(spa_inject_addref); +EXPORT_SYMBOL(spa_inject_delref); +EXPORT_SYMBOL(spa_scan_stat_init); +EXPORT_SYMBOL(spa_scan_get_stats); + +/* device manipulation */ +EXPORT_SYMBOL(spa_vdev_add); +EXPORT_SYMBOL(spa_vdev_attach); +EXPORT_SYMBOL(spa_vdev_detach); +EXPORT_SYMBOL(spa_vdev_setpath); +EXPORT_SYMBOL(spa_vdev_setfru); +EXPORT_SYMBOL(spa_vdev_split_mirror); + +/* spare state (which is global across all pools) */ +EXPORT_SYMBOL(spa_spare_add); +EXPORT_SYMBOL(spa_spare_remove); +EXPORT_SYMBOL(spa_spare_exists); +EXPORT_SYMBOL(spa_spare_activate); + +/* L2ARC state (which is global across all pools) */ +EXPORT_SYMBOL(spa_l2cache_add); +EXPORT_SYMBOL(spa_l2cache_remove); +EXPORT_SYMBOL(spa_l2cache_exists); +EXPORT_SYMBOL(spa_l2cache_activate); +EXPORT_SYMBOL(spa_l2cache_drop); + +/* scanning */ +EXPORT_SYMBOL(spa_scan); +EXPORT_SYMBOL(spa_scan_stop); + +/* spa syncing */ +EXPORT_SYMBOL(spa_sync); /* only for DMU use */ +EXPORT_SYMBOL(spa_sync_allpools); + +/* properties */ +EXPORT_SYMBOL(spa_prop_set); +EXPORT_SYMBOL(spa_prop_get); +EXPORT_SYMBOL(spa_prop_clear_bootfs); + +/* asynchronous event notification */ +EXPORT_SYMBOL(spa_event_notify); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_shift, INT, ZMOD_RW, + "log2(fraction of arc that can be used by inflight I/Os when " + "verifying pool during import)"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_metadata, INT, ZMOD_RW, + "Set to traverse metadata on pool import"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, load_verify_data, INT, ZMOD_RW, + "Set to traverse data on pool import"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, load_print_vdev_tree, INT, ZMOD_RW, + "Print vdev tree to zfs_dbgmsg during pool import"); + +ZFS_MODULE_PARAM(zfs_zio, zio_, taskq_batch_pct, UINT, ZMOD_RD, + "Percentage of CPUs to run an IO worker thread"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_missing_tvds, ULONG, ZMOD_RW, + "Allow importing pool with up to this number of missing top-level " + "vdevs (in read-only mode)"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_pause, INT, ZMOD_RW, + "Set the livelist condense zthr to pause"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_pause, INT, ZMOD_RW, + "Set the livelist condense synctask to pause"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, sync_cancel, INT, ZMOD_RW, + "Whether livelist condensing was canceled in the synctask"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, zthr_cancel, INT, ZMOD_RW, + "Whether livelist condensing was canceled in the zthr function"); + +ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, + "Whether extra ALLOC blkptrs were added to a livelist entry while it " + "was being condensed"); +/* END CSTYLED */ diff --git a/module/zfs/spa_boot.c b/module/zfs/spa_boot.c new file mode 100644 index 000000000000..674394650f82 --- /dev/null +++ b/module/zfs/spa_boot.c @@ -0,0 +1,50 @@ +/* + * CDDL HEADER
START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifdef _KERNEL + +#include +#include +#include + +char * +spa_get_bootprop(char *propname) +{ + char *value; + + if (ddi_prop_lookup_string(DDI_DEV_T_ANY, ddi_root_node(), + DDI_PROP_DONTPASS, propname, &value) != DDI_SUCCESS) + return (NULL); + return (value); +} + +void +spa_free_bootprop(char *value) +{ + ddi_prop_free(value); +} + +#endif /* _KERNEL */ diff --git a/module/zfs/spa_checkpoint.c b/module/zfs/spa_checkpoint.c new file mode 100644 index 000000000000..5fb614467273 --- /dev/null +++ b/module/zfs/spa_checkpoint.c @@ -0,0 +1,636 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +/* + * Storage Pool Checkpoint + * + * A storage pool checkpoint can be thought of as a pool-wide snapshot or + * a stable version of extreme rewind that guarantees no blocks from the + * checkpointed state will have been overwritten. It remembers the entire + * state of the storage pool (e.g. snapshots, dataset names, etc..) from the + * point that it was taken and the user can rewind back to that point even if + * they applied destructive operations on their datasets or even enabled new + * zpool on-disk features. If a pool has a checkpoint that is no longer + * needed, the user can discard it. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. The feature + * flag is set to active when we create the checkpoint and remains active + * until the checkpoint is fully discarded. The entry in the MOS config + * (DMU_POOL_ZPOOL_CHECKPOINT) is populated with the uberblock that + * references the state of the pool when we take the checkpoint. The entry + * remains populated until we start discarding the checkpoint or we rewind + * back to it. 
+ * + * - Each vdev contains a vdev-wide space map while the pool has a checkpoint, + * which persists until the checkpoint is fully discarded. The space map + * contains entries that have been freed in the current state of the pool + * but we want to keep around in case we decide to rewind to the checkpoint. + * [see vdev_checkpoint_sm] + * + * - Each metaslab's ms_sm space map behaves the same as without the + * checkpoint, with the only exception being the scenario when we free + * blocks that belong to the checkpoint. In this case, these blocks remain + * ALLOCATED in the metaslab's space map and they are added as FREE in the + * vdev's checkpoint space map. + * + * - Each uberblock has a field (ub_checkpoint_txg) which holds the txg that + * the uberblock was checkpointed. For normal uberblocks this field is 0. + * + * == Overview of operations == + * + * - To create a checkpoint, we first wait for the current TXG to be synced, + * so we can use the most recently synced uberblock (spa_ubsync) as the + * checkpointed uberblock. Then we use an early synctask to place that + * uberblock in MOS config, increment the feature flag for the checkpoint + * (marking it active), and setting spa_checkpoint_txg (see its use below) + * to the TXG of the checkpointed uberblock. We use an early synctask for + * the aforementioned operations to ensure that no blocks were dirtied + * between the current TXG and the TXG of the checkpointed uberblock + * (e.g. the previous txg). + * + * - When a checkpoint exists, we need to ensure that the blocks that + * belong to the checkpoint are freed but never reused. This means that + * these blocks should never end up in the ms_allocatable or the ms_freeing + * trees of a metaslab. Therefore, whenever there is a checkpoint the new + * ms_checkpointing tree is used in addition to the aforementioned ones. + * + * Whenever a block is freed and we find out that it is referenced by the + * checkpoint (we find out by comparing its birth to spa_checkpoint_txg), + * we place it in the ms_checkpointing tree instead of the ms_freeing tree. + * This way, we divide the blocks that are being freed into checkpointed + * and not-checkpointed blocks. + * + * In order to persist these frees, we write the extents from the + * ms_freeing tree to the ms_sm as usual, and the extents from the + * ms_checkpointing tree to the vdev_checkpoint_sm. This way, these + * checkpointed extents will remain allocated in the metaslab's ms_sm space + * map, and therefore won't be reused [see metaslab_sync()]. In addition, + * when we discard the checkpoint, we can find the entries that have + * actually been freed in vdev_checkpoint_sm. + * [see spa_checkpoint_discard_thread_sync()] + * + * - To discard the checkpoint we use an early synctask to delete the + * checkpointed uberblock from the MOS config, set spa_checkpoint_txg to 0, + * and wakeup the discarding zthr thread (an open-context async thread). + * We use an early synctask to ensure that the operation happens before any + * new data end up in the checkpoint's data structures. + * + * Once the synctask is done and the discarding zthr is awake, we discard + * the checkpointed data over multiple TXGs by having the zthr prefetch + * entries from vdev_checkpoint_sm and then starting a synctask that places + * them as free blocks into their respective ms_allocatable and ms_sm + * structures.
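The free-path split described above hinges on comparing a block's birth txg against spa_checkpoint_txg. A minimal sketch of that classification, with simplified types and illustrative txg numbers:

    #include <stdio.h>
    #include <stdint.h>

    static const char *
    classify_free(uint64_t blk_birth, uint64_t checkpoint_txg)
    {
        if (checkpoint_txg != 0 && blk_birth <= checkpoint_txg)
            return ("ms_checkpointing");    /* still referenced by ckpt */
        return ("ms_freeing");              /* ordinary free */
    }

    int
    main(void)
    {
        printf("%s\n", classify_free(90, 100));     /* checkpointed */
        printf("%s\n", classify_free(150, 100));    /* ordinary free */
        printf("%s\n", classify_free(150, 0));      /* no checkpoint */
        return (0);
    }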
+ * [see spa_checkpoint_discard_thread()] + * + * When there are no entries left in the vdev_checkpoint_sm of all + * top-level vdevs, a final synctask runs that decrements the feature flag. + * + * - To rewind to the checkpoint, we first use the current uberblock and + * open the MOS so we can access the checkpointed uberblock from the MOS + * config. After we retrieve the checkpointed uberblock, we use it as the + * current uberblock for the pool by writing it to disk with an updated + * TXG, opening its version of the MOS, and moving on as usual from there. + * [see spa_ld_checkpoint_rewind()] + * + * An important note on rewinding to the checkpoint has to do with how we + * handle ZIL blocks. In the scenario of a rewind, we clear out any ZIL + * blocks that have not been claimed by the time we took the checkpoint + * as they should no longer be valid. + * [see comment in zil_claim()] + * + * == Miscellaneous information == + * + * - In the hypothetical event that we take a checkpoint, remove a vdev, + * and attempt to rewind, the rewind would fail as the checkpointed + * uberblock would reference data in the removed device. For this reason + * and others of similar nature, we disallow the following operations that + * can change the config: + * vdev removal and attach/detach, mirror splitting, and pool reguid. + * + * - As most of the checkpoint logic is implemented in the SPA and doesn't + * distinguish datasets when it comes to space accounting, having a + * checkpoint can potentially break the boundaries set by dataset + * reservations. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The following parameter limits the amount of memory to be used for the + * prefetching of the checkpoint space map done on each vdev while + * discarding the checkpoint. + * + * The reason it exists is because top-level vdevs with long checkpoint + * space maps can potentially take up a lot of memory depending on the + * amount of checkpointed data that has been freed within them while + * the pool had a checkpoint. 
+ */ +unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024; + +int +spa_checkpoint_get_stats(spa_t *spa, pool_checkpoint_stat_t *pcs) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + bzero(pcs, sizeof (pool_checkpoint_stat_t)); + + int error = zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT); + ASSERT(error == 0 || error == ENOENT); + + if (error == ENOENT) + pcs->pcs_state = CS_CHECKPOINT_DISCARDING; + else + pcs->pcs_state = CS_CHECKPOINT_EXISTS; + + pcs->pcs_space = spa->spa_checkpoint_info.sci_dspace; + pcs->pcs_start_time = spa->spa_checkpoint_info.sci_timestamp; + + return (0); +} + +static void +spa_checkpoint_discard_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = arg; + + spa->spa_checkpoint_info.sci_timestamp = 0; + + spa_feature_decr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + spa_notify_waiters(spa); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "finished discarding checkpointed state from the pool"); +} + +typedef struct spa_checkpoint_discard_sync_callback_arg { + vdev_t *sdc_vd; + uint64_t sdc_txg; + uint64_t sdc_entry_limit; +} spa_checkpoint_discard_sync_callback_arg_t; + +static int +spa_checkpoint_discard_sync_callback(space_map_entry_t *sme, void *arg) +{ + spa_checkpoint_discard_sync_callback_arg_t *sdc = arg; + vdev_t *vd = sdc->sdc_vd; + metaslab_t *ms = vd->vdev_ms[sme->sme_offset >> vd->vdev_ms_shift]; + uint64_t end = sme->sme_offset + sme->sme_run; + + if (sdc->sdc_entry_limit == 0) + return (SET_ERROR(EINTR)); + + /* + * Since the space map is not condensed, we know that + * none of its entries is crossing the boundaries of + * its respective metaslab. + * + * That said, there is no fundamental requirement that + * the checkpoint's space map entries should not cross + * metaslab boundaries. So if needed we could add code + * that handles metaslab-crossing segments in the future. + */ + VERIFY3U(sme->sme_type, ==, SM_FREE); + VERIFY3U(sme->sme_offset, >=, ms->ms_start); + VERIFY3U(end, <=, ms->ms_start + ms->ms_size); + + /* + * At this point we should not be processing any + * other frees concurrently, so the lock is technically + * unnecessary. We use the lock anyway though to + * potentially save ourselves from future headaches. 
+ */ + mutex_enter(&ms->ms_lock); + if (range_tree_is_empty(ms->ms_freeing)) + vdev_dirty(vd, VDD_METASLAB, ms, sdc->sdc_txg); + range_tree_add(ms->ms_freeing, sme->sme_offset, sme->sme_run); + mutex_exit(&ms->ms_lock); + + ASSERT3U(vd->vdev_spa->spa_checkpoint_info.sci_dspace, >=, + sme->sme_run); + ASSERT3U(vd->vdev_stat.vs_checkpoint_space, >=, sme->sme_run); + + vd->vdev_spa->spa_checkpoint_info.sci_dspace -= sme->sme_run; + vd->vdev_stat.vs_checkpoint_space -= sme->sme_run; + sdc->sdc_entry_limit--; + + return (0); +} + +#ifdef ZFS_DEBUG +static void +spa_checkpoint_accounting_verify(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t ckpoint_sm_space_sum = 0; + uint64_t vs_ckpoint_space_sum = 0; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + if (vd->vdev_checkpoint_sm != NULL) { + ckpoint_sm_space_sum += + -space_map_allocated(vd->vdev_checkpoint_sm); + vs_ckpoint_space_sum += + vd->vdev_stat.vs_checkpoint_space; + ASSERT3U(ckpoint_sm_space_sum, ==, + vs_ckpoint_space_sum); + } else { + ASSERT0(vd->vdev_stat.vs_checkpoint_space); + } + } + ASSERT3U(spa->spa_checkpoint_info.sci_dspace, ==, ckpoint_sm_space_sum); +} +#endif + +static void +spa_checkpoint_discard_thread_sync(void *arg, dmu_tx_t *tx) +{ + vdev_t *vd = arg; + int error; + + /* + * The space map callback is applied only to non-debug entries. + * Because the number of debug entries is less or equal to the + * number of non-debug entries, we want to ensure that we only + * read what we prefetched from open-context. + * + * Thus, we set the maximum entries that the space map callback + * will be applied to be half the entries that could fit in the + * imposed memory limit. + * + * Note that since this is a conservative estimate we also + * assume the worst case scenario in our computation where each + * entry is two-word. + */ + uint64_t max_entry_limit = + (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1; + + /* + * Iterate from the end of the space map towards the beginning, + * placing its entries on ms_freeing and removing them from the + * space map. The iteration stops if one of the following + * conditions is true: + * + * 1] We reached the beginning of the space map. At this point + * the space map should be completely empty and + * space_map_incremental_destroy should have returned 0. + * The next step would be to free and close the space map + * and remove its entry from its vdev's top zap. This allows + * spa_checkpoint_discard_thread() to move on to the next vdev. + * + * 2] We reached the memory limit (amount of memory used to hold + * space map entries in memory) and space_map_incremental_destroy + * returned EINTR. This means that there are entries remaining + * in the space map that will be cleared in a future invocation + * of this function by spa_checkpoint_discard_thread(). 
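Plugging the default 16 MiB limit into the max_entry_limit computation above: a worst-case two-word entry is 16 bytes, and the result is halved to cover debug entries, giving 524288 entries per synctask invocation. As a checkable sketch:

    #include <stdio.h>
    #include <stdint.h>

    int
    main(void)
    {
        unsigned long zfs_spa_discard_memory_limit = 16 * 1024 * 1024;
        uint64_t max_entry_limit =
            (zfs_spa_discard_memory_limit / (2 * sizeof (uint64_t))) >> 1;

        /* 16777216 / 16 = 1048576; halved = 524288 */
        printf("max_entry_limit=%llu\n",
            (unsigned long long)max_entry_limit);
        return (0);
    }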
+ */ + spa_checkpoint_discard_sync_callback_arg_t sdc; + sdc.sdc_vd = vd; + sdc.sdc_txg = tx->tx_txg; + sdc.sdc_entry_limit = max_entry_limit; + + uint64_t words_before = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + + error = space_map_incremental_destroy(vd->vdev_checkpoint_sm, + spa_checkpoint_discard_sync_callback, &sdc, tx); + + uint64_t words_after = + space_map_length(vd->vdev_checkpoint_sm) / sizeof (uint64_t); + +#ifdef ZFS_DEBUG + spa_checkpoint_accounting_verify(vd->vdev_spa); +#endif + + zfs_dbgmsg("discarding checkpoint: txg %llu, vdev id %d, " + "deleted %llu words - %llu words are left", + tx->tx_txg, vd->vdev_id, (words_before - words_after), + words_after); + + if (error != EINTR) { + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while incrementally destroying the checkpoint " + "space map of vdev %llu\n", + error, vd->vdev_id); + } + ASSERT0(words_after); + ASSERT0(space_map_allocated(vd->vdev_checkpoint_sm)); + ASSERT0(space_map_length(vd->vdev_checkpoint_sm)); + + space_map_free(vd->vdev_checkpoint_sm, tx); + space_map_close(vd->vdev_checkpoint_sm); + vd->vdev_checkpoint_sm = NULL; + + VERIFY0(zap_remove(spa_meta_objset(vd->vdev_spa), + vd->vdev_top_zap, VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, tx)); + } +} + +static boolean_t +spa_checkpoint_discard_is_done(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(!spa_has_checkpoint(spa)); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)); + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (rvd->vdev_child[c]->vdev_checkpoint_sm != NULL) + return (B_FALSE); + ASSERT0(rvd->vdev_child[c]->vdev_stat.vs_checkpoint_space); + } + + return (B_TRUE); +} + +/* ARGSUSED */ +boolean_t +spa_checkpoint_discard_thread_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (B_FALSE); + + if (spa_has_checkpoint(spa)) + return (B_FALSE); + + return (B_TRUE); +} + +void +spa_checkpoint_discard_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_t *rvd = spa->spa_root_vdev; + + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + while (vd->vdev_checkpoint_sm != NULL) { + space_map_t *checkpoint_sm = vd->vdev_checkpoint_sm; + int numbufs; + dmu_buf_t **dbp; + + if (zthr_iscancelled(zthr)) + return; + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + + uint64_t size = MIN(space_map_length(checkpoint_sm), + zfs_spa_discard_memory_limit); + uint64_t offset = + space_map_length(checkpoint_sm) - size; + + /* + * Ensure that the part of the space map that will + * be destroyed by the synctask, is prefetched in + * memory before the synctask runs. 
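The prefetch window is always the tail of the space map, since incremental destruction consumes entries from the end. A small sketch of the window arithmetic with made-up sizes:

    #include <stdio.h>
    #include <stdint.h>

    #define MIN(a, b)   ((a) < (b) ? (a) : (b))

    int
    main(void)
    {
        uint64_t sm_length = 40 * 1024 * 1024;      /* pretend 40 MiB map */
        uint64_t memory_limit = 16 * 1024 * 1024;   /* default limit */

        uint64_t size = MIN(sm_length, memory_limit);
        uint64_t offset = sm_length - size;         /* tail window start */

        printf("prefetch [%llu, %llu)\n", (unsigned long long)offset,
            (unsigned long long)(offset + size));
        return (0);
    }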
+ */ + int error = dmu_buf_hold_array_by_bonus( + checkpoint_sm->sm_dbuf, offset, size, + B_TRUE, FTAG, &numbufs, &dbp); + if (error != 0) { + zfs_panic_recover("zfs: error %d was returned " + "while prefetching checkpoint space map " + "entries of vdev %llu\n", + error, vd->vdev_id); + } + + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_thread_sync, vd, + 0, ZFS_SPACE_CHECK_NONE)); + + dmu_buf_rele_array(dbp, numbufs, FTAG); + } + } + + VERIFY(spa_checkpoint_discard_is_done(spa)); + VERIFY0(spa->spa_checkpoint_info.sci_dspace); + VERIFY0(dsl_sync_task(spa->spa_name, NULL, + spa_checkpoint_discard_complete_sync, spa, + 0, ZFS_SPACE_CHECK_NONE)); +} + + +/* ARGSUSED */ +static int +spa_checkpoint_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ENOTSUP)); + + if (!spa_top_vdevs_spacemap_addressable(spa)) + return (SET_ERROR(ZFS_ERR_VDEV_TOO_BIG)); + + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) + return (SET_ERROR(ZFS_ERR_DEVRM_IN_PROGRESS)); + + if (spa->spa_checkpoint_txg != 0) + return (SET_ERROR(ZFS_ERR_CHECKPOINT_EXISTS)); + + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_sync(void *arg, dmu_tx_t *tx) +{ + dsl_pool_t *dp = dmu_tx_pool(tx); + spa_t *spa = dp->dp_spa; + uberblock_t checkpoint = spa->spa_ubsync; + + /* + * At this point, there should not be a checkpoint in the MOS. + */ + ASSERT3U(zap_contains(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT), ==, ENOENT); + + ASSERT0(spa->spa_checkpoint_info.sci_timestamp); + ASSERT0(spa->spa_checkpoint_info.sci_dspace); + + /* + * Since the checkpointed uberblock is the one that just got synced + * (we use spa_ubsync), its txg must be equal to the txg number of + * the txg we are syncing, minus 1. + */ + ASSERT3U(checkpoint.ub_txg, ==, spa->spa_syncing_txg - 1); + + /* + * Once the checkpoint is in place, we need to ensure that none of + * its blocks will be marked for reuse after it has been freed. + * When there is a checkpoint and a block is freed, we compare its + * birth txg to the txg of the checkpointed uberblock to see if the + * block is part of the checkpoint or not. Therefore, we have to set + * spa_checkpoint_txg before any frees happen in this txg (which is + * why this is done as an early_synctask as explained in the comment + * in spa_checkpoint()). + */ + spa->spa_checkpoint_txg = checkpoint.ub_txg; + spa->spa_checkpoint_info.sci_timestamp = checkpoint.ub_timestamp; + + checkpoint.ub_checkpoint_txg = checkpoint.ub_txg; + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT, + sizeof (uint64_t), sizeof (uberblock_t) / sizeof (uint64_t), + &checkpoint, tx)); + + /* + * Increment the feature refcount and thus activate the feature. + * Note that the feature will be deactivated when we've + * completely discarded all checkpointed state (both vdev + * space maps and uberblock). + */ + spa_feature_incr(spa, SPA_FEATURE_POOL_CHECKPOINT, tx); + + spa_history_log_internal(spa, "spa checkpoint", tx, + "checkpointed uberblock txg=%llu", (u_longlong_t)checkpoint.ub_txg); +} + +/* + * Create a checkpoint for the pool. 
+ */ +int +spa_checkpoint(const char *pool) +{ + int error; + spa_t *spa; + + error = spa_open(pool, &spa, FTAG); + if (error != 0) + return (error); + + mutex_enter(&spa->spa_vdev_top_lock); + + /* + * Wait for current syncing txg to finish so the latest synced + * uberblock (spa_ubsync) has all the changes that we expect + * to see if we were to revert later to the checkpoint. In other + * words we want the checkpointed uberblock to include/reference + * all the changes that were pending at the time that we issued + * the checkpoint command. + */ + txg_wait_synced(spa_get_dsl(spa), 0); + + /* + * As the checkpointed uberblock references blocks from the previous + * txg (spa_ubsync) we want to ensure that we are not freeing any of + * these blocks in the same txg that the following synctask will + * run. Thus, we run it as an early synctask, so the dirty changes + * that are synced to disk afterwards during zios and other synctasks + * do not reuse checkpointed blocks. + */ + error = dsl_early_sync_task(pool, spa_checkpoint_check, + spa_checkpoint_sync, NULL, 0, ZFS_SPACE_CHECK_NORMAL); + + mutex_exit(&spa->spa_vdev_top_lock); + + spa_close(spa, FTAG); + return (error); +} + +/* ARGSUSED */ +static int +spa_checkpoint_discard_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (!spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) + return (SET_ERROR(ZFS_ERR_NO_CHECKPOINT)); + + if (spa->spa_checkpoint_txg == 0) + return (SET_ERROR(ZFS_ERR_DISCARDING_CHECKPOINT)); + + VERIFY0(zap_contains(spa_meta_objset(spa), + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ZPOOL_CHECKPOINT)); + + return (0); +} + +/* ARGSUSED */ +static void +spa_checkpoint_discard_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + VERIFY0(zap_remove(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_ZPOOL_CHECKPOINT, tx)); + + spa->spa_checkpoint_txg = 0; + + zthr_wakeup(spa->spa_checkpoint_discard_zthr); + + spa_history_log_internal(spa, "spa discard checkpoint", tx, + "started discarding checkpointed state from the pool"); +} + +/* + * Discard the checkpoint from a pool. + */ +int +spa_checkpoint_discard(const char *pool) +{ + /* + * Similarly to spa_checkpoint(), we want our synctask to run + * before any pending dirty data are written to disk so they + * won't end up in the checkpoint's data structures (e.g. + * ms_checkpointing and vdev_checkpoint_sm) and re-create any + * space maps that the discarding open-context thread has + * deleted. + * [see spa_discard_checkpoint_sync and spa_discard_checkpoint_thread] + */ + return (dsl_early_sync_task(pool, spa_checkpoint_discard_check, + spa_checkpoint_discard_sync, NULL, 0, + ZFS_SPACE_CHECK_DISCARD_CHECKPOINT)); +} + +EXPORT_SYMBOL(spa_checkpoint_get_stats); +EXPORT_SYMBOL(spa_checkpoint_discard_thread); +EXPORT_SYMBOL(spa_checkpoint_discard_thread_check); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_spa, zfs_spa_, discard_memory_limit, ULONG, ZMOD_RW, + "Limit for memory used in prefetching the checkpoint space map done " + "on each vdev while discarding the checkpoint"); +/* END CSTYLED */ diff --git a/module/zfs/spa_config.c b/module/zfs/spa_config.c new file mode 100644 index 000000000000..cc65a00d9dec --- /dev/null +++ b/module/zfs/spa_config.c @@ -0,0 +1,620 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright 2017 Joyent, Inc. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#endif + +/* + * Pool configuration repository. + * + * Pool configuration is stored as a packed nvlist on the filesystem. By + * default, all pools are stored in /etc/zfs/zpool.cache and loaded on boot + * (when the ZFS module is loaded). Pools can also have the 'cachefile' + * property set that allows them to be stored in an alternate location under + * the control of external software. + * + * For each cache file, we have a single nvlist which holds all the + * configuration information. When the module loads, we read this information + * from /etc/zfs/zpool.cache and populate the SPA namespace. This namespace is + * maintained independently in spa.c. Whenever the namespace is modified, or + * the configuration of a pool is changed, we call spa_write_cachefile(), which + * walks through all the active pools and writes the configuration to disk. + */ + +static uint64_t spa_config_generation = 1; + +/* + * This can be overridden in userland to preserve an alternate namespace for + * userland pools when doing testing. + */ +char *spa_config_path = ZPOOL_CACHE; +int zfs_autoimport_disable = 1; + +/* + * Called when the module is first loaded, this routine loads the configuration + * file into the SPA namespace. It does not actually open or load the pools; it + * only populates the namespace. + */ +void +spa_config_load(void) +{ + void *buf = NULL; + nvlist_t *nvlist, *child; + nvpair_t *nvpair; + char *pathname; + zfs_file_t *fp; + zfs_file_attr_t zfa; + uint64_t fsize; + int err; + +#ifdef _KERNEL + if (zfs_autoimport_disable) + return; +#endif + + /* + * Open the configuration file. + */ + pathname = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + (void) snprintf(pathname, MAXPATHLEN, "%s", spa_config_path); + + err = zfs_file_open(pathname, O_RDONLY, 0, &fp); + +#ifdef __FreeBSD__ + if (err) + err = zfs_file_open(ZPOOL_CACHE_BOOT, O_RDONLY, 0, &fp); +#endif + kmem_free(pathname, MAXPATHLEN); + + if (err) + return; + + if (zfs_file_getattr(fp, &zfa)) + goto out; + + fsize = zfa.zfa_size; + buf = kmem_alloc(fsize, KM_SLEEP); + + /* + * Read the nvlist from the file. + */ + if (zfs_file_read(fp, buf, fsize, NULL) < 0) + goto out; + + /* + * Unpack the nvlist. + */ + if (nvlist_unpack(buf, fsize, &nvlist, KM_SLEEP) != 0) + goto out; + + /* + * Iterate over all elements in the nvlist, creating a new spa_t for + * each one with the specified configuration.
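The load path above is one half of an nvlist pack/unpack round trip. Assuming the userland libnvpair API (link with -lnvpair), the full cycle looks roughly like this; error handling is trimmed and buffer ownership is glossed over:

    #include <stdio.h>
    #include <stdlib.h>
    #include <libnvpair.h>

    int
    main(void)
    {
        nvlist_t *nvl, *copy;
        char *buf = NULL;
        size_t buflen = 0;

        nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0);
        nvlist_add_string(nvl, "name", "tank");

        /* pack to a contiguous buffer, as the cachefile writer does */
        nvlist_pack(nvl, &buf, &buflen, NV_ENCODE_NATIVE, 0);

        /* unpack it again, as spa_config_load() does at module load */
        nvlist_unpack(buf, buflen, &copy, 0);
        nvlist_print(stdout, copy);

        nvlist_free(nvl);
        nvlist_free(copy);
        free(buf);
        return (0);
    }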
+	 */
+	mutex_enter(&spa_namespace_lock);
+	nvpair = NULL;
+	while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) {
+		if (nvpair_type(nvpair) != DATA_TYPE_NVLIST)
+			continue;
+
+		child = fnvpair_value_nvlist(nvpair);
+
+		if (spa_lookup(nvpair_name(nvpair)) != NULL)
+			continue;
+		(void) spa_add(nvpair_name(nvpair), child, NULL);
+	}
+	mutex_exit(&spa_namespace_lock);
+
+	nvlist_free(nvlist);
+
+out:
+	if (buf != NULL)
+		kmem_free(buf, fsize);
+
+	zfs_file_close(fp);
+}
+
+static int
+spa_config_remove(spa_config_dirent_t *dp)
+{
+	int error = 0;
+
+	/*
+	 * Remove the cache file. If zfs_file_unlink() is not supported by
+	 * the platform, fall back to truncating the file, which is
+	 * functionally equivalent.
+	 */
+	error = zfs_file_unlink(dp->scd_path);
+	if (error == EOPNOTSUPP) {
+		int flags = O_RDWR | O_TRUNC;
+		zfs_file_t *fp;
+
+		error = zfs_file_open(dp->scd_path, flags, 0644, &fp);
+		if (error == 0) {
+			(void) zfs_file_fsync(fp, O_SYNC);
+			(void) zfs_file_close(fp);
+		}
+	}
+
+	return (error);
+}
+
+static int
+spa_config_write(spa_config_dirent_t *dp, nvlist_t *nvl)
+{
+	size_t buflen;
+	char *buf;
+	int oflags = O_RDWR | O_TRUNC | O_CREAT | O_LARGEFILE;
+	char *temp;
+	int err;
+	zfs_file_t *fp;
+
+	/*
+	 * If the nvlist is empty (NULL), then remove the old cachefile.
+	 */
+	if (nvl == NULL) {
+		err = spa_config_remove(dp);
+		if (err == ENOENT)
+			err = 0;
+
+		return (err);
+	}
+
+	/*
+	 * Pack the configuration into a buffer.
+	 */
+	buf = fnvlist_pack(nvl, &buflen);
+	temp = kmem_zalloc(MAXPATHLEN, KM_SLEEP);
+
+	/*
+	 * Write the configuration to disk. Due to the complexity involved
+	 * in performing a rename and remove from within the kernel, the file
+	 * is instead truncated and overwritten in place. This way we always
+	 * have a consistent view of the data or a zero length file.
+	 */
+	err = zfs_file_open(dp->scd_path, oflags, 0644, &fp);
+	if (err == 0) {
+		err = zfs_file_write(fp, buf, buflen, NULL);
+		if (err == 0)
+			err = zfs_file_fsync(fp, O_SYNC);
+
+		zfs_file_close(fp);
+		if (err)
+			(void) spa_config_remove(dp);
+	}
+	fnvlist_pack_free(buf, buflen);
+	kmem_free(temp, MAXPATHLEN);
+	return (err);
+}
+
+/*
+ * Synchronize pool configuration to disk. This must be called with the
+ * namespace lock held. Synchronizing the pool cache is typically done after
+ * the configuration has been synced to the MOS. This exposes a window where
+ * the MOS config will have been updated but the cache file has not. If
+ * the system were to crash at that instant then the cached config may not
+ * contain the correct information to open the pool and an explicit import
+ * would be required.
+ */
+void
+spa_write_cachefile(spa_t *target, boolean_t removing, boolean_t postsysevent)
+{
+	spa_config_dirent_t *dp, *tdp;
+	nvlist_t *nvl;
+	char *pool_name;
+	boolean_t ccw_failure;
+	int error = 0;
+
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	if (!(spa_mode_global & SPA_MODE_WRITE))
+		return;
+
+	/*
+	 * Iterate over all cachefiles for the pool, past or present. When the
+	 * cachefile is changed, the new one is pushed onto this list, allowing
+	 * us to update previous cachefiles that no longer contain this pool.
+	 */
+	ccw_failure = B_FALSE;
+	for (dp = list_head(&target->spa_config_list); dp != NULL;
+	    dp = list_next(&target->spa_config_list, dp)) {
+		spa_t *spa = NULL;
+		if (dp->scd_path == NULL)
+			continue;
+
+		/*
+		 * Iterate over all pools, adding any matching pools to 'nvl'.
+		 */
+		nvl = NULL;
+		while ((spa = spa_next(spa)) != NULL) {
+			/*
+			 * Skip over our own pool if we're about to remove
+			 * ourselves from the spa namespace, and skip any pool
+			 * that is readonly. Since we cannot guarantee that a
+			 * readonly pool would successfully import upon reboot,
+			 * we don't allow them to be written to the cache file.
+			 */
+			if ((spa == target && removing) ||
+			    !spa_writeable(spa))
+				continue;
+
+			mutex_enter(&spa->spa_props_lock);
+			tdp = list_head(&spa->spa_config_list);
+			if (spa->spa_config == NULL ||
+			    tdp == NULL ||
+			    tdp->scd_path == NULL ||
+			    strcmp(tdp->scd_path, dp->scd_path) != 0) {
+				mutex_exit(&spa->spa_props_lock);
+				continue;
+			}
+
+			if (nvl == NULL)
+				nvl = fnvlist_alloc();
+
+			if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME)
+				pool_name = fnvlist_lookup_string(
+				    spa->spa_config, ZPOOL_CONFIG_POOL_NAME);
+			else
+				pool_name = spa_name(spa);
+
+			fnvlist_add_nvlist(nvl, pool_name, spa->spa_config);
+			mutex_exit(&spa->spa_props_lock);
+		}
+
+		error = spa_config_write(dp, nvl);
+		if (error != 0)
+			ccw_failure = B_TRUE;
+		nvlist_free(nvl);
+	}
+
+	if (ccw_failure) {
+		/*
+		 * Keep trying so that configuration data is
+		 * written if/when any temporary filesystem
+		 * resource issues are resolved.
+		 */
+		if (target->spa_ccw_fail_time == 0) {
+			zfs_ereport_post(FM_EREPORT_ZFS_CONFIG_CACHE_WRITE,
+			    target, NULL, NULL, NULL, 0, 0);
+		}
+		target->spa_ccw_fail_time = gethrtime();
+		spa_async_request(target, SPA_ASYNC_CONFIG_UPDATE);
+	} else {
+		/*
+		 * Do not rate limit future attempts to update
+		 * the config cache.
+		 */
+		target->spa_ccw_fail_time = 0;
+	}
+
+	/*
+	 * Remove any config entries older than the current one.
+	 */
+	dp = list_head(&target->spa_config_list);
+	while ((tdp = list_next(&target->spa_config_list, dp)) != NULL) {
+		list_remove(&target->spa_config_list, tdp);
+		if (tdp->scd_path != NULL)
+			spa_strfree(tdp->scd_path);
+		kmem_free(tdp, sizeof (spa_config_dirent_t));
+	}
+
+	spa_config_generation++;
+
+	if (postsysevent)
+		spa_event_notify(target, NULL, NULL, ESC_ZFS_CONFIG_SYNC);
+}
+
+/*
+ * Sigh. Inside a local zone, we don't have access to /etc/zfs/zpool.cache,
+ * and we don't want to allow the local zone to see all the pools anyway.
+ * So we have to invent the ZFS_IOC_CONFIG ioctl to grab the configuration
+ * information for all pools visible within the zone.
+ */
+nvlist_t *
+spa_all_configs(uint64_t *generation)
+{
+	nvlist_t *pools;
+	spa_t *spa = NULL;
+
+	if (*generation == spa_config_generation)
+		return (NULL);
+
+	pools = fnvlist_alloc();
+
+	mutex_enter(&spa_namespace_lock);
+	while ((spa = spa_next(spa)) != NULL) {
+		if (INGLOBALZONE(curproc) ||
+		    zone_dataset_visible(spa_name(spa), NULL)) {
+			mutex_enter(&spa->spa_props_lock);
+			fnvlist_add_nvlist(pools, spa_name(spa),
+			    spa->spa_config);
+			mutex_exit(&spa->spa_props_lock);
+		}
+	}
+	*generation = spa_config_generation;
+	mutex_exit(&spa_namespace_lock);
+
+	return (pools);
+}
+
+void
+spa_config_set(spa_t *spa, nvlist_t *config)
+{
+	mutex_enter(&spa->spa_props_lock);
+	if (spa->spa_config != NULL && spa->spa_config != config)
+		nvlist_free(spa->spa_config);
+	spa->spa_config = config;
+	mutex_exit(&spa->spa_props_lock);
+}
+
+/*
+ * Generate the pool's configuration based on the current in-core state.
+ *
+ * We infer whether to generate a complete config or just one top-level config
+ * based on whether vd is the root vdev.
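+ *
+ * For instance, a caller that passes vd == NULL gets the complete pool
+ * config (the root vdev is substituted internally and the config locks are
+ * taken as reader), whereas passing a specific top-level vdev yields just
+ * the configuration that belongs in that vdev's label.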
+ */ +nvlist_t * +spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) +{ + nvlist_t *config, *nvroot; + vdev_t *rvd = spa->spa_root_vdev; + unsigned long hostid = 0; + boolean_t locked = B_FALSE; + uint64_t split_guid; + char *pool_name; + + if (vd == NULL) { + vd = rvd; + locked = B_TRUE; + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); + } + + ASSERT(spa_config_held(spa, SCL_CONFIG | SCL_STATE, RW_READER) == + (SCL_CONFIG | SCL_STATE)); + + /* + * If txg is -1, report the current value of spa->spa_config_txg. + */ + if (txg == -1ULL) + txg = spa->spa_config_txg; + + /* + * Originally, users had to handle spa namespace collisions by either + * exporting the already imported pool or by specifying a new name for + * the pool with a conflicting name. In the case of root pools from + * virtual guests, neither approach to collision resolution is + * reasonable. This is addressed by extending the new name syntax with + * an option to specify that the new name is temporary. When specified, + * ZFS_IMPORT_TEMP_NAME will be set in spa->spa_import_flags to tell us + * to use the previous name, which we do below. + */ + if (spa->spa_import_flags & ZFS_IMPORT_TEMP_NAME) { + VERIFY0(nvlist_lookup_string(spa->spa_config, + ZPOOL_CONFIG_POOL_NAME, &pool_name)); + } else + pool_name = spa_name(spa); + + config = fnvlist_alloc(); + + fnvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, spa_version(spa)); + fnvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, pool_name); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, spa_state(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, txg); + fnvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)); + fnvlist_add_uint64(config, ZPOOL_CONFIG_ERRATA, spa->spa_errata); + if (spa->spa_comment != NULL) + fnvlist_add_string(config, ZPOOL_CONFIG_COMMENT, + spa->spa_comment); + + hostid = spa_get_hostid(spa); + if (hostid != 0) + fnvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid); + fnvlist_add_string(config, ZPOOL_CONFIG_HOSTNAME, utsname()->nodename); + + int config_gen_flags = 0; + if (vd != rvd) { + fnvlist_add_uint64(config, ZPOOL_CONFIG_TOP_GUID, + vd->vdev_top->vdev_guid); + fnvlist_add_uint64(config, ZPOOL_CONFIG_GUID, + vd->vdev_guid); + if (vd->vdev_isspare) + fnvlist_add_uint64(config, + ZPOOL_CONFIG_IS_SPARE, 1ULL); + if (vd->vdev_islog) + fnvlist_add_uint64(config, + ZPOOL_CONFIG_IS_LOG, 1ULL); + vd = vd->vdev_top; /* label contains top config */ + } else { + /* + * Only add the (potentially large) split information + * in the mos config, and not in the vdev labels + */ + if (spa->spa_config_splitting != NULL) + fnvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting); + + fnvlist_add_boolean(config, ZPOOL_CONFIG_HAS_PER_VDEV_ZAPS); + + config_gen_flags |= VDEV_CONFIG_MOS; + } + + /* + * Add the top-level config. We even add this on pools which + * don't support holes in the namespace. + */ + vdev_top_config_generate(spa, config); + + /* + * If we're splitting, record the original pool's guid. + */ + if (spa->spa_config_splitting != NULL && + nvlist_lookup_uint64(spa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { + fnvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, split_guid); + } + + nvroot = vdev_config_generate(spa, vd, getstats, config_gen_flags); + fnvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot); + nvlist_free(nvroot); + + /* + * Store what's necessary for reading the MOS in the label. 
+ */ + fnvlist_add_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + spa->spa_label_features); + + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + + ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh); + fnvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)); + kmem_free(ddh, sizeof (ddt_histogram_t)); + + ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); + ddt_get_dedup_object_stats(spa, ddo); + fnvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)); + kmem_free(ddo, sizeof (ddt_object_t)); + + dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); + ddt_get_dedup_stats(spa, dds); + fnvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_STATS, + (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)); + kmem_free(dds, sizeof (ddt_stat_t)); + } + + if (locked) + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + + return (config); +} + +/* + * Update all disk labels, generate a fresh config based on the current + * in-core state, and sync the global config cache (do not sync the config + * cache if this is a booting rootpool). + */ +void +spa_config_update(spa_t *spa, int what) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t txg; + int c; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + txg = spa_last_synced_txg(spa) + 1; + if (what == SPA_CONFIG_UPDATE_POOL) { + vdev_config_dirty(rvd); + } else { + /* + * If we have top-level vdevs that were added but have + * not yet been prepared for allocation, do that now. + * (It's safe now because the config cache is up to date, + * so it will be able to translate the new DVAs.) + * See comments in spa_vdev_add() for full details. + */ + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + /* + * Explicitly skip vdevs that are indirect or + * log vdevs that are being removed. The reason + * is that both of those can have vdev_ms_array + * set to 0 and we wouldn't want to change their + * metaslab size nor call vdev_expand() on them. + */ + if (!vdev_is_concrete(tvd) || + (tvd->vdev_islog && tvd->vdev_removing)) + continue; + + if (tvd->vdev_ms_array == 0) { + vdev_ashift_optimize(tvd); + vdev_metaslab_set_size(tvd); + } + vdev_expand(tvd, txg); + } + } + spa_config_exit(spa, SCL_ALL, FTAG); + + /* + * Wait for the mosconfig to be regenerated and synced. + */ + txg_wait_synced(spa->spa_dsl_pool, txg); + + /* + * Update the global config cache to reflect the new mosconfig. 
+	 */
+	if (!spa->spa_is_root) {
+		spa_write_cachefile(spa, B_FALSE,
+		    what != SPA_CONFIG_UPDATE_POOL);
+	}
+
+	if (what == SPA_CONFIG_UPDATE_POOL)
+		spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS);
+}
+
+EXPORT_SYMBOL(spa_config_load);
+EXPORT_SYMBOL(spa_all_configs);
+EXPORT_SYMBOL(spa_config_set);
+EXPORT_SYMBOL(spa_config_generate);
+EXPORT_SYMBOL(spa_config_update);
+
+/* BEGIN CSTYLED */
+#ifdef __linux__
+/* string sysctls require a char array on FreeBSD */
+ZFS_MODULE_PARAM(zfs_spa, spa_, config_path, STRING, ZMOD_RD,
+	"SPA config file (/etc/zfs/zpool.cache)");
+#endif
+
+ZFS_MODULE_PARAM(zfs, zfs_, autoimport_disable, INT, ZMOD_RW,
+	"Disable pool import at module load");
+/* END CSTYLED */
diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c
new file mode 100644
index 000000000000..fa5120eb61b3
--- /dev/null
+++ b/module/zfs/spa_errlog.c
@@ -0,0 +1,416 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2014 by Delphix. All rights reserved.
+ */
+
+/*
+ * Routines to manage the on-disk persistent error log.
+ *
+ * Each pool stores a log of all logical data errors seen during normal
+ * operation. This is actually the union of two distinct logs: the last log,
+ * and the current log. All errors seen are logged to the current log. When a
+ * scrub completes, the previous last log is thrown out, the current log
+ * becomes the last log, and a fresh current log is initialized. This way, if
+ * an error is somehow corrected, a new scrub will show that it no longer
+ * exists, and it will be deleted from the log when the scrub completes.
+ *
+ * The log is stored using a ZAP object whose key is a string form of the
+ * zbookmark_phys tuple (objset, object, level, blkid), and whose value is an
+ * optional 'objset:object' human-readable string describing the data. When an
+ * error is first logged, this string will be empty, indicating that no name is
+ * known. This prevents us from having to issue a potentially large amount of
+ * I/O to discover the object name during an error path. Instead, we do the
+ * calculation when the data is requested, storing the result so future queries
+ * will be faster.
+ *
+ * This log is then shipped into an nvlist where the key is the dataset name
+ * and the value is the object name. Userland is then responsible for
+ * uniquifying this list and displaying it to the user.
+ */
+
+#include
+#include
+#include
+#include
+#include
+
+
+/*
+ * Convert a bookmark to a string.
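+ *
+ * For example (values invented for illustration), the bookmark
+ * {objset 5, object 123, level 0, blkid 42} is rendered in hexadecimal
+ * as the string "5:7b:0:2a".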
+ */ +static void +bookmark_to_name(zbookmark_phys_t *zb, char *buf, size_t len) +{ + (void) snprintf(buf, len, "%llx:%llx:%llx:%llx", + (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, (u_longlong_t)zb->zb_blkid); +} + +/* + * Convert a string to a bookmark + */ +#ifdef _KERNEL +static void +name_to_bookmark(char *buf, zbookmark_phys_t *zb) +{ + zb->zb_objset = zfs_strtonum(buf, &buf); + ASSERT(*buf == ':'); + zb->zb_object = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_level = (int)zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == ':'); + zb->zb_blkid = zfs_strtonum(buf + 1, &buf); + ASSERT(*buf == '\0'); +} +#endif + +/* + * Log an uncorrectable error to the persistent error log. We add it to the + * spa's list of pending errors. The changes are actually synced out to disk + * during spa_errlog_sync(). + */ +void +spa_log_error(spa_t *spa, const zbookmark_phys_t *zb) +{ + spa_error_entry_t search; + spa_error_entry_t *new; + avl_tree_t *tree; + avl_index_t where; + + /* + * If we are trying to import a pool, ignore any errors, as we won't be + * writing to the pool any time soon. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * If we have had a request to rotate the log, log it to the next list + * instead of the current one. + */ + if (spa->spa_scrub_active || spa->spa_scrub_finished) + tree = &spa->spa_errlist_scrub; + else + tree = &spa->spa_errlist_last; + + search.se_bookmark = *zb; + if (avl_find(tree, &search, &where) != NULL) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + new = kmem_zalloc(sizeof (spa_error_entry_t), KM_SLEEP); + new->se_bookmark = *zb; + avl_insert(tree, new, where); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Return the number of errors currently in the error log. This is actually the + * sum of both the last log and the current log, since we don't know the union + * of these logs until we reach userland. 
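+ *
+ * Note that the same bookmark may appear in more than one of these logs
+ * and lists, so strictly speaking the returned total is an upper bound on
+ * the number of distinct errors.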
+ */
+uint64_t
+spa_get_errlog_size(spa_t *spa)
+{
+	uint64_t total = 0, count;
+
+	mutex_enter(&spa->spa_errlog_lock);
+	if (spa->spa_errlog_scrub != 0 &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_scrub,
+	    &count) == 0)
+		total += count;
+
+	if (spa->spa_errlog_last != 0 && !spa->spa_scrub_finished &&
+	    zap_count(spa->spa_meta_objset, spa->spa_errlog_last,
+	    &count) == 0)
+		total += count;
+	mutex_exit(&spa->spa_errlog_lock);
+
+	mutex_enter(&spa->spa_errlist_lock);
+	total += avl_numnodes(&spa->spa_errlist_last);
+	total += avl_numnodes(&spa->spa_errlist_scrub);
+	mutex_exit(&spa->spa_errlist_lock);
+
+	return (total);
+}
+
+#ifdef _KERNEL
+static int
+process_error_log(spa_t *spa, uint64_t obj, void *addr, size_t *count)
+{
+	zap_cursor_t zc;
+	zap_attribute_t za;
+	zbookmark_phys_t zb;
+
+	if (obj == 0)
+		return (0);
+
+	for (zap_cursor_init(&zc, spa->spa_meta_objset, obj);
+	    zap_cursor_retrieve(&zc, &za) == 0;
+	    zap_cursor_advance(&zc)) {
+
+		if (*count == 0) {
+			zap_cursor_fini(&zc);
+			return (SET_ERROR(ENOMEM));
+		}
+
+		name_to_bookmark(za.za_name, &zb);
+
+		if (copyout(&zb, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_phys_t),
+		    sizeof (zbookmark_phys_t)) != 0) {
+			zap_cursor_fini(&zc);
+			return (SET_ERROR(EFAULT));
+		}
+
+		*count -= 1;
+	}
+
+	zap_cursor_fini(&zc);
+
+	return (0);
+}
+
+static int
+process_error_list(avl_tree_t *list, void *addr, size_t *count)
+{
+	spa_error_entry_t *se;
+
+	for (se = avl_first(list); se != NULL; se = AVL_NEXT(list, se)) {
+
+		if (*count == 0)
+			return (SET_ERROR(ENOMEM));
+
+		if (copyout(&se->se_bookmark, (char *)addr +
+		    (*count - 1) * sizeof (zbookmark_phys_t),
+		    sizeof (zbookmark_phys_t)) != 0)
+			return (SET_ERROR(EFAULT));
+
+		*count -= 1;
+	}
+
+	return (0);
+}
+#endif
+
+/*
+ * Copy all known errors to userland as an array of bookmarks. This is
+ * actually a union of the on-disk last log and current log, as well as any
+ * pending error requests.
+ *
+ * Because the act of reading the on-disk log could cause errors to be
+ * generated, we have two separate locks: one for the error log and one for the
+ * in-core error lists. We only need the error list lock to log an error, so
+ * we grab the error log lock while we read the on-disk logs, and only pick up
+ * the error list lock when we are finished.
+ */
+int
+spa_get_errlog(spa_t *spa, void *uaddr, size_t *count)
+{
+	int ret = 0;
+
+#ifdef _KERNEL
+	mutex_enter(&spa->spa_errlog_lock);
+
+	ret = process_error_log(spa, spa->spa_errlog_scrub, uaddr, count);
+
+	if (!ret && !spa->spa_scrub_finished)
+		ret = process_error_log(spa, spa->spa_errlog_last, uaddr,
+		    count);
+
+	mutex_enter(&spa->spa_errlist_lock);
+	if (!ret)
+		ret = process_error_list(&spa->spa_errlist_scrub, uaddr,
+		    count);
+	if (!ret)
+		ret = process_error_list(&spa->spa_errlist_last, uaddr,
+		    count);
+	mutex_exit(&spa->spa_errlist_lock);
+
+	mutex_exit(&spa->spa_errlog_lock);
+#endif
+
+	return (ret);
+}
+
+/*
+ * Called when a scrub completes. This simply sets a bit that tells us which
+ * AVL tree new errors should be added to. spa_errlog_sync() is responsible
+ * for actually syncing the changes to the underlying objects.
+ */
+void
+spa_errlog_rotate(spa_t *spa)
+{
+	mutex_enter(&spa->spa_errlist_lock);
+	spa->spa_scrub_finished = B_TRUE;
+	mutex_exit(&spa->spa_errlist_lock);
+}
+
+/*
+ * Discard any pending errors from the spa_t. Called when unloading a faulted
+ * pool, as the errors encountered during the open cannot be synced to disk.
+ */ +void +spa_errlog_drain(spa_t *spa) +{ + spa_error_entry_t *se; + void *cookie; + + mutex_enter(&spa->spa_errlist_lock); + + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_last, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + cookie = NULL; + while ((se = avl_destroy_nodes(&spa->spa_errlist_scrub, + &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + + mutex_exit(&spa->spa_errlist_lock); +} + +/* + * Process a list of errors into the current on-disk log. + */ +static void +sync_error_list(spa_t *spa, avl_tree_t *t, uint64_t *obj, dmu_tx_t *tx) +{ + spa_error_entry_t *se; + char buf[64]; + void *cookie; + + if (avl_numnodes(t) != 0) { + /* create log if necessary */ + if (*obj == 0) + *obj = zap_create(spa->spa_meta_objset, + DMU_OT_ERROR_LOG, DMU_OT_NONE, + 0, tx); + + /* add errors to the current log */ + for (se = avl_first(t); se != NULL; se = AVL_NEXT(t, se)) { + char *name = se->se_name ? se->se_name : ""; + + bookmark_to_name(&se->se_bookmark, buf, sizeof (buf)); + + (void) zap_update(spa->spa_meta_objset, + *obj, buf, 1, strlen(name) + 1, name, tx); + } + + /* purge the error list */ + cookie = NULL; + while ((se = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(se, sizeof (spa_error_entry_t)); + } +} + +/* + * Sync the error log out to disk. This is a little tricky because the act of + * writing the error log requires the spa_errlist_lock. So, we need to lock the + * error lists, take a copy of the lists, and then reinitialize them. Then, we + * drop the error list lock and take the error log lock, at which point we + * do the errlog processing. Then, if we encounter an I/O error during this + * process, we can successfully add the error to the list. Note that this will + * result in the perpetual recycling of errors, but it is an unlikely situation + * and not a performance critical operation. + */ +void +spa_errlog_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + avl_tree_t scrub, last; + int scrub_finished; + + mutex_enter(&spa->spa_errlist_lock); + + /* + * Bail out early under normal circumstances. + */ + if (avl_numnodes(&spa->spa_errlist_scrub) == 0 && + avl_numnodes(&spa->spa_errlist_last) == 0 && + !spa->spa_scrub_finished) { + mutex_exit(&spa->spa_errlist_lock); + return; + } + + spa_get_errlists(spa, &last, &scrub); + scrub_finished = spa->spa_scrub_finished; + spa->spa_scrub_finished = B_FALSE; + + mutex_exit(&spa->spa_errlist_lock); + mutex_enter(&spa->spa_errlog_lock); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + /* + * Sync out the current list of errors. + */ + sync_error_list(spa, &last, &spa->spa_errlog_last, tx); + + /* + * Rotate the log if necessary. + */ + if (scrub_finished) { + if (spa->spa_errlog_last != 0) + VERIFY(dmu_object_free(spa->spa_meta_objset, + spa->spa_errlog_last, tx) == 0); + spa->spa_errlog_last = spa->spa_errlog_scrub; + spa->spa_errlog_scrub = 0; + + sync_error_list(spa, &scrub, &spa->spa_errlog_last, tx); + } + + /* + * Sync out any pending scrub errors. + */ + sync_error_list(spa, &scrub, &spa->spa_errlog_scrub, tx); + + /* + * Update the MOS to reflect the new values. 
+	 */
+	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ERRLOG_LAST, sizeof (uint64_t), 1,
+	    &spa->spa_errlog_last, tx);
+	(void) zap_update(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_ERRLOG_SCRUB, sizeof (uint64_t), 1,
+	    &spa->spa_errlog_scrub, tx);
+
+	dmu_tx_commit(tx);
+
+	mutex_exit(&spa->spa_errlog_lock);
+}
+
+#if defined(_KERNEL)
+/* error handling */
+EXPORT_SYMBOL(spa_log_error);
+EXPORT_SYMBOL(spa_get_errlog_size);
+EXPORT_SYMBOL(spa_get_errlog);
+EXPORT_SYMBOL(spa_errlog_rotate);
+EXPORT_SYMBOL(spa_errlog_drain);
+EXPORT_SYMBOL(spa_errlog_sync);
+EXPORT_SYMBOL(spa_get_errlists);
+#endif
diff --git a/module/zfs/spa_history.c b/module/zfs/spa_history.c
new file mode 100644
index 000000000000..f47adb94d55b
--- /dev/null
+++ b/module/zfs/spa_history.c
@@ -0,0 +1,629 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2017 Joyent, Inc.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include "zfs_comutil.h"
+#include "zfs_gitrev.h"
+#ifdef _KERNEL
+#include
+#endif
+
+/*
+ * Routines to manage the on-disk history log.
+ *
+ * The history log is stored as a dmu object containing
+ * <packed record length, record nvlist> tuples.
+ *
+ * Where "record nvlist" is an nvlist containing uint64_ts and strings, and
+ * "packed record length" is the packed length of the "record nvlist" stored
+ * as a little endian uint64_t.
+ *
+ * The log is implemented as a ring buffer, though the original creation
+ * of the pool ('zpool create') is never overwritten.
+ *
+ * The history log is tracked as object 'spa_t::spa_history'. The bonus buffer
+ * of 'spa_history' stores the offsets for logging/retrieving history as
+ * 'spa_history_phys_t'. 'sh_pool_create_len' is the ending offset in bytes of
+ * where the 'zpool create' record is stored. This allows us to never
+ * overwrite the original creation of the pool. 'sh_phys_max_off' is the
+ * physical ending offset in bytes of the log. This tells you the length of
+ * the buffer. 'sh_eof' is the logical EOF (in bytes). Whenever a record
+ * is added, 'sh_eof' is incremented by the size of the record.
+ * 'sh_eof' is never decremented. 'sh_bof' is the logical BOF (in bytes).
+ * This is where the consumer should start reading from after reading in
+ * the 'zpool create' portion of the log.
+ *
+ * 'sh_records_lost' keeps track of how many records have been overwritten
+ * and permanently lost.
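+ *
+ * As a worked example of the offset arithmetic (all numbers invented):
+ * with sh_pool_create_len = 512 and sh_phys_max_off = 4608, the ring
+ * portion of the buffer is 4096 bytes long, so logical offset 9000 maps
+ * to physical offset ((9000 - 512) % 4096) + 512 = 808; the record has
+ * wrapped around the ring while leaving the 'zpool create' region intact.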
+ */
+
+/* convert a logical offset to physical */
+static uint64_t
+spa_history_log_to_phys(uint64_t log_off, spa_history_phys_t *shpp)
+{
+	uint64_t phys_len;
+
+	phys_len = shpp->sh_phys_max_off - shpp->sh_pool_create_len;
+	return ((log_off - shpp->sh_pool_create_len) % phys_len +
+	    shpp->sh_pool_create_len);
+}
+
+void
+spa_history_create_obj(spa_t *spa, dmu_tx_t *tx)
+{
+	dmu_buf_t *dbp;
+	spa_history_phys_t *shpp;
+	objset_t *mos = spa->spa_meta_objset;
+
+	ASSERT0(spa->spa_history);
+	spa->spa_history = dmu_object_alloc(mos, DMU_OT_SPA_HISTORY,
+	    SPA_OLD_MAXBLOCKSIZE, DMU_OT_SPA_HISTORY_OFFSETS,
+	    sizeof (spa_history_phys_t), tx);
+
+	VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_HISTORY, sizeof (uint64_t), 1,
+	    &spa->spa_history, tx));
+
+	VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp));
+	ASSERT3U(dbp->db_size, >=, sizeof (spa_history_phys_t));
+
+	shpp = dbp->db_data;
+	dmu_buf_will_dirty(dbp, tx);
+
+	/*
+	 * Figure out maximum size of history log. We set it at
+	 * 0.1% of pool size, with a max of 1G and min of 128KB.
+	 */
+	shpp->sh_phys_max_off =
+	    metaslab_class_get_dspace(spa_normal_class(spa)) / 1000;
+	shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 1<<30);
+	shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10);
+
+	dmu_buf_rele(dbp, FTAG);
+}
+
+/*
+ * Change 'sh_bof' to the beginning of the next record.
+ */
+static int
+spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp)
+{
+	objset_t *mos = spa->spa_meta_objset;
+	uint64_t firstread, reclen, phys_bof;
+	char buf[sizeof (reclen)];
+	int err;
+
+	phys_bof = spa_history_log_to_phys(shpp->sh_bof, shpp);
+	firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof);
+
+	if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread,
+	    buf, DMU_READ_PREFETCH)) != 0)
+		return (err);
+	if (firstread != sizeof (reclen)) {
+		if ((err = dmu_read(mos, spa->spa_history,
+		    shpp->sh_pool_create_len, sizeof (reclen) - firstread,
+		    buf + firstread, DMU_READ_PREFETCH)) != 0)
+			return (err);
+	}
+
+	reclen = LE_64(*((uint64_t *)buf));
+	shpp->sh_bof += reclen + sizeof (reclen);
+	shpp->sh_records_lost++;
+	return (0);
+}
+
+static int
+spa_history_write(spa_t *spa, void *buf, uint64_t len, spa_history_phys_t *shpp,
+    dmu_tx_t *tx)
+{
+	uint64_t firstwrite, phys_eof;
+	objset_t *mos = spa->spa_meta_objset;
+	int err;
+
+	ASSERT(MUTEX_HELD(&spa->spa_history_lock));
+
+	/* see if we need to reset logical BOF */
+	while (shpp->sh_phys_max_off - shpp->sh_pool_create_len -
+	    (shpp->sh_eof - shpp->sh_bof) <= len) {
+		if ((err = spa_history_advance_bof(spa, shpp)) != 0) {
+			return (err);
+		}
+	}
+
+	phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp);
+	firstwrite = MIN(len, shpp->sh_phys_max_off - phys_eof);
+	shpp->sh_eof += len;
+	dmu_write(mos, spa->spa_history, phys_eof, firstwrite, buf, tx);
+
+	len -= firstwrite;
+	if (len > 0) {
+		/* write out the rest at the beginning of physical file */
+		dmu_write(mos, spa->spa_history, shpp->sh_pool_create_len,
+		    len, (char *)buf + firstwrite, tx);
+	}
+
+	return (0);
+}
+
+/*
+ * Post a history sysevent.
+ *
+ * The nvlist_t* passed into this function will be transformed into a new
+ * nvlist where:
+ *
+ * 1. Nested nvlists will be flattened to a single level
+ * 2. Keys will have their names normalized (to remove any problematic
+ * characters, such as whitespace)
+ *
+ * The nvlist_t passed into this function will be duplicated and should be
+ * freed by the caller.
+ */
+static void
+spa_history_log_notify(spa_t *spa, nvlist_t *nvl)
+{
+	nvlist_t *hist_nvl = fnvlist_alloc();
+	uint64_t uint64;
+	char *string;
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_CMD, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_CMD, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_NAME, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_ZONE, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_ZONE, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_HOST, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_HOST, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_DSNAME, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_INT_STR, string);
+
+	if (nvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL, &string) == 0)
+		fnvlist_add_string(hist_nvl, ZFS_EV_HIST_IOCTL, string);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_DSID, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TXG, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_TIME, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_TIME, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_WHO, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_WHO, uint64);
+
+	if (nvlist_lookup_uint64(nvl, ZPOOL_HIST_INT_EVENT, &uint64) == 0)
+		fnvlist_add_uint64(hist_nvl, ZFS_EV_HIST_INT_EVENT, uint64);
+
+	spa_event_notify(spa, NULL, hist_nvl, ESC_ZFS_HISTORY_EVENT);
+
+	nvlist_free(hist_nvl);
+}
+
+/*
+ * Write out a history event.
+ */
+/*ARGSUSED*/
+static void
+spa_history_log_sync(void *arg, dmu_tx_t *tx)
+{
+	nvlist_t *nvl = arg;
+	spa_t *spa = dmu_tx_pool(tx)->dp_spa;
+	objset_t *mos = spa->spa_meta_objset;
+	dmu_buf_t *dbp;
+	spa_history_phys_t *shpp;
+	size_t reclen;
+	uint64_t le_len;
+	char *record_packed = NULL;
+	int ret;
+
+	/*
+	 * If we have an older pool that doesn't have a command
+	 * history object, create it now.
+	 */
+	mutex_enter(&spa->spa_history_lock);
+	if (!spa->spa_history)
+		spa_history_create_obj(spa, tx);
+	mutex_exit(&spa->spa_history_lock);
+
+	/*
+	 * Get the offset of where we need to write via the bonus buffer.
+	 * Update the offset when the write completes.
+ */ + VERIFY0(dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)); + shpp = dbp->db_data; + + dmu_buf_will_dirty(dbp, tx); + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + fnvlist_add_uint64(nvl, ZPOOL_HIST_TIME, gethrestime_sec()); + fnvlist_add_string(nvl, ZPOOL_HIST_HOST, utsname()->nodename); + + if (nvlist_exists(nvl, ZPOOL_HIST_CMD)) { + zfs_dbgmsg("command: %s", + fnvlist_lookup_string(nvl, ZPOOL_HIST_CMD)); + } else if (nvlist_exists(nvl, ZPOOL_HIST_INT_NAME)) { + if (nvlist_exists(nvl, ZPOOL_HIST_DSNAME)) { + zfs_dbgmsg("txg %lld %s %s (id %llu) %s", + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), + fnvlist_lookup_string(nvl, ZPOOL_HIST_DSNAME), + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_DSID), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); + } else { + zfs_dbgmsg("txg %lld %s %s", + fnvlist_lookup_uint64(nvl, ZPOOL_HIST_TXG), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_NAME), + fnvlist_lookup_string(nvl, ZPOOL_HIST_INT_STR)); + } + /* + * The history sysevent is posted only for internal history + * messages to show what has happened, not how it happened. For + * example, the following command: + * + * # zfs destroy -r tank/foo + * + * will result in one sysevent posted per dataset that is + * destroyed as a result of the command - which could be more + * than one event in total. By contrast, if the sysevent was + * posted as a result of the ZPOOL_HIST_CMD key being present + * it would result in only one sysevent being posted with the + * full command line arguments, requiring the consumer to know + * how to parse and understand zfs(1M) command invocations. + */ + spa_history_log_notify(spa, nvl); + } else if (nvlist_exists(nvl, ZPOOL_HIST_IOCTL)) { + zfs_dbgmsg("ioctl %s", + fnvlist_lookup_string(nvl, ZPOOL_HIST_IOCTL)); + } + + VERIFY3U(nvlist_pack(nvl, &record_packed, &reclen, NV_ENCODE_NATIVE, + KM_SLEEP), ==, 0); + + mutex_enter(&spa->spa_history_lock); + + /* write out the packed length as little endian */ + le_len = LE_64((uint64_t)reclen); + ret = spa_history_write(spa, &le_len, sizeof (le_len), shpp, tx); + if (!ret) + ret = spa_history_write(spa, record_packed, reclen, shpp, tx); + + /* The first command is the create, which we keep forever */ + if (ret == 0 && shpp->sh_pool_create_len == 0 && + nvlist_exists(nvl, ZPOOL_HIST_CMD)) { + shpp->sh_pool_create_len = shpp->sh_bof = shpp->sh_eof; + } + + mutex_exit(&spa->spa_history_lock); + fnvlist_pack_free(record_packed, reclen); + dmu_buf_rele(dbp, FTAG); + fnvlist_free(nvl); +} + +/* + * Write out a history event. 
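+ *
+ * A typical (hypothetical) invocation logs a command string verbatim
+ * under the ZPOOL_HIST_CMD key, e.g.:
+ *
+ *	spa_history_log(spa, "zpool set comment=nightly tank");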
+ */ +int +spa_history_log(spa_t *spa, const char *msg) +{ + int err; + nvlist_t *nvl = fnvlist_alloc(); + + fnvlist_add_string(nvl, ZPOOL_HIST_CMD, msg); + err = spa_history_log_nvl(spa, nvl); + fnvlist_free(nvl); + return (err); +} + +int +spa_history_log_nvl(spa_t *spa, nvlist_t *nvl) +{ + int err = 0; + dmu_tx_t *tx; + nvlist_t *nvarg, *in_nvl = NULL; + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY || !spa_writeable(spa)) + return (SET_ERROR(EINVAL)); + + err = nvlist_lookup_nvlist(nvl, ZPOOL_HIST_INPUT_NVL, &in_nvl); + if (err == 0) { + (void) nvlist_remove_all(in_nvl, ZPOOL_HIDDEN_ARGS); + } + + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + VERIFY0(nvlist_dup(nvl, &nvarg, KM_SLEEP)); + if (spa_history_zone() != NULL) { + fnvlist_add_string(nvarg, ZPOOL_HIST_ZONE, + spa_history_zone()); + } + fnvlist_add_uint64(nvarg, ZPOOL_HIST_WHO, crgetruid(CRED())); + + /* Kick this off asynchronously; errors are ignored. */ + dsl_sync_task_nowait(spa_get_dsl(spa), spa_history_log_sync, + nvarg, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + /* spa_history_log_sync will free nvl */ + return (err); +} + +/* + * Read out the command history. + */ +int +spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) +{ + objset_t *mos = spa->spa_meta_objset; + dmu_buf_t *dbp; + uint64_t read_len, phys_read_off, phys_eof; + uint64_t leftover = 0; + spa_history_phys_t *shpp; + int err; + + /* + * If the command history doesn't exist (older pool), + * that's ok, just return ENOENT. + */ + if (!spa->spa_history) + return (SET_ERROR(ENOENT)); + + /* + * The history is logged asynchronously, so when they request + * the first chunk of history, make sure everything has been + * synced to disk so that we get it. + */ + if (*offp == 0 && spa_writeable(spa)) + txg_wait_synced(spa_get_dsl(spa), 0); + + if ((err = dmu_bonus_hold(mos, spa->spa_history, FTAG, &dbp)) != 0) + return (err); + shpp = dbp->db_data; + +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(dbp, &doi); + ASSERT3U(doi.doi_bonus_type, ==, DMU_OT_SPA_HISTORY_OFFSETS); + } +#endif + + mutex_enter(&spa->spa_history_lock); + phys_eof = spa_history_log_to_phys(shpp->sh_eof, shpp); + + if (*offp < shpp->sh_pool_create_len) { + /* read in just the zpool create history */ + phys_read_off = *offp; + read_len = MIN(*len, shpp->sh_pool_create_len - + phys_read_off); + } else { + /* + * Need to reset passed in offset to BOF if the passed in + * offset has since been overwritten. + */ + *offp = MAX(*offp, shpp->sh_bof); + phys_read_off = spa_history_log_to_phys(*offp, shpp); + + /* + * Read up to the minimum of what the user passed down or + * the EOF (physical or logical). If we hit physical EOF, + * use 'leftover' to read from the physical BOF. 
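+		 *
+		 * For example (invented numbers): with sh_pool_create_len =
+		 * 512, sh_phys_max_off = 4608, phys_eof = 1000, and
+		 * phys_read_off = 4500, a request for *len = 300 reads 108
+		 * bytes up to the physical EOF of the buffer and then
+		 * 'leftover' = 192 more bytes starting back at
+		 * sh_pool_create_len.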
+ */ + if (phys_read_off <= phys_eof) { + read_len = MIN(*len, phys_eof - phys_read_off); + } else { + read_len = MIN(*len, + shpp->sh_phys_max_off - phys_read_off); + if (phys_read_off + *len > shpp->sh_phys_max_off) { + leftover = MIN(*len - read_len, + phys_eof - shpp->sh_pool_create_len); + } + } + } + + /* offset for consumer to use next */ + *offp += read_len + leftover; + + /* tell the consumer how much you actually read */ + *len = read_len + leftover; + + if (read_len == 0) { + mutex_exit(&spa->spa_history_lock); + dmu_buf_rele(dbp, FTAG); + return (0); + } + + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, + DMU_READ_PREFETCH); + if (leftover && err == 0) { + err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, + leftover, buf + read_len, DMU_READ_PREFETCH); + } + mutex_exit(&spa->spa_history_lock); + + dmu_buf_rele(dbp, FTAG); + return (err); +} + +/* + * The nvlist will be consumed by this call. + */ +static void +log_internal(nvlist_t *nvl, const char *operation, spa_t *spa, + dmu_tx_t *tx, const char *fmt, va_list adx) +{ + char *msg; + + /* + * If this is part of creating a pool, not everything is + * initialized yet, so don't bother logging the internal events. + * Likewise if the pool is not writeable. + */ + if (spa_is_initializing(spa) || !spa_writeable(spa)) { + fnvlist_free(nvl); + return; + } + + msg = kmem_vasprintf(fmt, adx); + fnvlist_add_string(nvl, ZPOOL_HIST_INT_STR, msg); + kmem_strfree(msg); + + fnvlist_add_string(nvl, ZPOOL_HIST_INT_NAME, operation); + fnvlist_add_uint64(nvl, ZPOOL_HIST_TXG, tx->tx_txg); + + if (dmu_tx_is_syncing(tx)) { + spa_history_log_sync(nvl, tx); + } else { + dsl_sync_task_nowait(spa_get_dsl(spa), + spa_history_log_sync, nvl, 0, ZFS_SPACE_CHECK_NONE, tx); + } + /* spa_history_log_sync() will free nvl */ +} + +void +spa_history_log_internal(spa_t *spa, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) +{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(fnvlist_alloc(), operation, spa, htx, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_internal_ds(dsl_dataset_t *ds, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) +{ + va_list adx; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nvl = fnvlist_alloc(); + + ASSERT(tx != NULL); + + dsl_dataset_name(ds, namebuf); + fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); + fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, ds->ds_object); + + va_start(adx, fmt); + log_internal(nvl, operation, dsl_dataset_get_spa(ds), tx, fmt, adx); + va_end(adx); +} + +void +spa_history_log_internal_dd(dsl_dir_t *dd, const char *operation, + dmu_tx_t *tx, const char *fmt, ...) 
+{ + va_list adx; + char namebuf[ZFS_MAX_DATASET_NAME_LEN]; + nvlist_t *nvl = fnvlist_alloc(); + + ASSERT(tx != NULL); + + dsl_dir_name(dd, namebuf); + fnvlist_add_string(nvl, ZPOOL_HIST_DSNAME, namebuf); + fnvlist_add_uint64(nvl, ZPOOL_HIST_DSID, + dsl_dir_phys(dd)->dd_head_dataset_obj); + + va_start(adx, fmt); + log_internal(nvl, operation, dd->dd_pool->dp_spa, tx, fmt, adx); + va_end(adx); +} + +void +spa_history_log_version(spa_t *spa, const char *operation, dmu_tx_t *tx) +{ + utsname_t *u = utsname(); + + spa_history_log_internal(spa, operation, tx, + "pool version %llu; software version %s; uts %s %s %s %s", + (u_longlong_t)spa_version(spa), ZFS_META_GITREV, + u->nodename, u->release, u->version, u->machine); +} + +#ifndef _KERNEL +const char * +spa_history_zone(void) +{ + return (NULL); +} +#endif + +#if defined(_KERNEL) +EXPORT_SYMBOL(spa_history_create_obj); +EXPORT_SYMBOL(spa_history_get); +EXPORT_SYMBOL(spa_history_log); +EXPORT_SYMBOL(spa_history_log_internal); +EXPORT_SYMBOL(spa_history_log_version); +#endif diff --git a/module/zfs/spa_log_spacemap.c b/module/zfs/spa_log_spacemap.c new file mode 100644 index 000000000000..5c55d32ec066 --- /dev/null +++ b/module/zfs/spa_log_spacemap.c @@ -0,0 +1,1322 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018, 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Log Space Maps + * + * Log space maps are an optimization in ZFS metadata allocations for pools + * whose workloads are primarily random-writes. Random-write workloads are also + * typically random-free, meaning that they are freeing from locations scattered + * throughout the pool. This means that each TXG we will have to append some + * FREE records to almost every metaslab. With log space maps, we hold their + * changes in memory and log them altogether in one pool-wide space map on-disk + * for persistence. As more blocks are accumulated in the log space maps and + * more unflushed changes are accounted in memory, we flush a selected group + * of metaslabs every TXG to relieve memory pressure and potential overheads + * when loading the pool. Flushing a metaslab to disk relieves memory as we + * flush any unflushed changes from memory to disk (i.e. the metaslab's space + * map) and saves import time by making old log space maps obsolete and + * eventually destroying them. [A log space map is said to be obsolete when all + * its entries have made it to their corresponding metaslab space maps]. + * + * == On disk data structures used == + * + * - The pool has a new feature flag and a new entry in the MOS. 
The feature
+ *   is activated when we create the first log space map and remains active
+ *   for the lifetime of the pool. The new entry in the MOS Directory [refer
+ *   to DMU_POOL_LOG_SPACEMAP_ZAP] is populated with a ZAP whose key-value
+ *   pairs are of the form <key: txg, value: log space map object for that
+ *   txg>. This entry is our on-disk reference of the log space maps that
+ *   exist in the pool for each TXG and it is used during import to load all
+ *   the metaslabs' unflushed changes into memory. To see how this structure
+ *   is first created and later populated refer to
+ *   spa_generate_syncing_log_sm(). To see how it is used at import time
+ *   refer to spa_ld_log_sm_metadata().
+ *
+ * - Each vdev has a new entry in its vdev_top_zap (see field
+ *   VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS) which holds the msp_unflushed_txg of
+ *   each metaslab in this vdev. This field is the on-disk counterpart of the
+ *   in-memory field ms_unflushed_txg which tells us from which TXG and onwards
+ *   the metaslab hasn't had its changes flushed. During import, we use this
+ *   to ignore any entries in the space map log that are for this metaslab but
+ *   from a TXG before msp_unflushed_txg. At that point, we also populate its
+ *   in-memory counterpart and from there both fields are updated every time
+ *   we flush that metaslab.
+ *
+ * - A space map is created every TXG and, during that TXG, it is used to log
+ *   all incoming changes (the log space map). When created, the log space map
+ *   is referenced in memory by spa_syncing_log_sm and its object ID is
+ *   inserted into the space map ZAP mentioned above. The log space map is
+ *   closed at the end of the TXG and will be destroyed when it becomes fully
+ *   obsolete. We know when a log space map has become obsolete by looking at
+ *   the oldest (and smallest) ms_unflushed_txg in the pool. If that value is
+ *   bigger than the log space map's TXG, then every metaslab has already had
+ *   the changes from that log flushed, and we can therefore destroy it.
+ *   [see spa_cleanup_old_sm_logs()].
+ *
+ * == Important in-memory structures ==
+ *
+ * - The per-spa field spa_metaslabs_by_flushed sorts all the metaslabs in
+ *   the pool by their ms_unflushed_txg field. It is primarily used for three
+ *   reasons. First of all, it is used during flushing where we try to flush
+ *   metaslabs in-order from the oldest-flushed to the most recently flushed
+ *   every TXG. Secondly, it helps us to look up the ms_unflushed_txg of the
+ *   oldest flushed metaslab to distinguish which log space maps have become
+ *   obsolete and which ones are still relevant. Finally it tells us which
+ *   metaslabs have unflushed changes in a pool where this feature was just
+ *   enabled, as we don't immediately add all of the pool's metaslabs but we
+ *   add them over time as they go through metaslab_sync(). The reason that
+ *   we do that is to ease these pools into the behavior of the flushing
+ *   algorithm (described later on).
+ *
+ * - The per-spa field spa_sm_logs_by_txg can be thought of as the in-memory
+ *   counterpart of the space map ZAP mentioned above. It's an AVL tree whose
+ *   nodes represent the log space maps in the pool. This in-memory
+ *   representation of log space maps in the pool sorts the log space maps by
+ *   the TXG in which they were created (which is also the TXG of their
+ *   unflushed changes). It also contains the following extra information for
+ *   each space map:
+ *   [1] The number of metaslabs that were last flushed on that TXG.
This is
+ *       important because if that counter is zero and this is the oldest
+ *       log then it means that it is also obsolete.
+ *   [2] The number of blocks of that space map. This field is used by the
+ *       block heuristic of our flushing algorithm (described later on).
+ *       It represents how many blocks of metadata changes ZFS had to write
+ *       to disk for that TXG.
+ *
+ * - The per-spa field spa_log_summary is a list of entries that summarizes
+ *   the metaslab and block counts of all the nodes of the spa_sm_logs_by_txg
+ *   AVL tree mentioned above. The reason this exists is that our flushing
+ *   algorithm (described later) tries to estimate how many metaslabs to flush
+ *   in each TXG by iterating over all the log space maps and looking at their
+ *   block counts. Summarizing that information means that we don't have to
+ *   iterate through each space map, minimizing the runtime overhead of the
+ *   flushing algorithm, which would otherwise be induced in syncing context.
+ *   In terms of implementation, the log summary is used as a queue:
+ *   * we modify or pop entries from its head when we flush metaslabs
+ *   * we modify or append entries to its tail when we sync changes.
+ *
+ * - Each metaslab has two new range trees that hold its unflushed changes,
+ *   ms_unflushed_allocs and ms_unflushed_frees. These are always disjoint.
+ *
+ * == Flushing algorithm ==
+ *
+ * The decision of how many metaslabs to flush on a given TXG is guided by
+ * two heuristics:
+ *
+ * [1] The memory heuristic -
+ *     We keep track of the memory used by the unflushed trees from all the
+ *     metaslabs [see sus_memused of spa_unflushed_stats] and we ensure that it
+ *     stays below a certain threshold which is determined by an arbitrary hard
+ *     limit and an arbitrary percentage of the system's memory [see
+ *     spa_log_exceeds_memlimit()]. When we see that the memory usage of the
+ *     unflushed changes is passing that threshold, we flush metaslabs, which
+ *     empties their unflushed range trees, reducing the memory used.
+ *
+ * [2] The block heuristic -
+ *     We try to keep the total number of blocks in the log space maps in check
+ *     so the log doesn't grow indefinitely and we don't induce a lot of
+ *     overhead when loading the pool. At the same time we don't want to flush
+ *     a lot of metaslabs too often as this would defeat the purpose of the log
+ *     space map. As a result we set a limit on the number of blocks that we
+ *     think it's acceptable for the log space maps to have and try not to
+ *     cross it. [see sus_blocklimit from spa_unflushed_stats].
+ *
+ * In order to stay below the block limit every TXG we have to estimate how
+ * many metaslabs we need to flush based on the current rate of incoming blocks
+ * and our history of log space map blocks. The main idea here is to answer
+ * the question of how many metaslabs we need to flush in order to get rid
+ * of at least X log space map blocks. We can answer this question
+ * by iterating backwards from the oldest log space map to the newest one
+ * and looking at their metaslab and block counts. At this point the log
+ * summary mentioned above comes in handy as it reduces the amount of things
+ * that we have to iterate (even though it may reduce the precision of our
+ * estimates due to its aggregation of data).
So with that in mind, we project the incoming
+ * rate of the current TXG into the future and attempt to approximate how many
+ * metaslabs we would need to flush from now on in order to avoid exceeding our
+ * block limit at different points in the future (granted that we would keep
+ * flushing the same number of metaslabs for every TXG). Then we take the
+ * maximum number from all these estimates to be on the safe side. For the
+ * exact implementation details of the algorithm refer to
+ * spa_estimate_metaslabs_to_flush.
+ */
+
+/*
+ * This is used as the block size for the space maps used for the
+ * log space map feature. These space maps benefit from a bigger
+ * block size as we expect to be writing a lot of data to them at
+ * once.
+ */
+unsigned long zfs_log_sm_blksz = 1ULL << 17;
+
+/*
+ * Percentage of the overall system's memory that ZFS allows to be
+ * used for unflushed changes (e.g. the sum of size of all the nodes
+ * in the unflushed trees).
+ *
+ * Note that this value is calculated over 1000000 for finer granularity
+ * (thus the _ppm suffix; reads as "parts per million"). As an example,
+ * the default of 1000 allows 0.1% of memory to be used.
+ */
+unsigned long zfs_unflushed_max_mem_ppm = 1000;
+
+/*
+ * Specific hard-limit in memory that ZFS allows to be used for
+ * unflushed changes.
+ */
+unsigned long zfs_unflushed_max_mem_amt = 1ULL << 30;
+
+/*
+ * The following tunable determines the number of blocks that can be used for
+ * the log space maps. It is expressed as a percentage of the total number of
+ * metaslabs in the pool (i.e. the default of 400 means that the number of log
+ * blocks is capped at 4 times the number of metaslabs).
+ *
+ * This value exists to tune our flushing algorithm, with higher values
+ * flushing metaslabs less often (doing less I/Os) per TXG versus lower values
+ * flushing metaslabs more aggressively with the upside of saving overheads
+ * when loading the pool. Another factor in this tradeoff is that flushing
+ * less often can potentially lead to better utilization of the metaslab space
+ * map's block size as we accumulate more changes per flush.
+ *
+ * Given that this tunable indirectly controls the flush rate (metaslabs
+ * flushed per txg), it makes sense to express it as a percentage of the
+ * number of metaslabs in the pool.
+ *
+ * As a rule of thumb we default this tunable to 400% based on the following:
+ *
+ * 1] Assuming a constant flush rate and a constant incoming rate of log blocks
+ *    it is reasonable to expect that the amount of obsolete entries changes
+ *    linearly from txg to txg (e.g. the oldest log should have the most
+ *    obsolete entries, and the most recent one the least). With this we could
+ *    say that, at any given time, about half of the entries in the whole space
+ *    map log are obsolete. Thus for every two entries for a metaslab in the
+ *    log space map, only one of them is valid and actually makes it to the
+ *    metaslab's space map.
+ *    [factor of 2]
+ * 2] Each entry in the log space map is guaranteed to be two words while
+ *    entries in metaslab space maps are generally single-word.
+ *    [an extra factor of 2 - 400% overall]
+ * 3] Even if [1] and [2] are slightly less than 2 each, we haven't taken into
+ *    account any consolidation of segments from the log space map to the
+ *    unflushed range trees nor their history (e.g. a segment being allocated,
+ *    then freed, then allocated again means 3 log space map entries but 0
+ *    metaslab space map entries).
Depending on the workload, we've seen ~1.8
+ *    non-obsolete log space map entries per metaslab entry, for a total of
+ *    ~600%. Since most of these estimates are workload dependent, though, we
+ *    default to 400% to be conservative.
+ *
+ *    Thus we could say that even in the worst case of [1] and [2], the
+ *    factor should end up being 4.
+ *
+ * That said, regardless of the number of metaslabs in the pool we need to
+ * provide upper and lower bounds for the log block limit.
+ * [see zfs_unflushed_log_block_{min,max}]
+ */
+unsigned long zfs_unflushed_log_block_pct = 400;
+
+/*
+ * If the number of metaslabs is small and our incoming rate is high, we could
+ * get into a situation where we are flushing all our metaslabs every TXG. Thus
+ * we always allow at least this many log blocks.
+ */
+unsigned long zfs_unflushed_log_block_min = 1000;
+
+/*
+ * If the log becomes too big, the import time of the pool can take a hit in
+ * terms of performance. Thus we have a hard limit on the size of the log in
+ * terms of blocks.
+ */
+unsigned long zfs_unflushed_log_block_max = (1ULL << 18);
+
+/*
+ * Max # of rows allowed for the log_summary. The tradeoff here is accuracy and
+ * stability of the flushing algorithm (longer summary) vs its runtime overhead
+ * (smaller summary is faster to traverse).
+ */
+unsigned long zfs_max_logsm_summary_length = 10;
+
+/*
+ * Tunable that sets the lower bound on the metaslabs to flush every TXG.
+ *
+ * Setting this to 0 has no effect since if the pool is idle we won't even be
+ * creating log space maps and therefore we won't be flushing. On the other
+ * hand if the pool has any incoming workload our block heuristic will start
+ * flushing metaslabs anyway.
+ *
+ * The point of this tunable is to be used in extreme cases where we really
+ * want to flush more metaslabs than our adaptable heuristic plans to flush.
+ */
+unsigned long zfs_min_metaslabs_to_flush = 1;
+
+/*
+ * Tunable that specifies how far in the past we want to look when trying to
+ * estimate the incoming log blocks for the current TXG.
+ *
+ * Setting this too high may not only increase runtime but also minimize the
+ * effect of the incoming rates from the most recent TXGs as we take the
+ * average over all the blocks that we walk
+ * [see spa_estimate_incoming_log_blocks].
+ */
+unsigned long zfs_max_log_walking = 5;
+
+/*
+ * This tunable exists solely for testing purposes. It ensures that the log
+ * spacemaps are not flushed and destroyed during export in order for the
+ * relevant log spacemap import code paths to be tested (effectively simulating
+ * a crash).
+ */
+int zfs_keep_log_spacemaps_at_export = 0;
+
+static uint64_t
+spa_estimate_incoming_log_blocks(spa_t *spa)
+{
+	ASSERT3U(spa_sync_pass(spa), ==, 1);
+	uint64_t steps = 0, sum = 0;
+	for (spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
+	    sls != NULL && steps < zfs_max_log_walking;
+	    sls = AVL_PREV(&spa->spa_sm_logs_by_txg, sls)) {
+		if (sls->sls_txg == spa_syncing_txg(spa)) {
+			/*
+			 * skip the log created in this TXG as this would
+			 * make our estimations inaccurate.
+			 */
+			continue;
+		}
+		sum += sls->sls_nblocks;
+		steps++;
+	}
+	return ((steps > 0) ?
DIV_ROUND_UP(sum, steps) : 0); +} + +uint64_t +spa_log_sm_blocklimit(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_blocklimit); +} + +void +spa_log_sm_set_blocklimit(spa_t *spa) +{ + if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)) { + ASSERT0(spa_log_sm_blocklimit(spa)); + return; + } + + uint64_t calculated_limit = + (spa_total_metaslabs(spa) * zfs_unflushed_log_block_pct) / 100; + spa->spa_unflushed_stats.sus_blocklimit = MIN(MAX(calculated_limit, + zfs_unflushed_log_block_min), zfs_unflushed_log_block_max); +} + +uint64_t +spa_log_sm_nblocks(spa_t *spa) +{ + return (spa->spa_unflushed_stats.sus_nblocks); +} + +/* + * Ensure that the in-memory log space map structures and the summary + * have the same block and metaslab counts. + */ +static void +spa_log_summary_verify_counts(spa_t *spa) +{ + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP)); + + if ((zfs_flags & ZFS_DEBUG_LOG_SPACEMAP) == 0) + return; + + uint64_t ms_in_avl = avl_numnodes(&spa->spa_metaslabs_by_flushed); + + uint64_t ms_in_summary = 0, blk_in_summary = 0; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e; e = list_next(&spa->spa_log_summary, e)) { + ms_in_summary += e->lse_mscount; + blk_in_summary += e->lse_blkcount; + } + + uint64_t ms_in_logs = 0, blk_in_logs = 0; + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + ms_in_logs += sls->sls_mscount; + blk_in_logs += sls->sls_nblocks; + } + + VERIFY3U(ms_in_logs, ==, ms_in_summary); + VERIFY3U(ms_in_logs, ==, ms_in_avl); + VERIFY3U(blk_in_logs, ==, blk_in_summary); + VERIFY3U(blk_in_logs, ==, spa_log_sm_nblocks(spa)); +} + +static boolean_t +summary_entry_is_full(spa_t *spa, log_summary_entry_t *e) +{ + uint64_t blocks_per_row = MAX(1, + DIV_ROUND_UP(spa_log_sm_blocklimit(spa), + zfs_max_logsm_summary_length)); + return (blocks_per_row <= e->lse_blkcount); +} + +/* + * Update the log summary information to reflect the fact that a metaslab + * was flushed or destroyed (e.g due to device removal or pool export/destroy). + * + * We typically flush the oldest flushed metaslab so the first (and oldest) + * entry of the summary is updated. However if that metaslab is getting loaded + * we may flush the second oldest one which may be part of an entry later in + * the summary. Moreover, if we call into this function from metaslab_fini() + * the metaslabs probably won't be ordered by ms_unflushed_txg. Thus we ask + * for a txg as an argument so we can locate the appropriate summary entry for + * the metaslab. + */ +void +spa_log_summary_decrement_mscount(spa_t *spa, uint64_t txg) +{ + /* + * We don't track summary data for read-only pools and this function + * can be called from metaslab_fini(). In that case return immediately. + */ + if (!spa_writeable(spa)) + return; + + log_summary_entry_t *target = NULL; + for (log_summary_entry_t *e = list_head(&spa->spa_log_summary); + e != NULL; e = list_next(&spa->spa_log_summary, e)) { + if (e->lse_start > txg) + break; + target = e; + } + + if (target == NULL || target->lse_mscount == 0) { + /* + * We didn't find a summary entry for this metaslab. We must be + * at the teardown of a spa_load() attempt that got an error + * while reading the log space maps. + */ + VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR); + return; + } + + target->lse_mscount--; +} + +/* + * Update the log summary information to reflect the fact that we destroyed + * old log space maps. 
+ * Since we can only destroy the oldest log space maps,
+ * we decrement the block count of the oldest summary entry and potentially
+ * destroy it when that count hits 0.
+ *
+ * This function is called after a metaslab is flushed and typically that
+ * metaslab is the oldest flushed, which means that this function will
+ * typically decrement the block count of the first entry of the summary and
+ * potentially free it if the block count gets to zero (its metaslab count
+ * should be zero too at that point).
+ *
+ * There are certain scenarios though that don't work exactly like that so we
+ * need to account for them:
+ *
+ * Scenario [1]: It is possible that after we flushed the oldest flushed
+ * metaslab and we destroyed the oldest log space map, more recent logs had 0
+ * metaslabs pointing to them so we got rid of them too. This can happen due
+ * to metaslabs being destroyed through device removal, or because the oldest
+ * flushed metaslab was loading but we kept flushing more recently flushed
+ * metaslabs due to the memory pressure of unflushed changes. Because of that,
+ * we always iterate from the beginning of the summary, and if blocks_gone is
+ * greater than or equal to the block count of the current entry, we free that
+ * entry (we expect its metaslab count to be zero), decrement blocks_gone by
+ * that block count, and move on to the next entry, repeating this procedure
+ * until blocks_gone gets decremented to 0. Doing this also works for the
+ * typical case mentioned above.
+ *
+ * Scenario [2]: The oldest flushed metaslab isn't necessarily accounted by
+ * the first (and oldest) entry in the summary. If the first few entries of
+ * the summary were only accounting metaslabs from a device that was just
+ * removed, then the current oldest flushed metaslab could be accounted by an
+ * entry somewhere in the middle of the summary. Moreover, flushing that
+ * metaslab will destroy all the log space maps older than its ms_unflushed_txg
+ * because they became obsolete after the removal. Thus, iterating as we did
+ * for scenario [1] works out for this case too.
+ *
+ * Scenario [3]: At times we decide to flush all the metaslabs in the pool
+ * in one TXG (either because we are exporting the pool or because our flushing
+ * heuristics decided to do so). When that happens all the log space maps get
+ * destroyed except the one created for the current TXG, which doesn't have
+ * any log blocks yet. As log space maps get destroyed with every metaslab that
+ * we flush, entries in the summary are also destroyed. This brings a weird
+ * corner case when we flush the last metaslab and the log space map of the
+ * current TXG is in the same summary entry as other log space maps that
+ * are older. When that happens we are eventually left with this one last
+ * summary entry whose blocks are gone (blocks_gone equals the entry's block
+ * count) but whose metaslab count is non-zero (because it accounts for all the
+ * metaslabs in the pool as they all got flushed). Under this scenario we can't
+ * free this last summary entry as it's referencing all the metaslabs in the
+ * pool and its block count will get incremented at the end of this sync (when
+ * we close the syncing log space map). Thus we just decrement its current
+ * block count and leave it alone. In the case that the pool gets exported,
+ * its metaslab count will be decremented over time as we call metaslab_fini()
+ * for all the metaslabs in the pool and the entry will be freed at
+ * spa_unload_log_sm_metadata().
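+ *
+ * As a hypothetical illustration of the loop below (entry counts invented
+ * for this comment): given summary entries {blk=5, ms=0} and {blk=7, ms=3}
+ * and blocks_gone = 8, the first entry is obsolete (8 >= 5 and ms == 0) so
+ * we free it and blocks_gone drops to 3; the second entry has more blocks
+ * than that (7 > 3), so we just subtract, ending up with {blk=4, ms=3}.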
+ */
+void
+spa_log_summary_decrement_blkcount(spa_t *spa, uint64_t blocks_gone)
+{
+	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+	    e != NULL; e = list_head(&spa->spa_log_summary)) {
+		if (e->lse_blkcount > blocks_gone) {
+			/*
+			 * Assert that we stopped at an entry that is not
+			 * obsolete.
+			 */
+			ASSERT(e->lse_mscount != 0);
+
+			e->lse_blkcount -= blocks_gone;
+			blocks_gone = 0;
+			break;
+		} else if (e->lse_mscount == 0) {
+			/* remove obsolete entry */
+			blocks_gone -= e->lse_blkcount;
+			list_remove(&spa->spa_log_summary, e);
+			kmem_free(e, sizeof (log_summary_entry_t));
+		} else {
+			/* Verify that this is scenario [3] mentioned above. */
+			VERIFY3U(blocks_gone, ==, e->lse_blkcount);
+
+			/*
+			 * Assert that this is scenario [3] further by ensuring
+			 * that this is the only entry in the summary.
+			 */
+			VERIFY3P(e, ==, list_tail(&spa->spa_log_summary));
+			ASSERT3P(e, ==, list_head(&spa->spa_log_summary));
+
+			blocks_gone = e->lse_blkcount = 0;
+			break;
+		}
+	}
+
+	/*
+	 * Ensure that there is no way we are trying to remove more blocks
+	 * than the # of blocks in the summary.
+	 */
+	ASSERT0(blocks_gone);
+}
+
+void
+spa_log_sm_decrement_mscount(spa_t *spa, uint64_t txg)
+{
+	spa_log_sm_t target = { .sls_txg = txg };
+	spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg,
+	    &target, NULL);
+
+	if (sls == NULL) {
+		/*
+		 * We must be at the teardown of a spa_load() attempt that
+		 * got an error while reading the log space maps.
+		 */
+		VERIFY3S(spa_load_state(spa), ==, SPA_LOAD_ERROR);
+		return;
+	}
+
+	ASSERT(sls->sls_mscount > 0);
+	sls->sls_mscount--;
+}
+
+void
+spa_log_sm_increment_current_mscount(spa_t *spa)
+{
+	spa_log_sm_t *last_sls = avl_last(&spa->spa_sm_logs_by_txg);
+	ASSERT3U(last_sls->sls_txg, ==, spa_syncing_txg(spa));
+	last_sls->sls_mscount++;
+}
+
+static void
+summary_add_data(spa_t *spa, uint64_t txg, uint64_t metaslabs_flushed,
+    uint64_t nblocks)
+{
+	log_summary_entry_t *e = list_tail(&spa->spa_log_summary);
+
+	if (e == NULL || summary_entry_is_full(spa, e)) {
+		e = kmem_zalloc(sizeof (log_summary_entry_t), KM_SLEEP);
+		e->lse_start = txg;
+		list_insert_tail(&spa->spa_log_summary, e);
+	}
+
+	ASSERT3U(e->lse_start, <=, txg);
+	e->lse_mscount += metaslabs_flushed;
+	e->lse_blkcount += nblocks;
+}
+
+static void
+spa_log_summary_add_incoming_blocks(spa_t *spa, uint64_t nblocks)
+{
+	summary_add_data(spa, spa_syncing_txg(spa), 0, nblocks);
+}
+
+void
+spa_log_summary_add_flushed_metaslab(spa_t *spa)
+{
+	summary_add_data(spa, spa_syncing_txg(spa), 1, 0);
+}
+
+/*
+ * This function attempts to estimate how many metaslabs we should
+ * flush to satisfy our block heuristic for the log spacemap
+ * for the upcoming TXGs.
+ *
+ * Specifically, it first tries to estimate the number of incoming
+ * blocks in this TXG. Then, by projecting that incoming rate to
+ * future TXGs and using the log summary, it figures out how many
+ * flushes we would need to do in each future TXG individually to
+ * stay below our block limit, and returns the maximum number of
+ * flushes from those estimates.
+ */
+static uint64_t
+spa_estimate_metaslabs_to_flush(spa_t *spa)
+{
+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+	ASSERT3U(spa_sync_pass(spa), ==, 1);
+	ASSERT(spa_log_sm_blocklimit(spa) != 0);
+
+	/*
+	 * This variable contains the incoming rate that will be projected
+	 * and used for our flushing estimates in the future.
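+	 *
+	 * As a hypothetical worked example (numbers invented for this
+	 * comment): with a block limit of 1000, 900 blocks currently in
+	 * the log, and an incoming rate of 50 blocks per TXG, we have
+	 * room for only (1000 - 900) / 50 = 2 more TXGs of incoming
+	 * blocks. From that point on, each summary entry consumed adds
+	 * its block count to the available room and its metaslab count
+	 * to the required flushes, and we keep the maximum of
+	 * DIV_ROUND_UP(total_flushes, txgs_in_future) seen along the way.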
+	 */
+	uint64_t incoming = spa_estimate_incoming_log_blocks(spa);
+
+	/*
+	 * At any point in time this variable tells us how many
+	 * TXGs into the future we are looking, so we can make our
+	 * estimations.
+	 */
+	uint64_t txgs_in_future = 1;
+
+	/*
+	 * This variable tells us how much room we have until we hit
+	 * our limit. When it goes negative, it means that we've exceeded
+	 * our limit and we need to flush.
+	 *
+	 * Note that since we start at the first TXG in the future (i.e.
+	 * txgs_in_future starts from 1) we already decrement this
+	 * variable by the incoming rate.
+	 */
+	int64_t available_blocks =
+	    spa_log_sm_blocklimit(spa) - spa_log_sm_nblocks(spa) - incoming;
+
+	/*
+	 * This variable tells us the total number of flushes needed to
+	 * keep the log size within the limit when we reach txgs_in_future.
+	 */
+	uint64_t total_flushes = 0;
+
+	/* Holds the current maximum of our estimates so far. */
+	uint64_t max_flushes_pertxg =
+	    MIN(avl_numnodes(&spa->spa_metaslabs_by_flushed),
+	    zfs_min_metaslabs_to_flush);
+
+	/*
+	 * For our estimations we only look as far in the future
+	 * as the summary allows us.
+	 */
+	for (log_summary_entry_t *e = list_head(&spa->spa_log_summary);
+	    e; e = list_next(&spa->spa_log_summary, e)) {
+
+		/*
+		 * If there is still room before we exceed our limit
+		 * then keep skipping TXGs accumulating more blocks
+		 * based on the incoming rate until we exceed it.
+		 */
+		if (available_blocks >= 0) {
+			uint64_t skip_txgs = (available_blocks / incoming) + 1;
+			available_blocks -= (skip_txgs * incoming);
+			txgs_in_future += skip_txgs;
+			ASSERT3S(available_blocks, >=, -incoming);
+		}
+
+		/*
+		 * At this point we're far enough into the future that
+		 * the limit was just exceeded and we flush metaslabs
+		 * based on the current entry in the summary, updating
+		 * our available_blocks.
+		 */
+		ASSERT3S(available_blocks, <, 0);
+		available_blocks += e->lse_blkcount;
+		total_flushes += e->lse_mscount;
+
+		/*
+		 * Keep the running maximum of the total_flushes that
+		 * we've done so far over the number of TXGs into the
+		 * future that we are looking. The idea here is to estimate
+		 * the average number of flushes that we should do
+		 * every TXG so that when we are that many TXGs in the
+		 * future we stay under the limit.
+		 */
+		max_flushes_pertxg = MAX(max_flushes_pertxg,
+		    DIV_ROUND_UP(total_flushes, txgs_in_future));
+		ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
+		    max_flushes_pertxg);
+	}
+	return (max_flushes_pertxg);
+}
+
+uint64_t
+spa_log_sm_memused(spa_t *spa)
+{
+	return (spa->spa_unflushed_stats.sus_memused);
+}
+
+static boolean_t
+spa_log_exceeds_memlimit(spa_t *spa)
+{
+	if (spa_log_sm_memused(spa) > zfs_unflushed_max_mem_amt)
+		return (B_TRUE);
+
+	uint64_t system_mem_allowed = ((physmem * PAGESIZE) *
+	    zfs_unflushed_max_mem_ppm) / 1000000;
+	if (spa_log_sm_memused(spa) > system_mem_allowed)
+		return (B_TRUE);
+
+	return (B_FALSE);
+}
+
+boolean_t
+spa_flush_all_logs_requested(spa_t *spa)
+{
+	return (spa->spa_log_flushall_txg != 0);
+}
+
+void
+spa_flush_metaslabs(spa_t *spa, dmu_tx_t *tx)
+{
+	uint64_t txg = dmu_tx_get_txg(tx);
+
+	if (spa_sync_pass(spa) != 1)
+		return;
+
+	if (!spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))
+		return;
+
+	/*
+	 * If we don't have any metaslabs with unflushed changes
+	 * return immediately.
+	 */
+	if (avl_numnodes(&spa->spa_metaslabs_by_flushed) == 0)
+		return;
+
+	/*
+	 * During SPA export we leave a few empty TXGs to go by [see
+	 * spa_final_dirty_txg() to understand why].
+	 * For this specific
+	 * case, it is important to not flush any metaslabs as that
+	 * would dirty this TXG.
+	 *
+	 * That said, during one of these dirty TXGs that is less than or
+	 * equal to spa_final_dirty(), spa_unload() will request that
+	 * we try to flush all the metaslabs for that TXG before
+	 * exporting the pool. Thus we make sure that we didn't get a
+	 * request to flush everything before we attempt to return
+	 * immediately.
+	 */
+	if (spa->spa_uberblock.ub_rootbp.blk_birth < txg &&
+	    !dmu_objset_is_dirty(spa_meta_objset(spa), txg) &&
+	    !spa_flush_all_logs_requested(spa))
+		return;
+
+	/*
+	 * We need to generate a log space map before flushing because this
+	 * will set up the in-memory data (i.e. node in spa_sm_logs_by_txg)
+	 * for this TXG's flushed metaslab count (aka sls_mscount which is
+	 * manipulated in many ways down the metaslab_flush() codepath).
+	 *
+	 * This does not mean that we may end up generating a log space map
+	 * when we don't need one. If we are flushing metaslabs, that means
+	 * that we were going to write changes to disk anyway, so a log
+	 * space map would have been created in metaslab_sync() regardless.
+	 */
+	spa_generate_syncing_log_sm(spa, tx);
+
+	/*
+	 * This variable tells us how many metaslabs we want to flush based
+	 * on the block-heuristic of our flushing algorithm (see block comment
+	 * of log space map feature). We also decrement this as we flush
+	 * metaslabs and attempt to destroy old log space maps.
+	 */
+	uint64_t want_to_flush;
+	if (spa_flush_all_logs_requested(spa)) {
+		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
+		want_to_flush = avl_numnodes(&spa->spa_metaslabs_by_flushed);
+	} else {
+		want_to_flush = spa_estimate_metaslabs_to_flush(spa);
+	}
+
+	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=,
+	    want_to_flush);
+
+	/* Used purely for verification purposes */
+	uint64_t visited = 0;
+
+	/*
+	 * Ideally we would iterate through spa_metaslabs_by_flushed
+	 * using only one variable (curr). We can't do that because
+	 * metaslab_flush() mutates the position of curr in the AVL when
+	 * it flushes that metaslab, by moving it to the end of the tree.
+	 * Thus we always keep track of the original next node of the
+	 * current node (curr) in another variable (next).
+	 */
+	metaslab_t *next = NULL;
+	for (metaslab_t *curr = avl_first(&spa->spa_metaslabs_by_flushed);
+	    curr != NULL; curr = next) {
+		next = AVL_NEXT(&spa->spa_metaslabs_by_flushed, curr);
+
+		/*
+		 * If this metaslab has been flushed this txg then we've done
+		 * a full circle over the metaslabs.
+		 */
+		if (metaslab_unflushed_txg(curr) == txg)
+			break;
+
+		/*
+		 * If we are done flushing for the block heuristic and the
+		 * unflushed changes don't exceed the memory limit just stop.
+		 */
+		if (want_to_flush == 0 && !spa_log_exceeds_memlimit(spa))
+			break;
+
+		mutex_enter(&curr->ms_sync_lock);
+		mutex_enter(&curr->ms_lock);
+		boolean_t flushed = metaslab_flush(curr, tx);
+		mutex_exit(&curr->ms_lock);
+		mutex_exit(&curr->ms_sync_lock);
+
+		/*
+		 * If we failed to flush a metaslab (because it was loading),
+		 * then we are done with the block heuristic as it's not
+		 * possible to destroy any log space maps once you've skipped
+		 * a metaslab. In that case we just set our counter to 0 but
+		 * we continue looping in case there is still memory pressure
+		 * due to unflushed changes. Note that flushing a metaslab
+		 * that is not the oldest flushed in the pool will never
+		 * destroy any log space maps [see spa_cleanup_old_sm_logs()].
+		 */
+		if (!flushed) {
+			want_to_flush = 0;
+		} else if (want_to_flush > 0) {
+			want_to_flush--;
+		}
+
+		visited++;
+	}
+	ASSERT3U(avl_numnodes(&spa->spa_metaslabs_by_flushed), >=, visited);
+}
+
+/*
+ * Close the log space map for this TXG and update the block counts
+ * for the log's in-memory structure and the summary.
+ */
+void
+spa_sync_close_syncing_log_sm(spa_t *spa)
+{
+	if (spa_syncing_log_sm(spa) == NULL)
+		return;
+	ASSERT(spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP));
+
+	spa_log_sm_t *sls = avl_last(&spa->spa_sm_logs_by_txg);
+	ASSERT3U(sls->sls_txg, ==, spa_syncing_txg(spa));
+
+	sls->sls_nblocks = space_map_nblocks(spa_syncing_log_sm(spa));
+	spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks;
+
+	/*
+	 * Note that we can't assert that sls_mscount is not 0,
+	 * because there is the case where the first metaslab
+	 * in spa_metaslabs_by_flushed is loading and we were
+	 * not able to flush any metaslabs in the current TXG.
+	 */
+	ASSERT(sls->sls_nblocks != 0);
+
+	spa_log_summary_add_incoming_blocks(spa, sls->sls_nblocks);
+	spa_log_summary_verify_counts(spa);
+
+	space_map_close(spa->spa_syncing_log_sm);
+	spa->spa_syncing_log_sm = NULL;
+
+	/*
+	 * At this point we tried to flush as many metaslabs as we
+	 * could as the pool is getting exported. Reset the "flush all"
+	 * request so the last few TXGs before closing the pool can be
+	 * empty (e.g. not dirty).
+	 */
+	if (spa_flush_all_logs_requested(spa)) {
+		ASSERT3S(spa_state(spa), ==, POOL_STATE_EXPORTED);
+		spa->spa_log_flushall_txg = 0;
+	}
+}
+
+void
+spa_cleanup_old_sm_logs(spa_t *spa, dmu_tx_t *tx)
+{
+	objset_t *mos = spa_meta_objset(spa);
+
+	uint64_t spacemap_zap;
+	int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT,
+	    DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap);
+	if (error == ENOENT) {
+		ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg));
+		return;
+	}
+	VERIFY0(error);
+
+	metaslab_t *oldest = avl_first(&spa->spa_metaslabs_by_flushed);
+	uint64_t oldest_flushed_txg = metaslab_unflushed_txg(oldest);
+
+	/* Free all log space maps older than the oldest_flushed_txg.
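+	 * Such logs are guaranteed to have sls_mscount == 0 (asserted in
+	 * the loop below), since every remaining metaslab's
+	 * ms_unflushed_txg is at least oldest_flushed_txg.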
*/ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls && sls->sls_txg < oldest_flushed_txg; + sls = avl_first(&spa->spa_sm_logs_by_txg)) { + ASSERT0(sls->sls_mscount); + avl_remove(&spa->spa_sm_logs_by_txg, sls); + space_map_free_obj(mos, sls->sls_sm_obj, tx); + VERIFY0(zap_remove_int(mos, spacemap_zap, sls->sls_txg, tx)); + spa->spa_unflushed_stats.sus_nblocks -= sls->sls_nblocks; + kmem_free(sls, sizeof (spa_log_sm_t)); + } +} + +static spa_log_sm_t * +spa_log_sm_alloc(uint64_t sm_obj, uint64_t txg) +{ + spa_log_sm_t *sls = kmem_zalloc(sizeof (*sls), KM_SLEEP); + sls->sls_sm_obj = sm_obj; + sls->sls_txg = txg; + return (sls); +} + +void +spa_generate_syncing_log_sm(spa_t *spa, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + objset_t *mos = spa_meta_objset(spa); + + if (spa_syncing_log_sm(spa) != NULL) + return; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_LOG_SPACEMAP)) + return; + + uint64_t spacemap_zap; + int error = zap_lookup(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = 0; + spacemap_zap = zap_create(mos, + DMU_OTN_ZAP_METADATA, DMU_OT_NONE, 0, tx); + VERIFY0(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, + &spacemap_zap, tx)); + spa_feature_incr(spa, SPA_FEATURE_LOG_SPACEMAP, tx); + } + VERIFY0(error); + + uint64_t sm_obj; + ASSERT3U(zap_lookup_int_key(mos, spacemap_zap, txg, &sm_obj), + ==, ENOENT); + sm_obj = space_map_alloc(mos, zfs_log_sm_blksz, tx); + VERIFY0(zap_add_int_key(mos, spacemap_zap, txg, sm_obj, tx)); + avl_add(&spa->spa_sm_logs_by_txg, spa_log_sm_alloc(sm_obj, txg)); + + /* + * We pass UINT64_MAX as the space map's representation size + * and SPA_MINBLOCKSHIFT as the shift, to make the space map + * accept any sorts of segments since there's no real advantage + * to being more restrictive (given that we're already going + * to be using 2-word entries). + */ + VERIFY0(space_map_open(&spa->spa_syncing_log_sm, mos, sm_obj, + 0, UINT64_MAX, SPA_MINBLOCKSHIFT)); + + /* + * If the log space map feature was just enabled, the blocklimit + * has not yet been set. + */ + if (spa_log_sm_blocklimit(spa) == 0) + spa_log_sm_set_blocklimit(spa); +} + +/* + * Find all the log space maps stored in the space map ZAP and sort + * them by their TXG in spa_sm_logs_by_txg. 
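+ *
+ * We also cross-check that every metaslab in spa_metaslabs_by_flushed has
+ * a matching log space map node, failing the import if one is missing
+ * (see the loop at the end of this function).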
+ */ +static int +spa_ld_log_sm_metadata(spa_t *spa) +{ + int error; + uint64_t spacemap_zap; + + ASSERT(avl_is_empty(&spa->spa_sm_logs_by_txg)); + + error = zap_lookup(spa_meta_objset(spa), DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_LOG_SPACEMAP_ZAP, sizeof (spacemap_zap), 1, &spacemap_zap); + if (error == ENOENT) { + /* the space map ZAP doesn't exist yet */ + return (0); + } else if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_lookup(DMU_POOL_DIRECTORY_OBJECT) [error %d]", + error); + return (error); + } + + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, spa_meta_objset(spa), spacemap_zap); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + uint64_t log_txg = zfs_strtonum(za.za_name, NULL); + spa_log_sm_t *sls = + spa_log_sm_alloc(za.za_first_integer, log_txg); + avl_add(&spa->spa_sm_logs_by_txg, sls); + } + zap_cursor_fini(&zc); + if (error != ENOENT) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): failed at " + "zap_cursor_retrieve(spacemap_zap) [error %d]", + error); + return (error); + } + + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + spa_log_sm_t target = { .sls_txg = metaslab_unflushed_txg(m) }; + spa_log_sm_t *sls = avl_find(&spa->spa_sm_logs_by_txg, + &target, NULL); + + /* + * At this point if sls is zero it means that a bug occurred + * in ZFS the last time the pool was open or earlier in the + * import code path. In general, we would have placed a + * VERIFY() here or in this case just let the kernel panic + * with NULL pointer dereference when incrementing sls_mscount, + * but since this is the import code path we can be a bit more + * lenient. Thus, for DEBUG bits we always cause a panic, while + * in production we log the error and just fail the import. + */ + ASSERT(sls != NULL); + if (sls == NULL) { + spa_load_failed(spa, "spa_ld_log_sm_metadata(): bug " + "encountered: could not find log spacemap for " + "TXG %ld [error %d]", + metaslab_unflushed_txg(m), ENOENT); + return (ENOENT); + } + sls->sls_mscount++; + } + + return (0); +} + +typedef struct spa_ld_log_sm_arg { + spa_t *slls_spa; + uint64_t slls_txg; +} spa_ld_log_sm_arg_t; + +static int +spa_ld_log_sm_cb(space_map_entry_t *sme, void *arg) +{ + uint64_t offset = sme->sme_offset; + uint64_t size = sme->sme_run; + uint32_t vdev_id = sme->sme_vdev; + + spa_ld_log_sm_arg_t *slls = arg; + spa_t *spa = slls->slls_spa; + + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + + /* + * If the vdev has been removed (i.e. it is indirect or a hole) + * skip this entry. The contents of this vdev have already moved + * elsewhere. + */ + if (!vdev_is_concrete(vd)) + return (0); + + metaslab_t *ms = vd->vdev_ms[offset >> vd->vdev_ms_shift]; + ASSERT(!ms->ms_loaded); + + /* + * If we have already flushed entries for this TXG to this + * metaslab's space map, then ignore it. Note that we flush + * before processing any allocations/frees for that TXG, so + * the metaslab's space map only has entries from *before* + * the unflushed TXG. 
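+	 * In other words, entries from logs with slls_txg <
+	 * metaslab_unflushed_txg(ms) are already reflected in the
+	 * metaslab's own space map and must not be replayed into its
+	 * unflushed range trees.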
+ */ + if (slls->slls_txg < metaslab_unflushed_txg(ms)) + return (0); + + switch (sme->sme_type) { + case SM_ALLOC: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_frees, ms->ms_unflushed_allocs); + break; + case SM_FREE: + range_tree_remove_xor_add_segment(offset, offset + size, + ms->ms_unflushed_allocs, ms->ms_unflushed_frees); + break; + default: + panic("invalid maptype_t"); + break; + } + return (0); +} + +static int +spa_ld_log_sm_data(spa_t *spa) +{ + int error = 0; + + /* + * If we are not going to do any writes there is no need + * to read the log space maps. + */ + if (!spa_writeable(spa)) + return (0); + + ASSERT0(spa->spa_unflushed_stats.sus_nblocks); + ASSERT0(spa->spa_unflushed_stats.sus_memused); + + hrtime_t read_logs_starttime = gethrtime(); + /* this is a no-op when we don't have space map logs */ + for (spa_log_sm_t *sls = avl_first(&spa->spa_sm_logs_by_txg); + sls; sls = AVL_NEXT(&spa->spa_sm_logs_by_txg, sls)) { + space_map_t *sm = NULL; + error = space_map_open(&sm, spa_meta_objset(spa), + sls->sls_sm_obj, 0, UINT64_MAX, SPA_MINBLOCKSHIFT); + if (error != 0) { + spa_load_failed(spa, "spa_ld_log_sm_data(): failed at " + "space_map_open(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + struct spa_ld_log_sm_arg vla = { + .slls_spa = spa, + .slls_txg = sls->sls_txg + }; + error = space_map_iterate(sm, space_map_length(sm), + spa_ld_log_sm_cb, &vla); + if (error != 0) { + space_map_close(sm); + spa_load_failed(spa, "spa_ld_log_sm_data(): failed " + "at space_map_iterate(obj=%llu) [error %d]", + (u_longlong_t)sls->sls_sm_obj, error); + goto out; + } + + ASSERT0(sls->sls_nblocks); + sls->sls_nblocks = space_map_nblocks(sm); + spa->spa_unflushed_stats.sus_nblocks += sls->sls_nblocks; + summary_add_data(spa, sls->sls_txg, + sls->sls_mscount, sls->sls_nblocks); + + space_map_close(sm); + } + hrtime_t read_logs_endtime = gethrtime(); + spa_load_note(spa, + "read %llu log space maps (%llu total blocks - blksz = %llu bytes) " + "in %lld ms", (u_longlong_t)avl_numnodes(&spa->spa_sm_logs_by_txg), + (u_longlong_t)spa_log_sm_nblocks(spa), + (u_longlong_t)zfs_log_sm_blksz, + (longlong_t)((read_logs_endtime - read_logs_starttime) / 1000000)); + +out: + /* + * Now that the metaslabs contain their unflushed changes: + * [1] recalculate their actual allocated space + * [2] recalculate their weights + * [3] sum up the memory usage of their unflushed range trees + * [4] optionally load them, if debug_load is set + * + * Note that even in the case where we get here because of an + * error (e.g. error != 0), we still want to update the fields + * below in order to have a proper teardown in spa_unload(). 
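+	 *
+	 * The memory usage summed up in [3] (sus_memused) is what
+	 * spa_log_exceeds_memlimit() later checks against
+	 * zfs_unflushed_max_mem_amt and the ppm-based limit.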
+ */ + for (metaslab_t *m = avl_first(&spa->spa_metaslabs_by_flushed); + m != NULL; m = AVL_NEXT(&spa->spa_metaslabs_by_flushed, m)) { + mutex_enter(&m->ms_lock); + m->ms_allocated_space = space_map_allocated(m->ms_sm) + + range_tree_space(m->ms_unflushed_allocs) - + range_tree_space(m->ms_unflushed_frees); + + vdev_t *vd = m->ms_group->mg_vd; + metaslab_space_update(vd, m->ms_group->mg_class, + range_tree_space(m->ms_unflushed_allocs), 0, 0); + metaslab_space_update(vd, m->ms_group->mg_class, + -range_tree_space(m->ms_unflushed_frees), 0, 0); + + ASSERT0(m->ms_weight & METASLAB_ACTIVE_MASK); + metaslab_recalculate_weight_and_sort(m); + + spa->spa_unflushed_stats.sus_memused += + metaslab_unflushed_changes_memused(m); + + if (metaslab_debug_load && m->ms_sm != NULL) { + VERIFY0(metaslab_load(m)); + metaslab_set_selected_txg(m, 0); + } + mutex_exit(&m->ms_lock); + } + + return (error); +} + +static int +spa_ld_unflushed_txgs(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa_meta_objset(spa); + + if (vd->vdev_top_zap == 0) + return (0); + + uint64_t object = 0; + int error = zap_lookup(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, + sizeof (uint64_t), 1, &object); + if (error == ENOENT) + return (0); + else if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): failed at " + "zap_lookup(vdev_top_zap=%llu) [error %d]", + (u_longlong_t)vd->vdev_top_zap, error); + return (error); + } + + for (uint64_t m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *ms = vd->vdev_ms[m]; + ASSERT(ms != NULL); + + metaslab_unflushed_phys_t entry; + uint64_t entry_size = sizeof (entry); + uint64_t entry_offset = ms->ms_id * entry_size; + + error = dmu_read(mos, object, + entry_offset, entry_size, &entry, 0); + if (error != 0) { + spa_load_failed(spa, "spa_ld_unflushed_txgs(): " + "failed at dmu_read(obj=%llu) [error %d]", + (u_longlong_t)object, error); + return (error); + } + + ms->ms_unflushed_txg = entry.msp_unflushed_txg; + if (ms->ms_unflushed_txg != 0) { + mutex_enter(&spa->spa_flushed_ms_lock); + avl_add(&spa->spa_metaslabs_by_flushed, ms); + mutex_exit(&spa->spa_flushed_ms_lock); + } + } + return (0); +} + +/* + * Read all the log space map entries into their respective + * metaslab unflushed trees and keep them sorted by TXG in the + * SPA's metadata. In addition, setup all the metadata for the + * memory and the block heuristics. + */ +int +spa_ld_log_spacemaps(spa_t *spa) +{ + int error; + + spa_log_sm_set_blocklimit(spa); + + for (uint64_t c = 0; c < spa->spa_root_vdev->vdev_children; c++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[c]; + error = spa_ld_unflushed_txgs(vd); + if (error != 0) + return (error); + } + + error = spa_ld_log_sm_metadata(spa); + if (error != 0) + return (error); + + /* + * Note: we don't actually expect anything to change at this point + * but we grab the config lock so we don't fail any assertions + * when using vdev_lookup_top(). 
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + error = spa_ld_log_sm_data(spa); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_amt, ULONG, ZMOD_RW, + "Specific hard-limit in memory that ZFS allows to be used for " + "unflushed changes"); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_max_mem_ppm, ULONG, ZMOD_RW, + "Percentage of the overall system memory that ZFS allows to be " + "used for unflushed changes (value is calculated over 1000000 for " + "finer granularity"); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_max, ULONG, ZMOD_RW, + "Hard limit (upper-bound) in the size of the space map log " + "in terms of blocks."); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_min, ULONG, ZMOD_RW, + "Lower-bound limit for the maximum amount of blocks allowed in " + "log spacemap (see zfs_unflushed_log_block_max)"); + +ZFS_MODULE_PARAM(zfs, zfs_, unflushed_log_block_pct, ULONG, ZMOD_RW, + "Tunable used to determine the number of blocks that can be used for " + "the spacemap log, expressed as a percentage of the total number of " + "metaslabs in the pool (e.g. 400 means the number of log blocks is " + "capped at 4 times the number of metaslabs)"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_log_walking, ULONG, ZMOD_RW, + "The number of past TXGs that the flushing algorithm of the log " + "spacemap feature uses to estimate incoming log blocks"); + +ZFS_MODULE_PARAM(zfs, zfs_, max_logsm_summary_length, ULONG, ZMOD_RW, + "Maximum number of rows allowed in the summary of the spacemap log"); + +ZFS_MODULE_PARAM(zfs, zfs_, min_metaslabs_to_flush, ULONG, ZMOD_RW, + "Minimum number of metaslabs to flush per dirty TXG"); + +ZFS_MODULE_PARAM(zfs, zfs_, keep_log_spacemaps_at_export, INT, ZMOD_RW, + "Prevent the log spacemaps from being flushed and destroyed " + "during pool export/destroy"); +/* END CSTYLED */ diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c new file mode 100644 index 000000000000..41f0ddbde288 --- /dev/null +++ b/module/zfs/spa_misc.c @@ -0,0 +1,2921 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright 2015 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + * Copyright (c) 2017 Datto Inc. + * Copyright (c) 2017, Intel Corporation. + * Copyright (c) 2019, loli10K . All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "zfs_prop.h" +#include +#include +#include +#include + +/* + * SPA locking + * + * There are three basic locks for managing spa_t structures: + * + * spa_namespace_lock (global mutex) + * + * This lock must be acquired to do any of the following: + * + * - Lookup a spa_t by name + * - Add or remove a spa_t from the namespace + * - Increase spa_refcount from non-zero + * - Check if spa_refcount is zero + * - Rename a spa_t + * - add/remove/attach/detach devices + * - Held for the duration of create/destroy/import/export + * + * It does not need to handle recursion. A create or destroy may + * reference objects (files or zvols) in other pools, but by + * definition they must have an existing reference, and will never need + * to lookup a spa_t by name. + * + * spa_refcount (per-spa zfs_refcount_t protected by mutex) + * + * This reference count keep track of any active users of the spa_t. The + * spa_t cannot be destroyed or freed while this is non-zero. Internally, + * the refcount is never really 'zero' - opening a pool implicitly keeps + * some references in the DMU. Internally we check against spa_minref, but + * present the image of a zero/non-zero value to consumers. + * + * spa_config_lock[] (per-spa array of rwlocks) + * + * This protects the spa_t from config changes, and must be held in + * the following circumstances: + * + * - RW_READER to perform I/O to the spa + * - RW_WRITER to change the vdev config + * + * The locking order is fairly straightforward: + * + * spa_namespace_lock -> spa_refcount + * + * The namespace lock must be acquired to increase the refcount from 0 + * or to check if it is zero. + * + * spa_refcount -> spa_config_lock[] + * + * There must be at least one valid reference on the spa_t to acquire + * the config lock. + * + * spa_namespace_lock -> spa_config_lock[] + * + * The namespace lock must always be taken before the config lock. + * + * + * The spa_namespace_lock can be acquired directly and is globally visible. + * + * The namespace is manipulated using the following functions, all of which + * require the spa_namespace_lock to be held. + * + * spa_lookup() Lookup a spa_t by name. + * + * spa_add() Create a new spa_t in the namespace. + * + * spa_remove() Remove a spa_t from the namespace. This also + * frees up any memory associated with the spa_t. + * + * spa_next() Returns the next spa_t in the system, or the + * first if NULL is passed. + * + * spa_evict_all() Shutdown and remove all spa_t structures in + * the system. + * + * spa_guid_exists() Determine whether a pool/device guid exists. + * + * The spa_refcount is manipulated using the following functions: + * + * spa_open_ref() Adds a reference to the given spa_t. Must be + * called with spa_namespace_lock held if the + * refcount is currently zero. + * + * spa_close() Remove a reference from the spa_t. This will + * not free the spa_t or remove it from the + * namespace. No locking is required. + * + * spa_refcount_zero() Returns true if the refcount is currently + * zero. Must be called with spa_namespace_lock + * held. + * + * The spa_config_lock[] is an array of rwlocks, ordered as follows: + * SCL_CONFIG > SCL_STATE > SCL_ALLOC > SCL_ZIO > SCL_FREE > SCL_VDEV. 
+ * spa_config_lock[] is manipulated with spa_config_{enter,exit,held}().
+ *
+ * To read the configuration, it suffices to hold one of these locks as reader.
+ * To modify the configuration, you must hold all locks as writer. To modify
+ * vdev state without altering the vdev tree's topology (e.g. online/offline),
+ * you must hold SCL_STATE and SCL_ZIO as writer.
+ *
+ * We use these distinct config locks to avoid recursive lock entry.
+ * For example, spa_sync() (which holds SCL_CONFIG as reader) induces
+ * block allocations (SCL_ALLOC), which may require reading space maps
+ * from disk (dmu_read() -> zio_read() -> SCL_ZIO).
+ *
+ * The spa config locks cannot be normal rwlocks because we need the
+ * ability to hand off ownership. For example, SCL_ZIO is acquired
+ * by the issuing thread and later released by an interrupt thread.
+ * They do, however, obey the usual write-wanted semantics to prevent
+ * writer (i.e. system administrator) starvation.
+ *
+ * The lock acquisition rules are as follows:
+ *
+ * SCL_CONFIG
+ *	Protects changes to the vdev tree topology, such as vdev
+ *	add/remove/attach/detach. Protects the dirty config list
+ *	(spa_config_dirty_list) and the set of spares and l2arc devices.
+ *
+ * SCL_STATE
+ *	Protects changes to pool state and vdev state, such as vdev
+ *	online/offline/fault/degrade/clear. Protects the dirty state list
+ *	(spa_state_dirty_list) and global pool state (spa_state).
+ *
+ * SCL_ALLOC
+ *	Protects changes to metaslab groups and classes.
+ *	Held as reader by metaslab_alloc() and metaslab_claim().
+ *
+ * SCL_ZIO
+ *	Held by bp-level zios (those which have no io_vd upon entry)
+ *	to prevent changes to the vdev tree. The bp-level zio implicitly
+ *	protects all of its vdev child zios, which do not hold SCL_ZIO.
+ *
+ * SCL_FREE
+ *	Protects changes to metaslab groups and classes.
+ *	Held as reader by metaslab_free(). SCL_FREE is distinct from
+ *	SCL_ALLOC, and lower than SCL_ZIO, so that we can safely free
+ *	blocks in zio_done() while another i/o that holds either
+ *	SCL_ALLOC or SCL_ZIO is waiting for this i/o to complete.
+ *
+ * SCL_VDEV
+ *	Held as reader to prevent changes to the vdev tree during trivial
+ *	inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the
+ *	other locks, and lower than all of them, to ensure that it's safe
+ *	to acquire regardless of caller context.
+ *
+ * In addition, the following rules apply:
+ *
+ * (a) spa_props_lock protects pool properties, spa_config and spa_config_list.
+ *	The lock ordering is SCL_CONFIG > spa_props_lock.
+ *
+ * (b) I/O operations on leaf vdevs. For any zio operation that takes
+ *	an explicit vdev_t argument -- such as zio_ioctl(), zio_read_phys(),
+ *	or zio_write_phys() -- the caller must ensure that the config cannot
+ *	change in the interim, and that the vdev cannot be reopened.
+ *	SCL_STATE as reader suffices for both.
+ *
+ * The vdev configuration is protected by spa_vdev_enter() / spa_vdev_exit().
+ *
+ *	spa_vdev_enter()	Acquire the namespace lock and the config lock
+ *				for writing.
+ *
+ *	spa_vdev_exit()		Release the config lock, wait for all I/O
+ *				to complete, sync the updated configs to the
+ *				cache, and release the namespace lock.
+ *
+ * vdev state is protected by spa_vdev_state_enter() / spa_vdev_state_exit().
+ * Like spa_vdev_enter/exit, these are convenience wrappers -- the actual
+ * locking is always based on spa_namespace_lock and spa_config_lock[].
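+ *
+ * A minimal usage sketch of the vdev-config wrappers (for illustration
+ * only; not a function in this file):
+ *
+ *	uint64_t txg = spa_vdev_enter(spa);
+ *	... modify the vdev tree ...
+ *	return (spa_vdev_exit(spa, newvd, txg, error));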
+ */ + +static avl_tree_t spa_namespace_avl; +kmutex_t spa_namespace_lock; +static kcondvar_t spa_namespace_cv; +int spa_max_replication_override = SPA_DVAS_PER_BP; + +static kmutex_t spa_spare_lock; +static avl_tree_t spa_spare_avl; +static kmutex_t spa_l2cache_lock; +static avl_tree_t spa_l2cache_avl; + +kmem_cache_t *spa_buffer_pool; +spa_mode_t spa_mode_global = SPA_MODE_UNINIT; + +#ifdef ZFS_DEBUG +/* + * Everything except dprintf, set_error, spa, and indirect_remap is on + * by default in debug builds. + */ +int zfs_flags = ~(ZFS_DEBUG_DPRINTF | ZFS_DEBUG_SET_ERROR | + ZFS_DEBUG_INDIRECT_REMAP); +#else +int zfs_flags = 0; +#endif + +/* + * zfs_recover can be set to nonzero to attempt to recover from + * otherwise-fatal errors, typically caused by on-disk corruption. When + * set, calls to zfs_panic_recover() will turn into warning messages. + * This should only be used as a last resort, as it typically results + * in leaked space, or worse. + */ +int zfs_recover = B_FALSE; + +/* + * If destroy encounters an EIO while reading metadata (e.g. indirect + * blocks), space referenced by the missing metadata can not be freed. + * Normally this causes the background destroy to become "stalled", as + * it is unable to make forward progress. While in this stalled state, + * all remaining space to free from the error-encountering filesystem is + * "temporarily leaked". Set this flag to cause it to ignore the EIO, + * permanently leak the space from indirect blocks that can not be read, + * and continue to free everything else that it can. + * + * The default, "stalling" behavior is useful if the storage partially + * fails (i.e. some but not all i/os fail), and then later recovers. In + * this case, we will be able to continue pool operations while it is + * partially failed, and when it recovers, we can continue to free the + * space, with no leaks. However, note that this case is actually + * fairly rare. + * + * Typically pools either (a) fail completely (but perhaps temporarily, + * e.g. a top-level vdev going offline), or (b) have localized, + * permanent errors (e.g. disk returns the wrong data due to bit flip or + * firmware bug). In case (a), this setting does not matter because the + * pool will be suspended and the sync thread will not be able to make + * forward progress regardless. In case (b), because the error is + * permanent, the best we can do is leak the minimum amount of space, + * which is what setting this flag will do. Therefore, it is reasonable + * for this flag to normally be set, but we chose the more conservative + * approach of not setting it, so that there is no possibility of + * leaking space in the "partial temporary" failure case. + */ +int zfs_free_leak_on_eio = B_FALSE; + +/* + * Expiration time in milliseconds. This value has two meanings. First it is + * used to determine when the spa_deadman() logic should fire. By default the + * spa_deadman() will fire if spa_sync() has not completed in 600 seconds. + * Secondly, the value determines if an I/O is considered "hung". Any I/O that + * has not completed in zfs_deadman_synctime_ms is considered "hung" resulting + * in one of three behaviors controlled by zfs_deadman_failmode. + */ +unsigned long zfs_deadman_synctime_ms = 600000UL; + +/* + * This value controls the maximum amount of time zio_wait() will block for an + * outstanding IO. By default this is 300 seconds at which point the "hung" + * behavior will be applied as described for zfs_deadman_synctime_ms. 
+ */ +unsigned long zfs_deadman_ziotime_ms = 300000UL; + +/* + * Check time in milliseconds. This defines the frequency at which we check + * for hung I/O. + */ +unsigned long zfs_deadman_checktime_ms = 60000UL; + +/* + * By default the deadman is enabled. + */ +int zfs_deadman_enabled = 1; + +/* + * Controls the behavior of the deadman when it detects a "hung" I/O. + * Valid values are zfs_deadman_failmode=. + * + * wait - Wait for the "hung" I/O (default) + * continue - Attempt to recover from a "hung" I/O + * panic - Panic the system + */ +char *zfs_deadman_failmode = "wait"; + +/* + * The worst case is single-sector max-parity RAID-Z blocks, in which + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) + * times the size; so just assume that. Add to this the fact that + * we can have up to 3 DVAs per bp, and one more factor of 2 because + * the block may be dittoed with up to 3 DVAs by ddt_sync(). All together, + * the worst case is: + * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2 == 24 + */ +int spa_asize_inflation = 24; + +/* + * Normally, we don't allow the last 3.2% (1/(2^spa_slop_shift)) of space in + * the pool to be consumed. This ensures that we don't run the pool + * completely out of space, due to unaccounted changes (e.g. to the MOS). + * It also limits the worst-case time to allocate space. If we have + * less than this amount of free space, most ZPL operations (e.g. write, + * create) will return ENOSPC. + * + * Certain operations (e.g. file removal, most administrative actions) can + * use half the slop space. They will only return ENOSPC if less than half + * the slop space is free. Typically, once the pool has less than the slop + * space free, the user will use these operations to free up space in the pool. + * These are the operations that call dsl_pool_adjustedsize() with the netfree + * argument set to TRUE. + * + * Operations that are almost guaranteed to free up space in the absence of + * a pool checkpoint can use up to three quarters of the slop space + * (e.g zfs destroy). + * + * A very restricted set of operations are always permitted, regardless of + * the amount of free space. These are the operations that call + * dsl_sync_task(ZFS_SPACE_CHECK_NONE). If these operations result in a net + * increase in the amount of space used, it is possible to run the pool + * completely out of space, causing it to be permanently read-only. + * + * Note that on very small pools, the slop space will be larger than + * 3.2%, in an effort to have it be at least spa_min_slop (128MB), + * but we never allow it to be more than half the pool size. + * + * See also the comments in zfs_space_check_t. + */ +int spa_slop_shift = 5; +uint64_t spa_min_slop = 128 * 1024 * 1024; +int spa_allocators = 4; + + +/*PRINTFLIKE2*/ +void +spa_load_failed(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + char buf[256]; + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + zfs_dbgmsg("spa_load(%s, config %s): FAILED: %s", spa->spa_name, + spa->spa_trust_config ? "trusted" : "untrusted", buf); +} + +/*PRINTFLIKE2*/ +void +spa_load_note(spa_t *spa, const char *fmt, ...) +{ + va_list adx; + char buf[256]; + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + zfs_dbgmsg("spa_load(%s, config %s): %s", spa->spa_name, + spa->spa_trust_config ? 
"trusted" : "untrusted", buf); +} + +/* + * By default dedup and user data indirects land in the special class + */ +int zfs_ddt_data_is_special = B_TRUE; +int zfs_user_indirect_is_special = B_TRUE; + +/* + * The percentage of special class final space reserved for metadata only. + * Once we allocate 100 - zfs_special_class_metadata_reserve_pct we only + * let metadata into the class. + */ +int zfs_special_class_metadata_reserve_pct = 25; + +/* + * ========================================================================== + * SPA config locking + * ========================================================================== + */ +static void +spa_config_lock_init(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_init(&scl->scl_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&scl->scl_cv, NULL, CV_DEFAULT, NULL); + zfs_refcount_create_untracked(&scl->scl_count); + scl->scl_writer = NULL; + scl->scl_write_wanted = 0; + } +} + +static void +spa_config_lock_destroy(spa_t *spa) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + mutex_destroy(&scl->scl_lock); + cv_destroy(&scl->scl_cv); + zfs_refcount_destroy(&scl->scl_count); + ASSERT(scl->scl_writer == NULL); + ASSERT(scl->scl_write_wanted == 0); + } +} + +int +spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) +{ + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + if (scl->scl_writer || scl->scl_write_wanted) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks & ((1 << i) - 1), + tag); + return (0); + } + } else { + ASSERT(scl->scl_writer != curthread); + if (!zfs_refcount_is_zero(&scl->scl_count)) { + mutex_exit(&scl->scl_lock); + spa_config_exit(spa, locks & ((1 << i) - 1), + tag); + return (0); + } + scl->scl_writer = curthread; + } + (void) zfs_refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + return (1); +} + +void +spa_config_enter(spa_t *spa, int locks, const void *tag, krw_t rw) +{ + int wlocks_held = 0; + + ASSERT3U(SCL_LOCKS, <, sizeof (wlocks_held) * NBBY); + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + if (rw == RW_READER) { + while (scl->scl_writer || scl->scl_write_wanted) { + cv_wait(&scl->scl_cv, &scl->scl_lock); + } + } else { + ASSERT(scl->scl_writer != curthread); + while (!zfs_refcount_is_zero(&scl->scl_count)) { + scl->scl_write_wanted++; + cv_wait(&scl->scl_cv, &scl->scl_lock); + scl->scl_write_wanted--; + } + scl->scl_writer = curthread; + } + (void) zfs_refcount_add(&scl->scl_count, tag); + mutex_exit(&scl->scl_lock); + } + ASSERT3U(wlocks_held, <=, locks); +} + +void +spa_config_exit(spa_t *spa, int locks, const void *tag) +{ + for (int i = SCL_LOCKS - 1; i >= 0; i--) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + mutex_enter(&scl->scl_lock); + ASSERT(!zfs_refcount_is_zero(&scl->scl_count)); + if (zfs_refcount_remove(&scl->scl_count, tag) == 0) { + ASSERT(scl->scl_writer == NULL || + scl->scl_writer == curthread); + scl->scl_writer = NULL; /* OK in either case */ + cv_broadcast(&scl->scl_cv); + } + mutex_exit(&scl->scl_lock); + } +} + +int +spa_config_held(spa_t *spa, int locks, krw_t rw) +{ + int 
locks_held = 0; + + for (int i = 0; i < SCL_LOCKS; i++) { + spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (!(locks & (1 << i))) + continue; + if ((rw == RW_READER && + !zfs_refcount_is_zero(&scl->scl_count)) || + (rw == RW_WRITER && scl->scl_writer == curthread)) + locks_held |= 1 << i; + } + + return (locks_held); +} + +/* + * ========================================================================== + * SPA namespace functions + * ========================================================================== + */ + +/* + * Lookup the named spa_t in the AVL tree. The spa_namespace_lock must be held. + * Returns NULL if no matching spa_t is found. + */ +spa_t * +spa_lookup(const char *name) +{ + static spa_t search; /* spa_t is large; don't allocate on stack */ + spa_t *spa; + avl_index_t where; + char *cp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + (void) strlcpy(search.spa_name, name, sizeof (search.spa_name)); + + /* + * If it's a full dataset name, figure out the pool name and + * just use that. + */ + cp = strpbrk(search.spa_name, "/@#"); + if (cp != NULL) + *cp = '\0'; + + spa = avl_find(&spa_namespace_avl, &search, &where); + + return (spa); +} + +/* + * Fires when spa_sync has not completed within zfs_deadman_synctime_ms. + * If the zfs_deadman_enabled flag is set then it inspects all vdev queues + * looking for potentially hung I/Os. + */ +void +spa_deadman(void *arg) +{ + spa_t *spa = arg; + + /* Disable the deadman if the pool is suspended. */ + if (spa_suspended(spa)) + return; + + zfs_dbgmsg("slow spa_sync: started %llu seconds ago, calls %llu", + (gethrtime() - spa->spa_sync_starttime) / NANOSEC, + ++spa->spa_deadman_calls); + if (zfs_deadman_enabled) + vdev_deadman(spa->spa_root_vdev, FTAG); + + spa->spa_deadman_tqid = taskq_dispatch_delay(system_delay_taskq, + spa_deadman, spa, TQ_SLEEP, ddi_get_lbolt() + + MSEC_TO_TICK(zfs_deadman_checktime_ms)); +} + +static int +spa_log_sm_sort_by_txg(const void *va, const void *vb) +{ + const spa_log_sm_t *a = va; + const spa_log_sm_t *b = vb; + + return (TREE_CMP(a->sls_txg, b->sls_txg)); +} + +/* + * Create an uninitialized spa_t with the given name. Requires + * spa_namespace_lock. The caller must ensure that the spa_t doesn't already + * exist by calling spa_lookup() first. 
+ */ +spa_t * +spa_add(const char *name, nvlist_t *config, const char *altroot) +{ + spa_t *spa; + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); + + mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_evicting_os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_cksum_tmpls_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_feat_stats_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_flushed_ms_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_activities_lock, NULL, MUTEX_DEFAULT, NULL); + + cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_activities_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_waiters_cv, NULL, CV_DEFAULT, NULL); + + for (int t = 0; t < TXG_SIZE; t++) + bplist_create(&spa->spa_free_bplist[t]); + + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); + spa->spa_state = POOL_STATE_UNINITIALIZED; + spa->spa_freeze_txg = UINT64_MAX; + spa->spa_final_txg = UINT64_MAX; + spa->spa_load_max_txg = UINT64_MAX; + spa->spa_proc = &p0; + spa->spa_proc_state = SPA_PROC_NONE; + spa->spa_trust_config = B_TRUE; + spa->spa_hostid = zone_get_hostid(NULL); + + spa->spa_deadman_synctime = MSEC2NSEC(zfs_deadman_synctime_ms); + spa->spa_deadman_ziotime = MSEC2NSEC(zfs_deadman_ziotime_ms); + spa_set_deadman_failmode(spa, zfs_deadman_failmode); + + zfs_refcount_create(&spa->spa_refcount); + spa_config_lock_init(spa); + spa_stats_init(spa); + + avl_add(&spa_namespace_avl, spa); + + /* + * Set the alternate root, if there is one. + */ + if (altroot) + spa->spa_root = spa_strdup(altroot); + + spa->spa_alloc_count = spa_allocators; + spa->spa_alloc_locks = kmem_zalloc(spa->spa_alloc_count * + sizeof (kmutex_t), KM_SLEEP); + spa->spa_alloc_trees = kmem_zalloc(spa->spa_alloc_count * + sizeof (avl_tree_t), KM_SLEEP); + for (int i = 0; i < spa->spa_alloc_count; i++) { + mutex_init(&spa->spa_alloc_locks[i], NULL, MUTEX_DEFAULT, NULL); + avl_create(&spa->spa_alloc_trees[i], zio_bookmark_compare, + sizeof (zio_t), offsetof(zio_t, io_alloc_node)); + } + avl_create(&spa->spa_metaslabs_by_flushed, metaslab_sort_by_flushed, + sizeof (metaslab_t), offsetof(metaslab_t, ms_spa_txg_node)); + avl_create(&spa->spa_sm_logs_by_txg, spa_log_sm_sort_by_txg, + sizeof (spa_log_sm_t), offsetof(spa_log_sm_t, sls_node)); + list_create(&spa->spa_log_summary, sizeof (log_summary_entry_t), + offsetof(log_summary_entry_t, lse_node)); + + /* + * Every pool starts with the default cachefile + */ + list_create(&spa->spa_config_list, sizeof (spa_config_dirent_t), + offsetof(spa_config_dirent_t, scd_link)); + + dp = kmem_zalloc(sizeof (spa_config_dirent_t), KM_SLEEP); + dp->scd_path = altroot ? 
NULL : spa_strdup(spa_config_path); + list_insert_head(&spa->spa_config_list, dp); + + VERIFY(nvlist_alloc(&spa->spa_load_info, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + + if (config != NULL) { + nvlist_t *features; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_FEATURES_FOR_READ, + &features) == 0) { + VERIFY(nvlist_dup(features, &spa->spa_label_features, + 0) == 0); + } + + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + } + + if (spa->spa_label_features == NULL) { + VERIFY(nvlist_alloc(&spa->spa_label_features, NV_UNIQUE_NAME, + KM_SLEEP) == 0); + } + + spa->spa_min_ashift = INT_MAX; + spa->spa_max_ashift = 0; + + /* Reset cached value */ + spa->spa_dedup_dspace = ~0ULL; + + /* + * As a pool is being created, treat all features as disabled by + * setting SPA_FEATURE_DISABLED for all entries in the feature + * refcount cache. + */ + for (int i = 0; i < SPA_FEATURES; i++) { + spa->spa_feat_refcount_cache[i] = SPA_FEATURE_DISABLED; + } + + list_create(&spa->spa_leaf_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_leaf_node)); + + return (spa); +} + +/* + * Removes a spa_t from the namespace, freeing up any memory used. Requires + * spa_namespace_lock. This is called only after the spa_t has been closed and + * deactivated. + */ +void +spa_remove(spa_t *spa) +{ + spa_config_dirent_t *dp; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_state(spa) == POOL_STATE_UNINITIALIZED); + ASSERT3U(zfs_refcount_count(&spa->spa_refcount), ==, 0); + ASSERT0(spa->spa_waiters); + + nvlist_free(spa->spa_config_splitting); + + avl_remove(&spa_namespace_avl, spa); + cv_broadcast(&spa_namespace_cv); + + if (spa->spa_root) + spa_strfree(spa->spa_root); + + while ((dp = list_head(&spa->spa_config_list)) != NULL) { + list_remove(&spa->spa_config_list, dp); + if (dp->scd_path != NULL) + spa_strfree(dp->scd_path); + kmem_free(dp, sizeof (spa_config_dirent_t)); + } + + for (int i = 0; i < spa->spa_alloc_count; i++) { + avl_destroy(&spa->spa_alloc_trees[i]); + mutex_destroy(&spa->spa_alloc_locks[i]); + } + kmem_free(spa->spa_alloc_locks, spa->spa_alloc_count * + sizeof (kmutex_t)); + kmem_free(spa->spa_alloc_trees, spa->spa_alloc_count * + sizeof (avl_tree_t)); + + avl_destroy(&spa->spa_metaslabs_by_flushed); + avl_destroy(&spa->spa_sm_logs_by_txg); + list_destroy(&spa->spa_log_summary); + list_destroy(&spa->spa_config_list); + list_destroy(&spa->spa_leaf_list); + + nvlist_free(spa->spa_label_features); + nvlist_free(spa->spa_load_info); + nvlist_free(spa->spa_feat_stats); + spa_config_set(spa, NULL); + + zfs_refcount_destroy(&spa->spa_refcount); + + spa_stats_destroy(spa); + spa_config_lock_destroy(spa); + + for (int t = 0; t < TXG_SIZE; t++) + bplist_destroy(&spa->spa_free_bplist[t]); + + zio_checksum_templates_free(spa); + + cv_destroy(&spa->spa_async_cv); + cv_destroy(&spa->spa_evicting_os_cv); + cv_destroy(&spa->spa_proc_cv); + cv_destroy(&spa->spa_scrub_io_cv); + cv_destroy(&spa->spa_suspend_cv); + cv_destroy(&spa->spa_activities_cv); + cv_destroy(&spa->spa_waiters_cv); + + mutex_destroy(&spa->spa_flushed_ms_lock); + mutex_destroy(&spa->spa_async_lock); + mutex_destroy(&spa->spa_errlist_lock); + mutex_destroy(&spa->spa_errlog_lock); + mutex_destroy(&spa->spa_evicting_os_lock); + mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_proc_lock); + mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_cksum_tmpls_lock); + mutex_destroy(&spa->spa_scrub_lock); + mutex_destroy(&spa->spa_suspend_lock); + mutex_destroy(&spa->spa_vdev_top_lock); + 
mutex_destroy(&spa->spa_feat_stats_lock); + mutex_destroy(&spa->spa_activities_lock); + + kmem_free(spa, sizeof (spa_t)); +} + +/* + * Given a pool, return the next pool in the namespace, or NULL if there is + * none. If 'prev' is NULL, return the first pool. + */ +spa_t * +spa_next(spa_t *prev) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (prev) + return (AVL_NEXT(&spa_namespace_avl, prev)); + else + return (avl_first(&spa_namespace_avl)); +} + +/* + * ========================================================================== + * SPA refcount functions + * ========================================================================== + */ + +/* + * Add a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_open_ref(spa_t *spa, void *tag) +{ + ASSERT(zfs_refcount_count(&spa->spa_refcount) >= spa->spa_minref || + MUTEX_HELD(&spa_namespace_lock)); + (void) zfs_refcount_add(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t. Must have at least one reference, or + * have the namespace lock held. + */ +void +spa_close(spa_t *spa, void *tag) +{ + ASSERT(zfs_refcount_count(&spa->spa_refcount) > spa->spa_minref || + MUTEX_HELD(&spa_namespace_lock)); + (void) zfs_refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Remove a reference to the given spa_t held by a dsl dir that is + * being asynchronously released. Async releases occur from a taskq + * performing eviction of dsl datasets and dirs. The namespace lock + * isn't held and the hold by the object being evicted may contribute to + * spa_minref (e.g. dataset or directory released during pool export), + * so the asserts in spa_close() do not apply. + */ +void +spa_async_close(spa_t *spa, void *tag) +{ + (void) zfs_refcount_remove(&spa->spa_refcount, tag); +} + +/* + * Check to see if the spa refcount is zero. Must be called with + * spa_namespace_lock held. We really compare against spa_minref, which is the + * number of references acquired when opening a pool + */ +boolean_t +spa_refcount_zero(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + return (zfs_refcount_count(&spa->spa_refcount) == spa->spa_minref); +} + +/* + * ========================================================================== + * SPA spare and l2cache tracking + * ========================================================================== + */ + +/* + * Hot spares and cache devices are tracked using the same code below, + * for 'auxiliary' devices. 
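+ *
+ * In terms of these helpers, the life cycle of an aux device is roughly:
+ * spa_aux_add() when it appears in a pool's spare or l2cache list,
+ * spa_aux_activate() when it is placed into service, and spa_aux_remove()
+ * when it is dropped from the configuration.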
+ */
+
+typedef struct spa_aux {
+	uint64_t	aux_guid;
+	uint64_t	aux_pool;
+	avl_node_t	aux_avl;
+	int		aux_count;
+} spa_aux_t;
+
+static inline int
+spa_aux_compare(const void *a, const void *b)
+{
+	const spa_aux_t *sa = (const spa_aux_t *)a;
+	const spa_aux_t *sb = (const spa_aux_t *)b;
+
+	return (TREE_CMP(sa->aux_guid, sb->aux_guid));
+}
+
+static void
+spa_aux_add(vdev_t *vd, avl_tree_t *avl)
+{
+	avl_index_t where;
+	spa_aux_t search;
+	spa_aux_t *aux;
+
+	search.aux_guid = vd->vdev_guid;
+	if ((aux = avl_find(avl, &search, &where)) != NULL) {
+		aux->aux_count++;
+	} else {
+		aux = kmem_zalloc(sizeof (spa_aux_t), KM_SLEEP);
+		aux->aux_guid = vd->vdev_guid;
+		aux->aux_count = 1;
+		avl_insert(avl, aux, where);
+	}
+}
+
+static void
+spa_aux_remove(vdev_t *vd, avl_tree_t *avl)
+{
+	spa_aux_t search;
+	spa_aux_t *aux;
+	avl_index_t where;
+
+	search.aux_guid = vd->vdev_guid;
+	aux = avl_find(avl, &search, &where);
+
+	ASSERT(aux != NULL);
+
+	if (--aux->aux_count == 0) {
+		avl_remove(avl, aux);
+		kmem_free(aux, sizeof (spa_aux_t));
+	} else if (aux->aux_pool == spa_guid(vd->vdev_spa)) {
+		aux->aux_pool = 0ULL;
+	}
+}
+
+static boolean_t
+spa_aux_exists(uint64_t guid, uint64_t *pool, int *refcnt, avl_tree_t *avl)
+{
+	spa_aux_t search, *found;
+
+	search.aux_guid = guid;
+	found = avl_find(avl, &search, NULL);
+
+	if (pool) {
+		if (found)
+			*pool = found->aux_pool;
+		else
+			*pool = 0ULL;
+	}
+
+	if (refcnt) {
+		if (found)
+			*refcnt = found->aux_count;
+		else
+			*refcnt = 0;
+	}
+
+	return (found != NULL);
+}
+
+static void
+spa_aux_activate(vdev_t *vd, avl_tree_t *avl)
+{
+	spa_aux_t search, *found;
+	avl_index_t where;
+
+	search.aux_guid = vd->vdev_guid;
+	found = avl_find(avl, &search, &where);
+	ASSERT(found != NULL);
+	ASSERT(found->aux_pool == 0ULL);
+
+	found->aux_pool = spa_guid(vd->vdev_spa);
+}
+
+/*
+ * Spares are tracked globally due to the following constraints:
+ *
+ *	- A spare may be part of multiple pools.
+ *	- A spare may be added to a pool even if it's actively in use within
+ *	  another pool.
+ *	- A spare in use in any pool can only be the source of a replacement if
+ *	  the target is a spare in the same pool.
+ *
+ * We keep track of all spares on the system through the use of a reference
+ * counted AVL tree. When a vdev is added as a spare, or used as a replacement
+ * spare, then we bump the reference count in the AVL tree. In addition, we set
+ * the 'vdev_isspare' member to indicate that the device is a spare (active or
+ * inactive). When a spare is made active (used to replace a device in the
+ * pool), we also keep track of which pool it's been made a part of.
+ *
+ * The 'spa_spare_lock' protects the AVL tree. These functions are normally
+ * called under the spa_namespace lock as part of vdev reconfiguration. The
+ * separate spare lock exists for the status query path, which does not need to
+ * be completely consistent with respect to other vdev configuration changes.
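+ *
+ * As an illustrative sketch (hypothetical caller; 'vd' and 'spa' are
+ * placeholders), the query path can tell that a device is a spare already
+ * in service in some other pool, and therefore should not be used here:
+ *
+ *	uint64_t pool;
+ *
+ *	if (spa_spare_exists(vd->vdev_guid, &pool, NULL) &&
+ *	    pool != 0ULL && pool != spa_guid(spa))
+ *		return (B_TRUE);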
+ */ + +static int +spa_spare_compare(const void *a, const void *b) +{ + return (spa_aux_compare(a, b)); +} + +void +spa_spare_add(vdev_t *vd) +{ + mutex_enter(&spa_spare_lock); + ASSERT(!vd->vdev_isspare); + spa_aux_add(vd, &spa_spare_avl); + vd->vdev_isspare = B_TRUE; + mutex_exit(&spa_spare_lock); +} + +void +spa_spare_remove(vdev_t *vd) +{ + mutex_enter(&spa_spare_lock); + ASSERT(vd->vdev_isspare); + spa_aux_remove(vd, &spa_spare_avl); + vd->vdev_isspare = B_FALSE; + mutex_exit(&spa_spare_lock); +} + +boolean_t +spa_spare_exists(uint64_t guid, uint64_t *pool, int *refcnt) +{ + boolean_t found; + + mutex_enter(&spa_spare_lock); + found = spa_aux_exists(guid, pool, refcnt, &spa_spare_avl); + mutex_exit(&spa_spare_lock); + + return (found); +} + +void +spa_spare_activate(vdev_t *vd) +{ + mutex_enter(&spa_spare_lock); + ASSERT(vd->vdev_isspare); + spa_aux_activate(vd, &spa_spare_avl); + mutex_exit(&spa_spare_lock); +} + +/* + * Level 2 ARC devices are tracked globally for the same reasons as spares. + * Cache devices currently only support one pool per cache device, and so + * for these devices the aux reference count is currently unused beyond 1. + */ + +static int +spa_l2cache_compare(const void *a, const void *b) +{ + return (spa_aux_compare(a, b)); +} + +void +spa_l2cache_add(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(!vd->vdev_isl2cache); + spa_aux_add(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_TRUE; + mutex_exit(&spa_l2cache_lock); +} + +void +spa_l2cache_remove(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_remove(vd, &spa_l2cache_avl); + vd->vdev_isl2cache = B_FALSE; + mutex_exit(&spa_l2cache_lock); +} + +boolean_t +spa_l2cache_exists(uint64_t guid, uint64_t *pool) +{ + boolean_t found; + + mutex_enter(&spa_l2cache_lock); + found = spa_aux_exists(guid, pool, NULL, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); + + return (found); +} + +void +spa_l2cache_activate(vdev_t *vd) +{ + mutex_enter(&spa_l2cache_lock); + ASSERT(vd->vdev_isl2cache); + spa_aux_activate(vd, &spa_l2cache_avl); + mutex_exit(&spa_l2cache_lock); +} + +/* + * ========================================================================== + * SPA vdev locking + * ========================================================================== + */ + +/* + * Lock the given spa_t for the purpose of adding or removing a vdev. + * Grabs the global spa_namespace_lock plus the spa config lock for writing. + * It returns the next transaction group for the spa_t. + */ +uint64_t +spa_vdev_enter(spa_t *spa) +{ + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + + vdev_autotrim_stop_all(spa); + + return (spa_vdev_config_enter(spa)); +} + +/* + * The same as spa_vdev_enter() above but additionally takes the guid of + * the vdev being detached. When there is a rebuild in process it will be + * suspended while the vdev tree is modified then resumed by spa_vdev_exit(). + * The rebuild is canceled if only a single child remains after the detach. + */ +uint64_t +spa_vdev_detach_enter(spa_t *spa, uint64_t guid) +{ + mutex_enter(&spa->spa_vdev_top_lock); + mutex_enter(&spa_namespace_lock); + + vdev_autotrim_stop_all(spa); + + if (guid != 0) { + vdev_t *vd = spa_lookup_by_guid(spa, guid, B_FALSE); + if (vd) { + vdev_rebuild_stop_wait(vd->vdev_top); + } + } + + return (spa_vdev_config_enter(spa)); +} + +/* + * Internal implementation for spa_vdev_enter(). Used when a vdev + * operation requires multiple syncs (i.e. 
removing a device) while
+ * keeping the spa_namespace_lock held.
+ */
+uint64_t
+spa_vdev_config_enter(spa_t *spa)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+
+	return (spa_last_synced_txg(spa) + 1);
+}
+
+/*
+ * Used in combination with spa_vdev_config_enter() to allow the syncing
+ * of multiple transactions without releasing the spa_namespace_lock.
+ */
+void
+spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error,
+    char *tag)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+
+	int config_changed = B_FALSE;
+
+	ASSERT(txg > spa_last_synced_txg(spa));
+
+	spa->spa_pending_vdev = NULL;
+
+	/*
+	 * Reassess the DTLs.
+	 */
+	vdev_dtl_reassess(spa->spa_root_vdev, 0, 0, B_FALSE, B_FALSE);
+
+	if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) {
+		config_changed = B_TRUE;
+		spa->spa_config_generation++;
+	}
+
+	/*
+	 * Verify the metaslab classes.
+	 */
+	ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0);
+	ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0);
+	ASSERT(metaslab_class_validate(spa_special_class(spa)) == 0);
+	ASSERT(metaslab_class_validate(spa_dedup_class(spa)) == 0);
+
+	spa_config_exit(spa, SCL_ALL, spa);
+
+	/*
+	 * Panic the system if the specified tag requires it. This
+	 * is useful for ensuring that configurations are updated
+	 * transactionally.
+	 */
+	if (zio_injection_enabled)
+		zio_handle_panic_injection(spa, tag, 0);
+
+	/*
+	 * Note: this txg_wait_synced() is important because it ensures
+	 * that there won't be more than one config change per txg.
+	 * This allows us to use the txg as the generation number.
+	 */
+	if (error == 0)
+		txg_wait_synced(spa->spa_dsl_pool, txg);
+
+	if (vd != NULL) {
+		ASSERT(!vd->vdev_detached || vd->vdev_dtl_sm == NULL);
+		if (vd->vdev_ops->vdev_op_leaf) {
+			mutex_enter(&vd->vdev_initialize_lock);
+			vdev_initialize_stop(vd, VDEV_INITIALIZE_CANCELED,
+			    NULL);
+			mutex_exit(&vd->vdev_initialize_lock);
+
+			mutex_enter(&vd->vdev_trim_lock);
+			vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);
+			mutex_exit(&vd->vdev_trim_lock);
+		}
+
+		/*
+		 * The vdev may be both a leaf and top-level device.
+		 */
+		vdev_autotrim_stop_wait(vd);
+
+		spa_config_enter(spa, SCL_ALL, spa, RW_WRITER);
+		vdev_free(vd);
+		spa_config_exit(spa, SCL_ALL, spa);
+	}
+
+	/*
+	 * If the config changed, update the config cache.
+	 */
+	if (config_changed)
+		spa_write_cachefile(spa, B_FALSE, B_TRUE);
+}
+
+/*
+ * Unlock the spa_t after adding or removing a vdev. Besides undoing the
+ * locking of spa_vdev_enter(), we also want to make sure the transactions
+ * have synced to disk, and then update the global configuration cache with
+ * the new information.
+ */
+int
+spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error)
+{
+	vdev_autotrim_restart(spa);
+	vdev_rebuild_restart(spa);
+
+	spa_vdev_config_exit(spa, vd, txg, error, FTAG);
+	mutex_exit(&spa_namespace_lock);
+	mutex_exit(&spa->spa_vdev_top_lock);
+
+	return (error);
+}
+
+/*
+ * Lock the given spa_t for the purpose of changing vdev state.
+ */
+void
+spa_vdev_state_enter(spa_t *spa, int oplocks)
+{
+	int locks = SCL_STATE_ALL | oplocks;
+
+	/*
+	 * Root pools may need to read from the underlying devfs filesystem
+	 * when opening up a vdev. Unfortunately if we're holding the
+	 * SCL_ZIO lock it will result in a deadlock when we try to issue
+	 * the read from the root filesystem.
Instead we "prefetch" + * the associated vnodes that we need prior to opening the + * underlying devices and cache them so that we can prevent + * any I/O when we are doing the actual open. + */ + if (spa_is_root(spa)) { + int low = locks & ~(SCL_ZIO - 1); + int high = locks & ~low; + + spa_config_enter(spa, high, spa, RW_WRITER); + vdev_hold(spa->spa_root_vdev); + spa_config_enter(spa, low, spa, RW_WRITER); + } else { + spa_config_enter(spa, locks, spa, RW_WRITER); + } + spa->spa_vdev_locks = locks; +} + +int +spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) +{ + boolean_t config_changed = B_FALSE; + vdev_t *vdev_top; + + if (vd == NULL || vd == spa->spa_root_vdev) { + vdev_top = spa->spa_root_vdev; + } else { + vdev_top = vd->vdev_top; + } + + if (vd != NULL || error == 0) + vdev_dtl_reassess(vdev_top, 0, 0, B_FALSE, B_FALSE); + + if (vd != NULL) { + if (vd != spa->spa_root_vdev) + vdev_state_dirty(vdev_top); + + config_changed = B_TRUE; + spa->spa_config_generation++; + } + + if (spa_is_root(spa)) + vdev_rele(spa->spa_root_vdev); + + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); + spa_config_exit(spa, spa->spa_vdev_locks, spa); + + /* + * If anything changed, wait for it to sync. This ensures that, + * from the system administrator's perspective, zpool(1M) commands + * are synchronous. This is important for things like zpool offline: + * when the command completes, you expect no further I/O from ZFS. + */ + if (vd != NULL) + txg_wait_synced(spa->spa_dsl_pool, 0); + + /* + * If the config changed, update the config cache. + */ + if (config_changed) { + mutex_enter(&spa_namespace_lock); + spa_write_cachefile(spa, B_FALSE, B_TRUE); + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * ========================================================================== + * Miscellaneous functions + * ========================================================================== + */ + +void +spa_activate_mos_feature(spa_t *spa, const char *feature, dmu_tx_t *tx) +{ + if (!nvlist_exists(spa->spa_label_features, feature)) { + fnvlist_add_boolean(spa->spa_label_features, feature); + /* + * When we are creating the pool (tx_txg==TXG_INITIAL), we can't + * dirty the vdev config because lock SCL_CONFIG is not held. + * Thankfully, in this case we don't need to dirty the config + * because it will be written out anyway when we finish + * creating the pool. + */ + if (tx->tx_txg != TXG_INITIAL) + vdev_config_dirty(spa->spa_root_vdev); + } +} + +void +spa_deactivate_mos_feature(spa_t *spa, const char *feature) +{ + if (nvlist_remove_all(spa->spa_label_features, feature) == 0) + vdev_config_dirty(spa->spa_root_vdev); +} + +/* + * Return the spa_t associated with given pool_guid, if it exists. If + * device_guid is non-zero, determine whether the pool exists *and* contains + * a device with the specified device_guid. + */ +spa_t * +spa_by_guid(uint64_t pool_guid, uint64_t device_guid) +{ + spa_t *spa; + avl_tree_t *t = &spa_namespace_avl; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + for (spa = avl_first(t); spa != NULL; spa = AVL_NEXT(t, spa)) { + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + continue; + if (spa->spa_root_vdev == NULL) + continue; + if (spa_guid(spa) == pool_guid) { + if (device_guid == 0) + break; + + if (vdev_lookup_by_guid(spa->spa_root_vdev, + device_guid) != NULL) + break; + + /* + * Check any devices we may be in the process of adding. 
+ */ + if (spa->spa_pending_vdev) { + if (vdev_lookup_by_guid(spa->spa_pending_vdev, + device_guid) != NULL) + break; + } + } + } + + return (spa); +} + +/* + * Determine whether a pool with the given pool_guid exists. + */ +boolean_t +spa_guid_exists(uint64_t pool_guid, uint64_t device_guid) +{ + return (spa_by_guid(pool_guid, device_guid) != NULL); +} + +char * +spa_strdup(const char *s) +{ + size_t len; + char *new; + + len = strlen(s); + new = kmem_alloc(len + 1, KM_SLEEP); + bcopy(s, new, len); + new[len] = '\0'; + + return (new); +} + +void +spa_strfree(char *s) +{ + kmem_free(s, strlen(s) + 1); +} + +uint64_t +spa_get_random(uint64_t range) +{ + uint64_t r; + + ASSERT(range != 0); + + if (range == 1) + return (0); + + (void) random_get_pseudo_bytes((void *)&r, sizeof (uint64_t)); + + return (r % range); +} + +uint64_t +spa_generate_guid(spa_t *spa) +{ + uint64_t guid = spa_get_random(-1ULL); + + if (spa != NULL) { + while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } else { + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); + } + + return (guid); +} + +void +snprintf_blkptr(char *buf, size_t buflen, const blkptr_t *bp) +{ + char type[256]; + char *checksum = NULL; + char *compress = NULL; + + if (bp != NULL) { + if (BP_GET_TYPE(bp) & DMU_OT_NEWTYPE) { + dmu_object_byteswap_t bswap = + DMU_OT_BYTESWAP(BP_GET_TYPE(bp)); + (void) snprintf(type, sizeof (type), "bswap %s %s", + DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) ? + "metadata" : "data", + dmu_ot_byteswap[bswap].ob_name); + } else { + (void) strlcpy(type, dmu_ot[BP_GET_TYPE(bp)].ot_name, + sizeof (type)); + } + if (!BP_IS_EMBEDDED(bp)) { + checksum = + zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + } + compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; + } + + SNPRINTF_BLKPTR(snprintf, ' ', buf, buflen, bp, type, checksum, + compress); +} + +void +spa_freeze(spa_t *spa) +{ + uint64_t freeze_txg = 0; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + if (spa->spa_freeze_txg == UINT64_MAX) { + freeze_txg = spa_last_synced_txg(spa) + TXG_SIZE; + spa->spa_freeze_txg = freeze_txg; + } + spa_config_exit(spa, SCL_ALL, FTAG); + if (freeze_txg != 0) + txg_wait_synced(spa_get_dsl(spa), freeze_txg); +} + +void +zfs_panic_recover(const char *fmt, ...) +{ + va_list adx; + + va_start(adx, fmt); + vcmn_err(zfs_recover ? CE_WARN : CE_PANIC, fmt, adx); + va_end(adx); +} + +/* + * This is a stripped-down version of strtoull, suitable only for converting + * lowercase hexadecimal numbers that don't overflow. 
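+ *
+ * For example, zfs_strtonum("1a2b", &end) returns 0x1a2b with 'end'
+ * left pointing at the terminating NUL. Parsing stops at the first
+ * character outside [0-9a-f], so "10q" yields 0x10; there is no "0x"
+ * prefix handling.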
+ */ +uint64_t +zfs_strtonum(const char *str, char **nptr) +{ + uint64_t val = 0; + char c; + int digit; + + while ((c = *str) != '\0') { + if (c >= '0' && c <= '9') + digit = c - '0'; + else if (c >= 'a' && c <= 'f') + digit = 10 + c - 'a'; + else + break; + + val *= 16; + val += digit; + + str++; + } + + if (nptr) + *nptr = (char *)str; + + return (val); +} + +void +spa_activate_allocation_classes(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We bump the feature refcount for each special vdev added to the pool + */ + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_ALLOCATION_CLASSES)); + spa_feature_incr(spa, SPA_FEATURE_ALLOCATION_CLASSES, tx); +} + +/* + * ========================================================================== + * Accessor functions + * ========================================================================== + */ + +boolean_t +spa_shutting_down(spa_t *spa) +{ + return (spa->spa_async_suspended); +} + +dsl_pool_t * +spa_get_dsl(spa_t *spa) +{ + return (spa->spa_dsl_pool); +} + +boolean_t +spa_is_initializing(spa_t *spa) +{ + return (spa->spa_is_initializing); +} + +boolean_t +spa_indirect_vdevs_loaded(spa_t *spa) +{ + return (spa->spa_indirect_vdevs_loaded); +} + +blkptr_t * +spa_get_rootblkptr(spa_t *spa) +{ + return (&spa->spa_ubsync.ub_rootbp); +} + +void +spa_set_rootblkptr(spa_t *spa, const blkptr_t *bp) +{ + spa->spa_uberblock.ub_rootbp = *bp; +} + +void +spa_altroot(spa_t *spa, char *buf, size_t buflen) +{ + if (spa->spa_root == NULL) + buf[0] = '\0'; + else + (void) strncpy(buf, spa->spa_root, buflen); +} + +int +spa_sync_pass(spa_t *spa) +{ + return (spa->spa_sync_pass); +} + +char * +spa_name(spa_t *spa) +{ + return (spa->spa_name); +} + +uint64_t +spa_guid(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t guid; + + /* + * If we fail to parse the config during spa_load(), we can go through + * the error path (which posts an ereport) and end up here with no root + * vdev. We stash the original pool guid in 'spa_config_guid' to handle + * this case. + */ + if (spa->spa_root_vdev == NULL) + return (spa->spa_config_guid); + + guid = spa->spa_last_synced_guid != 0 ? + spa->spa_last_synced_guid : spa->spa_root_vdev->vdev_guid; + + /* + * Return the most recently synced out guid unless we're + * in syncing context. + */ + if (dp && dsl_pool_sync_context(dp)) + return (spa->spa_root_vdev->vdev_guid); + else + return (guid); +} + +uint64_t +spa_load_guid(spa_t *spa) +{ + /* + * This is a GUID that exists solely as a reference for the + * purposes of the arc. It is generated at load time, and + * is never written to persistent storage. + */ + return (spa->spa_load_guid); +} + +uint64_t +spa_last_synced_txg(spa_t *spa) +{ + return (spa->spa_ubsync.ub_txg); +} + +uint64_t +spa_first_txg(spa_t *spa) +{ + return (spa->spa_first_txg); +} + +uint64_t +spa_syncing_txg(spa_t *spa) +{ + return (spa->spa_syncing_txg); +} + +/* + * Return the last txg where data can be dirtied. The final txgs + * will be used to just clear out any deferred frees that remain. + */ +uint64_t +spa_final_dirty_txg(spa_t *spa) +{ + return (spa->spa_final_txg - TXG_DEFER_SIZE); +} + +pool_state_t +spa_state(spa_t *spa) +{ + return (spa->spa_state); +} + +spa_load_state_t +spa_load_state(spa_t *spa) +{ + return (spa->spa_load_state); +} + +uint64_t +spa_freeze_txg(spa_t *spa) +{ + return (spa->spa_freeze_txg); +} + +/* + * Return the inflated asize for a logical write in bytes. This is used by the + * DMU to calculate the space a logical write will require on disk. 
+ * If lsize is smaller than the largest physical block size allocatable on this + * pool we use its value instead, since the write will end up using the whole + * block anyway. + */ +uint64_t +spa_get_worst_case_asize(spa_t *spa, uint64_t lsize) +{ + if (lsize == 0) + return (0); /* No inflation needed */ + return (MAX(lsize, 1 << spa->spa_max_ashift) * spa_asize_inflation); +} + +/* + * Return the amount of slop space in bytes. It is 1/32 of the pool (3.2%), + * or at least 128MB, unless that would cause it to be more than half the + * pool size. + * + * See the comment above spa_slop_shift for details. + */ +uint64_t +spa_get_slop_space(spa_t *spa) +{ + uint64_t space = spa_get_dspace(spa); + return (MAX(space >> spa_slop_shift, MIN(space >> 1, spa_min_slop))); +} + +uint64_t +spa_get_dspace(spa_t *spa) +{ + return (spa->spa_dspace); +} + +uint64_t +spa_get_checkpoint_space(spa_t *spa) +{ + return (spa->spa_checkpoint_info.sci_dspace); +} + +void +spa_update_dspace(spa_t *spa) +{ + spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + + ddt_get_dedup_dspace(spa); + if (spa->spa_vdev_removal != NULL) { + /* + * We can't allocate from the removing device, so + * subtract its size. This prevents the DMU/DSL from + * filling up the (now smaller) pool while we are in the + * middle of removing the device. + * + * Note that the DMU/DSL doesn't actually know or care + * how much space is allocated (it does its own tracking + * of how much space has been logically used). So it + * doesn't matter that the data we are moving may be + * allocated twice (on the old device and the new + * device). + */ + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *vd = + vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + spa->spa_dspace -= spa_deflate(spa) ? + vd->vdev_stat.vs_dspace : vd->vdev_stat.vs_space; + spa_config_exit(spa, SCL_VDEV, FTAG); + } +} + +/* + * Return the failure mode that has been set to this pool. The default + * behavior will be to block all I/Os when a complete failure occurs. 
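+ * The value reflects the pool's 'failmode' property: ZIO_FAILURE_MODE_WAIT
+ * (block I/O until the problem clears), ZIO_FAILURE_MODE_CONTINUE (return
+ * EIO to new writes), or ZIO_FAILURE_MODE_PANIC.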
+ */ +uint64_t +spa_get_failmode(spa_t *spa) +{ + return (spa->spa_failmode); +} + +boolean_t +spa_suspended(spa_t *spa) +{ + return (spa->spa_suspended != ZIO_SUSPEND_NONE); +} + +uint64_t +spa_version(spa_t *spa) +{ + return (spa->spa_ubsync.ub_version); +} + +boolean_t +spa_deflate(spa_t *spa) +{ + return (spa->spa_deflate); +} + +metaslab_class_t * +spa_normal_class(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +metaslab_class_t * +spa_log_class(spa_t *spa) +{ + return (spa->spa_log_class); +} + +metaslab_class_t * +spa_special_class(spa_t *spa) +{ + return (spa->spa_special_class); +} + +metaslab_class_t * +spa_dedup_class(spa_t *spa) +{ + return (spa->spa_dedup_class); +} + +/* + * Locate an appropriate allocation class + */ +metaslab_class_t * +spa_preferred_class(spa_t *spa, uint64_t size, dmu_object_type_t objtype, + uint_t level, uint_t special_smallblk) +{ + if (DMU_OT_IS_ZIL(objtype)) { + if (spa->spa_log_class->mc_groups != 0) + return (spa_log_class(spa)); + else + return (spa_normal_class(spa)); + } + + boolean_t has_special_class = spa->spa_special_class->mc_groups != 0; + + if (DMU_OT_IS_DDT(objtype)) { + if (spa->spa_dedup_class->mc_groups != 0) + return (spa_dedup_class(spa)); + else if (has_special_class && zfs_ddt_data_is_special) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + /* Indirect blocks for user data can land in special if allowed */ + if (level > 0 && (DMU_OT_IS_FILE(objtype) || objtype == DMU_OT_ZVOL)) { + if (has_special_class && zfs_user_indirect_is_special) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + if (DMU_OT_IS_METADATA(objtype) || level > 0) { + if (has_special_class) + return (spa_special_class(spa)); + else + return (spa_normal_class(spa)); + } + + /* + * Allow small file blocks in special class in some cases (like + * for the dRAID vdev feature). But always leave a reserve of + * zfs_special_class_metadata_reserve_pct exclusively for metadata. + */ + if (DMU_OT_IS_FILE(objtype) && + has_special_class && size <= special_smallblk) { + metaslab_class_t *special = spa_special_class(spa); + uint64_t alloc = metaslab_class_get_alloc(special); + uint64_t space = metaslab_class_get_space(special); + uint64_t limit = + (space * (100 - zfs_special_class_metadata_reserve_pct)) + / 100; + + if (alloc < limit) + return (special); + } + + return (spa_normal_class(spa)); +} + +void +spa_evicting_os_register(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_insert_head(&spa->spa_evicting_os_list, os); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_deregister(spa_t *spa, objset_t *os) +{ + mutex_enter(&spa->spa_evicting_os_lock); + list_remove(&spa->spa_evicting_os_list, os); + cv_broadcast(&spa->spa_evicting_os_cv); + mutex_exit(&spa->spa_evicting_os_lock); +} + +void +spa_evicting_os_wait(spa_t *spa) +{ + mutex_enter(&spa->spa_evicting_os_lock); + while (!list_is_empty(&spa->spa_evicting_os_list)) + cv_wait(&spa->spa_evicting_os_cv, &spa->spa_evicting_os_lock); + mutex_exit(&spa->spa_evicting_os_lock); + + dmu_buf_user_evict_wait(); +} + +int +spa_max_replication(spa_t *spa) +{ + /* + * As of SPA_VERSION == SPA_VERSION_DITTO_BLOCKS, we are able to + * handle BPs with more than one DVA allocated. Set our max + * replication level accordingly. 
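+ * For example, with the default spa_max_replication_override (initialized
+ * to SPA_DVAS_PER_BP, i.e. 3), a ditto-capable pool may allocate up to
+ * three DVAs per block pointer, while an older pool is limited to one.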
+ */ + if (spa_version(spa) < SPA_VERSION_DITTO_BLOCKS) + return (1); + return (MIN(SPA_DVAS_PER_BP, spa_max_replication_override)); +} + +int +spa_prev_software_version(spa_t *spa) +{ + return (spa->spa_prev_software_version); +} + +uint64_t +spa_deadman_synctime(spa_t *spa) +{ + return (spa->spa_deadman_synctime); +} + +spa_autotrim_t +spa_get_autotrim(spa_t *spa) +{ + return (spa->spa_autotrim); +} + +uint64_t +spa_deadman_ziotime(spa_t *spa) +{ + return (spa->spa_deadman_ziotime); +} + +uint64_t +spa_get_deadman_failmode(spa_t *spa) +{ + return (spa->spa_deadman_failmode); +} + +void +spa_set_deadman_failmode(spa_t *spa, const char *failmode) +{ + if (strcmp(failmode, "wait") == 0) + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; + else if (strcmp(failmode, "continue") == 0) + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_CONTINUE; + else if (strcmp(failmode, "panic") == 0) + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_PANIC; + else + spa->spa_deadman_failmode = ZIO_FAILURE_MODE_WAIT; +} + +void +spa_set_deadman_ziotime(hrtime_t ns) +{ + spa_t *spa = NULL; + + if (spa_mode_global != SPA_MODE_UNINIT) { + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + spa->spa_deadman_ziotime = ns; + mutex_exit(&spa_namespace_lock); + } +} + +void +spa_set_deadman_synctime(hrtime_t ns) +{ + spa_t *spa = NULL; + + if (spa_mode_global != SPA_MODE_UNINIT) { + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + spa->spa_deadman_synctime = ns; + mutex_exit(&spa_namespace_lock); + } +} + +uint64_t +dva_get_dsize_sync(spa_t *spa, const dva_t *dva) +{ + uint64_t asize = DVA_GET_ASIZE(dva); + uint64_t dsize = asize; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (asize != 0 && spa->spa_deflate) { + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + if (vd != NULL) + dsize = (asize >> SPA_MINBLOCKSHIFT) * + vd->vdev_deflate_ratio; + } + + return (dsize); +} + +uint64_t +bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + return (dsize); +} + +uint64_t +bp_get_dsize(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (int d = 0; d < BP_GET_NDVAS(bp); d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (dsize); +} + +uint64_t +spa_dirty_data(spa_t *spa) +{ + return (spa->spa_dsl_pool->dp_dirty_total); +} + +/* + * ========================================================================== + * SPA Import Progress Routines + * ========================================================================== + */ + +typedef struct spa_import_progress { + uint64_t pool_guid; /* unique id for updates */ + char *pool_name; + spa_load_state_t spa_load_state; + uint64_t mmp_sec_remaining; /* MMP activity check */ + uint64_t spa_load_max_txg; /* rewind txg */ + procfs_list_node_t smh_node; +} spa_import_progress_t; + +spa_history_list_t *spa_import_progress_list = NULL; + +static int +spa_import_progress_show_header(struct seq_file *f) +{ + seq_printf(f, "%-20s %-14s %-14s %-12s %s\n", "pool_guid", + "load_state", "multihost_secs", "max_txg", + "pool_name"); + return (0); +} + +static int +spa_import_progress_show(struct seq_file *f, void *data) +{ + spa_import_progress_t *sip = (spa_import_progress_t *)data; + + seq_printf(f, "%-20llu %-14llu %-14llu %-12llu %s\n", + (u_longlong_t)sip->pool_guid, 
(u_longlong_t)sip->spa_load_state, + (u_longlong_t)sip->mmp_sec_remaining, + (u_longlong_t)sip->spa_load_max_txg, + (sip->pool_name ? sip->pool_name : "-")); + + return (0); +} + +/* Remove oldest elements from list until there are no more than 'size' left */ +static void +spa_import_progress_truncate(spa_history_list_t *shl, unsigned int size) +{ + spa_import_progress_t *sip; + while (shl->size > size) { + sip = list_remove_head(&shl->procfs_list.pl_list); + if (sip->pool_name) + spa_strfree(sip->pool_name); + kmem_free(sip, sizeof (spa_import_progress_t)); + shl->size--; + } + + IMPLY(size == 0, list_is_empty(&shl->procfs_list.pl_list)); +} + +static void +spa_import_progress_init(void) +{ + spa_import_progress_list = kmem_zalloc(sizeof (spa_history_list_t), + KM_SLEEP); + + spa_import_progress_list->size = 0; + + spa_import_progress_list->procfs_list.pl_private = + spa_import_progress_list; + + procfs_list_install("zfs", + "import_progress", + 0644, + &spa_import_progress_list->procfs_list, + spa_import_progress_show, + spa_import_progress_show_header, + NULL, + offsetof(spa_import_progress_t, smh_node)); +} + +static void +spa_import_progress_destroy(void) +{ + spa_history_list_t *shl = spa_import_progress_list; + procfs_list_uninstall(&shl->procfs_list); + spa_import_progress_truncate(shl, 0); + procfs_list_destroy(&shl->procfs_list); + kmem_free(shl, sizeof (spa_history_list_t)); +} + +int +spa_import_progress_set_state(uint64_t pool_guid, + spa_load_state_t load_state) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + int error = ENOENT; + + if (shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + sip->spa_load_state = load_state; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +int +spa_import_progress_set_max_txg(uint64_t pool_guid, uint64_t load_max_txg) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + int error = ENOENT; + + if (shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + sip->spa_load_max_txg = load_max_txg; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +int +spa_import_progress_set_mmp_check(uint64_t pool_guid, + uint64_t mmp_sec_remaining) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + int error = ENOENT; + + if (shl->size == 0) + return (0); + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + sip->mmp_sec_remaining = mmp_sec_remaining; + error = 0; + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); + + return (error); +} + +/* + * A new import is in progress, add an entry. 
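+ * The entry is updated through the spa_import_progress_set_*() helpers
+ * above while the pool loads, and removed by spa_import_progress_remove()
+ * when the import finishes, so the list only ever reports in-flight
+ * imports.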
+ */ +void +spa_import_progress_add(spa_t *spa) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + char *poolname = NULL; + + sip = kmem_zalloc(sizeof (spa_import_progress_t), KM_SLEEP); + sip->pool_guid = spa_guid(spa); + + (void) nvlist_lookup_string(spa->spa_config, ZPOOL_CONFIG_POOL_NAME, + &poolname); + if (poolname == NULL) + poolname = spa_name(spa); + sip->pool_name = spa_strdup(poolname); + sip->spa_load_state = spa_load_state(spa); + + mutex_enter(&shl->procfs_list.pl_lock); + procfs_list_add(&shl->procfs_list, sip); + shl->size++; + mutex_exit(&shl->procfs_list.pl_lock); +} + +void +spa_import_progress_remove(uint64_t pool_guid) +{ + spa_history_list_t *shl = spa_import_progress_list; + spa_import_progress_t *sip; + + mutex_enter(&shl->procfs_list.pl_lock); + for (sip = list_tail(&shl->procfs_list.pl_list); sip != NULL; + sip = list_prev(&shl->procfs_list.pl_list, sip)) { + if (sip->pool_guid == pool_guid) { + if (sip->pool_name) + spa_strfree(sip->pool_name); + list_remove(&shl->procfs_list.pl_list, sip); + shl->size--; + kmem_free(sip, sizeof (spa_import_progress_t)); + break; + } + } + mutex_exit(&shl->procfs_list.pl_lock); +} + +/* + * ========================================================================== + * Initialization and Termination + * ========================================================================== + */ + +static int +spa_name_compare(const void *a1, const void *a2) +{ + const spa_t *s1 = a1; + const spa_t *s2 = a2; + int s; + + s = strcmp(s1->spa_name, s2->spa_name); + + return (TREE_ISIGN(s)); +} + +void +spa_boot_init(void) +{ + spa_config_load(); +} + +void +spa_init(spa_mode_t mode) +{ + mutex_init(&spa_namespace_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_spare_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa_l2cache_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&spa_namespace_cv, NULL, CV_DEFAULT, NULL); + + avl_create(&spa_namespace_avl, spa_name_compare, sizeof (spa_t), + offsetof(spa_t, spa_avl)); + + avl_create(&spa_spare_avl, spa_spare_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), + offsetof(spa_aux_t, aux_avl)); + + spa_mode_global = mode; + +#ifndef _KERNEL + if (spa_mode_global != SPA_MODE_READ && dprintf_find_string("watch")) { + struct sigaction sa; + + sa.sa_flags = SA_SIGINFO; + sigemptyset(&sa.sa_mask); + sa.sa_sigaction = arc_buf_sigsegv; + + if (sigaction(SIGSEGV, &sa, NULL) == -1) { + perror("could not enable watchpoints: " + "sigaction(SIGSEGV, ...) 
= "); + } else { + arc_watch = B_TRUE; + } + } +#endif + + fm_init(); + zfs_refcount_init(); + unique_init(); + zfs_btree_init(); + metaslab_stat_init(); + ddt_init(); + zio_init(); + dmu_init(); + zil_init(); + vdev_cache_stat_init(); + vdev_mirror_stat_init(); + vdev_raidz_math_init(); + vdev_file_init(); + zfs_prop_init(); + zpool_prop_init(); + zpool_feature_init(); + spa_config_load(); + l2arc_start(); + scan_init(); + qat_init(); + spa_import_progress_init(); +} + +void +spa_fini(void) +{ + l2arc_stop(); + + spa_evict_all(); + + vdev_file_fini(); + vdev_cache_stat_fini(); + vdev_mirror_stat_fini(); + vdev_raidz_math_fini(); + zil_fini(); + dmu_fini(); + zio_fini(); + ddt_fini(); + metaslab_stat_fini(); + zfs_btree_fini(); + unique_fini(); + zfs_refcount_fini(); + fm_fini(); + scan_fini(); + qat_fini(); + spa_import_progress_destroy(); + + avl_destroy(&spa_namespace_avl); + avl_destroy(&spa_spare_avl); + avl_destroy(&spa_l2cache_avl); + + cv_destroy(&spa_namespace_cv); + mutex_destroy(&spa_namespace_lock); + mutex_destroy(&spa_spare_lock); + mutex_destroy(&spa_l2cache_lock); +} + +/* + * Return whether this pool has slogs. No locking needed. + * It's not a problem if the wrong answer is returned as it's only for + * performance and not correctness + */ +boolean_t +spa_has_slogs(spa_t *spa) +{ + return (spa->spa_log_class->mc_rotor != NULL); +} + +spa_log_state_t +spa_get_log_state(spa_t *spa) +{ + return (spa->spa_log_state); +} + +void +spa_set_log_state(spa_t *spa, spa_log_state_t state) +{ + spa->spa_log_state = state; +} + +boolean_t +spa_is_root(spa_t *spa) +{ + return (spa->spa_is_root); +} + +boolean_t +spa_writeable(spa_t *spa) +{ + return (!!(spa->spa_mode & SPA_MODE_WRITE) && spa->spa_trust_config); +} + +/* + * Returns true if there is a pending sync task in any of the current + * syncing txg, the current quiescing txg, or the current open txg. + */ +boolean_t +spa_has_pending_synctask(spa_t *spa) +{ + return (!txg_all_lists_empty(&spa->spa_dsl_pool->dp_sync_tasks) || + !txg_all_lists_empty(&spa->spa_dsl_pool->dp_early_sync_tasks)); +} + +spa_mode_t +spa_mode(spa_t *spa) +{ + return (spa->spa_mode); +} + +uint64_t +spa_bootfs(spa_t *spa) +{ + return (spa->spa_bootfs); +} + +uint64_t +spa_delegation(spa_t *spa) +{ + return (spa->spa_delegation); +} + +objset_t * +spa_meta_objset(spa_t *spa) +{ + return (spa->spa_meta_objset); +} + +enum zio_checksum +spa_dedup_checksum(spa_t *spa) +{ + return (spa->spa_dedup_checksum); +} + +/* + * Reset pool scan stat per scan pass (or reboot). + */ +void +spa_scan_stat_init(spa_t *spa) +{ + /* data not stored on disk */ + spa->spa_scan_pass_start = gethrestime_sec(); + if (dsl_scan_is_paused_scrub(spa->spa_dsl_pool->dp_scan)) + spa->spa_scan_pass_scrub_pause = spa->spa_scan_pass_start; + else + spa->spa_scan_pass_scrub_pause = 0; + spa->spa_scan_pass_scrub_spent_paused = 0; + spa->spa_scan_pass_exam = 0; + spa->spa_scan_pass_issued = 0; + vdev_scan_stat_init(spa->spa_root_vdev); +} + +/* + * Get scan stats for zpool status reports + */ +int +spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) +{ + dsl_scan_t *scn = spa->spa_dsl_pool ? 
spa->spa_dsl_pool->dp_scan : NULL; + + if (scn == NULL || scn->scn_phys.scn_func == POOL_SCAN_NONE) + return (SET_ERROR(ENOENT)); + bzero(ps, sizeof (pool_scan_stat_t)); + + /* data stored on disk */ + ps->pss_func = scn->scn_phys.scn_func; + ps->pss_state = scn->scn_phys.scn_state; + ps->pss_start_time = scn->scn_phys.scn_start_time; + ps->pss_end_time = scn->scn_phys.scn_end_time; + ps->pss_to_examine = scn->scn_phys.scn_to_examine; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_to_process = scn->scn_phys.scn_to_process; + ps->pss_processed = scn->scn_phys.scn_processed; + ps->pss_errors = scn->scn_phys.scn_errors; + + /* data not stored on disk */ + ps->pss_pass_exam = spa->spa_scan_pass_exam; + ps->pss_pass_start = spa->spa_scan_pass_start; + ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; + ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; + ps->pss_pass_issued = spa->spa_scan_pass_issued; + ps->pss_issued = + scn->scn_issued_before_pass + spa->spa_scan_pass_issued; + + return (0); +} + +int +spa_maxblocksize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) + return (SPA_MAXBLOCKSIZE); + else + return (SPA_OLD_MAXBLOCKSIZE); +} + + +/* + * Returns the txg that the last device removal completed. No indirect mappings + * have been added since this txg. + */ +uint64_t +spa_get_last_removal_txg(spa_t *spa) +{ + uint64_t vdevid; + uint64_t ret = -1ULL; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + /* + * sr_prev_indirect_vdev is only modified while holding all the + * config locks, so it is sufficient to hold SCL_VDEV as reader when + * examining it. + */ + vdevid = spa->spa_removing_phys.sr_prev_indirect_vdev; + + while (vdevid != -1ULL) { + vdev_t *vd = vdev_lookup_top(spa, vdevid); + vdev_indirect_births_t *vib = vd->vdev_indirect_births; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + /* + * If the removal did not remap any data, we don't care. + */ + if (vdev_indirect_births_count(vib) != 0) { + ret = vdev_indirect_births_last_entry_txg(vib); + break; + } + + vdevid = vd->vdev_indirect_config.vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_VDEV, FTAG); + + IMPLY(ret != -1ULL, + spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REMOVAL)); + + return (ret); +} + +int +spa_maxdnodesize(spa_t *spa) +{ + if (spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_DNODE)) + return (DNODE_MAX_SIZE); + else + return (DNODE_MIN_SIZE); +} + +boolean_t +spa_multihost(spa_t *spa) +{ + return (spa->spa_multihost ? B_TRUE : B_FALSE); +} + +uint32_t +spa_get_hostid(spa_t *spa) +{ + return (spa->spa_hostid); +} + +boolean_t +spa_trust_config(spa_t *spa) +{ + return (spa->spa_trust_config); +} + +uint64_t +spa_missing_tvds_allowed(spa_t *spa) +{ + return (spa->spa_missing_tvds_allowed); +} + +space_map_t * +spa_syncing_log_sm(spa_t *spa) +{ + return (spa->spa_syncing_log_sm); +} + +void +spa_set_missing_tvds(spa_t *spa, uint64_t missing) +{ + spa->spa_missing_tvds = missing; +} + +/* + * Return the pool state string ("ONLINE", "DEGRADED", "SUSPENDED", etc). 
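+ *
+ * Note that a suspended pool is reported as "SUSPENDED" only when its
+ * failmode is not 'continue'; with failmode=continue the root vdev's
+ * state is reported as usual.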
+ */ +const char * +spa_state_to_name(spa_t *spa) +{ + ASSERT3P(spa, !=, NULL); + + /* + * it is possible for the spa to exist, without root vdev + * as the spa transitions during import/export + */ + vdev_t *rvd = spa->spa_root_vdev; + if (rvd == NULL) { + return ("TRANSITIONING"); + } + vdev_state_t state = rvd->vdev_state; + vdev_aux_t aux = rvd->vdev_stat.vs_aux; + + if (spa_suspended(spa) && + (spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE)) + return ("SUSPENDED"); + + switch (state) { + case VDEV_STATE_CLOSED: + case VDEV_STATE_OFFLINE: + return ("OFFLINE"); + case VDEV_STATE_REMOVED: + return ("REMOVED"); + case VDEV_STATE_CANT_OPEN: + if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) + return ("FAULTED"); + else if (aux == VDEV_AUX_SPLIT_POOL) + return ("SPLIT"); + else + return ("UNAVAIL"); + case VDEV_STATE_FAULTED: + return ("FAULTED"); + case VDEV_STATE_DEGRADED: + return ("DEGRADED"); + case VDEV_STATE_HEALTHY: + return ("ONLINE"); + default: + break; + } + + return ("UNKNOWN"); +} + +boolean_t +spa_top_vdevs_spacemap_addressable(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + for (uint64_t c = 0; c < rvd->vdev_children; c++) { + if (!vdev_is_spacemap_addressable(rvd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); +} + +boolean_t +spa_has_checkpoint(spa_t *spa) +{ + return (spa->spa_checkpoint_txg != 0); +} + +boolean_t +spa_importing_readonly_checkpoint(spa_t *spa) +{ + return ((spa->spa_import_flags & ZFS_IMPORT_CHECKPOINT) && + spa->spa_mode == SPA_MODE_READ); +} + +uint64_t +spa_min_claim_txg(spa_t *spa) +{ + uint64_t checkpoint_txg = spa->spa_uberblock.ub_checkpoint_txg; + + if (checkpoint_txg != 0) + return (checkpoint_txg + 1); + + return (spa->spa_first_txg); +} + +/* + * If there is a checkpoint, async destroys may consume more space from + * the pool instead of freeing it. In an attempt to save the pool from + * getting suspended when it is about to run out of space, we stop + * processing async destroys. + */ +boolean_t +spa_suspend_async_destroy(spa_t *spa) +{ + dsl_pool_t *dp = spa_get_dsl(spa); + + uint64_t unreserved = dsl_pool_unreserved_space(dp, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes; + uint64_t avail = (unreserved > used) ? 
(unreserved - used) : 0; + + if (spa_has_checkpoint(spa) && avail == 0) + return (B_TRUE); + + return (B_FALSE); +} + +#if defined(_KERNEL) + +int +param_set_deadman_failmode_common(const char *val) +{ + spa_t *spa = NULL; + char *p; + + if (val == NULL) + return (SET_ERROR(EINVAL)); + + if ((p = strchr(val, '\n')) != NULL) + *p = '\0'; + + if (strcmp(val, "wait") != 0 && strcmp(val, "continue") != 0 && + strcmp(val, "panic")) + return (SET_ERROR(EINVAL)); + + if (spa_mode_global != SPA_MODE_UNINIT) { + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) + spa_set_deadman_failmode(spa, val); + mutex_exit(&spa_namespace_lock); + } + + return (0); +} +#endif + +/* Namespace manipulation */ +EXPORT_SYMBOL(spa_lookup); +EXPORT_SYMBOL(spa_add); +EXPORT_SYMBOL(spa_remove); +EXPORT_SYMBOL(spa_next); + +/* Refcount functions */ +EXPORT_SYMBOL(spa_open_ref); +EXPORT_SYMBOL(spa_close); +EXPORT_SYMBOL(spa_refcount_zero); + +/* Pool configuration lock */ +EXPORT_SYMBOL(spa_config_tryenter); +EXPORT_SYMBOL(spa_config_enter); +EXPORT_SYMBOL(spa_config_exit); +EXPORT_SYMBOL(spa_config_held); + +/* Pool vdev add/remove lock */ +EXPORT_SYMBOL(spa_vdev_enter); +EXPORT_SYMBOL(spa_vdev_exit); + +/* Pool vdev state change lock */ +EXPORT_SYMBOL(spa_vdev_state_enter); +EXPORT_SYMBOL(spa_vdev_state_exit); + +/* Accessor functions */ +EXPORT_SYMBOL(spa_shutting_down); +EXPORT_SYMBOL(spa_get_dsl); +EXPORT_SYMBOL(spa_get_rootblkptr); +EXPORT_SYMBOL(spa_set_rootblkptr); +EXPORT_SYMBOL(spa_altroot); +EXPORT_SYMBOL(spa_sync_pass); +EXPORT_SYMBOL(spa_name); +EXPORT_SYMBOL(spa_guid); +EXPORT_SYMBOL(spa_last_synced_txg); +EXPORT_SYMBOL(spa_first_txg); +EXPORT_SYMBOL(spa_syncing_txg); +EXPORT_SYMBOL(spa_version); +EXPORT_SYMBOL(spa_state); +EXPORT_SYMBOL(spa_load_state); +EXPORT_SYMBOL(spa_freeze_txg); +EXPORT_SYMBOL(spa_get_dspace); +EXPORT_SYMBOL(spa_update_dspace); +EXPORT_SYMBOL(spa_deflate); +EXPORT_SYMBOL(spa_normal_class); +EXPORT_SYMBOL(spa_log_class); +EXPORT_SYMBOL(spa_special_class); +EXPORT_SYMBOL(spa_preferred_class); +EXPORT_SYMBOL(spa_max_replication); +EXPORT_SYMBOL(spa_prev_software_version); +EXPORT_SYMBOL(spa_get_failmode); +EXPORT_SYMBOL(spa_suspended); +EXPORT_SYMBOL(spa_bootfs); +EXPORT_SYMBOL(spa_delegation); +EXPORT_SYMBOL(spa_meta_objset); +EXPORT_SYMBOL(spa_maxblocksize); +EXPORT_SYMBOL(spa_maxdnodesize); + +/* Miscellaneous support routines */ +EXPORT_SYMBOL(spa_guid_exists); +EXPORT_SYMBOL(spa_strdup); +EXPORT_SYMBOL(spa_strfree); +EXPORT_SYMBOL(spa_get_random); +EXPORT_SYMBOL(spa_generate_guid); +EXPORT_SYMBOL(snprintf_blkptr); +EXPORT_SYMBOL(spa_freeze); +EXPORT_SYMBOL(spa_upgrade); +EXPORT_SYMBOL(spa_evict_all); +EXPORT_SYMBOL(spa_lookup_by_guid); +EXPORT_SYMBOL(spa_has_spare); +EXPORT_SYMBOL(dva_get_dsize_sync); +EXPORT_SYMBOL(bp_get_dsize_sync); +EXPORT_SYMBOL(bp_get_dsize); +EXPORT_SYMBOL(spa_has_slogs); +EXPORT_SYMBOL(spa_is_root); +EXPORT_SYMBOL(spa_writeable); +EXPORT_SYMBOL(spa_mode); +EXPORT_SYMBOL(spa_namespace_lock); +EXPORT_SYMBOL(spa_trust_config); +EXPORT_SYMBOL(spa_missing_tvds_allowed); +EXPORT_SYMBOL(spa_set_missing_tvds); +EXPORT_SYMBOL(spa_state_to_name); +EXPORT_SYMBOL(spa_importing_readonly_checkpoint); +EXPORT_SYMBOL(spa_min_claim_txg); +EXPORT_SYMBOL(spa_suspend_async_destroy); +EXPORT_SYMBOL(spa_has_checkpoint); +EXPORT_SYMBOL(spa_top_vdevs_spacemap_addressable); + +ZFS_MODULE_PARAM(zfs, zfs_, flags, UINT, ZMOD_RW, + "Set additional debugging flags"); + +ZFS_MODULE_PARAM(zfs, zfs_, recover, INT, ZMOD_RW, + "Set to attempt to recover from fatal 
errors"); + +ZFS_MODULE_PARAM(zfs, zfs_, free_leak_on_eio, INT, ZMOD_RW, + "Set to ignore IO errors during free and permanently leak the space"); + +ZFS_MODULE_PARAM(zfs, zfs_, deadman_checktime_ms, ULONG, ZMOD_RW, + "Dead I/O check interval in milliseconds"); + +ZFS_MODULE_PARAM(zfs, zfs_, deadman_enabled, INT, ZMOD_RW, + "Enable deadman timer"); + +ZFS_MODULE_PARAM(zfs_spa, spa_, asize_inflation, INT, ZMOD_RW, + "SPA size estimate multiplication factor"); + +ZFS_MODULE_PARAM(zfs, zfs_, ddt_data_is_special, INT, ZMOD_RW, + "Place DDT data into the special class"); + +ZFS_MODULE_PARAM(zfs, zfs_, user_indirect_is_special, INT, ZMOD_RW, + "Place user data indirect blocks into the special class"); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, failmode, + param_set_deadman_failmode, param_get_charp, ZMOD_RW, + "Failmode for deadman timer"); + +ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, synctime_ms, + param_set_deadman_synctime, param_get_ulong, ZMOD_RW, + "Pool sync expiration time in milliseconds"); + +ZFS_MODULE_PARAM_CALL(zfs_deadman, zfs_deadman_, ziotime_ms, + param_set_deadman_ziotime, param_get_ulong, ZMOD_RW, + "IO expiration time in milliseconds"); + +ZFS_MODULE_PARAM(zfs, zfs_, special_class_metadata_reserve_pct, INT, ZMOD_RW, + "Small file blocks in special vdevs depends on this much " + "free space available"); +/* END CSTYLED */ + +ZFS_MODULE_PARAM_CALL(zfs_spa, spa_, slop_shift, param_set_slop_shift, + param_get_int, ZMOD_RW, "Reserved free space in pool"); diff --git a/module/zfs/space_map.c b/module/zfs/space_map.c new file mode 100644 index 000000000000..3db7d199199c --- /dev/null +++ b/module/zfs/space_map.c @@ -0,0 +1,1105 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Note on space map block size: + * + * The data for a given space map can be kept on blocks of any size. + * Larger blocks entail fewer I/O operations, but they also cause the + * DMU to keep more data in-core, and also to waste more I/O bandwidth + * when only a few blocks have changed since the last transaction group. + */ + +/* + * Enabled whenever we want to stress test the use of double-word + * space map entries. + */ +boolean_t zfs_force_some_double_word_sm_entries = B_FALSE; + +/* + * Override the default indirect block size of 128K, instead use 16K for + * spacemaps (2^14 bytes). 
This dramatically reduces write inflation since + * appending to a spacemap typically has to write one data block (4KB) and one + * or two indirect blocks (16K-32K, rather than 128K). + */ +int space_map_ibs = 14; + +boolean_t +sm_entry_is_debug(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM_DEBUG_PREFIX); +} + +boolean_t +sm_entry_is_single_word(uint64_t e) +{ + uint8_t prefix = SM_PREFIX_DECODE(e); + return (prefix != SM_DEBUG_PREFIX && prefix != SM2_PREFIX); +} + +boolean_t +sm_entry_is_double_word(uint64_t e) +{ + return (SM_PREFIX_DECODE(e) == SM2_PREFIX); +} + +/* + * Iterate through the space map, invoking the callback on each (non-debug) + * space map entry. Stop after reading 'end' bytes of the space map. + */ +int +space_map_iterate(space_map_t *sm, uint64_t end, sm_cb_t callback, void *arg) +{ + uint64_t blksz = sm->sm_blksz; + + ASSERT3U(blksz, !=, 0); + ASSERT3U(end, <=, space_map_length(sm)); + ASSERT0(P2PHASE(end, sizeof (uint64_t))); + + dmu_prefetch(sm->sm_os, space_map_object(sm), 0, 0, end, + ZIO_PRIORITY_SYNC_READ); + + int error = 0; + uint64_t txg = 0, sync_pass = 0; + for (uint64_t block_base = 0; block_base < end && error == 0; + block_base += blksz) { + dmu_buf_t *db; + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), + block_base, FTAG, &db, DMU_READ_PREFETCH); + if (error != 0) + return (error); + + uint64_t *block_start = db->db_data; + uint64_t block_length = MIN(end - block_base, blksz); + uint64_t *block_end = block_start + + (block_length / sizeof (uint64_t)); + + VERIFY0(P2PHASE(block_length, sizeof (uint64_t))); + VERIFY3U(block_length, !=, 0); + ASSERT3U(blksz, ==, db->db_size); + + for (uint64_t *block_cursor = block_start; + block_cursor < block_end && error == 0; block_cursor++) { + uint64_t e = *block_cursor; + + if (sm_entry_is_debug(e)) { + /* + * Debug entries are only needed to record the + * current TXG and sync pass if available. + * + * Note though that sometimes there can be + * debug entries that are used as padding + * at the end of space map blocks in-order + * to not split a double-word entry in the + * middle between two blocks. These entries + * have their TXG field set to 0 and we + * skip them without recording the TXG. 
+ * [see comment in space_map_write_seg()] + */ + uint64_t e_txg = SM_DEBUG_TXG_DECODE(e); + if (e_txg != 0) { + txg = e_txg; + sync_pass = SM_DEBUG_SYNCPASS_DECODE(e); + } else { + ASSERT0(SM_DEBUG_SYNCPASS_DECODE(e)); + } + continue; + } + + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + /* it is a two-word entry */ + ASSERT(sm_entry_is_double_word(e)); + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move on to the second word */ + block_cursor++; + e = *block_cursor; + VERIFY3P(block_cursor, <=, block_end); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = (raw_offset << sm->sm_shift) + + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + ASSERT3U(entry_offset, >=, sm->sm_start); + ASSERT3U(entry_offset, <, sm->sm_start + sm->sm_size); + ASSERT3U(entry_run, <=, sm->sm_size); + ASSERT3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); + + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run, + .sme_txg = txg, + .sme_sync_pass = sync_pass + }; + error = callback(&sme, arg); + } + dmu_buf_rele(db, FTAG); + } + return (error); +} + +/* + * Reads the entries from the last block of the space map into + * buf in reverse order. Populates nwords with number of words + * in the last block. + * + * Refer to block comment within space_map_incremental_destroy() + * to understand why this function is needed. + */ +static int +space_map_reversed_last_block_entries(space_map_t *sm, uint64_t *buf, + uint64_t bufsz, uint64_t *nwords) +{ + int error = 0; + dmu_buf_t *db; + + /* + * Find the offset of the last word in the space map and use + * that to read the last block of the space map with + * dmu_buf_hold(). + */ + uint64_t last_word_offset = + sm->sm_phys->smp_length - sizeof (uint64_t); + error = dmu_buf_hold(sm->sm_os, space_map_object(sm), last_word_offset, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (error != 0) + return (error); + + ASSERT3U(sm->sm_object, ==, db->db_object); + ASSERT3U(sm->sm_blksz, ==, db->db_size); + ASSERT3U(bufsz, >=, db->db_size); + ASSERT(nwords != NULL); + + uint64_t *words = db->db_data; + *nwords = + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); + + ASSERT3U(*nwords, <=, bufsz / sizeof (uint64_t)); + + uint64_t n = *nwords; + uint64_t j = n - 1; + for (uint64_t i = 0; i < n; i++) { + uint64_t entry = words[i]; + if (sm_entry_is_double_word(entry)) { + /* + * Since we are populating the buffer backwards + * we have to be extra careful and add the two + * words of the double-word entry in the right + * order. + */ + ASSERT3U(j, >, 0); + buf[j - 1] = entry; + + i++; + ASSERT3U(i, <, n); + entry = words[i]; + buf[j] = entry; + j -= 2; + } else { + ASSERT(sm_entry_is_debug(entry) || + sm_entry_is_single_word(entry)); + buf[j] = entry; + j--; + } + } + + /* + * Assert that we wrote backwards all the + * way to the beginning of the buffer. + */ + ASSERT3S(j, ==, -1); + + dmu_buf_rele(db, FTAG); + return (error); +} + +/* + * Note: This function performs destructive actions - specifically + * it deletes entries from the end of the space map. 
Thus, callers + * should ensure that they are holding the appropriate locks for + * the space map that they provide. + */ +int +space_map_incremental_destroy(space_map_t *sm, sm_cb_t callback, void *arg, + dmu_tx_t *tx) +{ + uint64_t bufsz = MAX(sm->sm_blksz, SPA_MINBLOCKSIZE); + uint64_t *buf = zio_buf_alloc(bufsz); + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * Ideally we would want to iterate from the beginning of the + * space map to the end in incremental steps. The issue with this + * approach is that we don't have any field on-disk that points + * us where to start between each step. We could try zeroing out + * entries that we've destroyed, but this doesn't work either as + * an entry that is 0 is a valid one (ALLOC for range [0x0:0x200]). + * + * As a result, we destroy the space map's entries incrementally, + * starting from the end, after applying the callback to each of them. + * + * The problem with this approach is that we cannot literally + * iterate through the words in the space map backwards as we + * can't distinguish two-word space map entries from their second + * word. Thus we do the following: + * + * 1] We get all the entries from the last block of the space map + * and put them into a buffer in reverse order. This way the + * last entry comes first in the buffer, the second to last is + * second, etc. + * 2] We iterate through the entries in the buffer and we apply + * the callback to each one. As we move from entry to entry we + * decrease the size of the space map, effectively deleting + * each entry. + * 3] If there are no more entries in the space map or the callback + * returns a value other than 0, we stop iterating over the + * space map. If there are entries remaining and the callback + * returned 0, we go back to step [1].
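+ *
+ * As an editorial illustration (not part of the on-disk format): if
+ * the last block holds the words [A][B1 B2][C], where A and C are
+ * one-word entries and B1/B2 form a single two-word entry, step [1]
+ * buffers them as [C][B1 B2][A]. Step [2] then invokes the callback
+ * on C, B, and A in that order, shrinking smp_length by one, two,
+ * and one words respectively.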
+ */ + int error = 0; + while (space_map_length(sm) > 0 && error == 0) { + uint64_t nwords = 0; + error = space_map_reversed_last_block_entries(sm, buf, bufsz, + &nwords); + if (error != 0) + break; + + ASSERT3U(nwords, <=, bufsz / sizeof (uint64_t)); + + for (uint64_t i = 0; i < nwords; i++) { + uint64_t e = buf[i]; + + if (sm_entry_is_debug(e)) { + sm->sm_phys->smp_length -= sizeof (uint64_t); + continue; + } + + int words = 1; + uint64_t raw_offset, raw_run, vdev_id; + maptype_t type; + if (sm_entry_is_single_word(e)) { + type = SM_TYPE_DECODE(e); + vdev_id = SM_NO_VDEVID; + raw_offset = SM_OFFSET_DECODE(e); + raw_run = SM_RUN_DECODE(e); + } else { + ASSERT(sm_entry_is_double_word(e)); + words = 2; + + raw_run = SM2_RUN_DECODE(e); + vdev_id = SM2_VDEV_DECODE(e); + + /* move to the second word */ + i++; + e = buf[i]; + + ASSERT3P(i, <=, nwords); + + type = SM2_TYPE_DECODE(e); + raw_offset = SM2_OFFSET_DECODE(e); + } + + uint64_t entry_offset = + (raw_offset << sm->sm_shift) + sm->sm_start; + uint64_t entry_run = raw_run << sm->sm_shift; + + VERIFY0(P2PHASE(entry_offset, 1ULL << sm->sm_shift)); + VERIFY0(P2PHASE(entry_run, 1ULL << sm->sm_shift)); + VERIFY3U(entry_offset, >=, sm->sm_start); + VERIFY3U(entry_offset, <, sm->sm_start + sm->sm_size); + VERIFY3U(entry_run, <=, sm->sm_size); + VERIFY3U(entry_offset + entry_run, <=, + sm->sm_start + sm->sm_size); + + space_map_entry_t sme = { + .sme_type = type, + .sme_vdev = vdev_id, + .sme_offset = entry_offset, + .sme_run = entry_run + }; + error = callback(&sme, arg); + if (error != 0) + break; + + if (type == SM_ALLOC) + sm->sm_phys->smp_alloc -= entry_run; + else + sm->sm_phys->smp_alloc += entry_run; + sm->sm_phys->smp_length -= words * sizeof (uint64_t); + } + } + + if (space_map_length(sm) == 0) { + ASSERT0(error); + ASSERT0(space_map_allocated(sm)); + } + + zio_buf_free(buf, bufsz); + return (error); +} + +typedef struct space_map_load_arg { + space_map_t *smla_sm; + range_tree_t *smla_rt; + maptype_t smla_type; +} space_map_load_arg_t; + +static int +space_map_load_callback(space_map_entry_t *sme, void *arg) +{ + space_map_load_arg_t *smla = arg; + if (sme->sme_type == smla->smla_type) { + VERIFY3U(range_tree_space(smla->smla_rt) + sme->sme_run, <=, + smla->smla_sm->sm_size); + range_tree_add(smla->smla_rt, sme->sme_offset, sme->sme_run); + } else { + range_tree_remove(smla->smla_rt, sme->sme_offset, sme->sme_run); + } + + return (0); +} + +/* + * Load the spacemap into the rangetree, like space_map_load. But only + * read the first 'length' bytes of the spacemap. + */ +int +space_map_load_length(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t length) +{ + space_map_load_arg_t smla; + + VERIFY0(range_tree_space(rt)); + + if (maptype == SM_FREE) + range_tree_add(rt, sm->sm_start, sm->sm_size); + + smla.smla_rt = rt; + smla.smla_sm = sm; + smla.smla_type = maptype; + int err = space_map_iterate(sm, length, + space_map_load_callback, &smla); + + if (err != 0) + range_tree_vacate(rt, NULL, NULL); + + return (err); +} + +/* + * Load the space map disk into the specified range tree. Segments of maptype + * are added to the range tree, other segment types are removed. 
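+ *
+ * Minimal usage sketch (editorial; error handling elided and the
+ * range_tree_create() signature assumed as of this import):
+ *
+ *	range_tree_t *rt = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+ *	if (space_map_load(sm, rt, SM_ALLOC) == 0) {
+ *		... rt now holds exactly the segments sm records as
+ *		allocated ...
+ *	}
+ *	range_tree_vacate(rt, NULL, NULL);
+ *	range_tree_destroy(rt);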
+ */ +int +space_map_load(space_map_t *sm, range_tree_t *rt, maptype_t maptype) +{ + return (space_map_load_length(sm, rt, maptype, space_map_length(sm))); +} + +void +space_map_histogram_clear(space_map_t *sm) +{ + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) + return; + + bzero(sm->sm_phys->smp_histogram, sizeof (sm->sm_phys->smp_histogram)); +} + +boolean_t +space_map_histogram_verify(space_map_t *sm, range_tree_t *rt) +{ + /* + * Verify that the in-core range tree does not have any + * ranges smaller than our sm_shift size. + */ + for (int i = 0; i < sm->sm_shift; i++) { + if (rt->rt_histogram[i] != 0) + return (B_FALSE); + } + return (B_TRUE); +} + +void +space_map_histogram_add(space_map_t *sm, range_tree_t *rt, dmu_tx_t *tx) +{ + int idx = 0; + + ASSERT(dmu_tx_is_syncing(tx)); + VERIFY3U(space_map_object(sm), !=, 0); + + if (sm->sm_dbuf->db_size != sizeof (space_map_phys_t)) + return; + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + ASSERT(space_map_histogram_verify(sm, rt)); + /* + * Transfer the content of the range tree histogram to the space + * map histogram. The space map histogram contains 32 buckets ranging + * between 2^sm_shift to 2^(32+sm_shift-1). The range tree, + * however, can represent ranges from 2^0 to 2^63. Since the space + * map only cares about allocatable blocks (minimum of sm_shift) we + * can safely ignore all ranges in the range tree smaller than sm_shift. + */ + for (int i = sm->sm_shift; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + + /* + * Since the largest histogram bucket in the space map is + * 2^(32+sm_shift-1), we need to normalize the values in + * the range tree for any bucket larger than that size. For + * example given an sm_shift of 9, ranges larger than 2^40 + * would get normalized as if they were 1TB ranges. Assume + * the range tree had a count of 5 in the 2^44 (16TB) bucket, + * the calculation below would normalize this to 5 * 2^4 (16). + */ + ASSERT3U(i, >=, idx + sm->sm_shift); + sm->sm_phys->smp_histogram[idx] += + rt->rt_histogram[i] << (i - idx - sm->sm_shift); + + /* + * Increment the space map's index as long as we haven't + * reached the maximum bucket size. Accumulate all ranges + * larger than the max bucket size into the last bucket. + */ + if (idx < SPACE_MAP_HISTOGRAM_SIZE - 1) { + ASSERT3U(idx + sm->sm_shift, ==, i); + idx++; + ASSERT3U(idx, <, SPACE_MAP_HISTOGRAM_SIZE); + } + } +} + +static void +space_map_write_intro_debug(space_map_t *sm, maptype_t maptype, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + uint64_t dentry = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(maptype) | + SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(tx->tx_pool->dp_spa)) | + SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx)); + + dmu_write(sm->sm_os, space_map_object(sm), sm->sm_phys->smp_length, + sizeof (dentry), &dentry, tx); + + sm->sm_phys->smp_length += sizeof (dentry); +} + +/* + * Writes one or more entries given a segment. + * + * Note: The function may release the dbuf from the pointer initially + * passed to it, and return a different dbuf. Also, the space map's + * dbuf must be dirty for the changes in sm_phys to take effect. 
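+ *
+ * For orientation (a paraphrase; space_map.h is the authoritative
+ * layout): a one-word entry packs a type bit plus a 47-bit offset and
+ * a 15-bit run, both in units of 1 << sm_shift, while a two-word entry
+ * widens the run to 36 bits and the offset to 63 bits and adds a
+ * 24-bit vdev id; this is why run_max and the vdev_id checks below
+ * differ by word count.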
+ */ +static void +space_map_write_seg(space_map_t *sm, uint64_t rstart, uint64_t rend, + maptype_t maptype, uint64_t vdev_id, uint8_t words, dmu_buf_t **dbp, + void *tag, dmu_tx_t *tx) +{ + ASSERT3U(words, !=, 0); + ASSERT3U(words, <=, 2); + + /* ensure the vdev_id can be represented by the space map */ + ASSERT3U(vdev_id, <=, SM_NO_VDEVID); + + /* + * if this is a single word entry, ensure that no vdev was + * specified. + */ + IMPLY(words == 1, vdev_id == SM_NO_VDEVID); + + dmu_buf_t *db = *dbp; + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + uint64_t *block_base = db->db_data; + uint64_t *block_end = block_base + (sm->sm_blksz / sizeof (uint64_t)); + uint64_t *block_cursor = block_base + + (sm->sm_phys->smp_length - db->db_offset) / sizeof (uint64_t); + + ASSERT3P(block_cursor, <=, block_end); + + uint64_t size = (rend - rstart) >> sm->sm_shift; + uint64_t start = (rstart - sm->sm_start) >> sm->sm_shift; + uint64_t run_max = (words == 2) ? SM2_RUN_MAX : SM_RUN_MAX; + + ASSERT3U(rstart, >=, sm->sm_start); + ASSERT3U(rstart, <, sm->sm_start + sm->sm_size); + ASSERT3U(rend - rstart, <=, sm->sm_size); + ASSERT3U(rend, <=, sm->sm_start + sm->sm_size); + + while (size != 0) { + ASSERT3P(block_cursor, <=, block_end); + + /* + * If we are at the end of this block, flush it and start + * writing again from the beginning. + */ + if (block_cursor == block_end) { + dmu_buf_rele(db, tag); + + uint64_t next_word_offset = sm->sm_phys->smp_length; + VERIFY0(dmu_buf_hold(sm->sm_os, + space_map_object(sm), next_word_offset, + tag, &db, DMU_READ_PREFETCH)); + dmu_buf_will_dirty(db, tx); + + /* update caller's dbuf */ + *dbp = db; + + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + block_base = db->db_data; + block_cursor = block_base; + block_end = block_base + + (db->db_size / sizeof (uint64_t)); + } + + /* + * If we are writing a two-word entry and we only have one + * word left on this block, just pad it with an empty debug + * entry and write the two-word entry in the next block. + */ + uint64_t *next_entry = block_cursor + 1; + if (next_entry == block_end && words > 1) { + ASSERT3U(words, ==, 2); + *block_cursor = SM_PREFIX_ENCODE(SM_DEBUG_PREFIX) | + SM_DEBUG_ACTION_ENCODE(0) | + SM_DEBUG_SYNCPASS_ENCODE(0) | + SM_DEBUG_TXG_ENCODE(0); + block_cursor++; + sm->sm_phys->smp_length += sizeof (uint64_t); + ASSERT3P(block_cursor, ==, block_end); + continue; + } + + uint64_t run_len = MIN(size, run_max); + switch (words) { + case 1: + *block_cursor = SM_OFFSET_ENCODE(start) | + SM_TYPE_ENCODE(maptype) | + SM_RUN_ENCODE(run_len); + block_cursor++; + break; + case 2: + /* write the first word of the entry */ + *block_cursor = SM_PREFIX_ENCODE(SM2_PREFIX) | + SM2_RUN_ENCODE(run_len) | + SM2_VDEV_ENCODE(vdev_id); + block_cursor++; + + /* move on to the second word of the entry */ + ASSERT3P(block_cursor, <, block_end); + *block_cursor = SM2_TYPE_ENCODE(maptype) | + SM2_OFFSET_ENCODE(start); + block_cursor++; + break; + default: + panic("%d-word space map entries are not supported", + words); + break; + } + sm->sm_phys->smp_length += words * sizeof (uint64_t); + + start += run_len; + size -= run_len; + } + ASSERT0(size); + +} + +/* + * Note: The space map's dbuf must be dirty for the changes in sm_phys to + * take effect. 
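+ *
+ * Back-of-the-envelope (editorial): with sm_shift = 9, a one-word
+ * entry is capped at a run of 2^15 sectors, i.e. 16MB, so a 1GB
+ * segment costs 64 one-word entries, whereas a single two-word entry
+ * covers it when SPA_FEATURE_SPACEMAP_V2 is active (see the two-word
+ * decision below).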
+ */ +static void +space_map_write_impl(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + spa_t *spa = tx->tx_pool->dp_spa; + dmu_buf_t *db; + + space_map_write_intro_debug(sm, maptype, tx); + +#ifdef ZFS_DEBUG + /* + * We do this right after we write the intro debug entry + * because the estimate does not take it into account. + */ + uint64_t initial_objsize = sm->sm_phys->smp_length; + uint64_t estimated_growth = + space_map_estimate_optimal_size(sm, rt, SM_NO_VDEVID); + uint64_t estimated_final_objsize = initial_objsize + estimated_growth; +#endif + + /* + * Find the offset right after the last word in the space map + * and use that to get a hold of the last block, so we can + * start appending to it. + */ + uint64_t next_word_offset = sm->sm_phys->smp_length; + VERIFY0(dmu_buf_hold(sm->sm_os, space_map_object(sm), + next_word_offset, FTAG, &db, DMU_READ_PREFETCH)); + ASSERT3U(db->db_size, ==, sm->sm_blksz); + + dmu_buf_will_dirty(db, tx); + + zfs_btree_t *t = &rt->rt_root; + zfs_btree_index_t where; + for (range_seg_t *rs = zfs_btree_first(t, &where); rs != NULL; + rs = zfs_btree_next(t, &where, &where)) { + uint64_t offset = (rs_get_start(rs, rt) - sm->sm_start) >> + sm->sm_shift; + uint64_t length = (rs_get_end(rs, rt) - rs_get_start(rs, rt)) >> + sm->sm_shift; + uint8_t words = 1; + + /* + * We only write two-word entries when both of the following + * are true: + * + * [1] The feature is enabled. + * [2] The offset or run is too big for a single-word entry, + * or the vdev_id is set (meaning not equal to + * SM_NO_VDEVID). + * + * Note that for purposes of testing we've added the case that + * we write two-word entries occasionally when the feature is + * enabled and zfs_force_some_double_word_sm_entries has been + * set. + */ + if (spa_feature_is_active(spa, SPA_FEATURE_SPACEMAP_V2) && + (offset >= (1ULL << SM_OFFSET_BITS) || + length > SM_RUN_MAX || + vdev_id != SM_NO_VDEVID || + (zfs_force_some_double_word_sm_entries && + spa_get_random(100) == 0))) + words = 2; + + space_map_write_seg(sm, rs_get_start(rs, rt), rs_get_end(rs, + rt), maptype, vdev_id, words, &db, FTAG, tx); + } + + dmu_buf_rele(db, FTAG); + +#ifdef ZFS_DEBUG + /* + * We expect our estimation to be based on the worst case + * scenario [see comment in space_map_estimate_optimal_size()]. + * Therefore we expect the actual objsize to be equal or less + * than whatever we estimated it to be. + */ + ASSERT3U(estimated_final_objsize, >=, sm->sm_phys->smp_length); +#endif +} + +/* + * Note: This function manipulates the state of the given space map but + * does not hold any locks implicitly. Thus the caller is responsible + * for synchronizing writes to the space map. + */ +void +space_map_write(space_map_t *sm, range_tree_t *rt, maptype_t maptype, + uint64_t vdev_id, dmu_tx_t *tx) +{ + ASSERT(dsl_pool_sync_context(dmu_objset_pool(sm->sm_os))); + VERIFY3U(space_map_object(sm), !=, 0); + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + + /* + * This field is no longer necessary since the in-core space map + * now contains the object number but is maintained for backwards + * compatibility. 
+ */ + sm->sm_phys->smp_object = sm->sm_object; + + if (range_tree_is_empty(rt)) { + VERIFY3U(sm->sm_object, ==, sm->sm_phys->smp_object); + return; + } + + if (maptype == SM_ALLOC) + sm->sm_phys->smp_alloc += range_tree_space(rt); + else + sm->sm_phys->smp_alloc -= range_tree_space(rt); + + uint64_t nodes = zfs_btree_numnodes(&rt->rt_root); + uint64_t rt_space = range_tree_space(rt); + + space_map_write_impl(sm, rt, maptype, vdev_id, tx); + + /* + * Ensure that the space_map's accounting wasn't changed + * while we were in the middle of writing it out. + */ + VERIFY3U(nodes, ==, zfs_btree_numnodes(&rt->rt_root)); + VERIFY3U(range_tree_space(rt), ==, rt_space); +} + +static int +space_map_open_impl(space_map_t *sm) +{ + int error; + u_longlong_t blocks; + + error = dmu_bonus_hold(sm->sm_os, sm->sm_object, sm, &sm->sm_dbuf); + if (error) + return (error); + + dmu_object_size_from_db(sm->sm_dbuf, &sm->sm_blksz, &blocks); + sm->sm_phys = sm->sm_dbuf->db_data; + return (0); +} + +int +space_map_open(space_map_t **smp, objset_t *os, uint64_t object, + uint64_t start, uint64_t size, uint8_t shift) +{ + space_map_t *sm; + int error; + + ASSERT(*smp == NULL); + ASSERT(os != NULL); + ASSERT(object != 0); + + sm = kmem_alloc(sizeof (space_map_t), KM_SLEEP); + + sm->sm_start = start; + sm->sm_size = size; + sm->sm_shift = shift; + sm->sm_os = os; + sm->sm_object = object; + sm->sm_blksz = 0; + sm->sm_dbuf = NULL; + sm->sm_phys = NULL; + + error = space_map_open_impl(sm); + if (error != 0) { + space_map_close(sm); + return (error); + } + *smp = sm; + + return (0); +} + +void +space_map_close(space_map_t *sm) +{ + if (sm == NULL) + return; + + if (sm->sm_dbuf != NULL) + dmu_buf_rele(sm->sm_dbuf, sm); + sm->sm_dbuf = NULL; + sm->sm_phys = NULL; + + kmem_free(sm, sizeof (*sm)); +} + +void +space_map_truncate(space_map_t *sm, int blocksize, dmu_tx_t *tx) +{ + objset_t *os = sm->sm_os; + spa_t *spa = dmu_objset_spa(os); + dmu_object_info_t doi; + + ASSERT(dsl_pool_sync_context(dmu_objset_pool(os))); + ASSERT(dmu_tx_is_syncing(tx)); + VERIFY3U(dmu_tx_get_txg(tx), <=, spa_final_dirty_txg(spa)); + + dmu_object_info_from_db(sm->sm_dbuf, &doi); + + /* + * If the space map has the wrong bonus size (because + * SPA_FEATURE_SPACEMAP_HISTOGRAM has recently been enabled), or + * the wrong block size (because space_map_blksz has changed), + * free and re-allocate its object with the updated sizes. + * + * Otherwise, just truncate the current object. + */ + if ((spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM) && + doi.doi_bonus_size != sizeof (space_map_phys_t)) || + doi.doi_data_block_size != blocksize || + doi.doi_metadata_block_size != 1 << space_map_ibs) { + zfs_dbgmsg("txg %llu, spa %s, sm %px, reallocating " + "object[%llu]: old bonus %u, old blocksz %u", + dmu_tx_get_txg(tx), spa_name(spa), sm, sm->sm_object, + doi.doi_bonus_size, doi.doi_data_block_size); + + space_map_free(sm, tx); + dmu_buf_rele(sm->sm_dbuf, sm); + + sm->sm_object = space_map_alloc(sm->sm_os, blocksize, tx); + VERIFY0(space_map_open_impl(sm)); + } else { + VERIFY0(dmu_free_range(os, space_map_object(sm), 0, -1ULL, tx)); + + /* + * If the spacemap is reallocated, its histogram + * will be reset. Do the same in the common case so that + * bugs related to the uncommon case do not go unnoticed. 
+ */ + bzero(sm->sm_phys->smp_histogram, + sizeof (sm->sm_phys->smp_histogram)); + } + + dmu_buf_will_dirty(sm->sm_dbuf, tx); + sm->sm_phys->smp_length = 0; + sm->sm_phys->smp_alloc = 0; +} + +uint64_t +space_map_alloc(objset_t *os, int blocksize, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + uint64_t object; + int bonuslen; + + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + spa_feature_incr(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); + bonuslen = sizeof (space_map_phys_t); + ASSERT3U(bonuslen, <=, dmu_bonus_max()); + } else { + bonuslen = SPACE_MAP_SIZE_V0; + } + + object = dmu_object_alloc_ibs(os, DMU_OT_SPACE_MAP, blocksize, + space_map_ibs, DMU_OT_SPACE_MAP_HEADER, bonuslen, tx); + + return (object); +} + +void +space_map_free_obj(objset_t *os, uint64_t smobj, dmu_tx_t *tx) +{ + spa_t *spa = dmu_objset_spa(os); + if (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_HISTOGRAM)) { + dmu_object_info_t doi; + + VERIFY0(dmu_object_info(os, smobj, &doi)); + if (doi.doi_bonus_size != SPACE_MAP_SIZE_V0) { + spa_feature_decr(spa, + SPA_FEATURE_SPACEMAP_HISTOGRAM, tx); + } + } + + VERIFY0(dmu_object_free(os, smobj, tx)); +} + +void +space_map_free(space_map_t *sm, dmu_tx_t *tx) +{ + if (sm == NULL) + return; + + space_map_free_obj(sm->sm_os, space_map_object(sm), tx); + sm->sm_object = 0; +} + +/* + * Given a range tree, make a worst-case estimate of how much + * space the tree's segments would take if they were written to + * the given space map. + */ +uint64_t +space_map_estimate_optimal_size(space_map_t *sm, range_tree_t *rt, + uint64_t vdev_id) +{ + spa_t *spa = dmu_objset_spa(sm->sm_os); + uint64_t shift = sm->sm_shift; + uint64_t *histogram = rt->rt_histogram; + uint64_t entries_for_seg = 0; + + /* + * In order to get a quick estimate of the optimal size that this + * range tree would have on-disk as a space map, we iterate through + * its histogram buckets instead of iterating through its nodes. + * + * Note that this is a highest-bound/worst-case estimate for the + * following reasons: + * + * 1] We assume that we always add a debug padding for each block + * we write and we also assume that we start at the last word + * of a block attempting to write a two-word entry. + * 2] Rounding up errors due to the way segments are distributed + * in the buckets of the range tree's histogram. + * 3] The activation of zfs_force_some_double_word_sm_entries + * (tunable) when testing. + * + * = Math and Rounding Errors = + * + * The rt_histogram[i] bucket of a range tree represents the number + * of entries in [2^i, (2^(i+1))-1] of that range_tree. Given + * that, we want to divide the buckets into groups: Buckets that + * can be represented using a single-word entry, ones that can + * be represented with a double-word entry, and ones that can + * only be represented with multiple two-word entries. + * + * [Note that if the new encoding feature is not enabled there + * are only two groups: single-word entry buckets and multiple + * single-word entry buckets. The information below assumes + * two-word entries enabled, but it can easily be applied when + * the feature is not enabled] + * + * To find the highest bucket that can be represented with a + * single-word entry we look at the maximum run that such entry + * can have, which is 2^(SM_RUN_BITS + sm_shift) [remember that + * the run of a space map entry is shifted by sm_shift, thus we + * add it to the exponent].
This way, excluding the value of the + * maximum run that can be represented by a single-word entry, + * all runs that are smaller exist in buckets 0 to + * SM_RUN_BITS + shift - 1. + * + * To find the highest bucket that can be represented with a + * double-word entry, we follow the same approach. Finally, any + * buckets higher than that are represented with multiple two-word + * entries. To be more specific, if the highest bucket whose + * segments can be represented with a single two-word entry is X, + * then bucket X+1 will need 2 two-word entries for each of its + * segments, X+2 will need 4, X+3 will need 8, etc. + * + * With all of the above we make our estimation based on bucket + * groups. There is a rounding error though. As we mentioned in + * the example with the one-word entry, the maximum run that can + * be represented in a one-word entry, 2^(SM_RUN_BITS + shift), is + * not part of bucket SM_RUN_BITS + shift - 1. Thus, segments of + * that length fall into the next bucket (and bucket group) where + * we start counting two-word entries, and this is one more reason + * why the estimated size may end up being bigger than the actual + * size written. + */ + uint64_t size = 0; + uint64_t idx = 0; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) || + (vdev_id == SM_NO_VDEVID && sm->sm_size < SM_OFFSET_MAX)) { + + /* + * If we are trying to force some double word entries just + * assume the worst-case of every single word entry being + * written as a double word entry. + */ + uint64_t entry_size = + (spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2) && + zfs_force_some_double_word_sm_entries) ? + (2 * sizeof (uint64_t)) : sizeof (uint64_t); + + uint64_t single_entry_max_bucket = SM_RUN_BITS + shift - 1; + for (; idx <= single_entry_max_bucket; idx++) + size += histogram[idx] * entry_size; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)) { + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, single_entry_max_bucket); + entries_for_seg = + 1ULL << (idx - single_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * entry_size; + } + return (size); + } + } + + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_SPACEMAP_V2)); + + uint64_t double_entry_max_bucket = SM2_RUN_BITS + shift - 1; + for (; idx <= double_entry_max_bucket; idx++) + size += histogram[idx] * 2 * sizeof (uint64_t); + + for (; idx < RANGE_TREE_HISTOGRAM_SIZE; idx++) { + ASSERT3U(idx, >=, double_entry_max_bucket); + entries_for_seg = 1ULL << (idx - double_entry_max_bucket); + size += histogram[idx] * + entries_for_seg * 2 * sizeof (uint64_t); + } + + /* + * Assume the worst case where we start with the padding at the end + * of the current block and we add an extra padding entry at the end + * of all subsequent blocks. + */ + size += ((size / sm->sm_blksz) + 1) * sizeof (uint64_t); + + return (size); +} + +uint64_t +space_map_object(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_object : 0); +} + +int64_t +space_map_allocated(space_map_t *sm) +{ + return (sm != NULL ? sm->sm_phys->smp_alloc : 0); +} + +uint64_t +space_map_length(space_map_t *sm) +{ + return (sm != NULL ?
sm->sm_phys->smp_length : 0); +} + +uint64_t +space_map_nblocks(space_map_t *sm) +{ + if (sm == NULL) + return (0); + return (DIV_ROUND_UP(space_map_length(sm), sm->sm_blksz)); +} diff --git a/module/zfs/space_reftree.c b/module/zfs/space_reftree.c new file mode 100644 index 000000000000..080fc6646512 --- /dev/null +++ b/module/zfs/space_reftree.c @@ -0,0 +1,152 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2019 by Delphix. All rights reserved. + */ + +#include +#include +#include + +/* + * Space reference trees. + * + * A range tree is a collection of integers. Every integer is either + * in the tree, or it's not. A space reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a range tree. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N range trees is the subset of the reference tree with refcnt >= 1. + * The intersection of N range trees is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. Unions and intersections + * are hard to perform in the 'range tree domain', so we convert the trees + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. 
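+ *
+ * A worked example (editorial): adding R1 = {[0,10)} and R2 = {[5,15)}
+ * each with refcnt 1 yields the reference points (+1 at 0, +1 at 5,
+ * -1 at 10, -1 at 15). Generating with minref = 1 walks the running
+ * refcount 1, 2, 1, 0 and emits the union {[0,15)}; minref = 2 emits
+ * the intersection {[5,10)}.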
+ */ +static int +space_reftree_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = (const space_ref_t *)x1; + const space_ref_t *sr2 = (const space_ref_t *)x2; + + int cmp = TREE_CMP(sr1->sr_offset, sr2->sr_offset); + if (likely(cmp)) + return (cmp); + + return (TREE_PCMP(sr1, sr2)); +} + +void +space_reftree_create(avl_tree_t *t) +{ + avl_create(t, space_reftree_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_reftree_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_reftree_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_reftree_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_reftree_add_node(t, start, refcnt); + space_reftree_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a range tree into a reference tree. + */ +void +space_reftree_add_map(avl_tree_t *t, range_tree_t *rt, int64_t refcnt) +{ + zfs_btree_index_t where; + + for (range_seg_t *rs = zfs_btree_first(&rt->rt_root, &where); rs; rs = + zfs_btree_next(&rt->rt_root, &where, &where)) { + space_reftree_add_seg(t, rs_get_start(rs, rt), rs_get_end(rs, + rt), refcnt); + } +} + +/* + * Convert a reference tree into a range tree. The range tree will contain + * all members of the reference tree for which refcnt >= minref. + */ +void +space_reftree_generate_map(avl_tree_t *t, range_tree_t *rt, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + range_tree_vacate(rt, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + range_tree_add(rt, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} diff --git a/module/zfs/txg.c b/module/zfs/txg.c new file mode 100644 index 000000000000..a5f2b041737b --- /dev/null +++ b/module/zfs/txg.c @@ -0,0 +1,1059 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS Transaction Groups + * ---------------------- + * + * ZFS transaction groups are, as the name implies, groups of transactions + * that act on persistent state. ZFS asserts consistency at the granularity of + * these transaction groups. Each successive transaction group (txg) is + * assigned a 64-bit consecutive identifier. There are three active + * transaction group states: open, quiescing, or syncing. At any given time, + * there may be an active txg associated with each state; each active txg may + * either be processing, or blocked waiting to enter the next state. There may + * be up to three active txgs, and there is always a txg in the open state + * (though it may be blocked waiting to enter the quiescing state). In broad + * strokes, transactions -- operations that change in-memory structures -- are + * accepted into the txg in the open state, and are completed while the txg is + * in the open or quiescing states. The accumulated changes are written to + * disk in the syncing state. + * + * Open + * + * When a new txg becomes active, it first enters the open state. New + * transactions -- updates to in-memory structures -- are assigned to the + * currently open txg. There is always a txg in the open state so that ZFS can + * accept new changes (though the txg may refuse new changes if it has hit + * some limit). ZFS advances the open txg to the next state for a variety of + * reasons such as it hitting a time or size threshold, or the execution of an + * administrative action that must be completed in the syncing state. + * + * Quiescing + * + * After a txg exits the open state, it enters the quiescing state. The + * quiescing state is intended to provide a buffer between accepting new + * transactions in the open state and writing them out to stable storage in + * the syncing state. While quiescing, transactions can continue their + * operation without delaying either of the other states. Typically, a txg is + * in the quiescing state very briefly since the operations are bounded by + * software latencies rather than, say, slower I/O latencies. After all + * transactions complete, the txg is ready to enter the next state. + * + * Syncing + * + * In the syncing state, the in-memory state built up during the open and (to + * a lesser degree) the quiescing states is written to stable storage. The + * process of writing out modified data can, in turn modify more data. For + * example when we write new blocks, we need to allocate space for them; those + * allocations modify metadata (space maps)... which themselves must be + * written to stable storage. During the sync state, ZFS iterates, writing out + * data until it converges and all in-memory changes have been written out. + * The first such pass is the largest as it encompasses all the modified user + * data (as opposed to filesystem metadata). Subsequent passes typically have + * far less data to write as they consist exclusively of filesystem metadata. + * + * To ensure convergence, after a certain number of passes ZFS begins + * overwriting locations on stable storage that had been allocated earlier in + * the syncing state (and subsequently freed). ZFS usually allocates new + * blocks to optimize for large, continuous, writes. For the syncing state to + * converge however it must complete a pass where no new blocks are allocated + * since each allocation requires a modification of persistent metadata. 
+ * Further, to hasten convergence, after a prescribed number of passes, ZFS + * also defers frees, and stops compressing. + * + * In addition to writing out user data, we must also execute synctasks during + * the syncing context. A synctask is the mechanism by which some + * administrative activities work such as creating and destroying snapshots or + * datasets. Note that when a synctask is initiated it enters the open txg, + * and ZFS then pushes that txg as quickly as possible to completion of the + * syncing state in order to reduce the latency of the administrative + * activity. To complete the syncing state, ZFS writes out a new uberblock, + * the root of the tree of blocks that comprise all state stored on the ZFS + * pool. Finally, if there is a quiesced txg waiting, we signal that it can + * now transition to the syncing state. + */ + +static void txg_sync_thread(void *arg); +static void txg_quiesce_thread(void *arg); + +int zfs_txg_timeout = 5; /* max seconds worth of delta per txg */ + +/* + * Prepare the txg subsystem. + */ +void +txg_init(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + int c; + bzero(tx, sizeof (tx_state_t)); + + tx->tx_cpu = vmem_zalloc(max_ncpus * sizeof (tx_cpu_t), KM_SLEEP); + + for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_init(&tx->tx_cpu[c].tc_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&tx->tx_cpu[c].tc_open_lock, NULL, MUTEX_NOLOCKDEP, + NULL); + for (i = 0; i < TXG_SIZE; i++) { + cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, + NULL); + list_create(&tx->tx_cpu[c].tc_callbacks[i], + sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + } + } + + mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + + cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); + + tx->tx_open_txg = txg; +} + +/* + * Close down the txg subsystem. + */ +void +txg_fini(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + int c; + + ASSERT0(tx->tx_threads); + + mutex_destroy(&tx->tx_sync_lock); + + cv_destroy(&tx->tx_sync_more_cv); + cv_destroy(&tx->tx_sync_done_cv); + cv_destroy(&tx->tx_quiesce_more_cv); + cv_destroy(&tx->tx_quiesce_done_cv); + cv_destroy(&tx->tx_exit_cv); + + for (c = 0; c < max_ncpus; c++) { + int i; + + mutex_destroy(&tx->tx_cpu[c].tc_open_lock); + mutex_destroy(&tx->tx_cpu[c].tc_lock); + for (i = 0; i < TXG_SIZE; i++) { + cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); + } + } + + if (tx->tx_commit_cb_taskq != NULL) + taskq_destroy(tx->tx_commit_cb_taskq); + + vmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); + + bzero(tx, sizeof (tx_state_t)); +} + +/* + * Start syncing transaction groups. + */ +void +txg_sync_start(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + mutex_enter(&tx->tx_sync_lock); + + dprintf("pool %p\n", dp); + + ASSERT0(tx->tx_threads); + + tx->tx_threads = 2; + + tx->tx_quiesce_thread = thread_create(NULL, 0, txg_quiesce_thread, + dp, 0, &p0, TS_RUN, defclsyspri); + + /* + * The sync thread can need a larger-than-default stack size on + * 32-bit x86. This is due in part to nested pools and + * scrub_visitbp() recursion. 
+ */ + tx->tx_sync_thread = thread_create(NULL, 0, txg_sync_thread, + dp, 0, &p0, TS_RUN, defclsyspri); + + mutex_exit(&tx->tx_sync_lock); +} + +static void +txg_thread_enter(tx_state_t *tx, callb_cpr_t *cpr) +{ + CALLB_CPR_INIT(cpr, &tx->tx_sync_lock, callb_generic_cpr, FTAG); + mutex_enter(&tx->tx_sync_lock); +} + +static void +txg_thread_exit(tx_state_t *tx, callb_cpr_t *cpr, kthread_t **tpp) +{ + ASSERT(*tpp != NULL); + *tpp = NULL; + tx->tx_threads--; + cv_broadcast(&tx->tx_exit_cv); + CALLB_CPR_EXIT(cpr); /* drops &tx->tx_sync_lock */ + thread_exit(); +} + +static void +txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, clock_t time) +{ + CALLB_CPR_SAFE_BEGIN(cpr); + + /* + * cv_wait_sig() is used instead of cv_wait() in order to prevent + * this process from incorrectly contributing to the system load + * average when idle. + */ + if (time) { + (void) cv_timedwait_sig(cv, &tx->tx_sync_lock, + ddi_get_lbolt() + time); + } else { + cv_wait_sig(cv, &tx->tx_sync_lock); + } + + CALLB_CPR_SAFE_END(cpr, &tx->tx_sync_lock); +} + +/* + * Stop syncing transaction groups. + */ +void +txg_sync_stop(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + dprintf("pool %p\n", dp); + /* + * Finish off any work in progress. + */ + ASSERT3U(tx->tx_threads, ==, 2); + + /* + * We need to ensure that we've vacated the deferred metaslab trees. + */ + txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); + + /* + * Wake all sync threads and wait for them to die. + */ + mutex_enter(&tx->tx_sync_lock); + + ASSERT3U(tx->tx_threads, ==, 2); + + tx->tx_exiting = 1; + + cv_broadcast(&tx->tx_quiesce_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + cv_broadcast(&tx->tx_sync_more_cv); + + while (tx->tx_threads != 0) + cv_wait(&tx->tx_exit_cv, &tx->tx_sync_lock); + + tx->tx_exiting = 0; + + mutex_exit(&tx->tx_sync_lock); +} + +uint64_t +txg_hold_open(dsl_pool_t *dp, txg_handle_t *th) +{ + tx_state_t *tx = &dp->dp_tx; + tx_cpu_t *tc; + uint64_t txg; + + /* + * It appears the processor id is simply used as a "random" + * number to index into the array, and there isn't any other + * significance to the chosen tx_cpu. Because.. Why not use + * the current cpu to index into the array? + */ + kpreempt_disable(); + tc = &tx->tx_cpu[CPU_SEQID]; + kpreempt_enable(); + + mutex_enter(&tc->tc_open_lock); + txg = tx->tx_open_txg; + + mutex_enter(&tc->tc_lock); + tc->tc_count[txg & TXG_MASK]++; + mutex_exit(&tc->tc_lock); + + th->th_cpu = tc; + th->th_txg = txg; + + return (txg); +} + +void +txg_rele_to_quiesce(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + + ASSERT(!MUTEX_HELD(&tc->tc_lock)); + mutex_exit(&tc->tc_open_lock); +} + +void +txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + list_move_tail(&tc->tc_callbacks[g], tx_callbacks); + mutex_exit(&tc->tc_lock); +} + +void +txg_rele_to_sync(txg_handle_t *th) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + ASSERT(tc->tc_count[g] != 0); + if (--tc->tc_count[g] == 0) + cv_broadcast(&tc->tc_cv[g]); + mutex_exit(&tc->tc_lock); + + th->th_cpu = NULL; /* defensive */ +} + +/* + * Blocks until all transactions in the group are committed. + * + * On return, the transaction group has reached a stable state in which it can + * then be passed off to the syncing context. 
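+ *
+ * The per-CPU counts being drained here are driven by the hold
+ * protocol above (editorial sketch; the real caller is the dmu_tx
+ * machinery):
+ *
+ *	txg = txg_hold_open(dp, &th);	holds tc_open_lock, bumps tc_count
+ *	txg_rele_to_quiesce(&th);	drops tc_open_lock; txg may advance
+ *	... modify in-core state under this txg ...
+ *	txg_rele_to_sync(&th);		drops tc_count; quiesce can finish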
+ */ +static void +txg_quiesce(dsl_pool_t *dp, uint64_t txg) +{ + tx_state_t *tx = &dp->dp_tx; + uint64_t tx_open_time; + int g = txg & TXG_MASK; + int c; + + /* + * Grab all tc_open_locks so nobody else can get into this txg. + */ + for (c = 0; c < max_ncpus; c++) + mutex_enter(&tx->tx_cpu[c].tc_open_lock); + + ASSERT(txg == tx->tx_open_txg); + tx->tx_open_txg++; + tx->tx_open_time = tx_open_time = gethrtime(); + + DTRACE_PROBE2(txg__quiescing, dsl_pool_t *, dp, uint64_t, txg); + DTRACE_PROBE2(txg__opened, dsl_pool_t *, dp, uint64_t, tx->tx_open_txg); + + /* + * Now that we've incremented tx_open_txg, we can let threads + * enter the next transaction group. + */ + for (c = 0; c < max_ncpus; c++) + mutex_exit(&tx->tx_cpu[c].tc_open_lock); + + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_OPEN, tx_open_time); + spa_txg_history_add(dp->dp_spa, txg + 1, tx_open_time); + + /* + * Quiesce the transaction group by waiting for everyone to txg_exit(). + */ + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + mutex_enter(&tc->tc_lock); + while (tc->tc_count[g] != 0) + cv_wait(&tc->tc_cv[g], &tc->tc_lock); + mutex_exit(&tc->tc_lock); + } + + spa_txg_history_set(dp->dp_spa, txg, TXG_STATE_QUIESCED, gethrtime()); +} + +static void +txg_do_callbacks(list_t *cb_list) +{ + dmu_tx_do_callbacks(cb_list, 0); + + list_destroy(cb_list); + + kmem_free(cb_list, sizeof (list_t)); +} + +/* + * Dispatch the commit callbacks registered on this txg to worker threads. + * + * If no callbacks are registered for a given TXG, nothing happens. + * This function creates a taskq for the associated pool, if needed. + */ +static void +txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) +{ + int c; + tx_state_t *tx = &dp->dp_tx; + list_t *cb_list; + + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + /* + * No need to lock tx_cpu_t at this point, since this can + * only be called once a txg has been synced. + */ + + int g = txg & TXG_MASK; + + if (list_is_empty(&tc->tc_callbacks[g])) + continue; + + if (tx->tx_commit_cb_taskq == NULL) { + /* + * Commit callback taskq hasn't been created yet. + */ + tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", + boot_ncpus, defclsyspri, boot_ncpus, boot_ncpus * 2, + TASKQ_PREPOPULATE | TASKQ_DYNAMIC); + } + + cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(cb_list, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + + list_move_tail(cb_list, &tc->tc_callbacks[g]); + + (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + txg_do_callbacks, cb_list, TQ_SLEEP); + } +} + +/* + * Wait for pending commit callbacks of already-synced transactions to finish + * processing. + * Calling this function from within a commit callback will deadlock. 
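+ *
+ * (Editorial note: callbacks registered through
+ * dmu_tx_callback_register() run on the tx_commit_cb taskq once their
+ * txg has synced, so a teardown path would call txg_wait_callbacks()
+ * before freeing any state those callbacks still reference.)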
+ */ +void +txg_wait_callbacks(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + if (tx->tx_commit_cb_taskq != NULL) + taskq_wait_outstanding(tx->tx_commit_cb_taskq, 0); +} + +static boolean_t +txg_is_syncing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_syncing_txg != 0); +} + +static boolean_t +txg_is_quiescing(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiescing_txg != 0); +} + +static boolean_t +txg_has_quiesced_to_sync(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + ASSERT(MUTEX_HELD(&tx->tx_sync_lock)); + return (tx->tx_quiesced_txg != 0); +} + +static void +txg_sync_thread(void *arg) +{ + dsl_pool_t *dp = arg; + spa_t *spa = dp->dp_spa; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + clock_t start, delta; + + (void) spl_fstrans_mark(); + txg_thread_enter(tx, &cpr); + + start = delta = 0; + for (;;) { + clock_t timeout = zfs_txg_timeout * hz; + clock_t timer; + uint64_t txg; + uint64_t dirty_min_bytes = + zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100; + + /* + * We sync when we're scanning, there's someone waiting + * on us, or the quiesce thread has handed off a txg to + * us, or we have reached our timeout. + */ + timer = (delta >= timeout ? 0 : timeout - delta); + while (!dsl_scan_active(dp->dp_scan) && + !tx->tx_exiting && timer > 0 && + tx->tx_synced_txg >= tx->tx_sync_txg_waiting && + !txg_has_quiesced_to_sync(dp) && + dp->dp_dirty_total < dirty_min_bytes) { + dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); + delta = ddi_get_lbolt() - start; + timer = (delta > timeout ? 0 : timeout - delta); + } + + /* + * Wait until the quiesce thread hands off a txg to us, + * prompting it to do so if necessary. + */ + while (!tx->tx_exiting && !txg_has_quiesced_to_sync(dp)) { + if (tx->tx_quiesce_txg_waiting < tx->tx_open_txg+1) + tx->tx_quiesce_txg_waiting = tx->tx_open_txg+1; + cv_broadcast(&tx->tx_quiesce_more_cv); + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_done_cv, 0); + } + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); + + /* + * Consume the quiesced txg which has been handed off to + * us. This may cause the quiescing thread to now be + * able to quiesce another txg, so we must signal it. + */ + ASSERT(tx->tx_quiesced_txg != 0); + txg = tx->tx_quiesced_txg; + tx->tx_quiesced_txg = 0; + tx->tx_syncing_txg = txg; + DTRACE_PROBE2(txg__syncing, dsl_pool_t *, dp, uint64_t, txg); + cv_broadcast(&tx->tx_quiesce_more_cv); + + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + mutex_exit(&tx->tx_sync_lock); + + txg_stat_t *ts = spa_txg_history_init_io(spa, txg, dp); + start = ddi_get_lbolt(); + spa_sync(spa, txg); + delta = ddi_get_lbolt() - start; + spa_txg_history_fini_io(spa, ts); + + mutex_enter(&tx->tx_sync_lock); + tx->tx_synced_txg = txg; + tx->tx_syncing_txg = 0; + DTRACE_PROBE2(txg__synced, dsl_pool_t *, dp, uint64_t, txg); + cv_broadcast(&tx->tx_sync_done_cv); + + /* + * Dispatch commit callbacks to worker threads. + */ + txg_dispatch_callbacks(dp, txg); + } +} + +static void +txg_quiesce_thread(void *arg) +{ + dsl_pool_t *dp = arg; + tx_state_t *tx = &dp->dp_tx; + callb_cpr_t cpr; + + txg_thread_enter(tx, &cpr); + + for (;;) { + uint64_t txg; + + /* + * We quiesce when there's someone waiting on us. 
+ * However, we can only have one txg in "quiescing" or + * "quiesced, waiting to sync" state. So we wait until + * the "quiesced, waiting to sync" txg has been consumed + * by the sync thread. + */ + while (!tx->tx_exiting && + (tx->tx_open_txg >= tx->tx_quiesce_txg_waiting || + txg_has_quiesced_to_sync(dp))) + txg_thread_wait(tx, &cpr, &tx->tx_quiesce_more_cv, 0); + + if (tx->tx_exiting) + txg_thread_exit(tx, &cpr, &tx->tx_quiesce_thread); + + txg = tx->tx_open_txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, + tx->tx_sync_txg_waiting); + tx->tx_quiescing_txg = txg; + + mutex_exit(&tx->tx_sync_lock); + txg_quiesce(dp, txg); + mutex_enter(&tx->tx_sync_lock); + + /* + * Hand this txg off to the sync thread. + */ + dprintf("quiesce done, handing off txg %llu\n", txg); + tx->tx_quiescing_txg = 0; + tx->tx_quiesced_txg = txg; + DTRACE_PROBE2(txg__quiesced, dsl_pool_t *, dp, uint64_t, txg); + cv_broadcast(&tx->tx_sync_more_cv); + cv_broadcast(&tx->tx_quiesce_done_cv); + } +} + +/* + * Delay this thread by delay nanoseconds if we are still in the open + * transaction group and there is already a waiting txg quiescing or quiesced. + * Abort the delay if this txg stalls or enters the quiescing state. + */ +void +txg_delay(dsl_pool_t *dp, uint64_t txg, hrtime_t delay, hrtime_t resolution) +{ + tx_state_t *tx = &dp->dp_tx; + hrtime_t start = gethrtime(); + + /* don't delay if this txg could transition to quiescing immediately */ + if (tx->tx_open_txg > txg || + tx->tx_syncing_txg == txg-1 || tx->tx_synced_txg == txg-1) + return; + + mutex_enter(&tx->tx_sync_lock); + if (tx->tx_open_txg > txg || tx->tx_synced_txg == txg-1) { + mutex_exit(&tx->tx_sync_lock); + return; + } + + while (gethrtime() - start < delay && + tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) { + (void) cv_timedwait_hires(&tx->tx_quiesce_more_cv, + &tx->tx_sync_lock, delay, resolution, 0); + } + + DMU_TX_STAT_BUMP(dmu_tx_delay); + + mutex_exit(&tx->tx_sync_lock); +} + +static boolean_t +txg_wait_synced_impl(dsl_pool_t *dp, uint64_t txg, boolean_t wait_sig) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + ASSERT3U(tx->tx_threads, ==, 2); + if (txg == 0) + txg = tx->tx_open_txg + TXG_DEFER_SIZE; + if (tx->tx_sync_txg_waiting < txg) + tx->tx_sync_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_synced_txg < txg) { + dprintf("broadcasting sync more " + "tx_synced=%llu waiting=%llu dp=%px\n", + tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); + cv_broadcast(&tx->tx_sync_more_cv); + if (wait_sig) { + /* + * Condition wait here but stop if the thread receives a + * signal. The caller may call txg_wait_synced*() again + * to resume waiting for this txg. + */ + if (cv_wait_io_sig(&tx->tx_sync_done_cv, + &tx->tx_sync_lock) == 0) { + mutex_exit(&tx->tx_sync_lock); + return (B_TRUE); + } + } else { + cv_wait_io(&tx->tx_sync_done_cv, &tx->tx_sync_lock); + } + } + mutex_exit(&tx->tx_sync_lock); + return (B_FALSE); +} + +void +txg_wait_synced(dsl_pool_t *dp, uint64_t txg) +{ + VERIFY0(txg_wait_synced_impl(dp, txg, B_FALSE)); +} + +/* + * Similar to a txg_wait_synced but it can be interrupted from a signal. + * Returns B_TRUE if the thread was signaled while waiting. 
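+ *
+ * Sketch of the intended calling pattern (hypothetical caller):
+ *
+ *	if (txg_wait_synced_sig(dp, txg)) {
+ *		... interrupted: report EINTR, or call
+ *		txg_wait_synced(dp, txg) to resume waiting ...
+ *	}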
+ */ +boolean_t +txg_wait_synced_sig(dsl_pool_t *dp, uint64_t txg) +{ + return (txg_wait_synced_impl(dp, txg, B_TRUE)); +} + +/* + * Wait for the specified open transaction group. Set should_quiesce + * when the current open txg should be quiesced immediately. + */ +void +txg_wait_open(dsl_pool_t *dp, uint64_t txg, boolean_t should_quiesce) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + ASSERT3U(tx->tx_threads, ==, 2); + if (txg == 0) + txg = tx->tx_open_txg + 1; + if (tx->tx_quiesce_txg_waiting < txg && should_quiesce) + tx->tx_quiesce_txg_waiting = txg; + dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", + txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); + while (tx->tx_open_txg < txg) { + cv_broadcast(&tx->tx_quiesce_more_cv); + /* + * Callers setting should_quiesce will use cv_wait_io() and + * be accounted for as iowait time. Otherwise, the caller is + * understood to be idle and cv_wait_sig() is used to prevent + * incorrectly inflating the system load average. + */ + if (should_quiesce == B_TRUE) { + cv_wait_io(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + } else { + cv_wait_sig(&tx->tx_quiesce_done_cv, &tx->tx_sync_lock); + } + } + mutex_exit(&tx->tx_sync_lock); +} + +/* + * If there isn't a txg syncing or in the pipeline, push another txg through + * the pipeline by quiescing the open txg. + */ +void +txg_kick(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + ASSERT(!dsl_pool_config_held(dp)); + + mutex_enter(&tx->tx_sync_lock); + if (!txg_is_syncing(dp) && + !txg_is_quiescing(dp) && + tx->tx_quiesce_txg_waiting <= tx->tx_open_txg && + tx->tx_sync_txg_waiting <= tx->tx_synced_txg && + tx->tx_quiesced_txg <= tx->tx_synced_txg) { + tx->tx_quiesce_txg_waiting = tx->tx_open_txg + 1; + cv_broadcast(&tx->tx_quiesce_more_cv); + } + mutex_exit(&tx->tx_sync_lock); +} + +boolean_t +txg_stalled(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + return (tx->tx_quiesce_txg_waiting > tx->tx_open_txg); +} + +boolean_t +txg_sync_waiting(dsl_pool_t *dp) +{ + tx_state_t *tx = &dp->dp_tx; + + return (tx->tx_syncing_txg <= tx->tx_sync_txg_waiting || + tx->tx_quiesced_txg != 0); +} + +/* + * Verify that this txg is active (open, quiescing, syncing). Non-active + * txg's should not be manipulated. + */ +#ifdef ZFS_DEBUG +void +txg_verify(spa_t *spa, uint64_t txg) +{ + dsl_pool_t *dp __maybe_unused = spa_get_dsl(spa); + if (txg <= TXG_INITIAL || txg == ZILTEST_TXG) + return; + ASSERT3U(txg, <=, dp->dp_tx.tx_open_txg); + ASSERT3U(txg, >=, dp->dp_tx.tx_synced_txg); + ASSERT3U(txg, >=, dp->dp_tx.tx_open_txg - TXG_CONCURRENT_STATES); +} +#endif + +/* + * Per-txg object lists. 
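+ *
+ * Each txg_list_t is really TXG_SIZE singly-linked lists indexed by
+ * (txg & TXG_MASK), one per potentially-active txg; the dsl_pool's
+ * dirty-dataset, dirty-dir, and dirty-zilog lists are tracked this way.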
+ */ +void +txg_list_create(txg_list_t *tl, spa_t *spa, size_t offset) +{ + int t; + + mutex_init(&tl->tl_lock, NULL, MUTEX_DEFAULT, NULL); + + tl->tl_offset = offset; + tl->tl_spa = spa; + + for (t = 0; t < TXG_SIZE; t++) + tl->tl_head[t] = NULL; +} + +static boolean_t +txg_list_empty_impl(txg_list_t *tl, uint64_t txg) +{ + ASSERT(MUTEX_HELD(&tl->tl_lock)); + TXG_VERIFY(tl->tl_spa, txg); + return (tl->tl_head[txg & TXG_MASK] == NULL); +} + +boolean_t +txg_list_empty(txg_list_t *tl, uint64_t txg) +{ + mutex_enter(&tl->tl_lock); + boolean_t ret = txg_list_empty_impl(tl, txg); + mutex_exit(&tl->tl_lock); + + return (ret); +} + +void +txg_list_destroy(txg_list_t *tl) +{ + int t; + + mutex_enter(&tl->tl_lock); + for (t = 0; t < TXG_SIZE; t++) + ASSERT(txg_list_empty_impl(tl, t)); + mutex_exit(&tl->tl_lock); + + mutex_destroy(&tl->tl_lock); +} + +/* + * Returns true if all txg lists are empty. + * + * Warning: this is inherently racy (an item could be added immediately + * after this function returns). + */ +boolean_t +txg_all_lists_empty(txg_list_t *tl) +{ + mutex_enter(&tl->tl_lock); + for (int i = 0; i < TXG_SIZE; i++) { + if (!txg_list_empty_impl(tl, i)) { + mutex_exit(&tl->tl_lock); + return (B_FALSE); + } + } + mutex_exit(&tl->tl_lock); + return (B_TRUE); +} + +/* + * Add an entry to the list (unless it's already on the list). + * Returns B_TRUE if it was actually added. + */ +boolean_t +txg_list_add(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + boolean_t add; + + TXG_VERIFY(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + add = (tn->tn_member[t] == 0); + if (add) { + tn->tn_member[t] = 1; + tn->tn_next[t] = tl->tl_head[t]; + tl->tl_head[t] = tn; + } + mutex_exit(&tl->tl_lock); + + return (add); +} + +/* + * Add an entry to the end of the list, unless it's already on the list. + * (walks list to find end) + * Returns B_TRUE if it was actually added. + */ +boolean_t +txg_list_add_tail(txg_list_t *tl, void *p, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset); + boolean_t add; + + TXG_VERIFY(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + add = (tn->tn_member[t] == 0); + if (add) { + txg_node_t **tp; + + for (tp = &tl->tl_head[t]; *tp != NULL; tp = &(*tp)->tn_next[t]) + continue; + + tn->tn_member[t] = 1; + tn->tn_next[t] = NULL; + *tp = tn; + } + mutex_exit(&tl->tl_lock); + + return (add); +} + +/* + * Remove the head of the list and return it. + */ +void * +txg_list_remove(txg_list_t *tl, uint64_t txg) +{ + int t = txg & TXG_MASK; + txg_node_t *tn; + void *p = NULL; + + TXG_VERIFY(tl->tl_spa, txg); + mutex_enter(&tl->tl_lock); + if ((tn = tl->tl_head[t]) != NULL) { + ASSERT(tn->tn_member[t]); + ASSERT(tn->tn_next[t] == NULL || tn->tn_next[t]->tn_member[t]); + p = (char *)tn - tl->tl_offset; + tl->tl_head[t] = tn->tn_next[t]; + tn->tn_next[t] = NULL; + tn->tn_member[t] = 0; + } + mutex_exit(&tl->tl_lock); + + return (p); +} + +/* + * Remove a specific item from the list and return it. 
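+ *
+ * For contrast, the usual read-only walk (editorial sketch) pairs the
+ * txg_list_head() and txg_list_next() helpers defined below:
+ *
+ *	for (p = txg_list_head(tl, txg); p != NULL;
+ *	    p = txg_list_next(tl, p, txg))
+ *		... visit p ...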
+ */
+void *
+txg_list_remove_this(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn, **tp;
+
+	TXG_VERIFY(tl->tl_spa, txg);
+	mutex_enter(&tl->tl_lock);
+
+	for (tp = &tl->tl_head[t]; (tn = *tp) != NULL; tp = &tn->tn_next[t]) {
+		if ((char *)tn - tl->tl_offset == p) {
+			*tp = tn->tn_next[t];
+			tn->tn_next[t] = NULL;
+			tn->tn_member[t] = 0;
+			mutex_exit(&tl->tl_lock);
+			return (p);
+		}
+	}
+
+	mutex_exit(&tl->tl_lock);
+
+	return (NULL);
+}
+
+boolean_t
+txg_list_member(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	TXG_VERIFY(tl->tl_spa, txg);
+	return (tn->tn_member[t] != 0);
+}
+
+/*
+ * Walk a txg list
+ */
+void *
+txg_list_head(txg_list_t *tl, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn;
+
+	mutex_enter(&tl->tl_lock);
+	tn = tl->tl_head[t];
+	mutex_exit(&tl->tl_lock);
+
+	TXG_VERIFY(tl->tl_spa, txg);
+	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+void *
+txg_list_next(txg_list_t *tl, void *p, uint64_t txg)
+{
+	int t = txg & TXG_MASK;
+	txg_node_t *tn = (txg_node_t *)((char *)p + tl->tl_offset);
+
+	TXG_VERIFY(tl->tl_spa, txg);
+
+	mutex_enter(&tl->tl_lock);
+	tn = tn->tn_next[t];
+	mutex_exit(&tl->tl_lock);
+
+	return (tn == NULL ? NULL : (char *)tn - tl->tl_offset);
+}
+
+EXPORT_SYMBOL(txg_init);
+EXPORT_SYMBOL(txg_fini);
+EXPORT_SYMBOL(txg_sync_start);
+EXPORT_SYMBOL(txg_sync_stop);
+EXPORT_SYMBOL(txg_hold_open);
+EXPORT_SYMBOL(txg_rele_to_quiesce);
+EXPORT_SYMBOL(txg_rele_to_sync);
+EXPORT_SYMBOL(txg_register_callbacks);
+EXPORT_SYMBOL(txg_delay);
+EXPORT_SYMBOL(txg_wait_synced);
+EXPORT_SYMBOL(txg_wait_open);
+EXPORT_SYMBOL(txg_wait_callbacks);
+EXPORT_SYMBOL(txg_stalled);
+EXPORT_SYMBOL(txg_sync_waiting);
+
+/* BEGIN CSTYLED */
+ZFS_MODULE_PARAM(zfs_txg, zfs_txg_, timeout, INT, ZMOD_RW,
+	"Max seconds worth of delta per txg");
+/* END CSTYLED */
diff --git a/module/zfs/uberblock.c b/module/zfs/uberblock.c
new file mode 100644
index 000000000000..b8857d74d810
--- /dev/null
+++ b/module/zfs/uberblock.c
@@ -0,0 +1,74 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2013, 2017 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/uberblock_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/mmp.h>
+
+int
+uberblock_verify(uberblock_t *ub)
+{
+	if (ub->ub_magic == BSWAP_64((uint64_t)UBERBLOCK_MAGIC))
+		byteswap_uint64_array(ub, sizeof (uberblock_t));
+
+	if (ub->ub_magic != UBERBLOCK_MAGIC)
+		return (SET_ERROR(EINVAL));
+
+	return (0);
+}
+
+/*
+ * Update the uberblock and return TRUE if anything changed in this
+ * transaction group.
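uberblock_verify() accepts blocks written by either endianness: if the magic only matches after a byte swap, the whole structure is swapped in place before validation. A standalone sketch of that trick, treating the block as an array of 64-bit words (mini_verify is invented; 0x00bab10c is the real UBERBLOCK_MAGIC):

#include <stdint.h>
#include <stdio.h>

#define MINI_MAGIC	0x00bab10cULL	/* UBERBLOCK_MAGIC ("oo-ba-bloc") */

static uint64_t
bswap64(uint64_t x)
{
	return (((x & 0x00000000000000ffULL) << 56) |
	    ((x & 0x000000000000ff00ULL) << 40) |
	    ((x & 0x0000000000ff0000ULL) << 24) |
	    ((x & 0x00000000ff000000ULL) << 8) |
	    ((x & 0x000000ff00000000ULL) >> 8) |
	    ((x & 0x0000ff0000000000ULL) >> 24) |
	    ((x & 0x00ff000000000000ULL) >> 40) |
	    ((x & 0xff00000000000000ULL) >> 56));
}

/* Mirrors uberblock_verify(): a magic that matches only after swapping
 * means the block came from the other endianness; swap it in place. */
static int
mini_verify(uint64_t *words, int nwords)
{
	if (words[0] == bswap64(MINI_MAGIC))
		for (int i = 0; i < nwords; i++)
			words[i] = bswap64(words[i]);
	return (words[0] == MINI_MAGIC ? 0 : -1);	/* -1 ~ EINVAL */
}

int
main(void)
{
	uint64_t ub[2] = { bswap64(MINI_MAGIC), bswap64(7ULL) };

	printf("%d %llu\n", mini_verify(ub, 2),
	    (unsigned long long)ub[1]);	/* prints "0 7" */
	return (0);
}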
+ */ +boolean_t +uberblock_update(uberblock_t *ub, vdev_t *rvd, uint64_t txg, uint64_t mmp_delay) +{ + ASSERT(ub->ub_txg < txg); + + /* + * We explicitly do not set ub_version here, so that older versions + * continue to be written with the previous uberblock version. + */ + ub->ub_magic = UBERBLOCK_MAGIC; + ub->ub_txg = txg; + ub->ub_guid_sum = rvd->vdev_guid_sum; + ub->ub_timestamp = gethrestime_sec(); + ub->ub_software_version = SPA_VERSION; + ub->ub_mmp_magic = MMP_MAGIC; + if (spa_multihost(rvd->vdev_spa)) { + ub->ub_mmp_delay = mmp_delay; + ub->ub_mmp_config = MMP_SEQ_SET(0) | + MMP_INTERVAL_SET(zfs_multihost_interval) | + MMP_FAIL_INT_SET(zfs_multihost_fail_intervals); + } else { + ub->ub_mmp_delay = 0; + ub->ub_mmp_config = 0; + } + ub->ub_checkpoint_txg = 0; + + return (ub->ub_rootbp.blk_birth == txg); +} diff --git a/module/zfs/unique.c b/module/zfs/unique.c new file mode 100644 index 000000000000..0e076797a002 --- /dev/null +++ b/module/zfs/unique.c @@ -0,0 +1,112 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+
+#include <sys/zfs_context.h>
+#include <sys/avl.h>
+#include <sys/unique.h>
+
+static avl_tree_t unique_avl;
+static kmutex_t unique_mtx;
+
+typedef struct unique {
+	avl_node_t un_link;
+	uint64_t un_value;
+} unique_t;
+
+#define	UNIQUE_MASK ((1ULL << UNIQUE_BITS) - 1)
+
+static int
+unique_compare(const void *a, const void *b)
+{
+	const unique_t *una = (const unique_t *)a;
+	const unique_t *unb = (const unique_t *)b;
+
+	return (TREE_CMP(una->un_value, unb->un_value));
+}
+
+void
+unique_init(void)
+{
+	avl_create(&unique_avl, unique_compare,
+	    sizeof (unique_t), offsetof(unique_t, un_link));
+	mutex_init(&unique_mtx, NULL, MUTEX_DEFAULT, NULL);
+}
+
+void
+unique_fini(void)
+{
+	avl_destroy(&unique_avl);
+	mutex_destroy(&unique_mtx);
+}
+
+uint64_t
+unique_create(void)
+{
+	uint64_t value = unique_insert(0);
+	unique_remove(value);
+	return (value);
+}
+
+uint64_t
+unique_insert(uint64_t value)
+{
+	avl_index_t idx;
+	unique_t *un = kmem_alloc(sizeof (unique_t), KM_SLEEP);
+
+	un->un_value = value;
+
+	mutex_enter(&unique_mtx);
+	while (un->un_value == 0 || un->un_value & ~UNIQUE_MASK ||
+	    avl_find(&unique_avl, un, &idx)) {
+		mutex_exit(&unique_mtx);
+		(void) random_get_pseudo_bytes((void*)&un->un_value,
+		    sizeof (un->un_value));
+		un->un_value &= UNIQUE_MASK;
+		mutex_enter(&unique_mtx);
+	}
+
+	avl_insert(&unique_avl, un, idx);
+	mutex_exit(&unique_mtx);
+
+	return (un->un_value);
+}
+
+void
+unique_remove(uint64_t value)
+{
+	unique_t un_tofind;
+	unique_t *un;
+
+	un_tofind.un_value = value;
+	mutex_enter(&unique_mtx);
+	un = avl_find(&unique_avl, &un_tofind, NULL);
+	if (un != NULL) {
+		avl_remove(&unique_avl, un);
+		kmem_free(un, sizeof (unique_t));
+	}
+	mutex_exit(&unique_mtx);
+}
diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c
new file mode 100644
index 000000000000..1844a5653f12
--- /dev/null
+++ b/module/zfs/vdev.c
@@ -0,0 +1,5090 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2020 by Delphix. All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ * Copyright (c) 2014 Integros [integros.com]
+ * Copyright 2016 Toomas Soome <tsoome@me.com>
+ * Copyright 2017 Joyent, Inc.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Datto Inc. All rights reserved.
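unique_insert() keeps drawing random values until it finds one that is nonzero, fits under the mask, and is not already registered in the AVL tree. The sketch below models the same retry loop in userland, with a flat array standing in for the tree and rand() for random_get_pseudo_bytes(); MINI_UNIQUE_BITS is kept tiny for the demo (the real UNIQUE_BITS is much larger):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define MINI_UNIQUE_BITS 8
#define MINI_UNIQUE_MASK ((1ULL << MINI_UNIQUE_BITS) - 1)
#define MINI_SLOTS	256

static uint64_t table[MINI_SLOTS];	/* stand-in for the AVL tree */
static int nused;

static int
mini_contains(uint64_t v)
{
	for (int i = 0; i < nused; i++)
		if (table[i] == v)
			return (1);
	return (0);
}

/* Mirrors unique_insert(): keep drawing random values until one is
 * nonzero, fits in the mask, and is not already registered. */
static uint64_t
mini_unique_insert(uint64_t value)
{
	while (value == 0 || (value & ~MINI_UNIQUE_MASK) ||
	    mini_contains(value))
		value = (uint64_t)rand() & MINI_UNIQUE_MASK;
	table[nused++] = value;
	return (value);
}

int
main(void)
{
	uint64_t a = mini_unique_insert(0);	/* forces a random draw */
	uint64_t b = mini_unique_insert(a);	/* collision, redrawn */

	printf("%llu %llu\n", (unsigned long long)a, (unsigned long long)b);
	return (a != b ? 0 : 1);
}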
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/bpobj.h>
+#include <sys/dmu.h>
+#include <sys/dmu_tx.h>
+#include <sys/dsl_dir.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_rebuild.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/space_map.h>
+#include <sys/space_reftree.h>
+#include <sys/zio.h>
+#include <sys/zap.h>
+#include <sys/fs/zfs.h>
+#include <sys/arc.h>
+#include <sys/zil.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/vdev_initialize.h>
+#include <sys/vdev_trim.h>
+#include <sys/zvol.h>
+#include <sys/zfs_ratelimit.h>
+
+/* default target for number of metaslabs per top-level vdev */
+int zfs_vdev_default_ms_count = 200;
+
+/* minimum number of metaslabs per top-level vdev */
+int zfs_vdev_min_ms_count = 16;
+
+/* practical upper limit of total metaslabs per top-level vdev */
+int zfs_vdev_ms_count_limit = 1ULL << 17;
+
+/* lower limit for metaslab size (512M) */
+int zfs_vdev_default_ms_shift = 29;
+
+/* upper limit for metaslab size (16G) */
+int zfs_vdev_max_ms_shift = 34;
+
+int vdev_validate_skip = B_FALSE;
+
+/*
+ * Since the DTL space map of a vdev is not expected to have a lot of
+ * entries, we default its block size to 4K.
+ */
+int zfs_vdev_dtl_sm_blksz = (1 << 12);
+
+/*
+ * Rate limit slow IO (delay) events to this many per second.
+ */
+unsigned int zfs_slow_io_events_per_second = 20;
+
+/*
+ * Rate limit checksum events after this many checksum errors per second.
+ */
+unsigned int zfs_checksum_events_per_second = 20;
+
+/*
+ * Ignore errors during scrub/resilver.  Allows to work around resilver
+ * upon import when there are pool errors.
+ */
+int zfs_scan_ignore_errors = 0;
+
+/*
+ * vdev-wide space maps that have lots of entries written to them at
+ * the end of each transaction can benefit from a higher I/O bandwidth
+ * (e.g. vdev_obsolete_sm), thus we default their block size to 128K.
+ */
+int zfs_vdev_standard_sm_blksz = (1 << 17);
+
+/*
+ * Tunable parameter for debugging or performance analysis. Setting this
+ * will cause pool corruption on power loss if a volatile out-of-order
+ * write cache is enabled.
+ */
+int zfs_nocacheflush = 0;
+
+uint64_t zfs_vdev_max_auto_ashift = ASHIFT_MAX;
+uint64_t zfs_vdev_min_auto_ashift = ASHIFT_MIN;
+
+/*PRINTFLIKE2*/
+void
+vdev_dbgmsg(vdev_t *vd, const char *fmt, ...)
+{
+	va_list adx;
+	char buf[256];
+
+	va_start(adx, fmt);
+	(void) vsnprintf(buf, sizeof (buf), fmt, adx);
+	va_end(adx);
+
+	if (vd->vdev_path != NULL) {
+		zfs_dbgmsg("%s vdev '%s': %s", vd->vdev_ops->vdev_op_type,
+		    vd->vdev_path, buf);
+	} else {
+		zfs_dbgmsg("%s-%llu vdev (guid %llu): %s",
+		    vd->vdev_ops->vdev_op_type,
+		    (u_longlong_t)vd->vdev_id,
+		    (u_longlong_t)vd->vdev_guid, buf);
+	}
+}
+
+void
+vdev_dbgmsg_print_tree(vdev_t *vd, int indent)
+{
+	char state[20];
+
+	if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) {
+		zfs_dbgmsg("%*svdev %u: %s", indent, "", vd->vdev_id,
+		    vd->vdev_ops->vdev_op_type);
+		return;
+	}
+
+	switch (vd->vdev_state) {
+	case VDEV_STATE_UNKNOWN:
+		(void) snprintf(state, sizeof (state), "unknown");
+		break;
+	case VDEV_STATE_CLOSED:
+		(void) snprintf(state, sizeof (state), "closed");
+		break;
+	case VDEV_STATE_OFFLINE:
+		(void) snprintf(state, sizeof (state), "offline");
+		break;
+	case VDEV_STATE_REMOVED:
+		(void) snprintf(state, sizeof (state), "removed");
+		break;
+	case VDEV_STATE_CANT_OPEN:
+		(void) snprintf(state, sizeof (state), "can't open");
+		break;
+	case VDEV_STATE_FAULTED:
+		(void) snprintf(state, sizeof (state), "faulted");
+		break;
+	case VDEV_STATE_DEGRADED:
+		(void) snprintf(state, sizeof (state), "degraded");
+		break;
+	case VDEV_STATE_HEALTHY:
+		(void) snprintf(state, sizeof (state), "healthy");
+		break;
+	default:
+		(void) snprintf(state, sizeof (state), "<state %u>",
+		    (uint_t)vd->vdev_state);
+	}
+
+	zfs_dbgmsg("%*svdev %u: %s%s, guid: %llu, path: %s, %s", indent,
+	    "", (int)vd->vdev_id, vd->vdev_ops->vdev_op_type,
+	    vd->vdev_islog ? " (log)" : "",
+	    (u_longlong_t)vd->vdev_guid,
+	    vd->vdev_path ? vd->vdev_path : "N/A", state);
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++)
+		vdev_dbgmsg_print_tree(vd->vdev_child[i], indent + 2);
+}
+
+/*
+ * Virtual device management.
+ */
+
+static vdev_ops_t *vdev_ops_table[] = {
+	&vdev_root_ops,
+	&vdev_raidz_ops,
+	&vdev_mirror_ops,
+	&vdev_replacing_ops,
+	&vdev_spare_ops,
+	&vdev_disk_ops,
+	&vdev_file_ops,
+	&vdev_missing_ops,
+	&vdev_hole_ops,
+	&vdev_indirect_ops,
+	NULL
+};
+
+/*
+ * Given a vdev type, return the appropriate ops vector.
+ */
+static vdev_ops_t *
+vdev_getops(const char *type)
+{
+	vdev_ops_t *ops, **opspp;
+
+	for (opspp = vdev_ops_table; (ops = *opspp) != NULL; opspp++)
+		if (strcmp(ops->vdev_op_type, type) == 0)
+			break;
+
+	return (ops);
+}
+
+/* ARGSUSED */
+void
+vdev_default_xlate(vdev_t *vd, const range_seg64_t *in, range_seg64_t *res)
+{
+	res->rs_start = in->rs_start;
+	res->rs_end = in->rs_end;
+}
+
+/*
+ * Derive the enumerated allocation bias from string input.
+ * String origin is either the per-vdev zap or zpool(1M).
+ */
+static vdev_alloc_bias_t
+vdev_derive_alloc_bias(const char *bias)
+{
+	vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE;
+
+	if (strcmp(bias, VDEV_ALLOC_BIAS_LOG) == 0)
+		alloc_bias = VDEV_BIAS_LOG;
+	else if (strcmp(bias, VDEV_ALLOC_BIAS_SPECIAL) == 0)
+		alloc_bias = VDEV_BIAS_SPECIAL;
+	else if (strcmp(bias, VDEV_ALLOC_BIAS_DEDUP) == 0)
+		alloc_bias = VDEV_BIAS_DEDUP;
+
+	return (alloc_bias);
+}
+
+/*
+ * Default asize function: return the MAX of psize with the asize of
+ * all children.  This is what's used by anything other than RAID-Z.
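vdev_getops() relies on the NULL sentinel in vdev_ops_table both to stop the scan and to report an unknown type. The same shape in a compilable sketch (the mini_ names are invented):

#include <stdio.h>
#include <string.h>

/* Invented miniature of vdev_ops_t: a type name plus one hook. */
typedef struct mini_ops {
	const char *op_type;
	int (*op_open)(void);
} mini_ops_t;

static int generic_open(void) { return (0); }

static const mini_ops_t mirror_ops = { "mirror", generic_open };
static const mini_ops_t file_ops = { "file", generic_open };

/* NULL-terminated ops table, scanned linearly like vdev_ops_table. */
static const mini_ops_t *mini_ops_table[] = {
	&mirror_ops,
	&file_ops,
	NULL
};

/* Mirrors vdev_getops(): map a config string to its ops vector; the
 * NULL sentinel doubles as the "unknown type" result. */
static const mini_ops_t *
mini_getops(const char *type)
{
	const mini_ops_t *ops, *const *opspp;

	for (opspp = mini_ops_table; (ops = *opspp) != NULL; opspp++)
		if (strcmp(ops->op_type, type) == 0)
			break;
	return (ops);
}

int
main(void)
{
	printf("%s\n", mini_getops("file") ? "found" : "unknown");
	printf("%s\n", mini_getops("raidz") ? "found" : "unknown");
	return (0);
}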
+ */ +uint64_t +vdev_default_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); + uint64_t csize; + + for (int c = 0; c < vd->vdev_children; c++) { + csize = vdev_psize_to_asize(vd->vdev_child[c], psize); + asize = MAX(asize, csize); + } + + return (asize); +} + +/* + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. + */ +uint64_t +vdev_get_min_asize(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + + /* + * If our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. + */ + if (pvd == NULL) + return (vd->vdev_asize); + + /* + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. + */ + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); + + /* + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. + */ + if (pvd->vdev_ops == &vdev_raidz_ops) + return ((pvd->vdev_min_asize + pvd->vdev_children - 1) / + pvd->vdev_children); + + return (pvd->vdev_min_asize); +} + +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); + + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); +} + +vdev_t * +vdev_lookup_top(spa_t *spa, uint64_t vdev) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + + if (vdev < rvd->vdev_children) { + ASSERT(rvd->vdev_child[vdev] != NULL); + return (rvd->vdev_child[vdev]); + } + + return (NULL); +} + +vdev_t * +vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) +{ + vdev_t *mvd; + + if (vd->vdev_guid == guid) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != + NULL) + return (mvd); + + return (NULL); +} + +static int +vdev_count_leaves_impl(vdev_t *vd) +{ + int n = 0; + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (int c = 0; c < vd->vdev_children; c++) + n += vdev_count_leaves_impl(vd->vdev_child[c]); + + return (n); +} + +int +vdev_count_leaves(spa_t *spa) +{ + int rc; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + rc = vdev_count_leaves_impl(spa->spa_root_vdev); + spa_config_exit(spa, SCL_VDEV, FTAG); + + return (rc); +} + +void +vdev_add_child(vdev_t *pvd, vdev_t *cvd) +{ + size_t oldsize, newsize; + uint64_t id = cvd->vdev_id; + vdev_t **newchild; + + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(cvd->vdev_parent == NULL); + + cvd->vdev_parent = pvd; + + if (pvd == NULL) + return; + + ASSERT(id >= pvd->vdev_children || pvd->vdev_child[id] == NULL); + + oldsize = pvd->vdev_children * sizeof (vdev_t *); + pvd->vdev_children = MAX(pvd->vdev_children, id + 1); + newsize = pvd->vdev_children * sizeof (vdev_t *); + + newchild = kmem_alloc(newsize, KM_SLEEP); + if (pvd->vdev_child != NULL) { + bcopy(pvd->vdev_child, newchild, oldsize); + kmem_free(pvd->vdev_child, oldsize); + } + + pvd->vdev_child = newchild; + pvd->vdev_child[id] = cvd; + + cvd->vdev_top = (pvd->vdev_top ? pvd->vdev_top: cvd); + ASSERT(cvd->vdev_top->vdev_parent->vdev_parent == NULL); + + /* + * Walk up all ancestors to update guid sum. 
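Several of the size helpers above are pure power-of-two arithmetic: vdev_default_asize() rounds up to the top-level ashift, vdev_get_min_asize() rounds down to a whole metaslab, and the raidz case takes a ceiling division per child. A worked example follows; the ROUNDUP_P2/ALIGN_P2 macros are the usual forms for power-of-two alignments (the kernel's P2ROUNDUP/P2ALIGN are spelled differently but equivalent here):

#include <stdint.h>
#include <stdio.h>

#define ROUNDUP_P2(x, a)	(((x) + (a) - 1) & ~((a) - 1))
#define ALIGN_P2(x, a)		((x) & ~((a) - 1))

int
main(void)
{
	uint64_t ashift = 12;			/* 4 KiB sectors */
	uint64_t psize = 5000;			/* odd physical size */
	uint64_t ms_shift = 29;			/* 512 MiB metaslabs */
	uint64_t asize = (3ULL << 30) + 12345;	/* ~3 GiB allocatable */

	/* vdev_default_asize(): round psize up to a full sector */
	printf("asize for a 5000-byte psize: %llu\n",
	    (unsigned long long)ROUNDUP_P2(psize, 1ULL << ashift));

	/* vdev_get_min_asize() for a top-level vdev: round down to a
	 * whole metaslab so a slightly smaller replacement still fits */
	uint64_t pmin = ALIGN_P2(asize, 1ULL << ms_shift);
	printf("min_asize: %llu\n", (unsigned long long)pmin);

	/* raidz child share: ceiling of parent min_asize / children */
	uint64_t children = 5;
	printf("per-child: %llu\n",
	    (unsigned long long)((pmin + children - 1) / children));
	return (0);
}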
+ */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += cvd->vdev_guid_sum; + + if (cvd->vdev_ops->vdev_op_leaf) { + list_insert_head(&cvd->vdev_spa->spa_leaf_list, cvd); + cvd->vdev_spa->spa_leaf_list_gen++; + } +} + +void +vdev_remove_child(vdev_t *pvd, vdev_t *cvd) +{ + int c; + uint_t id = cvd->vdev_id; + + ASSERT(cvd->vdev_parent == pvd); + + if (pvd == NULL) + return; + + ASSERT(id < pvd->vdev_children); + ASSERT(pvd->vdev_child[id] == cvd); + + pvd->vdev_child[id] = NULL; + cvd->vdev_parent = NULL; + + for (c = 0; c < pvd->vdev_children; c++) + if (pvd->vdev_child[c]) + break; + + if (c == pvd->vdev_children) { + kmem_free(pvd->vdev_child, c * sizeof (vdev_t *)); + pvd->vdev_child = NULL; + pvd->vdev_children = 0; + } + + if (cvd->vdev_ops->vdev_op_leaf) { + spa_t *spa = cvd->vdev_spa; + list_remove(&spa->spa_leaf_list, cvd); + spa->spa_leaf_list_gen++; + } + + /* + * Walk up all ancestors to update guid sum. + */ + for (; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum -= cvd->vdev_guid_sum; +} + +/* + * Remove any holes in the child array. + */ +void +vdev_compact_children(vdev_t *pvd) +{ + vdev_t **newchild, *cvd; + int oldc = pvd->vdev_children; + int newc; + + ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (oldc == 0) + return; + + for (int c = newc = 0; c < oldc; c++) + if (pvd->vdev_child[c]) + newc++; + + if (newc > 0) { + newchild = kmem_zalloc(newc * sizeof (vdev_t *), KM_SLEEP); + + for (int c = newc = 0; c < oldc; c++) { + if ((cvd = pvd->vdev_child[c]) != NULL) { + newchild[newc] = cvd; + cvd->vdev_id = newc++; + } + } + } else { + newchild = NULL; + } + + kmem_free(pvd->vdev_child, oldc * sizeof (vdev_t *)); + pvd->vdev_child = newchild; + pvd->vdev_children = newc; +} + +/* + * Allocate and minimally initialize a vdev_t. + */ +vdev_t * +vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) +{ + vdev_t *vd; + vdev_indirect_config_t *vic; + + vd = kmem_zalloc(sizeof (vdev_t), KM_SLEEP); + vic = &vd->vdev_indirect_config; + + if (spa->spa_root_vdev == NULL) { + ASSERT(ops == &vdev_root_ops); + spa->spa_root_vdev = vd; + spa->spa_load_guid = spa_generate_guid(NULL); + } + + if (guid == 0 && ops != &vdev_hole_ops) { + if (spa->spa_root_vdev == vd) { + /* + * The root vdev's guid will also be the pool guid, + * which must be unique among all pools. + */ + guid = spa_generate_guid(NULL); + } else { + /* + * Any other vdev's guid must be unique within the pool. + */ + guid = spa_generate_guid(spa); + } + ASSERT(!spa_guid_exists(spa_guid(spa), guid)); + } + + vd->vdev_spa = spa; + vd->vdev_id = id; + vd->vdev_guid = guid; + vd->vdev_guid_sum = guid; + vd->vdev_ops = ops; + vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); + vic->vic_prev_indirect_vdev = UINT64_MAX; + + rw_init(&vd->vdev_indirect_rwlock, NULL, RW_DEFAULT, NULL); + mutex_init(&vd->vdev_obsolete_lock, NULL, MUTEX_DEFAULT, NULL); + vd->vdev_obsolete_segments = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); + + /* + * Initialize rate limit structs for events. We rate limit ZIO delay + * and checksum events so that we don't overwhelm ZED with thousands + * of events when a disk is acting up. 
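Note that zfs_ratelimit_init() (used just below) takes a pointer to the events-per-second tunable rather than its value, so later changes to the module parameter take effect immediately. Here is a minimal fixed-window limiter in the same spirit; the kernel implementation differs in detail, and all mini_ names are invented:

#include <stdio.h>
#include <time.h>

/* Allow up to *rl_burst events per rl_interval seconds, then drop
 * events until the window rolls over. */
typedef struct mini_rl {
	unsigned int *rl_burst;		/* pointer, so the tunable is live */
	unsigned int rl_interval;	/* seconds */
	unsigned int rl_count;
	time_t rl_start;
} mini_rl_t;

static int
mini_ratelimit(mini_rl_t *rl)
{
	time_t now = time(NULL);

	if (now - rl->rl_start >= rl->rl_interval) {
		rl->rl_start = now;
		rl->rl_count = 0;
	}
	if (rl->rl_count >= *rl->rl_burst)
		return (0);		/* suppress this event */
	rl->rl_count++;
	return (1);			/* deliver this event */
}

static unsigned int slow_io_events_per_second = 20;

int
main(void)
{
	mini_rl_t rl = { &slow_io_events_per_second, 1, 0, time(NULL) };
	int delivered = 0;

	for (int i = 0; i < 1000; i++)
		delivered += mini_ratelimit(&rl);
	/* at most ~40 even if the loop straddles a second boundary */
	printf("delivered %d of 1000\n", delivered);
	return (0);
}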
+ */ + zfs_ratelimit_init(&vd->vdev_delay_rl, &zfs_slow_io_events_per_second, + 1); + zfs_ratelimit_init(&vd->vdev_checksum_rl, + &zfs_checksum_events_per_second, 1); + + list_link_init(&vd->vdev_config_dirty_node); + list_link_init(&vd->vdev_state_dirty_node); + list_link_init(&vd->vdev_initialize_node); + list_link_init(&vd->vdev_leaf_node); + list_link_init(&vd->vdev_trim_node); + mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_NOLOCKDEP, NULL); + mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); + + mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_initialize_io_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&vd->vdev_trim_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_autotrim_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_trim_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_trim_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_autotrim_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_trim_io_cv, NULL, CV_DEFAULT, NULL); + + mutex_init(&vd->vdev_rebuild_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_rebuild_io_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_cv, NULL, CV_DEFAULT, NULL); + cv_init(&vd->vdev_rebuild_io_cv, NULL, CV_DEFAULT, NULL); + + for (int t = 0; t < DTL_TYPES; t++) { + vd->vdev_dtl[t] = range_tree_create(NULL, RANGE_SEG64, NULL, 0, + 0); + } + + txg_list_create(&vd->vdev_ms_list, spa, + offsetof(struct metaslab, ms_txg_node)); + txg_list_create(&vd->vdev_dtl_list, spa, + offsetof(struct vdev, vdev_dtl_node)); + vd->vdev_stat.vs_timestamp = gethrtime(); + vdev_queue_init(vd); + vdev_cache_init(vd); + + return (vd); +} + +/* + * Allocate a new vdev. The 'alloctype' is used to control whether we are + * creating a new vdev or loading an existing one - the behavior is slightly + * different for each case. + */ +int +vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, + int alloctype) +{ + vdev_ops_t *ops; + char *type; + uint64_t guid = 0, islog, nparity; + vdev_t *vd; + vdev_indirect_config_t *vic; + char *tmp = NULL; + int rc; + vdev_alloc_bias_t alloc_bias = VDEV_BIAS_NONE; + boolean_t top_level = (parent && !parent->vdev_parent); + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (SET_ERROR(EINVAL)); + + if ((ops = vdev_getops(type)) == NULL) + return (SET_ERROR(EINVAL)); + + /* + * If this is a load, get the vdev guid from the nvlist. + * Otherwise, vdev_alloc_common() will generate one for us. 
+ */ + if (alloctype == VDEV_ALLOC_LOAD) { + uint64_t label_id; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, &label_id) || + label_id != id) + return (SET_ERROR(EINVAL)); + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } else if (alloctype == VDEV_ALLOC_SPARE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } else if (alloctype == VDEV_ALLOC_L2CACHE) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (SET_ERROR(EINVAL)); + } + + /* + * The first allocated vdev must be of type 'root'. + */ + if (ops != &vdev_root_ops && spa->spa_root_vdev == NULL) + return (SET_ERROR(EINVAL)); + + /* + * Determine whether we're a log vdev. + */ + islog = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_LOG, &islog); + if (islog && spa_version(spa) < SPA_VERSION_SLOGS) + return (SET_ERROR(ENOTSUP)); + + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (SET_ERROR(ENOTSUP)); + + /* + * Set the nparity property for RAID-Z vdevs. + */ + nparity = -1ULL; + if (ops == &vdev_raidz_ops) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, + &nparity) == 0) { + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) + return (SET_ERROR(EINVAL)); + /* + * Previous versions could only support 1 or 2 parity + * device. + */ + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (SET_ERROR(ENOTSUP)); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) + return (SET_ERROR(ENOTSUP)); + } else { + /* + * We require the parity to be specified for SPAs that + * support multiple parity levels. + */ + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) + return (SET_ERROR(EINVAL)); + /* + * Otherwise, we default to 1 parity device for RAID-Z. + */ + nparity = 1; + } + } else { + nparity = 0; + } + ASSERT(nparity != -1ULL); + + /* + * If creating a top-level vdev, check for allocation classes input + */ + if (top_level && alloctype == VDEV_ALLOC_ADD) { + char *bias; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + &bias) == 0) { + alloc_bias = vdev_derive_alloc_bias(bias); + + /* spa_vdev_add() expects feature to be enabled */ + if (spa->spa_load_state != SPA_LOAD_CREATE && + !spa_feature_is_enabled(spa, + SPA_FEATURE_ALLOCATION_CLASSES)) { + return (SET_ERROR(ENOTSUP)); + } + } + } + + vd = vdev_alloc_common(spa, id, guid, ops); + vic = &vd->vdev_indirect_config; + + vd->vdev_islog = islog; + vd->vdev_nparity = nparity; + if (top_level && alloc_bias != VDEV_BIAS_NONE) + vd->vdev_alloc_bias = alloc_bias; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &vd->vdev_path) == 0) + vd->vdev_path = spa_strdup(vd->vdev_path); + + /* + * ZPOOL_CONFIG_AUX_STATE = "external" means we previously forced a + * fault on a vdev and want it to persist across imports (like with + * zpool offline -f). 
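The parity checks above encode three rules: nparity must lie in [1, VDEV_RAIDZ_MAXPARITY], higher parity needs a new enough SPA version, and pools new enough to support multiple parity levels must state nparity explicitly. Restated as a small decision function (the error constants and version numbers below are illustrative stand-ins, not the real SPA_VERSION_* values):

#include <stdio.h>

#define EINVAL_		22
#define ENOTSUP_	95
#define VER_RAIDZ2	10	/* hypothetical gate for 2-parity */
#define VER_RAIDZ3	17	/* hypothetical gate for 3-parity */
#define MAXPARITY	3

/* has_nparity says whether ZPOOL_CONFIG_NPARITY was in the nvlist. */
static int
check_nparity(int has_nparity, unsigned long nparity,
    unsigned long spa_version, unsigned long *out)
{
	if (has_nparity) {
		if (nparity == 0 || nparity > MAXPARITY)
			return (EINVAL_);
		if (nparity > 1 && spa_version < VER_RAIDZ2)
			return (ENOTSUP_);
		if (nparity > 2 && spa_version < VER_RAIDZ3)
			return (ENOTSUP_);
	} else {
		/* new-enough pools must spell the parity out */
		if (spa_version >= VER_RAIDZ2)
			return (EINVAL_);
		nparity = 1;	/* legacy default: single parity */
	}
	*out = nparity;
	return (0);
}

int
main(void)
{
	unsigned long np;

	printf("%d\n", check_nparity(1, 3, VER_RAIDZ3, &np));	/* 0 */
	printf("%d\n", check_nparity(1, 3, VER_RAIDZ2, &np));	/* ENOTSUP */
	printf("%d\n", check_nparity(0, 0, 5, &np));		/* 0, np=1 */
	return (0);
}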
+ */ + rc = nvlist_lookup_string(nv, ZPOOL_CONFIG_AUX_STATE, &tmp); + if (rc == 0 && tmp != NULL && strcmp(tmp, "external") == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; + vd->vdev_faulted = 1; + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_DEVID, &vd->vdev_devid) == 0) + vd->vdev_devid = spa_strdup(vd->vdev_devid); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, + &vd->vdev_physpath) == 0) + vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &vd->vdev_enc_sysfs_path) == 0) + vd->vdev_enc_sysfs_path = spa_strdup(vd->vdev_enc_sysfs_path); + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); + + /* + * Set the whole_disk property. If it's not specified, leave the value + * as -1. + */ + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &vd->vdev_wholedisk) != 0) + vd->vdev_wholedisk = -1ULL; + + ASSERT0(vic->vic_mapping_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + &vic->vic_mapping_object); + ASSERT0(vic->vic_births_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + &vic->vic_births_object); + ASSERT3U(vic->vic_prev_indirect_vdev, ==, UINT64_MAX); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + &vic->vic_prev_indirect_vdev); + + /* + * Look for the 'not present' flag. This will only be set if the device + * was not present at the time of import. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); + + /* + * Get the alignment requirement. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + + /* + * Retrieve the vdev creation time. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + + /* + * If we're a top-level vdev, try to load the allocation parameters. + */ + if (top_level && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + &vd->vdev_ms_array); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + &vd->vdev_ms_shift); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASIZE, + &vd->vdev_asize); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVING, + &vd->vdev_removing); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + &vd->vdev_top_zap); + } else { + ASSERT0(vd->vdev_top_zap); + } + + if (top_level && alloctype != VDEV_ALLOC_ATTACH) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + /* Note: metaslab_group_create() is now deferred */ + } + + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { + (void) nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_VDEV_LEAF_ZAP, &vd->vdev_leaf_zap); + } else { + ASSERT0(vd->vdev_leaf_zap); + } + + /* + * If we're a leaf vdev, try to load the DTL object and other state. 
+ */ + + if (vd->vdev_ops->vdev_op_leaf && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { + if (alloctype == VDEV_ALLOC_LOAD) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, + &vd->vdev_dtl_object); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, + &vd->vdev_unspare); + } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, + &vd->vdev_offline); + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, + &vd->vdev_resilver_txg); + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + &vd->vdev_rebuild_txg); + + if (nvlist_exists(nv, ZPOOL_CONFIG_RESILVER_DEFER)) + vdev_defer_resilver(vd); + + /* + * In general, when importing a pool we want to ignore the + * persistent fault state, as the diagnosis made on another + * system may not be valid in the current context. The only + * exception is if we forced a vdev to a persistently faulted + * state with 'zpool offline -f'. The persistent fault will + * remain across imports until cleared. + * + * Local vdevs will remain in the faulted state. + */ + if (spa_load_state(spa) == SPA_LOAD_OPEN || + spa_load_state(spa) == SPA_LOAD_IMPORT) { + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, + &vd->vdev_faulted); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, + &vd->vdev_degraded); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, + &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + else + vd->vdev_faulted = 0ULL; + } + } + } + + /* + * Add ourselves to the parent's list of children. + */ + vdev_add_child(parent, vd); + + *vdp = vd; + + return (0); +} + +void +vdev_free(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + + /* + * Scan queues are normally destroyed at the end of a scan. If the + * queue exists here, that implies the vdev is being removed while + * the scan is still running. + */ + if (vd->vdev_scan_io_queue != NULL) { + mutex_enter(&vd->vdev_scan_io_queue_lock); + dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); + vd->vdev_scan_io_queue = NULL; + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* + * vdev_free() implies closing the vdev first. This is simpler than + * trying to ensure complicated semantics for all callers. + */ + vdev_close(vd); + + ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); + + /* + * Free all children. + */ + for (int c = 0; c < vd->vdev_children; c++) + vdev_free(vd->vdev_child[c]); + + ASSERT(vd->vdev_child == NULL); + ASSERT(vd->vdev_guid_sum == vd->vdev_guid); + + /* + * Discard allocation state. + */ + if (vd->vdev_mg != NULL) { + vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; + } + + ASSERT0(vd->vdev_stat.vs_space); + ASSERT0(vd->vdev_stat.vs_dspace); + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * Remove this vdev from its parent's child list. 
+ */ + vdev_remove_child(vd->vdev_parent, vd); + + ASSERT(vd->vdev_parent == NULL); + ASSERT(!list_link_active(&vd->vdev_leaf_node)); + + /* + * Clean up vdev structure. + */ + vdev_queue_fini(vd); + vdev_cache_fini(vd); + + if (vd->vdev_path) + spa_strfree(vd->vdev_path); + if (vd->vdev_devid) + spa_strfree(vd->vdev_devid); + if (vd->vdev_physpath) + spa_strfree(vd->vdev_physpath); + + if (vd->vdev_enc_sysfs_path) + spa_strfree(vd->vdev_enc_sysfs_path); + + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); + + if (vd->vdev_isspare) + spa_spare_remove(vd); + if (vd->vdev_isl2cache) + spa_l2cache_remove(vd); + + txg_list_destroy(&vd->vdev_ms_list); + txg_list_destroy(&vd->vdev_dtl_list); + + mutex_enter(&vd->vdev_dtl_lock); + space_map_close(vd->vdev_dtl_sm); + for (int t = 0; t < DTL_TYPES; t++) { + range_tree_vacate(vd->vdev_dtl[t], NULL, NULL); + range_tree_destroy(vd->vdev_dtl[t]); + } + mutex_exit(&vd->vdev_dtl_lock); + + EQUIV(vd->vdev_indirect_births != NULL, + vd->vdev_indirect_mapping != NULL); + if (vd->vdev_indirect_births != NULL) { + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vdev_indirect_births_close(vd->vdev_indirect_births); + } + + if (vd->vdev_obsolete_sm != NULL) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + } + range_tree_destroy(vd->vdev_obsolete_segments); + rw_destroy(&vd->vdev_indirect_rwlock); + mutex_destroy(&vd->vdev_obsolete_lock); + + mutex_destroy(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_stat_lock); + mutex_destroy(&vd->vdev_probe_lock); + mutex_destroy(&vd->vdev_scan_io_queue_lock); + + mutex_destroy(&vd->vdev_initialize_lock); + mutex_destroy(&vd->vdev_initialize_io_lock); + cv_destroy(&vd->vdev_initialize_io_cv); + cv_destroy(&vd->vdev_initialize_cv); + + mutex_destroy(&vd->vdev_trim_lock); + mutex_destroy(&vd->vdev_autotrim_lock); + mutex_destroy(&vd->vdev_trim_io_lock); + cv_destroy(&vd->vdev_trim_cv); + cv_destroy(&vd->vdev_autotrim_cv); + cv_destroy(&vd->vdev_trim_io_cv); + + mutex_destroy(&vd->vdev_rebuild_lock); + mutex_destroy(&vd->vdev_rebuild_io_lock); + cv_destroy(&vd->vdev_rebuild_cv); + cv_destroy(&vd->vdev_rebuild_io_cv); + + zfs_ratelimit_fini(&vd->vdev_delay_rl); + zfs_ratelimit_fini(&vd->vdev_checksum_rl); + + if (vd == spa->spa_root_vdev) + spa->spa_root_vdev = NULL; + + kmem_free(vd, sizeof (vdev_t)); +} + +/* + * Transfer top-level vdev state from svd to tvd. 
+ */ +static void +vdev_top_transfer(vdev_t *svd, vdev_t *tvd) +{ + spa_t *spa = svd->vdev_spa; + metaslab_t *msp; + vdev_t *vd; + int t; + + ASSERT(tvd == tvd->vdev_top); + + tvd->vdev_pending_fastwrite = svd->vdev_pending_fastwrite; + tvd->vdev_ms_array = svd->vdev_ms_array; + tvd->vdev_ms_shift = svd->vdev_ms_shift; + tvd->vdev_ms_count = svd->vdev_ms_count; + tvd->vdev_top_zap = svd->vdev_top_zap; + + svd->vdev_ms_array = 0; + svd->vdev_ms_shift = 0; + svd->vdev_ms_count = 0; + svd->vdev_top_zap = 0; + + if (tvd->vdev_mg) + ASSERT3P(tvd->vdev_mg, ==, svd->vdev_mg); + tvd->vdev_mg = svd->vdev_mg; + tvd->vdev_ms = svd->vdev_ms; + + svd->vdev_mg = NULL; + svd->vdev_ms = NULL; + + if (tvd->vdev_mg != NULL) + tvd->vdev_mg->mg_vd = tvd; + + tvd->vdev_checkpoint_sm = svd->vdev_checkpoint_sm; + svd->vdev_checkpoint_sm = NULL; + + tvd->vdev_alloc_bias = svd->vdev_alloc_bias; + svd->vdev_alloc_bias = VDEV_BIAS_NONE; + + tvd->vdev_stat.vs_alloc = svd->vdev_stat.vs_alloc; + tvd->vdev_stat.vs_space = svd->vdev_stat.vs_space; + tvd->vdev_stat.vs_dspace = svd->vdev_stat.vs_dspace; + + svd->vdev_stat.vs_alloc = 0; + svd->vdev_stat.vs_space = 0; + svd->vdev_stat.vs_dspace = 0; + + /* + * State which may be set on a top-level vdev that's in the + * process of being removed. + */ + ASSERT0(tvd->vdev_indirect_config.vic_births_object); + ASSERT0(tvd->vdev_indirect_config.vic_mapping_object); + ASSERT3U(tvd->vdev_indirect_config.vic_prev_indirect_vdev, ==, -1ULL); + ASSERT3P(tvd->vdev_indirect_mapping, ==, NULL); + ASSERT3P(tvd->vdev_indirect_births, ==, NULL); + ASSERT3P(tvd->vdev_obsolete_sm, ==, NULL); + ASSERT0(tvd->vdev_removing); + ASSERT0(tvd->vdev_rebuilding); + tvd->vdev_removing = svd->vdev_removing; + tvd->vdev_rebuilding = svd->vdev_rebuilding; + tvd->vdev_rebuild_config = svd->vdev_rebuild_config; + tvd->vdev_indirect_config = svd->vdev_indirect_config; + tvd->vdev_indirect_mapping = svd->vdev_indirect_mapping; + tvd->vdev_indirect_births = svd->vdev_indirect_births; + range_tree_swap(&svd->vdev_obsolete_segments, + &tvd->vdev_obsolete_segments); + tvd->vdev_obsolete_sm = svd->vdev_obsolete_sm; + svd->vdev_indirect_config.vic_mapping_object = 0; + svd->vdev_indirect_config.vic_births_object = 0; + svd->vdev_indirect_config.vic_prev_indirect_vdev = -1ULL; + svd->vdev_indirect_mapping = NULL; + svd->vdev_indirect_births = NULL; + svd->vdev_obsolete_sm = NULL; + svd->vdev_removing = 0; + svd->vdev_rebuilding = 0; + + for (t = 0; t < TXG_SIZE; t++) { + while ((msp = txg_list_remove(&svd->vdev_ms_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_ms_list, msp, t); + while ((vd = txg_list_remove(&svd->vdev_dtl_list, t)) != NULL) + (void) txg_list_add(&tvd->vdev_dtl_list, vd, t); + if (txg_list_remove_this(&spa->spa_vdev_txg_list, svd, t)) + (void) txg_list_add(&spa->spa_vdev_txg_list, tvd, t); + } + + if (list_link_active(&svd->vdev_config_dirty_node)) { + vdev_config_clean(svd); + vdev_config_dirty(tvd); + } + + if (list_link_active(&svd->vdev_state_dirty_node)) { + vdev_state_clean(svd); + vdev_state_dirty(tvd); + } + + tvd->vdev_deflate_ratio = svd->vdev_deflate_ratio; + svd->vdev_deflate_ratio = 0; + + tvd->vdev_islog = svd->vdev_islog; + svd->vdev_islog = 0; + + dsl_scan_io_queue_vdev_xfer(svd, tvd); +} + +static void +vdev_top_update(vdev_t *tvd, vdev_t *vd) +{ + if (vd == NULL) + return; + + vd->vdev_top = tvd; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_top_update(tvd, vd->vdev_child[c]); +} + +/* + * Add a mirror/replacing vdev above an existing vdev. 
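After tree surgery, vdev_top_update() re-stamps vdev_top across the whole subtree with a plain pre-order recursion, the same walking shape used by vdev_lookup_by_guid() and vdev_count_leaves_impl(). The walk in isolation (mini_ names invented):

#include <stdio.h>

/* Skeleton vdev with only the fields the walk needs. */
typedef struct mini_vdev {
	struct mini_vdev *v_top;
	struct mini_vdev **v_child;
	int v_children;
	int v_id;
} mini_vdev_t;

/* Mirrors vdev_top_update(): stamp every vdev in the subtree with the
 * new top-level vdev after re-parenting. */
static void
mini_top_update(mini_vdev_t *tvd, mini_vdev_t *vd)
{
	if (vd == NULL)
		return;
	vd->v_top = tvd;
	for (int c = 0; c < vd->v_children; c++)
		mini_top_update(tvd, vd->v_child[c]);
}

int
main(void)
{
	mini_vdev_t leaf = { .v_id = 2 };
	mini_vdev_t *kids[] = { &leaf };
	mini_vdev_t top = { .v_child = kids, .v_children = 1, .v_id = 1 };

	mini_top_update(&top, &top);
	printf("leaf top = %d\n", leaf.v_top->v_id);	/* prints 1 */
	return (0);
}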
+ */ +vdev_t * +vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) +{ + spa_t *spa = cvd->vdev_spa; + vdev_t *pvd = cvd->vdev_parent; + vdev_t *mvd; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); + + mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; + mvd->vdev_max_asize = cvd->vdev_max_asize; + mvd->vdev_psize = cvd->vdev_psize; + mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; + mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; + mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; + + vdev_remove_child(pvd, cvd); + vdev_add_child(pvd, mvd); + cvd->vdev_id = mvd->vdev_children; + vdev_add_child(mvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (mvd == mvd->vdev_top) + vdev_top_transfer(cvd, mvd); + + return (mvd); +} + +/* + * Remove a 1-way mirror/replacing vdev from the tree. + */ +void +vdev_remove_parent(vdev_t *cvd) +{ + vdev_t *mvd = cvd->vdev_parent; + vdev_t *pvd = mvd->vdev_parent; + + ASSERT(spa_config_held(cvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + ASSERT(mvd->vdev_children == 1); + ASSERT(mvd->vdev_ops == &vdev_mirror_ops || + mvd->vdev_ops == &vdev_replacing_ops || + mvd->vdev_ops == &vdev_spare_ops); + cvd->vdev_ashift = mvd->vdev_ashift; + cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; + cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; + vdev_remove_child(mvd, cvd); + vdev_remove_child(pvd, mvd); + + /* + * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. + * Otherwise, we could have detached an offline device, and when we + * go to import the pool we'll think we have two top-level vdevs, + * instead of a different version of the same top-level vdev. + */ + if (mvd->vdev_top == mvd) { + uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; + cvd->vdev_guid += guid_delta; + cvd->vdev_guid_sum += guid_delta; + + /* + * If pool not set for autoexpand, we need to also preserve + * mvd's asize to prevent automatic expansion of cvd. + * Otherwise if we are adjusting the mirror by attaching and + * detaching children of non-uniform sizes, the mirror could + * autoexpand, unexpectedly requiring larger devices to + * re-establish the mirror. + */ + if (!cvd->vdev_spa->spa_autoexpand) + cvd->vdev_asize = mvd->vdev_asize; + } + cvd->vdev_id = mvd->vdev_id; + vdev_add_child(pvd, cvd); + vdev_top_update(cvd->vdev_top, cvd->vdev_top); + + if (cvd == cvd->vdev_top) + vdev_top_transfer(mvd, cvd); + + ASSERT(mvd->vdev_children == 0); + vdev_free(mvd); +} + +static void +vdev_metaslab_group_create(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + /* + * metaslab_group_create was delayed until allocation bias was available + */ + if (vd->vdev_mg == NULL) { + metaslab_class_t *mc; + + if (vd->vdev_islog && vd->vdev_alloc_bias == VDEV_BIAS_NONE) + vd->vdev_alloc_bias = VDEV_BIAS_LOG; + + ASSERT3U(vd->vdev_islog, ==, + (vd->vdev_alloc_bias == VDEV_BIAS_LOG)); + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + mc = spa_log_class(spa); + break; + case VDEV_BIAS_SPECIAL: + mc = spa_special_class(spa); + break; + case VDEV_BIAS_DEDUP: + mc = spa_dedup_class(spa); + break; + default: + mc = spa_normal_class(spa); + } + + vd->vdev_mg = metaslab_group_create(mc, vd, + spa->spa_alloc_count); + + /* + * The spa ashift values currently only reflect the + * general vdev classes. 
Class destination is late + * binding so ashift checking had to wait until now + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + mc == spa_normal_class(spa) && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + } +} + +int +vdev_metaslab_init(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + uint64_t m; + uint64_t oldc = vd->vdev_ms_count; + uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; + metaslab_t **mspp; + int error; + boolean_t expanding = (oldc != 0); + + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. + */ + if (vd->vdev_ms_shift == 0) + return (0); + + ASSERT(!vd->vdev_ishole); + + ASSERT(oldc <= newc); + + mspp = vmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); + + if (expanding) { + bcopy(vd->vdev_ms, mspp, oldc * sizeof (*mspp)); + vmem_free(vd->vdev_ms, oldc * sizeof (*mspp)); + } + + vd->vdev_ms = mspp; + vd->vdev_ms_count = newc; + for (m = oldc; m < newc; m++) { + uint64_t object = 0; + + /* + * vdev_ms_array may be 0 if we are creating the "fake" + * metaslabs for an indirect vdev for zdb's leak detection. + * See zdb_leak_init(). + */ + if (txg == 0 && vd->vdev_ms_array != 0) { + error = dmu_read(mos, vd->vdev_ms_array, + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); + if (error != 0) { + vdev_dbgmsg(vd, "unable to read the metaslab " + "array [error=%d]", error); + return (error); + } + } + +#ifndef _KERNEL + /* + * To accommodate zdb_leak_init() fake indirect + * metaslabs, we allocate a metaslab group for + * indirect vdevs which normally don't have one. + */ + if (vd->vdev_mg == NULL) { + ASSERT0(vdev_is_concrete(vd)); + vdev_metaslab_group_create(vd); + } +#endif + error = metaslab_init(vd->vdev_mg, m, object, txg, + &(vd->vdev_ms[m])); + if (error != 0) { + vdev_dbgmsg(vd, "metaslab_init failed [error=%d]", + error); + return (error); + } + } + + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + /* + * If the vdev is being removed we don't activate + * the metaslabs since we want to ensure that no new + * allocations are performed on this device. + */ + if (!expanding && !vd->vdev_removing) { + metaslab_group_activate(vd->vdev_mg); + } + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + + /* + * Regardless whether this vdev was just added or it is being + * expanded, the metaslab count has changed. Recalculate the + * block limit. + */ + spa_log_sm_set_blocklimit(spa); + + return (0); +} + +void +vdev_metaslab_fini(vdev_t *vd) +{ + if (vd->vdev_checkpoint_sm != NULL) { + ASSERT(spa_feature_is_active(vd->vdev_spa, + SPA_FEATURE_POOL_CHECKPOINT)); + space_map_close(vd->vdev_checkpoint_sm); + /* + * Even though we close the space map, we need to set its + * pointer to NULL. The reason is that vdev_metaslab_fini() + * may be called multiple times for certain operations + * (i.e. when destroying a pool) so we need to ensure that + * this clause never executes twice. This logic is similar + * to the one used for the vdev_ms clause below. 
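vdev_metaslab_init() grows the metaslab pointer array by allocating at the new size, copying the old entries, and freeing the old array; only the slots from oldc to newc are then initialized. The expansion step in isolation, along with the newc = asize >> ms_shift count it starts from (grow_ms_array is invented):

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Grow a pointer array from oldc to newc slots, preserving existing
 * entries; new slots start NULL and are filled one at a time. */
static void **
grow_ms_array(void **old, uint64_t oldc, uint64_t newc)
{
	void **new = calloc(newc, sizeof (void *));

	if (new == NULL)
		abort();
	if (oldc > 0) {
		memcpy(new, old, oldc * sizeof (void *));
		free(old);
	}
	return (new);
}

int
main(void)
{
	uint64_t asize = 1ULL << 40;	/* 1 TiB vdev */
	uint64_t ms_shift = 34;		/* 16 GiB metaslabs */
	uint64_t newc = asize >> ms_shift;

	void **ms = grow_ms_array(NULL, 0, newc);
	printf("%llu metaslabs\n", (unsigned long long)newc);	/* 64 */
	free(ms);
	return (0);
}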
+ */ + vd->vdev_checkpoint_sm = NULL; + } + + if (vd->vdev_ms != NULL) { + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + + uint64_t count = vd->vdev_ms_count; + for (uint64_t m = 0; m < count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + if (msp != NULL) + metaslab_fini(msp); + } + vmem_free(vd->vdev_ms, count * sizeof (metaslab_t *)); + vd->vdev_ms = NULL; + + vd->vdev_ms_count = 0; + + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) + ASSERT0(mg->mg_histogram[i]); + } + ASSERT0(vd->vdev_ms_count); + ASSERT3U(vd->vdev_pending_fastwrite, ==, 0); +} + +typedef struct vdev_probe_stats { + boolean_t vps_readable; + boolean_t vps_writeable; + int vps_flags; +} vdev_probe_stats_t; + +static void +vdev_probe_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; + vdev_probe_stats_t *vps = zio->io_private; + + ASSERT(vd->vdev_probe_zio != NULL); + + if (zio->io_type == ZIO_TYPE_READ) { + if (zio->io_error == 0) + vps->vps_readable = 1; + if (zio->io_error == 0 && spa_writeable(spa)) { + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, + zio->io_offset, zio->io_size, zio->io_abd, + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); + } else { + abd_free(zio->io_abd); + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + if (zio->io_error == 0) + vps->vps_writeable = 1; + abd_free(zio->io_abd); + } else if (zio->io_type == ZIO_TYPE_NULL) { + zio_t *pio; + zio_link_t *zl; + + vd->vdev_cant_read |= !vps->vps_readable; + vd->vdev_cant_write |= !vps->vps_writeable; + + if (vdev_readable(vd) && + (vdev_writeable(vd) || !spa_writeable(spa))) { + zio->io_error = 0; + } else { + ASSERT(zio->io_error != 0); + vdev_dbgmsg(vd, "failed probe"); + zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, + spa, vd, NULL, NULL, 0, 0); + zio->io_error = SET_ERROR(ENXIO); + } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = SET_ERROR(ENXIO); + + kmem_free(vps, sizeof (*vps)); + } +} + +/* + * Determine whether this device is accessible. + * + * Read and write to several known locations: the pad regions of each + * vdev label but the first, which we leave alone in case it contains + * a VTOC. + */ +zio_t * +vdev_probe(vdev_t *vd, zio_t *zio) +{ + spa_t *spa = vd->vdev_spa; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); + + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_TRYHARD; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. 
That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. + */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + /* + * We can't change the vdev state in this context, so we + * kick off an async task to do it on our behalf. + */ + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } + } + + if (zio != NULL) + zio_add_child(zio, pio); + + mutex_exit(&vd->vdev_probe_lock); + + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } + + for (int l = 1; l < VDEV_LABELS; l++) { + zio_nowait(zio_read_phys(pio, vd, + vdev_label_offset(vd->vdev_psize, l, + offsetof(vdev_label_t, vl_be)), VDEV_PAD_SIZE, + abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE), + ZIO_CHECKSUM_OFF, vdev_probe_done, vps, + ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); + } + + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); +} + +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +static boolean_t +vdev_uses_zvols(vdev_t *vd) +{ +#ifdef _KERNEL + if (zvol_is_zvol(vd->vdev_path)) + return (B_TRUE); +#endif + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (vdev_uses_zvols(vd)) { +retry_sync: + for (int c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + } else { + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + if (tq == NULL) + goto retry_sync; + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, + vd->vdev_child[c], TQ_SLEEP) != TASKQID_INVALID); + + taskq_destroy(tq); + } + + vd->vdev_nonrot = B_TRUE; + + for (int c = 0; c < children; c++) + vd->vdev_nonrot &= vd->vdev_child[c]->vdev_nonrot; +} + +/* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the "typical" blocksize. + * Even though SPA_MAXBLOCKSIZE changed, this algorithm can not change, + * otherwise it would inconsistently account for existing bp's. + */ +static void +vdev_set_deflate_ratio(vdev_t *vd) +{ + if (vd == vd->vdev_top && !vd->vdev_ishole && vd->vdev_ashift != 0) { + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); + } +} + +/* + * Prepare a virtual device for access. 
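vdev_open_children() opens children concurrently via a taskq but falls back to a serial loop when the taskq cannot be created (or when zvols force single-threaded opens). A userland sketch of that dispatch-with-fallback shape using POSIX threads (compile with -lpthread; mini_ names invented):

#include <pthread.h>
#include <stdio.h>

#define NCHILD 4

/* Stand-in for a vdev child: an id and its open result. */
typedef struct mini_child {
	int c_id;
	int c_open_error;
} mini_child_t;

static int
mini_open(mini_child_t *c)
{
	return (c->c_id == 2 ? 5 /* pretend EIO */ : 0);
}

static void *
open_child(void *arg)
{
	mini_child_t *c = arg;

	c->c_open_error = mini_open(c);
	return (NULL);
}

int
main(void)
{
	mini_child_t kids[NCHILD];
	pthread_t tids[NCHILD];
	int created = 0;

	for (int c = 0; c < NCHILD; c++)
		kids[c].c_id = c;
	/* dispatch as many parallel opens as we can */
	for (int c = 0; c < NCHILD; c++) {
		if (pthread_create(&tids[c], NULL, open_child, &kids[c]) != 0)
			break;
		created++;
	}
	for (int c = 0; c < created; c++)
		pthread_join(tids[c], NULL);
	for (int c = created; c < NCHILD; c++)	/* serial fallback */
		open_child(&kids[c]);

	for (int c = 0; c < NCHILD; c++)
		printf("child %d: error %d\n", c, kids[c].c_open_error);
	return (0);
}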
+ */ +int +vdev_open(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + int error; + uint64_t osize = 0; + uint64_t max_osize = 0; + uint64_t asize, max_asize, psize; + uint64_t logical_ashift = 0; + uint64_t physical_ashift = 0; + + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || + vd->vdev_state == VDEV_STATE_CANT_OPEN || + vd->vdev_state == VDEV_STATE_OFFLINE); + + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); + + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ + if (!vd->vdev_removed && vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (SET_ERROR(ENXIO)); + } else if (vd->vdev_offline) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, VDEV_AUX_NONE); + return (SET_ERROR(ENXIO)); + } + + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, + &logical_ashift, &physical_ashift); + /* + * Physical volume size should never be larger than its max size, unless + * the disk has shrunk while we were reading it or the device is buggy + * or damaged: either way it's not safe for use, bail out of the open. + */ + if (osize > max_osize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_OPEN_FAILED); + return (SET_ERROR(ENXIO)); + } + + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. + */ + vd->vdev_reopening = B_FALSE; + if (zio_injection_enabled && error == 0) + error = zio_handle_device_injection(vd, NULL, SET_ERROR(ENXIO)); + + if (error) { + if (vd->vdev_removed && + vd->vdev_stat.vs_aux != VDEV_AUX_OPEN_FAILED) + vd->vdev_removed = B_FALSE; + + if (vd->vdev_stat.vs_aux == VDEV_AUX_CHILDREN_OFFLINE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_OFFLINE, + vd->vdev_stat.vs_aux); + } else { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + vd->vdev_stat.vs_aux); + } + return (error); + } + + vd->vdev_removed = B_FALSE; + + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. + */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (SET_ERROR(ENXIO)); + } + + if (vd->vdev_degraded) { + ASSERT(vd->vdev_children == 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_ERR_EXCEEDED); + } else { + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); + } + + /* + * For hole or missing vdevs we just return success. 
+ */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + + for (int c = 0; c < vd->vdev_children; c++) { + if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, + VDEV_AUX_NONE); + break; + } + } + + osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); + max_osize = P2ALIGN(max_osize, (uint64_t)sizeof (vdev_label_t)); + + if (vd->vdev_children == 0) { + if (osize < SPA_MINDEVSIZE) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (SET_ERROR(EOVERFLOW)); + } + psize = osize; + asize = osize - (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE); + max_asize = max_osize - (VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE); + } else { + if (vd->vdev_parent != NULL && osize < SPA_MINDEVSIZE - + (VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE)) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_TOO_SMALL); + return (SET_ERROR(EOVERFLOW)); + } + psize = 0; + asize = osize; + max_asize = max_osize; + } + + /* + * If the vdev was expanded, record this so that we can re-create the + * uberblock rings in labels {2,3}, during the next sync. + */ + if ((psize > vd->vdev_psize) && (vd->vdev_psize != 0)) + vd->vdev_copy_uberblocks = B_TRUE; + + vd->vdev_psize = psize; + + /* + * Make sure the allocatable size hasn't shrunk too much. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (SET_ERROR(EINVAL)); + } + + vd->vdev_physical_ashift = + MAX(physical_ashift, vd->vdev_physical_ashift); + vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); + vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); + + if (vd->vdev_logical_ashift > ASHIFT_MAX) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_ASHIFT_TOO_BIG); + return (SET_ERROR(EDOM)); + } + + if (vd->vdev_asize == 0) { + /* + * This is the first-ever open, so use the computed values. + * For compatibility, a different ashift can be requested. + */ + vd->vdev_asize = asize; + vd->vdev_max_asize = max_asize; + if (vd->vdev_ashift != 0 && (vd->vdev_ashift < ASHIFT_MIN || + vd->vdev_ashift > ASHIFT_MAX)) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_ASHIFT); + return (SET_ERROR(EDOM)); + } + } else { + /* + * Make sure the alignment required hasn't increased. + */ + if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && + vd->vdev_ops->vdev_op_leaf) { + zfs_ereport_post(FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT, + spa, vd, NULL, NULL, 0, 0); + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (SET_ERROR(EDOM)); + + } + vd->vdev_max_asize = max_asize; + } + + /* + * If all children are healthy we update asize if either: + * The asize has increased, due to a device expansion caused by dynamic + * LUN growth or vdev replacement, and automatic expansion is enabled; + * making the additional space available. + * + * The asize has decreased, due to a device shrink usually caused by a + * vdev replace with a smaller device. This ensures that calculations + * based of max_asize and asize e.g. esize are always valid. It's safe + * to do this as we've already validated that asize is greater than + * vdev_min_asize. 
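For a leaf, the asize computation above subtracts the front and back label regions from the device size. Assuming the usual on-disk constants (a 256 KiB vdev_label_t, two labels plus a 3.5 MiB boot region at the front, two labels at the back; values hedged, for illustration), the overhead works out to 4.5 MiB per leaf:

#include <stdint.h>
#include <stdio.h>

#define LABEL_SIZE	(256ULL << 10)		/* 256 KiB per label */
#define BOOT_SIZE	(7ULL << 19)		/* 3.5 MiB boot region */
#define LABEL_START	(2 * LABEL_SIZE + BOOT_SIZE)
#define LABEL_END	(2 * LABEL_SIZE)

int
main(void)
{
	uint64_t osize = 100ULL << 30;		/* 100 GiB leaf device */

	/* mirrors the leaf branch of vdev_open(): trim both ends */
	uint64_t asize = osize - (LABEL_START + LABEL_END);
	printf("allocatable: %llu bytes (%.2f GiB)\n",
	    (unsigned long long)asize, asize / (1024.0 * 1024 * 1024));
	return (0);
}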
+ */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + ((asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) || + (asize < vd->vdev_asize))) + vd->vdev_asize = asize; + + vdev_set_min_asize(vd); + + /* + * Ensure we can issue some IO before declaring the + * vdev open for business. + */ + if (vd->vdev_ops->vdev_op_leaf && + (error = zio_wait(vdev_probe(vd, NULL))) != 0) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + VDEV_AUX_ERR_EXCEEDED); + return (error); + } + + /* + * Track the min and max ashift values for normal data devices. + */ + if (vd->vdev_top == vd && vd->vdev_ashift != 0 && + vd->vdev_alloc_bias == VDEV_BIAS_NONE && + vd->vdev_islog == 0 && vd->vdev_aux == NULL) { + if (vd->vdev_ashift > spa->spa_max_ashift) + spa->spa_max_ashift = vd->vdev_ashift; + if (vd->vdev_ashift < spa->spa_min_ashift) + spa->spa_min_ashift = vd->vdev_ashift; + } + + /* + * If this is a leaf vdev, assess whether a resilver is needed. + * But don't do this if we are doing a reopen for a scrub, since + * this would just restart the scrub we are already doing. + */ + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen) + dsl_scan_assess_vdev(spa->spa_dsl_pool, vd); + + return (0); +} + +/* + * Called once the vdevs are all opened, this routine validates the label + * contents. This needs to be done before vdev_load() so that we don't + * inadvertently do repair I/Os to the wrong device. + * + * This function will only return failure if one of the vdevs indicates that it + * has since been destroyed or exported. This is only possible if + * /etc/zfs/zpool.cache was readonly at the time. Otherwise, the vdev state + * will be updated but the function will return 0. + */ +int +vdev_validate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *label; + uint64_t guid = 0, aux_guid = 0, top_guid; + uint64_t state; + nvlist_t *nvl; + uint64_t txg; + + if (vdev_validate_skip) + return (0); + + for (uint64_t c = 0; c < vd->vdev_children; c++) + if (vdev_validate(vd->vdev_child[c]) != 0) + return (SET_ERROR(EBADF)); + + /* + * If the device has already failed, or was marked offline, don't do + * any further validation. Otherwise, label I/O will fail and we will + * overwrite the previous state. + */ + if (!vd->vdev_ops->vdev_op_leaf || !vdev_readable(vd)) + return (0); + + /* + * If we are performing an extreme rewind, we allow for a label that + * was modified at a point after the current txg. + * If config lock is not held do not check for the txg. spa_sync could + * be updating the vdev's label before updating spa_last_synced_txg. + */ + if (spa->spa_extreme_rewind || spa_last_synced_txg(spa) == 0 || + spa_config_held(spa, SCL_CONFIG, RW_WRITER) != SCL_CONFIG) + txg = UINT64_MAX; + else + txg = spa_last_synced_txg(spa); + + if ((label = vdev_label_read_config(vd, txg)) == NULL) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + vdev_dbgmsg(vd, "vdev_validate: failed reading config for " + "txg %llu", (u_longlong_t)txg); + return (0); + } + + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. 
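+	 * (The split operation stamps the labels of the devices it takes
+	 * with the originating pool's guid in ZPOOL_CONFIG_SPLIT_GUID,
+	 * so finding our own guid there means this device now belongs
+	 * to the pool that was split off.)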
+ */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: vdev split into other pool"); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_POOL_GUID); + return (0); + } + + /* + * If config is not trusted then ignore the spa guid check. This is + * necessary because if the machine crashed during a re-guid the new + * guid might have been written to all of the vdev labels, but not the + * cached config. The check will be performed again once we have the + * trusted config from the MOS. + */ + if (spa->spa_trust_config && guid != spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: vdev label pool_guid doesn't " + "match config (%llu != %llu)", (u_longlong_t)guid, + (u_longlong_t)spa_guid(spa)); + return (0); + } + + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_GUID); + return (0); + } + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) + != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + nvlist_free(label); + vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label", + ZPOOL_CONFIG_TOP_GUID); + return (0); + } + + /* + * If this vdev just became a top-level vdev because its sibling was + * detached, it will have adopted the parent's vdev guid -- but the + * label may or may not be on disk yet. Fortunately, either version + * of the label will have the same top guid, so if we're a top-level + * vdev, we can safely compare to that instead. + * However, if the config comes from a cachefile that failed to update + * after the detach, a top-level vdev will appear as a non top-level + * vdev in the config. Also relax the constraints if we perform an + * extreme rewind. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. 
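+	 *
+	 * Example: given mirror-0 (guid M) of disks A and B, detaching B
+	 * leaves A as the top-level vdev and A adopts guid M.  A label
+	 * written before the detach still carries A's old leaf guid, but
+	 * both label versions carry top_guid == M, so comparing against
+	 * top_guid recognizes the device either way.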
+	 */
+	if (vd->vdev_guid != guid && vd->vdev_guid != aux_guid) {
+		boolean_t mismatch = B_FALSE;
+		if (spa->spa_trust_config && !spa->spa_extreme_rewind) {
+			if (vd != vd->vdev_top || vd->vdev_guid != top_guid)
+				mismatch = B_TRUE;
+		} else {
+			if (vd->vdev_guid != top_guid &&
+			    vd->vdev_top->vdev_guid != guid)
+				mismatch = B_TRUE;
+		}
+
+		if (mismatch) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			nvlist_free(label);
+			vdev_dbgmsg(vd, "vdev_validate: config guid "
+			    "doesn't match label guid");
+			vdev_dbgmsg(vd, "CONFIG: guid %llu, top_guid %llu",
+			    (u_longlong_t)vd->vdev_guid,
+			    (u_longlong_t)vd->vdev_top->vdev_guid);
+			vdev_dbgmsg(vd, "LABEL: guid %llu, top_guid %llu, "
+			    "aux_guid %llu", (u_longlong_t)guid,
+			    (u_longlong_t)top_guid, (u_longlong_t)aux_guid);
+			return (0);
+		}
+	}
+
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE,
+	    &state) != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		nvlist_free(label);
+		vdev_dbgmsg(vd, "vdev_validate: '%s' missing from label",
+		    ZPOOL_CONFIG_POOL_STATE);
+		return (0);
+	}
+
+	nvlist_free(label);
+
+	/*
+	 * If this is a verbatim import, no need to check the
+	 * state of the pool.
+	 */
+	if (!(spa->spa_import_flags & ZFS_IMPORT_VERBATIM) &&
+	    spa_load_state(spa) == SPA_LOAD_OPEN &&
+	    state != POOL_STATE_ACTIVE) {
+		vdev_dbgmsg(vd, "vdev_validate: invalid pool state (%llu) "
+		    "for spa %s", (u_longlong_t)state, spa->spa_name);
+		return (SET_ERROR(EBADF));
+	}
+
+	/*
+	 * If we were able to open and validate a vdev that was
+	 * previously marked permanently unavailable, clear that state
+	 * now.
+	 */
+	if (vd->vdev_not_present)
+		vd->vdev_not_present = 0;
+
+	return (0);
+}
+
+static void
+vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd)
+{
+	if (svd->vdev_path != NULL && dvd->vdev_path != NULL) {
+		if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) {
+			zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed "
+			    "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid,
+			    dvd->vdev_path, svd->vdev_path);
+			spa_strfree(dvd->vdev_path);
+			dvd->vdev_path = spa_strdup(svd->vdev_path);
+		}
+	} else if (svd->vdev_path != NULL) {
+		dvd->vdev_path = spa_strdup(svd->vdev_path);
+		zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'",
+		    (u_longlong_t)dvd->vdev_guid, dvd->vdev_path);
+	}
+}
+
+/*
+ * Recursively copy vdev paths from one vdev to another.  Source and
+ * destination vdev trees must have the same geometry, otherwise an error
+ * is returned.  Intended to copy paths from the userland config into the
+ * MOS config.
+ */ +int +vdev_copy_path_strict(vdev_t *svd, vdev_t *dvd) +{ + if ((svd->vdev_ops == &vdev_missing_ops) || + (svd->vdev_ishole && dvd->vdev_ishole) || + (dvd->vdev_ops == &vdev_indirect_ops)) + return (0); + + if (svd->vdev_ops != dvd->vdev_ops) { + vdev_dbgmsg(svd, "vdev_copy_path: vdev type mismatch: %s != %s", + svd->vdev_ops->vdev_op_type, dvd->vdev_ops->vdev_op_type); + return (SET_ERROR(EINVAL)); + } + + if (svd->vdev_guid != dvd->vdev_guid) { + vdev_dbgmsg(svd, "vdev_copy_path: guids mismatch (%llu != " + "%llu)", (u_longlong_t)svd->vdev_guid, + (u_longlong_t)dvd->vdev_guid); + return (SET_ERROR(EINVAL)); + } + + if (svd->vdev_children != dvd->vdev_children) { + vdev_dbgmsg(svd, "vdev_copy_path: children count mismatch: " + "%llu != %llu", (u_longlong_t)svd->vdev_children, + (u_longlong_t)dvd->vdev_children); + return (SET_ERROR(EINVAL)); + } + + for (uint64_t i = 0; i < svd->vdev_children; i++) { + int error = vdev_copy_path_strict(svd->vdev_child[i], + dvd->vdev_child[i]); + if (error != 0) + return (error); + } + + if (svd->vdev_ops->vdev_op_leaf) + vdev_copy_path_impl(svd, dvd); + + return (0); +} + +static void +vdev_copy_path_search(vdev_t *stvd, vdev_t *dvd) +{ + ASSERT(stvd->vdev_top == stvd); + ASSERT3U(stvd->vdev_id, ==, dvd->vdev_top->vdev_id); + + for (uint64_t i = 0; i < dvd->vdev_children; i++) { + vdev_copy_path_search(stvd, dvd->vdev_child[i]); + } + + if (!dvd->vdev_ops->vdev_op_leaf || !vdev_is_concrete(dvd)) + return; + + /* + * The idea here is that while a vdev can shift positions within + * a top vdev (when replacing, attaching mirror, etc.) it cannot + * step outside of it. + */ + vdev_t *vd = vdev_lookup_by_guid(stvd, dvd->vdev_guid); + + if (vd == NULL || vd->vdev_ops != dvd->vdev_ops) + return; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + vdev_copy_path_impl(vd, dvd); +} + +/* + * Recursively copy vdev paths from one root vdev to another. Source and + * destination vdev trees may differ in geometry. For each destination leaf + * vdev, search a vdev with the same guid and top vdev id in the source. + * Intended to copy paths from userland config into MOS config. + */ +void +vdev_copy_path_relaxed(vdev_t *srvd, vdev_t *drvd) +{ + uint64_t children = MIN(srvd->vdev_children, drvd->vdev_children); + ASSERT(srvd->vdev_ops == &vdev_root_ops); + ASSERT(drvd->vdev_ops == &vdev_root_ops); + + for (uint64_t i = 0; i < children; i++) { + vdev_copy_path_search(srvd->vdev_child[i], + drvd->vdev_child[i]); + } +} + +/* + * Close a virtual device. + */ +void +vdev_close(vdev_t *vd) +{ + vdev_t *pvd = vd->vdev_parent; + spa_t *spa __maybe_unused = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. + */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + + vd->vdev_ops->vdev_op_close(vd); + + vdev_cache_purge(vd); + + /* + * We record the previous state before we close it, so that if we are + * doing a reopen(), we don't generate FMA ereports if we notice that + * it's still faulted. 
+ */ + vd->vdev_prevstate = vd->vdev_state; + + if (vd->vdev_offline) + vd->vdev_state = VDEV_STATE_OFFLINE; + else + vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; +} + +void +vdev_hold(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_is_root(spa)); + if (spa->spa_state == POOL_STATE_UNINITIALIZED) + return; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_hold(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_hold(vd); +} + +void +vdev_rele(vdev_t *vd) +{ + ASSERT(spa_is_root(vd->vdev_spa)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_rele(vd->vdev_child[c]); + + if (vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_rele(vd); +} + +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ +void +vdev_reopen(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; + vdev_close(vd); + (void) vdev_open(vd); + + /* + * Call vdev_validate() here to make sure we have the same device. + * Otherwise, a device with an invalid label could be successfully + * opened in response to vdev_reopen(). + */ + if (vd->vdev_aux) { + (void) vdev_validate_aux(vd); + if (vdev_readable(vd) && vdev_writeable(vd) && + vd->vdev_aux == &spa->spa_l2cache) { + /* + * In case the vdev is present we should evict all ARC + * buffers and pointers to log blocks and reclaim their + * space before restoring its contents to L2ARC. + */ + if (l2arc_vdev_present(vd)) { + l2arc_rebuild_vdev(vd, B_TRUE); + } else { + l2arc_add_vdev(spa, vd); + } + spa_async_request(spa, SPA_ASYNC_L2CACHE_REBUILD); + spa_async_request(spa, SPA_ASYNC_L2CACHE_TRIM); + } + } else { + (void) vdev_validate(vd); + } + + /* + * Reassess parent vdev's health. + */ + vdev_propagate_state(vd); +} + +int +vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) +{ + int error; + + /* + * Normally, partial opens (e.g. of a mirror) are allowed. + * For a create, however, we want to fail the request if + * there are any components we can't open. + */ + error = vdev_open(vd); + + if (error || vd->vdev_state != VDEV_STATE_HEALTHY) { + vdev_close(vd); + return (error ? error : SET_ERROR(ENXIO)); + } + + /* + * Recursively load DTLs and initialize all labels. + */ + if ((error = vdev_dtl_load(vd)) != 0 || + (error = vdev_label_init(vd, txg, isreplacing ? + VDEV_LABEL_REPLACE : VDEV_LABEL_CREATE)) != 0) { + vdev_close(vd); + return (error); + } + + return (0); +} + +void +vdev_metaslab_set_size(vdev_t *vd) +{ + uint64_t asize = vd->vdev_asize; + uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; + uint64_t ms_shift; + + /* + * There are two dimensions to the metaslab sizing calculation: + * the size of the metaslab and the count of metaslabs per vdev. + * + * The default values used below are a good balance between memory + * usage (larger metaslab size means more memory needed for loaded + * metaslabs; more metaslabs means more memory needed for the + * metaslab_t structs), metaslab load time (larger metaslabs take + * longer to load), and metaslab sync time (more metaslabs means + * more time spent syncing all of them). 
+	 *
+	 * In general, we aim for zfs_vdev_default_ms_count (200) metaslabs.
+	 * The ranges of the dimensions are as follows:
+	 *
+	 *	2^29 <= ms_size  <= 2^34
+	 *	  16 <= ms_count <= 131,072
+	 *
+	 * On the lower end of vdev sizes, we aim for metaslab sizes of
+	 * at least 512MB (2^29) to minimize fragmentation effects when
+	 * testing with smaller devices.  However, the count constraint
+	 * of at least 16 metaslabs will override this minimum size goal.
+	 *
+	 * On the upper end of vdev sizes, we aim for a maximum metaslab
+	 * size of 16GB.  However, we will cap the total count to 2^17
+	 * metaslabs to keep our memory footprint in check and let the
+	 * metaslab size grow from there if that limit is hit.
+	 *
+	 * The net effect of applying the above constraints is summarized
+	 * below.
+	 *
+	 *	vdev size	metaslab count
+	 *	--------------|-----------------
+	 *	< 8GB		~16
+	 *	8GB - 100GB	one per 512MB
+	 *	100GB - 3TB	~200
+	 *	3TB - 2PB	one per 16GB
+	 *	> 2PB		~131,072
+	 *	--------------------------------
+	 *
+	 * Finally, note that all of the above calculate the initial
+	 * number of metaslabs.  Expanding a top-level vdev will result
+	 * in additional metaslabs being allocated, making it possible
+	 * to exceed the zfs_vdev_ms_count_limit.
+	 */
+
+	if (ms_count < zfs_vdev_min_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_min_ms_count);
+	else if (ms_count > zfs_vdev_default_ms_count)
+		ms_shift = highbit64(asize / zfs_vdev_default_ms_count);
+	else
+		ms_shift = zfs_vdev_default_ms_shift;
+
+	if (ms_shift < SPA_MAXBLOCKSHIFT) {
+		ms_shift = SPA_MAXBLOCKSHIFT;
+	} else if (ms_shift > zfs_vdev_max_ms_shift) {
+		ms_shift = zfs_vdev_max_ms_shift;
+		/* cap the total count to constrain memory footprint */
+		if ((asize >> ms_shift) > zfs_vdev_ms_count_limit)
+			ms_shift = highbit64(asize / zfs_vdev_ms_count_limit);
+	}
+
+	vd->vdev_ms_shift = ms_shift;
+	ASSERT3U(vd->vdev_ms_shift, >=, SPA_MAXBLOCKSHIFT);
+}
+
+/*
+ * Maximize performance by inflating the configured ashift for top level
+ * vdevs to be as close to the physical ashift as possible while maintaining
+ * administrator defined limits and ensuring it doesn't go below the
+ * logical ashift.
+ */
+void
+vdev_ashift_optimize(vdev_t *vd)
+{
+	if (vd == vd->vdev_top) {
+		if (vd->vdev_ashift < vd->vdev_physical_ashift) {
+			vd->vdev_ashift = MIN(
+			    MAX(zfs_vdev_max_auto_ashift, vd->vdev_ashift),
+			    MAX(zfs_vdev_min_auto_ashift,
+			    vd->vdev_physical_ashift));
+		} else {
+			/*
+			 * Unusual case where logical ashift > physical ashift
+			 * so we can't cap the calculated ashift based on max
+			 * ashift as that would cause failures.
+			 * We still check if we need to increase it to match
+			 * the min ashift.
+			 */
+			vd->vdev_ashift = MAX(zfs_vdev_min_auto_ashift,
+			    vd->vdev_ashift);
+		}
+	}
+}
+
+void
+vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg)
+{
+	ASSERT(vd == vd->vdev_top);
+	/* indirect vdevs don't have metaslabs or dtls */
+	ASSERT(vdev_is_concrete(vd) || flags == 0);
+	ASSERT(ISP2(flags));
+	ASSERT(spa_writeable(vd->vdev_spa));
+
+	if (flags & VDD_METASLAB)
+		(void) txg_list_add(&vd->vdev_ms_list, arg, txg);
+
+	if (flags & VDD_DTL)
+		(void) txg_list_add(&vd->vdev_dtl_list, arg, txg);
+
+	(void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg);
+}
+
+void
+vdev_dirty_leaves(vdev_t *vd, int flags, uint64_t txg)
+{
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_dirty_leaves(vd->vdev_child[c], flags, txg);
+
+	if (vd->vdev_ops->vdev_op_leaf)
+		vdev_dirty(vd->vdev_top, flags, vd, txg);
+}
+
+/*
+ * DTLs.
+ *
+ * A vdev's DTL (dirty time log) is the set of transaction groups for which
+ * the vdev has less than perfect replication.  There are four kinds of DTL:
+ *
+ * DTL_MISSING: txgs for which the vdev has no valid copies of the data
+ *
+ * DTL_PARTIAL: txgs for which data is available, but not fully replicated
+ *
+ * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon
+ *	scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of
+ *	txgs that was scrubbed.
+ *
+ * DTL_OUTAGE: txgs which cannot currently be read, whether due to
+ *	persistent errors or just some device being offline.
+ *	Unlike the other three, the DTL_OUTAGE map is not generally
+ *	maintained; it's only computed when needed, typically to
+ *	determine whether a device can be detached.
+ *
+ * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device
+ * either has the data or it doesn't.
+ *
+ * For interior vdevs such as mirror and RAID-Z the picture is more complex.
+ * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because
+ * if any child is less than fully replicated, then so is its parent.
+ * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs,
+ * comprising only those txgs which appear in 'maxfaults' or more children;
+ * those are the txgs we don't have enough replication to read.  For example,
+ * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2);
+ * thus, its DTL_MISSING consists of the set of txgs that appear in more than
+ * two child DTL_MISSING maps.
+ *
+ * It should be clear from the above that to compute the DTLs and outage maps
+ * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps.
+ * Therefore, that is all we keep on disk.  When loading the pool, or after
+ * a configuration change, we generate all other DTLs from first principles.
+ */
+void
+vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+	range_tree_t *rt = vd->vdev_dtl[t];
+
+	ASSERT(t < DTL_TYPES);
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+	ASSERT(spa_writeable(vd->vdev_spa));
+
+	mutex_enter(&vd->vdev_dtl_lock);
+	if (!range_tree_contains(rt, txg, size))
+		range_tree_add(rt, txg, size);
+	mutex_exit(&vd->vdev_dtl_lock);
+}
+
+boolean_t
+vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size)
+{
+	range_tree_t *rt = vd->vdev_dtl[t];
+	boolean_t dirty = B_FALSE;
+
+	ASSERT(t < DTL_TYPES);
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+	/*
+	 * While we are loading the pool, the DTLs have not been loaded yet.
+	 * Ignore the DTLs and try all devices.  This avoids a recursive
+	 * mutex enter on the vdev_dtl_lock, and also makes us try hard
+	 * when loading the pool (relying on the checksum to ensure that
+	 * we get the right data -- note that while loading, we are
+	 * only reading the MOS, which is always checksummed).
+	 */
+	if (vd->vdev_spa->spa_load_state != SPA_LOAD_NONE)
+		return (B_FALSE);
+
+	mutex_enter(&vd->vdev_dtl_lock);
+	if (!range_tree_is_empty(rt))
+		dirty = range_tree_contains(rt, txg, size);
+	mutex_exit(&vd->vdev_dtl_lock);
+
+	return (dirty);
+}
+
+boolean_t
+vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t)
+{
+	range_tree_t *rt = vd->vdev_dtl[t];
+	boolean_t empty;
+
+	mutex_enter(&vd->vdev_dtl_lock);
+	empty = range_tree_is_empty(rt);
+	mutex_exit(&vd->vdev_dtl_lock);
+
+	return (empty);
+}
+
+/*
+ * Returns B_TRUE if the vdev determines the offset needs to be resilvered.
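+ * (Vdev types that do not implement the optional need_resilver
+ * callback, and all leaf vdevs, conservatively answer B_TRUE below
+ * and let the resilver rewrite the block.)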
+ */
+boolean_t
+vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize)
+{
+	ASSERT(vd != vd->vdev_spa->spa_root_vdev);
+
+	if (vd->vdev_ops->vdev_op_need_resilver == NULL ||
+	    vd->vdev_ops->vdev_op_leaf)
+		return (B_TRUE);
+
+	return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize));
+}
+
+/*
+ * Returns the lowest txg in the DTL range.
+ */
+static uint64_t
+vdev_dtl_min(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+	ASSERT0(vd->vdev_children);
+
+	return (range_tree_min(vd->vdev_dtl[DTL_MISSING]) - 1);
+}
+
+/*
+ * Returns the highest txg in the DTL.
+ */
+static uint64_t
+vdev_dtl_max(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&vd->vdev_dtl_lock));
+	ASSERT3U(range_tree_space(vd->vdev_dtl[DTL_MISSING]), !=, 0);
+	ASSERT0(vd->vdev_children);
+
+	return (range_tree_max(vd->vdev_dtl[DTL_MISSING]));
+}
+
+/*
+ * Determine if a resilvering vdev should remove any DTL entries from
+ * its range.  If the vdev was resilvering for the entire duration of the
+ * scan then it should excise that range from its DTLs.  Otherwise, this
+ * vdev is considered partially resilvered and should leave its DTL
+ * entries intact.  The comment in vdev_dtl_reassess() describes how we
+ * excise the DTLs.
+ */
+static boolean_t
+vdev_dtl_should_excise(vdev_t *vd, boolean_t rebuild_done)
+{
+	ASSERT0(vd->vdev_children);
+
+	if (vd->vdev_state < VDEV_STATE_DEGRADED)
+		return (B_FALSE);
+
+	if (vd->vdev_resilver_deferred)
+		return (B_FALSE);
+
+	if (range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]))
+		return (B_TRUE);
+
+	if (rebuild_done) {
+		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+		vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys;
+
+		/* Rebuild not initiated by attach */
+		if (vd->vdev_rebuild_txg == 0)
+			return (B_TRUE);
+
+		/*
+		 * When a rebuild completes without error then all missing data
+		 * up to the rebuild max txg has been reconstructed and the DTL
+		 * is eligible for excision.
+		 */
+		if (vrp->vrp_rebuild_state == VDEV_REBUILD_COMPLETE &&
+		    vdev_dtl_max(vd) <= vrp->vrp_max_txg) {
+			ASSERT3U(vrp->vrp_min_txg, <=, vdev_dtl_min(vd));
+			ASSERT3U(vrp->vrp_min_txg, <, vd->vdev_rebuild_txg);
+			ASSERT3U(vd->vdev_rebuild_txg, <=, vrp->vrp_max_txg);
+			return (B_TRUE);
+		}
+	} else {
+		dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan;
+		dsl_scan_phys_t *scnp __maybe_unused = &scn->scn_phys;
+
+		/* Resilver not initiated by attach */
+		if (vd->vdev_resilver_txg == 0)
+			return (B_TRUE);
+
+		/*
+		 * When a resilver is initiated the scan will assign the
+		 * scn_max_txg value to the highest txg value that exists
+		 * in all DTLs.  If this device's max DTL is not part of this
+		 * scan (i.e. it is not in the range
+		 * (scn_min_txg, scn_max_txg]), then it is not eligible for
+		 * excision.
+		 */
+		if (vdev_dtl_max(vd) <= scn->scn_phys.scn_max_txg) {
+			ASSERT3U(scnp->scn_min_txg, <=, vdev_dtl_min(vd));
+			ASSERT3U(scnp->scn_min_txg, <, vd->vdev_resilver_txg);
+			ASSERT3U(vd->vdev_resilver_txg, <=, scnp->scn_max_txg);
+			return (B_TRUE);
+		}
+	}
+
+	return (B_FALSE);
+}
+
+/*
+ * Reassess DTLs after a config change or scrub completion. If txg == 0 no
+ * write operations will be issued to the pool.
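+ * (The reassessment recurses depth-first: each leaf decides what to
+ * excise from its own DTLs, then every interior vdev rebuilds its
+ * maps from its children's, per the rules in the DTL comment above
+ * vdev_dtl_dirty().)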
+ */
+void
+vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg,
+    boolean_t scrub_done, boolean_t rebuild_done)
+{
+	spa_t *spa = vd->vdev_spa;
+	avl_tree_t reftree;
+	int minref;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		vdev_dtl_reassess(vd->vdev_child[c], txg,
+		    scrub_txg, scrub_done, rebuild_done);
+
+	if (vd == spa->spa_root_vdev || !vdev_is_concrete(vd) || vd->vdev_aux)
+		return;
+
+	if (vd->vdev_ops->vdev_op_leaf) {
+		dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan;
+		vdev_rebuild_t *vr = &vd->vdev_top->vdev_rebuild_config;
+		boolean_t check_excise = B_FALSE;
+		boolean_t wasempty = B_TRUE;
+
+		mutex_enter(&vd->vdev_dtl_lock);
+
+		/*
+		 * If requested, pretend the scan or rebuild completed cleanly.
+		 */
+		if (zfs_scan_ignore_errors) {
+			if (scn != NULL)
+				scn->scn_phys.scn_errors = 0;
+			if (vr != NULL)
+				vr->vr_rebuild_phys.vrp_errors = 0;
+		}
+
+		if (scrub_txg != 0 &&
+		    !range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) {
+			wasempty = B_FALSE;
+			zfs_dbgmsg("guid:%llu txg:%llu scrub:%llu started:%d "
+			    "dtl:%llu/%llu errors:%llu",
+			    (u_longlong_t)vd->vdev_guid, (u_longlong_t)txg,
+			    (u_longlong_t)scrub_txg, spa->spa_scrub_started,
+			    (u_longlong_t)vdev_dtl_min(vd),
+			    (u_longlong_t)vdev_dtl_max(vd),
+			    (u_longlong_t)(scn ? scn->scn_phys.scn_errors : 0));
+		}
+
+		/*
+		 * If we've completed a scrub/resilver or a rebuild cleanly
+		 * then determine if this vdev should remove any DTLs. We
+		 * only want to excise regions on vdevs that were available
+		 * during the entire duration of this scan.
+		 */
+		if (rebuild_done &&
+		    vr != NULL && vr->vr_rebuild_phys.vrp_errors == 0) {
+			check_excise = B_TRUE;
+		} else {
+			if (spa->spa_scrub_started ||
+			    (scn != NULL && scn->scn_phys.scn_errors == 0)) {
+				check_excise = B_TRUE;
+			}
+		}
+
+		if (scrub_txg && check_excise &&
+		    vdev_dtl_should_excise(vd, rebuild_done)) {
+			/*
+			 * We completed a scrub, resilver or rebuild up to
+			 * scrub_txg.  If we did it without rebooting, then
+			 * the scrub dtl will be valid, so excise the old
+			 * region and fold in the scrub dtl.  Otherwise,
+			 * leave the dtl as-is if there was an error.
+			 *
+			 * There's a little trick here: to excise the beginning
+			 * of the DTL_MISSING map, we put it into a reference
+			 * tree and then add a segment with refcnt -1 that
+			 * covers the range [0, scrub_txg).  This means
+			 * that each txg in that range has refcnt -1 or 0.
+			 * We then add DTL_SCRUB with a refcnt of 2, so that
+			 * entries in the range [0, scrub_txg) will have a
+			 * positive refcnt -- either 1 or 2.  We then convert
+			 * the reference tree into the new DTL_MISSING map.
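+			 *
+			 * Worked example (illustrative txgs): with
+			 * DTL_MISSING = [10,50), scrub_txg = 40 and
+			 * DTL_SCRUB = [20,30), the refcnts come out as
+			 * [10,20) = 1-1 = 0, [20,30) = 1-1+2 = 2,
+			 * [30,40) = 1-1 = 0 and [40,50) = 1.  Generating
+			 * with a minref of 1 therefore excises the
+			 * repaired ranges and keeps [20,30) (which the
+			 * scrub could not repair) and [40,50) (beyond
+			 * the scrub) as the new DTL_MISSING.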
+ */ + space_reftree_create(&reftree); + space_reftree_add_map(&reftree, + vd->vdev_dtl[DTL_MISSING], 1); + space_reftree_add_seg(&reftree, 0, scrub_txg, -1); + space_reftree_add_map(&reftree, + vd->vdev_dtl[DTL_SCRUB], 2); + space_reftree_generate_map(&reftree, + vd->vdev_dtl[DTL_MISSING], 1); + space_reftree_destroy(&reftree); + + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING])) { + zfs_dbgmsg("update DTL_MISSING:%llu/%llu", + (u_longlong_t)vdev_dtl_min(vd), + (u_longlong_t)vdev_dtl_max(vd)); + } else if (!wasempty) { + zfs_dbgmsg("DTL_MISSING is now empty"); + } + } + range_tree_vacate(vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + range_tree_walk(vd->vdev_dtl[DTL_MISSING], + range_tree_add, vd->vdev_dtl[DTL_PARTIAL]); + if (scrub_done) + range_tree_vacate(vd->vdev_dtl[DTL_SCRUB], NULL, NULL); + range_tree_vacate(vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + if (!vdev_readable(vd)) + range_tree_add(vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + else + range_tree_walk(vd->vdev_dtl[DTL_MISSING], + range_tree_add, vd->vdev_dtl[DTL_OUTAGE]); + + /* + * If the vdev was resilvering or rebuilding and no longer + * has any DTLs then reset the appropriate flag and dirty + * the top level so that we persist the change. + */ + if (txg != 0 && + range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + range_tree_is_empty(vd->vdev_dtl[DTL_OUTAGE])) { + if (vd->vdev_rebuild_txg != 0) { + vd->vdev_rebuild_txg = 0; + vdev_config_dirty(vd->vdev_top); + } else if (vd->vdev_resilver_txg != 0) { + vd->vdev_resilver_txg = 0; + vdev_config_dirty(vd->vdev_top); + } + } + + mutex_exit(&vd->vdev_dtl_lock); + + if (txg != 0) + vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); + return; + } + + mutex_enter(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) + continue; /* leaf vdevs only */ + if (t == DTL_PARTIAL) + minref = 1; /* i.e. non-zero */ + else if (vd->vdev_nparity != 0) + minref = vd->vdev_nparity + 1; /* RAID-Z */ + else + minref = vd->vdev_children; /* any kind of mirror */ + space_reftree_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_reftree_add_map(&reftree, cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_reftree_generate_map(&reftree, vd->vdev_dtl[t], minref); + space_reftree_destroy(&reftree); + } + mutex_exit(&vd->vdev_dtl_lock); +} + +int +vdev_dtl_load(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + int error = 0; + + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_dtl_object != 0) { + ASSERT(vdev_is_concrete(vd)); + + error = space_map_open(&vd->vdev_dtl_sm, mos, + vd->vdev_dtl_object, 0, -1ULL, 0); + if (error) + return (error); + ASSERT(vd->vdev_dtl_sm != NULL); + + mutex_enter(&vd->vdev_dtl_lock); + error = space_map_load(vd->vdev_dtl_sm, + vd->vdev_dtl[DTL_MISSING], SM_ALLOC); + mutex_exit(&vd->vdev_dtl_lock); + + return (error); + } + + for (int c = 0; c < vd->vdev_children; c++) { + error = vdev_dtl_load(vd->vdev_child[c]); + if (error != 0) + break; + } + + return (error); +} + +static void +vdev_zap_allocation_data(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + vdev_alloc_bias_t alloc_bias = vd->vdev_alloc_bias; + const char *string; + + ASSERT(alloc_bias != VDEV_BIAS_NONE); + + string = + (alloc_bias == VDEV_BIAS_LOG) ? 
VDEV_ALLOC_BIAS_LOG : + (alloc_bias == VDEV_BIAS_SPECIAL) ? VDEV_ALLOC_BIAS_SPECIAL : + (alloc_bias == VDEV_BIAS_DEDUP) ? VDEV_ALLOC_BIAS_DEDUP : NULL; + + ASSERT(string != NULL); + VERIFY0(zap_add(mos, vd->vdev_top_zap, VDEV_TOP_ZAP_ALLOCATION_BIAS, + 1, strlen(string) + 1, string, tx)); + + if (alloc_bias == VDEV_BIAS_SPECIAL || alloc_bias == VDEV_BIAS_DEDUP) { + spa_activate_allocation_classes(spa, tx); + } +} + +void +vdev_destroy_unlink_zap(vdev_t *vd, uint64_t zapobj, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + + VERIFY0(zap_destroy(spa->spa_meta_objset, zapobj, tx)); + VERIFY0(zap_remove_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, + zapobj, tx)); +} + +uint64_t +vdev_create_link_zap(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + uint64_t zap = zap_create(spa->spa_meta_objset, DMU_OTN_ZAP_METADATA, + DMU_OT_NONE, 0, tx); + + ASSERT(zap != 0); + VERIFY0(zap_add_int(spa->spa_meta_objset, spa->spa_all_vdev_zaps, + zap, tx)); + + return (zap); +} + +void +vdev_construct_zaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ops != &vdev_hole_ops && + vd->vdev_ops != &vdev_missing_ops && + vd->vdev_ops != &vdev_root_ops && + !vd->vdev_top->vdev_removing) { + if (vd->vdev_ops->vdev_op_leaf && vd->vdev_leaf_zap == 0) { + vd->vdev_leaf_zap = vdev_create_link_zap(vd, tx); + } + if (vd == vd->vdev_top && vd->vdev_top_zap == 0) { + vd->vdev_top_zap = vdev_create_link_zap(vd, tx); + if (vd->vdev_alloc_bias != VDEV_BIAS_NONE) + vdev_zap_allocation_data(vd, tx); + } + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_construct_zaps(vd->vdev_child[i], tx); + } +} + +static void +vdev_dtl_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + range_tree_t *rt = vd->vdev_dtl[DTL_MISSING]; + objset_t *mos = spa->spa_meta_objset; + range_tree_t *rtsync; + dmu_tx_t *tx; + uint64_t object = space_map_object(vd->vdev_dtl_sm); + + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + if (vd->vdev_detached || vd->vdev_top->vdev_removing) { + mutex_enter(&vd->vdev_dtl_lock); + space_map_free(vd->vdev_dtl_sm, tx); + space_map_close(vd->vdev_dtl_sm); + vd->vdev_dtl_sm = NULL; + mutex_exit(&vd->vdev_dtl_lock); + + /* + * We only destroy the leaf ZAP for detached leaves or for + * removed log devices. Removed data devices handle leaf ZAP + * cleanup later, once cancellation is no longer possible. + */ + if (vd->vdev_leaf_zap != 0 && (vd->vdev_detached || + vd->vdev_top->vdev_islog)) { + vdev_destroy_unlink_zap(vd, vd->vdev_leaf_zap, tx); + vd->vdev_leaf_zap = 0; + } + + dmu_tx_commit(tx); + return; + } + + if (vd->vdev_dtl_sm == NULL) { + uint64_t new_object; + + new_object = space_map_alloc(mos, zfs_vdev_dtl_sm_blksz, tx); + VERIFY3U(new_object, !=, 0); + + VERIFY0(space_map_open(&vd->vdev_dtl_sm, mos, new_object, + 0, -1ULL, 0)); + ASSERT(vd->vdev_dtl_sm != NULL); + } + + rtsync = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + + mutex_enter(&vd->vdev_dtl_lock); + range_tree_walk(rt, range_tree_add, rtsync); + mutex_exit(&vd->vdev_dtl_lock); + + space_map_truncate(vd->vdev_dtl_sm, zfs_vdev_dtl_sm_blksz, tx); + space_map_write(vd->vdev_dtl_sm, rtsync, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_vacate(rtsync, NULL, NULL); + + range_tree_destroy(rtsync); + + /* + * If the object for the space map has changed then dirty + * the top level so that we update the config. 
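+	 * (The DTL space map's object number is recorded in the on-disk
+	 * vdev config, so a freshly allocated object is only findable
+	 * again after the config has been rewritten with the new number.)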
+ */ + if (object != space_map_object(vd->vdev_dtl_sm)) { + vdev_dbgmsg(vd, "txg %llu, spa %s, DTL old object %llu, " + "new object %llu", (u_longlong_t)txg, spa_name(spa), + (u_longlong_t)object, + (u_longlong_t)space_map_object(vd->vdev_dtl_sm)); + vdev_config_dirty(vd->vdev_top); + } + + dmu_tx_commit(tx); +} + +/* + * Determine whether the specified vdev can be offlined/detached/removed + * without losing data. + */ +boolean_t +vdev_dtl_required(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint8_t cant_read = vd->vdev_cant_read; + boolean_t required; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == spa->spa_root_vdev || vd == tvd) + return (B_TRUE); + + /* + * Temporarily mark the device as unreadable, and then determine + * whether this results in any DTL outages in the top-level vdev. + * If not, we can safely offline/detach/remove the device. + */ + vd->vdev_cant_read = B_TRUE; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); + required = !vdev_dtl_empty(tvd, DTL_OUTAGE); + vd->vdev_cant_read = cant_read; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE, B_FALSE); + + if (!required && zio_injection_enabled) { + required = !!zio_handle_device_injection(vd, NULL, + SET_ERROR(ECHILD)); + } + + return (required); +} + +/* + * Determine if resilver is needed, and if so the txg range. + */ +boolean_t +vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) +{ + boolean_t needed = B_FALSE; + uint64_t thismin = UINT64_MAX; + uint64_t thismax = 0; + + if (vd->vdev_children == 0) { + mutex_enter(&vd->vdev_dtl_lock); + if (!range_tree_is_empty(vd->vdev_dtl[DTL_MISSING]) && + vdev_writeable(vd)) { + + thismin = vdev_dtl_min(vd); + thismax = vdev_dtl_max(vd); + needed = B_TRUE; + } + mutex_exit(&vd->vdev_dtl_lock); + } else { + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + uint64_t cmin, cmax; + + if (vdev_resilver_needed(cvd, &cmin, &cmax)) { + thismin = MIN(thismin, cmin); + thismax = MAX(thismax, cmax); + needed = B_TRUE; + } + } + } + + if (needed && minp) { + *minp = thismin; + *maxp = thismax; + } + return (needed); +} + +/* + * Gets the checkpoint space map object from the vdev's ZAP. On success sm_obj + * will contain either the checkpoint spacemap object or zero if none exists. + * All other errors are returned to the caller. + */ +int +vdev_checkpoint_sm_object(vdev_t *vd, uint64_t *sm_obj) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_top_zap == 0) { + *sm_obj = 0; + return (0); + } + + int error = zap_lookup(spa_meta_objset(vd->vdev_spa), vd->vdev_top_zap, + VDEV_TOP_ZAP_POOL_CHECKPOINT_SM, sizeof (uint64_t), 1, sm_obj); + if (error == ENOENT) { + *sm_obj = 0; + error = 0; + } + + return (error); +} + +int +vdev_load(vdev_t *vd) +{ + int error = 0; + + /* + * Recursively load all children. 
+	 */
+	for (int c = 0; c < vd->vdev_children; c++) {
+		error = vdev_load(vd->vdev_child[c]);
+		if (error != 0) {
+			return (error);
+		}
+	}
+
+	vdev_set_deflate_ratio(vd);
+
+	/*
+	 * On the spa_load path, grab the allocation bias from our ZAP.
+	 */
+	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+		spa_t *spa = vd->vdev_spa;
+		char bias_str[64];
+
+		error = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap,
+		    VDEV_TOP_ZAP_ALLOCATION_BIAS, 1, sizeof (bias_str),
+		    bias_str);
+		if (error == 0) {
+			ASSERT(vd->vdev_alloc_bias == VDEV_BIAS_NONE);
+			vd->vdev_alloc_bias = vdev_derive_alloc_bias(bias_str);
+		} else if (error != ENOENT) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			vdev_dbgmsg(vd, "vdev_load: zap_lookup(top_zap=%llu) "
+			    "failed [error=%d]",
+			    (u_longlong_t)vd->vdev_top_zap, error);
+			return (error);
+		}
+	}
+
+	/*
+	 * Load any rebuild state from the top-level vdev zap.
+	 */
+	if (vd == vd->vdev_top && vd->vdev_top_zap != 0) {
+		error = vdev_rebuild_load(vd);
+		if (error && error != ENOTSUP) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			vdev_dbgmsg(vd, "vdev_load: vdev_rebuild_load "
+			    "failed [error=%d]", error);
+			return (error);
+		}
+	}
+
+	/*
+	 * If this is a top-level vdev, initialize its metaslabs.
+	 */
+	if (vd == vd->vdev_top && vdev_is_concrete(vd)) {
+		vdev_metaslab_group_create(vd);
+
+		if (vd->vdev_ashift == 0 || vd->vdev_asize == 0) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			vdev_dbgmsg(vd, "vdev_load: invalid size. ashift=%llu, "
+			    "asize=%llu", (u_longlong_t)vd->vdev_ashift,
+			    (u_longlong_t)vd->vdev_asize);
+			return (SET_ERROR(ENXIO));
+		}
+
+		error = vdev_metaslab_init(vd, 0);
+		if (error != 0) {
+			vdev_dbgmsg(vd, "vdev_load: metaslab_init failed "
+			    "[error=%d]", error);
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			return (error);
+		}
+
+		uint64_t checkpoint_sm_obj;
+		error = vdev_checkpoint_sm_object(vd, &checkpoint_sm_obj);
+		if (error == 0 && checkpoint_sm_obj != 0) {
+			objset_t *mos = spa_meta_objset(vd->vdev_spa);
+			ASSERT(vd->vdev_asize != 0);
+			ASSERT3P(vd->vdev_checkpoint_sm, ==, NULL);
+
+			error = space_map_open(&vd->vdev_checkpoint_sm,
+			    mos, checkpoint_sm_obj, 0, vd->vdev_asize,
+			    vd->vdev_ashift);
+			if (error != 0) {
+				vdev_dbgmsg(vd, "vdev_load: space_map_open "
+				    "failed for checkpoint spacemap (obj %llu) "
+				    "[error=%d]",
+				    (u_longlong_t)checkpoint_sm_obj, error);
+				return (error);
+			}
+			ASSERT3P(vd->vdev_checkpoint_sm, !=, NULL);
+
+			/*
+			 * Since the checkpoint_sm contains free entries
+			 * exclusively we can use space_map_allocated() to
+			 * indicate the cumulative checkpointed space that
+			 * has been freed.
+			 */
+			vd->vdev_stat.vs_checkpoint_space =
+			    -space_map_allocated(vd->vdev_checkpoint_sm);
+			vd->vdev_spa->spa_checkpoint_info.sci_dspace +=
+			    vd->vdev_stat.vs_checkpoint_space;
+		} else if (error != 0) {
+			vdev_dbgmsg(vd, "vdev_load: failed to retrieve "
+			    "checkpoint space map object from vdev ZAP "
+			    "[error=%d]", error);
+			return (error);
+		}
+	}
+
+	/*
+	 * If this is a leaf vdev, load its DTL.
+	 */
+	if (vd->vdev_ops->vdev_op_leaf && (error = vdev_dtl_load(vd)) != 0) {
+		vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		vdev_dbgmsg(vd, "vdev_load: vdev_dtl_load failed "
+		    "[error=%d]", error);
+		return (error);
+	}
+
+	uint64_t obsolete_sm_object;
+	error = vdev_obsolete_sm_object(vd, &obsolete_sm_object);
+	if (error == 0 && obsolete_sm_object != 0) {
+		objset_t *mos = vd->vdev_spa->spa_meta_objset;
+		ASSERT(vd->vdev_asize != 0);
+		ASSERT3P(vd->vdev_obsolete_sm, ==, NULL);
+
+		if ((error = space_map_open(&vd->vdev_obsolete_sm, mos,
+		    obsolete_sm_object, 0, vd->vdev_asize, 0))) {
+			vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN,
+			    VDEV_AUX_CORRUPT_DATA);
+			vdev_dbgmsg(vd, "vdev_load: space_map_open failed for "
+			    "obsolete spacemap (obj %llu) [error=%d]",
+			    (u_longlong_t)obsolete_sm_object, error);
+			return (error);
+		}
+	} else if (error != 0) {
+		vdev_dbgmsg(vd, "vdev_load: failed to retrieve obsolete "
+		    "space map object from vdev ZAP [error=%d]", error);
+		return (error);
+	}
+
+	return (0);
+}
+
+/*
+ * The special vdev case is used for hot spares and l2cache devices.  Its
+ * sole purpose is to set the vdev state for the associated vdev.  To do this,
+ * we make sure that we can open the underlying device, then try to read the
+ * label, and make sure that the label is sane and that it hasn't been
+ * repurposed to another pool.
+ */
+int
+vdev_validate_aux(vdev_t *vd)
+{
+	nvlist_t *label;
+	uint64_t guid, version;
+	uint64_t state;
+
+	if (!vdev_readable(vd))
+		return (0);
+
+	if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) {
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		return (-1);
+	}
+
+	if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_VERSION, &version) != 0 ||
+	    !SPA_VERSION_IS_SUPPORTED(version) ||
+	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 ||
+	    guid != vd->vdev_guid ||
+	    nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state) != 0) {
+		vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN,
+		    VDEV_AUX_CORRUPT_DATA);
+		nvlist_free(label);
+		return (-1);
+	}
+
+	/*
+	 * We don't actually check the pool state here.  If it's in fact in
+	 * use by another pool, we update this fact on the fly when requested.
+	 */
+	nvlist_free(label);
+	return (0);
+}
+
+static void
+vdev_destroy_ms_flush_data(vdev_t *vd, dmu_tx_t *tx)
+{
+	objset_t *mos = spa_meta_objset(vd->vdev_spa);
+
+	if (vd->vdev_top_zap == 0)
+		return;
+
+	uint64_t object = 0;
+	int err = zap_lookup(mos, vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, sizeof (uint64_t), 1, &object);
+	if (err == ENOENT)
+		return;
+	VERIFY0(err);
+
+	VERIFY0(dmu_object_free(mos, object, tx));
+	VERIFY0(zap_remove(mos, vd->vdev_top_zap,
+	    VDEV_TOP_ZAP_MS_UNFLUSHED_PHYS_TXGS, tx));
+}
+
+/*
+ * Free the objects used to store this vdev's spacemaps, and the array
+ * that points to them.
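+ * (vdev_ms_array is a DMU object array with one space map object
+ * number per metaslab; an entry may still be zero if its metaslab
+ * never synced an allocation, which is why the loop below skips
+ * zero entries.)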
+ */ +void +vdev_destroy_spacemaps(vdev_t *vd, dmu_tx_t *tx) +{ + if (vd->vdev_ms_array == 0) + return; + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + uint64_t array_count = vd->vdev_asize >> vd->vdev_ms_shift; + size_t array_bytes = array_count * sizeof (uint64_t); + uint64_t *smobj_array = kmem_alloc(array_bytes, KM_SLEEP); + VERIFY0(dmu_read(mos, vd->vdev_ms_array, 0, + array_bytes, smobj_array, 0)); + + for (uint64_t i = 0; i < array_count; i++) { + uint64_t smobj = smobj_array[i]; + if (smobj == 0) + continue; + + space_map_free_obj(mos, smobj, tx); + } + + kmem_free(smobj_array, array_bytes); + VERIFY0(dmu_object_free(mos, vd->vdev_ms_array, tx)); + vdev_destroy_ms_flush_data(vd, tx); + vd->vdev_ms_array = 0; +} + +static void +vdev_remove_empty_log(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_islog); + ASSERT(vd == vd->vdev_top); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + + dmu_tx_t *tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + vdev_destroy_spacemaps(vd, tx); + if (vd->vdev_top_zap != 0) { + vdev_destroy_unlink_zap(vd, vd->vdev_top_zap, tx); + vd->vdev_top_zap = 0; + } + + dmu_tx_commit(tx); +} + +void +vdev_sync_done(vdev_t *vd, uint64_t txg) +{ + metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + + ASSERT(vdev_is_concrete(vd)); + + while ((msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) + != NULL) + metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); +} + +void +vdev_sync(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *lvd; + metaslab_t *msp; + + ASSERT3U(txg, ==, spa->spa_syncing_txg); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + if (range_tree_space(vd->vdev_obsolete_segments) > 0) { + ASSERT(vd->vdev_removing || + vd->vdev_ops == &vdev_indirect_ops); + + vdev_indirect_sync_obsolete(vd, tx); + + /* + * If the vdev is indirect, it can't have dirty + * metaslabs or DTLs. + */ + if (vd->vdev_ops == &vdev_indirect_ops) { + ASSERT(txg_list_empty(&vd->vdev_ms_list, txg)); + ASSERT(txg_list_empty(&vd->vdev_dtl_list, txg)); + dmu_tx_commit(tx); + return; + } + } + + ASSERT(vdev_is_concrete(vd)); + + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0 && + !vd->vdev_removing) { + ASSERT(vd == vd->vdev_top); + ASSERT0(vd->vdev_indirect_config.vic_mapping_object); + vd->vdev_ms_array = dmu_object_alloc(spa->spa_meta_objset, + DMU_OT_OBJECT_ARRAY, 0, DMU_OT_NONE, 0, tx); + ASSERT(vd->vdev_ms_array != 0); + vdev_config_dirty(vd); + } + + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { + metaslab_sync(msp, txg); + (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); + } + + while ((lvd = txg_list_remove(&vd->vdev_dtl_list, txg)) != NULL) + vdev_dtl_sync(lvd, txg); + + /* + * If this is an empty log device being removed, destroy the + * metadata associated with it. + */ + if (vd->vdev_islog && vd->vdev_stat.vs_alloc == 0 && vd->vdev_removing) + vdev_remove_empty_log(vd, txg); + + (void) txg_list_add(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg)); + dmu_tx_commit(tx); +} + +uint64_t +vdev_psize_to_asize(vdev_t *vd, uint64_t psize) +{ + return (vd->vdev_ops->vdev_op_asize(vd, psize)); +} + +/* + * Mark the given vdev faulted. A faulted vdev behaves as if the device could + * not be opened, and no I/O is attempted. 
+ */ +int +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) +{ + vdev_t *vd, *tvd; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + + tvd = vd->vdev_top; + + /* + * If user did a 'zpool offline -f' then make the fault persist across + * reboots. + */ + if (aux == VDEV_AUX_EXTERNAL_PERSIST) { + /* + * There are two kinds of forced faults: temporary and + * persistent. Temporary faults go away at pool import, while + * persistent faults stay set. Both types of faults can be + * cleared with a zpool clear. + * + * We tell if a vdev is persistently faulted by looking at the + * ZPOOL_CONFIG_AUX_STATE nvpair. If it's set to "external" at + * import then it's a persistent fault. Otherwise, it's + * temporary. We get ZPOOL_CONFIG_AUX_STATE set to "external" + * by setting vd.vdev_stat.vs_aux to VDEV_AUX_EXTERNAL. This + * tells vdev_config_generate() (which gets run later) to set + * ZPOOL_CONFIG_AUX_STATE to "external" in the nvlist. + */ + vd->vdev_stat.vs_aux = VDEV_AUX_EXTERNAL; + vd->vdev_tmpoffline = B_FALSE; + aux = VDEV_AUX_EXTERNAL; + } else { + vd->vdev_tmpoffline = B_TRUE; + } + + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + + /* + * Faulted state takes precedence over degraded. + */ + vd->vdev_delayed_close = B_FALSE; + vd->vdev_faulted = 1ULL; + vd->vdev_degraded = 0ULL; + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); + + /* + * If this device has the only valid copy of the data, then + * back off and simply mark the vdev as degraded instead. + */ + if (!tvd->vdev_islog && vd->vdev_aux == NULL && vdev_dtl_required(vd)) { + vd->vdev_degraded = 1ULL; + vd->vdev_faulted = 0ULL; + + /* + * If we reopen the device and it's not dead, only then do we + * mark it degraded. + */ + vdev_reopen(tvd); + + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); + } + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Mark the given vdev degraded. A degraded vdev is purely an indication to the + * user that something is wrong. The vdev continues to operate as normal as far + * as I/O is concerned. + */ +int +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) +{ + vdev_t *vd; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + + /* + * If the vdev is already faulted, then don't do anything. + */ + if (vd->vdev_faulted || vd->vdev_degraded) + return (spa_vdev_state_exit(spa, NULL, 0)); + + vd->vdev_degraded = 1ULL; + if (!vdev_is_dead(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, + aux); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +/* + * Online the given vdev. + * + * If 'ZFS_ONLINE_UNSPARE' is set, it implies two things. First, any attached + * spare device should be detached when the device finishes resilvering. + * Second, the online should be treated like a 'test' online case, so no FMA + * events are generated if the device fails to open. 
+ */ +int +vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) +{ + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; + boolean_t wasoffline; + vdev_state_t oldstate; + + spa_vdev_state_enter(spa, SCL_NONE); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + + wasoffline = (vd->vdev_offline || vd->vdev_tmpoffline); + oldstate = vd->vdev_state; + + tvd = vd->vdev_top; + vd->vdev_offline = B_FALSE; + vd->vdev_tmpoffline = B_FALSE; + vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); + vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!((flags & ZFS_ONLINE_EXPAND) || + spa->spa_autoexpand); + vd->vdev_expansion_time = gethrestime_sec(); + } + + vdev_reopen(tvd); + vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + + if (newstate) + *newstate = vd->vdev_state; + if ((flags & ZFS_ONLINE_UNSPARE) && + !vdev_is_dead(vd) && vd->vdev_parent && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; + + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { + + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + + /* Restart initializing if necessary */ + mutex_enter(&vd->vdev_initialize_lock); + if (vdev_writeable(vd) && + vd->vdev_initialize_thread == NULL && + vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE) { + (void) vdev_initialize(vd); + } + mutex_exit(&vd->vdev_initialize_lock); + + /* + * Restart trimming if necessary. We do not restart trimming for cache + * devices here. This is triggered by l2arc_rebuild_vdev() + * asynchronously for the whole device or in l2arc_evict() as it evicts + * space for upcoming writes. + */ + mutex_enter(&vd->vdev_trim_lock); + if (vdev_writeable(vd) && !vd->vdev_isl2cache && + vd->vdev_trim_thread == NULL && + vd->vdev_trim_state == VDEV_TRIM_ACTIVE) { + (void) vdev_trim(vd, vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } + mutex_exit(&vd->vdev_trim_lock); + + if (wasoffline || + (oldstate < VDEV_STATE_DEGRADED && + vd->vdev_state >= VDEV_STATE_DEGRADED)) + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_ONLINE); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) +{ + vdev_t *vd, *tvd; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; + +top: + spa_vdev_state_enter(spa, SCL_ALLOC); + + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENODEV))); + + if (!vd->vdev_ops->vdev_op_leaf) + return (spa_vdev_state_exit(spa, NULL, SET_ERROR(ENOTSUP))); + + tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; + + /* + * If the device isn't already offline, try to offline it. + */ + if (!vd->vdev_offline) { + /* + * If this device has the only valid copy of some data, + * don't allow it to be offlined. Log devices are always + * expendable. 
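+		 * (Log devices are expendable because their contents can
+		 * be evacuated: spa_reset_logs() below makes sure nothing
+		 * on the log is still needed before the device goes away.)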
+ */ + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) + return (spa_vdev_state_exit(spa, NULL, + SET_ERROR(EBUSY))); + + /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_reset_logs(spa); + + /* + * If the log device was successfully reset but has + * checkpointed data, do not offline it. + */ + if (error == 0 && + tvd->vdev_checkpoint_sm != NULL) { + ASSERT3U(space_map_allocated( + tvd->vdev_checkpoint_sm), !=, 0); + error = ZFS_ERR_CHECKPOINT_EXISTS; + } + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT0(tvd->vdev_stat.vs_alloc); + } + + /* + * Offline this device and reopen its top-level vdev. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. + */ + vd->vdev_offline = B_TRUE; + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { + vd->vdev_offline = B_FALSE; + vdev_reopen(tvd); + return (spa_vdev_state_exit(spa, NULL, + SET_ERROR(EBUSY))); + } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. + */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); + } + + vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); + + return (spa_vdev_state_exit(spa, vd, 0)); +} + +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; + + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + +/* + * Clear the error counts associated with this vdev. Unlike vdev_online() and + * vdev_offline(), we assume the spa config is locked. We also clear all + * children. If 'vd' is NULL, then the user wants to clear all vdevs. + */ +void +vdev_clear(spa_t *spa, vdev_t *vd) +{ + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == NULL) + vd = rvd; + + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + vd->vdev_stat.vs_slow_ios = 0; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_clear(spa, vd->vdev_child[c]); + + /* + * It makes no sense to "clear" an indirect vdev. + */ + if (!vdev_is_concrete(vd)) + return; + + /* + * If we're in the FAULTED state or have experienced failed I/O, then + * clear the persistent state and attempt to reopen the device. We + * also mark the vdev config dirty, so that the new faulted state is + * written out to disk. + */ + if (vd->vdev_faulted || vd->vdev_degraded || + !vdev_readable(vd) || !vdev_writeable(vd)) { + /* + * When reopening in response to a clear event, it may be due to + * a fmadm repair request. In this case, if the device is + * still broken, we want to still post the ereport again. 
+ */ + vd->vdev_forcefault = B_TRUE; + + vd->vdev_faulted = vd->vdev_degraded = 0ULL; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_stat.vs_aux = 0; + + vdev_reopen(vd == rvd ? rvd : vd->vdev_top); + + vd->vdev_forcefault = B_FALSE; + + if (vd != rvd && vdev_writeable(vd->vdev_top)) + vdev_state_dirty(vd->vdev_top); + + /* If a resilver isn't required, check if vdevs can be culled */ + if (vd->vdev_aux == NULL && !vdev_is_dead(vd) && + !dsl_scan_resilvering(spa->spa_dsl_pool) && + !dsl_scan_resilver_scheduled(spa->spa_dsl_pool)) + spa_async_request(spa, SPA_ASYNC_RESILVER_DONE); + + spa_event_notify(spa, vd, NULL, ESC_ZFS_VDEV_CLEAR); + } + + /* + * When clearing a FMA-diagnosed fault, we always want to + * unspare the device, as we assume that the original spare was + * done in response to the FMA fault. + */ + if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; +} + +boolean_t +vdev_is_dead(vdev_t *vd) +{ + /* + * Holes and missing devices are always considered "dead". + * This simplifies the code since we don't have to check for + * these types of devices in the various code paths. + * Instead we rely on the fact that we skip over dead devices + * before issuing I/O to them. + */ + return (vd->vdev_state < VDEV_STATE_DEGRADED || + vd->vdev_ops == &vdev_hole_ops || + vd->vdev_ops == &vdev_missing_ops); +} + +boolean_t +vdev_readable(vdev_t *vd) +{ + return (!vdev_is_dead(vd) && !vd->vdev_cant_read); +} + +boolean_t +vdev_writeable(vdev_t *vd) +{ + return (!vdev_is_dead(vd) && !vd->vdev_cant_write && + vdev_is_concrete(vd)); +} + +boolean_t +vdev_allocatable(vdev_t *vd) +{ + uint64_t state = vd->vdev_state; + + /* + * We currently allow allocations from vdevs which may be in the + * process of reopening (i.e. VDEV_STATE_CLOSED). If the device + * fails to reopen then we'll catch it later when we're holding + * the proper locks. Note that we have to get the vdev state + * in a local variable because although it changes atomically, + * we're asking two separate questions about it. 
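[Editor's sketch] The predicates above layer: a vdev must first not be "dead" before the per-direction cant_read/cant_write bits are even consulted. A standalone sketch of that layering with simplified fields (the struct and threshold are illustrative; the real vdev_writeable() additionally requires the vdev to be concrete, which is omitted here):

#include <stdbool.h>
#include <stdio.h>

#define	ST_DEGRADED	6	/* illustrative state ordering only */

typedef struct {
	int  state;		/* simplified vdev_state ordering */
	bool cant_read;
	bool cant_write;
} vdev_sketch_t;

static bool is_dead(const vdev_sketch_t *d)   { return (d->state < ST_DEGRADED); }
static bool readable(const vdev_sketch_t *d)  { return (!is_dead(d) && !d->cant_read); }
static bool writeable(const vdev_sketch_t *d) { return (!is_dead(d) && !d->cant_write); }

int
main(void)
{
	/* Healthy state but failed write path: readable, not writeable. */
	vdev_sketch_t d = { .state = 7, .cant_read = false, .cant_write = true };

	printf("readable=%d writeable=%d\n", readable(&d), writeable(&d));
	return (0);
}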
+ */ + return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && + !vd->vdev_cant_write && vdev_is_concrete(vd) && + vd->vdev_mg->mg_initialized); +} + +boolean_t +vdev_accessible(vdev_t *vd, zio_t *zio) +{ + ASSERT(zio->io_vd == vd); + + if (vdev_is_dead(vd) || vd->vdev_remove_wanted) + return (B_FALSE); + + if (zio->io_type == ZIO_TYPE_READ) + return (!vd->vdev_cant_read); + + if (zio->io_type == ZIO_TYPE_WRITE) + return (!vd->vdev_cant_write); + + return (B_TRUE); +} + +static void +vdev_get_child_stat(vdev_t *cvd, vdev_stat_t *vs, vdev_stat_t *cvs) +{ + for (int t = 0; t < VS_ZIO_TYPES; t++) { + vs->vs_ops[t] += cvs->vs_ops[t]; + vs->vs_bytes[t] += cvs->vs_bytes[t]; + } + + cvs->vs_scan_removing = cvd->vdev_removing; +} + +/* + * Get extended stats + */ +static void +vdev_get_child_stat_ex(vdev_t *cvd, vdev_stat_ex_t *vsx, vdev_stat_ex_t *cvsx) +{ + int t, b; + for (t = 0; t < ZIO_TYPES; t++) { + for (b = 0; b < ARRAY_SIZE(vsx->vsx_disk_histo[0]); b++) + vsx->vsx_disk_histo[t][b] += cvsx->vsx_disk_histo[t][b]; + + for (b = 0; b < ARRAY_SIZE(vsx->vsx_total_histo[0]); b++) { + vsx->vsx_total_histo[t][b] += + cvsx->vsx_total_histo[t][b]; + } + } + + for (t = 0; t < ZIO_PRIORITY_NUM_QUEUEABLE; t++) { + for (b = 0; b < ARRAY_SIZE(vsx->vsx_queue_histo[0]); b++) { + vsx->vsx_queue_histo[t][b] += + cvsx->vsx_queue_histo[t][b]; + } + vsx->vsx_active_queue[t] += cvsx->vsx_active_queue[t]; + vsx->vsx_pend_queue[t] += cvsx->vsx_pend_queue[t]; + + for (b = 0; b < ARRAY_SIZE(vsx->vsx_ind_histo[0]); b++) + vsx->vsx_ind_histo[t][b] += cvsx->vsx_ind_histo[t][b]; + + for (b = 0; b < ARRAY_SIZE(vsx->vsx_agg_histo[0]); b++) + vsx->vsx_agg_histo[t][b] += cvsx->vsx_agg_histo[t][b]; + } + +} + +boolean_t +vdev_is_spacemap_addressable(vdev_t *vd) +{ + if (spa_feature_is_active(vd->vdev_spa, SPA_FEATURE_SPACEMAP_V2)) + return (B_TRUE); + + /* + * If double-word space map entries are not enabled we assume + * 47 bits of the space map entry are dedicated to the entry's + * offset (see SM_OFFSET_BITS in space_map.h). We then use that + * to calculate the maximum address that can be described by a + * space map entry for the given device. + */ + uint64_t shift = vd->vdev_ashift + SM_OFFSET_BITS; + + if (shift >= 63) /* detect potential overflow */ + return (B_TRUE); + + return (vd->vdev_asize < (1ULL << shift)); +} + +/* + * Get statistics for the given vdev. + */ +static void +vdev_get_stats_ex_impl(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) +{ + int t; + /* + * If we're getting stats on the root vdev, aggregate the I/O counts + * over all top-level vdevs (i.e. the direct children of the root). + */ + if (!vd->vdev_ops->vdev_op_leaf) { + if (vs) { + memset(vs->vs_ops, 0, sizeof (vs->vs_ops)); + memset(vs->vs_bytes, 0, sizeof (vs->vs_bytes)); + } + if (vsx) + memset(vsx, 0, sizeof (*vsx)); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + vdev_stat_t *cvs = &cvd->vdev_stat; + vdev_stat_ex_t *cvsx = &cvd->vdev_stat_ex; + + vdev_get_stats_ex_impl(cvd, cvs, cvsx); + if (vs) + vdev_get_child_stat(cvd, vs, cvs); + if (vsx) + vdev_get_child_stat_ex(cvd, vsx, cvsx); + + } + } else { + /* + * We're a leaf. Just copy our ZIO active queue stats in. The + * other leaf stats are updated in vdev_stat_update(). 
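[Editor's sketch] vdev_is_spacemap_addressable() above caps single-word space map entries at vdev_ashift + SM_OFFSET_BITS bits of offset. A standalone arithmetic check of the same computation (SM_OFFSET_BITS is 47 per the comment above; the ashift and asize values in main() are just examples):

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define	SM_OFFSET_BITS	47	/* single-word entry offset bits */

static bool
spacemap_addressable(uint64_t ashift, uint64_t asize)
{
	uint64_t shift = ashift + SM_OFFSET_BITS;

	if (shift >= 63)	/* 1ULL << shift would overflow; always fits */
		return (true);
	return (asize < (1ULL << shift));
}

int
main(void)
{
	/* ashift=9 (512B sectors): max addressable is 2^56 bytes = 64 PiB. */
	printf("%d\n", spacemap_addressable(9, 1ULL << 40));	/* 1 TiB: yes */
	printf("%d\n", spacemap_addressable(9, 1ULL << 57));	/* too big: no */
	return (0);
}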
+ */ + if (!vsx) + return; + + memcpy(vsx, &vd->vdev_stat_ex, sizeof (vd->vdev_stat_ex)); + + for (t = 0; t < ARRAY_SIZE(vd->vdev_queue.vq_class); t++) { + vsx->vsx_active_queue[t] = + vd->vdev_queue.vq_class[t].vqc_active; + vsx->vsx_pend_queue[t] = avl_numnodes( + &vd->vdev_queue.vq_class[t].vqc_queued_tree); + } + } +} + +void +vdev_get_stats_ex(vdev_t *vd, vdev_stat_t *vs, vdev_stat_ex_t *vsx) +{ + vdev_t *tvd = vd->vdev_top; + mutex_enter(&vd->vdev_stat_lock); + if (vs) { + bcopy(&vd->vdev_stat, vs, sizeof (*vs)); + vs->vs_timestamp = gethrtime() - vs->vs_timestamp; + vs->vs_state = vd->vdev_state; + vs->vs_rsize = vdev_get_min_asize(vd); + + if (vd->vdev_ops->vdev_op_leaf) { + vs->vs_rsize += VDEV_LABEL_START_SIZE + + VDEV_LABEL_END_SIZE; + /* + * Report initializing progress. Since we don't + * have the initializing locks held, this is only + * an estimate (although a fairly accurate one). + */ + vs->vs_initialize_bytes_done = + vd->vdev_initialize_bytes_done; + vs->vs_initialize_bytes_est = + vd->vdev_initialize_bytes_est; + vs->vs_initialize_state = vd->vdev_initialize_state; + vs->vs_initialize_action_time = + vd->vdev_initialize_action_time; + + /* + * Report manual TRIM progress. Since we don't have + * the manual TRIM locks held, this is only an + * estimate (although a fairly accurate one). + */ + vs->vs_trim_notsup = !vd->vdev_has_trim; + vs->vs_trim_bytes_done = vd->vdev_trim_bytes_done; + vs->vs_trim_bytes_est = vd->vdev_trim_bytes_est; + vs->vs_trim_state = vd->vdev_trim_state; + vs->vs_trim_action_time = vd->vdev_trim_action_time; + + /* Set when there is a deferred resilver. */ + vs->vs_resilver_deferred = vd->vdev_resilver_deferred; + } + + /* + * Report expandable space on top-level, non-auxiliary devices + * only. The expandable space is reported in terms of metaslab + * sized units since that determines how much space the pool + * can expand. + */ + if (vd->vdev_aux == NULL && tvd != NULL) { + vs->vs_esize = P2ALIGN( + vd->vdev_max_asize - vd->vdev_asize, + 1ULL << tvd->vdev_ms_shift); + } + + vs->vs_configured_ashift = vd->vdev_top != NULL + ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; + vs->vs_logical_ashift = vd->vdev_logical_ashift; + vs->vs_physical_ashift = vd->vdev_physical_ashift; + + /* + * Report fragmentation and rebuild progress for top-level, + * non-auxiliary, concrete devices. + */ + if (vd->vdev_aux == NULL && vd == vd->vdev_top && + vdev_is_concrete(vd)) { + vs->vs_fragmentation = (vd->vdev_mg != NULL) ? + vd->vdev_mg->mg_fragmentation : 0; + } + } + + vdev_get_stats_ex_impl(vd, vs, vsx); + mutex_exit(&vd->vdev_stat_lock); +} + +void +vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) +{ + return (vdev_get_stats_ex(vd, vs, NULL)); +} + +void +vdev_clear_stats(vdev_t *vd) +{ + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_space = 0; + vd->vdev_stat.vs_dspace = 0; + vd->vdev_stat.vs_alloc = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void +vdev_scan_stat_init(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (int c = 0; c < vd->vdev_children; c++) + vdev_scan_stat_init(vd->vdev_child[c]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_scan_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +void +vdev_stat_update(zio_t *zio, uint64_t psize) +{ + spa_t *spa = zio->io_spa; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *vd = zio->io_vd ? 
zio->io_vd : rvd; + vdev_t *pvd; + uint64_t txg = zio->io_txg; + vdev_stat_t *vs = &vd->vdev_stat; + vdev_stat_ex_t *vsx = &vd->vdev_stat_ex; + zio_type_t type = zio->io_type; + int flags = zio->io_flags; + + /* + * If this i/o is a gang leader, it didn't do any actual work. + */ + if (zio->io_gang_tree) + return; + + if (zio->io_error == 0) { + /* + * If this is a root i/o, don't count it -- we've already + * counted the top-level vdevs, and vdev_get_stats() will + * aggregate them when asked. This reduces contention on + * the root vdev_stat_lock and implicitly handles blocks + * that compress away to holes, for which there is no i/o. + * (Holes never create vdev children, so all the counters + * remain zero, which is what we want.) + * + * Note: this only applies to successful i/o (io_error == 0) + * because unlike i/o counts, errors are not additive. + * When reading a ditto block, for example, failure of + * one top-level vdev does not imply a root-level error. + */ + if (vd == rvd) + return; + + ASSERT(vd == zio->io_vd); + + if (flags & ZIO_FLAG_IO_BYPASS) + return; + + mutex_enter(&vd->vdev_stat_lock); + + if (flags & ZIO_FLAG_IO_REPAIR) { + /* + * Repair is the result of a resilver issued by the + * scan thread (spa_sync). + */ + if (flags & ZIO_FLAG_SCAN_THREAD) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dsl_scan_phys_t *scn_phys = &scn->scn_phys; + uint64_t *processed = &scn_phys->scn_processed; + + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(processed, psize); + vs->vs_scan_processed += psize; + } + + /* + * Repair is the result of a rebuild issued by the + * rebuild thread (vdev_rebuild_thread). + */ + if (zio->io_priority == ZIO_PRIORITY_REBUILD) { + vdev_t *tvd = vd->vdev_top; + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t *rebuilt = &vrp->vrp_bytes_rebuilt; + + if (vd->vdev_ops->vdev_op_leaf) + atomic_add_64(rebuilt, psize); + vs->vs_rebuild_processed += psize; + } + + if (flags & ZIO_FLAG_SELF_HEAL) + vs->vs_self_healed += psize; + } + + /* + * The bytes/ops/histograms are recorded at the leaf level and + * aggregated into the higher level vdevs in vdev_get_stats(). + */ + if (vd->vdev_ops->vdev_op_leaf && + (zio->io_priority < ZIO_PRIORITY_NUM_QUEUEABLE)) { + zio_type_t vs_type = type; + zio_priority_t priority = zio->io_priority; + + /* + * TRIM ops and bytes are reported to user space as + * ZIO_TYPE_IOCTL. This is done to preserve the + * vdev_stat_t structure layout for user space. + */ + if (type == ZIO_TYPE_TRIM) + vs_type = ZIO_TYPE_IOCTL; + + /* + * Solely for the purposes of 'zpool iostat -lqrw' + * reporting, use the priority to categorize the IO. + * Only the following are reported to user space: + * + * ZIO_PRIORITY_SYNC_READ, + * ZIO_PRIORITY_SYNC_WRITE, + * ZIO_PRIORITY_ASYNC_READ, + * ZIO_PRIORITY_ASYNC_WRITE, + * ZIO_PRIORITY_SCRUB, + * ZIO_PRIORITY_TRIM. + */ + if (priority == ZIO_PRIORITY_REBUILD) { + priority = ((type == ZIO_TYPE_WRITE) ? + ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_SCRUB); + } else if (priority == ZIO_PRIORITY_INITIALIZING) { + ASSERT3U(type, ==, ZIO_TYPE_WRITE); + priority = ZIO_PRIORITY_ASYNC_WRITE; + } else if (priority == ZIO_PRIORITY_REMOVAL) { + priority = ((type == ZIO_TYPE_WRITE) ? 
+ ZIO_PRIORITY_ASYNC_WRITE : + ZIO_PRIORITY_ASYNC_READ); + } + + vs->vs_ops[vs_type]++; + vs->vs_bytes[vs_type] += psize; + + if (flags & ZIO_FLAG_DELEGATED) { + vsx->vsx_agg_histo[priority] + [RQ_HISTO(zio->io_size)]++; + } else { + vsx->vsx_ind_histo[priority] + [RQ_HISTO(zio->io_size)]++; + } + + if (zio->io_delta && zio->io_delay) { + vsx->vsx_queue_histo[priority] + [L_HISTO(zio->io_delta - zio->io_delay)]++; + vsx->vsx_disk_histo[type] + [L_HISTO(zio->io_delay)]++; + vsx->vsx_total_histo[type] + [L_HISTO(zio->io_delta)]++; + } + } + + mutex_exit(&vd->vdev_stat_lock); + return; + } + + if (flags & ZIO_FLAG_SPECULATIVE) + return; + + /* + * If this is an I/O error that is going to be retried, then ignore the + * error. Otherwise, the user may interpret B_FAILFAST I/O errors as + * hard errors, when in reality they can happen for any number of + * innocuous reasons (bus resets, MPxIO link failure, etc). + */ + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) + return; + + /* + * Intent log writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + + if (spa->spa_load_state == SPA_LOAD_NONE && + type == ZIO_TYPE_WRITE && txg != 0 && + (!(flags & ZIO_FLAG_IO_REPAIR) || + (flags & ZIO_FLAG_SCAN_THREAD) || + spa->spa_claiming)) { + /* + * This is either a normal write (not a repair), or it's + * a repair induced by the scrub thread, or it's a repair + * made by zil_claim() during spa_load() in the first txg. + * In the normal case, we commit the DTL change in the same + * txg as the block was born. In the scrub-induced repair + * case, we know that scrubs run in first-pass syncing context, + * so we commit the DTL change in spa_syncing_txg(spa). + * In the zil_claim() case, we commit in spa_first_txg(spa). + * + * We currently do not make DTL entries for failed spontaneous + * self-healing writes triggered by normal (non-scrubbing) + * reads, because we have no transactional context in which to + * do so -- and it's not clear that it'd be desirable anyway. + */ + if (vd->vdev_ops->vdev_op_leaf) { + uint64_t commit_txg = txg; + if (flags & ZIO_FLAG_SCAN_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + ASSERT(spa_sync_pass(spa) == 1); + vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); + commit_txg = spa_syncing_txg(spa); + } else if (spa->spa_claiming) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + commit_txg = spa_first_txg(spa); + } + ASSERT(commit_txg >= spa_syncing_txg(spa)); + if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) + return; + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); + vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); + } + if (vd != rvd) + vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); + } +} + +int64_t +vdev_deflated_space(vdev_t *vd, int64_t space) +{ + ASSERT((space & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); + + return ((space >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio); +} + +/* + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. + */ +void +vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta) +{ + int64_t dspace_delta; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(vd == vd->vdev_top); + + /* + * Apply the inverse of the psize-to-asize (i.e. RAID-Z) space-expansion + * factor. 
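[Editor's sketch] vdev_deflated_space() above converts a raw byte delta into 512-byte block units scaled by the vdev's deflate ratio. A standalone version of that arithmetic; the ratio value of 512 is an illustrative assumption corresponding to a vdev with no psize-to-asize expansion, in which case the deflated value equals the raw value:

#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte minimum block */

static int64_t
deflated_space(int64_t space, int64_t deflate_ratio)
{
	/* space is asserted above to be a multiple of SPA_MINBLOCKSIZE. */
	return ((space >> SPA_MINBLOCKSHIFT) * deflate_ratio);
}

int
main(void)
{
	/* With ratio 512: (space >> 9) * 512 == space, so 1 MiB in, 1 MiB out. */
	printf("%lld\n", (long long)deflated_space(1 << 20, 512));
	return (0);
}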
We must calculate this here and not at the root vdev + * because the root vdev's psize-to-asize is simply the max of its + * children's, thus not accurate enough for us. + */ + dspace_delta = vdev_deflated_space(vd, space_delta); + + mutex_enter(&vd->vdev_stat_lock); + /* ensure we won't underflow */ + if (alloc_delta < 0) { + ASSERT3U(vd->vdev_stat.vs_alloc, >=, -alloc_delta); + } + + vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_space += space_delta; + vd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&vd->vdev_stat_lock); + + /* every class but log contributes to root space stats */ + if (vd->vdev_mg != NULL && !vd->vdev_islog) { + ASSERT(!vd->vdev_isl2cache); + mutex_enter(&rvd->vdev_stat_lock); + rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_space += space_delta; + rvd->vdev_stat.vs_dspace += dspace_delta; + mutex_exit(&rvd->vdev_stat_lock); + } + /* Note: metaslab_class_space_update moved to metaslab_space_update */ +} + +/* + * Mark a top-level vdev's config as dirty, placing it on the dirty list + * so that it will be written out next time the vdev configuration is synced. + * If the root vdev is specified (vdev_top == NULL), dirty all top-level vdevs. + */ +void +vdev_config_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int c; + + ASSERT(spa_writeable(spa)); + + /* + * If this is an aux vdev (as with l2cache and spare devices), then we + * update the vdev config manually and set the sync flag. + */ + if (vd->vdev_aux != NULL) { + spa_aux_vdev_t *sav = vd->vdev_aux; + nvlist_t **aux; + uint_t naux; + + for (c = 0; c < sav->sav_count; c++) { + if (sav->sav_vdevs[c] == vd) + break; + } + + if (c == sav->sav_count) { + /* + * We're being removed. There's nothing more to do. + */ + ASSERT(sav->sav_sync == B_TRUE); + return; + } + + sav->sav_sync = B_TRUE; + + if (nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + } + + ASSERT(c < naux); + + /* + * Setting the nvlist in the middle of the array is a little + * sketchy, but it will work. + */ + nvlist_free(aux[c]); + aux[c] = vdev_config_generate(spa, vd, B_TRUE, 0); + + return; + } + + /* + * The dirty list is protected by the SCL_CONFIG lock. The caller + * must either hold SCL_CONFIG as writer, or must be the sync thread + * (which holds SCL_CONFIG as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + if (vd == rvd) { + for (c = 0; c < rvd->vdev_children; c++) + vdev_config_dirty(rvd->vdev_child[c]); + } else { + ASSERT(vd == vd->vdev_top); + + if (!list_link_active(&vd->vdev_config_dirty_node) && + vdev_is_concrete(vd)) { + list_insert_head(&spa->spa_config_dirty_list, vd); + } + } +} + +void +vdev_config_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_CONFIG, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_config_dirty_node)); + list_remove(&spa->spa_config_dirty_list, vd); +} + +/* + * Mark a top-level vdev's state as dirty, so that the next pass of + * spa_sync() can convert this into vdev_config_dirty(). 
We distinguish + * the state changes from larger config changes because they require + * much less locking, and are often needed for administrative actions. + */ +void +vdev_state_dirty(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_writeable(spa)); + ASSERT(vd == vd->vdev_top); + + /* + * The state list is protected by the SCL_STATE lock. The caller + * must either hold SCL_STATE as writer, or must be the sync thread + * (which holds SCL_STATE as reader). There's only one sync thread, + * so this is sufficient to ensure mutual exclusion. + */ + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + if (!list_link_active(&vd->vdev_state_dirty_node) && + vdev_is_concrete(vd)) + list_insert_head(&spa->spa_state_dirty_list, vd); +} + +void +vdev_state_clean(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_WRITER) || + (dsl_pool_sync_context(spa_get_dsl(spa)) && + spa_config_held(spa, SCL_STATE, RW_READER))); + + ASSERT(list_link_active(&vd->vdev_state_dirty_node)); + list_remove(&spa->spa_state_dirty_list, vd); +} + +/* + * Propagate vdev state up from children to parent. + */ +void +vdev_propagate_state(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + int degraded = 0, faulted = 0; + int corrupted = 0; + vdev_t *child; + + if (vd->vdev_children > 0) { + for (int c = 0; c < vd->vdev_children; c++) { + child = vd->vdev_child[c]; + + /* + * Don't factor holes or indirect vdevs into the + * decision. + */ + if (!vdev_is_concrete(child)) + continue; + + if (!vdev_readable(child) || + (!vdev_writeable(child) && spa_writeable(spa))) { + /* + * Root special: if there is a top-level log + * device, treat the root vdev as if it were + * degraded. + */ + if (child->vdev_islog && vd == rvd) + degraded++; + else + faulted++; + } else if (child->vdev_state <= VDEV_STATE_DEGRADED) { + degraded++; + } + + if (child->vdev_stat.vs_aux == VDEV_AUX_CORRUPT_DATA) + corrupted++; + } + + vd->vdev_ops->vdev_op_state_change(vd, faulted, degraded); + + /* + * Root special: if there is a top-level vdev that cannot be + * opened due to corrupted metadata, then propagate the root + * vdev's aux state as 'corrupt' rather than 'insufficient + * replicas'. + */ + if (corrupted && vd == rvd && + rvd->vdev_state == VDEV_STATE_CANT_OPEN) + vdev_set_state(rvd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_CORRUPT_DATA); + } + + if (vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); +} + +/* + * Set a vdev's state. If this is during an open, we don't update the parent + * state, because we're in the process of opening children depth-first. + * Otherwise, we propagate the change to the parent. + * + * If this routine places a device in a faulted state, an appropriate ereport is + * generated. + */ +void +vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) +{ + uint64_t save_state; + spa_t *spa = vd->vdev_spa; + + if (state == vd->vdev_state) { + /* + * Since vdev_offline() code path is already in an offline + * state we can miss a statechange event to OFFLINE. Check + * the previous state to catch this condition. 
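[Editor's sketch] vdev_propagate_state() above classifies each concrete child as faulted (unreadable, or unwriteable in a writeable pool) or degraded, then lets the parent's state-change op decide the aggregate. A standalone sketch of just the counting loop; the struct is a simplified stand-in and the root/log special case is omitted:

#include <stdbool.h>
#include <stdio.h>

typedef struct {
	bool readable;
	bool writeable;
	bool degraded;	/* state <= VDEV_STATE_DEGRADED in the real code */
} child_sketch_t;

int
main(void)
{
	child_sketch_t kids[3] = {
		{ true,  true,  false },	/* healthy */
		{ false, false, false },	/* unreadable -> faulted */
		{ true,  true,  true },		/* -> degraded */
	};
	int faulted = 0, degraded = 0;

	for (int c = 0; c < 3; c++) {
		if (!kids[c].readable || !kids[c].writeable)
			faulted++;
		else if (kids[c].degraded)
			degraded++;
	}
	printf("faulted=%d degraded=%d\n", faulted, degraded);
	return (0);
}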
+ */ + if (vd->vdev_ops->vdev_op_leaf && + (state == VDEV_STATE_OFFLINE) && + (vd->vdev_prevstate >= VDEV_STATE_FAULTED)) { + /* post an offline state change */ + zfs_post_state_change(spa, vd, vd->vdev_prevstate); + } + vd->vdev_stat.vs_aux = aux; + return; + } + + save_state = vd->vdev_state; + + vd->vdev_state = state; + vd->vdev_stat.vs_aux = aux; + + /* + * If we are setting the vdev state to anything but an open state, then + * always close the underlying device unless the device has requested + * a delayed close (i.e. we're about to remove or fault the device). + * Otherwise, we keep accessible but invalid devices open forever. + * We don't call vdev_close() itself, because that implies some extra + * checks (offline, etc) that we don't want here. This is limited to + * leaf devices, because otherwise closing the device will affect other + * children. + */ + if (!vd->vdev_delayed_close && vdev_is_dead(vd) && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_ops->vdev_op_close(vd); + + if (vd->vdev_removed && + state == VDEV_STATE_CANT_OPEN && + (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { + /* + * If the previous state is set to VDEV_STATE_REMOVED, then this + * device was previously marked removed and someone attempted to + * reopen it. If this failed due to a nonexistent device, then + * keep the device in the REMOVED state. We also let this be if + * it is one of our special test online cases, which is only + * attempting to online the device and shouldn't generate an FMA + * fault. + */ + vd->vdev_state = VDEV_STATE_REMOVED; + vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + } else if (state == VDEV_STATE_REMOVED) { + vd->vdev_removed = B_TRUE; + } else if (state == VDEV_STATE_CANT_OPEN) { + /* + * If we fail to open a vdev during an import or recovery, we + * mark it as "not available", which signifies that it was + * never there to begin with. Failure to open such a device + * is not considered an error. + */ + if ((spa_load_state(spa) == SPA_LOAD_IMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) && + vd->vdev_ops->vdev_op_leaf) + vd->vdev_not_present = 1; + + /* + * Post the appropriate ereport. If the 'prevstate' field is + * set to something other than VDEV_STATE_UNKNOWN, it indicates + * that this is part of a vdev_reopen(). In this case, we don't + * want to post the ereport if the device was already in the + * CANT_OPEN state beforehand. + * + * If the 'checkremove' flag is set, then this is an attempt to + * online the device in response to an insertion event. If we + * hit this case, then we have detected an insertion event for a + * faulted or offline device that wasn't in the removed state. + * In this scenario, we don't post an ereport because we are + * about to replace the device, or attempt an online with + * vdev_forcefault, which will generate the fault for us. 
+ */ + if ((vd->vdev_prevstate != state || vd->vdev_forcefault) && + !vd->vdev_not_present && !vd->vdev_checkremove && + vd != spa->spa_root_vdev) { + const char *class; + + switch (aux) { + case VDEV_AUX_OPEN_FAILED: + class = FM_EREPORT_ZFS_DEVICE_OPEN_FAILED; + break; + case VDEV_AUX_CORRUPT_DATA: + class = FM_EREPORT_ZFS_DEVICE_CORRUPT_DATA; + break; + case VDEV_AUX_NO_REPLICAS: + class = FM_EREPORT_ZFS_DEVICE_NO_REPLICAS; + break; + case VDEV_AUX_BAD_GUID_SUM: + class = FM_EREPORT_ZFS_DEVICE_BAD_GUID_SUM; + break; + case VDEV_AUX_TOO_SMALL: + class = FM_EREPORT_ZFS_DEVICE_TOO_SMALL; + break; + case VDEV_AUX_BAD_LABEL: + class = FM_EREPORT_ZFS_DEVICE_BAD_LABEL; + break; + case VDEV_AUX_BAD_ASHIFT: + class = FM_EREPORT_ZFS_DEVICE_BAD_ASHIFT; + break; + default: + class = FM_EREPORT_ZFS_DEVICE_UNKNOWN; + } + + zfs_ereport_post(class, spa, vd, NULL, NULL, + save_state, 0); + } + + /* Erase any notion of persistent removed state */ + vd->vdev_removed = B_FALSE; + } else { + vd->vdev_removed = B_FALSE; + } + + /* + * Notify ZED of any significant state-change on a leaf vdev. + * + */ + if (vd->vdev_ops->vdev_op_leaf) { + /* preserve original state from a vdev_reopen() */ + if ((vd->vdev_prevstate != VDEV_STATE_UNKNOWN) && + (vd->vdev_prevstate != vd->vdev_state) && + (save_state <= VDEV_STATE_CLOSED)) + save_state = vd->vdev_prevstate; + + /* filter out state change due to initial vdev_open */ + if (save_state > VDEV_STATE_CLOSED) + zfs_post_state_change(spa, vd, save_state); + } + + if (!isopen && vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); +} + +boolean_t +vdev_children_are_offline(vdev_t *vd) +{ + ASSERT(!vd->vdev_ops->vdev_op_leaf); + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + if (vd->vdev_child[i]->vdev_state != VDEV_STATE_OFFLINE) + return (B_FALSE); + } + + return (B_TRUE); +} + +/* + * Check the vdev configuration to ensure that it's capable of supporting + * a root pool. We do not support partial configuration. + */ +boolean_t +vdev_is_bootable(vdev_t *vd) +{ + if (!vd->vdev_ops->vdev_op_leaf) { + const char *vdev_type = vd->vdev_ops->vdev_op_type; + + if (strcmp(vdev_type, VDEV_TYPE_MISSING) == 0 || + strcmp(vdev_type, VDEV_TYPE_INDIRECT) == 0) { + return (B_FALSE); + } + } + + for (int c = 0; c < vd->vdev_children; c++) { + if (!vdev_is_bootable(vd->vdev_child[c])) + return (B_FALSE); + } + return (B_TRUE); +} + +boolean_t +vdev_is_concrete(vdev_t *vd) +{ + vdev_ops_t *ops = vd->vdev_ops; + if (ops == &vdev_indirect_ops || ops == &vdev_hole_ops || + ops == &vdev_missing_ops || ops == &vdev_root_ops) { + return (B_FALSE); + } else { + return (B_TRUE); + } +} + +/* + * Determine if a log device has valid content. If the vdev was + * removed or faulted in the MOS config then we know that + * the content on the log device has already been written to the pool. + */ +boolean_t +vdev_log_state_valid(vdev_t *vd) +{ + if (vd->vdev_ops->vdev_op_leaf && !vd->vdev_faulted && + !vd->vdev_removed) + return (B_TRUE); + + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_log_state_valid(vd->vdev_child[c])) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Expand a vdev if possible. 
+ */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vdev_is_concrete(vd)); + + vdev_set_deflate_ratio(vd); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count && + vdev_is_concrete(vd)) { + vdev_metaslab_group_create(vd); + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +} + +/* + * Split a vdev. + */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +} + +void +vdev_deadman(vdev_t *vd, char *tag) +{ + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + vdev_deadman(cvd, tag); + } + + if (vd->vdev_ops->vdev_op_leaf) { + vdev_queue_t *vq = &vd->vdev_queue; + + mutex_enter(&vq->vq_lock); + if (avl_numnodes(&vq->vq_active_tree) > 0) { + spa_t *spa = vd->vdev_spa; + zio_t *fio; + uint64_t delta; + + zfs_dbgmsg("slow vdev: %s has %d active IOs", + vd->vdev_path, avl_numnodes(&vq->vq_active_tree)); + + /* + * Look at the head of all the pending queues, + * if any I/O has been outstanding for longer than + * the spa_deadman_synctime invoke the deadman logic. + */ + fio = avl_first(&vq->vq_active_tree); + delta = gethrtime() - fio->io_timestamp; + if (delta > spa_deadman_synctime(spa)) + zio_deadman(fio, tag); + } + mutex_exit(&vq->vq_lock); + } +} + +void +vdev_defer_resilver(vdev_t *vd) +{ + ASSERT(vd->vdev_ops->vdev_op_leaf); + + vd->vdev_resilver_deferred = B_TRUE; + vd->vdev_spa->spa_resilver_deferred = B_TRUE; +} + +/* + * Clears the resilver deferred flag on all leaf devs under vd. Returns + * B_TRUE if we have devices that need to be resilvered and are available to + * accept resilver I/Os. + */ +boolean_t +vdev_clear_resilver_deferred(vdev_t *vd, dmu_tx_t *tx) +{ + boolean_t resilver_needed = B_FALSE; + spa_t *spa = vd->vdev_spa; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + resilver_needed |= vdev_clear_resilver_deferred(cvd, tx); + } + + if (vd == spa->spa_root_vdev && + spa_feature_is_active(spa, SPA_FEATURE_RESILVER_DEFER)) { + spa_feature_decr(spa, SPA_FEATURE_RESILVER_DEFER, tx); + vdev_config_dirty(vd); + spa->spa_resilver_deferred = B_FALSE; + return (resilver_needed); + } + + if (!vdev_is_concrete(vd) || vd->vdev_aux || + !vd->vdev_ops->vdev_op_leaf) + return (resilver_needed); + + vd->vdev_resilver_deferred = B_FALSE; + + return (!vdev_is_dead(vd) && !vd->vdev_offline && + vdev_resilver_needed(vd, NULL, NULL)); +} + +/* + * Translate a logical range to the physical range for the specified vdev_t. + * This function is initially called with a leaf vdev and will walk each + * parent vdev until it reaches a top-level vdev. Once the top-level is + * reached the physical range is initialized and the recursive function + * begins to unwind. As it unwinds it calls the parent's vdev specific + * translation function to do the real conversion. + */ +void +vdev_xlate(vdev_t *vd, const range_seg64_t *logical_rs, + range_seg64_t *physical_rs) +{ + /* + * Walk up the vdev tree + */ + if (vd != vd->vdev_top) { + vdev_xlate(vd->vdev_parent, logical_rs, physical_rs); + } else { + /* + * We've reached the top-level vdev, initialize the + * physical range to the logical range and start to + * unwind. 
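[Editor's sketch] vdev_deadman() above flags a leaf when its oldest active zio has been outstanding longer than spa_deadman_synctime. A standalone version of that age check; timestamps are in nanoseconds, and the 600-second threshold is an illustrative value, not the ZFS default per se:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define	NSEC_PER_SEC	1000000000ULL

static bool
io_is_hung(uint64_t now, uint64_t io_timestamp, uint64_t deadman_synctime)
{
	/* Same shape as the delta check in vdev_deadman() above. */
	return (now - io_timestamp > deadman_synctime);
}

int
main(void)
{
	uint64_t deadman = 600 * NSEC_PER_SEC;	/* illustrative threshold */

	printf("%d\n", io_is_hung(700 * NSEC_PER_SEC, 50 * NSEC_PER_SEC, deadman));
	printf("%d\n", io_is_hung(500 * NSEC_PER_SEC, 50 * NSEC_PER_SEC, deadman));
	return (0);
}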
+ */ + physical_rs->rs_start = logical_rs->rs_start; + physical_rs->rs_end = logical_rs->rs_end; + return; + } + + vdev_t *pvd = vd->vdev_parent; + ASSERT3P(pvd, !=, NULL); + ASSERT3P(pvd->vdev_ops->vdev_op_xlate, !=, NULL); + + /* + * As this recursive function unwinds, translate the logical + * range into its physical components by calling the + * vdev specific translate function. + */ + range_seg64_t intermediate = { 0 }; + pvd->vdev_ops->vdev_op_xlate(vd, physical_rs, &intermediate); + + physical_rs->rs_start = intermediate.rs_start; + physical_rs->rs_end = intermediate.rs_end; +} + +/* + * Look at the vdev tree and determine whether any devices are currently being + * replaced. + */ +boolean_t +vdev_replace_in_progress(vdev_t *vdev) +{ + ASSERT(spa_config_held(vdev->vdev_spa, SCL_ALL, RW_READER) != 0); + + if (vdev->vdev_ops == &vdev_replacing_ops) + return (B_TRUE); + + /* + * A 'spare' vdev indicates that we have a replace in progress, unless + * it has exactly two children, and the second, the hot spare, has + * finished being resilvered. + */ + if (vdev->vdev_ops == &vdev_spare_ops && (vdev->vdev_children > 2 || + !vdev_dtl_empty(vdev->vdev_child[1], DTL_MISSING))) + return (B_TRUE); + + for (int i = 0; i < vdev->vdev_children; i++) { + if (vdev_replace_in_progress(vdev->vdev_child[i])) + return (B_TRUE); + } + + return (B_FALSE); +} + +EXPORT_SYMBOL(vdev_fault); +EXPORT_SYMBOL(vdev_degrade); +EXPORT_SYMBOL(vdev_online); +EXPORT_SYMBOL(vdev_offline); +EXPORT_SYMBOL(vdev_clear); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_count, INT, ZMOD_RW, + "Target number of metaslabs per top-level vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, default_ms_shift, INT, ZMOD_RW, + "Default limit for metaslab size"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, min_ms_count, INT, ZMOD_RW, + "Minimum number of metaslabs per top-level vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, ms_count_limit, INT, ZMOD_RW, + "Practical upper limit of total metaslabs per top-level vdev"); + +ZFS_MODULE_PARAM(zfs, zfs_, slow_io_events_per_second, UINT, ZMOD_RW, + "Rate limit slow IO (delay) events to this many per second"); + +ZFS_MODULE_PARAM(zfs, zfs_, checksum_events_per_second, UINT, ZMOD_RW, + "Rate limit checksum events to this many checksum errors per second " + "(do not set below zed threshold)."); + +ZFS_MODULE_PARAM(zfs, zfs_, scan_ignore_errors, INT, ZMOD_RW, + "Ignore errors during resilver/scrub"); + +ZFS_MODULE_PARAM(zfs_vdev, vdev_, validate_skip, INT, ZMOD_RW, + "Bypass vdev_validate()"); + +ZFS_MODULE_PARAM(zfs, zfs_, nocacheflush, INT, ZMOD_RW, + "Disable cache flushes"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, min_auto_ashift, + param_set_min_auto_ashift, param_get_ulong, ZMOD_RW, + "Minimum ashift used when creating new top-level vdevs"); + +ZFS_MODULE_PARAM_CALL(zfs_vdev, zfs_vdev_, max_auto_ashift, + param_set_max_auto_ashift, param_get_ulong, ZMOD_RW, + "Maximum ashift used when optimizing for logical -> physical sector " + "size on new top-level vdevs"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_cache.c b/module/zfs/vdev_cache.c new file mode 100644 index 000000000000..6e82184b800d --- /dev/null +++ b/module/zfs/vdev_cache.c @@ -0,0 +1,437 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/vdev_impl.h> +#include <sys/zio.h> +#include <sys/kstat.h> +#include <sys/abd.h> + +/* + * Virtual device read-ahead caching. + * + * This file implements a simple LRU read-ahead cache. When the DMU reads + * a given block, it will often want other, nearby blocks soon thereafter. + * We take advantage of this by reading a larger disk region and caching + * the result. In the best case, this can turn 128 back-to-back 512-byte + * reads into a single 64k read followed by 127 cache hits; this reduces + * latency dramatically. In the worst case, it can turn an isolated 512-byte + * read into a 64k read, which doesn't affect latency all that much but is + * terribly wasteful of bandwidth. A more intelligent version of the cache + * could keep track of access patterns and not do read-ahead unless it sees + * at least two temporally close I/Os to the same region. Currently, only + * metadata I/O is inflated. A further enhancement could take advantage of + * more semantic information about the I/O. And it could use something + * faster than an AVL tree; that was chosen solely for convenience. + * + * There are five cache operations: allocate, fill, read, write, evict. + * + * (1) Allocate. This reserves a cache entry for the specified region. + * We separate the allocate and fill operations so that multiple threads + * don't generate I/O for the same cache miss. + * + * (2) Fill. When the I/O for a cache miss completes, the fill routine + * places the data in the previously allocated cache entry. + * + * (3) Read. Read data from the cache. + * + * (4) Write. Update cache contents after write completion. + * + * (5) Evict. When allocating a new entry, we evict the oldest (LRU) entry + * if the total cache size exceeds zfs_vdev_cache_size. + */ + +/* + * These tunables are for performance analysis. + */ +/* + * All i/os smaller than zfs_vdev_cache_max will be turned into + * 1<<zfs_vdev_cache_bshift byte reads by the vdev_cache (aka software + * track buffer). At most zfs_vdev_cache_size bytes will be kept in each + * vdev's vdev_cache. + * + * TODO: Note that with the current ZFS code, it turns out that the + * vdev cache is not helpful, and in some cases actually harmful. It + * is better if we disable this. Once some time has passed, we should + * actually remove this to simplify the code. For now we just disable + * it by setting the zfs_vdev_cache_size to zero. Note that Solaris 11 + * has made these same changes. + */ +int zfs_vdev_cache_max = 1<<14; /* 16KB */ +int zfs_vdev_cache_size = 0; +int zfs_vdev_cache_bshift = 16; + +#define VCBS (1 << zfs_vdev_cache_bshift) /* 64KB */ + +static kstat_t *vdc_ksp = NULL; + +typedef struct vdc_stats { + kstat_named_t vdc_stat_delegations; + kstat_named_t vdc_stat_hits; + kstat_named_t vdc_stat_misses; +} vdc_stats_t; + +static vdc_stats_t vdc_stats = { + { "delegations", KSTAT_DATA_UINT64 }, + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 } +}; + +#define VDCSTAT_BUMP(stat) atomic_inc_64(&vdc_stats.stat.value.ui64) + +static int +vdev_cache_offset_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; + const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; + + return (TREE_CMP(ve1->ve_offset, ve2->ve_offset)); +} + +static int +vdev_cache_lastused_compare(const void *a1, const void *a2) +{ + const vdev_cache_entry_t *ve1 = (const vdev_cache_entry_t *)a1; + const vdev_cache_entry_t *ve2 = (const vdev_cache_entry_t *)a2; + + int cmp = TREE_CMP(ve1->ve_lastused, ve2->ve_lastused); + if (likely(cmp)) + return (cmp); + + /* + * Among equally old entries, sort by offset to ensure uniqueness. + */ + return (vdev_cache_offset_compare(a1, a2)); +} + +/* + * Evict the specified entry from the cache. 
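[Editor's sketch] The allocate/fill split described above exists so that a second reader of the same cache line finds the placeholder entry (with its fill I/O still pending) and attaches to the in-flight read instead of issuing a duplicate. A standalone sketch of the placeholder test; the struct and names are simplified stand-ins for vdev_cache_entry_t:

#include <stdio.h>
#include <stdbool.h>

typedef struct {
	long long offset;	/* cache-line-aligned offset */
	void *fill_io;		/* non-NULL while the backing read is in flight */
} entry_sketch_t;

/* True when a reader should wait on the existing fill rather than re-read. */
static bool
should_piggyback(const entry_sketch_t *e, long long offset)
{
	return (e->offset == offset && e->fill_io != NULL);
}

int
main(void)
{
	int pending_read;	/* stands in for the fill zio */
	entry_sketch_t e = { .offset = 0, .fill_io = &pending_read };

	printf("%d\n", should_piggyback(&e, 0));	/* 1: attach to the fill */
	return (0);
}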
+ */ +static void +vdev_cache_evict(vdev_cache_t *vc, vdev_cache_entry_t *ve) +{ + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT3P(ve->ve_fill_io, ==, NULL); + ASSERT3P(ve->ve_abd, !=, NULL); + + avl_remove(&vc->vc_lastused_tree, ve); + avl_remove(&vc->vc_offset_tree, ve); + abd_free(ve->ve_abd); + kmem_free(ve, sizeof (vdev_cache_entry_t)); +} + +/* + * Allocate an entry in the cache. At this point we don't have the data, + * we're just creating a placeholder so that multiple threads don't all + * go off and read the same blocks. + */ +static vdev_cache_entry_t * +vdev_cache_allocate(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + uint64_t offset = P2ALIGN(zio->io_offset, VCBS); + vdev_cache_entry_t *ve; + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + + if (zfs_vdev_cache_size == 0) + return (NULL); + + /* + * If adding a new entry would exceed the cache size, + * evict the oldest entry (LRU). + */ + if ((avl_numnodes(&vc->vc_lastused_tree) << zfs_vdev_cache_bshift) > + zfs_vdev_cache_size) { + ve = avl_first(&vc->vc_lastused_tree); + if (ve->ve_fill_io != NULL) + return (NULL); + ASSERT3U(ve->ve_hits, !=, 0); + vdev_cache_evict(vc, ve); + } + + ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); + ve->ve_offset = offset; + ve->ve_lastused = ddi_get_lbolt(); + ve->ve_abd = abd_alloc_for_io(VCBS, B_TRUE); + + avl_add(&vc->vc_offset_tree, ve); + avl_add(&vc->vc_lastused_tree, ve); + + return (ve); +} + +static void +vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) +{ + uint64_t cache_phase = P2PHASE(zio->io_offset, VCBS); + + ASSERT(MUTEX_HELD(&vc->vc_lock)); + ASSERT3P(ve->ve_fill_io, ==, NULL); + + if (ve->ve_lastused != ddi_get_lbolt()) { + avl_remove(&vc->vc_lastused_tree, ve); + ve->ve_lastused = ddi_get_lbolt(); + avl_add(&vc->vc_lastused_tree, ve); + } + + ve->ve_hits++; + abd_copy_off(zio->io_abd, ve->ve_abd, 0, cache_phase, zio->io_size); +} + +/* + * Fill a previously allocated cache entry with data. + */ +static void +vdev_cache_fill(zio_t *fio) +{ + vdev_t *vd = fio->io_vd; + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve = fio->io_private; + zio_t *pio; + + ASSERT3U(fio->io_size, ==, VCBS); + + /* + * Add data to the cache. + */ + mutex_enter(&vc->vc_lock); + + ASSERT3P(ve->ve_fill_io, ==, fio); + ASSERT3U(ve->ve_offset, ==, fio->io_offset); + ASSERT3P(ve->ve_abd, ==, fio->io_abd); + + ve->ve_fill_io = NULL; + + /* + * Even if this cache line was invalidated by a missed write update, + * any reads that were queued up before the missed update are still + * valid, so we can satisfy them from this line before we evict it. + */ + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(fio, &zl)) != NULL) + vdev_cache_hit(vc, ve, pio); + + if (fio->io_error || ve->ve_missed_update) + vdev_cache_evict(vc, ve); + + mutex_exit(&vc->vc_lock); +} + +/* + * Read data from the cache. Returns B_TRUE on cache hit, B_FALSE on miss. + */ +boolean_t +vdev_cache_read(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, *ve_search; + uint64_t cache_offset = P2ALIGN(zio->io_offset, VCBS); + zio_t *fio; + uint64_t cache_phase __maybe_unused = P2PHASE(zio->io_offset, VCBS); + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + if (zio->io_flags & ZIO_FLAG_DONT_CACHE) + return (B_FALSE); + + if (zio->io_size > zfs_vdev_cache_max) + return (B_FALSE); + + /* + * If the I/O straddles two or more cache blocks, don't cache it. 
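[Editor's sketch] An I/O straddles cache lines when its first and last bytes fall in different VCBS-aligned blocks, which is what the P2BOUNDARY check below detects. A standalone, equivalent-in-spirit version of that boundary test; the 64 KiB line size matches zfs_vdev_cache_bshift = 16 but is hard-coded here for illustration:

#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

#define	VCBS	(1ULL << 16)	/* 64 KiB cache line (bshift = 16) */

static bool
straddles(uint64_t off, uint64_t size)
{
	/* First and last byte land in different VCBS-aligned blocks. */
	return ((off / VCBS) != ((off + size - 1) / VCBS));
}

int
main(void)
{
	printf("%d\n", straddles(0, 512));		/* one line: 0 */
	printf("%d\n", straddles(VCBS - 256, 512));	/* crosses: 1 */
	return (0);
}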
+ */ + if (P2BOUNDARY(zio->io_offset, zio->io_size, VCBS)) + return (B_FALSE); + + ASSERT3U(cache_phase + zio->io_size, <=, VCBS); + + mutex_enter(&vc->vc_lock); + + ve_search = kmem_alloc(sizeof (vdev_cache_entry_t), KM_SLEEP); + ve_search->ve_offset = cache_offset; + ve = avl_find(&vc->vc_offset_tree, ve_search, NULL); + kmem_free(ve_search, sizeof (vdev_cache_entry_t)); + + if (ve != NULL) { + if (ve->ve_missed_update) { + mutex_exit(&vc->vc_lock); + return (B_FALSE); + } + + if ((fio = ve->ve_fill_io) != NULL) { + zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_delegations); + return (B_TRUE); + } + + vdev_cache_hit(vc, ve, zio); + zio_vdev_io_bypass(zio); + + mutex_exit(&vc->vc_lock); + VDCSTAT_BUMP(vdc_stat_hits); + return (B_TRUE); + } + + ve = vdev_cache_allocate(zio); + + if (ve == NULL) { + mutex_exit(&vc->vc_lock); + return (B_FALSE); + } + + fio = zio_vdev_delegated_io(zio->io_vd, cache_offset, + ve->ve_abd, VCBS, ZIO_TYPE_READ, ZIO_PRIORITY_NOW, + ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); + + ve->ve_fill_io = fio; + zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); + + mutex_exit(&vc->vc_lock); + zio_nowait(fio); + VDCSTAT_BUMP(vdc_stat_misses); + + return (B_TRUE); +} + +/* + * Update cache contents upon write completion. + */ +void +vdev_cache_write(zio_t *zio) +{ + vdev_cache_t *vc = &zio->io_vd->vdev_cache; + vdev_cache_entry_t *ve, ve_search; + uint64_t io_start = zio->io_offset; + uint64_t io_end = io_start + zio->io_size; + uint64_t min_offset = P2ALIGN(io_start, VCBS); + uint64_t max_offset = P2ROUNDUP(io_end, VCBS); + avl_index_t where; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + + mutex_enter(&vc->vc_lock); + + ve_search.ve_offset = min_offset; + ve = avl_find(&vc->vc_offset_tree, &ve_search, &where); + + if (ve == NULL) + ve = avl_nearest(&vc->vc_offset_tree, where, AVL_AFTER); + + while (ve != NULL && ve->ve_offset < max_offset) { + uint64_t start = MAX(ve->ve_offset, io_start); + uint64_t end = MIN(ve->ve_offset + VCBS, io_end); + + if (ve->ve_fill_io != NULL) { + ve->ve_missed_update = 1; + } else { + abd_copy_off(ve->ve_abd, zio->io_abd, + start - ve->ve_offset, start - io_start, + end - start); + } + ve = AVL_NEXT(&vc->vc_offset_tree, ve); + } + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_purge(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + vdev_cache_entry_t *ve; + + mutex_enter(&vc->vc_lock); + while ((ve = avl_first(&vc->vc_offset_tree)) != NULL) + vdev_cache_evict(vc, ve); + mutex_exit(&vc->vc_lock); +} + +void +vdev_cache_init(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + mutex_init(&vc->vc_lock, NULL, MUTEX_DEFAULT, NULL); + + avl_create(&vc->vc_offset_tree, vdev_cache_offset_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_offset_node)); + + avl_create(&vc->vc_lastused_tree, vdev_cache_lastused_compare, + sizeof (vdev_cache_entry_t), + offsetof(struct vdev_cache_entry, ve_lastused_node)); +} + +void +vdev_cache_fini(vdev_t *vd) +{ + vdev_cache_t *vc = &vd->vdev_cache; + + vdev_cache_purge(vd); + + avl_destroy(&vc->vc_offset_tree); + avl_destroy(&vc->vc_lastused_tree); + + mutex_destroy(&vc->vc_lock); +} + +void +vdev_cache_stat_init(void) +{ + vdc_ksp = kstat_create("zfs", 0, "vdev_cache_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (vdc_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (vdc_ksp != NULL) { + vdc_ksp->ks_data = &vdc_stats; + kstat_install(vdc_ksp); + } +} + +void +vdev_cache_stat_fini(void) +{ + if (vdc_ksp != 
NULL) { + kstat_delete(vdc_ksp); + vdc_ksp = NULL; + } +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_max, INT, ZMOD_RW, + "Inflate reads smaller than max"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_size, INT, ZMOD_RD, + "Total size of the per-disk cache"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, cache_bshift, INT, ZMOD_RW, + "Shift size to inflate reads to"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_indirect.c b/module/zfs/vdev_indirect.c new file mode 100644 index 000000000000..6a944f4e88b6 --- /dev/null +++ b/module/zfs/vdev_indirect.c @@ -0,0 +1,1890 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2014, 2017 by Delphix. All rights reserved. + * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. + * Copyright (c) 2014, 2019 by Delphix. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/vdev_impl.h> +#include <sys/fs/zfs.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/metaslab.h> +#include <sys/dmu.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/dmu_tx.h> +#include <sys/dsl_synctask.h> +#include <sys/zap.h> +#include <sys/abd.h> +#include <sys/zthr.h> + +/* + * An indirect vdev corresponds to a vdev that has been removed. Since + * we cannot rewrite block pointers of snapshots, etc., we keep a + * mapping from old location on the removed device to the new location + * on another device in the pool and use this mapping whenever we need + * to access the DVA. Unfortunately, this mapping did not respect + * logical block boundaries when it was first created, and so a DVA on + * this indirect vdev may be "split" into multiple sections that each + * map to a different location. As a consequence, not all DVAs can be + * translated to an equivalent new DVA. Instead we must provide a + * "vdev_remap" operation that executes a callback on each contiguous + * segment of the new location. This function is used in multiple ways: + * + * - i/os to this vdev use the callback to determine where the + * data is now located, and issue child i/os for each segment's new + * location. + * + * - frees and claims to this vdev use the callback to free or claim + * each mapped segment. (Note that we don't actually need to claim + * log blocks on indirect vdevs, because we don't allocate to + * removing vdevs. However, zdb uses zio_claim() for its leak + * detection.) + */ + +/* + * "Big theory statement" for how we mark blocks obsolete. + * + * When a block on an indirect vdev is freed or remapped, a section of + * that vdev's mapping may no longer be referenced (aka "obsolete"). We + * keep track of how much of each mapping entry is obsolete. When + * an entry becomes completely obsolete, we can remove it, thus reducing + * the memory used by the mapping. The complete picture of obsolescence + * is given by the following data structures, described below: + * - the entry-specific obsolete count + * - the vdev-specific obsolete spacemap + * - the pool-specific obsolete bpobj + * + * == On disk data structures used == + * + * We track the obsolete space for the pool using several objects. Each + * of these objects is created on demand and freed when no longer + * needed, and is assumed to be empty if it does not exist. 
+ * SPA_FEATURE_OBSOLETE_COUNTS includes the count of these objects. + * + * - Each vic_mapping_object (associated with an indirect vdev) can + * have a vimp_counts_object. This is an array of uint32_t's + * with the same number of entries as the vic_mapping_object. When + * the mapping is condensed, entries from the vic_obsolete_sm_object + * (see below) are folded into the counts. Therefore, each + * obsolete_counts entry tells us the number of bytes in the + * corresponding mapping entry that were not referenced when the + * mapping was last condensed. + * + * - Each indirect or removing vdev can have a vic_obsolete_sm_object. + * This is a space map containing an alloc entry for every DVA that + * has been obsoleted since the last time this indirect vdev was + * condensed. We use this object in order to improve performance + * when marking a DVA as obsolete. Instead of modifying an arbitrary + * offset of the vimp_counts_object, we only need to append an entry + * to the end of this object. When a DVA becomes obsolete, it is + * added to the obsolete space map. This happens when the DVA is + * freed, remapped and not referenced by a snapshot, or the last + * snapshot referencing it is destroyed. + * + * - Each dataset can have a ds_remap_deadlist object. This is a + * deadlist object containing all blocks that were remapped in this + * dataset but referenced in a previous snapshot. Blocks can *only* + * appear on this list if they were remapped (dsl_dataset_block_remapped); + * blocks that were killed in a head dataset are put on the normal + * ds_deadlist and marked obsolete when they are freed. + * + * - The pool can have a dp_obsolete_bpobj. This is a list of blocks + * in the pool that need to be marked obsolete. When a snapshot is + * destroyed, we move some of the ds_remap_deadlist to the obsolete + * bpobj (see dsl_destroy_snapshot_handle_remaps()). We then + * asynchronously process the obsolete bpobj, moving its entries to + * the specific vdevs' obsolete space maps. + * + * == Summary of how we mark blocks as obsolete == + * + * - When freeing a block: if any DVA is on an indirect vdev, append to + * vic_obsolete_sm_object. + * - When remapping a block, add dva to ds_remap_deadlist (if prev snap + * references; otherwise append to vic_obsolete_sm_object). + * - When freeing a snapshot: move parts of ds_remap_deadlist to + * dp_obsolete_bpobj (same algorithm as ds_deadlist). + * - When syncing the spa: process dp_obsolete_bpobj, moving ranges to + * individual vdev's vic_obsolete_sm_object. + */ + +/* + * "Big theory statement" for how we condense indirect vdevs. + * + * Condensing an indirect vdev's mapping is the process of determining + * the precise counts of obsolete space for each mapping entry (by + * integrating the obsolete spacemap into the obsolete counts) and + * writing out a new mapping that contains only referenced entries. + * + * We condense a vdev when we expect the mapping to shrink (see + * vdev_indirect_should_condense()), but only perform one condense at a + * time to limit the memory usage. In addition, we use a separate + * open-context thread (spa_condense_indirect_thread) to incrementally + * create the new mapping object in a way that minimizes the impact on + * the rest of the system. + * + * == Generating a new mapping == + * + * To generate a new mapping, we follow these steps: + * + * 1. Save the old obsolete space map and create a new mapping object + * (see spa_condense_indirect_start_sync()). 
This initializes the + * spa_condensing_indirect_phys with the "previous obsolete space map", + * which is now read only. Newly obsolete DVAs will be added to a + * new (initially empty) obsolete space map, and will not be + * considered as part of this condense operation. + * + * 2. Construct in memory the precise counts of obsolete space for each + * mapping entry, by incorporating the obsolete space map into the + * counts. (See vdev_indirect_mapping_load_obsolete_{counts,spacemap}().) + * + * 3. Iterate through each mapping entry, writing to the new mapping any + * entries that are not completely obsolete (i.e. which don't have + * obsolete count == mapping length). (See + * spa_condense_indirect_generate_new_mapping().) + * + * 4. Destroy the old mapping object and switch over to the new one + * (spa_condense_indirect_complete_sync). + * + * == Restarting from failure == + * + * To restart the condense when we import/open the pool, we must start + * at the 2nd step above: reconstruct the precise counts in memory, + * based on the space map + counts. Then in the 3rd step, we start + * iterating where we left off: at vimp_max_offset of the new mapping + * object. + */ + +int zfs_condense_indirect_vdevs_enable = B_TRUE; + +/* + * Condense if at least this percent of the bytes in the mapping is + * obsolete. With the default of 25%, the amount of space mapped + * will be reduced to 1% of its original size after at most 16 + * condenses. Higher values will condense less often (causing less + * i/o); lower values will reduce the mapping size more quickly. + */ +int zfs_indirect_condense_obsolete_pct = 25; + +/* + * Condense if the obsolete space map takes up more than this amount of + * space on disk (logically). This limits the amount of disk space + * consumed by the obsolete space map; the default of 1GB is small enough + * that we typically don't mind "wasting" it. + */ +unsigned long zfs_condense_max_obsolete_bytes = 1024 * 1024 * 1024; + +/* + * Don't bother condensing if the mapping uses less than this amount of + * memory. The default of 128KB is considered a "trivial" amount of + * memory and not worth reducing. + */ +unsigned long zfs_condense_min_mapping_bytes = 128 * 1024; + +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a condense (which might otherwise + * complete too quickly). If used to reduce the performance impact of + * condensing in production, a maximum value of 1 should be sufficient. + */ +int zfs_condense_indirect_commit_entry_delay_ms = 0; + +/* + * If an indirect split block contains more than this many possible unique + * combinations when being reconstructed, consider it too computationally + * expensive to check them all. Instead, try at most 100 randomly-selected + * combinations each time the block is accessed. This allows all segment + * copies to participate fairly in the reconstruction when all combinations + * cannot be checked and prevents repeated use of one bad copy. + */ +int zfs_reconstruct_indirect_combinations_max = 4096; + +/* + * Enable to simulate damaged segments and validate reconstruction. This + * is intentionally not exposed as a module parameter. + */ +unsigned long zfs_reconstruct_indirect_damage_fraction = 0; + +/* + * The indirect_child_t represents the vdev that we will read from, when we + * need to read all copies of the data (e.g. for scrub or reconstruction). + * For plain (non-mirror) top-level vdevs (i.e. 
is_vdev is not a mirror), + * ic_vdev is the same as is_vdev. However, for mirror top-level vdevs, + * ic_vdev is a child of the mirror. + */ +typedef struct indirect_child { + abd_t *ic_data; + vdev_t *ic_vdev; + + /* + * ic_duplicate is NULL when the ic_data contents are unique, when it + * is determined to be a duplicate it references the primary child. + */ + struct indirect_child *ic_duplicate; + list_node_t ic_node; /* node on is_unique_child */ +} indirect_child_t; + +/* + * The indirect_split_t represents one mapped segment of an i/o to the + * indirect vdev. For non-split (contiguously-mapped) blocks, there will be + * only one indirect_split_t, with is_split_offset==0 and is_size==io_size. + * For split blocks, there will be several of these. + */ +typedef struct indirect_split { + list_node_t is_node; /* link on iv_splits */ + + /* + * is_split_offset is the offset into the i/o. + * This is the sum of the previous splits' is_size's. + */ + uint64_t is_split_offset; + + vdev_t *is_vdev; /* top-level vdev */ + uint64_t is_target_offset; /* offset on is_vdev */ + uint64_t is_size; + int is_children; /* number of entries in is_child[] */ + int is_unique_children; /* number of entries in is_unique_child */ + list_t is_unique_child; + + /* + * is_good_child is the child that we are currently using to + * attempt reconstruction. + */ + indirect_child_t *is_good_child; + + indirect_child_t is_child[1]; /* variable-length */ +} indirect_split_t; + +/* + * The indirect_vsd_t is associated with each i/o to the indirect vdev. + * It is the "Vdev-Specific Data" in the zio_t's io_vsd. + */ +typedef struct indirect_vsd { + boolean_t iv_split_block; + boolean_t iv_reconstruct; + uint64_t iv_unique_combinations; + uint64_t iv_attempts; + uint64_t iv_attempts_max; + + list_t iv_splits; /* list of indirect_split_t's */ +} indirect_vsd_t; + +static void +vdev_indirect_map_free(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + indirect_split_t *is; + while ((is = list_head(&iv->iv_splits)) != NULL) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic->ic_data != NULL) + abd_free(ic->ic_data); + } + list_remove(&iv->iv_splits, is); + + indirect_child_t *ic; + while ((ic = list_head(&is->is_unique_child)) != NULL) + list_remove(&is->is_unique_child, ic); + + list_destroy(&is->is_unique_child); + + kmem_free(is, + offsetof(indirect_split_t, is_child[is->is_children])); + } + kmem_free(iv, sizeof (*iv)); +} + +static const zio_vsd_ops_t vdev_indirect_vsd_ops = { + .vsd_free = vdev_indirect_map_free, + .vsd_cksum_report = zio_vsd_default_cksum_report +}; + +/* + * Mark the given offset and size as being obsolete. + */ +void +vdev_indirect_mark_obsolete(vdev_t *vd, uint64_t offset, uint64_t size) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, !=, 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(size > 0); + VERIFY(vdev_indirect_mapping_entry_for_offset( + vd->vdev_indirect_mapping, offset) != NULL); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + mutex_enter(&vd->vdev_obsolete_lock); + range_tree_add(vd->vdev_obsolete_segments, offset, size); + mutex_exit(&vd->vdev_obsolete_lock); + vdev_dirty(vd, 0, NULL, spa_syncing_txg(spa)); + } +} + +/* + * Mark the DVA vdev_id:offset:size as being obsolete in the given tx. This + * wrapper is provided because the DMU does not know about vdev_t's and + * cannot directly call vdev_indirect_mark_obsolete. 
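[Editor's sketch] Each indirect_split_t above carries is_split_offset, the running sum of the preceding segments' sizes, so a split read can be reassembled by copying each mapped segment at its offset within the original I/O. A standalone sketch of that bookkeeping; the segment sizes are illustrative:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint64_t seg_sizes[3] = { 4096, 8192, 4096 };	/* mapped segments */
	uint64_t split_offset = 0;

	for (int i = 0; i < 3; i++) {
		/* Each segment's data lands at split_offset within the i/o. */
		printf("segment %d: is_split_offset=%llu is_size=%llu\n",
		    i, (unsigned long long)split_offset,
		    (unsigned long long)seg_sizes[i]);
		split_offset += seg_sizes[i];
	}
	return (0);
}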
+ */ +void +spa_vdev_indirect_mark_obsolete(spa_t *spa, uint64_t vdev_id, uint64_t offset, + uint64_t size, dmu_tx_t *tx) +{ + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + ASSERT(dmu_tx_is_syncing(tx)); + + /* The DMU can only remap indirect vdevs. */ + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vdev_indirect_mark_obsolete(vd, offset, size); +} + +static spa_condensing_indirect_t * +spa_condensing_indirect_create(spa_t *spa) +{ + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + spa_condensing_indirect_t *sci = kmem_zalloc(sizeof (*sci), KM_SLEEP); + objset_t *mos = spa->spa_meta_objset; + + for (int i = 0; i < TXG_SIZE; i++) { + list_create(&sci->sci_new_mapping_entries[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + sci->sci_new_mapping = + vdev_indirect_mapping_open(mos, scip->scip_next_mapping_object); + + return (sci); +} + +static void +spa_condensing_indirect_destroy(spa_condensing_indirect_t *sci) +{ + for (int i = 0; i < TXG_SIZE; i++) + list_destroy(&sci->sci_new_mapping_entries[i]); + + if (sci->sci_new_mapping != NULL) + vdev_indirect_mapping_close(sci->sci_new_mapping); + + kmem_free(sci, sizeof (*sci)); +} + +boolean_t +vdev_indirect_should_condense(vdev_t *vd) +{ + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + spa_t *spa = vd->vdev_spa; + + ASSERT(dsl_pool_sync_context(spa->spa_dsl_pool)); + + if (!zfs_condense_indirect_vdevs_enable) + return (B_FALSE); + + /* + * We can only condense one indirect vdev at a time. + */ + if (spa->spa_condensing_indirect != NULL) + return (B_FALSE); + + if (spa_shutting_down(spa)) + return (B_FALSE); + + /* + * The mapping object size must not change while we are + * condensing, so we can only condense indirect vdevs + * (not vdevs that are still in the middle of being removed). + */ + if (vd->vdev_ops != &vdev_indirect_ops) + return (B_FALSE); + + /* + * If nothing new has been marked obsolete, there is no + * point in condensing. + */ + uint64_t obsolete_sm_obj __maybe_unused; + ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); + if (vd->vdev_obsolete_sm == NULL) { + ASSERT0(obsolete_sm_obj); + return (B_FALSE); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + + ASSERT3U(obsolete_sm_obj, ==, space_map_object(vd->vdev_obsolete_sm)); + + uint64_t bytes_mapped = vdev_indirect_mapping_bytes_mapped(vim); + uint64_t bytes_obsolete = space_map_allocated(vd->vdev_obsolete_sm); + uint64_t mapping_size = vdev_indirect_mapping_size(vim); + uint64_t obsolete_sm_size = space_map_length(vd->vdev_obsolete_sm); + + ASSERT3U(bytes_obsolete, <=, bytes_mapped); + + /* + * If a high percentage of the bytes that are mapped have become + * obsolete, condense (unless the mapping is already small enough). + * This has a good chance of reducing the amount of memory used + * by the mapping. + */ + if (bytes_obsolete * 100 / bytes_mapped >= + zfs_indirect_condense_obsolete_pct && + mapping_size > zfs_condense_min_mapping_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete " + "spacemap covers %d%% of %lluMB mapping", + (u_longlong_t)vd->vdev_id, + (int)(bytes_obsolete * 100 / bytes_mapped), + (u_longlong_t)bytes_mapped / 1024 / 1024); + return (B_TRUE); + } + + /* + * If the obsolete space map takes up too much space on disk, + * condense in order to free up this disk space. 
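+ * + * Worked example (hypothetical numbers): with the default tunables, a + * mapping with bytes_mapped = 1000MB and bytes_obsolete = 300MB yields + * 300 * 100 / 1000 = 30%, which meets zfs_indirect_condense_obsolete_pct + * (25%), so we condense, provided mapping_size also exceeds + * zfs_condense_min_mapping_bytes. Independently, once obsolete_sm_size + * reaches zfs_condense_max_obsolete_bytes (1GB by default) we condense + * regardless of the obsolete percentage. 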
+ */ + if (obsolete_sm_size >= zfs_condense_max_obsolete_bytes) { + zfs_dbgmsg("should condense vdev %llu because obsolete sm " + "length %lluMB >= max size %lluMB", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)obsolete_sm_size / 1024 / 1024, + (u_longlong_t)zfs_condense_max_obsolete_bytes / + 1024 / 1024); + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * This sync task completes (finishes) a condense, deleting the old + * mapping and replacing it with the new one. + */ +static void +spa_condense_indirect_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + vdev_t *vd = vdev_lookup_top(spa, scip->scip_vdev); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + objset_t *mos = spa->spa_meta_objset; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_count = vdev_indirect_mapping_num_entries(old_mapping); + uint64_t new_count = + vdev_indirect_mapping_num_entries(sci->sci_new_mapping); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + + /* + * Reset vdev_indirect_mapping to refer to the new object. + */ + rw_enter(&vd->vdev_indirect_rwlock, RW_WRITER); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = sci->sci_new_mapping; + rw_exit(&vd->vdev_indirect_rwlock); + + sci->sci_new_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = scip->scip_next_mapping_object; + scip->scip_next_mapping_object = 0; + + space_map_free_obj(mos, scip->scip_prev_obsolete_sm_object, tx); + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + scip->scip_prev_obsolete_sm_object = 0; + + scip->scip_vdev = 0; + + VERIFY0(zap_remove(mos, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, tx)); + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + + zfs_dbgmsg("finished condense of vdev %llu in txg %llu: " + "new mapping object %llu has %llu entries " + "(was %llu entries)", + vd->vdev_id, dmu_tx_get_txg(tx), vic->vic_mapping_object, + new_count, old_count); + + vdev_config_dirty(spa->spa_root_vdev); +} + +/* + * This sync task appends entries to the new mapping object. + */ +static void +spa_condense_indirect_commit_sync(void *arg, dmu_tx_t *tx) +{ + spa_condensing_indirect_t *sci = arg; + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa __maybe_unused = dmu_tx_pool(tx)->dp_spa; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(sci, ==, spa->spa_condensing_indirect); + + vdev_indirect_mapping_add_entries(sci->sci_new_mapping, + &sci->sci_new_mapping_entries[txg & TXG_MASK], tx); + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[txg & TXG_MASK])); +} + +/* + * Open-context function to add one entry to the new mapping. The new + * entry will be remembered and written from syncing context. 
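+ * + * Sketch of the handoff implemented below: an entry committed while txg T + * is open is appended to sci_new_mapping_entries[T & TXG_MASK], and the + * first commit in a given txg also registers + * spa_condense_indirect_commit_sync() as a sync task, which drains that + * list into the new mapping object when txg T syncs. 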
+ */ +static void +spa_condense_indirect_commit_entry(spa_t *spa, + vdev_indirect_mapping_entry_phys_t *vimep, uint32_t count) +{ + spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + + ASSERT3U(count, <, DVA_GET_ASIZE(&vimep->vimep_dst)); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + dmu_tx_hold_space(tx, sizeof (*vimep) + sizeof (count)); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + /* + * If we are the first entry committed this txg, kick off the sync + * task to write to the MOS on our behalf. + */ + if (list_is_empty(&sci->sci_new_mapping_entries[txgoff])) { + dsl_sync_task_nowait(dmu_tx_pool(tx), + spa_condense_indirect_commit_sync, sci, + 0, ZFS_SPACE_CHECK_NONE, tx); + } + + vdev_indirect_mapping_entry_t *vime = + kmem_alloc(sizeof (*vime), KM_SLEEP); + vime->vime_mapping = *vimep; + vime->vime_obsolete_count = count; + list_insert_tail(&sci->sci_new_mapping_entries[txgoff], vime); + + dmu_tx_commit(tx); +} + +static void +spa_condense_indirect_generate_new_mapping(vdev_t *vd, + uint32_t *obsolete_counts, uint64_t start_index, zthr_t *zthr) +{ + spa_t *spa = vd->vdev_spa; + uint64_t mapi = start_index; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + uint64_t old_num_entries = + vdev_indirect_mapping_num_entries(old_mapping); + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT3U(vd->vdev_id, ==, spa->spa_condensing_indirect_phys.scip_vdev); + + zfs_dbgmsg("starting condense of vdev %llu from index %llu", + (u_longlong_t)vd->vdev_id, + (u_longlong_t)mapi); + + while (mapi < old_num_entries) { + + if (zthr_iscancelled(zthr)) { + zfs_dbgmsg("pausing condense of vdev %llu " + "at index %llu", (u_longlong_t)vd->vdev_id, + (u_longlong_t)mapi); + break; + } + + vdev_indirect_mapping_entry_phys_t *entry = + &old_mapping->vim_entries[mapi]; + uint64_t entry_size = DVA_GET_ASIZE(&entry->vimep_dst); + ASSERT3U(obsolete_counts[mapi], <=, entry_size); + if (obsolete_counts[mapi] < entry_size) { + spa_condense_indirect_commit_entry(spa, entry, + obsolete_counts[mapi]); + + /* + * This delay may be requested for testing, debugging, + * or performance reasons. 
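+ * + * For example (illustrative): with + * zfs_condense_indirect_commit_entry_delay_ms == 10, each committed entry + * is followed by a sleep until gethrtime() + 10,000,000ns, pacing the + * condense at roughly 100 entries per second. 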
+ */ + hrtime_t now = gethrtime(); + hrtime_t sleep_until = now + MSEC2NSEC( + zfs_condense_indirect_commit_entry_delay_ms); + zfs_sleep_until(sleep_until); + } + + mapi++; + } +} + +/* ARGSUSED */ +static boolean_t +spa_condense_indirect_thread_check(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + + return (spa->spa_condensing_indirect != NULL); +} + +/* ARGSUSED */ +static void +spa_condense_indirect_thread(void *arg, zthr_t *zthr) +{ + spa_t *spa = arg; + vdev_t *vd; + + ASSERT3P(spa->spa_condensing_indirect, !=, NULL); + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + vd = vdev_lookup_top(spa, spa->spa_condensing_indirect_phys.scip_vdev); + ASSERT3P(vd, !=, NULL); + spa_config_exit(spa, SCL_VDEV, FTAG); + + spa_condensing_indirect_t *sci = spa->spa_condensing_indirect; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + uint32_t *counts; + uint64_t start_index; + vdev_indirect_mapping_t *old_mapping = vd->vdev_indirect_mapping; + space_map_t *prev_obsolete_sm = NULL; + + ASSERT3U(vd->vdev_id, ==, scip->scip_vdev); + ASSERT(scip->scip_next_mapping_object != 0); + ASSERT(scip->scip_prev_obsolete_sm_object != 0); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + for (int i = 0; i < TXG_SIZE; i++) { + /* + * The list must start out empty in order for the + * _commit_sync() sync task to be properly registered + * on the first call to _commit_entry(); so it's wise + * to double check and ensure we actually are starting + * with empty lists. + */ + ASSERT(list_is_empty(&sci->sci_new_mapping_entries[i])); + } + + VERIFY0(space_map_open(&prev_obsolete_sm, spa->spa_meta_objset, + scip->scip_prev_obsolete_sm_object, 0, vd->vdev_asize, 0)); + counts = vdev_indirect_mapping_load_obsolete_counts(old_mapping); + if (prev_obsolete_sm != NULL) { + vdev_indirect_mapping_load_obsolete_spacemap(old_mapping, + counts, prev_obsolete_sm); + } + space_map_close(prev_obsolete_sm); + + /* + * Generate new mapping. Determine what index to continue from + * based on the max offset that we've already written in the + * new mapping. + */ + uint64_t max_offset = + vdev_indirect_mapping_max_offset(sci->sci_new_mapping); + if (max_offset == 0) { + /* We haven't written anything to the new mapping yet. */ + start_index = 0; + } else { + /* + * Pick up from where we left off. _entry_for_offset() + * returns a pointer into the vim_entries array. If + * max_offset is greater than any of the mappings + * contained in the table NULL will be returned and + * that indicates we've exhausted our iteration of the + * old_mapping. + */ + + vdev_indirect_mapping_entry_phys_t *entry = + vdev_indirect_mapping_entry_for_offset_or_next(old_mapping, + max_offset); + + if (entry == NULL) { + /* + * We've already written the whole new mapping. + * This special value will cause us to skip the + * generate_new_mapping step and just do the sync + * task to complete the condense. + */ + start_index = UINT64_MAX; + } else { + start_index = entry - old_mapping->vim_entries; + ASSERT3U(start_index, <, + vdev_indirect_mapping_num_entries(old_mapping)); + } + } + + spa_condense_indirect_generate_new_mapping(vd, counts, + start_index, zthr); + + vdev_indirect_mapping_free_obsolete_counts(old_mapping, counts); + + /* + * If the zthr has received a cancellation signal while running + * in generate_new_mapping() or at any point after that, then bail + * early. We don't want to complete the condense if the spa is + * shutting down. 
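+ * + * A cancelled condense is not lost work: the entries synced so far remain + * in scip_next_mapping_object, and the next run of this thread (or the + * next pool import) recomputes start_index from + * vdev_indirect_mapping_max_offset() as above and resumes from there. 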
+ */ + if (zthr_iscancelled(zthr)) + return; + + VERIFY0(dsl_sync_task(spa_name(spa), NULL, + spa_condense_indirect_complete_sync, sci, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + +/* + * Sync task to begin the condensing process. + */ +void +spa_condense_indirect_start_sync(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + spa_condensing_indirect_phys_t *scip = + &spa->spa_condensing_indirect_phys; + + ASSERT0(scip->scip_next_mapping_object); + ASSERT0(scip->scip_prev_obsolete_sm_object); + ASSERT0(scip->scip_vdev); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + ASSERT(vdev_indirect_mapping_num_entries(vd->vdev_indirect_mapping)); + + uint64_t obsolete_sm_obj; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_obj)); + ASSERT3U(obsolete_sm_obj, !=, 0); + + scip->scip_vdev = vd->vdev_id; + scip->scip_next_mapping_object = + vdev_indirect_mapping_alloc(spa->spa_meta_objset, tx); + + scip->scip_prev_obsolete_sm_object = obsolete_sm_obj; + + /* + * We don't need to allocate a new space map object, since + * vdev_indirect_sync_obsolete will allocate one when needed. + */ + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + + VERIFY0(zap_add(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (*scip) / sizeof (uint64_t), scip, tx)); + + ASSERT3P(spa->spa_condensing_indirect, ==, NULL); + spa->spa_condensing_indirect = spa_condensing_indirect_create(spa); + + zfs_dbgmsg("starting condense of vdev %llu in txg %llu: " + "posm=%llu nm=%llu", + vd->vdev_id, dmu_tx_get_txg(tx), + (u_longlong_t)scip->scip_prev_obsolete_sm_object, + (u_longlong_t)scip->scip_next_mapping_object); + + zthr_wakeup(spa->spa_condense_zthr); +} + +/* + * Sync to the given vdev's obsolete space map any segments that are no longer + * referenced as of the given txg. + * + * If the obsolete space map doesn't exist yet, create and open it. 
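+ * + * Sketch of the first-call path below: allocate a space map object, + * record it in the vdev's top-level ZAP under + * VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, bump the SPA_FEATURE_OBSOLETE_COUNTS + * refcount, and open the space map. Every call then appends the + * vdev_obsolete_segments range tree as SM_ALLOC records and vacates it. 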
+ */ +void +vdev_indirect_sync_obsolete(vdev_t *vd, dmu_tx_t *tx) +{ + spa_t *spa = vd->vdev_spa; + vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; + + ASSERT3U(vic->vic_mapping_object, !=, 0); + ASSERT(range_tree_space(vd->vdev_obsolete_segments) > 0); + ASSERT(vd->vdev_removing || vd->vdev_ops == &vdev_indirect_ops); + ASSERT(spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)); + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object == 0) { + obsolete_sm_object = space_map_alloc(spa->spa_meta_objset, + zfs_vdev_standard_sm_blksz, tx); + + ASSERT(vd->vdev_top_zap != 0); + VERIFY0(zap_add(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, + sizeof (obsolete_sm_object), 1, &obsolete_sm_object, tx)); + ASSERT0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + ASSERT3U(obsolete_sm_object, !=, 0); + + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(space_map_open(&vd->vdev_obsolete_sm, + spa->spa_meta_objset, obsolete_sm_object, + 0, vd->vdev_asize, 0)); + } + + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(obsolete_sm_object, ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_write(vd->vdev_obsolete_sm, + vd->vdev_obsolete_segments, SM_ALLOC, SM_NO_VDEVID, tx); + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); +} + +int +spa_condense_init(spa_t *spa) +{ + int error = zap_lookup(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_CONDENSING_INDIRECT, sizeof (uint64_t), + sizeof (spa->spa_condensing_indirect_phys) / sizeof (uint64_t), + &spa->spa_condensing_indirect_phys); + if (error == 0) { + if (spa_writeable(spa)) { + spa->spa_condensing_indirect = + spa_condensing_indirect_create(spa); + } + return (0); + } else if (error == ENOENT) { + return (0); + } else { + return (error); + } +} + +void +spa_condense_fini(spa_t *spa) +{ + if (spa->spa_condensing_indirect != NULL) { + spa_condensing_indirect_destroy(spa->spa_condensing_indirect); + spa->spa_condensing_indirect = NULL; + } +} + +void +spa_start_indirect_condensing_thread(spa_t *spa) +{ + ASSERT3P(spa->spa_condense_zthr, ==, NULL); + spa->spa_condense_zthr = zthr_create("z_indirect_condense", + spa_condense_indirect_thread_check, + spa_condense_indirect_thread, spa); +} + +/* + * Gets the obsolete spacemap object from the vdev's ZAP. On success sm_obj + * will contain either the obsolete spacemap object or zero if none exists. + * All other errors are returned to the caller. + */ +int +vdev_obsolete_sm_object(vdev_t *vd, uint64_t *sm_obj) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_top_zap == 0) { + *sm_obj = 0; + return (0); + } + + int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, sizeof (uint64_t), 1, sm_obj); + if (error == ENOENT) { + *sm_obj = 0; + error = 0; + } + + return (error); +} + +/* + * Gets the "obsolete counts are precise" flag from the vdev's ZAP. On + * success are_precise will be set to reflect whether the counts are precise. + * All other errors are returned to the caller. 
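+ * + * Illustrative caller pattern (hypothetical): + * + * boolean_t precise; + * if (vdev_obsolete_counts_are_precise(vd, &precise) == 0 && precise) + * ... the per-entry obsolete counts may be trusted ... + * + * As with vdev_obsolete_sm_object(), a missing ZAP entry is not an + * error; it simply reports B_FALSE. 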
+ */ +int +vdev_obsolete_counts_are_precise(vdev_t *vd, boolean_t *are_precise) +{ + ASSERT0(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_top_zap == 0) { + *are_precise = B_FALSE; + return (0); + } + + uint64_t val = 0; + int error = zap_lookup(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (val), 1, &val); + if (error == 0) { + *are_precise = (val != 0); + } else if (error == ENOENT) { + *are_precise = B_FALSE; + error = 0; + } + + return (error); +} + +/* ARGSUSED */ +static void +vdev_indirect_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static int +vdev_indirect_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + *psize = *max_psize = vd->vdev_asize + + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; + *logical_ashift = vd->vdev_ashift; + *physical_ashift = vd->vdev_physical_ashift; + return (0); +} + +typedef struct remap_segment { + vdev_t *rs_vd; + uint64_t rs_offset; + uint64_t rs_asize; + uint64_t rs_split_offset; + list_node_t rs_node; +} remap_segment_t; + +static remap_segment_t * +rs_alloc(vdev_t *vd, uint64_t offset, uint64_t asize, uint64_t split_offset) +{ + remap_segment_t *rs = kmem_alloc(sizeof (remap_segment_t), KM_SLEEP); + rs->rs_vd = vd; + rs->rs_offset = offset; + rs->rs_asize = asize; + rs->rs_split_offset = split_offset; + return (rs); +} + +/* + * Given an indirect vdev and an extent on that vdev, it duplicates the + * physical entries of the indirect mapping that correspond to the extent + * to a new array and returns a pointer to it. In addition, copied_entries + * is populated with the number of mapping entries that were duplicated. + * + * Note that the function assumes that the caller holds vdev_indirect_rwlock. + * This ensures that the mapping won't change due to condensing as we + * copy over its contents. + * + * Finally, since we are doing an allocation, it is up to the caller to + * free the array allocated in this function. + */ +static vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_duplicate_adjacent_entries(vdev_t *vd, uint64_t offset, + uint64_t asize, uint64_t *copied_entries) +{ + vdev_indirect_mapping_entry_phys_t *duplicate_mappings = NULL; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t entries = 0; + + ASSERT(RW_READ_HELD(&vd->vdev_indirect_rwlock)); + + vdev_indirect_mapping_entry_phys_t *first_mapping = + vdev_indirect_mapping_entry_for_offset(vim, offset); + ASSERT3P(first_mapping, !=, NULL); + + vdev_indirect_mapping_entry_phys_t *m = first_mapping; + while (asize > 0) { + uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); + + ASSERT3U(offset, >=, DVA_MAPPING_GET_SRC_OFFSET(m)); + ASSERT3U(offset, <, DVA_MAPPING_GET_SRC_OFFSET(m) + size); + + uint64_t inner_offset = offset - DVA_MAPPING_GET_SRC_OFFSET(m); + uint64_t inner_size = MIN(asize, size - inner_offset); + + offset += inner_size; + asize -= inner_size; + entries++; + m++; + } + + size_t copy_length = entries * sizeof (*first_mapping); + duplicate_mappings = kmem_alloc(copy_length, KM_SLEEP); + bcopy(first_mapping, duplicate_mappings, copy_length); + *copied_entries = entries; + + return (duplicate_mappings); +} + +/* + * Goes through the relevant indirect mappings until it hits a concrete vdev + * and issues the callback. On the way to the concrete vdev, if any other + * indirect vdevs are encountered, then the callback will also be called on + * each of those indirect vdevs. 
For example, if the segment is mapped to + * segment A on indirect vdev 1, and then segment A on indirect vdev 1 is + * mapped to segment B on concrete vdev 2, then the callback will be called on + * both vdev 1 and vdev 2. + * + * While the callback passed to vdev_indirect_remap() is called on every vdev + * the function encounters, certain callbacks only care about concrete vdevs. + * These types of callbacks should return immediately and explicitly when they + * are called on an indirect vdev. + * + * Because there is a possibility that a DVA section in the indirect device + * has been split into multiple sections in our mapping, we keep track + * of the relevant contiguous segments of the new location (remap_segment_t) + * in a stack. This way we can call the callback for each of the new sections + * created by a single section of the indirect device. Note though, that in + * this scenario the callbacks in each split block won't occur in-order in + * terms of offset, so callers should not make any assumptions about that. + * + * For callbacks that don't handle split blocks and immediately return when + * they encounter them (as is the case for remap_blkptr_cb), the caller can + * assume that its callback will be applied from the first indirect vdev + * encountered to the last one and then the concrete vdev, in that order. + */ +static void +vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, + void (*func)(uint64_t, vdev_t *, uint64_t, uint64_t, void *), void *arg) +{ + list_t stack; + spa_t *spa = vd->vdev_spa; + + list_create(&stack, sizeof (remap_segment_t), + offsetof(remap_segment_t, rs_node)); + + for (remap_segment_t *rs = rs_alloc(vd, offset, asize, 0); + rs != NULL; rs = list_remove_head(&stack)) { + vdev_t *v = rs->rs_vd; + uint64_t num_entries = 0; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + ASSERT(rs->rs_asize > 0); + + /* + * Note: As this function can be called from open context + * (e.g. zio_read()), we need the following rwlock to + * prevent the mapping from being changed by condensing. + * + * So we grab the lock and we make a copy of the entries + * that are relevant to the extent that we are working on. + * Once that is done, we drop the lock and iterate over + * our copy of the mapping. Once we are done with the + * remap segment and we free it, we also free our copy + * of the indirect mapping entries that are relevant to it. + * + * This way we don't need to wait until the function is + * finished with a segment, to condense it. In addition, we + * don't need a recursive rwlock for the case that a call to + * vdev_indirect_remap() needs to call itself (through the + * codepath of its callback) for the same vdev in the middle + * of its execution. + */ + rw_enter(&v->vdev_indirect_rwlock, RW_READER); + ASSERT3P(v->vdev_indirect_mapping, !=, NULL); + + vdev_indirect_mapping_entry_phys_t *mapping = + vdev_indirect_mapping_duplicate_adjacent_entries(v, + rs->rs_offset, rs->rs_asize, &num_entries); + ASSERT3P(mapping, !=, NULL); + ASSERT3U(num_entries, >, 0); + rw_exit(&v->vdev_indirect_rwlock); + + for (uint64_t i = 0; i < num_entries; i++) { + /* + * Note: the vdev_indirect_mapping cannot change + * while we are running. It only changes while the + * removal is in progress, and then only from syncing + * context. While a removal is in progress, this + * function is only called for frees, which also only + * happen from syncing context. 
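+ * + * Example of the traversal this enables (hypothetical layout): + * if a segment of indirect vdev 1 maps to indirect vdev 3, which + * in turn maps to concrete vdev 2, the remap_segment_t for vdev 3 + * is pushed onto the stack only after vdev 1's lock has already + * been dropped, so the lock for one level is never held while + * taking the next level's lock. 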
+ */ + vdev_indirect_mapping_entry_phys_t *m = &mapping[i]; + + ASSERT3P(m, !=, NULL); + ASSERT3U(rs->rs_asize, >, 0); + + uint64_t size = DVA_GET_ASIZE(&m->vimep_dst); + uint64_t dst_offset = DVA_GET_OFFSET(&m->vimep_dst); + uint64_t dst_vdev = DVA_GET_VDEV(&m->vimep_dst); + + ASSERT3U(rs->rs_offset, >=, + DVA_MAPPING_GET_SRC_OFFSET(m)); + ASSERT3U(rs->rs_offset, <, + DVA_MAPPING_GET_SRC_OFFSET(m) + size); + ASSERT3U(dst_vdev, !=, v->vdev_id); + + uint64_t inner_offset = rs->rs_offset - + DVA_MAPPING_GET_SRC_OFFSET(m); + uint64_t inner_size = + MIN(rs->rs_asize, size - inner_offset); + + vdev_t *dst_v = vdev_lookup_top(spa, dst_vdev); + ASSERT3P(dst_v, !=, NULL); + + if (dst_v->vdev_ops == &vdev_indirect_ops) { + list_insert_head(&stack, + rs_alloc(dst_v, dst_offset + inner_offset, + inner_size, rs->rs_split_offset)); + + } + + if ((zfs_flags & ZFS_DEBUG_INDIRECT_REMAP) && + IS_P2ALIGNED(inner_size, 2 * SPA_MINBLOCKSIZE)) { + /* + * Note: This clause exists solely for + * testing purposes. We use it to ensure that + * split blocks work and that the callbacks + * using them yield the same result if issued + * in reverse order. + */ + uint64_t inner_half = inner_size / 2; + + func(rs->rs_split_offset + inner_half, dst_v, + dst_offset + inner_offset + inner_half, + inner_half, arg); + + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_half, arg); + } else { + func(rs->rs_split_offset, dst_v, + dst_offset + inner_offset, + inner_size, arg); + } + + rs->rs_offset += inner_size; + rs->rs_asize -= inner_size; + rs->rs_split_offset += inner_size; + } + VERIFY0(rs->rs_asize); + + kmem_free(mapping, num_entries * sizeof (*mapping)); + kmem_free(rs, sizeof (remap_segment_t)); + } + list_destroy(&stack); +} + +static void +vdev_indirect_child_io_done(zio_t *zio) +{ + zio_t *pio = zio->io_private; + + mutex_enter(&pio->io_lock); + pio->io_error = zio_worst_error(pio->io_error, zio->io_error); + mutex_exit(&pio->io_lock); + + abd_put(zio->io_abd); +} + +/* + * This is a callback for vdev_indirect_remap() which allocates an + * indirect_split_t for each split segment and adds it to iv_splits. + */ +static void +vdev_indirect_gather_splits(uint64_t split_offset, vdev_t *vd, uint64_t offset, + uint64_t size, void *arg) +{ + zio_t *zio = arg; + indirect_vsd_t *iv = zio->io_vsd; + + ASSERT3P(vd, !=, NULL); + + if (vd->vdev_ops == &vdev_indirect_ops) + return; + + int n = 1; + if (vd->vdev_ops == &vdev_mirror_ops) + n = vd->vdev_children; + + indirect_split_t *is = + kmem_zalloc(offsetof(indirect_split_t, is_child[n]), KM_SLEEP); + + is->is_children = n; + is->is_size = size; + is->is_split_offset = split_offset; + is->is_target_offset = offset; + is->is_vdev = vd; + list_create(&is->is_unique_child, sizeof (indirect_child_t), + offsetof(indirect_child_t, ic_node)); + + /* + * Note that we only consider multiple copies of the data for + * *mirror* vdevs. We don't for "replacing" or "spare" vdevs, even + * though they use the same ops as mirror, because there's only one + * "good" copy under the replacing/spare. + */ + if (vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < n; i++) { + is->is_child[i].ic_vdev = vd->vdev_child[i]; + list_link_init(&is->is_child[i].ic_node); + } + } else { + is->is_child[0].ic_vdev = vd; + } + + list_insert_tail(&iv->iv_splits, is); +} + +static void +vdev_indirect_read_split_done(zio_t *zio) +{ + indirect_child_t *ic = zio->io_private; + + if (zio->io_error != 0) { + /* + * Clear ic_data to indicate that we do not have data for this + * child. 
+ */ + abd_free(ic->ic_data); + ic->ic_data = NULL; + } +} + +/* + * Issue reads for all copies (mirror children) of all splits. + */ +static void +vdev_indirect_read_all(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + + if (!vdev_readable(ic->ic_vdev)) + continue; + + /* + * Note, we may read from a child whose DTL + * indicates that the data may not be present here. + * While this might result in a few i/os that will + * likely return incorrect data, it simplifies the + * code since we can treat scrub and resilver + * identically. (The incorrect data will be + * detected and ignored when we verify the + * checksum.) + */ + + ic->ic_data = abd_alloc_sametype(zio->io_abd, + is->is_size); + ic->ic_duplicate = NULL; + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, ic->ic_data, + is->is_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_read_split_done, ic)); + } + } + iv->iv_reconstruct = B_TRUE; +} + +static void +vdev_indirect_io_start(zio_t *zio) +{ + spa_t *spa __maybe_unused = zio->io_spa; + indirect_vsd_t *iv = kmem_zalloc(sizeof (*iv), KM_SLEEP); + list_create(&iv->iv_splits, + sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + + zio->io_vsd = iv; + zio->io_vsd_ops = &vdev_indirect_vsd_ops; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); + if (zio->io_type != ZIO_TYPE_READ) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + /* + * Note: this code can handle other kinds of writes, + * but we don't expect them. + */ + ASSERT((zio->io_flags & (ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)) != 0); + } + + vdev_indirect_remap(zio->io_vd, zio->io_offset, zio->io_size, + vdev_indirect_gather_splits, zio); + + indirect_split_t *first = list_head(&iv->iv_splits); + if (first->is_size == zio->io_size) { + /* + * This is not a split block; we are pointing to the entire + * data, which will checksum the same as the original data. + * Pass the BP down so that the child i/o can verify the + * checksum, and try a different location if available + * (e.g. on a mirror). + * + * While this special case could be handled the same as the + * general (split block) case, doing it this way ensures + * that the vast majority of blocks on indirect vdevs + * (which are not split) are handled identically to blocks + * on non-indirect vdevs. This allows us to be less strict + * about performance in the general (but rare) case. + */ + ASSERT0(first->is_split_offset); + ASSERT3P(list_next(&iv->iv_splits, first), ==, NULL); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + first->is_vdev, first->is_target_offset, + abd_get_offset(zio->io_abd, 0), + zio->io_size, zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } else { + iv->iv_split_block = B_TRUE; + if (zio->io_type == ZIO_TYPE_READ && + zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER)) { + /* + * Read all copies. Note that for simplicity, + * we don't bother consulting the DTL in the + * resilver case. + */ + vdev_indirect_read_all(zio); + } else { + /* + * If this is a read zio, we read one copy of each + * split segment, from the top-level vdev. Since + * we don't know the checksum of each split + * individually, the child zio can't ensure that + * we get the right data. E.g. 
if it's a mirror, + * it will just read from a random (healthy) leaf + * vdev. We have to verify the checksum in + * vdev_indirect_io_done(). + * + * For write zios, the vdev code will ensure we write + * to all children. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + zio_nowait(zio_vdev_child_io(zio, NULL, + is->is_vdev, is->is_target_offset, + abd_get_offset(zio->io_abd, + is->is_split_offset), is->is_size, + zio->io_type, zio->io_priority, 0, + vdev_indirect_child_io_done, zio)); + } + + } + } + + zio_execute(zio); +} + +/* + * Report a checksum error for a child. + */ +static void +vdev_indirect_checksum_error(zio_t *zio, + indirect_split_t *is, indirect_child_t *ic) +{ + vdev_t *vd = ic->ic_vdev; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zio_bad_cksum_t zbc = {{{ 0 }}}; + abd_t *bad_abd = ic->ic_data; + abd_t *good_abd = is->is_good_child->ic_data; + zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + is->is_target_offset, is->is_size, good_abd, bad_abd, &zbc); +} + +/* + * Issue repair i/os for any incorrect copies. We do this by comparing + * each split segment's correct data (is_good_child's ic_data) with each + * other copy of the data. If they differ, then we overwrite the bad data + * with the good copy. Note that we do this without regard for the DTL's, + * which simplifies this code and also issues the optimal number of writes + * (based on which copies actually read bad data, as opposed to which we + * think might be wrong). For the same reason, we always use + * ZIO_FLAG_SELF_HEAL, to bypass the DTL check in zio_vdev_io_start(). + */ +static void +vdev_indirect_repair(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + enum zio_flag flags = ZIO_FLAG_IO_REPAIR; + + if (!(zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) + flags |= ZIO_FLAG_SELF_HEAL; + + if (!spa_writeable(zio->io_spa)) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + if (ic == is->is_good_child) + continue; + if (ic->ic_data == NULL) + continue; + if (ic->ic_duplicate == is->is_good_child) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, + ic->ic_vdev, is->is_target_offset, + is->is_good_child->ic_data, is->is_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | ZIO_FLAG_SELF_HEAL, + NULL, NULL)); + + vdev_indirect_checksum_error(zio, is, ic); + } + } +} + +/* + * Report checksum errors on all children that we read from. + */ +static void +vdev_indirect_all_checksum_errors(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (zio->io_flags & ZIO_FLAG_SPECULATIVE) + return; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + + if (ic->ic_data == NULL) + continue; + + vdev_t *vd = ic->ic_vdev; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zfs_ereport_post_checksum(zio->io_spa, vd, NULL, zio, + is->is_target_offset, is->is_size, + NULL, NULL, NULL); + } + } +} + +/* + * Copy data from all the splits to a main zio then validate the checksum. 
+ * If the checksum is successfully validated return success. + */ +static int +vdev_indirect_splits_checksum_validate(indirect_vsd_t *iv, zio_t *zio) +{ + zio_bad_cksum_t zbc; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + + ASSERT3P(is->is_good_child->ic_data, !=, NULL); + ASSERT3P(is->is_good_child->ic_duplicate, ==, NULL); + + abd_copy_off(zio->io_abd, is->is_good_child->ic_data, + is->is_split_offset, 0, is->is_size); + } + + return (zio_checksum_error(zio, &zbc)); +} + +/* + * There are relatively few possible combinations, making it feasible to + * deterministically check them all. We do this by setting the good_child + * to the next unique split version. If we reach the end of the list then + * "carry over" to the next unique split version (like counting in base + * is_unique_children, but each digit can have a different base). + */ +static int +vdev_indirect_splits_enumerate_all(indirect_vsd_t *iv, zio_t *zio) +{ + boolean_t more = B_TRUE; + + iv->iv_attempts = 0; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) + is->is_good_child = list_head(&is->is_unique_child); + + while (more == B_TRUE) { + iv->iv_attempts++; + more = B_FALSE; + + if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) + return (0); + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_good_child = list_next(&is->is_unique_child, + is->is_good_child); + if (is->is_good_child != NULL) { + more = B_TRUE; + break; + } + + is->is_good_child = list_head(&is->is_unique_child); + } + } + + ASSERT3S(iv->iv_attempts, <=, iv->iv_unique_combinations); + + return (SET_ERROR(ECKSUM)); +} + +/* + * There are too many combinations to try all of them in a reasonable amount + * of time. So try a fixed number of random combinations from the unique + * split versions, after which we'll consider the block unrecoverable. + */ +static int +vdev_indirect_splits_enumerate_randomly(indirect_vsd_t *iv, zio_t *zio) +{ + iv->iv_attempts = 0; + + while (iv->iv_attempts < iv->iv_attempts_max) { + iv->iv_attempts++; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + indirect_child_t *ic = list_head(&is->is_unique_child); + int children = is->is_unique_children; + + for (int i = spa_get_random(children); i > 0; i--) + ic = list_next(&is->is_unique_child, ic); + + ASSERT3P(ic, !=, NULL); + is->is_good_child = ic; + } + + if (vdev_indirect_splits_checksum_validate(iv, zio) == 0) + return (0); + } + + return (SET_ERROR(ECKSUM)); +} + +/* + * This is a validation function for reconstruction. It randomly selects + * a good combination, if one can be found, and then it intentionally + * damages all other segment copies by zeroing them. This forces the + * reconstruction algorithm to locate the one remaining known good copy. + */ +static int +vdev_indirect_splits_damage(indirect_vsd_t *iv, zio_t *zio) +{ + int error; + + /* Presume all the copies are unique for initial selection. 
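+ * Unique-child lists have not been built yet at this point, so every + * readable copy is treated as unique here; enumerate_randomly() below + * only needs some valid is_good_child in each split. The lists are + * emptied again at the out: label so that + * vdev_indirect_reconstruct_io_done() can rebuild them from the + * post-damage contents. 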
*/ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_unique_children = 0; + + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic = &is->is_child[i]; + if (ic->ic_data != NULL) { + is->is_unique_children++; + list_insert_tail(&is->is_unique_child, ic); + } + } + + if (list_is_empty(&is->is_unique_child)) { + error = SET_ERROR(EIO); + goto out; + } + } + + /* + * Set each is_good_child to a randomly-selected child which + * is known to contain validated data. + */ + error = vdev_indirect_splits_enumerate_randomly(iv, zio); + if (error) + goto out; + + /* + * Damage all but the known good copy by zeroing it. This will + * result in two or fewer unique copies per indirect_split_t. + * Both may need to be checked in order to reconstruct the block. + * Set iv->iv_attempts_max such that all unique combinations will + * be enumerated, but limit the damage to at most 12 indirect splits. + */ + iv->iv_attempts_max = 1; + + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + for (int c = 0; c < is->is_children; c++) { + indirect_child_t *ic = &is->is_child[c]; + + if (ic == is->is_good_child) + continue; + if (ic->ic_data == NULL) + continue; + + abd_zero(ic->ic_data, abd_get_size(ic->ic_data)); + } + + iv->iv_attempts_max *= 2; + if (iv->iv_attempts_max >= (1ULL << 12)) { + iv->iv_attempts_max = UINT64_MAX; + break; + } + } + +out: + /* Empty the unique children lists so they can be reconstructed. */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + indirect_child_t *ic; + while ((ic = list_head(&is->is_unique_child)) != NULL) + list_remove(&is->is_unique_child, ic); + + is->is_unique_children = 0; + } + + return (error); +} + +/* + * This function is called when we have read all copies of the data and need + * to try to find a combination of copies that gives us the right checksum. + * + * If we pointed to any mirror vdevs, this effectively does the job of the + * mirror. The mirror vdev code can't do its own job because we don't know + * the checksum of each split segment individually. + * + * We have to try every unique combination of copies of split segments, until + * we find one that checksums correctly. Duplicate segment copies are first + * identified and later skipped during reconstruction. This optimization + * reduces the search space and ensures that of the remaining combinations + * at most one is correct. + * + * When the total number of combinations is small, they can all be checked. + * For example, if we have 3 segments in the split, and each points to a + * 2-way mirror with unique copies, we will have the following pieces of data: + * + * | mirror child + * split | [0] [1] + * ======|===================== + * A | data_A_0 data_A_1 + * B | data_B_0 data_B_1 + * C | data_C_0 data_C_1 + * + * We will try the following (mirror children)^(number of splits) (2^3=8) + * combinations, which is similar to bitwise-little-endian counting in + * binary. In general each "digit" corresponds to a split segment, and the + * base of each digit is is_children, which can be different for each + * digit. 
+ * + * "low bit" "high bit" + * v v + * data_A_0 data_B_0 data_C_0 + * data_A_1 data_B_0 data_C_0 + * data_A_0 data_B_1 data_C_0 + * data_A_1 data_B_1 data_C_0 + * data_A_0 data_B_0 data_C_1 + * data_A_1 data_B_0 data_C_1 + * data_A_0 data_B_1 data_C_1 + * data_A_1 data_B_1 data_C_1 + * + * Note that the split segments may be on the same or different top-level + * vdevs. In either case, we may need to try lots of combinations (see + * zfs_reconstruct_indirect_combinations_max). This ensures that if a mirror + * has small silent errors on all of its children, we can still reconstruct + * the correct data, as long as those errors are at sufficiently-separated + * offsets (specifically, separated by the largest block size - default of + * 128KB, but up to 16MB). + */ +static void +vdev_indirect_reconstruct_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + boolean_t known_good = B_FALSE; + int error; + + iv->iv_unique_combinations = 1; + iv->iv_attempts_max = UINT64_MAX; + + if (zfs_reconstruct_indirect_combinations_max > 0) + iv->iv_attempts_max = zfs_reconstruct_indirect_combinations_max; + + /* + * If nonzero, every 1/x blocks will be damaged, in order to validate + * reconstruction when there are split segments with damaged copies. + * Known_good will be TRUE when reconstruction is known to be possible. + */ + if (zfs_reconstruct_indirect_damage_fraction != 0 && + spa_get_random(zfs_reconstruct_indirect_damage_fraction) == 0) + known_good = (vdev_indirect_splits_damage(iv, zio) == 0); + + /* + * Determine the unique children for a split segment and add them + * to the is_unique_child list. By restricting reconstruction + * to these children, only unique combinations will be considered. + * This can vastly reduce the search space when there are a large + * number of indirect splits. + */ + for (indirect_split_t *is = list_head(&iv->iv_splits); + is != NULL; is = list_next(&iv->iv_splits, is)) { + is->is_unique_children = 0; + + for (int i = 0; i < is->is_children; i++) { + indirect_child_t *ic_i = &is->is_child[i]; + + if (ic_i->ic_data == NULL || + ic_i->ic_duplicate != NULL) + continue; + + for (int j = i + 1; j < is->is_children; j++) { + indirect_child_t *ic_j = &is->is_child[j]; + + if (ic_j->ic_data == NULL || + ic_j->ic_duplicate != NULL) + continue; + + if (abd_cmp(ic_i->ic_data, ic_j->ic_data) == 0) + ic_j->ic_duplicate = ic_i; + } + + is->is_unique_children++; + list_insert_tail(&is->is_unique_child, ic_i); + } + + /* Reconstruction is impossible, no valid children */ + EQUIV(list_is_empty(&is->is_unique_child), + is->is_unique_children == 0); + if (list_is_empty(&is->is_unique_child)) { + zio->io_error = EIO; + vdev_indirect_all_checksum_errors(zio); + zio_checksum_verified(zio); + return; + } + + iv->iv_unique_combinations *= is->is_unique_children; + } + + if (iv->iv_unique_combinations <= iv->iv_attempts_max) + error = vdev_indirect_splits_enumerate_all(iv, zio); + else + error = vdev_indirect_splits_enumerate_randomly(iv, zio); + + if (error != 0) { + /* All attempted combinations failed. */ + ASSERT3B(known_good, ==, B_FALSE); + zio->io_error = error; + vdev_indirect_all_checksum_errors(zio); + } else { + /* + * The checksum has been successfully validated. Issue + * repair I/Os to any copies of splits which don't match + * the validated version. 
+ */ + ASSERT0(vdev_indirect_splits_checksum_validate(iv, zio)); + vdev_indirect_repair(zio); + zio_checksum_verified(zio); + } +} + +static void +vdev_indirect_io_done(zio_t *zio) +{ + indirect_vsd_t *iv = zio->io_vsd; + + if (iv->iv_reconstruct) { + /* + * We have read all copies of the data (e.g. from mirrors), + * either because this was a scrub/resilver, or because the + * one-copy read didn't checksum correctly. + */ + vdev_indirect_reconstruct_io_done(zio); + return; + } + + if (!iv->iv_split_block) { + /* + * This was not a split block, so we passed the BP down, + * and the checksum was handled by the (one) child zio. + */ + return; + } + + zio_bad_cksum_t zbc; + int ret = zio_checksum_error(zio, &zbc); + if (ret == 0) { + zio_checksum_verified(zio); + return; + } + + /* + * The checksum didn't match. Read all copies of all splits, and + * then we will try to reconstruct. The next time + * vdev_indirect_io_done() is called, iv_reconstruct will be set. + */ + vdev_indirect_read_all(zio); + + zio_vdev_io_redone(zio); +} + +vdev_ops_t vdev_indirect_ops = { + .vdev_op_open = vdev_indirect_open, + .vdev_op_close = vdev_indirect_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_indirect_io_start, + .vdev_op_io_done = vdev_indirect_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = vdev_indirect_remap, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* leaf vdev */ +}; + +EXPORT_SYMBOL(spa_condense_fini); +EXPORT_SYMBOL(spa_start_indirect_condensing_thread); +EXPORT_SYMBOL(spa_condense_indirect_start_sync); +EXPORT_SYMBOL(spa_condense_init); +EXPORT_SYMBOL(spa_vdev_indirect_mark_obsolete); +EXPORT_SYMBOL(vdev_indirect_mark_obsolete); +EXPORT_SYMBOL(vdev_indirect_should_condense); +EXPORT_SYMBOL(vdev_indirect_sync_obsolete); +EXPORT_SYMBOL(vdev_obsolete_counts_are_precise); +EXPORT_SYMBOL(vdev_obsolete_sm_object); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_vdevs_enable, INT, ZMOD_RW, + "Whether to attempt condensing indirect vdev mappings"); + +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, min_mapping_bytes, ULONG, ZMOD_RW, + "Don't bother condensing if the mapping uses less than this amount of " + "memory"); + +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, max_obsolete_bytes, ULONG, ZMOD_RW, + "Minimum size obsolete spacemap to attempt condensing"); + +ZFS_MODULE_PARAM(zfs_condense, zfs_condense_, indirect_commit_entry_delay_ms, INT, ZMOD_RW, + "Used by tests to ensure certain actions happen in the middle of a " + "condense. A maximum value of 1 should be sufficient."); + +ZFS_MODULE_PARAM(zfs_reconstruct, zfs_reconstruct_, indirect_combinations_max, INT, ZMOD_RW, + "Maximum number of combinations when reconstructing split segments"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_indirect_births.c b/module/zfs/vdev_indirect_births.c new file mode 100644 index 000000000000..99b83c392257 --- /dev/null +++ b/module/zfs/vdev_indirect_births.c @@ -0,0 +1,226 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. 
A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include + +#ifdef ZFS_DEBUG +static boolean_t +vdev_indirect_births_verify(vdev_indirect_births_t *vib) +{ + ASSERT(vib != NULL); + + ASSERT(vib->vib_object != 0); + ASSERT(vib->vib_objset != NULL); + ASSERT(vib->vib_phys != NULL); + ASSERT(vib->vib_dbuf != NULL); + + EQUIV(vib->vib_phys->vib_count > 0, vib->vib_entries != NULL); + + return (B_TRUE); +} +#endif + +uint64_t +vdev_indirect_births_count(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_phys->vib_count); +} + +uint64_t +vdev_indirect_births_object(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib->vib_object); +} + +static uint64_t +vdev_indirect_births_size_impl(vdev_indirect_births_t *vib) +{ + return (vib->vib_phys->vib_count * sizeof (*vib->vib_entries)); +} + +void +vdev_indirect_births_close(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + + vmem_free(vib->vib_entries, births_size); + vib->vib_entries = NULL; + } + + dmu_buf_rele(vib->vib_dbuf, vib); + + vib->vib_objset = NULL; + vib->vib_object = 0; + vib->vib_dbuf = NULL; + vib->vib_phys = NULL; + + kmem_free(vib, sizeof (*vib)); +} + +uint64_t +vdev_indirect_births_alloc(objset_t *os, dmu_tx_t *tx) +{ + ASSERT(dmu_tx_is_syncing(tx)); + + return (dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, sizeof (vdev_indirect_birth_phys_t), + tx)); +} + +vdev_indirect_births_t * +vdev_indirect_births_open(objset_t *os, uint64_t births_object) +{ + vdev_indirect_births_t *vib = kmem_zalloc(sizeof (*vib), KM_SLEEP); + + vib->vib_objset = os; + vib->vib_object = births_object; + + VERIFY0(dmu_bonus_hold(os, vib->vib_object, vib, &vib->vib_dbuf)); + vib->vib_phys = vib->vib_dbuf->db_data; + + if (vib->vib_phys->vib_count > 0) { + uint64_t births_size = vdev_indirect_births_size_impl(vib); + vib->vib_entries = vmem_alloc(births_size, KM_SLEEP); + VERIFY0(dmu_read(vib->vib_objset, vib->vib_object, 0, + births_size, vib->vib_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_births_verify(vib)); + + return (vib); +} + +void +vdev_indirect_births_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + VERIFY0(dmu_object_free(os, object, tx)); +} + +void +vdev_indirect_births_add_entry(vdev_indirect_births_t *vib, + uint64_t max_offset, uint64_t txg, dmu_tx_t *tx) +{ + vdev_indirect_birth_entry_phys_t vibe; + uint64_t old_size; + uint64_t new_size; + vdev_indirect_birth_entry_phys_t *new_entries; + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(vdev_indirect_births_verify(vib)); + + dmu_buf_will_dirty(vib->vib_dbuf, tx); + + vibe.vibe_offset = max_offset; + vibe.vibe_phys_birth_txg = txg; + + old_size = vdev_indirect_births_size_impl(vib); + dmu_write(vib->vib_objset, vib->vib_object, old_size, sizeof (vibe), + &vibe, tx); + vib->vib_phys->vib_count++; + new_size = vdev_indirect_births_size_impl(vib); + + new_entries = vmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(vib->vib_entries, new_entries, old_size); + vmem_free(vib->vib_entries, old_size); + } + new_entries[vib->vib_phys->vib_count - 1] = vibe; + vib->vib_entries = 
new_entries; +} + +uint64_t +vdev_indirect_births_last_entry_txg(vdev_indirect_births_t *vib) +{ + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + vdev_indirect_birth_entry_phys_t *last = + &vib->vib_entries[vib->vib_phys->vib_count - 1]; + return (last->vibe_phys_birth_txg); +} + +/* + * Return the txg in which the given range was copied (i.e. its physical + * birth txg). The specified offset+asize must be contiguously mapped + * (i.e. not a split block). + * + * The entries are sorted by increasing phys_birth, and also by increasing + * offset. We find the specified offset by binary search. Note that we + * can not use bsearch() because looking at each entry independently is + * insufficient to find the correct entry. Each entry implicitly relies + * on the previous entry: an entry indicates that the offsets from the + * end of the previous entry to the end of this entry were written in the + * specified txg. + */ +uint64_t +vdev_indirect_births_physbirth(vdev_indirect_births_t *vib, uint64_t offset, + uint64_t asize) +{ + vdev_indirect_birth_entry_phys_t *base; + vdev_indirect_birth_entry_phys_t *last; + + ASSERT(vdev_indirect_births_verify(vib)); + ASSERT(vib->vib_phys->vib_count > 0); + + base = vib->vib_entries; + last = base + vib->vib_phys->vib_count - 1; + + ASSERT3U(offset, <, last->vibe_offset); + + while (last >= base) { + vdev_indirect_birth_entry_phys_t *p = + base + ((last - base) / 2); + if (offset >= p->vibe_offset) { + base = p + 1; + } else if (p == vib->vib_entries || + offset >= (p - 1)->vibe_offset) { + ASSERT3U(offset + asize, <=, p->vibe_offset); + return (p->vibe_phys_birth_txg); + } else { + last = p - 1; + } + } + ASSERT(!"offset not found"); + return (-1); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(vdev_indirect_births_add_entry); +EXPORT_SYMBOL(vdev_indirect_births_alloc); +EXPORT_SYMBOL(vdev_indirect_births_close); +EXPORT_SYMBOL(vdev_indirect_births_count); +EXPORT_SYMBOL(vdev_indirect_births_free); +EXPORT_SYMBOL(vdev_indirect_births_last_entry_txg); +EXPORT_SYMBOL(vdev_indirect_births_object); +EXPORT_SYMBOL(vdev_indirect_births_open); +EXPORT_SYMBOL(vdev_indirect_births_physbirth); +#endif diff --git a/module/zfs/vdev_indirect_mapping.c b/module/zfs/vdev_indirect_mapping.c new file mode 100644 index 000000000000..bb484a401b1b --- /dev/null +++ b/module/zfs/vdev_indirect_mapping.c @@ -0,0 +1,616 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2015, 2017 by Delphix. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#ifdef ZFS_DEBUG +static boolean_t +vdev_indirect_mapping_verify(vdev_indirect_mapping_t *vim) +{ + ASSERT(vim != NULL); + + ASSERT(vim->vim_object != 0); + ASSERT(vim->vim_objset != NULL); + ASSERT(vim->vim_phys != NULL); + ASSERT(vim->vim_dbuf != NULL); + + EQUIV(vim->vim_phys->vimp_num_entries > 0, + vim->vim_entries != NULL); + if (vim->vim_phys->vimp_num_entries > 0) { + vdev_indirect_mapping_entry_phys_t *last_entry __maybe_unused = + &vim->vim_entries[vim->vim_phys->vimp_num_entries - 1]; + uint64_t offset __maybe_unused = + DVA_MAPPING_GET_SRC_OFFSET(last_entry); + uint64_t size __maybe_unused = + DVA_GET_ASIZE(&last_entry->vimep_dst); + + ASSERT3U(vim->vim_phys->vimp_max_offset, >=, offset + size); + } + if (vim->vim_havecounts) { + ASSERT(vim->vim_phys->vimp_counts_object != 0); + } + + return (B_TRUE); +} +#endif + +uint64_t +vdev_indirect_mapping_num_entries(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_num_entries); +} + +uint64_t +vdev_indirect_mapping_max_offset(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_max_offset); +} + +uint64_t +vdev_indirect_mapping_object(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_object); +} + +uint64_t +vdev_indirect_mapping_bytes_mapped(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim->vim_phys->vimp_bytes_mapped); +} + +/* + * The length (in bytes) of the mapping object array in memory and + * (logically) on disk. + * + * Note that unlike most of our accessor functions, + * we don't assert that the struct is consistent; therefore it can be + * called while there may be concurrent changes, if we don't care about + * the value being immediately stale (e.g. from spa_removal_get_stats()). + */ +uint64_t +vdev_indirect_mapping_size(vdev_indirect_mapping_t *vim) +{ + return (vim->vim_phys->vimp_num_entries * sizeof (*vim->vim_entries)); +} + +/* + * Compare an offset with an indirect mapping entry; there are three + * possible scenarios: + * + * 1. The offset is "less than" the mapping entry; meaning the + * offset is less than the source offset of the mapping entry. In + * this case, there is no overlap between the offset and the + * mapping entry and -1 will be returned. + * + * 2. The offset is "greater than" the mapping entry; meaning the + * offset is greater than the mapping entry's source offset plus + * the entry's size. In this case, there is no overlap between + * the offset and the mapping entry and 1 will be returned. + * + * NOTE: If the offset is actually equal to the entry's offset + * plus size, this is considered to be "greater" than the entry, + * and this case applies (i.e. 1 will be returned). Thus, the + * entry's "range" can be considered to be inclusive at its + * start, but exclusive at its end: e.g. [src, src + size). + * + * 3. The last case to consider is if the offset actually falls + * within the mapping entry's range. If this is the case, the + * offset is considered to be "equal to" the mapping entry and + * 0 will be returned. + * + * NOTE: If the offset is equal to the entry's source offset, + * this case applies and 0 will be returned. If the offset is + * equal to the entry's source plus its size, this case does + * *not* apply (see "NOTE" above for scenario 2), and 1 will be + * returned. 
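+ * + * Worked example (hypothetical entry): for a mapping entry with source + * offset 0x1000 and asize 0x800, the covered range is [0x1000, 0x1800): + * + * key 0x0fff -> -1 (scenario 1: below the range) + * key 0x1000 -> 0 (scenario 3: inclusive start) + * key 0x17ff -> 0 (scenario 3) + * key 0x1800 -> 1 (scenario 2: exclusive end) 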
+ */ +static int +dva_mapping_overlap_compare(const void *v_key, const void *v_array_elem) +{ + const uint64_t * const key = v_key; + const vdev_indirect_mapping_entry_phys_t * const array_elem = + v_array_elem; + uint64_t src_offset = DVA_MAPPING_GET_SRC_OFFSET(array_elem); + + if (*key < src_offset) { + return (-1); + } else if (*key < src_offset + DVA_GET_ASIZE(&array_elem->vimep_dst)) { + return (0); + } else { + return (1); + } +} + +/* + * Returns the mapping entry for the given offset. + * + * It's possible that the given offset will not be in the mapping table + * (i.e. no mapping entries contain this offset), in which case, the + * return value value depends on the "next_if_missing" parameter. + * + * If the offset is not found in the table and "next_if_missing" is + * B_FALSE, then NULL will always be returned. The behavior is intended + * to allow consumers to get the entry corresponding to the offset + * parameter, iff the offset overlaps with an entry in the table. + * + * If the offset is not found in the table and "next_if_missing" is + * B_TRUE, then the entry nearest to the given offset will be returned, + * such that the entry's source offset is greater than the offset + * passed in (i.e. the "next" mapping entry in the table is returned, if + * the offset is missing from the table). If there are no entries whose + * source offset is greater than the passed in offset, NULL is returned. + */ +static vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset_impl(vdev_indirect_mapping_t *vim, + uint64_t offset, boolean_t next_if_missing) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + ASSERT(vim->vim_phys->vimp_num_entries > 0); + + vdev_indirect_mapping_entry_phys_t *entry = NULL; + + uint64_t last = vim->vim_phys->vimp_num_entries - 1; + uint64_t base = 0; + + /* + * We don't define these inside of the while loop because we use + * their value in the case that offset isn't in the mapping. + */ + uint64_t mid; + int result; + + while (last >= base) { + mid = base + ((last - base) >> 1); + + result = dva_mapping_overlap_compare(&offset, + &vim->vim_entries[mid]); + + if (result == 0) { + entry = &vim->vim_entries[mid]; + break; + } else if (result < 0) { + last = mid - 1; + } else { + base = mid + 1; + } + } + + if (entry == NULL && next_if_missing) { + ASSERT3U(base, ==, last + 1); + ASSERT(mid == base || mid == last); + ASSERT3S(result, !=, 0); + + /* + * The offset we're looking for isn't actually contained + * in the mapping table, thus we need to return the + * closest mapping entry that is greater than the + * offset. We reuse the result of the last comparison, + * comparing the mapping entry at index "mid" and the + * offset. The offset is guaranteed to lie between + * indices one less than "mid", and one greater than + * "mid"; we just need to determine if offset is greater + * than, or less than the mapping entry contained at + * index "mid". + */ + + uint64_t index; + if (result < 0) + index = mid; + else + index = mid + 1; + + ASSERT3U(index, <=, vim->vim_phys->vimp_num_entries); + + if (index == vim->vim_phys->vimp_num_entries) { + /* + * If "index" is past the end of the entries + * array, then not only is the offset not in the + * mapping table, but it's actually greater than + * all entries in the table. In this case, we + * can't return a mapping entry greater than the + * offset (since none exist), so we return NULL. 
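+ *
+ * For example (editor's note, hypothetical values): if the last
+ * entry covers [500, 700) and we search for offset 900 with
+ * next_if_missing set, the loop drives "base" past the final
+ * index; no entry with a larger source offset exists, so NULL
+ * is the only sensible result.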
+ */ + + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]), >, 0); + + return (NULL); + } else { + /* + * Just to be safe, we verify the offset falls + * in between the mapping entries at index and + * one less than index. Since we know the offset + * doesn't overlap an entry, and we're supposed + * to return the entry just greater than the + * offset, both of the following tests must be + * true. + */ + ASSERT3S(dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index]), <, 0); + IMPLY(index >= 1, dva_mapping_overlap_compare(&offset, + &vim->vim_entries[index - 1]) > 0); + + return (&vim->vim_entries[index]); + } + } else { + return (entry); + } +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_FALSE)); +} + +vdev_indirect_mapping_entry_phys_t * +vdev_indirect_mapping_entry_for_offset_or_next(vdev_indirect_mapping_t *vim, + uint64_t offset) +{ + return (vdev_indirect_mapping_entry_for_offset_impl(vim, offset, + B_TRUE)); +} + +void +vdev_indirect_mapping_close(vdev_indirect_mapping_t *vim) +{ + ASSERT(vdev_indirect_mapping_verify(vim)); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + vmem_free(vim->vim_entries, map_size); + vim->vim_entries = NULL; + } + + dmu_buf_rele(vim->vim_dbuf, vim); + + vim->vim_objset = NULL; + vim->vim_object = 0; + vim->vim_dbuf = NULL; + vim->vim_phys = NULL; + + kmem_free(vim, sizeof (*vim)); +} + +uint64_t +vdev_indirect_mapping_alloc(objset_t *os, dmu_tx_t *tx) +{ + uint64_t object; + ASSERT(dmu_tx_is_syncing(tx)); + uint64_t bonus_size = VDEV_INDIRECT_MAPPING_SIZE_V0; + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + bonus_size = sizeof (vdev_indirect_mapping_phys_t); + } + + object = dmu_object_alloc(os, + DMU_OTN_UINT64_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OTN_UINT64_METADATA, bonus_size, + tx); + + if (spa_feature_is_enabled(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + dmu_buf_t *dbuf; + vdev_indirect_mapping_phys_t *vimp; + + VERIFY0(dmu_bonus_hold(os, object, FTAG, &dbuf)); + dmu_buf_will_dirty(dbuf, tx); + vimp = dbuf->db_data; + vimp->vimp_counts_object = dmu_object_alloc(os, + DMU_OTN_UINT32_METADATA, SPA_OLD_MAXBLOCKSIZE, + DMU_OT_NONE, 0, tx); + spa_feature_incr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + dmu_buf_rele(dbuf, FTAG); + } + + return (object); +} + + +vdev_indirect_mapping_t * +vdev_indirect_mapping_open(objset_t *os, uint64_t mapping_object) +{ + vdev_indirect_mapping_t *vim = kmem_zalloc(sizeof (*vim), KM_SLEEP); + dmu_object_info_t doi; + VERIFY0(dmu_object_info(os, mapping_object, &doi)); + + vim->vim_objset = os; + vim->vim_object = mapping_object; + + VERIFY0(dmu_bonus_hold(os, vim->vim_object, vim, + &vim->vim_dbuf)); + vim->vim_phys = vim->vim_dbuf->db_data; + + vim->vim_havecounts = + (doi.doi_bonus_size > VDEV_INDIRECT_MAPPING_SIZE_V0); + + if (vim->vim_phys->vimp_num_entries > 0) { + uint64_t map_size = vdev_indirect_mapping_size(vim); + vim->vim_entries = vmem_alloc(map_size, KM_SLEEP); + VERIFY0(dmu_read(os, vim->vim_object, 0, map_size, + vim->vim_entries, DMU_READ_PREFETCH)); + } + + ASSERT(vdev_indirect_mapping_verify(vim)); + + return (vim); +} + +void +vdev_indirect_mapping_free(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + vdev_indirect_mapping_t *vim = vdev_indirect_mapping_open(os, object); + if (vim->vim_havecounts) { + 
VERIFY0(dmu_object_free(os, vim->vim_phys->vimp_counts_object, + tx)); + spa_feature_decr(os->os_spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + vdev_indirect_mapping_close(vim); + + VERIFY0(dmu_object_free(os, object, tx)); +} + +/* + * Append the list of vdev_indirect_mapping_entry_t's to the on-disk + * mapping object. Also remove the entries from the list and free them. + * This also implicitly extends the max_offset of the mapping (to the end + * of the last entry). + */ +void +vdev_indirect_mapping_add_entries(vdev_indirect_mapping_t *vim, + list_t *list, dmu_tx_t *tx) +{ + vdev_indirect_mapping_entry_phys_t *mapbuf; + uint64_t old_size; + uint32_t *countbuf = NULL; + vdev_indirect_mapping_entry_phys_t *old_entries; + uint64_t old_count; + uint64_t entries_written = 0; + + ASSERT(vdev_indirect_mapping_verify(vim)); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(dsl_pool_sync_context(dmu_tx_pool(tx))); + ASSERT(!list_is_empty(list)); + + old_size = vdev_indirect_mapping_size(vim); + old_entries = vim->vim_entries; + old_count = vim->vim_phys->vimp_num_entries; + + dmu_buf_will_dirty(vim->vim_dbuf, tx); + + mapbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); + if (vim->vim_havecounts) { + countbuf = vmem_alloc(SPA_OLD_MAXBLOCKSIZE, KM_SLEEP); + ASSERT(spa_feature_is_active(vim->vim_objset->os_spa, + SPA_FEATURE_OBSOLETE_COUNTS)); + } + while (!list_is_empty(list)) { + uint64_t i; + /* + * Write entries from the list to the + * vdev_im_object in batches of size SPA_OLD_MAXBLOCKSIZE. + */ + for (i = 0; i < SPA_OLD_MAXBLOCKSIZE / sizeof (*mapbuf); i++) { + vdev_indirect_mapping_entry_t *entry = + list_remove_head(list); + if (entry == NULL) + break; + + uint64_t size = + DVA_GET_ASIZE(&entry->vime_mapping.vimep_dst); + uint64_t src_offset = + DVA_MAPPING_GET_SRC_OFFSET(&entry->vime_mapping); + + /* + * We shouldn't be adding an entry which is fully + * obsolete. + */ + ASSERT3U(entry->vime_obsolete_count, <, size); + IMPLY(entry->vime_obsolete_count != 0, + vim->vim_havecounts); + + mapbuf[i] = entry->vime_mapping; + if (vim->vim_havecounts) + countbuf[i] = entry->vime_obsolete_count; + + vim->vim_phys->vimp_bytes_mapped += size; + ASSERT3U(src_offset, >=, + vim->vim_phys->vimp_max_offset); + vim->vim_phys->vimp_max_offset = src_offset + size; + + entries_written++; + + vmem_free(entry, sizeof (*entry)); + } + dmu_write(vim->vim_objset, vim->vim_object, + vim->vim_phys->vimp_num_entries * sizeof (*mapbuf), + i * sizeof (*mapbuf), + mapbuf, tx); + if (vim->vim_havecounts) { + dmu_write(vim->vim_objset, + vim->vim_phys->vimp_counts_object, + vim->vim_phys->vimp_num_entries * + sizeof (*countbuf), + i * sizeof (*countbuf), countbuf, tx); + } + vim->vim_phys->vimp_num_entries += i; + } + vmem_free(mapbuf, SPA_OLD_MAXBLOCKSIZE); + if (vim->vim_havecounts) + vmem_free(countbuf, SPA_OLD_MAXBLOCKSIZE); + + /* + * Update the entry array to reflect the new entries. First, copy + * over any old entries then read back the new entries we just wrote. 
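+ *
+ * Concretely (editor's sketch, hypothetical numbers): with 100
+ * entries already on disk and 3 new ones in this batch, the
+ * dmu_write() above lands at byte offset 100 * sizeof (*mapbuf)
+ * for 3 * sizeof (*mapbuf) bytes, vimp_num_entries becomes 103,
+ * and the re-read below brings the in-core array back in sync
+ * with the object.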
+ */ + uint64_t new_size = vdev_indirect_mapping_size(vim); + ASSERT3U(new_size, >, old_size); + ASSERT3U(new_size - old_size, ==, + entries_written * sizeof (vdev_indirect_mapping_entry_phys_t)); + vim->vim_entries = vmem_alloc(new_size, KM_SLEEP); + if (old_size > 0) { + bcopy(old_entries, vim->vim_entries, old_size); + vmem_free(old_entries, old_size); + } + VERIFY0(dmu_read(vim->vim_objset, vim->vim_object, old_size, + new_size - old_size, &vim->vim_entries[old_count], + DMU_READ_PREFETCH)); + + zfs_dbgmsg("txg %llu: wrote %llu entries to " + "indirect mapping obj %llu; max offset=0x%llx", + (u_longlong_t)dmu_tx_get_txg(tx), + (u_longlong_t)entries_written, + (u_longlong_t)vim->vim_object, + (u_longlong_t)vim->vim_phys->vimp_max_offset); +} + +/* + * Increment the relevant counts for the specified offset and length. + * The counts array must be obtained from + * vdev_indirect_mapping_load_obsolete_counts(). + */ +void +vdev_indirect_mapping_increment_obsolete_count(vdev_indirect_mapping_t *vim, + uint64_t offset, uint64_t length, uint32_t *counts) +{ + vdev_indirect_mapping_entry_phys_t *mapping; + uint64_t index; + + mapping = vdev_indirect_mapping_entry_for_offset(vim, offset); + + ASSERT(length > 0); + ASSERT3P(mapping, !=, NULL); + + index = mapping - vim->vim_entries; + + while (length > 0) { + ASSERT3U(index, <, vdev_indirect_mapping_num_entries(vim)); + + uint64_t size = DVA_GET_ASIZE(&mapping->vimep_dst); + uint64_t inner_offset = offset - + DVA_MAPPING_GET_SRC_OFFSET(mapping); + VERIFY3U(inner_offset, <, size); + uint64_t inner_size = MIN(length, size - inner_offset); + + VERIFY3U(counts[index] + inner_size, <=, size); + counts[index] += inner_size; + + offset += inner_size; + length -= inner_size; + mapping++; + index++; + } +} + +typedef struct load_obsolete_space_map_arg { + vdev_indirect_mapping_t *losma_vim; + uint32_t *losma_counts; +} load_obsolete_space_map_arg_t; + +static int +load_obsolete_sm_callback(space_map_entry_t *sme, void *arg) +{ + load_obsolete_space_map_arg_t *losma = arg; + ASSERT3S(sme->sme_type, ==, SM_ALLOC); + + vdev_indirect_mapping_increment_obsolete_count(losma->losma_vim, + sme->sme_offset, sme->sme_run, losma->losma_counts); + + return (0); +} + +/* + * Modify the counts (increment them) based on the spacemap. + */ +void +vdev_indirect_mapping_load_obsolete_spacemap(vdev_indirect_mapping_t *vim, + uint32_t *counts, space_map_t *obsolete_space_sm) +{ + load_obsolete_space_map_arg_t losma; + losma.losma_counts = counts; + losma.losma_vim = vim; + VERIFY0(space_map_iterate(obsolete_space_sm, + space_map_length(obsolete_space_sm), + load_obsolete_sm_callback, &losma)); +} + +/* + * Read the obsolete counts from disk, returning them in an array. 
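+ *
+ * A typical usage sketch (editor's illustration; "obsolete_sm" is
+ * assumed to be a space map of newly obsoleted ranges, and error
+ * handling is omitted):
+ *
+ *	uint32_t *counts =
+ *	    vdev_indirect_mapping_load_obsolete_counts(vim);
+ *	vdev_indirect_mapping_load_obsolete_spacemap(vim, counts,
+ *	    obsolete_sm);
+ *	...
+ *	vdev_indirect_mapping_free_obsolete_counts(vim, counts);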
+ */
+uint32_t *
+vdev_indirect_mapping_load_obsolete_counts(vdev_indirect_mapping_t *vim)
+{
+	ASSERT(vdev_indirect_mapping_verify(vim));
+
+	uint64_t counts_size =
+	    vim->vim_phys->vimp_num_entries * sizeof (uint32_t);
+	uint32_t *counts = vmem_alloc(counts_size, KM_SLEEP);
+	if (vim->vim_havecounts) {
+		VERIFY0(dmu_read(vim->vim_objset,
+		    vim->vim_phys->vimp_counts_object,
+		    0, counts_size,
+		    counts, DMU_READ_PREFETCH));
+	} else {
+		bzero(counts, counts_size);
+	}
+	return (counts);
+}
+
+extern void
+vdev_indirect_mapping_free_obsolete_counts(vdev_indirect_mapping_t *vim,
+    uint32_t *counts)
+{
+	ASSERT(vdev_indirect_mapping_verify(vim));
+
+	vmem_free(counts, vim->vim_phys->vimp_num_entries * sizeof (uint32_t));
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(vdev_indirect_mapping_add_entries);
+EXPORT_SYMBOL(vdev_indirect_mapping_alloc);
+EXPORT_SYMBOL(vdev_indirect_mapping_bytes_mapped);
+EXPORT_SYMBOL(vdev_indirect_mapping_close);
+EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset);
+EXPORT_SYMBOL(vdev_indirect_mapping_entry_for_offset_or_next);
+EXPORT_SYMBOL(vdev_indirect_mapping_free);
+EXPORT_SYMBOL(vdev_indirect_mapping_free_obsolete_counts);
+EXPORT_SYMBOL(vdev_indirect_mapping_increment_obsolete_count);
+EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_counts);
+EXPORT_SYMBOL(vdev_indirect_mapping_load_obsolete_spacemap);
+EXPORT_SYMBOL(vdev_indirect_mapping_max_offset);
+EXPORT_SYMBOL(vdev_indirect_mapping_num_entries);
+EXPORT_SYMBOL(vdev_indirect_mapping_object);
+EXPORT_SYMBOL(vdev_indirect_mapping_open);
+EXPORT_SYMBOL(vdev_indirect_mapping_size);
+#endif
diff --git a/module/zfs/vdev_initialize.c b/module/zfs/vdev_initialize.c
new file mode 100644
index 000000000000..ab711441d9ca
--- /dev/null
+++ b/module/zfs/vdev_initialize.c
@@ -0,0 +1,750 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_initialize.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+
+/*
+ * Value that is written to disk during initialization.
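+ *
+ * (Editor's note) On 64-bit kernels each aligned 8-byte word of an
+ * initialized region reads back as 0xdeadbeefdeadbeee; on _ILP32
+ * builds the 32-bit value 0xdeadbeef repeats every 4 bytes instead,
+ * exactly as vdev_initialize_block_fill() below lays it down.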
+ */ +#ifdef _ILP32 +unsigned long zfs_initialize_value = 0xdeadbeefUL; +#else +unsigned long zfs_initialize_value = 0xdeadbeefdeadbeeeULL; +#endif + +/* maximum number of I/Os outstanding per leaf vdev */ +int zfs_initialize_limit = 1; + +/* size of initializing writes; default 1MiB, see zfs_remove_max_segment */ +unsigned long zfs_initialize_chunk_size = 1024 * 1024; + +static boolean_t +vdev_initialize_should_stop(vdev_t *vd) +{ + return (vd->vdev_initialize_exit_wanted || !vdev_writeable(vd) || + vd->vdev_detached || vd->vdev_top->vdev_removing); +} + +static void +vdev_initialize_zap_update_sync(void *arg, dmu_tx_t *tx) +{ + /* + * We pass in the guid instead of the vdev_t since the vdev may + * have been freed prior to the sync task being processed. This + * happens when a vdev is detached as we call spa_config_vdev_exit(), + * stop the initializing thread, schedule the sync task, and free + * the vdev. Later when the scheduled sync task is invoked, it would + * find that the vdev has been freed. + */ + uint64_t guid = *(uint64_t *)arg; + uint64_t txg = dmu_tx_get_txg(tx); + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + uint64_t last_offset = vd->vdev_initialize_offset[txg & TXG_MASK]; + vd->vdev_initialize_offset[txg & TXG_MASK] = 0; + + VERIFY(vd->vdev_leaf_zap != 0); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + + if (last_offset > 0) { + vd->vdev_initialize_last_offset = last_offset; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (last_offset), 1, &last_offset, tx)); + } + if (vd->vdev_initialize_action_time > 0) { + uint64_t val = (uint64_t)vd->vdev_initialize_action_time; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, sizeof (val), + 1, &val, tx)); + } + + uint64_t initialize_state = vd->vdev_initialize_state; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_INITIALIZE_STATE, sizeof (initialize_state), 1, + &initialize_state, tx)); +} + +static void +vdev_initialize_change_state(vdev_t *vd, vdev_initializing_state_t new_state) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + spa_t *spa = vd->vdev_spa; + + if (new_state == vd->vdev_initialize_state) + return; + + /* + * Copy the vd's guid, this will be freed by the sync task. + */ + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* + * If we're suspending, then preserving the original start time. 
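+ *
+ * (Editor's note) Concretely, the action time below is reset only
+ * when the previous state was not SUSPENDED, so resuming keeps the
+ * timestamp from the original activation.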
+ */ + if (vd->vdev_initialize_state != VDEV_INITIALIZE_SUSPENDED) { + vd->vdev_initialize_action_time = gethrestime_sec(); + } + vd->vdev_initialize_state = new_state; + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_initialize_zap_update_sync, + guid, 2, ZFS_SPACE_CHECK_NONE, tx); + + switch (new_state) { + case VDEV_INITIALIZE_ACTIVE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s activated", vd->vdev_path); + break; + case VDEV_INITIALIZE_SUSPENDED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s suspended", vd->vdev_path); + break; + case VDEV_INITIALIZE_CANCELED: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s canceled", vd->vdev_path); + break; + case VDEV_INITIALIZE_COMPLETE: + spa_history_log_internal(spa, "initialize", tx, + "vdev=%s complete", vd->vdev_path); + break; + default: + panic("invalid state %llu", (unsigned long long)new_state); + } + + dmu_tx_commit(tx); + + if (new_state != VDEV_INITIALIZE_ACTIVE) + spa_notify_waiters(spa); +} + +static void +vdev_initialize_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + mutex_enter(&vd->vdev_initialize_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the vdev was unavailable; roll the + * last offset back. (This works because spa_sync waits on + * spa_txg_zio before it runs sync tasks.) + */ + uint64_t *off = + &vd->vdev_initialize_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else { + /* + * Since initializing is best-effort, we ignore I/O errors and + * rely on vdev_probe to determine if the errors are more + * critical. + */ + if (zio->io_error != 0) + vd->vdev_stat.vs_initialize_errors++; + + vd->vdev_initialize_bytes_done += zio->io_orig_size; + } + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + cv_broadcast(&vd->vdev_initialize_io_cv); + mutex_exit(&vd->vdev_initialize_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* Takes care of physical writing and limiting # of concurrent ZIOs. */ +static int +vdev_initialize_write(vdev_t *vd, uint64_t start, uint64_t size, abd_t *data) +{ + spa_t *spa = vd->vdev_spa; + + /* Limit inflight initializing I/Os */ + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight >= zfs_initialize_limit) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + vd->vdev_initialize_inflight++; + mutex_exit(&vd->vdev_initialize_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_initialize_lock); + + if (vd->vdev_initialize_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_initialize_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev struct will still be around since all + * consumers of vdev_free must stop the initialization first. 
+ */ + if (vdev_initialize_should_stop(vd)) { + mutex_enter(&vd->vdev_initialize_io_lock); + ASSERT3U(vd->vdev_initialize_inflight, >, 0); + vd->vdev_initialize_inflight--; + mutex_exit(&vd->vdev_initialize_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_initialize_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_initialize_lock); + + vd->vdev_initialize_offset[txg & TXG_MASK] = start + size; + zio_nowait(zio_write_phys(spa->spa_txg_zio[txg & TXG_MASK], vd, start, + size, data, ZIO_CHECKSUM_OFF, vdev_initialize_cb, NULL, + ZIO_PRIORITY_INITIALIZING, ZIO_FLAG_CANFAIL, B_FALSE)); + /* vdev_initialize_cb releases SCL_STATE_ALL */ + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Callback to fill each ABD chunk with zfs_initialize_value. len must be + * divisible by sizeof (uint64_t), and buf must be 8-byte aligned. The ABD + * allocation will guarantee these for us. + */ +/* ARGSUSED */ +static int +vdev_initialize_block_fill(void *buf, size_t len, void *unused) +{ + ASSERT0(len % sizeof (uint64_t)); +#ifdef _ILP32 + for (uint64_t i = 0; i < len; i += sizeof (uint32_t)) { + *(uint32_t *)((char *)(buf) + i) = zfs_initialize_value; + } +#else + for (uint64_t i = 0; i < len; i += sizeof (uint64_t)) { + *(uint64_t *)((char *)(buf) + i) = zfs_initialize_value; + } +#endif + return (0); +} + +static abd_t * +vdev_initialize_block_alloc(void) +{ + /* Allocate ABD for filler data */ + abd_t *data = abd_alloc_for_io(zfs_initialize_chunk_size, B_FALSE); + + ASSERT0(zfs_initialize_chunk_size % sizeof (uint64_t)); + (void) abd_iterate_func(data, 0, zfs_initialize_chunk_size, + vdev_initialize_block_fill, NULL); + + return (data); +} + +static void +vdev_initialize_block_free(abd_t *data) +{ + abd_free(data); +} + +static int +vdev_initialize_ranges(vdev_t *vd, abd_t *data) +{ + range_tree_t *rt = vd->vdev_initialize_tree; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t where; + + for (range_seg_t *rs = zfs_btree_first(bt, &where); rs != NULL; + rs = zfs_btree_next(bt, &where, &where)) { + uint64_t size = rs_get_end(rs, rt) - rs_get_start(rs, rt); + + /* Split range into legally-sized physical chunks */ + uint64_t writes_required = + ((size - 1) / zfs_initialize_chunk_size) + 1; + + for (uint64_t w = 0; w < writes_required; w++) { + int error; + + error = vdev_initialize_write(vd, + VDEV_LABEL_START_SIZE + rs_get_start(rs, rt) + + (w * zfs_initialize_chunk_size), + MIN(size - (w * zfs_initialize_chunk_size), + zfs_initialize_chunk_size), data); + if (error != 0) + return (error); + } + } + return (0); +} + +static void +vdev_initialize_calculate_progress(vdev_t *vd) +{ + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + vd->vdev_initialize_bytes_est = 0; + vd->vdev_initialize_bytes_done = 0; + + for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + mutex_enter(&msp->ms_lock); + + uint64_t ms_free = msp->ms_size - + metaslab_allocated_space(msp); + + if (vd->vdev_top->vdev_ops == &vdev_raidz_ops) + ms_free /= vd->vdev_top->vdev_children; + + /* + * Convert the metaslab range to a physical range + * on our vdev. We use this to determine if we are + * in the middle of this metaslab range. 
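+ *
+ * For instance (editor's sketch, hypothetical offsets): with
+ * vdev_initialize_last_offset == 0x5000, a metaslab whose physical
+ * range is [0x6000, 0x8000) lies entirely ahead, so its free space
+ * only grows the estimate; one at [0x1000, 0x3000) is fully behind
+ * us and counts toward both done and estimated bytes; and one
+ * straddling 0x5000 must be loaded and its free tree walked below
+ * for an exact split.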
+ */ + range_seg64_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_initialize_last_offset <= physical_rs.rs_start) { + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += ms_free; + vd->vdev_initialize_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of initializing this + * metaslab. Load it and walk the free tree for more accurate + * progress estimation. + */ + VERIFY0(metaslab_load(msp)); + + zfs_btree_index_t where; + range_tree_t *rt = msp->ms_allocatable; + for (range_seg_t *rs = + zfs_btree_first(&rt->rt_root, &where); rs; + rs = zfs_btree_next(&rt->rt_root, &where, + &where)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); + vdev_xlate(vd, &logical_rs, &physical_rs); + + uint64_t size = physical_rs.rs_end - + physical_rs.rs_start; + vd->vdev_initialize_bytes_est += size; + if (vd->vdev_initialize_last_offset > + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += size; + } else if (vd->vdev_initialize_last_offset > + physical_rs.rs_start && + vd->vdev_initialize_last_offset < + physical_rs.rs_end) { + vd->vdev_initialize_bytes_done += + vd->vdev_initialize_last_offset - + physical_rs.rs_start; + } + } + mutex_exit(&msp->ms_lock); + } +} + +static int +vdev_initialize_load(vdev_t *vd) +{ + int err = 0; + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_ACTIVE || + vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_LAST_OFFSET, + sizeof (vd->vdev_initialize_last_offset), 1, + &vd->vdev_initialize_last_offset); + if (err == ENOENT) { + vd->vdev_initialize_last_offset = 0; + err = 0; + } + } + + vdev_initialize_calculate_progress(vd); + return (err); +} + +/* + * Convert the logical range into a physical range and add it to our + * avl tree. + */ +static void +vdev_initialize_range_add(void *arg, uint64_t start, uint64_t size) +{ + vdev_t *vd = arg; + range_seg64_t logical_rs, physical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + vdev_xlate(vd, &logical_rs, &physical_rs); + + IMPLY(vd->vdev_top == vd, + logical_rs.rs_start == physical_rs.rs_start); + IMPLY(vd->vdev_top == vd, + logical_rs.rs_end == physical_rs.rs_end); + + /* Only add segments that we have not visited yet */ + if (physical_rs.rs_end <= vd->vdev_initialize_last_offset) + return; + + /* Pick up where we left off mid-range. */ + if (vd->vdev_initialize_last_offset > physical_rs.rs_start) { + zfs_dbgmsg("range write: vd %s changed (%llu, %llu) to " + "(%llu, %llu)", vd->vdev_path, + (u_longlong_t)physical_rs.rs_start, + (u_longlong_t)physical_rs.rs_end, + (u_longlong_t)vd->vdev_initialize_last_offset, + (u_longlong_t)physical_rs.rs_end); + ASSERT3U(physical_rs.rs_end, >, + vd->vdev_initialize_last_offset); + physical_rs.rs_start = vd->vdev_initialize_last_offset; + } + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + /* + * With raidz, it's possible that the logical range does not live on + * this leaf vdev. 
We only add the physical range to this vdev's if it + * has a length greater than 0. + */ + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(vd->vdev_initialize_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } +} + +static void +vdev_initialize_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + uint64_t ms_count = 0; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_initialize_last_offset = 0; + VERIFY0(vdev_initialize_load(vd)); + + abd_t *deadbeef = vdev_initialize_block_alloc(); + + vd->vdev_initialize_tree = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); + + for (uint64_t i = 0; !vd->vdev_detached && + i < vd->vdev_top->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_top->vdev_ms[i]; + boolean_t unload_when_done = B_FALSE; + + /* + * If we've expanded the top-level vdev or it's our + * first pass, calculate our progress. + */ + if (vd->vdev_top->vdev_ms_count != ms_count) { + vdev_initialize_calculate_progress(vd); + ms_count = vd->vdev_top->vdev_ms_count; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + if (!msp->ms_loaded && !msp->ms_loading) + unload_when_done = B_TRUE; + VERIFY0(metaslab_load(msp)); + + range_tree_walk(msp->ms_allocatable, vdev_initialize_range_add, + vd); + mutex_exit(&msp->ms_lock); + + error = vdev_initialize_ranges(vd, deadbeef); + metaslab_enable(msp, B_TRUE, unload_when_done); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + range_tree_vacate(vd->vdev_initialize_tree, NULL, NULL); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_initialize_io_lock); + while (vd->vdev_initialize_inflight > 0) { + cv_wait(&vd->vdev_initialize_io_cv, + &vd->vdev_initialize_io_lock); + } + mutex_exit(&vd->vdev_initialize_io_lock); + + range_tree_destroy(vd->vdev_initialize_tree); + vdev_initialize_block_free(deadbeef); + vd->vdev_initialize_tree = NULL; + + mutex_enter(&vd->vdev_initialize_lock); + if (!vd->vdev_initialize_exit_wanted && vdev_writeable(vd)) { + vdev_initialize_change_state(vd, VDEV_INITIALIZE_COMPLETE); + } + ASSERT(vd->vdev_initialize_thread != NULL || + vd->vdev_initialize_inflight == 0); + + /* + * Drop the vdev_initialize_lock while we sync out the + * txg since it's possible that a device might be trying to + * come online and must check to see if it needs to restart an + * initialization. That thread will be holding the spa_config_lock + * which would prevent the txg_wait_synced from completing. + */ + mutex_exit(&vd->vdev_initialize_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&vd->vdev_initialize_lock); + + vd->vdev_initialize_thread = NULL; + cv_broadcast(&vd->vdev_initialize_cv); + mutex_exit(&vd->vdev_initialize_lock); + + thread_exit(); +} + +/* + * Initiates a device. Caller must hold vdev_initialize_lock. + * Device must be a leaf and not already be initializing. 
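+ *
+ * A minimal caller sketch (editor's illustration):
+ *
+ *	mutex_enter(&vd->vdev_initialize_lock);
+ *	vdev_initialize(vd);
+ *	mutex_exit(&vd->vdev_initialize_lock);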
+ */ +void +vdev_initialize(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_initialize_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_initialize_change_state(vd, VDEV_INITIALIZE_ACTIVE); + vd->vdev_initialize_thread = thread_create(NULL, 0, + vdev_initialize_thread, vd, 0, &p0, TS_RUN, maxclsyspri); +} + +/* + * Wait for the initialize thread to be terminated (cancelled or stopped). + */ +static void +vdev_initialize_stop_wait_impl(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + + while (vd->vdev_initialize_thread != NULL) + cv_wait(&vd->vdev_initialize_cv, &vd->vdev_initialize_lock); + + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + vd->vdev_initialize_exit_wanted = B_FALSE; +} + +/* + * Wait for vdev initialize threads which were either to cleanly exit. + */ +void +vdev_initialize_stop_wait(spa_t *spa, list_t *vd_list) +{ + vdev_t *vd; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + while ((vd = list_remove_head(vd_list)) != NULL) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop_wait_impl(vd); + mutex_exit(&vd->vdev_initialize_lock); + } +} + +/* + * Stop initializing a device, with the resultant initializing state being + * tgt_state. For blocking behavior pass NULL for vd_list. Otherwise, when + * a list_t is provided the stopping vdev is inserted in to the list. Callers + * are then required to call vdev_initialize_stop_wait() to block for all the + * initialization threads to exit. The caller must hold vdev_initialize_lock + * and must not be writing to the spa config, as the initializing thread may + * try to enter the config as a reader before exiting. + */ +void +vdev_initialize_stop(vdev_t *vd, vdev_initializing_state_t tgt_state, + list_t *vd_list) +{ + ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); + ASSERT(MUTEX_HELD(&vd->vdev_initialize_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + + /* + * Allow cancel requests to proceed even if the initialize thread + * has stopped. + */ + if (vd->vdev_initialize_thread == NULL && + tgt_state != VDEV_INITIALIZE_CANCELED) { + return; + } + + vdev_initialize_change_state(vd, tgt_state); + vd->vdev_initialize_exit_wanted = B_TRUE; + + if (vd_list == NULL) { + vdev_initialize_stop_wait_impl(vd); + } else { + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + list_insert_tail(vd_list, vd); + } +} + +static void +vdev_initialize_stop_all_impl(vdev_t *vd, vdev_initializing_state_t tgt_state, + list_t *vd_list) +{ + if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { + mutex_enter(&vd->vdev_initialize_lock); + vdev_initialize_stop(vd, tgt_state, vd_list); + mutex_exit(&vd->vdev_initialize_lock); + return; + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_stop_all_impl(vd->vdev_child[i], tgt_state, + vd_list); + } +} + +/* + * Convenience function to stop initializing of a vdev tree and set all + * initialize thread pointers to NULL. 
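+ *
+ * (Editor's sketch of the batched stop pattern described above;
+ * assumes spa_namespace_lock is held, as the functions assert:)
+ *
+ *	list_t vd_list;
+ *	list_create(&vd_list, sizeof (vdev_t),
+ *	    offsetof(vdev_t, vdev_initialize_node));
+ *	mutex_enter(&vd->vdev_initialize_lock);
+ *	vdev_initialize_stop(vd, VDEV_INITIALIZE_SUSPENDED, &vd_list);
+ *	mutex_exit(&vd->vdev_initialize_lock);
+ *	vdev_initialize_stop_wait(spa, &vd_list);
+ *	list_destroy(&vd_list);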
+ */ +void +vdev_initialize_stop_all(vdev_t *vd, vdev_initializing_state_t tgt_state) +{ + spa_t *spa = vd->vdev_spa; + list_t vd_list; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + list_create(&vd_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_initialize_node)); + + vdev_initialize_stop_all_impl(vd, tgt_state, &vd_list); + vdev_initialize_stop_wait(spa, &vd_list); + + if (vd->vdev_spa->spa_sync_on) { + /* Make sure that our state has been synced to disk */ + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + } + + list_destroy(&vd_list); +} + +void +vdev_initialize_restart(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER)); + + if (vd->vdev_leaf_zap != 0) { + mutex_enter(&vd->vdev_initialize_lock); + uint64_t initialize_state = VDEV_INITIALIZE_NONE; + int err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_STATE, + sizeof (initialize_state), 1, &initialize_state); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_state = initialize_state; + + uint64_t timestamp = 0; + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_INITIALIZE_ACTION_TIME, + sizeof (timestamp), 1, ×tamp); + ASSERT(err == 0 || err == ENOENT); + vd->vdev_initialize_action_time = timestamp; + + if (vd->vdev_initialize_state == VDEV_INITIALIZE_SUSPENDED || + vd->vdev_offline) { + /* load progress for reporting, but don't resume */ + VERIFY0(vdev_initialize_load(vd)); + } else if (vd->vdev_initialize_state == + VDEV_INITIALIZE_ACTIVE && vdev_writeable(vd) && + !vd->vdev_top->vdev_removing && + vd->vdev_initialize_thread == NULL) { + vdev_initialize(vd); + } + + mutex_exit(&vd->vdev_initialize_lock); + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_initialize_restart(vd->vdev_child[i]); + } +} + +EXPORT_SYMBOL(vdev_initialize); +EXPORT_SYMBOL(vdev_initialize_stop); +EXPORT_SYMBOL(vdev_initialize_stop_all); +EXPORT_SYMBOL(vdev_initialize_stop_wait); +EXPORT_SYMBOL(vdev_initialize_restart); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, initialize_value, ULONG, ZMOD_RW, + "Value written during zpool initialize"); + +ZFS_MODULE_PARAM(zfs, zfs_, initialize_chunk_size, ULONG, ZMOD_RW, + "Size in bytes of writes by zpool initialize"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c new file mode 100644 index 000000000000..8c7468255565 --- /dev/null +++ b/module/zfs/vdev_label.c @@ -0,0 +1,1901 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2020 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. 
+ */ + +/* + * Virtual Device Labels + * --------------------- + * + * The vdev label serves several distinct purposes: + * + * 1. Uniquely identify this device as part of a ZFS pool and confirm its + * identity within the pool. + * + * 2. Verify that all the devices given in a configuration are present + * within the pool. + * + * 3. Determine the uberblock for the pool. + * + * 4. In case of an import operation, determine the configuration of the + * toplevel vdev of which it is a part. + * + * 5. If an import operation cannot find all the devices in the pool, + * provide enough information to the administrator to determine which + * devices are missing. + * + * It is important to note that while the kernel is responsible for writing the + * label, it only consumes the information in the first three cases. The + * latter information is only consumed in userland when determining the + * configuration to import a pool. + * + * + * Label Organization + * ------------------ + * + * Before describing the contents of the label, it's important to understand how + * the labels are written and updated with respect to the uberblock. + * + * When the pool configuration is altered, either because it was newly created + * or a device was added, we want to update all the labels such that we can deal + * with fatal failure at any point. To this end, each disk has two labels which + * are updated before and after the uberblock is synced. Assuming we have + * labels and an uberblock with the following transaction groups: + * + * L1 UB L2 + * +------+ +------+ +------+ + * | | | | | | + * | t10 | | t10 | | t10 | + * | | | | | | + * +------+ +------+ +------+ + * + * In this stable state, the labels and the uberblock were all updated within + * the same transaction group (10). Each label is mirrored and checksummed, so + * that we can detect when we fail partway through writing the label. + * + * In order to identify which labels are valid, the labels are written in the + * following manner: + * + * 1. For each vdev, update 'L1' to the new label + * 2. Update the uberblock + * 3. For each vdev, update 'L2' to the new label + * + * Given arbitrary failure, we can determine the correct label to use based on + * the transaction group. If we fail after updating L1 but before updating the + * UB, we will notice that L1's transaction group is greater than the uberblock, + * so L2 must be valid. If we fail after writing the uberblock but before + * writing L2, we will notice that L2's transaction group is less than L1, and + * therefore L1 is valid. + * + * Another added complexity is that not every label is updated when the config + * is synced. If we add a single device, we do not want to have to re-write + * every label for every device in the pool. This means that both L1 and L2 may + * be older than the pool uberblock, because the necessary information is stored + * on another vdev. + * + * + * On-disk Format + * -------------- + * + * The vdev label consists of two distinct parts, and is wrapped within the + * vdev_label_t structure. The label includes 8k of padding to permit legacy + * VTOC disk labels, but is otherwise ignored. + * + * The first half of the label is a packed nvlist which contains pool wide + * properties, per-vdev properties, and configuration information. It is + * described in more detail below. + * + * The latter half of the label consists of a redundant array of uberblocks. 
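+ *
+ * (Editor's worked example for the update ordering described under
+ * Label Organization above: suppose all labels and the uberblock sit
+ * at txg 10 and an update to txg 11 is interrupted. If only L1 was
+ * rewritten, L1 claims txg 11 while the uberblock claims 10, so L1
+ * is distrusted and L2 is used. If the crash instead lands after
+ * the uberblock write but before L2, then L2 (txg 10) is older than
+ * L1 (txg 11), and L1 is the valid copy.)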
+ * These uberblocks are updated whenever a transaction group is committed,
+ * or when the configuration is updated. When a pool is loaded, we scan each
+ * vdev for the 'best' uberblock.
+ *
+ *
+ * Configuration Information
+ * -------------------------
+ *
+ * The nvlist describing the pool and vdev contains the following elements:
+ *
+ *	version		ZFS on-disk version
+ *	name		Pool name
+ *	state		Pool state
+ *	txg		Transaction group in which this label was written
+ *	pool_guid	Unique identifier for this pool
+ *	vdev_tree	An nvlist describing vdev tree.
+ *	features_for_read
+ *			An nvlist of the features necessary for reading the MOS.
+ *
+ * Each leaf device label also contains the following:
+ *
+ *	top_guid	Unique ID for top-level vdev in which this is contained
+ *	guid		Unique ID for the leaf vdev
+ *
+ * The 'vs' configuration follows the format described in 'spa_config.c'.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dmu.h>
+#include <sys/zap.h>
+#include <sys/vdev.h>
+#include <sys/vdev_impl.h>
+#include <sys/uberblock_impl.h>
+#include <sys/metaslab.h>
+#include <sys/metaslab_impl.h>
+#include <sys/zio.h>
+#include <sys/dsl_scan.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Basic routines to read and write from a vdev label.
+ * Used throughout the rest of this file.
+ */
+uint64_t
+vdev_label_offset(uint64_t psize, int l, uint64_t offset)
+{
+	ASSERT(offset < sizeof (vdev_label_t));
+	ASSERT(P2PHASE_TYPED(psize, sizeof (vdev_label_t), uint64_t) == 0);
+
+	return (offset + l * sizeof (vdev_label_t) + (l < VDEV_LABELS / 2 ?
+	    0 : psize - VDEV_LABELS * sizeof (vdev_label_t)));
+}
+
+/*
+ * Returns back the vdev label associated with the passed in offset.
+ */
+int
+vdev_label_number(uint64_t psize, uint64_t offset)
+{
+	int l;
+
+	if (offset >= psize - VDEV_LABEL_END_SIZE) {
+		offset -= psize - VDEV_LABEL_END_SIZE;
+		offset += (VDEV_LABELS / 2) * sizeof (vdev_label_t);
+	}
+	l = offset / sizeof (vdev_label_t);
+	return (l < VDEV_LABELS ? l : -1);
+}
+
+static void
+vdev_label_read(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+	ASSERT(
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+	zio_nowait(zio_read_phys(zio, vd,
+	    vdev_label_offset(vd->vdev_psize, l, offset),
+	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
+	    ZIO_PRIORITY_SYNC_READ, flags, B_TRUE));
+}
+
+void
+vdev_label_write(zio_t *zio, vdev_t *vd, int l, abd_t *buf, uint64_t offset,
+    uint64_t size, zio_done_func_t *done, void *private, int flags)
+{
+	ASSERT(
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_READER) == SCL_STATE ||
+	    spa_config_held(zio->io_spa, SCL_STATE, RW_WRITER) == SCL_STATE);
+	ASSERT(flags & ZIO_FLAG_CONFIG_WRITER);
+
+	zio_nowait(zio_write_phys(zio, vd,
+	    vdev_label_offset(vd->vdev_psize, l, offset),
+	    size, buf, ZIO_CHECKSUM_LABEL, done, private,
+	    ZIO_PRIORITY_SYNC_WRITE, flags, B_TRUE));
+}
+
+/*
+ * Generate the nvlist representing this vdev's stats
+ */
+void
+vdev_config_generate_stats(vdev_t *vd, nvlist_t *nv)
+{
+	nvlist_t *nvx;
+	vdev_stat_t *vs;
+	vdev_stat_ex_t *vsx;
+
+	vs = kmem_alloc(sizeof (*vs), KM_SLEEP);
+	vsx = kmem_alloc(sizeof (*vsx), KM_SLEEP);
+
+	vdev_get_stats_ex(vd, vs, vsx);
+	fnvlist_add_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS,
+	    (uint64_t *)vs, sizeof (*vs) / sizeof (uint64_t));
+
+	/*
+	 * Add extended stats into a special extended stats nvlist. This keeps
+	 * all the extended stats nicely grouped together. The extended stats
+	 * nvlist is then added to the main nvlist.
+ */ + nvx = fnvlist_alloc(); + + /* ZIOs in flight to disk */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_SCRUB]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_ACTIVE_QUEUE, + vsx->vsx_active_queue[ZIO_PRIORITY_TRIM]); + + /* ZIOs pending */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_READ]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_ASYNC_WRITE]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_SCRUB]); + + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_TRIM_PEND_QUEUE, + vsx->vsx_pend_queue[ZIO_PRIORITY_TRIM]); + + /* Histograms */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO, + vsx->vsx_total_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_total_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_READ], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO, + vsx->vsx_disk_histo[ZIO_TYPE_WRITE], + ARRAY_SIZE(vsx->vsx_disk_histo[ZIO_TYPE_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO, + vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM], + ARRAY_SIZE(vsx->vsx_queue_histo[ZIO_PRIORITY_TRIM])); + + /* Request sizes */ + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SYNC_WRITE])); + + 
fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO, + vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM], + ARRAY_SIZE(vsx->vsx_ind_histo[ZIO_PRIORITY_TRIM])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_READ])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_ASYNC_WRITE])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_SCRUB])); + + fnvlist_add_uint64_array(nvx, ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO, + vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM], + ARRAY_SIZE(vsx->vsx_agg_histo[ZIO_PRIORITY_TRIM])); + + /* IO delays */ + fnvlist_add_uint64(nvx, ZPOOL_CONFIG_VDEV_SLOW_IOS, vs->vs_slow_ios); + + /* Add extended stats nvlist to main nvlist */ + fnvlist_add_nvlist(nv, ZPOOL_CONFIG_VDEV_STATS_EX, nvx); + + fnvlist_free(nvx); + kmem_free(vs, sizeof (*vs)); + kmem_free(vsx, sizeof (*vsx)); +} + +static void +root_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != spa->spa_root_vdev) + return; + + /* provide either current or previous scan information */ + pool_scan_stat_t ps; + if (spa_scan_get_stats(spa, &ps) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_SCAN_STATS, (uint64_t *)&ps, + sizeof (pool_scan_stat_t) / sizeof (uint64_t)); + } + + pool_removal_stat_t prs; + if (spa_removal_get_stats(spa, &prs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REMOVAL_STATS, (uint64_t *)&prs, + sizeof (prs) / sizeof (uint64_t)); + } + + pool_checkpoint_stat_t pcs; + if (spa_checkpoint_get_stats(spa, &pcs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_CHECKPOINT_STATS, (uint64_t *)&pcs, + sizeof (pcs) / sizeof (uint64_t)); + } +} + +static void +top_vdev_actions_getprogress(vdev_t *vd, nvlist_t *nvl) +{ + if (vd == vd->vdev_top) { + vdev_rebuild_stat_t vrs; + if (vdev_rebuild_get_stats(vd, &vrs) == 0) { + fnvlist_add_uint64_array(nvl, + ZPOOL_CONFIG_REBUILD_STATS, (uint64_t *)&vrs, + sizeof (vrs) / sizeof (uint64_t)); + } + } +} + +/* + * Generate the nvlist representing this vdev's config. 
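+ *
+ * (Editor's note) The shape of the result depends on "flags": the
+ * VDEV_CONFIG_SPARE and VDEV_CONFIG_L2CACHE variants omit the vdev
+ * id, VDEV_CONFIG_MOS adds the per-vdev ZAP object numbers, and
+ * VDEV_CONFIG_REMOVING prunes children that are not being removed,
+ * as the body below shows.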
+ */ +nvlist_t * +vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, + vdev_config_flag_t flags) +{ + nvlist_t *nv = NULL; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + nv = fnvlist_alloc(); + + fnvlist_add_string(nv, ZPOOL_CONFIG_TYPE, vd->vdev_ops->vdev_op_type); + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE))) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ID, vd->vdev_id); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_GUID, vd->vdev_guid); + + if (vd->vdev_path != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_PATH, vd->vdev_path); + + if (vd->vdev_devid != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_DEVID, vd->vdev_devid); + + if (vd->vdev_physpath != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath); + + if (vd->vdev_enc_sysfs_path != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + vd->vdev_enc_sysfs_path); + + if (vd->vdev_fru != NULL) + fnvlist_add_string(nv, ZPOOL_CONFIG_FRU, vd->vdev_fru); + + if (vd->vdev_nparity != 0) { + ASSERT(strcmp(vd->vdev_ops->vdev_op_type, + VDEV_TYPE_RAIDZ) == 0); + + /* + * Make sure someone hasn't managed to sneak a fancy new vdev + * into a crufty old storage pool. + */ + ASSERT(vd->vdev_nparity == 1 || + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); + + /* + * Note that we'll add the nparity tag even on storage pools + * that only support a single parity device -- older software + * will just ignore it. + */ + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NPARITY, vd->vdev_nparity); + } + + if (vd->vdev_wholedisk != -1ULL) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + vd->vdev_wholedisk); + + if (vd->vdev_not_present && !(flags & VDEV_CONFIG_MISSING)) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, 1); + + if (vd->vdev_isspare) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_SPARE, 1); + + if (!(flags & (VDEV_CONFIG_SPARE | VDEV_CONFIG_L2CACHE)) && + vd == vd->vdev_top) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, + vd->vdev_ms_array); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, + vd->vdev_ms_shift); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ASIZE, + vd->vdev_asize); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_LOG, vd->vdev_islog); + if (vd->vdev_removing) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVING, + vd->vdev_removing); + } + + /* zpool command expects alloc class data */ + if (getstats && vd->vdev_alloc_bias != VDEV_BIAS_NONE) { + const char *bias = NULL; + + switch (vd->vdev_alloc_bias) { + case VDEV_BIAS_LOG: + bias = VDEV_ALLOC_BIAS_LOG; + break; + case VDEV_BIAS_SPECIAL: + bias = VDEV_ALLOC_BIAS_SPECIAL; + break; + case VDEV_BIAS_DEDUP: + bias = VDEV_ALLOC_BIAS_DEDUP; + break; + default: + ASSERT3U(vd->vdev_alloc_bias, ==, + VDEV_BIAS_NONE); + } + fnvlist_add_string(nv, ZPOOL_CONFIG_ALLOCATION_BIAS, + bias); + } + } + + if (vd->vdev_dtl_sm != NULL) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, + space_map_object(vd->vdev_dtl_sm)); + } + + if (vic->vic_mapping_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_OBJECT, + vic->vic_mapping_object); + } + + if (vic->vic_births_object != 0) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_BIRTHS, + vic->vic_births_object); + } + + if (vic->vic_prev_indirect_vdev != UINT64_MAX) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + vic->vic_prev_indirect_vdev); + } + + if (vd->vdev_crtxg) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, 
vd->vdev_crtxg); + + if (vd->vdev_expansion_time) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_EXPANSION_TIME, + vd->vdev_expansion_time); + + if (flags & VDEV_CONFIG_MOS) { + if (vd->vdev_leaf_zap != 0) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_LEAF_ZAP, + vd->vdev_leaf_zap); + } + + if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + fnvlist_add_uint64(nv, ZPOOL_CONFIG_VDEV_TOP_ZAP, + vd->vdev_top_zap); + } + + if (vd->vdev_resilver_deferred) { + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(spa->spa_resilver_deferred); + fnvlist_add_boolean(nv, ZPOOL_CONFIG_RESILVER_DEFER); + } + } + + if (getstats) { + vdev_config_generate_stats(vd, nv); + + root_vdev_actions_getprogress(vd, nv); + top_vdev_actions_getprogress(vd, nv); + + /* + * Note: this can be called from open context + * (spa_get_stats()), so we need the rwlock to prevent + * the mapping from being changed by condensing. + */ + rw_enter(&vd->vdev_indirect_rwlock, RW_READER); + if (vd->vdev_indirect_mapping != NULL) { + ASSERT(vd->vdev_indirect_births != NULL); + vdev_indirect_mapping_t *vim = + vd->vdev_indirect_mapping; + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + vdev_indirect_mapping_size(vim)); + } + rw_exit(&vd->vdev_indirect_rwlock); + if (vd->vdev_mg != NULL && + vd->vdev_mg->mg_fragmentation != ZFS_FRAG_INVALID) { + /* + * Compute approximately how much memory would be used + * for the indirect mapping if this device were to + * be removed. + * + * Note: If the frag metric is invalid, then not + * enough metaslabs have been converted to have + * histograms. + */ + uint64_t seg_count = 0; + uint64_t to_alloc = vd->vdev_stat.vs_alloc; + + /* + * There are the same number of allocated segments + * as free segments, so we will have at least one + * entry per free segment. However, small free + * segments (smaller than vdev_removal_max_span) + * will be combined with adjacent allocated segments + * as a single mapping. + */ + for (int i = 0; i < RANGE_TREE_HISTOGRAM_SIZE; i++) { + if (1ULL << (i + 1) < vdev_removal_max_span) { + to_alloc += + vd->vdev_mg->mg_histogram[i] << + (i + 1); + } else { + seg_count += + vd->vdev_mg->mg_histogram[i]; + } + } + + /* + * The maximum length of a mapping is + * zfs_remove_max_segment, so we need at least one entry + * per zfs_remove_max_segment of allocated data. + */ + seg_count += to_alloc / spa_remove_max_segment(spa); + + fnvlist_add_uint64(nv, ZPOOL_CONFIG_INDIRECT_SIZE, + seg_count * + sizeof (vdev_indirect_mapping_entry_phys_t)); + } + } + + if (!vd->vdev_ops->vdev_op_leaf) { + nvlist_t **child; + int c, idx; + + ASSERT(!vd->vdev_ishole); + + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), + KM_SLEEP); + + for (c = 0, idx = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + /* + * If we're generating an nvlist of removing + * vdevs then skip over any device which is + * not being removed. 
+ */ + if ((flags & VDEV_CONFIG_REMOVING) && + !cvd->vdev_removing) + continue; + + child[idx++] = vdev_config_generate(spa, cvd, + getstats, flags); + } + + if (idx) { + fnvlist_add_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + child, idx); + } + + for (c = 0; c < idx; c++) + nvlist_free(child[c]); + + kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); + + } else { + const char *aux = NULL; + + if (vd->vdev_offline && !vd->vdev_tmpoffline) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE); + if (vd->vdev_resilver_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_RESILVER_TXG, + vd->vdev_resilver_txg); + if (vd->vdev_rebuild_txg != 0) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REBUILD_TXG, + vd->vdev_rebuild_txg); + if (vd->vdev_faulted) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_FAULTED, B_TRUE); + if (vd->vdev_degraded) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_DEGRADED, B_TRUE); + if (vd->vdev_removed) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_REMOVED, B_TRUE); + if (vd->vdev_unspare) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE); + if (vd->vdev_ishole) + fnvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, B_TRUE); + + /* Set the reason why we're FAULTED/DEGRADED. */ + switch (vd->vdev_stat.vs_aux) { + case VDEV_AUX_ERR_EXCEEDED: + aux = "err_exceeded"; + break; + + case VDEV_AUX_EXTERNAL: + aux = "external"; + break; + } + + if (aux != NULL && !vd->vdev_tmpoffline) { + fnvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, aux); + } else { + /* + * We're healthy - clear any previous AUX_STATE values. + */ + if (nvlist_exists(nv, ZPOOL_CONFIG_AUX_STATE)) + nvlist_remove_all(nv, ZPOOL_CONFIG_AUX_STATE); + } + + if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { + fnvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid); + } + } + + return (nv); +} + +/* + * Generate a view of the top-level vdevs. If we currently have holes + * in the namespace, then generate an array which contains a list of holey + * vdevs. Additionally, add the number of top-level children that currently + * exist. + */ +void +vdev_top_config_generate(spa_t *spa, nvlist_t *config) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t *array; + uint_t c, idx; + + array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); + + for (c = 0, idx = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_ishole) { + array[idx++] = c; + } + } + + if (idx) { + VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, + array, idx) == 0); + } + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + rvd->vdev_children) == 0); + + kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); +} + +/* + * Returns the configuration from the label of the given vdev. For vdevs + * which don't have a txg value stored on their label (i.e. spares/cache) + * or have not been completely initialized (txg = 0) just return + * the configuration from the first valid label we find. Otherwise, + * find the most up-to-date label that does not exceed the specified + * 'txg' value. 
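+ *
+ * As a stand-alone sketch of that selection rule (the label txg
+ * values in the example are hypothetical):
+ */
+#if 0
+#include <stdint.h>
+
+/* Return the best label txg that does not exceed 'limit', or 0 if none. */
+static uint64_t
+best_label_txg(const uint64_t *label_txg, int nlabels, uint64_t limit)
+{
+	uint64_t best = 0;
+
+	for (int l = 0; l < nlabels; l++) {
+		if (label_txg[l] != 0 && label_txg[l] <= limit &&
+		    label_txg[l] > best)
+			best = label_txg[l];
+	}
+	return (best);
+}
+
+/* e.g. label txgs {10, 12, 12, 15} with limit 13 select txg 12. */
+#endif
+
+/*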
+ */ +nvlist_t * +vdev_label_read_config(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + nvlist_t *config = NULL; + vdev_phys_t *vp; + abd_t *vp_abd; + zio_t *zio; + uint64_t best_txg = 0; + uint64_t label_txg = 0; + int error = 0; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (!vdev_readable(vd)) + return (NULL); + + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + vp = abd_to_buf(vp_abd); + +retry: + for (int l = 0; l < VDEV_LABELS; l++) { + nvlist_t *label = NULL; + + zio = zio_root(spa, NULL, NULL, flags); + + vdev_label_read(zio, vd, l, vp_abd, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL, flags); + + if (zio_wait(zio) == 0 && + nvlist_unpack(vp->vp_nvlist, sizeof (vp->vp_nvlist), + &label, 0) == 0) { + /* + * Auxiliary vdevs won't have txg values in their + * labels and newly added vdevs may not have been + * completely initialized so just return the + * configuration from the first valid label we + * encounter. + */ + error = nvlist_lookup_uint64(label, + ZPOOL_CONFIG_POOL_TXG, &label_txg); + if ((error || label_txg == 0) && !config) { + config = label; + break; + } else if (label_txg <= txg && label_txg > best_txg) { + best_txg = label_txg; + nvlist_free(config); + config = fnvlist_dup(label); + } + } + + if (label != NULL) { + nvlist_free(label); + label = NULL; + } + } + + if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + /* + * We found a valid label but it didn't pass txg restrictions. + */ + if (config == NULL && label_txg != 0) { + vdev_dbgmsg(vd, "label discarded as txg is too large " + "(%llu > %llu)", (u_longlong_t)label_txg, + (u_longlong_t)txg); + } + + abd_free(vp_abd); + + return (config); +} + +/* + * Determine if a device is in use. The 'spare_guid' parameter will be filled + * in with the device guid if this spare is active elsewhere on the system. + */ +static boolean_t +vdev_inuse(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason, + uint64_t *spare_guid, uint64_t *l2cache_guid) +{ + spa_t *spa = vd->vdev_spa; + uint64_t state, pool_guid, device_guid, txg, spare_pool; + uint64_t vdtxg = 0; + nvlist_t *label; + + if (spare_guid) + *spare_guid = 0ULL; + if (l2cache_guid) + *l2cache_guid = 0ULL; + + /* + * Read the label, if any, and perform some basic sanity checks. + */ + if ((label = vdev_label_read_config(vd, -1ULL)) == NULL) + return (B_FALSE); + + (void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + &vdtxg); + + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, + &state) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, + &device_guid) != 0) { + nvlist_free(label); + return (B_FALSE); + } + + if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE && + (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, + &pool_guid) != 0 || + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &txg) != 0)) { + nvlist_free(label); + return (B_FALSE); + } + + nvlist_free(label); + + /* + * Check to see if this device indeed belongs to the pool it claims to + * be a part of. The only way this is allowed is if the device is a hot + * spare (which we check for later on). 
+ */
+	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+	    !spa_guid_exists(pool_guid, device_guid) &&
+	    !spa_spare_exists(device_guid, NULL, NULL) &&
+	    !spa_l2cache_exists(device_guid, NULL))
+		return (B_FALSE);
+
+	/*
+	 * If the transaction group is zero, then this is an initialized (but
+	 * unused) label.  This is only an error if the create transaction
+	 * on-disk is the same as the one we're using now, in which case the
+	 * user has attempted to add the same vdev multiple times in the same
+	 * transaction.
+	 */
+	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+	    txg == 0 && vdtxg == crtxg)
+		return (B_TRUE);
+
+	/*
+	 * Check to see if this is a spare device.  We do an explicit check for
+	 * spa_has_spare() here because it may be on our pending list of spares
+	 * to add.  We also check if it is an l2cache device.
+	 */
+	if (spa_spare_exists(device_guid, &spare_pool, NULL) ||
+	    spa_has_spare(spa, device_guid)) {
+		if (spare_guid)
+			*spare_guid = device_guid;
+
+		switch (reason) {
+		case VDEV_LABEL_CREATE:
+		case VDEV_LABEL_L2CACHE:
+			return (B_TRUE);
+
+		case VDEV_LABEL_REPLACE:
+			return (!spa_has_spare(spa, device_guid) ||
+			    spare_pool != 0ULL);
+
+		case VDEV_LABEL_SPARE:
+			return (spa_has_spare(spa, device_guid));
+		default:
+			break;
+		}
+	}
+
+	/*
+	 * Check to see if this is an l2cache device.
+	 */
+	if (spa_l2cache_exists(device_guid, NULL))
+		return (B_TRUE);
+
+	/*
+	 * We can't rely on a pool's state if it's been imported
+	 * read-only.  Instead we look to see if the pool is marked
+	 * read-only in the namespace and set the state to active.
+	 */
+	if (state != POOL_STATE_SPARE && state != POOL_STATE_L2CACHE &&
+	    (spa = spa_by_guid(pool_guid, device_guid)) != NULL &&
+	    spa_mode(spa) == SPA_MODE_READ)
+		state = POOL_STATE_ACTIVE;
+
+	/*
+	 * If the device is marked ACTIVE, then this device is in use by another
+	 * pool on the system.
+	 */
+	return (state == POOL_STATE_ACTIVE);
+}
+
+/*
+ * Initialize a vdev label.  We check to make sure each leaf device is not in
+ * use, and writable.  We put down an initial label which we will later
+ * overwrite with a complete label.  Note that it's important to do this
+ * sequentially, not in parallel, so that we catch cases of multiple use of the
+ * same leaf vdev in the vdev we're creating -- e.g. mirroring a disk with
+ * itself.
+ */
+int
+vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason)
+{
+	spa_t *spa = vd->vdev_spa;
+	nvlist_t *label;
+	vdev_phys_t *vp;
+	abd_t *vp_abd;
+	abd_t *bootenv;
+	uberblock_t *ub;
+	abd_t *ub_abd;
+	zio_t *zio;
+	char *buf;
+	size_t buflen;
+	int error;
+	uint64_t spare_guid = 0, l2cache_guid = 0;
+	int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL;
+
+	ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL);
+
+	for (int c = 0; c < vd->vdev_children; c++)
+		if ((error = vdev_label_init(vd->vdev_child[c],
+		    crtxg, reason)) != 0)
+			return (error);
+
+	/* Track the creation time for this vdev */
+	vd->vdev_crtxg = crtxg;
+
+	if (!vd->vdev_ops->vdev_op_leaf || !spa_writeable(spa))
+		return (0);
+
+	/*
+	 * Dead vdevs cannot be initialized.
+	 */
+	if (vdev_is_dead(vd))
+		return (SET_ERROR(EIO));
+
+	/*
+	 * Determine if the vdev is in use.
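+	 *
+	 * A minimal sketch of the duplicate-add case vdev_inuse() catches
+	 * (hypothetical helper; e.g. "zpool create pool mirror A A" leaves
+	 * on A a label with pool txg 0 whose create txg matches ours):
+	 */
+#if 0
+	static boolean_t
+	is_duplicate_add(uint64_t label_pool_txg, uint64_t label_create_txg,
+	    uint64_t crtxg)
+	{
+		return (label_pool_txg == 0 && label_create_txg == crtxg);
+	}
+#endif
+	/*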
+ */ + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && + vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) + return (SET_ERROR(EBUSY)); + + /* + * If this is a request to add or replace a spare or l2cache device + * that is in use elsewhere on the system, then we must update the + * guid (which was initialized to a random value) to reflect the + * actual GUID (which is shared between multiple pools). + */ + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_L2CACHE && + spare_guid != 0ULL) { + uint64_t guid_delta = spare_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding a spare, then it's already + * labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_SPARE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE || + reason == VDEV_LABEL_SPLIT); + } + + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && + l2cache_guid != 0ULL) { + uint64_t guid_delta = l2cache_guid - vd->vdev_guid; + + vd->vdev_guid += guid_delta; + + for (vdev_t *pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) + pvd->vdev_guid_sum += guid_delta; + + /* + * If this is a replacement, then we want to fallthrough to the + * rest of the code. If we're adding an l2cache, then it's + * already labeled appropriately and we can just return. + */ + if (reason == VDEV_LABEL_L2CACHE) + return (0); + ASSERT(reason == VDEV_LABEL_REPLACE); + } + + /* + * Initialize its label. + */ + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); + + /* + * Generate a label describing the pool and our top-level vdev. + * We mark it as being from txg 0 to indicate that it's not + * really part of an active pool just yet. The labels will + * be written again with a meaningful txg by spa_sync(). + */ + if (reason == VDEV_LABEL_SPARE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + /* + * For inactive hot spares, we generate a special label that + * identifies as a mutually shared hot spare. We write the + * label if we are adding a hot spare, or if we are removing an + * active hot spare (in which case we want to revert the + * labels). + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_SPARE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else if (reason == VDEV_LABEL_L2CACHE || + (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { + /* + * For level 2 ARC devices, add a special label. + */ + VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, + POOL_STATE_L2CACHE) == 0); + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, + vd->vdev_guid) == 0); + } else { + uint64_t txg = 0ULL; + + if (reason == VDEV_LABEL_SPLIT) + txg = spa->spa_uberblock.ub_txg; + label = spa_config_generate(spa, vd, txg, B_FALSE); + + /* + * Add our creation time. This allows us to detect multiple + * vdev uses as described above, and automatically expires if we + * fail. 
+ */ + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_CREATE_TXG, + crtxg) == 0); + } + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + error = nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP); + if (error != 0) { + nvlist_free(label); + abd_free(vp_abd); + /* EFAULT means nvlist_pack ran out of room */ + return (SET_ERROR(error == EFAULT ? ENAMETOOLONG : EINVAL)); + } + + /* + * Initialize uberblock template. + */ + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_RING, B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_RING); + abd_copy_from_buf(ub_abd, &spa->spa_uberblock, sizeof (uberblock_t)); + ub = abd_to_buf(ub_abd); + ub->ub_txg = 0; + + /* Initialize the 2nd padding area. */ + bootenv = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(bootenv, VDEV_PAD_SIZE); + + /* + * Write everything in parallel. + */ +retry: + zio = zio_root(spa, NULL, NULL, flags); + + for (int l = 0; l < VDEV_LABELS; l++) { + + vdev_label_write(zio, vd, l, vp_abd, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), NULL, NULL, flags); + + /* + * Skip the 1st padding area. + * Zero out the 2nd padding area where it might have + * left over data from previous filesystem format. + */ + vdev_label_write(zio, vd, l, bootenv, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + + vdev_label_write(zio, vd, l, ub_abd, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); + } + + error = zio_wait(zio); + + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + nvlist_free(label); + abd_free(bootenv); + abd_free(ub_abd); + abd_free(vp_abd); + + /* + * If this vdev hasn't been previously identified as a spare, then we + * mark it as such only if a) we are labeling it as a spare, or b) it + * exists as a spare elsewhere in the system. Do the same for + * level 2 ARC devices. + */ + if (error == 0 && !vd->vdev_isspare && + (reason == VDEV_LABEL_SPARE || + spa_spare_exists(vd->vdev_guid, NULL, NULL))) + spa_spare_add(vd); + + if (error == 0 && !vd->vdev_isl2cache && + (reason == VDEV_LABEL_L2CACHE || + spa_l2cache_exists(vd->vdev_guid, NULL))) + spa_l2cache_add(vd); + + return (error); +} + +/* + * Done callback for vdev_label_read_bootenv_impl. If this is the first + * callback to finish, store our abd in the callback pointer. Otherwise, we + * just free our abd and return. + */ +static void +vdev_label_read_bootenv_done(zio_t *zio) +{ + zio_t *rio = zio->io_private; + abd_t **cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_PAD_SIZE); + + if (zio->io_error == 0) { + mutex_enter(&rio->io_lock); + if (*cbp == NULL) { + /* Will free this buffer in vdev_label_read_bootenv. */ + *cbp = zio->io_abd; + } else { + abd_free(zio->io_abd); + } + mutex_exit(&rio->io_lock); + } else { + abd_free(zio->io_abd); + } +} + +static void +vdev_label_read_bootenv_impl(zio_t *zio, vdev_t *vd, int flags) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_label_read_bootenv_impl(zio, vd->vdev_child[c], flags); + + /* + * We just use the first label that has a correct checksum; the + * bootloader should have rewritten them all to be the same on boot, + * and any changes we made since boot have been the same across all + * labels. + * + * While grub supports writing to all four labels, other bootloaders + * don't, so we only use the first two labels to store boot + * information. 
+ */ + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_PAD_SIZE, B_FALSE), + offsetof(vdev_label_t, vl_be), VDEV_PAD_SIZE, + vdev_label_read_bootenv_done, zio, flags); + } + } +} + +int +vdev_label_read_bootenv(vdev_t *rvd, nvlist_t *command) +{ + spa_t *spa = rvd->vdev_spa; + abd_t *abd = NULL; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(command); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + zio_t *zio = zio_root(spa, NULL, &abd, flags); + vdev_label_read_bootenv_impl(zio, rvd, flags); + int err = zio_wait(zio); + + if (abd != NULL) { + vdev_boot_envblock_t *vbe = abd_to_buf(abd); + if (vbe->vbe_version != VB_RAW) { + abd_free(abd); + return (SET_ERROR(ENOTSUP)); + } + vbe->vbe_bootenv[sizeof (vbe->vbe_bootenv) - 1] = '\0'; + fnvlist_add_string(command, "envmap", vbe->vbe_bootenv); + /* abd was allocated in vdev_label_read_bootenv_impl() */ + abd_free(abd); + /* If we managed to read any successfully, return success. */ + return (0); + } + return (err); +} + +int +vdev_label_write_bootenv(vdev_t *vd, char *envmap) +{ + zio_t *zio; + spa_t *spa = vd->vdev_spa; + vdev_boot_envblock_t *bootenv; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + int error = ENXIO; + + if (strlen(envmap) >= sizeof (bootenv->vbe_bootenv)) { + return (SET_ERROR(E2BIG)); + } + + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + for (int c = 0; c < vd->vdev_children; c++) { + int child_err = vdev_label_write_bootenv(vd->vdev_child[c], + envmap); + /* + * As long as any of the disks managed to write all of their + * labels successfully, return success. + */ + if (child_err == 0) + error = child_err; + } + + if (!vd->vdev_ops->vdev_op_leaf || vdev_is_dead(vd) || + !vdev_writeable(vd)) { + return (error); + } + ASSERT3U(sizeof (*bootenv), ==, VDEV_PAD_SIZE); + abd_t *abd = abd_alloc_for_io(VDEV_PAD_SIZE, B_TRUE); + abd_zero(abd, VDEV_PAD_SIZE); + bootenv = abd_borrow_buf_copy(abd, VDEV_PAD_SIZE); + + char *buf = bootenv->vbe_bootenv; + (void) strlcpy(buf, envmap, sizeof (bootenv->vbe_bootenv)); + bootenv->vbe_version = VB_RAW; + abd_return_buf_copy(abd, bootenv, VDEV_PAD_SIZE); + +retry: + zio = zio_root(spa, NULL, NULL, flags); + for (int l = 0; l < VDEV_LABELS / 2; l++) { + vdev_label_write(zio, vd, l, abd, + offsetof(vdev_label_t, vl_be), + VDEV_PAD_SIZE, NULL, NULL, flags); + } + + error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + + abd_free(abd); + return (error); +} + +/* + * ========================================================================== + * uberblock load/sync + * ========================================================================== + */ + +/* + * Consider the following situation: txg is safely synced to disk. We've + * written the first uberblock for txg + 1, and then we lose power. When we + * come back up, we fail to see the uberblock for txg + 1 because, say, + * it was on a mirrored device and the replica to which we wrote txg + 1 + * is now offline. If we then make some changes and sync txg + 1, and then + * the missing replica comes back, then for a few seconds we'll have two + * conflicting uberblocks on disk with the same txg. The solution is simple: + * among uberblocks with equal txg, choose the one with the latest timestamp. 
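+ *
+ * A stand-alone sketch of that ordering (simplified two-field
+ * uberblock; the values in the example are made up):
+ */
+#if 0
+#include <stdint.h>
+
+typedef struct { uint64_t ub_txg; uint64_t ub_timestamp; } sketch_ub_t;
+
+static int
+sketch_ub_compare(const sketch_ub_t *a, const sketch_ub_t *b)
+{
+	if (a->ub_txg != b->ub_txg)		/* higher txg wins */
+		return (a->ub_txg < b->ub_txg ? -1 : 1);
+	if (a->ub_timestamp != b->ub_timestamp)	/* then later timestamp */
+		return (a->ub_timestamp < b->ub_timestamp ? -1 : 1);
+	return (0);
+}
+
+/*
+ * With { .ub_txg = 100, .ub_timestamp = 5000 } on the replica that was
+ * offline and { .ub_txg = 100, .ub_timestamp = 5010 } on the rest of
+ * the mirror, the later write wins the tie.
+ */
+#endif
+
+/*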
+ */ +static int +vdev_uberblock_compare(const uberblock_t *ub1, const uberblock_t *ub2) +{ + int cmp = TREE_CMP(ub1->ub_txg, ub2->ub_txg); + + if (likely(cmp)) + return (cmp); + + cmp = TREE_CMP(ub1->ub_timestamp, ub2->ub_timestamp); + if (likely(cmp)) + return (cmp); + + /* + * If MMP_VALID(ub) && MMP_SEQ_VALID(ub) then the host has an MMP-aware + * ZFS, e.g. zfsonlinux >= 0.7. + * + * If one ub has MMP and the other does not, they were written by + * different hosts, which matters for MMP. So we treat no MMP/no SEQ as + * a 0 value. + * + * Since timestamp and txg are the same if we get this far, either is + * acceptable for importing the pool. + */ + unsigned int seq1 = 0; + unsigned int seq2 = 0; + + if (MMP_VALID(ub1) && MMP_SEQ_VALID(ub1)) + seq1 = MMP_SEQ(ub1); + + if (MMP_VALID(ub2) && MMP_SEQ_VALID(ub2)) + seq2 = MMP_SEQ(ub2); + + return (TREE_CMP(seq1, seq2)); +} + +struct ubl_cbdata { + uberblock_t *ubl_ubbest; /* Best uberblock */ + vdev_t *ubl_vd; /* vdev associated with the above */ +}; + +static void +vdev_uberblock_load_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + spa_t *spa = zio->io_spa; + zio_t *rio = zio->io_private; + uberblock_t *ub = abd_to_buf(zio->io_abd); + struct ubl_cbdata *cbp = rio->io_private; + + ASSERT3U(zio->io_size, ==, VDEV_UBERBLOCK_SIZE(vd)); + + if (zio->io_error == 0 && uberblock_verify(ub) == 0) { + mutex_enter(&rio->io_lock); + if (ub->ub_txg <= spa->spa_load_max_txg && + vdev_uberblock_compare(ub, cbp->ubl_ubbest) > 0) { + /* + * Keep track of the vdev in which this uberblock + * was found. We will use this information later + * to obtain the config nvlist associated with + * this uberblock. + */ + *cbp->ubl_ubbest = *ub; + cbp->ubl_vd = vd; + } + mutex_exit(&rio->io_lock); + } + + abd_free(zio->io_abd); +} + +static void +vdev_uberblock_load_impl(zio_t *zio, vdev_t *vd, int flags, + struct ubl_cbdata *cbp) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_uberblock_load_impl(zio, vd->vdev_child[c], flags, cbp); + + if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + for (int l = 0; l < VDEV_LABELS; l++) { + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + vdev_label_read(zio, vd, l, + abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), + B_TRUE), VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_load_done, zio, flags); + } + } + } +} + +/* + * Reads the 'best' uberblock from disk along with its associated + * configuration. First, we read the uberblock array of each label of each + * vdev, keeping track of the uberblock with the highest txg in each array. + * Then, we read the configuration from the same vdev as the best uberblock. + */ +void +vdev_uberblock_load(vdev_t *rvd, uberblock_t *ub, nvlist_t **config) +{ + zio_t *zio; + spa_t *spa = rvd->vdev_spa; + struct ubl_cbdata cb; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; + + ASSERT(ub); + ASSERT(config); + + bzero(ub, sizeof (uberblock_t)); + *config = NULL; + + cb.ubl_ubbest = ub; + cb.ubl_vd = NULL; + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + zio = zio_root(spa, NULL, &cb, flags); + vdev_uberblock_load_impl(zio, rvd, flags, &cb); + (void) zio_wait(zio); + + /* + * It's possible that the best uberblock was discovered on a label + * that has a configuration which was written in a future txg. + * Search all labels on this vdev to find the configuration that + * matches the txg for our uberblock. + */ + if (cb.ubl_vd != NULL) { + vdev_dbgmsg(cb.ubl_vd, "best uberblock found for spa %s. 
" + "txg %llu", spa->spa_name, (u_longlong_t)ub->ub_txg); + + *config = vdev_label_read_config(cb.ubl_vd, ub->ub_txg); + if (*config == NULL && spa->spa_extreme_rewind) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config. " + "Trying again without txg restrictions."); + *config = vdev_label_read_config(cb.ubl_vd, UINT64_MAX); + } + if (*config == NULL) { + vdev_dbgmsg(cb.ubl_vd, "failed to read label config"); + } + } + spa_config_exit(spa, SCL_ALL, FTAG); +} + +/* + * For use when a leaf vdev is expanded. + * The location of labels 2 and 3 changed, and at the new location the + * uberblock rings are either empty or contain garbage. The sync will write + * new configs there because the vdev is dirty, but expansion also needs the + * uberblock rings copied. Read them from label 0 which did not move. + * + * Since the point is to populate labels {2,3} with valid uberblocks, + * we zero uberblocks we fail to read or which are not valid. + */ + +static void +vdev_copy_uberblocks(vdev_t *vd) +{ + abd_t *ub_abd; + zio_t *write_zio; + int locks = (SCL_L2ARC | SCL_ZIO); + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; + + ASSERT(spa_config_held(vd->vdev_spa, SCL_STATE, RW_READER) == + SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + spa_config_enter(vd->vdev_spa, locks, FTAG, RW_READER); + + ub_abd = abd_alloc_linear(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + + write_zio = zio_root(vd->vdev_spa, NULL, NULL, flags); + for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { + const int src_label = 0; + zio_t *zio; + + zio = zio_root(vd->vdev_spa, NULL, NULL, flags); + vdev_label_read(zio, vd, src_label, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + NULL, NULL, flags); + + if (zio_wait(zio) || uberblock_verify(abd_to_buf(ub_abd))) + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + + for (int l = 2; l < VDEV_LABELS; l++) + vdev_label_write(write_zio, vd, l, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, n), + VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + (void) zio_wait(write_zio); + + spa_config_exit(vd->vdev_spa, locks, FTAG); + + abd_free(ub_abd); +} + +/* + * On success, increment root zio's count of good writes. + * We only get credit for writes to known-visible vdevs; see spa_vdev_add(). + */ +static void +vdev_uberblock_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0 && zio->io_vd->vdev_top->vdev_ms_array != 0) + atomic_inc_64(good_writes); +} + +/* + * Write the uberblock to all labels of all leaves of the specified vdev. + */ +static void +vdev_uberblock_sync(zio_t *zio, uint64_t *good_writes, + uberblock_t *ub, vdev_t *vd, int flags) +{ + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_uberblock_sync(zio, good_writes, + ub, vd->vdev_child[c], flags); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + /* If the vdev was expanded, need to copy uberblock rings. */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && + vd->vdev_copy_uberblocks == B_TRUE) { + vdev_copy_uberblocks(vd); + vd->vdev_copy_uberblocks = B_FALSE; + } + + int m = spa_multihost(vd->vdev_spa) ? 
MMP_BLOCKS_PER_LABEL : 0; + int n = ub->ub_txg % (VDEV_UBERBLOCK_COUNT(vd) - m); + + /* Copy the uberblock_t into the ABD */ + abd_t *ub_abd = abd_alloc_for_io(VDEV_UBERBLOCK_SIZE(vd), B_TRUE); + abd_zero(ub_abd, VDEV_UBERBLOCK_SIZE(vd)); + abd_copy_from_buf(ub_abd, ub, sizeof (uberblock_t)); + + for (int l = 0; l < VDEV_LABELS; l++) + vdev_label_write(zio, vd, l, ub_abd, + VDEV_UBERBLOCK_OFFSET(vd, n), VDEV_UBERBLOCK_SIZE(vd), + vdev_uberblock_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + + abd_free(ub_abd); +} + +/* Sync the uberblocks to all vdevs in svd[] */ +static int +vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) +{ + spa_t *spa = svd[0]->vdev_spa; + zio_t *zio; + uint64_t good_writes = 0; + + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) + vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); + + (void) zio_wait(zio); + + /* + * Flush the uberblocks to disk. This ensures that the odd labels + * are no longer needed (because the new uberblocks and the even + * labels are safely on disk), so it is safe to overwrite them. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (int v = 0; v < svdcount; v++) { + if (vdev_writeable(svd[v])) { + zio_flush(zio, svd[v]); + } + } + + (void) zio_wait(zio); + + return (good_writes >= 1 ? 0 : EIO); +} + +/* + * On success, increment the count of good writes for our top-level vdev. + */ +static void +vdev_label_sync_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (zio->io_error == 0) + atomic_inc_64(good_writes); +} + +/* + * If there weren't enough good writes, indicate failure to the parent. + */ +static void +vdev_label_sync_top_done(zio_t *zio) +{ + uint64_t *good_writes = zio->io_private; + + if (*good_writes == 0) + zio->io_error = SET_ERROR(EIO); + + kmem_free(good_writes, sizeof (uint64_t)); +} + +/* + * We ignore errors for log and cache devices, simply free the private data. + */ +static void +vdev_label_sync_ignore_done(zio_t *zio) +{ + kmem_free(zio->io_private, sizeof (uint64_t)); +} + +/* + * Write all even or odd labels to all leaves of the specified vdev. + */ +static void +vdev_label_sync(zio_t *zio, uint64_t *good_writes, + vdev_t *vd, int l, uint64_t txg, int flags) +{ + nvlist_t *label; + vdev_phys_t *vp; + abd_t *vp_abd; + char *buf; + size_t buflen; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_label_sync(zio, good_writes, + vd->vdev_child[c], l, txg, flags); + } + + if (!vd->vdev_ops->vdev_op_leaf) + return; + + if (!vdev_writeable(vd)) + return; + + /* + * Generate a label describing the top-level config to which we belong. + */ + label = spa_config_generate(vd->vdev_spa, vd, txg, B_FALSE); + + vp_abd = abd_alloc_linear(sizeof (vdev_phys_t), B_TRUE); + abd_zero(vp_abd, sizeof (vdev_phys_t)); + vp = abd_to_buf(vp_abd); + + buf = vp->vp_nvlist; + buflen = sizeof (vp->vp_nvlist); + + if (!nvlist_pack(label, &buf, &buflen, NV_ENCODE_XDR, KM_SLEEP)) { + for (; l < VDEV_LABELS; l += 2) { + vdev_label_write(zio, vd, l, vp_abd, + offsetof(vdev_label_t, vl_vdev_phys), + sizeof (vdev_phys_t), + vdev_label_sync_done, good_writes, + flags | ZIO_FLAG_DONT_PROPAGATE); + } + } + + abd_free(vp_abd); + nvlist_free(label); +} + +static int +vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) +{ + list_t *dl = &spa->spa_config_dirty_list; + vdev_t *vd; + zio_t *zio; + int error; + + /* + * Write the new labels to disk. 
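+	 *
+	 * (vdev_label_sync() above steps by two, so with VDEV_LABELS == 4
+	 * the l == 0 pass rewrites labels {0, 2} and the l == 1 pass
+	 * {1, 3}; a small sketch of that walk:)
+	 */
+#if 0
+	/* Which physical labels does a pass starting at 'start' touch? */
+	static int
+	labels_in_pass(int start, int *out)	/* start: 0 = even, 1 = odd */
+	{
+		int n = 0;
+
+		for (int l = start; l < 4 /* VDEV_LABELS */; l += 2)
+			out[n++] = l;	/* 0 -> {0, 2}, 1 -> {1, 3} */
+		return (n);
+	}
+#endif
+	/*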
+ */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { + uint64_t *good_writes; + + ASSERT(!vd->vdev_ishole); + + good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + zio_t *vio = zio_null(zio, spa, NULL, + (vd->vdev_islog || vd->vdev_aux != NULL) ? + vdev_label_sync_ignore_done : vdev_label_sync_top_done, + good_writes, flags); + vdev_label_sync(vio, good_writes, vd, l, txg, flags); + zio_nowait(vio); + } + + error = zio_wait(zio); + + /* + * Flush the new labels to disk. + */ + zio = zio_root(spa, NULL, NULL, flags); + + for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) + zio_flush(zio, vd); + + (void) zio_wait(zio); + + return (error); +} + +/* + * Sync the uberblock and any changes to the vdev configuration. + * + * The order of operations is carefully crafted to ensure that + * if the system panics or loses power at any time, the state on disk + * is still transactionally consistent. The in-line comments below + * describe the failure semantics at each stage. + * + * Moreover, vdev_config_sync() is designed to be idempotent: if it fails + * at any time, you can just call it again, and it will resume its work. + */ +int +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +{ + spa_t *spa = svd[0]->vdev_spa; + uberblock_t *ub = &spa->spa_uberblock; + int error = 0; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + + ASSERT(svdcount != 0); +retry: + /* + * Normally, we don't want to try too hard to write every label and + * uberblock. If there is a flaky disk, we don't want the rest of the + * sync process to block while we retry. But if we can't write a + * single label out, we should retry with ZIO_FLAG_TRYHARD before + * bailing out and declaring the pool faulted. + */ + if (error != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) + return (error); + flags |= ZIO_FLAG_TRYHARD; + } + + ASSERT(ub->ub_txg <= txg); + + /* + * If this isn't a resync due to I/O errors, + * and nothing changed in this transaction group, + * and the vdev configuration hasn't changed, + * then there's nothing to do. + */ + if (ub->ub_txg < txg) { + boolean_t changed = uberblock_update(ub, spa->spa_root_vdev, + txg, spa->spa_mmp.mmp_delay); + + if (!changed && list_is_empty(&spa->spa_config_dirty_list)) + return (0); + } + + if (txg > spa_freeze_txg(spa)) + return (0); + + ASSERT(txg <= spa->spa_final_txg); + + /* + * Flush the write cache of every disk that's been written to + * in this transaction group. This ensures that all blocks + * written in this txg will be committed to stable storage + * before any uberblock that references them. + */ + zio_t *zio = zio_root(spa, NULL, NULL, flags); + + for (vdev_t *vd = + txg_list_head(&spa->spa_vdev_txg_list, TXG_CLEAN(txg)); vd != NULL; + vd = txg_list_next(&spa->spa_vdev_txg_list, vd, TXG_CLEAN(txg))) + zio_flush(zio, vd); + + (void) zio_wait(zio); + + /* + * Sync out the even labels (L0, L2) for every dirty vdev. If the + * system dies in the middle of this process, that's OK: all of the + * even labels that made it to disk will be newer than any uberblock, + * and will therefore be considered invalid. The odd labels (L1, L3), + * which have not yet been touched, will still be valid. We flush + * the new labels to disk to ensure that all even-label updates + * are committed to stable storage before the uberblock update. 
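+	 *
+	 * A compressed sketch of the whole commit sequence; sync_labels()
+	 * and sync_uberblocks() are hypothetical stand-ins for the calls
+	 * made below, each of which flushes before returning:
+	 */
+#if 0
+	static int
+	config_sync_sketch(void)
+	{
+		if (sync_labels(0) != 0)	/* 1: even labels L0, L2 */
+			return (EIO);
+		if (sync_uberblocks() != 0)	/* 2: commit point of the txg */
+			return (EIO);
+		if (sync_labels(1) != 0)	/* 3: odd labels L1, L3 */
+			return (EIO);
+		return (0);
+	}
+#endif
+	/*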
+ */ + if ((error = vdev_label_sync_list(spa, 0, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the even labels " + "of dirty vdevs", error, spa_name(spa)); + } + goto retry; + } + + /* + * Sync the uberblocks to all vdevs in svd[]. + * If the system dies in the middle of this step, there are two cases + * to consider, and the on-disk state is consistent either way: + * + * (1) If none of the new uberblocks made it to disk, then the + * previous uberblock will be the newest, and the odd labels + * (which had not yet been touched) will be valid with respect + * to that uberblock. + * + * (2) If one or more new uberblocks made it to disk, then they + * will be the newest, and the even labels (which had all + * been successfully committed) will be valid with respect + * to the new uberblocks. + */ + if ((error = vdev_uberblock_sync_list(svd, svdcount, ub, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_uberblock_sync_list() returned error " + "%d for pool '%s'", error, spa_name(spa)); + } + goto retry; + } + + if (spa_multihost(spa)) + mmp_update_uberblock(spa, ub); + + /* + * Sync out odd labels for every dirty vdev. If the system dies + * in the middle of this process, the even labels and the new + * uberblocks will suffice to open the pool. The next time + * the pool is opened, the first thing we'll do -- before any + * user data is modified -- is mark every vdev dirty so that + * all labels will be brought up to date. We flush the new labels + * to disk to ensure that all odd-label updates are committed to + * stable storage before the next transaction group begins. + */ + if ((error = vdev_label_sync_list(spa, 1, txg, flags)) != 0) { + if ((flags & ZIO_FLAG_TRYHARD) != 0) { + zfs_dbgmsg("vdev_label_sync_list() returned error %d " + "for pool '%s' when syncing out the odd labels of " + "dirty vdevs", error, spa_name(spa)); + } + goto retry; + } + + return (0); +} diff --git a/module/zfs/vdev_mirror.c b/module/zfs/vdev_mirror.c new file mode 100644 index 000000000000..5e1060f127c9 --- /dev/null +++ b/module/zfs/vdev_mirror.c @@ -0,0 +1,863 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. 
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/dsl_pool.h>
+#include <sys/dsl_scan.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+#include <sys/abd.h>
+#include <sys/fs/zfs.h>
+
+/*
+ * Vdev mirror kstats
+ */
+static kstat_t *mirror_ksp = NULL;
+
+typedef struct mirror_stats {
+	kstat_named_t vdev_mirror_stat_rotating_linear;
+	kstat_named_t vdev_mirror_stat_rotating_offset;
+	kstat_named_t vdev_mirror_stat_rotating_seek;
+	kstat_named_t vdev_mirror_stat_non_rotating_linear;
+	kstat_named_t vdev_mirror_stat_non_rotating_seek;
+
+	kstat_named_t vdev_mirror_stat_preferred_found;
+	kstat_named_t vdev_mirror_stat_preferred_not_found;
+} mirror_stats_t;
+
+static mirror_stats_t mirror_stats = {
+	/* New I/O follows directly the last I/O */
+	{ "rotating_linear", KSTAT_DATA_UINT64 },
+	/* New I/O is within zfs_vdev_mirror_rotating_seek_offset of the last */
+	{ "rotating_offset", KSTAT_DATA_UINT64 },
+	/* New I/O requires random seek */
+	{ "rotating_seek", KSTAT_DATA_UINT64 },
+	/* New I/O follows directly the last I/O (nonrot) */
+	{ "non_rotating_linear", KSTAT_DATA_UINT64 },
+	/* New I/O requires random seek (nonrot) */
+	{ "non_rotating_seek", KSTAT_DATA_UINT64 },
+	/* Preferred child vdev found */
+	{ "preferred_found", KSTAT_DATA_UINT64 },
+	/* Preferred child vdev not found or equal load */
+	{ "preferred_not_found", KSTAT_DATA_UINT64 },
+
+};
+
+#define	MIRROR_STAT(stat)	(mirror_stats.stat.value.ui64)
+#define	MIRROR_INCR(stat, val)	atomic_add_64(&MIRROR_STAT(stat), val)
+#define	MIRROR_BUMP(stat)	MIRROR_INCR(stat, 1)
+
+void
+vdev_mirror_stat_init(void)
+{
+	mirror_ksp = kstat_create("zfs", 0, "vdev_mirror_stats",
+	    "misc", KSTAT_TYPE_NAMED,
+	    sizeof (mirror_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL);
+	if (mirror_ksp != NULL) {
+		mirror_ksp->ks_data = &mirror_stats;
+		kstat_install(mirror_ksp);
+	}
+}
+
+void
+vdev_mirror_stat_fini(void)
+{
+	if (mirror_ksp != NULL) {
+		kstat_delete(mirror_ksp);
+		mirror_ksp = NULL;
+	}
+}
+
+/*
+ * Virtual device vector for mirroring.
+ */
+
+typedef struct mirror_child {
+	vdev_t		*mc_vd;
+	uint64_t	mc_offset;
+	int		mc_error;
+	int		mc_load;
+	uint8_t		mc_tried;
+	uint8_t		mc_skipped;
+	uint8_t		mc_speculative;
+} mirror_child_t;
+
+typedef struct mirror_map {
+	int		*mm_preferred;
+	int		mm_preferred_cnt;
+	int		mm_children;
+	boolean_t	mm_resilvering;
+	boolean_t	mm_root;
+	mirror_child_t	mm_child[];
+} mirror_map_t;
+
+static int vdev_mirror_shift = 21;
+
+/*
+ * The load configuration settings below are tuned by default for
+ * the case where all devices are of the same rotational type.
+ *
+ * If there is a mixture of rotating and non-rotating media, setting
+ * zfs_vdev_mirror_non_rotating_seek_inc to 0 may well provide better results
+ * as it will direct more reads to the non-rotating vdevs which are more likely
+ * to have a higher performance.
+ */
+
+/* Rotating media load calculation configuration. */
+static int zfs_vdev_mirror_rotating_inc = 0;
+static int zfs_vdev_mirror_rotating_seek_inc = 5;
+static int zfs_vdev_mirror_rotating_seek_offset = 1 * 1024 * 1024;
+
+/* Non-rotating media load calculation configuration.
*/ +static int zfs_vdev_mirror_non_rotating_inc = 0; +static int zfs_vdev_mirror_non_rotating_seek_inc = 1; + +static inline size_t +vdev_mirror_map_size(int children) +{ + return (offsetof(mirror_map_t, mm_child[children]) + + sizeof (int) * children); +} + +static inline mirror_map_t * +vdev_mirror_map_alloc(int children, boolean_t resilvering, boolean_t root) +{ + mirror_map_t *mm; + + mm = kmem_zalloc(vdev_mirror_map_size(children), KM_SLEEP); + mm->mm_children = children; + mm->mm_resilvering = resilvering; + mm->mm_root = root; + mm->mm_preferred = (int *)((uintptr_t)mm + + offsetof(mirror_map_t, mm_child[children])); + + return (mm); +} + +static void +vdev_mirror_map_free(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + + kmem_free(mm, vdev_mirror_map_size(mm->mm_children)); +} + +static const zio_vsd_ops_t vdev_mirror_vsd_ops = { + .vsd_free = vdev_mirror_map_free, + .vsd_cksum_report = zio_vsd_default_cksum_report +}; + +static int +vdev_mirror_load(mirror_map_t *mm, vdev_t *vd, uint64_t zio_offset) +{ + uint64_t last_offset; + int64_t offset_diff; + int load; + + /* All DVAs have equal weight at the root. */ + if (mm->mm_root) + return (INT_MAX); + + /* + * We don't return INT_MAX if the device is resilvering i.e. + * vdev_resilver_txg != 0 as when tested performance was slightly + * worse overall when resilvering with compared to without. + */ + + /* Fix zio_offset for leaf vdevs */ + if (vd->vdev_ops->vdev_op_leaf) + zio_offset += VDEV_LABEL_START_SIZE; + + /* Standard load based on pending queue length. */ + load = vdev_queue_length(vd); + last_offset = vdev_queue_last_offset(vd); + + if (vd->vdev_nonrot) { + /* Non-rotating media. */ + if (last_offset == zio_offset) { + MIRROR_BUMP(vdev_mirror_stat_non_rotating_linear); + return (load + zfs_vdev_mirror_non_rotating_inc); + } + + /* + * Apply a seek penalty even for non-rotating devices as + * sequential I/O's can be aggregated into fewer operations on + * the device, thus avoiding unnecessary per-command overhead + * and boosting performance. + */ + MIRROR_BUMP(vdev_mirror_stat_non_rotating_seek); + return (load + zfs_vdev_mirror_non_rotating_seek_inc); + } + + /* Rotating media I/O's which directly follow the last I/O. */ + if (last_offset == zio_offset) { + MIRROR_BUMP(vdev_mirror_stat_rotating_linear); + return (load + zfs_vdev_mirror_rotating_inc); + } + + /* + * Apply half the seek increment to I/O's within seek offset + * of the last I/O issued to this vdev as they should incur less + * of a seek increment. + */ + offset_diff = (int64_t)(last_offset - zio_offset); + if (ABS(offset_diff) < zfs_vdev_mirror_rotating_seek_offset) { + MIRROR_BUMP(vdev_mirror_stat_rotating_offset); + return (load + (zfs_vdev_mirror_rotating_seek_inc / 2)); + } + + /* Apply the full seek increment to all other I/O's. */ + MIRROR_BUMP(vdev_mirror_stat_rotating_seek); + return (load + zfs_vdev_mirror_rotating_seek_inc); +} + +/* + * Avoid inlining the function to keep vdev_mirror_io_start(), which + * is this functions only caller, as small as possible on the stack. + */ +noinline static mirror_map_t * +vdev_mirror_map_init(zio_t *zio) +{ + mirror_map_t *mm = NULL; + mirror_child_t *mc; + vdev_t *vd = zio->io_vd; + int c; + + if (vd == NULL) { + dva_t *dva = zio->io_bp->blk_dva; + spa_t *spa = zio->io_spa; + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; + dva_t dva_copy[SPA_DVAS_PER_BP]; + + /* + * The sequential scrub code sorts and issues all DVAs + * of a bp separately. 
Each of these IOs includes all + * original DVA copies so that repairs can be performed + * in the event of an error, but we only actually want + * to check the first DVA since the others will be + * checked by their respective sorted IOs. Only if we + * hit an error will we try all DVAs upon retrying. + * + * Note: This check is safe even if the user switches + * from a legacy scrub to a sequential one in the middle + * of processing, since scn_is_sorted isn't updated until + * all outstanding IOs from the previous scrub pass + * complete. + */ + if ((zio->io_flags & ZIO_FLAG_SCRUB) && + !(zio->io_flags & ZIO_FLAG_IO_RETRY) && + dsl_scan_scrubbing(spa->spa_dsl_pool) && + scn->scn_is_sorted) { + c = 1; + } else { + c = BP_GET_NDVAS(zio->io_bp); + } + + /* + * If the pool cannot be written to, then infer that some + * DVAs might be invalid or point to vdevs that do not exist. + * We skip them. + */ + if (!spa_writeable(spa)) { + ASSERT3U(zio->io_type, ==, ZIO_TYPE_READ); + int j = 0; + for (int i = 0; i < c; i++) { + if (zfs_dva_valid(spa, &dva[i], zio->io_bp)) + dva_copy[j++] = dva[i]; + } + if (j == 0) { + zio->io_vsd = NULL; + zio->io_error = ENXIO; + return (NULL); + } + if (j < c) { + dva = dva_copy; + c = j; + } + } + + mm = vdev_mirror_map_alloc(c, B_FALSE, B_TRUE); + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + mc->mc_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[c])); + mc->mc_offset = DVA_GET_OFFSET(&dva[c]); + if (mc->mc_vd == NULL) { + kmem_free(mm, vdev_mirror_map_size( + mm->mm_children)); + zio->io_vsd = NULL; + zio->io_error = ENXIO; + return (NULL); + } + } + } else { + /* + * If we are resilvering, then we should handle scrub reads + * differently; we shouldn't issue them to the resilvering + * device because it might not have those blocks. + * + * We are resilvering iff: + * 1) We are a replacing vdev (ie our name is "replacing-1" or + * "spare-1" or something like that), and + * 2) The pool is currently being resilvered. + * + * We cannot simply check vd->vdev_resilver_txg, because it's + * not set in this path. + * + * Nor can we just check our vdev_ops; there are cases (such as + * when a user types "zpool replace pool odev spare_dev" and + * spare_dev is in the spare list, or when a spare device is + * automatically used to replace a DEGRADED device) when + * resilvering is complete but both the original vdev and the + * spare vdev remain in the pool. That behavior is intentional. + * It helps implement the policy that a spare should be + * automatically removed from the pool after the user replaces + * the device that originally failed. + * + * If a spa load is in progress, then spa_dsl_pool may be + * uninitialized. But we shouldn't be resilvering during a spa + * load anyway. 
+ */ + boolean_t replacing = (vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops) && + spa_load_state(vd->vdev_spa) == SPA_LOAD_NONE && + dsl_scan_resilvering(vd->vdev_spa->spa_dsl_pool); + mm = vdev_mirror_map_alloc(vd->vdev_children, replacing, + B_FALSE); + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + mc->mc_vd = vd->vdev_child[c]; + mc->mc_offset = zio->io_offset; + } + } + + zio->io_vsd = mm; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; + return (mm); +} + +static int +vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + int numerrors = 0; + int lasterror = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + vd->vdev_physical_ashift); + } + + if (numerrors == vd->vdev_children) { + if (vdev_children_are_offline(vd)) + vd->vdev_stat.vs_aux = VDEV_AUX_CHILDREN_OFFLINE; + else + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_mirror_close(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_mirror_child_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +static void +vdev_mirror_scrub_done(zio_t *zio) +{ + mirror_child_t *mc = zio->io_private; + + if (zio->io_error == 0) { + zio_t *pio; + zio_link_t *zl = NULL; + + mutex_enter(&zio->io_lock); + while ((pio = zio_walk_parents(zio, &zl)) != NULL) { + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + abd_copy(pio->io_abd, zio->io_abd, pio->io_size); + mutex_exit(&pio->io_lock); + } + mutex_exit(&zio->io_lock); + } + + abd_free(zio->io_abd); + + mc->mc_error = zio->io_error; + mc->mc_tried = 1; + mc->mc_skipped = 0; +} + +/* + * Check the other, lower-index DVAs to see if they're on the same + * vdev as the child we picked. If they are, use them since they + * are likely to have been allocated from the primary metaslab in + * use at the time, and hence are more likely to have locality with + * single-copy data. + */ +static int +vdev_mirror_dva_select(zio_t *zio, int p) +{ + dva_t *dva = zio->io_bp->blk_dva; + mirror_map_t *mm = zio->io_vsd; + int preferred; + int c; + + preferred = mm->mm_preferred[p]; + for (p--; p >= 0; p--) { + c = mm->mm_preferred[p]; + if (DVA_GET_VDEV(&dva[c]) == DVA_GET_VDEV(&dva[preferred])) + preferred = c; + } + return (preferred); +} + +static int +vdev_mirror_preferred_child_randomize(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + int p; + + if (mm->mm_root) { + p = spa_get_random(mm->mm_preferred_cnt); + return (vdev_mirror_dva_select(zio, p)); + } + + /* + * To ensure we don't always favour the first matching vdev, + * which could lead to wear leveling issues on SSD's, we + * use the I/O offset as a pseudo random seed into the vdevs + * which have the lowest load. 
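+	 *
+	 * A sketch of the selection arithmetic (the offset is hypothetical;
+	 * vdev_mirror_shift is 21, i.e. 2 MB granularity):
+	 */
+#if 0
+	static int
+	pick_preferred(uint64_t io_offset, int preferred_cnt)
+	{
+		/* e.g. offset 5 MB, 2 candidates: (5 MB >> 21) % 2 == 0 */
+		return ((int)((io_offset >> 21) % preferred_cnt));
+	}
+#endif
+	/*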
+ */ + p = (zio->io_offset >> vdev_mirror_shift) % mm->mm_preferred_cnt; + return (mm->mm_preferred[p]); +} + +/* + * Try to find a vdev whose DTL doesn't contain the block we want to read + * preferring vdevs based on determined load. + * + * Try to find a child whose DTL doesn't contain the block we want to read. + * If we can't, try the read on any vdev we haven't already tried. + */ +static int +vdev_mirror_child_select(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + uint64_t txg = zio->io_txg; + int c, lowest_load; + + ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); + + lowest_load = INT_MAX; + mm->mm_preferred_cnt = 0; + for (c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc; + + mc = &mm->mm_child[c]; + if (mc->mc_tried || mc->mc_skipped) + continue; + + if (mc->mc_vd == NULL || !vdev_readable(mc->mc_vd)) { + mc->mc_error = SET_ERROR(ENXIO); + mc->mc_tried = 1; /* don't even try */ + mc->mc_skipped = 1; + continue; + } + + if (vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) { + mc->mc_error = SET_ERROR(ESTALE); + mc->mc_skipped = 1; + mc->mc_speculative = 1; + continue; + } + + mc->mc_load = vdev_mirror_load(mm, mc->mc_vd, mc->mc_offset); + if (mc->mc_load > lowest_load) + continue; + + if (mc->mc_load < lowest_load) { + lowest_load = mc->mc_load; + mm->mm_preferred_cnt = 0; + } + mm->mm_preferred[mm->mm_preferred_cnt] = c; + mm->mm_preferred_cnt++; + } + + if (mm->mm_preferred_cnt == 1) { + MIRROR_BUMP(vdev_mirror_stat_preferred_found); + return (mm->mm_preferred[0]); + } + + if (mm->mm_preferred_cnt > 1) { + MIRROR_BUMP(vdev_mirror_stat_preferred_not_found); + return (vdev_mirror_preferred_child_randomize(zio)); + } + + /* + * Every device is either missing or has this txg in its DTL. + * Look for any child we haven't already tried before giving up. + */ + for (c = 0; c < mm->mm_children; c++) { + if (!mm->mm_child[c].mc_tried) + return (c); + } + + /* + * Every child failed. There's no place left to look. + */ + return (-1); +} + +static void +vdev_mirror_io_start(zio_t *zio) +{ + mirror_map_t *mm; + mirror_child_t *mc; + int c, children; + + mm = vdev_mirror_map_init(zio); + + if (mm == NULL) { + ASSERT(!spa_trust_config(zio->io_spa)); + ASSERT(zio->io_type == ZIO_TYPE_READ); + zio_execute(zio); + return; + } + + if (zio->io_type == ZIO_TYPE_READ) { + if (zio->io_bp != NULL && + (zio->io_flags & ZIO_FLAG_SCRUB) && !mm->mm_resilvering) { + /* + * For scrubbing reads (if we can verify the + * checksum here, as indicated by io_bp being + * non-NULL) we need to allocate a read buffer for + * each child and issue reads to all children. If + * any child succeeds, it will copy its data into + * zio->io_data in vdev_mirror_scrub_done. + */ + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + abd_alloc_sametype(zio->io_abd, + zio->io_size), zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_mirror_scrub_done, mc)); + } + zio_execute(zio); + return; + } + /* + * For normal reads just pick one child. + */ + c = vdev_mirror_child_select(zio); + children = (c >= 0); + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + + /* + * Writes go to all children. 
+ */ + c = 0; + children = mm->mm_children; + } + + while (children--) { + mc = &mm->mm_child[c]; + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, + zio->io_type, zio->io_priority, 0, + vdev_mirror_child_done, mc)); + c++; + } + + zio_execute(zio); +} + +static int +vdev_mirror_worst_error(mirror_map_t *mm) +{ + int error[2] = { 0, 0 }; + + for (int c = 0; c < mm->mm_children; c++) { + mirror_child_t *mc = &mm->mm_child[c]; + int s = mc->mc_speculative; + error[s] = zio_worst_error(error[s], mc->mc_error); + } + + return (error[0] ? error[0] : error[1]); +} + +static void +vdev_mirror_io_done(zio_t *zio) +{ + mirror_map_t *mm = zio->io_vsd; + mirror_child_t *mc; + int c; + int good_copies = 0; + int unexpected_errors = 0; + + if (mm == NULL) + return; + + for (c = 0; c < mm->mm_children; c++) { + mc = &mm->mm_child[c]; + + if (mc->mc_error) { + if (!mc->mc_skipped) + unexpected_errors++; + } else if (mc->mc_tried) { + good_copies++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as success. + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + /* XXPOLICY */ + if (good_copies != mm->mm_children) { + /* + * Always require at least one good copy. + * + * For ditto blocks (io_vd == NULL), require + * all copies to be good. + * + * XXX -- for replacing vdevs, there's no great answer. + * If the old device is really dead, we may not even + * be able to access it -- so we only want to + * require good writes to the new device. But if + * the new device turns out to be flaky, we want + * to be able to detach it -- which requires all + * writes to the old device to have succeeded. + */ + if (good_copies == 0 || zio->io_vd == NULL) + zio->io_error = vdev_mirror_worst_error(mm); + } + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * If we don't have a good copy yet, keep trying other children. + */ + /* XXPOLICY */ + if (good_copies == 0 && (c = vdev_mirror_child_select(zio)) != -1) { + ASSERT(c >= 0 && c < mm->mm_children); + mc = &mm->mm_child[c]; + zio_vdev_io_redone(zio); + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, zio->io_abd, zio->io_size, + ZIO_TYPE_READ, zio->io_priority, 0, + vdev_mirror_child_done, mc)); + return; + } + + /* XXPOLICY */ + if (good_copies == 0) { + zio->io_error = vdev_mirror_worst_error(mm); + ASSERT(zio->io_error != 0); + } + + if (good_copies && spa_writeable(zio->io_spa) && + (unexpected_errors || + (zio->io_flags & ZIO_FLAG_RESILVER) || + ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_resilvering))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < mm->mm_children; c++) { + /* + * Don't rewrite known good children. + * Not only is it unnecessary, it could + * actually be harmful: if the system lost + * power while rewriting the only good copy, + * there would be no good copies left! + */ + mc = &mm->mm_child[c]; + + if (mc->mc_error == 0) { + if (mc->mc_tried) + continue; + /* + * We didn't try this child. We need to + * repair it if: + * 1. it's a scrub (in which case we have + * tried everything that was healthy) + * - or - + * 2. it's an indirect vdev (in which case + * it could point to any other vdev, which + * might have a bad DTL) + * - or - + * 3. 
the DTL indicates that this data is + * missing from this vdev + */ + if (!(zio->io_flags & ZIO_FLAG_SCRUB) && + mc->mc_vd->vdev_ops != &vdev_indirect_ops && + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, + zio->io_txg, 1)) + continue; + mc->mc_error = SET_ERROR(ESTALE); + } + + zio_nowait(zio_vdev_child_io(zio, zio->io_bp, + mc->mc_vd, mc->mc_offset, + zio->io_abd, zio->io_size, ZIO_TYPE_WRITE, + zio->io_priority == ZIO_PRIORITY_REBUILD ? + ZIO_PRIORITY_REBUILD : ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + +static void +vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted == vd->vdev_children) { + if (vdev_children_are_offline(vd)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_OFFLINE, + VDEV_AUX_CHILDREN_OFFLINE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + } + } else if (degraded + faulted != 0) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } +} + +vdev_ops_t vdev_mirror_ops = { + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_replacing_ops = { + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; + +vdev_ops_t vdev_spare_ops = { + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_inc, INT, ZMOD_RW, + "Rotating media load increment for non-seeking I/O's"); + +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_inc, INT, ZMOD_RW, + "Rotating media load increment for seeking I/O's"); + +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, rotating_seek_offset, INT, ZMOD_RW, + "Offset in bytes from the last I/O which triggers " + "a reduced rotating media seek increment"); + +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_inc, INT, ZMOD_RW, + "Non-rotating media load increment for non-seeking I/O's"); + +ZFS_MODULE_PARAM(zfs_vdev_mirror, zfs_vdev_mirror_, non_rotating_seek_inc, INT, ZMOD_RW, + "Non-rotating media load increment for seeking I/O's"); +/* 
END CSTYLED */ diff --git a/module/zfs/vdev_missing.c b/module/zfs/vdev_missing.c new file mode 100644 index 000000000000..ce90df6e8d95 --- /dev/null +++ b/module/zfs/vdev_missing.c @@ -0,0 +1,113 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + */ + +/* + * The 'missing' vdev is a special vdev type used only during import. It + * signifies a placeholder in the root vdev for some vdev that we know is + * missing. We pass it down to the kernel to allow the rest of the + * configuration to be parsed and an attempt made to open all available devices. + * Because its GUID is always 0, we know that the guid sum will mismatch and we + * won't be able to open the pool anyway. + */ + +#include +#include +#include +#include +#include + +/* ARGSUSED */ +static int +vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, + uint64_t *ashift, uint64_t *pshift) +{ + /* + * Really this should just fail. But then the root vdev will be in the + * faulted state with VDEV_AUX_NO_REPLICAS, when what we really want is + * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we + * will fail the GUID sum check before ever trying to open the pool.
+ */ + *psize = 0; + *max_psize = 0; + *ashift = 0; + *pshift = 0; + return (0); +} + +/* ARGSUSED */ +static void +vdev_missing_close(vdev_t *vd) +{ +} + +/* ARGSUSED */ +static void +vdev_missing_io_start(zio_t *zio) +{ + zio->io_error = SET_ERROR(ENOTSUP); + zio_execute(zio); +} + +/* ARGSUSED */ +static void +vdev_missing_io_done(zio_t *zio) +{ +} + +vdev_ops_t vdev_missing_ops = { + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; + +vdev_ops_t vdev_hole_ops = { + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ +}; diff --git a/module/zfs/vdev_queue.c b/module/zfs/vdev_queue.c new file mode 100644 index 000000000000..a8ef3d7474c9 --- /dev/null +++ b/module/zfs/vdev_queue.c @@ -0,0 +1,1070 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS I/O Scheduler + * --------------- + * + * ZFS issues I/O operations to leaf vdevs to satisfy and complete zios. The + * I/O scheduler determines when and in what order those operations are + * issued. The I/O scheduler divides operations into five I/O classes + * prioritized in the following order: sync read, sync write, async read, + * async write, and scrub/resilver. Each queue defines the minimum and + * maximum number of concurrent operations that may be issued to the device. + * In addition, the device has an aggregate maximum. Note that the sum of the + * per-queue minimums must not exceed the aggregate maximum. 
If the + * sum of the per-queue maximums exceeds the aggregate maximum, then the + * number of active i/os may reach zfs_vdev_max_active, in which case no + * further i/os will be issued regardless of whether all per-queue + * minimums have been met. + * + * For many physical devices, throughput increases with the number of + * concurrent operations, but latency typically suffers. Further, physical + * devices typically have a limit at which more concurrent operations have no + * effect on throughput or can actually cause it to decrease. + * + * The scheduler selects the next operation to issue by first looking for an + * I/O class whose minimum has not been satisfied. Once all are satisfied and + * the aggregate maximum has not been hit, the scheduler looks for classes + * whose maximum has not been satisfied. Iteration through the I/O classes is + * done in the order specified above. No further operations are issued if the + * aggregate maximum number of concurrent operations has been hit or if there + * are no operations queued for an I/O class that has not hit its maximum. + * Every time an i/o is queued or an operation completes, the I/O scheduler + * looks for new operations to issue. + * + * All I/O classes have a fixed maximum number of outstanding operations + * except for the async write class. Asynchronous writes represent the data + * that is committed to stable storage during the syncing stage for + * transaction groups (see txg.c). Transaction groups enter the syncing state + * periodically so the number of queued async writes will quickly burst up and + * then bleed down to zero. Rather than servicing them as quickly as possible, + * the I/O scheduler changes the maximum number of active async write i/os + * according to the amount of dirty data in the pool (see dsl_pool.c). Since + * both throughput and latency typically increase with the number of + * concurrent operations issued to physical devices, reducing the burstiness + * in the number of concurrent operations also stabilizes the response time of + * operations from other -- and in particular synchronous -- queues. In broad + * strokes, the I/O scheduler will issue more concurrent operations from the + * async write queue as there's more dirty data in the pool. + * + * Async Writes + * + * The number of concurrent operations issued for the async write I/O class + * follows a piece-wise linear function defined by a few adjustable points. + * + * | o---------| <-- zfs_vdev_async_write_max_active + * ^ | /^ | + * | | / | | + * active | / | | + * I/O | / | | + * count | / | | + * | / | | + * |------------o | | <-- zfs_vdev_async_write_min_active + * 0|____________^______|_________| + * 0% | | 100% of zfs_dirty_data_max + * | | + * | `-- zfs_vdev_async_write_active_max_dirty_percent + * `--------- zfs_vdev_async_write_active_min_dirty_percent + * + * Until the amount of dirty data exceeds a minimum percentage of the dirty + * data allowed in the pool, the I/O scheduler will limit the number of + * concurrent operations to the minimum. As that threshold is crossed, the + * number of concurrent operations issued increases linearly to the maximum at + * the specified maximum percentage of the dirty data allowed in the pool. + * + * Ideally, the amount of dirty data on a busy pool will stay in the sloped + * part of the function between zfs_vdev_async_write_active_min_dirty_percent + * and zfs_vdev_async_write_active_max_dirty_percent. 
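The sloped region of that curve is plain linear interpolation. As a minimal userland sketch of the same mapping (illustrative only, assuming the default tunables; the in-tree version is vdev_queue_max_async_writes() below):

#include <assert.h>
#include <stdint.h>

/* Piece-wise linear limit, assuming defaults: min_active = 2,
 * max_active = 10, thresholds at 30% and 60% of zfs_dirty_data_max. */
static uint32_t
async_write_limit(uint64_t dirty, uint64_t dirty_max)
{
	const uint32_t min_active = 2, max_active = 10;
	uint64_t lo = dirty_max * 30 / 100;
	uint64_t hi = dirty_max * 60 / 100;

	if (dirty <= lo)
		return (min_active);
	if (dirty >= hi)
		return (max_active);
	/* linear interpolation between the two corner points */
	return (min_active +
	    (dirty - lo) * (max_active - min_active) / (hi - lo));
}

int main(void)
{
	/* halfway up the slope: 45% dirty maps to 2 + 15*8/30 = 6 */
	assert(async_write_limit(45, 100) == 6);
	assert(async_write_limit(10, 100) == 2);	/* below the knee */
	assert(async_write_limit(90, 100) == 10);	/* above the knee */
	return (0);
}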
If it exceeds the + * maximum percentage, this indicates that the rate of incoming data is + * greater than the rate that the backend storage can handle. In this case, we + * must further throttle incoming writes (see dmu_tx_delay() for details). + */ + +/* + * The maximum number of i/os active to each device. Ideally, this will be >= + * the sum of each queue's max_active. It must be at least the sum of each + * queue's min_active. + */ +uint32_t zfs_vdev_max_active = 1000; + +/* + * Per-queue limits on the number of i/os active to each device. If the + * number of active i/os is < zfs_vdev_max_active, then the min_active comes + * into play. We will send min_active from each queue, and then select from + * queues in the order defined by zio_priority_t. + * + * In general, smaller max_active's will lead to lower latency of synchronous + * operations. Larger max_active's may lead to higher overall throughput, + * depending on underlying storage. + * + * The ratio of the queues' max_actives determines the balance of performance + * between reads, writes, and scrubs. E.g., increasing + * zfs_vdev_scrub_max_active will cause the scrub or resilver to complete + * more quickly, but reads and writes to have higher latency and lower + * throughput. + */ +uint32_t zfs_vdev_sync_read_min_active = 10; +uint32_t zfs_vdev_sync_read_max_active = 10; +uint32_t zfs_vdev_sync_write_min_active = 10; +uint32_t zfs_vdev_sync_write_max_active = 10; +uint32_t zfs_vdev_async_read_min_active = 1; +uint32_t zfs_vdev_async_read_max_active = 3; +uint32_t zfs_vdev_async_write_min_active = 2; +uint32_t zfs_vdev_async_write_max_active = 10; +uint32_t zfs_vdev_scrub_min_active = 1; +uint32_t zfs_vdev_scrub_max_active = 2; +uint32_t zfs_vdev_removal_min_active = 1; +uint32_t zfs_vdev_removal_max_active = 2; +uint32_t zfs_vdev_initializing_min_active = 1; +uint32_t zfs_vdev_initializing_max_active = 1; +uint32_t zfs_vdev_trim_min_active = 1; +uint32_t zfs_vdev_trim_max_active = 2; +uint32_t zfs_vdev_rebuild_min_active = 1; +uint32_t zfs_vdev_rebuild_max_active = 3; + +/* + * When the pool has less than zfs_vdev_async_write_active_min_dirty_percent + * dirty data, use zfs_vdev_async_write_min_active. When it has more than + * zfs_vdev_async_write_active_max_dirty_percent, use + * zfs_vdev_async_write_max_active. The value is linearly interpolated + * between min and max. + */ +int zfs_vdev_async_write_active_min_dirty_percent = 30; +int zfs_vdev_async_write_active_max_dirty_percent = 60; + +/* + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. + */ +int zfs_vdev_aggregation_limit = 1 << 20; +int zfs_vdev_aggregation_limit_non_rotating = SPA_OLD_MAXBLOCKSIZE; +int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; + +/* + * Define the queue depth percentage for each top-level. This percentage is + * used in conjunction with zfs_vdev_async_max_active to determine how many + * allocations a specific top-level vdev should handle. Once the queue depth + * reaches zfs_vdev_queue_depth_pct * zfs_vdev_async_write_max_active / 100 + * then allocator will stop allocating blocks on that top-level device. + * The default kernel setting is 1000% which will yield 100 allocations per + * device. 
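A quick check of the arithmetic just stated (the identifiers here are local stand-ins for the tunables above):

#include <assert.h>

int main(void)
{
	/* kernel defaults referenced above */
	int queue_depth_pct = 1000, async_write_max_active = 10;

	/* 1000% of 10 = 100 queued allocations per top-level vdev */
	assert(queue_depth_pct * async_write_max_active / 100 == 100);
	return (0);
}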
For userland testing, the default setting is 300% which equates + * to 30 allocations per device. + */ +#ifdef _KERNEL +int zfs_vdev_queue_depth_pct = 1000; +#else +int zfs_vdev_queue_depth_pct = 300; +#endif + +/* + * When performing allocations for a given metaslab, we want to make sure that + * there are enough IOs to aggregate together to improve throughput. We want to + * ensure that there are at least 128k worth of IOs that can be aggregated, and + * we assume that the average allocation size is 4k, so we need the queue depth + * to be 32 per allocator to get good aggregation of sequential writes. + */ +int zfs_vdev_def_queue_depth = 32; + +/* + * Allow TRIM I/Os to be aggregated. This should normally not be needed since + * TRIM I/O for extents up to zfs_trim_extent_bytes_max (128M) can be submitted + * by the TRIM code in zfs_trim.c. + */ +int zfs_vdev_aggregate_trim = 0; + +static int +vdev_queue_offset_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = (const zio_t *)x1; + const zio_t *z2 = (const zio_t *)x2; + + int cmp = TREE_CMP(z1->io_offset, z2->io_offset); + + if (likely(cmp)) + return (cmp); + + return (TREE_PCMP(z1, z2)); +} + +static inline avl_tree_t * +vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p) +{ + return (&vq->vq_class[p].vqc_queued_tree); +} + +static inline avl_tree_t * +vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t) +{ + ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE || t == ZIO_TYPE_TRIM); + if (t == ZIO_TYPE_READ) + return (&vq->vq_read_offset_tree); + else if (t == ZIO_TYPE_WRITE) + return (&vq->vq_write_offset_tree); + else + return (&vq->vq_trim_offset_tree); +} + +static int +vdev_queue_timestamp_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = (const zio_t *)x1; + const zio_t *z2 = (const zio_t *)x2; + + int cmp = TREE_CMP(z1->io_timestamp, z2->io_timestamp); + + if (likely(cmp)) + return (cmp); + + return (TREE_PCMP(z1, z2)); +} + +static int +vdev_queue_class_min_active(zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SYNC_READ: + return (zfs_vdev_sync_read_min_active); + case ZIO_PRIORITY_SYNC_WRITE: + return (zfs_vdev_sync_write_min_active); + case ZIO_PRIORITY_ASYNC_READ: + return (zfs_vdev_async_read_min_active); + case ZIO_PRIORITY_ASYNC_WRITE: + return (zfs_vdev_async_write_min_active); + case ZIO_PRIORITY_SCRUB: + return (zfs_vdev_scrub_min_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_min_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_min_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_min_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_min_active); + default: + panic("invalid priority %u", p); + return (0); + } +} + +static int +vdev_queue_max_async_writes(spa_t *spa) +{ + int writes; + uint64_t dirty = 0; + dsl_pool_t *dp = spa_get_dsl(spa); + uint64_t min_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_min_dirty_percent / 100; + uint64_t max_bytes = zfs_dirty_data_max * + zfs_vdev_async_write_active_max_dirty_percent / 100; + + /* + * Async writes may occur before the assignment of the spa's + * dsl_pool_t if a self-healing zio is issued prior to the + * completion of dmu_objset_open_impl(). + */ + if (dp == NULL) + return (zfs_vdev_async_write_max_active); + + /* + * Sync tasks correspond to interactive user actions. To reduce the + * execution time of those actions we push data out as fast as possible. 
+ */ + if (spa_has_pending_synctask(spa)) + return (zfs_vdev_async_write_max_active); + + dirty = dp->dp_dirty_total; + if (dirty < min_bytes) + return (zfs_vdev_async_write_min_active); + if (dirty > max_bytes) + return (zfs_vdev_async_write_max_active); + + /* + * linear interpolation: + * slope = (max_writes - min_writes) / (max_bytes - min_bytes) + * move right by min_bytes + * move up by min_writes + */ + writes = (dirty - min_bytes) * + (zfs_vdev_async_write_max_active - + zfs_vdev_async_write_min_active) / + (max_bytes - min_bytes) + + zfs_vdev_async_write_min_active; + ASSERT3U(writes, >=, zfs_vdev_async_write_min_active); + ASSERT3U(writes, <=, zfs_vdev_async_write_max_active); + return (writes); +} + +static int +vdev_queue_class_max_active(spa_t *spa, zio_priority_t p) +{ + switch (p) { + case ZIO_PRIORITY_SYNC_READ: + return (zfs_vdev_sync_read_max_active); + case ZIO_PRIORITY_SYNC_WRITE: + return (zfs_vdev_sync_write_max_active); + case ZIO_PRIORITY_ASYNC_READ: + return (zfs_vdev_async_read_max_active); + case ZIO_PRIORITY_ASYNC_WRITE: + return (vdev_queue_max_async_writes(spa)); + case ZIO_PRIORITY_SCRUB: + return (zfs_vdev_scrub_max_active); + case ZIO_PRIORITY_REMOVAL: + return (zfs_vdev_removal_max_active); + case ZIO_PRIORITY_INITIALIZING: + return (zfs_vdev_initializing_max_active); + case ZIO_PRIORITY_TRIM: + return (zfs_vdev_trim_max_active); + case ZIO_PRIORITY_REBUILD: + return (zfs_vdev_rebuild_max_active); + default: + panic("invalid priority %u", p); + return (0); + } +} + +/* + * Return the i/o class to issue from, or ZIO_PRIORITY_MAX_QUEUEABLE if + * there is no eligible class. + */ +static zio_priority_t +vdev_queue_class_to_issue(vdev_queue_t *vq) +{ + spa_t *spa = vq->vq_vdev->vdev_spa; + zio_priority_t p; + + if (avl_numnodes(&vq->vq_active_tree) >= zfs_vdev_max_active) + return (ZIO_PRIORITY_NUM_QUEUEABLE); + + /* find a queue that has not reached its minimum # outstanding i/os */ + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_min_active(p)) + return (p); + } + + /* + * If we haven't found a queue, look for one that hasn't reached its + * maximum # outstanding i/os. + */ + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 && + vq->vq_class[p].vqc_active < + vdev_queue_class_max_active(spa, p)) + return (p); + } + + /* No eligible queued i/os */ + return (ZIO_PRIORITY_NUM_QUEUEABLE); +} + +void +vdev_queue_init(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + zio_priority_t p; + + mutex_init(&vq->vq_lock, NULL, MUTEX_DEFAULT, NULL); + vq->vq_vdev = vd; + taskq_init_ent(&vd->vdev_queue.vq_io_search.io_tqent); + + avl_create(&vq->vq_active_tree, vdev_queue_offset_compare, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM), + vdev_queue_offset_compare, sizeof (zio_t), + offsetof(struct zio, io_offset_node)); + + for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) { + int (*compfn) (const void *, const void *); + + /* + * The synchronous/trim i/o queues are dispatched in FIFO rather + * than LBA order. This provides more consistent latency for + * these i/os. 
+ */ + if (p == ZIO_PRIORITY_SYNC_READ || + p == ZIO_PRIORITY_SYNC_WRITE || + p == ZIO_PRIORITY_TRIM) { + compfn = vdev_queue_timestamp_compare; + } else { + compfn = vdev_queue_offset_compare; + } + avl_create(vdev_queue_class_tree(vq, p), compfn, + sizeof (zio_t), offsetof(struct zio, io_queue_node)); + } + + vq->vq_last_offset = 0; +} + +void +vdev_queue_fini(vdev_t *vd) +{ + vdev_queue_t *vq = &vd->vdev_queue; + + for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) + avl_destroy(vdev_queue_class_tree(vq, p)); + avl_destroy(&vq->vq_active_tree); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ)); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE)); + avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_TRIM)); + + mutex_destroy(&vq->vq_lock); +} + +static void +vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + avl_add(vdev_queue_type_tree(vq, zio->io_type), zio); + + if (shk->kstat != NULL) { + mutex_enter(&shk->lock); + kstat_waitq_enter(shk->kstat->ks_data); + mutex_exit(&shk->lock); + } +} + +static void +vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio); + + if (shk->kstat != NULL) { + mutex_enter(&shk->lock); + kstat_waitq_exit(shk->kstat->ks_data); + mutex_exit(&shk->lock); + } +} + +static void +vdev_queue_pending_add(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + vq->vq_class[zio->io_priority].vqc_active++; + avl_add(&vq->vq_active_tree, zio); + + if (shk->kstat != NULL) { + mutex_enter(&shk->lock); + kstat_runq_enter(shk->kstat->ks_data); + mutex_exit(&shk->lock); + } +} + +static void +vdev_queue_pending_remove(vdev_queue_t *vq, zio_t *zio) +{ + spa_t *spa = zio->io_spa; + spa_history_kstat_t *shk = &spa->spa_stats.io_history; + + ASSERT(MUTEX_HELD(&vq->vq_lock)); + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + vq->vq_class[zio->io_priority].vqc_active--; + avl_remove(&vq->vq_active_tree, zio); + + if (shk->kstat != NULL) { + kstat_io_t *ksio = shk->kstat->ks_data; + + mutex_enter(&shk->lock); + kstat_runq_exit(ksio); + if (zio->io_type == ZIO_TYPE_READ) { + ksio->reads++; + ksio->nread += zio->io_size; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ksio->writes++; + ksio->nwritten += zio->io_size; + } + mutex_exit(&shk->lock); + } +} + +static void +vdev_queue_agg_io_done(zio_t *aio) +{ + abd_free(aio->io_abd); +} + +/* + * Compute the range spanned by two i/os, which is the endpoint of the last + * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). + * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); + * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. + */ +#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) +#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) + +/* + * Sufficiently adjacent io_offset's in ZIOs will be aggregated. We do this + * by creating a gang ABD from the adjacent ZIOs io_abd's. 
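The span/gap macros above are easy to misread, so a standalone spot-check may help (illustrative; struct io is a stand-in for zio_t):

#include <assert.h>
#include <stdint.h>

struct io { uint64_t io_offset, io_size; };	/* stand-in for zio_t */
#define	IO_SPAN(fio, lio) \
	((lio)->io_offset + (lio)->io_size - (fio)->io_offset)
#define	IO_GAP(fio, lio)	(-IO_SPAN(lio, fio))

int main(void)
{
	/* two 4K i/os at offsets 0 and 8K: span 12K, gap 4K */
	struct io a = { 0, 4096 }, b = { 8192, 4096 };
	assert(IO_SPAN(&a, &b) == 12288);
	assert(IO_GAP(&a, &b) == 4096);
	/* back-to-back i/os are adjacent exactly when the gap is 0 */
	struct io c = { 4096, 4096 };
	assert(IO_GAP(&a, &c) == 0);
	return (0);
}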
By using + * a gang ABD we avoid doing memory copies to and from the parent, + * child ZIOs. The gang ABD also accounts for gaps between adjacent + * io_offsets by simply getting the zero ABD for writes or allocating + * a new ABD for reads and placing them in the gang ABD as well. + */ +static zio_t * +vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) +{ + zio_t *first, *last, *aio, *dio, *mandatory, *nio; + zio_link_t *zl = NULL; + uint64_t maxgap = 0; + uint64_t size; + uint64_t limit; + int maxblocksize; + boolean_t stretch = B_FALSE; + avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type); + enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT; + uint64_t next_offset; + abd_t *abd; + + maxblocksize = spa_maxblocksize(vq->vq_vdev->vdev_spa); + if (vq->vq_vdev->vdev_nonrot) + limit = zfs_vdev_aggregation_limit_non_rotating; + else + limit = zfs_vdev_aggregation_limit; + limit = MAX(MIN(limit, maxblocksize), 0); + + if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE || limit == 0) + return (NULL); + + /* + * While TRIM commands could be aggregated based on offset this + * behavior is disabled until it's determined to be beneficial. + */ + if (zio->io_type == ZIO_TYPE_TRIM && !zfs_vdev_aggregate_trim) + return (NULL); + + first = last = zio; + + if (zio->io_type == ZIO_TYPE_READ) + maxgap = zfs_vdev_read_gap_limit; + + /* + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. + */ + mandatory = (first->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : first; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-optional I/O. + */ + while ((dio = AVL_PREV(t, first)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(dio, last) <= limit && + IO_GAP(dio, first) <= maxgap && + dio->io_type == zio->io_type) { + first = dio; + if (mandatory == NULL && !(first->io_flags & ZIO_FLAG_OPTIONAL)) + mandatory = first; + } + + /* + * Skip any initial optional I/Os. + */ + while ((first->io_flags & ZIO_FLAG_OPTIONAL) && first != last) { + first = AVL_NEXT(t, first); + ASSERT(first != NULL); + } + + + /* + * Walk forward through sufficiently contiguous I/Os. + * The aggregation limit does not apply to optional i/os, so that + * we can issue contiguous writes even if they are larger than the + * aggregation limit. + */ + while ((dio = AVL_NEXT(t, last)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + (IO_SPAN(first, dio) <= limit || + (dio->io_flags & ZIO_FLAG_OPTIONAL)) && + IO_SPAN(first, dio) <= maxblocksize && + IO_GAP(last, dio) <= maxgap && + dio->io_type == zio->io_type) { + last = dio; + if (!(last->io_flags & ZIO_FLAG_OPTIONAL)) + mandatory = last; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. 
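A hypothetical instance of that decision, using the default zfs_vdev_write_gap_limit of 4K (the offsets are invented for illustration):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/*
	 * A mandatory write at [0, 4K), an optional write at [4K, 8K),
	 * and the next mandatory write at [8K, 12K). The distance from
	 * the end of the last mandatory i/o to the next mandatory i/o
	 * is 4K, within the default write gap limit, so the trailing
	 * optional i/o is worth including and the span is stretched
	 * into a single 12K aggregate.
	 */
	uint64_t last_mandatory_end = 0 + 4096;
	uint64_t next_mandatory_off = 8192;
	uint64_t write_gap_limit = 4096;

	assert(next_mandatory_off - last_mandatory_end <= write_gap_limit);
	return (0);
}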
+ */ + if (zio->io_type == ZIO_TYPE_WRITE && mandatory != NULL) { + zio_t *nio = last; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mandatory, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } + + if (stretch) { + /* + * We are going to include an optional io in our aggregated + * span, thus closing the write gap. Only mandatory i/os can + * start aggregated spans, so make sure that the next i/o + * after our span is mandatory. + */ + dio = AVL_NEXT(t, last); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + /* do not include the optional i/o */ + while (last != mandatory && last != first) { + ASSERT(last->io_flags & ZIO_FLAG_OPTIONAL); + last = AVL_PREV(t, last); + ASSERT(last != NULL); + } + } + + if (first == last) + return (NULL); + + size = IO_SPAN(first, last); + ASSERT3U(size, <=, maxblocksize); + + abd = abd_alloc_gang_abd(); + if (abd == NULL) + return (NULL); + + aio = zio_vdev_delegated_io(first->io_vd, first->io_offset, + abd, size, first->io_type, zio->io_priority, + flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + vdev_queue_agg_io_done, NULL); + aio->io_timestamp = first->io_timestamp; + + nio = first; + next_offset = first->io_offset; + do { + dio = nio; + nio = AVL_NEXT(t, dio); + zio_add_child(dio, aio); + vdev_queue_io_remove(vq, dio); + + if (dio->io_offset != next_offset) { + /* allocate a buffer for a read gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_READ); + ASSERT3U(dio->io_offset, >, next_offset); + abd = abd_alloc_for_io( + dio->io_offset - next_offset, B_TRUE); + abd_gang_add(aio->io_abd, abd, B_TRUE); + } + if (dio->io_abd && + (dio->io_size != abd_get_size(dio->io_abd))) { + /* abd size not the same as IO size */ + ASSERT3U(abd_get_size(dio->io_abd), >, dio->io_size); + abd = abd_get_offset_size(dio->io_abd, 0, dio->io_size); + abd_gang_add(aio->io_abd, abd, B_TRUE); + } else { + if (dio->io_flags & ZIO_FLAG_NODATA) { + /* allocate a buffer for a write gap */ + ASSERT3U(dio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3P(dio->io_abd, ==, NULL); + abd_gang_add(aio->io_abd, + abd_get_zeros(dio->io_size), B_TRUE); + } else { + /* + * We pass B_FALSE to abd_gang_add() + * because we did not allocate a new + * ABD, so it is assumed the caller + * will free this ABD. + */ + abd_gang_add(aio->io_abd, dio->io_abd, + B_FALSE); + } + } + next_offset = dio->io_offset + dio->io_size; + } while (dio != last); + ASSERT3U(abd_get_size(aio->io_abd), ==, aio->io_size); + + /* + * We need to drop the vdev queue's lock during zio_execute() to + * avoid a deadlock that we could encounter due to lock order + * reversal between vq_lock and io_lock in zio_change_priority(). + */ + mutex_exit(&vq->vq_lock); + while ((dio = zio_walk_parents(aio, &zl)) != NULL) { + ASSERT3U(dio->io_type, ==, aio->io_type); + + zio_vdev_io_bypass(dio); + zio_execute(dio); + } + mutex_enter(&vq->vq_lock); + + return (aio); +} + +static zio_t * +vdev_queue_io_to_issue(vdev_queue_t *vq) +{ + zio_t *zio, *aio; + zio_priority_t p; + avl_index_t idx; + avl_tree_t *tree; + +again: + ASSERT(MUTEX_HELD(&vq->vq_lock)); + + p = vdev_queue_class_to_issue(vq); + + if (p == ZIO_PRIORITY_NUM_QUEUEABLE) { + /* No eligible queued i/os */ + return (NULL); + } + + /* + * For LBA-ordered queues (async / scrub / initializing), issue the + * i/o which follows the most recently issued i/o in LBA (offset) order. + * + * For FIFO queues (sync/trim), issue the i/o with the lowest timestamp. 
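The LBA-ordered case behaves like a one-way elevator with wrap-around; the following is a simplified sketch of the avl_nearest(..., AVL_AFTER) lookup used below (next_lba and the sorted array are hypothetical stand-ins for the AVL tree):

#include <assert.h>
#include <stdint.h>

/* Pick the first queued offset at or after the last issued offset,
 * wrapping to the lowest offset when the search runs off the end. */
static uint64_t
next_lba(const uint64_t *sorted, int n, uint64_t last_offset)
{
	int i;

	for (i = 0; i < n; i++)
		if (sorted[i] >= last_offset)
			return (sorted[i]);
	return (sorted[0]);	/* wrap around, like avl_first() */
}

int main(void)
{
	uint64_t q[3] = { 100, 400, 900 };

	assert(next_lba(q, 3, 500) == 900);	/* elevator moves forward */
	assert(next_lba(q, 3, 1000) == 100);	/* then wraps to the start */
	return (0);
}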
+ */ + tree = vdev_queue_class_tree(vq, p); + vq->vq_io_search.io_timestamp = 0; + vq->vq_io_search.io_offset = vq->vq_last_offset - 1; + VERIFY3P(avl_find(tree, &vq->vq_io_search, &idx), ==, NULL); + zio = avl_nearest(tree, idx, AVL_AFTER); + if (zio == NULL) + zio = avl_first(tree); + ASSERT3U(zio->io_priority, ==, p); + + aio = vdev_queue_aggregate(vq, zio); + if (aio != NULL) + zio = aio; + else + vdev_queue_io_remove(vq, zio); + + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (zio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(zio); + zio_execute(zio); + mutex_enter(&vq->vq_lock); + goto again; + } + + vdev_queue_pending_add(vq, zio); + vq->vq_last_offset = zio->io_offset + zio->io_size; + + return (zio); +} + +zio_t * +vdev_queue_io(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + + if (zio->io_flags & ZIO_FLAG_DONT_QUEUE) + return (zio); + + /* + * Children i/os inherit their parent's priority, which might + * not match the child's i/o type. Fix it up here. + */ + if (zio->io_type == ZIO_TYPE_READ) { + ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); + + if (zio->io_priority != ZIO_PRIORITY_SYNC_READ && + zio->io_priority != ZIO_PRIORITY_ASYNC_READ && + zio->io_priority != ZIO_PRIORITY_SCRUB && + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { + zio->io_priority = ZIO_PRIORITY_ASYNC_READ; + } + } else if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(zio->io_priority != ZIO_PRIORITY_TRIM); + + if (zio->io_priority != ZIO_PRIORITY_SYNC_WRITE && + zio->io_priority != ZIO_PRIORITY_ASYNC_WRITE && + zio->io_priority != ZIO_PRIORITY_REMOVAL && + zio->io_priority != ZIO_PRIORITY_INITIALIZING && + zio->io_priority != ZIO_PRIORITY_REBUILD) { + zio->io_priority = ZIO_PRIORITY_ASYNC_WRITE; + } + } else { + ASSERT(zio->io_type == ZIO_TYPE_TRIM); + ASSERT(zio->io_priority == ZIO_PRIORITY_TRIM); + } + + zio->io_flags |= ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE; + + mutex_enter(&vq->vq_lock); + zio->io_timestamp = gethrtime(); + vdev_queue_io_add(vq, zio); + nio = vdev_queue_io_to_issue(vq); + mutex_exit(&vq->vq_lock); + + if (nio == NULL) + return (NULL); + + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + return (NULL); + } + + return (nio); +} + +void +vdev_queue_io_done(zio_t *zio) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + zio_t *nio; + + mutex_enter(&vq->vq_lock); + + vdev_queue_pending_remove(vq, zio); + + zio->io_delta = gethrtime() - zio->io_timestamp; + vq->vq_io_complete_ts = gethrtime(); + vq->vq_io_delta_ts = vq->vq_io_complete_ts - zio->io_timestamp; + + while ((nio = vdev_queue_io_to_issue(vq)) != NULL) { + mutex_exit(&vq->vq_lock); + if (nio->io_done == vdev_queue_agg_io_done) { + zio_nowait(nio); + } else { + zio_vdev_io_reissue(nio); + zio_execute(nio); + } + mutex_enter(&vq->vq_lock); + } + + mutex_exit(&vq->vq_lock); +} + +void +vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + avl_tree_t *tree; + + /* + * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio + * code to issue IOs without adding them to the vdev queue.
In this + * case, the zio is already going to be issued as quickly as possible + * and so it doesn't need any reprioritization to help. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW) + return; + + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (zio->io_type == ZIO_TYPE_READ) { + if (priority != ZIO_PRIORITY_SYNC_READ && + priority != ZIO_PRIORITY_ASYNC_READ && + priority != ZIO_PRIORITY_SCRUB) + priority = ZIO_PRIORITY_ASYNC_READ; + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + if (priority != ZIO_PRIORITY_SYNC_WRITE && + priority != ZIO_PRIORITY_ASYNC_WRITE) + priority = ZIO_PRIORITY_ASYNC_WRITE; + } + + mutex_enter(&vq->vq_lock); + + /* + * If the zio is in none of the queues we can simply change + * the priority. If the zio is waiting to be submitted we must + * remove it from the queue and re-insert it with the new priority. + * Otherwise, the zio is currently active and we cannot change its + * priority. + */ + tree = vdev_queue_class_tree(vq, zio->io_priority); + if (avl_find(tree, zio, NULL) == zio) { + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + zio->io_priority = priority; + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + zio->io_priority = priority; + } + + mutex_exit(&vq->vq_lock); +} + +/* + * As these two methods are only used for load calculations we're not + * concerned if we get an incorrect value on 32bit platforms due to lack of + * vq_lock mutex use here, instead we prefer to keep it lock free for + * performance. + */ +int +vdev_queue_length(vdev_t *vd) +{ + return (avl_numnodes(&vd->vdev_queue.vq_active_tree)); +} + +uint64_t +vdev_queue_last_offset(vdev_t *vd) +{ + return (vd->vdev_queue.vq_last_offset); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit, INT, ZMOD_RW, + "Max vdev I/O aggregation size"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregation_limit_non_rotating, INT, ZMOD_RW, + "Max vdev I/O aggregation size for non-rotating media"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, aggregate_trim, INT, ZMOD_RW, + "Allow TRIM I/O to be aggregated"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, read_gap_limit, INT, ZMOD_RW, + "Aggregate read I/O over gap"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, write_gap_limit, INT, ZMOD_RW, + "Aggregate write I/O over gap"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, max_active, INT, ZMOD_RW, + "Maximum number of active I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_max_dirty_percent, INT, ZMOD_RW, + "Async write concurrency max threshold"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_active_min_dirty_percent, INT, ZMOD_RW, + "Async write concurrency min threshold"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_max_active, INT, ZMOD_RW, + "Max active async read I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_read_min_active, INT, ZMOD_RW, + "Min active async read I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_max_active, INT, ZMOD_RW, + "Max active async write I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, async_write_min_active, INT, ZMOD_RW, + "Min active async write I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_max_active, INT, ZMOD_RW, + "Max active initializing I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, initializing_min_active, INT, ZMOD_RW, + "Min active initializing I/Os per vdev"); + 
+ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_max_active, INT, ZMOD_RW, + "Max active removal I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, removal_min_active, INT, ZMOD_RW, + "Min active removal I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_max_active, INT, ZMOD_RW, + "Max active scrub I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, scrub_min_active, INT, ZMOD_RW, + "Min active scrub I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_max_active, INT, ZMOD_RW, + "Max active sync read I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_read_min_active, INT, ZMOD_RW, + "Min active sync read I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_max_active, INT, ZMOD_RW, + "Max active sync write I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, sync_write_min_active, INT, ZMOD_RW, + "Min active sync write I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_max_active, INT, ZMOD_RW, + "Max active trim/discard I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, trim_min_active, INT, ZMOD_RW, + "Min active trim/discard I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_max_active, INT, ZMOD_RW, + "Max active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, rebuild_min_active, INT, ZMOD_RW, + "Min active rebuild I/Os per vdev"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_vdev_, queue_depth_pct, INT, ZMOD_RW, + "Queue depth percentage for each top-level vdev"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_raidz.c b/module/zfs/vdev_raidz.c new file mode 100644 index 000000000000..8d49628053dd --- /dev/null +++ b/module/zfs/vdev_raidz.c @@ -0,0 +1,2421 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + * Copyright (c) 2016 Gvozden NeÅ¡ković. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef ZFS_DEBUG +#include /* For vdev_xlate() in vdev_raidz_io_verify() */ +#endif + +/* + * Virtual device vector for RAID-Z. + * + * This vdev supports single, double, and triple parity. For single parity, + * we use a simple XOR of all the data columns. For double or triple parity, + * we use a special case of Reed-Solomon coding. This extends the + * technique described in "The mathematics of RAID-6" by H. Peter Anvin by + * drawing on the system described in "A Tutorial on Reed-Solomon Coding for + * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the + * former is also based. The latter is designed to provide higher performance + * for writes. 
+ * + * Note that the Plank paper claimed to support arbitrary N+M, but was then + * amended six years later identifying a critical flaw that invalidates its + * claims. Nevertheless, the technique can be adapted to work for up to + * triple parity. For additional parity, the amendment "Note: Correction to + * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding + * is viable, but the additional complexity means that write performance will + * suffer. + * + * All of the methods above operate on a Galois field, defined over the + * integers mod 2^N. In our case we choose N=8 for GF(2^8) so that all elements + * can be expressed with a single byte. Briefly, the operations on the + * field are defined as follows: + * + * o addition (+) is represented by a bitwise XOR + * o subtraction (-) is therefore identical to addition: A + B = A - B + * o multiplication of A by 2 is defined by the following bitwise expression: + * + * (A * 2)_7 = A_6 + * (A * 2)_6 = A_5 + * (A * 2)_5 = A_4 + * (A * 2)_4 = A_3 + A_7 + * (A * 2)_3 = A_2 + A_7 + * (A * 2)_2 = A_1 + A_7 + * (A * 2)_1 = A_0 + * (A * 2)_0 = A_7 + * + * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * As an aside, this multiplication is derived from the error correcting + * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. + * + * Observe that any number in the field (except for 0) can be expressed as a + * power of 2 -- a generator for the field. We store a table of the powers of + * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can + * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather + * than field addition). The inverse of a field element A (A^-1) is therefore + * A ^ (255 - 1) = A^254. + * + * The up-to-three parity columns, P, Q, R over several data columns, + * D_0, ... D_n-1, can be expressed by field operations: + * + * P = D_0 + D_1 + ... + D_n-2 + D_n-1 + * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 + * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 + * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 + * + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trivial + * XOR operation, and 2 and 4 can be computed quickly and generate linearly- + * independent coefficients. (There are no additional coefficients that have + * this property which is why the uncorrected Plank method breaks down.) + * + * See the reconstruction code below for how P, Q and R can be used individually + * or in concert to recover missing data columns. + */ + +#define VDEV_RAIDZ_P 0 +#define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d.
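The byte-wise rule and the eight-bytes-at-once form (VDEV_RAIDZ_64MUL_2, defined just below) can be checked against each other with a small compilable sketch (illustrative only):

#include <assert.h>
#include <stdint.h>

/* byte-at-a-time GF(2^8) multiply by 2, as described above */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

/* the same operation on eight bytes at once, per VDEV_RAIDZ_64MUL_2 */
static uint64_t
gf_mul2_64(uint64_t x)
{
	uint64_t mask = x & 0x8080808080808080ULL;

	/* turn each set top bit into an all-ones byte lane */
	mask = (mask << 1) - (mask >> 7);
	return (((x << 1) & 0xfefefefefefefefeULL) ^
	    (mask & 0x1d1d1d1d1d1d1d1dULL));
}

int main(void)
{
	uint64_t x = 0x0123456789abcdefULL;
	uint64_t y = gf_mul2_64(x);
	int i;

	for (i = 0; i < 8; i++)
		assert((uint8_t)(y >> (8 * i)) ==
		    gf_mul2((uint8_t)(x >> (8 * i))));
	return (0);
}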
+ */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1dULL); \ +} + +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} + +void +vdev_raidz_map_free(raidz_map_t *rm) +{ + int c; + + for (c = 0; c < rm->rm_firstdatacol; c++) { + abd_free(rm->rm_col[c].rc_abd); + + if (rm->rm_col[c].rc_gdata != NULL) + abd_free(rm->rm_col[c].rc_gdata); + } + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + abd_put(rm->rm_col[c].rc_abd); + + if (rm->rm_abd_copy != NULL) + abd_free(rm->rm_abd_copy); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); +} + +static void +vdev_raidz_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT0(rm->rm_freed); + rm->rm_freed = 1; + + if (rm->rm_reports == 0) + vdev_raidz_map_free(rm); +} + +/*ARGSUSED*/ +static void +vdev_raidz_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed != 0) + vdev_raidz_map_free(rm); +} + +static void +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const abd_t *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + const size_t c = zcr->zcr_cbinfo; + size_t x, offset; + + const abd_t *good = NULL; + const abd_t *bad = rm->rm_col[c].rc_abd; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + if (c < rm->rm_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical ZIO) + */ + if (rm->rm_col[0].rc_gdata == NULL) { + abd_t *bad_parity[VDEV_RAIDZ_MAXPARITY]; + + /* + * Set up the rm_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. + */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + bad_parity[x] = rm->rm_col[x].rc_abd; + rm->rm_col[x].rc_abd = + rm->rm_col[x].rc_gdata = + abd_alloc_sametype(rm->rm_col[x].rc_abd, + rm->rm_col[x].rc_size); + } + + /* fill in the data columns from good_data */ + offset = 0; + for (; x < rm->rm_cols; x++) { + abd_put(rm->rm_col[x].rc_abd); + + rm->rm_col[x].rc_abd = + abd_get_offset_size((abd_t *)good_data, + offset, rm->rm_col[x].rc_size); + offset += rm->rm_col[x].rc_size; + } + + /* + * Construct the parity from the good data. 
+ */ + vdev_raidz_generate_parity(rm); + + /* restore everything back to its original state */ + for (x = 0; x < rm->rm_firstdatacol; x++) + rm->rm_col[x].rc_abd = bad_parity[x]; + + offset = 0; + for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { + abd_put(rm->rm_col[x].rc_abd); + rm->rm_col[x].rc_abd = abd_get_offset_size( + rm->rm_abd_copy, offset, + rm->rm_col[x].rc_size); + offset += rm->rm_col[x].rc_size; + } + } + + ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); + good = abd_get_offset_size(rm->rm_col[c].rc_gdata, 0, + rm->rm_col[c].rc_size); + } else { + /* adjust good_data to point at the start of our column */ + offset = 0; + for (x = rm->rm_firstdatacol; x < c; x++) + offset += rm->rm_col[x].rc_size; + + good = abd_get_offset_size((abd_t *)good_data, offset, + rm->rm_col[c].rc_size); + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); + abd_put((abd_t *)good); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_raidz_cksum_finish() time we can compare it with the good data. + */ +static void +vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + size_t offset; + + raidz_map_t *rm = zio->io_vsd; + size_t size; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_raidz_cksum_finish; + zcr->zcr_free = vdev_raidz_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_abd_copy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. + * + * Our parity data is already in separate buffers, so there's no need + * to copy them. + */ + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + rm->rm_abd_copy = abd_alloc_for_io(size, B_FALSE); + + for (offset = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + abd_t *tmp = abd_get_offset_size(rm->rm_abd_copy, offset, + col->rc_size); + + abd_copy(tmp, col->rc_abd, col->rc_size); + + abd_put(col->rc_abd); + col->rc_abd = tmp; + + offset += col->rc_size; + } + ASSERT3U(offset, ==, size); +} + +static const zio_vsd_ops_t vdev_raidz_vsd_ops = { + .vsd_free = vdev_raidz_map_free_vsd, + .vsd_cksum_report = vdev_raidz_cksum_report +}; + +/* + * Divides the IO evenly across all child vdevs; usually, dcols is + * the number of children in the target vdev. + * + * Avoid inlining the function to keep vdev_raidz_io_start(), which + * is this function's only caller, as small as possible on the stack. + */ +noinline raidz_map_t * +vdev_raidz_map_alloc(zio_t *zio, uint64_t ashift, uint64_t dcols, + uint64_t nparity) +{ + raidz_map_t *rm; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = zio->io_offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = zio->io_size >> ashift; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + /* The starting byte offset on each child vdev.
*/ + uint64_t o = (b / dcols) << ashift; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; + uint64_t off = 0; + + /* + * "Quotient": The number of data sectors for this stripe on all but + * the "big column" child vdevs that also contain "remainder" data. + */ + q = s / (dcols - nparity); + + /* + * "Remainder": The number of partial stripe data sectors in this I/O. + * This will add a sector to some, but not all, child vdevs. + */ + r = s - q * (dcols - nparity); + + /* The number of "big columns" - those which contain remainder data. */ + bc = (r == 0 ? 0 : r + nparity); + + /* + * The total number of data and parity sectors associated with + * this I/O. + */ + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + /* acols: The columns that will be accessed. */ + /* scols: The columns that will be accessed or skipped. */ + if (q == 0) { + /* Our I/O request doesn't span all child vdevs. */ + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } + + ASSERT3U(acols, <=, scols); + + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); + + rm->rm_cols = acols; + rm->rm_scols = scols; + rm->rm_bigcols = bc; + rm->rm_skipstart = bc; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + rm->rm_firstdatacol = nparity; + rm->rm_abd_copy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; + + asize = 0; + + for (c = 0; c < scols; c++) { + col = f + c; + coff = o; + if (col >= dcols) { + col -= dcols; + coff += 1ULL << ashift; + } + rm->rm_col[c].rc_devidx = col; + rm->rm_col[c].rc_offset = coff; + rm->rm_col[c].rc_abd = NULL; + rm->rm_col[c].rc_gdata = NULL; + rm->rm_col[c].rc_error = 0; + rm->rm_col[c].rc_tried = 0; + rm->rm_col[c].rc_skipped = 0; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << ashift; + else + rm->rm_col[c].rc_size = q << ashift; + + asize += rm->rm_col[c].rc_size; + } + + ASSERT3U(asize, ==, tot << ashift); + rm->rm_asize = roundup(asize, (nparity + 1) << ashift); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << ashift); + ASSERT3U(rm->rm_nskip, <=, nparity); + + for (c = 0; c < rm->rm_firstdatacol; c++) + rm->rm_col[c].rc_abd = + abd_alloc_linear(rm->rm_col[c].rc_size, B_FALSE); + + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, 0, + rm->rm_col[c].rc_size); + off = rm->rm_col[c].rc_size; + + for (c = c + 1; c < acols; c++) { + rm->rm_col[c].rc_abd = abd_get_offset_size(zio->io_abd, off, + rm->rm_col[c].rc_size); + off += rm->rm_col[c].rc_size; + } + + /* + * If all data stored spans all columns, there's a danger that parity + * will always be on the same device and, since parity isn't read + * during normal operation, that device's I/O bandwidth won't be + * used effectively. We therefore switch the parity every 1MB. + * + * ... at least that was, ostensibly, the theory. As a practical + * matter unless we juggle the parity between all devices evenly, we + * won't see any benefit. Further, occasional writes that aren't a + * multiple of the LCM of the number of children and the minimum + * stripe width are sufficient to avoid pessimal behavior. + * Unfortunately, this decision created an implicit on-disk format + * requirement that we need to support for all eternity, but only + * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. 
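To make the stripe geometry above concrete, a hypothetical worked example (raidz1, with values chosen purely for illustration):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* raidz1 with 5 children, writing 6 sectors of data */
	uint64_t dcols = 5, nparity = 1, s = 6;
	uint64_t q = s / (dcols - nparity);		/* 1 full row */
	uint64_t r = s - q * (dcols - nparity);		/* 2 leftover */
	uint64_t bc = (r == 0 ? 0 : r + nparity);	/* 3 big columns */
	uint64_t tot = s + nparity * (q + (r == 0 ? 0 : 1));

	assert(q == 1 && r == 2 && bc == 3);
	assert(tot == 8);	/* 6 data + 2 parity sectors on disk */
	return (0);
}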
We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. + */ + ASSERT(rm->rm_cols >= 2); + ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); + + if (rm->rm_firstdatacol == 1 && (zio->io_offset & (1ULL << 20))) { + devidx = rm->rm_col[0].rc_devidx; + o = rm->rm_col[0].rc_offset; + rm->rm_col[0].rc_devidx = rm->rm_col[1].rc_devidx; + rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; + rm->rm_col[1].rc_devidx = devidx; + rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; + } + + zio->io_vsd = rm; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; + + /* init RAIDZ parity ops */ + rm->rm_ops = vdev_raidz_math_get_ops(); + + return (rm); +} + +struct pqr_struct { + uint64_t *p; + uint64_t *q; + uint64_t *r; +}; + +static int +vdev_raidz_p_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && !pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++) + *pqr->p ^= *src; + + return (0); +} + +static int +vdev_raidz_pq_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && !pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + } + + return (0); +} + +static int +vdev_raidz_pqr_func(void *buf, size_t size, void *private) +{ + struct pqr_struct *pqr = private; + const uint64_t *src = buf; + uint64_t mask; + int i, cnt = size / sizeof (src[0]); + + ASSERT(pqr->p && pqr->q && pqr->r); + + for (i = 0; i < cnt; i++, src++, pqr->p++, pqr->q++, pqr->r++) { + *pqr->p ^= *src; + VDEV_RAIDZ_64MUL_2(*pqr->q, mask); + *pqr->q ^= *src; + VDEV_RAIDZ_64MUL_4(*pqr->r, mask); + *pqr->r ^= *src; + } + + return (0); +} + +static void +vdev_raidz_generate_parity_p(raidz_map_t *rm) +{ + uint64_t *p; + int c; + abd_t *src; + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + + if (c == rm->rm_firstdatacol) { + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + } else { + struct pqr_struct pqr = { p, NULL, NULL }; + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_p_func, &pqr); + } + } +} + +static void +vdev_raidz_generate_parity_pq(raidz_map_t *rm) +{ + uint64_t *p, *q, pcnt, ccnt, mask, i; + int c; + abd_t *src; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + } + } else { + struct pqr_struct pqr = { p, q, NULL }; + + ASSERT(ccnt <= pcnt); + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pq_func, &pqr); + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
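Treating a short column as zero is just the Horner evaluation of Q from the header comment with D_c = 0: the running Q must still be doubled for that round. A byte-wide spot-check (gf_mul2 restates the multiply-by-2 rule from above; illustrative only):

#include <assert.h>
#include <stdint.h>

static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int main(void)
{
	/* D_1 is a "short" column treated as zero: Q still doubles */
	uint8_t d[3] = { 0x01, 0x00, 0x03 };
	uint8_t q = 0;
	int c;

	for (c = 0; c < 3; c++)
		q = gf_mul2(q) ^ d[c];	/* Horner: q = q*2 + D_c */
	/* Q = 0x01*4 ^ 0x00*2 ^ 0x03 = 0x04 ^ 0x03 = 0x07 */
	assert(q == 0x07);
	return (0);
}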
+ */ + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, pcnt, ccnt, mask, i; + int c; + abd_t *src; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (p[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_abd; + p = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + q = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + r = abd_to_buf(rm->rm_col[VDEV_RAIDZ_R].rc_abd); + + ccnt = rm->rm_col[c].rc_size / sizeof (p[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + abd_copy_to_buf(p, src, rm->rm_col[c].rc_size); + (void) memcpy(q, p, rm->rm_col[c].rc_size); + (void) memcpy(r, p, rm->rm_col[c].rc_size); + + for (i = ccnt; i < pcnt; i++) { + p[i] = 0; + q[i] = 0; + r[i] = 0; + } + } else { + struct pqr_struct pqr = { p, q, r }; + + ASSERT(ccnt <= pcnt); + (void) abd_iterate_func(src, 0, rm->rm_col[c].rc_size, + vdev_raidz_pqr_func, &pqr); + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. + */ + for (i = ccnt; i < pcnt; i++) { + VDEV_RAIDZ_64MUL_2(q[i], mask); + VDEV_RAIDZ_64MUL_4(r[i], mask); + } + } + } +} + +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. + */ +void +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + /* Generate using the new math implementation */ + if (vdev_raidz_math_generate(rm) != RAIDZ_ORIGINAL_IMPL) + return; + + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_p_func(void *dbuf, void *sbuf, size_t size, void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + int cnt = size / sizeof (src[0]); + + for (int i = 0; i < cnt; i++) { + dst[i] ^= src[i]; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_func(void *dbuf, void *sbuf, size_t size, + void *private) +{ + uint64_t *dst = dbuf; + uint64_t *src = sbuf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++, src++) { + VDEV_RAIDZ_64MUL_2(*dst, mask); + *dst ^= *src; + } + + return (0); +} + +/* ARGSUSED */ +static int +vdev_raidz_reconst_q_pre_tail_func(void *buf, size_t size, void *private) +{ + uint64_t *dst = buf; + uint64_t mask; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++) { + /* same operation as vdev_raidz_reconst_q_pre_func() on dst */ + VDEV_RAIDZ_64MUL_2(*dst, mask); + } + + return (0); +} + +struct reconst_q_struct { + uint64_t *q; + int exp; +}; + +static int +vdev_raidz_reconst_q_post_func(void *buf, size_t size, void *private) +{ + struct reconst_q_struct *rq = private; + uint64_t *dst = buf; + int cnt = size / sizeof (dst[0]); + + for (int i = 0; i < cnt; i++, dst++, rq->q++) { + int j; + uint8_t *b; + + *dst ^= *rq->q; + for (j = 0, b = (uint8_t *)dst; j < 8; j++, b++) { + *b = vdev_raidz_exp2(*b, rq->exp); + } + } + + return (0); +} + +struct reconst_pq_struct { + uint8_t *p; + uint8_t *q; + uint8_t *pxy; + uint8_t *qxy; + int aexp; + int bexp; +}; + +static 
int +vdev_raidz_reconst_pq_func(void *xbuf, void *ybuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + uint8_t *yd = ybuf; + + for (int i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++, yd++) { + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + *yd = *rpq->p ^ *rpq->pxy ^ *xd; + } + + return (0); +} + +static int +vdev_raidz_reconst_pq_tail_func(void *xbuf, size_t size, void *private) +{ + struct reconst_pq_struct *rpq = private; + uint8_t *xd = xbuf; + + for (int i = 0; i < size; + i++, rpq->p++, rpq->q++, rpq->pxy++, rpq->qxy++, xd++) { + /* same operation as vdev_raidz_reconst_pq_func() on xd */ + *xd = vdev_raidz_exp2(*rpq->p ^ *rpq->pxy, rpq->aexp) ^ + vdev_raidz_exp2(*rpq->q ^ *rpq->qxy, rpq->bexp); + } + + return (0); +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) +{ + int x = tgts[0]; + int c; + abd_t *dst, *src; + + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_P].rc_size); + ASSERT(rm->rm_col[x].rc_size > 0); + + src = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + dst = rm->rm_col[x].rc_abd; + + abd_copy_from_buf(dst, abd_to_buf(src), rm->rm_col[x].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; + + if (c == x) + continue; + + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_p_func, NULL); + } + + return (1 << VDEV_RAIDZ_P); +} + +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) +{ + int x = tgts[0]; + int c, exp; + abd_t *dst, *src; + + ASSERT(ntgts == 1); + + ASSERT(rm->rm_col[x].rc_size <= rm->rm_col[VDEV_RAIDZ_Q].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + uint64_t size = (c == x) ? 0 : MIN(rm->rm_col[x].rc_size, + rm->rm_col[c].rc_size); + + src = rm->rm_col[c].rc_abd; + dst = rm->rm_col[x].rc_abd; + + if (c == rm->rm_firstdatacol) { + abd_copy(dst, src, size); + if (rm->rm_col[x].rc_size > size) + abd_zero_off(dst, size, + rm->rm_col[x].rc_size - size); + + } else { + ASSERT3U(size, <=, rm->rm_col[x].rc_size); + (void) abd_iterate_func2(dst, src, 0, 0, size, + vdev_raidz_reconst_q_pre_func, NULL); + (void) abd_iterate_func(dst, + size, rm->rm_col[x].rc_size - size, + vdev_raidz_reconst_q_pre_tail_func, NULL); + } + } + + src = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + dst = rm->rm_col[x].rc_abd; + exp = 255 - (rm->rm_cols - 1 - x); + + struct reconst_q_struct rq = { abd_to_buf(src), exp }; + (void) abd_iterate_func(dst, 0, rm->rm_col[x].rc_size, + vdev_raidz_reconst_q_post_func, &rq); + + return (1 << VDEV_RAIDZ_Q); +} + +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) +{ + uint8_t *p, *q, *pxy, *qxy, tmp, a, b, aexp, bexp; + abd_t *pdata, *qdata; + uint64_t xsize, ysize; + int x = tgts[0]; + int y = tgts[1]; + abd_t *xd, *yd; + + ASSERT(ntgts == 2); + ASSERT(x < y); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(y < rm->rm_cols); + + ASSERT(rm->rm_col[x].rc_size >= rm->rm_col[y].rc_size); + + /* + * Move the parity data aside -- we're going to compute parity as + * though columns x and y were full of zeros -- Pxy and Qxy. 
We want to + * reuse the parity generation mechanism without trashing the actual + * parity so we make those columns appear to be full of zeros by + * setting their lengths to zero. + */ + pdata = rm->rm_col[VDEV_RAIDZ_P].rc_abd; + qdata = rm->rm_col[VDEV_RAIDZ_Q].rc_abd; + xsize = rm->rm_col[x].rc_size; + ysize = rm->rm_col[y].rc_size; + + rm->rm_col[VDEV_RAIDZ_P].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_P].rc_size, B_TRUE); + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = + abd_alloc_linear(rm->rm_col[VDEV_RAIDZ_Q].rc_size, B_TRUE); + rm->rm_col[x].rc_size = 0; + rm->rm_col[y].rc_size = 0; + + vdev_raidz_generate_parity_pq(rm); + + rm->rm_col[x].rc_size = xsize; + rm->rm_col[y].rc_size = ysize; + + p = abd_to_buf(pdata); + q = abd_to_buf(qdata); + pxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + qxy = abd_to_buf(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + xd = rm->rm_col[x].rc_abd; + yd = rm->rm_col[y].rc_abd; + + /* + * We now have: + * Pxy = P + D_x + D_y + * Qxy = Q + 2^(ndevs - 1 - x) * D_x + 2^(ndevs - 1 - y) * D_y + * + * We can then solve for D_x: + * D_x = A * (P + Pxy) + B * (Q + Qxy) + * where + * A = 2^(x - y) * (2^(x - y) + 1)^-1 + * B = 2^(ndevs - 1 - x) * (2^(x - y) + 1)^-1 + * + * With D_x in hand, we can easily solve for D_y: + * D_y = P + Pxy + D_x + */ + + a = vdev_raidz_pow2[255 + x - y]; + b = vdev_raidz_pow2[255 - (rm->rm_cols - 1 - x)]; + tmp = 255 - vdev_raidz_log2[a ^ 1]; + + aexp = vdev_raidz_log2[vdev_raidz_exp2(a, tmp)]; + bexp = vdev_raidz_log2[vdev_raidz_exp2(b, tmp)]; + + ASSERT3U(xsize, >=, ysize); + struct reconst_pq_struct rpq = { p, q, pxy, qxy, aexp, bexp }; + + (void) abd_iterate_func2(xd, yd, 0, 0, ysize, + vdev_raidz_reconst_pq_func, &rpq); + (void) abd_iterate_func(xd, ysize, xsize - ysize, + vdev_raidz_reconst_pq_tail_func, &rpq); + + abd_free(rm->rm_col[VDEV_RAIDZ_P].rc_abd); + abd_free(rm->rm_col[VDEV_RAIDZ_Q].rc_abd); + + /* + * Restore the saved parity data. + */ + rm->rm_col[VDEV_RAIDZ_P].rc_abd = pdata; + rm->rm_col[VDEV_RAIDZ_Q].rc_abd = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coefficients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a Vandermonde + * matrix defined by the coefficients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen for simplicity, speedy + * computation, and linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values.
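+ *
+ * As a minimal illustration (added for exposition, not part of the
+ * original derivation): with n = 2 data columns and single parity,
+ * the system is
+ *
+ *	__     __              __    __
+ *	| 1  1 |   __     __   | p_0 |
+ *	| 1  0 | x | D_0 | =   | d_0 |
+ *	| 0  1 |   | D_1 |     | d_1 |
+ *	~~     ~~  ~~     ~~   ~~    ~~
+ *
+ * and losing any single row still leaves an invertible 2x2 system
+ * (all arithmetic is in GF(2^8), where addition is XOR). The same
+ * procedure, generalized, is carried out below.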
We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. + * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 
0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. + * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. + */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT0(rows[i][j]); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data that is left in the rows is properly part of + * an identity matrix.
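+ *
+ * (Added note: the identical row operations were applied to invrows
+ * above, so once rows has been reduced to the identity here, invrows
+ * necessarily holds the inverse of the original rows of interest.)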
+ */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT0(rows[i][j]); + } + } + } +} + +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY] = { NULL }; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY] = { 0 }; + uint8_t log = 0; + uint8_t val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = abd_to_buf(rm->rm_col[c].rc_abd); + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = abd_to_buf(rm->rm_col[cc].rc_abd); + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + abd_t **bufs = NULL; + + int code = 0; + + /* + * Matrix reconstruction can't use scatter ABDs yet, so we allocate + * temporary linear ABDs. + */ + if (!abd_is_linear(rm->rm_col[rm->rm_firstdatacol].rc_abd)) { + bufs = kmem_alloc(rm->rm_cols * sizeof (abd_t *), KM_PUSHPAGE); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bufs[c] = col->rc_abd; + col->rc_abd = abd_alloc_linear(col->rc_size, B_TRUE); + abd_copy(col->rc_abd, bufs[c], col->rc_size); + } + } + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. 
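+ * For example (illustrative): with triple parity, if two data
+ * columns and the Q parity column (c == 1) are all targeted, this
+ * loop selects P (c == 0) and R (c == 2), leaving
+ * code == (1 << 0) | (1 << 2) == 5.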
+ */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. + */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. + */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + /* + * copy back from temporary linear abds and free them + */ + if (bufs) { + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + abd_copy(bufs[c], col->rc_abd, col->rc_size); + abd_free(col->rc_abd); + col->rc_abd = bufs[c]; + } + kmem_free(bufs, rm->rm_cols * sizeof (abd_t *)); + } + + return (code); +} + +int +vdev_raidz_reconstruct(raidz_map_t *rm, const int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c, ret; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* Reconstruct using the new math implementation */ + ret = vdev_raidz_math_reconstruct(rm, parity_valid, dt, nbaddata); + if (ret != RAIDZ_ORIGINAL_IMPL) + return (ret); + + /* + * See if we can use any of our optimized reconstruction routines. 
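+ *
+ * (Summary, added for exposition: a single missing data column is
+ * repaired from P when P is valid, otherwise from Q; two missing
+ * data columns require both P and Q; every other case falls through
+ * to the general matrix inversion in
+ * vdev_raidz_reconstruct_general().)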
+ */ + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} + +static int +vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *logical_ashift, uint64_t *physical_ashift) +{ + vdev_t *cvd; + uint64_t nparity = vd->vdev_nparity; + int c; + int lasterror = 0; + int numerrors = 0; + + ASSERT(nparity > 0); + + if (nparity > VDEV_RAIDZ_MAXPARITY || + vd->vdev_children < nparity + 1) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + vdev_open_children(vd); + + for (c = 0; c < vd->vdev_children; c++) { + cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; + numerrors++; + continue; + } + + *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; + *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); + } + + *asize *= vd->vdev_children; + *max_asize *= vd->vdev_children; + + if (numerrors > nparity) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + return (0); +} + +static void +vdev_raidz_close(vdev_t *vd) +{ + int c; + + for (c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static uint64_t +vdev_raidz_asize(vdev_t *vd, uint64_t psize) +{ + uint64_t asize; + uint64_t ashift = vd->vdev_top->vdev_ashift; + uint64_t cols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + + asize = ((psize - 1) >> ashift) + 1; + asize += nparity * ((asize + cols - nparity - 1) / (cols - nparity)); + asize = roundup(asize, nparity + 1) << ashift; + + return (asize); +} + +static void +vdev_raidz_child_done(zio_t *zio) +{ + raidz_col_t *rc = zio->io_private; + + rc->rc_error = zio->io_error; + rc->rc_tried = 1; + rc->rc_skipped = 0; +} + +static void +vdev_raidz_io_verify(zio_t *zio, raidz_map_t *rm, int col) +{ +#ifdef ZFS_DEBUG + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + + range_seg64_t logical_rs, physical_rs; + logical_rs.rs_start = zio->io_offset; + logical_rs.rs_end = logical_rs.rs_start + + vdev_raidz_asize(zio->io_vd, zio->io_size); + + raidz_col_t *rc = &rm->rm_col[col]; + vdev_t *cvd = vd->vdev_child[rc->rc_devidx]; + + vdev_xlate(cvd, &logical_rs, &physical_rs); + ASSERT3U(rc->rc_offset, ==, physical_rs.rs_start); + ASSERT3U(rc->rc_offset, <, physical_rs.rs_end); + /* + * It would be nice to assert that rs_end is equal + * to rc_offset + rc_size but there might be an + * optional I/O at the end that is not accounted in + * rc_size. + */ + if (physical_rs.rs_end > rc->rc_offset + rc->rc_size) { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + + rc->rc_size + (1 << tvd->vdev_ashift)); + } else { + ASSERT3U(physical_rs.rs_end, ==, rc->rc_offset + rc->rc_size); + } +#endif +} + +/* + * Start an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Generate the parity data + * 2. 
Create child zio write operations to each column's vdev, for both + * data and parity. + * 3. If the column skips any sectors for padding, create optional dummy + * write zio children for those areas to improve aggregation continuity. + * - For read operations: + * 1. Create child zio read operations to each data column's vdev to read + * the range of data required for zio. + * 2. If this is a scrub or resilver operation, or if any of the data + * vdevs have had errors, then create zio read operations to the parity + * columns' VDevs as well. + */ +static void +vdev_raidz_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *tvd = vd->vdev_top; + vdev_t *cvd; + raidz_map_t *rm; + raidz_col_t *rc; + int c, i; + + rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, + vd->vdev_nparity); + + ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); + + if (zio->io_type == ZIO_TYPE_WRITE) { + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + /* + * Verify physical to logical translation. + */ + vdev_raidz_io_verify(zio, rm, c); + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. + */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + + zio_execute(zio); + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + + /* + * Iterate over the columns in reverse order so that we hit the parity + * last -- any errors along the way will force us to read the parity. + */ + for (c = rm->rm_cols - 1; c >= 0; c--) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (!vdev_readable(cvd)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ENXIO); + rc->rc_tried = 1; /* don't even try */ + rc->rc_skipped = 1; + continue; + } + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { + if (c >= rm->rm_firstdatacol) + rm->rm_missingdata++; + else + rm->rm_missingparity++; + rc->rc_error = SET_ERROR(ESTALE); + rc->rc_skipped = 1; + continue; + } + if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } + } + + zio_execute(zio); +} + + +/* + * Report a checksum error for a child of a RAID-Z device. 
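+ *
+ * (Added note: speculative I/Os are deliberately excluded below;
+ * for them neither the vdev checksum-error count is bumped nor an
+ * ereport posted.)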
+ */ +static void +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, abd_t *bad_data) +{ + vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + mutex_enter(&vd->vdev_stat_lock); + vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&vd->vdev_stat_lock); + + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + zfs_ereport_post_checksum(zio->io_spa, vd, + &zio->io_bookmark, zio, rc->rc_offset, rc->rc_size, + rc->rc_abd, bad_data, &zbc); + } +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + bzero(&zbc, sizeof (zio_bad_cksum_t)); + + int ret = zio_checksum_error(zio, &zbc); + if (ret != 0 && zbc.zbc_injected != 0) + rm->rm_ecksuminjected = 1; + + return (ret); +} + +/* + * Generate the parity from the data columns. If we tried and were able to + * read the parity without error, verify that the generated parity matches the + * data we read. If it doesn't, we fire off a checksum error. Return the + * number of such failures. + */ +static int +raidz_parity_verify(zio_t *zio, raidz_map_t *rm) +{ + abd_t *orig[VDEV_RAIDZ_MAXPARITY]; + int c, ret = 0; + raidz_col_t *rc; + + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + + if (checksum == ZIO_CHECKSUM_NOPARITY) + return (ret); + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + + orig[c] = abd_alloc_sametype(rc->rc_abd, rc->rc_size); + abd_copy(orig[c], rc->rc_abd, rc->rc_size); + } + + vdev_raidz_generate_parity(rm); + + for (c = 0; c < rm->rm_firstdatacol; c++) { + rc = &rm->rm_col[c]; + if (!rc->rc_tried || rc->rc_error != 0) + continue; + if (abd_cmp(orig[c], rc->rc_abd) != 0) { + raidz_checksum_error(zio, rc, orig[c]); + rc->rc_error = SET_ERROR(ECKSUM); + ret++; + } + abd_free(orig[c]); + } + + return (ret); +} + +static int +vdev_raidz_worst_error(raidz_map_t *rm) +{ + int error = 0; + + for (int c = 0; c < rm->rm_cols; c++) + error = zio_worst_error(error, rm->rm_col[c].rc_error); + + return (error); +} + +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + abd_t *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int curr, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column.
To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = abd_alloc_sametype(rm->rm_col[0].rc_abd, + rm->rm_col[0].rc_size); + + curr = 0; + next = tgts[curr]; + + while (curr != n) { + tgts[curr] = next; + curr = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + abd_copy(orig[i], rc->rc_abd, rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (raidz_checksum_verify(zio) == 0) { + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc, + orig[i]); + rc->rc_error = SET_ERROR(ECKSUM); + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + abd_copy(rc->rc_abd, orig[i], rc->rc_size); + } + + do { + /* + * Find the next valid column after the curr + * position. + */ + for (next = tgts[curr] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[curr + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[curr + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. + */ + for (c = tgts[curr - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[curr] = c; + curr++; + + } while (curr != n); + } + } + n--; +done: + for (i = 0; i < n; i++) + abd_free(orig[i]); + + return (ret); +} + +/* + * Complete an IO operation on a RAIDZ VDev + * + * Outline: + * - For write operations: + * 1. Check for errors on the child IOs. + * 2. Return, setting an error code if too few child VDevs were written + * to reconstruct the data later. Note that partial writes are + * considered successful if they can be reconstructed at all. + * - For read operations: + * 1. Check for errors on the child IOs. + * 2. If data errors occurred: + * a. Try to reassemble the data from the parity available. + * b. If we haven't yet read the parity drives, read them now. + * c. If all parity drives have been read but the data still doesn't + * reassemble with a correct checksum, then try combinatorial + * reconstruction. + * d. If that doesn't work, return an error. + * 3. If there were unexpected errors or this is a resilver operation, + * rewrite the vdevs that had errors.
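+ *
+ * (Added note: step 2c is combinatorial; for an n-column map with
+ * p parity columns it may attempt on the order of C(n, p)
+ * reconstructions before giving up, which is why it is tried only
+ * as a last resort.)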
+ */ +static void +vdev_raidz_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_t *cvd; + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc = NULL; + int unexpected_errors = 0; + int parity_errors = 0; + int parity_untried = 0; + int data_errors = 0; + int total_errors = 0; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; + + ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ + + ASSERT(rm->rm_missingparity <= rm->rm_firstdatacol); + ASSERT(rm->rm_missingdata <= rm->rm_cols - rm->rm_firstdatacol); + + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + + if (rc->rc_error) { + ASSERT(rc->rc_error != ECKSUM); /* child has no bp */ + + if (c < rm->rm_firstdatacol) + parity_errors++; + else + data_errors++; + + if (!rc->rc_skipped) + unexpected_errors++; + + total_errors++; + } else if (c < rm->rm_firstdatacol && !rc->rc_tried) { + parity_untried++; + } + } + + if (zio->io_type == ZIO_TYPE_WRITE) { + /* + * XXX -- for now, treat partial writes as a success. + * (If we couldn't write enough columns to reconstruct + * the data, the I/O failed. Otherwise, good enough.) + * + * Now that we support write reallocation, it would be better + * to treat partial failure as real failure unless there are + * no non-degraded top-level vdevs left, and not update DTLs + * if we intend to reallocate. + */ + /* XXPOLICY */ + if (total_errors > rm->rm_firstdatacol) + zio->io_error = vdev_raidz_worst_error(rm); + + return; + } + + ASSERT(zio->io_type == ZIO_TYPE_READ); + /* + * There are three potential phases for a read: + * 1. produce valid data from the columns read + * 2. read all disks and try again + * 3. perform combinatorial reconstruction + * + * Each phase is progressively both more expensive and less likely to + * occur. If we encounter more errors than we can repair or all phases + * fail, we have no choice but to return an error. + */ + + /* + * If the number of errors we saw was correctable -- less than or equal + * to the number of parity disks read -- attempt to produce data that + * has a valid checksum. Naturally, this case applies in the absence of + * any errors. + */ + if (total_errors <= rm->rm_firstdatacol - parity_untried) { + if (data_errors == 0) { + if (raidz_checksum_verify(zio) == 0) { + /* + * If we read parity information (unnecessarily + * as it happens since no reconstruction was + * needed) regenerate and verify the parity. + * We also regenerate parity when resilvering + * so we can write it out to the failed device + * later. + */ + if (parity_errors + parity_untried < + rm->rm_firstdatacol || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + goto done; + } + } else { + /* + * We either attempt to read all the parity columns or + * none of them. If we didn't try to read parity, we + * wouldn't be here in the correctable case. There must + * also have been fewer parity errors than parity + * columns or, again, we wouldn't be in this code path. + */ + ASSERT(parity_untried == 0); + ASSERT(parity_errors < rm->rm_firstdatacol); + + /* + * Identify the data columns that reported an error. 
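+ * For instance (illustrative), errors in data columns 4 and 6
+ * would yield tgts = { 4, 6 } and n = 2 below.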
+ */ + n = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } + } + + ASSERT(rm->rm_firstdatacol >= n); + + code = vdev_raidz_reconstruct(rm, tgts, n); + + if (raidz_checksum_verify(zio) == 0) { + /* + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. + */ + if (parity_errors < rm->rm_firstdatacol - n || + (zio->io_flags & ZIO_FLAG_RESILVER)) { + n = raidz_parity_verify(zio, rm); + unexpected_errors += n; + ASSERT(parity_errors + n <= + rm->rm_firstdatacol); + } + + goto done; + } + } + } + + /* + * This isn't a typical situation -- either we got a read error or + * a child silently returned bad data. Read every block so we can + * try again with as much data and parity as we can track down. If + * we've already been through once before, all children will be marked + * as tried so we'll proceed to combinatorial reconstruction. + */ + unexpected_errors = 1; + rm->rm_missingdata = 0; + rm->rm_missingparity = 0; + + for (c = 0; c < rm->rm_cols; c++) { + if (rm->rm_col[c].rc_tried) + continue; + + zio_vdev_io_redone(zio); + do { + rc = &rm->rm_col[c]; + if (rc->rc_tried) + continue; + zio_nowait(zio_vdev_child_io(zio, NULL, + vd->vdev_child[rc->rc_devidx], + rc->rc_offset, rc->rc_abd, rc->rc_size, + zio->io_type, zio->io_priority, 0, + vdev_raidz_child_done, rc)); + } while (++c < rm->rm_cols); + + return; + } + + /* + * At this point we've attempted to reconstruct the data given the + * errors we detected, and we've attempted to read all columns. There + * must, therefore, be one or more additional problems -- silent errors + * resulting in invalid data rather than explicit I/O errors resulting + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. + */ + if (total_errors > rm->rm_firstdatacol) { + zio->io_error = vdev_raidz_worst_error(rm); + + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { + /* + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. + */ + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { + /* + * We're here because either: + * + * total_errors == rm->rm_firstdatacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, provided the I/O wasn't speculative.
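+ * Every child that did not fail outright is implicated because,
+ * once reconstruction has failed, we cannot tell which of them
+ * silently returned bad data.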
+ */ + zio->io_error = SET_ERROR(ECKSUM); + + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + vdev_t *cvd; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + mutex_enter(&cvd->vdev_stat_lock); + cvd->vdev_stat.vs_checksum_errors++; + mutex_exit(&cvd->vdev_stat_lock); + + zfs_ereport_start_checksum( + zio->io_spa, cvd, + &zio->io_bookmark, zio, + rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); + } + } + } + } + +done: + zio_checksum_verified(zio); + + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && + (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { + /* + * Use the good data we have in hand to repair damaged children. + */ + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + + if (rc->rc_error == 0) + continue; + + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset, rc->rc_abd, rc->rc_size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); + } + } +} + +static void +vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (faulted > vd->vdev_nparity) + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + else if (degraded + faulted != 0) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + else + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); +} + +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty which implies that full stripe + * width blocks must be resilvered. + */ +static boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t ashift = vd->vdev_top->vdev_ashift; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = ((psize - 1) >> ashift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + + if (s + nparity >= dcols) + return (B_TRUE); + + for (uint64_t c = 0; c < s + nparity; c++) { + uint64_t devidx = (f + c) % dcols; + vdev_t *cvd = vd->vdev_child[devidx]; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. 
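+ * Erring toward B_TRUE here is safe: the worst case is an
+ * unnecessary repair write, not a missed resilver.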
+ */ + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static void +vdev_raidz_xlate(vdev_t *cvd, const range_seg64_t *in, range_seg64_t *res) +{ + vdev_t *raidvd = cvd->vdev_parent; + ASSERT(raidvd->vdev_ops == &vdev_raidz_ops); + + uint64_t width = raidvd->vdev_children; + uint64_t tgt_col = cvd->vdev_id; + uint64_t ashift = raidvd->vdev_top->vdev_ashift; + + /* make sure the offsets are block-aligned */ + ASSERT0(in->rs_start % (1 << ashift)); + ASSERT0(in->rs_end % (1 << ashift)); + uint64_t b_start = in->rs_start >> ashift; + uint64_t b_end = in->rs_end >> ashift; + + uint64_t start_row = 0; + if (b_start > tgt_col) /* avoid underflow */ + start_row = ((b_start - tgt_col - 1) / width) + 1; + + uint64_t end_row = 0; + if (b_end > tgt_col) + end_row = ((b_end - tgt_col - 1) / width) + 1; + + res->rs_start = start_row << ashift; + res->rs_end = end_row << ashift; + + ASSERT3U(res->rs_start, <=, in->rs_start); + ASSERT3U(res->rs_end - res->rs_start, <=, in->rs_end - in->rs_start); +} + +vdev_ops_t vdev_raidz_ops = { + .vdev_op_open = vdev_raidz_open, + .vdev_op_close = vdev_raidz_close, + .vdev_op_asize = vdev_raidz_asize, + .vdev_op_io_start = vdev_raidz_io_start, + .vdev_op_io_done = vdev_raidz_io_done, + .vdev_op_state_change = vdev_raidz_state_change, + .vdev_op_need_resilver = vdev_raidz_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; diff --git a/module/zfs/vdev_raidz_math.c b/module/zfs/vdev_raidz_math.c new file mode 100644 index 000000000000..aa92cb83d301 --- /dev/null +++ b/module/zfs/vdev_raidz_math.c @@ -0,0 +1,666 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */ + +#include <sys/zfs_context.h> +#include <sys/types.h> +#include <sys/zio.h> +#include <sys/debug.h> +#include <sys/zfs_debug.h> +#include <sys/vdev_raidz.h> +#include <sys/vdev_raidz_impl.h> +#include <sys/simd.h> + +/* Opaque implementation with NULL methods to represent original methods */ +static const raidz_impl_ops_t vdev_raidz_original_impl = { + .name = "original", + .is_supported = raidz_will_scalar_work, +}; + +/* RAIDZ parity ops that contain the fastest methods */ +static raidz_impl_ops_t vdev_raidz_fastest_impl = { + .name = "fastest" +}; + +/* All compiled in implementations */ +const raidz_impl_ops_t *raidz_all_maths[] = { + &vdev_raidz_original_impl, + &vdev_raidz_scalar_impl, +#if defined(__x86_64) && defined(HAVE_SSE2) /* only x86_64 for now */ + &vdev_raidz_sse2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_SSSE3) /* only x86_64 for now */ + &vdev_raidz_ssse3_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX2) /* only x86_64 for now */ + &vdev_raidz_avx2_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512F) /* only x86_64 for now */ + &vdev_raidz_avx512f_impl, +#endif +#if defined(__x86_64) && defined(HAVE_AVX512BW) /* only x86_64 for now */ + &vdev_raidz_avx512bw_impl, +#endif +#if defined(__aarch64__) + &vdev_raidz_aarch64_neon_impl, + &vdev_raidz_aarch64_neonx2_impl, +#endif +#if defined(__powerpc__) && defined(__altivec__) + &vdev_raidz_powerpc_altivec_impl, +#endif +}; + +/* Indicate that benchmark has been completed */ +static boolean_t raidz_math_initialized = B_FALSE; + +/* Select raidz implementation */ +#define IMPL_FASTEST (UINT32_MAX) +#define IMPL_CYCLE (UINT32_MAX - 1) +#define IMPL_ORIGINAL (0) +#define IMPL_SCALAR (1) + +#define RAIDZ_IMPL_READ(i) (*(volatile uint32_t *) &(i)) + +static uint32_t zfs_vdev_raidz_impl = IMPL_SCALAR; +static uint32_t user_sel_impl = IMPL_FASTEST; + +/* Hold all supported implementations */ +static size_t raidz_supp_impl_cnt = 0; +static raidz_impl_ops_t *raidz_supp_impl[ARRAY_SIZE(raidz_all_maths)]; + +#if defined(_KERNEL) +/* + * kstats values for supported implementations + * Values represent per disk throughput of 8 disk+parity raidz vdev [B/s] + */ +static raidz_impl_kstat_t raidz_impl_kstats[ARRAY_SIZE(raidz_all_maths) + 1]; + +/* kstat for benchmarked implementations */ +static kstat_t *raidz_math_kstat = NULL; +#endif + +/* + * Returns the RAIDZ operations for raidz_map() parity calculations. When + * a SIMD implementation is not allowed in the current context, fall back + * to the fastest generic implementation.
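+ *
+ * A typical call site (shown for illustration; it appears in
+ * vdev_raidz_map_alloc() above) is:
+ *
+ *	rm->rm_ops = vdev_raidz_math_get_ops();
+ *
+ * In contexts where FPU/SIMD state must not be touched,
+ * kfpu_allowed() is false and the scalar implementation is
+ * returned unconditionally.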
+ */ +const raidz_impl_ops_t * +vdev_raidz_math_get_ops(void) +{ + if (!kfpu_allowed()) + return (&vdev_raidz_scalar_impl); + + raidz_impl_ops_t *ops = NULL; + const uint32_t impl = RAIDZ_IMPL_READ(zfs_vdev_raidz_impl); + + switch (impl) { + case IMPL_FASTEST: + ASSERT(raidz_math_initialized); + ops = &vdev_raidz_fastest_impl; + break; + case IMPL_CYCLE: + /* Cycle through all supported implementations */ + ASSERT(raidz_math_initialized); + ASSERT3U(raidz_supp_impl_cnt, >, 0); + static size_t cycle_impl_idx = 0; + size_t idx = (++cycle_impl_idx) % raidz_supp_impl_cnt; + ops = raidz_supp_impl[idx]; + break; + case IMPL_ORIGINAL: + ops = (raidz_impl_ops_t *)&vdev_raidz_original_impl; + break; + case IMPL_SCALAR: + ops = (raidz_impl_ops_t *)&vdev_raidz_scalar_impl; + break; + default: + ASSERT3U(impl, <, raidz_supp_impl_cnt); + ASSERT3U(raidz_supp_impl_cnt, >, 0); + if (impl < ARRAY_SIZE(raidz_all_maths)) + ops = raidz_supp_impl[impl]; + break; + } + + ASSERT3P(ops, !=, NULL); + + return (ops); +} + +/* + * Select parity generation method for raidz_map + */ +int +vdev_raidz_math_generate(raidz_map_t *rm) +{ + raidz_gen_f gen_parity = NULL; + + switch (raidz_parity(rm)) { + case 1: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_P]; + break; + case 2: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQ]; + break; + case 3: + gen_parity = rm->rm_ops->gen[RAIDZ_GEN_PQR]; + break; + default: + gen_parity = NULL; + cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", + raidz_parity(rm)); + break; + } + + /* if method is NULL execute the original implementation */ + if (gen_parity == NULL) + return (RAIDZ_ORIGINAL_IMPL); + + gen_parity(rm); + + return (0); +} + +static raidz_rec_f +reconstruct_fun_p_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1 && parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } + return ((raidz_rec_f) NULL); +} + +static raidz_rec_f +reconstruct_fun_pq_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1) { + if (parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } else if (parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_Q]); + } + } else if (nbaddata == 2 && + parity_valid[CODE_P] && parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQ]); + } + return ((raidz_rec_f) NULL); +} + +static raidz_rec_f +reconstruct_fun_pqr_sel(raidz_map_t *rm, const int *parity_valid, + const int nbaddata) +{ + if (nbaddata == 1) { + if (parity_valid[CODE_P]) { + return (rm->rm_ops->rec[RAIDZ_REC_P]); + } else if (parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_Q]); + } else if (parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_R]); + } + } else if (nbaddata == 2) { + if (parity_valid[CODE_P] && parity_valid[CODE_Q]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQ]); + } else if (parity_valid[CODE_P] && parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_PR]); + } else if (parity_valid[CODE_Q] && parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_QR]); + } + } else if (nbaddata == 3 && + parity_valid[CODE_P] && parity_valid[CODE_Q] && + parity_valid[CODE_R]) { + return (rm->rm_ops->rec[RAIDZ_REC_PQR]); + } + return ((raidz_rec_f) NULL); +} + +/* + * Select data reconstruction method for raidz_map + * @parity_valid - Parity validity flag + * @dt - Failed data index array + * @nbaddata - Number of failed data columns + */ +int +vdev_raidz_math_reconstruct(raidz_map_t *rm, const int *parity_valid, + const int *dt, const int nbaddata) +{ + 
raidz_rec_f rec_fn = NULL; + + switch (raidz_parity(rm)) { + case PARITY_P: + rec_fn = reconstruct_fun_p_sel(rm, parity_valid, nbaddata); + break; + case PARITY_PQ: + rec_fn = reconstruct_fun_pq_sel(rm, parity_valid, nbaddata); + break; + case PARITY_PQR: + rec_fn = reconstruct_fun_pqr_sel(rm, parity_valid, nbaddata); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration %d", + raidz_parity(rm)); + break; + } + + if (rec_fn == NULL) + return (RAIDZ_ORIGINAL_IMPL); + else + return (rec_fn(rm, dt)); +} + +const char *raidz_gen_name[] = { + "gen_p", "gen_pq", "gen_pqr" +}; +const char *raidz_rec_name[] = { + "rec_p", "rec_q", "rec_r", + "rec_pq", "rec_pr", "rec_qr", "rec_pqr" +}; + +#if defined(_KERNEL) + +#define RAIDZ_KSTAT_LINE_LEN (17 + 10*12 + 1) + +static int +raidz_math_kstat_headers(char *buf, size_t size) +{ + int i; + ssize_t off; + + ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); + + off = snprintf(buf, size, "%-17s", "implementation"); + + for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) + off += snprintf(buf + off, size - off, "%-16s", + raidz_gen_name[i]); + + for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) + off += snprintf(buf + off, size - off, "%-16s", + raidz_rec_name[i]); + + (void) snprintf(buf + off, size - off, "\n"); + + return (0); +} + +static int +raidz_math_kstat_data(char *buf, size_t size, void *data) +{ + raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; + raidz_impl_kstat_t *cstat = (raidz_impl_kstat_t *)data; + ssize_t off = 0; + int i; + + ASSERT3U(size, >=, RAIDZ_KSTAT_LINE_LEN); + + if (cstat == fstat) { + off += snprintf(buf + off, size - off, "%-17s", "fastest"); + + for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) { + int id = fstat->gen[i]; + off += snprintf(buf + off, size - off, "%-16s", + raidz_supp_impl[id]->name); + } + for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) { + int id = fstat->rec[i]; + off += snprintf(buf + off, size - off, "%-16s", + raidz_supp_impl[id]->name); + } + } else { + ptrdiff_t id = cstat - raidz_impl_kstats; + + off += snprintf(buf + off, size - off, "%-17s", + raidz_supp_impl[id]->name); + + for (i = 0; i < ARRAY_SIZE(raidz_gen_name); i++) + off += snprintf(buf + off, size - off, "%-16llu", + (u_longlong_t)cstat->gen[i]); + + for (i = 0; i < ARRAY_SIZE(raidz_rec_name); i++) + off += snprintf(buf + off, size - off, "%-16llu", + (u_longlong_t)cstat->rec[i]); + } + + (void) snprintf(buf + off, size - off, "\n"); + + return (0); +} + +static void * +raidz_math_kstat_addr(kstat_t *ksp, loff_t n) +{ + if (n <= raidz_supp_impl_cnt) + ksp->ks_private = (void *) (raidz_impl_kstats + n); + else + ksp->ks_private = NULL; + + return (ksp->ks_private); +} + +#define BENCH_D_COLS (8ULL) +#define BENCH_COLS (BENCH_D_COLS + PARITY_PQR) +#define BENCH_ZIO_SIZE (1ULL << SPA_OLD_MAXBLOCKSHIFT) /* 128 kiB */ +#define BENCH_NS MSEC2NSEC(25) /* 25ms */ + +typedef void (*benchmark_fn)(raidz_map_t *rm, const int fn); + +static void +benchmark_gen_impl(raidz_map_t *rm, const int fn) +{ + (void) fn; + vdev_raidz_generate_parity(rm); +} + +static void +benchmark_rec_impl(raidz_map_t *rm, const int fn) +{ + static const int rec_tgt[7][3] = { + {1, 2, 3}, /* rec_p: bad QR & D[0] */ + {0, 2, 3}, /* rec_q: bad PR & D[0] */ + {0, 1, 3}, /* rec_r: bad PQ & D[0] */ + {2, 3, 4}, /* rec_pq: bad R & D[0][1] */ + {1, 3, 4}, /* rec_pr: bad Q & D[0][1] */ + {0, 3, 4}, /* rec_qr: bad P & D[0][1] */ + {3, 4, 5} /* rec_pqr: bad & D[0][1][2] */ + }; + + vdev_raidz_reconstruct(rm, rec_tgt[fn], 3); +} + +/* + * Benchmarking of all 
supported implementations (raidz_supp_impl_cnt) + * is performed by setting the rm_ops pointer and calling the top level + * generate/reconstruct methods of bench_rm. + */ +static void +benchmark_raidz_impl(raidz_map_t *bench_rm, const int fn, benchmark_fn bench_fn) +{ + uint64_t run_cnt, speed, best_speed = 0; + hrtime_t t_start, t_diff; + raidz_impl_ops_t *curr_impl; + raidz_impl_kstat_t *fstat = &raidz_impl_kstats[raidz_supp_impl_cnt]; + int impl, i; + + for (impl = 0; impl < raidz_supp_impl_cnt; impl++) { + /* set an implementation to benchmark */ + curr_impl = raidz_supp_impl[impl]; + bench_rm->rm_ops = curr_impl; + + run_cnt = 0; + t_start = gethrtime(); + + do { + for (i = 0; i < 25; i++, run_cnt++) + bench_fn(bench_rm, fn); + + t_diff = gethrtime() - t_start; + } while (t_diff < BENCH_NS); + + speed = run_cnt * BENCH_ZIO_SIZE * NANOSEC; + speed /= (t_diff * BENCH_COLS); + + if (bench_fn == benchmark_gen_impl) + raidz_impl_kstats[impl].gen[fn] = speed; + else + raidz_impl_kstats[impl].rec[fn] = speed; + + /* Update fastest implementation method */ + if (speed > best_speed) { + best_speed = speed; + + if (bench_fn == benchmark_gen_impl) { + fstat->gen[fn] = impl; + vdev_raidz_fastest_impl.gen[fn] = + curr_impl->gen[fn]; + } else { + fstat->rec[fn] = impl; + vdev_raidz_fastest_impl.rec[fn] = + curr_impl->rec[fn]; + } + } + } +} +#endif + +/* + * Initialize and benchmark all supported implementations. + */ +static void +benchmark_raidz(void) +{ + raidz_impl_ops_t *curr_impl; + int i, c; + + /* Move supported impl into raidz_supp_impl */ + for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; + + if (curr_impl->init) + curr_impl->init(); + + if (curr_impl->is_supported()) + raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; + } + membar_producer(); /* complete raidz_supp_impl[] init */ + raidz_supp_impl_cnt = c; /* number of supported impl */ + +#if defined(_KERNEL) + zio_t *bench_zio = NULL; + raidz_map_t *bench_rm = NULL; + uint64_t bench_parity; + + /* Fake a zio and run the benchmark on a warmed up buffer */ + bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + bench_zio->io_offset = 0; + bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); + + /* Benchmark parity generation methods */ + for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + bench_parity = fn + 1; + /* New raidz_map is needed for each generate_p/q/r */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_D_COLS + bench_parity, bench_parity); + + benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); + + vdev_raidz_map_free(bench_rm); + } + + /* Benchmark data reconstruction methods */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_COLS, PARITY_PQR); + + for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) + benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); + + vdev_raidz_map_free(bench_rm); + + /* cleanup the bench zio */ + abd_free(bench_zio->io_abd); + kmem_free(bench_zio, sizeof (zio_t)); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and used by default. 
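+ * (Added note: this appears to rely on raidz_all_maths[] being
+ * ordered roughly from generic to most-optimized, so the last
+ * supported entry is a reasonable default when no benchmark has
+ * run.)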
+ +/* + * Initialize and benchmark all supported implementations. + */ +static void +benchmark_raidz(void) +{ + raidz_impl_ops_t *curr_impl; + int i, c; + + /* Move supported impl into raidz_supp_impl */ + for (i = 0, c = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = (raidz_impl_ops_t *)raidz_all_maths[i]; + + if (curr_impl->init) + curr_impl->init(); + + if (curr_impl->is_supported()) + raidz_supp_impl[c++] = (raidz_impl_ops_t *)curr_impl; + } + membar_producer(); /* complete raidz_supp_impl[] init */ + raidz_supp_impl_cnt = c; /* number of supported impl */ + +#if defined(_KERNEL) + zio_t *bench_zio = NULL; + raidz_map_t *bench_rm = NULL; + uint64_t bench_parity; + + /* Fake a zio and run the benchmark on a warmed-up buffer */ + bench_zio = kmem_zalloc(sizeof (zio_t), KM_SLEEP); + bench_zio->io_offset = 0; + bench_zio->io_size = BENCH_ZIO_SIZE; /* only data columns */ + bench_zio->io_abd = abd_alloc_linear(BENCH_ZIO_SIZE, B_TRUE); + memset(abd_to_buf(bench_zio->io_abd), 0xAA, BENCH_ZIO_SIZE); + + /* Benchmark parity generation methods */ + for (int fn = 0; fn < RAIDZ_GEN_NUM; fn++) { + bench_parity = fn + 1; + /* New raidz_map is needed for each generate_p/q/r */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_D_COLS + bench_parity, bench_parity); + + benchmark_raidz_impl(bench_rm, fn, benchmark_gen_impl); + + vdev_raidz_map_free(bench_rm); + } + + /* Benchmark data reconstruction methods */ + bench_rm = vdev_raidz_map_alloc(bench_zio, SPA_MINBLOCKSHIFT, + BENCH_COLS, PARITY_PQR); + + for (int fn = 0; fn < RAIDZ_REC_NUM; fn++) + benchmark_raidz_impl(bench_rm, fn, benchmark_rec_impl); + + vdev_raidz_map_free(bench_rm); + + /* cleanup the bench zio */ + abd_free(bench_zio->io_abd); + kmem_free(bench_zio, sizeof (zio_t)); +#else + /* + * Skip the benchmark in user space to avoid impacting libzpool + * consumers (zdb, zhack, zinject, ztest). The last implementation + * is assumed to be the fastest and is used by default. + */ + memcpy(&vdev_raidz_fastest_impl, + raidz_supp_impl[raidz_supp_impl_cnt - 1], + sizeof (vdev_raidz_fastest_impl)); + strcpy(vdev_raidz_fastest_impl.name, "fastest"); +#endif /* _KERNEL */ +} + +void +vdev_raidz_math_init(void) +{ + /* Determine the fastest available implementation. */ + benchmark_raidz(); + +#if defined(_KERNEL) + /* Install kstats for all implementations */ + raidz_math_kstat = kstat_create("zfs", 0, "vdev_raidz_bench", "misc", + KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); + if (raidz_math_kstat != NULL) { + raidz_math_kstat->ks_data = NULL; + raidz_math_kstat->ks_ndata = UINT32_MAX; + kstat_set_raw_ops(raidz_math_kstat, + raidz_math_kstat_headers, + raidz_math_kstat_data, + raidz_math_kstat_addr); + kstat_install(raidz_math_kstat); + } +#endif + + /* Finish initialization */ + atomic_swap_32(&zfs_vdev_raidz_impl, user_sel_impl); + raidz_math_initialized = B_TRUE; +} + +void +vdev_raidz_math_fini(void) +{ + raidz_impl_ops_t const *curr_impl; + +#if defined(_KERNEL) + if (raidz_math_kstat != NULL) { + kstat_delete(raidz_math_kstat); + raidz_math_kstat = NULL; + } +#endif + + for (int i = 0; i < ARRAY_SIZE(raidz_all_maths); i++) { + curr_impl = raidz_all_maths[i]; + if (curr_impl->fini) + curr_impl->fini(); + } +} + +static const struct { + char *name; + uint32_t sel; +} math_impl_opts[] = { + { "cycle", IMPL_CYCLE }, + { "fastest", IMPL_FASTEST }, + { "original", IMPL_ORIGINAL }, + { "scalar", IMPL_SCALAR } +}; + +/* + * Set the desired raidz implementation. + * + * If called before init(), the user preference is saved in user_sel_impl + * and applied in the later init() call; this occurs when the module + * parameter is specified at module load. Otherwise, zfs_vdev_raidz_impl + * is updated directly. + * + * @val Name of the raidz implementation to use + */ +int +vdev_raidz_impl_set(const char *val) +{ + int err = -EINVAL; + char req_name[RAIDZ_IMPL_NAME_MAX]; + uint32_t impl = RAIDZ_IMPL_READ(user_sel_impl); + size_t i; + + /* sanitize input */ + i = strnlen(val, RAIDZ_IMPL_NAME_MAX); + if (i == 0 || i == RAIDZ_IMPL_NAME_MAX) + return (err); + + strlcpy(req_name, val, RAIDZ_IMPL_NAME_MAX); + while (i > 0 && !!isspace(req_name[i-1])) + i--; + req_name[i] = '\0'; + + /* Check mandatory options */ + for (i = 0; i < ARRAY_SIZE(math_impl_opts); i++) { + if (strcmp(req_name, math_impl_opts[i].name) == 0) { + impl = math_impl_opts[i].sel; + err = 0; + break; + } + } + + /* Check all supported implementations if init() was already called */ + if (err != 0 && raidz_math_initialized) { + for (i = 0; i < raidz_supp_impl_cnt; i++) { + if (strcmp(req_name, raidz_supp_impl[i]->name) == 0) { + impl = i; + err = 0; + break; + } + } + } + + if (err == 0) { + if (raidz_math_initialized) + atomic_swap_32(&zfs_vdev_raidz_impl, impl); + else + atomic_swap_32(&user_sel_impl, impl); + } + + return (err); +}
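A minimal sketch of how a caller might drive vdev_raidz_impl_set(); the helper name below is hypothetical and for illustration only. On Linux, the kernel glue that follows wires this same entry point to the zfs_vdev_raidz_impl module parameter. Per the code above, "fastest" and "scalar" are both mandatory options in math_impl_opts, and unknown names are rejected with -EINVAL.

/* Declared in the code above; repeated here so the sketch is self-contained. */
extern int vdev_raidz_impl_set(const char *val);

/*
 * Hypothetical caller: request the auto-benchmarked "fastest" selection
 * and fall back to the portable "scalar" code if the name is rejected.
 */
static void
select_raidz_impl_example(void)
{
	if (vdev_raidz_impl_set("fastest") != 0)
		(void) vdev_raidz_impl_set("scalar");
}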
"[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, math_impl_opts[i].name); + } + + /* list all supported implementations */ + for (i = 0; i < raidz_supp_impl_cnt; i++) { + fmt = (i == impl) ? "[%s] " : "%s "; + cnt += sprintf(buffer + cnt, fmt, raidz_supp_impl[i]->name); + } + + return (cnt); +} + +module_param_call(zfs_vdev_raidz_impl, zfs_vdev_raidz_impl_set, + zfs_vdev_raidz_impl_get, NULL, 0644); +MODULE_PARM_DESC(zfs_vdev_raidz_impl, "Select raidz implementation."); +#endif diff --git a/module/zfs/vdev_raidz_math_aarch64_neon.c b/module/zfs/vdev_raidz_math_aarch64_neon.c new file mode 100644 index 000000000000..0a67ceb84920 --- /dev/null +++ b/module/zfs/vdev_raidz_math_aarch64_neon.c @@ -0,0 +1,2279 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau. All rights reserved. + */ + +#include +#include + +#if defined(__aarch64__) + +#include "vdev_raidz_math_aarch64_neon_common.h" + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQ_STRIDE 4 +#define 
+#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQ_STRIDE 2 +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PR_STRIDE 2 +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_QR_STRIDE 2 +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(aarch64_neon); +DEFINE_REC_METHODS(aarch64_neon); + +static boolean_t +raidz_will_aarch64_neon_work(void) +{ + return (kfpu_allowed()); +} + +const raidz_impl_ops_t vdev_raidz_aarch64_neon_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(aarch64_neon), + .rec = RAIDZ_REC_METHODS(aarch64_neon), + .is_supported = &raidz_will_aarch64_neon_work, + .name = "aarch64_neon" +}; + +#endif /* defined(__aarch64__) */ + + +#if defined(__aarch64__) +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, + 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, + 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, + 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, + 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, + 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, + 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, + 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, + 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 
0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, + 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, + 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, + 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, + 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, + 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, + 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, + 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, + 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, + 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, + 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, + 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, + 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, + 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, + 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, + 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 
0xd2, 0xcf }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, + 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, + 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, + 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, + 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, + 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 
0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, + 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, + 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, + 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, + 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, + 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, + 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, + 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, + 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xd0, 0xa0, 
0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, + 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, + 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, + 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, + 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, + 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, + 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, + 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 
0xbe, 0x8b, + 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, + 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, + 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, + 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, + 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, + 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, + 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, + 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 
0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, + 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, + 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7, + 0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce, + 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9, + 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc, + 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb, + 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2, + 0x30, 0x76, 0xbc, 
0xfa, 0x28, 0x6e, 0xa4, 0xe2 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5, + 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8, + 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff, + 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6, + 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1, + 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4, + 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3, + 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 
0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed, + 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7, + 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe, + 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac, + 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab, + 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2, + 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5, + 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d }, + { 
0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88, + 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f, + 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86, + 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81, + 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94, + 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93, + 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a, + 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d, + 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 
0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27, + 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e, + 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29, + 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c, + 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b, + 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32, + 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35, + 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18, + 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 
0x9e, 0xd0, + 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16, + 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11, + 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04, + 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03, + 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a, + 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d, + 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 
0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57, + 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e, + 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59, + 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b, + 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42, + 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45, + 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68, + 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f, + 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xcb, 0x98, 0x70, 
0x23, 0xa0, 0xf3, 0x1b, 0x48 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66, + 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61, + 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74, + 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73, + 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a, + 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d, + 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 
0x53, 0x53 }, + { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e, + 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89, + 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c, + 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b, + 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92, + 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95, + 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8, + 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf, + 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6, + 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, + { 
0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1, + 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4, + 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3, + 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa, + 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad, + 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7, + 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe, + 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x93, 0x26, 
0xb5, 0x4c, 0xdf, 0x6a, 0xf9, + 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec, + 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb, + 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2, + 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5, + 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8, + 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf, + 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6, + 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1, + 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 
0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3, + 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda, + 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd, + 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67, + 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e, + 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69, + 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c, + 
0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b, + 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75, + 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58, + 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f, + 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56, + 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51, + 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44, + 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 
0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43, + 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a, + 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d, + 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17, + 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e, + 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19, + 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c, + 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b, + 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 
0x16, 0xa3 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02, + 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05, + 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28, + 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f, + 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26, + 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34, + 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33, + 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 
0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a, + 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d, + 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47, + 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e, + 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49, + 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c, + 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b, + 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52, + 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 }, + { 0x00, 0x9c, 0x25, 
0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55, + 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78, + 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f, + 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76, + 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71, + 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64, + 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63, + 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a, + 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 
0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37, + 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39, + 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c, + 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b, + 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22, + 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25, + 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 
0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, + 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, + 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, + 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, + 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, + 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, + 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, + 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, + 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 
0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, + 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, + 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, + 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, + 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, + 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, + 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, + 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 
0xf9, 0x5f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, + 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, + 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, + 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, + 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, + 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, + 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, + 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 
0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, + 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, + 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, + 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, + 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, + 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, + 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, + 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, + 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xa0, 0x40, 
0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, + 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, + 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, + 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3, + 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa, + 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, + 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } +}; +/* END CSTYLED */ +#endif /* defined(__aarch64__) */ diff --git a/module/zfs/vdev_raidz_math_aarch64_neon_common.h b/module/zfs/vdev_raidz_math_aarch64_neon_common.h new file mode 100644 index 000000000000..e46b2536546c --- /dev/null +++ b/module/zfs/vdev_raidz_math_aarch64_neon_common.h @@ -0,0 +1,684 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "%[w"#REG"]"
+#define VR1_(_1, REG, ...) "%[w"#REG"]"
+#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
+#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
+#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
+
+/*
+ * Here we need registers not used otherwise.  They are referenced by
+ * the otherwise-unused arms of the ASM statements (the cases that
+ * take more registers than a given call supplies).  GCC still checks
+ * that those constraints are valid, and duplicate constraints are
+ * illegal, so each unused slot is padded with a distinct spare
+ * register, and the "register" number serves as the operand name.
+ */
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 36)
+#define VR3(r...) VR3_(r, 36, 35)
+#define VR4(r...) VR4_(r, 36, 35, 34, 33)
+#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
+#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
+#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define VR(X) "%[w"#X"]"
+
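/*
 * A minimal stand-alone sketch of the argument-counting idiom used by
 * REG_CNT above: the caller's arguments shift the descending 8..1
 * sequence to the right, so selecting the 9th argument yields the
 * count.  The names PICK_9TH and COUNT are hypothetical and appear
 * nowhere else in this file.
 */
#if 0	/* exposition only; not compiled */
#define PICK_9TH(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define COUNT(args...) PICK_9TH(args, 8, 7, 6, 5, 4, 3, 2, 1)
/* COUNT(a, b)       -> PICK_9TH(a, b, 8, 7, 6, 5, 4, 3, 2, 1) -> 2 */
/* COUNT(a, b, c, d) -> PICK_9TH(a, b, c, d, 8, 7, 6, 5, 4, 3, 2, 1) -> 4 */
#endif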
[w##REG] "+&w" (w##REG) +#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG) +#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG) +#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG) +#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG) + +#define UVR0(r...) UVR0_(r) +#define UVR1(r...) UVR1_(r) +#define UVR2(r...) UVR2_(r, 36) +#define UVR3(r...) UVR3_(r, 36, 35) +#define UVR4(r...) UVR4_(r, 36, 35, 34, 33) +#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32) +#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31) +#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define UVR(X) [w##X] "+&w" (w##X) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 16 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "ld1 { v21.4s },%[SRC0]\n" \ + "ld1 { v20.4s },%[SRC1]\n" \ + "ld1 { v19.4s },%[SRC2]\n" \ + "ld1 { v18.4s },%[SRC3]\n" \ + "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \ + "ld1 { v21.4s },%[SRC4]\n" \ + "ld1 { v20.4s },%[SRC5]\n" \ + "ld1 { v19.4s },%[SRC6]\n" \ + "ld1 { v18.4s },%[SRC7]\n" \ + "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n" \ + "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n" \ + "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n" \ + "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \ + UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ + : [SRC0] "Q" (*(OFFSET(src, 0))), \ + [SRC1] "Q" (*(OFFSET(src, 16))), \ + [SRC2] "Q" (*(OFFSET(src, 32))), \ + [SRC3] "Q" (*(OFFSET(src, 48))), \ + [SRC4] "Q" (*(OFFSET(src, 64))), \ + [SRC5] "Q" (*(OFFSET(src, 80))), \ + [SRC6] "Q" (*(OFFSET(src, 96))), \ + [SRC7] "Q" (*(OFFSET(src, 112))) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 4: \ + __asm( \ + "ld1 { v21.4s },%[SRC0]\n" \ + "ld1 { v20.4s },%[SRC1]\n" \ + "ld1 { v19.4s },%[SRC2]\n" \ + "ld1 { v18.4s },%[SRC3]\n" \ + "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ + : [SRC0] "Q" (*(OFFSET(src, 0))), \ + [SRC1] "Q" (*(OFFSET(src, 16))), \ + [SRC2] "Q" (*(OFFSET(src, 32))), \ + [SRC3] "Q" (*(OFFSET(src, 48))) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 2: \ + __asm( \ + "ld1 { v21.4s },%[SRC0]\n" \ + "ld1 { v20.4s },%[SRC1]\n" \ + "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ + : UVR0(r), UVR1(r) \ + : [SRC0] "Q" (*(OFFSET(src, 0))), \ + [SRC1] "Q" (*(OFFSET(src, 16))) \ + : "v20", "v21"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n" \ + "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n" \ + "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n" \ + : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 4: \ + __asm( \ + "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n" \ + : UVR2(r), UVR3(r) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ + "eor " VR4(r) ".16b," VR4(r) ".16b," VR4(r) ".16b\n" \ + "eor " VR5(r) ".16b," VR5(r) ".16b," VR5(r) ".16b\n" \ + "eor " VR6(r) ".16b," VR6(r) ".16b," VR6(r) ".16b\n" \ + "eor " VR7(r) ".16b," VR7(r) ".16b," VR7(r) ".16b\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ + break; \ + case 4: \ + __asm( \ + "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ + "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ + "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ + break; \ + case 2: \ + __asm( \ + "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ + : WVR0(r), WVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "mov " VR4(r) ".16b," VR0(r) ".16b\n" \ + "mov " VR5(r) ".16b," VR1(r) ".16b\n" \ + "mov " VR6(r) ".16b," VR2(r) ".16b\n" \ + "mov " VR7(r) ".16b," VR3(r) ".16b\n" \ + : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 4: \ + __asm( \ + "mov " VR2(r) ".16b," VR0(r) ".16b\n" \ + "mov " VR3(r) ".16b," VR1(r) ".16b\n" \ + : WVR2(r), WVR3(r) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ + "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ + "ld1 { " VR2(r) ".4s },%[SRC2]\n" \ + "ld1 { " VR3(r) ".4s },%[SRC3]\n" \ + "ld1 { " VR4(r) ".4s },%[SRC4]\n" \ + "ld1 { " VR5(r) ".4s },%[SRC5]\n" \ + "ld1 { " VR6(r) ".4s },%[SRC6]\n" \ + "ld1 { " VR7(r) ".4s },%[SRC7]\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ + : [SRC0] "Q" (*(OFFSET(src, 0))), \ + [SRC1] "Q" (*(OFFSET(src, 16))), \ + [SRC2] "Q" (*(OFFSET(src, 32))), \ + [SRC3] "Q" (*(OFFSET(src, 48))), \ + [SRC4] "Q" (*(OFFSET(src, 64))), \ + [SRC5] "Q" (*(OFFSET(src, 80))), \ + [SRC6] "Q" (*(OFFSET(src, 96))), \ + [SRC7] "Q" (*(OFFSET(src, 112)))); \ + break; \ + case 4: \ + __asm( \ + "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ + "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ + "ld1 { " VR2(r) ".4s },%[SRC2]\n" \ + "ld1 { " VR3(r) ".4s },%[SRC3]\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \ + : [SRC0] "Q" (*(OFFSET(src, 0))), \ + [SRC1] "Q" (*(OFFSET(src, 16))), \ + [SRC2] "Q" (*(OFFSET(src, 32))), \ + [SRC3] "Q" (*(OFFSET(src, 48)))); \ + break; \ + case 2: \ + __asm( \ + "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ + "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ + : WVR0(r), WVR1(r) \ + : [SRC0] "Q" (*(OFFSET(src, 0))), \ + [SRC1] "Q" (*(OFFSET(src, 16)))); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "st1 { " VR0(r) ".4s },%[DST0]\n" \ + "st1 { " VR1(r) ".4s },%[DST1]\n" \ + "st1 { " VR2(r) ".4s },%[DST2]\n" \ + "st1 { " VR3(r) ".4s },%[DST3]\n" \ + "st1 { " VR4(r) ".4s },%[DST4]\n" \ + "st1 { " VR5(r) ".4s },%[DST5]\n" \ + "st1 { " VR6(r) ".4s },%[DST6]\n" \ + "st1 { " VR7(r) ".4s },%[DST7]\n" \ + : [DST0] "=Q" (*(OFFSET(dst, 0))), \ + [DST1] "=Q" (*(OFFSET(dst, 16))), \ + [DST2] "=Q" (*(OFFSET(dst, 32))), \ + [DST3] "=Q" (*(OFFSET(dst, 48))), \ + [DST4] "=Q" (*(OFFSET(dst, 64))), \ + [DST5] "=Q" (*(OFFSET(dst, 80))), \ + [DST6] "=Q" (*(OFFSET(dst, 96))), \ + [DST7] "=Q" (*(OFFSET(dst, 112))) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r), \ + RVR4(r), RVR5(r), RVR6(r), RVR7(r)); \ + break; \ + case 4: \ + __asm( \ + "st1 { " VR0(r) ".4s },%[DST0]\n" \ + "st1 { " VR1(r) ".4s },%[DST1]\n" \ + "st1 { " VR2(r) ".4s },%[DST2]\n" \ + "st1 { " VR3(r) ".4s },%[DST3]\n" \ + : [DST0] "=Q" (*(OFFSET(dst, 0))), \ + [DST1] "=Q" (*(OFFSET(dst, 16))), \ + [DST2] "=Q" (*(OFFSET(dst, 32))), \ + [DST3] "=Q" (*(OFFSET(dst, 48))) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 2: \ + __asm( \ + "st1 { " VR0(r) ".4s },%[DST0]\n" \ + "st1 { " VR1(r) ".4s },%[DST1]\n" \ + : [DST0] "=Q" (*(OFFSET(dst, 0))), \ + [DST1] "=Q" (*(OFFSET(dst, 16))) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +/* + * Unfortunately cannot use the macro, because GCC + * will try to use the macro name and not value + * later on... + * Kept as a reference to what a numbered variable is + */ +#define _00 "v17" +#define _1d "v16" +#define _temp0 "v19" +#define _temp1 "v18" + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n" \ + "movi " VR(16) ".16b,#0x1d\n" \ + : WVR(16), WVR(17)); \ +} + +#define MUL2(r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \ + "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \ + "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n" \ + "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n" \ + "and v19.16b,v19.16b," VR(16) ".16b\n" \ + "and v18.16b,v18.16b," VR(16) ".16b\n" \ + "and v21.16b,v21.16b," VR(16) ".16b\n" \ + "and v20.16b,v20.16b," VR(16) ".16b\n" \ + "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \ + "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \ + "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n" \ + "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n" \ + "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \ + "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n" \ + "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ + : RVR(17), RVR(16) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 2: \ + __asm( \ + "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \ + "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \ + "and v19.16b,v19.16b," VR(16) ".16b\n" \ + "and v18.16b,v18.16b," VR(16) ".16b\n" \ + "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \ + "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \ + "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \ + "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \ + : UVR0(r), UVR1(r) \ + : RVR(17), RVR(16) \ + : "v18", "v19"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +/* + * Unfortunately cannot use the macro, because GCC + * will try to use the macro name and not value + * later on... + * Kept as a reference to what a register is + * (here we're using actual registers for the + * clobbered ones) + */ +#define _0f "v15" +#define _a_save "v14" +#define _b_save "v13" +#define _lt_mod_a "v12" +#define _lt_clmul_a "v11" +#define _lt_mod_b "v10" +#define _lt_clmul_b "v15" + +#define _MULx2(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + /* lts for upper part */ \ + "movi v15.16b,#0x0f\n" \ + "ld1 { v10.4s },%[lt0]\n" \ + "ld1 { v11.4s },%[lt1]\n" \ + /* upper part */ \ + "and v14.16b," VR0(r) ".16b,v15.16b\n" \ + "and v13.16b," VR1(r) ".16b,v15.16b\n" \ + "ushr " VR0(r) ".16b," VR0(r) ".16b,#4\n" \ + "ushr " VR1(r) ".16b," VR1(r) ".16b,#4\n" \ + \ + "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \ + "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \ + "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n" \ + "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n" \ + \ + "eor " VR0(r) ".16b,v15.16b,v12.16b\n" \ + "eor " VR1(r) ".16b,v11.16b,v10.16b\n" \ + /* lts for lower part */ \ + "ld1 { v10.4s },%[lt2]\n" \ + "ld1 { v15.4s },%[lt3]\n" \ + /* lower part */ \ + "tbl v12.16b,{v10.16b},v14.16b\n" \ + "tbl v10.16b,{v10.16b},v13.16b\n" \ + "tbl v11.16b,{v15.16b},v14.16b\n" \ + "tbl v15.16b,{v15.16b},v13.16b\n" \ + \ + "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n" \ + "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n" \ + "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \ + : UVR0(r), UVR1(r) \ + : [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])), \ + [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])), \ + [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])), \ + [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0])) \ + : "v10", "v11", "v12", "v13", "v14", "v15"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_23(r)); \ + _MULx2(c, R_01(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + +/* Overkill... */ +#if defined(_KERNEL) +#define GEN_X_DEFINE_0_3() \ +register unsigned char w0 asm("v0") __attribute__((vector_size(16))); \ +register unsigned char w1 asm("v1") __attribute__((vector_size(16))); \ +register unsigned char w2 asm("v2") __attribute__((vector_size(16))); \ +register unsigned char w3 asm("v3") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_4_5() \ +register unsigned char w4 asm("v4") __attribute__((vector_size(16))); \ +register unsigned char w5 asm("v5") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_6_7() \ +register unsigned char w6 asm("v6") __attribute__((vector_size(16))); \ +register unsigned char w7 asm("v7") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_8_9() \ +register unsigned char w8 asm("v8") __attribute__((vector_size(16))); \ +register unsigned char w9 asm("v9") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_10_11() \ +register unsigned char w10 asm("v10") __attribute__((vector_size(16))); \ +register unsigned char w11 asm("v11") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_12_15() \ +register unsigned char w12 asm("v12") __attribute__((vector_size(16))); \ +register unsigned char w13 asm("v13") __attribute__((vector_size(16))); \ +register unsigned char w14 asm("v14") __attribute__((vector_size(16))); \ +register unsigned char w15 asm("v15") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_16() \ +register unsigned char w16 asm("v16") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_17() \ +register unsigned char w17 asm("v17") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_18_21() \ +register unsigned char w18 asm("v18") __attribute__((vector_size(16))); \ +register unsigned char w19 asm("v19") __attribute__((vector_size(16))); \ +register unsigned char w20 asm("v20") __attribute__((vector_size(16))); \ +register unsigned char w21 asm("v21") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_22_23() \ +register unsigned char w22 asm("v22") __attribute__((vector_size(16))); \ +register unsigned char w23 asm("v23") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_24_27() \ +register unsigned char w24 asm("v24") __attribute__((vector_size(16))); \ +register unsigned char w25 asm("v25") __attribute__((vector_size(16))); \ +register unsigned char w26 asm("v26") __attribute__((vector_size(16))); \ +register unsigned char w27 asm("v27") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_28_30() \ +register unsigned char w28 asm("v28") __attribute__((vector_size(16))); \ +register unsigned char w29 asm("v29") __attribute__((vector_size(16))); \ +register unsigned char w30 asm("v30") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_31() \ +register unsigned char w31 asm("v31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_32() \ +register unsigned char w32 asm("v31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_33_36() \ +register unsigned char w33 asm("v31") __attribute__((vector_size(16))); \ +register unsigned char w34 asm("v31") __attribute__((vector_size(16))); \ +register unsigned char w35 asm("v31") __attribute__((vector_size(16))); \ +register unsigned char w36 asm("v31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_37_38() \ +register unsigned 
char w37 asm("v31") __attribute__((vector_size(16))); \ +register unsigned char w38 asm("v31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_ALL() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_10_11() \ + GEN_X_DEFINE_12_15() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_18_21() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_24_27() \ + GEN_X_DEFINE_28_30() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() \ + GEN_X_DEFINE_37_38() +#else +#define GEN_X_DEFINE_0_3() \ + unsigned char w0 __attribute__((vector_size(16))); \ + unsigned char w1 __attribute__((vector_size(16))); \ + unsigned char w2 __attribute__((vector_size(16))); \ + unsigned char w3 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_4_5() \ + unsigned char w4 __attribute__((vector_size(16))); \ + unsigned char w5 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_6_7() \ + unsigned char w6 __attribute__((vector_size(16))); \ + unsigned char w7 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_8_9() \ + unsigned char w8 __attribute__((vector_size(16))); \ + unsigned char w9 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_10_11() \ + unsigned char w10 __attribute__((vector_size(16))); \ + unsigned char w11 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_12_15() \ + unsigned char w12 __attribute__((vector_size(16))); \ + unsigned char w13 __attribute__((vector_size(16))); \ + unsigned char w14 __attribute__((vector_size(16))); \ + unsigned char w15 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_16() \ + unsigned char w16 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_17() \ + unsigned char w17 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_18_21() \ + unsigned char w18 __attribute__((vector_size(16))); \ + unsigned char w19 __attribute__((vector_size(16))); \ + unsigned char w20 __attribute__((vector_size(16))); \ + unsigned char w21 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_22_23() \ + unsigned char w22 __attribute__((vector_size(16))); \ + unsigned char w23 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_24_27() \ + unsigned char w24 __attribute__((vector_size(16))); \ + unsigned char w25 __attribute__((vector_size(16))); \ + unsigned char w26 __attribute__((vector_size(16))); \ + unsigned char w27 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_28_30() \ + unsigned char w28 __attribute__((vector_size(16))); \ + unsigned char w29 __attribute__((vector_size(16))); \ + unsigned char w30 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_31() \ + unsigned char w31 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_32() \ + unsigned char w32 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_33_36() \ + unsigned char w33 __attribute__((vector_size(16))); \ + unsigned char w34 __attribute__((vector_size(16))); \ + unsigned char w35 __attribute__((vector_size(16))); \ + unsigned char w36 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_37_38() \ + unsigned char w37 __attribute__((vector_size(16))); \ + unsigned char w38 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_ALL() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_10_11() \ + GEN_X_DEFINE_12_15() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_18_21() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_24_27() \ + GEN_X_DEFINE_28_30() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + 
GEN_X_DEFINE_33_36() \ + GEN_X_DEFINE_37_38() +#endif diff --git a/module/zfs/vdev_raidz_math_aarch64_neonx2.c b/module/zfs/vdev_raidz_math_aarch64_neonx2.c new file mode 100644 index 000000000000..e072f51cd635 --- /dev/null +++ b/module/zfs/vdev_raidz_math_aarch64_neonx2.c @@ -0,0 +1,232 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau. All rights reserved. + */ + +#include <sys/isa_defs.h> + +#if defined(__aarch64__) + +#include "vdev_raidz_math_aarch64_neon_common.h" + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 8 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define ZERO_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define COPY_STRIDE 8 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define COPY_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define ADD_STRIDE 8 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() +#define ADD_D 0, 1, 2, 3, 4, 5, 6, 7 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() +#define REC_PQ_STRIDE 4 +#define REC_PQ_X 0, 1, 2, 3 +#define REC_PQ_Y 4, 5, 6, 7 +#define REC_PQ_T 8, 9, 22, 23 +
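[Editor's note] The MUL2 macro in vdev_raidz_math_aarch64_neon_common.h (included above) doubles each byte of a vector in GF(2^8): "cmgt" builds an all-ones mask for bytes whose high bit is set, "and" limits the mask to the reduction constant 0x1d, and "shl"/"eor" shift left and fold the reduction back in. Below is a minimal scalar C sketch of that one step, assuming the RAID-Z generator polynomial 0x11d; the helper name gf_mul2_scalar is hypothetical and for illustration only, not part of the vendored patch.

#include <stdint.h>

/* Double one GF(2^8) element modulo the polynomial 0x11d. */
static inline uint8_t
gf_mul2_scalar(uint8_t a)
{
	/* all-ones when the MSB is set, mirroring "cmgt vN, zero, a" */
	uint8_t mask = (a & 0x80) ? 0xff : 0x00;

	/* "shl #1", then "eor" in the masked reduction constant */
	return ((uint8_t)(a << 1) ^ (mask & 0x1d));
}

Applying this to each of the 16 byte lanes of a v_t is what a single MUL2 step computes per NEON register.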
+#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() +#define REC_PR_STRIDE 4 +#define REC_PR_X 0, 1, 2, 3 +#define REC_PR_Y 4, 5, 6, 7 +#define REC_PR_T 8, 9, 22, 23 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_33_36() +#define REC_QR_STRIDE 4 +#define REC_QR_X 0, 1, 2, 3 +#define REC_QR_Y 4, 5, 6, 7 +#define REC_QR_T 8, 9, 22, 23 + +#define SYN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(aarch64_neonx2); +/* + * If compiled with -O0, gcc doesn't do any stack frame coalescing + * and -Wframe-larger-than=1024 is triggered in debug mode. + */ +#pragma GCC diagnostic ignored "-Wframe-larger-than=" +DEFINE_REC_METHODS(aarch64_neonx2); +#pragma GCC diagnostic pop + +static boolean_t +raidz_will_aarch64_neonx2_work(void) +{ + return (kfpu_allowed()); +} + +const raidz_impl_ops_t vdev_raidz_aarch64_neonx2_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(aarch64_neonx2), + .rec = RAIDZ_REC_METHODS(aarch64_neonx2), + .is_supported = &raidz_will_aarch64_neonx2_work, + .name = "aarch64_neonx2" +}; + +#endif /* defined(__aarch64__) */ diff --git a/module/zfs/vdev_raidz_math_avx2.c b/module/zfs/vdev_raidz_math_avx2.c new file mode 100644 index 000000000000..65e4bebce8fa --- /dev/null +++ b/module/zfs/vdev_raidz_math_avx2.c @@ -0,0 +1,413 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */ +#include <sys/isa_defs.h> + +#if defined(__x86_64) && defined(HAVE_AVX2) + +#include <sys/types.h> +#include <sys/simd.h> + +#ifdef __linux__ +#define __asm __asm__ __volatile__ +#endif + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "ymm"#REG +#define VR1_(_1, REG, ...) "ymm"#REG +#define VR2_(_1, _2, REG, ...) "ymm"#REG +#define VR3_(_1, _2, _3, REG, ...) "ymm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "ymm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 32 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + "vpxor 0x40(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \ + "vpxor 0x60(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vpxor 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxor 0x20(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vpxor %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \ + "vpxor %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \ + "vpxor %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \ + "vpxor %" VR3(r) ", %" VR7(r)", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vpxor %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ + "vpxor %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vmovdqa %" VR0(r) ", %" VR4(r) "\n" \ + "vmovdqa %" VR1(r) ", %" VR5(r) "\n" \ + "vmovdqa %" VR2(r) ", %" VR6(r) "\n" \ + "vmovdqa %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vmovdqa %" VR0(r) ", %" VR2(r) "\n" \ + "vmovdqa %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \ + "vmovdqa 0x40(%[SRC]), %%" VR2(r) "\n" \ + "vmovdqa 0x60(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa 0x20(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...)
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \ + "vmovdqa %%" VR2(r) ", 0x40(%[DST])\n" \ + "vmovdqa %%" VR3(r) ", 0x60(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa %%" VR1(r) ", 0x20(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define FLUSH() \ +{ \ + __asm("vzeroupper"); \ +} + +#define MUL2_SETUP() \ +{ \ + __asm("vmovq %0, %%xmm14" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm14, %ymm14"); \ + __asm("vpxor %ymm15, %ymm15 ,%ymm15"); \ +} + +#define _MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpcmpgtb %" VR0(r)", %ymm15, %ymm12\n" \ + "vpcmpgtb %" VR1(r)", %ymm15, %ymm13\n" \ + "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \ + "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \ + "vpand %ymm14, %ymm12, %ymm12\n" \ + "vpand %ymm14, %ymm13, %ymm13\n" \ + "vpxor %ymm12, %" VR0(r)", %" VR0(r) "\n" \ + "vpxor %ymm13, %" VR1(r)", %" VR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2(R_01(r)); \ + _MUL2(R_23(r)); \ + break; \ + case 2: \ + _MUL2(r); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +#define _0f "ymm15" +#define _as "ymm14" +#define _bs "ymm13" +#define _ltmod "ymm12" +#define _ltmul "ymm11" +#define _ta "ymm10" +#define _tb "ymm15" + +static const uint8_t __attribute__((aligned(32))) _mul_mask = 0x0F; + +#define _MULx2(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpbroadcastb (%[mask]), %%" _0f "\n" \ + /* upper bits */ \ + "vbroadcasti128 0x00(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti128 0x10(%[lt]), %%" _ltmul "\n" \ + \ + "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \ + "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \ + "vpand %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpand %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpand %%" _0f ", %%" _as ", %%" _as "\n" \ + "vpand %%" _0f ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \ + "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \ + /* lower bits */ \ + "vbroadcasti128 0x20(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti128 0x30(%[lt]), %%" _ltmul "\n" \ + \ + "vpxor %%" _ta ", %%" _as ", %%" _as "\n" \ + "vpxor %%" _tb ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\ + "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\ + \ + "vpxor %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxor %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxor %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpxor %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \ + : : [mask] "r" (&_mul_mask), \ + [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_01(r)); \ + _MULx2(c, R_23(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() \ +{ \ + FLUSH(); \ + kfpu_end(); \ +} + + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(avx2); +DEFINE_REC_METHODS(avx2); + +static boolean_t +raidz_will_avx2_work(void) +{ + return (kfpu_allowed() && zfs_avx_available() && zfs_avx2_available()); +} + +const raidz_impl_ops_t vdev_raidz_avx2_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(avx2), + .rec = RAIDZ_REC_METHODS(avx2), + .is_supported = &raidz_will_avx2_work, + .name = "avx2" +}; + +#endif /* defined(__x86_64) && defined(HAVE_AVX2) */ diff --git a/module/zfs/vdev_raidz_math_avx512bw.c b/module/zfs/vdev_raidz_math_avx512bw.c new file mode 100644 index 000000000000..f06b469023eb --- /dev/null +++ b/module/zfs/vdev_raidz_math_avx512bw.c @@ -0,0 +1,413 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau. All rights reserved. + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#include <sys/isa_defs.h> + +#if defined(__x86_64) && defined(HAVE_AVX512BW) + +#include <sys/types.h> +#include <sys/simd.h> +#include <sys/debug.h> + + +#ifdef __linux__ +#define __asm __asm__ __volatile__ +#endif + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "zmm"#REG +#define VR1_(_1, REG, ...) "zmm"#REG +#define VR2_(_1, _2, REG, ...) "zmm"#REG +#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 64 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \ + "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \ + "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \ + "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \ + "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \ + "vmovdqa64 %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...)
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ + "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \ + "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ + "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \ + "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm("vmovq %0, %%xmm22" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm22, %zmm22"); \ + __asm("vpxord %zmm23, %zmm23 ,%zmm23"); \ +} + +#define _MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpcmpb $1, %zmm23, %" VR0(r)", %k1\n" \ + "vpcmpb $1, %zmm23, %" VR1(r)", %k2\n" \ + "vpaddb %" VR0(r)", %" VR0(r)", %" VR0(r) "\n" \ + "vpaddb %" VR1(r)", %" VR1(r)", %" VR1(r) "\n" \ + "vpxord %zmm22, %" VR0(r)", %zmm12\n" \ + "vpxord %zmm22, %" VR1(r)", %zmm13\n" \ + "vmovdqu8 %zmm12, %" VR0(r) "{%k1}\n" \ + "vmovdqu8 %zmm13, %" VR1(r) "{%k2}"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2(R_01(r)); \ + _MUL2(R_23(r)); \ + break; \ + case 2: \ + _MUL2(r); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +#define _0f "zmm15" +#define _as "zmm14" +#define _bs "zmm13" +#define _ltmod "zmm12" +#define _ltmul "zmm11" +#define _ta "zmm10" +#define _tb "zmm15" + +static const uint8_t __attribute__((aligned(64))) _mul_mask = 0x0F; + +#define _MULx2(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpbroadcastb (%[mask]), %%" _0f "\n" \ + /* upper bits */ \ + "vbroadcasti32x4 0x00(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti32x4 0x10(%[lt]), %%" _ltmul "\n" \ + \ + "vpsraw $0x4, %%" VR0(r) ", %%"_as "\n" \ + "vpsraw $0x4, %%" VR1(r) ", %%"_bs "\n" \ + "vpandq %%" _0f ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpandq %%" _0f ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpandq %%" _0f ", %%" _as ", %%" _as "\n" \ + "vpandq %%" _0f ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" _as ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" _bs ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" _as ", %%" _ltmul ", %%" _as "\n" \ + "vpshufb %%" _bs ", %%" _ltmul ", %%" _bs "\n" \ + /* lower bits */ \ + "vbroadcasti32x4 0x20(%[lt]), %%" _ltmod "\n" \ + "vbroadcasti32x4 0x30(%[lt]), %%" _ltmul "\n" \ + \ + "vpxorq %%" _ta ", %%" _as ", %%" _as "\n" \ + "vpxorq %%" _tb ", %%" _bs ", %%" _bs "\n" \ + \ + "vpshufb %%" VR0(r) ", %%" _ltmod ", %%" _ta "\n" \ + "vpshufb %%" VR1(r) ", %%" _ltmod ", %%" _tb "\n" \ + "vpshufb %%" VR0(r) ", %%" _ltmul ", %%" VR0(r) "\n"\ + "vpshufb %%" VR1(r) ", %%" _ltmul ", %%" VR1(r) "\n"\ + \ + "vpxorq %%" _ta ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxorq %%" _as ", %%" VR0(r) ", %%" VR0(r) "\n" \ + "vpxorq %%" _tb ", %%" VR1(r) ", %%" VR1(r) "\n" \ + "vpxorq %%" _bs ", %%" VR1(r) ", %%" VR1(r) "\n" \ + : : [mask] "r" (&_mul_mask), \ + [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_01(r)); \ + _MULx2(c, R_23(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + +/* + * ZERO, COPY, and MUL operations are already 2x unrolled, which means that + * the stride of these operations for avx512 must not exceed 4. Otherwise, a + * single step would exceed 512B block size. 
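+ * (Editor's note, worked arithmetic: a stride of 4 zmm registers of
+ * 64 bytes each, unrolled 2x, touches 4 * 64 * 2 = 512 bytes per
+ * step, which is exactly the limit this comment refers to.)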
+ */ + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(avx512bw); +DEFINE_REC_METHODS(avx512bw); + +static boolean_t +raidz_will_avx512bw_work(void) +{ + return (kfpu_allowed() && zfs_avx_available() && + zfs_avx512f_available() && zfs_avx512bw_available()); +} + +const raidz_impl_ops_t vdev_raidz_avx512bw_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(avx512bw), + .rec = RAIDZ_REC_METHODS(avx512bw), + .is_supported = &raidz_will_avx512bw_work, + .name = "avx512bw" +}; + +#endif /* defined(__x86_64) && defined(HAVE_AVX512BW) */ diff --git a/module/zfs/vdev_raidz_math_avx512f.c b/module/zfs/vdev_raidz_math_avx512f.c new file mode 100644 index 000000000000..aab653b77491 --- /dev/null +++ b/module/zfs/vdev_raidz_math_avx512f.c @@ -0,0 +1,494 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Romain Dolbeau. All rights reserved. + * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */ + +#include <sys/isa_defs.h> + +#if defined(__x86_64) && defined(HAVE_AVX512F) + +#include <sys/types.h> +#include <sys/simd.h> +#include <sys/debug.h> + +#ifdef __linux__ +#define __asm __asm__ __volatile__ +#endif + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "zmm"#REG +#define VR1_(_1, REG, ...) "zmm"#REG +#define VR2_(_1, _2, REG, ...) "zmm"#REG +#define VR3_(_1, _2, _3, REG, ...) "zmm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "zmm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "zmm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "zmm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "zmm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define VRy0_(REG, ...) "ymm"#REG +#define VRy1_(_1, REG, ...) "ymm"#REG +#define VRy2_(_1, _2, REG, ...) "ymm"#REG +#define VRy3_(_1, _2, _3, REG, ...) "ymm"#REG +#define VRy4_(_1, _2, _3, _4, REG, ...) "ymm"#REG +#define VRy5_(_1, _2, _3, _4, _5, REG, ...) "ymm"#REG +#define VRy6_(_1, _2, _3, _4, _5, _6, REG, ...) "ymm"#REG +#define VRy7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "ymm"#REG + +#define VRy0(r...) VRy0_(r) +#define VRy1(r...) VRy1_(r) +#define VRy2(r...) VRy2_(r, 1) +#define VRy3(r...) VRy3_(r, 1, 2) +#define VRy4(r...) VRy4_(r, 1, 2) +#define VRy5(r...) VRy5_(r, 1, 2, 3) +#define VRy6(r...) VRy6_(r, 1, 2, 3, 4) +#define VRy7(r...) VRy7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ELEM_SIZE 64 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vpxorq 0x00(%[SRC]), %%" VR0(r)", %%" VR0(r) "\n" \ + "vpxorq 0x40(%[SRC]), %%" VR1(r)", %%" VR1(r) "\n" \ + "vpxorq 0x80(%[SRC]), %%" VR2(r)", %%" VR2(r) "\n" \ + "vpxorq 0xc0(%[SRC]), %%" VR3(r)", %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR4(r)", %" VR4(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR5(r)", %" VR5(r) "\n" \ + "vpxorq %" VR2(r) ", %" VR6(r)", %" VR6(r) "\n" \ + "vpxorq %" VR3(r) ", %" VR7(r)", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vpxorq %" VR0(r) ", %" VR2(r)", %" VR2(r) "\n" \ + "vpxorq %" VR1(r) ", %" VR3(r)", %" VR3(r)); \ + break; \ + } \ +} + + +#define ZERO(r...) XOR(r, r) + + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR4(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR5(r) "\n" \ + "vmovdqa64 %" VR2(r) ", %" VR6(r) "\n" \ + "vmovdqa64 %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vmovdqa64 %" VR0(r) ", %" VR2(r) "\n" \ + "vmovdqa64 %" VR1(r) ", %" VR3(r)); \ + break; \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 0x00(%[SRC]), %%" VR0(r) "\n" \ + "vmovdqa64 0x40(%[SRC]), %%" VR1(r) "\n" \ + "vmovdqa64 0x80(%[SRC]), %%" VR2(r) "\n" \ + "vmovdqa64 0xc0(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define STORE(dst, r...)
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vmovdqa64 %%" VR0(r) ", 0x00(%[DST])\n" \ + "vmovdqa64 %%" VR1(r) ", 0x40(%[DST])\n" \ + "vmovdqa64 %%" VR2(r) ", 0x80(%[DST])\n" \ + "vmovdqa64 %%" VR3(r) ", 0xc0(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm("vmovq %0, %%xmm31" :: "r"(0x1d1d1d1d1d1d1d1d)); \ + __asm("vpbroadcastq %xmm31, %zmm31"); \ + __asm("vmovq %0, %%xmm30" :: "r"(0x8080808080808080)); \ + __asm("vpbroadcastq %xmm30, %zmm30"); \ + __asm("vmovq %0, %%xmm29" :: "r"(0xfefefefefefefefe)); \ + __asm("vpbroadcastq %xmm29, %zmm29"); \ +} + +#define _MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "vpandq %" VR0(r)", %zmm30, %zmm26\n" \ + "vpandq %" VR1(r)", %zmm30, %zmm25\n" \ + "vpsrlq $7, %zmm26, %zmm28\n" \ + "vpsrlq $7, %zmm25, %zmm27\n" \ + "vpsllq $1, %zmm26, %zmm26\n" \ + "vpsllq $1, %zmm25, %zmm25\n" \ + "vpsubq %zmm28, %zmm26, %zmm26\n" \ + "vpsubq %zmm27, %zmm25, %zmm25\n" \ + "vpsllq $1, %" VR0(r)", %" VR0(r) "\n" \ + "vpsllq $1, %" VR1(r)", %" VR1(r) "\n" \ + "vpandq %zmm26, %zmm31, %zmm26\n" \ + "vpandq %zmm25, %zmm31, %zmm25\n" \ + "vpternlogd $0x6c,%zmm29, %zmm26, %" VR0(r) "\n" \ + "vpternlogd $0x6c,%zmm29, %zmm25, %" VR1(r)); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2(R_01(r)); \ + _MUL2(R_23(r)); \ + break; \ + case 2: \ + _MUL2(r); \ + break; \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + + +/* General multiplication by adding powers of two */ + +#define _mul_x2_in 21, 22 +#define _mul_x2_acc 23, 24 + +#define _MUL_PARAM(x, in, acc) \ +{ \ + if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ + if (x & 0xfe) { MUL2(in); } \ + if (x & 0x02) { XOR(in, acc); } \ + if (x & 0xfc) { MUL2(in); } \ + if (x & 0x04) { XOR(in, acc); } \ + if (x & 0xf8) { MUL2(in); } \ + if (x & 0x08) { XOR(in, acc); } \ + if (x & 0xf0) { MUL2(in); } \ + if (x & 0x10) { XOR(in, acc); } \ + if (x & 0xe0) { MUL2(in); } \ + if (x & 0x20) { XOR(in, acc); } \ + if (x & 0xc0) { MUL2(in); } \ + if (x & 0x40) { XOR(in, acc); } \ + if (x & 0x80) { MUL2(in); XOR(in, acc); } \ +} + +#define MUL_x2_DEFINE(x) \ +static void \ +mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); } + + +MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3); +MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7); +MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11); +MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15); +MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19); +MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23); +MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27); +MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31); +MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35); +MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39); +MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43); +MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47); +MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51); +MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); MUL_x2_DEFINE(54); MUL_x2_DEFINE(55); +MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59); +MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63); +MUL_x2_DEFINE(64); 
MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67); +MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71); +MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75); +MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79); +MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83); +MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87); +MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91); +MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95); +MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99); +MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103); +MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107); +MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111); +MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115); +MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119); +MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123); +MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127); +MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131); +MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135); +MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139); +MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143); +MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147); +MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151); +MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155); +MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159); +MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163); +MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167); +MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171); +MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175); +MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179); +MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183); +MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187); +MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191); +MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195); +MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199); +MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203); +MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207); +MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211); +MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215); +MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219); +MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223); +MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227); +MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); MUL_x2_DEFINE(231); +MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235); +MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239); +MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); 
MUL_x2_DEFINE(242); MUL_x2_DEFINE(243); +MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247); +MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251); +MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255); + + +typedef void (*mul_fn_ptr_t)(void); + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x2_mul_fns[256] = { + mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5, + mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11, + mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17, + mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23, + mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29, + mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35, + mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41, + mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47, + mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53, + mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59, + mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65, + mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71, + mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77, + mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83, + mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89, + mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95, + mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101, + mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107, + mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113, + mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119, + mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125, + mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131, + mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137, + mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143, + mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149, + mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155, + mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161, + mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167, + mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173, + mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179, + mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185, + mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191, + mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197, + mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203, + mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209, + mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215, + mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221, + mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227, + mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233, + mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239, + mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245, + mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251, + mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255 +}; + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + COPY(R_01(r), _mul_x2_in); \ + gf_x2_mul_fns[c](); \ + COPY(_mul_x2_acc, R_01(r)); \ + COPY(R_23(r), _mul_x2_in); \ + gf_x2_mul_fns[c](); \ + COPY(_mul_x2_acc, R_23(r)); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() MUL2_SETUP() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 4 +#define REC_PQ_DEFINE() MUL2_SETUP() +#define REC_PQ_X 0, 1, 2, 3 +#define REC_PQ_Y 4, 5, 6, 7 +#define REC_PQ_T 8, 9, 10, 11 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 4 +#define REC_PR_DEFINE() MUL2_SETUP() +#define REC_PR_X 0, 1, 2, 3 +#define REC_PR_Y 4, 5, 6, 7 +#define REC_PR_T 8, 9, 10, 11 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 4 +#define REC_QR_DEFINE() MUL2_SETUP() +#define REC_QR_X 0, 1, 2, 3 +#define REC_QR_Y 4, 5, 6, 7 +#define REC_QR_T 8, 9, 10, 11 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 4 +#define REC_PQR_DEFINE() MUL2_SETUP() +#define REC_PQR_X 0, 1, 2, 3 +#define REC_PQR_Y 4, 5, 6, 7 +#define REC_PQR_Z 8, 9, 10, 11 +#define REC_PQR_XS 12, 13, 14, 15 +#define REC_PQR_YS 16, 17, 18, 19 + + +#include +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(avx512f); +DEFINE_REC_METHODS(avx512f); + +static boolean_t +raidz_will_avx512f_work(void) +{ + return (kfpu_allowed() && zfs_avx_available() && + zfs_avx2_available() && zfs_avx512f_available()); +} + +const raidz_impl_ops_t vdev_raidz_avx512f_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(avx512f), + .rec = RAIDZ_REC_METHODS(avx512f), + .is_supported = &raidz_will_avx512f_work, + .name = "avx512f" +}; + +#endif /* defined(__x86_64) && defined(HAVE_AVX512F) */ diff --git a/module/zfs/vdev_raidz_math_impl.h b/module/zfs/vdev_raidz_math_impl.h new file mode 100644 index 000000000000..89c2082c4ab9 --- /dev/null +++ b/module/zfs/vdev_raidz_math_impl.h @@ -0,0 +1,1477 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#ifndef _VDEV_RAIDZ_MATH_IMPL_H +#define _VDEV_RAIDZ_MATH_IMPL_H + +#include + +#define raidz_inline inline __attribute__((always_inline)) +#ifndef noinline +#define noinline __attribute__((noinline)) +#endif + +/* + * These functions calculate the multiplication constants needed for data + * reconstruction. The coefficients depend on the RAID-Z geometry, the indexes + * of the failed child vdevs, and the parity columns used for reconstruction. + * @rm RAIDZ map + * @tgtidx array of missing data indexes + * @coeff output array of coefficients. The array must be provided by + * the caller and must hold at least MUL_CNT values. + */ +static noinline void +raidz_rec_q_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + + coeff[MUL_Q_X] = gf_exp2(255 - (ncols - x - 1)); +} + +static noinline void +raidz_rec_r_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + + coeff[MUL_R_X] = gf_exp4(255 - (ncols - x - 1)); +} + +static noinline void +raidz_rec_pq_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + gf_t a, b, e; + + a = gf_exp2(x + 255 - y); + b = gf_exp2(255 - (ncols - x - 1)); + e = a ^ 0x01; + + coeff[MUL_PQ_X] = gf_div(a, e); + coeff[MUL_PQ_Y] = gf_div(b, e); +} + +static noinline void +raidz_rec_pr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + + gf_t a, b, e; + + a = gf_exp4(x + 255 - y); + b = gf_exp4(255 - (ncols - x - 1)); + e = a ^ 0x01; + + coeff[MUL_PR_X] = gf_div(a, e); + coeff[MUL_PR_Y] = gf_div(b, e); +} + +static noinline void +raidz_rec_qr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + + gf_t nx, ny, nxxy, nxyy, d; + + nx = gf_exp2(ncols - x - 1); + ny = gf_exp2(ncols - y - 1); + nxxy = gf_mul(gf_mul(nx, nx), ny); + nxyy = gf_mul(gf_mul(nx, ny), ny); + d = nxxy ^ nxyy; + + coeff[MUL_QR_XQ] = ny; + coeff[MUL_QR_X] = gf_div(ny, d); + coeff[MUL_QR_YQ] = nx; + coeff[MUL_QR_Y] = gf_div(nx, d); +} + +static noinline void +raidz_rec_pqr_coeff(const raidz_map_t *rm, const int *tgtidx, unsigned *coeff) +{ + const unsigned ncols = raidz_ncols(rm); + const unsigned x = tgtidx[TARGET_X]; + const unsigned y = tgtidx[TARGET_Y]; + const unsigned z = tgtidx[TARGET_Z]; + + gf_t nx, ny, nz, nxx, nyy, nzz, nyyz, nyzz, xd, yd; + + nx = gf_exp2(ncols - x - 1); + ny = gf_exp2(ncols - y - 1); + nz = gf_exp2(ncols - z - 1); + + nxx = gf_exp4(ncols - x - 1); + nyy = gf_exp4(ncols - y - 1); + nzz = gf_exp4(ncols - z - 1); + + nyyz = gf_mul(gf_mul(ny, nz), ny); + nyzz = gf_mul(nzz, ny); + + xd = gf_mul(nxx, ny) ^ gf_mul(nx, nyy) ^ nyyz ^ + gf_mul(nxx, nz) ^ gf_mul(nzz, nx) ^ nyzz; + + yd =
gf_inv(ny ^ nz); + + coeff[MUL_PQR_XP] = gf_div(nyyz ^ nyzz, xd); + coeff[MUL_PQR_XQ] = gf_div(nyy ^ nzz, xd); + coeff[MUL_PQR_XR] = gf_div(ny ^ nz, xd); + coeff[MUL_PQR_YU] = nx; + coeff[MUL_PQR_YP] = gf_mul(nz, yd); + coeff[MUL_PQR_YQ] = yd; +} + +/* + * Method for zeroing a buffer (can be implemented using SIMD). + * This method is used by multiple gen/rec functions. + * + * @dc Destination buffer + * @dsize Destination buffer size + * @private Unused + */ +static int +raidz_zero_abd_cb(void *dc, size_t dsize, void *private) +{ + v_t *dst = (v_t *)dc; + size_t i; + + ZERO_DEFINE(); + + (void) private; /* unused */ + + ZERO(ZERO_D); + + for (i = 0; i < dsize / sizeof (v_t); i += (2 * ZERO_STRIDE)) { + STORE(dst + i, ZERO_D); + STORE(dst + i + ZERO_STRIDE, ZERO_D); + } + + return (0); +} + +#define raidz_zero(dabd, size) \ +{ \ + abd_iterate_func(dabd, 0, size, raidz_zero_abd_cb, NULL); \ +} + +/* + * Method for copying one buffer to another (can be implemented using SIMD). + * This method is used by multiple gen/rec functions. + * + * @dc Destination buffer + * @sc Source buffer + * @size Buffer size + * @private Unused + */ +static int +raidz_copy_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *)dc; + const v_t *src = (v_t *)sc; + size_t i; + + COPY_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * COPY_STRIDE)) { + LOAD(src + i, COPY_D); + STORE(dst + i, COPY_D); + + LOAD(src + i + COPY_STRIDE, COPY_D); + STORE(dst + i + COPY_STRIDE, COPY_D); + } + + return (0); +} + + +#define raidz_copy(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_copy_abd_cb, NULL);\ +} + +/* + * Method for adding (XORing) two buffers. + * Source and destination are XORed together and the result is stored in the + * destination buffer. This method is used by multiple gen/rec functions. + * + * @dc Destination buffer + * @sc Source buffer + * @size Buffer size + * @private Unused + */ +static int +raidz_add_abd_cb(void *dc, void *sc, size_t size, void *private) +{ + v_t *dst = (v_t *)dc; + const v_t *src = (v_t *)sc; + size_t i; + + ADD_DEFINE(); + + (void) private; /* unused */ + + for (i = 0; i < size / sizeof (v_t); i += (2 * ADD_STRIDE)) { + LOAD(dst + i, ADD_D); + XOR_ACC(src + i, ADD_D); + STORE(dst + i, ADD_D); + + LOAD(dst + i + ADD_STRIDE, ADD_D); + XOR_ACC(src + i + ADD_STRIDE, ADD_D); + STORE(dst + i + ADD_STRIDE, ADD_D); + } + + return (0); +} + +#define raidz_add(dabd, sabd, size) \ +{ \ + abd_iterate_func2(dabd, sabd, 0, 0, size, raidz_add_abd_cb, NULL);\ +} + +/* + * Method for multiplying a buffer with a constant in GF(2^8). + * Symbols from the buffer are multiplied by a constant and the result is + * stored back in the same buffer. + * + * @dc In/Out data buffer.
+ * @size Size of the buffer + * @private pointer to the multiplication constant (unsigned) + */ +static int +raidz_mul_abd_cb(void *dc, size_t size, void *private) +{ + const unsigned mul = *((unsigned *)private); + v_t *d = (v_t *)dc; + size_t i; + + MUL_DEFINE(); + + for (i = 0; i < size / sizeof (v_t); i += (2 * MUL_STRIDE)) { + LOAD(d + i, MUL_D); + MUL(mul, MUL_D); + STORE(d + i, MUL_D); + + LOAD(d + i + MUL_STRIDE, MUL_D); + MUL(mul, MUL_D); + STORE(d + i + MUL_STRIDE, MUL_D); + } + + return (0); +} + + +/* + * Syndrome generation/update macros + * + * Require LOAD(), XOR(), STORE(), MUL2(), and MUL4() macros + */ +#define P_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define Q_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL2(T); \ + STORE((t), T); \ +} + +#define R_D_SYNDROME(D, T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + XOR(D, T); \ + STORE((t), T); \ +} + +#define R_SYNDROME(T, t) \ +{ \ + LOAD((t), T); \ + MUL4(T); \ + STORE((t), T); \ +} + + +/* + * PARITY CALCULATION + * + * Macros *_SYNDROME are used for parity/syndrome calculation. + * *_D_SYNDROME() macros are used to calculate syndrome between 0 and + * length of data column, and *_SYNDROME() macros are only for updating + * the parity/syndrome if data column is shorter. + * + * P parity is calculated using raidz_add_abd(). + */ + +/* + * Generate P parity (RAIDZ1) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_p_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t psize = rm->rm_col[CODE_P].rc_size; + abd_t *pabd = rm->rm_col[CODE_P].rc_abd; + size_t size; + abd_t *dabd; + + raidz_math_begin(); + + /* start with first data column */ + raidz_copy(pabd, rm->rm_col[1].rc_abd, psize); + + for (c = 2; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + size = rm->rm_col[c].rc_size; + + /* add data column */ + raidz_add(pabd, dabd, size); + } + + raidz_math_end(); +} + + +/* + * Generate PQ parity (RAIDZ2) + * The function is called per data column. 
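 *
 * Editor's note, a sketch not in the original comment: per SIMD lane this
 * is Horner's scheme. For each data column d, the loop below computes, in
 * GF(2^8) where "+" is XOR:
 *
 *   P = P + d       (P_D_SYNDROME)
 *   Q = 2*Q + d     (Q_D_SYNDROME)
 *
 * so after folding in data columns d_0 .. d_(n-1) the result is
 * Q = 2^(n-1)*d_0 + ... + 2*d_(n-2) + d_(n-1), i.e. column c carries the
 * coefficient 2^(ncols-c-1) that the raidz_rec_*_coeff() functions invert.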
+ * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pq_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *)c[0]; + v_t *q = (v_t *)c[1]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQ_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQ_STRIDE, p += GEN_PQ_STRIDE, + q += GEN_PQ_STRIDE) { + LOAD(d, GEN_PQ_D); + P_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, p); + Q_D_SYNDROME(GEN_PQ_D, GEN_PQ_C, q); + } + for (; q < qend; q += GEN_PQ_STRIDE) { + Q_SYNDROME(GEN_PQ_C, q); + } +} + + +/* + * Generate PQ parity (RAIDZ2) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pq_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[2].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[2].rc_abd, csize); + + for (c = 3; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 2, + raidz_gen_pq_add); + } + + raidz_math_end(); +} + + +/* + * Generate PQR parity (RAIDZ3) + * The function is called per data column. + * + * @c array of pointers to parity (code) columns + * @dc pointer to data column + * @csize size of parity columns + * @dsize size of data column + */ +static void +raidz_gen_pqr_add(void **c, const void *dc, const size_t csize, + const size_t dsize) +{ + v_t *p = (v_t *)c[0]; + v_t *q = (v_t *)c[1]; + v_t *r = (v_t *)c[CODE_R]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const qend = q + (csize / sizeof (v_t)); + + GEN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += GEN_PQR_STRIDE, p += GEN_PQR_STRIDE, + q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + LOAD(d, GEN_PQR_D); + P_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, p); + Q_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, q); + R_D_SYNDROME(GEN_PQR_D, GEN_PQR_C, r); + } + for (; q < qend; q += GEN_PQR_STRIDE, r += GEN_PQR_STRIDE) { + Q_SYNDROME(GEN_PQR_C, q); + R_SYNDROME(GEN_PQR_C, r); + } +} + + +/* + * Generate PQR parity (RAIDZ3) + * + * @rm RAIDZ map + */ +static raidz_inline void +raidz_generate_pqr_impl(raidz_map_t * const rm) +{ + size_t c; + const size_t ncols = raidz_ncols(rm); + const size_t csize = rm->rm_col[CODE_P].rc_size; + size_t dsize; + abd_t *dabd; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + + raidz_math_begin(); + + raidz_copy(cabds[CODE_P], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_Q], rm->rm_col[3].rc_abd, csize); + raidz_copy(cabds[CODE_R], rm->rm_col[3].rc_abd, csize); + + for (c = 4; c < ncols; c++) { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + + abd_raidz_gen_iterate(cabds, dabd, csize, dsize, 3, + raidz_gen_pqr_add); + } + + raidz_math_end(); +} + + +/* + * DATA RECONSTRUCTION + * + * The data reconstruction process consists of two phases: + * - Syndrome calculation + * - Data reconstruction + * + * A syndrome is calculated by generating parity using the available data + * columns, with zeros in place of the erasures.
The existing parity is then added to the corresponding + * syndrome value to obtain the [P|Q|R]syn values from the equations: + * P = Psyn + Dx + Dy + Dz + * Q = Qsyn + 2^x * Dx + 2^y * Dy + 2^z * Dz + * R = Rsyn + 4^x * Dx + 4^y * Dy + 4^z * Dz + * + * For the data reconstruction phase, the corresponding equations are solved + * for the missing data (Dx, Dy, Dz). This generally involves multiplying known + * symbols by a coefficient and adding them together. The multiplication + * constant coefficients are calculated ahead of the operation in the + * raidz_rec_[q|r|pq|pr|qr|pqr]_coeff() functions. + * + * IMPLEMENTATION NOTE: a RAID-Z block can have a complex geometry, with "big" + * and "short" columns. + * For this reason, reconstruction is performed in a minimum of + * two steps. First, from offset 0 to short_size, then from short_size to + * big_size. The calculation functions REC_[*]_BLOCK() are implemented to work + * over both ranges. The split also enables removal of conditional expressions + * from loop bodies, improving throughput of SIMD implementations. + * For best performance, all functions marked with the raidz_inline attribute + * must be inlined by the compiler. + * + * parity data + * columns columns + * <----------> <------------------> + * x y <----+ missing columns (x, y) + * | | + * +---+---+---+---+-v-+---+-v-+---+ ^ 0 + * | | | | | | | | | | + * | | | | | | | | | | + * | P | Q | R | D | D | D | D | D | | + * | | | | 0 | 1 | 2 | 3 | 4 | | + * | | | | | | | | | v + * | | | | | +---+---+---+ ^ short_size + * | | | | | | | + * +---+---+---+---+---+ v big_size + * <------------------> <----------> + * big columns short columns + * + */ + + + + +/* + * Reconstruct single data column using P parity + * + * @syn_method raidz_add_abd() + * @rec_method not applicable + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_p_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + size_t size; + abd_t *dabd; + + raidz_math_begin(); + + /* copy P into target */ + raidz_copy(xabd, rm->rm_col[CODE_P].rc_abd, xsize); + + /* generate p_syndrome */ + for (c = firstdc; c < ncols; c++) { + if (c == x) + continue; + + dabd = rm->rm_col[c].rc_abd; + size = MIN(rm->rm_col[c].rc_size, xsize); + + raidz_add(xabd, dabd, size); + } + + raidz_math_end(); + + return (1 << CODE_P); +} + + +/* + * Generate Q syndrome (Qsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @xsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_q_abd(void **xc, const void *dc, const size_t xsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (xsize / sizeof (v_t)); + + SYN_Q_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_Q_D); + Q_D_SYNDROME(SYN_Q_D, SYN_Q_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + Q_SYNDROME(SYN_Q_X, x); + } +} + + +/* + * Reconstruct single data column using Q parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_q_impl(raidz_map_t *rm, const int *tgtidx) +{ +
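	/*
	 * Editor's sketch of the math, derived from raidz_rec_q_coeff()
	 * above (not an upstream comment): computing the Q syndrome with
	 * the missing column zeroed gives Qsyn = Q + 2^e * Dx, where
	 * e = ncols - x - 1 and "+" is XOR. Adding the stored Q parity
	 * leaves 2^e * Dx, and the final raidz_mul_abd_cb() pass multiplies
	 * by gf_exp2(255 - e) == 2^(-e) (the multiplicative group of
	 * GF(2^8) has order 255), which recovers Dx.
	 */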
size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + abd_t *xabd = rm->rm_col[x].rc_abd; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *tabds[] = { xabd }; + + unsigned coeff[MUL_CNT]; + raidz_rec_q_coeff(rm, tgtidx, coeff); + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_q_abd); + } + + /* add Q to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_Q].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void*) coeff); + + raidz_math_end(); + + return (1 << CODE_Q); +} + + +/* + * Generate R syndrome (Rsyn) + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_r_abd(void **xc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)xc[TARGET_X]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const xend = x + (tsize / sizeof (v_t)); + + SYN_R_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE) { + LOAD(d, SYN_R_D); + R_D_SYNDROME(SYN_R_D, SYN_R_X, x); + } + for (; x < xend; x += SYN_STRIDE) { + R_SYNDROME(SYN_R_X, x); + } +} + + +/* + * Reconstruct single data column using R parity + * + * @syn_method raidz_add_abd() + * @rec_method raidz_mul_abd_cb() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_r_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t xsize = rm->rm_col[x].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *tabds[] = { xabd }; + + unsigned coeff[MUL_CNT]; + raidz_rec_r_coeff(rm, tgtidx, coeff); + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + } + + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 1, + raidz_syn_r_abd); + } + + /* add R to the syndrome */ + raidz_add(xabd, rm->rm_col[CODE_R].rc_abd, xsize); + + /* transform the syndrome */ + abd_iterate_func(xabd, 0, xsize, raidz_mul_abd_cb, (void *)coeff); + + raidz_math_end(); + + return (1 << CODE_R); +} + + +/* + * Generate P and Q syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pq_abd(void **tc, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + 
(dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PQ_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + P_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, x); + Q_D_SYNDROME(SYN_PQ_D, SYN_PQ_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + Q_SYNDROME(SYN_PQ_X, y); + } +} + +/* + * Reconstruct data using PQ parity and PQ syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pq_abd(void **tc, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)tc[TARGET_X]; + v_t *y = (v_t *)tc[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_PQ_DEFINE(); + + for (; x < xend; x += REC_PQ_STRIDE, y += REC_PQ_STRIDE, + p += REC_PQ_STRIDE, q += REC_PQ_STRIDE) { + LOAD(x, REC_PQ_X); + LOAD(y, REC_PQ_Y); + + XOR_ACC(p, REC_PQ_X); + XOR_ACC(q, REC_PQ_Y); + + /* Save Pxy */ + COPY(REC_PQ_X, REC_PQ_T); + + /* Calc X */ + MUL(mul[MUL_PQ_X], REC_PQ_X); + MUL(mul[MUL_PQ_Y], REC_PQ_Y); + XOR(REC_PQ_Y, REC_PQ_X); + STORE(x, REC_PQ_X); + + /* Calc Y */ + XOR(REC_PQ_T, REC_PQ_X); + STORE(y, REC_PQ_X); + } +} + + +/* + * Reconstruct two data columns using PQ parity + * + * @syn_method raidz_syn_pq_abd() + * @rec_method raidz_rec_pq_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_pq_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd + }; + + unsigned coeff[MUL_CNT]; + raidz_rec_pq_coeff(rm, tgtidx, coeff); + + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. 
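 *
 * Editor's derivation sketch (not in the original comment), following
 * raidz_rec_pq_coeff() and raidz_rec_pq_abd() above: after the syndrome
 * pass and the parity XOR, X holds Pxy = Dx + Dy and Y holds
 * Qxy = 2^ex * Dx + 2^ey * Dy, with ex = ncols - x - 1 and
 * ey = ncols - y - 1. With a = 2^(x-y) and b = 2^(-ex), the Dy terms
 * cancel in a*Pxy + b*Qxy, leaving (a + 1) * Dx, so
 *
 *   Dx = (a*Pxy + b*Qxy) / (a + 1),   Dy = Pxy + Dx,
 *
 * which is the mul[MUL_PQ_X] / mul[MUL_PQ_Y] / XOR sequence in
 * raidz_rec_pq_abd().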
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pq_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pq_abd, coeff); + + /* Copy shorter targets back to the original abd buffer */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); +} + + +/* + * Generate P and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + const v_t * const yend = y + (tsize / sizeof (v_t)); + + SYN_PR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PR_D); + P_D_SYNDROME(SYN_PR_D, SYN_PR_X, x); + R_D_SYNDROME(SYN_PR_D, SYN_PR_X, y); + } + for (; y < yend; y += SYN_STRIDE) { + R_SYNDROME(SYN_PR_X, y); + } +} + +/* + * Reconstruct data using PR parity and PR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_PR_DEFINE(); + + for (; x < xend; x += REC_PR_STRIDE, y += REC_PR_STRIDE, + p += REC_PR_STRIDE, q += REC_PR_STRIDE) { + LOAD(x, REC_PR_X); + LOAD(y, REC_PR_Y); + XOR_ACC(p, REC_PR_X); + XOR_ACC(q, REC_PR_Y); + + /* Save Pxy */ + COPY(REC_PR_X, REC_PR_T); + + /* Calc X */ + MUL(mul[MUL_PR_X], REC_PR_X); + MUL(mul[MUL_PR_Y], REC_PR_Y); + XOR(REC_PR_Y, REC_PR_X); + STORE(x, REC_PR_X); + + /* Calc Y */ + XOR(REC_PR_T, REC_PR_X); + STORE(y, REC_PR_X); + } +} + + +/* + * Reconstruct two data columns using PR parity + * + * @syn_method raidz_syn_pr_abd() + * @rec_method raidz_rec_pr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_pr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[0]; + const size_t y = tgtidx[1]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_pr_coeff(rm, 
tgtidx, coeff); + + /* + * Check if some of targets are shorter then others. + * They need to be replaced with a new buffer so that syndrome can + * be calculated on full length. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_pr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_pr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + return ((1 << CODE_P) | (1 << CODE_Q)); +} + + +/* + * Generate Q and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_qr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_QR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE) { + LOAD(d, SYN_PQ_D); + Q_D_SYNDROME(SYN_QR_D, SYN_QR_X, x); + R_D_SYNDROME(SYN_QR_D, SYN_QR_X, y); + } + for (; x < xend; x += SYN_STRIDE, y += SYN_STRIDE) { + Q_SYNDROME(SYN_QR_X, x); + R_SYNDROME(SYN_QR_X, y); + } +} + + +/* + * Reconstruct data using QR parity and QR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_qr_abd(void **t, const size_t tsize, void **c, + const unsigned *mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + + REC_QR_DEFINE(); + + for (; x < xend; x += REC_QR_STRIDE, y += REC_QR_STRIDE, + p += REC_QR_STRIDE, q += REC_QR_STRIDE) { + LOAD(x, REC_QR_X); + LOAD(y, REC_QR_Y); + + XOR_ACC(p, REC_QR_X); + XOR_ACC(q, REC_QR_Y); + + /* Save Pxy */ + COPY(REC_QR_X, REC_QR_T); + + /* Calc X */ + MUL(mul[MUL_QR_XQ], REC_QR_X); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_X); /* X = R ^ X */ + MUL(mul[MUL_QR_X], REC_QR_X); /* X = X * xm */ + STORE(x, REC_QR_X); + + /* Calc Y */ + MUL(mul[MUL_QR_YQ], REC_QR_T); /* X = Q * xqm */ + XOR(REC_QR_Y, REC_QR_T); /* X = R ^ X */ + MUL(mul[MUL_QR_Y], REC_QR_T); /* X = X * xm */ + STORE(y, REC_QR_T); + } +} + + +/* + * Reconstruct two data columns using QR parity + * + * @syn_method raidz_syn_qr_abd() + * @rec_method raidz_rec_qr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_qr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + 
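	/*
	 * Editor's sketch from raidz_rec_qr_coeff() above (not an upstream
	 * comment): with nx = 2^(ncols-x-1) and ny = 2^(ncols-y-1), the
	 * syndromes satisfy Qxy = nx*Dx + ny*Dy and
	 * Rxy = nx^2*Dx + ny^2*Dy. Multiplying Qxy by ny and adding Rxy
	 * cancels the Dy terms: ny*Qxy + Rxy = nx*(nx + ny)*Dx, so with
	 * d = nx^2*ny + nx*ny^2 we get Dx = (ny*Qxy + Rxy) * ny / d, and
	 * symmetrically for Dy; these are exactly the MUL_QR_* constants.
	 */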
const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *tabds[2] = { xabd, yabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_qr_coeff(rm, tgtidx, coeff); + + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. + */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 2, + raidz_syn_qr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 2, raidz_rec_qr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + + + return ((1 << CODE_Q) | (1 << CODE_R)); +} + + +/* + * Generate P, Q, and R syndromes + * + * @xc array of pointers to syndrome columns + * @dc data column (NULL if missing) + * @tsize size of syndrome columns + * @dsize size of data column (0 if missing) + */ +static void +raidz_syn_pqr_abd(void **c, const void *dc, const size_t tsize, + const size_t dsize) +{ + v_t *x = (v_t *)c[TARGET_X]; + v_t *y = (v_t *)c[TARGET_Y]; + v_t *z = (v_t *)c[TARGET_Z]; + const v_t * const yend = y + (tsize / sizeof (v_t)); + const v_t *d = (const v_t *)dc; + const v_t * const dend = d + (dsize / sizeof (v_t)); + + SYN_PQR_DEFINE(); + + MUL2_SETUP(); + + for (; d < dend; d += SYN_STRIDE, x += SYN_STRIDE, y += SYN_STRIDE, + z += SYN_STRIDE) { + LOAD(d, SYN_PQR_D); + P_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, x) + Q_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, y); + R_D_SYNDROME(SYN_PQR_D, SYN_PQR_X, z); + } + for (; y < yend; y += SYN_STRIDE, z += SYN_STRIDE) { + Q_SYNDROME(SYN_PQR_X, y); + R_SYNDROME(SYN_PQR_X, z); + } +} + + +/* + * Reconstruct data using PRQ parity and PQR syndromes + * + * @tc syndrome/result columns + * @tsize size of syndrome/result columns + * @c parity columns + * @mul array of multiplication constants + */ +static void +raidz_rec_pqr_abd(void **t, const size_t tsize, void **c, + const unsigned * const mul) +{ + v_t *x = (v_t *)t[TARGET_X]; + v_t *y = (v_t *)t[TARGET_Y]; + v_t *z = (v_t *)t[TARGET_Z]; + const v_t * const xend = x + (tsize / sizeof (v_t)); + const v_t *p = (v_t *)c[CODE_P]; + const v_t *q = (v_t *)c[CODE_Q]; + const v_t *r = (v_t *)c[CODE_R]; + + REC_PQR_DEFINE(); + + for (; x < xend; x += REC_PQR_STRIDE, y += REC_PQR_STRIDE, + z += REC_PQR_STRIDE, p += REC_PQR_STRIDE, q += REC_PQR_STRIDE, + r += REC_PQR_STRIDE) { + LOAD(x, REC_PQR_X); + LOAD(y, REC_PQR_Y); + LOAD(z, REC_PQR_Z); + + XOR_ACC(p, REC_PQR_X); + XOR_ACC(q, REC_PQR_Y); + XOR_ACC(r, REC_PQR_Z); + + /* Save Pxyz and Qxyz */ + COPY(REC_PQR_X, REC_PQR_XS); + COPY(REC_PQR_Y, REC_PQR_YS); + + /* Calc X */ + MUL(mul[MUL_PQR_XP], 
REC_PQR_X); /* Xp = Pxyz * xp */ + MUL(mul[MUL_PQR_XQ], REC_PQR_Y); /* Xq = Qxyz * xq */ + XOR(REC_PQR_Y, REC_PQR_X); + MUL(mul[MUL_PQR_XR], REC_PQR_Z); /* Xr = Rxyz * xr */ + XOR(REC_PQR_Z, REC_PQR_X); /* X = Xp + Xq + Xr */ + STORE(x, REC_PQR_X); + + /* Calc Y */ + XOR(REC_PQR_X, REC_PQR_XS); /* Pyz = Pxyz + X */ + MUL(mul[MUL_PQR_YU], REC_PQR_X); /* Xq = X * upd_q */ + XOR(REC_PQR_X, REC_PQR_YS); /* Qyz = Qxyz + Xq */ + COPY(REC_PQR_XS, REC_PQR_X); /* restore Pyz */ + MUL(mul[MUL_PQR_YP], REC_PQR_X); /* Yp = Pyz * yp */ + MUL(mul[MUL_PQR_YQ], REC_PQR_YS); /* Yq = Qyz * yq */ + XOR(REC_PQR_X, REC_PQR_YS); /* Y = Yp + Yq */ + STORE(y, REC_PQR_YS); + + /* Calc Z */ + XOR(REC_PQR_XS, REC_PQR_YS); /* Z = Pz = Pyz + Y */ + STORE(z, REC_PQR_YS); + } +} + + +/* + * Reconstruct three data columns using PQR parity + * + * @syn_method raidz_syn_pqr_abd() + * @rec_method raidz_rec_pqr_abd() + * + * @rm RAIDZ map + * @tgtidx array of missing data indexes + */ +static raidz_inline int +raidz_reconstruct_pqr_impl(raidz_map_t *rm, const int *tgtidx) +{ + size_t c; + size_t dsize; + abd_t *dabd; + const size_t firstdc = raidz_parity(rm); + const size_t ncols = raidz_ncols(rm); + const size_t x = tgtidx[TARGET_X]; + const size_t y = tgtidx[TARGET_Y]; + const size_t z = tgtidx[TARGET_Z]; + const size_t xsize = rm->rm_col[x].rc_size; + const size_t ysize = rm->rm_col[y].rc_size; + const size_t zsize = rm->rm_col[z].rc_size; + abd_t *xabd = rm->rm_col[x].rc_abd; + abd_t *yabd = rm->rm_col[y].rc_abd; + abd_t *zabd = rm->rm_col[z].rc_abd; + abd_t *tabds[] = { xabd, yabd, zabd }; + abd_t *cabds[] = { + rm->rm_col[CODE_P].rc_abd, + rm->rm_col[CODE_Q].rc_abd, + rm->rm_col[CODE_R].rc_abd + }; + unsigned coeff[MUL_CNT]; + raidz_rec_pqr_coeff(rm, tgtidx, coeff); + + /* + * Check if some of targets is shorter then others + * In this case, shorter target needs to be replaced with + * new buffer so that syndrome can be calculated. 
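 *
 * Editor's note on the solve order (a sketch mirroring the Calc X/Y/Z
 * steps in raidz_rec_pqr_abd() above, not an upstream comment): X is
 * recovered first as a linear combination of all three syndromes,
 * X = xp*Pxyz + xq*Qxyz + xr*Rxyz; substituting X back reduces the
 * problem to two erasures, Pyz = Pxyz + X and Qyz = Qxyz + nx*X, from
 * which Y = yp*Pyz + yq*Qyz; finally Z = Pyz + Y, the same telescoping
 * step used in the PQ case.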
+ */ + if (ysize < xsize) { + yabd = abd_alloc(xsize, B_FALSE); + tabds[1] = yabd; + } + if (zsize < xsize) { + zabd = abd_alloc(xsize, B_FALSE); + tabds[2] = zabd; + } + + raidz_math_begin(); + + /* Start with first data column if present */ + if (firstdc != x) { + raidz_copy(xabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(yabd, rm->rm_col[firstdc].rc_abd, xsize); + raidz_copy(zabd, rm->rm_col[firstdc].rc_abd, xsize); + } else { + raidz_zero(xabd, xsize); + raidz_zero(yabd, xsize); + raidz_zero(zabd, xsize); + } + + /* generate q_syndrome */ + for (c = firstdc+1; c < ncols; c++) { + if (c == x || c == y || c == z) { + dabd = NULL; + dsize = 0; + } else { + dabd = rm->rm_col[c].rc_abd; + dsize = rm->rm_col[c].rc_size; + } + + abd_raidz_gen_iterate(tabds, dabd, xsize, dsize, 3, + raidz_syn_pqr_abd); + } + + abd_raidz_rec_iterate(cabds, tabds, xsize, 3, raidz_rec_pqr_abd, coeff); + + /* + * Copy shorter targets back to the original abd buffer + */ + if (ysize < xsize) + raidz_copy(rm->rm_col[y].rc_abd, yabd, ysize); + if (zsize < xsize) + raidz_copy(rm->rm_col[z].rc_abd, zabd, zsize); + + raidz_math_end(); + + if (ysize < xsize) + abd_free(yabd); + if (zsize < xsize) + abd_free(zabd); + + return ((1 << CODE_P) | (1 << CODE_Q) | (1 << CODE_R)); +} + +#endif /* _VDEV_RAIDZ_MATH_IMPL_H */ diff --git a/module/zfs/vdev_raidz_math_powerpc_altivec.c b/module/zfs/vdev_raidz_math_powerpc_altivec.c new file mode 100644 index 000000000000..1db2c4cd3a47 --- /dev/null +++ b/module/zfs/vdev_raidz_math_powerpc_altivec.c @@ -0,0 +1,4337 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2019 Romain Dolbeau. All rights reserved. 
+ * + */ + +#include +#include + +#if defined(__powerpc__) +#pragma GCC target("altivec") + +#include "vdev_raidz_math_powerpc_altivec_common.h" + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_33_36() +#define GEN_P_STRIDE 4 +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_Q_STRIDE 4 +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_R_STRIDE 4 +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQ_STRIDE 4 +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQ_STRIDE 2 +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PR_STRIDE 4 +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PR_STRIDE 2 +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_QR_STRIDE 4 +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_QR_STRIDE 2 +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_33_36() +#define SYN_PQR_STRIDE 4 +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_DEFINE() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + 
GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() +#define REC_PQR_STRIDE 2 +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(powerpc_altivec); +DEFINE_REC_METHODS(powerpc_altivec); + +static boolean_t +raidz_will_powerpc_altivec_work(void) +{ + return (kfpu_allowed()) && zfs_altivec_available(); +} + +const raidz_impl_ops_t vdev_raidz_powerpc_altivec_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(powerpc_altivec), + .rec = RAIDZ_REC_METHODS(powerpc_altivec), + .is_supported = &raidz_will_powerpc_altivec_work, + .name = "powerpc_altivec" +}; + +#endif /* defined(__powerpc__) */ + + +#if defined(__powerpc__) +#if defined(_ZFS_LITTLE_ENDIAN) && _LITTLE_ENDIAN +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08, + 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1e, 0x1c, 0x1a, 0x18, 0x16, 0x14, 0x12, 0x10, + 0x0e, 0x0c, 0x0a, 0x08, 0x06, 0x04, 0x02, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x11, 0x12, 0x17, 0x14, 0x1d, 0x1e, 0x1b, 0x18, + 0x09, 0x0a, 0x0f, 0x0c, 0x05, 0x06, 0x03, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3c, 0x38, 0x34, 0x30, 0x2c, 0x28, 0x24, 0x20, + 0x1c, 0x18, 0x14, 0x10, 0x0c, 0x08, 0x04, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x33, 0x36, 0x39, 0x3c, 0x27, 0x22, 0x2d, 0x28, + 0x1b, 0x1e, 0x11, 0x14, 0x0f, 0x0a, 0x05, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 
0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x22, 0x24, 0x2e, 0x28, 0x3a, 0x3c, 0x36, 0x30, + 0x12, 0x14, 0x1e, 0x18, 0x0a, 0x0c, 0x06, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x2d, 0x2a, 0x23, 0x24, 0x31, 0x36, 0x3f, 0x38, + 0x15, 0x12, 0x1b, 0x1c, 0x09, 0x0e, 0x07, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x78, 0x70, 0x68, 0x60, 0x58, 0x50, 0x48, 0x40, + 0x38, 0x30, 0x28, 0x20, 0x18, 0x10, 0x08, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x77, 0x7e, 0x65, 0x6c, 0x53, 0x5a, 0x41, 0x48, + 0x3f, 0x36, 0x2d, 0x24, 0x1b, 0x12, 0x09, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x66, 0x6c, 0x72, 0x78, 0x4e, 0x44, 0x5a, 0x50, + 0x36, 0x3c, 0x22, 0x28, 0x1e, 0x14, 0x0a, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x69, 0x62, 0x7f, 0x74, 0x45, 0x4e, 0x53, 0x58, + 0x31, 0x3a, 0x27, 0x2c, 0x1d, 0x16, 0x0b, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x44, 0x48, 0x5c, 0x50, 0x74, 0x78, 0x6c, 0x60, + 0x24, 0x28, 0x3c, 0x30, 0x14, 0x18, 0x0c, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4b, 0x46, 0x51, 0x5c, 0x7f, 0x72, 0x65, 0x68, + 0x23, 0x2e, 0x39, 0x34, 0x17, 0x1a, 0x0d, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00 }, + { 0x5a, 0x54, 0x46, 0x48, 0x62, 0x6c, 0x7e, 0x70, + 0x2a, 0x24, 0x36, 0x38, 0x12, 0x1c, 0x0e, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x55, 0x5a, 0x4b, 0x44, 0x69, 0x66, 0x77, 0x78, + 0x2d, 0x22, 0x33, 0x3c, 0x11, 0x1e, 0x0f, 0x00 }, + { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xff, 0xee, 0xdd, 0xcc, 0xbb, 0xaa, 0x99, 0x88, + 0x77, 0x66, 0x55, 0x44, 0x33, 0x22, 0x11, 0x00 }, + { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xee, 0xfc, 0xca, 0xd8, 0xa6, 0xb4, 0x82, 0x90, + 0x7e, 0x6c, 0x5a, 0x48, 0x36, 0x24, 0x12, 0x00 }, + { 0xa6, 0xbb, 0x9c, 0x81, 0xd2, 0xcf, 0xe8, 0xf5, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xe1, 0xf2, 0xc7, 0xd4, 0xad, 0xbe, 0x8b, 0x98, + 0x79, 0x6a, 0x5f, 0x4c, 0x35, 0x26, 0x13, 0x00 }, + { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xcc, 0xd8, 0xe4, 0xf0, 0x9c, 0x88, 0xb4, 0xa0, + 0x6c, 0x78, 0x44, 0x50, 0x3c, 0x28, 0x14, 0x00 }, + { 0x9c, 0x81, 0xa6, 0xbb, 0xf5, 0xe8, 0xcf, 0xd2, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xc3, 0xd6, 0xe9, 0xfc, 0x97, 0x82, 0xbd, 0xa8, + 0x6b, 0x7e, 0x41, 0x54, 0x3f, 0x2a, 0x15, 0x00 }, + { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xd2, 0xc4, 0xfe, 0xe8, 0x8a, 0x9c, 0xa6, 0xb0, + 0x62, 0x74, 0x4e, 0x58, 0x3a, 0x2c, 0x16, 0x00 }, + { 0x81, 0x9c, 0xbb, 0xa6, 0xe8, 0xf5, 0xd2, 0xcf, + 0x4e, 0x53, 0x74, 0x69, 0x27, 0x3a, 0x1d, 0x00 
}, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xdd, 0xca, 0xf3, 0xe4, 0x81, 0x96, 0xaf, 0xb8, + 0x65, 0x72, 0x4b, 0x5c, 0x39, 0x2e, 0x17, 0x00 }, + { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x88, 0x90, 0xb8, 0xa0, 0xe8, 0xf0, 0xd8, 0xc0, + 0x48, 0x50, 0x78, 0x60, 0x28, 0x30, 0x18, 0x00 }, + { 0xe8, 0xf5, 0xcf, 0xd2, 0xa6, 0xbb, 0x81, 0x9c, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x87, 0x9e, 0xb5, 0xac, 0xe3, 0xfa, 0xd1, 0xc8, + 0x4f, 0x56, 0x7d, 0x64, 0x2b, 0x32, 0x19, 0x00 }, + { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x96, 0x8c, 0xa2, 0xb8, 0xfe, 0xe4, 0xca, 0xd0, + 0x46, 0x5c, 0x72, 0x68, 0x2e, 0x34, 0x1a, 0x00 }, + { 0xf5, 0xe8, 0xd2, 0xcf, 0xbb, 0xa6, 0x9c, 0x81, + 0x74, 0x69, 0x53, 0x4e, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x99, 0x82, 0xaf, 0xb4, 0xf5, 0xee, 0xc3, 0xd8, + 0x41, 0x5a, 0x77, 0x6c, 0x2d, 0x36, 0x1b, 0x00 }, + { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xb4, 0xa8, 0x8c, 0x90, 0xc4, 0xd8, 0xfc, 0xe0, + 0x54, 0x48, 0x6c, 0x70, 0x24, 0x38, 0x1c, 0x00 }, + { 0xcf, 0xd2, 0xe8, 0xf5, 0x9c, 0x81, 0xbb, 0xa6, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xbb, 0xa6, 0x81, 0x9c, 0xcf, 0xd2, 0xf5, 0xe8, + 0x53, 0x4e, 0x69, 0x74, 0x27, 0x3a, 0x1d, 0x00 }, + { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xaa, 0xb4, 0x96, 0x88, 0xd2, 0xcc, 0xee, 0xf0, + 0x5a, 0x44, 0x66, 0x78, 0x22, 0x3c, 0x1e, 0x00 }, + { 0xd2, 0xcf, 0xf5, 0xe8, 0x81, 0x9c, 0xa6, 0xbb, + 0x69, 0x74, 0x4e, 0x53, 0x3a, 0x27, 0x1d, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xa5, 0xba, 
0x9b, 0x84, 0xd9, 0xc6, 0xe7, 0xf8, + 0x5d, 0x42, 0x63, 0x7c, 0x21, 0x3e, 0x1f, 0x00 }, + { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x6b, 0x51, 0x1f, 0x25, 0x83, 0xb9, 0xf7, 0xcd, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xef, 0xce, 0xad, 0x8c, 0x6b, 0x4a, 0x29, 0x08, + 0xe7, 0xc6, 0xa5, 0x84, 0x63, 0x42, 0x21, 0x00 }, + { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xfe, 0xdc, 0xba, 0x98, 0x76, 0x54, 0x32, 0x10, + 0xee, 0xcc, 0xaa, 0x88, 0x66, 0x44, 0x22, 0x00 }, + { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xf1, 0xd2, 0xb7, 0x94, 0x7d, 0x5e, 0x3b, 0x18, + 0xe9, 0xca, 0xaf, 0x8c, 0x65, 0x46, 0x23, 0x00 }, + { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xdc, 0xf8, 0x94, 0xb0, 0x4c, 0x68, 0x04, 0x20, + 0xfc, 0xd8, 0xb4, 0x90, 0x6c, 0x48, 0x24, 0x00 }, + { 0x4c, 0x76, 0x38, 0x02, 0xb9, 0x83, 0xcd, 0xf7, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xd3, 0xf6, 0x99, 0xbc, 0x47, 0x62, 0x0d, 0x28, + 0xfb, 0xde, 0xb1, 0x94, 0x6f, 0x4a, 0x25, 0x00 }, + { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xc2, 0xe4, 0x8e, 0xa8, 0x5a, 0x7c, 0x16, 0x30, + 0xf2, 0xd4, 0xbe, 0x98, 0x6a, 0x4c, 0x26, 0x00 }, + { 0x51, 0x6b, 0x25, 0x1f, 0xa4, 0x9e, 0xd0, 0xea, + 0xbb, 0x81, 0xcf, 0xf5, 0x4e, 0x74, 0x3a, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 
0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x98, 0xb0, 0xc8, 0xe0, 0x38, 0x10, 0x68, 0x40, + 0xd8, 0xf0, 0x88, 0xa0, 0x78, 0x50, 0x28, 0x00 }, + { 0x38, 0x02, 0x51, 0x6b, 0xea, 0xd0, 0x83, 0xb9, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x97, 0xbe, 0xc5, 0xec, 0x33, 0x1a, 0x61, 0x48, + 0xdf, 0xf6, 0x8d, 0xa4, 0x7b, 0x52, 0x29, 0x00 }, + { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x86, 0xac, 0xd2, 0xf8, 0x2e, 0x04, 0x7a, 0x50, + 0xd6, 0xfc, 0x82, 0xa8, 0x7e, 0x54, 0x2a, 0x00 }, + { 0x25, 0x1f, 0x4c, 0x76, 0xf7, 0xcd, 0x9e, 0xa4, + 0x81, 0xbb, 0xe8, 0xd2, 0x53, 0x69, 0x3a, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x89, 0xa2, 0xdf, 0xf4, 0x25, 0x0e, 0x73, 0x58, + 0xd1, 0xfa, 0x87, 0xac, 0x7d, 0x56, 0x2b, 0x00 }, + { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xa4, 0x88, 0xfc, 0xd0, 0x14, 0x38, 0x4c, 0x60, + 0xc4, 0xe8, 0x9c, 0xb0, 0x74, 0x58, 0x2c, 0x00 }, + { 0x1f, 0x25, 0x76, 0x4c, 0xd0, 0xea, 0xb9, 0x83, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xab, 0x86, 0xf1, 0xdc, 0x1f, 0x32, 0x45, 0x68, + 0xc3, 0xee, 0x99, 0xb4, 0x77, 0x5a, 0x2d, 0x00 }, + { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xba, 0x94, 0xe6, 0xc8, 0x02, 0x2c, 0x5e, 0x70, + 0xca, 0xe4, 0x96, 0xb8, 0x72, 0x5c, 0x2e, 0x00 }, + { 0x02, 0x38, 0x6b, 0x51, 0xcd, 0xf7, 0xa4, 0x9e, + 0x9c, 0xa6, 0xf5, 0xcf, 0x53, 0x69, 0x3a, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0xb5, 0x9a, 0xeb, 0xc4, 0x09, 0x26, 0x57, 0x78, + 0xcd, 0xe2, 0x93, 0xbc, 0x71, 0x5e, 0x2f, 0x00 }, + { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, 
+ 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0xd0, 0xf7, 0x9e, 0xb9, 0x4c, 0x6b, 0x02, 0x25, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x1f, 0x2e, 0x7d, 0x4c, 0xdb, 0xea, 0xb9, 0x88, + 0x97, 0xa6, 0xf5, 0xc4, 0x53, 0x62, 0x31, 0x00 }, + { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x0e, 0x3c, 0x6a, 0x58, 0xc6, 0xf4, 0xa2, 0x90, + 0x9e, 0xac, 0xfa, 0xc8, 0x56, 0x64, 0x32, 0x00 }, + { 0xcd, 0xea, 0x83, 0xa4, 0x51, 0x76, 0x1f, 0x38, + 0xf5, 0xd2, 0xbb, 0x9c, 0x69, 0x4e, 0x27, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x01, 0x32, 0x67, 0x54, 0xcd, 0xfe, 0xab, 0x98, + 0x99, 0xaa, 0xff, 0xcc, 0x55, 0x66, 0x33, 0x00 }, + { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x2c, 0x18, 0x44, 0x70, 0xfc, 0xc8, 0x94, 0xa0, + 0x8c, 0xb8, 0xe4, 0xd0, 0x5c, 0x68, 0x34, 0x00 }, + { 0xf7, 0xd0, 0xb9, 0x9e, 0x76, 0x51, 0x38, 0x1f, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x23, 0x16, 0x49, 0x7c, 0xf7, 0xc2, 0x9d, 0xa8, + 0x8b, 0xbe, 0xe1, 0xd4, 0x5f, 0x6a, 0x35, 0x00 }, + { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x32, 0x04, 0x5e, 0x68, 0xea, 0xdc, 0x86, 0xb0, + 0x82, 0xb4, 0xee, 0xd8, 0x5a, 0x6c, 0x36, 0x00 }, + { 0xea, 0xcd, 0xa4, 0x83, 0x6b, 0x4c, 0x25, 0x02, + 0xe8, 0xcf, 0xa6, 0x81, 0x69, 0x4e, 0x27, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3d, 0x0a, 0x53, 0x64, 0xe1, 0xd6, 0x8f, 0xb8, + 0x85, 0xb2, 0xeb, 0xdc, 0x59, 0x6e, 0x37, 0x00 }, + { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x68, 0x50, 0x18, 0x20, 0x88, 0xb0, 0xf8, 0xc0, + 0xa8, 0x90, 0xd8, 0xe0, 0x48, 0x70, 0x38, 0x00 }, + { 0x83, 0xa4, 0xd0, 0xf7, 0x25, 0x02, 0x76, 0x51, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 
0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x67, 0x5e, 0x15, 0x2c, 0x83, 0xba, 0xf1, 0xc8, + 0xaf, 0x96, 0xdd, 0xe4, 0x4b, 0x72, 0x39, 0x00 }, + { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x76, 0x4c, 0x02, 0x38, 0x9e, 0xa4, 0xea, 0xd0, + 0xa6, 0x9c, 0xd2, 0xe8, 0x4e, 0x74, 0x3a, 0x00 }, + { 0x9e, 0xb9, 0xcd, 0xea, 0x38, 0x1f, 0x6b, 0x4c, + 0xd2, 0xf5, 0x81, 0xa6, 0x74, 0x53, 0x27, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x79, 0x42, 0x0f, 0x34, 0x95, 0xae, 0xe3, 0xd8, + 0xa1, 0x9a, 0xd7, 0xec, 0x4d, 0x76, 0x3b, 0x00 }, + { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x54, 0x68, 0x2c, 0x10, 0xa4, 0x98, 0xdc, 0xe0, + 0xb4, 0x88, 0xcc, 0xf0, 0x44, 0x78, 0x3c, 0x00 }, + { 0xa4, 0x83, 0xf7, 0xd0, 0x1f, 0x38, 0x4c, 0x6b, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x5b, 0x66, 0x21, 0x1c, 0xaf, 0x92, 0xd5, 0xe8, + 0xb3, 0x8e, 0xc9, 0xf4, 0x47, 0x7a, 0x3d, 0x00 }, + { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4a, 0x74, 0x36, 0x08, 0xb2, 0x8c, 0xce, 0xf0, + 0xba, 0x84, 0xc6, 0xf8, 0x42, 0x7c, 0x3e, 0x00 }, + { 0xb9, 0x9e, 0xea, 0xcd, 0x02, 0x25, 0x51, 0x76, + 0xcf, 0xe8, 0x9c, 0xbb, 0x74, 0x53, 0x27, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x45, 0x7a, 0x3b, 0x04, 0xb9, 0x86, 0xc7, 0xf8, + 0xbd, 0x82, 0xc3, 0xfc, 0x41, 0x7e, 0x3f, 0x00 }, + { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0xd6, 0xa2, 0x3e, 0x4a, 0x1b, 0x6f, 0xf3, 0x87, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xcf, 0x8e, 0x4d, 0x0c, 0xcb, 0x8a, 0x49, 0x08, + 0xc7, 0x86, 0x45, 0x04, 0xc3, 
0x82, 0x41, 0x00 }, + { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xde, 0x9c, 0x5a, 0x18, 0xd6, 0x94, 0x52, 0x10, + 0xce, 0x8c, 0x4a, 0x08, 0xc6, 0x84, 0x42, 0x00 }, + { 0xcb, 0xbf, 0x23, 0x57, 0x06, 0x72, 0xee, 0x9a, + 0x51, 0x25, 0xb9, 0xcd, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xd1, 0x92, 0x57, 0x14, 0xdd, 0x9e, 0x5b, 0x18, + 0xc9, 0x8a, 0x4f, 0x0c, 0xc5, 0x86, 0x43, 0x00 }, + { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xfc, 0xb8, 0x74, 0x30, 0xec, 0xa8, 0x64, 0x20, + 0xdc, 0x98, 0x54, 0x10, 0xcc, 0x88, 0x44, 0x00 }, + { 0xf1, 0x85, 0x19, 0x6d, 0x21, 0x55, 0xc9, 0xbd, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xf3, 0xb6, 0x79, 0x3c, 0xe7, 0xa2, 0x6d, 0x28, + 0xdb, 0x9e, 0x51, 0x14, 0xcf, 0x8a, 0x45, 0x00 }, + { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xe2, 0xa4, 0x6e, 0x28, 0xfa, 0xbc, 0x76, 0x30, + 0xd2, 0x94, 0x5e, 0x18, 0xca, 0x8c, 0x46, 0x00 }, + { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xed, 0xaa, 0x63, 0x24, 0xf1, 0xb6, 0x7f, 0x38, + 0xd5, 0x92, 0x5b, 0x1c, 0xc9, 0x8e, 0x47, 0x00 }, + { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb8, 0xf0, 0x28, 0x60, 0x98, 0xd0, 0x08, 0x40, + 0xf8, 0xb0, 0x68, 0x20, 0xd8, 0x90, 0x48, 0x00 }, + { 0x85, 0xf1, 0x70, 0x04, 0x72, 0x06, 0x87, 0xf3, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb7, 0xfe, 0x25, 0x6c, 0x93, 0xda, 0x01, 0x48, + 0xff, 0xb6, 0x6d, 0x24, 0xdb, 0x92, 0x49, 0x00 }, + { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 
}, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa6, 0xec, 0x32, 0x78, 0x8e, 0xc4, 0x1a, 0x50, + 0xf6, 0xbc, 0x62, 0x28, 0xde, 0x94, 0x4a, 0x00 }, + { 0x98, 0xec, 0x6d, 0x19, 0x6f, 0x1b, 0x9a, 0xee, + 0x76, 0x02, 0x83, 0xf7, 0x81, 0xf5, 0x74, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa9, 0xe2, 0x3f, 0x74, 0x85, 0xce, 0x13, 0x58, + 0xf1, 0xba, 0x67, 0x2c, 0xdd, 0x96, 0x4b, 0x00 }, + { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x84, 0xc8, 0x1c, 0x50, 0xb4, 0xf8, 0x2c, 0x60, + 0xe4, 0xa8, 0x7c, 0x30, 0xd4, 0x98, 0x4c, 0x00 }, + { 0xa2, 0xd6, 0x57, 0x23, 0x48, 0x3c, 0xbd, 0xc9, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x8b, 0xc6, 0x11, 0x5c, 0xbf, 0xf2, 0x25, 0x68, + 0xe3, 0xae, 0x79, 0x34, 0xd7, 0x9a, 0x4d, 0x00 }, + { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xbf, 0xcb, 0x4a, 0x3e, 0x55, 0x21, 0xa0, 0xd4, + 0x6b, 0x1f, 0x9e, 0xea, 0x81, 0xf5, 0x74, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x95, 0xda, 0x0b, 0x44, 0xa9, 0xe6, 0x37, 0x78, + 0xed, 0xa2, 0x73, 0x3c, 0xd1, 0x9e, 0x4f, 0x00 }, + { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x6d, 0x04, 0xbf, 0xd6, 0xd4, 0xbd, 0x06, 0x6f, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x3f, 0x6e, 0x9d, 0xcc, 0x7b, 0x2a, 0xd9, 0x88, + 0xb7, 0xe6, 0x15, 0x44, 0xf3, 0xa2, 0x51, 0x00 }, + { 0x70, 0x19, 0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x2e, 0x7c, 0x8a, 0xd8, 0x66, 0x34, 0xc2, 0x90, + 0xbe, 0xec, 0x1a, 0x48, 0xf6, 0xa4, 0x52, 0x00 }, + { 0x70, 0x19, 
0xa2, 0xcb, 0xc9, 0xa0, 0x1b, 0x72, + 0x02, 0x6b, 0xd0, 0xb9, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x0c, 0x58, 0xa4, 0xf0, 0x5c, 0x08, 0xf4, 0xa0, + 0xac, 0xf8, 0x04, 0x50, 0xfc, 0xa8, 0x54, 0x00 }, + { 0x4a, 0x23, 0x98, 0xf1, 0xee, 0x87, 0x3c, 0x55, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x03, 0x56, 0xa9, 0xfc, 0x57, 0x02, 0xfd, 0xa8, + 0xab, 0xfe, 0x01, 0x54, 0xff, 0xaa, 0x55, 0x00 }, + { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x12, 0x44, 0xbe, 0xe8, 0x4a, 0x1c, 0xe6, 0xb0, + 0xa2, 0xf4, 0x0e, 0x58, 0xfa, 0xac, 0x56, 0x00 }, + { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x1d, 0x4a, 0xb3, 0xe4, 0x41, 0x16, 0xef, 0xb8, + 0xa5, 0xf2, 0x0b, 0x5c, 0xf9, 0xae, 0x57, 0x00 }, + { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x48, 0x10, 0xf8, 0xa0, 0x28, 0x70, 0x98, 0xc0, + 0x88, 0xd0, 0x38, 0x60, 0xe8, 0xb0, 0x58, 0x00 }, + { 0x3e, 0x57, 0xf1, 0x98, 0xbd, 0xd4, 0x72, 0x1b, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x47, 0x1e, 0xf5, 0xac, 0x23, 0x7a, 0x91, 0xc8, + 0x8f, 0xd6, 0x3d, 0x64, 0xeb, 0xb2, 0x59, 0x00 }, + { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x56, 0x0c, 0xe2, 0xb8, 0x3e, 0x64, 0x8a, 0xd0, + 0x86, 0xdc, 0x32, 0x68, 0xee, 0xb4, 0x5a, 0x00 }, + { 0x23, 0x4a, 0xec, 0x85, 0xa0, 0xc9, 0x6f, 0x06, + 0x25, 0x4c, 0xea, 0x83, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 
0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x59, 0x02, 0xef, 0xb4, 0x35, 0x6e, 0x83, 0xd8, + 0x81, 0xda, 0x37, 0x6c, 0xed, 0xb6, 0x5b, 0x00 }, + { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x74, 0x28, 0xcc, 0x90, 0x04, 0x58, 0xbc, 0xe0, + 0x94, 0xc8, 0x2c, 0x70, 0xe4, 0xb8, 0x5c, 0x00 }, + { 0x19, 0x70, 0xd6, 0xbf, 0x87, 0xee, 0x48, 0x21, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x7b, 0x26, 0xc1, 0x9c, 0x0f, 0x52, 0xb5, 0xe8, + 0x93, 0xce, 0x29, 0x74, 0xe7, 0xba, 0x5d, 0x00 }, + { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x6a, 0x34, 0xd6, 0x88, 0x12, 0x4c, 0xae, 0xf0, + 0x9a, 0xc4, 0x26, 0x78, 0xe2, 0xbc, 0x5e, 0x00 }, + { 0x04, 0x6d, 0xcb, 0xa2, 0x9a, 0xf3, 0x55, 0x3c, + 0x38, 0x51, 0xf7, 0x9e, 0xa6, 0xcf, 0x69, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x65, 0x3a, 0xdb, 0x84, 0x19, 0x46, 0xa7, 0xf8, + 0x9d, 0xc2, 0x23, 0x7c, 0xe1, 0xbe, 0x5f, 0x00 }, + { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0xbd, 0xf3, 0x21, 0x6f, 0x98, 0xd6, 0x04, 0x4a, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x2f, 0x4e, 0xed, 0x8c, 0xab, 0xca, 0x69, 0x08, + 0x27, 0x46, 0xe5, 0x84, 0xa3, 0xc2, 0x61, 0x00 }, + { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x3e, 0x5c, 0xfa, 0x98, 0xb6, 0xd4, 0x72, 0x10, + 0x2e, 0x4c, 0xea, 0x88, 0xa6, 0xc4, 0x62, 0x00 }, + { 0xa0, 0xee, 0x3c, 0x72, 0x85, 0xcb, 0x19, 0x57, + 0xf7, 0xb9, 0x6b, 0x25, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x31, 0x52, 0xf7, 0x94, 0xbd, 0xde, 0x7b, 0x18, + 0x29, 0x4a, 0xef, 0x8c, 0xa5, 0xc6, 0x63, 0x00 }, + { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70, 
+ 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x1c, 0x78, 0xd4, 0xb0, 0x8c, 0xe8, 0x44, 0x20, + 0x3c, 0x58, 0xf4, 0x90, 0xac, 0xc8, 0x64, 0x00 }, + { 0x9a, 0xd4, 0x06, 0x48, 0xa2, 0xec, 0x3e, 0x70, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x13, 0x76, 0xd9, 0xbc, 0x87, 0xe2, 0x4d, 0x28, + 0x3b, 0x5e, 0xf1, 0x94, 0xaf, 0xca, 0x65, 0x00 }, + { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x02, 0x64, 0xce, 0xa8, 0x9a, 0xfc, 0x56, 0x30, + 0x32, 0x54, 0xfe, 0x98, 0xaa, 0xcc, 0x66, 0x00 }, + { 0x87, 0xc9, 0x1b, 0x55, 0xbf, 0xf1, 0x23, 0x6d, + 0xea, 0xa4, 0x76, 0x38, 0xd2, 0x9c, 0x4e, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x0d, 0x6a, 0xc3, 0xa4, 0x91, 0xf6, 0x5f, 0x38, + 0x35, 0x52, 0xfb, 0x9c, 0xa9, 0xce, 0x67, 0x00 }, + { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x58, 0x30, 0x88, 0xe0, 0xf8, 0x90, 0x28, 0x40, + 0x18, 0x70, 0xc8, 0xa0, 0xb8, 0xd0, 0x68, 0x00 }, + { 0xee, 0xa0, 0x6f, 0x21, 0xf1, 0xbf, 0x70, 0x3e, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x57, 0x3e, 0x85, 0xec, 0xf3, 0x9a, 0x21, 0x48, + 0x1f, 0x76, 0xcd, 0xa4, 0xbb, 0xd2, 0x69, 0x00 }, + { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x46, 0x2c, 0x92, 0xf8, 0xee, 0x84, 0x3a, 0x50, + 0x16, 0x7c, 0xc2, 0xa8, 0xbe, 0xd4, 0x6a, 0x00 }, + { 0xf3, 0xbd, 0x72, 0x3c, 0xec, 0xa2, 0x6d, 0x23, + 0xd0, 0x9e, 0x51, 0x1f, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x49, 0x22, 0x9f, 0xf4, 0xe5, 0x8e, 0x33, 0x58, + 0x11, 0x7a, 0xc7, 0xac, 0xbd, 0xd6, 0x6b, 0x00 }, + { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 
0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x64, 0x08, 0xbc, 0xd0, 0xd4, 0xb8, 0x0c, 0x60, + 0x04, 0x68, 0xdc, 0xb0, 0xb4, 0xd8, 0x6c, 0x00 }, + { 0xc9, 0x87, 0x48, 0x06, 0xcb, 0x85, 0x4a, 0x04, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x6b, 0x06, 0xb1, 0xdc, 0xdf, 0xb2, 0x05, 0x68, + 0x03, 0x6e, 0xd9, 0xb4, 0xb7, 0xda, 0x6d, 0x00 }, + { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x7a, 0x14, 0xa6, 0xc8, 0xc2, 0xac, 0x1e, 0x70, + 0x0a, 0x64, 0xd6, 0xb8, 0xb2, 0xdc, 0x6e, 0x00 }, + { 0xd4, 0x9a, 0x55, 0x1b, 0xd6, 0x98, 0x57, 0x19, + 0xcd, 0x83, 0x4c, 0x02, 0xcf, 0x81, 0x4e, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x75, 0x1a, 0xab, 0xc4, 0xc9, 0xa6, 0x17, 0x78, + 0x0d, 0x62, 0xd3, 0xbc, 0xb1, 0xde, 0x6f, 0x00 }, + { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x06, 0x55, 0xa0, 0xf3, 0x57, 0x04, 0xf1, 0xa2, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xdf, 0xae, 0x3d, 0x4c, 0x1b, 0x6a, 0xf9, 0x88, + 0x57, 0x26, 0xb5, 0xc4, 0x93, 0xe2, 0x71, 0x00 }, + { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xce, 0xbc, 0x2a, 0x58, 0x06, 0x74, 0xe2, 0x90, + 0x5e, 0x2c, 0xba, 0xc8, 0x96, 0xe4, 0x72, 0x00 }, + { 0x1b, 0x48, 0xbd, 0xee, 0x4a, 0x19, 0xec, 0xbf, + 0xa4, 0xf7, 0x02, 0x51, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xc1, 0xb2, 0x27, 0x54, 0x0d, 0x7e, 0xeb, 0x98, + 0x59, 0x2a, 0xbf, 0xcc, 0x95, 0xe6, 0x73, 0x00 }, + { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xec, 0x98, 0x04, 0x70, 0x3c, 0x48, 0xd4, 0xa0, + 0x4c, 0x38, 0xa4, 0xd0, 0x9c, 0xe8, 0x74, 0x00 }, + { 0x21, 0x72, 0x87, 0xd4, 0x6d, 0x3e, 0xcb, 0x98, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 
0xa6, 0x53, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xe3, 0x96, 0x09, 0x7c, 0x37, 0x42, 0xdd, 0xa8, + 0x4b, 0x3e, 0xa1, 0xd4, 0x9f, 0xea, 0x75, 0x00 }, + { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xf2, 0x84, 0x1e, 0x68, 0x2a, 0x5c, 0xc6, 0xb0, + 0x42, 0x34, 0xae, 0xd8, 0x9a, 0xec, 0x76, 0x00 }, + { 0x3c, 0x6f, 0x9a, 0xc9, 0x70, 0x23, 0xd6, 0x85, + 0xb9, 0xea, 0x1f, 0x4c, 0xf5, 0xa6, 0x53, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xfd, 0x8a, 0x13, 0x64, 0x21, 0x56, 0xcf, 0xb8, + 0x45, 0x32, 0xab, 0xdc, 0x99, 0xee, 0x77, 0x00 }, + { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa8, 0xd0, 0x58, 0x20, 0x48, 0x30, 0xb8, 0xc0, + 0x68, 0x10, 0x98, 0xe0, 0x88, 0xf0, 0x78, 0x00 }, + { 0x55, 0x06, 0xee, 0xbd, 0x3e, 0x6d, 0x85, 0xd6, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xa7, 0xde, 0x55, 0x2c, 0x43, 0x3a, 0xb1, 0xc8, + 0x6f, 0x16, 0x9d, 0xe4, 0x8b, 0xf2, 0x79, 0x00 }, + { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb6, 0xcc, 0x42, 0x38, 0x5e, 0x24, 0xaa, 0xd0, + 0x66, 0x1c, 0x92, 0xe8, 0x8e, 0xf4, 0x7a, 0x00 }, + { 0x48, 0x1b, 0xf3, 0xa0, 0x23, 0x70, 0x98, 0xcb, + 0x83, 0xd0, 0x38, 0x6b, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0xb9, 0xc2, 0x4f, 0x34, 0x55, 0x2e, 0xa3, 0xd8, + 0x61, 0x1a, 0x97, 0xec, 0x8d, 0xf6, 0x7b, 0x00 }, + { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x94, 0xe8, 0x6c, 0x10, 0x64, 0x18, 0x9c, 0xe0, + 0x74, 0x08, 0x8c, 0xf0, 0x84, 0xf8, 0x7c, 0x00 }, + { 0x72, 0x21, 0xc9, 0x9a, 0x04, 0x57, 0xbf, 0xec, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 
}, + { 0x9b, 0xe6, 0x61, 0x1c, 0x6f, 0x12, 0x95, 0xe8, + 0x73, 0x0e, 0x89, 0xf4, 0x87, 0xfa, 0x7d, 0x00 }, + { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x8a, 0xf4, 0x76, 0x08, 0x72, 0x0c, 0x8e, 0xf0, + 0x7a, 0x04, 0x86, 0xf8, 0x82, 0xfc, 0x7e, 0x00 }, + { 0x6f, 0x3c, 0xd4, 0x87, 0x19, 0x4a, 0xa2, 0xf1, + 0x9e, 0xcd, 0x25, 0x76, 0xe8, 0xbb, 0x53, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27, + 0x1d, 0x1d, 0x1d, 0x1d, 0x00, 0x00, 0x00, 0x00 }, + { 0x85, 0xfa, 0x7b, 0x04, 0x79, 0x06, 0x87, 0xf8, + 0x7d, 0x02, 0x83, 0xfc, 0x81, 0xfe, 0x7f, 0x00 }, + { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0xb1, 0x59, 0x7c, 0x94, 0x36, 0xde, 0xfb, 0x13, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9e, 0x1c, 0x9a, 0x18, 0x96, 0x14, 0x92, 0x10, + 0x8e, 0x0c, 0x8a, 0x08, 0x86, 0x04, 0x82, 0x00 }, + { 0xac, 0x44, 0x61, 0x89, 0x2b, 0xc3, 0xe6, 0x0e, + 0xa2, 0x4a, 0x6f, 0x87, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x91, 0x12, 0x97, 0x14, 0x9d, 0x1e, 0x9b, 0x18, + 0x89, 0x0a, 0x8f, 0x0c, 0x85, 0x06, 0x83, 0x00 }, + { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbc, 0x38, 0xb4, 0x30, 0xac, 0x28, 0xa4, 0x20, + 0x9c, 0x18, 0x94, 0x10, 0x8c, 0x08, 0x84, 0x00 }, + { 0x96, 0x7e, 0x5b, 0xb3, 0x0c, 0xe4, 0xc1, 0x29, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb3, 0x36, 0xb9, 0x3c, 0xa7, 0x22, 0xad, 0x28, + 0x9b, 0x1e, 0x91, 0x14, 0x8f, 0x0a, 0x85, 0x00 }, + { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0x20, 0x40, 
0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa2, 0x24, 0xae, 0x28, 0xba, 0x3c, 0xb6, 0x30, + 0x92, 0x14, 0x9e, 0x18, 0x8a, 0x0c, 0x86, 0x00 }, + { 0x8b, 0x63, 0x46, 0xae, 0x11, 0xf9, 0xdc, 0x34, + 0xbf, 0x57, 0x72, 0x9a, 0x25, 0xcd, 0xe8, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xad, 0x2a, 0xa3, 0x24, 0xb1, 0x36, 0xbf, 0x38, + 0x95, 0x12, 0x9b, 0x1c, 0x89, 0x0e, 0x87, 0x00 }, + { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf8, 0x70, 0xe8, 0x60, 0xd8, 0x50, 0xc8, 0x40, + 0xb8, 0x30, 0xa8, 0x20, 0x98, 0x10, 0x88, 0x00 }, + { 0xe2, 0x0a, 0x32, 0xda, 0x5f, 0xb7, 0x8f, 0x67, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf7, 0x7e, 0xe5, 0x6c, 0xd3, 0x5a, 0xc1, 0x48, + 0xbf, 0x36, 0xad, 0x24, 0x9b, 0x12, 0x89, 0x00 }, + { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe6, 0x6c, 0xf2, 0x78, 0xce, 0x44, 0xda, 0x50, + 0xb6, 0x3c, 0xa2, 0x28, 0x9e, 0x14, 0x8a, 0x00 }, + { 0xff, 0x17, 0x2f, 0xc7, 0x42, 0xaa, 0x92, 0x7a, + 0x85, 0x6d, 0x55, 0xbd, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe9, 0x62, 0xff, 0x74, 0xc5, 0x4e, 0xd3, 0x58, + 0xb1, 0x3a, 0xa7, 0x2c, 0x9d, 0x16, 0x8b, 0x00 }, + { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc4, 0x48, 0xdc, 0x50, 0xf4, 0x78, 0xec, 0x60, + 0xa4, 0x28, 0xbc, 0x30, 0x94, 0x18, 0x8c, 0x00 }, + { 0xc5, 0x2d, 0x15, 0xfd, 0x65, 0x8d, 0xb5, 0x5d, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xcb, 0x46, 0xd1, 0x5c, 0xff, 0x72, 0xe5, 0x68, + 0xa3, 0x2e, 0xb9, 0x34, 0x97, 0x1a, 0x8d, 0x00 }, + { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xda, 0x54, 0xc6, 0x48, 0xe2, 
0x6c, 0xfe, 0x70, + 0xaa, 0x24, 0xb6, 0x38, 0x92, 0x1c, 0x8e, 0x00 }, + { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd5, 0x5a, 0xcb, 0x44, 0xe9, 0x66, 0xf7, 0x78, + 0xad, 0x22, 0xb3, 0x3c, 0x91, 0x1e, 0x8f, 0x00 }, + { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x0a, 0xff, 0xfd, 0x08, 0xf9, 0x0c, 0x0e, 0xfb, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7f, 0xee, 0x5d, 0xcc, 0x3b, 0xaa, 0x19, 0x88, + 0xf7, 0x66, 0xd5, 0x44, 0xb3, 0x22, 0x91, 0x00 }, + { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6e, 0xfc, 0x4a, 0xd8, 0x26, 0xb4, 0x02, 0x90, + 0xfe, 0x6c, 0xda, 0x48, 0xb6, 0x24, 0x92, 0x00 }, + { 0x17, 0xe2, 0xe0, 0x15, 0xe4, 0x11, 0x13, 0xe6, + 0xf1, 0x04, 0x06, 0xf3, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x61, 0xf2, 0x47, 0xd4, 0x2d, 0xbe, 0x0b, 0x98, + 0xf9, 0x6a, 0xdf, 0x4c, 0xb5, 0x26, 0x93, 0x00 }, + { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4c, 0xd8, 0x64, 0xf0, 0x1c, 0x88, 0x34, 0xa0, + 0xec, 0x78, 0xc4, 0x50, 0xbc, 0x28, 0x94, 0x00 }, + { 0x2d, 0xd8, 0xda, 0x2f, 0xc3, 0x36, 0x34, 0xc1, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x43, 0xd6, 0x69, 0xfc, 0x17, 0x82, 0x3d, 0xa8, + 0xeb, 0x7e, 0xc1, 0x54, 0xbf, 0x2a, 0x95, 0x00 }, + { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x52, 0xc4, 0x7e, 0xe8, 0x0a, 0x9c, 0x26, 0xb0, + 0xe2, 0x74, 0xce, 0x58, 0xba, 0x2c, 0x96, 0x00 }, + { 0x30, 0xc5, 0xc7, 0x32, 0xde, 0x2b, 0x29, 0xdc, + 0xec, 0x19, 0x1b, 0xee, 0x02, 0xf7, 0xf5, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, 
+ 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5d, 0xca, 0x73, 0xe4, 0x01, 0x96, 0x2f, 0xb8, + 0xe5, 0x72, 0xcb, 0x5c, 0xb9, 0x2e, 0x97, 0x00 }, + { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x08, 0x90, 0x38, 0xa0, 0x68, 0xf0, 0x58, 0xc0, + 0xc8, 0x50, 0xf8, 0x60, 0xa8, 0x30, 0x98, 0x00 }, + { 0x59, 0xac, 0xb3, 0x46, 0x90, 0x65, 0x7a, 0x8f, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x07, 0x9e, 0x35, 0xac, 0x63, 0xfa, 0x51, 0xc8, + 0xcf, 0x56, 0xfd, 0x64, 0xab, 0x32, 0x99, 0x00 }, + { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x16, 0x8c, 0x22, 0xb8, 0x7e, 0xe4, 0x4a, 0xd0, + 0xc6, 0x5c, 0xf2, 0x68, 0xae, 0x34, 0x9a, 0x00 }, + { 0x44, 0xb1, 0xae, 0x5b, 0x8d, 0x78, 0x67, 0x92, + 0xd6, 0x23, 0x3c, 0xc9, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x19, 0x82, 0x2f, 0xb4, 0x75, 0xee, 0x43, 0xd8, + 0xc1, 0x5a, 0xf7, 0x6c, 0xad, 0x36, 0x9b, 0x00 }, + { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x7e, 0x8b, 0x94, 0x61, 0xaa, 0x5f, 0x40, 0xb5, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x3b, 0xa6, 0x01, 0x9c, 0x4f, 0xd2, 0x75, 0xe8, + 0xd3, 0x4e, 0xe9, 0x74, 0xa7, 0x3a, 0x9d, 0x00 }, + { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x2a, 0xb4, 0x16, 0x88, 0x52, 0xcc, 0x6e, 0xf0, + 0xda, 0x44, 0xe6, 0x78, 0xa2, 0x3c, 0x9e, 0x00 }, + { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x53, 0x53, 0x4e, 0x4e, 0x69, 0x69, 0x74, 0x74, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x25, 0xba, 0x1b, 0x84, 0x59, 0xc6, 0x67, 0xf8, + 0xdd, 0x42, 
0xe3, 0x7c, 0xa1, 0x3e, 0x9f, 0x00 }, + { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0xda, 0x08, 0x63, 0xb1, 0xb5, 0x67, 0x0c, 0xde, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6f, 0xce, 0x2d, 0x8c, 0xeb, 0x4a, 0xa9, 0x08, + 0x67, 0xc6, 0x25, 0x84, 0xe3, 0x42, 0xa1, 0x00 }, + { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7e, 0xdc, 0x3a, 0x98, 0xf6, 0x54, 0xb2, 0x10, + 0x6e, 0xcc, 0x2a, 0x88, 0xe6, 0x44, 0xa2, 0x00 }, + { 0xc7, 0x15, 0x7e, 0xac, 0xa8, 0x7a, 0x11, 0xc3, + 0x04, 0xd6, 0xbd, 0x6f, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x71, 0xd2, 0x37, 0x94, 0xfd, 0x5e, 0xbb, 0x18, + 0x69, 0xca, 0x2f, 0x8c, 0xe5, 0x46, 0xa3, 0x00 }, + { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5c, 0xf8, 0x14, 0xb0, 0xcc, 0x68, 0x84, 0x20, + 0x7c, 0xd8, 0x34, 0x90, 0xec, 0x48, 0xa4, 0x00 }, + { 0xfd, 0x2f, 0x44, 0x96, 0x8f, 0x5d, 0x36, 0xe4, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x53, 0xf6, 0x19, 0xbc, 0xc7, 0x62, 0x8d, 0x28, + 0x7b, 0xde, 0x31, 0x94, 0xef, 0x4a, 0xa5, 0x00 }, + { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0xe0, 0x32, 0x59, 0x8b, 0x92, 0x40, 0x2b, 0xf9, + 0x19, 0xcb, 0xa0, 0x72, 0x6b, 0xb9, 0xd2, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4d, 0xea, 0x03, 0xa4, 0xd1, 0x76, 0x9f, 0x38, + 0x75, 0xd2, 0x3b, 0x9c, 0xe9, 0x4e, 0xa7, 0x00 }, + { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 
0x00, 0x80, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x18, 0xb0, 0x48, 0xe0, 0xb8, 0x10, 0xe8, 0x40, + 0x58, 0xf0, 0x08, 0xa0, 0xf8, 0x50, 0xa8, 0x00 }, + { 0x89, 0x5b, 0x2d, 0xff, 0xdc, 0x0e, 0x78, 0xaa, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x17, 0xbe, 0x45, 0xec, 0xb3, 0x1a, 0xe1, 0x48, + 0x5f, 0xf6, 0x0d, 0xa4, 0xfb, 0x52, 0xa9, 0x00 }, + { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x06, 0xac, 0x52, 0xf8, 0xae, 0x04, 0xfa, 0x50, + 0x56, 0xfc, 0x02, 0xa8, 0xfe, 0x54, 0xaa, 0x00 }, + { 0x94, 0x46, 0x30, 0xe2, 0xc1, 0x13, 0x65, 0xb7, + 0x23, 0xf1, 0x87, 0x55, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x09, 0xa2, 0x5f, 0xf4, 0xa5, 0x0e, 0xf3, 0x58, + 0x51, 0xfa, 0x07, 0xac, 0xfd, 0x56, 0xab, 0x00 }, + { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x24, 0x88, 0x7c, 0xd0, 0x94, 0x38, 0xcc, 0x60, + 0x44, 0xe8, 0x1c, 0xb0, 0xf4, 0x58, 0xac, 0x00 }, + { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x2b, 0x86, 0x71, 0xdc, 0x9f, 0x32, 0xc5, 0x68, + 0x43, 0xee, 0x19, 0xb4, 0xf7, 0x5a, 0xad, 0x00 }, + { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x3a, 0x94, 0x66, 0xc8, 0x82, 0x2c, 0xde, 0x70, + 0x4a, 0xe4, 0x16, 0xb8, 0xf2, 0x5c, 0xae, 0x00 }, + { 0xb3, 0x61, 0x17, 0xc5, 0xfb, 0x29, 0x5f, 0x8d, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x35, 0x9a, 0x6b, 0xc4, 0x89, 0x26, 0xd7, 0x78, + 0x4d, 0xe2, 0x13, 0xbc, 0xf1, 0x5e, 0xaf, 0x00 }, + { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 
}, + { 0x61, 0xae, 0xe2, 0x2d, 0x7a, 0xb5, 0xf9, 0x36, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9f, 0x2e, 0xfd, 0x4c, 0x5b, 0xea, 0x39, 0x88, + 0x17, 0xa6, 0x75, 0xc4, 0xd3, 0x62, 0xb1, 0x00 }, + { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8e, 0x3c, 0xea, 0x58, 0x46, 0xf4, 0x22, 0x90, + 0x1e, 0xac, 0x7a, 0xc8, 0xd6, 0x64, 0xb2, 0x00 }, + { 0x7c, 0xb3, 0xff, 0x30, 0x67, 0xa8, 0xe4, 0x2b, + 0x57, 0x98, 0xd4, 0x1b, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x81, 0x32, 0xe7, 0x54, 0x4d, 0xfe, 0x2b, 0x98, + 0x19, 0xaa, 0x7f, 0xcc, 0xd5, 0x66, 0xb3, 0x00 }, + { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xac, 0x18, 0xc4, 0x70, 0x7c, 0xc8, 0x14, 0xa0, + 0x0c, 0xb8, 0x64, 0xd0, 0xdc, 0x68, 0xb4, 0x00 }, + { 0x46, 0x89, 0xc5, 0x0a, 0x40, 0x8f, 0xc3, 0x0c, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa3, 0x16, 0xc9, 0x7c, 0x77, 0xc2, 0x1d, 0xa8, + 0x0b, 0xbe, 0x61, 0xd4, 0xdf, 0x6a, 0xb5, 0x00 }, + { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb2, 0x04, 0xde, 0x68, 0x6a, 0xdc, 0x06, 0xb0, + 0x02, 0xb4, 0x6e, 0xd8, 0xda, 0x6c, 0xb6, 0x00 }, + { 0x5b, 0x94, 0xd8, 0x17, 0x5d, 0x92, 0xde, 0x11, + 0x4a, 0x85, 0xc9, 0x06, 0x4c, 0x83, 0xcf, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbd, 0x0a, 0xd3, 0x64, 0x61, 0xd6, 0x0f, 0xb8, + 0x05, 0xb2, 0x6b, 0xdc, 0xd9, 0x6e, 0xb7, 0x00 }, + { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe8, 0x50, 0x98, 0x20, 0x08, 0xb0, 0x78, 0xc0, + 0x28, 0x90, 0x58, 0xe0, 0xc8, 0x70, 0xb8, 0x00 }, + { 0x32, 0xfd, 0xac, 0x63, 0x13, 0xdc, 0x8d, 0x42, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x4e, 0x4e, 
0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe7, 0x5e, 0x95, 0x2c, 0x03, 0xba, 0x71, 0xc8, + 0x2f, 0x96, 0x5d, 0xe4, 0xcb, 0x72, 0xb9, 0x00 }, + { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf6, 0x4c, 0x82, 0x38, 0x1e, 0xa4, 0x6a, 0xd0, + 0x26, 0x9c, 0x52, 0xe8, 0xce, 0x74, 0xba, 0x00 }, + { 0x2f, 0xe0, 0xb1, 0x7e, 0x0e, 0xc1, 0x90, 0x5f, + 0x70, 0xbf, 0xee, 0x21, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd4, 0x68, 0xac, 0x10, 0x24, 0x98, 0x5c, 0xe0, + 0x34, 0x88, 0x4c, 0xf0, 0xc4, 0x78, 0xbc, 0x00 }, + { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xdb, 0x66, 0xa1, 0x1c, 0x2f, 0x92, 0x55, 0xe8, + 0x33, 0x8e, 0x49, 0xf4, 0xc7, 0x7a, 0xbd, 0x00 }, + { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xca, 0x74, 0xb6, 0x08, 0x32, 0x8c, 0x4e, 0xf0, + 0x3a, 0x84, 0x46, 0xf8, 0xc2, 0x7c, 0xbe, 0x00 }, + { 0x08, 0xc7, 0x96, 0x59, 0x34, 0xfb, 0xaa, 0x65, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x4e, 0x4e, 0x53, 0x53, 0x74, 0x74, 0x69, 0x69, + 0x27, 0x27, 0x3a, 0x3a, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc5, 0x7a, 0xbb, 0x04, 0x39, 0x86, 0x47, 0xf8, + 0x3d, 0x82, 0x43, 0xfc, 0xc1, 0x7e, 0xbf, 0x00 }, + { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x67, 0xfb, 0x42, 0xde, 0x2d, 0xb1, 0x08, 0x94, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4f, 0x8e, 0xcd, 0x0c, 0x4b, 0x8a, 0xc9, 0x08, + 0x47, 0x86, 0xc5, 0x04, 0x43, 0x82, 0xc1, 0x00 }, + { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 
0xac, 0x15, 0x89, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5e, 0x9c, 0xda, 0x18, 0x56, 0x94, 0xd2, 0x10, + 0x4e, 0x8c, 0xca, 0x08, 0x46, 0x84, 0xc2, 0x00 }, + { 0x7a, 0xe6, 0x5f, 0xc3, 0x30, 0xac, 0x15, 0x89, + 0xf3, 0x6f, 0xd6, 0x4a, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x51, 0x92, 0xd7, 0x14, 0x5d, 0x9e, 0xdb, 0x18, + 0x49, 0x8a, 0xcf, 0x0c, 0x45, 0x86, 0xc3, 0x00 }, + { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7c, 0xb8, 0xf4, 0x30, 0x6c, 0xa8, 0xe4, 0x20, + 0x5c, 0x98, 0xd4, 0x10, 0x4c, 0x88, 0xc4, 0x00 }, + { 0x40, 0xdc, 0x65, 0xf9, 0x17, 0x8b, 0x32, 0xae, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x73, 0xb6, 0xf9, 0x3c, 0x67, 0xa2, 0xed, 0x28, + 0x5b, 0x9e, 0xd1, 0x14, 0x4f, 0x8a, 0xc5, 0x00 }, + { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x62, 0xa4, 0xee, 0x28, 0x7a, 0xbc, 0xf6, 0x30, + 0x52, 0x94, 0xde, 0x18, 0x4a, 0x8c, 0xc6, 0x00 }, + { 0x5d, 0xc1, 0x78, 0xe4, 0x0a, 0x96, 0x2f, 0xb3, + 0xee, 0x72, 0xcb, 0x57, 0xb9, 0x25, 0x9c, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6d, 0xaa, 0xe3, 0x24, 0x71, 0xb6, 0xff, 0x38, + 0x55, 0x92, 0xdb, 0x1c, 0x49, 0x8e, 0xc7, 0x00 }, + { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x38, 0xf0, 0xa8, 0x60, 0x18, 0xd0, 0x88, 0x40, + 0x78, 0xb0, 0xe8, 0x20, 0x58, 0x90, 0xc8, 0x00 }, + { 0x34, 0xa8, 0x0c, 0x90, 0x44, 0xd8, 0x7c, 0xe0, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x37, 0xfe, 0xa5, 0x6c, 0x13, 0xda, 0x81, 0x48, + 0x7f, 0xb6, 0xed, 0x24, 0x5b, 0x92, 0xc9, 0x00 }, + { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, 
+ 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x26, 0xec, 0xb2, 0x78, 0x0e, 0xc4, 0x9a, 0x50, + 0x76, 0xbc, 0xe2, 0x28, 0x5e, 0x94, 0xca, 0x00 }, + { 0x29, 0xb5, 0x11, 0x8d, 0x59, 0xc5, 0x61, 0xfd, + 0xd4, 0x48, 0xec, 0x70, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x29, 0xe2, 0xbf, 0x74, 0x05, 0xce, 0x93, 0x58, + 0x71, 0xba, 0xe7, 0x2c, 0x5d, 0x96, 0xcb, 0x00 }, + { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x04, 0xc8, 0x9c, 0x50, 0x34, 0xf8, 0xac, 0x60, + 0x64, 0xa8, 0xfc, 0x30, 0x54, 0x98, 0xcc, 0x00 }, + { 0x13, 0x8f, 0x2b, 0xb7, 0x7e, 0xe2, 0x46, 0xda, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x0b, 0xc6, 0x91, 0x5c, 0x3f, 0xf2, 0xa5, 0x68, + 0x63, 0xae, 0xf9, 0x34, 0x57, 0x9a, 0xcd, 0x00 }, + { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x1a, 0xd4, 0x86, 0x48, 0x22, 0xec, 0xbe, 0x70, + 0x6a, 0xa4, 0xf6, 0x38, 0x52, 0x9c, 0xce, 0x00 }, + { 0x0e, 0x92, 0x36, 0xaa, 0x63, 0xff, 0x5b, 0xc7, + 0xc9, 0x55, 0xf1, 0x6d, 0xa4, 0x38, 0x9c, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x15, 0xda, 0x8b, 0x44, 0x29, 0xe6, 0xb7, 0x78, + 0x6d, 0xa2, 0xf3, 0x3c, 0x51, 0x9e, 0xcf, 0x00 }, + { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0xdc, 0x5d, 0xc3, 0x42, 0xe2, 0x63, 0xfd, 0x7c, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbf, 0x6e, 0x1d, 0xcc, 0xfb, 0x2a, 0x59, 0x88, + 0x37, 0xe6, 0x95, 0x44, 0x73, 0xa2, 0xd1, 0x00 }, + { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61, + 0xa0, 0x21, 0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xae, 0x7c, 0x0a, 0xd8, 0xe6, 0x34, 0x42, 0x90, + 0x3e, 0xec, 0x9a, 0x48, 0x76, 0xa4, 0xd2, 0x00 }, + { 0xc1, 0x40, 0xde, 0x5f, 0xff, 0x7e, 0xe0, 0x61, + 0xa0, 0x21, 
0xbf, 0x3e, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa1, 0x72, 0x07, 0xd4, 0xed, 0x3e, 0x4b, 0x98, + 0x39, 0xea, 0x9f, 0x4c, 0x75, 0xa6, 0xd3, 0x00 }, + { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8c, 0x58, 0x24, 0xf0, 0xdc, 0x08, 0x74, 0xa0, + 0x2c, 0xf8, 0x84, 0x50, 0x7c, 0xa8, 0xd4, 0x00 }, + { 0xfb, 0x7a, 0xe4, 0x65, 0xd8, 0x59, 0xc7, 0x46, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x83, 0x56, 0x29, 0xfc, 0xd7, 0x02, 0x7d, 0xa8, + 0x2b, 0xfe, 0x81, 0x54, 0x7f, 0xaa, 0xd5, 0x00 }, + { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x92, 0x44, 0x3e, 0xe8, 0xca, 0x1c, 0x66, 0xb0, + 0x22, 0xf4, 0x8e, 0x58, 0x7a, 0xac, 0xd6, 0x00 }, + { 0xe6, 0x67, 0xf9, 0x78, 0xc5, 0x44, 0xda, 0x5b, + 0xbd, 0x3c, 0xa2, 0x23, 0x9e, 0x1f, 0x81, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9d, 0x4a, 0x33, 0xe4, 0xc1, 0x16, 0x6f, 0xb8, + 0x25, 0xf2, 0x8b, 0x5c, 0x79, 0xae, 0xd7, 0x00 }, + { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc8, 0x10, 0x78, 0xa0, 0xa8, 0x70, 0x18, 0xc0, + 0x08, 0xd0, 0xb8, 0x60, 0x68, 0xb0, 0xd8, 0x00 }, + { 0x8f, 0x0e, 0x8d, 0x0c, 0x8b, 0x0a, 0x89, 0x08, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc7, 0x1e, 0x75, 0xac, 0xa3, 0x7a, 0x11, 0xc8, + 0x0f, 0xd6, 0xbd, 0x64, 0x6b, 0xb2, 0xd9, 0x00 }, + { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd6, 0x0c, 0x62, 0xb8, 0xbe, 0x64, 0x0a, 0xd0, + 0x06, 0xdc, 0xb2, 0x68, 0x6e, 0xb4, 0xda, 0x00 }, + { 0x92, 0x13, 0x90, 0x11, 0x96, 0x17, 0x94, 0x15, + 0x87, 0x06, 0x85, 0x04, 0x83, 0x02, 0x81, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 
0x1d, 0x00, 0x00 }, + { 0xd9, 0x02, 0x6f, 0xb4, 0xb5, 0x6e, 0x03, 0xd8, + 0x01, 0xda, 0xb7, 0x6c, 0x6d, 0xb6, 0xdb, 0x00 }, + { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf4, 0x28, 0x4c, 0x90, 0x84, 0x58, 0x3c, 0xe0, + 0x14, 0xc8, 0xac, 0x70, 0x64, 0xb8, 0xdc, 0x00 }, + { 0xa8, 0x29, 0xaa, 0x2b, 0xb1, 0x30, 0xb3, 0x32, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xfb, 0x26, 0x41, 0x9c, 0x8f, 0x52, 0x35, 0xe8, + 0x13, 0xce, 0xa9, 0x74, 0x67, 0xba, 0xdd, 0x00 }, + { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xea, 0x34, 0x56, 0x88, 0x92, 0x4c, 0x2e, 0xf0, + 0x1a, 0xc4, 0xa6, 0x78, 0x62, 0xbc, 0xde, 0x00 }, + { 0xb5, 0x34, 0xb7, 0x36, 0xac, 0x2d, 0xae, 0x2f, + 0x9a, 0x1b, 0x98, 0x19, 0x83, 0x02, 0x81, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x74, 0x74, 0x69, 0x69, 0x53, 0x53, 0x4e, 0x4e, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe5, 0x3a, 0x5b, 0x84, 0x99, 0x46, 0x27, 0xf8, + 0x1d, 0xc2, 0xa3, 0x7c, 0x61, 0xbe, 0xdf, 0x00 }, + { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x0c, 0xaa, 0x5d, 0xfb, 0xae, 0x08, 0xff, 0x59, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xaf, 0x4e, 0x6d, 0x8c, 0x2b, 0xca, 0xe9, 0x08, + 0xa7, 0x46, 0x65, 0x84, 0x23, 0xc2, 0xe1, 0x00 }, + { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xbe, 0x5c, 0x7a, 0x98, 0x36, 0xd4, 0xf2, 0x10, + 0xae, 0x4c, 0x6a, 0x88, 0x26, 0xc4, 0xe2, 0x00 }, + { 0x11, 0xb7, 0x40, 0xe6, 0xb3, 0x15, 0xe2, 0x44, + 0x55, 0xf3, 0x04, 0xa2, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xb1, 0x52, 0x77, 0x94, 0x3d, 0xde, 0xfb, 0x18, + 0xa9, 0x4a, 0x6f, 0x8c, 0x25, 0xc6, 0xe3, 0x00 }, + { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 
}, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x9c, 0x78, 0x54, 0xb0, 0x0c, 0xe8, 0xc4, 0x20, + 0xbc, 0x58, 0x74, 0x90, 0x2c, 0xc8, 0xe4, 0x00 }, + { 0x2b, 0x8d, 0x7a, 0xdc, 0x94, 0x32, 0xc5, 0x63, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x93, 0x76, 0x59, 0xbc, 0x07, 0xe2, 0xcd, 0x28, + 0xbb, 0x5e, 0x71, 0x94, 0x2f, 0xca, 0xe5, 0x00 }, + { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x82, 0x64, 0x4e, 0xa8, 0x1a, 0xfc, 0xd6, 0x30, + 0xb2, 0x54, 0x7e, 0x98, 0x2a, 0xcc, 0xe6, 0x00 }, + { 0x36, 0x90, 0x67, 0xc1, 0x89, 0x2f, 0xd8, 0x7e, + 0x48, 0xee, 0x19, 0xbf, 0xf7, 0x51, 0xa6, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x8d, 0x6a, 0x43, 0xa4, 0x11, 0xf6, 0xdf, 0x38, + 0xb5, 0x52, 0x7b, 0x9c, 0x29, 0xce, 0xe7, 0x00 }, + { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd8, 0x30, 0x08, 0xe0, 0x78, 0x90, 0xa8, 0x40, + 0x98, 0x70, 0x48, 0xa0, 0x38, 0xd0, 0xe8, 0x00 }, + { 0x5f, 0xf9, 0x13, 0xb5, 0xc7, 0x61, 0x8b, 0x2d, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xd7, 0x3e, 0x05, 0xec, 0x73, 0x9a, 0xa1, 0x48, + 0x9f, 0x76, 0x4d, 0xa4, 0x3b, 0xd2, 0xe9, 0x00 }, + { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc6, 0x2c, 0x12, 0xf8, 0x6e, 0x84, 0xba, 0x50, + 0x96, 0x7c, 0x42, 0xa8, 0x3e, 0xd4, 0xea, 0x00 }, + { 0x42, 0xe4, 0x0e, 0xa8, 0xda, 0x7c, 0x96, 0x30, + 0x72, 0xd4, 0x3e, 0x98, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xc9, 0x22, 0x1f, 0xf4, 0x65, 0x8e, 0xb3, 0x58, + 0x91, 0x7a, 0x47, 0xac, 0x3d, 0xd6, 0xeb, 0x00 }, + { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xe4, 0x08, 
0x3c, 0xd0, 0x54, 0xb8, 0x8c, 0x60, + 0x84, 0x68, 0x5c, 0xb0, 0x34, 0xd8, 0xec, 0x00 }, + { 0x78, 0xde, 0x34, 0x92, 0xfd, 0x5b, 0xb1, 0x17, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xeb, 0x06, 0x31, 0xdc, 0x5f, 0xb2, 0x85, 0x68, + 0x83, 0x6e, 0x59, 0xb4, 0x37, 0xda, 0xed, 0x00 }, + { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xfa, 0x14, 0x26, 0xc8, 0x42, 0xac, 0x9e, 0x70, + 0x8a, 0x64, 0x56, 0xb8, 0x32, 0xdc, 0xee, 0x00 }, + { 0x65, 0xc3, 0x29, 0x8f, 0xe0, 0x46, 0xac, 0x0a, + 0x6f, 0xc9, 0x23, 0x85, 0xea, 0x4c, 0xa6, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0xf5, 0x1a, 0x2b, 0xc4, 0x49, 0xa6, 0x97, 0x78, + 0x8d, 0x62, 0x53, 0xbc, 0x31, 0xde, 0xef, 0x00 }, + { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0xb7, 0x0c, 0xdc, 0x67, 0x61, 0xda, 0x0a, 0xb1, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xf0, 0xe0, 0xd0, 0xc0, 0xb0, 0xa0, 0x90, 0x80, + 0x70, 0x60, 0x50, 0x40, 0x30, 0x20, 0x10, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x5f, 0xae, 0xbd, 0x4c, 0x9b, 0x6a, 0x79, 0x88, + 0xd7, 0x26, 0x35, 0xc4, 0x13, 0xe2, 0xf1, 0x00 }, + { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00, + 0xe0, 0xc0, 0xa0, 0x80, 0x60, 0x40, 0x20, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x4e, 0xbc, 0xaa, 0x58, 0x86, 0x74, 0x62, 0x90, + 0xde, 0x2c, 0x3a, 0xc8, 0x16, 0xe4, 0xf2, 0x00 }, + { 0xaa, 0x11, 0xc1, 0x7a, 0x7c, 0xc7, 0x17, 0xac, + 0x06, 0xbd, 0x6d, 0xd6, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x10, 0x20, 0x70, 0x40, 0xd0, 0xe0, 0xb0, 0x80, + 0x90, 0xa0, 0xf0, 0xc0, 0x50, 0x60, 0x30, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x41, 0xb2, 0xa7, 0x54, 0x8d, 0x7e, 0x6b, 0x98, + 0xd9, 0x2a, 0x3f, 0xcc, 0x15, 0xe6, 0xf3, 0x00 }, + { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00, + 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x6c, 0x98, 0x84, 0x70, 0xbc, 0x48, 0x54, 0xa0, + 0xcc, 0x38, 0x24, 0xd0, 0x1c, 0xe8, 0xf4, 0x00 }, + { 0x90, 0x2b, 0xfb, 0x40, 0x5b, 0xe0, 0x30, 0x8b, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x30, 0x60, 0x90, 0xc0, 0x70, 
0x20, 0xd0, 0x80, + 0xb0, 0xe0, 0x10, 0x40, 0xf0, 0xa0, 0x50, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x63, 0x96, 0x89, 0x7c, 0xb7, 0x42, 0x5d, 0xa8, + 0xcb, 0x3e, 0x21, 0xd4, 0x1f, 0xea, 0xf5, 0x00 }, + { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00, + 0x20, 0x40, 0xe0, 0x80, 0xa0, 0xc0, 0x60, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x72, 0x84, 0x9e, 0x68, 0xaa, 0x5c, 0x46, 0xb0, + 0xc2, 0x34, 0x2e, 0xd8, 0x1a, 0xec, 0xf6, 0x00 }, + { 0x8d, 0x36, 0xe6, 0x5d, 0x46, 0xfd, 0x2d, 0x96, + 0x1b, 0xa0, 0x70, 0xcb, 0xd0, 0x6b, 0xbb, 0x00 }, + { 0xd0, 0xa0, 0x30, 0x40, 0x10, 0x60, 0xf0, 0x80, + 0x50, 0x20, 0xb0, 0xc0, 0x90, 0xe0, 0x70, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x7d, 0x8a, 0x93, 0x64, 0xa1, 0x56, 0x4f, 0xb8, + 0xc5, 0x32, 0x2b, 0xdc, 0x19, 0xee, 0xf7, 0x00 }, + { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, + 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x28, 0xd0, 0xd8, 0x20, 0xc8, 0x30, 0x38, 0xc0, + 0xe8, 0x10, 0x18, 0xe0, 0x08, 0xf0, 0xf8, 0x00 }, + { 0xe4, 0x5f, 0x92, 0x29, 0x08, 0xb3, 0x7e, 0xc5, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x70, 0xe0, 0x50, 0xc0, 0x30, 0xa0, 0x10, 0x80, + 0xf0, 0x60, 0xd0, 0x40, 0xb0, 0x20, 0x90, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x27, 0xde, 0xd5, 0x2c, 0xc3, 0x3a, 0x31, 0xc8, + 0xef, 0x16, 0x1d, 0xe4, 0x0b, 0xf2, 0xf9, 0x00 }, + { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00, + 0x60, 0xc0, 0x20, 0x80, 0xe0, 0x40, 0xa0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x36, 0xcc, 0xc2, 0x38, 0xde, 0x24, 0x2a, 0xd0, + 0xe6, 0x1c, 0x12, 0xe8, 0x0e, 0xf4, 0xfa, 0x00 }, + { 0xf9, 0x42, 0x8f, 0x34, 0x15, 0xae, 0x63, 0xd8, + 0x21, 0x9a, 0x57, 0xec, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x90, 0x20, 0xf0, 0x40, 0x50, 0xe0, 0x30, 0x80, + 0x10, 0xa0, 0x70, 0xc0, 0xd0, 0x60, 0xb0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x39, 0xc2, 0xcf, 0x34, 0xd5, 0x2e, 0x23, 0xd8, + 0xe1, 0x1a, 0x17, 0xec, 0x0d, 0xf6, 0xfb, 0x00 }, + { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00, + 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x14, 0xe8, 0xec, 0x10, 0xe4, 0x18, 0x1c, 0xe0, + 0xf4, 0x08, 0x0c, 0xf0, 0x04, 0xf8, 0xfc, 0x00 }, + { 0xc3, 0x78, 0xb5, 0x0e, 0x32, 0x89, 0x44, 0xff, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0xb0, 0x60, 0x10, 0xc0, 0xf0, 0x20, 0x50, 0x80, + 0x30, 0xe0, 0x90, 0x40, 0x70, 0xa0, 0xd0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x1b, 0xe6, 0xe1, 0x1c, 0xef, 0x12, 0x15, 0xe8, 
+ 0xf3, 0x0e, 0x09, 0xf4, 0x07, 0xfa, 0xfd, 0x00 }, + { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00, + 0xa0, 0x40, 0x60, 0x80, 0x20, 0xc0, 0xe0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x0a, 0xf4, 0xf6, 0x08, 0xf2, 0x0c, 0x0e, 0xf0, + 0xfa, 0x04, 0x06, 0xf8, 0x02, 0xfc, 0xfe, 0x00 }, + { 0xde, 0x65, 0xa8, 0x13, 0x2f, 0x94, 0x59, 0xe2, + 0x3c, 0x87, 0x4a, 0xf1, 0xcd, 0x76, 0xbb, 0x00 }, + { 0x50, 0xa0, 0xb0, 0x40, 0x90, 0x60, 0x70, 0x80, + 0xd0, 0x20, 0x30, 0xc0, 0x10, 0xe0, 0xf0, 0x00 }, + { 0x69, 0x69, 0x74, 0x74, 0x4e, 0x4e, 0x53, 0x53, + 0x3a, 0x3a, 0x27, 0x27, 0x1d, 0x1d, 0x00, 0x00 }, + { 0x05, 0xfa, 0xfb, 0x04, 0xf9, 0x06, 0x07, 0xf8, + 0xfd, 0x02, 0x03, 0xfc, 0x01, 0xfe, 0xff, 0x00 } +}; +/* END CSTYLED */ +#else +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = { + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, + 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, + 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, + 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 
0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, + 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, + 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, + 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, + 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, + 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, + 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, + 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, + 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, + 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, + 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, + 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, + 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, + 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, + 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, + 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 
0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, + 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, + 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, + 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, + 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, + 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, + 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, + 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00 }, + { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, + 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, + 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, + 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, + 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, + 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, + 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 
0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, + 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, + 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, + 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, + 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, + 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, + 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, + 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, + 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 
0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, + 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, + 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, + 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, + 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b, + 0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82, + 0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85, + 0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8, + 0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 
0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf, + 0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2, + 0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1, + 0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4, + 0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3, + 0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba, + 0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a }, + { 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf, + 0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd, + 0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7, + 0x08, 
0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce, + 0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde }, + { 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51, + 0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9, + 0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc, + 0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb, + 0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2, + 0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5, + 0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8, + 0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff, + 0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 
0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6, + 0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 }, + { 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76, + 0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1, + 0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4, + 0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3, + 0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b, + 0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed, + 0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7, + 0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe, + 0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 
0x2e }, + { 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02, + 0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac, + 0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab, + 0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2, + 0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5, + 0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88, + 0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f, + 0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86, + 0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25, + 0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 
0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81, + 0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94, + 0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93, + 0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a, + 0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a }, + { 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38, + 0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d, + 0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27, + 0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e, + 0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7, + 0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29, + 0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 }, + { 0x00, 0x4e, 0x9c, 0xd2, 
0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c, + 0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b, + 0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32, + 0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 }, + { 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea, + 0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35, + 0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18, + 0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f, + 0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16, + 0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0, + 0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11, + 0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 
0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04, + 0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03, + 0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a, + 0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a }, + { 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd, + 0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d, + 0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57, + 0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e, + 0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce }, + { 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4, + 0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59, + 0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c, + 0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x98, 
0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b, + 0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42, + 0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 }, + { 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9, + 0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45, + 0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68, + 0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f, + 0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66, + 0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83, + 0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61, + 0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74, + 0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 
0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73, + 0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a, + 0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a }, + { 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e, + 0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d, + 0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e, + 0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e }, + { 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2, + 0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89, + 0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c, + 0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b, + 0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 
0x8b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92, + 0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 }, + { 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf, + 0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95, + 0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8, + 0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf, + 0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6, + 0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85, + 0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1, + 0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4, + 0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3, + 0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 
0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa, + 0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad, + 0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7, + 0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe, + 0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e }, + { 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1, + 0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9, + 0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec, + 0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb, + 0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2, + 0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 }, + { 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec, + 0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 }, + { 0x00, 0x70, 0xe0, 0x90, 
0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5, + 0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8, + 0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf, + 0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6, + 0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6, + 0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1, + 0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3, + 0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda, + 0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 
0xdd, + 0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67, + 0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e, + 0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04, + 0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69, + 0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c, + 0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b, + 0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19, + 0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75, + 0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 
0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58, + 0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f, + 0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56, + 0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23, + 0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51, + 0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44, + 0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43, + 0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a, + 0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d, + 0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 
0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17, + 0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e, + 0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e }, + { 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57, + 0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19, + 0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c, + 0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b, + 0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02, + 0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 }, + { 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a, + 0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05, + 0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28, + 0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 
0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f, + 0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26, + 0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70, + 0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34, + 0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33, + 0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a, + 0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d, + 0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47, + 0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f }, + { 0x00, 
0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e, + 0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e }, + { 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3, + 0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49, + 0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c, + 0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b, + 0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52, + 0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 }, + { 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee, + 0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55, + 0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78, + 0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f, + 0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 
0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76, + 0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4, + 0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71, + 0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64, + 0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63, + 0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a, + 0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a }, + { 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9, + 0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d, + 0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37, + 0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e, + 0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae }, + { 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 
0xa0, + 0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39, + 0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c, + 0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b, + 0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22, + 0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 }, + { 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd, + 0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25, + 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, + 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, + 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, + 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 
0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, + 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, + 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, + 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, + 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, + 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, + 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, + 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, + 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 
0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, + 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, + 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, + 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, + 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, + 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, + 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, + 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 
0x69 }, + { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, + 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, + 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, + 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, + 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, + 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, + 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, + 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, + 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 
0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, + 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, + 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, + 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, + 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, + 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, + 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, + 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfd, 0xfa, 0x07, 
0xf4, 0x09, 0x0e, 0xf3,
+	0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b },
+	{ 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+	0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+	{ 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa,
+	0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a },
+	{ 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c,
+	0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 },
+	{ 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd,
+	0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 }
+};
+/* END CSTYLED */
+#endif // ENDIANNESS
+#endif /* defined(__powerpc__) */
diff --git a/module/zfs/vdev_raidz_math_powerpc_altivec_common.h b/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
new file mode 100644
index 000000000000..3842f5fd637c
--- /dev/null
+++ b/module/zfs/vdev_raidz_math_powerpc_altivec_common.h
@@ -0,0 +1,690 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/simd.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "%[w"#REG"]"
+#define VR1_(_1, REG, ...) "%[w"#REG"]"
+#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
+#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
+#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
+
+/*
+ * Here we need registers that are not used otherwise. They appear only
+ * in ASM that is never reached, for the cases with more registers than
+ * required, but GCC still has to see valid constraints for that dead
+ * code, and duplicate constraints are illegal. The "register" number is
+ * used as the name.
+ */
+
+#define VR0(r...) VR0_(r)
+#define VR1(r...) VR1_(r)
+#define VR2(r...) VR2_(r, 36)
+#define VR3(r...) VR3_(r, 36, 35)
+#define VR4(r...) VR4_(r, 36, 35, 34, 33)
+#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
+#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
+#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)
+
+#define VR(X) "%[w"#X"]"
+
+#define RVR0_(REG, ...) [w##REG] "v" (w##REG)
+#define RVR1_(_1, REG, ...)
[w##REG] "v" (w##REG) +#define RVR2_(_1, _2, REG, ...) [w##REG] "v" (w##REG) +#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "v" (w##REG) +#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "v" (w##REG) +#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "v" (w##REG) +#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "v" (w##REG) +#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "v" (w##REG) + +#define RVR0(r...) RVR0_(r) +#define RVR1(r...) RVR1_(r) +#define RVR2(r...) RVR2_(r, 36) +#define RVR3(r...) RVR3_(r, 36, 35) +#define RVR4(r...) RVR4_(r, 36, 35, 34, 33) +#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32) +#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31) +#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define RVR(X) [w##X] "v" (w##X) + +#define WVR0_(REG, ...) [w##REG] "=v" (w##REG) +#define WVR1_(_1, REG, ...) [w##REG] "=v" (w##REG) +#define WVR2_(_1, _2, REG, ...) [w##REG] "=v" (w##REG) +#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=v" (w##REG) +#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=v" (w##REG) +#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=v" (w##REG) +#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=v" (w##REG) +#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=v" (w##REG) + +#define WVR0(r...) WVR0_(r) +#define WVR1(r...) WVR1_(r) +#define WVR2(r...) WVR2_(r, 36) +#define WVR3(r...) WVR3_(r, 36, 35) +#define WVR4(r...) WVR4_(r, 36, 35, 34, 33) +#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32) +#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31) +#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define WVR(X) [w##X] "=v" (w##X) + +#define UVR0_(REG, ...) [w##REG] "+&v" (w##REG) +#define UVR1_(_1, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR2_(_1, _2, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&v" (w##REG) +#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&v" (w##REG) + +#define UVR0(r...) UVR0_(r) +#define UVR1(r...) UVR1_(r) +#define UVR2(r...) UVR2_(r, 36) +#define UVR3(r...) UVR3_(r, 36, 35) +#define UVR4(r...) UVR4_(r, 36, 35, 34, 33) +#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32) +#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31) +#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30) + +#define UVR(X) [w##X] "+&v" (w##X) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +#define OFFSET(ptr, val) (((unsigned char *)(ptr))+val) + +extern const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 16 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + +#define XOR_ACC(src, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "lvx 21,0,%[SRC0]\n" \ + "lvx 20,0,%[SRC1]\n" \ + "lvx 19,0,%[SRC2]\n" \ + "lvx 18,0,%[SRC3]\n" \ + "vxor " VR0(r) "," VR0(r) ",21\n" \ + "vxor " VR1(r) "," VR1(r) ",20\n" \ + "vxor " VR2(r) "," VR2(r) ",19\n" \ + "vxor " VR3(r) "," VR3(r) ",18\n" \ + "lvx 21,0,%[SRC4]\n" \ + "lvx 20,0,%[SRC5]\n" \ + "lvx 19,0,%[SRC6]\n" \ + "lvx 18,0,%[SRC7]\n" \ + "vxor " VR4(r) "," VR4(r) ",21\n" \ + "vxor " VR5(r) "," VR5(r) ",20\n" \ + "vxor " VR6(r) "," VR6(r) ",19\n" \ + "vxor " VR7(r) "," VR7(r) ",18\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \ + UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48))), \ + [SRC4] "r" ((OFFSET(src, 64))), \ + [SRC5] "r" ((OFFSET(src, 80))), \ + [SRC6] "r" ((OFFSET(src, 96))), \ + [SRC7] "r" ((OFFSET(src, 112))) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 4: \ + __asm( \ + "lvx 21,0,%[SRC0]\n" \ + "lvx 20,0,%[SRC1]\n" \ + "lvx 19,0,%[SRC2]\n" \ + "lvx 18,0,%[SRC3]\n" \ + "vxor " VR0(r) "," VR0(r) ",21\n" \ + "vxor " VR1(r) "," VR1(r) ",20\n" \ + "vxor " VR2(r) "," VR2(r) ",19\n" \ + "vxor " VR3(r) "," VR3(r) ",18\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48))) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 2: \ + __asm( \ + "lvx 21,0,%[SRC0]\n" \ + "lvx 20,0,%[SRC1]\n" \ + "vxor " VR0(r) "," VR0(r) ",21\n" \ + "vxor " VR1(r) "," VR1(r) ",20\n" \ + : UVR0(r), UVR1(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))) \ + : "v20", "v21"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vxor " VR4(r) "," VR4(r) "," VR0(r) "\n" \ + "vxor " VR5(r) "," VR5(r) "," VR1(r) "\n" \ + "vxor " VR6(r) "," VR6(r) "," VR2(r) "\n" \ + "vxor " VR7(r) "," VR7(r) "," VR3(r) "\n" \ + : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 4: \ + __asm( \ + "vxor " VR2(r) "," VR2(r) "," VR0(r) "\n" \ + "vxor " VR3(r) "," VR3(r) "," VR1(r) "\n" \ + : UVR2(r), UVR3(r) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ + "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \ + "vxor " VR4(r) "," VR4(r) "," VR4(r) "\n" \ + "vxor " VR5(r) "," VR5(r) "," VR5(r) "\n" \ + "vxor " VR6(r) "," VR6(r) "," VR6(r) "\n" \ + "vxor " VR7(r) "," VR7(r) "," VR7(r) "\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r)); \ + break; \ + case 4: \ + __asm( \ + "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vxor " VR2(r) "," VR2(r) "," VR2(r) "\n" \ + "vxor " VR3(r) "," VR3(r) "," VR3(r) "\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ + break; \ + case 2: \ + __asm( \ + "vxor " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vxor " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + : WVR0(r), WVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define COPY(r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "vor " VR4(r) "," VR0(r) "," VR0(r) "\n" \ + "vor " VR5(r) "," VR1(r) "," VR1(r) "\n" \ + "vor " VR6(r) "," VR2(r) "," VR2(r) "\n" \ + "vor " VR7(r) "," VR3(r) "," VR3(r) "\n" \ + : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ + : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ + break; \ + case 4: \ + __asm( \ + "vor " VR2(r) "," VR0(r) "," VR0(r) "\n" \ + "vor " VR3(r) "," VR1(r) "," VR1(r) "\n" \ + : WVR2(r), WVR3(r) \ + : RVR0(r), RVR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "lvx " VR0(r) " ,0,%[SRC0]\n" \ + "lvx " VR1(r) " ,0,%[SRC1]\n" \ + "lvx " VR2(r) " ,0,%[SRC2]\n" \ + "lvx " VR3(r) " ,0,%[SRC3]\n" \ + "lvx " VR4(r) " ,0,%[SRC4]\n" \ + "lvx " VR5(r) " ,0,%[SRC5]\n" \ + "lvx " VR6(r) " ,0,%[SRC6]\n" \ + "lvx " VR7(r) " ,0,%[SRC7]\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ + WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48))), \ + [SRC4] "r" ((OFFSET(src, 64))), \ + [SRC5] "r" ((OFFSET(src, 80))), \ + [SRC6] "r" ((OFFSET(src, 96))), \ + [SRC7] "r" ((OFFSET(src, 112)))); \ + break; \ + case 4: \ + __asm( \ + "lvx " VR0(r) " ,0,%[SRC0]\n" \ + "lvx " VR1(r) " ,0,%[SRC1]\n" \ + "lvx " VR2(r) " ,0,%[SRC2]\n" \ + "lvx " VR3(r) " ,0,%[SRC3]\n" \ + : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16))), \ + [SRC2] "r" ((OFFSET(src, 32))), \ + [SRC3] "r" ((OFFSET(src, 48)))); \ + break; \ + case 2: \ + __asm( \ + "lvx " VR0(r) " ,0,%[SRC0]\n" \ + "lvx " VR1(r) " ,0,%[SRC1]\n" \ + : WVR0(r), WVR1(r) \ + : [SRC0] "r" ((OFFSET(src, 0))), \ + [SRC1] "r" ((OFFSET(src, 16)))); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "stvx " VR0(r) " ,0,%[DST0]\n" \ + "stvx " VR1(r) " ,0,%[DST1]\n" \ + "stvx " VR2(r) " ,0,%[DST2]\n" \ + "stvx " VR3(r) " ,0,%[DST3]\n" \ + "stvx " VR4(r) " ,0,%[DST4]\n" \ + "stvx " VR5(r) " ,0,%[DST5]\n" \ + "stvx " VR6(r) " ,0,%[DST6]\n" \ + "stvx " VR7(r) " ,0,%[DST7]\n" \ + : : [DST0] "r" ((OFFSET(dst, 0))), \ + [DST1] "r" ((OFFSET(dst, 16))), \ + [DST2] "r" ((OFFSET(dst, 32))), \ + [DST3] "r" ((OFFSET(dst, 48))), \ + [DST4] "r" ((OFFSET(dst, 64))), \ + [DST5] "r" ((OFFSET(dst, 80))), \ + [DST6] "r" ((OFFSET(dst, 96))), \ + [DST7] "r" ((OFFSET(dst, 112))), \ + RVR0(r), RVR1(r), RVR2(r), RVR3(r), \ + RVR4(r), RVR5(r), RVR6(r), RVR7(r) \ + : "memory"); \ + break; \ + case 4: \ + __asm( \ + "stvx " VR0(r) " ,0,%[DST0]\n" \ + "stvx " VR1(r) " ,0,%[DST1]\n" \ + "stvx " VR2(r) " ,0,%[DST2]\n" \ + "stvx " VR3(r) " ,0,%[DST3]\n" \ + : : [DST0] "r" ((OFFSET(dst, 0))), \ + [DST1] "r" ((OFFSET(dst, 16))), \ + [DST2] "r" ((OFFSET(dst, 32))), \ + [DST3] "r" ((OFFSET(dst, 48))), \ + RVR0(r), RVR1(r), RVR2(r), RVR3(r) \ + : "memory"); \ + break; \ + case 2: \ + __asm( \ + "stvx " VR0(r) " ,0,%[DST0]\n" \ + "stvx " VR1(r) " ,0,%[DST1]\n" \ + : : [DST0] "r" ((OFFSET(dst, 0))), \ + [DST1] "r" ((OFFSET(dst, 16))), \ + RVR0(r), RVR1(r) : "memory"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +/* + * Unfortunately cannot use the macro, because GCC + * will try to use the macro name and not value + * later on... 
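+ * (Example: in MUL2_SETUP() below, the asm template must spell out the
+ * literal vector register numbers 16 and 17; after it runs, v16 holds
+ * 0x1d, the low byte of the GF(2^8) reduction polynomial, and v17 holds
+ * 0x00, which is exactly what the _1d and _00 names below record.)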
+ * Kept as a reference to what a numbered variable is + */ +#define _00 "17" +#define _1d "16" +#define _temp0 "19" +#define _temp1 "18" + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "vspltisb " VR(16) ",14\n" \ + "vspltisb " VR(17) ",15\n" \ + "vaddubm " VR(16) "," VR(17) "," VR(16) "\n" \ + "vxor " VR(17) "," VR(17) "," VR(17) "\n" \ + : WVR(16), WVR(17)); \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ + "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ + "vcmpgtsb 21," VR(17) "," VR2(r) "\n" \ + "vcmpgtsb 20," VR(17) "," VR3(r) "\n" \ + "vand 19,19," VR(16) "\n" \ + "vand 18,18," VR(16) "\n" \ + "vand 21,21," VR(16) "\n" \ + "vand 20,20," VR(16) "\n" \ + "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vaddubm " VR2(r) "," VR2(r) "," VR2(r) "\n" \ + "vaddubm " VR3(r) "," VR3(r) "," VR3(r) "\n" \ + "vxor " VR0(r) ",19," VR0(r) "\n" \ + "vxor " VR1(r) ",18," VR1(r) "\n" \ + "vxor " VR2(r) ",21," VR2(r) "\n" \ + "vxor " VR3(r) ",20," VR3(r) "\n" \ + : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ + : RVR(17), RVR(16) \ + : "v18", "v19", "v20", "v21"); \ + break; \ + case 2: \ + __asm( \ + "vcmpgtsb 19," VR(17) "," VR0(r) "\n" \ + "vcmpgtsb 18," VR(17) "," VR1(r) "\n" \ + "vand 19,19," VR(16) "\n" \ + "vand 18,18," VR(16) "\n" \ + "vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n" \ + "vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n" \ + "vxor " VR0(r) ",19," VR0(r) "\n" \ + "vxor " VR1(r) ",18," VR1(r) "\n" \ + : UVR0(r), UVR1(r) \ + : RVR(17), RVR(16) \ + : "v18", "v19"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +/* + * Unfortunately cannot use the macro, because GCC + * will try to use the macro name and not value + * later on... + * Kept as a reference to what a register is + * (here we're using actual registers for the + * clobbered ones) + */ +#define _0f "15" +#define _a_save "14" +#define _b_save "13" +#define _lt_mod_a "12" +#define _lt_clmul_a "11" +#define _lt_mod_b "10" +#define _lt_clmul_b "15" + +#define _MULx2(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + /* lts for upper part */ \ + "vspltisb 15,15\n" \ + "lvx 10,0,%[lt0]\n" \ + "lvx 11,0,%[lt1]\n" \ + /* upper part */ \ + "vand 14," VR0(r) ",15\n" \ + "vand 13," VR1(r) ",15\n" \ + "vspltisb 15,4\n" \ + "vsrab " VR0(r) "," VR0(r) ",15\n" \ + "vsrab " VR1(r) "," VR1(r) ",15\n" \ + \ + "vperm 12,10,10," VR0(r) "\n" \ + "vperm 10,10,10," VR1(r) "\n" \ + "vperm 15,11,11," VR0(r) "\n" \ + "vperm 11,11,11," VR1(r) "\n" \ + \ + "vxor " VR0(r) ",15,12\n" \ + "vxor " VR1(r) ",11,10\n" \ + /* lts for lower part */ \ + "lvx 10,0,%[lt2]\n" \ + "lvx 15,0,%[lt3]\n" \ + /* lower part */ \ + "vperm 12,10,10,14\n" \ + "vperm 10,10,10,13\n" \ + "vperm 11,15,15,14\n" \ + "vperm 15,15,15,13\n" \ + \ + "vxor " VR0(r) "," VR0(r) ",12\n" \ + "vxor " VR1(r) "," VR1(r) ",10\n" \ + "vxor " VR0(r) "," VR0(r) ",11\n" \ + "vxor " VR1(r) "," VR1(r) ",15\n" \ + : UVR0(r), UVR1(r) \ + : [lt0] "r" (&(gf_clmul_mod_lt[4*(c)+0][0])), \ + [lt1] "r" (&(gf_clmul_mod_lt[4*(c)+1][0])), \ + [lt2] "r" (&(gf_clmul_mod_lt[4*(c)+2][0])), \ + [lt3] "r" (&(gf_clmul_mod_lt[4*(c)+3][0])) \ + : "v10", "v11", "v12", "v13", "v14", "v15"); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_23(r)); \ + _MULx2(c, R_01(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + +/* Overkill... */ +#if 0 // defined(_KERNEL) +#define GEN_X_DEFINE_0_3() \ +register unsigned char w0 asm("0") __attribute__((vector_size(16))); \ +register unsigned char w1 asm("1") __attribute__((vector_size(16))); \ +register unsigned char w2 asm("2") __attribute__((vector_size(16))); \ +register unsigned char w3 asm("3") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_4_5() \ +register unsigned char w4 asm("4") __attribute__((vector_size(16))); \ +register unsigned char w5 asm("5") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_6_7() \ +register unsigned char w6 asm("6") __attribute__((vector_size(16))); \ +register unsigned char w7 asm("7") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_8_9() \ +register unsigned char w8 asm("8") __attribute__((vector_size(16))); \ +register unsigned char w9 asm("9") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_10_11() \ +register unsigned char w10 asm("10") __attribute__((vector_size(16))); \ +register unsigned char w11 asm("11") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_12_15() \ +register unsigned char w12 asm("12") __attribute__((vector_size(16))); \ +register unsigned char w13 asm("13") __attribute__((vector_size(16))); \ +register unsigned char w14 asm("14") __attribute__((vector_size(16))); \ +register unsigned char w15 asm("15") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_16() \ +register unsigned char w16 asm("16") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_17() \ +register unsigned char w17 asm("17") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_18_21() \ +register unsigned char w18 asm("18") __attribute__((vector_size(16))); \ +register unsigned char w19 asm("19") __attribute__((vector_size(16))); \ +register unsigned char w20 asm("20") __attribute__((vector_size(16))); \ +register unsigned char w21 asm("21") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_22_23() \ +register unsigned char w22 asm("22") __attribute__((vector_size(16))); \ +register unsigned char w23 asm("23") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_24_27() \ +register unsigned char w24 asm("24") __attribute__((vector_size(16))); \ +register unsigned char w25 asm("25") __attribute__((vector_size(16))); \ +register unsigned char w26 asm("26") __attribute__((vector_size(16))); \ +register unsigned char w27 asm("27") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_28_30() \ +register unsigned char w28 asm("28") __attribute__((vector_size(16))); \ +register unsigned char w29 asm("29") __attribute__((vector_size(16))); \ +register unsigned char w30 asm("30") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_31() \ +register unsigned char w31 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_32() \ +register unsigned char w32 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_33_36() \ +register unsigned char w33 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w34 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w35 asm("31") __attribute__((vector_size(16))); \ +register unsigned char w36 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_37_38() \ +register unsigned char w37 asm("31") 
__attribute__((vector_size(16))); \ +register unsigned char w38 asm("31") __attribute__((vector_size(16))); +#define GEN_X_DEFINE_ALL() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_10_11() \ + GEN_X_DEFINE_12_15() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_18_21() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_24_27() \ + GEN_X_DEFINE_28_30() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() \ + GEN_X_DEFINE_37_38() +#else +#define GEN_X_DEFINE_0_3() \ + unsigned char w0 __attribute__((vector_size(16))); \ + unsigned char w1 __attribute__((vector_size(16))); \ + unsigned char w2 __attribute__((vector_size(16))); \ + unsigned char w3 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_4_5() \ + unsigned char w4 __attribute__((vector_size(16))); \ + unsigned char w5 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_6_7() \ + unsigned char w6 __attribute__((vector_size(16))); \ + unsigned char w7 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_8_9() \ + unsigned char w8 __attribute__((vector_size(16))); \ + unsigned char w9 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_10_11() \ + unsigned char w10 __attribute__((vector_size(16))); \ + unsigned char w11 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_12_15() \ + unsigned char w12 __attribute__((vector_size(16))); \ + unsigned char w13 __attribute__((vector_size(16))); \ + unsigned char w14 __attribute__((vector_size(16))); \ + unsigned char w15 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_16() \ + unsigned char w16 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_17() \ + unsigned char w17 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_18_21() \ + unsigned char w18 __attribute__((vector_size(16))); \ + unsigned char w19 __attribute__((vector_size(16))); \ + unsigned char w20 __attribute__((vector_size(16))); \ + unsigned char w21 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_22_23() \ + unsigned char w22 __attribute__((vector_size(16))); \ + unsigned char w23 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_24_27() \ + unsigned char w24 __attribute__((vector_size(16))); \ + unsigned char w25 __attribute__((vector_size(16))); \ + unsigned char w26 __attribute__((vector_size(16))); \ + unsigned char w27 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_28_30() \ + unsigned char w28 __attribute__((vector_size(16))); \ + unsigned char w29 __attribute__((vector_size(16))); \ + unsigned char w30 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_31() \ + unsigned char w31 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_32() \ + unsigned char w32 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_33_36() \ + unsigned char w33 __attribute__((vector_size(16))); \ + unsigned char w34 __attribute__((vector_size(16))); \ + unsigned char w35 __attribute__((vector_size(16))); \ + unsigned char w36 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_37_38() \ + unsigned char w37 __attribute__((vector_size(16))); \ + unsigned char w38 __attribute__((vector_size(16))); +#define GEN_X_DEFINE_ALL() \ + GEN_X_DEFINE_0_3() \ + GEN_X_DEFINE_4_5() \ + GEN_X_DEFINE_6_7() \ + GEN_X_DEFINE_8_9() \ + GEN_X_DEFINE_10_11() \ + GEN_X_DEFINE_12_15() \ + GEN_X_DEFINE_16() \ + GEN_X_DEFINE_17() \ + GEN_X_DEFINE_18_21() \ + GEN_X_DEFINE_22_23() \ + GEN_X_DEFINE_24_27() \ + GEN_X_DEFINE_28_30() \ + GEN_X_DEFINE_31() \ + GEN_X_DEFINE_32() \ + GEN_X_DEFINE_33_36() \ + 
GEN_X_DEFINE_37_38()
+#endif
diff --git a/module/zfs/vdev_raidz_math_scalar.c b/module/zfs/vdev_raidz_math_scalar.c
new file mode 100644
index 000000000000..cd742e146ca6
--- /dev/null
+++ b/module/zfs/vdev_raidz_math_scalar.c
@@ -0,0 +1,337 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/vdev_raidz_impl.h>
+
+/*
+ * Provide native CPU scalar routines.
+ * Supports 32-bit and 64-bit CPUs.
+ */
+#if ((~(0x0ULL)) >> 24) == 0xffULL
+#define ELEM_SIZE 4
+typedef uint32_t iv_t;
+#elif ((~(0x0ULL)) >> 56) == 0xffULL
+#define ELEM_SIZE 8
+typedef uint64_t iv_t;
+#endif
+
+/*
+ * Vector type used in the scalar implementation
+ *
+ * The union is expected to be of native CPU register size. Since addition
+ * uses the XOR operation, it can be performed on all byte elements at once.
+ * Multiplication requires per-byte access.
+ */
+typedef union {
+	iv_t e;
+	uint8_t b[ELEM_SIZE];
+} v_t;
+
+/*
+ * Precomputed lookup tables for multiplication by a constant
+ *
+ * The reconstruction path requires multiplication by constant factors.
+ * Instead of performing a two-step lookup (log & exp tables), a direct
+ * lookup can be used. Multiplication of element 'a' by a constant 'c' is
+ * obtained as:
+ *
+ *	r = vdev_raidz_mul_lt[c_log][a];
+ *
+ * where c_log = vdev_raidz_log2[c]. The logs of the coefficient factors
+ * are used because they are faster to obtain while solving the syndrome
+ * equations.
+ *
+ * PERFORMANCE NOTE:
+ * Even though the complete lookup table uses 64 KiB, only a relatively
+ * small portion of it is used at any one time. The following shows the
+ * number of accessed bytes for the different cases:
+ *	- 1 failed disk: 256B (1 mul. coefficient)
+ *	- 2 failed disks: 512B (2 mul. coefficients)
+ *	- 3 failed disks: 1536B (6 mul. coefficients)
+ *
+ * The size of the actually accessed lookup table regions is larger only
+ * for reconstruction of 3 failed disks, when compared to the traditional
+ * log/exp method. But since the result is obtained in one lookup step,
+ * performance is doubled.
+ */
+static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
+
+static void
+raidz_init_scalar(void)
+{
+	int c, i;
+	for (c = 0; c < 256; c++)
+		for (i = 0; i < 256; i++)
+			vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
+
+}
+
+#define PREFETCHNTA(ptr, offset) {}
+#define PREFETCH(ptr, offset) {}
+
+#define XOR_ACC(src, acc) acc.e ^= ((v_t *)src)[0].e
+#define XOR(src, acc) acc.e ^= src.e
+#define ZERO(acc) acc.e = 0
+#define COPY(src, dst) dst = src
+#define LOAD(src, val) val = ((v_t *)src)[0]
+#define STORE(dst, val) ((v_t *)dst)[0] = val
+
+/*
+ * Constants used for optimized multiplication by 2.
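+ * (With ELEM_SIZE == 8, one MUL2() below doubles eight GF(2^8) bytes at
+ * once: 'msb' picks out each byte's top bit, (_mask << 1) - (_mask >> 7)
+ * widens that bit into 0xff or 0x00 per byte, 'mask' clears the bits the
+ * shift carried across byte boundaries, and 'mod' applies the 0x1d
+ * polynomial reduction to exactly the bytes that overflowed.)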
+ */ +static const struct { + iv_t mod; + iv_t mask; + iv_t msb; +} scalar_mul2_consts = { +#if ELEM_SIZE == 8 + .mod = 0x1d1d1d1d1d1d1d1dULL, + .mask = 0xfefefefefefefefeULL, + .msb = 0x8080808080808080ULL, +#else + .mod = 0x1d1d1d1dULL, + .mask = 0xfefefefeULL, + .msb = 0x80808080ULL, +#endif +}; + +#define MUL2_SETUP() {} + +#define MUL2(a) \ +{ \ + iv_t _mask; \ + \ + _mask = (a).e & scalar_mul2_consts.msb; \ + _mask = (_mask << 1) - (_mask >> 7); \ + (a).e = ((a).e << 1) & scalar_mul2_consts.mask; \ + (a).e = (a).e ^ (_mask & scalar_mul2_consts.mod); \ +} + +#define MUL4(a) \ +{ \ + MUL2(a); \ + MUL2(a); \ +} + +#define MUL(c, a) \ +{ \ + const uint8_t *mul_lt = vdev_raidz_mul_lt[c]; \ + switch (ELEM_SIZE) { \ + case 8: \ + a.b[7] = mul_lt[a.b[7]]; \ + a.b[6] = mul_lt[a.b[6]]; \ + a.b[5] = mul_lt[a.b[5]]; \ + a.b[4] = mul_lt[a.b[4]]; \ + /* falls through */ \ + case 4: \ + a.b[3] = mul_lt[a.b[3]]; \ + a.b[2] = mul_lt[a.b[2]]; \ + a.b[1] = mul_lt[a.b[1]]; \ + a.b[0] = mul_lt[a.b[0]]; \ + break; \ + } \ +} + +#define raidz_math_begin() {} +#define raidz_math_end() {} + +#define SYN_STRIDE 1 + +#define ZERO_DEFINE() v_t d0 +#define ZERO_STRIDE 1 +#define ZERO_D d0 + +#define COPY_DEFINE() v_t d0 +#define COPY_STRIDE 1 +#define COPY_D d0 + +#define ADD_DEFINE() v_t d0 +#define ADD_STRIDE 1 +#define ADD_D d0 + +#define MUL_DEFINE() v_t d0 +#define MUL_STRIDE 1 +#define MUL_D d0 + +#define GEN_P_STRIDE 1 +#define GEN_P_DEFINE() v_t p0 +#define GEN_P_P p0 + +#define GEN_PQ_STRIDE 1 +#define GEN_PQ_DEFINE() v_t d0, c0 +#define GEN_PQ_D d0 +#define GEN_PQ_C c0 + +#define GEN_PQR_STRIDE 1 +#define GEN_PQR_DEFINE() v_t d0, c0 +#define GEN_PQR_D d0 +#define GEN_PQR_C c0 + +#define SYN_Q_DEFINE() v_t d0, x0 +#define SYN_Q_D d0 +#define SYN_Q_X x0 + + +#define SYN_R_DEFINE() v_t d0, x0 +#define SYN_R_D d0 +#define SYN_R_X x0 + + +#define SYN_PQ_DEFINE() v_t d0, x0 +#define SYN_PQ_D d0 +#define SYN_PQ_X x0 + + +#define REC_PQ_STRIDE 1 +#define REC_PQ_DEFINE() v_t x0, y0, t0 +#define REC_PQ_X x0 +#define REC_PQ_Y y0 +#define REC_PQ_T t0 + + +#define SYN_PR_DEFINE() v_t d0, x0 +#define SYN_PR_D d0 +#define SYN_PR_X x0 + +#define REC_PR_STRIDE 1 +#define REC_PR_DEFINE() v_t x0, y0, t0 +#define REC_PR_X x0 +#define REC_PR_Y y0 +#define REC_PR_T t0 + + +#define SYN_QR_DEFINE() v_t d0, x0 +#define SYN_QR_D d0 +#define SYN_QR_X x0 + + +#define REC_QR_STRIDE 1 +#define REC_QR_DEFINE() v_t x0, y0, t0 +#define REC_QR_X x0 +#define REC_QR_Y y0 +#define REC_QR_T t0 + + +#define SYN_PQR_DEFINE() v_t d0, x0 +#define SYN_PQR_D d0 +#define SYN_PQR_X x0 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() v_t x0, y0, z0, xs0, ys0 +#define REC_PQR_X x0 +#define REC_PQR_Y y0 +#define REC_PQR_Z z0 +#define REC_PQR_XS xs0 +#define REC_PQR_YS ys0 + +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(scalar); +DEFINE_REC_METHODS(scalar); + +boolean_t +raidz_will_scalar_work(void) +{ + return (B_TRUE); /* always */ +} + +const raidz_impl_ops_t vdev_raidz_scalar_impl = { + .init = raidz_init_scalar, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(scalar), + .rec = RAIDZ_REC_METHODS(scalar), + .is_supported = &raidz_will_scalar_work, + .name = "scalar" +}; + +/* Powers of 2 in the RAID-Z Galois field. 
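+ * For nonzero a and b, these two tables satisfy
+ * vdev_raidz_pow2[vdev_raidz_log2[a]] == a and
+ * gf_mul(a, b) == vdev_raidz_pow2[(vdev_raidz_log2[a] +
+ * vdev_raidz_log2[b]) % 255].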
*/ +const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, + 0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26, + 0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9, + 0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0, + 0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35, + 0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23, + 0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0, + 0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1, + 0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc, + 0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0, + 0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f, + 0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2, + 0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88, + 0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce, + 0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93, + 0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc, + 0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9, + 0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54, + 0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa, + 0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73, + 0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e, + 0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff, + 0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4, + 0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41, + 0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e, + 0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6, + 0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef, + 0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09, + 0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5, + 0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16, + 0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83, + 0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01 +}; + +/* Logs of 2 in the RAID-Z Galois field. */ +const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = { + 0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6, + 0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b, + 0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81, + 0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71, + 0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21, + 0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45, + 0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9, + 0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6, + 0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd, + 0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88, + 0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd, + 0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40, + 0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e, + 0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d, + 0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b, + 0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57, + 0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d, + 0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18, + 0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c, + 0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e, + 0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd, + 0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61, + 0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e, + 0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2, + 0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76, + 0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6, + 0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa, + 0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a, + 0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51, + 0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7, + 0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8, + 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, +}; diff --git a/module/zfs/vdev_raidz_math_sse2.c b/module/zfs/vdev_raidz_math_sse2.c new file mode 100644 index 000000000000..56a0b123d952 --- /dev/null +++ 
b/module/zfs/vdev_raidz_math_sse2.c
@@ -0,0 +1,631 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
+ */
+
+#include <sys/isa_defs.h>
+
+#if defined(__x86_64) && defined(HAVE_SSE2)
+
+#include <sys/types.h>
+#include <sys/simd.h>
+#include <sys/debug.h>
+
+#ifdef __linux__
+#define __asm __asm__ __volatile__
+#endif
+
+#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
+#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
+
+#define VR0_(REG, ...) "xmm"#REG
+#define VR1_(_1, REG, ...) "xmm"#REG
+#define VR2_(_1, _2, REG, ...) "xmm"#REG
+#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG
+#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG
+#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG
+#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG
+#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG
+
+#define VR0(r...) VR0_(r, 1, 2, 3, 4, 5, 6)
+#define VR1(r...) VR1_(r, 1, 2, 3, 4, 5, 6)
+#define VR2(r...) VR2_(r, 1, 2, 3, 4, 5, 6)
+#define VR3(r...) VR3_(r, 1, 2, 3, 4, 5, 6)
+#define VR4(r...) VR4_(r, 1, 2, 3, 4, 5, 6)
+#define VR5(r...) VR5_(r, 1, 2, 3, 4, 5, 6)
+#define VR6(r...) VR6_(r, 1, 2, 3, 4, 5, 6)
+#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5, 6)
+
+#define ELEM_SIZE 16
+
+typedef struct v {
+	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
+} v_t;
+
+#define XOR_ACC(src, r...) \
+{ \
+	switch (REG_CNT(r)) { \
+	case 4: \
+		__asm( \
+		    "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+		    "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+		    "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \
+		    "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \
+		    : : [SRC] "r" (src)); \
+		break; \
+	case 2: \
+		__asm( \
+		    "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+		    "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \
+		    : : [SRC] "r" (src)); \
+		break; \
+	case 1: \
+		__asm("pxor 0x00(%[SRC]), %%" VR0(r) "\n" \
+		    : : [SRC] "r" (src)); \
+		break; \
+	} \
+}
+
+#define XOR(r...) \
+{ \
+	switch (REG_CNT(r)) { \
+	case 8: \
+		__asm( \
+		    "pxor %" VR0(r) ", %" VR4(r) "\n" \
+		    "pxor %" VR1(r) ", %" VR5(r) "\n" \
+		    "pxor %" VR2(r) ", %" VR6(r) "\n" \
+		    "pxor %" VR3(r) ", %" VR7(r)); \
+		break; \
+	case 4: \
+		__asm( \
+		    "pxor %" VR0(r) ", %" VR2(r) "\n" \
+		    "pxor %" VR1(r) ", %" VR3(r)); \
+		break; \
+	case 2: \
+		__asm( \
+		    "pxor %" VR0(r) ", %" VR1(r)); \
+		break; \
+	} \
+}
+
+#define ZERO(r...) XOR(r, r)
+
+#define COPY(r...)
\ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR4(r) "\n" \ + "movdqa %" VR1(r) ", %" VR5(r) "\n" \ + "movdqa %" VR2(r) ", %" VR6(r) "\n" \ + "movdqa %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR2(r) "\n" \ + "movdqa %" VR1(r) ", %" VR3(r)); \ + break; \ + case 2: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR1(r)); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \ + "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 1: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + } \ +} + +#define STORE(dst, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + "movdqa %%" VR2(r)", 0x20(%[DST])\n" \ + "movdqa %%" VR3(r)", 0x30(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 1: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "movd %[mask], %%xmm15\n" \ + "pshufd $0x0, %%xmm15, %%xmm15\n" \ + : : [mask] "r" (0x1d1d1d1d)); \ +} + +#define _MUL2_x1(a0) \ +{ \ + __asm( \ + "pxor %xmm14, %xmm14\n" \ + "pcmpgtb %" a0", %xmm14\n" \ + "pand %xmm15, %xmm14\n" \ + "paddb %" a0", %" a0 "\n" \ + "pxor %xmm14, %" a0); \ +} + +#define _MUL2_x2(a0, a1) \ +{ \ + __asm( \ + "pxor %xmm14, %xmm14\n" \ + "pxor %xmm13, %xmm13\n" \ + "pcmpgtb %" a0", %xmm14\n" \ + "pcmpgtb %" a1", %xmm13\n" \ + "pand %xmm15, %xmm14\n" \ + "pand %xmm15, %xmm13\n" \ + "paddb %" a0", %" a0 "\n" \ + "paddb %" a1", %" a1 "\n" \ + "pxor %xmm14, %" a0 "\n" \ + "pxor %xmm13, %" a1); \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(VR0(r), VR1(r)); \ + _MUL2_x2(VR2(r), VR3(r)); \ + break; \ + case 2: \ + _MUL2_x2(VR0(r), VR1(r)); \ + break; \ + case 1: \ + _MUL2_x1(VR0(r)); \ + break; \ + } \ +} + +#define MUL4(r...) 
\ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +/* General multiplication by adding powers of two */ + +#define _MUL_PARAM(x, in, acc) \ +{ \ + if (x & 0x01) { COPY(in, acc); } else { ZERO(acc); } \ + if (x & 0xfe) { MUL2(in); } \ + if (x & 0x02) { XOR(in, acc); } \ + if (x & 0xfc) { MUL2(in); } \ + if (x & 0x04) { XOR(in, acc); } \ + if (x & 0xf8) { MUL2(in); } \ + if (x & 0x08) { XOR(in, acc); } \ + if (x & 0xf0) { MUL2(in); } \ + if (x & 0x10) { XOR(in, acc); } \ + if (x & 0xe0) { MUL2(in); } \ + if (x & 0x20) { XOR(in, acc); } \ + if (x & 0xc0) { MUL2(in); } \ + if (x & 0x40) { XOR(in, acc); } \ + if (x & 0x80) { MUL2(in); XOR(in, acc); } \ +} + +#define _mul_x1_in 11 +#define _mul_x1_acc 12 + +#define MUL_x1_DEFINE(x) \ +static void \ +mul_x1_ ## x(void) { _MUL_PARAM(x, _mul_x1_in, _mul_x1_acc); } + +#define _mul_x2_in 9, 10 +#define _mul_x2_acc 11, 12 + +#define MUL_x2_DEFINE(x) \ +static void \ +mul_x2_ ## x(void) { _MUL_PARAM(x, _mul_x2_in, _mul_x2_acc); } + +MUL_x1_DEFINE(0); MUL_x1_DEFINE(1); MUL_x1_DEFINE(2); MUL_x1_DEFINE(3); +MUL_x1_DEFINE(4); MUL_x1_DEFINE(5); MUL_x1_DEFINE(6); MUL_x1_DEFINE(7); +MUL_x1_DEFINE(8); MUL_x1_DEFINE(9); MUL_x1_DEFINE(10); MUL_x1_DEFINE(11); +MUL_x1_DEFINE(12); MUL_x1_DEFINE(13); MUL_x1_DEFINE(14); MUL_x1_DEFINE(15); +MUL_x1_DEFINE(16); MUL_x1_DEFINE(17); MUL_x1_DEFINE(18); MUL_x1_DEFINE(19); +MUL_x1_DEFINE(20); MUL_x1_DEFINE(21); MUL_x1_DEFINE(22); MUL_x1_DEFINE(23); +MUL_x1_DEFINE(24); MUL_x1_DEFINE(25); MUL_x1_DEFINE(26); MUL_x1_DEFINE(27); +MUL_x1_DEFINE(28); MUL_x1_DEFINE(29); MUL_x1_DEFINE(30); MUL_x1_DEFINE(31); +MUL_x1_DEFINE(32); MUL_x1_DEFINE(33); MUL_x1_DEFINE(34); MUL_x1_DEFINE(35); +MUL_x1_DEFINE(36); MUL_x1_DEFINE(37); MUL_x1_DEFINE(38); MUL_x1_DEFINE(39); +MUL_x1_DEFINE(40); MUL_x1_DEFINE(41); MUL_x1_DEFINE(42); MUL_x1_DEFINE(43); +MUL_x1_DEFINE(44); MUL_x1_DEFINE(45); MUL_x1_DEFINE(46); MUL_x1_DEFINE(47); +MUL_x1_DEFINE(48); MUL_x1_DEFINE(49); MUL_x1_DEFINE(50); MUL_x1_DEFINE(51); +MUL_x1_DEFINE(52); MUL_x1_DEFINE(53); MUL_x1_DEFINE(54); MUL_x1_DEFINE(55); +MUL_x1_DEFINE(56); MUL_x1_DEFINE(57); MUL_x1_DEFINE(58); MUL_x1_DEFINE(59); +MUL_x1_DEFINE(60); MUL_x1_DEFINE(61); MUL_x1_DEFINE(62); MUL_x1_DEFINE(63); +MUL_x1_DEFINE(64); MUL_x1_DEFINE(65); MUL_x1_DEFINE(66); MUL_x1_DEFINE(67); +MUL_x1_DEFINE(68); MUL_x1_DEFINE(69); MUL_x1_DEFINE(70); MUL_x1_DEFINE(71); +MUL_x1_DEFINE(72); MUL_x1_DEFINE(73); MUL_x1_DEFINE(74); MUL_x1_DEFINE(75); +MUL_x1_DEFINE(76); MUL_x1_DEFINE(77); MUL_x1_DEFINE(78); MUL_x1_DEFINE(79); +MUL_x1_DEFINE(80); MUL_x1_DEFINE(81); MUL_x1_DEFINE(82); MUL_x1_DEFINE(83); +MUL_x1_DEFINE(84); MUL_x1_DEFINE(85); MUL_x1_DEFINE(86); MUL_x1_DEFINE(87); +MUL_x1_DEFINE(88); MUL_x1_DEFINE(89); MUL_x1_DEFINE(90); MUL_x1_DEFINE(91); +MUL_x1_DEFINE(92); MUL_x1_DEFINE(93); MUL_x1_DEFINE(94); MUL_x1_DEFINE(95); +MUL_x1_DEFINE(96); MUL_x1_DEFINE(97); MUL_x1_DEFINE(98); MUL_x1_DEFINE(99); +MUL_x1_DEFINE(100); MUL_x1_DEFINE(101); MUL_x1_DEFINE(102); MUL_x1_DEFINE(103); +MUL_x1_DEFINE(104); MUL_x1_DEFINE(105); MUL_x1_DEFINE(106); MUL_x1_DEFINE(107); +MUL_x1_DEFINE(108); MUL_x1_DEFINE(109); MUL_x1_DEFINE(110); MUL_x1_DEFINE(111); +MUL_x1_DEFINE(112); MUL_x1_DEFINE(113); MUL_x1_DEFINE(114); MUL_x1_DEFINE(115); +MUL_x1_DEFINE(116); MUL_x1_DEFINE(117); MUL_x1_DEFINE(118); MUL_x1_DEFINE(119); +MUL_x1_DEFINE(120); MUL_x1_DEFINE(121); MUL_x1_DEFINE(122); MUL_x1_DEFINE(123); +MUL_x1_DEFINE(124); MUL_x1_DEFINE(125); MUL_x1_DEFINE(126); MUL_x1_DEFINE(127); +MUL_x1_DEFINE(128); MUL_x1_DEFINE(129); MUL_x1_DEFINE(130); MUL_x1_DEFINE(131); 
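/*
 * Illustrative scalar sketch of the technique that _MUL_PARAM() above
 * vectorizes (the names here are hypothetical, not part of the vendored
 * code). RAID-Z works in GF(2^8) with the reduction constant 0x1d, the
 * same value broadcast as 0x1d1d1d1d by MUL2_SETUP(), so a multiply by
 * an arbitrary constant c decomposes into repeated multiplication by
 * two plus a conditional xor-accumulate per bit of c.
 */
static inline uint8_t
gf_mul2_sketch(uint8_t a)
{
	/* Shift left; fold the carried-out high bit back in via 0x1d. */
	return ((uint8_t)(a << 1) ^ ((a & 0x80) ? 0x1d : 0x00));
}

static inline uint8_t
gf_mul_sketch(uint8_t a, uint8_t c)
{
	uint8_t acc = 0;

	/* Accumulate a * 2^i for every set bit i of the constant c. */
	for (; c != 0; c >>= 1) {
		if (c & 1)
			acc ^= a;
		a = gf_mul2_sketch(a);
	}
	return (acc);
}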
+MUL_x1_DEFINE(132); MUL_x1_DEFINE(133); MUL_x1_DEFINE(134); MUL_x1_DEFINE(135); +MUL_x1_DEFINE(136); MUL_x1_DEFINE(137); MUL_x1_DEFINE(138); MUL_x1_DEFINE(139); +MUL_x1_DEFINE(140); MUL_x1_DEFINE(141); MUL_x1_DEFINE(142); MUL_x1_DEFINE(143); +MUL_x1_DEFINE(144); MUL_x1_DEFINE(145); MUL_x1_DEFINE(146); MUL_x1_DEFINE(147); +MUL_x1_DEFINE(148); MUL_x1_DEFINE(149); MUL_x1_DEFINE(150); MUL_x1_DEFINE(151); +MUL_x1_DEFINE(152); MUL_x1_DEFINE(153); MUL_x1_DEFINE(154); MUL_x1_DEFINE(155); +MUL_x1_DEFINE(156); MUL_x1_DEFINE(157); MUL_x1_DEFINE(158); MUL_x1_DEFINE(159); +MUL_x1_DEFINE(160); MUL_x1_DEFINE(161); MUL_x1_DEFINE(162); MUL_x1_DEFINE(163); +MUL_x1_DEFINE(164); MUL_x1_DEFINE(165); MUL_x1_DEFINE(166); MUL_x1_DEFINE(167); +MUL_x1_DEFINE(168); MUL_x1_DEFINE(169); MUL_x1_DEFINE(170); MUL_x1_DEFINE(171); +MUL_x1_DEFINE(172); MUL_x1_DEFINE(173); MUL_x1_DEFINE(174); MUL_x1_DEFINE(175); +MUL_x1_DEFINE(176); MUL_x1_DEFINE(177); MUL_x1_DEFINE(178); MUL_x1_DEFINE(179); +MUL_x1_DEFINE(180); MUL_x1_DEFINE(181); MUL_x1_DEFINE(182); MUL_x1_DEFINE(183); +MUL_x1_DEFINE(184); MUL_x1_DEFINE(185); MUL_x1_DEFINE(186); MUL_x1_DEFINE(187); +MUL_x1_DEFINE(188); MUL_x1_DEFINE(189); MUL_x1_DEFINE(190); MUL_x1_DEFINE(191); +MUL_x1_DEFINE(192); MUL_x1_DEFINE(193); MUL_x1_DEFINE(194); MUL_x1_DEFINE(195); +MUL_x1_DEFINE(196); MUL_x1_DEFINE(197); MUL_x1_DEFINE(198); MUL_x1_DEFINE(199); +MUL_x1_DEFINE(200); MUL_x1_DEFINE(201); MUL_x1_DEFINE(202); MUL_x1_DEFINE(203); +MUL_x1_DEFINE(204); MUL_x1_DEFINE(205); MUL_x1_DEFINE(206); MUL_x1_DEFINE(207); +MUL_x1_DEFINE(208); MUL_x1_DEFINE(209); MUL_x1_DEFINE(210); MUL_x1_DEFINE(211); +MUL_x1_DEFINE(212); MUL_x1_DEFINE(213); MUL_x1_DEFINE(214); MUL_x1_DEFINE(215); +MUL_x1_DEFINE(216); MUL_x1_DEFINE(217); MUL_x1_DEFINE(218); MUL_x1_DEFINE(219); +MUL_x1_DEFINE(220); MUL_x1_DEFINE(221); MUL_x1_DEFINE(222); MUL_x1_DEFINE(223); +MUL_x1_DEFINE(224); MUL_x1_DEFINE(225); MUL_x1_DEFINE(226); MUL_x1_DEFINE(227); +MUL_x1_DEFINE(228); MUL_x1_DEFINE(229); MUL_x1_DEFINE(230); MUL_x1_DEFINE(231); +MUL_x1_DEFINE(232); MUL_x1_DEFINE(233); MUL_x1_DEFINE(234); MUL_x1_DEFINE(235); +MUL_x1_DEFINE(236); MUL_x1_DEFINE(237); MUL_x1_DEFINE(238); MUL_x1_DEFINE(239); +MUL_x1_DEFINE(240); MUL_x1_DEFINE(241); MUL_x1_DEFINE(242); MUL_x1_DEFINE(243); +MUL_x1_DEFINE(244); MUL_x1_DEFINE(245); MUL_x1_DEFINE(246); MUL_x1_DEFINE(247); +MUL_x1_DEFINE(248); MUL_x1_DEFINE(249); MUL_x1_DEFINE(250); MUL_x1_DEFINE(251); +MUL_x1_DEFINE(252); MUL_x1_DEFINE(253); MUL_x1_DEFINE(254); MUL_x1_DEFINE(255); + +MUL_x2_DEFINE(0); MUL_x2_DEFINE(1); MUL_x2_DEFINE(2); MUL_x2_DEFINE(3); +MUL_x2_DEFINE(4); MUL_x2_DEFINE(5); MUL_x2_DEFINE(6); MUL_x2_DEFINE(7); +MUL_x2_DEFINE(8); MUL_x2_DEFINE(9); MUL_x2_DEFINE(10); MUL_x2_DEFINE(11); +MUL_x2_DEFINE(12); MUL_x2_DEFINE(13); MUL_x2_DEFINE(14); MUL_x2_DEFINE(15); +MUL_x2_DEFINE(16); MUL_x2_DEFINE(17); MUL_x2_DEFINE(18); MUL_x2_DEFINE(19); +MUL_x2_DEFINE(20); MUL_x2_DEFINE(21); MUL_x2_DEFINE(22); MUL_x2_DEFINE(23); +MUL_x2_DEFINE(24); MUL_x2_DEFINE(25); MUL_x2_DEFINE(26); MUL_x2_DEFINE(27); +MUL_x2_DEFINE(28); MUL_x2_DEFINE(29); MUL_x2_DEFINE(30); MUL_x2_DEFINE(31); +MUL_x2_DEFINE(32); MUL_x2_DEFINE(33); MUL_x2_DEFINE(34); MUL_x2_DEFINE(35); +MUL_x2_DEFINE(36); MUL_x2_DEFINE(37); MUL_x2_DEFINE(38); MUL_x2_DEFINE(39); +MUL_x2_DEFINE(40); MUL_x2_DEFINE(41); MUL_x2_DEFINE(42); MUL_x2_DEFINE(43); +MUL_x2_DEFINE(44); MUL_x2_DEFINE(45); MUL_x2_DEFINE(46); MUL_x2_DEFINE(47); +MUL_x2_DEFINE(48); MUL_x2_DEFINE(49); MUL_x2_DEFINE(50); MUL_x2_DEFINE(51); +MUL_x2_DEFINE(52); MUL_x2_DEFINE(53); 
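/*
 * Worked example of the expansion above: for the constant c = 5
 * (binary 101), _MUL_PARAM(5, in, acc) emits exactly
 *
 *	COPY(in, acc);		bit 0 set:	acc = a
 *	MUL2(in);				in  = 2a
 *	MUL2(in);				in  = 4a
 *	XOR(in, acc);		bit 2 set:	acc = a ^ 4a = 5a
 *
 * so every mul_x1_N()/mul_x2_N() generated here is a branch-free,
 * fully unrolled multiply by its constant, selected at run time
 * through the gf_x1_mul_fns[]/gf_x2_mul_fns[] tables defined below.
 */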
MUL_x2_DEFINE(54); MUL_x2_DEFINE(55); +MUL_x2_DEFINE(56); MUL_x2_DEFINE(57); MUL_x2_DEFINE(58); MUL_x2_DEFINE(59); +MUL_x2_DEFINE(60); MUL_x2_DEFINE(61); MUL_x2_DEFINE(62); MUL_x2_DEFINE(63); +MUL_x2_DEFINE(64); MUL_x2_DEFINE(65); MUL_x2_DEFINE(66); MUL_x2_DEFINE(67); +MUL_x2_DEFINE(68); MUL_x2_DEFINE(69); MUL_x2_DEFINE(70); MUL_x2_DEFINE(71); +MUL_x2_DEFINE(72); MUL_x2_DEFINE(73); MUL_x2_DEFINE(74); MUL_x2_DEFINE(75); +MUL_x2_DEFINE(76); MUL_x2_DEFINE(77); MUL_x2_DEFINE(78); MUL_x2_DEFINE(79); +MUL_x2_DEFINE(80); MUL_x2_DEFINE(81); MUL_x2_DEFINE(82); MUL_x2_DEFINE(83); +MUL_x2_DEFINE(84); MUL_x2_DEFINE(85); MUL_x2_DEFINE(86); MUL_x2_DEFINE(87); +MUL_x2_DEFINE(88); MUL_x2_DEFINE(89); MUL_x2_DEFINE(90); MUL_x2_DEFINE(91); +MUL_x2_DEFINE(92); MUL_x2_DEFINE(93); MUL_x2_DEFINE(94); MUL_x2_DEFINE(95); +MUL_x2_DEFINE(96); MUL_x2_DEFINE(97); MUL_x2_DEFINE(98); MUL_x2_DEFINE(99); +MUL_x2_DEFINE(100); MUL_x2_DEFINE(101); MUL_x2_DEFINE(102); MUL_x2_DEFINE(103); +MUL_x2_DEFINE(104); MUL_x2_DEFINE(105); MUL_x2_DEFINE(106); MUL_x2_DEFINE(107); +MUL_x2_DEFINE(108); MUL_x2_DEFINE(109); MUL_x2_DEFINE(110); MUL_x2_DEFINE(111); +MUL_x2_DEFINE(112); MUL_x2_DEFINE(113); MUL_x2_DEFINE(114); MUL_x2_DEFINE(115); +MUL_x2_DEFINE(116); MUL_x2_DEFINE(117); MUL_x2_DEFINE(118); MUL_x2_DEFINE(119); +MUL_x2_DEFINE(120); MUL_x2_DEFINE(121); MUL_x2_DEFINE(122); MUL_x2_DEFINE(123); +MUL_x2_DEFINE(124); MUL_x2_DEFINE(125); MUL_x2_DEFINE(126); MUL_x2_DEFINE(127); +MUL_x2_DEFINE(128); MUL_x2_DEFINE(129); MUL_x2_DEFINE(130); MUL_x2_DEFINE(131); +MUL_x2_DEFINE(132); MUL_x2_DEFINE(133); MUL_x2_DEFINE(134); MUL_x2_DEFINE(135); +MUL_x2_DEFINE(136); MUL_x2_DEFINE(137); MUL_x2_DEFINE(138); MUL_x2_DEFINE(139); +MUL_x2_DEFINE(140); MUL_x2_DEFINE(141); MUL_x2_DEFINE(142); MUL_x2_DEFINE(143); +MUL_x2_DEFINE(144); MUL_x2_DEFINE(145); MUL_x2_DEFINE(146); MUL_x2_DEFINE(147); +MUL_x2_DEFINE(148); MUL_x2_DEFINE(149); MUL_x2_DEFINE(150); MUL_x2_DEFINE(151); +MUL_x2_DEFINE(152); MUL_x2_DEFINE(153); MUL_x2_DEFINE(154); MUL_x2_DEFINE(155); +MUL_x2_DEFINE(156); MUL_x2_DEFINE(157); MUL_x2_DEFINE(158); MUL_x2_DEFINE(159); +MUL_x2_DEFINE(160); MUL_x2_DEFINE(161); MUL_x2_DEFINE(162); MUL_x2_DEFINE(163); +MUL_x2_DEFINE(164); MUL_x2_DEFINE(165); MUL_x2_DEFINE(166); MUL_x2_DEFINE(167); +MUL_x2_DEFINE(168); MUL_x2_DEFINE(169); MUL_x2_DEFINE(170); MUL_x2_DEFINE(171); +MUL_x2_DEFINE(172); MUL_x2_DEFINE(173); MUL_x2_DEFINE(174); MUL_x2_DEFINE(175); +MUL_x2_DEFINE(176); MUL_x2_DEFINE(177); MUL_x2_DEFINE(178); MUL_x2_DEFINE(179); +MUL_x2_DEFINE(180); MUL_x2_DEFINE(181); MUL_x2_DEFINE(182); MUL_x2_DEFINE(183); +MUL_x2_DEFINE(184); MUL_x2_DEFINE(185); MUL_x2_DEFINE(186); MUL_x2_DEFINE(187); +MUL_x2_DEFINE(188); MUL_x2_DEFINE(189); MUL_x2_DEFINE(190); MUL_x2_DEFINE(191); +MUL_x2_DEFINE(192); MUL_x2_DEFINE(193); MUL_x2_DEFINE(194); MUL_x2_DEFINE(195); +MUL_x2_DEFINE(196); MUL_x2_DEFINE(197); MUL_x2_DEFINE(198); MUL_x2_DEFINE(199); +MUL_x2_DEFINE(200); MUL_x2_DEFINE(201); MUL_x2_DEFINE(202); MUL_x2_DEFINE(203); +MUL_x2_DEFINE(204); MUL_x2_DEFINE(205); MUL_x2_DEFINE(206); MUL_x2_DEFINE(207); +MUL_x2_DEFINE(208); MUL_x2_DEFINE(209); MUL_x2_DEFINE(210); MUL_x2_DEFINE(211); +MUL_x2_DEFINE(212); MUL_x2_DEFINE(213); MUL_x2_DEFINE(214); MUL_x2_DEFINE(215); +MUL_x2_DEFINE(216); MUL_x2_DEFINE(217); MUL_x2_DEFINE(218); MUL_x2_DEFINE(219); +MUL_x2_DEFINE(220); MUL_x2_DEFINE(221); MUL_x2_DEFINE(222); MUL_x2_DEFINE(223); +MUL_x2_DEFINE(224); MUL_x2_DEFINE(225); MUL_x2_DEFINE(226); MUL_x2_DEFINE(227); +MUL_x2_DEFINE(228); MUL_x2_DEFINE(229); MUL_x2_DEFINE(230); 
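/*
 * For contrast, a minimal sketch of the classic log/exp-table multiply,
 * assuming the vdev_raidz_pow2[] and vdev_raidz_log2[] tables shown
 * earlier in this diff are in scope: logarithms of nonzero elements add
 * modulo 255, the order of the multiplicative group of GF(2^8).
 */
extern const uint8_t vdev_raidz_pow2[256];
extern const uint8_t vdev_raidz_log2[256];

static inline uint8_t
gf_mul_tables_sketch(uint8_t a, uint8_t b)
{
	int e;

	if (a == 0 || b == 0)
		return (0);
	/* log2(a) + log2(b) is at most 254 + 254; one subtraction reduces. */
	e = vdev_raidz_log2[a] + vdev_raidz_log2[b];
	if (e > 254)
		e -= 255;
	return (vdev_raidz_pow2[e]);
}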
MUL_x2_DEFINE(231); +MUL_x2_DEFINE(232); MUL_x2_DEFINE(233); MUL_x2_DEFINE(234); MUL_x2_DEFINE(235); +MUL_x2_DEFINE(236); MUL_x2_DEFINE(237); MUL_x2_DEFINE(238); MUL_x2_DEFINE(239); +MUL_x2_DEFINE(240); MUL_x2_DEFINE(241); MUL_x2_DEFINE(242); MUL_x2_DEFINE(243); +MUL_x2_DEFINE(244); MUL_x2_DEFINE(245); MUL_x2_DEFINE(246); MUL_x2_DEFINE(247); +MUL_x2_DEFINE(248); MUL_x2_DEFINE(249); MUL_x2_DEFINE(250); MUL_x2_DEFINE(251); +MUL_x2_DEFINE(252); MUL_x2_DEFINE(253); MUL_x2_DEFINE(254); MUL_x2_DEFINE(255); + + + +typedef void (*mul_fn_ptr_t)(void); + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x1_mul_fns[256] = { + mul_x1_0, mul_x1_1, mul_x1_2, mul_x1_3, mul_x1_4, mul_x1_5, + mul_x1_6, mul_x1_7, mul_x1_8, mul_x1_9, mul_x1_10, mul_x1_11, + mul_x1_12, mul_x1_13, mul_x1_14, mul_x1_15, mul_x1_16, mul_x1_17, + mul_x1_18, mul_x1_19, mul_x1_20, mul_x1_21, mul_x1_22, mul_x1_23, + mul_x1_24, mul_x1_25, mul_x1_26, mul_x1_27, mul_x1_28, mul_x1_29, + mul_x1_30, mul_x1_31, mul_x1_32, mul_x1_33, mul_x1_34, mul_x1_35, + mul_x1_36, mul_x1_37, mul_x1_38, mul_x1_39, mul_x1_40, mul_x1_41, + mul_x1_42, mul_x1_43, mul_x1_44, mul_x1_45, mul_x1_46, mul_x1_47, + mul_x1_48, mul_x1_49, mul_x1_50, mul_x1_51, mul_x1_52, mul_x1_53, + mul_x1_54, mul_x1_55, mul_x1_56, mul_x1_57, mul_x1_58, mul_x1_59, + mul_x1_60, mul_x1_61, mul_x1_62, mul_x1_63, mul_x1_64, mul_x1_65, + mul_x1_66, mul_x1_67, mul_x1_68, mul_x1_69, mul_x1_70, mul_x1_71, + mul_x1_72, mul_x1_73, mul_x1_74, mul_x1_75, mul_x1_76, mul_x1_77, + mul_x1_78, mul_x1_79, mul_x1_80, mul_x1_81, mul_x1_82, mul_x1_83, + mul_x1_84, mul_x1_85, mul_x1_86, mul_x1_87, mul_x1_88, mul_x1_89, + mul_x1_90, mul_x1_91, mul_x1_92, mul_x1_93, mul_x1_94, mul_x1_95, + mul_x1_96, mul_x1_97, mul_x1_98, mul_x1_99, mul_x1_100, mul_x1_101, + mul_x1_102, mul_x1_103, mul_x1_104, mul_x1_105, mul_x1_106, mul_x1_107, + mul_x1_108, mul_x1_109, mul_x1_110, mul_x1_111, mul_x1_112, mul_x1_113, + mul_x1_114, mul_x1_115, mul_x1_116, mul_x1_117, mul_x1_118, mul_x1_119, + mul_x1_120, mul_x1_121, mul_x1_122, mul_x1_123, mul_x1_124, mul_x1_125, + mul_x1_126, mul_x1_127, mul_x1_128, mul_x1_129, mul_x1_130, mul_x1_131, + mul_x1_132, mul_x1_133, mul_x1_134, mul_x1_135, mul_x1_136, mul_x1_137, + mul_x1_138, mul_x1_139, mul_x1_140, mul_x1_141, mul_x1_142, mul_x1_143, + mul_x1_144, mul_x1_145, mul_x1_146, mul_x1_147, mul_x1_148, mul_x1_149, + mul_x1_150, mul_x1_151, mul_x1_152, mul_x1_153, mul_x1_154, mul_x1_155, + mul_x1_156, mul_x1_157, mul_x1_158, mul_x1_159, mul_x1_160, mul_x1_161, + mul_x1_162, mul_x1_163, mul_x1_164, mul_x1_165, mul_x1_166, mul_x1_167, + mul_x1_168, mul_x1_169, mul_x1_170, mul_x1_171, mul_x1_172, mul_x1_173, + mul_x1_174, mul_x1_175, mul_x1_176, mul_x1_177, mul_x1_178, mul_x1_179, + mul_x1_180, mul_x1_181, mul_x1_182, mul_x1_183, mul_x1_184, mul_x1_185, + mul_x1_186, mul_x1_187, mul_x1_188, mul_x1_189, mul_x1_190, mul_x1_191, + mul_x1_192, mul_x1_193, mul_x1_194, mul_x1_195, mul_x1_196, mul_x1_197, + mul_x1_198, mul_x1_199, mul_x1_200, mul_x1_201, mul_x1_202, mul_x1_203, + mul_x1_204, mul_x1_205, mul_x1_206, mul_x1_207, mul_x1_208, mul_x1_209, + mul_x1_210, mul_x1_211, mul_x1_212, mul_x1_213, mul_x1_214, mul_x1_215, + mul_x1_216, mul_x1_217, mul_x1_218, mul_x1_219, mul_x1_220, mul_x1_221, + mul_x1_222, mul_x1_223, mul_x1_224, mul_x1_225, mul_x1_226, mul_x1_227, + mul_x1_228, mul_x1_229, mul_x1_230, mul_x1_231, mul_x1_232, mul_x1_233, + mul_x1_234, mul_x1_235, mul_x1_236, mul_x1_237, mul_x1_238, mul_x1_239, + mul_x1_240, mul_x1_241, mul_x1_242, mul_x1_243, mul_x1_244, 
mul_x1_245, + mul_x1_246, mul_x1_247, mul_x1_248, mul_x1_249, mul_x1_250, mul_x1_251, + mul_x1_252, mul_x1_253, mul_x1_254, mul_x1_255 +}; + +static const mul_fn_ptr_t __attribute__((aligned(256))) +gf_x2_mul_fns[256] = { + mul_x2_0, mul_x2_1, mul_x2_2, mul_x2_3, mul_x2_4, mul_x2_5, + mul_x2_6, mul_x2_7, mul_x2_8, mul_x2_9, mul_x2_10, mul_x2_11, + mul_x2_12, mul_x2_13, mul_x2_14, mul_x2_15, mul_x2_16, mul_x2_17, + mul_x2_18, mul_x2_19, mul_x2_20, mul_x2_21, mul_x2_22, mul_x2_23, + mul_x2_24, mul_x2_25, mul_x2_26, mul_x2_27, mul_x2_28, mul_x2_29, + mul_x2_30, mul_x2_31, mul_x2_32, mul_x2_33, mul_x2_34, mul_x2_35, + mul_x2_36, mul_x2_37, mul_x2_38, mul_x2_39, mul_x2_40, mul_x2_41, + mul_x2_42, mul_x2_43, mul_x2_44, mul_x2_45, mul_x2_46, mul_x2_47, + mul_x2_48, mul_x2_49, mul_x2_50, mul_x2_51, mul_x2_52, mul_x2_53, + mul_x2_54, mul_x2_55, mul_x2_56, mul_x2_57, mul_x2_58, mul_x2_59, + mul_x2_60, mul_x2_61, mul_x2_62, mul_x2_63, mul_x2_64, mul_x2_65, + mul_x2_66, mul_x2_67, mul_x2_68, mul_x2_69, mul_x2_70, mul_x2_71, + mul_x2_72, mul_x2_73, mul_x2_74, mul_x2_75, mul_x2_76, mul_x2_77, + mul_x2_78, mul_x2_79, mul_x2_80, mul_x2_81, mul_x2_82, mul_x2_83, + mul_x2_84, mul_x2_85, mul_x2_86, mul_x2_87, mul_x2_88, mul_x2_89, + mul_x2_90, mul_x2_91, mul_x2_92, mul_x2_93, mul_x2_94, mul_x2_95, + mul_x2_96, mul_x2_97, mul_x2_98, mul_x2_99, mul_x2_100, mul_x2_101, + mul_x2_102, mul_x2_103, mul_x2_104, mul_x2_105, mul_x2_106, mul_x2_107, + mul_x2_108, mul_x2_109, mul_x2_110, mul_x2_111, mul_x2_112, mul_x2_113, + mul_x2_114, mul_x2_115, mul_x2_116, mul_x2_117, mul_x2_118, mul_x2_119, + mul_x2_120, mul_x2_121, mul_x2_122, mul_x2_123, mul_x2_124, mul_x2_125, + mul_x2_126, mul_x2_127, mul_x2_128, mul_x2_129, mul_x2_130, mul_x2_131, + mul_x2_132, mul_x2_133, mul_x2_134, mul_x2_135, mul_x2_136, mul_x2_137, + mul_x2_138, mul_x2_139, mul_x2_140, mul_x2_141, mul_x2_142, mul_x2_143, + mul_x2_144, mul_x2_145, mul_x2_146, mul_x2_147, mul_x2_148, mul_x2_149, + mul_x2_150, mul_x2_151, mul_x2_152, mul_x2_153, mul_x2_154, mul_x2_155, + mul_x2_156, mul_x2_157, mul_x2_158, mul_x2_159, mul_x2_160, mul_x2_161, + mul_x2_162, mul_x2_163, mul_x2_164, mul_x2_165, mul_x2_166, mul_x2_167, + mul_x2_168, mul_x2_169, mul_x2_170, mul_x2_171, mul_x2_172, mul_x2_173, + mul_x2_174, mul_x2_175, mul_x2_176, mul_x2_177, mul_x2_178, mul_x2_179, + mul_x2_180, mul_x2_181, mul_x2_182, mul_x2_183, mul_x2_184, mul_x2_185, + mul_x2_186, mul_x2_187, mul_x2_188, mul_x2_189, mul_x2_190, mul_x2_191, + mul_x2_192, mul_x2_193, mul_x2_194, mul_x2_195, mul_x2_196, mul_x2_197, + mul_x2_198, mul_x2_199, mul_x2_200, mul_x2_201, mul_x2_202, mul_x2_203, + mul_x2_204, mul_x2_205, mul_x2_206, mul_x2_207, mul_x2_208, mul_x2_209, + mul_x2_210, mul_x2_211, mul_x2_212, mul_x2_213, mul_x2_214, mul_x2_215, + mul_x2_216, mul_x2_217, mul_x2_218, mul_x2_219, mul_x2_220, mul_x2_221, + mul_x2_222, mul_x2_223, mul_x2_224, mul_x2_225, mul_x2_226, mul_x2_227, + mul_x2_228, mul_x2_229, mul_x2_230, mul_x2_231, mul_x2_232, mul_x2_233, + mul_x2_234, mul_x2_235, mul_x2_236, mul_x2_237, mul_x2_238, mul_x2_239, + mul_x2_240, mul_x2_241, mul_x2_242, mul_x2_243, mul_x2_244, mul_x2_245, + mul_x2_246, mul_x2_247, mul_x2_248, mul_x2_249, mul_x2_250, mul_x2_251, + mul_x2_252, mul_x2_253, mul_x2_254, mul_x2_255 +}; + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + COPY(r, _mul_x2_in); \ + gf_x2_mul_fns[c](); \ + COPY(_mul_x2_acc, r); \ + break; \ + case 1: \ + COPY(r, _mul_x1_in); \ + gf_x1_mul_fns[c](); \ + COPY(_mul_x1_acc, r); \ + break; \ + default: \ + VERIFY(0); \ + } \ +} + + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 2 +#define MUL_DEFINE() MUL2_SETUP() +#define MUL_D 0, 1 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() MUL2_SETUP() +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() MUL2_SETUP() +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() MUL2_SETUP() +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 1 +#define REC_PQR_DEFINE() MUL2_SETUP() +#define REC_PQR_X 0 +#define REC_PQR_Y 1 +#define REC_PQR_Z 2 +#define REC_PQR_XS 3 +#define REC_PQR_YS 4 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(sse2); +DEFINE_REC_METHODS(sse2); + +static boolean_t +raidz_will_sse2_work(void) +{ + return (kfpu_allowed() && zfs_sse_available() && zfs_sse2_available()); +} + +const raidz_impl_ops_t vdev_raidz_sse2_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(sse2), + .rec = RAIDZ_REC_METHODS(sse2), + .is_supported = &raidz_will_sse2_work, + .name = "sse2" +}; + +#endif /* defined(__x86_64) && defined(HAVE_SSE2) */ diff --git a/module/zfs/vdev_raidz_math_ssse3.c b/module/zfs/vdev_raidz_math_ssse3.c new file mode 100644 index 000000000000..5ddc079a4f5d --- /dev/null +++ b/module/zfs/vdev_raidz_math_ssse3.c @@ -0,0 +1,2477 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (C) 2016 Gvozden Nešković. All rights reserved. + */ + +#include <sys/isa_defs.h> + +#if defined(__x86_64) && defined(HAVE_SSSE3) + +#include <sys/types.h> +#include <sys/simd.h> + +#ifdef __linux__ +#define __asm __asm__ __volatile__ +#endif + +#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N +#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) + +#define VR0_(REG, ...) "xmm"#REG +#define VR1_(_1, REG, ...) "xmm"#REG +#define VR2_(_1, _2, REG, ...) "xmm"#REG +#define VR3_(_1, _2, _3, REG, ...) "xmm"#REG +#define VR4_(_1, _2, _3, _4, REG, ...) "xmm"#REG +#define VR5_(_1, _2, _3, _4, _5, REG, ...) "xmm"#REG +#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "xmm"#REG +#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "xmm"#REG + +#define VR0(r...) VR0_(r) +#define VR1(r...) VR1_(r) +#define VR2(r...) VR2_(r, 1) +#define VR3(r...) VR3_(r, 1, 2) +#define VR4(r...) VR4_(r, 1, 2) +#define VR5(r...) VR5_(r, 1, 2, 3) +#define VR6(r...) VR6_(r, 1, 2, 3, 4) +#define VR7(r...) VR7_(r, 1, 2, 3, 4, 5) + +#define R_01(REG1, REG2, ...) REG1, REG2 +#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 +#define R_23(REG...) _R_23(REG, 1, 2, 3) + +#define ZFS_ASM_BUG() ASSERT(0) + +const uint8_t gf_clmul_mod_lt[4*256][16]; + +#define ELEM_SIZE 16 + +typedef struct v { + uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); +} v_t; + + +#define XOR_ACC(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ + "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \ + "pxor 0x20(%[SRC]), %%" VR2(r) "\n" \ + "pxor 0x30(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "pxor 0x00(%[SRC]), %%" VR0(r) "\n" \ + "pxor 0x10(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define XOR(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "pxor %" VR0(r) ", %" VR4(r) "\n" \ + "pxor %" VR1(r) ", %" VR5(r) "\n" \ + "pxor %" VR2(r) ", %" VR6(r) "\n" \ + "pxor %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "pxor %" VR0(r) ", %" VR2(r) "\n" \ + "pxor %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define ZERO(r...) XOR(r, r) + +#define COPY(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 8: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR4(r) "\n" \ + "movdqa %" VR1(r) ", %" VR5(r) "\n" \ + "movdqa %" VR2(r) ", %" VR6(r) "\n" \ + "movdqa %" VR3(r) ", %" VR7(r)); \ + break; \ + case 4: \ + __asm( \ + "movdqa %" VR0(r) ", %" VR2(r) "\n" \ + "movdqa %" VR1(r) ", %" VR3(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define LOAD(src, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + "movdqa 0x20(%[SRC]), %%" VR2(r) "\n" \ + "movdqa 0x30(%[SRC]), %%" VR3(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + case 2: \ + __asm( \ + "movdqa 0x00(%[SRC]), %%" VR0(r) "\n" \ + "movdqa 0x10(%[SRC]), %%" VR1(r) "\n" \ + : : [SRC] "r" (src)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define STORE(dst, r...)
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + "movdqa %%" VR2(r)", 0x20(%[DST])\n" \ + "movdqa %%" VR3(r)", 0x30(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + case 2: \ + __asm( \ + "movdqa %%" VR0(r)", 0x00(%[DST])\n" \ + "movdqa %%" VR1(r)", 0x10(%[DST])\n" \ + : : [DST] "r" (dst)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2_SETUP() \ +{ \ + __asm( \ + "movd %[mask], %%xmm15\n" \ + "pshufd $0x0, %%xmm15, %%xmm15\n" \ + : : [mask] "r" (0x1d1d1d1d)); \ +} + +#define _MUL2_x2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + "pxor %xmm14, %xmm14\n" \ + "pxor %xmm13, %xmm13\n" \ + "pcmpgtb %" VR0(r)", %xmm14\n" \ + "pcmpgtb %" VR1(r)", %xmm13\n" \ + "pand %xmm15, %xmm14\n" \ + "pand %xmm15, %xmm13\n" \ + "paddb %" VR0(r)", %" VR0(r) "\n" \ + "paddb %" VR1(r)", %" VR1(r) "\n" \ + "pxor %xmm14, %" VR0(r) "\n" \ + "pxor %xmm13, %" VR1(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL2(r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MUL2_x2(R_01(r)); \ + _MUL2_x2(R_23(r)); \ + break; \ + case 2: \ + _MUL2_x2(r); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL4(r...) \ +{ \ + MUL2(r); \ + MUL2(r); \ +} + +#define _0f "xmm15" +#define _a_save "xmm14" +#define _b_save "xmm13" +#define _lt_mod_a "xmm12" +#define _lt_clmul_a "xmm11" +#define _lt_mod_b "xmm10" +#define _lt_clmul_b "xmm15" + +#define _MULx2(c, r...) \ +{ \ + switch (REG_CNT(r)) { \ + case 2: \ + __asm( \ + /* lts for upper part */ \ + "movd %[mask], %%" _0f "\n" \ + "pshufd $0x0, %%" _0f ", %%" _0f "\n" \ + "movdqa 0x00(%[lt]), %%" _lt_mod_a "\n" \ + "movdqa 0x10(%[lt]), %%" _lt_clmul_a "\n" \ + /* upper part */ \ + "movdqa %%" VR0(r) ", %%" _a_save "\n" \ + "movdqa %%" VR1(r) ", %%" _b_save "\n" \ + "psraw $0x4, %%" VR0(r) "\n" \ + "psraw $0x4, %%" VR1(r) "\n" \ + "pand %%" _0f ", %%" _a_save "\n" \ + "pand %%" _0f ", %%" _b_save "\n" \ + "pand %%" _0f ", %%" VR0(r) "\n" \ + "pand %%" _0f ", %%" VR1(r) "\n" \ + \ + "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \ + "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \ + \ + "pshufb %%" VR0(r) ",%%" _lt_mod_a "\n" \ + "pshufb %%" VR1(r) ",%%" _lt_mod_b "\n" \ + "pshufb %%" VR0(r) ",%%" _lt_clmul_a "\n" \ + "pshufb %%" VR1(r) ",%%" _lt_clmul_b "\n" \ + \ + "pxor %%" _lt_mod_a ",%%" _lt_clmul_a "\n" \ + "pxor %%" _lt_mod_b ",%%" _lt_clmul_b "\n" \ + "movdqa %%" _lt_clmul_a ",%%" VR0(r) "\n" \ + "movdqa %%" _lt_clmul_b ",%%" VR1(r) "\n" \ + /* lts for lower part */ \ + "movdqa 0x20(%[lt]), %%" _lt_mod_a "\n" \ + "movdqa 0x30(%[lt]), %%" _lt_clmul_a "\n" \ + "movdqa %%" _lt_mod_a ", %%" _lt_mod_b "\n" \ + "movdqa %%" _lt_clmul_a ", %%" _lt_clmul_b "\n" \ + /* lower part */ \ + "pshufb %%" _a_save ",%%" _lt_mod_a "\n" \ + "pshufb %%" _b_save ",%%" _lt_mod_b "\n" \ + "pshufb %%" _a_save ",%%" _lt_clmul_a "\n" \ + "pshufb %%" _b_save ",%%" _lt_clmul_b "\n" \ + \ + "pxor %%" _lt_mod_a ",%%" VR0(r) "\n" \ + "pxor %%" _lt_mod_b ",%%" VR1(r) "\n" \ + "pxor %%" _lt_clmul_a ",%%" VR0(r) "\n" \ + "pxor %%" _lt_clmul_b ",%%" VR1(r) "\n" \ + : : [mask] "r" (0x0f0f0f0f), \ + [lt] "r" (gf_clmul_mod_lt[4*(c)])); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define MUL(c, r...) 
\ +{ \ + switch (REG_CNT(r)) { \ + case 4: \ + _MULx2(c, R_23(r)); \ + _MULx2(c, R_01(r)); \ + break; \ + case 2: \ + _MULx2(c, R_01(r)); \ + break; \ + default: \ + ZFS_ASM_BUG(); \ + } \ +} + +#define raidz_math_begin() kfpu_begin() +#define raidz_math_end() kfpu_end() + + +#define SYN_STRIDE 4 + +#define ZERO_STRIDE 4 +#define ZERO_DEFINE() {} +#define ZERO_D 0, 1, 2, 3 + +#define COPY_STRIDE 4 +#define COPY_DEFINE() {} +#define COPY_D 0, 1, 2, 3 + +#define ADD_STRIDE 4 +#define ADD_DEFINE() {} +#define ADD_D 0, 1, 2, 3 + +#define MUL_STRIDE 4 +#define MUL_DEFINE() {} +#define MUL_D 0, 1, 2, 3 + +#define GEN_P_STRIDE 4 +#define GEN_P_DEFINE() {} +#define GEN_P_P 0, 1, 2, 3 + +#define GEN_PQ_STRIDE 4 +#define GEN_PQ_DEFINE() {} +#define GEN_PQ_D 0, 1, 2, 3 +#define GEN_PQ_C 4, 5, 6, 7 + +#define GEN_PQR_STRIDE 4 +#define GEN_PQR_DEFINE() {} +#define GEN_PQR_D 0, 1, 2, 3 +#define GEN_PQR_C 4, 5, 6, 7 + +#define SYN_Q_DEFINE() {} +#define SYN_Q_D 0, 1, 2, 3 +#define SYN_Q_X 4, 5, 6, 7 + +#define SYN_R_DEFINE() {} +#define SYN_R_D 0, 1, 2, 3 +#define SYN_R_X 4, 5, 6, 7 + +#define SYN_PQ_DEFINE() {} +#define SYN_PQ_D 0, 1, 2, 3 +#define SYN_PQ_X 4, 5, 6, 7 + +#define REC_PQ_STRIDE 2 +#define REC_PQ_DEFINE() {} +#define REC_PQ_X 0, 1 +#define REC_PQ_Y 2, 3 +#define REC_PQ_T 4, 5 + +#define SYN_PR_DEFINE() {} +#define SYN_PR_D 0, 1, 2, 3 +#define SYN_PR_X 4, 5, 6, 7 + +#define REC_PR_STRIDE 2 +#define REC_PR_DEFINE() {} +#define REC_PR_X 0, 1 +#define REC_PR_Y 2, 3 +#define REC_PR_T 4, 5 + +#define SYN_QR_DEFINE() {} +#define SYN_QR_D 0, 1, 2, 3 +#define SYN_QR_X 4, 5, 6, 7 + +#define REC_QR_STRIDE 2 +#define REC_QR_DEFINE() {} +#define REC_QR_X 0, 1 +#define REC_QR_Y 2, 3 +#define REC_QR_T 4, 5 + +#define SYN_PQR_DEFINE() {} +#define SYN_PQR_D 0, 1, 2, 3 +#define SYN_PQR_X 4, 5, 6, 7 + +#define REC_PQR_STRIDE 2 +#define REC_PQR_DEFINE() {} +#define REC_PQR_X 0, 1 +#define REC_PQR_Y 2, 3 +#define REC_PQR_Z 4, 5 +#define REC_PQR_XS 6, 7 +#define REC_PQR_YS 8, 9 + + +#include <sys/vdev_raidz_impl.h> +#include "vdev_raidz_math_impl.h" + +DEFINE_GEN_METHODS(ssse3); +DEFINE_REC_METHODS(ssse3); + +static boolean_t +raidz_will_ssse3_work(void) +{ + return (kfpu_allowed() && zfs_sse_available() && + zfs_sse2_available() && zfs_ssse3_available()); +} + +const raidz_impl_ops_t vdev_raidz_ssse3_impl = { + .init = NULL, + .fini = NULL, + .gen = RAIDZ_GEN_METHODS(ssse3), + .rec = RAIDZ_REC_METHODS(ssse3), + .is_supported = &raidz_will_ssse3_work, + .name = "ssse3" +}; + +#endif /* defined(__x86_64) && defined(HAVE_SSSE3) */ + + +#if defined(__x86_64) +#if defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) +/* BEGIN CSTYLED */ +const uint8_t +__attribute__((aligned(256))) gf_clmul_mod_lt[4*256][16] = +{ + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, + 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e,
0x0f }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, + 0x10, 0x12, 0x14, 0x16, 0x18, 0x1a, 0x1c, 0x1e }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x03, 0x06, 0x05, 0x0c, 0x0f, 0x0a, 0x09, + 0x18, 0x1b, 0x1e, 0x1d, 0x14, 0x17, 0x12, 0x11 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x04, 0x08, 0x0c, 0x10, 0x14, 0x18, 0x1c, + 0x20, 0x24, 0x28, 0x2c, 0x30, 0x34, 0x38, 0x3c }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x05, 0x0a, 0x0f, 0x14, 0x11, 0x1e, 0x1b, + 0x28, 0x2d, 0x22, 0x27, 0x3c, 0x39, 0x36, 0x33 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x06, 0x0c, 0x0a, 0x18, 0x1e, 0x14, 0x12, + 0x30, 0x36, 0x3c, 0x3a, 0x28, 0x2e, 0x24, 0x22 }, + { 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d, + 0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x07, 0x0e, 0x09, 0x1c, 0x1b, 0x12, 0x15, + 0x38, 0x3f, 0x36, 0x31, 0x24, 0x23, 0x2a, 0x2d }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x08, 0x10, 0x18, 0x20, 0x28, 0x30, 0x38, + 0x40, 0x48, 0x50, 0x58, 0x60, 0x68, 0x70, 0x78 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x09, 0x12, 0x1b, 0x24, 0x2d, 0x36, 0x3f, + 0x48, 0x41, 0x5a, 0x53, 0x6c, 0x65, 0x7e, 0x77 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0a, 0x14, 0x1e, 0x28, 0x22, 0x3c, 0x36, + 0x50, 0x5a, 0x44, 0x4e, 0x78, 0x72, 0x6c, 0x66 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27, + 0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0b, 0x16, 0x1d, 0x2c, 0x27, 0x3a, 0x31, + 0x58, 0x53, 0x4e, 0x45, 0x74, 0x7f, 0x62, 0x69 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0c, 0x18, 0x14, 0x30, 0x3c, 0x28, 0x24, + 0x60, 0x6c, 0x78, 0x74, 0x50, 0x5c, 0x48, 0x44 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0d, 0x1a, 0x17, 0x34, 0x39, 0x2e, 0x23, + 0x68, 0x65, 0x72, 0x7f, 0x5c, 0x51, 0x46, 0x4b }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0e, 0x1c, 0x12, 0x38, 0x36, 0x24, 0x2a, + 0x70, 0x7e, 0x6c, 0x62, 0x48, 0x46, 0x54, 0x5a }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x0f, 0x1e, 0x11, 0x3c, 0x33, 0x22, 0x2d, + 0x78, 0x77, 0x66, 0x69, 0x44, 0x4b, 0x5a, 0x55 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x11, 0x22, 0x33, 0x44, 0x55, 0x66, 0x77, + 0x88, 0x99, 0xaa, 0xbb, 0xcc, 0xdd, 0xee, 0xff }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x12, 0x24, 0x36, 0x48, 0x5a, 0x6c, 0x7e, + 0x90, 0x82, 0xb4, 0xa6, 0xd8, 0xca, 0xfc, 0xee }, + { 0x00, 0x1d, 0x3a, 0x27, 
0x74, 0x69, 0x4e, 0x53, + 0xf5, 0xe8, 0xcf, 0xd2, 0x81, 0x9c, 0xbb, 0xa6 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x13, 0x26, 0x35, 0x4c, 0x5f, 0x6a, 0x79, + 0x98, 0x8b, 0xbe, 0xad, 0xd4, 0xc7, 0xf2, 0xe1 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x14, 0x28, 0x3c, 0x50, 0x44, 0x78, 0x6c, + 0xa0, 0xb4, 0x88, 0x9c, 0xf0, 0xe4, 0xd8, 0xcc }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xd2, 0xcf, 0xe8, 0xf5, 0xbb, 0xa6, 0x81, 0x9c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x15, 0x2a, 0x3f, 0x54, 0x41, 0x7e, 0x6b, + 0xa8, 0xbd, 0x82, 0x97, 0xfc, 0xe9, 0xd6, 0xc3 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x16, 0x2c, 0x3a, 0x58, 0x4e, 0x74, 0x62, + 0xb0, 0xa6, 0x9c, 0x8a, 0xe8, 0xfe, 0xc4, 0xd2 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x69, 0x74, 0x53, 0x4e, + 0xcf, 0xd2, 0xf5, 0xe8, 0xa6, 0xbb, 0x9c, 0x81 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x17, 0x2e, 0x39, 0x5c, 0x4b, 0x72, 0x65, + 0xb8, 0xaf, 0x96, 0x81, 0xe4, 0xf3, 0xca, 0xdd }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x18, 0x30, 0x28, 0x60, 0x78, 0x50, 0x48, + 0xc0, 0xd8, 0xf0, 0xe8, 0xa0, 0xb8, 0x90, 0x88 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x9c, 0x81, 0xbb, 0xa6, 0xd2, 0xcf, 0xf5, 0xe8 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x19, 0x32, 0x2b, 0x64, 0x7d, 0x56, 0x4f, + 0xc8, 0xd1, 0xfa, 0xe3, 0xac, 0xb5, 0x9e, 0x87 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1a, 0x34, 0x2e, 0x68, 0x72, 0x5c, 0x46, + 0xd0, 0xca, 0xe4, 0xfe, 0xb8, 0xa2, 0x8c, 0x96 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x4e, 0x53, 0x69, 0x74, + 0x81, 0x9c, 0xa6, 0xbb, 0xcf, 0xd2, 0xe8, 0xf5 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1b, 0x36, 0x2d, 0x6c, 0x77, 0x5a, 0x41, + 0xd8, 0xc3, 0xee, 0xf5, 0xb4, 0xaf, 0x82, 0x99 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1c, 0x38, 0x24, 0x70, 0x6c, 0x48, 0x54, + 0xe0, 0xfc, 0xd8, 0xc4, 0x90, 0x8c, 0xa8, 0xb4 }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xa6, 0xbb, 0x81, 0x9c, 0xf5, 0xe8, 0xd2, 0xcf }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1d, 0x3a, 0x27, 0x74, 0x69, 0x4e, 0x53, + 0xe8, 0xf5, 0xd2, 0xcf, 0x9c, 0x81, 0xa6, 0xbb }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1e, 0x3c, 0x22, 0x78, 0x66, 0x44, 0x5a, + 0xf0, 0xee, 0xcc, 0xd2, 0x88, 0x96, 0xb4, 0xaa }, + { 0x00, 0x1d, 0x27, 0x3a, 0x53, 0x4e, 0x74, 0x69, + 0xbb, 0xa6, 0x9c, 0x81, 0xe8, 0xf5, 0xcf, 0xd2 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x1f, 0x3e, 0x21, 0x7c, 0x63, 0x42, 0x5d, + 0xf8, 0xe7, 0xc6, 0xd9, 0x84, 0x9b, 0xba, 0xa5 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xcd, 0xf7, 0xb9, 0x83, 0x25, 0x1f, 0x51, 0x6b }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x21, 0x42, 0x63, 0x84, 0xa5, 0xc6, 0xe7, + 0x08, 0x29, 0x4a, 0x6b, 0x8c, 0xad, 0xce, 0xef }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x22, 0x44, 0x66, 0x88, 0xaa, 0xcc, 0xee, + 0x10, 0x32, 0x54, 0x76, 0x98, 0xba, 0xdc, 0xfe }, + { 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6, + 0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x23, 0x46, 0x65, 0x8c, 0xaf, 0xca, 0xe9, + 0x18, 0x3b, 0x5e, 0x7d, 0x94, 0xb7, 0xd2, 0xf1 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 
0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x24, 0x48, 0x6c, 0x90, 0xb4, 0xd8, 0xfc, + 0x20, 0x04, 0x68, 0x4c, 0xb0, 0x94, 0xf8, 0xdc }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xf7, 0xcd, 0x83, 0xb9, 0x02, 0x38, 0x76, 0x4c }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x25, 0x4a, 0x6f, 0x94, 0xb1, 0xde, 0xfb, + 0x28, 0x0d, 0x62, 0x47, 0xbc, 0x99, 0xf6, 0xd3 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x26, 0x4c, 0x6a, 0x98, 0xbe, 0xd4, 0xf2, + 0x30, 0x16, 0x7c, 0x5a, 0xa8, 0x8e, 0xe4, 0xc2 }, + { 0x00, 0x3a, 0x74, 0x4e, 0xf5, 0xcf, 0x81, 0xbb, + 0xea, 0xd0, 0x9e, 0xa4, 0x1f, 0x25, 0x6b, 0x51 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x28, 0x50, 0x78, 0xa0, 0x88, 0xf0, 0xd8, + 0x40, 0x68, 0x10, 0x38, 0xe0, 0xc8, 0xb0, 0x98 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xb9, 0x83, 0xd0, 0xea, 0x6b, 0x51, 0x02, 0x38 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x29, 0x52, 0x7b, 0xa4, 0x8d, 0xf6, 0xdf, + 0x48, 0x61, 0x1a, 0x33, 0xec, 0xc5, 0xbe, 0x97 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2a, 0x54, 0x7e, 0xa8, 0x82, 0xfc, 0xd6, + 0x50, 0x7a, 0x04, 0x2e, 0xf8, 0xd2, 0xac, 0x86 }, + { 0x00, 0x3a, 0x69, 0x53, 0xd2, 0xe8, 0xbb, 0x81, + 0xa4, 0x9e, 0xcd, 0xf7, 0x76, 0x4c, 0x1f, 0x25 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2b, 0x56, 0x7d, 0xac, 0x87, 0xfa, 0xd1, + 0x58, 0x73, 0x0e, 0x25, 0xf4, 0xdf, 0xa2, 0x89 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 
0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2c, 0x58, 0x74, 0xb0, 0x9c, 0xe8, 0xc4, + 0x60, 0x4c, 0x38, 0x14, 0xd0, 0xfc, 0x88, 0xa4 }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x83, 0xb9, 0xea, 0xd0, 0x4c, 0x76, 0x25, 0x1f }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2d, 0x5a, 0x77, 0xb4, 0x99, 0xee, 0xc3, + 0x68, 0x45, 0x32, 0x1f, 0xdc, 0xf1, 0x86, 0xab }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2e, 0x5c, 0x72, 0xb8, 0x96, 0xe4, 0xca, + 0x70, 0x5e, 0x2c, 0x02, 0xc8, 0xe6, 0x94, 0xba }, + { 0x00, 0x3a, 0x69, 0x53, 0xcf, 0xf5, 0xa6, 0x9c, + 0x9e, 0xa4, 0xf7, 0xcd, 0x51, 0x6b, 0x38, 0x02 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x2f, 0x5e, 0x71, 0xbc, 0x93, 0xe2, 0xcd, + 0x78, 0x57, 0x26, 0x09, 0xc4, 0xeb, 0x9a, 0xb5 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x25, 0x02, 0x6b, 0x4c, 0xb9, 0x9e, 0xf7, 0xd0 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x31, 0x62, 0x53, 0xc4, 0xf5, 0xa6, 0x97, + 0x88, 0xb9, 0xea, 0xdb, 0x4c, 0x7d, 0x2e, 0x1f }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x32, 0x64, 0x56, 0xc8, 0xfa, 0xac, 0x9e, + 0x90, 0xa2, 0xf4, 0xc6, 0x58, 0x6a, 0x3c, 0x0e }, + { 0x00, 0x27, 0x4e, 0x69, 0x9c, 0xbb, 0xd2, 0xf5, + 0x38, 0x1f, 0x76, 0x51, 0xa4, 0x83, 0xea, 0xcd }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x33, 0x66, 0x55, 0xcc, 0xff, 0xaa, 0x99, + 0x98, 0xab, 0xfe, 0xcd, 0x54, 0x67, 0x32, 0x01 }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 0xf7 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d }, + { 0x00, 0x34, 0x68, 0x5c, 0xd0, 0xe4, 0xb8, 0x8c, + 0xa0, 0x94, 0xc8, 0xfc, 0x70, 0x44, 0x18, 0x2c }, + { 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8, + 0x1f, 0x38, 0x51, 0x76, 0x9e, 0xb9, 0xd0, 
0xf7 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x35, 0x6a, 0x5f, 0xd4, 0xe1, 0xbe, 0x8b,
+	    0xa8, 0x9d, 0xc2, 0xf7, 0x7c, 0x49, 0x16, 0x23 },
+	{ 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+	    0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x36, 0x6c, 0x5a, 0xd8, 0xee, 0xb4, 0x82,
+	    0xb0, 0x86, 0xdc, 0xea, 0x68, 0x5e, 0x04, 0x32 },
+	{ 0x00, 0x27, 0x4e, 0x69, 0x81, 0xa6, 0xcf, 0xe8,
+	    0x02, 0x25, 0x4c, 0x6b, 0x83, 0xa4, 0xcd, 0xea },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x37, 0x6e, 0x59, 0xdc, 0xeb, 0xb2, 0x85,
+	    0xb8, 0x8f, 0xd6, 0xe1, 0x64, 0x53, 0x0a, 0x3d },
+	{ 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+	    0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x38, 0x70, 0x48, 0xe0, 0xd8, 0x90, 0xa8,
+	    0xc0, 0xf8, 0xb0, 0x88, 0x20, 0x18, 0x50, 0x68 },
+	{ 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+	    0x51, 0x76, 0x02, 0x25, 0xf7, 0xd0, 0xa4, 0x83 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x39, 0x72, 0x4b, 0xe4, 0xdd, 0x96, 0xaf,
+	    0xc8, 0xf1, 0xba, 0x83, 0x2c, 0x15, 0x5e, 0x67 },
+	{ 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+	    0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x3a, 0x74, 0x4e, 0xe8, 0xd2, 0x9c, 0xa6,
+	    0xd0, 0xea, 0xa4, 0x9e, 0x38, 0x02, 0x4c, 0x76 },
+	{ 0x00, 0x27, 0x53, 0x74, 0xa6, 0x81, 0xf5, 0xd2,
+	    0x4c, 0x6b, 0x1f, 0x38, 0xea, 0xcd, 0xb9, 0x9e },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x3b, 0x76, 0x4d, 0xec, 0xd7, 0x9a, 0xa1,
+	    0xd8, 0xe3, 0xae, 0x95, 0x34, 0x0f, 0x42, 0x79 },
+	{ 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+	    0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x3c, 0x78, 0x44, 0xf0, 0xcc, 0x88, 0xb4,
+	    0xe0, 0xdc, 0x98, 0xa4, 0x10, 0x2c, 0x68, 0x54 },
+	{ 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+	    0x6b, 0x4c, 0x38, 0x1f, 0xd0, 0xf7, 0x83, 0xa4 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x3d, 0x7a, 0x47, 0xf4, 0xc9, 0x8e, 0xb3,
+	    0xe8, 0xd5, 0x92, 0xaf, 0x1c, 0x21, 0x66, 0x5b },
+	{ 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+	    0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x3e, 0x7c, 0x42, 0xf8, 0xc6, 0x84, 0xba,
+	    0xf0, 0xce, 0x8c, 0xb2, 0x08, 0x36, 0x74, 0x4a },
+	{ 0x00, 0x27, 0x53, 0x74, 0xbb, 0x9c, 0xe8, 0xcf,
+	    0x76, 0x51, 0x25, 0x02, 0xcd, 0xea, 0x9e, 0xb9 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d, 0x1d },
+	{ 0x00, 0x3f, 0x7e, 0x41, 0xfc, 0xc3, 0x82, 0xbd,
+	    0xf8, 0xc7, 0x86, 0xb9, 0x04, 0x3b, 0x7a, 0x45 },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+	    0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+	    0x87, 0xf3, 0x6f, 0x1b, 0x4a, 0x3e, 0xa2, 0xd6 },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x41, 0x82, 0xc3, 0x04, 0x45, 0x86, 0xc7,
+	    0x08, 0x49, 0x8a, 0xcb, 0x0c, 0x4d, 0x8e, 0xcf },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+	    0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x42, 0x84, 0xc6, 0x08, 0x4a, 0x8c, 0xce,
+	    0x10, 0x52, 0x94, 0xd6, 0x18, 0x5a, 0x9c, 0xde },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xcd, 0xb9, 0x25, 0x51,
+	    0x9a, 0xee, 0x72, 0x06, 0x57, 0x23, 0xbf, 0xcb },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x43, 0x86, 0xc5, 0x0c, 0x4f, 0x8a, 0xc9,
+	    0x18, 0x5b, 0x9e, 0xdd, 0x14, 0x57, 0x92, 0xd1 },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+	    0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x44, 0x88, 0xcc, 0x10, 0x54, 0x98, 0xdc,
+	    0x20, 0x64, 0xa8, 0xec, 0x30, 0x74, 0xb8, 0xfc },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+	    0xbd, 0xc9, 0x55, 0x21, 0x6d, 0x19, 0x85, 0xf1 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x45, 0x8a, 0xcf, 0x14, 0x51, 0x9e, 0xdb,
+	    0x28, 0x6d, 0xa2, 0xe7, 0x3c, 0x79, 0xb6, 0xf3 },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+	    0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x46, 0x8c, 0xca, 0x18, 0x5e, 0x94, 0xd2,
+	    0x30, 0x76, 0xbc, 0xfa, 0x28, 0x6e, 0xa4, 0xe2 },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+	    0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x47, 0x8e, 0xc9, 0x1c, 0x5b, 0x92, 0xd5,
+	    0x38, 0x7f, 0xb6, 0xf1, 0x24, 0x63, 0xaa, 0xed },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+	    0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x48, 0x90, 0xd8, 0x20, 0x68, 0xb0, 0xf8,
+	    0x40, 0x08, 0xd0, 0x98, 0x60, 0x28, 0xf0, 0xb8 },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+	    0xf3, 0x87, 0x06, 0x72, 0x04, 0x70, 0xf1, 0x85 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x49, 0x92, 0xdb, 0x24, 0x6d, 0xb6, 0xff,
+	    0x48, 0x01, 0xda, 0x93, 0x6c, 0x25, 0xfe, 0xb7 },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+	    0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x4a, 0x94, 0xde, 0x28, 0x62, 0xbc, 0xf6,
+	    0x50, 0x1a, 0xc4, 0x8e, 0x78, 0x32, 0xec, 0xa6 },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xf7, 0x83, 0x02, 0x76,
+	    0xee, 0x9a, 0x1b, 0x6f, 0x19, 0x6d, 0xec, 0x98 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x4b, 0x96, 0xdd, 0x2c, 0x67, 0xba, 0xf1,
+	    0x58, 0x13, 0xce, 0x85, 0x74, 0x3f, 0xe2, 0xa9 },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+	    0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x4c, 0x98, 0xd4, 0x30, 0x7c, 0xa8, 0xe4,
+	    0x60, 0x2c, 0xf8, 0xb4, 0x50, 0x1c, 0xc8, 0x84 },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+	    0xc9, 0xbd, 0x3c, 0x48, 0x23, 0x57, 0xd6, 0xa2 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x4d, 0x9a, 0xd7, 0x34, 0x79, 0xae, 0xe3,
+	    0x68, 0x25, 0xf2, 0xbf, 0x5c, 0x11, 0xc6, 0x8b },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+	    0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+	    0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+	{ 0x00, 0x74, 0xf5, 0x81, 0xea, 0x9e, 0x1f, 0x6b,
+	    0xd4, 0xa0, 0x21, 0x55, 0x3e, 0x4a, 0xcb, 0xbf },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x4f, 0x9e, 0xd1, 0x3c, 0x73, 0xa2, 0xed,
+	    0x78, 0x37, 0xe6, 0xa9, 0x44, 0x0b, 0xda, 0x95 },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+	    0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+	    0x6f, 0x06, 0xbd, 0xd4, 0xd6, 0xbf, 0x04, 0x6d },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x51, 0xa2, 0xf3, 0x44, 0x15, 0xe6, 0xb7,
+	    0x88, 0xd9, 0x2a, 0x7b, 0xcc, 0x9d, 0x6e, 0x3f },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+	    0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x52, 0xa4, 0xf6, 0x48, 0x1a, 0xec, 0xbe,
+	    0x90, 0xc2, 0x34, 0x66, 0xd8, 0x8a, 0x7c, 0x2e },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xb9, 0xd0, 0x6b, 0x02,
+	    0x72, 0x1b, 0xa0, 0xc9, 0xcb, 0xa2, 0x19, 0x70 },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+	    0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+	    0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x54, 0xa8, 0xfc, 0x50, 0x04, 0xf8, 0xac,
+	    0xa0, 0xf4, 0x08, 0x5c, 0xf0, 0xa4, 0x58, 0x0c },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+	    0x55, 0x3c, 0x87, 0xee, 0xf1, 0x98, 0x23, 0x4a },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x55, 0xaa, 0xff, 0x54, 0x01, 0xfe, 0xab,
+	    0xa8, 0xfd, 0x02, 0x57, 0xfc, 0xa9, 0x56, 0x03 },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+	    0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x56, 0xac, 0xfa, 0x58, 0x0e, 0xf4, 0xa2,
+	    0xb0, 0xe6, 0x1c, 0x4a, 0xe8, 0xbe, 0x44, 0x12 },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+	    0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x57, 0xae, 0xf9, 0x5c, 0x0b, 0xf2, 0xa5,
+	    0xb8, 0xef, 0x16, 0x41, 0xe4, 0xb3, 0x4a, 0x1d },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+	    0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x58, 0xb0, 0xe8, 0x60, 0x38, 0xd0, 0x88,
+	    0xc0, 0x98, 0x70, 0x28, 0xa0, 0xf8, 0x10, 0x48 },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+	    0x1b, 0x72, 0xd4, 0xbd, 0x98, 0xf1, 0x57, 0x3e },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x59, 0xb2, 0xeb, 0x64, 0x3d, 0xd6, 0x8f,
+	    0xc8, 0x91, 0x7a, 0x23, 0xac, 0xf5, 0x1e, 0x47 },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+	    0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x5a, 0xb4, 0xee, 0x68, 0x32, 0xdc, 0x86,
+	    0xd0, 0x8a, 0x64, 0x3e, 0xb8, 0xe2, 0x0c, 0x56 },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x83, 0xea, 0x4c, 0x25,
+	    0x06, 0x6f, 0xc9, 0xa0, 0x85, 0xec, 0x4a, 0x23 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x5b, 0xb6, 0xed, 0x6c, 0x37, 0xda, 0x81,
+	    0xd8, 0x83, 0x6e, 0x35, 0xb4, 0xef, 0x02, 0x59 },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+	    0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x5c, 0xb8, 0xe4, 0x70, 0x2c, 0xc8, 0x94,
+	    0xe0, 0xbc, 0x58, 0x04, 0x90, 0xcc, 0x28, 0x74 },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+	    0x21, 0x48, 0xee, 0x87, 0xbf, 0xd6, 0x70, 0x19 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x5d, 0xba, 0xe7, 0x74, 0x29, 0xce, 0x93,
+	    0xe8, 0xb5, 0x52, 0x0f, 0x9c, 0xc1, 0x26, 0x7b },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+	    0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x5e, 0xbc, 0xe2, 0x78, 0x26, 0xc4, 0x9a,
+	    0xf0, 0xae, 0x4c, 0x12, 0x88, 0xd6, 0x34, 0x6a },
+	{ 0x00, 0x69, 0xcf, 0xa6, 0x9e, 0xf7, 0x51, 0x38,
+	    0x3c, 0x55, 0xf3, 0x9a, 0xa2, 0xcb, 0x6d, 0x04 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x3a, 0x3a, 0x3a, 0x3a, 0x27, 0x27, 0x27, 0x27 },
+	{ 0x00, 0x5f, 0xbe, 0xe1, 0x7c, 0x23, 0xc2, 0x9d,
+	    0xf8, 0xa7, 0x46, 0x19, 0x84, 0xdb, 0x3a, 0x65 },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+	    0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+	    0x4a, 0x04, 0xd6, 0x98, 0x6f, 0x21, 0xf3, 0xbd },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x61, 0xc2, 0xa3, 0x84, 0xe5, 0x46, 0x27,
+	    0x08, 0x69, 0xca, 0xab, 0x8c, 0xed, 0x4e, 0x2f },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+	    0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x62, 0xc4, 0xa6, 0x88, 0xea, 0x4c, 0x2e,
+	    0x10, 0x72, 0xd4, 0xb6, 0x98, 0xfa, 0x5c, 0x3e },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x25, 0x6b, 0xb9, 0xf7,
+	    0x57, 0x19, 0xcb, 0x85, 0x72, 0x3c, 0xee, 0xa0 },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x63, 0xc6, 0xa5, 0x8c, 0xef, 0x4a, 0x29,
+	    0x18, 0x7b, 0xde, 0xbd, 0x94, 0xf7, 0x52, 0x31 },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+	    0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x64, 0xc8, 0xac, 0x90, 0xf4, 0x58, 0x3c,
+	    0x20, 0x44, 0xe8, 0x8c, 0xb0, 0xd4, 0x78, 0x1c },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+	    0x70, 0x3e, 0xec, 0xa2, 0x48, 0x06, 0xd4, 0x9a },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x65, 0xca, 0xaf, 0x94, 0xf1, 0x5e, 0x3b,
+	    0x28, 0x4d, 0xe2, 0x87, 0xbc, 0xd9, 0x76, 0x13 },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+	    0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x66, 0xcc, 0xaa, 0x98, 0xfe, 0x54, 0x32,
+	    0x30, 0x56, 0xfc, 0x9a, 0xa8, 0xce, 0x64, 0x02 },
+	{ 0x00, 0x4e, 0x9c, 0xd2, 0x38, 0x76, 0xa4, 0xea,
+	    0x6d, 0x23, 0xf1, 0xbf, 0x55, 0x1b, 0xc9, 0x87 },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x67, 0xce, 0xa9, 0x9c, 0xfb, 0x52, 0x35,
+	    0x38, 0x5f, 0xf6, 0x91, 0xa4, 0xc3, 0x6a, 0x0d },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+	    0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x68, 0xd0, 0xb8, 0xa0, 0xc8, 0x70, 0x18,
+	    0x40, 0x28, 0x90, 0xf8, 0xe0, 0x88, 0x30, 0x58 },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+	    0x3e, 0x70, 0xbf, 0xf1, 0x21, 0x6f, 0xa0, 0xee },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x69, 0xd2, 0xbb, 0xa4, 0xcd, 0x76, 0x1f,
+	    0x48, 0x21, 0x9a, 0xf3, 0xec, 0x85, 0x3e, 0x57 },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+	    0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x6a, 0xd4, 0xbe, 0xa8, 0xc2, 0x7c, 0x16,
+	    0x50, 0x3a, 0x84, 0xee, 0xf8, 0x92, 0x2c, 0x46 },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x1f, 0x51, 0x9e, 0xd0,
+	    0x23, 0x6d, 0xa2, 0xec, 0x3c, 0x72, 0xbd, 0xf3 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x6b, 0xd6, 0xbd, 0xac, 0xc7, 0x7a, 0x11,
+	    0x58, 0x33, 0x8e, 0xe5, 0xf4, 0x9f, 0x22, 0x49 },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+	    0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x6c, 0xd8, 0xb4, 0xb0, 0xdc, 0x68, 0x04,
+	    0x60, 0x0c, 0xb8, 0xd4, 0xd0, 0xbc, 0x08, 0x64 },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+	    0x04, 0x4a, 0x85, 0xcb, 0x06, 0x48, 0x87, 0xc9 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x6d, 0xda, 0xb7, 0xb4, 0xd9, 0x6e, 0x03,
+	    0x68, 0x05, 0xb2, 0xdf, 0xdc, 0xb1, 0x06, 0x6b },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+	    0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x6e, 0xdc, 0xb2, 0xb8, 0xd6, 0x64, 0x0a,
+	    0x70, 0x1e, 0xac, 0xc2, 0xc8, 0xa6, 0x14, 0x7a },
+	{ 0x00, 0x4e, 0x81, 0xcf, 0x02, 0x4c, 0x83, 0xcd,
+	    0x19, 0x57, 0x98, 0xd6, 0x1b, 0x55, 0x9a, 0xd4 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x6f, 0xde, 0xb1, 0xbc, 0xd3, 0x62, 0x0d,
+	    0x78, 0x17, 0xa6, 0xc9, 0xc4, 0xab, 0x1a, 0x75 },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+	    0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+	    0xa2, 0xf1, 0x04, 0x57, 0xf3, 0xa0, 0x55, 0x06 },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x71, 0xe2, 0x93, 0xc4, 0xb5, 0x26, 0x57,
+	    0x88, 0xf9, 0x6a, 0x1b, 0x4c, 0x3d, 0xae, 0xdf },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+	    0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x72, 0xe4, 0x96, 0xc8, 0xba, 0x2c, 0x5e,
+	    0x90, 0xe2, 0x74, 0x06, 0x58, 0x2a, 0xbc, 0xce },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x51, 0x02, 0xf7, 0xa4,
+	    0xbf, 0xec, 0x19, 0x4a, 0xee, 0xbd, 0x48, 0x1b },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x73, 0xe6, 0x95, 0xcc, 0xbf, 0x2a, 0x59,
+	    0x98, 0xeb, 0x7e, 0x0d, 0x54, 0x27, 0xb2, 0xc1 },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+	    0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x74, 0xe8, 0x9c, 0xd0, 0xa4, 0x38, 0x4c,
+	    0xa0, 0xd4, 0x48, 0x3c, 0x70, 0x04, 0x98, 0xec },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+	    0x98, 0xcb, 0x3e, 0x6d, 0xd4, 0x87, 0x72, 0x21 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x75, 0xea, 0x9f, 0xd4, 0xa1, 0x3e, 0x4b,
+	    0xa8, 0xdd, 0x42, 0x37, 0x7c, 0x09, 0x96, 0xe3 },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+	    0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x76, 0xec, 0x9a, 0xd8, 0xae, 0x34, 0x42,
+	    0xb0, 0xc6, 0x5c, 0x2a, 0x68, 0x1e, 0x84, 0xf2 },
+	{ 0x00, 0x53, 0xa6, 0xf5, 0x4c, 0x1f, 0xea, 0xb9,
+	    0x85, 0xd6, 0x23, 0x70, 0xc9, 0x9a, 0x6f, 0x3c },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x77, 0xee, 0x99, 0xdc, 0xab, 0x32, 0x45,
+	    0xb8, 0xcf, 0x56, 0x21, 0x64, 0x13, 0x8a, 0xfd },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+	    0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x78, 0xf0, 0x88, 0xe0, 0x98, 0x10, 0x68,
+	    0xc0, 0xb8, 0x30, 0x48, 0x20, 0x58, 0xd0, 0xa8 },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+	    0xd6, 0x85, 0x6d, 0x3e, 0xbd, 0xee, 0x06, 0x55 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x79, 0xf2, 0x8b, 0xe4, 0x9d, 0x16, 0x6f,
+	    0xc8, 0xb1, 0x3a, 0x43, 0x2c, 0x55, 0xde, 0xa7 },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+	    0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x7a, 0xf4, 0x8e, 0xe8, 0x92, 0x1c, 0x66,
+	    0xd0, 0xaa, 0x24, 0x5e, 0x38, 0x42, 0xcc, 0xb6 },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x6b, 0x38, 0xd0, 0x83,
+	    0xcb, 0x98, 0x70, 0x23, 0xa0, 0xf3, 0x1b, 0x48 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x7b, 0xf6, 0x8d, 0xec, 0x97, 0x1a, 0x61,
+	    0xd8, 0xa3, 0x2e, 0x55, 0x34, 0x4f, 0xc2, 0xb9 },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+	    0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x7c, 0xf8, 0x84, 0xf0, 0x8c, 0x08, 0x74,
+	    0xe0, 0x9c, 0x18, 0x64, 0x10, 0x6c, 0xe8, 0x94 },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+	    0xec, 0xbf, 0x57, 0x04, 0x9a, 0xc9, 0x21, 0x72 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x7d, 0xfa, 0x87, 0xf4, 0x89, 0x0e, 0x73,
+	    0xe8, 0x95, 0x12, 0x6f, 0x1c, 0x61, 0xe6, 0x9b },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+	    0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x7e, 0xfc, 0x82, 0xf8, 0x86, 0x04, 0x7a,
+	    0xf0, 0x8e, 0x0c, 0x72, 0x08, 0x76, 0xf4, 0x8a },
+	{ 0x00, 0x53, 0xbb, 0xe8, 0x76, 0x25, 0xcd, 0x9e,
+	    0xf1, 0xa2, 0x4a, 0x19, 0x87, 0xd4, 0x3c, 0x6f },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x1d, 0x1d, 0x1d, 0x1d,
+	    0x27, 0x27, 0x27, 0x27, 0x3a, 0x3a, 0x3a, 0x3a },
+	{ 0x00, 0x7f, 0xfe, 0x81, 0xfc, 0x83, 0x02, 0x7d,
+	    0xf8, 0x87, 0x06, 0x79, 0x04, 0x7b, 0xfa, 0x85 },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+	    0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+	    0x13, 0xfb, 0xde, 0x36, 0x94, 0x7c, 0x59, 0xb1 },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87,
+	    0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+	    0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x82, 0x04, 0x86, 0x08, 0x8a, 0x0c, 0x8e,
+	    0x10, 0x92, 0x14, 0x96, 0x18, 0x9a, 0x1c, 0x9e },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x87, 0x6f, 0x4a, 0xa2,
+	    0x0e, 0xe6, 0xc3, 0x2b, 0x89, 0x61, 0x44, 0xac },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x83, 0x06, 0x85, 0x0c, 0x8f, 0x0a, 0x89,
+	    0x18, 0x9b, 0x1e, 0x9d, 0x14, 0x97, 0x12, 0x91 },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+	    0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x84, 0x08, 0x8c, 0x10, 0x94, 0x18, 0x9c,
+	    0x20, 0xa4, 0x28, 0xac, 0x30, 0xb4, 0x38, 0xbc },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+	    0x29, 0xc1, 0xe4, 0x0c, 0xb3, 0x5b, 0x7e, 0x96 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x85, 0x0a, 0x8f, 0x14, 0x91, 0x1e, 0x9b,
+	    0x28, 0xad, 0x22, 0xa7, 0x3c, 0xb9, 0x36, 0xb3 },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+	    0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x86, 0x0c, 0x8a, 0x18, 0x9e, 0x14, 0x92,
+	    0x30, 0xb6, 0x3c, 0xba, 0x28, 0xae, 0x24, 0xa2 },
+	{ 0x00, 0xe8, 0xcd, 0x25, 0x9a, 0x72, 0x57, 0xbf,
+	    0x34, 0xdc, 0xf9, 0x11, 0xae, 0x46, 0x63, 0x8b },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x87, 0x0e, 0x89, 0x1c, 0x9b, 0x12, 0x95,
+	    0x38, 0xbf, 0x36, 0xb1, 0x24, 0xa3, 0x2a, 0xad },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+	    0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x88, 0x10, 0x98, 0x20, 0xa8, 0x30, 0xb8,
+	    0x40, 0xc8, 0x50, 0xd8, 0x60, 0xe8, 0x70, 0xf8 },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+	    0x67, 0x8f, 0xb7, 0x5f, 0xda, 0x32, 0x0a, 0xe2 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x89, 0x12, 0x9b, 0x24, 0xad, 0x36, 0xbf,
+	    0x48, 0xc1, 0x5a, 0xd3, 0x6c, 0xe5, 0x7e, 0xf7 },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+	    0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x8a, 0x14, 0x9e, 0x28, 0xa2, 0x3c, 0xb6,
+	    0x50, 0xda, 0x44, 0xce, 0x78, 0xf2, 0x6c, 0xe6 },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xbd, 0x55, 0x6d, 0x85,
+	    0x7a, 0x92, 0xaa, 0x42, 0xc7, 0x2f, 0x17, 0xff },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x8b, 0x16, 0x9d, 0x2c, 0xa7, 0x3a, 0xb1,
+	    0x58, 0xd3, 0x4e, 0xc5, 0x74, 0xff, 0x62, 0xe9 },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+	    0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x8c, 0x18, 0x94, 0x30, 0xbc, 0x28, 0xa4,
+	    0x60, 0xec, 0x78, 0xf4, 0x50, 0xdc, 0x48, 0xc4 },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+	    0x5d, 0xb5, 0x8d, 0x65, 0xfd, 0x15, 0x2d, 0xc5 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x8d, 0x1a, 0x97, 0x34, 0xb9, 0x2e, 0xa3,
+	    0x68, 0xe5, 0x72, 0xff, 0x5c, 0xd1, 0x46, 0xcb },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+	    0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x8e, 0x1c, 0x92, 0x38, 0xb6, 0x24, 0xaa,
+	    0x70, 0xfe, 0x6c, 0xe2, 0x48, 0xc6, 0x54, 0xda },
+	{ 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98,
+	    0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x8f, 0x1e, 0x91, 0x3c, 0xb3, 0x22, 0xad,
+	    0x78, 0xf7, 0x66, 0xe9, 0x44, 0xcb, 0x5a, 0xd5 },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+	    0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+	    0xfb, 0x0e, 0x0c, 0xf9, 0x08, 0xfd, 0xff, 0x0a },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x91, 0x22, 0xb3, 0x44, 0xd5, 0x66, 0xf7,
+	    0x88, 0x19, 0xaa, 0x3b, 0xcc, 0x5d, 0xee, 0x7f },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+	    0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x92, 0x24, 0xb6, 0x48, 0xda, 0x6c, 0xfe,
+	    0x90, 0x02, 0xb4, 0x26, 0xd8, 0x4a, 0xfc, 0x6e },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xf3, 0x06, 0x04, 0xf1,
+	    0xe6, 0x13, 0x11, 0xe4, 0x15, 0xe0, 0xe2, 0x17 },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x93, 0x26, 0xb5, 0x4c, 0xdf, 0x6a, 0xf9,
+	    0x98, 0x0b, 0xbe, 0x2d, 0xd4, 0x47, 0xf2, 0x61 },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+	    0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x94, 0x28, 0xbc, 0x50, 0xc4, 0x78, 0xec,
+	    0xa0, 0x34, 0x88, 0x1c, 0xf0, 0x64, 0xd8, 0x4c },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+	    0xc1, 0x34, 0x36, 0xc3, 0x2f, 0xda, 0xd8, 0x2d },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x95, 0x2a, 0xbf, 0x54, 0xc1, 0x7e, 0xeb,
+	    0xa8, 0x3d, 0x82, 0x17, 0xfc, 0x69, 0xd6, 0x43 },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+	    0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x96, 0x2c, 0xba, 0x58, 0xce, 0x74, 0xe2,
+	    0xb0, 0x26, 0x9c, 0x0a, 0xe8, 0x7e, 0xc4, 0x52 },
+	{ 0x00, 0xf5, 0xf7, 0x02, 0xee, 0x1b, 0x19, 0xec,
+	    0xdc, 0x29, 0x2b, 0xde, 0x32, 0xc7, 0xc5, 0x30 },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x97, 0x2e, 0xb9, 0x5c, 0xcb, 0x72, 0xe5,
+	    0xb8, 0x2f, 0x96, 0x01, 0xe4, 0x73, 0xca, 0x5d },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+	    0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x98, 0x30, 0xa8, 0x60, 0xf8, 0x50, 0xc8,
+	    0xc0, 0x58, 0xf0, 0x68, 0xa0, 0x38, 0x90, 0x08 },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+	    0x8f, 0x7a, 0x65, 0x90, 0x46, 0xb3, 0xac, 0x59 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x99, 0x32, 0xab, 0x64, 0xfd, 0x56, 0xcf,
+	    0xc8, 0x51, 0xfa, 0x63, 0xac, 0x35, 0x9e, 0x07 },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+	    0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x9a, 0x34, 0xae, 0x68, 0xf2, 0x5c, 0xc6,
+	    0xd0, 0x4a, 0xe4, 0x7e, 0xb8, 0x22, 0x8c, 0x16 },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xc9, 0x3c, 0x23, 0xd6,
+	    0x92, 0x67, 0x78, 0x8d, 0x5b, 0xae, 0xb1, 0x44 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x9b, 0x36, 0xad, 0x6c, 0xf7, 0x5a, 0xc1,
+	    0xd8, 0x43, 0xee, 0x75, 0xb4, 0x2f, 0x82, 0x19 },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+	    0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+	    0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+	    0xb5, 0x40, 0x5f, 0xaa, 0x61, 0x94, 0x8b, 0x7e },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x9d, 0x3a, 0xa7, 0x74, 0xe9, 0x4e, 0xd3,
+	    0xe8, 0x75, 0xd2, 0x4f, 0x9c, 0x01, 0xa6, 0x3b },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+	    0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x9e, 0x3c, 0xa2, 0x78, 0xe6, 0x44, 0xda,
+	    0xf0, 0x6e, 0xcc, 0x52, 0x88, 0x16, 0xb4, 0x2a },
+	{ 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb,
+	    0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x74, 0x74, 0x69, 0x69, 0x4e, 0x4e, 0x53, 0x53 },
+	{ 0x00, 0x9f, 0x3e, 0xa1, 0x7c, 0xe3, 0x42, 0xdd,
+	    0xf8, 0x67, 0xc6, 0x59, 0x84, 0x1b, 0xba, 0x25 },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+	    0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+	    0xde, 0x0c, 0x67, 0xb5, 0xb1, 0x63, 0x08, 0xda },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa1, 0x42, 0xe3, 0x84, 0x25, 0xc6, 0x67,
+	    0x08, 0xa9, 0x4a, 0xeb, 0x8c, 0x2d, 0xce, 0x6f },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+	    0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa2, 0x44, 0xe6, 0x88, 0x2a, 0xcc, 0x6e,
+	    0x10, 0xb2, 0x54, 0xf6, 0x98, 0x3a, 0xdc, 0x7e },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x6f, 0xbd, 0xd6, 0x04,
+	    0xc3, 0x11, 0x7a, 0xa8, 0xac, 0x7e, 0x15, 0xc7 },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa3, 0x46, 0xe5, 0x8c, 0x2f, 0xca, 0x69,
+	    0x18, 0xbb, 0x5e, 0xfd, 0x94, 0x37, 0xd2, 0x71 },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+	    0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa4, 0x48, 0xec, 0x90, 0x34, 0xd8, 0x7c,
+	    0x20, 0x84, 0x68, 0xcc, 0xb0, 0x14, 0xf8, 0x5c },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+	    0xe4, 0x36, 0x5d, 0x8f, 0x96, 0x44, 0x2f, 0xfd },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa5, 0x4a, 0xef, 0x94, 0x31, 0xde, 0x7b,
+	    0x28, 0x8d, 0x62, 0xc7, 0xbc, 0x19, 0xf6, 0x53 },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+	    0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72,
+	    0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 },
+	{ 0x00, 0xd2, 0xb9, 0x6b, 0x72, 0xa0, 0xcb, 0x19,
+	    0xf9, 0x2b, 0x40, 0x92, 0x8b, 0x59, 0x32, 0xe0 },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa7, 0x4e, 0xe9, 0x9c, 0x3b, 0xd2, 0x75,
+	    0x38, 0x9f, 0x76, 0xd1, 0xa4, 0x03, 0xea, 0x4d },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+	    0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa8, 0x50, 0xf8, 0xa0, 0x08, 0xf0, 0x58,
+	    0x40, 0xe8, 0x10, 0xb8, 0xe0, 0x48, 0xb0, 0x18 },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+	    0xaa, 0x78, 0x0e, 0xdc, 0xff, 0x2d, 0x5b, 0x89 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xa9, 0x52, 0xfb, 0xa4, 0x0d, 0xf6, 0x5f,
+	    0x48, 0xe1, 0x1a, 0xb3, 0xec, 0x45, 0xbe, 0x17 },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+	    0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xaa, 0x54, 0xfe, 0xa8, 0x02, 0xfc, 0x56,
+	    0x50, 0xfa, 0x04, 0xae, 0xf8, 0x52, 0xac, 0x06 },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x55, 0x87, 0xf1, 0x23,
+	    0xb7, 0x65, 0x13, 0xc1, 0xe2, 0x30, 0x46, 0x94 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xab, 0x56, 0xfd, 0xac, 0x07, 0xfa, 0x51,
+	    0x58, 0xf3, 0x0e, 0xa5, 0xf4, 0x5f, 0xa2, 0x09 },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+	    0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xac, 0x58, 0xf4, 0xb0, 0x1c, 0xe8, 0x44,
+	    0x60, 0xcc, 0x38, 0x94, 0xd0, 0x7c, 0x88, 0x24 },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+	    0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xad, 0x5a, 0xf7, 0xb4, 0x19, 0xee, 0x43,
+	    0x68, 0xc5, 0x32, 0x9f, 0xdc, 0x71, 0x86, 0x2b },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+	    0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xae, 0x5c, 0xf2, 0xb8, 0x16, 0xe4, 0x4a,
+	    0x70, 0xde, 0x2c, 0x82, 0xc8, 0x66, 0x94, 0x3a },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+	    0x8d, 0x5f, 0x29, 0xfb, 0xc5, 0x17, 0x61, 0xb3 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xaf, 0x5e, 0xf1, 0xbc, 0x13, 0xe2, 0x4d,
+	    0x78, 0xd7, 0x26, 0x89, 0xc4, 0x6b, 0x9a, 0x35 },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+	    0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+	    0x36, 0xf9, 0xb5, 0x7a, 0x2d, 0xe2, 0xae, 0x61 },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb1, 0x62, 0xd3, 0xc4, 0x75, 0xa6, 0x17,
+	    0x88, 0x39, 0xea, 0x5b, 0x4c, 0xfd, 0x2e, 0x9f },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+	    0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb2, 0x64, 0xd6, 0xc8, 0x7a, 0xac, 0x1e,
+	    0x90, 0x22, 0xf4, 0x46, 0x58, 0xea, 0x3c, 0x8e },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x1b, 0xd4, 0x98, 0x57,
+	    0x2b, 0xe4, 0xa8, 0x67, 0x30, 0xff, 0xb3, 0x7c },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb3, 0x66, 0xd5, 0xcc, 0x7f, 0xaa, 0x19,
+	    0x98, 0x2b, 0xfe, 0x4d, 0x54, 0xe7, 0x32, 0x81 },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+	    0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb4, 0x68, 0xdc, 0xd0, 0x64, 0xb8, 0x0c,
+	    0xa0, 0x14, 0xc8, 0x7c, 0x70, 0xc4, 0x18, 0xac },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+	    0x0c, 0xc3, 0x8f, 0x40, 0x0a, 0xc5, 0x89, 0x46 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb5, 0x6a, 0xdf, 0xd4, 0x61, 0xbe, 0x0b,
+	    0xa8, 0x1d, 0xc2, 0x77, 0x7c, 0xc9, 0x16, 0xa3 },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+	    0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb6, 0x6c, 0xda, 0xd8, 0x6e, 0xb4, 0x02,
+	    0xb0, 0x06, 0xdc, 0x6a, 0x68, 0xde, 0x04, 0xb2 },
+	{ 0x00, 0xcf, 0x83, 0x4c, 0x06, 0xc9, 0x85, 0x4a,
+	    0x11, 0xde, 0x92, 0x5d, 0x17, 0xd8, 0x94, 0x5b },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb7, 0x6e, 0xd9, 0xdc, 0x6b, 0xb2, 0x05,
+	    0xb8, 0x0f, 0xd6, 0x61, 0x64, 0xd3, 0x0a, 0xbd },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+	    0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb8, 0x70, 0xc8, 0xe0, 0x58, 0x90, 0x28,
+	    0xc0, 0x78, 0xb0, 0x08, 0x20, 0x98, 0x50, 0xe8 },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+	    0x42, 0x8d, 0xdc, 0x13, 0x63, 0xac, 0xfd, 0x32 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xb9, 0x72, 0xcb, 0xe4, 0x5d, 0x96, 0x2f,
+	    0xc8, 0x71, 0xba, 0x03, 0x2c, 0x95, 0x5e, 0xe7 },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+	    0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xba, 0x74, 0xce, 0xe8, 0x52, 0x9c, 0x26,
+	    0xd0, 0x6a, 0xa4, 0x1e, 0x38, 0x82, 0x4c, 0xf6 },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x21, 0xee, 0xbf, 0x70,
+	    0x5f, 0x90, 0xc1, 0x0e, 0x7e, 0xb1, 0xe0, 0x2f },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21,
+	    0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+	    0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xbc, 0x78, 0xc4, 0xf0, 0x4c, 0x88, 0x34,
+	    0xe0, 0x5c, 0x98, 0x24, 0x10, 0xac, 0x68, 0xd4 },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+	    0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xbd, 0x7a, 0xc7, 0xf4, 0x49, 0x8e, 0x33,
+	    0xe8, 0x55, 0x92, 0x2f, 0x1c, 0xa1, 0x66, 0xdb },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+	    0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xbe, 0x7c, 0xc2, 0xf8, 0x46, 0x84, 0x3a,
+	    0xf0, 0x4e, 0x8c, 0x32, 0x08, 0xb6, 0x74, 0xca },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+	    0x65, 0xaa, 0xfb, 0x34, 0x59, 0x96, 0xc7, 0x08 },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x3a, 0x3a, 0x27, 0x27,
+	    0x69, 0x69, 0x74, 0x74, 0x53, 0x53, 0x4e, 0x4e },
+	{ 0x00, 0xbf, 0x7e, 0xc1, 0xfc, 0x43, 0x82, 0x3d,
+	    0xf8, 0x47, 0x86, 0x39, 0x04, 0xbb, 0x7a, 0xc5 },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+	    0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+	    0x94, 0x08, 0xb1, 0x2d, 0xde, 0x42, 0xfb, 0x67 },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc1, 0x82, 0x43, 0x04, 0xc5, 0x86, 0x47,
+	    0x08, 0xc9, 0x8a, 0x4b, 0x0c, 0xcd, 0x8e, 0x4f },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+	    0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc2, 0x84, 0x46, 0x08, 0xca, 0x8c, 0x4e,
+	    0x10, 0xd2, 0x94, 0x56, 0x18, 0xda, 0x9c, 0x5e },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x4a, 0xd6, 0x6f, 0xf3,
+	    0x89, 0x15, 0xac, 0x30, 0xc3, 0x5f, 0xe6, 0x7a },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc3, 0x86, 0x45, 0x0c, 0xcf, 0x8a, 0x49,
+	    0x18, 0xdb, 0x9e, 0x5d, 0x14, 0xd7, 0x92, 0x51 },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+	    0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc4, 0x88, 0x4c, 0x10, 0xd4, 0x98, 0x5c,
+	    0x20, 0xe4, 0xa8, 0x6c, 0x30, 0xf4, 0xb8, 0x7c },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+	    0xae, 0x32, 0x8b, 0x17, 0xf9, 0x65, 0xdc, 0x40 },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc5, 0x8a, 0x4f, 0x14, 0xd1, 0x9e, 0x5b,
+	    0x28, 0xed, 0xa2, 0x67, 0x3c, 0xf9, 0xb6, 0x73 },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+	    0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc6, 0x8c, 0x4a, 0x18, 0xde, 0x94, 0x52,
+	    0x30, 0xf6, 0xbc, 0x7a, 0x28, 0xee, 0xa4, 0x62 },
+	{ 0x00, 0x9c, 0x25, 0xb9, 0x57, 0xcb, 0x72, 0xee,
+	    0xb3, 0x2f, 0x96, 0x0a, 0xe4, 0x78, 0xc1, 0x5d },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc7, 0x8e, 0x49, 0x1c, 0xdb, 0x92, 0x55,
+	    0x38, 0xff, 0xb6, 0x71, 0x24, 0xe3, 0xaa, 0x6d },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+	    0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+	{ 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80,
+	    0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc8, 0x90, 0x58, 0x20, 0xe8, 0xb0, 0x78,
+	    0x40, 0x88, 0xd0, 0x18, 0x60, 0xa8, 0xf0, 0x38 },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+	    0xe0, 0x7c, 0xd8, 0x44, 0x90, 0x0c, 0xa8, 0x34 },
+	{ 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0,
+	    0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xc9, 0x92, 0x5b, 0x24, 0xed, 0xb6, 0x7f,
+	    0x48, 0x81, 0xda, 0x13, 0x6c, 0xa5, 0xfe, 0x37 },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+	    0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+	{ 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60,
+	    0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xca, 0x94, 0x5e, 0x28, 0xe2, 0xbc, 0x76,
+	    0x50, 0x9a, 0xc4, 0x0e, 0x78, 0xb2, 0xec, 0x26 },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x70, 0xec, 0x48, 0xd4,
+	    0xfd, 0x61, 0xc5, 0x59, 0x8d, 0x11, 0xb5, 0x29 },
+	{ 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10,
+	    0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xcb, 0x96, 0x5d, 0x2c, 0xe7, 0xba, 0x71,
+	    0x58, 0x93, 0xce, 0x05, 0x74, 0xbf, 0xe2, 0x29 },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+	    0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+	{ 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40,
+	    0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xcc, 0x98, 0x54, 0x30, 0xfc, 0xa8, 0x64,
+	    0x60, 0xac, 0xf8, 0x34, 0x50, 0x9c, 0xc8, 0x04 },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+	    0xda, 0x46, 0xe2, 0x7e, 0xb7, 0x2b, 0x8f, 0x13 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xcd, 0x9a, 0x57, 0x34, 0xf9, 0xae, 0x63,
+	    0x68, 0xa5, 0xf2, 0x3f, 0x5c, 0x91, 0xc6, 0x0b },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+	    0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+	{ 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0,
+	    0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xce, 0x9c, 0x52, 0x38, 0xf6, 0xa4, 0x6a,
+	    0x70, 0xbe, 0xec, 0x22, 0x48, 0x86, 0xd4, 0x1a },
+	{ 0x00, 0x9c, 0x38, 0xa4, 0x6d, 0xf1, 0x55, 0xc9,
+	    0xc7, 0x5b, 0xff, 0x63, 0xaa, 0x36, 0x92, 0x0e },
+	{ 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0,
+	    0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xcf, 0x9e, 0x51, 0x3c, 0xf3, 0xa2, 0x6d,
+	    0x78, 0xb7, 0xe6, 0x29, 0x44, 0x8b, 0xda, 0x15 },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+	    0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+	{ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+	    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30,
+	    0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+	    0x7c, 0xfd, 0x63, 0xe2, 0x42, 0xc3, 0x5d, 0xdc },
+	{ 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70,
+	    0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd1, 0xa2, 0x73, 0x44, 0x95, 0xe6, 0x37,
+	    0x88, 0x59, 0x2a, 0xfb, 0xcc, 0x1d, 0x6e, 0xbf },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+	    0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+	{ 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0,
+	    0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd2, 0xa4, 0x76, 0x48, 0x9a, 0xec, 0x3e,
+	    0x90, 0x42, 0x34, 0xe6, 0xd8, 0x0a, 0x7c, 0xae },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x3e, 0xbf, 0x21, 0xa0,
+	    0x61, 0xe0, 0x7e, 0xff, 0x5f, 0xde, 0x40, 0xc1 },
+	{ 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90,
+	    0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd3, 0xa6, 0x75, 0x4c, 0x9f, 0xea, 0x39,
+	    0x98, 0x4b, 0x3e, 0xed, 0xd4, 0x07, 0x72, 0xa1 },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+	    0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+	{ 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0,
+	    0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd4, 0xa8, 0x7c, 0x50, 0x84, 0xf8, 0x2c,
+	    0xa0, 0x74, 0x08, 0xdc, 0xf0, 0x24, 0x58, 0x8c },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+	    0x46, 0xc7, 0x59, 0xd8, 0x65, 0xe4, 0x7a, 0xfb },
+	{ 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0,
+	    0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd5, 0xaa, 0x7f, 0x54, 0x81, 0xfe, 0x2b,
+	    0xa8, 0x7d, 0x02, 0xd7, 0xfc, 0x29, 0x56, 0x83 },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+	    0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+	{ 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20,
+	    0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 },
+	{ 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a,
+	    0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 },
+	{ 0x00, 0xd6, 0xac, 0x7a, 0x58, 0x8e, 0xf4, 0x22,
+	    0xb0, 0x66, 0x1c, 0xca, 0xe8, 0x3e, 0x44, 0x92 },
+	{ 0x00, 0x81, 0x1f, 0x9e, 0x23, 0xa2, 0x3c, 0xbd,
+	    0x5b, 0xda, 0x44, 0xc5, 0x78, 0xf9, 0x67, 0xe6 },
+	{ 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50,
+	    0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 },
+	{ 0x00,
0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd7, 0xae, 0x79, 0x5c, 0x8b, 0xf2, 0x25, + 0xb8, 0x6f, 0x16, 0xc1, 0xe4, 0x33, 0x4a, 0x9d }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd8, 0xb0, 0x68, 0x60, 0xb8, 0xd0, 0x08, + 0xc0, 0x18, 0x70, 0xa8, 0xa0, 0x78, 0x10, 0xc8 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x08, 0x89, 0x0a, 0x8b, 0x0c, 0x8d, 0x0e, 0x8f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xd9, 0xb2, 0x6b, 0x64, 0xbd, 0xd6, 0x0f, + 0xc8, 0x11, 0x7a, 0xa3, 0xac, 0x75, 0x1e, 0xc7 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xda, 0xb4, 0x6e, 0x68, 0xb2, 0xdc, 0x06, + 0xd0, 0x0a, 0x64, 0xbe, 0xb8, 0x62, 0x0c, 0xd6 }, + { 0x00, 0x81, 0x02, 0x83, 0x04, 0x85, 0x06, 0x87, + 0x15, 0x94, 0x17, 0x96, 0x11, 0x90, 0x13, 0x92 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdb, 0xb6, 0x6d, 0x6c, 0xb7, 0xda, 0x01, + 0xd8, 0x03, 0x6e, 0xb5, 0xb4, 0x6f, 0x02, 0xd9 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdc, 0xb8, 0x64, 0x70, 0xac, 0xc8, 0x14, + 0xe0, 0x3c, 0x58, 0x84, 0x90, 0x4c, 0x28, 0xf4 }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x32, 0xb3, 0x30, 0xb1, 0x2b, 0xaa, 0x29, 0xa8 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdd, 0xba, 0x67, 0x74, 0xa9, 0xce, 0x13, + 0xe8, 0x35, 0x52, 0x8f, 0x9c, 0x41, 0x26, 0xfb }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xde, 0xbc, 0x62, 0x78, 0xa6, 0xc4, 0x1a, + 0xf0, 0x2e, 0x4c, 0x92, 0x88, 0x56, 0x34, 0xea }, + { 0x00, 0x81, 0x02, 0x83, 0x19, 0x98, 0x1b, 0x9a, + 0x2f, 0xae, 0x2d, 0xac, 0x36, 0xb7, 0x34, 0xb5 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x4e, 0x4e, 0x53, 0x53, 0x69, 0x69, 0x74, 0x74 }, + { 0x00, 0xdf, 0xbe, 0x61, 0x7c, 0xa3, 0xc2, 0x1d, + 0xf8, 0x27, 0x46, 0x99, 0x84, 0x5b, 0x3a, 0xe5 }, + { 0x00, 0xa6, 0x51, 0xf7, 
0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x59, 0xff, 0x08, 0xae, 0xfb, 0x5d, 0xaa, 0x0c }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe1, 0xc2, 0x23, 0x84, 0x65, 0x46, 0xa7, + 0x08, 0xe9, 0xca, 0x2b, 0x8c, 0x6d, 0x4e, 0xaf }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe2, 0xc4, 0x26, 0x88, 0x6a, 0x4c, 0xae, + 0x10, 0xf2, 0xd4, 0x36, 0x98, 0x7a, 0x5c, 0xbe }, + { 0x00, 0xa6, 0x51, 0xf7, 0xa2, 0x04, 0xf3, 0x55, + 0x44, 0xe2, 0x15, 0xb3, 0xe6, 0x40, 0xb7, 0x11 }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe3, 0xc6, 0x25, 0x8c, 0x6f, 0x4a, 0xa9, + 0x18, 0xfb, 0xde, 0x3d, 0x94, 0x77, 0x52, 0xb1 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe4, 0xc8, 0x2c, 0x90, 0x74, 0x58, 0xbc, + 0x20, 0xc4, 0xe8, 0x0c, 0xb0, 0x54, 0x78, 0x9c }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x63, 0xc5, 0x32, 0x94, 0xdc, 0x7a, 0x8d, 0x2b }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe5, 0xca, 0x2f, 0x94, 0x71, 0x5e, 0xbb, + 0x28, 0xcd, 0xe2, 0x07, 0xbc, 0x59, 0x76, 0x93 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe6, 0xcc, 0x2a, 0x98, 0x7e, 0x54, 0xb2, + 0x30, 0xd6, 0xfc, 0x1a, 0xa8, 0x4e, 0x64, 0x82 }, + { 0x00, 0xa6, 0x51, 0xf7, 0xbf, 0x19, 0xee, 0x48, + 0x7e, 0xd8, 0x2f, 0x89, 0xc1, 0x67, 0x90, 0x36 }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe7, 0xce, 0x29, 0x9c, 0x7b, 0x52, 0xb5, + 0x38, 0xdf, 0xf6, 0x11, 0xa4, 0x43, 0x6a, 0x8d }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 
0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe8, 0xd0, 0x38, 0xa0, 0x48, 0x70, 0x98, + 0x40, 0xa8, 0x90, 0x78, 0xe0, 0x08, 0x30, 0xd8 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x2d, 0x8b, 0x61, 0xc7, 0xb5, 0x13, 0xf9, 0x5f }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xe9, 0xd2, 0x3b, 0xa4, 0x4d, 0x76, 0x9f, + 0x48, 0xa1, 0x9a, 0x73, 0xec, 0x05, 0x3e, 0xd7 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xea, 0xd4, 0x3e, 0xa8, 0x42, 0x7c, 0x96, + 0x50, 0xba, 0x84, 0x6e, 0xf8, 0x12, 0x2c, 0xc6 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x98, 0x3e, 0xd4, 0x72, + 0x30, 0x96, 0x7c, 0xda, 0xa8, 0x0e, 0xe4, 0x42 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xeb, 0xd6, 0x3d, 0xac, 0x47, 0x7a, 0x91, + 0x58, 0xb3, 0x8e, 0x65, 0xf4, 0x1f, 0x22, 0xc9 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xec, 0xd8, 0x34, 0xb0, 0x5c, 0x68, 0x84, + 0x60, 0x8c, 0xb8, 0x54, 0xd0, 0x3c, 0x08, 0xe4 }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x17, 0xb1, 0x5b, 0xfd, 0x92, 0x34, 0xde, 0x78 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xed, 0xda, 0x37, 0xb4, 0x59, 0x6e, 0x83, + 0x68, 0x85, 0xb2, 0x5f, 0xdc, 0x31, 0x06, 0xeb }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xee, 0xdc, 0x32, 0xb8, 0x56, 0x64, 0x8a, + 0x70, 0x9e, 0xac, 0x42, 0xc8, 0x26, 0x14, 0xfa }, + { 0x00, 0xa6, 0x4c, 0xea, 0x85, 0x23, 0xc9, 0x6f, + 0x0a, 0xac, 0x46, 0xe0, 0x8f, 0x29, 0xc3, 0x65 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xef, 0xde, 0x31, 0xbc, 0x53, 0x62, 0x8d, + 0x78, 0x97, 0xa6, 0x49, 0xc4, 0x2b, 0x1a, 0xf5 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xb1, 
0x0a, 0xda, 0x61, 0x67, 0xdc, 0x0c, 0xb7 }, + { 0x00, 0x10, 0x20, 0x30, 0x40, 0x50, 0x60, 0x70, + 0x80, 0x90, 0xa0, 0xb0, 0xc0, 0xd0, 0xe0, 0xf0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf1, 0xe2, 0x13, 0xc4, 0x35, 0x26, 0xd7, + 0x88, 0x79, 0x6a, 0x9b, 0x4c, 0xbd, 0xae, 0x5f }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0, + 0x00, 0x20, 0x40, 0x60, 0x80, 0xa0, 0xc0, 0xe0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf2, 0xe4, 0x16, 0xc8, 0x3a, 0x2c, 0xde, + 0x90, 0x62, 0x74, 0x86, 0x58, 0xaa, 0xbc, 0x4e }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xd6, 0x6d, 0xbd, 0x06, + 0xac, 0x17, 0xc7, 0x7c, 0x7a, 0xc1, 0x11, 0xaa }, + { 0x00, 0x30, 0x60, 0x50, 0xc0, 0xf0, 0xa0, 0x90, + 0x80, 0xb0, 0xe0, 0xd0, 0x40, 0x70, 0x20, 0x10 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf3, 0xe6, 0x15, 0xcc, 0x3f, 0x2a, 0xd9, + 0x98, 0x6b, 0x7e, 0x8d, 0x54, 0xa7, 0xb2, 0x41 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0, + 0x00, 0x40, 0x80, 0xc0, 0x00, 0x40, 0x80, 0xc0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf4, 0xe8, 0x1c, 0xd0, 0x24, 0x38, 0xcc, + 0xa0, 0x54, 0x48, 0xbc, 0x70, 0x84, 0x98, 0x6c }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x8b, 0x30, 0xe0, 0x5b, 0x40, 0xfb, 0x2b, 0x90 }, + { 0x00, 0x50, 0xa0, 0xf0, 0x40, 0x10, 0xe0, 0xb0, + 0x80, 0xd0, 0x20, 0x70, 0xc0, 0x90, 0x60, 0x30 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf5, 0xea, 0x1f, 0xd4, 0x21, 0x3e, 0xcb, + 0xa8, 0x5d, 0x42, 0xb7, 0x7c, 0x89, 0x96, 0x63 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20, + 0x00, 0x60, 0xc0, 0xa0, 0x80, 0xe0, 0x40, 0x20 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf6, 0xec, 0x1a, 0xd8, 0x2e, 0x34, 0xc2, + 0xb0, 0x46, 0x5c, 0xaa, 0x68, 0x9e, 0x84, 0x72 }, + { 0x00, 0xbb, 0x6b, 0xd0, 0xcb, 0x70, 0xa0, 0x1b, + 0x96, 0x2d, 0xfd, 0x46, 0x5d, 0xe6, 0x36, 0x8d }, + { 0x00, 0x70, 0xe0, 0x90, 0xc0, 0xb0, 0x20, 0x50, + 0x80, 0xf0, 0x60, 0x10, 0x40, 0x30, 0xa0, 0xd0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf7, 0xee, 0x19, 0xdc, 0x2b, 0x32, 0xc5, + 0xb8, 0x4f, 0x56, 0xa1, 0x64, 0x93, 0x8a, 0x7d }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, + 0x00, 0x80, 0x00, 0x80, 0x00, 0x80, 0x00, 0x80 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf8, 0xf0, 0x08, 0xe0, 0x18, 0x10, 0xe8, + 0xc0, 0x38, 0x30, 0xc8, 0x20, 0xd8, 0xd0, 0x28 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xc5, 0x7e, 0xb3, 0x08, 0x29, 0x92, 0x5f, 0xe4 }, + { 0x00, 0x90, 0x20, 0xb0, 0x40, 0xd0, 0x60, 0xf0, + 0x80, 0x10, 0xa0, 0x30, 0xc0, 0x50, 0xe0, 0x70 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 
0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xf9, 0xf2, 0x0b, 0xe4, 0x1d, 0x16, 0xef, + 0xc8, 0x31, 0x3a, 0xc3, 0x2c, 0xd5, 0xde, 0x27 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60, + 0x00, 0xa0, 0x40, 0xe0, 0x80, 0x20, 0xc0, 0x60 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfa, 0xf4, 0x0e, 0xe8, 0x12, 0x1c, 0xe6, + 0xd0, 0x2a, 0x24, 0xde, 0x38, 0xc2, 0xcc, 0x36 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xec, 0x57, 0x9a, 0x21, + 0xd8, 0x63, 0xae, 0x15, 0x34, 0x8f, 0x42, 0xf9 }, + { 0x00, 0xb0, 0x60, 0xd0, 0xc0, 0x70, 0xa0, 0x10, + 0x80, 0x30, 0xe0, 0x50, 0x40, 0xf0, 0x20, 0x90 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfb, 0xf6, 0x0d, 0xec, 0x17, 0x1a, 0xe1, + 0xd8, 0x23, 0x2e, 0xd5, 0x34, 0xcf, 0xc2, 0x39 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40, + 0x00, 0xc0, 0x80, 0x40, 0x00, 0xc0, 0x80, 0x40 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfc, 0xf8, 0x04, 0xf0, 0x0c, 0x08, 0xf4, + 0xe0, 0x1c, 0x18, 0xe4, 0x10, 0xec, 0xe8, 0x14 }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xff, 0x44, 0x89, 0x32, 0x0e, 0xb5, 0x78, 0xc3 }, + { 0x00, 0xd0, 0xa0, 0x70, 0x40, 0x90, 0xe0, 0x30, + 0x80, 0x50, 0x20, 0xf0, 0xc0, 0x10, 0x60, 0xb0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfd, 0xfa, 0x07, 0xf4, 0x09, 0x0e, 0xf3, + 0xe8, 0x15, 0x12, 0xef, 0x1c, 0xe1, 0xe6, 0x1b }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0, + 0x00, 0xe0, 0xc0, 0x20, 0x80, 0x60, 0x40, 0xa0 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xfe, 0xfc, 0x02, 0xf8, 0x06, 0x04, 0xfa, + 0xf0, 0x0e, 0x0c, 0xf2, 0x08, 0xf6, 0xf4, 0x0a }, + { 0x00, 0xbb, 0x76, 0xcd, 0xf1, 0x4a, 0x87, 0x3c, + 0xe2, 0x59, 0x94, 0x2f, 0x13, 0xa8, 0x65, 0xde }, + { 0x00, 0xf0, 0xe0, 0x10, 0xc0, 0x30, 0x20, 0xd0, + 0x80, 0x70, 0x60, 0x90, 0x40, 0xb0, 0xa0, 0x50 }, + { 0x00, 0x00, 0x1d, 0x1d, 0x27, 0x27, 0x3a, 0x3a, + 0x53, 0x53, 0x4e, 0x4e, 0x74, 0x74, 0x69, 0x69 }, + { 0x00, 0xff, 0xfe, 0x01, 0xfc, 0x03, 0x02, 0xfd, + 0xf8, 0x07, 0x06, 0xf9, 0x04, 0xfb, 0xfa, 0x05 } +}; +/* END CSTYLED */ +#endif /* defined(HAVE_SSSE3) || defined(HAVE_AVX2) || defined(HAVE_AVX512BW) */ +#endif /* defined(__x86_64) */ diff --git a/module/zfs/vdev_rebuild.c b/module/zfs/vdev_rebuild.c new file mode 100644 index 000000000000..85ed8afe1cf4 --- /dev/null +++ b/module/zfs/vdev_rebuild.c @@ -0,0 +1,1108 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * + * Copyright (c) 2018, Intel Corporation. + * Copyright (c) 2020 by Lawrence Livermore National Security, LLC. + */ + +#include <sys/vdev_impl.h> +#include <sys/vdev_rebuild.h> +#include <sys/dsl_scan.h> +#include <sys/spa_impl.h> +#include <sys/metaslab_impl.h> +#include <sys/vdev_trim.h> +#include <sys/zio.h> +#include <sys/dmu_tx.h> +#include <sys/arc.h> + +/* + * This file contains the sequential reconstruction implementation for + * resilvering. This form of resilvering is internally referred to as device + * rebuild to avoid conflating it with the traditional healing reconstruction + * performed by the dsl scan code. + * + * When replacing a device, or scrubbing the pool, ZFS has historically used + * a process called resilvering which is a form of healing reconstruction. + * This approach has the advantage that as blocks are read from disk their + * checksums can be immediately verified and the data repaired. Unfortunately, + * it also results in a random IO pattern to the disk even when extra care + * is taken to sequentialize the IO as much as possible. This substantially + * increases the time required to resilver the pool and restore redundancy. + * + * For mirrored devices it's possible to implement an alternate sequential + * reconstruction strategy when resilvering. Sequential reconstruction + * behaves like a traditional RAID rebuild and reconstructs a device in LBA + * order without verifying the checksum. After this phase completes, a second + * scrub phase is started to verify all of the checksums. This two-phase + * process will take longer than the healing reconstruction described above. + * However, it has the advantage that after the first reconstruction phase + * completes, redundancy has been restored. At this point the pool can incur + * another device failure without risking data loss. + * + * There are a few noteworthy limitations and other advantages of resilvering + * using sequential reconstruction vs healing reconstruction. + * + * Limitations: + * + * - Only supported for mirror vdev types. Due to the variable stripe + * width used by raidz, sequential reconstruction is not possible. + * + * - Block checksums are not verified during sequential reconstruction. + * Similar to traditional RAID, the parity/mirror data is reconstructed + * but cannot be immediately double checked. For this reason, when the + * last active resilver completes the pool is automatically scrubbed. + * + * - Deferred resilvers using sequential reconstruction are not currently + * supported. When adding another vdev to an active top-level resilver + * it must be restarted. + * + * Advantages: + * + * - Sequential reconstruction is performed in LBA order, which may be faster + * than healing reconstruction, particularly when using HDDs (or + * especially with SMR devices). Only allocated capacity is resilvered. + * + * - Sequential reconstruction is not constrained by ZFS block boundaries. + * This allows it to issue larger IOs to disk which span multiple blocks, + * allowing all of these logical blocks to be repaired with a single IO. + * + * - Unlike a healing resilver or scrub, which are pool-wide operations, + * sequential reconstruction is handled by the top-level mirror vdevs. + * This allows it to be started or canceled on a top-level vdev + * without impacting any other top-level vdevs in the pool.
+ * + * - Data only referenced by a pool checkpoint will be repaired because + * that space is reflected in the space maps. This differs from a + * healing resilver or scrub, which will not repair that data. + */ + + +/* + * Maximum number of queued rebuild I/Os per top-level vdev. The number of + * concurrent rebuild I/Os issued to the device is controlled by the + * zfs_vdev_rebuild_min_active and zfs_vdev_rebuild_max_active module + * options. + */ +unsigned int zfs_rebuild_queue_limit = 20; + +/* + * Size of rebuild reads; defaults to 1MiB and is capped at SPA_MAXBLOCKSIZE. + */ +unsigned long zfs_rebuild_max_segment = 1024 * 1024; + +/* + * Forward declaration, used by vdev_rebuild_initiate_sync() and + * vdev_rebuild_reset_sync(). + */ +static void vdev_rebuild_thread(void *arg); + +/* + * Clear the per-vdev rebuild bytes value for a vdev tree. + */ +static void +clear_rebuild_bytes(vdev_t *vd) +{ + vdev_stat_t *vs = &vd->vdev_stat; + + for (uint64_t i = 0; i < vd->vdev_children; i++) + clear_rebuild_bytes(vd->vdev_child[i]); + + mutex_enter(&vd->vdev_stat_lock); + vs->vs_rebuild_processed = 0; + mutex_exit(&vd->vdev_stat_lock); +} + +/* + * Determines whether a vdev_rebuild_thread() should be stopped. + */ +static boolean_t +vdev_rebuild_should_stop(vdev_t *vd) +{ + return (!vdev_writeable(vd) || vd->vdev_removing || + vd->vdev_rebuild_exit_wanted || + vd->vdev_rebuild_cancel_wanted || + vd->vdev_rebuild_reset_wanted); +} + +/* + * Determine if the rebuild should be canceled. This may happen when all + * vdevs with MISSING DTLs are detached. + */ +static boolean_t +vdev_rebuild_should_cancel(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + if (!vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * The sync task for updating the on-disk state of a rebuild. This is + * scheduled by vdev_rebuild_range(). + */ +static void +vdev_rebuild_update_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t txg = dmu_tx_get_txg(tx); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (vr->vr_scan_offset[txg & TXG_MASK] > 0) { + vrp->vrp_last_offset = vr->vr_scan_offset[txg & TXG_MASK]; + vr->vr_scan_offset[txg & TXG_MASK] = 0; + } + + vrp->vrp_scan_time_ms = vr->vr_prev_scan_time_ms + + NSEC2MSEC(gethrtime() - vr->vr_pass_start_time); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Initialize the on-disk state for a new rebuild, and start the rebuild + * thread.
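+ *
+ * This executes in syncing context. For reference, a minimal sketch of how
+ * the task is dispatched from open context (the same pattern used by
+ * vdev_rebuild_initiate() below) is:
+ *
+ *	dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir);
+ *	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+ *	dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync,
+ *	    (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx);
+ *	dmu_tx_commit(tx);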
+ */ +static void +vdev_rebuild_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_rebuilding); + + spa_feature_incr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + mutex_enter(&vd->vdev_rebuild_lock); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + vrp->vrp_rebuild_state = VDEV_REBUILD_ACTIVE; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_start_time = gethrestime_sec(); + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* + * Rebuilds are currently only used when replacing a device, in which + * case there must be DTL_MISSING entries. In the future, we could + * allow rebuilds to be used in a way similar to a scrub. This would + * be useful because it would allow us to rebuild the space used by + * pool checkpoints. + */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu started", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_log_notify(spa_t *spa, vdev_t *vd, char *name) +{ + nvlist_t *aux = fnvlist_alloc(); + + fnvlist_add_string(aux, ZFS_EV_RESILVER_TYPE, "sequential"); + spa_event_notify(spa, vd, aux, name); + nvlist_free(aux); +} + +/* + * Called to request that a new rebuild be started. The feature will remain + * active for the duration of the rebuild, then revert to the enabled state. + */ +static void +vdev_rebuild_initiate(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(vd->vdev_top == vd); + ASSERT(MUTEX_HELD(&vd->vdev_rebuild_lock)); + ASSERT(!vd->vdev_rebuilding); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + vd->vdev_rebuilding = B_TRUE; + + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_rebuild_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_START); +} + +/* + * Update the on-disk state to completed when a rebuild finishes. 
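+ *
+ * As a rough sketch, every rebuild state transition (complete, canceled,
+ * reset) persists the updated vdev_rebuild_phys_t via the same ZAP update
+ * while holding the vdev_rebuild_lock; for the complete case:
+ *
+ *	mutex_enter(&vd->vdev_rebuild_lock);
+ *	vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE;
+ *	vrp->vrp_end_time = gethrestime_sec();
+ *	VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap,
+ *	    VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t),
+ *	    REBUILD_PHYS_ENTRIES, vrp, tx));
+ *	mutex_exit(&vd->vdev_rebuild_lock);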
+ */ +static void +vdev_rebuild_complete_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_COMPLETE; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + vdev_dtl_reassess(vd, tx->tx_txg, vrp->vrp_max_txg, B_TRUE, B_TRUE); + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu complete", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + /* Handles detaching of spares */ + spa_async_request(spa, SPA_ASYNC_REBUILD_DONE); + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Update the on-disk state to canceled when a rebuild finishes. + */ +static void +vdev_rebuild_cancel_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + vrp->vrp_rebuild_state = VDEV_REBUILD_CANCELED; + vrp->vrp_end_time = gethrestime_sec(); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_feature_decr(vd->vdev_spa, SPA_FEATURE_DEVICE_REBUILD, tx); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu canceled", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + vdev_rebuild_log_notify(spa, vd, ESC_ZFS_RESILVER_FINISH); + + vd->vdev_rebuild_cancel_wanted = B_FALSE; + vd->vdev_rebuilding = B_FALSE; + mutex_exit(&vd->vdev_rebuild_lock); + + spa_notify_waiters(spa); + cv_broadcast(&vd->vdev_rebuild_cv); +} + +/* + * Resets the progress of a running rebuild. This will occur when a new + * vdev is added to rebuild. 
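+ * The reset is requested from open context by setting
+ * vd->vdev_rebuild_reset_wanted (see vdev_rebuild() below); the rebuild
+ * thread notices the flag, stops issuing new ranges, and dispatches this
+ * sync task instead of canceling the rebuild outright.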
+ */ +static void +vdev_rebuild_reset_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + ASSERT3P(vd->vdev_rebuild_thread, ==, NULL); + + vrp->vrp_last_offset = 0; + vrp->vrp_min_txg = 0; + vrp->vrp_max_txg = dmu_tx_get_txg(tx); + vrp->vrp_bytes_scanned = 0; + vrp->vrp_bytes_issued = 0; + vrp->vrp_bytes_rebuilt = 0; + vrp->vrp_bytes_est = 0; + vrp->vrp_scan_time_ms = 0; + vr->vr_prev_scan_time_ms = 0; + + /* See vdev_rebuild_initiate_sync comment */ + VERIFY(vdev_resilver_needed(vd, &vrp->vrp_min_txg, &vrp->vrp_max_txg)); + + VERIFY0(zap_update(vd->vdev_spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + + spa_history_log_internal(spa, "rebuild", tx, + "vdev_id=%llu vdev_guid=%llu reset", + (u_longlong_t)vd->vdev_id, (u_longlong_t)vd->vdev_guid); + + vd->vdev_rebuild_reset_wanted = B_FALSE; + ASSERT(vd->vdev_rebuilding); + + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * Clear the last rebuild status. + */ +void +vdev_rebuild_clear_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + objset_t *mos = spa_meta_objset(spa); + + mutex_enter(&vd->vdev_rebuild_lock); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD) || + vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE) { + mutex_exit(&vd->vdev_rebuild_lock); + return; + } + + clear_rebuild_bytes(vd); + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + + if (vd->vdev_top_zap != 0 && zap_contains(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS) == 0) { + VERIFY0(zap_update(mos, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp, tx)); + } + + mutex_exit(&vd->vdev_rebuild_lock); +} + +/* + * The zio_done_func_t callback for each rebuild I/O issued. It's responsible + * for updating the rebuild stats and limiting the number of in-flight I/Os. + */ +static void +vdev_rebuild_cb(zio_t *zio) +{ + vdev_rebuild_t *vr = zio->io_private; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vdev_t *vd = vr->vr_top_vdev; + + mutex_enter(&vd->vdev_rebuild_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the top-level vdev was unavailable. + * Attempt to roll back to the last completed offset, in order + * to resume from the correct location if the pool is resumed. + * (This works because spa_sync waits on spa_txg_zio before + * it runs sync tasks.) + */ + uint64_t *off = &vr->vr_scan_offset[zio->io_txg & TXG_MASK]; + *off = MIN(*off, zio->io_offset); + } else if (zio->io_error) { + vrp->vrp_errors++; + } + + abd_free(zio->io_abd); + + ASSERT3U(vd->vdev_rebuild_inflight, >, 0); + vd->vdev_rebuild_inflight--; + cv_broadcast(&vd->vdev_rebuild_io_cv); + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * Rebuild the data in this range by constructing a special dummy block + * pointer for the given range.
It has no relation to any existing blocks + * in the pool. But by disabling checksum verification and issuing a scrub + * I/O, mirrored vdevs will replicate the block using any available mirror + * leaf vdevs. + */ +static void +vdev_rebuild_rebuild_block(vdev_rebuild_t *vr, uint64_t start, uint64_t asize, + uint64_t txg) +{ + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + uint64_t psize = asize; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + blkptr_t blk, *bp = &blk; + BP_ZERO(bp); + + DVA_SET_VDEV(&bp->blk_dva[0], vd->vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], start); + DVA_SET_GANG(&bp->blk_dva[0], 0); + DVA_SET_ASIZE(&bp->blk_dva[0], asize); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(bp, psize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + + /* + * We increment the issued bytes by the asize rather than the psize + * so the scanned and issued bytes may be directly compared. This + * is consistent with the scrub/resilver issued reporting. + */ + vr->vr_pass_bytes_issued += asize; + vr->vr_rebuild_phys.vrp_bytes_issued += asize; + + zio_nowait(zio_read(spa->spa_txg_zio[txg & TXG_MASK], spa, bp, + abd_alloc(psize, B_FALSE), psize, vdev_rebuild_cb, vr, + ZIO_PRIORITY_REBUILD, ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL | + ZIO_FLAG_RESILVER, NULL)); +} + +/* + * Issues a rebuild I/O and takes care of rate limiting the number of queued + * rebuild I/Os. The provided start and size must be properly aligned for the + * top-level vdev type being rebuilt. + */ +static int +vdev_rebuild_range(vdev_rebuild_t *vr, uint64_t start, uint64_t size) +{ + uint64_t ms_id __maybe_unused = vr->vr_scan_msp->ms_id; + vdev_t *vd = vr->vr_top_vdev; + spa_t *spa = vd->vdev_spa; + + ASSERT3U(ms_id, ==, start >> vd->vdev_ms_shift); + ASSERT3U(ms_id, ==, (start + size - 1) >> vd->vdev_ms_shift); + + vr->vr_pass_bytes_scanned += size; + vr->vr_rebuild_phys.vrp_bytes_scanned += size; + + mutex_enter(&vd->vdev_rebuild_io_lock); + + /* Limit in-flight rebuild I/Os */ + while (vd->vdev_rebuild_inflight >= zfs_rebuild_queue_limit) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + vd->vdev_rebuild_inflight++; + mutex_exit(&vd->vdev_rebuild_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + /* This is the first I/O for this txg. */ + if (vr->vr_scan_offset[txg & TXG_MASK] == 0) { + vr->vr_scan_offset[txg & TXG_MASK] = start; + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_rebuild_update_sync, + (void *)(uintptr_t)vd->vdev_id, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* When exiting, write out our progress.
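	 * If the rebuild thread was asked to stop, the scan offset saved
	 * above still reaches disk via vdev_rebuild_update_sync(), which
	 * allows a restarted rebuild to resume from the last synced
	 * location rather than from the beginning of the vdev.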
*/ + if (vdev_rebuild_should_stop(vd)) { + mutex_enter(&vd->vdev_rebuild_io_lock); + vd->vdev_rebuild_inflight--; + mutex_exit(&vd->vdev_rebuild_io_lock); + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); + mutex_exit(&vd->vdev_rebuild_lock); + dmu_tx_commit(tx); + return (SET_ERROR(EINTR)); + } + mutex_exit(&vd->vdev_rebuild_lock); + + vr->vr_scan_offset[txg & TXG_MASK] = start + size; + vdev_rebuild_rebuild_block(vr, start, size, txg); + + dmu_tx_commit(tx); + + return (0); +} + +/* + * Split range into legally-sized logical chunks given the constraints of the + * top-level mirror vdev type. + */ +static uint64_t +vdev_rebuild_chunk_size(vdev_t *vd, uint64_t start, uint64_t size) +{ + uint64_t chunk_size, max_asize, max_segment; + + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); + + max_segment = MIN(P2ROUNDUP(zfs_rebuild_max_segment, + 1 << vd->vdev_ashift), SPA_MAXBLOCKSIZE); + max_asize = vdev_psize_to_asize(vd, max_segment); + chunk_size = MIN(size, max_asize); + + return (chunk_size); +} + +/* + * Issues rebuild I/Os for all ranges in the provided vr->vr_scan_tree range + * tree. + */ +static int +vdev_rebuild_ranges(vdev_rebuild_t *vr) +{ + vdev_t *vd = vr->vr_top_vdev; + zfs_btree_t *t = &vr->vr_scan_tree->rt_root; + zfs_btree_index_t idx; + int error; + + for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL; + rs = zfs_btree_next(t, &idx, &idx)) { + uint64_t start = rs_get_start(rs, vr->vr_scan_tree); + uint64_t size = rs_get_end(rs, vr->vr_scan_tree) - start; + + /* + * zfs_scan_suspend_progress can be set to disable rebuild + * progress for testing. See comment in dsl_scan_sync(). + */ + while (zfs_scan_suspend_progress && + !vdev_rebuild_should_stop(vd)) { + delay(hz); + } + + while (size > 0) { + uint64_t chunk_size; + + chunk_size = vdev_rebuild_chunk_size(vd, start, size); + + error = vdev_rebuild_range(vr, start, chunk_size); + if (error != 0) + return (error); + + size -= chunk_size; + start += chunk_size; + } + } + + return (0); +} + +/* + * Calculates the estimated capacity which remains to be scanned. Since + * we traverse the pool in metaslab order, only allocated capacity beyond + * the vrp_last_offset need be considered. All lower offsets must have + * already been rebuilt and are thus already included in vrp_bytes_scanned. + */ +static void +vdev_rebuild_update_bytes_est(vdev_t *vd, uint64_t ms_id) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + uint64_t bytes_est = vrp->vrp_bytes_scanned; + + if (vrp->vrp_last_offset < vd->vdev_ms[ms_id]->ms_start) + return; + + for (uint64_t i = ms_id; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + + mutex_enter(&msp->ms_lock); + bytes_est += metaslab_allocated_space(msp); + mutex_exit(&msp->ms_lock); + } + + vrp->vrp_bytes_est = bytes_est; +} + +/* + * Load from disk the top-level vdev's rebuild information.
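+ *
+ * A hypothetical caller loading the state of every top-level vdev at
+ * import time might look roughly like the following (sketch only; the
+ * rvd naming is illustrative and error handling is elided):
+ *
+ *	vdev_t *rvd = spa->spa_root_vdev;
+ *	for (uint64_t i = 0; i < rvd->vdev_children; i++)
+ *		(void) vdev_rebuild_load(rvd->vdev_child[i]);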
+ */ +int +vdev_rebuild_load(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + spa_t *spa = vd->vdev_spa; + int err = 0; + + mutex_enter(&vd->vdev_rebuild_lock); + vd->vdev_rebuilding = B_FALSE; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + mutex_exit(&vd->vdev_rebuild_lock); + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(vd->vdev_top == vd); + + err = zap_lookup(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_VDEV_REBUILD_PHYS, sizeof (uint64_t), + REBUILD_PHYS_ENTRIES, vrp); + + /* + * A missing or damaged VDEV_TOP_ZAP_VDEV_REBUILD_PHYS should + * not prevent a pool from being imported. Clear the rebuild + * status, allowing a new resilver/rebuild to be started. + */ + if (err == ENOENT || err == EOVERFLOW || err == ECKSUM) { + bzero(vrp, sizeof (uint64_t) * REBUILD_PHYS_ENTRIES); + } else if (err) { + mutex_exit(&vd->vdev_rebuild_lock); + return (err); + } + + vr->vr_prev_scan_time_ms = vrp->vrp_scan_time_ms; + vr->vr_top_vdev = vd; + + mutex_exit(&vd->vdev_rebuild_lock); + + return (0); +} + +/* + * Each scan thread is responsible for rebuilding a top-level vdev. The + * rebuild progress is tracked on-disk in VDEV_TOP_ZAP_VDEV_REBUILD_PHYS. + */ +static void +vdev_rebuild_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + /* + * If there's a scrub in process, request that it be stopped. This + * is not required for a correct rebuild, but we do want rebuilds to + * emulate the resilver behavior as much as possible. + */ + dsl_pool_t *dsl = spa_get_dsl(spa); + if (dsl_scan_scrubbing(dsl)) + dsl_scan_cancel(dsl); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + mutex_enter(&vd->vdev_rebuild_lock); + + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(vd->vdev_rebuild_thread, !=, NULL); + ASSERT(vd->vdev_rebuilding); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_DEVICE_REBUILD)); + ASSERT3B(vd->vdev_rebuild_cancel_wanted, ==, B_FALSE); + ASSERT3B(vd->vdev_rebuild_reset_wanted, ==, B_FALSE); + + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + vr->vr_top_vdev = vd; + vr->vr_scan_msp = NULL; + vr->vr_scan_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + vr->vr_pass_start_time = gethrtime(); + vr->vr_pass_bytes_scanned = 0; + vr->vr_pass_bytes_issued = 0; + + uint64_t update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, 0); + + clear_rebuild_bytes(vr->vr_top_vdev); + + mutex_exit(&vd->vdev_rebuild_lock); + + /* + * Systematically walk the metaslabs and issue rebuild I/Os for + * all ranges in the allocated space map. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + vr->vr_scan_msp = msp; + + /* + * Removal of vdevs from the vdev tree may eliminate the need + * for the rebuild, in which case it should be canceled. The + * vdev_rebuild_cancel_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (vdev_rebuild_should_cancel(vd)) { + vd->vdev_rebuild_cancel_wanted = B_TRUE; + error = EINTR; + break; + } + + ASSERT0(range_tree_space(vr->vr_scan_tree)); + + /* + * Disable any new allocations to this metaslab and wait + * for any in-flight writes to complete. This is needed to + * ensure all allocated ranges are rebuilt.
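		 * Once the ranges for this metaslab have been issued, the
		 * metaslab is re-enabled via metaslab_enable() below.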
+ */ + metaslab_disable(msp); + spa_config_exit(spa, SCL_CONFIG, FTAG); + txg_wait_synced(dsl, 0); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * When a metaslab has been allocated from, read its allocated + * ranges from the space map object into the vr_scan_tree. + * Then add inflight / unflushed ranges and remove inflight / + * unflushed frees. This is the minimum range to be rebuilt. + */ + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + vr->vr_scan_tree, SM_ALLOC)); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space( + msp->ms_allocating[i])); + } + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, vr->vr_scan_tree); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, vr->vr_scan_tree); + + /* + * Remove ranges which have already been rebuilt based + * on the last offset. This can happen when restarting + * a scan after exporting and re-importing the pool. + */ + range_tree_clear(vr->vr_scan_tree, 0, + vrp->vrp_last_offset); + } + + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + /* + * To provide an accurate estimate, re-calculate the estimated + * size every 5 minutes to account for recent allocations and + * frees made to space maps which have not yet been rebuilt. + */ + if (gethrtime() > update_est_time + SEC2NSEC(300)) { + update_est_time = gethrtime(); + vdev_rebuild_update_bytes_est(vd, i); + } + + /* + * Walk the allocated space map and issue the rebuild I/O. + */ + error = vdev_rebuild_ranges(vr); + range_tree_vacate(vr->vr_scan_tree, NULL, NULL); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + metaslab_enable(msp, B_FALSE, B_FALSE); + + if (error != 0) + break; + } + + range_tree_destroy(vr->vr_scan_tree); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* Wait for any remaining rebuild I/O to complete */ + mutex_enter(&vd->vdev_rebuild_io_lock); + while (vd->vdev_rebuild_inflight > 0) + cv_wait(&vd->vdev_rebuild_io_cv, &vd->vdev_rebuild_io_lock); + + mutex_exit(&vd->vdev_rebuild_io_lock); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + dsl_pool_t *dp = spa_get_dsl(spa); + dmu_tx_t *tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (error == 0) { + /* + * After a successful rebuild clear the DTLs of all ranges + * which were missing when the rebuild was started. These + * ranges must have been rebuilt as a consequence of rebuilding + * all allocated space. Note that unlike a scrub or resilver + * the rebuild operation will reconstruct data only referenced + * by a pool checkpoint. See the dsl_scan_done() comments. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_complete_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_cancel_wanted) { + /* + * The rebuild operation was canceled. This will occur when + * a device participating in the rebuild is detached. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_cancel_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else if (vd->vdev_rebuild_reset_wanted) { + /* + * Reset the running rebuild without canceling and restarting + * it. This will occur when a new device is attached and must + * participate in the rebuild. + */ + dsl_sync_task_nowait(dp, vdev_rebuild_reset_sync, + (void *)(uintptr_t)vd->vdev_id, 0, + ZFS_SPACE_CHECK_NONE, tx); + } else { + /* + * The rebuild operation should be suspended.
This may occur + * when detaching a child vdev or when exporting the pool. The + * rebuild is left in the active state so it will be resumed. + */ + ASSERT(vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + vd->vdev_rebuilding = B_FALSE; + } + + dmu_tx_commit(tx); + + vd->vdev_rebuild_thread = NULL; + mutex_exit(&vd->vdev_rebuild_lock); + spa_config_exit(spa, SCL_CONFIG, FTAG); + + cv_broadcast(&vd->vdev_rebuild_cv); + + thread_exit(); +} + +/* + * Returns B_TRUE if any top-level vdevs are rebuilding. + */ +boolean_t +vdev_rebuild_active(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + boolean_t ret = B_FALSE; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) { + ret = vdev_rebuild_active(vd->vdev_child[i]); + if (ret) + return (ret); + } + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + ret = (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE); + mutex_exit(&vd->vdev_rebuild_lock); + } + + return (ret); +} + +/* + * Start a rebuild operation. The rebuild may be restarted when the + * top-level vdev is currently actively rebuilding. + */ +void +vdev_rebuild(vdev_t *vd) +{ + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp __maybe_unused = &vr->vr_rebuild_phys; + + ASSERT(vd->vdev_top == vd); + ASSERT(vdev_is_concrete(vd)); + ASSERT(!vd->vdev_removing); + ASSERT(spa_feature_is_enabled(vd->vdev_spa, + SPA_FEATURE_DEVICE_REBUILD)); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuilding) { + ASSERT3U(vrp->vrp_rebuild_state, ==, VDEV_REBUILD_ACTIVE); + + /* + * Signal a running rebuild operation that it should restart + * from the beginning because a new device was attached. The + * vdev_rebuild_reset_wanted flag is set until the sync task + * completes. This may be after the rebuild thread exits. + */ + if (!vd->vdev_rebuild_reset_wanted) + vd->vdev_rebuild_reset_wanted = B_TRUE; + } else { + vdev_rebuild_initiate(vd); + } + mutex_exit(&vd->vdev_rebuild_lock); +} + +static void +vdev_rebuild_restart_impl(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_restart_impl(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + vdev_rebuild_t *vr = &vd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&vd->vdev_rebuild_lock); + if (vrp->vrp_rebuild_state == VDEV_REBUILD_ACTIVE && + vdev_writeable(vd) && !vd->vdev_rebuilding) { + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_DEVICE_REBUILD)); + vd->vdev_rebuilding = B_TRUE; + vd->vdev_rebuild_thread = thread_create(NULL, 0, + vdev_rebuild_thread, vd, 0, &p0, TS_RUN, + maxclsyspri); + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Conditionally restart all of the vdev_rebuild_thread's for a pool. The + * feature flag must be active and the rebuild in the active state. This + * cannot be used to start a new rebuild. + */ +void +vdev_rebuild_restart(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + vdev_rebuild_restart_impl(spa->spa_root_vdev); +} + +/* + * Stop and wait for all of the vdev_rebuild_thread's associated with the + * provided vdev tree to be terminated (canceled or stopped).
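+ *
+ * Callers must hold the spa_namespace_lock (asserted below); a minimal
+ * usage sketch:
+ *
+ *	mutex_enter(&spa_namespace_lock);
+ *	vdev_rebuild_stop_wait(spa->spa_root_vdev);
+ *	mutex_exit(&spa_namespace_lock);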
+ */ +void +vdev_rebuild_stop_wait(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (vd == spa->spa_root_vdev) { + for (uint64_t i = 0; i < vd->vdev_children; i++) + vdev_rebuild_stop_wait(vd->vdev_child[i]); + + } else if (vd->vdev_top_zap != 0) { + ASSERT(vd == vd->vdev_top); + + mutex_enter(&vd->vdev_rebuild_lock); + if (vd->vdev_rebuild_thread != NULL) { + vd->vdev_rebuild_exit_wanted = B_TRUE; + while (vd->vdev_rebuilding) { + cv_wait(&vd->vdev_rebuild_cv, + &vd->vdev_rebuild_lock); + } + vd->vdev_rebuild_exit_wanted = B_FALSE; + } + mutex_exit(&vd->vdev_rebuild_lock); + } +} + +/* + * Stop all rebuild operations but leave them in the active state so they + * will be resumed when importing the pool. + */ +void +vdev_rebuild_stop_all(spa_t *spa) +{ + vdev_rebuild_stop_wait(spa->spa_root_vdev); +} + +/* + * Rebuild statistics reported per top-level vdev. + */ +int +vdev_rebuild_get_stats(vdev_t *tvd, vdev_rebuild_stat_t *vrs) +{ + spa_t *spa = tvd->vdev_spa; + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REBUILD)) + return (SET_ERROR(ENOTSUP)); + + if (tvd != tvd->vdev_top || tvd->vdev_top_zap == 0) + return (SET_ERROR(EINVAL)); + + int error = zap_contains(spa_meta_objset(spa), + tvd->vdev_top_zap, VDEV_TOP_ZAP_VDEV_REBUILD_PHYS); + + if (error == ENOENT) { + bzero(vrs, sizeof (vdev_rebuild_stat_t)); + vrs->vrs_state = VDEV_REBUILD_NONE; + error = 0; + } else if (error == 0) { + vdev_rebuild_t *vr = &tvd->vdev_rebuild_config; + vdev_rebuild_phys_t *vrp = &vr->vr_rebuild_phys; + + mutex_enter(&tvd->vdev_rebuild_lock); + vrs->vrs_state = vrp->vrp_rebuild_state; + vrs->vrs_start_time = vrp->vrp_start_time; + vrs->vrs_end_time = vrp->vrp_end_time; + vrs->vrs_scan_time_ms = vrp->vrp_scan_time_ms; + vrs->vrs_bytes_scanned = vrp->vrp_bytes_scanned; + vrs->vrs_bytes_issued = vrp->vrp_bytes_issued; + vrs->vrs_bytes_rebuilt = vrp->vrp_bytes_rebuilt; + vrs->vrs_bytes_est = vrp->vrp_bytes_est; + vrs->vrs_errors = vrp->vrp_errors; + vrs->vrs_pass_time_ms = NSEC2MSEC(gethrtime() - + vr->vr_pass_start_time); + vrs->vrs_pass_bytes_scanned = vr->vr_pass_bytes_scanned; + vrs->vrs_pass_bytes_issued = vr->vr_pass_bytes_issued; + mutex_exit(&tvd->vdev_rebuild_lock); + } + + return (error); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, rebuild_max_segment, ULONG, ZMOD_RW, + "Max segment size in bytes of rebuild reads"); +/* END CSTYLED */ diff --git a/module/zfs/vdev_removal.c b/module/zfs/vdev_removal.c new file mode 100644 index 000000000000..56e420871f61 --- /dev/null +++ b/module/zfs/vdev_removal.c @@ -0,0 +1,2340 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. 
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved. + * Copyright (c) 2019, loli10K <ezomori.nozomu@gmail.com>. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa_impl.h> +#include <sys/dmu.h> +#include <sys/dmu_tx.h> +#include <sys/zap.h> +#include <sys/vdev_impl.h> +#include <sys/metaslab.h> +#include <sys/metaslab_impl.h> +#include <sys/uberblock_impl.h> +#include <sys/txg.h> +#include <sys/avl.h> +#include <sys/bpobj.h> +#include <sys/dsl_pool.h> +#include <sys/dsl_synctask.h> +#include <sys/dsl_dir.h> +#include <sys/arc.h> +#include <sys/zfeature.h> +#include <sys/vdev_indirect_births.h> +#include <sys/vdev_indirect_mapping.h> +#include <sys/abd.h> +#include <sys/vdev_initialize.h> +#include <sys/vdev_trim.h> +#include <sys/trace_zfs.h> + +/* + * This file contains the necessary logic to remove vdevs from a + * storage pool. Currently, the only devices that can be removed + * are log, cache, and spare devices; and top level vdevs from a pool + * w/o raidz or mirrors. (Note that members of a mirror can be removed + * by the detach operation.) + * + * Log vdevs are removed by evacuating them and then turning the vdev + * into a hole vdev while holding spa config locks. + * + * Top level vdevs are removed and converted into an indirect vdev via + * a multi-step process: + * + * - Disable allocations from this device (spa_vdev_remove_top). + * + * - From a new thread (spa_vdev_remove_thread), copy data from + * the removing vdev to a different vdev. The copy happens in open + * context (spa_vdev_copy_impl) and issues a sync task + * (vdev_mapping_sync) so the sync thread can update the partial + * indirect mappings in core and on disk. + * + * - If a free happens during a removal, it is freed from the + * removing vdev, and if it has already been copied, from the new + * location as well (free_from_removing_vdev). + * + * - After the removal is completed, the copy thread converts the vdev + * into an indirect vdev (vdev_remove_complete) before instructing + * the sync thread to destroy the space maps and finish the removal + * (spa_finish_removal). + */ + +typedef struct vdev_copy_arg { + metaslab_t *vca_msp; + uint64_t vca_outstanding_bytes; + uint64_t vca_read_error_bytes; + uint64_t vca_write_error_bytes; + kcondvar_t vca_cv; + kmutex_t vca_lock; +} vdev_copy_arg_t; + +/* + * The maximum amount of memory we can use for outstanding i/o while + * doing a device removal. This determines how much i/o we can have + * in flight concurrently. + */ +int zfs_remove_max_copy_bytes = 64 * 1024 * 1024; + +/* + * The largest contiguous segment that we will attempt to allocate when + * removing a device. This can be no larger than SPA_MAXBLOCKSIZE. If + * there is a performance problem with attempting to allocate large blocks, + * consider decreasing this. + * + * See also the accessor function spa_remove_max_segment(). + */ +int zfs_remove_max_segment = SPA_MAXBLOCKSIZE; + +/* + * Ignore hard IO errors during device removal. When set, if a device + * encounters a hard IO error during the removal process, the removal will + * not be cancelled. This can result in a normally recoverable block + * becoming permanently damaged and is not recommended. + */ +int zfs_removal_ignore_errors = 0; + +/* + * Allow a remap segment to span free chunks of at most this size. The main + * impact of a larger span is that we will read and write larger, more + * contiguous chunks, with more "unnecessary" data -- trading off bandwidth + * for iops. The value here was chosen to align with + * zfs_vdev_read_gap_limit, which is a similar concept when doing regular + * reads (but there's no reason it has to be the same).
+ * + * Additionally, a higher span will have the following relatively minor + * effects: + * - the mapping will be smaller, since one entry can cover more allocated + * segments + * - more of the fragmentation in the removing device will be preserved + * - we'll do larger allocations, which may fail and fall back on smaller + * allocations + */ +int vdev_removal_max_span = 32 * 1024; + +/* + * This is used by the test suite so that it can ensure that certain + * actions happen while in the middle of a removal. + */ +int zfs_removal_suspend_progress = 0; + +#define VDEV_REMOVAL_ZAP_OBJS "lzap" + +static void spa_vdev_remove_thread(void *arg); +static int spa_vdev_remove_cancel_impl(spa_t *spa); + +static void +spa_sync_removing_state(spa_t *spa, dmu_tx_t *tx) +{ + VERIFY0(zap_update(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys, tx)); +} + +static nvlist_t * +spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid) +{ + for (int i = 0; i < count; i++) { + uint64_t guid = + fnvlist_lookup_uint64(nvpp[i], ZPOOL_CONFIG_GUID); + + if (guid == target_guid) + return (nvpp[i]); + } + + return (NULL); +} + +static void +spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, + nvlist_t *dev_to_remove) +{ + nvlist_t **newdev = NULL; + + if (count > 1) + newdev = kmem_alloc((count - 1) * sizeof (void *), KM_SLEEP); + + for (int i = 0, j = 0; i < count; i++) { + if (dev[i] == dev_to_remove) + continue; + VERIFY(nvlist_dup(dev[i], &newdev[j++], KM_SLEEP) == 0); + } + + VERIFY(nvlist_remove(config, name, DATA_TYPE_NVLIST_ARRAY) == 0); + VERIFY(nvlist_add_nvlist_array(config, name, newdev, count - 1) == 0); + + for (int i = 0; i < count - 1; i++) + nvlist_free(newdev[i]); + + if (count > 1) + kmem_free(newdev, (count - 1) * sizeof (void *)); +} + +static spa_vdev_removal_t * +spa_vdev_removal_create(vdev_t *vd) +{ + spa_vdev_removal_t *svr = kmem_zalloc(sizeof (*svr), KM_SLEEP); + mutex_init(&svr->svr_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&svr->svr_cv, NULL, CV_DEFAULT, NULL); + svr->svr_allocd_segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + svr->svr_vdev_id = vd->vdev_id; + + for (int i = 0; i < TXG_SIZE; i++) { + svr->svr_frees[i] = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); + list_create(&svr->svr_new_segments[i], + sizeof (vdev_indirect_mapping_entry_t), + offsetof(vdev_indirect_mapping_entry_t, vime_node)); + } + + return (svr); +} + +void +spa_vdev_removal_destroy(spa_vdev_removal_t *svr) +{ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + ASSERT0(svr->svr_max_offset_to_sync[i]); + range_tree_destroy(svr->svr_frees[i]); + list_destroy(&svr->svr_new_segments[i]); + } + + range_tree_destroy(svr->svr_allocd_segs); + mutex_destroy(&svr->svr_lock); + cv_destroy(&svr->svr_cv); + kmem_free(svr, sizeof (*svr)); +} + +/* + * This is called as a synctask in the txg in which we will mark this vdev + * as removing (in the config stored in the MOS). 
+ * + * It begins the evacuation of a toplevel vdev by: + * - initializing the spa_removing_phys which tracks this removal + * - computing the amount of space to remove for accounting purposes + * - dirtying all dbufs in the spa_config_object + * - creating the spa_vdev_removal + * - starting the spa_vdev_remove_thread + */ +static void +vdev_remove_initiate_sync(void *arg, dmu_tx_t *tx) +{ + int vdev_id = (uintptr_t)arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + objset_t *mos = spa->spa_dsl_pool->dp_meta_objset; + spa_vdev_removal_t *svr = NULL; + uint64_t txg __maybe_unused = dmu_tx_get_txg(tx); + + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + svr = spa_vdev_removal_create(vd); + + ASSERT(vd->vdev_removing); + ASSERT3P(vd->vdev_indirect_mapping, ==, NULL); + + spa_feature_incr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + /* + * By activating the OBSOLETE_COUNTS feature, we prevent + * the pool from being downgraded and ensure that the + * refcounts are precise. + */ + spa_feature_incr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + uint64_t one = 1; + VERIFY0(zap_add(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, sizeof (one), 1, + &one, tx)); + boolean_t are_precise __maybe_unused; + ASSERT0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + ASSERT3B(are_precise, ==, B_TRUE); + } + + vic->vic_mapping_object = vdev_indirect_mapping_alloc(mos, tx); + vd->vdev_indirect_mapping = + vdev_indirect_mapping_open(mos, vic->vic_mapping_object); + vic->vic_births_object = vdev_indirect_births_alloc(mos, tx); + vd->vdev_indirect_births = + vdev_indirect_births_open(mos, vic->vic_births_object); + spa->spa_removing_phys.sr_removing_vdev = vd->vdev_id; + spa->spa_removing_phys.sr_start_time = gethrestime_sec(); + spa->spa_removing_phys.sr_end_time = 0; + spa->spa_removing_phys.sr_state = DSS_SCANNING; + spa->spa_removing_phys.sr_to_copy = 0; + spa->spa_removing_phys.sr_copied = 0; + + /* + * Note: We can't use vdev_stat's vs_alloc for sr_to_copy, because + * there may be space in the defer tree, which is free, but still + * counted in vs_alloc. + */ + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *ms = vd->vdev_ms[i]; + if (ms->ms_sm == NULL) + continue; + + spa->spa_removing_phys.sr_to_copy += + metaslab_allocated_space(ms); + + /* + * Space which we are freeing this txg does not need to + * be copied. + */ + spa->spa_removing_phys.sr_to_copy -= + range_tree_space(ms->ms_freeing); + + ASSERT0(range_tree_space(ms->ms_freed)); + for (int t = 0; t < TXG_SIZE; t++) + ASSERT0(range_tree_space(ms->ms_allocating[t])); + } + + /* + * Sync tasks are called before metaslab_sync(), so there should + * be no already-synced metaslabs in the TXG_CLEAN list. + */ + ASSERT3P(txg_list_head(&vd->vdev_ms_list, TXG_CLEAN(txg)), ==, NULL); + + spa_sync_removing_state(spa, tx); + + /* + * All blocks that we need to read the most recent mapping must be + * stored on concrete vdevs. Therefore, we must dirty anything that + * is read before spa_remove_init(). Specifically, the + * spa_config_object. (Note that although we already modified the + * spa_config_object in spa_sync_removing_state, that may not have + * modified all blocks of the object.) 
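+ *
+ * (Dirtying these dbufs forces each block of the object to be
+ * rewritten in this txg, and since allocations from the removing vdev
+ * are already disabled, the rewritten blocks necessarily land on
+ * concrete vdevs.)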
+ */ + dmu_object_info_t doi; + VERIFY0(dmu_object_info(mos, DMU_POOL_DIRECTORY_OBJECT, &doi)); + for (uint64_t offset = 0; offset < doi.doi_max_offset; ) { + dmu_buf_t *dbuf; + VERIFY0(dmu_buf_hold(mos, DMU_POOL_DIRECTORY_OBJECT, + offset, FTAG, &dbuf, 0)); + dmu_buf_will_dirty(dbuf, tx); + offset += dbuf->db_size; + dmu_buf_rele(dbuf, FTAG); + } + + /* + * Now that we've allocated the im_object, dirty the vdev to ensure + * that the object gets written to the config on disk. + */ + vdev_config_dirty(vd); + + zfs_dbgmsg("starting removal thread for vdev %llu (%px) in txg %llu " + "im_obj=%llu", vd->vdev_id, vd, dmu_tx_get_txg(tx), + vic->vic_mapping_object); + + spa_history_log_internal(spa, "vdev remove started", tx, + "%s vdev %llu %s", spa_name(spa), (u_longlong_t)vd->vdev_id, + (vd->vdev_path != NULL) ? vd->vdev_path : "-"); + /* + * Setting spa_vdev_removal causes subsequent frees to call + * free_from_removing_vdev(). Note that we don't need any locking + * because we are the sync thread, and metaslab_free_impl() is only + * called from syncing context (potentially from a zio taskq thread, + * but in any case only when there are outstanding free i/os, which + * there are not). + */ + ASSERT3P(spa->spa_vdev_removal, ==, NULL); + spa->spa_vdev_removal = svr; + svr->svr_thread = thread_create(NULL, 0, + spa_vdev_remove_thread, spa, 0, &p0, TS_RUN, minclsyspri); +} + +/* + * When we are opening a pool, we must read the mapping for each + * indirect vdev in order from most recently removed to least + * recently removed. We do this because the blocks for the mapping + * of older indirect vdevs may be stored on more recently removed vdevs. + * In order to read each indirect mapping object, we must have + * initialized all more recently removed vdevs. + */ +int +spa_remove_init(spa_t *spa) +{ + int error; + + error = zap_lookup(spa->spa_dsl_pool->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_REMOVING, sizeof (uint64_t), + sizeof (spa->spa_removing_phys) / sizeof (uint64_t), + &spa->spa_removing_phys); + + if (error == ENOENT) { + spa->spa_removing_phys.sr_state = DSS_NONE; + spa->spa_removing_phys.sr_removing_vdev = -1; + spa->spa_removing_phys.sr_prev_indirect_vdev = -1; + spa->spa_indirect_vdevs_loaded = B_TRUE; + return (0); + } else if (error != 0) { + return (error); + } + + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) { + /* + * We are currently removing a vdev. Create and + * initialize a spa_vdev_removal_t from the bonus + * buffer of the removing vdevs vdev_im_object, and + * initialize its partial mapping. 
+ */ + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, + spa->spa_removing_phys.sr_removing_vdev); + + if (vd == NULL) { + spa_config_exit(spa, SCL_STATE, FTAG); + return (EINVAL); + } + + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT(vdev_is_concrete(vd)); + spa_vdev_removal_t *svr = spa_vdev_removal_create(vd); + ASSERT3U(svr->svr_vdev_id, ==, vd->vdev_id); + ASSERT(vd->vdev_removing); + + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + spa_config_exit(spa, SCL_STATE, FTAG); + + spa->spa_vdev_removal = svr; + } + + spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != UINT64_MAX) { + vdev_t *vd = vdev_lookup_top(spa, indirect_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + vd->vdev_indirect_mapping = vdev_indirect_mapping_open( + spa->spa_meta_objset, vic->vic_mapping_object); + vd->vdev_indirect_births = vdev_indirect_births_open( + spa->spa_meta_objset, vic->vic_births_object); + + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + spa_config_exit(spa, SCL_STATE, FTAG); + + /* + * Now that we've loaded all the indirect mappings, we can allow + * reads from other blocks (e.g. via predictive prefetch). + */ + spa->spa_indirect_vdevs_loaded = B_TRUE; + return (0); +} + +void +spa_restart_removal(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + /* + * In general when this function is called there is no + * removal thread running. The only scenario where this + * is not true is during spa_import() where this function + * is called twice [once from spa_import_impl() and + * spa_async_resume()]. Thus, in the scenario where we + * import a pool that has an ongoing removal we don't + * want to spawn a second thread. + */ + if (svr->svr_thread != NULL) + return; + + if (!spa_writeable(spa)) + return; + + zfs_dbgmsg("restarting removal of %llu", svr->svr_vdev_id); + svr->svr_thread = thread_create(NULL, 0, spa_vdev_remove_thread, spa, + 0, &p0, TS_RUN, minclsyspri); +} + +/* + * Process freeing from a device which is in the middle of being removed. + * We must handle this carefully so that we attempt to copy freed data, + * and we correctly free already-copied data. + */ +void +free_from_removing_vdev(vdev_t *vd, uint64_t offset, uint64_t size) +{ + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t txg = spa_syncing_txg(spa); + uint64_t max_offset_yet = 0; + + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT3U(vd->vdev_indirect_config.vic_mapping_object, ==, + vdev_indirect_mapping_object(vim)); + ASSERT3U(vd->vdev_id, ==, svr->svr_vdev_id); + + mutex_enter(&svr->svr_lock); + + /* + * Remove the segment from the removing vdev's spacemap. This + * ensures that we will not attempt to copy this space (if the + * removal thread has not yet visited it), and also ensures + * that we know what is actually allocated on the new vdevs + * (needed if we cancel the removal). 
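+ *
+ * A single free may span all three of the cases handled below: a
+ * prefix whose mapping has already been synced (freed from the new
+ * location), a middle portion whose copy is still in flight (recorded
+ * in svr_frees and freed once the mapping syncs), and a tail that the
+ * copy thread has not yet visited (simply cleared from
+ * svr_allocd_segs).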
+ * + * Note: we must do the metaslab_free_concrete() with the svr_lock + * held, so that the remove_thread can not load this metaslab and then + * visit this offset between the time that we metaslab_free_concrete() + * and when we check to see if it has been visited. + * + * Note: The checkpoint flag is set to false as having/taking + * a checkpoint and removing a device can't happen at the same + * time. + */ + ASSERT(!spa_has_checkpoint(spa)); + metaslab_free_concrete(vd, offset, size, B_FALSE); + + uint64_t synced_size = 0; + uint64_t synced_offset = 0; + uint64_t max_offset_synced = vdev_indirect_mapping_max_offset(vim); + if (offset < max_offset_synced) { + /* + * The mapping for this offset is already on disk. + * Free from the new location. + * + * Note that we use svr_max_synced_offset because it is + * updated atomically with respect to the in-core mapping. + * By contrast, vim_max_offset is not. + * + * This block may be split between a synced entry and an + * in-flight or unvisited entry. Only process the synced + * portion of it here. + */ + synced_size = MIN(size, max_offset_synced - offset); + synced_offset = offset; + + ASSERT3U(max_offset_yet, <=, max_offset_synced); + max_offset_yet = max_offset_synced; + + DTRACE_PROBE3(remove__free__synced, + spa_t *, spa, + uint64_t, offset, + uint64_t, synced_size); + + size -= synced_size; + offset += synced_size; + } + + /* + * Look at all in-flight txgs starting from the currently syncing one + * and see if a section of this free is being copied. By starting from + * this txg and iterating forward, we might find that this region + * was copied in two different txgs and handle it appropriately. + */ + for (int i = 0; i < TXG_CONCURRENT_STATES; i++) { + int txgoff = (txg + i) & TXG_MASK; + if (size > 0 && offset < svr->svr_max_offset_to_sync[txgoff]) { + /* + * The mapping for this offset is in flight, and + * will be synced in txg+i. + */ + uint64_t inflight_size = MIN(size, + svr->svr_max_offset_to_sync[txgoff] - offset); + + DTRACE_PROBE4(remove__free__inflight, + spa_t *, spa, + uint64_t, offset, + uint64_t, inflight_size, + uint64_t, txg + i); + + /* + * We copy data in order of increasing offset. + * Therefore the max_offset_to_sync[] must increase + * (or be zero, indicating that nothing is being + * copied in that txg). + */ + if (svr->svr_max_offset_to_sync[txgoff] != 0) { + ASSERT3U(svr->svr_max_offset_to_sync[txgoff], + >=, max_offset_yet); + max_offset_yet = + svr->svr_max_offset_to_sync[txgoff]; + } + + /* + * We've already committed to copying this segment: + * we have allocated space elsewhere in the pool for + * it and have an IO outstanding to copy the data. We + * cannot free the space before the copy has + * completed, or else the copy IO might overwrite any + * new data. To free that space, we record the + * segment in the appropriate svr_frees tree and free + * the mapped space later, in the txg where we have + * completed the copy and synced the mapping (see + * vdev_mapping_sync). + */ + range_tree_add(svr->svr_frees[txgoff], + offset, inflight_size); + size -= inflight_size; + offset += inflight_size; + + /* + * This space is already accounted for as being + * done, because it is being copied in txg+i. + * However, if i!=0, then it is being copied in + * a future txg. If we crash after this txg + * syncs but before txg+i syncs, then the space + * will be free. Therefore we must account + * for the space being done in *this* txg + * (when it is freed) rather than the future txg + * (when it will be copied). 
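+ *
+ * (Concretely, with TXG_SIZE == 4: a free observed while txg 100 is
+ * syncing, for a segment whose copy will sync in txg 102, moves
+ * inflight_size from slot 102 & TXG_MASK to slot 100 & TXG_MASK.)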
+ */ + ASSERT3U(svr->svr_bytes_done[txgoff], >=, + inflight_size); + svr->svr_bytes_done[txgoff] -= inflight_size; + svr->svr_bytes_done[txg & TXG_MASK] += inflight_size; + } + } + ASSERT0(svr->svr_max_offset_to_sync[TXG_CLEAN(txg) & TXG_MASK]); + + if (size > 0) { + /* + * The copy thread has not yet visited this offset. Ensure + * that it doesn't. + */ + + DTRACE_PROBE3(remove__free__unvisited, + spa_t *, spa, + uint64_t, offset, + uint64_t, size); + + if (svr->svr_allocd_segs != NULL) + range_tree_clear(svr->svr_allocd_segs, offset, size); + + /* + * Since we now do not need to copy this data, for + * accounting purposes we have done our job and can count + * it as completed. + */ + svr->svr_bytes_done[txg & TXG_MASK] += size; + } + mutex_exit(&svr->svr_lock); + + /* + * Now that we have dropped svr_lock, process the synced portion + * of this free. + */ + if (synced_size > 0) { + vdev_indirect_mark_obsolete(vd, synced_offset, synced_size); + + /* + * Note: this can only be called from syncing context, + * and the vdev_indirect_mapping is only changed from the + * sync thread, so we don't need svr_lock while doing + * metaslab_free_impl_cb. + */ + boolean_t checkpoint = B_FALSE; + vdev_indirect_ops.vdev_op_remap(vd, synced_offset, synced_size, + metaslab_free_impl_cb, &checkpoint); + } +} + +/* + * Stop an active removal and update the spa_removing phys. + */ +static void +spa_finish_removal(spa_t *spa, dsl_scan_state_t state, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + ASSERT3U(dmu_tx_get_txg(tx), ==, spa_syncing_txg(spa)); + + /* Ensure the removal thread has completed before we free the svr. */ + spa_vdev_remove_suspend(spa); + + ASSERT(state == DSS_FINISHED || state == DSS_CANCELED); + + if (state == DSS_FINISHED) { + spa_removing_phys_t *srp = &spa->spa_removing_phys; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + + if (srp->sr_prev_indirect_vdev != -1) { + vdev_t *pvd; + pvd = vdev_lookup_top(spa, + srp->sr_prev_indirect_vdev); + ASSERT3P(pvd->vdev_ops, ==, &vdev_indirect_ops); + } + + vic->vic_prev_indirect_vdev = srp->sr_prev_indirect_vdev; + srp->sr_prev_indirect_vdev = vd->vdev_id; + } + spa->spa_removing_phys.sr_state = state; + spa->spa_removing_phys.sr_end_time = gethrestime_sec(); + + spa->spa_vdev_removal = NULL; + spa_vdev_removal_destroy(svr); + + spa_sync_removing_state(spa, tx); + spa_notify_waiters(spa); + + vdev_config_dirty(spa->spa_root_vdev); +} + +static void +free_mapped_segment_cb(void *arg, uint64_t offset, uint64_t size) +{ + vdev_t *vd = arg; + vdev_indirect_mark_obsolete(vd, offset, size); + boolean_t checkpoint = B_FALSE; + vdev_indirect_ops.vdev_op_remap(vd, offset, size, + metaslab_free_impl_cb, &checkpoint); +} + +/* + * On behalf of the removal thread, syncs an incremental bit more of + * the indirect mapping to disk and updates the in-memory mapping. + * Called as a sync task in every txg that the removal thread makes progress. 
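+ *
+ * (This sync task is registered at most once per txg, by
+ * spa_vdev_copy_impl(), when that txg's svr_max_offset_to_sync slot
+ * first becomes nonzero.)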
+ */ +static void +vdev_mapping_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_config_t *vic __maybe_unused = &vd->vdev_indirect_config; + uint64_t txg = dmu_tx_get_txg(tx); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT(vic->vic_mapping_object != 0); + ASSERT3U(txg, ==, spa_syncing_txg(spa)); + + vdev_indirect_mapping_add_entries(vim, + &svr->svr_new_segments[txg & TXG_MASK], tx); + vdev_indirect_births_add_entry(vd->vdev_indirect_births, + vdev_indirect_mapping_max_offset(vim), dmu_tx_get_txg(tx), tx); + + /* + * Free the copied data for anything that was freed while the + * mapping entries were in flight. + */ + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_frees[txg & TXG_MASK], + free_mapped_segment_cb, vd); + ASSERT3U(svr->svr_max_offset_to_sync[txg & TXG_MASK], >=, + vdev_indirect_mapping_max_offset(vim)); + svr->svr_max_offset_to_sync[txg & TXG_MASK] = 0; + mutex_exit(&svr->svr_lock); + + spa_sync_removing_state(spa, tx); +} + +typedef struct vdev_copy_segment_arg { + spa_t *vcsa_spa; + dva_t *vcsa_dest_dva; + uint64_t vcsa_txg; + range_tree_t *vcsa_obsolete_segs; +} vdev_copy_segment_arg_t; + +static void +unalloc_seg(void *arg, uint64_t start, uint64_t size) +{ + vdev_copy_segment_arg_t *vcsa = arg; + spa_t *spa = vcsa->vcsa_spa; + blkptr_t bp = { { { {0} } } }; + + BP_SET_BIRTH(&bp, TXG_INITIAL, TXG_INITIAL); + BP_SET_LSIZE(&bp, size); + BP_SET_PSIZE(&bp, size); + BP_SET_COMPRESS(&bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(&bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(&bp, DMU_OT_NONE); + BP_SET_LEVEL(&bp, 0); + BP_SET_DEDUP(&bp, 0); + BP_SET_BYTEORDER(&bp, ZFS_HOST_BYTEORDER); + + DVA_SET_VDEV(&bp.blk_dva[0], DVA_GET_VDEV(vcsa->vcsa_dest_dva)); + DVA_SET_OFFSET(&bp.blk_dva[0], + DVA_GET_OFFSET(vcsa->vcsa_dest_dva) + start); + DVA_SET_ASIZE(&bp.blk_dva[0], size); + + zio_free(spa, vcsa->vcsa_txg, &bp); +} + +/* + * All reads and writes associated with a call to spa_vdev_copy_segment() + * are done. + */ +static void +spa_vdev_copy_segment_done(zio_t *zio) +{ + vdev_copy_segment_arg_t *vcsa = zio->io_private; + + range_tree_vacate(vcsa->vcsa_obsolete_segs, + unalloc_seg, vcsa); + range_tree_destroy(vcsa->vcsa_obsolete_segs); + kmem_free(vcsa, sizeof (*vcsa)); + + spa_config_exit(zio->io_spa, SCL_STATE, zio->io_spa); +} + +/* + * The write of the new location is done. + */ +static void +spa_vdev_copy_segment_write_done(zio_t *zio) +{ + vdev_copy_arg_t *vca = zio->io_private; + + abd_free(zio->io_abd); + + mutex_enter(&vca->vca_lock); + vca->vca_outstanding_bytes -= zio->io_size; + + if (zio->io_error != 0) + vca->vca_write_error_bytes += zio->io_size; + + cv_signal(&vca->vca_cv); + mutex_exit(&vca->vca_lock); +} + +/* + * The read of the old location is done. The parent zio is the write to + * the new location. Allow it to start. + */ +static void +spa_vdev_copy_segment_read_done(zio_t *zio) +{ + vdev_copy_arg_t *vca = zio->io_private; + + if (zio->io_error != 0) { + mutex_enter(&vca->vca_lock); + vca->vca_read_error_bytes += zio->io_size; + mutex_exit(&vca->vca_lock); + } + + zio_nowait(zio_unique_parent(zio)); +} + +/* + * If the old and new vdevs are mirrors, we will read both sides of the old + * mirror, and write each copy to the corresponding side of the new mirror. + * If the old and new vdevs have a different number of children, we will do + * this as best as possible. 
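(For
+ * example, copying a 2-way mirror to a 3-way mirror: destination
+ * children 0 and 1 read from source children 0 and 1, while
+ * destination child 2 wraps around and reads from source child 0
+ * again.)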
Since we aren't verifying checksums, this
+ * ensures that as long as there's a good copy of the data, we'll have a
+ * good copy after the removal, even if there's silent damage to one side
+ * of the mirror. If we're removing a mirror that has some silent damage,
+ * we'll have exactly the same damage in the new location (assuming that
+ * the new location is also a mirror).
+ *
+ * We accomplish this by creating a tree of zio_t's, with as many writes as
+ * there are "children" of the new vdev (a non-redundant vdev counts as one
+ * child, a 2-way mirror has 2 children, etc). Each write has an associated
+ * read from a child of the old vdev. Typically there will be the same
+ * number of children of the old and new vdevs. However, if there are more
+ * children of the new vdev, some child(ren) of the old vdev will be issued
+ * multiple reads. If there are more children of the old vdev, some copies
+ * will be dropped.
+ *
+ * For example, the tree of zio_t's for a 2-way mirror is:
+ *
+ *                            null
+ *                           /    \
+ * write(new vdev, child 0)        write(new vdev, child 1)
+ *      |                               |
+ * read(old vdev, child 0)         read(old vdev, child 1)
+ *
+ * Child zio's complete before their parents complete. However, zio's
+ * created with zio_vdev_child_io() may be issued before their children
+ * complete. In this case we need to make sure that the children (reads)
+ * complete before the parents (writes) are *issued*. We do this by not
+ * calling zio_nowait() on each write until its corresponding read has
+ * completed.
+ *
+ * The spa_config_lock must be held while zio's created by
+ * zio_vdev_child_io() are in progress, to ensure that the vdev tree does
+ * not change (e.g. due to a concurrent "zpool attach/detach"). The "null"
+ * zio is needed to release the spa_config_lock after all the reads and
+ * writes complete. (Note that we can't grab the config lock for each read,
+ * because it is not reentrant - we could deadlock with a thread waiting
+ * for a write lock.)
+ */
+static void
+spa_vdev_copy_one_child(vdev_copy_arg_t *vca, zio_t *nzio,
+    vdev_t *source_vd, uint64_t source_offset,
+    vdev_t *dest_child_vd, uint64_t dest_offset, int dest_id, uint64_t size)
+{
+	ASSERT3U(spa_config_held(nzio->io_spa, SCL_ALL, RW_READER), !=, 0);
+
+	/*
+	 * If the destination child is unwritable then there is no point
+	 * in issuing the source reads which cannot be written.
+	 */
+	if (!vdev_writeable(dest_child_vd))
+		return;
+
+	mutex_enter(&vca->vca_lock);
+	vca->vca_outstanding_bytes += size;
+	mutex_exit(&vca->vca_lock);
+
+	abd_t *abd = abd_alloc_for_io(size, B_FALSE);
+
+	vdev_t *source_child_vd = NULL;
+	if (source_vd->vdev_ops == &vdev_mirror_ops && dest_id != -1) {
+		/*
+		 * Source and dest are both mirrors. Copy from the same
+		 * child id as we are copying to (wrapping around if there
+		 * are more dest children than source children). If the
+		 * preferred source child is unreadable select another.
+		 */
+		for (int i = 0; i < source_vd->vdev_children; i++) {
+			source_child_vd = source_vd->vdev_child[
+			    (dest_id + i) % source_vd->vdev_children];
+			if (vdev_readable(source_child_vd))
+				break;
+		}
+	} else {
+		source_child_vd = source_vd;
+	}
+
+	/*
+	 * There should always be at least one readable source child or
+	 * the pool would be in a suspended state. Somehow selecting an
+	 * unreadable child would result in IO errors, the removal process
+	 * being cancelled, and the pool reverting to its pre-removal state.
+ */ + ASSERT3P(source_child_vd, !=, NULL); + + zio_t *write_zio = zio_vdev_child_io(nzio, NULL, + dest_child_vd, dest_offset, abd, size, + ZIO_TYPE_WRITE, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_write_done, vca); + + zio_nowait(zio_vdev_child_io(write_zio, NULL, + source_child_vd, source_offset, abd, size, + ZIO_TYPE_READ, ZIO_PRIORITY_REMOVAL, + ZIO_FLAG_CANFAIL, + spa_vdev_copy_segment_read_done, vca)); +} + +/* + * Allocate a new location for this segment, and create the zio_t's to + * read from the old location and write to the new location. + */ +static int +spa_vdev_copy_segment(vdev_t *vd, range_tree_t *segs, + uint64_t maxalloc, uint64_t txg, + vdev_copy_arg_t *vca, zio_alloc_list_t *zal) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_indirect_mapping_entry_t *entry; + dva_t dst = {{ 0 }}; + uint64_t start = range_tree_min(segs); + ASSERT0(P2PHASE(start, 1 << spa->spa_min_ashift)); + + ASSERT3U(maxalloc, <=, SPA_MAXBLOCKSIZE); + ASSERT0(P2PHASE(maxalloc, 1 << spa->spa_min_ashift)); + + uint64_t size = range_tree_span(segs); + if (range_tree_span(segs) > maxalloc) { + /* + * We can't allocate all the segments. Prefer to end + * the allocation at the end of a segment, thus avoiding + * additional split blocks. + */ + range_seg_max_t search; + zfs_btree_index_t where; + rs_set_start(&search, segs, start + maxalloc); + rs_set_end(&search, segs, start + maxalloc); + (void) zfs_btree_find(&segs->rt_root, &search, &where); + range_seg_t *rs = zfs_btree_prev(&segs->rt_root, &where, + &where); + if (rs != NULL) { + size = rs_get_end(rs, segs) - start; + } else { + /* + * There are no segments that end before maxalloc. + * I.e. the first segment is larger than maxalloc, + * so we must split it. + */ + size = maxalloc; + } + } + ASSERT3U(size, <=, maxalloc); + ASSERT0(P2PHASE(size, 1 << spa->spa_min_ashift)); + + /* + * An allocation class might not have any remaining vdevs or space + */ + metaslab_class_t *mc = mg->mg_class; + if (mc != spa_normal_class(spa) && mc->mc_groups <= 1) + mc = spa_normal_class(spa); + int error = metaslab_alloc_dva(spa, mc, size, &dst, 0, NULL, txg, 0, + zal, 0); + if (error == ENOSPC && mc != spa_normal_class(spa)) { + error = metaslab_alloc_dva(spa, spa_normal_class(spa), size, + &dst, 0, NULL, txg, 0, zal, 0); + } + if (error != 0) + return (error); + + /* + * Determine the ranges that are not actually needed. Offsets are + * relative to the start of the range to be copied (i.e. relative to the + * local variable "start"). + */ + range_tree_t *obsolete_segs = range_tree_create(NULL, RANGE_SEG64, NULL, + 0, 0); + + zfs_btree_index_t where; + range_seg_t *rs = zfs_btree_first(&segs->rt_root, &where); + ASSERT3U(rs_get_start(rs, segs), ==, start); + uint64_t prev_seg_end = rs_get_end(rs, segs); + while ((rs = zfs_btree_next(&segs->rt_root, &where, &where)) != NULL) { + if (rs_get_start(rs, segs) >= start + size) { + break; + } else { + range_tree_add(obsolete_segs, + prev_seg_end - start, + rs_get_start(rs, segs) - prev_seg_end); + } + prev_seg_end = rs_get_end(rs, segs); + } + /* We don't end in the middle of an obsolete range */ + ASSERT3U(start + size, <=, prev_seg_end); + + range_tree_clear(segs, start, size); + + /* + * We can't have any padding of the allocated size, otherwise we will + * misunderstand what's allocated, and the size of the mapping. 
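(If the
+ * allocation were padded, the mapping entry for [start, start + size)
+ * would claim more than size bytes at the new location, making the
+ * mapping look larger than the data actually copied.)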
We + * prevent padding by ensuring that all devices in the pool have the + * same ashift, and the allocation size is a multiple of the ashift. + */ + VERIFY3U(DVA_GET_ASIZE(&dst), ==, size); + + entry = kmem_zalloc(sizeof (vdev_indirect_mapping_entry_t), KM_SLEEP); + DVA_MAPPING_SET_SRC_OFFSET(&entry->vime_mapping, start); + entry->vime_mapping.vimep_dst = dst; + if (spa_feature_is_enabled(spa, SPA_FEATURE_OBSOLETE_COUNTS)) { + entry->vime_obsolete_count = range_tree_space(obsolete_segs); + } + + vdev_copy_segment_arg_t *vcsa = kmem_zalloc(sizeof (*vcsa), KM_SLEEP); + vcsa->vcsa_dest_dva = &entry->vime_mapping.vimep_dst; + vcsa->vcsa_obsolete_segs = obsolete_segs; + vcsa->vcsa_spa = spa; + vcsa->vcsa_txg = txg; + + /* + * See comment before spa_vdev_copy_one_child(). + */ + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + zio_t *nzio = zio_null(spa->spa_txg_zio[txg & TXG_MASK], spa, NULL, + spa_vdev_copy_segment_done, vcsa, 0); + vdev_t *dest_vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dst)); + if (dest_vd->vdev_ops == &vdev_mirror_ops) { + for (int i = 0; i < dest_vd->vdev_children; i++) { + vdev_t *child = dest_vd->vdev_child[i]; + spa_vdev_copy_one_child(vca, nzio, vd, start, + child, DVA_GET_OFFSET(&dst), i, size); + } + } else { + spa_vdev_copy_one_child(vca, nzio, vd, start, + dest_vd, DVA_GET_OFFSET(&dst), -1, size); + } + zio_nowait(nzio); + + list_insert_tail(&svr->svr_new_segments[txg & TXG_MASK], entry); + ASSERT3U(start + size, <=, vd->vdev_ms_count << vd->vdev_ms_shift); + vdev_dirty(vd, 0, NULL, txg); + + return (0); +} + +/* + * Complete the removal of a toplevel vdev. This is called as a + * synctask in the same txg that we will sync out the new config (to the + * MOS object) which indicates that this vdev is indirect. + */ +static void +vdev_remove_complete_sync(void *arg, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = arg; + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(svr->svr_bytes_done[i]); + } + + ASSERT3U(spa->spa_removing_phys.sr_copied, ==, + spa->spa_removing_phys.sr_to_copy); + + vdev_destroy_spacemaps(vd, tx); + + /* destroy leaf zaps, if any */ + ASSERT3P(svr->svr_zaplist, !=, NULL); + for (nvpair_t *pair = nvlist_next_nvpair(svr->svr_zaplist, NULL); + pair != NULL; + pair = nvlist_next_nvpair(svr->svr_zaplist, pair)) { + vdev_destroy_unlink_zap(vd, fnvpair_value_uint64(pair), tx); + } + fnvlist_free(svr->svr_zaplist); + + spa_finish_removal(dmu_tx_pool(tx)->dp_spa, DSS_FINISHED, tx); + /* vd->vdev_path is not available here */ + spa_history_log_internal(spa, "vdev remove completed", tx, + "%s vdev %llu", spa_name(spa), (u_longlong_t)vd->vdev_id); +} + +static void +vdev_remove_enlist_zaps(vdev_t *vd, nvlist_t *zlist) +{ + ASSERT3P(zlist, !=, NULL); + ASSERT3P(vd->vdev_ops, !=, &vdev_raidz_ops); + + if (vd->vdev_leaf_zap != 0) { + char zkey[32]; + (void) snprintf(zkey, sizeof (zkey), "%s-%llu", + VDEV_REMOVAL_ZAP_OBJS, (u_longlong_t)vd->vdev_leaf_zap); + fnvlist_add_uint64(zlist, zkey, vd->vdev_leaf_zap); + } + + for (uint64_t id = 0; id < vd->vdev_children; id++) { + vdev_remove_enlist_zaps(vd->vdev_child[id], zlist); + } +} + +static void +vdev_remove_replace_with_indirect(vdev_t *vd, uint64_t txg) +{ + vdev_t *ivd; + dmu_tx_t *tx; + spa_t *spa = vd->vdev_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + /* + * First, build a list of leaf zaps to be destroyed. 
+ * This is passed to the sync context thread, + * which does the actual unlinking. + */ + svr->svr_zaplist = fnvlist_alloc(); + vdev_remove_enlist_zaps(vd, svr->svr_zaplist); + + ivd = vdev_add_parent(vd, &vdev_indirect_ops); + ivd->vdev_removing = 0; + + vd->vdev_leaf_zap = 0; + + vdev_remove_child(ivd, vd); + vdev_compact_children(ivd); + + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); + + mutex_enter(&svr->svr_lock); + svr->svr_thread = NULL; + cv_broadcast(&svr->svr_cv); + mutex_exit(&svr->svr_lock); + + /* After this, we can not use svr. */ + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, vdev_remove_complete_sync, svr, + 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); +} + +/* + * Complete the removal of a toplevel vdev. This is called in open + * context by the removal thread after we have copied all vdev's data. + */ +static void +vdev_remove_complete(spa_t *spa) +{ + uint64_t txg; + + /* + * Wait for any deferred frees to be synced before we call + * vdev_metaslab_fini() + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + txg = spa_vdev_enter(spa); + vdev_t *vd = vdev_lookup_top(spa, spa->spa_vdev_removal->svr_vdev_id); + ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT3P(vd->vdev_autotrim_thread, ==, NULL); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + + zfs_dbgmsg("finishing device removal for vdev %llu in txg %llu", + vd->vdev_id, txg); + + /* + * Discard allocation state. + */ + if (vd->vdev_mg != NULL) { + vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + vd->vdev_mg = NULL; + spa_log_sm_set_blocklimit(spa); + } + ASSERT0(vd->vdev_stat.vs_space); + ASSERT0(vd->vdev_stat.vs_dspace); + + vdev_remove_replace_with_indirect(vd, txg); + + /* + * We now release the locks, allowing spa_sync to run and finish the + * removal via vdev_remove_complete_sync in syncing context. + * + * Note that we hold on to the vdev_t that has been replaced. Since + * it isn't part of the vdev tree any longer, it can't be concurrently + * manipulated, even while we don't have the config lock. + */ + (void) spa_vdev_exit(spa, NULL, txg, 0); + + /* + * Top ZAP should have been transferred to the indirect vdev in + * vdev_remove_replace_with_indirect. + */ + ASSERT0(vd->vdev_top_zap); + + /* + * Leaf ZAP should have been moved in vdev_remove_replace_with_indirect. + */ + ASSERT0(vd->vdev_leaf_zap); + + txg = spa_vdev_enter(spa); + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + /* + * Request to update the config and the config cachefile. + */ + vdev_config_dirty(spa->spa_root_vdev); + (void) spa_vdev_exit(spa, vd, txg, 0); + + if (ev != NULL) + spa_event_post(ev); +} + +/* + * Evacuates a segment of size at most max_alloc from the vdev + * via repeated calls to spa_vdev_copy_segment. If an allocation + * fails, the pool is probably too fragmented to handle such a + * large size, so decrease max_alloc so that the caller will not try + * this size again this txg. + */ +static void +spa_vdev_copy_impl(vdev_t *vd, spa_vdev_removal_t *svr, vdev_copy_arg_t *vca, + uint64_t *max_alloc, dmu_tx_t *tx) +{ + uint64_t txg = dmu_tx_get_txg(tx); + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + mutex_enter(&svr->svr_lock); + + /* + * Determine how big of a chunk to copy. We can allocate up + * to max_alloc bytes, and we can span up to vdev_removal_max_span + * bytes of unallocated space at a time. "segs" will track the + * allocated segments that we are copying. 
We may also be copying + * free segments (of up to vdev_removal_max_span bytes). + */ + range_tree_t *segs = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + for (;;) { + range_tree_t *rt = svr->svr_allocd_segs; + range_seg_t *rs = range_tree_first(rt); + + if (rs == NULL) + break; + + uint64_t seg_length; + + if (range_tree_is_empty(segs)) { + /* need to truncate the first seg based on max_alloc */ + seg_length = MIN(rs_get_end(rs, rt) - rs_get_start(rs, + rt), *max_alloc); + } else { + if (rs_get_start(rs, rt) - range_tree_max(segs) > + vdev_removal_max_span) { + /* + * Including this segment would cause us to + * copy a larger unneeded chunk than is allowed. + */ + break; + } else if (rs_get_end(rs, rt) - range_tree_min(segs) > + *max_alloc) { + /* + * This additional segment would extend past + * max_alloc. Rather than splitting this + * segment, leave it for the next mapping. + */ + break; + } else { + seg_length = rs_get_end(rs, rt) - + rs_get_start(rs, rt); + } + } + + range_tree_add(segs, rs_get_start(rs, rt), seg_length); + range_tree_remove(svr->svr_allocd_segs, + rs_get_start(rs, rt), seg_length); + } + + if (range_tree_is_empty(segs)) { + mutex_exit(&svr->svr_lock); + range_tree_destroy(segs); + return; + } + + if (svr->svr_max_offset_to_sync[txg & TXG_MASK] == 0) { + dsl_sync_task_nowait(dmu_tx_pool(tx), vdev_mapping_sync, + svr, 0, ZFS_SPACE_CHECK_NONE, tx); + } + + svr->svr_max_offset_to_sync[txg & TXG_MASK] = range_tree_max(segs); + + /* + * Note: this is the amount of *allocated* space + * that we are taking care of each txg. + */ + svr->svr_bytes_done[txg & TXG_MASK] += range_tree_space(segs); + + mutex_exit(&svr->svr_lock); + + zio_alloc_list_t zal; + metaslab_trace_init(&zal); + uint64_t thismax = SPA_MAXBLOCKSIZE; + while (!range_tree_is_empty(segs)) { + int error = spa_vdev_copy_segment(vd, + segs, thismax, txg, vca, &zal); + + if (error == ENOSPC) { + /* + * Cut our segment in half, and don't try this + * segment size again this txg. Note that the + * allocation size must be aligned to the highest + * ashift in the pool, so that the allocation will + * not be padded out to a multiple of the ashift, + * which could cause us to think that this mapping + * is larger than we intended. + */ + ASSERT3U(spa->spa_max_ashift, >=, SPA_MINBLOCKSHIFT); + ASSERT3U(spa->spa_max_ashift, ==, spa->spa_min_ashift); + uint64_t attempted = + MIN(range_tree_span(segs), thismax); + thismax = P2ROUNDUP(attempted / 2, + 1 << spa->spa_max_ashift); + /* + * The minimum-size allocation can not fail. + */ + ASSERT3U(attempted, >, 1 << spa->spa_max_ashift); + *max_alloc = attempted - (1 << spa->spa_max_ashift); + } else { + ASSERT0(error); + + /* + * We've performed an allocation, so reset the + * alloc trace list. + */ + metaslab_trace_fini(&zal); + metaslab_trace_init(&zal); + } + } + metaslab_trace_fini(&zal); + range_tree_destroy(segs); +} + +/* + * The size of each removal mapping is limited by the tunable + * zfs_remove_max_segment, but we must adjust this to be a multiple of the + * pool's ashift, so that we don't try to split individual sectors regardless + * of the tunable value. (Note that device removal requires that all devices + * have the same ashift, so there's no difference between spa_min_ashift and + * spa_max_ashift.) The raw tunable should not be used elsewhere. + */ +uint64_t +spa_remove_max_segment(spa_t *spa) +{ + return (P2ROUNDUP(zfs_remove_max_segment, 1 << spa->spa_max_ashift)); +} + +/* + * The removal thread operates in open context. 
It iterates over all + * allocated space in the vdev, by loading each metaslab's spacemap. + * For each contiguous segment of allocated space (capping the segment + * size at SPA_MAXBLOCKSIZE), we: + * - Allocate space for it on another vdev. + * - Create a new mapping from the old location to the new location + * (as a record in svr_new_segments). + * - Initiate a physical read zio to get the data off the removing disk. + * - In the read zio's done callback, initiate a physical write zio to + * write it to the new vdev. + * Note that all of this will take effect when a particular TXG syncs. + * The sync thread ensures that all the phys reads and writes for the syncing + * TXG have completed (see spa_txg_zio) and writes the new mappings to disk + * (see vdev_mapping_sync()). + */ +static void +spa_vdev_remove_thread(void *arg) +{ + spa_t *spa = arg; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_copy_arg_t vca; + uint64_t max_alloc = spa_remove_max_segment(spa); + uint64_t last_txg = 0; + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + uint64_t start_offset = vdev_indirect_mapping_max_offset(vim); + + ASSERT3P(vd->vdev_ops, !=, &vdev_indirect_ops); + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_removing); + ASSERT(vd->vdev_indirect_config.vic_mapping_object != 0); + ASSERT(vim != NULL); + + mutex_init(&vca.vca_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&vca.vca_cv, NULL, CV_DEFAULT, NULL); + vca.vca_outstanding_bytes = 0; + vca.vca_read_error_bytes = 0; + vca.vca_write_error_bytes = 0; + + mutex_enter(&svr->svr_lock); + + /* + * Start from vim_max_offset so we pick up where we left off + * if we are restarting the removal after opening the pool. + */ + uint64_t msi; + for (msi = start_offset >> vd->vdev_ms_shift; + msi < vd->vdev_ms_count && !svr->svr_thread_exit; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + ASSERT3U(msi, <=, vd->vdev_ms_count); + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_sync_lock); + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. + */ + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT0(range_tree_space(msp->ms_allocating[i])); + } + + /* + * If the metaslab has ever been allocated from (ms_sm!=NULL), + * read the allocated segments from the space map object + * into svr_allocd_segs. Since we do this while holding + * svr_lock and ms_sync_lock, concurrent frees (which + * would have modified the space map) will wait for us + * to finish loading the spacemap, and then take the + * appropriate action (see free_from_removing_vdev()). + */ + if (msp->ms_sm != NULL) { + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); + range_tree_walk(msp->ms_freeing, + range_tree_remove, svr->svr_allocd_segs); + + /* + * When we are resuming from a paused removal (i.e. + * when importing a pool with a removal in progress), + * discard any state that we have already processed. 
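+ * (For illustration, with hypothetical numbers: if start_offset is
+ * 3 GiB into the vdev and this metaslab spans 2 GiB - 4 GiB, only the
+ * segments in the remaining 3 GiB - 4 GiB portion are kept.)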
+ */ + range_tree_clear(svr->svr_allocd_segs, 0, start_offset); + } + mutex_exit(&msp->ms_lock); + mutex_exit(&msp->ms_sync_lock); + + vca.vca_msp = msp; + zfs_dbgmsg("copying %llu segments for metaslab %llu", + zfs_btree_numnodes(&svr->svr_allocd_segs->rt_root), + msp->ms_id); + + while (!svr->svr_thread_exit && + !range_tree_is_empty(svr->svr_allocd_segs)) { + + mutex_exit(&svr->svr_lock); + + /* + * We need to periodically drop the config lock so that + * writers can get in. Additionally, we can't wait + * for a txg to sync while holding a config lock + * (since a waiting writer could cause a 3-way deadlock + * with the sync thread, which also gets a config + * lock for reader). So we can't hold the config lock + * while calling dmu_tx_assign(). + */ + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * This delay will pause the removal around the point + * specified by zfs_removal_suspend_progress. We do this + * solely from the test suite or during debugging. + */ + uint64_t bytes_copied = + spa->spa_removing_phys.sr_copied; + for (int i = 0; i < TXG_SIZE; i++) + bytes_copied += svr->svr_bytes_done[i]; + while (zfs_removal_suspend_progress && + !svr->svr_thread_exit) + delay(hz); + + mutex_enter(&vca.vca_lock); + while (vca.vca_outstanding_bytes > + zfs_remove_max_copy_bytes) { + cv_wait(&vca.vca_cv, &vca.vca_lock); + } + mutex_exit(&vca.vca_lock); + + dmu_tx_t *tx = + dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + /* + * Reacquire the vdev_config lock. The vdev_t + * that we're removing may have changed, e.g. due + * to a vdev_attach or vdev_detach. + */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vd = vdev_lookup_top(spa, svr->svr_vdev_id); + + if (txg != last_txg) + max_alloc = spa_remove_max_segment(spa); + last_txg = txg; + + spa_vdev_copy_impl(vd, svr, &vca, &max_alloc, tx); + + dmu_tx_commit(tx); + mutex_enter(&svr->svr_lock); + } + + mutex_enter(&vca.vca_lock); + if (zfs_removal_ignore_errors == 0 && + (vca.vca_read_error_bytes > 0 || + vca.vca_write_error_bytes > 0)) { + svr->svr_thread_exit = B_TRUE; + } + mutex_exit(&vca.vca_lock); + } + + mutex_exit(&svr->svr_lock); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * Wait for all copies to finish before cleaning up the vca. + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + ASSERT0(vca.vca_outstanding_bytes); + + mutex_destroy(&vca.vca_lock); + cv_destroy(&vca.vca_cv); + + if (svr->svr_thread_exit) { + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, NULL, NULL); + svr->svr_thread = NULL; + cv_broadcast(&svr->svr_cv); + mutex_exit(&svr->svr_lock); + + /* + * During the removal process an unrecoverable read or write + * error was encountered. The removal process must be + * cancelled or this damage may become permanent. 
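+ *
+ * (Completing the removal would make the damaged copies the only
+ * copies; cancelling instead reverts the pool to reading from the
+ * original device.)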
+ */ + if (zfs_removal_ignore_errors == 0 && + (vca.vca_read_error_bytes > 0 || + vca.vca_write_error_bytes > 0)) { + zfs_dbgmsg("canceling removal due to IO errors: " + "[read_error_bytes=%llu] [write_error_bytes=%llu]", + vca.vca_read_error_bytes, + vca.vca_write_error_bytes); + spa_vdev_remove_cancel_impl(spa); + } + } else { + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + vdev_remove_complete(spa); + } + + thread_exit(); +} + +void +spa_vdev_remove_suspend(spa_t *spa) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + + if (svr == NULL) + return; + + mutex_enter(&svr->svr_lock); + svr->svr_thread_exit = B_TRUE; + while (svr->svr_thread != NULL) + cv_wait(&svr->svr_cv, &svr->svr_lock); + svr->svr_thread_exit = B_FALSE; + mutex_exit(&svr->svr_lock); +} + +/* ARGSUSED */ +static int +spa_vdev_remove_cancel_check(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + + if (spa->spa_vdev_removal == NULL) + return (ENOTACTIVE); + return (0); +} + +/* + * Cancel a removal by freeing all entries from the partial mapping + * and marking the vdev as no longer being removing. + */ +/* ARGSUSED */ +static void +spa_vdev_remove_cancel_sync(void *arg, dmu_tx_t *tx) +{ + spa_t *spa = dmu_tx_pool(tx)->dp_spa; + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + vdev_t *vd = vdev_lookup_top(spa, svr->svr_vdev_id); + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + objset_t *mos = spa->spa_meta_objset; + + ASSERT3P(svr->svr_thread, ==, NULL); + + spa_feature_decr(spa, SPA_FEATURE_DEVICE_REMOVAL, tx); + + boolean_t are_precise; + VERIFY0(vdev_obsolete_counts_are_precise(vd, &are_precise)); + if (are_precise) { + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_OBSOLETE_COUNTS_ARE_PRECISE, tx)); + } + + uint64_t obsolete_sm_object; + VERIFY0(vdev_obsolete_sm_object(vd, &obsolete_sm_object)); + if (obsolete_sm_object != 0) { + ASSERT(vd->vdev_obsolete_sm != NULL); + ASSERT3U(obsolete_sm_object, ==, + space_map_object(vd->vdev_obsolete_sm)); + + space_map_free(vd->vdev_obsolete_sm, tx); + VERIFY0(zap_remove(spa->spa_meta_objset, vd->vdev_top_zap, + VDEV_TOP_ZAP_INDIRECT_OBSOLETE_SM, tx)); + space_map_close(vd->vdev_obsolete_sm); + vd->vdev_obsolete_sm = NULL; + spa_feature_decr(spa, SPA_FEATURE_OBSOLETE_COUNTS, tx); + } + for (int i = 0; i < TXG_SIZE; i++) { + ASSERT(list_is_empty(&svr->svr_new_segments[i])); + ASSERT3U(svr->svr_max_offset_to_sync[i], <=, + vdev_indirect_mapping_max_offset(vim)); + } + + for (uint64_t msi = 0; msi < vd->vdev_ms_count; msi++) { + metaslab_t *msp = vd->vdev_ms[msi]; + + if (msp->ms_start >= vdev_indirect_mapping_max_offset(vim)) + break; + + ASSERT0(range_tree_space(svr->svr_allocd_segs)); + + mutex_enter(&msp->ms_lock); + + /* + * Assert nothing in flight -- ms_*tree is empty. 
+ */ + for (int i = 0; i < TXG_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_allocating[i])); + for (int i = 0; i < TXG_DEFER_SIZE; i++) + ASSERT0(range_tree_space(msp->ms_defer[i])); + ASSERT0(range_tree_space(msp->ms_freed)); + + if (msp->ms_sm != NULL) { + mutex_enter(&svr->svr_lock); + VERIFY0(space_map_load(msp->ms_sm, + svr->svr_allocd_segs, SM_ALLOC)); + + range_tree_walk(msp->ms_unflushed_allocs, + range_tree_add, svr->svr_allocd_segs); + range_tree_walk(msp->ms_unflushed_frees, + range_tree_remove, svr->svr_allocd_segs); + range_tree_walk(msp->ms_freeing, + range_tree_remove, svr->svr_allocd_segs); + + /* + * Clear everything past what has been synced, + * because we have not allocated mappings for it yet. + */ + uint64_t syncd = vdev_indirect_mapping_max_offset(vim); + uint64_t sm_end = msp->ms_sm->sm_start + + msp->ms_sm->sm_size; + if (sm_end > syncd) + range_tree_clear(svr->svr_allocd_segs, + syncd, sm_end - syncd); + + mutex_exit(&svr->svr_lock); + } + mutex_exit(&msp->ms_lock); + + mutex_enter(&svr->svr_lock); + range_tree_vacate(svr->svr_allocd_segs, + free_mapped_segment_cb, vd); + mutex_exit(&svr->svr_lock); + } + + /* + * Note: this must happen after we invoke free_mapped_segment_cb, + * because it adds to the obsolete_segments. + */ + range_tree_vacate(vd->vdev_obsolete_segments, NULL, NULL); + + ASSERT3U(vic->vic_mapping_object, ==, + vdev_indirect_mapping_object(vd->vdev_indirect_mapping)); + vdev_indirect_mapping_close(vd->vdev_indirect_mapping); + vd->vdev_indirect_mapping = NULL; + vdev_indirect_mapping_free(mos, vic->vic_mapping_object, tx); + vic->vic_mapping_object = 0; + + ASSERT3U(vic->vic_births_object, ==, + vdev_indirect_births_object(vd->vdev_indirect_births)); + vdev_indirect_births_close(vd->vdev_indirect_births); + vd->vdev_indirect_births = NULL; + vdev_indirect_births_free(mos, vic->vic_births_object, tx); + vic->vic_births_object = 0; + + /* + * We may have processed some frees from the removing vdev in this + * txg, thus increasing svr_bytes_done; discard that here to + * satisfy the assertions in spa_vdev_removal_destroy(). + * Note that future txg's can not have any bytes_done, because + * future TXG's are only modified from open context, and we have + * already shut down the copying thread. + */ + svr->svr_bytes_done[dmu_tx_get_txg(tx) & TXG_MASK] = 0; + spa_finish_removal(spa, DSS_CANCELED, tx); + + vd->vdev_removing = B_FALSE; + vdev_config_dirty(vd); + + zfs_dbgmsg("canceled device removal for vdev %llu in %llu", + vd->vdev_id, dmu_tx_get_txg(tx)); + spa_history_log_internal(spa, "vdev remove canceled", tx, + "%s vdev %llu %s", spa_name(spa), + (u_longlong_t)vd->vdev_id, + (vd->vdev_path != NULL) ? 
vd->vdev_path : "-"); +} + +static int +spa_vdev_remove_cancel_impl(spa_t *spa) +{ + uint64_t vdid = spa->spa_vdev_removal->svr_vdev_id; + + int error = dsl_sync_task(spa->spa_name, spa_vdev_remove_cancel_check, + spa_vdev_remove_cancel_sync, NULL, 0, + ZFS_SPACE_CHECK_EXTRA_RESERVED); + + if (error == 0) { + spa_config_enter(spa, SCL_ALLOC | SCL_VDEV, FTAG, RW_WRITER); + vdev_t *vd = vdev_lookup_top(spa, vdid); + metaslab_group_activate(vd->vdev_mg); + spa_config_exit(spa, SCL_ALLOC | SCL_VDEV, FTAG); + } + + return (error); +} + +int +spa_vdev_remove_cancel(spa_t *spa) +{ + spa_vdev_remove_suspend(spa); + + if (spa->spa_vdev_removal == NULL) + return (ENOTACTIVE); + + return (spa_vdev_remove_cancel_impl(spa)); +} + +void +svr_sync(spa_t *spa, dmu_tx_t *tx) +{ + spa_vdev_removal_t *svr = spa->spa_vdev_removal; + int txgoff = dmu_tx_get_txg(tx) & TXG_MASK; + + if (svr == NULL) + return; + + /* + * This check is necessary so that we do not dirty the + * DIRECTORY_OBJECT via spa_sync_removing_state() when there + * is nothing to do. Dirtying it every time would prevent us + * from syncing-to-convergence. + */ + if (svr->svr_bytes_done[txgoff] == 0) + return; + + /* + * Update progress accounting. + */ + spa->spa_removing_phys.sr_copied += svr->svr_bytes_done[txgoff]; + svr->svr_bytes_done[txgoff] = 0; + + spa_sync_removing_state(spa, tx); +} + +static void +vdev_remove_make_hole_and_free(vdev_t *vd) +{ + uint64_t id = vd->vdev_id; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + vdev_free(vd); + + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + +/* + * Remove a log device. The config lock is held for the specified TXG. + */ +static int +spa_vdev_remove_log(vdev_t *vd, uint64_t *txg) +{ + metaslab_group_t *mg = vd->vdev_mg; + spa_t *spa = vd->vdev_spa; + int error = 0; + + ASSERT(vd->vdev_islog); + ASSERT(vd == vd->vdev_top); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Cancel any initialize or TRIM which was in progress. + */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_CANCELED); + vdev_trim_stop_all(vd, VDEV_TRIM_CANCELED); + vdev_autotrim_stop_wait(vd); + + /* + * Evacuate the device. We don't hold the config lock as + * writer since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (vd->vdev_stat.vs_alloc != 0) + error = spa_reset_logs(spa); + + *txg = spa_vdev_config_enter(spa); + + if (error != 0) { + metaslab_group_activate(mg); + return (error); + } + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. 
+ */ + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + + /* + * When the log space map feature is enabled we look at + * the vdev's top_zap to find the on-disk flush data of + * the metaslab we just flushed. Thus, while removing a + * log vdev we make sure to call vdev_metaslab_fini() + * first, which removes all metaslabs of this vdev from + * spa_metaslabs_by_flushed before vdev_remove_empty() + * destroys the top_zap of this log vdev. + * + * This avoids the scenario where we flush a metaslab + * from the log vdev being removed that doesn't have a + * top_zap and end up failing to lookup its on-disk flush + * data. + * + * We don't call metaslab_group_destroy() right away + * though (it will be called in vdev_free() later) as + * during metaslab_sync() of metaslabs from other vdevs + * we may touch the metaslab group of this vdev through + * metaslab_class_histogram_verify() + */ + vdev_metaslab_fini(vd); + spa_log_sm_set_blocklimit(spa); + + spa_vdev_config_exit(spa, NULL, *txg, 0, FTAG); + *txg = spa_vdev_config_enter(spa); + + sysevent_t *ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_DEV); + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + /* The top ZAP should have been destroyed by vdev_remove_empty. */ + ASSERT0(vd->vdev_top_zap); + /* The leaf ZAP should have been destroyed by vdev_dtl_sync. */ + ASSERT0(vd->vdev_leaf_zap); + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + ASSERT0(vd->vdev_stat.vs_alloc); + + /* + * Clean up the vdev namespace. + */ + vdev_remove_make_hole_and_free(vd); + + if (ev != NULL) + spa_event_post(ev); + + return (0); +} + +static int +spa_vdev_remove_top_check(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + + if (vd != vd->vdev_top) + return (SET_ERROR(ENOTSUP)); + + if (!vdev_is_concrete(vd)) + return (SET_ERROR(ENOTSUP)); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_DEVICE_REMOVAL)) + return (SET_ERROR(ENOTSUP)); + + /* available space in the pool's normal class */ + uint64_t available = dsl_dir_space_available( + spa->spa_dsl_pool->dp_root_dir, NULL, 0, B_TRUE); + + metaslab_class_t *mc = vd->vdev_mg->mg_class; + + /* + * When removing a vdev from an allocation class that has + * remaining vdevs, include available space from the class. + */ + if (mc != spa_normal_class(spa) && mc->mc_groups > 1) { + uint64_t class_avail = metaslab_class_get_space(mc) - + metaslab_class_get_alloc(mc); + + /* add class space, adjusted for overhead */ + available += (class_avail * 94) / 100; + } + + /* + * There has to be enough free space to remove the + * device and leave double the "slop" space (i.e. we + * must leave at least 3% of the pool free, in addition to + * the normal slop space). + */ + if (available < vd->vdev_stat.vs_dspace + spa_get_slop_space(spa)) { + return (SET_ERROR(ENOSPC)); + } + + /* + * There can not be a removal in progress. + */ + if (spa->spa_removing_phys.sr_state == DSS_SCANNING) + return (SET_ERROR(EBUSY)); + + /* + * The device must have all its data. + */ + if (!vdev_dtl_empty(vd, DTL_MISSING) || + !vdev_dtl_empty(vd, DTL_OUTAGE)) + return (SET_ERROR(EBUSY)); + + /* + * The device must be healthy. + */ + if (!vdev_readable(vd)) + return (SET_ERROR(EIO)); + + /* + * All vdevs in normal class must have the same ashift. 
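+ * (See spa_vdev_copy_segment(): copied allocations must not be padded
+ * out to a larger ashift, or the indirect mapping would record more
+ * space than was actually copied.)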
+ */ + if (spa->spa_max_ashift != spa->spa_min_ashift) { + return (SET_ERROR(EINVAL)); + } + + /* + * All vdevs in normal class must have the same ashift + * and not be raidz. + */ + vdev_t *rvd = spa->spa_root_vdev; + int num_indirect = 0; + for (uint64_t id = 0; id < rvd->vdev_children; id++) { + vdev_t *cvd = rvd->vdev_child[id]; + if (cvd->vdev_ashift != 0 && !cvd->vdev_islog) + ASSERT3U(cvd->vdev_ashift, ==, spa->spa_max_ashift); + if (cvd->vdev_ops == &vdev_indirect_ops) + num_indirect++; + if (!vdev_is_concrete(cvd)) + continue; + if (cvd->vdev_ops == &vdev_raidz_ops) + return (SET_ERROR(EINVAL)); + /* + * Need the mirror to be mirror of leaf vdevs only + */ + if (cvd->vdev_ops == &vdev_mirror_ops) { + for (uint64_t cid = 0; + cid < cvd->vdev_children; cid++) { + if (!cvd->vdev_child[cid]->vdev_ops-> + vdev_op_leaf) + return (SET_ERROR(EINVAL)); + } + } + } + + return (0); +} + +/* + * Initiate removal of a top-level vdev, reducing the total space in the pool. + * The config lock is held for the specified TXG. Once initiated, + * evacuation of all allocated space (copying it to other vdevs) happens + * in the background (see spa_vdev_remove_thread()), and can be canceled + * (see spa_vdev_remove_cancel()). If successful, the vdev will + * be transformed to an indirect vdev (see spa_vdev_remove_complete()). + */ +static int +spa_vdev_remove_top(vdev_t *vd, uint64_t *txg) +{ + spa_t *spa = vd->vdev_spa; + int error; + + /* + * Check for errors up-front, so that we don't waste time + * passivating the metaslab group and clearing the ZIL if there + * are errors. + */ + error = spa_vdev_remove_top_check(vd); + if (error != 0) + return (error); + + /* + * Stop allocating from this vdev. Note that we must check + * that this is not the only device in the pool before + * passivating, otherwise we will not be able to make + * progress because we can't allocate from any vdevs. + * The above check for sufficient free space serves this + * purpose. + */ + metaslab_group_t *mg = vd->vdev_mg; + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + *txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * We must ensure that no "stubby" log blocks are allocated + * on the device to be removed. These blocks could be + * written at any time, including while we are in the middle + * of copying them. + */ + error = spa_reset_logs(spa); + + /* + * We stop any initializing and TRIM that is currently in progress + * but leave the state as "active". This will allow the process to + * resume if the removal is canceled sometime later. + */ + vdev_initialize_stop_all(vd, VDEV_INITIALIZE_ACTIVE); + vdev_trim_stop_all(vd, VDEV_TRIM_ACTIVE); + vdev_autotrim_stop_wait(vd); + + *txg = spa_vdev_config_enter(spa); + + /* + * Things might have changed while the config lock was dropped + * (e.g. space usage). Check for errors again. 
+ */ + if (error == 0) + error = spa_vdev_remove_top_check(vd); + + if (error != 0) { + metaslab_group_activate(mg); + spa_async_request(spa, SPA_ASYNC_INITIALIZE_RESTART); + spa_async_request(spa, SPA_ASYNC_TRIM_RESTART); + spa_async_request(spa, SPA_ASYNC_AUTOTRIM_RESTART); + return (error); + } + + vd->vdev_removing = B_TRUE; + + vdev_dirty_leaves(vd, VDD_DTL, *txg); + vdev_config_dirty(vd); + dmu_tx_t *tx = dmu_tx_create_assigned(spa->spa_dsl_pool, *txg); + dsl_sync_task_nowait(spa->spa_dsl_pool, + vdev_remove_initiate_sync, + (void *)(uintptr_t)vd->vdev_id, 0, ZFS_SPACE_CHECK_NONE, tx); + dmu_tx_commit(tx); + + return (0); +} + +/* + * Remove a device from the pool. + * + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ +int +spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) +{ + vdev_t *vd; + nvlist_t **spares, **l2cache, *nv; + uint64_t txg = 0; + uint_t nspares, nl2cache; + int error = 0, error_log; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); + sysevent_t *ev = NULL; + char *vd_type = NULL, *vd_path = NULL; + + ASSERT(spa_writeable(spa)); + + if (!locked) + txg = spa_vdev_enter(spa); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (spa_feature_is_active(spa, SPA_FEATURE_POOL_CHECKPOINT)) { + error = (spa_has_checkpoint(spa)) ? + ZFS_ERR_CHECKPOINT_EXISTS : ZFS_ERR_DISCARDING_CHECKPOINT; + + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); + } + + vd = spa_lookup_by_guid(spa, guid, B_FALSE); + + if (spa->spa_spares.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, &spares, &nspares) == 0 && + (nv = spa_nvlist_lookup_by_guid(spares, nspares, guid)) != NULL) { + /* + * Only remove the hot spare if it's not currently in use + * in this pool. + */ + if (vd == NULL || unspare) { + if (vd == NULL) + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + ev = spa_event_create(spa, vd, NULL, + ESC_ZFS_VDEV_REMOVE_AUX); + + vd_type = VDEV_TYPE_SPARE; + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); + spa_vdev_remove_aux(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares, nv); + spa_load_spares(spa); + spa->spa_spares.sav_sync = B_TRUE; + } else { + error = SET_ERROR(EBUSY); + } + } else if (spa->spa_l2cache.sav_vdevs != NULL && + nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0 && + (nv = spa_nvlist_lookup_by_guid(l2cache, nl2cache, guid)) != NULL) { + vd_type = VDEV_TYPE_L2CACHE; + vd_path = spa_strdup(fnvlist_lookup_string( + nv, ZPOOL_CONFIG_PATH)); + /* + * Cache devices can always be removed. + */ + vd = spa_lookup_by_guid(spa, guid, B_TRUE); + + /* + * Stop trimming the cache device. We need to release the + * config lock to allow the syncing of TRIM transactions + * without releasing the spa_namespace_lock. The same + * strategy is employed in spa_vdev_remove_top(). 
+ */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL); + mutex_exit(&vd->vdev_trim_lock); + txg = spa_vdev_config_enter(spa); + + ev = spa_event_create(spa, vd, NULL, ESC_ZFS_VDEV_REMOVE_AUX); + spa_vdev_remove_aux(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); + spa_load_l2cache(spa); + spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + vd_type = VDEV_TYPE_LOG; + vd_path = spa_strdup((vd->vdev_path != NULL) ? + vd->vdev_path : "-"); + error = spa_vdev_remove_log(vd, &txg); + } else if (vd != NULL) { + ASSERT(!locked); + error = spa_vdev_remove_top(vd, &txg); + } else { + /* + * There is no vdev of any kind with the specified guid. + */ + error = SET_ERROR(ENOENT); + } + + error_log = error; + + if (!locked) + error = spa_vdev_exit(spa, NULL, txg, error); + + /* + * Logging must be done outside the spa config lock. Otherwise, + * this code path could end up holding the spa config lock while + * waiting for a txg_sync so it can write to the internal log. + * Doing that would prevent the txg sync from actually happening, + * causing a deadlock. + */ + if (error_log == 0 && vd_type != NULL && vd_path != NULL) { + spa_history_log_internal(spa, "vdev remove", NULL, + "%s vdev (%s) %s", spa_name(spa), vd_type, vd_path); + } + if (vd_path != NULL) + spa_strfree(vd_path); + + if (ev != NULL) + spa_event_post(ev); + + return (error); +} + +int +spa_removal_get_stats(spa_t *spa, pool_removal_stat_t *prs) +{ + prs->prs_state = spa->spa_removing_phys.sr_state; + + if (prs->prs_state == DSS_NONE) + return (SET_ERROR(ENOENT)); + + prs->prs_removing_vdev = spa->spa_removing_phys.sr_removing_vdev; + prs->prs_start_time = spa->spa_removing_phys.sr_start_time; + prs->prs_end_time = spa->spa_removing_phys.sr_end_time; + prs->prs_to_copy = spa->spa_removing_phys.sr_to_copy; + prs->prs_copied = spa->spa_removing_phys.sr_copied; + + prs->prs_mapping_memory = 0; + uint64_t indirect_vdev_id = + spa->spa_removing_phys.sr_prev_indirect_vdev; + while (indirect_vdev_id != -1) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[indirect_vdev_id]; + vdev_indirect_config_t *vic = &vd->vdev_indirect_config; + vdev_indirect_mapping_t *vim = vd->vdev_indirect_mapping; + + ASSERT3P(vd->vdev_ops, ==, &vdev_indirect_ops); + prs->prs_mapping_memory += vdev_indirect_mapping_size(vim); + indirect_vdev_id = vic->vic_prev_indirect_vdev; + } + + return (0); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_ignore_errors, INT, ZMOD_RW, + "Ignore hard IO errors when removing device"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_, remove_max_segment, INT, ZMOD_RW, + "Largest contiguous segment to allocate when removing device"); + +ZFS_MODULE_PARAM(zfs_vdev, vdev_, removal_max_span, INT, ZMOD_RW, + "Largest span of free chunks a remap segment can span"); + +ZFS_MODULE_PARAM(zfs_vdev, zfs_, removal_suspend_progress, INT, ZMOD_RW, + "Pause device removal after this many bytes are copied " + "(debug use only - causes removal to hang)"); +/* END CSTYLED */ + +EXPORT_SYMBOL(free_from_removing_vdev); +EXPORT_SYMBOL(spa_removal_get_stats); +EXPORT_SYMBOL(spa_remove_init); +EXPORT_SYMBOL(spa_restart_removal); +EXPORT_SYMBOL(spa_vdev_removal_destroy); +EXPORT_SYMBOL(spa_vdev_remove); +EXPORT_SYMBOL(spa_vdev_remove_cancel); +EXPORT_SYMBOL(spa_vdev_remove_suspend); +EXPORT_SYMBOL(svr_sync); diff --git a/module/zfs/vdev_root.c 
b/module/zfs/vdev_root.c
new file mode 100644
index 000000000000..9e8aac7d03de
--- /dev/null
+++ b/module/zfs/vdev_root.c
@@ -0,0 +1,158 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2016 by Delphix. All rights reserved.
+ */
+
+#include <sys/zfs_context.h>
+#include <sys/spa.h>
+#include <sys/fs/zfs.h>
+#include <sys/vdev_impl.h>
+#include <sys/zio.h>
+
+/*
+ * Virtual device vector for the pool's root vdev.
+ */
+
+static uint64_t
+vdev_root_core_tvds(vdev_t *vd)
+{
+	uint64_t tvds = 0;
+
+	for (uint64_t c = 0; c < vd->vdev_children; c++) {
+		vdev_t *cvd = vd->vdev_child[c];
+
+		if (!cvd->vdev_ishole && !cvd->vdev_islog &&
+		    cvd->vdev_ops != &vdev_indirect_ops) {
+			tvds++;
+		}
+	}
+
+	return (tvds);
+}
+
+/*
+ * We should be able to tolerate one failure with absolutely no damage
+ * to our metadata. Two failures will take out space maps, a bunch of
+ * indirect block trees, meta dnodes, dnodes, etc. Probably not a happy
+ * place to live. When we get smarter, we can liberalize this policy.
+ * e.g. If we haven't lost two consecutive top-level vdevs, then we are
+ * probably fine. Adding bean counters during alloc/free can make this
+ * future guesswork more accurate.
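+ *
+ * Concretely (an illustrative summary of the policy implemented in
+ * too_many_errors() below, not additional code): with N core
+ * top-level vdevs,
+ *
+ *	numerrors == 0		open normally
+ *	numerrors == N		every child failed; refuse to open
+ *	0 < numerrors < N	refuse only if numerrors exceeds
+ *				spa_missing_tvds_allowed(spa)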
+ */ +static boolean_t +too_many_errors(vdev_t *vd, uint64_t numerrors) +{ + uint64_t tvds; + + if (numerrors == 0) + return (B_FALSE); + + tvds = vdev_root_core_tvds(vd); + ASSERT3U(numerrors, <=, tvds); + + if (numerrors == tvds) + return (B_TRUE); + + return (numerrors > spa_missing_tvds_allowed(vd->vdev_spa)); +} + +static int +vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, + uint64_t *ashift, uint64_t *pshift) +{ + spa_t *spa = vd->vdev_spa; + int lasterror = 0; + int numerrors = 0; + + if (vd->vdev_children == 0) { + vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; + return (SET_ERROR(EINVAL)); + } + + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error && !cvd->vdev_islog && + cvd->vdev_ops != &vdev_indirect_ops) { + lasterror = cvd->vdev_open_error; + numerrors++; + } + } + + if (spa_load_state(spa) != SPA_LOAD_NONE) + spa_set_missing_tvds(spa, numerrors); + + if (too_many_errors(vd, numerrors)) { + vd->vdev_stat.vs_aux = VDEV_AUX_NO_REPLICAS; + return (lasterror); + } + + *asize = 0; + *max_asize = 0; + *ashift = 0; + *pshift = 0; + + return (0); +} + +static void +vdev_root_close(vdev_t *vd) +{ + for (int c = 0; c < vd->vdev_children; c++) + vdev_close(vd->vdev_child[c]); +} + +static void +vdev_root_state_change(vdev_t *vd, int faulted, int degraded) +{ + if (too_many_errors(vd, faulted)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_NO_REPLICAS); + } else if (degraded || faulted) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); + } else { + vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); + } +} + +vdev_ops_t vdev_root_ops = { + .vdev_op_open = vdev_root_open, + .vdev_op_close = vdev_root_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = NULL, /* not applicable to the root */ + .vdev_op_io_done = NULL, /* not applicable to the root */ + .vdev_op_state_change = vdev_root_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ +}; diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c new file mode 100644 index 000000000000..3f8c34806020 --- /dev/null +++ b/module/zfs/vdev_trim.c @@ -0,0 +1,1701 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2019 by Lawrence Livermore National Security, LLC. 
+ */
+
+#include <sys/spa.h>
+#include <sys/spa_impl.h>
+#include <sys/txg.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/refcount.h>
+#include <sys/metaslab_impl.h>
+#include <sys/dsl_synctask.h>
+#include <sys/zap.h>
+#include <sys/dmu_tx.h>
+#include <sys/arc_impl.h>
+
+/*
+ * TRIM is a feature which is used to notify an SSD that some previously
+ * written space is no longer allocated by the pool. This is useful because
+ * writes to an SSD must be performed to blocks which have first been erased.
+ * Ensuring the SSD always has a supply of erased blocks for new writes
+ * helps prevent performance from deteriorating.
+ *
+ * There are two supported TRIM methods: manual and automatic.
+ *
+ * Manual TRIM:
+ *
+ * A manual TRIM is initiated by running the 'zpool trim' command. A single
+ * 'vdev_trim' thread is created for each leaf vdev, and it is responsible for
+ * managing that vdev's TRIM process. This involves iterating over all the
+ * metaslabs, calculating the unallocated space ranges, and then issuing the
+ * required TRIM I/Os.
+ *
+ * While a metaslab is being actively trimmed, it is not eligible to perform
+ * new allocations. After traversing all of the metaslabs, the thread is
+ * terminated. Finally, both the requested options and current progress of
+ * the TRIM are regularly written to the pool. This allows the TRIM to be
+ * suspended and resumed as needed.
+ *
+ * Automatic TRIM:
+ *
+ * An automatic TRIM is enabled by setting the 'autotrim' pool property
+ * to 'on'. When enabled, a 'vdev_autotrim' thread is created for each
+ * top-level (not leaf) vdev in the pool. These threads perform the same
+ * core TRIM process as a manual TRIM, but with a few key differences.
+ *
+ * 1) Automatic TRIM happens continuously in the background and operates
+ *    solely on recently freed blocks (ms_trim, not ms_allocatable).
+ *
+ * 2) Each thread is associated with a top-level (not leaf) vdev. This
+ *    simplifies the threading model, makes it easier to coordinate
+ *    administrative commands, and ensures only a single metaslab is
+ *    disabled at a time. Unlike manual TRIM, this means each
+ *    'vdev_autotrim' thread is responsible for issuing TRIM I/Os for its
+ *    children.
+ *
+ * 3) There is no automatic TRIM progress information stored on disk, nor
+ *    is it reported by 'zpool status'.
+ *
+ * While the automatic TRIM process is highly effective, it is more likely
+ * than a manual TRIM to encounter tiny ranges. Ranges less than or equal to
+ * 'zfs_trim_extent_bytes_min' (32k) are considered too small to efficiently
+ * TRIM and are skipped. This means small amounts of freed space may not
+ * be automatically trimmed.
+ *
+ * Furthermore, devices with attached hot spares and devices being actively
+ * replaced are skipped. This is done to avoid adding additional stress to
+ * a potentially unhealthy device and to minimize the required rebuild time.
+ *
+ * For this reason it may be beneficial to occasionally manually TRIM a pool
+ * even when automatic TRIM is enabled.
+ */
+
+/*
+ * Maximum size of TRIM I/O, ranges will be chunked into 128 MiB lengths.
+ */
+unsigned int zfs_trim_extent_bytes_max = 128 * 1024 * 1024;
+
+/*
+ * Minimum size of TRIM I/O, extents smaller than 32 KiB will be skipped.
+ */
+unsigned int zfs_trim_extent_bytes_min = 32 * 1024;
+
+/*
+ * Skip uninitialized metaslabs during the TRIM process. This option is
+ * useful for pools constructed from large thinly-provisioned devices where
+ * TRIM operations are slow. As a pool ages, an increasing fraction of
+ * the pool's metaslabs will be initialized, progressively degrading the
+ * usefulness of this option. This setting is stored when starting a
+ * manual TRIM and will persist for the duration of the requested TRIM.
+ */
+unsigned int zfs_trim_metaslab_skip = 0;
+
+/*
+ * Maximum number of queued TRIM I/Os per leaf vdev. The number of
+ * concurrent TRIM I/Os issued to the device is controlled by the
+ * zfs_vdev_trim_min_active and zfs_vdev_trim_max_active module options.
+ */
+unsigned int zfs_trim_queue_limit = 10;
+
+/*
+ * The minimum number of transaction groups between automatic trims of a
+ * metaslab. This setting represents a trade-off between issuing more
+ * efficient TRIM operations, by allowing them to be aggregated longer,
+ * and issuing them promptly so the trimmed space is available. Note
+ * that this value is a minimum; metaslabs can be trimmed less frequently
+ * when there are a large number of ranges which need to be trimmed.
+ *
+ * Increasing this value will allow frees to be aggregated for a longer
+ * time. This can result in larger TRIM operations, and increased memory
+ * usage in order to track the ranges to be trimmed. Decreasing this value
+ * has the opposite effect. The default value of 32 was determined through
+ * testing to be a reasonable compromise.
+ */
unsigned int zfs_trim_txg_batch = 32;
+
+/*
+ * trim_args is a control structure which describes how a leaf vdev
+ * should be trimmed. The core elements are the vdev, the metaslab being
+ * trimmed and a range tree containing the extents to TRIM. All provided
+ * ranges must be within the metaslab.
+ */
+typedef struct trim_args {
+	/*
+	 * These fields are set by the caller of vdev_trim_ranges().
+	 */
+	vdev_t		*trim_vdev;		/* Leaf vdev to TRIM */
+	metaslab_t	*trim_msp;		/* Disabled metaslab */
+	range_tree_t	*trim_tree;		/* TRIM ranges (in metaslab) */
+	trim_type_t	trim_type;		/* Manual or auto TRIM */
+	uint64_t	trim_extent_bytes_max;	/* Maximum TRIM I/O size */
+	uint64_t	trim_extent_bytes_min;	/* Minimum TRIM I/O size */
+	enum trim_flag	trim_flags;		/* TRIM flags (secure) */
+
+	/*
+	 * These fields are updated by vdev_trim_ranges().
+	 */
+	hrtime_t	trim_start_time;	/* Start time */
+	uint64_t	trim_bytes_done;	/* Bytes trimmed */
+} trim_args_t;
+
+/*
+ * Determines whether a vdev_trim_thread() should be stopped.
+ */
+static boolean_t
+vdev_trim_should_stop(vdev_t *vd)
+{
+	return (vd->vdev_trim_exit_wanted || !vdev_writeable(vd) ||
+	    vd->vdev_detached || vd->vdev_top->vdev_removing);
+}
+
+/*
+ * Determines whether a vdev_autotrim_thread() should be stopped.
+ */
+static boolean_t
+vdev_autotrim_should_stop(vdev_t *tvd)
+{
+	return (tvd->vdev_autotrim_exit_wanted ||
+	    !vdev_writeable(tvd) || tvd->vdev_removing ||
+	    spa_get_autotrim(tvd->vdev_spa) == SPA_AUTOTRIM_OFF);
+}
+
+/*
+ * The sync task for updating the on-disk state of a manual TRIM. This
+ * is scheduled by vdev_trim_change_state().
+ */
+static void
+vdev_trim_zap_update_sync(void *arg, dmu_tx_t *tx)
+{
+	/*
+	 * We pass in the guid instead of the vdev_t since the vdev may
+	 * have been freed prior to the sync task being processed. This
+	 * happens when a vdev is detached as we call spa_vdev_config_exit(),
+	 * stop the trimming thread, schedule the sync task, and free
+	 * the vdev. Later when the scheduled sync task is invoked, it would
+	 * find that the vdev has been freed.
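+	 *
+	 * The scheduling side therefore heap-copies the guid before
+	 * handing it to the sync task. A minimal sketch of that pattern
+	 * (already used by vdev_trim_change_state() below; shown here
+	 * only for illustration):
+	 *
+	 *	uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP);
+	 *	*guid = vd->vdev_guid;
+	 *	dsl_sync_task_nowait(spa_get_dsl(spa),
+	 *	    vdev_trim_zap_update_sync, guid, 2,
+	 *	    ZFS_SPACE_CHECK_NONE, tx);
+	 *
+	 * This sync task then re-looks-up the vdev by guid and frees the
+	 * allocation itself.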
+ */ + uint64_t guid = *(uint64_t *)arg; + uint64_t txg = dmu_tx_get_txg(tx); + kmem_free(arg, sizeof (uint64_t)); + + vdev_t *vd = spa_lookup_by_guid(tx->tx_pool->dp_spa, guid, B_FALSE); + if (vd == NULL || vd->vdev_top->vdev_removing || !vdev_is_concrete(vd)) + return; + + uint64_t last_offset = vd->vdev_trim_offset[txg & TXG_MASK]; + vd->vdev_trim_offset[txg & TXG_MASK] = 0; + + VERIFY3U(vd->vdev_leaf_zap, !=, 0); + + objset_t *mos = vd->vdev_spa->spa_meta_objset; + + if (last_offset > 0 || vd->vdev_trim_last_offset == UINT64_MAX) { + + if (vd->vdev_trim_last_offset == UINT64_MAX) + last_offset = 0; + + vd->vdev_trim_last_offset = last_offset; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_TRIM_LAST_OFFSET, + sizeof (last_offset), 1, &last_offset, tx)); + } + + if (vd->vdev_trim_action_time > 0) { + uint64_t val = (uint64_t)vd->vdev_trim_action_time; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_TRIM_ACTION_TIME, sizeof (val), + 1, &val, tx)); + } + + if (vd->vdev_trim_rate > 0) { + uint64_t rate = (uint64_t)vd->vdev_trim_rate; + + if (rate == UINT64_MAX) + rate = 0; + + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, + VDEV_LEAF_ZAP_TRIM_RATE, sizeof (rate), 1, &rate, tx)); + } + + uint64_t partial = vd->vdev_trim_partial; + if (partial == UINT64_MAX) + partial = 0; + + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL, + sizeof (partial), 1, &partial, tx)); + + uint64_t secure = vd->vdev_trim_secure; + if (secure == UINT64_MAX) + secure = 0; + + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE, + sizeof (secure), 1, &secure, tx)); + + + uint64_t trim_state = vd->vdev_trim_state; + VERIFY0(zap_update(mos, vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE, + sizeof (trim_state), 1, &trim_state, tx)); +} + +/* + * Update the on-disk state of a manual TRIM. This is called to request + * that a TRIM be started/suspended/canceled, or to change one of the + * TRIM options (partial, secure, rate). + */ +static void +vdev_trim_change_state(vdev_t *vd, vdev_trim_state_t new_state, + uint64_t rate, boolean_t partial, boolean_t secure) +{ + ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); + spa_t *spa = vd->vdev_spa; + + if (new_state == vd->vdev_trim_state) + return; + + /* + * Copy the vd's guid, this will be freed by the sync task. + */ + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* + * If we're suspending, then preserve the original start time. + */ + if (vd->vdev_trim_state != VDEV_TRIM_SUSPENDED) { + vd->vdev_trim_action_time = gethrestime_sec(); + } + + /* + * If we're activating, then preserve the requested rate and trim + * method. Setting the last offset and rate to UINT64_MAX is used + * as a sentinel to indicate they should be reset to default values. 
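+ *
+ * The syncing side decodes the sentinel before writing, e.g. (an
+ * illustrative excerpt of vdev_trim_zap_update_sync() above, not
+ * additional code):
+ *
+ *	if (rate == UINT64_MAX)
+ *		rate = 0;	(write the default, not the sentinel)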
+ */ + if (new_state == VDEV_TRIM_ACTIVE) { + if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE || + vd->vdev_trim_state == VDEV_TRIM_CANCELED) { + vd->vdev_trim_last_offset = UINT64_MAX; + vd->vdev_trim_rate = UINT64_MAX; + vd->vdev_trim_partial = UINT64_MAX; + vd->vdev_trim_secure = UINT64_MAX; + } + + if (rate != 0) + vd->vdev_trim_rate = rate; + + if (partial != 0) + vd->vdev_trim_partial = partial; + + if (secure != 0) + vd->vdev_trim_secure = secure; + } + + boolean_t resumed = !!(vd->vdev_trim_state == VDEV_TRIM_SUSPENDED); + vd->vdev_trim_state = new_state; + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_sync_task_nowait(spa_get_dsl(spa), vdev_trim_zap_update_sync, + guid, 2, ZFS_SPACE_CHECK_NONE, tx); + + switch (new_state) { + case VDEV_TRIM_ACTIVE: + spa_event_notify(spa, vd, NULL, + resumed ? ESC_ZFS_TRIM_RESUME : ESC_ZFS_TRIM_START); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s activated", vd->vdev_path); + break; + case VDEV_TRIM_SUSPENDED: + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_SUSPEND); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s suspended", vd->vdev_path); + break; + case VDEV_TRIM_CANCELED: + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_CANCEL); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s canceled", vd->vdev_path); + break; + case VDEV_TRIM_COMPLETE: + spa_event_notify(spa, vd, NULL, ESC_ZFS_TRIM_FINISH); + spa_history_log_internal(spa, "trim", tx, + "vdev=%s complete", vd->vdev_path); + break; + default: + panic("invalid state %llu", (unsigned long long)new_state); + } + + dmu_tx_commit(tx); + + if (new_state != VDEV_TRIM_ACTIVE) + spa_notify_waiters(spa); +} + +/* + * The zio_done_func_t done callback for each manual TRIM issued. It is + * responsible for updating the TRIM stats, reissuing failed TRIM I/Os, + * and limiting the number of in flight TRIM I/Os. + */ +static void +vdev_trim_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + mutex_enter(&vd->vdev_trim_io_lock); + if (zio->io_error == ENXIO && !vdev_writeable(vd)) { + /* + * The I/O failed because the vdev was unavailable; roll the + * last offset back. (This works because spa_sync waits on + * spa_txg_zio before it runs sync tasks.) + */ + uint64_t *offset = + &vd->vdev_trim_offset[zio->io_txg & TXG_MASK]; + *offset = MIN(*offset, zio->io_offset); + } else { + if (zio->io_error != 0) { + vd->vdev_stat.vs_trim_errors++; + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, + 0, 0, 0, 0, 1, zio->io_orig_size); + } else { + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_MANUAL, + 1, zio->io_orig_size, 0, 0, 0, 0); + } + + vd->vdev_trim_bytes_done += zio->io_orig_size; + } + + ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_MANUAL], >, 0); + vd->vdev_trim_inflight[TRIM_TYPE_MANUAL]--; + cv_broadcast(&vd->vdev_trim_io_cv); + mutex_exit(&vd->vdev_trim_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * The zio_done_func_t done callback for each automatic TRIM issued. It + * is responsible for updating the TRIM stats and limiting the number of + * in flight TRIM I/Os. Automatic TRIM I/Os are best effort and are + * never reissued on failure. 
+ */ +static void +vdev_autotrim_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + mutex_enter(&vd->vdev_trim_io_lock); + + if (zio->io_error != 0) { + vd->vdev_stat.vs_trim_errors++; + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, + 0, 0, 0, 0, 1, zio->io_orig_size); + } else { + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_AUTO, + 1, zio->io_orig_size, 0, 0, 0, 0); + } + + ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_AUTO], >, 0); + vd->vdev_trim_inflight[TRIM_TYPE_AUTO]--; + cv_broadcast(&vd->vdev_trim_io_cv); + mutex_exit(&vd->vdev_trim_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} + +/* + * The zio_done_func_t done callback for each TRIM issued via + * vdev_trim_simple(). It is responsible for updating the TRIM stats and + * limiting the number of in flight TRIM I/Os. Simple TRIM I/Os are best + * effort and are never reissued on failure. + */ +static void +vdev_trim_simple_cb(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + mutex_enter(&vd->vdev_trim_io_lock); + + if (zio->io_error != 0) { + vd->vdev_stat.vs_trim_errors++; + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 0, 0, 0, 0, 1, zio->io_orig_size); + } else { + spa_iostats_trim_add(vd->vdev_spa, TRIM_TYPE_SIMPLE, + 1, zio->io_orig_size, 0, 0, 0, 0); + } + + ASSERT3U(vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE], >, 0); + vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE]--; + cv_broadcast(&vd->vdev_trim_io_cv); + mutex_exit(&vd->vdev_trim_io_lock); + + spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd); +} +/* + * Returns the average trim rate in bytes/sec for the ta->trim_vdev. + */ +static uint64_t +vdev_trim_calculate_rate(trim_args_t *ta) +{ + return (ta->trim_bytes_done * 1000 / + (NSEC2MSEC(gethrtime() - ta->trim_start_time) + 1)); +} + +/* + * Issues a physical TRIM and takes care of rate limiting (bytes/sec) + * and number of concurrent TRIM I/Os. + */ +static int +vdev_trim_range(trim_args_t *ta, uint64_t start, uint64_t size) +{ + vdev_t *vd = ta->trim_vdev; + spa_t *spa = vd->vdev_spa; + void *cb; + + mutex_enter(&vd->vdev_trim_io_lock); + + /* + * Limit manual TRIM I/Os to the requested rate. This does not + * apply to automatic TRIM since no per vdev rate can be specified. + */ + if (ta->trim_type == TRIM_TYPE_MANUAL) { + while (vd->vdev_trim_rate != 0 && !vdev_trim_should_stop(vd) && + vdev_trim_calculate_rate(ta) > vd->vdev_trim_rate) { + cv_timedwait_sig(&vd->vdev_trim_io_cv, + &vd->vdev_trim_io_lock, ddi_get_lbolt() + + MSEC_TO_TICK(10)); + } + } + ta->trim_bytes_done += size; + + /* Limit in flight trimming I/Os */ + while (vd->vdev_trim_inflight[0] + vd->vdev_trim_inflight[1] + + vd->vdev_trim_inflight[2] >= zfs_trim_queue_limit) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + vd->vdev_trim_inflight[ta->trim_type]++; + mutex_exit(&vd->vdev_trim_io_lock); + + dmu_tx_t *tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + uint64_t txg = dmu_tx_get_txg(tx); + + spa_config_enter(spa, SCL_STATE_ALL, vd, RW_READER); + mutex_enter(&vd->vdev_trim_lock); + + if (ta->trim_type == TRIM_TYPE_MANUAL && + vd->vdev_trim_offset[txg & TXG_MASK] == 0) { + uint64_t *guid = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); + *guid = vd->vdev_guid; + + /* This is the first write of this txg. */ + dsl_sync_task_nowait(spa_get_dsl(spa), + vdev_trim_zap_update_sync, guid, 2, + ZFS_SPACE_CHECK_RESERVED, tx); + } + + /* + * We know the vdev_t will still be around since all consumers of + * vdev_free must stop the trimming first. 
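+ *
+ * i.e. teardown paths are expected to order these calls as in this
+ * sketch (illustrative only, not additional code in this change):
+ *
+ *	mutex_enter(&vd->vdev_trim_lock);
+ *	vdev_trim_stop(vd, VDEV_TRIM_CANCELED, NULL);	(blocks for thread)
+ *	mutex_exit(&vd->vdev_trim_lock);
+ *	vdev_free(vd);					(now safe)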
+ */
+	if ((ta->trim_type == TRIM_TYPE_MANUAL &&
+	    vdev_trim_should_stop(vd)) ||
+	    (ta->trim_type == TRIM_TYPE_AUTO &&
+	    vdev_autotrim_should_stop(vd->vdev_top))) {
+		mutex_enter(&vd->vdev_trim_io_lock);
+		vd->vdev_trim_inflight[ta->trim_type]--;
+		mutex_exit(&vd->vdev_trim_io_lock);
+		spa_config_exit(vd->vdev_spa, SCL_STATE_ALL, vd);
+		mutex_exit(&vd->vdev_trim_lock);
+		dmu_tx_commit(tx);
+		return (SET_ERROR(EINTR));
+	}
+	mutex_exit(&vd->vdev_trim_lock);
+
+	if (ta->trim_type == TRIM_TYPE_MANUAL)
+		vd->vdev_trim_offset[txg & TXG_MASK] = start + size;
+
+	if (ta->trim_type == TRIM_TYPE_MANUAL) {
+		cb = vdev_trim_cb;
+	} else if (ta->trim_type == TRIM_TYPE_AUTO) {
+		cb = vdev_autotrim_cb;
+	} else {
+		cb = vdev_trim_simple_cb;
+	}
+
+	zio_nowait(zio_trim(spa->spa_txg_zio[txg & TXG_MASK], vd,
+	    start, size, cb, NULL, ZIO_PRIORITY_TRIM, ZIO_FLAG_CANFAIL,
+	    ta->trim_flags));
+	/* vdev_trim_cb and vdev_autotrim_cb release SCL_STATE_ALL */
+
+	dmu_tx_commit(tx);
+
+	return (0);
+}
+
+/*
+ * Issues TRIM I/Os for all ranges in the provided ta->trim_tree range tree.
+ * Additional parameters describing how the TRIM should be performed must
+ * be set in the trim_args structure. See the trim_args definition for
+ * additional information.
+ */
+static int
+vdev_trim_ranges(trim_args_t *ta)
+{
+	vdev_t *vd = ta->trim_vdev;
+	zfs_btree_t *t = &ta->trim_tree->rt_root;
+	zfs_btree_index_t idx;
+	uint64_t extent_bytes_max = ta->trim_extent_bytes_max;
+	uint64_t extent_bytes_min = ta->trim_extent_bytes_min;
+	spa_t *spa = vd->vdev_spa;
+
+	ta->trim_start_time = gethrtime();
+	ta->trim_bytes_done = 0;
+
+	for (range_seg_t *rs = zfs_btree_first(t, &idx); rs != NULL;
+	    rs = zfs_btree_next(t, &idx, &idx)) {
+		uint64_t size = rs_get_end(rs, ta->trim_tree) - rs_get_start(rs,
+		    ta->trim_tree);
+
+		if (extent_bytes_min && size < extent_bytes_min) {
+			spa_iostats_trim_add(spa, ta->trim_type,
+			    0, 0, 1, size, 0, 0);
+			continue;
+		}
+
+		/* Split range into legally-sized physical chunks */
+		uint64_t writes_required = ((size - 1) / extent_bytes_max) + 1;
+
+		for (uint64_t w = 0; w < writes_required; w++) {
+			int error;
+
+			error = vdev_trim_range(ta, VDEV_LABEL_START_SIZE +
+			    rs_get_start(rs, ta->trim_tree) +
+			    (w * extent_bytes_max), MIN(size -
+			    (w * extent_bytes_max), extent_bytes_max));
+			if (error != 0) {
+				return (error);
+			}
+		}
+	}
+
+	return (0);
+}
+
+/*
+ * Calculates the completion percentage of a manual TRIM.
+ */
+static void
+vdev_trim_calculate_progress(vdev_t *vd)
+{
+	ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) ||
+	    spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER));
+	ASSERT(vd->vdev_leaf_zap != 0);
+
+	vd->vdev_trim_bytes_est = 0;
+	vd->vdev_trim_bytes_done = 0;
+
+	for (uint64_t i = 0; i < vd->vdev_top->vdev_ms_count; i++) {
+		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+		mutex_enter(&msp->ms_lock);
+
+		uint64_t ms_free = msp->ms_size -
+		    metaslab_allocated_space(msp);
+
+		if (vd->vdev_top->vdev_ops == &vdev_raidz_ops)
+			ms_free /= vd->vdev_top->vdev_children;
+
+		/*
+		 * Convert the metaslab range to a physical range
+		 * on our vdev. We use this to determine if we are
+		 * in the middle of this metaslab range.
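+		 *
+		 * Three cases then follow (an illustrative summary of the
+		 * logic below, not additional code):
+		 *
+		 *	last_offset <= phys.rs_start	untouched so far:
+		 *					counts toward the
+		 *					estimate only
+		 *	last_offset > phys.rs_end	fully trimmed: counts
+		 *					toward done and the
+		 *					estimate
+		 *	otherwise			mid-metaslab: load it
+		 *					and walk its free tree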
+ */ + range_seg64_t logical_rs, physical_rs; + logical_rs.rs_start = msp->ms_start; + logical_rs.rs_end = msp->ms_start + msp->ms_size; + vdev_xlate(vd, &logical_rs, &physical_rs); + + if (vd->vdev_trim_last_offset <= physical_rs.rs_start) { + vd->vdev_trim_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } else if (vd->vdev_trim_last_offset > physical_rs.rs_end) { + vd->vdev_trim_bytes_done += ms_free; + vd->vdev_trim_bytes_est += ms_free; + mutex_exit(&msp->ms_lock); + continue; + } + + /* + * If we get here, we're in the middle of trimming this + * metaslab. Load it and walk the free tree for more + * accurate progress estimation. + */ + VERIFY0(metaslab_load(msp)); + + range_tree_t *rt = msp->ms_allocatable; + zfs_btree_t *bt = &rt->rt_root; + zfs_btree_index_t idx; + for (range_seg_t *rs = zfs_btree_first(bt, &idx); + rs != NULL; rs = zfs_btree_next(bt, &idx, &idx)) { + logical_rs.rs_start = rs_get_start(rs, rt); + logical_rs.rs_end = rs_get_end(rs, rt); + vdev_xlate(vd, &logical_rs, &physical_rs); + + uint64_t size = physical_rs.rs_end - + physical_rs.rs_start; + vd->vdev_trim_bytes_est += size; + if (vd->vdev_trim_last_offset >= physical_rs.rs_end) { + vd->vdev_trim_bytes_done += size; + } else if (vd->vdev_trim_last_offset > + physical_rs.rs_start && + vd->vdev_trim_last_offset <= + physical_rs.rs_end) { + vd->vdev_trim_bytes_done += + vd->vdev_trim_last_offset - + physical_rs.rs_start; + } + } + mutex_exit(&msp->ms_lock); + } +} + +/* + * Load from disk the vdev's manual TRIM information. This includes the + * state, progress, and options provided when initiating the manual TRIM. + */ +static int +vdev_trim_load(vdev_t *vd) +{ + int err = 0; + ASSERT(spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_READER) || + spa_config_held(vd->vdev_spa, SCL_CONFIG, RW_WRITER)); + ASSERT(vd->vdev_leaf_zap != 0); + + if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE || + vd->vdev_trim_state == VDEV_TRIM_SUSPENDED) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_LAST_OFFSET, + sizeof (vd->vdev_trim_last_offset), 1, + &vd->vdev_trim_last_offset); + if (err == ENOENT) { + vd->vdev_trim_last_offset = 0; + err = 0; + } + + if (err == 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_RATE, + sizeof (vd->vdev_trim_rate), 1, + &vd->vdev_trim_rate); + if (err == ENOENT) { + vd->vdev_trim_rate = 0; + err = 0; + } + } + + if (err == 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_PARTIAL, + sizeof (vd->vdev_trim_partial), 1, + &vd->vdev_trim_partial); + if (err == ENOENT) { + vd->vdev_trim_partial = 0; + err = 0; + } + } + + if (err == 0) { + err = zap_lookup(vd->vdev_spa->spa_meta_objset, + vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_SECURE, + sizeof (vd->vdev_trim_secure), 1, + &vd->vdev_trim_secure); + if (err == ENOENT) { + vd->vdev_trim_secure = 0; + err = 0; + } + } + } + + vdev_trim_calculate_progress(vd); + + return (err); +} + +/* + * Convert the logical range into a physical range and add it to the + * range tree passed in the trim_args_t. + */ +static void +vdev_trim_range_add(void *arg, uint64_t start, uint64_t size) +{ + trim_args_t *ta = arg; + vdev_t *vd = ta->trim_vdev; + range_seg64_t logical_rs, physical_rs; + logical_rs.rs_start = start; + logical_rs.rs_end = start + size; + + /* + * Every range to be trimmed must be part of ms_allocatable. + * When ZFS_DEBUG_TRIM is set load the metaslab to verify this + * is always the case. 
+ */
+	if (zfs_flags & ZFS_DEBUG_TRIM) {
+		metaslab_t *msp = ta->trim_msp;
+		VERIFY0(metaslab_load(msp));
+		VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+		VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
+	}
+
+	ASSERT(vd->vdev_ops->vdev_op_leaf);
+	vdev_xlate(vd, &logical_rs, &physical_rs);
+
+	IMPLY(vd->vdev_top == vd,
+	    logical_rs.rs_start == physical_rs.rs_start);
+	IMPLY(vd->vdev_top == vd,
+	    logical_rs.rs_end == physical_rs.rs_end);
+
+	/*
+	 * Only a manual trim will be traversing the vdev sequentially.
+	 * For an auto trim all valid ranges should be added.
+	 */
+	if (ta->trim_type == TRIM_TYPE_MANUAL) {
+
+		/* Only add segments that we have not visited yet */
+		if (physical_rs.rs_end <= vd->vdev_trim_last_offset)
+			return;
+
+		/* Pick up where we left off mid-range. */
+		if (vd->vdev_trim_last_offset > physical_rs.rs_start) {
+			ASSERT3U(physical_rs.rs_end, >,
+			    vd->vdev_trim_last_offset);
+			physical_rs.rs_start = vd->vdev_trim_last_offset;
+		}
+	}
+
+	ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start);
+
+	/*
+	 * With raidz, it's possible that the logical range does not live on
+	 * this leaf vdev. We only add the physical range to this vdev's tree
+	 * if it has a length greater than 0.
+	 */
+	if (physical_rs.rs_end > physical_rs.rs_start) {
+		range_tree_add(ta->trim_tree, physical_rs.rs_start,
+		    physical_rs.rs_end - physical_rs.rs_start);
+	} else {
+		ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start);
+	}
+}
+
+/*
+ * Each manual TRIM thread is responsible for trimming the unallocated
+ * space of its leaf vdev. This is accomplished by sequentially iterating
+ * over its top-level metaslabs and issuing TRIM I/O for the space described
+ * by its ms_allocatable. While a metaslab is undergoing trimming, it is
+ * not eligible for new allocations.
+ */
+static void
+vdev_trim_thread(void *arg)
+{
+	vdev_t *vd = arg;
+	spa_t *spa = vd->vdev_spa;
+	trim_args_t ta;
+	int error = 0;
+
+	/*
+	 * The VDEV_LEAF_ZAP_TRIM_* entries may have been updated by
+	 * vdev_trim(). Wait for the updated values to be reflected
+	 * in the zap in order to start with the requested settings.
+	 */
+	txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0);
+
+	ASSERT(vdev_is_concrete(vd));
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	vd->vdev_trim_last_offset = 0;
+	vd->vdev_trim_rate = 0;
+	vd->vdev_trim_partial = 0;
+	vd->vdev_trim_secure = 0;
+
+	VERIFY0(vdev_trim_load(vd));
+
+	ta.trim_vdev = vd;
+	ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max;
+	ta.trim_extent_bytes_min = zfs_trim_extent_bytes_min;
+	ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0);
+	ta.trim_type = TRIM_TYPE_MANUAL;
+	ta.trim_flags = 0;
+
+	/*
+	 * When a secure TRIM has been requested infer that the intent
+	 * is that everything must be trimmed. Override the default
+	 * minimum TRIM size to prevent ranges from being skipped.
+	 */
+	if (vd->vdev_trim_secure) {
+		ta.trim_flags |= ZIO_TRIM_SECURE;
+		ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE;
+	}
+
+	uint64_t ms_count = 0;
+	for (uint64_t i = 0; !vd->vdev_detached &&
+	    i < vd->vdev_top->vdev_ms_count; i++) {
+		metaslab_t *msp = vd->vdev_top->vdev_ms[i];
+
+		/*
+		 * If we've expanded the top-level vdev or it's our
+		 * first pass, calculate our progress.
+ */ + if (vd->vdev_top->vdev_ms_count != ms_count) { + vdev_trim_calculate_progress(vd); + ms_count = vd->vdev_top->vdev_ms_count; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + metaslab_disable(msp); + mutex_enter(&msp->ms_lock); + VERIFY0(metaslab_load(msp)); + + /* + * If a partial TRIM was requested skip metaslabs which have + * never been initialized and thus have never been written. + */ + if (msp->ms_sm == NULL && vd->vdev_trim_partial) { + mutex_exit(&msp->ms_lock); + metaslab_enable(msp, B_FALSE, B_FALSE); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + vdev_trim_calculate_progress(vd); + continue; + } + + ta.trim_msp = msp; + range_tree_walk(msp->ms_allocatable, vdev_trim_range_add, &ta); + range_tree_vacate(msp->ms_trim, NULL, NULL); + mutex_exit(&msp->ms_lock); + + error = vdev_trim_ranges(&ta); + metaslab_enable(msp, B_TRUE, B_FALSE); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + if (error != 0) + break; + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[0] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_destroy(ta.trim_tree); + + mutex_enter(&vd->vdev_trim_lock); + if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { + vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } + ASSERT(vd->vdev_trim_thread != NULL || vd->vdev_trim_inflight[0] == 0); + + /* + * Drop the vdev_trim_lock while we sync out the txg since it's + * possible that a device might be trying to come online and must + * check to see if it needs to restart a trim. That thread will be + * holding the spa_config_lock which would prevent the txg_wait_synced + * from completing. + */ + mutex_exit(&vd->vdev_trim_lock); + txg_wait_synced(spa_get_dsl(spa), 0); + mutex_enter(&vd->vdev_trim_lock); + + vd->vdev_trim_thread = NULL; + cv_broadcast(&vd->vdev_trim_cv); + mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); +} + +/* + * Initiates a manual TRIM for the vdev_t. Callers must hold vdev_trim_lock, + * the vdev_t must be a leaf and cannot already be manually trimming. + */ +void +vdev_trim(vdev_t *vd, uint64_t rate, boolean_t partial, boolean_t secure) +{ + ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_trim_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, rate, partial, secure); + vd->vdev_trim_thread = thread_create(NULL, 0, + vdev_trim_thread, vd, 0, &p0, TS_RUN, maxclsyspri); +} + +/* + * Wait for the trimming thread to be terminated (canceled or stopped). + */ +static void +vdev_trim_stop_wait_impl(vdev_t *vd) +{ + ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); + + while (vd->vdev_trim_thread != NULL) + cv_wait(&vd->vdev_trim_cv, &vd->vdev_trim_lock); + + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + vd->vdev_trim_exit_wanted = B_FALSE; +} + +/* + * Wait for vdev trim threads which were listed to cleanly exit. 
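+ *
+ * Typical two-phase use (a sketch for illustration; see
+ * vdev_trim_stop_all() below for the real caller):
+ *
+ *	list_create(&vd_list, sizeof (vdev_t),
+ *	    offsetof(vdev_t, vdev_trim_node));
+ *	mutex_enter(&vd->vdev_trim_lock);
+ *	vdev_trim_stop(vd, VDEV_TRIM_SUSPENDED, &vd_list);	(queue stop)
+ *	mutex_exit(&vd->vdev_trim_lock);
+ *	vdev_trim_stop_wait(spa, &vd_list);	(block until all exited)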
+ */ +void +vdev_trim_stop_wait(spa_t *spa, list_t *vd_list) +{ + vdev_t *vd; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + while ((vd = list_remove_head(vd_list)) != NULL) { + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_stop_wait_impl(vd); + mutex_exit(&vd->vdev_trim_lock); + } +} + +/* + * Stop trimming a device, with the resultant trimming state being tgt_state. + * For blocking behavior pass NULL for vd_list. Otherwise, when a list_t is + * provided the stopping vdev is inserted in to the list. Callers are then + * required to call vdev_trim_stop_wait() to block for all the trim threads + * to exit. The caller must hold vdev_trim_lock and must not be writing to + * the spa config, as the trimming thread may try to enter the config as a + * reader before exiting. + */ +void +vdev_trim_stop(vdev_t *vd, vdev_trim_state_t tgt_state, list_t *vd_list) +{ + ASSERT(!spa_config_held(vd->vdev_spa, SCL_CONFIG|SCL_STATE, RW_WRITER)); + ASSERT(MUTEX_HELD(&vd->vdev_trim_lock)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + + /* + * Allow cancel requests to proceed even if the trim thread has + * stopped. + */ + if (vd->vdev_trim_thread == NULL && tgt_state != VDEV_TRIM_CANCELED) + return; + + vdev_trim_change_state(vd, tgt_state, 0, 0, 0); + vd->vdev_trim_exit_wanted = B_TRUE; + + if (vd_list == NULL) { + vdev_trim_stop_wait_impl(vd); + } else { + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + list_insert_tail(vd_list, vd); + } +} + +/* + * Requests that all listed vdevs stop trimming. + */ +static void +vdev_trim_stop_all_impl(vdev_t *vd, vdev_trim_state_t tgt_state, + list_t *vd_list) +{ + if (vd->vdev_ops->vdev_op_leaf && vdev_is_concrete(vd)) { + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_stop(vd, tgt_state, vd_list); + mutex_exit(&vd->vdev_trim_lock); + return; + } + + for (uint64_t i = 0; i < vd->vdev_children; i++) { + vdev_trim_stop_all_impl(vd->vdev_child[i], tgt_state, + vd_list); + } +} + +/* + * Convenience function to stop trimming of a vdev tree and set all trim + * thread pointers to NULL. + */ +void +vdev_trim_stop_all(vdev_t *vd, vdev_trim_state_t tgt_state) +{ + spa_t *spa = vd->vdev_spa; + list_t vd_list; + vdev_t *vd_l2cache; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + list_create(&vd_list, sizeof (vdev_t), + offsetof(vdev_t, vdev_trim_node)); + + vdev_trim_stop_all_impl(vd, tgt_state, &vd_list); + + /* + * Iterate over cache devices and request stop trimming the + * whole device in case we export the pool or remove the cache + * device prematurely. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vd_l2cache = spa->spa_l2cache.sav_vdevs[i]; + vdev_trim_stop_all_impl(vd_l2cache, tgt_state, &vd_list); + } + + vdev_trim_stop_wait(spa, &vd_list); + + if (vd->vdev_spa->spa_sync_on) { + /* Make sure that our state has been synced to disk */ + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + } + + list_destroy(&vd_list); +} + +/* + * Conditionally restarts a manual TRIM given its on-disk state. 
+ */
+void
+vdev_trim_restart(vdev_t *vd)
+{
+	ASSERT(MUTEX_HELD(&spa_namespace_lock));
+	ASSERT(!spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER));
+
+	if (vd->vdev_leaf_zap != 0) {
+		mutex_enter(&vd->vdev_trim_lock);
+		uint64_t trim_state = VDEV_TRIM_NONE;
+		int err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_STATE,
+		    sizeof (trim_state), 1, &trim_state);
+		ASSERT(err == 0 || err == ENOENT);
+		vd->vdev_trim_state = trim_state;
+
+		uint64_t timestamp = 0;
+		err = zap_lookup(vd->vdev_spa->spa_meta_objset,
+		    vd->vdev_leaf_zap, VDEV_LEAF_ZAP_TRIM_ACTION_TIME,
+		    sizeof (timestamp), 1, &timestamp);
+		ASSERT(err == 0 || err == ENOENT);
+		vd->vdev_trim_action_time = timestamp;
+
+		if (vd->vdev_trim_state == VDEV_TRIM_SUSPENDED ||
+		    vd->vdev_offline) {
+			/* load progress for reporting, but don't resume */
+			VERIFY0(vdev_trim_load(vd));
+		} else if (vd->vdev_trim_state == VDEV_TRIM_ACTIVE &&
+		    vdev_writeable(vd) && !vd->vdev_top->vdev_removing &&
+		    vd->vdev_trim_thread == NULL) {
+			VERIFY0(vdev_trim_load(vd));
+			vdev_trim(vd, vd->vdev_trim_rate,
+			    vd->vdev_trim_partial, vd->vdev_trim_secure);
+		}
+
+		mutex_exit(&vd->vdev_trim_lock);
+	}
+
+	for (uint64_t i = 0; i < vd->vdev_children; i++) {
+		vdev_trim_restart(vd->vdev_child[i]);
+	}
+}
+
+/*
+ * Used by the automatic TRIM when ZFS_DEBUG_TRIM is set to verify that
+ * every TRIM range is contained within ms_allocatable.
+ */
+static void
+vdev_trim_range_verify(void *arg, uint64_t start, uint64_t size)
+{
+	trim_args_t *ta = arg;
+	metaslab_t *msp = ta->trim_msp;
+
+	VERIFY3B(msp->ms_loaded, ==, B_TRUE);
+	VERIFY3U(msp->ms_disabled, >, 0);
+	VERIFY(range_tree_contains(msp->ms_allocatable, start, size));
+}
+
+/*
+ * Each automatic TRIM thread is responsible for managing the trimming of a
+ * top-level vdev in the pool. No automatic TRIM state is maintained on-disk.
+ *
+ * N.B. This behavior is different from a manual TRIM where a thread
+ * is created for each leaf vdev, instead of each top-level vdev.
+ */
+static void
+vdev_autotrim_thread(void *arg)
+{
+	vdev_t *vd = arg;
+	spa_t *spa = vd->vdev_spa;
+	int shift = 0;
+
+	mutex_enter(&vd->vdev_autotrim_lock);
+	ASSERT3P(vd->vdev_top, ==, vd);
+	ASSERT3P(vd->vdev_autotrim_thread, !=, NULL);
+	mutex_exit(&vd->vdev_autotrim_lock);
+	spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+	uint64_t extent_bytes_max = zfs_trim_extent_bytes_max;
+	uint64_t extent_bytes_min = zfs_trim_extent_bytes_min;
+
+	while (!vdev_autotrim_should_stop(vd)) {
+		int txgs_per_trim = MAX(zfs_trim_txg_batch, 1);
+		boolean_t issued_trim = B_FALSE;
+
+		/*
+		 * All of the metaslabs are divided into groups of size
+		 * num_metaslabs / zfs_trim_txg_batch. Each of these groups
+		 * is composed of metaslabs which are spread evenly over the
+		 * device.
+		 *
+		 * For example, when zfs_trim_txg_batch = 32 (default) then
+		 * group 0 will contain metaslabs 0, 32, 64, ...;
+		 * group 1 will contain metaslabs 1, 33, 65, ...;
+		 * group 2 will contain metaslabs 2, 34, 66, ...; and so on.
+		 *
+		 * On each pass through the while() loop one of these groups
+		 * is selected. This is accomplished by using a shift value
+		 * to select the starting metaslab, then striding over the
+		 * metaslabs using the zfs_trim_txg_batch size. This is
+		 * done to accomplish two things.
+		 *
+		 * 1) By dividing the metaslabs into groups, and making sure
+		 *    that each group takes a minimum of one txg to process,
+		 *    zfs_trim_txg_batch controls the minimum number of txgs
+		 *    which must occur before a metaslab is revisited.
+		 *
+		 * 2) Selecting non-consecutive metaslabs distributes the
+		 *    TRIM commands for a group evenly over the entire device.
+		 *    This can be advantageous for certain types of devices.
+		 */
+		for (uint64_t i = shift % txgs_per_trim; i < vd->vdev_ms_count;
+		    i += txgs_per_trim) {
+			metaslab_t *msp = vd->vdev_ms[i];
+			range_tree_t *trim_tree;
+
+			spa_config_exit(spa, SCL_CONFIG, FTAG);
+			metaslab_disable(msp);
+			spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER);
+
+			mutex_enter(&msp->ms_lock);
+
+			/*
+			 * Skip the metaslab when it has never been allocated
+			 * or when there are no recent frees to trim.
+			 */
+			if (msp->ms_sm == NULL ||
+			    range_tree_is_empty(msp->ms_trim)) {
+				mutex_exit(&msp->ms_lock);
+				metaslab_enable(msp, B_FALSE, B_FALSE);
+				continue;
+			}
+
+			/*
+			 * Skip the metaslab when it has already been disabled.
+			 * This may happen when a manual TRIM or initialize
+			 * operation is running concurrently. In the case
+			 * of a manual TRIM, the ms_trim tree will have been
+			 * vacated. Only ranges added after the manual TRIM
+			 * disabled the metaslab will be included in the tree.
+			 * These will be processed when the automatic TRIM
+			 * next revisits this metaslab.
+			 */
+			if (msp->ms_disabled > 1) {
+				mutex_exit(&msp->ms_lock);
+				metaslab_enable(msp, B_FALSE, B_FALSE);
+				continue;
+			}
+
+			/*
+			 * Allocate an empty range tree which is swapped in
+			 * for the existing ms_trim tree while it is processed.
+			 */
+			trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL,
+			    0, 0);
+			range_tree_swap(&msp->ms_trim, &trim_tree);
+			ASSERT(range_tree_is_empty(msp->ms_trim));
+
+			/*
+			 * There are two cases when constructing the per-vdev
+			 * trim trees for a metaslab. If the top-level vdev
+			 * has no children then it is also a leaf and should
+			 * be trimmed. Otherwise our children are the leaves
+			 * and a trim tree should be constructed for each.
+			 */
+			trim_args_t *tap;
+			uint64_t children = vd->vdev_children;
+			if (children == 0) {
+				children = 1;
+				tap = kmem_zalloc(sizeof (trim_args_t) *
+				    children, KM_SLEEP);
+				tap[0].trim_vdev = vd;
+			} else {
+				tap = kmem_zalloc(sizeof (trim_args_t) *
+				    children, KM_SLEEP);
+
+				for (uint64_t c = 0; c < children; c++) {
+					tap[c].trim_vdev = vd->vdev_child[c];
+				}
+			}
+
+			for (uint64_t c = 0; c < children; c++) {
+				trim_args_t *ta = &tap[c];
+				vdev_t *cvd = ta->trim_vdev;
+
+				ta->trim_msp = msp;
+				ta->trim_extent_bytes_max = extent_bytes_max;
+				ta->trim_extent_bytes_min = extent_bytes_min;
+				ta->trim_type = TRIM_TYPE_AUTO;
+				ta->trim_flags = 0;
+
+				if (cvd->vdev_detached ||
+				    !vdev_writeable(cvd) ||
+				    !cvd->vdev_has_trim ||
+				    cvd->vdev_trim_thread != NULL) {
+					continue;
+				}
+
+				/*
+				 * When a device has an attached hot spare, or
+				 * is being replaced it will not be trimmed.
+				 * This is done to avoid adding additional
+				 * stress to a potentially unhealthy device,
+				 * and to minimize the required rebuild time.
+				 */
+				if (!cvd->vdev_ops->vdev_op_leaf)
+					continue;
+
+				ta->trim_tree = range_tree_create(NULL,
+				    RANGE_SEG64, NULL, 0, 0);
+				range_tree_walk(trim_tree,
+				    vdev_trim_range_add, ta);
+			}
+
+			mutex_exit(&msp->ms_lock);
+			spa_config_exit(spa, SCL_CONFIG, FTAG);
+
+			/*
+			 * Issue the TRIM I/Os for all ranges covered by the
+			 * TRIM trees. These ranges are safe to TRIM because
+			 * no new allocations will be performed until the call
+			 * to metaslab_enable() below.
+			 */
+			for (uint64_t c = 0; c < children; c++) {
+				trim_args_t *ta = &tap[c];
+
+				/*
+				 * Always yield to a manual TRIM if one has
+				 * been started for the child vdev.
+ */ + if (ta->trim_tree == NULL || + ta->trim_vdev->vdev_trim_thread != NULL) { + continue; + } + + /* + * After this point metaslab_enable() must be + * called with the sync flag set. This is done + * here because vdev_trim_ranges() is allowed + * to be interrupted (EINTR) before issuing all + * of the required TRIM I/Os. + */ + issued_trim = B_TRUE; + + int error = vdev_trim_ranges(ta); + if (error) + break; + } + + /* + * Verify every range which was trimmed is still + * contained within the ms_allocatable tree. + */ + if (zfs_flags & ZFS_DEBUG_TRIM) { + mutex_enter(&msp->ms_lock); + VERIFY0(metaslab_load(msp)); + VERIFY3P(tap[0].trim_msp, ==, msp); + range_tree_walk(trim_tree, + vdev_trim_range_verify, &tap[0]); + mutex_exit(&msp->ms_lock); + } + + range_tree_vacate(trim_tree, NULL, NULL); + range_tree_destroy(trim_tree); + + metaslab_enable(msp, issued_trim, B_FALSE); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + for (uint64_t c = 0; c < children; c++) { + trim_args_t *ta = &tap[c]; + + if (ta->trim_tree == NULL) + continue; + + range_tree_vacate(ta->trim_tree, NULL, NULL); + range_tree_destroy(ta->trim_tree); + } + + kmem_free(tap, sizeof (trim_args_t) * children); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * After completing the group of metaslabs wait for the next + * open txg. This is done to make sure that a minimum of + * zfs_trim_txg_batch txgs will occur before these metaslabs + * are trimmed again. + */ + txg_wait_open(spa_get_dsl(spa), 0, issued_trim); + + shift++; + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + } + + for (uint64_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_trim_io_lock); + + while (cvd->vdev_trim_inflight[1] > 0) { + cv_wait(&cvd->vdev_trim_io_cv, + &cvd->vdev_trim_io_lock); + } + mutex_exit(&cvd->vdev_trim_io_lock); + } + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + /* + * When exiting because the autotrim property was set to off, then + * abandon any unprocessed ms_trim ranges to reclaim the memory. + */ + if (spa_get_autotrim(spa) == SPA_AUTOTRIM_OFF) { + for (uint64_t i = 0; i < vd->vdev_ms_count; i++) { + metaslab_t *msp = vd->vdev_ms[i]; + + mutex_enter(&msp->ms_lock); + range_tree_vacate(msp->ms_trim, NULL, NULL); + mutex_exit(&msp->ms_lock); + } + } + + mutex_enter(&vd->vdev_autotrim_lock); + ASSERT(vd->vdev_autotrim_thread != NULL); + vd->vdev_autotrim_thread = NULL; + cv_broadcast(&vd->vdev_autotrim_cv); + mutex_exit(&vd->vdev_autotrim_lock); + + thread_exit(); +} + +/* + * Starts an autotrim thread, if needed, for each top-level vdev which can be + * trimmed. A top-level vdev which has been evacuated will never be trimmed. + */ +void +vdev_autotrim(spa_t *spa) +{ + vdev_t *root_vd = spa->spa_root_vdev; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) { + vdev_t *tvd = root_vd->vdev_child[i]; + + mutex_enter(&tvd->vdev_autotrim_lock); + if (vdev_writeable(tvd) && !tvd->vdev_removing && + tvd->vdev_autotrim_thread == NULL) { + ASSERT3P(tvd->vdev_top, ==, tvd); + + tvd->vdev_autotrim_thread = thread_create(NULL, 0, + vdev_autotrim_thread, tvd, 0, &p0, TS_RUN, + maxclsyspri); + ASSERT(tvd->vdev_autotrim_thread != NULL); + } + mutex_exit(&tvd->vdev_autotrim_lock); + } +} + +/* + * Wait for the vdev_autotrim_thread associated with the passed top-level + * vdev to be terminated (canceled or stopped). 
+ */ +void +vdev_autotrim_stop_wait(vdev_t *tvd) +{ + mutex_enter(&tvd->vdev_autotrim_lock); + if (tvd->vdev_autotrim_thread != NULL) { + tvd->vdev_autotrim_exit_wanted = B_TRUE; + + while (tvd->vdev_autotrim_thread != NULL) { + cv_wait(&tvd->vdev_autotrim_cv, + &tvd->vdev_autotrim_lock); + } + + ASSERT3P(tvd->vdev_autotrim_thread, ==, NULL); + tvd->vdev_autotrim_exit_wanted = B_FALSE; + } + mutex_exit(&tvd->vdev_autotrim_lock); +} + +/* + * Wait for all of the vdev_autotrim_thread associated with the pool to + * be terminated (canceled or stopped). + */ +void +vdev_autotrim_stop_all(spa_t *spa) +{ + vdev_t *root_vd = spa->spa_root_vdev; + + for (uint64_t i = 0; i < root_vd->vdev_children; i++) + vdev_autotrim_stop_wait(root_vd->vdev_child[i]); +} + +/* + * Conditionally restart all of the vdev_autotrim_thread's for the pool. + */ +void +vdev_autotrim_restart(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + if (spa->spa_autotrim) + vdev_autotrim(spa); +} + +static void +vdev_trim_l2arc_thread(void *arg) +{ + vdev_t *vd = arg; + spa_t *spa = vd->vdev_spa; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + trim_args_t ta; + range_seg64_t physical_rs; + + ASSERT(vdev_is_concrete(vd)); + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + vd->vdev_trim_last_offset = 0; + vd->vdev_trim_rate = 0; + vd->vdev_trim_partial = 0; + vd->vdev_trim_secure = 0; + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_MANUAL; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + physical_rs.rs_start = vd->vdev_trim_bytes_done = 0; + physical_rs.rs_end = vd->vdev_trim_bytes_est = + vdev_get_min_asize(vd); + + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + + mutex_enter(&vd->vdev_trim_lock); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + mutex_exit(&vd->vdev_trim_lock); + + (void) vdev_trim_ranges(&ta); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + mutex_enter(&vd->vdev_trim_lock); + if (!vd->vdev_trim_exit_wanted && vdev_writeable(vd)) { + vdev_trim_change_state(vd, VDEV_TRIM_COMPLETE, + vd->vdev_trim_rate, vd->vdev_trim_partial, + vd->vdev_trim_secure); + } + ASSERT(vd->vdev_trim_thread != NULL || + vd->vdev_trim_inflight[TRIM_TYPE_MANUAL] == 0); + + /* + * Drop the vdev_trim_lock while we sync out the txg since it's + * possible that a device might be trying to come online and + * must check to see if it needs to restart a trim. That thread + * will be holding the spa_config_lock which would prevent the + * txg_wait_synced from completing. Same strategy as in + * vdev_trim_thread(). + */ + mutex_exit(&vd->vdev_trim_lock); + txg_wait_synced(spa_get_dsl(vd->vdev_spa), 0); + mutex_enter(&vd->vdev_trim_lock); + + /* + * Update the header of the cache device here, before + * broadcasting vdev_trim_cv which may lead to the removal + * of the device. The same applies for setting l2ad_trim_all to + * false. 
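+	 * Once vdev_trim_cv is broadcast, a waiter in vdev_trim_stop_wait()
+	 * may proceed and the l2arc_dev_t may be freed, so dev must not be
+	 * dereferenced after that point.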
+ */ + spa_config_enter(vd->vdev_spa, SCL_L2ARC, vd, + RW_READER); + bzero(dev->l2ad_dev_hdr, dev->l2ad_dev_hdr_asize); + l2arc_dev_hdr_update(dev); + spa_config_exit(vd->vdev_spa, SCL_L2ARC, vd); + + vd->vdev_trim_thread = NULL; + if (vd->vdev_trim_state == VDEV_TRIM_COMPLETE) + dev->l2ad_trim_all = B_FALSE; + + cv_broadcast(&vd->vdev_trim_cv); + mutex_exit(&vd->vdev_trim_lock); + + thread_exit(); +} + +/* + * Punches out TRIM threads for the L2ARC devices in a spa and assigns them + * to vd->vdev_trim_thread variable. This facilitates the management of + * trimming the whole cache device using TRIM_TYPE_MANUAL upon addition + * to a pool or pool creation or when the header of the device is invalid. + */ +void +vdev_trim_l2arc(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + /* + * Locate the spa's l2arc devices and kick off TRIM threads. + */ + for (int i = 0; i < spa->spa_l2cache.sav_count; i++) { + vdev_t *vd = spa->spa_l2cache.sav_vdevs[i]; + l2arc_dev_t *dev = l2arc_vdev_get(vd); + + if (dev == NULL || !dev->l2ad_trim_all) { + /* + * Don't attempt TRIM if the vdev is UNAVAIL or if the + * cache device was not marked for whole device TRIM + * (ie l2arc_trim_ahead = 0, or the L2ARC device header + * is valid with trim_state = VDEV_TRIM_COMPLETE and + * l2ad_log_entries > 0). + */ + continue; + } + + mutex_enter(&vd->vdev_trim_lock); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(vdev_is_concrete(vd)); + ASSERT3P(vd->vdev_trim_thread, ==, NULL); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_trim_exit_wanted); + ASSERT(!vd->vdev_top->vdev_removing); + vdev_trim_change_state(vd, VDEV_TRIM_ACTIVE, 0, 0, 0); + vd->vdev_trim_thread = thread_create(NULL, 0, + vdev_trim_l2arc_thread, vd, 0, &p0, TS_RUN, maxclsyspri); + mutex_exit(&vd->vdev_trim_lock); + } +} + +/* + * A wrapper which calls vdev_trim_ranges(). It is intended to be called + * on leaf vdevs. 
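+ * The [start, start + size) range is trimmed synchronously: the function
+ * waits for all of its TRIM_TYPE_SIMPLE I/Os to complete before returning,
+ * and unlike a manual TRIM no progress state is persisted.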
+ */ +int +vdev_trim_simple(vdev_t *vd, uint64_t start, uint64_t size) +{ + trim_args_t ta; + range_seg64_t physical_rs; + int error; + physical_rs.rs_start = start; + physical_rs.rs_end = start + size; + + ASSERT(vdev_is_concrete(vd)); + ASSERT(vd->vdev_ops->vdev_op_leaf); + ASSERT(!vd->vdev_detached); + ASSERT(!vd->vdev_top->vdev_removing); + + bzero(&ta, sizeof (ta)); + ta.trim_vdev = vd; + ta.trim_tree = range_tree_create(NULL, RANGE_SEG64, NULL, 0, 0); + ta.trim_type = TRIM_TYPE_SIMPLE; + ta.trim_extent_bytes_max = zfs_trim_extent_bytes_max; + ta.trim_extent_bytes_min = SPA_MINBLOCKSIZE; + ta.trim_flags = 0; + + ASSERT3U(physical_rs.rs_end, >=, physical_rs.rs_start); + + if (physical_rs.rs_end > physical_rs.rs_start) { + range_tree_add(ta.trim_tree, physical_rs.rs_start, + physical_rs.rs_end - physical_rs.rs_start); + } else { + ASSERT3U(physical_rs.rs_end, ==, physical_rs.rs_start); + } + + error = vdev_trim_ranges(&ta); + + mutex_enter(&vd->vdev_trim_io_lock); + while (vd->vdev_trim_inflight[TRIM_TYPE_SIMPLE] > 0) { + cv_wait(&vd->vdev_trim_io_cv, &vd->vdev_trim_io_lock); + } + mutex_exit(&vd->vdev_trim_io_lock); + + range_tree_vacate(ta.trim_tree, NULL, NULL); + range_tree_destroy(ta.trim_tree); + + return (error); +} + +EXPORT_SYMBOL(vdev_trim); +EXPORT_SYMBOL(vdev_trim_stop); +EXPORT_SYMBOL(vdev_trim_stop_all); +EXPORT_SYMBOL(vdev_trim_stop_wait); +EXPORT_SYMBOL(vdev_trim_restart); +EXPORT_SYMBOL(vdev_autotrim); +EXPORT_SYMBOL(vdev_autotrim_stop_all); +EXPORT_SYMBOL(vdev_autotrim_stop_wait); +EXPORT_SYMBOL(vdev_autotrim_restart); +EXPORT_SYMBOL(vdev_trim_l2arc); +EXPORT_SYMBOL(vdev_trim_simple); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_max, UINT, ZMOD_RW, + "Max size of TRIM commands, larger will be split"); + +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, extent_bytes_min, UINT, ZMOD_RW, + "Min size of TRIM commands, smaller will be skipped"); + +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, metaslab_skip, UINT, ZMOD_RW, + "Skip metaslabs which have never been initialized"); + +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, txg_batch, UINT, ZMOD_RW, + "Min number of txgs to aggregate frees before issuing TRIM"); + +ZFS_MODULE_PARAM(zfs_trim, zfs_trim_, queue_limit, UINT, ZMOD_RW, + "Max queued TRIMs outstanding per leaf vdev"); +/* END CSTYLED */ diff --git a/module/zfs/zap.c b/module/zfs/zap.c new file mode 100644 index 000000000000..c0c280c52076 --- /dev/null +++ b/module/zfs/zap.c @@ -0,0 +1,1384 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved. 
+ */
+
+/*
+ * This file contains the top half of the zfs directory structure
+ * implementation. The bottom half is in zap_leaf.c.
+ *
+ * The zdir is an extendable hash data structure. There is a table of
+ * pointers to buckets (zap_t->zd_data->zd_leafs). The buckets are
+ * each a constant size and hold a variable number of directory entries.
+ * The buckets (aka "leaf nodes") are implemented in zap_leaf.c.
+ *
+ * The pointer table holds a power of 2 number of pointers
+ * (1<<zap_t->zd_data->zd_phys->zd_prefix_len). The bucket pointed to
+ * by the pointer at index i in the table holds entries whose hash value
+ * has a zd_prefix_len-bit prefix.
+ */
+
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zfs_znode.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+
+/*
+ * If zap_iterate_prefetch is set, we will prefetch the entire ZAP object
+ * (all leaf blocks) when we start iterating over it.
+ *
+ * For zap_cursor_init(), the callers all intend to iterate through all the
+ * entries. There are a few cases where an error (typically i/o error) could
+ * cause it to bail out early.
+ *
+ * For zap_cursor_init_serialized(), there are callers that do the iteration
+ * outside of ZFS. Typically they would iterate over everything, but we
+ * don't have control of that. E.g. zfs_ioc_snapshot_list_next(),
+ * zcp_snapshots_iter(), and other iterators over things in the MOS - these
+ * are called by /sbin/zfs and channel programs. The other example is
+ * zfs_readdir() which iterates over directory entries for the getdents()
+ * syscall. /sbin/ls iterates to the end (unless it receives a signal), but
+ * userland doesn't have to.
+ *
+ * Given that the ZAP entries aren't returned in a specific order, the only
+ * legitimate use cases for partial iteration would be:
+ *
+ * 1. Pagination: e.g. you only want to display 100 entries at a time, so you
+ *    get the first 100 and then wait for the user to hit "next page", which
+ *    they may never do.
+ *
+ * 2. You want to know if there are more than X entries, without relying on
+ *    the zfs-specific implementation of the directory's st_size (which is
+ *    the number of entries).
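+ *
+ * Neither case is expected to be common, so prefetching defaults to on; it
+ * can be disabled with the zap_iterate_prefetch module parameter below.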
+ */
+int zap_iterate_prefetch = B_TRUE;
+
+int fzap_default_block_shift = 14; /* 16k blocksize */
+
+extern inline zap_phys_t *zap_f_phys(zap_t *zap);
+
+static uint64_t zap_allocate_blocks(zap_t *zap, int nblocks);
+
+void
+fzap_byteswap(void *vbuf, size_t size)
+{
+	uint64_t block_type = *(uint64_t *)vbuf;
+
+	if (block_type == ZBT_LEAF || block_type == BSWAP_64(ZBT_LEAF))
+		zap_leaf_byteswap(vbuf, size);
+	else {
+		/* it's a ptrtbl block */
+		byteswap_uint64_array(vbuf, size);
+	}
+}
+
+void
+fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags)
+{
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	zap->zap_ismicro = FALSE;
+
+	zap->zap_dbu.dbu_evict_func_sync = zap_evict_sync;
+	zap->zap_dbu.dbu_evict_func_async = NULL;
+
+	mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, 0);
+	zap->zap_f.zap_block_shift = highbit64(zap->zap_dbuf->db_size) - 1;
+
+	zap_phys_t *zp = zap_f_phys(zap);
+	/*
+	 * explicitly zero it since it might be coming from an
+	 * initialized microzap
+	 */
+	bzero(zap->zap_dbuf->db_data, zap->zap_dbuf->db_size);
+	zp->zap_block_type = ZBT_HEADER;
+	zp->zap_magic = ZAP_MAGIC;
+
+	zp->zap_ptrtbl.zt_shift = ZAP_EMBEDDED_PTRTBL_SHIFT(zap);
+
+	zp->zap_freeblk = 2;		/* block 1 will be the first leaf */
+	zp->zap_num_leafs = 1;
+	zp->zap_num_entries = 0;
+	zp->zap_salt = zap->zap_salt;
+	zp->zap_normflags = zap->zap_normflags;
+	zp->zap_flags = flags;
+
+	/* block 1 will be the first leaf */
+	for (int i = 0; i < (1<<zp->zap_ptrtbl.zt_shift); i++)
+		ZAP_EMBEDDED_PTRTBL_ENT(zap, i) = 1;
+
+	/*
+	 * set up block 1 - the first leaf
+	 */
+	dmu_buf_t *db;
+	VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object,
+	    1<<FZAP_BLOCK_SHIFT(zap), FTAG, &db, DMU_READ_NO_PREFETCH));
+	dmu_buf_will_dirty(db, tx);
+
+	zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP);
+	l->l_dbuf = db;
+
+	zap_leaf_init(l, zp->zap_normflags != 0);
+
+	kmem_free(l, sizeof (zap_leaf_t));
+	dmu_buf_rele(db, FTAG);
+}
+
+static int
+zap_tryupgradedir(zap_t *zap, dmu_tx_t *tx)
+{
+	if (RW_WRITE_HELD(&zap->zap_rwlock))
+		return (1);
+	if (rw_tryupgrade(&zap->zap_rwlock)) {
+		dmu_buf_will_dirty(zap->zap_dbuf, tx);
+		return (1);
+	}
+	return (0);
+}
+
+/*
+ * Generic routines for dealing with the pointer & cookie tables.
+ */
+
+static int
+zap_table_grow(zap_t *zap, zap_table_phys_t *tbl,
+    void (*transfer_func)(const uint64_t *src, uint64_t *dst, int n),
+    dmu_tx_t *tx)
+{
+	uint64_t newblk;
+	int bs = FZAP_BLOCK_SHIFT(zap);
+	int hepb = 1<<(bs-4);
+	/* hepb = half the number of entries in a block */
+
+	ASSERT(RW_WRITE_HELD(&zap->zap_rwlock));
+	ASSERT(tbl->zt_blk != 0);
+	ASSERT(tbl->zt_numblks > 0);
+
+	if (tbl->zt_nextblk != 0) {
+		newblk = tbl->zt_nextblk;
+	} else {
+		newblk = zap_allocate_blocks(zap, tbl->zt_numblks * 2);
+		tbl->zt_nextblk = newblk;
+		ASSERT0(tbl->zt_blks_copied);
+		dmu_prefetch(zap->zap_objset, zap->zap_object, 0,
+		    tbl->zt_blk << bs, tbl->zt_numblks << bs,
+		    ZIO_PRIORITY_SYNC_READ);
+	}
+
+	/*
+	 * Copy the ptrtbl from the old to new location.
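+	 * Only one block is copied per call; zt_blks_copied records the
+	 * progress so that repeated calls eventually migrate the entire
+	 * table, at which point the old blocks are freed.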
+ */ + + uint64_t b = tbl->zt_blks_copied; + dmu_buf_t *db_old; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + b) << bs, FTAG, &db_old, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + + /* first half of entries in old[b] go to new[2*b+0] */ + dmu_buf_t *db_new; + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+0) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func(db_old->db_data, db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + /* second half of entries in old[b] go to new[2*b+1] */ + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + (newblk + 2*b+1) << bs, FTAG, &db_new, DMU_READ_NO_PREFETCH)); + dmu_buf_will_dirty(db_new, tx); + transfer_func((uint64_t *)db_old->db_data + hepb, + db_new->db_data, hepb); + dmu_buf_rele(db_new, FTAG); + + dmu_buf_rele(db_old, FTAG); + + tbl->zt_blks_copied++; + + dprintf("copied block %llu of %llu\n", + tbl->zt_blks_copied, tbl->zt_numblks); + + if (tbl->zt_blks_copied == tbl->zt_numblks) { + (void) dmu_free_range(zap->zap_objset, zap->zap_object, + tbl->zt_blk << bs, tbl->zt_numblks << bs, tx); + + tbl->zt_blk = newblk; + tbl->zt_numblks *= 2; + tbl->zt_shift++; + tbl->zt_nextblk = 0; + tbl->zt_blks_copied = 0; + + dprintf("finished; numblocks now %llu (%uk entries)\n", + tbl->zt_numblks, 1<<(tbl->zt_shift-10)); + } + + return (0); +} + +static int +zap_table_store(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t val, + dmu_tx_t *tx) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(tbl->zt_blk != 0); + + dprintf("storing %llx at index %llx\n", val, idx); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + dmu_buf_t *db; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db, tx); + + if (tbl->zt_nextblk != 0) { + uint64_t idx2 = idx * 2; + uint64_t blk2 = idx2 >> (bs-3); + uint64_t off2 = idx2 & ((1<<(bs-3))-1); + dmu_buf_t *db2; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (tbl->zt_nextblk + blk2) << bs, FTAG, &db2, + DMU_READ_NO_PREFETCH); + if (err != 0) { + dmu_buf_rele(db, FTAG); + return (err); + } + dmu_buf_will_dirty(db2, tx); + ((uint64_t *)db2->db_data)[off2] = val; + ((uint64_t *)db2->db_data)[off2+1] = val; + dmu_buf_rele(db2, FTAG); + } + + ((uint64_t *)db->db_data)[off] = val; + dmu_buf_rele(db, FTAG); + + return (0); +} + +static int +zap_table_load(zap_t *zap, zap_table_phys_t *tbl, uint64_t idx, uint64_t *valp) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + uint64_t blk = idx >> (bs-3); + uint64_t off = idx & ((1<<(bs-3))-1); + + /* + * Note: this is equivalent to dmu_buf_hold(), but we use + * _dnode_enter / _by_dnode because it's faster because we don't + * have to hold the dnode. + */ + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + dmu_buf_t *db; + int err = dmu_buf_hold_by_dnode(dn, + (tbl->zt_blk + blk) << bs, FTAG, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err != 0) + return (err); + *valp = ((uint64_t *)db->db_data)[off]; + dmu_buf_rele(db, FTAG); + + if (tbl->zt_nextblk != 0) { + /* + * read the nextblk for the sake of i/o error checking, + * so that zap_table_load() will catch errors for + * zap_table_store. 
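+		 * While a grow is in progress zap_table_store() writes
+		 * both the old and the new location, so checking the new
+		 * block here keeps the load and store paths symmetric.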
+ */ + blk = (idx*2) >> (bs-3); + + dn = dmu_buf_dnode_enter(zap->zap_dbuf); + err = dmu_buf_hold_by_dnode(dn, + (tbl->zt_nextblk + blk) << bs, FTAG, &db, + DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err == 0) + dmu_buf_rele(db, FTAG); + } + return (err); +} + +/* + * Routines for growing the ptrtbl. + */ + +static void +zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) +{ + for (int i = 0; i < n; i++) { + uint64_t lb = src[i]; + dst[2 * i + 0] = lb; + dst[2 * i + 1] = lb; + } +} + +static int +zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) +{ + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. + */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) + return (SET_ERROR(ENOSPC)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* + * We are outgrowing the "embedded" ptrtbl (the one + * stored in the header block). Give it its own entire + * block, which will double the size of the ptrtbl. + */ + ASSERT3U(zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + ASSERT0(zap_f_phys(zap)->zap_ptrtbl.zt_blk); + + uint64_t newblk = zap_allocate_blocks(zap, 1); + dmu_buf_t *db_new; + int err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + newblk << FZAP_BLOCK_SHIFT(zap), FTAG, &db_new, + DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); + dmu_buf_will_dirty(db_new, tx); + zap_ptrtbl_transfer(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + db_new->db_data, 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap)); + dmu_buf_rele(db_new, FTAG); + + zap_f_phys(zap)->zap_ptrtbl.zt_blk = newblk; + zap_f_phys(zap)->zap_ptrtbl.zt_numblks = 1; + zap_f_phys(zap)->zap_ptrtbl.zt_shift++; + + ASSERT3U(1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift, ==, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << + (FZAP_BLOCK_SHIFT(zap)-3)); + + return (0); + } else { + return (zap_table_grow(zap, &zap_f_phys(zap)->zap_ptrtbl, + zap_ptrtbl_transfer, tx)); + } +} + +static void +zap_increment_num_entries(zap_t *zap, int delta, dmu_tx_t *tx) +{ + dmu_buf_will_dirty(zap->zap_dbuf, tx); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); + ASSERT(delta > 0 || zap_f_phys(zap)->zap_num_entries >= -delta); + zap_f_phys(zap)->zap_num_entries += delta; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); +} + +static uint64_t +zap_allocate_blocks(zap_t *zap, int nblocks) +{ + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + uint64_t newblk = zap_f_phys(zap)->zap_freeblk; + zap_f_phys(zap)->zap_freeblk += nblocks; + return (newblk); +} + +static void +zap_leaf_evict_sync(void *dbu) +{ + zap_leaf_t *l = dbu; + + rw_destroy(&l->l_rwlock); + kmem_free(l, sizeof (zap_leaf_t)); +} + +static zap_leaf_t * +zap_create_leaf(zap_t *zap, dmu_tx_t *tx) +{ + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + rw_init(&l->l_rwlock, NULL, RW_NOLOCKDEP, NULL); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = zap_allocate_blocks(zap, 1); + l->l_dbuf = NULL; + + VERIFY0(dmu_buf_hold(zap->zap_objset, zap->zap_object, + l->l_blkid << FZAP_BLOCK_SHIFT(zap), NULL, &l->l_dbuf, + DMU_READ_NO_PREFETCH)); + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + VERIFY3P(NULL, ==, dmu_buf_set_user(l->l_dbuf, &l->l_dbu)); + dmu_buf_will_dirty(l->l_dbuf, tx); + + zap_leaf_init(l, zap->zap_normflags != 0); + + zap_f_phys(zap)->zap_num_leafs++; + + return (l); +} + +int 
+fzap_count(zap_t *zap, uint64_t *count) +{ + ASSERT(!zap->zap_ismicro); + mutex_enter(&zap->zap_f.zap_num_entries_mtx); /* unnecessary */ + *count = zap_f_phys(zap)->zap_num_entries; + mutex_exit(&zap->zap_f.zap_num_entries_mtx); + return (0); +} + +/* + * Routines for obtaining zap_leaf_t's + */ + +void +zap_put_leaf(zap_leaf_t *l) +{ + rw_exit(&l->l_rwlock); + dmu_buf_rele(l->l_dbuf, NULL); +} + +static zap_leaf_t * +zap_open_leaf(uint64_t blkid, dmu_buf_t *db) +{ + ASSERT(blkid != 0); + + zap_leaf_t *l = kmem_zalloc(sizeof (zap_leaf_t), KM_SLEEP); + rw_init(&l->l_rwlock, NULL, RW_DEFAULT, NULL); + rw_enter(&l->l_rwlock, RW_WRITER); + l->l_blkid = blkid; + l->l_bs = highbit64(db->db_size) - 1; + l->l_dbuf = db; + + dmu_buf_init_user(&l->l_dbu, zap_leaf_evict_sync, NULL, &l->l_dbuf); + zap_leaf_t *winner = dmu_buf_set_user(db, &l->l_dbu); + + rw_exit(&l->l_rwlock); + if (winner != NULL) { + /* someone else set it first */ + zap_leaf_evict_sync(&l->l_dbu); + l = winner; + } + + /* + * lhr_pad was previously used for the next leaf in the leaf + * chain. There should be no chained leafs (as we have removed + * support for them). + */ + ASSERT0(zap_leaf_phys(l)->l_hdr.lh_pad1); + + /* + * There should be more hash entries than there can be + * chunks to put in the hash table + */ + ASSERT3U(ZAP_LEAF_HASH_NUMENTRIES(l), >, ZAP_LEAF_NUMCHUNKS(l) / 3); + + /* The chunks should begin at the end of the hash table */ + ASSERT3P(&ZAP_LEAF_CHUNK(l, 0), ==, (zap_leaf_chunk_t *) + &zap_leaf_phys(l)->l_hash[ZAP_LEAF_HASH_NUMENTRIES(l)]); + + /* The chunks should end at the end of the block */ + ASSERT3U((uintptr_t)&ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)) - + (uintptr_t)zap_leaf_phys(l), ==, l->l_dbuf->db_size); + + return (l); +} + +static int +zap_get_leaf_byblk(zap_t *zap, uint64_t blkid, dmu_tx_t *tx, krw_t lt, + zap_leaf_t **lp) +{ + dmu_buf_t *db; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + /* + * If system crashed just after dmu_free_long_range in zfs_rmnode, we + * would be left with an empty xattr dir in delete queue. blkid=0 + * would be passed in when doing zfs_purgedir. If that's the case we + * should just return immediately. The underlying objects should + * already be freed, so this should be perfectly fine. + */ + if (blkid == 0) + return (SET_ERROR(ENOENT)); + + int bs = FZAP_BLOCK_SHIFT(zap); + dnode_t *dn = dmu_buf_dnode_enter(zap->zap_dbuf); + int err = dmu_buf_hold_by_dnode(dn, + blkid << bs, NULL, &db, DMU_READ_NO_PREFETCH); + dmu_buf_dnode_exit(zap->zap_dbuf); + if (err != 0) + return (err); + + ASSERT3U(db->db_object, ==, zap->zap_object); + ASSERT3U(db->db_offset, ==, blkid << bs); + ASSERT3U(db->db_size, ==, 1 << bs); + ASSERT(blkid != 0); + + zap_leaf_t *l = dmu_buf_get_user(db); + + if (l == NULL) + l = zap_open_leaf(blkid, db); + + rw_enter(&l->l_rwlock, lt); + /* + * Must lock before dirtying, otherwise zap_leaf_phys(l) could change, + * causing ASSERT below to fail. 
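+	 * (zap_leaf_phys() simply dereferences l_dbuf->db_data, and
+	 * dirtying the buffer may remap db_data.)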
+ */ + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + ASSERT3U(l->l_blkid, ==, blkid); + ASSERT3P(l->l_dbuf, ==, db); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_block_type, ==, ZBT_LEAF); + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + *lp = l; + return (0); +} + +static int +zap_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t *valp) +{ + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + ASSERT3U(idx, <, + (1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift)); + *valp = ZAP_EMBEDDED_PTRTBL_ENT(zap, idx); + return (0); + } else { + return (zap_table_load(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, valp)); + } +} + +static int +zap_set_idx_to_blk(zap_t *zap, uint64_t idx, uint64_t blk, dmu_tx_t *tx) +{ + ASSERT(tx != NULL); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + if (zap_f_phys(zap)->zap_ptrtbl.zt_blk == 0) { + ZAP_EMBEDDED_PTRTBL_ENT(zap, idx) = blk; + return (0); + } else { + return (zap_table_store(zap, &zap_f_phys(zap)->zap_ptrtbl, + idx, blk, tx)); + } +} + +static int +zap_deref_leaf(zap_t *zap, uint64_t h, dmu_tx_t *tx, krw_t lt, zap_leaf_t **lp) +{ + uint64_t blk; + + ASSERT(zap->zap_dbuf == NULL || + zap_f_phys(zap) == zap->zap_dbuf->db_data); + + /* Reality check for corrupt zap objects (leaf or header). */ + if ((zap_f_phys(zap)->zap_block_type != ZBT_LEAF && + zap_f_phys(zap)->zap_block_type != ZBT_HEADER) || + zap_f_phys(zap)->zap_magic != ZAP_MAGIC) { + return (SET_ERROR(EIO)); + } + + uint64_t idx = ZAP_HASH_IDX(h, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + int err = zap_idx_to_blk(zap, idx, &blk); + if (err != 0) + return (err); + err = zap_get_leaf_byblk(zap, blk, tx, lt, lp); + + ASSERT(err || + ZAP_HASH_IDX(h, zap_leaf_phys(*lp)->l_hdr.lh_prefix_len) == + zap_leaf_phys(*lp)->l_hdr.lh_prefix); + return (err); +} + +static int +zap_expand_leaf(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx, zap_leaf_t **lp) +{ + zap_t *zap = zn->zn_zap; + uint64_t hash = zn->zn_hash; + int err; + int old_prefix_len = zap_leaf_phys(l)->l_hdr.lh_prefix_len; + + ASSERT3U(old_prefix_len, <=, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + if (zap_tryupgradedir(zap, tx) == 0 || + old_prefix_len == zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + /* We failed to upgrade, or need to grow the pointer table */ + objset_t *os = zap->zap_objset; + uint64_t object = zap->zap_object; + + zap_put_leaf(l); + zap_unlockdir(zap, tag); + err = zap_lockdir(os, object, tx, RW_WRITER, + FALSE, FALSE, tag, &zn->zn_zap); + zap = zn->zn_zap; + if (err != 0) + return (err); + ASSERT(!zap->zap_ismicro); + + while (old_prefix_len == + zap_f_phys(zap)->zap_ptrtbl.zt_shift) { + err = zap_grow_ptrtbl(zap, tx); + if (err != 0) + return (err); + } + + err = zap_deref_leaf(zap, hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len != old_prefix_len) { + /* it split while our locks were down */ + *lp = l; + return (0); + } + } + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + ASSERT3U(old_prefix_len, <, zap_f_phys(zap)->zap_ptrtbl.zt_shift); + ASSERT3U(ZAP_HASH_IDX(hash, old_prefix_len), ==, + zap_leaf_phys(l)->l_hdr.lh_prefix); + + int prefix_diff = zap_f_phys(zap)->zap_ptrtbl.zt_shift - + (old_prefix_len + 1); + uint64_t sibling = + (ZAP_HASH_IDX(hash, old_prefix_len + 1) | 1) << prefix_diff; + + /* check for i/o errors before doing zap_leaf_split */ + for (int i = 0; i < (1ULL << 
prefix_diff); i++) { + uint64_t blk; + err = zap_idx_to_blk(zap, sibling + i, &blk); + if (err != 0) + return (err); + ASSERT3U(blk, ==, l->l_blkid); + } + + zap_leaf_t *nl = zap_create_leaf(zap, tx); + zap_leaf_split(l, nl, zap->zap_normflags != 0); + + /* set sibling pointers */ + for (int i = 0; i < (1ULL << prefix_diff); i++) { + err = zap_set_idx_to_blk(zap, sibling + i, nl->l_blkid, tx); + ASSERT0(err); /* we checked for i/o errors above */ + } + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_prefix_len, >, 0); + + if (hash & (1ULL << (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len))) { + /* we want the sibling */ + zap_put_leaf(l); + *lp = nl; + } else { + zap_put_leaf(nl); + *lp = l; + } + + return (0); +} + +static void +zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, + void *tag, dmu_tx_t *tx) +{ + zap_t *zap = zn->zn_zap; + int shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + int leaffull = (zap_leaf_phys(l)->l_hdr.lh_prefix_len == shift && + zap_leaf_phys(l)->l_hdr.lh_nfree < ZAP_LEAF_LOW_WATER); + + zap_put_leaf(l); + + if (leaffull || zap_f_phys(zap)->zap_ptrtbl.zt_nextblk) { + /* + * We are in the middle of growing the pointer table, or + * this leaf will soon make us grow it. + */ + if (zap_tryupgradedir(zap, tx) == 0) { + objset_t *os = zap->zap_objset; + uint64_t zapobj = zap->zap_object; + + zap_unlockdir(zap, tag); + int err = zap_lockdir(os, zapobj, tx, + RW_WRITER, FALSE, FALSE, tag, &zn->zn_zap); + zap = zn->zn_zap; + if (err != 0) + return; + } + + /* could have finished growing while our locks were down */ + if (zap_f_phys(zap)->zap_ptrtbl.zt_shift == shift) + (void) zap_grow_ptrtbl(zap, tx); + } +} + +static int +fzap_checkname(zap_name_t *zn) +{ + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + return (0); +} + +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ + /* Only integer sizes supported by C */ + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (SET_ERROR(EINVAL)); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (SET_ERROR(E2BIG)); + + return (0); +} + +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err = fzap_checkname(zn); + if (err != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + +/* + * Routines for manipulating attributes. 
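+ *
+ * Each routine hashes the key, locates the leaf through the pointer table
+ * (zap_deref_leaf()), and then operates on the entry via a
+ * zap_entry_handle_t while holding the leaf's rwlock.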
+ */ +int +fzap_lookup(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, void *buf, + char *realname, int rn_len, boolean_t *ncp) +{ + zap_leaf_t *l; + zap_entry_handle_t zeh; + + int err = fzap_checkname(zn); + if (err != 0) + return (err); + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + + err = zap_entry_read(&zeh, integer_size, num_integers, buf); + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); + if (ncp) { + *ncp = zap_entry_normalization_conflict(&zeh, + zn, NULL, zn->zn_zap); + } + } + + zap_put_leaf(l); + return (err); +} + +int +fzap_add_cd(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, uint32_t cd, void *tag, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + ASSERT(!zap->zap_ismicro); + ASSERT(fzap_check(zn, integer_size, num_integers) == 0); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + err = SET_ERROR(EEXIST); + goto out; + } + if (err != ENOENT) + goto out; + + err = zap_entry_create(l, zn, cd, + integer_size, num_integers, val, &zeh); + + if (err == 0) { + zap_increment_num_entries(zap, 1, tx); + } else if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tag, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) { + goto retry; + } else if (err == ENOSPC) { + /* + * If we failed to expand the leaf, then bailout + * as there is no point trying + * zap_put_leaf_maybe_grow_ptrtbl(). 
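+			 * (zap_expand_leaf() has already dropped its
+			 * leaf hold on this error path, so returning
+			 * without zap_put_leaf() does not leak it.)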
+ */ + return (err); + } + } + +out: + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + return (err); +} + +int +fzap_add(zap_name_t *zn, + uint64_t integer_size, uint64_t num_integers, + const void *val, void *tag, dmu_tx_t *tx) +{ + int err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + return (fzap_add_cd(zn, integer_size, num_integers, + val, ZAP_NEED_CD, tag, tx)); +} + +int +fzap_update(zap_name_t *zn, + int integer_size, uint64_t num_integers, const void *val, + void *tag, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + boolean_t create; + zap_entry_handle_t zeh; + zap_t *zap = zn->zn_zap; + + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + err = fzap_check(zn, integer_size, num_integers); + if (err != 0) + return (err); + + err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); +retry: + err = zap_leaf_lookup(l, zn, &zeh); + create = (err == ENOENT); + ASSERT(err == 0 || err == ENOENT); + + if (create) { + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); + if (err == 0) + zap_increment_num_entries(zap, 1, tx); + } else { + err = zap_entry_update(&zeh, integer_size, num_integers, val); + } + + if (err == EAGAIN) { + err = zap_expand_leaf(zn, l, tag, tx, &l); + zap = zn->zn_zap; /* zap_expand_leaf() may change zap */ + if (err == 0) + goto retry; + } + + if (zap != NULL) + zap_put_leaf_maybe_grow_ptrtbl(zn, l, tag, tx); + return (err); +} + +int +fzap_length(zap_name_t *zn, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + goto out; + + if (integer_size != 0) + *integer_size = zeh.zeh_integer_size; + if (num_integers != 0) + *num_integers = zeh.zeh_num_integers; +out: + zap_put_leaf(l); + return (err); +} + +int +fzap_remove(zap_name_t *zn, dmu_tx_t *tx) +{ + zap_leaf_t *l; + int err; + zap_entry_handle_t zeh; + + err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, tx, RW_WRITER, &l); + if (err != 0) + return (err); + err = zap_leaf_lookup(l, zn, &zeh); + if (err == 0) { + zap_entry_remove(&zeh); + zap_increment_num_entries(zn->zn_zap, -1, tx); + } + zap_put_leaf(l); + return (err); +} + +void +fzap_prefetch(zap_name_t *zn) +{ + uint64_t blk; + zap_t *zap = zn->zn_zap; + + uint64_t idx = ZAP_HASH_IDX(zn->zn_hash, + zap_f_phys(zap)->zap_ptrtbl.zt_shift); + if (zap_idx_to_blk(zap, idx, &blk) != 0) + return; + int bs = FZAP_BLOCK_SHIFT(zap); + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, blk << bs, 1 << bs, + ZIO_PRIORITY_SYNC_READ); +} + +/* + * Helper functions for consumers. 
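+ *
+ * Most of these wrap the public zap_add()/zap_update()/zap_lookup()
+ * entry points, encoding 64-bit keys as hexadecimal strings (see
+ * zap_add_int() and friends below).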
+ */ + +uint64_t +zap_create_link(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, dmu_tx_t *tx) +{ + return (zap_create_link_dnsize(os, ot, parent_obj, name, 0, tx)); +} + +uint64_t +zap_create_link_dnsize(objset_t *os, dmu_object_type_t ot, uint64_t parent_obj, + const char *name, int dnodesize, dmu_tx_t *tx) +{ + uint64_t new_obj; + + new_obj = zap_create_dnsize(os, ot, DMU_OT_NONE, 0, dnodesize, tx); + VERIFY(new_obj != 0); + VERIFY0(zap_add(os, parent_obj, name, sizeof (uint64_t), 1, &new_obj, + tx)); + + return (new_obj); +} + +int +zap_value_search(objset_t *os, uint64_t zapobj, uint64_t value, uint64_t mask, + char *name) +{ + zap_cursor_t zc; + int err; + + if (mask == 0) + mask = -1ULL; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, zapobj); + (err = zap_cursor_retrieve(&zc, za)) == 0; + zap_cursor_advance(&zc)) { + if ((za->za_first_integer & mask) == (value & mask)) { + (void) strlcpy(name, za->za_name, MAXNAMELEN); + break; + } + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx) +{ + zap_cursor_t zc; + int err = 0; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + err = zap_add(os, intoobj, za->za_name, + 8, 1, &za->za_first_integer, tx); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_join_key(objset_t *os, uint64_t fromobj, uint64_t intoobj, + uint64_t value, dmu_tx_t *tx) +{ + zap_cursor_t zc; + int err = 0; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, za) == 0; + (void) zap_cursor_advance(&zc)) { + if (za->za_integer_length != 8 || za->za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + err = zap_add(os, intoobj, za->za_name, + 8, 1, &value, tx); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_join_increment(objset_t *os, uint64_t fromobj, uint64_t intoobj, + dmu_tx_t *tx) +{ + zap_cursor_t zc; + int err = 0; + + zap_attribute_t *za = kmem_alloc(sizeof (*za), KM_SLEEP); + for (zap_cursor_init(&zc, os, fromobj); + zap_cursor_retrieve(&zc, za) == 0; + (void) zap_cursor_advance(&zc)) { + uint64_t delta = 0; + + if (za->za_integer_length != 8 || za->za_num_integers != 1) { + err = SET_ERROR(EINVAL); + break; + } + + err = zap_lookup(os, intoobj, za->za_name, 8, 1, &delta); + if (err != 0 && err != ENOENT) + break; + delta += za->za_first_integer; + err = zap_update(os, intoobj, za->za_name, 8, 1, &delta, tx); + if (err != 0) + break; + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (*za)); + return (err); +} + +int +zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_remove(os, obj, name, tx)); +} + +int +zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) +{ + char name[20]; + + (void) 
snprintf(name, sizeof (name), "%llx", (longlong_t)value); + return (zap_lookup(os, obj, name, 8, 1, &value)); +} + +int +zap_add_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_add(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_update_int_key(objset_t *os, uint64_t obj, + uint64_t key, uint64_t value, dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_update(os, obj, name, 8, 1, &value, tx)); +} + +int +zap_lookup_int_key(objset_t *os, uint64_t obj, uint64_t key, uint64_t *valuep) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_lookup(os, obj, name, 8, 1, valuep)); +} + +int +zap_increment(objset_t *os, uint64_t obj, const char *name, int64_t delta, + dmu_tx_t *tx) +{ + uint64_t value = 0; + + if (delta == 0) + return (0); + + int err = zap_lookup(os, obj, name, 8, 1, &value); + if (err != 0 && err != ENOENT) + return (err); + value += delta; + if (value == 0) + err = zap_remove(os, obj, name, tx); + else + err = zap_update(os, obj, name, 8, 1, &value, tx); + return (err); +} + +int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + return (zap_increment(os, obj, name, delta, tx)); +} + +/* + * Routines for iterating over the attributes. + */ + +int +fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) +{ + int err = ENOENT; + zap_entry_handle_t zeh; + zap_leaf_t *l; + + /* retrieve the next entry at or after zc_hash/zc_cd */ + /* if no entry, return ENOENT */ + + /* + * If we are reading from the beginning, we're almost certain to + * iterate over the entire ZAP object. If there are multiple leaf + * blocks (freeblk > 2), prefetch the whole object (up to + * dmu_prefetch_max bytes), so that we read the leaf blocks + * concurrently. (Unless noprefetch was requested via + * zap_cursor_init_noprefetch()). 
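+	 * A resumed cursor (zc_hash != 0) skips the prefetch on the
+	 * assumption that the leaves it already visited were read on a
+	 * previous pass.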
+ */ + if (zc->zc_hash == 0 && zap_iterate_prefetch && + zc->zc_prefetch && zap_f_phys(zap)->zap_freeblk > 2) { + dmu_prefetch(zc->zc_objset, zc->zc_zapobj, 0, 0, + zap_f_phys(zap)->zap_freeblk << FZAP_BLOCK_SHIFT(zap), + ZIO_PRIORITY_ASYNC_READ); + } + + if (zc->zc_leaf && + (ZAP_HASH_IDX(zc->zc_hash, + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix_len) != + zap_leaf_phys(zc->zc_leaf)->l_hdr.lh_prefix)) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + +again: + if (zc->zc_leaf == NULL) { + err = zap_deref_leaf(zap, zc->zc_hash, NULL, RW_READER, + &zc->zc_leaf); + if (err != 0) + return (err); + } else { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + } + l = zc->zc_leaf; + + err = zap_leaf_lookup_closest(l, zc->zc_hash, zc->zc_cd, &zeh); + + if (err == ENOENT) { + if (zap_leaf_phys(l)->l_hdr.lh_prefix_len == 0) { + zc->zc_hash = -1ULL; + zc->zc_cd = 0; + } else { + uint64_t nocare = (1ULL << + (64 - zap_leaf_phys(l)->l_hdr.lh_prefix_len)) - 1; + + zc->zc_hash = (zc->zc_hash & ~nocare) + nocare + 1; + zc->zc_cd = 0; + + if (zc->zc_hash == 0) { + zc->zc_hash = -1ULL; + } else { + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + goto again; + } + } + } + + if (err == 0) { + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + za->za_integer_length = zeh.zeh_integer_size; + za->za_num_integers = zeh.zeh_num_integers; + if (zeh.zeh_num_integers == 0) { + za->za_first_integer = 0; + } else { + err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); + ASSERT(err == 0 || err == EOVERFLOW); + } + err = zap_entry_read_name(zap, &zeh, + sizeof (za->za_name), za->za_name); + ASSERT(err == 0); + + za->za_normalization_conflict = + zap_entry_normalization_conflict(&zeh, + NULL, za->za_name, zap); + } + rw_exit(&zc->zc_leaf->l_rwlock); + return (err); +} + +static void +zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) +{ + uint64_t lastblk = 0; + + /* + * NB: if a leaf has more pointers than an entire ptrtbl block + * can hold, then it'll be accounted for more than once, since + * we won't have lastblk. + */ + for (int i = 0; i < len; i++) { + zap_leaf_t *l; + + if (tbl[i] == lastblk) + continue; + lastblk = tbl[i]; + + int err = zap_get_leaf_byblk(zap, tbl[i], NULL, RW_READER, &l); + if (err == 0) { + zap_leaf_stats(zap, l, zs); + zap_put_leaf(l); + } + } +} + +void +fzap_get_stats(zap_t *zap, zap_stats_t *zs) +{ + int bs = FZAP_BLOCK_SHIFT(zap); + zs->zs_blocksize = 1ULL << bs; + + /* + * Set zap_phys_t fields + */ + zs->zs_num_leafs = zap_f_phys(zap)->zap_num_leafs; + zs->zs_num_entries = zap_f_phys(zap)->zap_num_entries; + zs->zs_num_blocks = zap_f_phys(zap)->zap_freeblk; + zs->zs_block_type = zap_f_phys(zap)->zap_block_type; + zs->zs_magic = zap_f_phys(zap)->zap_magic; + zs->zs_salt = zap_f_phys(zap)->zap_salt; + + /* + * Set zap_ptrtbl fields + */ + zs->zs_ptrtbl_len = 1ULL << zap_f_phys(zap)->zap_ptrtbl.zt_shift; + zs->zs_ptrtbl_nextblk = zap_f_phys(zap)->zap_ptrtbl.zt_nextblk; + zs->zs_ptrtbl_blks_copied = + zap_f_phys(zap)->zap_ptrtbl.zt_blks_copied; + zs->zs_ptrtbl_zt_blk = zap_f_phys(zap)->zap_ptrtbl.zt_blk; + zs->zs_ptrtbl_zt_numblks = zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + zs->zs_ptrtbl_zt_shift = zap_f_phys(zap)->zap_ptrtbl.zt_shift; + + if (zap_f_phys(zap)->zap_ptrtbl.zt_numblks == 0) { + /* the ptrtbl is entirely in the header block. 
*/ + zap_stats_ptrtbl(zap, &ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), + 1 << ZAP_EMBEDDED_PTRTBL_SHIFT(zap), zs); + } else { + dmu_prefetch(zap->zap_objset, zap->zap_object, 0, + zap_f_phys(zap)->zap_ptrtbl.zt_blk << bs, + zap_f_phys(zap)->zap_ptrtbl.zt_numblks << bs, + ZIO_PRIORITY_SYNC_READ); + + for (int b = 0; b < zap_f_phys(zap)->zap_ptrtbl.zt_numblks; + b++) { + dmu_buf_t *db; + int err; + + err = dmu_buf_hold(zap->zap_objset, zap->zap_object, + (zap_f_phys(zap)->zap_ptrtbl.zt_blk + b) << bs, + FTAG, &db, DMU_READ_NO_PREFETCH); + if (err == 0) { + zap_stats_ptrtbl(zap, db->db_data, + 1<<(bs-3), zs); + dmu_buf_rele(db, FTAG); + } + } + } +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, , zap_iterate_prefetch, INT, ZMOD_RW, + "When iterating ZAP object, prefetch it"); +/* END CSTYLED */ diff --git a/module/zfs/zap_leaf.c b/module/zfs/zap_leaf.c new file mode 100644 index 000000000000..aa6c298c3b4b --- /dev/null +++ b/module/zfs/zap_leaf.c @@ -0,0 +1,849 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2017 Nexenta Systems, Inc. + */ + +/* + * The 512-byte leaf is broken into 32 16-byte chunks. + * chunk number n means l_chunk[n], even though the header precedes it. + * the names are stored null-terminated. 
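+ *
+ * Note that with the default fzap_default_block_shift of 14 a leaf block
+ * is actually 16K; the 512-byte figure describes the original layout, and
+ * the chunking scales with the block size via ZAP_LEAF_NUMCHUNKS().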
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/fs/zfs.h>
+#include <sys/zap.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/arc.h>
+
+static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry);
+
+#define	CHAIN_END 0xffff /* end of the chunk chain */
+
+#define	LEAF_HASH(l, h) \
+	((ZAP_LEAF_HASH_NUMENTRIES(l)-1) & \
+	((h) >> \
+	(64 - ZAP_LEAF_HASH_SHIFT(l) - zap_leaf_phys(l)->l_hdr.lh_prefix_len)))
+
+#define	LEAF_HASH_ENTPTR(l, h) (&zap_leaf_phys(l)->l_hash[LEAF_HASH(l, h)])
+
+extern inline zap_leaf_phys_t *zap_leaf_phys(zap_leaf_t *l);
+
+static void
+zap_memset(void *a, int c, size_t n)
+{
+	char *cp = a;
+	char *cpend = cp + n;
+
+	while (cp < cpend)
+		*cp++ = c;
+}
+
+static void
+stv(int len, void *addr, uint64_t value)
+{
+	switch (len) {
+	case 1:
+		*(uint8_t *)addr = value;
+		return;
+	case 2:
+		*(uint16_t *)addr = value;
+		return;
+	case 4:
+		*(uint32_t *)addr = value;
+		return;
+	case 8:
+		*(uint64_t *)addr = value;
+		return;
+	default:
+		cmn_err(CE_PANIC, "bad int len %d", len);
+	}
+}
+
+static uint64_t
+ldv(int len, const void *addr)
+{
+	switch (len) {
+	case 1:
+		return (*(uint8_t *)addr);
+	case 2:
+		return (*(uint16_t *)addr);
+	case 4:
+		return (*(uint32_t *)addr);
+	case 8:
+		return (*(uint64_t *)addr);
+	default:
+		cmn_err(CE_PANIC, "bad int len %d", len);
+	}
+	return (0xFEEDFACEDEADBEEFULL);
+}
+
+void
+zap_leaf_byteswap(zap_leaf_phys_t *buf, int size)
+{
+	zap_leaf_t l;
+	dmu_buf_t l_dbuf;
+
+	l_dbuf.db_data = buf;
+	l.l_bs = highbit64(size) - 1;
+	l.l_dbuf = &l_dbuf;
+
+	buf->l_hdr.lh_block_type = BSWAP_64(buf->l_hdr.lh_block_type);
+	buf->l_hdr.lh_prefix = BSWAP_64(buf->l_hdr.lh_prefix);
+	buf->l_hdr.lh_magic = BSWAP_32(buf->l_hdr.lh_magic);
+	buf->l_hdr.lh_nfree = BSWAP_16(buf->l_hdr.lh_nfree);
+	buf->l_hdr.lh_nentries = BSWAP_16(buf->l_hdr.lh_nentries);
+	buf->l_hdr.lh_prefix_len = BSWAP_16(buf->l_hdr.lh_prefix_len);
+	buf->l_hdr.lh_freelist = BSWAP_16(buf->l_hdr.lh_freelist);
+
+	for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(&l); i++)
+		buf->l_hash[i] = BSWAP_16(buf->l_hash[i]);
+
+	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(&l); i++) {
+		zap_leaf_chunk_t *lc = &ZAP_LEAF_CHUNK(&l, i);
+		struct zap_leaf_entry *le;
+
+		switch (lc->l_free.lf_type) {
+		case ZAP_CHUNK_ENTRY:
+			le = &lc->l_entry;
+
+			le->le_type = BSWAP_8(le->le_type);
+			le->le_value_intlen = BSWAP_8(le->le_value_intlen);
+			le->le_next = BSWAP_16(le->le_next);
+			le->le_name_chunk = BSWAP_16(le->le_name_chunk);
+			le->le_name_numints = BSWAP_16(le->le_name_numints);
+			le->le_value_chunk = BSWAP_16(le->le_value_chunk);
+			le->le_value_numints = BSWAP_16(le->le_value_numints);
+			le->le_cd = BSWAP_32(le->le_cd);
+			le->le_hash = BSWAP_64(le->le_hash);
+			break;
+		case ZAP_CHUNK_FREE:
+			lc->l_free.lf_type = BSWAP_8(lc->l_free.lf_type);
+			lc->l_free.lf_next = BSWAP_16(lc->l_free.lf_next);
+			break;
+		case ZAP_CHUNK_ARRAY:
+			lc->l_array.la_type = BSWAP_8(lc->l_array.la_type);
+			lc->l_array.la_next = BSWAP_16(lc->l_array.la_next);
+			/* la_array doesn't need swapping */
+			break;
+		default:
+			cmn_err(CE_PANIC, "bad leaf type %d",
+			    lc->l_free.lf_type);
+		}
+	}
+}
+
+void
+zap_leaf_init(zap_leaf_t *l, boolean_t sort)
+{
+	l->l_bs = highbit64(l->l_dbuf->db_size) - 1;
+	zap_memset(&zap_leaf_phys(l)->l_hdr, 0,
+	    sizeof (struct zap_leaf_header));
+	zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+	    2*ZAP_LEAF_HASH_NUMENTRIES(l));
+	for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+		ZAP_LEAF_CHUNK(l, i).l_free.lf_type = ZAP_CHUNK_FREE;
+		ZAP_LEAF_CHUNK(l, i).l_free.lf_next = i+1;
+	}
+	ZAP_LEAF_CHUNK(l, ZAP_LEAF_NUMCHUNKS(l)-1).l_free.lf_next =
CHAIN_END; + zap_leaf_phys(l)->l_hdr.lh_block_type = ZBT_LEAF; + zap_leaf_phys(l)->l_hdr.lh_magic = ZAP_LEAF_MAGIC; + zap_leaf_phys(l)->l_hdr.lh_nfree = ZAP_LEAF_NUMCHUNKS(l); + if (sort) + zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED; +} + +/* + * Routines which manipulate leaf chunks (l_chunk[]). + */ + +static uint16_t +zap_leaf_chunk_alloc(zap_leaf_t *l) +{ + ASSERT(zap_leaf_phys(l)->l_hdr.lh_nfree > 0); + + int chunk = zap_leaf_phys(l)->l_hdr.lh_freelist; + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_free.lf_type, ==, ZAP_CHUNK_FREE); + + zap_leaf_phys(l)->l_hdr.lh_freelist = + ZAP_LEAF_CHUNK(l, chunk).l_free.lf_next; + + zap_leaf_phys(l)->l_hdr.lh_nfree--; + + return (chunk); +} + +static void +zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) +{ + struct zap_leaf_free *zlf = &ZAP_LEAF_CHUNK(l, chunk).l_free; + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_nfree, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT(zlf->lf_type != ZAP_CHUNK_FREE); + + zlf->lf_type = ZAP_CHUNK_FREE; + zlf->lf_next = zap_leaf_phys(l)->l_hdr.lh_freelist; + bzero(zlf->lf_pad, sizeof (zlf->lf_pad)); /* help it to compress */ + zap_leaf_phys(l)->l_hdr.lh_freelist = chunk; + + zap_leaf_phys(l)->l_hdr.lh_nfree++; +} + +/* + * Routines which manipulate leaf arrays (zap_leaf_array type chunks). + */ + +static uint16_t +zap_leaf_array_create(zap_leaf_t *l, const char *buf, + int integer_size, int num_integers) +{ + uint16_t chunk_head; + uint16_t *chunkp = &chunk_head; + int byten = 0; + uint64_t value = 0; + int shift = (integer_size - 1) * 8; + int len = num_integers; + + ASSERT3U(num_integers * integer_size, <=, ZAP_MAXVALUELEN); + + while (len > 0) { + uint16_t chunk = zap_leaf_chunk_alloc(l); + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + + la->la_type = ZAP_CHUNK_ARRAY; + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES; i++) { + if (byten == 0) + value = ldv(integer_size, buf); + la->la_array[i] = value >> shift; + value <<= 8; + if (++byten == integer_size) { + byten = 0; + buf += integer_size; + if (--len == 0) + break; + } + } + + *chunkp = chunk; + chunkp = &la->la_next; + } + *chunkp = CHAIN_END; + + return (chunk_head); +} + +static void +zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) +{ + uint16_t chunk = *chunkp; + + *chunkp = CHAIN_END; + + while (chunk != CHAIN_END) { + int nextchunk = ZAP_LEAF_CHUNK(l, chunk).l_array.la_next; + ASSERT3U(ZAP_LEAF_CHUNK(l, chunk).l_array.la_type, ==, + ZAP_CHUNK_ARRAY); + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + } +} + +/* array_len and buf_len are in integers, not bytes */ +static void +zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, + int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, + void *buf) +{ + int len = MIN(array_len, buf_len); + int byten = 0; + uint64_t value = 0; + char *p = buf; + + ASSERT3U(array_int_len, <=, buf_int_len); + + /* Fast path for one 8-byte integer */ + if (array_int_len == 8 && buf_int_len == 8 && len == 1) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + uint8_t *ip = la->la_array; + uint64_t *buf64 = buf; + + *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | + (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | + (uint64_t)ip[4] << 24 | (uint64_t)ip[5] << 16 | + (uint64_t)ip[6] << 8 | (uint64_t)ip[7]; + return; + } + + /* Fast path for an array of 1-byte integers (eg. 
the entry name) */ + if (array_int_len == 1 && buf_int_len == 1 && + buf_len > array_len + ZAP_LEAF_ARRAY_BYTES) { + while (chunk != CHAIN_END) { + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + p += ZAP_LEAF_ARRAY_BYTES; + chunk = la->la_next; + } + return; + } + + while (len > 0) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + for (int i = 0; i < ZAP_LEAF_ARRAY_BYTES && len > 0; i++) { + value = (value << 8) | la->la_array[i]; + byten++; + if (byten == array_int_len) { + stv(buf_int_len, p, value); + byten = 0; + len--; + if (len == 0) + return; + p += buf_int_len; + } + } + chunk = la->la_next; + } +} + +static boolean_t +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, + int chunk, int array_numints) +{ + int bseen = 0; + + if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { + uint64_t *thiskey = + kmem_alloc(array_numints * sizeof (*thiskey), KM_SLEEP); + ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); + + zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, + sizeof (*thiskey), array_numints, thiskey); + boolean_t match = bcmp(thiskey, zn->zn_key_orig, + array_numints * sizeof (*thiskey)) == 0; + kmem_free(thiskey, array_numints * sizeof (*thiskey)); + return (match); + } + + ASSERT(zn->zn_key_intlen == 1); + if (zn->zn_matchtype & MT_NORMALIZE) { + char *thisname = kmem_alloc(array_numints, KM_SLEEP); + + zap_leaf_array_read(l, chunk, sizeof (char), array_numints, + sizeof (char), array_numints, thisname); + boolean_t match = zap_match(zn, thisname); + kmem_free(thisname, array_numints); + return (match); + } + + /* + * Fast path for exact matching. + * First check that the lengths match, so that we don't read + * past the end of the zn_key_orig array. + */ + if (array_numints != zn->zn_key_orig_numints) + return (B_FALSE); + while (bseen < array_numints) { + struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; + int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) + break; + chunk = la->la_next; + bseen += toread; + } + return (bseen == array_numints); +} + +/* + * Routines which manipulate leaf entries. + */ + +int +zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) +{ + struct zap_leaf_entry *le; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (uint16_t *chunkp = LEAF_HASH_ENTPTR(l, zn->zn_hash); + *chunkp != CHAIN_END; chunkp = &le->le_next) { + uint16_t chunk = *chunkp; + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_hash != zn->zn_hash) + continue; + + /* + * NB: the entry chain is always sorted by cd on + * normalized zap objects, so this will find the + * lowest-cd match for MT_NORMALIZE. + */ + ASSERT((zn->zn_matchtype == 0) || + (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); + if (zap_leaf_array_match(l, zn, le->le_name_chunk, + le->le_name_numints)) { + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + zeh->zeh_leaf = l; + return (0); + } + } + + return (SET_ERROR(ENOENT)); +} + +/* Return (h1,cd1 >= h2,cd2) */ +#define HCD_GTEQ(h1, cd1, h2, cd2) \ + ((h1 > h2) ? TRUE : ((h1 == h2 && cd1 >= cd2) ? 
TRUE : FALSE)) + +int +zap_leaf_lookup_closest(zap_leaf_t *l, + uint64_t h, uint32_t cd, zap_entry_handle_t *zeh) +{ + uint64_t besth = -1ULL; + uint32_t bestcd = -1U; + uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; + struct zap_leaf_entry *le; + + ASSERT3U(zap_leaf_phys(l)->l_hdr.lh_magic, ==, ZAP_LEAF_MAGIC); + + for (uint16_t lh = LEAF_HASH(l, h); lh <= bestlh; lh++) { + for (uint16_t chunk = zap_leaf_phys(l)->l_hash[lh]; + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (HCD_GTEQ(le->le_hash, le->le_cd, h, cd) && + HCD_GTEQ(besth, bestcd, le->le_hash, le->le_cd)) { + ASSERT3U(bestlh, >=, lh); + bestlh = lh; + besth = le->le_hash; + bestcd = le->le_cd; + + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_fakechunk = chunk; + zeh->zeh_chunkp = &zeh->zeh_fakechunk; + zeh->zeh_leaf = l; + } + } + } + + return (bestcd == -1U ? SET_ERROR(ENOENT) : 0); +} + +int +zap_entry_read(const zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, void *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (le->le_value_intlen > integer_size) + return (SET_ERROR(EINVAL)); + + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, + le->le_value_intlen, le->le_value_numints, + integer_size, num_integers, buf); + + if (zeh->zeh_num_integers > num_integers) + return (SET_ERROR(EOVERFLOW)); + return (0); + +} + +int +zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, + char *buf) +{ + struct zap_leaf_entry *le = + ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, + le->le_name_numints, 8, buflen / 8, buf); + } else { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_numints, 1, buflen, buf); + } + if (le->le_name_numints > buflen) + return (SET_ERROR(EOVERFLOW)); + return (0); +} + +int +zap_entry_update(zap_entry_handle_t *zeh, + uint8_t integer_size, uint64_t num_integers, const void *buf) +{ + zap_leaf_t *l = zeh->zeh_leaf; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); + + int delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); + + if ((int)zap_leaf_phys(l)->l_hdr.lh_nfree < delta_chunks) + return (SET_ERROR(EAGAIN)); + + zap_leaf_array_free(l, &le->le_value_chunk); + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; + return (0); +} + +void +zap_entry_remove(zap_entry_handle_t *zeh) +{ + zap_leaf_t *l = zeh->zeh_leaf; + + ASSERT3P(zeh->zeh_chunkp, !=, &zeh->zeh_fakechunk); + + uint16_t entry_chunk = *zeh->zeh_chunkp; + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry_chunk); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + zap_leaf_array_free(l, &le->le_name_chunk); + zap_leaf_array_free(l, &le->le_value_chunk); + + *zeh->zeh_chunkp = le->le_next; + zap_leaf_chunk_free(l, entry_chunk); + + zap_leaf_phys(l)->l_hdr.lh_nentries--; +} + +int +zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, + uint8_t integer_size, uint64_t 
num_integers, const void *buf, + zap_entry_handle_t *zeh) +{ + uint16_t chunk; + struct zap_leaf_entry *le; + uint64_t h = zn->zn_hash; + + uint64_t valuelen = integer_size * num_integers; + + int numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) + return (SET_ERROR(E2BIG)); + + if (cd == ZAP_NEED_CD) { + /* find the lowest unused cd */ + if (zap_leaf_phys(l)->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { + cd = 0; + + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_cd > cd) + break; + if (le->le_hash == h) { + ASSERT3U(cd, ==, le->le_cd); + cd++; + } + } + } else { + /* old unsorted format; do it the O(n^2) way */ + for (cd = 0; ; cd++) { + for (chunk = *LEAF_HASH_ENTPTR(l, h); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(l, chunk); + if (le->le_hash == h && + le->le_cd == cd) { + break; + } + } + /* If this cd is not in use, we are good. */ + if (chunk == CHAIN_END) + break; + } + } + /* + * We would run out of space in a block before we could + * store enough entries to run out of CD values. + */ + ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); + } + + if (zap_leaf_phys(l)->l_hdr.lh_nfree < numchunks) + return (SET_ERROR(EAGAIN)); + + /* make the entry */ + chunk = zap_leaf_chunk_alloc(l); + le = ZAP_LEAF_ENTRY(l, chunk); + le->le_type = ZAP_CHUNK_ENTRY; + le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, + zn->zn_key_intlen, zn->zn_key_orig_numints); + le->le_name_numints = zn->zn_key_orig_numints; + le->le_value_chunk = + zap_leaf_array_create(l, buf, integer_size, num_integers); + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; + le->le_hash = h; + le->le_cd = cd; + + /* link it into the hash chain */ + /* XXX if we did the search above, we could just use that */ + uint16_t *chunkp = zap_leaf_rehash_entry(l, chunk); + + zap_leaf_phys(l)->l_hdr.lh_nentries++; + + zeh->zeh_leaf = l; + zeh->zeh_num_integers = num_integers; + zeh->zeh_integer_size = le->le_value_intlen; + zeh->zeh_cd = le->le_cd; + zeh->zeh_hash = le->le_hash; + zeh->zeh_chunkp = chunkp; + + return (0); +} + +/* + * Determine if there is another entry with the same normalized form. + * For performance purposes, either zn or name must be provided (the + * other can be NULL). Note, there usually won't be any hash + * conflicts, in which case we don't need the concatenated/normalized + * form of the name. But all callers have one of these on hand anyway, + * so might as well take advantage. A cleaner but slower interface + * would accept neither argument, and compute the normalized name as + * needed (using zap_name_alloc(zap_entry_read_name(zeh))). 
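 *
 * (Editor's note, an illustrative example: on a dataset with
 * casesensitivity=mixed, the names "foo" and "FOO" share a normalized
 * form and therefore a hash, so if both have been created this routine
 * reports a conflict for either of them.)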
+ */ +boolean_t +zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, + const char *name, zap_t *zap) +{ + struct zap_leaf_entry *le; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + + for (uint16_t chunk = *LEAF_HASH_ENTPTR(zeh->zeh_leaf, zeh->zeh_hash); + chunk != CHAIN_END; chunk = le->le_next) { + le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, chunk); + if (le->le_hash != zeh->zeh_hash) + continue; + if (le->le_cd == zeh->zeh_cd) + continue; + + if (zn == NULL) { + zn = zap_name_alloc(zap, name, MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_leaf_array_match(zeh->zeh_leaf, zn, + le->le_name_chunk, le->le_name_numints)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for transferring entries between leafs. + */ + +static uint16_t * +zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry) +{ + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); + struct zap_leaf_entry *le2; + uint16_t *chunkp; + + /* + * keep the entry chain sorted by cd + * NB: this will not cause problems for unsorted leafs, though + * it is unnecessary there. + */ + for (chunkp = LEAF_HASH_ENTPTR(l, le->le_hash); + *chunkp != CHAIN_END; chunkp = &le2->le_next) { + le2 = ZAP_LEAF_ENTRY(l, *chunkp); + if (le2->le_cd > le->le_cd) + break; + } + + le->le_next = *chunkp; + *chunkp = entry; + return (chunkp); +} + +static uint16_t +zap_leaf_transfer_array(zap_leaf_t *l, uint16_t chunk, zap_leaf_t *nl) +{ + uint16_t new_chunk; + uint16_t *nchunkp = &new_chunk; + + while (chunk != CHAIN_END) { + uint16_t nchunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_array *nla = + &ZAP_LEAF_CHUNK(nl, nchunk).l_array; + struct zap_leaf_array *la = + &ZAP_LEAF_CHUNK(l, chunk).l_array; + int nextchunk = la->la_next; + + ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); + ASSERT3U(nchunk, <, ZAP_LEAF_NUMCHUNKS(l)); + + *nla = *la; /* structure assignment */ + + zap_leaf_chunk_free(l, chunk); + chunk = nextchunk; + *nchunkp = nchunk; + nchunkp = &nla->la_next; + } + *nchunkp = CHAIN_END; + return (new_chunk); +} + +static void +zap_leaf_transfer_entry(zap_leaf_t *l, int entry, zap_leaf_t *nl) +{ + struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, entry); + ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); + + uint16_t chunk = zap_leaf_chunk_alloc(nl); + struct zap_leaf_entry *nle = ZAP_LEAF_ENTRY(nl, chunk); + *nle = *le; /* structure assignment */ + + (void) zap_leaf_rehash_entry(nl, chunk); + + nle->le_name_chunk = zap_leaf_transfer_array(l, le->le_name_chunk, nl); + nle->le_value_chunk = + zap_leaf_transfer_array(l, le->le_value_chunk, nl); + + zap_leaf_chunk_free(l, entry); + + zap_leaf_phys(l)->l_hdr.lh_nentries--; + zap_leaf_phys(nl)->l_hdr.lh_nentries++; +} + +/* + * Transfer the entries whose hash prefix ends in 1 to the new leaf. 
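 *
 * (Editor's note, a worked example: a leaf covering hash prefix 10
 * (lh_prefix_len == 2) splits into prefix 100, kept in l, and prefix
 * 101, moved to nl; an entry moves iff bit (64 - 1 - lh_prefix_len) of
 * its hash is set, lh_prefix_len being the pre-split value.)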
+ */
+void
+zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort)
+{
+ int bit = 64 - 1 - zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* set new prefix and prefix_len */
+ zap_leaf_phys(l)->l_hdr.lh_prefix <<= 1;
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len++;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix =
+ zap_leaf_phys(l)->l_hdr.lh_prefix | 1;
+ zap_leaf_phys(nl)->l_hdr.lh_prefix_len =
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+
+ /* break existing hash chains */
+ zap_memset(zap_leaf_phys(l)->l_hash, CHAIN_END,
+ 2*ZAP_LEAF_HASH_NUMENTRIES(l));
+
+ if (sort)
+ zap_leaf_phys(l)->l_hdr.lh_flags |= ZLF_ENTRIES_CDSORTED;
+
+ /*
+ * Transfer entries whose hash bit 'bit' is set to nl; rehash
+ * the remaining entries
+ *
+ * NB: We could find entries via the hashtable instead. That
+ * would be O(hashents+numents) rather than O(numblks+numents),
+ * but this accesses memory more sequentially, and when we're
+ * called, the block is usually pretty full.
+ */
+ for (int i = 0; i < ZAP_LEAF_NUMCHUNKS(l); i++) {
+ struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, i);
+ if (le->le_type != ZAP_CHUNK_ENTRY)
+ continue;
+
+ if (le->le_hash & (1ULL << bit))
+ zap_leaf_transfer_entry(l, i, nl);
+ else
+ (void) zap_leaf_rehash_entry(l, i);
+ }
+}
+
+void
+zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs)
+{
+ int n = zap_f_phys(zap)->zap_ptrtbl.zt_shift -
+ zap_leaf_phys(l)->l_hdr.lh_prefix_len;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_leafs_with_2n_pointers[n]++;
+
+
+ n = zap_leaf_phys(l)->l_hdr.lh_nentries/5;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_with_n5_entries[n]++;
+
+ n = ((1<<FZAP_BLOCK_SHIFT(zap)) -
+ zap_leaf_phys(l)->l_hdr.lh_nfree * (ZAP_LEAF_ARRAY_BYTES+1))*10 /
+ (1<<FZAP_BLOCK_SHIFT(zap));
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_blocks_n_tenths_full[n]++;
+
+ for (int i = 0; i < ZAP_LEAF_HASH_NUMENTRIES(l); i++) {
+ int nentries = 0;
+ int chunk = zap_leaf_phys(l)->l_hash[i];
+
+ while (chunk != CHAIN_END) {
+ struct zap_leaf_entry *le =
+ ZAP_LEAF_ENTRY(l, chunk);
+
+ n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) +
+ ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints *
+ le->le_value_intlen);
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_entries_using_n_chunks[n]++;
+
+ chunk = le->le_next;
+ nentries++;
+ }
+
+ n = nentries;
+ n = MIN(n, ZAP_HISTOGRAM_SIZE-1);
+ zs->zs_buckets_with_n_entries[n]++;
+ }
+}
diff --git a/module/zfs/zap_micro.c b/module/zfs/zap_micro.c
new file mode 100644
index 000000000000..5d9bc2076068
--- /dev/null
+++ b/module/zfs/zap_micro.c
@@ -0,0 +1,1697 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2018 by Delphix. All rights reserved.
+ * Copyright (c) 2014 Spectra Logic Corporation, All rights reserved.
+ * Copyright 2017 Nexenta Systems, Inc.
+ */
+
+#include <sys/zio.h>
+#include <sys/spa.h>
+#include <sys/dmu.h>
+#include <sys/zfs_context.h>
+#include <sys/zap.h>
+#include <sys/zfs_refcount.h>
+#include <sys/zap_impl.h>
+#include <sys/zap_leaf.h>
+#include <sys/avl.h>
+#include <sys/arc.h>
+#include <sys/dmu_objset.h>
+
+#ifdef _KERNEL
+#include <sys/sunddi.h>
+#endif
+
+extern inline mzap_phys_t *zap_m_phys(zap_t *zap);
+
+static int mzap_upgrade(zap_t **zapp,
+ void *tag, dmu_tx_t *tx, zap_flags_t flags);
+
+uint64_t
+zap_getflags(zap_t *zap)
+{
+ if (zap->zap_ismicro)
+ return (0);
+ return (zap_f_phys(zap)->zap_flags);
+}
+
+int
+zap_hashbits(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return (48);
+ else
+ return (28);
+}
+
+uint32_t
+zap_maxcd(zap_t *zap)
+{
+ if (zap_getflags(zap) & ZAP_FLAG_HASH64)
+ return ((1<<16)-1);
+ else
+ return (-1U);
+}
+
+static uint64_t
+zap_hash(zap_name_t *zn)
+{
+ zap_t *zap = zn->zn_zap;
+ uint64_t h = 0;
+
+ if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) {
+ ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY);
+ h = *(uint64_t *)zn->zn_key_orig;
+ } else {
+ h = zap->zap_salt;
+ ASSERT(h != 0);
+ ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY);
+
+ if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) {
+ const uint64_t *wp = zn->zn_key_norm;
+
+ ASSERT(zn->zn_key_intlen == 8);
+ for (int i = 0; i < zn->zn_key_norm_numints;
+ wp++, i++) {
+ uint64_t word = *wp;
+
+ for (int j = 0; j < zn->zn_key_intlen; j++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ word) & 0xFF];
+ word >>= NBBY;
+ }
+ }
+ } else {
+ const uint8_t *cp = zn->zn_key_norm;
+
+ /*
+ * We previously stored the terminating null on
+ * disk, but didn't hash it, so we need to
+ * continue to not hash it. (The
+ * zn_key_*_numints includes the terminating
+ * null for non-binary keys.)
+ */
+ int len = zn->zn_key_norm_numints - 1;
+
+ ASSERT(zn->zn_key_intlen == 1);
+ for (int i = 0; i < len; cp++, i++) {
+ h = (h >> 8) ^
+ zfs_crc64_table[(h ^ *cp) & 0xFF];
+ }
+ }
+ }
+ /*
+ * Don't use all 64 bits, since we need some in the cookie for
+ * the collision differentiator. We MUST use the high bits,
+ * since those are the ones that we first pay attention to when
+ * choosing the bucket.
+ */
+ h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1);
+
+ return (h);
+}
+
+static int
+zap_normalize(zap_t *zap, const char *name, char *namenorm, int normflags)
+{
+ ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY));
+
+ size_t inlen = strlen(name) + 1;
+ size_t outlen = ZAP_MAXNAMELEN;
+
+ int err = 0;
+ (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen,
+ normflags | U8_TEXTPREP_IGNORE_NULL | U8_TEXTPREP_IGNORE_INVALID,
+ U8_UNICODE_LATEST, &err);
+
+ return (err);
+}
+
+boolean_t
+zap_match(zap_name_t *zn, const char *matchname)
+{
+ ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY));
+
+ if (zn->zn_matchtype & MT_NORMALIZE) {
+ char norm[ZAP_MAXNAMELEN];
+
+ if (zap_normalize(zn->zn_zap, matchname, norm,
+ zn->zn_normflags) != 0)
+ return (B_FALSE);
+
+ return (strcmp(zn->zn_key_norm, norm) == 0);
+ } else {
+ return (strcmp(zn->zn_key_orig, matchname) == 0);
+ }
+}
+
+void
+zap_name_free(zap_name_t *zn)
+{
+ kmem_free(zn, sizeof (zap_name_t));
+}
+
+zap_name_t *
+zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt)
+{
+ zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP);
+
+ zn->zn_zap = zap;
+ zn->zn_key_intlen = sizeof (*key);
+ zn->zn_key_orig = key;
+ zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1;
+ zn->zn_matchtype = mt;
+ zn->zn_normflags = zap->zap_normflags;
+
+ /*
+ * If we're dealing with a case sensitive lookup on a mixed or
+ * insensitive fs, remove U8_TEXTPREP_TOUPPER or the lookup
+ * will fold case to all caps overriding the lookup request.
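 *
 * (Editor's note: e.g. an MT_MATCH_CASE lookup of "Readme" must compare
 * against stored names without case folding, even though the hash is
 * still computed from the zap_normflags-normalized form, which is why
 * only zn_normflags is adjusted below.)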
+ */ + if (mt & MT_MATCH_CASE) + zn->zn_normflags &= ~U8_TEXTPREP_TOUPPER; + + if (zap->zap_normflags) { + /* + * We *must* use zap_normflags because this normalization is + * what the hash is computed from. + */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zap->zap_normflags) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } else { + if (mt != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; + } + + zn->zn_hash = zap_hash(zn); + + if (zap->zap_normflags != zn->zn_normflags) { + /* + * We *must* use zn_normflags because this normalization is + * what the matching is based on. (Not the hash!) + */ + if (zap_normalize(zap, key, zn->zn_normbuf, + zn->zn_normflags) != 0) { + zap_name_free(zn); + return (NULL); + } + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; + } + + return (zn); +} + +static zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + ASSERT(zap->zap_normflags == 0); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = 0; + + zn->zn_hash = zap_hash(zn); + return (zn); +} + +static void +mzap_byteswap(mzap_phys_t *buf, size_t size) +{ + buf->mz_block_type = BSWAP_64(buf->mz_block_type); + buf->mz_salt = BSWAP_64(buf->mz_salt); + buf->mz_normflags = BSWAP_64(buf->mz_normflags); + int max = (size / MZAP_ENT_LEN) - 1; + for (int i = 0; i < max; i++) { + buf->mz_chunk[i].mze_value = + BSWAP_64(buf->mz_chunk[i].mze_value); + buf->mz_chunk[i].mze_cd = + BSWAP_32(buf->mz_chunk[i].mze_cd); + } +} + +void +zap_byteswap(void *buf, size_t size) +{ + uint64_t block_type = *(uint64_t *)buf; + + if (block_type == ZBT_MICRO || block_type == BSWAP_64(ZBT_MICRO)) { + /* ASSERT(magic == ZAP_LEAF_MAGIC); */ + mzap_byteswap(buf, size); + } else { + fzap_byteswap(buf, size); + } +} + +static int +mze_compare(const void *arg1, const void *arg2) +{ + const mzap_ent_t *mze1 = arg1; + const mzap_ent_t *mze2 = arg2; + + int cmp = TREE_CMP(mze1->mze_hash, mze2->mze_hash); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(mze1->mze_cd, mze2->mze_cd)); +} + +static void +mze_insert(zap_t *zap, int chunkid, uint64_t hash) +{ + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + mzap_ent_t *mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); + mze->mze_chunkid = chunkid; + mze->mze_hash = hash; + mze->mze_cd = MZE_PHYS(zap, mze)->mze_cd; + ASSERT(MZE_PHYS(zap, mze)->mze_name[0] != 0); + avl_add(&zap->zap_m.zap_avl, mze); +} + +static mzap_ent_t * +mze_find(zap_name_t *zn) +{ + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zn->zn_zap->zap_m.zap_avl; + + ASSERT(zn->zn_zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); + + mze_tofind.mze_hash = zn->zn_hash; + mze_tofind.mze_cd = 0; + + mze = avl_find(avl, &mze_tofind, &idx); + if (mze == NULL) + mze = avl_nearest(avl, idx, AVL_AFTER); + for (; mze && mze->mze_hash == zn->zn_hash; mze = AVL_NEXT(avl, mze)) { + ASSERT3U(mze->mze_cd, ==, MZE_PHYS(zn->zn_zap, mze)->mze_cd); + if (zap_match(zn, MZE_PHYS(zn->zn_zap, mze)->mze_name)) + return (mze); + } + + return (NULL); +} + +static uint32_t +mze_find_unused_cd(zap_t *zap, uint64_t hash) +{ + mzap_ent_t mze_tofind; + 
avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + + ASSERT(zap->zap_ismicro); + ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); + + mze_tofind.mze_hash = hash; + mze_tofind.mze_cd = 0; + + uint32_t cd = 0; + for (mzap_ent_t *mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + if (mze->mze_cd != cd) + break; + cd++; + } + + return (cd); +} + +/* + * Each mzap entry requires at max : 4 chunks + * 3 chunks for names + 1 chunk for value. + */ +#define MZAP_ENT_CHUNKS (1 + ZAP_LEAF_ARRAY_NCHUNKS(MZAP_NAME_LEN) + \ + ZAP_LEAF_ARRAY_NCHUNKS(sizeof (uint64_t))) + +/* + * Check if the current entry keeps the colliding entries under the fatzap leaf + * size. + */ +static boolean_t +mze_canfit_fzap_leaf(zap_name_t *zn, uint64_t hash) +{ + zap_t *zap = zn->zn_zap; + mzap_ent_t mze_tofind; + mzap_ent_t *mze; + avl_index_t idx; + avl_tree_t *avl = &zap->zap_m.zap_avl; + uint32_t mzap_ents = 0; + + mze_tofind.mze_hash = hash; + mze_tofind.mze_cd = 0; + + for (mze = avl_find(avl, &mze_tofind, &idx); + mze && mze->mze_hash == hash; mze = AVL_NEXT(avl, mze)) { + mzap_ents++; + } + + /* Include the new entry being added */ + mzap_ents++; + + return (ZAP_LEAF_NUMCHUNKS_DEF > (mzap_ents * MZAP_ENT_CHUNKS)); +} + +static void +mze_remove(zap_t *zap, mzap_ent_t *mze) +{ + ASSERT(zap->zap_ismicro); + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + avl_remove(&zap->zap_m.zap_avl, mze); + kmem_free(mze, sizeof (mzap_ent_t)); +} + +static void +mze_destroy(zap_t *zap) +{ + mzap_ent_t *mze; + void *avlcookie = NULL; + + while ((mze = avl_destroy_nodes(&zap->zap_m.zap_avl, &avlcookie))) + kmem_free(mze, sizeof (mzap_ent_t)); + avl_destroy(&zap->zap_m.zap_avl); +} + +static zap_t * +mzap_open(objset_t *os, uint64_t obj, dmu_buf_t *db) +{ + zap_t *winner; + uint64_t *zap_hdr = (uint64_t *)db->db_data; + uint64_t zap_block_type = zap_hdr[0]; + uint64_t zap_magic = zap_hdr[1]; + + ASSERT3U(MZAP_ENT_LEN, ==, sizeof (mzap_ent_phys_t)); + + zap_t *zap = kmem_zalloc(sizeof (zap_t), KM_SLEEP); + rw_init(&zap->zap_rwlock, NULL, RW_DEFAULT, NULL); + rw_enter(&zap->zap_rwlock, RW_WRITER); + zap->zap_objset = os; + zap->zap_object = obj; + zap->zap_dbuf = db; + + if (zap_block_type != ZBT_MICRO) { + mutex_init(&zap->zap_f.zap_num_entries_mtx, 0, MUTEX_DEFAULT, + 0); + zap->zap_f.zap_block_shift = highbit64(db->db_size) - 1; + if (zap_block_type != ZBT_HEADER || zap_magic != ZAP_MAGIC) { + winner = NULL; /* No actual winner here... */ + goto handle_winner; + } + } else { + zap->zap_ismicro = TRUE; + } + + /* + * Make sure that zap_ismicro is set before we let others see + * it, because zap_lockdir() checks zap_ismicro without the lock + * held. 
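 *
 * (Editor's note: see zap_lockdir_impl() below, which chooses its lock
 * type from zap_ismicro before taking zap_rwlock and then re-checks
 * that choice once the lock is held.)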
+ */
+ dmu_buf_init_user(&zap->zap_dbu, zap_evict_sync, NULL, &zap->zap_dbuf);
+ winner = dmu_buf_set_user(db, &zap->zap_dbu);
+
+ if (winner != NULL)
+ goto handle_winner;
+
+ if (zap->zap_ismicro) {
+ zap->zap_salt = zap_m_phys(zap)->mz_salt;
+ zap->zap_normflags = zap_m_phys(zap)->mz_normflags;
+ zap->zap_m.zap_num_chunks = db->db_size / MZAP_ENT_LEN - 1;
+ avl_create(&zap->zap_m.zap_avl, mze_compare,
+ sizeof (mzap_ent_t), offsetof(mzap_ent_t, mze_node));
+
+ for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) {
+ mzap_ent_phys_t *mze =
+ &zap_m_phys(zap)->mz_chunk[i];
+ if (mze->mze_name[0]) {
+ zap_name_t *zn;
+
+ zap->zap_m.zap_num_entries++;
+ zn = zap_name_alloc(zap, mze->mze_name, 0);
+ mze_insert(zap, i, zn->zn_hash);
+ zap_name_free(zn);
+ }
+ }
+ } else {
+ zap->zap_salt = zap_f_phys(zap)->zap_salt;
+ zap->zap_normflags = zap_f_phys(zap)->zap_normflags;
+
+ ASSERT3U(sizeof (struct zap_leaf_header), ==,
+ 2*ZAP_LEAF_CHUNKSIZE);
+
+ /*
+ * The embedded pointer table should not overlap the
+ * other members.
+ */
+ ASSERT3P(&ZAP_EMBEDDED_PTRTBL_ENT(zap, 0), >,
+ &zap_f_phys(zap)->zap_salt);
+
+ /*
+ * The embedded pointer table should end at the end of
+ * the block
+ */
+ ASSERT3U((uintptr_t)&ZAP_EMBEDDED_PTRTBL_ENT(zap,
+ 1<<ZAP_EMBEDDED_PTRTBL_SHIFT(zap)), ==,
+ (uintptr_t)zap_m_phys(zap) + zap->zap_dbuf->db_size);
+ }
+ rw_exit(&zap->zap_rwlock);
+ return (zap);
+
+handle_winner:
+ rw_exit(&zap->zap_rwlock);
+ rw_destroy(&zap->zap_rwlock);
+ if (!zap->zap_ismicro)
+ mutex_destroy(&zap->zap_f.zap_num_entries_mtx);
+ kmem_free(zap, sizeof (zap_t));
+ return (winner);
+}
+
+/*
+ * This routine "consumes" the caller's hold on the dbuf, which must
+ * have the specified tag.
+ */
+static int
+zap_lockdir_impl(dmu_buf_t *db, void *tag, dmu_tx_t *tx,
+ krw_t lti, boolean_t fatreader, boolean_t adding, zap_t **zapp)
+{
+ ASSERT0(db->db_offset);
+ objset_t *os = dmu_buf_get_objset(db);
+ uint64_t obj = db->db_object;
+ dmu_object_info_t doi;
+
+ *zapp = NULL;
+
+ dmu_object_info_from_db(db, &doi);
+ if (DMU_OT_BYTESWAP(doi.doi_type) != DMU_BSWAP_ZAP)
+ return (SET_ERROR(EINVAL));
+
+ zap_t *zap = dmu_buf_get_user(db);
+ if (zap == NULL) {
+ zap = mzap_open(os, obj, db);
+ if (zap == NULL) {
+ /*
+ * mzap_open() didn't like what it saw on-disk.
+ * Check for corruption!
+ */
+ return (SET_ERROR(EIO));
+ }
+ }
+
+ /*
+ * We're checking zap_ismicro without the lock held, in order to
+ * tell what type of lock we want. Once we have some sort of
+ * lock, see if it really is the right type. In practice this
+ * can only be different if it was upgraded from micro to fat,
+ * and micro wanted WRITER but fat only needs READER.
+ */
+ krw_t lt = (!zap->zap_ismicro && fatreader) ? RW_READER : lti;
+ rw_enter(&zap->zap_rwlock, lt);
+ if (lt != ((!zap->zap_ismicro && fatreader) ? RW_READER : lti)) {
+ /* it was upgraded, now we only need reader */
+ ASSERT(lt == RW_WRITER);
+ ASSERT(RW_READER ==
+ ((!zap->zap_ismicro && fatreader) ?
RW_READER : lti)); + rw_downgrade(&zap->zap_rwlock); + lt = RW_READER; + } + + zap->zap_objset = os; + + if (lt == RW_WRITER) + dmu_buf_will_dirty(db, tx); + + ASSERT3P(zap->zap_dbuf, ==, db); + + ASSERT(!zap->zap_ismicro || + zap->zap_m.zap_num_entries <= zap->zap_m.zap_num_chunks); + if (zap->zap_ismicro && tx && adding && + zap->zap_m.zap_num_entries == zap->zap_m.zap_num_chunks) { + uint64_t newsz = db->db_size + SPA_MINBLOCKSIZE; + if (newsz > MZAP_MAX_BLKSZ) { + dprintf("upgrading obj %llu: num_entries=%u\n", + obj, zap->zap_m.zap_num_entries); + *zapp = zap; + int err = mzap_upgrade(zapp, tag, tx, 0); + if (err != 0) + rw_exit(&zap->zap_rwlock); + return (err); + } + VERIFY0(dmu_object_set_blocksize(os, obj, newsz, 0, tx)); + zap->zap_m.zap_num_chunks = + db->db_size / MZAP_ENT_LEN - 1; + } + + *zapp = zap; + return (0); +} + +static int +zap_lockdir_by_dnode(dnode_t *dn, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + + int err = dmu_buf_hold_by_dnode(dn, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) { + return (err); + } +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif + + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) { + dmu_buf_rele(db, tag); + } + return (err); +} + +int +zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, + krw_t lti, boolean_t fatreader, boolean_t adding, void *tag, zap_t **zapp) +{ + dmu_buf_t *db; + + int err = dmu_buf_hold(os, obj, 0, tag, &db, DMU_READ_NO_PREFETCH); + if (err != 0) + return (err); +#ifdef ZFS_DEBUG + { + dmu_object_info_t doi; + dmu_object_info_from_db(db, &doi); + ASSERT3U(DMU_OT_BYTESWAP(doi.doi_type), ==, DMU_BSWAP_ZAP); + } +#endif + err = zap_lockdir_impl(db, tag, tx, lti, fatreader, adding, zapp); + if (err != 0) + dmu_buf_rele(db, tag); + return (err); +} + +void +zap_unlockdir(zap_t *zap, void *tag) +{ + rw_exit(&zap->zap_rwlock); + dmu_buf_rele(zap->zap_dbuf, tag); +} + +static int +mzap_upgrade(zap_t **zapp, void *tag, dmu_tx_t *tx, zap_flags_t flags) +{ + int err = 0; + zap_t *zap = *zapp; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + + int sz = zap->zap_dbuf->db_size; + mzap_phys_t *mzp = vmem_alloc(sz, KM_SLEEP); + bcopy(zap->zap_dbuf->db_data, mzp, sz); + int nchunks = zap->zap_m.zap_num_chunks; + + if (!flags) { + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err != 0) { + vmem_free(mzp, sz); + return (err); + } + } + + dprintf("upgrading obj=%llu with %u chunks\n", + zap->zap_object, nchunks); + /* XXX destroy the avl later, so we can use the stored hash value */ + mze_destroy(zap); + + fzap_upgrade(zap, tx, flags); + + for (int i = 0; i < nchunks; i++) { + mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; + if (mze->mze_name[0] == 0) + continue; + dprintf("adding %s=%llu\n", + mze->mze_name, mze->mze_value); + zap_name_t *zn = zap_name_alloc(zap, mze->mze_name, 0); + /* If we fail here, we would end up losing entries */ + VERIFY0(fzap_add_cd(zn, 8, 1, &mze->mze_value, mze->mze_cd, + tag, tx)); + zap = zn->zn_zap; /* fzap_add_cd() may change zap */ + zap_name_free(zn); + } + vmem_free(mzp, sz); + *zapp = zap; + return (0); +} + +/* + * The "normflags" determine the behavior of the matchtype_t which is + * passed to zap_lookup_norm(). 
Names which have the same normalized + * version will be stored with the same hash value, and therefore we can + * perform normalization-insensitive lookups. We can be Unicode form- + * insensitive and/or case-insensitive. The following flags are valid for + * "normflags": + * + * U8_TEXTPREP_NFC + * U8_TEXTPREP_NFD + * U8_TEXTPREP_NFKC + * U8_TEXTPREP_NFKD + * U8_TEXTPREP_TOUPPER + * + * The *_NF* (Normalization Form) flags are mutually exclusive; at most one + * of them may be supplied. + */ +void +mzap_create_impl(dnode_t *dn, int normflags, zap_flags_t flags, dmu_tx_t *tx) +{ + dmu_buf_t *db; + + VERIFY0(dmu_buf_hold_by_dnode(dn, 0, FTAG, &db, DMU_READ_NO_PREFETCH)); + + dmu_buf_will_dirty(db, tx); + mzap_phys_t *zp = db->db_data; + zp->mz_block_type = ZBT_MICRO; + zp->mz_salt = + ((uintptr_t)db ^ (uintptr_t)tx ^ (dn->dn_object << 1)) | 1ULL; + zp->mz_normflags = normflags; + + if (flags != 0) { + zap_t *zap; + /* Only fat zap supports flags; upgrade immediately. */ + VERIFY0(zap_lockdir_impl(db, FTAG, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY0(mzap_upgrade(&zap, FTAG, tx, flags)); + zap_unlockdir(zap, FTAG); + } else { + dmu_buf_rele(db, FTAG); + } +} + +static uint64_t +zap_create_impl(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) +{ + uint64_t obj; + + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + + if (allocated_dnode == NULL) { + dnode_t *dn; + obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + &dn, FTAG, tx); + mzap_create_impl(dn, normflags, flags, tx); + dnode_rele(dn, FTAG); + } else { + obj = dmu_object_alloc_hold(os, ot, 1ULL << leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + allocated_dnode, tag, tx); + mzap_create_impl(*allocated_dnode, normflags, flags, tx); + } + + return (obj); +} + +int +zap_create_claim(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_dnsize(os, obj, ot, bonustype, bonuslen, + 0, tx)); +} + +int +zap_create_claim_dnsize(objset_t *os, uint64_t obj, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, + 0, ot, bonustype, bonuslen, dnodesize, tx)); +} + +int +zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_claim_norm_dnsize(os, obj, normflags, ot, bonustype, + bonuslen, 0, tx)); +} + +int +zap_create_claim_norm_dnsize(objset_t *os, uint64_t obj, int normflags, + dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, + int dnodesize, dmu_tx_t *tx) +{ + dnode_t *dn; + int error; + + ASSERT3U(DMU_OT_BYTESWAP(ot), ==, DMU_BSWAP_ZAP); + error = dmu_object_claim_dnsize(os, obj, ot, 0, bonustype, bonuslen, + dnodesize, tx); + if (error != 0) + return (error); + + error = dnode_hold(os, obj, FTAG, &dn); + if (error != 0) + return (error); + + mzap_create_impl(dn, normflags, 0, tx); + + dnode_rele(dn, FTAG); + + return (0); +} + +uint64_t +zap_create(objset_t *os, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_norm(os, 0, ot, bonustype, bonuslen, tx)); +} + +uint64_t +zap_create_dnsize(objset_t *os, 
dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, 0, ot, bonustype, bonuslen, + dnodesize, tx)); +} + +uint64_t +zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_norm_dnsize(os, normflags, ot, bonustype, bonuslen, + 0, tx)); +} + +uint64_t +zap_create_norm_dnsize(objset_t *os, int normflags, dmu_object_type_t ot, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_impl(os, normflags, 0, ot, 0, 0, + bonustype, bonuslen, dnodesize, NULL, NULL, tx)); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + return (zap_create_flags_dnsize(os, normflags, flags, ot, + leaf_blockshift, indirect_blockshift, bonustype, bonuslen, 0, tx)); +} + +uint64_t +zap_create_flags_dnsize(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, dmu_tx_t *tx) +{ + return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, NULL, NULL, + tx)); +} + +/* + * Create a zap object and return a pointer to the newly allocated dnode via + * the allocated_dnode argument. The returned dnode will be held and the + * caller is responsible for releasing the hold by calling dnode_rele(). + */ +uint64_t +zap_create_hold(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, int dnodesize, + dnode_t **allocated_dnode, void *tag, dmu_tx_t *tx) +{ + return (zap_create_impl(os, normflags, flags, ot, leaf_blockshift, + indirect_blockshift, bonustype, bonuslen, dnodesize, + allocated_dnode, tag, tx)); +} + +int +zap_destroy(objset_t *os, uint64_t zapobj, dmu_tx_t *tx) +{ + /* + * dmu_object_free will free the object number and free the + * data. Freeing the data will cause our pageout function to be + * called, which will destroy our data (zap_leaf_t's and zap_t). + */ + + return (dmu_object_free(os, zapobj, tx)); +} + +void +zap_evict_sync(void *dbu) +{ + zap_t *zap = dbu; + + rw_destroy(&zap->zap_rwlock); + + if (zap->zap_ismicro) + mze_destroy(zap); + else + mutex_destroy(&zap->zap_f.zap_num_entries_mtx); + + kmem_free(zap, sizeof (zap_t)); +} + +int +zap_count(objset_t *os, uint64_t zapobj, uint64_t *count) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + if (!zap->zap_ismicro) { + err = fzap_count(zap, count); + } else { + *count = zap->zap_m.zap_num_entries; + } + zap_unlockdir(zap, FTAG); + return (err); +} + +/* + * zn may be NULL; if not specified, it will be computed if needed. + * See also the comment above zap_entry_normalization_conflict(). 
+ */ +static boolean_t +mzap_normalization_conflict(zap_t *zap, zap_name_t *zn, mzap_ent_t *mze) +{ + int direction = AVL_BEFORE; + boolean_t allocdzn = B_FALSE; + + if (zap->zap_normflags == 0) + return (B_FALSE); + +again: + for (mzap_ent_t *other = avl_walk(&zap->zap_m.zap_avl, mze, direction); + other && other->mze_hash == mze->mze_hash; + other = avl_walk(&zap->zap_m.zap_avl, other, direction)) { + + if (zn == NULL) { + zn = zap_name_alloc(zap, MZE_PHYS(zap, mze)->mze_name, + MT_NORMALIZE); + allocdzn = B_TRUE; + } + if (zap_match(zn, MZE_PHYS(zap, other)->mze_name)) { + if (allocdzn) + zap_name_free(zn); + return (B_TRUE); + } + } + + if (direction == AVL_BEFORE) { + direction = AVL_AFTER; + goto again; + } + + if (allocdzn) + zap_name_free(zn); + return (B_FALSE); +} + +/* + * Routines for manipulating attributes. + */ + +int +zap_lookup(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm(os, zapobj, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +static int +zap_lookup_impl(zap_t *zap, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + int err = 0; + + zap_name_t *zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); + + if (!zap->zap_ismicro) { + err = fzap_lookup(zn, integer_size, num_integers, buf, + realname, rn_len, ncp); + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (num_integers < 1) { + err = SET_ERROR(EOVERFLOW); + } else if (integer_size != 8) { + err = SET_ERROR(EINVAL); + } else { + *(uint64_t *)buf = + MZE_PHYS(zap, mze)->mze_value; + (void) strlcpy(realname, + MZE_PHYS(zap, mze)->mze_name, rn_len); + if (ncp) { + *ncp = mzap_normalization_conflict(zap, + zn, mze); + } + } + } + } + zap_name_free(zn); + return (err); +} + +int +zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_prefetch(objset_t *os, uint64_t zapobj, const char *name) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + zn = zap_name_alloc(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_lookup_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf) +{ + return (zap_lookup_norm_by_dnode(dn, name, integer_size, + num_integers, buf, 0, NULL, 0, NULL)); +} + +int +zap_lookup_norm_by_dnode(dnode_t *dn, const char *name, + uint64_t integer_size, uint64_t num_integers, void *buf, + matchtype_t mt, char *realname, int rn_len, + boolean_t *ncp) +{ + zap_t *zap; + + int err = zap_lockdir_by_dnode(dn, NULL, RW_READER, TRUE, FALSE, + FTAG, &zap); + if (err != 0) + return (err); + err = zap_lookup_impl(zap, name, integer_size, + num_integers, buf, mt, realname, rn_len, ncp); + zap_unlockdir(zap, FTAG); + return (err); +} 
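+
+/*
+ * Editor's note (not part of the upstream file): a minimal sketch of the
+ * string-keyed lookup API above, assuming an objset "os" and a ZAP object
+ * "zapobj" whose entries are single uint64 values; the attribute name
+ * "refcount" is hypothetical:
+ *
+ *	uint64_t val;
+ *	int err = zap_lookup(os, zapobj, "refcount",
+ *	    sizeof (uint64_t), 1, &val);
+ *
+ * A zero return fills in val; ENOENT means the name is absent, while
+ * EOVERFLOW or EINVAL indicate a size mismatch with the stored value
+ * (compare zap_contains(), which treats those two results as "found").
+ */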
+ +int +zap_prefetch_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + fzap_prefetch(zn); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, 0, NULL, 0, NULL); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + +int +zap_length(objset_t *os, uint64_t zapobj, const char *name, + uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_length(zn, integer_size, num_integers); + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + if (integer_size) + *integer_size = 8; + if (num_integers) + *num_integers = 1; + } + } + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +static void +mzap_addent(zap_name_t *zn, uint64_t value) +{ + zap_t *zap = zn->zn_zap; + int start = zap->zap_m.zap_alloc_next; + + ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); + +#ifdef ZFS_DEBUG + for (int i = 0; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; + ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); + } +#endif + + uint32_t cd = mze_find_unused_cd(zap, zn->zn_hash); + /* given the limited size of the microzap, this can't happen */ + ASSERT(cd < zap_maxcd(zap)); + +again: + for (int i = start; i < zap->zap_m.zap_num_chunks; i++) { + mzap_ent_phys_t *mze = &zap_m_phys(zap)->mz_chunk[i]; + if (mze->mze_name[0] == 0) { + mze->mze_value = value; + mze->mze_cd = cd; + (void) strlcpy(mze->mze_name, zn->zn_key_orig, + sizeof (mze->mze_name)); + zap->zap_m.zap_num_entries++; + zap->zap_m.zap_alloc_next = i+1; + if (zap->zap_m.zap_alloc_next == + zap->zap_m.zap_num_chunks) + zap->zap_m.zap_alloc_next = 
0; + mze_insert(zap, i, zn->zn_hash); + return; + } + } + if (start != 0) { + start = 0; + goto again; + } + cmn_err(CE_PANIC, "out of entries!"); +} + +static int +zap_add_impl(zap_t *zap, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx, void *tag) +{ + const uint64_t *intval = val; + int err = 0; + + zap_name_t *zn = zap_name_alloc(zap, key, 0); + if (zn == NULL) { + zap_unlockdir(zap, tag); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_add(zn, integer_size, num_integers, val, tag, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(key) >= MZAP_NAME_LEN || + !mze_canfit_fzap_leaf(zn, zn->zn_hash)) { + err = mzap_upgrade(&zn->zn_zap, tag, tx, 0); + if (err == 0) { + err = fzap_add(zn, integer_size, num_integers, val, + tag, tx); + } + zap = zn->zn_zap; /* fzap_add() may change zap */ + } else { + if (mze_find(zn) != NULL) { + err = SET_ERROR(EEXIST); + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, tag); + return (err); +} + +int +zap_add(objset_t *os, uint64_t zapobj, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_by_dnode(dnode_t *dn, const char *key, + int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + err = zap_add_impl(zap, key, integer_size, num_integers, val, tx, FTAG); + /* zap_add_impl() calls zap_unlockdir() */ + return (err); +} + +int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_add(zn, integer_size, num_integers, val, FTAG, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_update(objset_t *os, uint64_t zapobj, const char *name, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + const uint64_t *intval = val; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc(zap, name, 0); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + if (!zap->zap_ismicro) { + err = fzap_update(zn, integer_size, num_integers, val, + FTAG, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else if (integer_size != 8 || num_integers != 1 || + strlen(name) >= MZAP_NAME_LEN) { + dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", + zapobj, integer_size, num_integers, name); + err = mzap_upgrade(&zn->zn_zap, FTAG, tx, 0); + if 
(err == 0) { + err = fzap_update(zn, integer_size, num_integers, + val, FTAG, tx); + } + zap = zn->zn_zap; /* fzap_update() may change zap */ + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze != NULL) { + MZE_PHYS(zap, mze)->mze_value = *intval; + } else { + mzap_addent(zn, *intval); + } + } + ASSERT(zap == zn->zn_zap); + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_update(zn, integer_size, num_integers, val, FTAG, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) +{ + return (zap_remove_norm(os, zapobj, name, 0, tx)); +} + +static int +zap_remove_impl(zap_t *zap, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + int err = 0; + + zap_name_t *zn = zap_name_alloc(zap, name, mt); + if (zn == NULL) + return (SET_ERROR(ENOTSUP)); + if (!zap->zap_ismicro) { + err = fzap_remove(zn, tx); + } else { + mzap_ent_t *mze = mze_find(zn); + if (mze == NULL) { + err = SET_ERROR(ENOENT); + } else { + zap->zap_m.zap_num_entries--; + bzero(&zap_m_phys(zap)->mz_chunk[mze->mze_chunkid], + sizeof (mzap_ent_phys_t)); + mze_remove(zap, mze); + } + } + zap_name_free(zn); + return (err); +} + +int +zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, + matchtype_t mt, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, mt, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove_by_dnode(dnode_t *dn, const char *name, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + + err = zap_lockdir_by_dnode(dn, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err) + return (err); + err = zap_remove_impl(zap, name, 0, tx); + zap_unlockdir(zap, FTAG); + return (err); +} + +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + zap_name_t *zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap, FTAG); + return (SET_ERROR(ENOTSUP)); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap, FTAG); + return (err); +} + +/* + * Routines for iterating over the attributes. 
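 *
 * Editor's note, the typical iteration pattern (a sketch, not upstream
 * text):
 *
 *	zap_cursor_t zc;
 *	zap_attribute_t za;
 *	for (zap_cursor_init(&zc, os, zapobj);
 *	    zap_cursor_retrieve(&zc, &za) == 0;
 *	    zap_cursor_advance(&zc))
 *		... use za.za_name, za.za_first_integer ...
 *	zap_cursor_fini(&zc);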
+ */ + +static void +zap_cursor_init_impl(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized, boolean_t prefetch) +{ + zc->zc_objset = os; + zc->zc_zap = NULL; + zc->zc_leaf = NULL; + zc->zc_zapobj = zapobj; + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; + zc->zc_prefetch = prefetch; +} +void +zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, + uint64_t serialized) +{ + zap_cursor_init_impl(zc, os, zapobj, serialized, B_TRUE); +} + +/* + * Initialize a cursor at the beginning of the ZAP object. The entire + * ZAP object will be prefetched. + */ +void +zap_cursor_init(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_impl(zc, os, zapobj, 0, B_TRUE); +} + +/* + * Initialize a cursor at the beginning, but request that we not prefetch + * the entire ZAP object. + */ +void +zap_cursor_init_noprefetch(zap_cursor_t *zc, objset_t *os, uint64_t zapobj) +{ + zap_cursor_init_impl(zc, os, zapobj, 0, B_FALSE); +} + +void +zap_cursor_fini(zap_cursor_t *zc) +{ + if (zc->zc_zap) { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + zap_unlockdir(zc->zc_zap, NULL); + zc->zc_zap = NULL; + } + if (zc->zc_leaf) { + rw_enter(&zc->zc_leaf->l_rwlock, RW_READER); + zap_put_leaf(zc->zc_leaf); + zc->zc_leaf = NULL; + } + zc->zc_objset = NULL; +} + +uint64_t +zap_cursor_serialize(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return (-1ULL); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. + * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); +} + +int +zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) +{ + int err; + + if (zc->zc_hash == -1ULL) + return (SET_ERROR(ENOENT)); + + if (zc->zc_zap == NULL) { + int hb; + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, NULL, &zc->zc_zap); + if (err != 0) + return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. 
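 *
 * (Editor's note, a worked example with the default 28 hash bits: a
 * serialized value s unpacks as zc_hash = s << 36 and zc_cd += s >> 28,
 * inverting the packing performed by zap_cursor_serialize() above.)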
+ */ + ASSERT(zc->zc_hash == 0); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); + } + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_retrieve(zc->zc_zap, zc, za); + } else { + avl_index_t idx; + mzap_ent_t mze_tofind; + + mze_tofind.mze_hash = zc->zc_hash; + mze_tofind.mze_cd = zc->zc_cd; + + mzap_ent_t *mze = + avl_find(&zc->zc_zap->zap_m.zap_avl, &mze_tofind, &idx); + if (mze == NULL) { + mze = avl_nearest(&zc->zc_zap->zap_m.zap_avl, + idx, AVL_AFTER); + } + if (mze) { + mzap_ent_phys_t *mzep = MZE_PHYS(zc->zc_zap, mze); + ASSERT3U(mze->mze_cd, ==, mzep->mze_cd); + za->za_normalization_conflict = + mzap_normalization_conflict(zc->zc_zap, NULL, mze); + za->za_integer_length = 8; + za->za_num_integers = 1; + za->za_first_integer = mzep->mze_value; + (void) strlcpy(za->za_name, mzep->mze_name, + sizeof (za->za_name)); + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_cd; + err = 0; + } else { + zc->zc_hash = -1ULL; + err = SET_ERROR(ENOENT); + } + } + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); +} + +void +zap_cursor_advance(zap_cursor_t *zc) +{ + if (zc->zc_hash == -1ULL) + return; + zc->zc_cd++; +} + +int +zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) +{ + zap_t *zap; + + int err = + zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, FTAG, &zap); + if (err != 0) + return (err); + + bzero(zs, sizeof (zap_stats_t)); + + if (zap->zap_ismicro) { + zs->zs_blocksize = zap->zap_dbuf->db_size; + zs->zs_num_entries = zap->zap_m.zap_num_entries; + zs->zs_num_blocks = 1; + } else { + fzap_get_stats(zap, zs); + } + zap_unlockdir(zap, FTAG); + return (0); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zap_create); +EXPORT_SYMBOL(zap_create_dnsize); +EXPORT_SYMBOL(zap_create_norm); +EXPORT_SYMBOL(zap_create_norm_dnsize); +EXPORT_SYMBOL(zap_create_flags); +EXPORT_SYMBOL(zap_create_flags_dnsize); +EXPORT_SYMBOL(zap_create_claim); +EXPORT_SYMBOL(zap_create_claim_norm); +EXPORT_SYMBOL(zap_create_claim_norm_dnsize); +EXPORT_SYMBOL(zap_create_hold); +EXPORT_SYMBOL(zap_destroy); +EXPORT_SYMBOL(zap_lookup); +EXPORT_SYMBOL(zap_lookup_by_dnode); +EXPORT_SYMBOL(zap_lookup_norm); +EXPORT_SYMBOL(zap_lookup_uint64); +EXPORT_SYMBOL(zap_contains); +EXPORT_SYMBOL(zap_prefetch); +EXPORT_SYMBOL(zap_prefetch_uint64); +EXPORT_SYMBOL(zap_add); +EXPORT_SYMBOL(zap_add_by_dnode); +EXPORT_SYMBOL(zap_add_uint64); +EXPORT_SYMBOL(zap_update); +EXPORT_SYMBOL(zap_update_uint64); +EXPORT_SYMBOL(zap_length); +EXPORT_SYMBOL(zap_length_uint64); +EXPORT_SYMBOL(zap_remove); +EXPORT_SYMBOL(zap_remove_by_dnode); +EXPORT_SYMBOL(zap_remove_norm); +EXPORT_SYMBOL(zap_remove_uint64); +EXPORT_SYMBOL(zap_count); +EXPORT_SYMBOL(zap_value_search); +EXPORT_SYMBOL(zap_join); +EXPORT_SYMBOL(zap_join_increment); +EXPORT_SYMBOL(zap_add_int); +EXPORT_SYMBOL(zap_remove_int); +EXPORT_SYMBOL(zap_lookup_int); +EXPORT_SYMBOL(zap_increment_int); +EXPORT_SYMBOL(zap_add_int_key); +EXPORT_SYMBOL(zap_lookup_int_key); +EXPORT_SYMBOL(zap_increment); +EXPORT_SYMBOL(zap_cursor_init); +EXPORT_SYMBOL(zap_cursor_fini); +EXPORT_SYMBOL(zap_cursor_retrieve); +EXPORT_SYMBOL(zap_cursor_advance); +EXPORT_SYMBOL(zap_cursor_serialize); +EXPORT_SYMBOL(zap_cursor_init_serialized); +EXPORT_SYMBOL(zap_get_stats); +#endif diff --git a/module/zfs/zcp.c b/module/zfs/zcp.c new file mode 100644 index 000000000000..793e0e4f0b75 --- /dev/null +++ 
b/module/zfs/zcp.c
@@ -0,0 +1,1456 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2018 by Delphix. All rights reserved.
+ */
+
+/*
+ * ZFS Channel Programs (ZCP)
+ *
+ * The ZCP interface allows various ZFS administrative operations (e.g.
+ * creating and destroying snapshots, typically performed via an ioctl to
+ * /dev/zfs by the zfs(8) command and libzfs/libzfs_core) to be run
+ * programmatically as a Lua script. A ZCP script is run as a dsl_sync_task
+ * and fully executed during one transaction group sync. This ensures that
+ * no other changes can be written concurrently with a running Lua script.
+ * Combining multiple calls to the exposed ZFS functions into one script
+ * gives a number of benefits:
+ *
+ * 1. Atomicity. For some compound or iterative operations, it's useful to be
+ * able to guarantee that the state of a pool has not changed between calls to
+ * ZFS.
+ *
+ * 2. Performance. If a large number of changes need to be made (e.g. deleting
+ * many filesystems), there can be a significant performance penalty as a
+ * result of the need to wait for a transaction group sync to pass for every
+ * single operation. When expressed as a single ZCP script, all these changes
+ * can be performed at once in one txg sync.
+ *
+ * A modified version of the Lua 5.2 interpreter is used to run channel program
+ * scripts. The Lua 5.2 manual can be found at:
+ *
+ * http://www.lua.org/manual/5.2/
+ *
+ * If being run by a user (via an ioctl syscall), executing a ZCP script
+ * requires root privileges in the global zone.
+ *
+ * Scripts are passed to zcp_eval() as a string, then run in a synctask by
+ * zcp_eval_sync(). Arguments can be passed into the Lua script as an nvlist,
+ * which will be converted to a Lua table. Similarly, values returned from
+ * a ZCP script will be converted to an nvlist. See zcp_lua_to_nvlist_impl()
+ * for details on exact allowed types and conversion.
+ *
+ * ZFS functionality is exposed to a ZCP script as a library of function calls.
+ * These calls are sorted into submodules, such as zfs.list and zfs.sync, for
+ * iterators and synctasks, respectively. Each of these submodules resides in
+ * its own source file, with a zcp_*_info structure describing each library
+ * call in the submodule.
+ *
+ * Errors in ZCP scripts are handled by a number of different methods
+ * based on severity:
+ *
+ * 1. Memory and time limits are in place to prevent a channel program from
+ * consuming excessive system memory or running forever. If one of these
+ * limits is hit, the channel program will be stopped immediately and return
+ * from zcp_eval() with an error code. No attempt will be made to roll back or
+ * undo any changes made by the channel program before the error occurred.
+ * Consumers invoking zcp_eval() from elsewhere in the kernel may pass a time
+ * limit of 0, disabling the time limit.
+ *
+ * 2. Internal Lua errors can occur as a result of a syntax error, calling a
+ * library function with incorrect arguments, invoking the error() function,
+ * failing an assert(), or other runtime errors. In these cases the channel
+ * program will stop executing and return from zcp_eval() with an error code.
+ * In place of a return value, an error message will also be returned in the
+ * 'result' nvlist containing information about the error. No attempt will be
+ * made to roll back or undo any changes made by the channel program before the
+ * error occurred.
+ *
+ * 3. If an error occurs inside a ZFS library call which returns an error code,
+ * the error is returned to the Lua script to be handled as desired.
+ *
+ * In the first two cases, Lua's error-throwing mechanism is used, which
+ * longjumps out of the script execution with luaL_error() and returns with the
+ * error.
+ *
+ * See zfs-program(8) for more information on high level usage.
+ */
+
+#include <sys/lua/lua.h>
+#include <sys/lua/lualib.h>
+#include <sys/lua/lauxlib.h>
+
+#include <sys/dsl_prop.h>
+#include <sys/dsl_synctask.h>
+#include <sys/dsl_dataset.h>
+#include <sys/zcp.h>
+#include <sys/zcp_iter.h>
+#include <sys/zcp_global.h>
+#include <sys/zcp_prop.h>
+#include <sys/zap.h>
+
+#ifndef KM_NORMALPRI
+#define KM_NORMALPRI 0
+#endif
+
+#define ZCP_NVLIST_MAX_DEPTH 20
+
+uint64_t zfs_lua_check_instrlimit_interval = 100;
+unsigned long zfs_lua_max_instrlimit = ZCP_MAX_INSTRLIMIT;
+unsigned long zfs_lua_max_memlimit = ZCP_MAX_MEMLIMIT;
+
+/*
+ * Forward declarations for mutually recursive functions
+ */
+static int zcp_nvpair_value_to_lua(lua_State *, nvpair_t *, char *, int);
+static int zcp_lua_to_nvlist_impl(lua_State *, int, nvlist_t *, const char *,
+ int);
+
+/*
+ * The outer-most error callback handler for use with lua_pcall(). On
+ * error Lua will call this callback with a single argument that
+ * represents the error value. In most cases this will be a string
+ * containing an error message, but channel programs can use Lua's
+ * error() function to return arbitrary objects as errors. This callback
+ * returns (on the Lua stack) the original error object along with a traceback.
+ *
+ * Fatal Lua errors can occur while resources are held, so we also call any
+ * registered cleanup function here.
+ */
+static int
+zcp_error_handler(lua_State *state)
+{
+ const char *msg;
+
+ zcp_cleanup(state);
+
+ VERIFY3U(1, ==, lua_gettop(state));
+ msg = lua_tostring(state, 1);
+ luaL_traceback(state, state, msg, 1);
+ return (1);
+}
+
+int
+zcp_argerror(lua_State *state, int narg, const char *msg, ...)
+{
+ va_list alist;
+
+ va_start(alist, msg);
+ const char *buf = lua_pushvfstring(state, msg, alist);
+ va_end(alist);
+
+ return (luaL_argerror(state, narg, buf));
+}
+
+/*
+ * Install a new cleanup function, which will be invoked with the given
+ * opaque argument if a fatal error causes the Lua interpreter to longjump out
+ * of a function call.
+ *
+ * If an error occurs, the cleanup function will be invoked exactly once and
+ * then unregistered.
+ *
+ * Returns the registered cleanup handler so the caller can deregister it
+ * if no error occurs.
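 *
 * Editor's note, the usual pairing (a sketch; my_cleanup and my_arg are
 * hypothetical):
 *
 *	zcp_cleanup_handler_t *zch =
 *	    zcp_register_cleanup(state, my_cleanup, my_arg);
 *	... calls that may longjump out of the interpreter on error ...
 *	zcp_deregister_cleanup(state, zch);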
+ */ +zcp_cleanup_handler_t * +zcp_register_cleanup(lua_State *state, zcp_cleanup_t cleanfunc, void *cleanarg) +{ + zcp_run_info_t *ri = zcp_run_info(state); + + zcp_cleanup_handler_t *zch = kmem_alloc(sizeof (*zch), KM_SLEEP); + zch->zch_cleanup_func = cleanfunc; + zch->zch_cleanup_arg = cleanarg; + list_insert_head(&ri->zri_cleanup_handlers, zch); + + return (zch); +} + +void +zcp_deregister_cleanup(lua_State *state, zcp_cleanup_handler_t *zch) +{ + zcp_run_info_t *ri = zcp_run_info(state); + list_remove(&ri->zri_cleanup_handlers, zch); + kmem_free(zch, sizeof (*zch)); +} + +/* + * Execute the currently registered cleanup handlers then free them and + * destroy the handler list. + */ +void +zcp_cleanup(lua_State *state) +{ + zcp_run_info_t *ri = zcp_run_info(state); + + for (zcp_cleanup_handler_t *zch = + list_remove_head(&ri->zri_cleanup_handlers); zch != NULL; + zch = list_remove_head(&ri->zri_cleanup_handlers)) { + zch->zch_cleanup_func(zch->zch_cleanup_arg); + kmem_free(zch, sizeof (*zch)); + } +} + +/* + * Convert the lua table at the given index on the Lua stack to an nvlist + * and return it. + * + * If the table can not be converted for any reason, NULL is returned and + * an error message is pushed onto the Lua stack. + */ +static nvlist_t * +zcp_table_to_nvlist(lua_State *state, int index, int depth) +{ + nvlist_t *nvl; + /* + * Converting a Lua table to an nvlist with key uniqueness checking is + * O(n^2) in the number of keys in the nvlist, which can take a long + * time when we return a large table from a channel program. + * Furthermore, Lua's table interface *almost* guarantees unique keys + * on its own (details below). Therefore, we don't use fnvlist_alloc() + * here to avoid the built-in uniqueness checking. + * + * The *almost* is because it's possible to have key collisions between + * e.g. the string "1" and the number 1, or the string "true" and the + * boolean true, so we explicitly check that when we're looking at a + * key which is an integer / boolean or a string that can be parsed as + * one of those types. In the worst case this could still devolve into + * O(n^2), so we only start doing these checks on boolean/integer keys + * once we've seen a string key which fits this weird usage pattern. + * + * Ultimately, we still want callers to know that the keys in this + * nvlist are unique, so before we return this we set the nvlist's + * flags to reflect that. + */ + VERIFY0(nvlist_alloc(&nvl, 0, KM_SLEEP)); + + /* + * Push an empty stack slot where lua_next() will store each + * table key. + */ + lua_pushnil(state); + boolean_t saw_str_could_collide = B_FALSE; + while (lua_next(state, index) != 0) { + /* + * The next key-value pair from the table at index is + * now on the stack, with the key at stack slot -2 and + * the value at slot -1. + */ + int err = 0; + char buf[32]; + const char *key = NULL; + boolean_t key_could_collide = B_FALSE; + + switch (lua_type(state, -2)) { + case LUA_TSTRING: + key = lua_tostring(state, -2); + + /* check if this could collide with a number or bool */ + long long tmp; + int parselen; + if ((sscanf(key, "%lld%n", &tmp, &parselen) > 0 && + parselen == strlen(key)) || + strcmp(key, "true") == 0 || + strcmp(key, "false") == 0) { + key_could_collide = B_TRUE; + saw_str_could_collide = B_TRUE; + } + break; + case LUA_TBOOLEAN: + key = (lua_toboolean(state, -2) == B_TRUE ? 
+ "true" : "false"); + if (saw_str_could_collide) { + key_could_collide = B_TRUE; + } + break; + case LUA_TNUMBER: + VERIFY3U(sizeof (buf), >, + snprintf(buf, sizeof (buf), "%lld", + (longlong_t)lua_tonumber(state, -2))); + key = buf; + if (saw_str_could_collide) { + key_could_collide = B_TRUE; + } + break; + default: + fnvlist_free(nvl); + (void) lua_pushfstring(state, "Invalid key " + "type '%s' in table", + lua_typename(state, lua_type(state, -2))); + return (NULL); + } + /* + * Check for type-mismatched key collisions, and throw an error. + */ + if (key_could_collide && nvlist_exists(nvl, key)) { + fnvlist_free(nvl); + (void) lua_pushfstring(state, "Collision of " + "key '%s' in table", key); + return (NULL); + } + /* + * Recursively convert the table value and insert into + * the new nvlist with the parsed key. To prevent + * stack overflow on circular or heavily nested tables, + * we track the current nvlist depth. + */ + if (depth >= ZCP_NVLIST_MAX_DEPTH) { + fnvlist_free(nvl); + (void) lua_pushfstring(state, "Maximum table " + "depth (%d) exceeded for table", + ZCP_NVLIST_MAX_DEPTH); + return (NULL); + } + err = zcp_lua_to_nvlist_impl(state, -1, nvl, key, + depth + 1); + if (err != 0) { + fnvlist_free(nvl); + /* + * Error message has been pushed to the lua + * stack by the recursive call. + */ + return (NULL); + } + /* + * Pop the value pushed by lua_next(). + */ + lua_pop(state, 1); + } + + /* + * Mark the nvlist as having unique keys. This is a little ugly, but we + * ensured above that there are no duplicate keys in the nvlist. + */ + nvl->nvl_nvflag |= NV_UNIQUE_NAME; + + return (nvl); +} + +/* + * Convert a value from the given index into the lua stack to an nvpair, adding + * it to an nvlist with the given key. + * + * Values are converted as follows: + * + * string -> string + * number -> int64 + * boolean -> boolean + * nil -> boolean (no value) + * + * Lua tables are converted to nvlists and then inserted. The table's keys + * are converted to strings then used as keys in the nvlist to store each table + * element. Keys are converted as follows: + * + * string -> no change + * number -> "%lld" + * boolean -> "true" | "false" + * nil -> error + * + * In the case of a key collision, an error is thrown. + * + * If an error is encountered, a nonzero error code is returned, and an error + * string will be pushed onto the Lua stack. + */ +static int +zcp_lua_to_nvlist_impl(lua_State *state, int index, nvlist_t *nvl, + const char *key, int depth) +{ + /* + * Verify that we have enough remaining space in the lua stack to parse + * a key-value pair and push an error. 
+ */ + if (!lua_checkstack(state, 3)) { + (void) lua_pushstring(state, "Lua stack overflow"); + return (1); + } + + index = lua_absindex(state, index); + + switch (lua_type(state, index)) { + case LUA_TNIL: + fnvlist_add_boolean(nvl, key); + break; + case LUA_TBOOLEAN: + fnvlist_add_boolean_value(nvl, key, + lua_toboolean(state, index)); + break; + case LUA_TNUMBER: + fnvlist_add_int64(nvl, key, lua_tonumber(state, index)); + break; + case LUA_TSTRING: + fnvlist_add_string(nvl, key, lua_tostring(state, index)); + break; + case LUA_TTABLE: { + nvlist_t *value_nvl = zcp_table_to_nvlist(state, index, depth); + if (value_nvl == NULL) + return (SET_ERROR(EINVAL)); + + fnvlist_add_nvlist(nvl, key, value_nvl); + fnvlist_free(value_nvl); + break; + } + default: + (void) lua_pushfstring(state, + "Invalid value type '%s' for key '%s'", + lua_typename(state, lua_type(state, index)), key); + return (SET_ERROR(EINVAL)); + } + + return (0); +} + +/* + * Convert a lua value to an nvpair, adding it to an nvlist with the given key. + */ +static void +zcp_lua_to_nvlist(lua_State *state, int index, nvlist_t *nvl, const char *key) +{ + /* + * On error, zcp_lua_to_nvlist_impl pushes an error string onto the Lua + * stack before returning with a nonzero error code. If an error is + * returned, throw a fatal lua error with the given string. + */ + if (zcp_lua_to_nvlist_impl(state, index, nvl, key, 0) != 0) + (void) lua_error(state); +} + +static int +zcp_lua_to_nvlist_helper(lua_State *state) +{ + nvlist_t *nv = (nvlist_t *)lua_touserdata(state, 2); + const char *key = (const char *)lua_touserdata(state, 1); + zcp_lua_to_nvlist(state, 3, nv, key); + return (0); +} + +static void +zcp_convert_return_values(lua_State *state, nvlist_t *nvl, + const char *key, int *result) +{ + int err; + VERIFY3U(1, ==, lua_gettop(state)); + lua_pushcfunction(state, zcp_lua_to_nvlist_helper); + lua_pushlightuserdata(state, (char *)key); + lua_pushlightuserdata(state, nvl); + lua_pushvalue(state, 1); + lua_remove(state, 1); + err = lua_pcall(state, 3, 0, 0); /* zcp_lua_to_nvlist_helper */ + if (err != 0) { + zcp_lua_to_nvlist(state, 1, nvl, ZCP_RET_ERROR); + *result = SET_ERROR(ECHRNG); + } +} + +/* + * Push a Lua table representing nvl onto the stack. If it can't be + * converted, return EINVAL, fill in errbuf, and push nothing. errbuf may + * be specified as NULL, in which case no error string will be output. + * + * Most nvlists are converted as simple key->value Lua tables, but we make + * an exception for the case where all nvlist entries are BOOLEANs (a string + * key without a value). In Lua, a table key pointing to a value of Nil + * (no value) is equivalent to the key not existing, so a BOOLEAN nvlist + * entry can't be directly converted to a Lua table entry. Nvlists of entirely + * BOOLEAN entries are frequently used to pass around lists of datasets, so for + * convenience we check for this case, and convert it to a simple Lua array of + * strings. + */ +int +zcp_nvlist_to_lua(lua_State *state, nvlist_t *nvl, + char *errbuf, int errbuf_len) +{ + nvpair_t *pair; + lua_newtable(state); + boolean_t has_values = B_FALSE; + /* + * If the list doesn't have any values, just convert it to a string + * array. 
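+	 * For example (illustrative), an nvlist whose entries are all
+	 * BOOLEANs named "a", "b" and "c" becomes the Lua array
+	 * {"a", "b", "c"}, while a mixed nvlist becomes an ordinary
+	 * key -> value table.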
+	 */
+	for (pair = nvlist_next_nvpair(nvl, NULL);
+	    pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+		if (nvpair_type(pair) != DATA_TYPE_BOOLEAN) {
+			has_values = B_TRUE;
+			break;
+		}
+	}
+	if (!has_values) {
+		int i = 1;
+		for (pair = nvlist_next_nvpair(nvl, NULL);
+		    pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+			(void) lua_pushinteger(state, i);
+			(void) lua_pushstring(state, nvpair_name(pair));
+			(void) lua_settable(state, -3);
+			i++;
+		}
+	} else {
+		for (pair = nvlist_next_nvpair(nvl, NULL);
+		    pair != NULL; pair = nvlist_next_nvpair(nvl, pair)) {
+			int err = zcp_nvpair_value_to_lua(state, pair,
+			    errbuf, errbuf_len);
+			if (err != 0) {
+				lua_pop(state, 1);
+				return (err);
+			}
+			(void) lua_setfield(state, -2, nvpair_name(pair));
+		}
+	}
+	return (0);
+}
+
+/*
+ * Push a Lua object representing the value of "pair" onto the stack.
+ *
+ * Only understands boolean_value, string, int64, nvlist, string_array,
+ * uint64_array, and int64_array type values. For other types, returns
+ * EINVAL, fills in errbuf, and pushes nothing.
+ */
+static int
+zcp_nvpair_value_to_lua(lua_State *state, nvpair_t *pair,
+    char *errbuf, int errbuf_len)
+{
+	int err = 0;
+
+	if (pair == NULL) {
+		lua_pushnil(state);
+		return (0);
+	}
+
+	switch (nvpair_type(pair)) {
+	case DATA_TYPE_BOOLEAN_VALUE:
+		(void) lua_pushboolean(state,
+		    fnvpair_value_boolean_value(pair));
+		break;
+	case DATA_TYPE_STRING:
+		(void) lua_pushstring(state, fnvpair_value_string(pair));
+		break;
+	case DATA_TYPE_INT64:
+		(void) lua_pushinteger(state, fnvpair_value_int64(pair));
+		break;
+	case DATA_TYPE_NVLIST:
+		err = zcp_nvlist_to_lua(state,
+		    fnvpair_value_nvlist(pair), errbuf, errbuf_len);
+		break;
+	case DATA_TYPE_STRING_ARRAY: {
+		char **strarr;
+		uint_t nelem;
+		(void) nvpair_value_string_array(pair, &strarr, &nelem);
+		lua_newtable(state);
+		for (int i = 0; i < nelem; i++) {
+			(void) lua_pushinteger(state, i + 1);
+			(void) lua_pushstring(state, strarr[i]);
+			(void) lua_settable(state, -3);
+		}
+		break;
+	}
+	case DATA_TYPE_UINT64_ARRAY: {
+		uint64_t *intarr;
+		uint_t nelem;
+		(void) nvpair_value_uint64_array(pair, &intarr, &nelem);
+		lua_newtable(state);
+		for (int i = 0; i < nelem; i++) {
+			(void) lua_pushinteger(state, i + 1);
+			(void) lua_pushinteger(state, intarr[i]);
+			(void) lua_settable(state, -3);
+		}
+		break;
+	}
+	case DATA_TYPE_INT64_ARRAY: {
+		int64_t *intarr;
+		uint_t nelem;
+		(void) nvpair_value_int64_array(pair, &intarr, &nelem);
+		lua_newtable(state);
+		for (int i = 0; i < nelem; i++) {
+			(void) lua_pushinteger(state, i + 1);
+			(void) lua_pushinteger(state, intarr[i]);
+			(void) lua_settable(state, -3);
+		}
+		break;
+	}
+	default: {
+		if (errbuf != NULL) {
+			(void) snprintf(errbuf, errbuf_len,
+			    "Unhandled nvpair type %d for key '%s'",
+			    nvpair_type(pair), nvpair_name(pair));
+		}
+		return (SET_ERROR(EINVAL));
+	}
+	}
+	return (err);
+}
+
+int
+zcp_dataset_hold_error(lua_State *state, dsl_pool_t *dp, const char *dsname,
+    int error)
+{
+	if (error == ENOENT) {
+		(void) zcp_argerror(state, 1, "no such dataset '%s'", dsname);
+		return (0); /* not reached; zcp_argerror will longjmp */
+	} else if (error == EXDEV) {
+		(void) zcp_argerror(state, 1,
+		    "dataset '%s' is not in the target pool '%s'",
+		    dsname, spa_name(dp->dp_spa));
+		return (0); /* not reached; zcp_argerror will longjmp */
+	} else if (error == EIO) {
+		(void) luaL_error(state,
+		    "I/O error while accessing dataset '%s'", dsname);
+		return (0); /* not reached; luaL_error will longjmp */
+	} else if (error != 0) {
+		(void) luaL_error(state,
+		    "unexpected error 
%d while accessing dataset '%s'", + error, dsname); + return (0); /* not reached; luaL_error will longjmp */ + } + return (0); +} + +/* + * Note: will longjmp (via lua_error()) on error. + * Assumes that the dsname is argument #1 (for error reporting purposes). + */ +dsl_dataset_t * +zcp_dataset_hold(lua_State *state, dsl_pool_t *dp, const char *dsname, + void *tag) +{ + dsl_dataset_t *ds; + int error = dsl_dataset_hold(dp, dsname, tag, &ds); + (void) zcp_dataset_hold_error(state, dp, dsname, error); + return (ds); +} + +static int zcp_debug(lua_State *); +static zcp_lib_info_t zcp_debug_info = { + .name = "debug", + .func = zcp_debug, + .pargs = { + { .za_name = "debug string", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_debug(lua_State *state) +{ + const char *dbgstring; + zcp_run_info_t *ri = zcp_run_info(state); + zcp_lib_info_t *libinfo = &zcp_debug_info; + + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + + dbgstring = lua_tostring(state, 1); + + zfs_dbgmsg("txg %lld ZCP: %s", ri->zri_tx->tx_txg, dbgstring); + + return (0); +} + +static int zcp_exists(lua_State *); +static zcp_lib_info_t zcp_exists_info = { + .name = "exists", + .func = zcp_exists, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_exists(lua_State *state) +{ + zcp_run_info_t *ri = zcp_run_info(state); + dsl_pool_t *dp = ri->zri_pool; + zcp_lib_info_t *libinfo = &zcp_exists_info; + + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + + const char *dsname = lua_tostring(state, 1); + + dsl_dataset_t *ds; + int error = dsl_dataset_hold(dp, dsname, FTAG, &ds); + if (error == 0) { + dsl_dataset_rele(ds, FTAG); + lua_pushboolean(state, B_TRUE); + } else if (error == ENOENT) { + lua_pushboolean(state, B_FALSE); + } else if (error == EXDEV) { + return (luaL_error(state, "dataset '%s' is not in the " + "target pool", dsname)); + } else if (error == EIO) { + return (luaL_error(state, "I/O error opening dataset '%s'", + dsname)); + } else if (error != 0) { + return (luaL_error(state, "unexpected error %d", error)); + } + + return (1); +} + +/* + * Allocate/realloc/free a buffer for the lua interpreter. + * + * When nsize is 0, behaves as free() and returns NULL. + * + * If ptr is NULL, behaves as malloc() and returns an allocated buffer of size + * at least nsize. + * + * Otherwise, behaves as realloc(), changing the allocation from osize to nsize. + * Shrinking the buffer size never fails. + * + * The original allocated buffer size is stored as a uint64 at the beginning of + * the buffer to avoid actually reallocating when shrinking a buffer, since lua + * requires that this operation never fail. + */ +static void * +zcp_lua_alloc(void *ud, void *ptr, size_t osize, size_t nsize) +{ + zcp_alloc_arg_t *allocargs = ud; + int flags = (allocargs->aa_must_succeed) ? 
+ KM_SLEEP : (KM_NOSLEEP | KM_NORMALPRI); + + if (nsize == 0) { + if (ptr != NULL) { + int64_t *allocbuf = (int64_t *)ptr - 1; + int64_t allocsize = *allocbuf; + ASSERT3S(allocsize, >, 0); + ASSERT3S(allocargs->aa_alloc_remaining + allocsize, <=, + allocargs->aa_alloc_limit); + allocargs->aa_alloc_remaining += allocsize; + vmem_free(allocbuf, allocsize); + } + return (NULL); + } else if (ptr == NULL) { + int64_t *allocbuf; + int64_t allocsize = nsize + sizeof (int64_t); + + if (!allocargs->aa_must_succeed && + (allocsize <= 0 || + allocsize > allocargs->aa_alloc_remaining)) { + return (NULL); + } + + allocbuf = vmem_alloc(allocsize, flags); + if (allocbuf == NULL) { + return (NULL); + } + allocargs->aa_alloc_remaining -= allocsize; + + *allocbuf = allocsize; + return (allocbuf + 1); + } else if (nsize <= osize) { + /* + * If shrinking the buffer, lua requires that the reallocation + * never fail. + */ + return (ptr); + } else { + ASSERT3U(nsize, >, osize); + + uint64_t *luabuf = zcp_lua_alloc(ud, NULL, 0, nsize); + if (luabuf == NULL) { + return (NULL); + } + (void) memcpy(luabuf, ptr, osize); + VERIFY3P(zcp_lua_alloc(ud, ptr, osize, 0), ==, NULL); + return (luabuf); + } +} + +/* ARGSUSED */ +static void +zcp_lua_counthook(lua_State *state, lua_Debug *ar) +{ + lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); + zcp_run_info_t *ri = lua_touserdata(state, -1); + + /* + * Check if we were canceled while waiting for the + * txg to sync or from our open context thread + */ + if (ri->zri_canceled || + (!ri->zri_sync && issig(JUSTLOOKING) && issig(FORREAL))) { + ri->zri_canceled = B_TRUE; + (void) lua_pushstring(state, "Channel program was canceled."); + (void) lua_error(state); + /* Unreachable */ + } + + /* + * Check how many instructions the channel program has + * executed so far, and compare against the limit. + */ + ri->zri_curinstrs += zfs_lua_check_instrlimit_interval; + if (ri->zri_maxinstrs != 0 && ri->zri_curinstrs > ri->zri_maxinstrs) { + ri->zri_timed_out = B_TRUE; + (void) lua_pushstring(state, + "Channel program timed out."); + (void) lua_error(state); + /* Unreachable */ + } +} + +static int +zcp_panic_cb(lua_State *state) +{ + panic("unprotected error in call to Lua API (%s)\n", + lua_tostring(state, -1)); + return (0); +} + +static void +zcp_eval_impl(dmu_tx_t *tx, zcp_run_info_t *ri) +{ + int err; + lua_State *state = ri->zri_state; + + VERIFY3U(3, ==, lua_gettop(state)); + + /* finish initializing our runtime state */ + ri->zri_pool = dmu_tx_pool(tx); + ri->zri_tx = tx; + list_create(&ri->zri_cleanup_handlers, sizeof (zcp_cleanup_handler_t), + offsetof(zcp_cleanup_handler_t, zch_node)); + + /* + * Store the zcp_run_info_t struct for this run in the Lua registry. + * Registry entries are not directly accessible by the Lua scripts but + * can be accessed by our callbacks. + */ + lua_pushlightuserdata(state, ri); + lua_setfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); + VERIFY3U(3, ==, lua_gettop(state)); + + /* + * Tell the Lua interpreter to call our handler every count + * instructions. Channel programs that execute too many instructions + * should die with ETIME. + */ + (void) lua_sethook(state, zcp_lua_counthook, LUA_MASKCOUNT, + zfs_lua_check_instrlimit_interval); + + /* + * Tell the Lua memory allocator to stop using KM_SLEEP before handing + * off control to the channel program. Channel programs that use too + * much memory should die with ENOSPC. + */ + ri->zri_allocargs->aa_must_succeed = B_FALSE; + + /* + * Call the Lua function that open-context passed us. 
This pops the + * function and its input from the stack and pushes any return + * or error values. + */ + err = lua_pcall(state, 1, LUA_MULTRET, 1); + + /* + * Let Lua use KM_SLEEP while we interpret the return values. + */ + ri->zri_allocargs->aa_must_succeed = B_TRUE; + + /* + * Remove the error handler callback from the stack. At this point, + * there shouldn't be any cleanup handler registered in the handler + * list (zri_cleanup_handlers), regardless of whether it ran or not. + */ + list_destroy(&ri->zri_cleanup_handlers); + lua_remove(state, 1); + + switch (err) { + case LUA_OK: { + /* + * Lua supports returning multiple values in a single return + * statement. Return values will have been pushed onto the + * stack: + * 1: Return value 1 + * 2: Return value 2 + * 3: etc... + * To simplify the process of retrieving a return value from a + * channel program, we disallow returning more than one value + * to ZFS from the Lua script, yielding a singleton return + * nvlist of the form { "return": Return value 1 }. + */ + int return_count = lua_gettop(state); + + if (return_count == 1) { + ri->zri_result = 0; + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_RETURN, &ri->zri_result); + } else if (return_count > 1) { + ri->zri_result = SET_ERROR(ECHRNG); + lua_settop(state, 0); + (void) lua_pushfstring(state, "Multiple return " + "values not supported"); + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); + } + break; + } + case LUA_ERRRUN: + case LUA_ERRGCMM: { + /* + * The channel program encountered a fatal error within the + * script, such as failing an assertion, or calling a function + * with incompatible arguments. The error value and the + * traceback generated by zcp_error_handler() should be on the + * stack. + */ + VERIFY3U(1, ==, lua_gettop(state)); + if (ri->zri_timed_out) { + ri->zri_result = SET_ERROR(ETIME); + } else if (ri->zri_canceled) { + ri->zri_result = SET_ERROR(EINTR); + } else { + ri->zri_result = SET_ERROR(ECHRNG); + } + + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); + + if (ri->zri_result == ETIME && ri->zri_outnvl != NULL) { + (void) nvlist_add_uint64(ri->zri_outnvl, + ZCP_ARG_INSTRLIMIT, ri->zri_curinstrs); + } + break; + } + case LUA_ERRERR: { + /* + * The channel program encountered a fatal error within the + * script, and we encountered another error while trying to + * compute the traceback in zcp_error_handler(). We can only + * return the error message. + */ + VERIFY3U(1, ==, lua_gettop(state)); + if (ri->zri_timed_out) { + ri->zri_result = SET_ERROR(ETIME); + } else if (ri->zri_canceled) { + ri->zri_result = SET_ERROR(EINTR); + } else { + ri->zri_result = SET_ERROR(ECHRNG); + } + + zcp_convert_return_values(state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); + break; + } + case LUA_ERRMEM: + /* + * Lua ran out of memory while running the channel program. + * There's not much we can do. + */ + ri->zri_result = SET_ERROR(ENOSPC); + break; + default: + VERIFY0(err); + } +} + +static void +zcp_pool_error(zcp_run_info_t *ri, const char *poolname) +{ + ri->zri_result = SET_ERROR(ECHRNG); + lua_settop(ri->zri_state, 0); + (void) lua_pushfstring(ri->zri_state, "Could not open pool: %s", + poolname); + zcp_convert_return_values(ri->zri_state, ri->zri_outnvl, + ZCP_RET_ERROR, &ri->zri_result); + +} + +/* + * This callback is called when txg_wait_synced_sig encountered a signal. 
+ * The txg_wait_synced_sig will continue to wait for the txg to complete + * after calling this callback. + */ +/* ARGSUSED */ +static void +zcp_eval_sig(void *arg, dmu_tx_t *tx) +{ + zcp_run_info_t *ri = arg; + + ri->zri_canceled = B_TRUE; +} + +static void +zcp_eval_sync(void *arg, dmu_tx_t *tx) +{ + zcp_run_info_t *ri = arg; + + /* + * Open context should have setup the stack to contain: + * 1: Error handler callback + * 2: Script to run (converted to a Lua function) + * 3: nvlist input to function (converted to Lua table or nil) + */ + VERIFY3U(3, ==, lua_gettop(ri->zri_state)); + + zcp_eval_impl(tx, ri); +} + +static void +zcp_eval_open(zcp_run_info_t *ri, const char *poolname) +{ + int error; + dsl_pool_t *dp; + dmu_tx_t *tx; + + /* + * See comment from the same assertion in zcp_eval_sync(). + */ + VERIFY3U(3, ==, lua_gettop(ri->zri_state)); + + error = dsl_pool_hold(poolname, FTAG, &dp); + if (error != 0) { + zcp_pool_error(ri, poolname); + return; + } + + /* + * As we are running in open-context, we have no transaction associated + * with the channel program. At the same time, functions from the + * zfs.check submodule need to be associated with a transaction as + * they are basically dry-runs of their counterparts in the zfs.sync + * submodule. These functions should be able to run in open-context. + * Therefore we create a new transaction that we later abort once + * the channel program has been evaluated. + */ + tx = dmu_tx_create_dd(dp->dp_mos_dir); + + zcp_eval_impl(tx, ri); + + dmu_tx_abort(tx); + + dsl_pool_rele(dp, FTAG); +} + +int +zcp_eval(const char *poolname, const char *program, boolean_t sync, + uint64_t instrlimit, uint64_t memlimit, nvpair_t *nvarg, nvlist_t *outnvl) +{ + int err; + lua_State *state; + zcp_run_info_t runinfo; + + if (instrlimit > zfs_lua_max_instrlimit) + return (SET_ERROR(EINVAL)); + if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) + return (SET_ERROR(EINVAL)); + + zcp_alloc_arg_t allocargs = { + .aa_must_succeed = B_TRUE, + .aa_alloc_remaining = (int64_t)memlimit, + .aa_alloc_limit = (int64_t)memlimit, + }; + + /* + * Creates a Lua state with a memory allocator that uses KM_SLEEP. + * This should never fail. + */ + state = lua_newstate(zcp_lua_alloc, &allocargs); + VERIFY(state != NULL); + (void) lua_atpanic(state, zcp_panic_cb); + + /* + * Load core Lua libraries we want access to. + */ + VERIFY3U(1, ==, luaopen_base(state)); + lua_pop(state, 1); + VERIFY3U(1, ==, luaopen_coroutine(state)); + lua_setglobal(state, LUA_COLIBNAME); + VERIFY0(lua_gettop(state)); + VERIFY3U(1, ==, luaopen_string(state)); + lua_setglobal(state, LUA_STRLIBNAME); + VERIFY0(lua_gettop(state)); + VERIFY3U(1, ==, luaopen_table(state)); + lua_setglobal(state, LUA_TABLIBNAME); + VERIFY0(lua_gettop(state)); + + /* + * Load globally visible variables such as errno aliases. + */ + zcp_load_globals(state); + VERIFY0(lua_gettop(state)); + + /* + * Load ZFS-specific modules. 
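+	 * These provide the zfs.list, zfs.check, zfs.sync, zfs.get_prop,
+	 * zfs.debug and zfs.exists interfaces. A script can then, for
+	 * example (illustrative), destroy every snapshot of a filesystem
+	 * in a single txg:
+	 *
+	 *     for snap in zfs.list.snapshots("rpool/fs") do
+	 *         zfs.sync.destroy(snap)
+	 *     end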
+ */ + lua_newtable(state); + VERIFY3U(1, ==, zcp_load_list_lib(state)); + lua_setfield(state, -2, "list"); + VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_FALSE)); + lua_setfield(state, -2, "check"); + VERIFY3U(1, ==, zcp_load_synctask_lib(state, B_TRUE)); + lua_setfield(state, -2, "sync"); + VERIFY3U(1, ==, zcp_load_get_lib(state)); + lua_pushcclosure(state, zcp_debug_info.func, 0); + lua_setfield(state, -2, zcp_debug_info.name); + lua_pushcclosure(state, zcp_exists_info.func, 0); + lua_setfield(state, -2, zcp_exists_info.name); + lua_setglobal(state, "zfs"); + VERIFY0(lua_gettop(state)); + + /* + * Push the error-callback that calculates Lua stack traces on + * unexpected failures. + */ + lua_pushcfunction(state, zcp_error_handler); + VERIFY3U(1, ==, lua_gettop(state)); + + /* + * Load the actual script as a function onto the stack as text ("t"). + * The only valid error condition is a syntax error in the script. + * ERRMEM should not be possible because our allocator is using + * KM_SLEEP. ERRGCMM should not be possible because we have not added + * any objects with __gc metamethods to the interpreter that could + * fail. + */ + err = luaL_loadbufferx(state, program, strlen(program), + "channel program", "t"); + if (err == LUA_ERRSYNTAX) { + fnvlist_add_string(outnvl, ZCP_RET_ERROR, + lua_tostring(state, -1)); + lua_close(state); + return (SET_ERROR(EINVAL)); + } + VERIFY0(err); + VERIFY3U(2, ==, lua_gettop(state)); + + /* + * Convert the input nvlist to a Lua object and put it on top of the + * stack. + */ + char errmsg[128]; + err = zcp_nvpair_value_to_lua(state, nvarg, + errmsg, sizeof (errmsg)); + if (err != 0) { + fnvlist_add_string(outnvl, ZCP_RET_ERROR, errmsg); + lua_close(state); + return (SET_ERROR(EINVAL)); + } + VERIFY3U(3, ==, lua_gettop(state)); + + runinfo.zri_state = state; + runinfo.zri_allocargs = &allocargs; + runinfo.zri_outnvl = outnvl; + runinfo.zri_result = 0; + runinfo.zri_cred = CRED(); + runinfo.zri_proc = curproc; + runinfo.zri_timed_out = B_FALSE; + runinfo.zri_canceled = B_FALSE; + runinfo.zri_sync = sync; + runinfo.zri_space_used = 0; + runinfo.zri_curinstrs = 0; + runinfo.zri_maxinstrs = instrlimit; + runinfo.zri_new_zvols = fnvlist_alloc(); + + if (sync) { + err = dsl_sync_task_sig(poolname, NULL, zcp_eval_sync, + zcp_eval_sig, &runinfo, 0, ZFS_SPACE_CHECK_ZCP_EVAL); + if (err != 0) + zcp_pool_error(&runinfo, poolname); + } else { + zcp_eval_open(&runinfo, poolname); + } + lua_close(state); + + /* + * Create device minor nodes for any new zvols. + */ + for (nvpair_t *pair = nvlist_next_nvpair(runinfo.zri_new_zvols, NULL); + pair != NULL; + pair = nvlist_next_nvpair(runinfo.zri_new_zvols, pair)) { + zvol_create_minor(nvpair_name(pair)); + } + fnvlist_free(runinfo.zri_new_zvols); + + return (runinfo.zri_result); +} + +/* + * Retrieve metadata about the currently running channel program. + */ +zcp_run_info_t * +zcp_run_info(lua_State *state) +{ + zcp_run_info_t *ri; + + lua_getfield(state, LUA_REGISTRYINDEX, ZCP_RUN_INFO_KEY); + ri = lua_touserdata(state, -1); + lua_pop(state, 1); + return (ri); +} + +/* + * Argument Parsing + * ================ + * + * The Lua language allows methods to be called with any number + * of arguments of any type. When calling back into ZFS we need to sanitize + * arguments from channel programs to make sure unexpected arguments or + * arguments of the wrong type result in clear error messages. To do this + * in a uniform way all callbacks from channel programs should use the + * zcp_parse_args() function to interpret inputs. 
+ *
+ * Positional vs Keyword Arguments
+ * ===============================
+ *
+ * Every callback function takes a fixed set of required positional arguments
+ * and optional keyword arguments. For example, the destroy function takes
+ * a single positional string argument (the name of the dataset to destroy)
+ * and an optional "defer" keyword boolean argument. When calling lua functions
+ * with parentheses, only positional arguments can be used:
+ *
+ *     zfs.sync.snapshot("rpool@snap")
+ *
+ * To use keyword arguments, functions should be called with a single argument
+ * that is a lua table containing mappings of integer -> positional arguments
+ * and string -> keyword arguments:
+ *
+ *     zfs.sync.snapshot({[1]="rpool@snap", defer=true})
+ *
+ * The lua language allows curly braces to be used in place of parentheses as
+ * syntactic sugar for this calling convention:
+ *
+ *     zfs.sync.snapshot{"rpool@snap", defer=true}
+ */
+
+/*
+ * Throw an error and print the given arguments. If there are too many
+ * arguments to fit in the output buffer, only the error format string is
+ * output.
+ */
+static void
+zcp_args_error(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+    const zcp_arg_t *kwargs, const char *fmt, ...)
+{
+	int i;
+	char errmsg[512];
+	size_t len = sizeof (errmsg);
+	size_t msglen = 0;
+	va_list argp;
+
+	va_start(argp, fmt);
+	VERIFY3U(len, >, vsnprintf(errmsg, len, fmt, argp));
+	va_end(argp);
+
+	/*
+	 * Calculate the total length of the final string, including extra
+	 * formatting characters. If the argument dump would be too large,
+	 * only print the error string.
+	 */
+	msglen = strlen(errmsg);
+	msglen += strlen(fname) + 4; /* : + {} + null terminator */
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		msglen += strlen(pargs[i].za_name);
+		msglen += strlen(lua_typename(state, pargs[i].za_lua_type));
+		if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL)
+			msglen += 5; /* < + ( + )> + , */
+		else
+			msglen += 4; /* < + ( + )> */
+	}
+	for (i = 0; kwargs[i].za_name != NULL; i++) {
+		msglen += strlen(kwargs[i].za_name);
+		msglen += strlen(lua_typename(state, kwargs[i].za_lua_type));
+		if (kwargs[i + 1].za_name != NULL)
+			msglen += 4; /* =( + ) + , */
+		else
+			msglen += 3; /* =( + ) */
+	}
+
+	if (msglen >= len)
+		(void) luaL_error(state, errmsg);
+
+	VERIFY3U(len, >, strlcat(errmsg, ": ", len));
+	VERIFY3U(len, >, strlcat(errmsg, fname, len));
+	VERIFY3U(len, >, strlcat(errmsg, "{", len));
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		VERIFY3U(len, >, strlcat(errmsg, "<", len));
+		VERIFY3U(len, >, strlcat(errmsg, pargs[i].za_name, len));
+		VERIFY3U(len, >, strlcat(errmsg, "(", len));
+		VERIFY3U(len, >, strlcat(errmsg,
+		    lua_typename(state, pargs[i].za_lua_type), len));
+		VERIFY3U(len, >, strlcat(errmsg, ")>", len));
+		if (pargs[i + 1].za_name != NULL || kwargs[0].za_name != NULL) {
+			VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+		}
+	}
+	for (i = 0; kwargs[i].za_name != NULL; i++) {
+		VERIFY3U(len, >, strlcat(errmsg, kwargs[i].za_name, len));
+		VERIFY3U(len, >, strlcat(errmsg, "=(", len));
+		VERIFY3U(len, >, strlcat(errmsg,
+		    lua_typename(state, kwargs[i].za_lua_type), len));
+		VERIFY3U(len, >, strlcat(errmsg, ")", len));
+		if (kwargs[i + 1].za_name != NULL) {
+			VERIFY3U(len, >, strlcat(errmsg, ", ", len));
+		}
+	}
+	VERIFY3U(len, >, strlcat(errmsg, "}", len));
+
+	(void) luaL_error(state, errmsg);
+	panic("unreachable code");
+}
+
+static void
+zcp_parse_table_args(lua_State *state, const char *fname,
+    const zcp_arg_t *pargs, const zcp_arg_t 
*kwargs)
+{
+	int i;
+	int type;
+
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		/*
+		 * Check the table for this positional argument, leaving it
+		 * on the top of the stack once we finish validating it.
+		 */
+		lua_pushinteger(state, i + 1);
+		lua_gettable(state, 1);
+
+		type = lua_type(state, -1);
+		if (type == LUA_TNIL) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "too few arguments");
+			panic("unreachable code");
+		} else if (type != pargs[i].za_lua_type) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "arg %d wrong type (is '%s', expected '%s')",
+			    i + 1, lua_typename(state, type),
+			    lua_typename(state, pargs[i].za_lua_type));
+			panic("unreachable code");
+		}
+
+		/*
+		 * Remove the positional argument from the table.
+		 */
+		lua_pushinteger(state, i + 1);
+		lua_pushnil(state);
+		lua_settable(state, 1);
+	}
+
+	for (i = 0; kwargs[i].za_name != NULL; i++) {
+		/*
+		 * Check the table for this keyword argument, which may be
+		 * nil if it was omitted. Leave the value on the top of
+		 * the stack after validating it.
+		 */
+		lua_getfield(state, 1, kwargs[i].za_name);
+
+		type = lua_type(state, -1);
+		if (type != LUA_TNIL && type != kwargs[i].za_lua_type) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "kwarg '%s' wrong type (is '%s', expected '%s')",
+			    kwargs[i].za_name, lua_typename(state, type),
+			    lua_typename(state, kwargs[i].za_lua_type));
+			panic("unreachable code");
+		}
+
+		/*
+		 * Remove the keyword argument from the table.
+		 */
+		lua_pushnil(state);
+		lua_setfield(state, 1, kwargs[i].za_name);
+	}
+
+	/*
+	 * Any entries remaining in the table are invalid inputs; print
+	 * an error message based on what the entry is.
+	 */
+	lua_pushnil(state);
+	if (lua_next(state, 1)) {
+		if (lua_isnumber(state, -2) && lua_tointeger(state, -2) > 0) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "too many positional arguments");
+		} else if (lua_isstring(state, -2)) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "invalid kwarg '%s'", lua_tostring(state, -2));
+		} else {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "kwarg keys must be strings");
+		}
+		panic("unreachable code");
+	}
+
+	lua_remove(state, 1);
+}
+
+static void
+zcp_parse_pos_args(lua_State *state, const char *fname, const zcp_arg_t *pargs,
+    const zcp_arg_t *kwargs)
+{
+	int i;
+	int type;
+
+	for (i = 0; pargs[i].za_name != NULL; i++) {
+		type = lua_type(state, i + 1);
+		if (type == LUA_TNONE) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "too few arguments");
+			panic("unreachable code");
+		} else if (type != pargs[i].za_lua_type) {
+			zcp_args_error(state, fname, pargs, kwargs,
+			    "arg %d wrong type (is '%s', expected '%s')",
+			    i + 1, lua_typename(state, type),
+			    lua_typename(state, pargs[i].za_lua_type));
+			panic("unreachable code");
+		}
+	}
+	if (lua_gettop(state) != i) {
+		zcp_args_error(state, fname, pargs, kwargs,
+		    "too many positional arguments");
+		panic("unreachable code");
+	}
+
+	for (i = 0; kwargs[i].za_name != NULL; i++) {
+		lua_pushnil(state);
+	}
+}
+
+/*
+ * Checks the current Lua stack against an expected set of positional and
+ * keyword arguments. If the stack does not match the expected arguments,
+ * the current channel program is aborted with a useful error message;
+ * otherwise the stack is re-arranged so that it contains the positional
+ * arguments followed by the keyword argument values in declaration order.
+ * Any missing keyword argument will be represented by a nil value on the
+ * stack.
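+ *
+ * For example (illustrative), with pargs declaring one string argument
+ * "dataset" and kwargs declaring one boolean argument "defer", the call
+ *
+ *     zfs.sync.destroy{"rpool/tmp", defer=true}
+ *
+ * leaves "rpool/tmp" at stack slot 1 and true at stack slot 2.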
+ * + * If the stack contains exactly one argument of type LUA_TTABLE the curly + * braces calling convention is assumed, otherwise the stack is parsed for + * positional arguments only. + * + * This function should be used by every function callback. It should be called + * before the callback manipulates the Lua stack as it assumes the stack + * represents the function arguments. + */ +void +zcp_parse_args(lua_State *state, const char *fname, const zcp_arg_t *pargs, + const zcp_arg_t *kwargs) +{ + if (lua_gettop(state) == 1 && lua_istable(state, 1)) { + zcp_parse_table_args(state, fname, pargs, kwargs); + } else { + zcp_parse_pos_args(state, fname, pargs, kwargs); + } +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_instrlimit, ULONG, ZMOD_RW, + "Max instruction limit that can be specified for a channel program"); + +ZFS_MODULE_PARAM(zfs_lua, zfs_lua_, max_memlimit, ULONG, ZMOD_RW, + "Max memory limit that can be specified for a channel program"); +/* END CSTYLED */ diff --git a/module/zfs/zcp_get.c b/module/zfs/zcp_get.c new file mode 100644 index 000000000000..7256e4de1915 --- /dev/null +++ b/module/zfs/zcp_get.c @@ -0,0 +1,813 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef _KERNEL +#include +#include +#endif + +static int +get_objset_type(dsl_dataset_t *ds, zfs_type_t *type) +{ + int error; + objset_t *os; + error = dmu_objset_from_ds(ds, &os); + if (error != 0) + return (error); + if (ds->ds_is_snapshot) { + *type = ZFS_TYPE_SNAPSHOT; + } else { + switch (os->os_phys->os_type) { + case DMU_OST_ZFS: + *type = ZFS_TYPE_FILESYSTEM; + break; + case DMU_OST_ZVOL: + *type = ZFS_TYPE_VOLUME; + break; + default: + return (EINVAL); + } + } + return (0); +} + +/* + * Returns the string name of ds's type in str (a buffer which should be + * at least 12 bytes long). + */ +static int +get_objset_type_name(dsl_dataset_t *ds, char *str) +{ + int error; + zfs_type_t type; + error = get_objset_type(ds, &type); + if (error != 0) + return (error); + switch (type) { + case ZFS_TYPE_SNAPSHOT: + (void) strlcpy(str, "snapshot", ZAP_MAXVALUELEN); + break; + case ZFS_TYPE_FILESYSTEM: + (void) strlcpy(str, "filesystem", ZAP_MAXVALUELEN); + break; + case ZFS_TYPE_VOLUME: + (void) strlcpy(str, "volume", ZAP_MAXVALUELEN); + break; + default: + return (EINVAL); + } + return (0); +} + +/* + * Determines the source of a property given its setpoint and + * property type. It pushes the source to the lua stack. 
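+ * For read-only properties (and the version property) no meaningful source
+ * exists, so nil is pushed; an empty setpoint is reported as the string
+ * "default"; otherwise the setpoint itself (the name of the dataset the
+ * value comes from) is pushed.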
+ */
+static void
+get_prop_src(lua_State *state, const char *setpoint, zfs_prop_t prop)
+{
+	if (zfs_prop_readonly(prop) || (prop == ZFS_PROP_VERSION)) {
+		lua_pushnil(state);
+	} else {
+		const char *src;
+		if (strcmp("", setpoint) == 0) {
+			src = "default";
+		} else {
+			src = setpoint;
+		}
+		(void) lua_pushstring(state, src);
+	}
+}
+
+/*
+ * Given an error encountered while getting properties, either longjmps for
+ * a fatal error or pushes nothing to the stack for a non-fatal one.
+ */
+static int
+zcp_handle_error(lua_State *state, const char *dataset_name,
+    const char *property_name, int error)
+{
+	ASSERT3S(error, !=, 0);
+	if (error == ENOENT) {
+		return (0);
+	} else if (error == EINVAL) {
+		return (luaL_error(state,
+		    "property '%s' is not a valid property on dataset '%s'",
+		    property_name, dataset_name));
+	} else if (error == EIO) {
+		return (luaL_error(state,
+		    "I/O error while retrieving property '%s' on dataset '%s'",
+		    property_name, dataset_name));
+	} else {
+		return (luaL_error(state, "unexpected error %d while "
+		    "retrieving property '%s' on dataset '%s'",
+		    error, property_name, dataset_name));
+	}
+}
+
+/*
+ * Look up a user-defined property in the zap object. If it exists, push it
+ * and the setpoint onto the stack, otherwise don't push anything.
+ */
+static int
+zcp_get_user_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+    const char *property_name)
+{
+	int error;
+	char *buf;
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+	/*
+	 * zcp_dataset_hold will either successfully return the requested
+	 * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+	 * without returning.
+	 */
+	dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG);
+	if (ds == NULL)
+		return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+	buf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+	error = dsl_prop_get_ds(ds, property_name, 1, ZAP_MAXVALUELEN,
+	    buf, setpoint);
+	dsl_dataset_rele(ds, FTAG);
+
+	if (error != 0) {
+		kmem_free(buf, ZAP_MAXVALUELEN);
+		return (zcp_handle_error(state, dataset_name, property_name,
+		    error));
+	}
+	(void) lua_pushstring(state, buf);
+	(void) lua_pushstring(state, setpoint);
+	kmem_free(buf, ZAP_MAXVALUELEN);
+	return (2);
+}
+
+/*
+ * Check if the property we're looking for is stored in the ds_dir. If so,
+ * return it in the 'val' argument. Returns 0 on success, or ENOENT if
+ * the property is not present.
+ */
+static int
+get_dsl_dir_prop(dsl_dataset_t *ds, zfs_prop_t zfs_prop,
+    uint64_t *val)
+{
+	dsl_dir_t *dd = ds->ds_dir;
+	mutex_enter(&dd->dd_lock);
+	switch (zfs_prop) {
+	case ZFS_PROP_USEDSNAP:
+		*val = dsl_dir_get_usedsnap(dd);
+		break;
+	case ZFS_PROP_USEDCHILD:
+		*val = dsl_dir_get_usedchild(dd);
+		break;
+	case ZFS_PROP_USEDDS:
+		*val = dsl_dir_get_usedds(dd);
+		break;
+	case ZFS_PROP_USEDREFRESERV:
+		*val = dsl_dir_get_usedrefreserv(dd);
+		break;
+	case ZFS_PROP_LOGICALUSED:
+		*val = dsl_dir_get_logicalused(dd);
+		break;
+	default:
+		mutex_exit(&dd->dd_lock);
+		return (SET_ERROR(ENOENT));
+	}
+	mutex_exit(&dd->dd_lock);
+	return (0);
+}
+
+/*
+ * Check if the property we're looking for is stored at the dsl_dataset or
+ * dsl_dir level. If so, push the property value and source onto the lua stack
+ * and return 0. If it is not present or a failure occurs in lookup, return a
+ * non-zero error value.
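+ *
+ * For example (illustrative), zfs.get_prop("rpool/fs", "used") is answered
+ * here from the dsl_dataset/dsl_dir accounting, without consulting the ZAP
+ * object.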
+ */ +static int +get_special_prop(lua_State *state, dsl_dataset_t *ds, const char *dsname, + zfs_prop_t zfs_prop) +{ + int error = 0; + objset_t *os; + uint64_t numval = 0; + char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + char setpoint[ZFS_MAX_DATASET_NAME_LEN] = + "Internal error - setpoint not determined"; + zfs_type_t ds_type; + zprop_type_t prop_type = zfs_prop_get_type(zfs_prop); + (void) get_objset_type(ds, &ds_type); + + switch (zfs_prop) { + case ZFS_PROP_REFRATIO: + numval = dsl_get_refratio(ds); + break; + case ZFS_PROP_USED: + numval = dsl_get_used(ds); + break; + case ZFS_PROP_CLONES: { + nvlist_t *clones = fnvlist_alloc(); + error = get_clones_stat_impl(ds, clones); + if (error == 0) { + /* push list to lua stack */ + VERIFY0(zcp_nvlist_to_lua(state, clones, NULL, 0ULL)); + /* source */ + (void) lua_pushnil(state); + } + nvlist_free(clones); + kmem_free(strval, ZAP_MAXVALUELEN); + return (error); + } + case ZFS_PROP_COMPRESSRATIO: + numval = dsl_get_compressratio(ds); + break; + case ZFS_PROP_CREATION: + numval = dsl_get_creation(ds); + break; + case ZFS_PROP_REFERENCED: + numval = dsl_get_referenced(ds); + break; + case ZFS_PROP_AVAILABLE: + numval = dsl_get_available(ds); + break; + case ZFS_PROP_LOGICALREFERENCED: + numval = dsl_get_logicalreferenced(ds); + break; + case ZFS_PROP_CREATETXG: + numval = dsl_get_creationtxg(ds); + break; + case ZFS_PROP_GUID: + numval = dsl_get_guid(ds); + break; + case ZFS_PROP_UNIQUE: + numval = dsl_get_unique(ds); + break; + case ZFS_PROP_OBJSETID: + numval = dsl_get_objsetid(ds); + break; + case ZFS_PROP_ORIGIN: + dsl_dir_get_origin(ds->ds_dir, strval); + break; + case ZFS_PROP_USERACCOUNTING: + error = dmu_objset_from_ds(ds, &os); + if (error == 0) + numval = dmu_objset_userspace_present(os); + break; + case ZFS_PROP_WRITTEN: + error = dsl_get_written(ds, &numval); + break; + case ZFS_PROP_TYPE: + error = get_objset_type_name(ds, strval); + break; + case ZFS_PROP_PREV_SNAP: + error = dsl_get_prev_snap(ds, strval); + break; + case ZFS_PROP_NAME: + dsl_dataset_name(ds, strval); + break; + case ZFS_PROP_MOUNTPOINT: + error = dsl_get_mountpoint(ds, dsname, strval, setpoint); + break; + case ZFS_PROP_VERSION: + /* should be a snapshot or filesystem */ + ASSERT(ds_type != ZFS_TYPE_VOLUME); + error = dmu_objset_from_ds(ds, &os); + /* look in the master node for the version */ + if (error == 0) { + error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + sizeof (numval), 1, &numval); + } + break; + case ZFS_PROP_DEFER_DESTROY: + numval = dsl_get_defer_destroy(ds); + break; + case ZFS_PROP_USERREFS: + numval = dsl_get_userrefs(ds); + break; + case ZFS_PROP_FILESYSTEM_COUNT: + error = dsl_dir_get_filesystem_count(ds->ds_dir, &numval); + (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); + break; + case ZFS_PROP_SNAPSHOT_COUNT: + error = dsl_dir_get_snapshot_count(ds->ds_dir, &numval); + (void) strlcpy(setpoint, "", ZFS_MAX_DATASET_NAME_LEN); + break; + case ZFS_PROP_NUMCLONES: + numval = dsl_get_numclones(ds); + break; + case ZFS_PROP_INCONSISTENT: + numval = dsl_get_inconsistent(ds); + break; + case ZFS_PROP_IVSET_GUID: + if (dsl_dataset_is_zapified(ds)) { + error = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_object, DS_FIELD_IVSET_GUID, + sizeof (numval), 1, &numval); + } else { + error = ENOENT; + } + break; + case ZFS_PROP_RECEIVE_RESUME_TOKEN: { + char *token = get_receive_resume_stats_impl(ds); + + (void) strlcpy(strval, token, ZAP_MAXVALUELEN); + if (strcmp(strval, "") == 0) { + char *childval = 
get_child_receive_stats(ds); + + (void) strlcpy(strval, childval, ZAP_MAXVALUELEN); + if (strcmp(strval, "") == 0) + error = ENOENT; + + kmem_strfree(childval); + } + kmem_strfree(token); + break; + } + case ZFS_PROP_VOLSIZE: + ASSERT(ds_type == ZFS_TYPE_VOLUME || + ds_type == ZFS_TYPE_SNAPSHOT); + error = dmu_objset_from_ds(ds, &os); + if (error == 0) { + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", + sizeof (numval), 1, &numval); + } + if (error == 0) + (void) strlcpy(setpoint, dsname, + ZFS_MAX_DATASET_NAME_LEN); + + break; + case ZFS_PROP_VOLBLOCKSIZE: { + ASSERT(ds_type == ZFS_TYPE_VOLUME); + dmu_object_info_t doi; + error = dmu_objset_from_ds(ds, &os); + if (error == 0) { + error = dmu_object_info(os, ZVOL_OBJ, &doi); + if (error == 0) + numval = doi.doi_data_block_size; + } + break; + } + + case ZFS_PROP_KEYSTATUS: + case ZFS_PROP_KEYFORMAT: { + /* provide defaults in case no crypto obj exists */ + setpoint[0] = '\0'; + if (zfs_prop == ZFS_PROP_KEYSTATUS) + numval = ZFS_KEYSTATUS_NONE; + else + numval = ZFS_KEYFORMAT_NONE; + + nvlist_t *nvl, *propval; + nvl = fnvlist_alloc(); + dsl_dataset_crypt_stats(ds, nvl); + if (nvlist_lookup_nvlist(nvl, zfs_prop_to_name(zfs_prop), + &propval) == 0) { + char *source; + + (void) nvlist_lookup_uint64(propval, ZPROP_VALUE, + &numval); + if (nvlist_lookup_string(propval, ZPROP_SOURCE, + &source) == 0) + strlcpy(setpoint, source, sizeof (setpoint)); + } + nvlist_free(nvl); + break; + } + + default: + /* Did not match these props, check in the dsl_dir */ + error = get_dsl_dir_prop(ds, zfs_prop, &numval); + } + if (error != 0) { + kmem_free(strval, ZAP_MAXVALUELEN); + return (error); + } + + switch (prop_type) { + case PROP_TYPE_NUMBER: { + (void) lua_pushnumber(state, numval); + break; + } + case PROP_TYPE_STRING: { + (void) lua_pushstring(state, strval); + break; + } + case PROP_TYPE_INDEX: { + const char *propval; + error = zfs_prop_index_to_string(zfs_prop, numval, &propval); + if (error != 0) { + kmem_free(strval, ZAP_MAXVALUELEN); + return (error); + } + (void) lua_pushstring(state, propval); + break; + } + } + kmem_free(strval, ZAP_MAXVALUELEN); + + /* Push the source to the stack */ + get_prop_src(state, setpoint, zfs_prop); + return (0); +} + +/* + * Look up a property and its source in the zap object. If the value is + * present and successfully retrieved, push the value and source on the + * lua stack and return 0. On failure, return a non-zero error value. 
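+ *
+ * For example (illustrative), if compression=on was set on "rpool", then
+ * looking up compression for a child "rpool/fs" pushes the index string
+ * "on" with "rpool" as its source.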
+ */
+static int
+get_zap_prop(lua_State *state, dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+	int error = 0;
+	char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+	char *strval = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP);
+	uint64_t numval;
+	const char *prop_name = zfs_prop_to_name(zfs_prop);
+	zprop_type_t prop_type = zfs_prop_get_type(zfs_prop);
+
+	if (prop_type == PROP_TYPE_STRING) {
+		/* Push value to lua stack */
+		error = dsl_prop_get_ds(ds, prop_name, 1,
+		    ZAP_MAXVALUELEN, strval, setpoint);
+		if (error == 0)
+			(void) lua_pushstring(state, strval);
+	} else {
+		error = dsl_prop_get_ds(ds, prop_name, sizeof (numval),
+		    1, &numval, setpoint);
+
+#ifdef _KERNEL
+		/* Fill in temporary value for prop, if applicable */
+		(void) zfs_get_temporary_prop(ds, zfs_prop, &numval, setpoint);
+#else
+		/* Free the value buffer before luaL_error() longjmps out */
+		kmem_free(strval, ZAP_MAXVALUELEN);
+		return (luaL_error(state,
+		    "temporary properties only supported in kernel mode"));
+#endif
+		/* Push value to lua stack */
+		if (prop_type == PROP_TYPE_INDEX) {
+			const char *propval;
+			error = zfs_prop_index_to_string(zfs_prop, numval,
+			    &propval);
+			if (error == 0)
+				(void) lua_pushstring(state, propval);
+		} else {
+			if (error == 0)
+				(void) lua_pushnumber(state, numval);
+		}
+	}
+	kmem_free(strval, ZAP_MAXVALUELEN);
+	if (error == 0)
+		get_prop_src(state, setpoint, zfs_prop);
+	return (error);
+}
+
+/*
+ * Determine whether property is valid for a given dataset
+ */
+boolean_t
+prop_valid_for_ds(dsl_dataset_t *ds, zfs_prop_t zfs_prop)
+{
+	int error;
+	zfs_type_t zfs_type;
+
+	/* properties not supported */
+	if ((zfs_prop == ZFS_PROP_ISCSIOPTIONS) ||
+	    (zfs_prop == ZFS_PROP_MOUNTED))
+		return (B_FALSE);
+
+	/* if we want the origin prop, ds must be a clone */
+	if ((zfs_prop == ZFS_PROP_ORIGIN) && (!dsl_dir_is_clone(ds->ds_dir)))
+		return (B_FALSE);
+
+	error = get_objset_type(ds, &zfs_type);
+	if (error != 0)
+		return (B_FALSE);
+	return (zfs_prop_valid_for_type(zfs_prop, zfs_type, B_FALSE));
+}
+
+/*
+ * Look up a given dataset property. On success return 2, the number of
+ * values pushed to the lua stack (property value and source). On a fatal
+ * error, longjmp. On a non-fatal error push nothing.
+ */
+static int
+zcp_get_system_prop(lua_State *state, dsl_pool_t *dp, const char *dataset_name,
+    zfs_prop_t zfs_prop)
+{
+	int error;
+	/*
+	 * zcp_dataset_hold will either successfully return the requested
+	 * dataset or throw a lua error and longjmp out of the zfs.get_prop call
+	 * without returning.
+ */ + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + /* Check that the property is valid for the given dataset */ + const char *prop_name = zfs_prop_to_name(zfs_prop); + if (!prop_valid_for_ds(ds, zfs_prop)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + /* Check if the property can be accessed directly */ + error = get_special_prop(state, ds, dataset_name, zfs_prop); + if (error == 0) { + dsl_dataset_rele(ds, FTAG); + /* The value and source have been pushed by get_special_prop */ + return (2); + } + if (error != ENOENT) { + dsl_dataset_rele(ds, FTAG); + return (zcp_handle_error(state, dataset_name, + prop_name, error)); + } + + /* If we were unable to find it, look in the zap object */ + error = get_zap_prop(state, ds, zfs_prop); + dsl_dataset_rele(ds, FTAG); + if (error != 0) { + return (zcp_handle_error(state, dataset_name, + prop_name, error)); + } + /* The value and source have been pushed by get_zap_prop */ + return (2); +} + +#ifdef _KERNEL +static zfs_userquota_prop_t +get_userquota_prop(const char *prop_name) +{ + zfs_userquota_prop_t type; + /* Figure out the property type ({user|group}{quota|used}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(prop_name, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 0) + break; + } + return (type); +} + +/* + * Given the name of a zfs_userquota_prop, this function determines the + * prop type as well as the numeric group/user ids based on the string + * following the '@' in the property name. On success, returns 0. On failure, + * returns a non-zero error. + * 'domain' must be free'd by caller using kmem_strfree() + */ +static int +parse_userquota_prop(const char *prop_name, zfs_userquota_prop_t *type, + char **domain, uint64_t *rid) +{ + char *cp, *end, *domain_val; + + *type = get_userquota_prop(prop_name); + if (*type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + *rid = 0; + cp = strchr(prop_name, '@') + 1; + if (strncmp(cp, "S-1-", 4) == 0) { + /* + * It's a numeric SID (eg "S-1-234-567-89") and we want to + * separate the domain id and the rid + */ + int domain_len = strrchr(cp, '-') - cp; + domain_val = kmem_alloc(domain_len + 1, KM_SLEEP); + (void) strncpy(domain_val, cp, domain_len); + domain_val[domain_len] = '\0'; + cp += domain_len + 1; + + (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); + if (*end != '\0') { + kmem_strfree(domain_val); + return (EINVAL); + } + } else { + /* It's only a user/group ID (eg "12345"), just get the rid */ + domain_val = NULL; + (void) ddi_strtoll(cp, &end, 10, (longlong_t *)rid); + if (*end != '\0') + return (EINVAL); + } + *domain = domain_val; + return (0); +} + +/* + * Look up {user|group}{quota|used} property for given dataset. On success + * push the value (quota or used amount) and the setpoint. On failure, push + * a lua error. 
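+ *
+ * For example (illustrative), "userused@S-1-123-456-789" parses to the
+ * userused type with domain "S-1-123-456" and rid 789, while
+ * "userquota@1000" has no domain and rid 1000.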
+ */ +static int +zcp_get_userquota_prop(lua_State *state, dsl_pool_t *dp, + const char *dataset_name, const char *prop_name) +{ + zfsvfs_t *zfvp; + zfsvfs_t *zfsvfs; + int error; + zfs_userquota_prop_t type; + char *domain; + uint64_t rid, value = 0; + objset_t *os; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + error = parse_userquota_prop(prop_name, &type, &domain, &rid); + if (error == 0) { + error = dmu_objset_from_ds(ds, &os); + if (error == 0) { + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create_impl(&zfvp, zfsvfs, os); + if (error == 0) { + error = zfs_userspace_one(zfvp, type, domain, + rid, &value); + zfsvfs_free(zfvp); + } + } + if (domain != NULL) + kmem_strfree(domain); + } + dsl_dataset_rele(ds, FTAG); + + if ((value == 0) && ((type == ZFS_PROP_USERQUOTA) || + (type == ZFS_PROP_GROUPQUOTA))) + error = SET_ERROR(ENOENT); + if (error != 0) { + return (zcp_handle_error(state, dataset_name, + prop_name, error)); + } + + (void) lua_pushnumber(state, value); + (void) lua_pushstring(state, dataset_name); + return (2); +} +#endif + +/* + * Determines the name of the snapshot referenced in the written property + * name. Returns snapshot name in snap_name, a buffer that must be at least + * as large as ZFS_MAX_DATASET_NAME_LEN + */ +static void +parse_written_prop(const char *dataset_name, const char *prop_name, + char *snap_name) +{ + ASSERT(zfs_prop_written(prop_name)); + const char *name = prop_name + ZFS_WRITTEN_PROP_PREFIX_LEN; + if (strchr(name, '@') == NULL) { + (void) snprintf(snap_name, ZFS_MAX_DATASET_NAME_LEN, "%s@%s", + dataset_name, name); + } else { + (void) strlcpy(snap_name, name, ZFS_MAX_DATASET_NAME_LEN); + } +} + +/* + * Look up written@ property for given dataset. On success + * push the value and the setpoint. If error is fatal, we will + * longjmp, otherwise push nothing. 
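+ *
+ * For example (illustrative), requesting "written@snap1" on dataset
+ * "rpool/fs" reports the space written to "rpool/fs" since the snapshot
+ * "rpool/fs@snap1".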
+ */ +static int +zcp_get_written_prop(lua_State *state, dsl_pool_t *dp, + const char *dataset_name, const char *prop_name) +{ + char snap_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t used, comp, uncomp; + dsl_dataset_t *old; + int error = 0; + + parse_written_prop(dataset_name, prop_name, snap_name); + dsl_dataset_t *new = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (new == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + error = dsl_dataset_hold(dp, snap_name, FTAG, &old); + if (error != 0) { + dsl_dataset_rele(new, FTAG); + return (zcp_dataset_hold_error(state, dp, snap_name, + error)); + } + error = dsl_dataset_space_written(old, new, + &used, &comp, &uncomp); + + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + + if (error != 0) { + return (zcp_handle_error(state, dataset_name, + snap_name, error)); + } + (void) lua_pushnumber(state, used); + (void) lua_pushstring(state, dataset_name); + return (2); +} + +static int zcp_get_prop(lua_State *state); +static zcp_lib_info_t zcp_get_prop_info = { + .name = "get_prop", + .func = zcp_get_prop, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + { .za_name = "property", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_get_prop(lua_State *state) +{ + const char *dataset_name; + const char *property_name; + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + zcp_lib_info_t *libinfo = &zcp_get_prop_info; + + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + + dataset_name = lua_tostring(state, 1); + property_name = lua_tostring(state, 2); + + /* User defined property */ + if (zfs_prop_user(property_name)) { + return (zcp_get_user_prop(state, dp, + dataset_name, property_name)); + } + /* userspace property */ + if (zfs_prop_userquota(property_name)) { +#ifdef _KERNEL + return (zcp_get_userquota_prop(state, dp, + dataset_name, property_name)); +#else + return (luaL_error(state, + "user quota properties only supported in kernel mode", + property_name)); +#endif + } + /* written@ property */ + if (zfs_prop_written(property_name)) { + return (zcp_get_written_prop(state, dp, + dataset_name, property_name)); + } + + zfs_prop_t zfs_prop = zfs_name_to_prop(property_name); + /* Valid system property */ + if (zfs_prop != ZPROP_INVAL) { + return (zcp_get_system_prop(state, dp, dataset_name, + zfs_prop)); + } + + /* Invalid property name */ + return (luaL_error(state, + "'%s' is not a valid property", property_name)); +} + +int +zcp_load_get_lib(lua_State *state) +{ + lua_pushcclosure(state, zcp_get_prop_info.func, 0); + lua_setfield(state, -2, zcp_get_prop_info.name); + + return (1); +} diff --git a/module/zfs/zcp_global.c b/module/zfs/zcp_global.c new file mode 100644 index 000000000000..8e166e0736d6 --- /dev/null +++ b/module/zfs/zcp_global.c @@ -0,0 +1,89 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2017 by Delphix. All rights reserved. 
+ */ + +#include + +#include +#include + +typedef struct zcp_errno_global { + const char *zeg_name; + int zeg_errno; +} zcp_errno_global_t; + +static const zcp_errno_global_t errno_globals[] = { + {"EPERM", EPERM}, + {"ENOENT", ENOENT}, + {"ESRCH", ESRCH}, + {"EINTR", EINTR}, + {"EIO", EIO}, + {"ENXIO", ENXIO}, + {"E2BIG", E2BIG}, + {"ENOEXEC", ENOEXEC}, + {"EBADF", EBADF}, + {"ECHILD", ECHILD}, + {"EAGAIN", EAGAIN}, + {"ENOMEM", ENOMEM}, + {"EACCES", EACCES}, + {"EFAULT", EFAULT}, + {"ENOTBLK", ENOTBLK}, + {"EBUSY", EBUSY}, + {"EEXIST", EEXIST}, + {"EXDEV", EXDEV}, + {"ENODEV", ENODEV}, + {"ENOTDIR", ENOTDIR}, + {"EISDIR", EISDIR}, + {"EINVAL", EINVAL}, + {"ENFILE", ENFILE}, + {"EMFILE", EMFILE}, + {"ENOTTY", ENOTTY}, + {"ETXTBSY", ETXTBSY}, + {"EFBIG", EFBIG}, + {"ENOSPC", ENOSPC}, + {"ESPIPE", ESPIPE}, + {"EROFS", EROFS}, + {"EMLINK", EMLINK}, + {"EPIPE", EPIPE}, + {"EDOM", EDOM}, + {"ERANGE", ERANGE}, + {"EDEADLK", EDEADLK}, + {"ENOLCK", ENOLCK}, + {"ECANCELED", ECANCELED}, + {"ENOTSUP", ENOTSUP}, + {"EDQUOT", EDQUOT}, + {"ENAMETOOLONG", ENAMETOOLONG}, + {0, 0} +}; + +static void +zcp_load_errno_globals(lua_State *state) +{ + const zcp_errno_global_t *global = errno_globals; + while (global->zeg_name != NULL) { + lua_pushnumber(state, (lua_Number)global->zeg_errno); + lua_setglobal(state, global->zeg_name); + global++; + } +} + +void +zcp_load_globals(lua_State *state) +{ + zcp_load_errno_globals(state); +} diff --git a/module/zfs/zcp_iter.c b/module/zfs/zcp_iter.c new file mode 100644 index 000000000000..f727c56f212d --- /dev/null +++ b/module/zfs/zcp_iter.c @@ -0,0 +1,751 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016, 2018 by Delphix. All rights reserved. 
+ */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "zfs_comutil.h" + +typedef int (zcp_list_func_t)(lua_State *); +typedef struct zcp_list_info { + const char *name; + zcp_list_func_t *func; + zcp_list_func_t *gc; + const zcp_arg_t pargs[4]; + const zcp_arg_t kwargs[2]; +} zcp_list_info_t; + +static int +zcp_clones_iter(lua_State *state) +{ + int err; + char clonename[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds, *clone; + zap_attribute_t za; + zap_cursor_t zc; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + if (dsl_dataset_phys(ds)->ds_next_clones_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + zap_cursor_init_serialized(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj, cursor); + dsl_dataset_rele(ds, FTAG); + + err = zap_cursor_retrieve(&zc, &za); + if (err != 0) { + zap_cursor_fini(&zc); + if (err != ENOENT) { + return (luaL_error(state, + "unexpected error %d from zap_cursor_retrieve()", + err)); + } + return (0); + } + zap_cursor_advance(&zc); + cursor = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + + err = dsl_dataset_hold_obj(dp, za.za_first_integer, FTAG, &clone); + if (err != 0) { + return (luaL_error(state, + "unexpected error %d from " + "dsl_dataset_hold_obj(za_first_integer)", err)); + } + + dsl_dir_name(clone->ds_dir, clonename); + dsl_dataset_rele(clone, FTAG); + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, clonename); + return (1); +} + +static int zcp_clones_list(lua_State *); +static zcp_list_info_t zcp_clones_list_info = { + .name = "clones", + .func = zcp_clones_list, + .gc = NULL, + .pargs = { + { .za_name = "snapshot", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_clones_list(lua_State *state) +{ + const char *snapname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + + /* + * zcp_dataset_hold will either successfully return the requested + * dataset or throw a lua error and longjmp out of the zfs.list.clones + * call without returning. 
+ */ + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + boolean_t issnap = ds->ds_is_snapshot; + uint64_t cursor = 0; + uint64_t dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + if (!issnap) { + return (zcp_argerror(state, 1, "%s is not a snapshot", + snapname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, cursor); + lua_pushcclosure(state, &zcp_clones_iter, 2); + return (1); +} + +static int +zcp_snapshots_iter(lua_State *state) +{ + int err; + char snapname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds; + objset_t *os; + char *p; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + dsl_dataset_name(ds, snapname); + VERIFY3U(sizeof (snapname), >, + strlcat(snapname, "@", sizeof (snapname))); + + p = strchr(snapname, '\0'); + VERIFY0(dmu_objset_from_ds(ds, &os)); + err = dmu_snapshot_list_next(os, + sizeof (snapname) - (p - snapname), p, NULL, &cursor, NULL); + dsl_dataset_rele(ds, FTAG); + + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dmu_snapshot_list_next()", err)); + } + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, snapname); + return (1); +} + +static int zcp_snapshots_list(lua_State *); +static zcp_list_info_t zcp_snapshots_list_info = { + .name = "snapshots", + .func = zcp_snapshots_list, + .gc = NULL, + .pargs = { + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_snapshots_list(lua_State *state) +{ + const char *fsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + boolean_t issnap; + uint64_t dsobj; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + issnap = ds->ds_is_snapshot; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + if (issnap) { + return (zcp_argerror(state, 1, + "argument %s cannot be a snapshot", fsname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, 0); + lua_pushcclosure(state, &zcp_snapshots_iter, 2); + return (1); +} + +static int +zcp_children_iter(lua_State *state) +{ + int err; + char childname[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + zcp_run_info_t *ri = zcp_run_info(state); + dsl_pool_t *dp = ri->zri_pool; + dsl_dataset_t *ds; + objset_t *os; + char *p; + + err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + dsl_dataset_name(ds, childname); + VERIFY3U(sizeof (childname), >, + strlcat(childname, "/", sizeof (childname))); + p = strchr(childname, '\0'); + + VERIFY0(dmu_objset_from_ds(ds, &os)); + do { + err = dmu_dir_list_next(os, + sizeof (childname) - (p - childname), p, NULL, &cursor); + } while (err == 0 && zfs_dataset_name_hidden(childname)); + dsl_dataset_rele(ds, FTAG); + + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return 
(luaL_error(state, + "unexpected error %d from dmu_dir_list_next()", + err)); + } + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, childname); + return (1); +} + +static int zcp_children_list(lua_State *); +static zcp_list_info_t zcp_children_list_info = { + .name = "children", + .func = zcp_children_list, + .gc = NULL, + .pargs = { + { .za_name = "filesystem | volume", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_children_list(lua_State *state) +{ + const char *fsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + boolean_t issnap; + uint64_t dsobj; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, fsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + issnap = ds->ds_is_snapshot; + dsobj = ds->ds_object; + dsl_dataset_rele(ds, FTAG); + + if (issnap) { + return (zcp_argerror(state, 1, + "argument %s cannot be a snapshot", fsname)); + } + + lua_pushnumber(state, dsobj); + lua_pushnumber(state, 0); + lua_pushcclosure(state, &zcp_children_iter, 2); + return (1); +} + +static int +zcp_user_props_list_gc(lua_State *state) +{ + nvlist_t **props = lua_touserdata(state, 1); + if (*props != NULL) + fnvlist_free(*props); + return (0); +} + +static int +zcp_user_props_iter(lua_State *state) +{ + char *source, *val; + nvlist_t *nvprop; + nvlist_t **props = lua_touserdata(state, lua_upvalueindex(1)); + nvpair_t *pair = lua_touserdata(state, lua_upvalueindex(2)); + + do { + pair = nvlist_next_nvpair(*props, pair); + if (pair == NULL) { + fnvlist_free(*props); + *props = NULL; + return (0); + } + } while (!zfs_prop_user(nvpair_name(pair))); + + lua_pushlightuserdata(state, pair); + lua_replace(state, lua_upvalueindex(2)); + + nvprop = fnvpair_value_nvlist(pair); + val = fnvlist_lookup_string(nvprop, ZPROP_VALUE); + source = fnvlist_lookup_string(nvprop, ZPROP_SOURCE); + + (void) lua_pushstring(state, nvpair_name(pair)); + (void) lua_pushstring(state, val); + (void) lua_pushstring(state, source); + return (3); +} + +static int zcp_user_props_list(lua_State *); +static zcp_list_info_t zcp_user_props_list_info = { + .name = "user_properties", + .func = zcp_user_props_list, + .gc = zcp_user_props_list_gc, + .pargs = { + { .za_name = "filesystem | snapshot | volume", + .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +/* + * 'properties' was the initial name for 'user_properties' seen + * above. 'user_properties' is a better name as it distinguishes + * these properties from 'system_properties' which are different. + * In order to avoid breaking compatibility between different + * versions of ZFS, we declare 'properties' as an alias for + * 'user_properties'. 
+ */ +static zcp_list_info_t zcp_props_list_info = { + .name = "properties", + .func = zcp_user_props_list, + .gc = zcp_user_props_list_gc, + .pargs = { + { .za_name = "filesystem | snapshot | volume", + .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_user_props_list(lua_State *state) +{ + const char *dsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + objset_t *os; + nvlist_t **props = lua_newuserdata(state, sizeof (nvlist_t *)); + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + VERIFY0(dmu_objset_from_ds(ds, &os)); + VERIFY0(dsl_prop_get_all(os, props)); + dsl_dataset_rele(ds, FTAG); + + /* + * Set the metatable for the properties list to free it on + * completion. + */ + luaL_getmetatable(state, zcp_user_props_list_info.name); + (void) lua_setmetatable(state, -2); + + lua_pushlightuserdata(state, NULL); + lua_pushcclosure(state, &zcp_user_props_iter, 2); + return (1); +} + + +/* + * Populate nv with all valid system properties and their values for the given + * dataset. + */ +static void +zcp_dataset_system_props(dsl_dataset_t *ds, nvlist_t *nv) +{ + for (int prop = ZFS_PROP_TYPE; prop < ZFS_NUM_PROPS; prop++) { + /* Do not display hidden props */ + if (!zfs_prop_visible(prop)) + continue; + /* Do not display props not valid for this dataset */ + if (!prop_valid_for_ds(ds, prop)) + continue; + fnvlist_add_boolean(nv, zfs_prop_to_name(prop)); + } +} + +static int zcp_system_props_list(lua_State *); +static zcp_list_info_t zcp_system_props_list_info = { + .name = "system_properties", + .func = zcp_system_props_list, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +/* + * Get a list of all visible system properties and their values for a given + * dataset. Returned on the stack as a Lua table. 
+ */ +static int +zcp_system_props_list(lua_State *state) +{ + int error; + char errbuf[128]; + const char *dataset_name; + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + zcp_list_info_t *libinfo = &zcp_system_props_list_info; + zcp_parse_args(state, libinfo->name, libinfo->pargs, libinfo->kwargs); + dataset_name = lua_tostring(state, 1); + nvlist_t *nv = fnvlist_alloc(); + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dataset_name, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + /* Get the names of all valid system properties for this dataset */ + zcp_dataset_system_props(ds, nv); + dsl_dataset_rele(ds, FTAG); + + /* push list as lua table */ + error = zcp_nvlist_to_lua(state, nv, errbuf, sizeof (errbuf)); + nvlist_free(nv); + if (error != 0) { + return (luaL_error(state, + "Error returning nvlist: %s", errbuf)); + } + return (1); +} + +static int +zcp_bookmarks_iter(lua_State *state) +{ + char ds_name[ZFS_MAX_DATASET_NAME_LEN]; + char bookmark_name[ZFS_MAX_DATASET_NAME_LEN]; + uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1)); + uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2)); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + dsl_dataset_t *ds; + zap_attribute_t za; + zap_cursor_t zc; + + int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + if (err == ENOENT) { + return (0); + } else if (err != 0) { + return (luaL_error(state, + "unexpected error %d from dsl_dataset_hold_obj(dsobj)", + err)); + } + + if (!dsl_dataset_is_zapified(ds)) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + err = zap_lookup(dp->dp_meta_objset, ds->ds_object, + DS_FIELD_BOOKMARK_NAMES, sizeof (ds->ds_bookmarks_obj), 1, + &ds->ds_bookmarks_obj); + if (err != 0 && err != ENOENT) { + dsl_dataset_rele(ds, FTAG); + return (luaL_error(state, + "unexpected error %d from zap_lookup()", err)); + } + if (ds->ds_bookmarks_obj == 0) { + dsl_dataset_rele(ds, FTAG); + return (0); + } + + /* Store the dataset's name so we can append the bookmark's name */ + dsl_dataset_name(ds, ds_name); + + zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_bookmarks_obj, cursor); + dsl_dataset_rele(ds, FTAG); + + err = zap_cursor_retrieve(&zc, &za); + if (err != 0) { + zap_cursor_fini(&zc); + if (err != ENOENT) { + return (luaL_error(state, + "unexpected error %d from zap_cursor_retrieve()", + err)); + } + return (0); + } + zap_cursor_advance(&zc); + cursor = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + + /* Create the full "pool/fs#bookmark" string to return */ + int n = snprintf(bookmark_name, ZFS_MAX_DATASET_NAME_LEN, "%s#%s", + ds_name, za.za_name); + if (n >= ZFS_MAX_DATASET_NAME_LEN) { + return (luaL_error(state, + "unexpected error %d from snprintf()", ENAMETOOLONG)); + } + + lua_pushnumber(state, cursor); + lua_replace(state, lua_upvalueindex(2)); + + (void) lua_pushstring(state, bookmark_name); + return (1); +} + +static int zcp_bookmarks_list(lua_State *); +static zcp_list_info_t zcp_bookmarks_list_info = { + .name = "bookmarks", + .func = zcp_bookmarks_list, + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + } +}; + +static int +zcp_bookmarks_list(lua_State *state) +{ + const char *dsname = lua_tostring(state, 1); + dsl_pool_t *dp = zcp_run_info(state)->zri_pool; + + dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG); + if (ds == NULL) + return (1); /* not reached; zcp_dataset_hold() longjmp'd */ + + boolean_t issnap = ds->ds_is_snapshot; + uint64_t 
dsobj = ds->ds_object;
+	uint64_t cursor = 0;
+	dsl_dataset_rele(ds, FTAG);
+
+	if (issnap) {
+		return (zcp_argerror(state, 1, "%s is a snapshot", dsname));
+	}
+
+	lua_pushnumber(state, dsobj);
+	lua_pushnumber(state, cursor);
+	lua_pushcclosure(state, &zcp_bookmarks_iter, 2);
+	return (1);
+}
+
+static int
+zcp_holds_iter(lua_State *state)
+{
+	uint64_t dsobj = lua_tonumber(state, lua_upvalueindex(1));
+	uint64_t cursor = lua_tonumber(state, lua_upvalueindex(2));
+	dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+	dsl_dataset_t *ds;
+	zap_attribute_t za;
+	zap_cursor_t zc;
+
+	int err = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds);
+	if (err == ENOENT) {
+		return (0);
+	} else if (err != 0) {
+		return (luaL_error(state,
+		    "unexpected error %d from dsl_dataset_hold_obj(dsobj)",
+		    err));
+	}
+
+	if (dsl_dataset_phys(ds)->ds_userrefs_obj == 0) {
+		dsl_dataset_rele(ds, FTAG);
+		return (0);
+	}
+
+	zap_cursor_init_serialized(&zc, ds->ds_dir->dd_pool->dp_meta_objset,
+	    dsl_dataset_phys(ds)->ds_userrefs_obj, cursor);
+	dsl_dataset_rele(ds, FTAG);
+
+	err = zap_cursor_retrieve(&zc, &za);
+	if (err != 0) {
+		zap_cursor_fini(&zc);
+		if (err != ENOENT) {
+			return (luaL_error(state,
+			    "unexpected error %d from zap_cursor_retrieve()",
+			    err));
+		}
+		return (0);
+	}
+	zap_cursor_advance(&zc);
+	cursor = zap_cursor_serialize(&zc);
+	zap_cursor_fini(&zc);
+
+	lua_pushnumber(state, cursor);
+	lua_replace(state, lua_upvalueindex(2));
+
+	(void) lua_pushstring(state, za.za_name);
+	(void) lua_pushnumber(state, za.za_first_integer);
+	return (2);
+}
+
+static int zcp_holds_list(lua_State *);
+static zcp_list_info_t zcp_holds_list_info = {
+	.name = "holds",
+	.func = zcp_holds_list,
+	.gc = NULL,
+	.pargs = {
+	    { .za_name = "snapshot", .za_lua_type = LUA_TSTRING},
+	    {NULL, 0}
+	},
+	.kwargs = {
+	    {NULL, 0}
+	}
+};
+
+/*
+ * Iterate over all the holds for a given dataset. Each iteration returns
+ * a hold's tag and its timestamp as an integer.
+ */
+static int
+zcp_holds_list(lua_State *state)
+{
+	const char *snapname = lua_tostring(state, 1);
+	dsl_pool_t *dp = zcp_run_info(state)->zri_pool;
+
+	dsl_dataset_t *ds = zcp_dataset_hold(state, dp, snapname, FTAG);
+	if (ds == NULL)
+		return (1); /* not reached; zcp_dataset_hold() longjmp'd */
+
+	boolean_t issnap = ds->ds_is_snapshot;
+	uint64_t dsobj = ds->ds_object;
+	uint64_t cursor = 0;
+	dsl_dataset_rele(ds, FTAG);
+
+	if (!issnap) {
+		return (zcp_argerror(state, 1, "%s is not a snapshot",
+		    snapname));
+	}
+
+	lua_pushnumber(state, dsobj);
+	lua_pushnumber(state, cursor);
+	lua_pushcclosure(state, &zcp_holds_iter, 2);
+	return (1);
+}
+
+static int
+zcp_list_func(lua_State *state)
+{
+	zcp_list_info_t *info = lua_touserdata(state, lua_upvalueindex(1));
+
+	zcp_parse_args(state, info->name, info->pargs, info->kwargs);
+
+	return (info->func(state));
+}
+
+int
+zcp_load_list_lib(lua_State *state)
+{
+	int i;
+	zcp_list_info_t *zcp_list_funcs[] = {
+		&zcp_children_list_info,
+		&zcp_snapshots_list_info,
+		&zcp_user_props_list_info,
+		&zcp_props_list_info,
+		&zcp_clones_list_info,
+		&zcp_system_props_list_info,
+		&zcp_bookmarks_list_info,
+		&zcp_holds_list_info,
+		NULL
+	};
+
+	lua_newtable(state);
+
+	for (i = 0; zcp_list_funcs[i] != NULL; i++) {
+		zcp_list_info_t *info = zcp_list_funcs[i];
+
+		if (info->gc != NULL) {
+			/*
+			 * If the function requires garbage collection, create
+			 * a metatable with its name and register the __gc
+			 * function.
+			 */
+			(void) luaL_newmetatable(state, info->name);
+			(void) lua_pushstring(state, "__gc");
+			lua_pushcfunction(state, info->gc);
+			lua_settable(state, -3);
+			lua_pop(state, 1);
+		}
+
+		lua_pushlightuserdata(state, info);
+		lua_pushcclosure(state, &zcp_list_func, 1);
+		lua_setfield(state, -2, info->name);
+		info++;
+	}
+
+	return (1);
+}
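[Editor's note] zcp_load_list_lib() registers each generator above under the zfs.list table of a channel program's Lua environment. As an illustrative sketch only (the dataset name "tank/home" is hypothetical, and this assumes the usual 'zfs program <pool> <script>' invocation), a script could combine these iterators like so:

    -- enumerate a filesystem's snapshots, then the holds on each one;
    -- each zfs.list.holds step yields two values: the tag and its timestamp
    for snap in zfs.list.snapshots("tank/home") do
        for tag, ts in zfs.list.holds(snap) do
            zfs.debug(snap .. " held by " .. tag .. " since " .. ts)
        end
    end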
diff --git a/module/zfs/zcp_set.c b/module/zfs/zcp_set.c
new file mode 100644
index 000000000000..cebb56a5f181
--- /dev/null
+++ b/module/zfs/zcp_set.c
@@ -0,0 +1,100 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+
+static void
+zcp_set_user_prop(lua_State *state, dsl_pool_t *dp, const char *dsname,
+    const char *prop_name, const char *prop_val, dmu_tx_t *tx)
+{
+	dsl_dataset_t *ds = zcp_dataset_hold(state, dp, dsname, FTAG);
+	if (ds == NULL)
+		return; /* not reached; zcp_dataset_hold() longjmp'd */
+
+	nvlist_t *nvl = fnvlist_alloc();
+	fnvlist_add_string(nvl, prop_name, prop_val);
+
+	dsl_props_set_sync_impl(ds, ZPROP_SRC_LOCAL, nvl, tx);
+
+	fnvlist_free(nvl);
+	dsl_dataset_rele(ds, FTAG);
+}
+
+int
+zcp_set_prop_check(void *arg, dmu_tx_t *tx)
+{
+	zcp_set_prop_arg_t *args = arg;
+	const char *prop_name = args->prop;
+	dsl_props_set_arg_t dpsa = {
+		.dpsa_dsname = args->dsname,
+		.dpsa_source = ZPROP_SRC_LOCAL,
+	};
+	nvlist_t *nvl = NULL;
+	int ret = 0;
+
+	/*
+	 * Only user properties are currently supported. When non-user
+	 * properties are supported, we will want to use
+	 * zfs_valid_proplist() to verify the properties.
+	 */
+	if (!zfs_prop_user(prop_name)) {
+		return (EINVAL);
+	}
+
+	nvl = fnvlist_alloc();
+	fnvlist_add_string(nvl, args->prop, args->val);
+	dpsa.dpsa_props = nvl;
+
+	ret = dsl_props_set_check(&dpsa, tx);
+	nvlist_free(nvl);
+
+	return (ret);
+}
+
+void
+zcp_set_prop_sync(void *arg, dmu_tx_t *tx)
+{
+	zcp_set_prop_arg_t *args = arg;
+	zcp_run_info_t *ri = zcp_run_info(args->state);
+	dsl_pool_t *dp = ri->zri_pool;
+
+	const char *dsname = args->dsname;
+	const char *prop_name = args->prop;
+	const char *prop_val = args->val;
+
+	if (zfs_prop_user(prop_name)) {
+		zcp_set_user_prop(args->state, dp, dsname, prop_name,
+		    prop_val, tx);
+	}
+}
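[Editor's note] zcp_set_prop_check() and zcp_set_prop_sync() back the zfs.sync.set_prop call of a channel program, and as the check function shows, only user properties (names containing a colon) are accepted today. A hedged sketch of the Lua-side usage (the dataset and property names are made up; the returned error number corresponds to the errno globals such as EINVAL registered by zcp_global.c above):

    -- set a user property, then read it back together with its setpoint;
    -- zfs.sync.set_prop returns 0 on success or an errno on failure
    local err = zfs.sync.set_prop("tank/home", "com.example:backup", "weekly")
    if err ~= 0 then
        error("set_prop failed with errno " .. err)
    end
    local val, setpoint = zfs.get_prop("tank/home", "com.example:backup")
    zfs.debug("com.example:backup=" .. val .. " (set at " .. setpoint .. ")")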
diff --git a/module/zfs/zcp_synctask.c b/module/zfs/zcp_synctask.c
new file mode 100644
index 000000000000..4e0fa0d85cbf
--- /dev/null
+++ b/module/zfs/zcp_synctask.c
@@ -0,0 +1,544 @@
+/*
+ * CDDL HEADER START
+ *
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2016, 2017 by Delphix. All rights reserved.
+ * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved.
+ * Copyright 2020 Joyent, Inc.
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define	DST_AVG_BLKSHIFT 14
+
+typedef struct zcp_inherit_prop_arg {
+	lua_State *zipa_state;
+	const char *zipa_prop;
+	dsl_props_set_arg_t zipa_dpsa;
+} zcp_inherit_prop_arg_t;
+
+typedef int (zcp_synctask_func_t)(lua_State *, boolean_t, nvlist_t *);
+typedef struct zcp_synctask_info {
+	const char *name;
+	zcp_synctask_func_t *func;
+	const zcp_arg_t pargs[4];
+	const zcp_arg_t kwargs[2];
+	zfs_space_check_t space_check;
+	int blocks_modified;
+} zcp_synctask_info_t;
+
+/*
+ * Generic synctask interface for channel program syncfuncs.
+ *
+ * To perform some action in syncing context, we'd generally call
+ * dsl_sync_task(), but since the Lua script is already running inside a
+ * synctask we need to leave out some actions (such as acquiring the config
+ * rwlock and performing space checks).
+ *
+ * If 'sync' is false, executes a dry run and returns the error code.
+ *
+ * If we are not running in syncing context and we are not doing a dry run
+ * (meaning we are running a zfs.sync function in open-context) then we
+ * return a Lua error.
+ *
+ * This function also handles common fatal error cases for channel program
+ * library functions. If a fatal error occurs, err_dsname will be the dataset
+ * name reported in error messages, if supplied.
+ */
+static int
+zcp_sync_task(lua_State *state, dsl_checkfunc_t *checkfunc,
+    dsl_syncfunc_t *syncfunc, void *arg, boolean_t sync, const char *err_dsname)
+{
+	int err;
+	zcp_run_info_t *ri = zcp_run_info(state);
+
+	err = checkfunc(arg, ri->zri_tx);
+	if (!sync)
+		return (err);
+
+	if (!ri->zri_sync) {
+		return (luaL_error(state, "running functions from the zfs.sync "
+		    "submodule requires passing sync=TRUE to "
+		    "lzc_channel_program() (i.e.
do not specify the \"-n\" " + "command line argument)")); + } + + if (err == 0) { + syncfunc(arg, ri->zri_tx); + } else if (err == EIO) { + if (err_dsname != NULL) { + return (luaL_error(state, + "I/O error while accessing dataset '%s'", + err_dsname)); + } else { + return (luaL_error(state, + "I/O error while accessing dataset.")); + } + } + + return (err); +} + + +static int zcp_synctask_destroy(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_destroy_info = { + .name = "destroy", + .func = zcp_synctask_destroy, + .pargs = { + {.za_name = "filesystem | snapshot", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {.za_name = "defer", .za_lua_type = LUA_TBOOLEAN}, + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_DESTROY, + .blocks_modified = 0 +}; + +/* ARGSUSED */ +static int +zcp_synctask_destroy(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + const char *dsname = lua_tostring(state, 1); + + boolean_t issnap = (strchr(dsname, '@') != NULL); + + if (!issnap && !lua_isnil(state, 2)) { + return (luaL_error(state, + "'deferred' kwarg only supported for snapshots: %s", + dsname)); + } + + if (issnap) { + dsl_destroy_snapshot_arg_t ddsa = { 0 }; + ddsa.ddsa_name = dsname; + if (!lua_isnil(state, 2)) { + ddsa.ddsa_defer = lua_toboolean(state, 2); + } else { + ddsa.ddsa_defer = B_FALSE; + } + + err = zcp_sync_task(state, dsl_destroy_snapshot_check, + dsl_destroy_snapshot_sync, &ddsa, sync, dsname); + } else { + dsl_destroy_head_arg_t ddha = { 0 }; + ddha.ddha_name = dsname; + + err = zcp_sync_task(state, dsl_destroy_head_check, + dsl_destroy_head_sync, &ddha, sync, dsname); + } + + return (err); +} + +static int zcp_synctask_promote(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_promote_info = { + .name = "promote", + .func = zcp_synctask_promote, + .pargs = { + {.za_name = "clone", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 3 +}; + +static int +zcp_synctask_promote(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + dsl_dataset_promote_arg_t ddpa = { 0 }; + const char *dsname = lua_tostring(state, 1); + zcp_run_info_t *ri = zcp_run_info(state); + + ddpa.ddpa_clonename = dsname; + ddpa.err_ds = err_details; + ddpa.cr = ri->zri_cred; + ddpa.proc = ri->zri_proc; + + /* + * If there was a snapshot name conflict, then err_ds will be filled + * with a list of conflicting snapshot names. 
+ */ + err = zcp_sync_task(state, dsl_dataset_promote_check, + dsl_dataset_promote_sync, &ddpa, sync, dsname); + + return (err); +} + +static int zcp_synctask_rollback(lua_State *, boolean_t, nvlist_t *err_details); +static zcp_synctask_info_t zcp_synctask_rollback_info = { + .name = "rollback", + .func = zcp_synctask_rollback, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 1, + .pargs = { + {.za_name = "filesystem", .za_lua_type = LUA_TSTRING}, + {0, 0} + }, + .kwargs = { + {0, 0} + } +}; + +static int +zcp_synctask_rollback(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + const char *dsname = lua_tostring(state, 1); + dsl_dataset_rollback_arg_t ddra = { 0 }; + + ddra.ddra_fsname = dsname; + ddra.ddra_result = err_details; + + err = zcp_sync_task(state, dsl_dataset_rollback_check, + dsl_dataset_rollback_sync, &ddra, sync, dsname); + + return (err); +} + +static int zcp_synctask_snapshot(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_snapshot_info = { + .name = "snapshot", + .func = zcp_synctask_snapshot, + .pargs = { + {.za_name = "filesystem@snapname | volume@snapname", + .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_NORMAL, + .blocks_modified = 3 +}; + +/* ARGSUSED */ +static int +zcp_synctask_snapshot(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + dsl_dataset_snapshot_arg_t ddsa = { 0 }; + const char *dsname = lua_tostring(state, 1); + zcp_run_info_t *ri = zcp_run_info(state); + + /* + * On old pools, the ZIL must not be active when a snapshot is created, + * but we can't suspend the ZIL because we're already in syncing + * context. + */ + if (spa_version(ri->zri_pool->dp_spa) < SPA_VERSION_FAST_SNAP) { + return (SET_ERROR(ENOTSUP)); + } + + /* + * We only allow for a single snapshot rather than a list, so the + * error list output is unnecessary. + */ + ddsa.ddsa_errors = NULL; + ddsa.ddsa_props = NULL; + ddsa.ddsa_cr = ri->zri_cred; + ddsa.ddsa_proc = ri->zri_proc; + ddsa.ddsa_snaps = fnvlist_alloc(); + fnvlist_add_boolean(ddsa.ddsa_snaps, dsname); + + zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, + (zcp_cleanup_t *)&fnvlist_free, ddsa.ddsa_snaps); + + err = zcp_sync_task(state, dsl_dataset_snapshot_check, + dsl_dataset_snapshot_sync, &ddsa, sync, dsname); + + if (err == 0) { + /* + * We may need to create a new device minor node for this + * dataset (if it is a zvol and the "snapdev" property is set). + * Save it in the nvlist so that it can be processed in open + * context. 
+ */ + fnvlist_add_boolean(ri->zri_new_zvols, dsname); + } + + zcp_deregister_cleanup(state, zch); + fnvlist_free(ddsa.ddsa_snaps); + + return (err); +} + +static int zcp_synctask_inherit_prop(lua_State *, boolean_t, + nvlist_t *err_details); +static zcp_synctask_info_t zcp_synctask_inherit_prop_info = { + .name = "inherit", + .func = zcp_synctask_inherit_prop, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 2, /* 2 * numprops */ + .pargs = { + { .za_name = "dataset", .za_lua_type = LUA_TSTRING }, + { .za_name = "property", .za_lua_type = LUA_TSTRING }, + { NULL, 0 } + }, + .kwargs = { + { NULL, 0 } + }, +}; + +static int +zcp_synctask_inherit_prop_check(void *arg, dmu_tx_t *tx) +{ + zcp_inherit_prop_arg_t *args = arg; + zfs_prop_t prop = zfs_name_to_prop(args->zipa_prop); + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(args->zipa_prop)) + return (0); + + return (EINVAL); + } + + if (zfs_prop_readonly(prop)) + return (EINVAL); + + if (!zfs_prop_inheritable(prop)) + return (EINVAL); + + return (dsl_props_set_check(&args->zipa_dpsa, tx)); +} + +static void +zcp_synctask_inherit_prop_sync(void *arg, dmu_tx_t *tx) +{ + zcp_inherit_prop_arg_t *args = arg; + dsl_props_set_arg_t *dpsa = &args->zipa_dpsa; + + dsl_props_set_sync(dpsa, tx); +} + +static int +zcp_synctask_inherit_prop(lua_State *state, boolean_t sync, + nvlist_t *err_details) +{ + int err; + zcp_inherit_prop_arg_t zipa = { 0 }; + dsl_props_set_arg_t *dpsa = &zipa.zipa_dpsa; + + const char *dsname = lua_tostring(state, 1); + const char *prop = lua_tostring(state, 2); + + zipa.zipa_state = state; + zipa.zipa_prop = prop; + dpsa->dpsa_dsname = dsname; + dpsa->dpsa_source = ZPROP_SRC_INHERITED; + dpsa->dpsa_props = fnvlist_alloc(); + fnvlist_add_boolean(dpsa->dpsa_props, prop); + + zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, + (zcp_cleanup_t *)&fnvlist_free, dpsa->dpsa_props); + + err = zcp_sync_task(state, zcp_synctask_inherit_prop_check, + zcp_synctask_inherit_prop_sync, &zipa, sync, dsname); + + zcp_deregister_cleanup(state, zch); + fnvlist_free(dpsa->dpsa_props); + + return (err); +} + +static int zcp_synctask_bookmark(lua_State *, boolean_t, nvlist_t *); +static zcp_synctask_info_t zcp_synctask_bookmark_info = { + .name = "bookmark", + .func = zcp_synctask_bookmark, + .pargs = { + {.za_name = "snapshot | bookmark", .za_lua_type = LUA_TSTRING}, + {.za_name = "bookmark", .za_lua_type = LUA_TSTRING}, + {NULL, 0} + }, + .kwargs = { + {NULL, 0} + }, + .space_check = ZFS_SPACE_CHECK_NORMAL, + .blocks_modified = 1, +}; + +/* ARGSUSED */ +static int +zcp_synctask_bookmark(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + const char *source = lua_tostring(state, 1); + const char *new = lua_tostring(state, 2); + + nvlist_t *bmarks = fnvlist_alloc(); + fnvlist_add_string(bmarks, new, source); + + zcp_cleanup_handler_t *zch = zcp_register_cleanup(state, + (zcp_cleanup_t *)&fnvlist_free, bmarks); + + dsl_bookmark_create_arg_t dbca = { + .dbca_bmarks = bmarks, + .dbca_errors = NULL, + }; + err = zcp_sync_task(state, dsl_bookmark_create_check, + dsl_bookmark_create_sync, &dbca, sync, source); + + zcp_deregister_cleanup(state, zch); + fnvlist_free(bmarks); + + return (err); +} + +static int zcp_synctask_set_prop(lua_State *, boolean_t, nvlist_t *err_details); +static zcp_synctask_info_t zcp_synctask_set_prop_info = { + .name = "set_prop", + .func = zcp_synctask_set_prop, + .space_check = ZFS_SPACE_CHECK_RESERVED, + .blocks_modified = 2, + .pargs = { + { .za_name = "dataset", .za_lua_type = 
LUA_TSTRING}, + { .za_name = "property", .za_lua_type = LUA_TSTRING}, + { .za_name = "value", .za_lua_type = LUA_TSTRING}, + { NULL, 0 } + }, + .kwargs = { + { NULL, 0 } + } +}; + +static int +zcp_synctask_set_prop(lua_State *state, boolean_t sync, nvlist_t *err_details) +{ + int err; + zcp_set_prop_arg_t args = { 0 }; + + const char *dsname = lua_tostring(state, 1); + const char *prop = lua_tostring(state, 2); + const char *val = lua_tostring(state, 3); + + args.state = state; + args.dsname = dsname; + args.prop = prop; + args.val = val; + + err = zcp_sync_task(state, zcp_set_prop_check, zcp_set_prop_sync, + &args, sync, dsname); + + return (err); +} + +static int +zcp_synctask_wrapper(lua_State *state) +{ + int err; + zcp_cleanup_handler_t *zch; + int num_ret = 1; + nvlist_t *err_details = fnvlist_alloc(); + + /* + * Make sure err_details is properly freed, even if a fatal error is + * thrown during the synctask. + */ + zch = zcp_register_cleanup(state, + (zcp_cleanup_t *)&fnvlist_free, err_details); + + zcp_synctask_info_t *info = lua_touserdata(state, lua_upvalueindex(1)); + boolean_t sync = lua_toboolean(state, lua_upvalueindex(2)); + + zcp_run_info_t *ri = zcp_run_info(state); + dsl_pool_t *dp = ri->zri_pool; + + /* MOS space is triple-dittoed, so we multiply by 3. */ + uint64_t funcspace = + ((uint64_t)info->blocks_modified << DST_AVG_BLKSHIFT) * 3; + + zcp_parse_args(state, info->name, info->pargs, info->kwargs); + + err = 0; + if (info->space_check != ZFS_SPACE_CHECK_NONE) { + uint64_t quota = dsl_pool_unreserved_space(dp, + info->space_check); + uint64_t used = dsl_dir_phys(dp->dp_root_dir)->dd_used_bytes + + ri->zri_space_used; + + if (used + funcspace > quota) { + err = SET_ERROR(ENOSPC); + } + } + + if (err == 0) { + err = info->func(state, sync, err_details); + } + + if (err == 0) { + ri->zri_space_used += funcspace; + } + + lua_pushnumber(state, (lua_Number)err); + if (fnvlist_num_pairs(err_details) > 0) { + (void) zcp_nvlist_to_lua(state, err_details, NULL, 0); + num_ret++; + } + + zcp_deregister_cleanup(state, zch); + fnvlist_free(err_details); + + return (num_ret); +} + +int +zcp_load_synctask_lib(lua_State *state, boolean_t sync) +{ + int i; + zcp_synctask_info_t *zcp_synctask_funcs[] = { + &zcp_synctask_destroy_info, + &zcp_synctask_promote_info, + &zcp_synctask_rollback_info, + &zcp_synctask_snapshot_info, + &zcp_synctask_inherit_prop_info, + &zcp_synctask_bookmark_info, + &zcp_synctask_set_prop_info, + NULL + }; + + lua_newtable(state); + + for (i = 0; zcp_synctask_funcs[i] != NULL; i++) { + zcp_synctask_info_t *info = zcp_synctask_funcs[i]; + lua_pushlightuserdata(state, info); + lua_pushboolean(state, sync); + lua_pushcclosure(state, &zcp_synctask_wrapper, 2); + lua_setfield(state, -2, info->name); + info++; + } + + return (1); +} diff --git a/module/zfs/zfeature.c b/module/zfs/zfeature.c new file mode 100644 index 000000000000..3757443a5a68 --- /dev/null +++ b/module/zfs/zfeature.c @@ -0,0 +1,526 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright (c) 2011, 2015 by Delphix. All rights reserved.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include "zfeature_common.h"
+#include
+
+/*
+ * ZFS Feature Flags
+ * -----------------
+ *
+ * ZFS feature flags are used to provide fine-grained versioning to the ZFS
+ * on-disk format. Once enabled on a pool, feature flags replace the old
+ * spa_version() number.
+ *
+ * Each new on-disk format change will be given a uniquely identifying string
+ * GUID rather than a version number. This avoids the problem of different
+ * organizations creating new on-disk formats with the same version number. To
+ * keep feature GUIDs unique, they should consist of the reverse DNS name of
+ * the organization which implemented the feature and a short name for the
+ * feature, separated by a colon (e.g. com.delphix:async_destroy).
+ *
+ * Reference Counts
+ * ----------------
+ *
+ * Within each pool, features can be in one of three states: disabled, enabled,
+ * or active. These states are differentiated by a reference count stored on
+ * disk for each feature:
+ *
+ * 1) If there is no reference count stored on disk, the feature is disabled.
+ * 2) If the reference count is 0, a system administrator has enabled the
+ *    feature, but the feature has not been used yet, so no on-disk
+ *    format changes have been made.
+ * 3) If the reference count is greater than 0, the feature is active.
+ *    The format changes required by the feature are currently on disk.
+ *    Note that if the feature's format changes are reversed, the feature
+ *    may choose to set its reference count back to 0.
+ *
+ * Feature flags make no differentiation between non-zero reference counts
+ * for an active feature (e.g. a reference count of 1 means the same thing as a
+ * reference count of 27834721), but feature implementations may choose to use
+ * the reference count to store meaningful information. For example, a new RAID
+ * implementation might set the reference count to the number of vdevs using
+ * it. If all those disks are removed from the pool, the feature goes back to
+ * having a reference count of 0.
+ *
+ * It is the responsibility of the individual features to maintain a non-zero
+ * reference count as long as the feature's format changes are present on disk.
+ *
+ * Dependencies
+ * ------------
+ *
+ * Each feature may depend on other features. The only effect of this
+ * relationship is that when a feature is enabled, all of its dependencies are
+ * automatically enabled as well. Any future work to support disabling of
+ * features would need to ensure that features cannot be disabled if other
+ * enabled features depend on them.
+ *
+ * On-disk Format
+ * --------------
+ *
+ * When feature flags are enabled, spa_version() is set to SPA_VERSION_FEATURES
+ * (5000). In order for this to work, the pool is automatically upgraded to
+ * SPA_VERSION_BEFORE_FEATURES (28) first, so all pre-feature-flags on-disk
+ * format changes will be in use.
+ *
+ * Information about features is stored in 3 ZAP objects in the pool's MOS.
+ * These objects are linked to by the following names in the pool directory
+ * object:
+ *
+ * 1) features_for_read: feature GUID -> reference count
+ *    Features needed to open the pool for reading.
+ * 2) features_for_write: feature GUID -> reference count
+ *    Features needed to open the pool for writing.
+ * 3) feature_descriptions: feature GUID -> descriptive string
+ *    A human readable string.
+ *
+ * All enabled features appear in either features_for_read or
+ * features_for_write, but not both.
+ *
+ * To open a pool in read-only mode, only the features listed in
+ * features_for_read need to be supported.
+ *
+ * To open the pool in read-write mode, features in both features_for_read and
+ * features_for_write need to be supported.
+ *
+ * Some features may be required to read the ZAP objects containing feature
+ * information. To allow software to check for compatibility with these
+ * features before the pool is opened, their names must be stored in the label
+ * in a new "features_for_read" entry (note that features that are only
+ * required to write to a pool never need to be stored in the label since the
+ * features_for_write ZAP object can be read before the pool is written to).
+ * To save space in the label, features must be explicitly marked as needing
+ * to be written to the label. Also, reference counts are not stored in the
+ * label; instead, any feature whose reference count drops to 0 is removed
+ * from the label.
+ *
+ * Adding New Features
+ * -------------------
+ *
+ * Features must be registered in the zpool_feature_init() function in
+ * zfeature_common.c using the zfeature_register() function. This function
+ * has arguments to specify if the feature should be stored in the
+ * features_for_read or features_for_write ZAP object and if it needs to be
+ * written to the label when active.
+ *
+ * Once a feature is registered, it will appear as a "feature@<feature name>"
+ * property which can be set by an administrator. Feature implementors should
+ * use the spa_feature_is_enabled() and spa_feature_is_active() functions to
+ * query the state of a feature and the spa_feature_incr() and
+ * spa_feature_decr() functions to change an enabled feature's reference count.
+ * Reference counts may only be updated in the syncing context.
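+ *
+ * For example (an illustrative sketch; the pool and feature names here are
+ * arbitrary): an administrator might enable a registered feature with
+ * "zpool set feature@async_destroy=enabled tank", after which the code
+ * implementing that feature would guard its on-disk changes with
+ * spa_feature_is_enabled() and account for them with spa_feature_incr() and
+ * spa_feature_decr() from syncing context.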
+ *
+ * Features may not perform enable-time initialization. Instead, any such
+ * initialization should occur when the feature is first used. This design
+ * enforces that on-disk changes be made only when features are used. Code
+ * should only check if a feature is enabled using spa_feature_is_enabled(),
+ * not by relying on any feature specific metadata existing. If a feature is
+ * enabled, but the feature's metadata is not on disk yet, then it should be
+ * created as needed.
+ *
+ * As an example, consider the com.delphix:async_destroy feature. This feature
+ * relies on the existence of a bptree in the MOS that stores blocks for
+ * asynchronous freeing. This bptree is not created when async_destroy is
+ * enabled. Instead, when a dataset is destroyed, spa_feature_is_enabled() is
+ * called to check if async_destroy is enabled. If it is and the bptree object
+ * does not exist yet, the bptree object is created as part of the dataset
+ * destroy and async_destroy's reference count is incremented to indicate it
+ * has made an on-disk format change.
Later, after the destroyed dataset's + * blocks have all been asynchronously freed there is no longer any use for the + * bptree object, so it is destroyed and async_destroy's reference count is + * decremented back to 0 to indicate that it has undone its on-disk format + * changes. + */ + +typedef enum { + FEATURE_ACTION_INCR, + FEATURE_ACTION_DECR, +} feature_action_t; + +/* + * Checks that the active features in the pool are supported by + * this software. Adds each unsupported feature (name -> description) to + * the supplied nvlist. + */ +boolean_t +spa_features_check(spa_t *spa, boolean_t for_write, + nvlist_t *unsup_feat, nvlist_t *enabled_feat) +{ + objset_t *os = spa->spa_meta_objset; + boolean_t supported; + zap_cursor_t *zc; + zap_attribute_t *za; + uint64_t obj = for_write ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + char *buf; + + zc = kmem_alloc(sizeof (zap_cursor_t), KM_SLEEP); + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + buf = kmem_alloc(MAXPATHLEN, KM_SLEEP); + + supported = B_TRUE; + for (zap_cursor_init(zc, os, obj); + zap_cursor_retrieve(zc, za) == 0; + zap_cursor_advance(zc)) { + ASSERT(za->za_integer_length == sizeof (uint64_t) && + za->za_num_integers == 1); + + if (NULL != enabled_feat) { + fnvlist_add_uint64(enabled_feat, za->za_name, + za->za_first_integer); + } + + if (za->za_first_integer != 0 && + !zfeature_is_supported(za->za_name)) { + supported = B_FALSE; + + if (NULL != unsup_feat) { + char *desc = ""; + + if (zap_lookup(os, spa->spa_feat_desc_obj, + za->za_name, 1, MAXPATHLEN, buf) == 0) + desc = buf; + + VERIFY(nvlist_add_string(unsup_feat, + za->za_name, desc) == 0); + } + } + } + zap_cursor_fini(zc); + + kmem_free(buf, MAXPATHLEN); + kmem_free(za, sizeof (zap_attribute_t)); + kmem_free(zc, sizeof (zap_cursor_t)); + + return (supported); +} + +/* + * Use an in-memory cache of feature refcounts for quick retrieval. + * + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb, zhack, and spa_add_feature_stats(). + */ +int +feature_get_refcount(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + ASSERT(VALID_FEATURE_FID(feature->fi_feature)); + if (spa->spa_feat_refcount_cache[feature->fi_feature] == + SPA_FEATURE_DISABLED) { + return (SET_ERROR(ENOTSUP)); + } + *res = spa->spa_feat_refcount_cache[feature->fi_feature]; + return (0); +} + +/* + * Note: well-designed features will not need to use this; they should + * use spa_feature_is_enabled() and spa_feature_is_active() instead. + * However, this is non-static for zdb and zhack. + */ +int +feature_get_refcount_from_disk(spa_t *spa, zfeature_info_t *feature, + uint64_t *res) +{ + int err; + uint64_t refcount; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + /* + * If the pool is currently being created, the feature objects may not + * have been allocated yet. Act as though all features are disabled. 
+ */ + if (zapobj == 0) + return (SET_ERROR(ENOTSUP)); + + err = zap_lookup(spa->spa_meta_objset, zapobj, + feature->fi_guid, sizeof (uint64_t), 1, &refcount); + if (err != 0) { + if (err == ENOENT) + return (SET_ERROR(ENOTSUP)); + else + return (err); + } + *res = refcount; + return (0); +} + + +static int +feature_get_enabled_txg(spa_t *spa, zfeature_info_t *feature, uint64_t *res) +{ + uint64_t enabled_txg_obj __maybe_unused = spa->spa_feat_enabled_txg_obj; + + ASSERT(zfeature_depends_on(feature->fi_feature, + SPA_FEATURE_ENABLED_TXG)); + + if (!spa_feature_is_enabled(spa, feature->fi_feature)) { + return (SET_ERROR(ENOTSUP)); + } + + ASSERT(enabled_txg_obj != 0); + + VERIFY0(zap_lookup(spa->spa_meta_objset, spa->spa_feat_enabled_txg_obj, + feature->fi_guid, sizeof (uint64_t), 1, res)); + + return (0); +} + +/* + * This function is non-static for zhack; it should otherwise not be used + * outside this file. + */ +void +feature_sync(spa_t *spa, zfeature_info_t *feature, uint64_t refcount, + dmu_tx_t *tx) +{ + ASSERT(VALID_FEATURE_OR_NONE(feature->fi_feature)); + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + VERIFY0(zap_update(spa->spa_meta_objset, zapobj, feature->fi_guid, + sizeof (uint64_t), 1, &refcount, tx)); + + /* + * feature_sync is called directly from zhack, allowing the + * creation of arbitrary features whose fi_feature field may + * be greater than SPA_FEATURES. When called from zhack, the + * zfeature_info_t object's fi_feature field will be set to + * SPA_FEATURE_NONE. + */ + if (feature->fi_feature != SPA_FEATURE_NONE) { + uint64_t *refcount_cache = + &spa->spa_feat_refcount_cache[feature->fi_feature]; + VERIFY3U(*refcount_cache, ==, + atomic_swap_64(refcount_cache, refcount)); + } + + if (refcount == 0) + spa_deactivate_mos_feature(spa, feature->fi_guid); + else if (feature->fi_flags & ZFEATURE_FLAG_MOS) + spa_activate_mos_feature(spa, feature->fi_guid, tx); +} + +/* + * This function is non-static for zhack; it should otherwise not be used + * outside this file. + */ +void +feature_enable_sync(spa_t *spa, zfeature_info_t *feature, dmu_tx_t *tx) +{ + uint64_t initial_refcount = + (feature->fi_flags & ZFEATURE_FLAG_ACTIVATE_ON_ENABLE) ? 1 : 0; + uint64_t zapobj = (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + + /* + * If the feature is already enabled, ignore the request. 
+ */ + if (zap_contains(spa->spa_meta_objset, zapobj, feature->fi_guid) == 0) + return; + + for (int i = 0; feature->fi_depends[i] != SPA_FEATURE_NONE; i++) + spa_feature_enable(spa, feature->fi_depends[i], tx); + + VERIFY0(zap_update(spa->spa_meta_objset, spa->spa_feat_desc_obj, + feature->fi_guid, 1, strlen(feature->fi_desc) + 1, + feature->fi_desc, tx)); + + feature_sync(spa, feature, initial_refcount, tx); + + if (spa_feature_is_enabled(spa, SPA_FEATURE_ENABLED_TXG)) { + uint64_t enabling_txg = dmu_tx_get_txg(tx); + + if (spa->spa_feat_enabled_txg_obj == 0ULL) { + spa->spa_feat_enabled_txg_obj = + zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_ENABLED_TXG, tx); + } + spa_feature_incr(spa, SPA_FEATURE_ENABLED_TXG, tx); + + VERIFY0(zap_add(spa->spa_meta_objset, + spa->spa_feat_enabled_txg_obj, feature->fi_guid, + sizeof (uint64_t), 1, &enabling_txg, tx)); + } + + /* + * Errata #4 is mostly a problem with encrypted datasets, but it + * is also a problem where the old encryption feature did not + * depend on the bookmark_v2 feature. If the pool does not have + * any encrypted datasets we can resolve this issue simply by + * enabling this dependency. + */ + if (spa->spa_errata == ZPOOL_ERRATA_ZOL_8308_ENCRYPTION && + spa_feature_is_enabled(spa, SPA_FEATURE_ENCRYPTION) && + !spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION) && + feature->fi_feature == SPA_FEATURE_BOOKMARK_V2) + spa->spa_errata = 0; +} + +static void +feature_do_action(spa_t *spa, spa_feature_t fid, feature_action_t action, + dmu_tx_t *tx) +{ + uint64_t refcount = 0; + zfeature_info_t *feature = &spa_feature_table[fid]; + uint64_t zapobj __maybe_unused = + (feature->fi_flags & ZFEATURE_FLAG_READONLY_COMPAT) ? + spa->spa_feat_for_write_obj : spa->spa_feat_for_read_obj; + + ASSERT(VALID_FEATURE_FID(fid)); + ASSERT(0 != zapobj); + ASSERT(zfeature_is_valid_guid(feature->fi_guid)); + + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + + VERIFY3U(feature_get_refcount(spa, feature, &refcount), !=, ENOTSUP); + + switch (action) { + case FEATURE_ACTION_INCR: + VERIFY3U(refcount, !=, UINT64_MAX); + refcount++; + break; + case FEATURE_ACTION_DECR: + VERIFY3U(refcount, !=, 0); + refcount--; + break; + default: + ASSERT(0); + break; + } + + feature_sync(spa, feature, refcount, tx); +} + +void +spa_feature_create_zap_objects(spa_t *spa, dmu_tx_t *tx) +{ + /* + * We create feature flags ZAP objects in two instances: during pool + * creation and during pool upgrade. + */ + ASSERT((!spa->spa_sync_on && tx->tx_txg == TXG_INITIAL) || + dsl_pool_sync_context(spa_get_dsl(spa))); + + spa->spa_feat_for_read_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_READ, tx); + spa->spa_feat_for_write_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURES_FOR_WRITE, tx); + spa->spa_feat_desc_obj = zap_create_link(spa->spa_meta_objset, + DMU_OTN_ZAP_METADATA, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_FEATURE_DESCRIPTIONS, tx); +} + +/* + * Enable any required dependencies, then enable the requested feature. 
+ */ +void +spa_feature_enable(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + ASSERT3U(spa_version(spa), >=, SPA_VERSION_FEATURES); + ASSERT(VALID_FEATURE_FID(fid)); + feature_enable_sync(spa, &spa_feature_table[fid], tx); +} + +void +spa_feature_incr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + feature_do_action(spa, fid, FEATURE_ACTION_INCR, tx); +} + +void +spa_feature_decr(spa_t *spa, spa_feature_t fid, dmu_tx_t *tx) +{ + feature_do_action(spa, fid, FEATURE_ACTION_DECR, tx); +} + +boolean_t +spa_feature_is_enabled(spa_t *spa, spa_feature_t fid) +{ + int err; + uint64_t refcount = 0; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0); +} + +boolean_t +spa_feature_is_active(spa_t *spa, spa_feature_t fid) +{ + int err; + uint64_t refcount = 0; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_refcount(spa, &spa_feature_table[fid], &refcount); + ASSERT(err == 0 || err == ENOTSUP); + return (err == 0 && refcount > 0); +} + +/* + * For the feature specified by fid (which must depend on + * SPA_FEATURE_ENABLED_TXG), return the TXG at which it was enabled in the + * OUT txg argument. + * + * Returns B_TRUE if the feature is enabled, in which case txg will be filled + * with the transaction group in which the specified feature was enabled. + * Returns B_FALSE otherwise (i.e. if the feature is not enabled). + */ +boolean_t +spa_feature_enabled_txg(spa_t *spa, spa_feature_t fid, uint64_t *txg) +{ + int err; + + ASSERT(VALID_FEATURE_FID(fid)); + if (spa_version(spa) < SPA_VERSION_FEATURES) + return (B_FALSE); + + err = feature_get_enabled_txg(spa, &spa_feature_table[fid], txg); + ASSERT(err == 0 || err == ENOTSUP); + + return (err == 0); +} diff --git a/module/zfs/zfs_byteswap.c b/module/zfs/zfs_byteswap.c new file mode 100644 index 000000000000..cd35849c3f37 --- /dev/null +++ b/module/zfs/zfs_byteswap.c @@ -0,0 +1,211 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+#ifndef _KERNEL
+static
+#endif
+void
+zfs_oldace_byteswap(ace_t *ace, int ace_cnt)
+{
+	int i;
+
+	for (i = 0; i != ace_cnt; i++, ace++) {
+		ace->a_who = BSWAP_32(ace->a_who);
+		ace->a_access_mask = BSWAP_32(ace->a_access_mask);
+		ace->a_flags = BSWAP_16(ace->a_flags);
+		ace->a_type = BSWAP_16(ace->a_type);
+	}
+}
+
+/*
+ * swap ace_t and ace_object_t
+ */
+#ifndef _KERNEL
+static
+#endif
+void
+zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout)
+{
+	caddr_t end;
+	caddr_t ptr;
+	zfs_ace_t *zacep = NULL;
+	ace_t *acep;
+	uint16_t entry_type;
+	size_t entry_size;
+	int ace_type;
+
+	end = (caddr_t)buf + size;
+	ptr = buf;
+
+	while (ptr < end) {
+		if (zfs_layout) {
+			/*
+			 * Avoid overrun. Embedded aces can have one
+			 * of several sizes. We don't know exactly
+			 * how many are present, only the size of the
+			 * buffer containing them. That size may be
+			 * larger than needed to hold the aces
+			 * present. As long as we do not do any
+			 * swapping beyond the end of our block we are
+			 * okay. It is safe to swap any non-ace data
+			 * within the block since it is just zeros.
+			 */
+			if (ptr + sizeof (zfs_ace_hdr_t) > end) {
+				break;
+			}
+			zacep = (zfs_ace_t *)ptr;
+			zacep->z_hdr.z_access_mask =
+			    BSWAP_32(zacep->z_hdr.z_access_mask);
+			zacep->z_hdr.z_flags = BSWAP_16(zacep->z_hdr.z_flags);
+			ace_type = zacep->z_hdr.z_type =
+			    BSWAP_16(zacep->z_hdr.z_type);
+			entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS;
+		} else {
+			/* Overrun avoidance */
+			if (ptr + sizeof (ace_t) > end) {
+				break;
+			}
+			acep = (ace_t *)ptr;
+			acep->a_access_mask = BSWAP_32(acep->a_access_mask);
+			acep->a_flags = BSWAP_16(acep->a_flags);
+			ace_type = acep->a_type = BSWAP_16(acep->a_type);
+			acep->a_who = BSWAP_32(acep->a_who);
+			entry_type = acep->a_flags & ACE_TYPE_FLAGS;
+		}
+		switch (entry_type) {
+		case ACE_OWNER:
+		case ACE_EVERYONE:
+		case (ACE_IDENTIFIER_GROUP | ACE_GROUP):
+			entry_size = zfs_layout ?
+			    sizeof (zfs_ace_hdr_t) : sizeof (ace_t);
+			break;
+		case ACE_IDENTIFIER_GROUP:
+		default:
+			/* Overrun avoidance */
+			if (zfs_layout) {
+				if (ptr + sizeof (zfs_ace_t) <= end) {
+					zacep->z_fuid = BSWAP_64(zacep->z_fuid);
+				} else {
+					entry_size = sizeof (zfs_ace_t);
+					break;
+				}
+			}
+			switch (ace_type) {
+			case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE:
+			case ACE_ACCESS_DENIED_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_AUDIT_OBJECT_ACE_TYPE:
+			case ACE_SYSTEM_ALARM_OBJECT_ACE_TYPE:
+				entry_size = zfs_layout ?
+				    sizeof (zfs_object_ace_t) :
+				    sizeof (ace_object_t);
+				break;
+			default:
+				entry_size = zfs_layout ?
sizeof (zfs_ace_t) : + sizeof (ace_t); + break; + } + } + ptr = ptr + entry_size; + } +} + +/* ARGSUSED */ +void +zfs_oldacl_byteswap(void *buf, size_t size) +{ + int cnt; + + /* + * Arggh, since we don't know how many ACEs are in + * the array, we have to swap the entire block + */ + + cnt = size / sizeof (ace_t); + + zfs_oldace_byteswap((ace_t *)buf, cnt); +} + +/* ARGSUSED */ +void +zfs_acl_byteswap(void *buf, size_t size) +{ + zfs_ace_byteswap(buf, size, B_TRUE); +} + +void +zfs_znode_byteswap(void *buf, size_t size) +{ + znode_phys_t *zp = buf; + + ASSERT(size >= sizeof (znode_phys_t)); + + zp->zp_crtime[0] = BSWAP_64(zp->zp_crtime[0]); + zp->zp_crtime[1] = BSWAP_64(zp->zp_crtime[1]); + zp->zp_atime[0] = BSWAP_64(zp->zp_atime[0]); + zp->zp_atime[1] = BSWAP_64(zp->zp_atime[1]); + zp->zp_mtime[0] = BSWAP_64(zp->zp_mtime[0]); + zp->zp_mtime[1] = BSWAP_64(zp->zp_mtime[1]); + zp->zp_ctime[0] = BSWAP_64(zp->zp_ctime[0]); + zp->zp_ctime[1] = BSWAP_64(zp->zp_ctime[1]); + zp->zp_gen = BSWAP_64(zp->zp_gen); + zp->zp_mode = BSWAP_64(zp->zp_mode); + zp->zp_size = BSWAP_64(zp->zp_size); + zp->zp_parent = BSWAP_64(zp->zp_parent); + zp->zp_links = BSWAP_64(zp->zp_links); + zp->zp_xattr = BSWAP_64(zp->zp_xattr); + zp->zp_rdev = BSWAP_64(zp->zp_rdev); + zp->zp_flags = BSWAP_64(zp->zp_flags); + zp->zp_uid = BSWAP_64(zp->zp_uid); + zp->zp_gid = BSWAP_64(zp->zp_gid); + zp->zp_zap = BSWAP_64(zp->zp_zap); + zp->zp_pad[0] = BSWAP_64(zp->zp_pad[0]); + zp->zp_pad[1] = BSWAP_64(zp->zp_pad[1]); + zp->zp_pad[2] = BSWAP_64(zp->zp_pad[2]); + + zp->zp_acl.z_acl_extern_obj = BSWAP_64(zp->zp_acl.z_acl_extern_obj); + zp->zp_acl.z_acl_size = BSWAP_32(zp->zp_acl.z_acl_size); + zp->zp_acl.z_acl_version = BSWAP_16(zp->zp_acl.z_acl_version); + zp->zp_acl.z_acl_count = BSWAP_16(zp->zp_acl.z_acl_count); + if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { + zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], + ZFS_ACE_SPACE); + } else { + zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], + ACE_SLOT_CNT); + } +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_oldacl_byteswap); +EXPORT_SYMBOL(zfs_acl_byteswap); +EXPORT_SYMBOL(zfs_znode_byteswap); +#endif diff --git a/module/zfs/zfs_fm.c b/module/zfs/zfs_fm.c new file mode 100644 index 000000000000..579aa0380411 --- /dev/null +++ b/module/zfs/zfs_fm.c @@ -0,0 +1,1083 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +/* + * This general routine is responsible for generating all the different ZFS + * ereports. 
The payload is dependent on the class, and which arguments are + * supplied to the function: + * + * EREPORT POOL VDEV IO + * block X X X + * data X X + * device X X + * pool X + * + * If we are in a loading state, all errors are chained together by the same + * SPA-wide ENA (Error Numeric Association). + * + * For isolated I/O requests, we get the ENA from the zio_t. The propagation + * gets very complicated due to RAID-Z, gang blocks, and vdev caching. We want + * to chain together all ereports associated with a logical piece of data. For + * read I/Os, there are basically three 'types' of I/O, which form a roughly + * layered diagram: + * + * +---------------+ + * | Aggregate I/O | No associated logical data or device + * +---------------+ + * | + * V + * +---------------+ Reads associated with a piece of logical data. + * | Read I/O | This includes reads on behalf of RAID-Z, + * +---------------+ mirrors, gang blocks, retries, etc. + * | + * V + * +---------------+ Reads associated with a particular device, but + * | Physical I/O | no logical data. Issued as part of vdev caching + * +---------------+ and I/O aggregation. + * + * Note that 'physical I/O' here is not the same terminology as used in the rest + * of ZIO. Typically, 'physical I/O' simply means that there is no attached + * blockpointer. But I/O with no associated block pointer can still be related + * to a logical piece of data (i.e. RAID-Z requests). + * + * Purely physical I/O always have unique ENAs. They are not related to a + * particular piece of logical data, and therefore cannot be chained together. + * We still generate an ereport, but the DE doesn't correlate it with any + * logical piece of data. When such an I/O fails, the delegated I/O requests + * will issue a retry, which will trigger the 'real' ereport with the correct + * ENA. + * + * We keep track of the ENA for a ZIO chain through the 'io_logical' member. + * When a new logical I/O is issued, we set this to point to itself. Child I/Os + * then inherit this pointer, so that when it is first set subsequent failures + * will use the same ENA. For vdev cache fill and queue aggregation I/O, + * this pointer is set to NULL, and no ereport will be generated (since it + * doesn't actually correspond to any particular device or piece of data, + * and the caller will always retry without caching or queueing anyway). + * + * For checksum errors, we want to include more information about the actual + * error which occurs. Accordingly, we build an ereport when the error is + * noticed, but instead of sending it in immediately, we hang it off of the + * io_cksum_report field of the logical IO. When the logical IO completes + * (successfully or not), zfs_ereport_finish_checksum() is called with the + * good and bad versions of the buffer (if available), and we annotate the + * ereport with information about the differences. + */ +#ifdef _KERNEL +void +zfs_zevent_post_cb(nvlist_t *nvl, nvlist_t *detector) +{ + if (nvl) + fm_nvlist_destroy(nvl, FM_NVA_FREE); + + if (detector) + fm_nvlist_destroy(detector, FM_NVA_FREE); +} + +/* + * We want to rate limit ZIO delay and checksum events so as to not + * flood ZED when a disk is acting up. + * + * Returns 1 if we're ratelimiting, 0 if not. + */ +static int +zfs_is_ratelimiting_event(const char *subclass, vdev_t *vd) +{ + int rc = 0; + /* + * __ratelimit() returns 1 if we're *not* ratelimiting and 0 if we + * are. Invert it to get our return value. 
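+	 *
+	 * For example, when a flaky disk emits a burst of delay events,
+	 * zfs_ratelimit() returns 1 (allow) for the first few and 0
+	 * (suppress) for the rest; the negation below turns the suppressed
+	 * ones into rc == 1, so the caller drops them after bumping the
+	 * dropped-ereport counter.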
+ */ + if (strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) { + rc = !zfs_ratelimit(&vd->vdev_delay_rl); + } else if (strcmp(subclass, FM_EREPORT_ZFS_CHECKSUM) == 0) { + rc = !zfs_ratelimit(&vd->vdev_checksum_rl); + } + + if (rc) { + /* We're rate limiting */ + fm_erpt_dropped_increment(); + } + + return (rc); +} + +/* + * Return B_TRUE if the event actually posted, B_FALSE if not. + */ +static boolean_t +zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, + const char *subclass, spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, + zio_t *zio, uint64_t stateoroffset, uint64_t size) +{ + nvlist_t *ereport, *detector; + + uint64_t ena; + char class[64]; + + if (!zfs_ereport_is_valid(subclass, spa, vd, zio)) + return (B_FALSE); + + if ((ereport = fm_nvlist_create(NULL)) == NULL) + return (B_FALSE); + + if ((detector = fm_nvlist_create(NULL)) == NULL) { + fm_nvlist_destroy(ereport, FM_NVA_FREE); + return (B_FALSE); + } + + /* + * Serialize ereport generation + */ + mutex_enter(&spa->spa_errlist_lock); + + /* + * Determine the ENA to use for this event. If we are in a loading + * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use + * a root zio-wide ENA. Otherwise, simply use a unique ENA. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE) { + if (spa->spa_ena == 0) + spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); + ena = spa->spa_ena; + } else if (zio != NULL && zio->io_logical != NULL) { + if (zio->io_logical->io_ena == 0) + zio->io_logical->io_ena = + fm_ena_generate(0, FM_ENA_FMT1); + ena = zio->io_logical->io_ena; + } else { + ena = fm_ena_generate(0, FM_ENA_FMT1); + } + + /* + * Construct the full class, detector, and other standard FMA fields. + */ + (void) snprintf(class, sizeof (class), "%s.%s", + ZFS_ERROR_CLASS, subclass); + + fm_fmri_zfs_set(detector, FM_ZFS_SCHEME_VERSION, spa_guid(spa), + vd != NULL ? vd->vdev_guid : 0); + + fm_ereport_set(ereport, FM_EREPORT_VERSION, class, ena, detector, NULL); + + /* + * Construct the per-ereport payload, depending on which parameters are + * passed in. + */ + + /* + * Generic payload members common to all ereports. + */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_POOL, DATA_TYPE_STRING, spa_name(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, DATA_TYPE_UINT64, + (uint64_t)spa_state(spa), + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, + (int32_t)spa_load_state(spa), NULL); + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, + DATA_TYPE_STRING, + spa_get_failmode(spa) == ZIO_FAILURE_MODE_WAIT ? + FM_EREPORT_FAILMODE_WAIT : + spa_get_failmode(spa) == ZIO_FAILURE_MODE_CONTINUE ? 
+ FM_EREPORT_FAILMODE_CONTINUE : FM_EREPORT_FAILMODE_PANIC, + NULL); + + if (vd != NULL) { + vdev_t *pvd = vd->vdev_parent; + vdev_queue_t *vq = &vd->vdev_queue; + vdev_stat_t *vs = &vd->vdev_stat; + vdev_t *spare_vd; + uint64_t *spare_guids; + char **spare_paths; + int i, spare_count; + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, + DATA_TYPE_UINT64, vd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, + DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); + if (vd->vdev_path != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + DATA_TYPE_STRING, vd->vdev_path, NULL); + if (vd->vdev_devid != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, + DATA_TYPE_STRING, vd->vdev_devid, NULL); + if (vd->vdev_fru != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, + DATA_TYPE_STRING, vd->vdev_fru, NULL); + if (vd->vdev_enc_sysfs_path != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, + DATA_TYPE_STRING, vd->vdev_enc_sysfs_path, NULL); + if (vd->vdev_ashift) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ASHIFT, + DATA_TYPE_UINT64, vd->vdev_ashift, NULL); + + if (vq != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_COMP_TS, + DATA_TYPE_UINT64, vq->vq_io_complete_ts, NULL); + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DELTA_TS, + DATA_TYPE_UINT64, vq->vq_io_delta_ts, NULL); + } + + if (vs != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_READ_ERRORS, + DATA_TYPE_UINT64, vs->vs_read_errors, + FM_EREPORT_PAYLOAD_ZFS_VDEV_WRITE_ERRORS, + DATA_TYPE_UINT64, vs->vs_write_errors, + FM_EREPORT_PAYLOAD_ZFS_VDEV_CKSUM_ERRORS, + DATA_TYPE_UINT64, vs->vs_checksum_errors, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DELAYS, + DATA_TYPE_UINT64, vs->vs_slow_ios, + NULL); + } + + if (pvd != NULL) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID, + DATA_TYPE_UINT64, pvd->vdev_guid, + FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE, + DATA_TYPE_STRING, pvd->vdev_ops->vdev_op_type, + NULL); + if (pvd->vdev_path) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH, + DATA_TYPE_STRING, pvd->vdev_path, NULL); + if (pvd->vdev_devid) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PARENT_DEVID, + DATA_TYPE_STRING, pvd->vdev_devid, NULL); + } + + spare_count = spa->spa_spares.sav_count; + spare_paths = kmem_zalloc(sizeof (char *) * spare_count, + KM_SLEEP); + spare_guids = kmem_zalloc(sizeof (uint64_t) * spare_count, + KM_SLEEP); + + for (i = 0; i < spare_count; i++) { + spare_vd = spa->spa_spares.sav_vdevs[i]; + if (spare_vd) { + spare_paths[i] = spare_vd->vdev_path; + spare_guids[i] = spare_vd->vdev_guid; + } + } + + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_PATHS, + DATA_TYPE_STRING_ARRAY, spare_count, spare_paths, + FM_EREPORT_PAYLOAD_ZFS_VDEV_SPARE_GUIDS, + DATA_TYPE_UINT64_ARRAY, spare_count, spare_guids, NULL); + + kmem_free(spare_guids, sizeof (uint64_t) * spare_count); + kmem_free(spare_paths, sizeof (char *) * spare_count); + } + + if (zio != NULL) { + /* + * Payload common to all I/Os. 
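+		 * These fields (error, flags, pipeline stage, delay and
+		 * timestamps) are copied straight out of the zio_t.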
+ */ + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_ERR, + DATA_TYPE_INT32, zio->io_error, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_FLAGS, + DATA_TYPE_INT32, zio->io_flags, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_STAGE, + DATA_TYPE_UINT32, zio->io_stage, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_PIPELINE, + DATA_TYPE_UINT32, zio->io_pipeline, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELAY, + DATA_TYPE_UINT64, zio->io_delay, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_TIMESTAMP, + DATA_TYPE_UINT64, zio->io_timestamp, NULL); + fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_ZIO_DELTA, + DATA_TYPE_UINT64, zio->io_delta, NULL); + + /* + * If the 'size' parameter is non-zero, it indicates this is a + * RAID-Z or other I/O where the physical offset and length are + * provided for us, instead of within the zio_t. + */ + if (vd != NULL) { + if (size) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, stateoroffset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, size, NULL); + else + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET, + DATA_TYPE_UINT64, zio->io_offset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE, + DATA_TYPE_UINT64, zio->io_size, NULL); + } + } else if (vd != NULL) { + /* + * If we have a vdev but no zio, this is a device fault, and the + * 'stateoroffset' parameter indicates the previous state of the + * vdev. + */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, + DATA_TYPE_UINT64, stateoroffset, NULL); + } + + /* + * Payload for I/Os with corresponding logical information. + */ + if (zb != NULL && (zio == NULL || zio->io_logical != NULL)) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJSET, + DATA_TYPE_UINT64, zb->zb_objset, + FM_EREPORT_PAYLOAD_ZFS_ZIO_OBJECT, + DATA_TYPE_UINT64, zb->zb_object, + FM_EREPORT_PAYLOAD_ZFS_ZIO_LEVEL, + DATA_TYPE_INT64, zb->zb_level, + FM_EREPORT_PAYLOAD_ZFS_ZIO_BLKID, + DATA_TYPE_UINT64, zb->zb_blkid, NULL); + } + + mutex_exit(&spa->spa_errlist_lock); + + *ereport_out = ereport; + *detector_out = detector; + return (B_TRUE); +} + +/* if it's <= 128 bytes, save the corruption directly */ +#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) + +#define MAX_RANGES 16 + +typedef struct zfs_ecksum_info { + /* histograms of set and cleared bits by bit number in a 64-bit word */ + uint32_t zei_histogram_set[sizeof (uint64_t) * NBBY]; + uint32_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; + + /* inline arrays of bits set and cleared. */ + uint64_t zei_bits_set[ZFM_MAX_INLINE]; + uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; + + /* + * for each range, the number of bits set and cleared. The Hamming + * distance between the good and bad buffers is the sum of them all. 
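+	 *
+	 * For example, if some word is 0xF0 in the good buffer and 0xFC
+	 * in the bad one, two bits were set (0x0C) and none cleared, so
+	 * that word adds 2 to its range's set count and 2 to the overall
+	 * Hamming distance.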
+ */ + uint32_t zei_range_sets[MAX_RANGES]; + uint32_t zei_range_clears[MAX_RANGES]; + + struct zei_ranges { + uint32_t zr_start; + uint32_t zr_end; + } zei_ranges[MAX_RANGES]; + + size_t zei_range_count; + uint32_t zei_mingap; + uint32_t zei_allowed_mingap; + +} zfs_ecksum_info_t; + +static void +update_histogram(uint64_t value_arg, uint32_t *hist, uint32_t *count) +{ + size_t i; + size_t bits = 0; + uint64_t value = BE_64(value_arg); + + /* We store the bits in big-endian (largest-first) order */ + for (i = 0; i < 64; i++) { + if (value & (1ull << i)) { + hist[63 - i]++; + ++bits; + } + } + /* update the count of bits changed */ + *count += bits; +} + +/* + * We've now filled up the range array, and need to increase "mingap" and + * shrink the range list accordingly. zei_mingap is always the smallest + * distance between array entries, so we set the new_allowed_gap to be + * one greater than that. We then go through the list, joining together + * any ranges which are closer than the new_allowed_gap. + * + * By construction, there will be at least one. We also update zei_mingap + * to the new smallest gap, to prepare for our next invocation. + */ +static void +zei_shrink_ranges(zfs_ecksum_info_t *eip) +{ + uint32_t mingap = UINT32_MAX; + uint32_t new_allowed_gap = eip->zei_mingap + 1; + + size_t idx, output; + size_t max = eip->zei_range_count; + + struct zei_ranges *r = eip->zei_ranges; + + ASSERT3U(eip->zei_range_count, >, 0); + ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); + + output = idx = 0; + while (idx < max - 1) { + uint32_t start = r[idx].zr_start; + uint32_t end = r[idx].zr_end; + + while (idx < max - 1) { + idx++; + + uint32_t nstart = r[idx].zr_start; + uint32_t nend = r[idx].zr_end; + + uint32_t gap = nstart - end; + if (gap < new_allowed_gap) { + end = nend; + continue; + } + if (gap < mingap) + mingap = gap; + break; + } + r[output].zr_start = start; + r[output].zr_end = end; + output++; + } + ASSERT3U(output, <, eip->zei_range_count); + eip->zei_range_count = output; + eip->zei_mingap = mingap; + eip->zei_allowed_mingap = new_allowed_gap; +} + +static void +zei_add_range(zfs_ecksum_info_t *eip, int start, int end) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + + if (count >= MAX_RANGES) { + zei_shrink_ranges(eip); + count = eip->zei_range_count; + } + if (count == 0) { + eip->zei_mingap = UINT32_MAX; + eip->zei_allowed_mingap = 1; + } else { + int gap = start - r[count - 1].zr_end; + + if (gap < eip->zei_allowed_mingap) { + r[count - 1].zr_end = end; + return; + } + if (gap < eip->zei_mingap) + eip->zei_mingap = gap; + } + r[count].zr_start = start; + r[count].zr_end = end; + eip->zei_range_count++; +} + +static size_t +zei_range_total_size(zfs_ecksum_info_t *eip) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + size_t result = 0; + size_t idx; + + for (idx = 0; idx < count; idx++) + result += (r[idx].zr_end - r[idx].zr_start); + + return (result); +} + +static zfs_ecksum_info_t * +annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, + const abd_t *goodabd, const abd_t *badabd, size_t size, + boolean_t drop_if_identical) +{ + const uint64_t *good; + const uint64_t *bad; + + uint64_t allset = 0; + uint64_t allcleared = 0; + + size_t nui64s = size / sizeof (uint64_t); + + size_t inline_size; + int no_inline = 0; + size_t idx; + size_t range; + + size_t offset = 0; + ssize_t start = -1; + + zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); + + /* don't do any annotation for 
injected checksum errors */ + if (info != NULL && info->zbc_injected) + return (eip); + + if (info != NULL && info->zbc_has_cksum) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_expected) / sizeof (uint64_t), + (uint64_t *)&info->zbc_expected, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_actual) / sizeof (uint64_t), + (uint64_t *)&info->zbc_actual, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, + DATA_TYPE_STRING, + info->zbc_checksum_name, + NULL); + + if (info->zbc_byteswapped) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, + DATA_TYPE_BOOLEAN, 1, + NULL); + } + } + + if (badabd == NULL || goodabd == NULL) + return (eip); + + ASSERT3U(nui64s, <=, UINT32_MAX); + ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, <=, UINT32_MAX); + + good = (const uint64_t *) abd_borrow_buf_copy((abd_t *)goodabd, size); + bad = (const uint64_t *) abd_borrow_buf_copy((abd_t *)badabd, size); + + /* build up the range list by comparing the two buffers. */ + for (idx = 0; idx < nui64s; idx++) { + if (good[idx] == bad[idx]) { + if (start == -1) + continue; + + zei_add_range(eip, start, idx); + start = -1; + } else { + if (start != -1) + continue; + + start = idx; + } + } + if (start != -1) + zei_add_range(eip, start, idx); + + /* See if it will fit in our inline buffers */ + inline_size = zei_range_total_size(eip); + if (inline_size > ZFM_MAX_INLINE) + no_inline = 1; + + /* + * If there is no change and we want to drop if the buffers are + * identical, do so. + */ + if (inline_size == 0 && drop_if_identical) { + kmem_free(eip, sizeof (*eip)); + abd_return_buf((abd_t *)goodabd, (void *)good, size); + abd_return_buf((abd_t *)badabd, (void *)bad, size); + return (NULL); + } + + /* + * Now walk through the ranges, filling in the details of the + * differences. Also convert our uint64_t-array offsets to byte + * offsets. 
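+	 * For example, a range covering words [2, 4) is reported as the
+	 * byte range [16, 32) once multiplied by sizeof (uint64_t) below.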
+ */ + for (range = 0; range < eip->zei_range_count; range++) { + size_t start = eip->zei_ranges[range].zr_start; + size_t end = eip->zei_ranges[range].zr_end; + + for (idx = start; idx < end; idx++) { + uint64_t set, cleared; + + // bits set in bad, but not in good + set = ((~good[idx]) & bad[idx]); + // bits set in good, but not in bad + cleared = (good[idx] & (~bad[idx])); + + allset |= set; + allcleared |= cleared; + + if (!no_inline) { + ASSERT3U(offset, <, inline_size); + eip->zei_bits_set[offset] = set; + eip->zei_bits_cleared[offset] = cleared; + offset++; + } + + update_histogram(set, eip->zei_histogram_set, + &eip->zei_range_sets[range]); + update_histogram(cleared, eip->zei_histogram_cleared, + &eip->zei_range_clears[range]); + } + + /* convert to byte offsets */ + eip->zei_ranges[range].zr_start *= sizeof (uint64_t); + eip->zei_ranges[range].zr_end *= sizeof (uint64_t); + } + + abd_return_buf((abd_t *)goodabd, (void *)good, size); + abd_return_buf((abd_t *)badabd, (void *)bad, size); + + eip->zei_allowed_mingap *= sizeof (uint64_t); + inline_size *= sizeof (uint64_t); + + /* fill in ereport */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, + DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, + (uint32_t *)eip->zei_ranges, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, + DATA_TYPE_UINT32, eip->zei_allowed_mingap, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, + NULL); + + if (!no_inline) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_cleared, + NULL); + } else { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, + DATA_TYPE_UINT32_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, + DATA_TYPE_UINT32_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, + NULL); + } + return (eip); +} +#endif + +/* + * Make sure our event is still valid for the given zio/vdev/pool. For example, + * we don't want to keep logging events for a faulted or missing vdev. + */ +boolean_t +zfs_ereport_is_valid(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio) +{ +#ifdef _KERNEL + /* + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. + */ + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) + return (B_FALSE); + + /* + * If we are in the middle of opening a pool, and the previous attempt + * failed, don't bother logging any new ereports - we're just going to + * get the same diagnosis anyway. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE && + spa->spa_last_open_failed) + return (B_FALSE); + + if (zio != NULL) { + /* + * If this is not a read or write zio, ignore the error. This + * can occur if the DKIOCFLUSHWRITECACHE ioctl fails. + */ + if (zio->io_type != ZIO_TYPE_READ && + zio->io_type != ZIO_TYPE_WRITE) + return (B_FALSE); + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. 
This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. + */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return (B_FALSE); + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return (B_FALSE); + } + } + + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return (B_FALSE); + + /* Ignore bogus delay events (like from ioctls or unqueued IOs) */ + if ((strcmp(subclass, FM_EREPORT_ZFS_DELAY) == 0) && + (zio != NULL) && (!zio->io_timestamp)) { + return (B_FALSE); + } +#endif + return (B_TRUE); +} + +/* + * Return 0 if event was posted, EINVAL if there was a problem posting it or + * EBUSY if the event was rate limited. + */ +int +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, + const zbookmark_phys_t *zb, zio_t *zio, uint64_t stateoroffset, + uint64_t size) +{ + int rc = 0; +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + + if (zfs_is_ratelimiting_event(subclass, vd)) + return (SET_ERROR(EBUSY)); + + if (!zfs_ereport_start(&ereport, &detector, subclass, spa, vd, + zb, zio, stateoroffset, size)) + return (SET_ERROR(EINVAL)); /* couldn't post event */ + + if (ereport == NULL) + return (SET_ERROR(EINVAL)); + + /* Cleanup is handled by the callback function */ + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); +#endif + return (rc); +} + +void +zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, + struct zio *zio, uint64_t offset, uint64_t length, void *arg, + zio_bad_cksum_t *info) +{ + zio_cksum_report_t *report; + +#ifdef _KERNEL + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) + return; +#endif + + report = kmem_zalloc(sizeof (*report), KM_SLEEP); + + if (zio->io_vsd != NULL) + zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); + else + zio_vsd_default_cksum_report(zio, report, arg); + + /* copy the checksum failure information if it was provided */ + if (info != NULL) { + report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); + bcopy(info, report->zcr_ckinfo, sizeof (*info)); + } + + report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_length = length; + +#ifdef _KERNEL + zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zb, zio, offset, length); + + if (report->zcr_ereport == NULL) { + zfs_ereport_free_checksum(report); + return; + } +#endif + + mutex_enter(&spa->spa_errlist_lock); + report->zcr_next = zio->io_logical->io_cksum_report; + zio->io_logical->io_cksum_report = report; + mutex_exit(&spa->spa_errlist_lock); +} + +void +zfs_ereport_finish_checksum(zio_cksum_report_t *report, const abd_t *good_data, + const abd_t *bad_data, boolean_t drop_if_identical) +{ +#ifdef _KERNEL + zfs_ecksum_info_t *info; + + info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, + good_data, bad_data, report->zcr_length, drop_if_identical); + if (info != NULL) + zfs_zevent_post(report->zcr_ereport, + report->zcr_detector, zfs_zevent_post_cb); + else + zfs_zevent_post_cb(report->zcr_ereport, report->zcr_detector); + + report->zcr_ereport = 
report->zcr_detector = NULL; + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + +void +zfs_ereport_free_checksum(zio_cksum_report_t *rpt) +{ +#ifdef _KERNEL + if (rpt->zcr_ereport != NULL) { + fm_nvlist_destroy(rpt->zcr_ereport, + FM_NVA_FREE); + fm_nvlist_destroy(rpt->zcr_detector, + FM_NVA_FREE); + } +#endif + rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); + + if (rpt->zcr_ckinfo != NULL) + kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); + + kmem_free(rpt, sizeof (*rpt)); +} + + +int +zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, const zbookmark_phys_t *zb, + struct zio *zio, uint64_t offset, uint64_t length, + const abd_t *good_data, const abd_t *bad_data, zio_bad_cksum_t *zbc) +{ + int rc = 0; +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + zfs_ecksum_info_t *info; + + if (zfs_is_ratelimiting_event(FM_EREPORT_ZFS_CHECKSUM, vd)) + return (EBUSY); + + if (!zfs_ereport_start(&ereport, &detector, FM_EREPORT_ZFS_CHECKSUM, + spa, vd, zb, zio, offset, length) || (ereport == NULL)) { + return (SET_ERROR(EINVAL)); + } + + info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, + B_FALSE); + + if (info != NULL) { + rc = zfs_zevent_post(ereport, detector, zfs_zevent_post_cb); + kmem_free(info, sizeof (*info)); + } +#endif + return (rc); +} + +/* + * The 'sysevent.fs.zfs.*' events are signals posted to notify user space of + * change in the pool. All sysevents are listed in sys/sysevent/eventdefs.h + * and are designed to be consumed by the ZFS Event Daemon (ZED). For + * additional details refer to the zed(8) man page. + */ +nvlist_t * +zfs_event_create(spa_t *spa, vdev_t *vd, const char *type, const char *name, + nvlist_t *aux) +{ + nvlist_t *resource = NULL; +#ifdef _KERNEL + char class[64]; + + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return (NULL); + + if ((resource = fm_nvlist_create(NULL)) == NULL) + return (NULL); + + (void) snprintf(class, sizeof (class), "%s.%s.%s", type, + ZFS_ERROR_CLASS, name); + VERIFY0(nvlist_add_uint8(resource, FM_VERSION, FM_RSRC_VERSION)); + VERIFY0(nvlist_add_string(resource, FM_CLASS, class)); + VERIFY0(nvlist_add_string(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL, spa_name(spa))); + VERIFY0(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, spa_guid(spa))); + VERIFY0(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_STATE, spa_state(spa))); + VERIFY0(nvlist_add_int32(resource, + FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, spa_load_state(spa))); + + if (vd) { + VERIFY0(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_GUID, vd->vdev_guid)); + VERIFY0(nvlist_add_uint64(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_STATE, vd->vdev_state)); + if (vd->vdev_path != NULL) + VERIFY0(nvlist_add_string(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, vd->vdev_path)); + if (vd->vdev_devid != NULL) + VERIFY0(nvlist_add_string(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, vd->vdev_devid)); + if (vd->vdev_fru != NULL) + VERIFY0(nvlist_add_string(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, vd->vdev_fru)); + if (vd->vdev_enc_sysfs_path != NULL) + VERIFY0(nvlist_add_string(resource, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, + vd->vdev_enc_sysfs_path)); + } + + /* also copy any optional payload data */ + if (aux) { + nvpair_t *elem = NULL; + + while ((elem = nvlist_next_nvpair(aux, elem)) != NULL) + (void) nvlist_add_nvpair(resource, elem); + } + +#endif + return (resource); +} + +static void +zfs_post_common(spa_t *spa, vdev_t *vd, const char *type, const char *name, + nvlist_t *aux) +{ 
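+	/*
+	 * Build the resource nvlist with zfs_event_create() and hand it
+	 * to zfs_zevent_post(); the zfs_zevent_post_cb() callback frees
+	 * the nvlist once it has been queued.  In user space this
+	 * function is a no-op.
+	 */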
+#ifdef _KERNEL + nvlist_t *resource; + + resource = zfs_event_create(spa, vd, type, name, aux); + if (resource) + zfs_zevent_post(resource, NULL, zfs_zevent_post_cb); +#endif +} + +/* + * The 'resource.fs.zfs.removed' event is an internal signal that the given vdev + * has been removed from the system. This will cause the DE to ignore any + * recent I/O errors, inferring that they are due to the asynchronous device + * removal. + */ +void +zfs_post_remove(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_REMOVED, NULL); +} + +/* + * The 'resource.fs.zfs.autoreplace' event is an internal signal that the pool + * has the 'autoreplace' property set, and therefore any broken vdevs will be + * handled by higher level logic, and no vdev fault should be generated. + */ +void +zfs_post_autoreplace(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_AUTOREPLACE, NULL); +} + +/* + * The 'resource.fs.zfs.statechange' event is an internal signal that the + * given vdev has transitioned its state to DEGRADED or HEALTHY. This will + * cause the retire agent to repair any outstanding fault management cases + * open because the device was not found (fault.fs.zfs.device). + */ +void +zfs_post_state_change(spa_t *spa, vdev_t *vd, uint64_t laststate) +{ +#ifdef _KERNEL + nvlist_t *aux; + + /* + * Add optional supplemental keys to payload + */ + aux = fm_nvlist_create(NULL); + if (vd && aux) { + if (vd->vdev_physpath) { + (void) nvlist_add_string(aux, + FM_EREPORT_PAYLOAD_ZFS_VDEV_PHYSPATH, + vd->vdev_physpath); + } + if (vd->vdev_enc_sysfs_path) { + (void) nvlist_add_string(aux, + FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH, + vd->vdev_enc_sysfs_path); + } + + (void) nvlist_add_uint64(aux, + FM_EREPORT_PAYLOAD_ZFS_VDEV_LASTSTATE, laststate); + } + + zfs_post_common(spa, vd, FM_RSRC_CLASS, FM_RESOURCE_STATECHANGE, + aux); + + if (aux) + fm_nvlist_destroy(aux, FM_NVA_FREE); +#endif +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_ereport_post); +EXPORT_SYMBOL(zfs_ereport_is_valid); +EXPORT_SYMBOL(zfs_ereport_post_checksum); +EXPORT_SYMBOL(zfs_post_remove); +EXPORT_SYMBOL(zfs_post_autoreplace); +EXPORT_SYMBOL(zfs_post_state_change); +#endif /* _KERNEL */ diff --git a/module/zfs/zfs_fuid.c b/module/zfs/zfs_fuid.c new file mode 100644 index 000000000000..015dde4811e4 --- /dev/null +++ b/module/zfs/zfs_fuid.c @@ -0,0 +1,815 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#ifdef _KERNEL +#include +#include +#include +#endif +#include + +/* + * FUID Domain table(s). 
+ * + * The FUID table is stored as a packed nvlist of an array + * of nvlists which contain an index, domain string and offset + * + * During file system initialization the nvlist(s) are read and + * two AVL trees are created. One tree is keyed by the index number + * and the other by the domain string. Nodes are never removed from + * trees, but new entries may be added. If a new entry is added then + * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then + * be responsible for calling zfs_fuid_sync() to sync the changes to disk. + * + */ + +#define FUID_IDX "fuid_idx" +#define FUID_DOMAIN "fuid_domain" +#define FUID_OFFSET "fuid_offset" +#define FUID_NVP_ARRAY "fuid_nvlist" + +typedef struct fuid_domain { + avl_node_t f_domnode; + avl_node_t f_idxnode; + ksiddomain_t *f_ksid; + uint64_t f_idx; +} fuid_domain_t; + +static char *nulldomain = ""; + +/* + * Compare two indexes. + */ +static int +idx_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; + + return (TREE_CMP(node1->f_idx, node2->f_idx)); +} + +/* + * Compare two domain strings. + */ +static int +domain_compare(const void *arg1, const void *arg2) +{ + const fuid_domain_t *node1 = (const fuid_domain_t *)arg1; + const fuid_domain_t *node2 = (const fuid_domain_t *)arg2; + int val; + + val = strcmp(node1->f_ksid->kd_name, node2->f_ksid->kd_name); + + return (TREE_ISIGN(val)); +} + +void +zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); +} + +/* + * load initial fuid domain and idx trees. This function is used by + * both the kernel and zdb. 
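+ *
+ * On disk the table is an XDR-packed nvlist whose size is kept in the
+ * object's bonus buffer; each FUID_NVP_ARRAY element pairs a domain
+ * string with its index, e.g. (illustrative values only):
+ *
+ *	{ fuid_idx = 1, fuid_domain = "S-1-5-21-...", fuid_offset = 0 }
+ *
+ * and becomes one node inserted into both AVL trees.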
+ */ +uint64_t +zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, + avl_tree_t *domain_tree) +{ + dmu_buf_t *db; + uint64_t fuid_size; + + ASSERT(fuid_obj != 0); + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, + FTAG, &db)); + fuid_size = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + if (fuid_size) { + nvlist_t **fuidnvp; + nvlist_t *nvp = NULL; + uint_t count; + char *packed; + int i; + + packed = kmem_alloc(fuid_size, KM_SLEEP); + VERIFY(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH) == 0); + VERIFY(nvlist_unpack(packed, fuid_size, + &nvp, 0) == 0); + VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, + &fuidnvp, &count) == 0); + + for (i = 0; i != count; i++) { + fuid_domain_t *domnode; + char *domain; + uint64_t idx; + + VERIFY(nvlist_lookup_string(fuidnvp[i], FUID_DOMAIN, + &domain) == 0); + VERIFY(nvlist_lookup_uint64(fuidnvp[i], FUID_IDX, + &idx) == 0); + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + + domnode->f_idx = idx; + domnode->f_ksid = ksid_lookupdomain(domain); + avl_add(idx_tree, domnode); + avl_add(domain_tree, domnode); + } + nvlist_free(nvp); + kmem_free(packed, fuid_size); + } + return (fuid_size); +} + +void +zfs_fuid_table_destroy(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + fuid_domain_t *domnode; + void *cookie; + + cookie = NULL; + while ((domnode = avl_destroy_nodes(domain_tree, &cookie))) + ksiddomain_rele(domnode->f_ksid); + + avl_destroy(domain_tree); + cookie = NULL; + while ((domnode = avl_destroy_nodes(idx_tree, &cookie))) + kmem_free(domnode, sizeof (fuid_domain_t)); + avl_destroy(idx_tree); +} + +char * +zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + + searchnode.f_idx = idx; + + findnode = avl_find(idx_tree, &searchnode, &loc); + + return (findnode ? findnode->f_ksid->kd_name : nulldomain); +} + +#ifdef _KERNEL +/* + * Load the fuid table(s) into memory. + */ +static void +zfs_fuid_init(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + if (zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + + zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + + (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); + if (zfsvfs->z_fuid_obj != 0) { + zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, + zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, + &zfsvfs->z_fuid_domain); + } + + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * sync out AVL trees to persistent storage. + */ +void +zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + nvlist_t *nvp; + nvlist_t **fuids; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + fuid_domain_t *domnode; + int numnodes; + int i; + + if (!zfsvfs->z_fuid_dirty) { + return; + } + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + /* + * First see if table needs to be created? 
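+	 * If not, allocate a DMU_OT_FUID object and record its number in
+	 * the master node ZAP under ZFS_FUID_TABLES so that later mounts
+	 * can find it.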
+ */ + if (zfsvfs->z_fuid_obj == 0) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); + fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); + for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { + VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, numnodes) == 0); + for (i = 0; i != numnodes; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, numnodes * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + zfsvfs->z_fuid_dirty = B_FALSE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Query domain table for a given domain. + * + * If domain isn't found and addok is set, it is added to AVL trees and + * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be + * necessary for the caller or another thread to detect the dirty table + * and sync out the changes. + */ +int +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, + char **retdomain, boolean_t addok) +{ + fuid_domain_t searchnode, *findnode; + avl_index_t loc; + krw_t rw = RW_READER; + + /* + * If the dummy "nobody" domain then return an index of 0 + * to cause the created FUID to be a standard POSIX id + * for the user nobody. + */ + if (domain[0] == '\0') { + if (retdomain) + *retdomain = nulldomain; + return (0); + } + + searchnode.f_ksid = ksid_lookupdomain(domain); + if (retdomain) + *retdomain = searchnode.f_ksid->kd_name; + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs); + +retry: + rw_enter(&zfsvfs->z_fuid_lock, rw); + findnode = avl_find(&zfsvfs->z_fuid_domain, &searchnode, &loc); + + if (findnode) { + rw_exit(&zfsvfs->z_fuid_lock); + ksiddomain_rele(searchnode.f_ksid); + return (findnode->f_idx); + } else if (addok) { + fuid_domain_t *domnode; + uint64_t retidx; + + if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { + rw_exit(&zfsvfs->z_fuid_lock); + rw = RW_WRITER; + goto retry; + } + + domnode = kmem_alloc(sizeof (fuid_domain_t), KM_SLEEP); + domnode->f_ksid = searchnode.f_ksid; + + retidx = domnode->f_idx = avl_numnodes(&zfsvfs->z_fuid_idx) + 1; + + avl_add(&zfsvfs->z_fuid_domain, domnode); + avl_add(&zfsvfs->z_fuid_idx, domnode); + zfsvfs->z_fuid_dirty = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); + return (retidx); + } else { + rw_exit(&zfsvfs->z_fuid_lock); + return (-1); + } +} + +/* + * Query domain table by index, returning domain string + * + * Returns a pointer from an avl node of the domain string. 
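+ * The pointer remains usable after the fuid lock is dropped, since
+ * domain nodes are never removed from the AVL trees once added.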
+ * + */ +const char * +zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) +{ + char *domain; + + if (idx == 0 || !zfsvfs->z_use_fuids) + return (NULL); + + if (!zfsvfs->z_fuid_loaded) + zfs_fuid_init(zfsvfs); + + rw_enter(&zfsvfs->z_fuid_lock, RW_READER); + + if (zfsvfs->z_fuid_obj || zfsvfs->z_fuid_dirty) + domain = zfs_fuid_idx_domain(&zfsvfs->z_fuid_idx, idx); + else + domain = nulldomain; + rw_exit(&zfsvfs->z_fuid_lock); + + ASSERT(domain); + return (domain); +} + +void +zfs_fuid_map_ids(znode_t *zp, cred_t *cr, uid_t *uidp, uid_t *gidp) +{ + *uidp = zfs_fuid_map_id(ZTOZSB(zp), KUID_TO_SUID(ZTOUID(zp)), + cr, ZFS_OWNER); + *gidp = zfs_fuid_map_id(ZTOZSB(zp), KGID_TO_SGID(ZTOGID(zp)), + cr, ZFS_GROUP); +} + +#ifdef __FreeBSD__ +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + + if (index == 0) + return (fuid); + + return (UID_NOBODY); +} +#elif defined(__linux__) +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + /* + * The Linux port only supports POSIX IDs, use the passed id. + */ + return (fuid); +} + +#else +uid_t +zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, + cred_t *cr, zfs_fuid_type_t type) +{ + uint32_t index = FUID_INDEX(fuid); + const char *domain; + uid_t id; + + if (index == 0) + return (fuid); + + domain = zfs_fuid_find_by_idx(zfsvfs, index); + ASSERT(domain != NULL); + + if (type == ZFS_OWNER || type == ZFS_ACE_USER) { + (void) kidmap_getuidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } else { + (void) kidmap_getgidbysid(crgetzone(cr), domain, + FUID_RID(fuid), &id); + } + return (id); +} +#endif + +/* + * Add a FUID node to the list of fuid's being created for this + * ACL + * + * If ACL has multiple domains, then keep only one copy of each unique + * domain. + */ +void +zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, + uint64_t idx, uint64_t id, zfs_fuid_type_t type) +{ + zfs_fuid_t *fuid; + zfs_fuid_domain_t *fuid_domain; + zfs_fuid_info_t *fuidp; + uint64_t fuididx; + boolean_t found = B_FALSE; + + if (*fuidpp == NULL) + *fuidpp = zfs_fuid_info_alloc(); + + fuidp = *fuidpp; + /* + * First find fuid domain index in linked list + * + * If one isn't found then create an entry. + */ + + for (fuididx = 1, fuid_domain = list_head(&fuidp->z_domains); + fuid_domain; fuid_domain = list_next(&fuidp->z_domains, + fuid_domain), fuididx++) { + if (idx == fuid_domain->z_domidx) { + found = B_TRUE; + break; + } + } + + if (!found) { + fuid_domain = kmem_alloc(sizeof (zfs_fuid_domain_t), KM_SLEEP); + fuid_domain->z_domain = domain; + fuid_domain->z_domidx = idx; + list_insert_tail(&fuidp->z_domains, fuid_domain); + fuidp->z_domain_str_sz += strlen(domain) + 1; + fuidp->z_domain_cnt++; + } + + if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + + /* + * Now allocate fuid entry and add it on the end of the list + */ + + fuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + fuid->z_id = id; + fuid->z_domidx = idx; + fuid->z_logfuid = FUID_ENCODE(fuididx, rid); + + list_insert_tail(&fuidp->z_fuids, fuid); + fuidp->z_fuid_cnt++; + } else { + if (type == ZFS_OWNER) + fuidp->z_fuid_owner = FUID_ENCODE(fuididx, rid); + else + fuidp->z_fuid_group = FUID_ENCODE(fuididx, rid); + } +} + +#ifdef HAVE_KSID +/* + * Create a file system FUID, based on information in the users cred + * + * If cred contains KSID_OWNER then it should be used to determine + * the uid otherwise cred's uid will be used. 
By default cred's gid + * is used unless it's an ephemeral ID in which case KSID_GROUP will + * be used if it exists. + */ +uint64_t +zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, + cred_t *cr, zfs_fuid_info_t **fuidp) +{ + uint64_t idx; + ksid_t *ksid; + uint32_t rid; + char *kdomain; + const char *domain; + uid_t id; + + VERIFY(type == ZFS_OWNER || type == ZFS_GROUP); + + ksid = crgetsid(cr, (type == ZFS_OWNER) ? KSID_OWNER : KSID_GROUP); + + if (!zfsvfs->z_use_fuids || (ksid == NULL)) { + id = (type == ZFS_OWNER) ? crgetuid(cr) : crgetgid(cr); + + if (IS_EPHEMERAL(id)) + return ((type == ZFS_OWNER) ? UID_NOBODY : GID_NOBODY); + + return ((uint64_t)id); + } + + /* + * ksid is present and FUID is supported + */ + id = (type == ZFS_OWNER) ? ksid_getid(ksid) : crgetgid(cr); + + if (!IS_EPHEMERAL(id)) + return ((uint64_t)id); + + if (type == ZFS_GROUP) + id = ksid_getid(ksid); + + rid = ksid_getrid(ksid); + domain = ksid_getdomain(ksid); + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); + + zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); + + return (FUID_ENCODE(idx, rid)); +} +#endif /* HAVE_KSID */ + +/* + * Create a file system FUID for an ACL ace + * or a chown/chgrp of the file. + * This is similar to zfs_fuid_create_cred, except that + * we can't find the domain + rid information in the + * cred. Instead we have to query Winchester for the + * domain and rid. + * + * During replay operations the domain+rid information is + * found in the zfs_fuid_info_t that the replay code has + * attached to the zfsvfs of the file system. + */ +uint64_t +zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, + zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) +{ +#ifdef HAVE_KSID + const char *domain; + char *kdomain; + uint32_t fuid_idx = FUID_INDEX(id); + uint32_t rid = 0; + idmap_stat status; + uint64_t idx = UID_NOBODY; + zfs_fuid_t *zfuid = NULL; + zfs_fuid_info_t *fuidp = NULL; + + /* + * If POSIX ID, or entry is already a FUID then + * just return the id + * + * We may also be handed an already FUID'ized id via + * chmod. + */ + + if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) + return (id); + + if (zfsvfs->z_replay) { + fuidp = zfsvfs->z_fuid_replay; + + /* + * If we are passed an ephemeral id, but no + * fuid_info was logged then return NOBODY. + * This is most likely a result of idmap service + * not being available. + */ + if (fuidp == NULL) + return (UID_NOBODY); + + VERIFY3U(type, >=, ZFS_OWNER); + VERIFY3U(type, <=, ZFS_ACE_GROUP); + + switch (type) { + case ZFS_ACE_USER: + case ZFS_ACE_GROUP: + zfuid = list_head(&fuidp->z_fuids); + rid = FUID_RID(zfuid->z_logfuid); + idx = FUID_INDEX(zfuid->z_logfuid); + break; + case ZFS_OWNER: + rid = FUID_RID(fuidp->z_fuid_owner); + idx = FUID_INDEX(fuidp->z_fuid_owner); + break; + case ZFS_GROUP: + rid = FUID_RID(fuidp->z_fuid_group); + idx = FUID_INDEX(fuidp->z_fuid_group); + break; + }; + domain = fuidp->z_domain_table[idx - 1]; + } else { + if (type == ZFS_OWNER || type == ZFS_ACE_USER) + status = kidmap_getsidbyuid(crgetzone(cr), id, + &domain, &rid); + else + status = kidmap_getsidbygid(crgetzone(cr), id, + &domain, &rid); + + if (status != 0) { + /* + * When returning nobody we will need to + * make a dummy fuid table entry for logging + * purposes. 
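+			 * The dummy entry pairs rid UID_NOBODY with the
+			 * empty domain, so replaying the log record
+			 * resolves to "nobody" as well.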
+ */ + rid = UID_NOBODY; + domain = nulldomain; + } + } + + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); + + if (!zfsvfs->z_replay) + zfs_fuid_node_add(fuidpp, kdomain, + rid, idx, id, type); + else if (zfuid != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + return (FUID_ENCODE(idx, rid)); +#else + /* + * The Linux port only supports POSIX IDs, use the passed id. + */ + return (id); +#endif +} + +void +zfs_fuid_destroy(zfsvfs_t *zfsvfs) +{ + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + if (!zfsvfs->z_fuid_loaded) { + rw_exit(&zfsvfs->z_fuid_lock); + return; + } + zfs_fuid_table_destroy(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * Allocate zfs_fuid_info for tracking FUIDs created during + * zfs_mknode, VOP_SETATTR() or VOP_SETSECATTR() + */ +zfs_fuid_info_t * +zfs_fuid_info_alloc(void) +{ + zfs_fuid_info_t *fuidp; + + fuidp = kmem_zalloc(sizeof (zfs_fuid_info_t), KM_SLEEP); + list_create(&fuidp->z_domains, sizeof (zfs_fuid_domain_t), + offsetof(zfs_fuid_domain_t, z_next)); + list_create(&fuidp->z_fuids, sizeof (zfs_fuid_t), + offsetof(zfs_fuid_t, z_next)); + return (fuidp); +} + +/* + * Release all memory associated with zfs_fuid_info_t + */ +void +zfs_fuid_info_free(zfs_fuid_info_t *fuidp) +{ + zfs_fuid_t *zfuid; + zfs_fuid_domain_t *zdomain; + + while ((zfuid = list_head(&fuidp->z_fuids)) != NULL) { + list_remove(&fuidp->z_fuids, zfuid); + kmem_free(zfuid, sizeof (zfs_fuid_t)); + } + + if (fuidp->z_domain_table != NULL) + kmem_free(fuidp->z_domain_table, + (sizeof (char *)) * fuidp->z_domain_cnt); + + while ((zdomain = list_head(&fuidp->z_domains)) != NULL) { + list_remove(&fuidp->z_domains, zdomain); + kmem_free(zdomain, sizeof (zfs_fuid_domain_t)); + } + + kmem_free(fuidp, sizeof (zfs_fuid_info_t)); +} + +/* + * Check to see if id is a groupmember. If cred + * has ksid info then sidlist is checked first + * and if still not found then POSIX groups are checked + * + * Will use a straight FUID compare when possible. 
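+ *
+ * For an ephemeral id this reduces to comparing the FUID's index and
+ * rid against each entry in the cred's ksid list, with no idmap
+ * round trip needed.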
+ */ +boolean_t +zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) +{ +#ifdef HAVE_KSID + uid_t gid; + +#ifdef illumos + ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); + + if (ksid && ksidlist) { + int i; + ksid_t *ksid_groups; + uint32_t idx = FUID_INDEX(id); + uint32_t rid = FUID_RID(id); + + ksid_groups = ksidlist->ksl_sids; + + for (i = 0; i != ksidlist->ksl_nsid; i++) { + if (idx == 0) { + if (id != IDMAP_WK_CREATOR_GROUP_GID && + id == ksid_groups[i].ks_id) { + return (B_TRUE); + } + } else { + const char *domain; + + domain = zfs_fuid_find_by_idx(zfsvfs, idx); + ASSERT(domain != NULL); + + if (strcmp(domain, + IDMAP_WK_CREATOR_SID_AUTHORITY) == 0) + return (B_FALSE); + + if ((strcmp(domain, + ksid_groups[i].ks_domain->kd_name) == 0) && + rid == ksid_groups[i].ks_rid) + return (B_TRUE); + } + } + } +#endif /* illumos */ + + /* + * Not found in ksidlist, check posix groups + */ + gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); + return (groupmember(gid, cr)); +#else + return (B_TRUE); +#endif +} + +void +zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +int +zfs_id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, size_t len, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, domain, NULL, addok); + if (domainid == -1) + return (SET_ERROR(ENOENT)); + } + fuid = FUID_ENCODE(domainid, rid); + (void) snprintf(buf, len, "%llx", (longlong_t)fuid); + return (0); +} +#endif diff --git a/module/zfs/zfs_ioctl.c b/module/zfs/zfs_ioctl.c new file mode 100644 index 000000000000..7f623bb046ea --- /dev/null +++ b/module/zfs/zfs_ioctl.c @@ -0,0 +1,7629 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Portions Copyright 2011 Martin Matuska + * Copyright 2015, OmniTI Computer Consulting, Inc. All rights reserved. + * Portions Copyright 2012 Pawel Jakub Dawidek + * Copyright (c) 2014, 2016 Joyent, Inc. All rights reserved. + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2014, Joyent, Inc. All rights reserved. + * Copyright (c) 2011, 2020 by Delphix. All rights reserved. + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. 
+ * Copyright (c) 2013 Steven Hartland. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Toomas Soome + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2018, loli10K . All rights reserved. + * Copyright 2017 RackTop Systems. + * Copyright (c) 2017 Open-E, Inc. All Rights Reserved. + * Copyright (c) 2019 Datto Inc. + * Copyright (c) 2019, 2020 by Christian Schwarz. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +/* + * ZFS ioctls. + * + * This file handles the ioctls to /dev/zfs, used for configuring ZFS storage + * pools and filesystems, e.g. with /sbin/zfs and /sbin/zpool. + * + * There are two ways that we handle ioctls: the legacy way where almost + * all of the logic is in the ioctl callback, and the new way where most + * of the marshalling is handled in the common entry point, zfsdev_ioctl(). + * + * Non-legacy ioctls should be registered by calling + * zfs_ioctl_register() from zfs_ioctl_init(). The ioctl is invoked + * from userland by lzc_ioctl(). + * + * The registration arguments are as follows: + * + * const char *name + * The name of the ioctl. This is used for history logging. If the + * ioctl returns successfully (the callback returns 0), and allow_log + * is true, then a history log entry will be recorded with the input & + * output nvlists. The log entry can be printed with "zpool history -i". + * + * zfs_ioc_t ioc + * The ioctl request number, which userland will pass to ioctl(2). + * We want newer versions of libzfs and libzfs_core to run against + * existing zfs kernel modules (i.e. a deferred reboot after an update). + * Therefore the ioctl numbers cannot change from release to release. + * + * zfs_secpolicy_func_t *secpolicy + * This function will be called before the zfs_ioc_func_t, to + * determine if this operation is permitted. It should return EPERM + * on failure, and 0 on success. Checks include determining if the + * dataset is visible in this zone, and if the user has either all + * zfs privileges in the zone (SYS_MOUNT), or has been granted permission + * to do this operation on this dataset with "zfs allow". + * + * zfs_ioc_namecheck_t namecheck + * This specifies what to expect in the zfs_cmd_t:zc_name -- a pool + * name, a dataset name, or nothing. If the name is not well-formed, + * the ioctl will fail and the callback will not be called. + * Therefore, the callback can assume that the name is well-formed + * (e.g. is null-terminated, doesn't have more than one '@' character, + * doesn't have invalid characters). + * + * zfs_ioc_poolcheck_t pool_check + * This specifies requirements on the pool state. If the pool does + * not meet them (is suspended or is readonly), the ioctl will fail + * and the callback will not be called. If any checks are specified + * (i.e. it is not POOL_CHECK_NONE), namecheck must not be NO_NAME. + * Multiple checks can be or-ed together (e.g. POOL_CHECK_SUSPENDED | + * POOL_CHECK_READONLY). + * + * zfs_ioc_key_t *nvl_keys + * The list of expected/allowable innvl input keys. This list is used + * to validate the nvlist input to the ioctl. + * + * boolean_t smush_outnvlist + * If smush_outnvlist is true, then the output is presumed to be a + * list of errors, and it will be "smushed" down to fit into the + * caller's buffer, by removing some entries and replacing them with a + * single "N_MORE_ERRORS" entry indicating how many were removed. See + * nvlist_smush() for details. 
If smush_outnvlist is false, and the + * outnvlist does not fit into the userland-provided buffer, then the + * ioctl will fail with ENOMEM. + * + * zfs_ioc_func_t *func + * The callback function that will perform the operation. + * + * The callback should return 0 on success, or an error number on + * failure. If the function fails, the userland ioctl will return -1, + * and errno will be set to the callback's return value. The callback + * will be called with the following arguments: + * + * const char *name + * The name of the pool or dataset to operate on, from + * zfs_cmd_t:zc_name. The 'namecheck' argument specifies the + * expected type (pool, dataset, or none). + * + * nvlist_t *innvl + * The input nvlist, deserialized from zfs_cmd_t:zc_nvlist_src. Or + * NULL if no input nvlist was provided. Changes to this nvlist are + * ignored. If the input nvlist could not be deserialized, the + * ioctl will fail and the callback will not be called. + * + * nvlist_t *outnvl + * The output nvlist, initially empty. The callback can fill it in, + * and it will be returned to userland by serializing it into + * zfs_cmd_t:zc_nvlist_dst. If it is non-empty, and serialization + * fails (e.g. because the caller didn't supply a large enough + * buffer), then the overall ioctl will fail. See the + * 'smush_nvlist' argument above for additional behaviors. + * + * There are two typical uses of the output nvlist: + * - To return state, e.g. property values. In this case, + * smush_outnvlist should be false. If the buffer was not large + * enough, the caller will reallocate a larger buffer and try + * the ioctl again. + * + * - To return multiple errors from an ioctl which makes on-disk + * changes. In this case, smush_outnvlist should be true. + * Ioctls which make on-disk modifications should generally not + * use the outnvl if they succeed, because the caller can not + * distinguish between the operation failing, and + * deserialization failing. + * + * IOCTL Interface Errors + * + * The following ioctl input errors can be returned: + * ZFS_ERR_IOC_CMD_UNAVAIL the ioctl number is not supported by kernel + * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel + * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing + * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "zfs_namecheck.h" +#include "zfs_prop.h" +#include "zfs_deleg.h" +#include "zfs_comutil.h" + +#include +#include +#include + +kmutex_t zfsdev_state_lock; +zfsdev_state_t *zfsdev_state_list; + +/* + * Limit maximum nvlist size. We don't want users passing in insane values + * for zc->zc_nvlist_src_size, since we will need to allocate that much memory. + * Defaults to 0=auto which is handled by platform code. + */ +unsigned long zfs_max_nvlist_src_size = 0; + +uint_t zfs_fsyncer_key; +uint_t zfs_allow_log_key; + +/* DATA_TYPE_ANY is used when zkey_type can vary. 
*/ +#define DATA_TYPE_ANY DATA_TYPE_UNKNOWN + +typedef struct zfs_ioc_vec { + zfs_ioc_legacy_func_t *zvec_legacy_func; + zfs_ioc_func_t *zvec_func; + zfs_secpolicy_func_t *zvec_secpolicy; + zfs_ioc_namecheck_t zvec_namecheck; + boolean_t zvec_allow_log; + zfs_ioc_poolcheck_t zvec_pool_check; + boolean_t zvec_smush_outnvlist; + const char *zvec_name; + const zfs_ioc_key_t *zvec_nvl_keys; + size_t zvec_nvl_key_count; +} zfs_ioc_vec_t; + +/* This array is indexed by zfs_userquota_prop_t */ +static const char *userquota_perms[] = { + ZFS_DELEG_PERM_USERUSED, + ZFS_DELEG_PERM_USERQUOTA, + ZFS_DELEG_PERM_GROUPUSED, + ZFS_DELEG_PERM_GROUPQUOTA, + ZFS_DELEG_PERM_USEROBJUSED, + ZFS_DELEG_PERM_USEROBJQUOTA, + ZFS_DELEG_PERM_GROUPOBJUSED, + ZFS_DELEG_PERM_GROUPOBJQUOTA, + ZFS_DELEG_PERM_PROJECTUSED, + ZFS_DELEG_PERM_PROJECTQUOTA, + ZFS_DELEG_PERM_PROJECTOBJUSED, + ZFS_DELEG_PERM_PROJECTOBJQUOTA, +}; + +static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); +static int zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc); +static int zfs_check_settable(const char *name, nvpair_t *property, + cred_t *cr); +static int zfs_check_clearable(char *dataset, nvlist_t *props, + nvlist_t **errors); +static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, + boolean_t *); +int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t *); +static int get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp); + +static void +history_str_free(char *buf) +{ + kmem_free(buf, HIS_MAX_RECORD_LEN); +} + +static char * +history_str_get(zfs_cmd_t *zc) +{ + char *buf; + + if (zc->zc_history == 0) + return (NULL); + + buf = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); + if (copyinstr((void *)(uintptr_t)zc->zc_history, + buf, HIS_MAX_RECORD_LEN, NULL) != 0) { + history_str_free(buf); + return (NULL); + } + + buf[HIS_MAX_RECORD_LEN -1] = '\0'; + + return (buf); +} + +/* + * Return non-zero if the spa version is less than requested version. + */ +static int +zfs_earlier_version(const char *name, int version) +{ + spa_t *spa; + + if (spa_open(name, &spa, FTAG) == 0) { + if (spa_version(spa) < version) { + spa_close(spa, FTAG); + return (1); + } + spa_close(spa, FTAG); + } + return (0); +} + +/* + * Return TRUE if the ZPL version is less than requested version. + */ +static boolean_t +zpl_earlier_version(const char *name, int version) +{ + objset_t *os; + boolean_t rc = B_TRUE; + + if (dmu_objset_hold(name, FTAG, &os) == 0) { + uint64_t zplversion; + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (B_TRUE); + } + /* XXX reading from non-owned objset */ + if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) + rc = zplversion < version; + dmu_objset_rele(os, FTAG); + } + return (rc); +} + +static void +zfs_log_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *buf; + + if ((buf = history_str_get(zc)) == NULL) + return; + + if (spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (spa_version(spa) >= SPA_VERSION_ZPOOL_HISTORY) + (void) spa_history_log(spa, buf); + spa_close(spa, FTAG); + } + history_str_free(buf); +} + +/* + * Policy for top-level read operations (list pools). Requires no privileges, + * and can be used in the local zone, as there is no associated dataset. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_none(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (0); +} + +/* + * Policy for dataset read operations (list children, get statistics). Requires + * no privileges, but must be visible in the local zone. 
+ */ +/* ARGSUSED */ +static int +zfs_secpolicy_read(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + if (INGLOBALZONE(curproc) || + zone_dataset_visible(zc->zc_name, NULL)) + return (0); + + return (SET_ERROR(ENOENT)); +} + +static int +zfs_dozonecheck_impl(const char *dataset, uint64_t zoned, cred_t *cr) +{ + int writable = 1; + + /* + * The dataset must be visible by this zone -- check this first + * so they don't see EPERM on something they shouldn't know about. + */ + if (!INGLOBALZONE(curproc) && + !zone_dataset_visible(dataset, &writable)) + return (SET_ERROR(ENOENT)); + + if (INGLOBALZONE(curproc)) { + /* + * If the fs is zoned, only root can access it from the + * global zone. + */ + if (secpolicy_zfs(cr) && zoned) + return (SET_ERROR(EPERM)); + } else { + /* + * If we are in a local zone, the 'zoned' property must be set. + */ + if (!zoned) + return (SET_ERROR(EPERM)); + + /* must be writable by this zone */ + if (!writable) + return (SET_ERROR(EPERM)); + } + return (0); +} + +static int +zfs_dozonecheck(const char *dataset, cred_t *cr) +{ + uint64_t zoned; + + if (dsl_prop_get_integer(dataset, zfs_prop_to_name(ZFS_PROP_ZONED), + &zoned, NULL)) + return (SET_ERROR(ENOENT)); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_dozonecheck_ds(const char *dataset, dsl_dataset_t *ds, cred_t *cr) +{ + uint64_t zoned; + + if (dsl_prop_get_int_ds(ds, zfs_prop_to_name(ZFS_PROP_ZONED), &zoned)) + return (SET_ERROR(ENOENT)); + + return (zfs_dozonecheck_impl(dataset, zoned, cr)); +} + +static int +zfs_secpolicy_write_perms_ds(const char *name, dsl_dataset_t *ds, + const char *perm, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck_ds(name, ds, cr); + if (error == 0) { + error = secpolicy_zfs(cr); + if (error != 0) + error = dsl_deleg_access_impl(ds, perm, cr); + } + return (error); +} + +static int +zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) +{ + int error; + dsl_dataset_t *ds; + dsl_pool_t *dp; + + /* + * First do a quick check for root in the global zone, which + * is allowed to do all write_perms. This ensures that zfs_ioc_* + * will get to handle nonexistent datasets. + */ + if (INGLOBALZONE(curproc) && secpolicy_zfs(cr) == 0) + return (0); + + error = dsl_pool_hold(name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = zfs_secpolicy_write_perms_ds(name, ds, perm, cr); + + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); +} + +/* + * Policy for setting the security label property. + * + * Returns 0 for success, non-zero for access and other errors. + */ +static int +zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) +{ +#ifdef HAVE_MLSLABEL + char ds_hexsl[MAXNAMELEN]; + bslabel_t ds_sl, new_sl; + boolean_t new_default = FALSE; + uint64_t zoned; + int needed_priv = -1; + int error; + + /* First get the existing dataset label. */ + error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error != 0) + return (SET_ERROR(EPERM)); + + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + new_default = TRUE; + + /* The label must be translatable */ + if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) + return (SET_ERROR(EINVAL)); + + /* + * In a non-global zone, disallow attempts to set a label that + * doesn't match that of the zone; otherwise no other checks + * are needed. 
+	 */
+	if (!INGLOBALZONE(curproc)) {
+		if (new_default || !blequal(&new_sl, CR_SL(CRED())))
+			return (SET_ERROR(EPERM));
+		return (0);
+	}
+
+	/*
+	 * For global-zone datasets (i.e., those whose zoned property is
+	 * "off"), verify that the specified new label is valid for the
+	 * global zone.
+	 */
+	if (dsl_prop_get_integer(name,
+	    zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL))
+		return (SET_ERROR(EPERM));
+	if (!zoned) {
+		if (zfs_check_global_label(name, strval) != 0)
+			return (SET_ERROR(EPERM));
+	}
+
+	/*
+	 * If the existing dataset label is nondefault, check if the
+	 * dataset is mounted (label cannot be changed while mounted).
+	 * Get the zfsvfs_t; if there isn't one, then the dataset isn't
+	 * mounted (or isn't a dataset, doesn't exist, ...).
+	 */
+	if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) {
+		objset_t *os;
+		static char *setsl_tag = "setsl_tag";
+
+		/*
+		 * Try to own the dataset; abort if there is any error
+		 * (e.g., already mounted, in use, or other error).
+		 */
+		error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, B_TRUE,
+		    setsl_tag, &os);
+		if (error != 0)
+			return (SET_ERROR(EPERM));
+
+		dmu_objset_disown(os, B_TRUE, setsl_tag);
+
+		if (new_default) {
+			needed_priv = PRIV_FILE_DOWNGRADE_SL;
+			goto out_check;
+		}
+
+		if (hexstr_to_label(strval, &new_sl) != 0)
+			return (SET_ERROR(EPERM));
+
+		if (blstrictdom(&ds_sl, &new_sl))
+			needed_priv = PRIV_FILE_DOWNGRADE_SL;
+		else if (blstrictdom(&new_sl, &ds_sl))
+			needed_priv = PRIV_FILE_UPGRADE_SL;
+	} else {
+		/* dataset currently has a default label */
+		if (!new_default)
+			needed_priv = PRIV_FILE_UPGRADE_SL;
+	}
+
+out_check:
+	if (needed_priv != -1)
+		return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL));
+	return (0);
+#else
+	return (SET_ERROR(ENOTSUP));
+#endif /* HAVE_MLSLABEL */
+}
+
+static int
+zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval,
+    cred_t *cr)
+{
+	char *strval;
+
+	/*
+	 * Check permissions for special properties.
+	 */
+	switch (prop) {
+	default:
+		break;
+	case ZFS_PROP_ZONED:
+		/*
+		 * Disallow setting of 'zoned' from within a local zone.
+		 */
+		if (!INGLOBALZONE(curproc))
+			return (SET_ERROR(EPERM));
+		break;
+
+	case ZFS_PROP_QUOTA:
+	case ZFS_PROP_FILESYSTEM_LIMIT:
+	case ZFS_PROP_SNAPSHOT_LIMIT:
+		if (!INGLOBALZONE(curproc)) {
+			uint64_t zoned;
+			char setpoint[ZFS_MAX_DATASET_NAME_LEN];
+			/*
+			 * Unprivileged users are allowed to modify the
+			 * limit on things *under* (i.e. contained by)
+			 * the thing they own.
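+			 *
+			 * For example (a hypothetical layout, for
+			 * illustration only): if "zoned" is set at
+			 * pool/zone1, a process in that zone may set
+			 * snapshot_limit on pool/zone1/data but not on
+			 * pool/zone1 itself, since the target dataset
+			 * must lie strictly below the setpoint.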
+ */ + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, setpoint)) + return (SET_ERROR(EPERM)); + if (!zoned || strlen(dsname) <= strlen(setpoint)) + return (SET_ERROR(EPERM)); + } + break; + + case ZFS_PROP_MLSLABEL: + if (!is_system_labeled()) + return (SET_ERROR(EPERM)); + + if (nvpair_value_string(propval, &strval) == 0) { + int err; + + err = zfs_set_slabel_policy(dsname, strval, CRED()); + if (err != 0) + return (err); + } + break; + } + + return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_set_fsacl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error; + + error = zfs_dozonecheck(zc->zc_name, cr); + if (error != 0) + return (error); + + /* + * permission to set permissions will be evaluated later in + * dsl_deleg_can_allow() + */ + return (0); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_rollback(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_send(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + char *cp; + int error; + + /* + * Generate the current snapshot name from the given objsetid, then + * use that name for the secpolicy/zone checks. + */ + cp = strchr(zc->zc_name, '@'); + if (cp == NULL) + return (SET_ERROR(EINVAL)); + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dsl_dataset_name(ds, zc->zc_name); + + error = zfs_secpolicy_write_perms_ds(zc->zc_name, ds, + ZFS_DELEG_PERM_SEND, cr); + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_send_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_SEND, cr)); +} + +static int +zfs_secpolicy_share(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (SET_ERROR(ENOTSUP)); +} + +static int +zfs_secpolicy_smb_acl(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (SET_ERROR(ENOTSUP)); +} + +static int +zfs_get_parent(const char *datasetname, char *parent, int parentsize) +{ + char *cp; + + /* + * Remove the @bla or /bla from the end of the name to get the parent. + */ + (void) strncpy(parent, datasetname, parentsize); + cp = strrchr(parent, '@'); + if (cp != NULL) { + cp[0] = '\0'; + } else { + cp = strrchr(parent, '/'); + if (cp == NULL) + return (SET_ERROR(ENOENT)); + cp[0] = '\0'; + } + + return (0); +} + +int +zfs_secpolicy_destroy_perms(const char *name, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(name, ZFS_DELEG_PERM_DESTROY, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_destroy(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); +} + +/* + * Destroying snapshots with delegated permissions requires + * descendant mount and destroy permissions. 
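+ *
+ * For illustration only (hypothetical dataset and user names), a
+ * delegation along the lines of
+ *
+ *	zfs allow someuser mount,destroy pool/fs
+ *
+ * is the kind of grant that lets an unprivileged user destroy
+ * pool/fs@snap through this path.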
+ */ +/* ARGSUSED */ +static int +zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvlist_t *snaps; + nvpair_t *pair, *nextpair; + int error = 0; + + snaps = fnvlist_lookup_nvlist(innvl, "snaps"); + + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nextpair) { + nextpair = nvlist_next_nvpair(snaps, pair); + error = zfs_secpolicy_destroy_perms(nvpair_name(pair), cr); + if (error == ENOENT) { + /* + * Ignore any snapshots that don't exist (we consider + * them "already destroyed"). Remove the name from the + * nvl here in case the snapshot is created between + * now and when we try to destroy it (in which case + * we don't want to destroy it since we haven't + * checked for permission). + */ + fnvlist_remove_nvpair(snaps, pair); + error = 0; + } + if (error != 0) + break; + } + + return (error); +} + +int +zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) +{ + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_RENAME, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(from, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + if ((error = zfs_get_parent(to, parentname, + sizeof (parentname))) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_rename(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_rename_perms(zc->zc_name, zc->zc_value, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_promote(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + dsl_pool_t *dp; + dsl_dataset_t *clone; + int error; + + error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_PROMOTE, cr); + if (error != 0) + return (error); + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &clone); + + if (error == 0) { + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + dsl_dataset_t *origin = NULL; + dsl_dir_t *dd; + dd = clone->ds_dir; + + error = dsl_dataset_hold_obj(dd->dd_pool, + dsl_dir_phys(dd)->dd_origin_obj, FTAG, &origin); + if (error != 0) { + dsl_dataset_rele(clone, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = zfs_secpolicy_write_perms_ds(zc->zc_name, clone, + ZFS_DELEG_PERM_MOUNT, cr); + + dsl_dataset_name(origin, parentname); + if (error == 0) { + error = zfs_secpolicy_write_perms_ds(parentname, origin, + ZFS_DELEG_PERM_PROMOTE, cr); + } + dsl_dataset_rele(clone, FTAG); + dsl_dataset_rele(origin, FTAG); + } + dsl_pool_rele(dp, FTAG); + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_recv(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error; + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RECEIVE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_MOUNT, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_CREATE, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_recv_new(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_recv(zc, innvl, cr)); +} + +int +zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)); +} + +/* + 
* Check for permission to create each snapshot in the nvlist. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvlist_t *snaps; + int error = 0; + nvpair_t *pair; + + snaps = fnvlist_lookup_nvlist(innvl, "snaps"); + + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + char *name = nvpair_name(pair); + char *atp = strchr(name, '@'); + + if (atp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + *atp = '\0'; + error = zfs_secpolicy_snapshot_perms(name, cr); + *atp = '@'; + if (error != 0) + break; + } + return (error); +} + +/* + * Check for permission to create each bookmark in the nvlist. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_bookmark(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error = 0; + + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *name = nvpair_name(pair); + char *hashp = strchr(name, '#'); + + if (hashp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + *hashp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_BOOKMARK, cr); + *hashp = '#'; + if (error != 0) + break; + } + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_destroy_bookmarks(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvpair_t *pair, *nextpair; + int error = 0; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nextpair) { + char *name = nvpair_name(pair); + char *hashp = strchr(name, '#'); + nextpair = nvlist_next_nvpair(innvl, pair); + + if (hashp == NULL) { + error = SET_ERROR(EINVAL); + break; + } + + *hashp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_DESTROY, cr); + *hashp = '#'; + if (error == ENOENT) { + /* + * Ignore any filesystems that don't exist (we consider + * their bookmarks "already destroyed"). Remove + * the name from the nvl here in case the filesystem + * is created between now and when we try to destroy + * the bookmark (in which case we don't want to + * destroy it since we haven't checked for permission). + */ + fnvlist_remove_nvpair(innvl, pair); + error = 0; + } + if (error != 0) + break; + } + + return (error); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_log_history(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + /* + * Even root must have a proper TSD so that we know what pool + * to log to. + */ + if (tsd_get(zfs_allow_log_key) == NULL) + return (SET_ERROR(EPERM)); + return (0); +} + +static int +zfs_secpolicy_create_clone(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + int error; + char *origin; + + if ((error = zfs_get_parent(zc->zc_name, parentname, + sizeof (parentname))) != 0) + return (error); + + if (nvlist_lookup_string(innvl, "origin", &origin) == 0 && + (error = zfs_secpolicy_write_perms(origin, + ZFS_DELEG_PERM_CLONE, cr)) != 0) + return (error); + + if ((error = zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_CREATE, cr)) != 0) + return (error); + + return (zfs_secpolicy_write_perms(parentname, + ZFS_DELEG_PERM_MOUNT, cr)); +} + +/* + * Policy for pool operations - create/destroy pools, add vdevs, etc. Requires + * SYS_CONFIG privilege, which is not available in a local zone. + */ +/* ARGSUSED */ +int +zfs_secpolicy_config(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + if (secpolicy_sys_config(cr, B_FALSE) != 0) + return (SET_ERROR(EPERM)); + + return (0); +} + +/* + * Policy for object to name lookups. 
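+ * Requires either SYS_CONFIG privilege or the delegated "diff"
+ * permission on the dataset (e.g. a hypothetical
+ * "zfs allow someuser diff pool/fs"), since these lookups are what
+ * back "zfs diff".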
+ */ +/* ARGSUSED */ +static int +zfs_secpolicy_diff(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int error; + + if ((error = secpolicy_sys_config(cr, B_FALSE)) == 0) + return (0); + + error = zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_DIFF, cr); + return (error); +} + +/* + * Policy for fault injection. Requires all privileges. + */ +/* ARGSUSED */ +static int +zfs_secpolicy_inject(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (secpolicy_zinject(cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_inherit_prop(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + zfs_prop_t prop = zfs_name_to_prop(zc->zc_value); + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(zc->zc_value)) + return (SET_ERROR(EINVAL)); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_USERPROP, cr)); + } else { + return (zfs_secpolicy_setprop(zc->zc_name, prop, + NULL, cr)); + } +} + +static int +zfs_secpolicy_userspace_one(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, innvl, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (SET_ERROR(EINVAL)); + + if (zc->zc_value[0] == 0) { + /* + * They are asking about a posix uid/gid. If it's + * themself, allow it. + */ + if (zc->zc_objset_type == ZFS_PROP_USERUSED || + zc->zc_objset_type == ZFS_PROP_USERQUOTA || + zc->zc_objset_type == ZFS_PROP_USEROBJUSED || + zc->zc_objset_type == ZFS_PROP_USEROBJQUOTA) { + if (zc->zc_guid == crgetuid(cr)) + return (0); + } else if (zc->zc_objset_type == ZFS_PROP_GROUPUSED || + zc->zc_objset_type == ZFS_PROP_GROUPQUOTA || + zc->zc_objset_type == ZFS_PROP_GROUPOBJUSED || + zc->zc_objset_type == ZFS_PROP_GROUPOBJQUOTA) { + if (groupmember(zc->zc_guid, cr)) + return (0); + } + /* else is for project quota/used */ + } + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_many(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, innvl, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (SET_ERROR(EINVAL)); + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, + NULL, cr)); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_hold(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvpair_t *pair; + nvlist_t *holds; + int error; + + holds = fnvlist_lookup_nvlist(innvl, "holds"); + + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_HOLD, cr); + if (error != 0) + return (error); + } + return (0); +} + +/* ARGSUSED */ +static int +zfs_secpolicy_release(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr) +{ + nvpair_t *pair; + int error; + + for (pair = nvlist_next_nvpair(innvl, NULL); pair != NULL; + pair = nvlist_next_nvpair(innvl, pair)) { + char fsname[ZFS_MAX_DATASET_NAME_LEN]; + error = dmu_fsname(nvpair_name(pair), fsname); + if (error != 0) + return (error); + error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_RELEASE, cr); + if (error != 0) + return (error); + } + return (0); +} + +/* + * Policy for allowing temporary snapshots 
to be taken or released
+ */
+static int
+zfs_secpolicy_tmp_snapshot(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	/*
+	 * A temporary snapshot is the same as a snapshot,
+	 * hold, destroy and release all rolled into one.
+	 * Delegated 'diff' permission alone is sufficient for us to
+	 * allow this.
+	 */
+	int error;
+
+	if ((error = zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_DIFF, cr)) == 0)
+		return (0);
+
+	error = zfs_secpolicy_snapshot_perms(zc->zc_name, cr);
+
+	if (innvl != NULL) {
+		if (error == 0)
+			error = zfs_secpolicy_hold(zc, innvl, cr);
+		if (error == 0)
+			error = zfs_secpolicy_release(zc, innvl, cr);
+		if (error == 0)
+			error = zfs_secpolicy_destroy(zc, innvl, cr);
+	}
+	return (error);
+}
+
+static int
+zfs_secpolicy_load_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_LOAD_KEY, cr));
+}
+
+static int
+zfs_secpolicy_change_key(zfs_cmd_t *zc, nvlist_t *innvl, cred_t *cr)
+{
+	return (zfs_secpolicy_write_perms(zc->zc_name,
+	    ZFS_DELEG_PERM_CHANGE_KEY, cr));
+}
+
+/*
+ * Returns the nvlist as specified by the user in the zfs_cmd_t.
+ */
+static int
+get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp)
+{
+	char *packed;
+	int error;
+	nvlist_t *list = NULL;
+
+	/*
+	 * Read in and unpack the user-supplied nvlist.
+	 */
+	if (size == 0)
+		return (SET_ERROR(EINVAL));
+
+	packed = vmem_alloc(size, KM_SLEEP);
+
+	if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size,
+	    iflag)) != 0) {
+		vmem_free(packed, size);
+		return (SET_ERROR(EFAULT));
+	}
+
+	if ((error = nvlist_unpack(packed, size, &list, 0)) != 0) {
+		vmem_free(packed, size);
+		return (error);
+	}
+
+	vmem_free(packed, size);
+
+	*nvp = list;
+	return (0);
+}
+
+/*
+ * Reduce the size of this nvlist until it can be serialized in 'max' bytes.
+ * Entries will be removed from the end of the nvlist, and one int32 entry
+ * named "N_MORE_ERRORS" will be added indicating how many entries were
+ * removed.
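+ *
+ * For example (hypothetical contents, for illustration): an errors
+ * nvlist such as
+ *
+ *	{ "pool/fs@a" -> EEXIST, ..., "pool/fs@z" -> EEXIST }
+ *
+ * that serializes larger than 'max' might come back as
+ *
+ *	{ "pool/fs@a" -> EEXIST, "N_MORE_ERRORS" -> 25 }
+ *
+ * so the caller still learns how many entries were dropped.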
+ */ +static int +nvlist_smush(nvlist_t *errors, size_t max) +{ + size_t size; + + size = fnvlist_size(errors); + + if (size > max) { + nvpair_t *more_errors; + int n = 0; + + if (max < 1024) + return (SET_ERROR(ENOMEM)); + + fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, 0); + more_errors = nvlist_prev_nvpair(errors, NULL); + + do { + nvpair_t *pair = nvlist_prev_nvpair(errors, + more_errors); + fnvlist_remove_nvpair(errors, pair); + n++; + size = fnvlist_size(errors); + } while (size > max); + + fnvlist_remove_nvpair(errors, more_errors); + fnvlist_add_int32(errors, ZPROP_N_MORE_ERRORS, n); + ASSERT3U(fnvlist_size(errors), <=, max); + } + + return (0); +} + +static int +put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) +{ + char *packed = NULL; + int error = 0; + size_t size; + + size = fnvlist_size(nvl); + + if (size > zc->zc_nvlist_dst_size) { + error = SET_ERROR(ENOMEM); + } else { + packed = fnvlist_pack(nvl, &size); + if (ddi_copyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, + size, zc->zc_iflags) != 0) + error = SET_ERROR(EFAULT); + fnvlist_pack_free(packed, size); + } + + zc->zc_nvlist_dst_size = size; + zc->zc_nvlist_dst_filled = B_TRUE; + return (error); +} + +int +getzfsvfs_impl(objset_t *os, zfsvfs_t **zfvp) +{ + int error = 0; + if (dmu_objset_type(os) != DMU_OST_ZFS) { + return (SET_ERROR(EINVAL)); + } + + mutex_enter(&os->os_user_ptr_lock); + *zfvp = dmu_objset_get_user(os); + /* bump s_active only when non-zero to prevent umount race */ + error = zfs_vfs_ref(zfvp); + mutex_exit(&os->os_user_ptr_lock); + return (error); +} + +int +getzfsvfs(const char *dsname, zfsvfs_t **zfvp) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error != 0) + return (error); + + error = getzfsvfs_impl(os, zfvp); + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * Find a zfsvfs_t for a mounted filesystem, or create our own, in which + * case its z_sb will be NULL, and it will be opened as the owner. + * If 'writer' is set, the z_teardown_lock will be held for RW_WRITER, + * which prevents all inode ops from running. + */ +static int +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer) +{ + int error = 0; + + if (getzfsvfs(name, zfvp) != 0) + error = zfsvfs_create(name, B_FALSE, zfvp); + if (error == 0) { + rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER : + RW_READER, tag); + if ((*zfvp)->z_unmounted) { + /* + * XXX we could probably try again, since the unmounting + * thread should be just about to disassociate the + * objset from the zfsvfs. 
+ */ + rrm_exit(&(*zfvp)->z_teardown_lock, tag); + return (SET_ERROR(EBUSY)); + } + } + return (error); +} + +static void +zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +{ + rrm_exit(&zfsvfs->z_teardown_lock, tag); + + if (zfs_vfs_held(zfsvfs)) { + zfs_vfs_rele(zfsvfs); + } else { + dmu_objset_disown(zfsvfs->z_os, B_TRUE, zfsvfs); + zfsvfs_free(zfsvfs); + } +} + +static int +zfs_ioc_pool_create(zfs_cmd_t *zc) +{ + int error; + nvlist_t *config, *props = NULL; + nvlist_t *rootprops = NULL; + nvlist_t *zplprops = NULL; + dsl_crypto_params_t *dcp = NULL; + char *spa_name = zc->zc_name; + boolean_t unload_wkey = B_TRUE; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config))) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + nvlist_free(config); + return (error); + } + + if (props) { + nvlist_t *nvl = NULL; + nvlist_t *hidden_args = NULL; + uint64_t version = SPA_VERSION; + char *tname; + + (void) nvlist_lookup_uint64(props, + zpool_prop_to_name(ZPOOL_PROP_VERSION), &version); + if (!SPA_VERSION_IS_SUPPORTED(version)) { + error = SET_ERROR(EINVAL); + goto pool_props_bad; + } + (void) nvlist_lookup_nvlist(props, ZPOOL_ROOTFS_PROPS, &nvl); + if (nvl) { + error = nvlist_dup(nvl, &rootprops, KM_SLEEP); + if (error != 0) + goto pool_props_bad; + (void) nvlist_remove_all(props, ZPOOL_ROOTFS_PROPS); + } + + (void) nvlist_lookup_nvlist(props, ZPOOL_HIDDEN_ARGS, + &hidden_args); + error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, + rootprops, hidden_args, &dcp); + if (error != 0) + goto pool_props_bad; + (void) nvlist_remove_all(props, ZPOOL_HIDDEN_ARGS); + + VERIFY(nvlist_alloc(&zplprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops_root(version, rootprops, + zplprops, NULL); + if (error != 0) + goto pool_props_bad; + + if (nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_TNAME), &tname) == 0) + spa_name = tname; + } + + error = spa_create(zc->zc_name, config, props, zplprops, dcp); + + /* + * Set the remaining root properties + */ + if (!error && (error = zfs_set_prop_nvlist(spa_name, + ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) { + (void) spa_destroy(spa_name); + unload_wkey = B_FALSE; /* spa_destroy() unloads wrapping keys */ + } + +pool_props_bad: + nvlist_free(rootprops); + nvlist_free(zplprops); + nvlist_free(config); + nvlist_free(props); + dsl_crypto_params_free(dcp, unload_wkey && !!error); + + return (error); +} + +static int +zfs_ioc_pool_destroy(zfs_cmd_t *zc) +{ + int error; + zfs_log_history(zc); + error = spa_destroy(zc->zc_name); + + return (error); +} + +static int +zfs_ioc_pool_import(zfs_cmd_t *zc) +{ + nvlist_t *config, *props = NULL; + uint64_t guid; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) != 0) + return (error); + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + nvlist_free(config); + return (error); + } + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || + guid != zc->zc_guid) + error = SET_ERROR(EINVAL); + else + error = spa_import(zc->zc_name, config, props, zc->zc_cookie); + + if (zc->zc_nvlist_dst != 0) { + int err; + + if ((err = put_nvlist(zc, config)) != 0) + error = err; + } + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_pool_export(zfs_cmd_t *zc) +{ + int error; + boolean_t force 
= (boolean_t)zc->zc_cookie; + boolean_t hardforce = (boolean_t)zc->zc_guid; + + zfs_log_history(zc); + error = spa_export(zc->zc_name, NULL, force, hardforce); + + return (error); +} + +static int +zfs_ioc_pool_configs(zfs_cmd_t *zc) +{ + nvlist_t *configs; + int error; + + if ((configs = spa_all_configs(&zc->zc_cookie)) == NULL) + return (SET_ERROR(EEXIST)); + + error = put_nvlist(zc, configs); + + nvlist_free(configs); + + return (error); +} + +/* + * inputs: + * zc_name name of the pool + * + * outputs: + * zc_cookie real errno + * zc_nvlist_dst config nvlist + * zc_nvlist_dst_size size of config nvlist + */ +static int +zfs_ioc_pool_stats(zfs_cmd_t *zc) +{ + nvlist_t *config; + int error; + int ret = 0; + + error = spa_get_stats(zc->zc_name, &config, zc->zc_value, + sizeof (zc->zc_value)); + + if (config != NULL) { + ret = put_nvlist(zc, config); + nvlist_free(config); + + /* + * The config may be present even if 'error' is non-zero. + * In this case we return success, and preserve the real errno + * in 'zc_cookie'. + */ + zc->zc_cookie = error; + } else { + ret = error; + } + + return (ret); +} + +/* + * Try to import the given pool, returning pool stats as appropriate so that + * user land knows which devices are available and overall pool health. + */ +static int +zfs_ioc_pool_tryimport(zfs_cmd_t *zc) +{ + nvlist_t *tryconfig, *config = NULL; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &tryconfig)) != 0) + return (error); + + config = spa_tryimport(tryconfig); + + nvlist_free(tryconfig); + + if (config == NULL) + return (SET_ERROR(EINVAL)); + + error = put_nvlist(zc, config); + nvlist_free(config); + + return (error); +} + +/* + * inputs: + * zc_name name of the pool + * zc_cookie scan func (pool_scan_func_t) + * zc_flags scrub pause/resume flag (pool_scrub_cmd_t) + */ +static int +zfs_ioc_pool_scan(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if (zc->zc_flags >= POOL_SCRUB_FLAGS_END) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (zc->zc_flags == POOL_SCRUB_PAUSE) + error = spa_scrub_pause_resume(spa, POOL_SCRUB_PAUSE); + else if (zc->zc_cookie == POOL_SCAN_NONE) + error = spa_scan_stop(spa); + else + error = spa_scan(spa, zc->zc_cookie); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_freeze(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + spa_freeze(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_pool_upgrade(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (zc->zc_cookie < spa_version(spa) || + !SPA_VERSION_IS_SUPPORTED(zc->zc_cookie)) { + spa_close(spa, FTAG); + return (SET_ERROR(EINVAL)); + } + + spa_upgrade(spa, zc->zc_cookie); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_history(zfs_cmd_t *zc) +{ + spa_t *spa; + char *hist_buf; + uint64_t size; + int error; + + if ((size = zc->zc_history_len) == 0) + return (SET_ERROR(EINVAL)); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + hist_buf = vmem_alloc(size, KM_SLEEP); + if ((error = spa_history_get(spa, &zc->zc_history_offset, + &zc->zc_history_len, hist_buf)) == 0) { + error = ddi_copyout(hist_buf, + (void 
*)(uintptr_t)zc->zc_history, + zc->zc_history_len, zc->zc_iflags); + } + + spa_close(spa, FTAG); + vmem_free(hist_buf, size); + return (error); +} + +static int +zfs_ioc_pool_reguid(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error == 0) { + error = spa_change_guid(spa); + spa_close(spa, FTAG); + } + return (error); +} + +static int +zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) +{ + return (dsl_dsobj_to_dsname(zc->zc_name, zc->zc_obj, zc->zc_value)); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_value name of object + */ +static int +zfs_ioc_obj_to_path(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, + FTAG, &os)) != 0) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele_flags(os, B_TRUE, FTAG); + return (SET_ERROR(EINVAL)); + } + error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_rele_flags(os, B_TRUE, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_stat stats on object + * zc_value path to object + */ +static int +zfs_ioc_obj_to_stats(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, + FTAG, &os)) != 0) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele_flags(os, B_TRUE, FTAG); + return (SET_ERROR(EINVAL)); + } + error = zfs_obj_to_stats(os, zc->zc_obj, &zc->zc_stat, zc->zc_value, + sizeof (zc->zc_value)); + dmu_objset_rele_flags(os, B_TRUE, FTAG); + + return (error); +} + +static int +zfs_ioc_vdev_add(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *config; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config); + if (error == 0) { + error = spa_vdev_add(spa, config); + nvlist_free(config); + } + spa_close(spa, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of the pool + * zc_guid guid of vdev to remove + * zc_cookie cancel removal + */ +static int +zfs_ioc_vdev_remove(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + if (zc->zc_cookie != 0) { + error = spa_vdev_remove_cancel(spa); + } else { + error = spa_vdev_remove(spa, zc->zc_guid, B_FALSE); + } + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_set_state(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + switch (zc->zc_cookie) { + case VDEV_STATE_ONLINE: + error = vdev_online(spa, zc->zc_guid, zc->zc_obj, &newstate); + break; + + case VDEV_STATE_OFFLINE: + error = vdev_offline(spa, zc->zc_guid, zc->zc_obj); + break; + + case VDEV_STATE_FAULTED: + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL && + zc->zc_obj != VDEV_AUX_EXTERNAL_PERSIST) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); + break; + + case VDEV_STATE_DEGRADED: + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); + break; + + default: + error = 
SET_ERROR(EINVAL); + } + zc->zc_cookie = newstate; + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_attach(zfs_cmd_t *zc) +{ + spa_t *spa; + nvlist_t *config; + int replacing = zc->zc_cookie; + int rebuild = zc->zc_simple; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) == 0) { + error = spa_vdev_attach(spa, zc->zc_guid, config, replacing, + rebuild); + nvlist_free(config); + } + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_detach(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); + + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_split(zfs_cmd_t *zc) +{ + spa_t *spa; + nvlist_t *config, *props = NULL; + int error; + boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config))) { + spa_close(spa, FTAG); + return (error); + } + + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + spa_close(spa, FTAG); + nvlist_free(config); + return (error); + } + + error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); + + spa_close(spa, FTAG); + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_setfru(zfs_cmd_t *zc) +{ + spa_t *spa; + char *fru = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setfru(spa, guid, fru); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_objset_stats_impl(zfs_cmd_t *zc, objset_t *os) +{ + int error = 0; + nvlist_t *nv; + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_all(os, &nv)) == 0) { + dmu_objset_stats(os, nv); + /* + * NB: zvol_get_stats() will read the objset contents, + * which we aren't supposed to do with a + * DS_MODE_USER hold, because it could be + * inconsistent. So this is a bit of a workaround... 
+ * XXX reading without owning + */ + if (!zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZVOL) { + error = zvol_get_stats(os, nv); + if (error == EIO) { + nvlist_free(nv); + return (error); + } + VERIFY0(error); + } + if (error == 0) + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_objset_stats(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error == 0) { + error = zfs_ioc_objset_stats_impl(zc, os); + dmu_objset_rele(os, FTAG); + } + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_nvlist_dst received property nvlist + * zc_nvlist_dst_size size of received property nvlist + * + * Gets received properties (distinct from local properties on or after + * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from + * local property values. + */ +static int +zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) +{ + int error = 0; + nvlist_t *nv; + + /* + * Without this check, we would return local property values if the + * caller has not already received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + if (!dsl_prop_get_hasrecvd(zc->zc_name)) + return (SET_ERROR(ENOTSUP)); + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_received(zc->zc_name, &nv)) == 0) { + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + return (error); +} + +static int +nvl_add_zplprop(objset_t *os, nvlist_t *props, zfs_prop_t prop) +{ + uint64_t value; + int error; + + /* + * zfs_get_zplprop() will either find a value or give us + * the default value (if there is one). + */ + if ((error = zfs_get_zplprop(os, prop, &value)) != 0) + return (error); + VERIFY(nvlist_add_uint64(props, zfs_prop_to_name(prop), value) == 0); + return (0); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for zpl property nvlist + * + * outputs: + * zc_nvlist_dst zpl property nvlist + * zc_nvlist_dst_size size of zpl property nvlist + */ +static int +zfs_ioc_objset_zplprops(zfs_cmd_t *zc) +{ + objset_t *os; + int err; + + /* XXX reading without owning */ + if ((err = dmu_objset_hold(zc->zc_name, FTAG, &os))) + return (err); + + dmu_objset_fast_stat(os, &zc->zc_objset_stats); + + /* + * NB: nvl_add_zplprop() will read the objset contents, + * which we aren't supposed to do with a DS_MODE_USER + * hold, because it could be inconsistent. 
+ */ + if (zc->zc_nvlist_dst != 0 && + !zc->zc_objset_stats.dds_inconsistent && + dmu_objset_type(os) == DMU_OST_ZFS) { + nvlist_t *nv; + + VERIFY(nvlist_alloc(&nv, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if ((err = nvl_add_zplprop(os, nv, ZFS_PROP_VERSION)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_NORMALIZE)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_UTF8ONLY)) == 0 && + (err = nvl_add_zplprop(os, nv, ZFS_PROP_CASE)) == 0) + err = put_nvlist(zc, nv); + nvlist_free(nv); + } else { + err = SET_ERROR(ENOENT); + } + dmu_objset_rele(os, FTAG); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_name name of next filesystem + * zc_cookie zap cursor + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_dataset_list_next(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + char *p; + size_t orig_len = strlen(zc->zc_name); + +top: + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os))) { + if (error == ENOENT) + error = SET_ERROR(ESRCH); + return (error); + } + + p = strrchr(zc->zc_name, '/'); + if (p == NULL || p[1] != '\0') + (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); + p = zc->zc_name + strlen(zc->zc_name); + + do { + error = dmu_dir_list_next(os, + sizeof (zc->zc_name) - (p - zc->zc_name), p, + NULL, &zc->zc_cookie); + if (error == ENOENT) + error = SET_ERROR(ESRCH); + } while (error == 0 && zfs_dataset_name_hidden(zc->zc_name)); + dmu_objset_rele(os, FTAG); + + /* + * If it's an internal dataset (ie. with a '$' in its name), + * don't try to get stats for it, otherwise we'll return ENOENT. + */ + if (error == 0 && strchr(zc->zc_name, '$') == NULL) { + error = zfs_ioc_objset_stats(zc); /* fill in the stats */ + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + zc->zc_name[orig_len] = '\0'; + goto top; + } + } + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_nvlist_src iteration range nvlist + * zc_nvlist_src_size size of iteration range nvlist + * + * outputs: + * zc_name name of next snapshot + * zc_objset_stats stats + * zc_nvlist_dst property nvlist + * zc_nvlist_dst_size size of property nvlist + */ +static int +zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) +{ + int error; + objset_t *os, *ossnap; + dsl_dataset_t *ds; + uint64_t min_txg = 0, max_txg = 0; + + if (zc->zc_nvlist_src_size != 0) { + nvlist_t *props = NULL; + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props); + if (error != 0) + return (error); + (void) nvlist_lookup_uint64(props, SNAP_ITER_MIN_TXG, + &min_txg); + (void) nvlist_lookup_uint64(props, SNAP_ITER_MAX_TXG, + &max_txg); + nvlist_free(props); + } + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error != 0) { + return (error == ENOENT ? SET_ERROR(ESRCH) : error); + } + + /* + * A dataset name of maximum length cannot have any snapshots, + * so exit immediately. 
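+	 * (strlcat() returns the total length the concatenated string
+	 * would have had, so a result >= ZFS_MAX_DATASET_NAME_LEN means
+	 * the name is already too long for even the "@" separator, let
+	 * alone a snapshot name, to fit.)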
+	 */
+	if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >=
+	    ZFS_MAX_DATASET_NAME_LEN) {
+		dmu_objset_rele(os, FTAG);
+		return (SET_ERROR(ESRCH));
+	}
+
+	while (error == 0) {
+		if (issig(JUSTLOOKING) && issig(FORREAL)) {
+			error = SET_ERROR(EINTR);
+			break;
+		}
+
+		error = dmu_snapshot_list_next(os,
+		    sizeof (zc->zc_name) - strlen(zc->zc_name),
+		    zc->zc_name + strlen(zc->zc_name), &zc->zc_obj,
+		    &zc->zc_cookie, NULL);
+		if (error == ENOENT) {
+			error = SET_ERROR(ESRCH);
+			break;
+		} else if (error != 0) {
+			break;
+		}
+
+		error = dsl_dataset_hold_obj(dmu_objset_pool(os), zc->zc_obj,
+		    FTAG, &ds);
+		if (error != 0)
+			break;
+
+		if ((min_txg != 0 && dsl_get_creationtxg(ds) < min_txg) ||
+		    (max_txg != 0 && dsl_get_creationtxg(ds) > max_txg)) {
+			dsl_dataset_rele(ds, FTAG);
+			/* undo snapshot name append */
+			*(strchr(zc->zc_name, '@') + 1) = '\0';
+			/* skip snapshot */
+			continue;
+		}
+
+		if (zc->zc_simple) {
+			dsl_dataset_rele(ds, FTAG);
+			break;
+		}
+
+		if ((error = dmu_objset_from_ds(ds, &ossnap)) != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			break;
+		}
+		if ((error = zfs_ioc_objset_stats_impl(zc, ossnap)) != 0) {
+			dsl_dataset_rele(ds, FTAG);
+			break;
+		}
+		dsl_dataset_rele(ds, FTAG);
+		break;
+	}
+
+	dmu_objset_rele(os, FTAG);
+	/* if we failed, undo the @ that we tacked on to zc_name */
+	if (error != 0)
+		*strchr(zc->zc_name, '@') = '\0';
+	return (error);
+}
+
+static int
+zfs_prop_set_userquota(const char *dsname, nvpair_t *pair)
+{
+	const char *propname = nvpair_name(pair);
+	uint64_t *valary;
+	unsigned int vallen;
+	const char *domain;
+	char *dash;
+	zfs_userquota_prop_t type;
+	uint64_t rid;
+	uint64_t quota;
+	zfsvfs_t *zfsvfs;
+	int err;
+
+	if (nvpair_type(pair) == DATA_TYPE_NVLIST) {
+		nvlist_t *attrs;
+		VERIFY(nvpair_value_nvlist(pair, &attrs) == 0);
+		if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE,
+		    &pair) != 0)
+			return (SET_ERROR(EINVAL));
+	}
+
+	/*
+	 * A correctly constructed propname is encoded as
+	 * userquota@<rid>-<domain>.
+	 */
+	if ((dash = strchr(propname, '-')) == NULL ||
+	    nvpair_value_uint64_array(pair, &valary, &vallen) != 0 ||
+	    vallen != 3)
+		return (SET_ERROR(EINVAL));
+
+	domain = dash + 1;
+	type = valary[0];
+	rid = valary[1];
+	quota = valary[2];
+
+	err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_FALSE);
+	if (err == 0) {
+		err = zfs_set_userquota(zfsvfs, type, domain, rid, quota);
+		zfsvfs_rele(zfsvfs, FTAG);
+	}
+
+	return (err);
+}
+
+/*
+ * If the named property is one that has a special function to set its value,
+ * return 0 on success and a positive error code on failure; otherwise if it is
+ * not one of the special properties handled by this function, return -1.
+ *
+ * XXX: It would be better for callers of the property interface if we handled
+ * these special cases in dsl_prop.c (in the dsl layer).
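+ *
+ * A sketch of the expected caller pattern (mirroring what
+ * zfs_set_prop_nvlist() below actually does):
+ *
+ *	err = zfs_prop_set_special(dsname, source, pair);
+ *	if (err == -1)
+ *		-> not special; fall back to the generic
+ *		   dsl_props_set() path
+ *	else if (err != 0)
+ *		-> record err against this property name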
+ */ +static int +zfs_prop_set_special(const char *dsname, zprop_source_t source, + nvpair_t *pair) +{ + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval = 0; + char *strval = NULL; + int err = -1; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_userquota(propname)) + return (zfs_prop_set_userquota(dsname, pair)); + return (-1); + } + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + /* all special properties are numeric except for keylocation */ + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) { + strval = fnvpair_value_string(pair); + } else { + intval = fnvpair_value_uint64(pair); + } + + switch (prop) { + case ZFS_PROP_QUOTA: + err = dsl_dir_set_quota(dsname, source, intval); + break; + case ZFS_PROP_REFQUOTA: + err = dsl_dataset_set_refquota(dsname, source, intval); + break; + case ZFS_PROP_FILESYSTEM_LIMIT: + case ZFS_PROP_SNAPSHOT_LIMIT: + if (intval == UINT64_MAX) { + /* clearing the limit, just do it */ + err = 0; + } else { + err = dsl_dir_activate_fs_ss_limit(dsname); + } + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; + case ZFS_PROP_KEYLOCATION: + err = dsl_crypto_can_set_keylocation(dsname, strval); + + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; + case ZFS_PROP_RESERVATION: + err = dsl_dir_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_REFRESERVATION: + err = dsl_dataset_set_refreservation(dsname, source, intval); + break; + case ZFS_PROP_COMPRESSION: + err = dsl_dataset_set_compression(dsname, source, intval); + /* + * Set err to -1 to force the zfs_set_prop_nvlist code down the + * default path to set the value in the nvlist. + */ + if (err == 0) + err = -1; + break; + case ZFS_PROP_VOLSIZE: + err = zvol_set_volsize(dsname, intval); + break; + case ZFS_PROP_SNAPDEV: + err = zvol_set_snapdev(dsname, source, intval); + break; + case ZFS_PROP_VOLMODE: + err = zvol_set_volmode(dsname, source, intval); + break; + case ZFS_PROP_VERSION: + { + zfsvfs_t *zfsvfs; + + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs, B_TRUE)) != 0) + break; + + err = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); + + if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t *zc; + + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strlcpy(zc->zc_name, dsname, + sizeof (zc->zc_name)); + (void) zfs_ioc_userspace_upgrade(zc); + (void) zfs_ioc_id_quota_upgrade(zc); + kmem_free(zc, sizeof (zfs_cmd_t)); + } + break; + } + default: + err = -1; + } + + return (err); +} + +/* + * This function is best effort. If it fails to set any of the given properties, + * it continues to set as many as it can and returns the last error + * encountered. If the caller provides a non-NULL errlist, it will be filled in + * with the list of names of all the properties that failed along with the + * corresponding error numbers. + * + * If every property is set successfully, zero is returned and errlist is not + * modified. 
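+ *
+ * A minimal usage sketch (hypothetical caller, for illustration):
+ *
+ *	nvlist_t *errlist = fnvlist_alloc();
+ *	int err = zfs_set_prop_nvlist("pool/fs", ZPROP_SRC_LOCAL,
+ *	    props, errlist);
+ *	// on failure, errlist maps each failed property name to its
+ *	// errno as an int32 value
+ *	nvlist_free(errlist);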
+ */ +int +zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, + nvlist_t *errlist) +{ + nvpair_t *pair; + nvpair_t *propval; + int rv = 0; + uint64_t intval; + char *strval; + + nvlist_t *genericnvl = fnvlist_alloc(); + nvlist_t *retrynvl = fnvlist_alloc(); +retry: + pair = NULL; + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + int err = 0; + + /* decode the property value */ + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + attrs = fnvpair_value_nvlist(pair); + if (nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) != 0) + err = SET_ERROR(EINVAL); + } + + /* Validate value type */ + if (err == 0 && source == ZPROP_SRC_INHERITED) { + /* inherited properties are expected to be booleans */ + if (nvpair_type(propval) != DATA_TYPE_BOOLEAN) + err = SET_ERROR(EINVAL); + } else if (err == 0 && prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (nvpair_type(propval) != DATA_TYPE_STRING) + err = SET_ERROR(EINVAL); + } else if (zfs_prop_userquota(propname)) { + if (nvpair_type(propval) != + DATA_TYPE_UINT64_ARRAY) + err = SET_ERROR(EINVAL); + } else { + err = SET_ERROR(EINVAL); + } + } else if (err == 0) { + if (nvpair_type(propval) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) + err = SET_ERROR(EINVAL); + } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { + const char *unused; + + intval = fnvpair_value_uint64(propval); + + switch (zfs_prop_get_type(prop)) { + case PROP_TYPE_NUMBER: + break; + case PROP_TYPE_STRING: + err = SET_ERROR(EINVAL); + break; + case PROP_TYPE_INDEX: + if (zfs_prop_index_to_string(prop, + intval, &unused) != 0) + err = + SET_ERROR(ZFS_ERR_BADPROP); + break; + default: + cmn_err(CE_PANIC, + "unknown property type"); + } + } else { + err = SET_ERROR(EINVAL); + } + } + + /* Validate permissions */ + if (err == 0) + err = zfs_check_settable(dsname, pair, CRED()); + + if (err == 0) { + if (source == ZPROP_SRC_INHERITED) + err = -1; /* does not need special handling */ + else + err = zfs_prop_set_special(dsname, source, + pair); + if (err == -1) { + /* + * For better performance we build up a list of + * properties to set in a single transaction. + */ + err = nvlist_add_nvpair(genericnvl, pair); + } else if (err != 0 && nvl != retrynvl) { + /* + * This may be a spurious error caused by + * receiving quota and reservation out of order. + * Try again in a second pass. + */ + err = nvlist_add_nvpair(retrynvl, pair); + } + } + + if (err != 0) { + if (errlist != NULL) + fnvlist_add_int32(errlist, propname, err); + rv = err; + } + } + + if (nvl != retrynvl && !nvlist_empty(retrynvl)) { + nvl = retrynvl; + goto retry; + } + + if (!nvlist_empty(genericnvl) && + dsl_props_set(dsname, source, genericnvl) != 0) { + /* + * If this fails, we still want to set as many properties as we + * can, so try setting them individually. 
+ */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + int err = 0; + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + attrs = fnvpair_value_nvlist(pair); + propval = fnvlist_lookup_nvpair(attrs, + ZPROP_VALUE); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + strval = fnvpair_value_string(propval); + err = dsl_prop_set_string(dsname, propname, + source, strval); + } else if (nvpair_type(propval) == DATA_TYPE_BOOLEAN) { + err = dsl_prop_inherit(dsname, propname, + source); + } else { + intval = fnvpair_value_uint64(propval); + err = dsl_prop_set_int(dsname, propname, source, + intval); + } + + if (err != 0) { + if (errlist != NULL) { + fnvlist_add_int32(errlist, propname, + err); + } + rv = err; + } + } + } + nvlist_free(genericnvl); + nvlist_free(retrynvl); + + return (rv); +} + +/* + * Check that all the properties are valid user properties. + */ +static int +zfs_check_userprops(nvlist_t *nvl) +{ + nvpair_t *pair = NULL; + + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + + if (!zfs_prop_user(propname) || + nvpair_type(pair) != DATA_TYPE_STRING) + return (SET_ERROR(EINVAL)); + + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (SET_ERROR(ENAMETOOLONG)); + + if (strlen(fnvpair_value_string(pair)) >= ZAP_MAXVALUELEN) + return (SET_ERROR(E2BIG)); + } + return (0); +} + +static void +props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) +{ + nvpair_t *pair; + + VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + pair = NULL; + while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { + if (nvlist_exists(skipped, nvpair_name(pair))) + continue; + + VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); + } +} + +static int +clear_received_props(const char *dsname, nvlist_t *props, + nvlist_t *skipped) +{ + int err = 0; + nvlist_t *cleared_props = NULL; + props_skip(props, skipped, &cleared_props); + if (!nvlist_empty(cleared_props)) { + /* + * Acts on local properties until the dataset has received + * properties at least once on or after SPA_VERSION_RECVD_PROPS. + */ + zprop_source_t flags = (ZPROP_SRC_NONE | + (dsl_prop_get_hasrecvd(dsname) ? ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(dsname, flags, cleared_props, NULL); + } + nvlist_free(cleared_props); + return (err); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to set + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_cookie received properties flag + * + * outputs: + * zc_nvlist_dst{_size} error for each unapplied received property + */ +static int +zfs_ioc_set_prop(zfs_cmd_t *zc) +{ + nvlist_t *nvl; + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : + ZPROP_SRC_LOCAL); + nvlist_t *errors; + int error; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &nvl)) != 0) + return (error); + + if (received) { + nvlist_t *origprops; + + if (dsl_prop_get_received(zc->zc_name, &origprops) == 0) { + (void) clear_received_props(zc->zc_name, + origprops, nvl); + nvlist_free(origprops); + } + + error = dsl_prop_set_hasrecvd(zc->zc_name); + } + + errors = fnvlist_alloc(); + if (error == 0) + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, errors); + + if (zc->zc_nvlist_dst != 0 && errors != NULL) { + (void) put_nvlist(zc, errors); + } + + nvlist_free(errors); + nvlist_free(nvl); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value name of property to inherit + * zc_cookie revert to received value if TRUE + * + * outputs: none + */ +static int +zfs_ioc_inherit_prop(zfs_cmd_t *zc) +{ + const char *propname = zc->zc_value; + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received + ? ZPROP_SRC_NONE /* revert to received value, if any */ + : ZPROP_SRC_INHERITED); /* explicitly inherit */ + nvlist_t *dummy; + nvpair_t *pair; + zprop_type_t type; + int err; + + if (!received) { + /* + * Only check this in the non-received case. We want to allow + * 'inherit -S' to revert non-inheritable properties like quota + * and reservation to the received or default values even though + * they are not considered inheritable. + */ + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + return (SET_ERROR(EINVAL)); + } + + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(propname)) + return (SET_ERROR(EINVAL)); + + type = PROP_TYPE_STRING; + } else if (prop == ZFS_PROP_VOLSIZE || prop == ZFS_PROP_VERSION) { + return (SET_ERROR(EINVAL)); + } else { + type = zfs_prop_get_type(prop); + } + + /* + * zfs_prop_set_special() expects properties in the form of an + * nvpair with type info. + */ + dummy = fnvlist_alloc(); + + switch (type) { + case PROP_TYPE_STRING: + VERIFY(0 == nvlist_add_string(dummy, propname, "")); + break; + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); + break; + default: + err = SET_ERROR(EINVAL); + goto errout; + } + + pair = nvlist_next_nvpair(dummy, NULL); + if (pair == NULL) { + err = SET_ERROR(EINVAL); + } else { + err = zfs_prop_set_special(zc->zc_name, source, pair); + if (err == -1) /* property is not "special", needs handling */ + err = dsl_prop_inherit(zc->zc_name, zc->zc_value, + source); + } + +errout: + nvlist_free(dummy); + return (err); +} + +static int +zfs_ioc_pool_set_props(zfs_cmd_t *zc) +{ + nvlist_t *props; + spa_t *spa; + int error; + nvpair_t *pair; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) + return (error); + + /* + * If the only property is the configfile, then just do a spa_lookup() + * to handle the faulted case. 
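+ * A faulted pool cannot be opened with spa_open(), but its cachefile
+ * setting must remain changeable (e.g. "zpool set cachefile=none")
+ * so that the pool can still be dropped from the cache.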
+ */ + pair = nvlist_next_nvpair(props, NULL); + if (pair != NULL && strcmp(nvpair_name(pair), + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && + nvlist_next_nvpair(props, pair) == NULL) { + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_write_cachefile(spa, B_FALSE, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + if (spa != NULL) { + nvlist_free(props); + return (0); + } + } + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + nvlist_free(props); + return (error); + } + + error = spa_prop_set(spa, props); + + nvlist_free(props); + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_pool_get_props(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + nvlist_t *nvp = NULL; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + /* + * If the pool is faulted, there may be properties we can still + * get (such as altroot and cachefile), so attempt to get them + * anyway. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) + error = spa_prop_get(spa, &nvp); + mutex_exit(&spa_namespace_lock); + } else { + error = spa_prop_get(spa, &nvp); + spa_close(spa, FTAG); + } + + if (error == 0 && zc->zc_nvlist_dst != 0) + error = put_nvlist(zc, nvp); + else + error = SET_ERROR(EFAULT); + + nvlist_free(nvp); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_src{_size} nvlist of delegated permissions + * zc_perm_action allow/unallow flag + * + * outputs: none + */ +static int +zfs_ioc_set_fsacl(zfs_cmd_t *zc) +{ + int error; + nvlist_t *fsaclnv = NULL; + + if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &fsaclnv)) != 0) + return (error); + + /* + * Verify nvlist is constructed correctly + */ + if ((error = zfs_deleg_verify_nvlist(fsaclnv)) != 0) { + nvlist_free(fsaclnv); + return (SET_ERROR(EINVAL)); + } + + /* + * If we don't have PRIV_SYS_MOUNT, then validate + * that user is allowed to hand out each permission in + * the nvlist(s) + */ + + error = secpolicy_zfs(CRED()); + if (error != 0) { + if (zc->zc_perm_action == B_FALSE) { + error = dsl_deleg_can_allow(zc->zc_name, + fsaclnv, CRED()); + } else { + error = dsl_deleg_can_unallow(zc->zc_name, + fsaclnv, CRED()); + } + } + + if (error == 0) + error = dsl_deleg_set(zc->zc_name, fsaclnv, zc->zc_perm_action); + + nvlist_free(fsaclnv); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of delegated permissions + */ +static int +zfs_ioc_get_fsacl(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_deleg_get(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + +/* ARGSUSED */ +static void +zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + + zfs_create_fs(os, cr, zct->zct_zplprops, tx); +} + +#define ZFS_PROP_UNDEFINED ((uint64_t)-1) + +/* + * inputs: + * os parent objset pointer (NULL if root fs) + * fuids_ok fuids allowed in this version of the spa? + * sa_ok SAs allowed in this version of the spa? + * createprops list of properties requested by creator + * + * outputs: + * zplprops values for the zplprops we attach to the master node object + * is_ci true if requested file system will be purely case-insensitive + * + * Determine the settings for utf8only, normalization and + * casesensitivity. 
+ * Specific values may have been requested by the
+ * creator and/or we can inherit values from the parent dataset. If
+ * the file system is of too early a vintage, a creator cannot
+ * request settings for these properties, even if the requested
+ * setting is the default value. We don't actually want to create dsl
+ * properties for these, so remove them from the source nvlist after
+ * processing.
+ */
+static int
+zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver,
+ boolean_t fuids_ok, boolean_t sa_ok, nvlist_t *createprops,
+ nvlist_t *zplprops, boolean_t *is_ci)
+{
+ uint64_t sense = ZFS_PROP_UNDEFINED;
+ uint64_t norm = ZFS_PROP_UNDEFINED;
+ uint64_t u8 = ZFS_PROP_UNDEFINED;
+ int error;
+
+ ASSERT(zplprops != NULL);
+
+ /* parent dataset must be a filesystem */
+ if (os != NULL && os->os_phys->os_type != DMU_OST_ZFS)
+ return (SET_ERROR(ZFS_ERR_WRONG_PARENT));
+
+ /*
+ * Pull out creator prop choices, if any.
+ */
+ if (createprops) {
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), &zplver);
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), &norm);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY), &u8);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_UTF8ONLY));
+ (void) nvlist_lookup_uint64(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE), &sense);
+ (void) nvlist_remove_all(createprops,
+ zfs_prop_to_name(ZFS_PROP_CASE));
+ }
+
+ /*
+ * If the zpl version requested is whacky or the file system
+ * or pool version is too "young" to support normalization
+ * and the creator tried to set a value for one of the props,
+ * error out.
+ */
+ if ((zplver < ZPL_VERSION_INITIAL || zplver > ZPL_VERSION) ||
+ (zplver >= ZPL_VERSION_FUID && !fuids_ok) ||
+ (zplver >= ZPL_VERSION_SA && !sa_ok) ||
+ (zplver < ZPL_VERSION_NORMALIZATION &&
+ (norm != ZFS_PROP_UNDEFINED || u8 != ZFS_PROP_UNDEFINED ||
+ sense != ZFS_PROP_UNDEFINED)))
+ return (SET_ERROR(ENOTSUP));
+
+ /*
+ * Put the version in the zplprops
+ */
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_VERSION), zplver) == 0);
+
+ if (norm == ZFS_PROP_UNDEFINED &&
+ (error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &norm)) != 0)
+ return (error);
+ VERIFY(nvlist_add_uint64(zplprops,
+ zfs_prop_to_name(ZFS_PROP_NORMALIZE), norm) == 0);
+
+ /*
+ * If we're normalizing, names must always be valid UTF-8 strings.
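+ * (Unicode normalization is only defined over valid UTF-8 sequences,
+ * which is why a normalization setting forces utf8only on below.)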
+ */ + if (norm) + u8 = 1; + if (u8 == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &u8)) != 0) + return (error); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_UTF8ONLY), u8) == 0); + + if (sense == ZFS_PROP_UNDEFINED && + (error = zfs_get_zplprop(os, ZFS_PROP_CASE, &sense)) != 0) + return (error); + VERIFY(nvlist_add_uint64(zplprops, + zfs_prop_to_name(ZFS_PROP_CASE), sense) == 0); + + if (is_ci) + *is_ci = (sense == ZFS_CASE_INSENSITIVE); + + return (0); +} + +static int +zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok, sa_ok; + uint64_t zplver = ZPL_VERSION; + objset_t *os = NULL; + char parentname[ZFS_MAX_DATASET_NAME_LEN]; + spa_t *spa; + uint64_t spa_vers; + int error; + + zfs_get_parent(dataset, parentname, sizeof (parentname)); + + if ((error = spa_open(dataset, &spa, FTAG)) != 0) + return (error); + + spa_vers = spa_version(spa); + spa_close(spa, FTAG); + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); + + /* + * Open parent object set so we can inherit zplprop values. + */ + if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) + return (error); + + error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, sa_ok, createprops, + zplprops, is_ci); + dmu_objset_rele(os, FTAG); + return (error); +} + +static int +zfs_fill_zplprops_root(uint64_t spa_vers, nvlist_t *createprops, + nvlist_t *zplprops, boolean_t *is_ci) +{ + boolean_t fuids_ok; + boolean_t sa_ok; + uint64_t zplver = ZPL_VERSION; + int error; + + zplver = zfs_zpl_version_map(spa_vers); + fuids_ok = (zplver >= ZPL_VERSION_FUID); + sa_ok = (zplver >= ZPL_VERSION_SA); + + error = zfs_fill_zplprops_impl(NULL, zplver, fuids_ok, sa_ok, + createprops, zplprops, is_ci); + return (error); +} + +/* + * innvl: { + * "type" -> dmu_objset_type_t (int32) + * (optional) "props" -> { prop -> value } + * (optional) "hidden_args" -> { "wkeydata" -> value } + * raw uint8_t array of encryption wrapping key data (32 bytes) + * } + * + * outnvl: propname -> error code (int32) + */ + +static const zfs_ioc_key_t zfs_keys_create[] = { + {"type", DATA_TYPE_INT32, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_create(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error = 0; + zfs_creat_t zct = { 0 }; + nvlist_t *nvprops = NULL; + nvlist_t *hidden_args = NULL; + void (*cbfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); + dmu_objset_type_t type; + boolean_t is_insensitive = B_FALSE; + dsl_crypto_params_t *dcp = NULL; + + type = (dmu_objset_type_t)fnvlist_lookup_int32(innvl, "type"); + (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); + (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); + + switch (type) { + case DMU_OST_ZFS: + cbfunc = zfs_create_cb; + break; + + case DMU_OST_ZVOL: + cbfunc = zvol_create_cb; + break; + + default: + cbfunc = NULL; + break; + } + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + zct.zct_props = nvprops; + + if (cbfunc == NULL) + return (SET_ERROR(EINVAL)); + + if (type == DMU_OST_ZVOL) { + uint64_t volsize, volblocksize; + + if (nvprops == NULL) + return (SET_ERROR(EINVAL)); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) != 0) + return (SET_ERROR(EINVAL)); + + if ((error = nvlist_lookup_uint64(nvprops, + 
zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize)) != 0 && error != ENOENT) + return (SET_ERROR(EINVAL)); + + if (error != 0) + volblocksize = zfs_prop_default_numeric( + ZFS_PROP_VOLBLOCKSIZE); + + if ((error = zvol_check_volblocksize(fsname, + volblocksize)) != 0 || + (error = zvol_check_volsize(volsize, + volblocksize)) != 0) + return (error); + } else if (type == DMU_OST_ZFS) { + int error; + + /* + * We have to have normalization and + * case-folding flags correct when we do the + * file system creation, so go figure them out + * now. + */ + VERIFY(nvlist_alloc(&zct.zct_zplprops, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + error = zfs_fill_zplprops(fsname, nvprops, + zct.zct_zplprops, &is_insensitive); + if (error != 0) { + nvlist_free(zct.zct_zplprops); + return (error); + } + } + + error = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, nvprops, + hidden_args, &dcp); + if (error != 0) { + nvlist_free(zct.zct_zplprops); + return (error); + } + + error = dmu_objset_create(fsname, type, + is_insensitive ? DS_FLAG_CI_DATASET : 0, dcp, cbfunc, &zct); + + nvlist_free(zct.zct_zplprops); + dsl_crypto_params_free(dcp, !!error); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, + nvprops, outnvl); + if (error != 0) { + spa_t *spa; + int error2; + + /* + * Volumes will return EBUSY and cannot be destroyed + * until all asynchronous minor handling (e.g. from + * setting the volmode property) has completed. Wait for + * the spa_zvol_taskq to drain then retry. + */ + error2 = dsl_destroy_head(fsname); + while ((error2 == EBUSY) && (type == DMU_OST_ZVOL)) { + error2 = spa_open(fsname, &spa, FTAG); + if (error2 == 0) { + taskq_wait(spa->spa_zvol_taskq); + spa_close(spa, FTAG); + } + error2 = dsl_destroy_head(fsname); + } + } + } + return (error); +} + +/* + * innvl: { + * "origin" -> name of origin snapshot + * (optional) "props" -> { prop -> value } + * (optional) "hidden_args" -> { "wkeydata" -> value } + * raw uint8_t array of encryption wrapping key data (32 bytes) + * } + * + * outputs: + * outnvl: propname -> error code (int32) + */ +static const zfs_ioc_key_t zfs_keys_clone[] = { + {"origin", DATA_TYPE_STRING, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_clone(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error = 0; + nvlist_t *nvprops = NULL; + char *origin_name; + + origin_name = fnvlist_lookup_string(innvl, "origin"); + (void) nvlist_lookup_nvlist(innvl, "props", &nvprops); + + if (strchr(fsname, '@') || + strchr(fsname, '%')) + return (SET_ERROR(EINVAL)); + + if (dataset_namecheck(origin_name, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + + error = dmu_objset_clone(fsname, origin_name); + + /* + * It would be nice to do this atomically. + */ + if (error == 0) { + error = zfs_set_prop_nvlist(fsname, ZPROP_SRC_LOCAL, + nvprops, outnvl); + if (error != 0) + (void) dsl_destroy_head(fsname); + } + return (error); +} + +static const zfs_ioc_key_t zfs_keys_remap[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_remap(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + /* This IOCTL is no longer supported. 
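+ * It is retained as a harmless no-op, returning 0, presumably so
+ * that existing userland callers of ZFS_IOC_REMAP do not fail.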
*/ + return (0); +} + +/* + * innvl: { + * "snaps" -> { snapshot1, snapshot2 } + * (optional) "props" -> { prop -> value (string) } + * } + * + * outnvl: snapshot -> error code (int32) + */ +static const zfs_ioc_key_t zfs_keys_snapshot[] = { + {"snaps", DATA_TYPE_NVLIST, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_snapshot(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + nvlist_t *snaps; + nvlist_t *props = NULL; + int error, poollen; + nvpair_t *pair; + + (void) nvlist_lookup_nvlist(innvl, "props", &props); + if (!nvlist_empty(props) && + zfs_earlier_version(poolname, SPA_VERSION_SNAP_PROPS)) + return (SET_ERROR(ENOTSUP)); + if ((error = zfs_check_userprops(props)) != 0) + return (error); + + snaps = fnvlist_lookup_nvlist(innvl, "snaps"); + poollen = strlen(poolname); + for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL; + pair = nvlist_next_nvpair(snaps, pair)) { + const char *name = nvpair_name(pair); + char *cp = strchr(name, '@'); + + /* + * The snap name must contain an @, and the part after it must + * contain only valid characters. + */ + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + + /* + * The snap must be in the specified pool. + */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '@')) + return (SET_ERROR(EXDEV)); + + /* + * Check for permission to set the properties on the fs. + */ + if (!nvlist_empty(props)) { + *cp = '\0'; + error = zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_USERPROP, CRED()); + *cp = '@'; + if (error != 0) + return (error); + } + + /* This must be the only snap of this fs. */ + for (nvpair_t *pair2 = nvlist_next_nvpair(snaps, pair); + pair2 != NULL; pair2 = nvlist_next_nvpair(snaps, pair2)) { + if (strncmp(name, nvpair_name(pair2), cp - name + 1) + == 0) { + return (SET_ERROR(EXDEV)); + } + } + } + + error = dsl_dataset_snapshot(snaps, props, outnvl); + + return (error); +} + +/* + * innvl: "message" -> string + */ +static const zfs_ioc_key_t zfs_keys_log_history[] = { + {"message", DATA_TYPE_STRING, 0}, +}; + +/* ARGSUSED */ +static int +zfs_ioc_log_history(const char *unused, nvlist_t *innvl, nvlist_t *outnvl) +{ + char *message; + spa_t *spa; + int error; + char *poolname; + + /* + * The poolname in the ioctl is not set, we get it from the TSD, + * which was set at the end of the last successful ioctl that allows + * logging. The secpolicy func already checked that it is set. + * Only one log ioctl is allowed after each successful ioctl, so + * we clear the TSD here. + */ + poolname = tsd_get(zfs_allow_log_key); + if (poolname == NULL) + return (SET_ERROR(EINVAL)); + (void) tsd_set(zfs_allow_log_key, NULL); + error = spa_open(poolname, &spa, FTAG); + kmem_strfree(poolname); + if (error != 0) + return (error); + + message = fnvlist_lookup_string(innvl, "message"); + + if (spa_version(spa) < SPA_VERSION_ZPOOL_HISTORY) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + error = spa_history_log(spa, message); + spa_close(spa, FTAG); + return (error); +} + +/* + * This ioctl is used to set the bootenv configuration on the current + * pool. This configuration is stored in the second padding area of the label, + * and it is used by the GRUB bootloader used on Linux to store the contents + * of the grubenv file. The file is stored as raw ASCII, and is protected by + * an embedded checksum. 
+ * By default, GRUB will check if the boot filesystem
+ * supports storing the environment data in a special location, and if so,
+ * will invoke filesystem-specific logic to retrieve it. This can be overridden
+ * by a variable, should the user so desire.
+ */
+/* ARGSUSED */
+static const zfs_ioc_key_t zfs_keys_set_bootenv[] = {
+ {"envmap", DATA_TYPE_STRING, 0},
+};
+
+static int
+zfs_ioc_set_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ char *envmap;
+ int error;
+ spa_t *spa;
+
+ envmap = fnvlist_lookup_string(innvl, "envmap");
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_write_bootenv(spa->spa_root_vdev, envmap);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+static const zfs_ioc_key_t zfs_keys_get_bootenv[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_bootenv(const char *name, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ spa_t *spa;
+ int error;
+
+ if ((error = spa_open(name, &spa, FTAG)) != 0)
+ return (error);
+ spa_vdev_state_enter(spa, SCL_ALL);
+ error = vdev_label_read_bootenv(spa->spa_root_vdev, outnvl);
+ (void) spa_vdev_state_exit(spa, NULL, 0);
+ spa_close(spa, FTAG);
+ return (error);
+}
+
+/*
+ * The dp_config_rwlock must not be held when calling this, because the
+ * unmount may need to write out data.
+ *
+ * This function is best-effort. Callers must deal gracefully if it
+ * remains mounted (or is remounted after this call).
+ *
+ * Nothing is returned; the call is a no-op if the argument is not a
+ * snapshot, and any failure to unmount is silently ignored.
+ */
+void
+zfs_unmount_snap(const char *snapname)
+{
+ if (strchr(snapname, '@') == NULL)
+ return;
+
+ (void) zfsctl_snapshot_unmount((char *)snapname, MNT_FORCE);
+}
+
+/* ARGSUSED */
+static int
+zfs_unmount_snap_cb(const char *snapname, void *arg)
+{
+ zfs_unmount_snap(snapname);
+ return (0);
+}
+
+/*
+ * When a clone is destroyed, its origin may also need to be destroyed,
+ * in which case it must be unmounted. This routine will do that unmount
+ * if necessary.
+ */
+void
+zfs_destroy_unmount_origin(const char *fsname)
+{
+ int error;
+ objset_t *os;
+ dsl_dataset_t *ds;
+
+ error = dmu_objset_hold(fsname, FTAG, &os);
+ if (error != 0)
+ return;
+ ds = dmu_objset_ds(os);
+ if (dsl_dir_is_clone(ds->ds_dir) && DS_IS_DEFER_DESTROY(ds->ds_prev)) {
+ char originname[ZFS_MAX_DATASET_NAME_LEN];
+ dsl_dataset_name(ds->ds_prev, originname);
+ dmu_objset_rele(os, FTAG);
+ zfs_unmount_snap(originname);
+ } else {
+ dmu_objset_rele(os, FTAG);
+ }
+}
+
+/*
+ * innvl: {
+ * "snaps" -> { snapshot1, snapshot2 }
+ * (optional boolean) "defer"
+ * }
+ *
+ * outnvl: snapshot -> error code (int32)
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_snaps[] = {
+ {"snaps", DATA_TYPE_NVLIST, 0},
+ {"defer", DATA_TYPE_BOOLEAN, ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_destroy_snaps(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ int poollen;
+ nvlist_t *snaps;
+ nvpair_t *pair;
+ boolean_t defer;
+ spa_t *spa;
+
+ snaps = fnvlist_lookup_nvlist(innvl, "snaps");
+ defer = nvlist_exists(innvl, "defer");
+
+ poollen = strlen(poolname);
+ for (pair = nvlist_next_nvpair(snaps, NULL); pair != NULL;
+ pair = nvlist_next_nvpair(snaps, pair)) {
+ const char *name = nvpair_name(pair);
+
+ /*
+ * The snap must be in the specified pool to prevent the
+ * invalid removal of zvol minors below.
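+ * For example, a request against pool "tank" that names
+ * "dozer/vol@snap" fails here with EXDEV.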
+ */
+ if (strncmp(name, poolname, poollen) != 0 ||
+ (name[poollen] != '/' && name[poollen] != '@'))
+ return (SET_ERROR(EXDEV));
+
+ zfs_unmount_snap(nvpair_name(pair));
+ if (spa_open(name, &spa, FTAG) == 0) {
+ zvol_remove_minors(spa, name, B_TRUE);
+ spa_close(spa, FTAG);
+ }
+ }
+
+ return (dsl_destroy_snapshots_nvl(snaps, defer, outnvl));
+}
+
+/*
+ * Create bookmarks. The bookmark names are of the form <fs>#<bmark>.
+ * All bookmarks and snapshots must be in the same pool.
+ * dsl_bookmark_create_nvl_validate describes the nvlist schema in more detail.
+ *
+ * innvl: {
+ * new_bookmark1 -> existing_snapshot,
+ * new_bookmark2 -> existing_bookmark,
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_bookmark[] = {
+ {"...", DATA_TYPE_STRING, ZK_WILDCARDLIST},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_bookmark(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_bookmark_create(innvl, outnvl));
+}
+
+/*
+ * innvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ * outnvl: {
+ * bookmark name 1 -> { property 1, property 2, ... },
+ * bookmark name 2 -> { property 1, property 2, ... }
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmarks[] = {
+ {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST | ZK_OPTIONAL},
+};
+
+static int
+zfs_ioc_get_bookmarks(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ return (dsl_get_bookmarks(fsname, innvl, outnvl));
+}
+
+/*
+ * innvl is not used.
+ *
+ * outnvl: {
+ * property 1, property 2, ...
+ * }
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_get_bookmark_props[] = {
+ /* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_get_bookmark_props(const char *bookmark, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ char fsname[ZFS_MAX_DATASET_NAME_LEN];
+ char *bmname;
+
+ bmname = strchr(bookmark, '#');
+ if (bmname == NULL)
+ return (SET_ERROR(EINVAL));
+ bmname++;
+
+ (void) strlcpy(fsname, bookmark, sizeof (fsname));
+ *(strchr(fsname, '#')) = '\0';
+
+ return (dsl_get_bookmark_props(fsname, bmname, outnvl));
+}
+
+/*
+ * innvl: {
+ * bookmark name 1, bookmark name 2
+ * }
+ *
+ * outnvl: bookmark -> error code (int32)
+ *
+ */
+static const zfs_ioc_key_t zfs_keys_destroy_bookmarks[] = {
+ {"...", DATA_TYPE_BOOLEAN, ZK_WILDCARDLIST},
+};
+
+static int
+zfs_ioc_destroy_bookmarks(const char *poolname, nvlist_t *innvl,
+ nvlist_t *outnvl)
+{
+ int error, poollen;
+
+ poollen = strlen(poolname);
+ for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) {
+ const char *name = nvpair_name(pair);
+ const char *cp = strchr(name, '#');
+
+ /*
+ * The bookmark name must contain a '#', and the part after it
+ * must contain only valid characters.
+ */
+ if (cp == NULL ||
+ zfs_component_namecheck(cp + 1, NULL, NULL) != 0)
+ return (SET_ERROR(EINVAL));
+
+ /*
+ * The bookmark must be in the specified pool.
+ */ + if (strncmp(name, poolname, poollen) != 0 || + (name[poollen] != '/' && name[poollen] != '#')) + return (SET_ERROR(EXDEV)); + } + + error = dsl_bookmark_destroy(innvl, outnvl); + return (error); +} + +static const zfs_ioc_key_t zfs_keys_channel_program[] = { + {"program", DATA_TYPE_STRING, 0}, + {"arg", DATA_TYPE_ANY, 0}, + {"sync", DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, + {"instrlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"memlimit", DATA_TYPE_UINT64, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_channel_program(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + char *program; + uint64_t instrlimit, memlimit; + boolean_t sync_flag; + nvpair_t *nvarg = NULL; + + program = fnvlist_lookup_string(innvl, ZCP_ARG_PROGRAM); + if (0 != nvlist_lookup_boolean_value(innvl, ZCP_ARG_SYNC, &sync_flag)) { + sync_flag = B_TRUE; + } + if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_INSTRLIMIT, &instrlimit)) { + instrlimit = ZCP_DEFAULT_INSTRLIMIT; + } + if (0 != nvlist_lookup_uint64(innvl, ZCP_ARG_MEMLIMIT, &memlimit)) { + memlimit = ZCP_DEFAULT_MEMLIMIT; + } + nvarg = fnvlist_lookup_nvpair(innvl, ZCP_ARG_ARGLIST); + + if (instrlimit == 0 || instrlimit > zfs_lua_max_instrlimit) + return (SET_ERROR(EINVAL)); + if (memlimit == 0 || memlimit > zfs_lua_max_memlimit) + return (SET_ERROR(EINVAL)); + + return (zcp_eval(poolname, program, sync_flag, instrlimit, memlimit, + nvarg, outnvl)); +} + +/* + * innvl: unused + * outnvl: empty + */ +static const zfs_ioc_key_t zfs_keys_pool_checkpoint[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_pool_checkpoint(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + return (spa_checkpoint(poolname)); +} + +/* + * innvl: unused + * outnvl: empty + */ +static const zfs_ioc_key_t zfs_keys_pool_discard_checkpoint[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_pool_discard_checkpoint(const char *poolname, nvlist_t *innvl, + nvlist_t *outnvl) +{ + return (spa_checkpoint_discard(poolname)); +} + +/* + * inputs: + * zc_name name of dataset to destroy + * zc_defer_destroy mark for deferred destroy + * + * outputs: none + */ +static int +zfs_ioc_destroy(zfs_cmd_t *zc) +{ + objset_t *os; + dmu_objset_type_t ost; + int err; + + err = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (err != 0) + return (err); + ost = dmu_objset_type(os); + dmu_objset_rele(os, FTAG); + + if (ost == DMU_OST_ZFS) + zfs_unmount_snap(zc->zc_name); + + if (strchr(zc->zc_name, '@')) { + err = dsl_destroy_snapshot(zc->zc_name, zc->zc_defer_destroy); + } else { + err = dsl_destroy_head(zc->zc_name); + if (err == EEXIST) { + /* + * It is possible that the given DS may have + * hidden child (%recv) datasets - "leftovers" + * resulting from the previously interrupted + * 'zfs receive'. + * + * 6 extra bytes for /%recv + */ + char namebuf[ZFS_MAX_DATASET_NAME_LEN + 6]; + + if (snprintf(namebuf, sizeof (namebuf), "%s/%s", + zc->zc_name, recv_clone_name) >= + sizeof (namebuf)) + return (SET_ERROR(EINVAL)); + + /* + * Try to remove the hidden child (%recv) and after + * that try to remove the target dataset. 
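+ * (For "tank/fs", for example, the hidden child is "tank/fs/%recv".)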
+ * If the hidden child (%recv) does not exist,
+ * the original error (EEXIST) will be returned.
+ */
+ err = dsl_destroy_head(namebuf);
+ if (err == 0)
+ err = dsl_destroy_head(zc->zc_name);
+ else if (err == ENOENT)
+ err = SET_ERROR(EEXIST);
+ }
+ }
+
+ return (err);
+}
+
+/*
+ * innvl: {
+ * "initialize_command" -> POOL_INITIALIZE_{CANCEL|START|SUSPEND} (uint64)
+ * "initialize_vdevs": { -> guids to initialize (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
+ * },
+ * }
+ *
+ * outnvl: {
+ * "initialize_vdevs": { -> initialization errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
+ * ...
+ * }
+ * }
+ *
+ * EINVAL is returned for unknown commands or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
+ */
+static const zfs_ioc_key_t zfs_keys_pool_initialize[] = {
+ {ZPOOL_INITIALIZE_COMMAND, DATA_TYPE_UINT64, 0},
+ {ZPOOL_INITIALIZE_VDEVS, DATA_TYPE_NVLIST, 0}
+};
+
+static int
+zfs_ioc_pool_initialize(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+ uint64_t cmd_type;
+ if (nvlist_lookup_uint64(innvl, ZPOOL_INITIALIZE_COMMAND,
+ &cmd_type) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ if (!(cmd_type == POOL_INITIALIZE_CANCEL ||
+ cmd_type == POOL_INITIALIZE_START ||
+ cmd_type == POOL_INITIALIZE_SUSPEND)) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ nvlist_t *vdev_guids;
+ if (nvlist_lookup_nvlist(innvl, ZPOOL_INITIALIZE_VDEVS,
+ &vdev_guids) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+
+ for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL);
+ pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) {
+ uint64_t vdev_guid;
+ if (nvpair_value_uint64(pair, &vdev_guid) != 0) {
+ return (SET_ERROR(EINVAL));
+ }
+ }
+
+ spa_t *spa;
+ int error = spa_open(poolname, &spa, FTAG);
+ if (error != 0)
+ return (error);
+
+ nvlist_t *vdev_errlist = fnvlist_alloc();
+ int total_errors = spa_vdev_initialize(spa, vdev_guids, cmd_type,
+ vdev_errlist);
+
+ if (fnvlist_size(vdev_errlist) > 0) {
+ fnvlist_add_nvlist(outnvl, ZPOOL_INITIALIZE_VDEVS,
+ vdev_errlist);
+ }
+ fnvlist_free(vdev_errlist);
+
+ spa_close(spa, FTAG);
+ return (total_errors > 0 ? EINVAL : 0);
+}
+
+/*
+ * innvl: {
+ * "trim_command" -> POOL_TRIM_{CANCEL|START|SUSPEND} (uint64)
+ * "trim_vdevs": { -> guids to TRIM (nvlist)
+ * "vdev_path_1": vdev_guid_1, (uint64),
+ * "vdev_path_2": vdev_guid_2, (uint64),
+ * ...
+ * },
+ * "trim_rate" -> Target TRIM rate in bytes/sec.
+ * "trim_secure" -> Set to request a secure TRIM.
+ * }
+ *
+ * outnvl: {
+ * "trim_vdevs": { -> TRIM errors (nvlist)
+ * "vdev_path_1": errno, see function body for possible errnos (uint64)
+ * "vdev_path_2": errno, ... (uint64)
+ * ...
+ * }
+ * }
+ *
+ * EINVAL is returned for unknown commands or if any of the provided vdev
+ * guids have been specified with a type other than uint64.
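+ *
+ * For example (illustrative values only), a secure TRIM of a single
+ * disk capped at 100 MiB/s would pass:
+ *
+ *     "trim_command" -> POOL_TRIM_START
+ *     "trim_vdevs" -> { "/dev/sda": <vdev guid> }
+ *     "trim_rate" -> 104857600
+ *     "trim_secure" -> B_TRUE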
+ */ +static const zfs_ioc_key_t zfs_keys_pool_trim[] = { + {ZPOOL_TRIM_COMMAND, DATA_TYPE_UINT64, 0}, + {ZPOOL_TRIM_VDEVS, DATA_TYPE_NVLIST, 0}, + {ZPOOL_TRIM_RATE, DATA_TYPE_UINT64, ZK_OPTIONAL}, + {ZPOOL_TRIM_SECURE, DATA_TYPE_BOOLEAN_VALUE, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_pool_trim(const char *poolname, nvlist_t *innvl, nvlist_t *outnvl) +{ + uint64_t cmd_type; + if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_COMMAND, &cmd_type) != 0) + return (SET_ERROR(EINVAL)); + + if (!(cmd_type == POOL_TRIM_CANCEL || + cmd_type == POOL_TRIM_START || + cmd_type == POOL_TRIM_SUSPEND)) { + return (SET_ERROR(EINVAL)); + } + + nvlist_t *vdev_guids; + if (nvlist_lookup_nvlist(innvl, ZPOOL_TRIM_VDEVS, &vdev_guids) != 0) + return (SET_ERROR(EINVAL)); + + for (nvpair_t *pair = nvlist_next_nvpair(vdev_guids, NULL); + pair != NULL; pair = nvlist_next_nvpair(vdev_guids, pair)) { + uint64_t vdev_guid; + if (nvpair_value_uint64(pair, &vdev_guid) != 0) { + return (SET_ERROR(EINVAL)); + } + } + + /* Optional, defaults to maximum rate when not provided */ + uint64_t rate; + if (nvlist_lookup_uint64(innvl, ZPOOL_TRIM_RATE, &rate) != 0) + rate = 0; + + /* Optional, defaults to standard TRIM when not provided */ + boolean_t secure; + if (nvlist_lookup_boolean_value(innvl, ZPOOL_TRIM_SECURE, + &secure) != 0) { + secure = B_FALSE; + } + + spa_t *spa; + int error = spa_open(poolname, &spa, FTAG); + if (error != 0) + return (error); + + nvlist_t *vdev_errlist = fnvlist_alloc(); + int total_errors = spa_vdev_trim(spa, vdev_guids, cmd_type, + rate, !!zfs_trim_metaslab_skip, secure, vdev_errlist); + + if (fnvlist_size(vdev_errlist) > 0) + fnvlist_add_nvlist(outnvl, ZPOOL_TRIM_VDEVS, vdev_errlist); + + fnvlist_free(vdev_errlist); + + spa_close(spa, FTAG); + return (total_errors > 0 ? EINVAL : 0); +} + +/* + * This ioctl waits for activity of a particular type to complete. If there is + * no activity of that type in progress, it returns immediately, and the + * returned value "waited" is false. If there is activity in progress, and no + * tag is passed in, the ioctl blocks until all activity of that type is + * complete, and then returns with "waited" set to true. + * + * If a tag is provided, it identifies a particular instance of an activity to + * wait for. Currently, this is only valid for use with 'initialize', because + * that is the only activity for which there can be multiple instances running + * concurrently. In the case of 'initialize', the tag corresponds to the guid of + * the vdev on which to wait. + * + * If a thread waiting in the ioctl receives a signal, the call will return + * immediately, and the return value will be EINTR. + * + * innvl: { + * "wait_activity" -> int32_t + * (optional) "wait_tag" -> uint64_t + * } + * + * outnvl: "waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_pool_wait[] = { + {ZPOOL_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, + {ZPOOL_WAIT_TAG, DATA_TYPE_UINT64, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_wait(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t activity; + uint64_t tag; + boolean_t waited; + int error; + + if (nvlist_lookup_int32(innvl, ZPOOL_WAIT_ACTIVITY, &activity) != 0) + return (EINVAL); + + if (nvlist_lookup_uint64(innvl, ZPOOL_WAIT_TAG, &tag) == 0) + error = spa_wait_tag(name, activity, tag, &waited); + else + error = spa_wait(name, activity, &waited); + + if (error == 0) + fnvlist_add_boolean_value(outnvl, ZPOOL_WAIT_WAITED, waited); + + return (error); +} + +/* + * This ioctl waits for activity of a particular type to complete. 
If there is + * no activity of that type in progress, it returns immediately, and the + * returned value "waited" is false. If there is activity in progress, and no + * tag is passed in, the ioctl blocks until all activity of that type is + * complete, and then returns with "waited" set to true. + * + * If a thread waiting in the ioctl receives a signal, the call will return + * immediately, and the return value will be EINTR. + * + * innvl: { + * "wait_activity" -> int32_t + * } + * + * outnvl: "waited" -> boolean_t + */ +static const zfs_ioc_key_t zfs_keys_fs_wait[] = { + {ZFS_WAIT_ACTIVITY, DATA_TYPE_INT32, 0}, +}; + +static int +zfs_ioc_wait_fs(const char *name, nvlist_t *innvl, nvlist_t *outnvl) +{ + int32_t activity; + boolean_t waited = B_FALSE; + int error; + dsl_pool_t *dp; + dsl_dir_t *dd; + dsl_dataset_t *ds; + + if (nvlist_lookup_int32(innvl, ZFS_WAIT_ACTIVITY, &activity) != 0) + return (SET_ERROR(EINVAL)); + + if (activity >= ZFS_WAIT_NUM_ACTIVITIES || activity < 0) + return (SET_ERROR(EINVAL)); + + if ((error = dsl_pool_hold(name, FTAG, &dp)) != 0) + return (error); + + if ((error = dsl_dataset_hold(dp, name, FTAG, &ds)) != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + dd = ds->ds_dir; + mutex_enter(&dd->dd_activity_lock); + dd->dd_activity_waiters++; + + /* + * We get a long-hold here so that the dsl_dataset_t and dsl_dir_t + * aren't evicted while we're waiting. Normally this is prevented by + * holding the pool, but we can't do that while we're waiting since + * that would prevent TXGs from syncing out. Some of the functionality + * of long-holds (e.g. preventing deletion) is unnecessary for this + * case, since we would cancel the waiters before proceeding with a + * deletion. An alternative mechanism for keeping the dataset around + * could be developed but this is simpler. + */ + dsl_dataset_long_hold(ds, FTAG); + dsl_pool_rele(dp, FTAG); + + error = dsl_dir_wait(dd, ds, activity, &waited); + + dsl_dataset_long_rele(ds, FTAG); + dd->dd_activity_waiters--; + if (dd->dd_activity_waiters == 0) + cv_signal(&dd->dd_activity_cv); + mutex_exit(&dd->dd_activity_lock); + + dsl_dataset_rele(ds, FTAG); + + if (error == 0) + fnvlist_add_boolean_value(outnvl, ZFS_WAIT_WAITED, waited); + + return (error); +} + +/* + * fsname is name of dataset to rollback (to most recent snapshot) + * + * innvl may contain name of expected target snapshot + * + * outnvl: "target" -> name of most recent snapshot + * } + */ +static const zfs_ioc_key_t zfs_keys_rollback[] = { + {"target", DATA_TYPE_STRING, ZK_OPTIONAL}, +}; + +/* ARGSUSED */ +static int +zfs_ioc_rollback(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + zfsvfs_t *zfsvfs; + zvol_state_handle_t *zv; + char *target = NULL; + int error; + + (void) nvlist_lookup_string(innvl, "target", &target); + if (target != NULL) { + const char *cp = strchr(target, '@'); + + /* + * The snap name must contain an @, and the part after it must + * contain only valid characters. + */ + if (cp == NULL || + zfs_component_namecheck(cp + 1, NULL, NULL) != 0) + return (SET_ERROR(EINVAL)); + } + + if (getzfsvfs(fsname, &zfsvfs) == 0) { + dsl_dataset_t *ds; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + if (error == 0) { + int resume_err; + + error = dsl_dataset_rollback(fsname, target, zfsvfs, + outnvl); + resume_err = zfs_resume_fs(zfsvfs, ds); + error = error ? 
error : resume_err; + } + zfs_vfs_rele(zfsvfs); + } else if ((zv = zvol_suspend(fsname)) != NULL) { + error = dsl_dataset_rollback(fsname, target, zvol_tag(zv), + outnvl); + zvol_resume(zv); + } else { + error = dsl_dataset_rollback(fsname, target, NULL, outnvl); + } + return (error); +} + +static int +recursive_unmount(const char *fsname, void *arg) +{ + const char *snapname = arg; + char *fullname; + + fullname = kmem_asprintf("%s@%s", fsname, snapname); + zfs_unmount_snap(fullname); + kmem_strfree(fullname); + + return (0); +} + +/* + * + * snapname is the snapshot to redact. + * innvl: { + * "bookname" -> (string) + * shortname of the redaction bookmark to generate + * "snapnv" -> (nvlist, values ignored) + * snapshots to redact snapname with respect to + * } + * + * outnvl is unused + */ + +/* ARGSUSED */ +static const zfs_ioc_key_t zfs_keys_redact[] = { + {"bookname", DATA_TYPE_STRING, 0}, + {"snapnv", DATA_TYPE_NVLIST, 0}, +}; +static int +zfs_ioc_redact(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + nvlist_t *redactnvl = NULL; + char *redactbook = NULL; + + if (nvlist_lookup_nvlist(innvl, "snapnv", &redactnvl) != 0) + return (SET_ERROR(EINVAL)); + if (fnvlist_num_pairs(redactnvl) == 0) + return (SET_ERROR(ENXIO)); + if (nvlist_lookup_string(innvl, "bookname", &redactbook) != 0) + return (SET_ERROR(EINVAL)); + + return (dmu_redact_snap(snapname, redactnvl, redactbook)); +} + +/* + * inputs: + * zc_name old name of dataset + * zc_value new name of dataset + * zc_cookie recursive flag (only valid for snapshots) + * + * outputs: none + */ +static int +zfs_ioc_rename(zfs_cmd_t *zc) +{ + objset_t *os; + dmu_objset_type_t ost; + boolean_t recursive = zc->zc_cookie & 1; + char *at; + int err; + + /* "zfs rename" from and to ...%recv datasets should both fail */ + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + zc->zc_value[sizeof (zc->zc_value) - 1] = '\0'; + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 || + dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_name, '%') || strchr(zc->zc_value, '%')) + return (SET_ERROR(EINVAL)); + + err = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (err != 0) + return (err); + ost = dmu_objset_type(os); + dmu_objset_rele(os, FTAG); + + at = strchr(zc->zc_name, '@'); + if (at != NULL) { + /* snaps must be in same fs */ + int error; + + if (strncmp(zc->zc_name, zc->zc_value, at - zc->zc_name + 1)) + return (SET_ERROR(EXDEV)); + *at = '\0'; + if (ost == DMU_OST_ZFS) { + error = dmu_objset_find(zc->zc_name, + recursive_unmount, at + 1, + recursive ? 
DS_FIND_CHILDREN : 0); + if (error != 0) { + *at = '@'; + return (error); + } + } + error = dsl_dataset_rename_snapshot(zc->zc_name, + at + 1, strchr(zc->zc_value, '@') + 1, recursive); + *at = '@'; + + return (error); + } else { + return (dsl_dir_rename(zc->zc_name, zc->zc_value)); + } +} + +static int +zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) +{ + const char *propname = nvpair_name(pair); + boolean_t issnap = (strchr(dsname, '@') != NULL); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval, compval; + int err; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if ((err = zfs_secpolicy_write_perms(dsname, + ZFS_DELEG_PERM_USERPROP, cr))) + return (err); + return (0); + } + + if (!issnap && zfs_prop_userquota(propname)) { + const char *perm = NULL; + const char *uq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; + const char *gq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; + const char *uiq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_USEROBJQUOTA]; + const char *giq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPOBJQUOTA]; + const char *pq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_PROJECTQUOTA]; + const char *piq_prefix = zfs_userquota_prop_prefixes[\ + ZFS_PROP_PROJECTOBJQUOTA]; + + if (strncmp(propname, uq_prefix, + strlen(uq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_USERQUOTA; + } else if (strncmp(propname, uiq_prefix, + strlen(uiq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_USEROBJQUOTA; + } else if (strncmp(propname, gq_prefix, + strlen(gq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_GROUPQUOTA; + } else if (strncmp(propname, giq_prefix, + strlen(giq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_GROUPOBJQUOTA; + } else if (strncmp(propname, pq_prefix, + strlen(pq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_PROJECTQUOTA; + } else if (strncmp(propname, piq_prefix, + strlen(piq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_PROJECTOBJQUOTA; + } else { + /* {USER|GROUP|PROJECT}USED are read-only */ + return (SET_ERROR(EINVAL)); + } + + if ((err = zfs_secpolicy_write_perms(dsname, perm, cr))) + return (err); + return (0); + } + + return (SET_ERROR(EINVAL)); + } + + if (issnap) + return (SET_ERROR(EINVAL)); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. + */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. 
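+ * (ZIO_COMPRESS_ALGO() strips the compression-level bits that zstd
+ * encodes in the upper bits of the property value, so the comparisons
+ * below see only the base algorithm.)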
+ */ + if (nvpair_value_uint64(pair, &intval) == 0) { + compval = ZIO_COMPRESS_ALGO(intval); + if (compval >= ZIO_COMPRESS_GZIP_1 && + compval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(dsname, + SPA_VERSION_GZIP_COMPRESSION)) { + return (SET_ERROR(ENOTSUP)); + } + + if (compval == ZIO_COMPRESS_ZLE && + zfs_earlier_version(dsname, + SPA_VERSION_ZLE_COMPRESSION)) + return (SET_ERROR(ENOTSUP)); + + if (compval == ZIO_COMPRESS_LZ4) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LZ4_COMPRESS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + + if (compval == ZIO_COMPRESS_ZSTD) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_ZSTD_COMPRESS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + } + break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) + return (SET_ERROR(ENOTSUP)); + break; + + case ZFS_PROP_VOLBLOCKSIZE: + case ZFS_PROP_RECORDSIZE: + /* Record sizes above 128k need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval > SPA_OLD_MAXBLOCKSIZE) { + spa_t *spa; + + /* + * We don't allow setting the property above 1MB, + * unless the tunable has been changed. + */ + if (intval > zfs_max_recordsize || + intval > SPA_MAXBLOCKSIZE) + return (SET_ERROR(ERANGE)); + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_BLOCKS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + + case ZFS_PROP_DNODESIZE: + /* Dnode sizes above 512 need the feature to be enabled */ + if (nvpair_value_uint64(pair, &intval) == 0 && + intval != ZFS_DNSIZE_LEGACY) { + spa_t *spa; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, + SPA_FEATURE_LARGE_DNODE)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + + case ZFS_PROP_SPECIAL_SMALL_BLOCKS: + /* + * This property could require the allocation classes + * feature to be active for setting, however we allow + * it so that tests of settable properties succeed. + * The CLI will issue a warning in this case. 
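+ * (Without a special allocation class vdev the setting is inert:
+ * small blocks simply continue to be allocated from the normal
+ * class.)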
+ */ + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) + return (SET_ERROR(ENOTSUP)); + break; + + case ZFS_PROP_ACLINHERIT: + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval == ZFS_ACL_PASSTHROUGH_X && + zfs_earlier_version(dsname, + SPA_VERSION_PASSTHROUGH_X)) + return (SET_ERROR(ENOTSUP)); + } + break; + case ZFS_PROP_CHECKSUM: + case ZFS_PROP_DEDUP: + { + spa_feature_t feature; + spa_t *spa; + int err; + + /* dedup feature version checks */ + if (prop == ZFS_PROP_DEDUP && + zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (SET_ERROR(ENOTSUP)); + + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + /* check prop value is enabled in features */ + feature = zio_checksum_to_feature( + intval & ZIO_CHECKSUM_MASK); + if (feature == SPA_FEATURE_NONE) + break; + + if ((err = spa_open(dsname, &spa, FTAG)) != 0) + return (err); + + if (!spa_feature_is_enabled(spa, feature)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + spa_close(spa, FTAG); + } + break; + } + + default: + break; + } + + return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); +} + +/* + * Removes properties from the given props list that fail permission checks + * needed to clear them and to restore them in case of a receive error. For each + * property, make sure we have both set and inherit permissions. + * + * Returns the first error encountered if any permission checks fail. If the + * caller provides a non-NULL errlist, it also gives the complete list of names + * of all the properties that failed a permission check along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property checks out successfully, zero is returned and the list + * pointed at by errlist is NULL. 
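+ *
+ * Illustrative example (editor's sketch; the errno is hypothetical):
+ * given props { quota, readonly } where the caller may not clear
+ * "quota", "quota" is removed from props, errlist comes back as
+ * { "quota" -> EPERM }, and EPERM is returned.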
+ */ +static int +zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) +{ + zfs_cmd_t *zc; + nvpair_t *pair, *next_pair; + nvlist_t *errors; + int err, rv = 0; + + if (props == NULL) + return (0); + + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strlcpy(zc->zc_name, dataset, sizeof (zc->zc_name)); + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + next_pair = nvlist_next_nvpair(props, pair); + + (void) strlcpy(zc->zc_value, nvpair_name(pair), + sizeof (zc->zc_value)); + if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || + (err = zfs_secpolicy_inherit_prop(zc, NULL, CRED())) != 0) { + VERIFY(nvlist_remove_nvpair(props, pair) == 0); + VERIFY(nvlist_add_int32(errors, + zc->zc_value, err) == 0); + } + pair = next_pair; + } + kmem_free(zc, sizeof (zfs_cmd_t)); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); +} + +static boolean_t +propval_equals(nvpair_t *p1, nvpair_t *p2) +{ + if (nvpair_type(p1) == DATA_TYPE_NVLIST) { + /* dsl_prop_get_all_impl() format */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p1) == 0); + } + + if (nvpair_type(p2) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p2) == 0); + } + + if (nvpair_type(p1) != nvpair_type(p2)) + return (B_FALSE); + + if (nvpair_type(p1) == DATA_TYPE_STRING) { + char *valstr1, *valstr2; + + VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); + VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + return (strcmp(valstr1, valstr2) == 0); + } else { + uint64_t intval1, intval2; + + VERIFY(nvpair_value_uint64(p1, &intval1) == 0); + VERIFY(nvpair_value_uint64(p2, &intval2) == 0); + return (intval1 == intval2); + } +} + +/* + * Remove properties from props if they are not going to change (as determined + * by comparison with origprops). Remove them from origprops as well, since we + * do not need to clear or restore properties that won't change. + */ +static void +props_reduce(nvlist_t *props, nvlist_t *origprops) +{ + nvpair_t *pair, *next_pair; + + if (origprops == NULL) + return; /* all props need to be received */ + + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + const char *propname = nvpair_name(pair); + nvpair_t *match; + + next_pair = nvlist_next_nvpair(props, pair); + + if ((nvlist_lookup_nvpair(origprops, propname, + &match) != 0) || !propval_equals(pair, match)) + goto next; /* need to set received value */ + + /* don't clear the existing received value */ + (void) nvlist_remove_nvpair(origprops, match); + /* don't bother receiving the property */ + (void) nvlist_remove_nvpair(props, pair); +next: + pair = next_pair; + } +} + +/* + * Extract properties that cannot be set PRIOR to the receipt of a dataset. + * For example, refquota cannot be set until after the receipt of a dataset, + * because in replication streams, an older/earlier snapshot may exceed the + * refquota. We want to receive the older/earlier snapshot, but setting + * refquota pre-receipt will set the dsl's ACTUAL quota, which will prevent + * the older/earlier snapshot from being received (with EDQUOT). 
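+ * For example, a replication stream may contain an early snapshot
+ * that references 2G under a final refquota of 1G; applying
+ * refquota=1G up front would reject that snapshot with EDQUOT.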
+ *
+ * The ZFS test "zfs_receive_011_pos" demonstrates such a scenario.
+ *
+ * libzfs will need to be judicious in handling errors encountered by props
+ * extracted by this function.
+ */
+static nvlist_t *
+extract_delay_props(nvlist_t *props)
+{
+ nvlist_t *delayprops;
+ nvpair_t *nvp, *tmp;
+ static const zfs_prop_t delayable[] = {
+ ZFS_PROP_REFQUOTA,
+ ZFS_PROP_KEYLOCATION,
+ 0
+ };
+ int i;
+
+ VERIFY(nvlist_alloc(&delayprops, NV_UNIQUE_NAME, KM_SLEEP) == 0);
+
+ for (nvp = nvlist_next_nvpair(props, NULL); nvp != NULL;
+ nvp = nvlist_next_nvpair(props, nvp)) {
+ /*
+ * strcmp() is safe because zfs_prop_to_name() always returns
+ * a bounded string.
+ */
+ for (i = 0; delayable[i] != 0; i++) {
+ if (strcmp(zfs_prop_to_name(delayable[i]),
+ nvpair_name(nvp)) == 0) {
+ break;
+ }
+ }
+ if (delayable[i] != 0) {
+ tmp = nvlist_prev_nvpair(props, nvp);
+ VERIFY(nvlist_add_nvpair(delayprops, nvp) == 0);
+ VERIFY(nvlist_remove_nvpair(props, nvp) == 0);
+ nvp = tmp;
+ }
+ }
+
+ if (nvlist_empty(delayprops)) {
+ nvlist_free(delayprops);
+ delayprops = NULL;
+ }
+ return (delayprops);
+}
+
+static void
+zfs_allow_log_destroy(void *arg)
+{
+ char *poolname = arg;
+
+ if (poolname != NULL)
+ kmem_strfree(poolname);
+}
+
+#ifdef ZFS_DEBUG
+static boolean_t zfs_ioc_recv_inject_err;
+#endif
+
+/*
+ * nvlist 'errors' is always allocated. It will contain descriptions of
+ * encountered errors, if any. It is the caller's responsibility to free it.
+ */
+static int
+zfs_ioc_recv_impl(char *tofs, char *tosnap, char *origin, nvlist_t *recvprops,
+ nvlist_t *localprops, nvlist_t *hidden_args, boolean_t force,
+ boolean_t resumable, int input_fd,
+ dmu_replay_record_t *begin_record, uint64_t *read_bytes,
+ uint64_t *errflags, nvlist_t **errors)
+{
+ dmu_recv_cookie_t drc;
+ int error = 0;
+ int props_error = 0;
+ offset_t off, noff;
+ nvlist_t *local_delayprops = NULL;
+ nvlist_t *recv_delayprops = NULL;
+ nvlist_t *origprops = NULL; /* existing properties */
+ nvlist_t *origrecvd = NULL; /* existing received properties */
+ boolean_t first_recvd_props = B_FALSE;
+ boolean_t tofs_was_redacted;
+ zfs_file_t *input_fp;
+
+ *read_bytes = 0;
+ *errflags = 0;
+ *errors = fnvlist_alloc();
+ off = 0;
+
+ if ((error = zfs_file_get(input_fd, &input_fp)))
+ return (error);
+
+ noff = off = zfs_file_off(input_fp);
+ error = dmu_recv_begin(tofs, tosnap, begin_record, force,
+ resumable, localprops, hidden_args, origin, &drc, input_fp,
+ &off);
+ if (error != 0)
+ goto out;
+ tofs_was_redacted = dsl_get_redacted(drc.drc_ds);
+
+ /*
+ * Set properties before we receive the stream so that they are applied
+ * to the new data. Note that we must call dmu_recv_stream() if
+ * dmu_recv_begin() succeeds.
+ */
+ if (recvprops != NULL && !drc.drc_newfs) {
+ if (spa_version(dsl_dataset_get_spa(drc.drc_ds)) >=
+ SPA_VERSION_RECVD_PROPS &&
+ !dsl_prop_get_hasrecvd(tofs))
+ first_recvd_props = B_TRUE;
+
+ /*
+ * If new received properties are supplied, they are to
+ * completely replace the existing received properties,
+ * so stash away the existing ones.
+ */
+ if (dsl_prop_get_received(tofs, &origrecvd) == 0) {
+ nvlist_t *errlist = NULL;
+ /*
+ * Don't bother writing a property if its value won't
+ * change (and avoid the unnecessary security checks).
+ *
+ * The first receive after SPA_VERSION_RECVD_PROPS is a
+ * special case where we blow away all local properties
+ * regardless.
+ */ + if (!first_recvd_props) + props_reduce(recvprops, origrecvd); + if (zfs_check_clearable(tofs, origrecvd, &errlist) != 0) + (void) nvlist_merge(*errors, errlist, 0); + nvlist_free(errlist); + + if (clear_received_props(tofs, origrecvd, + first_recvd_props ? NULL : recvprops) != 0) + *errflags |= ZPROP_ERR_NOCLEAR; + } else { + *errflags |= ZPROP_ERR_NOCLEAR; + } + } + + /* + * Stash away existing properties so we can restore them on error unless + * we're doing the first receive after SPA_VERSION_RECVD_PROPS, in which + * case "origrecvd" will take care of that. + */ + if (localprops != NULL && !drc.drc_newfs && !first_recvd_props) { + objset_t *os; + if (dmu_objset_hold(tofs, FTAG, &os) == 0) { + if (dsl_prop_get_all(os, &origprops) != 0) { + *errflags |= ZPROP_ERR_NOCLEAR; + } + dmu_objset_rele(os, FTAG); + } else { + *errflags |= ZPROP_ERR_NOCLEAR; + } + } + + if (recvprops != NULL) { + props_error = dsl_prop_set_hasrecvd(tofs); + + if (props_error == 0) { + recv_delayprops = extract_delay_props(recvprops); + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + recvprops, *errors); + } + } + + if (localprops != NULL) { + nvlist_t *oprops = fnvlist_alloc(); + nvlist_t *xprops = fnvlist_alloc(); + nvpair_t *nvp = NULL; + + while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { + if (nvpair_type(nvp) == DATA_TYPE_BOOLEAN) { + /* -x property */ + const char *name = nvpair_name(nvp); + zfs_prop_t prop = zfs_name_to_prop(name); + if (prop != ZPROP_INVAL) { + if (!zfs_prop_inheritable(prop)) + continue; + } else if (!zfs_prop_user(name)) + continue; + fnvlist_add_boolean(xprops, name); + } else { + /* -o property=value */ + fnvlist_add_nvpair(oprops, nvp); + } + } + + local_delayprops = extract_delay_props(oprops); + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, + oprops, *errors); + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, + xprops, *errors); + + nvlist_free(oprops); + nvlist_free(xprops); + } + + error = dmu_recv_stream(&drc, &off); + + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; + zvol_state_handle_t *zv = NULL; + + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + dsl_dataset_t *ds; + int end_err; + boolean_t stream_is_redacted = DMU_GET_FEATUREFLAGS( + begin_record->drr_u.drr_begin. + drr_versioninfo) & DMU_BACKUP_FEATURE_REDACTED; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc, zfsvfs); + /* + * If the dataset was not redacted, but we received a + * redacted stream onto it, we need to unmount the + * dataset. Otherwise, resume the filesystem. + */ + if (error == 0 && !drc.drc_newfs && + stream_is_redacted && !tofs_was_redacted) { + error = zfs_end_fs(zfsvfs, ds); + } else if (error == 0) { + error = zfs_resume_fs(zfsvfs, ds); + } + error = error ? error : end_err; + zfs_vfs_rele(zfsvfs); + } else if ((zv = zvol_suspend(tofs)) != NULL) { + error = dmu_recv_end(&drc, zvol_tag(zv)); + zvol_resume(zv); + } else { + error = dmu_recv_end(&drc, NULL); + } + + /* Set delayed properties now, after we're done receiving. 
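+		 * These are the refquota and keylocation values that
+		 * extract_delay_props() pulled out before the stream was
+		 * received.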
*/ + if (recv_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + recv_delayprops, *errors); + } + if (local_delayprops != NULL && error == 0) { + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, + local_delayprops, *errors); + } + } + + /* + * Merge delayed props back in with initial props, in case + * we're DEBUG and zfs_ioc_recv_inject_err is set (which means + * we have to make sure clear_received_props() includes + * the delayed properties). + * + * Since zfs_ioc_recv_inject_err is only in DEBUG kernels, + * using ASSERT() will be just like a VERIFY. + */ + if (recv_delayprops != NULL) { + ASSERT(nvlist_merge(recvprops, recv_delayprops, 0) == 0); + nvlist_free(recv_delayprops); + } + if (local_delayprops != NULL) { + ASSERT(nvlist_merge(localprops, local_delayprops, 0) == 0); + nvlist_free(local_delayprops); + } + *read_bytes = off - noff; + +#ifdef ZFS_DEBUG + if (zfs_ioc_recv_inject_err) { + zfs_ioc_recv_inject_err = B_FALSE; + error = 1; + } +#endif + + /* + * On error, restore the original props. + */ + if (error != 0 && recvprops != NULL && !drc.drc_newfs) { + if (clear_received_props(tofs, recvprops, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. + */ + *errflags |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(tofs); + } + + if (origrecvd == NULL && !drc.drc_newfs) { + /* We failed to stash the original properties. */ + *errflags |= ZPROP_ERR_NORESTORE; + } + + /* + * dsl_props_set() will not convert RECEIVED to LOCAL on or + * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL + * explicitly if we're restoring local properties cleared in the + * first new-style receive. + */ + if (origrecvd != NULL && + zfs_set_prop_nvlist(tofs, (first_recvd_props ? + ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), + origrecvd, NULL) != 0) { + /* + * We stashed the original properties but failed to + * restore them. + */ + *errflags |= ZPROP_ERR_NORESTORE; + } + } + if (error != 0 && localprops != NULL && !drc.drc_newfs && + !first_recvd_props) { + nvlist_t *setprops; + nvlist_t *inheritprops; + nvpair_t *nvp; + + if (origprops == NULL) { + /* We failed to stash the original properties. */ + *errflags |= ZPROP_ERR_NORESTORE; + goto out; + } + + /* Restore original props */ + setprops = fnvlist_alloc(); + inheritprops = fnvlist_alloc(); + nvp = NULL; + while ((nvp = nvlist_next_nvpair(localprops, nvp)) != NULL) { + const char *name = nvpair_name(nvp); + const char *source; + nvlist_t *attrs; + + if (!nvlist_exists(origprops, name)) { + /* + * Property was not present or was explicitly + * inherited before the receive, restore this. 
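+				 * Re-inheriting it below clears whatever
+				 * local value the failed receive may have
+				 * set.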
+ */ + fnvlist_add_boolean(inheritprops, name); + continue; + } + attrs = fnvlist_lookup_nvlist(origprops, name); + source = fnvlist_lookup_string(attrs, ZPROP_SOURCE); + + /* Skip received properties */ + if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) + continue; + + if (strcmp(source, tofs) == 0) { + /* Property was locally set */ + fnvlist_add_nvlist(setprops, name, attrs); + } else { + /* Property was implicitly inherited */ + fnvlist_add_boolean(inheritprops, name); + } + } + + if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_LOCAL, setprops, + NULL) != 0) + *errflags |= ZPROP_ERR_NORESTORE; + if (zfs_set_prop_nvlist(tofs, ZPROP_SRC_INHERITED, inheritprops, + NULL) != 0) + *errflags |= ZPROP_ERR_NORESTORE; + + nvlist_free(setprops); + nvlist_free(inheritprops); + } +out: + zfs_file_put(input_fd); + nvlist_free(origrecvd); + nvlist_free(origprops); + + if (error == 0) + error = props_error; + + return (error); +} + +/* + * inputs: + * zc_name name of containing filesystem (unused) + * zc_nvlist_src{_size} nvlist of properties to apply + * zc_nvlist_conf{_size} nvlist of properties to exclude + * (DATA_TYPE_BOOLEAN) and override (everything else) + * zc_value name of snapshot to create + * zc_string name of clone origin (if DRR_FLAG_CLONE) + * zc_cookie file descriptor to recv from + * zc_begin_record the BEGIN record of the stream (not byteswapped) + * zc_guid force flag + * + * outputs: + * zc_cookie number of bytes read + * zc_obj zprop_errflags_t + * zc_nvlist_dst{_size} error for each unapplied received property + */ +static int +zfs_ioc_recv(zfs_cmd_t *zc) +{ + dmu_replay_record_t begin_record; + nvlist_t *errors = NULL; + nvlist_t *recvdprops = NULL; + nvlist_t *localprops = NULL; + char *origin = NULL; + char *tosnap; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + int error = 0; + + if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || + strchr(zc->zc_value, '@') == NULL || + strchr(zc->zc_value, '%')) + return (SET_ERROR(EINVAL)); + + (void) strlcpy(tofs, zc->zc_value, sizeof (tofs)); + tosnap = strchr(tofs, '@'); + *tosnap++ = '\0'; + + if (zc->zc_nvlist_src != 0 && + (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &recvdprops)) != 0) + return (error); + + if (zc->zc_nvlist_conf != 0 && + (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &localprops)) != 0) + return (error); + + if (zc->zc_string[0]) + origin = zc->zc_string; + + begin_record.drr_type = DRR_BEGIN; + begin_record.drr_payloadlen = 0; + begin_record.drr_u.drr_begin = zc->zc_begin_record; + + error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvdprops, localprops, + NULL, zc->zc_guid, B_FALSE, zc->zc_cookie, &begin_record, + &zc->zc_cookie, &zc->zc_obj, &errors); + nvlist_free(recvdprops); + nvlist_free(localprops); + + /* + * Now that all props, initial and delayed, are set, report the prop + * errors to the caller. + */ + if (zc->zc_nvlist_dst_size != 0 && errors != NULL && + (nvlist_smush(errors, zc->zc_nvlist_dst_size) != 0 || + put_nvlist(zc, errors) != 0)) { + /* + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. 
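+		 * (nvlist_smush() above will already have dropped entries
+		 * trying to fit the errors list into that buffer.)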
+ */ + error = SET_ERROR(EINVAL); + } + + nvlist_free(errors); + + return (error); +} + +/* + * innvl: { + * "snapname" -> full name of the snapshot to create + * (optional) "props" -> received properties to set (nvlist) + * (optional) "localprops" -> override and exclude properties (nvlist) + * (optional) "origin" -> name of clone origin (DRR_FLAG_CLONE) + * "begin_record" -> non-byteswapped dmu_replay_record_t + * "input_fd" -> file descriptor to read stream from (int32) + * (optional) "force" -> force flag (value ignored) + * (optional) "resumable" -> resumable flag (value ignored) + * (optional) "cleanup_fd" -> unused + * (optional) "action_handle" -> unused + * (optional) "hidden_args" -> { "wkeydata" -> value } + * } + * + * outnvl: { + * "read_bytes" -> number of bytes read + * "error_flags" -> zprop_errflags_t + * "errors" -> error for each unapplied received property (nvlist) + * } + */ +static const zfs_ioc_key_t zfs_keys_recv_new[] = { + {"snapname", DATA_TYPE_STRING, 0}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"localprops", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"origin", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"begin_record", DATA_TYPE_BYTE_ARRAY, 0}, + {"input_fd", DATA_TYPE_INT32, 0}, + {"force", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"resumable", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, + {"action_handle", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_recv_new(const char *fsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + dmu_replay_record_t *begin_record; + uint_t begin_record_size; + nvlist_t *errors = NULL; + nvlist_t *recvprops = NULL; + nvlist_t *localprops = NULL; + nvlist_t *hidden_args = NULL; + char *snapname; + char *origin = NULL; + char *tosnap; + char tofs[ZFS_MAX_DATASET_NAME_LEN]; + boolean_t force; + boolean_t resumable; + uint64_t read_bytes = 0; + uint64_t errflags = 0; + int input_fd = -1; + int error; + + snapname = fnvlist_lookup_string(innvl, "snapname"); + + if (dataset_namecheck(snapname, NULL, NULL) != 0 || + strchr(snapname, '@') == NULL || + strchr(snapname, '%')) + return (SET_ERROR(EINVAL)); + + (void) strlcpy(tofs, snapname, sizeof (tofs)); + tosnap = strchr(tofs, '@'); + *tosnap++ = '\0'; + + error = nvlist_lookup_string(innvl, "origin", &origin); + if (error && error != ENOENT) + return (error); + + error = nvlist_lookup_byte_array(innvl, "begin_record", + (uchar_t **)&begin_record, &begin_record_size); + if (error != 0 || begin_record_size != sizeof (*begin_record)) + return (SET_ERROR(EINVAL)); + + input_fd = fnvlist_lookup_int32(innvl, "input_fd"); + + force = nvlist_exists(innvl, "force"); + resumable = nvlist_exists(innvl, "resumable"); + + /* we still use "props" here for backwards compatibility */ + error = nvlist_lookup_nvlist(innvl, "props", &recvprops); + if (error && error != ENOENT) + return (error); + + error = nvlist_lookup_nvlist(innvl, "localprops", &localprops); + if (error && error != ENOENT) + return (error); + + error = nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); + if (error && error != ENOENT) + return (error); + + error = zfs_ioc_recv_impl(tofs, tosnap, origin, recvprops, localprops, + hidden_args, force, resumable, input_fd, begin_record, + &read_bytes, &errflags, &errors); + + fnvlist_add_uint64(outnvl, "read_bytes", read_bytes); + fnvlist_add_uint64(outnvl, "error_flags", errflags); + fnvlist_add_nvlist(outnvl, "errors", errors); + + nvlist_free(errors); + nvlist_free(recvprops); + nvlist_free(localprops); 
+
+	return (error);
+}
+
+typedef struct dump_bytes_io {
+	zfs_file_t	*dbi_fp;
+	caddr_t		dbi_buf;
+	int		dbi_len;
+	int		dbi_err;
+} dump_bytes_io_t;
+
+static void
+dump_bytes_cb(void *arg)
+{
+	dump_bytes_io_t *dbi = (dump_bytes_io_t *)arg;
+	zfs_file_t *fp;
+	caddr_t buf;
+
+	fp = dbi->dbi_fp;
+	buf = dbi->dbi_buf;
+
+	dbi->dbi_err = zfs_file_write(fp, buf, dbi->dbi_len, NULL);
+}
+
+static int
+dump_bytes(objset_t *os, void *buf, int len, void *arg)
+{
+	dump_bytes_io_t dbi;
+
+	dbi.dbi_fp = arg;
+	dbi.dbi_buf = buf;
+	dbi.dbi_len = len;
+
+#if defined(HAVE_LARGE_STACKS)
+	dump_bytes_cb(&dbi);
+#else
+	/*
+	 * The zfs_file_write() call (via dump_bytes_cb()) is performed in
+	 * a taskq to ensure that there is always enough stack space to
+	 * write safely to the target filesystem. The ZIO_TYPE_FREE threads
+	 * are used because there can be a lot of them and they are used in
+	 * vdev_file.c for a similar purpose.
+	 */
+	spa_taskq_dispatch_sync(dmu_objset_spa(os), ZIO_TYPE_FREE,
+	    ZIO_TASKQ_ISSUE, dump_bytes_cb, &dbi, TQ_SLEEP);
+#endif /* HAVE_LARGE_STACKS */
+
+	return (dbi.dbi_err);
+}
+
+/*
+ * inputs:
+ * zc_name	name of snapshot to send
+ * zc_cookie	file descriptor to send stream to
+ * zc_obj	fromorigin flag (mutually exclusive with zc_fromobj)
+ * zc_sendobj	objsetid of snapshot to send
+ * zc_fromobj	objsetid of incremental fromsnap (may be zero)
+ * zc_guid	if set, estimate size of stream only (zc_cookie is ignored);
+ *		output size is returned in zc_objset_type.
+ * zc_flags	lzc_send_flags
+ *
+ * outputs:
+ * zc_objset_type	estimated size, if zc_guid is set
+ *
+ * NOTE: This is no longer the preferred interface; any new functionality
+ * should be added to zfs_ioc_send_new() instead.
+ */
+static int
+zfs_ioc_send(zfs_cmd_t *zc)
+{
+	int error;
+	offset_t off;
+	boolean_t estimate = (zc->zc_guid != 0);
+	boolean_t embedok = (zc->zc_flags & 0x1);
+	boolean_t large_block_ok = (zc->zc_flags & 0x2);
+	boolean_t compressok = (zc->zc_flags & 0x4);
+	boolean_t rawok = (zc->zc_flags & 0x8);
+	boolean_t savedok = (zc->zc_flags & 0x10);
+
+	if (zc->zc_obj != 0) {
+		dsl_pool_t *dp;
+		dsl_dataset_t *tosnap;
+
+		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+		if (error != 0)
+			return (error);
+
+		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj, FTAG, &tosnap);
+		if (error != 0) {
+			dsl_pool_rele(dp, FTAG);
+			return (error);
+		}
+
+		if (dsl_dir_is_clone(tosnap->ds_dir))
+			zc->zc_fromobj =
+			    dsl_dir_phys(tosnap->ds_dir)->dd_origin_obj;
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+	}
+
+	if (estimate) {
+		dsl_pool_t *dp;
+		dsl_dataset_t *tosnap;
+		dsl_dataset_t *fromsnap = NULL;
+
+		error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+		if (error != 0)
+			return (error);
+
+		error = dsl_dataset_hold_obj(dp, zc->zc_sendobj,
+		    FTAG, &tosnap);
+		if (error != 0) {
+			dsl_pool_rele(dp, FTAG);
+			return (error);
+		}
+
+		if (zc->zc_fromobj != 0) {
+			error = dsl_dataset_hold_obj(dp, zc->zc_fromobj,
+			    FTAG, &fromsnap);
+			if (error != 0) {
+				dsl_dataset_rele(tosnap, FTAG);
+				dsl_pool_rele(dp, FTAG);
+				return (error);
+			}
+		}
+
+		error = dmu_send_estimate_fast(tosnap, fromsnap, NULL,
+		    compressok || rawok, savedok, &zc->zc_objset_type);
+
+		if (fromsnap != NULL)
+			dsl_dataset_rele(fromsnap, FTAG);
+		dsl_dataset_rele(tosnap, FTAG);
+		dsl_pool_rele(dp, FTAG);
+	} else {
+		zfs_file_t *fp;
+		dmu_send_outparams_t out = {0};
+
+		if ((error = zfs_file_get(zc->zc_cookie, &fp)))
+			return (error);
+
+		off = zfs_file_off(fp);
+		out.dso_outfunc = dump_bytes;
+		out.dso_arg = fp;
+		out.dso_dryrun = B_FALSE;
+		error = dmu_send_obj(zc->zc_name,
zc->zc_sendobj, + zc->zc_fromobj, embedok, large_block_ok, compressok, + rawok, savedok, zc->zc_cookie, &off, &out); + + zfs_file_put(zc->zc_cookie); + } + return (error); +} + +/* + * inputs: + * zc_name name of snapshot on which to report progress + * zc_cookie file descriptor of send stream + * + * outputs: + * zc_cookie number of bytes written in send stream thus far + * zc_objset_type logical size of data traversed by send thus far + */ +static int +zfs_ioc_send_progress(zfs_cmd_t *zc) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + dmu_sendstatus_t *dsp = NULL; + int error; + + error = dsl_pool_hold(zc->zc_name, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + + mutex_enter(&ds->ds_sendstream_lock); + + /* + * Iterate over all the send streams currently active on this dataset. + * If there's one which matches the specified file descriptor _and_ the + * stream was started by the current process, return the progress of + * that stream. + */ + + for (dsp = list_head(&ds->ds_sendstreams); dsp != NULL; + dsp = list_next(&ds->ds_sendstreams, dsp)) { + if (dsp->dss_outfd == zc->zc_cookie && + zfs_proc_is_caller(dsp->dss_proc)) + break; + } + + if (dsp != NULL) { + zc->zc_cookie = atomic_cas_64((volatile uint64_t *)dsp->dss_off, + 0, 0); + /* This is the closest thing we have to atomic_read_64. */ + zc->zc_objset_type = atomic_cas_64(&dsp->dss_blocks, 0, 0); + } else { + error = SET_ERROR(ENOENT); + } + + mutex_exit(&ds->ds_sendstream_lock); + dsl_dataset_rele(ds, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); +} + +static int +zfs_ioc_inject_fault(zfs_cmd_t *zc) +{ + int id, error; + + error = zio_inject_fault(zc->zc_name, (int)zc->zc_guid, &id, + &zc->zc_inject_record); + + if (error == 0) + zc->zc_guid = (uint64_t)id; + + return (error); +} + +static int +zfs_ioc_clear_fault(zfs_cmd_t *zc) +{ + return (zio_clear_fault((int)zc->zc_guid)); +} + +static int +zfs_ioc_inject_list_next(zfs_cmd_t *zc) +{ + int id = (int)zc->zc_guid; + int error; + + error = zio_inject_list_next(&id, zc->zc_name, sizeof (zc->zc_name), + &zc->zc_inject_record); + + zc->zc_guid = id; + + return (error); +} + +static int +zfs_ioc_error_log(zfs_cmd_t *zc) +{ + spa_t *spa; + int error; + size_t count = (size_t)zc->zc_nvlist_dst_size; + + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + return (error); + + error = spa_get_errlog(spa, (void *)(uintptr_t)zc->zc_nvlist_dst, + &count); + if (error == 0) + zc->zc_nvlist_dst_size = count; + else + zc->zc_nvlist_dst_size = spa_get_errlog_size(spa); + + spa_close(spa, FTAG); + + return (error); +} + +static int +zfs_ioc_clear(zfs_cmd_t *zc) +{ + spa_t *spa; + vdev_t *vd; + int error; + + /* + * On zpool clear we also fix up missing slogs + */ + mutex_enter(&spa_namespace_lock); + spa = spa_lookup(zc->zc_name); + if (spa == NULL) { + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(EIO)); + } + if (spa_get_log_state(spa) == SPA_LOG_MISSING) { + /* we need to let spa_open/spa_load clear the chains */ + spa_set_log_state(spa, SPA_LOG_CLEAR); + } + spa->spa_last_open_failed = 0; + mutex_exit(&spa_namespace_lock); + + if (zc->zc_cookie & ZPOOL_NO_REWIND) { + error = spa_open(zc->zc_name, &spa, FTAG); + } else { + nvlist_t *policy; + nvlist_t *config = NULL; + + if (zc->zc_nvlist_src == 0) + return (SET_ERROR(EINVAL)); + + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { + error = 
spa_open_rewind(zc->zc_name, &spa, FTAG,
+			    policy, &config);
+			if (config != NULL) {
+				int err;
+
+				if ((err = put_nvlist(zc, config)) != 0)
+					error = err;
+				nvlist_free(config);
+			}
+			nvlist_free(policy);
+		}
+	}
+
+	if (error != 0)
+		return (error);
+
+	/*
+	 * If multihost is enabled, resuming I/O is unsafe as another
+	 * host may have imported the pool.
+	 */
+	if (spa_multihost(spa) && spa_suspended(spa))
+		return (SET_ERROR(EINVAL));
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+
+	if (zc->zc_guid == 0) {
+		vd = NULL;
+	} else {
+		vd = spa_lookup_by_guid(spa, zc->zc_guid, B_TRUE);
+		if (vd == NULL) {
+			error = SET_ERROR(ENODEV);
+			(void) spa_vdev_state_exit(spa, NULL, error);
+			spa_close(spa, FTAG);
+			return (error);
+		}
+	}
+
+	vdev_clear(spa, vd);
+
+	(void) spa_vdev_state_exit(spa, spa_suspended(spa) ?
+	    NULL : spa->spa_root_vdev, 0);
+
+	/*
+	 * Resume any suspended I/Os.
+	 */
+	if (zio_resume(spa) != 0)
+		error = SET_ERROR(EIO);
+
+	spa_close(spa, FTAG);
+
+	return (error);
+}
+
+/*
+ * Reopen all the vdevs associated with the pool.
+ *
+ * innvl: {
+ *  "scrub_restart" -> when true and a scrub is running, allow it to be
+ *      restarted as a side effect of the reopen (boolean).
+ * }
+ *
+ * outnvl is unused
+ */
+static const zfs_ioc_key_t zfs_keys_pool_reopen[] = {
+	{"scrub_restart",	DATA_TYPE_BOOLEAN_VALUE,	ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_pool_reopen(const char *pool, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	spa_t *spa;
+	int error;
+	boolean_t rc, scrub_restart = B_TRUE;
+
+	if (innvl) {
+		error = nvlist_lookup_boolean_value(innvl,
+		    "scrub_restart", &rc);
+		if (error == 0)
+			scrub_restart = rc;
+	}
+
+	error = spa_open(pool, &spa, FTAG);
+	if (error != 0)
+		return (error);
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+
+	/*
+	 * If the scrub_restart flag is B_FALSE and a scrub is already
+	 * in progress then set spa_scrub_reopen flag to B_TRUE so that
+	 * we don't restart the scrub as a side effect of the reopen.
+	 * Otherwise, let vdev_open() decide if a resilver is required.
+	 */
+
+	spa->spa_scrub_reopen = (!scrub_restart &&
+	    dsl_scan_scrubbing(spa->spa_dsl_pool));
+	vdev_reopen(spa->spa_root_vdev);
+	spa->spa_scrub_reopen = B_FALSE;
+
+	(void) spa_vdev_state_exit(spa, NULL, 0);
+	spa_close(spa, FTAG);
+	return (0);
+}
+
+/*
+ * inputs:
+ * zc_name	name of filesystem
+ *
+ * outputs:
+ * zc_string	name of conflicting snapshot, if there is one
+ */
+static int
+zfs_ioc_promote(zfs_cmd_t *zc)
+{
+	dsl_pool_t *dp;
+	dsl_dataset_t *ds, *ods;
+	char origin[ZFS_MAX_DATASET_NAME_LEN];
+	char *cp;
+	int error;
+
+	zc->zc_name[sizeof (zc->zc_name) - 1] = '\0';
+	if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0 ||
+	    strchr(zc->zc_name, '%'))
+		return (SET_ERROR(EINVAL));
+
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+
+	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &ds);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	if (!dsl_dir_is_clone(ds->ds_dir)) {
+		dsl_dataset_rele(ds, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (SET_ERROR(EINVAL));
+	}
+
+	error = dsl_dataset_hold_obj(dp,
+	    dsl_dir_phys(ds->ds_dir)->dd_origin_obj, FTAG, &ods);
+	if (error != 0) {
+		dsl_dataset_rele(ds, FTAG);
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+
+	dsl_dataset_name(ods, origin);
+	dsl_dataset_rele(ods, FTAG);
+	dsl_dataset_rele(ds, FTAG);
+	dsl_pool_rele(dp, FTAG);
+
+	/*
+	 * We don't need to unmount *all* the origin fs's snapshots, but
+	 * it's easier.
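+	 * (Promotion moves the origin's snapshots under the promoted
+	 * clone, so any mounted snapshots must be unmounted first.)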
+ */ + cp = strchr(origin, '@'); + if (cp) + *cp = '\0'; + (void) dmu_objset_find(origin, + zfs_unmount_snap_cb, NULL, DS_FIND_SNAPSHOTS); + return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); +} + +/* + * Retrieve a single {user|group|project}{used|quota}@... property. + * + * inputs: + * zc_name name of filesystem + * zc_objset_type zfs_userquota_prop_t + * zc_value domain name (eg. "S-1-234-567-89") + * zc_guid RID/UID/GID + * + * outputs: + * zc_cookie property value + */ +static int +zfs_ioc_userspace_one(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (SET_ERROR(EINVAL)); + + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); + if (error != 0) + return (error); + + error = zfs_userspace_one(zfsvfs, + zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_objset_type zfs_userquota_prop_t + * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) + * + * outputs: + * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) + * zc_cookie zap cursor + */ +static int +zfs_ioc_userspace_many(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int bufsize = zc->zc_nvlist_dst_size; + + if (bufsize <= 0) + return (SET_ERROR(ENOMEM)); + + int error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs, B_FALSE); + if (error != 0) + return (error); + + void *buf = vmem_alloc(bufsize, KM_SLEEP); + + error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, + buf, &zc->zc_nvlist_dst_size); + + if (error == 0) { + error = xcopyout(buf, + (void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size); + } + vmem_free(buf, bufsize); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error = 0; + zfsvfs_t *zfsvfs; + + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { + /* + * If userused is not enabled, it may be because the + * objset needs to be closed & reopened (to grow the + * objset_phys_t). Suspend/resume the fs will do that. 
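+			 * (dmu_objset_refresh_ownership() below reopens the
+			 * objset while the filesystem is suspended.)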
+ */ + dsl_dataset_t *ds, *newds; + + ds = dmu_objset_ds(zfsvfs->z_os); + error = zfs_suspend_fs(zfsvfs); + if (error == 0) { + dmu_objset_refresh_ownership(ds, &newds, + B_TRUE, zfsvfs); + error = zfs_resume_fs(zfsvfs, newds); + } + } + if (error == 0) + error = dmu_objset_userspace_upgrade(zfsvfs->z_os); + zfs_vfs_rele(zfsvfs); + } else { + /* XXX kind of reading contents without owning */ + error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); + if (error != 0) + return (error); + + error = dmu_objset_userspace_upgrade(os); + dmu_objset_rele_flags(os, B_TRUE, FTAG); + } + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_id_quota_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error; + + error = dmu_objset_hold_flags(zc->zc_name, B_TRUE, FTAG, &os); + if (error != 0) + return (error); + + if (dmu_objset_userobjspace_upgradable(os) || + dmu_objset_projectquota_upgradable(os)) { + mutex_enter(&os->os_upgrade_lock); + if (os->os_upgrade_id == 0) { + /* clear potential error code and retry */ + os->os_upgrade_status = 0; + mutex_exit(&os->os_upgrade_lock); + + dmu_objset_id_quota_upgrade(os); + } else { + mutex_exit(&os->os_upgrade_lock); + } + + dsl_pool_rele(dmu_objset_pool(os), FTAG); + + taskq_wait_id(os->os_spa->spa_upgrade_taskq, os->os_upgrade_id); + error = os->os_upgrade_status; + } else { + dsl_pool_rele(dmu_objset_pool(os), FTAG); + } + + dsl_dataset_rele_flags(dmu_objset_ds(os), DS_HOLD_FLAG_DECRYPT, FTAG); + + return (error); +} + +static int +zfs_ioc_share(zfs_cmd_t *zc) +{ + return (SET_ERROR(ENOSYS)); +} + +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +/* + * inputs: + * zc_name name of containing filesystem + * zc_obj object # beyond which we want next in-use object # + * + * outputs: + * zc_obj next in-use object # + */ +static int +zfs_ioc_next_obj(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error != 0) + return (error); + + error = dmu_object_next(os, &zc->zc_obj, B_FALSE, 0); + + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value prefix name for snapshot + * zc_cleanup_fd cleanup-on-exit file descriptor for calling process + * + * outputs: + * zc_value short name of new snapshot + */ +static int +zfs_ioc_tmp_snapshot(zfs_cmd_t *zc) +{ + char *snap_name; + char *hold_name; + int error; + minor_t minor; + + error = zfs_onexit_fd_hold(zc->zc_cleanup_fd, &minor); + if (error != 0) + return (error); + + snap_name = kmem_asprintf("%s-%016llx", zc->zc_value, + (u_longlong_t)ddi_get_lbolt64()); + hold_name = kmem_asprintf("%%%s", zc->zc_value); + + error = dsl_dataset_snapshot_tmp(zc->zc_name, snap_name, minor, + hold_name); + if (error == 0) + (void) strlcpy(zc->zc_value, snap_name, + sizeof (zc->zc_value)); + kmem_strfree(snap_name); + kmem_strfree(hold_name); + zfs_onexit_fd_rele(zc->zc_cleanup_fd); + return (error); +} + +/* + * inputs: + * zc_name name of "to" snapshot + * zc_value name of "from" snapshot + * zc_cookie file descriptor to write diff data on + * + * outputs: + * dmu_diff_record_t's to the file descriptor + */ +static int +zfs_ioc_diff(zfs_cmd_t *zc) +{ + zfs_file_t *fp; + offset_t off; + int error; + + if ((error = zfs_file_get(zc->zc_cookie, &fp))) + return (error); + + off = zfs_file_off(fp); + error = dmu_diff(zc->zc_name, zc->zc_value, fp, &off); + + zfs_file_put(zc->zc_cookie); + + return (error); +} + +static int 
+zfs_ioc_smb_acl(zfs_cmd_t *zc) +{ + return (SET_ERROR(ENOTSUP)); +} + +/* + * innvl: { + * "holds" -> { snapname -> holdname (string), ... } + * (optional) "cleanup_fd" -> fd (int32) + * } + * + * outnvl: { + * snapname -> error value (int32) + * ... + * } + */ +static const zfs_ioc_key_t zfs_keys_hold[] = { + {"holds", DATA_TYPE_NVLIST, 0}, + {"cleanup_fd", DATA_TYPE_INT32, ZK_OPTIONAL}, +}; + +/* ARGSUSED */ +static int +zfs_ioc_hold(const char *pool, nvlist_t *args, nvlist_t *errlist) +{ + nvpair_t *pair; + nvlist_t *holds; + int cleanup_fd = -1; + int error; + minor_t minor = 0; + + holds = fnvlist_lookup_nvlist(args, "holds"); + + /* make sure the user didn't pass us any invalid (empty) tags */ + for (pair = nvlist_next_nvpair(holds, NULL); pair != NULL; + pair = nvlist_next_nvpair(holds, pair)) { + char *htag; + + error = nvpair_value_string(pair, &htag); + if (error != 0) + return (SET_ERROR(error)); + + if (strlen(htag) == 0) + return (SET_ERROR(EINVAL)); + } + + if (nvlist_lookup_int32(args, "cleanup_fd", &cleanup_fd) == 0) { + error = zfs_onexit_fd_hold(cleanup_fd, &minor); + if (error != 0) + return (SET_ERROR(error)); + } + + error = dsl_dataset_user_hold(holds, minor, errlist); + if (minor != 0) + zfs_onexit_fd_rele(cleanup_fd); + return (SET_ERROR(error)); +} + +/* + * innvl is not used. + * + * outnvl: { + * holdname -> time added (uint64 seconds since epoch) + * ... + * } + */ +static const zfs_ioc_key_t zfs_keys_get_holds[] = { + /* no nvl keys */ +}; + +/* ARGSUSED */ +static int +zfs_ioc_get_holds(const char *snapname, nvlist_t *args, nvlist_t *outnvl) +{ + return (dsl_dataset_get_holds(snapname, outnvl)); +} + +/* + * innvl: { + * snapname -> { holdname, ... } + * ... + * } + * + * outnvl: { + * snapname -> error value (int32) + * ... 
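+ *
+ *	(for example, releasing hold "keep" on pool/fs@snap passes
+ *	innvl { "pool/fs@snap" -> { "keep" } }; a snapshot appears in
+ *	outnvl only with a corresponding error value)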
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_release[] = {
+	{"...",		DATA_TYPE_NVLIST,	ZK_WILDCARDLIST},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_release(const char *pool, nvlist_t *holds, nvlist_t *errlist)
+{
+	return (dsl_dataset_user_release(holds, errlist));
+}
+
+/*
+ * inputs:
+ * zc_guid		flags (ZEVENT_NONBLOCK)
+ * zc_cleanup_fd	zevent file descriptor
+ *
+ * outputs:
+ * zc_nvlist_dst	next nvlist event
+ * zc_cookie		dropped events since last get
+ */
+static int
+zfs_ioc_events_next(zfs_cmd_t *zc)
+{
+	zfs_zevent_t *ze;
+	nvlist_t *event = NULL;
+	minor_t minor;
+	uint64_t dropped = 0;
+	int error;
+
+	error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+	if (error != 0)
+		return (error);
+
+	do {
+		error = zfs_zevent_next(ze, &event,
+		    &zc->zc_nvlist_dst_size, &dropped);
+		if (event != NULL) {
+			zc->zc_cookie = dropped;
+			error = put_nvlist(zc, event);
+			nvlist_free(event);
+		}
+
+		if (zc->zc_guid & ZEVENT_NONBLOCK)
+			break;
+
+		if ((error == 0) || (error != ENOENT))
+			break;
+
+		error = zfs_zevent_wait(ze);
+		if (error != 0)
+			break;
+	} while (1);
+
+	zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+
+	return (error);
+}
+
+/*
+ * outputs:
+ * zc_cookie		cleared events count
+ */
+static int
+zfs_ioc_events_clear(zfs_cmd_t *zc)
+{
+	int count;
+
+	zfs_zevent_drain_all(&count);
+	zc->zc_cookie = count;
+
+	return (0);
+}
+
+/*
+ * inputs:
+ * zc_guid		eid | ZEVENT_SEEK_START | ZEVENT_SEEK_END
+ * zc_cleanup_fd	zevent file descriptor
+ */
+static int
+zfs_ioc_events_seek(zfs_cmd_t *zc)
+{
+	zfs_zevent_t *ze;
+	minor_t minor;
+	int error;
+
+	error = zfs_zevent_fd_hold(zc->zc_cleanup_fd, &minor, &ze);
+	if (error != 0)
+		return (error);
+
+	error = zfs_zevent_seek(ze, zc->zc_guid);
+	zfs_zevent_fd_rele(zc->zc_cleanup_fd);
+
+	return (error);
+}
+
+/*
+ * inputs:
+ * zc_name	name of later filesystem or snapshot
+ * zc_value	full name of old snapshot or bookmark
+ *
+ * outputs:
+ * zc_cookie	space in bytes
+ * zc_objset_type	compressed space in bytes
+ * zc_perm_action	uncompressed space in bytes
+ */
+static int
+zfs_ioc_space_written(zfs_cmd_t *zc)
+{
+	int error;
+	dsl_pool_t *dp;
+	dsl_dataset_t *new;
+
+	error = dsl_pool_hold(zc->zc_name, FTAG, &dp);
+	if (error != 0)
+		return (error);
+	error = dsl_dataset_hold(dp, zc->zc_name, FTAG, &new);
+	if (error != 0) {
+		dsl_pool_rele(dp, FTAG);
+		return (error);
+	}
+	if (strchr(zc->zc_value, '#') != NULL) {
+		zfs_bookmark_phys_t bmp;
+		error = dsl_bookmark_lookup(dp, zc->zc_value,
+		    new, &bmp);
+		if (error == 0) {
+			error = dsl_dataset_space_written_bookmark(&bmp, new,
+			    &zc->zc_cookie,
+			    &zc->zc_objset_type, &zc->zc_perm_action);
+		}
+	} else {
+		dsl_dataset_t *old;
+		error = dsl_dataset_hold(dp, zc->zc_value, FTAG, &old);
+
+		if (error == 0) {
+			error = dsl_dataset_space_written(old, new,
+			    &zc->zc_cookie,
+			    &zc->zc_objset_type, &zc->zc_perm_action);
+			dsl_dataset_rele(old, FTAG);
+		}
+	}
+	dsl_dataset_rele(new, FTAG);
+	dsl_pool_rele(dp, FTAG);
+	return (error);
+}
+
+/*
+ * innvl: {
+ *     "firstsnap" -> snapshot name
+ * }
+ *
+ * outnvl: {
+ *     "used" -> space in bytes
+ *     "compressed" -> compressed space in bytes
+ *     "uncompressed" -> uncompressed space in bytes
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_space_snaps[] = {
+	{"firstsnap",	DATA_TYPE_STRING,	0},
+};
+
+static int
+zfs_ioc_space_snaps(const char *lastsnap, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int error;
+	dsl_pool_t *dp;
+	dsl_dataset_t *new, *old;
+	char *firstsnap;
+	uint64_t used, comp, uncomp;
+
+	firstsnap = fnvlist_lookup_string(innvl, "firstsnap");
+
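+	/*
+	 * lastsnap (the ioctl target) and "firstsnap" should be the later
+	 * and earlier ends of a snapshot range, e.g. lastsnap
+	 * "pool/fs@monday" with innvl { "firstsnap" -> "pool/fs@friday" }
+	 * estimates the space freed by destroying friday through monday.
+	 */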
error = dsl_pool_hold(lastsnap, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, lastsnap, FTAG, &new); + if (error == 0 && !new->ds_is_snapshot) { + dsl_dataset_rele(new, FTAG); + error = SET_ERROR(EINVAL); + } + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + error = dsl_dataset_hold(dp, firstsnap, FTAG, &old); + if (error == 0 && !old->ds_is_snapshot) { + dsl_dataset_rele(old, FTAG); + error = SET_ERROR(EINVAL); + } + if (error != 0) { + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + error = dsl_dataset_space_wouldfree(old, new, &used, &comp, &uncomp); + dsl_dataset_rele(old, FTAG); + dsl_dataset_rele(new, FTAG); + dsl_pool_rele(dp, FTAG); + fnvlist_add_uint64(outnvl, "used", used); + fnvlist_add_uint64(outnvl, "compressed", comp); + fnvlist_add_uint64(outnvl, "uncompressed", uncomp); + return (error); +} + +/* + * innvl: { + * "fd" -> file descriptor to write stream to (int32) + * (optional) "fromsnap" -> full snap name to send an incremental from + * (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed DRR_WRITE records are permitted + * (optional) "rawok" -> (value ignored) + * presence indicates raw encrypted records should be used. + * (optional) "savedok" -> (value ignored) + * presence indicates we should send a partially received snapshot + * (optional) "resume_object" and "resume_offset" -> (uint64) + * if present, resume send stream from specified object and offset. + * (optional) "redactbook" -> (string) + * if present, use this bookmark's redaction list to generate a redacted + * send stream + * } + * + * outnvl is unused + */ +static const zfs_ioc_key_t zfs_keys_send_new[] = { + {"fd", DATA_TYPE_INT32, 0}, + {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"savedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"resume_object", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"resume_offset", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, +}; + +/* ARGSUSED */ +static int +zfs_ioc_send_new(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int error; + offset_t off; + char *fromname = NULL; + int fd; + zfs_file_t *fp; + boolean_t largeblockok; + boolean_t embedok; + boolean_t compressok; + boolean_t rawok; + boolean_t savedok; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; + char *redactbook = NULL; + + fd = fnvlist_lookup_int32(innvl, "fd"); + + (void) nvlist_lookup_string(innvl, "fromsnap", &fromname); + + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); + compressok = nvlist_exists(innvl, "compressok"); + rawok = nvlist_exists(innvl, "rawok"); + savedok = nvlist_exists(innvl, "savedok"); + + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + + (void) nvlist_lookup_string(innvl, "redactbook", &redactbook); + + if ((error = zfs_file_get(fd, &fp))) + return (error); + + off = zfs_file_off(fp); + + dmu_send_outparams_t out = {0}; + out.dso_outfunc = dump_bytes; + out.dso_arg = fp; + out.dso_dryrun = B_FALSE; + error = 
dmu_send(snapname, fromname, embedok, largeblockok, + compressok, rawok, savedok, resumeobj, resumeoff, + redactbook, fd, &off, &out); + + zfs_file_put(fd); + return (error); +} + +/* ARGSUSED */ +static int +send_space_sum(objset_t *os, void *buf, int len, void *arg) +{ + uint64_t *size = arg; + *size += len; + return (0); +} + +/* + * Determine approximately how large a zfs send stream will be -- the number + * of bytes that will be written to the fd supplied to zfs_ioc_send_new(). + * + * innvl: { + * (optional) "from" -> full snap or bookmark name to send an incremental + * from + * (optional) "largeblockok" -> (value ignored) + * indicates that blocks > 128KB are permitted + * (optional) "embedok" -> (value ignored) + * presence indicates DRR_WRITE_EMBEDDED records are permitted + * (optional) "compressok" -> (value ignored) + * presence indicates compressed DRR_WRITE records are permitted + * (optional) "rawok" -> (value ignored) + * presence indicates raw encrypted records should be used. + * (optional) "fd" -> file descriptor to use as a cookie for progress + * tracking (int32) + * } + * + * outnvl: { + * "space" -> bytes of space (uint64) + * } + */ +static const zfs_ioc_key_t zfs_keys_send_space[] = { + {"from", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"fromsnap", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"largeblockok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"embedok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"compressok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"rawok", DATA_TYPE_BOOLEAN, ZK_OPTIONAL}, + {"fd", DATA_TYPE_INT32, ZK_OPTIONAL}, + {"redactbook", DATA_TYPE_STRING, ZK_OPTIONAL}, + {"resumeobj", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"resumeoff", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"bytes", DATA_TYPE_UINT64, ZK_OPTIONAL}, +}; + +static int +zfs_ioc_send_space(const char *snapname, nvlist_t *innvl, nvlist_t *outnvl) +{ + dsl_pool_t *dp; + dsl_dataset_t *tosnap; + dsl_dataset_t *fromsnap = NULL; + int error; + char *fromname = NULL; + char *redactlist_book = NULL; + boolean_t largeblockok; + boolean_t embedok; + boolean_t compressok; + boolean_t rawok; + boolean_t savedok; + uint64_t space = 0; + boolean_t full_estimate = B_FALSE; + uint64_t resumeobj = 0; + uint64_t resumeoff = 0; + uint64_t resume_bytes = 0; + int32_t fd = -1; + zfs_bookmark_phys_t zbm = {0}; + + error = dsl_pool_hold(snapname, FTAG, &dp); + if (error != 0) + return (error); + + error = dsl_dataset_hold(dp, snapname, FTAG, &tosnap); + if (error != 0) { + dsl_pool_rele(dp, FTAG); + return (error); + } + (void) nvlist_lookup_int32(innvl, "fd", &fd); + + largeblockok = nvlist_exists(innvl, "largeblockok"); + embedok = nvlist_exists(innvl, "embedok"); + compressok = nvlist_exists(innvl, "compressok"); + rawok = nvlist_exists(innvl, "rawok"); + savedok = nvlist_exists(innvl, "savedok"); + boolean_t from = (nvlist_lookup_string(innvl, "from", &fromname) == 0); + boolean_t altbook = (nvlist_lookup_string(innvl, "redactbook", + &redactlist_book) == 0); + + (void) nvlist_lookup_uint64(innvl, "resume_object", &resumeobj); + (void) nvlist_lookup_uint64(innvl, "resume_offset", &resumeoff); + (void) nvlist_lookup_uint64(innvl, "bytes", &resume_bytes); + + if (altbook) { + full_estimate = B_TRUE; + } else if (from) { + if (strchr(fromname, '#')) { + error = dsl_bookmark_lookup(dp, fromname, tosnap, &zbm); + + /* + * dsl_bookmark_lookup() will fail with EXDEV if + * the from-bookmark and tosnap are at the same txg. 
+ * However, it's valid to do a send (and therefore, + * a send estimate) from and to the same time point, + * if the bookmark is redacted (the incremental send + * can change what's redacted on the target). In + * this case, dsl_bookmark_lookup() fills in zbm + * but returns EXDEV. Ignore this error. + */ + if (error == EXDEV && zbm.zbm_redaction_obj != 0 && + zbm.zbm_guid == + dsl_dataset_phys(tosnap)->ds_guid) + error = 0; + + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + if (zbm.zbm_redaction_obj != 0 || !(zbm.zbm_flags & + ZBM_FLAG_HAS_FBN)) { + full_estimate = B_TRUE; + } + } else if (strchr(fromname, '@')) { + error = dsl_dataset_hold(dp, fromname, FTAG, &fromsnap); + if (error != 0) { + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (error); + } + + if (!dsl_dataset_is_before(tosnap, fromsnap, 0)) { + full_estimate = B_TRUE; + dsl_dataset_rele(fromsnap, FTAG); + } + } else { + /* + * from is not properly formatted as a snapshot or + * bookmark + */ + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + return (SET_ERROR(EINVAL)); + } + } + + if (full_estimate) { + dmu_send_outparams_t out = {0}; + offset_t off = 0; + out.dso_outfunc = send_space_sum; + out.dso_arg = &space; + out.dso_dryrun = B_TRUE; + /* + * We have to release these holds so dmu_send can take them. It + * will do all the error checking we need. + */ + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + error = dmu_send(snapname, fromname, embedok, largeblockok, + compressok, rawok, savedok, resumeobj, resumeoff, + redactlist_book, fd, &off, &out); + } else { + error = dmu_send_estimate_fast(tosnap, fromsnap, + (from && strchr(fromname, '#') != NULL ? &zbm : NULL), + compressok || rawok, savedok, &space); + space -= resume_bytes; + if (fromsnap != NULL) + dsl_dataset_rele(fromsnap, FTAG); + dsl_dataset_rele(tosnap, FTAG); + dsl_pool_rele(dp, FTAG); + } + + fnvlist_add_uint64(outnvl, "space", space); + + return (error); +} + +/* + * Sync the currently open TXG to disk for the specified pool. + * This is somewhat similar to 'zfs_sync()'. + * For cases that do not result in error this ioctl will wait for + * the currently open TXG to commit before returning back to the caller. + * + * innvl: { + * "force" -> when true, force uberblock update even if there is no dirty data. + * In addition this will cause the vdev configuration to be written + * out including updating the zpool cache file. (boolean_t) + * } + * + * onvl is unused + */ +static const zfs_ioc_key_t zfs_keys_pool_sync[] = { + {"force", DATA_TYPE_BOOLEAN_VALUE, 0}, +}; + +/* ARGSUSED */ +static int +zfs_ioc_pool_sync(const char *pool, nvlist_t *innvl, nvlist_t *onvl) +{ + int err; + boolean_t force = B_FALSE; + spa_t *spa; + + if ((err = spa_open(pool, &spa, FTAG)) != 0) + return (err); + + if (innvl) + force = fnvlist_lookup_boolean_value(innvl, "force"); + + if (force) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_WRITER); + vdev_config_dirty(spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + txg_wait_synced(spa_get_dsl(spa), 0); + + spa_close(spa, FTAG); + + return (err); +} + +/* + * Load a user's wrapping key into the kernel. 
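+ * The key material is passed in the "hidden_args" nvlist so that it can
+ * be excluded when the ioctl arguments are logged to the pool history.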
+ * innvl: {
+ *     "hidden_args" -> { "wkeydata" -> value }
+ *         raw uint8_t array of encryption wrapping key data (32 bytes)
+ *     (optional) "noop" -> (value ignored)
+ *         presence indicates the key should only be verified, not loaded
+ * }
+ */
+static const zfs_ioc_key_t zfs_keys_load_key[] = {
+	{"hidden_args",	DATA_TYPE_NVLIST,	0},
+	{"noop",	DATA_TYPE_BOOLEAN,	ZK_OPTIONAL},
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_load_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int ret;
+	dsl_crypto_params_t *dcp = NULL;
+	nvlist_t *hidden_args;
+	boolean_t noop = nvlist_exists(innvl, "noop");
+
+	if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+		ret = SET_ERROR(EINVAL);
+		goto error;
+	}
+
+	hidden_args = fnvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS);
+
+	ret = dsl_crypto_params_create_nvlist(DCP_CMD_NONE, NULL,
+	    hidden_args, &dcp);
+	if (ret != 0)
+		goto error;
+
+	ret = spa_keystore_load_wkey(dsname, dcp, noop);
+	if (ret != 0)
+		goto error;
+
+	dsl_crypto_params_free(dcp, noop);
+
+	return (0);
+
+error:
+	dsl_crypto_params_free(dcp, B_TRUE);
+	return (ret);
+}
+
+/*
+ * Unload a user's wrapping key from the kernel.
+ * Both innvl and outnvl are unused.
+ */
+static const zfs_ioc_key_t zfs_keys_unload_key[] = {
+	/* no nvl keys */
+};
+
+/* ARGSUSED */
+static int
+zfs_ioc_unload_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl)
+{
+	int ret = 0;
+
+	if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) {
+		ret = (SET_ERROR(EINVAL));
+		goto out;
+	}
+
+	ret = spa_keystore_unload_wkey(dsname);
+	if (ret != 0)
+		goto out;
+
+out:
+	return (ret);
+}
+
+/*
+ * Changes a user's wrapping key used to decrypt a dataset. The keyformat,
+ * keylocation, pbkdf2salt, and pbkdf2iters properties can also be specified
+ * here to change how the key is derived in userspace.
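+ *
+ * For example, switching a dataset to passphrase wrapping might pass
+ * props such as { "keyformat" -> "passphrase", "pbkdf2iters" -> 350000 }
+ * along with the new key material in "hidden_args".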
+ * + * innvl: { + * "hidden_args" (optional) -> { "wkeydata" -> value } + * raw uint8_t array of new encryption wrapping key data (32 bytes) + * "props" (optional) -> { prop -> value } + * } + * + * outnvl is unused + */ +static const zfs_ioc_key_t zfs_keys_change_key[] = { + {"crypt_cmd", DATA_TYPE_UINT64, ZK_OPTIONAL}, + {"hidden_args", DATA_TYPE_NVLIST, ZK_OPTIONAL}, + {"props", DATA_TYPE_NVLIST, ZK_OPTIONAL}, +}; + +/* ARGSUSED */ +static int +zfs_ioc_change_key(const char *dsname, nvlist_t *innvl, nvlist_t *outnvl) +{ + int ret; + uint64_t cmd = DCP_CMD_NONE; + dsl_crypto_params_t *dcp = NULL; + nvlist_t *args = NULL, *hidden_args = NULL; + + if (strchr(dsname, '@') != NULL || strchr(dsname, '%') != NULL) { + ret = (SET_ERROR(EINVAL)); + goto error; + } + + (void) nvlist_lookup_uint64(innvl, "crypt_cmd", &cmd); + (void) nvlist_lookup_nvlist(innvl, "props", &args); + (void) nvlist_lookup_nvlist(innvl, ZPOOL_HIDDEN_ARGS, &hidden_args); + + ret = dsl_crypto_params_create_nvlist(cmd, args, hidden_args, &dcp); + if (ret != 0) + goto error; + + ret = spa_keystore_change_key(dsname, dcp); + if (ret != 0) + goto error; + + dsl_crypto_params_free(dcp, B_FALSE); + + return (0); + +error: + dsl_crypto_params_free(dcp, B_TRUE); + return (ret); +} + +static zfs_ioc_vec_t zfs_ioc_vec[ZFS_IOC_LAST - ZFS_IOC_FIRST]; + +static void +zfs_ioctl_register_legacy(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, + boolean_t log_history, zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; + + ASSERT3U(ioc, >=, ZFS_IOC_FIRST); + ASSERT3U(ioc, <, ZFS_IOC_LAST); + ASSERT3P(vec->zvec_legacy_func, ==, NULL); + ASSERT3P(vec->zvec_func, ==, NULL); + + vec->zvec_legacy_func = func; + vec->zvec_secpolicy = secpolicy; + vec->zvec_namecheck = namecheck; + vec->zvec_allow_log = log_history; + vec->zvec_pool_check = pool_check; +} + +/* + * See the block comment at the beginning of this file for details on + * each argument to this function. 
+ */ +void +zfs_ioctl_register(const char *name, zfs_ioc_t ioc, zfs_ioc_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_namecheck_t namecheck, + zfs_ioc_poolcheck_t pool_check, boolean_t smush_outnvlist, + boolean_t allow_log, const zfs_ioc_key_t *nvl_keys, size_t num_keys) +{ + zfs_ioc_vec_t *vec = &zfs_ioc_vec[ioc - ZFS_IOC_FIRST]; + + ASSERT3U(ioc, >=, ZFS_IOC_FIRST); + ASSERT3U(ioc, <, ZFS_IOC_LAST); + ASSERT3P(vec->zvec_legacy_func, ==, NULL); + ASSERT3P(vec->zvec_func, ==, NULL); + + /* if we are logging, the name must be valid */ + ASSERT(!allow_log || namecheck != NO_NAME); + + vec->zvec_name = name; + vec->zvec_func = func; + vec->zvec_secpolicy = secpolicy; + vec->zvec_namecheck = namecheck; + vec->zvec_pool_check = pool_check; + vec->zvec_smush_outnvlist = smush_outnvlist; + vec->zvec_allow_log = allow_log; + vec->zvec_nvl_keys = nvl_keys; + vec->zvec_nvl_key_count = num_keys; +} + +static void +zfs_ioctl_register_pool(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, boolean_t log_history, + zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + POOL_NAME, log_history, pool_check); +} + +void +zfs_ioctl_register_dataset_nolog(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy, zfs_ioc_poolcheck_t pool_check) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_FALSE, pool_check); +} + +static void +zfs_ioctl_register_pool_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +{ + zfs_ioctl_register_legacy(ioc, func, zfs_secpolicy_config, + POOL_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); +} + +static void +zfs_ioctl_register_pool_meta(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + NO_NAME, B_FALSE, POOL_CHECK_NONE); +} + +static void +zfs_ioctl_register_dataset_read_secpolicy(zfs_ioc_t ioc, + zfs_ioc_legacy_func_t *func, zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_FALSE, POOL_CHECK_SUSPENDED); +} + +static void +zfs_ioctl_register_dataset_read(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func) +{ + zfs_ioctl_register_dataset_read_secpolicy(ioc, func, + zfs_secpolicy_read); +} + +static void +zfs_ioctl_register_dataset_modify(zfs_ioc_t ioc, zfs_ioc_legacy_func_t *func, + zfs_secpolicy_func_t *secpolicy) +{ + zfs_ioctl_register_legacy(ioc, func, secpolicy, + DATASET_NAME, B_TRUE, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); +} + +static void +zfs_ioctl_init(void) +{ + zfs_ioctl_register("snapshot", ZFS_IOC_SNAPSHOT, + zfs_ioc_snapshot, zfs_secpolicy_snapshot, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_snapshot, ARRAY_SIZE(zfs_keys_snapshot)); + + zfs_ioctl_register("log_history", ZFS_IOC_LOG_HISTORY, + zfs_ioc_log_history, zfs_secpolicy_log_history, NO_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_log_history, ARRAY_SIZE(zfs_keys_log_history)); + + zfs_ioctl_register("space_snaps", ZFS_IOC_SPACE_SNAPS, + zfs_ioc_space_snaps, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_space_snaps, ARRAY_SIZE(zfs_keys_space_snaps)); + + zfs_ioctl_register("send", ZFS_IOC_SEND_NEW, + zfs_ioc_send_new, zfs_secpolicy_send_new, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_send_new, ARRAY_SIZE(zfs_keys_send_new)); + + zfs_ioctl_register("send_space", ZFS_IOC_SEND_SPACE, + zfs_ioc_send_space, 
zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_send_space, ARRAY_SIZE(zfs_keys_send_space)); + + zfs_ioctl_register("create", ZFS_IOC_CREATE, + zfs_ioc_create, zfs_secpolicy_create_clone, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_create, ARRAY_SIZE(zfs_keys_create)); + + zfs_ioctl_register("clone", ZFS_IOC_CLONE, + zfs_ioc_clone, zfs_secpolicy_create_clone, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_clone, ARRAY_SIZE(zfs_keys_clone)); + + zfs_ioctl_register("remap", ZFS_IOC_REMAP, + zfs_ioc_remap, zfs_secpolicy_none, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_remap, ARRAY_SIZE(zfs_keys_remap)); + + zfs_ioctl_register("destroy_snaps", ZFS_IOC_DESTROY_SNAPS, + zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_destroy_snaps, ARRAY_SIZE(zfs_keys_destroy_snaps)); + + zfs_ioctl_register("hold", ZFS_IOC_HOLD, + zfs_ioc_hold, zfs_secpolicy_hold, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_hold, ARRAY_SIZE(zfs_keys_hold)); + zfs_ioctl_register("release", ZFS_IOC_RELEASE, + zfs_ioc_release, zfs_secpolicy_release, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_release, ARRAY_SIZE(zfs_keys_release)); + + zfs_ioctl_register("get_holds", ZFS_IOC_GET_HOLDS, + zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_get_holds, ARRAY_SIZE(zfs_keys_get_holds)); + + zfs_ioctl_register("rollback", ZFS_IOC_ROLLBACK, + zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_rollback, ARRAY_SIZE(zfs_keys_rollback)); + + zfs_ioctl_register("bookmark", ZFS_IOC_BOOKMARK, + zfs_ioc_bookmark, zfs_secpolicy_bookmark, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_bookmark, ARRAY_SIZE(zfs_keys_bookmark)); + + zfs_ioctl_register("get_bookmarks", ZFS_IOC_GET_BOOKMARKS, + zfs_ioc_get_bookmarks, zfs_secpolicy_read, DATASET_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, + zfs_keys_get_bookmarks, ARRAY_SIZE(zfs_keys_get_bookmarks)); + + zfs_ioctl_register("get_bookmark_props", ZFS_IOC_GET_BOOKMARK_PROPS, + zfs_ioc_get_bookmark_props, zfs_secpolicy_read, ENTITY_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_FALSE, zfs_keys_get_bookmark_props, + ARRAY_SIZE(zfs_keys_get_bookmark_props)); + + zfs_ioctl_register("destroy_bookmarks", ZFS_IOC_DESTROY_BOOKMARKS, + zfs_ioc_destroy_bookmarks, zfs_secpolicy_destroy_bookmarks, + POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_destroy_bookmarks, + ARRAY_SIZE(zfs_keys_destroy_bookmarks)); + + zfs_ioctl_register("receive", ZFS_IOC_RECV_NEW, + zfs_ioc_recv_new, zfs_secpolicy_recv_new, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_recv_new, ARRAY_SIZE(zfs_keys_recv_new)); + zfs_ioctl_register("load-key", ZFS_IOC_LOAD_KEY, + zfs_ioc_load_key, zfs_secpolicy_load_key, + DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, + zfs_keys_load_key, ARRAY_SIZE(zfs_keys_load_key)); + zfs_ioctl_register("unload-key", ZFS_IOC_UNLOAD_KEY, + zfs_ioc_unload_key, zfs_secpolicy_load_key, + DATASET_NAME, POOL_CHECK_SUSPENDED, B_TRUE, B_TRUE, + zfs_keys_unload_key, ARRAY_SIZE(zfs_keys_unload_key)); + zfs_ioctl_register("change-key", ZFS_IOC_CHANGE_KEY, + 
zfs_ioc_change_key, zfs_secpolicy_change_key, + DATASET_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, + B_TRUE, B_TRUE, zfs_keys_change_key, + ARRAY_SIZE(zfs_keys_change_key)); + + zfs_ioctl_register("sync", ZFS_IOC_POOL_SYNC, + zfs_ioc_pool_sync, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_pool_sync, ARRAY_SIZE(zfs_keys_pool_sync)); + zfs_ioctl_register("reopen", ZFS_IOC_POOL_REOPEN, zfs_ioc_pool_reopen, + zfs_secpolicy_config, POOL_NAME, POOL_CHECK_SUSPENDED, B_TRUE, + B_TRUE, zfs_keys_pool_reopen, ARRAY_SIZE(zfs_keys_pool_reopen)); + + zfs_ioctl_register("channel_program", ZFS_IOC_CHANNEL_PROGRAM, + zfs_ioc_channel_program, zfs_secpolicy_config, + POOL_NAME, POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, + B_TRUE, zfs_keys_channel_program, + ARRAY_SIZE(zfs_keys_channel_program)); + + zfs_ioctl_register("redact", ZFS_IOC_REDACT, + zfs_ioc_redact, zfs_secpolicy_config, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_redact, ARRAY_SIZE(zfs_keys_redact)); + + zfs_ioctl_register("zpool_checkpoint", ZFS_IOC_POOL_CHECKPOINT, + zfs_ioc_pool_checkpoint, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_checkpoint, ARRAY_SIZE(zfs_keys_pool_checkpoint)); + + zfs_ioctl_register("zpool_discard_checkpoint", + ZFS_IOC_POOL_DISCARD_CHECKPOINT, zfs_ioc_pool_discard_checkpoint, + zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_discard_checkpoint, + ARRAY_SIZE(zfs_keys_pool_discard_checkpoint)); + + zfs_ioctl_register("initialize", ZFS_IOC_POOL_INITIALIZE, + zfs_ioc_pool_initialize, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_initialize, ARRAY_SIZE(zfs_keys_pool_initialize)); + + zfs_ioctl_register("trim", ZFS_IOC_POOL_TRIM, + zfs_ioc_pool_trim, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_TRUE, B_TRUE, + zfs_keys_pool_trim, ARRAY_SIZE(zfs_keys_pool_trim)); + + zfs_ioctl_register("wait", ZFS_IOC_WAIT, + zfs_ioc_wait, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_pool_wait, ARRAY_SIZE(zfs_keys_pool_wait)); + + zfs_ioctl_register("wait_fs", ZFS_IOC_WAIT_FS, + zfs_ioc_wait_fs, zfs_secpolicy_none, DATASET_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_FALSE, + zfs_keys_fs_wait, ARRAY_SIZE(zfs_keys_fs_wait)); + + zfs_ioctl_register("set_bootenv", ZFS_IOC_SET_BOOTENV, + zfs_ioc_set_bootenv, zfs_secpolicy_config, POOL_NAME, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY, B_FALSE, B_TRUE, + zfs_keys_set_bootenv, ARRAY_SIZE(zfs_keys_set_bootenv)); + + zfs_ioctl_register("get_bootenv", ZFS_IOC_GET_BOOTENV, + zfs_ioc_get_bootenv, zfs_secpolicy_none, POOL_NAME, + POOL_CHECK_SUSPENDED, B_FALSE, B_TRUE, + zfs_keys_get_bootenv, ARRAY_SIZE(zfs_keys_get_bootenv)); + + /* IOCTLS that use the legacy function signature */ + + zfs_ioctl_register_legacy(ZFS_IOC_POOL_FREEZE, zfs_ioc_pool_freeze, + zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_READONLY); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_CREATE, zfs_ioc_pool_create, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SCAN, + zfs_ioc_pool_scan); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_UPGRADE, + zfs_ioc_pool_upgrade); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ADD, + zfs_ioc_vdev_add); + 
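+	/*
+	 * Each of these pool_modify registrations (above and below) gets
+	 * zfs_secpolicy_config, logged history, and both the suspended and
+	 * read-only pool checks; see zfs_ioctl_register_pool_modify().
+	 */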
zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_REMOVE, + zfs_ioc_vdev_remove); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SET_STATE, + zfs_ioc_vdev_set_state); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_ATTACH, + zfs_ioc_vdev_attach); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_DETACH, + zfs_ioc_vdev_detach); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETPATH, + zfs_ioc_vdev_setpath); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SETFRU, + zfs_ioc_vdev_setfru); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_SET_PROPS, + zfs_ioc_pool_set_props); + zfs_ioctl_register_pool_modify(ZFS_IOC_VDEV_SPLIT, + zfs_ioc_vdev_split); + zfs_ioctl_register_pool_modify(ZFS_IOC_POOL_REGUID, + zfs_ioc_pool_reguid); + + zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_CONFIGS, + zfs_ioc_pool_configs, zfs_secpolicy_none); + zfs_ioctl_register_pool_meta(ZFS_IOC_POOL_TRYIMPORT, + zfs_ioc_pool_tryimport, zfs_secpolicy_config); + zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_FAULT, + zfs_ioc_inject_fault, zfs_secpolicy_inject); + zfs_ioctl_register_pool_meta(ZFS_IOC_CLEAR_FAULT, + zfs_ioc_clear_fault, zfs_secpolicy_inject); + zfs_ioctl_register_pool_meta(ZFS_IOC_INJECT_LIST_NEXT, + zfs_ioc_inject_list_next, zfs_secpolicy_inject); + + /* + * pool destroy, and export don't log the history as part of + * zfsdev_ioctl, but rather zfs_ioc_pool_export + * does the logging of those commands. + */ + zfs_ioctl_register_pool(ZFS_IOC_POOL_DESTROY, zfs_ioc_pool_destroy, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_ioctl_register_pool(ZFS_IOC_POOL_EXPORT, zfs_ioc_pool_export, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_STATS, zfs_ioc_pool_stats, + zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_PROPS, zfs_ioc_pool_get_props, + zfs_secpolicy_read, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_ERROR_LOG, zfs_ioc_error_log, + zfs_secpolicy_inject, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_ioctl_register_pool(ZFS_IOC_DSOBJ_TO_DSNAME, + zfs_ioc_dsobj_to_dsname, + zfs_secpolicy_diff, B_FALSE, POOL_CHECK_SUSPENDED); + zfs_ioctl_register_pool(ZFS_IOC_POOL_GET_HISTORY, + zfs_ioc_pool_get_history, + zfs_secpolicy_config, B_FALSE, POOL_CHECK_SUSPENDED); + + zfs_ioctl_register_pool(ZFS_IOC_POOL_IMPORT, zfs_ioc_pool_import, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_NONE); + + zfs_ioctl_register_pool(ZFS_IOC_CLEAR, zfs_ioc_clear, + zfs_secpolicy_config, B_TRUE, POOL_CHECK_READONLY); + + zfs_ioctl_register_dataset_read(ZFS_IOC_SPACE_WRITTEN, + zfs_ioc_space_written); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_RECVD_PROPS, + zfs_ioc_objset_recvd_props); + zfs_ioctl_register_dataset_read(ZFS_IOC_NEXT_OBJ, + zfs_ioc_next_obj); + zfs_ioctl_register_dataset_read(ZFS_IOC_GET_FSACL, + zfs_ioc_get_fsacl); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_STATS, + zfs_ioc_objset_stats); + zfs_ioctl_register_dataset_read(ZFS_IOC_OBJSET_ZPLPROPS, + zfs_ioc_objset_zplprops); + zfs_ioctl_register_dataset_read(ZFS_IOC_DATASET_LIST_NEXT, + zfs_ioc_dataset_list_next); + zfs_ioctl_register_dataset_read(ZFS_IOC_SNAPSHOT_LIST_NEXT, + zfs_ioc_snapshot_list_next); + zfs_ioctl_register_dataset_read(ZFS_IOC_SEND_PROGRESS, + zfs_ioc_send_progress); + + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_DIFF, + zfs_ioc_diff, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_STATS, + zfs_ioc_obj_to_stats, zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_OBJ_TO_PATH, + zfs_ioc_obj_to_path, 
zfs_secpolicy_diff); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_ONE, + zfs_ioc_userspace_one, zfs_secpolicy_userspace_one); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_USERSPACE_MANY, + zfs_ioc_userspace_many, zfs_secpolicy_userspace_many); + zfs_ioctl_register_dataset_read_secpolicy(ZFS_IOC_SEND, + zfs_ioc_send, zfs_secpolicy_send); + + zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_PROP, zfs_ioc_set_prop, + zfs_secpolicy_none); + zfs_ioctl_register_dataset_modify(ZFS_IOC_DESTROY, zfs_ioc_destroy, + zfs_secpolicy_destroy); + zfs_ioctl_register_dataset_modify(ZFS_IOC_RENAME, zfs_ioc_rename, + zfs_secpolicy_rename); + zfs_ioctl_register_dataset_modify(ZFS_IOC_RECV, zfs_ioc_recv, + zfs_secpolicy_recv); + zfs_ioctl_register_dataset_modify(ZFS_IOC_PROMOTE, zfs_ioc_promote, + zfs_secpolicy_promote); + zfs_ioctl_register_dataset_modify(ZFS_IOC_INHERIT_PROP, + zfs_ioc_inherit_prop, zfs_secpolicy_inherit_prop); + zfs_ioctl_register_dataset_modify(ZFS_IOC_SET_FSACL, zfs_ioc_set_fsacl, + zfs_secpolicy_set_fsacl); + + zfs_ioctl_register_dataset_nolog(ZFS_IOC_SHARE, zfs_ioc_share, + zfs_secpolicy_share, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_SMB_ACL, zfs_ioc_smb_acl, + zfs_secpolicy_smb_acl, POOL_CHECK_NONE); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_USERSPACE_UPGRADE, + zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); + zfs_ioctl_register_dataset_nolog(ZFS_IOC_TMP_SNAPSHOT, + zfs_ioc_tmp_snapshot, zfs_secpolicy_tmp_snapshot, + POOL_CHECK_SUSPENDED | POOL_CHECK_READONLY); + + zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_NEXT, zfs_ioc_events_next, + zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_CLEAR, zfs_ioc_events_clear, + zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); + zfs_ioctl_register_legacy(ZFS_IOC_EVENTS_SEEK, zfs_ioc_events_seek, + zfs_secpolicy_config, NO_NAME, B_FALSE, POOL_CHECK_NONE); + + zfs_ioctl_init_os(); +} + +/* + * Verify that for non-legacy ioctls the input nvlist + * pairs match against the expected input. 
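+ *
+ * For example (editorial sketch, assuming the zfs_keys_hold
+ * descriptor registered above lists a required "holds" nvlist and an
+ * optional "cleanup_fd" int32): an input built as
+ *
+ *	nvlist_t *innvl = fnvlist_alloc();
+ *	nvlist_t *holds = fnvlist_alloc();
+ *	fnvlist_add_string(holds, "pool/fs@snap", "my-hold-tag");
+ *	fnvlist_add_nvlist(innvl, "holds", holds);
+ *
+ * is accepted, an unrecognized extra pair is rejected, and a missing
+ * "holds" pair is reported as a missing required key.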
+ * + * Possible errors are: + * ZFS_ERR_IOC_ARG_UNAVAIL An unrecognized nvpair was encountered + * ZFS_ERR_IOC_ARG_REQUIRED A required nvpair is missing + * ZFS_ERR_IOC_ARG_BADTYPE Invalid type for nvpair + */ +static int +zfs_check_input_nvpairs(nvlist_t *innvl, const zfs_ioc_vec_t *vec) +{ + const zfs_ioc_key_t *nvl_keys = vec->zvec_nvl_keys; + boolean_t required_keys_found = B_FALSE; + + /* + * examine each input pair + */ + for (nvpair_t *pair = nvlist_next_nvpair(innvl, NULL); + pair != NULL; pair = nvlist_next_nvpair(innvl, pair)) { + char *name = nvpair_name(pair); + data_type_t type = nvpair_type(pair); + boolean_t identified = B_FALSE; + + /* + * check pair against the documented names and type + */ + for (int k = 0; k < vec->zvec_nvl_key_count; k++) { + /* if not a wild card name, check for an exact match */ + if ((nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) == 0 && + strcmp(nvl_keys[k].zkey_name, name) != 0) + continue; + + identified = B_TRUE; + + if (nvl_keys[k].zkey_type != DATA_TYPE_ANY && + nvl_keys[k].zkey_type != type) { + return (SET_ERROR(ZFS_ERR_IOC_ARG_BADTYPE)); + } + + if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) + continue; + + required_keys_found = B_TRUE; + break; + } + + /* allow an 'optional' key, everything else is invalid */ + if (!identified && + (strcmp(name, "optional") != 0 || + type != DATA_TYPE_NVLIST)) { + return (SET_ERROR(ZFS_ERR_IOC_ARG_UNAVAIL)); + } + } + + /* verify that all required keys were found */ + for (int k = 0; k < vec->zvec_nvl_key_count; k++) { + if (nvl_keys[k].zkey_flags & ZK_OPTIONAL) + continue; + + if (nvl_keys[k].zkey_flags & ZK_WILDCARDLIST) { + /* at least one non-optional key is expected here */ + if (!required_keys_found) + return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); + continue; + } + + if (!nvlist_exists(innvl, nvl_keys[k].zkey_name)) + return (SET_ERROR(ZFS_ERR_IOC_ARG_REQUIRED)); + } + + return (0); +} + +static int +pool_status_check(const char *name, zfs_ioc_namecheck_t type, + zfs_ioc_poolcheck_t check) +{ + spa_t *spa; + int error; + + ASSERT(type == POOL_NAME || type == DATASET_NAME || + type == ENTITY_NAME); + + if (check & POOL_CHECK_NONE) + return (0); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if ((check & POOL_CHECK_SUSPENDED) && spa_suspended(spa)) + error = SET_ERROR(EAGAIN); + else if ((check & POOL_CHECK_READONLY) && !spa_writeable(spa)) + error = SET_ERROR(EROFS); + spa_close(spa, FTAG); + } + return (error); +} + +int +zfsdev_getminor(int fd, minor_t *minorp) +{ + zfsdev_state_t *zs, *fpd; + zfs_file_t *fp; + int rc; + + ASSERT(!MUTEX_HELD(&zfsdev_state_lock)); + + if ((rc = zfs_file_get(fd, &fp))) + return (rc); + + fpd = zfs_file_private(fp); + if (fpd == NULL) + return (SET_ERROR(EBADF)); + + mutex_enter(&zfsdev_state_lock); + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + + if (zs->zs_minor == -1) + continue; + + if (fpd == zs) { + *minorp = fpd->zs_minor; + mutex_exit(&zfsdev_state_lock); + return (0); + } + } + + mutex_exit(&zfsdev_state_lock); + + return (SET_ERROR(EBADF)); +} + +static void * +zfsdev_get_state_impl(minor_t minor, enum zfsdev_state_type which) +{ + zfsdev_state_t *zs; + + for (zs = zfsdev_state_list; zs != NULL; zs = zs->zs_next) { + if (zs->zs_minor == minor) { + smp_rmb(); + switch (which) { + case ZST_ONEXIT: + return (zs->zs_onexit); + case ZST_ZEVENT: + return (zs->zs_zevent); + case ZST_ALL: + return (zs); + } + } + } + + return (NULL); +} + +void * +zfsdev_get_state(minor_t minor, enum zfsdev_state_type which) +{ + void *ptr; + + ptr = 
zfsdev_get_state_impl(minor, which); + + return (ptr); +} + +/* + * Find a free minor number. The zfsdev_state_list is expected to + * be short since it is only a list of currently open file handles. + */ +minor_t +zfsdev_minor_alloc(void) +{ + static minor_t last_minor = 0; + minor_t m; + + ASSERT(MUTEX_HELD(&zfsdev_state_lock)); + + for (m = last_minor + 1; m != last_minor; m++) { + if (m > ZFSDEV_MAX_MINOR) + m = 1; + if (zfsdev_get_state_impl(m, ZST_ALL) == NULL) { + last_minor = m; + return (m); + } + } + + return (0); +} + +long +zfsdev_ioctl_common(uint_t vecnum, zfs_cmd_t *zc, int flag) +{ + int error, cmd; + const zfs_ioc_vec_t *vec; + char *saved_poolname = NULL; + uint64_t max_nvlist_src_size; + size_t saved_poolname_len = 0; + nvlist_t *innvl = NULL; + fstrans_cookie_t cookie; + + cmd = vecnum; + error = 0; + if (vecnum >= sizeof (zfs_ioc_vec) / sizeof (zfs_ioc_vec[0])) + return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); + + vec = &zfs_ioc_vec[vecnum]; + + /* + * The registered ioctl list may be sparse; verify that either + * a normal or a legacy handler is registered. + */ + if (vec->zvec_func == NULL && vec->zvec_legacy_func == NULL) + return (SET_ERROR(ZFS_ERR_IOC_CMD_UNAVAIL)); + + zc->zc_iflags = flag & FKIOCTL; + max_nvlist_src_size = zfs_max_nvlist_src_size_os(); + if (zc->zc_nvlist_src_size > max_nvlist_src_size) { + /* + * Make sure the user doesn't pass in an insane value for + * zc_nvlist_src_size. We have to check, since we will end + * up allocating that much memory inside of get_nvlist(). This + * prevents a nefarious user from allocating tons of kernel + * memory. + * + * Also, we return EINVAL instead of ENOMEM here because + * returning ENOMEM from an ioctl() has a special connotation: + * the user's size value is too small and needs to be expanded + * to hold the nvlist. See zcmd_expand_dst_nvlist() for + * details. + */ + error = SET_ERROR(EINVAL); /* User's size too big */ + + } else if (zc->zc_nvlist_src_size != 0) { + error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &innvl); + if (error != 0) + goto out; + } + + /* + * Ensure that all pool/dataset names are valid before we pass down to + * the lower layers. + */ + zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + switch (vec->zvec_namecheck) { + case POOL_NAME: + if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) + error = SET_ERROR(EINVAL); + else + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + break; + + case DATASET_NAME: + if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) + error = SET_ERROR(EINVAL); + else + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + break; + + case ENTITY_NAME: + if (entity_namecheck(zc->zc_name, NULL, NULL) != 0) { + error = SET_ERROR(EINVAL); + } else { + error = pool_status_check(zc->zc_name, + vec->zvec_namecheck, vec->zvec_pool_check); + } + break; + + case NO_NAME: + break; + } + /* + * Ensure that all input pairs are valid before we pass them down + * to the lower layers. + * + * The vectored functions can use fnvlist_lookup_{type} for any + * required pairs since zfs_check_input_nvpairs() confirmed that + * they exist and are of the correct type.
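+ *
+ * Handler sketch (editorial; "fd" stands in for a hypothetical
+ * required DATA_TYPE_INT32 key in the handler's descriptor):
+ *
+ *	int32_t fd = fnvlist_lookup_int32(innvl, "fd");
+ *
+ * No existence or type check is needed at that point.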
+ */ + if (error == 0 && vec->zvec_func != NULL) { + error = zfs_check_input_nvpairs(innvl, vec); + if (error != 0) + goto out; + } + + if (error == 0) { + cookie = spl_fstrans_mark(); + error = vec->zvec_secpolicy(zc, innvl, CRED()); + spl_fstrans_unmark(cookie); + } + + if (error != 0) + goto out; + + /* legacy ioctls can modify zc_name */ + /* + * Can't use kmem_strdup() as we might truncate the string and + * kmem_strfree() would then free with incorrect size. + */ + saved_poolname_len = strlen(zc->zc_name) + 1; + saved_poolname = kmem_alloc(saved_poolname_len, KM_SLEEP); + + strlcpy(saved_poolname, zc->zc_name, saved_poolname_len); + saved_poolname[strcspn(saved_poolname, "/@#")] = '\0'; + + if (vec->zvec_func != NULL) { + nvlist_t *outnvl; + int puterror = 0; + spa_t *spa; + nvlist_t *lognv = NULL; + + ASSERT(vec->zvec_legacy_func == NULL); + + /* + * Add the innvl to the lognv before calling the func, + * in case the func changes the innvl. + */ + if (vec->zvec_allow_log) { + lognv = fnvlist_alloc(); + fnvlist_add_string(lognv, ZPOOL_HIST_IOCTL, + vec->zvec_name); + if (!nvlist_empty(innvl)) { + fnvlist_add_nvlist(lognv, ZPOOL_HIST_INPUT_NVL, + innvl); + } + } + + outnvl = fnvlist_alloc(); + cookie = spl_fstrans_mark(); + error = vec->zvec_func(zc->zc_name, innvl, outnvl); + spl_fstrans_unmark(cookie); + + /* + * Some commands can partially execute, modify state, and still + * return an error. In these cases, attempt to record what + * was modified. + */ + if ((error == 0 || + (cmd == ZFS_IOC_CHANNEL_PROGRAM && error != EINVAL)) && + vec->zvec_allow_log && + spa_open(zc->zc_name, &spa, FTAG) == 0) { + if (!nvlist_empty(outnvl)) { + fnvlist_add_nvlist(lognv, ZPOOL_HIST_OUTPUT_NVL, + outnvl); + } + if (error != 0) { + fnvlist_add_int64(lognv, ZPOOL_HIST_ERRNO, + error); + } + (void) spa_history_log_nvl(spa, lognv); + spa_close(spa, FTAG); + } + fnvlist_free(lognv); + + if (!nvlist_empty(outnvl) || zc->zc_nvlist_dst_size != 0) { + int smusherror = 0; + if (vec->zvec_smush_outnvlist) { + smusherror = nvlist_smush(outnvl, + zc->zc_nvlist_dst_size); + } + if (smusherror == 0) + puterror = put_nvlist(zc, outnvl); + } + + if (puterror != 0) + error = puterror; + + nvlist_free(outnvl); + } else { + cookie = spl_fstrans_mark(); + error = vec->zvec_legacy_func(zc); + spl_fstrans_unmark(cookie); + } + +out: + nvlist_free(innvl); + if (error == 0 && vec->zvec_allow_log) { + char *s = tsd_get(zfs_allow_log_key); + if (s != NULL) + kmem_strfree(s); + (void) tsd_set(zfs_allow_log_key, kmem_strdup(saved_poolname)); + } + if (saved_poolname != NULL) + kmem_free(saved_poolname, saved_poolname_len); + + return (error); +} + +int +zfs_kmod_init(void) +{ + int error; + + if ((error = zvol_init()) != 0) + return (error); + + spa_init(SPA_MODE_READ | SPA_MODE_WRITE); + zfs_init(); + + zfs_ioctl_init(); + + mutex_init(&zfsdev_state_lock, NULL, MUTEX_DEFAULT, NULL); + zfsdev_state_list = kmem_zalloc(sizeof (zfsdev_state_t), KM_SLEEP); + zfsdev_state_list->zs_minor = -1; + + if ((error = zfsdev_attach()) != 0) + goto out; + + tsd_create(&zfs_fsyncer_key, NULL); + tsd_create(&rrw_tsd_key, rrw_tsd_destroy); + tsd_create(&zfs_allow_log_key, zfs_allow_log_destroy); + + return (0); +out: + zfs_fini(); + spa_fini(); + zvol_fini(); + + return (error); +} + +void +zfs_kmod_fini(void) +{ + zfsdev_state_t *zs, *zsnext = NULL; + + zfsdev_detach(); + + mutex_destroy(&zfsdev_state_lock); + + for (zs = zfsdev_state_list; zs != NULL; zs = zsnext) { + zsnext = zs->zs_next; + if (zs->zs_onexit) + 
zfs_onexit_destroy(zs->zs_onexit); + if (zs->zs_zevent) + zfs_zevent_destroy(zs->zs_zevent); + kmem_free(zs, sizeof (zfsdev_state_t)); + } + + zfs_fini(); + spa_fini(); + zvol_fini(); + + tsd_destroy(&zfs_fsyncer_key); + tsd_destroy(&rrw_tsd_key); + tsd_destroy(&zfs_allow_log_key); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, max_nvlist_src_size, ULONG, ZMOD_RW, + "Maximum size in bytes allowed for src nvlist passed with ZFS ioctls"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_log.c b/module/zfs/zfs_log.c new file mode 100644 index 000000000000..4eae855f4274 --- /dev/null +++ b/module/zfs/zfs_log.c @@ -0,0 +1,774 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2015, 2018 by Delphix. All rights reserved. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * These zfs_log_* functions must be called within a dmu tx, in one + * of 2 contexts depending on zilog->z_replay: + * + * Non replay mode + * --------------- + * We need to record the transaction so that if it is committed to + * the Intent Log then it can be replayed. An intent log transaction + * structure (itx_t) is allocated and all the information necessary to + * possibly replay the transaction is saved in it. The itx is then assigned + * a sequence number and inserted in the in-memory list anchored in the zilog. + * + * Replay mode + * ----------- + * We need to mark the intent log record as replayed in the log header. + * This is done in the same transaction as the replay so that they + * commit atomically. + */ + +int +zfs_log_create_txtype(zil_create_t type, vsecattr_t *vsecp, vattr_t *vap) +{ + int isxvattr = (vap->va_mask & ATTR_XVATTR); + switch (type) { + case Z_FILE: + if (vsecp == NULL && !isxvattr) + return (TX_CREATE); + if (vsecp && isxvattr) + return (TX_CREATE_ACL_ATTR); + if (vsecp) + return (TX_CREATE_ACL); + else + return (TX_CREATE_ATTR); + /*NOTREACHED*/ + case Z_DIR: + if (vsecp == NULL && !isxvattr) + return (TX_MKDIR); + if (vsecp && isxvattr) + return (TX_MKDIR_ACL_ATTR); + if (vsecp) + return (TX_MKDIR_ACL); + else + return (TX_MKDIR_ATTR); + case Z_XATTRDIR: + return (TX_MKXATTR); + } + ASSERT(0); + return (TX_MAX_TYPE); +} + +/* + * build up the log data necessary for logging xvattr_t + * First lr_attr_t is initialized. following the lr_attr_t + * is the mapsize and attribute bitmap copied from the xvattr_t. + * Following the bitmap and bitmapsize two 64 bit words are reserved + * for the create time which may be set. 
A single 64 bit + * integer between the bitmap and the create time holds the bits to + * set on replay for the xvattr; the scanstamp (or project id) area + * follows the create time. + */ +static void +zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + uint32_t *bitmap; + uint64_t *attrs; + uint64_t *crtime; + xoptattr_t *xoap; + void *scanstamp; + int i; + + xoap = xva_getxoptattr(xvap); + ASSERT(xoap); + + lrattr->lr_attr_masksize = xvap->xva_mapsize; + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != xvap->xva_mapsize; i++, bitmap++) { + *bitmap = xvap->xva_reqattrmap[i]; + } + + /* Now pack the attributes up in a single uint64_t */ + attrs = (uint64_t *)bitmap; + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + *attrs = 0; + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + *attrs |= (xoap->xoa_readonly == 0) ? 0 : + XAT0_READONLY; + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + *attrs |= (xoap->xoa_hidden == 0) ? 0 : + XAT0_HIDDEN; + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + *attrs |= (xoap->xoa_system == 0) ? 0 : + XAT0_SYSTEM; + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + *attrs |= (xoap->xoa_archive == 0) ? 0 : + XAT0_ARCHIVE; + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + *attrs |= (xoap->xoa_immutable == 0) ? 0 : + XAT0_IMMUTABLE; + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + *attrs |= (xoap->xoa_nounlink == 0) ? 0 : + XAT0_NOUNLINK; + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + *attrs |= (xoap->xoa_appendonly == 0) ? 0 : + XAT0_APPENDONLY; + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + *attrs |= (xoap->xoa_opaque == 0) ? 0 : + XAT0_OPAQUE; + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + *attrs |= (xoap->xoa_nodump == 0) ? 0 : + XAT0_NODUMP; + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + *attrs |= (xoap->xoa_av_quarantined == 0) ? 0 : + XAT0_AV_QUARANTINED; + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + *attrs |= (xoap->xoa_av_modified == 0) ? 0 : + XAT0_AV_MODIFIED; + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); + + bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); + } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + /* + * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid + * at the same time, so we can share the same space. + */ + bcopy(&xoap->xoa_projid, scanstamp, sizeof (uint64_t)); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + *attrs |= (xoap->xoa_reparse == 0) ? 0 : + XAT0_REPARSE; + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + *attrs |= (xoap->xoa_offline == 0) ? 0 : + XAT0_OFFLINE; + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + *attrs |= (xoap->xoa_sparse == 0) ? 0 : + XAT0_SPARSE; + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) + *attrs |= (xoap->xoa_projinherit == 0) ? 0 : + XAT0_PROJINHERIT; +}
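+
+/*
+ * For reference, an editorial sketch of the variable-length region
+ * packed above (and unpacked again by zfs_replay_xvattr()):
+ *
+ *	masksize | bitmap[masksize] | attr bits (1 word) |
+ *	crtime (2 words) | scanstamp or project id
+ */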
+ +static void * +zfs_log_fuid_ids(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_t *zfuid; + uint64_t *fuidloc = start; + + /* First copy in the ACE FUIDs */ + for (zfuid = list_head(&fuidp->z_fuids); zfuid; + zfuid = list_next(&fuidp->z_fuids, zfuid)) { + *fuidloc++ = zfuid->z_logfuid; + } + return (fuidloc); +} + + +static void * +zfs_log_fuid_domains(zfs_fuid_info_t *fuidp, void *start) +{ + zfs_fuid_domain_t *zdomain; + + /* now copy in the domain info, if any */ + if (fuidp->z_domain_str_sz != 0) { + for (zdomain = list_head(&fuidp->z_domains); zdomain; + zdomain = list_next(&fuidp->z_domains, zdomain)) { + bcopy((void *)zdomain->z_domain, start, + strlen(zdomain->z_domain) + 1); + start = (caddr_t)start + + strlen(zdomain->z_domain) + 1; + } + } + return (start); +} + +/* + * If zp is an xattr node, check whether the xattr owner is unlinked. + * We don't want to log anything if the owner is unlinked. + */ +static int +zfs_xattr_owner_unlinked(znode_t *zp) +{ + int unlinked = 0; + znode_t *dzp; +#ifdef __FreeBSD__ + znode_t *tzp = zp; + + /* + * zrele drops the vnode lock which violates the VOP locking contract + * on FreeBSD. See comment at the top of zfs_replay.c for more detail. + */ + /* + * if zp is an XATTR node, keep walking up via z_xattr_parent until we + * get the owner + */ + while (tzp->z_pflags & ZFS_XATTR) { + ASSERT3U(zp->z_xattr_parent, !=, 0); + if (zfs_zget(ZTOZSB(tzp), tzp->z_xattr_parent, &dzp) != 0) { + unlinked = 1; + break; + } + + if (tzp != zp) + zrele(tzp); + tzp = dzp; + unlinked = tzp->z_unlinked; + } + if (tzp != zp) + zrele(tzp); +#else + zhold(zp); + /* + * if zp is an XATTR node, keep walking up via z_xattr_parent until we + * get the owner + */ + while (zp->z_pflags & ZFS_XATTR) { + ASSERT3U(zp->z_xattr_parent, !=, 0); + if (zfs_zget(ZTOZSB(zp), zp->z_xattr_parent, &dzp) != 0) { + unlinked = 1; + break; + } + + zrele(zp); + zp = dzp; + unlinked = zp->z_unlinked; + } + zrele(zp); +#endif + return (unlinked); +} + +/* + * Handles TX_CREATE, TX_CREATE_ATTR, TX_MKDIR, TX_MKDIR_ATTR and + * TX_MKXATTR transactions. + * + * TX_CREATE and TX_MKDIR are standard creates, but they may have FUID + * domain information appended prior to the name. In this case the + * uid/gid in the log record will be a log centric FUID. + * + * TX_CREATE_ACL_ATTR and TX_MKDIR_ACL_ATTR handle special creates that + * may contain attributes, an ACL and optional FUID information. + * + * TX_CREATE_ACL and TX_MKDIR_ACL handle special creates that specify + * an ACL and normal users/groups in the ACEs. + * + * There may also be optional xvattr attribute information, similar + * to zfs_log_setattr. + * + * Also, "domain" strings may be appended after the file name. + */ +void +zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, vsecattr_t *vsecp, + zfs_fuid_info_t *fuidp, vattr_t *vap) +{ + itx_t *itx; + lr_create_t *lr; + lr_acl_create_t *lracl; + size_t aclsize = (vsecp != NULL) ? vsecp->vsa_aclentsz : 0; + size_t xvatsize = 0; + size_t txsize; + xvattr_t *xvap = (xvattr_t *)vap; + void *end; + size_t lrsize; + size_t namesize = strlen(name) + 1; + size_t fuidsz = 0; + + if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) + return; + + /* + * If we have FUIDs present then add in space for + * domains and ACE FUIDs if any.
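+ *
+ * (Editorial sketch of the record layout assembled below; pieces
+ * are present only when applicable:
+ * [lr_create_t or lr_acl_create_t] [xvattr] [ACEs] [FUID ids]
+ * [FUID domains] [name].)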
+ */ + if (fuidp) { + fuidsz += fuidp->z_domain_str_sz; + fuidsz += fuidp->z_fuid_cnt * sizeof (uint64_t); + } + + if (vap->va_mask & ATTR_XVATTR) + xvatsize = ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if ((int)txtype == TX_CREATE_ATTR || (int)txtype == TX_MKDIR_ATTR || + (int)txtype == TX_CREATE || (int)txtype == TX_MKDIR || + (int)txtype == TX_MKXATTR) { + txsize = sizeof (*lr) + namesize + fuidsz + xvatsize; + lrsize = sizeof (*lr); + } else { + txsize = + sizeof (lr_acl_create_t) + namesize + fuidsz + + ZIL_ACE_LENGTH(aclsize) + xvatsize; + lrsize = sizeof (lr_acl_create_t); + } + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + /* Store dnode slot count in 8 bits above object id. */ + LR_FOID_SET_SLOTS(lr->lr_foid, zp->z_dnodesize >> DNODE_SHIFT); + lr->lr_mode = zp->z_mode; + if (!IS_EPHEMERAL(KUID_TO_SUID(ZTOUID(zp)))) { + lr->lr_uid = (uint64_t)KUID_TO_SUID(ZTOUID(zp)); + } else { + lr->lr_uid = fuidp->z_fuid_owner; + } + if (!IS_EPHEMERAL(KGID_TO_SGID(ZTOGID(zp)))) { + lr->lr_gid = (uint64_t)KGID_TO_SGID(ZTOGID(zp)); + } else { + lr->lr_gid = fuidp->z_fuid_group; + } + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), + lr->lr_crtime, sizeof (uint64_t) * 2); + + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_RDEV(ZTOZSB(zp)), &lr->lr_rdev, + sizeof (lr->lr_rdev)) != 0) + lr->lr_rdev = 0; + + /* + * Fill in xvattr info if any + */ + if (vap->va_mask & ATTR_XVATTR) { + zfs_log_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), xvap); + end = (caddr_t)lr + lrsize + xvatsize; + } else { + end = (caddr_t)lr + lrsize; + } + + /* Now fill in any ACL info */ + + if (vsecp) { + lracl = (lr_acl_create_t *)&itx->itx_lr; + lracl->lr_aclcnt = vsecp->vsa_aclcnt; + lracl->lr_acl_bytes = aclsize; + lracl->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lracl->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_aclflags & VSA_ACE_ACLFLAGS) + lracl->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lracl->lr_acl_flags = 0; + + bcopy(vsecp->vsa_aclentp, end, aclsize); + end = (caddr_t)end + ZIL_ACE_LENGTH(aclsize); + } + + /* drop in FUID info */ + if (fuidp) { + end = zfs_log_fuid_ids(fuidp, end); + end = zfs_log_fuid_domains(fuidp, end); + } + /* + * Now place file name in log record + */ + bcopy(name, end, namesize); + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles both TX_REMOVE and TX_RMDIR transactions. + */ +void +zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, char *name, uint64_t foid, boolean_t unlinked) +{ + itx_t *itx; + lr_remove_t *lr; + size_t namesize = strlen(name) + 1; + + if (zil_replaying(zilog, tx) || zfs_xattr_owner_unlinked(dzp)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_remove_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + itx->itx_oid = foid; + + /* + * Object ids can be re-instantiated in the next txg so + * remove any async transactions to avoid future leaks. + * This can happen if a fsync occurs on the re-instantiated + * object for a WR_INDIRECT or WR_NEED_COPY write, which gets + * the new file data and flushes a write record for the old object. + */ + if (unlinked) { + ASSERT((txtype & ~TX_CI) == TX_REMOVE); + zil_remove_async(zilog, foid); + } + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_LINK transactions. 
+ */ +void +zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name) +{ + itx_t *itx; + lr_link_t *lr; + size_t namesize = strlen(name) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize); + lr = (lr_link_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_link_obj = zp->z_id; + bcopy(name, (char *)(lr + 1), namesize); + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_SYMLINK transactions. + */ +void +zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *dzp, znode_t *zp, char *name, char *link) +{ + itx_t *itx; + lr_create_t *lr; + size_t namesize = strlen(name) + 1; + size_t linksize = strlen(link) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); + lr = (lr_create_t *)&itx->itx_lr; + lr->lr_doid = dzp->z_id; + lr->lr_foid = zp->z_id; + lr->lr_uid = KUID_TO_SUID(ZTOUID(zp)); + lr->lr_gid = KGID_TO_SGID(ZTOGID(zp)); + lr->lr_mode = zp->z_mode; + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_GEN(ZTOZSB(zp)), &lr->lr_gen, + sizeof (uint64_t)); + (void) sa_lookup(zp->z_sa_hdl, SA_ZPL_CRTIME(ZTOZSB(zp)), + lr->lr_crtime, sizeof (uint64_t) * 2); + bcopy(name, (char *)(lr + 1), namesize); + bcopy(link, (char *)(lr + 1) + namesize, linksize); + + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_RENAME transactions. + */ +void +zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, + znode_t *sdzp, char *sname, znode_t *tdzp, char *dname, znode_t *szp) +{ + itx_t *itx; + lr_rename_t *lr; + size_t snamesize = strlen(sname) + 1; + size_t dnamesize = strlen(dname) + 1; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); + lr = (lr_rename_t *)&itx->itx_lr; + lr->lr_sdoid = sdzp->z_id; + lr->lr_tdoid = tdzp->z_id; + bcopy(sname, (char *)(lr + 1), snamesize); + bcopy(dname, (char *)(lr + 1) + snamesize, dnamesize); + itx->itx_oid = szp->z_id; + + zil_itx_assign(zilog, itx, tx); +} + +/* + * zfs_log_write() handles TX_WRITE transactions. The specified callback is + * called as soon as the write is on stable storage (be it via a DMU sync or a + * ZIL commit). + */ +long zfs_immediate_write_sz = 32768; + +void +zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, offset_t off, ssize_t resid, int ioflag, + zil_callback_t callback, void *callback_data) +{ + dmu_buf_impl_t *db = (dmu_buf_impl_t *)sa_get_db(zp->z_sa_hdl); + uint32_t blocksize = zp->z_blksz; + itx_wr_state_t write_state; + uintptr_t fsync_cnt; + + if (zil_replaying(zilog, tx) || zp->z_unlinked || + zfs_xattr_owner_unlinked(zp)) { + if (callback != NULL) + callback(callback_data); + return; + } + + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + write_state = WR_INDIRECT; + else if (!spa_has_slogs(zilog->zl_spa) && + resid >= zfs_immediate_write_sz) + write_state = WR_INDIRECT; + else if (ioflag & (O_SYNC | O_DSYNC)) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + if ((fsync_cnt = (uintptr_t)tsd_get(zfs_fsyncer_key)) != 0) { + (void) tsd_set(zfs_fsyncer_key, (void *)(fsync_cnt - 1)); + } + + while (resid) { + itx_t *itx; + lr_write_t *lr; + itx_wr_state_t wr_state = write_state; + ssize_t len = resid; + + /* + * A WR_COPIED record must fit entirely in one log block. + * Large writes can use WR_NEED_COPY, which the ZIL will + * split into multiple records across several log blocks + * if necessary. 
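+ *
+ * Summary of the write-state policy chosen above (editorial):
+ *	logbias=throughput                        -> WR_INDIRECT
+ *	no slog, resid >= zfs_immediate_write_sz  -> WR_INDIRECT
+ *	O_SYNC/O_DSYNC                            -> WR_COPIED
+ *	everything else                           -> WR_NEED_COPY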
+ */ + if (wr_state == WR_COPIED && + resid > zil_max_copied_data(zilog)) + wr_state = WR_NEED_COPY; + else if (wr_state == WR_INDIRECT) + len = MIN(blocksize - P2PHASE(off, blocksize), resid); + + itx = zil_itx_create(txtype, sizeof (*lr) + + (wr_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + + DB_DNODE_ENTER(db); + if (wr_state == WR_COPIED && dmu_read_by_dnode(DB_DNODE(db), + off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + wr_state = WR_NEED_COPY; + } + DB_DNODE_EXIT(db); + + itx->itx_wr_state = wr_state; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = ZTOZSB(zp); + + if (!(ioflag & (O_SYNC | O_DSYNC)) && (zp->z_sync_cnt == 0) && + (fsync_cnt == 0)) + itx->itx_sync = B_FALSE; + + itx->itx_callback = callback; + itx->itx_callback_data = callback_data; + zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; + } +} + +/* + * Handles TX_TRUNCATE transactions. + */ +void +zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, uint64_t off, uint64_t len) +{ + itx_t *itx; + lr_truncate_t *lr; + + if (zil_replaying(zilog, tx) || zp->z_unlinked || + zfs_xattr_owner_unlinked(zp)) + return; + + itx = zil_itx_create(txtype, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_SETATTR transactions. + */ +void +zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, + znode_t *zp, vattr_t *vap, uint_t mask_applied, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + lr_setattr_t *lr; + xvattr_t *xvap = (xvattr_t *)vap; + size_t recsize = sizeof (lr_setattr_t); + void *start; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + /* + * If XVATTR set, then log record size needs to allow + * for lr_attr_t + xvattr mask, mapsize and create time + * plus actual attribute values + */ + if (vap->va_mask & ATTR_XVATTR) + recsize = sizeof (*lr) + ZIL_XVAT_SIZE(xvap->xva_mapsize); + + if (fuidp) + recsize += fuidp->z_domain_str_sz; + + itx = zil_itx_create(txtype, recsize); + lr = (lr_setattr_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + lr->lr_mask = (uint64_t)mask_applied; + lr->lr_mode = (uint64_t)vap->va_mode; + if ((mask_applied & ATTR_UID) && IS_EPHEMERAL(vap->va_uid)) + lr->lr_uid = fuidp->z_fuid_owner; + else + lr->lr_uid = (uint64_t)vap->va_uid; + + if ((mask_applied & ATTR_GID) && IS_EPHEMERAL(vap->va_gid)) + lr->lr_gid = fuidp->z_fuid_group; + else + lr->lr_gid = (uint64_t)vap->va_gid; + + lr->lr_size = (uint64_t)vap->va_size; + ZFS_TIME_ENCODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_ENCODE(&vap->va_mtime, lr->lr_mtime); + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & ATTR_XVATTR) { + zfs_log_xvattr((lr_attr_t *)start, xvap); + start = (caddr_t)start + ZIL_XVAT_SIZE(xvap->xva_mapsize); + } + + /* + * Now stick on domain information if any on end + */ + + if (fuidp) + (void) zfs_log_fuid_domains(fuidp, start); + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* + * Handles TX_ACL transactions. 
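+ *
+ * Record sizing sketch (editorial, mirroring the txsize computation
+ * below):
+ *	TX_ACL_V0: lr_acl_v0_t | ACEs (aclbytes)
+ *	TX_ACL:    lr_acl_t | ACEs (ZIL_ACE_LENGTH(aclbytes)) |
+ *	           FUID ids | FUID domains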
+ */ +void +zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, + vsecattr_t *vsecp, zfs_fuid_info_t *fuidp) +{ + itx_t *itx; + lr_acl_v0_t *lrv0; + lr_acl_t *lr; + int txtype; + int lrsize; + size_t txsize; + size_t aclbytes = vsecp->vsa_aclentsz; + + if (zil_replaying(zilog, tx) || zp->z_unlinked) + return; + + txtype = (ZTOZSB(zp)->z_version < ZPL_VERSION_FUID) ? + TX_ACL_V0 : TX_ACL; + + if (txtype == TX_ACL) + lrsize = sizeof (*lr); + else + lrsize = sizeof (*lrv0); + + txsize = lrsize + + ((txtype == TX_ACL) ? ZIL_ACE_LENGTH(aclbytes) : aclbytes) + + (fuidp ? fuidp->z_domain_str_sz : 0) + + sizeof (uint64_t) * (fuidp ? fuidp->z_fuid_cnt : 0); + + itx = zil_itx_create(txtype, txsize); + + lr = (lr_acl_t *)&itx->itx_lr; + lr->lr_foid = zp->z_id; + if (txtype == TX_ACL) { + lr->lr_acl_bytes = aclbytes; + lr->lr_domcnt = fuidp ? fuidp->z_domain_cnt : 0; + lr->lr_fuidcnt = fuidp ? fuidp->z_fuid_cnt : 0; + if (vsecp->vsa_mask & VSA_ACE_ACLFLAGS) + lr->lr_acl_flags = (uint64_t)vsecp->vsa_aclflags; + else + lr->lr_acl_flags = 0; + } + lr->lr_aclcnt = (uint64_t)vsecp->vsa_aclcnt; + + if (txtype == TX_ACL_V0) { + lrv0 = (lr_acl_v0_t *)lr; + bcopy(vsecp->vsa_aclentp, (ace_t *)(lrv0 + 1), aclbytes); + } else { + void *start = (ace_t *)(lr + 1); + + bcopy(vsecp->vsa_aclentp, start, aclbytes); + + start = (caddr_t)start + ZIL_ACE_LENGTH(aclbytes); + + if (fuidp) { + start = zfs_log_fuid_ids(fuidp, start); + (void) zfs_log_fuid_domains(fuidp, start); + } + } + + itx->itx_sync = (zp->z_sync_cnt != 0); + zil_itx_assign(zilog, itx, tx); +} + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, immediate_write_sz, LONG, ZMOD_RW, + "Largest data block to write to zil"); +/* END CSTYLED */ diff --git a/module/zfs/zfs_onexit.c b/module/zfs/zfs_onexit.c new file mode 100644 index 000000000000..2a1332e715ee --- /dev/null +++ b/module/zfs/zfs_onexit.c @@ -0,0 +1,173 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2020 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * ZFS kernel routines may add/delete callback routines to be invoked + * upon process exit (triggered via the close operation from the /dev/zfs + * driver). + * + * These cleanup callbacks are intended to allow for the accumulation + * of kernel state across multiple ioctls. User processes participate + * simply by opening ZFS_DEV. This causes the ZFS driver to create + * some private data for the file descriptor and generate a unique + * minor number. The process then passes along that file descriptor to + * each ioctl that might have a cleanup operation.
+ * + * Consumers of the onexit routines should call zfs_onexit_fd_hold() early + * on to validate the given fd and add a reference to its file table entry. + * This allows the consumer to do its work and then add a callback, knowing + * that zfs_onexit_add_cb() won't fail with EBADF. When finished, consumers + * should call zfs_onexit_fd_rele(). + * + * A simple example is zfs_ioc_recv(), where we might create an AVL tree + * with dataset/GUID mappings and then reuse that tree on subsequent + * zfs_ioc_recv() calls. + * + * On the first zfs_ioc_recv() call, dmu_recv_stream() will kmem_alloc() + * the AVL tree and pass it along with a callback function to + * zfs_onexit_add_cb(). The zfs_onexit_add_cb() routine will register the + * callback and return an action handle. + * + * The action handle is then passed from user space to subsequent + * zfs_ioc_recv() calls, so that dmu_recv_stream() can fetch its AVL tree + * by calling zfs_onexit_cb_data() with the device minor number and + * action handle. + * + * If the user process exits abnormally, the callback is invoked implicitly + * as part of the driver close operation. Once the user space process is + * finished with the accumulated kernel state, it can also just call close(2) + * on the cleanup fd to trigger the cleanup callback. + */ + +void +zfs_onexit_init(zfs_onexit_t **zop) +{ + zfs_onexit_t *zo; + + zo = *zop = kmem_zalloc(sizeof (zfs_onexit_t), KM_SLEEP); + mutex_init(&zo->zo_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zo->zo_actions, sizeof (zfs_onexit_action_node_t), + offsetof(zfs_onexit_action_node_t, za_link)); +} + +void +zfs_onexit_destroy(zfs_onexit_t *zo) +{ + zfs_onexit_action_node_t *ap; + + mutex_enter(&zo->zo_lock); + while ((ap = list_head(&zo->zo_actions)) != NULL) { + list_remove(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + ap->za_func(ap->za_data); + kmem_free(ap, sizeof (zfs_onexit_action_node_t)); + mutex_enter(&zo->zo_lock); + } + mutex_exit(&zo->zo_lock); + + list_destroy(&zo->zo_actions); + mutex_destroy(&zo->zo_lock); + kmem_free(zo, sizeof (zfs_onexit_t)); +} + +/* + * Consumers might need to operate by minor number instead of fd, since + * they might be running in another thread (e.g. txg_sync_thread). Callers + * of this function must call zfs_onexit_fd_rele() when they're finished + * using the minor number. + */ +int +zfs_onexit_fd_hold(int fd, minor_t *minorp) +{ + zfs_onexit_t *zo = NULL; + int error; + + error = zfsdev_getminor(fd, minorp); + if (error) { + zfs_onexit_fd_rele(fd); + return (error); + } + + zo = zfsdev_get_state(*minorp, ZST_ONEXIT); + if (zo == NULL) { + zfs_onexit_fd_rele(fd); + return (SET_ERROR(EBADF)); + } + return (0); +} + +void +zfs_onexit_fd_rele(int fd) +{ + zfs_file_put(fd); +} + +static int +zfs_onexit_minor_to_state(minor_t minor, zfs_onexit_t **zo) +{ + *zo = zfsdev_get_state(minor, ZST_ONEXIT); + if (*zo == NULL) + return (SET_ERROR(EBADF)); + + return (0); +} + +/* + * Add a callback to be invoked when the calling process exits. 
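+ *
+ * Typical consumer sketch (illustrative only; my_cleanup_cb and
+ * my_state are hypothetical):
+ *
+ *	minor_t minor;
+ *	uint64_t handle;
+ *
+ *	if (zfs_onexit_fd_hold(cleanup_fd, &minor) == 0) {
+ *		(void) zfs_onexit_add_cb(minor, my_cleanup_cb,
+ *		    my_state, &handle);
+ *		zfs_onexit_fd_rele(cleanup_fd);
+ *	}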
+ */ +int +zfs_onexit_add_cb(minor_t minor, void (*func)(void *), void *data, + uint64_t *action_handle) +{ + zfs_onexit_t *zo; + zfs_onexit_action_node_t *ap; + int error; + + error = zfs_onexit_minor_to_state(minor, &zo); + if (error) + return (error); + + ap = kmem_alloc(sizeof (zfs_onexit_action_node_t), KM_SLEEP); + list_link_init(&ap->za_link); + ap->za_func = func; + ap->za_data = data; + + mutex_enter(&zo->zo_lock); + list_insert_tail(&zo->zo_actions, ap); + mutex_exit(&zo->zo_lock); + if (action_handle) + *action_handle = (uint64_t)(uintptr_t)ap; + + return (0); +} diff --git a/module/zfs/zfs_quota.c b/module/zfs/zfs_quota.c new file mode 100644 index 000000000000..e61db5c7ab83 --- /dev/null +++ b/module/zfs/zfs_quota.c @@ -0,0 +1,476 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011 Pawel Jakub Dawidek . + * All rights reserved. + * Copyright (c) 2012, 2015, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include + +int +zpl_get_file_info(dmu_object_type_t bonustype, const void *data, + zfs_file_info_t *zoi) +{ + /* + * Is it a valid type of object to track? + */ + if (bonustype != DMU_OT_ZNODE && bonustype != DMU_OT_SA) + return (SET_ERROR(ENOENT)); + + zoi->zfi_project = ZFS_DEFAULT_PROJID; + + /* + * If we have a NULL data pointer + * then assume the id's aren't changing and + * return EEXIST to the dmu to let it know to + * use the same ids + */ + if (data == NULL) + return (SET_ERROR(EEXIST)); + + if (bonustype == DMU_OT_ZNODE) { + const znode_phys_t *znp = data; + zoi->zfi_user = znp->zp_uid; + zoi->zfi_group = znp->zp_gid; + zoi->zfi_generation = znp->zp_gen; + return (0); + } + + const sa_hdr_phys_t *sap = data; + if (sap->sa_magic == 0) { + /* + * This should only happen for newly created files + * that haven't had the znode data filled in yet. 
+ */ + zoi->zfi_user = 0; + zoi->zfi_group = 0; + zoi->zfi_generation = 0; + return (0); + } + + sa_hdr_phys_t sa = *sap; + boolean_t swap = B_FALSE; + if (sa.sa_magic == BSWAP_32(SA_MAGIC)) { + sa.sa_magic = SA_MAGIC; + sa.sa_layout_info = BSWAP_16(sa.sa_layout_info); + swap = B_TRUE; + } + VERIFY3U(sa.sa_magic, ==, SA_MAGIC); + + int hdrsize = sa_hdrsize(&sa); + VERIFY3U(hdrsize, >=, sizeof (sa_hdr_phys_t)); + + uintptr_t data_after_hdr = (uintptr_t)data + hdrsize; + zoi->zfi_user = *((uint64_t *)(data_after_hdr + SA_UID_OFFSET)); + zoi->zfi_group = *((uint64_t *)(data_after_hdr + SA_GID_OFFSET)); + zoi->zfi_generation = *((uint64_t *)(data_after_hdr + SA_GEN_OFFSET)); + uint64_t flags = *((uint64_t *)(data_after_hdr + SA_FLAGS_OFFSET)); + if (swap) + flags = BSWAP_64(flags); + + if (flags & ZFS_PROJID) { + zoi->zfi_project = + *((uint64_t *)(data_after_hdr + SA_PROJID_OFFSET)); + } + + if (swap) { + zoi->zfi_user = BSWAP_64(zoi->zfi_user); + zoi->zfi_group = BSWAP_64(zoi->zfi_group); + zoi->zfi_project = BSWAP_64(zoi->zfi_project); + zoi->zfi_generation = BSWAP_64(zoi->zfi_generation); + } + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = zfs_strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + case ZFS_PROP_USEROBJUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + case ZFS_PROP_GROUPOBJUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_PROJECTUSED: + case ZFS_PROP_PROJECTOBJUSED: + return (DMU_PROJECTUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + case ZFS_PROP_USEROBJQUOTA: + return (zfsvfs->z_userobjquota_obj); + case ZFS_PROP_GROUPOBJQUOTA: + return (zfsvfs->z_groupobjquota_obj); + case ZFS_PROP_PROJECTQUOTA: + return (zfsvfs->z_projectquota_obj); + case ZFS_PROP_PROJECTOBJQUOTA: + return (zfsvfs->z_projectobjquota_obj); + default: + return (ZFS_NO_OBJECT); + } +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + int offset = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED) && + !dmu_objset_projectquota_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA) && + !dmu_objset_userobjspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == ZFS_NO_OBJECT) { + *bufsizep = 0; + return (0); + } + + if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_PROJECTOBJUSED) + offset = DMU_OBJACCT_PREFIX_LEN; + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = 
zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + /* + * skip object quota (with zap name prefix DMU_OBJACCT_PREFIX) + * when dealing with block quota and vice versa. + */ + if ((offset > 0) != (strncmp(za.za_name, DMU_OBJACCT_PREFIX, + DMU_OBJACCT_PREFIX_LEN) == 0)) + continue; + + fuidstr_to_sid(zfsvfs, za.za_name + offset, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + int offset = 0; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if ((type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_USEROBJQUOTA || type == ZFS_PROP_GROUPOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED || + type == ZFS_PROP_PROJECTOBJQUOTA) && + !dmu_objset_userobjspace_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + + if (type == ZFS_PROP_PROJECTQUOTA || type == ZFS_PROP_PROJECTUSED || + type == ZFS_PROP_PROJECTOBJQUOTA || + type == ZFS_PROP_PROJECTOBJUSED) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + } + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == ZFS_NO_OBJECT) + return (0); + + if (type == ZFS_PROP_USEROBJUSED || type == ZFS_PROP_GROUPOBJUSED || + type == ZFS_PROP_PROJECTOBJUSED) { + strlcpy(buf, DMU_OBJACCT_PREFIX, DMU_OBJACCT_PREFIX_LEN + 1); + offset = DMU_OBJACCT_PREFIX_LEN; + } + + err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf + offset, + sizeof (buf) - offset, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (SET_ERROR(ENOTSUP)); + + switch (type) { + case ZFS_PROP_USERQUOTA: + objp = &zfsvfs->z_userquota_obj; + break; + case ZFS_PROP_GROUPQUOTA: + objp = &zfsvfs->z_groupquota_obj; + break; + case ZFS_PROP_USEROBJQUOTA: + objp = &zfsvfs->z_userobjquota_obj; + break; + case ZFS_PROP_GROUPOBJQUOTA: + objp = &zfsvfs->z_groupobjquota_obj; + break; + case ZFS_PROP_PROJECTQUOTA: + if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + + objp = &zfsvfs->z_projectquota_obj; + break; + case ZFS_PROP_PROJECTOBJQUOTA: + if (!dmu_objset_projectquota_enabled(zfsvfs->z_os)) + return (SET_ERROR(ENOTSUP)); + if (!zpl_is_valid_projid(rid)) + return (SET_ERROR(EINVAL)); + + objp = &zfsvfs->z_projectobjquota_obj; + break; + default: + return (SET_ERROR(EINVAL)); + } + + err = zfs_id_to_fuidstr(zfsvfs, domain, rid, buf, sizeof (buf), B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, 
*objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_id_overobjquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + char buf[20 + DMU_OBJACCT_PREFIX_LEN]; + uint64_t used, quota, quotaobj; + int err; + + if (!dmu_objset_userobjspace_present(zfsvfs->z_os)) { + if (dmu_objset_userobjspace_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + + if (usedobj == DMU_PROJECTUSED_OBJECT) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { + if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + quotaobj = zfsvfs->z_projectobjquota_obj; + } else if (usedobj == DMU_USERUSED_OBJECT) { + quotaobj = zfsvfs->z_userobjquota_obj; + } else if (usedobj == DMU_GROUPUSED_OBJECT) { + quotaobj = zfsvfs->z_groupobjquota_obj; + } else { + return (B_FALSE); + } + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), DMU_OBJACCT_PREFIX "%llx", + (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_id_overblockquota(zfsvfs_t *zfsvfs, uint64_t usedobj, uint64_t id) +{ + char buf[20]; + uint64_t used, quota, quotaobj; + int err; + + if (usedobj == DMU_PROJECTUSED_OBJECT) { + if (!dmu_objset_projectquota_present(zfsvfs->z_os)) { + if (dmu_objset_projectquota_upgradable(zfsvfs->z_os)) { + dsl_pool_config_enter( + dmu_objset_pool(zfsvfs->z_os), FTAG); + dmu_objset_id_quota_upgrade(zfsvfs->z_os); + dsl_pool_config_exit( + dmu_objset_pool(zfsvfs->z_os), FTAG); + } + return (B_FALSE); + } + quotaobj = zfsvfs->z_projectquota_obj; + } else if (usedobj == DMU_USERUSED_OBJECT) { + quotaobj = zfsvfs->z_userquota_obj; + } else if (usedobj == DMU_GROUPUSED_OBJECT) { + quotaobj = zfsvfs->z_groupquota_obj; + } else { + return (B_FALSE); + } + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)id); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +boolean_t +zfs_id_overquota(zfsvfs_t
*zfsvfs, uint64_t usedobj, uint64_t id) +{ + return (zfs_id_overblockquota(zfsvfs, usedobj, id) || + zfs_id_overobjquota(zfsvfs, usedobj, id)); +} + +EXPORT_SYMBOL(zpl_get_file_info); +EXPORT_SYMBOL(zfs_userspace_one); +EXPORT_SYMBOL(zfs_userspace_many); +EXPORT_SYMBOL(zfs_set_userquota); +EXPORT_SYMBOL(zfs_id_overblockquota); +EXPORT_SYMBOL(zfs_id_overobjquota); +EXPORT_SYMBOL(zfs_id_overquota); diff --git a/module/zfs/zfs_ratelimit.c b/module/zfs/zfs_ratelimit.c new file mode 100644 index 000000000000..b18b480ce527 --- /dev/null +++ b/module/zfs/zfs_ratelimit.c @@ -0,0 +1,99 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, Lawrence Livermore National Security, LLC. + */ + +#include + +/* + * Initialize rate limit struct + * + * rl: zfs_ratelimit_t struct + * burst: Number to allow in an interval before rate limiting + * interval: Interval time in seconds + */ +void +zfs_ratelimit_init(zfs_ratelimit_t *rl, unsigned int *burst, + unsigned int interval) +{ + rl->count = 0; + rl->start = 0; + rl->interval = interval; + rl->burst = burst; + mutex_init(&rl->lock, NULL, MUTEX_DEFAULT, NULL); +} + +/* + * Finalize rate limit struct + * + * rl: zfs_ratelimit_t struct + */ +void +zfs_ratelimit_fini(zfs_ratelimit_t *rl) +{ + mutex_destroy(&rl->lock); +} + +/* + * Re-implementation of the kernel's __ratelimit() function + * + * We had to write our own rate limiter because the kernel's __ratelimit() + * function annoyingly prints out how many times it rate limited to the kernel + * logs (and there's no way to turn it off): + * + * __ratelimit: 59 callbacks suppressed + * + * If the kernel ever allows us to disable these prints, we should go back to + * using __ratelimit() instead. + * + * Return values are the same as __ratelimit(): + * + * 0: If we're rate limiting + * 1: If we're not rate limiting. + */ +int +zfs_ratelimit(zfs_ratelimit_t *rl) +{ + hrtime_t now; + + hrtime_t elapsed; + int error = 1; + + mutex_enter(&rl->lock); + + now = gethrtime(); + elapsed = now - rl->start; + + rl->count++; + if (NSEC2SEC(elapsed) >= rl->interval) { + rl->start = now; + rl->count = 0; + } else { + if (rl->count >= *rl->burst) { + error = 0; /* We're ratelimiting */ + } + } + mutex_exit(&rl->lock); + + return (error); +} diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c new file mode 100644 index 000000000000..cba5e8c9cd0b --- /dev/null +++ b/module/zfs/zfs_replay.c @@ -0,0 +1,992 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012 Cyril Plisko. All rights reserved. + * Copyright (c) 2013, 2017 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * NB: FreeBSD expects to be able to do vnode locking in lookup and + * hold the locks across all subsequent VOPs until vput is called. + * This means that its zfs vnops routines can't do any internal locking. + * In order to have the same contract as the Linux vnops, there would + * need to be duplicate locked vnops. If the vnops were used more widely + * in common code this would likely be preferable. However, currently + * this is the only file where this is the case. + */ + +/* + * Functions to replay ZFS intent log (ZIL) records + * The functions are called through a function vector (zfs_replay_vector) + * which is indexed by the transaction type. + */ + +static void +zfs_init_vattr(vattr_t *vap, uint64_t mask, uint64_t mode, + uint64_t uid, uint64_t gid, uint64_t rdev, uint64_t nodeid) +{ + bzero(vap, sizeof (*vap)); + vap->va_mask = (uint_t)mask; + vap->va_mode = mode; +#ifdef __FreeBSD__ + vap->va_type = IFTOVT(mode); +#endif + vap->va_uid = (uid_t)(IS_EPHEMERAL(uid)) ? -1 : uid; + vap->va_gid = (gid_t)(IS_EPHEMERAL(gid)) ?
-1 : gid; + vap->va_rdev = zfs_cmpldev(rdev); + vap->va_nodeid = nodeid; +} + +/* ARGSUSED */ +static int +zfs_replay_error(void *arg1, void *arg2, boolean_t byteswap) +{ + return (SET_ERROR(ENOTSUP)); +} + +static void +zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) +{ + xoptattr_t *xoap = NULL; + uint64_t *attrs; + uint64_t *crtime; + uint32_t *bitmap; + void *scanstamp; + int i; + + xvap->xva_vattr.va_mask |= ATTR_XVATTR; + if ((xoap = xva_getxoptattr(xvap)) == NULL) { + xvap->xva_vattr.va_mask &= ~ATTR_XVATTR; /* shouldn't happen */ + return; + } + + ASSERT(lrattr->lr_attr_masksize == xvap->xva_mapsize); + + bitmap = &lrattr->lr_attr_bitmap; + for (i = 0; i != lrattr->lr_attr_masksize; i++, bitmap++) + xvap->xva_reqattrmap[i] = *bitmap; + + attrs = (uint64_t *)(lrattr + lrattr->lr_attr_masksize - 1); + crtime = attrs + 1; + scanstamp = (caddr_t)(crtime + 2); + + if (XVA_ISSET_REQ(xvap, XAT_HIDDEN)) + xoap->xoa_hidden = ((*attrs & XAT0_HIDDEN) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SYSTEM)) + xoap->xoa_system = ((*attrs & XAT0_SYSTEM) != 0); + if (XVA_ISSET_REQ(xvap, XAT_ARCHIVE)) + xoap->xoa_archive = ((*attrs & XAT0_ARCHIVE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_READONLY)) + xoap->xoa_readonly = ((*attrs & XAT0_READONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) + xoap->xoa_immutable = ((*attrs & XAT0_IMMUTABLE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) + xoap->xoa_nounlink = ((*attrs & XAT0_NOUNLINK) != 0); + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) + xoap->xoa_appendonly = ((*attrs & XAT0_APPENDONLY) != 0); + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) + xoap->xoa_nodump = ((*attrs & XAT0_NODUMP) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OPAQUE)) + xoap->xoa_opaque = ((*attrs & XAT0_OPAQUE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) + xoap->xoa_av_modified = ((*attrs & XAT0_AV_MODIFIED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) + xoap->xoa_av_quarantined = + ((*attrs & XAT0_AV_QUARANTINED) != 0); + if (XVA_ISSET_REQ(xvap, XAT_CREATETIME)) + ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { + ASSERT(!XVA_ISSET_REQ(xvap, XAT_PROJID)); + + bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + } else if (XVA_ISSET_REQ(xvap, XAT_PROJID)) { + /* + * XAT_PROJID and XAT_AV_SCANSTAMP will never be valid + * at the same time, so we can share the same space. + */ + bcopy(scanstamp, &xoap->xoa_projid, sizeof (uint64_t)); + } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_OFFLINE)) + xoap->xoa_offline = ((*attrs & XAT0_OFFLINE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_SPARSE)) + xoap->xoa_sparse = ((*attrs & XAT0_SPARSE) != 0); + if (XVA_ISSET_REQ(xvap, XAT_PROJINHERIT)) + xoap->xoa_projinherit = ((*attrs & XAT0_PROJINHERIT) != 0); +} + +static int +zfs_replay_domain_cnt(uint64_t uid, uint64_t gid) +{ + uint64_t uid_idx; + uint64_t gid_idx; + int domcnt = 0; + + uid_idx = FUID_INDEX(uid); + gid_idx = FUID_INDEX(gid); + if (uid_idx) + domcnt++; + if (gid_idx > 0 && gid_idx != uid_idx) + domcnt++; + + return (domcnt); +} + +static void * +zfs_replay_fuid_domain_common(zfs_fuid_info_t *fuid_infop, void *start, + int domcnt) +{ + int i; + + for (i = 0; i != domcnt; i++) { + fuid_infop->z_domain_table[i] = start; + start = (caddr_t)start + strlen(start) + 1; + } + + return (start); +} + +/* + * Set the uid/gid in the fuid_info structure. 
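+ *
+ * As a sketch of the encoding involved (illustrative; zfs_fuid.h is the
+ * authoritative source for these macros), a FUID packs a domain-table
+ * index and a RID into a single 64-bit id:
+ *
+ *	idx = FUID_INDEX(fuid);		-- domain-table index (upper bits)
+ *	rid = FUID_RID(fuid);		-- rid within that domain (lower bits)
+ *
+ * IS_EPHEMERAL() is true when the index is non-zero, i.e. the id is not
+ * a plain POSIX uid/gid and must be resolved through the domain table.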
+ */ +static void +zfs_replay_fuid_ugid(zfs_fuid_info_t *fuid_infop, uint64_t uid, uint64_t gid) +{ + /* + * If owner or group are log specific FUIDs then slurp up + * domain information and build zfs_fuid_info_t + */ + if (IS_EPHEMERAL(uid)) + fuid_infop->z_fuid_owner = uid; + + if (IS_EPHEMERAL(gid)) + fuid_infop->z_fuid_group = gid; +} + +/* + * Load fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuid_domain(void *buf, void **end, uint64_t uid, uint64_t gid) +{ + int domcnt; + + zfs_fuid_info_t *fuid_infop; + + fuid_infop = zfs_fuid_info_alloc(); + + domcnt = zfs_replay_domain_cnt(uid, gid); + + if (domcnt == 0) + return (fuid_infop); + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + fuid_infop->z_domain_cnt = domcnt; + *end = zfs_replay_fuid_domain_common(fuid_infop, buf, domcnt); + return (fuid_infop); +} + +/* + * Load zfs_fuid_t's and fuid domains into fuid_info_t + */ +static zfs_fuid_info_t * +zfs_replay_fuids(void *start, void **end, int idcnt, int domcnt, uint64_t uid, + uint64_t gid) +{ + uint64_t *log_fuid = (uint64_t *)start; + zfs_fuid_info_t *fuid_infop; + int i; + + fuid_infop = zfs_fuid_info_alloc(); + fuid_infop->z_domain_cnt = domcnt; + + fuid_infop->z_domain_table = + kmem_zalloc(domcnt * sizeof (char *), KM_SLEEP); + + for (i = 0; i != idcnt; i++) { + zfs_fuid_t *zfuid; + + zfuid = kmem_alloc(sizeof (zfs_fuid_t), KM_SLEEP); + zfuid->z_logfuid = *log_fuid; + zfuid->z_id = -1; + zfuid->z_domidx = 0; + list_insert_tail(&fuid_infop->z_fuids, zfuid); + log_fuid++; + } + + zfs_replay_fuid_ugid(fuid_infop, uid, gid); + + *end = zfs_replay_fuid_domain_common(fuid_infop, log_fuid, domcnt); + return (fuid_infop); +} + +static void +zfs_replay_swap_attrs(lr_attr_t *lrattr) +{ + /* swap the lr_attr structure */ + byteswap_uint32_array(lrattr, sizeof (*lrattr)); + /* swap the bitmap */ + byteswap_uint32_array(lrattr + 1, (lrattr->lr_attr_masksize - 1) * + sizeof (uint32_t)); + /* swap the attributes, create time + 64 bit word for attributes */ + byteswap_uint64_array((caddr_t)(lrattr + 1) + (sizeof (uint32_t) * + (lrattr->lr_attr_masksize - 1)), 3 * sizeof (uint64_t)); +} + +/* + * Replay file create with optional ACL, xvattr information as well + * as optional FUID information.
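+ *
+ * As a rough sketch (reconstructed from the offset arithmetic below;
+ * zil.h is authoritative for the record format), the variable-length
+ * tail of an lr_acl_create_t record is laid out as:
+ *
+ *	lr_acl_create_t header
+ *	lr_attr_t xvattr data		-- TX_*_ACL_ATTR only, xvatlen bytes
+ *	ACEs				-- lr_acl_bytes, ZIL_ACE_LENGTH aligned
+ *	FUIDs				-- lr_fuidcnt * sizeof (uint64_t)
+ *	domain strings, then name	-- NUL-terminated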
+ */ +static int +zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_acl_create_t *lracl = arg2; + char *name = NULL; /* location determined later */ + lr_create_t *lr = (lr_create_t *)lracl; + znode_t *dzp; + znode_t *zp = NULL; /* not assigned on every error path */ + xvattr_t xva; + int vflg = 0; + vsecattr_t vsec = { 0 }; + lr_attr_t *lrattr; + void *aclstart; + void *fuidstart; + size_t xvatlen = 0; + uint64_t txtype; + uint64_t objid; + uint64_t dnodesize; + int error; + + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); + if (byteswap) { + byteswap_uint64_array(lracl, sizeof (*lracl)); + if (txtype == TX_CREATE_ACL_ATTR || + txtype == TX_MKDIR_ACL_ATTR) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + zfs_replay_swap_attrs(lrattr); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + aclstart = (caddr_t)(lracl + 1) + xvatlen; + zfs_ace_byteswap(aclstart, lracl->lr_acl_bytes, B_FALSE); + /* swap fuids */ + if (lracl->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes), + lracl->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + objid = LR_FOID_GET_OBJ(lr->lr_foid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time, generation number, and dnode size. The generic + * zfs_create() has no concept of these attributes, so we smuggle + * the values inside the vattr's otherwise unused va_ctime, + * va_nblocks, and va_fsid fields.
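+ *
+ * In outline (mirroring the assignments that follow):
+ *
+ *	lr_crtime -> va_ctime		-- creation time
+ *	lr_gen    -> va_nblocks		-- generation number
+ *	dnodesize -> va_fsid		-- dnode size in bytes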
+ */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) + goto bail; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + switch (txtype) { + case TX_CREATE_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_CREATE_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + xva.xva_vattr.va_mask |= ATTR_XVATTR; + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, &vsec); + break; + case TX_MKDIR_ACL: + aclstart = (caddr_t)(lracl + 1); + fuidstart = (caddr_t)aclstart + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + /*FALLTHROUGH*/ + case TX_MKDIR_ACL_ATTR: + if (name == NULL) { + lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr(lrattr, &xva); + } + vsec.vsa_mask = VSA_ACE | VSA_ACE_ACLFLAGS; + vsec.vsa_aclentp = (caddr_t)(lracl + 1) + xvatlen; + vsec.vsa_aclcnt = lracl->lr_aclcnt; + vsec.vsa_aclentsz = lracl->lr_acl_bytes; + vsec.vsa_aclflags = lracl->lr_acl_flags; + if (zfsvfs->z_fuid_replay == NULL) { + fuidstart = (caddr_t)(lracl + 1) + xvatlen + + ZIL_ACE_LENGTH(lracl->lr_acl_bytes); + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, + (void *)&name, lracl->lr_fuidcnt, lracl->lr_domcnt, + lr->lr_uid, lr->lr_gid); + } + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, &vsec); + break; + default: + error = SET_ERROR(ENOTSUP); + } + +bail: + if (error == 0 && zp != NULL) { +#ifdef __FreeBSD__ + VOP_UNLOCK1(ZTOV(zp)); +#endif + zrele(zp); + } + zrele(dzp); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + + return (error); +} + +static int +zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_create_t *lr = arg2; + char *name = NULL; /* location determined later */ + char *link; /* symlink content follows name */ + znode_t *dzp; + znode_t *zp = NULL; + xvattr_t xva; + int vflg = 0; + size_t lrsize = sizeof (lr_create_t); + lr_attr_t *lrattr; + void *start; + size_t xvatlen; + uint64_t txtype; + uint64_t objid; + uint64_t dnodesize; + int error; + + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + objid = 
LR_FOID_GET_OBJ(lr->lr_foid); + dnodesize = LR_FOID_GET_SLOTS(lr->lr_foid) << DNODE_SHIFT; + + xva_init(&xva); + zfs_init_vattr(&xva.xva_vattr, ATTR_MODE | ATTR_UID | ATTR_GID, + lr->lr_mode, lr->lr_uid, lr->lr_gid, lr->lr_rdev, objid); + + /* + * All forms of zfs create (create, mkdir, mkxattrdir, symlink) + * eventually end up in zfs_mknode(), which assigns the object's + * creation time, generation number, and dnode slot count. The + * generic zfs_create() has no concept of these attributes, so + * we smuggle the values inside the vattr's otherwise unused + * va_ctime, va_nblocks, and va_fsid fields. + */ + ZFS_TIME_DECODE(&xva.xva_vattr.va_ctime, lr->lr_crtime); + xva.xva_vattr.va_nblocks = lr->lr_gen; + xva.xva_vattr.va_fsid = dnodesize; + + error = dnode_try_claim(zfsvfs->z_os, objid, dnodesize >> DNODE_SHIFT); + if (error) + goto out; + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + /* + * Symlinks don't have fuid info, and CIFS never creates + * symlinks. + * + * The _ATTR versions will grab the fuid info in their subcases. + */ + if ((int)lr->lr_common.lrc_txtype != TX_SYMLINK && + (int)lr->lr_common.lrc_txtype != TX_MKDIR_ATTR && + (int)lr->lr_common.lrc_txtype != TX_CREATE_ATTR) { + start = (lr + 1); + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + } + + switch (txtype) { + case TX_CREATE_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ + case TX_CREATE: + if (name == NULL) + name = (char *)start; + + error = zfs_create(dzp, name, &xva.xva_vattr, + 0, 0, &zp, kcred, vflg, NULL); + break; + case TX_MKDIR_ATTR: + lrattr = (lr_attr_t *)(caddr_t)(lr + 1); + xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + zfs_replay_xvattr((lr_attr_t *)((caddr_t)lr + lrsize), &xva); + start = (caddr_t)(lr + 1) + xvatlen; + zfsvfs->z_fuid_replay = + zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + name = (char *)start; + + /*FALLTHROUGH*/ + case TX_MKDIR: + if (name == NULL) + name = (char *)(lr + 1); + + error = zfs_mkdir(dzp, name, &xva.xva_vattr, + &zp, kcred, vflg, NULL); + break; + case TX_MKXATTR: + error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &zp, kcred); + break; + case TX_SYMLINK: + name = (char *)(lr + 1); + link = name + strlen(name) + 1; + error = zfs_symlink(dzp, name, &xva.xva_vattr, + link, &zp, kcred, vflg); + break; + default: + error = SET_ERROR(ENOTSUP); + } + +out: + if (error == 0 && zp != NULL) { +#ifdef __FreeBSD__ + VOP_UNLOCK1(ZTOV(zp)); +#endif + zrele(zp); + } + zrele(dzp); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + return (error); +} + +static int +zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_remove_t *lr = arg2; + char *name = (char *)(lr + 1); /* name follows lr_remove_t */ + znode_t *dzp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + switch ((int)lr->lr_common.lrc_txtype) { + case TX_REMOVE: + error = zfs_remove(dzp, name, kcred, vflg); + break; + case TX_RMDIR: + error = zfs_rmdir(dzp, 
name, NULL, kcred, vflg); + break; + default: + error = SET_ERROR(ENOTSUP); + } + + zrele(dzp); + + return (error); +} + +static int +zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_link_t *lr = arg2; + char *name = (char *)(lr + 1); /* name follows lr_link_t */ + znode_t *dzp, *zp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_doid, &dzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_link_obj, &zp)) != 0) { + zrele(dzp); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + error = zfs_link(dzp, zp, name, kcred, vflg); + zrele(zp); + zrele(dzp); + + return (error); +} + +static int +zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_rename_t *lr = arg2; + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; + znode_t *sdzp, *tdzp; + int error; + int vflg = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_sdoid, &sdzp)) != 0) + return (error); + + if ((error = zfs_zget(zfsvfs, lr->lr_tdoid, &tdzp)) != 0) { + zrele(sdzp); + return (error); + } + + if (lr->lr_common.lrc_txtype & TX_CI) + vflg |= FIGNORECASE; + + error = zfs_rename(sdzp, sname, tdzp, tname, kcred, vflg); + + zrele(tdzp); + zrele(sdzp); + return (error); +} + +static int +zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_write_t *lr = arg2; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + znode_t *zp; + int error; + uint64_t eod, offset, length; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { + /* + * As we can log writes out of order, it's possible the + * file has been removed. In this case just drop the write + * and return success. + */ + if (error == ENOENT) + error = 0; + return (error); + } + + offset = lr->lr_offset; + length = lr->lr_length; + eod = offset + length; /* end of data for this write */ + + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. This needs to be done within the single dmu + * transaction created within vn_rdwr -> zfs_write. So a possible + * new end of file is passed through in zfsvfs->z_replay_eof. + */ + + zfsvfs->z_replay_eof = 0; /* 0 means don't change end of file */ + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + if (zp->z_size < eod) + zfsvfs->z_replay_eof = eod; + } + error = zfs_write_simple(zp, data, length, offset, NULL); + zrele(zp); + zfsvfs->z_replay_eof = 0; /* safety */ + + return (error); +} + +/* + * TX_WRITE2 records are only generated when dmu_sync() returns EALREADY, + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown.
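+ *
+ * An illustrative sequence (block size hypothetical): a dmu_sync() of a
+ * full 128K block is logged as TX_WRITE carrying the block pointer, so
+ * replay rewrites the whole block and caps the size via z_replay_eof; a
+ * later synchronous write landing in an already-syncing block is logged
+ * as TX_WRITE2 with only (offset, length), and replay merely extends
+ * z_size.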
+ */ +static int +zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_write_t *lr = arg2; + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + +top: + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_size) { + dmu_tx_t *tx = dmu_tx_create(zfsvfs->z_os); + + zp->z_size = end; + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_FALSE); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + zrele(zp); + if (error == ERESTART) { + dmu_tx_wait(tx); + dmu_tx_abort(tx); + goto top; + } + dmu_tx_abort(tx); + return (error); + } + (void) sa_update(zp->z_sa_hdl, SA_ZPL_SIZE(zfsvfs), + (void *)&zp->z_size, sizeof (uint64_t), tx); + + /* Ensure the replayed seq is updated */ + (void) zil_replaying(zfsvfs->z_log, tx); + + dmu_tx_commit(tx); + } + + zrele(zp); + + return (error); +} + +static int +zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_truncate_t *lr = arg2; + znode_t *zp; + flock64_t fl; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&fl, sizeof (fl)); + fl.l_type = F_WRLCK; + fl.l_whence = SEEK_SET; + fl.l_start = lr->lr_offset; + fl.l_len = lr->lr_length; + + error = zfs_space(zp, F_FREESP, &fl, O_RDWR | O_LARGEFILE, + lr->lr_offset, kcred); + + zrele(zp); + + return (error); +} + +static int +zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_setattr_t *lr = arg2; + znode_t *zp; + xvattr_t xva; + vattr_t *vap = &xva.xva_vattr; + int error; + void *start; + + xva_init(&xva); + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((lr->lr_mask & ATTR_XVATTR) && + zfsvfs->z_version >= ZPL_VERSION_INITIAL) + zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, + lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); + + vap->va_size = lr->lr_size; + ZFS_TIME_DECODE(&vap->va_atime, lr->lr_atime); + ZFS_TIME_DECODE(&vap->va_mtime, lr->lr_mtime); + gethrestime(&vap->va_ctime); + vap->va_mask |= ATTR_CTIME; + + /* + * Fill in xvattr_t portions if necessary. 
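+ *
+ * A sketch of the TX_SETATTR record tail this unpacks (reconstructed
+ * from the pointer arithmetic below; zil.h is authoritative):
+ *
+ *	lr_setattr_t header
+ *	lr_attr_t xvattr data	-- only if ATTR_XVATTR is set in lr_mask
+ *	FUID domain strings	-- consumed by zfs_replay_fuid_domain()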
+ */ + + start = (lr_setattr_t *)(lr + 1); + if (vap->va_mask & ATTR_XVATTR) { + zfs_replay_xvattr((lr_attr_t *)start, &xva); + start = (caddr_t)start + + ZIL_XVAT_SIZE(((lr_attr_t *)start)->lr_attr_masksize); + } else + xva.xva_vattr.va_mask &= ~ATTR_XVATTR; + + zfsvfs->z_fuid_replay = zfs_replay_fuid_domain(start, &start, + lr->lr_uid, lr->lr_gid); + + error = zfs_setattr(zp, vap, 0, kcred); + + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + zfsvfs->z_fuid_replay = NULL; + zrele(zp); + + return (error); +} + +static int +zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_acl_v0_t *lr = arg2; + ace_t *ace = (ace_t *)(lr + 1); /* ace array follows lr_acl_t */ + vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_oldace_byteswap(ace, lr->lr_aclcnt); + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentsz = sizeof (ace_t) * vsa.vsa_aclcnt; + vsa.vsa_aclflags = 0; + vsa.vsa_aclentp = ace; + + error = zfs_setsecattr(zp, &vsa, 0, kcred); + + zrele(zp); + + return (error); +} + +/* + * Replaying ACLs is complicated by FUID support. + * The log record may contain some optional data + * to be used for replaying FUID's. These pieces + * are the actual FUIDs that were created initially. + * The FUID table index may no longer be valid and + * during zfs_create() a new index may be assigned. + * Because of this the log will contain the original + * domain+rid in order to create a new FUID. + * + * The individual ACEs may contain an ephemeral uid/gid which is no + * longer valid and will need to be replaced with an actual FUID. 
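+ *
+ * The replay-side flow, in outline:
+ *
+ *	zfsvfs->z_fuid_replay = zfs_replay_fuids(...);	-- stash log FUIDs
+ *	zfs_setsecattr(zp, &vsa, 0, kcred);		-- consumed here
+ *	zfs_fuid_info_free(zfsvfs->z_fuid_replay);	-- then released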
+ * + */ +static int +zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) +{ + zfsvfs_t *zfsvfs = arg1; + lr_acl_t *lr = arg2; + ace_t *ace = (ace_t *)(lr + 1); + vsecattr_t vsa; + znode_t *zp; + int error; + + if (byteswap) { + byteswap_uint64_array(lr, sizeof (*lr)); + zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); + if (lr->lr_fuidcnt) { + byteswap_uint64_array((caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes), + lr->lr_fuidcnt * sizeof (uint64_t)); + } + } + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + bzero(&vsa, sizeof (vsa)); + vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; + vsa.vsa_aclcnt = lr->lr_aclcnt; + vsa.vsa_aclentp = ace; + vsa.vsa_aclentsz = lr->lr_acl_bytes; + vsa.vsa_aclflags = lr->lr_acl_flags; + + if (lr->lr_fuidcnt) { + void *fuidstart = (caddr_t)ace + + ZIL_ACE_LENGTH(lr->lr_acl_bytes); + + zfsvfs->z_fuid_replay = + zfs_replay_fuids(fuidstart, &fuidstart, + lr->lr_fuidcnt, lr->lr_domcnt, 0, 0); + } + + error = zfs_setsecattr(zp, &vsa, 0, kcred); + + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); + + zfsvfs->z_fuid_replay = NULL; + zrele(zp); + + return (error); +} + +/* + * Callback vectors for replaying records + */ +zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { + zfs_replay_error, /* no such type */ + zfs_replay_create, /* TX_CREATE */ + zfs_replay_create, /* TX_MKDIR */ + zfs_replay_create, /* TX_MKXATTR */ + zfs_replay_create, /* TX_SYMLINK */ + zfs_replay_remove, /* TX_REMOVE */ + zfs_replay_remove, /* TX_RMDIR */ + zfs_replay_link, /* TX_LINK */ + zfs_replay_rename, /* TX_RENAME */ + zfs_replay_write, /* TX_WRITE */ + zfs_replay_truncate, /* TX_TRUNCATE */ + zfs_replay_setattr, /* TX_SETATTR */ + zfs_replay_acl_v0, /* TX_ACL_V0 */ + zfs_replay_acl, /* TX_ACL */ + zfs_replay_create_acl, /* TX_CREATE_ACL */ + zfs_replay_create, /* TX_CREATE_ATTR */ + zfs_replay_create_acl, /* TX_CREATE_ACL_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL */ + zfs_replay_create, /* TX_MKDIR_ATTR */ + zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ +}; diff --git a/module/zfs/zfs_rlock.c b/module/zfs/zfs_rlock.c new file mode 100644 index 000000000000..06a5e031a7df --- /dev/null +++ b/module/zfs/zfs_rlock.c @@ -0,0 +1,691 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2012, 2018 by Delphix. All rights reserved. + */ + +/* + * This file contains the code to implement file range locking in + * ZFS, although there isn't much specific to ZFS (all that comes to mind is + * support for growing the blocksize). 
+ * + * Interface + * --------- + * Defined in zfs_rlock.h but essentially: + * lr = rangelock_enter(zp, off, len, lock_type); + * rangelock_reduce(lr, off, len); // optional + * rangelock_exit(lr); + * + * Range locking rules + * -------------------- + * 1. When truncating a file (zfs_create, zfs_setattr, zfs_space) the whole + * file range needs to be locked as RL_WRITER. Only then can the pages be + * freed, etc., and zp_size reset. zp_size must be set within the range lock. + * 2. For writes and punching holes (zfs_write & zfs_space) just the range + * being written or freed needs to be locked as RL_WRITER. + * Multiple writes at the end of the file must coordinate zp_size updates + * to ensure data isn't lost. A compare and swap loop is currently used + * to ensure the file size is at least the offset last written. + * 3. For reads (zfs_read, zfs_get_data & zfs_putapage) just the range being + * read needs to be locked as RL_READER. A check against zp_size can then + * be made for reading beyond end of file. + * + * AVL tree + * -------- + * An AVL tree is used to maintain the state of the existing ranges + * that are locked for exclusive (writer) or shared (reader) use. + * The starting range offset is used for searching and sorting the tree. + * + * Common case + * ----------- + * The (hopefully) usual case is of no overlaps or contention for locks. On + * entry to rangelock_enter(), a locked_range_t is allocated; the tree is + * searched, no overlap is found, and *this* locked_range_t is placed in the + * tree. + * + * Overlaps/Reference counting/Proxy locks + * --------------------------------------- + * The avl code only allows one node at a particular offset. Also it's very + * inefficient to search through all previous entries looking for overlaps + * (because the very first entry in the ordered list might be at offset 0 but + * cover the whole file). + * So this implementation uses reference counts and proxy range locks. + * Firstly, only reader locks use reference counts and proxy locks, + * because writer locks are exclusive. + * When a reader lock overlaps with another then a proxy lock is created + * for that range and replaces the original lock. If the overlap + * is exact then the reference count of the proxy is simply incremented. + * Otherwise, the proxy lock is split into smaller lock ranges and + * new proxy locks created for non-overlapping ranges. + * The reference counts are adjusted accordingly. + * Meanwhile, the original lock is kept around (this is the caller's handle) + * and its offset and length are used when releasing the lock. + * + * Thread coordination + * ------------------- + * In order to make wakeups efficient and to ensure multiple continuous + * readers on a range don't starve a writer for the same range lock, + * two condition variables are allocated in each rl_t. + * If a writer (or reader) can't get a range it initialises the writer + * (or reader) cv; sets a flag saying there's a writer (or reader) waiting; + * and waits on that cv. When a thread unlocks that range it wakes up all + * writers then all readers before destroying the lock. + * + * Append mode writes + * ------------------ + * Append mode writes need to lock a range at the end of a file. + * The offset of the end of the file is determined under the + * range locking mutex, and the lock type converted from RL_APPEND to + * RL_WRITER and the range locked. + * + * Grow block handling + * ------------------- + * ZFS supports multiple block sizes, up to 16MB.
The smallest + * block size is used for the file which is grown as needed. During this + * growth all other writers and readers must be excluded. + * So if the block size needs to be grown then the whole file is + * exclusively locked, then later the caller will reduce the lock + * range to just the range to be written using rangelock_reduce(). + */ + +#include +#include + + +/* + * AVL comparison function used to order range locks + * Locks are ordered on the start offset of the range. + */ +static int +zfs_rangelock_compare(const void *arg1, const void *arg2) +{ + const zfs_locked_range_t *rl1 = (const zfs_locked_range_t *)arg1; + const zfs_locked_range_t *rl2 = (const zfs_locked_range_t *)arg2; + + return (TREE_CMP(rl1->lr_offset, rl2->lr_offset)); +} + +/* + * The callback is invoked when acquiring a RL_WRITER or RL_APPEND lock. + * It must convert RL_APPEND to RL_WRITER (starting at the end of the file), + * and may increase the range that's locked for RL_WRITER. + */ +void +zfs_rangelock_init(zfs_rangelock_t *rl, zfs_rangelock_cb_t *cb, void *arg) +{ + mutex_init(&rl->rl_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&rl->rl_tree, zfs_rangelock_compare, + sizeof (zfs_locked_range_t), offsetof(zfs_locked_range_t, lr_node)); + rl->rl_cb = cb; + rl->rl_arg = arg; +} + +void +zfs_rangelock_fini(zfs_rangelock_t *rl) +{ + mutex_destroy(&rl->rl_lock); + avl_destroy(&rl->rl_tree); +} + +/* + * Check if a write lock can be grabbed. If not, fail immediately or sleep and + * recheck until available, depending on the value of the "nonblock" parameter. + */ +static boolean_t +zfs_rangelock_enter_writer(zfs_rangelock_t *rl, zfs_locked_range_t *new, + boolean_t nonblock) +{ + avl_tree_t *tree = &rl->rl_tree; + zfs_locked_range_t *lr; + avl_index_t where; + uint64_t orig_off = new->lr_offset; + uint64_t orig_len = new->lr_length; + zfs_rangelock_type_t orig_type = new->lr_type; + + for (;;) { + /* + * Call callback which can modify new->r_off,len,type. + * Note, the callback is used by the ZPL to handle appending + * and changing blocksizes. It isn't needed for zvols. + */ + if (rl->rl_cb != NULL) { + rl->rl_cb(new, rl->rl_arg); + } + + /* + * If the type was APPEND, the callback must convert it to + * WRITER. + */ + ASSERT3U(new->lr_type, ==, RL_WRITER); + + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(tree) == 0) { + avl_add(tree, new); + return (B_TRUE); + } + + /* + * Look for any locks in the range. + */ + lr = avl_find(tree, new, &where); + if (lr != NULL) + goto wait; /* already locked at same offset */ + + lr = avl_nearest(tree, where, AVL_AFTER); + if (lr != NULL && + lr->lr_offset < new->lr_offset + new->lr_length) + goto wait; + + lr = avl_nearest(tree, where, AVL_BEFORE); + if (lr != NULL && + lr->lr_offset + lr->lr_length > new->lr_offset) + goto wait; + + avl_insert(tree, new, where); + return (B_TRUE); +wait: + if (nonblock) + return (B_FALSE); + if (!lr->lr_write_wanted) { + cv_init(&lr->lr_write_cv, NULL, CV_DEFAULT, NULL); + lr->lr_write_wanted = B_TRUE; + } + cv_wait(&lr->lr_write_cv, &rl->rl_lock); + + /* reset to original */ + new->lr_offset = orig_off; + new->lr_length = orig_len; + new->lr_type = orig_type; + } +} + +/* + * If this is an original (non-proxy) lock then replace it by + * a proxy and return the proxy. 
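+ *
+ * Worked example (illustrative): if reader A holds [0, 100) and reader B
+ * then requests [50, 150), A's lock is proxified and split so the tree
+ * holds proxies [0, 50) count=1, [50, 100) count=2, and [100, 150)
+ * count=1, while A's and B's own handles stay outside the tree and are
+ * only used at unlock time.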
+ */ +static zfs_locked_range_t * +zfs_rangelock_proxify(avl_tree_t *tree, zfs_locked_range_t *lr) +{ + zfs_locked_range_t *proxy; + + if (lr->lr_proxy) + return (lr); /* already a proxy */ + + ASSERT3U(lr->lr_count, ==, 1); + ASSERT(lr->lr_write_wanted == B_FALSE); + ASSERT(lr->lr_read_wanted == B_FALSE); + avl_remove(tree, lr); + lr->lr_count = 0; + + /* create a proxy range lock */ + proxy = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); + proxy->lr_offset = lr->lr_offset; + proxy->lr_length = lr->lr_length; + proxy->lr_count = 1; + proxy->lr_type = RL_READER; + proxy->lr_proxy = B_TRUE; + proxy->lr_write_wanted = B_FALSE; + proxy->lr_read_wanted = B_FALSE; + avl_add(tree, proxy); + + return (proxy); +} + +/* + * Split the range lock at the supplied offset + * returning the *front* proxy. + */ +static zfs_locked_range_t * +zfs_rangelock_split(avl_tree_t *tree, zfs_locked_range_t *lr, uint64_t off) +{ + zfs_locked_range_t *rear; + + ASSERT3U(lr->lr_length, >, 1); + ASSERT3U(off, >, lr->lr_offset); + ASSERT3U(off, <, lr->lr_offset + lr->lr_length); + ASSERT(lr->lr_write_wanted == B_FALSE); + ASSERT(lr->lr_read_wanted == B_FALSE); + + /* create the rear proxy range lock */ + rear = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); + rear->lr_offset = off; + rear->lr_length = lr->lr_offset + lr->lr_length - off; + rear->lr_count = lr->lr_count; + rear->lr_type = RL_READER; + rear->lr_proxy = B_TRUE; + rear->lr_write_wanted = B_FALSE; + rear->lr_read_wanted = B_FALSE; + + zfs_locked_range_t *front = zfs_rangelock_proxify(tree, lr); + front->lr_length = off - lr->lr_offset; + + avl_insert_here(tree, rear, front, AVL_AFTER); + return (front); +} + +/* + * Create and add a new proxy range lock for the supplied range. + */ +static void +zfs_rangelock_new_proxy(avl_tree_t *tree, uint64_t off, uint64_t len) +{ + zfs_locked_range_t *lr; + + ASSERT(len != 0); + lr = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); + lr->lr_offset = off; + lr->lr_length = len; + lr->lr_count = 1; + lr->lr_type = RL_READER; + lr->lr_proxy = B_TRUE; + lr->lr_write_wanted = B_FALSE; + lr->lr_read_wanted = B_FALSE; + avl_add(tree, lr); +} + +static void +zfs_rangelock_add_reader(avl_tree_t *tree, zfs_locked_range_t *new, + zfs_locked_range_t *prev, avl_index_t where) +{ + zfs_locked_range_t *next; + uint64_t off = new->lr_offset; + uint64_t len = new->lr_length; + + /* + * prev arrives either: + * - pointing to an entry at the same offset + * - pointing to the entry with the closest previous offset whose + * range may overlap with the new range + * - null, if there were no ranges starting before the new one + */ + if (prev != NULL) { + if (prev->lr_offset + prev->lr_length <= off) { + prev = NULL; + } else if (prev->lr_offset != off) { + /* + * convert to proxy if needed then + * split this entry and bump ref count + */ + prev = zfs_rangelock_split(tree, prev, off); + prev = AVL_NEXT(tree, prev); /* move to rear range */ + } + } + ASSERT((prev == NULL) || (prev->lr_offset == off)); + + if (prev != NULL) + next = prev; + else + next = avl_nearest(tree, where, AVL_AFTER); + + if (next == NULL || off + len <= next->lr_offset) { + /* no overlaps, use the original new rl_t in the tree */ + avl_insert(tree, new, where); + return; + } + + if (off < next->lr_offset) { + /* Add a proxy for initial range before the overlap */ + zfs_rangelock_new_proxy(tree, off, next->lr_offset - off); + } + + new->lr_count = 0; /* will use proxies in tree */ + /* + * We now search forward through the ranges, until we go past the end + * of 
the new range. For each entry we make it a proxy if it + * isn't already, then bump its reference count. If there are any + * gaps between the ranges, we create a new proxy range. + */ + for (prev = NULL; next; prev = next, next = AVL_NEXT(tree, next)) { + if (off + len <= next->lr_offset) + break; + if (prev != NULL && prev->lr_offset + prev->lr_length < + next->lr_offset) { + /* there's a gap */ + ASSERT3U(next->lr_offset, >, + prev->lr_offset + prev->lr_length); + zfs_rangelock_new_proxy(tree, + prev->lr_offset + prev->lr_length, + next->lr_offset - + (prev->lr_offset + prev->lr_length)); + } + if (off + len == next->lr_offset + next->lr_length) { + /* exact overlap with end */ + next = zfs_rangelock_proxify(tree, next); + next->lr_count++; + return; + } + if (off + len < next->lr_offset + next->lr_length) { + /* new range ends in the middle of this block */ + next = zfs_rangelock_split(tree, next, off + len); + next->lr_count++; + return; + } + ASSERT3U(off + len, >, next->lr_offset + next->lr_length); + next = zfs_rangelock_proxify(tree, next); + next->lr_count++; + } + + /* Add the remaining end range. */ + zfs_rangelock_new_proxy(tree, prev->lr_offset + prev->lr_length, + (off + len) - (prev->lr_offset + prev->lr_length)); +} + +/* + * Check if a reader lock can be grabbed. If not, fail immediately or sleep and + * recheck until available, depending on the value of the "nonblock" parameter. + */ +static boolean_t +zfs_rangelock_enter_reader(zfs_rangelock_t *rl, zfs_locked_range_t *new, + boolean_t nonblock) +{ + avl_tree_t *tree = &rl->rl_tree; + zfs_locked_range_t *prev, *next; + avl_index_t where; + uint64_t off = new->lr_offset; + uint64_t len = new->lr_length; + + /* + * Look for any writer locks in the range. + */ +retry: + prev = avl_find(tree, new, &where); + if (prev == NULL) + prev = avl_nearest(tree, where, AVL_BEFORE); + + /* + * Check the previous range for a writer lock overlap. + */ + if (prev && (off < prev->lr_offset + prev->lr_length)) { + if ((prev->lr_type == RL_WRITER) || (prev->lr_write_wanted)) { + if (nonblock) + return (B_FALSE); + if (!prev->lr_read_wanted) { + cv_init(&prev->lr_read_cv, + NULL, CV_DEFAULT, NULL); + prev->lr_read_wanted = B_TRUE; + } + cv_wait(&prev->lr_read_cv, &rl->rl_lock); + goto retry; + } + if (off + len < prev->lr_offset + prev->lr_length) + goto got_lock; + } + + /* + * Search through the following ranges to see if there's + * any write lock overlap. + */ + if (prev != NULL) + next = AVL_NEXT(tree, prev); + else + next = avl_nearest(tree, where, AVL_AFTER); + for (; next != NULL; next = AVL_NEXT(tree, next)) { + if (off + len <= next->lr_offset) + goto got_lock; + if ((next->lr_type == RL_WRITER) || (next->lr_write_wanted)) { + if (nonblock) + return (B_FALSE); + if (!next->lr_read_wanted) { + cv_init(&next->lr_read_cv, + NULL, CV_DEFAULT, NULL); + next->lr_read_wanted = B_TRUE; + } + cv_wait(&next->lr_read_cv, &rl->rl_lock); + goto retry; + } + if (off + len <= next->lr_offset + next->lr_length) + goto got_lock; + } + +got_lock: + /* + * Add the read lock, which may involve splitting existing + * locks and bumping ref counts (r_count). + */ + zfs_rangelock_add_reader(tree, new, prev, where); + return (B_TRUE); +} + +/* + * Lock a range (offset, length) as either shared (RL_READER) or exclusive + * (RL_WRITER or RL_APPEND). If RL_APPEND is specified, rl_cb() will convert + * it to a RL_WRITER lock (with the offset at the end of the file).
Returns + * the range lock structure for later unlocking (or reduce range if the + * entire file is locked as RL_WRITER), or NULL if nonblock is true and the + * lock could not be acquired immediately. + */ +static zfs_locked_range_t * +zfs_rangelock_enter_impl(zfs_rangelock_t *rl, uint64_t off, uint64_t len, + zfs_rangelock_type_t type, boolean_t nonblock) +{ + zfs_locked_range_t *new; + + ASSERT(type == RL_READER || type == RL_WRITER || type == RL_APPEND); + + new = kmem_alloc(sizeof (zfs_locked_range_t), KM_SLEEP); + new->lr_rangelock = rl; + new->lr_offset = off; + if (len + off < off) /* overflow */ + len = UINT64_MAX - off; + new->lr_length = len; + new->lr_count = 1; /* assume it's going to be in the tree */ + new->lr_type = type; + new->lr_proxy = B_FALSE; + new->lr_write_wanted = B_FALSE; + new->lr_read_wanted = B_FALSE; + + mutex_enter(&rl->rl_lock); + if (type == RL_READER) { + /* + * First check for the usual case of no locks + */ + if (avl_numnodes(&rl->rl_tree) == 0) { + avl_add(&rl->rl_tree, new); + } else if (!zfs_rangelock_enter_reader(rl, new, nonblock)) { + kmem_free(new, sizeof (*new)); + new = NULL; + } + } else if (!zfs_rangelock_enter_writer(rl, new, nonblock)) { + kmem_free(new, sizeof (*new)); + new = NULL; + } + mutex_exit(&rl->rl_lock); + return (new); +} + +zfs_locked_range_t * +zfs_rangelock_enter(zfs_rangelock_t *rl, uint64_t off, uint64_t len, + zfs_rangelock_type_t type) +{ + return (zfs_rangelock_enter_impl(rl, off, len, type, B_FALSE)); +} + +zfs_locked_range_t * +zfs_rangelock_tryenter(zfs_rangelock_t *rl, uint64_t off, uint64_t len, + zfs_rangelock_type_t type) +{ + return (zfs_rangelock_enter_impl(rl, off, len, type, B_TRUE)); +} + +/* + * Safely free the zfs_locked_range_t. + */ +static void +zfs_rangelock_free(zfs_locked_range_t *lr) +{ + if (lr->lr_write_wanted) + cv_destroy(&lr->lr_write_cv); + + if (lr->lr_read_wanted) + cv_destroy(&lr->lr_read_cv); + + kmem_free(lr, sizeof (zfs_locked_range_t)); +} + +/* + * Unlock a reader lock + */ +static void +zfs_rangelock_exit_reader(zfs_rangelock_t *rl, zfs_locked_range_t *remove, + list_t *free_list) +{ + avl_tree_t *tree = &rl->rl_tree; + uint64_t len; + + /* + * The common case is when the remove entry is in the tree + * (cnt == 1) meaning there's been no other reader locks overlapping + * with this one. Otherwise the remove entry will have been + * removed from the tree and replaced by proxies (one or + * more ranges mapping to the entire range). + */ + if (remove->lr_count == 1) { + avl_remove(tree, remove); + if (remove->lr_write_wanted) + cv_broadcast(&remove->lr_write_cv); + if (remove->lr_read_wanted) + cv_broadcast(&remove->lr_read_cv); + list_insert_tail(free_list, remove); + } else { + ASSERT0(remove->lr_count); + ASSERT0(remove->lr_write_wanted); + ASSERT0(remove->lr_read_wanted); + /* + * Find start proxy representing this reader lock, + * then decrement ref count on all proxies + * that make up this range, freeing them as needed. 
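+ *
+ * Continuing the example from zfs_rangelock_proxify(): when B releases
+ * [50, 150), the walk starts at the proxy at offset 50 and decrements
+ * counts; [50, 100) drops to count=1 and stays, while [100, 150) drops
+ * to 0 and is removed and freed.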
+ */ + zfs_locked_range_t *lr = avl_find(tree, remove, NULL); + ASSERT3P(lr, !=, NULL); + ASSERT3U(lr->lr_count, !=, 0); + ASSERT3U(lr->lr_type, ==, RL_READER); + zfs_locked_range_t *next = NULL; + for (len = remove->lr_length; len != 0; lr = next) { + len -= lr->lr_length; + if (len != 0) { + next = AVL_NEXT(tree, lr); + ASSERT3P(next, !=, NULL); + ASSERT3U(lr->lr_offset + lr->lr_length, ==, + next->lr_offset); + ASSERT3U(next->lr_count, !=, 0); + ASSERT3U(next->lr_type, ==, RL_READER); + } + lr->lr_count--; + if (lr->lr_count == 0) { + avl_remove(tree, lr); + if (lr->lr_write_wanted) + cv_broadcast(&lr->lr_write_cv); + if (lr->lr_read_wanted) + cv_broadcast(&lr->lr_read_cv); + list_insert_tail(free_list, lr); + } + } + kmem_free(remove, sizeof (zfs_locked_range_t)); + } +} + +/* + * Unlock range and destroy range lock structure. + */ +void +zfs_rangelock_exit(zfs_locked_range_t *lr) +{ + zfs_rangelock_t *rl = lr->lr_rangelock; + list_t free_list; + zfs_locked_range_t *free_lr; + + ASSERT(lr->lr_type == RL_WRITER || lr->lr_type == RL_READER); + ASSERT(lr->lr_count == 1 || lr->lr_count == 0); + ASSERT(!lr->lr_proxy); + + /* + * The free list is used to defer the cv_destroy() and + * subsequent kmem_free until after the mutex is dropped. + */ + list_create(&free_list, sizeof (zfs_locked_range_t), + offsetof(zfs_locked_range_t, lr_node)); + + mutex_enter(&rl->rl_lock); + if (lr->lr_type == RL_WRITER) { + /* writer locks can't be shared or split */ + avl_remove(&rl->rl_tree, lr); + if (lr->lr_write_wanted) + cv_broadcast(&lr->lr_write_cv); + if (lr->lr_read_wanted) + cv_broadcast(&lr->lr_read_cv); + list_insert_tail(&free_list, lr); + } else { + /* + * lock may be shared, let rangelock_exit_reader() + * release the lock and free the zfs_locked_range_t. + */ + zfs_rangelock_exit_reader(rl, lr, &free_list); + } + mutex_exit(&rl->rl_lock); + + while ((free_lr = list_remove_head(&free_list)) != NULL) + zfs_rangelock_free(free_lr); + + list_destroy(&free_list); +} + +/* + * Reduce range locked as RL_WRITER from whole file to specified range. + * Asserts the whole file is exclusively locked and so there's only one + * entry in the tree. + */ +void +zfs_rangelock_reduce(zfs_locked_range_t *lr, uint64_t off, uint64_t len) +{ + zfs_rangelock_t *rl = lr->lr_rangelock; + + /* Ensure there are no other locks */ + ASSERT3U(avl_numnodes(&rl->rl_tree), ==, 1); + ASSERT3U(lr->lr_offset, ==, 0); + ASSERT3U(lr->lr_type, ==, RL_WRITER); + ASSERT(!lr->lr_proxy); + ASSERT3U(lr->lr_length, ==, UINT64_MAX); + ASSERT3U(lr->lr_count, ==, 1); + + mutex_enter(&rl->rl_lock); + lr->lr_offset = off; + lr->lr_length = len; + mutex_exit(&rl->rl_lock); + if (lr->lr_write_wanted) + cv_broadcast(&lr->lr_write_cv); + if (lr->lr_read_wanted) + cv_broadcast(&lr->lr_read_cv); +} + +#if defined(_KERNEL) +EXPORT_SYMBOL(zfs_rangelock_init); +EXPORT_SYMBOL(zfs_rangelock_fini); +EXPORT_SYMBOL(zfs_rangelock_enter); +EXPORT_SYMBOL(zfs_rangelock_tryenter); +EXPORT_SYMBOL(zfs_rangelock_exit); +EXPORT_SYMBOL(zfs_rangelock_reduce); +#endif diff --git a/module/zfs/zfs_sa.c b/module/zfs/zfs_sa.c new file mode 100644 index 000000000000..cbb773ffbdfa --- /dev/null +++ b/module/zfs/zfs_sa.c @@ -0,0 +1,445 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include + +/* + * ZPL attribute registration table. + * Order of attributes doesn't matter + * a unique value will be assigned for each + * attribute that is file system specific + * + * This is just the set of ZPL attributes that this + * version of ZFS deals with natively. The file system + * could have other attributes stored in files, but they will be + * ignored. The SA framework will preserve them, just that + * this version of ZFS won't change or delete them. + */ + +sa_attr_reg_t zfs_attr_table[ZPL_END+1] = { + {"ZPL_ATIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 0}, + {"ZPL_MTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 1}, + {"ZPL_CTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 2}, + {"ZPL_CRTIME", sizeof (uint64_t) * 2, SA_UINT64_ARRAY, 3}, + {"ZPL_GEN", sizeof (uint64_t), SA_UINT64_ARRAY, 4}, + {"ZPL_MODE", sizeof (uint64_t), SA_UINT64_ARRAY, 5}, + {"ZPL_SIZE", sizeof (uint64_t), SA_UINT64_ARRAY, 6}, + {"ZPL_PARENT", sizeof (uint64_t), SA_UINT64_ARRAY, 7}, + {"ZPL_LINKS", sizeof (uint64_t), SA_UINT64_ARRAY, 8}, + {"ZPL_XATTR", sizeof (uint64_t), SA_UINT64_ARRAY, 9}, + {"ZPL_RDEV", sizeof (uint64_t), SA_UINT64_ARRAY, 10}, + {"ZPL_FLAGS", sizeof (uint64_t), SA_UINT64_ARRAY, 11}, + {"ZPL_UID", sizeof (uint64_t), SA_UINT64_ARRAY, 12}, + {"ZPL_GID", sizeof (uint64_t), SA_UINT64_ARRAY, 13}, + {"ZPL_PAD", sizeof (uint64_t) * 4, SA_UINT64_ARRAY, 14}, + {"ZPL_ZNODE_ACL", 88, SA_UINT8_ARRAY, 15}, + {"ZPL_DACL_COUNT", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, + {"ZPL_SYMLINK", 0, SA_UINT8_ARRAY, 0}, + {"ZPL_SCANSTAMP", 32, SA_UINT8_ARRAY, 0}, + {"ZPL_DACL_ACES", 0, SA_ACL, 0}, + {"ZPL_DXATTR", 0, SA_UINT8_ARRAY, 0}, + {"ZPL_PROJID", sizeof (uint64_t), SA_UINT64_ARRAY, 0}, + {NULL, 0, 0, 0} +}; + +#ifdef _KERNEL +int +zfs_sa_readlink(znode_t *zp, uio_t *uio) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + size_t bufsz; + int error; + + bufsz = zp->z_size; + if (bufsz + ZFS_OLD_ZNODE_PHYS_SIZE <= db->db_size) { + error = uiomove((caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, + MIN((size_t)bufsz, uio_resid(uio)), UIO_READ, uio); + } else { + dmu_buf_t *dbp; + if ((error = dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, + 0, FTAG, &dbp, DMU_READ_NO_PREFETCH)) == 0) { + error = uiomove(dbp->db_data, + MIN((size_t)bufsz, uio_resid(uio)), UIO_READ, uio); + dmu_buf_rele(dbp, FTAG); + } + } + return (error); +} + +void +zfs_sa_symlink(znode_t *zp, char *link, int len, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + + if (ZFS_OLD_ZNODE_PHYS_SIZE + len <= dmu_bonus_max()) { + VERIFY0(dmu_set_bonus(db, len + ZFS_OLD_ZNODE_PHYS_SIZE, tx)); + if (len) { + bcopy(link, (caddr_t)db->db_data + + ZFS_OLD_ZNODE_PHYS_SIZE, len); + } + } else { + dmu_buf_t *dbp; + + zfs_grow_blocksize(zp, len, tx); + VERIFY0(dmu_buf_hold(ZTOZSB(zp)->z_os, zp->z_id, 0, 
FTAG, &dbp, + DMU_READ_NO_PREFETCH)); + + dmu_buf_will_dirty(dbp, tx); + + ASSERT3U(len, <=, dbp->db_size); + bcopy(link, dbp->db_data, len); + dmu_buf_rele(dbp, FTAG); + } +} + +void +zfs_sa_get_scanstamp(znode_t *zp, xvattr_t *xvap) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + xoptattr_t *xoap; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) { + if (sa_lookup(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp)) != 0) + return; + } else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + if (!(zp->z_pflags & ZFS_BONUS_SCANSTAMP)) + return; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + + if (len <= doi.doi_bonus_size) { + (void) memcpy(xoap->xoa_av_scanstamp, + (caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + sizeof (xoap->xoa_av_scanstamp)); + } + } + XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); +} + +void +zfs_sa_set_scanstamp(znode_t *zp, xvattr_t *xvap, dmu_tx_t *tx) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + xoptattr_t *xoap; + + ASSERT(MUTEX_HELD(&zp->z_lock)); + VERIFY((xoap = xva_getxoptattr(xvap)) != NULL); + if (zp->z_is_sa) + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_SCANSTAMP(zfsvfs), + &xoap->xoa_av_scanstamp, + sizeof (xoap->xoa_av_scanstamp), tx)); + else { + dmu_object_info_t doi; + dmu_buf_t *db = sa_get_db(zp->z_sa_hdl); + int len; + + sa_object_info(zp->z_sa_hdl, &doi); + len = sizeof (xoap->xoa_av_scanstamp) + + ZFS_OLD_ZNODE_PHYS_SIZE; + if (len > doi.doi_bonus_size) + VERIFY(dmu_set_bonus(db, len, tx) == 0); + (void) memcpy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + xoap->xoa_av_scanstamp, sizeof (xoap->xoa_av_scanstamp)); + + zp->z_pflags |= ZFS_BONUS_SCANSTAMP; + VERIFY(0 == sa_update(zp->z_sa_hdl, SA_ZPL_FLAGS(zfsvfs), + &zp->z_pflags, sizeof (uint64_t), tx)); + } +} + +int +zfs_sa_get_xattr(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + char *obj; + int size; + int error; + + ASSERT(RW_LOCK_HELD(&zp->z_xattr_lock)); + ASSERT(!zp->z_xattr_cached); + ASSERT(zp->z_is_sa); + + error = sa_size(zp->z_sa_hdl, SA_ZPL_DXATTR(zfsvfs), &size); + if (error) { + if (error == ENOENT) + return nvlist_alloc(&zp->z_xattr_cached, + NV_UNIQUE_NAME, KM_SLEEP); + else + return (error); + } + + obj = vmem_alloc(size, KM_SLEEP); + + error = sa_lookup(zp->z_sa_hdl, SA_ZPL_DXATTR(zfsvfs), obj, size); + if (error == 0) + error = nvlist_unpack(obj, size, &zp->z_xattr_cached, KM_SLEEP); + + vmem_free(obj, size); + + return (error); +} + +int +zfs_sa_set_xattr(znode_t *zp) +{ + zfsvfs_t *zfsvfs = ZTOZSB(zp); + dmu_tx_t *tx; + char *obj; + size_t size; + int error; + + ASSERT(RW_WRITE_HELD(&zp->z_xattr_lock)); + ASSERT(zp->z_xattr_cached); + ASSERT(zp->z_is_sa); + + error = nvlist_size(zp->z_xattr_cached, &size, NV_ENCODE_XDR); + if ((error == 0) && (size > SA_ATTR_MAX_LEN)) + error = SET_ERROR(EFBIG); + if (error) + goto out; + + obj = vmem_alloc(size, KM_SLEEP); + + error = nvlist_pack(zp->z_xattr_cached, &obj, &size, + NV_ENCODE_XDR, KM_SLEEP); + if (error) + goto out_free; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_sa_create(tx, size); + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + int count = 0; + sa_bulk_attr_t bulk[2]; + uint64_t ctime[2]; + + zfs_tstamp_update_setup(zp, STATE_CHANGED, NULL, ctime); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_DXATTR(zfsvfs), + NULL, obj, size); + SA_ADD_BULK_ATTR(bulk, 
count, SA_ZPL_CTIME(zfsvfs), + NULL, &ctime, 16); + VERIFY0(sa_bulk_update(zp->z_sa_hdl, bulk, count, tx)); + + dmu_tx_commit(tx); + } +out_free: + vmem_free(obj, size); +out: + return (error); +} + +/* + * I'm not convinced we should do any of this upgrade, + * since the SA code can read both old/new znode formats + * with probably little to no performance difference. + * + * All new files will be created with the new format. + */ + +void +zfs_sa_upgrade(sa_handle_t *hdl, dmu_tx_t *tx) +{ + dmu_buf_t *db = sa_get_db(hdl); + znode_t *zp = sa_get_userdata(hdl); + zfsvfs_t *zfsvfs = ZTOZSB(zp); + int count = 0; + sa_bulk_attr_t *bulk, *sa_attrs; + zfs_acl_locator_cb_t locate = { 0 }; + uint64_t uid, gid, mode, rdev, xattr, parent, tmp_gen; + uint64_t crtime[2], mtime[2], ctime[2], atime[2]; + uint64_t links; + zfs_acl_phys_t znode_acl; + char scanstamp[AV_SCANSTAMP_SZ]; + boolean_t drop_lock = B_FALSE; + + /* + * No upgrade if the ACL isn't cached, since we won't know which + * locks are held, and reading the ACL would require special + * "locked" interfaces that would be messy. + */ + if (zp->z_acl_cached == NULL || Z_ISLNK(ZTOTYPE(zp))) + return; + + /* + * If the z_lock is held and we aren't the owner + * then just return, since we don't want to deadlock + * trying to update the status of z_is_sa. This + * file can then be upgraded at a later time. + * + * Otherwise, we know we are doing the + * sa_update() that caused us to enter this function. + */ + if (MUTEX_NOT_HELD(&zp->z_lock)) { + if (mutex_tryenter(&zp->z_lock) == 0) + return; + else + drop_lock = B_TRUE; + } + + /* First do a bulk query of the attributes that aren't cached */ + bulk = kmem_alloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, &mtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_CRTIME(zfsvfs), NULL, &crtime, 16); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_PARENT(zfsvfs), NULL, &parent, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_XATTR(zfsvfs), NULL, &xattr, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_RDEV(zfsvfs), NULL, &rdev, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_GEN(zfsvfs), NULL, &tmp_gen, 8); + SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ZNODE_ACL(zfsvfs), NULL, + &znode_acl, 88); + + if (sa_bulk_lookup_locked(hdl, bulk, count) != 0) + goto done; + + if (dmu_objset_projectquota_enabled(hdl->sa_os) && + !(zp->z_pflags & ZFS_PROJID)) { + zp->z_pflags |= ZFS_PROJID; + zp->z_projid = ZFS_DEFAULT_PROJID; + } + + /* + * While the order here doesn't matter, it's best to try and organize + * it in such a way as to pick up an already existing layout number + */ + count = 0; + sa_attrs = kmem_zalloc(sizeof (sa_bulk_attr_t) * ZPL_END, KM_SLEEP); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MODE(zfsvfs), NULL, &mode, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SIZE(zfsvfs), NULL, + &zp->z_size, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GEN(zfsvfs), + NULL, &tmp_gen, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_UID(zfsvfs), NULL, &uid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_GID(zfsvfs), NULL, &gid, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PARENT(zfsvfs), + NULL, &parent, 8); + SA_ADD_BULK_ATTR(sa_attrs, count,
SA_ZPL_FLAGS(zfsvfs), NULL, + &zp->z_pflags, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_ATIME(zfsvfs), NULL, + &atime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_MTIME(zfsvfs), NULL, + &mtime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CTIME(zfsvfs), NULL, + &ctime, 16); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_CRTIME(zfsvfs), NULL, + &crtime, 16); + links = ZTONLNK(zp); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_LINKS(zfsvfs), NULL, + &links, 8); + if (dmu_objset_projectquota_enabled(hdl->sa_os)) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_PROJID(zfsvfs), NULL, + &zp->z_projid, 8); + if (Z_ISBLK(ZTOTYPE(zp)) || Z_ISCHR(ZTOTYPE(zp))) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_RDEV(zfsvfs), NULL, + &rdev, 8); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_COUNT(zfsvfs), NULL, + &zp->z_acl_cached->z_acl_count, 8); + + if (zp->z_acl_cached->z_version < ZFS_ACL_VERSION_FUID) + zfs_acl_xform(zp, zp->z_acl_cached, CRED()); + + locate.cb_aclp = zp->z_acl_cached; + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_DACL_ACES(zfsvfs), + zfs_acl_data_locator, &locate, zp->z_acl_cached->z_acl_bytes); + + if (xattr) + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_XATTR(zfsvfs), + NULL, &xattr, 8); + + /* if scanstamp then add scanstamp */ + + if (zp->z_pflags & ZFS_BONUS_SCANSTAMP) { + bcopy((caddr_t)db->db_data + ZFS_OLD_ZNODE_PHYS_SIZE, + scanstamp, AV_SCANSTAMP_SZ); + SA_ADD_BULK_ATTR(sa_attrs, count, SA_ZPL_SCANSTAMP(zfsvfs), + NULL, scanstamp, AV_SCANSTAMP_SZ); + zp->z_pflags &= ~ZFS_BONUS_SCANSTAMP; + } + + VERIFY(dmu_set_bonustype(db, DMU_OT_SA, tx) == 0); + VERIFY(sa_replace_all_by_template_locked(hdl, sa_attrs, + count, tx) == 0); + if (znode_acl.z_acl_extern_obj) + VERIFY(0 == dmu_object_free(zfsvfs->z_os, + znode_acl.z_acl_extern_obj, tx)); + + zp->z_is_sa = B_TRUE; + kmem_free(sa_attrs, sizeof (sa_bulk_attr_t) * ZPL_END); +done: + kmem_free(bulk, sizeof (sa_bulk_attr_t) * ZPL_END); + if (drop_lock) + mutex_exit(&zp->z_lock); +} + +void +zfs_sa_upgrade_txholds(dmu_tx_t *tx, znode_t *zp) +{ + if (!ZTOZSB(zp)->z_use_sa || zp->z_is_sa) + return; + + + dmu_tx_hold_sa(tx, zp->z_sa_hdl, B_TRUE); + + if (zfs_external_acl(zp)) { + dmu_tx_hold_free(tx, zfs_external_acl(zp), 0, + DMU_OBJECT_END); + } +} + +EXPORT_SYMBOL(zfs_attr_table); +EXPORT_SYMBOL(zfs_sa_readlink); +EXPORT_SYMBOL(zfs_sa_symlink); +EXPORT_SYMBOL(zfs_sa_get_scanstamp); +EXPORT_SYMBOL(zfs_sa_set_scanstamp); +EXPORT_SYMBOL(zfs_sa_get_xattr); +EXPORT_SYMBOL(zfs_sa_set_xattr); +EXPORT_SYMBOL(zfs_sa_upgrade); +EXPORT_SYMBOL(zfs_sa_upgrade_txholds); + +#endif diff --git a/module/zfs/zil.c b/module/zfs/zil.c new file mode 100644 index 000000000000..9dc20ba14f37 --- /dev/null +++ b/module/zfs/zil.c @@ -0,0 +1,3688 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2011, 2018 by Delphix. All rights reserved. + * Copyright (c) 2014 Integros [integros.com] + * Copyright (c) 2018 Datto Inc. + */ + +/* Portions Copyright 2010 Robert Milkowski */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * The ZFS Intent Log (ZIL) saves "transaction records" (itxs) of system + * calls that change the file system. Each itx has enough information to + * be able to replay them after a system crash, power loss, or + * equivalent failure mode. These are stored in memory until either: + * + * 1. they are committed to the pool by the DMU transaction group + * (txg), at which point they can be discarded; or + * 2. they are committed to the on-disk ZIL for the dataset being + * modified (e.g. due to an fsync, O_DSYNC, or other synchronous + * requirement). + * + * In the event of a crash or power loss, the itxs contained by each + * dataset's on-disk ZIL will be replayed when that dataset is first + * instantiated (e.g. if the dataset is a normal filesystem, when it is + * first mounted). + * + * As hinted at above, there is one ZIL per dataset (both the in-memory + * representation, and the on-disk representation). The on-disk format + * consists of 3 parts: + * + * - a single, per-dataset, ZIL header; which points to a chain of + * - zero or more ZIL blocks; each of which contains + * - zero or more ZIL records + * + * A ZIL record holds the information necessary to replay a single + * system call transaction. A ZIL block can hold many ZIL records, and + * the blocks are chained together, similarly to a singly linked list. + * + * Each ZIL block contains a block pointer (blkptr_t) to the next ZIL + * block in the chain, and the ZIL header points to the first block in + * the chain. + * + * Note, there is not a fixed place in the pool to hold these ZIL + * blocks; they are dynamically allocated and freed as needed from the + * blocks available on the pool, though they can be preferentially + * allocated from a dedicated "log" vdev. + */ + +/* + * This controls the amount of time that a ZIL block (lwb) will remain + * "open" when it isn't "full", and it has a thread waiting for it to be + * committed to stable storage. Please refer to the zil_commit_waiter() + * function (and the comments within it) for more details. + */ +int zfs_commit_timeout_pct = 5; + +/* + * See zil.h for more information about these fields. 
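 */

/*
 * Illustration only -- not part of this import.  A minimal userland-style
 * sketch of the chained layout described above: the header leads to the
 * first block, each block points at the next, and records are packed back
 * to back inside each block.  All toy_* names are hypothetical stand-ins
 * for the real zil_header_t / blkptr_t / lr_t machinery.
 */
typedef struct toy_zil_rec {
	uint64_t tr_reclen;		/* total record length, header included */
} toy_zil_rec_t;

typedef struct toy_zil_blk {
	struct toy_zil_blk *tb_next;	/* stands in for the next-block blkptr_t */
	char *tb_data;			/* records, packed back to back */
	uint64_t tb_nused;		/* bytes of tb_data in use */
} toy_zil_blk_t;

/* Visit every record in the chain, oldest block first. */
static void
toy_zil_walk(toy_zil_blk_t *head, void (*visit)(const toy_zil_rec_t *))
{
	toy_zil_blk_t *b;

	for (b = head; b != NULL; b = b->tb_next) {
		uint64_t off = 0;

		while (off < b->tb_nused) {
			const toy_zil_rec_t *r =
			    (const toy_zil_rec_t *)(b->tb_data + off);

			visit(r);
			off += r->tr_reclen;
		}
	}
}

/* Illustration ends.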
+ */ +zil_stats_t zil_stats = { + { "zil_commit_count", KSTAT_DATA_UINT64 }, + { "zil_commit_writer_count", KSTAT_DATA_UINT64 }, + { "zil_itx_count", KSTAT_DATA_UINT64 }, + { "zil_itx_indirect_count", KSTAT_DATA_UINT64 }, + { "zil_itx_indirect_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_copied_count", KSTAT_DATA_UINT64 }, + { "zil_itx_copied_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_needcopy_count", KSTAT_DATA_UINT64 }, + { "zil_itx_needcopy_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_count", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_normal_bytes", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_count", KSTAT_DATA_UINT64 }, + { "zil_itx_metaslab_slog_bytes", KSTAT_DATA_UINT64 }, +}; + +static kstat_t *zil_ksp; + +/* + * Disable intent logging replay. This global ZIL switch affects all pools. + */ +int zil_replay_disable = 0; + +/* + * Disable the DKIOCFLUSHWRITECACHE commands that are normally sent to + * the disk(s) by the ZIL after an LWB write has completed. Setting this + * will cause ZIL corruption on power loss if a volatile out-of-order + * write cache is enabled. + */ +int zil_nocacheflush = 0; + +/* + * Limit SLOG write size per commit executed with synchronous priority. + * Any writes above that will be executed with lower (asynchronous) priority + * to limit potential SLOG device abuse by single active ZIL writer. + */ +unsigned long zil_slog_bulk = 768 * 1024; + +static kmem_cache_t *zil_lwb_cache; +static kmem_cache_t *zil_zcw_cache; + +#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ + sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) + +static int +zil_bp_compare(const void *x1, const void *x2) +{ + const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; + const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; + + int cmp = TREE_CMP(DVA_GET_VDEV(dva1), DVA_GET_VDEV(dva2)); + if (likely(cmp)) + return (cmp); + + return (TREE_CMP(DVA_GET_OFFSET(dva1), DVA_GET_OFFSET(dva2))); +} + +static void +zil_bp_tree_init(zilog_t *zilog) +{ + avl_create(&zilog->zl_bp_tree, zil_bp_compare, + sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); +} + +static void +zil_bp_tree_fini(zilog_t *zilog) +{ + avl_tree_t *t = &zilog->zl_bp_tree; + zil_bp_node_t *zn; + void *cookie = NULL; + + while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zn, sizeof (zil_bp_node_t)); + + avl_destroy(t); +} + +int +zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) +{ + avl_tree_t *t = &zilog->zl_bp_tree; + const dva_t *dva; + zil_bp_node_t *zn; + avl_index_t where; + + if (BP_IS_EMBEDDED(bp)) + return (0); + + dva = BP_IDENTITY(bp); + + if (avl_find(t, dva, &where) != NULL) + return (SET_ERROR(EEXIST)); + + zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); + zn->zn_dva = *dva; + avl_insert(t, zn, where); + + return (0); +} + +static zil_header_t * +zil_header_in_syncing_context(zilog_t *zilog) +{ + return ((zil_header_t *)zilog->zl_header); +} + +static void +zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) +{ + zio_cksum_t *zc = &bp->blk_cksum; + + zc->zc_word[ZIL_ZC_GUID_0] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_GUID_1] = spa_get_random(-1ULL); + zc->zc_word[ZIL_ZC_OBJSET] = dmu_objset_id(zilog->zl_os); + zc->zc_word[ZIL_ZC_SEQ] = 1ULL; +} + +/* + * Read a log block and make sure it's valid. 
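 */

/*
 * Illustration only -- not part of this import.  The chain-link check
 * performed by zil_read_log_block() below: the expected self-checksum of
 * the next block is this block's checksum with the sequence word bumped
 * by one (seeded by zil_init_log_chain() above), and any mismatch is
 * treated as the end of the log (ECKSUM).  The toy_* name is
 * hypothetical.
 */
static boolean_t
toy_zil_link_ok(const blkptr_t *bp, const blkptr_t *stored_next)
{
	zio_cksum_t expect = bp->blk_cksum;

	expect.zc_word[ZIL_ZC_SEQ]++;	/* sequence numbers are sequential */

	return (bcmp(&expect, &stored_next->blk_cksum,
	    sizeof (expect)) == 0 ? B_TRUE : B_FALSE);
}

/* Illustration ends.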
+ */ +static int +zil_read_log_block(zilog_t *zilog, boolean_t decrypt, const blkptr_t *bp, + blkptr_t *nbp, void *dst, char **end) +{ + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_phys_t zb; + int error; + + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + + if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + zio_flags |= ZIO_FLAG_SPECULATIVE; + + if (!decrypt) + zio_flags |= ZIO_FLAG_RAW; + + SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, + &abuf, ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + if (error == 0) { + zio_cksum_t cksum = bp->blk_cksum; + + /* + * Validate the checksummed log block. + * + * Sequence numbers should be... sequential. The checksum + * verifier for the next block should be bp's checksum plus 1. + * + * Also check the log chain linkage and size used. + */ + cksum.zc_word[ZIL_ZC_SEQ]++; + + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = abuf->b_data; + char *lr = (char *)(zilc + 1); + uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); + + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + error = SET_ERROR(ECKSUM); + } else { + ASSERT3U(len, <=, SPA_OLD_MAXBLOCKSIZE); + bcopy(lr, dst, len); + *end = (char *)dst + len; + *nbp = zilc->zc_next_blk; + } + } else { + char *lr = abuf->b_data; + uint64_t size = BP_GET_LSIZE(bp); + zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; + + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + (zilc->zc_nused > (size - sizeof (*zilc)))) { + error = SET_ERROR(ECKSUM); + } else { + ASSERT3U(zilc->zc_nused, <=, + SPA_OLD_MAXBLOCKSIZE); + bcopy(lr, dst, zilc->zc_nused); + *end = (char *)dst + zilc->zc_nused; + *nbp = zilc->zc_next_blk; + } + } + + arc_buf_destroy(abuf, &abuf); + } + + return (error); +} + +/* + * Read a TX_WRITE log data block. + */ +static int +zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) +{ + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + const blkptr_t *bp = &lr->lr_blkptr; + arc_flags_t aflags = ARC_FLAG_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_phys_t zb; + int error; + + if (BP_IS_HOLE(bp)) { + if (wbuf != NULL) + bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); + return (0); + } + + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + + /* + * If we are not using the resulting data, we are just checking that + * it hasn't been corrupted so we don't need to waste CPU time + * decompressing and decrypting it. + */ + if (wbuf == NULL) + zio_flags |= ZIO_FLAG_RAW; + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + + error = arc_read(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + if (error == 0) { + if (wbuf != NULL) + bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + arc_buf_destroy(abuf, &abuf); + } + + return (error); +} + +/* + * Parse the intent log, and call parse_func for each valid record within. 
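 */

/*
 * Illustration only -- not part of this import.  How parse callbacks use
 * zil_bp_tree_add() above to guard against processing a block twice;
 * compare zil_claim_log_block() and zil_free_log_record() below, which
 * follow this same pattern.  The toy_* name is hypothetical.
 */
static int
toy_parse_blk(zilog_t *zilog, const blkptr_t *bp)
{
	/* A nonzero (EEXIST) return means this DVA was already seen. */
	if (zil_bp_tree_add(zilog, bp) != 0)
		return (0);

	/* ... first sighting: claim, free, or count the block here ... */
	return (0);
}

/* Illustration ends.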
+ */ +int +zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, + zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg, + boolean_t decrypt) +{ + const zil_header_t *zh = zilog->zl_header; + boolean_t claimed = !!zh->zh_claim_txg; + uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; + uint64_t claim_lr_seq = claimed ? zh->zh_claim_lr_seq : UINT64_MAX; + uint64_t max_blk_seq = 0; + uint64_t max_lr_seq = 0; + uint64_t blk_count = 0; + uint64_t lr_count = 0; + blkptr_t blk, next_blk; + char *lrbuf, *lrp; + int error = 0; + + bzero(&next_blk, sizeof (blkptr_t)); + + /* + * Old logs didn't record the maximum zh_claim_lr_seq. + */ + if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + claim_lr_seq = UINT64_MAX; + + /* + * Starting at the block pointed to by zh_log we read the log chain. + * For each block in the chain we strongly check that block to + * ensure its validity. We stop when an invalid block is found. + * For each block pointer in the chain we call parse_blk_func(). + * For each record in each valid block we call parse_lr_func(). + * If the log has been claimed, stop if we encounter a sequence + * number greater than the highest claimed sequence number. + */ + lrbuf = zio_buf_alloc(SPA_OLD_MAXBLOCKSIZE); + zil_bp_tree_init(zilog); + + for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { + uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + int reclen; + char *end = NULL; + + if (blk_seq > claim_blk_seq) + break; + + error = parse_blk_func(zilog, &blk, arg, txg); + if (error != 0) + break; + ASSERT3U(max_blk_seq, <, blk_seq); + max_blk_seq = blk_seq; + blk_count++; + + if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) + break; + + error = zil_read_log_block(zilog, decrypt, &blk, &next_blk, + lrbuf, &end); + if (error != 0) + break; + + for (lrp = lrbuf; lrp < end; lrp += reclen) { + lr_t *lr = (lr_t *)lrp; + reclen = lr->lrc_reclen; + ASSERT3U(reclen, >=, sizeof (lr_t)); + if (lr->lrc_seq > claim_lr_seq) + goto done; + + error = parse_lr_func(zilog, lr, arg, txg); + if (error != 0) + goto done; + ASSERT3U(max_lr_seq, <, lr->lrc_seq); + max_lr_seq = lr->lrc_seq; + lr_count++; + } + } +done: + zilog->zl_parse_error = error; + zilog->zl_parse_blk_seq = max_blk_seq; + zilog->zl_parse_lr_seq = max_lr_seq; + zilog->zl_parse_blk_count = blk_count; + zilog->zl_parse_lr_count = lr_count; + + ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || + (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq) || + (decrypt && error == EIO)); + + zil_bp_tree_fini(zilog); + zio_buf_free(lrbuf, SPA_OLD_MAXBLOCKSIZE); + + return (error); +} + +/* ARGSUSED */ +static int +zil_clear_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + ASSERT(!BP_IS_HOLE(bp)); + + /* + * As we call this function from the context of a rewind to a + * checkpoint, each ZIL block whose txg is later than the txg + * that we rewind to is invalid. Thus, we return -1 so + * zil_parse() doesn't attempt to read it. + */ + if (bp->blk_birth >= first_txg) + return (-1); + + if (zil_bp_tree_add(zilog, bp) != 0) + return (0); + + zio_free(zilog->zl_spa, first_txg, bp); + return (0); +} + +/* ARGSUSED */ +static int +zil_noop_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + return (0); +} + +static int +zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) +{ + /* + * Claim log block if not already committed and not already claimed. + * If tx == NULL, just verify that the block is claimable. 
+ */ + if (BP_IS_HOLE(bp) || bp->blk_birth < first_txg || + zil_bp_tree_add(zilog, bp) != 0) + return (0); + + return (zio_wait(zio_claim(NULL, zilog->zl_spa, + tx == NULL ? 0 : first_txg, bp, spa_claim_notify, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); +} + +static int +zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) +{ + lr_write_t *lr = (lr_write_t *)lrc; + int error; + + if (lrc->lrc_txtype != TX_WRITE) + return (0); + + /* + * If the block is not readable, don't claim it. This can happen + * in normal operation when a log block is written to disk before + * some of the dmu_sync() blocks it points to. In this case, the + * transaction cannot have been committed to anyone (we would have + * waited for all writes to be stable first), so it is semantically + * correct to declare this the end of the log. + */ + if (lr->lr_blkptr.blk_birth >= first_txg) { + error = zil_read_log_data(zilog, lr, NULL); + if (error != 0) + return (error); + } + + return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); +} + +/* ARGSUSED */ +static int +zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) +{ + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); +} + +static int +zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) +{ + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + + /* + * If we previously claimed it, we need to free it. + */ + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && + bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0 && + !BP_IS_HOLE(bp)) + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); +} + +static int +zil_lwb_vdev_compare(const void *x1, const void *x2) +{ + const uint64_t v1 = ((zil_vdev_node_t *)x1)->zv_vdev; + const uint64_t v2 = ((zil_vdev_node_t *)x2)->zv_vdev; + + return (TREE_CMP(v1, v2)); +} + +static lwb_t * +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, boolean_t slog, uint64_t txg, + boolean_t fastwrite) +{ + lwb_t *lwb; + + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = *bp; + lwb->lwb_fastwrite = fastwrite; + lwb->lwb_slog = slog; + lwb->lwb_state = LWB_STATE_CLOSED; + lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_max_txg = txg; + lwb->lwb_write_zio = NULL; + lwb->lwb_root_zio = NULL; + lwb->lwb_tx = NULL; + lwb->lwb_issued_timestamp = 0; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); + } + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + VERIFY(list_is_empty(&lwb->lwb_itxs)); + + return (lwb); +} + +static void +zil_free_lwb(zilog_t *zilog, lwb_t *lwb) +{ + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + ASSERT(!MUTEX_HELD(&lwb->lwb_vdev_lock)); + VERIFY(list_is_empty(&lwb->lwb_waiters)); + VERIFY(list_is_empty(&lwb->lwb_itxs)); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + ASSERT3P(lwb->lwb_write_zio, ==, NULL); + ASSERT3P(lwb->lwb_root_zio, ==, NULL); + ASSERT3U(lwb->lwb_max_txg, <=, spa_syncing_txg(zilog->zl_spa)); + ASSERT(lwb->lwb_state == LWB_STATE_CLOSED || + lwb->lwb_state == LWB_STATE_FLUSH_DONE); + + /* + * Clear the zilog's field to indicate this lwb is no longer + 
* valid, and prevent use-after-free errors. + */ + if (zilog->zl_last_lwb_opened == lwb) + zilog->zl_last_lwb_opened = NULL; + + kmem_cache_free(zil_lwb_cache, lwb); +} + +/* + * Called when we create in-memory log transactions so that we know + * to cleanup the itxs at the end of spa_sync(). + */ +static void +zilog_dirty(zilog_t *zilog, uint64_t txg) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + dsl_dataset_t *ds = dmu_objset_ds(zilog->zl_os); + + ASSERT(spa_writeable(zilog->zl_spa)); + + if (ds->ds_is_snapshot) + panic("dirtying snapshot!"); + + if (txg_list_add(&dp->dp_dirty_zilogs, zilog, txg)) { + /* up the hold count until we can be written out */ + dmu_buf_add_ref(ds->ds_dbuf, zilog); + + zilog->zl_dirty_max_txg = MAX(txg, zilog->zl_dirty_max_txg); + } +} + +/* + * Determine if the zil is dirty in the specified txg. Callers wanting to + * ensure that the dirty state does not change must hold the itxg_lock for + * the specified txg. Holding the lock will ensure that the zil cannot be + * dirtied (zil_itx_assign) or cleaned (zil_clean) while we check its current + * state. + */ +static boolean_t __maybe_unused +zilog_is_dirty_in_txg(zilog_t *zilog, uint64_t txg) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + + if (txg_list_member(&dp->dp_dirty_zilogs, zilog, txg & TXG_MASK)) + return (B_TRUE); + return (B_FALSE); +} + +/* + * Determine if the zil is dirty. The zil is considered dirty if it has + * any pending itx records that have not been cleaned by zil_clean(). + */ +static boolean_t +zilog_is_dirty(zilog_t *zilog) +{ + dsl_pool_t *dp = zilog->zl_dmu_pool; + + for (int t = 0; t < TXG_SIZE; t++) { + if (txg_list_member(&dp->dp_dirty_zilogs, zilog, t)) + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Create an on-disk intent log. + */ +static lwb_t * +zil_create(zilog_t *zilog) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb = NULL; + uint64_t txg = 0; + dmu_tx_t *tx = NULL; + blkptr_t blk; + int error = 0; + boolean_t fastwrite = FALSE; + boolean_t slog = FALSE; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + ASSERT(zh->zh_claim_txg == 0); + ASSERT(zh->zh_replay_seq == 0); + + blk = zh->zh_log; + + /* + * Allocate an initial log block if: + * - there isn't one already + * - the existing block is the wrong endianness + */ + if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) { + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + if (!BP_IS_HOLE(&blk)) { + zio_free(zilog->zl_spa, txg, &blk); + BP_ZERO(&blk); + } + + error = zio_alloc_zil(zilog->zl_spa, zilog->zl_os, txg, &blk, + ZIL_MIN_BLKSZ, &slog); + fastwrite = TRUE; + + if (error == 0) + zil_init_log_chain(zilog, &blk); + } + + /* + * Allocate a log write block (lwb) for the first log block. + */ + if (error == 0) + lwb = zil_alloc_lwb(zilog, &blk, slog, txg, fastwrite); + + /* + * If we just allocated the first log block, commit our transaction + * and wait for zil_sync() to stuff the block pointer into zh_log. + * (zh is part of the MOS, so we cannot modify it in open context.) + */ + if (tx != NULL) { + dmu_tx_commit(tx); + txg_wait_synced(zilog->zl_dmu_pool, txg); + } + + ASSERT(error != 0 || bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0); + IMPLY(error == 0, lwb != NULL); + + return (lwb); +} + +/* + * In one tx, free all log blocks and clear the log header. If keep_first + * is set, then we're replaying a log with no content. 
We want to keep the + * first block, however, so that the first synchronous transaction doesn't + * require a txg_wait_synced() in zil_create(). We don't need to + * txg_wait_synced() here either when keep_first is set, because both + * zil_create() and zil_destroy() will wait for any in-progress destroys + * to complete. + */ +void +zil_destroy(zilog_t *zilog, boolean_t keep_first) +{ + const zil_header_t *zh = zilog->zl_header; + lwb_t *lwb; + dmu_tx_t *tx; + uint64_t txg; + + /* + * Wait for any previous destroy to complete. + */ + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + + zilog->zl_old_header = *zh; /* debugging aid */ + + if (BP_IS_HOLE(&zh->zh_log)) + return; + + tx = dmu_tx_create(zilog->zl_os); + VERIFY0(dmu_tx_assign(tx, TXG_WAIT)); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + mutex_enter(&zilog->zl_lock); + + ASSERT3U(zilog->zl_destroy_txg, <, txg); + zilog->zl_destroy_txg = txg; + zilog->zl_keep_first = keep_first; + + if (!list_is_empty(&zilog->zl_lwb_list)) { + ASSERT(zh->zh_claim_txg == 0); + VERIFY(!keep_first); + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { + if (lwb->lwb_fastwrite) + metaslab_fastwrite_unmark(zilog->zl_spa, + &lwb->lwb_blk); + + list_remove(&zilog->zl_lwb_list, lwb); + if (lwb->lwb_buf != NULL) + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zio_free(zilog->zl_spa, txg, &lwb->lwb_blk); + zil_free_lwb(zilog, lwb); + } + } else if (!keep_first) { + zil_destroy_sync(zilog, tx); + } + mutex_exit(&zilog->zl_lock); + + dmu_tx_commit(tx); +} + +void +zil_destroy_sync(zilog_t *zilog, dmu_tx_t *tx) +{ + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + (void) zil_parse(zilog, zil_free_log_block, + zil_free_log_record, tx, zilog->zl_header->zh_claim_txg, B_FALSE); +} + +int +zil_claim(dsl_pool_t *dp, dsl_dataset_t *ds, void *txarg) +{ + dmu_tx_t *tx = txarg; + zilog_t *zilog; + uint64_t first_txg; + zil_header_t *zh; + objset_t *os; + int error; + + error = dmu_objset_own_obj(dp, ds->ds_object, + DMU_OST_ANY, B_FALSE, B_FALSE, FTAG, &os); + if (error != 0) { + /* + * EBUSY indicates that the objset is inconsistent, in which + * case it can not have a ZIL. + */ + if (error != EBUSY) { + cmn_err(CE_WARN, "can't open objset for %llu, error %u", + (unsigned long long)ds->ds_object, error); + } + + return (0); + } + + zilog = dmu_objset_zil(os); + zh = zil_header_in_syncing_context(zilog); + ASSERT3U(tx->tx_txg, ==, spa_first_txg(zilog->zl_spa)); + first_txg = spa_min_claim_txg(zilog->zl_spa); + + /* + * If the spa_log_state is not set to be cleared, check whether + * the current uberblock is a checkpoint one and if the current + * header has been claimed before moving on. + * + * If the current uberblock is a checkpointed uberblock then + * one of the following scenarios took place: + * + * 1] We are currently rewinding to the checkpoint of the pool. + * 2] We crashed in the middle of a checkpoint rewind but we + * did manage to write the checkpointed uberblock to the + * vdev labels, so when we tried to import the pool again + * the checkpointed uberblock was selected from the import + * procedure. + * + * In both cases we want to zero out all the ZIL blocks, except + * the ones that have been claimed at the time of the checkpoint + * (their zh_claim_txg != 0). The reason is that these blocks + * may be corrupted since we may have reused their locations on + * disk after we took the checkpoint. 
+ * + * We could try to set spa_log_state to SPA_LOG_CLEAR earlier + * when we first figure out whether the current uberblock is + * checkpointed or not. Unfortunately, that would discard all + * the logs, including the ones that are claimed, and we would + * leak space. + */ + if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR || + (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0)) { + if (!BP_IS_HOLE(&zh->zh_log)) { + (void) zil_parse(zilog, zil_clear_log_block, + zil_noop_log_record, tx, first_txg, B_FALSE); + } + BP_ZERO(&zh->zh_log); + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_objset_disown(os, B_FALSE, FTAG); + return (0); + } + + /* + * If we are not rewinding and opening the pool normally, then + * the min_claim_txg should be equal to the first txg of the pool. + */ + ASSERT3U(first_txg, ==, spa_first_txg(zilog->zl_spa)); + + /* + * Claim all log blocks if we haven't already done so, and remember + * the highest claimed sequence number. This ensures that if we can + * read only part of the log now (e.g. due to a missing device), + * but we can read the entire log later, we will not try to replay + * or destroy beyond the last block we successfully claimed. + */ + ASSERT3U(zh->zh_claim_txg, <=, first_txg); + if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { + (void) zil_parse(zilog, zil_claim_log_block, + zil_claim_log_record, tx, first_txg, B_FALSE); + zh->zh_claim_txg = first_txg; + zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; + zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; + if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) + zh->zh_flags |= ZIL_REPLAY_NEEDED; + zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; + if (os->os_encrypted) + os->os_next_write_raw[tx->tx_txg & TXG_MASK] = B_TRUE; + dsl_dataset_dirty(dmu_objset_ds(os), tx); + } + + ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); + dmu_objset_disown(os, B_FALSE, FTAG); + return (0); +} + +/* + * Check the log by walking the log chain. + * Checksum errors are ok as they indicate the end of the chain. + * Any other error (no device or read failure) returns an error. + */ +/* ARGSUSED */ +int +zil_check_log_chain(dsl_pool_t *dp, dsl_dataset_t *ds, void *tx) +{ + zilog_t *zilog; + objset_t *os; + blkptr_t *bp; + int error; + + ASSERT(tx == NULL); + + error = dmu_objset_from_ds(ds, &os); + if (error != 0) { + cmn_err(CE_WARN, "can't open objset %llu, error %d", + (unsigned long long)ds->ds_object, error); + return (0); + } + + zilog = dmu_objset_zil(os); + bp = (blkptr_t *)&zilog->zl_header->zh_log; + + if (!BP_IS_HOLE(bp)) { + vdev_t *vd; + boolean_t valid = B_TRUE; + + /* + * Check the first block and determine if it's on a log device + * which may have been removed or faulted prior to loading this + * pool. If so, there's no point in checking the rest of the + * log as its content should have already been synced to the + * pool. + */ + spa_config_enter(os->os_spa, SCL_STATE, FTAG, RW_READER); + vd = vdev_lookup_top(os->os_spa, DVA_GET_VDEV(&bp->blk_dva[0])); + if (vd->vdev_islog && vdev_is_dead(vd)) + valid = vdev_log_state_valid(vd); + spa_config_exit(os->os_spa, SCL_STATE, FTAG); + + if (!valid) + return (0); + + /* + * Check whether the current uberblock is checkpointed (e.g. + * we are rewinding) and whether the current header has been + * claimed or not. If it hasn't then skip verifying it. 
We + * do this because its ZIL blocks may be part of the pool's + * state before the rewind, which is no longer valid. + */ + zil_header_t *zh = zil_header_in_syncing_context(zilog); + if (zilog->zl_spa->spa_uberblock.ub_checkpoint_txg != 0 && + zh->zh_claim_txg == 0) + return (0); + } + + /* + * Because tx == NULL, zil_claim_log_block() will not actually claim + * any blocks, but just determine whether it is possible to do so. + * In addition to checking the log chain, zil_claim_log_block() + * will invoke zio_claim() with a done func of spa_claim_notify(), + * which will update spa_max_claim_txg. See spa_load() for details. + */ + error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, + zilog->zl_header->zh_claim_txg ? -1ULL : + spa_min_claim_txg(os->os_spa), B_FALSE); + + return ((error == ECKSUM || error == ENOENT) ? 0 : error); +} + +/* + * When an itx is "skipped", this function is used to properly mark the + * waiter as "done", and signal any thread(s) waiting on it. An itx can + * be skipped (and not committed to an lwb) for a variety of reasons, + * one of them being that the itx was committed via spa_sync(), prior to + * it being committed to an lwb; this can happen if a thread calling + * zil_commit() is racing with spa_sync(). + */ +static void +zil_commit_waiter_skip(zil_commit_waiter_t *zcw) +{ + mutex_enter(&zcw->zcw_lock); + ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_done = B_TRUE; + cv_broadcast(&zcw->zcw_cv); + mutex_exit(&zcw->zcw_lock); +} + +/* + * This function is used when the given waiter is to be linked into an + * lwb's "lwb_waiters" list; i.e. when the itx is committed to the lwb. + * At this point, the waiter will no longer be referenced by the itx, + * and instead, will be referenced by the lwb. + */ +static void +zil_commit_waiter_link_lwb(zil_commit_waiter_t *zcw, lwb_t *lwb) +{ + /* + * The lwb_waiters field of the lwb is protected by the zilog's + * zl_lock, thus it must be held when calling this function. + */ + ASSERT(MUTEX_HELD(&lwb->lwb_zilog->zl_lock)); + + mutex_enter(&zcw->zcw_lock); + ASSERT(!list_link_active(&zcw->zcw_node)); + ASSERT3P(zcw->zcw_lwb, ==, NULL); + ASSERT3P(lwb, !=, NULL); + ASSERT(lwb->lwb_state == LWB_STATE_OPENED || + lwb->lwb_state == LWB_STATE_ISSUED || + lwb->lwb_state == LWB_STATE_WRITE_DONE); + + list_insert_tail(&lwb->lwb_waiters, zcw); + zcw->zcw_lwb = lwb; + mutex_exit(&zcw->zcw_lock); +} + +/* + * This function is used when zio_alloc_zil() fails to allocate a ZIL + * block, and the given waiter must be linked to the "nolwb waiters" + * list inside of zil_process_commit_list().
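 */

/*
 * Illustration only -- not part of this import.  The consumer side of
 * the waiter handshake above: a thread blocks on the waiter's condition
 * variable until a completion path (zil_lwb_flush_vdevs_done() or
 * zil_commit_waiter_skip()) sets zcw_done and broadcasts.  This is a
 * simplified sketch; the real logic, including timeouts, lives in
 * zil_commit_waiter().  The toy_* name is hypothetical.
 */
static void
toy_wait_for_commit(zil_commit_waiter_t *zcw)
{
	mutex_enter(&zcw->zcw_lock);
	while (!zcw->zcw_done)
		cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
	mutex_exit(&zcw->zcw_lock);
}

/* Illustration ends.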
+ */ +static void +zil_commit_waiter_link_nolwb(zil_commit_waiter_t *zcw, list_t *nolwb) +{ + mutex_enter(&zcw->zcw_lock); + ASSERT(!list_link_active(&zcw->zcw_node)); + ASSERT3P(zcw->zcw_lwb, ==, NULL); + list_insert_tail(nolwb, zcw); + mutex_exit(&zcw->zcw_lock); +} + +void +zil_lwb_add_block(lwb_t *lwb, const blkptr_t *bp) +{ + avl_tree_t *t = &lwb->lwb_vdev_tree; + avl_index_t where; + zil_vdev_node_t *zv, zvsearch; + int ndvas = BP_GET_NDVAS(bp); + int i; + + if (zil_nocacheflush) + return; + + mutex_enter(&lwb->lwb_vdev_lock); + for (i = 0; i < ndvas; i++) { + zvsearch.zv_vdev = DVA_GET_VDEV(&bp->blk_dva[i]); + if (avl_find(t, &zvsearch, &where) == NULL) { + zv = kmem_alloc(sizeof (*zv), KM_SLEEP); + zv->zv_vdev = zvsearch.zv_vdev; + avl_insert(t, zv, where); + } + } + mutex_exit(&lwb->lwb_vdev_lock); +} + +static void +zil_lwb_flush_defer(lwb_t *lwb, lwb_t *nlwb) +{ + avl_tree_t *src = &lwb->lwb_vdev_tree; + avl_tree_t *dst = &nlwb->lwb_vdev_tree; + void *cookie = NULL; + zil_vdev_node_t *zv; + + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_WRITE_DONE); + ASSERT3S(nlwb->lwb_state, !=, LWB_STATE_FLUSH_DONE); + + /* + * At this point in its lifetime, 'lwb' no longer needs the protection + * of lwb_vdev_lock for lwb_vdev_tree (it will only be modified + * while holding zilog->zl_lock), as its writes and those of its + * children have all completed. The younger 'nlwb' may be waiting on + * future writes to additional vdevs. + */ + mutex_enter(&nlwb->lwb_vdev_lock); + /* + * Tear down the 'lwb' vdev tree, ensuring that entries which do not + * exist in 'nlwb' are moved to it, freeing any would-be duplicates. + */ + while ((zv = avl_destroy_nodes(src, &cookie)) != NULL) { + avl_index_t where; + + if (avl_find(dst, zv, &where) == NULL) { + avl_insert(dst, zv, where); + } else { + kmem_free(zv, sizeof (*zv)); + } + } + mutex_exit(&nlwb->lwb_vdev_lock); +} + +void +zil_lwb_add_txg(lwb_t *lwb, uint64_t txg) +{ + lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); +} + +/* + * This function is called after all vdevs associated with a given lwb + * write have completed their DKIOCFLUSHWRITECACHE command; or as soon + * as the lwb write completes, if "zil_nocacheflush" is set. Further, + * all "previous" lwb's will have completed before this function is + * called; i.e. this function is called for all previous lwbs before + * it's called for "this" lwb (enforced via the zio dependencies + * configured in zil_lwb_set_zio_dependency()). + * + * The intention is for this function to be called as soon as the + * contents of an lwb are considered "stable" on disk, and will survive + * any sudden loss of power. At this point, any threads waiting for the + * lwb to reach this state are signalled, and the "waiter" structures + * are marked "done". + */ +static void +zil_lwb_flush_vdevs_done(zio_t *zio) +{ + lwb_t *lwb = zio->io_private; + zilog_t *zilog = lwb->lwb_zilog; + dmu_tx_t *tx = lwb->lwb_tx; + zil_commit_waiter_t *zcw; + itx_t *itx; + + spa_config_exit(zilog->zl_spa, SCL_STATE, lwb); + + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + + mutex_enter(&zilog->zl_lock); + + /* + * Ensure the lwb buffer pointer is cleared before releasing the + * txg. If we have had an allocation failure and the txg is + * waiting to sync then we want zil_sync() to remove the lwb so + * that it's not picked up as the next new one in + * zil_process_commit_list(). zil_sync() will only remove the + * lwb if lwb_buf is null.
+ */ + lwb->lwb_buf = NULL; + lwb->lwb_tx = NULL; + + ASSERT3U(lwb->lwb_issued_timestamp, >, 0); + zilog->zl_last_lwb_latency = gethrtime() - lwb->lwb_issued_timestamp; + + lwb->lwb_root_zio = NULL; + + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_WRITE_DONE); + lwb->lwb_state = LWB_STATE_FLUSH_DONE; + + if (zilog->zl_last_lwb_opened == lwb) { + /* + * Remember the highest committed log sequence number + * for ztest. We only update this value when all the log + * writes succeeded, because ztest wants to ASSERT that + * it got the whole log chain. + */ + zilog->zl_commit_lr_seq = zilog->zl_lr_seq; + } + + while ((itx = list_head(&lwb->lwb_itxs)) != NULL) { + list_remove(&lwb->lwb_itxs, itx); + zil_itx_destroy(itx); + } + + while ((zcw = list_head(&lwb->lwb_waiters)) != NULL) { + mutex_enter(&zcw->zcw_lock); + + ASSERT(list_link_active(&zcw->zcw_node)); + list_remove(&lwb->lwb_waiters, zcw); + + ASSERT3P(zcw->zcw_lwb, ==, lwb); + zcw->zcw_lwb = NULL; + + zcw->zcw_zio_error = zio->io_error; + + ASSERT3B(zcw->zcw_done, ==, B_FALSE); + zcw->zcw_done = B_TRUE; + cv_broadcast(&zcw->zcw_cv); + + mutex_exit(&zcw->zcw_lock); + } + + mutex_exit(&zilog->zl_lock); + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + dmu_tx_commit(tx); +} + +/* + * This is called when an lwb's write zio completes. The callback's + * purpose is to issue the DKIOCFLUSHWRITECACHE commands for the vdevs + * in the lwb's lwb_vdev_tree. The tree will contain the vdevs involved + * in writing out this specific lwb's data, and in the case that cache + * flushes have been deferred, vdevs involved in writing the data for + * previous lwbs. The writes corresponding to all the vdevs in the + * lwb_vdev_tree will have completed by the time this is called, due to + * the zio dependencies configured in zil_lwb_set_zio_dependency(), + * which takes deferred flushes into account. The lwb will be "done" + * once zil_lwb_flush_vdevs_done() is called, which occurs in the zio + * completion callback for the lwb's root zio. + */ +static void +zil_lwb_write_done(zio_t *zio) +{ + lwb_t *lwb = zio->io_private; + spa_t *spa = zio->io_spa; + zilog_t *zilog = lwb->lwb_zilog; + avl_tree_t *t = &lwb->lwb_vdev_tree; + void *cookie = NULL; + zil_vdev_node_t *zv; + lwb_t *nlwb; + + ASSERT3S(spa_config_held(spa, SCL_STATE, RW_READER), !=, 0); + + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); + ASSERT(!BP_IS_GANG(zio->io_bp)); + ASSERT(!BP_IS_HOLE(zio->io_bp)); + ASSERT(BP_GET_FILL(zio->io_bp) == 0); + + abd_put(zio->io_abd); + + mutex_enter(&zilog->zl_lock); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_ISSUED); + lwb->lwb_state = LWB_STATE_WRITE_DONE; + lwb->lwb_write_zio = NULL; + lwb->lwb_fastwrite = FALSE; + nlwb = list_next(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + + if (avl_numnodes(t) == 0) + return; + + /* + * If there was an IO error, we're not going to call zio_flush() + * on these vdevs, so we simply empty the tree and free the + * nodes. We avoid calling zio_flush() since there isn't any + * good reason for doing so, after the lwb block failed to be + * written out. 
+ */ + if (zio->io_error != 0) { + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(zv, sizeof (*zv)); + return; + } + + /* + * If this lwb does not have any threads waiting for it to + * complete, we want to defer issuing the DKIOCFLUSHWRITECACHE + * command to the vdevs written to by "this" lwb, and instead + * rely on the "next" lwb to handle the DKIOCFLUSHWRITECACHE + * command for those vdevs. Thus, we merge the vdev tree of + * "this" lwb with the vdev tree of the "next" lwb in the list, + * and assume the "next" lwb will handle flushing the vdevs (or + * deferring the flush(s) again). + * + * This is a useful performance optimization, especially for + * workloads with lots of async write activity and few sync + * write and/or fsync activity, as it has the potential to + * coalesce multiple flush commands to a vdev into one. + */ + if (list_head(&lwb->lwb_waiters) == NULL && nlwb != NULL) { + zil_lwb_flush_defer(lwb, nlwb); + ASSERT(avl_is_empty(&lwb->lwb_vdev_tree)); + return; + } + + while ((zv = avl_destroy_nodes(t, &cookie)) != NULL) { + vdev_t *vd = vdev_lookup_top(spa, zv->zv_vdev); + if (vd != NULL) + zio_flush(lwb->lwb_root_zio, vd); + kmem_free(zv, sizeof (*zv)); + } +} + +static void +zil_lwb_set_zio_dependency(zilog_t *zilog, lwb_t *lwb) +{ + lwb_t *last_lwb_opened = zilog->zl_last_lwb_opened; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT(MUTEX_HELD(&zilog->zl_lock)); + + /* + * The zilog's "zl_last_lwb_opened" field is used to build the + * lwb/zio dependency chain, which is used to preserve the + * ordering of lwb completions that is required by the semantics + * of the ZIL. Each new lwb zio becomes a parent of the + * "previous" lwb zio, such that the new lwb's zio cannot + * complete until the "previous" lwb's zio completes. + * + * This is required by the semantics of zil_commit(); the commit + * waiters attached to the lwbs will be woken in the lwb zio's + * completion callback, so this zio dependency graph ensures the + * waiters are woken in the correct order (the same order the + * lwbs were created). + */ + if (last_lwb_opened != NULL && + last_lwb_opened->lwb_state != LWB_STATE_FLUSH_DONE) { + ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || + last_lwb_opened->lwb_state == LWB_STATE_ISSUED || + last_lwb_opened->lwb_state == LWB_STATE_WRITE_DONE); + + ASSERT3P(last_lwb_opened->lwb_root_zio, !=, NULL); + zio_add_child(lwb->lwb_root_zio, + last_lwb_opened->lwb_root_zio); + + /* + * If the previous lwb's write hasn't already completed, + * we also want to order the completion of the lwb write + * zios (above, we only order the completion of the lwb + * root zios). This is required because of how we can + * defer the DKIOCFLUSHWRITECACHE commands for each lwb. + * + * When the DKIOCFLUSHWRITECACHE commands are deferred, + * the previous lwb will rely on this lwb to flush the + * vdevs written to by that previous lwb. Thus, we need + * to ensure this lwb doesn't issue the flush until + * after the previous lwb's write completes. We ensure + * this ordering by setting the zio parent/child + * relationship here. + * + * Without this relationship on the lwb's write zio, + * it's possible for this lwb's write to complete prior + * to the previous lwb's write completing; and thus, the + * vdevs for the previous lwb would be flushed prior to + * that lwb's data being written to those vdevs (the + * vdevs are flushed in the lwb write zio's completion + * handler, zil_lwb_write_done()). 
+ */ + if (last_lwb_opened->lwb_state != LWB_STATE_WRITE_DONE) { + ASSERT(last_lwb_opened->lwb_state == LWB_STATE_OPENED || + last_lwb_opened->lwb_state == LWB_STATE_ISSUED); + + ASSERT3P(last_lwb_opened->lwb_write_zio, !=, NULL); + zio_add_child(lwb->lwb_write_zio, + last_lwb_opened->lwb_write_zio); + } + } +} + + +/* + * This function's purpose is to "open" an lwb such that it is ready to + * accept new itxs being committed to it. To do this, the lwb's zio + * structures are created, and linked to the lwb. This function is + * idempotent; if the passed in lwb has already been opened, this + * function is essentially a no-op. + */ +static void +zil_lwb_write_open(zilog_t *zilog, lwb_t *lwb) +{ + zbookmark_phys_t zb; + zio_priority_t prio; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3P(lwb, !=, NULL); + EQUIV(lwb->lwb_root_zio == NULL, lwb->lwb_state == LWB_STATE_CLOSED); + EQUIV(lwb->lwb_root_zio != NULL, lwb->lwb_state == LWB_STATE_OPENED); + + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); + + /* Lock so zil_sync() doesn't fastwrite_unmark after zio is created */ + mutex_enter(&zilog->zl_lock); + if (lwb->lwb_root_zio == NULL) { + abd_t *lwb_abd = abd_get_from_buf(lwb->lwb_buf, + BP_GET_LSIZE(&lwb->lwb_blk)); + + if (!lwb->lwb_fastwrite) { + metaslab_fastwrite_mark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 1; + } + + if (!lwb->lwb_slog || zilog->zl_cur_used <= zil_slog_bulk) + prio = ZIO_PRIORITY_SYNC_WRITE; + else + prio = ZIO_PRIORITY_ASYNC_WRITE; + + lwb->lwb_root_zio = zio_root(zilog->zl_spa, + zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); + ASSERT3P(lwb->lwb_root_zio, !=, NULL); + + lwb->lwb_write_zio = zio_rewrite(lwb->lwb_root_zio, + zilog->zl_spa, 0, &lwb->lwb_blk, lwb_abd, + BP_GET_LSIZE(&lwb->lwb_blk), zil_lwb_write_done, lwb, + prio, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | + ZIO_FLAG_FASTWRITE, &zb); + ASSERT3P(lwb->lwb_write_zio, !=, NULL); + + lwb->lwb_state = LWB_STATE_OPENED; + + zil_lwb_set_zio_dependency(zilog, lwb); + zilog->zl_last_lwb_opened = lwb; + } + mutex_exit(&zilog->zl_lock); + + ASSERT3P(lwb->lwb_root_zio, !=, NULL); + ASSERT3P(lwb->lwb_write_zio, !=, NULL); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); +} + +/* + * Define a limited set of intent log block sizes. + * + * These must be a multiple of 4KB. Note only the amount used (again + * aligned to 4KB) actually gets written. However, we can't always just + * allocate SPA_OLD_MAXBLOCKSIZE as the slog space could be exhausted. + */ +struct { + uint64_t limit; + uint64_t blksz; +} zil_block_buckets[] = { + { 4096, 4096 }, /* non TX_WRITE */ + { 8192 + 4096, 8192 + 4096 }, /* database */ + { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ + { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ + { 131072, 131072 }, /* < 128KB writes */ + { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ + { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ +}; + +/* + * Maximum block size used by the ZIL. This is picked up when the ZIL is + * initialized. Otherwise this should not be used directly; see + * zl_max_block_size instead. + */ +int zil_maxblocksize = SPA_OLD_MAXBLOCKSIZE; + +/* + * Start a log block write and advance to the next log block. + * Calls are serialized. 
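 */

/*
 * Illustration only -- not part of this import.  How the bucket table
 * above is consulted when sizing the next log block (see the body of
 * zil_lwb_write_issue() below): pick the smallest bucket whose limit
 * covers the bytes needed (the bytes used in the last block plus the
 * zil_chain_t header), capped by the pool's maximum block size.  The
 * real code additionally smooths the result against recently used
 * sizes (zl_prev_blks).  The toy_* name is hypothetical.
 */
static uint64_t
toy_pick_zil_blksz(uint64_t needed, uint64_t max_block_size)
{
	int i;

	for (i = 0; needed > zil_block_buckets[i].limit; i++)
		continue;	/* terminates: the last limit is UINT64_MAX */

	return (MIN(zil_block_buckets[i].blksz, max_block_size));
}

/* Illustration ends.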
+ */ +static lwb_t * +zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) +{ + lwb_t *nlwb = NULL; + zil_chain_t *zilc; + spa_t *spa = zilog->zl_spa; + blkptr_t *bp; + dmu_tx_t *tx; + uint64_t txg; + uint64_t zil_blksz, wsz; + int i, error; + boolean_t slog; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3P(lwb->lwb_root_zio, !=, NULL); + ASSERT3P(lwb->lwb_write_zio, !=, NULL); + ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED); + + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + zilc = (zil_chain_t *)lwb->lwb_buf; + bp = &zilc->zc_next_blk; + } else { + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; + } + + ASSERT(lwb->lwb_nused <= lwb->lwb_sz); + + /* + * Allocate the next block and save its address in this block + * before writing it in order to establish the log chain. + * Note that if the allocation of nlwb synced before we wrote + * the block that points at it (lwb), we'd leak it if we crashed. + * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). + * We dirty the dataset to ensure that zil_sync() will be called + * to clean up in the event of allocation failure or I/O failure. + */ + + tx = dmu_tx_create(zilog->zl_os); + + /* + * Since we are not going to create any new dirty data, and we + * can even help with clearing the existing dirty data, we + * should not be subject to the dirty data based delays. We + * use TXG_NOTHROTTLE to bypass the delay mechanism. + */ + VERIFY0(dmu_tx_assign(tx, TXG_WAIT | TXG_NOTHROTTLE)); + + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + lwb->lwb_tx = tx; + + /* + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on size used in the last block. + * - first find the smallest bucket that will fit the block from a + * limited set of block sizes. This is because it's faster to write + * blocks allocated from the same metaslab as they are adjacent or + * close. + * - next find the maximum from the new suggested size and an array of + * previous sizes. This lessens a picket fence effect of wrongly + * guessing the size if we have a stream of say 2k, 64k, 2k, 64k + * requests. + * + * Note we only write what is used, but we can't just allocate + * the maximum block size because we can exhaust the available + * pool log space. + */ + zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + for (i = 0; zil_blksz > zil_block_buckets[i].limit; i++) + continue; + zil_blksz = MIN(zil_block_buckets[i].blksz, zilog->zl_max_block_size); + zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; + for (i = 0; i < ZIL_PREV_BLKS; i++) + zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); + zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); + + BP_ZERO(bp); + error = zio_alloc_zil(spa, zilog->zl_os, txg, bp, zil_blksz, &slog); + if (slog) { + ZIL_STAT_BUMP(zil_itx_metaslab_slog_count); + ZIL_STAT_INCR(zil_itx_metaslab_slog_bytes, lwb->lwb_nused); + } else { + ZIL_STAT_BUMP(zil_itx_metaslab_normal_count); + ZIL_STAT_INCR(zil_itx_metaslab_normal_bytes, lwb->lwb_nused); + } + if (error == 0) { + ASSERT3U(bp->blk_birth, ==, txg); + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + + /* + * Allocate a new log write block (lwb). + */ + nlwb = zil_alloc_lwb(zilog, bp, slog, txg, TRUE); + } + + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + /* For Slim ZIL only write what is used. 
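 * The write zio is shrunk to the used size rounded up to ZIL_MIN_BLKSZ;
	 * the unused tail is zeroed below before the block is issued.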
*/ + wsz = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(wsz, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_write_zio, wsz); + + } else { + wsz = lwb->lwb_sz; + } + + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; + + /* + * clear unused data for security + */ + bzero(lwb->lwb_buf + lwb->lwb_nused, wsz - lwb->lwb_nused); + + spa_config_enter(zilog->zl_spa, SCL_STATE, lwb, RW_READER); + + zil_lwb_add_block(lwb, &lwb->lwb_blk); + lwb->lwb_issued_timestamp = gethrtime(); + lwb->lwb_state = LWB_STATE_ISSUED; + + zio_nowait(lwb->lwb_root_zio); + zio_nowait(lwb->lwb_write_zio); + + /* + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). + */ + return (nlwb); +} + +/* + * Maximum amount of write data that can be put into single log block. + */ +uint64_t +zil_max_log_data(zilog_t *zilog) +{ + return (zilog->zl_max_block_size - + sizeof (zil_chain_t) - sizeof (lr_write_t)); +} + +/* + * Maximum amount of log space we agree to waste to reduce number of + * WR_NEED_COPY chunks to reduce zl_get_data() overhead (~12%). + */ +static inline uint64_t +zil_max_waste_space(zilog_t *zilog) +{ + return (zil_max_log_data(zilog) / 8); +} + +/* + * Maximum amount of write data for WR_COPIED. For correctness, consumers + * must fall back to WR_NEED_COPY if we can't fit the entire record into one + * maximum sized log block, because each WR_COPIED record must fit in a + * single log block. For space efficiency, we want to fit two records into a + * max-sized log block. + */ +uint64_t +zil_max_copied_data(zilog_t *zilog) +{ + return ((zilog->zl_max_block_size - sizeof (zil_chain_t)) / 2 - + sizeof (lr_write_t)); +} + +static lwb_t * +zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) +{ + lr_t *lrcb, *lrc; + lr_write_t *lrwb, *lrw; + char *lr_buf; + uint64_t dlen, dnow, lwb_sp, reclen, txg, max_log_data; + + ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock)); + ASSERT3P(lwb, !=, NULL); + ASSERT3P(lwb->lwb_buf, !=, NULL); + + zil_lwb_write_open(zilog, lwb); + + lrc = &itx->itx_lr; + lrw = (lr_write_t *)lrc; + + /* + * A commit itx doesn't represent any on-disk state; instead + * it's simply used as a place holder on the commit list, and + * provides a mechanism for attaching a "commit waiter" onto the + * correct lwb (such that the waiter can be signalled upon + * completion of that lwb). Thus, we don't process this itx's + * log record if it's a commit itx (these itx's don't have log + * records), and instead link the itx's waiter onto the lwb's + * list of waiters. + * + * For more details, see the comment above zil_commit(). + */ + if (lrc->lrc_txtype == TX_COMMIT) { + mutex_enter(&zilog->zl_lock); + zil_commit_waiter_link_lwb(itx->itx_private, lwb); + itx->itx_private = NULL; + mutex_exit(&zilog->zl_lock); + return (lwb); + } + + if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + dlen = P2ROUNDUP_TYPED( + lrw->lr_length, sizeof (uint64_t), uint64_t); + } else { + dlen = 0; + } + reclen = lrc->lrc_reclen; + zilog->zl_cur_used += (reclen + dlen); + txg = lrc->lrc_txg; + + ASSERT3U(zilog->zl_cur_used, <, UINT64_MAX - (reclen + dlen)); + +cont: + /* + * If this record won't fit in the current log block, start a new one. + * For WR_NEED_COPY optimize layout for minimal number of chunks. 
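 * A new block is started when the record itself no longer fits, or when
	 * the record plus its data spill over while the remaining space is small
	 * enough to waste (see zil_max_waste_space()) and filling it here would
	 * not reduce the total number of chunks.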
+ */ + lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + max_log_data = zil_max_log_data(zilog); + if (reclen > lwb_sp || (reclen + dlen > lwb_sp && + lwb_sp < zil_max_waste_space(zilog) && + (dlen % max_log_data == 0 || + lwb_sp < reclen + dlen % max_log_data))) { + lwb = zil_lwb_write_issue(zilog, lwb); + if (lwb == NULL) + return (NULL); + zil_lwb_write_open(zilog, lwb); + ASSERT(LWB_EMPTY(lwb)); + lwb_sp = lwb->lwb_sz - lwb->lwb_nused; + + /* + * There must be enough space in the new, empty log block to + * hold reclen. For WR_COPIED, we need to fit the whole + * record in one block, and reclen is the header size + the + * data size. For WR_NEED_COPY, we can create multiple + * records, splitting the data into multiple blocks, so we + * only need to fit one word of data per block; in this case + * reclen is just the header size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + } + + dnow = MIN(dlen, lwb_sp - reclen); + lr_buf = lwb->lwb_buf + lwb->lwb_nused; + bcopy(lrc, lr_buf, reclen); + lrcb = (lr_t *)lr_buf; /* Like lrc, but inside lwb. */ + lrwb = (lr_write_t *)lrcb; /* Like lrw, but inside lwb. */ + + ZIL_STAT_BUMP(zil_itx_count); + + /* + * If it's a write, fetch the data or get its blkptr as appropriate. + */ + if (lrc->lrc_txtype == TX_WRITE) { + if (txg > spa_freeze_txg(zilog->zl_spa)) + txg_wait_synced(zilog->zl_dmu_pool, txg); + if (itx->itx_wr_state == WR_COPIED) { + ZIL_STAT_BUMP(zil_itx_copied_count); + ZIL_STAT_INCR(zil_itx_copied_bytes, lrw->lr_length); + } else { + char *dbuf; + int error; + + if (itx->itx_wr_state == WR_NEED_COPY) { + dbuf = lr_buf + reclen; + lrcb->lrc_reclen += dnow; + if (lrwb->lr_length > dnow) + lrwb->lr_length = dnow; + lrw->lr_offset += dnow; + lrw->lr_length -= dnow; + ZIL_STAT_BUMP(zil_itx_needcopy_count); + ZIL_STAT_INCR(zil_itx_needcopy_bytes, dnow); + } else { + ASSERT3S(itx->itx_wr_state, ==, WR_INDIRECT); + dbuf = NULL; + ZIL_STAT_BUMP(zil_itx_indirect_count); + ZIL_STAT_INCR(zil_itx_indirect_bytes, + lrw->lr_length); + } + + /* + * We pass in the "lwb_write_zio" rather than + * "lwb_root_zio" so that the "lwb_write_zio" + * becomes the parent of any zio's created by + * the "zl_get_data" callback. The vdevs are + * flushed after the "lwb_write_zio" completes, + * so we want to make sure that completion + * callback waits for these additional zio's, + * such that the vdevs used by those zio's will + * be included in the lwb's vdev tree, and those + * vdevs will be properly flushed. If we passed + * in "lwb_root_zio" here, then these additional + * vdevs may not be flushed; e.g. if these zio's + * completed after "lwb_write_zio" completed. + */ + error = zilog->zl_get_data(itx->itx_private, + lrwb, dbuf, lwb, lwb->lwb_write_zio); + + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } + if (error != 0) { + ASSERT(error == ENOENT || error == EEXIST || + error == EALREADY); + return (lwb); + } + } + } + + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. 
+ */ + lrcb->lrc_seq = ++zilog->zl_lr_seq; + lwb->lwb_nused += reclen + dnow; + + zil_lwb_add_txg(lwb, txg); + + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); + ASSERT0(P2PHASE(lwb->lwb_nused, sizeof (uint64_t))); + + dlen -= dnow; + if (dlen > 0) { + zilog->zl_cur_used += reclen; + goto cont; + } + + return (lwb); +} + +itx_t * +zil_itx_create(uint64_t txtype, size_t lrsize) +{ + size_t itxsize; + itx_t *itx; + + lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t); + itxsize = offsetof(itx_t, itx_lr) + lrsize; + + itx = zio_data_buf_alloc(itxsize); + itx->itx_lr.lrc_txtype = txtype; + itx->itx_lr.lrc_reclen = lrsize; + itx->itx_lr.lrc_seq = 0; /* defensive */ + itx->itx_sync = B_TRUE; /* default is synchronous */ + itx->itx_callback = NULL; + itx->itx_callback_data = NULL; + itx->itx_size = itxsize; + + return (itx); +} + +void +zil_itx_destroy(itx_t *itx) +{ + IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); + IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT); + + if (itx->itx_callback != NULL) + itx->itx_callback(itx->itx_callback_data); + + zio_data_buf_free(itx, itx->itx_size); +} + +/* + * Free up the sync and async itxs. The itxs_t has already been detached + * so no locks are needed. + */ +static void +zil_itxg_clean(itxs_t *itxs) +{ + itx_t *itx; + list_t *list; + avl_tree_t *t; + void *cookie; + itx_async_node_t *ian; + + list = &itxs->i_sync_list; + while ((itx = list_head(list)) != NULL) { + /* + * In the general case, commit itxs will not be found + * here, as they'll be committed to an lwb via + * zil_lwb_commit(), and free'd in that function. Having + * said that, it is still possible for commit itxs to be + * found here, due to the following race: + * + * - a thread calls zil_commit() which assigns the + * commit itx to a per-txg i_sync_list + * - zil_itxg_clean() is called (e.g. via spa_sync()) + * while the waiter is still on the i_sync_list + * + * There's nothing to prevent syncing the txg while the + * waiter is on the i_sync_list. This normally doesn't + * happen because spa_sync() is slower than zil_commit(), + * but if zil_commit() calls txg_wait_synced() (e.g. + * because zil_create() or zil_commit_writer_stall() is + * called) we will hit this case. + */ + if (itx->itx_lr.lrc_txtype == TX_COMMIT) + zil_commit_waiter_skip(itx->itx_private); + + list_remove(list, itx); + zil_itx_destroy(itx); + } + + cookie = NULL; + t = &itxs->i_async_tree; + while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) { + list = &ian->ia_list; + while ((itx = list_head(list)) != NULL) { + list_remove(list, itx); + /* commit itxs should never be on the async lists. */ + ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); + zil_itx_destroy(itx); + } + list_destroy(list); + kmem_free(ian, sizeof (itx_async_node_t)); + } + avl_destroy(t); + + kmem_free(itxs, sizeof (itxs_t)); +} + +static int +zil_aitx_compare(const void *x1, const void *x2) +{ + const uint64_t o1 = ((itx_async_node_t *)x1)->ia_foid; + const uint64_t o2 = ((itx_async_node_t *)x2)->ia_foid; + + return (TREE_CMP(o1, o2)); +} + +/* + * Remove all async itx with the given oid. 
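 */

/*
 * Illustration only -- not part of this import.  The sizing rule used by
 * zil_itx_create() above: the log record is embedded at the tail of the
 * itx, so the allocation is the itx header plus the record length
 * rounded up to 8 bytes.  The toy_* name is hypothetical.
 */
static size_t
toy_itx_alloc_size(size_t lrsize)
{
	/* keep records 8-byte aligned, as zil_itx_create() does */
	lrsize = P2ROUNDUP_TYPED(lrsize, sizeof (uint64_t), size_t);
	return (offsetof(itx_t, itx_lr) + lrsize);
}

/* Illustration ends.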
+ */ +void +zil_remove_async(zilog_t *zilog, uint64_t oid) +{ + uint64_t otxg, txg; + itx_async_node_t *ian; + avl_tree_t *t; + avl_index_t where; + list_t clean_list; + itx_t *itx; + + ASSERT(oid != 0); + list_create(&clean_list, sizeof (itx_t), offsetof(itx_t, itx_node)); + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */ + otxg = ZILTEST_TXG; + else + otxg = spa_last_synced_txg(zilog->zl_spa) + 1; + + for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) { + itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK]; + + mutex_enter(&itxg->itxg_lock); + if (itxg->itxg_txg != txg) { + mutex_exit(&itxg->itxg_lock); + continue; + } + + /* + * Locate the object node and append its list. + */ + t = &itxg->itxg_itxs->i_async_tree; + ian = avl_find(t, &oid, &where); + if (ian != NULL) + list_move_tail(&clean_list, &ian->ia_list); + mutex_exit(&itxg->itxg_lock); + } + while ((itx = list_head(&clean_list)) != NULL) { + list_remove(&clean_list, itx); + /* commit itxs should never be on the async lists. */ + ASSERT3U(itx->itx_lr.lrc_txtype, !=, TX_COMMIT); + zil_itx_destroy(itx); + } + list_destroy(&clean_list); +} + +void +zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) +{ + uint64_t txg; + itxg_t *itxg; + itxs_t *itxs, *clean = NULL; + + /* + * Ensure the data of a renamed file is committed before the rename. + */ + if ((itx->itx_lr.lrc_txtype & ~TX_CI) == TX_RENAME) + zil_async_to_sync(zilog, itx->itx_oid); + + if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) + txg = ZILTEST_TXG; + else + txg = dmu_tx_get_txg(tx); + + itxg = &zilog->zl_itxg[txg & TXG_MASK]; + mutex_enter(&itxg->itxg_lock); + itxs = itxg->itxg_itxs; + if (itxg->itxg_txg != txg) { + if (itxs != NULL) { + /* + * The zil_clean callback hasn't got around to cleaning + * this itxg. Save the itxs for release below. + * This should be rare. + */ + zfs_dbgmsg("zil_itx_assign: missed itx cleanup for " + "txg %llu", itxg->itxg_txg); + clean = itxg->itxg_itxs; + } + itxg->itxg_txg = txg; + itxs = itxg->itxg_itxs = kmem_zalloc(sizeof (itxs_t), + KM_SLEEP); + + list_create(&itxs->i_sync_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + avl_create(&itxs->i_async_tree, zil_aitx_compare, + sizeof (itx_async_node_t), + offsetof(itx_async_node_t, ia_node)); + } + if (itx->itx_sync) { + list_insert_tail(&itxs->i_sync_list, itx); + } else { + avl_tree_t *t = &itxs->i_async_tree; + uint64_t foid = + LR_FOID_GET_OBJ(((lr_ooo_t *)&itx->itx_lr)->lr_foid); + itx_async_node_t *ian; + avl_index_t where; + + ian = avl_find(t, &foid, &where); + if (ian == NULL) { + ian = kmem_alloc(sizeof (itx_async_node_t), + KM_SLEEP); + list_create(&ian->ia_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + ian->ia_foid = foid; + avl_insert(t, ian, where); + } + list_insert_tail(&ian->ia_list, itx); + } + + itx->itx_lr.lrc_txg = dmu_tx_get_txg(tx); + + /* + * We don't want to dirty the ZIL using ZILTEST_TXG, because + * zil_clean() will never be called using ZILTEST_TXG. Thus, we + * need to be careful to always dirty the ZIL using the "real" + * TXG (not itxg_txg) even when the SPA is frozen. + */ + zilog_dirty(zilog, dmu_tx_get_txg(tx)); + mutex_exit(&itxg->itxg_lock); + + /* Release the old itxs now we've dropped the lock */ + if (clean != NULL) + zil_itxg_clean(clean); +} + +/* + * If there are any in-memory intent log transactions which have now been + * synced then start up a taskq to free them. We should only do this after we + * have written out the uberblocks (i.e. 
txg has been committed) so that we
+ * don't inadvertently clean out in-memory log records that would be required
+ * by zil_commit().
+ */
+void
+zil_clean(zilog_t *zilog, uint64_t synced_txg)
+{
+	itxg_t *itxg = &zilog->zl_itxg[synced_txg & TXG_MASK];
+	itxs_t *clean_me;
+
+	ASSERT3U(synced_txg, <, ZILTEST_TXG);
+
+	mutex_enter(&itxg->itxg_lock);
+	if (itxg->itxg_itxs == NULL || itxg->itxg_txg == ZILTEST_TXG) {
+		mutex_exit(&itxg->itxg_lock);
+		return;
+	}
+	ASSERT3U(itxg->itxg_txg, <=, synced_txg);
+	ASSERT3U(itxg->itxg_txg, !=, 0);
+	clean_me = itxg->itxg_itxs;
+	itxg->itxg_itxs = NULL;
+	itxg->itxg_txg = 0;
+	mutex_exit(&itxg->itxg_lock);
+	/*
+	 * Preferably start a task queue to free up the old itxs but
+	 * if taskq_dispatch can't allocate resources to do that then
+	 * free them in-line. This should be rare. Note, using TQ_SLEEP
+	 * created a bad performance problem.
+	 */
+	ASSERT3P(zilog->zl_dmu_pool, !=, NULL);
+	ASSERT3P(zilog->zl_dmu_pool->dp_zil_clean_taskq, !=, NULL);
+	taskqid_t id = taskq_dispatch(zilog->zl_dmu_pool->dp_zil_clean_taskq,
+	    (void (*)(void *))zil_itxg_clean, clean_me, TQ_NOSLEEP);
+	if (id == TASKQID_INVALID)
+		zil_itxg_clean(clean_me);
+}
+
+/*
+ * This function will traverse the queue of itxs that need to be
+ * committed, and move them onto the ZIL's zl_itx_commit_list.
+ */
+static void
+zil_get_commit_list(zilog_t *zilog)
+{
+	uint64_t otxg, txg;
+	list_t *commit_list = &zilog->zl_itx_commit_list;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+		otxg = ZILTEST_TXG;
+	else
+		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing. That's okay since we'll
+	 * only commit things in the future.
+	 */
+	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+		mutex_enter(&itxg->itxg_lock);
+		if (itxg->itxg_txg != txg) {
+			mutex_exit(&itxg->itxg_lock);
+			continue;
+		}
+
+		/*
+		 * If we're adding itx records to the zl_itx_commit_list,
+		 * then the zil better be dirty in this "txg". We can assert
+		 * that here since we're holding the itxg_lock which will
+		 * prevent spa_sync from cleaning it. Once we add the itxs
+		 * to the zl_itx_commit_list we must commit it to disk even
+		 * if it's unnecessary (i.e. the txg was synced).
+		 */
+		ASSERT(zilog_is_dirty_in_txg(zilog, txg) ||
+		    spa_freeze_txg(zilog->zl_spa) != UINT64_MAX);
+		list_move_tail(commit_list, &itxg->itxg_itxs->i_sync_list);
+
+		mutex_exit(&itxg->itxg_lock);
+	}
+}
+
+/*
+ * Move the async itxs for a specified object to commit into sync lists.
+ */
+void
+zil_async_to_sync(zilog_t *zilog, uint64_t foid)
+{
+	uint64_t otxg, txg;
+	itx_async_node_t *ian;
+	avl_tree_t *t;
+	avl_index_t where;
+
+	if (spa_freeze_txg(zilog->zl_spa) != UINT64_MAX) /* ziltest support */
+		otxg = ZILTEST_TXG;
+	else
+		otxg = spa_last_synced_txg(zilog->zl_spa) + 1;
+
+	/*
+	 * This is inherently racy, since there is nothing to prevent
+	 * the last synced txg from changing.
+	 */
+	for (txg = otxg; txg < (otxg + TXG_CONCURRENT_STATES); txg++) {
+		itxg_t *itxg = &zilog->zl_itxg[txg & TXG_MASK];
+
+		mutex_enter(&itxg->itxg_lock);
+		if (itxg->itxg_txg != txg) {
+			mutex_exit(&itxg->itxg_lock);
+			continue;
+		}
+
+		/*
+		 * If a foid is specified then find that node and append its
+		 * list. Otherwise walk the tree appending all the lists
+		 * to the sync list.
+		 * We add to the end rather than the beginning to
+		 * ensure the create has happened.
+		 */
+		t = &itxg->itxg_itxs->i_async_tree;
+		if (foid != 0) {
+			ian = avl_find(t, &foid, &where);
+			if (ian != NULL) {
+				list_move_tail(&itxg->itxg_itxs->i_sync_list,
+				    &ian->ia_list);
+			}
+		} else {
+			void *cookie = NULL;
+
+			while ((ian = avl_destroy_nodes(t, &cookie)) != NULL) {
+				list_move_tail(&itxg->itxg_itxs->i_sync_list,
+				    &ian->ia_list);
+				list_destroy(&ian->ia_list);
+				kmem_free(ian, sizeof (itx_async_node_t));
+			}
+		}
+		mutex_exit(&itxg->itxg_lock);
+	}
+}
+
+/*
+ * This function will prune commit itxs that are at the head of the
+ * commit list (it won't prune past the first non-commit itx), and
+ * either: a) attach them to the last lwb that's still pending
+ * completion, or b) skip them altogether.
+ *
+ * This is used as a performance optimization to prevent commit itxs
+ * from generating new lwbs when it's unnecessary to do so.
+ */
+static void
+zil_prune_commit_list(zilog_t *zilog)
+{
+	itx_t *itx;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+		lr_t *lrc = &itx->itx_lr;
+		if (lrc->lrc_txtype != TX_COMMIT)
+			break;
+
+		mutex_enter(&zilog->zl_lock);
+
+		lwb_t *last_lwb = zilog->zl_last_lwb_opened;
+		if (last_lwb == NULL ||
+		    last_lwb->lwb_state == LWB_STATE_FLUSH_DONE) {
+			/*
+			 * All of the itxs this waiter was waiting on
+			 * must have already completed (or there were
+			 * never any itx's for it to wait on), so it's
+			 * safe to skip this waiter and mark it done.
+			 */
+			zil_commit_waiter_skip(itx->itx_private);
+		} else {
+			zil_commit_waiter_link_lwb(itx->itx_private, last_lwb);
+			itx->itx_private = NULL;
+		}
+
+		mutex_exit(&zilog->zl_lock);
+
+		list_remove(&zilog->zl_itx_commit_list, itx);
+		zil_itx_destroy(itx);
+	}
+
+	IMPLY(itx != NULL, itx->itx_lr.lrc_txtype != TX_COMMIT);
+}
+
+static void
+zil_commit_writer_stall(zilog_t *zilog)
+{
+	/*
+	 * When zio_alloc_zil() fails to allocate the next lwb block on
+	 * disk, we must call txg_wait_synced() to ensure all of the
+	 * lwbs in the zilog's zl_lwb_list are synced and then freed (in
+	 * zil_sync()), such that any subsequent ZIL writer (i.e. a call
+	 * to zil_process_commit_list()) will have to call zil_create(),
+	 * and start a new ZIL chain.
+	 *
+	 * Since zio_alloc_zil() failed, the lwb that was previously
+	 * issued does not have a pointer to the "next" lwb on disk.
+	 * Thus, if another ZIL writer thread were to allocate the "next"
+	 * on-disk lwb, that block could be leaked in the event of a
+	 * crash (because the previous lwb on-disk would not point to
+	 * it).
+	 *
+	 * We must hold the zilog's zl_issuer_lock while we do this, to
+	 * ensure no new threads enter zil_process_commit_list() until
+	 * all lwb's in the zl_lwb_list have been synced and freed
+	 * (which is achieved via the txg_wait_synced() call).
+	 */
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+	txg_wait_synced(zilog->zl_dmu_pool, 0);
+	ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL);
+}
+
+/*
+ * This function will traverse the commit list, creating new lwbs as
+ * needed, and committing the itxs from the commit list to these newly
+ * created lwbs. Additionally, as a new lwb is created, the previous
+ * lwb will be issued to the zio layer to be written to disk.
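+ *
+ * If a new lwb cannot be allocated while walking the list, the
+ * remaining itxs are collected on a local "nolwb" list, any "nolwb"
+ * commit waiters are skipped, and the ZIL write pipeline is stalled
+ * via zil_commit_writer_stall(); see the function body for details.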
+ */
+static void
+zil_process_commit_list(zilog_t *zilog)
+{
+	spa_t *spa = zilog->zl_spa;
+	list_t nolwb_itxs;
+	list_t nolwb_waiters;
+	lwb_t *lwb;
+	itx_t *itx;
+
+	ASSERT(MUTEX_HELD(&zilog->zl_issuer_lock));
+
+	/*
+	 * Return if there's nothing to commit before we dirty the fs by
+	 * calling zil_create().
+	 */
+	if (list_head(&zilog->zl_itx_commit_list) == NULL)
+		return;
+
+	list_create(&nolwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node));
+	list_create(&nolwb_waiters, sizeof (zil_commit_waiter_t),
+	    offsetof(zil_commit_waiter_t, zcw_node));
+
+	lwb = list_tail(&zilog->zl_lwb_list);
+	if (lwb == NULL) {
+		lwb = zil_create(zilog);
+	} else {
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+	}
+
+	while ((itx = list_head(&zilog->zl_itx_commit_list)) != NULL) {
+		lr_t *lrc = &itx->itx_lr;
+		uint64_t txg = lrc->lrc_txg;
+
+		ASSERT3U(txg, !=, 0);
+
+		if (lrc->lrc_txtype == TX_COMMIT) {
+			DTRACE_PROBE2(zil__process__commit__itx,
+			    zilog_t *, zilog, itx_t *, itx);
+		} else {
+			DTRACE_PROBE2(zil__process__normal__itx,
+			    zilog_t *, zilog, itx_t *, itx);
+		}
+
+		list_remove(&zilog->zl_itx_commit_list, itx);
+
+		boolean_t synced = txg <= spa_last_synced_txg(spa);
+		boolean_t frozen = txg > spa_freeze_txg(spa);
+
+		/*
+		 * If the txg of this itx has already been synced out, then
+		 * we don't need to commit this itx to an lwb. This is
+		 * because the data of this itx will have already been
+		 * written to the main pool. This is inherently racy, and
+		 * it's still ok to commit an itx whose txg has already
+		 * been synced; this will result in a write that's
+		 * unnecessary, but will do no harm.
+		 *
+		 * With that said, we always want to commit TX_COMMIT itxs
+		 * to an lwb, regardless of whether or not that itx's txg
+		 * has been synced out. We do this to ensure any OPENED lwb
+		 * will always have at least one zil_commit_waiter_t linked
+		 * to the lwb.
+		 *
+		 * As a counter-example, if we skipped TX_COMMIT itx's
+		 * whose txg had already been synced, the following
+		 * situation could occur if we happened to be racing with
+		 * spa_sync:
+		 *
+		 * 1. We commit a non-TX_COMMIT itx to an lwb, where the
+		 *    itx's txg is 10 and the last synced txg is 9.
+		 * 2. spa_sync finishes syncing out txg 10.
+		 * 3. We move to the next itx in the list, it's a TX_COMMIT
+		 *    whose txg is 10, so we skip it rather than committing
+		 *    it to the lwb used in (1).
+		 *
+		 * If the itx that is skipped in (3) is the last TX_COMMIT
+		 * itx in the commit list, then it's possible for the lwb
+		 * used in (1) to remain in the OPENED state indefinitely.
+		 *
+		 * To prevent the above scenario from occurring, ensuring
+		 * that once an lwb is OPENED it will transition to ISSUED
+		 * and eventually DONE, we always commit TX_COMMIT itx's to
+		 * an lwb here, even if that itx's txg has already been
+		 * synced.
+		 *
+		 * Finally, if the pool is frozen, we _always_ commit the
+		 * itx. The point of freezing the pool is to prevent data
+		 * from being written to the main pool via spa_sync, and
+		 * instead rely solely on the ZIL to persistently store the
+		 * data; i.e. when the pool is frozen, the last synced txg
+		 * value can't be trusted.
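+		 *
+		 * Putting the above together: the test below commits the
+		 * itx to an lwb when (frozen || !synced || the itx is a
+		 * TX_COMMIT), and otherwise simply destroys it, since its
+		 * txg has already been synced out.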
+		 */
+		if (frozen || !synced || lrc->lrc_txtype == TX_COMMIT) {
+			if (lwb != NULL) {
+				lwb = zil_lwb_commit(zilog, itx, lwb);
+
+				if (lwb == NULL)
+					list_insert_tail(&nolwb_itxs, itx);
+				else
+					list_insert_tail(&lwb->lwb_itxs, itx);
+			} else {
+				if (lrc->lrc_txtype == TX_COMMIT) {
+					zil_commit_waiter_link_nolwb(
+					    itx->itx_private, &nolwb_waiters);
+				}
+
+				list_insert_tail(&nolwb_itxs, itx);
+			}
+		} else {
+			ASSERT3S(lrc->lrc_txtype, !=, TX_COMMIT);
+			zil_itx_destroy(itx);
+		}
+	}
+
+	if (lwb == NULL) {
+		/*
+		 * This indicates zio_alloc_zil() failed to allocate the
+		 * "next" lwb on-disk. When this happens, we must stall
+		 * the ZIL write pipeline; see the comment within
+		 * zil_commit_writer_stall() for more details.
+		 */
+		zil_commit_writer_stall(zilog);
+
+		/*
+		 * Additionally, we have to signal and mark the "nolwb"
+		 * waiters as "done" here, since without an lwb, we
+		 * can't do this via zil_lwb_flush_vdevs_done() like
+		 * normal.
+		 */
+		zil_commit_waiter_t *zcw;
+		while ((zcw = list_head(&nolwb_waiters)) != NULL) {
+			zil_commit_waiter_skip(zcw);
+			list_remove(&nolwb_waiters, zcw);
+		}
+
+		/*
+		 * And finally, we have to destroy the itx's that
+		 * couldn't be committed to an lwb; this will also call
+		 * the itx's callback if one exists for the itx.
+		 */
+		while ((itx = list_head(&nolwb_itxs)) != NULL) {
+			list_remove(&nolwb_itxs, itx);
+			zil_itx_destroy(itx);
+		}
+	} else {
+		ASSERT(list_is_empty(&nolwb_waiters));
+		ASSERT3P(lwb, !=, NULL);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_WRITE_DONE);
+		ASSERT3S(lwb->lwb_state, !=, LWB_STATE_FLUSH_DONE);
+
+		/*
+		 * At this point, the ZIL block pointed at by the "lwb"
+		 * variable is in one of the following states: "closed"
+		 * or "open".
+		 *
+		 * If it's "closed", then no itxs have been committed to
+		 * it, so there's no point in issuing its zio (i.e. it's
+		 * "empty").
+		 *
+		 * If it's "open", then it contains one or more itxs that
+		 * eventually need to be committed to stable storage. In
+		 * this case we intentionally do not issue the lwb's zio
+		 * to disk yet, and instead rely on one of the following
+		 * two mechanisms for issuing the zio:
+		 *
+		 * 1. Ideally, there will be more ZIL activity occurring
+		 *    on the system, such that this function will be
+		 *    immediately called again (not necessarily by the same
+		 *    thread) and this lwb's zio will be issued via
+		 *    zil_lwb_commit(). This way, the lwb is guaranteed to
+		 *    be "full" when it is issued to disk, and we'll make
+		 *    the best use of the lwb's size that we can.
+		 *
+		 * 2. If there isn't sufficient ZIL activity occurring on
+		 *    the system, such that this lwb's zio isn't issued via
+		 *    zil_lwb_commit(), zil_commit_waiter() will issue the
+		 *    lwb's zio. If this occurs, the lwb is not guaranteed
+		 *    to be "full" by the time its zio is issued, which
+		 *    means the size of the lwb was "too large" given the
+		 *    amount of ZIL activity occurring on the system at
+		 *    that time.
+		 *
+		 * We do this for a couple of reasons:
+		 *
+		 * 1. To try and reduce the number of IOPs needed to
+		 *    write the same number of itxs. If an lwb has space
+		 *    available in its buffer for more itxs, and more itxs
+		 *    will be committed relatively soon (relative to the
+		 *    latency of performing a write), then it's beneficial
+		 *    to wait for these "next" itxs. This way, more itxs
+		 *    can be committed to stable storage with fewer writes.
+		 *
+		 * 2. To try and use the largest lwb block size that the
+		 *    incoming rate of itxs can support.
+		 *    Again, this is to try and pack as many itxs into as
+		 *    few lwbs as possible, without significantly impacting
+		 *    the latency of each individual itx.
+		 */
+	}
+}
+
+/*
+ * This function is responsible for ensuring the passed in commit waiter
+ * (and associated commit itx) is committed to an lwb. If the waiter is
+ * not already committed to an lwb, all itxs in the zilog's queue of
+ * itxs will be processed. The assumption is that the passed in waiter's
+ * commit itx will be found in the queue just like the other non-commit
+ * itxs, such that when the entire queue is processed, the waiter will
+ * have been committed to an lwb.
+ *
+ * The lwb associated with the passed in waiter is not guaranteed to
+ * have been issued by the time this function completes. If the lwb is
+ * not issued, we rely on future calls to zil_commit_writer() to issue
+ * the lwb, or the timeout mechanism found in zil_commit_waiter().
+ */
+static void
+zil_commit_writer(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(spa_writeable(zilog->zl_spa));
+
+	mutex_enter(&zilog->zl_issuer_lock);
+
+	if (zcw->zcw_lwb != NULL || zcw->zcw_done) {
+		/*
+		 * It's possible that, while we were waiting to acquire
+		 * the "zl_issuer_lock", another thread committed this
+		 * waiter to an lwb. If that occurs, we bail out early,
+		 * without processing any of the zilog's queue of itxs.
+		 *
+		 * On certain workloads and system configurations, the
+		 * "zl_issuer_lock" can become highly contended. In an
+		 * attempt to reduce this contention, we immediately drop
+		 * the lock if the waiter has already been processed.
+		 *
+		 * We've measured this optimization to reduce CPU spent
+		 * contending on this lock by up to 5%, using a system
+		 * with 32 CPUs, low latency storage (~50 usec writes),
+		 * and 1024 threads performing sync writes.
+		 */
+		goto out;
+	}
+
+	ZIL_STAT_BUMP(zil_commit_writer_count);
+
+	zil_get_commit_list(zilog);
+	zil_prune_commit_list(zilog);
+	zil_process_commit_list(zilog);
+
+out:
+	mutex_exit(&zilog->zl_issuer_lock);
+}
+
+static void
+zil_commit_waiter_timeout(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+	ASSERT3B(zcw->zcw_done, ==, B_FALSE);
+
+	lwb_t *lwb = zcw->zcw_lwb;
+	ASSERT3P(lwb, !=, NULL);
+	ASSERT3S(lwb->lwb_state, !=, LWB_STATE_CLOSED);
+
+	/*
+	 * If the lwb has already been issued by another thread, we can
+	 * immediately return since there's no work to be done (the
+	 * point of this function is to issue the lwb). Additionally, we
+	 * do this prior to acquiring the zl_issuer_lock, to avoid
+	 * acquiring it when it's not necessary to do so.
+	 */
+	if (lwb->lwb_state == LWB_STATE_ISSUED ||
+	    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+	    lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+		return;
+
+	/*
+	 * In order to call zil_lwb_write_issue() we must hold the
+	 * zilog's "zl_issuer_lock". We can't simply acquire that lock,
+	 * since we're already holding the commit waiter's "zcw_lock",
+	 * and those two locks are acquired in the opposite order
+	 * elsewhere.
+	 */
+	mutex_exit(&zcw->zcw_lock);
+	mutex_enter(&zilog->zl_issuer_lock);
+	mutex_enter(&zcw->zcw_lock);
+
+	/*
+	 * Since we just dropped and re-acquired the commit waiter's
+	 * lock, we have to re-check to see if the waiter was marked
+	 * "done" during that process.
+	 * If the waiter was marked "done", the "lwb" pointer is no
+	 * longer valid (it can be free'd after the waiter is marked
+	 * "done"), so without this check we could wind up with a
+	 * use-after-free error below.
+	 */
+	if (zcw->zcw_done)
+		goto out;
+
+	ASSERT3P(lwb, ==, zcw->zcw_lwb);
+
+	/*
+	 * We've already checked this above, but since we hadn't acquired
+	 * the zilog's zl_issuer_lock, we have to perform this check a
+	 * second time while holding the lock.
+	 *
+	 * We don't need to hold the zl_lock since the lwb cannot transition
+	 * from OPENED to ISSUED while we hold the zl_issuer_lock. The lwb
+	 * _can_ transition from ISSUED to DONE, but it's OK to race with
+	 * that transition since we treat the lwb the same, whether it's in
+	 * the ISSUED or DONE states.
+	 *
+	 * The important thing is that we treat the lwb differently
+	 * depending on whether it's ISSUED or OPENED, and block any other
+	 * threads that might attempt to issue this lwb. For that reason
+	 * we hold the zl_issuer_lock when checking the lwb_state; we must
+	 * not call zil_lwb_write_issue() if the lwb had already been
+	 * issued.
+	 *
+	 * See the comment above the lwb_state_t structure definition for
+	 * more details on the lwb states, and locking requirements.
+	 */
+	if (lwb->lwb_state == LWB_STATE_ISSUED ||
+	    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+	    lwb->lwb_state == LWB_STATE_FLUSH_DONE)
+		goto out;
+
+	ASSERT3S(lwb->lwb_state, ==, LWB_STATE_OPENED);
+
+	/*
+	 * As described in the comments above zil_commit_waiter() and
+	 * zil_process_commit_list(), we need to issue this lwb's zio
+	 * since we've reached the commit waiter's timeout and it still
+	 * hasn't been issued.
+	 */
+	lwb_t *nlwb = zil_lwb_write_issue(zilog, lwb);
+
+	IMPLY(nlwb != NULL, lwb->lwb_state != LWB_STATE_OPENED);
+
+	/*
+	 * Since the lwb's zio hadn't been issued by the time this thread
+	 * reached its timeout, we reset the zilog's "zl_cur_used" field
+	 * to influence the zil block size selection algorithm.
+	 *
+	 * By having to issue the lwb's zio here, it means the size of the
+	 * lwb was too large, given the incoming throughput of itxs. By
+	 * setting "zl_cur_used" to zero, we communicate this fact to the
+	 * block size selection algorithm, so it can take this information
+	 * into account, and potentially select a smaller size for the
+	 * next lwb block that is allocated.
+	 */
+	zilog->zl_cur_used = 0;
+
+	if (nlwb == NULL) {
+		/*
+		 * When zil_lwb_write_issue() returns NULL, this
+		 * indicates zio_alloc_zil() failed to allocate the
+		 * "next" lwb on-disk. When this occurs, the ZIL write
+		 * pipeline must be stalled; see the comment within the
+		 * zil_commit_writer_stall() function for more details.
+		 *
+		 * We must drop the commit waiter's lock prior to
+		 * calling zil_commit_writer_stall() or else we can wind
+		 * up with the following deadlock:
+		 *
+		 * - This thread is waiting for the txg to sync while
+		 *   holding the waiter's lock; txg_wait_synced() is
+		 *   used within zil_commit_writer_stall().
+		 *
+		 * - The txg can't sync because it is waiting for this
+		 *   lwb's zio callback to call dmu_tx_commit().
+		 *
+		 * - The lwb's zio callback can't call dmu_tx_commit()
+		 *   because it's blocked trying to acquire the waiter's
+		 *   lock, which occurs prior to calling dmu_tx_commit().
+		 */
+		mutex_exit(&zcw->zcw_lock);
+		zil_commit_writer_stall(zilog);
+		mutex_enter(&zcw->zcw_lock);
+	}
+
+out:
+	mutex_exit(&zilog->zl_issuer_lock);
+	ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+}
+
+/*
+ * This function is responsible for performing the following two tasks:
+ *
+ * 1. Its primary responsibility is to block until the given "commit
+ *    waiter" is considered "done".
+ *
+ * 2. Its secondary responsibility is to issue the zio for the lwb that
+ *    the given "commit waiter" is waiting on, if this function has
+ *    waited "long enough" and the lwb is still in the "open" state.
+ *
+ * Given a sufficient number of itxs being generated and written using
+ * the ZIL, the lwb's zio will be issued via the zil_lwb_commit()
+ * function. If this does not occur, this secondary responsibility will
+ * ensure the lwb is issued even if there is no other synchronous
+ * activity on the system.
+ *
+ * For more details, see zil_process_commit_list(); more specifically,
+ * the comment at the bottom of that function.
+ */
+static void
+zil_commit_waiter(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	ASSERT(!MUTEX_HELD(&zilog->zl_lock));
+	ASSERT(!MUTEX_HELD(&zilog->zl_issuer_lock));
+	ASSERT(spa_writeable(zilog->zl_spa));
+
+	mutex_enter(&zcw->zcw_lock);
+
+	/*
+	 * The timeout is scaled based on the lwb latency to avoid
+	 * significantly impacting the latency of each individual itx.
+	 * For more details, see the comment at the bottom of the
+	 * zil_process_commit_list() function.
+	 */
+	int pct = MAX(zfs_commit_timeout_pct, 1);
+	hrtime_t sleep = (zilog->zl_last_lwb_latency * pct) / 100;
+	hrtime_t wakeup = gethrtime() + sleep;
+	boolean_t timedout = B_FALSE;
+
+	while (!zcw->zcw_done) {
+		ASSERT(MUTEX_HELD(&zcw->zcw_lock));
+
+		lwb_t *lwb = zcw->zcw_lwb;
+
+		/*
+		 * Usually, the waiter will have a non-NULL lwb field here,
+		 * but it's possible for it to be NULL as a result of
+		 * zil_commit() racing with spa_sync().
+		 *
+		 * When zil_clean() is called, it's possible for the itxg
+		 * list (which may be cleaned via a taskq) to contain
+		 * commit itxs. When this occurs, the commit waiters linked
+		 * off of these commit itxs will not be committed to an
+		 * lwb. Additionally, these commit waiters will not be
+		 * marked done until zil_commit_waiter_skip() is called via
+		 * zil_itxg_clean().
+		 *
+		 * Thus, it's possible for this commit waiter (i.e. the
+		 * "zcw" variable) to be found in this "in between" state;
+		 * where its "zcw_lwb" field is NULL, and it hasn't yet
+		 * been skipped, so its "zcw_done" field is still B_FALSE.
+		 */
+		IMPLY(lwb != NULL, lwb->lwb_state != LWB_STATE_CLOSED);
+
+		if (lwb != NULL && lwb->lwb_state == LWB_STATE_OPENED) {
+			ASSERT3B(timedout, ==, B_FALSE);
+
+			/*
+			 * If the lwb hasn't been issued yet, then we
+			 * need to wait with a timeout, in case this
+			 * function needs to issue the lwb after the
+			 * timeout is reached; responsibility (2) from
+			 * the comment above this function.
+			 */
+			int rc = cv_timedwait_hires(&zcw->zcw_cv,
+			    &zcw->zcw_lock, wakeup, USEC2NSEC(1),
+			    CALLOUT_FLAG_ABSOLUTE);
+
+			if (rc != -1 || zcw->zcw_done)
+				continue;
+
+			timedout = B_TRUE;
+			zil_commit_waiter_timeout(zilog, zcw);
+
+			if (!zcw->zcw_done) {
+				/*
+				 * If the commit waiter has already been
+				 * marked "done", it's possible for the
+				 * waiter's lwb structure to have already
+				 * been freed. Thus, we can only reliably
+				 * make these assertions if the waiter
+				 * isn't done.
+				 */
+				ASSERT3P(lwb, ==, zcw->zcw_lwb);
+				ASSERT3S(lwb->lwb_state, !=, LWB_STATE_OPENED);
+			}
+		} else {
+			/*
+			 * If the lwb isn't open, then it must have already
+			 * been issued. In that case, there's no need to
+			 * use a timeout when waiting for the lwb to
+			 * complete.
+			 *
+			 * Additionally, if the lwb is NULL, the waiter
+			 * will soon be signaled and marked done via
+			 * zil_clean() and zil_itxg_clean(), so no timeout
+			 * is required.
+			 */
+
+			IMPLY(lwb != NULL,
+			    lwb->lwb_state == LWB_STATE_ISSUED ||
+			    lwb->lwb_state == LWB_STATE_WRITE_DONE ||
+			    lwb->lwb_state == LWB_STATE_FLUSH_DONE);
+			cv_wait(&zcw->zcw_cv, &zcw->zcw_lock);
+		}
+	}
+
+	mutex_exit(&zcw->zcw_lock);
+}
+
+static zil_commit_waiter_t *
+zil_alloc_commit_waiter(void)
+{
+	zil_commit_waiter_t *zcw = kmem_cache_alloc(zil_zcw_cache, KM_SLEEP);
+
+	cv_init(&zcw->zcw_cv, NULL, CV_DEFAULT, NULL);
+	mutex_init(&zcw->zcw_lock, NULL, MUTEX_DEFAULT, NULL);
+	list_link_init(&zcw->zcw_node);
+	zcw->zcw_lwb = NULL;
+	zcw->zcw_done = B_FALSE;
+	zcw->zcw_zio_error = 0;
+
+	return (zcw);
+}
+
+static void
+zil_free_commit_waiter(zil_commit_waiter_t *zcw)
+{
+	ASSERT(!list_link_active(&zcw->zcw_node));
+	ASSERT3P(zcw->zcw_lwb, ==, NULL);
+	ASSERT3B(zcw->zcw_done, ==, B_TRUE);
+	mutex_destroy(&zcw->zcw_lock);
+	cv_destroy(&zcw->zcw_cv);
+	kmem_cache_free(zil_zcw_cache, zcw);
+}
+
+/*
+ * This function is used to create a TX_COMMIT itx and assign it. This
+ * way, it will be linked into the ZIL's list of synchronous itxs, and
+ * then later committed to an lwb (or skipped) when
+ * zil_process_commit_list() is called.
+ */
+static void
+zil_commit_itx_assign(zilog_t *zilog, zil_commit_waiter_t *zcw)
+{
+	dmu_tx_t *tx = dmu_tx_create(zilog->zl_os);
+	VERIFY0(dmu_tx_assign(tx, TXG_WAIT));
+
+	itx_t *itx = zil_itx_create(TX_COMMIT, sizeof (lr_t));
+	itx->itx_sync = B_TRUE;
+	itx->itx_private = zcw;
+
+	zil_itx_assign(zilog, itx, tx);
+
+	dmu_tx_commit(tx);
+}
+
+/*
+ * Commit ZFS Intent Log transactions (itxs) to stable storage.
+ *
+ * When writing ZIL transactions to the on-disk representation of the
+ * ZIL, the itxs are committed to a Log Write Block (lwb). Multiple
+ * itxs can be committed to a single lwb. Once a lwb is written and
+ * committed to stable storage (i.e. the lwb is written, and vdevs have
+ * been flushed), each itx that was committed to that lwb is also
+ * considered to be committed to stable storage.
+ *
+ * When an itx is committed to an lwb, the log record (lr_t) contained
+ * by the itx is copied into the lwb's zio buffer, and once this buffer
+ * is written to disk, it becomes an on-disk ZIL block.
+ *
+ * As itxs are generated, they're inserted into the ZIL's queue of
+ * uncommitted itxs. The semantics of zil_commit() are such that it will
+ * block until all itxs that were in the queue when it was called, are
+ * committed to stable storage.
+ *
+ * If "foid" is zero, this means all "synchronous" and "asynchronous"
+ * itxs, for all objects in the dataset, will be committed to stable
+ * storage prior to zil_commit() returning. If "foid" is non-zero, all
+ * "synchronous" itxs for all objects, but only "asynchronous" itxs
+ * that correspond to the foid passed in, will be committed to stable
+ * storage prior to zil_commit() returning.
+ *
+ * Generally speaking, when zil_commit() is called, the consumer doesn't
+ * actually care about _all_ of the uncommitted itxs. Instead, they're
+ * simply trying to wait for a specific itx to be committed to disk,
+ * but the interface(s) for interacting with the ZIL don't allow such
+ * fine-grained communication.
+ * A better interface would allow a consumer to create and assign an
+ * itx, and then pass a reference to this itx to zil_commit(); such
+ * that zil_commit() would return as soon as that specific itx was
+ * committed to disk (instead of waiting for _all_ itxs to be
+ * committed).
+ *
+ * When a thread calls zil_commit() a special "commit itx" will be
+ * generated, along with a corresponding "waiter" for this commit itx.
+ * zil_commit() will wait on this waiter's CV, such that when the waiter
+ * is marked done, and signaled, zil_commit() will return.
+ *
+ * This commit itx is inserted into the queue of uncommitted itxs. This
+ * provides an easy mechanism for determining which itxs were in the
+ * queue prior to zil_commit() having been called, and which itxs were
+ * added after zil_commit() was called.
+ *
+ * The commit itx is special; it doesn't have any on-disk representation.
+ * When a commit itx is "committed" to an lwb, the waiter associated
+ * with it is linked onto the lwb's list of waiters. Then, when that lwb
+ * completes, each waiter on the lwb's list is marked done and signaled
+ * -- allowing the thread waiting on the waiter to return from zil_commit().
+ *
+ * It's important to point out a few critical factors that allow us
+ * to make use of the commit itxs, commit waiters, per-lwb lists of
+ * commit waiters, and zio completion callbacks like we're doing:
+ *
+ *   1. The list of waiters for each lwb is traversed, and each commit
+ *      waiter is marked "done" and signaled, in the zio completion
+ *      callback of the lwb's zio[*].
+ *
+ *      * Actually, the waiters are signaled in the zio completion
+ *        callback of the root zio for the DKIOCFLUSHWRITECACHE commands
+ *        that are sent to the vdevs upon completion of the lwb zio.
+ *
+ *   2. When the itxs are inserted into the ZIL's queue of uncommitted
+ *      itxs, the order in which they are inserted is preserved[*]; as
+ *      itxs are added to the queue, they are added to the tail of
+ *      in-memory linked lists.
+ *
+ *      When committing the itxs to lwbs (to be written to disk), they
+ *      are committed in the same order in which the itxs were added to
+ *      the uncommitted queue's linked list(s); i.e. the linked list of
+ *      itxs to commit is traversed from head to tail, and each itx is
+ *      committed to an lwb in that order.
+ *
+ *      * To clarify:
+ *
+ *        - the order of "sync" itxs is preserved w.r.t. other
+ *          "sync" itxs, regardless of the corresponding objects.
+ *        - the order of "async" itxs is preserved w.r.t. other
+ *          "async" itxs corresponding to the same object.
+ *        - the order of "async" itxs is *not* preserved w.r.t. other
+ *          "async" itxs corresponding to different objects.
+ *        - the order of "sync" itxs w.r.t. "async" itxs (or vice
+ *          versa) is *not* preserved, even for itxs that correspond
+ *          to the same object.
+ *
+ *      For more details, see: zil_itx_assign(), zil_async_to_sync(),
+ *      zil_get_commit_list(), and zil_process_commit_list().
+ *
+ *   3. The lwbs represent a linked list of blocks on disk. Thus, any
+ *      lwb cannot be considered committed to stable storage, until its
+ *      "previous" lwb is also committed to stable storage. This fact,
+ *      coupled with the fact described above, means that itxs are
+ *      committed in (roughly) the order in which they were generated.
+ *      This is essential because itxs are dependent on prior itxs.
+ *      Thus, we *must not* deem an itx as being committed to stable
+ *      storage, until *all* prior itxs have also been committed to
+ *      stable storage.
+ *
+ *      To enforce this ordering of lwb zio's, while still leveraging as
+ *      much of the underlying storage performance as possible, we rely
+ *      on two fundamental concepts:
+ *
+ *          1. The creation and issuance of lwb zio's is protected by
+ *             the zilog's "zl_issuer_lock", which ensures only a single
+ *             thread is creating and/or issuing lwb's at a time.
+ *          2. The "previous" lwb is a child of the "current" lwb
+ *             (leveraging the zio parent-child dependency graph)
+ *
+ *      By relying on this parent-child zio relationship, we can have
+ *      many lwb zio's concurrently issued to the underlying storage,
+ *      but the order in which they complete will be the same order in
+ *      which they were created.
+ */
+void
+zil_commit(zilog_t *zilog, uint64_t foid)
+{
+	/*
+	 * We should never attempt to call zil_commit on a snapshot for
+	 * a couple of reasons:
+	 *
+	 * 1. A snapshot may never be modified, thus it cannot have any
+	 *    in-flight itxs that would have modified the dataset.
+	 *
+	 * 2. By design, when zil_commit() is called, a commit itx will
+	 *    be assigned to this zilog; as a result, the zilog will be
+	 *    dirtied. We must not dirty the zilog of a snapshot; there
+	 *    are checks in the code that enforce this invariant, and
+	 *    will cause a panic if it's not upheld.
+	 */
+	ASSERT3B(dmu_objset_is_snapshot(zilog->zl_os), ==, B_FALSE);
+
+	if (zilog->zl_sync == ZFS_SYNC_DISABLED)
+		return;
+
+	if (!spa_writeable(zilog->zl_spa)) {
+		/*
+		 * If the SPA is not writable, there should never be any
+		 * pending itxs waiting to be committed to disk. If that
+		 * weren't true, we'd skip writing those itxs out, and
+		 * would break the semantics of zil_commit(); thus, we're
+		 * verifying that truth before we return to the caller.
+		 */
+		ASSERT(list_is_empty(&zilog->zl_lwb_list));
+		ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL);
+		for (int i = 0; i < TXG_SIZE; i++)
+			ASSERT3P(zilog->zl_itxg[i].itxg_itxs, ==, NULL);
+		return;
+	}
+
+	/*
+	 * If the ZIL is suspended, we don't want to dirty it by calling
+	 * zil_commit_itx_assign() below, nor can we write out
+	 * lwbs as would be done in zil_commit_writer(). Thus, we
+	 * simply rely on txg_wait_synced() to maintain the necessary
+	 * semantics, and avoid calling those functions altogether.
+	 */
+	if (zilog->zl_suspend > 0) {
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+		return;
+	}
+
+	zil_commit_impl(zilog, foid);
+}
+
+void
+zil_commit_impl(zilog_t *zilog, uint64_t foid)
+{
+	ZIL_STAT_BUMP(zil_commit_count);
+
+	/*
+	 * Move the "async" itxs for the specified foid to the "sync"
+	 * queues, such that they will be later committed (or skipped)
+	 * to an lwb when zil_process_commit_list() is called.
+	 *
+	 * Since these "async" itxs must be committed prior to this
+	 * call to zil_commit() returning, we must perform this operation
+	 * before we call zil_commit_itx_assign().
+	 */
+	zil_async_to_sync(zilog, foid);
+
+	/*
+	 * We allocate a new "waiter" structure which will initially be
+	 * linked to the commit itx using the itx's "itx_private" field.
+	 * Since the commit itx doesn't represent any on-disk state,
+	 * when it's committed to an lwb, rather than copying its
+	 * lr_t into the lwb's buffer, the commit itx's "waiter" will be
+	 * added to the lwb's list of waiters. Then, when the lwb is
+	 * committed to stable storage, each waiter in the lwb's list of
+	 * waiters will be marked "done", and signalled.
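+	 *
+	 * In outline, the code below does exactly this (see the function
+	 * bodies above for the details of each step):
+	 *
+	 *	zcw = zil_alloc_commit_waiter();
+	 *	zil_commit_itx_assign(zilog, zcw);  <- commit itx linked to zcw
+	 *	zil_commit_writer(zilog, zcw);      <- itxs committed to lwbs
+	 *	zil_commit_waiter(zilog, zcw);      <- block until zcw is done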
+	 *
+	 * We must create the waiter and assign the commit itx prior to
+	 * calling zil_commit_writer(), or else our specific commit itx
+	 * is not guaranteed to be committed to an lwb prior to calling
+	 * zil_commit_waiter().
+	 */
+	zil_commit_waiter_t *zcw = zil_alloc_commit_waiter();
+	zil_commit_itx_assign(zilog, zcw);
+
+	zil_commit_writer(zilog, zcw);
+	zil_commit_waiter(zilog, zcw);
+
+	if (zcw->zcw_zio_error != 0) {
+		/*
+		 * If there was an error writing out the ZIL blocks that
+		 * this thread is waiting on, then we fall back to
+		 * relying on spa_sync() to write out the data this
+		 * thread is waiting on. Obviously this has performance
+		 * implications, but the expectation is for this to be
+		 * an exceptional case, and shouldn't occur often.
+		 */
+		DTRACE_PROBE2(zil__commit__io__error,
+		    zilog_t *, zilog, zil_commit_waiter_t *, zcw);
+		txg_wait_synced(zilog->zl_dmu_pool, 0);
+	}
+
+	zil_free_commit_waiter(zcw);
+}
+
+/*
+ * Called in syncing context to free committed log blocks and update the
+ * log header.
+ */
+void
+zil_sync(zilog_t *zilog, dmu_tx_t *tx)
+{
+	zil_header_t *zh = zil_header_in_syncing_context(zilog);
+	uint64_t txg = dmu_tx_get_txg(tx);
+	spa_t *spa = zilog->zl_spa;
+	uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK];
+	lwb_t *lwb;
+
+	/*
+	 * We don't zero out zl_destroy_txg, so make sure we don't try
+	 * to destroy it twice.
+	 */
+	if (spa_sync_pass(spa) != 1)
+		return;
+
+	mutex_enter(&zilog->zl_lock);
+
+	ASSERT(zilog->zl_stop_sync == 0);
+
+	if (*replayed_seq != 0) {
+		ASSERT(zh->zh_replay_seq < *replayed_seq);
+		zh->zh_replay_seq = *replayed_seq;
+		*replayed_seq = 0;
+	}
+
+	if (zilog->zl_destroy_txg == txg) {
+		blkptr_t blk = zh->zh_log;
+
+		ASSERT(list_head(&zilog->zl_lwb_list) == NULL);
+
+		bzero(zh, sizeof (zil_header_t));
+		bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq));
+
+		if (zilog->zl_keep_first) {
+			/*
+			 * If this block was part of a log chain that couldn't
+			 * be claimed because a device was missing during
+			 * zil_claim(), but that device later returns,
+			 * then this block could erroneously appear valid.
+			 * To guard against this, assign a new GUID to the new
+			 * log chain so it doesn't matter what blk points to.
+			 */
+			zil_init_log_chain(zilog, &blk);
+			zh->zh_log = blk;
+		}
+	}
+
+	while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) {
+		zh->zh_log = lwb->lwb_blk;
+		if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg)
+			break;
+		list_remove(&zilog->zl_lwb_list, lwb);
+		zio_free(spa, txg, &lwb->lwb_blk);
+		zil_free_lwb(zilog, lwb);
+
+		/*
+		 * If we don't have anything left in the lwb list then
+		 * we've had an allocation failure and we need to zero
+		 * out the zil_header blkptr so that we don't end
+		 * up freeing the same block twice.
+		 */
+		if (list_head(&zilog->zl_lwb_list) == NULL)
+			BP_ZERO(&zh->zh_log);
+	}
+
+	/*
+	 * Remove fastwrite on any blocks that have been pre-allocated for
+	 * the next commit. This prevents fastwrite counter pollution by
+	 * unused, long-lived LWBs.
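+	 * Only lwbs that have not yet had a write zio created are
+	 * unmarked here (note the !lwb->lwb_write_zio check below); once
+	 * a write zio exists, releasing the fastwrite mark is left to
+	 * the write completion path.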
+ */ + for (; lwb != NULL; lwb = list_next(&zilog->zl_lwb_list, lwb)) { + if (lwb->lwb_fastwrite && !lwb->lwb_write_zio) { + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); + lwb->lwb_fastwrite = 0; + } + } + + mutex_exit(&zilog->zl_lock); +} + +/* ARGSUSED */ +static int +zil_lwb_cons(void *vbuf, void *unused, int kmflag) +{ + lwb_t *lwb = vbuf; + list_create(&lwb->lwb_itxs, sizeof (itx_t), offsetof(itx_t, itx_node)); + list_create(&lwb->lwb_waiters, sizeof (zil_commit_waiter_t), + offsetof(zil_commit_waiter_t, zcw_node)); + avl_create(&lwb->lwb_vdev_tree, zil_lwb_vdev_compare, + sizeof (zil_vdev_node_t), offsetof(zil_vdev_node_t, zv_node)); + mutex_init(&lwb->lwb_vdev_lock, NULL, MUTEX_DEFAULT, NULL); + return (0); +} + +/* ARGSUSED */ +static void +zil_lwb_dest(void *vbuf, void *unused) +{ + lwb_t *lwb = vbuf; + mutex_destroy(&lwb->lwb_vdev_lock); + avl_destroy(&lwb->lwb_vdev_tree); + list_destroy(&lwb->lwb_waiters); + list_destroy(&lwb->lwb_itxs); +} + +void +zil_init(void) +{ + zil_lwb_cache = kmem_cache_create("zil_lwb_cache", + sizeof (lwb_t), 0, zil_lwb_cons, zil_lwb_dest, NULL, NULL, NULL, 0); + + zil_zcw_cache = kmem_cache_create("zil_zcw_cache", + sizeof (zil_commit_waiter_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + zil_ksp = kstat_create("zfs", 0, "zil", "misc", + KSTAT_TYPE_NAMED, sizeof (zil_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zil_ksp != NULL) { + zil_ksp->ks_data = &zil_stats; + kstat_install(zil_ksp); + } +} + +void +zil_fini(void) +{ + kmem_cache_destroy(zil_zcw_cache); + kmem_cache_destroy(zil_lwb_cache); + + if (zil_ksp != NULL) { + kstat_delete(zil_ksp); + zil_ksp = NULL; + } +} + +void +zil_set_sync(zilog_t *zilog, uint64_t sync) +{ + zilog->zl_sync = sync; +} + +void +zil_set_logbias(zilog_t *zilog, uint64_t logbias) +{ + zilog->zl_logbias = logbias; +} + +zilog_t * +zil_alloc(objset_t *os, zil_header_t *zh_phys) +{ + zilog_t *zilog; + + zilog = kmem_zalloc(sizeof (zilog_t), KM_SLEEP); + + zilog->zl_header = zh_phys; + zilog->zl_os = os; + zilog->zl_spa = dmu_objset_spa(os); + zilog->zl_dmu_pool = dmu_objset_pool(os); + zilog->zl_destroy_txg = TXG_INITIAL - 1; + zilog->zl_logbias = dmu_objset_logbias(os); + zilog->zl_sync = dmu_objset_syncprop(os); + zilog->zl_dirty_max_txg = 0; + zilog->zl_last_lwb_opened = NULL; + zilog->zl_last_lwb_latency = 0; + zilog->zl_max_block_size = zil_maxblocksize; + + mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zilog->zl_issuer_lock, NULL, MUTEX_DEFAULT, NULL); + + for (int i = 0; i < TXG_SIZE; i++) { + mutex_init(&zilog->zl_itxg[i].itxg_lock, NULL, + MUTEX_DEFAULT, NULL); + } + + list_create(&zilog->zl_lwb_list, sizeof (lwb_t), + offsetof(lwb_t, lwb_node)); + + list_create(&zilog->zl_itx_commit_list, sizeof (itx_t), + offsetof(itx_t, itx_node)); + + cv_init(&zilog->zl_cv_suspend, NULL, CV_DEFAULT, NULL); + + return (zilog); +} + +void +zil_free(zilog_t *zilog) +{ + int i; + + zilog->zl_stop_sync = 1; + + ASSERT0(zilog->zl_suspend); + ASSERT0(zilog->zl_suspending); + + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + list_destroy(&zilog->zl_lwb_list); + + ASSERT(list_is_empty(&zilog->zl_itx_commit_list)); + list_destroy(&zilog->zl_itx_commit_list); + + for (i = 0; i < TXG_SIZE; i++) { + /* + * It's possible for an itx to be generated that doesn't dirty + * a txg (e.g. ztest TX_TRUNCATE). So there's no zil_clean() + * callback to remove the entry. We remove those here. + * + * Also free up the ziltest itxs. 
+ */ + if (zilog->zl_itxg[i].itxg_itxs) + zil_itxg_clean(zilog->zl_itxg[i].itxg_itxs); + mutex_destroy(&zilog->zl_itxg[i].itxg_lock); + } + + mutex_destroy(&zilog->zl_issuer_lock); + mutex_destroy(&zilog->zl_lock); + + cv_destroy(&zilog->zl_cv_suspend); + + kmem_free(zilog, sizeof (zilog_t)); +} + +/* + * Open an intent log. + */ +zilog_t * +zil_open(objset_t *os, zil_get_data_t *get_data) +{ + zilog_t *zilog = dmu_objset_zil(os); + + ASSERT3P(zilog->zl_get_data, ==, NULL); + ASSERT3P(zilog->zl_last_lwb_opened, ==, NULL); + ASSERT(list_is_empty(&zilog->zl_lwb_list)); + + zilog->zl_get_data = get_data; + + return (zilog); +} + +/* + * Close an intent log. + */ +void +zil_close(zilog_t *zilog) +{ + lwb_t *lwb; + uint64_t txg; + + if (!dmu_objset_is_snapshot(zilog->zl_os)) { + zil_commit(zilog, 0); + } else { + ASSERT3P(list_tail(&zilog->zl_lwb_list), ==, NULL); + ASSERT0(zilog->zl_dirty_max_txg); + ASSERT3B(zilog_is_dirty(zilog), ==, B_FALSE); + } + + mutex_enter(&zilog->zl_lock); + lwb = list_tail(&zilog->zl_lwb_list); + if (lwb == NULL) + txg = zilog->zl_dirty_max_txg; + else + txg = MAX(zilog->zl_dirty_max_txg, lwb->lwb_max_txg); + mutex_exit(&zilog->zl_lock); + + /* + * We need to use txg_wait_synced() to wait long enough for the + * ZIL to be clean, and to wait for all pending lwbs to be + * written out. + */ + if (txg != 0) + txg_wait_synced(zilog->zl_dmu_pool, txg); + + if (zilog_is_dirty(zilog)) + zfs_dbgmsg("zil (%px) is dirty, txg %llu", zilog, txg); + if (txg < spa_freeze_txg(zilog->zl_spa)) + VERIFY(!zilog_is_dirty(zilog)); + + zilog->zl_get_data = NULL; + + /* + * We should have only one lwb left on the list; remove it now. + */ + mutex_enter(&zilog->zl_lock); + lwb = list_head(&zilog->zl_lwb_list); + if (lwb != NULL) { + ASSERT3P(lwb, ==, list_tail(&zilog->zl_lwb_list)); + ASSERT3S(lwb->lwb_state, !=, LWB_STATE_ISSUED); + + if (lwb->lwb_fastwrite) + metaslab_fastwrite_unmark(zilog->zl_spa, &lwb->lwb_blk); + + list_remove(&zilog->zl_lwb_list, lwb); + zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); + zil_free_lwb(zilog, lwb); + } + mutex_exit(&zilog->zl_lock); +} + +static char *suspend_tag = "zil suspending"; + +/* + * Suspend an intent log. While in suspended mode, we still honor + * synchronous semantics, but we rely on txg_wait_synced() to do it. + * On old version pools, we suspend the log briefly when taking a + * snapshot so that it will have an empty intent log. + * + * Long holds are not really intended to be used the way we do here -- + * held for such a short time. A concurrent caller of dsl_dataset_long_held() + * could fail. Therefore we take pains to only put a long hold if it is + * actually necessary. Fortunately, it will only be necessary if the + * objset is currently mounted (or the ZVOL equivalent). In that case it + * will already have a long hold, so we are not really making things any worse. + * + * Ideally, we would locate the existing long-holder (i.e. the zfsvfs_t or + * zvol_state_t), and use their mechanism to prevent their hold from being + * dropped (e.g. VFS_HOLD()). However, that would be even more pain for + * very little gain. + * + * if cookiep == NULL, this does both the suspend & resume. + * Otherwise, it returns with the dataset "long held", and the cookie + * should be passed into zil_resume(). 
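+ *
+ * For example (an illustrative sketch only; error handling elided), a
+ * caller that needs the log held empty across some operation would pair
+ * the two calls like so:
+ *
+ *	void *cookie;
+ *	error = zil_suspend(osname, &cookie);
+ *	if (error == 0) {
+ *		...operate on the dataset while its intent log is empty...
+ *		zil_resume(cookie);
+ *	}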
+ */ +int +zil_suspend(const char *osname, void **cookiep) +{ + objset_t *os; + zilog_t *zilog; + const zil_header_t *zh; + int error; + + error = dmu_objset_hold(osname, suspend_tag, &os); + if (error != 0) + return (error); + zilog = dmu_objset_zil(os); + + mutex_enter(&zilog->zl_lock); + zh = zilog->zl_header; + + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (SET_ERROR(EBUSY)); + } + + /* + * Don't put a long hold in the cases where we can avoid it. This + * is when there is no cookie so we are doing a suspend & resume + * (i.e. called from zil_vdev_offline()), and there's nothing to do + * for the suspend because it's already suspended, or there's no ZIL. + */ + if (cookiep == NULL && !zilog->zl_suspending && + (zilog->zl_suspend > 0 || BP_IS_HOLE(&zh->zh_log))) { + mutex_exit(&zilog->zl_lock); + dmu_objset_rele(os, suspend_tag); + return (0); + } + + dsl_dataset_long_hold(dmu_objset_ds(os), suspend_tag); + dsl_pool_rele(dmu_objset_pool(os), suspend_tag); + + zilog->zl_suspend++; + + if (zilog->zl_suspend > 1) { + /* + * Someone else is already suspending it. + * Just wait for them to finish. + */ + + while (zilog->zl_suspending) + cv_wait(&zilog->zl_cv_suspend, &zilog->zl_lock); + mutex_exit(&zilog->zl_lock); + + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; + return (0); + } + + /* + * If there is no pointer to an on-disk block, this ZIL must not + * be active (e.g. filesystem not mounted), so there's nothing + * to clean up. + */ + if (BP_IS_HOLE(&zh->zh_log)) { + ASSERT(cookiep != NULL); /* fast path already handled */ + + *cookiep = os; + mutex_exit(&zilog->zl_lock); + return (0); + } + + /* + * The ZIL has work to do. Ensure that the associated encryption + * key will remain mapped while we are committing the log by + * grabbing a reference to it. If the key isn't loaded we have no + * choice but to return an error until the wrapping key is loaded. + */ + if (os->os_encrypted && + dsl_dataset_create_key_mapping(dmu_objset_ds(os)) != 0) { + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); + dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); + return (SET_ERROR(EACCES)); + } + + zilog->zl_suspending = B_TRUE; + mutex_exit(&zilog->zl_lock); + + /* + * We need to use zil_commit_impl to ensure we wait for all + * LWB_STATE_OPENED and LWB_STATE_ISSUED lwbs to be committed + * to disk before proceeding. If we used zil_commit instead, it + * would just call txg_wait_synced(), because zl_suspend is set. + * txg_wait_synced() doesn't wait for these lwb's to be + * LWB_STATE_FLUSH_DONE before returning. + */ + zil_commit_impl(zilog, 0); + + /* + * Now that we've ensured all lwb's are LWB_STATE_FLUSH_DONE, we + * use txg_wait_synced() to ensure the data from the zilog has + * migrated to the main pool before calling zil_destroy(). 
+ */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zil_destroy(zilog, B_FALSE); + + mutex_enter(&zilog->zl_lock); + zilog->zl_suspending = B_FALSE; + cv_broadcast(&zilog->zl_cv_suspend); + mutex_exit(&zilog->zl_lock); + + if (os->os_encrypted) + dsl_dataset_remove_key_mapping(dmu_objset_ds(os)); + + if (cookiep == NULL) + zil_resume(os); + else + *cookiep = os; + return (0); +} + +void +zil_resume(void *cookie) +{ + objset_t *os = cookie; + zilog_t *zilog = dmu_objset_zil(os); + + mutex_enter(&zilog->zl_lock); + ASSERT(zilog->zl_suspend != 0); + zilog->zl_suspend--; + mutex_exit(&zilog->zl_lock); + dsl_dataset_long_rele(dmu_objset_ds(os), suspend_tag); + dsl_dataset_rele(dmu_objset_ds(os), suspend_tag); +} + +typedef struct zil_replay_arg { + zil_replay_func_t **zr_replay; + void *zr_arg; + boolean_t zr_byteswap; + char *zr_lr; +} zil_replay_arg_t; + +static int +zil_replay_error(zilog_t *zilog, lr_t *lr, int error) +{ + char name[ZFS_MAX_DATASET_NAME_LEN]; + + zilog->zl_replaying_seq--; /* didn't actually replay this one */ + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, + (u_longlong_t)lr->lrc_seq, + (u_longlong_t)(lr->lrc_txtype & ~TX_CI), + (lr->lrc_txtype & TX_CI) ? "CI" : ""); + + return (error); +} + +static int +zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) +{ + zil_replay_arg_t *zr = zra; + const zil_header_t *zh = zilog->zl_header; + uint64_t reclen = lr->lrc_reclen; + uint64_t txtype = lr->lrc_txtype; + int error = 0; + + zilog->zl_replaying_seq = lr->lrc_seq; + + if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ + return (0); + + if (lr->lrc_txg < claim_txg) /* already committed */ + return (0); + + /* Strip case-insensitive bit, still present in log record */ + txtype &= ~TX_CI; + + if (txtype == 0 || txtype >= TX_MAX_TYPE) + return (zil_replay_error(zilog, lr, EINVAL)); + + /* + * If this record type can be logged out of order, the object + * (lr_foid) may no longer exist. That's legitimate, not an error. + */ + if (TX_OOO(txtype)) { + error = dmu_object_info(zilog->zl_os, + LR_FOID_GET_OBJ(((lr_ooo_t *)lr)->lr_foid), NULL); + if (error == ENOENT || error == EEXIST) + return (0); + } + + /* + * Make a copy of the data so we can revise and extend it. + */ + bcopy(lr, zr->zr_lr, reclen); + + /* + * If this is a TX_WRITE with a blkptr, suck in the data. + */ + if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { + error = zil_read_log_data(zilog, (lr_write_t *)lr, + zr->zr_lr + reclen); + if (error != 0) + return (zil_replay_error(zilog, lr, error)); + } + + /* + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different record types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. + */ + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lr, reclen); + + /* + * We must now do two things atomically: replay this log record, + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. 
+ */ + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); + if (error != 0) { + /* + * The DMU's dnode layer doesn't see removes until the txg + * commits, so a subsequent claim can spuriously fail with + * EEXIST. So if we receive any error we try syncing out + * any removes then retry the transaction. Note that we + * specify B_FALSE for byteswap now, so we don't do it twice. + */ + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); + if (error != 0) + return (zil_replay_error(zilog, lr, error)); + } + return (0); +} + +/* ARGSUSED */ +static int +zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) +{ + zilog->zl_replay_blks++; + + return (0); +} + +/* + * If this dataset has a non-empty intent log, replay it and destroy it. + */ +void +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) +{ + zilog_t *zilog = dmu_objset_zil(os); + const zil_header_t *zh = zilog->zl_header; + zil_replay_arg_t zr; + + if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { + zil_destroy(zilog, B_TRUE); + return; + } + + zr.zr_replay = replay_func; + zr.zr_arg = arg; + zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); + zr.zr_lr = vmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + + /* + * Wait for in-progress removes to sync before starting replay. + */ + txg_wait_synced(zilog->zl_dmu_pool, 0); + + zilog->zl_replay = B_TRUE; + zilog->zl_replay_time = ddi_get_lbolt(); + ASSERT(zilog->zl_replay_blks == 0); + (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, + zh->zh_claim_txg, B_TRUE); + vmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); + + zil_destroy(zilog, B_FALSE); + txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; +} + +boolean_t +zil_replaying(zilog_t *zilog, dmu_tx_t *tx) +{ + if (zilog->zl_sync == ZFS_SYNC_DISABLED) + return (B_TRUE); + + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return (B_TRUE); + } + + return (B_FALSE); +} + +/* ARGSUSED */ +int +zil_reset(const char *osname, void *arg) +{ + int error; + + error = zil_suspend(osname, NULL); + /* EACCES means crypto key not loaded */ + if ((error == EACCES) || (error == EBUSY)) + return (SET_ERROR(error)); + if (error != 0) + return (SET_ERROR(EEXIST)); + return (0); +} + +EXPORT_SYMBOL(zil_alloc); +EXPORT_SYMBOL(zil_free); +EXPORT_SYMBOL(zil_open); +EXPORT_SYMBOL(zil_close); +EXPORT_SYMBOL(zil_replay); +EXPORT_SYMBOL(zil_replaying); +EXPORT_SYMBOL(zil_destroy); +EXPORT_SYMBOL(zil_destroy_sync); +EXPORT_SYMBOL(zil_itx_create); +EXPORT_SYMBOL(zil_itx_destroy); +EXPORT_SYMBOL(zil_itx_assign); +EXPORT_SYMBOL(zil_commit); +EXPORT_SYMBOL(zil_claim); +EXPORT_SYMBOL(zil_check_log_chain); +EXPORT_SYMBOL(zil_sync); +EXPORT_SYMBOL(zil_clean); +EXPORT_SYMBOL(zil_suspend); +EXPORT_SYMBOL(zil_resume); +EXPORT_SYMBOL(zil_lwb_add_block); +EXPORT_SYMBOL(zil_bp_tree_add); +EXPORT_SYMBOL(zil_set_sync); +EXPORT_SYMBOL(zil_set_logbias); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs, zfs_, commit_timeout_pct, INT, ZMOD_RW, + "ZIL block open timeout percentage"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, replay_disable, INT, ZMOD_RW, + "Disable intent logging replay"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, nocacheflush, INT, ZMOD_RW, + "Disable ZIL cache flushes"); + +ZFS_MODULE_PARAM(zfs_zil, zil_, slog_bulk, ULONG, ZMOD_RW, + "Limit in bytes slog sync writes per commit"); + 
+ZFS_MODULE_PARAM(zfs_zil, zil_, maxblocksize, INT, ZMOD_RW,
+	"Limit in bytes of ZIL log block size");
+/* END CSTYLED */
diff --git a/module/zfs/zio.c b/module/zfs/zio.c
new file mode 100644
index 000000000000..2628cc029d49
--- /dev/null
+++ b/module/zfs/zio.c
@@ -0,0 +1,4969 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2011, 2019 by Delphix. All rights reserved.
+ * Copyright (c) 2011 Nexenta Systems, Inc. All rights reserved.
+ * Copyright (c) 2017, Intel Corporation.
+ * Copyright (c) 2019, Klara Inc.
+ * Copyright (c) 2019, Allan Jude
+ */
+
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/fm/fs/zfs.h>
+#include <sys/spa.h>
+#include <sys/txg.h>
+#include <sys/spa_impl.h>
+#include <sys/vdev_impl.h>
+#include <sys/vdev_trim.h>
+#include <sys/zio_impl.h>
+#include <sys/zio_compress.h>
+#include <sys/zio_checksum.h>
+#include <sys/dmu_objset.h>
+#include <sys/arc.h>
+#include <sys/ddt.h>
+#include <sys/blkptr.h>
+#include <sys/zfeature.h>
+#include <sys/dsl_scan.h>
+#include <sys/metaslab_impl.h>
+#include <sys/time.h>
+#include <sys/trace_zfs.h>
+#include <sys/abd.h>
+#include <sys/dsl_crypt.h>
+#include <sys/cityhash.h>
+
+/*
+ * ==========================================================================
+ * I/O type descriptions
+ * ==========================================================================
+ */
+const char *zio_type_name[ZIO_TYPES] = {
+	/*
+	 * Note: Linux kernel thread name length is limited
+	 * so these names will differ from upstream OpenZFS.
+	 */
+	"z_null", "z_rd", "z_wr", "z_fr", "z_cl", "z_ioctl", "z_trim"
+};
+
+int zio_dva_throttle_enabled = B_TRUE;
+int zio_deadman_log_all = B_FALSE;
+
+/*
+ * ==========================================================================
+ * I/O kmem caches
+ * ==========================================================================
+ */
+kmem_cache_t *zio_cache;
+kmem_cache_t *zio_link_cache;
+kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#if defined(ZFS_DEBUG) && !defined(_KERNEL)
+uint64_t zio_buf_cache_allocs[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+uint64_t zio_buf_cache_frees[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT];
+#endif
+
+/* Mark IOs as "slow" if they take longer than 30 seconds */
+int zio_slow_io_ms = (30 * MILLISEC);
+
+#define	BP_SPANB(indblkshift, level) \
+	(((uint64_t)1) << ((level) * ((indblkshift) - SPA_BLKPTRSHIFT)))
+#define	COMPARE_META_LEVEL	0x80000000ul
+/*
+ * The following actions directly affect the spa's sync-to-convergence logic.
+ * The values below define the sync pass when we start performing the action.
+ * Care should be taken when changing these values as they directly impact
+ * spa_sync() performance. Tuning these values may introduce subtle performance
+ * pathologies and should only be done in the context of performance analysis.
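+ *
+ * Worked example (hedged): with the defaults below, an ordinary
+ * (non-gang, non-dedup) free arriving in sync pass 1 runs immediately,
+ * while the same free in pass 2 or later is appended to
+ * spa_free_bplist[] and deferred, but only when the log space map
+ * feature is inactive (see zio_free()).  On Linux these values are
+ * also exposed as module parameters, e.g.
+ * /sys/module/zfs/parameters/zfs_sync_pass_dont_compress.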
+ * These tunables will eventually be removed and replaced with #defines once + * enough analysis has been done to determine optimal values. + * + * The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that + * regular blocks are not deferred. + * + * Starting in sync pass 8 (zfs_sync_pass_dont_compress), we disable + * compression (including of metadata). In practice, we don't have this + * many sync passes, so this has no effect. + * + * The original intent was that disabling compression would help the sync + * passes to converge. However, in practice disabling compression increases + * the average number of sync passes, because when we turn compression off, a + * lot of block's size will change and thus we have to re-allocate (not + * overwrite) them. It also increases the number of 128KB allocations (e.g. + * for indirect blocks and spacemaps) because these will not be compressed. + * The 128K allocations are especially detrimental to performance on highly + * fragmented systems, which may have very few free segments of this size, + * and may need to load new metaslabs to satisfy 128K allocations. + */ +int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */ +int zfs_sync_pass_dont_compress = 8; /* don't compress starting in this pass */ +int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */ + +/* + * An allocating zio is one that either currently has the DVA allocate + * stage set or will have it later in its lifetime. + */ +#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) + +/* + * Enable smaller cores by excluding metadata + * allocations as well. + */ +int zio_exclude_metadata = 0; +int zio_requeue_io_start_cut_in_line = 1; + +#ifdef ZFS_DEBUG +int zio_buf_debug_limit = 16384; +#else +int zio_buf_debug_limit = 0; +#endif + +static inline void __zio_execute(zio_t *zio); + +static void zio_taskq_dispatch(zio_t *, zio_taskq_type_t, boolean_t); + +void +zio_init(void) +{ + size_t c; + vmem_t *data_alloc_arena = NULL; + + zio_cache = kmem_cache_create("zio_cache", + sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_link_cache = kmem_cache_create("zio_link_cache", + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + + /* + * For small buffers, we want a cache for each multiple of + * SPA_MINBLOCKSIZE. For larger buffers, we want a cache + * for each quarter-power of 2. + */ + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { + size_t size = (c + 1) << SPA_MINBLOCKSHIFT; + size_t p2 = size; + size_t align = 0; + size_t data_cflags, cflags; + + data_cflags = KMC_NODEBUG; + cflags = (zio_exclude_metadata || size > zio_buf_debug_limit) ? + KMC_NODEBUG : 0; + +#if defined(_ILP32) && defined(_KERNEL) + /* + * Cache size limited to 1M on 32-bit platforms until ARC + * buffers no longer require virtual address space. + */ + if (size > zfs_max_recordsize) + break; +#endif + + while (!ISP2(p2)) + p2 &= p2 - 1; + +#ifndef _KERNEL + /* + * If we are using watchpoints, put each buffer on its own page, + * to eliminate the performance overhead of trapping to the + * kernel when modifying a non-watched buffer that shares the + * page with a watched buffer. + */ + if (arc_watch && !IS_P2ALIGNED(size, PAGESIZE)) + continue; + /* + * Here's the problem - on 4K native devices in userland on + * Linux using O_DIRECT, buffers must be 4K aligned or I/O + * will fail with EINVAL, causing zdb (and others) to coredump. 
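+		 * (Concretely, and as a hedged illustration of the failure
+		 * shape: a pread(2) through an O_DIRECT descriptor on a 4Kn
+		 * device fails with EINVAL whenever the buffer, offset, or
+		 * length is not 4K-aligned, before any I/O is attempted.)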
+ * Since userland probably doesn't need optimized buffer caches, + * we just force 4K alignment on everything. + */ + align = 8 * SPA_MINBLOCKSIZE; +#else + if (size < PAGESIZE) { + align = SPA_MINBLOCKSIZE; + } else if (IS_P2ALIGNED(size, p2 >> 2)) { + align = PAGESIZE; + } +#endif + + if (align != 0) { + char name[36]; + (void) snprintf(name, sizeof (name), "zio_buf_%lu", + (ulong_t)size); + zio_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, NULL, cflags); + + (void) snprintf(name, sizeof (name), "zio_data_buf_%lu", + (ulong_t)size); + zio_data_buf_cache[c] = kmem_cache_create(name, size, + align, NULL, NULL, NULL, NULL, + data_alloc_arena, data_cflags); + } + } + + while (--c != 0) { + ASSERT(zio_buf_cache[c] != NULL); + if (zio_buf_cache[c - 1] == NULL) + zio_buf_cache[c - 1] = zio_buf_cache[c]; + + ASSERT(zio_data_buf_cache[c] != NULL); + if (zio_data_buf_cache[c - 1] == NULL) + zio_data_buf_cache[c - 1] = zio_data_buf_cache[c]; + } + + zio_inject_init(); + + lz4_init(); +} + +void +zio_fini(void) +{ + size_t c; + kmem_cache_t *last_cache = NULL; + kmem_cache_t *last_data_cache = NULL; + + for (c = 0; c < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT; c++) { +#ifdef _ILP32 + /* + * Cache size limited to 1M on 32-bit platforms until ARC + * buffers no longer require virtual address space. + */ + if (((c + 1) << SPA_MINBLOCKSHIFT) > zfs_max_recordsize) + break; +#endif +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + if (zio_buf_cache_allocs[c] != zio_buf_cache_frees[c]) + (void) printf("zio_fini: [%d] %llu != %llu\n", + (int)((c + 1) << SPA_MINBLOCKSHIFT), + (long long unsigned)zio_buf_cache_allocs[c], + (long long unsigned)zio_buf_cache_frees[c]); +#endif + if (zio_buf_cache[c] != last_cache) { + last_cache = zio_buf_cache[c]; + kmem_cache_destroy(zio_buf_cache[c]); + } + zio_buf_cache[c] = NULL; + + if (zio_data_buf_cache[c] != last_data_cache) { + last_data_cache = zio_data_buf_cache[c]; + kmem_cache_destroy(zio_data_buf_cache[c]); + } + zio_data_buf_cache[c] = NULL; + } + + kmem_cache_destroy(zio_link_cache); + kmem_cache_destroy(zio_cache); + + zio_inject_fini(); + + lz4_fini(); +} + +/* + * ========================================================================== + * Allocate and free I/O buffers + * ========================================================================== + */ + +/* + * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a + * crashdump if the kernel panics, so use it judiciously. Obviously, it's + * useful to inspect ZFS metadata, but if possible, we should avoid keeping + * excess / transient data in-core during a crashdump. + */ +void * +zio_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_allocs[c], 1); +#endif + + return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); +} + +/* + * Use zio_data_buf_alloc to allocate data. The data will not appear in a + * crashdump if the kernel panics. This exists so that we will limit the amount + * of ZFS data that shows up in a kernel crashdump. 
(Thus reducing the amount + * of kernel heap dumped to disk when the kernel panics) + */ +void * +zio_data_buf_alloc(size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); +} + +void +zio_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); +#if defined(ZFS_DEBUG) && !defined(_KERNEL) + atomic_add_64(&zio_buf_cache_frees[c], 1); +#endif + + kmem_cache_free(zio_buf_cache[c], buf); +} + +void +zio_data_buf_free(void *buf, size_t size) +{ + size_t c = (size - 1) >> SPA_MINBLOCKSHIFT; + + VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + + kmem_cache_free(zio_data_buf_cache[c], buf); +} + +static void +zio_abd_free(void *abd, size_t size) +{ + abd_free((abd_t *)abd); +} + +/* + * ========================================================================== + * Push and pop I/O transform buffers + * ========================================================================== + */ +void +zio_push_transform(zio_t *zio, abd_t *data, uint64_t size, uint64_t bufsize, + zio_transform_func_t *transform) +{ + zio_transform_t *zt = kmem_alloc(sizeof (zio_transform_t), KM_SLEEP); + + zt->zt_orig_abd = zio->io_abd; + zt->zt_orig_size = zio->io_size; + zt->zt_bufsize = bufsize; + zt->zt_transform = transform; + + zt->zt_next = zio->io_transform_stack; + zio->io_transform_stack = zt; + + zio->io_abd = data; + zio->io_size = size; +} + +void +zio_pop_transforms(zio_t *zio) +{ + zio_transform_t *zt; + + while ((zt = zio->io_transform_stack) != NULL) { + if (zt->zt_transform != NULL) + zt->zt_transform(zio, + zt->zt_orig_abd, zt->zt_orig_size); + + if (zt->zt_bufsize != 0) + abd_free(zio->io_abd); + + zio->io_abd = zt->zt_orig_abd; + zio->io_size = zt->zt_orig_size; + zio->io_transform_stack = zt->zt_next; + + kmem_free(zt, sizeof (zio_transform_t)); + } +} + +/* + * ========================================================================== + * I/O transform callbacks for subblocks, decompression, and decryption + * ========================================================================== + */ +static void +zio_subblock(zio_t *zio, abd_t *data, uint64_t size) +{ + ASSERT(zio->io_size > size); + + if (zio->io_type == ZIO_TYPE_READ) + abd_copy(data, zio->io_abd, size); +} + +static void +zio_decompress(zio_t *zio, abd_t *data, uint64_t size) +{ + if (zio->io_error == 0) { + void *tmp = abd_borrow_buf(data, size); + int ret = zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), + zio->io_abd, tmp, zio->io_size, size, + &zio->io_prop.zp_complevel); + abd_return_buf_copy(data, tmp, size); + + if (zio_injection_enabled && ret == 0) + ret = zio_handle_fault_injection(zio, EINVAL); + + if (ret != 0) + zio->io_error = SET_ERROR(EIO); + } +} + +static void +zio_decrypt(zio_t *zio, abd_t *data, uint64_t size) +{ + int ret; + void *tmp; + blkptr_t *bp = zio->io_bp; + spa_t *spa = zio->io_spa; + uint64_t dsobj = zio->io_bookmark.zb_objset; + uint64_t lsize = BP_GET_LSIZE(bp); + dmu_object_type_t ot = BP_GET_TYPE(bp); + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + ASSERT(BP_USES_CRYPT(bp)); + ASSERT3U(size, !=, 0); + + if (zio->io_error != 0) + return; + + /* + * Verify the cksum of MACs stored in an indirect bp. It will always + * be possible to verify this since it does not require an encryption + * key. 
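+	 * (Hedged explanation: the stored MAC for an indirect block is a
+	 * plain hash computed over the MACs embedded in its child block
+	 * pointers, so it can be checked with no wrapping key loaded.)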
+ */ + if (BP_HAS_INDIRECT_MAC_CKSUM(bp)) { + zio_crypt_decode_mac_bp(bp, mac); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF) { + /* + * We haven't decompressed the data yet, but + * zio_crypt_do_indirect_mac_checksum() requires + * decompressed data to be able to parse out the MACs + * from the indirect block. We decompress it now and + * throw away the result after we are finished. + */ + tmp = zio_buf_alloc(lsize); + ret = zio_decompress_data(BP_GET_COMPRESS(bp), + zio->io_abd, tmp, zio->io_size, lsize, + &zio->io_prop.zp_complevel); + if (ret != 0) { + ret = SET_ERROR(EIO); + goto error; + } + ret = zio_crypt_do_indirect_mac_checksum(B_FALSE, + tmp, lsize, BP_SHOULD_BYTESWAP(bp), mac); + zio_buf_free(tmp, lsize); + } else { + ret = zio_crypt_do_indirect_mac_checksum_abd(B_FALSE, + zio->io_abd, size, BP_SHOULD_BYTESWAP(bp), mac); + } + abd_copy(data, zio->io_abd, size); + + if (zio_injection_enabled && ot != DMU_OT_DNODE && ret == 0) { + ret = zio_handle_decrypt_injection(spa, + &zio->io_bookmark, ot, ECKSUM); + } + if (ret != 0) + goto error; + + return; + } + + /* + * If this is an authenticated block, just check the MAC. It would be + * nice to separate this out into its own flag, but for the moment + * enum zio_flag is out of bits. + */ + if (BP_IS_AUTHENTICATED(bp)) { + if (ot == DMU_OT_OBJSET) { + ret = spa_do_crypt_objset_mac_abd(B_FALSE, spa, + dsobj, zio->io_abd, size, BP_SHOULD_BYTESWAP(bp)); + } else { + zio_crypt_decode_mac_bp(bp, mac); + ret = spa_do_crypt_mac_abd(B_FALSE, spa, dsobj, + zio->io_abd, size, mac); + if (zio_injection_enabled && ret == 0) { + ret = zio_handle_decrypt_injection(spa, + &zio->io_bookmark, ot, ECKSUM); + } + } + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + } + + zio_crypt_decode_params_bp(bp, salt, iv); + + if (ot == DMU_OT_INTENT_LOG) { + tmp = abd_borrow_buf_copy(zio->io_abd, sizeof (zil_chain_t)); + zio_crypt_decode_mac_zil(tmp, mac); + abd_return_buf(zio->io_abd, tmp, sizeof (zil_chain_t)); + } else { + zio_crypt_decode_mac_bp(bp, mac); + } + + ret = spa_do_crypt_abd(B_FALSE, spa, &zio->io_bookmark, BP_GET_TYPE(bp), + BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), salt, iv, mac, size, data, + zio->io_abd, &no_crypt); + if (no_crypt) + abd_copy(data, zio->io_abd, size); + + if (ret != 0) + goto error; + + return; + +error: + /* assert that the key was found unless this was speculative */ + ASSERT(ret != EACCES || (zio->io_flags & ZIO_FLAG_SPECULATIVE)); + + /* + * If there was a decryption / authentication error return EIO as + * the io_error. If this was not a speculative zio, create an ereport. + */ + if (ret == ECKSUM) { + zio->io_error = SET_ERROR(EIO); + if ((zio->io_flags & ZIO_FLAG_SPECULATIVE) == 0) { + spa_log_error(spa, &zio->io_bookmark); + zfs_ereport_post(FM_EREPORT_ZFS_AUTHENTICATION, + spa, NULL, &zio->io_bookmark, zio, 0, 0); + } + } else { + zio->io_error = ret; + } +} + +/* + * ========================================================================== + * I/O parent/child relationships and pipeline interlocks + * ========================================================================== + */ +zio_t * +zio_walk_parents(zio_t *cio, zio_link_t **zl) +{ + list_t *pl = &cio->io_parent_list; + + *zl = (*zl == NULL) ? 
list_head(pl) : list_next(pl, *zl); + if (*zl == NULL) + return (NULL); + + ASSERT((*zl)->zl_child == cio); + return ((*zl)->zl_parent); +} + +zio_t * +zio_walk_children(zio_t *pio, zio_link_t **zl) +{ + list_t *cl = &pio->io_child_list; + + ASSERT(MUTEX_HELD(&pio->io_lock)); + + *zl = (*zl == NULL) ? list_head(cl) : list_next(cl, *zl); + if (*zl == NULL) + return (NULL); + + ASSERT((*zl)->zl_parent == pio); + return ((*zl)->zl_child); +} + +zio_t * +zio_unique_parent(zio_t *cio) +{ + zio_link_t *zl = NULL; + zio_t *pio = zio_walk_parents(cio, &zl); + + VERIFY3P(zio_walk_parents(cio, &zl), ==, NULL); + return (pio); +} + +void +zio_add_child(zio_t *pio, zio_t *cio) +{ + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. + */ + ASSERT3S(cio->io_child_type, <=, pio->io_child_type); + + zl->zl_parent = pio; + zl->zl_child = cio; + + mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + list_insert_head(&cio->io_parent_list, zl); + + pio->io_child_count++; + cio->io_parent_count++; + + mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); +} + +static void +zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) +{ + ASSERT(zl->zl_parent == pio); + ASSERT(zl->zl_child == cio); + + mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); + + list_remove(&pio->io_child_list, zl); + list_remove(&cio->io_parent_list, zl); + + pio->io_child_count--; + cio->io_parent_count--; + + mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); + kmem_cache_free(zio_link_cache, zl); +} + +static boolean_t +zio_wait_for_children(zio_t *zio, uint8_t childbits, enum zio_wait_type wait) +{ + boolean_t waiting = B_FALSE; + + mutex_enter(&zio->io_lock); + ASSERT(zio->io_stall == NULL); + for (int c = 0; c < ZIO_CHILD_TYPES; c++) { + if (!(ZIO_CHILD_BIT_IS_SET(childbits, c))) + continue; + + uint64_t *countp = &zio->io_children[c][wait]; + if (*countp != 0) { + zio->io_stage >>= 1; + ASSERT3U(zio->io_stage, !=, ZIO_STAGE_OPEN); + zio->io_stall = countp; + waiting = B_TRUE; + break; + } + } + mutex_exit(&zio->io_lock); + return (waiting); +} + +__attribute__((always_inline)) +static inline void +zio_notify_parent(zio_t *pio, zio_t *zio, enum zio_wait_type wait, + zio_t **next_to_executep) +{ + uint64_t *countp = &pio->io_children[zio->io_child_type][wait]; + int *errorp = &pio->io_child_error[zio->io_child_type]; + + mutex_enter(&pio->io_lock); + if (zio->io_error && !(zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + *errorp = zio_worst_error(*errorp, zio->io_error); + pio->io_reexecute |= zio->io_reexecute; + ASSERT3U(*countp, >, 0); + + (*countp)--; + + if (*countp == 0 && pio->io_stall == countp) { + zio_taskq_type_t type = + pio->io_stage < ZIO_STAGE_VDEV_IO_START ? ZIO_TASKQ_ISSUE : + ZIO_TASKQ_INTERRUPT; + pio->io_stall = NULL; + mutex_exit(&pio->io_lock); + + /* + * If we can tell the caller to execute this parent next, do + * so. Otherwise dispatch the parent zio as its own task. + * + * Having the caller execute the parent when possible reduces + * locking on the zio taskq's, reduces context switch + * overhead, and has no recursion penalty. 
Note that one + * read from disk typically causes at least 3 zio's: a + * zio_null(), the logical zio_read(), and then a physical + * zio. When the physical ZIO completes, we are able to call + * zio_done() on all 3 of these zio's from one invocation of + * zio_execute() by returning the parent back to + * zio_execute(). Since the parent isn't executed until this + * thread returns back to zio_execute(), the caller should do + * so promptly. + * + * In other cases, dispatching the parent prevents + * overflowing the stack when we have deeply nested + * parent-child relationships, as we do with the "mega zio" + * of writes for spa_sync(), and the chain of ZIL blocks. + */ + if (next_to_executep != NULL && *next_to_executep == NULL) { + *next_to_executep = pio; + } else { + zio_taskq_dispatch(pio, type, B_FALSE); + } + } else { + mutex_exit(&pio->io_lock); + } +} + +static void +zio_inherit_child_errors(zio_t *zio, enum zio_child c) +{ + if (zio->io_child_error[c] != 0 && zio->io_error == 0) + zio->io_error = zio->io_child_error[c]; +} + +int +zio_bookmark_compare(const void *x1, const void *x2) +{ + const zio_t *z1 = x1; + const zio_t *z2 = x2; + + if (z1->io_bookmark.zb_objset < z2->io_bookmark.zb_objset) + return (-1); + if (z1->io_bookmark.zb_objset > z2->io_bookmark.zb_objset) + return (1); + + if (z1->io_bookmark.zb_object < z2->io_bookmark.zb_object) + return (-1); + if (z1->io_bookmark.zb_object > z2->io_bookmark.zb_object) + return (1); + + if (z1->io_bookmark.zb_level < z2->io_bookmark.zb_level) + return (-1); + if (z1->io_bookmark.zb_level > z2->io_bookmark.zb_level) + return (1); + + if (z1->io_bookmark.zb_blkid < z2->io_bookmark.zb_blkid) + return (-1); + if (z1->io_bookmark.zb_blkid > z2->io_bookmark.zb_blkid) + return (1); + + if (z1 < z2) + return (-1); + if (z1 > z2) + return (1); + + return (0); +} + +/* + * ========================================================================== + * Create the various types of I/O (read, write, free, etc) + * ========================================================================== + */ +static zio_t * +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + abd_t *data, uint64_t lsize, uint64_t psize, zio_done_func_t *done, + void *private, zio_type_t type, zio_priority_t priority, + enum zio_flag flags, vdev_t *vd, uint64_t offset, + const zbookmark_phys_t *zb, enum zio_stage stage, + enum zio_stage pipeline) +{ + zio_t *zio; + + IMPLY(type != ZIO_TYPE_TRIM, psize <= SPA_MAXBLOCKSIZE); + ASSERT(P2PHASE(psize, SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(offset, SPA_MINBLOCKSIZE) == 0); + + ASSERT(!vd || spa_config_held(spa, SCL_STATE_ALL, RW_READER)); + ASSERT(!bp || !(flags & ZIO_FLAG_CONFIG_WRITER)); + ASSERT(vd || stage == ZIO_STAGE_OPEN); + + IMPLY(lsize != psize, (flags & ZIO_FLAG_RAW_COMPRESS) != 0); + + zio = kmem_cache_alloc(zio_cache, KM_SLEEP); + bzero(zio, sizeof (zio_t)); + + mutex_init(&zio->io_lock, NULL, MUTEX_NOLOCKDEP, NULL); + cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + metaslab_trace_init(&zio->io_alloc_list); + + if (vd != NULL) + zio->io_child_type = ZIO_CHILD_VDEV; + else if (flags & ZIO_FLAG_GANG_CHILD) + zio->io_child_type = ZIO_CHILD_GANG; + else if (flags & ZIO_FLAG_DDT_CHILD) + zio->io_child_type = ZIO_CHILD_DDT; + else + zio->io_child_type = ZIO_CHILD_LOGICAL; + + if (bp != NULL) { + zio->io_bp = 
(blkptr_t *)bp; + zio->io_bp_copy = *bp; + zio->io_bp_orig = *bp; + if (type != ZIO_TYPE_WRITE || + zio->io_child_type == ZIO_CHILD_DDT) + zio->io_bp = &zio->io_bp_copy; /* so caller can free */ + if (zio->io_child_type == ZIO_CHILD_LOGICAL) + zio->io_logical = zio; + if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; + } + + zio->io_spa = spa; + zio->io_txg = txg; + zio->io_done = done; + zio->io_private = private; + zio->io_type = type; + zio->io_priority = priority; + zio->io_vd = vd; + zio->io_offset = offset; + zio->io_orig_abd = zio->io_abd = data; + zio->io_orig_size = zio->io_size = psize; + zio->io_lsize = lsize; + zio->io_orig_flags = zio->io_flags = flags; + zio->io_orig_stage = zio->io_stage = stage; + zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_pipeline_trace = ZIO_STAGE_OPEN; + + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); + + if (zb != NULL) + zio->io_bookmark = *zb; + + if (pio != NULL) { + if (zio->io_metaslab_class == NULL) + zio->io_metaslab_class = pio->io_metaslab_class; + if (zio->io_logical == NULL) + zio->io_logical = pio->io_logical; + if (zio->io_child_type == ZIO_CHILD_GANG) + zio->io_gang_leader = pio->io_gang_leader; + zio_add_child(pio, zio); + } + + taskq_init_ent(&zio->io_tqent); + + return (zio); +} + +static void +zio_destroy(zio_t *zio) +{ + metaslab_trace_fini(&zio->io_alloc_list); + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); + mutex_destroy(&zio->io_lock); + cv_destroy(&zio->io_cv); + kmem_cache_free(zio_cache, zio); +} + +zio_t * +zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, + void *private, enum zio_flag flags) +{ + zio_t *zio; + + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); + + return (zio); +} + +zio_t * +zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) +{ + return (zio_null(NULL, spa, NULL, done, private, flags)); +} + +static int +zfs_blkptr_verify_log(spa_t *spa, const blkptr_t *bp, + enum blk_verify_flag blk_verify, const char *fmt, ...) +{ + va_list adx; + char buf[256]; + + va_start(adx, fmt); + (void) vsnprintf(buf, sizeof (buf), fmt, adx); + va_end(adx); + + switch (blk_verify) { + case BLK_VERIFY_HALT: + dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); + zfs_panic_recover("%s: %s", spa_name(spa), buf); + break; + case BLK_VERIFY_LOG: + zfs_dbgmsg("%s: %s", spa_name(spa), buf); + break; + case BLK_VERIFY_ONLY: + break; + } + + return (1); +} + +/* + * Verify the block pointer fields contain reasonable values. This means + * it only contains known object types, checksum/compression identifiers, + * block sizes within the maximum allowed limits, valid DVAs, etc. + * + * If everything checks out B_TRUE is returned. The zfs_blkptr_verify + * argument controls the behavior when an invalid field is detected. 
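+ *
+ * A hedged call sketch, matching how zio_read() below uses it:
+ *
+ *	(void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER,
+ *	    BLK_VERIFY_HALT);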
+ * + * Modes for zfs_blkptr_verify: + * 1) BLK_VERIFY_ONLY (evaluate the block) + * 2) BLK_VERIFY_LOG (evaluate the block and log problems) + * 3) BLK_VERIFY_HALT (call zfs_panic_recover on error) + */ +boolean_t +zfs_blkptr_verify(spa_t *spa, const blkptr_t *bp, boolean_t config_held, + enum blk_verify_flag blk_verify) +{ + int errors = 0; + + if (!DMU_OT_IS_VALID(BP_GET_TYPE(bp))) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid TYPE %llu", + bp, (longlong_t)BP_GET_TYPE(bp)); + } + if (BP_GET_CHECKSUM(bp) >= ZIO_CHECKSUM_FUNCTIONS || + BP_GET_CHECKSUM(bp) <= ZIO_CHECKSUM_ON) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid CHECKSUM %llu", + bp, (longlong_t)BP_GET_CHECKSUM(bp)); + } + if (BP_GET_COMPRESS(bp) >= ZIO_COMPRESS_FUNCTIONS || + BP_GET_COMPRESS(bp) <= ZIO_COMPRESS_ON) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid COMPRESS %llu", + bp, (longlong_t)BP_GET_COMPRESS(bp)); + } + if (BP_GET_LSIZE(bp) > SPA_MAXBLOCKSIZE) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid LSIZE %llu", + bp, (longlong_t)BP_GET_LSIZE(bp)); + } + if (BP_GET_PSIZE(bp) > SPA_MAXBLOCKSIZE) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid PSIZE %llu", + bp, (longlong_t)BP_GET_PSIZE(bp)); + } + + if (BP_IS_EMBEDDED(bp)) { + if (BPE_GET_ETYPE(bp) >= NUM_BP_EMBEDDED_TYPES) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p has invalid ETYPE %llu", + bp, (longlong_t)BPE_GET_ETYPE(bp)); + } + } + + /* + * Do not verify individual DVAs if the config is not trusted. This + * will be done once the zio is executed in vdev_mirror_map_alloc. + */ + if (!spa->spa_trust_config) + return (B_TRUE); + + if (!config_held) + spa_config_enter(spa, SCL_VDEV, bp, RW_READER); + else + ASSERT(spa_config_held(spa, SCL_VDEV, RW_WRITER)); + /* + * Pool-specific checks. + * + * Note: it would be nice to verify that the blk_birth and + * BP_PHYSICAL_BIRTH() are not too large. However, spa_freeze() + * allows the birth time of log blocks (and dmu_sync()-ed blocks + * that are in the log) to be arbitrarily large. + */ + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + uint64_t vdevid = DVA_GET_VDEV(&bp->blk_dva[i]); + + if (vdevid >= spa->spa_root_vdev->vdev_children) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has invalid VDEV %llu", + bp, i, (longlong_t)vdevid); + continue; + } + vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; + if (vd == NULL) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has invalid VDEV %llu", + bp, i, (longlong_t)vdevid); + continue; + } + if (vd->vdev_ops == &vdev_hole_ops) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has hole VDEV %llu", + bp, i, (longlong_t)vdevid); + continue; + } + if (vd->vdev_ops == &vdev_missing_ops) { + /* + * "missing" vdevs are valid during import, but we + * don't have their detailed info (e.g. asize), so + * we can't perform any more checks on them. 
+ */ + continue; + } + uint64_t offset = DVA_GET_OFFSET(&bp->blk_dva[i]); + uint64_t asize = DVA_GET_ASIZE(&bp->blk_dva[i]); + if (BP_IS_GANG(bp)) + asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + if (offset + asize > vd->vdev_asize) { + errors += zfs_blkptr_verify_log(spa, bp, blk_verify, + "blkptr at %p DVA %u has invalid OFFSET %llu", + bp, i, (longlong_t)offset); + } + } + if (errors > 0) + dprintf_bp(bp, "blkptr at %p dprintf_bp():", bp); + if (!config_held) + spa_config_exit(spa, SCL_VDEV, bp); + + return (errors == 0); +} + +boolean_t +zfs_dva_valid(spa_t *spa, const dva_t *dva, const blkptr_t *bp) +{ + uint64_t vdevid = DVA_GET_VDEV(dva); + + if (vdevid >= spa->spa_root_vdev->vdev_children) + return (B_FALSE); + + vdev_t *vd = spa->spa_root_vdev->vdev_child[vdevid]; + if (vd == NULL) + return (B_FALSE); + + if (vd->vdev_ops == &vdev_hole_ops) + return (B_FALSE); + + if (vd->vdev_ops == &vdev_missing_ops) { + return (B_FALSE); + } + + uint64_t offset = DVA_GET_OFFSET(dva); + uint64_t asize = DVA_GET_ASIZE(dva); + + if (BP_IS_GANG(bp)) + asize = vdev_psize_to_asize(vd, SPA_GANGBLOCKSIZE); + if (offset + asize > vd->vdev_asize) + return (B_FALSE); + + return (B_TRUE); +} + +zio_t * +zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, + abd_t *data, uint64_t size, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, const zbookmark_phys_t *zb) +{ + zio_t *zio; + + (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, + BLK_VERIFY_HALT); + + zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, + data, size, size, done, private, + ZIO_TYPE_READ, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); + + return (zio); +} + +zio_t * +zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, + abd_t *data, uint64_t lsize, uint64_t psize, const zio_prop_t *zp, + zio_done_func_t *ready, zio_done_func_t *children_ready, + zio_done_func_t *physdone, zio_done_func_t *done, + void *private, zio_priority_t priority, enum zio_flag flags, + const zbookmark_phys_t *zb) +{ + zio_t *zio; + + ASSERT(zp->zp_checksum >= ZIO_CHECKSUM_OFF && + zp->zp_checksum < ZIO_CHECKSUM_FUNCTIONS && + zp->zp_compress >= ZIO_COMPRESS_OFF && + zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && + DMU_OT_IS_VALID(zp->zp_type) && + zp->zp_level < 32 && + zp->zp_copies > 0 && + zp->zp_copies <= spa_max_replication(spa)); + + zio = zio_create(pio, spa, txg, bp, data, lsize, psize, done, private, + ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); + + zio->io_ready = ready; + zio->io_children_ready = children_ready; + zio->io_physdone = physdone; + zio->io_prop = *zp; + + /* + * Data can be NULL if we are going to call zio_write_override() to + * provide the already-allocated BP. But we may need the data to + * verify a dedup hit (if requested). In this case, don't try to + * dedup (just take the already-allocated BP verbatim). Encrypted + * dedup blocks need data as well so we also disable dedup in this + * case. 
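+	 *
+	 * (Hedged sketch of the override pattern this supports, cf.
+	 * zio_write_override() below:
+	 *
+	 *	zio = zio_write(pio, spa, txg, bp, NULL, size, size, &zp,
+	 *	    ready, NULL, NULL, done, priv, prio, flags, zb);
+	 *	zio_write_override(zio, &dr_overridden_by, copies, nopwrite);
+	 *
+	 * so the already-allocated bp is adopted instead of allocated.)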
+ */ + if (data == NULL && + (zio->io_prop.zp_dedup_verify || zio->io_prop.zp_encrypt)) { + zio->io_prop.zp_dedup = zio->io_prop.zp_dedup_verify = B_FALSE; + } + + return (zio); +} + +zio_t * +zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, abd_t *data, + uint64_t size, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, zbookmark_phys_t *zb) +{ + zio_t *zio; + + zio = zio_create(pio, spa, txg, bp, data, size, size, done, private, + ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_IO_REWRITE, NULL, 0, zb, + ZIO_STAGE_OPEN, ZIO_REWRITE_PIPELINE); + + return (zio); +} + +void +zio_write_override(zio_t *zio, blkptr_t *bp, int copies, boolean_t nopwrite) +{ + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + + /* + * We must reset the io_prop to match the values that existed + * when the bp was first written by dmu_sync() keeping in mind + * that nopwrite and dedup are mutually exclusive. + */ + zio->io_prop.zp_dedup = nopwrite ? B_FALSE : zio->io_prop.zp_dedup; + zio->io_prop.zp_nopwrite = nopwrite; + zio->io_prop.zp_copies = copies; + zio->io_bp_override = bp; +} + +void +zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) +{ + + (void) zfs_blkptr_verify(spa, bp, B_FALSE, BLK_VERIFY_HALT); + + /* + * The check for EMBEDDED is a performance optimization. We + * process the free here (by ignoring it) rather than + * putting it on the list and then processing it in zio_free_sync(). + */ + if (BP_IS_EMBEDDED(bp)) + return; + metaslab_check_free(spa, bp); + + /* + * Frees that are for the currently-syncing txg, are not going to be + * deferred, and which will not need to do a read (i.e. not GANG or + * DEDUP), can be processed immediately. Otherwise, put them on the + * in-memory list for later processing. + * + * Note that we only defer frees after zfs_sync_pass_deferred_free + * when the log space map feature is disabled. [see relevant comment + * in spa_sync_iterate_to_convergence()] + */ + if (BP_IS_GANG(bp) || + BP_GET_DEDUP(bp) || + txg != spa->spa_syncing_txg || + (spa_sync_pass(spa) >= zfs_sync_pass_deferred_free && + !spa_feature_is_active(spa, SPA_FEATURE_LOG_SPACEMAP))) { + bplist_append(&spa->spa_free_bplist[txg & TXG_MASK], bp); + } else { + VERIFY3P(zio_free_sync(NULL, spa, txg, bp, 0), ==, NULL); + } +} + +/* + * To improve performance, this function may return NULL if we were able + * to do the free immediately. This avoids the cost of creating a zio + * (and linking it to the parent, etc). + */ +zio_t * +zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + enum zio_flag flags) +{ + ASSERT(!BP_IS_HOLE(bp)); + ASSERT(spa_syncing_txg(spa) == txg); + + if (BP_IS_EMBEDDED(bp)) + return (NULL); + + metaslab_check_free(spa, bp); + arc_freed(spa, bp); + dsl_scan_freed(spa, bp); + + if (BP_IS_GANG(bp) || BP_GET_DEDUP(bp)) { + /* + * GANG and DEDUP blocks can induce a read (for the gang block + * header, or the DDT), so issue them asynchronously so that + * this thread is not tied up. 
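+		 * (Callers accordingly handle both outcomes; hedged sketch:
+		 * zio_free() above VERIFYs a NULL return on its fast path,
+		 * while async callers zio_nowait() whatever non-NULL zio
+		 * comes back.)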
+ */ + enum zio_stage stage = + ZIO_FREE_PIPELINE | ZIO_STAGE_ISSUE_ASYNC; + + return (zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + BP_GET_PSIZE(bp), NULL, NULL, + ZIO_TYPE_FREE, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, stage)); + } else { + metaslab_free(spa, bp, txg, B_FALSE); + return (NULL); + } +} + +zio_t * +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags) +{ + zio_t *zio; + + (void) zfs_blkptr_verify(spa, bp, flags & ZIO_FLAG_CONFIG_WRITER, + BLK_VERIFY_HALT); + + if (BP_IS_EMBEDDED(bp)) + return (zio_null(pio, spa, NULL, NULL, NULL, 0)); + + /* + * A claim is an allocation of a specific block. Claims are needed + * to support immediate writes in the intent log. The issue is that + * immediate writes contain committed data, but in a txg that was + * *not* committed. Upon opening the pool after an unclean shutdown, + * the intent log claims all blocks that contain immediate write data + * so that the SPA knows they're in use. + * + * All claims *must* be resolved in the first txg -- before the SPA + * starts allocating blocks -- so that nothing is allocated twice. + * If txg == 0 we just verify that the block is claimable. + */ + ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, + spa_min_claim_txg(spa)); + ASSERT(txg == spa_min_claim_txg(spa) || txg == 0); + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ + + zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), + BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, + flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_CLAIM_PIPELINE); + ASSERT0(zio->io_queued_timestamp); + + return (zio); +} + +zio_t * +zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, + zio_done_func_t *done, void *private, enum zio_flag flags) +{ + zio_t *zio; + int c; + + if (vd->vdev_children == 0) { + zio = zio_create(pio, spa, 0, NULL, NULL, 0, 0, done, private, + ZIO_TYPE_IOCTL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, + ZIO_STAGE_OPEN, ZIO_IOCTL_PIPELINE); + + zio->io_cmd = cmd; + } else { + zio = zio_null(pio, spa, NULL, NULL, NULL, flags); + + for (c = 0; c < vd->vdev_children; c++) + zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, + done, private, flags)); + } + + return (zio); +} + +zio_t * +zio_trim(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + zio_done_func_t *done, void *private, zio_priority_t priority, + enum zio_flag flags, enum trim_flag trim_flags) +{ + zio_t *zio; + + ASSERT0(vd->vdev_children); + ASSERT0(P2PHASE(offset, 1ULL << vd->vdev_ashift)); + ASSERT0(P2PHASE(size, 1ULL << vd->vdev_ashift)); + ASSERT3U(size, !=, 0); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, NULL, size, size, done, + private, ZIO_TYPE_TRIM, priority, flags | ZIO_FLAG_PHYSICAL, + vd, offset, NULL, ZIO_STAGE_OPEN, ZIO_TRIM_PIPELINE); + zio->io_trim_flags = trim_flags; + + return (zio); +} + +zio_t * +zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + abd_t *data, int checksum, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, boolean_t labels) +{ + zio_t *zio; + + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_READ, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_READ_PHYS_PIPELINE); + + 
zio->io_prop.zp_checksum = checksum; + + return (zio); +} + +zio_t * +zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, + abd_t *data, int checksum, zio_done_func_t *done, void *private, + zio_priority_t priority, enum zio_flag flags, boolean_t labels) +{ + zio_t *zio; + + ASSERT(vd->vdev_children == 0); + ASSERT(!labels || offset + size <= VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE); + ASSERT3U(offset + size, <=, vd->vdev_psize); + + zio = zio_create(pio, vd->vdev_spa, 0, NULL, data, size, size, done, + private, ZIO_TYPE_WRITE, priority, flags | ZIO_FLAG_PHYSICAL, vd, + offset, NULL, ZIO_STAGE_OPEN, ZIO_WRITE_PHYS_PIPELINE); + + zio->io_prop.zp_checksum = checksum; + + if (zio_checksum_table[checksum].ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + /* + * zec checksums are necessarily destructive -- they modify + * the end of the write buffer to hold the verifier/checksum. + * Therefore, we must make a local copy in case the data is + * being written to multiple places in parallel. + */ + abd_t *wbuf = abd_alloc_sametype(data, size); + abd_copy(wbuf, data, size); + + zio_push_transform(zio, wbuf, size, size, NULL); + } + + return (zio); +} + +/* + * Create a child I/O to do some work for us. + */ +zio_t * +zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, + abd_t *data, uint64_t size, int type, zio_priority_t priority, + enum zio_flag flags, zio_done_func_t *done, void *private) +{ + enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; + zio_t *zio; + + /* + * vdev child I/Os do not propagate their error to the parent. + * Therefore, for correct operation the caller *must* check for + * and handle the error in the child i/o's done callback. + * The only exceptions are i/os that we don't care about + * (OPTIONAL or REPAIR). + */ + ASSERT((flags & ZIO_FLAG_OPTIONAL) || (flags & ZIO_FLAG_IO_REPAIR) || + done != NULL); + + if (type == ZIO_TYPE_READ && bp != NULL) { + /* + * If we have the bp, then the child should perform the + * checksum and the parent need not. This pushes error + * detection as close to the leaves as possible and + * eliminates redundant checksums in the interior nodes. + */ + pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; + pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; + } + + if (vd->vdev_ops->vdev_op_leaf) { + ASSERT0(vd->vdev_children); + offset += VDEV_LABEL_START_SIZE; + } + + flags |= ZIO_VDEV_CHILD_FLAGS(pio); + + /* + * If we've decided to do a repair, the write is not speculative -- + * even if the original read was. + */ + if (flags & ZIO_FLAG_IO_REPAIR) + flags &= ~ZIO_FLAG_SPECULATIVE; + + /* + * If we're creating a child I/O that is not associated with a + * top-level vdev, then the child zio is not an allocating I/O. + * If this is a retried I/O then we ignore it since we will + * have already processed the original allocating I/O. 
+ */ + if (flags & ZIO_FLAG_IO_ALLOCATING && + (vd != vd->vdev_top || (flags & ZIO_FLAG_IO_RETRY))) { + ASSERT(pio->io_metaslab_class != NULL); + ASSERT(pio->io_metaslab_class->mc_alloc_throttle_enabled); + ASSERT(type == ZIO_TYPE_WRITE); + ASSERT(priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(!(flags & ZIO_FLAG_IO_REPAIR)); + ASSERT(!(pio->io_flags & ZIO_FLAG_IO_REWRITE) || + pio->io_child_type == ZIO_CHILD_GANG); + + flags &= ~ZIO_FLAG_IO_ALLOCATING; + } + + + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, size, + done, private, type, priority, flags, vd, offset, &pio->io_bookmark, + ZIO_STAGE_VDEV_IO_START >> 1, pipeline); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + + zio->io_physdone = pio->io_physdone; + if (vd->vdev_ops->vdev_op_leaf && zio->io_logical != NULL) + zio->io_logical->io_phys_children++; + + return (zio); +} + +zio_t * +zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, abd_t *data, uint64_t size, + zio_type_t type, zio_priority_t priority, enum zio_flag flags, + zio_done_func_t *done, void *private) +{ + zio_t *zio; + + ASSERT(vd->vdev_ops->vdev_op_leaf); + + zio = zio_create(NULL, vd->vdev_spa, 0, NULL, + data, size, size, done, private, type, priority, + flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY | ZIO_FLAG_DELEGATED, + vd, offset, NULL, + ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); + + return (zio); +} + +void +zio_flush(zio_t *zio, vdev_t *vd) +{ + zio_nowait(zio_ioctl(zio, zio->io_spa, vd, DKIOCFLUSHWRITECACHE, + NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); +} + +void +zio_shrink(zio_t *zio, uint64_t size) +{ + ASSERT3P(zio->io_executor, ==, NULL); + ASSERT3U(zio->io_orig_size, ==, zio->io_size); + ASSERT3U(size, <=, zio->io_size); + + /* + * We don't shrink for raidz because of problems with the + * reconstruction when reading back less than the block size. + * Note, BP_IS_RAIDZ() assumes no compression. + */ + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + if (!BP_IS_RAIDZ(zio->io_bp)) { + /* we are not doing a raw write */ + ASSERT3U(zio->io_size, ==, zio->io_lsize); + zio->io_orig_size = zio->io_size = zio->io_lsize = size; + } +} + +/* + * ========================================================================== + * Prepare to read and write logical blocks + * ========================================================================== + */ + +static zio_t * +zio_read_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + uint64_t psize = + BP_IS_EMBEDDED(bp) ? 
BPE_GET_PSIZE(bp) : BP_GET_PSIZE(bp); + + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + zio->io_child_type == ZIO_CHILD_LOGICAL && + !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decompress); + } + + if (((BP_IS_PROTECTED(bp) && !(zio->io_flags & ZIO_FLAG_RAW_ENCRYPT)) || + BP_HAS_INDIRECT_MAC_CKSUM(bp)) && + zio->io_child_type == ZIO_CHILD_LOGICAL) { + zio_push_transform(zio, abd_alloc_sametype(zio->io_abd, psize), + psize, psize, zio_decrypt); + } + + if (BP_IS_EMBEDDED(bp) && BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA) { + int psize = BPE_GET_PSIZE(bp); + void *data = abd_borrow_buf(zio->io_abd, psize); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + decode_embedded_bp_compressed(bp, data); + abd_return_buf_copy(zio->io_abd, data, psize); + } else { + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + } + + if (!DMU_OT_IS_METADATA(BP_GET_TYPE(bp)) && BP_GET_LEVEL(bp) == 0) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) + zio->io_pipeline = ZIO_DDT_READ_PIPELINE; + + return (zio); +} + +static zio_t * +zio_write_bp_init(zio_t *zio) +{ + if (!IO_IS_ALLOCATING(zio)) + return (zio); + + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + + if (zio->io_bp_override) { + blkptr_t *bp = zio->io_bp; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + + *bp = *zio->io_bp_override; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (BP_IS_EMBEDDED(bp)) + return (zio); + + /* + * If we've been overridden and nopwrite is set then + * set the flag accordingly to indicate that a nopwrite + * has already occurred. + */ + if (!BP_IS_HOLE(bp) && zp->zp_nopwrite) { + ASSERT(!zp->zp_dedup); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, zp->zp_checksum); + zio->io_flags |= ZIO_FLAG_NOPWRITE; + return (zio); + } + + ASSERT(!zp->zp_nopwrite); + + if (BP_IS_HOLE(bp) || !zp->zp_dedup) + return (zio); + + ASSERT((zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || zp->zp_dedup_verify); + + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum && + !zp->zp_encrypt) { + BP_SET_DEDUP(bp, 1); + zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; + return (zio); + } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + } + + return (zio); +} + +static zio_t * +zio_write_compress(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_prop_t *zp = &zio->io_prop; + enum zio_compress compress = zp->zp_compress; + blkptr_t *bp = zio->io_bp; + uint64_t lsize = zio->io_lsize; + uint64_t psize = zio->io_size; + int pass = 1; + + /* + * If our children haven't all reached the ready stage, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_LOGICAL_BIT | + ZIO_CHILD_GANG_BIT, ZIO_WAIT_READY)) { + return (NULL); + } + + if (!IO_IS_ALLOCATING(zio)) + return (zio); + + if (zio->io_children_ready != NULL) { + /* + * Now that all our children are ready, run the callback + * associated with this zio in case it wants to modify the + * data to be written. 
+ */ + ASSERT3U(zp->zp_level, >, 0); + zio->io_children_ready(zio); + } + + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + ASSERT(zio->io_bp_override == NULL); + + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg) { + /* + * We're rewriting an existing block, which means we're + * working on behalf of spa_sync(). For spa_sync() to + * converge, it must eventually be the case that we don't + * have to allocate new blocks. But compression changes + * the blocksize, which forces a reallocate, and makes + * convergence take longer. Therefore, after the first + * few passes, stop compressing to ensure convergence. + */ + pass = spa_sync_pass(spa); + + ASSERT(zio->io_txg == spa_syncing_txg(spa)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!BP_GET_DEDUP(bp)); + + if (pass >= zfs_sync_pass_dont_compress) + compress = ZIO_COMPRESS_OFF; + + /* Make sure someone doesn't change their mind on overwrites */ + ASSERT(BP_IS_EMBEDDED(bp) || MIN(zp->zp_copies + BP_IS_GANG(bp), + spa_max_replication(spa)) == BP_GET_NDVAS(bp)); + } + + /* If it's a compressed write that is not raw, compress the buffer. */ + if (compress != ZIO_COMPRESS_OFF && + !(zio->io_flags & ZIO_FLAG_RAW_COMPRESS)) { + void *cbuf = zio_buf_alloc(lsize); + psize = zio_compress_data(compress, zio->io_abd, cbuf, lsize, + zp->zp_complevel); + if (psize == 0 || psize >= lsize) { + compress = ZIO_COMPRESS_OFF; + zio_buf_free(cbuf, lsize); + } else if (!zp->zp_dedup && !zp->zp_encrypt && + psize <= BPE_PAYLOAD_SIZE && + zp->zp_level == 0 && !DMU_OT_HAS_FILL(zp->zp_type) && + spa_feature_is_enabled(spa, SPA_FEATURE_EMBEDDED_DATA)) { + encode_embedded_bp_compressed(bp, + cbuf, compress, lsize, psize); + BPE_SET_ETYPE(bp, BP_EMBEDDED_TYPE_DATA); + BP_SET_TYPE(bp, zio->io_prop.zp_type); + BP_SET_LEVEL(bp, zio->io_prop.zp_level); + zio_buf_free(cbuf, lsize); + bp->blk_birth = zio->io_txg; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + ASSERT(spa_feature_is_active(spa, + SPA_FEATURE_EMBEDDED_DATA)); + return (zio); + } else { + /* + * Round up compressed size up to the ashift + * of the smallest-ashift device, and zero the tail. + * This ensures that the compressed size of the BP + * (and thus compressratio property) are correct, + * in that we charge for the padding used to fill out + * the last sector. + */ + ASSERT3U(spa->spa_min_ashift, >=, SPA_MINBLOCKSHIFT); + size_t rounded = (size_t)P2ROUNDUP(psize, + 1ULL << spa->spa_min_ashift); + if (rounded >= lsize) { + compress = ZIO_COMPRESS_OFF; + zio_buf_free(cbuf, lsize); + psize = lsize; + } else { + abd_t *cdata = abd_get_from_buf(cbuf, lsize); + abd_take_ownership_of_buf(cdata, B_TRUE); + abd_zero_off(cdata, psize, rounded - psize); + psize = rounded; + zio_push_transform(zio, cdata, + psize, lsize, NULL); + } + } + + /* + * We were unable to handle this as an override bp, treat + * it as a regular write I/O. + */ + zio->io_bp_override = NULL; + *bp = zio->io_bp_orig; + zio->io_pipeline = zio->io_orig_pipeline; + + } else if ((zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) != 0 && + zp->zp_type == DMU_OT_DNODE) { + /* + * The DMU actually relies on the zio layer's compression + * to free metadnode blocks that have had all contained + * dnodes freed. As a result, even when doing a raw + * receive, we must check whether the block can be compressed + * to a hole. 
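+		 * (Hedged note: ZIO_COMPRESS_EMPTY performs only the
+		 * all-zeroes test, so the zio_compress_data() call below
+		 * returns 0 for a zero-filled block, i.e. a hole, and
+		 * lsize otherwise.)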
+ */ + psize = zio_compress_data(ZIO_COMPRESS_EMPTY, + zio->io_abd, NULL, lsize, zp->zp_complevel); + if (psize == 0 || psize >= lsize) + compress = ZIO_COMPRESS_OFF; + } else { + ASSERT3U(psize, !=, 0); + } + + /* + * The final pass of spa_sync() must be all rewrites, but the first + * few passes offer a trade-off: allocating blocks defers convergence, + * but newly allocated blocks are sequential, so they can be written + * to disk faster. Therefore, we allow the first few passes of + * spa_sync() to allocate new blocks, but force rewrites after that. + * There should only be a handful of blocks after pass 1 in any case. + */ + if (!BP_IS_HOLE(bp) && bp->blk_birth == zio->io_txg && + BP_GET_PSIZE(bp) == psize && + pass >= zfs_sync_pass_rewrite) { + VERIFY3U(psize, !=, 0); + enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + + zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; + zio->io_flags |= ZIO_FLAG_IO_REWRITE; + } else { + BP_ZERO(bp); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + } + + if (psize == 0) { + if (zio->io_bp_orig.blk_birth != 0 && + spa_feature_is_active(spa, SPA_FEATURE_HOLE_BIRTH)) { + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_BIRTH(bp, zio->io_txg, 0); + } + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + } else { + ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); + BP_SET_LSIZE(bp, lsize); + BP_SET_TYPE(bp, zp->zp_type); + BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, compress); + BP_SET_CHECKSUM(bp, zp->zp_checksum); + BP_SET_DEDUP(bp, zp->zp_dedup); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + if (zp->zp_dedup) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!zp->zp_encrypt || + DMU_OT_IS_ENCRYPTED(zp->zp_type)); + zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; + } + if (zp->zp_nopwrite) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline |= ZIO_STAGE_NOP_WRITE; + } + } + return (zio); +} + +static zio_t * +zio_free_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_GET_DEDUP(bp)) + zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; + } + + ASSERT3P(zio->io_bp, ==, &zio->io_bp_copy); + + return (zio); +} + +/* + * ========================================================================== + * Execute the I/O pipeline + * ========================================================================== + */ + +static void +zio_taskq_dispatch(zio_t *zio, zio_taskq_type_t q, boolean_t cutinline) +{ + spa_t *spa = zio->io_spa; + zio_type_t t = zio->io_type; + int flags = (cutinline ? TQ_FRONT : 0); + + /* + * If we're a config writer or a probe, the normal issue and + * interrupt threads may all be blocked waiting for the config lock. + * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. + */ + if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) + t = ZIO_TYPE_NULL; + + /* + * A similar issue exists for the L2ARC write thread until L2ARC 2.0. + */ + if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) + t = ZIO_TYPE_NULL; + + /* + * If this is a high priority I/O, then use the high priority taskq if + * available. 
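+	 * (e.g. a ZIO_PRIORITY_SYNC_WRITE moves from ZIO_TASKQ_ISSUE to
+	 * ZIO_TASKQ_ISSUE_HIGH via the q++ below, provided that taskq was
+	 * configured with a nonzero thread count.)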
+ */ + if ((zio->io_priority == ZIO_PRIORITY_NOW || + zio->io_priority == ZIO_PRIORITY_SYNC_WRITE) && + spa->spa_zio_taskq[t][q + 1].stqs_count != 0) + q++; + + ASSERT3U(q, <, ZIO_TASKQ_TYPES); + + /* + * NB: We are assuming that the zio can only be dispatched + * to a single taskq at a time. It would be a grievous error + * to dispatch the zio to another taskq at the same time. + */ + ASSERT(taskq_empty_ent(&zio->io_tqent)); + spa_taskq_dispatch_ent(spa, t, q, (task_func_t *)zio_execute, zio, + flags, &zio->io_tqent); +} + +static boolean_t +zio_taskq_member(zio_t *zio, zio_taskq_type_t q) +{ + spa_t *spa = zio->io_spa; + + taskq_t *tq = taskq_of_curthread(); + + for (zio_type_t t = 0; t < ZIO_TYPES; t++) { + spa_taskqs_t *tqs = &spa->spa_zio_taskq[t][q]; + uint_t i; + for (i = 0; i < tqs->stqs_count; i++) { + if (tqs->stqs_taskq[i] == tq) + return (B_TRUE); + } + } + + return (B_FALSE); +} + +static zio_t * +zio_issue_async(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + + return (NULL); +} + +void +zio_interrupt(zio_t *zio) +{ + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); +} + +void +zio_delay_interrupt(zio_t *zio) +{ + /* + * The timeout_generic() function isn't defined in userspace, so + * rather than trying to implement the function, the zio delay + * functionality has been disabled for userspace builds. + */ + +#ifdef _KERNEL + /* + * If io_target_timestamp is zero, then no delay has been registered + * for this IO, thus jump to the end of this function and "skip" the + * delay; issuing it directly to the zio layer. + */ + if (zio->io_target_timestamp != 0) { + hrtime_t now = gethrtime(); + + if (now >= zio->io_target_timestamp) { + /* + * This IO has already taken longer than the target + * delay to complete, so we don't want to delay it + * any longer; we "miss" the delay and issue it + * directly to the zio layer. This is likely due to + * the target latency being set to a value less than + * the underlying hardware can satisfy (e.g. delay + * set to 1ms, but the disks take 10ms to complete an + * IO request). + */ + + DTRACE_PROBE2(zio__delay__miss, zio_t *, zio, + hrtime_t, now); + + zio_interrupt(zio); + } else { + taskqid_t tid; + hrtime_t diff = zio->io_target_timestamp - now; + clock_t expire_at_tick = ddi_get_lbolt() + + NSEC_TO_TICK(diff); + + DTRACE_PROBE3(zio__delay__hit, zio_t *, zio, + hrtime_t, now, hrtime_t, diff); + + if (NSEC_TO_TICK(diff) == 0) { + /* Our delay is less than a jiffy - just spin */ + zfs_sleep_until(zio->io_target_timestamp); + zio_interrupt(zio); + } else { + /* + * Use taskq_dispatch_delay() in the place of + * OpenZFS's timeout_generic(). + */ + tid = taskq_dispatch_delay(system_taskq, + (task_func_t *)zio_interrupt, + zio, TQ_NOSLEEP, expire_at_tick); + if (tid == TASKQID_INVALID) { + /* + * Couldn't allocate a task. Just + * finish the zio without a delay. + */ + zio_interrupt(zio); + } + } + } + return; + } +#endif + DTRACE_PROBE1(zio__delay__skip, zio_t *, zio); + zio_interrupt(zio); +} + +static void +zio_deadman_impl(zio_t *pio, int ziodepth) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + vdev_t *vd = pio->io_vd; + + if (zio_deadman_log_all || (vd != NULL && vd->vdev_ops->vdev_op_leaf)) { + vdev_queue_t *vq = vd ? 
&vd->vdev_queue : NULL; + zbookmark_phys_t *zb = &pio->io_bookmark; + uint64_t delta = gethrtime() - pio->io_timestamp; + uint64_t failmode = spa_get_deadman_failmode(pio->io_spa); + + zfs_dbgmsg("slow zio[%d]: zio=%px timestamp=%llu " + "delta=%llu queued=%llu io=%llu " + "path=%s last=%llu " + "type=%d priority=%d flags=0x%x " + "stage=0x%x pipeline=0x%x pipeline-trace=0x%x " + "objset=%llu object=%llu level=%llu blkid=%llu " + "offset=%llu size=%llu error=%d", + ziodepth, pio, pio->io_timestamp, + delta, pio->io_delta, pio->io_delay, + vd ? vd->vdev_path : "NULL", vq ? vq->vq_io_complete_ts : 0, + pio->io_type, pio->io_priority, pio->io_flags, + pio->io_stage, pio->io_pipeline, pio->io_pipeline_trace, + zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + pio->io_offset, pio->io_size, pio->io_error); + zfs_ereport_post(FM_EREPORT_ZFS_DEADMAN, + pio->io_spa, vd, zb, pio, 0, 0); + + if (failmode == ZIO_FAILURE_MODE_CONTINUE && + taskq_empty_ent(&pio->io_tqent)) { + zio_interrupt(pio); + } + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_deadman_impl(cio, ziodepth + 1); + } + mutex_exit(&pio->io_lock); +} + +/* + * Log the critical information describing this zio and all of its children + * using the zfs_dbgmsg() interface then post deadman event for the ZED. + */ +void +zio_deadman(zio_t *pio, char *tag) +{ + spa_t *spa = pio->io_spa; + char *name = spa_name(spa); + + if (!zfs_deadman_enabled || spa_suspended(spa)) + return; + + zio_deadman_impl(pio, 0); + + switch (spa_get_deadman_failmode(spa)) { + case ZIO_FAILURE_MODE_WAIT: + zfs_dbgmsg("%s waiting for hung I/O to pool '%s'", tag, name); + break; + + case ZIO_FAILURE_MODE_CONTINUE: + zfs_dbgmsg("%s restarting hung I/O for pool '%s'", tag, name); + break; + + case ZIO_FAILURE_MODE_PANIC: + fm_panic("%s determined I/O to pool '%s' is hung.", tag, name); + break; + } +} + +/* + * Execute the I/O pipeline until one of the following occurs: + * (1) the I/O completes; (2) the pipeline stalls waiting for + * dependent child I/Os; (3) the I/O issues, so we're waiting + * for an I/O completion interrupt; (4) the I/O is delegated by + * vdev-level caching or aggregation; (5) the I/O is deferred + * due to vdev-level queueing; (6) the I/O is handed off to + * another thread. In all cases, the pipeline stops whenever + * there's no CPU work; it never burns a thread in cv_wait_io(). + * + * There's no locking on io_stage because there's no legitimate way + * for multiple threads to be attempting to process the same I/O. + */ +static zio_pipe_stage_t *zio_pipeline[]; + +/* + * zio_execute() is a wrapper around the static function + * __zio_execute() so that we can force __zio_execute() to be + * inlined. This reduces stack overhead which is important + * because __zio_execute() is called recursively in several zio + * code paths. zio_execute() itself cannot be inlined because + * it is externally visible. + */ +void +zio_execute(zio_t *zio) +{ + fstrans_cookie_t cookie; + + cookie = spl_fstrans_mark(); + __zio_execute(zio); + spl_fstrans_unmark(cookie); +} + +/* + * Used to determine if in the current context the stack is sized large + * enough to allow zio_execute() to be called recursively. A minimum + * stack size of 16K is required to avoid needing to re-dispatch the zio. 
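+ *
+ * Sketch of the resulting control flow in __zio_execute() below: when
+ * this check returns B_TRUE, the zio is handed to
+ * zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut) and continues on a
+ * taskq thread with a fresh stack instead of recursing on the
+ * current one.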
+ */ +static boolean_t +zio_execute_stack_check(zio_t *zio) +{ +#if !defined(HAVE_LARGE_STACKS) + dsl_pool_t *dp = spa_get_dsl(zio->io_spa); + + /* Executing in txg_sync_thread() context. */ + if (dp && curthread == dp->dp_tx.tx_sync_thread) + return (B_TRUE); + + /* Pool initialization outside of zio_taskq context. */ + if (dp && spa_is_initializing(dp->dp_spa) && + !zio_taskq_member(zio, ZIO_TASKQ_ISSUE) && + !zio_taskq_member(zio, ZIO_TASKQ_ISSUE_HIGH)) + return (B_TRUE); +#endif /* HAVE_LARGE_STACKS */ + + return (B_FALSE); +} + +__attribute__((always_inline)) +static inline void +__zio_execute(zio_t *zio) +{ + ASSERT3U(zio->io_queued_timestamp, >, 0); + + while (zio->io_stage < ZIO_STAGE_DONE) { + enum zio_stage pipeline = zio->io_pipeline; + enum zio_stage stage = zio->io_stage; + + zio->io_executor = curthread; + + ASSERT(!MUTEX_HELD(&zio->io_lock)); + ASSERT(ISP2(stage)); + ASSERT(zio->io_stall == NULL); + + do { + stage <<= 1; + } while ((stage & pipeline) == 0); + + ASSERT(stage <= ZIO_STAGE_DONE); + + /* + * If we are in interrupt context and this pipeline stage + * will grab a config lock that is held across I/O, + * or may wait for an I/O that needs an interrupt thread + * to complete, issue async to avoid deadlock. + * + * For VDEV_IO_START, we cut in line so that the io will + * be sent to disk promptly. + */ + if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && + zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { + boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); + return; + } + + /* + * If the current context doesn't have large enough stacks + * the zio must be issued asynchronously to prevent overflow. + */ + if (zio_execute_stack_check(zio)) { + boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); + return; + } + + zio->io_stage = stage; + zio->io_pipeline_trace |= zio->io_stage; + + /* + * The zio pipeline stage returns the next zio to execute + * (typically the same as this one), or NULL if we should + * stop. + */ + zio = zio_pipeline[highbit64(stage) - 1](zio); + + if (zio == NULL) + return; + } +} + + +/* + * ========================================================================== + * Initiate I/O, either sync or async + * ========================================================================== + */ +int +zio_wait(zio_t *zio) +{ + /* + * Some routines, like zio_free_sync(), may return a NULL zio + * to avoid the performance overhead of creating and then destroying + * an unneeded zio. For the callers' simplicity, we accept a NULL + * zio and ignore it. 
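+	 *
+	 * A minimal caller sketch (spa, txg and bp are hypothetical
+	 * stand-ins here):
+	 *
+	 *	zio_t *zio = zio_free_sync(NULL, spa, txg, bp, 0);
+	 *	error = zio_wait(zio);		returns 0 when zio is NULL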
+ */ + if (zio == NULL) + return (0); + + long timeout = MSEC_TO_TICK(zfs_deadman_ziotime_ms); + int error; + + ASSERT3S(zio->io_stage, ==, ZIO_STAGE_OPEN); + ASSERT3P(zio->io_executor, ==, NULL); + + zio->io_waiter = curthread; + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); + + __zio_execute(zio); + + mutex_enter(&zio->io_lock); + while (zio->io_executor != NULL) { + error = cv_timedwait_io(&zio->io_cv, &zio->io_lock, + ddi_get_lbolt() + timeout); + + if (zfs_deadman_enabled && error == -1 && + gethrtime() - zio->io_queued_timestamp > + spa_deadman_ziotime(zio->io_spa)) { + mutex_exit(&zio->io_lock); + timeout = MSEC_TO_TICK(zfs_deadman_checktime_ms); + zio_deadman(zio, FTAG); + mutex_enter(&zio->io_lock); + } + } + mutex_exit(&zio->io_lock); + + error = zio->io_error; + zio_destroy(zio); + + return (error); +} + +void +zio_nowait(zio_t *zio) +{ + /* + * See comment in zio_wait(). + */ + if (zio == NULL) + return; + + ASSERT3P(zio->io_executor, ==, NULL); + + if (zio->io_child_type == ZIO_CHILD_LOGICAL && + zio_unique_parent(zio) == NULL) { + zio_t *pio; + + /* + * This is a logical async I/O with no parent to wait for it. + * We add it to the spa_async_root_zio "Godfather" I/O which + * will ensure they complete prior to unloading the pool. + */ + spa_t *spa = zio->io_spa; + kpreempt_disable(); + pio = spa->spa_async_zio_root[CPU_SEQID]; + kpreempt_enable(); + + zio_add_child(pio, zio); + } + + ASSERT0(zio->io_queued_timestamp); + zio->io_queued_timestamp = gethrtime(); + __zio_execute(zio); +} + +/* + * ========================================================================== + * Reexecute, cancel, or suspend/resume failed I/O + * ========================================================================== + */ + +static void +zio_reexecute(zio_t *pio) +{ + zio_t *cio, *cio_next; + + ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); + ASSERT(pio->io_gang_leader == NULL); + ASSERT(pio->io_gang_tree == NULL); + + pio->io_flags = pio->io_orig_flags; + pio->io_stage = pio->io_orig_stage; + pio->io_pipeline = pio->io_orig_pipeline; + pio->io_reexecute = 0; + pio->io_flags |= ZIO_FLAG_REEXECUTED; + pio->io_pipeline_trace = 0; + pio->io_error = 0; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_state[w] = 0; + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + pio->io_child_error[c] = 0; + + if (IO_IS_ALLOCATING(pio)) + BP_ZERO(pio->io_bp); + + /* + * As we reexecute pio's children, new children could be created. + * New children go to the head of pio's io_child_list, however, + * so we will (correctly) not reexecute them. The key is that + * the remainder of pio's io_child_list, from 'cio_next' onward, + * cannot be affected by any side effects of reexecuting 'cio'. + */ + zio_link_t *zl = NULL; + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w]++; + mutex_exit(&pio->io_lock); + zio_reexecute(cio); + mutex_enter(&pio->io_lock); + } + mutex_exit(&pio->io_lock); + + /* + * Now that all children have been reexecuted, execute the parent. + * We don't reexecute "The Godfather" I/O here as it's the + * responsibility of the caller to wait on it. 
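+	 *
+	 * One such caller is zio_resume() below: it reexecutes the
+	 * suspended godfather root itself and then explicitly waits on
+	 * it with zio_wait().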
+ */
+	if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) {
+		pio->io_queued_timestamp = gethrtime();
+		__zio_execute(pio);
+	}
+}
+
+void
+zio_suspend(spa_t *spa, zio_t *zio, zio_suspend_reason_t reason)
+{
+	if (spa_get_failmode(spa) == ZIO_FAILURE_MODE_PANIC)
+		fm_panic("Pool '%s' has encountered an uncorrectable I/O "
+		    "failure and the failure mode property for this pool "
+		    "is set to panic.", spa_name(spa));
+
+	cmn_err(CE_WARN, "Pool '%s' has encountered an uncorrectable I/O "
+	    "failure and has been suspended.\n", spa_name(spa));
+
+	zfs_ereport_post(FM_EREPORT_ZFS_IO_FAILURE, spa, NULL,
+	    NULL, NULL, 0, 0);
+
+	mutex_enter(&spa->spa_suspend_lock);
+
+	if (spa->spa_suspend_zio_root == NULL)
+		spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL,
+		    ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE |
+		    ZIO_FLAG_GODFATHER);
+
+	spa->spa_suspended = reason;
+
+	if (zio != NULL) {
+		ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER));
+		ASSERT(zio != spa->spa_suspend_zio_root);
+		ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
+		ASSERT(zio_unique_parent(zio) == NULL);
+		ASSERT(zio->io_stage == ZIO_STAGE_DONE);
+		zio_add_child(spa->spa_suspend_zio_root, zio);
+	}
+
+	mutex_exit(&spa->spa_suspend_lock);
+}
+
+int
+zio_resume(spa_t *spa)
+{
+	zio_t *pio;
+
+	/*
+	 * Reexecute all previously suspended i/o.
+	 */
+	mutex_enter(&spa->spa_suspend_lock);
+	spa->spa_suspended = ZIO_SUSPEND_NONE;
+	cv_broadcast(&spa->spa_suspend_cv);
+	pio = spa->spa_suspend_zio_root;
+	spa->spa_suspend_zio_root = NULL;
+	mutex_exit(&spa->spa_suspend_lock);
+
+	if (pio == NULL)
+		return (0);
+
+	zio_reexecute(pio);
+	return (zio_wait(pio));
+}
+
+void
+zio_resume_wait(spa_t *spa)
+{
+	mutex_enter(&spa->spa_suspend_lock);
+	while (spa_suspended(spa))
+		cv_wait(&spa->spa_suspend_cv, &spa->spa_suspend_lock);
+	mutex_exit(&spa->spa_suspend_lock);
+}
+
+/*
+ * ==========================================================================
+ * Gang blocks.
+ *
+ * A gang block is a collection of small blocks that looks to the DMU
+ * like one large block. When zio_dva_allocate() cannot find a block
+ * of the requested size, due to either severe fragmentation or the pool
+ * being nearly full, it calls zio_write_gang_block() to construct the
+ * block from smaller fragments.
+ *
+ * A gang block consists of a gang header (zio_gbh_phys_t) and up to
+ * three (SPA_GBH_NBLKPTRS) gang members. The gang header is just like
+ * an indirect block: it's an array of block pointers. It consumes
+ * only one sector and hence is allocatable regardless of fragmentation.
+ * The gang header's bps point to its gang members, which hold the data.
+ *
+ * Gang blocks are self-checksumming, using the bp's <vdev, offset, txg>
+ * as the verifier to ensure uniqueness of the SHA256 checksum.
+ * Critically, the gang block bp's blk_cksum is the checksum of the data,
+ * not the gang header. This ensures that data block signatures (needed for
+ * deduplication) are independent of how the block is physically stored.
+ *
+ * Gang blocks can be nested: a gang member may itself be a gang block.
+ * Thus every gang block is a tree in which root and all interior nodes are
+ * gang headers, and the leaves are normal blocks that contain user data.
+ * The root of the gang tree is called the gang leader.
+ *
+ * To perform any operation (read, rewrite, free, claim) on a gang block,
+ * zio_gang_assemble() first assembles the gang tree (minus data leaves)
+ * in the io_gang_tree field of the original logical i/o by recursively
+ * reading the gang leader and all gang headers below it.
This yields + * an in-core tree containing the contents of every gang header and the + * bps for every constituent of the gang block. + * + * With the gang tree now assembled, zio_gang_issue() just walks the gang tree + * and invokes a callback on each bp. To free a gang block, zio_gang_issue() + * calls zio_free_gang() -- a trivial wrapper around zio_free() -- for each bp. + * zio_claim_gang() provides a similarly trivial wrapper for zio_claim(). + * zio_read_gang() is a wrapper around zio_read() that omits reading gang + * headers, since we already have those in io_gang_tree. zio_rewrite_gang() + * performs a zio_rewrite() of the data or, for gang headers, a zio_rewrite() + * of the gang header plus zio_checksum_compute() of the data to update the + * gang header's blk_cksum as described above. + * + * The two-phase assemble/issue model solves the problem of partial failure -- + * what if you'd freed part of a gang block but then couldn't read the + * gang header for another part? Assembling the entire gang tree first + * ensures that all the necessary gang header I/O has succeeded before + * starting the actual work of free, claim, or write. Once the gang tree + * is assembled, free and claim are in-memory operations that cannot fail. + * + * In the event that a gang write fails, zio_dva_unallocate() walks the + * gang tree to immediately free (i.e. insert back into the space map) + * everything we've allocated. This ensures that we don't get ENOSPC + * errors during repeated suspend/resume cycles due to a flaky device. + * + * Gang rewrites only happen during sync-to-convergence. If we can't assemble + * the gang tree, we won't modify the block, so we can safely defer the free + * (knowing that the block is still intact). If we *can* assemble the gang + * tree, then even if some of the rewrites fail, zio_dva_unallocate() will free + * each constituent bp and we can allocate a new block on the next sync pass. + * + * In all cases, the gang tree allows complete recovery from partial failure. + * ========================================================================== + */ + +static void +zio_gang_issue_func_done(zio_t *zio) +{ + abd_put(zio->io_abd); +} + +static zio_t * +zio_read_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + if (gn != NULL) + return (pio); + + return (zio_read(pio, pio->io_spa, bp, abd_get_offset(data, offset), + BP_GET_PSIZE(bp), zio_gang_issue_func_done, + NULL, pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark)); +} + +static zio_t * +zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + zio_t *zio; + + if (gn != NULL) { + abd_t *gbh_abd = + abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + gbh_abd, SPA_GANGBLOCKSIZE, zio_gang_issue_func_done, NULL, + pio->io_priority, ZIO_GANG_CHILD_FLAGS(pio), + &pio->io_bookmark); + /* + * As we rewrite each gang header, the pipeline will compute + * a new gang block header checksum for it; but no one will + * compute a new data checksum, so we do that here. The one + * exception is the gang leader: the pipeline already computed + * its data checksum because that stage precedes gang assembly. + * (Presently, nothing actually uses interior data checksums; + * this is just good hygiene.) 
+ */ + if (gn != pio->io_gang_leader->io_gang_tree) { + abd_t *buf = abd_get_offset(data, offset); + + zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), + buf, BP_GET_PSIZE(bp)); + + abd_put(buf); + } + /* + * If we are here to damage data for testing purposes, + * leave the GBH alone so that we can detect the damage. + */ + if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } else { + zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, + abd_get_offset(data, offset), BP_GET_PSIZE(bp), + zio_gang_issue_func_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + } + + return (zio); +} + +/* ARGSUSED */ +static zio_t * +zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + zio_t *zio = zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + ZIO_GANG_CHILD_FLAGS(pio)); + if (zio == NULL) { + zio = zio_null(pio, pio->io_spa, + NULL, NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio)); + } + return (zio); +} + +/* ARGSUSED */ +static zio_t * +zio_claim_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, abd_t *data, + uint64_t offset) +{ + return (zio_claim(pio, pio->io_spa, pio->io_txg, bp, + NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); +} + +static zio_gang_issue_func_t *zio_gang_issue_func[ZIO_TYPES] = { + NULL, + zio_read_gang, + zio_rewrite_gang, + zio_free_gang, + zio_claim_gang, + NULL +}; + +static void zio_gang_tree_assemble_done(zio_t *zio); + +static zio_gang_node_t * +zio_gang_node_alloc(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn; + + ASSERT(*gnpp == NULL); + + gn = kmem_zalloc(sizeof (*gn), KM_SLEEP); + gn->gn_gbh = zio_buf_alloc(SPA_GANGBLOCKSIZE); + *gnpp = gn; + + return (gn); +} + +static void +zio_gang_node_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + ASSERT(gn->gn_child[g] == NULL); + + zio_buf_free(gn->gn_gbh, SPA_GANGBLOCKSIZE); + kmem_free(gn, sizeof (*gn)); + *gnpp = NULL; +} + +static void +zio_gang_tree_free(zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = *gnpp; + + if (gn == NULL) + return; + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) + zio_gang_tree_free(&gn->gn_child[g]); + + zio_gang_node_free(gnpp); +} + +static void +zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) +{ + zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); + abd_t *gbh_abd = abd_get_from_buf(gn->gn_gbh, SPA_GANGBLOCKSIZE); + + ASSERT(gio->io_gang_leader == gio); + ASSERT(BP_IS_GANG(bp)); + + zio_nowait(zio_read(gio, gio->io_spa, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_gang_tree_assemble_done, gn, gio->io_priority, + ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); +} + +static void +zio_gang_tree_assemble_done(zio_t *zio) +{ + zio_t *gio = zio->io_gang_leader; + zio_gang_node_t *gn = zio->io_private; + blkptr_t *bp = zio->io_bp; + + ASSERT(gio == zio_unique_parent(zio)); + ASSERT(zio->io_child_count == 0); + + if (zio->io_error) + return; + + /* this ABD was created from a linear buf in zio_gang_tree_assemble */ + if (BP_SHOULD_BYTESWAP(bp)) + byteswap_uint64_array(abd_to_buf(zio->io_abd), zio->io_size); + + ASSERT3P(abd_to_buf(zio->io_abd), ==, gn->gn_gbh); + ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + + abd_put(zio->io_abd); + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (!BP_IS_GANG(gbp)) + continue; + zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); + } +} + +static void 
+zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, abd_t *data, + uint64_t offset) +{ + zio_t *gio = pio->io_gang_leader; + zio_t *zio; + + ASSERT(BP_IS_GANG(bp) == !!gn); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); + + /* + * If you're a gang header, your data is in gn->gn_gbh. + * If you're a gang member, your data is in 'data' and gn == NULL. + */ + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data, offset); + + if (gn != NULL) { + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); + + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; + if (BP_IS_HOLE(gbp)) + continue; + zio_gang_tree_issue(zio, gn->gn_child[g], gbp, data, + offset); + offset += BP_GET_PSIZE(gbp); + } + } + + if (gn == gio->io_gang_tree) + ASSERT3U(gio->io_size, ==, offset); + + if (zio != pio) + zio_nowait(zio); +} + +static zio_t * +zio_gang_assemble(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + zio->io_gang_leader = zio; + + zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); + + return (zio); +} + +static zio_t * +zio_gang_issue(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + if (zio->io_child_error[ZIO_CHILD_GANG] == 0) + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_abd, + 0); + else + zio_gang_tree_free(&zio->io_gang_tree); + + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + return (zio); +} + +static void +zio_write_gang_member_ready(zio_t *zio) +{ + zio_t *pio = zio_unique_parent(zio); + dva_t *cdva = zio->io_bp->blk_dva; + dva_t *pdva = pio->io_bp->blk_dva; + uint64_t asize; + zio_t *gio __maybe_unused = zio->io_gang_leader; + + if (BP_IS_HOLE(zio->io_bp)) + return; + + ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); + + ASSERT(zio->io_child_type == ZIO_CHILD_GANG); + ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); + + mutex_enter(&pio->io_lock); + for (int d = 0; d < BP_GET_NDVAS(zio->io_bp); d++) { + ASSERT(DVA_GET_GANG(&pdva[d])); + asize = DVA_GET_ASIZE(&pdva[d]); + asize += DVA_GET_ASIZE(&cdva[d]); + DVA_SET_ASIZE(&pdva[d], asize); + } + mutex_exit(&pio->io_lock); +} + +static void +zio_write_gang_done(zio_t *zio) +{ + /* + * The io_abd field will be NULL for a zio with no data. The io_flags + * will initially have the ZIO_FLAG_NODATA bit flag set, but we can't + * check for it here as it is cleared in zio_ready. 
+ */ + if (zio->io_abd != NULL) + abd_put(zio->io_abd); +} + +static zio_t * +zio_write_gang_block(zio_t *pio) +{ + spa_t *spa = pio->io_spa; + metaslab_class_t *mc = spa_normal_class(spa); + blkptr_t *bp = pio->io_bp; + zio_t *gio = pio->io_gang_leader; + zio_t *zio; + zio_gang_node_t *gn, **gnpp; + zio_gbh_phys_t *gbh; + abd_t *gbh_abd; + uint64_t txg = pio->io_txg; + uint64_t resid = pio->io_size; + uint64_t lsize; + int copies = gio->io_prop.zp_copies; + int gbh_copies; + zio_prop_t zp; + int error; + boolean_t has_data = !(pio->io_flags & ZIO_FLAG_NODATA); + + /* + * encrypted blocks need DVA[2] free so encrypted gang headers can't + * have a third copy. + */ + gbh_copies = MIN(copies + 1, spa_max_replication(spa)); + if (gio->io_prop.zp_encrypt && gbh_copies >= SPA_DVAS_PER_BP) + gbh_copies = SPA_DVAS_PER_BP - 1; + + int flags = METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER; + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(has_data); + + flags |= METASLAB_ASYNC_ALLOC; + VERIFY(zfs_refcount_held(&mc->mc_alloc_slots[pio->io_allocator], + pio)); + + /* + * The logical zio has already placed a reservation for + * 'copies' allocation slots but gang blocks may require + * additional copies. These additional copies + * (i.e. gbh_copies - copies) are guaranteed to succeed + * since metaslab_class_throttle_reserve() always allows + * additional reservations for gang blocks. + */ + VERIFY(metaslab_class_throttle_reserve(mc, gbh_copies - copies, + pio->io_allocator, pio, flags)); + } + + error = metaslab_alloc(spa, mc, SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? NULL : gio->io_bp, flags, + &pio->io_alloc_list, pio, pio->io_allocator); + if (error) { + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(has_data); + + /* + * If we failed to allocate the gang block header then + * we remove any additional allocation reservations that + * we placed here. The original reservation will + * be removed when the logical I/O goes to the ready + * stage. + */ + metaslab_class_throttle_unreserve(mc, + gbh_copies - copies, pio->io_allocator, pio); + } + + pio->io_error = error; + return (pio); + } + + if (pio == gio) { + gnpp = &gio->io_gang_tree; + } else { + gnpp = pio->io_private; + ASSERT(pio->io_ready == zio_write_gang_member_ready); + } + + gn = zio_gang_node_alloc(gnpp); + gbh = gn->gn_gbh; + bzero(gbh, SPA_GANGBLOCKSIZE); + gbh_abd = abd_get_from_buf(gbh, SPA_GANGBLOCKSIZE); + + /* + * Create the gang header. + */ + zio = zio_rewrite(pio, spa, txg, bp, gbh_abd, SPA_GANGBLOCKSIZE, + zio_write_gang_done, NULL, pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + /* + * Create and nowait the gang children. 
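+	 *
+	 * Worked example of the split below (illustrative sizes): for a
+	 * 131072-byte (128K) gang write with SPA_GBH_NBLKPTRS == 3, the
+	 * loop computes lsize = P2ROUNDUP(131072 / 3, 512) = 44032 for
+	 * g == 0, then 43520 for g == 1 and 43520 for g == 2, which
+	 * sums back to the full 131072 bytes.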
+ */ + for (int g = 0; resid != 0; resid -= lsize, g++) { + lsize = P2ROUNDUP(resid / (SPA_GBH_NBLKPTRS - g), + SPA_MINBLOCKSIZE); + ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); + + zp.zp_checksum = gio->io_prop.zp_checksum; + zp.zp_compress = ZIO_COMPRESS_OFF; + zp.zp_complevel = gio->io_prop.zp_complevel; + zp.zp_type = DMU_OT_NONE; + zp.zp_level = 0; + zp.zp_copies = gio->io_prop.zp_copies; + zp.zp_dedup = B_FALSE; + zp.zp_dedup_verify = B_FALSE; + zp.zp_nopwrite = B_FALSE; + zp.zp_encrypt = gio->io_prop.zp_encrypt; + zp.zp_byteorder = gio->io_prop.zp_byteorder; + bzero(zp.zp_salt, ZIO_DATA_SALT_LEN); + bzero(zp.zp_iv, ZIO_DATA_IV_LEN); + bzero(zp.zp_mac, ZIO_DATA_MAC_LEN); + + zio_t *cio = zio_write(zio, spa, txg, &gbh->zg_blkptr[g], + has_data ? abd_get_offset(pio->io_abd, pio->io_size - + resid) : NULL, lsize, lsize, &zp, + zio_write_gang_member_ready, NULL, NULL, + zio_write_gang_done, &gn->gn_child[g], pio->io_priority, + ZIO_GANG_CHILD_FLAGS(pio), &pio->io_bookmark); + + if (pio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(pio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(has_data); + + /* + * Gang children won't throttle but we should + * account for their work, so reserve an allocation + * slot for them here. + */ + VERIFY(metaslab_class_throttle_reserve(mc, + zp.zp_copies, cio->io_allocator, cio, flags)); + } + zio_nowait(cio); + } + + /* + * Set pio's pipeline to just wait for zio to finish. + */ + pio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + /* + * We didn't allocate this bp, so make sure it doesn't get unmarked. + */ + pio->io_flags &= ~ZIO_FLAG_FASTWRITE; + + zio_nowait(zio); + + return (pio); +} + +/* + * The zio_nop_write stage in the pipeline determines if allocating a + * new bp is necessary. The nopwrite feature can handle writes in + * either syncing or open context (i.e. zil writes) and as a result is + * mutually exclusive with dedup. + * + * By leveraging a cryptographically secure checksum, such as SHA256, we + * can compare the checksums of the new data and the old to determine if + * allocating a new block is required. Note that our requirements for + * cryptographic strength are fairly weak: there can't be any accidental + * hash collisions, but we don't need to be secure against intentional + * (malicious) collisions. To trigger a nopwrite, you have to be able + * to write the file to begin with, and triggering an incorrect (hash + * collision) nopwrite is no worse than simply writing to the file. + * That said, there are no known attacks against the checksum algorithms + * used for nopwrite, assuming that the salt and the checksums + * themselves remain secret. + */ +static zio_t * +zio_nop_write(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + zio_prop_t *zp = &zio->io_prop; + + ASSERT(BP_GET_LEVEL(bp) == 0); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(zp->zp_nopwrite); + ASSERT(!zp->zp_dedup); + ASSERT(zio->io_bp_override == NULL); + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Check to see if the original bp and the new bp have matching + * characteristics (i.e. same checksum, compression algorithms, etc). + * If they don't then just continue with the pipeline which will + * allocate a new bp. 
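+	 *
+	 * For instance (illustrative): rewriting a block with identical
+	 * contents on a dataset using a nopwrite-capable checksum such
+	 * as sha256 yields a bp whose blk_cksum equals bp_orig's, so the
+	 * comparison below short-circuits both the allocation and the
+	 * write I/O entirely.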
+ */ + if (BP_IS_HOLE(bp_orig) || + !(zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE) || + BP_IS_ENCRYPTED(bp) || BP_IS_ENCRYPTED(bp_orig) || + BP_GET_CHECKSUM(bp) != BP_GET_CHECKSUM(bp_orig) || + BP_GET_COMPRESS(bp) != BP_GET_COMPRESS(bp_orig) || + BP_GET_DEDUP(bp) != BP_GET_DEDUP(bp_orig) || + zp->zp_copies != BP_GET_NDVAS(bp_orig)) + return (zio); + + /* + * If the checksums match then reset the pipeline so that we + * avoid allocating a new bp and issuing any I/O. + */ + if (ZIO_CHECKSUM_EQUAL(bp->blk_cksum, bp_orig->blk_cksum)) { + ASSERT(zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_NOPWRITE); + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(bp_orig)); + ASSERT3U(BP_GET_LSIZE(bp), ==, BP_GET_LSIZE(bp_orig)); + ASSERT(zp->zp_compress != ZIO_COMPRESS_OFF); + ASSERT(bcmp(&bp->blk_prop, &bp_orig->blk_prop, + sizeof (uint64_t)) == 0); + + /* + * If we're overwriting a block that is currently on an + * indirect vdev, then ignore the nopwrite request and + * allow a new block to be allocated on a concrete vdev. + */ + spa_config_enter(zio->io_spa, SCL_VDEV, FTAG, RW_READER); + vdev_t *tvd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(&bp->blk_dva[0])); + if (tvd->vdev_ops == &vdev_indirect_ops) { + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + return (zio); + } + spa_config_exit(zio->io_spa, SCL_VDEV, FTAG); + + *bp = *bp_orig; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + zio->io_flags |= ZIO_FLAG_NOPWRITE; + } + + return (zio); +} + +/* + * ========================================================================== + * Dedup + * ========================================================================== + */ +static void +zio_ddt_child_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp; + zio_t *pio = zio_unique_parent(zio); + + mutex_enter(&pio->io_lock); + ddp = ddt_phys_select(dde, bp); + if (zio->io_error == 0) + ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + + if (zio->io_error == 0 && dde->dde_repair_abd == NULL) + dde->dde_repair_abd = zio->io_abd; + else + abd_free(zio->io_abd); + mutex_exit(&pio->io_lock); +} + +static zio_t * +zio_ddt_read_start(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = ddt_repair_start(ddt, bp); + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + blkptr_t blk; + + ASSERT(zio->io_vsd == NULL); + zio->io_vsd = dde; + + if (ddp_self == NULL) + return (zio); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + continue; + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, + &blk); + zio_nowait(zio_read(zio, zio->io_spa, &blk, + abd_alloc_for_io(zio->io_size, B_TRUE), + zio->io_size, zio_ddt_child_read_done, dde, + zio->io_priority, ZIO_DDT_CHILD_FLAGS(zio) | + ZIO_FLAG_DONT_PROPAGATE, &zio->io_bookmark)); + } + return (zio); + } + + zio_nowait(zio_read(zio, zio->io_spa, bp, + zio->io_abd, zio->io_size, NULL, NULL, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); + + return (zio); +} + +static zio_t * +zio_ddt_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_DDT_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) 
== zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_vsd; + if (ddt == NULL) { + ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); + return (zio); + } + if (dde == NULL) { + zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + return (NULL); + } + if (dde->dde_repair_abd != NULL) { + abd_copy(zio->io_abd, dde->dde_repair_abd, + zio->io_size); + zio->io_child_error[ZIO_CHILD_DDT] = 0; + } + ddt_repair_done(ddt, dde); + zio->io_vsd = NULL; + } + + ASSERT(zio->io_vsd == NULL); + + return (zio); +} + +static boolean_t +zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) +{ + spa_t *spa = zio->io_spa; + boolean_t do_raw = !!(zio->io_flags & ZIO_FLAG_RAW); + + ASSERT(!(zio->io_bp_override && do_raw)); + + /* + * Note: we compare the original data, not the transformed data, + * because when zio->io_bp is an override bp, we will not have + * pushed the I/O transforms. That's an important optimization + * because otherwise we'd compress/encrypt all dmu_sync() data twice. + * However, we should never get a raw, override zio so in these + * cases we can compare the io_abd directly. This is useful because + * it allows us to do dedup verification even if we don't have access + * to the original data (for instance, if the encryption keys aren't + * loaded). + */ + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + zio_t *lio = dde->dde_lead_zio[p]; + + if (lio != NULL && do_raw) { + return (lio->io_size != zio->io_size || + abd_cmp(zio->io_abd, lio->io_abd) != 0); + } else if (lio != NULL) { + return (lio->io_orig_size != zio->io_orig_size || + abd_cmp(zio->io_orig_abd, lio->io_orig_abd) != 0); + } + } + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + + if (ddp->ddp_phys_birth != 0 && do_raw) { + blkptr_t blk = *zio->io_bp; + uint64_t psize; + abd_t *tmpabd; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + psize = BP_GET_PSIZE(&blk); + + if (psize != zio->io_size) + return (B_TRUE); + + ddt_exit(ddt); + + tmpabd = abd_alloc_for_io(psize, B_TRUE); + + error = zio_wait(zio_read(NULL, spa, &blk, tmpabd, + psize, NULL, NULL, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_RAW, &zio->io_bookmark)); + + if (error == 0) { + if (abd_cmp(tmpabd, zio->io_abd) != 0) + error = SET_ERROR(ENOENT); + } + + abd_free(tmpabd); + ddt_enter(ddt); + return (error != 0); + } else if (ddp->ddp_phys_birth != 0) { + arc_buf_t *abuf = NULL; + arc_flags_t aflags = ARC_FLAG_WAIT; + blkptr_t blk = *zio->io_bp; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + + if (BP_GET_LSIZE(&blk) != zio->io_orig_size) + return (B_TRUE); + + ddt_exit(ddt); + + error = arc_read(NULL, spa, &blk, + arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zio->io_bookmark); + + if (error == 0) { + if (abd_cmp_buf(zio->io_orig_abd, abuf->b_data, + zio->io_orig_size) != 0) + error = SET_ERROR(ENOENT); + arc_buf_destroy(abuf, &abuf); + } + + ddt_enter(ddt); + return (error != 0); + } + } + + return (B_FALSE); +} + +static void +zio_ddt_child_write_ready(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *pio; + + if (zio->io_error) + return; + + 
ddt_enter(ddt); + + ASSERT(dde->dde_lead_zio[p] == zio); + + ddt_phys_fill(ddp, zio->io_bp); + + zio_link_t *zl = NULL; + while ((pio = zio_walk_parents(zio, &zl)) != NULL) + ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + + ddt_exit(ddt); +} + +static void +zio_ddt_child_write_done(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + zio_link_t *zl = NULL; + while (zio_walk_parents(zio, &zl) != NULL) + ddt_phys_addref(ddp); + } else { + ddt_phys_clear(ddp); + } + + ddt_exit(ddt); +} + +static zio_t * +zio_ddt_write(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t txg = zio->io_txg; + zio_prop_t *zp = &zio->io_prop; + int p = zp->zp_copies; + zio_t *cio = NULL; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); + ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + ASSERT(!(zio->io_bp_override && (zio->io_flags & ZIO_FLAG_RAW))); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = &dde->dde_phys[p]; + + if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { + /* + * If we're using a weak checksum, upgrade to a strong checksum + * and try again. If we're already using a strong checksum, + * we can't resolve it, so just convert to an ordinary write. + * (And automatically e-mail a paper to Nature?) + */ + if (!(zio_checksum_table[zp->zp_checksum].ci_flags & + ZCHECKSUM_FLAG_DEDUP)) { + zp->zp_checksum = spa_dedup_checksum(spa); + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + BP_ZERO(bp); + } else { + zp->zp_dedup = B_FALSE; + BP_SET_DEDUP(bp, B_FALSE); + } + ASSERT(!BP_GET_DEDUP(bp)); + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (zio); + } + + if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { + if (ddp->ddp_phys_birth != 0) + ddt_bp_fill(ddp, bp, txg); + if (dde->dde_lead_zio[p] != NULL) + zio_add_child(zio, dde->dde_lead_zio[p]); + else + ddt_phys_addref(ddp); + } else if (zio->io_bp_override) { + ASSERT(bp->blk_birth == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_fill(ddp, bp); + ddt_phys_addref(ddp); + } else { + cio = zio_write(zio, spa, txg, bp, zio->io_orig_abd, + zio->io_orig_size, zio->io_orig_size, zp, + zio_ddt_child_write_ready, NULL, NULL, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_abd, zio->io_size, 0, NULL); + dde->dde_lead_zio[p] = cio; + } + + ddt_exit(ddt); + + zio_nowait(cio); + + return (zio); +} + +ddt_entry_t *freedde; /* for debugging */ + +static zio_t * +zio_ddt_free(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + ddt_enter(ddt); + freedde = dde = ddt_lookup(ddt, bp, B_TRUE); + if (dde) { + ddp = ddt_phys_select(dde, bp); + if (ddp) + ddt_phys_decref(ddp); + } + ddt_exit(ddt); + + return (zio); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ + +static 
zio_t * +zio_io_to_allocate(spa_t *spa, int allocator) +{ + zio_t *zio; + + ASSERT(MUTEX_HELD(&spa->spa_alloc_locks[allocator])); + + zio = avl_first(&spa->spa_alloc_trees[allocator]); + if (zio == NULL) + return (NULL); + + ASSERT(IO_IS_ALLOCATING(zio)); + + /* + * Try to place a reservation for this zio. If we're unable to + * reserve then we throttle. + */ + ASSERT3U(zio->io_allocator, ==, allocator); + if (!metaslab_class_throttle_reserve(zio->io_metaslab_class, + zio->io_prop.zp_copies, zio->io_allocator, zio, 0)) { + return (NULL); + } + + avl_remove(&spa->spa_alloc_trees[allocator], zio); + ASSERT3U(zio->io_stage, <, ZIO_STAGE_DVA_ALLOCATE); + + return (zio); +} + +static zio_t * +zio_dva_throttle(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + zio_t *nio; + metaslab_class_t *mc; + + /* locate an appropriate allocation class */ + mc = spa_preferred_class(spa, zio->io_size, zio->io_prop.zp_type, + zio->io_prop.zp_level, zio->io_prop.zp_zpl_smallblk); + + if (zio->io_priority == ZIO_PRIORITY_SYNC_WRITE || + !mc->mc_alloc_throttle_enabled || + zio->io_child_type == ZIO_CHILD_GANG || + zio->io_flags & ZIO_FLAG_NODATA) { + return (zio); + } + + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + ASSERT3U(zio->io_queued_timestamp, >, 0); + ASSERT(zio->io_stage == ZIO_STAGE_DVA_THROTTLE); + + zbookmark_phys_t *bm = &zio->io_bookmark; + /* + * We want to try to use as many allocators as possible to help improve + * performance, but we also want logically adjacent IOs to be physically + * adjacent to improve sequential read performance. We chunk each object + * into 2^20 block regions, and then hash based on the objset, object, + * level, and region to accomplish both of these goals. + */ + zio->io_allocator = cityhash4(bm->zb_objset, bm->zb_object, + bm->zb_level, bm->zb_blkid >> 20) % spa->spa_alloc_count; + mutex_enter(&spa->spa_alloc_locks[zio->io_allocator]); + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio->io_metaslab_class = mc; + avl_add(&spa->spa_alloc_trees[zio->io_allocator], zio); + nio = zio_io_to_allocate(spa, zio->io_allocator); + mutex_exit(&spa->spa_alloc_locks[zio->io_allocator]); + return (nio); +} + +static void +zio_allocate_dispatch(spa_t *spa, int allocator) +{ + zio_t *zio; + + mutex_enter(&spa->spa_alloc_locks[allocator]); + zio = zio_io_to_allocate(spa, allocator); + mutex_exit(&spa->spa_alloc_locks[allocator]); + if (zio == NULL) + return; + + ASSERT3U(zio->io_stage, ==, ZIO_STAGE_DVA_THROTTLE); + ASSERT0(zio->io_error); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_TRUE); +} + +static zio_t * +zio_dva_allocate(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + metaslab_class_t *mc; + blkptr_t *bp = zio->io_bp; + int error; + int flags = 0; + + if (zio->io_gang_leader == NULL) { + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + zio->io_gang_leader = zio; + } + + ASSERT(BP_IS_HOLE(bp)); + ASSERT0(BP_GET_NDVAS(bp)); + ASSERT3U(zio->io_prop.zp_copies, >, 0); + ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); + ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); + + flags |= (zio->io_flags & ZIO_FLAG_FASTWRITE) ? 
METASLAB_FASTWRITE : 0; + if (zio->io_flags & ZIO_FLAG_NODATA) + flags |= METASLAB_DONT_THROTTLE; + if (zio->io_flags & ZIO_FLAG_GANG_CHILD) + flags |= METASLAB_GANG_CHILD; + if (zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE) + flags |= METASLAB_ASYNC_ALLOC; + + /* + * if not already chosen, locate an appropriate allocation class + */ + mc = zio->io_metaslab_class; + if (mc == NULL) { + mc = spa_preferred_class(spa, zio->io_size, + zio->io_prop.zp_type, zio->io_prop.zp_level, + zio->io_prop.zp_zpl_smallblk); + zio->io_metaslab_class = mc; + } + + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio, zio->io_allocator); + + /* + * Fallback to normal class when an alloc class is full + */ + if (error == ENOSPC && mc != spa_normal_class(spa)) { + /* + * If throttling, transfer reservation over to normal class. + * The io_allocator slot can remain the same even though we + * are switching classes. + */ + if (mc->mc_alloc_throttle_enabled && + (zio->io_flags & ZIO_FLAG_IO_ALLOCATING)) { + metaslab_class_throttle_unreserve(mc, + zio->io_prop.zp_copies, zio->io_allocator, zio); + zio->io_flags &= ~ZIO_FLAG_IO_ALLOCATING; + + mc = spa_normal_class(spa); + VERIFY(metaslab_class_throttle_reserve(mc, + zio->io_prop.zp_copies, zio->io_allocator, zio, + flags | METASLAB_MUST_RESERVE)); + } else { + mc = spa_normal_class(spa); + } + zio->io_metaslab_class = mc; + + error = metaslab_alloc(spa, mc, zio->io_size, bp, + zio->io_prop.zp_copies, zio->io_txg, NULL, flags, + &zio->io_alloc_list, zio, zio->io_allocator); + } + + if (error != 0) { + zfs_dbgmsg("%s: metaslab allocation failure: zio %px, " + "size %llu, error %d", spa_name(spa), zio, zio->io_size, + error); + if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) + return (zio_write_gang_block(zio)); + zio->io_error = error; + } + + return (zio); +} + +static zio_t * +zio_dva_free(zio_t *zio) +{ + metaslab_free(zio->io_spa, zio->io_bp, zio->io_txg, B_FALSE); + + return (zio); +} + +static zio_t * +zio_dva_claim(zio_t *zio) +{ + int error; + + error = metaslab_claim(zio->io_spa, zio->io_bp, zio->io_txg); + if (error) + zio->io_error = error; + + return (zio); +} + +/* + * Undo an allocation. This is used by zio_done() when an I/O fails + * and we want to give back the block we just allocated. + * This handles both normal blocks and gang blocks. + */ +static void +zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) +{ + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); + ASSERT(zio->io_bp_override == NULL); + + if (!BP_IS_HOLE(bp)) + metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); + + if (gn != NULL) { + for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { + zio_dva_unallocate(zio, gn->gn_child[g], + &gn->gn_gbh->zg_blkptr[g]); + } + } +} + +/* + * Try to allocate an intent log block. Return 0 on success, errno on failure. + */ +int +zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, + uint64_t size, boolean_t *slog) +{ + int error = 1; + zio_alloc_list_t io_alloc_list; + + ASSERT(txg > spa_syncing_txg(spa)); + + metaslab_trace_init(&io_alloc_list); + + /* + * Block pointer fields are useful to metaslabs for stats and debugging. + * Fill in the obvious ones before calling into metaslab_alloc(). 
+ */ + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_PSIZE(new_bp, size); + BP_SET_LEVEL(new_bp, 0); + + /* + * When allocating a zil block, we don't have information about + * the final destination of the block except the objset it's part + * of, so we just hash the objset ID to pick the allocator to get + * some parallelism. + */ + error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, + txg, NULL, METASLAB_FASTWRITE, &io_alloc_list, NULL, + cityhash4(0, 0, 0, os->os_dsl_dataset->ds_object) % + spa->spa_alloc_count); + if (error == 0) { + *slog = TRUE; + } else { + error = metaslab_alloc(spa, spa_normal_class(spa), size, + new_bp, 1, txg, NULL, METASLAB_FASTWRITE, + &io_alloc_list, NULL, cityhash4(0, 0, 0, + os->os_dsl_dataset->ds_object) % spa->spa_alloc_count); + if (error == 0) + *slog = FALSE; + } + metaslab_trace_fini(&io_alloc_list); + + if (error == 0) { + BP_SET_LSIZE(new_bp, size); + BP_SET_PSIZE(new_bp, size); + BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(new_bp, + spa_version(spa) >= SPA_VERSION_SLIM_ZIL + ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); + BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); + BP_SET_LEVEL(new_bp, 0); + BP_SET_DEDUP(new_bp, 0); + BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); + + /* + * encrypted blocks will require an IV and salt. We generate + * these now since we will not be rewriting the bp at + * rewrite time. + */ + if (os->os_encrypted) { + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t salt[ZIO_DATA_SALT_LEN]; + + BP_SET_CRYPT(new_bp, B_TRUE); + VERIFY0(spa_crypt_get_salt(spa, + dmu_objset_id(os), salt)); + VERIFY0(zio_crypt_generate_iv(iv)); + + zio_crypt_encode_params_bp(new_bp, salt, iv); + } + } else { + zfs_dbgmsg("%s: zil block allocation failure: " + "size %llu, error %d", spa_name(spa), size, error); + } + + return (error); +} + +/* + * ========================================================================== + * Read and write to physical devices + * ========================================================================== + */ + +/* + * Issue an I/O to the underlying vdev. Typically the issue pipeline + * stops after this stage and will resume upon I/O completion. + * However, there are instances where the vdev layer may need to + * continue the pipeline when an I/O was not issued. Since the I/O + * that was sent to the vdev layer might be different than the one + * currently active in the pipeline (see vdev_queue_io()), we explicitly + * force the underlying vdev layers to call either zio_execute() or + * zio_interrupt() to ensure that the pipeline continues with the correct I/O. + */ +static zio_t * +zio_vdev_io_start(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + uint64_t align; + spa_t *spa = zio->io_spa; + + zio->io_delay = 0; + + ASSERT(zio->io_error == 0); + ASSERT(zio->io_child_error[ZIO_CHILD_VDEV] == 0); + + if (vd == NULL) { + if (!(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_enter(spa, SCL_ZIO, zio, RW_READER); + + /* + * The mirror_ops handle multiple DVAs in a single BP. + */ + vdev_mirror_ops.vdev_op_io_start(zio); + return (NULL); + } + + ASSERT3P(zio->io_logical, !=, zio); + if (zio->io_type == ZIO_TYPE_WRITE) { + ASSERT(spa->spa_trust_config); + + /* + * Note: the code can handle other kinds of writes, + * but we don't expect them. 
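+		 *
+		 * Concretely, the assertion below documents the writes we
+		 * do expect on a removing vdev: physical writes (e.g.
+		 * labels), self-heal, resilver, and fault-injection
+		 * (ZIO_FLAG_INDUCE_DAMAGE) writes.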
+ */ + if (zio->io_vd->vdev_removing) { + ASSERT(zio->io_flags & + (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | + ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); + } + } + + align = 1ULL << vd->vdev_top->vdev_ashift; + + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && + P2PHASE(zio->io_size, align) != 0) { + /* Transform logical writes to be a full physical block size. */ + uint64_t asize = P2ROUNDUP(zio->io_size, align); + abd_t *abuf = abd_alloc_sametype(zio->io_abd, asize); + ASSERT(vd == vd->vdev_top); + if (zio->io_type == ZIO_TYPE_WRITE) { + abd_copy(abuf, zio->io_abd, zio->io_size); + abd_zero_off(abuf, zio->io_size, asize - zio->io_size); + } + zio_push_transform(zio, abuf, asize, asize, zio_subblock); + } + + /* + * If this is not a physical io, make sure that it is properly aligned + * before proceeding. + */ + if (!(zio->io_flags & ZIO_FLAG_PHYSICAL)) { + ASSERT0(P2PHASE(zio->io_offset, align)); + ASSERT0(P2PHASE(zio->io_size, align)); + } else { + /* + * For physical writes, we allow 512b aligned writes and assume + * the device will perform a read-modify-write as necessary. + */ + ASSERT0(P2PHASE(zio->io_offset, SPA_MINBLOCKSIZE)); + ASSERT0(P2PHASE(zio->io_size, SPA_MINBLOCKSIZE)); + } + + VERIFY(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + + /* + * If this is a repair I/O, and there's no self-healing involved -- + * that is, we're just resilvering what we expect to resilver -- + * then don't do the I/O unless zio's txg is actually in vd's DTL. + * This prevents spurious resilvering. + * + * There are a few ways that we can end up creating these spurious + * resilver i/os: + * + * 1. A resilver i/o will be issued if any DVA in the BP has a + * dirty DTL. The mirror code will issue resilver writes to + * each DVA, including the one(s) that are not on vdevs with dirty + * DTLs. + * + * 2. With nested replication, which happens when we have a + * "replacing" or "spare" vdev that's a child of a mirror or raidz. + * For example, given mirror(replacing(A+B), C), it's likely that + * only A is out of date (it's the new device). In this case, we'll + * read from C, then use the data to resilver A+B -- but we don't + * actually want to resilver B, just A. The top-level mirror has no + * way to know this, so instead we just discard unnecessary repairs + * as we work our way down the vdev tree. + * + * 3. ZTEST also creates mirrors of mirrors, mirrors of raidz, etc. + * The same logic applies to any form of nested replication: ditto + * + mirror, RAID-Z + replacing, etc. + * + * However, indirect vdevs point off to other vdevs which may have + * DTL's, so we never bypass them. The child i/os on concrete vdevs + * will be properly bypassed instead. 
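+	 *
+	 * Sketch of the decision below (txg value hypothetical): a repair
+	 * write with io_txg == 100 to a vdev whose DTL_PARTIAL does not
+	 * contain txg 100 is discarded via zio_vdev_io_bypass() rather
+	 * than issued.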
+ */ + if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && + zio->io_txg != 0 && /* not a delegated i/o */ + vd->vdev_ops != &vdev_indirect_ops && + !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio_vdev_io_bypass(zio); + return (zio); + } + + if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM)) { + + if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio)) + return (zio); + + if ((zio = vdev_queue_io(zio)) == NULL) + return (NULL); + + if (!vdev_accessible(vd, zio)) { + zio->io_error = SET_ERROR(ENXIO); + zio_interrupt(zio); + return (NULL); + } + zio->io_delay = gethrtime(); + } + + vd->vdev_ops->vdev_op_io_start(zio); + return (NULL); +} + +static zio_t * +zio_vdev_io_done(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + vdev_ops_t *ops = vd ? vd->vdev_ops : &vdev_mirror_ops; + boolean_t unexpected_error = B_FALSE; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + ASSERT(zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_WRITE || zio->io_type == ZIO_TYPE_TRIM); + + if (zio->io_delay) + zio->io_delay = gethrtime() - zio->io_delay; + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf) { + + vdev_queue_io_done(zio); + + if (zio->io_type == ZIO_TYPE_WRITE) + vdev_cache_write(zio); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_device_injections(vd, zio, + EIO, EILSEQ); + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_label_injection(zio, EIO); + + if (zio->io_error && zio->io_type != ZIO_TYPE_TRIM) { + if (!vdev_accessible(vd, zio)) { + zio->io_error = SET_ERROR(ENXIO); + } else { + unexpected_error = B_TRUE; + } + } + } + + ops->vdev_op_io_done(zio); + + if (unexpected_error) + VERIFY(vdev_probe(vd, zio) == NULL); + + return (zio); +} + +/* + * This function is used to change the priority of an existing zio that is + * currently in-flight. This is used by the arc to upgrade priority in the + * event that a demand read is made for a block that is currently queued + * as a scrub or async read IO. Otherwise, the high priority read request + * would end up having to wait for the lower priority IO. + */ +void +zio_change_priority(zio_t *pio, zio_priority_t priority) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { + vdev_queue_change_io_priority(pio, priority); + } else { + pio->io_priority = priority; + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_change_priority(cio, priority); + } + mutex_exit(&pio->io_lock); +} + +/* + * For non-raidz ZIOs, we can just copy aside the bad data read from the + * disk, and use that to finish the checksum ereport later. 
+ */ +static void +zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, + const abd_t *good_buf) +{ + /* no processing needed */ + zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); +} + +/*ARGSUSED*/ +void +zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) +{ + void *abd = abd_alloc_sametype(zio->io_abd, zio->io_size); + + abd_copy(abd, zio->io_abd, zio->io_size); + + zcr->zcr_cbinfo = zio->io_size; + zcr->zcr_cbdata = abd; + zcr->zcr_finish = zio_vsd_default_cksum_finish; + zcr->zcr_free = zio_abd_free; +} + +static zio_t * +zio_vdev_io_assess(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + + if (zio_wait_for_children(zio, ZIO_CHILD_VDEV_BIT, ZIO_WAIT_DONE)) { + return (NULL); + } + + if (vd == NULL && !(zio->io_flags & ZIO_FLAG_CONFIG_WRITER)) + spa_config_exit(zio->io_spa, SCL_ZIO, zio); + + if (zio->io_vsd != NULL) { + zio->io_vsd_ops->vsd_free(zio); + zio->io_vsd = NULL; + } + + if (zio_injection_enabled && zio->io_error == 0) + zio->io_error = zio_handle_fault_injection(zio, EIO); + + /* + * If the I/O failed, determine whether we should attempt to retry it. + * + * On retry, we cut in line in the issue queue, since we don't want + * compression/checksumming/etc. work to prevent our (cheap) IO reissue. + */ + if (zio->io_error && vd == NULL && + !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { + ASSERT(!(zio->io_flags & ZIO_FLAG_DONT_QUEUE)); /* not a leaf */ + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_BYPASS)); /* not a leaf */ + zio->io_error = 0; + zio->io_flags |= ZIO_FLAG_IO_RETRY | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; + zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, + zio_requeue_io_start_cut_in_line); + return (NULL); + } + + /* + * If we got an error on a leaf device, convert it to ENXIO + * if the device is not accessible at all. + */ + if (zio->io_error && vd != NULL && vd->vdev_ops->vdev_op_leaf && + !vdev_accessible(vd, zio)) + zio->io_error = SET_ERROR(ENXIO); + + /* + * If we can't write to an interior vdev (mirror or RAID-Z), + * set vdev_cant_write so that we stop trying to allocate from it. + */ + if (zio->io_error == ENXIO && zio->io_type == ZIO_TYPE_WRITE && + vd != NULL && !vd->vdev_ops->vdev_op_leaf) { + vd->vdev_cant_write = B_TRUE; + } + + /* + * If a cache flush returns ENOTSUP or ENOTTY, we know that no future + * attempts will ever succeed. In this case we set a persistent + * boolean flag so that we don't bother with it in the future. 
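+	 *
+	 * For example (illustrative): a device whose driver rejects
+	 * DKIOCFLUSHWRITECACHE with ENOTSUP gets vdev_nowritecache set
+	 * once below, and later cache-flush requests to it are not
+	 * attempted again.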
+ */ + if ((zio->io_error == ENOTSUP || zio->io_error == ENOTTY) && + zio->io_type == ZIO_TYPE_IOCTL && + zio->io_cmd == DKIOCFLUSHWRITECACHE && vd != NULL) + vd->vdev_nowritecache = B_TRUE; + + if (zio->io_error) + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (vd != NULL && vd->vdev_ops->vdev_op_leaf && + zio->io_physdone != NULL) { + ASSERT(!(zio->io_flags & ZIO_FLAG_DELEGATED)); + ASSERT(zio->io_child_type == ZIO_CHILD_VDEV); + zio->io_physdone(zio->io_logical); + } + + return (zio); +} + +void +zio_vdev_io_reissue(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_stage >>= 1; +} + +void +zio_vdev_io_redone(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); + + zio->io_stage >>= 1; +} + +void +zio_vdev_io_bypass(zio_t *zio) +{ + ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); + ASSERT(zio->io_error == 0); + + zio->io_flags |= ZIO_FLAG_IO_BYPASS; + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; +} + +/* + * ========================================================================== + * Encrypt and store encryption parameters + * ========================================================================== + */ + + +/* + * This function is used for ZIO_STAGE_ENCRYPT. It is responsible for + * managing the storage of encryption parameters and passing them to the + * lower-level encryption functions. + */ +static zio_t * +zio_encrypt(zio_t *zio) +{ + zio_prop_t *zp = &zio->io_prop; + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t psize = BP_GET_PSIZE(bp); + uint64_t dsobj = zio->io_bookmark.zb_objset; + dmu_object_type_t ot = BP_GET_TYPE(bp); + void *enc_buf = NULL; + abd_t *eabd = NULL; + uint8_t salt[ZIO_DATA_SALT_LEN]; + uint8_t iv[ZIO_DATA_IV_LEN]; + uint8_t mac[ZIO_DATA_MAC_LEN]; + boolean_t no_crypt = B_FALSE; + + /* the root zio already encrypted the data */ + if (zio->io_child_type == ZIO_CHILD_GANG) + return (zio); + + /* only ZIL blocks are re-encrypted on rewrite */ + if (!IO_IS_ALLOCATING(zio) && ot != DMU_OT_INTENT_LOG) + return (zio); + + if (!(zp->zp_encrypt || BP_IS_ENCRYPTED(bp))) { + BP_SET_CRYPT(bp, B_FALSE); + return (zio); + } + + /* if we are doing raw encryption set the provided encryption params */ + if (zio->io_flags & ZIO_FLAG_RAW_ENCRYPT) { + ASSERT0(BP_GET_LEVEL(bp)); + BP_SET_CRYPT(bp, B_TRUE); + BP_SET_BYTEORDER(bp, zp->zp_byteorder); + if (ot != DMU_OT_OBJSET) + zio_crypt_encode_mac_bp(bp, zp->zp_mac); + + /* dnode blocks must be written out in the provided byteorder */ + if (zp->zp_byteorder != ZFS_HOST_BYTEORDER && + ot == DMU_OT_DNODE) { + void *bswap_buf = zio_buf_alloc(psize); + abd_t *babd = abd_get_from_buf(bswap_buf, psize); + + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + abd_copy_to_buf(bswap_buf, zio->io_abd, psize); + dmu_ot_byteswap[DMU_OT_BYTESWAP(ot)].ob_func(bswap_buf, + psize); + + abd_take_ownership_of_buf(babd, B_TRUE); + zio_push_transform(zio, babd, psize, psize, NULL); + } + + if (DMU_OT_IS_ENCRYPTED(ot)) + zio_crypt_encode_params_bp(bp, zp->zp_salt, zp->zp_iv); + return (zio); + } + + /* indirect blocks only maintain a cksum of the lower level MACs */ + if (BP_GET_LEVEL(bp) > 0) { + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(zio_crypt_do_indirect_mac_checksum_abd(B_TRUE, + zio->io_orig_abd, BP_GET_LSIZE(bp), BP_SHOULD_BYTESWAP(bp), + mac)); + zio_crypt_encode_mac_bp(bp, mac); + return (zio); + } + + /* + * Objset blocks are a special case since they have 2 256-bit MACs + * embedded within them. 
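+ * These are the portable and local MACs, which live in the + * objset_phys_t itself; spa_do_crypt_objset_mac_abd() below computes + * and embeds them there instead of encoding a MAC in the block + * pointer.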
+ */ + if (ot == DMU_OT_OBJSET) { + ASSERT0(DMU_OT_IS_ENCRYPTED(ot)); + ASSERT3U(BP_GET_COMPRESS(bp), ==, ZIO_COMPRESS_OFF); + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(spa_do_crypt_objset_mac_abd(B_TRUE, spa, dsobj, + zio->io_abd, psize, BP_SHOULD_BYTESWAP(bp))); + return (zio); + } + + /* unencrypted object types are only authenticated with a MAC */ + if (!DMU_OT_IS_ENCRYPTED(ot)) { + BP_SET_CRYPT(bp, B_TRUE); + VERIFY0(spa_do_crypt_mac_abd(B_TRUE, spa, dsobj, + zio->io_abd, psize, mac)); + zio_crypt_encode_mac_bp(bp, mac); + return (zio); + } + + /* + * Later passes of sync-to-convergence may decide to rewrite data + * in place to avoid more disk reallocations. This presents a problem + * for encryption because this constitutes rewriting the new data with + * the same encryption key and IV. However, this only applies to blocks + * in the MOS (particularly the spacemaps) and we do not encrypt the + * MOS. We assert that the zio is allocating or an intent log write + * to enforce this. + */ + ASSERT(IO_IS_ALLOCATING(zio) || ot == DMU_OT_INTENT_LOG); + ASSERT(BP_GET_LEVEL(bp) == 0 || ot == DMU_OT_INTENT_LOG); + ASSERT(spa_feature_is_active(spa, SPA_FEATURE_ENCRYPTION)); + ASSERT3U(psize, !=, 0); + + enc_buf = zio_buf_alloc(psize); + eabd = abd_get_from_buf(enc_buf, psize); + abd_take_ownership_of_buf(eabd, B_TRUE); + + /* + * For an explanation of what encryption parameters are stored + * where, see the block comment in zio_crypt.c. + */ + if (ot == DMU_OT_INTENT_LOG) { + zio_crypt_decode_params_bp(bp, salt, iv); + } else { + BP_SET_CRYPT(bp, B_TRUE); + } + + /* Perform the encryption. This should not fail */ + VERIFY0(spa_do_crypt_abd(B_TRUE, spa, &zio->io_bookmark, + BP_GET_TYPE(bp), BP_GET_DEDUP(bp), BP_SHOULD_BYTESWAP(bp), + salt, iv, mac, psize, zio->io_abd, eabd, &no_crypt)); + + /* encode encryption metadata into the bp */ + if (ot == DMU_OT_INTENT_LOG) { + /* + * ZIL blocks store the MAC in the embedded checksum, so the + * transform must always be applied. + */ + zio_crypt_encode_mac_zil(enc_buf, mac); + zio_push_transform(zio, eabd, psize, psize, NULL); + } else { + BP_SET_CRYPT(bp, B_TRUE); + zio_crypt_encode_params_bp(bp, salt, iv); + zio_crypt_encode_mac_bp(bp, mac); + + if (no_crypt) { + ASSERT3U(ot, ==, DMU_OT_DNODE); + abd_free(eabd); + } else { + zio_push_transform(zio, eabd, psize, psize, NULL); + } + } + + return (zio); +} + +/* + * ========================================================================== + * Generate and verify checksums + * ========================================================================== + */ +static zio_t * +zio_checksum_generate(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + enum zio_checksum checksum; + + if (bp == NULL) { + /* + * This is zio_write_phys(). + * We're either generating a label checksum, or none at all. + */ + checksum = zio->io_prop.zp_checksum; + + if (checksum == ZIO_CHECKSUM_OFF) + return (zio); + + ASSERT(checksum == ZIO_CHECKSUM_LABEL); + } else { + if (BP_IS_GANG(bp) && zio->io_child_type == ZIO_CHILD_GANG) { + ASSERT(!IO_IS_ALLOCATING(zio)); + checksum = ZIO_CHECKSUM_GANG_HEADER; + } else { + checksum = BP_GET_CHECKSUM(bp); + } + } + + zio_checksum_compute(zio, checksum, zio->io_abd, zio->io_size); + + return (zio); +} + +static zio_t * +zio_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t info; + blkptr_t *bp = zio->io_bp; + int error; + + ASSERT(zio->io_vd != NULL); + + if (bp == NULL) { + /* + * This is zio_read_phys(). + * We're either verifying a label checksum, or nothing at all. 
+ */ + if (zio->io_prop.zp_checksum == ZIO_CHECKSUM_OFF) + return (zio); + + ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); + } + + if ((error = zio_checksum_error(zio, &info)) != 0) { + zio->io_error = error; + if (error == ECKSUM && + !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_checksum_errors++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + + zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, &zio->io_bookmark, zio, + zio->io_offset, zio->io_size, NULL, &info); + } + } + + return (zio); +} + +/* + * Called by RAID-Z to ensure we don't compute the checksum twice. + */ +void +zio_checksum_verified(zio_t *zio) +{ + zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; +} + +/* + * ========================================================================== + * Error rank. Errors are ranked in the order 0, ENXIO, ECKSUM, EIO, other. + * An error of 0 indicates success. ENXIO indicates whole-device failure, + * which may be transient (e.g. unplugged) or permanent. ECKSUM and EIO + * indicate errors that are specific to one I/O, and most likely permanent. + * Any other error is presumed to be worse because we weren't expecting it. + * ========================================================================== + */ +int +zio_worst_error(int e1, int e2) +{ + static int zio_error_rank[] = { 0, ENXIO, ECKSUM, EIO }; + int r1, r2; + + for (r1 = 0; r1 < sizeof (zio_error_rank) / sizeof (int); r1++) + if (e1 == zio_error_rank[r1]) + break; + + for (r2 = 0; r2 < sizeof (zio_error_rank) / sizeof (int); r2++) + if (e2 == zio_error_rank[r2]) + break; + + return (r1 > r2 ? e1 : e2); +} + +/* + * ========================================================================== + * I/O completion + * ========================================================================== + */ +static zio_t * +zio_ready(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + zio_t *pio, *pio_next; + zio_link_t *zl = NULL; + + if (zio_wait_for_children(zio, ZIO_CHILD_GANG_BIT | ZIO_CHILD_DDT_BIT, + ZIO_WAIT_READY)) { + return (NULL); + } + + if (zio->io_ready) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp) || + (zio->io_flags & ZIO_FLAG_NOPWRITE)); + ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); + + zio->io_ready(zio); + } + + if (bp != NULL && bp != &zio->io_bp_copy) + zio->io_bp_copy = *bp; + + if (zio->io_error != 0) { + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(IO_IS_ALLOCATING(zio)); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(zio->io_metaslab_class != NULL); + + /* + * We were unable to allocate anything, so unreserve and + * issue the next I/O to allocate. + */ + metaslab_class_throttle_unreserve( + zio->io_metaslab_class, zio->io_prop.zp_copies, + zio->io_allocator, zio); + zio_allocate_dispatch(zio->io_spa, zio->io_allocator); + } + } + + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_READY] = 1; + pio = zio_walk_parents(zio, &zl); + mutex_exit(&zio->io_lock); + + /* + * As we notify zio's parents, new parents could be added. + * New parents go to the head of zio's io_parent_list, however, + * so we will (correctly) not notify them. The remainder of zio's + * io_parent_list, from 'pio_next' onward, cannot change because + * all parents must wait for us to be done before they can be done.
+ */ + for (; pio != NULL; pio = pio_next) { + pio_next = zio_walk_parents(zio, &zl); + zio_notify_parent(pio, zio, ZIO_WAIT_READY, NULL); + } + + if (zio->io_flags & ZIO_FLAG_NODATA) { + if (BP_IS_GANG(bp)) { + zio->io_flags &= ~ZIO_FLAG_NODATA; + } else { + ASSERT((uintptr_t)zio->io_abd < SPA_MAXBLOCKSIZE); + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } + } + + if (zio_injection_enabled && + zio->io_spa->spa_syncing_txg == zio->io_txg) + zio_handle_ignored_writes(zio); + + return (zio); +} + +/* + * Update the allocation throttle accounting. + */ +static void +zio_dva_throttle_done(zio_t *zio) +{ + zio_t *lio __maybe_unused = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + vdev_t *vd = zio->io_vd; + int flags = METASLAB_ASYNC_ALLOC; + + ASSERT3P(zio->io_bp, !=, NULL); + ASSERT3U(zio->io_type, ==, ZIO_TYPE_WRITE); + ASSERT3U(zio->io_priority, ==, ZIO_PRIORITY_ASYNC_WRITE); + ASSERT3U(zio->io_child_type, ==, ZIO_CHILD_VDEV); + ASSERT(vd != NULL); + ASSERT3P(vd, ==, vd->vdev_top); + ASSERT(zio_injection_enabled || !(zio->io_flags & ZIO_FLAG_IO_RETRY)); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); + ASSERT(zio->io_flags & ZIO_FLAG_IO_ALLOCATING); + ASSERT(!(lio->io_flags & ZIO_FLAG_IO_REWRITE)); + ASSERT(!(lio->io_orig_flags & ZIO_FLAG_NODATA)); + + /* + * Parents of gang children can have two flavors -- ones that + * allocated the gang header (will have ZIO_FLAG_IO_REWRITE set) + * and ones that allocated the constituent blocks. The allocation + * throttle needs to know the allocating parent zio so we must find + * it here. + */ + if (pio->io_child_type == ZIO_CHILD_GANG) { + /* + * If our parent is a rewrite gang child then our grandparent + * would have been the one that performed the allocation. + */ + if (pio->io_flags & ZIO_FLAG_IO_REWRITE) + pio = zio_unique_parent(pio); + flags |= METASLAB_GANG_CHILD; + } + + ASSERT(IO_IS_ALLOCATING(pio)); + ASSERT3P(zio, !=, zio->io_logical); + ASSERT(zio->io_logical != NULL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REPAIR)); + ASSERT0(zio->io_flags & ZIO_FLAG_NOPWRITE); + ASSERT(zio->io_metaslab_class != NULL); + + mutex_enter(&pio->io_lock); + metaslab_group_alloc_decrement(zio->io_spa, vd->vdev_id, pio, flags, + pio->io_allocator, B_TRUE); + mutex_exit(&pio->io_lock); + + metaslab_class_throttle_unreserve(zio->io_metaslab_class, 1, + pio->io_allocator, pio); + + /* + * Call into the pipeline to see if there is more work that + * needs to be done. If there is work to be done it will be + * dispatched to another taskq thread. + */ + zio_allocate_dispatch(zio->io_spa, pio->io_allocator); +} + +static zio_t * +zio_done(zio_t *zio) +{ + /* + * Always attempt to keep stack usage minimal here since + * we can be called recursively up to 19 levels deep. + */ + const uint64_t psize = zio->io_size; + zio_t *pio, *pio_next; + zio_link_t *zl = NULL; + + /* + * If our children haven't all completed, + * wait for them and then repeat this pipeline stage. + */ + if (zio_wait_for_children(zio, ZIO_CHILD_ALL_BITS, ZIO_WAIT_DONE)) { + return (NULL); + } + + /* + * If the allocation throttle is enabled, then update the accounting. + * We only track child I/Os that are part of an allocating async + * write. We must do this since the allocation is performed + * by the logical I/O but the actual write is done by child I/Os. 
+ */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING && + zio->io_child_type == ZIO_CHILD_VDEV) { + ASSERT(zio->io_metaslab_class != NULL); + ASSERT(zio->io_metaslab_class->mc_alloc_throttle_enabled); + zio_dva_throttle_done(zio); + } + + /* + * If the allocation throttle is enabled, verify that + * we have decremented the refcounts for every I/O that was throttled. + */ + if (zio->io_flags & ZIO_FLAG_IO_ALLOCATING) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_priority == ZIO_PRIORITY_ASYNC_WRITE); + ASSERT(zio->io_bp != NULL); + + metaslab_group_alloc_verify(zio->io_spa, zio->io_bp, zio, + zio->io_allocator); + VERIFY(zfs_refcount_not_held( + &zio->io_metaslab_class->mc_alloc_slots[zio->io_allocator], + zio)); + } + + + for (int c = 0; c < ZIO_CHILD_TYPES; c++) + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + ASSERT(zio->io_children[c][w] == 0); + + if (zio->io_bp != NULL && !BP_IS_EMBEDDED(zio->io_bp)) { + ASSERT(zio->io_bp->blk_pad[0] == 0); + ASSERT(zio->io_bp->blk_pad[1] == 0); + ASSERT(bcmp(zio->io_bp, &zio->io_bp_copy, + sizeof (blkptr_t)) == 0 || + (zio->io_bp == zio_unique_parent(zio)->io_bp)); + if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(zio->io_bp) && + zio->io_bp_override == NULL && + !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { + ASSERT3U(zio->io_prop.zp_copies, <=, + BP_GET_NDVAS(zio->io_bp)); + ASSERT(BP_COUNT_GANG(zio->io_bp) == 0 || + (BP_COUNT_GANG(zio->io_bp) == + BP_GET_NDVAS(zio->io_bp))); + } + if (zio->io_flags & ZIO_FLAG_NOPWRITE) + VERIFY(BP_EQUAL(zio->io_bp, &zio->io_bp_orig)); + } + + /* + * If there were child vdev/gang/ddt errors, they apply to us now. + */ + zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); + zio_inherit_child_errors(zio, ZIO_CHILD_GANG); + zio_inherit_child_errors(zio, ZIO_CHILD_DDT); + + /* + * If the I/O on the transformed data was successful, generate any + * checksum reports now while we still have the transformed data. + */ + if (zio->io_error == 0) { + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + uint64_t align = zcr->zcr_align; + uint64_t asize = P2ROUNDUP(psize, align); + abd_t *adata = zio->io_abd; + + if (asize != psize) { + adata = abd_alloc(asize, B_TRUE); + abd_copy(adata, zio->io_abd, psize); + abd_zero_off(adata, psize, asize - psize); + } + + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, adata); + zfs_ereport_free_checksum(zcr); + + if (asize != psize) + abd_free(adata); + } + } + + zio_pop_transforms(zio); /* note: may set zio->io_error */ + + vdev_stat_update(zio, psize); + + /* + * If this I/O is attached to a particular vdev and is slow, exceeding + * zio_slow_io_ms (30 seconds by default) to complete, post an error + * describing the I/O delay. We ignore these errors if the device is + * currently unavailable. + */ + if (zio->io_delay >= MSEC2NSEC(zio_slow_io_ms)) { + if (zio->io_vd != NULL && !vdev_is_dead(zio->io_vd)) { + /* + * We only want to increment our slow IO counters if + * the IO is valid (i.e. not if the drive is removed). + * + * zfs_ereport_post() will also do these checks, but + * it can also ratelimit and have other failures, so we + * need to increment the slow_io counters independent + * of it.
+ */ + if (zfs_ereport_is_valid(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, zio)) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + zio->io_vd->vdev_stat.vs_slow_ios++; + mutex_exit(&zio->io_vd->vdev_stat_lock); + + zfs_ereport_post(FM_EREPORT_ZFS_DELAY, + zio->io_spa, zio->io_vd, &zio->io_bookmark, + zio, 0, 0); + } + } + } + + if (zio->io_error) { + /* + * If this I/O is attached to a particular vdev, + * generate an error message describing the I/O failure + * at the block level. We ignore these errors if the + * device is currently unavailable. + */ + if (zio->io_error != ECKSUM && zio->io_vd != NULL && + !vdev_is_dead(zio->io_vd)) { + mutex_enter(&zio->io_vd->vdev_stat_lock); + if (zio->io_type == ZIO_TYPE_READ) { + zio->io_vd->vdev_stat.vs_read_errors++; + } else if (zio->io_type == ZIO_TYPE_WRITE) { + zio->io_vd->vdev_stat.vs_write_errors++; + } + mutex_exit(&zio->io_vd->vdev_stat_lock); + + zfs_ereport_post(FM_EREPORT_ZFS_IO, zio->io_spa, + zio->io_vd, &zio->io_bookmark, zio, 0, 0); + } + + if ((zio->io_error == EIO || !(zio->io_flags & + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + zio == zio->io_logical) { + /* + * For logical I/O requests, tell the SPA to log the + * error and generate a logical data ereport. + */ + spa_log_error(zio->io_spa, &zio->io_bookmark); + zfs_ereport_post(FM_EREPORT_ZFS_DATA, zio->io_spa, + NULL, &zio->io_bookmark, zio, 0, 0); + } + } + + if (zio->io_error && zio == zio->io_logical) { + /* + * Determine whether zio should be reexecuted. This will + * propagate all the way to the root via zio_notify_parent(). + */ + ASSERT(zio->io_vd == NULL && zio->io_bp != NULL); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (IO_IS_ALLOCATING(zio) && + !(zio->io_flags & ZIO_FLAG_CANFAIL)) { + if (zio->io_error != ENOSPC) + zio->io_reexecute |= ZIO_REEXECUTE_NOW; + else + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + } + + if ((zio->io_type == ZIO_TYPE_READ || + zio->io_type == ZIO_TYPE_FREE) && + !(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && + zio->io_error == ENXIO && + spa_load_state(zio->io_spa) == SPA_LOAD_NONE && + spa_get_failmode(zio->io_spa) != ZIO_FAILURE_MODE_CONTINUE) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) + zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + /* + * Here is a possibly good place to attempt to do + * either combinatorial reconstruction or error correction + * based on checksums. It also might be a good place + * to send out preliminary ereports before we suspend + * processing. + */ + } + + /* + * If there were logical child errors, they apply to us now. + * We defer this until now to avoid conflating logical child + * errors with errors that happened to the zio itself when + * updating vdev stats and reporting FMA events above. + */ + zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + + if ((zio->io_error || zio->io_reexecute) && + IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && + !(zio->io_flags & (ZIO_FLAG_IO_REWRITE | ZIO_FLAG_NOPWRITE))) + zio_dva_unallocate(zio, zio->io_gang_tree, zio->io_bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + /* + * Godfather I/Os should never suspend. + */ + if ((zio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) + zio->io_reexecute &= ~ZIO_REEXECUTE_SUSPEND; + + if (zio->io_reexecute) { + /* + * This is a logical I/O that wants to reexecute. + * + * Reexecute is top-down. When an i/o fails, if it's not + * the root, it simply notifies its parent and sticks around. 
+ * The parent, seeing that it still has children in zio_done(), + * does the same. This percolates all the way up to the root. + * The root i/o will reexecute or suspend the entire tree. + * + * This approach ensures that zio_reexecute() honors + * all the original i/o dependency relationships, e.g. + * parents not executing until children are ready. + */ + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + zio->io_gang_leader = NULL; + + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * "The Godfather" I/O monitors its children but is + * not a true parent to them. It will track them through + * the pipeline but severs its ties whenever they get into + * trouble (e.g. suspended). This allows "The Godfather" + * I/O to return status without blocking. + */ + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; + pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + + if ((pio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + zio_remove_child(pio, zio, remove_zl); + /* + * This is a rare code path, so we don't + * bother with "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, + NULL); + } + } + + if ((pio = zio_unique_parent(zio)) != NULL) { + /* + * We're not a root i/o, so there's nothing to do + * but notify our parent. Don't propagate errors + * upward since we haven't permanently failed yet. + */ + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); + zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; + /* + * This is a rare code path, so we don't bother with + * "next_to_execute". + */ + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, NULL); + } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { + /* + * We'd fail again if we reexecuted now, so suspend + * until conditions improve (e.g. device comes online). + */ + zio_suspend(zio->io_spa, zio, ZIO_SUSPEND_IOERR); + } else { + /* + * Reexecution is potentially a huge amount of work. + * Hand it off to the otherwise-unused claim taskq. + */ + ASSERT(taskq_empty_ent(&zio->io_tqent)); + spa_taskq_dispatch_ent(zio->io_spa, + ZIO_TYPE_CLAIM, ZIO_TASKQ_ISSUE, + (task_func_t *)zio_reexecute, zio, 0, + &zio->io_tqent); + } + return (NULL); + } + + ASSERT(zio->io_child_count == 0); + ASSERT(zio->io_reexecute == 0); + ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + + /* + * Report any checksum errors, since the I/O is complete. + */ + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, NULL); + zfs_ereport_free_checksum(zcr); + } + + if (zio->io_flags & ZIO_FLAG_FASTWRITE && zio->io_bp && + !BP_IS_HOLE(zio->io_bp) && !BP_IS_EMBEDDED(zio->io_bp) && + !(zio->io_flags & ZIO_FLAG_NOPWRITE)) { + metaslab_fastwrite_unmark(zio->io_spa, zio->io_bp); + } + + /* + * It is the responsibility of the done callback to ensure that this + * particular zio is no longer discoverable for adoption, and as + * such, cannot acquire any new parents. + */ + if (zio->io_done) + zio->io_done(zio); + + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * We are done executing this zio. We may want to execute a parent + * next. See the comment in zio_notify_parent(). 
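+ * Executing that parent directly, rather than dispatching it to a + * taskq, keeps the common completion path on a single thread.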
+ */ + zio_t *next_to_execute = NULL; + zl = NULL; + for (pio = zio_walk_parents(zio, &zl); pio != NULL; pio = pio_next) { + zio_link_t *remove_zl = zl; + pio_next = zio_walk_parents(zio, &zl); + zio_remove_child(pio, zio, remove_zl); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE, &next_to_execute); + } + + if (zio->io_waiter != NULL) { + mutex_enter(&zio->io_lock); + zio->io_executor = NULL; + cv_broadcast(&zio->io_cv); + mutex_exit(&zio->io_lock); + } else { + zio_destroy(zio); + } + + return (next_to_execute); +} + +/* + * ========================================================================== + * I/O pipeline definition + * ========================================================================== + */ +static zio_pipe_stage_t *zio_pipeline[] = { + NULL, + zio_read_bp_init, + zio_write_bp_init, + zio_free_bp_init, + zio_issue_async, + zio_write_compress, + zio_encrypt, + zio_checksum_generate, + zio_nop_write, + zio_ddt_read_start, + zio_ddt_read_done, + zio_ddt_write, + zio_ddt_free, + zio_gang_assemble, + zio_gang_issue, + zio_dva_throttle, + zio_dva_allocate, + zio_dva_free, + zio_dva_claim, + zio_ready, + zio_vdev_io_start, + zio_vdev_io_done, + zio_vdev_io_assess, + zio_checksum_verify, + zio_done +}; + + + + +/* + * Compare two zbookmark_phys_t's to see which we would reach first in a + * pre-order traversal of the object tree. + * + * This is simple in every case aside from the meta-dnode object. For all other + * objects, we traverse them in order (object 1 before object 2, and so on). + * However, all of these objects are traversed while traversing object 0, since + * the data it points to is the list of objects. Thus, we need to convert to a + * canonical representation so we can compare meta-dnode bookmarks to + * non-meta-dnode bookmarks. + * + * We do this by calculating "equivalents" for each field of the zbookmark. + * zbookmarks outside of the meta-dnode use their own object and level, and + * calculate the level 0 equivalent (the first L0 blkid that is contained in the + * blocks this bookmark refers to) by multiplying their blkid by their span + * (the number of L0 blocks contained within one block at their level). + * zbookmarks inside the meta-dnode calculate their object equivalent + * (which is L0equiv * dnodes per data block), use 0 for their L0equiv, and use + * level + 1<<31 (any value larger than a level could ever be) for their level. + * This causes them to always compare before a bookmark in their object + * equivalent, compare appropriately to bookmarks in other objects, and to + * compare appropriately to other bookmarks in the meta-dnode. + */ +int +zbookmark_compare(uint16_t dbss1, uint8_t ibs1, uint16_t dbss2, uint8_t ibs2, + const zbookmark_phys_t *zb1, const zbookmark_phys_t *zb2) +{ + /* + * These variables represent the "equivalent" values for the zbookmark, + * after converting zbookmarks inside the meta dnode to their + * normal-object equivalents. + */ + uint64_t zb1obj, zb2obj; + uint64_t zb1L0, zb2L0; + uint64_t zb1level, zb2level; + + if (zb1->zb_object == zb2->zb_object && + zb1->zb_level == zb2->zb_level && + zb1->zb_blkid == zb2->zb_blkid) + return (0); + + IMPLY(zb1->zb_level > 0, ibs1 >= SPA_MINBLOCKSHIFT); + IMPLY(zb2->zb_level > 0, ibs2 >= SPA_MINBLOCKSHIFT); + + /* + * BP_SPANB calculates the span in blocks. 
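+ * + * A worked example (values assumed for illustration only): with + * 128-byte block pointers and 128K indirect blocks, each indirect + * level fans out 1024 ways, so a level-1 bookmark with blkid 3 + * canonicalizes to an L0 equivalent of 3 * 1024 = 3072.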
+ */ + zb1L0 = (zb1->zb_blkid) * BP_SPANB(ibs1, zb1->zb_level); + zb2L0 = (zb2->zb_blkid) * BP_SPANB(ibs2, zb2->zb_level); + + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { + zb1obj = zb1L0 * (dbss1 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb1L0 = 0; + zb1level = zb1->zb_level + COMPARE_META_LEVEL; + } else { + zb1obj = zb1->zb_object; + zb1level = zb1->zb_level; + } + + if (zb2->zb_object == DMU_META_DNODE_OBJECT) { + zb2obj = zb2L0 * (dbss2 << (SPA_MINBLOCKSHIFT - DNODE_SHIFT)); + zb2L0 = 0; + zb2level = zb2->zb_level + COMPARE_META_LEVEL; + } else { + zb2obj = zb2->zb_object; + zb2level = zb2->zb_level; + } + + /* Now that we have a canonical representation, do the comparison. */ + if (zb1obj != zb2obj) + return (zb1obj < zb2obj ? -1 : 1); + else if (zb1L0 != zb2L0) + return (zb1L0 < zb2L0 ? -1 : 1); + else if (zb1level != zb2level) + return (zb1level > zb2level ? -1 : 1); + /* + * This can (theoretically) happen if the bookmarks have the same object + * and level, but different blkids, if the block sizes are not the same. + * There is presently no way to change the indirect block sizes + */ + return (0); +} + +/* + * This function checks the following: given that last_block is the place that + * our traversal stopped last time, does that guarantee that we've visited + * every node under subtree_root? Therefore, we can't just use the raw output + * of zbookmark_compare. We have to pass in a modified version of + * subtree_root; by incrementing the block id, and then checking whether + * last_block is before or equal to that, we can tell whether or not having + * visited last_block implies that all of subtree_root's children have been + * visited. + */ +boolean_t +zbookmark_subtree_completed(const dnode_phys_t *dnp, + const zbookmark_phys_t *subtree_root, const zbookmark_phys_t *last_block) +{ + zbookmark_phys_t mod_zb = *subtree_root; + mod_zb.zb_blkid++; + ASSERT(last_block->zb_level == 0); + + /* The objset_phys_t isn't before anything. */ + if (dnp == NULL) + return (B_FALSE); + + /* + * We pass in 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT) for the + * data block size in sectors, because that variable is only used if + * the bookmark refers to a block in the meta-dnode. Since we don't + * know without examining it what object it refers to, and there's no + * harm in passing in this value in other cases, we always pass it in. + * + * We pass in 0 for the indirect block size shift because zb2 must be + * level 0. The indirect block size is only used to calculate the span + * of the bookmark, but since the bookmark must be level 0, the span is + * always 1, so the math works out. + * + * If you make changes to how the zbookmark_compare code works, be sure + * to make sure that this code still works afterwards. 
+ */ + return (zbookmark_compare(dnp->dn_datablkszsec, dnp->dn_indblkshift, + 1ULL << (DNODE_BLOCK_SHIFT - SPA_MINBLOCKSHIFT), 0, &mod_zb, + last_block) <= 0); +} + +EXPORT_SYMBOL(zio_type_name); +EXPORT_SYMBOL(zio_buf_alloc); +EXPORT_SYMBOL(zio_data_buf_alloc); +EXPORT_SYMBOL(zio_buf_free); +EXPORT_SYMBOL(zio_data_buf_free); + +/* BEGIN CSTYLED */ +ZFS_MODULE_PARAM(zfs_zio, zio_, slow_io_ms, INT, ZMOD_RW, + "Max I/O completion time (milliseconds) before marking it as slow"); + +ZFS_MODULE_PARAM(zfs_zio, zio_, requeue_io_start_cut_in_line, INT, ZMOD_RW, + "Prioritize requeued I/O"); + +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_deferred_free, INT, ZMOD_RW, + "Defer frees starting in this pass"); + +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_dont_compress, INT, ZMOD_RW, + "Don't compress starting in this pass"); + +ZFS_MODULE_PARAM(zfs, zfs_, sync_pass_rewrite, INT, ZMOD_RW, + "Rewrite new bps starting in this pass"); + +ZFS_MODULE_PARAM(zfs_zio, zio_, dva_throttle_enabled, INT, ZMOD_RW, + "Throttle block allocations in the ZIO pipeline"); + +ZFS_MODULE_PARAM(zfs_zio, zio_, deadman_log_all, INT, ZMOD_RW, + "Log all slow ZIOs, not just those with vdevs"); +/* END CSTYLED */ diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c new file mode 100644 index 000000000000..f8fee78c6068 --- /dev/null +++ b/module/zfs/zio_checksum.c @@ -0,0 +1,570 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2013, 2016 by Delphix. All rights reserved. + * Copyright 2013 Saso Kiselkov. All rights reserved. + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/spa_impl.h> +#include <sys/zio.h> +#include <sys/zio_checksum.h> +#include <sys/zil.h> +#include <sys/abd.h> +#include <zfs_fletcher.h> + +/* + * Checksum vectors. + * + * In the SPA, everything is checksummed. We support checksum vectors + * for three distinct reasons: + * + * 1. Different kinds of data need different levels of protection. + * For SPA metadata, we always want a very strong checksum. + * For user data, we let users make the trade-off between speed + * and checksum strength. + * + * 2. Cryptographic hash and MAC algorithms are an area of active research. + * It is likely that future hash functions will be at least as strong + * as current best-of-breed, and may be substantially faster as well. + * We want the ability to take advantage of these new hashes as soon as + * they become available. + * + * 3. If someone develops hardware that can compute a strong hash quickly, + * we want the ability to take advantage of that hardware. + * + * Of course, we don't want a checksum upgrade to invalidate existing + * data, so we store the checksum *function* in eight bits of the bp. + * This gives us room for up to 256 different checksum functions.
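+ * (These are the bits accessed by the BP_GET_CHECKSUM() and + * BP_SET_CHECKSUM() macros.)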
+ * + * When writing a block, we always checksum it with the latest-and-greatest + * checksum function of the appropriate strength. When reading a block, + * we compare the expected checksum against the actual checksum, which we + * compute via the checksum function specified by BP_GET_CHECKSUM(bp). + * + * SALTED CHECKSUMS + * + * To enable the use of less secure hash algorithms with dedup, we + * introduce the notion of salted checksums (MACs, really). A salted + * checksum is fed both a random 256-bit value (the salt) and the data + * to be checksummed. This salt is kept secret (stored on the pool, but + * never shown to the user). Thus even if an attacker knew of collision + * weaknesses in the hash algorithm, they won't be able to mount a known + * plaintext attack on the DDT, since the actual hash value cannot be + * known ahead of time. How the salt is used is algorithm-specific + * (some might simply prefix it to the data block, others might need to + * utilize a full-blown HMAC). On disk the salt is stored in a ZAP + * object in the MOS (DMU_POOL_CHECKSUM_SALT). + * + * CONTEXT TEMPLATES + * + * Some hashing algorithms need to perform a substantial amount of + * initialization work (e.g. salted checksums above may need to pre-hash + * the salt) before being able to process data. Performing this + * redundant work for each block would be wasteful, so we instead allow + * a checksum algorithm to do the work once (the first time it's used) + * and then keep this pre-initialized context as a template inside the + * spa_t (spa_cksum_tmpls). If the zio_checksum_info_t contains + * non-NULL ci_tmpl_init and ci_tmpl_free callbacks, they are used to + * construct and destruct the pre-initialized checksum context. The + * pre-initialized context is then reused during each checksum + * invocation and passed to the checksum function. 
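+ * + * A rough sketch of that lifecycle, mirroring the functions below + * (illustrative only): + * + * tmpl = ci->ci_tmpl_init(&spa->spa_cksum_salt); <- first use + * ci->ci_func[0](abd, size, tmpl, &cksum); <- every block + * ci->ci_tmpl_free(tmpl); <- pool teardown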
+ */ + +/*ARGSUSED*/ +static void +abd_checksum_off(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); +} + +/*ARGSUSED*/ +static void +abd_fletcher_2_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_native, zcp); +} + +/*ARGSUSED*/ +static void +abd_fletcher_2_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_init(zcp); + (void) abd_iterate_func(abd, 0, size, + fletcher_2_incremental_byteswap, zcp); +} + +static inline void +abd_fletcher_4_impl(abd_t *abd, uint64_t size, zio_abd_checksum_data_t *acdp) +{ + fletcher_4_abd_ops.acf_init(acdp); + abd_iterate_func(abd, 0, size, fletcher_4_abd_ops.acf_iter, acdp); + fletcher_4_abd_ops.acf_fini(acdp); +} + +/*ARGSUSED*/ +void +abd_fletcher_4_native(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_4_ctx_t ctx; + + zio_abd_checksum_data_t acd = { + .acd_byteorder = ZIO_CHECKSUM_NATIVE, + .acd_zcp = zcp, + .acd_ctx = &ctx + }; + + abd_fletcher_4_impl(abd, size, &acd); + +} + +/*ARGSUSED*/ +void +abd_fletcher_4_byteswap(abd_t *abd, uint64_t size, + const void *ctx_template, zio_cksum_t *zcp) +{ + fletcher_4_ctx_t ctx; + + zio_abd_checksum_data_t acd = { + .acd_byteorder = ZIO_CHECKSUM_BYTESWAP, + .acd_zcp = zcp, + .acd_ctx = &ctx + }; + + abd_fletcher_4_impl(abd, size, &acd); +} + +zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { + {{NULL, NULL}, NULL, NULL, 0, "inherit"}, + {{NULL, NULL}, NULL, NULL, 0, "on"}, + {{abd_checksum_off, abd_checksum_off}, + NULL, NULL, 0, "off"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "label"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_EMBEDDED, + "gang_header"}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog"}, + {{abd_fletcher_2_native, abd_fletcher_2_byteswap}, + NULL, NULL, 0, "fletcher2"}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA, "fletcher4"}, + {{abd_checksum_SHA256, abd_checksum_SHA256}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha256"}, + {{abd_fletcher_4_native, abd_fletcher_4_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_EMBEDDED, "zilog2"}, + {{abd_checksum_off, abd_checksum_off}, + NULL, NULL, 0, "noparity"}, + {{abd_checksum_SHA512_native, abd_checksum_SHA512_byteswap}, + NULL, NULL, ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_NOPWRITE, "sha512"}, + {{abd_checksum_skein_native, abd_checksum_skein_byteswap}, + abd_checksum_skein_tmpl_init, abd_checksum_skein_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_DEDUP | + ZCHECKSUM_FLAG_SALTED | ZCHECKSUM_FLAG_NOPWRITE, "skein"}, +#if !defined(__FreeBSD__) + {{abd_checksum_edonr_native, abd_checksum_edonr_byteswap}, + abd_checksum_edonr_tmpl_init, abd_checksum_edonr_tmpl_free, + ZCHECKSUM_FLAG_METADATA | ZCHECKSUM_FLAG_SALTED | + ZCHECKSUM_FLAG_NOPWRITE, "edonr"}, +#endif +}; + +/* + * The flag corresponding to the "verify" in dedup=[checksum,]verify + * must be cleared first, so callers should use ZIO_CHECKSUM_MASK. 
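+ * A typical call is therefore of the form (illustrative): + * + * zio_checksum_to_feature(cksum & ZIO_CHECKSUM_MASK);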
+ */ +spa_feature_t +zio_checksum_to_feature(enum zio_checksum cksum) +{ + VERIFY((cksum & ~ZIO_CHECKSUM_MASK) == 0); + + switch (cksum) { + case ZIO_CHECKSUM_SHA512: + return (SPA_FEATURE_SHA512); + case ZIO_CHECKSUM_SKEIN: + return (SPA_FEATURE_SKEIN); +#if !defined(__FreeBSD__) + case ZIO_CHECKSUM_EDONR: + return (SPA_FEATURE_EDONR); +#endif + default: + return (SPA_FEATURE_NONE); + } +} + +enum zio_checksum +zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) +{ + ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (ZIO_CHECKSUM_ON_VALUE); + + return (child); +} + +enum zio_checksum +zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, + enum zio_checksum parent) +{ + ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (spa_dedup_checksum(spa)); + + if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) + return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); + + ASSERT((zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_flags & + ZCHECKSUM_FLAG_DEDUP) || + (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); + + return (child); +} + +/* + * Set the external verifier for a gang block based on <vdev, offset, txg>, + * a tuple which is guaranteed to be unique for the life of the pool. + */ +static void +zio_checksum_gang_verifier(zio_cksum_t *zcp, const blkptr_t *bp) +{ + const dva_t *dva = BP_IDENTITY(bp); + uint64_t txg = BP_PHYSICAL_BIRTH(bp); + + ASSERT(BP_IS_GANG(bp)); + + ZIO_SET_CHECKSUM(zcp, DVA_GET_VDEV(dva), DVA_GET_OFFSET(dva), txg, 0); +} + +/* + * Set the external verifier for a label block based on its offset. + * The vdev is implicit, and the txg is unknowable at pool open time -- + * hence the logic in vdev_uberblock_load() to find the most recent copy. + */ +static void +zio_checksum_label_verifier(zio_cksum_t *zcp, uint64_t offset) +{ + ZIO_SET_CHECKSUM(zcp, offset, 0, 0, 0); +} + +/* + * Calls the template init function of a checksum which supports context + * templates and installs the template into the spa_t. + */ +static void +zio_checksum_template_init(enum zio_checksum checksum, spa_t *spa) +{ + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + if (ci->ci_tmpl_init == NULL) + return; + if (spa->spa_cksum_tmpls[checksum] != NULL) + return; + + VERIFY(ci->ci_tmpl_free != NULL); + mutex_enter(&spa->spa_cksum_tmpls_lock); + if (spa->spa_cksum_tmpls[checksum] == NULL) { + spa->spa_cksum_tmpls[checksum] = + ci->ci_tmpl_init(&spa->spa_cksum_salt); + VERIFY(spa->spa_cksum_tmpls[checksum] != NULL); + } + mutex_exit(&spa->spa_cksum_tmpls_lock); +} + +/* convenience function to update a checksum to accommodate an encryption MAC */ +static void +zio_checksum_handle_crypt(zio_cksum_t *cksum, zio_cksum_t *saved, boolean_t xor) +{ + /* + * Weak checksums do not have their entropy spread evenly + * across the bits of the checksum. Therefore, when truncating + * a weak checksum we XOR the first 2 words with the last 2 so + * that we don't "lose" any entropy unnecessarily.
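+ * The upper two words are then restored from 'saved', since on + * blocks protected by an encryption MAC they carry the MAC rather + * than checksum data and must be preserved verbatim.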
+ */ + if (xor) { + cksum->zc_word[0] ^= cksum->zc_word[2]; + cksum->zc_word[1] ^= cksum->zc_word[3]; + } + + cksum->zc_word[2] = saved->zc_word[2]; + cksum->zc_word[3] = saved->zc_word[3]; +} + +/* + * Generate the checksum. + */ +void +zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, + abd_t *abd, uint64_t size) +{ + static const uint64_t zec_magic = ZEC_MAGIC; + blkptr_t *bp = zio->io_bp; + uint64_t offset = zio->io_offset; + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t cksum, saved; + spa_t *spa = zio->io_spa; + boolean_t insecure = (ci->ci_flags & ZCHECKSUM_FLAG_DEDUP) == 0; + + ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(ci->ci_func[0] != NULL); + + zio_checksum_template_init(checksum, spa); + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + zio_eck_t eck; + size_t eck_offset; + + bzero(&saved, sizeof (zio_cksum_t)); + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t zilc; + abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); + + size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, + uint64_t); + eck = zilc.zc_eck; + eck_offset = offsetof(zil_chain_t, zc_eck); + } else { + eck_offset = size - sizeof (zio_eck_t); + abd_copy_to_buf_off(&eck, abd, eck_offset, + sizeof (zio_eck_t)); + } + + if (checksum == ZIO_CHECKSUM_GANG_HEADER) { + zio_checksum_gang_verifier(&eck.zec_cksum, bp); + } else if (checksum == ZIO_CHECKSUM_LABEL) { + zio_checksum_label_verifier(&eck.zec_cksum, offset); + } else { + saved = eck.zec_cksum; + eck.zec_cksum = bp->blk_cksum; + } + + abd_copy_from_buf_off(abd, &zec_magic, + eck_offset + offsetof(zio_eck_t, zec_magic), + sizeof (zec_magic)); + abd_copy_from_buf_off(abd, &eck.zec_cksum, + eck_offset + offsetof(zio_eck_t, zec_cksum), + sizeof (zio_cksum_t)); + + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], + &cksum); + if (bp != NULL && BP_USES_CRYPT(bp) && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_checksum_handle_crypt(&cksum, &saved, insecure); + + abd_copy_from_buf_off(abd, &cksum, + eck_offset + offsetof(zio_eck_t, zec_cksum), + sizeof (zio_cksum_t)); + } else { + saved = bp->blk_cksum; + ci->ci_func[0](abd, size, spa->spa_cksum_tmpls[checksum], + &cksum); + if (BP_USES_CRYPT(bp) && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + zio_checksum_handle_crypt(&cksum, &saved, insecure); + bp->blk_cksum = cksum; + } +} + +int +zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, + enum zio_checksum checksum, abd_t *abd, uint64_t size, uint64_t offset, + zio_bad_cksum_t *info) +{ + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + zio_cksum_t actual_cksum, expected_cksum; + zio_eck_t eck; + int byteswap; + + if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) + return (SET_ERROR(EINVAL)); + + zio_checksum_template_init(checksum, spa); + + if (ci->ci_flags & ZCHECKSUM_FLAG_EMBEDDED) { + zio_cksum_t verifier; + size_t eck_offset; + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t zilc; + uint64_t nused; + + abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); + + eck = zilc.zc_eck; + eck_offset = offsetof(zil_chain_t, zc_eck) + + offsetof(zio_eck_t, zec_cksum); + + if (eck.zec_magic == ZEC_MAGIC) { + nused = zilc.zc_nused; + } else if (eck.zec_magic == BSWAP_64(ZEC_MAGIC)) { + nused = BSWAP_64(zilc.zc_nused); + } else { + return (SET_ERROR(ECKSUM)); + } + + if (nused > size) { + return (SET_ERROR(ECKSUM)); + } + + size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + } else { + eck_offset = size - sizeof (zio_eck_t); + abd_copy_to_buf_off(&eck, abd, eck_offset, + sizeof 
(zio_eck_t)); + eck_offset += offsetof(zio_eck_t, zec_cksum); + } + + if (checksum == ZIO_CHECKSUM_GANG_HEADER) + zio_checksum_gang_verifier(&verifier, bp); + else if (checksum == ZIO_CHECKSUM_LABEL) + zio_checksum_label_verifier(&verifier, offset); + else + verifier = bp->blk_cksum; + + byteswap = (eck.zec_magic == BSWAP_64(ZEC_MAGIC)); + + if (byteswap) + byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); + + expected_cksum = eck.zec_cksum; + + abd_copy_from_buf_off(abd, &verifier, eck_offset, + sizeof (zio_cksum_t)); + + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); + + abd_copy_from_buf_off(abd, &expected_cksum, eck_offset, + sizeof (zio_cksum_t)); + + if (byteswap) { + byteswap_uint64_array(&expected_cksum, + sizeof (zio_cksum_t)); + } + } else { + byteswap = BP_SHOULD_BYTESWAP(bp); + expected_cksum = bp->blk_cksum; + ci->ci_func[byteswap](abd, size, + spa->spa_cksum_tmpls[checksum], &actual_cksum); + } + + /* + * MAC checksums are a special case since half of this checksum will + * actually be the encryption MAC. This will be verified by the + * decryption process, so we just check the truncated checksum now. + * Objset blocks use embedded MACs so we don't truncate the checksum + * for them. + */ + if (bp != NULL && BP_USES_CRYPT(bp) && + BP_GET_TYPE(bp) != DMU_OT_OBJSET) { + if (!(ci->ci_flags & ZCHECKSUM_FLAG_DEDUP)) { + actual_cksum.zc_word[0] ^= actual_cksum.zc_word[2]; + actual_cksum.zc_word[1] ^= actual_cksum.zc_word[3]; + } + + actual_cksum.zc_word[2] = 0; + actual_cksum.zc_word[3] = 0; + expected_cksum.zc_word[2] = 0; + expected_cksum.zc_word[3] = 0; + } + + if (info != NULL) { + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + } + + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) + return (SET_ERROR(ECKSUM)); + + return (0); +} + +int +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) +{ + blkptr_t *bp = zio->io_bp; + uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : + (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); + int error; + uint64_t size = (bp == NULL ? zio->io_size : + (BP_IS_GANG(bp) ? SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); + uint64_t offset = zio->io_offset; + abd_t *data = zio->io_abd; + spa_t *spa = zio->io_spa; + + error = zio_checksum_error_impl(spa, bp, checksum, data, size, + offset, info); + + if (zio_injection_enabled && error == 0 && zio->io_error == 0) { + error = zio_handle_fault_injection(zio, ECKSUM); + if (error != 0) + info->zbc_injected = 1; + } + + return (error); +} + +/* + * Called by a spa_t that's about to be deallocated. This steps through + * all of the checksum context templates and deallocates any that were + * initialized using the algorithm-specific template init function. 
+ */ +void +zio_checksum_templates_free(spa_t *spa) +{ + for (enum zio_checksum checksum = 0; + checksum < ZIO_CHECKSUM_FUNCTIONS; checksum++) { + if (spa->spa_cksum_tmpls[checksum] != NULL) { + zio_checksum_info_t *ci = &zio_checksum_table[checksum]; + + VERIFY(ci->ci_tmpl_free != NULL); + ci->ci_tmpl_free(spa->spa_cksum_tmpls[checksum]); + spa->spa_cksum_tmpls[checksum] = NULL; + } + } +} diff --git a/module/zfs/zio_compress.c b/module/zfs/zio_compress.c new file mode 100644 index 000000000000..2db3cec35d5d --- /dev/null +++ b/module/zfs/zio_compress.c @@ -0,0 +1,220 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * Copyright (c) 2013 by Saso Kiselkov. All rights reserved. + */ + +/* + * Copyright (c) 2013, 2018 by Delphix. All rights reserved. + * Copyright (c) 2019, Klara Inc. + * Copyright (c) 2019, Allan Jude + */ + +#include <sys/zfs_context.h> +#include <sys/spa.h> +#include <sys/zfeature.h> +#include <sys/zio.h> +#include <sys/zio_compress.h> +#include <sys/zstd/zstd.h> + +/* + * If nonzero, one in every X decompression attempts will fail, simulating + * an undetected memory error. + */ +unsigned long zio_decompress_fail_fraction = 0; + +/* + * Compression vectors.
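+ * + * The table below is indexed by enum zio_compress, so the order of + * its entries must match that enum exactly.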
+ */ +zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { + {"inherit", 0, NULL, NULL, NULL}, + {"on", 0, NULL, NULL, NULL}, + {"uncompressed", 0, NULL, NULL, NULL}, + {"lzjb", 0, lzjb_compress, lzjb_decompress, NULL}, + {"empty", 0, NULL, NULL, NULL}, + {"gzip-1", 1, gzip_compress, gzip_decompress, NULL}, + {"gzip-2", 2, gzip_compress, gzip_decompress, NULL}, + {"gzip-3", 3, gzip_compress, gzip_decompress, NULL}, + {"gzip-4", 4, gzip_compress, gzip_decompress, NULL}, + {"gzip-5", 5, gzip_compress, gzip_decompress, NULL}, + {"gzip-6", 6, gzip_compress, gzip_decompress, NULL}, + {"gzip-7", 7, gzip_compress, gzip_decompress, NULL}, + {"gzip-8", 8, gzip_compress, gzip_decompress, NULL}, + {"gzip-9", 9, gzip_compress, gzip_decompress, NULL}, + {"zle", 64, zle_compress, zle_decompress, NULL}, + {"lz4", 0, lz4_compress_zfs, lz4_decompress_zfs, NULL}, + {"zstd", ZIO_ZSTD_LEVEL_DEFAULT, zfs_zstd_compress, + zfs_zstd_decompress, zfs_zstd_decompress_level}, +}; + +uint8_t +zio_complevel_select(spa_t *spa, enum zio_compress compress, uint8_t child, + uint8_t parent) +{ + uint8_t result; + + if (!ZIO_COMPRESS_HASLEVEL(compress)) + return (0); + + result = child; + if (result == ZIO_COMPLEVEL_INHERIT) + result = parent; + + return (result); +} + +enum zio_compress +zio_compress_select(spa_t *spa, enum zio_compress child, + enum zio_compress parent) +{ + enum zio_compress result; + + ASSERT(child < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); + ASSERT(parent != ZIO_COMPRESS_INHERIT); + + result = child; + if (result == ZIO_COMPRESS_INHERIT) + result = parent; + + if (result == ZIO_COMPRESS_ON) { + if (spa_feature_is_active(spa, SPA_FEATURE_LZ4_COMPRESS)) + result = ZIO_COMPRESS_LZ4_ON_VALUE; + else + result = ZIO_COMPRESS_LEGACY_ON_VALUE; + } + + return (result); +} + +/*ARGSUSED*/ +static int +zio_compress_zeroed_cb(void *data, size_t len, void *private) +{ + uint64_t *end = (uint64_t *)((char *)data + len); + for (uint64_t *word = (uint64_t *)data; word < end; word++) + if (*word != 0) + return (1); + + return (0); +} + +size_t +zio_compress_data(enum zio_compress c, abd_t *src, void *dst, size_t s_len, + uint8_t level) +{ + size_t c_len, d_len; + uint8_t complevel; + zio_compress_info_t *ci = &zio_compress_table[c]; + + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + + /* + * If the data is all zeroes, we don't even need to allocate + * a block for it. We indicate this by returning zero size. 
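+ * + * (A zero return from zio_compress_data() is how the caller learns + * to write a hole block pointer instead of allocating space.)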
+ */ + if (abd_iterate_func(src, 0, s_len, zio_compress_zeroed_cb, NULL) == 0) + return (0); + + if (c == ZIO_COMPRESS_EMPTY) + return (s_len); + + /* Compress at least 12.5% */ + d_len = s_len - (s_len >> 3); + + complevel = ci->ci_level; + + if (c == ZIO_COMPRESS_ZSTD) { + /* If we don't know the level, we can't compress it */ + if (level == ZIO_COMPLEVEL_INHERIT) + return (s_len); + + if (level == ZIO_COMPLEVEL_DEFAULT) + complevel = ZIO_ZSTD_LEVEL_DEFAULT; + else + complevel = level; + + ASSERT3U(complevel, !=, ZIO_COMPLEVEL_INHERIT); + } + + /* No compression algorithms can read from ABDs directly */ + void *tmp = abd_borrow_buf_copy(src, s_len); + c_len = ci->ci_compress(tmp, dst, s_len, d_len, complevel); + abd_return_buf(src, tmp, s_len); + + if (c_len > d_len) + return (s_len); + + ASSERT3U(c_len, <=, d_len); + return (c_len); +} + +int +zio_decompress_data_buf(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len, uint8_t *level) +{ + zio_compress_info_t *ci = &zio_compress_table[c]; + if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) + return (SET_ERROR(EINVAL)); + + if (ci->ci_decompress_level != NULL && level != NULL) + return (ci->ci_decompress_level(src, dst, s_len, d_len, level)); + + return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); +} + +int +zio_decompress_data(enum zio_compress c, abd_t *src, void *dst, + size_t s_len, size_t d_len, uint8_t *level) +{ + void *tmp = abd_borrow_buf_copy(src, s_len); + int ret = zio_decompress_data_buf(c, tmp, dst, s_len, d_len, level); + abd_return_buf(src, tmp, s_len); + + /* + * Decompression shouldn't fail, because we've already verified + * the checksum. However, for extra protection (e.g. against bitflips + * in non-ECC RAM), we handle this error (and test it). + */ + if (zio_decompress_fail_fraction != 0 && + spa_get_random(zio_decompress_fail_fraction) == 0) + ret = SET_ERROR(EINVAL); + + return (ret); +} + +int +zio_compress_to_feature(enum zio_compress comp) +{ + switch (comp) { + case ZIO_COMPRESS_ZSTD: + return (SPA_FEATURE_ZSTD_COMPRESS); + default: + /* fallthru */; + } + return (SPA_FEATURE_NONE); +} diff --git a/module/zfs/zio_inject.c b/module/zfs/zio_inject.c new file mode 100644 index 000000000000..fb8ce0916eb5 --- /dev/null +++ b/module/zfs/zio_inject.c @@ -0,0 +1,966 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2015 by Delphix. All rights reserved. + * Copyright (c) 2017, Intel Corporation. 
+ */ + +/* + * ZFS fault injection + * + * To handle fault injection, we keep track of a series of zinject_record_t + * structures which describe which logical block(s) should be injected with a + * fault. These are kept in a global list. Each record corresponds to a given + * spa_t and maintains a special hold on the spa_t so that it cannot be deleted + * or exported while the injection record exists. + * + * Device level injection is done using the 'zi_guid' field. If this is set, it + * means that the error is destined for a particular device, not a piece of + * data. + * + * This is a rather poor data structure and algorithm, but we don't expect more + * than a few faults at any one time, so it should be sufficient for our needs. + */ + +#include <sys/arc.h> +#include <sys/zio.h> +#include <sys/zfs_ioctl.h> +#include <sys/vdev_impl.h> +#include <sys/dmu_objset.h> +#include <sys/dsl_dataset.h> +#include <sys/fs/zfs.h> + +uint32_t zio_injection_enabled = 0; + +/* + * Data describing each zinject handler registered on the system; it also + * contains the list node linking the handler into the global zinject + * handler list. + */ +typedef struct inject_handler { + int zi_id; + spa_t *zi_spa; + zinject_record_t zi_record; + uint64_t *zi_lanes; + int zi_next_lane; + list_node_t zi_link; +} inject_handler_t; + +/* + * List of all zinject handlers registered on the system, protected by + * the inject_lock defined below. + */ +static list_t inject_handlers; + +/* + * This protects insertion into, and traversal of, the inject handler + * list defined above; as well as the inject_delay_count. Any time a + * handler is inserted or removed from the list, this lock should be + * taken as a RW_WRITER; and any time traversal is done over the list + * (without modification to it) this lock should be taken as a RW_READER. + */ +static krwlock_t inject_lock; + +/* + * This holds the number of zinject delay handlers that have been + * registered on the system. It is protected by the inject_lock defined + * above. Thus modifications to this count must be a RW_WRITER of the + * inject_lock, and reads of this count must be (at least) a RW_READER + * of the lock. + */ +static int inject_delay_count = 0; + +/* + * This lock is used only in zio_handle_io_delay(), refer to the comment + * in that function for more details. + */ +static kmutex_t inject_delay_mtx; + +/* + * Used to assign unique identifying numbers to each new zinject handler. + */ +static int inject_next_id = 1; + +/* + * Test if the requested frequency was triggered + */ +static boolean_t +freq_triggered(uint32_t frequency) +{ + /* + * zero implies always (100%) + */ + if (frequency == 0) + return (B_TRUE); + + /* + * Note: we still handle legacy (unscaled) frequency values + */ + uint32_t maximum = (frequency <= 100) ? 100 : ZI_PERCENTAGE_MAX; + + return (spa_get_random(maximum) < frequency); +} + +/* + * Returns true if the given record matches the I/O in progress. + */ +static boolean_t +zio_match_handler(const zbookmark_phys_t *zb, uint64_t type, int dva, + zinject_record_t *record, int error) +{ + /* + * Check for a match against the MOS, which is based on type + */ + if (zb->zb_objset == DMU_META_OBJSET && + record->zi_objset == DMU_META_OBJSET && + record->zi_object == DMU_META_DNODE_OBJECT) { + if (record->zi_type == DMU_OT_NONE || + type == record->zi_type) + return (freq_triggered(record->zi_freq)); + else + return (B_FALSE); + } + + /* + * Check for an exact match.
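+ * An exact match requires the bookmark and record to agree on objset, + * object, and level, the blkid to fall within [zi_start, zi_end], the + * error to match, and, when zi_dvas is nonzero, the affected DVA index + * to be present in that bitmask.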
+ */ + if (zb->zb_objset == record->zi_objset && + zb->zb_object == record->zi_object && + zb->zb_level == record->zi_level && + zb->zb_blkid >= record->zi_start && + zb->zb_blkid <= record->zi_end && + (record->zi_dvas == 0 || (record->zi_dvas & (1ULL << dva))) && + error == record->zi_error) { + return (freq_triggered(record->zi_freq)); + } + + return (B_FALSE); +} + +/* + * Panic the system when a config change happens in the function + * specified by tag. + */ +void +zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_type == type && + strcmp(tag, handler->zi_record.zi_func) == 0) + panic("Panic requested in function %s\n", tag); + } + + rw_exit(&inject_lock); +} + +/* + * Inject a decryption failure. Decryption failures can occur in + * both the ARC and the ZIO layers. + */ +int +zio_handle_decrypt_injection(spa_t *spa, const zbookmark_phys_t *zb, + uint64_t type, int error) +{ + int ret = 0; + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DECRYPT_FAULT) + continue; + + if (zio_match_handler(zb, type, ZI_NO_DVA, + &handler->zi_record, error)) { + ret = error; + break; + } + } + + rw_exit(&inject_lock); + return (ret); +} + +/* + * If this is a physical I/O for a vdev child determine which DVA it is + * for. We iterate backwards through the DVAs matching on the offset so + * that we end up with ZI_NO_DVA (-1) if we don't find a match. + */ +static int +zio_match_dva(zio_t *zio) +{ + int i = ZI_NO_DVA; + + if (zio->io_bp != NULL && zio->io_vd != NULL && + zio->io_child_type == ZIO_CHILD_VDEV) { + for (i = BP_GET_NDVAS(zio->io_bp) - 1; i >= 0; i--) { + dva_t *dva = &zio->io_bp->blk_dva[i]; + uint64_t off = DVA_GET_OFFSET(dva); + vdev_t *vd = vdev_lookup_top(zio->io_spa, + DVA_GET_VDEV(dva)); + + /* Compensate for vdev label added to leaves */ + if (zio->io_vd->vdev_ops->vdev_op_leaf) + off += VDEV_LABEL_START_SIZE; + + if (zio->io_vd == vd && zio->io_offset == off) + break; + } + } + + return (i); +} + + +/* + * Determine if the I/O in question should return failure. Returns the errno + * to be returned to the caller. + */ +int +zio_handle_fault_injection(zio_t *zio, int error) +{ + int ret = 0; + inject_handler_t *handler; + + /* + * Ignore I/O not associated with any logical data. + */ + if (zio->io_logical == NULL) + return (0); + + /* + * Currently, we only support fault injection on reads. + */ + if (zio->io_type != ZIO_TYPE_READ) + return (0); + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_DATA_FAULT) + continue; + + /* If this handler matches, return the specified error */ + if (zio_match_handler(&zio->io_logical->io_bookmark, + zio->io_bp ? 
BP_GET_TYPE(zio->io_bp) : DMU_OT_NONE,
+            zio_match_dva(zio), &handler->zi_record, error)) {
+            ret = error;
+            break;
+        }
+    }
+
+    rw_exit(&inject_lock);
+
+    return (ret);
+}
+
+/*
+ * Determine if the zio is part of a label update and has an injection
+ * handler associated with that portion of the label. Currently, we
+ * allow error injection in either the nvlist or the uberblock region
+ * of the vdev label.
+ */
+int
+zio_handle_label_injection(zio_t *zio, int error)
+{
+    inject_handler_t *handler;
+    vdev_t *vd = zio->io_vd;
+    uint64_t offset = zio->io_offset;
+    int label;
+    int ret = 0;
+
+    if (offset >= VDEV_LABEL_START_SIZE &&
+        offset < vd->vdev_psize - VDEV_LABEL_END_SIZE)
+        return (0);
+
+    rw_enter(&inject_lock, RW_READER);
+
+    for (handler = list_head(&inject_handlers); handler != NULL;
+        handler = list_next(&inject_handlers, handler)) {
+        uint64_t start = handler->zi_record.zi_start;
+        uint64_t end = handler->zi_record.zi_end;
+
+        if (handler->zi_record.zi_cmd != ZINJECT_LABEL_FAULT)
+            continue;
+
+        /*
+         * The injection region is the relative offsets within a
+         * vdev label. We must determine the label which is being
+         * updated and adjust our region accordingly.
+         */
+        label = vdev_label_number(vd->vdev_psize, offset);
+        start = vdev_label_offset(vd->vdev_psize, label, start);
+        end = vdev_label_offset(vd->vdev_psize, label, end);
+
+        if (zio->io_vd->vdev_guid == handler->zi_record.zi_guid &&
+            (offset >= start && offset <= end)) {
+            ret = error;
+            break;
+        }
+    }
+    rw_exit(&inject_lock);
+    return (ret);
+}
+
+/*ARGSUSED*/
+static int
+zio_inject_bitflip_cb(void *data, size_t len, void *private)
+{
+    zio_t *zio __maybe_unused = private;
+    uint8_t *buffer = data;
+    uint_t byte = spa_get_random(len);
+
+    ASSERT(zio->io_type == ZIO_TYPE_READ);
+
+    /* flip a single random bit in an abd data buffer */
+    buffer[byte] ^= 1 << spa_get_random(8);
+
+    return (1); /* stop after first flip */
+}
+
+static int
+zio_handle_device_injection_impl(vdev_t *vd, zio_t *zio, int err1, int err2)
+{
+    inject_handler_t *handler;
+    int ret = 0;
+
+    /*
+     * We skip over faults in the labels unless it's during
+     * device open (i.e. zio == NULL).
+     */
+    if (zio != NULL) {
+        uint64_t offset = zio->io_offset;
+
+        if (offset < VDEV_LABEL_START_SIZE ||
+            offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE)
+            return (0);
+    }
+
+    rw_enter(&inject_lock, RW_READER);
+
+    for (handler = list_head(&inject_handlers); handler != NULL;
+        handler = list_next(&inject_handlers, handler)) {
+
+        if (handler->zi_record.zi_cmd != ZINJECT_DEVICE_FAULT)
+            continue;
+
+        if (vd->vdev_guid == handler->zi_record.zi_guid) {
+            if (handler->zi_record.zi_failfast &&
+                (zio == NULL || (zio->io_flags &
+                (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) {
+                continue;
+            }
+
+            /* Handle type specific I/O failures */
+            if (zio != NULL &&
+                handler->zi_record.zi_iotype != ZIO_TYPES &&
+                handler->zi_record.zi_iotype != zio->io_type)
+                continue;
+
+            if (handler->zi_record.zi_error == err1 ||
+                handler->zi_record.zi_error == err2) {
+                /*
+                 * limit error injection if requested
+                 */
+                if (!freq_triggered(handler->zi_record.zi_freq))
+                    continue;
+
+                /*
+                 * For a failed open, pretend like the device
+                 * has gone away.
+                 */
+                if (err1 == ENXIO)
+                    vd->vdev_stat.vs_aux =
+                        VDEV_AUX_OPEN_FAILED;
+
+                /*
+                 * Treat these errors as if they had been
+                 * retried so that all the appropriate stats
+                 * and FMA events are generated.
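The EILSEQ handling relies on zio_inject_bitflip_cb() above, which corrupts exactly one bit of the read buffer. The standalone sketch below shows the same operation in user space; it is a hypothetical demonstration with rand() substituting for spa_get_random().

/* Flip one random bit in a buffer, as the injection callback does. */
#include <stdint.h>
#include <stdlib.h>

static void
flip_random_bit(uint8_t *buf, size_t len)
{
	size_t byte = (size_t)rand() % len;		/* pick a byte */

	buf[byte] ^= (uint8_t)(1 << (rand() % 8));	/* flip one bit */
}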
+ */ + if (!handler->zi_record.zi_failfast && + zio != NULL) + zio->io_flags |= ZIO_FLAG_IO_RETRY; + + /* + * EILSEQ means flip a bit after a read + */ + if (handler->zi_record.zi_error == EILSEQ) { + if (zio == NULL) + break; + + /* locate buffer data and flip a bit */ + (void) abd_iterate_func(zio->io_abd, 0, + zio->io_size, zio_inject_bitflip_cb, + zio); + break; + } + + ret = handler->zi_record.zi_error; + break; + } + if (handler->zi_record.zi_error == ENXIO) { + ret = SET_ERROR(EIO); + break; + } + } + } + + rw_exit(&inject_lock); + + return (ret); +} + +int +zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) +{ + return (zio_handle_device_injection_impl(vd, zio, error, INT_MAX)); +} + +int +zio_handle_device_injections(vdev_t *vd, zio_t *zio, int err1, int err2) +{ + return (zio_handle_device_injection_impl(vd, zio, err1, err2)); +} + +/* + * Simulate hardware that ignores cache flushes. For requested number + * of seconds nix the actual writing to disk. + */ +void +zio_handle_ignored_writes(zio_t *zio) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) + continue; + + /* + * Positive duration implies # of seconds, negative + * a number of txgs + */ + if (handler->zi_record.zi_timer == 0) { + if (handler->zi_record.zi_duration > 0) + handler->zi_record.zi_timer = ddi_get_lbolt64(); + else + handler->zi_record.zi_timer = zio->io_txg; + } + + /* Have a "problem" writing 60% of the time */ + if (spa_get_random(100) < 60) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + break; + } + + rw_exit(&inject_lock); +} + +void +spa_handle_ignored_writes(spa_t *spa) +{ + inject_handler_t *handler; + + if (zio_injection_enabled == 0) + return; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa || + handler->zi_record.zi_cmd != ZINJECT_IGNORED_WRITES) + continue; + + if (handler->zi_record.zi_duration > 0) { + VERIFY(handler->zi_record.zi_timer == 0 || + ddi_time_after64( + (int64_t)handler->zi_record.zi_timer + + handler->zi_record.zi_duration * hz, + ddi_get_lbolt64())); + } else { + /* duration is negative so the subtraction here adds */ + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer - + handler->zi_record.zi_duration >= + spa_syncing_txg(spa)); + } + } + + rw_exit(&inject_lock); +} + +hrtime_t +zio_handle_io_delay(zio_t *zio) +{ + vdev_t *vd = zio->io_vd; + inject_handler_t *min_handler = NULL; + hrtime_t min_target = 0; + + rw_enter(&inject_lock, RW_READER); + + /* + * inject_delay_count is a subset of zio_injection_enabled that + * is only incremented for delay handlers. These checks are + * mainly added to remind the reader why we're not explicitly + * checking zio_injection_enabled like the other functions. + */ + IMPLY(inject_delay_count > 0, zio_injection_enabled > 0); + IMPLY(zio_injection_enabled == 0, inject_delay_count == 0); + + /* + * If there aren't any inject delay handlers registered, then we + * can short circuit and simply return 0 here. A value of zero + * informs zio_delay_interrupt() that this request should not be + * delayed. This short circuit keeps us from acquiring the + * inject_delay_mutex unnecessarily. 
+ */
+    if (inject_delay_count == 0) {
+        rw_exit(&inject_lock);
+        return (0);
+    }
+
+    /*
+     * Each inject handler has a number of "lanes" associated with
+     * it. Each lane is able to handle requests independently of one
+     * another, and at a latency defined by the inject handler
+     * record's zi_timer field. Thus if a handler is configured with
+     * a single lane with a 10ms latency, it will delay requests
+     * such that only a single request is completed every 10ms. So,
+     * if more than one request is attempted in each 10ms interval,
+     * the average latency of the requests will be greater than
+     * 10ms; but if only a single request is submitted each 10ms
+     * interval the average latency will be 10ms.
+     *
+     * We need to acquire this mutex to prevent multiple concurrent
+     * threads being assigned to the same lane of a given inject
+     * handler. The mutex allows us to perform the following two
+     * operations atomically:
+     *
+     *	1. determine the minimum handler and minimum target
+     *	   value of all the possible handlers
+     *	2. update that minimum handler's lane array
+     *
+     * Without atomicity, two (or more) threads could pick the same
+     * lane in step (1), and then conflict with each other in step
+     * (2). This could allow a single lane handler to process
+     * multiple requests simultaneously, which shouldn't be possible.
+     */
+    mutex_enter(&inject_delay_mtx);
+
+    for (inject_handler_t *handler = list_head(&inject_handlers);
+        handler != NULL; handler = list_next(&inject_handlers, handler)) {
+        if (handler->zi_record.zi_cmd != ZINJECT_DELAY_IO)
+            continue;
+
+        if (!freq_triggered(handler->zi_record.zi_freq))
+            continue;
+
+        if (vd->vdev_guid != handler->zi_record.zi_guid)
+            continue;
+
+        /*
+         * Defensive; should never happen as the array allocation
+         * occurs prior to inserting this handler on the list.
+         */
+        ASSERT3P(handler->zi_lanes, !=, NULL);
+
+        /*
+         * This should never happen, the zinject command should
+         * prevent a user from setting an IO delay with zero lanes.
+         */
+        ASSERT3U(handler->zi_record.zi_nlanes, !=, 0);
+
+        ASSERT3U(handler->zi_record.zi_nlanes, >,
+            handler->zi_next_lane);
+
+        /*
+         * We want to issue this IO to the lane that will become
+         * idle the soonest, so we compare the soonest this
+         * specific handler can complete the IO with all other
+         * handlers, to find the lowest value of all possible
+         * lanes. We then use this lane to submit the request.
+         *
+         * Since each handler has a constant value for its
+         * delay, we can just use the "next" lane for that
+         * handler; as it will always be the lane with the
+         * lowest value for that particular handler (i.e. the
+         * lane that will become idle the soonest). This saves a
+         * scan of each handler's lanes array.
+         *
+         * There are two cases to consider when determining when
+         * this specific IO request should complete. If this
+         * lane is idle, we want to "submit" the request now so
+         * it will complete after zi_timer milliseconds. Thus,
+         * we set the target to now + zi_timer.
+         *
+         * If the lane is busy, we want this request to complete
+         * zi_timer milliseconds after the lane becomes idle.
+         * Since the 'zi_lanes' array holds the time at which
+         * each lane will become idle, we use that value to
+         * determine when this request should complete.
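The lane bookkeeping just described reduces to a few lines once the cross-handler minimum search is set aside. The sketch below models a single hypothetical delay handler: each lane records the absolute time at which it becomes idle, a request lands on the "next" lane, and it completes at either now + delay (lane idle) or lane-idle-time + delay (lane busy).

/* Single-handler model of the lane scheduling described above. */
#include <stdint.h>

typedef struct {
	uint64_t *lanes;	/* time at which each lane becomes idle */
	int nlanes;
	int next_lane;
	uint64_t delay;		/* per-request service time */
} delay_handler_t;

/* Returns the absolute completion time assigned to this request. */
static uint64_t
lane_schedule(delay_handler_t *h, uint64_t now)
{
	uint64_t idle = now + h->delay;
	uint64_t busy = h->lanes[h->next_lane] + h->delay;
	uint64_t target = (idle > busy) ? idle : busy;

	h->lanes[h->next_lane] = target;
	h->next_lane = (h->next_lane + 1) % h->nlanes;	/* round robin */
	return (target);
}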
+ */ + hrtime_t idle = handler->zi_record.zi_timer + gethrtime(); + hrtime_t busy = handler->zi_record.zi_timer + + handler->zi_lanes[handler->zi_next_lane]; + hrtime_t target = MAX(idle, busy); + + if (min_handler == NULL) { + min_handler = handler; + min_target = target; + continue; + } + + ASSERT3P(min_handler, !=, NULL); + ASSERT3U(min_target, !=, 0); + + /* + * We don't yet increment the "next lane" variable since + * we still might find a lower value lane in another + * handler during any remaining iterations. Once we're + * sure we've selected the absolute minimum, we'll claim + * the lane and increment the handler's "next lane" + * field below. + */ + + if (target < min_target) { + min_handler = handler; + min_target = target; + } + } + + /* + * 'min_handler' will be NULL if no IO delays are registered for + * this vdev, otherwise it will point to the handler containing + * the lane that will become idle the soonest. + */ + if (min_handler != NULL) { + ASSERT3U(min_target, !=, 0); + min_handler->zi_lanes[min_handler->zi_next_lane] = min_target; + + /* + * If we've used all possible lanes for this handler, + * loop back and start using the first lane again; + * otherwise, just increment the lane index. + */ + min_handler->zi_next_lane = (min_handler->zi_next_lane + 1) % + min_handler->zi_record.zi_nlanes; + } + + mutex_exit(&inject_delay_mtx); + rw_exit(&inject_lock); + + return (min_target); +} + +static int +zio_calculate_range(const char *pool, zinject_record_t *record) +{ + dsl_pool_t *dp; + dsl_dataset_t *ds; + objset_t *os = NULL; + dnode_t *dn = NULL; + int error; + + /* + * Obtain the dnode for object using pool, objset, and object + */ + error = dsl_pool_hold(pool, FTAG, &dp); + if (error) + return (error); + + error = dsl_dataset_hold_obj(dp, record->zi_objset, FTAG, &ds); + dsl_pool_rele(dp, FTAG); + if (error) + return (error); + + error = dmu_objset_from_ds(ds, &os); + dsl_dataset_rele(ds, FTAG); + if (error) + return (error); + + error = dnode_hold(os, record->zi_object, FTAG, &dn); + if (error) + return (error); + + /* + * Translate the range into block IDs + */ + if (record->zi_start != 0 || record->zi_end != -1ULL) { + record->zi_start >>= dn->dn_datablkshift; + record->zi_end >>= dn->dn_datablkshift; + } + if (record->zi_level > 0) { + if (record->zi_level >= dn->dn_nlevels) { + dnode_rele(dn, FTAG); + return (SET_ERROR(EDOM)); + } + + if (record->zi_start != 0 || record->zi_end != 0) { + int shift = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + + for (int level = record->zi_level; level > 0; level--) { + record->zi_start >>= shift; + record->zi_end >>= shift; + } + } + } + + dnode_rele(dn, FTAG); + return (0); +} + +/* + * Create a new handler for the given record. We add it to the list, adding + * a reference to the spa_t in the process. We increment zio_injection_enabled, + * which is the switch to trigger all fault injection. + */ +int +zio_inject_fault(char *name, int flags, int *id, zinject_record_t *record) +{ + inject_handler_t *handler; + int error; + spa_t *spa; + + /* + * If this is pool-wide metadata, make sure we unload the corresponding + * spa_t, so that the next attempt to load it will trigger the fault. + * We call spa_reset() to unload the pool appropriately. + */ + if (flags & ZINJECT_UNLOAD_SPA) + if ((error = spa_reset(name)) != 0) + return (error); + + if (record->zi_cmd == ZINJECT_DELAY_IO) { + /* + * A value of zero for the number of lanes or for the + * delay time doesn't make sense. 
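The byte-to-block translation performed by zio_calculate_range() above can be viewed in isolation: the byte range is first shifted down by the data block shift, then once more per indirection level by (indblkshift - SPA_BLKPTRSHIFT), since each indirect block holds 2^(indblkshift - 7) 128-byte block pointers. The helper below is a simplified sketch (it always applies both shifts, unlike the conditional logic above); the shift values in the comment are typical assumptions, not requirements.

/*
 * Translate a byte range into block IDs at the given level.
 * E.g. with 128K blocks, datablkshift = 17 and indblkshift = 17,
 * so each indirect level divides the block ID by 1024.
 */
static void
range_to_blkids(uint64_t *start, uint64_t *end, int level,
    int datablkshift, int indblkshift)
{
	int shift = indblkshift - 7;	/* SPA_BLKPTRSHIFT == 7 */

	*start >>= datablkshift;
	*end >>= datablkshift;
	for (; level > 0; level--) {
		*start >>= shift;
		*end >>= shift;
	}
}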
+ */ + if (record->zi_timer == 0 || record->zi_nlanes == 0) + return (SET_ERROR(EINVAL)); + + /* + * The number of lanes is directly mapped to the size of + * an array used by the handler. Thus, to ensure the + * user doesn't trigger an allocation that's "too large" + * we cap the number of lanes here. + */ + if (record->zi_nlanes >= UINT16_MAX) + return (SET_ERROR(EINVAL)); + } + + /* + * If the supplied range was in bytes -- calculate the actual blkid + */ + if (flags & ZINJECT_CALC_RANGE) { + error = zio_calculate_range(name, record); + if (error != 0) + return (error); + } + + if (!(flags & ZINJECT_NULL)) { + /* + * spa_inject_ref() will add an injection reference, which will + * prevent the pool from being removed from the namespace while + * still allowing it to be unloaded. + */ + if ((spa = spa_inject_addref(name)) == NULL) + return (SET_ERROR(ENOENT)); + + handler = kmem_alloc(sizeof (inject_handler_t), KM_SLEEP); + + handler->zi_spa = spa; + handler->zi_record = *record; + + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + handler->zi_lanes = kmem_zalloc( + sizeof (*handler->zi_lanes) * + handler->zi_record.zi_nlanes, KM_SLEEP); + handler->zi_next_lane = 0; + } else { + handler->zi_lanes = NULL; + handler->zi_next_lane = 0; + } + + rw_enter(&inject_lock, RW_WRITER); + + /* + * We can't move this increment into the conditional + * above because we need to hold the RW_WRITER lock of + * inject_lock, and we don't want to hold that while + * allocating the handler's zi_lanes array. + */ + if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) { + ASSERT3S(inject_delay_count, >=, 0); + inject_delay_count++; + ASSERT3S(inject_delay_count, >, 0); + } + + *id = handler->zi_id = inject_next_id++; + list_insert_tail(&inject_handlers, handler); + atomic_inc_32(&zio_injection_enabled); + + rw_exit(&inject_lock); + } + + /* + * Flush the ARC, so that any attempts to read this data will end up + * going to the ZIO layer. Note that this is a little overkill, but + * we don't have the necessary ARC interfaces to do anything else, and + * fault injection isn't a performance critical path. + */ + if (flags & ZINJECT_FLUSH_ARC) + /* + * We must use FALSE to ensure arc_flush returns, since + * we're not preventing concurrent ARC insertions. + */ + arc_flush(NULL, FALSE); + + return (0); +} + +/* + * Returns the next record with an ID greater than that supplied to the + * function. Used to iterate over all handlers in the system. + */ +int +zio_inject_list_next(int *id, char *name, size_t buflen, + zinject_record_t *record) +{ + inject_handler_t *handler; + int ret; + + mutex_enter(&spa_namespace_lock); + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) + if (handler->zi_id > *id) + break; + + if (handler) { + *record = handler->zi_record; + *id = handler->zi_id; + (void) strncpy(name, spa_name(handler->zi_spa), buflen); + ret = 0; + } else { + ret = SET_ERROR(ENOENT); + } + + rw_exit(&inject_lock); + mutex_exit(&spa_namespace_lock); + + return (ret); +} + +/* + * Clear the fault handler with the given identifier, or return ENOENT if none + * exists. 
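zio_inject_list_next() above enables cursor-style iteration: starting from id 0, each call returns the record with the smallest ID greater than *id and advances the cursor, until ENOENT. A consumer walk might look like the sketch below (hypothetical caller; zinject_record_t and MAXNAMELEN come from sys/zfs_ioctl.h, and error handling is elided).

/* Walk every registered handler using the ID cursor. */
static void
walk_handlers(void)
{
	zinject_record_t rec;
	char name[MAXNAMELEN];
	int id = 0;

	while (zio_inject_list_next(&id, name, sizeof (name), &rec) == 0) {
		/* inspect rec and the pool name here */
	}
}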
+ */
+int
+zio_clear_fault(int id)
+{
+    inject_handler_t *handler;
+
+    rw_enter(&inject_lock, RW_WRITER);
+
+    for (handler = list_head(&inject_handlers); handler != NULL;
+        handler = list_next(&inject_handlers, handler))
+        if (handler->zi_id == id)
+            break;
+
+    if (handler == NULL) {
+        rw_exit(&inject_lock);
+        return (SET_ERROR(ENOENT));
+    }
+
+    if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+        ASSERT3S(inject_delay_count, >, 0);
+        inject_delay_count--;
+        ASSERT3S(inject_delay_count, >=, 0);
+    }
+
+    list_remove(&inject_handlers, handler);
+    rw_exit(&inject_lock);
+
+    if (handler->zi_record.zi_cmd == ZINJECT_DELAY_IO) {
+        ASSERT3P(handler->zi_lanes, !=, NULL);
+        kmem_free(handler->zi_lanes, sizeof (*handler->zi_lanes) *
+            handler->zi_record.zi_nlanes);
+    } else {
+        ASSERT3P(handler->zi_lanes, ==, NULL);
+    }
+
+    spa_inject_delref(handler->zi_spa);
+    kmem_free(handler, sizeof (inject_handler_t));
+    atomic_dec_32(&zio_injection_enabled);
+
+    return (0);
+}
+
+void
+zio_inject_init(void)
+{
+    rw_init(&inject_lock, NULL, RW_DEFAULT, NULL);
+    mutex_init(&inject_delay_mtx, NULL, MUTEX_DEFAULT, NULL);
+    list_create(&inject_handlers, sizeof (inject_handler_t),
+        offsetof(inject_handler_t, zi_link));
+}
+
+void
+zio_inject_fini(void)
+{
+    list_destroy(&inject_handlers);
+    mutex_destroy(&inject_delay_mtx);
+    rw_destroy(&inject_lock);
+}
+
+#if defined(_KERNEL)
+EXPORT_SYMBOL(zio_injection_enabled);
+EXPORT_SYMBOL(zio_inject_fault);
+EXPORT_SYMBOL(zio_inject_list_next);
+EXPORT_SYMBOL(zio_clear_fault);
+EXPORT_SYMBOL(zio_handle_fault_injection);
+EXPORT_SYMBOL(zio_handle_device_injection);
+EXPORT_SYMBOL(zio_handle_label_injection);
+#endif
diff --git a/module/zfs/zle.c b/module/zfs/zle.c
new file mode 100644
index 000000000000..0decebb13ca7
--- /dev/null
+++ b/module/zfs/zle.c
@@ -0,0 +1,91 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Zero-length encoding. This is a fast and simple algorithm to eliminate
+ * runs of zeroes. Each chunk of compressed data begins with a length byte, b.
+ * If b < n (where n is the compression parameter) then the next b + 1 bytes
+ * are literal values. If b >= n then the next (b - n + 1) bytes are zero.
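A worked example of this format, taking n = 64 (the parameter ZFS passes for ZLE; treated here as an assumption), using the zle_compress() and zle_decompress() definitions that follow. Note the destination buffer must leave room for a worst-case literal run of n bytes; otherwise the encoder bails out and reports the input length, i.e. "incompressible".

/*
 * Round-trip demo: "AB", ten zeros, "C", three zeros encodes as
 * [1 'A' 'B'] [73] [0 'C'] [66] -- seven bytes for sixteen.
 */
#include <assert.h>
#include <string.h>

static void
zle_roundtrip_demo(void)
{
	unsigned char src[16] = { 'A', 'B' };	/* remainder zero-filled */
	unsigned char comp[128], out[16];
	size_t c_len;

	src[12] = 'C';
	c_len = zle_compress(src, comp, sizeof (src), sizeof (comp), 64);
	assert(c_len < sizeof (src));	/* the zero runs shrink it */
	assert(comp[0] == 1);		/* b = 1 < 64: two literal bytes */
	assert(comp[3] == 73);		/* ten zeros: b = 10 - 1 + 64 */
	assert(zle_decompress(comp, out, c_len, sizeof (out), 64) == 0);
	assert(memcmp(src, out, sizeof (src)) == 0);
}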
+ */
+#include <sys/types.h>
+#include <sys/sysmacros.h>
+#include <sys/zio_compress.h>
+
+size_t
+zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+    uchar_t *src = s_start;
+    uchar_t *dst = d_start;
+    uchar_t *s_end = src + s_len;
+    uchar_t *d_end = dst + d_len;
+
+    while (src < s_end && dst < d_end - 1) {
+        uchar_t *first = src;
+        uchar_t *len = dst++;
+        if (src[0] == 0) {
+            uchar_t *last = src + (256 - n);
+            while (src < MIN(last, s_end) && src[0] == 0)
+                src++;
+            *len = src - first - 1 + n;
+        } else {
+            uchar_t *last = src + n;
+            if (d_end - dst < n)
+                break;
+            while (src < MIN(last, s_end) - 1 && (src[0] | src[1]))
+                *dst++ = *src++;
+            if (src[0])
+                *dst++ = *src++;
+            *len = src - first - 1;
+        }
+    }
+    return (src == s_end ? dst - (uchar_t *)d_start : s_len);
+}
+
+int
+zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n)
+{
+    uchar_t *src = s_start;
+    uchar_t *dst = d_start;
+    uchar_t *s_end = src + s_len;
+    uchar_t *d_end = dst + d_len;
+
+    while (src < s_end && dst < d_end) {
+        int len = 1 + *src++;
+        if (len <= n) {
+            if (src + len > s_end || dst + len > d_end)
+                return (-1);
+            while (len-- != 0)
+                *dst++ = *src++;
+        } else {
+            len -= n;
+            if (dst + len > d_end)
+                return (-1);
+            while (len-- != 0)
+                *dst++ = 0;
+        }
+    }
+    return (dst == d_end ? 0 : -1);
+}
diff --git a/module/zfs/zrlock.c b/module/zfs/zrlock.c
new file mode 100644
index 000000000000..a4def6053622
--- /dev/null
+++ b/module/zfs/zrlock.c
@@ -0,0 +1,188 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (c) 2010, Oracle and/or its affiliates. All rights reserved.
+ * Copyright (c) 2014, 2015 by Delphix. All rights reserved.
+ * Copyright 2016 The MathWorks, Inc. All rights reserved.
+ */
+
+/*
+ * A Zero Reference Lock (ZRL) is a reference count that can lock out new
+ * references only when the count is zero and only without waiting if the count
+ * is not already zero. It is similar to a read-write lock in that it allows
+ * multiple readers and only a single writer, but it does not allow a writer to
+ * block while waiting for readers to exit, and therefore the question of
+ * reader/writer priority is moot (no WRWANT bit). Since the equivalent of
+ * rw_enter(&lock, RW_WRITER) is disallowed and only tryenter() is allowed, it
+ * is perfectly safe for the same reader to acquire the same lock multiple
+ * times. The fact that a ZRL is reentrant for readers (through multiple calls
+ * to zrl_add()) makes it convenient for determining whether something is
+ * actively referenced without the fuss of flagging lock ownership across
+ * function calls.
+ */
+#include <sys/zrlock.h>
+#include <sys/trace_zfs.h>
+
+/*
+ * A ZRL can be locked only while there are zero references, so ZRL_LOCKED is
+ * treated as zero references.
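The usage pattern implied by the ZRL description above looks roughly like the following sketch. The object type is hypothetical, and zrl_add() is assumed to be the header wrapper that forwards the caller's function name to zrl_add_impl().

/* Readers take transient references; teardown may only try-lock. */
typedef struct obj {
	zrlock_t o_zrl;
	/* ... payload ... */
} obj_t;

static void
reader(obj_t *o)
{
	zrl_add(&o->o_zrl);	/* never blocks; reentrant for readers */
	/* ... use o ... */
	zrl_remove(&o->o_zrl);
}

static boolean_t
try_teardown(obj_t *o)
{
	if (!zrl_tryenter(&o->o_zrl))
		return (B_FALSE);	/* still referenced; try again later */
	/* ... exclusive work while new references are locked out ... */
	zrl_exit(&o->o_zrl);
	return (B_TRUE);
}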
+ */ +#define ZRL_LOCKED -1 +#define ZRL_DESTROYED -2 + +void +zrl_init(zrlock_t *zrl) +{ + mutex_init(&zrl->zr_mtx, NULL, MUTEX_DEFAULT, NULL); + zrl->zr_refcount = 0; + cv_init(&zrl->zr_cv, NULL, CV_DEFAULT, NULL); +#ifdef ZFS_DEBUG + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; +#endif +} + +void +zrl_destroy(zrlock_t *zrl) +{ + ASSERT0(zrl->zr_refcount); + + mutex_destroy(&zrl->zr_mtx); + zrl->zr_refcount = ZRL_DESTROYED; + cv_destroy(&zrl->zr_cv); +} + +void +zrl_add_impl(zrlock_t *zrl, const char *zc) +{ + for (;;) { + uint32_t n = (uint32_t)zrl->zr_refcount; + while (n != ZRL_LOCKED) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, n, n + 1); + if (cas == n) { + ASSERT3S((int32_t)n, >=, 0); +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + DTRACE_PROBE3(zrlock__reentry, + zrlock_t *, zrl, + kthread_t *, curthread, + uint32_t, n); + } + zrl->zr_owner = curthread; + zrl->zr_caller = zc; +#endif + return; + } + n = cas; + } + + mutex_enter(&zrl->zr_mtx); + while (zrl->zr_refcount == ZRL_LOCKED) { + cv_wait(&zrl->zr_cv, &zrl->zr_mtx); + } + mutex_exit(&zrl->zr_mtx); + } +} + +void +zrl_remove(zrlock_t *zrl) +{ + uint32_t n; + +#ifdef ZFS_DEBUG + if (zrl->zr_owner == curthread) { + zrl->zr_owner = NULL; + zrl->zr_caller = NULL; + } +#endif + n = atomic_dec_32_nv((uint32_t *)&zrl->zr_refcount); + ASSERT3S((int32_t)n, >=, 0); +} + +int +zrl_tryenter(zrlock_t *zrl) +{ + uint32_t n = (uint32_t)zrl->zr_refcount; + + if (n == 0) { + uint32_t cas = atomic_cas_32( + (uint32_t *)&zrl->zr_refcount, 0, ZRL_LOCKED); + if (cas == 0) { +#ifdef ZFS_DEBUG + ASSERT3P(zrl->zr_owner, ==, NULL); + zrl->zr_owner = curthread; +#endif + return (1); + } + } + + ASSERT3S((int32_t)n, >, ZRL_DESTROYED); + + return (0); +} + +void +zrl_exit(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, ==, ZRL_LOCKED); + + mutex_enter(&zrl->zr_mtx); +#ifdef ZFS_DEBUG + ASSERT3P(zrl->zr_owner, ==, curthread); + zrl->zr_owner = NULL; + membar_producer(); /* make sure the owner store happens first */ +#endif + zrl->zr_refcount = 0; + cv_broadcast(&zrl->zr_cv); + mutex_exit(&zrl->zr_mtx); +} + +int +zrl_is_zero(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); + + return (zrl->zr_refcount <= 0); +} + +int +zrl_is_locked(zrlock_t *zrl) +{ + ASSERT3S(zrl->zr_refcount, >, ZRL_DESTROYED); + + return (zrl->zr_refcount == ZRL_LOCKED); +} + +#ifdef ZFS_DEBUG +kthread_t * +zrl_owner(zrlock_t *zrl) +{ + return (zrl->zr_owner); +} +#endif + +#if defined(_KERNEL) + +EXPORT_SYMBOL(zrl_add_impl); +EXPORT_SYMBOL(zrl_remove); + +#endif diff --git a/module/zfs/zthr.c b/module/zfs/zthr.c new file mode 100644 index 000000000000..fdc4b863382c --- /dev/null +++ b/module/zfs/zthr.c @@ -0,0 +1,536 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2017, 2020 by Delphix. All rights reserved. + */ + +/* + * ZTHR Infrastructure + * =================== + * + * ZTHR threads are used for isolated operations that span multiple txgs + * within a SPA. They generally exist from SPA creation/loading and until + * the SPA is exported/destroyed. 
The ideal requirements for an operation
+ * to be modeled with a zthr are the following:
+ *
+ * 1] The operation needs to run over multiple txgs.
+ * 2] There should be a single point of reference in memory or on disk that
+ *    indicates whether the operation should run/is running or has
+ *    stopped.
+ *
+ * If the operation satisfies the above then the following rules guarantee
+ * a certain level of correctness:
+ *
+ * 1] Any thread EXCEPT the zthr changes the work indicator from stopped
+ *    to running but not the opposite.
+ * 2] Only the zthr can change the work indicator from running to stopped
+ *    (e.g. when it is done) but not the opposite.
+ *
+ * This way a normal zthr cycle should go like this:
+ *
+ * 1] An external thread changes the work indicator from stopped to
+ *    running and wakes up the zthr.
+ * 2] The zthr wakes up, checks the indicator and starts working.
+ * 3] When the zthr is done, it changes the indicator to stopped, allowing
+ *    a new cycle to start.
+ *
+ * Besides being awakened by other threads, a zthr can be configured
+ * during creation to wake up on its own after a specified interval
+ * [see zthr_create_timer()].
+ *
+ * Note: ZTHR threads are NOT a replacement for generic threads! Please
+ * ensure that they fit your use-case well before using them.
+ *
+ * == ZTHR creation
+ *
+ * Every zthr needs three inputs to start running:
+ *
+ * 1] A user-defined checker function (checkfunc) that decides whether
+ *    the zthr should start working or go to sleep. The function should
+ *    return TRUE when the zthr needs to work or FALSE to let it sleep,
+ *    and should adhere to the following signature:
+ *    boolean_t checkfunc_name(void *args, zthr_t *t);
+ *
+ * 2] A user-defined ZTHR function (func) which the zthr executes when
+ *    it is not sleeping. The function should adhere to the following
+ *    signature type:
+ *    void func_name(void *args, zthr_t *t);
+ *
+ * 3] A void args pointer that will be passed to checkfunc and func
+ *    implicitly by the infrastructure.
+ *
+ * The reason why the above API needs two different functions,
+ * instead of one that both checks and does the work, has to do with
+ * the zthr's internal state lock (zthr_state_lock) and the allowed
+ * cancellation windows. We want to hold the zthr_state_lock while
+ * running checkfunc but not while running func. This way the zthr
+ * can be cancelled while doing work and not while checking for work.
+ *
+ * To start a zthr:
+ *     zthr_t *zthr_pointer = zthr_create(name, checkfunc, func, args);
+ * or
+ *     zthr_t *zthr_pointer = zthr_create_timer(name, checkfunc,
+ *         func, args, max_sleep);
+ *
+ * After that you should be able to wakeup, cancel, and resume the
+ * zthr from another thread using the zthr_pointer.
+ *
+ * NOTE: ZTHR threads could potentially wake up spuriously and the
+ * user should take this into account when writing a checkfunc.
+ * [see ZTHR state transitions]
+ *
+ * == ZTHR wakeup
+ *
+ * ZTHR wakeup should be used when new work is added for the zthr. The
+ * sleeping zthr will wake up, see that it has more work to complete
+ * and proceed. This can be invoked from open or syncing context.
+ *
+ * To wakeup a zthr:
+ *     zthr_wakeup(zthr_t *t)
+ *
+ * == ZTHR cancellation and resumption
+ *
+ * ZTHR threads must be cancelled when their SPA is being exported
+ * or when they need to be paused so they don't interfere with other
+ * operations.
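A minimal consumer of the checkfunc/func contract just described might look like the sketch below. The state structure is hypothetical, and a real consumer would protect the work indicator with its own synchronization; the signatures follow the documentation above and the implementation later in this file.

/* Hypothetical zthr consumer: one flag acts as the work indicator. */
typedef struct my_state {
	boolean_t ms_work_requested;	/* stopped -> running: others only */
} my_state_t;

static boolean_t
my_checkfunc(void *arg, zthr_t *t)
{
	my_state_t *ms = arg;

	return (ms->ms_work_requested);	/* TRUE: run func; FALSE: sleep */
}

static void
my_func(void *arg, zthr_t *t)
{
	my_state_t *ms = arg;

	while (!zthr_iscancelled(t)) {
		/* ... one bounded unit of work per iteration ... */
		break;
	}
	ms->ms_work_requested = B_FALSE;	/* running -> stopped: zthr only */
}

/* Creation: zthr_t *t = zthr_create("my_zthr", my_checkfunc, my_func, &ms); */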
+ * + * To cancel a zthr: + * zthr_cancel(zthr_pointer); + * + * To resume it: + * zthr_resume(zthr_pointer); + * + * ZTHR cancel and resume should be invoked in open context during the + * lifecycle of the pool as it is imported, exported or destroyed. + * + * A zthr will implicitly check if it has received a cancellation + * signal every time func returns and every time it wakes up [see + * ZTHR state transitions below]. + * + * At times, waiting for the zthr's func to finish its job may take + * time. This may be very time-consuming for some operations that + * need to cancel the SPA's zthrs (e.g spa_export). For this scenario + * the user can explicitly make their ZTHR function aware of incoming + * cancellation signals using zthr_iscancelled(). A common pattern for + * that looks like this: + * + * int + * func_name(void *args, zthr_t *t) + * { + * ... ... + * while (!work_done && !zthr_iscancelled(t)) { + * ... ... + * } + * } + * + * == ZTHR cleanup + * + * Cancelling a zthr doesn't clean up its metadata (internal locks, + * function pointers to func and checkfunc, etc..). This is because + * we want to keep them around in case we want to resume the execution + * of the zthr later. Similarly for zthrs that exit themselves. + * + * To completely cleanup a zthr, cancel it first to ensure that it + * is not running and then use zthr_destroy(). + * + * == ZTHR state transitions + * + * zthr creation + * + + * | + * | woke up + * | +--------------+ sleep + * | | ^ + * | | | + * | | | FALSE + * | | | + * v v FALSE + + * cancelled? +---------> checkfunc? + * + ^ + + * | | | + * | | | TRUE + * | | | + * | | func returned v + * | +---------------+ func + * | + * | TRUE + * | + * v + * zthr stopped running + * + * == Implementation of ZTHR requests + * + * ZTHR cancel and resume are requests on a zthr to change its + * internal state. These requests are serialized using the + * zthr_request_lock, while changes in its internal state are + * protected by the zthr_state_lock. A request will first acquire + * the zthr_request_lock and then immediately acquire the + * zthr_state_lock. We do this so that incoming requests are + * serialized using the request lock, while still allowing us + * to use the state lock for thread communication via zthr_cv. + * + * ZTHR wakeup broadcasts to zthr_cv, causing sleeping threads + * to wakeup. It acquires the zthr_state_lock but not the + * zthr_request_lock, so that a wakeup on a zthr in the middle + * of being cancelled will not block. + */ + +#include +#include + +struct zthr { + /* running thread doing the work */ + kthread_t *zthr_thread; + + /* lock protecting internal data & invariants */ + kmutex_t zthr_state_lock; + + /* mutex that serializes external requests */ + kmutex_t zthr_request_lock; + + /* notification mechanism for requests */ + kcondvar_t zthr_cv; + + /* flag set to true if we are canceling the zthr */ + boolean_t zthr_cancel; + + /* flag set to true if we are waiting for the zthr to finish */ + boolean_t zthr_haswaiters; + kcondvar_t zthr_wait_cv; + /* + * maximum amount of time that the zthr is spent sleeping; + * if this is 0, the thread doesn't wake up until it gets + * signaled. 
+ */ + hrtime_t zthr_sleep_timeout; + + /* consumer-provided callbacks & data */ + zthr_checkfunc_t *zthr_checkfunc; + zthr_func_t *zthr_func; + void *zthr_arg; +}; + +static void +zthr_procedure(void *arg) +{ + zthr_t *t = arg; + + mutex_enter(&t->zthr_state_lock); + ASSERT3P(t->zthr_thread, ==, curthread); + + while (!t->zthr_cancel) { + if (t->zthr_checkfunc(t->zthr_arg, t)) { + mutex_exit(&t->zthr_state_lock); + t->zthr_func(t->zthr_arg, t); + mutex_enter(&t->zthr_state_lock); + } else { + /* + * cv_wait_sig() is used instead of cv_wait() in + * order to prevent this process from incorrectly + * contributing to the system load average when idle. + */ + if (t->zthr_sleep_timeout == 0) { + cv_wait_sig(&t->zthr_cv, &t->zthr_state_lock); + } else { + (void) cv_timedwait_sig_hires(&t->zthr_cv, + &t->zthr_state_lock, t->zthr_sleep_timeout, + MSEC2NSEC(1), 0); + } + } + if (t->zthr_haswaiters) { + t->zthr_haswaiters = B_FALSE; + cv_broadcast(&t->zthr_wait_cv); + } + } + + /* + * Clear out the kernel thread metadata and notify the + * zthr_cancel() thread that we've stopped running. + */ + t->zthr_thread = NULL; + t->zthr_cancel = B_FALSE; + cv_broadcast(&t->zthr_cv); + + mutex_exit(&t->zthr_state_lock); + thread_exit(); +} + +zthr_t * +zthr_create(const char *zthr_name, zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg) +{ + return (zthr_create_timer(zthr_name, checkfunc, + func, arg, (hrtime_t)0)); +} + +/* + * Create a zthr with specified maximum sleep time. If the time + * in sleeping state exceeds max_sleep, a wakeup(do the check and + * start working if required) will be triggered. + */ +zthr_t * +zthr_create_timer(const char *zthr_name, zthr_checkfunc_t *checkfunc, + zthr_func_t *func, void *arg, hrtime_t max_sleep) +{ + zthr_t *t = kmem_zalloc(sizeof (*t), KM_SLEEP); + mutex_init(&t->zthr_state_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&t->zthr_request_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&t->zthr_cv, NULL, CV_DEFAULT, NULL); + cv_init(&t->zthr_wait_cv, NULL, CV_DEFAULT, NULL); + + mutex_enter(&t->zthr_state_lock); + t->zthr_checkfunc = checkfunc; + t->zthr_func = func; + t->zthr_arg = arg; + t->zthr_sleep_timeout = max_sleep; + + t->zthr_thread = thread_create_named(zthr_name, NULL, 0, + zthr_procedure, t, 0, &p0, TS_RUN, minclsyspri); + + mutex_exit(&t->zthr_state_lock); + + return (t); +} + +void +zthr_destroy(zthr_t *t) +{ + ASSERT(!MUTEX_HELD(&t->zthr_state_lock)); + ASSERT(!MUTEX_HELD(&t->zthr_request_lock)); + VERIFY3P(t->zthr_thread, ==, NULL); + mutex_destroy(&t->zthr_request_lock); + mutex_destroy(&t->zthr_state_lock); + cv_destroy(&t->zthr_cv); + cv_destroy(&t->zthr_wait_cv); + kmem_free(t, sizeof (*t)); +} + +/* + * Wake up the zthr if it is sleeping. If the thread has been cancelled + * or is in the process of being cancelled, this is a no-op. + */ +void +zthr_wakeup(zthr_t *t) +{ + mutex_enter(&t->zthr_state_lock); + + /* + * There are 5 states that we can find the zthr when issuing + * this broadcast: + * + * [1] The common case of the thread being asleep, at which + * point the broadcast will wake it up. + * [2] The thread has been cancelled. Waking up a cancelled + * thread is a no-op. Any work that is still left to be + * done should be handled the next time the thread is + * resumed. + * [3] The thread is doing work and is already up, so this + * is basically a no-op. + * [4] The thread was just created/resumed, in which case the + * behavior is similar to [3]. + * [5] The thread is in the middle of being cancelled, which + * will be a no-op. 
+ */ + cv_broadcast(&t->zthr_cv); + + mutex_exit(&t->zthr_state_lock); +} + +/* + * Sends a cancel request to the zthr and blocks until the zthr is + * cancelled. If the zthr is not running (e.g. has been cancelled + * already), this is a no-op. Note that this function should not be + * called from syncing context as it could deadlock with the zthr_func. + */ +void +zthr_cancel(zthr_t *t) +{ + mutex_enter(&t->zthr_request_lock); + mutex_enter(&t->zthr_state_lock); + + /* + * Since we are holding the zthr_state_lock at this point + * we can find the state in one of the following 4 states: + * + * [1] The thread has already been cancelled, therefore + * there is nothing for us to do. + * [2] The thread is sleeping so we set the flag, broadcast + * the CV and wait for it to exit. + * [3] The thread is doing work, in which case we just set + * the flag and wait for it to finish. + * [4] The thread was just created/resumed, in which case + * the behavior is similar to [3]. + * + * Since requests are serialized, by the time that we get + * control back we expect that the zthr is cancelled and + * not running anymore. + */ + if (t->zthr_thread != NULL) { + t->zthr_cancel = B_TRUE; + + /* broadcast in case the zthr is sleeping */ + cv_broadcast(&t->zthr_cv); + + while (t->zthr_thread != NULL) + cv_wait(&t->zthr_cv, &t->zthr_state_lock); + + ASSERT(!t->zthr_cancel); + } + + mutex_exit(&t->zthr_state_lock); + mutex_exit(&t->zthr_request_lock); +} + +/* + * Sends a resume request to the supplied zthr. If the zthr is already + * running this is a no-op. Note that this function should not be + * called from syncing context as it could deadlock with the zthr_func. + */ +void +zthr_resume(zthr_t *t) +{ + mutex_enter(&t->zthr_request_lock); + mutex_enter(&t->zthr_state_lock); + + ASSERT3P(&t->zthr_checkfunc, !=, NULL); + ASSERT3P(&t->zthr_func, !=, NULL); + ASSERT(!t->zthr_cancel); + ASSERT(!t->zthr_haswaiters); + + /* + * There are 4 states that we find the zthr in at this point + * given the locks that we hold: + * + * [1] The zthr was cancelled, so we spawn a new thread for + * the zthr (common case). + * [2] The zthr is running at which point this is a no-op. + * [3] The zthr is sleeping at which point this is a no-op. + * [4] The zthr was just spawned at which point this is a + * no-op. + */ + if (t->zthr_thread == NULL) { + t->zthr_thread = thread_create(NULL, 0, zthr_procedure, t, + 0, &p0, TS_RUN, minclsyspri); + } + + mutex_exit(&t->zthr_state_lock); + mutex_exit(&t->zthr_request_lock); +} + +/* + * This function is intended to be used by the zthr itself + * (specifically the zthr_func callback provided) to check + * if another thread has signaled it to stop running before + * doing some expensive operation. + * + * returns TRUE if we are in the middle of trying to cancel + * this thread. + * + * returns FALSE otherwise. + */ +boolean_t +zthr_iscancelled(zthr_t *t) +{ + ASSERT3P(t->zthr_thread, ==, curthread); + + /* + * The majority of the functions here grab zthr_request_lock + * first and then zthr_state_lock. This function only grabs + * the zthr_state_lock. That is because this function should + * only be called from the zthr_func to check if someone has + * issued a zthr_cancel() on the thread. If there is a zthr_cancel() + * happening concurrently, attempting to grab the request lock + * here would result in a deadlock. + * + * By grabbing only the zthr_state_lock this function is allowed + * to run concurrently with a zthr_cancel() request. 
+ */
+    mutex_enter(&t->zthr_state_lock);
+    boolean_t cancelled = t->zthr_cancel;
+    mutex_exit(&t->zthr_state_lock);
+    return (cancelled);
+}
+
+/*
+ * Wait for the zthr to finish its current function. Similar to
+ * zthr_iscancelled, you can use zthr_has_waiters to have the zthr_func end
+ * early. Unlike zthr_cancel, the thread is not destroyed. If the zthr was
+ * sleeping or cancelled, return immediately.
+ */
+void
+zthr_wait_cycle_done(zthr_t *t)
+{
+    mutex_enter(&t->zthr_state_lock);
+
+    /*
+     * Since we are holding the zthr_state_lock at this point
+     * we can find the state in one of the following 5 states:
+     *
+     * [1] The thread has already been cancelled, therefore
+     *     there is nothing for us to do.
+     * [2] The thread is sleeping so we set the flag, broadcast
+     *     the CV and wait for it to exit.
+     * [3] The thread is doing work, in which case we just set
+     *     the flag and wait for it to finish.
+     * [4] The thread was just created/resumed, in which case
+     *     the behavior is similar to [3].
+     * [5] The thread is in the middle of being cancelled, which is
+     *     similar to [3]. We'll wait for the cancel, which is
+     *     waiting for the zthr func.
+     *
+     * Since requests are serialized, by the time that we get
+     * control back we expect that the zthr has completed its
+     * zthr_func.
+     */
+    if (t->zthr_thread != NULL) {
+        t->zthr_haswaiters = B_TRUE;
+
+        /* broadcast in case the zthr is sleeping */
+        cv_broadcast(&t->zthr_cv);
+
+        while ((t->zthr_haswaiters) && (t->zthr_thread != NULL))
+            cv_wait(&t->zthr_wait_cv, &t->zthr_state_lock);
+
+        ASSERT(!t->zthr_haswaiters);
+    }
+
+    mutex_exit(&t->zthr_state_lock);
+}
+
+/*
+ * This function is intended to be used by the zthr itself
+ * to check if another thread is waiting on it to finish.
+ *
+ * returns TRUE if we have been asked to finish.
+ *
+ * returns FALSE otherwise.
+ */
+boolean_t
+zthr_has_waiters(zthr_t *t)
+{
+    ASSERT3P(t->zthr_thread, ==, curthread);
+
+    mutex_enter(&t->zthr_state_lock);
+
+    /*
+     * Similarly to zthr_iscancelled(), we only grab the
+     * zthr_state_lock so that the zthr itself can use this
+     * to check for the request.
+     */
+    boolean_t has_waiters = t->zthr_haswaiters;
+    mutex_exit(&t->zthr_state_lock);
+    return (has_waiters);
+}
diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c
new file mode 100644
index 000000000000..2b20b02e4942
--- /dev/null
+++ b/module/zfs/zvol.c
@@ -0,0 +1,1729 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright (C) 2008-2010 Lawrence Livermore National Security, LLC.
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
+ * Rewritten for Linux by Brian Behlendorf .
+ * LLNL-CODE-403049.
+ *
+ * ZFS volume emulation driver.
+ * + * Makes a DMU object look like a volume of arbitrary size, up to 2^64 bytes. + * Volumes are accessed through the symbolic links named: + * + * /dev// + * + * Volumes are persistent through reboot and module load. No user command + * needs to be run before opening and using a device. + * + * Copyright 2014 Nexenta Systems, Inc. All rights reserved. + * Copyright (c) 2016 Actifio, Inc. All rights reserved. + * Copyright (c) 2012, 2019 by Delphix. All rights reserved. + */ + +/* + * Note on locking of zvol state structures. + * + * These structures are used to maintain internal state used to emulate block + * devices on top of zvols. In particular, management of device minor number + * operations - create, remove, rename, and set_snapdev - involves access to + * these structures. The zvol_state_lock is primarily used to protect the + * zvol_state_list. The zv->zv_state_lock is used to protect the contents + * of the zvol_state_t structures, as well as to make sure that when the + * time comes to remove the structure from the list, it is not in use, and + * therefore, it can be taken off zvol_state_list and freed. + * + * The zv_suspend_lock was introduced to allow for suspending I/O to a zvol, + * e.g. for the duration of receive and rollback operations. This lock can be + * held for significant periods of time. Given that it is undesirable to hold + * mutexes for long periods of time, the following lock ordering applies: + * - take zvol_state_lock if necessary, to protect zvol_state_list + * - take zv_suspend_lock if necessary, by the code path in question + * - take zv_state_lock to protect zvol_state_t + * + * The minor operations are issued to spa->spa_zvol_taskq queues, that are + * single-threaded (to preserve order of minor operations), and are executed + * through the zvol_task_cb that dispatches the specific operations. Therefore, + * these operations are serialized per pool. Consequently, we can be certain + * that for a given zvol, there is only one operation at a time in progress. + * That is why one can be sure that first, zvol_state_t for a given zvol is + * allocated and placed on zvol_state_list, and then other minor operations + * for this zvol are going to proceed in the order of issue. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + + +unsigned int zvol_inhibit_dev = 0; +unsigned int zvol_volmode = ZFS_VOLMODE_GEOM; + +struct hlist_head *zvol_htable; +list_t zvol_state_list; +krwlock_t zvol_state_lock; +const zvol_platform_ops_t *ops; + +typedef enum { + ZVOL_ASYNC_REMOVE_MINORS, + ZVOL_ASYNC_RENAME_MINORS, + ZVOL_ASYNC_SET_SNAPDEV, + ZVOL_ASYNC_SET_VOLMODE, + ZVOL_ASYNC_MAX +} zvol_async_op_t; + +typedef struct { + zvol_async_op_t op; + char pool[MAXNAMELEN]; + char name1[MAXNAMELEN]; + char name2[MAXNAMELEN]; + zprop_source_t source; + uint64_t value; +} zvol_task_t; + +uint64_t +zvol_name_hash(const char *name) +{ + int i; + uint64_t crc = -1ULL; + const uint8_t *p = (const uint8_t *)name; + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + for (i = 0; i < MAXNAMELEN - 1 && *p; i++, p++) { + crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (*p)) & 0xFF]; + } + return (crc); +} + +/* + * Find a zvol_state_t given the name and hash generated by zvol_name_hash. + * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, + * return (NULL) without the taking locks. The zv_suspend_lock is always taken + * before zv_state_lock. 
The mode argument indicates the mode (including none) + * for zv_suspend_lock to be taken. + */ +zvol_state_t * +zvol_find_by_name_hash(const char *name, uint64_t hash, int mode) +{ + zvol_state_t *zv; + struct hlist_node *p = NULL; + + rw_enter(&zvol_state_lock, RW_READER); + hlist_for_each(p, ZVOL_HT_HEAD(hash)) { + zv = hlist_entry(p, zvol_state_t, zv_hlink); + mutex_enter(&zv->zv_state_lock); + if (zv->zv_hash == hash && + strncmp(zv->zv_name, name, MAXNAMELEN) == 0) { + /* + * this is the right zvol, take the locks in the + * right order + */ + if (mode != RW_NONE && + !rw_tryenter(&zv->zv_suspend_lock, mode)) { + mutex_exit(&zv->zv_state_lock); + rw_enter(&zv->zv_suspend_lock, mode); + mutex_enter(&zv->zv_state_lock); + /* + * zvol cannot be renamed as we continue + * to hold zvol_state_lock + */ + ASSERT(zv->zv_hash == hash && + strncmp(zv->zv_name, name, MAXNAMELEN) + == 0); + } + rw_exit(&zvol_state_lock); + return (zv); + } + mutex_exit(&zv->zv_state_lock); + } + rw_exit(&zvol_state_lock); + + return (NULL); +} + +/* + * Find a zvol_state_t given the name. + * If found, return with zv_suspend_lock and zv_state_lock taken, otherwise, + * return (NULL) without the taking locks. The zv_suspend_lock is always taken + * before zv_state_lock. The mode argument indicates the mode (including none) + * for zv_suspend_lock to be taken. + */ +static zvol_state_t * +zvol_find_by_name(const char *name, int mode) +{ + return (zvol_find_by_name_hash(name, zvol_name_hash(name), mode)); +} + +/* + * ZFS_IOC_CREATE callback handles dmu zvol and zap object creation. + */ +void +zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +{ + zfs_creat_t *zct = arg; + nvlist_t *nvprops = zct->zct_props; + int error; + uint64_t volblocksize, volsize; + + VERIFY(nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE), &volsize) == 0); + if (nvlist_lookup_uint64(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &volblocksize) != 0) + volblocksize = zfs_prop_default_numeric(ZFS_PROP_VOLBLOCKSIZE); + + /* + * These properties must be removed from the list so the generic + * property setting step won't apply to them. + */ + VERIFY(nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLSIZE)) == 0); + (void) nvlist_remove_all(nvprops, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE)); + + error = dmu_object_claim(os, ZVOL_OBJ, DMU_OT_ZVOL, volblocksize, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_create_claim(os, ZVOL_ZAP_OBJ, DMU_OT_ZVOL_PROP, + DMU_OT_NONE, 0, tx); + ASSERT(error == 0); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); + ASSERT(error == 0); +} + +/* + * ZFS_IOC_OBJSET_STATS entry point. + */ +int +zvol_get_stats(objset_t *os, nvlist_t *nv) +{ + int error; + dmu_object_info_t *doi; + uint64_t val; + + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); + if (error) + return (SET_ERROR(error)); + + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLSIZE, val); + doi = kmem_alloc(sizeof (dmu_object_info_t), KM_SLEEP); + error = dmu_object_info(os, ZVOL_OBJ, doi); + + if (error == 0) { + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_VOLBLOCKSIZE, + doi->doi_data_block_size); + } + + kmem_free(doi, sizeof (dmu_object_info_t)); + + return (SET_ERROR(error)); +} + +/* + * Sanity check volume size. 
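The try-lock back-off in zvol_find_by_name_hash() above is a general pattern for taking locks against their required ordering. Reduced to its skeleton, with hypothetical locks a and b where the established order is b before a:

/*
 * Caller holds a but needs b, and the lock order is b -> a.
 * Try b first; on failure, drop a, block for b, then re-take a.
 * Any state read under a before the drop must be re-validated.
 */
static void
lock_b_then_a(kmutex_t *a, krwlock_t *b, krw_t mode)
{
	if (!rw_tryenter(b, mode)) {
		mutex_exit(a);		/* avoid lock-order inversion */
		rw_enter(b, mode);	/* may block */
		mutex_enter(a);		/* re-acquire; recheck state! */
	}
}

In zvol_find_by_name_hash() the re-validation step is the ASSERT that the zvol's hash and name are unchanged, which holds because zvol_state_lock is kept across the dance and prevents renames.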
+ */ +int +zvol_check_volsize(uint64_t volsize, uint64_t blocksize) +{ + if (volsize == 0) + return (SET_ERROR(EINVAL)); + + if (volsize % blocksize != 0) + return (SET_ERROR(EINVAL)); + +#ifdef _ILP32 + if (volsize - 1 > SPEC_MAXOFFSET_T) + return (SET_ERROR(EOVERFLOW)); +#endif + return (0); +} + +/* + * Ensure the zap is flushed then inform the VFS of the capacity change. + */ +static int +zvol_update_volsize(uint64_t volsize, objset_t *os) +{ + dmu_tx_t *tx; + int error; + uint64_t txg; + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_mark_netfree(tx); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (SET_ERROR(error)); + } + txg = dmu_tx_get_txg(tx); + + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, + &volsize, tx); + dmu_tx_commit(tx); + + txg_wait_synced(dmu_objset_pool(os), txg); + + if (error == 0) + error = dmu_free_long_range(os, + ZVOL_OBJ, volsize, DMU_OBJECT_END); + + return (error); +} + +/* + * Set ZFS_PROP_VOLSIZE set entry point. Note that modifying the volume + * size will result in a udev "change" event being generated. + */ +int +zvol_set_volsize(const char *name, uint64_t volsize) +{ + objset_t *os = NULL; + uint64_t readonly; + int error; + boolean_t owned = B_FALSE; + + error = dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_READONLY), &readonly, NULL); + if (error != 0) + return (SET_ERROR(error)); + if (readonly) + return (SET_ERROR(EROFS)); + + zvol_state_t *zv = zvol_find_by_name(name, RW_READER); + + ASSERT(zv == NULL || (MUTEX_HELD(&zv->zv_state_lock) && + RW_READ_HELD(&zv->zv_suspend_lock))); + + if (zv == NULL || zv->zv_objset == NULL) { + if (zv != NULL) + rw_exit(&zv->zv_suspend_lock); + if ((error = dmu_objset_own(name, DMU_OST_ZVOL, B_FALSE, B_TRUE, + FTAG, &os)) != 0) { + if (zv != NULL) + mutex_exit(&zv->zv_state_lock); + return (SET_ERROR(error)); + } + owned = B_TRUE; + if (zv != NULL) + zv->zv_objset = os; + } else { + os = zv->zv_objset; + } + + dmu_object_info_t *doi = kmem_alloc(sizeof (*doi), KM_SLEEP); + + if ((error = dmu_object_info(os, ZVOL_OBJ, doi)) || + (error = zvol_check_volsize(volsize, doi->doi_data_block_size))) + goto out; + + error = zvol_update_volsize(volsize, os); + if (error == 0 && zv != NULL) { + zv->zv_volsize = volsize; + zv->zv_changed = 1; + } +out: + kmem_free(doi, sizeof (dmu_object_info_t)); + + if (owned) { + dmu_objset_disown(os, B_TRUE, FTAG); + if (zv != NULL) + zv->zv_objset = NULL; + } else { + rw_exit(&zv->zv_suspend_lock); + } + + if (zv != NULL) + mutex_exit(&zv->zv_state_lock); + + if (error == 0 && zv != NULL) + ops->zv_update_volsize(zv, volsize); + + return (SET_ERROR(error)); +} + +/* + * Sanity check volume block size. + */ +int +zvol_check_volblocksize(const char *name, uint64_t volblocksize) +{ + /* Record sizes above 128k need the feature to be enabled */ + if (volblocksize > SPA_OLD_MAXBLOCKSIZE) { + spa_t *spa; + int error; + + if ((error = spa_open(name, &spa, FTAG)) != 0) + return (error); + + if (!spa_feature_is_enabled(spa, SPA_FEATURE_LARGE_BLOCKS)) { + spa_close(spa, FTAG); + return (SET_ERROR(ENOTSUP)); + } + + /* + * We don't allow setting the property above 1MB, + * unless the tunable has been changed. + */ + if (volblocksize > zfs_max_recordsize) + return (SET_ERROR(EDOM)); + + spa_close(spa, FTAG); + } + + if (volblocksize < SPA_MINBLOCKSIZE || + volblocksize > SPA_MAXBLOCKSIZE || + !ISP2(volblocksize)) + return (SET_ERROR(EDOM)); + + return (0); +} + +/* + * Set ZFS_PROP_VOLBLOCKSIZE set entry point. 
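The validity rule enforced by zvol_check_volblocksize() above reduces to a small predicate: the size must be a power of two within the on-disk bounds. The sketch below is a standalone restatement; 512 and 16M are the usual values of SPA_MINBLOCKSIZE and SPA_MAXBLOCKSIZE, stated here as assumptions, and the large-blocks feature check is omitted.

#include <stdint.h>

#define MODEL_MINBLOCK	512ULL		/* SPA_MINBLOCKSIZE (assumed) */
#define MODEL_MAXBLOCK	(16ULL << 20)	/* SPA_MAXBLOCKSIZE (assumed) */

static int
volblocksize_valid(uint64_t b)
{
	/* (b & (b - 1)) == 0 is the ISP2() power-of-two test */
	return (b >= MODEL_MINBLOCK && b <= MODEL_MAXBLOCK &&
	    (b & (b - 1)) == 0);
}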
+ */ +int +zvol_set_volblocksize(const char *name, uint64_t volblocksize) +{ + zvol_state_t *zv; + dmu_tx_t *tx; + int error; + + zv = zvol_find_by_name(name, RW_READER); + + if (zv == NULL) + return (SET_ERROR(ENXIO)); + + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + + if (zv->zv_flags & ZVOL_RDONLY) { + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + return (SET_ERROR(EROFS)); + } + + tx = dmu_tx_create(zv->zv_objset); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, + volblocksize, 0, tx); + if (error == ENOTSUP) + error = SET_ERROR(EBUSY); + dmu_tx_commit(tx); + if (error == 0) + zv->zv_volblocksize = volblocksize; + } + + mutex_exit(&zv->zv_state_lock); + rw_exit(&zv->zv_suspend_lock); + + return (SET_ERROR(error)); +} + +/* + * Replay a TX_TRUNCATE ZIL transaction if asked. TX_TRUNCATE is how we + * implement DKIOCFREE/free-long-range. + */ +static int +zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_truncate_t *lr = arg2; + uint64_t offset, length; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + return (dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, offset, length)); +} + +/* + * Replay a TX_WRITE ZIL transaction that didn't get committed + * after a system failure + */ +static int +zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) +{ + zvol_state_t *zv = arg1; + lr_write_t *lr = arg2; + objset_t *os = zv->zv_objset; + char *data = (char *)(lr + 1); /* data follows lr_write_t */ + uint64_t offset, length; + dmu_tx_t *tx; + int error; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + dmu_write(os, ZVOL_OBJ, offset, length, data, tx); + dmu_tx_commit(tx); + } + + return (error); +} + +static int +zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) +{ + return (SET_ERROR(ENOTSUP)); +} + +/* + * Callback vectors for replaying records. + * Only TX_WRITE and TX_TRUNCATE are needed for zvol. + */ +zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { + zvol_replay_err, /* no such transaction type */ + zvol_replay_err, /* TX_CREATE */ + zvol_replay_err, /* TX_MKDIR */ + zvol_replay_err, /* TX_MKXATTR */ + zvol_replay_err, /* TX_SYMLINK */ + zvol_replay_err, /* TX_REMOVE */ + zvol_replay_err, /* TX_RMDIR */ + zvol_replay_err, /* TX_LINK */ + zvol_replay_err, /* TX_RENAME */ + zvol_replay_write, /* TX_WRITE */ + zvol_replay_truncate, /* TX_TRUNCATE */ + zvol_replay_err, /* TX_SETATTR */ + zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ATTR */ + zvol_replay_err, /* TX_CREATE_ACL_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL */ + zvol_replay_err, /* TX_MKDIR_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ + zvol_replay_err, /* TX_WRITE2 */ +}; + +/* + * zvol_log_write() handles synchronous writes using TX_WRITE ZIL transactions. 
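+ *
+ * Each write lands in one of three itx write states, chosen below:
+ * WR_INDIRECT (the log record will point at the data block that
+ * dmu_sync() writes out), WR_COPIED (the data is copied into the itx
+ * right away), or WR_NEED_COPY (the data is copied later, when the
+ * itx is committed).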
+ * + * We store data in the log buffers if it's small enough. + * Otherwise we will later flush the data out via dmu_sync(). + */ +ssize_t zvol_immediate_write_sz = 32768; + +void +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, uint64_t offset, + uint64_t size, int sync) +{ + uint32_t blocksize = zv->zv_volblocksize; + zilog_t *zilog = zv->zv_zilog; + itx_wr_state_t write_state; + + if (zil_replaying(zilog, tx)) + return; + + if (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + write_state = WR_INDIRECT; + else if (!spa_has_slogs(zilog->zl_spa) && + size >= blocksize && blocksize > zvol_immediate_write_sz) + write_state = WR_INDIRECT; + else if (sync) + write_state = WR_COPIED; + else + write_state = WR_NEED_COPY; + + while (size) { + itx_t *itx; + lr_write_t *lr; + itx_wr_state_t wr_state = write_state; + ssize_t len = size; + + if (wr_state == WR_COPIED && size > zil_max_copied_data(zilog)) + wr_state = WR_NEED_COPY; + else if (wr_state == WR_INDIRECT) + len = MIN(blocksize - P2PHASE(offset, blocksize), size); + + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (wr_state == WR_COPIED ? len : 0)); + lr = (lr_write_t *)&itx->itx_lr; + if (wr_state == WR_COPIED && dmu_read_by_dnode(zv->zv_dn, + offset, len, lr+1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + wr_state = WR_NEED_COPY; + } + + itx->itx_wr_state = wr_state; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = offset; + lr->lr_length = len; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + itx->itx_private = zv; + itx->itx_sync = sync; + + (void) zil_itx_assign(zilog, itx, tx); + + offset += len; + size -= len; + } +} + +/* + * Log a DKIOCFREE/free-long-range to the ZIL with TX_TRUNCATE. + */ +void +zvol_log_truncate(zvol_state_t *zv, dmu_tx_t *tx, uint64_t off, uint64_t len, + boolean_t sync) +{ + itx_t *itx; + lr_truncate_t *lr; + zilog_t *zilog = zv->zv_zilog; + + if (zil_replaying(zilog, tx)) + return; + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + lr = (lr_truncate_t *)&itx->itx_lr; + lr->lr_foid = ZVOL_OBJ; + lr->lr_offset = off; + lr->lr_length = len; + + itx->itx_sync = sync; + zil_itx_assign(zilog, itx, tx); +} + + +/* ARGSUSED */ +static void +zvol_get_done(zgd_t *zgd, int error) +{ + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_rangelock_exit(zgd->zgd_lr); + + kmem_free(zgd, sizeof (zgd_t)); +} + +/* + * Get data to generate a TX_WRITE intent log record. + */ +int +zvol_get_data(void *arg, lr_write_t *lr, char *buf, struct lwb *lwb, zio_t *zio) +{ + zvol_state_t *zv = arg; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + dmu_buf_t *db; + zgd_t *zgd; + int error; + + ASSERT3P(lwb, !=, NULL); + ASSERT3P(zio, !=, NULL); + ASSERT3U(size, !=, 0); + + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_lwb = lwb; + + /* + * Write records come in two flavors: immediate and indirect. + * For small writes it's cheaper to store the data with the + * log record (immediate); for large writes it's cheaper to + * sync the data and get a pointer to it (indirect) so that + * we don't have to write the data twice. 
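+ *
+ * In the indirect case dmu_sync() is responsible for filling in
+ * lr_blkptr; zvol_get_done() then drops the range lock and releases
+ * the dbuf.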
+ */
+	if (buf != NULL) {	/* immediate write */
+		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
+		    size, RL_READER);
+		error = dmu_read_by_dnode(zv->zv_dn, offset, size, buf,
+		    DMU_READ_NO_PREFETCH);
+	} else {	/* indirect write */
+		/*
+		 * Have to lock the whole block to ensure that no one can
+		 * change the data while it is being written out and its
+		 * checksum is being calculated. Contrary to zfs_get_data(),
+		 * we need not re-check the blocksize after we get the lock
+		 * because it cannot be changed.
+		 */
+		size = zv->zv_volblocksize;
+		offset = P2ALIGN_TYPED(offset, size, uint64_t);
+		zgd->zgd_lr = zfs_rangelock_enter(&zv->zv_rangelock, offset,
+		    size, RL_READER);
+		error = dmu_buf_hold_by_dnode(zv->zv_dn, offset, zgd, &db,
+		    DMU_READ_NO_PREFETCH);
+		if (error == 0) {
+			blkptr_t *bp = &lr->lr_blkptr;
+
+			zgd->zgd_db = db;
+			zgd->zgd_bp = bp;
+
+			ASSERT(db != NULL);
+			ASSERT(db->db_offset == offset);
+			ASSERT(db->db_size == size);
+
+			error = dmu_sync(zio, lr->lr_common.lrc_txg,
+			    zvol_get_done, zgd);
+
+			if (error == 0)
+				return (0);
+		}
+	}
+
+	zvol_get_done(zgd, error);
+
+	return (SET_ERROR(error));
+}
+
+/*
+ * The zvol_state_t's are inserted into zvol_state_list and zvol_htable.
+ */
+
+void
+zvol_insert(zvol_state_t *zv)
+{
+	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+	list_insert_head(&zvol_state_list, zv);
+	hlist_add_head(&zv->zv_hlink, ZVOL_HT_HEAD(zv->zv_hash));
+}
+
+/*
+ * Simply remove the zvol from the list of zvols.
+ */
+static void
+zvol_remove(zvol_state_t *zv)
+{
+	ASSERT(RW_WRITE_HELD(&zvol_state_lock));
+	list_remove(&zvol_state_list, zv);
+	hlist_del(&zv->zv_hlink);
+}
+
+/*
+ * Set up zv after we have just taken ownership of zv->zv_objset.
+ */
+static int
+zvol_setup_zv(zvol_state_t *zv)
+{
+	uint64_t volsize;
+	int error;
+	uint64_t ro;
+	objset_t *os = zv->zv_objset;
+
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(RW_LOCK_HELD(&zv->zv_suspend_lock));
+
+	zv->zv_zilog = NULL;
+	zv->zv_flags &= ~ZVOL_WRITTEN_TO;
+
+	error = dsl_prop_get_integer(zv->zv_name, "readonly", &ro, NULL);
+	if (error)
+		return (SET_ERROR(error));
+
+	error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize);
+	if (error)
+		return (SET_ERROR(error));
+
+	error = dnode_hold(os, ZVOL_OBJ, FTAG, &zv->zv_dn);
+	if (error)
+		return (SET_ERROR(error));
+
+	ops->zv_set_capacity(zv, volsize >> 9);
+	zv->zv_volsize = volsize;
+
+	if (ro || dmu_objset_is_snapshot(os) ||
+	    !spa_writeable(dmu_objset_spa(os))) {
+		ops->zv_set_disk_ro(zv, 1);
+		zv->zv_flags |= ZVOL_RDONLY;
+	} else {
+		ops->zv_set_disk_ro(zv, 0);
+		zv->zv_flags &= ~ZVOL_RDONLY;
+	}
+	return (0);
+}
+
+/*
+ * Shut down everything related to zv_objset, except zv_objset itself.
+ * This is the reverse of zvol_setup_zv.
+ */
+static void
+zvol_shutdown_zv(zvol_state_t *zv)
+{
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock) &&
+	    RW_LOCK_HELD(&zv->zv_suspend_lock));
+
+	if (zv->zv_flags & ZVOL_WRITTEN_TO) {
+		ASSERT(zv->zv_zilog != NULL);
+		zil_close(zv->zv_zilog);
+	}
+
+	zv->zv_zilog = NULL;
+
+	dnode_rele(zv->zv_dn, FTAG);
+	zv->zv_dn = NULL;
+
+	/*
+	 * Evict cached data. We must write out any dirty data before
+	 * disowning the dataset.
+	 */
+	if (zv->zv_flags & ZVOL_WRITTEN_TO)
+		txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0);
+	(void) dmu_objset_evict_dbufs(zv->zv_objset);
+}
+
+/*
+ * Return the proper tag for rollback and recv.
+ */
+void *
+zvol_tag(zvol_state_t *zv)
+{
+	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+	return (zv->zv_open_count > 0 ? zv : NULL);
+}
+
+/*
+ * Suspend the zvol for recv and rollback.
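+ * On success the zvol is returned with zv_suspend_lock held as writer;
+ * the lock is dropped by the matching zvol_resume() call.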
+ */
+zvol_state_t *
+zvol_suspend(const char *name)
+{
+	zvol_state_t *zv;
+
+	zv = zvol_find_by_name(name, RW_WRITER);
+
+	if (zv == NULL)
+		return (NULL);
+
+	/* block all I/O, release in zvol_resume. */
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+
+	atomic_inc(&zv->zv_suspend_ref);
+
+	if (zv->zv_open_count > 0)
+		zvol_shutdown_zv(zv);
+
+	/*
+	 * do not hold zv_state_lock across suspend/resume to
+	 * avoid locking up zvol lookups
+	 */
+	mutex_exit(&zv->zv_state_lock);
+
+	/* zv_suspend_lock is released in zvol_resume() */
+	return (zv);
+}
+
+int
+zvol_resume(zvol_state_t *zv)
+{
+	int error = 0;
+
+	ASSERT(RW_WRITE_HELD(&zv->zv_suspend_lock));
+
+	mutex_enter(&zv->zv_state_lock);
+
+	if (zv->zv_open_count > 0) {
+		VERIFY0(dmu_objset_hold(zv->zv_name, zv, &zv->zv_objset));
+		VERIFY3P(zv->zv_objset->os_dsl_dataset->ds_owner, ==, zv);
+		VERIFY(dsl_dataset_long_held(zv->zv_objset->os_dsl_dataset));
+		dmu_objset_rele(zv->zv_objset, zv);
+
+		error = zvol_setup_zv(zv);
+	}
+
+	mutex_exit(&zv->zv_state_lock);
+
+	rw_exit(&zv->zv_suspend_lock);
+	/*
+	 * We need this reference count because we don't hold zvol_state_lock
+	 * while releasing zv_suspend_lock. zvol_remove_minors_impl() thus
+	 * cannot rely on zv_suspend_lock to determine whether it is safe to
+	 * free the zv, because dropping an rwlock is not inherently atomic.
+	 */
+	atomic_dec(&zv->zv_suspend_ref);
+
+	return (SET_ERROR(error));
+}
+
+int
+zvol_first_open(zvol_state_t *zv, boolean_t readonly)
+{
+	objset_t *os;
+	int error, locked = 0;
+	boolean_t ro;
+
+	ASSERT(RW_READ_HELD(&zv->zv_suspend_lock));
+	ASSERT(MUTEX_HELD(&zv->zv_state_lock));
+
+	/*
+	 * In all other cases the spa_namespace_lock is taken before the
+	 * bdev->bd_mutex lock. However, in this case the Linux __blkdev_get()
+	 * function calls fops->open() with the bdev->bd_mutex lock held.
+	 * This deadlock can be easily observed with zvols used as vdevs.
+	 *
+	 * To avoid a potential lock inversion deadlock we preemptively
+	 * try to take the spa_namespace_lock(). Normally it will not
+	 * be contended and this is safe because spa_open_common() handles
+	 * the case where the caller already holds the spa_namespace_lock.
+	 *
+	 * When it is contended we risk a lock inversion if we were to
+	 * block waiting for the lock. Luckily, the __blkdev_get()
+	 * function allows us to return -ERESTARTSYS which will result in
+	 * bdev->bd_mutex being dropped, reacquired, and fops->open() being
+	 * called again. This process can be repeated safely until both
+	 * locks are acquired.
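+ *
+ * The SET_ERROR(EINTR) returned below is what the platform open path is
+ * assumed to translate into that retry (e.g. -ERESTARTSYS on Linux).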
+ */ + if (!mutex_owned(&spa_namespace_lock)) { + locked = mutex_tryenter(&spa_namespace_lock); + if (!locked) + return (SET_ERROR(EINTR)); + } + + ro = (readonly || (strchr(zv->zv_name, '@') != NULL)); + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, ro, B_TRUE, zv, &os); + if (error) + goto out_mutex; + + zv->zv_objset = os; + + error = zvol_setup_zv(zv); + + if (error) { + dmu_objset_disown(os, 1, zv); + zv->zv_objset = NULL; + } + +out_mutex: + if (locked) + mutex_exit(&spa_namespace_lock); + return (SET_ERROR(error)); +} + +void +zvol_last_close(zvol_state_t *zv) +{ + ASSERT(RW_READ_HELD(&zv->zv_suspend_lock)); + ASSERT(MUTEX_HELD(&zv->zv_state_lock)); + + zvol_shutdown_zv(zv); + + dmu_objset_disown(zv->zv_objset, 1, zv); + zv->zv_objset = NULL; +} + +typedef struct minors_job { + list_t *list; + list_node_t link; + /* input */ + char *name; + /* output */ + int error; +} minors_job_t; + +/* + * Prefetch zvol dnodes for the minors_job + */ +static void +zvol_prefetch_minors_impl(void *arg) +{ + minors_job_t *job = arg; + char *dsname = job->name; + objset_t *os = NULL; + + job->error = dmu_objset_own(dsname, DMU_OST_ZVOL, B_TRUE, B_TRUE, + FTAG, &os); + if (job->error == 0) { + dmu_prefetch(os, ZVOL_OBJ, 0, 0, 0, ZIO_PRIORITY_SYNC_READ); + dmu_objset_disown(os, B_TRUE, FTAG); + } +} + +/* + * Mask errors to continue dmu_objset_find() traversal + */ +static int +zvol_create_snap_minor_cb(const char *dsname, void *arg) +{ + minors_job_t *j = arg; + list_t *minors_list = j->list; + const char *name = j->name; + + ASSERT0(MUTEX_HELD(&spa_namespace_lock)); + + /* skip the designated dataset */ + if (name && strcmp(dsname, name) == 0) + return (0); + + /* at this point, the dsname should name a snapshot */ + if (strchr(dsname, '@') == 0) { + dprintf("zvol_create_snap_minor_cb(): " + "%s is not a snapshot name\n", dsname); + } else { + minors_job_t *job; + char *n = kmem_strdup(dsname); + if (n == NULL) + return (0); + + job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP); + job->name = n; + job->list = minors_list; + job->error = 0; + list_insert_tail(minors_list, job); + /* don't care if dispatch fails, because job->error is 0 */ + taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job, + TQ_SLEEP); + } + + return (0); +} + +/* + * Mask errors to continue dmu_objset_find() traversal + */ +static int +zvol_create_minors_cb(const char *dsname, void *arg) +{ + uint64_t snapdev; + int error; + list_t *minors_list = arg; + + ASSERT0(MUTEX_HELD(&spa_namespace_lock)); + + error = dsl_prop_get_integer(dsname, "snapdev", &snapdev, NULL); + if (error) + return (0); + + /* + * Given the name and the 'snapdev' property, create device minor nodes + * with the linkages to zvols/snapshots as needed. + * If the name represents a zvol, create a minor node for the zvol, then + * check if its snapshots are 'visible', and if so, iterate over the + * snapshots and create device minor nodes for those. 
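+ * Snapshot names are not expected here: the traversal that invokes this
+ * callback uses DS_FIND_CHILDREN only, so a name containing '@' is
+ * merely logged.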
+ */
+	if (strchr(dsname, '@') == 0) {
+		minors_job_t *job;
+		char *n = kmem_strdup(dsname);
+		if (n == NULL)
+			return (0);
+
+		job = kmem_alloc(sizeof (minors_job_t), KM_SLEEP);
+		job->name = n;
+		job->list = minors_list;
+		job->error = 0;
+		list_insert_tail(minors_list, job);
+		/* don't care if dispatch fails, because job->error is 0 */
+		taskq_dispatch(system_taskq, zvol_prefetch_minors_impl, job,
+		    TQ_SLEEP);
+
+		if (snapdev == ZFS_SNAPDEV_VISIBLE) {
+			/*
+			 * traverse snapshots only, do not traverse children,
+			 * and skip the 'dsname'
+			 */
+			error = dmu_objset_find(dsname,
+			    zvol_create_snap_minor_cb, (void *)job,
+			    DS_FIND_SNAPSHOTS);
+		}
+	} else {
+		dprintf("zvol_create_minors_cb(): %s is not a zvol name\n",
+		    dsname);
+	}
+
+	return (0);
+}
+
+/*
+ * Create minors for the specified dataset, including children and snapshots.
+ * Pay attention to the 'snapdev' property and iterate over the snapshots
+ * only if they are 'visible'. This approach allows one to assure that the
+ * snapshot metadata is read from disk only if it is needed.
+ *
+ * The name can represent a dataset to be recursively scanned for zvols and
+ * their snapshots, or a single zvol snapshot. If the name represents a
+ * dataset, the scan is performed in two nested stages:
+ * - scan the dataset for zvols, and
+ * - for each zvol, create a minor node, then check if the zvol's snapshots
+ *   are 'visible', and only then iterate over the snapshots if needed
+ *
+ * If the name represents a snapshot, a check is performed if the snapshot is
+ * 'visible' (which also verifies that the parent is a zvol), and if so,
+ * a minor node for that snapshot is created.
+ */
+void
+zvol_create_minors_recursive(const char *name)
+{
+	list_t minors_list;
+	minors_job_t *job;
+
+	if (zvol_inhibit_dev)
+		return;
+
+	/*
+	 * This is the list for prefetch jobs. Whenever we find a match
+	 * during dmu_objset_find(), we insert a minors_job into the list and
+	 * use taskq_dispatch() to prefetch the zvol dnodes in parallel. Note
+	 * that we don't need any locks because all list operations are done
+	 * on the current thread.
+	 *
+	 * We will use this list to do zvol_create_minor_impl after prefetch
+	 * so we don't have to traverse using dmu_objset_find again.
+	 */
+	list_create(&minors_list, sizeof (minors_job_t),
+	    offsetof(minors_job_t, link));
+
+	if (strchr(name, '@') != NULL) {
+		uint64_t snapdev;
+
+		int error = dsl_prop_get_integer(name, "snapdev",
+		    &snapdev, NULL);
+
+		if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE)
+			(void) ops->zv_create_minor(name);
+	} else {
+		fstrans_cookie_t cookie = spl_fstrans_mark();
+		(void) dmu_objset_find(name, zvol_create_minors_cb,
+		    &minors_list, DS_FIND_CHILDREN);
+		spl_fstrans_unmark(cookie);
+	}
+
+	taskq_wait_outstanding(system_taskq, 0);
+
+	/*
+	 * Prefetch is completed, we can do zvol_create_minor_impl
+	 * sequentially.
+	 */
+	while ((job = list_head(&minors_list)) != NULL) {
+		list_remove(&minors_list, job);
+		if (!job->error)
+			(void) ops->zv_create_minor(job->name);
+		kmem_strfree(job->name);
+		kmem_free(job, sizeof (minors_job_t));
+	}
+
+	list_destroy(&minors_list);
+}
+
+void
+zvol_create_minor(const char *name)
+{
+	/*
+	 * Note: the dsl_pool_config_lock must not be held.
+	 * Minor node creation needs to obtain the zvol_state_lock.
+	 * zvol_open() obtains the zvol_state_lock and then the dsl pool
+	 * config lock. Therefore, we can't have the config lock now if
+	 * we are going to wait for the zvol_state_lock, because it
+	 * would be a lock order inversion which could lead to deadlock.
+ */ + + if (zvol_inhibit_dev) + return; + + if (strchr(name, '@') != NULL) { + uint64_t snapdev; + + int error = dsl_prop_get_integer(name, + "snapdev", &snapdev, NULL); + + if (error == 0 && snapdev == ZFS_SNAPDEV_VISIBLE) + (void) ops->zv_create_minor(name); + } else { + (void) ops->zv_create_minor(name); + } +} + +/* + * Remove minors for specified dataset including children and snapshots. + */ + +void +zvol_remove_minors_impl(const char *name) +{ + zvol_state_t *zv, *zv_next; + int namelen = ((name) ? strlen(name) : 0); + taskqid_t t; + list_t free_list; + + if (zvol_inhibit_dev) + return; + + list_create(&free_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_next)); + + rw_enter(&zvol_state_lock, RW_WRITER); + + for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&zvol_state_list, zv); + + mutex_enter(&zv->zv_state_lock); + if (name == NULL || strcmp(zv->zv_name, name) == 0 || + (strncmp(zv->zv_name, name, namelen) == 0 && + (zv->zv_name[namelen] == '/' || + zv->zv_name[namelen] == '@'))) { + /* + * By holding zv_state_lock here, we guarantee that no + * one is currently using this zv + */ + + /* If in use, leave alone */ + if (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) { + mutex_exit(&zv->zv_state_lock); + continue; + } + + zvol_remove(zv); + + /* + * Cleared while holding zvol_state_lock as a writer + * which will prevent zvol_open() from opening it. + */ + ops->zv_clear_private(zv); + + /* Drop zv_state_lock before zvol_free() */ + mutex_exit(&zv->zv_state_lock); + + /* Try parallel zv_free, if failed do it in place */ + t = taskq_dispatch(system_taskq, + (task_func_t *)ops->zv_free, zv, TQ_SLEEP); + if (t == TASKQID_INVALID) + list_insert_head(&free_list, zv); + } else { + mutex_exit(&zv->zv_state_lock); + } + } + rw_exit(&zvol_state_lock); + + /* Drop zvol_state_lock before calling zvol_free() */ + while ((zv = list_head(&free_list)) != NULL) { + list_remove(&free_list, zv); + ops->zv_free(zv); + } +} + +/* Remove minor for this specific volume only */ +static void +zvol_remove_minor_impl(const char *name) +{ + zvol_state_t *zv = NULL, *zv_next; + + if (zvol_inhibit_dev) + return; + + rw_enter(&zvol_state_lock, RW_WRITER); + + for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) { + zv_next = list_next(&zvol_state_list, zv); + + mutex_enter(&zv->zv_state_lock); + if (strcmp(zv->zv_name, name) == 0) { + /* + * By holding zv_state_lock here, we guarantee that no + * one is currently using this zv + */ + + /* If in use, leave alone */ + if (zv->zv_open_count > 0 || + atomic_read(&zv->zv_suspend_ref)) { + mutex_exit(&zv->zv_state_lock); + continue; + } + zvol_remove(zv); + + ops->zv_clear_private(zv); + mutex_exit(&zv->zv_state_lock); + break; + } else { + mutex_exit(&zv->zv_state_lock); + } + } + + /* Drop zvol_state_lock before calling zvol_free() */ + rw_exit(&zvol_state_lock); + + if (zv != NULL) + ops->zv_free(zv); +} + +/* + * Rename minors for specified dataset including children and snapshots. 
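+ * For example, renaming "pool/ds" to "pool/ds2" also renames the minors
+ * for "pool/ds/vol" and "pool/ds/vol@snap" to "pool/ds2/vol" and
+ * "pool/ds2/vol@snap", respectively.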
+ */
+static void
+zvol_rename_minors_impl(const char *oldname, const char *newname)
+{
+	zvol_state_t *zv, *zv_next;
+	int oldnamelen, newnamelen;
+
+	if (zvol_inhibit_dev)
+		return;
+
+	oldnamelen = strlen(oldname);
+	newnamelen = strlen(newname);
+
+	rw_enter(&zvol_state_lock, RW_READER);
+
+	for (zv = list_head(&zvol_state_list); zv != NULL; zv = zv_next) {
+		zv_next = list_next(&zvol_state_list, zv);
+
+		mutex_enter(&zv->zv_state_lock);
+
+		if (strcmp(zv->zv_name, oldname) == 0) {
+			ops->zv_rename_minor(zv, newname);
+		} else if (strncmp(zv->zv_name, oldname, oldnamelen) == 0 &&
+		    (zv->zv_name[oldnamelen] == '/' ||
+		    zv->zv_name[oldnamelen] == '@')) {
+			char *name = kmem_asprintf("%s%c%s", newname,
+			    zv->zv_name[oldnamelen],
+			    zv->zv_name + oldnamelen + 1);
+			ops->zv_rename_minor(zv, name);
+			kmem_strfree(name);
+		}
+
+		mutex_exit(&zv->zv_state_lock);
+	}
+
+	rw_exit(&zvol_state_lock);
+}
+
+typedef struct zvol_snapdev_cb_arg {
+	uint64_t snapdev;
+} zvol_snapdev_cb_arg_t;
+
+static int
+zvol_set_snapdev_cb(const char *dsname, void *param)
+{
+	zvol_snapdev_cb_arg_t *arg = param;
+
+	if (strchr(dsname, '@') == NULL)
+		return (0);
+
+	switch (arg->snapdev) {
+	case ZFS_SNAPDEV_VISIBLE:
+		(void) ops->zv_create_minor(dsname);
+		break;
+	case ZFS_SNAPDEV_HIDDEN:
+		(void) zvol_remove_minor_impl(dsname);
+		break;
+	}
+
+	return (0);
+}
+
+static void
+zvol_set_snapdev_impl(char *name, uint64_t snapdev)
+{
+	zvol_snapdev_cb_arg_t arg = {snapdev};
+	fstrans_cookie_t cookie = spl_fstrans_mark();
+	/*
+	 * The zvol_set_snapdev_sync() sets snapdev appropriately
+	 * in the dataset hierarchy. Here, we only scan snapshots.
+	 */
+	dmu_objset_find(name, zvol_set_snapdev_cb, &arg, DS_FIND_SNAPSHOTS);
+	spl_fstrans_unmark(cookie);
+}
+
+typedef struct zvol_volmode_cb_arg {
+	uint64_t volmode;
+} zvol_volmode_cb_arg_t;
+
+static void
+zvol_set_volmode_impl(char *name, uint64_t volmode)
+{
+	fstrans_cookie_t cookie;
+
+	if (strchr(name, '@') != NULL)
+		return;
+
+	cookie = spl_fstrans_mark();
+
+	/*
+	 * It's unfortunate we need to remove minors before we create new ones:
+	 * this is necessary because our backing gendisk (zvol_state->zv_disk)
+	 * could be different when we set, for instance, volmode from "geom"
+	 * to "dev" (or vice versa).
+	 * A possible optimization is to modify our consumers so we don't get
+	 * called when "volmode" does not change.
+	 */
+	switch (volmode) {
+	case ZFS_VOLMODE_NONE:
+		(void) zvol_remove_minor_impl(name);
+		break;
+	case ZFS_VOLMODE_GEOM:
+	case ZFS_VOLMODE_DEV:
+		(void) zvol_remove_minor_impl(name);
+		(void) ops->zv_create_minor(name);
+		break;
+	case ZFS_VOLMODE_DEFAULT:
+		(void) zvol_remove_minor_impl(name);
+		if (zvol_volmode == ZFS_VOLMODE_NONE)
+			break;
+		else /* if zvol_volmode is invalid, default to "geom" */
+			(void) ops->zv_create_minor(name);
+		break;
+	}
+
+	spl_fstrans_unmark(cookie);
+}
+
+static zvol_task_t *
+zvol_task_alloc(zvol_async_op_t op, const char *name1, const char *name2,
+    uint64_t value)
+{
+	zvol_task_t *task;
+	char *delim;
+
+	/* Never allow tasks on hidden names. */
+	if (name1[0] == '$')
+		return (NULL);
+
+	task = kmem_zalloc(sizeof (zvol_task_t), KM_SLEEP);
+	task->op = op;
+	task->value = value;
+	delim = strchr(name1, '/');
+	strlcpy(task->pool, name1, delim ? (delim - name1 + 1) : MAXNAMELEN);
+
+	strlcpy(task->name1, name1, MAXNAMELEN);
+	if (name2 != NULL)
+		strlcpy(task->name2, name2, MAXNAMELEN);
+
+	return (task);
+}
+
+static void
+zvol_task_free(zvol_task_t *task)
+{
+	kmem_free(task, sizeof (zvol_task_t));
+}
+
+/*
+ * The worker thread function, invoked asynchronously from the taskq.
+ */
+static void
+zvol_task_cb(void *arg)
+{
+	zvol_task_t *task = arg;
+
+	switch (task->op) {
+	case ZVOL_ASYNC_REMOVE_MINORS:
+		zvol_remove_minors_impl(task->name1);
+		break;
+	case ZVOL_ASYNC_RENAME_MINORS:
+		zvol_rename_minors_impl(task->name1, task->name2);
+		break;
+	case ZVOL_ASYNC_SET_SNAPDEV:
+		zvol_set_snapdev_impl(task->name1, task->value);
+		break;
+	case ZVOL_ASYNC_SET_VOLMODE:
+		zvol_set_volmode_impl(task->name1, task->value);
+		break;
+	default:
+		VERIFY(0);
+		break;
+	}
+
+	zvol_task_free(task);
+}
+
+typedef struct zvol_set_prop_int_arg {
+	const char *zsda_name;
+	uint64_t zsda_value;
+	zprop_source_t zsda_source;
+	dmu_tx_t *zsda_tx;
+} zvol_set_prop_int_arg_t;
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
+ */
+static int
+zvol_set_snapdev_check(void *arg, dmu_tx_t *tx)
+{
+	zvol_set_prop_int_arg_t *zsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *dd;
+	int error;
+
+	error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL);
+	if (error != 0)
+		return (error);
+
+	dsl_dir_rele(dd, FTAG);
+
+	return (error);
+}
+
+/* ARGSUSED */
+static int
+zvol_set_snapdev_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
+{
+	char dsname[MAXNAMELEN];
+	zvol_task_t *task;
+	uint64_t snapdev;
+
+	dsl_dataset_name(ds, dsname);
+	if (dsl_prop_get_int_ds(ds, "snapdev", &snapdev) != 0)
+		return (0);
+	task = zvol_task_alloc(ZVOL_ASYNC_SET_SNAPDEV, dsname, NULL, snapdev);
+	if (task == NULL)
+		return (0);
+
+	(void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb,
+	    task, TQ_SLEEP);
+	return (0);
+}
+
+/*
+ * Traverse all child datasets and apply snapdev appropriately.
+ * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel
+ * dataset and read the effective "snapdev" on every child in the callback
+ * function: this is because the value is not guaranteed to be the same in the
+ * whole dataset hierarchy.
+ */
+static void
+zvol_set_snapdev_sync(void *arg, dmu_tx_t *tx)
+{
+	zvol_set_prop_int_arg_t *zsda = arg;
+	dsl_pool_t *dp = dmu_tx_pool(tx);
+	dsl_dir_t *dd;
+	dsl_dataset_t *ds;
+	int error;
+
+	VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL));
+	zsda->zsda_tx = tx;
+
+	error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds);
+	if (error == 0) {
+		dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_SNAPDEV),
+		    zsda->zsda_source, sizeof (zsda->zsda_value), 1,
+		    &zsda->zsda_value, zsda->zsda_tx);
+		dsl_dataset_rele(ds, FTAG);
+	}
+	dmu_objset_find_dp(dp, dd->dd_object, zvol_set_snapdev_sync_cb,
+	    zsda, DS_FIND_CHILDREN);
+
+	dsl_dir_rele(dd, FTAG);
+}
+
+int
+zvol_set_snapdev(const char *ddname, zprop_source_t source, uint64_t snapdev)
+{
+	zvol_set_prop_int_arg_t zsda;
+
+	zsda.zsda_name = ddname;
+	zsda.zsda_source = source;
+	zsda.zsda_value = snapdev;
+
+	return (dsl_sync_task(ddname, zvol_set_snapdev_check,
+	    zvol_set_snapdev_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE));
+}
+
+/*
+ * Sanity check the dataset for safe use by the sync task. No additional
+ * conditions are imposed.
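+ * (Only the existence of the dataset's dsl_dir is verified.)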
+ */ +static int +zvol_set_volmode_check(void *arg, dmu_tx_t *tx) +{ + zvol_set_prop_int_arg_t *zsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd; + int error; + + error = dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL); + if (error != 0) + return (error); + + dsl_dir_rele(dd, FTAG); + + return (error); +} + +/* ARGSUSED */ +static int +zvol_set_volmode_sync_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg) +{ + char dsname[MAXNAMELEN]; + zvol_task_t *task; + uint64_t volmode; + + dsl_dataset_name(ds, dsname); + if (dsl_prop_get_int_ds(ds, "volmode", &volmode) != 0) + return (0); + task = zvol_task_alloc(ZVOL_ASYNC_SET_VOLMODE, dsname, NULL, volmode); + if (task == NULL) + return (0); + + (void) taskq_dispatch(dp->dp_spa->spa_zvol_taskq, zvol_task_cb, + task, TQ_SLEEP); + return (0); +} + +/* + * Traverse all child datasets and apply volmode appropriately. + * We call dsl_prop_set_sync_impl() here to set the value only on the toplevel + * dataset and read the effective "volmode" on every child in the callback + * function: this is because the value is not guaranteed to be the same in the + * whole dataset hierarchy. + */ +static void +zvol_set_volmode_sync(void *arg, dmu_tx_t *tx) +{ + zvol_set_prop_int_arg_t *zsda = arg; + dsl_pool_t *dp = dmu_tx_pool(tx); + dsl_dir_t *dd; + dsl_dataset_t *ds; + int error; + + VERIFY0(dsl_dir_hold(dp, zsda->zsda_name, FTAG, &dd, NULL)); + zsda->zsda_tx = tx; + + error = dsl_dataset_hold(dp, zsda->zsda_name, FTAG, &ds); + if (error == 0) { + dsl_prop_set_sync_impl(ds, zfs_prop_to_name(ZFS_PROP_VOLMODE), + zsda->zsda_source, sizeof (zsda->zsda_value), 1, + &zsda->zsda_value, zsda->zsda_tx); + dsl_dataset_rele(ds, FTAG); + } + + dmu_objset_find_dp(dp, dd->dd_object, zvol_set_volmode_sync_cb, + zsda, DS_FIND_CHILDREN); + + dsl_dir_rele(dd, FTAG); +} + +int +zvol_set_volmode(const char *ddname, zprop_source_t source, uint64_t volmode) +{ + zvol_set_prop_int_arg_t zsda; + + zsda.zsda_name = ddname; + zsda.zsda_source = source; + zsda.zsda_value = volmode; + + return (dsl_sync_task(ddname, zvol_set_volmode_check, + zvol_set_volmode_sync, &zsda, 0, ZFS_SPACE_CHECK_NONE)); +} + +void +zvol_remove_minors(spa_t *spa, const char *name, boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_REMOVE_MINORS, name, NULL, ~0ULL); + if (task == NULL) + return; + + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != TASKQID_INVALID)) + taskq_wait_id(spa->spa_zvol_taskq, id); +} + +void +zvol_rename_minors(spa_t *spa, const char *name1, const char *name2, + boolean_t async) +{ + zvol_task_t *task; + taskqid_t id; + + task = zvol_task_alloc(ZVOL_ASYNC_RENAME_MINORS, name1, name2, ~0ULL); + if (task == NULL) + return; + + id = taskq_dispatch(spa->spa_zvol_taskq, zvol_task_cb, task, TQ_SLEEP); + if ((async == B_FALSE) && (id != TASKQID_INVALID)) + taskq_wait_id(spa->spa_zvol_taskq, id); +} + +boolean_t +zvol_is_zvol(const char *name) +{ + + return (ops->zv_is_zvol(name)); +} + +void +zvol_register_ops(const zvol_platform_ops_t *zvol_ops) +{ + ops = zvol_ops; +} + +int +zvol_init_impl(void) +{ + int i; + + list_create(&zvol_state_list, sizeof (zvol_state_t), + offsetof(zvol_state_t, zv_next)); + rw_init(&zvol_state_lock, NULL, RW_DEFAULT, NULL); + + zvol_htable = kmem_alloc(ZVOL_HT_SIZE * sizeof (struct hlist_head), + KM_SLEEP); + for (i = 0; i < ZVOL_HT_SIZE; i++) + INIT_HLIST_HEAD(&zvol_htable[i]); + + return (0); +} + +void +zvol_fini_impl(void) +{ + 
zvol_remove_minors_impl(NULL);
+
+	/*
+	 * The call to "zvol_remove_minors_impl" may dispatch entries to
+	 * the system_taskq, but it doesn't wait for those entries to
+	 * complete before it returns. Thus, we must wait for all of the
+	 * removals to finish, before we can continue.
+	 */
+	taskq_wait_outstanding(system_taskq, 0);
+
+	kmem_free(zvol_htable, ZVOL_HT_SIZE * sizeof (struct hlist_head));
+	list_destroy(&zvol_state_list);
+	rw_destroy(&zvol_state_lock);
+}
diff --git a/module/zstd/Makefile.in b/module/zstd/Makefile.in
new file mode 100644
index 000000000000..eea749ea8a5a
--- /dev/null
+++ b/module/zstd/Makefile.in
@@ -0,0 +1,35 @@
+ifneq ($(KBUILD_EXTMOD),)
+src = @abs_srcdir@
+obj = @abs_builddir@
+zstd_include = $(src)/include
+else
+zstd_include = $(srctree)/$(src)/include
+endif
+
+MODULE := zzstd
+
+obj-$(CONFIG_ZFS) := $(MODULE).o
+
+asflags-y := -I$(zstd_include)
+ccflags-y := -I$(zstd_include)
+
+# Zstd uses -O3 by default, so we should follow
+ccflags-y += -O3
+
+# -fno-tree-vectorize gets set for gcc in zstd/common/compiler.h
+# Set it for other compilers, too.
+$(obj)/lib/zstd.o: c_flags += -fno-tree-vectorize
+
+# Quiet warnings about frame size due to unused code in unmodified zstd lib
+$(obj)/lib/zstd.o: c_flags += -Wframe-larger-than=20480
+
+# Disable aarch64 neon SIMD instructions for kernel mode
+$(obj)/lib/zstd.o: c_flags += -include $(zstd_include)/aarch64_compat.h -include $(zstd_include)/zstd_compat_wrapper.h -Wp,-w
+
+$(obj)/zfs_zstd.o: c_flags += -include $(zstd_include)/zstd_compat_wrapper.h
+
+$(MODULE)-objs += zfs_zstd.o
+$(MODULE)-objs += lib/zstd.o
+
+all:
+	mkdir -p lib
diff --git a/module/zstd/README.md b/module/zstd/README.md
new file mode 100644
index 000000000000..f8e127736aac
--- /dev/null
+++ b/module/zstd/README.md
@@ -0,0 +1,65 @@
+# ZSTD-On-ZFS Library Manual
+
+## Introduction
+
+This subtree contains the ZSTD library used in ZFS. It is heavily cut down by
+dropping any unneeded files, and combined into a single file, but otherwise is
+intentionally unmodified. Please do not alter the file containing the zstd
+library, besides upgrading to a newer ZSTD release.
+
+Tree structure:
+
+* `zfs_zstd.c` is the actual `zzstd` kernel module.
+* `lib/` contains the unmodified, [_"amalgamated"_](https://github.com/facebook/zstd/blob/dev/contrib/single_file_libs/README.md)
+  version of the `Zstandard` library, generated from our template file.
+* `zstd-in.c` is our template file for generating the library.
+* `include/`: This directory contains supplemental includes for platform
+  compatibility, which are not expected to be used by ZFS elsewhere in the
+  future. Thus we keep them private to ZSTD.
+
+## Updating ZSTD
+
+To update ZSTD the following steps need to be taken:
+
+1. Grab the latest release of [ZSTD](https://github.com/facebook/zstd/releases).
+2. Update `module/zstd/zstd-in.c` if required. (see
+   `zstd/contrib/single_file_libs/zstd-in.c` in the zstd repository)
+3. Generate the "single-file-library" and put it into `module/zstd/lib/`.
+4. Copy the following files to `module/zstd/lib/`:
+   - `zstd/lib/zstd.h`
+   - `zstd/lib/common/zstd_errors.h`
+
+This can be done using a few shell commands from inside the zfs repo:
+
+~~~sh
+cd PATH/TO/ZFS
+
+url="https://github.com/facebook/zstd"
+release="$(curl -s "${url}"/releases/latest | grep -oP '(?<=v)[\d\.]+')"
+zstd="/tmp/zstd-${release}/"
+
+wget -O /tmp/zstd.tar.gz \
+    "${url}/releases/download/v${release}/zstd-${release}.tar.gz"
+tar -C /tmp -xzf /tmp/zstd.tar.gz
+
+cp ${zstd}/lib/zstd.h module/zstd/lib/
+cp ${zstd}/lib/common/zstd_errors.h module/zstd/lib/
+${zstd}/contrib/single_file_libs/combine.sh \
+    -r ${zstd}/lib -o module/zstd/lib/zstd.c module/zstd/zstd-in.c
+~~~
+
+Note: if the zstd library for zfs is updated to a newer version, the macro
+list in `include/zstd_compat_wrapper.h` usually needs to be updated. This can
+be done with some hand crafting of the output of the following script:
+`nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable`
+
+
+## Altering ZSTD and breaking changes
+
+If upstream ZSTD makes changes that break compatibility, or you need to make
+breaking changes to the way we handle ZSTD, backwards compatibility must
+still be maintained.
+
+We already save the ZSTD version number within the block header so it can be
+used for future compatibility checks and/or fixes. However, it is currently
+not actually used in such a way.
diff --git a/module/zstd/include/aarch64_compat.h b/module/zstd/include/aarch64_compat.h
new file mode 100644
index 000000000000..088517d3d23b
--- /dev/null
+++ b/module/zstd/include/aarch64_compat.h
@@ -0,0 +1,37 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */ + +/* + * Copyright (c) 2018-2020, Sebastian Gottschall + */ + +#ifdef _KERNEL +#undef __aarch64__ +#endif diff --git a/module/zstd/include/limits.h b/module/zstd/include/limits.h new file mode 100644 index 000000000000..3bf5b67765ae --- /dev/null +++ b/module/zstd/include/limits.h @@ -0,0 +1,63 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_LIMITS_H +#define _ZSTD_LIMITS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include +#elif defined(__linux__) +#include +#include +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_LIMITS_H */ diff --git a/module/zstd/include/stddef.h b/module/zstd/include/stddef.h new file mode 100644 index 000000000000..3f46fb8b033e --- /dev/null +++ b/module/zstd/include/stddef.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDDEF_H +#define _ZSTD_STDDEF_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include +#elif defined(__linux__) +#include +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDDEF_H */ diff --git a/module/zstd/include/stdint.h b/module/zstd/include/stdint.h new file mode 100644 index 000000000000..2d98a556c23e --- /dev/null +++ b/module/zstd/include/stdint.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDINT_H +#define _ZSTD_STDINT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifdef _KERNEL + +#if defined(__FreeBSD__) +#include +#elif defined(__linux__) +#include +#else +#error "Unsupported platform" +#endif + +#else /* !_KERNEL */ +#include_next +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDINT_H */ diff --git a/module/zstd/include/stdio.h b/module/zstd/include/stdio.h new file mode 100644 index 000000000000..5a7c6ec69916 --- /dev/null +++ b/module/zstd/include/stdio.h @@ -0,0 +1,54 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDIO_H +#define _ZSTD_STDIO_H + +#ifdef __cplusplus +extern "C" { +#endif + +#ifndef _KERNEL + +#include_next + +#endif /* _KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDIO_H */ diff --git a/module/zstd/include/stdlib.h b/module/zstd/include/stdlib.h new file mode 100644 index 000000000000..c341a0c84884 --- /dev/null +++ b/module/zstd/include/stdlib.h @@ -0,0 +1,58 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2014-2019, Allan Jude + * Copyright (c) 2020, Brian Behlendorf + * Copyright (c) 2020, Michael Niewöhner + */ + +#ifndef _ZSTD_STDLIB_H +#define _ZSTD_STDLIB_H + +#ifdef __cplusplus +extern "C" { +#endif + +#undef GCC_VERSION + +/* + * Define calloc, malloc, free to make building work. They are never really used + * in zstdlib.c since allocation is done in zstd.c. + */ +#define calloc(n, sz) NULL +#define malloc(sz) NULL +#define free(ptr) + +#ifdef __cplusplus +} +#endif + +#endif /* _ZSTD_STDLIB_H */ diff --git a/module/zstd/include/string.h b/module/zstd/include/string.h new file mode 100644 index 000000000000..78998d3c4655 --- /dev/null +++ b/module/zstd/include/string.h @@ -0,0 +1,62 @@ +/* + * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html) + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. 
+ */
+
+/*
+ * Copyright (c) 2014-2019, Allan Jude
+ * Copyright (c) 2020, Brian Behlendorf
+ * Copyright (c) 2020, Michael Niewöhner
+ */
+
+#ifndef _ZSTD_STRING_H
+#define _ZSTD_STRING_H
+
+#ifdef __cplusplus
extern "C" {
+#endif
+
+#ifdef _KERNEL
+
+#if defined(__FreeBSD__)
+#include <sys/systm.h>	/* memcpy, memset */
+#elif defined(__linux__)
+#include <linux/string.h>	/* memcpy, memset */
+#else
+#error "Unsupported platform"
+#endif
+
+#else /* !_KERNEL */
+#include_next <string.h>
+#endif /* _KERNEL */
+
+#ifdef __cplusplus
}
+#endif
+
+#endif /* _ZSTD_STRING_H */
diff --git a/module/zstd/include/zstd_compat_wrapper.h b/module/zstd/include/zstd_compat_wrapper.h
new file mode 100644
index 000000000000..5cca517b5508
--- /dev/null
+++ b/module/zstd/include/zstd_compat_wrapper.h
@@ -0,0 +1,460 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2020, Sebastian Gottschall
+ */
+
+/*
+ * This wrapper fixes a problem that arises when the ZFS filesystem driver is
+ * compiled statically into the kernel: that causes a symbol collision with
+ * the older in-kernel zstd library. The following macros simply rename all
+ * local zstd symbols and references.
+ *
+ * Note: if the zstd library for zfs is updated to a newer version, this macro
+ * list usually needs to be updated;
+ * this can be done with some hand crafting of the output of the following + * script + * nm zstd.o | awk '{print "#define "$3 " zfs_" $3}' > macrotable + */ + +#define BIT_initDStream zfs_BIT_initDStream +#define BIT_mask zfs_BIT_mask +#define BIT_reloadDStream zfs_BIT_reloadDStream +#define ERR_getErrorString zfs_ERR_getErrorString +#define FSE_NCountWriteBound zfs_FSE_NCountWriteBound +#define FSE_buildCTable zfs_FSE_buildCTable +#define FSE_buildCTable_raw zfs_FSE_buildCTable_raw +#define FSE_buildCTable_rle zfs_FSE_buildCTable_rle +#define FSE_buildCTable_wksp zfs_FSE_buildCTable_wksp +#define FSE_buildDTable zfs_FSE_buildDTable +#define FSE_buildDTable_raw zfs_FSE_buildDTable_raw +#define FSE_buildDTable_rle zfs_FSE_buildDTable_rle +#define FSE_compress zfs_FSE_compress +#define FSE_compress2 zfs_FSE_compress2 +#define FSE_compressBound zfs_FSE_compressBound +#define FSE_compress_usingCTable zfs_FSE_compress_usingCTable +#define FSE_compress_usingCTable_generic zfs_FSE_compress_usingCTable_generic +#define FSE_compress_wksp zfs_FSE_compress_wksp +#define FSE_createCTable zfs_FSE_createCTable +#define FSE_createDTable zfs_FSE_createDTable +#define FSE_decompress zfs_FSE_decompress +#define FSE_decompress_usingDTable zfs_FSE_decompress_usingDTable +#define FSE_decompress_wksp zfs_FSE_decompress_wksp +#define FSE_freeCTable zfs_FSE_freeCTable +#define FSE_freeDTable zfs_FSE_freeDTable +#define FSE_getErrorName zfs_FSE_getErrorName +#define FSE_normalizeCount zfs_FSE_normalizeCount +#define FSE_optimalTableLog zfs_FSE_optimalTableLog +#define FSE_optimalTableLog_internal zfs_FSE_optimalTableLog_internal +#define FSE_readNCount zfs_FSE_readNCount +#define FSE_versionNumber zfs_FSE_versionNumber +#define FSE_writeNCount zfs_FSE_writeNCount +#define HIST_count zfs_HIST_count +#define HIST_countFast zfs_HIST_countFast +#define HIST_countFast_wksp zfs_HIST_countFast_wksp +#define HIST_count_parallel_wksp zfs_HIST_count_parallel_wksp +#define HIST_count_simple zfs_HIST_count_simple +#define HIST_count_wksp zfs_HIST_count_wksp +#define HUF_buildCTable zfs_HUF_buildCTable +#define HUF_buildCTable_wksp zfs_HUF_buildCTable_wksp +#define HUF_compress zfs_HUF_compress +#define HUF_compress1X zfs_HUF_compress1X +#define HUF_compress1X_repeat zfs_HUF_compress1X_repeat +#define HUF_compress1X_usingCTable zfs_HUF_compress1X_usingCTable +#define HUF_compress1X_wksp zfs_HUF_compress1X_wksp +#define HUF_compress2 zfs_HUF_compress2 +#define HUF_compress4X_repeat zfs_HUF_compress4X_repeat +#define HUF_compress4X_usingCTable zfs_HUF_compress4X_usingCTable +#define HUF_compress4X_wksp zfs_HUF_compress4X_wksp +#define HUF_compressBound zfs_HUF_compressBound +#define HUF_compressWeights zfs_HUF_compressWeights +#define HUF_decompress zfs_HUF_decompress +#define HUF_decompress1X1 zfs_HUF_decompress1X1 +#define HUF_decompress1X1_DCtx zfs_HUF_decompress1X1_DCtx +#define HUF_decompress1X1_DCtx_wksp zfs_HUF_decompress1X1_DCtx_wksp +#define HUF_decompress1X1_DCtx_wksp_bmi2 zfs_HUF_decompress1X1_DCtx_wksp_bmi2 +#define HUF_decompress1X1_usingDTable zfs_HUF_decompress1X1_usingDTable +#define HUF_decompress1X2 zfs_HUF_decompress1X2 +#define HUF_decompress1X2_DCtx zfs_HUF_decompress1X2_DCtx +#define HUF_decompress1X2_DCtx_wksp zfs_HUF_decompress1X2_DCtx_wksp +#define HUF_decompress1X2_usingDTable zfs_HUF_decompress1X2_usingDTable +#define HUF_decompress1X_DCtx zfs_HUF_decompress1X_DCtx +#define HUF_decompress1X_DCtx_wksp zfs_HUF_decompress1X_DCtx_wksp +#define HUF_decompress1X_usingDTable zfs_HUF_decompress1X_usingDTable 
+#define HUF_decompress1X_usingDTable_bmi2 zfs_HUF_decompress1X_usingDTable_bmi2 +#define HUF_decompress4X1 zfs_HUF_decompress4X1 +#define HUF_decompress4X1_DCtx zfs_HUF_decompress4X1_DCtx +#define HUF_decompress4X1_DCtx_wksp zfs_HUF_decompress4X1_DCtx_wksp +#define HUF_decompress4X1_usingDTable zfs_HUF_decompress4X1_usingDTable +#define HUF_decompress4X2 zfs_HUF_decompress4X2 +#define HUF_decompress4X2_DCtx zfs_HUF_decompress4X2_DCtx +#define HUF_decompress4X2_DCtx_wksp zfs_HUF_decompress4X2_DCtx_wksp +#define HUF_decompress4X2_usingDTable zfs_HUF_decompress4X2_usingDTable +#define HUF_decompress4X_DCtx zfs_HUF_decompress4X_DCtx +#define HUF_decompress4X_hufOnly zfs_HUF_decompress4X_hufOnly +#define HUF_decompress4X_hufOnly_wksp zfs_HUF_decompress4X_hufOnly_wksp +#define HUF_decompress4X_hufOnly_wksp_bmi2 \ + zfs_HUF_decompress4X_hufOnly_wksp_bmi2 +#define HUF_decompress4X_usingDTable zfs_HUF_decompress4X_usingDTable +#define HUF_decompress4X_usingDTable_bmi2 zfs_HUF_decompress4X_usingDTable_bmi2 +#define HUF_estimateCompressedSize zfs_HUF_estimateCompressedSize +#define HUF_fillDTableX2Level2 zfs_HUF_fillDTableX2Level2 +#define HUF_getErrorName zfs_HUF_getErrorName +#define HUF_getNbBits zfs_HUF_getNbBits +#define HUF_optimalTableLog zfs_HUF_optimalTableLog +#define HUF_readCTable zfs_HUF_readCTable +#define HUF_readDTableX1 zfs_HUF_readDTableX1 +#define HUF_readDTableX1_wksp zfs_HUF_readDTableX1_wksp +#define HUF_readDTableX2 zfs_HUF_readDTableX2 +#define HUF_readDTableX2_wksp zfs_HUF_readDTableX2_wksp +#define HUF_readStats zfs_HUF_readStats +#define HUF_selectDecoder zfs_HUF_selectDecoder +#define HUF_setMaxHeight zfs_HUF_setMaxHeight +#define HUF_validateCTable zfs_HUF_validateCTable +#define HUF_writeCTable zfs_HUF_writeCTable +#define LL_base zfs_LL_base +#define LL_bits zfs_LL_bits +#define LL_defaultDTable zfs_LL_defaultDTable +#define LL_defaultNorm zfs_LL_defaultNorm +#define ML_base zfs_ML_base +#define ML_bits zfs_ML_bits +#define ML_defaultDTable zfs_ML_defaultDTable +#define ML_defaultNorm zfs_ML_defaultNorm +#define OF_base zfs_OF_base +#define OF_bits zfs_OF_bits +#define OF_defaultDTable zfs_OF_defaultDTable +#define OF_defaultNorm zfs_OF_defaultNorm +#define POOL_add zfs_POOL_add +#define POOL_create zfs_POOL_create +#define POOL_create_advanced zfs_POOL_create_advanced +#define POOL_free zfs_POOL_free +#define POOL_resize zfs_POOL_resize +#define POOL_sizeof zfs_POOL_sizeof +#define POOL_tryAdd zfs_POOL_tryAdd +#define ZSTD_CCtxParams_getParameter zfs_ZSTD_CCtxParams_getParameter +#define ZSTD_CCtxParams_init zfs_ZSTD_CCtxParams_init +#define ZSTD_CCtxParams_init_advanced zfs_ZSTD_CCtxParams_init_advanced +#define ZSTD_CCtxParams_reset zfs_ZSTD_CCtxParams_reset +#define ZSTD_CCtxParams_setParameter zfs_ZSTD_CCtxParams_setParameter +#define ZSTD_CCtx_getParameter zfs_ZSTD_CCtx_getParameter +#define ZSTD_CCtx_loadDictionary zfs_ZSTD_CCtx_loadDictionary +#define ZSTD_CCtx_loadDictionary_advanced zfs_ZSTD_CCtx_loadDictionary_advanced +#define ZSTD_CCtx_loadDictionary_byReference \ + zfs_ZSTD_CCtx_loadDictionary_byReference +#define ZSTD_CCtx_refCDict zfs_ZSTD_CCtx_refCDict +#define ZSTD_CCtx_refPrefix zfs_ZSTD_CCtx_refPrefix +#define ZSTD_CCtx_refPrefix_advanced zfs_ZSTD_CCtx_refPrefix_advanced +#define ZSTD_CCtx_reset zfs_ZSTD_CCtx_reset +#define ZSTD_CCtx_setParameter zfs_ZSTD_CCtx_setParameter +#define ZSTD_CCtx_setParametersUsingCCtxParams \ + zfs_ZSTD_CCtx_setParametersUsingCCtxParams +#define ZSTD_CCtx_setPledgedSrcSize zfs_ZSTD_CCtx_setPledgedSrcSize +#define 
ZSTD_CStreamInSize zfs_ZSTD_CStreamInSize +#define ZSTD_CStreamOutSize zfs_ZSTD_CStreamOutSize +#define ZSTD_DCtx_loadDictionary zfs_ZSTD_DCtx_loadDictionary +#define ZSTD_DCtx_loadDictionary_advanced zfs_ZSTD_DCtx_loadDictionary_advanced +#define ZSTD_DCtx_loadDictionary_byReference \ + zfs_ZSTD_DCtx_loadDictionary_byReference +#define ZSTD_DCtx_refDDict zfs_ZSTD_DCtx_refDDict +#define ZSTD_DCtx_refPrefix zfs_ZSTD_DCtx_refPrefix +#define ZSTD_DCtx_refPrefix_advanced zfs_ZSTD_DCtx_refPrefix_advanced +#define ZSTD_DCtx_reset zfs_ZSTD_DCtx_reset +#define ZSTD_DCtx_setFormat zfs_ZSTD_DCtx_setFormat +#define ZSTD_DCtx_setMaxWindowSize zfs_ZSTD_DCtx_setMaxWindowSize +#define ZSTD_DCtx_setParameter zfs_ZSTD_DCtx_setParameter +#define ZSTD_DDict_dictContent zfs_ZSTD_DDict_dictContent +#define ZSTD_DDict_dictSize zfs_ZSTD_DDict_dictSize +#define ZSTD_DStreamInSize zfs_ZSTD_DStreamInSize +#define ZSTD_DStreamOutSize zfs_ZSTD_DStreamOutSize +#define ZSTD_DUBT_findBestMatch zfs_ZSTD_DUBT_findBestMatch +#define ZSTD_NCountCost zfs_ZSTD_NCountCost +#define ZSTD_XXH64_digest zfs_ZSTD_XXH64_digest +#define ZSTD_adjustCParams zfs_ZSTD_adjustCParams +#define ZSTD_assignParamsToCCtxParams zfs_ZSTD_assignParamsToCCtxParams +#define ZSTD_buildCTable zfs_ZSTD_buildCTable +#define ZSTD_buildFSETable zfs_ZSTD_buildFSETable +#define ZSTD_buildSeqStore zfs_ZSTD_buildSeqStore +#define ZSTD_buildSeqTable zfs_ZSTD_buildSeqTable +#define ZSTD_cParam_getBounds zfs_ZSTD_cParam_getBounds +#define ZSTD_cParam_withinBounds zfs_ZSTD_cParam_withinBounds +#define ZSTD_calloc zfs_ZSTD_calloc +#define ZSTD_checkCParams zfs_ZSTD_checkCParams +#define ZSTD_checkContinuity zfs_ZSTD_checkContinuity +#define ZSTD_compress zfs_ZSTD_compress +#define ZSTD_compress2 zfs_ZSTD_compress2 +#define ZSTD_compressBegin zfs_ZSTD_compressBegin +#define ZSTD_compressBegin_advanced zfs_ZSTD_compressBegin_advanced +#define ZSTD_compressBegin_advanced_internal \ + zfs_ZSTD_compressBegin_advanced_internal +#define ZSTD_compressBegin_usingCDict zfs_ZSTD_compressBegin_usingCDict +#define ZSTD_compressBegin_usingCDict_advanced \ + zfs_ZSTD_compressBegin_usingCDict_advanced +#define ZSTD_compressBegin_usingDict zfs_ZSTD_compressBegin_usingDict +#define ZSTD_compressBlock zfs_ZSTD_compressBlock +#define ZSTD_compressBlock_btlazy2 zfs_ZSTD_compressBlock_btlazy2 +#define ZSTD_compressBlock_btlazy2_dictMatchState \ + zfs_ZSTD_compressBlock_btlazy2_dictMatchState +#define ZSTD_compressBlock_btlazy2_extDict \ + zfs_ZSTD_compressBlock_btlazy2_extDict +#define ZSTD_compressBlock_btopt zfs_ZSTD_compressBlock_btopt +#define ZSTD_compressBlock_btopt_dictMatchState \ + zfs_ZSTD_compressBlock_btopt_dictMatchState +#define ZSTD_compressBlock_btopt_extDict zfs_ZSTD_compressBlock_btopt_extDict +#define ZSTD_compressBlock_btultra zfs_ZSTD_compressBlock_btultra +#define ZSTD_compressBlock_btultra2 zfs_ZSTD_compressBlock_btultra2 +#define ZSTD_compressBlock_btultra_dictMatchState \ + zfs_ZSTD_compressBlock_btultra_dictMatchState +#define ZSTD_compressBlock_btultra_extDict \ + zfs_ZSTD_compressBlock_btultra_extDict +#define ZSTD_compressBlock_doubleFast zfs_ZSTD_compressBlock_doubleFast +#define ZSTD_compressBlock_doubleFast_dictMatchState \ + zfs_ZSTD_compressBlock_doubleFast_dictMatchState +#define ZSTD_compressBlock_doubleFast_extDict \ + zfs_ZSTD_compressBlock_doubleFast_extDict +#define ZSTD_compressBlock_doubleFast_extDict_generic \ + zfs_ZSTD_compressBlock_doubleFast_extDict_generic +#define ZSTD_compressBlock_fast zfs_ZSTD_compressBlock_fast +#define 
ZSTD_compressBlock_fast_dictMatchState \ + zfs_ZSTD_compressBlock_fast_dictMatchState +#define ZSTD_compressBlock_fast_extDict zfs_ZSTD_compressBlock_fast_extDict +#define ZSTD_compressBlock_fast_extDict_generic \ + zfs_ZSTD_compressBlock_fast_extDict_generic +#define ZSTD_compressBlock_greedy zfs_ZSTD_compressBlock_greedy +#define ZSTD_compressBlock_greedy_dictMatchState \ + zfs_ZSTD_compressBlock_greedy_dictMatchState +#define ZSTD_compressBlock_greedy_extDict zfs_ZSTD_compressBlock_greedy_extDict +#define ZSTD_compressBlock_internal zfs_ZSTD_compressBlock_internal +#define ZSTD_compressBlock_lazy zfs_ZSTD_compressBlock_lazy +#define ZSTD_compressBlock_lazy2 zfs_ZSTD_compressBlock_lazy2 +#define ZSTD_compressBlock_lazy2_dictMatchState \ + zfs_ZSTD_compressBlock_lazy2_dictMatchState +#define ZSTD_compressBlock_lazy2_extDict zfs_ZSTD_compressBlock_lazy2_extDict +#define ZSTD_compressBlock_lazy_dictMatchState \ + zfs_ZSTD_compressBlock_lazy_dictMatchState +#define ZSTD_compressBlock_lazy_extDict zfs_ZSTD_compressBlock_lazy_extDict +#define ZSTD_compressBound zfs_ZSTD_compressBound +#define ZSTD_compressCCtx zfs_ZSTD_compressCCtx +#define ZSTD_compressContinue zfs_ZSTD_compressContinue +#define ZSTD_compressContinue_internal zfs_ZSTD_compressContinue_internal +#define ZSTD_compressEnd zfs_ZSTD_compressEnd +#define ZSTD_compressLiterals zfs_ZSTD_compressLiterals +#define ZSTD_compressRleLiteralsBlock zfs_ZSTD_compressRleLiteralsBlock +#define ZSTD_compressStream zfs_ZSTD_compressStream +#define ZSTD_compressStream2 zfs_ZSTD_compressStream2 +#define ZSTD_compressStream2_simpleArgs zfs_ZSTD_compressStream2_simpleArgs +#define ZSTD_compressSuperBlock zfs_ZSTD_compressSuperBlock +#define ZSTD_compress_advanced zfs_ZSTD_compress_advanced +#define ZSTD_compress_advanced_internal zfs_ZSTD_compress_advanced_internal +#define ZSTD_compress_internal zfs_ZSTD_compress_internal +#define ZSTD_compress_usingCDict zfs_ZSTD_compress_usingCDict +#define ZSTD_compress_usingCDict_advanced zfs_ZSTD_compress_usingCDict_advanced +#define ZSTD_compress_usingDict zfs_ZSTD_compress_usingDict +#define ZSTD_copyCCtx zfs_ZSTD_copyCCtx +#define ZSTD_copyDCtx zfs_ZSTD_copyDCtx +#define ZSTD_copyDDictParameters zfs_ZSTD_copyDDictParameters +#define ZSTD_count zfs_ZSTD_count +#define ZSTD_count_2segments zfs_ZSTD_count_2segments +#define ZSTD_createCCtx zfs_ZSTD_createCCtx +#define ZSTD_createCCtxParams zfs_ZSTD_createCCtxParams +#define ZSTD_createCCtx_advanced zfs_ZSTD_createCCtx_advanced +#define ZSTD_createCDict zfs_ZSTD_createCDict +#define ZSTD_createCDict_advanced zfs_ZSTD_createCDict_advanced +#define ZSTD_createCDict_byReference zfs_ZSTD_createCDict_byReference +#define ZSTD_createCStream zfs_ZSTD_createCStream +#define ZSTD_createCStream_advanced zfs_ZSTD_createCStream_advanced +#define ZSTD_createDCtx zfs_ZSTD_createDCtx +#define ZSTD_createDCtx_advanced zfs_ZSTD_createDCtx_advanced +#define ZSTD_createDDict zfs_ZSTD_createDDict +#define ZSTD_createDDict_advanced zfs_ZSTD_createDDict_advanced +#define ZSTD_createDDict_byReference zfs_ZSTD_createDDict_byReference +#define ZSTD_createDStream zfs_ZSTD_createDStream +#define ZSTD_createDStream_advanced zfs_ZSTD_createDStream_advanced +#define ZSTD_crossEntropyCost zfs_ZSTD_crossEntropyCost +#define ZSTD_cycleLog zfs_ZSTD_cycleLog +#define ZSTD_dParam_getBounds zfs_ZSTD_dParam_getBounds +#define ZSTD_decodeLiteralsBlock zfs_ZSTD_decodeLiteralsBlock +#define ZSTD_decodeSeqHeaders zfs_ZSTD_decodeSeqHeaders +#define ZSTD_decodingBufferSize_min 
zfs_ZSTD_decodingBufferSize_min +#define ZSTD_decompress zfs_ZSTD_decompress +#define ZSTD_decompressBegin zfs_ZSTD_decompressBegin +#define ZSTD_decompressBegin_usingDDict zfs_ZSTD_decompressBegin_usingDDict +#define ZSTD_decompressBegin_usingDict zfs_ZSTD_decompressBegin_usingDict +#define ZSTD_decompressBlock zfs_ZSTD_decompressBlock +#define ZSTD_decompressBlock_internal zfs_ZSTD_decompressBlock_internal +#define ZSTD_decompressBound zfs_ZSTD_decompressBound +#define ZSTD_decompressContinue zfs_ZSTD_decompressContinue +#define ZSTD_decompressContinueStream zfs_ZSTD_decompressContinueStream +#define ZSTD_decompressDCtx zfs_ZSTD_decompressDCtx +#define ZSTD_decompressMultiFrame zfs_ZSTD_decompressMultiFrame +#define ZSTD_decompressStream zfs_ZSTD_decompressStream +#define ZSTD_decompressStream_simpleArgs zfs_ZSTD_decompressStream_simpleArgs +#define ZSTD_decompress_usingDDict zfs_ZSTD_decompress_usingDDict +#define ZSTD_decompress_usingDict zfs_ZSTD_decompress_usingDict +#define ZSTD_defaultCParameters zfs_ZSTD_defaultCParameters +#define ZSTD_did_fieldSize zfs_ZSTD_did_fieldSize +#define ZSTD_encodeSequences zfs_ZSTD_encodeSequences +#define ZSTD_encodeSequences_default zfs_ZSTD_encodeSequences_default +#define ZSTD_endStream zfs_ZSTD_endStream +#define ZSTD_estimateCCtxSize zfs_ZSTD_estimateCCtxSize +#define ZSTD_estimateCCtxSize_usingCCtxParams \ + zfs_ZSTD_estimateCCtxSize_usingCCtxParams +#define ZSTD_estimateCCtxSize_usingCParams \ + zfs_ZSTD_estimateCCtxSize_usingCParams +#define ZSTD_estimateCDictSize zfs_ZSTD_estimateCDictSize +#define ZSTD_estimateCDictSize_advanced zfs_ZSTD_estimateCDictSize_advanced +#define ZSTD_estimateCStreamSize zfs_ZSTD_estimateCStreamSize +#define ZSTD_estimateCStreamSize_usingCCtxParams \ + zfs_ZSTD_estimateCStreamSize_usingCCtxParams +#define ZSTD_estimateCStreamSize_usingCParams \ + zfs_ZSTD_estimateCStreamSize_usingCParams +#define ZSTD_estimateDCtxSize zfs_ZSTD_estimateDCtxSize +#define ZSTD_estimateDDictSize zfs_ZSTD_estimateDDictSize +#define ZSTD_estimateDStreamSize zfs_ZSTD_estimateDStreamSize +#define ZSTD_estimateDStreamSize_fromFrame \ + zfs_ZSTD_estimateDStreamSize_fromFrame +#define ZSTD_fcs_fieldSize zfs_ZSTD_fcs_fieldSize +#define ZSTD_fillDoubleHashTable zfs_ZSTD_fillDoubleHashTable +#define ZSTD_fillHashTable zfs_ZSTD_fillHashTable +#define ZSTD_findDecompressedSize zfs_ZSTD_findDecompressedSize +#define ZSTD_findFrameCompressedSize zfs_ZSTD_findFrameCompressedSize +#define ZSTD_findFrameSizeInfo zfs_ZSTD_findFrameSizeInfo +#define ZSTD_flushStream zfs_ZSTD_flushStream +#define ZSTD_frameHeaderSize zfs_ZSTD_frameHeaderSize +#define ZSTD_free zfs_ZSTD_free +#define ZSTD_freeCCtx zfs_ZSTD_freeCCtx +#define ZSTD_freeCCtxParams zfs_ZSTD_freeCCtxParams +#define ZSTD_freeCDict zfs_ZSTD_freeCDict +#define ZSTD_freeCStream zfs_ZSTD_freeCStream +#define ZSTD_freeDCtx zfs_ZSTD_freeDCtx +#define ZSTD_freeDDict zfs_ZSTD_freeDDict +#define ZSTD_freeDStream zfs_ZSTD_freeDStream +#define ZSTD_fseBitCost zfs_ZSTD_fseBitCost +#define ZSTD_getBlockSize zfs_ZSTD_getBlockSize +#define ZSTD_getCParams zfs_ZSTD_getCParams +#define ZSTD_getCParamsFromCCtxParams zfs_ZSTD_getCParamsFromCCtxParams +#define ZSTD_getCParamsFromCDict zfs_ZSTD_getCParamsFromCDict +#define ZSTD_getCParams_internal zfs_ZSTD_getCParams_internal +#define ZSTD_getDDict zfs_ZSTD_getDDict +#define ZSTD_getDecompressedSize zfs_ZSTD_getDecompressedSize +#define ZSTD_getDictID_fromDDict zfs_ZSTD_getDictID_fromDDict +#define ZSTD_getDictID_fromDict zfs_ZSTD_getDictID_fromDict +#define 
ZSTD_getDictID_fromFrame zfs_ZSTD_getDictID_fromFrame +#define ZSTD_getErrorCode zfs_ZSTD_getErrorCode +#define ZSTD_getErrorName zfs_ZSTD_getErrorName +#define ZSTD_getErrorString zfs_ZSTD_getErrorString +#define ZSTD_getFrameContentSize zfs_ZSTD_getFrameContentSize +#define ZSTD_getFrameHeader zfs_ZSTD_getFrameHeader +#define ZSTD_getFrameHeader_advanced zfs_ZSTD_getFrameHeader_advanced +#define ZSTD_getFrameProgression zfs_ZSTD_getFrameProgression +#define ZSTD_getParams zfs_ZSTD_getParams +#define ZSTD_getSeqStore zfs_ZSTD_getSeqStore +#define ZSTD_getSequences zfs_ZSTD_getSequences +#define ZSTD_getcBlockSize zfs_ZSTD_getcBlockSize +#define ZSTD_hashPtr zfs_ZSTD_hashPtr +#define ZSTD_initCDict_internal zfs_ZSTD_initCDict_internal +#define ZSTD_initCStream zfs_ZSTD_initCStream +#define ZSTD_initCStream_advanced zfs_ZSTD_initCStream_advanced +#define ZSTD_initCStream_internal zfs_ZSTD_initCStream_internal +#define ZSTD_initCStream_srcSize zfs_ZSTD_initCStream_srcSize +#define ZSTD_initCStream_usingCDict zfs_ZSTD_initCStream_usingCDict +#define ZSTD_initCStream_usingCDict_advanced \ + zfs_ZSTD_initCStream_usingCDict_advanced +#define ZSTD_initCStream_usingDict zfs_ZSTD_initCStream_usingDict +#define ZSTD_initDDict_internal zfs_ZSTD_initDDict_internal +#define ZSTD_initDStream zfs_ZSTD_initDStream +#define ZSTD_initDStream_usingDDict zfs_ZSTD_initDStream_usingDDict +#define ZSTD_initDStream_usingDict zfs_ZSTD_initDStream_usingDict +#define ZSTD_initFseState zfs_ZSTD_initFseState +#define ZSTD_initStaticCCtx zfs_ZSTD_initStaticCCtx +#define ZSTD_initStaticCDict zfs_ZSTD_initStaticCDict +#define ZSTD_initStaticCStream zfs_ZSTD_initStaticCStream +#define ZSTD_initStaticDCtx zfs_ZSTD_initStaticDCtx +#define ZSTD_initStaticDDict zfs_ZSTD_initStaticDDict +#define ZSTD_initStaticDStream zfs_ZSTD_initStaticDStream +#define ZSTD_initStats_ultra zfs_ZSTD_initStats_ultra +#define ZSTD_insertAndFindFirstIndex zfs_ZSTD_insertAndFindFirstIndex +#define ZSTD_insertAndFindFirstIndexHash3 zfs_ZSTD_insertAndFindFirstIndexHash3 +#define ZSTD_insertAndFindFirstIndex_internal \ + zfs_ZSTD_insertAndFindFirstIndex_internal +#define ZSTD_insertBlock zfs_ZSTD_insertBlock +#define ZSTD_invalidateRepCodes zfs_ZSTD_invalidateRepCodes +#define ZSTD_isFrame zfs_ZSTD_isFrame +#define ZSTD_ldm_adjustParameters zfs_ZSTD_ldm_adjustParameters +#define ZSTD_ldm_blockCompress zfs_ZSTD_ldm_blockCompress +#define ZSTD_ldm_fillHashTable zfs_ZSTD_ldm_fillHashTable +#define ZSTD_ldm_generateSequences zfs_ZSTD_ldm_generateSequences +#define ZSTD_ldm_getMaxNbSeq zfs_ZSTD_ldm_getMaxNbSeq +#define ZSTD_ldm_getTableSize zfs_ZSTD_ldm_getTableSize +#define ZSTD_ldm_skipSequences zfs_ZSTD_ldm_skipSequences +#define ZSTD_loadCEntropy zfs_ZSTD_loadCEntropy +#define ZSTD_loadDEntropy zfs_ZSTD_loadDEntropy +#define ZSTD_loadDictionaryContent zfs_ZSTD_loadDictionaryContent +#define ZSTD_makeCCtxParamsFromCParams zfs_ZSTD_makeCCtxParamsFromCParams +#define ZSTD_malloc zfs_ZSTD_malloc +#define ZSTD_maxCLevel zfs_ZSTD_maxCLevel +#define ZSTD_minCLevel zfs_ZSTD_minCLevel +#define ZSTD_nextInputType zfs_ZSTD_nextInputType +#define ZSTD_nextSrcSizeToDecompress zfs_ZSTD_nextSrcSizeToDecompress +#define ZSTD_noCompressLiterals zfs_ZSTD_noCompressLiterals +#define ZSTD_referenceExternalSequences zfs_ZSTD_referenceExternalSequences +#define ZSTD_rescaleFreqs zfs_ZSTD_rescaleFreqs +#define ZSTD_resetCCtx_internal zfs_ZSTD_resetCCtx_internal +#define ZSTD_resetCCtx_usingCDict zfs_ZSTD_resetCCtx_usingCDict +#define ZSTD_resetCStream zfs_ZSTD_resetCStream 
+#define ZSTD_resetDStream zfs_ZSTD_resetDStream +#define ZSTD_resetSeqStore zfs_ZSTD_resetSeqStore +#define ZSTD_reset_compressedBlockState zfs_ZSTD_reset_compressedBlockState +#define ZSTD_safecopy zfs_ZSTD_safecopy +#define ZSTD_selectBlockCompressor zfs_ZSTD_selectBlockCompressor +#define ZSTD_selectEncodingType zfs_ZSTD_selectEncodingType +#define ZSTD_seqToCodes zfs_ZSTD_seqToCodes +#define ZSTD_sizeof_CCtx zfs_ZSTD_sizeof_CCtx +#define ZSTD_sizeof_CDict zfs_ZSTD_sizeof_CDict +#define ZSTD_sizeof_CStream zfs_ZSTD_sizeof_CStream +#define ZSTD_sizeof_DCtx zfs_ZSTD_sizeof_DCtx +#define ZSTD_sizeof_DDict zfs_ZSTD_sizeof_DDict +#define ZSTD_sizeof_DStream zfs_ZSTD_sizeof_DStream +#define ZSTD_toFlushNow zfs_ZSTD_toFlushNow +#define ZSTD_updateRep zfs_ZSTD_updateRep +#define ZSTD_updateStats zfs_ZSTD_updateStats +#define ZSTD_updateTree zfs_ZSTD_updateTree +#define ZSTD_versionNumber zfs_ZSTD_versionNumber +#define ZSTD_versionString zfs_ZSTD_versionString +#define ZSTD_writeFrameHeader zfs_ZSTD_writeFrameHeader +#define ZSTD_writeLastEmptyBlock zfs_ZSTD_writeLastEmptyBlock +#define algoTime zfs_algoTime +#define attachDictSizeCutoffs zfs_attachDictSizeCutoffs +#define g_ctx zfs_g_ctx +#define g_debuglevel zfs_g_debuglevel +#define kInverseProbabilityLog256 zfs_kInverseProbabilityLog256 +#define repStartValue zfs_repStartValue +#define FSE_isError zfs_FSE_isError +#define HUF_isError zfs_HUF_isError diff --git a/module/zstd/lib/zstd.c b/module/zstd/lib/zstd.c new file mode 100644 index 000000000000..949b8e47ec27 --- /dev/null +++ b/module/zstd/lib/zstd.c @@ -0,0 +1,27826 @@ +/* + * BSD 3-Clause Clear License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved. + * Copyright (c) 2019-2020, Michael Niewöhner. All rights reserved. 
+ */ + +#define MEM_MODULE +#define XXH_NAMESPACE ZSTD_ +#define XXH_PRIVATE_API +#define XXH_INLINE_ALL +#define ZSTD_LEGACY_SUPPORT 0 +#define ZSTD_LIB_DICTBUILDER 0 +#define ZSTD_LIB_DEPRECATED 0 +#define ZSTD_NOBENCH + +/**** start inlining common/debug.c ****/ +/* ****************************************************************** + * debug + * Part of FSE library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + + +/* + * This module only hosts one global variable + * which can be used to dynamically influence the verbosity of traces, + * such as DEBUGLOG and RAWLOG + */ + +/**** start inlining debug.h ****/ +/* ****************************************************************** + * debug + * Part of FSE library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + + +/* + * The purpose of this header is to enable debug functions. + * They regroup assert(), DEBUGLOG() and RAWLOG() for run-time, + * and DEBUG_STATIC_ASSERT() for compile-time. + * + * By default, DEBUGLEVEL==0, which means run-time debug is disabled. + * + * Level 1 enables assert() only. + * Starting level 2, traces can be generated and pushed to stderr. + * The higher the level, the more verbose the traces. + * + * It's possible to dynamically adjust level using variable g_debug_level, + * which is only declared if DEBUGLEVEL>=2, + * and is a global variable, not multi-thread protected (use with care) + */ + +#ifndef DEBUG_H_12987983217 +#define DEBUG_H_12987983217 + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +#define DEBUG_STATIC_ASSERT(c) (void)sizeof(char[(c) ? 1 : -1]) + + +/* DEBUGLEVEL is expected to be defined externally, + * typically through compiler command line. + * Value must be a number. */ +#ifndef DEBUGLEVEL +# define DEBUGLEVEL 0 +#endif + + +/* DEBUGFILE can be defined externally, + * typically through compiler command line. + * note : currently useless. + * Value must be stderr or stdout */ +#ifndef DEBUGFILE +# define DEBUGFILE stderr +#endif + + +/* recommended values for DEBUGLEVEL : + * 0 : release mode, no debug, all run-time checks disabled + * 1 : enables assert() only, no display + * 2 : reserved, for currently active debug path + * 3 : events once per object lifetime (CCtx, CDict, etc.) 
+ * 4 : events once per frame + * 5 : events once per block + * 6 : events once per sequence (verbose) + * 7+: events at every position (*very* verbose) + * + * It's generally inconvenient to output traces > 5. + * In which case, it's possible to selectively trigger high verbosity levels + * by modifying g_debug_level. + */ + +#if (DEBUGLEVEL>=1) +# include <assert.h> +#else +# ifndef assert /* assert may be already defined, due to prior #include <assert.h> */ +# define assert(condition) ((void)0) /* disable assert (default) */ +# endif +#endif + +#if (DEBUGLEVEL>=2) +# include <stdio.h> +extern int g_debuglevel; /* the variable is only declared, + it actually lives in debug.c, + and is shared by the whole process. + It's not thread-safe. + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +# define RAWLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + fprintf(stderr, __VA_ARGS__); \ + } } +# define DEBUGLOG(l, ...) { \ + if (l<=g_debuglevel) { \ + fprintf(stderr, __FILE__ ": " __VA_ARGS__); \ + fprintf(stderr, " \n"); \ + } } +#else +# define RAWLOG(l, ...) {} /* disabled */ +# define DEBUGLOG(l, ...) {} /* disabled */ +#endif + + +#if defined (__cplusplus) +} +#endif + +#endif /* DEBUG_H_12987983217 */ +/**** ended inlining debug.h ****/ + +int g_debuglevel = DEBUGLEVEL; +/**** ended inlining common/debug.c ****/ +/**** start inlining common/entropy_common.c ****/ +/* ****************************************************************** + * Common functions of New Generation Entropy library + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************* +* Dependencies +***************************************/ +/**** start inlining mem.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses.
+ */ + +#ifndef MEM_H_MODULE +#define MEM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-**************************************** +* Dependencies +******************************************/ +#include <stddef.h> /* size_t, ptrdiff_t */ +#include <string.h> /* memcpy */ + + +/*-**************************************** +* Compiler specifics +******************************************/ +#if defined(_MSC_VER) /* Visual Studio */ +# include <stdlib.h> /* _byteswap_ulong */ +# include <intrin.h> /* _byteswap_* */ +#endif +#if defined(__GNUC__) +# define MEM_STATIC static __inline __attribute__((unused)) +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define MEM_STATIC static inline +#elif defined(_MSC_VER) +# define MEM_STATIC static __inline +#else +# define MEM_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + +#ifndef __has_builtin +# define __has_builtin(x) 0 /* compat. with non-clang compilers */ +#endif + +/* code only tested on 32 and 64 bits systems */ +#define MEM_STATIC_ASSERT(c) { enum { MEM_static_assert = 1/(int)(!!(c)) }; } +MEM_STATIC void MEM_check(void) { MEM_STATIC_ASSERT((sizeof(size_t)==4) || (sizeof(size_t)==8)); } + +/* detects whether we are being compiled under msan */ +#if defined (__has_feature) +# if __has_feature(memory_sanitizer) +# define MEMORY_SANITIZER 1 +# endif +#endif + +#if defined (MEMORY_SANITIZER) +/* Not all platforms that support msan provide sanitizers/msan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ + +#include <stdint.h> /* intptr_t */ + +/* Make memory region fully initialized (without changing its contents). */ +void __msan_unpoison(const volatile void *a, size_t size); + +/* Make memory region fully uninitialized (without changing its contents). + This is a legacy interface that does not update origin information. Use + __msan_allocated_memory() instead. */ +void __msan_poison(const volatile void *a, size_t size); + +/* Returns the offset of the first (at least partially) poisoned byte in the + memory range, or -1 if the whole range is good. */ +intptr_t __msan_test_shadow(const volatile void *x, size_t size); +#endif + +/* detects whether we are being compiled under asan */ +#if defined (__has_feature) +# if __has_feature(address_sanitizer) +# define ADDRESS_SANITIZER 1 +# endif +#elif defined(__SANITIZE_ADDRESS__) +# define ADDRESS_SANITIZER 1 +#endif + +#if defined (ADDRESS_SANITIZER) +/* Not all platforms that support asan provide sanitizers/asan_interface.h. + * We therefore declare the functions we need ourselves, rather than trying to + * include the header file... */ + +/** + * Marks a memory region ([addr, addr+size)) as unaddressable. + * + * This memory must be previously allocated by your program. Instrumented + * code is forbidden from accessing addresses in this region until it is + * unpoisoned. This function is not guaranteed to poison the entire region - + * it could poison only a subregion of [addr, addr+size) due to ASan + * alignment restrictions. + * + * \note This function is not thread-safe because no two threads can poison or + * unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_poison_memory_region(void const volatile *addr, size_t size); + +/** + * Marks a memory region ([addr, addr+size)) as addressable.
+ * + * This memory must be previously allocated by your program. Accessing + * addresses in this region is allowed until this region is poisoned again. + * This function could unpoison a super-region of [addr, addr+size) due + * to ASan alignment restrictions. + * + * \note This function is not thread-safe because no two threads can + * poison or unpoison memory in the same memory region simultaneously. + * + * \param addr Start of memory region. + * \param size Size of memory region. */ +void __asan_unpoison_memory_region(void const volatile *addr, size_t size); +#endif + + +/*-************************************************************** +* Basic Types +*****************************************************************/ +#if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) ) +# include <stdint.h> + typedef uint8_t BYTE; + typedef uint16_t U16; + typedef int16_t S16; + typedef uint32_t U32; + typedef int32_t S32; + typedef uint64_t U64; + typedef int64_t S64; +#else +# include <limits.h> +#if CHAR_BIT != 8 +# error "this implementation requires char to be exactly 8-bit type" +#endif + typedef unsigned char BYTE; +#if USHRT_MAX != 65535 +# error "this implementation requires short to be exactly 16-bit type" +#endif + typedef unsigned short U16; + typedef signed short S16; +#if UINT_MAX != 4294967295 +# error "this implementation requires int to be exactly 32-bit type" +#endif + typedef unsigned int U32; + typedef signed int S32; +/* note : there are no limits defined for long long type in C90. + * limits exist in C99, however, in such case, <stdint.h> is preferred */ + typedef unsigned long long U64; + typedef signed long long S64; +#endif + + +/*-************************************************************** +* Memory I/O +*****************************************************************/ +/* MEM_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (i.e., not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method is portable but violate C standard. + * It can generate buggy code on targets depending on alignment. + * In some circumstances, it's the only known way to get the most performance (i.e. GCC + ARMv6) + * See http://fastcompression.blogspot.fr/2015/08/accessing-unaligned-memory.html for details.
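+ * Illustrative sketch (editor's addition, mirroring the implementations that
+ * follow below): Method 0 routes the access through memcpy(),
+ *     U32 v; memcpy(&v, ptr, sizeof(v));    // defined for any alignment
+ * whereas Method 2 dereferences the pointer directly,
+ *     U32 v = *(const U32*)ptr;             // may fault if ptr is unaligned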
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef MEM_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define MEM_FORCE_MEMORY_ACCESS 2 +# elif defined(__INTEL_COMPILER) || defined(__GNUC__) || defined(__ICCARM__) +# define MEM_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +MEM_STATIC unsigned MEM_32bits(void) { return sizeof(size_t)==4; } +MEM_STATIC unsigned MEM_64bits(void) { return sizeof(size_t)==8; } + +MEM_STATIC unsigned MEM_isLittleEndian(void) +{ + const union { U32 u; BYTE c[4]; } one = { 1 }; /* don't use static : performance detrimental */ + return one.c[0]; +} + +#if defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==2) + +/* violates C standard, by lying on structure alignment. +Only use if no other choice to achieve best performance on target platform */ +MEM_STATIC U16 MEM_read16(const void* memPtr) { return *(const U16*) memPtr; } +MEM_STATIC U32 MEM_read32(const void* memPtr) { return *(const U32*) memPtr; } +MEM_STATIC U64 MEM_read64(const void* memPtr) { return *(const U64*) memPtr; } +MEM_STATIC size_t MEM_readST(const void* memPtr) { return *(const size_t*) memPtr; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { *(U16*)memPtr = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { *(U32*)memPtr = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { *(U64*)memPtr = value; } + +#elif defined(MEM_FORCE_MEMORY_ACCESS) && (MEM_FORCE_MEMORY_ACCESS==1) + +/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */ +/* currently only defined for gcc and icc */ +#if defined(_MSC_VER) || (defined(__INTEL_COMPILER) && defined(WIN32)) + __pragma( pack(push, 1) ) + typedef struct { U16 v; } unalign16; + typedef struct { U32 v; } unalign32; + typedef struct { U64 v; } unalign64; + typedef struct { size_t v; } unalignArch; + __pragma( pack(pop) ) +#else + typedef struct { U16 v; } __attribute__((packed)) unalign16; + typedef struct { U32 v; } __attribute__((packed)) unalign32; + typedef struct { U64 v; } __attribute__((packed)) unalign64; + typedef struct { size_t v; } __attribute__((packed)) unalignArch; +#endif + +MEM_STATIC U16 MEM_read16(const void* ptr) { return ((const unalign16*)ptr)->v; } +MEM_STATIC U32 MEM_read32(const void* ptr) { return ((const unalign32*)ptr)->v; } +MEM_STATIC U64 MEM_read64(const void* ptr) { return ((const unalign64*)ptr)->v; } +MEM_STATIC size_t MEM_readST(const void* ptr) { return ((const unalignArch*)ptr)->v; } + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) { ((unalign16*)memPtr)->v = value; } +MEM_STATIC void MEM_write32(void* memPtr, U32 value) { ((unalign32*)memPtr)->v = value; } +MEM_STATIC void MEM_write64(void* memPtr, U64 value) { ((unalign64*)memPtr)->v = value; } + +#else + +/* default method, safe and standard. 
+ can sometimes prove slower */ + +MEM_STATIC U16 MEM_read16(const void* memPtr) +{ + U16 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U32 MEM_read32(const void* memPtr) +{ + U32 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC U64 MEM_read64(const void* memPtr) +{ + U64 val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC size_t MEM_readST(const void* memPtr) +{ + size_t val; memcpy(&val, memPtr, sizeof(val)); return val; +} + +MEM_STATIC void MEM_write16(void* memPtr, U16 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write32(void* memPtr, U32 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +MEM_STATIC void MEM_write64(void* memPtr, U64 value) +{ + memcpy(memPtr, &value, sizeof(value)); +} + +#endif /* MEM_FORCE_MEMORY_ACCESS */ + +MEM_STATIC U32 MEM_swap32(U32 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_ulong(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap32)) + return __builtin_bswap32(in); +#else + return ((in << 24) & 0xff000000 ) | + ((in << 8) & 0x00ff0000 ) | + ((in >> 8) & 0x0000ff00 ) | + ((in >> 24) & 0x000000ff ); +#endif +} + +MEM_STATIC U64 MEM_swap64(U64 in) +{ +#if defined(_MSC_VER) /* Visual Studio */ + return _byteswap_uint64(in); +#elif (defined (__GNUC__) && (__GNUC__ * 100 + __GNUC_MINOR__ >= 403)) \ + || (defined(__clang__) && __has_builtin(__builtin_bswap64)) + return __builtin_bswap64(in); +#else + return ((in << 56) & 0xff00000000000000ULL) | + ((in << 40) & 0x00ff000000000000ULL) | + ((in << 24) & 0x0000ff0000000000ULL) | + ((in << 8) & 0x000000ff00000000ULL) | + ((in >> 8) & 0x00000000ff000000ULL) | + ((in >> 24) & 0x0000000000ff0000ULL) | + ((in >> 40) & 0x000000000000ff00ULL) | + ((in >> 56) & 0x00000000000000ffULL); +#endif +} + +MEM_STATIC size_t MEM_swapST(size_t in) +{ + if (MEM_32bits()) + return (size_t)MEM_swap32((U32)in); + else + return (size_t)MEM_swap64((U64)in); +} + +/*=== Little endian r/w ===*/ + +MEM_STATIC U16 MEM_readLE16(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read16(memPtr); + else { + const BYTE* p = (const BYTE*)memPtr; + return (U16)(p[0] + (p[1]<<8)); + } +} + +MEM_STATIC void MEM_writeLE16(void* memPtr, U16 val) +{ + if (MEM_isLittleEndian()) { + MEM_write16(memPtr, val); + } else { + BYTE* p = (BYTE*)memPtr; + p[0] = (BYTE)val; + p[1] = (BYTE)(val>>8); + } +} + +MEM_STATIC U32 MEM_readLE24(const void* memPtr) +{ + return MEM_readLE16(memPtr) + (((const BYTE*)memPtr)[2] << 16); +} + +MEM_STATIC void MEM_writeLE24(void* memPtr, U32 val) +{ + MEM_writeLE16(memPtr, (U16)val); + ((BYTE*)memPtr)[2] = (BYTE)(val>>16); +} + +MEM_STATIC U32 MEM_readLE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read32(memPtr); + else + return MEM_swap32(MEM_read32(memPtr)); +} + +MEM_STATIC void MEM_writeLE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, val32); + else + MEM_write32(memPtr, MEM_swap32(val32)); +} + +MEM_STATIC U64 MEM_readLE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_read64(memPtr); + else + return MEM_swap64(MEM_read64(memPtr)); +} + +MEM_STATIC void MEM_writeLE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, val64); + else + MEM_write64(memPtr, MEM_swap64(val64)); +} + +MEM_STATIC size_t MEM_readLEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readLE32(memPtr); + else + return 
(size_t)MEM_readLE64(memPtr); +} + +MEM_STATIC void MEM_writeLEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeLE32(memPtr, (U32)val); + else + MEM_writeLE64(memPtr, (U64)val); +} + +/*=== Big endian r/w ===*/ + +MEM_STATIC U32 MEM_readBE32(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_swap32(MEM_read32(memPtr)); + else + return MEM_read32(memPtr); +} + +MEM_STATIC void MEM_writeBE32(void* memPtr, U32 val32) +{ + if (MEM_isLittleEndian()) + MEM_write32(memPtr, MEM_swap32(val32)); + else + MEM_write32(memPtr, val32); +} + +MEM_STATIC U64 MEM_readBE64(const void* memPtr) +{ + if (MEM_isLittleEndian()) + return MEM_swap64(MEM_read64(memPtr)); + else + return MEM_read64(memPtr); +} + +MEM_STATIC void MEM_writeBE64(void* memPtr, U64 val64) +{ + if (MEM_isLittleEndian()) + MEM_write64(memPtr, MEM_swap64(val64)); + else + MEM_write64(memPtr, val64); +} + +MEM_STATIC size_t MEM_readBEST(const void* memPtr) +{ + if (MEM_32bits()) + return (size_t)MEM_readBE32(memPtr); + else + return (size_t)MEM_readBE64(memPtr); +} + +MEM_STATIC void MEM_writeBEST(void* memPtr, size_t val) +{ + if (MEM_32bits()) + MEM_writeBE32(memPtr, (U32)val); + else + MEM_writeBE64(memPtr, (U64)val); +} + + +#if defined (__cplusplus) +} +#endif + +#endif /* MEM_H_MODULE */ +/**** ended inlining mem.h ****/ +/**** start inlining error_private.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* Note : this module is expected to remain private, do not expose it */ + +#ifndef ERROR_H_MODULE +#define ERROR_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + + +/* **************************************** +* Dependencies +******************************************/ +#include <stddef.h> /* size_t */ +/**** start inlining zstd_errors.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses.
+ */ + +#ifndef ZSTD_ERRORS_H_398273423 +#define ZSTD_ERRORS_H_398273423 + +#if defined (__cplusplus) +extern "C" { +#endif + +/*===== dependency =====*/ +#include <stddef.h> /* size_t */ + + +/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDERRORLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDERRORLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY +#endif + +/*-********************************************* + * Error codes list + *-********************************************* + * Error codes _values_ are pinned down since v1.3.1 only. + * Therefore, don't rely on values if you may link to any version < v1.3.1. + * + * Only values < 100 are considered stable. + * + * note 1 : this API shall be used with static linking only. + * dynamic linking is not yet officially supported. + * note 2 : Prefer relying on the enum than on its value whenever possible + * This is the only supported way to use the error list < v1.3.1 + * note 3 : ZSTD_isError() is always correct, whatever the library version. + **********************************************/ +typedef enum { + ZSTD_error_no_error = 0, + ZSTD_error_GENERIC = 1, + ZSTD_error_prefix_unknown = 10, + ZSTD_error_version_unsupported = 12, + ZSTD_error_frameParameter_unsupported = 14, + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, + ZSTD_error_workSpace_tooSmall= 66, + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ +} ZSTD_ErrorCode; + +/*!
ZSTD_getErrorCode() : + convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, + which can be used to compare with enum list published above */ +ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); +ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_ERRORS_H_398273423 */ +/**** ended inlining zstd_errors.h ****/ + + +/* **************************************** +* Compiler-specific +******************************************/ +#if defined(__GNUC__) +# define ERR_STATIC static __attribute__((unused)) +#elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) +# define ERR_STATIC static inline +#elif defined(_MSC_VER) +# define ERR_STATIC static __inline +#else +# define ERR_STATIC static /* this version may generate warnings for unused static functions; disable the relevant warning */ +#endif + + +/*-**************************************** +* Customization (error_public.h) +******************************************/ +typedef ZSTD_ErrorCode ERR_enum; +#define PREFIX(name) ZSTD_error_##name + + +/*-**************************************** +* Error codes handling +******************************************/ +#undef ERROR /* already defined on Visual Studio */ +#define ERROR(name) ZSTD_ERROR(name) +#define ZSTD_ERROR(name) ((size_t)-PREFIX(name)) + +ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + +ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + +/* check and forward error code */ +#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } + + +/*-**************************************** +* Error Strings +******************************************/ + +const char* ERR_getErrorString(ERR_enum code); /* error_private.c */ + +ERR_STATIC const char* ERR_getErrorName(size_t code) +{ + return ERR_getErrorString(ERR_getErrorCode(code)); +} + +#if defined (__cplusplus) +} +#endif + +#endif /* ERROR_H_MODULE */ +/**** ended inlining error_private.h ****/ +#define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ +/**** start inlining fse.h ****/ +/* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef FSE_H +#define FSE_H + + +/*-***************************************** +* Dependencies +******************************************/ +#include <stddef.h> /* size_t, ptrdiff_t */ + + +/*-***************************************** +* FSE_PUBLIC_API : control library symbols visibility +******************************************/ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define FSE_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define FSE_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define FSE_PUBLIC_API __declspec(dllimport) /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define FSE_PUBLIC_API +#endif + +/*------ Version ------*/ +#define FSE_VERSION_MAJOR 0 +#define FSE_VERSION_MINOR 9 +#define FSE_VERSION_RELEASE 0 + +#define FSE_LIB_VERSION FSE_VERSION_MAJOR.FSE_VERSION_MINOR.FSE_VERSION_RELEASE +#define FSE_QUOTE(str) #str +#define FSE_EXPAND_AND_QUOTE(str) FSE_QUOTE(str) +#define FSE_VERSION_STRING FSE_EXPAND_AND_QUOTE(FSE_LIB_VERSION) + +#define FSE_VERSION_NUMBER (FSE_VERSION_MAJOR *100*100 + FSE_VERSION_MINOR *100 + FSE_VERSION_RELEASE) +FSE_PUBLIC_API unsigned FSE_versionNumber(void); /**< library version number; to be used when checking dll version */ + + +/*-**************************************** +* FSE simple functions +******************************************/ +/*! FSE_compress() : + Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. + 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). + @return : size of compressed data (<= dstCapacity). + Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. + if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +*/ +FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/*! FSE_decompress(): + Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', + into already allocated destination buffer 'dst', of size 'dstCapacity'. + @return : size of regenerated data (<= maxDstSize), + or an error code, which can be tested using FSE_isError() . + + ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! + Why ? : making this distinction requires a header. + Header management is intentionally delegated to the user layer, which can better manage special cases. +*/ +FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, + const void* cSrc, size_t cSrcSize); + + +/*-***************************************** +* Tool functions +******************************************/ +FSE_PUBLIC_API size_t FSE_compressBound(size_t size); /* maximum compressed size */ + +/* Error Management */ +FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return value is an error code */ +FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +/*-***************************************** +* FSE advanced functions +******************************************/ +/*!
FSE_compress2() : + Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' + Both parameters can be defined as '0' to mean : use default value + @return : size of compressed data + Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! + if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. + if FSE_isError(return), it's an error code. +*/ +FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); + + +/*-***************************************** +* FSE detailed API +******************************************/ +/*! +FSE_compress() does the following: +1. count symbol occurrence from source[] into table count[] (see hist.h) +2. normalize counters so that sum(count[]) == Power_of_2 (2^tableLog) +3. save normalized counters to memory buffer using writeNCount() +4. build encoding table 'CTable' from normalized counters +5. encode the data stream using encoding table 'CTable' + +FSE_decompress() does the following: +1. read normalized counters with readNCount() +2. build decoding table 'DTable' from normalized counters +3. decode the data stream using decoding table 'DTable' + +The following API allows targeting specific sub-functions for advanced tasks. +For example, it's possible to compress several blocks using the same 'CTable', +or to save and provide normalized distribution using external method. +*/ + +/* *** COMPRESSION *** */ + +/*! FSE_optimalTableLog(): + dynamically downsize 'tableLog' when conditions are met. + It saves CPU time, by using smaller tables, while preserving or even improving compression ratio. + @return : recommended tableLog (necessarily <= 'maxTableLog') */ +FSE_PUBLIC_API unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_normalizeCount(): + normalize counts so that sum(count[]) == Power_of_2 (2^tableLog) + 'normalizedCounter' is a table of short, of minimum size (maxSymbolValue+1). + @return : tableLog, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_normalizeCount(short* normalizedCounter, unsigned tableLog, + const unsigned* count, size_t srcSize, unsigned maxSymbolValue); + +/*! FSE_NCountWriteBound(): + Provides the maximum possible size of an FSE normalized table, given 'maxSymbolValue' and 'tableLog'. + Typically useful for allocation purpose. */ +FSE_PUBLIC_API size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_writeNCount(): + Compactly save 'normalizedCounter' into 'buffer'. + @return : size of the compressed table, + or an errorCode, which can be tested using FSE_isError(). */ +FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + const short* normalizedCounter, + unsigned maxSymbolValue, unsigned tableLog); + +/*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ +typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + +/*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). 
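+ (Editor's sketch, not upstream documentation: the five compression steps
+ described above, written out with the declarations from this header.
+ src/srcSize and dst/dstCap are assumed caller-provided buffers; error
+ checks via FSE_isError() are omitted for brevity; HIST_count() is the
+ counting helper declared in hist.h.
+
+ unsigned count[256]; unsigned maxSym = 255;
+ HIST_count(count, &maxSym, src, srcSize); // 1. count symbols
+ unsigned tableLog = FSE_optimalTableLog(0, srcSize, maxSym);
+ short norm[256];
+ FSE_normalizeCount(norm, tableLog, count, srcSize, maxSym); // 2. normalize
+ size_t hSize = FSE_writeNCount(dst, dstCap, norm, maxSym, tableLog); // 3. save NCount
+ FSE_CTable* ct = FSE_createCTable(maxSym, tableLog);
+ FSE_buildCTable(ct, norm, maxSym, tableLog); // 4. build CTable
+ size_t cSize = FSE_compress_usingCTable((BYTE*)dst + hSize,
+ dstCap - hSize, src, srcSize, ct); // 5. encode
+ FSE_freeCTable(ct); )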
+ @return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildCTable(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_compress_usingCTable(): + Compress `src` using `ct` into `dst` which must be already allocated. + @return : size of compressed data (<= `dstCapacity`), + or 0 if compressed data could not fit into `dst`, + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_compress_usingCTable (void* dst, size_t dstCapacity, const void* src, size_t srcSize, const FSE_CTable* ct); + +/*! +Tutorial : +---------- +The first step is to count all symbols. FSE_count() does this job very fast. +Result will be saved into 'count', a table of unsigned int, which must be already allocated, and have 'maxSymbolValuePtr[0]+1' cells. +'src' is a table of bytes of size 'srcSize'. All values within 'src' MUST be <= maxSymbolValuePtr[0] +maxSymbolValuePtr[0] will be updated, with its real value (necessarily <= original value) +FSE_count() will return the number of occurrence of the most frequent symbol. +This can be used to know if there is a single symbol within 'src', and to quickly evaluate its compressibility. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). + +The next step is to normalize the frequencies. +FSE_normalizeCount() will ensure that sum of frequencies is == 2 ^'tableLog'. +It also guarantees a minimum of 1 to any Symbol with frequency >= 1. +You can use 'tableLog'==0 to mean "use default tableLog value". +If you are unsure of which tableLog value to use, you can ask FSE_optimalTableLog(), +which will provide the optimal valid tableLog given sourceSize, maxSymbolValue, and a user-defined maximum (0 means "default"). + +The result of FSE_normalizeCount() will be saved into a table, +called 'normalizedCounter', which is a table of signed short. +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValue+1' cells. +The return value is tableLog if everything proceeded as expected. +It is 0 if there is a single symbol within distribution. +If there is an error (ex: invalid tableLog value), the function will return an ErrorCode (which can be tested using FSE_isError()). + +'normalizedCounter' can be saved in a compact manner to a memory area using FSE_writeNCount(). +'buffer' must be already allocated. +For guaranteed success, buffer size must be at least FSE_headerBound(). +The result of the function is the number of bytes written into 'buffer'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError(); ex : buffer size too small). + +'normalizedCounter' can then be used to create the compression table 'CTable'. +The space required by 'CTable' must be already allocated, using FSE_createCTable(). +You can then use FSE_buildCTable() to fill 'CTable'. +If there is an error, both functions will return an ErrorCode (which can be tested using FSE_isError()). + +'CTable' can then be used to compress 'src', with FSE_compress_usingCTable(). +Similar to FSE_count(), the convention is that 'src' is assumed to be a table of char of size 'srcSize' +The function returns the size of compressed data (without header), necessarily <= `dstCapacity`. +If it returns '0', compressed data could not fit into 'dst'. +If there is an error, the function will return an ErrorCode (which can be tested using FSE_isError()). +*/ + + +/* *** DECOMPRESSION *** */ + +/*! 
FSE_readNCount(): + Read compactly saved 'normalizedCounter' from 'rBuffer'. + @return : size read from 'rBuffer', + or an errorCode, which can be tested using FSE_isError(). + maxSymbolValuePtr[0] and tableLogPtr[0] will also be updated with their respective values */ +FSE_PUBLIC_API size_t FSE_readNCount (short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize); + +/*! Constructor and Destructor of FSE_DTable. + Note that its size depends on 'tableLog' */ +typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); + +/*! FSE_buildDTable(): + Builds 'dt', which must be already allocated, using FSE_createDTable(). + return : 0, or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); + +/*! FSE_decompress_usingDTable(): + Decompress compressed source `cSrc` of size `cSrcSize` using `dt` + into `dst` which must be already allocated. + @return : size of regenerated data (necessarily <= `dstCapacity`), + or an errorCode, which can be tested using FSE_isError() */ +FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + +/*! +Tutorial : +---------- +(Note : these functions only decompress FSE-compressed blocks. + If block is uncompressed, use memcpy() instead + If block is a single repeated byte, use memset() instead ) + +The first step is to obtain the normalized frequencies of symbols. +This can be performed by FSE_readNCount() if it was saved using FSE_writeNCount(). +'normalizedCounter' must be already allocated, and have at least 'maxSymbolValuePtr[0]+1' cells of signed short. +In practice, that means it's necessary to know 'maxSymbolValue' beforehand, +or size the table to handle worst case situations (typically 256). +FSE_readNCount() will provide 'tableLog' and 'maxSymbolValue'. +The result of FSE_readNCount() is the number of bytes read from 'rBuffer'. +Note that 'rBufferSize' must be at least 4 bytes, even if useful information is less than that. +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +The next step is to build the decompression tables 'FSE_DTable' from 'normalizedCounter'. +This is performed by the function FSE_buildDTable(). +The space required by 'FSE_DTable' must be already allocated using FSE_createDTable(). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). + +`FSE_DTable` can then be used to decompress `cSrc`, with FSE_decompress_usingDTable(). +`cSrcSize` must be strictly correct, otherwise decompression will fail. +FSE_decompress_usingDTable() result will tell how many bytes were regenerated (<=`dstCapacity`). +If there is an error, the function will return an error code, which can be tested using FSE_isError(). (ex: dst buffer too small) +*/ + +#endif /* FSE_H */ + +#if defined(FSE_STATIC_LINKING_ONLY) && !defined(FSE_H_FSE_STATIC_LINKING_ONLY) +#define FSE_H_FSE_STATIC_LINKING_ONLY + +/* *** Dependency *** */ +/**** start inlining bitstream.h ****/ +/* ****************************************************************** + * bitstream + * Part of FSE library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. 
+ * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ +#ifndef BITSTREAM_H_MODULE +#define BITSTREAM_H_MODULE + +#if defined (__cplusplus) +extern "C" { +#endif + +/* +* This API consists of small unitary functions, which must be inlined for best performance. +* Since link-time-optimization is not available for all compilers, +* these functions are defined into a .h to be included. +*/ + +/*-**************************************** +* Dependencies +******************************************/ +/**** skipping file: mem.h ****/ +/**** start inlining compiler.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPILER_H +#define ZSTD_COMPILER_H + +/*-******************************************************* +* Compiler specifics +*********************************************************/ +/* force inlining */ + +#if !defined(ZSTD_NO_INLINE) +#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */ +# define INLINE_KEYWORD inline +#else +# define INLINE_KEYWORD +#endif + +#if defined(__GNUC__) || defined(__ICCARM__) +# define FORCE_INLINE_ATTR __attribute__((always_inline)) +#elif defined(_MSC_VER) +# define FORCE_INLINE_ATTR __forceinline +#else +# define FORCE_INLINE_ATTR +#endif + +#else + +#define INLINE_KEYWORD +#define FORCE_INLINE_ATTR + +#endif + +/** + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR +/** + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers + * performance. + * + * gcc-4.8 and gcc-4.9 have been shown to benefit from leaving off the + * always_inline attribute. + * + * clang up to 5.0.0 (trunk) benefit tremendously from the always_inline + * attribute. + */ +#if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 +# define HINT_INLINE static INLINE_KEYWORD +#else +# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR +#endif + +/* UNUSED_ATTR tells the compiler it is okay if the function is unused. 
+ */
+#if defined(__GNUC__)
+#  define UNUSED_ATTR __attribute__((unused))
+#else
+#  define UNUSED_ATTR
+#endif
+
+/* force no inlining */
+#ifdef _MSC_VER
+#  define FORCE_NOINLINE static __declspec(noinline)
+#else
+#  if defined(__GNUC__) || defined(__ICCARM__)
+#    define FORCE_NOINLINE static __attribute__((__noinline__))
+#  else
+#    define FORCE_NOINLINE static
+#  endif
+#endif
+
+/* target attribute */
+#ifndef __has_attribute
+  #define __has_attribute(x) 0  /* Compatibility with non-clang compilers. */
+#endif
+#if defined(__GNUC__) || defined(__ICCARM__)
+#  define TARGET_ATTRIBUTE(target) __attribute__((__target__(target)))
+#else
+#  define TARGET_ATTRIBUTE(target)
+#endif
+
+/* Enable runtime BMI2 dispatch based on the CPU.
+ * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default.
+ */
+#ifndef DYNAMIC_BMI2
+  #if ((defined(__clang__) && __has_attribute(__target__)) \
+      || (defined(__GNUC__) \
+          && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \
+      && (defined(__x86_64__) || defined(_M_X86)) \
+      && !defined(__BMI2__)
+  #  define DYNAMIC_BMI2 1
+  #else
+  #  define DYNAMIC_BMI2 0
+  #endif
+#endif
+
+/* prefetch
+ * can be disabled, by declaring NO_PREFETCH build macro */
+#if defined(NO_PREFETCH)
+#  define PREFETCH_L1(ptr)  (void)(ptr)  /* disabled */
+#  define PREFETCH_L2(ptr)  (void)(ptr)  /* disabled */
+#else
+#  if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_I86))  /* _mm_prefetch() is not defined outside of x86/x64 */
+#    include <mmintrin.h>   /* https://msdn.microsoft.com/fr-fr/library/84szxsww(v=vs.90).aspx */
+#    define PREFETCH_L1(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T0)
+#    define PREFETCH_L2(ptr)  _mm_prefetch((const char*)(ptr), _MM_HINT_T1)
+#  elif defined(__aarch64__)
+#    define PREFETCH_L1(ptr)  __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr)))
+#    define PREFETCH_L2(ptr)  __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr)))
+#  elif defined(__GNUC__) && ( (__GNUC__ >= 4) || ( (__GNUC__ == 3) && (__GNUC_MINOR__ >= 1) ) )
+#    define PREFETCH_L1(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */)
+#    define PREFETCH_L2(ptr)  __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */)
+#  else
+#    define PREFETCH_L1(ptr) (void)(ptr)  /* disabled */
+#    define PREFETCH_L2(ptr) (void)(ptr)  /* disabled */
+#  endif
+#endif  /* NO_PREFETCH */
+
+#define CACHELINE_SIZE 64
+
+#define PREFETCH_AREA(p, s)  {                       \
+    const char* const _ptr = (const char*)(p);       \
+    size_t const _size = (size_t)(s);                \
+    size_t _pos;                                     \
+    for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \
+        PREFETCH_L2(_ptr + _pos);                    \
+    }                                                \
+}
+
+/* vectorization
+ * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax */
+#if !defined(__INTEL_COMPILER) && !defined(__clang__) && defined(__GNUC__)
+#  if (__GNUC__ == 4 && __GNUC_MINOR__ > 3) || (__GNUC__ >= 5)
+#    define DONT_VECTORIZE __attribute__((optimize("no-tree-vectorize")))
+#  else
+#    define DONT_VECTORIZE _Pragma("GCC optimize(\"no-tree-vectorize\")")
+#  endif
+#else
+#  define DONT_VECTORIZE
+#endif
+
+/* Tell the compiler that a branch is likely or unlikely.
+ * Only use these macros if it causes the compiler to generate better code.
+ * If you can remove a LIKELY/UNLIKELY annotation without speed changes in gcc
+ * and clang, please do.
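+ * Illustrative use (hypothetical caller code) :
+ *     if (UNLIKELY(srcSize < 1)) return ERROR(srcSize_wrong);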
+ */
+#if defined(__GNUC__)
+#define LIKELY(x) (__builtin_expect((x), 1))
+#define UNLIKELY(x) (__builtin_expect((x), 0))
+#else
+#define LIKELY(x) (x)
+#define UNLIKELY(x) (x)
+#endif
+
+/* disable warnings */
+#ifdef _MSC_VER    /* Visual Studio */
+#  include <intrin.h>                    /* For Visual 2005 */
+#  pragma warning(disable : 4100)        /* disable: C4100: unreferenced formal parameter */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#  pragma warning(disable : 4214)        /* disable: C4214: non-int bitfields */
+#  pragma warning(disable : 4324)        /* disable: C4324: padded structure */
+#endif
+
+#endif /* ZSTD_COMPILER_H */
+/**** ended inlining compiler.h ****/
+/**** skipping file: debug.h ****/
+/**** skipping file: error_private.h ****/
+
+
+/*=========================================
+*  Target specific
+=========================================*/
+#if defined(__BMI__) && defined(__GNUC__)
+#  include <immintrin.h>   /* support for bextr (experimental) */
+#elif defined(__ICCARM__)
+#  include <intrinsics.h>
+#endif
+
+#define STREAM_ACCUMULATOR_MIN_32  25
+#define STREAM_ACCUMULATOR_MIN_64  57
+#define STREAM_ACCUMULATOR_MIN    ((U32)(MEM_32bits() ? STREAM_ACCUMULATOR_MIN_32 : STREAM_ACCUMULATOR_MIN_64))
+
+
+/*-******************************************
+*  bitStream encoding API (write forward)
+********************************************/
+/* bitStream can mix input from multiple sources.
+ * A critical property of these streams is that they encode and decode in **reverse** direction.
+ * So the first bit sequence you add will be the last to be read, like a LIFO stack.
+ */
+typedef struct {
+    size_t bitContainer;
+    unsigned bitPos;
+    char*  startPtr;
+    char*  ptr;
+    char*  endPtr;
+} BIT_CStream_t;
+
+MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity);
+MEM_STATIC void   BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits);
+MEM_STATIC void   BIT_flushBits(BIT_CStream_t* bitC);
+MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC);
+
+/* Start with initCStream, providing the size of buffer to write into.
+*  bitStream will never write outside of this buffer.
+*  `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code.
+*
+*  bits are first added to a local register.
+*  Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems.
+*  Writing data into memory is an explicit operation, performed by the flushBits function.
+*  Hence keep track of how many bits are potentially stored into the local register to avoid register overflow.
+*  After a flushBits, a maximum of 7 bits might still be stored into the local register.
+*
+*  Avoid storing elements of more than 24 bits if you want compatibility with 32-bits bitstream readers.
+*
+*  Last operation is to close the bitStream.
+*  The function returns the final size of CStream in bytes.
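+*
+*  A minimal write sequence (illustrative sketch; real code must test the
+*  result of BIT_initCStream() using ERR_isError()) :
+*      BIT_CStream_t bitC;
+*      BIT_initCStream(&bitC, dstBuffer, dstCapacity);
+*      BIT_addBits(&bitC, value, nbBits);    // accumulate into the local register
+*      BIT_flushBits(&bitC);                 // commit whole bytes to memory
+*      size_t const streamSize = BIT_closeCStream(&bitC);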
+* If data couldn't fit into `dstBuffer`, it will return a 0 ( == not storable) +*/ + + +/*-******************************************** +* bitStream decoding API (read backward) +**********************************************/ +typedef struct { + size_t bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; +} BIT_DStream_t; + +typedef enum { BIT_DStream_unfinished = 0, + BIT_DStream_endOfBuffer = 1, + BIT_DStream_completed = 2, + BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ + /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ + +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); +MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + +/* Start by invoking BIT_initDStream(). +* A chunk of the bitStream is then stored into a local register. +* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). +* You can then retrieve bitFields stored into the local register, **in reverse order**. +* Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. +* A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +* Otherwise, it can be less than that, so proceed accordingly. +* Checking if DStream has reached its end can be performed with BIT_endOfDStream(). +*/ + + +/*-**************************************** +* unsafe API +******************************************/ +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); +/* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + +MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +/* unsafe version; does not check buffer overflow */ + +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); +/* faster, but works only if nbBits >= 1 */ + + + +/*-************************************************************** +* Internal functions +****************************************************************/ +MEM_STATIC unsigned BIT_highbit32 (U32 val) +{ + assert(val != 0); + { +# if defined(_MSC_VER) /* Visual */ + unsigned long r=0; + return _BitScanReverse ( &r, val ) ? 
(unsigned)r : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) /* Use GCC Intrinsic */ + return __builtin_clz (val) ^ 31; +# elif defined(__ICCARM__) /* IAR Intrinsic */ + return 31 - __CLZ(val); +# else /* Software version */ + static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, + 11, 14, 16, 18, 22, 25, 3, 30, + 8, 12, 20, 28, 15, 17, 24, 7, + 19, 27, 23, 6, 26, 5, 4, 31 }; + U32 v = val; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +# endif + } +} + +/*===== Local Constants =====*/ +static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, + 0x3F, 0x7F, 0xFF, 0x1FF, 0x3FF, 0x7FF, + 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF, 0x1FFFF, + 0x3FFFF, 0x7FFFF, 0xFFFFF, 0x1FFFFF, 0x3FFFFF, 0x7FFFFF, + 0xFFFFFF, 0x1FFFFFF, 0x3FFFFFF, 0x7FFFFFF, 0xFFFFFFF, 0x1FFFFFFF, + 0x3FFFFFFF, 0x7FFFFFFF}; /* up to 31 bits */ +#define BIT_MASK_SIZE (sizeof(BIT_mask) / sizeof(BIT_mask[0])) + +/*-************************************************************** +* bitStream encoding +****************************************************************/ +/*! BIT_initCStream() : + * `dstCapacity` must be > sizeof(size_t) + * @return : 0 if success, + * otherwise an error code (can be tested using ERR_isError()) */ +MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + void* startPtr, size_t dstCapacity) +{ + bitC->bitContainer = 0; + bitC->bitPos = 0; + bitC->startPtr = (char*)startPtr; + bitC->ptr = bitC->startPtr; + bitC->endPtr = bitC->startPtr + dstCapacity - sizeof(bitC->bitContainer); + if (dstCapacity <= sizeof(bitC->bitContainer)) return ERROR(dstSize_tooSmall); + return 0; +} + +/*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) +{ + MEM_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); + bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_addBitsFast() : + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ +MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, + size_t value, unsigned nbBits) +{ + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); + bitC->bitContainer |= value << bitC->bitPos; + bitC->bitPos += nbBits; +} + +/*! BIT_flushBitsFast() : + * assumption : bitContainer has not overflowed + * unsafe version; does not check buffer overflow */ +MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + +/*! BIT_flushBits() : + * assumption : bitContainer has not overflowed + * safe version; check for buffer overflow, and prevents it. + * note : does not signal buffer overflow. 
+ * overflow will be revealed later on using BIT_closeCStream() */ +MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC) +{ + size_t const nbBytes = bitC->bitPos >> 3; + assert(bitC->bitPos < sizeof(bitC->bitContainer) * 8); + assert(bitC->ptr <= bitC->endPtr); + MEM_writeLEST(bitC->ptr, bitC->bitContainer); + bitC->ptr += nbBytes; + if (bitC->ptr > bitC->endPtr) bitC->ptr = bitC->endPtr; + bitC->bitPos &= 7; + bitC->bitContainer >>= nbBytes*8; +} + +/*! BIT_closeCStream() : + * @return : size of CStream, in bytes, + * or 0 if it could not fit into dstBuffer */ +MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) +{ + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ + return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); +} + + +/*-******************************************************** +* bitStream decoding +**********************************************************/ +/*! BIT_initDStream() : + * Initialize a BIT_DStream_t. + * `bitD` : a pointer to an already allocated BIT_DStream_t structure. + * `srcSize` must be the *exact* size of the bitStream, in bytes. + * @return : size of stream (== srcSize), or an errorCode if a problem is detected + */ +MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize) +{ + if (srcSize < 1) { memset(bitD, 0, sizeof(*bitD)); return ERROR(srcSize_wrong); } + + bitD->start = (const char*)srcBuffer; + bitD->limitPtr = bitD->start + sizeof(bitD->bitContainer); + + if (srcSize >= sizeof(bitD->bitContainer)) { /* normal case */ + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { + case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + /* fall-through */ + + case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + /* fall-through */ + + case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + /* fall-through */ + + case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; + /* fall-through */ + + case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; + /* fall-through */ + + case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; + /* fall-through */ + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; + bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; + } + + return srcSize; +} + +MEM_STATIC size_t BIT_getUpperBits(size_t bitContainer, U32 const start) +{ + return bitContainer >> start; +} + +MEM_STATIC size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) +{ + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ + assert(nbBits < BIT_MASK_SIZE); + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; +} + +MEM_STATIC size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +{ + assert(nbBits < BIT_MASK_SIZE); + return bitContainer & BIT_mask[nbBits]; +} + +/*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted */ +MEM_STATIC size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) +{ + /* arbitrate between double-shift and shift+mask */ +#if 1 + /* if bitD->bitsConsumed + nbBits > sizeof(bitD->bitContainer)*8, + * bitstream is likely corrupted, and result is undefined */ + return BIT_getMiddleBits(bitD->bitContainer, (sizeof(bitD->bitContainer)*8) - bitD->bitsConsumed - nbBits, nbBits); +#else + /* this code path is slower on my os-x laptop */ + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + return ((bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> 1) >> ((regMask-nbBits) & regMask); +#endif +} + +/*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) +{ + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); +} + +MEM_STATIC void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) +{ + bitD->bitsConsumed += nbBits; +} + +/*! BIT_readBits() : + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) +{ + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_readBitsFast() : + * unsafe version; only works only if nbBits >= 1 */ +MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) +{ + size_t const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; +} + +/*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! + * 2. Returns BIT_DStream_overflow when bitD->ptr < bitD->limitPtr, at this + * point you must use BIT_reloadDStream() to reload. + */ +MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) +{ + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; + assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); + bitD->ptr -= bitD->bitsConsumed >> 3; + bitD->bitsConsumed &= 7; + bitD->bitContainer = MEM_readLEST(bitD->ptr); + return BIT_DStream_unfinished; +} + +/*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . 
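+ * A typical read loop built on top of it (illustrative sketch; `bitD` and
+ * `nbBits` are assumed provided by the caller) :
+ *     while (BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) {
+ *         size_t const bitField = BIT_readBits(&bitD, nbBits);
+ *     }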
+ * This function is safe, it guarantees it will not read beyond src buffer.
+ * @return : status of `BIT_DStream_t` internal register.
+ *           when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+{
+    if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))  /* overflow detected, like end of stream */
+        return BIT_DStream_overflow;
+
+    if (bitD->ptr >= bitD->limitPtr) {
+        return BIT_reloadDStreamFast(bitD);
+    }
+    if (bitD->ptr == bitD->start) {
+        if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+        return BIT_DStream_completed;
+    }
+    /* start < ptr < limitPtr */
+    {   U32 nbBytes = bitD->bitsConsumed >> 3;
+        BIT_DStream_status result = BIT_DStream_unfinished;
+        if (bitD->ptr - nbBytes < bitD->start) {
+            nbBytes = (U32)(bitD->ptr - bitD->start);  /* ptr > start */
+            result = BIT_DStream_endOfBuffer;
+        }
+        bitD->ptr -= nbBytes;
+        bitD->bitsConsumed -= nbBytes*8;
+        bitD->bitContainer = MEM_readLEST(bitD->ptr);   /* reminder : srcSize > sizeof(bitD->bitContainer), otherwise bitD->ptr == bitD->start */
+        return result;
+    }
+}
+
+/*! BIT_endOfDStream() :
+ * @return : 1 if DStream has _exactly_ reached its end (all bits consumed).
+ */
+MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream)
+{
+    return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8));
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* BITSTREAM_H_MODULE */
+/**** ended inlining bitstream.h ****/
+
+
+/* *****************************************
+*  Static allocation
+*******************************************/
+/* FSE buffer bounds */
+#define FSE_NCOUNTBOUND 512
+#define FSE_BLOCKBOUND(size) (size + (size>>7) + 4 /* fse states */ + sizeof(size_t) /* bitContainer */)
+#define FSE_COMPRESSBOUND(size) (FSE_NCOUNTBOUND + FSE_BLOCKBOUND(size))   /* Macro version, useful for static allocation */
+
+/* It is possible to statically allocate FSE CTable/DTable as a table of FSE_CTable/FSE_DTable using below macros */
+#define FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue)   (1 + (1<<(maxTableLog-1)) + ((maxSymbolValue+1)*2))
+#define FSE_DTABLE_SIZE_U32(maxTableLog)                   (1 + (1<<maxTableLog))
+
+/* or use the size to malloc() space directly. Pay attention to alignment restrictions though */
+#define FSE_CTABLE_SIZE(maxTableLog, maxSymbolValue)   (FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(FSE_CTable))
+#define FSE_DTABLE_SIZE(maxTableLog)                   (FSE_DTABLE_SIZE_U32(maxTableLog) * sizeof(FSE_DTable))
+
+
+/* *****************************************
+ *  FSE advanced API
+ ***************************************** */
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus);
+/**< same as FSE_optimalTableLog(), which used `minus==2` */
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * FSE_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable.
+ */
+#define FSE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue)   ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) )
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits);
+/**< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */
+
+size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue);
+/**< build a fake FSE_CTable, designed to compress always the same symbolValue */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` must be >= `(1<<tableLog)`.
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize);
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits);
+/**< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */
+
+size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue);
+/**< build a fake FSE_DTable, designed to always generate the same symbolValue */
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog);
+/**< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DTABLE_SIZE_U32(maxLog)` */
+
+typedef enum {
+   FSE_repeat_none,  /**< Cannot use the previous table */
+   FSE_repeat_check, /**< Can use the previous table but it must be checked */
+   FSE_repeat_valid  /**< Can use the previous table and it is assumed to be valid */
+ } FSE_repeat;
+
+/* *****************************************
+*  FSE symbol compression API
+*******************************************/
+/*!
+   This API consists of small unitary functions, which highly benefit from being inlined.
+   Hence their body are included in next section.
+*/
+typedef struct {
+    ptrdiff_t   value;
+    const void* stateTable;
+    const void* symbolTT;
+    unsigned    stateLog;
+} FSE_CState_t;
+
+static void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct);
+
+static void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol);
+
+static void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr);
+
+/**<
+These functions are inner components of FSE_compress_usingCTable().
+They allow the creation of custom streams, mixing multiple tables and bit sources.
+
+A key property to keep in mind is that encoding and decoding are done **in reverse direction**.
+So the first symbol you will encode is the last you will decode, like a LIFO stack.
+
+You will need a few variables to track your bitStream. They are :
+
+FSE_CTable    ct;         // Provided by FSE_buildCTable()
+BIT_CStream_t bitStream;  // bitStream tracking structure
+FSE_CState_t  state;      // State tracking structure (can have several)
+
+The first thing to do is to init bitStream and state.
+    size_t errorCode = BIT_initCStream(&bitStream, dstBuffer, maxDstSize);
+    FSE_initCState(&state, ct);
+
+Note that BIT_initCStream() can produce an error code, so its result should be tested, using FSE_isError();
+You can then encode your input data, byte after byte.
+FSE_encodeSymbol() outputs a maximum of 'tableLog' bits at a time.
+Remember decoding will be done in reverse direction.
+    FSE_encodeSymbol(&bitStream, &state, symbol);
+
+At any time, you can also add any bit sequence.
+Note : maximum allowed nbBits is 25, for compatibility with 32-bits decoders
+    BIT_addBits(&bitStream, bitField, nbBits);
+
+The above methods don't commit data to memory, they just store it into local register, for speed.
+Local register is size_t, hence 64-bits on 64-bits systems, 32-bits on 32-bits systems.
+Writing data to memory is a manual operation, performed by the flushBits function.
+    BIT_flushBits(&bitStream);
+
+Your last FSE encoding operation shall be to flush your last state value(s).
+    FSE_flushState(&bitStream, &state);
+
+Finally, you must close the bitStream.
+The function returns the size of CStream in bytes.
+If data couldn't fit into dstBuffer, it will return a 0 ( == not compressible)
+If there is an error, it returns an errorCode (which can be tested using FSE_isError()).
+    size_t size = BIT_closeCStream(&bitStream);
+*/
+
+
+/* *****************************************
+*  FSE symbol decompression API
+*******************************************/
+typedef struct {
+    size_t      state;
+    const void* table;   /* precise table may vary, depending on U16 */
+} FSE_DState_t;
+
+static void     FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt);
+
+static unsigned char FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+
+static unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr);
+
+/**<
+Let's now decompose FSE_decompress_usingDTable() into its unitary components.
+You will decode FSE-encoded symbols from the bitStream,
+and also any other bitFields you put in, **in reverse order**.
+
+You will need a few variables to track your bitStream. They are :
+
+BIT_DStream_t DStream;    // Stream context
+FSE_DState_t  DState;     // State context. Multiple ones are possible
+FSE_DTable*   DTablePtr;  // Decoding table, provided by FSE_buildDTable()
+
+The first thing to do is to init the bitStream.
+    errorCode = BIT_initDStream(&DStream, srcBuffer, srcSize);
+
+You should then retrieve your initial state(s)
+(in reverse flushing order if you have several ones) :
+    errorCode = FSE_initDState(&DState, &DStream, DTablePtr);
+
+You can then decode your data, symbol after symbol.
+For information the maximum number of bits per symbol is tableLog.
+Keep in mind that symbols are decoded in reverse order, like a LIFO stack (last in, first out).
+    unsigned char symbol = FSE_decodeSymbol(&DState, &DStream);
+
+You can retrieve any bitfield you eventually stored into the bitStream (in reverse order)
+Note : maximum allowed nbBits is 25, for 32-bits compatibility
+    size_t bitField = BIT_readBits(&DStream, nbBits);
+
+All above operations only read from local register (which size depends on size_t).
+Refueling the register from memory is manually performed by the reload method.
+    endSignal = BIT_reloadDStream(&DStream);
+
+BIT_reloadDStream() result tells if there is still some more data to read from DStream.
+BIT_DStream_unfinished : there is still some data left into the DStream.
+BIT_DStream_endOfBuffer : Dstream reached end of buffer. Its container may no longer be completely filled.
+BIT_DStream_completed : Dstream reached its exact end, corresponding in general to decompression completed.
+BIT_DStream_overflow : Dstream went too far. Decompression result is corrupted.
+
+When reaching end of buffer (BIT_DStream_endOfBuffer), progress slowly, notably if you decode multiple symbols per loop,
+to properly detect the exact end of stream.
+After each decoded symbol, check if DStream is fully consumed using this simple test :
+    BIT_reloadDStream(&DStream) >= BIT_DStream_completed
+
+When it's done, verify decompression is fully completed, by checking both DStream and the relevant states.
+Checking if DStream has reached its end is performed by :
+    BIT_endOfDStream(&DStream);
+Check also the states. There might be some symbols left there, if some high probability ones (>50%) are possible.
+    FSE_endOfDState(&DState);
+*/
+
+
+/* *****************************************
+*  FSE unsafe API
+*******************************************/
+static unsigned char FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD);
+/* faster, but works only if nbBits is always >= 1 (otherwise, result will be corrupted) */
+
+
+/* *****************************************
+*  Implementation of inlined functions
+*******************************************/
+typedef struct {
+    int deltaFindState;
+    U32 deltaNbBits;
+} FSE_symbolCompressionTransform; /* total 8 bytes */
+
+MEM_STATIC void FSE_initCState(FSE_CState_t* statePtr, const FSE_CTable* ct)
+{
+    const void* ptr = ct;
+    const U16* u16ptr = (const U16*) ptr;
+    const U32 tableLog = MEM_read16(ptr);
+    statePtr->value = (ptrdiff_t)1<<tableLog;
+    statePtr->stateTable = u16ptr+2;
+    statePtr->symbolTT = ct + 1 + (tableLog ? (1<<(tableLog-1)) : 1);
+    statePtr->stateLog = tableLog;
+}
+
+
+/*! FSE_initCState2() :
+*   Same as FSE_initCState(), but the first symbol to include (which will be the last to be read)
+*   uses the smallest state value possible, saving the cost of this symbol */
+MEM_STATIC void FSE_initCState2(FSE_CState_t* statePtr, const FSE_CTable* ct, U32 symbol)
+{
+    FSE_initCState(statePtr, ct);
+    {   const FSE_symbolCompressionTransform symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+        const U16* stateTable = (const U16*)(statePtr->stateTable);
+        U32 nbBitsOut  = (U32)((symbolTT.deltaNbBits + (1<<15)) >> 16);
+        statePtr->value = (nbBitsOut << 16) - symbolTT.deltaNbBits;
+        statePtr->value = stateTable[(statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+    }
+}
+
+MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, unsigned symbol)
+{
+    FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol];
+    const U16* const stateTable = (const U16*)(statePtr->stateTable);
+    U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16);
+    BIT_addBits(bitC, statePtr->value, nbBitsOut);
+    statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState];
+}
+
+MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr)
+{
+    BIT_addBits(bitC, statePtr->value, statePtr->stateLog);
+    BIT_flushBits(bitC);
+}
+
+
+/* FSE_getMaxNbBits() :
+ * Approximate maximum cost of a symbol, in bits.
+ * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + return (symbolTT[symbolValue].deltaNbBits + ((1<<16)-1)) >> 16; +} + +/* FSE_bitCost() : + * Approximate symbol cost, as fractional value, using fixed-point format (accuracyLog fractional bits) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ +MEM_STATIC U32 FSE_bitCost(const void* symbolTTPtr, U32 tableLog, U32 symbolValue, U32 accuracyLog) +{ + const FSE_symbolCompressionTransform* symbolTT = (const FSE_symbolCompressionTransform*) symbolTTPtr; + U32 const minNbBits = symbolTT[symbolValue].deltaNbBits >> 16; + U32 const threshold = (minNbBits+1) << 16; + assert(tableLog < 16); + assert(accuracyLog < 31-tableLog); /* ensure enough room for renormalization double shift */ + { U32 const tableSize = 1 << tableLog; + U32 const deltaFromThreshold = threshold - (symbolTT[symbolValue].deltaNbBits + tableSize); + U32 const normalizedDeltaFromThreshold = (deltaFromThreshold << accuracyLog) >> tableLog; /* linear interpolation (very approximate) */ + U32 const bitMultiplier = 1 << accuracyLog; + assert(symbolTT[symbolValue].deltaNbBits + tableSize <= threshold); + assert(normalizedDeltaFromThreshold <= bitMultiplier); + return (minNbBits+1)*bitMultiplier - normalizedDeltaFromThreshold; + } +} + + +/* ====== Decompression ====== */ + +typedef struct { + U16 tableLog; + U16 fastMode; +} FSE_DTableHeader; /* sizeof U32 */ + +typedef struct +{ + unsigned short newState; + unsigned char symbol; + unsigned char nbBits; +} FSE_decode_t; /* size == U32 */ + +MEM_STATIC void FSE_initDState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD, const FSE_DTable* dt) +{ + const void* ptr = dt; + const FSE_DTableHeader* const DTableH = (const FSE_DTableHeader*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +MEM_STATIC BYTE FSE_peekSymbol(const FSE_DState_t* DStatePtr) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + return DInfo.symbol; +} + +MEM_STATIC void FSE_updateState(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.newState + lowBits; +} + +MEM_STATIC BYTE FSE_decodeSymbol(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD) +{ + FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + BYTE const symbol = DInfo.symbol; + size_t const lowBits = BIT_readBits(bitD, nbBits); + + DStatePtr->state = DInfo.newState + lowBits; + return symbol; +} + +/*! 
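+Example :
+A typical inner decode loop over the primitives above (an illustrative sketch;
+`bitD`, `dt`, `op` and `oend` are assumed set up by the caller as shown in the
+FSE symbol decompression tutorial) :
+
+    FSE_DState_t state;
+    FSE_initDState(&state, &bitD, dt);
+    while ((BIT_reloadDStream(&bitD) == BIT_DStream_unfinished) && (op < oend)) {
+        *op++ = FSE_decodeSymbol(&state, &bitD);
+    }
+*/
+
+/*!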
FSE_decodeSymbolFast() :
+    unsafe, only works if no symbol has a probability > 50% */
+MEM_STATIC BYTE FSE_decodeSymbolFast(FSE_DState_t* DStatePtr, BIT_DStream_t* bitD)
+{
+    FSE_decode_t const DInfo = ((const FSE_decode_t*)(DStatePtr->table))[DStatePtr->state];
+    U32 const nbBits = DInfo.nbBits;
+    BYTE const symbol = DInfo.symbol;
+    size_t const lowBits = BIT_readBitsFast(bitD, nbBits);
+
+    DStatePtr->state = DInfo.newState + lowBits;
+    return symbol;
+}
+
+MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr)
+{
+    return DStatePtr->state == 0;
+}
+
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/* **************************************************************
+*  Tuning parameters
+****************************************************************/
+/*!MEMORY_USAGE :
+*  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
+*  Increasing memory usage improves compression ratio
+*  Reduced memory usage can improve speed, due to cache effect
+*  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
+#ifndef FSE_MAX_MEMORY_USAGE
+#  define FSE_MAX_MEMORY_USAGE 14
+#endif
+#ifndef FSE_DEFAULT_MEMORY_USAGE
+#  define FSE_DEFAULT_MEMORY_USAGE 13
+#endif
+
+/*!FSE_MAX_SYMBOL_VALUE :
+*  Maximum symbol value authorized.
+*  Required for proper stack allocation */
+#ifndef FSE_MAX_SYMBOL_VALUE
+#  define FSE_MAX_SYMBOL_VALUE 255
+#endif
+
+/* **************************************************************
+*  template functions type & suffix
+****************************************************************/
+#define FSE_FUNCTION_TYPE BYTE
+#define FSE_FUNCTION_EXTENSION
+#define FSE_DECODE_TYPE FSE_decode_t
+
+
+#endif   /* !FSE_COMMONDEFS_ONLY */
+
+
+/* ***************************************************************
+*  Constants
+*****************************************************************/
+#define FSE_MAX_TABLELOG  (FSE_MAX_MEMORY_USAGE-2)
+#define FSE_MAX_TABLESIZE (1U<<FSE_MAX_TABLELOG)
+#define FSE_MAXTABLESIZE_MASK (FSE_MAX_TABLESIZE-1)
+#define FSE_DEFAULT_TABLELOG (FSE_DEFAULT_MEMORY_USAGE-2)
+#define FSE_MIN_TABLELOG 5
+
+#define FSE_TABLELOG_ABSOLUTE_MAX 15
+#if FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX
+#  error "FSE_MAX_TABLELOG > FSE_TABLELOG_ABSOLUTE_MAX is not supported"
+#endif
+
+#define FSE_TABLESTEP(tableSize) ((tableSize>>1) + (tableSize>>3) + 3)
+
+
+#endif /* FSE_STATIC_LINKING_ONLY */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining fse.h ****/
+#define HUF_STATIC_LINKING_ONLY   /* HUF_TABLELOG_ABSOLUTEMAX */
+/**** start inlining huf.h ****/
+/* ******************************************************************
+ * huff0 huffman codec,
+ * part of Finite State Entropy library
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ * You can contact the author at :
+ * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef HUF_H_298734234
+#define HUF_H_298734234
+
+/* *** Dependencies *** */
+#include <stddef.h>    /* size_t */
+
+
+/* *** library symbols visibility *** */
+/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual,
+ *        HUF symbols remain "private" (internal symbols for library only).
+ * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +# define HUF_PUBLIC_API __declspec(dllexport) +#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +#else +# define HUF_PUBLIC_API +#endif + + +/* ========================== */ +/* *** simple functions *** */ +/* ========================== */ + +/** HUF_compress() : + * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. + * 'dst' buffer must be already allocated. + * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). + * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. + * @return : size of compressed data (<= `dstCapacity`). + * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! + * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) + */ +HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + +/** HUF_decompress() : + * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', + * into already allocated buffer 'dst', of minimum size 'dstSize'. + * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. + * Note : in contrast with FSE, HUF_decompress can regenerate + * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, + * because it knows size to regenerate (originalSize). + * @return : size of regenerated data (== originalSize), + * or an error code, which can be tested using HUF_isError() + */ +HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, + const void* cSrc, size_t cSrcSize); + + +/* *** Tool functions *** */ +#define HUF_BLOCKSIZE_MAX (128 * 1024) /**< maximum input size for a single block compressed with HUF_compress */ +HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /**< maximum compressed size (worst case) */ + +/* Error Management */ +HUF_PUBLIC_API unsigned HUF_isError(size_t code); /**< tells if a return value is an error code */ +HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /**< provides error code string (useful for debugging) */ + + +/* *** Advanced function *** */ + +/** HUF_compress2() : + * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. + * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . + * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog); + +/** HUF_compress4X_wksp() : + * Same as HUF_compress2(), but uses externally allocated `workSpace`. + * `workspace` must have minimum alignment of 4, and be at least as large as HUF_WORKSPACE_SIZE */ +#define HUF_WORKSPACE_SIZE ((6 << 10) + 256) +#define HUF_WORKSPACE_SIZE_U32 (HUF_WORKSPACE_SIZE / sizeof(U32)) +HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize); + +#endif /* HUF_H_298734234 */ + +/* ****************************************************************** + * WARNING !! 
+ * The following section contains advanced and experimental definitions + * which shall never be used in the context of a dynamic library, + * because they are not guaranteed to remain stable in the future. + * Only consider them in association with static linking. + * *****************************************************************/ +#if defined(HUF_STATIC_LINKING_ONLY) && !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +#define HUF_H_HUF_STATIC_LINKING_ONLY + +/* *** Dependencies *** */ +/**** skipping file: mem.h ****/ + + +/* *** Constants *** */ +#define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_ABSOLUTEMAX_TABLELOG */ +#define HUF_TABLELOG_DEFAULT 11 /* default tableLog value when none specified */ +#define HUF_SYMBOLVALUE_MAX 255 + +#define HUF_TABLELOG_ABSOLUTEMAX 15 /* absolute limit of HUF_MAX_TABLELOG. Beyond that value, code does not work */ +#if (HUF_TABLELOG_MAX > HUF_TABLELOG_ABSOLUTEMAX) +# error "HUF_TABLELOG_MAX is too large !" +#endif + + +/* **************************************** +* Static allocation +******************************************/ +/* HUF buffer bounds */ +#define HUF_CTABLEBOUND 129 +#define HUF_BLOCKBOUND(size) (size + (size>>8) + 8) /* only true when incompressible is pre-filtered with fast heuristic */ +#define HUF_COMPRESSBOUND(size) (HUF_CTABLEBOUND + HUF_BLOCKBOUND(size)) /* Macro version, useful for static allocation */ + +/* static allocation of HUF's Compression Table */ +#define HUF_CTABLE_SIZE_U32(maxSymbolValue) ((maxSymbolValue)+1) /* Use tables of U32, for proper alignment */ +#define HUF_CTABLE_SIZE(maxSymbolValue) (HUF_CTABLE_SIZE_U32(maxSymbolValue) * sizeof(U32)) +#define HUF_CREATE_STATIC_CTABLE(name, maxSymbolValue) \ + U32 name##hb[HUF_CTABLE_SIZE_U32(maxSymbolValue)]; \ + void* name##hv = &(name##hb); \ + HUF_CElt* name = (HUF_CElt*)(name##hv) /* no final ; */ + +/* static allocation of HUF's DTable */ +typedef U32 HUF_DTable; +#define HUF_DTABLE_SIZE(maxTableLog) (1 + (1<<(maxTableLog))) +#define HUF_CREATE_STATIC_DTABLEX1(DTable, maxTableLog) \ + HUF_DTable DTable[HUF_DTABLE_SIZE((maxTableLog)-1)] = { ((U32)((maxTableLog)-1) * 0x01000001) } +#define HUF_CREATE_STATIC_DTABLEX2(DTable, maxTableLog) \ + HUF_DTable DTable[HUF_DTABLE_SIZE(maxTableLog)] = { ((U32)(maxTableLog) * 0x01000001) } + + +/* **************************************** +* Advanced decompression functions +******************************************/ +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +#endif + +size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< decodes RLE and uncompressed */ +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< considers RLE and uncompressed as errors */ +size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< single-symbol decoder */ +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, 
size_t wkspSize); /**< single-symbol decoder */ +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /**< double-symbols decoder */ +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /**< double-symbols decoder */ +#endif + + +/* **************************************** + * HUF detailed API + * ****************************************/ + +/*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") + * 2. (optional) refine tableLog using HUF_optimalTableLog() + * 3. build Huffman table from count using HUF_buildCTable() + * 4. save Huffman table to memory buffer using HUF_writeCTable() + * 5. encode the data stream using HUF_compress4X_usingCTable() + * + * The following API allows targeting specific sub-functions for advanced tasks. + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +typedef struct HUF_CElt_s HUF_CElt; /* incomplete type */ +size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); +int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +typedef enum { + HUF_repeat_none, /**< Cannot use the previous table */ + HUF_repeat_check, /**< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /**< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; +/** HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. */ +size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2); + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
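+ * Illustrative call (a sketch; `tree`, `count` and `maxSymbolValue` come from
+ * the preceding counting step) :
+ *     unsigned workSpace[HUF_CTABLE_WORKSPACE_SIZE_U32];
+ *     size_t const maxNbBits = HUF_buildCTable_wksp(tree, count, maxSymbolValue,
+ *                                  HUF_TABLELOG_DEFAULT, workSpace, sizeof(workSpace));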
+ */ +#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) +#define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) +size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, + void* workSpace, size_t wkspSize); + +/*! HUF_readStats() : + * Read compact Huffman tree, saved by HUF_writeCTable(). + * `huffWeight` is destination buffer. + * @return : size read from `src` , or an error Code . + * Note : Needed by HUF_readCTable() and HUF_readDTableXn() . */ +size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize); + +/** HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned *hasZeroWeights); + +/** HUF_getNbBits() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX + * Note 1 : is not inlined, as HUF_CElt definition is private + * Note 2 : const void* used, so that it can provide a statically allocated table as argument (which uses type U32) */ +U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue); + +/* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics + * 2. build Huffman table from save, using HUF_readDTableX?() + * 3. decode 1 or 4 segments in parallel using HUF_decompress?X?_usingDTable() + */ + +/** HUF_selectDecoder() : + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . + * Assumption : 0 < dstSize <= 128 KB */ +U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + +/** + * The minimum workspace size for the `workSpace` used in + * HUF_readDTableX1_wksp() and HUF_readDTableX2_wksp(). + * + * The space used depends on HUF_TABLELOG_MAX, ranging from ~1500 bytes when + * HUF_TABLE_LOG_MAX=12 to ~1850 bytes when HUF_TABLE_LOG_MAX=15. + * Buffer overflow errors may potentially occur if code modifications result in + * a required workspace size greater than that specified in the following + * macro. 
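+ *
+ * Illustrative use (sketch; `dctx`, buffers and sizes provided by the caller) :
+ *     U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32];
+ *     HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize,
+ *                                   workSpace, sizeof(workSpace));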
+ */ +#define HUF_DECOMPRESS_WORKSPACE_SIZE (2 << 10) +#define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +#endif + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif +#ifndef HUF_FORCE_DECOMPRESS_X1 +size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +#endif + + +/* ====================== */ +/* single stream variants */ +/* ====================== */ + +size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /**< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +/** HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. + * If it doesn't, it sets *repeat = HUF_repeat_none, and it sets hufTable to the table used. + * If preferRepeat then the old table will always be used if valid. 
*/
+size_t HUF_compress1X_repeat(void* dst, size_t dstSize,
+                       const void* src, size_t srcSize,
+                       unsigned maxSymbolValue, unsigned tableLog,
+                       void* workSpace, size_t wkspSize,   /**< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */
+                       HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2);
+
+size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* single-symbol decoder */
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /* double-symbol decoder */
+#endif
+
+size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);
+size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< single-symbol decoder */
+size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< single-symbol decoder */
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize);   /**< double-symbols decoder */
+size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize);   /**< double-symbols decoder */
+#endif
+
+size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);   /**< automatic selection of single or double symbol decoder, based on DTable */
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+#ifndef HUF_FORCE_DECOMPRESS_X1
+size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable);
+#endif
+
+/* BMI2 variants.
+ * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0.
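+ * (Within this amalgamation the flag is usually derived once from cpuid;
+ *  see the inlined cpu.h helpers, e.g. ZSTD_cpuid_bmi2(ZSTD_cpuid()).
+ *  This note is illustrative; the exact helper names live in that header.)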
+ */
+size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+#ifndef HUF_FORCE_DECOMPRESS_X2
+size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+#endif
+size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2);
+size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2);
+
+#endif /* HUF_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining huf.h ****/
+
+
+/*===   Version   ===*/
+unsigned FSE_versionNumber(void) { return FSE_VERSION_NUMBER; }
+
+
+/*===   Error Management   ===*/
+unsigned FSE_isError(size_t code) { return ERR_isError(code); }
+const char* FSE_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+unsigned HUF_isError(size_t code) { return ERR_isError(code); }
+const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+
+/*-**************************************************************
+*  FSE NCount encoding-decoding
+****************************************************************/
+size_t FSE_readNCount (short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr,
+                 const void* headerBuffer, size_t hbSize)
+{
+    const BYTE* const istart = (const BYTE*) headerBuffer;
+    const BYTE* const iend = istart + hbSize;
+    const BYTE* ip = istart;
+    int nbBits;
+    int remaining;
+    int threshold;
+    U32 bitStream;
+    int bitCount;
+    unsigned charnum = 0;
+    int previous0 = 0;
+
+    if (hbSize < 4) {
+        /* This function only works when hbSize >= 4 */
+        char buffer[4];
+        memset(buffer, 0, sizeof(buffer));
+        memcpy(buffer, headerBuffer, hbSize);
+        {   size_t const countSize = FSE_readNCount(normalizedCounter, maxSVPtr, tableLogPtr,
+                                                    buffer, sizeof(buffer));
+            if (FSE_isError(countSize)) return countSize;
+            if (countSize > hbSize) return ERROR(corruption_detected);
+            return countSize;
+    }   }
+    assert(hbSize >= 4);
+
+    /* init */
+    memset(normalizedCounter, 0, (*maxSVPtr+1) * sizeof(normalizedCounter[0]));   /* all symbols not present in NCount have a frequency of 0 */
+    bitStream = MEM_readLE32(ip);
+    nbBits = (bitStream & 0xF) + FSE_MIN_TABLELOG;   /* extract tableLog */
+    if (nbBits > FSE_TABLELOG_ABSOLUTE_MAX) return ERROR(tableLog_tooLarge);
+    bitStream >>= 4;
+    bitCount = 4;
+    *tableLogPtr = nbBits;
+    remaining = (1<<nbBits)+1;
+    threshold = 1<<nbBits;
+    nbBits++;
+
+    while ((remaining>1) & (charnum<=*maxSVPtr)) {
+        if (previous0) {
+            unsigned n0 = charnum;
+            while ((bitStream & 0xFFFF) == 0xFFFF) {
+                n0 += 24;
+                if (ip < iend-5) {
+                    ip += 2;
+                    bitStream = MEM_readLE32(ip) >> bitCount;
+                } else {
+                    bitStream >>= 16;
+                    bitCount   += 16;
+            }   }
+            while ((bitStream & 3) == 3) {
+                n0 += 3;
+                bitStream >>= 2;
+                bitCount += 2;
+            }
+            n0 += bitStream & 3;
+            bitCount += 2;
+            if (n0 > *maxSVPtr) return ERROR(maxSymbolValue_tooSmall);
+            while (charnum < n0) normalizedCounter[charnum++] = 0;
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                assert((bitCount >> 3) <= 3); /* For first condition to work */
+                ip += bitCount>>3;
+                bitCount &= 7;
+                bitStream = MEM_readLE32(ip) >> bitCount;
+            } else {
+                bitStream >>= 2;
+        }   }
+        {   int const max = (2*threshold-1) - remaining;
+            int count;
+
+            if ((bitStream & (threshold-1)) < (U32)max) {
+                count = bitStream & (threshold-1);
+                bitCount += nbBits-1;
+            } else {
+                count = bitStream &
+                count = bitStream & (2*threshold-1);
+                if (count >= threshold) count -= max;
+                bitCount += nbBits;
+            }
+
+            count--;   /* extra accuracy */
+            remaining -= count < 0 ? -count : count;   /* -1 means +1 */
+            normalizedCounter[charnum++] = (short)count;
+            previous0 = !count;
+            while (remaining < threshold) {
+                nbBits--;
+                threshold >>= 1;
+            }
+
+            if ((ip <= iend-7) || (ip + (bitCount>>3) <= iend-4)) {
+                ip += bitCount>>3;
+                bitCount &= 7;
+            } else {
+                bitCount -= (int)(8 * (iend - 4 - ip));
+                ip = iend - 4;
+            }
+            bitStream = MEM_readLE32(ip) >> (bitCount & 31);
+    }   }   /* while ((remaining>1) & (charnum<=*maxSVPtr)) */
+    if (remaining != 1) return ERROR(corruption_detected);
+    if (bitCount > 32) return ERROR(corruption_detected);
+    *maxSVPtr = charnum-1;
+
+    ip += (bitCount+7)>>3;
+    return ip-istart;
+}
+
+
+/*! HUF_readStats() :
+    Read compact Huffman tree, saved by HUF_writeCTable().
+    `huffWeight` is destination buffer.
+    `rankStats` is assumed to be a table of at least HUF_TABLELOG_MAX U32.
+    @return : size read from `src` , or an error Code .
+    Note : Needed by HUF_readCTable() and HUF_readDTableX?() .
+*/
+size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats,
+                     U32* nbSymbolsPtr, U32* tableLogPtr,
+                     const void* src, size_t srcSize)
+{
+    U32 weightTotal;
+    const BYTE* ip = (const BYTE*) src;
+    size_t iSize;
+    size_t oSize;
+
+    if (!srcSize) return ERROR(srcSize_wrong);
+    iSize = ip[0];
+    /* memset(huffWeight, 0, hwSize);   *//* is not necessary, even though some analyzer complain ... */
+
+    if (iSize >= 128) {  /* special header */
+        oSize = iSize - 127;
+        iSize = ((oSize+1)/2);
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        if (oSize >= hwSize) return ERROR(corruption_detected);
+        ip += 1;
+        {   U32 n;
+            for (n=0; n<oSize; n+=2) {
+                huffWeight[n]   = ip[n/2] >> 4;
+                huffWeight[n+1] = ip[n/2] & 15;
+    }   }   }
+    else  {   /* header compressed with FSE (normal case) */
+        FSE_DTable fseWorkspace[FSE_DTABLE_SIZE_U32(6)];  /* 6 is max possible tableLog for HUF header (maybe even 5, to be tested) */
+        if (iSize+1 > srcSize) return ERROR(srcSize_wrong);
+        oSize = FSE_decompress_wksp(huffWeight, hwSize-1, ip+1, iSize, fseWorkspace, 6);   /* max (hwSize-1) values decoded, as last one is implied */
+        if (FSE_isError(oSize)) return oSize;
+    }
+
+    /* collect weight stats */
+    memset(rankStats, 0, (HUF_TABLELOG_MAX + 1) * sizeof(U32));
+    weightTotal = 0;
+    {   U32 n; for (n=0; n<oSize; n++) {
+            if (huffWeight[n] >= HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+            rankStats[huffWeight[n]]++;
+            weightTotal += (1 << huffWeight[n]) >> 1;
+    }   }
+    if (weightTotal == 0) return ERROR(corruption_detected);
+
+    /* get last non-null symbol weight (implied, total must be 2^n) */
+    {   U32 const tableLog = BIT_highbit32(weightTotal) + 1;
+        if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected);
+        *tableLogPtr = tableLog;
+        /* determine last weight */
+        {   U32 const total = 1 << tableLog;
+            U32 const rest = total - weightTotal;
+            U32 const verif = 1 << BIT_highbit32(rest);
+            U32 const lastWeight = BIT_highbit32(rest) + 1;
+            if (verif != rest) return ERROR(corruption_detected);    /* last value must be a clean power of 2 */
+            huffWeight[oSize] = (BYTE)lastWeight;
+            rankStats[lastWeight]++;
+    }   }
+
+    /* check tree construction validity */
+    if ((rankStats[1] < 2) || (rankStats[1] & 1)) return ERROR(corruption_detected);  /* by construction : at least 2 elts of rank 1, must be even */
+
+    /* results */
+    *nbSymbolsPtr = (U32)(oSize+1);
+    return iSize+1;
+}
+/**** ended inlining common/entropy_common.c ****/
+/**** start inlining common/error_private.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet,
Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* The purpose of this file is to have a single list of error strings embedded in binary */ + +/**** skipping file: error_private.h ****/ + +const char* ERR_getErrorString(ERR_enum code) +{ +#ifdef ZSTD_STRIP_ERROR_STRINGS + (void)code; + return "Error strings stripped"; +#else + static const char* const notErrorCode = "Unspecified error code"; + switch( code ) + { + case PREFIX(no_error): return "No error detected"; + case PREFIX(GENERIC): return "Error (generic)"; + case PREFIX(prefix_unknown): return "Unknown frame descriptor"; + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; + case PREFIX(corruption_detected): return "Corrupted block detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; + case PREFIX(workSpace_tooSmall): return "workSpace buffer is not large enough"; + case PREFIX(stage_wrong): return "Operation not authorized at current processing stage"; + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(maxCode): + default: return notErrorCode; + } +#endif +} +/**** ended inlining common/error_private.c ****/ +/**** start inlining common/fse_decompress.c ****/ +/* ****************************************************************** + * FSE : Finite State Entropy decoder + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). 
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+/**** skipping file: bitstream.h ****/
+/**** skipping file: compiler.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: fse.h ****/
+/**** skipping file: error_private.h ****/
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+#define FSE_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+#  error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+#  error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+FSE_DTable* FSE_createDTable (unsigned tableLog)
+{
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    return (FSE_DTable*)malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) );
+}
+
+void FSE_freeDTable (FSE_DTable* dt)
+{
+    free(dt);
+}
+
+size_t FSE_buildDTable(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    void* const tdPtr = dt+1;   /* because *dt is unsigned, 32-bits aligned on 32-bits */
+    FSE_DECODE_TYPE* const tableDecode = (FSE_DECODE_TYPE*) (tdPtr);
+    U16 symbolNext[FSE_MAX_SYMBOL_VALUE+1];
+
+    U32 const maxSV1 = maxSymbolValue + 1;
+    U32 const tableSize = 1 << tableLog;
+    U32 highThreshold = tableSize-1;
+
+    /* Sanity Checks */
+    if (maxSymbolValue > FSE_MAX_SYMBOL_VALUE) return ERROR(maxSymbolValue_tooLarge);
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+
+    /* Init, lay down lowprob symbols */
+    {   FSE_DTableHeader DTableH;
+        DTableH.tableLog = (U16)tableLog;
+        DTableH.fastMode = 1;
+        {   S16 const largeLimit= (S16)(1 << (tableLog-1));
+            U32 s;
+            for (s=0; s<maxSV1; s++) {
+                if (normalizedCounter[s]==-1) {
+                    tableDecode[highThreshold--].symbol = (FSE_FUNCTION_TYPE)s;
+                    symbolNext[s] = 1;
+                } else {
+                    if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0;
+                    symbolNext[s] = normalizedCounter[s];
+        }   }   }
+        memcpy(dt, &DTableH, sizeof(DTableH));
+    }
+
+    /* Spread symbols */
+    {   U32 const tableMask = tableSize-1;
+        U32 const step = FSE_TABLESTEP(tableSize);
+        U32 s, position = 0;
+        for (s=0; s<maxSV1; s++) {
+            int i;
+            for (i=0; i<normalizedCounter[s]; i++) {
+                tableDecode[position].symbol = (FSE_FUNCTION_TYPE)s;
+                position = (position + step) & tableMask;
+                while (position > highThreshold) position = (position + step) & tableMask;   /* lowprob area */
+        }   }
+        if (position!=0) return ERROR(GENERIC);   /* position must reach all cells once, otherwise normalizedCounter is incorrect */
+    }
+
+    /* Build Decoding table */
+    {   U32 u;
+        for (u=0; u<tableSize; u++) {
+            FSE_FUNCTION_TYPE const symbol = (FSE_FUNCTION_TYPE)(tableDecode[u].symbol);
+            U32 const nextState = symbolNext[symbol]++;
+            tableDecode[u].nbBits = (BYTE) (tableLog - BIT_highbit32(nextState) );
+            tableDecode[u].newState = (U16) ( (nextState << tableDecode[u].nbBits) - tableSize);
+    }   }
+
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+/*-*******************************************************
+*  Decompression (Byte symbols)
+*********************************************************/
+size_t FSE_buildDTable_rle (FSE_DTable* dt, BYTE symbolValue)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const cell = (FSE_decode_t*)dPtr;
+
+    DTableH->tableLog = 0;
+    DTableH->fastMode = 0;
+
+    cell->newState = 0;
+    cell->symbol = symbolValue;
+    cell->nbBits = 0;
+
+    return 0;
+}
+
+
+size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits)
+{
+    void* ptr = dt;
+    FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr;
+    void* dPtr = dt + 1;
+    FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr;
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSV1 = tableMask+1;
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);         /* min size */
+
+    /* Build Decoding Table */
+    DTableH->tableLog = (U16)nbBits;
+    DTableH->fastMode = 1;
+    for (s=0; s<maxSV1; s++) {
+        dinfo[s].newState = 0;
+        dinfo[s].symbol = (BYTE)s;
+        dinfo[s].nbBits = (BYTE)nbBits;
+    }
+
+    return 0;
+}
+
+FORCE_INLINE_TEMPLATE size_t FSE_decompress_usingDTable_generic(
+          void* dst, size_t maxDstSize,
+    const void* cSrc, size_t cSrcSize,
+    const FSE_DTable* dt, const unsigned fast)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const omax = op + maxDstSize;
+    BYTE* const olimit = omax-3;
+
+    BIT_DStream_t bitD;
+    FSE_DState_t state1;
+    FSE_DState_t state2;
+
+    /* Init */
+    CHECK_F(BIT_initDStream(&bitD, cSrc, cSrcSize));
+
+    FSE_initDState(&state1, &bitD, dt);
+    FSE_initDState(&state2, &bitD, dt);
+
+#define FSE_GETSYMBOL(statePtr) fast ? FSE_decodeSymbolFast(statePtr, &bitD) : FSE_decodeSymbol(statePtr, &bitD)
+
+    /* 4 symbols per loop */
+    for ( ; (BIT_reloadDStream(&bitD)==BIT_DStream_unfinished) & (op<olimit) ; op+=4) {
+        op[0] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[1] = FSE_GETSYMBOL(&state2);
+
+        if (FSE_MAX_TABLELOG*4+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            { if (BIT_reloadDStream(&bitD) > BIT_DStream_unfinished) { op+=2; break; } }
+
+        op[2] = FSE_GETSYMBOL(&state1);
+
+        if (FSE_MAX_TABLELOG*2+7 > sizeof(bitD.bitContainer)*8)    /* This test must be static */
+            BIT_reloadDStream(&bitD);
+
+        op[3] = FSE_GETSYMBOL(&state2);
+    }
+
+    /* tail */
+    /* note : BIT_reloadDStream(&bitD) >= FSE_DStream_partiallyFilled; Ends at exactly BIT_DStream_completed */
+    while (1) {
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state1);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state2);
+            break;
+        }
+
+        if (op>(omax-2)) return ERROR(dstSize_tooSmall);
+        *op++ = FSE_GETSYMBOL(&state2);
+        if (BIT_reloadDStream(&bitD)==BIT_DStream_overflow) {
+            *op++ = FSE_GETSYMBOL(&state1);
+            break;
+    }   }
+
+    return op-ostart;
+}
+
+
+size_t FSE_decompress_usingDTable(void* dst, size_t originalSize,
+                            const void* cSrc, size_t cSrcSize,
+                            const FSE_DTable* dt)
+{
+    const void* ptr = dt;
+    const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr;
+    const U32 fastMode = DTableH->fastMode;
+
+    /* select fast mode (static) */
+    if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1);
+    return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0);
+}
+
+
+size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, FSE_DTable* workSpace, unsigned maxLog)
+{
+    const BYTE* const istart = (const BYTE*)cSrc;
+    const BYTE* ip = istart;
+    short counting[FSE_MAX_SYMBOL_VALUE+1];
+    unsigned tableLog;
+    unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+
+    /* normal FSE decoding mode */
+    size_t const NCountLength = FSE_readNCount (counting, &maxSymbolValue, &tableLog, istart, cSrcSize);
+    if (FSE_isError(NCountLength)) return NCountLength;
+    /* if (NCountLength >= cSrcSize) return ERROR(srcSize_wrong); */  /* too small input size; supposed to be already checked in NCountLength, only remaining case : NCountLength==cSrcSize */
+    if (tableLog > maxLog) return ERROR(tableLog_tooLarge);
+    ip += NCountLength;
+    cSrcSize -= NCountLength;
+
+    CHECK_F( FSE_buildDTable (workSpace, counting, maxSymbolValue, tableLog) );
+
+    return FSE_decompress_usingDTable (dst, dstCapacity, ip, cSrcSize, workSpace);   /* always return, even if it is an error code */
+}
+
+
+typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)];
+
+size_t FSE_decompress(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize)
+{
+    DTable_max_t dt;   /* Static analyzer seems unable to understand this table will be properly initialized later */
+    return FSE_decompress_wksp(dst, dstCapacity, cSrc, cSrcSize, dt, FSE_MAX_TABLELOG);
+}
+
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+/**** ended inlining common/fse_decompress.c ****/
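+
+/* Illustrative recap : the FSE decoding pipeline defined above is normally
+ * driven exactly as FSE_decompress_wksp() chains it together, i.e. read the
+ * normalized counts, build a decoding table, then decode. In this sketch,
+ * `cSrc`/`cSrcSize`, `dst`/`dstCapacity` and `dtable` stand for caller-provided
+ * buffers, and error checks on the returned size_t values are omitted :
+ *
+ *     short counts[FSE_MAX_SYMBOL_VALUE+1];
+ *     unsigned maxSym = FSE_MAX_SYMBOL_VALUE, tableLog;
+ *     size_t const hSize = FSE_readNCount(counts, &maxSym, &tableLog, cSrc, cSrcSize);
+ *     FSE_buildDTable(dtable, counts, maxSym, tableLog);   // dtable : caller-provided FSE_DTable[]
+ *     FSE_decompress_usingDTable(dst, dstCapacity,
+ *                                (const char*)cSrc + hSize, cSrcSize - hSize, dtable);
+ */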
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+
+/* ======   Dependencies   ======= */
+#include <stddef.h>    /* size_t */
+/**** skipping file: debug.h ****/
+/**** start inlining zstd_internal.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_CCOMMON_H_MODULE
+#define ZSTD_CCOMMON_H_MODULE
+
+/* this module contains definitions which must be identical
+ * across compression, decompression and dictBuilder.
+ * It also contains a few functions useful to at least 2 of them
+ * and which benefit from being inlined */
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#ifdef __aarch64__
+#include <arm_neon.h>
+#endif
+/**** skipping file: compiler.h ****/
+/**** skipping file: mem.h ****/
+/**** skipping file: debug.h ****/
+/**** skipping file: error_private.h ****/
+#define ZSTD_STATIC_LINKING_ONLY
+/**** start inlining ../zstd.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef ZSTD_H_235446
+#define ZSTD_H_235446
+
+/* ======   Dependency   ======*/
+#include <limits.h>   /* INT_MAX */
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDLIB_VISIBILITY
+# if defined(__GNUC__) && (__GNUC__ >= 4)
+#  define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default")))
+# else
+#  define ZSTDLIB_VISIBILITY
+# endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/
+#else
+# define ZSTDLIB_API ZSTDLIB_VISIBILITY
+#endif
+
+
+/*******************************************************************************
+  Introduction
+
+  zstd, short for Zstandard, is a fast lossless compression algorithm, targeting
+  real-time compression scenarios at zlib-level and better compression ratios.
+  The zstd compression library provides in-memory compression and decompression
+  functions.
+
+  The library supports regular compression levels from 1 up to ZSTD_maxCLevel(),
+  which is currently 22. Levels >= 20, labeled `--ultra`, should be used with
+  caution, as they require more memory. The library also offers negative
+  compression levels, which extend the range of speed vs. ratio preferences.
+  The lower the level, the faster the speed (at the cost of compression).
+
+  Compression can be done in:
+  - a single step (described as Simple API)
+  - a single step, reusing a context (described as Explicit context)
+  - unbounded multiple steps (described as Streaming compression)
+
+  The compression ratio achievable on small data can be highly improved using
+  a dictionary. Dictionary compression can be performed in:
+  - a single step (described as Simple dictionary API)
+  - a single step, reusing a dictionary (described as Bulk-processing
+    dictionary API)
+
+  Advanced experimental functions can be accessed using
+  `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h.
+
+  Advanced experimental APIs should never be used with a dynamically-linked
+  library. They are not "stable"; their definitions or signatures may change in
+  the future. Only static linking is allowed.
+*******************************************************************************/
+
+/*------   Version   ------*/
+#define ZSTD_VERSION_MAJOR    1
+#define ZSTD_VERSION_MINOR    4
+#define ZSTD_VERSION_RELEASE  5
+
+#define ZSTD_VERSION_NUMBER  (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE)
+ZSTDLIB_API unsigned ZSTD_versionNumber(void);   /**< to check runtime library version */
+
+#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE
+#define ZSTD_QUOTE(str) #str
+#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str)
+#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION)
+ZSTDLIB_API const char* ZSTD_versionString(void);   /* requires v1.3.0+ */
+
+/* *************************************
+ *  Default constant
+ ***************************************/
+#ifndef ZSTD_CLEVEL_DEFAULT
+# define ZSTD_CLEVEL_DEFAULT 3
+#endif
+
+/* *************************************
+ *  Constants
+ ***************************************/
+
+/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */
+#define ZSTD_MAGICNUMBER            0xFD2FB528    /* valid since v0.8.0 */
+#define ZSTD_MAGIC_DICTIONARY       0xEC30A437    /* valid since v0.7.0 */
+#define ZSTD_MAGIC_SKIPPABLE_START  0x184D2A50    /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */
+#define ZSTD_MAGIC_SKIPPABLE_MASK   0xFFFFFFF0
+
+#define ZSTD_BLOCKSIZELOG_MAX  17
+#define ZSTD_BLOCKSIZE_MAX     (1<<ZSTD_BLOCKSIZELOG_MAX)
+
+
+/***************************************
+*  Simple API
+***************************************/
+/*! ZSTD_compress() :
+ *  Compresses `src` content as a single zstd compressed frame into already allocated `dst`.
+ *  Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ *  @return : compressed size written into `dst` (<= `dstCapacity`),
+ *            or an error code if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity,
+                            const void* src, size_t srcSize,
+                                  int compressionLevel);
+
+/*! ZSTD_decompress() :
+ *  `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames.
+ *  `dstCapacity` is an upper bound of originalSize to regenerate.
+ *  If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data.
+ *  @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+ *            or an errorCode if it fails (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity,
+                              const void* src, size_t compressedSize);
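+
+/* Illustrative sketch of a single-shot round trip with the Simple API above.
+ * This is only an orientation example : `src`/`srcSize` stand for a
+ * caller-provided input buffer, malloc() needs <stdlib.h>, and error paths
+ * are abbreviated :
+ *
+ *     size_t const bound = ZSTD_compressBound(srcSize);
+ *     void*  const cBuf  = malloc(bound);
+ *     size_t const cSize = ZSTD_compress(cBuf, bound, src, srcSize, ZSTD_CLEVEL_DEFAULT);
+ *     if (ZSTD_isError(cSize)) { ... }        // see ZSTD_getErrorName()
+ *     void*  const rBuf  = malloc(srcSize);   // original size known to caller
+ *     size_t const rSize = ZSTD_decompress(rBuf, srcSize, cBuf, cSize);
+ *     // on success, rSize == srcSize and rBuf matches src byte-for-byte
+ */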
+
+/*! ZSTD_getFrameContentSize() : requires v1.3.0+
+ * `src` should point to the start of a ZSTD encoded frame.
+ * `srcSize` must be at least as large as the frame header.
+ *            hint : any size >= `ZSTD_frameHeaderSize_max` is large enough.
+ * @return : - decompressed size of `src` frame content, if known
+ *           - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined
+ *           - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small)
+ *   note 1 : a 0 return value means the frame is valid but "empty".
+ *   note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *            Optionally, application can rely on some implicit limit,
+ *            as ZSTD_decompress() only needs an upper bound of decompressed size.
+ *            (For example, data could be necessarily cut into blocks <= 16 KB).
+ *   note 3 : decompressed size is always present when compression is completed using single-pass functions,
+ *            such as ZSTD_compress(), ZSTD_compressCCtx(), ZSTD_compress_usingDict() or ZSTD_compress_usingCDict().
+ *   note 4 : decompressed size can be very large (64-bits value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure return value fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 6 : This function replaces ZSTD_getDecompressedSize() */
+#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1)
+#define ZSTD_CONTENTSIZE_ERROR   (0ULL - 2)
+ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize);
+
+/*! ZSTD_getDecompressedSize() :
+ *  NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize().
+ *  Both functions work the same way, but ZSTD_getDecompressedSize() blends
+ *  "empty", "unknown" and "error" results to the same return value (0),
+ *  while ZSTD_getFrameContentSize() gives them separate return values.
+ * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */
+ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_findFrameCompressedSize() :
+ * `src` should point to the start of a ZSTD frame or skippable frame.
+ * `srcSize` must be >= first frame size
+ * @return : the compressed size of the first frame starting at `src`,
+ *           suitable to pass as `srcSize` to `ZSTD_decompress` or similar,
+ *           or an error code if input is invalid */
+ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize);
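+
+/* Illustrative sketch : sizing a destination buffer from the frame header,
+ * per the notes above. `cBuf`/`cSize` stand for one complete, caller-provided
+ * compressed frame ; error paths are abbreviated :
+ *
+ *     unsigned long long const rSize = ZSTD_getFrameContentSize(cBuf, cSize);
+ *     if (rSize == ZSTD_CONTENTSIZE_ERROR)   { ... }   // not a valid zstd frame
+ *     if (rSize == ZSTD_CONTENTSIZE_UNKNOWN) { ... }   // fall back to streaming decompression
+ *     void* const rBuf = malloc((size_t)rSize);        // check rSize against application limits first (note 5)
+ *     size_t const dSize = ZSTD_decompress(rBuf, (size_t)rSize, cBuf, cSize);
+ */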
+
+
+/*====== Helper functions ======*/
+#define ZSTD_COMPRESSBOUND(srcSize)   ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ?  (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0))  /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */
+ZSTDLIB_API size_t      ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */
+ZSTDLIB_API unsigned    ZSTD_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIB_API const char* ZSTD_getErrorName(size_t code);     /*!< provides readable string from an error code */
+ZSTDLIB_API int         ZSTD_minCLevel(void);               /*!< minimum negative compression level allowed */
+ZSTDLIB_API int         ZSTD_maxCLevel(void);               /*!< maximum compression level available */
+
+
+/***************************************
+*  Explicit context
+***************************************/
+/*= Compression context
+ *  When compressing many times,
+ *  it is recommended to allocate a context just once,
+ *  and re-use it for each successive compression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Note 1 : re-using context is just a speed / resource optimization.
+ *         It doesn't change the compression ratio, which remains identical.
+ *  Note 2 : In multi-threaded environments,
+ *         use one different context per thread for parallel execution.
+ */
+typedef struct ZSTD_CCtx_s ZSTD_CCtx;
+ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeCCtx(ZSTD_CCtx* cctx);
+
+/*! ZSTD_compressCCtx() :
+ *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ *  Important : in order to behave similarly to `ZSTD_compress()`,
+ *  this function compresses at requested compression level,
+ *  __ignoring any other parameter__ .
+ *  If any advanced parameters were set using the advanced API,
+ *  they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     int compressionLevel);
+
+/*= Decompression context
+ *  When decompressing many times,
+ *  it is recommended to allocate a context only once,
+ *  and re-use it for each successive decompression operation.
+ *  This will make workload friendlier for system's memory.
+ *  Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ *  Same as ZSTD_decompress(),
+ *  requires an allocated ZSTD_DCtx.
+ *  Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize);
+
+
+/***************************************
+*  Advanced compression API
+***************************************/
+
+/* API design :
+ *   Parameters are pushed one by one into an existing context,
+ *   using ZSTD_CCtx_set*() functions.
+ *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+ *
+ *   This API supersedes all other "advanced" API entry points in the experimental section.
+ *   In the future, we expect to remove API entry points from the experimental section which are redundant with this API.
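+ *
+ *   Illustrative sketch of this flow (error checks omitted ; `src`/`srcSize`
+ *   and `dst`/`dstCapacity` stand for caller-provided buffers) :
+ *
+ *       ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ *       ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ *       size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *       ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);   // back to defaults
+ *       ZSTD_freeCCtx(cctx);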
+ */ + + +/* Compression strategies, listed from fastest to strongest */ +typedef enum { ZSTD_fast=1, + ZSTD_dfast=2, + ZSTD_greedy=3, + ZSTD_lazy=4, + ZSTD_lazy2=5, + ZSTD_btlazy2=6, + ZSTD_btopt=7, + ZSTD_btultra=8, + ZSTD_btultra2=9 + /* note : new strategies _might_ be added in the future. + Only the order (from fast to strong) is guaranteed */ +} ZSTD_strategy; + + +typedef enum { + + /* compression parameters + * Note: When compressing with a ZSTD_CDict these parameters are superseded + * by the parameters used to construct the ZSTD_CDict. + * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */ + ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table. + * Note that exact compression parameters are dynamically determined, + * depending on both compression level and srcSize (when known). + * Default level is ZSTD_CLEVEL_DEFAULT==3. + * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT. + * Note 1 : it's possible to pass a negative compression level. + * Note 2 : setting a level does not automatically set all other compression parameters + * to default. Setting this will however eventually dynamically impact the compression + * parameters which have not been manually set. The manually set + * ones will 'stick'. */ + /* Advanced compression parameters : + * It's possible to pin down compression parameters to some specific values. + * In which case, these values are no longer dynamically selected by the compressor */ + ZSTD_c_windowLog=101, /* Maximum allowed back-reference distance, expressed as power of 2. + * This will set a memory budget for streaming decompression, + * with larger values requiring more memory + * and typically compressing more. + * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX. + * Special: value 0 means "use default windowLog". + * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT + * requires explicitly allowing such size at streaming decompression stage. */ + ZSTD_c_hashLog=102, /* Size of the initial probe table, as a power of 2. + * Resulting memory usage is (1 << (hashLog+2)). + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX. + * Larger tables improve compression ratio of strategies <= dFast, + * and improve speed of strategies > dFast. + * Special: value 0 means "use default hashLog". */ + ZSTD_c_chainLog=103, /* Size of the multi-probe search table, as a power of 2. + * Resulting memory usage is (1 << (chainLog+2)). + * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX. + * Larger tables result in better and slower compression. + * This parameter is useless for "fast" strategy. + * It's still useful when using "dfast" strategy, + * in which case it defines a secondary probe table. + * Special: value 0 means "use default chainLog". */ + ZSTD_c_searchLog=104, /* Number of search attempts, as a power of 2. + * More attempts result in better and slower compression. + * This parameter is useless for "fast" and "dFast" strategies. + * Special: value 0 means "use default searchLog". */ + ZSTD_c_minMatch=105, /* Minimum size of searched matches. + * Note that Zstandard can still find matches of smaller size, + * it just tweaks its search algorithm to look for this size and larger. + * Larger values increase compression and decompression speed, but decrease ratio. + * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX. + * Note that currently, for all strategies < btopt, effective minimum is 4. 
+ * , for all strategies > fast, effective maximum is 6. + * Special: value 0 means "use default minMatchLength". */ + ZSTD_c_targetLength=106, /* Impact of this field depends on strategy. + * For strategies btopt, btultra & btultra2: + * Length of Match considered "good enough" to stop search. + * Larger values make compression stronger, and slower. + * For strategy fast: + * Distance between match sampling. + * Larger values make compression faster, and weaker. + * Special: value 0 means "use default targetLength". */ + ZSTD_c_strategy=107, /* See ZSTD_strategy enum definition. + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ + + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio + * for large inputs, by finding large matches at long distance. + * It increases memory usage and window size. + * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB + * except when expressly set to a different value. */ + ZSTD_c_ldmHashLog=161, /* Size of the table for long distance matching, as a power of 2. + * Larger values increase memory usage and compression ratio, + * but decrease compression speed. + * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX + * default: windowlog - 7. + * Special: value 0 means "automatically determine hashlog". */ + ZSTD_c_ldmMinMatch=162, /* Minimum match size for long distance matcher. + * Larger/too small values usually decrease compression ratio. + * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX. + * Special: value 0 means "use default value" (default: 64). */ + ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution. + * Larger values improve collision resolution but decrease compression speed. + * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX. + * Special: value 0 means "use default value" (default: 3). */ + ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table. + * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN). + * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage. + * Larger values improve compression speed. + * Deviating far from default value will likely result in a compression ratio decrease. + * Special: value 0 means "automatically determine hashRateLog". */ + + /* frame parameters */ + ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1) + * Content size must be known at the beginning of compression. + * This is automatically the case when using ZSTD_compress2(), + * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */ + ZSTD_c_checksumFlag=201, /* A 32-bits checksum of content is written at end of frame (default:0) */ + ZSTD_c_dictIDFlag=202, /* When applicable, dictionary's ID is written into frame header (default:1) */ + + /* multi-threading parameters */ + /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD). + * They return an error otherwise. */ + ZSTD_c_nbWorkers=400, /* Select how many threads will be spawned to compress in parallel. 
+ * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() : + * ZSTD_compressStream*() consumes input and flush output if possible, but immediately gives back control to caller, + * while compression work is performed in parallel, within worker threads. + * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end : + * in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call). + * More workers improve speed, but also increase memory usage. + * Default value is `0`, aka "single-threaded mode" : no worker is spawned, compression is performed inside Caller's thread, all invocations are blocking */ + ZSTD_c_jobSize=401, /* Size of a compression job. This value is enforced only when nbWorkers >= 1. + * Each compression job is completed in parallel, so this value can indirectly impact the nb of active threads. + * 0 means default, which is dynamically determined based on compression parameters. + * Job size must be a minimum of overlap size, or 1 MB, whichever is largest. + * The minimum size is automatically and transparently enforced. */ + ZSTD_c_overlapLog=402, /* Control the overlap size, as a fraction of window size. + * The overlap size is an amount of data reloaded from previous job at the beginning of a new job. + * It helps preserve compression ratio, while each job is compressed in parallel. + * This value is enforced only when nbWorkers >= 1. + * Larger values increase compression ratio, but decrease speed. + * Possible values range from 0 to 9 : + * - 0 means "default" : value will be determined by the library, depending on strategy + * - 1 means "no overlap" + * - 9 means "full overlap", using a full window size. + * Each intermediate rank increases/decreases load size by a factor 2 : + * 9: full window; 8: w/2; 7: w/4; 6: w/8; 5:w/16; 4: w/32; 3:w/64; 2:w/128; 1:no overlap; 0:default + * default value varies between 6 and 9, depending on strategy */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_c_rsyncable + * ZSTD_c_format + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode + * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. + */ + ZSTD_c_experimentalParam1=500, + ZSTD_c_experimentalParam2=10, + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, + ZSTD_c_experimentalParam6=1003, + ZSTD_c_experimentalParam7=1004 +} ZSTD_cParameter; + +typedef struct { + size_t error; + int lowerBound; + int upperBound; +} ZSTD_bounds; + +/*! ZSTD_cParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - lower and upper bounds, both inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam); + +/*! ZSTD_CCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_cParameter. + * All parameters have valid bounds. 
Bounds can be queried using ZSTD_cParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is generally only possible during frame initialization (before starting compression). + * Exception : when using multi-threading mode (nbWorkers >= 1), + * the following parameters can be updated _during_ compression (within same frame): + * => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy. + * new parameters will be active for next job only (after a flush()). + * @return : an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value); + +/*! ZSTD_CCtx_setPledgedSrcSize() : + * Total input data size to be compressed as a single frame. + * Value will be written in frame header, unless if explicitly forbidden using ZSTD_c_contentSizeFlag. + * This value will also be controlled at end of frame, and trigger an error if not respected. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame. + * In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN. + * ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame. + * Note 2 : pledgedSrcSize is only valid once, for the next frame. + * It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN. + * Note 3 : Whenever all input data is provided and consumed in a single round, + * for example with ZSTD_compress2(), + * or invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end), + * this value is automatically overridden by srcSize instead. + */ +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize); + +typedef enum { + ZSTD_reset_session_only = 1, + ZSTD_reset_parameters = 2, + ZSTD_reset_session_and_parameters = 3 +} ZSTD_ResetDirective; + +/*! ZSTD_CCtx_reset() : + * There are 2 different things that can be reset, independently or jointly : + * - The session : will stop compressing current frame, and make CCtx ready to start a new one. + * Useful after an error, or to interrupt any ongoing compression. + * Any internal data not yet flushed is cancelled. + * Compression parameters and dictionary remain unchanged. + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". + * This removes any reference to any dictionary too. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. + */ +ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + +/*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. + * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize); + + +/*************************************** +* Advanced decompression API +***************************************/ + +/* The advanced API pushes parameters one by one into an existing DCtx context. + * Parameters are sticky, and remain valid for all following frames + * using the same DCtx context. + * It's possible to reset parameters to default values using ZSTD_DCtx_reset(). + * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream(). + * Therefore, no new decompression function is necessary. + */ + +typedef enum { + + ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which + * the streaming API will refuse to allocate memory buffer + * in order to protect the host from unreasonable memory requirements. + * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode. + * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT). + * Special: value 0 means "use default maximum windowLog". */ + + /* note : additional experimental parameters are also available + * within the experimental section of the API. + * At the time of this writing, they include : + * ZSTD_d_format + * ZSTD_d_stableOutBuffer + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001 + +} ZSTD_dParameter; + +/*! ZSTD_dParam_getBounds() : + * All parameters must belong to an interval with lower and upper bounds, + * otherwise they will either trigger an error or be automatically clamped. + * @return : a structure, ZSTD_bounds, which contains + * - an error status field, which must be tested using ZSTD_isError() + * - both lower and upper bounds, inclusive + */ +ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam); + +/*! ZSTD_DCtx_setParameter() : + * Set one compression parameter, selected by enum ZSTD_dParameter. + * All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds(). + * Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter). + * Setting a parameter is only possible during frame initialization (before starting decompression). + * @return : 0, or an error code (which can be tested using ZSTD_isError()). + */ +ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value); + +/*! ZSTD_DCtx_reset() : + * Return a DCtx to clean state. + * Session and parameters can be reset jointly or separately. + * Parameters can only be reset when no active frame is being decompressed. + * @return : 0, or an error code, which can be tested with ZSTD_isError() + */ +ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset); + + +/**************************** +* Streaming +****************************/ + +typedef struct ZSTD_inBuffer_s { + const void* src; /**< start of input buffer */ + size_t size; /**< size of input buffer */ + size_t pos; /**< position where reading stopped. Will be updated. 
Necessarily 0 <= pos <= size */ +} ZSTD_inBuffer; + +typedef struct ZSTD_outBuffer_s { + void* dst; /**< start of output buffer */ + size_t size; /**< size of output buffer */ + size_t pos; /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */ +} ZSTD_outBuffer; + + + +/*-*********************************************************************** +* Streaming compression - HowTo +* +* A ZSTD_CStream object is required to track streaming operation. +* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. +* ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. +* +* For parallel execution, use one separate ZSTD_CStream per thread. +* +* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. +* +* Parameters are sticky : when starting a new compression on the same context, +* it will re-use the same sticky parameters as previous compression session. +* When in doubt, it's recommended to fully initialize the context before usage. +* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), +* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +* set more specific parameters, the pledged source size, or load a dictionary. +* +* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to +* consume input stream. The function will automatically update both `pos` +* fields within `input` and `output`. +* Note that the function may not consume the entire input, for example, because +* the output buffer is already full, in which case `input.pos < input.size`. +* The caller must check if input has been entirely consumed. +* If not, the caller must make some room to receive more compressed data, +* and then present again remaining input data. +* note: ZSTD_e_continue is guaranteed to make some forward progress when called, +* but doesn't guarantee maximal forward progress. This is especially relevant +* when compressing with multiple threads. The call won't block if it can +* consume some input, but if it can't it will wait for some, but not all, +* output to be flushed. +* @return : provides a minimum amount of data remaining to be flushed from internal buffers +* or an error code, which can be tested using ZSTD_isError(). +* +* At any moment, it's possible to flush whatever data might remain stuck within internal buffer, +* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated. +* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0). +* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush. +* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the +* operation. +* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will +* block until the flush is complete or the output buffer is full. +* @return : 0 if internal buffers are entirely flushed, +* >0 if some data still present within internal buffer (the value is minimal estimation of remaining size), +* or an error code, which can be tested using ZSTD_isError(). +* +* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame. +* It will perform a flush and write frame epilogue. 
+* The epilogue is required for decoders to consider a frame completed.
+* The flush operation is the same, and follows the same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+* start a new frame.
+* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+*           >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*           or an error code, which can be tested using ZSTD_isError().
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
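+
+/* Illustrative sketch of a complete streaming compression loop, following the
+ * HowTo above (single thread, error checks abbreviated). `readInput()` and
+ * `writeOutput()` stand for caller-provided I/O, they are not part of zstd :
+ *
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     size_t const inSize  = ZSTD_CStreamInSize();
+ *     size_t const outSize = ZSTD_CStreamOutSize();
+ *     void* const inBuf  = malloc(inSize);
+ *     void* const outBuf = malloc(outSize);
+ *     for (;;) {
+ *         size_t const readSize = readInput(inBuf, inSize);
+ *         ZSTD_EndDirective const mode = (readSize < inSize) ? ZSTD_e_end : ZSTD_e_continue;
+ *         ZSTD_inBuffer input = { inBuf, readSize, 0 };
+ *         size_t remaining;
+ *         do {
+ *             ZSTD_outBuffer output = { outBuf, outSize, 0 };
+ *             remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
+ *             writeOutput(outBuf, output.pos);
+ *         } while (mode == ZSTD_e_end ? (remaining != 0) : (input.pos != input.size));
+ *         if (mode == ZSTD_e_end) break;
+ *     }
+ *     ZSTD_freeCCtx(cctx);
+ */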
+/*! ZSTD_compressStream2() :
+ *  Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ *  - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ *  - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, and distributes jobs to internal worker threads, flushes whatever is available,
+ *                                                  and then immediately returns, just indicating that there is some data remaining to be flushed.
+ *                                                  The function nonetheless guarantees forward progress : it will return only after it reads or writes at least one byte.
+ *  - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ *            or an error code, which can be tested using ZSTD_isError().
+ *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ *            Before starting a new compression job, or changing compression parameters,
+ *            it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                                         ZSTD_outBuffer* output,
+                                         ZSTD_inBuffer* input,
+                                         ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare that more time ends up being spent crossing the interface than on compression itself.
+ * In such cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the nb of roundtrips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guarantee to successfully flush at least one complete compressed block. */
+
+
+/* *****************************************************************************
+ * The following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum nb of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+*  Streaming decompression - HowTo
+*
+*  A ZSTD_DStream object is required to track streaming operations.
+*  Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+*  ZSTD_DStream objects can be re-used multiple times.
+*
+*  Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+*  Alternatively, use advanced API to set specific properties.
+*
+*  Use ZSTD_decompressStream() repeatedly to consume your input.
+*  The function will update both `pos` fields.
+*  If `input.pos < input.size`, some input has not been consumed.
+*  It's up to the caller to present again remaining data.
+*  The function tries to flush all data decoded immediately, respecting output buffer size.
+*  If `output.pos < output.size`, decoder has flushed everything it could.
+*  But if `output.pos == output.size`, there might be some data left within internal buffers.
+*  In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+*  Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+*        or an error code, which can be tested using ZSTD_isError(),
+*        or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+*                                the return value is a suggested next input size (just a hint for better latency)
+*                                that will never request more than the remaining frame size.
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guarantee to successfully flush at least one complete block in all circumstances. */
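+
+/* Illustrative sketch of a streaming decompression loop, following the HowTo
+ * above (error checks abbreviated). `readInput()`/`writeOutput()` stand for
+ * caller-provided I/O, they are not part of zstd :
+ *
+ *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *     size_t const inSize  = ZSTD_DStreamInSize();
+ *     size_t const outSize = ZSTD_DStreamOutSize();
+ *     void* const inBuf  = malloc(inSize);
+ *     void* const outBuf = malloc(outSize);
+ *     size_t readSize;
+ *     while ((readSize = readInput(inBuf, inSize)) != 0) {
+ *         ZSTD_inBuffer input = { inBuf, readSize, 0 };
+ *         while (input.pos < input.size) {
+ *             ZSTD_outBuffer output = { outBuf, outSize, 0 };
+ *             size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
+ *             if (ZSTD_isError(ret)) { ... }   // corrupt or truncated frame
+ *             writeOutput(outBuf, output.pos);
+ *         }
+ *     }
+ *     ZSTD_freeDCtx(dctx);
+ */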
+
+
+/**************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression at an explicit compression level using a Dictionary.
+ *  A dictionary can be any arbitrary data segment (also called a prefix),
+ *  or a buffer with specified information (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                           const void* src, size_t srcSize,
+                                           const void* dict, size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a known Dictionary.
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                             const void* src, size_t srcSize,
+                                             const void* dict, size_t dictSize);
+
+
+/***********************************
+ *  Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages or blocks using the same dictionary,
+ *  it's recommended to digest the dictionary only once, since it's a costly operation.
+ *  ZSTD_createCDict() will create a state from digesting a dictionary.
+ *  The resulting state can be used for future compression operations with very limited startup cost.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ *  Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ *  Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ *           in which case the only thing that it transports is the @compressionLevel.
+ *           This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ *           which then expects a ZSTD_CDict parameter for any input, including inputs without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Recommended when the same dictionary is used multiple times.
+ *  Note : compression level is _decided at dictionary creation time_,
+ *         and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                            const void* src, size_t srcSize,
+                                            const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Recommended when the same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                              const void* src, size_t srcSize,
+                                              const ZSTD_DDict* ddict);
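+
+/* Illustrative sketch (not upstream documentation) : a digest-once,
+ * use-many round trip. `dictBuf`/`dictSize` and the cctx/dctx/buffers are
+ * assumed to exist; error checks on the returned sizes are elided :
+ *
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3);
+ *     ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuf, dictSize);
+ *     // compress many inputs with the same digested dictionary
+ *     size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCapacity,
+ *                                                   src, srcSize, cdict);
+ *     // decompression must use a dictionary with the same content
+ *     size_t const rSize = ZSTD_decompress_usingDDict(dctx, rec, recCapacity,
+ *                                                     dst, cSize, ddict);
+ *     ZSTD_freeCDict(cdict);
+ *     ZSTD_freeDDict(ddict);
+ */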
+
+
+/********************************
+ *  Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  If @return == 0, the dictionary is not conformant with Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  To identify the exact failure cause, use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * are only reset when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use the same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 2 : Loading a dictionary involves building tables.
+ *           It's also a CPU-consuming operation, with non-negligible impact on latency.
+ *           Tables are dependent on compression parameters, and for this reason,
+ *           compression parameters can no longer be changed after loading a dictionary.
+ *  Note 3 : `dict` content will be copied internally.
+ *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_refCDict() :
+ *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Note that compression parameters are enforced from within CDict,
+ *  and supersede any compression parameter previously set within CCtx.
+ *  The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ *  The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ *  The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compressed frame.
+ *  A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ *  Decompression will need the same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding any prefix (including NULL) invalidates any previous prefix or dictionary.
+ *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ *           Its content must remain unmodified during compression.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_c_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *           It's a CPU-consuming operation, with non-negligible impact on latency.
+ *           If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ *  Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ *           Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+                                       const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ *  Create an internal DDict from dict buffer,
+ *  to be used to decompress next frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : Loading a dictionary involves building tables,
+ *           which has a non-negligible impact on CPU usage and latency.
+ *           It's recommended to "load once, use many times", to amortize the cost.
+ *  Note 2 : `dict` content will be copied internally, so `dict` can be released after loading.
+ *           Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ *           how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ *  Reference a prepared dictionary, to be used to decompress next frames.
+ *  The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Special : referencing a NULL DDict means "return to no-dictionary mode".
+ *  Note 2 : DDict is just referenced, its lifetime must outlive its usage within the DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) to decompress next frame.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
+ *  Prefix is **only used once**. Reference is discarded at end of frame.
+ *  End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary.
+ *  Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ *           Prefix buffer must remain unmodified up to the end of frame,
+ *           reached when ZSTD_decompressStream() returns 0.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ *           Use ZSTD_DCtx_refPrefix_advanced() to alter dictMode (Experimental section).
+ *  Note 4 : Referencing a raw content prefix has almost no CPU nor memory cost.
+ *           A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+                                       const void* prefix, size_t prefixSize);
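+
+/* Illustrative sketch (not upstream documentation) : delta-compressing a new
+ * version of a buffer against an old one via prefixes. `oldBuf`/`newBuf` and
+ * the contexts/buffers are assumed to exist; `oldBuf` must stay valid and
+ * unmodified until each operation completes; error checks are elided :
+ *
+ *     // compression side : window must be able to cover the whole prefix
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, 24);
+ *     ZSTD_CCtx_refPrefix(cctx, oldBuf, oldSize);
+ *     size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, newBuf, newSize);
+ *
+ *     // decompression side : must reference the exact same prefix
+ *     ZSTD_DCtx_refPrefix(dctx, oldBuf, oldSize);
+ *     size_t const rSize = ZSTD_decompressDCtx(dctx, rec, recCapacity, dst, cSize);
+ */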
+
+/* === Memory management === */
+
+/*! ZSTD_sizeof_*() :
+ *  These functions give the _current_ memory usage of selected object.
+ *  Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif  /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ *   ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ *   experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format)   ((format) == ZSTD_f_zstd1 ? 5 : 1)   /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format)      ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE    8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32    30
+#define ZSTD_WINDOWLOG_MAX_64    31
+#define ZSTD_WINDOWLOG_MAX     ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN       10
+#define ZSTD_HASHLOG_MAX       ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN          6
+#define ZSTD_CHAINLOG_MAX_32     29
+#define ZSTD_CHAINLOG_MAX_64     30
+#define ZSTD_CHAINLOG_MAX      ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN        ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX      (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN        1
+#define ZSTD_MINMATCH_MAX         7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN         3   /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX    ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN        ZSTD_fast
+#define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN       0
+#define ZSTD_OVERLAPLOG_MAX       9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27   /* by default, the streaming decoder will refuse any frame
+                                           * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+                                           * to preserve host's memory from unreasonable requirements. */
+
+
+/* ---  Advanced types  --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+    unsigned int matchPos;   /* Match pos in dst */
+    /* If seqDef.offset > 3, then this is seqDef.offset - 3
+     * If seqDef.offset < 3, then this is the corresponding repeat offset
+     * But if seqDef.offset < 3 and litLength == 0, this is the
+     *   repeat offset before the corresponding repeat offset
+     * And if seqDef.offset == 3 and litLength == 0, this is the
+     *   most recent repeat offset - 1
+     */
+    unsigned int offset;
+    unsigned int litLength;    /* Literal length */
+    unsigned int matchLength;  /* Match length */
+    /* 0 when seq not rep and seqDef.offset otherwise
+     * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
+     */
+    unsigned int rep;
+} ZSTD_Sequence;
+
+typedef struct {
+    unsigned windowLog;       /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;        /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;         /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;       /**< nb of searches : larger == more compression, slower */
+    unsigned minMatch;        /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;    /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;   /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+    int contentSizeFlag;  /**< 1: content size will be in frame header (when known) */
+    int checksumFlag;     /**< 1: generate a 32-bit checksum using the XXH64 algorithm at end of frame, for error detection */
+    int noDictIDFlag;     /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+    ZSTD_dct_auto = 0,       /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict = 2    /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0,  /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef = 1    /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+    ZSTD_f_zstd1 = 0,           /* zstd frame format, specified in zstd_compression_format.md (default) */
+    ZSTD_f_zstd1_magicless = 1  /* Variant of zstd frame format, without the initial 4-byte magic number.
+                                 * Useful to save 4 bytes per generated frame.
+                                 * The decoder cannot recognise this format automatically ;
+                                 * it must be instructed explicitly. */
+} ZSTD_format_e;
+
+typedef enum {
+    /* Note: this enum and the behavior it controls are effectively internal
+     * implementation details of the compressor. They are expected to continue
+     * to evolve and should be considered only in the context of extremely
+     * advanced performance tuning.
+     *
+     * Zstd currently supports the use of a CDict in three ways:
+     *
+     * - The contents of the CDict can be copied into the working context. This
+     *   means that the compression can search both the dictionary and input
+     *   while operating on a single set of internal tables. This makes
+     *   the compression faster per-byte of input. However, the initial copy of
+     *   the CDict's tables incurs a fixed cost at the beginning of the
+     *   compression. For small compressions (< 8 KB), that copy can dominate
+     *   the cost of the compression.
+     *
+     * - The CDict's tables can be used in-place. In this model, compression is
+     *   slower per input byte, because the compressor has to search two sets of
+     *   tables. However, this model incurs no start-up cost (as long as the
+     *   working context's tables can be reused). For small inputs, this can be
+     *   faster than copying the CDict's tables.
+     *
+     * - The CDict's tables are not used at all, and instead we use the working
+     *   context alone to reload the dictionary and use params based on the source
+     *   size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+     *   This method is effective when the dictionary sizes are very small relative
+     *   to the input size, and the input size is fairly large to begin with.
+     *
+     * Zstd has a simple internal heuristic that selects which strategy to use
+     * at the beginning of a compression. However, if experimentation shows that
+     * Zstd is making poor choices, it is possible to override that choice with
+     * this enum.
+     */
+    ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+    ZSTD_dictForceAttach = 1,   /* Never copy the dictionary. */
+    ZSTD_dictForceCopy = 2,     /* Always copy the dictionary. */
+    ZSTD_dictForceLoad = 3      /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+  ZSTD_lcm_auto = 0,          /**< Automatically determine the compression mode based on the compression level.
+                               *   Negative compression levels will be uncompressed, and positive compression
+                               *   levels will be compressed. */
+  ZSTD_lcm_huffman = 1,       /**< Always attempt Huffman compression.
+                               *   Uncompressed literals will still be
+                               *   emitted if Huffman compression is not profitable. */
+  ZSTD_lcm_uncompressed = 2   /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/***************************************
+*  Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary at `src + srcSize`)
+ *  @return : - decompressed size of all data in all successive frames
+ *            - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *   note 1 : decompressed size is an optional field, that may not be present, especially in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 3 : decompressed size can be very large (64-bit value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure result fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ *            read each contained frame header. This is fast, as most of the data is skipped;
+ *            however, it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary at `src + srcSize`)
+ *  @return : - upper-bound for the decompressed size of all data in all successive frames
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *  note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ *  note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ *           in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ *  note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ *             upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
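+
+/* Illustrative sketch (not upstream documentation) : using the bound to
+ * allocate safely before a one-shot decompression. `APP_MAX_OUTPUT` is a
+ * hypothetical application-defined limit :
+ *
+ *     unsigned long long const bound = ZSTD_decompressBound(src, srcSize);
+ *     if (bound == ZSTD_CONTENTSIZE_ERROR) return -1;  // malformed input
+ *     if (bound > APP_MAX_OUTPUT) return -1;           // enforce local limits before allocating
+ *     void* const dst = malloc((size_t)bound);
+ *     if (dst == NULL) return -1;
+ *     size_t const dSize = ZSTD_decompress(dst, (size_t)bound, src, srcSize);
+ *     if (ZSTD_isError(dSize)) { free(dst); return -1; }
+ */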
+
+/*! ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+/*! ZSTD_getSequences() :
+ *  Extract sequences from the sequence store.
+ *  zc can be used to insert custom compression params.
+ *  This function invokes ZSTD_compress2.
+ * @return : number of sequences extracted
+ */
+ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+                                     size_t outSeqsSize, const void* src, size_t srcSize);
+
+
+/***************************************
+*  Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ *  These functions make it possible to estimate memory usage
+ *  of a future {D,C}Ctx, before its creation.
+ *
+ *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ *  for any compression level up to the selected one.
+ *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ *         does not include space for a window buffer.
+ *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  The estimate will assume the input may be arbitrarily large,
+ *  which is the worst case.
+ *
+ *  When srcSize can be bounded by a known and rather "small" value,
+ *  this fact can be used to provide a tighter estimation
+ *  because the CCtx compression context will need less memory.
+ *  This tighter estimation can be provided by the more advanced functions
+ *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ *  Note 2 : only single-threaded compression is supported.
+ *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to the selected one.
+ *  It will also consider src size to be arbitrarily "large", which is the worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter().
+ *  Only single-threaded compression is supported : these functions will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deduced from a valid frame header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Note : if streaming is initialized with function ZSTD_init?Stream_usingDict(),
+ *         an internal ?Dict will be created, whose additional size is not estimated here.
+ *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
+
+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-byte aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
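+
+/* Illustrative sketch (not upstream documentation) : a statically allocated
+ * CCtx for single-shot compression. The workspace here comes from malloc()
+ * (which returns suitably aligned memory), but any 8-byte-aligned arena works :
+ *
+ *     size_t const wkspSize = ZSTD_estimateCCtxSize(3);   // budget for levels <= 3, single-shot
+ *     void* const wksp = malloc(wkspSize);
+ *     ZSTD_CCtx* const cctx = ZSTD_initStaticCCtx(wksp, wkspSize);
+ *     if (cctx == NULL) abort();                          // too small, or misaligned
+ *     size_t const cSize = ZSTD_compressCCtx(cctx, dst, dstCapacity, src, srcSize, 3);
+ *     if (ZSTD_isError(cSize)) abort();
+ *     free(wksp);                                         // no ZSTD_freeCCtx() for static objects
+ */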
+
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_customMem customMem);
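+
+/* Illustrative sketch (not upstream documentation) : a minimal custom
+ * allocator that counts allocations, e.g. to audit a context's behavior.
+ * The counter and function names are hypothetical :
+ *
+ *     static size_t g_nbAllocs = 0;
+ *     static void* countingAlloc(void* opaque, size_t size)
+ *     {   (void)opaque; g_nbAllocs++; return malloc(size); }
+ *     static void countingFree(void* opaque, void* address)
+ *     {   (void)opaque; free(address); }
+ *
+ *     ZSTD_customMem const cmem = { countingAlloc, countingFree, NULL };
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(cmem);
+ *     // ... use cctx normally; all its internal allocations go through cmem
+ *     ZSTD_freeCCtx(cctx);
+ */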
+
+
+/***************************************
+*  Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ *  Create a digested dictionary for compression.
+ *  Dictionary content is just referenced, not duplicated.
+ *  As a consequence, `dictBuffer` **must** outlive CDict,
+ *  and its content must remain unmodified throughout the lifetime of CDict.
+ *  note : equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ *  `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ *  same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ *  All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ *  Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ *  optimize params for a given `srcSize` and `dictSize`.
+ *  `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ *  `dictSize` must be `0` when there is no dictionary.
+ *  cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ *  This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+ *  Note : this function is now DEPRECATED.
+ *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ *  This prototype will be marked as deprecated and generate a compilation warning on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+                                          void* dst, size_t dstCapacity,
+                                          const void* src, size_t srcSize,
+                                          const void* dict, size_t dictSize,
+                                          ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ *  Note : this function is now REDUNDANT.
+ *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ *  This prototype will be marked as deprecated and generate a compilation warning in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                                     void* dst, size_t dstCapacity,
+                                                     const void* src, size_t srcSize,
+                                                     const ZSTD_CDict* cdict,
+                                                     ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ *  It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ *  Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* === experimental parameters === */
+/* these parameters can be used with ZSTD_CCtx_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+ /* Enables rsyncable mode,
+  * which makes compressed files more rsync friendly
+  * by adding periodic synchronization points to the compressed data.
+  * The target average block size is ZSTD_c_jobSize / 2.
+  * It's possible to modify the job size to increase or decrease
+  * the granularity of the synchronization points.
+  * Once the jobSize is smaller than the window size, compression ratio will degrade.
+  * NOTE 1 : rsyncable mode only works when multithreading is enabled.
+  * NOTE 2 : rsyncable performs poorly in combination with long range mode,
+  *          since it will decrease the effectiveness of synchronization points,
+  *          though mileage may vary.
+  * NOTE 3 : Rsyncable mode limits maximum compression speed to ~400 MB/s.
+  *          If the selected compression level is already running significantly slower,
+  *          the overall speed won't be significantly impacted.
+  */
+ #define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_e enum definition for details.
+ */
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that hint is close to actual source size,
+ * but compression ratio may regress significantly if the guess considerably underestimates it */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/*! ZSTD_CCtx_getParameter() :
+ *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
+
+
+/*! ZSTD_CCtx_params :
+ *  Quick howto :
+ *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ *  - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ *                                     an existing ZSTD_CCtx_params structure.
+ *                                     This is similar to
+ *                                     ZSTD_CCtx_setParameter().
+ *  - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ *                                               an existing CCtx.
+ *                                               These parameters will be applied to
+ *                                               all subsequent frames.
+ *  - ZSTD_compressStream2() : Do compression using the CCtx.
+ *  - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ *  This can be used with ZSTD_estimateCCtxSize_usingCCtxParams()
+ *  for static allocation of a CCtx for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ *  Initializes the compression parameters of cctxParams according to
+ *  compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ *  Initializes the compression and frame parameters of cctxParams according to
+ *  params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ *  Similar to ZSTD_CCtx_setParameter.
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ *  Similar to ZSTD_CCtx_getParameter.
+ *  Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Apply a set of ZSTD_CCtx_params to the compression context.
+ *  This can be done even after compression is started,
+ *  if nbWorkers==0, this will have no impact until a new compression is started.
+ *  if nbWorkers>=1, new parameters will be picked up at next job,
+ *  with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
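+
+/* Illustrative sketch (not upstream documentation) : the typical
+ * ZSTD_CCtx_params workflow. The cctx is assumed to exist; return codes
+ * should be checked with ZSTD_isError() in real code :
+ *
+ *     ZSTD_CCtx_params* const params = ZSTD_createCCtxParams();
+ *     ZSTD_CCtxParams_init(params, 19);                        // start from level 19 defaults
+ *     ZSTD_CCtxParams_setParameter(params, ZSTD_c_checksumFlag, 1);
+ *     ZSTD_CCtx_setParametersUsingCCtxParams(cctx, params);    // applies to all subsequent frames
+ *     ZSTD_freeCCtxParams(params);                             // cctx keeps its own copy
+ *     // then compress as usual, e.g. with ZSTD_compressStream2()
+ */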
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ *  Same as ZSTD_compressStream2(),
+ *  but using only integral types as arguments.
+ *  This variant might be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
+
+
+/***************************************
+*  Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict,
+ *  it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but references `dict` content instead of copying it into `dctx`.
+ *  This saves memory if `dict` remains around.
+ *  However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but gives direct control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ *  Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses to allocate internal buffers for frames requiring a window size larger than the provided limit.
+ *  This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ *  By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST NOT be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * NOT be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
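+
+/* Illustrative sketch (not upstream documentation) : streaming decompression
+ * into a single stable output buffer. `dst`/`dstCapacity` must be able to
+ * hold the ENTIRE decompressed frame, and must not change between calls :
+ *
+ *     ZSTD_DCtx_setParameter(dctx, ZSTD_d_stableOutBuffer, 1);
+ *     ZSTD_outBuffer out = { dst, dstCapacity, 0 };   // same dst/size on every call
+ *     ZSTD_inBuffer  in  = { src, srcSize, 0 };
+ *     while (in.pos < in.size) {
+ *         size_t const ret = ZSTD_decompressStream(dctx, &out, &in);
+ *         if (ZSTD_isError(ret)) abort();   // also fails if out.dst or out.size changed
+ *         if (ret == 0) break;              // frame fully decoded into dst[0..out.pos)
+ *     }
+ */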
+
+/*! ZSTD_DCtx_setFormat() :
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ *  Same as ZSTD_decompressStream(),
+ *  but using only integral types as arguments.
+ *  This can be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+*  Advanced streaming functions
+*  Warning : most of these functions are now redundant with the Advanced API.
+*  Once the Advanced API reaches "stable" status,
+*  redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*===== Advanced Streaming compression functions  =====*/
+/**! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+                         int compressionLevel,
+                         unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note : dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+                           const void* dict, size_t dictSize,
+                           int compressionLevel);
+
+/**! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ *     for ((param, value) : params) {
+ *         ZSTD_CCtx_setParameter(zcs, param, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                          const void* dict, size_t dictSize,
+                          ZSTD_parameters params,
+                          unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/**! ZSTD_initCStream_usingCDict_advanced() :
+ *   This function is DEPRECATED, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ *     for ((fParam, value) : fParams) {
+ *         ZSTD_CCtx_setParameter(zcs, fParam, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                                     const ZSTD_CDict* cdict,
+                                     ZSTD_frameParameters fParams,
+                                     unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ *  start a new frame, using same parameters from previous frame.
+ *  This is typically useful to skip the dictionary loading stage, since it will re-use it in-place.
+ *  Note that zcs must be initialized at least once before using ZSTD_resetCStream().
+ *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in the frame header, and verified at the end.
+ *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ *  but it will change to mean "empty" in a future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ *  Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input)
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ *  Tell how many bytes are ready to be flushed immediately.
+ *  Useful for multithreading scenarios (nbWorkers >= 1).
+ *  Probe the oldest active job, defined as the oldest job not yet entirely flushed,
+ *  and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ *  if @return == 0, it means either :
+ *  + there is no active job (could be checked with ZSTD_frameProgression()), or
+ *  + oldest job is still actively compressing data,
+ *    but everything it has produced has also been flushed so far,
+ *    therefore flush speed is limited by production speed of oldest job
+ *    irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+
+
+/*===== Advanced Streaming decompression functions  =====*/
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note : no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-uses decompression parameters from previous init; this saves dictionary loading.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+*  But it's also a complex one, with several restrictions, documented below.
+*  Prefer the normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resources.
+  A ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+ - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario. + Worst case evaluation is provided by ZSTD_compressBound(). + ZSTD_compressContinue() doesn't guarantee recover after a failed compression. + - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog). + It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consists of multiple contiguous blocks) + - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps. + In which case, it will "discard" the relevant memory section from its history. + + Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum. + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + + `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. +*/ + +/*===== Buffer-less streaming compression functions =====*/ +ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); +ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */ +ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ +ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*- + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. + A ZSTD_DCtx object can be re-used multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. + @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. + >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). 
+ Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. + As a consequence, check that values remain within valid application range. + For example, do not allocate memory blindly, check that `windowSize` is within expectation. + Each application can set its own limits, depending on local restrictions. + For extended interoperability, it is recommended to support `windowSize` of at least 8 MB. + + ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes. + ZSTD_decompressContinue() is very sensitive to contiguity, + if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place, + or that previous contiguous segment is large enough to properly handle maximum back-reference distance. + There are multiple ways to guarantee this condition. + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), + which can @return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. + At which point, decoding can resume from the beginning of the buffer. + Note that already decoded data stored in the buffer should be flushed before being overwritten. + + There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory. + + Finally, if you control the compression process, you can also ignore all buffer size rules, + as long as the encoder and decoder progress in "lock-step", + aka use exactly the same buffer sizes, break contiguity at the same place, etc. + + Once buffers are setup, start decompression, with ZSTD_decompressBegin(). + If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict(). + + Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternatively. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + + @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + + A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero. + Context can then be reset to start a new decompression. + + Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType(). + This information is not required to properly decode a frame. + + == Special case : skippable frames == + + Skippable frames allow integration of user-defined data into a flow of concatenated frames. + Skippable frames will be ignored (skipped) by decompressor. 
+ The format of skippable frames is as follows : + a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F + b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits + c) Frame Content - any content (User Data) of length equal to Frame Size + For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame. + For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content. +*/ + +/*===== Buffer-less streaming decompression functions =====*/ +typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +typedef struct { + unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ + unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ + unsigned blockSizeMax; + ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ + unsigned headerSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTD_frameHeader; + +/*! ZSTD_getFrameHeader() : + * decode Frame Header, or requires larger `srcSize`. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +/*! ZSTD_getFrameHeader_advanced() : + * same as ZSTD_getFrameHeader(), + * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); +ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + +ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + +ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); +ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +/* misc */ +ZSTDLIB_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); +typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; +ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + + +/* ============================ */ +/** Block level API */ +/* ============================ */ + +/*! + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. 
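+
+  For illustration only, a minimal single-block round trip could look like
+  this sketch (error handling elided; `level`, the buffers and their sizes
+  are placeholders chosen by the caller) :
+
+      ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+      ZSTD_compressBegin(cctx, level);
+      size_t const cSize = ZSTD_compressBlock(cctx, dst, dstCapacity, src, srcSize);
+      // cSize==0 means "not compressible" : store `src` uncompressed instead,
+      // and remember its size out-of-band (blocks carry no metadata).
+
+      ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+      ZSTD_decompressBegin(dctx);
+      if (cSize != 0) ZSTD_decompressBlock(dctx, out, outCapacity, dst, cSize);
+      else ZSTD_insertBlock(dctx, src, srcSize);   // keep history in sync for raw blocks
+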
+ + A few rules to respect : + - Compressing and decompressing require a context structure + + Use ZSTD_createCCtx() and ZSTD_createDCtx() + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary + + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. + Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block. + - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) ! + ===> In which case, nothing is produced into `dst` ! + + User __must__ test for such outcome and deal directly with uncompressed data + + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0. + Doing so would mess up with statistics history, leading to potential data corruption. + + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !! + + In case of multiple successive blocks, should some of them be uncompressed, + decoder must be informed of their existence in order to follow proper history. + Use ZSTD_insertBlock() for such a case. +*/ + +/*===== Raw zstd block functions =====*/ +ZSTDLIB_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); +ZSTDLIB_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); +ZSTDLIB_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + +#endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +#if defined (__cplusplus) +} +#endif +/**** ended inlining ../zstd.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: huf.h ****/ +#ifndef XXH_STATIC_LINKING_ONLY +# define XXH_STATIC_LINKING_ONLY /* XXH64_state_t */ +#endif +/**** start inlining xxhash.h ****/ +/* + * xxHash - Extremely Fast Hash algorithm + * Header File + * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + +/* Notice extracted from xxHash homepage : + +xxHash is an extremely fast Hash algorithm, running at RAM speed limits. +It also successfully passes all tests from the SMHasher suite. 
+
+Comparison (single thread, Windows Seven 32 bits, using SMHasher on a Core 2 Duo @3GHz)
+
+Name            Speed       Q.Score   Author
+xxHash          5.4 GB/s     10
+CrapWow         3.2 GB/s      2       Andrew
+MumurHash 3a    2.7 GB/s     10       Austin Appleby
+SpookyHash      2.0 GB/s     10       Bob Jenkins
+SBox            1.4 GB/s      9       Bret Mulvey
+Lookup3         1.2 GB/s      9       Bob Jenkins
+SuperFastHash   1.2 GB/s      1       Paul Hsieh
+CityHash64      1.05 GB/s    10       Pike & Alakuijala
+FNV             0.55 GB/s     5       Fowler, Noll, Vo
+CRC32           0.43 GB/s     9
+MD5-32          0.33 GB/s    10       Ronald L. Rivest
+SHA1-32         0.28 GB/s    10
+
+Q.Score is a measure of quality of the hash function.
+It depends on successfully passing SMHasher test set.
+10 is a perfect score.
+
+A 64-bits version, named XXH64, is available since r35.
+It offers much better speed, but for 64-bits applications only.
+Name     Speed on 64 bits    Speed on 32 bits
+XXH64    13.8 GB/s           1.9 GB/s
+XXH32    6.8 GB/s            6.0 GB/s
+*/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#ifndef XXHASH_H_5627135585666179
+#define XXHASH_H_5627135585666179 1
+
+
+/* ****************************
+* Definitions
+******************************/
+#include <stddef.h>   /* size_t */
+typedef enum { XXH_OK=0, XXH_ERROR } XXH_errorcode;
+
+
+/* ****************************
+* API modifier
+******************************/
+/** XXH_PRIVATE_API
+* This is useful if you want to include xxhash functions in `static` mode
+* in order to inline them, and remove their symbol from the public list.
+* Methodology :
+*   #define XXH_PRIVATE_API
+*   #include "xxhash.h"
+* `xxhash.c` is automatically included.
+* It's not useful to compile and link it as a separate module anymore.
+*/
+#ifdef XXH_PRIVATE_API
+# ifndef XXH_STATIC_LINKING_ONLY
+#  define XXH_STATIC_LINKING_ONLY
+# endif
+# if defined(__GNUC__)
+#  define XXH_PUBLIC_API static __inline __attribute__((unused))
+# elif defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */)
+#  define XXH_PUBLIC_API static inline
+# elif defined(_MSC_VER)
+#  define XXH_PUBLIC_API static __inline
+# else
+#  define XXH_PUBLIC_API static   /* this version may generate warnings for unused static functions; disable the relevant warning */
+# endif
+#else
+# define XXH_PUBLIC_API   /* do nothing */
+#endif /* XXH_PRIVATE_API */
+
+/*!XXH_NAMESPACE, aka Namespace Emulation :
+
+If you want to include _and expose_ xxHash functions from within your own library,
+but also want to avoid symbol collisions with another library which also includes xxHash,
+
+you can use XXH_NAMESPACE, to automatically prefix any public symbol from xxhash library
+with the value of XXH_NAMESPACE (so avoid to keep it NULL and avoid numeric values).
+
+Note that no change is required within the calling program as long as it includes `xxhash.h` :
+regular symbol name will be automatically translated by this header.
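+
+For example, a wrapper library might build xxhash as follows (the MYLIB_
+prefix is purely illustrative) :
+  #define XXH_NAMESPACE MYLIB_
+  #include "xxhash.h"
+Calling code still writes XXH64(data, size, seed) ; only the linker-visible
+symbol becomes MYLIB_XXH64, so it cannot collide with another embedded copy.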
+*/ +#ifdef XXH_NAMESPACE +# define XXH_CAT(A,B) A##B +# define XXH_NAME2(A,B) XXH_CAT(A,B) +# define XXH32 XXH_NAME2(XXH_NAMESPACE, XXH32) +# define XXH64 XXH_NAME2(XXH_NAMESPACE, XXH64) +# define XXH_versionNumber XXH_NAME2(XXH_NAMESPACE, XXH_versionNumber) +# define XXH32_createState XXH_NAME2(XXH_NAMESPACE, XXH32_createState) +# define XXH64_createState XXH_NAME2(XXH_NAMESPACE, XXH64_createState) +# define XXH32_freeState XXH_NAME2(XXH_NAMESPACE, XXH32_freeState) +# define XXH64_freeState XXH_NAME2(XXH_NAMESPACE, XXH64_freeState) +# define XXH32_reset XXH_NAME2(XXH_NAMESPACE, XXH32_reset) +# define XXH64_reset XXH_NAME2(XXH_NAMESPACE, XXH64_reset) +# define XXH32_update XXH_NAME2(XXH_NAMESPACE, XXH32_update) +# define XXH64_update XXH_NAME2(XXH_NAMESPACE, XXH64_update) +# define XXH32_digest XXH_NAME2(XXH_NAMESPACE, XXH32_digest) +# define XXH64_digest XXH_NAME2(XXH_NAMESPACE, XXH64_digest) +# define XXH32_copyState XXH_NAME2(XXH_NAMESPACE, XXH32_copyState) +# define XXH64_copyState XXH_NAME2(XXH_NAMESPACE, XXH64_copyState) +# define XXH32_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH32_canonicalFromHash) +# define XXH64_canonicalFromHash XXH_NAME2(XXH_NAMESPACE, XXH64_canonicalFromHash) +# define XXH32_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH32_hashFromCanonical) +# define XXH64_hashFromCanonical XXH_NAME2(XXH_NAMESPACE, XXH64_hashFromCanonical) +#endif + + +/* ************************************* +* Version +***************************************/ +#define XXH_VERSION_MAJOR 0 +#define XXH_VERSION_MINOR 6 +#define XXH_VERSION_RELEASE 2 +#define XXH_VERSION_NUMBER (XXH_VERSION_MAJOR *100*100 + XXH_VERSION_MINOR *100 + XXH_VERSION_RELEASE) +XXH_PUBLIC_API unsigned XXH_versionNumber (void); + + +/* **************************** +* Simple Hash Functions +******************************/ +typedef unsigned int XXH32_hash_t; +typedef unsigned long long XXH64_hash_t; + +XXH_PUBLIC_API XXH32_hash_t XXH32 (const void* input, size_t length, unsigned int seed); +XXH_PUBLIC_API XXH64_hash_t XXH64 (const void* input, size_t length, unsigned long long seed); + +/*! +XXH32() : + Calculate the 32-bits hash of sequence "length" bytes stored at memory address "input". + The memory between input & input+length must be valid (allocated and read-accessible). + "seed" can be used to alter the result predictably. + Speed on Core 2 Duo @ 3 GHz (single thread, SMHasher benchmark) : 5.4 GB/s +XXH64() : + Calculate the 64-bits hash of sequence of length "len" stored at memory address "input". + "seed" can be used to alter the result predictably. + This function runs 2x faster on 64-bits systems, but slower on 32-bits systems (see benchmark). +*/ + + +/* **************************** +* Streaming Hash Functions +******************************/ +typedef struct XXH32_state_s XXH32_state_t; /* incomplete type */ +typedef struct XXH64_state_s XXH64_state_t; /* incomplete type */ + +/*! 
State allocation, compatible with dynamic libraries */ + +XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void); +XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr); + + +/* hash streaming */ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset (XXH32_state_t* statePtr, unsigned int seed); +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH32_hash_t XXH32_digest (const XXH32_state_t* statePtr); + +XXH_PUBLIC_API XXH_errorcode XXH64_reset (XXH64_state_t* statePtr, unsigned long long seed); +XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* statePtr, const void* input, size_t length); +XXH_PUBLIC_API XXH64_hash_t XXH64_digest (const XXH64_state_t* statePtr); + +/* +These functions generate the xxHash of an input provided in multiple segments. +Note that, for small input, they are slower than single-call functions, due to state management. +For small input, prefer `XXH32()` and `XXH64()` . + +XXH state must first be allocated, using XXH*_createState() . + +Start a new hash by initializing state with a seed, using XXH*_reset(). + +Then, feed the hash state by calling XXH*_update() as many times as necessary. +Obviously, input must be allocated and read accessible. +The function returns an error code, with 0 meaning OK, and any other value meaning there is an error. + +Finally, a hash value can be produced anytime, by using XXH*_digest(). +This function returns the nn-bits hash as an int or long long. + +It's still possible to continue inserting input into the hash state after a digest, +and generate some new hashes later on, by calling again XXH*_digest(). + +When done, free XXH state space if it was allocated dynamically. +*/ + + +/* ************************** +* Utils +****************************/ +#if !(defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L)) /* ! C99 */ +# define restrict /* disable restrict */ +#endif + +XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dst_state, const XXH32_state_t* restrict src_state); +XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dst_state, const XXH64_state_t* restrict src_state); + + +/* ************************** +* Canonical representation +****************************/ +/* Default result type for XXH functions are primitive unsigned 32 and 64 bits. +* The canonical representation uses human-readable write convention, aka big-endian (large digits first). +* These functions allow transformation of hash result into and from its canonical format. +* This way, hash values can be written into a file / memory, and remain comparable on different systems and programs. +*/ +typedef struct { unsigned char digest[4]; } XXH32_canonical_t; +typedef struct { unsigned char digest[8]; } XXH64_canonical_t; + +XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash); +XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash); + +XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src); +XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src); + +#endif /* XXHASH_H_5627135585666179 */ + + + +/* ================================================================================================ + This section contains definitions which are not guaranteed to remain stable. 
+ They may change in future versions, becoming incompatible with a different version of the library. + They shall only be used with static linking. + Never use these definitions in association with dynamic linking ! +=================================================================================================== */ +#if defined(XXH_STATIC_LINKING_ONLY) && !defined(XXH_STATIC_H_3543687687345) +#define XXH_STATIC_H_3543687687345 + +/* These definitions are only meant to allow allocation of XXH state + statically, on stack, or in a struct for example. + Do not use members directly. */ + + struct XXH32_state_s { + unsigned total_len_32; + unsigned large_len; + unsigned v1; + unsigned v2; + unsigned v3; + unsigned v4; + unsigned mem32[4]; /* buffer defined as U32 for alignment */ + unsigned memsize; + unsigned reserved; /* never read nor write, will be removed in a future version */ + }; /* typedef'd to XXH32_state_t */ + + struct XXH64_state_s { + unsigned long long total_len; + unsigned long long v1; + unsigned long long v2; + unsigned long long v3; + unsigned long long v4; + unsigned long long mem64[4]; /* buffer defined as U64 for alignment */ + unsigned memsize; + unsigned reserved[2]; /* never read nor write, will be removed in a future version */ + }; /* typedef'd to XXH64_state_t */ + + +# ifdef XXH_PRIVATE_API +/**** start inlining xxhash.c ****/ +/* + * xxHash - Fast Hash algorithm + * Copyright (c) 2012-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - xxHash homepage: http://www.xxhash.com + * - xxHash source repository : https://github.com/Cyan4973/xxHash + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +*/ + + +/* ************************************* +* Tuning parameters +***************************************/ +/*!XXH_FORCE_MEMORY_ACCESS : + * By default, access to unaligned memory is controlled by `memcpy()`, which is safe and portable. + * Unfortunately, on some target/compiler combinations, the generated assembly is sub-optimal. + * The below switch allow to select different access method for improved performance. + * Method 0 (default) : use `memcpy()`. Safe and portable. + * Method 1 : `__packed` statement. It depends on compiler extension (ie, not portable). + * This method is safe if your compiler supports it, and *generally* as fast or faster than `memcpy`. + * Method 2 : direct access. This method doesn't depend on compiler but violate C standard. + * It can generate buggy code on targets which do not support unaligned memory accesses. + * But in some circumstances, it's the only known way to get the most performance (ie GCC + ARMv6) + * See http://stackoverflow.com/a/32095106/646947 for details. 
+ * Prefer these methods in priority order (0 > 1 > 2) + */ +#ifndef XXH_FORCE_MEMORY_ACCESS /* can be defined externally, on command line for example */ +# if defined(__GNUC__) && ( defined(__ARM_ARCH_6__) || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) || defined(__ARM_ARCH_6T2__) ) +# define XXH_FORCE_MEMORY_ACCESS 2 +# elif (defined(__INTEL_COMPILER) && !defined(WIN32)) || \ + (defined(__GNUC__) && ( defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) )) || \ + defined(__ICCARM__) +# define XXH_FORCE_MEMORY_ACCESS 1 +# endif +#endif + +/*!XXH_ACCEPT_NULL_INPUT_POINTER : + * If the input pointer is a null pointer, xxHash default behavior is to trigger a memory access error, since it is a bad pointer. + * When this option is enabled, xxHash output for null input pointers will be the same as a null-length input. + * By default, this option is disabled. To enable it, uncomment below define : + */ +/* #define XXH_ACCEPT_NULL_INPUT_POINTER 1 */ + +/*!XXH_FORCE_NATIVE_FORMAT : + * By default, xxHash library provides endian-independent Hash values, based on little-endian convention. + * Results are therefore identical for little-endian and big-endian CPU. + * This comes at a performance cost for big-endian CPU, since some swapping is required to emulate little-endian format. + * Should endian-independence be of no importance for your application, you may set the #define below to 1, + * to improve speed for Big-endian CPU. + * This option has no impact on Little_Endian CPU. + */ +#ifndef XXH_FORCE_NATIVE_FORMAT /* can be defined externally */ +# define XXH_FORCE_NATIVE_FORMAT 0 +#endif + +/*!XXH_FORCE_ALIGN_CHECK : + * This is a minor performance trick, only useful with lots of very small keys. + * It means : check for aligned/unaligned input. + * The check costs one initial branch per hash; set to 0 when the input data + * is guaranteed to be aligned. 
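+ *
+ * For example, a build in which every input buffer is known to be suitably
+ * aligned could set (illustrative compiler invocation) :
+ *   cc -DXXH_FORCE_ALIGN_CHECK=0 -c xxhash.c
+ * Note that this macro and XXH_FORCE_MEMORY_ACCESS above only affect speed;
+ * they never change the computed hash values.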
+ */
+#ifndef XXH_FORCE_ALIGN_CHECK /* can be defined externally */
+# if defined(__i386) || defined(_M_IX86) || defined(__x86_64__) || defined(_M_X64)
+#  define XXH_FORCE_ALIGN_CHECK 0
+# else
+#  define XXH_FORCE_ALIGN_CHECK 1
+# endif
+#endif
+
+
+/* *************************************
+* Includes & Memory related functions
+***************************************/
+/* Modify the local functions below should you wish to use some other memory routines */
+/* for malloc(), free() */
+#include <stdlib.h>
+#include <stddef.h>   /* size_t */
+static void* XXH_malloc(size_t s) { return malloc(s); }
+static void  XXH_free (void* p)   { free(p); }
+/* for memcpy() */
+#include <string.h>
+static void* XXH_memcpy(void* dest, const void* src, size_t size) { return memcpy(dest,src,size); }
+
+#ifndef XXH_STATIC_LINKING_ONLY
+# define XXH_STATIC_LINKING_ONLY
+#endif
+/**** skipping file: xxhash.h ****/
+
+
+/* *************************************
+* Compiler Specific Options
+***************************************/
+#if (defined(__GNUC__) && !defined(__STRICT_ANSI__)) || defined(__cplusplus) || defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L /* C99 */
+# define INLINE_KEYWORD inline
+#else
+# define INLINE_KEYWORD
+#endif
+
+#if defined(__GNUC__) || defined(__ICCARM__)
+# define FORCE_INLINE_ATTR __attribute__((always_inline))
+#elif defined(_MSC_VER)
+# define FORCE_INLINE_ATTR __forceinline
+#else
+# define FORCE_INLINE_ATTR
+#endif
+
+#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
+
+
+#ifdef _MSC_VER
+# pragma warning(disable : 4127) /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* *************************************
+* Basic Types
+***************************************/
+#ifndef MEM_MODULE
+# define MEM_MODULE
+# if !defined (__VMS) && (defined (__cplusplus) || (defined (__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) /* C99 */) )
+#  include <stdint.h>
+   typedef uint8_t  BYTE;
+   typedef uint16_t U16;
+   typedef uint32_t U32;
+   typedef  int32_t S32;
+   typedef uint64_t U64;
+# else
+   typedef unsigned char      BYTE;
+   typedef unsigned short     U16;
+   typedef unsigned int       U32;
+   typedef   signed int       S32;
+   typedef unsigned long long U64;   /* if your compiler doesn't support unsigned long long, replace by another 64-bit type here. Note that xxhash.h will also need to be updated. */
+# endif
+#endif
+
+
+#if (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==2))
+
+/* Force direct memory access. Only works on CPU which support unaligned memory access in hardware */
+static U32 XXH_read32(const void* memPtr) { return *(const U32*) memPtr; }
+static U64 XXH_read64(const void* memPtr) { return *(const U64*) memPtr; }
+
+#elif (defined(XXH_FORCE_MEMORY_ACCESS) && (XXH_FORCE_MEMORY_ACCESS==1))
+
+/* __pack instructions are safer, but compiler specific, hence potentially problematic for some compilers */
+/* currently only defined for gcc and icc */
+typedef union { U32 u32; U64 u64; } __attribute__((packed)) unalign;
+
+static U32 XXH_read32(const void* ptr) { return ((const unalign*)ptr)->u32; }
+static U64 XXH_read64(const void* ptr) { return ((const unalign*)ptr)->u64; }
+
+#else
+
+/* portable and safe solution. Generally efficient.
+ * see : http://stackoverflow.com/a/32095106/646947
+ */
+
+static U32 XXH_read32(const void* memPtr)
+{
+    U32 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+static U64 XXH_read64(const void* memPtr)
+{
+    U64 val;
+    memcpy(&val, memPtr, sizeof(val));
+    return val;
+}
+
+#endif /* XXH_FORCE_DIRECT_MEMORY_ACCESS */
+
+
+/* ****************************************
+* Compiler-specific Functions and Macros
+******************************************/
+#define GCC_VERSION (__GNUC__ * 100 + __GNUC_MINOR__)
+
+/* Note : although _rotl exists for minGW (GCC under windows), performance seems poor */
+#if defined(_MSC_VER)
+# define XXH_rotl32(x,r) _rotl(x,r)
+# define XXH_rotl64(x,r) _rotl64(x,r)
+#else
+#if defined(__ICCARM__)
+# include <intrinsics.h>
+# define XXH_rotl32(x,r) __ROR(x,(32 - r))
+#else
+# define XXH_rotl32(x,r) ((x << r) | (x >> (32 - r)))
+#endif
+# define XXH_rotl64(x,r) ((x << r) | (x >> (64 - r)))
+#endif
+
+#if defined(_MSC_VER)     /* Visual Studio */
+# define XXH_swap32 _byteswap_ulong
+# define XXH_swap64 _byteswap_uint64
+#elif GCC_VERSION >= 403
+# define XXH_swap32 __builtin_bswap32
+# define XXH_swap64 __builtin_bswap64
+#else
+static U32 XXH_swap32 (U32 x)
+{
+    return  ((x << 24) & 0xff000000 ) |
+            ((x <<  8) & 0x00ff0000 ) |
+            ((x >>  8) & 0x0000ff00 ) |
+            ((x >> 24) & 0x000000ff );
+}
+static U64 XXH_swap64 (U64 x)
+{
+    return  ((x << 56) & 0xff00000000000000ULL) |
+            ((x << 40) & 0x00ff000000000000ULL) |
+            ((x << 24) & 0x0000ff0000000000ULL) |
+            ((x << 8)  & 0x000000ff00000000ULL) |
+            ((x >> 8)  & 0x00000000ff000000ULL) |
+            ((x >> 24) & 0x0000000000ff0000ULL) |
+            ((x >> 40) & 0x000000000000ff00ULL) |
+            ((x >> 56) & 0x00000000000000ffULL);
+}
+#endif
+
+
+/* *************************************
+* Architecture Macros
+***************************************/
+typedef enum { XXH_bigEndian=0, XXH_littleEndian=1 } XXH_endianess;
+
+/* XXH_CPU_LITTLE_ENDIAN can be defined externally, for example on the compiler command line */
+#ifndef XXH_CPU_LITTLE_ENDIAN
+    static const int g_one = 1;
+#   define XXH_CPU_LITTLE_ENDIAN   (*(const char*)(&g_one))
+#endif
+
+
+/* ***************************
+* Memory reads
+*****************************/
+typedef enum { XXH_aligned, XXH_unaligned } XXH_alignment;
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read32(ptr) : XXH_swap32(XXH_read32(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U32*)ptr : XXH_swap32(*(const U32*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH_readLE32(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE32_align(ptr, endian, XXH_unaligned);
+}
+
+static U32 XXH_readBE32(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap32(XXH_read32(ptr)) : XXH_read32(ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64_align(const void* ptr, XXH_endianess endian, XXH_alignment align)
+{
+    if (align==XXH_unaligned)
+        return endian==XXH_littleEndian ? XXH_read64(ptr) : XXH_swap64(XXH_read64(ptr));
+    else
+        return endian==XXH_littleEndian ? *(const U64*)ptr : XXH_swap64(*(const U64*)ptr);
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH_readLE64(const void* ptr, XXH_endianess endian)
+{
+    return XXH_readLE64_align(ptr, endian, XXH_unaligned);
+}
+
+static U64 XXH_readBE64(const void* ptr)
+{
+    return XXH_CPU_LITTLE_ENDIAN ? XXH_swap64(XXH_read64(ptr)) : XXH_read64(ptr);
+}
+
+
+/* *************************************
+* Macros
+***************************************/
+#define XXH_STATIC_ASSERT(c)   { enum { XXH_static_assert = 1/(int)(!!(c)) }; }    /* use only *after* variable declarations */
+
+
+/* *************************************
+* Constants
+***************************************/
+static const U32 PRIME32_1 = 2654435761U;
+static const U32 PRIME32_2 = 2246822519U;
+static const U32 PRIME32_3 = 3266489917U;
+static const U32 PRIME32_4 =  668265263U;
+static const U32 PRIME32_5 =  374761393U;
+
+static const U64 PRIME64_1 = 11400714785074694791ULL;
+static const U64 PRIME64_2 = 14029467366897019727ULL;
+static const U64 PRIME64_3 =  1609587929392839161ULL;
+static const U64 PRIME64_4 =  9650029242287828579ULL;
+static const U64 PRIME64_5 =  2870177450012600261ULL;
+
+XXH_PUBLIC_API unsigned XXH_versionNumber (void) { return XXH_VERSION_NUMBER; }
+
+
+/* **************************
+*  Utils
+****************************/
+XXH_PUBLIC_API void XXH32_copyState(XXH32_state_t* restrict dstState, const XXH32_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+XXH_PUBLIC_API void XXH64_copyState(XXH64_state_t* restrict dstState, const XXH64_state_t* restrict srcState)
+{
+    memcpy(dstState, srcState, sizeof(*dstState));
+}
+
+
+/* ***************************
+* Simple Hash Functions
+*****************************/
+
+static U32 XXH32_round(U32 seed, U32 input)
+{
+    seed += input * PRIME32_2;
+    seed  = XXH_rotl32(seed, 13);
+    seed *= PRIME32_1;
+    return seed;
+}
+
+FORCE_INLINE_TEMPLATE U32 XXH32_endian_align(const void* input, size_t len, U32 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* bEnd = p + len;
+    U32 h32;
+#define XXH_get32bits(p) XXH_readLE32_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)16;
+    }
+#endif
+
+    if (len>=16) {
+        const BYTE* const limit = bEnd - 16;
+        U32 v1 = seed + PRIME32_1 + PRIME32_2;
+        U32 v2 = seed + PRIME32_2;
+        U32 v3 = seed + 0;
+        U32 v4 = seed - PRIME32_1;
+
+        do {
+            v1 = XXH32_round(v1, XXH_get32bits(p)); p+=4;
+            v2 = XXH32_round(v2, XXH_get32bits(p)); p+=4;
+            v3 = XXH32_round(v3, XXH_get32bits(p)); p+=4;
+            v4 = XXH32_round(v4, XXH_get32bits(p)); p+=4;
+        } while (p<=limit);
+
+        h32 = XXH_rotl32(v1, 1) + XXH_rotl32(v2, 7) + XXH_rotl32(v3, 12) + XXH_rotl32(v4, 18);
+    } else {
+        h32 = seed + PRIME32_5;
+    }
+
+    h32 += (U32) len;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_get32bits(p) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4 ;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32  = XXH_rotl32(h32, 11) * PRIME32_1 ;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32 (const void* input, size_t len, unsigned int seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH32_CREATESTATE_STATIC(state);
+    XXH32_reset(state, seed);
+    XXH32_update(state, input, len);
+    return XXH32_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 3) == 0) {   /* Input is 4-bytes aligned, leverage the speed benefit */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH32_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+static U64 XXH64_round(U64 acc, U64 input)
+{
+    acc += input * PRIME64_2;
+    acc  = XXH_rotl64(acc, 31);
+    acc *= PRIME64_1;
+    return acc;
+}
+
+static U64 XXH64_mergeRound(U64 acc, U64 val)
+{
+    val  = XXH64_round(0, val);
+    acc ^= val;
+    acc  = acc * PRIME64_1 + PRIME64_4;
+    return acc;
+}
+
+FORCE_INLINE_TEMPLATE U64 XXH64_endian_align(const void* input, size_t len, U64 seed, XXH_endianess endian, XXH_alignment align)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+    U64 h64;
+#define XXH_get64bits(p) XXH_readLE64_align(p, endian, align)
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (p==NULL) {
+        len=0;
+        bEnd=p=(const BYTE*)(size_t)32;
+    }
+#endif
+
+    if (len>=32) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = seed + PRIME64_1 + PRIME64_2;
+        U64 v2 = seed + PRIME64_2;
+        U64 v3 = seed + 0;
+        U64 v4 = seed - PRIME64_1;
+
+        do {
+            v1 = XXH64_round(v1, XXH_get64bits(p)); p+=8;
+            v2 = XXH64_round(v2, XXH_get64bits(p)); p+=8;
+            v3 = XXH64_round(v3, XXH_get64bits(p)); p+=8;
+            v4 = XXH64_round(v4, XXH_get64bits(p)); p+=8;
+        } while (p<=limit);
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+
+    } else {
+        h64 = seed + PRIME64_5;
+    }
+
+    h64 += (U64) len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_get64bits(p));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_get32bits(p)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64 (const void* input, size_t len, unsigned long long seed)
+{
+#if 0
+    /* Simple version, good for code maintenance, but unfortunately slow for small inputs */
+    XXH64_CREATESTATE_STATIC(state);
+    XXH64_reset(state, seed);
+    XXH64_update(state, input, len);
+    return XXH64_digest(state);
+#else
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if (XXH_FORCE_ALIGN_CHECK) {
+        if ((((size_t)input) & 7)==0) {  /* Input is aligned, let's leverage the speed advantage */
+            if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+                return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_aligned);
+            else
+                return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_aligned);
+    }   }
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_endian_align(input, len, seed, XXH_littleEndian, XXH_unaligned);
+    else
+        return XXH64_endian_align(input, len, seed, XXH_bigEndian, XXH_unaligned);
+#endif
+}
+
+
+/* **************************************************
+* Advanced Hash Functions
+****************************************************/
+
+XXH_PUBLIC_API XXH32_state_t* XXH32_createState(void)
+{
+    return (XXH32_state_t*)XXH_malloc(sizeof(XXH32_state_t));
+}
+XXH_PUBLIC_API XXH_errorcode XXH32_freeState(XXH32_state_t* statePtr)
+{
+    XXH_free(statePtr);
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH64_state_t* XXH64_createState(void)
+{
+    return (XXH64_state_t*)XXH_malloc(sizeof(XXH64_state_t));
+}
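+
+/* Illustrative use of the streaming hash interface declared in xxhash.h :
+ * allocate a state, reset it with a seed, feed input in segments, then
+ * produce the digest. Hypothetical caller code, error checks elided;
+ * `buf`, `n` and `f` are placeholders :
+ *
+ *     XXH64_state_t* const st = XXH64_createState();
+ *     XXH64_reset(st, 0);                              // 0 = seed
+ *     while ((n = fread(buf, 1, sizeof(buf), f)) > 0)
+ *         XXH64_update(st, buf, n);
+ *     unsigned long long const h = XXH64_digest(st);   // may digest again later
+ *     XXH64_freeState(st);
+ */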
+XXH_PUBLIC_API XXH_errorcode XXH64_freeState(XXH64_state_t* statePtr) +{ + XXH_free(statePtr); + return XXH_OK; +} + + +/*** Hash feed ***/ + +XXH_PUBLIC_API XXH_errorcode XXH32_reset(XXH32_state_t* statePtr, unsigned int seed) +{ + XXH32_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)-4); /* do not write into reserved, for future removal */ + state.v1 = seed + PRIME32_1 + PRIME32_2; + state.v2 = seed + PRIME32_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME32_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +XXH_PUBLIC_API XXH_errorcode XXH64_reset(XXH64_state_t* statePtr, unsigned long long seed) +{ + XXH64_state_t state; /* using a local state to memcpy() in order to avoid strict-aliasing warnings */ + memset(&state, 0, sizeof(state)-8); /* do not write into reserved, for future removal */ + state.v1 = seed + PRIME64_1 + PRIME64_2; + state.v2 = seed + PRIME64_2; + state.v3 = seed + 0; + state.v4 = seed - PRIME64_1; + memcpy(statePtr, &state, sizeof(state)); + return XXH_OK; +} + + +FORCE_INLINE_TEMPLATE XXH_errorcode XXH32_update_endian (XXH32_state_t* state, const void* input, size_t len, XXH_endianess endian) +{ + const BYTE* p = (const BYTE*)input; + const BYTE* const bEnd = p + len; + +#ifdef XXH_ACCEPT_NULL_INPUT_POINTER + if (input==NULL) return XXH_ERROR; +#endif + + state->total_len_32 += (unsigned)len; + state->large_len |= (len>=16) | (state->total_len_32>=16); + + if (state->memsize + len < 16) { /* fill in tmp buffer */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, len); + state->memsize += (unsigned)len; + return XXH_OK; + } + + if (state->memsize) { /* some data left from previous update */ + XXH_memcpy((BYTE*)(state->mem32) + state->memsize, input, 16-state->memsize); + { const U32* p32 = state->mem32; + state->v1 = XXH32_round(state->v1, XXH_readLE32(p32, endian)); p32++; + state->v2 = XXH32_round(state->v2, XXH_readLE32(p32, endian)); p32++; + state->v3 = XXH32_round(state->v3, XXH_readLE32(p32, endian)); p32++; + state->v4 = XXH32_round(state->v4, XXH_readLE32(p32, endian)); p32++; + } + p += 16-state->memsize; + state->memsize = 0; + } + + if (p <= bEnd-16) { + const BYTE* const limit = bEnd - 16; + U32 v1 = state->v1; + U32 v2 = state->v2; + U32 v3 = state->v3; + U32 v4 = state->v4; + + do { + v1 = XXH32_round(v1, XXH_readLE32(p, endian)); p+=4; + v2 = XXH32_round(v2, XXH_readLE32(p, endian)); p+=4; + v3 = XXH32_round(v3, XXH_readLE32(p, endian)); p+=4; + v4 = XXH32_round(v4, XXH_readLE32(p, endian)); p+=4; + } while (p<=limit); + + state->v1 = v1; + state->v2 = v2; + state->v3 = v3; + state->v4 = v4; + } + + if (p < bEnd) { + XXH_memcpy(state->mem32, p, (size_t)(bEnd-p)); + state->memsize = (unsigned)(bEnd-p); + } + + return XXH_OK; +} + +XXH_PUBLIC_API XXH_errorcode XXH32_update (XXH32_state_t* state_in, const void* input, size_t len) +{ + XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN; + + if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT) + return XXH32_update_endian(state_in, input, len, XXH_littleEndian); + else + return XXH32_update_endian(state_in, input, len, XXH_bigEndian); +} + + + +FORCE_INLINE_TEMPLATE U32 XXH32_digest_endian (const XXH32_state_t* state, XXH_endianess endian) +{ + const BYTE * p = (const BYTE*)state->mem32; + const BYTE* const bEnd = (const BYTE*)(state->mem32) + state->memsize; + U32 h32; + + if (state->large_len) { + h32 = XXH_rotl32(state->v1, 1) + XXH_rotl32(state->v2, 7) + 
XXH_rotl32(state->v3, 12) + XXH_rotl32(state->v4, 18);
+    } else {
+        h32 = state->v3 /* == seed */ + PRIME32_5;
+    }
+
+    h32 += state->total_len_32;
+
+    while (p+4<=bEnd) {
+        h32 += XXH_readLE32(p, endian) * PRIME32_3;
+        h32  = XXH_rotl32(h32, 17) * PRIME32_4;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h32 += (*p) * PRIME32_5;
+        h32  = XXH_rotl32(h32, 11) * PRIME32_1;
+        p++;
+    }
+
+    h32 ^= h32 >> 15;
+    h32 *= PRIME32_2;
+    h32 ^= h32 >> 13;
+    h32 *= PRIME32_3;
+    h32 ^= h32 >> 16;
+
+    return h32;
+}
+
+
+XXH_PUBLIC_API unsigned int XXH32_digest (const XXH32_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH32_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH32_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+
+/* **** XXH64 **** */
+
+FORCE_INLINE_TEMPLATE XXH_errorcode XXH64_update_endian (XXH64_state_t* state, const void* input, size_t len, XXH_endianess endian)
+{
+    const BYTE* p = (const BYTE*)input;
+    const BYTE* const bEnd = p + len;
+
+#ifdef XXH_ACCEPT_NULL_INPUT_POINTER
+    if (input==NULL) return XXH_ERROR;
+#endif
+
+    state->total_len += len;
+
+    if (state->memsize + len < 32) {  /* fill in tmp buffer */
+        if (input != NULL) {
+            XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, len);
+        }
+        state->memsize += (U32)len;
+        return XXH_OK;
+    }
+
+    if (state->memsize) {   /* tmp buffer is full */
+        XXH_memcpy(((BYTE*)state->mem64) + state->memsize, input, 32-state->memsize);
+        state->v1 = XXH64_round(state->v1, XXH_readLE64(state->mem64+0, endian));
+        state->v2 = XXH64_round(state->v2, XXH_readLE64(state->mem64+1, endian));
+        state->v3 = XXH64_round(state->v3, XXH_readLE64(state->mem64+2, endian));
+        state->v4 = XXH64_round(state->v4, XXH_readLE64(state->mem64+3, endian));
+        p += 32-state->memsize;
+        state->memsize = 0;
+    }
+
+    if (p+32 <= bEnd) {
+        const BYTE* const limit = bEnd - 32;
+        U64 v1 = state->v1;
+        U64 v2 = state->v2;
+        U64 v3 = state->v3;
+        U64 v4 = state->v4;
+
+        do {
+            v1 = XXH64_round(v1, XXH_readLE64(p, endian)); p+=8;
+            v2 = XXH64_round(v2, XXH_readLE64(p, endian)); p+=8;
+            v3 = XXH64_round(v3, XXH_readLE64(p, endian)); p+=8;
+            v4 = XXH64_round(v4, XXH_readLE64(p, endian)); p+=8;
+        } while (p<=limit);
+
+        state->v1 = v1;
+        state->v2 = v2;
+        state->v3 = v3;
+        state->v4 = v4;
+    }
+
+    if (p < bEnd) {
+        XXH_memcpy(state->mem64, p, (size_t)(bEnd-p));
+        state->memsize = (unsigned)(bEnd-p);
+    }
+
+    return XXH_OK;
+}
+
+XXH_PUBLIC_API XXH_errorcode XXH64_update (XXH64_state_t* state_in, const void* input, size_t len)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_update_endian(state_in, input, len, XXH_littleEndian);
+    else
+        return XXH64_update_endian(state_in, input, len, XXH_bigEndian);
+}
+
+
+
+FORCE_INLINE_TEMPLATE U64 XXH64_digest_endian (const XXH64_state_t* state, XXH_endianess endian)
+{
+    const BYTE * p = (const BYTE*)state->mem64;
+    const BYTE* const bEnd = (const BYTE*)state->mem64 + state->memsize;
+    U64 h64;
+
+    if (state->total_len >= 32) {
+        U64 const v1 = state->v1;
+        U64 const v2 = state->v2;
+        U64 const v3 = state->v3;
+        U64 const v4 = state->v4;
+
+        h64 = XXH_rotl64(v1, 1) + XXH_rotl64(v2, 7) + XXH_rotl64(v3, 12) + XXH_rotl64(v4, 18);
+        h64 = XXH64_mergeRound(h64, v1);
+        h64 = XXH64_mergeRound(h64, v2);
+        h64 = XXH64_mergeRound(h64, v3);
+        h64 = XXH64_mergeRound(h64, v4);
+    } else {
+        h64 = state->v3 + PRIME64_5;
+    }
+
+    h64 += (U64) state->total_len;
+
+    while (p+8<=bEnd) {
+        U64 const k1 = XXH64_round(0, XXH_readLE64(p, endian));
+        h64 ^= k1;
+        h64  = XXH_rotl64(h64,27) * PRIME64_1 + PRIME64_4;
+        p+=8;
+    }
+
+    if (p+4<=bEnd) {
+        h64 ^= (U64)(XXH_readLE32(p, endian)) * PRIME64_1;
+        h64 = XXH_rotl64(h64, 23) * PRIME64_2 + PRIME64_3;
+        p+=4;
+    }
+
+    while (p<bEnd) {
+        h64 ^= (*p) * PRIME64_5;
+        h64 = XXH_rotl64(h64, 11) * PRIME64_1;
+        p++;
+    }
+
+    h64 ^= h64 >> 33;
+    h64 *= PRIME64_2;
+    h64 ^= h64 >> 29;
+    h64 *= PRIME64_3;
+    h64 ^= h64 >> 32;
+
+    return h64;
+}
+
+
+XXH_PUBLIC_API unsigned long long XXH64_digest (const XXH64_state_t* state_in)
+{
+    XXH_endianess endian_detected = (XXH_endianess)XXH_CPU_LITTLE_ENDIAN;
+
+    if ((endian_detected==XXH_littleEndian) || XXH_FORCE_NATIVE_FORMAT)
+        return XXH64_digest_endian(state_in, XXH_littleEndian);
+    else
+        return XXH64_digest_endian(state_in, XXH_bigEndian);
+}
+
+
+/* **************************
+* Canonical representation
+****************************/
+
+/*! Default XXH result types are basic unsigned 32 and 64 bits.
+*   The canonical representation follows human-readable write convention, aka big-endian (large digits first).
+*   These functions allow transformation of hash result into and from its canonical format.
+*   This way, hash values can be written into a file or buffer, and remain comparable across different systems and programs.
+*/
+
+XXH_PUBLIC_API void XXH32_canonicalFromHash(XXH32_canonical_t* dst, XXH32_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH32_canonical_t) == sizeof(XXH32_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap32(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API void XXH64_canonicalFromHash(XXH64_canonical_t* dst, XXH64_hash_t hash)
+{
+    XXH_STATIC_ASSERT(sizeof(XXH64_canonical_t) == sizeof(XXH64_hash_t));
+    if (XXH_CPU_LITTLE_ENDIAN) hash = XXH_swap64(hash);
+    memcpy(dst, &hash, sizeof(*dst));
+}
+
+XXH_PUBLIC_API XXH32_hash_t XXH32_hashFromCanonical(const XXH32_canonical_t* src)
+{
+    return XXH_readBE32(src);
+}
+
+XXH_PUBLIC_API XXH64_hash_t XXH64_hashFromCanonical(const XXH64_canonical_t* src)
+{
+    return XXH_readBE64(src);
+}
+/**** ended inlining xxhash.c ****/
+# endif
+
+#endif /* XXH_STATIC_LINKING_ONLY && XXH_STATIC_H_3543687687345 */
+
+
+#if defined (__cplusplus)
+}
+#endif
+/**** ended inlining xxhash.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/* ---- static assert (debug) --- */
+#define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)
+#define ZSTD_isError ERR_isError   /* for inlining */
+#define FSE_isError  ERR_isError
+#define HUF_isError  ERR_isError
+
+
+/*-*************************************
+*  shared macros
+***************************************/
+#undef MIN
+#undef MAX
+#define MIN(a,b) ((a)<(b) ? (a) : (b))
+#define MAX(a,b) ((a)>(b) ? (a) : (b))
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * This is a helper function to help force C99-correctness during compilation.
+ * Under strict compilation modes, variadic macro arguments can't be empty.
+ * However, variadic function arguments can be. Using a function therefore lets
+ * us statically check that at least one (string) argument was passed,
+ * independent of the compilation flags.
+ */
+static INLINE_KEYWORD UNUSED_ATTR
+void _force_has_format_string(const char *format, ...) {
+  (void)format;
+}
+
+/**
+ * Ignore: this is an internal helper.
+ *
+ * We want to force this function invocation to be syntactically correct, but
+ * we don't want to force runtime evaluation of its arguments.
+ */
+#define _FORCE_HAS_FORMAT_STRING(...) \
+  if (0) { \
+    _force_has_format_string(__VA_ARGS__); \
+  }
+
+/**
+ * Return the specified error if the condition evaluates to true.
+ *
+ * In debug modes, prints additional information.
+ * In order to do that (particularly, printing the conditional that failed),
+ * this can't just wrap RETURN_ERROR().
+ */
+#define RETURN_ERROR_IF(cond, err, ...) \
+  if (cond) { \
+    RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \
+           __FILE__, __LINE__, ZSTD_QUOTE(cond), ZSTD_QUOTE(ERROR(err))); \
+    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+    RAWLOG(3, ": " __VA_ARGS__); \
+    RAWLOG(3, "\n"); \
+    return ERROR(err); \
+  }
+
+/**
+ * Unconditionally return the specified error.
+ *
+ * In debug modes, prints additional information.
+ */
+#define RETURN_ERROR(err, ...) \
+  do { \
+    RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \
+           __FILE__, __LINE__, ZSTD_QUOTE(ERROR(err))); \
+    _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+    RAWLOG(3, ": " __VA_ARGS__); \
+    RAWLOG(3, "\n"); \
+    return ERROR(err); \
+  } while(0);
+
+/**
+ * If the provided expression evaluates to an error code, returns that error code.
+ *
+ * In debug modes, prints additional information.
+ */
+#define FORWARD_IF_ERROR(err, ...) \
+  do { \
+    size_t const err_code = (err); \
+    if (ERR_isError(err_code)) { \
+      RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \
+             __FILE__, __LINE__, ZSTD_QUOTE(err), ERR_getErrorName(err_code)); \
+      _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \
+      RAWLOG(3, ": " __VA_ARGS__); \
+      RAWLOG(3, "\n"); \
+      return err_code; \
+    } \
+  } while(0);
+
+
+/*-*************************************
+*  Common constants
+***************************************/
+#define ZSTD_OPT_NUM    (1<<12)
+
+#define ZSTD_REP_NUM      3                 /* number of repcodes */
+#define ZSTD_REP_MOVE     (ZSTD_REP_NUM-1)
+static const U32 repStartValue[ZSTD_REP_NUM] = { 1, 4, 8 };
+
+#define KB *(1 <<10)
+#define MB *(1 <<20)
+#define GB *(1U<<30)
+
+#define BIT7 128
+#define BIT6  64
+#define BIT5  32
+#define BIT4  16
+#define BIT1   2
+#define BIT0   1
+
+#define ZSTD_WINDOWLOG_ABSOLUTEMIN 10
+static const size_t ZSTD_fcs_fieldSize[4] = { 0, 2, 4, 8 };
+static const size_t ZSTD_did_fieldSize[4] = { 0, 1, 2, 4 };
+
+#define ZSTD_FRAMEIDSIZE 4   /* magic number size */
+
+#define ZSTD_BLOCKHEADERSIZE 3   /* C standard doesn't allow `static const` variable to be init using another `static const` variable */
+static const size_t ZSTD_blockHeaderSize = ZSTD_BLOCKHEADERSIZE;
+typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+
+#define ZSTD_FRAMECHECKSUMSIZE 4
+
+#define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */)   /* for a non-null block */
+
+#define HufLog 12
+typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+#define LONGNBSEQ 0x7F00
+
+#define MINMATCH 3
+
+#define Litbits  8
+#define MaxLit ((1<<Litbits) - 1)
+#define MaxML   52
+#define MaxLL   35
+#define DefaultMaxOff 28
+#define MaxOff  31
+#define MaxSeq MAX(MaxLL, MaxML)   /* Assumption : MaxOff < MaxLL,MaxML */
+#define MLFSELog    9
+#define LLFSELog    9
+#define OffFSELog   8
+#define MaxFSELog  MAX(MAX(MLFSELog, LLFSELog), OffFSELog)
+
+static const U32 LL_bits[MaxLL+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3,
+                                      4, 6, 7, 8, 9,10,11,12,
+                                     13,14,15,16 };
+static const S16 LL_defaultNorm[MaxLL+1] = { 4, 3, 2, 2, 2, 2, 2, 2,
+                                             2, 2, 2, 2, 2, 1, 1, 1,
+                                             2, 2, 2, 2, 2, 2, 2, 2,
+                                             2, 3, 2, 1, 1, 1, 1, 1,
+                                            -1,-1,-1,-1 };
+#define LL_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 LL_defaultNormLog = LL_DEFAULTNORMLOG;
+
+static const U32 ML_bits[MaxML+1] = { 0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      0, 0, 0, 0, 0, 0, 0, 0,
+                                      1, 1, 1, 1, 2, 2, 3, 3,
+                                      4, 4, 5, 7, 8, 9,10,11,
+                                     12,13,14,15,16 };
+static const S16 ML_defaultNorm[MaxML+1] = { 1, 4, 3, 2, 2, 2, 2, 2,
+                                             2, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1, 1, 1,
+                                             1, 1, 1, 1, 1, 1,-1,-1,
+                                            -1,-1,-1,-1,-1 };
+#define ML_DEFAULTNORMLOG 6  /* for static allocation */
+static const U32 ML_defaultNormLog = ML_DEFAULTNORMLOG;
+
+static const S16 OF_defaultNorm[DefaultMaxOff+1] = { 1, 1, 1, 1, 1, 1, 2, 2,
+                                                     2, 1, 1, 1, 1, 1, 1, 1,
+                                                     1, 1, 1, 1, 1, 1, 1, 1,
+                                                    -1,-1,-1,-1,-1 };
+#define OF_DEFAULTNORMLOG 5  /* for static allocation */
+static const U32 OF_defaultNormLog = OF_DEFAULTNORMLOG;
+
+
+/*-*******************************************
+*  Shared functions to include for inlining
+*********************************************/
+static void ZSTD_copy8(void* dst, const void* src) {
+#ifdef __aarch64__
+    vst1_u8((uint8_t*)dst, vld1_u8((const uint8_t*)src));
+#else
+    memcpy(dst, src, 8);
+#endif
+}
+
+#define COPY8(d,s) { ZSTD_copy8(d,s); d+=8; s+=8; }
+static void ZSTD_copy16(void* dst, const void* src) {
+#ifdef __aarch64__
+    vst1q_u8((uint8_t*)dst, vld1q_u8((const uint8_t*)src));
+#else
+    memcpy(dst, src, 16);
+#endif
+}
+#define COPY16(d,s) { ZSTD_copy16(d,s); d+=16; s+=16; }
+
+#define WILDCOPY_OVERLENGTH 32
+#define WILDCOPY_VECLEN 16
+
+typedef enum {
+    ZSTD_no_overlap,
+    ZSTD_overlap_src_before_dst
+    /*  ZSTD_overlap_dst_before_src, */
+} ZSTD_overlap_e;
+
+/*! ZSTD_wildcopy() :
+ *  Custom version of memcpy(), can over read/write up to WILDCOPY_OVERLENGTH bytes (if length==0)
+ *  @param ovtype controls the overlap detection
+ *         - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart.
+ *         - ZSTD_overlap_src_before_dst: The src and dst may overlap, but they MUST be at least 8 bytes apart.
+ *           The src buffer must be before the dst buffer.
+ */
+MEM_STATIC FORCE_INLINE_ATTR
+void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e const ovtype)
+{
+    ptrdiff_t diff = (BYTE*)dst - (const BYTE*)src;
+    const BYTE* ip = (const BYTE*)src;
+    BYTE* op = (BYTE*)dst;
+    BYTE* const oend = op + length;
+
+    assert(diff >= 8 || (ovtype == ZSTD_no_overlap && diff <= -WILDCOPY_VECLEN));
+
+    if (ovtype == ZSTD_overlap_src_before_dst && diff < WILDCOPY_VECLEN) {
+        /* Handle short offset copies. */
+        do {
+            COPY8(op, ip)
+        } while (op < oend);
+    } else {
+        assert(diff >= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN);
+        /* Separate out the first COPY16() call because the copy length is
+         * almost certain to be short, so the branches have different
+         * probabilities. Since it is almost certain to be short, only do
+         * one COPY16() in the first call. Then, do two calls per loop since
+         * at that point it is more likely to have a high trip count.
+ */ +#ifndef __aarch64__ + do { + COPY16(op, ip); + } + while (op < oend); +#else + COPY16(op, ip); + if (op >= oend) return; + do { + COPY16(op, ip); + COPY16(op, ip); + } + while (op < oend); +#endif + } +} + +MEM_STATIC size_t ZSTD_limitCopy(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + size_t const length = MIN(dstCapacity, srcSize); + if (length > 0) { + memcpy(dst, src, length); + } + return length; +} + +/* define "workspace is too large" as this number of times larger than needed */ +#define ZSTD_WORKSPACETOOLARGE_FACTOR 3 + +/* when workspace is continuously too large + * during at least this number of times, + * context's memory usage is considered wasteful, + * because it's sized to handle a worst case scenario which rarely happens. + * In which case, resize it down to free some memory */ +#define ZSTD_WORKSPACETOOLARGE_MAXDURATION 128 + + +/*-******************************************* +* Private declarations +*********************************************/ +typedef struct seqDef_s { + U32 offset; + U16 litLength; + U16 matchLength; +} seqDef; + +typedef struct { + seqDef* sequencesStart; + seqDef* sequences; + BYTE* litStart; + BYTE* lit; + BYTE* llCode; + BYTE* mlCode; + BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + U32 longLengthID; /* 0 == no longLength; 1 == Lit.longLength; 2 == Match.longLength; */ + U32 longLengthPos; +} seqStore_t; + +typedef struct { + U32 litLength; + U32 matchLength; +} ZSTD_sequenceLength; + +/** + * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences + * indicated by longLengthPos and longLengthID, and adds MINMATCH back to matchLength. + */ +MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +{ + ZSTD_sequenceLength seqLen; + seqLen.litLength = seq->litLength; + seqLen.matchLength = seq->matchLength + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthID == 1) { + seqLen.litLength += 0xFFFF; + } + if (seqStore->longLengthID == 2) { + seqLen.matchLength += 0xFFFF; + } + } + return seqLen; +} + +/** + * Contains the compressed frame size and an upper-bound for the decompressed frame size. + * Note: before using `compressedSize`, check for errors using ZSTD_isError(). + * similarly, before using `decompressedBound`, check for errors using: + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ +typedef struct { + size_t compressedSize; + unsigned long long decompressedBound; +} ZSTD_frameSizeInfo; /* decompress & legacy */ + +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + +/* custom memory allocation functions */ +void* ZSTD_malloc(size_t size, ZSTD_customMem customMem); +void* ZSTD_calloc(size_t size, ZSTD_customMem customMem); +void ZSTD_free(void* ptr, ZSTD_customMem customMem); + + +MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +{ + assert(val != 0); + { +# if defined(_MSC_VER) /* Visual */ + unsigned long r=0; + return _BitScanReverse(&r, val) ? 
(unsigned)r : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)   /* GCC Intrinsic */
+    return __builtin_clz (val) ^ 31;
+# elif defined(__ICCARM__)    /* IAR Intrinsic */
+    return 31 - __CLZ(val);
+# else   /* Software version */
+    static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 };
+    U32 v = val;
+    v |= v >> 1;
+    v |= v >> 2;
+    v |= v >> 4;
+    v |= v >> 8;
+    v |= v >> 16;
+    return DeBruijnClz[(v * 0x07C4ACDDU) >> 27];
+# endif
+    }
+}
+
+
+/* ZSTD_invalidateRepCodes() :
+ * ensures next compression will not use repcodes from previous block.
+ * Note : only works with regular variant;
+ *        do not use with extDict variant ! */
+void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx);   /* zstdmt, adaptive_compression (shouldn't get this definition from here) */
+
+
+typedef struct {
+    blockType_e blockType;
+    U32 lastBlock;
+    U32 origSize;
+} blockProperties_t;   /* declared here for decompress and fullbench */
+
+/*! ZSTD_getcBlockSize() :
+ *  Provides the size of compressed block from block header `src` */
+/*  Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_getcBlockSize(const void* src, size_t srcSize,
+                          blockProperties_t* bpPtr);
+
+/*! ZSTD_decodeSeqHeaders() :
+ *  decode sequence header from src */
+/*  Used by: decompress, fullbench (does not get its definition from here) */
+size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr,
+                       const void* src, size_t srcSize);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif   /* ZSTD_CCOMMON_H_MODULE */
+/**** ended inlining zstd_internal.h ****/
+/**** start inlining pool.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef POOL_H
+#define POOL_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+
+#include <stddef.h>   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_customMem */
+/**** skipping file: ../zstd.h ****/
+
+typedef struct POOL_ctx_s POOL_ctx;
+
+/*! POOL_create() :
+ *  Create a thread pool with at most `numThreads` threads.
+ * `numThreads` must be at least 1.
+ *  The maximum number of queued jobs before blocking is `queueSize`.
+ * @return : POOL_ctx pointer on success, else NULL.
+*/
+POOL_ctx* POOL_create(size_t numThreads, size_t queueSize);
+
+POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize,
+                               ZSTD_customMem customMem);
+
+/*! POOL_free() :
+ *  Free a thread pool returned by POOL_create().
+ */
+void POOL_free(POOL_ctx* ctx);
+
+/*! POOL_resize() :
+ *  Expands or shrinks pool's number of threads.
+ *  This is more efficient than releasing + creating a new context,
+ *  since it tries to preserve and re-use existing threads.
+ * `numThreads` must be at least 1.
+ * @return : 0 when resize was successful,
+ *           !0 (typically 1) if there is an error.
+ *    note : only numThreads can be resized, queueSize remains unchanged.
+ */
+int POOL_resize(POOL_ctx* ctx, size_t numThreads);
+
+/*! POOL_sizeof() :
+ * @return threadpool memory usage
+ *  note : compatible with NULL (returns 0 in this case)
+ */
+size_t POOL_sizeof(POOL_ctx* ctx);
+
+/*! POOL_function :
+ *  The function type that can be added to a thread pool.
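+ *
+ *  Illustrative usage sketch (the names myJob, myState and do_work are
+ *  hypothetical, not part of this header) :
+ *      static void myJob(void* opaque) { do_work(opaque); }
+ *      POOL_ctx* const pool = POOL_create(4, 8);   <- 4 workers, queue of 8
+ *      if (pool != NULL) {
+ *          POOL_add(pool, myJob, &myState);   <- may block while the queue is full
+ *          POOL_free(pool);                   <- joins all workers, then frees
+ *      }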
+ */
+typedef void (*POOL_function)(void*);
+
+/*! POOL_add() :
+ *  Add the job `function(opaque)` to the thread pool. `ctx` must be valid.
+ *  Possibly blocks until there is room in the queue.
+ *  Note : The function may be executed asynchronously,
+ *         therefore, `opaque` must live until function has been completed.
+ */
+void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+/*! POOL_tryAdd() :
+ *  Add the job `function(opaque)` to thread pool _if_ a worker is available.
+ *  Returns immediately even if not (does not block).
+ * @return : 1 if successful, 0 if not.
+ */
+int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque);
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif
+/**** ended inlining pool.h ****/
+
+/* ======   Compiler specifics   ====== */
+#if defined(_MSC_VER)
+#  pragma warning(disable : 4204)        /* disable: C4204: non-constant aggregate initializer */
+#endif
+
+
+#ifdef ZSTD_MULTITHREAD
+
+/**** start inlining threading.h ****/
+/**
+ * Copyright (c) 2016 Tino Reichardt
+ * All rights reserved.
+ *
+ * You can contact the author at:
+ * - zstdmt source repository: https://github.com/mcmilk/zstdmt
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef THREADING_H_938743
+#define THREADING_H_938743
+
+/**** skipping file: debug.h ****/
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+#if defined(ZSTD_MULTITHREAD) && defined(_WIN32)
+
+/**
+ * Windows minimalist Pthread Wrapper, based on :
+ * http://www.cse.wustl.edu/~schmidt/win32-cv-1.html
+ */
+#ifdef WINVER
+#  undef WINVER
+#endif
+#define WINVER       0x0600
+
+#ifdef _WIN32_WINNT
+#  undef _WIN32_WINNT
+#endif
+#define _WIN32_WINNT 0x0600
+
+#ifndef WIN32_LEAN_AND_MEAN
+#  define WIN32_LEAN_AND_MEAN
+#endif
+
+#undef ERROR   /* reported already defined on VS 2015 (Rich Geldreich) */
+#include <windows.h>
+#undef ERROR
+#define ERROR(name) ZSTD_ERROR(name)
+
+
+/* mutex */
+#define ZSTD_pthread_mutex_t           CRITICAL_SECTION
+#define ZSTD_pthread_mutex_init(a, b)  ((void)(b), InitializeCriticalSection((a)), 0)
+#define ZSTD_pthread_mutex_destroy(a)  DeleteCriticalSection((a))
+#define ZSTD_pthread_mutex_lock(a)     EnterCriticalSection((a))
+#define ZSTD_pthread_mutex_unlock(a)   LeaveCriticalSection((a))
+
+/* condition variable */
+#define ZSTD_pthread_cond_t             CONDITION_VARIABLE
+#define ZSTD_pthread_cond_init(a, b)    ((void)(b), InitializeConditionVariable((a)), 0)
+#define ZSTD_pthread_cond_destroy(a)    ((void)(a))
+#define ZSTD_pthread_cond_wait(a, b)    SleepConditionVariableCS((a), (b), INFINITE)
+#define ZSTD_pthread_cond_signal(a)     WakeConditionVariable((a))
+#define ZSTD_pthread_cond_broadcast(a)  WakeAllConditionVariable((a))
+
+/* ZSTD_pthread_create() and ZSTD_pthread_join() */
+typedef struct {
+    HANDLE handle;
+    void* (*start_routine)(void*);
+    void* arg;
+} ZSTD_pthread_t;
+
+int ZSTD_pthread_create(ZSTD_pthread_t* thread, const void* unused,
+                   void* (*start_routine) (void*), void* arg);
+
+int ZSTD_pthread_join(ZSTD_pthread_t thread, void** value_ptr);
+
+/**
+ * add here more wrappers as required
+ */
+
+
+#elif defined(ZSTD_MULTITHREAD)   /* posix assumed ; need a better detection method */
+/* ===   POSIX Systems   === */
+#  include <pthread.h>
+
+#if DEBUGLEVEL < 1
+
+#define ZSTD_pthread_mutex_t            pthread_mutex_t
+#define ZSTD_pthread_mutex_init(a, b)
pthread_mutex_init((a), (b)) +#define ZSTD_pthread_mutex_destroy(a) pthread_mutex_destroy((a)) +#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock((a)) +#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock((a)) + +#define ZSTD_pthread_cond_t pthread_cond_t +#define ZSTD_pthread_cond_init(a, b) pthread_cond_init((a), (b)) +#define ZSTD_pthread_cond_destroy(a) pthread_cond_destroy((a)) +#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait((a), (b)) +#define ZSTD_pthread_cond_signal(a) pthread_cond_signal((a)) +#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast((a)) + +#define ZSTD_pthread_t pthread_t +#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) + +#else /* DEBUGLEVEL >= 1 */ + +/* Debug implementation of threading. + * In this implementation we use pointers for mutexes and condition variables. + * This way, if we forget to init/destroy them the program will crash or ASAN + * will report leaks. + */ + +#define ZSTD_pthread_mutex_t pthread_mutex_t* +int ZSTD_pthread_mutex_init(ZSTD_pthread_mutex_t* mutex, pthread_mutexattr_t const* attr); +int ZSTD_pthread_mutex_destroy(ZSTD_pthread_mutex_t* mutex); +#define ZSTD_pthread_mutex_lock(a) pthread_mutex_lock(*(a)) +#define ZSTD_pthread_mutex_unlock(a) pthread_mutex_unlock(*(a)) + +#define ZSTD_pthread_cond_t pthread_cond_t* +int ZSTD_pthread_cond_init(ZSTD_pthread_cond_t* cond, pthread_condattr_t const* attr); +int ZSTD_pthread_cond_destroy(ZSTD_pthread_cond_t* cond); +#define ZSTD_pthread_cond_wait(a, b) pthread_cond_wait(*(a), *(b)) +#define ZSTD_pthread_cond_signal(a) pthread_cond_signal(*(a)) +#define ZSTD_pthread_cond_broadcast(a) pthread_cond_broadcast(*(a)) + +#define ZSTD_pthread_t pthread_t +#define ZSTD_pthread_create(a, b, c, d) pthread_create((a), (b), (c), (d)) +#define ZSTD_pthread_join(a, b) pthread_join((a),(b)) + +#endif + +#else /* ZSTD_MULTITHREAD not defined */ +/* No multithreading support */ + +typedef int ZSTD_pthread_mutex_t; +#define ZSTD_pthread_mutex_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_mutex_destroy(a) ((void)(a)) +#define ZSTD_pthread_mutex_lock(a) ((void)(a)) +#define ZSTD_pthread_mutex_unlock(a) ((void)(a)) + +typedef int ZSTD_pthread_cond_t; +#define ZSTD_pthread_cond_init(a, b) ((void)(a), (void)(b), 0) +#define ZSTD_pthread_cond_destroy(a) ((void)(a)) +#define ZSTD_pthread_cond_wait(a, b) ((void)(a), (void)(b)) +#define ZSTD_pthread_cond_signal(a) ((void)(a)) +#define ZSTD_pthread_cond_broadcast(a) ((void)(a)) + +/* do not use ZSTD_pthread_t */ + +#endif /* ZSTD_MULTITHREAD */ + +#if defined (__cplusplus) +} +#endif + +#endif /* THREADING_H_938743 */ +/**** ended inlining threading.h ****/ + +/* A job is a function and an opaque argument */ +typedef struct POOL_job_s { + POOL_function function; + void *opaque; +} POOL_job; + +struct POOL_ctx_s { + ZSTD_customMem customMem; + /* Keep track of the threads */ + ZSTD_pthread_t* threads; + size_t threadCapacity; + size_t threadLimit; + + /* The queue is a circular buffer */ + POOL_job *queue; + size_t queueHead; + size_t queueTail; + size_t queueSize; + + /* The number of threads working on jobs */ + size_t numThreadsBusy; + /* Indicates if the queue is empty */ + int queueEmpty; + + /* The mutex protects the queue */ + ZSTD_pthread_mutex_t queueMutex; + /* Condition variable for pushers to wait on when the queue is full */ + ZSTD_pthread_cond_t queuePushCond; + /* Condition variables for poppers to wait on when the queue is empty */ + 
ZSTD_pthread_cond_t queuePopCond; + /* Indicates if the queue is shutting down */ + int shutdown; +}; + +/* POOL_thread() : + * Work thread for the thread pool. + * Waits for jobs and executes them. + * @returns : NULL on failure else non-null. + */ +static void* POOL_thread(void* opaque) { + POOL_ctx* const ctx = (POOL_ctx*)opaque; + if (!ctx) { return NULL; } + for (;;) { + /* Lock the mutex and wait for a non-empty queue or until shutdown */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + + while ( ctx->queueEmpty + || (ctx->numThreadsBusy >= ctx->threadLimit) ) { + if (ctx->shutdown) { + /* even if !queueEmpty, (possible if numThreadsBusy >= threadLimit), + * a few threads will be shutdown while !queueEmpty, + * but enough threads will remain active to finish the queue */ + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return opaque; + } + ZSTD_pthread_cond_wait(&ctx->queuePopCond, &ctx->queueMutex); + } + /* Pop a job off the queue */ + { POOL_job const job = ctx->queue[ctx->queueHead]; + ctx->queueHead = (ctx->queueHead + 1) % ctx->queueSize; + ctx->numThreadsBusy++; + ctx->queueEmpty = ctx->queueHead == ctx->queueTail; + /* Unlock the mutex, signal a pusher, and run the job */ + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + + job.function(job.opaque); + + /* If the intended queue size was 0, signal after finishing job */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->numThreadsBusy--; + if (ctx->queueSize == 1) { + ZSTD_pthread_cond_signal(&ctx->queuePushCond); + } + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + } + } /* for (;;) */ + assert(0); /* Unreachable */ +} + +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, + ZSTD_customMem customMem) { + POOL_ctx* ctx; + /* Check parameters */ + if (!numThreads) { return NULL; } + /* Allocate the context and zero initialize */ + ctx = (POOL_ctx*)ZSTD_calloc(sizeof(POOL_ctx), customMem); + if (!ctx) { return NULL; } + /* Initialize the job queue. + * It needs one extra space since one space is wasted to differentiate + * empty and full queues. + */ + ctx->queueSize = queueSize + 1; + ctx->queue = (POOL_job*)ZSTD_malloc(ctx->queueSize * sizeof(POOL_job), customMem); + ctx->queueHead = 0; + ctx->queueTail = 0; + ctx->numThreadsBusy = 0; + ctx->queueEmpty = 1; + { + int error = 0; + error |= ZSTD_pthread_mutex_init(&ctx->queueMutex, NULL); + error |= ZSTD_pthread_cond_init(&ctx->queuePushCond, NULL); + error |= ZSTD_pthread_cond_init(&ctx->queuePopCond, NULL); + if (error) { POOL_free(ctx); return NULL; } + } + ctx->shutdown = 0; + /* Allocate space for the thread handles */ + ctx->threads = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), customMem); + ctx->threadCapacity = 0; + ctx->customMem = customMem; + /* Check for errors */ + if (!ctx->threads || !ctx->queue) { POOL_free(ctx); return NULL; } + /* Initialize the threads */ + { size_t i; + for (i = 0; i < numThreads; ++i) { + if (ZSTD_pthread_create(&ctx->threads[i], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = i; + POOL_free(ctx); + return NULL; + } } + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; + } + return ctx; +} + +/*! POOL_join() : + Shutdown the queue, wake any sleeping threads, and join all of the threads. 
+*/ +static void POOL_join(POOL_ctx* ctx) { + /* Shut down the queue */ + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + ctx->shutdown = 1; + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + /* Wake up sleeping threads */ + ZSTD_pthread_cond_broadcast(&ctx->queuePushCond); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + /* Join all of the threads */ + { size_t i; + for (i = 0; i < ctx->threadCapacity; ++i) { + ZSTD_pthread_join(ctx->threads[i], NULL); /* note : could fail */ + } } +} + +void POOL_free(POOL_ctx *ctx) { + if (!ctx) { return; } + POOL_join(ctx); + ZSTD_pthread_mutex_destroy(&ctx->queueMutex); + ZSTD_pthread_cond_destroy(&ctx->queuePushCond); + ZSTD_pthread_cond_destroy(&ctx->queuePopCond); + ZSTD_free(ctx->queue, ctx->customMem); + ZSTD_free(ctx->threads, ctx->customMem); + ZSTD_free(ctx, ctx->customMem); +} + + + +size_t POOL_sizeof(POOL_ctx *ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + return sizeof(*ctx) + + ctx->queueSize * sizeof(POOL_job) + + ctx->threadCapacity * sizeof(ZSTD_pthread_t); +} + + +/* @return : 0 on success, 1 on error */ +static int POOL_resize_internal(POOL_ctx* ctx, size_t numThreads) +{ + if (numThreads <= ctx->threadCapacity) { + if (!numThreads) return 1; + ctx->threadLimit = numThreads; + return 0; + } + /* numThreads > threadCapacity */ + { ZSTD_pthread_t* const threadPool = (ZSTD_pthread_t*)ZSTD_malloc(numThreads * sizeof(ZSTD_pthread_t), ctx->customMem); + if (!threadPool) return 1; + /* replace existing thread pool */ + memcpy(threadPool, ctx->threads, ctx->threadCapacity * sizeof(*threadPool)); + ZSTD_free(ctx->threads, ctx->customMem); + ctx->threads = threadPool; + /* Initialize additional threads */ + { size_t threadId; + for (threadId = ctx->threadCapacity; threadId < numThreads; ++threadId) { + if (ZSTD_pthread_create(&threadPool[threadId], NULL, &POOL_thread, ctx)) { + ctx->threadCapacity = threadId; + return 1; + } } + } } + /* successfully expanded */ + ctx->threadCapacity = numThreads; + ctx->threadLimit = numThreads; + return 0; +} + +/* @return : 0 on success, 1 on error */ +int POOL_resize(POOL_ctx* ctx, size_t numThreads) +{ + int result; + if (ctx==NULL) return 1; + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + result = POOL_resize_internal(ctx, numThreads); + ZSTD_pthread_cond_broadcast(&ctx->queuePopCond); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return result; +} + +/** + * Returns 1 if the queue is full and 0 otherwise. + * + * When queueSize is 1 (pool was created with an intended queueSize of 0), + * then a queue is empty if there is a thread free _and_ no job is waiting. 
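+ *
+ * Worked example (illustrative numbers) : a pool created with an intended
+ * queueSize of 3 allocates ctx->queueSize == 4 ring slots; with
+ * queueTail == 1 and queueHead == 2, (1 + 1) % 4 == 2 == queueHead, so the
+ * queue reports full -- one slot is deliberately left unused to distinguish
+ * "full" from "empty".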
+ */ +static int isQueueFull(POOL_ctx const* ctx) { + if (ctx->queueSize > 1) { + return ctx->queueHead == ((ctx->queueTail + 1) % ctx->queueSize); + } else { + return (ctx->numThreadsBusy == ctx->threadLimit) || + !ctx->queueEmpty; + } +} + + +static void POOL_add_internal(POOL_ctx* ctx, POOL_function function, void *opaque) +{ + POOL_job const job = {function, opaque}; + assert(ctx != NULL); + if (ctx->shutdown) return; + + ctx->queueEmpty = 0; + ctx->queue[ctx->queueTail] = job; + ctx->queueTail = (ctx->queueTail + 1) % ctx->queueSize; + ZSTD_pthread_cond_signal(&ctx->queuePopCond); +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + /* Wait until there is space in the queue for the new job */ + while (isQueueFull(ctx) && (!ctx->shutdown)) { + ZSTD_pthread_cond_wait(&ctx->queuePushCond, &ctx->queueMutex); + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); +} + + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) +{ + assert(ctx != NULL); + ZSTD_pthread_mutex_lock(&ctx->queueMutex); + if (isQueueFull(ctx)) { + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 0; + } + POOL_add_internal(ctx, function, opaque); + ZSTD_pthread_mutex_unlock(&ctx->queueMutex); + return 1; +} + + +#else /* ZSTD_MULTITHREAD not defined */ + +/* ========================== */ +/* No multi-threading support */ +/* ========================== */ + + +/* We don't need any data, but if it is empty, malloc() might return NULL. */ +struct POOL_ctx_s { + int dummy; +}; +static POOL_ctx g_ctx; + +POOL_ctx* POOL_create(size_t numThreads, size_t queueSize) { + return POOL_create_advanced(numThreads, queueSize, ZSTD_defaultCMem); +} + +POOL_ctx* POOL_create_advanced(size_t numThreads, size_t queueSize, ZSTD_customMem customMem) { + (void)numThreads; + (void)queueSize; + (void)customMem; + return &g_ctx; +} + +void POOL_free(POOL_ctx* ctx) { + assert(!ctx || ctx == &g_ctx); + (void)ctx; +} + +int POOL_resize(POOL_ctx* ctx, size_t numThreads) { + (void)ctx; (void)numThreads; + return 0; +} + +void POOL_add(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); +} + +int POOL_tryAdd(POOL_ctx* ctx, POOL_function function, void* opaque) { + (void)ctx; + function(opaque); + return 1; +} + +size_t POOL_sizeof(POOL_ctx* ctx) { + if (ctx==NULL) return 0; /* supports sizeof NULL */ + assert(ctx == &g_ctx); + return sizeof(*ctx); +} + +#endif /* ZSTD_MULTITHREAD */ +/**** ended inlining common/pool.c ****/ +/**** start inlining common/zstd_common.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */
+
+
+
+/*-*************************************
+*  Dependencies
+***************************************/
+#include <stdlib.h>      /* malloc, calloc, free */
+#include <string.h>      /* memset */
+/**** skipping file: error_private.h ****/
+/**** skipping file: zstd_internal.h ****/
+
+
+/*-****************************************
+*  Version
+******************************************/
+unsigned ZSTD_versionNumber(void) { return ZSTD_VERSION_NUMBER; }
+
+const char* ZSTD_versionString(void) { return ZSTD_VERSION_STRING; }
+
+
+/*-****************************************
+*  ZSTD Error Management
+******************************************/
+#undef ZSTD_isError   /* defined within zstd_internal.h */
+/*! ZSTD_isError() :
+ *  tells if a return value is an error code
+ *  symbol is required for external callers */
+unsigned ZSTD_isError(size_t code) { return ERR_isError(code); }
+
+/*! ZSTD_getErrorName() :
+ *  provides error code string from function result (useful for debugging) */
+const char* ZSTD_getErrorName(size_t code) { return ERR_getErrorName(code); }
+
+/*! ZSTD_getError() :
+ *  convert a `size_t` function result into a proper ZSTD_errorCode enum */
+ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+
+/*! ZSTD_getErrorString() :
+ *  provides error code string from enum */
+const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+
+
+
+/*=**************************************************************
+*  Custom allocator
+****************************************************************/
+void* ZSTD_malloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc)
+        return customMem.customAlloc(customMem.opaque, size);
+    return malloc(size);
+}
+
+void* ZSTD_calloc(size_t size, ZSTD_customMem customMem)
+{
+    if (customMem.customAlloc) {
+        /* calloc implemented as malloc+memset;
+         * not as efficient as calloc, but next best guess for custom malloc */
+        void* const ptr = customMem.customAlloc(customMem.opaque, size);
+        memset(ptr, 0, size);
+        return ptr;
+    }
+    return calloc(1, size);
+}
+
+void ZSTD_free(void* ptr, ZSTD_customMem customMem)
+{
+    if (ptr!=NULL) {
+        if (customMem.customFree)
+            customMem.customFree(customMem.opaque, ptr);
+        else
+            free(ptr);
+    }
+}
+/**** ended inlining common/zstd_common.c ****/
+
+/**** start inlining compress/fse_compress.c ****/
+/* ******************************************************************
+ * FSE : Finite State Entropy encoder
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <stdlib.h>     /* malloc, free, qsort */
+#include <string.h>     /* memcpy, memset */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/debug.h ****/
+/**** start inlining hist.h ****/
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+#include <stddef.h>   /* size_t */
+
+
+/* --- simple histogram functions --- */
+
+/*! HIST_count():
+ *  Provides the precise count of each byte within a table 'count'.
+ * 'count' is a table of unsigned int, of minimum size (*maxSymbolValuePtr+1).
+ *  Updates *maxSymbolValuePtr with actual largest symbol value detected.
+ * @return : count of the most frequent symbol (which isn't identified).
+ *           or an error code, which can be tested using HIST_isError().
+ *           note : if return == srcSize, there is only one symbol.
+ */
+size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr,
+                  const void* src, size_t srcSize);
+
+unsigned HIST_isError(size_t code);  /**< tells if a return value is an error code */
+
+
+/* --- advanced histogram functions --- */
+
+#define HIST_WKSP_SIZE_U32 1024
+#define HIST_WKSP_SIZE    (HIST_WKSP_SIZE_U32 * sizeof(unsigned))
+/** HIST_count_wksp() :
+ *  Same as HIST_count(), but using an externally provided scratch buffer.
+ *  Benefit is this function will use very little stack space.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                       const void* src, size_t srcSize,
+                       void* workSpace, size_t workSpaceSize);
+
+/** HIST_countFast() :
+ *  same as HIST_count(), but blindly trusts that all byte values within src are <= *maxSymbolValuePtr.
+ *  This function is unsafe, and will segfault if any value within `src` is `> *maxSymbolValuePtr`
+ */
+size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr,
+                      const void* src, size_t srcSize);
+
+/** HIST_countFast_wksp() :
+ *  Same as HIST_countFast(), but using an externally provided scratch buffer.
+ * `workSpace` is a writable buffer which must be 4-bytes aligned,
+ * `workSpaceSize` must be >= HIST_WKSP_SIZE
+ */
+size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize,
+                           void* workSpace, size_t workSpaceSize);
+
+/*! HIST_count_simple() :
+ *  Same as HIST_countFast(), this function is unsafe,
+ *  and will segfault if any value within `src` is `> *maxSymbolValuePtr`.
+ *  It is also a bit slower for large inputs.
+ *  However, it does not need any additional memory (not even on stack).
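+ *  Illustrative example : for src == "aabbbc" and srcSize == 6, it fills
+ *  count['a']==2, count['b']==3, count['c']==1, and returns 3
+ *  (the count of 'b', the most frequent byte).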
+ * @return : count of the most frequent symbol.
+ *  Note this function doesn't produce any error (i.e. it must succeed).
+ */
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize);
+/**** ended inlining hist.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+#define FSE_STATIC_LINKING_ONLY
+/**** skipping file: ../common/fse.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define FSE_isError ERR_isError
+
+
+/* **************************************************************
+*  Templates
+****************************************************************/
+/*
+  designed to be included
+  for type-specific functions (template emulation in C)
+  Objective is to write these functions only once, for improved maintenance
+*/
+
+/* safety checks */
+#ifndef FSE_FUNCTION_EXTENSION
+# error "FSE_FUNCTION_EXTENSION must be defined"
+#endif
+#ifndef FSE_FUNCTION_TYPE
+# error "FSE_FUNCTION_TYPE must be defined"
+#endif
+
+/* Function names */
+#define FSE_CAT(X,Y) X##Y
+#define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y)
+#define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y)
+
+
+/* Function templates */
+
+/* FSE_buildCTable_wksp() :
+ * Same as FSE_buildCTable(), but using an externally allocated scratch buffer (`workSpace`).
+ * wkspSize should be sized to handle worst case situation, which is `1<<max_tableLog * sizeof(FSE_FUNCTION_TYPE)`
+ * workSpace must also be properly aligned with FSE_FUNCTION_TYPE requirements
+ */
+size_t FSE_buildCTable_wksp(FSE_CTable* ct,
+                      const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog,
+                            void* workSpace, size_t wkspSize)
+{
+    U32 const tableSize = 1 << tableLog;
+    U32 const tableMask = tableSize - 1;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableLog ? tableSize>>1 : 1) ;
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    U32 const step = FSE_TABLESTEP(tableSize);
+    U32 cumul[FSE_MAX_SYMBOL_VALUE+2];
+
+    FSE_FUNCTION_TYPE* const tableSymbol = (FSE_FUNCTION_TYPE*)workSpace;
+    U32 highThreshold = tableSize-1;
+
+    /* CTable header */
+    if (((size_t)1 << tableLog) * sizeof(FSE_FUNCTION_TYPE) > wkspSize) return ERROR(tableLog_tooLarge);
+    tableU16[-2] = (U16) tableLog;
+    tableU16[-1] = (U16) maxSymbolValue;
+    assert(tableLog < 16);   /* required for threshold strategy to work */
+
+    /* For explanations on how to distribute symbol values over the table :
+     * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */
+
+    #ifdef __clang_analyzer__
+    memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize);   /* useless initialization, just to keep scan-build happy */
+    #endif
+
+    /* symbol start positions */
+    {   U32 u;
+        cumul[0] = 0;
+        for (u=1; u <= maxSymbolValue+1; u++) {
+            if (normalizedCounter[u-1]==-1) {  /* Low proba symbol */
+                cumul[u] = cumul[u-1] + 1;
+                tableSymbol[highThreshold--] = (FSE_FUNCTION_TYPE)(u-1);
+            } else {
+                cumul[u] = cumul[u-1] + normalizedCounter[u-1];
+        }   }
+        cumul[maxSymbolValue+1] = tableSize+1;
+    }
+
+    /* Spread symbols */
+    {   U32 position = 0;
+        U32 symbol;
+        for (symbol=0; symbol<=maxSymbolValue; symbol++) {
+            int nbOccurrences;
+            int const freq = normalizedCounter[symbol];
+            for (nbOccurrences=0; nbOccurrences<freq; nbOccurrences++) {
+                tableSymbol[position] = (FSE_FUNCTION_TYPE)symbol;
+                position = (position + step) & tableMask;
+                while (position > highThreshold)
+                    position = (position + step) & tableMask;   /* Low proba area */
+        }   }
+
+        assert(position==0);  /* Must have initialized all positions */
+    }
+
+    /* Build table */
+    {   U32 u; for (u=0; u<tableSize; u++) {
+        FSE_FUNCTION_TYPE s = tableSymbol[u];   /* note : static analyzer may not understand tableSymbol is properly initialized */
+        tableU16[cumul[s]++] = (U16) (tableSize+u);   /* TableU16 : sorted by symbol order ; gives next state value */
+    }   }
+
+    /* Build Symbol Transformation Table */
+    {   unsigned total = 0;
+        unsigned s;
+        for (s=0; s<=maxSymbolValue; s++) {
+            switch (normalizedCounter[s])
+            {
+            case  0:
+                /* filling nonetheless, for compatibility with FSE_getMaxNbBits() */
+                symbolTT[s].deltaNbBits = ((tableLog+1) << 16) - (1<<tableLog);
+                break;
+
+            case -1:
+            case  1:
+                symbolTT[s].deltaNbBits = (tableLog << 16) - (1<<tableLog);
+                symbolTT[s].deltaFindState = total - 1;
+                total ++;
+                break;
+            default :
+                {
+                U32 const maxBitsOut = tableLog - BIT_highbit32 (normalizedCounter[s]-1);
+                U32 const minStatePlus = normalizedCounter[s] << maxBitsOut;
+                symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus;
+                symbolTT[s].deltaFindState = total - normalizedCounter[s];
+                total +=  normalizedCounter[s];
+    }   }   }   }
+
+    return 0;
+}
+
+
+#ifndef FSE_COMMONDEFS_ONLY
+
+
+/*-**************************************************************
+*  FSE NCount encoding
+****************************************************************/
+size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog) >> 3) + 3;
+    return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND;  /* maxSymbolValue==0 ? 
use default */ +} + +static size_t +FSE_writeNCount_generic (void* header, size_t headerBufferSize, + const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, + unsigned writeIsSafe) +{ + BYTE* const ostart = (BYTE*) header; + BYTE* out = ostart; + BYTE* const oend = ostart + headerBufferSize; + int nbBits; + const int tableSize = 1 << tableLog; + int remaining; + int threshold; + U32 bitStream = 0; + int bitCount = 0; + unsigned symbol = 0; + unsigned const alphabetSize = maxSymbolValue + 1; + int previousIs0 = 0; + + /* Table Size */ + bitStream += (tableLog-FSE_MIN_TABLELOG) << bitCount; + bitCount += 4; + + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; + nbBits = tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { + unsigned start = symbol; + while ((symbol < alphabetSize) && !normalizedCounter[symbol]) symbol++; + if (symbol == alphabetSize) break; /* incorrect distribution */ + while (symbol >= start+24) { + start+=24; + bitStream += 0xFFFFU << bitCount; + if ((!writeIsSafe) && (out > oend-2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE) bitStream; + out[1] = (BYTE)(bitStream>>8); + out+=2; + bitStream>>=16; + } + while (symbol >= start+3) { + start+=3; + bitStream += 3 << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; + bitCount += 2; + if (bitCount>16) { + if ((!writeIsSafe) && (out > oend - 2)) + return ERROR(dstSize_tooSmall); /* Buffer overflow */ + out[0] = (BYTE)bitStream; + out[1] = (BYTE)(bitStream>>8); + out += 2; + bitStream >>= 16; + bitCount -= 16; + } } + { int count = normalizedCounter[symbol++]; + int const max = (2*threshold-1) - remaining; + remaining -= count < 0 ? -count : count; + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) 
[threshold+max 2*threshold[ */
+            bitStream += count << bitCount;
+            bitCount  += nbBits;
+            bitCount  -= (count<max);
+            previousIs0  = (count==1);
+            if (remaining<1) return ERROR(GENERIC);
+            while (remaining<threshold) { nbBits--; threshold>>=1; }
+        }
+        if (bitCount>16) {
+            if ((!writeIsSafe) && (out > oend - 2))
+                return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+            out[0] = (BYTE)bitStream;
+            out[1] = (BYTE)(bitStream>>8);
+            out += 2;
+            bitStream >>= 16;
+            bitCount -= 16;
+    }   }
+
+    if (remaining != 1)
+        return ERROR(GENERIC);  /* incorrect normalized distribution */
+    assert(symbol <= alphabetSize);
+
+    /* flush remaining bitStream */
+    if ((!writeIsSafe) && (out > oend - 2))
+        return ERROR(dstSize_tooSmall);   /* Buffer overflow */
+    out[0] = (BYTE)bitStream;
+    out[1] = (BYTE)(bitStream>>8);
+    out+= (bitCount+7) /8;
+
+    return (out-ostart);
+}
+
+
+size_t FSE_writeNCount (void* buffer, size_t bufferSize,
+                  const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog)
+{
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported */
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported */
+
+    if (bufferSize < FSE_NCountWriteBound(maxSymbolValue, tableLog))
+        return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 0);
+
+    return FSE_writeNCount_generic(buffer, bufferSize, normalizedCounter, maxSymbolValue, tableLog, 1 /* write in buffer is safe */);
+}
+
+
+/*-**************************************************************
+*  FSE Compression Code
+****************************************************************/
+
+FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog)
+{
+    size_t size;
+    if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX;
+    size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32);
+    return (FSE_CTable*)malloc(size);
+}
+
+void FSE_freeCTable (FSE_CTable* ct) { free(ct); }
+
+/* provides the minimum logSize to safely represent a distribution */
+static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue)
+{
+    U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1;
+    U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2;
+    U32 minBits = minBitsSrc < minBitsSymbols ? minBitsSrc : minBitsSymbols;
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    return minBits;
+}
+
+unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus)
+{
+    U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus;
+    U32 tableLog = maxTableLog;
+    U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue);
+    assert(srcSize > 1); /* Not supported, RLE should be used instead */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (maxBitsSrc < tableLog) tableLog = maxBitsSrc;   /* Accuracy can be reduced */
+    if (minBits > tableLog) tableLog = minBits;   /* Need a minimum to safely represent all symbol values */
+    if (tableLog < FSE_MIN_TABLELOG) tableLog = FSE_MIN_TABLELOG;
+    if (tableLog > FSE_MAX_TABLELOG) tableLog = FSE_MAX_TABLELOG;
+    return tableLog;
+}
+
+unsigned FSE_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 2);
+}
+
+
+/* Secondary normalization method.
+   To be used when primary method fails. */
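+/* Note (editorial, illustrative characterization) : the primary method is
+   considered to have failed when its accumulated rounding debt would cost
+   the dominant symbol more than half its weight, i.e. the
+   `-stillToDistribute >= (normalizedCounter[largest] >> 1)` test inside
+   FSE_normalizeCount() below; a typical trigger is a very skewed histogram
+   where one symbol holds almost all occurrences. */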
+
+static size_t FSE_normalizeM2(short* norm, U32 tableLog, const unsigned* count, size_t total, U32 maxSymbolValue)
+{
+    short const NOT_YET_ASSIGNED = -2;
+    U32 s;
+    U32 distributed = 0;
+    U32 ToDistribute;
+
+    /* Init */
+    U32 const lowThreshold = (U32)(total >> tableLog);
+    U32 lowOne = (U32)((total * 3) >> (tableLog + 1));
+
+    for (s=0; s<=maxSymbolValue; s++) {
+        if (count[s] == 0) {
+            norm[s]=0;
+            continue;
+        }
+        if (count[s] <= lowThreshold) {
+            norm[s] = -1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+        if (count[s] <= lowOne) {
+            norm[s] = 1;
+            distributed++;
+            total -= count[s];
+            continue;
+        }
+
+        norm[s]=NOT_YET_ASSIGNED;
+    }
+    ToDistribute = (1 << tableLog) - distributed;
+
+    if (ToDistribute == 0)
+        return 0;
+
+    if ((total / ToDistribute) > lowOne) {
+        /* risk of rounding to zero */
+        lowOne = (U32)((total * 3) / (ToDistribute * 2));
+        for (s=0; s<=maxSymbolValue; s++) {
+            if ((norm[s] == NOT_YET_ASSIGNED) && (count[s] <= lowOne)) {
+                norm[s] = 1;
+                distributed++;
+                total -= count[s];
+                continue;
+        }   }
+        ToDistribute = (1 << tableLog) - distributed;
+    }
+
+    if (distributed == maxSymbolValue+1) {
+        /* all values are pretty poor;
+           probably incompressible data (should have already been detected);
+           find max, then give all remaining points to max */
+        U32 maxV = 0, maxC = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > maxC) { maxV=s; maxC=count[s]; }
+        norm[maxV] += (short)ToDistribute;
+        return 0;
+    }
+
+    if (total == 0) {
+        /* all of the symbols were low enough for the lowOne or lowThreshold */
+        for (s=0; ToDistribute > 0; s = (s+1)%(maxSymbolValue+1))
+            if (norm[s] > 0) { ToDistribute--; norm[s]++; }
+        return 0;
+    }
+
+    {   U64 const vStepLog = 62 - tableLog;
+        U64 const mid = (1ULL << (vStepLog-1)) - 1;
+        U64 const rStep = ((((U64)1<<vStepLog) * ToDistribute) + mid) / total;   /* scale on remaining */
+        U64 tmpTotal = mid;
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (norm[s]==NOT_YET_ASSIGNED) {
+                U64 const end = tmpTotal + (count[s] * rStep);
+                U32 const sStart = (U32)(tmpTotal >> vStepLog);
+                U32 const sEnd = (U32)(end >> vStepLog);
+                U32 const weight = sEnd - sStart;
+                if (weight < 1)
+                    return ERROR(GENERIC);
+                norm[s] = (short)weight;
+                tmpTotal = end;
+    }   }   }
+
+    return 0;
+}
+
+
+size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog,
+                           const unsigned* count, size_t total,
+                           unsigned maxSymbolValue)
+{
+    /* Sanity checks */
+    if (tableLog==0) tableLog = FSE_DEFAULT_TABLELOG;
+    if (tableLog < FSE_MIN_TABLELOG) return ERROR(GENERIC);   /* Unsupported size */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);   /* Unsupported size */
+    if (tableLog < FSE_minTableLog(total, maxSymbolValue)) return ERROR(GENERIC);   /* Too small tableLog, compression potentially impossible */
+
+    {   static U32 const rtbTable[] = {     0, 473195, 504333, 520860, 550000, 700000, 750000, 830000 };
+        U64 const scale = 62 - tableLog;
+        U64 const step = ((U64)1<<62) / total;   /* <== here, one division ! */
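+        /* Illustrative arithmetic (hypothetical numbers) : with total==1000
+         * and tableLog==11, scale==51 and step==(1<<62)/1000, so below
+         * proba ~= (count[s]*step) >> scale == count[s] * 2048 / 1000 ;
+         * the whole histogram is rescaled to sum to 1<<tableLog using only
+         * this one division plus shifts. */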
+        U64 const vStep = 1ULL<<(scale-20);
+        int stillToDistribute = 1<<tableLog;
+        unsigned s;
+        unsigned largest=0;
+        short largestP=0;
+        U32 lowThreshold = (U32)(total >> tableLog);
+
+        for (s=0; s<=maxSymbolValue; s++) {
+            if (count[s] == total) return 0;   /* rle special case */
+            if (count[s] == 0) { normalizedCounter[s]=0; continue; }
+            if (count[s] <= lowThreshold) {
+                normalizedCounter[s] = -1;
+                stillToDistribute--;
+            } else {
+                short proba = (short)((count[s]*step) >> scale);
+                if (proba<8) {
+                    U64 restToBeat = vStep * rtbTable[proba];
+                    proba += (count[s]*step) - ((U64)proba<<scale) > restToBeat;
+                }
+                if (proba > largestP) { largestP=proba; largest=s; }
+                normalizedCounter[s] = proba;
+                stillToDistribute -= proba;
+        }   }
+        if (-stillToDistribute >= (normalizedCounter[largest] >> 1)) {
+            /* corner case, need another normalization method */
+            size_t const errorCode = FSE_normalizeM2(normalizedCounter, tableLog, count, total, maxSymbolValue);
+            if (FSE_isError(errorCode)) return errorCode;
+        }
+        else normalizedCounter[largest] += (short)stillToDistribute;
+    }
+
+#if 0
+    {   /* Print Table (debug) */
+        U32 s;
+        U32 nTotal = 0;
+        for (s=0; s<=maxSymbolValue; s++)
+            RAWLOG(2, "%3i: %4i \n", s, normalizedCounter[s]);
+        for (s=0; s<=maxSymbolValue; s++)
+            nTotal += abs(normalizedCounter[s]);
+        if (nTotal != (1U<<tableLog))
+            RAWLOG(2, "Warning !!! Total == %u != %u !!!", nTotal, 1U<<tableLog);
+        getchar();
+    }
+#endif
+
+    return tableLog;
+}
+
+
+/* fake FSE_CTable, for raw (uncompressed) input */
+size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits)
+{
+    const unsigned tableSize = 1 << nbBits;
+    const unsigned tableMask = tableSize - 1;
+    const unsigned maxSymbolValue = tableMask;
+    void* const ptr = ct;
+    U16* const tableU16 = ( (U16*) ptr) + 2;
+    void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1);   /* assumption : tableLog >= 1 */
+    FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT);
+    unsigned s;
+
+    /* Sanity checks */
+    if (nbBits < 1) return ERROR(GENERIC);             /* min size */
+
+    /* header */
+    tableU16[-2] = (U16) nbBits;
+    tableU16[-1] = (U16) maxSymbolValue;
+
+    /* Build table */
+    for (s=0; s<tableSize; s++)
+        tableU16[s] = (U16)(tableSize + s);
+
+    /* Build Symbol Transformation Table */
+    {   const U32 deltaNbBits = (nbBits << 16) - (1 << nbBits);
+        for (s=0; s<=maxSymbolValue; s++) {
+            symbolTT[s].deltaNbBits = deltaNbBits;
+            symbolTT[s].deltaFindState = s-1;
+    }   }
+
+    return 0;
+}
+
+/* fake FSE_CTable, for rle input (always same symbol) */
+size_t FSE_buildCTable_rle (FSE_CTable* ct, BYTE symbolValue)
+{
+    void* ptr = ct;
+    U16* tableU16 = ( (U16*) ptr) + 2;
+    void* FSCTptr = (U32*)ptr + 2;
+    FSE_symbolCompressionTransform* symbolTT = (FSE_symbolCompressionTransform*) FSCTptr;
+
+    /* header */
+    tableU16[-2] = (U16) 0;
+    tableU16[-1] = (U16) symbolValue;
+
+    /* Build table */
+    tableU16[0] = 0;
+    tableU16[1] = 0;   /* just in case */
+
+    /* Build Symbol Transformation Table */
+    symbolTT[symbolValue].deltaNbBits = 0;
+    symbolTT[symbolValue].deltaFindState = 0;
+
+    return 0;
+}
+
+
+static size_t FSE_compress_usingCTable_generic (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct, const unsigned fast)
+{
+    const BYTE* const istart = (const BYTE*) src;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* ip=iend;
+
+    BIT_CStream_t bitC;
+    FSE_CState_t CState1, CState2;
+
+    /* init */
+    if (srcSize <= 2) return 0;
+    { size_t const initError = BIT_initCStream(&bitC, dst, dstSize);
+      if (FSE_isError(initError)) return 0; /* not enough space available to write a bitstream */ }
+
+#define FSE_FLUSHBITS(s)  (fast ? BIT_flushBitsFast(s) : BIT_flushBits(s))
+
+    if (srcSize & 1) {
+        FSE_initCState2(&CState1, ct, *--ip);
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    } else {
+        FSE_initCState2(&CState2, ct, *--ip);
+        FSE_initCState2(&CState1, ct, *--ip);
+    }
+
+    /* join to mod 4 */
+    srcSize -= 2;
+    if ((sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) && (srcSize & 2)) {  /* test bit 2 */
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    /* 2 or 4 encoding per loop */
+    while ( ip>istart ) {
+
+        FSE_encodeSymbol(&bitC, &CState2, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 < FSE_MAX_TABLELOG*2+7 )   /* this test must be static */
+            FSE_FLUSHBITS(&bitC);
+
+        FSE_encodeSymbol(&bitC, &CState1, *--ip);
+
+        if (sizeof(bitC.bitContainer)*8 > FSE_MAX_TABLELOG*4+7 ) {  /* this test must be static */
+            FSE_encodeSymbol(&bitC, &CState2, *--ip);
+            FSE_encodeSymbol(&bitC, &CState1, *--ip);
+        }
+
+        FSE_FLUSHBITS(&bitC);
+    }
+
+    FSE_flushCState(&bitC, &CState2);
+    FSE_flushCState(&bitC, &CState1);
+    return BIT_closeCStream(&bitC);
+}
+
+size_t FSE_compress_usingCTable (void* dst, size_t dstSize,
+                           const void* src, size_t srcSize,
+                           const FSE_CTable* ct)
+{
+    unsigned const fast = (dstSize >= FSE_BLOCKBOUND(srcSize));
+
+    if (fast)
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 1);
+    else
+        return FSE_compress_usingCTable_generic(dst, dstSize, src, srcSize, ct, 0);
+}
+
+
+size_t FSE_compressBound(size_t size) { return FSE_COMPRESSBOUND(size); }
+
+/* FSE_compress_wksp() :
+ * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`).
+ * `wkspSize` size must be `(1<<tableLog)`.
+ */
+size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    unsigned count[FSE_MAX_SYMBOL_VALUE+1];
+    S16   norm[FSE_MAX_SYMBOL_VALUE+1];
+    FSE_CTable* CTable = (FSE_CTable*)workSpace;
+    size_t const CTableSize = FSE_CTABLE_SIZE_U32(tableLog, maxSymbolValue);
+    void* scratchBuffer = (void*)(CTable + CTableSize);
+    size_t const scratchBufferSize = wkspSize - (CTableSize * sizeof(FSE_CTable));
+
+    /* init conditions */
+    if (wkspSize < FSE_WKSP_SIZE_U32(tableLog, maxSymbolValue)) return ERROR(tableLog_tooLarge);
+    if (srcSize <= 1) return 0;  /* Not compressible */
+    if (!maxSymbolValue) maxSymbolValue = FSE_MAX_SYMBOL_VALUE;
+    if (!tableLog) tableLog = FSE_DEFAULT_TABLELOG;
+
+    /* Scan input and build symbol stats */
+    {   CHECK_V_F(maxCount, HIST_count_wksp(count, &maxSymbolValue, src, srcSize, scratchBuffer, scratchBufferSize) );
+        if (maxCount == srcSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;         /* each symbol present maximum once => not compressible */
+        if (maxCount < (srcSize >> 7)) return 0;   /* Heuristic : not compressible enough */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, srcSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, srcSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(nc_err, FSE_writeNCount(op, oend-op, norm, maxSymbolValue, tableLog) );
+        op += nc_err;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, scratchBufferSize) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, oend - op, src, srcSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    /* check compressibility */
+    if ( (size_t)(op-ostart) >= srcSize-1 ) return 0;
+
+    return op-ostart;
+}
+
+typedef struct {
+    FSE_CTable CTable_max[FSE_CTABLE_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE)];
+    BYTE scratchBuffer[1 << FSE_MAX_TABLELOG];
+} fseWkspMax_t;
+
+size_t FSE_compress2 (void* dst, size_t dstCapacity, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog)
+{
+    fseWkspMax_t scratchBuffer;
+    DEBUG_STATIC_ASSERT(sizeof(scratchBuffer) >= FSE_WKSP_SIZE_U32(FSE_MAX_TABLELOG, FSE_MAX_SYMBOL_VALUE));   /* compilation failures here means scratchBuffer is not large enough */
+    if (tableLog > FSE_MAX_TABLELOG) return ERROR(tableLog_tooLarge);
+    return FSE_compress_wksp(dst, dstCapacity, src, srcSize, maxSymbolValue, tableLog, &scratchBuffer, sizeof(scratchBuffer));
+}
+
+size_t FSE_compress (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    return FSE_compress2(dst, dstCapacity, src, srcSize, FSE_MAX_SYMBOL_VALUE, FSE_DEFAULT_TABLELOG);
+}
+
+
+#endif   /* FSE_COMMONDEFS_ONLY */
+/**** ended inlining compress/fse_compress.c ****/
+/**** start inlining compress/hist.c ****/
+/* ******************************************************************
+ * hist : Histogram functions
+ * part of Finite State Entropy project
+ * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc.
+ *
+ *  You can contact the author at :
+ *  - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy
+ *  - Public forum : https://groups.google.com/forum/#!forum/lz4c
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+****************************************************************** */
+
+/* --- dependencies --- */
+/**** skipping file: ../common/mem.h ****/
+/**** skipping file: ../common/debug.h ****/
+/**** skipping file: ../common/error_private.h ****/
+/**** skipping file: hist.h ****/
+
+
+/* --- Error management --- */
+unsigned HIST_isError(size_t code) { return ERR_isError(code); }
+
+/*-**************************************************************
+ *  Histogram functions
+ ****************************************************************/
+unsigned HIST_count_simple(unsigned* count, unsigned* maxSymbolValuePtr,
+                           const void* src, size_t srcSize)
+{
+    const BYTE* ip = (const BYTE*)src;
+    const BYTE* const end = ip + srcSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned largestCount=0;
+
+    memset(count, 0, (maxSymbolValue+1) * sizeof(*count));
+    if (srcSize==0) { *maxSymbolValuePtr = 0; return 0; }
+
+    while (ip<end) {
+        assert(*ip <= maxSymbolValue);
+        count[*ip++]++;
+    }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+
+    {   U32 s;
+        for (s=0; s<=maxSymbolValue; s++)
+            if (count[s] > largestCount) largestCount = count[s];
+    }
+
+    return largestCount;
+}
+
+typedef enum { trustInput, checkMaxSymbolValue } HIST_checkInput_e;
+
+/* HIST_count_parallel_wksp() :
+ * store histogram into 4 intermediate tables, recombined at the end.
+ * this design makes better use of OoO cpus,
+ * and is noticeably faster when some values are heavily repeated.
+ * But it needs some additional workspace for intermediate tables.
+ * `workSpace` size must be a table of size >= HIST_WKSP_SIZE_U32.
+ * @return : largest histogram frequency,
+ *           or an error code (notably when histogram would be larger than *maxSymbolValuePtr). */
+static size_t HIST_count_parallel_wksp(
+                                unsigned* count, unsigned* maxSymbolValuePtr,
+                                const void* source, size_t sourceSize,
+                                HIST_checkInput_e check,
+                                U32* const workSpace)
+{
+    const BYTE* ip = (const BYTE*)source;
+    const BYTE* const iend = ip+sourceSize;
+    unsigned maxSymbolValue = *maxSymbolValuePtr;
+    unsigned max=0;
+    U32* const Counting1 = workSpace;
+    U32* const Counting2 = Counting1 + 256;
+    U32* const Counting3 = Counting2 + 256;
+    U32* const Counting4 = Counting3 + 256;
+
+    memset(workSpace, 0, 4*256*sizeof(unsigned));
+
+    /* safety checks */
+    if (!sourceSize) {
+        memset(count, 0, maxSymbolValue + 1);
+        *maxSymbolValuePtr = 0;
+        return 0;
+    }
+    if (!maxSymbolValue) maxSymbolValue = 255;            /* 0 == default */
+
+    /* by stripes of 16 bytes */
+    {   U32 cached = MEM_read32(ip); ip += 4;
+        while (ip < iend-15) {
+            U32 c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+            c = cached; cached = MEM_read32(ip); ip += 4;
+            Counting1[(BYTE) c     ]++;
+            Counting2[(BYTE)(c>>8) ]++;
+            Counting3[(BYTE)(c>>16)]++;
+            Counting4[       c>>24 ]++;
+        }
+        ip-=4;
+    }
+
+    /* finish last symbols */
+    while (ip<iend) Counting1[*ip++]++;
+
+    if (check) {   /* verify stats will fit into destination table */
+        U32 s; for (s=255; s>maxSymbolValue; s--) {
+            Counting1[s] += Counting2[s] + Counting3[s] + Counting4[s];
+            if (Counting1[s]) return ERROR(maxSymbolValue_tooSmall);
+    }   }
+
+    {   U32 s;
+        if (maxSymbolValue > 255) maxSymbolValue = 255;
+        for (s=0; s<=maxSymbolValue; s++) {
+            count[s] = Counting1[s] + Counting2[s] + Counting3[s] + Counting4[s];
+            if (count[s] > max) max = count[s];
+    }   }
+
+    while (!count[maxSymbolValue]) maxSymbolValue--;
+    *maxSymbolValuePtr = maxSymbolValue;
+ return (size_t)max; +} + +/* HIST_countFast_wksp() : + * Same as HIST_countFast(), but using an externally provided scratch buffer. + * `workSpace` is a writable buffer which must be 4-bytes aligned, + * `workSpaceSize` must be >= HIST_WKSP_SIZE + */ +size_t HIST_countFast_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if (sourceSize < 1500) /* heuristic threshold */ + return HIST_count_simple(count, maxSymbolValuePtr, source, sourceSize); + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, trustInput, (U32*)workSpace); +} + +/* fast variant (unsafe : won't check if src contains values beyond count[] limit) */ +size_t HIST_countFast(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, tmpCounters, sizeof(tmpCounters)); +} + +/* HIST_count_wksp() : + * Same as HIST_count(), but using an externally provided scratch buffer. + * `workSpace` size must be table of >= HIST_WKSP_SIZE_U32 unsigned */ +size_t HIST_count_wksp(unsigned* count, unsigned* maxSymbolValuePtr, + const void* source, size_t sourceSize, + void* workSpace, size_t workSpaceSize) +{ + if ((size_t)workSpace & 3) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (workSpaceSize < HIST_WKSP_SIZE) return ERROR(workSpace_tooSmall); + if (*maxSymbolValuePtr < 255) + return HIST_count_parallel_wksp(count, maxSymbolValuePtr, source, sourceSize, checkMaxSymbolValue, (U32*)workSpace); + *maxSymbolValuePtr = 255; + return HIST_countFast_wksp(count, maxSymbolValuePtr, source, sourceSize, workSpace, workSpaceSize); +} + +size_t HIST_count(unsigned* count, unsigned* maxSymbolValuePtr, + const void* src, size_t srcSize) +{ + unsigned tmpCounters[HIST_WKSP_SIZE_U32]; + return HIST_count_wksp(count, maxSymbolValuePtr, src, srcSize, tmpCounters, sizeof(tmpCounters)); +} +/**** ended inlining compress/hist.c ****/ +/**** start inlining compress/huf_compress.c ****/ +/* ****************************************************************** + * Huffman encoder, part of New Generation Entropy library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * - Public forum : https://groups.google.com/forum/#!forum/lz4c + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+****************************************************************** */
+
+/* **************************************************************
+*  Compiler specifics
+****************************************************************/
+#ifdef _MSC_VER    /* Visual Studio */
+#  pragma warning(disable : 4127)        /* disable: C4127: conditional expression is constant */
+#endif
+
+
+/* **************************************************************
+*  Includes
+****************************************************************/
+#include <string.h>     /* memcpy, memset */
+#include <stdio.h>      /* printf (debug) */
+/**** skipping file: ../common/compiler.h ****/
+/**** skipping file: ../common/bitstream.h ****/
+/**** skipping file: hist.h ****/
+#define FSE_STATIC_LINKING_ONLY   /* FSE_optimalTableLog_internal */
+/**** skipping file: ../common/fse.h ****/
+#define HUF_STATIC_LINKING_ONLY
+/**** skipping file: ../common/huf.h ****/
+/**** skipping file: ../common/error_private.h ****/
+
+
+/* **************************************************************
+*  Error Management
+****************************************************************/
+#define HUF_isError ERR_isError
+#define HUF_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c)   /* use only *after* variable declarations */
+
+
+/* **************************************************************
+*  Utils
+****************************************************************/
+unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue)
+{
+    return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1);
+}
+
+
+/* *******************************************************
+*  HUF : Huffman block compression
+*********************************************************/
+/* HUF_compressWeights() :
+ * Same as FSE_compress(), but dedicated to huff0's weights compression.
+ * The use case needs much less stack memory.
+ * Note : all elements within weightTable are supposed to be <= HUF_TABLELOG_MAX.
+ */
+#define MAX_FSE_TABLELOG_FOR_HUFF_HEADER 6
+static size_t HUF_compressWeights (void* dst, size_t dstSize, const void* weightTable, size_t wtSize)
+{
+    BYTE* const ostart = (BYTE*) dst;
+    BYTE* op = ostart;
+    BYTE* const oend = ostart + dstSize;
+
+    unsigned maxSymbolValue = HUF_TABLELOG_MAX;
+    U32 tableLog = MAX_FSE_TABLELOG_FOR_HUFF_HEADER;
+
+    FSE_CTable CTable[FSE_CTABLE_SIZE_U32(MAX_FSE_TABLELOG_FOR_HUFF_HEADER, HUF_TABLELOG_MAX)];
+    BYTE scratchBuffer[1<<MAX_FSE_TABLELOG_FOR_HUFF_HEADER];
+
+    unsigned count[HUF_TABLELOG_MAX+1];
+    S16 norm[HUF_TABLELOG_MAX+1];
+
+    /* init conditions */
+    if (wtSize <= 1) return 0;  /* Not compressible */
+
+    /* Scan input and build symbol stats */
+    {   unsigned const maxCount = HIST_count_simple(count, &maxSymbolValue, weightTable, wtSize);   /* never fails */
+        if (maxCount == wtSize) return 1;   /* only a single symbol in src : rle */
+        if (maxCount == 1) return 0;        /* each symbol present maximum once => not compressible */
+    }
+
+    tableLog = FSE_optimalTableLog(tableLog, wtSize, maxSymbolValue);
+    CHECK_F( FSE_normalizeCount(norm, tableLog, count, wtSize, maxSymbolValue) );
+
+    /* Write table description header */
+    {   CHECK_V_F(hSize, FSE_writeNCount(op, (size_t)(oend-op), norm, maxSymbolValue, tableLog) );
+        op += hSize;
+    }
+
+    /* Compress */
+    CHECK_F( FSE_buildCTable_wksp(CTable, norm, maxSymbolValue, tableLog, scratchBuffer, sizeof(scratchBuffer)) );
+    {   CHECK_V_F(cSize, FSE_compress_usingCTable(op, (size_t)(oend - op), weightTable, wtSize, CTable) );
+        if (cSize == 0) return 0;   /* not enough space for compressed data */
+        op += cSize;
+    }
+
+    return (size_t)(op-ostart);
+}
+
+
+struct HUF_CElt_s {
+    U16  val;
+    BYTE nbBits;
+};   /* typedef'd to HUF_CElt within "huf.h" */
+
+/*! HUF_writeCTable() :
+    `CTable` : Huffman tree to save, using huf representation.
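+    In this representation each symbol is described by a 4-bit weight :
+    weight = huffLog + 1 - nbBits for a used symbol, and 0 for an unused one
+    (illustrative numbers : with huffLog==11, an 8-bit code becomes weight 4;
+    the conversion is the bitsToWeight[] table below). The weight stream is
+    then FSE-compressed, or 4-bit packed raw when that does not pay off.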
+   @return : size of saved CTable */
+size_t HUF_writeCTable (void* dst, size_t maxDstSize,
+                        const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog)
+{
+    BYTE bitsToWeight[HUF_TABLELOG_MAX + 1];   /* precomputed conversion table */
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX];
+    BYTE* op = (BYTE*)dst;
+    U32 n;
+
+    /* check conditions */
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge);
+
+    /* convert to weight */
+    bitsToWeight[0] = 0;
+    for (n=1; n<huffLog+1; n++)
+        bitsToWeight[n] = (BYTE)(huffLog + 1 - n);
+    for (n=0; n<maxSymbolValue; n++)
+        huffWeight[n] = bitsToWeight[CTable[n].nbBits];
+
+    /* attempt weights compression by FSE */
+    {   CHECK_V_F(hSize, HUF_compressWeights(op+1, maxDstSize-1, huffWeight, maxSymbolValue) );
+        if ((hSize>1) & (hSize < maxSymbolValue/2)) {   /* FSE compressed */
+            op[0] = (BYTE)hSize;
+            return hSize+1;
+    }   }
+
+    /* write raw values as 4-bits (max : 15) */
+    if (maxSymbolValue > (256-128)) return ERROR(GENERIC);   /* should not happen : likely means source cannot be compressed */
+    if (((maxSymbolValue+1)/2) + 1 > maxDstSize) return ERROR(dstSize_tooSmall);   /* not enough space within dst buffer */
+    op[0] = (BYTE)(128 /*special case*/ + (maxSymbolValue-1));
+    huffWeight[maxSymbolValue] = 0;   /* to be sure it doesn't cause msan issue in final combination */
+    for (n=0; n<maxSymbolValue; n+=2)
+        op[(n/2)+1] = (BYTE)((huffWeight[n] << 4) + huffWeight[n+1]);
+    return ((maxSymbolValue+1)/2) + 1;
+}
+
+
+size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights)
+{
+    BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1];   /* init not required, even though some static analyzer may complain */
+    U32 rankVal[HUF_TABLELOG_ABSOLUTEMAX + 1];   /* large enough for values from 0 to 16 */
+    U32 tableLog = 0;
+    U32 nbSymbols = 0;
+
+    /* get symbol weights */
+    CHECK_V_F(readSize, HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX+1, rankVal, &nbSymbols, &tableLog, src, srcSize));
+
+    /* check result */
+    if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge);
+    if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall);
+
+    /* Prepare base value per rank */
+    {   U32 n, nextRankStart = 0;
+        for (n=1; n<=tableLog; n++) {
+            U32 current = nextRankStart;
+            nextRankStart += (rankVal[n] << (n-1));
+            rankVal[n] = current;
+    }   }
+
+    /* fill nbBits */
+    *hasZeroWeights = 0;
+    {   U32 n; for (n=0; n<nbSymbols; n++) {
+            *hasZeroWeights |= (huffWeight[n] == 0);
+            CTable[n].nbBits = (BYTE)(tableLog + 1 - huffWeight[n]);
+    }   }
+
+    /* fill val */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+2]  = {0};  /* support w=0=>n=tableLog+1 */
+        U16 valPerRank[HUF_TABLELOG_MAX+2] = {0};
+        { U32 n; for (n=0; n<nbSymbols; n++) nbPerRank[CTable[n].nbBits]++; }
+        /* determine starting value per rank */
+        valPerRank[tableLog+1] = 0;   /* for w==0 */
+        {   U16 min = 0;
+            U32 n; for (n=tableLog; n>0; n--) {  /* start at n=tablelog <-> w=1 */
+                valPerRank[n] = min;     /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        /* assign value within rank, symbol order */
+        { U32 n; for (n=0; n<nbSymbols; n++) CTable[n].val = valPerRank[CTable[n].nbBits]++; }
+    }
+
+    *maxSymbolValuePtr = nbSymbols - 1;
+    return readSize;
+}
+
+U32 HUF_getNbBits(const void* symbolTable, U32 symbolValue)
+{
+    const HUF_CElt* table = (const HUF_CElt*)symbolTable;
+    assert(symbolValue <= HUF_SYMBOLVALUE_MAX);
+    return table[symbolValue].nbBits;
+}
+
+
+typedef struct nodeElt_s {
+    U32 count;
+    U16 parent;
+    BYTE byte;
+    BYTE nbBits;
+} nodeElt;
+
+static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits)
+{
+    const U32 largestBits = huffNode[lastNonNull].nbBits;
+    if (largestBits <= maxNbBits) return largestBits;   /* early exit : no elt > maxNbBits */
+
+    /* there are several too large elements (at least >= 2) */
+    {   int totalCost = 0;
+        const U32 baseCost = 1 << (largestBits - maxNbBits);
+        int n = (int)lastNonNull;
+
+        while (huffNode[n].nbBits > maxNbBits) {
+            totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits));
+            huffNode[n].nbBits = (BYTE)maxNbBits;
+            n --;
+        }  /* n stops at huffNode[n].nbBits <= maxNbBits */
+        while (huffNode[n].nbBits == maxNbBits) n--;   /* n end at index of smallest symbol using < maxNbBits */
+
+        /* renorm totalCost */
+        totalCost >>= (largestBits - maxNbBits);  /* note : totalCost is necessarily a multiple of baseCost */
+
+        /* repay normalized cost */
+        {   U32 const noSymbol = 0xF0F0F0F0;
+            U32 rankLast[HUF_TABLELOG_MAX+2];
+
+            /* Get pos of last (smallest) symbol per rank */
+            memset(rankLast, 0xF0, sizeof(rankLast));
+            {   U32 currentNbBits = maxNbBits;
+                int pos;
+                for (pos=n ; pos >= 0; pos--) {
+                    if (huffNode[pos].nbBits >= currentNbBits) continue;
+                    currentNbBits = huffNode[pos].nbBits;   /* < maxNbBits */
+                    rankLast[maxNbBits-currentNbBits] = (U32)pos;
+            }   }
+
+            while (totalCost > 0) {
+                U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1;
+                for ( ; nBitsToDecrease > 1; nBitsToDecrease--) {
+                    U32 const highPos = rankLast[nBitsToDecrease];
+                    U32 const lowPos = rankLast[nBitsToDecrease-1];
+                    if (highPos == noSymbol) continue;
+                    if (lowPos == noSymbol) break;
+                    {   U32 const highTotal = huffNode[highPos].count;
+                        U32 const lowTotal = 2 * huffNode[lowPos].count;
+                        if (highTotal <= lowTotal) break;
+                }   }
+                /* only triggered when no more rank 1 symbol left => find closest one (note : there is necessarily at least one !) 
*/ + /* HUF_MAX_TABLELOG test just to please gcc 5+; but it should not be necessary */ + while ((nBitsToDecrease<=HUF_TABLELOG_MAX) && (rankLast[nBitsToDecrease] == noSymbol)) + nBitsToDecrease ++; + totalCost -= 1 << (nBitsToDecrease-1); + if (rankLast[nBitsToDecrease-1] == noSymbol) + rankLast[nBitsToDecrease-1] = rankLast[nBitsToDecrease]; /* this rank is no longer empty */ + huffNode[rankLast[nBitsToDecrease]].nbBits ++; + if (rankLast[nBitsToDecrease] == 0) /* special case, reached largest symbol */ + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; + if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } } /* while (totalCost > 0) */ + + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ + if (rankLast[1] == noSymbol) { /* special case : no rank 1 symbol (using maxNbBits-1); let's create one from largest rank 0 (using maxNbBits) */ + while (huffNode[n].nbBits == maxNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); + totalCost++; + continue; + } + huffNode[ rankLast[1] + 1 ].nbBits--; + rankLast[1]++; + totalCost ++; + } } } /* there are several too large elements (at least >= 2) */ + + return maxNbBits; +} + +typedef struct { + U32 base; + U32 current; +} rankPos; + +typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; + +#define RANK_POSITION_TABLE_SIZE 32 + +typedef struct { + huffNodeTable huffNodeTbl; + rankPos rankPosition[RANK_POSITION_TABLE_SIZE]; +} HUF_buildCTable_wksp_tables; + +static void HUF_sort(nodeElt* huffNode, const unsigned* count, U32 maxSymbolValue, rankPos* rankPosition) +{ + U32 n; + + memset(rankPosition, 0, sizeof(*rankPosition) * RANK_POSITION_TABLE_SIZE); + for (n=0; n<=maxSymbolValue; n++) { + U32 r = BIT_highbit32(count[n] + 1); + rankPosition[r].base ++; + } + for (n=30; n>0; n--) rankPosition[n-1].base += rankPosition[n].base; + for (n=0; n<32; n++) rankPosition[n].current = rankPosition[n].base; + for (n=0; n<=maxSymbolValue; n++) { + U32 const c = count[n]; + U32 const r = BIT_highbit32(c+1) + 1; + U32 pos = rankPosition[r].current++; + while ((pos > rankPosition[r].base) && (c > huffNode[pos-1].count)) { + huffNode[pos] = huffNode[pos-1]; + pos--; + } + huffNode[pos].count = c; + huffNode[pos].byte = (BYTE)n; + } +} + + +/** HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
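+ *
+ *  Illustrative call (the buffers are caller-owned; tree, count and
+ *  maxSymbolValue refer to this function's own parameters) :
+ *      HUF_buildCTable_wksp_tables wksp;
+ *      size_t const maxBits = HUF_buildCTable_wksp(tree, count,
+ *                                  maxSymbolValue, HUF_TABLELOG_DEFAULT,
+ *                                  &wksp, sizeof(wksp));
+ *      if (HUF_isError(maxBits)) return maxBits;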
+ */
+#define STARTNODE (HUF_SYMBOLVALUE_MAX+1)
+
+size_t HUF_buildCTable_wksp (HUF_CElt* tree, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize)
+{
+    HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)workSpace;
+    nodeElt* const huffNode0 = wksp_tables->huffNodeTbl;
+    nodeElt* const huffNode = huffNode0+1;
+    int nonNullRank;
+    int lowS, lowN;
+    int nodeNb = STARTNODE;
+    int n, nodeRoot;
+
+    /* safety checks */
+    if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC);   /* must be aligned on 4-bytes boundaries */
+    if (wkspSize < sizeof(HUF_buildCTable_wksp_tables))
+        return ERROR(workSpace_tooSmall);
+    if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT;
+    if (maxSymbolValue > HUF_SYMBOLVALUE_MAX)
+        return ERROR(maxSymbolValue_tooLarge);
+    memset(huffNode0, 0, sizeof(huffNodeTable));
+
+    /* sort, decreasing order */
+    HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition);
+
+    /* init for parents */
+    nonNullRank = (int)maxSymbolValue;
+    while(huffNode[nonNullRank].count == 0) nonNullRank--;
+    lowS = nonNullRank; nodeRoot = nodeNb + lowS - 1; lowN = nodeNb;
+    huffNode[nodeNb].count = huffNode[lowS].count + huffNode[lowS-1].count;
+    huffNode[lowS].parent = huffNode[lowS-1].parent = (U16)nodeNb;
+    nodeNb++; lowS-=2;
+    for (n=nodeNb; n<=nodeRoot; n++) huffNode[n].count = (U32)(1U<<30);
+    huffNode0[0].count = (U32)(1U<<31);   /* fake entry, strong barrier */
+
+    /* create parents */
+    while (nodeNb <= nodeRoot) {
+        int const n1 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        int const n2 = (huffNode[lowS].count < huffNode[lowN].count) ? lowS-- : lowN++;
+        huffNode[nodeNb].count = huffNode[n1].count + huffNode[n2].count;
+        huffNode[n1].parent = huffNode[n2].parent = (U16)nodeNb;
+        nodeNb++;
+    }
+
+    /* distribute weights (unlimited tree height) */
+    huffNode[nodeRoot].nbBits = 0;
+    for (n=nodeRoot-1; n>=STARTNODE; n--)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+    for (n=0; n<=nonNullRank; n++)
+        huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1;
+
+    /* enforce maxTableLog */
+    maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits);
+
+    /* fill result into tree (val, nbBits) */
+    {   U16 nbPerRank[HUF_TABLELOG_MAX+1] = {0};
+        U16 valPerRank[HUF_TABLELOG_MAX+1] = {0};
+        int const alphabetSize = (int)(maxSymbolValue + 1);
+        if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC);   /* check fit into table */
+        for (n=0; n<=nonNullRank; n++)
+            nbPerRank[huffNode[n].nbBits]++;
+        /* determine starting value per rank */
+        {   U16 min = 0;
+            for (n=(int)maxNbBits; n>0; n--) {
+                valPerRank[n] = min;   /* get starting value within each rank */
+                min += nbPerRank[n];
+                min >>= 1;
+        }   }
+        for (n=0; n<alphabetSize; n++)
+            tree[huffNode[n].byte].nbBits = huffNode[n].nbBits;   /* push nbBits per symbol, symbol order */
+        for (n=0; n<alphabetSize; n++)
+            tree[n].val = valPerRank[tree[n].nbBits]++;   /* assign value within rank, symbol order */
+    }
+
+    return maxNbBits;
+}
+
+size_t HUF_buildCTable (HUF_CElt* tree, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits)
+{
+    HUF_buildCTable_wksp_tables workspace;
+    return HUF_buildCTable_wksp(tree, count, maxSymbolValue, maxNbBits, &workspace, sizeof(workspace));
+}
+
+size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue)
+{
+    size_t nbBits = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        nbBits += CTable[s].nbBits * count[s];
+    }
+    return nbBits >> 3;
+}
+
+int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) {
+    int bad = 0;
+    int s;
+    for (s = 0; s <= (int)maxSymbolValue; ++s) {
+        bad |= (count[s] != 0) & (CTable[s].nbBits == 0);
+    }
+    return !bad;
+}
+
+size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); }
+
+FORCE_INLINE_TEMPLATE void
+HUF_encodeSymbol(BIT_CStream_t* bitCPtr, U32 symbol, const HUF_CElt* CTable)
+{
+    BIT_addBitsFast(bitCPtr, CTable[symbol].val, CTable[symbol].nbBits);
+}
+
+#define HUF_FLUSHBITS(s)  BIT_flushBits(s)
+
+#define HUF_FLUSHBITS_1(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*2+7) HUF_FLUSHBITS(stream)
+
+#define HUF_FLUSHBITS_2(stream) \
+    if (sizeof((stream)->bitContainer)*8 < HUF_TABLELOG_MAX*4+7)
HUF_FLUSHBITS(stream) + +FORCE_INLINE_TEMPLATE size_t +HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + size_t n; + BIT_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ + { size_t const initErr = BIT_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + n = srcSize & ~3; /* join to mod 4 */ + switch (srcSize & 3) + { + case 3 : HUF_encodeSymbol(&bitC, ip[n+ 2], CTable); + HUF_FLUSHBITS_2(&bitC); + /* fall-through */ + case 2 : HUF_encodeSymbol(&bitC, ip[n+ 1], CTable); + HUF_FLUSHBITS_1(&bitC); + /* fall-through */ + case 1 : HUF_encodeSymbol(&bitC, ip[n+ 0], CTable); + HUF_FLUSHBITS(&bitC); + /* fall-through */ + case 0 : /* fall-through */ + default: break; + } + + for (; n>0; n-=4) { /* note : n&3==0 at this stage */ + HUF_encodeSymbol(&bitC, ip[n- 1], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 2], CTable); + HUF_FLUSHBITS_2(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 3], CTable); + HUF_FLUSHBITS_1(&bitC); + HUF_encodeSymbol(&bitC, ip[n- 4], CTable); + HUF_FLUSHBITS(&bitC); + } + + return BIT_closeCStream(&bitC); +} + +#if DYNAMIC_BMI2 + +static TARGET_ATTRIBUTE("bmi2") size_t +HUF_compress1X_usingCTable_internal_bmi2(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + if (bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +} + +#else + +static size_t +HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, const int bmi2) +{ + (void)bmi2; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); +} + +#endif + +size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + + +static size_t +HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable, int bmi2) +{ + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + if (dstSize < 6 + 1 + 1 + 1 + 8) return 0; /* minimum space to compress successfully */ + if (srcSize < 12) return 0; /* no saving possible : too small input */ + op += 6; /* jumpTable */ + + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart, 
(U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); + if (cSize==0) return 0; + assert(cSize <= 65535); + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; + } + + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); + { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); + if (cSize==0) return 0; + op += cSize; + } + + return (size_t)(op-ostart); +} + +size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +{ + return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +} + +typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + +static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, + HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) +{ + size_t const cSize = (nbStreams==HUF_singleStream) ? + HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : + HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; + /* check compressibility */ + assert(op >= ostart); + if ((size_t)(op-ostart) >= srcSize-1) { return 0; } + return (size_t)(op-ostart); +} + +typedef struct { + unsigned count[HUF_SYMBOLVALUE_MAX + 1]; + HUF_CElt CTable[HUF_SYMBOLVALUE_MAX + 1]; + HUF_buildCTable_wksp_tables buildCTable_wksp; +} HUF_compress_tables_t; + +/* HUF_compress_internal() : + * `workSpace` must a table of at least HUF_WORKSPACE_SIZE_U32 unsigned */ +static size_t +HUF_compress_internal (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, + HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, + const int bmi2) +{ + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)workSpace; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + + HUF_STATIC_ASSERT(sizeof(*table) <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ + if (((size_t)workSpace & 3) != 0) return ERROR(GENERIC); /* must be aligned on 4-bytes boundaries */ + if (wkspSize < HUF_WORKSPACE_SIZE) return ERROR(workSpace_tooSmall); + if (!srcSize) return 0; /* Uncompressed */ + if (!dstSize) return 0; /* cannot fit anything within dst budget */ + if (srcSize > HUF_BLOCKSIZE_MAX) return ERROR(srcSize_wrong); /* current block size limit */ + if (huffLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); + if (!maxSymbolValue) maxSymbolValue = HUF_SYMBOLVALUE_MAX; + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ + if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } + + /* 
Scan input and build symbol stats */ + { CHECK_V_F(largest, HIST_count_wksp (table->count, &maxSymbolValue, (const BYTE*)src, srcSize, workSpace, wkspSize) ); + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } + + /* Check validity of previous table */ + if ( repeat + && *repeat == HUF_repeat_check + && !HUF_validateCTable(oldHufTable, table->count, maxSymbolValue)) { + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ + if (preferRepeat && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } + + /* Build Huffman Tree */ + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->buildCTable_wksp, sizeof(table->buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; + /* Zero unused symbols in CTable, so we can check it for validity */ + memset(table->CTable + (maxSymbolValue + 1), 0, + sizeof(table->CTable) - ((maxSymbolValue + 1) * sizeof(HUF_CElt))); + } + + /* Write table description header */ + { CHECK_V_F(hSize, HUF_writeCTable (op, dstSize, table->CTable, maxSymbolValue, huffLog) ); + /* Check if using previous huffman table is beneficial */ + if (repeat && *repeat != HUF_repeat_none) { + size_t const oldSize = HUF_estimateCompressedSize(oldHufTable, table->count, maxSymbolValue); + size_t const newSize = HUF_estimateCompressedSize(table->CTable, table->count, maxSymbolValue); + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, oldHufTable, bmi2); + } } + + /* Use the new huffman table */ + if (hSize + 12ul >= srcSize) { return 0; } + op += hSize; + if (repeat) { *repeat = HUF_repeat_none; } + if (oldHufTable) + memcpy(oldHufTable, table->CTable, sizeof(table->CTable)); /* Save new table */ + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, + nbStreams, table->CTable, bmi2); +} + + +size_t HUF_compress1X_wksp (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/); +} + +size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, + repeat, preferRepeat, bmi2); +} + +size_t HUF_compress1X (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog) +{ + unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; + return HUF_compress1X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); +} + +/* HUF_compress4X_repeat(): + * compress input using 4 streams. 
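+ * (Format note, visible in HUF_compress4X_usingCTable_internal above : the 4-stream
+ * variant writes a 6-byte jump table first, i.e. three little-endian U16 compressed
+ * sizes for streams 1-3 ; the size of the 4th stream is implied by the total.)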
+ * provide workspace to generate compression tables */ +size_t HUF_compress4X_wksp (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, + NULL, NULL, 0, 0 /*bmi2*/); +} + +/* HUF_compress4X_repeat(): + * compress input using 4 streams. + * re-use an existing huffman compression table */ +size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, + HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2) +{ + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, + hufTable, repeat, preferRepeat, bmi2); +} + +size_t HUF_compress2 (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog) +{ + unsigned workSpace[HUF_WORKSPACE_SIZE_U32]; + return HUF_compress4X_wksp(dst, dstSize, src, srcSize, maxSymbolValue, huffLog, workSpace, sizeof(workSpace)); +} + +size_t HUF_compress (void* dst, size_t maxDstSize, const void* src, size_t srcSize) +{ + return HUF_compress2(dst, maxDstSize, src, srcSize, 255, HUF_TABLELOG_DEFAULT); +} +/**** ended inlining compress/huf_compress.c ****/ +/**** start inlining compress/zstd_compress_literals.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +/**** start inlining zstd_compress_literals.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPRESS_LITERALS_H +#define ZSTD_COMPRESS_LITERALS_H + +/**** start inlining zstd_compress_internal.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* This header contains definitions + * that shall **only** be used by modules within lib/compress. + */ + +#ifndef ZSTD_COMPRESS_H +#define ZSTD_COMPRESS_H + +/*-************************************* +* Dependencies +***************************************/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** start inlining zstd_cwksp.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_CWKSP_H +#define ZSTD_CWKSP_H + +/*-************************************* +* Dependencies +***************************************/ +/**** skipping file: ../common/zstd_internal.h ****/ + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-************************************* +* Constants +***************************************/ + +/* Since the workspace is effectively its own little malloc implementation / + * arena, when we run under ASAN, we should similarly insert redzones between + * each internal element of the workspace, so ASAN will catch overruns that + * reach outside an object but that stay inside the workspace. + * + * This defines the size of that redzone. + */ +#ifndef ZSTD_CWKSP_ASAN_REDZONE_SIZE +#define ZSTD_CWKSP_ASAN_REDZONE_SIZE 128 +#endif + +/*-************************************* +* Structures +***************************************/ +typedef enum { + ZSTD_cwksp_alloc_objects, + ZSTD_cwksp_alloc_buffers, + ZSTD_cwksp_alloc_aligned +} ZSTD_cwksp_alloc_phase_e; + +/** + * Zstd fits all its internal datastructures into a single continuous buffer, + * so that it only needs to perform a single OS allocation (or so that a buffer + * can be provided to it and it can perform no allocations at all). This buffer + * is called the workspace. + * + * Several optimizations complicate that process of allocating memory ranges + * from this workspace for each internal datastructure: + * + * - These different internal datastructures have different setup requirements: + * + * - The static objects need to be cleared once and can then be trivially + * reused for each compression. + * + * - Various buffers don't need to be initialized at all--they are always + * written into before they're read. + * + * - The matchstate tables have a unique requirement that they don't need + * their memory to be totally cleared, but they do need the memory to have + * some bound, i.e., a guarantee that all values in the memory they've been + * allocated is less than some maximum value (which is the starting value + * for the indices that they will then use for compression). When this + * guarantee is provided to them, they can use the memory without any setup + * work. When it can't, they have to clear the area. + * + * - These buffers also have different alignment requirements. + * + * - We would like to reuse the objects in the workspace for multiple + * compressions without having to perform any expensive reallocation or + * reinitialization work. + * + * - We would like to be able to efficiently reuse the workspace across + * multiple compressions **even when the compression parameters change** and + * we need to resize some of the objects (where possible). + * + * To attempt to manage this buffer, given these constraints, the ZSTD_cwksp + * abstraction was created. It works as follows: + * + * Workspace Layout: + * + * [ ... workspace ... ] + * [objects][tables ... ->] free space [<- ... aligned][<- ... 
buffers] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: + * + * - Static objects: this is optionally the enclosing ZSTD_CCtx or ZSTD_CDict, + * so that literally everything fits in a single buffer. Note: if present, + * this must be the first object in the workspace, since ZSTD_free{CCtx, + * CDict}() rely on a pointer comparison to see whether one or two frees are + * required. + * + * - Fixed size objects: these are fixed-size, fixed-count objects that are + * nonetheless "dynamically" allocated in the workspace so that we can + * control how they're initialized separately from the broader ZSTD_CCtx. + * Examples: + * - Entropy Workspace + * - 2 x ZSTD_compressedBlockState_t + * - CDict dictionary contents + * + * - Tables: these are any of several different datastructures (hash tables, + * chain tables, binary trees) that all respect a common format: they are + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. + * + * - Aligned: these buffers are used for various purposes that require 4 byte + * alignment, but don't require any initialization before they're used. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can + * be moved around at no cost for a new compression. + * + * Allocating Memory: + * + * The various types of objects must be allocated in order, so they can be + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects + * 2. Buffers + * 3. Aligned + * 4. Tables + * + * Attempts to reserve objects of different types out of order will fail. + */ +typedef struct { + void* workspace; + void* workspaceEnd; + + void* objectEnd; + void* tableEnd; + void* tableValidEnd; + void* allocStart; + + int allocFailed; + int workspaceOversizedDuration; + ZSTD_cwksp_alloc_phase_e phase; +} ZSTD_cwksp; + +/*-************************************* +* Functions +***************************************/ + +MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); + +MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; + assert(ws->workspace <= ws->objectEnd); + assert(ws->objectEnd <= ws->tableEnd); + assert(ws->objectEnd <= ws->tableValidEnd); + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); +} + +/** + * Align must be a power of 2. + */ +MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + size_t const mask = align - 1; + assert((align & mask) == 0); + return (size + mask) & ~mask; +} + +/** + * Use this to determine how much space in the workspace we will consume to + * allocate this object. (Normally it should be exactly the size of the object, + * but under special conditions, like ASAN, where we pad each object, it might + * be larger.) + * + * Since tables aren't currently redzoned, you don't need to call through this + * to figure out how much space you need for the matchState tables. Everything + * else is though. 
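+ * For example, with the default ZSTD_CWKSP_ASAN_REDZONE_SIZE of 128 defined above,
+ * a 1024-byte reservation consumes 1024 + 2*128 == 1280 bytes of workspace under
+ * ASAN, and exactly 1024 bytes in a normal build.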
+ */ +MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + return size + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#else + return size; +#endif +} + +MEM_STATIC void ZSTD_cwksp_internal_advance_phase( + ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase) { + assert(phase >= ws->phase); + if (phase > ws->phase) { + if (ws->phase < ZSTD_cwksp_alloc_buffers && + phase >= ZSTD_cwksp_alloc_buffers) { + ws->tableValidEnd = ws->objectEnd; + } + if (ws->phase < ZSTD_cwksp_alloc_aligned && + phase >= ZSTD_cwksp_alloc_aligned) { + /* If unaligned allocations down from a too-large top have left us + * unaligned, we need to realign our alloc ptr. Technically, this + * can consume space that is unaccounted for in the neededSpace + * calculation. However, I believe this can only happen when the + * workspace is too large, and specifically when it is too large + * by a larger margin than the space that will be consumed. */ + /* TODO: cleaner, compiler warning friendly way to do this??? */ + ws->allocStart = (BYTE*)ws->allocStart - ((size_t)ws->allocStart & (sizeof(U32)-1)); + if (ws->allocStart < ws->tableValidEnd) { + ws->tableValidEnd = ws->allocStart; + } + } + ws->phase = phase; + } +} + +/** + * Returns whether this object/buffer/etc was allocated in this workspace. + */ +MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) { + return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); +} + +/** + * Internal function. Do not use directly. + */ +MEM_STATIC void* ZSTD_cwksp_reserve_internal( + ZSTD_cwksp* ws, size_t bytes, ZSTD_cwksp_alloc_phase_e phase) { + void* alloc; + void* bottom = ws->tableEnd; + ZSTD_cwksp_internal_advance_phase(ws, phase); + alloc = (BYTE *)ws->allocStart - bytes; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* over-reserve space */ + alloc = (BYTE *)alloc - 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#endif + + DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); + if (alloc < bottom) { + DEBUGLOG(4, "cwksp: alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + if (alloc < ws->tableValidEnd) { + ws->tableValidEnd = alloc; + } + ws->allocStart = alloc; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on + * either size. */ + alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + __asan_unpoison_memory_region(alloc, bytes); +#endif + + return alloc; +} + +/** + * Reserves and returns unaligned memory. + */ +MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) { + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); +} + +/** + * Reserves and returns memory sized on and aligned on sizeof(unsigned). + */ +MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) { + assert((bytes & (sizeof(U32)-1)) == 0); + return ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, sizeof(U32)), ZSTD_cwksp_alloc_aligned); +} + +/** + * Aligned on sizeof(unsigned). These buffers have the special property that + * their values remain constrained, allowing us to re-use them without + * memset()-ing them. 
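+ * (This relies on the matchstate-table guarantee described in the workspace layout
+ * notes : entries are U32 match indices bounded below the next compression's starting
+ * index, so values surviving from an earlier compression read as very old candidates
+ * rather than garbage ; when that guarantee cannot be provided, the tables are zeroed
+ * instead, see ZSTD_cwksp_clean_tables().)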
+ */ +MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) { + const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; + void* alloc = ws->tableEnd; + void* end = (BYTE *)alloc + bytes; + void* top = ws->allocStart; + + DEBUGLOG(5, "cwksp: reserving %p table %zd bytes, %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + assert((bytes & (sizeof(U32)-1)) == 0); + ZSTD_cwksp_internal_advance_phase(ws, phase); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(end <= top); + if (end > top) { + DEBUGLOG(4, "cwksp: table alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + ws->tableEnd = end; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + __asan_unpoison_memory_region(alloc, bytes); +#endif + + return alloc; +} + +/** + * Aligned on sizeof(void*). + */ +MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) { + size_t roundedBytes = ZSTD_cwksp_align(bytes, sizeof(void*)); + void* alloc = ws->objectEnd; + void* end = (BYTE*)alloc + roundedBytes; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* over-reserve space */ + end = (BYTE *)end + 2 * ZSTD_CWKSP_ASAN_REDZONE_SIZE; +#endif + + DEBUGLOG(5, + "cwksp: reserving %p object %zd bytes (rounded to %zd), %zd bytes remaining", + alloc, bytes, roundedBytes, ZSTD_cwksp_available_space(ws) - roundedBytes); + assert(((size_t)alloc & (sizeof(void*)-1)) == 0); + assert((bytes & (sizeof(void*)-1)) == 0); + ZSTD_cwksp_assert_internal_consistency(ws); + /* we must be in the first phase, no advance is possible */ + if (ws->phase != ZSTD_cwksp_alloc_objects || end > ws->workspaceEnd) { + DEBUGLOG(4, "cwksp: object alloc failed!"); + ws->allocFailed = 1; + return NULL; + } + ws->objectEnd = end; + ws->tableEnd = end; + ws->tableValidEnd = end; + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + /* Move alloc so there's ZSTD_CWKSP_ASAN_REDZONE_SIZE unused space on + * either size. */ + alloc = (BYTE *)alloc + ZSTD_CWKSP_ASAN_REDZONE_SIZE; + __asan_unpoison_memory_region(alloc, bytes); +#endif + + return alloc; +} + +MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_dirty"); + +#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table re-use logic is sound, and that we don't + * access table space that we haven't cleaned, we re-"poison" the table + * space every time we mark it dirty. */ + { + size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; + assert(__msan_test_shadow(ws->objectEnd, size) == -1); + __msan_poison(ws->objectEnd, size); + } +#endif + + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + ws->tableValidEnd = ws->objectEnd; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +MEM_STATIC void ZSTD_cwksp_mark_tables_clean(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: ZSTD_cwksp_mark_tables_clean"); + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; + } + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * Zero the part of the allocated tables not already marked clean. 
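+ * Concretely (per the body below) : only the span [tableValidEnd, tableEnd) is
+ * memset to zero ; everything before tableValidEnd is already known to be clean.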
+ */ +MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: ZSTD_cwksp_clean_tables"); + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { + memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); + } + ZSTD_cwksp_mark_tables_clean(ws); +} + +/** + * Invalidates table allocations. + * All other allocations remain valid. + */ +MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: clearing tables!"); + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + { + size_t size = (BYTE*)ws->tableValidEnd - (BYTE*)ws->objectEnd; + __asan_poison_memory_region(ws->objectEnd, size); + } +#endif + + ws->tableEnd = ws->objectEnd; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * Invalidates all buffer, aligned, and table allocations. + * Object allocations remain valid. + */ +MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + DEBUGLOG(4, "cwksp: clearing!"); + +#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the context re-use logic is sound, and that we don't + * access stuff that this compression hasn't initialized, we re-"poison" + * the workspace (or at least the non-static, non-table parts of it) + * every time we start a new compression. */ + { + size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->tableValidEnd; + __msan_poison(ws->tableValidEnd, size); + } +#endif + +#if defined (ADDRESS_SANITIZER) && !defined (ZSTD_ASAN_DONT_POISON_WORKSPACE) + { + size_t size = (BYTE*)ws->workspaceEnd - (BYTE*)ws->objectEnd; + __asan_poison_memory_region(ws->objectEnd, size); + } +#endif + + ws->tableEnd = ws->objectEnd; + ws->allocStart = ws->workspaceEnd; + ws->allocFailed = 0; + if (ws->phase > ZSTD_cwksp_alloc_buffers) { + ws->phase = ZSTD_cwksp_alloc_buffers; + } + ZSTD_cwksp_assert_internal_consistency(ws); +} + +/** + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed + * buffer, if present, must be separately freed). + */ +MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size) { + DEBUGLOG(4, "cwksp: init'ing workspace with %zd bytes", size); + assert(((size_t)start & (sizeof(void*)-1)) == 0); /* ensure correct alignment */ + ws->workspace = start; + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; + ws->phase = ZSTD_cwksp_alloc_objects; + ZSTD_cwksp_clear(ws); + ws->workspaceOversizedDuration = 0; + ZSTD_cwksp_assert_internal_consistency(ws); +} + +MEM_STATIC size_t ZSTD_cwksp_create(ZSTD_cwksp* ws, size_t size, ZSTD_customMem customMem) { + void* workspace = ZSTD_malloc(size, customMem); + DEBUGLOG(4, "cwksp: creating new workspace with %zd bytes", size); + RETURN_ERROR_IF(workspace == NULL, memory_allocation, "NULL pointer!"); + ZSTD_cwksp_init(ws, workspace, size); + return 0; +} + +MEM_STATIC void ZSTD_cwksp_free(ZSTD_cwksp* ws, ZSTD_customMem customMem) { + void *ptr = ws->workspace; + DEBUGLOG(4, "cwksp: freeing workspace"); + memset(ws, 0, sizeof(ZSTD_cwksp)); + ZSTD_free(ptr, customMem); +} + +/** + * Moves the management of a workspace from one cwksp to another. The src cwksp + * is left in an invalid state (src must be re-init()'ed before its used again). 
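+ * A short sketch (names are illustrative only) :
+ *     ZSTD_cwksp ws, tmp;
+ *     ZSTD_cwksp_init(&tmp, buffer, bufferSize);
+ *     ZSTD_cwksp_move(&ws, &tmp);   // ws now manages buffer ; tmp is zeroed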
+ */
+MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) {
+    *dst = *src;
+    memset(src, 0, sizeof(ZSTD_cwksp));
+}
+
+MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace);
+}
+
+MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) {
+    return ws->allocFailed;
+}
+
+/*-*************************************
+* Functions Checking Free Space
+***************************************/
+
+MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws) {
+    return (size_t)((BYTE*)ws->allocStart - (BYTE*)ws->tableEnd);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_available(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_available_space(ws) >= additionalNeededSpace;
+}
+
+MEM_STATIC int ZSTD_cwksp_check_too_large(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_available(
+        ws, additionalNeededSpace * ZSTD_WORKSPACETOOLARGE_FACTOR);
+}
+
+MEM_STATIC int ZSTD_cwksp_check_wasteful(ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    return ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)
+        && ws->workspaceOversizedDuration > ZSTD_WORKSPACETOOLARGE_MAXDURATION;
+}
+
+MEM_STATIC void ZSTD_cwksp_bump_oversized_duration(
+        ZSTD_cwksp* ws, size_t additionalNeededSpace) {
+    if (ZSTD_cwksp_check_too_large(ws, additionalNeededSpace)) {
+        ws->workspaceOversizedDuration++;
+    } else {
+        ws->workspaceOversizedDuration = 0;
+    }
+}
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_CWKSP_H */
+/**** ended inlining zstd_cwksp.h ****/
+#ifdef ZSTD_MULTITHREAD
+/**** start inlining zstdmt_compress.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+ #ifndef ZSTDMT_COMPRESS_H
+ #define ZSTDMT_COMPRESS_H
+
+ #if defined (__cplusplus)
+ extern "C" {
+ #endif
+
+
+/* Note : This is an internal API.
+ *        These APIs used to be exposed with ZSTDLIB_API,
+ *        because it used to be the only way to invoke MT compression.
+ *        Now, it's recommended to use ZSTD_compress2 and ZSTD_compressStream2()
+ *        instead.
+ *
+ *        If you depend on these APIs and can't switch, then define
+ *        ZSTD_LEGACY_MULTITHREADED_API when making the dynamic library.
+ *        However, we may completely remove these functions in a future
+ *        release, so please switch soon.
+ *
+ *        This API requires ZSTD_MULTITHREAD to be defined during compilation,
+ *        otherwise ZSTDMT_createCCtx*() will fail.
+ */
+
+#ifdef ZSTD_LEGACY_MULTITHREADED_API
+#  define ZSTDMT_API ZSTDLIB_API
+#else
+#  define ZSTDMT_API
+#endif
+
+/* ===   Dependencies   === */
+#include <stddef.h>   /* size_t */
+#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_parameters */
+/**** skipping file: ../zstd.h ****/
+
+
+/* ===   Constants   === */
+#ifndef ZSTDMT_NBWORKERS_MAX
+#  define ZSTDMT_NBWORKERS_MAX 200
+#endif
+#ifndef ZSTDMT_JOBSIZE_MIN
+#  define ZSTDMT_JOBSIZE_MIN (1 MB)
+#endif
+#define ZSTDMT_JOBLOG_MAX   (MEM_32bits() ? 29 : 30)
+#define ZSTDMT_JOBSIZE_MAX  (MEM_32bits() ? (512 MB) : (1024 MB))
+
+
+/* ===   Memory management   === */
+typedef struct ZSTDMT_CCtx_s ZSTDMT_CCtx;
+/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL.
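+   A minimal sketch (hypothetical caller) :
+       ZSTDMT_CCtx* const mtctx = ZSTDMT_createCCtx(4);   // request 4 worker threads
+       if (mtctx == NULL) return 1;   // NULL : built without ZSTD_MULTITHREAD, or allocation failed
+       ZSTDMT_freeCCtx(mtctx);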
*/ +ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx(unsigned nbWorkers); +/* Requires ZSTD_MULTITHREAD to be defined during compilation, otherwise it will return NULL. */ +ZSTDMT_API ZSTDMT_CCtx* ZSTDMT_createCCtx_advanced(unsigned nbWorkers, + ZSTD_customMem cMem); +ZSTDMT_API size_t ZSTDMT_freeCCtx(ZSTDMT_CCtx* mtctx); + +ZSTDMT_API size_t ZSTDMT_sizeof_CCtx(ZSTDMT_CCtx* mtctx); + + +/* === Simple one-pass compression function === */ + +ZSTDMT_API size_t ZSTDMT_compressCCtx(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + + + +/* === Streaming functions === */ + +ZSTDMT_API size_t ZSTDMT_initCStream(ZSTDMT_CCtx* mtctx, int compressionLevel); +ZSTDMT_API size_t ZSTDMT_resetCStream(ZSTDMT_CCtx* mtctx, unsigned long long pledgedSrcSize); /**< if srcSize is not known at reset time, use ZSTD_CONTENTSIZE_UNKNOWN. Note: for compatibility with older programs, 0 means the same as ZSTD_CONTENTSIZE_UNKNOWN, but it will change in the future to mean "empty" */ + +ZSTDMT_API size_t ZSTDMT_nextInputSizeHint(const ZSTDMT_CCtx* mtctx); +ZSTDMT_API size_t ZSTDMT_compressStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + +ZSTDMT_API size_t ZSTDMT_flushStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ +ZSTDMT_API size_t ZSTDMT_endStream(ZSTDMT_CCtx* mtctx, ZSTD_outBuffer* output); /**< @return : 0 == all flushed; >0 : still some data to be flushed; or an error code (ZSTD_isError()) */ + + +/* === Advanced functions and parameters === */ + +ZSTDMT_API size_t ZSTDMT_compress_advanced(ZSTDMT_CCtx* mtctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, + ZSTD_parameters params, + int overlapLog); + +ZSTDMT_API size_t ZSTDMT_initCStream_advanced(ZSTDMT_CCtx* mtctx, + const void* dict, size_t dictSize, /* dict can be released after init, a local copy is preserved within zcs */ + ZSTD_parameters params, + unsigned long long pledgedSrcSize); /* pledgedSrcSize is optional and can be zero == unknown */ + +ZSTDMT_API size_t ZSTDMT_initCStream_usingCDict(ZSTDMT_CCtx* mtctx, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fparams, + unsigned long long pledgedSrcSize); /* note : zero means empty */ + +/* ZSTDMT_parameter : + * List of parameters that can be set using ZSTDMT_setMTCtxParameter() */ +typedef enum { + ZSTDMT_p_jobSize, /* Each job is compressed in parallel. By default, this value is dynamically determined depending on compression parameters. Can be set explicitly here. */ + ZSTDMT_p_overlapLog, /* Each job may reload a part of previous job to enhance compression ratio; 0 == no overlap, 6(default) == use 1/8th of window, >=9 == use full window. This is a "sticky" parameter : its value will be re-used on next compression job */ + ZSTDMT_p_rsyncable /* Enables rsyncable mode. */ +} ZSTDMT_parameter; + +/* ZSTDMT_setMTCtxParameter() : + * allow setting individual parameters, one at a time, among a list of enums defined in ZSTDMT_parameter. + * The function must be called typically after ZSTD_createCCtx() but __before ZSTDMT_init*() !__ + * Parameters not explicitly reset by ZSTDMT_init*() remain the same in consecutive compression sessions. 
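+ * e.g. ZSTDMT_setMTCtxParameter(mtctx, ZSTDMT_p_overlapLog, 6) re-selects the default
+ * overlap (1/8th of the window) for all subsequent jobs, per the enum notes above.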
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDMT_API size_t ZSTDMT_setMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int value); + +/* ZSTDMT_getMTCtxParameter() : + * Query the ZSTDMT_CCtx for a parameter value. + * @return : 0, or an error code (which can be tested using ZSTD_isError()) */ +ZSTDMT_API size_t ZSTDMT_getMTCtxParameter(ZSTDMT_CCtx* mtctx, ZSTDMT_parameter parameter, int* value); + + +/*! ZSTDMT_compressStream_generic() : + * Combines ZSTDMT_compressStream() with optional ZSTDMT_flushStream() or ZSTDMT_endStream() + * depending on flush directive. + * @return : minimum amount of data still to be flushed + * 0 if fully flushed + * or an error code + * note : needs to be init using any ZSTD_initCStream*() variant */ +ZSTDMT_API size_t ZSTDMT_compressStream_generic(ZSTDMT_CCtx* mtctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp); + + +/* ======================================================== + * === Private interface, for use by ZSTD_compress.c === + * === Not exposed in libzstd. Never invoke directly === + * ======================================================== */ + + /*! ZSTDMT_toFlushNow() + * Tell how many bytes are ready to be flushed immediately. + * Probe the oldest active job (not yet entirely flushed) and check its output buffer. + * If return 0, it means there is no active job, + * or, it means oldest job is still active, but everything produced has been flushed so far, + * therefore flushing is limited by speed of oldest job. */ +size_t ZSTDMT_toFlushNow(ZSTDMT_CCtx* mtctx); + +/*! ZSTDMT_CCtxParam_setMTCtxParameter() + * like ZSTDMT_setMTCtxParameter(), but into a ZSTD_CCtx_Params */ +size_t ZSTDMT_CCtxParam_setMTCtxParameter(ZSTD_CCtx_params* params, ZSTDMT_parameter parameter, int value); + +/*! ZSTDMT_CCtxParam_setNbWorkers() + * Set nbWorkers, and clamp it. + * Also reset jobSize and overlapLog */ +size_t ZSTDMT_CCtxParam_setNbWorkers(ZSTD_CCtx_params* params, unsigned nbWorkers); + +/*! ZSTDMT_updateCParams_whileCompressing() : + * Updates only a selected set of compression parameters, to remain compatible with current frame. + * New parameters will be applied to next compression job. */ +void ZSTDMT_updateCParams_whileCompressing(ZSTDMT_CCtx* mtctx, const ZSTD_CCtx_params* cctxParams); + +/*! ZSTDMT_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads. + */ +ZSTD_frameProgression ZSTDMT_getFrameProgression(ZSTDMT_CCtx* mtctx); + + +/*! ZSTDMT_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. + * @return : 0, or an error code */ +size_t ZSTDMT_initCStream_internal(ZSTDMT_CCtx* zcs, + const void* dict, size_t dictSize, ZSTD_dictContentType_e dictContentType, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, unsigned long long pledgedSrcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTDMT_COMPRESS_H */ +/**** ended inlining zstdmt_compress.h ****/ +#endif + +#if defined (__cplusplus) +extern "C" { +#endif + + +/*-************************************* +* Constants +***************************************/ +#define kSearchStrength 8 +#define HASH_READ_SIZE 8 +#define ZSTD_DUBT_UNSORTED_MARK 1 /* For btlazy2 strategy, index ZSTD_DUBT_UNSORTED_MARK==1 means "unsorted". 
+ It could be confused for a real successor at index "1", if sorted as larger than its predecessor. + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. + The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. + This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +/*-************************************* +* Context memory management +***************************************/ +typedef enum { ZSTDcs_created=0, ZSTDcs_init, ZSTDcs_ongoing, ZSTDcs_ending } ZSTD_compressionStage_e; +typedef enum { zcss_init=0, zcss_load, zcss_flush } ZSTD_cStreamStage; + +typedef struct ZSTD_prefixDict_s { + const void* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; +} ZSTD_prefixDict; + +typedef struct { + void* dictBuffer; + void const* dict; + size_t dictSize; + ZSTD_dictContentType_e dictContentType; + ZSTD_CDict* cdict; +} ZSTD_localDict; + +typedef struct { + U32 CTable[HUF_CTABLE_SIZE_U32(255)]; + HUF_repeat repeatMode; +} ZSTD_hufCTables_t; + +typedef struct { + FSE_CTable offcodeCTable[FSE_CTABLE_SIZE_U32(OffFSELog, MaxOff)]; + FSE_CTable matchlengthCTable[FSE_CTABLE_SIZE_U32(MLFSELog, MaxML)]; + FSE_CTable litlengthCTable[FSE_CTABLE_SIZE_U32(LLFSELog, MaxLL)]; + FSE_repeat offcode_repeatMode; + FSE_repeat matchlength_repeatMode; + FSE_repeat litlength_repeatMode; +} ZSTD_fseCTables_t; + +typedef struct { + ZSTD_hufCTables_t huf; + ZSTD_fseCTables_t fse; +} ZSTD_entropyCTables_t; + +typedef struct { + U32 off; + U32 len; +} ZSTD_match_t; + +typedef struct { + int price; + U32 off; + U32 mlen; + U32 litlen; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_optimal_t; + +typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + +typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ + ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ + ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ + U32 matchLengthSum; /* nb of matchLength codes */ + U32 offCodeSum; /* nb of offset codes */ + U32 litSumBasePrice; /* to compare to log2(litfreq) */ + U32 litLengthSumBasePrice; /* to compare to log2(llfreq) */ + U32 matchLengthSumBasePrice;/* to compare to log2(mlfreq) */ + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ + ZSTD_literalCompressionMode_e literalCompressionMode; +} optState_t; + +typedef struct { + ZSTD_entropyCTables_t entropy; + U32 rep[ZSTD_REP_NUM]; +} ZSTD_compressedBlockState_t; + +typedef struct { + BYTE const* nextSrc; /* next block here to continue on current prefix */ + BYTE const* base; /* All regular indexes relative to this position */ + BYTE const* dictBase; /* extDict indexes relative to this position */ + U32 dictLimit; 
/* below that point, need extDict */ + U32 lowLimit; /* below that point, no more valid data */ +} ZSTD_window_t; + +typedef struct ZSTD_matchState_t ZSTD_matchState_t; +struct ZSTD_matchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. + * This relies on a mechanism to set loadedDictEnd=0 when dictionary is no longer within distance. + * Such mechanism is provided within ZSTD_window_enforceMaxDist() and ZSTD_checkDictValidity(). + * When dict referential is copied into active context (i.e. not attached), + * loadedDictEnd == dictSize, since referential starts from zero. + */ + U32 nextToUpdate; /* index from which to continue table update */ + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + U32* hashTable; + U32* hashTable3; + U32* chainTable; + optState_t opt; /* optimal parser state */ + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; +}; + +typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; + ZSTD_matchState_t matchState; +} ZSTD_blockState_t; + +typedef struct { + U32 offset; + U32 checksum; +} ldmEntry_t; + +typedef struct { + ZSTD_window_t window; /* State for the window round buffer management */ + ldmEntry_t* hashTable; + U32 loadedDictEnd; + BYTE* bucketOffsets; /* Next position in bucket to insert entry */ + U64 hashPower; /* Used to compute the rolling hash. + * Depends on ldmParams.minMatchLength */ +} ldmState_t; + +typedef struct { + U32 enableLdm; /* 1 if enable long distance matching */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ + U32 hashRateLog; /* Log number of entries to skip */ + U32 windowLog; /* Window log for the LDM */ +} ldmParams_t; + +typedef struct { + U32 offset; + U32 litLength; + U32 matchLength; +} rawSeq; + +typedef struct { + rawSeq* seq; /* The start of the sequences */ + size_t pos; /* The position where reading stopped. <= size. */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +} rawSeqStore_t; + +typedef struct { + int collectSequences; + ZSTD_Sequence* seqStart; + size_t seqIndex; + size_t maxSequences; +} SeqCollector; + +struct ZSTD_CCtx_params_s { + ZSTD_format_e format; + ZSTD_compressionParameters cParams; + ZSTD_frameParameters fParams; + + int compressionLevel; + int forceWindow; /* force back-references to respect limit of + * 1< 63) ? 
ZSTD_highbit32(litLength) + LL_deltaCode : LL_Code[litLength]; +} + +/* ZSTD_MLcode() : + * note : mlBase = matchLength - MINMATCH; + * because it's the format it's stored in seqStore->sequences */ +MEM_STATIC U32 ZSTD_MLcode(U32 mlBase) +{ + static const BYTE ML_Code[128] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + 32, 32, 33, 33, 34, 34, 35, 35, 36, 36, 36, 36, 37, 37, 37, 37, + 38, 38, 38, 38, 38, 38, 38, 38, 39, 39, 39, 39, 39, 39, 39, 39, + 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, 40, + 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, 41, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, + 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42, 42 }; + static const U32 ML_deltaCode = 36; + return (mlBase > 127) ? ZSTD_highbit32(mlBase) + ML_deltaCode : ML_Code[mlBase]; +} + +typedef struct repcodes_s { + U32 rep[3]; +} repcodes_t; + +MEM_STATIC repcodes_t ZSTD_updateRep(U32 const rep[3], U32 const offset, U32 const ll0) +{ + repcodes_t newReps; + if (offset >= ZSTD_REP_NUM) { /* full offset */ + newReps.rep[2] = rep[1]; + newReps.rep[1] = rep[0]; + newReps.rep[0] = offset - ZSTD_REP_MOVE; + } else { /* repcode */ + U32 const repCode = offset + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + newReps.rep[2] = (repCode >= 2) ? rep[1] : rep[2]; + newReps.rep[1] = rep[0]; + newReps.rep[0] = currentOffset; + } else { /* repCode == 0 */ + memcpy(&newReps, rep, sizeof(newReps)); + } + } + return newReps; +} + +/* ZSTD_cParam_withinBounds: + * @return 1 if value is within cParam bounds, + * 0 otherwise */ +MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) +{ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); + if (ZSTD_isError(bounds.error)) return 0; + if (value < bounds.lowerBound) return 0; + if (value > bounds.upperBound) return 0; + return 1; +} + +/* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) +{ + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); + memcpy((BYTE*)dst + ZSTD_blockHeaderSize, src, srcSize); + return ZSTD_blockHeaderSize + srcSize; +} + +MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) +{ + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); + RETURN_ERROR_IF(dstCapacity < 4, dstSize_tooSmall, ""); + MEM_writeLE24(op, cBlockHeader); + op[3] = src; + return 4; +} + + +/* ZSTD_minGain() : + * minimum compression required + * to generate a compress block or a compressed literals section. + * note : use same formula for both situations */ +MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) +{ + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + return (srcSize >> minlog) + 2; +} + +MEM_STATIC int ZSTD_disableLiteralsCompression(const ZSTD_CCtx_params* cctxParams) +{ + switch (cctxParams->literalCompressionMode) { + case ZSTD_lcm_huffman: + return 0; + case ZSTD_lcm_uncompressed: + return 1; + default: + assert(0 /* impossible: pre-validated */); + /* fall-through */ + case ZSTD_lcm_auto: + return (cctxParams->cParams.strategy == ZSTD_fast) && (cctxParams->cParams.targetLength > 0); + } +} + +/*! ZSTD_safecopyLiterals() : + * memcpy() function that won't read beyond more than WILDCOPY_OVERLENGTH bytes past ilimit_w. + * Only called when the sequence ends past ilimit_w, so it only needs to be optimized for single + * large copies. + */ +static void ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE const* ilimit_w) { + assert(iend > ilimit_w); + if (ip <= ilimit_w) { + ZSTD_wildcopy(op, ip, ilimit_w - ip, ZSTD_no_overlap); + op += ilimit_w - ip; + ip = ilimit_w; + } + while (ip < iend) *op++ = *ip++; +} + +/*! ZSTD_storeSeq() : + * Store a sequence (litlen, litPtr, offCode and mlBase) into seqStore_t. + * `offCode` : distance to match + ZSTD_REP_MOVE (values <= ZSTD_REP_MOVE are repCodes). + * `mlBase` : matchLength - MINMATCH + * Allowed to overread literals up to litLimit. +*/ +HINT_INLINE UNUSED_ATTR +void ZSTD_storeSeq(seqStore_t* seqStorePtr, size_t litLength, const BYTE* literals, const BYTE* litLimit, U32 offCode, size_t mlBase) +{ + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; + BYTE const* const litEnd = literals + litLength; +#if defined(DEBUGLEVEL) && (DEBUGLEVEL >= 6) + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); + DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", + pos, (U32)litLength, (U32)mlBase+MINMATCH, (U32)offCode); + } +#endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); + /* copy Literals */ + assert(seqStorePtr->maxNbLit <= 128 KB); + assert(seqStorePtr->lit + litLength <= seqStorePtr->litStart + seqStorePtr->maxNbLit); + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. + * First copy 16 bytes, because literals are likely short. 
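+ * Rationale (an inference, not stated upstream) : the unconditional 16-byte copy is
+ * branch-poor and covers most literal runs ; only runs longer than 16 pay for the
+ * full ZSTD_wildcopy, which may itself write up to WILDCOPY_OVERLENGTH bytes past
+ * the true end ; hence the litLimit_w guard computed earlier.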
+ */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); + } + } else { + ZSTD_safecopyLiterals(seqStorePtr->lit, literals, litEnd, litLimit_w); + } + seqStorePtr->lit += litLength; + + /* literal Length */ + if (litLength>0xFFFF) { + assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ + seqStorePtr->longLengthID = 1; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ + seqStorePtr->sequences[0].offset = offCode + 1; + + /* match Length */ + if (mlBase>0xFFFF) { + assert(seqStorePtr->longLengthID == 0); /* there can only be a single long length */ + seqStorePtr->longLengthID = 2; + seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + } + seqStorePtr->sequences[0].matchLength = (U16)mlBase; + + seqStorePtr->sequences++; +} + + +/*-************************************* +* Match length counter +***************************************/ +static unsigned ZSTD_NbCommonBytes (size_t val) +{ + if (MEM_isLittleEndian()) { + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + return _BitScanForward64( &r, (U64)val ) ? (unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (__builtin_ctzll((U64)val) >> 3); +# else + static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, + 0, 3, 1, 3, 1, 4, 2, 7, + 0, 2, 3, 6, 1, 5, 3, 5, + 1, 3, 4, 4, 2, 5, 6, 7, + 7, 0, 1, 2, 3, 3, 4, 6, + 2, 6, 5, 5, 3, 4, 5, 6, + 7, 1, 2, 4, 6, 4, 4, 5, + 7, 2, 6, 5, 7, 6, 7, 7 }; + return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r=0; + return _BitScanForward( &r, (U32)val ) ? (unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 3) + return (__builtin_ctz((U32)val) >> 3); +# else + static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, + 3, 2, 2, 1, 3, 2, 0, 1, + 3, 3, 1, 2, 2, 2, 2, 0, + 3, 1, 2, 0, 1, 0, 1, 1 }; + return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +# endif + } + } else { /* Big Endian CPU */ + if (MEM_64bits()) { +# if defined(_MSC_VER) && defined(_WIN64) + unsigned long r = 0; + return _BitScanReverse64( &r, val ) ? (unsigned)(r >> 3) : 0; +# elif defined(__GNUC__) && (__GNUC__ >= 4) + return (__builtin_clzll(val) >> 3); +# else + unsigned r; + const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ + if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } + if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } + r += (!val); + return r; +# endif + } else { /* 32 bits */ +# if defined(_MSC_VER) + unsigned long r = 0; + return _BitScanReverse( &r, (unsigned long)val ) ? 
(unsigned)(r >> 3) : 0;
+# elif defined(__GNUC__) && (__GNUC__ >= 3)
+            return (__builtin_clz((U32)val) >> 3);
+# else
+            unsigned r;
+            if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; }
+            r += (!val);
+            return r;
+# endif
+    }   }
+}
+
+
+MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit)
+{
+    const BYTE* const pStart = pIn;
+    const BYTE* const pInLoopLimit = pInLimit - (sizeof(size_t)-1);
+
+    if (pIn < pInLoopLimit) {
+        { size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+          if (diff) return ZSTD_NbCommonBytes(diff); }
+        pIn+=sizeof(size_t); pMatch+=sizeof(size_t);
+        while (pIn < pInLoopLimit) {
+            size_t const diff = MEM_readST(pMatch) ^ MEM_readST(pIn);
+            if (!diff) { pIn+=sizeof(size_t); pMatch+=sizeof(size_t); continue; }
+            pIn += ZSTD_NbCommonBytes(diff);
+            return (size_t)(pIn - pStart);
+    }   }
+    if (MEM_64bits() && (pIn<(pInLimit-3)) && (MEM_read32(pMatch) == MEM_read32(pIn))) { pIn+=4; pMatch+=4; }
+    if ((pIn<(pInLimit-1)) && (MEM_read16(pMatch) == MEM_read16(pIn))) { pIn+=2; pMatch+=2; }
+    if ((pIn<pInLimit) && (*pMatch == *pIn)) pIn++;
+    return (size_t)(pIn - pStart);
+}
+
+/** ZSTD_count_2segments() :
+ *  can count match length with `ip` & `match` in 2 different segments.
+ *  convention : on reaching mEnd, match count continue starting from iStart
+ */
+MEM_STATIC size_t
+ZSTD_count_2segments(const BYTE* ip, const BYTE* match,
+                     const BYTE* iEnd, const BYTE* mEnd, const BYTE* iStart)
+{
+    const BYTE* const vEnd = MIN( ip + (mEnd - match), iEnd);
+    size_t const matchLength = ZSTD_count(ip, match, vEnd);
+    if (match + matchLength != mEnd) return matchLength;
+    DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength);
+    DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match);
+    DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip);
+    DEBUGLOG(7, "next byte : %02X, match : %02X", *ip, *match);
+    DEBUGLOG(7, "final match length = %zu", matchLength + ZSTD_count(ip+matchLength, iStart, iEnd));
+    return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd);
+}
+
+
+/*-*************************************
+ *  Hashes
+ ***************************************/
+static const U32 prime3bytes = 506832829U;
+static U32    ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes)  >> (32-h) ; }
+MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */
+
+static const U32 prime4bytes = 2654435761U;
+static U32    ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; }
+static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); }
+
+static const U64 prime5bytes = 889523592379ULL;
+static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; }
+static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); }
+
+static const U64 prime6bytes = 227718039650203ULL;
+static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; }
+static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); }
+
+static const U64 prime7bytes = 58295818150454627ULL;
+static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; }
+static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); }
+
+static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL;
+static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; }
+static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); }
+
+MEM_STATIC size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls)
+{
+    switch(mls)
+    {
+    default:
+    case 4: return ZSTD_hash4Ptr(p, hBits);
+    case 5: return ZSTD_hash5Ptr(p, hBits);
+    case 6: return ZSTD_hash6Ptr(p, hBits);
+    case 7: return ZSTD_hash7Ptr(p, hBits);
+    case 8: return ZSTD_hash8Ptr(p, hBits);
+    }
+}
+
+/** ZSTD_ipow() :
+ * Return base^exponent.
+ */
+static U64 ZSTD_ipow(U64 base, U64 exponent)
+{
+    U64 power = 1;
+    while (exponent) {
+      if (exponent & 1) power *= base;
+      exponent >>= 1;
+      base *= base;
+    }
+    return power;
+}
+
+#define ZSTD_ROLL_HASH_CHAR_OFFSET 10
+
+/** ZSTD_rollingHash_append() :
+ * Add the buffer to the hash value.
+ */
+static U64 ZSTD_rollingHash_append(U64 hash, void const* buf, size_t size)
+{
+    BYTE const* istart = (BYTE const*)buf;
+    size_t pos;
+    for (pos = 0; pos < size; ++pos) {
+        hash *= prime8bytes;
+        hash += istart[pos] + ZSTD_ROLL_HASH_CHAR_OFFSET;
+    }
+    return hash;
+}
+
+/** ZSTD_rollingHash_compute() :
+ * Compute the rolling hash value of the buffer.
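+ * The hash is a polynomial in prime8bytes over (byte + ZSTD_ROLL_HASH_CHAR_OFFSET)
+ * terms, which is what makes O(1) sliding possible. Illustrative sketch, using
+ * the helpers declared here (not upstream code) :
+ *     U64 const pp = ZSTD_rollingHash_primePower(win);
+ *     U64 hash     = ZSTD_rollingHash_compute(buf, win);
+ *     for (i = win; i < size; ++i)
+ *         hash = ZSTD_rollingHash_rotate(hash, buf[i-win], buf[i], pp);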
+ */
+MEM_STATIC U64 ZSTD_rollingHash_compute(void const* buf, size_t size)
+{
+    return ZSTD_rollingHash_append(0, buf, size);
+}
+
+/** ZSTD_rollingHash_primePower() :
+ * Compute the primePower to be passed to ZSTD_rollingHash_rotate() for a hash
+ * over a window of length bytes.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_primePower(U32 length)
+{
+    return ZSTD_ipow(prime8bytes, length - 1);
+}
+
+/** ZSTD_rollingHash_rotate() :
+ * Rotate the rolling hash by one byte.
+ */
+MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 primePower)
+{
+    hash -= (toRemove + ZSTD_ROLL_HASH_CHAR_OFFSET) * primePower;
+    hash *= prime8bytes;
+    hash += toAdd + ZSTD_ROLL_HASH_CHAR_OFFSET;
+    return hash;
+}
+
+/*-*************************************
+*  Round buffer management
+***************************************/
+#if (ZSTD_WINDOWLOG_MAX_64 > 31)
+# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX"
+#endif
+/* Max current allowed */
+#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX))
+/* Maximum chunk size before overflow correction needs to be called again */
+#define ZSTD_CHUNKSIZE_MAX                                                     \
+    ( ((U32)-1)                  /* Maximum ending current index */            \
+    - ZSTD_CURRENT_MAX)          /* Maximum beginning lowLimit */
+
+/**
+ * ZSTD_window_clear():
+ * Clears the window containing the history by simply setting it to empty.
+ */
+MEM_STATIC void ZSTD_window_clear(ZSTD_window_t* window)
+{
+    size_t const endT = (size_t)(window->nextSrc - window->base);
+    U32 const end = (U32)endT;
+
+    window->lowLimit = end;
+    window->dictLimit = end;
+}
+
+/**
+ * ZSTD_window_hasExtDict():
+ * Returns non-zero if the window has a non-empty extDict.
+ */
+MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window)
+{
+    return window.lowLimit < window.dictLimit;
+}
+
+/**
+ * ZSTD_matchState_dictMode():
+ * Inspects the provided matchState and figures out what dictMode should be
+ * passed to the compressor.
+ */
+MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms)
+{
+    return ZSTD_window_hasExtDict(ms->window) ?
+        ZSTD_extDict :
+        ms->dictMatchState != NULL ?
+            ZSTD_dictMatchState :
+            ZSTD_noDict;
+}
+
+/**
+ * ZSTD_window_needOverflowCorrection():
+ * Returns non-zero if the indices are getting too large and need overflow
+ * protection.
+ */
+MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window,
+                                                  void const* srcEnd)
+{
+    U32 const current = (U32)((BYTE const*)srcEnd - window.base);
+    return current > ZSTD_CURRENT_MAX;
+}
+
+/**
+ * ZSTD_window_correctOverflow():
+ * Reduces the indices to protect from index overflow.
+ * Returns the correction made to the indices, which must be applied to every
+ * stored index.
+ *
+ * The least significant cycleLog bits of the indices must remain the same,
+ * which may be 0. Every index up to maxDist in the past must be valid.
+ * NOTE: (maxDist & cycleMask) must be zero.
+ */
+MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog,
+                                           U32 maxDist, void const* src)
+{
+    /* preemptive overflow correction:
+     * 1. correction is large enough:
+     *    lowLimit > (3<<29) ==> current > 3<<29 + 1<<windowLog
+     *    1<<windowLog <= newCurrent < 1<<chainLog + 1<<windowLog
+     *
+     *    current - newCurrent
+     *    > (3<<29 + 1<<windowLog) - (1<<windowLog + 1<<chainLog)
+     *    > (3<<29) - (1<<chainLog)
+     *    > (3<<29) - (1<<30)             (NOTE: chainLog <= 30)
+     *    > 1<<29
+     *
+     * 2. (ip+ZSTD_CHUNKSIZE_MAX - cctx->base) doesn't overflow:
+     *    After correction, current is less than (1<<chainLog + 1<<windowLog).
+     *    In 64-bit mode we are safe, because we have 64-bit ptrdiff_t.
+     *    In 32-bit mode we are safe, because (chainLog <= 29), so
+     *    ip+ZSTD_CHUNKSIZE_MAX - cctx->base < 1<<32.
+     * 3. (cctx->lowLimit + 1<<windowLog) < 1<<32:
+     *    windowLog <= 31 ==> 3<<29 + 1<<windowLog < 7<<29 < 1<<32.
+     */
+    U32 const cycleMask = (1U << cycleLog) - 1;
+    U32 const current = (U32)((BYTE const*)src - window->base);
+    U32 const currentCycle0 = current & cycleMask;
+    /* Exclude zero so that newCurrent - maxDist >= 1.
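+     * (index 0 is reserved as the "no entry" value in the match-finder tables,
+     * and ZSTD_window_init() starts indices at 1 for the same reason, so the
+     * rebased indices must stay strictly positive)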
*/ + U32 const currentCycle1 = currentCycle0 == 0 ? (1U << cycleLog) : currentCycle0; + U32 const newCurrent = currentCycle1 + maxDist; + U32 const correction = current - newCurrent; + assert((maxDist & cycleMask) == 0); + assert(current > newCurrent); + /* Loose bound, should be around 1<<29 (see above) */ + assert(correction > 1<<28); + + window->base += correction; + window->dictBase += correction; + if (window->lowLimit <= correction) window->lowLimit = 1; + else window->lowLimit -= correction; + if (window->dictLimit <= correction) window->dictLimit = 1; + else window->dictLimit -= correction; + + /* Ensure we can still reference the full window. */ + assert(newCurrent >= maxDist); + assert(newCurrent - maxDist >= 1); + /* Ensure that lowLimit and dictLimit didn't underflow. */ + assert(window->lowLimit <= newCurrent); + assert(window->dictLimit <= newCurrent); + + DEBUGLOG(4, "Correction of 0x%x bytes to lowLimit=0x%x", correction, + window->lowLimit); + return correction; +} + +/** + * ZSTD_window_enforceMaxDist(): + * Updates lowLimit so that: + * (srcEnd - base) - lowLimit == maxDist + loadedDictEnd + * + * It ensures index is valid as long as index >= lowLimit. + * This must be called before a block compression call. + * + * loadedDictEnd is only defined if a dictionary is in use for current compression. + * As the name implies, loadedDictEnd represents the index at end of dictionary. + * The value lies within context's referential, it can be directly compared to blockEndIdx. + * + * If loadedDictEndPtr is NULL, no dictionary is in use, and we use loadedDictEnd == 0. + * If loadedDictEndPtr is not NULL, we set it to zero after updating lowLimit. + * This is because dictionaries are allowed to be referenced fully + * as long as the last byte of the dictionary is in the window. + * Once input has progressed beyond window size, dictionary cannot be referenced anymore. + * + * In normal dict mode, the dictionary lies between lowLimit and dictLimit. + * In dictMatchState mode, lowLimit and dictLimit are the same, + * and the dictionary is below them. + * forceWindow and dictMatchState are therefore incompatible. + */ +MEM_STATIC void +ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) +{ + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? *loadedDictEndPtr : 0; + DEBUGLOG(5, "ZSTD_window_enforceMaxDist: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u", + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + + /* - When there is no dictionary : loadedDictEnd == 0. + In which case, the test (blockEndIdx > maxDist) is merely to avoid + overflowing next operation `newLowLimit = blockEndIdx - maxDist`. + - When there is a standard dictionary : + Index referential is copied from the dictionary, + which means it starts from 0. + In which case, loadedDictEnd == dictSize, + and it makes sense to compare `blockEndIdx > maxDist + dictSize` + since `blockEndIdx` also starts from zero. + - When there is an attached dictionary : + loadedDictEnd is expressed within the referential of the context, + so it can be directly compared against blockEndIdx. 
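+       - Worked example : with maxDist = 1<<27 (a 128 MB window) and no dictionary,
+         lowLimit only starts advancing once blockEndIdx exceeds 1<<27
+         (newLowLimit = blockEndIdx - maxDist), which keeps exactly one full
+         window of history referencable at all times.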
+ */ + if (blockEndIdx > maxDist + loadedDictEnd) { + U32 const newLowLimit = blockEndIdx - maxDist; + if (window->lowLimit < newLowLimit) window->lowLimit = newLowLimit; + if (window->dictLimit < window->lowLimit) { + DEBUGLOG(5, "Update dictLimit to match lowLimit, from %u to %u", + (unsigned)window->dictLimit, (unsigned)window->lowLimit); + window->dictLimit = window->lowLimit; + } + /* On reaching window size, dictionaries are invalidated */ + if (loadedDictEndPtr) *loadedDictEndPtr = 0; + if (dictMatchStatePtr) *dictMatchStatePtr = NULL; + } +} + +/* Similar to ZSTD_window_enforceMaxDist(), + * but only invalidates dictionary + * when input progresses beyond window size. + * assumption : loadedDictEndPtr and dictMatchStatePtr are valid (non NULL) + * loadedDictEnd uses same referential as window->base + * maxDist is the window size */ +MEM_STATIC void +ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, + const ZSTD_matchState_t** dictMatchStatePtr) +{ + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); + { U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = *loadedDictEndPtr; + DEBUGLOG(5, "ZSTD_checkDictValidity: blockEndIdx=%u, maxDist=%u, loadedDictEnd=%u", + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + + if (blockEndIdx > loadedDictEnd + maxDist) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; + *dictMatchStatePtr = NULL; + } else { + if (*loadedDictEndPtr != 0) { + DEBUGLOG(6, "dictionary considered valid for current block"); + } } } +} + +MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + memset(window, 0, sizeof(*window)); + window->base = (BYTE const*)""; + window->dictBase = (BYTE const*)""; + window->dictLimit = 1; /* start from 1, so that 1st position is valid */ + window->lowLimit = 1; /* it ensures first and later CCtx usages compress the same */ + window->nextSrc = window->base + 1; /* see issue #1241 */ +} + +/** + * ZSTD_window_update(): + * Updates the window by appending [src, src + srcSize) to the window. + * If it is not contiguous, the current prefix becomes the extDict, and we + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. 
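+ * On a non-contiguous call, the prefix becomes the new extDict and `base` is
+ * re-based (base = ip - distanceFromBase) so that indices keep growing
+ * monotonically across segments instead of restarting from zero.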
+ */ +MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize) +{ + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; + DEBUGLOG(5, "ZSTD_window_update"); + if (srcSize == 0) + return contiguous; + assert(window->base != NULL); + assert(window->dictBase != NULL); + /* Check if blocks follow each other */ + if (src != window->nextSrc) { + /* not contiguous */ + size_t const distanceFromBase = (size_t)(window->nextSrc - window->base); + DEBUGLOG(5, "Non contiguous blocks, new segment starts at %u", window->dictLimit); + window->lowLimit = window->dictLimit; + assert(distanceFromBase == (size_t)(U32)distanceFromBase); /* should never overflow */ + window->dictLimit = (U32)distanceFromBase; + window->dictBase = window->base; + window->base = ip - distanceFromBase; + /* ms->nextToUpdate = window->dictLimit; */ + if (window->dictLimit - window->lowLimit < HASH_READ_SIZE) window->lowLimit = window->dictLimit; /* too small extDict */ + contiguous = 0; + } + window->nextSrc = ip + srcSize; + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { + ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; + U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } + return contiguous; +} + +/** + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog) +{ + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; + U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid; + U32 const isDictionary = (ms->loadedDictEnd != 0); + U32 const matchLowest = isDictionary ? lowestValid : withinWindow; + return matchLowest; +} + +/** + * Returns the lowest allowed match index in the prefix. + */ +MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 current, unsigned windowLog) +{ + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; + U32 const withinWindow = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid; + U32 const isDictionary = (ms->loadedDictEnd != 0); + U32 const matchLowest = isDictionary ? 
lowestValid : withinWindow; + return matchLowest; +} + + + +/* debug functions */ +#if (DEBUGLEVEL>=2) + +MEM_STATIC double ZSTD_fWeight(U32 rawStat) +{ + U32 const fp_accuracy = 8; + U32 const fp_multiplier = (1 << fp_accuracy); + U32 const newStat = rawStat + 1; + U32 const hb = ZSTD_highbit32(newStat); + U32 const BWeight = hb * fp_multiplier; + U32 const FWeight = (newStat << fp_accuracy) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + fp_accuracy < 31); + return (double)weight / fp_multiplier; +} + +/* display a table content, + * listing each element, its frequency, and its predicted bit cost */ +MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) +{ + unsigned u, sum; + for (u=0, sum=0; u<=max; u++) sum += table[u]; + DEBUGLOG(2, "total nb elts: %u", sum); + for (u=0; u<=max; u++) { + DEBUGLOG(2, "%2u: %5u (%.2f)", + u, table[u], ZSTD_fWeight(sum) - ZSTD_fWeight(table[u]) ); + } +} + +#endif + + +#if defined (__cplusplus) +} +#endif + +/* =============================================================== + * Shared internal declarations + * These prototypes may be called from sources not in lib/compress + * =============================================================== */ + +/* ZSTD_loadCEntropy() : + * dict : must point at beginning of a valid zstd dictionary. + * return : size of dictionary header (size of magic number + dict ID + entropy tables) + * assumptions : magic number supposed already checked + * and dictSize >= 8 */ +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + short* offcodeNCount, unsigned* offcodeMaxValue, + const void* const dict, size_t dictSize); + +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + +/* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress + * ============================================================== */ + +/* ZSTD_getCParamsFromCCtxParams() : + * cParams are built depending on compressionLevel, src size hints, + * LDM and manually set compression parameters. + * Note: srcSizeHint == 0 means 0! + */ +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize); + +/*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. + * expects params to be valid. + * must receive dict, or cdict, or none, but not both. + * @return : 0, or an error code */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); + +void ZSTD_resetSeqStore(seqStore_t* ssPtr); + +/*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict); + +/* ZSTD_compressBegin_advanced_internal() : + * Private use only. To be called from zstdmt_compress.c. */ +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize); + +/* ZSTD_compress_advanced_internal() : + * Private use only. To be called from zstdmt_compress.c. 
 */
+size_t ZSTD_compress_advanced_internal(ZSTD_CCtx* cctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize,
+                                 const void* dict,size_t dictSize,
+                                 const ZSTD_CCtx_params* params);
+
+
+/* ZSTD_writeLastEmptyBlock() :
+ * output an empty Block with end-of-frame mark to complete a frame
+ * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h))
+ *           or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize)
+ */
+size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity);
+
+
+/* ZSTD_referenceExternalSequences() :
+ * Must be called before starting a compression operation.
+ * seqs must parse a prefix of the source.
+ * This cannot be used when long range matching is enabled.
+ * Zstd will use these sequences, and pass the literals to a secondary block
+ * compressor.
+ * @return : An error code on failure.
+ * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory
+ * access and data corruption.
+ */
+size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq);
+
+/** ZSTD_cycleLog() :
+ *  condition for correct operation : hashLog > 1 */
+U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat);
+
+#endif /* ZSTD_COMPRESS_H */
+/**** ended inlining zstd_compress_internal.h ****/
+
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                              void* entropyWorkspace, size_t entropyWorkspaceSize,
+                        const int bmi2);
+
+#endif /* ZSTD_COMPRESS_LITERALS_H */
+/**** ended inlining zstd_compress_literals.h ****/
+
+size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE* const)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, "");
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_basic + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_basic + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_basic + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    memcpy(ostart + flSize, src, srcSize);
+    DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize));
+    return srcSize + flSize;
+}
+
+size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize)
+{
+    BYTE* const ostart = (BYTE* const)dst;
+    U32   const flSize = 1 + (srcSize>31) + (srcSize>4095);
+
+    (void)dstCapacity;  /* dstCapacity already guaranteed to be >=4, hence large enough */
+
+    switch(flSize)
+    {
+        case 1: /* 2 - 1 - 5 */
+            ostart[0] = (BYTE)((U32)set_rle + (srcSize<<3));
+            break;
+        case 2: /* 2 - 2 - 12 */
+            MEM_writeLE16(ostart, (U16)((U32)set_rle + (1<<2) + (srcSize<<4)));
+            break;
+        case 3: /* 2 - 2 - 20 */
+            MEM_writeLE32(ostart, (U32)((U32)set_rle + (3<<2) + (srcSize<<4)));
+            break;
+        default:   /* not necessary : flSize is {1,2,3} */
+            assert(0);
+    }
+
+    ostart[flSize] = *(const BYTE*)src;
+    DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1);
+    return flSize+1;
+}
+
+size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf,
+                              ZSTD_hufCTables_t* nextHuf,
+                              ZSTD_strategy strategy, int disableLiteralCompression,
+                              void* dst, size_t dstCapacity,
+                        const void* src, size_t srcSize,
+                              void* entropyWorkspace, size_t entropyWorkspaceSize,
+                        const int bmi2)
+{
+    size_t const minGain = ZSTD_minGain(srcSize, strategy);
+    size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB);
+    BYTE*  const ostart = (BYTE*)dst;
+    U32 singleStream = srcSize < 256;
+    symbolEncodingType_e hType = set_compressed;
+    size_t cLitSize;
+
+    DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)",
+                disableLiteralCompression, (U32)srcSize);
+
+    /* Prepare nextEntropy
assuming reusing the existing table */ + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + /* small ? don't even attempt compression (speed opt) */ +# define COMPRESS_LITERALS_SIZE_MIN 63 + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; + int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; + cLitSize = singleStream ? + HUF_compress1X_repeat( + ostart+lhSize, dstCapacity-lhSize, src, srcSize, + HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2) : + HUF_compress4X_repeat( + ostart+lhSize, dstCapacity-lhSize, src, srcSize, + HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, + (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ + DEBUGLOG(5, "Reusing previous huffman table"); + hType = set_repeat; + } + } + + if ((cLitSize==0) | (cLitSize >= srcSize - minGain) | ERR_isError(cLitSize)) { + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + } + if (cLitSize==1) { + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); + } + + if (hType == set_compressed) { + /* using a newly constructed table */ + nextHuf->repeatMode = HUF_repeat_check; + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)srcSize, (U32)(lhSize+cLitSize)); + return lhSize+cLitSize; +} +/**** ended inlining compress/zstd_compress_literals.c ****/ +/**** start inlining compress/zstd_compress_sequences.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +/**** start inlining zstd_compress_sequences.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPRESS_SEQUENCES_H +#define ZSTD_COMPRESS_SEQUENCES_H + +/**** skipping file: ../common/fse.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ + +typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +} ZSTD_defaultPolicy_e; + +symbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + +size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* entropyWorkspace, size_t entropyWorkspaceSize); + +size_t ZSTD_encodeSequences( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max); + +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max); +#endif /* ZSTD_COMPRESS_SEQUENCES_H */ +/**** ended inlining zstd_compress_sequences.h ****/ + +/** + * -log2(x / 256) lookup table for x in [0, 256). 
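+ * Values are stored in 8-bit fixed point : e.g. entry 16 is 1024 (4.0 bits,
+ * since -log2(16/256) == 4), and entry 128 is 256 (exactly 1 bit).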
+ * If x == 0: Return 0 + * Else: Return floor(-log2(x / 256) * 256) + */ +static unsigned const kInverseProbabilityLog256[256] = { + 0, 2048, 1792, 1642, 1536, 1453, 1386, 1329, 1280, 1236, 1197, 1162, + 1130, 1100, 1073, 1047, 1024, 1001, 980, 960, 941, 923, 906, 889, + 874, 859, 844, 830, 817, 804, 791, 779, 768, 756, 745, 734, + 724, 714, 704, 694, 685, 676, 667, 658, 650, 642, 633, 626, + 618, 610, 603, 595, 588, 581, 574, 567, 561, 554, 548, 542, + 535, 529, 523, 517, 512, 506, 500, 495, 489, 484, 478, 473, + 468, 463, 458, 453, 448, 443, 438, 434, 429, 424, 420, 415, + 411, 407, 402, 398, 394, 390, 386, 382, 377, 373, 370, 366, + 362, 358, 354, 350, 347, 343, 339, 336, 332, 329, 325, 322, + 318, 315, 311, 308, 305, 302, 298, 295, 292, 289, 286, 282, + 279, 276, 273, 270, 267, 264, 261, 258, 256, 253, 250, 247, + 244, 241, 239, 236, 233, 230, 228, 225, 222, 220, 217, 215, + 212, 209, 207, 204, 202, 199, 197, 194, 192, 190, 187, 185, + 182, 180, 178, 175, 173, 171, 168, 166, 164, 162, 159, 157, + 155, 153, 151, 149, 146, 144, 142, 140, 138, 136, 134, 132, + 130, 128, 126, 123, 121, 119, 117, 115, 114, 112, 110, 108, + 106, 104, 102, 100, 98, 96, 94, 93, 91, 89, 87, 85, + 83, 82, 80, 78, 76, 74, 73, 71, 69, 67, 66, 64, + 62, 61, 59, 57, 55, 54, 52, 50, 49, 47, 46, 44, + 42, 41, 39, 37, 36, 34, 33, 31, 30, 28, 26, 25, + 23, 22, 20, 19, 17, 16, 14, 13, 11, 10, 8, 7, + 5, 4, 2, 1, +}; + +static unsigned ZSTD_getFSEMaxSymbolValue(FSE_CTable const* ctable) { + void const* ptr = ctable; + U16 const* u16ptr = (U16 const*)ptr; + U32 const maxSymbolValue = MEM_read16(u16ptr + 1); + return maxSymbolValue; +} + +/** + * Returns the cost in bytes of encoding the normalized count header. + * Returns an error if any of the helper functions return an error. + */ +static size_t ZSTD_NCountCost(unsigned const* count, unsigned const max, + size_t const nbSeq, unsigned const FSELog) +{ + BYTE wksp[FSE_NCOUNTBOUND]; + S16 norm[MaxSeq + 1]; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq, max), ""); + return FSE_writeNCount(wksp, sizeof(wksp), norm, max, tableLog); +} + +/** + * Returns the cost in bits of encoding the distribution described by count + * using the entropy bound. + */ +static size_t ZSTD_entropyCost(unsigned const* count, unsigned const max, size_t const total) +{ + unsigned cost = 0; + unsigned s; + for (s = 0; s <= max; ++s) { + unsigned norm = (unsigned)((256 * count[s]) / total); + if (count[s] != 0 && norm == 0) + norm = 1; + assert(count[s] < total); + cost += count[s] * kInverseProbabilityLog256[norm]; + } + return cost >> 8; +} + +/** + * Returns the cost in bits of encoding the distribution in count using ctable. + * Returns an error if ctable cannot represent all the symbols in count. 
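+ * Costs are accumulated in 8-bit fixed point (kAccuracyLog == 8) and rounded
+ * down to whole bits on return.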
+ */ +size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, + unsigned const* count, + unsigned const max) +{ + unsigned const kAccuracyLog = 8; + size_t cost = 0; + unsigned s; + FSE_CState_t cstate; + FSE_initCState(&cstate, ctable); + if (ZSTD_getFSEMaxSymbolValue(ctable) < max) { + DEBUGLOG(5, "Repeat FSE_CTable has maxSymbolValue %u < %u", + ZSTD_getFSEMaxSymbolValue(ctable), max); + return ERROR(GENERIC); + } + for (s = 0; s <= max; ++s) { + unsigned const tableLog = cstate.stateLog; + unsigned const badCost = (tableLog + 1) << kAccuracyLog; + unsigned const bitCost = FSE_bitCost(cstate.symbolTT, tableLog, s, kAccuracyLog); + if (count[s] == 0) + continue; + if (bitCost >= badCost) { + DEBUGLOG(5, "Repeat FSE_CTable has Prob[%u] == 0", s); + return ERROR(GENERIC); + } + cost += (size_t)count[s] * bitCost; + } + return cost >> kAccuracyLog; +} + +/** + * Returns the cost in bits of encoding the distribution in count using the + * table described by norm. The max symbol support by norm is assumed >= max. + * norm must be valid for every symbol with non-zero probability in count. + */ +size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + unsigned const* count, unsigned const max) +{ + unsigned const shift = 8 - accuracyLog; + size_t cost = 0; + unsigned s; + assert(accuracyLog <= 8); + for (s = 0; s <= max; ++s) { + unsigned const normAcc = (norm[s] != -1) ? (unsigned)norm[s] : 1; + unsigned const norm256 = normAcc << shift; + assert(norm256 > 0); + assert(norm256 < 256); + cost += count[s] * kInverseProbabilityLog256[norm256]; + } + return cost >> 8; +} + +symbolEncodingType_e +ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, + ZSTD_defaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) +{ + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { + /* Prefer set_basic over set_rle when there are 2 or less symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ + DEBUGLOG(5, "Selected set_basic"); + return set_basic; + } + DEBUGLOG(5, "Selected set_rle"); + return set_rle; + } + if (strategy < ZSTD_lazy) { + if (isDefaultAllowed) { + size_t const staticFse_nbSeq_max = 1000; + size_t const mult = 10 - strategy; + size_t const baseLog = 3; + size_t const dynamicFse_nbSeq_min = (((size_t)1 << defaultNormLog) * mult) >> baseLog; /* 28-36 for offset, 56-72 for lengths */ + assert(defaultNormLog >= 5 && defaultNormLog <= 6); /* xx_DEFAULTNORMLOG */ + assert(mult <= 9 && mult >= 7); + if ( (*repeatMode == FSE_repeat_valid) + && (nbSeq < staticFse_nbSeq_max) ) { + DEBUGLOG(5, "Selected set_repeat"); + return set_repeat; + } + if ( (nbSeq < dynamicFse_nbSeq_min) + || (mostFrequent < (nbSeq >> (defaultNormLog-1))) ) { + DEBUGLOG(5, "Selected set_basic"); + /* The format allows default tables to be repeated, but it isn't useful. + * When using simple heuristics to select encoding type, we don't want + * to confuse these tables with dictionaries. When running more careful + * analysis, we don't need to waste time checking both repeating tables + * and default tables. 
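+             * In short, this fast-strategy path prefers set_repeat for small
+             * blocks, uses the default tables when the sample is small or its
+             * distribution is flat, and only pays for a fresh set_compressed
+             * table otherwise.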
+ */ + *repeatMode = FSE_repeat_none; + return set_basic; + } + } + } else { + size_t const basicCost = isDefaultAllowed ? ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, count, max) : ERROR(GENERIC); + size_t const repeatCost = *repeatMode != FSE_repeat_none ? ZSTD_fseBitCost(prevCTable, count, max) : ERROR(GENERIC); + size_t const NCountCost = ZSTD_NCountCost(count, max, nbSeq, FSELog); + size_t const compressedCost = (NCountCost << 3) + ZSTD_entropyCost(count, max, nbSeq); + + if (isDefaultAllowed) { + assert(!ZSTD_isError(basicCost)); + assert(!(*repeatMode == FSE_repeat_valid && ZSTD_isError(repeatCost))); + } + assert(!ZSTD_isError(NCountCost)); + assert(compressedCost < ERROR(maxCode)); + DEBUGLOG(5, "Estimated bit costs: basic=%u\trepeat=%u\tcompressed=%u", + (unsigned)basicCost, (unsigned)repeatCost, (unsigned)compressedCost); + if (basicCost <= repeatCost && basicCost <= compressedCost) { + DEBUGLOG(5, "Selected set_basic"); + assert(isDefaultAllowed); + *repeatMode = FSE_repeat_none; + return set_basic; + } + if (repeatCost <= compressedCost) { + DEBUGLOG(5, "Selected set_repeat"); + assert(!ZSTD_isError(repeatCost)); + return set_repeat; + } + assert(compressedCost < basicCost && compressedCost < repeatCost); + } + DEBUGLOG(5, "Selected set_compressed"); + *repeatMode = FSE_repeat_check; + return set_compressed; +} + +size_t +ZSTD_buildCTable(void* dst, size_t dstCapacity, + FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, + const FSE_CTable* prevCTable, size_t prevCTableSize, + void* entropyWorkspace, size_t entropyWorkspaceSize) +{ + BYTE* op = (BYTE*)dst; + const BYTE* const oend = op + dstCapacity; + DEBUGLOG(6, "ZSTD_buildCTable (dstCapacity=%u)", (unsigned)dstCapacity); + + switch (type) { + case set_rle: + FORWARD_IF_ERROR(FSE_buildCTable_rle(nextCTable, (BYTE)max), ""); + RETURN_ERROR_IF(dstCapacity==0, dstSize_tooSmall, "not enough space"); + *op = codeTable[0]; + return 1; + case set_repeat: + memcpy(nextCTable, prevCTable, prevCTableSize); + return 0; + case set_basic: + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, defaultNorm, defaultMax, defaultNormLog, entropyWorkspace, entropyWorkspaceSize), ""); /* note : could be pre-calculated */ + return 0; + case set_compressed: { + S16 norm[MaxSeq + 1]; + size_t nbSeq_1 = nbSeq; + const U32 tableLog = FSE_optimalTableLog(FSELog, nbSeq, max); + if (count[codeTable[nbSeq-1]] > 1) { + count[codeTable[nbSeq-1]]--; + nbSeq_1--; + } + assert(nbSeq_1 > 1); + FORWARD_IF_ERROR(FSE_normalizeCount(norm, tableLog, count, nbSeq_1, max), ""); + { size_t const NCountSize = FSE_writeNCount(op, oend - op, norm, max, tableLog); /* overflow protected */ + FORWARD_IF_ERROR(NCountSize, "FSE_writeNCount failed"); + FORWARD_IF_ERROR(FSE_buildCTable_wksp(nextCTable, norm, max, tableLog, entropyWorkspace, entropyWorkspaceSize), ""); + return NCountSize; + } + } + default: assert(0); RETURN_ERROR(GENERIC, "impossible to reach"); + } +} + +FORCE_INLINE_TEMPLATE size_t +ZSTD_encodeSequences_body( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; + FSE_CState_t stateOffsetBits; + FSE_CState_t stateLitLength; + + 
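+    /* Note : sequences are encoded last-to-first. The decoder consumes the
+     * bitstream in the reverse direction of writing, so it will regenerate
+     * sequences in their natural front-to-back order. */
+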
RETURN_ERROR_IF(
+        ERR_isError(BIT_initCStream(&blockStream, dst, dstCapacity)),
+        dstSize_tooSmall, "not enough space remaining");
+    DEBUGLOG(6, "available space for bitstream : %i  (dstCapacity=%u)",
+                (int)(blockStream.endPtr - blockStream.startPtr),
+                (unsigned)dstCapacity);
+
+    /* first symbols */
+    FSE_initCState2(&stateMatchLength, CTable_MatchLength, mlCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateOffsetBits,  CTable_OffsetBits,  ofCodeTable[nbSeq-1]);
+    FSE_initCState2(&stateLitLength,   CTable_LitLength,   llCodeTable[nbSeq-1]);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].litLength, LL_bits[llCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    BIT_addBits(&blockStream, sequences[nbSeq-1].matchLength, ML_bits[mlCodeTable[nbSeq-1]]);
+    if (MEM_32bits()) BIT_flushBits(&blockStream);
+    if (longOffsets) {
+        U32 const ofBits = ofCodeTable[nbSeq-1];
+        unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+        if (extraBits) {
+            BIT_addBits(&blockStream, sequences[nbSeq-1].offset, extraBits);
+            BIT_flushBits(&blockStream);
+        }
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offset >> extraBits,
+                    ofBits - extraBits);
+    } else {
+        BIT_addBits(&blockStream, sequences[nbSeq-1].offset, ofCodeTable[nbSeq-1]);
+    }
+    BIT_flushBits(&blockStream);
+
+    {   size_t n;
+        for (n=nbSeq-2 ; n<nbSeq ; n--) {      /* intentional underflow */
+            BYTE const llCode = llCodeTable[n];
+            BYTE const ofCode = ofCodeTable[n];
+            BYTE const mlCode = mlCodeTable[n];
+            U32  const llBits = LL_bits[llCode];
+            U32  const ofBits = ofCode;
+            U32  const mlBits = ML_bits[mlCode];
+            DEBUGLOG(6, "encoding: litlen:%2u - matchlen:%2u - offCode:%7u",
+                        (unsigned)sequences[n].litLength,
+                        (unsigned)sequences[n].matchLength + MINMATCH,
+                        (unsigned)sequences[n].offset);
+                                                                            /* 32b*/  /* 64b*/
+                                                                            /* (7)*/  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateOffsetBits, ofCode);       /* 15 */  /* 15 */
+            FSE_encodeSymbol(&blockStream, &stateMatchLength, mlCode);      /* 24 */  /* 24 */
+            if (MEM_32bits()) BIT_flushBits(&blockStream);                  /* (7)*/
+            FSE_encodeSymbol(&blockStream, &stateLitLength, llCode);        /* 16 */  /* 33 */
+            if (MEM_32bits() || (ofBits+mlBits+llBits >= 64-7-(LLFSELog+MLFSELog+OffFSELog)))
+                BIT_flushBits(&blockStream);                                /* (7)*/
+            BIT_addBits(&blockStream, sequences[n].litLength, llBits);
+            if (MEM_32bits() && ((llBits+mlBits)>24)) BIT_flushBits(&blockStream);
+            BIT_addBits(&blockStream, sequences[n].matchLength, mlBits);
+            if (MEM_32bits() || (ofBits+mlBits+llBits > 56)) BIT_flushBits(&blockStream);
+            if (longOffsets) {
+                unsigned const extraBits = ofBits - MIN(ofBits, STREAM_ACCUMULATOR_MIN-1);
+                if (extraBits) {
+                    BIT_addBits(&blockStream, sequences[n].offset, extraBits);
+                    BIT_flushBits(&blockStream);                            /* (7)*/
+                }
+                BIT_addBits(&blockStream, sequences[n].offset >> extraBits,
+                            ofBits - extraBits);                            /* 31 */
+            } else {
+                BIT_addBits(&blockStream, sequences[n].offset, ofBits);     /* 31 */
+            }
+            BIT_flushBits(&blockStream);                                    /* (7)*/
+            DEBUGLOG(7, "remaining space : %i", (int)(blockStream.endPtr - blockStream.ptr));
+    }   }
+
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing ML state with %u bits", stateMatchLength.stateLog);
+    FSE_flushCState(&blockStream, &stateMatchLength);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing Off state with %u bits", stateOffsetBits.stateLog);
+    FSE_flushCState(&blockStream, &stateOffsetBits);
+    DEBUGLOG(6, "ZSTD_encodeSequences: flushing LL state with %u bits", stateLitLength.stateLog);
+    FSE_flushCState(&blockStream, &stateLitLength);
+
+    {   size_t const streamSize = BIT_closeCStream(&blockStream);
+        RETURN_ERROR_IF(streamSize==0, dstSize_tooSmall, "not enough space");
+        return streamSize;
+    }
+}
+
+static size_t
+ZSTD_encodeSequences_default(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable,
+            FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable,
+            seqDef const* sequences, size_t nbSeq, int longOffsets)
+{
+    return ZSTD_encodeSequences_body(dst, dstCapacity,
+                                    CTable_MatchLength, mlCodeTable,
+                                    CTable_OffsetBits, ofCodeTable,
+                                    CTable_LitLength, llCodeTable,
+                                    sequences, nbSeq, longOffsets);
+}
+
+
+#if DYNAMIC_BMI2
+
+static TARGET_ATTRIBUTE("bmi2") size_t
+ZSTD_encodeSequences_bmi2(
+            void* dst, size_t dstCapacity,
+            FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable,
+            FSE_CTable
const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets) +{ + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} + +#endif + +size_t ZSTD_encodeSequences( + void* dst, size_t dstCapacity, + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, + seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) +{ + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); +#if DYNAMIC_BMI2 + if (bmi2) { + return ZSTD_encodeSequences_bmi2(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); + } +#endif + (void)bmi2; + return ZSTD_encodeSequences_default(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, longOffsets); +} +/**** ended inlining compress/zstd_compress_sequences.c ****/ +/**** start inlining compress/zstd_compress_superblock.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + /*-************************************* + * Dependencies + ***************************************/ +/**** start inlining zstd_compress_superblock.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMPRESS_ADVANCED_H +#define ZSTD_COMPRESS_ADVANCED_H + +/*-************************************* +* Dependencies +***************************************/ + +/**** skipping file: ../zstd.h ****/ + +/*-************************************* +* Target Compressed Block Size +***************************************/ + +/* ZSTD_compressSuperBlock() : + * Used to compress a super block when targetCBlockSize is being used. + * The given block will be compressed into multiple sub blocks that are around targetCBlockSize. 
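+ * (targetCBlockSize, set via ZSTD_c_targetCBlockSize, is mainly useful on slow
+ * or low-bandwidth channels : capping the emitted block size reduces the
+ * latency before a receiver can start decompressing, at some ratio cost.)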
*/ +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock); + +#endif /* ZSTD_COMPRESS_ADVANCED_H */ +/**** ended inlining zstd_compress_superblock.h ****/ + +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: hist.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_compress_sequences.h ****/ +/**** skipping file: zstd_compress_literals.h ****/ + +/*-************************************* +* Superblock entropy buffer structs +***************************************/ +/** ZSTD_hufCTablesMetadata_t : + * Stores Literals Block Type for a super-block in hType, and + * huffman tree description in hufDesBuffer. + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildSuperBlockEntropy_literal() */ +typedef struct { + symbolEncodingType_e hType; + BYTE hufDesBuffer[500]; /* TODO give name to this value */ + size_t hufDesSize; +} ZSTD_hufCTablesMetadata_t; + +/** ZSTD_fseCTablesMetadata_t : + * Stores symbol compression modes for a super-block in {ll, ol, ml}Type, and + * fse tables in fseTablesBuffer. + * fseTablesSize refers to the size of fse tables in bytes. + * This metadata is populated in ZSTD_buildSuperBlockEntropy_sequences() */ +typedef struct { + symbolEncodingType_e llType; + symbolEncodingType_e ofType; + symbolEncodingType_e mlType; + BYTE fseTablesBuffer[500]; /* TODO give name to this value */ + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_compressSubBlock_sequences() */ +} ZSTD_fseCTablesMetadata_t; + +typedef struct { + ZSTD_hufCTablesMetadata_t hufMetadata; + ZSTD_fseCTablesMetadata_t fseMetadata; +} ZSTD_entropyCTablesMetadata_t; + + +/** ZSTD_buildSuperBlockEntropy_literal() : + * Builds entropy for the super-block literals. + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * @return : size of huffman description table or error code */ +static size_t ZSTD_buildSuperBlockEntropy_literal(void* const src, size_t srcSize, + const ZSTD_hufCTables_t* prevHuf, + ZSTD_hufCTables_t* nextHuf, + ZSTD_hufCTablesMetadata_t* hufMetadata, + const int disableLiteralsCompression, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; + const size_t nodeWkspSize = wkspEnd-nodeWksp; + unsigned maxSymbolValue = 255; + unsigned huffLog = HUF_TABLELOG_DEFAULT; + HUF_repeat repeat = prevHuf->repeatMode; + + DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_literal (srcSize=%zu)", srcSize); + + /* Prepare nextEntropy assuming reusing the existing table */ + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + + if (disableLiteralsCompression) { + DEBUGLOG(5, "set_basic - disabled"); + hufMetadata->hType = set_basic; + return 0; + } + + /* small ? don't even attempt compression (speed opt) */ +# define COMPRESS_LITERALS_SIZE_MIN 63 + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Scan input and build symbol stats */ + { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; + } + } + + /* Validate the previous Huffman table */ + if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); + huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; + { /* Build and write the CTable */ + size_t const newCSize = HUF_estimateCompressedSize( + (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); + size_t const hSize = HUF_writeCTable( + hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), + (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog); + /* Check against repeating the previous CTable */ + if (repeat != HUF_repeat_none) { + size_t const oldCSize = HUF_estimateCompressedSize( + (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); + if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { + DEBUGLOG(5, "set_repeat - smaller"); + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_repeat; + return 0; + } + } + if (newCSize + hSize >= srcSize) { + DEBUGLOG(5, "set_basic - no gains"); + memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); + hufMetadata->hType = set_basic; + return 0; + } + DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); + hufMetadata->hType = set_compressed; + nextHuf->repeatMode = HUF_repeat_check; + return hSize; + } + } +} + +/** ZSTD_buildSuperBlockEntropy_sequences() : + * Builds entropy for the super-block sequences. + * Stores symbol compression modes and fse table to fseMetadata. 
+ * @return : size of fse tables or error code */ +static size_t ZSTD_buildSuperBlockEntropy_sequences(seqStore_t* seqStorePtr, + const ZSTD_fseCTables_t* prevEntropy, + ZSTD_fseCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize) +{ + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; + BYTE* const countWkspStart = wkspStart; + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (MaxSeq + 1) * sizeof(unsigned); + BYTE* const cTableWksp = countWkspStart + countWkspSize; + const size_t cTableWkspSize = wkspEnd-cTableWksp; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + FSE_CTable* CTable_LitLength = nextEntropy->litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->matchlengthCTable; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; + + assert(cTableWkspSize >= (1 << MaxFSELog) * sizeof(FSE_FUNCTION_TYPE)); + DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy_sequences (nbSeq=%zu)", nbSeq); + memset(workspace, 0, wkspSize); + + fseMetadata->lastCountSize = 0; + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + /* build CTable for Literal Lengths */ + { U32 LLtype; + unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, llCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->litlength_repeatMode = prevEntropy->litlength_repeatMode; + LLtype = ZSTD_selectEncodingType(&nextEntropy->litlength_repeatMode, + countWksp, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, + countWksp, max, llCodeTable, nbSeq, LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, sizeof(prevEntropy->litlengthCTable), + cTableWksp, cTableWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); + if (LLtype == set_compressed) + fseMetadata->lastCountSize = countSize; + op += countSize; + fseMetadata->llType = (symbolEncodingType_e) LLtype; + } } + /* build CTable for Offsets */ + { U32 Offtype; + unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, ofCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, + countWksp, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, + countWksp, max, ofCodeTable, nbSeq, OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, sizeof(prevEntropy->offcodeCTable), + cTableWksp, cTableWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); + if (Offtype == set_compressed) + fseMetadata->lastCountSize = countSize; + op += countSize; + fseMetadata->ofType = (symbolEncodingType_e) Offtype; + } } + /* build CTable for MatchLengths */ + { U32 MLtype; + unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp(countWksp, &max, mlCodeTable, nbSeq, workspace, wkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->matchlength_repeatMode = prevEntropy->matchlength_repeatMode; + MLtype = ZSTD_selectEncodingType(&nextEntropy->matchlength_repeatMode, + countWksp, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable(op, oend - op, CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, + countWksp, max, mlCodeTable, nbSeq, ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, sizeof(prevEntropy->matchlengthCTable), + cTableWksp, cTableWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); + if (MLtype == set_compressed) + fseMetadata->lastCountSize = countSize; + op += countSize; + fseMetadata->mlType = (symbolEncodingType_e) MLtype; + } } + assert((size_t) (op-ostart) <= sizeof(fseMetadata->fseTablesBuffer)); + return op-ostart; +} + + +/** ZSTD_buildSuperBlockEntropy() : + * Builds entropy for the super-block. 
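+ *  Entropy tables are computed once from the statistics of the whole block and
+ *  then shared by every sub-block : the first sub-block writes the table
+ *  descriptions, the following ones reference them via repeat modes.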
+ * @return : 0 on success or error code */
+static size_t
+ZSTD_buildSuperBlockEntropy(seqStore_t* seqStorePtr,
+                            const ZSTD_entropyCTables_t* prevEntropy,
+                                  ZSTD_entropyCTables_t* nextEntropy,
+                            const ZSTD_CCtx_params* cctxParams,
+                                  ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+                                  void* workspace, size_t wkspSize)
+{
+    size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart;
+    DEBUGLOG(5, "ZSTD_buildSuperBlockEntropy");
+    entropyMetadata->hufMetadata.hufDesSize =
+        ZSTD_buildSuperBlockEntropy_literal(seqStorePtr->litStart, litSize,
+                                            &prevEntropy->huf, &nextEntropy->huf,
+                                            &entropyMetadata->hufMetadata,
+                                            ZSTD_disableLiteralsCompression(cctxParams),
+                                            workspace, wkspSize);
+    FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildSuperBlockEntropy_literal failed");
+    entropyMetadata->fseMetadata.fseTablesSize =
+        ZSTD_buildSuperBlockEntropy_sequences(seqStorePtr,
+                                              &prevEntropy->fse, &nextEntropy->fse,
+                                              cctxParams,
+                                              &entropyMetadata->fseMetadata,
+                                              workspace, wkspSize);
+    FORWARD_IF_ERROR(entropyMetadata->fseMetadata.fseTablesSize, "ZSTD_buildSuperBlockEntropy_sequences failed");
+    return 0;
+}
+
+/** ZSTD_compressSubBlock_literal() :
+ *  Compresses literals section for a sub-block.
+ *  When we have to write the Huffman table we will sometimes choose a header
+ *  size larger than necessary. This is because we have to pick the header size
+ *  before we know the table size + compressed size, so we have a bound on the
+ *  table size. If we guessed incorrectly, we fall back to uncompressed literals.
+ *
+ *  We write the header when writeEntropy=1 and set entropyWritten=1 when we succeeded
+ *  in writing the header, otherwise it is set to 0.
+ *
+ *  hufMetadata->hType has literals block type info.
+ *      If it is set_basic, all sub-blocks literals section will be Raw_Literals_Block.
+ *      If it is set_rle, all sub-blocks literals section will be RLE_Literals_Block.
+ *      If it is set_compressed, first sub-block's literals section will be Compressed_Literals_Block
+ *      and the following sub-blocks' literals sections will be Treeless_Literals_Block.
+ *  @return : compressed size of literals section of a sub-block
+ *            Or 0 if it is unable to compress.
+ *            Or error code */
+static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable,
+                                    const ZSTD_hufCTablesMetadata_t* hufMetadata,
+                                    const BYTE* literals, size_t litSize,
+                                    void* dst, size_t dstSize,
+                                    const int bmi2, int writeEntropy, int* entropyWritten)
+{
+    size_t const header = writeEntropy ? 200 : 0;
+    size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header));
+    BYTE* const ostart = (BYTE*)dst;
+    BYTE* const oend = ostart + dstSize;
+    BYTE* op = ostart + lhSize;
+    U32 const singleStream = lhSize == 3;
+    symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat;
+    size_t cLitSize = 0;
+
+    (void)bmi2; /* TODO bmi2...
*/ + + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; + if (litSize == 0 || hufMetadata->hType == set_basic) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } else if (hufMetadata->hType == set_rle) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using rle literal"); + return ZSTD_compressRleLiteralsBlock(dst, dstSize, literals, litSize); + } + + assert(litSize > 0); + assert(hufMetadata->hType == set_compressed || hufMetadata->hType == set_repeat); + + if (writeEntropy && hufMetadata->hType == set_compressed) { + memcpy(op, hufMetadata->hufDesBuffer, hufMetadata->hufDesSize); + op += hufMetadata->hufDesSize; + cLitSize += hufMetadata->hufDesSize; + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + + /* TODO bmi2 */ + { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) + : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { + DEBUGLOG(5, "Failed to write entropy tables %s", ZSTD_getErrorName(cSize)); + return 0; + } + /* If we expand and we aren't writing a header then emit uncompressed */ + if (!writeEntropy && cLitSize >= litSize) { + DEBUGLOG(5, "ZSTD_compressSubBlock_literal using raw literal because uncompressible"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + /* If we are writing headers then allow expansion that doesn't change our header size. */ + if (lhSize < (size_t)(3 + (cLitSize >= 1 KB) + (cLitSize >= 16 KB))) { + assert(cLitSize > litSize); + DEBUGLOG(5, "Literals expanded beyond allowed header size"); + return ZSTD_noCompressLiterals(dst, dstSize, literals, litSize); + } + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (cSize=%zu)", cSize); + } + + /* Build header */ + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ + { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ + { U32 const lhc = hType + (2 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ + { U32 const lhc = hType + (3 << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); + break; + } + default: /* not possible : lhSize is {3,4,5} */ + assert(0); + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); + return op-ostart; +} + +static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; + size_t matchLengthSum = 0; + size_t litLengthSum = 0; + while (send-sp > 0) { + ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); + litLengthSum += seqLen.litLength; + matchLengthSum += seqLen.matchLength; + sp++; + } + assert(litLengthSum <= litSize); + if (!lastSequence) { + assert(litLengthSum == litSize); + } + return matchLengthSum + litSize; +} + +/** ZSTD_compressSubBlock_sequences() : + * Compresses sequences section for a sub-block. 
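+ * The 1-byte seqHead written below packs the three symbol compression modes
+ * as (LLtype<<6) + (Offtype<<4) + (MLtype<<2). Worked example (ours, not from
+ * the original comment): in repeat mode every field is set_repeat == 3, so
+ * the header byte is (3<<6) | (3<<4) | (3<<2) = 0xFC.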
+ * fseMetadata->llType, fseMetadata->ofType, and fseMetadata->mlType have + * symbol compression modes for the super-block. + * The first successfully compressed block will have these in its header. + * We set entropyWritten=1 when we succeed in compressing the sequences. + * The following sub-blocks will always have repeat mode. + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. */ +static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + const seqDef* sequences, size_t nbSeq, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, int writeEntropy, int* entropyWritten) +{ + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + BYTE* seqHead; + + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (nbSeq=%zu, writeEntropy=%d, longOffsets=%d)", nbSeq, writeEntropy, longOffsets); + + *entropyWritten = 0; + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); + if (nbSeq < 0x7F) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { + return op - ostart; + } + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (seqHeadSize=%u)", (unsigned)(op-ostart)); + + if (writeEntropy) { + const U32 LLtype = fseMetadata->llType; + const U32 Offtype = fseMetadata->ofType; + const U32 MLtype = fseMetadata->mlType; + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (fseTablesSize=%zu)", fseMetadata->fseTablesSize); + *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + memcpy(op, fseMetadata->fseTablesBuffer, fseMetadata->fseTablesSize); + op += fseMetadata->fseTablesSize; + } else { + const U32 repeat = set_repeat; + *seqHead = (BYTE)((repeat<<6) + (repeat<<4) + (repeat<<2)); + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, oend - op, + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, + sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. 
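+ * Worked instance (ours): a 2-byte NCount for the last set_compressed table
+ * plus a 1-byte bitstream gives lastCountSize + bitstreamSize == 3 < 4, which
+ * is exactly the condition the assert below documents.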
+ */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (writeEntropy && fseMetadata->lastCountSize && fseMetadata->lastCountSize + bitstreamSize < 4) { + /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(fseMetadata->lastCountSize + bitstreamSize == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } +#endif + DEBUGLOG(5, "ZSTD_compressSubBlock_sequences (bitstreamSize=%zu)", bitstreamSize); + } + + /* zstd versions <= 1.4.0 mistakenly report error when + * sequences section body size is less than 3 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1664. + * This can happen when the previous sequences section block is compressed + * with rle mode and the current block's sequences section is compressed + * with repeat mode where sequences section body size can be 1 byte. + */ +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + if (op-seqHead < 4) { + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.4.0 by emitting " + "an uncompressed block when sequences are < 4 bytes"); + return 0; + } +#endif + + *entropyWritten = 1; + return op - ostart; +} + +/** ZSTD_compressSubBlock() : + * Compresses a single sub-block. + * @return : compressed size of the sub-block + * Or 0 if it failed to compress. */ +static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const seqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const int bmi2, + int writeLitEntropy, int writeSeqEntropy, + int* litEntropyWritten, int* seqEntropyWritten, + U32 lastBlock) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart + ZSTD_blockHeaderSize; + DEBUGLOG(5, "ZSTD_compressSubBlock (litSize=%zu, nbSeq=%zu, writeLitEntropy=%d, writeSeqEntropy=%d, lastBlock=%d)", + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, + op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; + } + { size_t cSeqSize = ZSTD_compressSubBlock_sequences(&entropy->fse, + &entropyMetadata->fseMetadata, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, + op, oend-op, + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ + { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } + return op-ostart; +} + +static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, + const ZSTD_hufCTables_t* huf, + const ZSTD_hufCTablesMetadata_t* hufMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = 255; + size_t literalSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + + if (hufMetadata->hType == set_basic) return litSize; + else if (hufMetadata->hType == set_rle) return 1; + else if (hufMetadata->hType == 
set_compressed || hufMetadata->hType == set_repeat) { + size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)literals, litSize, workspace, wkspSize); + if (ZSTD_isError(largest)) return litSize; + { size_t cLitSizeEstimate = HUF_estimateCompressedSize((const HUF_CElt*)huf->CTable, countWksp, maxSymbolValue); + if (writeEntropy) cLitSizeEstimate += hufMetadata->hufDesSize; + return cLitSizeEstimate + literalSectionHeaderSize; + } } + assert(0); /* impossible */ + return 0; +} + +static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U32* additionalBits, + short const* defaultNorm, U32 defaultNormLog, + void* workspace, size_t wkspSize) +{ + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; + const BYTE* const ctStart = ctp; + const BYTE* const ctEnd = ctStart + nbSeq; + size_t cSymbolTypeSizeEstimateInBits = 0; + unsigned max = maxCode; + + HIST_countFast_wksp(countWksp, &max, codeTable, nbSeq, workspace, wkspSize); /* can't fail */ + if (type == set_basic) { + cSymbolTypeSizeEstimateInBits = ZSTD_crossEntropyCost(defaultNorm, defaultNormLog, countWksp, max); + } else if (type == set_rle) { + cSymbolTypeSizeEstimateInBits = 0; + } else if (type == set_compressed || type == set_repeat) { + cSymbolTypeSizeEstimateInBits = ZSTD_fseBitCost(fseCTable, countWksp, max); + } + if (ZSTD_isError(cSymbolTypeSizeEstimateInBits)) return nbSeq * 10; + while (ctp < ctEnd) { + if (additionalBits) cSymbolTypeSizeEstimateInBits += additionalBits[*ctp]; + else cSymbolTypeSizeEstimateInBits += *ctp; /* for offset, offset code is also the number of additional bits */ + ctp++; + } + return cSymbolTypeSizeEstimateInBits / 8; +} + +static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_fseCTables_t* fseTables, + const ZSTD_fseCTablesMetadata_t* fseMetadata, + void* workspace, size_t wkspSize, + int writeEntropy) +{ + size_t sequencesSectionHeaderSize = 3; /* Use hard coded size of 3 bytes */ + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, MaxOff, + nbSeq, fseTables->offcodeCTable, NULL, + OF_defaultNorm, OF_defaultNormLog, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->llType, llCodeTable, MaxLL, + nbSeq, fseTables->litlengthCTable, LL_bits, + LL_defaultNorm, LL_defaultNormLog, + workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateSubBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, MaxML, + nbSeq, fseTables->matchlengthCTable, ML_bits, + ML_defaultNorm, ML_defaultNormLog, + workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; +} + +static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, + size_t nbSeq, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, + int writeLitEntropy, int writeSeqEntropy) { + size_t cSizeEstimate = 0; + cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, + &entropy->huf, &entropyMetadata->hufMetadata, + workspace, wkspSize, writeLitEntropy); + cSizeEstimate += 
ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); + return cSizeEstimate + ZSTD_blockHeaderSize; +} + +static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +{ + if (fseMetadata->llType == set_compressed || fseMetadata->llType == set_rle) + return 1; + if (fseMetadata->mlType == set_compressed || fseMetadata->mlType == set_rle) + return 1; + if (fseMetadata->ofType == set_compressed || fseMetadata->ofType == set_rle) + return 1; + return 0; +} + +/** ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. + * Entropy will be written to the first block. + * The following blocks will use repeat mode to compress. + * All sub-blocks are compressed blocks (no raw or rle blocks). + * @return : compressed size of the super block (which is multiple ZSTD blocks) + * Or 0 if it failed to compress. */ +static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) +{ + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; + const seqDef* sp = sstart; + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; + size_t targetCBlockSize = cctxParams->targetCBlockSize; + size_t litSize, seqCount; + int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; + int writeSeqEntropy = 1; + int lastSequence = 0; + + DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", + (unsigned)(lend-lp), (unsigned)(send-sstart)); + + litSize = 0; + seqCount = 0; + do { + size_t cBlockSizeEstimate = 0; + if (sstart == send) { + lastSequence = 1; + } else { + const seqDef* const sequence = sp + seqCount; + lastSequence = sequence == send - 1; + litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; + seqCount++; + } + if (lastSequence) { + assert(lp <= lend); + assert(litSize <= (size_t)(lend - lp)); + litSize = (size_t)(lend - lp); + } + /* I think there is an optimization opportunity here. + * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful + * since it recalculates estimate from scratch. + * For example, it would recount literal distribution and symbol codes every time.
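+ * (A possible improvement, sketched here only as an assumption: update the
+ * countWksp histograms incrementally as each sequence is appended, instead of
+ * re-running HIST_countFast_wksp() over the whole prefix on every iteration.)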
+ */ + cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, + &nextCBlock->entropy, entropyMetadata, + workspace, wkspSize, writeLitEntropy, writeSeqEntropy); + if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { + int litEntropyWritten = 0; + int seqEntropyWritten = 0; + const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); + const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, + sp, seqCount, + lp, litSize, + llCodePtr, mlCodePtr, ofCodePtr, + cctxParams, + op, oend-op, + bmi2, writeLitEntropy, writeSeqEntropy, + &litEntropyWritten, &seqEntropyWritten, + lastBlock && lastSequence); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); + if (cSize > 0 && cSize < decompressedSize) { + DEBUGLOG(5, "Committed the sub-block"); + assert(ip + decompressedSize <= iend); + ip += decompressedSize; + sp += seqCount; + lp += litSize; + op += cSize; + llCodePtr += seqCount; + mlCodePtr += seqCount; + ofCodePtr += seqCount; + litSize = 0; + seqCount = 0; + /* Entropy only needs to be written once */ + if (litEntropyWritten) { + writeLitEntropy = 0; + } + if (seqEntropyWritten) { + writeSeqEntropy = 0; + } + } + } + } while (!lastSequence); + if (writeLitEntropy) { + DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); + memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. + */ + DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); + return 0; + } + if (ip < iend) { + size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); + DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { + seqDef const* seq; + repcodes_t rep; + memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { + rep = ZSTD_updateRep(rep.rep, seq->offset - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } + DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); + return op-ostart; +} + +size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + void const* src, size_t srcSize, + unsigned lastBlock) { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildSuperBlockEntropy(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, + zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, + zc->blockState.nextCBlock, + &entropyMetadata, + &zc->appliedParams, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, + zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */); +} +/**** ended inlining compress/zstd_compress_superblock.c ****/ +/**** start inlining compress/zstd_compress.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/*-************************************* +* Dependencies +***************************************/ +#include <limits.h> /* INT_MAX */ +#include <string.h> /* memset */ +/**** start inlining ../common/cpu.h ****/ +/* + * Copyright (c) 2018-2020, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_COMMON_CPU_H +#define ZSTD_COMMON_CPU_H + +/** + * Implementation taken from folly/CpuId.h + * https://github.com/facebook/folly/blob/master/folly/CpuId.h + */ + +#include <stddef.h> + +/**** skipping file: mem.h ****/ + +#ifdef _MSC_VER +#include <intrin.h> +#endif + +typedef struct { + U32 f1c; + U32 f1d; + U32 f7b; + U32 f7c; +} ZSTD_cpuid_t; + +MEM_STATIC ZSTD_cpuid_t ZSTD_cpuid(void) { + U32 f1c = 0; + U32 f1d = 0; + U32 f7b = 0; + U32 f7c = 0; +#if defined(_MSC_VER) && (defined(_M_X64) || defined(_M_IX86)) + int reg[4]; + __cpuid((int*)reg, 0); + { + int const n = reg[0]; + if (n >= 1) { + __cpuid((int*)reg, 1); + f1c = (U32)reg[2]; + f1d = (U32)reg[3]; + } + if (n >= 7) { + __cpuidex((int*)reg, 7, 0); + f7b = (U32)reg[1]; + f7c = (U32)reg[2]; + } + } +#elif defined(__i386__) && defined(__PIC__) && !defined(__clang__) && defined(__GNUC__) + /* The following block is like the normal cpuid branch below, but gcc + * reserves ebx for use as its PIC register, so we must specially + * handle the save and restore to avoid clobbering the register. + */ + U32 n; + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(n) + : "a"(0) + : "ecx", "edx"); + if (n >= 1) { + U32 f1a; + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "popl %%ebx\n\t" + : "=a"(f1a), "=c"(f1c), "=d"(f1d) + : "a"(1)); + } + if (n >= 7) { + __asm__( + "pushl %%ebx\n\t" + "cpuid\n\t" + "movl %%ebx, %%eax\n\t" + "popl %%ebx" + : "=a"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "edx"); + } +#elif defined(__x86_64__) || defined(_M_X64) || defined(__i386__) + U32 n; + __asm__("cpuid" : "=a"(n) : "a"(0) : "ebx", "ecx", "edx"); + if (n >= 1) { + U32 f1a; + __asm__("cpuid" : "=a"(f1a), "=c"(f1c), "=d"(f1d) : "a"(1) : "ebx"); + } + if (n >= 7) { + U32 f7a; + __asm__("cpuid" + : "=a"(f7a), "=b"(f7b), "=c"(f7c) + : "a"(7), "c"(0) + : "edx"); + } +#endif + { + ZSTD_cpuid_t cpuid; + cpuid.f1c = f1c; + cpuid.f1d = f1d; + cpuid.f7b = f7b; + cpuid.f7c = f7c; + return cpuid; + } +} + +#define X(name, r, bit) \ + MEM_STATIC int ZSTD_cpuid_##name(ZSTD_cpuid_t const cpuid) { \ + return ((cpuid.r) & (1U << bit)) != 0; \ + } + +/* cpuid(1): Processor Info and Feature Bits.
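+ * Each X(name, reg, bit) entry below expands to a MEM_STATIC predicate
+ * ZSTD_cpuid_<name>() testing a single feature bit. Usage sketch (this is
+ * how ZSTD_initCCtx() further down selects its BMI2 code paths):
+ *   cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid());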
*/ +#define C(name, bit) X(name, f1c, bit) + C(sse3, 0) + C(pclmuldq, 1) + C(dtes64, 2) + C(monitor, 3) + C(dscpl, 4) + C(vmx, 5) + C(smx, 6) + C(eist, 7) + C(tm2, 8) + C(ssse3, 9) + C(cnxtid, 10) + C(fma, 12) + C(cx16, 13) + C(xtpr, 14) + C(pdcm, 15) + C(pcid, 17) + C(dca, 18) + C(sse41, 19) + C(sse42, 20) + C(x2apic, 21) + C(movbe, 22) + C(popcnt, 23) + C(tscdeadline, 24) + C(aes, 25) + C(xsave, 26) + C(osxsave, 27) + C(avx, 28) + C(f16c, 29) + C(rdrand, 30) +#undef C +#define D(name, bit) X(name, f1d, bit) + D(fpu, 0) + D(vme, 1) + D(de, 2) + D(pse, 3) + D(tsc, 4) + D(msr, 5) + D(pae, 6) + D(mce, 7) + D(cx8, 8) + D(apic, 9) + D(sep, 11) + D(mtrr, 12) + D(pge, 13) + D(mca, 14) + D(cmov, 15) + D(pat, 16) + D(pse36, 17) + D(psn, 18) + D(clfsh, 19) + D(ds, 21) + D(acpi, 22) + D(mmx, 23) + D(fxsr, 24) + D(sse, 25) + D(sse2, 26) + D(ss, 27) + D(htt, 28) + D(tm, 29) + D(pbe, 31) +#undef D + +/* cpuid(7): Extended Features. */ +#define B(name, bit) X(name, f7b, bit) + B(bmi1, 3) + B(hle, 4) + B(avx2, 5) + B(smep, 7) + B(bmi2, 8) + B(erms, 9) + B(invpcid, 10) + B(rtm, 11) + B(mpx, 14) + B(avx512f, 16) + B(avx512dq, 17) + B(rdseed, 18) + B(adx, 19) + B(smap, 20) + B(avx512ifma, 21) + B(pcommit, 22) + B(clflushopt, 23) + B(clwb, 24) + B(avx512pf, 26) + B(avx512er, 27) + B(avx512cd, 28) + B(sha, 29) + B(avx512bw, 30) + B(avx512vl, 31) +#undef B +#define C(name, bit) X(name, f7c, bit) + C(prefetchwt1, 0) + C(avx512vbmi, 1) +#undef C + +#undef X + +#endif /* ZSTD_COMMON_CPU_H */ +/**** ended inlining ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: hist.h ****/ +#define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_compress_sequences.h ****/ +/**** skipping file: zstd_compress_literals.h ****/ +/**** start inlining zstd_fast.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_FAST_H +#define ZSTD_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm); +size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_FAST_H */ +/**** ended inlining zstd_fast.h ****/ +/**** start inlining zstd_double_fast.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_DOUBLE_FAST_H +#define ZSTD_DOUBLE_FAST_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: zstd_compress_internal.h ****/ + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm); +size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_DOUBLE_FAST_H */ +/**** ended inlining zstd_double_fast.h ****/ +/**** start inlining zstd_lazy.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LAZY_H +#define ZSTD_LAZY_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: zstd_compress_internal.h ****/ + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + +void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ + +size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LAZY_H */ +/**** ended inlining zstd_lazy.h ****/ +/**** start inlining zstd_opt.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +#ifndef ZSTD_OPT_H +#define ZSTD_OPT_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: zstd_compress_internal.h ****/ + +/* used in ZSTD_loadDictionaryContent() */ +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_OPT_H */ +/**** ended inlining zstd_opt.h ****/ +/**** start inlining zstd_ldm.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LDM_H +#define ZSTD_LDM_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: ../zstd.h ****/ + +/*-************************************* +* Long distance matching +***************************************/ + +#define ZSTD_LDM_DEFAULT_WINDOW_LOG ZSTD_WINDOWLOG_LIMIT_DEFAULT + +void ZSTD_ldm_fillHashTable( + ldmState_t* state, const BYTE* ip, + const BYTE* iend, ldmParams_t const* params); + +/** + * ZSTD_ldm_generateSequences(): + * + * Generates the sequences using the long distance match finder. + * Generates long range matching sequences in `sequences`, which parse a prefix + * of the source. `sequences` must be large enough to store every sequence, + * which can be checked with `ZSTD_ldm_getMaxNbSeq()`. + * @returns 0 or an error code. + * + * NOTE: The user must have called ZSTD_window_update() for all of the input + * they have, even if they pass it to ZSTD_ldm_generateSequences() in chunks. + * NOTE: This function returns an error if it runs out of space to store + * sequences. + */ +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldms, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + +/** + * ZSTD_ldm_blockCompress(): + * + * Compresses a block using the predefined sequences, along with a secondary + * block compressor. 
The literals section of every sequence is passed to the + * secondary block compressor, and those sequences are interspersed with the + * predefined sequences. + * Updates `rawSeqStore.pos` to indicate how many sequences have been consumed. + * `rawSeqStore.seq` may also be updated to split the last sequence between two + * blocks. + * @return The length of the last literals. + * + * NOTE: The source must be at most the maximum block size, but the predefined + * sequences can be any size, and may be longer than the block. In the case that + * they are longer than the block, the last sequences may need to be split into + * two. We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +/** + * ZSTD_ldm_skipSequences(): + * + * Skip past `srcSize` bytes worth of sequences in `rawSeqStore`. + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + +/** ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables, or 0 if LDM is + * disabled. + */ +size_t ZSTD_ldm_getTableSize(ldmParams_t params); + +/** ZSTD_ldm_getMaxNbSeq() : + * Return an upper bound on the number of sequences that can be produced by + * the long distance matcher, or 0 if LDM is disabled. + */ +size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + +/** ZSTD_ldm_adjustParameters() : + * If the params->hashRateLog is not set, set it to its default value based on + * windowLog and params->hashLog. + * + * Ensures that params->bucketSizeLog is <= params->hashLog (setting it to + * params->hashLog if it is not). + * + * Ensures that the minMatchLength >= targetLength during optimal parsing. + */ +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LDM_H */ +/**** ended inlining zstd_ldm.h ****/ +/**** skipping file: zstd_compress_superblock.h ****/ + + +/*-************************************* +* Helper functions +***************************************/ +/* ZSTD_compressBound() + * Note that the result from this function is only compatible with the "normal" + * full-block strategy. + * When there are a lot of small blocks due to frequent flush in streaming mode, + * the overhead of headers can make the compressed data larger than the + * return value of ZSTD_compressBound().
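+ * Worked example (ours, assuming the ZSTD_COMPRESSBOUND macro from zstd.h:
+ * srcSize + (srcSize>>8) + a flush margin for inputs below 128 KB):
+ *   srcSize = 102400 (100 KB)
+ *   bound   = 102400 + (102400>>8 = 400) + ((131072-102400)>>11 = 14) = 102814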
+ */ +size_t ZSTD_compressBound(size_t srcSize) { + return ZSTD_COMPRESSBOUND(srcSize); +} + + +/*-************************************* +* Context memory management +***************************************/ +struct ZSTD_CDict_s { + const void* dictContent; + size_t dictContentSize; + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; + ZSTD_matchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +}; /* typedef'd to ZSTD_CDict within "zstd.h" */ + +ZSTD_CCtx* ZSTD_createCCtx(void) +{ + return ZSTD_createCCtx_advanced(ZSTD_defaultCMem); +} + +static void ZSTD_initCCtx(ZSTD_CCtx* cctx, ZSTD_customMem memManager) +{ + assert(cctx != NULL); + memset(cctx, 0, sizeof(*cctx)); + cctx->customMem = memManager; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + { size_t const err = ZSTD_CCtx_reset(cctx, ZSTD_reset_parameters); + assert(!ZSTD_isError(err)); + (void)err; + } +} + +ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem) +{ + ZSTD_STATIC_ASSERT(zcss_init==0); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN==(0ULL - 1)); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + { ZSTD_CCtx* const cctx = (ZSTD_CCtx*)ZSTD_malloc(sizeof(ZSTD_CCtx), customMem); + if (!cctx) return NULL; + ZSTD_initCCtx(cctx, customMem); + return cctx; + } +} + +ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) +{ + ZSTD_cwksp ws; + ZSTD_CCtx* cctx; + if (workspaceSize <= sizeof(ZSTD_CCtx)) return NULL; /* minimum size */ + if ((size_t)workspace & 7) return NULL; /* must be 8-aligned */ + ZSTD_cwksp_init(&ws, workspace, workspaceSize); + + cctx = (ZSTD_CCtx*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CCtx)); + if (cctx == NULL) return NULL; + + memset(cctx, 0, sizeof(ZSTD_CCtx)); + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + + /* statically sized space. entropyWorkspace never moves (but prev/next block swap places) */ + if (!ZSTD_cwksp_check_available(&cctx->workspace, HUF_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, HUF_WORKSPACE_SIZE); + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; +} + +/** + * Clears and frees all of the dictionaries in the CCtx. + */ +static void ZSTD_clearAllDicts(ZSTD_CCtx* cctx) +{ + ZSTD_free(cctx->localDict.dictBuffer, cctx->customMem); + ZSTD_freeCDict(cctx->localDict.cdict); + memset(&cctx->localDict, 0, sizeof(cctx->localDict)); + memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); + cctx->cdict = NULL; +} + +static size_t ZSTD_sizeof_localDict(ZSTD_localDict dict) +{ + size_t const bufferSize = dict.dictBuffer != NULL ? 
dict.dictSize : 0; + size_t const cdictSize = ZSTD_sizeof_CDict(dict.cdict); + return bufferSize + cdictSize; +} + +static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) +{ + assert(cctx != NULL); + assert(cctx->staticSize == 0); + ZSTD_clearAllDicts(cctx); +#ifdef ZSTD_MULTITHREAD + ZSTDMT_freeCCtx(cctx->mtctx); cctx->mtctx = NULL; +#endif + ZSTD_cwksp_free(&cctx->workspace, cctx->customMem); +} + +size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); + { + int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); + if (!cctxInWorkspace) { + ZSTD_free(cctx, cctx->customMem); + } + } + return 0; +} + + +static size_t ZSTD_sizeof_mtctx(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + return ZSTDMT_sizeof_CCtx(cctx->mtctx); +#else + (void)cctx; + return 0; +#endif +} + + +size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx) +{ + if (cctx==NULL) return 0; /* support sizeof on NULL */ + /* cctx may be in the workspace */ + return (cctx->workspace.workspace == cctx ? 0 : sizeof(*cctx)) + + ZSTD_cwksp_sizeof(&cctx->workspace) + + ZSTD_sizeof_localDict(cctx->localDict) + + ZSTD_sizeof_mtctx(cctx); +} + +size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) +{ + return ZSTD_sizeof_CCtx(zcs); /* same object */ +} + +/* private API call, for dictBuilder only */ +const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + +static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params cctxParams; + memset(&cctxParams, 0, sizeof(cctxParams)); + cctxParams.cParams = cParams; + cctxParams.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + assert(!ZSTD_checkCParams(cParams)); + cctxParams.fParams.contentSizeFlag = 1; + return cctxParams; +} + +static ZSTD_CCtx_params* ZSTD_createCCtxParams_advanced( + ZSTD_customMem customMem) +{ + ZSTD_CCtx_params* params; + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + params = (ZSTD_CCtx_params*)ZSTD_calloc( + sizeof(ZSTD_CCtx_params), customMem); + if (!params) { return NULL; } + params->customMem = customMem; + params->compressionLevel = ZSTD_CLEVEL_DEFAULT; + params->fParams.contentSizeFlag = 1; + return params; +} + +ZSTD_CCtx_params* ZSTD_createCCtxParams(void) +{ + return ZSTD_createCCtxParams_advanced(ZSTD_defaultCMem); +} + +size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params) +{ + if (params == NULL) { return 0; } + ZSTD_free(params, params->customMem); + return 0; +} + +size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params) +{ + return ZSTD_CCtxParams_init(params, ZSTD_CLEVEL_DEFAULT); +} + +size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) { + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + memset(cctxParams, 0, sizeof(*cctxParams)); + cctxParams->compressionLevel = compressionLevel; + cctxParams->fParams.contentSizeFlag = 1; + return 0; +} + +size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +{ + RETURN_ERROR_IF(!cctxParams, GENERIC, "NULL pointer!"); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + memset(cctxParams, 0, sizeof(*cctxParams)); + assert(!ZSTD_checkCParams(params.cParams)); + cctxParams->cParams = params.cParams; + cctxParams->fParams = params.fParams; + cctxParams->compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not 
matter, as all cParams are presumed properly defined */ + return 0; +} + +/* ZSTD_assignParamsToCCtxParams() : + * params is presumed valid at this stage */ +static ZSTD_CCtx_params ZSTD_assignParamsToCCtxParams( + const ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +{ + ZSTD_CCtx_params ret = *cctxParams; + assert(!ZSTD_checkCParams(params->cParams)); + ret.cParams = params->cParams; + ret.fParams = params->fParams; + ret.compressionLevel = ZSTD_CLEVEL_DEFAULT; /* should not matter, as all cParams are presumed properly defined */ + return ret; +} + +ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + + switch(param) + { + case ZSTD_c_compressionLevel: + bounds.lowerBound = ZSTD_minCLevel(); + bounds.upperBound = ZSTD_maxCLevel(); + return bounds; + + case ZSTD_c_windowLog: + bounds.lowerBound = ZSTD_WINDOWLOG_MIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + + case ZSTD_c_hashLog: + bounds.lowerBound = ZSTD_HASHLOG_MIN; + bounds.upperBound = ZSTD_HASHLOG_MAX; + return bounds; + + case ZSTD_c_chainLog: + bounds.lowerBound = ZSTD_CHAINLOG_MIN; + bounds.upperBound = ZSTD_CHAINLOG_MAX; + return bounds; + + case ZSTD_c_searchLog: + bounds.lowerBound = ZSTD_SEARCHLOG_MIN; + bounds.upperBound = ZSTD_SEARCHLOG_MAX; + return bounds; + + case ZSTD_c_minMatch: + bounds.lowerBound = ZSTD_MINMATCH_MIN; + bounds.upperBound = ZSTD_MINMATCH_MAX; + return bounds; + + case ZSTD_c_targetLength: + bounds.lowerBound = ZSTD_TARGETLENGTH_MIN; + bounds.upperBound = ZSTD_TARGETLENGTH_MAX; + return bounds; + + case ZSTD_c_strategy: + bounds.lowerBound = ZSTD_STRATEGY_MIN; + bounds.upperBound = ZSTD_STRATEGY_MAX; + return bounds; + + case ZSTD_c_contentSizeFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_checksumFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_dictIDFlag: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_nbWorkers: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_NBWORKERS_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_jobSize: + bounds.lowerBound = 0; +#ifdef ZSTD_MULTITHREAD + bounds.upperBound = ZSTDMT_JOBSIZE_MAX; +#else + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_overlapLog: +#ifdef ZSTD_MULTITHREAD + bounds.lowerBound = ZSTD_OVERLAPLOG_MIN; + bounds.upperBound = ZSTD_OVERLAPLOG_MAX; +#else + bounds.lowerBound = 0; + bounds.upperBound = 0; +#endif + return bounds; + + case ZSTD_c_enableLongDistanceMatching: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_ldmHashLog: + bounds.lowerBound = ZSTD_LDM_HASHLOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHLOG_MAX; + return bounds; + + case ZSTD_c_ldmMinMatch: + bounds.lowerBound = ZSTD_LDM_MINMATCH_MIN; + bounds.upperBound = ZSTD_LDM_MINMATCH_MAX; + return bounds; + + case ZSTD_c_ldmBucketSizeLog: + bounds.lowerBound = ZSTD_LDM_BUCKETSIZELOG_MIN; + bounds.upperBound = ZSTD_LDM_BUCKETSIZELOG_MAX; + return bounds; + + case ZSTD_c_ldmHashRateLog: + bounds.lowerBound = ZSTD_LDM_HASHRATELOG_MIN; + bounds.upperBound = ZSTD_LDM_HASHRATELOG_MAX; + return bounds; + + /* experimental parameters */ + case ZSTD_c_rsyncable: + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_forceMaxWindow : + bounds.lowerBound = 0; + bounds.upperBound = 1; + return bounds; + + case ZSTD_c_format: + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < 
ZSTD_f_zstd1_magicless); + bounds.lowerBound = ZSTD_f_zstd1; + bounds.upperBound = ZSTD_f_zstd1_magicless; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_forceAttachDict: + ZSTD_STATIC_ASSERT(ZSTD_dictDefaultAttach < ZSTD_dictForceCopy); + bounds.lowerBound = ZSTD_dictDefaultAttach; + bounds.upperBound = ZSTD_dictForceLoad; /* note : how to ensure at compile time that this is the highest value enum ? */ + return bounds; + + case ZSTD_c_literalCompressionMode: + ZSTD_STATIC_ASSERT(ZSTD_lcm_auto < ZSTD_lcm_huffman && ZSTD_lcm_huffman < ZSTD_lcm_uncompressed); + bounds.lowerBound = ZSTD_lcm_auto; + bounds.upperBound = ZSTD_lcm_uncompressed; + return bounds; + + case ZSTD_c_targetCBlockSize: + bounds.lowerBound = ZSTD_TARGETCBLOCKSIZE_MIN; + bounds.upperBound = ZSTD_TARGETCBLOCKSIZE_MAX; + return bounds; + + case ZSTD_c_srcSizeHint: + bounds.lowerBound = ZSTD_SRCSIZEHINT_MIN; + bounds.upperBound = ZSTD_SRCSIZEHINT_MAX; + return bounds; + + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; + } +} + +/* ZSTD_cParam_clampBounds: + * Clamps the value into the bounded range. + */ +static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) +{ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); + if (ZSTD_isError(bounds.error)) return bounds.error; + if (*value < bounds.lowerBound) *value = bounds.lowerBound; + if (*value > bounds.upperBound) *value = bounds.upperBound; + return 0; +} + +#define BOUNDCHECK(cParam, val) { \ + RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ + parameter_outOfBound, "Param out of bounds"); \ +} + + +static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +{ + switch(param) + { + case ZSTD_c_compressionLevel: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + return 1; + + case ZSTD_c_format: + case ZSTD_c_windowLog: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow : + case ZSTD_c_nbWorkers: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case ZSTD_c_rsyncable: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + default: + return 0; + } +} + +size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParameter (%i, %i)", (int)param, value); + if (cctx->streamStage != zcss_init) { + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { + RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); + } } + + switch(param) + { + case ZSTD_c_nbWorkers: + RETURN_ERROR_IF((value!=0) && cctx->staticSize, parameter_unsupported, + "MT not compatible with static alloc"); + break; + + case ZSTD_c_compressionLevel: + case ZSTD_c_windowLog: + case ZSTD_c_hashLog: + case ZSTD_c_chainLog: + case ZSTD_c_searchLog: + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: + case ZSTD_c_ldmHashRateLog: + case ZSTD_c_format: + case ZSTD_c_contentSizeFlag: + case ZSTD_c_checksumFlag: + case ZSTD_c_dictIDFlag: + case ZSTD_c_forceMaxWindow: + case ZSTD_c_forceAttachDict: + case ZSTD_c_literalCompressionMode: + case ZSTD_c_jobSize: + case ZSTD_c_overlapLog: + case 
ZSTD_c_rsyncable: + case ZSTD_c_enableLongDistanceMatching: + case ZSTD_c_ldmHashLog: + case ZSTD_c_ldmMinMatch: + case ZSTD_c_ldmBucketSizeLog: + case ZSTD_c_targetCBlockSize: + case ZSTD_c_srcSizeHint: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return ZSTD_CCtxParams_setParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + ZSTD_cParameter param, int value) +{ + DEBUGLOG(4, "ZSTD_CCtxParams_setParameter (%i, %i)", (int)param, value); + switch(param) + { + case ZSTD_c_format : + BOUNDCHECK(ZSTD_c_format, value); + CCtxParams->format = (ZSTD_format_e)value; + return (size_t)CCtxParams->format; + + case ZSTD_c_compressionLevel : { + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + if (value) { /* 0 : does not change current level */ + CCtxParams->compressionLevel = value; + } + if (CCtxParams->compressionLevel >= 0) return (size_t)CCtxParams->compressionLevel; + return 0; /* return type (size_t) cannot represent negative values */ + } + + case ZSTD_c_windowLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_windowLog, value); + CCtxParams->cParams.windowLog = (U32)value; + return CCtxParams->cParams.windowLog; + + case ZSTD_c_hashLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_hashLog, value); + CCtxParams->cParams.hashLog = (U32)value; + return CCtxParams->cParams.hashLog; + + case ZSTD_c_chainLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_chainLog, value); + CCtxParams->cParams.chainLog = (U32)value; + return CCtxParams->cParams.chainLog; + + case ZSTD_c_searchLog : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_searchLog, value); + CCtxParams->cParams.searchLog = (U32)value; + return (size_t)value; + + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); + CCtxParams->cParams.minMatch = value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); + CCtxParams->cParams.targetLength = value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_strategy, value); + CCtxParams->cParams.strategy = (ZSTD_strategy)value; + return (size_t)CCtxParams->cParams.strategy; + + case ZSTD_c_contentSizeFlag : + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; + return CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; + return CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); + CCtxParams->fParams.noDictIDFlag = !value; + return !CCtxParams->fParams.noDictIDFlag; + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); + return CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; + BOUNDCHECK(ZSTD_c_forceAttachDict, pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_literalCompressionMode_e lcm = (ZSTD_literalCompressionMode_e)value; + 
BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } + + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + CCtxParams->nbWorkers = value; + return CCtxParams->nbWorkers; +#endif + + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + /* Adjust to the minimum non-default value. */ + if (value != 0 && value < ZSTDMT_JOBSIZE_MIN) + value = ZSTDMT_JOBSIZE_MIN; + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(param, &value), ""); + assert(value >= 0); + CCtxParams->jobSize = value; + return CCtxParams->jobSize; +#endif + + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->overlapLog = value; + return CCtxParams->overlapLog; +#endif + + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR_IF(value!=0, parameter_unsupported, "not compiled with multithreading"); + return 0; +#else + FORWARD_IF_ERROR(ZSTD_cParam_clampBounds(ZSTD_c_overlapLog, &value), ""); + CCtxParams->rsyncable = value; + return CCtxParams->rsyncable; +#endif + + case ZSTD_c_enableLongDistanceMatching : + CCtxParams->ldmParams.enableLdm = (value!=0); + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); + CCtxParams->ldmParams.hashLog = value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); + CCtxParams->ldmParams.minMatchLength = value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); + CCtxParams->ldmParams.bucketSizeLog = value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + RETURN_ERROR_IF(value > ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN, + parameter_outOfBound, "Param out of bounds!"); + CCtxParams->ldmParams.hashRateLog = value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); + CCtxParams->targetCBlockSize = value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; + return CCtxParams->srcSizeHint; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +} + +size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value) +{ + return ZSTD_CCtxParams_getParameter(&cctx->requestedParams, param, value); +} + +size_t ZSTD_CCtxParams_getParameter( + ZSTD_CCtx_params* CCtxParams, ZSTD_cParameter param, int* value) +{ + switch(param) + { + case ZSTD_c_format : + *value = CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; + break; + case ZSTD_c_windowLog : + *value = (int)CCtxParams->cParams.windowLog; + break; + case ZSTD_c_hashLog : + *value = (int)CCtxParams->cParams.hashLog; + break; + case ZSTD_c_chainLog : + 
*value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : + *value = CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : + *value = CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : + *value = CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : + *value = (unsigned)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; + break; + case ZSTD_c_checksumFlag : + *value = CCtxParams->fParams.checksumFlag; + break; + case ZSTD_c_dictIDFlag : + *value = !CCtxParams->fParams.noDictIDFlag; + break; + case ZSTD_c_forceMaxWindow : + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : + *value = CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : + *value = CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : +#ifndef ZSTD_MULTITHREAD + assert(CCtxParams->nbWorkers == 0); +#endif + *value = CCtxParams->nbWorkers; + break; + case ZSTD_c_jobSize : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + assert(CCtxParams->jobSize <= INT_MAX); + *value = (int)CCtxParams->jobSize; + break; +#endif + case ZSTD_c_overlapLog : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->overlapLog; + break; +#endif + case ZSTD_c_rsyncable : +#ifndef ZSTD_MULTITHREAD + RETURN_ERROR(parameter_unsupported, "not compiled with multithreading"); +#else + *value = CCtxParams->rsyncable; + break; +#endif + case ZSTD_c_enableLongDistanceMatching : + *value = CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : + *value = CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : + *value = CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : + *value = CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : + *value = CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; + break; + case ZSTD_c_srcSizeHint : + *value = (int)CCtxParams->srcSizeHint; + break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +} + +/** ZSTD_CCtx_setParametersUsingCCtxParams() : + * just applies `params` into `cctx` + * no action is performed, parameters are merely stored. + * If ZSTDMT is enabled, parameters are pushed to cctx->mtctx. + * This is possible even if a compression is ongoing. + * In which case, new parameters will be applied on the fly, starting with next compression job. 
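The getter above simply mirrors whatever the setter stored in requestedParams, so a set/get round trip is observable before any compression starts. A minimal userland sketch with a hypothetical helper, assuming static linking against this library version (ZSTD_CCtx_getParameter is experimental API):

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_CCtx_getParameter is experimental API */
#include <stdio.h>
#include <zstd.h>

/* Set a parameter, then read it back through the getter defined above. */
static int set_and_check_level(ZSTD_CCtx* cctx)
{
    int applied = 0;
    if (ZSTD_isError(ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19)))
        return -1;
    if (ZSTD_isError(ZSTD_CCtx_getParameter(cctx, ZSTD_c_compressionLevel, &applied)))
        return -1;
    printf("compression level now %d\n", applied);   /* expect 19 */
    return 0;
}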
+ */ +size_t ZSTD_CCtx_setParametersUsingCCtxParams( + ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_CCtx_setParametersUsingCCtxParams"); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "The context is in the wrong stage!"); + RETURN_ERROR_IF(cctx->cdict, stage_wrong, + "Can't override parameters with cdict attached (some must " + "be inherited from the cdict)."); + + cctx->requestedParams = *params; + return 0; +} + +ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; + return 0; +} + +/** + * Initializes the local dict using the requested parameters. + * NOTE: This does not use the pledged src size, because it may be used for more + * than one compression. + */ +static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) +{ + ZSTD_localDict* const dl = &cctx->localDict; + ZSTD_compressionParameters const cParams = ZSTD_getCParamsFromCCtxParams( + &cctx->requestedParams, ZSTD_CONTENTSIZE_UNKNOWN, dl->dictSize); + if (dl->dict == NULL) { + /* No local dictionary. */ + assert(dl->dictBuffer == NULL); + assert(dl->cdict == NULL); + assert(dl->dictSize == 0); + return 0; + } + if (dl->cdict != NULL) { + assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ + return 0; + } + assert(dl->dictSize > 0); + assert(cctx->cdict == NULL); + assert(cctx->prefixDict.dict == NULL); + + dl->cdict = ZSTD_createCDict_advanced( + dl->dict, + dl->dictSize, + ZSTD_dlm_byRef, + dl->dictContentType, + cParams, + cctx->customMem); + RETURN_ERROR_IF(!dl->cdict, memory_allocation, "ZSTD_createCDict_advanced failed"); + cctx->cdict = dl->cdict; + return 0; +} + +size_t ZSTD_CCtx_loadDictionary_advanced( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't load a dictionary when ctx is not in init stage."); + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "no malloc for static CCtx"); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); + ZSTD_clearAllDicts(cctx); /* in case one already exists */ + if (dict == NULL || dictSize == 0) /* no dictionary mode */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { + void* dictBuffer = ZSTD_malloc(dictSize, cctx->customMem); + RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); + memcpy(dictBuffer, dict, dictSize); + cctx->localDict.dictBuffer = dictBuffer; + cctx->localDict.dict = dictBuffer; + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; + return 0; +} + +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference( + ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize) +{ + return ZSTD_CCtx_loadDictionary_advanced( + cctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + + +size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a dict when 
ctx not in init stage."); + /* Free the existing local cdict (if any) to save memory. */ + ZSTD_clearAllDicts(cctx); + cctx->cdict = cdict; + return 0; +} + +size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_CCtx_refPrefix_advanced(cctx, prefix, prefixSize, ZSTD_dct_rawContent); +} + +size_t ZSTD_CCtx_refPrefix_advanced( + ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't ref a prefix when ctx not in init stage."); + ZSTD_clearAllDicts(cctx); + if (prefix != NULL && prefixSize > 0) { + cctx->prefixDict.dict = prefix; + cctx->prefixDict.dictSize = prefixSize; + cctx->prefixDict.dictContentType = dictContentType; + } + return 0; +} + +/*! ZSTD_CCtx_reset() : + * Also dumps dictionary */ +size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + cctx->streamStage = zcss_init; + cctx->pledgedSrcSizePlusOne = 0; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't reset parameters only when not in init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +} + + +/** ZSTD_checkCParams() : + control CParam values remain within authorized range. + @return : 0, or an error code if one value is beyond authorized range */ +size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) +{ + BOUNDCHECK(ZSTD_c_windowLog, (int)cParams.windowLog); + BOUNDCHECK(ZSTD_c_chainLog, (int)cParams.chainLog); + BOUNDCHECK(ZSTD_c_hashLog, (int)cParams.hashLog); + BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); + BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); + BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); + BOUNDCHECK(ZSTD_c_strategy, cParams.strategy); + return 0; +} + +/** ZSTD_clampCParams() : + * make CParam values within valid range. + * @return : valid CParams */ +static ZSTD_compressionParameters +ZSTD_clampCParams(ZSTD_compressionParameters cParams) +{ +# define CLAMP_TYPE(cParam, val, type) { \ + ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ + if ((int)valbounds.upperBound) val=(type)bounds.upperBound; \ + } +# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); + CLAMP(ZSTD_c_hashLog, cParams.hashLog); + CLAMP(ZSTD_c_searchLog, cParams.searchLog); + CLAMP(ZSTD_c_minMatch, cParams.minMatch); + CLAMP(ZSTD_c_targetLength,cParams.targetLength); + CLAMP_TYPE(ZSTD_c_strategy,cParams.strategy, ZSTD_strategy); + return cParams; +} + +/** ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat) +{ + U32 const btScale = ((U32)strat >= (U32)ZSTD_btlazy2); + return hashLog - btScale; +} + +/** ZSTD_adjustCParams_internal() : + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). 
*/ +static ZSTD_compressionParameters +ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) +{ + static const U64 minSrcSize = 513; /* (1<<9) + 1 */ + static const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + + if (dictSize && srcSize == ZSTD_CONTENTSIZE_UNKNOWN) + srcSize = minSrcSize; + + /* resize windowLog if input is small enough, to use less memory */ + if ( (srcSize < maxWindowResize) + && (dictSize < maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : + ZSTD_highbit32(tSize-1) + 1; + if (cPar.windowLog > srcLog) cPar.windowLog = srcLog; + } + if (cPar.hashLog > cPar.windowLog+1) cPar.hashLog = cPar.windowLog+1; + { U32 const cycleLog = ZSTD_cycleLog(cPar.chainLog, cPar.strategy); + if (cycleLog > cPar.windowLog) + cPar.chainLog -= (cycleLog - cPar.windowLog); + } + + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + + return cPar; +} + +ZSTD_compressionParameters +ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize) +{ + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize); +} + +static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize); +static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize); + +ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize) +{ + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { + srcSizeHint = CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize); + if (CCtxParams->ldmParams.enableLdm) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + if (CCtxParams->cParams.windowLog) cParams.windowLog = CCtxParams->cParams.windowLog; + if (CCtxParams->cParams.hashLog) cParams.hashLog = CCtxParams->cParams.hashLog; + if (CCtxParams->cParams.chainLog) cParams.chainLog = CCtxParams->cParams.chainLog; + if (CCtxParams->cParams.searchLog) cParams.searchLog = CCtxParams->cParams.searchLog; + if (CCtxParams->cParams.minMatch) cParams.minMatch = CCtxParams->cParams.minMatch; + if (CCtxParams->cParams.targetLength) cParams.targetLength = CCtxParams->cParams.targetLength; + if (CCtxParams->cParams.strategy) cParams.strategy = CCtxParams->cParams.strategy; + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ + return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize); +} + +static size_t +ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + const U32 forCCtx) +{ + size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = (forCCtx && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? 
((size_t)1) << hashLog3 : 0; + /* We don't use ZSTD_cwksp_alloc_size() here because the tables aren't + * surrounded by redzones in ASAN. */ + size_t const tableSpace = chainSize * sizeof(U32) + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = + ZSTD_cwksp_alloc_size((MaxML+1) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((1<<Litbits) * sizeof(U32)) + + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace + : 0; + DEBUGLOG(4, "chainSize: %u - hSize: %u - h3Size: %u", + (U32)chainSize, (U32)hSize, (U32)h3Size); + return tableSpace + optSpace; +} + +size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); + U32 const divider = (cParams.minMatch==3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(params->ldmParams); + size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(ZSTD_ldm_getMaxNbSeq(params->ldmParams, blockSize) * sizeof(rawSeq)); + + /* estimateCCtxSize is for one-shot compression. So no buffers should + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. 
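The estimate returned above is an upper bound for one-shot compression at a given level, which is exactly what the static-allocation entry point expects. A sketch with a hypothetical helper, assuming userland libzstd (ZSTD_initStaticCCtx is experimental API):

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_initStaticCCtx is experimental API */
#include <stdlib.h>
#include <zstd.h>

/* Carve a CCtx out of caller-owned memory sized by the estimate above;
 * the returned cctx aliases wksp, which the caller eventually frees. */
static ZSTD_CCtx* make_static_cctx(int level)
{
    size_t const wkspSize = ZSTD_estimateCCtxSize(level);
    void* const wksp = malloc(wkspSize);
    return wksp ? ZSTD_initStaticCCtx(wksp, wkspSize) : NULL;   /* NULL if too small */
}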
*/ + size_t const bufferSpace = ZSTD_cwksp_alloc_size(0) + + ZSTD_cwksp_alloc_size(0); + + size_t const cctxSpace = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)); + + size_t const neededSpace = + cctxSpace + + entropySpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; + } +} + +size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); + return ZSTD_estimateCCtxSize_usingCCtxParams(&params); +} + +static size_t ZSTD_estimateCCtxSize_internal(int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + return ZSTD_estimateCCtxSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCCtxSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCCtxSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) +{ + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0); + size_t const CCtxSize = ZSTD_estimateCCtxSize_usingCCtxParams(params); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); + size_t const inBuffSize = ((size_t)1 << cParams.windowLog) + blockSize; + size_t const outBuffSize = ZSTD_compressBound(blockSize) + 1; + size_t const streamingSize = ZSTD_cwksp_alloc_size(inBuffSize) + + ZSTD_cwksp_alloc_size(outBuffSize); + + return CCtxSize + streamingSize; + } +} + +size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams) +{ + ZSTD_CCtx_params const params = ZSTD_makeCCtxParamsFromCParams(cParams); + return ZSTD_estimateCStreamSize_usingCCtxParams(&params); +} + +static size_t ZSTD_estimateCStreamSize_internal(int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, 0); + return ZSTD_estimateCStreamSize_usingCParams(cParams); +} + +size_t ZSTD_estimateCStreamSize(int compressionLevel) +{ + int level; + size_t memBudget = 0; + for (level=MIN(compressionLevel, 1); level<=compressionLevel; level++) { + size_t const newMB = ZSTD_estimateCStreamSize_internal(level); + if (newMB > memBudget) memBudget = newMB; + } + return memBudget; +} + +/* ZSTD_getFrameProgression(): + * tells how much data has been consumed (input) and produced (output) for current frame. + * able to count progression inside worker threads (non-blocking mode). + */ +ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_getFrameProgression(cctx->mtctx); + } +#endif + { ZSTD_frameProgression fp; + size_t const buffered = (cctx->inBuff == NULL) ? 
0 : + cctx->inBuffPos - cctx->inToCompress; + if (buffered) assert(cctx->inBuffPos >= cctx->inToCompress); + assert(buffered <= ZSTD_BLOCKSIZE_MAX); + fp.ingested = cctx->consumedSrcSize + buffered; + fp.consumed = cctx->consumedSrcSize; + fp.produced = cctx->producedCSize; + fp.flushed = cctx->producedCSize; /* simplified; some data might still be left within streaming output buffer */ + fp.currentJobID = 0; + fp.nbActiveWorkers = 0; + return fp; +} } + +/*! ZSTD_toFlushNow() + * Only useful for multithreading scenarios currently (nbWorkers >= 1). + */ +size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + return ZSTDMT_toFlushNow(cctx->mtctx); + } +#endif + (void)cctx; + return 0; /* over-simplification; could also check if context is currently running in streaming mode, and in which case, report how many bytes are left to be flushed within output buffer */ +} + +static void ZSTD_assertEqualCParams(ZSTD_compressionParameters cParams1, + ZSTD_compressionParameters cParams2) +{ + (void)cParams1; + (void)cParams2; + assert(cParams1.windowLog == cParams2.windowLog); + assert(cParams1.chainLog == cParams2.chainLog); + assert(cParams1.hashLog == cParams2.hashLog); + assert(cParams1.searchLog == cParams2.searchLog); + assert(cParams1.minMatch == cParams2.minMatch); + assert(cParams1.targetLength == cParams2.targetLength); + assert(cParams1.strategy == cParams2.strategy); +} + +void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) +{ + int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + bs->rep[i] = repStartValue[i]; + bs->entropy.huf.repeatMode = HUF_repeat_none; + bs->entropy.fse.offcode_repeatMode = FSE_repeat_none; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_none; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_none; +} + +/*! ZSTD_invalidateMatchState() + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL). + */ +static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) +{ + ZSTD_window_clear(&ms->window); + + ms->nextToUpdate = ms->window.dictLimit; + ms->loadedDictEnd = 0; + ms->opt.litLengthSum = 0; /* force reset of btopt stats */ + ms->dictMatchState = NULL; +} + +/** + * Indicates whether this compression proceeds directly from user-provided + * source buffer to user-provided destination buffer (ZSTDb_not_buffered), or + * whether the context needs to buffer the input/output (ZSTDb_buffered). + */ +typedef enum { + ZSTDb_not_buffered, + ZSTDb_buffered +} ZSTD_buffered_policy_e; + +/** + * Controls, for this matchState reset, whether the tables need to be cleared / + * prepared for the coming compression (ZSTDcrp_makeClean), or whether the + * tables can be left unclean (ZSTDcrp_leaveDirty), because we know that a + * subsequent operation will overwrite the table space anyways (e.g., copying + * the matchState contents in from a CDict). + */ +typedef enum { + ZSTDcrp_makeClean, + ZSTDcrp_leaveDirty +} ZSTD_compResetPolicy_e; + +/** + * Controls, for this matchState reset, whether indexing can continue where it + * left off (ZSTDirp_continue), or whether it needs to be restarted from zero + * (ZSTDirp_reset). 
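ZSTD_getFrameProgression() is safe to call between ZSTD_compressStream2() invocations; with nbWorkers == 0 the fields are derived exactly as in the function above. A small monitoring sketch with a hypothetical helper, assuming userland libzstd:

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_getFrameProgression is experimental API */
#include <stdio.h>
#include <zstd.h>

/* Report per-frame progress for an ongoing (possibly multithreaded) frame. */
static void report_progress(const ZSTD_CCtx* cctx)
{
    ZSTD_frameProgression const fp = ZSTD_getFrameProgression(cctx);
    printf("ingested=%llu consumed=%llu produced=%llu flushed=%llu\n",
           fp.ingested, fp.consumed, fp.produced, fp.flushed);
}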
+ */ +typedef enum { + ZSTDirp_continue, + ZSTDirp_reset +} ZSTD_indexResetPolicy_e; + +typedef enum { + ZSTD_resetTarget_CDict, + ZSTD_resetTarget_CCtx +} ZSTD_resetTarget_e; + +static size_t +ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +{ + size_t const chainSize = (cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cParams->chainLog); + size_t const hSize = ((size_t)1) << cParams->hashLog; + U32 const hashLog3 = ((forWho == ZSTD_resetTarget_CCtx) && cParams->minMatch==3) ? MIN(ZSTD_HASHLOG3_MAX, cParams->windowLog) : 0; + size_t const h3Size = hashLog3 ? ((size_t)1) << hashLog3 : 0; + + DEBUGLOG(4, "reset indices : %u", forceResetIndex == ZSTDirp_reset); + if (forceResetIndex == ZSTDirp_reset) { + ZSTD_window_init(&ms->window); + ZSTD_cwksp_mark_tables_dirty(ws); + } + + ms->hashLog3 = hashLog3; + + ZSTD_invalidateMatchState(ms); + + assert(!ZSTD_cwksp_reserve_failed(ws)); /* check that allocation hasn't already failed */ + + ZSTD_cwksp_clear_tables(ws); + + DEBUGLOG(5, "reserving table space"); + /* table Space */ + ms->hashTable = (U32*)ZSTD_cwksp_reserve_table(ws, hSize * sizeof(U32)); + ms->chainTable = (U32*)ZSTD_cwksp_reserve_table(ws, chainSize * sizeof(U32)); + ms->hashTable3 = (U32*)ZSTD_cwksp_reserve_table(ws, h3Size * sizeof(U32)); + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, + "failed a workspace allocation in ZSTD_reset_matchState"); + + DEBUGLOG(4, "reset table : %u", crp!=ZSTDcrp_leaveDirty); + if (crp!=ZSTDcrp_leaveDirty) { + /* reset tables only */ + ZSTD_cwksp_clean_tables(ws); + } + + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); + ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); + ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); + ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); + ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); + ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + } + + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, + "failed a workspace allocation in ZSTD_reset_matchState"); + + return 0; +} + +/* ZSTD_indexTooCloseToMax() : + * minor optimization : prefer memset() rather than reduceIndex() + * which is measurably slow in some circumstances (reported for Visual Studio). + * Works when re-using a context for a lot of smallish inputs : + * if all inputs are smaller than ZSTD_INDEXOVERFLOW_MARGIN, + * memset() will be triggered before reduceIndex(). + */ +#define ZSTD_INDEXOVERFLOW_MARGIN (16 MB) +static int ZSTD_indexTooCloseToMax(ZSTD_window_t w) +{ + return (size_t)(w.nextSrc - w.base) > (ZSTD_CURRENT_MAX - ZSTD_INDEXOVERFLOW_MARGIN); +} + +/*! 
ZSTD_resetCCtx_internal() : + note : `params` are assumed fully validated at this stage */ +static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + ZSTD_CCtx_params params, + U64 const pledgedSrcSize, + ZSTD_compResetPolicy_e const crp, + ZSTD_buffered_policy_e const zbuff) +{ + ZSTD_cwksp* const ws = &zc->workspace; + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u", + (U32)pledgedSrcSize, params.cParams.windowLog); + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + + zc->isFirstBlock = 1; + + if (params.ldmParams.enableLdm) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&params.ldmParams, &params.cParams); + assert(params.ldmParams.hashLog >= params.ldmParams.bucketSizeLog); + assert(params.ldmParams.hashRateLog < 32); + zc->ldmState.hashPower = ZSTD_rollingHash_primePower(params.ldmParams.minMatchLength); + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params.cParams.windowLog), pledgedSrcSize)); + size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); + U32 const divider = (params.cParams.minMatch==3) ? 3 : 4; + size_t const maxNbSeq = blockSize / divider; + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); + size_t const buffOutSize = (zbuff==ZSTDb_buffered) ? ZSTD_compressBound(blockSize)+1 : 0; + size_t const buffInSize = (zbuff==ZSTDb_buffered) ? windowSize + blockSize : 0; + size_t const matchStateSize = ZSTD_sizeof_matchState(&params.cParams, /* forCCtx */ 1); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(params.ldmParams, blockSize); + + ZSTD_indexResetPolicy_e needsIndexReset = zc->initialized ? ZSTDirp_continue : ZSTDirp_reset; + + if (ZSTD_indexTooCloseToMax(zc->blockState.matchState.window)) { + needsIndexReset = ZSTDirp_reset; + } + + if (!zc->staticSize) ZSTD_cwksp_bump_oversized_duration(ws, 0); + + /* Check if workspace is large enough, alloc a new one if needed */ + { size_t const cctxSpace = zc->staticSize ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + size_t const entropySpace = ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) + ZSTD_cwksp_alloc_size(buffOutSize); + size_t const ldmSpace = ZSTD_ldm_getTableSize(params.ldmParams); + size_t const ldmSeqSpace = ZSTD_cwksp_alloc_size(maxNbLdmSeq * sizeof(rawSeq)); + + size_t const neededSpace = + cctxSpace + + entropySpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + + bufferSpace; + + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); + + DEBUGLOG(4, "Need %zuKB workspace, including %zuKB for match state, and %zuKB for buffers", + neededSpace>>10, matchStateSize>>10, bufferSpace>>10); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + + if (workspaceTooSmall || workspaceWasteful) { + DEBUGLOG(4, "Resize workspaceSize from %zuKB to %zuKB", + ZSTD_cwksp_sizeof(ws) >> 10, + neededSpace >> 10); + + RETURN_ERROR_IF(zc->staticSize, memory_allocation, "static cctx : no resize"); + + needsIndexReset = ZSTDirp_reset; + + ZSTD_cwksp_free(ws, zc->customMem); + FORWARD_IF_ERROR(ZSTD_cwksp_create(ws, neededSpace, zc->customMem), ""); + + DEBUGLOG(5, "reserving object space"); + /* Statically sized space. 
+ * entropyWorkspace never moves, + * though prev/next block swap places */ + assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); + zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); + zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, HUF_WORKSPACE_SIZE); + RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); + } } + + ZSTD_cwksp_clear(ws); + + /* init params */ + zc->appliedParams = params; + zc->blockState.matchState.cParams = params.cParams; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; + if (pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN) + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); + zc->blockSize = blockSize; + + XXH64_reset(&zc->xxhState, 0); + zc->stage = ZSTDcs_init; + zc->dictID = 0; + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + + /* buffers */ + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); + zc->outBuffSize = buffOutSize; + zc->outBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffOutSize); + + /* ldm bucketOffsets table */ + if (params.ldmParams.enableLdm) { + /* TODO: avoid memset? */ + size_t const ldmBucketSize = + ((size_t)1) << (params.ldmParams.hashLog - + params.ldmParams.bucketSizeLog); + zc->ldmState.bucketOffsets = ZSTD_cwksp_reserve_buffer(ws, ldmBucketSize); + memset(zc->ldmState.bucketOffsets, 0, ldmBucketSize); + } + + /* sequences storage */ + ZSTD_referenceExternalSequences(zc, NULL, 0); + zc->seqStore.maxNbSeq = maxNbSeq; + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); + + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &zc->blockState.matchState, + ws, + &params.cParams, + crp, + needsIndexReset, + ZSTD_resetTarget_CCtx), ""); + + /* ldm hash table */ + if (params.ldmParams.enableLdm) { + /* TODO: avoid memset? 
*/ + size_t const ldmHSize = ((size_t)1) << params.ldmParams.hashLog; + zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); + memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); + zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); + zc->maxNbLdmSequences = maxNbLdmSeq; + + ZSTD_window_init(&zc->ldmState.window); + ZSTD_window_clear(&zc->ldmState.window); + zc->ldmState.loadedDictEnd = 0; + } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); + zc->initialized = 1; + + return 0; + } +} + +/* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; + * do not use with extDict variant ! */ +void ZSTD_invalidateRepCodes(ZSTD_CCtx* cctx) { + int i; + for (i=0; i<ZSTD_REP_NUM; i++) cctx->blockState.prevCBlock->rep[i] = 0; + assert(!ZSTD_window_hasExtDict(cctx->blockState.matchState.window)); +} + +/* These are the approximate sizes for each strategy past which copying the + * dictionary tables into the working context is faster than using them + * in-place. + */ +static const size_t attachDictSizeCutoffs[ZSTD_STRATEGY_MAX+1] = { + 8 KB, /* unused */ + 8 KB, /* ZSTD_fast */ + 16 KB, /* ZSTD_dfast */ + 32 KB, /* ZSTD_greedy */ + 32 KB, /* ZSTD_lazy */ + 32 KB, /* ZSTD_lazy2 */ + 32 KB, /* ZSTD_btlazy2 */ + 32 KB, /* ZSTD_btopt */ + 8 KB, /* ZSTD_btultra */ + 8 KB /* ZSTD_btultra2 */ +}; + +static int ZSTD_shouldAttachDict(const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize) +{ + size_t cutoff = attachDictSizeCutoffs[cdict->matchState.cParams.strategy]; + return ( pledgedSrcSize <= cutoff + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || params->attachDictPref == ZSTD_dictForceAttach ) + && params->attachDictPref != ZSTD_dictForceCopy + && !params->forceWindow; /* dictMatchState isn't correctly + * handled in _enforceMaxDist */ +} + +static size_t +ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + { const ZSTD_compressionParameters* const cdict_cParams = &cdict->matchState.cParams; + unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Resize working context table params for input only, since the dict + * has its own tables. */ + /* pledgedSrcSize == 0 means 0! */ + params.cParams = ZSTD_adjustCParams_internal(*cdict_cParams, pledgedSrcSize, 0); + params.cParams.windowLog = windowLog; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_makeClean, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + } + + { const U32 cdictEnd = (U32)( cdict->matchState.window.nextSrc + - cdict->matchState.window.base); + const U32 cdictLen = cdictEnd - cdict->matchState.window.dictLimit; + if (cdictLen == 0) { + /* don't even attach dictionaries with no contents */ + DEBUGLOG(4, "skipping attaching empty dictionary"); + } else { + DEBUGLOG(4, "attaching dictionary into context"); + cctx->blockState.matchState.dictMatchState = &cdict->matchState; + + /* prep working match state so dict matches never have negative indices + * when they are translated to the working context's index space. 
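The attach-vs-copy heuristic above can be overridden per context: ZSTD_c_forceAttachDict accepts the same ZSTD_dictAttachPref_e values the table logic consults. A sketch with a hypothetical helper that forces the copy path (the parameter is experimental, so static linking is assumed):

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_c_forceAttachDict is experimental API */
#include <zstd.h>

/* Skip the size heuristic and always copy the cdict tables into the
 * working context (ZSTD_dictForceAttach would force the other branch). */
static size_t use_cdict_copied(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict)
{
    size_t const err = ZSTD_CCtx_setParameter(cctx, ZSTD_c_forceAttachDict,
                                              ZSTD_dictForceCopy);
    return ZSTD_isError(err) ? err : ZSTD_CCtx_refCDict(cctx, cdict);
}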
*/ + if (cctx->blockState.matchState.window.dictLimit < cdictEnd) { + cctx->blockState.matchState.window.nextSrc = + cctx->blockState.matchState.window.base + cdictEnd; + ZSTD_window_clear(&cctx->blockState.matchState.window); + } + /* loadedDictEnd is expressed within the referential of the active context */ + cctx->blockState.matchState.loadedDictEnd = cctx->blockState.matchState.window.dictLimit; + } } + + cctx->dictID = cdict->dictID; + + /* copy block state */ + memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + const ZSTD_compressionParameters *cdict_cParams = &cdict->matchState.cParams; + + DEBUGLOG(4, "copying dictionary into context"); + + { unsigned const windowLog = params.cParams.windowLog; + assert(windowLog != 0); + /* Copy only compression parameters related to tables. */ + params.cParams = *cdict_cParams; + params.cParams.windowLog = windowLog; + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, params, pledgedSrcSize, + ZSTDcrp_leaveDirty, zbuff), ""); + assert(cctx->appliedParams.cParams.strategy == cdict_cParams->strategy); + assert(cctx->appliedParams.cParams.hashLog == cdict_cParams->hashLog); + assert(cctx->appliedParams.cParams.chainLog == cdict_cParams->chainLog); + } + + ZSTD_cwksp_mark_tables_dirty(&cctx->workspace); + + /* copy tables */ + { size_t const chainSize = (cdict_cParams->strategy == ZSTD_fast) ? 0 : ((size_t)1 << cdict_cParams->chainLog); + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + + memcpy(cctx->blockState.matchState.hashTable, + cdict->matchState.hashTable, + hSize * sizeof(U32)); + memcpy(cctx->blockState.matchState.chainTable, + cdict->matchState.chainTable, + chainSize * sizeof(U32)); + } + + /* Zero the hashTable3, since the cdict never fills it */ + { int const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ + { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; + ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + + cctx->dictID = cdict->dictID; + + /* copy block state */ + memcpy(cctx->blockState.prevCBlock, &cdict->cBlockState, sizeof(cdict->cBlockState)); + + return 0; +} + +/* We have a choice between copying the dictionary context into the working + * context, or referencing the dictionary context from the working context + * in-place. We decide here which strategy to use. */ +static size_t ZSTD_resetCCtx_usingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + + DEBUGLOG(4, "ZSTD_resetCCtx_usingCDict (pledgedSrcSize=%u)", + (unsigned)pledgedSrcSize); + + if (ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) { + return ZSTD_resetCCtx_byAttachingCDict( + cctx, cdict, *params, pledgedSrcSize, zbuff); + } else { + return ZSTD_resetCCtx_byCopyingCDict( + cctx, cdict, *params, pledgedSrcSize, zbuff); + } +} + +/*! 
ZSTD_copyCCtx_internal() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * The "context", in this case, refers to the hash and chain tables, + * entropy tables, and dictionary references. + * `windowLog` value is enforced if != 0, otherwise value is copied from srcCCtx. + * @return : 0, or an error code */ +static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + const ZSTD_CCtx* srcCCtx, + ZSTD_frameParameters fParams, + U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + DEBUGLOG(5, "ZSTD_copyCCtx_internal"); + RETURN_ERROR_IF(srcCCtx->stage!=ZSTDcs_init, stage_wrong, + "Can't copy a ctx that's not in init stage."); + + memcpy(&dstCCtx->customMem, &srcCCtx->customMem, sizeof(ZSTD_customMem)); + { ZSTD_CCtx_params params = dstCCtx->requestedParams; + /* Copy only compression parameters related to tables. */ + params.cParams = srcCCtx->appliedParams.cParams; + params.fParams = fParams; + ZSTD_resetCCtx_internal(dstCCtx, params, pledgedSrcSize, + ZSTDcrp_leaveDirty, zbuff); + assert(dstCCtx->appliedParams.cParams.windowLog == srcCCtx->appliedParams.cParams.windowLog); + assert(dstCCtx->appliedParams.cParams.strategy == srcCCtx->appliedParams.cParams.strategy); + assert(dstCCtx->appliedParams.cParams.hashLog == srcCCtx->appliedParams.cParams.hashLog); + assert(dstCCtx->appliedParams.cParams.chainLog == srcCCtx->appliedParams.cParams.chainLog); + assert(dstCCtx->blockState.matchState.hashLog3 == srcCCtx->blockState.matchState.hashLog3); + } + + ZSTD_cwksp_mark_tables_dirty(&dstCCtx->workspace); + + /* copy tables */ + { size_t const chainSize = (srcCCtx->appliedParams.cParams.strategy == ZSTD_fast) ? 0 : ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog); + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; + int const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + memcpy(dstCCtx->blockState.matchState.hashTable, + srcCCtx->blockState.matchState.hashTable, + hSize * sizeof(U32)); + memcpy(dstCCtx->blockState.matchState.chainTable, + srcCCtx->blockState.matchState.chainTable, + chainSize * sizeof(U32)); + memcpy(dstCCtx->blockState.matchState.hashTable3, + srcCCtx->blockState.matchState.hashTable3, + h3Size * sizeof(U32)); + } + + ZSTD_cwksp_mark_tables_clean(&dstCCtx->workspace); + + /* copy dictionary offsets */ + { + const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; + ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; + } + dstCCtx->dictID = srcCCtx->dictID; + + /* copy block state */ + memcpy(dstCCtx->blockState.prevCBlock, srcCCtx->blockState.prevCBlock, sizeof(*srcCCtx->blockState.prevCBlock)); + + return 0; +} + +/*! ZSTD_copyCCtx() : + * Duplicate an existing context `srcCCtx` into another one `dstCCtx`. + * Only works during stage ZSTDcs_init (i.e. after creation, but before first call to ZSTD_compressContinue()). + * pledgedSrcSize==0 means "unknown". 
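In the buffer-less advanced API this lets a dictionary-primed template be cloned once per frame instead of re-running dictionary loading. A sketch with hypothetical helpers, assuming templateCCtx has already gone through ZSTD_compressBegin() (or a variant) and has not yet compressed anything:

#define ZSTD_STATIC_LINKING_ONLY   /* buffer-less streaming API is experimental */
#include <zstd.h>

/* Clone a primed template (still in stage ZSTDcs_init) into `work`,
 * then compress one whole frame in a single call. */
static size_t compress_from_template(ZSTD_CCtx* templateCCtx, ZSTD_CCtx* work,
                                     void* dst, size_t dstCap,
                                     const void* src, size_t srcSize)
{
    size_t const err = ZSTD_copyCCtx(work, templateCCtx, srcSize /* 0 = unknown */);
    if (ZSTD_isError(err)) return err;
    return ZSTD_compressEnd(work, dst, dstCap, src, srcSize);
}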
+* @return : 0, or an error code */ +size_t ZSTD_copyCCtx(ZSTD_CCtx* dstCCtx, const ZSTD_CCtx* srcCCtx, unsigned long long pledgedSrcSize) +{ + ZSTD_frameParameters fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + ZSTD_buffered_policy_e const zbuff = (ZSTD_buffered_policy_e)(srcCCtx->inBuffSize>0); + ZSTD_STATIC_ASSERT((U32)ZSTDb_buffered==1); + if (pledgedSrcSize==0) pledgedSrcSize = ZSTD_CONTENTSIZE_UNKNOWN; + fParams.contentSizeFlag = (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN); + + return ZSTD_copyCCtx_internal(dstCCtx, srcCCtx, + fParams, pledgedSrcSize, + zbuff); +} + + +#define ZSTD_ROWSIZE 16 +/*! ZSTD_reduceTable() : + * reduce table indexes by `reducerValue`, or squash to zero. + * PreserveMark preserves "unsorted mark" for btlazy2 strategy. + * It must be set to a clear 0/1 value, to remove branch during inlining. + * Presume table size is a multiple of ZSTD_ROWSIZE + * to help auto-vectorization */ +FORCE_INLINE_TEMPLATE void +ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerValue, int const preserveMark) +{ + int const nbRows = (int)size / ZSTD_ROWSIZE; + int cellNb = 0; + int rowNb; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ + assert(size < (1U<<31)); /* can be casted to int */ + +#if defined (MEMORY_SANITIZER) && !defined (ZSTD_MSAN_DONT_POISON_WORKSPACE) + /* To validate that the table re-use logic is sound, and that we don't + * access table space that we haven't cleaned, we re-"poison" the table + * space every time we mark it dirty. + * + * This function however is intended to operate on those dirty tables and + * re-clean them. So when this function is used correctly, we can unpoison + * the memory it operated on. This introduces a blind spot though, since + * if we now try to operate on __actually__ poisoned memory, we will not + * detect that. */ + __msan_unpoison(table, size * sizeof(U32)); +#endif + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { + int column; + for (column=0; column<ZSTD_ROWSIZE; column++) { + if (preserveMark) { + U32 const adder = (table[cellNb] == ZSTD_DUBT_UNSORTED_MARK) ? reducerValue : 0; + table[cellNb] += adder; + } + if (table[cellNb] < reducerValue) table[cellNb] = 0; + else table[cellNb] -= reducerValue; + cellNb++; + } } +} + +static void ZSTD_reduceTable(U32* const table, U32 const size, U32 const reducerValue) +{ + ZSTD_reduceTable_internal(table, size, reducerValue, 0); +} + +static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const reducerValue) +{ + ZSTD_reduceTable_internal(table, size, reducerValue, 1); +} + +/*! ZSTD_reduceIndex() : +* rescale all indexes to avoid future overflow (indexes are U32) */ +static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) +{ + { U32 const hSize = (U32)1 << params->cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); + } + + if (params->cParams.strategy != ZSTD_fast) { + U32 const chainSize = (U32)1 << params->cParams.chainLog; + if (params->cParams.strategy == ZSTD_btlazy2) + ZSTD_reduceTable_btlazy2(ms->chainTable, chainSize, reducerValue); + else + ZSTD_reduceTable(ms->chainTable, chainSize, reducerValue); + } + + if (ms->hashLog3) { + U32 const h3Size = (U32)1 << ms->hashLog3; + ZSTD_reduceTable(ms->hashTable3, h3Size, reducerValue); + } +} + + +/*-******************************************************* +* Block entropic compression +*********************************************************/ + +/* See doc/zstd_compression_format.md for detailed format description */ + +void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) +{ + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u<nbSeq; u++) { + U32 const llv = sequences[u].litLength; + U32 const mlv = sequences[u].matchLength; + llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); + ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offset); + mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); + } + if (seqStorePtr->longLengthID==1) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthID==2) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; +} + +/* ZSTD_useTargetCBlockSize(): + * Returns if target compressed block size param is being used. + * If used, compression will do best effort to make a compressed block size to be around targetCBlockSize. 
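Enabling the mode ZSTD_useTargetCBlockSize() tests for is a one-parameter opt-in, e.g. to bound per-block decode latency. A sketch with a hypothetical helper; 1024 is an illustrative value within [ZSTD_TARGETCBLOCKSIZE_MIN, ZSTD_TARGETCBLOCKSIZE_MAX]:

#define ZSTD_STATIC_LINKING_ONLY   /* ZSTD_c_targetCBlockSize is experimental API */
#include <zstd.h>

/* Ask the compressor to aim for compressed blocks of roughly 1 KB. */
static size_t cap_compressed_block_size(ZSTD_CCtx* cctx)
{
    return ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 1024);
}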
+ * Returns 1 if true, 0 otherwise. */ +static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) +{ + DEBUGLOG(5, "ZSTD_useTargetCBlockSize (targetCBlockSize=%zu)", cctxParams->targetCBlockSize); + return (cctxParams->targetCBlockSize != 0); +} + +/* ZSTD_compressSequences_internal(): + * actually compresses both literals and sequences */ +MEM_STATIC size_t +ZSTD_compressSequences_internal(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + void* entropyWorkspace, size_t entropyWkspSize, + const int bmi2) +{ + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned count[MaxSeq+1]; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + U32 LLtype, Offtype, MLtype; /* compressed, raw or rle */ + const seqDef* const sequences = seqStorePtr->sequencesStart; + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* seqHead; + BYTE* lastNCount = NULL; + + DEBUGLOG(5, "ZSTD_compressSequences_internal (nbSeq=%zu)", nbSeq); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; + size_t const litSize = (size_t)(seqStorePtr->lit - literals); + size_t const cSize = ZSTD_compressLiterals( + &prevEntropy->huf, &nextEntropy->huf, + cctxParams->cParams.strategy, + ZSTD_disableLiteralsCompression(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, + bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; + } + + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, "Can't fit seq hdr in output buf!"); + if (nbSeq < 128) { + *op++ = (BYTE)nbSeq; + } else if (nbSeq < LONGNBSEQ) { + op[0] = (BYTE)((nbSeq>>8) + 0x80); + op[1] = (BYTE)nbSeq; + op+=2; + } else { + op[0]=0xFF; + MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)); + op+=3; + } + assert(op <= oend); + if (nbSeq==0) { + /* Copy the old tables over as if we repeated them */ + memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ + seqHead = op++; + assert(op <= oend); + + /* convert length/distances into codes */ + ZSTD_seqToCodes(seqStorePtr); + /* build CTable for Literal Lengths */ + { unsigned max = MaxLL; + size_t const mostFrequent = HIST_countFast_wksp(count, &max, llCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building LL table"); + nextEntropy->fse.litlength_repeatMode = prevEntropy->fse.litlength_repeatMode; + LLtype = ZSTD_selectEncodingType(&nextEntropy->fse.litlength_repeatMode, + count, max, mostFrequent, nbSeq, + LLFSELog, prevEntropy->fse.litlengthCTable, + LL_defaultNorm, LL_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(set_basic < set_compressed && set_rle < set_compressed); + assert(!(LLtype < set_compressed && nextEntropy->fse.litlength_repeatMode != 
FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_LitLength, LLFSELog, (symbolEncodingType_e)LLtype, + count, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->fse.litlengthCTable, + sizeof(prevEntropy->fse.litlengthCTable), + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for LitLens failed"); + if (LLtype == set_compressed) + lastNCount = op; + op += countSize; + assert(op <= oend); + } } + /* build CTable for Offsets */ + { unsigned max = MaxOff; + size_t const mostFrequent = HIST_countFast_wksp( + count, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ + ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->fse.offcode_repeatMode = prevEntropy->fse.offcode_repeatMode; + Offtype = ZSTD_selectEncodingType(&nextEntropy->fse.offcode_repeatMode, + count, max, mostFrequent, nbSeq, + OffFSELog, prevEntropy->fse.offcodeCTable, + OF_defaultNorm, OF_defaultNormLog, + defaultPolicy, strategy); + assert(!(Offtype < set_compressed && nextEntropy->fse.offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)Offtype, + count, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->fse.offcodeCTable, + sizeof(prevEntropy->fse.offcodeCTable), + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for Offsets failed"); + if (Offtype == set_compressed) + lastNCount = op; + op += countSize; + assert(op <= oend); + } } + /* build CTable for MatchLengths */ + { unsigned max = MaxML; + size_t const mostFrequent = HIST_countFast_wksp( + count, &max, mlCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + DEBUGLOG(5, "Building ML table (remaining space : %i)", (int)(oend-op)); + nextEntropy->fse.matchlength_repeatMode = prevEntropy->fse.matchlength_repeatMode; + MLtype = ZSTD_selectEncodingType(&nextEntropy->fse.matchlength_repeatMode, + count, max, mostFrequent, nbSeq, + MLFSELog, prevEntropy->fse.matchlengthCTable, + ML_defaultNorm, ML_defaultNormLog, + ZSTD_defaultAllowed, strategy); + assert(!(MLtype < set_compressed && nextEntropy->fse.matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), + CTable_MatchLength, MLFSELog, (symbolEncodingType_e)MLtype, + count, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->fse.matchlengthCTable, + sizeof(prevEntropy->fse.matchlengthCTable), + entropyWorkspace, entropyWkspSize); + FORWARD_IF_ERROR(countSize, "ZSTD_buildCTable for MatchLengths failed"); + if (MLtype == set_compressed) + lastNCount = op; + op += countSize; + assert(op <= oend); + } } + + *seqHead = (BYTE)((LLtype<<6) + (Offtype<<4) + (MLtype<<2)); + + { size_t const bitstreamSize = ZSTD_encodeSequences( + op, (size_t)(oend - op), + CTable_MatchLength, mlCodeTable, + CTable_OffsetBits, ofCodeTable, + CTable_LitLength, llCodeTable, + sequences, nbSeq, + longOffsets, bmi2); + FORWARD_IF_ERROR(bitstreamSize, "ZSTD_encodeSequences failed"); + op += bitstreamSize; + assert(op <= 
oend); + /* zstd versions <= 1.3.4 mistakenly report corruption when + * FSE_readNCount() receives a buffer < 4 bytes. + * Fixed by https://github.com/facebook/zstd/pull/1146. + * This can happen when the last set_compressed table present is 2 + * bytes and the bitstream is only one byte. + * In this exceedingly rare case, we will simply emit an uncompressed + * block, since it isn't worth optimizing. + */ + if (lastNCount && (op - lastNCount) < 4) { + /* NCountSize >= 2 && bitstreamSize > 0 ==> lastCountSize == 3 */ + assert(op - lastNCount == 3); + DEBUGLOG(5, "Avoiding bug in zstd decoder in versions <= 1.3.4 by " + "emitting an uncompressed block."); + return 0; + } + } + + DEBUGLOG(5, "compressed block size : %u", (unsigned)(op - ostart)); + return (size_t)(op - ostart); +} + +MEM_STATIC size_t +ZSTD_compressSequences(seqStore_t* seqStorePtr, + const ZSTD_entropyCTables_t* prevEntropy, + ZSTD_entropyCTables_t* nextEntropy, + const ZSTD_CCtx_params* cctxParams, + void* dst, size_t dstCapacity, + size_t srcSize, + void* entropyWorkspace, size_t entropyWkspSize, + int bmi2) +{ + size_t const cSize = ZSTD_compressSequences_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ + if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) + return 0; /* block not compressed */ + FORWARD_IF_ERROR(cSize, "ZSTD_compressSequences_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } + + return cSize; +} + +/* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_dictMode_e dictMode) +{ + static const ZSTD_blockCompressor blockCompressor[3][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, + ZSTD_compressBlock_doubleFast, + ZSTD_compressBlock_greedy, + ZSTD_compressBlock_lazy, + ZSTD_compressBlock_lazy2, + ZSTD_compressBlock_btlazy2, + ZSTD_compressBlock_btopt, + ZSTD_compressBlock_btultra, + ZSTD_compressBlock_btultra2 }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, + ZSTD_compressBlock_doubleFast_extDict, + ZSTD_compressBlock_greedy_extDict, + ZSTD_compressBlock_lazy_extDict, + ZSTD_compressBlock_lazy2_extDict, + ZSTD_compressBlock_btlazy2_extDict, + ZSTD_compressBlock_btopt_extDict, + ZSTD_compressBlock_btultra_extDict, + ZSTD_compressBlock_btultra_extDict }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, + ZSTD_compressBlock_doubleFast_dictMatchState, + ZSTD_compressBlock_greedy_dictMatchState, + ZSTD_compressBlock_lazy_dictMatchState, + ZSTD_compressBlock_lazy2_dictMatchState, + ZSTD_compressBlock_btlazy2_dictMatchState, + ZSTD_compressBlock_btopt_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState, + ZSTD_compressBlock_btultra_dictMatchState } + }; + ZSTD_blockCompressor selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + + assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); + 
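/* Worked example of the lookup below: with dictMode == ZSTD_extDict (row 1)
 * and strat == ZSTD_btlazy2, the table above resolves to
 * ZSTD_compressBlock_btlazy2_extDict; a strategy value of 0 ("use default")
 * falls back to the row's ZSTD_fast entry. */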
selectedCompressor = blockCompressor[(int)dictMode][(int)strat]; + assert(selectedCompressor != NULL); + return selectedCompressor; +} + +static void ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) +{ + memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; +} + +void ZSTD_resetSeqStore(seqStore_t* ssPtr) +{ + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthID = 0; +} + +typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + +static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +{ + ZSTD_matchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); + if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { + ZSTD_ldm_skipSequences(&zc->externSeqStore, srcSize, zc->appliedParams.cParams.minMatch); + return ZSTDbss_noCompress; /* don't even attempt compression below a certain srcSize */ + } + ZSTD_resetSeqStore(&(zc->seqStore)); + /* required for optimal parser to read stats from dictionary */ + ms->opt.symbolCosts = &zc->blockState.prevCBlock->entropy; + /* tell the optimal parser how we expect to compress literals */ + ms->opt.literalCompressionMode = zc->appliedParams.literalCompressionMode; + /* a gap between an attached dict and the current window is not safe, + * they must remain adjacent, + * and when that stops being the case, the dict must be unset */ + assert(ms->dictMatchState == NULL || ms->loadedDictEnd == ms->window.dictLimit); + + /* limited update after a very long match */ + { const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 current = (U32)(istart-base); + if (sizeof(ptrdiff_t)==8) assert(istart - base < (ptrdiff_t)(U32)(-1)); /* ensure no overflow */ + if (current > ms->nextToUpdate + 384) + ms->nextToUpdate = current - MIN(192, (U32)(current - ms->nextToUpdate - 384)); + } + + /* select and store sequences */ + { ZSTD_dictMode_e const dictMode = ZSTD_matchState_dictMode(ms); + size_t lastLLSize; + { int i; + for (i = 0; i < ZSTD_REP_NUM; ++i) + zc->blockState.nextCBlock->rep[i] = zc->blockState.prevCBlock->rep[i]; + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(!zc->appliedParams.ldmParams.enableLdm); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm) { + rawSeqStore_t ldmSeqStore = {NULL, 0, 0, 0}; + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ + FORWARD_IF_ERROR(ZSTD_ldm_generateSequences(&zc->ldmState, &ldmSeqStore, + &zc->appliedParams.ldmParams, + src, srcSize), ""); + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&ldmSeqStore, + ms, &zc->seqStore, + zc->blockState.nextCBlock->rep, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); + } else { /* not long range mode */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, dictMode); + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, 
src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } + return ZSTDbss_compress; +} + +static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) +{ + const seqStore_t* seqStore = ZSTD_getSeqStore(zc); + const seqDef* seqs = seqStore->sequencesStart; + size_t seqsSize = seqStore->sequences - seqs; + + ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; + size_t i; size_t position; int repIdx; + + assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); + for (i = 0, position = 0; i < seqsSize; ++i) { + outSeqs[i].offset = seqs[i].offset; + outSeqs[i].litLength = seqs[i].litLength; + outSeqs[i].matchLength = seqs[i].matchLength + MINMATCH; + + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthID == 1) { + outSeqs[i].litLength += 0x10000; + } else if (seqStore->longLengthID == 2) { + outSeqs[i].matchLength += 0x10000; + } + } + + if (outSeqs[i].offset <= ZSTD_REP_NUM) { + outSeqs[i].rep = outSeqs[i].offset; + repIdx = (unsigned int)i - outSeqs[i].offset; + + if (outSeqs[i].litLength == 0) { + if (outSeqs[i].offset < 3) { + --repIdx; + } else { + repIdx = (unsigned int)i - 1; + } + ++outSeqs[i].rep; + } + assert(repIdx >= -3); + outSeqs[i].offset = repIdx >= 0 ? outSeqs[repIdx].offset : repStartValue[-repIdx - 1]; + if (outSeqs[i].rep == 4) { + --outSeqs[i].offset; + } + } else { + outSeqs[i].offset -= ZSTD_REP_NUM; + } + + position += outSeqs[i].litLength; + outSeqs[i].matchPos = (unsigned int)position; + position += outSeqs[i].matchLength; + } + zc->seqCollector.seqIndex += seqsSize; +} + +size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) +{ + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_malloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; + seqCollector.seqStart = outSeqs; + seqCollector.seqIndex = 0; + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + + ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); + ZSTD_free(dst, ZSTD_defaultCMem); + return zc->seqCollector.seqIndex; +} + +/* Returns true if the given block is a RLE block */ +static int ZSTD_isRLE(const BYTE *ip, size_t length) { + size_t i; + if (length < 2) return 1; + for (i = 1; i < length; ++i) { + if (ip[0] != ip[i]) return 0; + } + return 1; +} + +/* Returns true if the given block may be RLE. + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +static int ZSTD_maybeRLE(seqStore_t const* seqStore) +{ + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); + + return nbSeqs < 4 && nbLits < 10; +} + +static void ZSTD_confirmRepcodesAndEntropyTables(ZSTD_CCtx* zc) +{ + ZSTD_compressedBlockState_t* const tmp = zc->blockState.prevCBlock; + zc->blockState.prevCBlock = zc->blockState.nextCBlock; + zc->blockState.nextCBlock = tmp; +} + +static size_t ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) +{ + /* This the upper bound for the length of an rle block. + * This isn't the actual upper bound. Finding the real threshold + * needs further investigation. 
+ */ + const U32 rleMaxLength = 25; + size_t cSize; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + DEBUGLOG(5, "ZSTD_compressBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } + } + + if (zc->seqCollector.collectSequences) { + ZSTD_copyBlockSequences(zc); + return 0; + } + + /* encode sequences and literals */ + cSize = ZSTD_compressSequences(&zc->seqStore, + &zc->blockState.prevCBlock->entropy, &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + dst, dstCapacity, + srcSize, + zc->entropyWorkspace, HUF_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + cSize < rleMaxLength && + ZSTD_isRLE(ip, srcSize)) + { + cSize = 1; + op[0] = ip[0]; + } + +out: + if (!ZSTD_isError(cSize) && cSize > 1) { + ZSTD_confirmRepcodesAndEntropyTables(zc); + } + /* We check that dictionaries have offset codes available for the first + * block. After the first block, the offcode table might not have large + * enough codes to represent the offsets in the data. + */ + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const size_t bss, U32 lastBlock) +{ + DEBUGLOG(6, "Attempting ZSTD_compressSuperBlock()"); + if (bss == ZSTDbss_compress) { + if (/* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." + * This is only an issue for zstd <= v1.4.3 + */ + !zc->isFirstBlock && + ZSTD_maybeRLE(&zc->seqStore) && + ZSTD_isRLE((BYTE const*)src, srcSize)) + { + return ZSTD_rleCompressBlock(dst, dstCapacity, *(BYTE const*)src, srcSize, lastBlock); + } + /* Attempt superblock compression. + * + * Note that compressed size of ZSTD_compressSuperBlock() is not bound by the + * standard ZSTD_compressBound(). This is a problem, because even if we have + * space now, taking an extra byte now could cause us to run out of space later + * and violate ZSTD_compressBound(). + * + * Define blockBound(blockSize) = blockSize + ZSTD_blockHeaderSize. + * + * In order to respect ZSTD_compressBound() we must attempt to emit a raw + * uncompressed block in these cases: + * * cSize == 0: Return code for an uncompressed block. + * * cSize == dstSize_tooSmall: We may have expanded beyond blockBound(srcSize). + * ZSTD_noCompressBlock() will return dstSize_tooSmall if we are really out of + * output space. + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ + { + size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { + size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_confirmRepcodesAndEntropyTables(zc); + return cSize; + } + } + } + } + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. + * The decoder will be able to stream this block since it is uncompressed. + */ + return ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); +} + +static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastBlock) +{ + size_t cSize = 0; + const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + DEBUGLOG(5, "ZSTD_compressBlock_targetCBlockSize (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u, srcSize=%zu)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, (unsigned)zc->blockState.matchState.nextToUpdate, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + + cSize = ZSTD_compressBlock_targetCBlockSize_body(zc, dst, dstCapacity, src, srcSize, bss, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize_body failed"); + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; + + return cSize; +} + +static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, + void const* iend) +{ + if (ZSTD_window_needOverflowCorrection(ms->window, iend)) { + U32 const maxDist = (U32)1 << params->cParams.windowLog; + U32 const cycleLog = ZSTD_cycleLog(params->cParams.chainLog, params->cParams.strategy); + U32 const correction = ZSTD_window_correctOverflow(&ms->window, cycleLog, maxDist, ip); + ZSTD_STATIC_ASSERT(ZSTD_CHAINLOG_MAX <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX_32 <= 30); + ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); + ZSTD_cwksp_mark_tables_dirty(ws); + ZSTD_reduceIndex(ms, params, correction); + ZSTD_cwksp_mark_tables_clean(ws); + if (ms->nextToUpdate < correction) ms->nextToUpdate = 0; + else ms->nextToUpdate -= correction; + /* invalidate dictionaries on overflow correction */ + ms->loadedDictEnd = 0; + ms->dictMatchState = NULL; + } +} + +/*! ZSTD_compress_frameChunk() : +* Compress a chunk of data into one or multiple blocks. +* All blocks will be terminated, all input will be consumed. +* Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. 
+* Frame is supposed already started (header already produced) +* @return : compressed size, or an error code +*/ +static size_t ZSTD_compress_frameChunk (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) +{ + size_t blockSize = cctx->blockSize; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + + DEBUGLOG(5, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + XXH64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); + ZSTD_checkDictValidity(&ms->window, ip + blockSize, maxDist, &ms->loadedDictEnd, &ms->dictMatchState); + + /* Ensure hash/chain table insertion resumes no sooner than lowlimit */ + if (ms->nextToUpdate < ms->window.lowLimit) ms->nextToUpdate = ms->window.lowLimit; + + { size_t cSize; + if (ZSTD_useTargetCBlockSize(&cctx->appliedParams)) { + cSize = ZSTD_compressBlock_targetCBlockSize(cctx, op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_targetCBlockSize failed"); + assert(cSize > 0); + assert(cSize <= blockSize + ZSTD_blockHeaderSize); + } else { + cSize = ZSTD_compressBlock_internal(cctx, + op+ZSTD_blockHeaderSize, dstCapacity-ZSTD_blockHeaderSize, + ip, blockSize, 1 /* frame */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressBlock_internal failed"); + + if (cSize == 0) { /* block is not compressible */ + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + } else { + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } + } + + + ip += blockSize; + assert(remaining >= blockSize); + remaining -= blockSize; + op += cSize; + assert(dstCapacity >= cSize); + dstCapacity -= cSize; + cctx->isFirstBlock = 0; + DEBUGLOG(5, "ZSTD_compress_frameChunk: adding a block of size %u", + (unsigned)cSize); + } } + + if (lastFrameChunk && (op>ostart)) cctx->stage = ZSTDcs_ending; + return (size_t)(op-ostart); +} + + +static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, + const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +{ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; + U32 const windowSize = (U32)1 << params->cParams.windowLog; + U32 const singleSegment = params->fParams.contentSizeFlag && (windowSize >= pledgedSrcSize); + BYTE const windowLogByte = (BYTE)((params->cParams.windowLog - ZSTD_WINDOWLOG_ABSOLUTEMIN) << 3); + U32 const fcsCode = params->fParams.contentSizeFlag ? 
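+ /* Editor's note (illustrative, not part of the vendored zstd source):
+  * the 3-byte block header assembled in ZSTD_compress_frameChunk() above
+  * packs lastBlock + (blockType<<1) + (blockSize<<3), written LE24.
+  * Worked example for a non-final compressed block of 200 bytes:
+  *
+  *   header = 0 + (bt_compressed << 1) + (200 << 3)
+  *          = 4 + 1600 = 1604 = 0x000644   ->  bytes 44 06 00
+  */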
+ (pledgedSrcSize>=256) + (pledgedSrcSize>=65536+256) + (pledgedSrcSize>=0xFFFFFFFFU) : 0; /* 0-3 */ + BYTE const frameHeaderDescriptionByte = (BYTE)(dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6) ); + size_t pos=0; + + assert(!(params->fParams.contentSizeFlag && pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN)); + RETURN_ERROR_IF(dstCapacity < ZSTD_FRAMEHEADERSIZE_MAX, dstSize_tooSmall, + "dst buf is too small to fit worst-case frame header size."); + DEBUGLOG(4, "ZSTD_writeFrameHeader : dictIDFlag : %u ; dictID : %u ; dictIDSizeCode : %u", + !params->fParams.noDictIDFlag, (unsigned)dictID, (unsigned)dictIDSizeCode); + + if (params->format == ZSTD_f_zstd1) { + MEM_writeLE32(dst, ZSTD_MAGICNUMBER); + pos = 4; + } + op[pos++] = frameHeaderDescriptionByte; + if (!singleSegment) op[pos++] = windowLogByte; + switch(dictIDSizeCode) + { + default: assert(0); /* impossible */ + case 0 : break; + case 1 : op[pos] = (BYTE)(dictID); pos++; break; + case 2 : MEM_writeLE16(op+pos, (U16)dictID); pos+=2; break; + case 3 : MEM_writeLE32(op+pos, dictID); pos+=4; break; + } + switch(fcsCode) + { + default: assert(0); /* impossible */ + case 0 : if (singleSegment) op[pos++] = (BYTE)(pledgedSrcSize); break; + case 1 : MEM_writeLE16(op+pos, (U16)(pledgedSrcSize-256)); pos+=2; break; + case 2 : MEM_writeLE32(op+pos, (U32)(pledgedSrcSize)); pos+=4; break; + case 3 : MEM_writeLE64(op+pos, (U64)(pledgedSrcSize)); pos+=8; break; + } + return pos; +} + +/* ZSTD_writeLastEmptyBlock() : + * output an empty Block with end-of-frame mark to complete a frame + * @return : size of data written into `dst` (== ZSTD_blockHeaderSize (defined in zstd_internal.h)) + * or an error code if `dstCapacity` is too small (<ZSTD_blockHeaderSize) + */ +size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) +{ + RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, + "dst buf is too small to write frame trailer empty block."); + { U32 const cBlockHeader24 = 1 /*lastBlock*/ + (((U32)bt_raw)<<1); /* 0 size */ + MEM_writeLE24(dst, cBlockHeader24); + return ZSTD_blockHeaderSize; + } +} + +size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) +{ + RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, + "wrong cctx stage"); + RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm, + parameter_unsupported, + "incompatible with ldm"); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + return 0; +} + + +static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) +{ + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", + cctx->stage, (unsigned)srcSize); + RETURN_ERROR_IF(cctx->stage==ZSTDcs_created, stage_wrong, + "missing init (ZSTD_compressBegin)"); + + if (frame && (cctx->stage==ZSTDcs_init)) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, + cctx->pledgedSrcSizePlusOne-1, cctx->dictID); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + assert(fhSize <= dstCapacity); + dstCapacity -= fhSize; + dst = (char*)dst + fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (!srcSize) return fhSize; /* do not generate an empty block if no input */ + + if (!ZSTD_window_update(&ms->window, src, srcSize)) { + ms->nextToUpdate = ms->window.dictLimit; + } + if (cctx->appliedParams.ldmParams.enableLdm) { + ZSTD_window_update(&cctx->ldmState.window, src, srcSize); + } + + if (!frame) { + /* overflow check and correction for block mode */ + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, + src, (BYTE const*)src + srcSize); + } + + DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); + { size_t const cSize = frame ?
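+ /* Editor's note (illustrative, not part of the vendored zstd source):
+  * the frame header descriptor byte built in ZSTD_writeFrameHeader() above
+  * is dictIDSizeCode + (checksumFlag<<2) + (singleSegment<<5) + (fcsCode<<6).
+  * Worked example: no dictID, checksum on, single-segment frame with a
+  * 1-byte content size field (fcsCode = 0, pledgedSrcSize <= 255):
+  *
+  *   FHD = 0 + (1 << 2) + (1 << 5) + (0 << 6) = 0x24
+  */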
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); + FORWARD_IF_ERROR(cSize, "%s", frame ? "ZSTD_compress_frameChunk failed" : "ZSTD_compressBlock_internal failed"); + cctx->consumedSrcSize += srcSize; + cctx->producedCSize += (cSize + fhSize); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + RETURN_ERROR_IF( + cctx->consumedSrcSize+1 > cctx->pledgedSrcSizePlusOne, + srcSize_wrong, + "error : pledgedSrcSize = %u, while realSrcSize >= %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + return cSize + fhSize; + } +} + +size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); +} + + +size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) +{ + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); + return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); +} + +size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); + { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); +} + +/*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + + ZSTD_window_update(&ms->window, src, srcSize); + ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); + + if (params->ldmParams.enableLdm && ls != NULL) { + ZSTD_window_update(&ls->window, src, srcSize); + ls->loadedDictEnd = params->forceWindow ? 
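+ /* Editor's note: a minimal sketch of the block-level entry points defined
+  * above (illustrative only; `level`, `dst`, `dstCap`, `src` and `srcLen`
+  * are hypothetical, and error handling is elided):
+  *
+  *   ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+  *   size_t const initErr = ZSTD_compressBegin(cctx, level);
+  *   size_t const bMax    = ZSTD_getBlockSize(cctx);   // srcLen must not exceed this
+  *   size_t const cSize   = ZSTD_compressBlock(cctx, dst, dstCap, src, srcLen);
+  *   if (!ZSTD_isError(cSize) && cSize == 0) {
+  *       // block did not compress: at this layer the caller must store it raw
+  *   }
+  */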
0 : (U32)(iend - ls->window.base); + } + + /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + + if (srcSize <= HASH_READ_SIZE) return 0; + + while (iend - ip > HASH_READ_SIZE) { + size_t const remaining = (size_t)(iend - ip); + size_t const chunk = MIN(remaining, ZSTD_CHUNKSIZE_MAX); + const BYTE* const ichunk = ip + chunk; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, ichunk); + + if (params->ldmParams.enableLdm && ls != NULL) + ZSTD_ldm_fillHashTable(ls, (const BYTE*)src, (const BYTE*)src + srcSize, &params->ldmParams); + + switch(params->cParams.strategy) + { + case ZSTD_fast: + ZSTD_fillHashTable(ms, ichunk, dtlm); + break; + case ZSTD_dfast: + ZSTD_fillDoubleHashTable(ms, ichunk, dtlm); + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: + if (chunk >= HASH_READ_SIZE) + ZSTD_insertAndFindFirstIndex(ms, ichunk-HASH_READ_SIZE); + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: + if (chunk >= HASH_READ_SIZE) + ZSTD_updateTree(ms, ichunk-HASH_READ_SIZE, ichunk); + break; + + default: + assert(0); /* not possible : not a valid strategy id */ + } + + ip = ichunk; + } + + ms->nextToUpdate = (U32)(iend - ms->window.base); + return 0; +} + + +/* Dictionaries that assign zero probability to symbols that show up cause problems + when FSE encoding. Refuse dictionaries that assign zero probability to symbols + that we may encounter during compression. + NOTE: This behavior is not standard and could be improved in the future. */ +static size_t ZSTD_checkDictNCount(short* normalizedCounter, unsigned dictMaxSymbolValue, unsigned maxSymbolValue) { + U32 s; + RETURN_ERROR_IF(dictMaxSymbolValue < maxSymbolValue, dictionary_corrupted, "dict fse tables don't have all symbols"); + for (s = 0; s <= maxSymbolValue; ++s) { + RETURN_ERROR_IF(normalizedCounter[s] == 0, dictionary_corrupted, "dict fse tables don't have all symbols"); + } + return 0; +} + +size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + short* offcodeNCount, unsigned* offcodeMaxValue, + const void* const dict, size_t dictSize) +{ + const BYTE* dictPtr = (const BYTE*)dict; /* skip magic num and dict ID */ + const BYTE* const dictEnd = dictPtr + dictSize; + dictPtr += 8; + bs->entropy.huf.repeatMode = HUF_repeat_check; + + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, + dictEnd-dictPtr, &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights.
Otherwise, we set it to check */ + if (!hasZeroWeights) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* Defer checking offcodeMaxValue because we need to know the size of the dictionary content */ + /* fill all offset symbols to avoid garbage at end of table */ + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.offcodeCTable, + offcodeNCount, MaxOff, offcodeLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + /* Every match length code must have non-zero probability */ + FORWARD_IF_ERROR( ZSTD_checkDictNCount(matchlengthNCount, matchlengthMaxValue, MaxML), ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.matchlengthCTable, + matchlengthNCount, matchlengthMaxValue, matchlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + /* Every literal length code must have non-zero probability */ + FORWARD_IF_ERROR( ZSTD_checkDictNCount(litlengthNCount, litlengthMaxValue, MaxLL), ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( + bs->entropy.fse.litlengthCTable, + litlengthNCount, litlengthMaxValue, litlengthLog, + workspace, HUF_WORKSPACE_SIZE)), + dictionary_corrupted, ""); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + bs->rep[0] = MEM_readLE32(dictPtr+0); + bs->rep[1] = MEM_readLE32(dictPtr+4); + bs->rep[2] = MEM_readLE32(dictPtr+8); + dictPtr += 12; + + return dictPtr - (const BYTE*)dict; +} + +/* Dictionary format : + * See : + * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#dictionary-format + */ +/*! 
ZSTD_loadZstdDictionary() : + * @return : dictID, or an error code + * assumptions : magic number supposed already checked + * dictSize supposed >= 8 + */ +static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, + void* workspace) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff; + size_t dictID; + size_t eSize; + + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(dictSize >= 8); + assert(MEM_readLE32(dictPtr) == ZSTD_MAGIC_DICTIONARY); + + dictID = params->fParams.noDictIDFlag ? 0 : MEM_readLE32(dictPtr + 4 /* skip magic number */ ); + eSize = ZSTD_loadCEntropy(bs, workspace, offcodeNCount, &offcodeMaxValue, dict, dictSize); + FORWARD_IF_ERROR(eSize, "ZSTD_loadCEntropy failed"); + dictPtr += eSize; + + { size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + U32 offcodeMax = MaxOff; + if (dictContentSize <= ((U32)-1) - 128 KB) { + U32 const maxOffset = (U32)dictContentSize + 128 KB; /* The maximum offset that must be supported */ + offcodeMax = ZSTD_highbit32(maxOffset); /* Calculate minimum offset code required to represent maxOffset */ + } + /* All offset values <= dictContentSize + 128 KB must be representable */ + FORWARD_IF_ERROR(ZSTD_checkDictNCount(offcodeNCount, offcodeMaxValue, MIN(offcodeMax, MaxOff)), ""); + /* All repCodes must be <= dictContentSize and != 0*/ + { U32 u; + for (u=0; u<3; u++) { + RETURN_ERROR_IF(bs->rep[u] == 0, dictionary_corrupted, ""); + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } + + bs->entropy.fse.offcode_repeatMode = FSE_repeat_valid; + bs->entropy.fse.matchlength_repeatMode = FSE_repeat_valid; + bs->entropy.fse.litlength_repeatMode = FSE_repeat_valid; + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( + ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); + return dictID; + } +} + +/** ZSTD_compress_insertDictionary() : +* @return : dictID, or an error code */ +static size_t +ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_matchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + void* workspace) +{ + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); + if ((dict==NULL) || (dictSize<8)) { + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + return 0; + } + + ZSTD_reset_compressedBlockState(bs); + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) + return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( + ms, ls, ws, params, dict, dictSize, dtlm); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ + } + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( + bs, ms, ws, params, dict, dictSize, dtlm, workspace); +} + +#define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) +#define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6) + +/*!
ZSTD_compressBegin_internal() : + * @return : 0, or an error code */ +static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, U64 pledgedSrcSize, + ZSTD_buffered_policy_e zbuff) +{ + DEBUGLOG(4, "ZSTD_compressBegin_internal: wlog=%u", params->cParams.windowLog); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if ( (cdict) + && (cdict->dictContentSize > 0) + && ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0) + && (params->attachDictPref != ZSTD_dictForceLoad) ) { + return ZSTD_resetCCtx_usingCDict(cctx, cdict, params, pledgedSrcSize, zbuff); + } + + FORWARD_IF_ERROR( ZSTD_resetCCtx_internal(cctx, *params, pledgedSrcSize, + ZSTDcrp_makeClean, zbuff) , ""); + { size_t const dictID = cdict ? + ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, dictContentType, dtlm, + cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, + dictContentType, dtlm, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; + } + return 0; +} + +size_t ZSTD_compressBegin_advanced_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_advanced_internal: wlog=%u", params->cParams.windowLog); + /* compression parameters verification and optimization */ + FORWARD_IF_ERROR( ZSTD_checkCParams(params->cParams) , ""); + return ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, dtlm, + cdict, + params, pledgedSrcSize, + ZSTDb_not_buffered); +} + +/*! 
ZSTD_compressBegin_advanced() : +* @return : 0, or an error code */ +size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pledgedSrcSize) +{ + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params); + return ZSTD_compressBegin_advanced_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, + NULL /*cdict*/, + &cctxParams, pledgedSrcSize); +} + +size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, &params); + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); + return ZSTD_compressBegin_internal(cctx, dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); +} + +size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) +{ + return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); +} + + +/*! ZSTD_writeEpilogue() : +* Ends a frame. +* @return : nb of bytes written into dst (or an error code) */ +static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) +{ + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { + fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; + cctx->stage = ZSTDcs_ongoing; + } + + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); + MEM_writeLE32(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } + + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) XXH64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); + DEBUGLOG(4, "ZSTD_writeEpilogue: write checksum : %08X", (unsigned)checksum); + MEM_writeLE32(op, checksum); + op += 4; + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ + return op-ostart; +} + +size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, + dst, dstCapacity, src, srcSize, + 1 /* frame mode */, 1 /* last chunk */); + FORWARD_IF_ERROR(cSize, "ZSTD_compressContinue_internal failed"); + endResult = ZSTD_writeEpilogue(cctx, (char*)dst + cSize, dstCapacity-cSize); + FORWARD_IF_ERROR(endResult, "ZSTD_writeEpilogue failed"); + assert(!(cctx->appliedParams.fParams.contentSizeFlag && cctx->pledgedSrcSizePlusOne == 0)); + if (cctx->pledgedSrcSizePlusOne != 0) { /* control src size */ + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_UNKNOWN == (unsigned long long)-1); + DEBUGLOG(4, "end of frame : controlling src size"); + RETURN_ERROR_IF( + cctx->pledgedSrcSizePlusOne != cctx->consumedSrcSize+1, + srcSize_wrong, + "error : pledgedSrcSize = %u, while
realSrcSize = %u", + (unsigned)cctx->pledgedSrcSizePlusOne-1, + (unsigned)cctx->consumedSrcSize); + } + return cSize + endResult; +} + + +static size_t ZSTD_compress_internal (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_parameters* params) +{ + ZSTD_CCtx_params const cctxParams = + ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, params); + DEBUGLOG(4, "ZSTD_compress_internal"); + return ZSTD_compress_advanced_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + &cctxParams); +} + +size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + ZSTD_parameters params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced"); + FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); + return ZSTD_compress_internal(cctx, + dst, dstCapacity, + src, srcSize, + dict, dictSize, + ¶ms); +} + +/* Internal */ +size_t ZSTD_compress_advanced_internal( + ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize, + const ZSTD_CCtx_params* params) +{ + DEBUGLOG(4, "ZSTD_compress_advanced_internal (srcSize:%u)", (unsigned)srcSize); + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + int compressionLevel) +{ + ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, srcSize, dict ? dictSize : 0); + ZSTD_CCtx_params cctxParams = ZSTD_assignParamsToCCtxParams(&cctx->requestedParams, ¶ms); + DEBUGLOG(4, "ZSTD_compress_usingDict (srcSize=%u)", (unsigned)srcSize); + assert(params.fParams.contentSizeFlag == 1); + return ZSTD_compress_advanced_internal(cctx, dst, dstCapacity, src, srcSize, dict, dictSize, &cctxParams); +} + +size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_compressCCtx (srcSize=%u)", (unsigned)srcSize); + assert(cctx != NULL); + return ZSTD_compress_usingDict(cctx, dst, dstCapacity, src, srcSize, NULL, 0, compressionLevel); +} + +size_t ZSTD_compress(void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel) +{ + size_t result; + ZSTD_CCtx ctxBody; + ZSTD_initCCtx(&ctxBody, ZSTD_defaultCMem); + result = ZSTD_compressCCtx(&ctxBody, dst, dstCapacity, src, srcSize, compressionLevel); + ZSTD_freeCCtxContent(&ctxBody); /* can't free ctxBody itself, as it's on stack; free only heap content */ + return result; +} + + +/* ===== Dictionary API ===== */ + +/*! ZSTD_estimateCDictSize_advanced() : + * Estimate amount of memory that will be needed to create a dictionary with following arguments */ +size_t ZSTD_estimateCDictSize_advanced( + size_t dictSize, ZSTD_compressionParameters cParams, + ZSTD_dictLoadMethod_e dictLoadMethod) +{ + DEBUGLOG(5, "sizeof(ZSTD_CDict) : %u", (unsigned)sizeof(ZSTD_CDict)); + return ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ? 
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void *)))); +} + +size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + return ZSTD_estimateCDictSize_advanced(dictSize, cParams, ZSTD_dlm_byCopy); +} + +size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support sizeof on NULL */ + DEBUGLOG(5, "sizeof(*cdict) : %u", (unsigned)sizeof(*cdict)); + /* cdict may be in the workspace */ + return (cdict->workspace.workspace == cdict ? 0 : sizeof(*cdict)) + + ZSTD_cwksp_sizeof(&cdict->workspace); +} + +static size_t ZSTD_initCDict_internal( + ZSTD_CDict* cdict, + const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + DEBUGLOG(3, "ZSTD_initCDict_internal (dictContentType:%u)", (unsigned)dictContentType); + assert(!ZSTD_checkCParams(cParams)); + cdict->matchState.cParams = cParams; + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dictBuffer) || (!dictSize)) { + cdict->dictContent = dictBuffer; + } else { + void *internalBuffer = ZSTD_cwksp_reserve_object(&cdict->workspace, ZSTD_cwksp_align(dictSize, sizeof(void*))); + RETURN_ERROR_IF(!internalBuffer, memory_allocation, "NULL pointer!"); + cdict->dictContent = internalBuffer; + memcpy(internalBuffer, dictBuffer, dictSize); + } + cdict->dictContentSize = dictSize; + + cdict->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cdict->workspace, HUF_WORKSPACE_SIZE); + + + /* Reset the state to no dictionary */ + ZSTD_reset_compressedBlockState(&cdict->cBlockState); + FORWARD_IF_ERROR(ZSTD_reset_matchState( + &cdict->matchState, + &cdict->workspace, + &cParams, + ZSTDcrp_makeClean, + ZSTDirp_reset, + ZSTD_resetTarget_CDict), ""); + /* (Maybe) load the dictionary + * Skips loading the dictionary if it is < 8 bytes. + */ + { ZSTD_CCtx_params params; + memset(&params, 0, sizeof(params)); + params.compressionLevel = ZSTD_CLEVEL_DEFAULT; + params.fParams.contentSizeFlag = 1; + params.cParams = cParams; + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + &params, cdict->dictContent, cdict->dictContentSize, + dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; + } + } + + return 0; +} + +ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams, ZSTD_customMem customMem) +{ + DEBUGLOG(3, "ZSTD_createCDict_advanced, mode %u", (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0) + + (dictLoadMethod == ZSTD_dlm_byRef ?
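+ /* Editor's note: typical use of the digested-dictionary path initialized
+  * above (illustrative sketch; `dictBuf`/`dictLen` are hypothetical):
+  *
+  *   ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictLen, 3);
+  *   size_t const cSize = ZSTD_compress_usingCDict(cctx, dst, dstCap,
+  *                                                 src, srcSize, cdict);
+  *   ZSTD_freeCDict(cdict);
+  *
+  * The dictionary is digested once at ZSTD_createCDict() time, which is what
+  * makes per-frame startup cheap compared to ZSTD_compress_usingDict().
+  */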
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))); + void* const workspace = ZSTD_malloc(workspaceSize, customMem); + ZSTD_cwksp ws; + ZSTD_CDict* cdict; + + if (!workspace) { + ZSTD_free(workspace, customMem); + return NULL; + } + + ZSTD_cwksp_init(&ws, workspace, workspaceSize); + + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + assert(cdict != NULL); + ZSTD_cwksp_move(&cdict->workspace, &ws); + cdict->customMem = customMem; + cdict->compressionLevel = 0; /* signals advanced API usage */ + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dictBuffer, dictSize, + dictLoadMethod, dictContentType, + cParams) )) { + ZSTD_freeCDict(cdict); + return NULL; + } + + return cdict; + } +} + +ZSTD_CDict* ZSTD_createCDict(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + ZSTD_CDict* cdict = ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byCopy, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); + if (cdict) + cdict->compressionLevel = compressionLevel == 0 ? ZSTD_CLEVEL_DEFAULT : compressionLevel; + return cdict; +} + +ZSTD_CDict* ZSTD_createCDict_byReference(const void* dict, size_t dictSize, int compressionLevel) +{ + ZSTD_compressionParameters cParams = ZSTD_getCParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize); + return ZSTD_createCDict_advanced(dict, dictSize, + ZSTD_dlm_byRef, ZSTD_dct_auto, + cParams, ZSTD_defaultCMem); +} + +size_t ZSTD_freeCDict(ZSTD_CDict* cdict) +{ + if (cdict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = cdict->customMem; + int cdictInWorkspace = ZSTD_cwksp_owns_buffer(&cdict->workspace, cdict); + ZSTD_cwksp_free(&cdict->workspace, cMem); + if (!cdictInWorkspace) { + ZSTD_free(cdict, cMem); + } + return 0; + } +} + +/*! ZSTD_initStaticCDict_advanced() : + * Generate a digested dictionary in provided memory area. + * workspace: The memory area to emplace the dictionary into. + * Provided pointer must 8-bytes aligned. + * It must outlive dictionary usage. + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level + * into its relevants cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. + */ +const ZSTD_CDict* ZSTD_initStaticCDict( + void* workspace, size_t workspaceSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) +{ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + + (dictLoadMethod == ZSTD_dlm_byRef ? 
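+ /* Editor's note: a sketch of how a caller sizes and hands over the
+  * workspace expected by ZSTD_initStaticCDict() above (illustrative;
+  * `dictBuf`/`dictLen` are hypothetical, error handling elided):
+  *
+  *   size_t const wkspSize = ZSTD_estimateCDictSize(dictLen, 3);
+  *   void* const wksp = malloc(wkspSize);   // must be 8-byte aligned
+  *   const ZSTD_CDict* const cdict = ZSTD_initStaticCDict(wksp, wkspSize,
+  *           dictBuf, dictLen, ZSTD_dlm_byCopy, ZSTD_dct_auto,
+  *           ZSTD_getCParams(3, ZSTD_CONTENTSIZE_UNKNOWN, dictLen));
+  *
+  * There is no matching free function; the caller owns (and frees) wksp.
+  */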
0 + : ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(dictSize, sizeof(void*)))) + + ZSTD_cwksp_alloc_size(HUF_WORKSPACE_SIZE) + + matchStateSize; + ZSTD_CDict* cdict; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { + ZSTD_cwksp ws; + ZSTD_cwksp_init(&ws, workspace, workspaceSize); + cdict = (ZSTD_CDict*)ZSTD_cwksp_reserve_object(&ws, sizeof(ZSTD_CDict)); + if (cdict == NULL) return NULL; + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + + DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", + (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cParams) )) + return NULL; + + return cdict; +} + +ZSTD_compressionParameters ZSTD_getCParamsFromCDict(const ZSTD_CDict* cdict) +{ + assert(cdict != NULL); + return cdict->matchState.cParams; +} + +/* ZSTD_compressBegin_usingCDict_advanced() : + * cdict must be != NULL */ +size_t ZSTD_compressBegin_usingCDict_advanced( + ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, + ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict_advanced"); + RETURN_ERROR_IF(cdict==NULL, dictionary_wrong, "NULL pointer!"); + { ZSTD_CCtx_params params = cctx->requestedParams; + params.cParams = ( pledgedSrcSize < ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF + || pledgedSrcSize < cdict->dictContentSize * ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER + || pledgedSrcSize == ZSTD_CONTENTSIZE_UNKNOWN + || cdict->compressionLevel == 0 ) + && (params.attachDictPref != ZSTD_dictForceLoad) ? + ZSTD_getCParamsFromCDict(cdict) + : ZSTD_getCParams(cdict->compressionLevel, + pledgedSrcSize, + cdict->dictContentSize); + /* Increase window log to fit the entire dictionary and source if the + * source size is known. Limit the increase to 19, which is the + * window log for compression level 1 with the largest source size. + */ + if (pledgedSrcSize != ZSTD_CONTENTSIZE_UNKNOWN) { + U32 const limitedSrcSize = (U32)MIN(pledgedSrcSize, 1U << 19); + U32 const limitedSrcLog = limitedSrcSize > 1 ? ZSTD_highbit32(limitedSrcSize - 1) + 1 : 1; + params.cParams.windowLog = MAX(params.cParams.windowLog, limitedSrcLog); + } + params.fParams = fParams; + return ZSTD_compressBegin_internal(cctx, + NULL, 0, ZSTD_dct_auto, ZSTD_dtlm_fast, + cdict, + &params, pledgedSrcSize, + ZSTDb_not_buffered); + } +} + +/* ZSTD_compressBegin_usingCDict() : + * pledgedSrcSize=0 means "unknown" + * if pledgedSrcSize>0, it will enable contentSizeFlag */ +size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + DEBUGLOG(4, "ZSTD_compressBegin_usingCDict : dictIDFlag == %u", !fParams.noDictIDFlag); + return ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); +} + +size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) +{ + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_advanced(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ + return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); +} + +/*! ZSTD_compress_usingCDict() : + * Compression using a digested Dictionary.
+ * Faster startup than ZSTD_compress_usingDict(), recommended when same dictionary is used multiple times. + * Note that compression parameters are decided at CDict creation time + * while frame parameters are hardcoded */ +size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_CDict* cdict) +{ + ZSTD_frameParameters const fParams = { 1 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compress_usingCDict_advanced(cctx, dst, dstCapacity, src, srcSize, cdict, fParams); +} + + + +/* ****************************************************************** +* Streaming +********************************************************************/ + +ZSTD_CStream* ZSTD_createCStream(void) +{ + DEBUGLOG(3, "ZSTD_createCStream"); + return ZSTD_createCStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_CStream* ZSTD_initStaticCStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticCCtx(workspace, workspaceSize); +} + +ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem) +{ /* CStream and CCtx are now same object */ + return ZSTD_createCCtx_advanced(customMem); +} + +size_t ZSTD_freeCStream(ZSTD_CStream* zcs) +{ + return ZSTD_freeCCtx(zcs); /* same object */ +} + + + +/*====== Initialization ======*/ + +size_t ZSTD_CStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_CStreamOutSize(void) +{ + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; +} + +static size_t ZSTD_resetCStream_internal(ZSTD_CStream* cctx, + const void* const dict, size_t const dictSize, ZSTD_dictContentType_e const dictContentType, + const ZSTD_CDict* const cdict, + ZSTD_CCtx_params params, unsigned long long const pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_resetCStream_internal"); + /* Finalize the compression parameters */ + params.cParams = ZSTD_getCParamsFromCCtxParams(&params, pledgedSrcSize, dictSize); + /* params are supposed to be fully validated at this point */ + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, dictContentType, ZSTD_dtlm_fast, + cdict, + &params, pledgedSrcSize, + ZSTDb_buffered) , ""); + + cctx->inToCompress = 0; + cctx->inBuffPos = 0; + cctx->inBuffTarget = cctx->blockSize + + (cctx->blockSize == pledgedSrcSize); /* for small input: avoid automatic flush on reaching end of block, since it would require adding a 3-byte null block to end the frame */ + cctx->outBuffContentSize = cctx->outBuffFlushedSize = 0; + cctx->streamStage = zcss_load; + cctx->frameEnded = 0; + return 0; /* ready to go */ +} + +/* ZSTD_resetCStream(): + * pledgedSrcSize == 0 means "unknown" */ +size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period. + * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_resetCStream: pledgedSrcSize = %u", (unsigned)pledgedSrcSize); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +/*! ZSTD_initCStream_internal() : + * Note : for lib/compress only. Used by zstdmt_compress.c.
+ * Assumption 1 : params are valid + * Assumption 2 : either dict, or cdict, is defined, not both */ +size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_internal"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + zcs->requestedParams = *params; + assert(!((dict) && (cdict))); /* either dict or cdict, not both */ + if (dict) { + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + } else { + /* Dictionary is cleared if !cdict */ + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + } + return 0; +} + +/* ZSTD_initCStream_usingCDict_advanced() : + * same as ZSTD_initCStream_usingCDict(), with control over frame parameters */ +size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, + unsigned long long pledgedSrcSize) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + zcs->requestedParams.fParams = fParams; + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + +/* note : cdict must outlive compression session */ +size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingCDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, cdict) , ""); + return 0; +} + + +/* ZSTD_initCStream_advanced() : + * pledgedSrcSize must be exact. + * if srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN. + * dict is loaded with default parameters ZSTD_dct_auto and ZSTD_dlm_byCopy. */ +size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, unsigned long long pss) +{ + /* for compatibility with older programs relying on this behavior. + * Users should now specify ZSTD_CONTENTSIZE_UNKNOWN. + * This line will be removed in the future. + */ + U64 const pledgedSrcSize = (pss==0 && params.fParams.contentSizeFlag==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_advanced"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + FORWARD_IF_ERROR( ZSTD_checkCParams(params.cParams) , ""); + zcs->requestedParams = ZSTD_assignParamsToCCtxParams(&zcs->requestedParams, &params); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, const void* dict, size_t dictSize, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_loadDictionary(zcs, dict, dictSize) , ""); + return 0; +} + +size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, int compressionLevel, unsigned long long pss) +{ + /* temporary : 0 interpreted as "unknown" during transition period.
+ * Users willing to specify "unknown" **must** use ZSTD_CONTENTSIZE_UNKNOWN. + * 0 will be interpreted as "empty" in the future. + */ + U64 const pledgedSrcSize = (pss==0) ? ZSTD_CONTENTSIZE_UNKNOWN : pss; + DEBUGLOG(4, "ZSTD_initCStream_srcSize"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize) , ""); + return 0; +} + +size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) +{ + DEBUGLOG(4, "ZSTD_initCStream"); + FORWARD_IF_ERROR( ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_refCDict(zcs, NULL) , ""); + FORWARD_IF_ERROR( ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel) , ""); + return 0; +} + +/*====== Compression ======*/ + +static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) +{ + size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; + if (hintInSize==0) hintInSize = cctx->blockSize; + return hintInSize; +} + +/** ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants + * non-static, because can be called from zstdmt_compress.c + * @return : hint size for next input */ +static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) +{ + const char* const istart = (const char*)input->src; + const char* const iend = input->size != 0 ? istart + input->size : istart; + const char* ip = input->pos != 0 ? istart + input->pos : istart; + char* const ostart = (char*)output->dst; + char* const oend = output->size != 0 ? ostart + output->size : ostart; + char* op = output->pos != 0 ? 
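+ /* Editor's note: a minimal driver loop for the streaming path implemented
+  * below (illustrative sketch; buffer setup is hypothetical, errors elided):
+  *
+  *   ZSTD_inBuffer  in  = { inBuf,  inLen,  0 };
+  *   ZSTD_outBuffer out = { outBuf, outCap, 0 };
+  *   while (in.pos < in.size) {
+  *       size_t const hint = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_continue);
+  *       if (ZSTD_isError(hint)) { abort(); }
+  *       // drain out.dst[0 .. out.pos) to the sink, then reset out.pos = 0
+  *   }
+  *
+  * The non-zero return value is only a hint for the next input size.
+  */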
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ + DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + assert(output->pos <= output->size); + assert(input->pos <= input->size); + + while (someMoreWork) { + switch(zcs->streamStage) + { + case zcss_init: + RETURN_ERROR(init_missing, "call ZSTD_initCStream() first!"); + + case zcss_load: + if ( (flushMode == ZSTD_e_end) + && ((size_t)(oend-op) >= ZSTD_compressBound(iend-ip)) /* enough dstCapacity */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ + size_t const cSize = ZSTD_compressEnd(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; + op += cSize; + zcs->frameEnded = 1; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + someMoreWork = 0; break; + } + /* complete loading into inBuffer */ + { size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; + if (loaded != 0) + ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ + someMoreWork = 0; break; + } + if ( (flushMode == ZSTD_e_flush) + && (zcs->inBuffPos == zcs->inToCompress) ) { + /* empty */ + someMoreWork = 0; break; + } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { void* cDst; + size_t cSize; + size_t const iSize = zcs->inBuffPos - zcs->inToCompress; + size_t oSize = oend-op; + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + if (oSize >= ZSTD_compressBound(iSize)) + cDst = op; /* compress into output buffer, to skip flush stage */ + else + cDst = zcs->outBuff, oSize = zcs->outBuffSize; + cSize = lastBlock ? + ZSTD_compressEnd(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : + ZSTD_compressContinue(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? 
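+ /* Editor's note: finishing a frame through the flush machinery above
+  * (illustrative sketch): ZSTD_e_end must be repeated until it returns 0,
+  * since a small output buffer may hold only part of the epilogue:
+  *
+  *   size_t remaining;
+  *   do {
+  *       remaining = ZSTD_compressStream2(cctx, &out, &in, ZSTD_e_end);
+  *       if (ZSTD_isError(remaining)) { abort(); }
+  *       // drain `out` between iterations
+  *   } while (remaining != 0);
+  *
+  * A return of 0 means the frame, including any checksum, is fully flushed.
+  */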
"ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ + zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; + if (zcs->inBuffTarget > zcs->inBuffSize) + zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; + if (cDst == op) { /* no need to flush */ + op += cSize; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed directly in outBuffer"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + } + break; + } + zcs->outBuffContentSize = cSize; + zcs->outBuffFlushedSize = 0; + zcs->streamStage = zcss_flush; /* pass-through to flush stage */ + } + /* fall-through */ + case zcss_flush: + DEBUGLOG(5, "flush stage"); + { size_t const toFlush = zcs->outBuffContentSize - zcs->outBuffFlushedSize; + size_t const flushed = ZSTD_limitCopy(op, (size_t)(oend-op), + zcs->outBuff + zcs->outBuffFlushedSize, toFlush); + DEBUGLOG(5, "toFlush: %u into %u ==> flushed: %u", + (unsigned)toFlush, (unsigned)(oend-op), (unsigned)flushed); + if (flushed) + op += flushed; + zcs->outBuffFlushedSize += flushed; + if (toFlush!=flushed) { + /* flush not fully completed, presumably because dst is too small */ + assert(op==oend); + someMoreWork = 0; + break; + } + zcs->outBuffContentSize = zcs->outBuffFlushedSize = 0; + if (zcs->frameEnded) { + DEBUGLOG(5, "Frame completed on flush"); + someMoreWork = 0; + ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + break; + } + zcs->streamStage = zcss_load; + break; + } + + default: /* impossible */ + assert(0); + } + } + + input->pos = ip - istart; + output->pos = op - ostart; + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); +} + +static size_t ZSTD_nextInputSizeHint_MTorST(const ZSTD_CCtx* cctx) +{ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers >= 1) { + assert(cctx->mtctx != NULL); + return ZSTDMT_nextInputSizeHint(cctx->mtctx); + } +#endif + return ZSTD_nextInputSizeHint(cctx); + +} + +size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + FORWARD_IF_ERROR( ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue) , ""); + return ZSTD_nextInputSizeHint_MTorST(zcs); +} + + +size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective endOp) +{ + DEBUGLOG(5, "ZSTD_compressStream2, endOp=%u ", (unsigned)endOp); + /* check conditions */ + RETURN_ERROR_IF(output->pos > output->size, GENERIC, "invalid buffer"); + RETURN_ERROR_IF(input->pos > input->size, GENERIC, "invalid buffer"); + assert(cctx!=NULL); + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. 
*/ + memset(&cctx->prefixDict, 0, sizeof(cctx->prefixDict)); /* single usage */ + assert(prefixDict.dict==NULL || cctx->cdict==NULL); /* only one can be set */ + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); + if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = input->size + 1; /* auto-fix pledgedSrcSize */ + params.cParams = ZSTD_getCParamsFromCCtxParams( + &cctx->requestedParams, cctx->pledgedSrcSizePlusOne-1, 0 /*dictSize*/); + + +#ifdef ZSTD_MULTITHREAD + if ((cctx->pledgedSrcSizePlusOne-1) <= ZSTDMT_JOBSIZE_MIN) { + params.nbWorkers = 0; /* do not invoke multi-threading when src size is too small */ + } + if (params.nbWorkers > 0) { + /* mt context creation */ + if (cctx->mtctx == NULL) { + DEBUGLOG(4, "ZSTD_compressStream2: creating new mtctx for nbWorkers=%u", + params.nbWorkers); + cctx->mtctx = ZSTDMT_createCCtx_advanced((U32)params.nbWorkers, cctx->customMem); + RETURN_ERROR_IF(cctx->mtctx == NULL, memory_allocation, "NULL pointer!"); + } + /* mt compression */ + DEBUGLOG(4, "call ZSTDMT_initCStream_internal as nbWorkers=%u", params.nbWorkers); + FORWARD_IF_ERROR( ZSTDMT_initCStream_internal( + cctx->mtctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, params, cctx->pledgedSrcSizePlusOne-1) , ""); + cctx->streamStage = zcss_load; + cctx->appliedParams.nbWorkers = params.nbWorkers; + } else +#endif + { FORWARD_IF_ERROR( ZSTD_resetCStream_internal(cctx, + prefixDict.dict, prefixDict.dictSize, prefixDict.dictContentType, + cctx->cdict, + params, cctx->pledgedSrcSizePlusOne-1) , ""); + assert(cctx->streamStage == zcss_load); + assert(cctx->appliedParams.nbWorkers == 0); + } } + /* end of transparent initialization stage */ + + /* compression stage */ +#ifdef ZSTD_MULTITHREAD + if (cctx->appliedParams.nbWorkers > 0) { + int const forceMaxProgress = (endOp == ZSTD_e_flush || endOp == ZSTD_e_end); + size_t flushMin; + assert(forceMaxProgress || endOp == ZSTD_e_continue /* Protection for a new flush type */); + if (cctx->cParamsChanged) { + ZSTDMT_updateCParams_whileCompressing(cctx->mtctx, &cctx->requestedParams); + cctx->cParamsChanged = 0; + } + do { + flushMin = ZSTDMT_compressStream_generic(cctx->mtctx, output, input, endOp); + if ( ZSTD_isError(flushMin) + || (endOp == ZSTD_e_end && flushMin == 0) ) { /* compression completed */ + ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only); + } + FORWARD_IF_ERROR(flushMin, "ZSTDMT_compressStream_generic failed"); + } while (forceMaxProgress && flushMin != 0 && output->pos < output->size); + DEBUGLOG(5, "completed ZSTD_compressStream2 delegating to ZSTDMT_compressStream_generic"); + /* Either we don't require maximum forward progress, we've finished the + * flush, or we are out of output space. 
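+ * (With ZSTD_e_flush / ZSTD_e_end, forceMaxProgress keeps re-invoking
+ * ZSTDMT_compressStream_generic until flushMin reports 0 or dst fills up,
+ * so a single call makes maximal forward progress.)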
+ */
+ assert(!forceMaxProgress || flushMin == 0 || output->pos == output->size);
+ return flushMin;
+ }
+#endif
+ FORWARD_IF_ERROR( ZSTD_compressStream_generic(cctx, output, input, endOp) , "");
+ DEBUGLOG(5, "completed ZSTD_compressStream2");
+ return cctx->outBuffContentSize - cctx->outBuffFlushedSize; /* remaining to flush */
+}
+
+size_t ZSTD_compressStream2_simpleArgs (
+ ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity, size_t* dstPos,
+ const void* src, size_t srcSize, size_t* srcPos,
+ ZSTD_EndDirective endOp)
+{
+ ZSTD_outBuffer output = { dst, dstCapacity, *dstPos };
+ ZSTD_inBuffer input = { src, srcSize, *srcPos };
+ /* ZSTD_compressStream2() will check validity of dstPos and srcPos */
+ size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp);
+ *dstPos = output.pos;
+ *srcPos = input.pos;
+ return cErr;
+}
+
+size_t ZSTD_compress2(ZSTD_CCtx* cctx,
+ void* dst, size_t dstCapacity,
+ const void* src, size_t srcSize)
+{
+ DEBUGLOG(4, "ZSTD_compress2 (srcSize=%u)", (unsigned)srcSize);
+ ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+ { size_t oPos = 0;
+ size_t iPos = 0;
+ size_t const result = ZSTD_compressStream2_simpleArgs(cctx,
+ dst, dstCapacity, &oPos,
+ src, srcSize, &iPos,
+ ZSTD_e_end);
+ FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed");
+ if (result != 0) { /* compression not completed, due to lack of output space */
+ assert(oPos == dstCapacity);
+ RETURN_ERROR(dstSize_tooSmall, "");
+ }
+ assert(iPos == srcSize); /* all input is expected to be consumed */
+ return oPos;
+ }
+}
+
+/*====== Finalize ======*/
+
+/*! ZSTD_flushStream() :
+ * @return : amount of data remaining to flush */
+size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+ ZSTD_inBuffer input = { NULL, 0, 0 };
+ return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush);
+}
+
+
+size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output)
+{
+ ZSTD_inBuffer input = { NULL, 0, 0 };
+ size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end);
+ FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed");
+ if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */
+ /* single thread mode : attempt to calculate remaining to flush more precisely */
+ { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE;
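+ /* note : when the frame is not finished yet, at least a last (empty) block
+ * header (ZSTD_BLOCKHEADERSIZE = 3 bytes) remains to be written, plus the
+ * 4-byte XXH64-based frame checksum when checksumFlag is set. */
+ size_t const checksumSize = (size_t)(zcs->frameEnded ?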
0 : zcs->appliedParams.fParams.checksumFlag * 4);
+ size_t const toFlush = remainingToFlush + lastBlockSize + checksumSize;
+ DEBUGLOG(4, "ZSTD_endStream : remaining to flush : %u", (unsigned)toFlush);
+ return toFlush;
+ }
+}
+
+
+/*-===== Pre-defined compression levels =====-*/
+
+#define ZSTD_MAX_CLEVEL 22
+int ZSTD_maxCLevel(void) { return ZSTD_MAX_CLEVEL; }
+int ZSTD_minCLevel(void) { return (int)-ZSTD_TARGETLENGTH_MAX; }
+
+static const ZSTD_compressionParameters ZSTD_defaultCParameters[4][ZSTD_MAX_CLEVEL+1] = {
+{ /* "default" - for any srcSize > 256 KB */
+ /* W=windowLog, C=chainLog, H=hashLog, S=searchLog, L=minMatch, TL=targetLength, strat=strategy */
+ { 19, 12, 13, 1, 6, 1, ZSTD_fast }, /* base for negative levels */
+ { 19, 13, 14, 1, 7, 0, ZSTD_fast }, /* level 1 */
+ { 20, 15, 16, 1, 6, 0, ZSTD_fast }, /* level 2 */
+ { 21, 16, 17, 1, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 21, 18, 18, 1, 5, 0, ZSTD_dfast }, /* level 4 */
+ { 21, 18, 19, 2, 5, 2, ZSTD_greedy }, /* level 5 */
+ { 21, 19, 19, 3, 5, 4, ZSTD_greedy }, /* level 6 */
+ { 21, 19, 19, 3, 5, 8, ZSTD_lazy }, /* level 7 */
+ { 21, 19, 19, 3, 5, 16, ZSTD_lazy2 }, /* level 8 */
+ { 21, 19, 20, 4, 5, 16, ZSTD_lazy2 }, /* level 9 */
+ { 22, 20, 21, 4, 5, 16, ZSTD_lazy2 }, /* level 10 */
+ { 22, 21, 22, 4, 5, 16, ZSTD_lazy2 }, /* level 11 */
+ { 22, 21, 22, 5, 5, 16, ZSTD_lazy2 }, /* level 12 */
+ { 22, 21, 22, 5, 5, 32, ZSTD_btlazy2 }, /* level 13 */
+ { 22, 22, 23, 5, 5, 32, ZSTD_btlazy2 }, /* level 14 */
+ { 22, 23, 23, 6, 5, 32, ZSTD_btlazy2 }, /* level 15 */
+ { 22, 22, 22, 5, 5, 48, ZSTD_btopt }, /* level 16 */
+ { 23, 23, 22, 5, 4, 64, ZSTD_btopt }, /* level 17 */
+ { 23, 23, 22, 6, 3, 64, ZSTD_btultra }, /* level 18 */
+ { 23, 24, 22, 7, 3,256, ZSTD_btultra2}, /* level 19 */
+ { 25, 25, 23, 7, 3,256, ZSTD_btultra2}, /* level 20 */
+ { 26, 26, 24, 7, 3,512, ZSTD_btultra2}, /* level 21 */
+ { 27, 27, 25, 9, 3,999, ZSTD_btultra2}, /* level 22 */
+},
+{ /* for srcSize <= 256 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 18, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 18, 13, 14, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 18, 14, 14, 1, 5, 0, ZSTD_dfast }, /* level 2 */
+ { 18, 16, 16, 1, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 18, 16, 17, 2, 5, 2, ZSTD_greedy }, /* level 4.*/
+ { 18, 18, 18, 3, 5, 2, ZSTD_greedy }, /* level 5.*/
+ { 18, 18, 19, 3, 5, 4, ZSTD_lazy }, /* level 6.*/
+ { 18, 18, 19, 4, 4, 4, ZSTD_lazy }, /* level 7 */
+ { 18, 18, 19, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 18, 18, 19, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 18, 18, 19, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 18, 18, 19, 5, 4, 12, ZSTD_btlazy2 }, /* level 11.*/
+ { 18, 19, 19, 7, 4, 12, ZSTD_btlazy2 }, /* level 12.*/
+ { 18, 18, 19, 4, 4, 16, ZSTD_btopt }, /* level 13 */
+ { 18, 18, 19, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 18, 18, 19, 6, 3,128, ZSTD_btopt }, /* level 15.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 18, 19, 19, 6, 3,128, ZSTD_btultra2}, /* level 18.*/
+ { 18, 19, 19, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 18, 19, 19, 10, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 18, 19, 19, 12, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 18, 19, 19, 13, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 128 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 17, 12, 12, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 17, 12, 13, 1, 6, 0, ZSTD_fast }, /* level 1 */
+ { 17, 13, 15, 1, 5, 0, ZSTD_fast }, /* level 2 */
+ { 17, 15, 16, 2, 5, 0, ZSTD_dfast }, /* level 3 */
+ { 17, 17, 17, 2, 4, 0, ZSTD_dfast }, /* level 4 */
+ { 17, 16, 17, 3, 4, 2, ZSTD_greedy }, /* level 5 */
+ { 17, 17, 17, 3, 4, 4, ZSTD_lazy }, /* level 6 */
+ { 17, 17, 17, 3, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 17, 17, 17, 4, 4, 8, ZSTD_lazy2 }, /* level 8 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_lazy2 }, /* level 9 */
+ { 17, 17, 17, 6, 4, 8, ZSTD_lazy2 }, /* level 10 */
+ { 17, 17, 17, 5, 4, 8, ZSTD_btlazy2 }, /* level 11 */
+ { 17, 18, 17, 7, 4, 12, ZSTD_btlazy2 }, /* level 12 */
+ { 17, 18, 17, 3, 4, 12, ZSTD_btopt }, /* level 13.*/
+ { 17, 18, 17, 4, 3, 32, ZSTD_btopt }, /* level 14.*/
+ { 17, 18, 17, 6, 3,256, ZSTD_btopt }, /* level 15.*/
+ { 17, 18, 17, 6, 3,128, ZSTD_btultra }, /* level 16.*/
+ { 17, 18, 17, 8, 3,256, ZSTD_btultra }, /* level 17.*/
+ { 17, 18, 17, 10, 3,512, ZSTD_btultra }, /* level 18.*/
+ { 17, 18, 17, 5, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 17, 18, 17, 7, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 17, 18, 17, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 17, 18, 17, 11, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+{ /* for srcSize <= 16 KB */
+ /* W, C, H, S, L, TL, strat */
+ { 14, 12, 13, 1, 5, 1, ZSTD_fast }, /* base for negative levels */
+ { 14, 14, 15, 1, 5, 0, ZSTD_fast }, /* level 1 */
+ { 14, 14, 15, 1, 4, 0, ZSTD_fast }, /* level 2 */
+ { 14, 14, 15, 2, 4, 0, ZSTD_dfast }, /* level 3 */
+ { 14, 14, 14, 4, 4, 2, ZSTD_greedy }, /* level 4 */
+ { 14, 14, 14, 3, 4, 4, ZSTD_lazy }, /* level 5.*/
+ { 14, 14, 14, 4, 4, 8, ZSTD_lazy2 }, /* level 6 */
+ { 14, 14, 14, 6, 4, 8, ZSTD_lazy2 }, /* level 7 */
+ { 14, 14, 14, 8, 4, 8, ZSTD_lazy2 }, /* level 8.*/
+ { 14, 15, 14, 5, 4, 8, ZSTD_btlazy2 }, /* level 9.*/
+ { 14, 15, 14, 9, 4, 8, ZSTD_btlazy2 }, /* level 10.*/
+ { 14, 15, 14, 3, 4, 12, ZSTD_btopt }, /* level 11.*/
+ { 14, 15, 14, 4, 3, 24, ZSTD_btopt }, /* level 12.*/
+ { 14, 15, 14, 5, 3, 32, ZSTD_btultra }, /* level 13.*/
+ { 14, 15, 15, 6, 3, 64, ZSTD_btultra }, /* level 14.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra }, /* level 15.*/
+ { 14, 15, 15, 5, 3, 48, ZSTD_btultra2}, /* level 16.*/
+ { 14, 15, 15, 6, 3,128, ZSTD_btultra2}, /* level 17.*/
+ { 14, 15, 15, 7, 3,256, ZSTD_btultra2}, /* level 18.*/
+ { 14, 15, 15, 8, 3,256, ZSTD_btultra2}, /* level 19.*/
+ { 14, 15, 15, 8, 3,512, ZSTD_btultra2}, /* level 20.*/
+ { 14, 15, 15, 9, 3,512, ZSTD_btultra2}, /* level 21.*/
+ { 14, 15, 15, 10, 3,999, ZSTD_btultra2}, /* level 22.*/
+},
+};
+
+/*! ZSTD_getCParams_internal() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown.
+ * Use dictSize == 0 for unknown or unused. */
+static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+ int const unknown = srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN;
+ size_t const addedSize = unknown && dictSize > 0 ? 500 : 0;
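+ /* note : rSize folds dictSize (plus a small allowance when srcSize is
+ * unknown) into the size estimate ; tableID below counts how many of the
+ * three size thresholds rSize falls under : 0 selects the ">256 KB" table,
+ * 3 the "<=16 KB" one. */
+ U64 const rSize = unknown && dictSize == 0 ?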
ZSTD_CONTENTSIZE_UNKNOWN : srcSizeHint+dictSize+addedSize;
+ U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB);
+ int row = compressionLevel;
+ DEBUGLOG(5, "ZSTD_getCParams_internal (cLevel=%i)", compressionLevel);
+ if (compressionLevel == 0) row = ZSTD_CLEVEL_DEFAULT; /* 0 == default */
+ if (compressionLevel < 0) row = 0; /* entry 0 is baseline for fast mode */
+ if (compressionLevel > ZSTD_MAX_CLEVEL) row = ZSTD_MAX_CLEVEL;
+ { ZSTD_compressionParameters cp = ZSTD_defaultCParameters[tableID][row];
+ if (compressionLevel < 0) cp.targetLength = (unsigned)(-compressionLevel); /* acceleration factor */
+ /* refine parameters based on srcSize & dictSize */
+ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize);
+ }
+}
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize.
+ * Size values are optional, provide 0 if not known or unused */
+ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize)
+{
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
+
+/*! ZSTD_getParams_internal() :
+ * same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ * Fields of `ZSTD_frameParameters` are set to default values */
+static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+ ZSTD_parameters params;
+ ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize);
+ DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel);
+ memset(&params, 0, sizeof(params));
+ params.cParams = cParams;
+ params.fParams.contentSizeFlag = 1;
+ return params;
+}
+
+/*! ZSTD_getParams() :
+ * same idea as ZSTD_getCParams()
+ * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`).
+ * Fields of `ZSTD_frameParameters` are set to default values */
+ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) {
+ if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN;
+ return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize);
+}
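+/* Illustrative usage (editorial sketch, not part of the imported sources) :
+ *   ZSTD_parameters p = ZSTD_getParams(3, ZSTD_CONTENTSIZE_UNKNOWN, 0);
+ *   // p.cParams : the level-3 row from ZSTD_defaultCParameters, refined by
+ *   //             ZSTD_adjustCParams_internal()
+ *   // p.fParams : zeroed defaults, with contentSizeFlag = 1
+ */
+/**** ended inlining compress/zstd_compress.c ****/
+/**** start inlining compress/zstd_double_fast.c ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.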
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_double_fast.h ****/ + + +void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashLarge = ms->hashTable; + U32 const hBitsL = cParams->hashLog; + U32 const mls = cParams->minMatch; + U32* const hashSmall = ms->chainTable; + U32 const hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash tables. + * Insert the other positions into the large hash table if their entry + * is empty. + */ + for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { + U32 const current = (U32)(ip - base); + U32 i; + for (i = 0; i < fastHashFillStep; ++i) { + size_t const smHash = ZSTD_hashPtr(ip + i, hBitsS, mls); + size_t const lgHash = ZSTD_hashPtr(ip + i, hBitsL, 8); + if (i == 0) + hashSmall[smHash] = current + i; + if (i == 0 || hashLarge[lgHash] == 0) + hashLarge[lgHash] = current + i; + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; + } } +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_doubleFast_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */, ZSTD_dictMode_e const dictMode) +{ + ZSTD_compressionParameters const* cParams = &ms->cParams; + U32* const hashLong = ms->hashTable; + const U32 hBitsL = cParams->hashLog; + U32* const hashSmall = ms->chainTable; + const U32 hBitsS = cParams->chainLog; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + /* presumes that, if there is a dictionary, it must be using Attach mode */ + const U32 prefixLowestIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixLowest = base + prefixLowestIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = + dictMode == ZSTD_dictMatchState ? + &dms->cParams : NULL; + const U32* const dictHashLong = dictMode == ZSTD_dictMatchState ? + dms->hashTable : NULL; + const U32* const dictHashSmall = dictMode == ZSTD_dictMatchState ? + dms->chainTable : NULL; + const U32 dictStartIndex = dictMode == ZSTD_dictMatchState ? + dms->window.dictLimit : 0; + const BYTE* const dictBase = dictMode == ZSTD_dictMatchState ? + dms->window.base : NULL; + const BYTE* const dictStart = dictMode == ZSTD_dictMatchState ? + dictBase + dictStartIndex : NULL; + const BYTE* const dictEnd = dictMode == ZSTD_dictMatchState ? + dms->window.nextSrc : NULL; + const U32 dictIndexDelta = dictMode == ZSTD_dictMatchState ? + prefixLowestIndex - (U32)(dictEnd - dictBase) : + 0; + const U32 dictHBitsL = dictMode == ZSTD_dictMatchState ? + dictCParams->hashLog : hBitsL; + const U32 dictHBitsS = dictMode == ZSTD_dictMatchState ? 
+ dictCParams->chainLog : hBitsS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_generic"); + + assert(dictMode == ZSTD_noDict || dictMode == ZSTD_dictMatchState); + + /* if a dictionary is attached, it must be within window range */ + if (dictMode == ZSTD_dictMatchState) { + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + } + + /* init */ + ip += (dictAndPrefixLength == 0); + if (dictMode == ZSTD_noDict) { + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + if (dictMode == ZSTD_dictMatchState) { + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + } + + /* Main Search Loop */ + while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ + size_t mLength; + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); + size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); + size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); + U32 const current = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; + const BYTE* matchLong = base + matchIndexL; + const BYTE* match = base + matchIndexS; + const U32 repIndex = current + 1 - offset_1; + const BYTE* repMatch = (dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + hashLong[h2] = hashSmall[h] = current; /* update hash tables */ + + /* check dictMatchState repcode */ + if (dictMode == ZSTD_dictMatchState + && ((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + goto _match_stored; + } + + /* check noDict repcode */ + if ( dictMode == ZSTD_noDict + && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + goto _match_stored; + } + + if (matchIndexL > prefixLowestIndex) { + /* check prefix long match */ + if (MEM_read64(matchLong) == MEM_read64(ip)) { + mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; + offset = (U32)(ip-matchLong); + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dictMatchState long match */ + U32 const dictMatchIndexL = dictHashLong[dictHL]; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + + if (dictMatchL > dictStart && MEM_read64(dictMatchL) == MEM_read64(ip)) { + mLength = ZSTD_count_2segments(ip+8, dictMatchL+8, iend, dictEnd, prefixLowest) + 8; + offset = (U32)(current - dictMatchIndexL - dictIndexDelta); + while (((ip>anchor) & (dictMatchL>dictStart)) && (ip[-1] == dictMatchL[-1])) { ip--; dictMatchL--; mLength++; } /* catch up */ + goto _match_found; + } } + + if (matchIndexS > prefixLowestIndex) { + /* check prefix short match */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dictMatchState short match */ + U32 const dictMatchIndexS = dictHashSmall[dictHS]; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + + if (match > dictStart && MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } } + + ip += ((ip-anchor) >> kSearchStrength) + 1; +#if defined(__aarch64__) + PREFETCH_L1(ip+256); +#endif + continue; + +_search_next_long: + + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = current + 1; + + /* check prefix long +1 match */ + if (matchIndexL3 > prefixLowestIndex) { + if (MEM_read64(matchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; + ip++; + offset = (U32)(ip-matchL3); + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } + } else if (dictMode == ZSTD_dictMatchState) { + /* check dict long +1 match */ + U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { + mLength = ZSTD_count_2segments(ip+1+8, dictMatchL3+8, iend, dictEnd, prefixLowest) + 8; + ip++; + offset = (U32)(current + 1 - dictMatchIndexL3 - dictIndexDelta); + while (((ip>anchor) & (dictMatchL3>dictStart)) && (ip[-1] == dictMatchL3[-1])) { ip--; dictMatchL3--; mLength++; } /* catch up */ + goto _match_found; + } } } + + /* if no long +1 match, explore the short match we found */ + if (dictMode == ZSTD_dictMatchState && matchIndexS < prefixLowestIndex) { + mLength = ZSTD_count_2segments(ip+4, match+4, 
iend, dictEnd, prefixLowest) + 4; + offset = (U32)(current - matchIndexS); + while (((ip>anchor) & (match>dictStart)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } else { + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + offset = (U32)(ip - match); + while (((ip>anchor) & (match>prefixLowest)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + + /* fall-through */ + +_match_found: + offset_2 = offset_1; + offset_1 = offset; + + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + +_match_stored: + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = current+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = dictMode == ZSTD_dictMatchState + && repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } + + if (dictMode == ZSTD_noDict) { + while ( (ip <= ilimit) + && ( (offset_2>0) + & (MEM_read32(ip) == MEM_read32(ip - offset_2)) )) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + } /* while (ip < ilimit) */ + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? 
offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_doubleFast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const U32 mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_noDict);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_noDict);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_noDict);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_noDict);
+ }
+}
+
+
+size_t ZSTD_compressBlock_doubleFast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ const U32 mls = ms->cParams.minMatch;
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 4, ZSTD_dictMatchState);
+ case 5 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 5, ZSTD_dictMatchState);
+ case 6 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 6, ZSTD_dictMatchState);
+ case 7 :
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, 7, ZSTD_dictMatchState);
+ }
+}
+
+
+static size_t ZSTD_compressBlock_doubleFast_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize,
+ U32 const mls /* template */)
+{
+ ZSTD_compressionParameters const* cParams = &ms->cParams;
+ U32* const hashLong = ms->hashTable;
+ U32 const hBitsL = cParams->hashLog;
+ U32* const hashSmall = ms->chainTable;
+ U32 const hBitsS = cParams->chainLog;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ const BYTE* const base = ms->window.base;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+ const U32 dictStartIndex = lowLimit;
+ const U32 dictLimit = ms->window.dictLimit;
+ const U32 prefixStartIndex = (dictLimit > lowLimit) ? dictLimit : lowLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dictBase + prefixStartIndex;
+ U32 offset_1=rep[0], offset_2=rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_extDict_generic (srcSize=%zu)", srcSize);
+
+ /* if extDict is invalidated due to maxDistance, switch to "regular" variant */
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_doubleFast_generic(ms, seqStore, rep, src, srcSize, mls, ZSTD_noDict);
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ const size_t hSmall = ZSTD_hashPtr(ip, hBitsS, mls);
+ const U32 matchIndex = hashSmall[hSmall];
+ const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* match = matchBase + matchIndex;
+
+ const size_t hLong = ZSTD_hashPtr(ip, hBitsL, 8);
+ const U32 matchLongIndex = hashLong[hLong];
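+ /* note : in extDict mode, indices below prefixStartIndex point into the
+ * previous (out-of-place) segment ; matchBase / matchLongBase select
+ * dictBase for those, and ZSTD_count_2segments() measures matches that
+ * straddle the segment boundary. */
+ const BYTE* const matchLongBase = matchLongIndex < prefixStartIndex ?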
dictBase : base; + const BYTE* matchLong = matchLongBase + matchLongIndex; + + const U32 current = (U32)(ip-base); + const U32 repIndex = current + 1 - offset_1; /* offset_1 expected <= current +1 */ + const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + size_t mLength; + hashSmall[hSmall] = hashLong[hLong] = current; /* update hash table */ + + if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ + & (repIndex > dictStartIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchLongIndex < prefixStartIndex ? dictStart : prefixStart; + U32 offset; + mLength = ZSTD_count_2segments(ip+8, matchLong+8, iend, matchEnd, prefixStart) + 8; + offset = current - matchLongIndex; + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); + U32 const matchIndex3 = hashLong[h3]; + const BYTE* const match3Base = matchIndex3 < prefixStartIndex ? dictBase : base; + const BYTE* match3 = match3Base + matchIndex3; + U32 offset; + hashLong[h3] = current + 1; + if ( (matchIndex3 > dictStartIndex) && (MEM_read64(match3) == MEM_read64(ip+1)) ) { + const BYTE* const matchEnd = matchIndex3 < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex3 < prefixStartIndex ? dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+9, match3+8, iend, matchEnd, prefixStart) + 8; + ip++; + offset = current+1 - matchIndex3; + while (((ip>anchor) & (match3>lowMatchPtr)) && (ip[-1] == match3[-1])) { ip--; match3--; mLength++; } /* catch up */ + } else { + const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; + mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; + offset = current - matchIndex; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + } + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; + continue; + } } + + /* move to next sequence start */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Complementary insertion */ + /* done after iLimit test, as candidates could be > iend-8 */ + { U32 const indexToInsert = current+2; + hashLong[ZSTD_hashPtr(base+indexToInsert, hBitsL, 8)] = indexToInsert; + hashLong[ZSTD_hashPtr(ip-2, hBitsL, 8)] = (U32)(ip-2-base); + hashSmall[ZSTD_hashPtr(base+indexToInsert, hBitsS, mls)] = indexToInsert; + hashSmall[ZSTD_hashPtr(ip-1, hBitsS, mls)] = (U32)(ip-1-base); + } + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ + & (repIndex2 > dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_doubleFast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + } +} +/**** ended inlining compress/zstd_double_fast.c ****/ +/**** start inlining compress/zstd_fast.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_fast.h ****/ + + +void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hBits = cParams->hashLog; + U32 const mls = cParams->minMatch; + const BYTE* const base = ms->window.base; + const BYTE* ip = base + ms->nextToUpdate; + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ + for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { + U32 const current = (U32)(ip - base); + size_t const hash0 = ZSTD_hashPtr(ip, hBits, mls); + hashTable[hash0] = current; + if (dtlm == ZSTD_dtlm_fast) continue; + /* Only load extra positions for ZSTD_dtlm_full */ + { U32 p; + for (p = 1; p < fastHashFillStep; ++p) { + size_t const hash = ZSTD_hashPtr(ip + p, hBits, mls); + if (hashTable[hash] == 0) { /* not yet filled */ + hashTable[hash] = current + p; + } } } } +} + + +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_fast_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ + size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + /* We check ip0 (ip + 0) and ip1 (ip + 1) each loop */ + const BYTE* ip0 = istart; + const BYTE* ip1; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 prefixStartIndex = ZSTD_getLowestPrefixIndex(ms, endIndex, cParams->windowLog); + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; + U32 offsetSaved = 0; + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + ip1 = ip0 + 1; + { U32 const current = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; + if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; + if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; + } + + /* Main Search Loop */ +#ifdef __INTEL_COMPILER + /* From intel 'The vector pragma indicates that the loop should be + * vectorized if it is legal to do so'. 
Can be used together with + * #pragma ivdep (but have opted to exclude that because intel + * warns against using it).*/ + #pragma vector always +#endif + while (ip1 < ilimit) { /* < instead of <=, because check at ip0+2 */ + size_t mLength; + BYTE const* ip2 = ip0 + 2; + size_t const h0 = ZSTD_hashPtr(ip0, hlog, mls); + U32 const val0 = MEM_read32(ip0); + size_t const h1 = ZSTD_hashPtr(ip1, hlog, mls); + U32 const val1 = MEM_read32(ip1); + U32 const current0 = (U32)(ip0-base); + U32 const current1 = (U32)(ip1-base); + U32 const matchIndex0 = hashTable[h0]; + U32 const matchIndex1 = hashTable[h1]; + BYTE const* repMatch = ip2 - offset_1; + const BYTE* match0 = base + matchIndex0; + const BYTE* match1 = base + matchIndex1; + U32 offcode; + +#if defined(__aarch64__) + PREFETCH_L1(ip0+256); +#endif + + hashTable[h0] = current0; /* update hash table */ + hashTable[h1] = current1; /* update hash table */ + + assert(ip0 + 1 == ip1); + + if ((offset_1 > 0) & (MEM_read32(repMatch) == MEM_read32(ip2))) { + mLength = (ip2[-1] == repMatch[-1]) ? 1 : 0; + ip0 = ip2 - mLength; + match0 = repMatch - mLength; + mLength += 4; + offcode = 0; + goto _match; + } + if ((matchIndex0 > prefixStartIndex) && MEM_read32(match0) == val0) { + /* found a regular match */ + goto _offset; + } + if ((matchIndex1 > prefixStartIndex) && MEM_read32(match1) == val1) { + /* found a regular match after one literal */ + ip0 = ip1; + match0 = match1; + goto _offset; + } + { size_t const step = ((size_t)(ip0-anchor) >> (kSearchStrength - 1)) + stepSize; + assert(step >= 2); + ip0 += step; + ip1 += step; + continue; + } +_offset: /* Requires: ip0, match0 */ + /* Compute the offset code */ + offset_2 = offset_1; + offset_1 = (U32)(ip0-match0); + offcode = offset_1 + ZSTD_REP_MOVE; + mLength = 4; + /* Count the backwards match length */ + while (((ip0>anchor) & (match0>prefixStart)) + && (ip0[-1] == match0[-1])) { ip0--; match0--; mLength++; } /* catch up */ + +_match: /* Requires: ip0, match0, offcode */ + /* Count the forward length */ + mLength += ZSTD_count(ip0+mLength, match0+mLength, iend); + ZSTD_storeSeq(seqStore, (size_t)(ip0-anchor), anchor, iend, offcode, mLength-MINMATCH); + /* match found */ + ip0 += mLength; + anchor = ip0; + + if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+current0+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + if (offset_2 > 0) { /* offset_2==0 means offset_2 is invalidated */ + while ( (ip0 <= ilimit) && (MEM_read32(ip0) == MEM_read32(ip0 - offset_2)) ) { + /* store sequence */ + size_t const rLength = ZSTD_count(ip0+4, ip0+4-offset_2, iend) + 4; + { U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; } /* swap offset_2 <=> offset_1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; + ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, 0 /*offCode*/, rLength-MINMATCH); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } + ip1 = ip0 + 1; + } + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? 
offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+
+size_t ZSTD_compressBlock_fast(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState == NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+
+FORCE_INLINE_TEMPLATE
+size_t ZSTD_compressBlock_fast_dictMatchState_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 prefixStartIndex = ms->window.dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - HASH_READ_SIZE;
+ U32 offset_1=rep[0], offset_2=rep[1];
+ U32 offsetSaved = 0;
+
+ const ZSTD_matchState_t* const dms = ms->dictMatchState;
+ const ZSTD_compressionParameters* const dictCParams = &dms->cParams;
+ const U32* const dictHashTable = dms->hashTable;
+ const U32 dictStartIndex = dms->window.dictLimit;
+ const BYTE* const dictBase = dms->window.base;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const BYTE* const dictEnd = dms->window.nextSrc;
+ const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase);
+ const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart);
+ const U32 dictHLog = dictCParams->hashLog;
+
+ /* if a dictionary is still attached, it necessarily means that
+ * it is within window size. So we just check it. */
+ const U32 maxDistance = 1U << cParams->windowLog;
+ const U32 endIndex = (U32)((size_t)(ip - base) + srcSize);
+ assert(endIndex - prefixStartIndex <= maxDistance);
+ (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */
+
+ /* ensure there will be no underflow
+ * when translating a dict index into a local index */
+ assert(prefixStartIndex >= (U32)(dictEnd - dictBase));
+
+ /* init */
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic");
+ ip += (dictAndPrefixLength == 0);
+ /* dictMatchState repCode checks don't currently handle repCode == 0
+ * disabling. */
+ assert(offset_1 <= dictAndPrefixLength);
+ assert(offset_2 <= dictAndPrefixLength);
+
+ /* Main Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ size_t mLength;
+ size_t const h = ZSTD_hashPtr(ip, hlog, mls);
+ U32 const current = (U32)(ip-base);
+ U32 const matchIndex = hashTable[h];
+ const BYTE* match = base + matchIndex;
+ const U32 repIndex = current + 1 - offset_1;
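+ /* note : indices below prefixStartIndex live in the attached dictionary ;
+ * dictIndexDelta translates such an index back into the dictionary window
+ * so repMatch can be dereferenced on either side of the boundary. */
+ const BYTE* repMatch = (repIndex < prefixStartIndex) ?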
+ dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + hashTable[h] = current; /* update hash table */ + + if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, mLength-MINMATCH); + } else if ( (matchIndex <= prefixStartIndex) ) { + size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); + U32 const dictMatchIndex = dictHashTable[dictHash]; + const BYTE* dictMatch = dictBase + dictMatchIndex; + if (dictMatchIndex <= dictStartIndex || + MEM_read32(dictMatch) != MEM_read32(ip)) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } else { + /* found a dict match */ + U32 const offset = (U32)(current-dictMatchIndex-dictIndexDelta); + mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; + while (((ip>anchor) & (dictMatch>dictStart)) + && (ip[-1] == dictMatch[-1])) { + ip--; dictMatch--; mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } + } else if (MEM_read32(match) != MEM_read32(ip)) { + /* it's not a match, and we're not going to check the dictionary */ + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } else { + /* found a regular match */ + U32 const offset = (U32)(ip-match); + mLength = ZSTD_count(ip+4, match+4, iend) + 4; + while (((ip>anchor) & (match>prefixStart)) + && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + } + + /* match found */ + ip += mLength; + anchor = ip; + + if (ip <= ilimit) { + /* Fill Table */ + assert(base+current+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; /* here because current+2 could be > iend-8 */ + hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, repLength2-MINMATCH); + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } + } + } + + /* save reps for next block */ + rep[0] = offset_1 ? offset_1 : offsetSaved; + rep[1] = offset_2 ? 
offset_2 : offsetSaved;
+
+ /* Return the last literals size */
+ return (size_t)(iend - anchor);
+}
+
+size_t ZSTD_compressBlock_fast_dictMatchState(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize)
+{
+ U32 const mls = ms->cParams.minMatch;
+ assert(ms->dictMatchState != NULL);
+ switch(mls)
+ {
+ default: /* includes case 3 */
+ case 4 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 4);
+ case 5 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 5);
+ case 6 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 6);
+ case 7 :
+ return ZSTD_compressBlock_fast_dictMatchState_generic(ms, seqStore, rep, src, srcSize, 7);
+ }
+}
+
+
+static size_t ZSTD_compressBlock_fast_extDict_generic(
+ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM],
+ void const* src, size_t srcSize, U32 const mls)
+{
+ const ZSTD_compressionParameters* const cParams = &ms->cParams;
+ U32* const hashTable = ms->hashTable;
+ U32 const hlog = cParams->hashLog;
+ /* support stepSize of 0 */
+ U32 const stepSize = cParams->targetLength + !(cParams->targetLength);
+ const BYTE* const base = ms->window.base;
+ const BYTE* const dictBase = ms->window.dictBase;
+ const BYTE* const istart = (const BYTE*)src;
+ const BYTE* ip = istart;
+ const BYTE* anchor = istart;
+ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize);
+ const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog);
+ const U32 dictStartIndex = lowLimit;
+ const BYTE* const dictStart = dictBase + dictStartIndex;
+ const U32 dictLimit = ms->window.dictLimit;
+ const U32 prefixStartIndex = dictLimit < lowLimit ? lowLimit : dictLimit;
+ const BYTE* const prefixStart = base + prefixStartIndex;
+ const BYTE* const dictEnd = dictBase + prefixStartIndex;
+ const BYTE* const iend = istart + srcSize;
+ const BYTE* const ilimit = iend - 8;
+ U32 offset_1=rep[0], offset_2=rep[1];
+
+ DEBUGLOG(5, "ZSTD_compressBlock_fast_extDict_generic (offset_1=%u)", offset_1);
+
+ /* switch to "regular" variant if extDict is invalidated due to maxDistance */
+ if (prefixStartIndex == dictStartIndex)
+ return ZSTD_compressBlock_fast_generic(ms, seqStore, rep, src, srcSize, mls);
+
+ /* Search Loop */
+ while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */
+ const size_t h = ZSTD_hashPtr(ip, hlog, mls);
+ const U32 matchIndex = hashTable[h];
+ const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* match = matchBase + matchIndex;
+ const U32 current = (U32)(ip-base);
+ const U32 repIndex = current + 1 - offset_1;
+ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base;
+ const BYTE* const repMatch = repBase + repIndex;
+ hashTable[h] = current; /* update hash table */
+ DEBUGLOG(7, "offset_1 = %u , current = %u", offset_1, current);
+ assert(offset_1 <= current +1); /* check repIndex */
+
+ if ( (((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ & (repIndex > dictStartIndex))
+ && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
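+ /* note : a repcode match may start in the old segment ; repMatchEnd picks
+ * the end of whichever segment the match starts in, so counting stops at
+ * the segment boundary (ZSTD_count_2segments then continues into the
+ * current prefix). */
+ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ?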
dictEnd : iend; + size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; + ip++; + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, 0, rLength-MINMATCH); + ip += rLength; + anchor = ip; + } else { + if ( (matchIndex < dictStartIndex) || + (MEM_read32(match) != MEM_read32(ip)) ) { + assert(stepSize >= 1); + ip += ((ip-anchor) >> kSearchStrength) + stepSize; + continue; + } + { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; + const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; + U32 const offset = current - matchIndex; + size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; + while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ + offset_2 = offset_1; offset_1 = offset; /* update offset history */ + ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, offset + ZSTD_REP_MOVE, mLength-MINMATCH); + ip += mLength; + anchor = ip; + } } + + if (ip <= ilimit) { + /* Fill Table */ + hashTable[ZSTD_hashPtr(base+current+2, hlog, mls)] = current+2; + hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); + /* check immediate repcode */ + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; + if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (repIndex2 > dictStartIndex)) /* intentional overflow */ + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, 0 /*offcode*/, repLength2-MINMATCH); + hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; + ip += repLength2; + anchor = ip; + continue; + } + break; + } } } + + /* save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +size_t ZSTD_compressBlock_fast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + U32 const mls = ms->cParams.minMatch; + switch(mls) + { + default: /* includes case 3 */ + case 4 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 4); + case 5 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 5); + case 6 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 6); + case 7 : + return ZSTD_compressBlock_fast_extDict_generic(ms, seqStore, rep, src, srcSize, 7); + } +} +/**** ended inlining compress/zstd_fast.c ****/ +/**** start inlining compress/zstd_lazy.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: zstd_lazy.h ****/ + + +/*-************************************* +* Binary Tree search +***************************************/ + +static void +ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + if (idx != target) + DEBUGLOG(7, "ZSTD_updateDUBT, from %u to %u (dictLimit:%u)", + idx, target, ms->window.dictLimit); + assert(ip + 8 <= iend); /* condition for ZSTD_hashPtr */ + (void)iend; + + assert(idx >= ms->window.dictLimit); /* condition for valid base+idx */ + for ( ; idx < target ; idx++) { + size_t const h = ZSTD_hashPtr(base + idx, hashLog, mls); /* assumption : ip + 8 <= iend */ + U32 const matchIndex = hashTable[h]; + + U32* const nextCandidatePtr = bt + 2*(idx&btMask); + U32* const sortMarkPtr = nextCandidatePtr + 1; + + DEBUGLOG(8, "ZSTD_updateDUBT: insert %u", idx); + hashTable[h] = idx; /* Update Hash Table */ + *nextCandidatePtr = matchIndex; /* update BT like a chain */ + *sortMarkPtr = ZSTD_DUBT_UNSORTED_MARK; + } + ms->nextToUpdate = target; +} + + +/** ZSTD_insertDUBT1() : + * sort one already inserted but unsorted position + * assumption : current >= btlow == (current - btmask) + * doesn't fail */ +static void +ZSTD_insertDUBT1(ZSTD_matchState_t* ms, + U32 current, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const ip = (current>=dictLimit) ? base + current : dictBase + current; + const BYTE* const iend = (current>=dictLimit) ? inputEnd : dictBase + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = smallerPtr + 1; + U32 matchIndex = *smallerPtr; /* this candidate is unsorted : next sorted candidate is reached through *smallerPtr, while *largerPtr contains previous unsorted candidate (which is already saved and can be overwritten) */ + U32 dummy32; /* to be nullified at the end */ + U32 const windowValid = ms->window.lowLimit; + U32 const maxDistance = 1U << cParams->windowLog; + U32 const windowLow = (current - windowValid > maxDistance) ? 
current - maxDistance : windowValid; + + + DEBUGLOG(8, "ZSTD_insertDUBT1(%u) (dictLimit=%u, lowLimit=%u)", + current, dictLimit, windowLow); + assert(current >= btLow); + assert(ip < iend); /* condition for ZSTD_count */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < current); + /* note : all candidates are now supposed sorted, + * but it's still possible to have nextPtr[1] == ZSTD_DUBT_UNSORTED_MARK + * when a real index has the same value as ZSTD_DUBT_UNSORTED_MARK */ + + if ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit) /* both in current segment*/ + || (current < dictLimit) /* both in extDict */) { + const BYTE* const mBase = ( (dictMode != ZSTD_extDict) + || (matchIndex+matchLength >= dictLimit)) ? + base : dictBase; + assert( (matchIndex+matchLength >= dictLimit) /* might be wrong if extDict is incorrectly set to 0 */ + || (current < dictLimit) ); + match = mBase + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* preparation for next read of match[matchLength] */ + } + + DEBUGLOG(8, "ZSTD_insertDUBT1: comparing %u with %u : found %u common bytes ", + current, matchIndex, (U32)matchLength); + + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is smaller : next => %u", + matchIndex, btLow, nextPtr[1]); + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + DEBUGLOG(8, "ZSTD_insertDUBT1: %u (>btLow=%u) is larger => %u", + matchIndex, btLow, nextPtr[0]); + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; +} + + +static size_t +ZSTD_DUBT_findBetterDictMatch ( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, + U32 nbCompares, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_matchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 dictMatchIndex = dictHashTable[h]; + + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + U32 const current = 
(U32)(ip-base); + const BYTE* const dictBase = dms->window.base; + const BYTE* const dictEnd = dms->window.nextSrc; + U32 const dictHighLimit = (U32)(dms->window.nextSrc - dms->window.base); + U32 const dictLowLimit = dms->window.lowLimit; + U32 const dictIndexDelta = ms->window.lowLimit - dictHighLimit; + + U32* const dictBt = dms->chainTable; + U32 const btLog = dmsCParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 const btLow = (btMask >= dictHighLimit - dictLowLimit) ? dictLowLimit : dictHighLimit - btMask; + + size_t commonLengthSmaller=0, commonLengthLarger=0; + + (void)dictMode; + assert(dictMode == ZSTD_dictMatchState); + + while (nbCompares-- && (dictMatchIndex > dictLowLimit)) { + U32* const nextPtr = dictBt + 2*(dictMatchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dictBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (dictMatchIndex+matchLength >= dictHighLimit) + match = base + dictMatchIndex + dictIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", + current, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, ZSTD_REP_MOVE + current - matchIndex, dictMatchIndex, matchIndex); + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + if (dictMatchIndex <= btLow) { break; } /* beyond tree size, stop the search */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + + if (bestLength >= MINMATCH) { + U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + current, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + +} + + +static size_t +ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + + const BYTE* const base = ms->window.base; + U32 const current = (U32)(ip-base); + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog); + + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 
const btLow = (btMask >= current) ? 0 : current - btMask; + U32 const unsortLimit = MAX(btLow, windowLow); + + U32* nextCandidate = bt + 2*(matchIndex&btMask); + U32* unsortedMark = bt + 2*(matchIndex&btMask) + 1; + U32 nbCompares = 1U << cParams->searchLog; + U32 nbCandidates = nbCompares; + U32 previousCandidate = 0; + + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch (%u) ", current); + assert(ip <= iend-8); /* required for h calculation */ + + /* reach end of unsorted candidates list */ + while ( (matchIndex > unsortLimit) + && (*unsortedMark == ZSTD_DUBT_UNSORTED_MARK) + && (nbCandidates > 1) ) { + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch: candidate %u is unsorted", + matchIndex); + *unsortedMark = previousCandidate; /* the unsortedMark becomes a reversed chain, to move up back to original position */ + previousCandidate = matchIndex; + matchIndex = *nextCandidate; + nextCandidate = bt + 2*(matchIndex&btMask); + unsortedMark = bt + 2*(matchIndex&btMask) + 1; + nbCandidates --; + } + + /* nullify last candidate if it's still unsorted + * simplification, detrimental to compression ratio, beneficial for speed */ + if ( (matchIndex > unsortLimit) + && (*unsortedMark==ZSTD_DUBT_UNSORTED_MARK) ) { + DEBUGLOG(7, "ZSTD_DUBT_findBestMatch: nullify last unsorted candidate %u", + matchIndex); + *nextCandidate = *unsortedMark = 0; + } + + /* batch sort stacked candidates */ + matchIndex = previousCandidate; + while (matchIndex) { /* will end on matchIndex == 0 */ + U32* const nextCandidateIdxPtr = bt + 2*(matchIndex&btMask) + 1; + U32 const nextCandidateIdx = *nextCandidateIdxPtr; + ZSTD_insertDUBT1(ms, matchIndex, iend, + nbCandidates, unsortLimit, dictMode); + matchIndex = nextCandidateIdx; + nbCandidates++; + } + + /* find longest match */ + { size_t commonLengthSmaller = 0, commonLengthLarger = 0; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = bt + 2*(current&btMask) + 1; + U32 matchEndIdx = current + 8 + 1; + U32 dummy32; /* to be nullified at the end */ + size_t bestLength = 0; + + matchIndex = hashTable[h]; + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex > windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match; + + if ((dictMode != ZSTD_extDict) || (matchIndex+matchLength >= dictLimit)) { + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(current-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) + bestLength = matchLength, *offsetPtr = ZSTD_REP_MOVE + current - matchIndex; + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any + * further in this loop, make sure we + * skip 
checking in the dictionary. */ + } + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, + offsetPtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > current+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { + U32 const mIndex = current - ((U32)*offsetPtr - ZSTD_REP_MOVE); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + current, (U32)bestLength, (U32)*offsetPtr, mIndex); + } + return bestLength; + } +} + + +/** ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +FORCE_INLINE_TEMPLATE size_t +ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) +{ + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); + return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); +} + + +static size_t +ZSTD_BtFindBestMatch_selectMLS ( ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); + case 7 : + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + } +} + + +static size_t ZSTD_BtFindBestMatch_dictMatchState_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); + case 7 : + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState); + } +} + + +static size_t ZSTD_BtFindBestMatch_extDict_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict); + case 5 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict); + case 7 : + case 6 : return ZSTD_BtFindBestMatch(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict); + } 
+} + + + +/* ********************************* +* Hash Chain +***********************************/ +#define NEXT_IN_CHAIN(d, mask) chainTable[(d) & (mask)] + +/* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, + const BYTE* ip, U32 const mls) +{ + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; + U32* const chainTable = ms->chainTable; + const U32 chainMask = (1 << cParams->chainLog) - 1; + const BYTE* const base = ms->window.base; + const U32 target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + + while(idx < target) { /* catch up */ + size_t const h = ZSTD_hashPtr(base+idx, hashLog, mls); + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; +} + +U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); +} + + +/* inlining is important to hardwire a hot branch (template emulation) */ +FORCE_INLINE_TEMPLATE +size_t ZSTD_HcFindBestMatch_generic ( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const chainTable = ms->chainTable; + const U32 chainSize = (1 << cParams->chainLog); + const U32 chainMask = chainSize-1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const U32 current = (U32)(ip-base); + const U32 maxDistance = 1U << cParams->windowLog; + const U32 lowestValid = ms->window.lowLimit; + const U32 withinMaxDistance = (current - lowestValid > maxDistance) ? current - maxDistance : lowestValid; + const U32 isDictionary = (ms->loadedDictEnd != 0); + const U32 lowLimit = isDictionary ? lowestValid : withinMaxDistance; + const U32 minChain = current > chainSize ? 
current - chainSize : 0; + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + + /* HC4 match finder */ + U32 matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); + + for ( ; (matchIndex>lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ + if (match[ml] == ip[ml]) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; + assert(match+4 <= dictEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dictEnd, prefixStart) + 4; + } + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = current - matchIndex + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= minChain) break; + matchIndex = NEXT_IN_CHAIN(matchIndex, chainMask); + } + + if (dictMode == ZSTD_dictMatchState) { + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const U32* const dmsChainTable = dms->chainTable; + const U32 dmsChainSize = (1 << dms->cParams.chainLog); + const U32 dmsChainMask = dmsChainSize - 1; + const U32 dmsLowestIndex = dms->window.dictLimit; + const BYTE* const dmsBase = dms->window.base; + const BYTE* const dmsEnd = dms->window.nextSrc; + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + const U32 dmsMinChain = dmsSize > dmsChainSize ? dmsSize - dmsChainSize : 0; + + matchIndex = dms->hashTable[ZSTD_hashPtr(ip, dms->cParams.hashLog, mls)]; + + for ( ; (matchIndex>dmsLowestIndex) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + const BYTE* const match = dmsBase + matchIndex; + assert(match+4 <= dmsEnd); + if (MEM_read32(match) == MEM_read32(ip)) /* assumption : matchIndex <= dictLimit-4 (by table construction) */ + currentMl = ZSTD_count_2segments(ip+4, match+4, iLimit, dmsEnd, prefixStart) + 4; + + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; + *offsetPtr = current - (matchIndex + dmsIndexDelta) + ZSTD_REP_MOVE; + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + + if (matchIndex <= dmsMinChain) break; + matchIndex = dmsChainTable[matchIndex & dmsChainMask]; + } + } + + return ml; +} + + +FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_noDict); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_noDict); + case 7 : + case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_noDict); + } +} + + +static size_t ZSTD_HcFindBestMatch_dictMatchState_selectMLS ( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* const iLimit, + size_t* offsetPtr) +{ + switch(ms->cParams.minMatch) + { + default : /* includes case 3 */ + case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_dictMatchState); + case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_dictMatchState); + case 7 : + 
case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_dictMatchState);
+    }
+}
+
+
+FORCE_INLINE_TEMPLATE size_t ZSTD_HcFindBestMatch_extDict_selectMLS (
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* const iLimit,
+                        size_t* offsetPtr)
+{
+    switch(ms->cParams.minMatch)
+    {
+    default : /* includes case 3 */
+    case 4 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 4, ZSTD_extDict);
+    case 5 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 5, ZSTD_extDict);
+    case 7 :
+    case 6 : return ZSTD_HcFindBestMatch_generic(ms, ip, iLimit, offsetPtr, 6, ZSTD_extDict);
+    }
+}
+
+
+/* *******************************
+*  Common parser - lazy strategy
+*********************************/
+typedef enum { search_hashChain, search_binaryTree } searchMethod_e;
+
+FORCE_INLINE_TEMPLATE size_t
+ZSTD_compressBlock_lazy_generic(
+                        ZSTD_matchState_t* ms, seqStore_t* seqStore,
+                        U32 rep[ZSTD_REP_NUM],
+                        const void* src, size_t srcSize,
+                        const searchMethod_e searchMethod, const U32 depth,
+                        ZSTD_dictMode_e const dictMode)
+{
+    const BYTE* const istart = (const BYTE*)src;
+    const BYTE* ip = istart;
+    const BYTE* anchor = istart;
+    const BYTE* const iend = istart + srcSize;
+    const BYTE* const ilimit = iend - 8;
+    const BYTE* const base = ms->window.base;
+    const U32 prefixLowestIndex = ms->window.dictLimit;
+    const BYTE* const prefixLowest = base + prefixLowestIndex;
+
+    typedef size_t (*searchMax_f)(
+                        ZSTD_matchState_t* ms,
+                        const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr);
+    searchMax_f const searchMax = dictMode == ZSTD_dictMatchState ?
+        (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_dictMatchState_selectMLS
+                                         : ZSTD_HcFindBestMatch_dictMatchState_selectMLS) :
+        (searchMethod==search_binaryTree ? ZSTD_BtFindBestMatch_selectMLS
+                                         : ZSTD_HcFindBestMatch_selectMLS);
+    U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0;
+
+    const ZSTD_matchState_t* const dms = ms->dictMatchState;
+    const U32 dictLowestIndex      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.dictLimit : 0;
+    const BYTE* const dictBase     = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.base : NULL;
+    const BYTE* const dictLowest   = dictMode == ZSTD_dictMatchState ?
+                                     dictBase + dictLowestIndex : NULL;
+    const BYTE* const dictEnd      = dictMode == ZSTD_dictMatchState ?
+                                     dms->window.nextSrc : NULL;
+    const U32 dictIndexDelta       = dictMode == ZSTD_dictMatchState ?
+                                     prefixLowestIndex - (U32)(dictEnd - dictBase) :
+                                     0;
+    const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictLowest));
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_generic (dictMode=%u)", (U32)dictMode);
+
+    /* init */
+    ip += (dictAndPrefixLength == 0);
+    if (dictMode == ZSTD_noDict) {
+        U32 const current = (U32)(ip - base);
+        U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, ms->cParams.windowLog);
+        U32 const maxRep = current - windowLow;
+        if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0;
+        if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0;
+    }
+    if (dictMode == ZSTD_dictMatchState) {
+        /* dictMatchState repCode checks don't currently handle repCode == 0
+         * disabling. */
+        assert(offset_1 <= dictAndPrefixLength);
+        assert(offset_2 <= dictAndPrefixLength);
+    }
+
+    /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
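+     * (".p2align 5" asks the assembler to pad with no-ops until the next
+     * instruction begins at an address that is a multiple of 2^5 = 32 bytes,
+     * so the loop entry lands on a fresh 32-byte fetch block regardless of
+     * how the surrounding code shifts.)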
+     */
+    __asm__(".p2align 5");
+#endif
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;
+        const BYTE* start=ip+1;
+
+        /* check repCode */
+        if (dictMode == ZSTD_dictMatchState) {
+            const U32 repIndex = (U32)(ip - base) + 1 - offset_1;
+            const BYTE* repMatch = (dictMode == ZSTD_dictMatchState
+                                && repIndex < prefixLowestIndex) ?
+                                   dictBase + (repIndex - dictIndexDelta) :
+                                   base + repIndex;
+            if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) {
+                const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                if (depth==0) goto _storeSequence;
+            }
+        }
+        if ( dictMode == ZSTD_noDict
+          && ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1)))) {
+            matchLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4;
+            if (depth==0) goto _storeSequence;
+        }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+        if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            if ( (dictMode == ZSTD_noDict)
+              && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                int const gain2 = (int)(mlRep * 3);
+                int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                if ((mlRep >= 4) && (gain2 > gain1))
+                    matchLength = mlRep, offset = 0, start = ip;
+            }
+            if (dictMode == ZSTD_dictMatchState) {
+                const U32 repIndex = (U32)(ip - base) - offset_1;
+                const BYTE* repMatch = repIndex < prefixLowestIndex ?
+                               dictBase + (repIndex - dictIndexDelta) :
+                               base + repIndex;
+                if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */)
+                    && (MEM_read32(repMatch) == MEM_read32(ip)) ) {
+                    const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend;
+                    size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4;
+                    int const gain2 = (int)(mlRep * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offset = 0, start = ip;
+                }
+            }
+            {   size_t offset2=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                if ( (dictMode == ZSTD_noDict)
+                  && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) {
+                    size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4;
+                    int const gain2 = (int)(mlRep * 4);
+                    int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((mlRep >= 4) && (gain2 > gain1))
+                        matchLength = mlRep, offset = 0, start = ip;
+                }
+                if (dictMode == ZSTD_dictMatchState) {
+                    const U32 repIndex = (U32)(ip - base) - offset_1;
+                    const BYTE* repMatch = repIndex < prefixLowestIndex ?
+ dictBase + (repIndex - dictIndexDelta) : + base + repIndex; + if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) + matchLength = mlRep, offset = 0, start = ip; + } + } + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* NOTE: + * start[-offset+ZSTD_REP_MOVE-1] is undefined behavior. + * (-offset+ZSTD_REP_MOVE-1) is unsigned, and is added to start, which + * overflows the pointer, which is undefined behavior. + */ + /* catch up */ + if (offset) { + if (dictMode == ZSTD_noDict) { + while ( ((start > anchor) & (start - (offset-ZSTD_REP_MOVE) > prefixLowest)) + && (start[-1] == (start-(offset-ZSTD_REP_MOVE))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (dictMode == ZSTD_dictMatchState) { + U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } + offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + /* store sequence */ +_storeSequence: + { size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + if (dictMode == ZSTD_dictMatchState) { + while (ip <= ilimit) { + U32 const current2 = (U32)(ip-base); + U32 const repIndex = current2 - offset_2; + const BYTE* repMatch = dictMode == ZSTD_dictMatchState + && repIndex < prefixLowestIndex ? + dictBase - dictIndexDelta + repIndex : + base + repIndex; + if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset_2 <=> offset_1 */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; + } + break; + } + } + + if (dictMode == ZSTD_noDict) { + while ( ((ip <= ilimit) & (offset_2>0)) + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap repcodes */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ + } } } + + /* Save reps for next block */ + rep[0] = offset_1 ? offset_1 : savedOffset; + rep[1] = offset_2 ? offset_2 : savedOffset; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); +} + + +FORCE_INLINE_TEMPLATE +size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +{ + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const dictStart = dictBase + ms->window.lowLimit; + const U32 windowLog = ms->cParams.windowLog; + + typedef size_t (*searchMax_f)( + ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iLimit, size_t* offsetPtr); + searchMax_f searchMax = searchMethod==search_binaryTree ? 
ZSTD_BtFindBestMatch_extDict_selectMLS : ZSTD_HcFindBestMatch_extDict_selectMLS;
+
+    U32 offset_1 = rep[0], offset_2 = rep[1];
+
+    DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic");
+
+    /* init */
+    ip += (ip == prefixStart);
+
+    /* Match Loop */
+#if defined(__GNUC__) && defined(__x86_64__)
+    /* I've measured a random 5% speed loss on levels 5 & 6 (greedy) when the
+     * code alignment is perturbed. To fix the instability, align the loop on 32 bytes.
+     */
+    __asm__(".p2align 5");
+#endif
+    while (ip < ilimit) {
+        size_t matchLength=0;
+        size_t offset=0;
+        const BYTE* start=ip+1;
+        U32 current = (U32)(ip-base);
+
+        /* check repCode */
+        {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current+1, windowLog);
+            const U32 repIndex = (U32)(current+1 - offset_1);
+            const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+            const BYTE* const repMatch = repBase + repIndex;
+            if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))   /* intentional overflow */
+            if (MEM_read32(ip+1) == MEM_read32(repMatch)) {
+                /* repcode detected we should take it */
+                const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                if (depth==0) goto _storeSequence;
+        }   }
+
+        /* first search (depth 0) */
+        {   size_t offsetFound = 999999999;
+            size_t const ml2 = searchMax(ms, ip, iend, &offsetFound);
+            if (ml2 > matchLength)
+                matchLength = ml2, start = ip, offset=offsetFound;
+        }
+
+        if (matchLength < 4) {
+            ip += ((ip-anchor) >> kSearchStrength) + 1;   /* jump faster over incompressible sections */
+            continue;
+        }
+
+        /* let's try to find a better solution */
+        if (depth>=1)
+        while (ip<ilimit) {
+            ip ++;
+            current++;
+            /* check repCode */
+            {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
+                const U32 repIndex = (U32)(current - offset_1);
+                const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                const BYTE* const repMatch = repBase + repIndex;
+                if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))   /* intentional overflow */
+                if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                    /* repcode detected */
+                    const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend;
+                    size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4;
+                    int const gain2 = (int)(repLength * 3);
+                    int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offset+1) + 1);
+                    if ((repLength >= 4) && (gain2 > gain1))
+                        matchLength = repLength, offset = 0, start = ip;
+            }   }
+
+            /* search match, depth 1 */
+            {   size_t offset2=999999999;
+                size_t const ml2 = searchMax(ms, ip, iend, &offset2);
+                int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1));   /* raw approx */
+                int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 4);
+                if ((ml2 >= 4) && (gain2 > gain1)) {
+                    matchLength = ml2, offset = offset2, start = ip;
+                    continue;   /* search a better one */
+            }   }
+
+            /* let's find an even better one */
+            if ((depth==2) && (ip<ilimit)) {
+                ip ++;
+                current++;
+                /* check repCode */
+                {   const U32 windowLow = ZSTD_getLowestMatchIndex(ms, current, windowLog);
+                    const U32 repIndex = (U32)(current - offset_1);
+                    const BYTE* const repBase = repIndex < dictLimit ? dictBase : base;
+                    const BYTE* const repMatch = repBase + repIndex;
+                    if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow))  /* intentional overflow */
+                    if (MEM_read32(ip) == MEM_read32(repMatch)) {
+                        /* repcode detected */
+                        const BYTE* const repEnd = repIndex < dictLimit ?
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 1); + if ((repLength >= 4) && (gain2 > gain1)) + matchLength = repLength, offset = 0, start = ip; + } } + + /* search match, depth 2 */ + { size_t offset2=999999999; + size_t const ml2 = searchMax(ms, ip, iend, &offset2); + int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)offset2+1)); /* raw approx */ + int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offset+1) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { + matchLength = ml2, offset = offset2, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ + if (offset) { + U32 const matchIndex = (U32)((start-base) - (offset - ZSTD_REP_MOVE)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + offset_2 = offset_1; offset_1 = (U32)(offset - ZSTD_REP_MOVE); + } + + /* store sequence */ +_storeSequence: + { size_t const litLength = start - anchor; + ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offset, matchLength-MINMATCH); + anchor = ip = start + matchLength; + } + + /* check immediate repcode */ + while (ip <= ilimit) { + const U32 repCurrent = (U32)(ip-base); + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, repCurrent, windowLog); + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; + if (((U32)((dictLimit-1) - repIndex) >= 3) & (repIndex > windowLow)) /* intentional overflow */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + offset = offset_2; offset_2 = offset_1; offset_1 = (U32)offset; /* swap offset history */ + ZSTD_storeSeq(seqStore, 0, anchor, iend, 0, matchLength-MINMATCH); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ + } + break; + } } + + /* Save reps for next block */ + rep[0] = offset_1; + rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); +} + +size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); +} + +size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); +} + +size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + +{ + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); +} +/**** ended inlining compress/zstd_lazy.c ****/ +/**** start inlining compress/zstd_ldm.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/**** skipping file: zstd_ldm.h ****/ + +/**** skipping file: ../common/debug.h ****/ +/**** skipping file: zstd_fast.h ****/ +/**** skipping file: zstd_double_fast.h ****/ + +#define LDM_BUCKET_SIZE_LOG 3 +#define LDM_MIN_MATCH_LENGTH 64 +#define LDM_HASH_RLOG 7 +#define LDM_HASH_CHAR_OFFSET 10 + +void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams) +{ + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); + if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; + if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; + if (cParams->strategy >= ZSTD_btopt) { + /* Get out of the way of the optimal parser */ + U32 const minMatch = MAX(cParams->targetLength, params->minMatchLength); + assert(minMatch >= ZSTD_LDM_MINMATCH_MIN); + assert(minMatch <= ZSTD_LDM_MINMATCH_MAX); + params->minMatchLength = minMatch; + } + if (params->hashLog == 0) { + params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); + assert(params->hashLog <= ZSTD_HASHLOG_MAX); + } + if (params->hashRateLog == 0) { + params->hashRateLog = params->windowLog < params->hashLog + ? 0 + : params->windowLog - params->hashLog; + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); +} + +size_t ZSTD_ldm_getTableSize(ldmParams_t params) +{ + size_t const ldmHSize = ((size_t)1) << params.hashLog; + size_t const ldmBucketSizeLog = MIN(params.bucketSizeLog, params.hashLog); + size_t const ldmBucketSize = ((size_t)1) << (params.hashLog - ldmBucketSizeLog); + size_t const totalSize = ZSTD_cwksp_alloc_size(ldmBucketSize) + + ZSTD_cwksp_alloc_size(ldmHSize * sizeof(ldmEntry_t)); + return params.enableLdm ? 
totalSize : 0;
+}
+
+size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize)
+{
+    return params.enableLdm ? (maxChunkSize / params.minMatchLength) : 0;
+}
+
+/** ZSTD_ldm_getSmallHash() :
+ *  numBits should be <= 32
+ *  If numBits==0, returns 0.
+ *  @return : the most significant numBits of value. */
+static U32 ZSTD_ldm_getSmallHash(U64 value, U32 numBits)
+{
+    assert(numBits <= 32);
+    return numBits == 0 ? 0 : (U32)(value >> (64 - numBits));
+}
+
+/** ZSTD_ldm_getChecksum() :
+ *  numBitsToDiscard should be <= 32
+ *  @return : the next most significant 32 bits after numBitsToDiscard */
+static U32 ZSTD_ldm_getChecksum(U64 hash, U32 numBitsToDiscard)
+{
+    assert(numBitsToDiscard <= 32);
+    return (hash >> (64 - 32 - numBitsToDiscard)) & 0xFFFFFFFF;
+}
+
+/** ZSTD_ldm_getTag() :
+ *  Given the hash, returns the most significant numTagBits bits
+ *  after (32 + hbits) bits.
+ *
+ *  If there are not enough bits remaining, return the last
+ *  numTagBits bits. */
+static U32 ZSTD_ldm_getTag(U64 hash, U32 hbits, U32 numTagBits)
+{
+    assert(numTagBits < 32 && hbits <= 32);
+    if (32 - hbits < numTagBits) {
+        return hash & (((U32)1 << numTagBits) - 1);
+    } else {
+        return (hash >> (32 - hbits - numTagBits)) & (((U32)1 << numTagBits) - 1);
+    }
+}
+
+/** ZSTD_ldm_getBucket() :
+ *  Returns a pointer to the start of the bucket associated with hash. */
+static ldmEntry_t* ZSTD_ldm_getBucket(
+        ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams)
+{
+    return ldmState->hashTable + (hash << ldmParams.bucketSizeLog);
+}
+
+/** ZSTD_ldm_insertEntry() :
+ *  Insert the entry with corresponding hash into the hash table */
+static void ZSTD_ldm_insertEntry(ldmState_t* ldmState,
+                                 size_t const hash, const ldmEntry_t entry,
+                                 ldmParams_t const ldmParams)
+{
+    BYTE* const bucketOffsets = ldmState->bucketOffsets;
+    *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + bucketOffsets[hash]) = entry;
+    bucketOffsets[hash]++;
+    bucketOffsets[hash] &= ((U32)1 << ldmParams.bucketSizeLog) - 1;
+}
+
+/** ZSTD_ldm_makeEntryAndInsertByTag() :
+ *
+ *  Gets the small hash, checksum, and tag from the rollingHash.
+ *
+ *  If the tag matches (1 << ldmParams.hashRateLog)-1, then
+ *  creates an ldmEntry from the offset, and inserts it into the hash table.
+ *
+ *  hBits is the length of the small hash, which is the most significant hBits
+ *  of rollingHash. The checksum is the next 32 most significant bits, followed
+ *  by ldmParams.hashRateLog bits that make up the tag. */
+static void ZSTD_ldm_makeEntryAndInsertByTag(ldmState_t* ldmState,
+                                             U64 const rollingHash,
+                                             U32 const hBits,
+                                             U32 const offset,
+                                             ldmParams_t const ldmParams)
+{
+    U32 const tag = ZSTD_ldm_getTag(rollingHash, hBits, ldmParams.hashRateLog);
+    U32 const tagMask = ((U32)1 << ldmParams.hashRateLog) - 1;
+    if (tag == tagMask) {
+        U32 const hash = ZSTD_ldm_getSmallHash(rollingHash, hBits);
+        U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits);
+        ldmEntry_t entry;
+        entry.offset = offset;
+        entry.checksum = checksum;
+        ZSTD_ldm_insertEntry(ldmState, hash, entry, ldmParams);
+    }
+}
+
+/** ZSTD_ldm_countBackwardsMatch() :
+ *  Returns the number of bytes that match backwards before pIn and pMatch.
+ *
+ *  We count only bytes where pMatch >= pBase and pIn >= pAnchor.
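+ *
+ *  Illustrative example (buffers are hypothetical): if the three bytes
+ *  preceding pIn and the three bytes preceding pMatch are both "abc" while
+ *  the fourth bytes back differ, the backwards walk takes three steps and
+ *  the function returns 3; reaching pAnchor or pBase stops the walk earlier.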
+ */
+static size_t ZSTD_ldm_countBackwardsMatch(
+            const BYTE* pIn, const BYTE* pAnchor,
+            const BYTE* pMatch, const BYTE* pBase)
+{
+    size_t matchLength = 0;
+    while (pIn > pAnchor && pMatch > pBase && pIn[-1] == pMatch[-1]) {
+        pIn--;
+        pMatch--;
+        matchLength++;
+    }
+    return matchLength;
+}
+
+/** ZSTD_ldm_fillFastTables() :
+ *
+ *  Fills the relevant tables for the ZSTD_fast and ZSTD_dfast strategies.
+ *  This is similar to ZSTD_loadDictionaryContent.
+ *
+ *  The tables for the other strategies are filled within their
+ *  block compressors. */
+static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms,
+                                      void const* end)
+{
+    const BYTE* const iend = (const BYTE*)end;
+
+    switch(ms->cParams.strategy)
+    {
+    case ZSTD_fast:
+        ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast);
+        break;
+
+    case ZSTD_dfast:
+        ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast);
+        break;
+
+    case ZSTD_greedy:
+    case ZSTD_lazy:
+    case ZSTD_lazy2:
+    case ZSTD_btlazy2:
+    case ZSTD_btopt:
+    case ZSTD_btultra:
+    case ZSTD_btultra2:
+        break;
+    default:
+        assert(0);  /* not possible : not a valid strategy id */
+    }
+
+    return 0;
+}
+
+/** ZSTD_ldm_fillLdmHashTable() :
+ *
+ *  Fills hashTable from (lastHashed + 1) to iend (non-inclusive).
+ *  lastHash is the rolling hash that corresponds to lastHashed.
+ *
+ *  Returns the rolling hash corresponding to position iend-1. */
+static U64 ZSTD_ldm_fillLdmHashTable(ldmState_t* state,
+                                     U64 lastHash, const BYTE* lastHashed,
+                                     const BYTE* iend, const BYTE* base,
+                                     U32 hBits, ldmParams_t const ldmParams)
+{
+    U64 rollingHash = lastHash;
+    const BYTE* cur = lastHashed + 1;
+
+    while (cur < iend) {
+        rollingHash = ZSTD_rollingHash_rotate(rollingHash, cur[-1],
+                                              cur[ldmParams.minMatchLength-1],
+                                              state->hashPower);
+        ZSTD_ldm_makeEntryAndInsertByTag(state,
+                                         rollingHash, hBits,
+                                         (U32)(cur - base), ldmParams);
+        ++cur;
+    }
+    return rollingHash;
+}
+
+void ZSTD_ldm_fillHashTable(
+            ldmState_t* state, const BYTE* ip,
+            const BYTE* iend, ldmParams_t const* params)
+{
+    DEBUGLOG(5, "ZSTD_ldm_fillHashTable");
+    if ((size_t)(iend - ip) >= params->minMatchLength) {
+        U64 startingHash = ZSTD_rollingHash_compute(ip, params->minMatchLength);
+        ZSTD_ldm_fillLdmHashTable(
+            state, startingHash, ip, iend - params->minMatchLength, state->window.base,
+            params->hashLog - params->bucketSizeLog,
+            *params);
+    }
+}
+
+
+/** ZSTD_ldm_limitTableUpdate() :
+ *
+ *  Sets ms->nextToUpdate to a position closer to anchor if it has fallen
+ *  far behind (after a long match, only update tables a limited amount). */
+static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor)
+{
+    U32 const current = (U32)(anchor - ms->window.base);
+    if (current > ms->nextToUpdate + 1024) {
+        ms->nextToUpdate =
+            current - MIN(512, current - ms->nextToUpdate - 1024);
+    }
+}
+
+static size_t ZSTD_ldm_generateSequences_internal(
+        ldmState_t* ldmState, rawSeqStore_t* rawSeqStore,
+        ldmParams_t const* params, void const* src, size_t srcSize)
+{
+    /* LDM parameters */
+    int const extDict = ZSTD_window_hasExtDict(ldmState->window);
+    U32 const minMatchLength = params->minMatchLength;
+    U64 const hashPower = ldmState->hashPower;
+    U32 const hBits = params->hashLog - params->bucketSizeLog;
+    U32 const ldmBucketSize = 1U << params->bucketSizeLog;
+    U32 const hashRateLog = params->hashRateLog;
+    U32 const ldmTagMask = (1U << params->hashRateLog) - 1;
+    /* Prefix and extDict parameters */
+    U32 const dictLimit = ldmState->window.dictLimit;
+    U32 const lowestIndex = extDict ?
ldmState->window.lowLimit : dictLimit; + BYTE const* const base = ldmState->window.base; + BYTE const* const dictBase = extDict ? ldmState->window.dictBase : NULL; + BYTE const* const dictStart = extDict ? dictBase + lowestIndex : NULL; + BYTE const* const dictEnd = extDict ? dictBase + dictLimit : NULL; + BYTE const* const lowPrefixPtr = base + dictLimit; + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + BYTE const* const ilimit = iend - MAX(minMatchLength, HASH_READ_SIZE); + /* Input positions */ + BYTE const* anchor = istart; + BYTE const* ip = istart; + /* Rolling hash */ + BYTE const* lastHashed = NULL; + U64 rollingHash = 0; + + while (ip <= ilimit) { + size_t mLength; + U32 const current = (U32)(ip - base); + size_t forwardMatchLength = 0, backwardMatchLength = 0; + ldmEntry_t* bestEntry = NULL; + if (ip != istart) { + rollingHash = ZSTD_rollingHash_rotate(rollingHash, lastHashed[0], + lastHashed[minMatchLength], + hashPower); + } else { + rollingHash = ZSTD_rollingHash_compute(ip, minMatchLength); + } + lastHashed = ip; + + /* Do not insert and do not look for a match */ + if (ZSTD_ldm_getTag(rollingHash, hBits, hashRateLog) != ldmTagMask) { + ip++; + continue; + } + + /* Get the best entry and compute the match lengths */ + { + ldmEntry_t* const bucket = + ZSTD_ldm_getBucket(ldmState, + ZSTD_ldm_getSmallHash(rollingHash, hBits), + *params); + ldmEntry_t* cur; + size_t bestMatchLength = 0; + U32 const checksum = ZSTD_ldm_getChecksum(rollingHash, hBits); + + for (cur = bucket; cur < bucket + ldmBucketSize; ++cur) { + size_t curForwardMatchLength, curBackwardMatchLength, + curTotalMatchLength; + if (cur->checksum != checksum || cur->offset <= lowestIndex) { + continue; + } + if (extDict) { + BYTE const* const curMatchBase = + cur->offset < dictLimit ? dictBase : base; + BYTE const* const pMatch = curMatchBase + cur->offset; + BYTE const* const matchEnd = + cur->offset < dictLimit ? dictEnd : iend; + BYTE const* const lowMatchPtr = + cur->offset < dictLimit ? 
dictStart : lowPrefixPtr; + + curForwardMatchLength = ZSTD_count_2segments( + ip, pMatch, iend, + matchEnd, lowPrefixPtr); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, + lowMatchPtr); + curTotalMatchLength = curForwardMatchLength + + curBackwardMatchLength; + } else { /* !extDict */ + BYTE const* const pMatch = base + cur->offset; + curForwardMatchLength = ZSTD_count(ip, pMatch, iend); + if (curForwardMatchLength < minMatchLength) { + continue; + } + curBackwardMatchLength = + ZSTD_ldm_countBackwardsMatch(ip, anchor, pMatch, + lowPrefixPtr); + curTotalMatchLength = curForwardMatchLength + + curBackwardMatchLength; + } + + if (curTotalMatchLength > bestMatchLength) { + bestMatchLength = curTotalMatchLength; + forwardMatchLength = curForwardMatchLength; + backwardMatchLength = curBackwardMatchLength; + bestEntry = cur; + } + } + } + + /* No match found -- continue searching */ + if (bestEntry == NULL) { + ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, + hBits, current, + *params); + ip++; + continue; + } + + /* Match found */ + mLength = forwardMatchLength + backwardMatchLength; + ip -= backwardMatchLength; + + { + /* Store the sequence: + * ip = current - backwardMatchLength + * The match is at (bestEntry->offset - backwardMatchLength) + */ + U32 const matchIndex = bestEntry->offset; + U32 const offset = current - matchIndex; + rawSeq* const seq = rawSeqStore->seq + rawSeqStore->size; + + /* Out of sequence storage */ + if (rawSeqStore->size == rawSeqStore->capacity) + return ERROR(dstSize_tooSmall); + seq->litLength = (U32)(ip - anchor); + seq->matchLength = (U32)mLength; + seq->offset = offset; + rawSeqStore->size++; + } + + /* Insert the current entry into the hash table */ + ZSTD_ldm_makeEntryAndInsertByTag(ldmState, rollingHash, hBits, + (U32)(lastHashed - base), + *params); + + assert(ip + backwardMatchLength == lastHashed); + + /* Fill the hash table from lastHashed+1 to ip+mLength*/ + /* Heuristic: don't need to fill the entire table at end of block */ + if (ip + mLength <= ilimit) { + rollingHash = ZSTD_ldm_fillLdmHashTable( + ldmState, rollingHash, lastHashed, + ip + mLength, base, hBits, *params); + lastHashed = ip + mLength - 1; + } + ip += mLength; + anchor = ip; + } + return iend - anchor; +} + +/*! ZSTD_ldm_reduceTable() : + * reduce table indexes by `reducerValue` */ +static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + U32 const reducerValue) +{ + U32 u; + for (u = 0; u < size; u++) { + if (table[u].offset < reducerValue) table[u].offset = 0; + else table[u].offset -= reducerValue; + } +} + +size_t ZSTD_ldm_generateSequences( + ldmState_t* ldmState, rawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) +{ + U32 const maxDist = 1U << params->windowLog; + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + size_t const kMaxChunkSize = 1 << 20; + size_t const nbChunks = (srcSize / kMaxChunkSize) + ((srcSize % kMaxChunkSize) != 0); + size_t chunk; + size_t leftoverSize = 0; + + assert(ZSTD_CHUNKSIZE_MAX >= kMaxChunkSize); + /* Check that ZSTD_window_update() has been called for this chunk prior + * to passing it to this function. + */ + assert(ldmState->window.nextSrc >= (BYTE const*)src + srcSize); + /* The input could be very large (in zstdmt), so it must be broken up into + * chunks to enforce the maximum distance and handle overflow correction. 
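+     * As a worked example (sizes are illustrative): with kMaxChunkSize = 1<<20,
+     * a 2.5 MiB input gives nbChunks = (2621440 / 1048576) + 1 = 3, i.e. two
+     * full 1 MiB chunks followed by a 0.5 MiB tail chunk.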
+ */ + assert(sequences->pos <= sequences->size); + assert(sequences->size <= sequences->capacity); + for (chunk = 0; chunk < nbChunks && sequences->size < sequences->capacity; ++chunk) { + BYTE const* const chunkStart = istart + chunk * kMaxChunkSize; + size_t const remaining = (size_t)(iend - chunkStart); + BYTE const *const chunkEnd = + (remaining < kMaxChunkSize) ? iend : chunkStart + kMaxChunkSize; + size_t const chunkSize = chunkEnd - chunkStart; + size_t newLeftoverSize; + size_t const prevSize = sequences->size; + + assert(chunkStart < iend); + /* 1. Perform overflow correction if necessary. */ + if (ZSTD_window_needOverflowCorrection(ldmState->window, chunkEnd)) { + U32 const ldmHSize = 1U << params->hashLog; + U32 const correction = ZSTD_window_correctOverflow( + &ldmState->window, /* cycleLog */ 0, maxDist, chunkStart); + ZSTD_ldm_reduceTable(ldmState->hashTable, ldmHSize, correction); + /* invalidate dictionaries on overflow correction */ + ldmState->loadedDictEnd = 0; + } + /* 2. We enforce the maximum offset allowed. + * + * kMaxChunkSize should be small enough that we don't lose too much of + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the + * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may + * be split into two sequences. This condition holds when using + * ZSTD_window_enforceMaxDist(), but if we move to checking offsets + * against maxDist directly, we'll have to carefully handle that case. + */ + ZSTD_window_enforceMaxDist(&ldmState->window, chunkEnd, maxDist, &ldmState->loadedDictEnd, NULL); + /* 3. Generate the sequences for the chunk, and get newLeftoverSize. */ + newLeftoverSize = ZSTD_ldm_generateSequences_internal( + ldmState, sequences, params, chunkStart, chunkSize); + if (ZSTD_isError(newLeftoverSize)) + return newLeftoverSize; + /* 4. We add the leftover literals from previous iterations to the first + * newly generated sequence, or add the `newLeftoverSize` if none are + * generated. + */ + /* Prepend the leftover literals from the last call */ + if (prevSize < sequences->size) { + sequences->seq[prevSize].litLength += (U32)leftoverSize; + leftoverSize = newLeftoverSize; + } else { + assert(newLeftoverSize == chunkSize); + leftoverSize += chunkSize; + } + } + return 0; +} + +void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; + if (srcSize <= seq->litLength) { + /* Skip past srcSize literals */ + seq->litLength -= (U32)srcSize; + return; + } + srcSize -= seq->litLength; + seq->litLength = 0; + if (srcSize < seq->matchLength) { + /* Skip past the first srcSize of the match */ + seq->matchLength -= (U32)srcSize; + if (seq->matchLength < minMatch) { + /* The match is too short, omit it */ + if (rawSeqStore->pos + 1 < rawSeqStore->size) { + seq[1].litLength += seq[0].matchLength; + } + rawSeqStore->pos++; + } + return; + } + srcSize -= seq->matchLength; + seq->matchLength = 0; + rawSeqStore->pos++; + } +} + +/** + * If the sequence length is longer than remaining then the sequence is split + * between this block and the next. + * + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0.
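+ * + * Worked example (illustrative): with remaining = 100 and a stored sequence + * {litLength = 60, matchLength = 80}, the match part is trimmed to + * 100 - 60 = 40 bytes for this block; if the trimmed length fell below + * minMatch, the whole remainder would instead be emitted as literals + * (offset forced to 0).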
+ */ +static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) +{ + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; + assert(sequence.offset > 0); + /* Likely: No partial sequence */ + if (remaining >= sequence.litLength + sequence.matchLength) { + rawSeqStore->pos++; + return sequence; + } + /* Cut the sequence short (offset == 0 ==> rest is literals). */ + if (remaining <= sequence.litLength) { + sequence.offset = 0; + } else if (remaining < sequence.litLength + sequence.matchLength) { + sequence.matchLength = remaining - sequence.litLength; + if (sequence.matchLength < minMatch) { + sequence.offset = 0; + } + } + /* Skip past `remaining` bytes for the future sequences. */ + ZSTD_ldm_skipSequences(rawSeqStore, remaining, minMatch); + return sequence; +} + +size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; + ZSTD_blockCompressor const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; + BYTE const* const iend = istart + srcSize; + /* Input positions */ + BYTE const* ip = istart; + + DEBUGLOG(5, "ZSTD_ldm_blockCompress: srcSize=%zu", srcSize); + assert(rawSeqStore->pos <= rawSeqStore->size); + assert(rawSeqStore->size <= rawSeqStore->capacity); + /* Loop through each sequence and apply the block compressor to the lits */ + while (rawSeqStore->pos < rawSeqStore->size && ip < iend) { + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); + int i; + /* End signal */ + if (sequence.offset == 0) + break; + + assert(ip + sequence.litLength + sequence.matchLength <= iend); + + /* Fill tables for block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; + /* Update the repcodes */ + for (i = ZSTD_REP_NUM - 1; i > 0; i--) + rep[i] = rep[i-1]; + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, + sequence.offset + ZSTD_REP_MOVE, + sequence.matchLength - MINMATCH); + ip += sequence.matchLength; + } + } + /* Fill the tables for the block compressor */ + ZSTD_ldm_limitTableUpdate(ms, ip); + ZSTD_ldm_fillFastTables(ms, ip); + /* Compress the last literals */ + return blockCompressor(ms, seqStore, rep, ip, iend - ip); +} +/**** ended inlining compress/zstd_ldm.c ****/ +/**** start inlining compress/zstd_opt.c ****/ +/* + * Copyright (c) 2016-2020, Przemyslaw Skibinski, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + +/**** skipping file: zstd_compress_internal.h ****/ +/**** skipping file: hist.h ****/ +/**** skipping file: zstd_opt.h ****/ + + +#define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ +#define ZSTD_FREQ_DIV 4 /* log factor when using previous stats to init next stats */ +#define ZSTD_MAX_PRICE (1<<30) + +#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + +/*-************************************* +* Price functions for optimal parser +***************************************/ + +#if 0 /* approximation at bit level */ +# define BITCOST_ACCURACY 0 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat) ((void)opt, ZSTD_bitWeight(stat)) +#elif 0 /* fractional bit accuracy */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) +#else /* opt==approx, ultra==accurate */ +# define BITCOST_ACCURACY 8 +# define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) +#endif + +MEM_STATIC U32 ZSTD_bitWeight(U32 stat) +{ + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); +} + +MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) +{ + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); + return weight; +} + +#if (DEBUGLEVEL>=2) +/* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +MEM_STATIC double ZSTD_fCost(U32 price) +{ + return (double)price / (BITCOST_MULTIPLIER*8); +} +#endif + +static int ZSTD_compressedLiterals(optState_t const* const optPtr) +{ + return optPtr->literalCompressionMode != ZSTD_lcm_uncompressed; +} + +static void ZSTD_setBasePrices(optState_t* optPtr, int optLevel) +{ + if (ZSTD_compressedLiterals(optPtr)) + optPtr->litSumBasePrice = WEIGHT(optPtr->litSum, optLevel); + optPtr->litLengthSumBasePrice = WEIGHT(optPtr->litLengthSum, optLevel); + optPtr->matchLengthSumBasePrice = WEIGHT(optPtr->matchLengthSum, optLevel); + optPtr->offCodeSumBasePrice = WEIGHT(optPtr->offCodeSum, optLevel); +} + + +/* ZSTD_downscaleStat() : + * reduce all elements in table by a factor 2^(ZSTD_FREQ_DIV+malus) + * return the resulting sum of elements */ +static U32 ZSTD_downscaleStat(unsigned* table, U32 lastEltIndex, int malus) +{ + U32 s, sum=0; + DEBUGLOG(5, "ZSTD_downscaleStat (nbElts=%u)", (unsigned)lastEltIndex+1); + assert(ZSTD_FREQ_DIV+malus > 0 && ZSTD_FREQ_DIV+malus < 31); + for (s=0; s<lastEltIndex+1; s++) { + table[s] = 1 + (table[s] >> (ZSTD_FREQ_DIV+malus)); + sum += table[s]; + } + return sum; +} + +/* ZSTD_rescaleFreqs() : + * if first block (detected by optPtr->litLengthSum == 0) : init statistics + * take hints from dictionary if there is one + * or init from zero, using src for literals stats, or flat 1 for match symbols + * otherwise downscale existing stats, to be used as seed for next block.
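+ * (Illustrative arithmetic: with ZSTD_FREQ_DIV = 4 and malus = 0, + * ZSTD_downscaleStat() maps a previous count of 160 to 1 + (160 >> 4) = 11, + * preserving relative magnitudes while letting the new block's own + * statistics dominate.)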
+ */ +static void +ZSTD_rescaleFreqs(optState_t* const optPtr, + const BYTE* const src, size_t const srcSize, + int const optLevel) +{ + int const compressedLiterals = ZSTD_compressedLiterals(optPtr); + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + + if (optPtr->litLengthSum == 0) { /* first block : init */ + if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ + DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { + /* huffman table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; + for (lit=0; lit<=MaxLit; lit++) { + U32 const scaleLog = 11; /* scale to 2K */ + U32 const bitCost = HUF_getNbBits(optPtr->symbolCosts->huf.CTable, lit); + assert(bitCost <= scaleLog); + optPtr->litFreq[lit] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litSum += optPtr->litFreq[lit]; + } } + + { unsigned ll; + FSE_CState_t llstate; + FSE_initCState(&llstate, optPtr->symbolCosts->fse.litlengthCTable); + optPtr->litLengthSum = 0; + for (ll=0; ll<=MaxLL; ll++) { + U32 const scaleLog = 10; /* scale to 1K */ + U32 const bitCost = FSE_getMaxNbBits(llstate.symbolTT, ll); + assert(bitCost < scaleLog); + optPtr->litLengthFreq[ll] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->litLengthSum += optPtr->litLengthFreq[ll]; + } } + + { unsigned ml; + FSE_CState_t mlstate; + FSE_initCState(&mlstate, optPtr->symbolCosts->fse.matchlengthCTable); + optPtr->matchLengthSum = 0; + for (ml=0; ml<=MaxML; ml++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(mlstate.symbolTT, ml); + assert(bitCost < scaleLog); + optPtr->matchLengthFreq[ml] = bitCost ? 1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->matchLengthSum += optPtr->matchLengthFreq[ml]; + } } + + { unsigned of; + FSE_CState_t ofstate; + FSE_initCState(&ofstate, optPtr->symbolCosts->fse.offcodeCTable); + optPtr->offCodeSum = 0; + for (of=0; of<=MaxOff; of++) { + U32 const scaleLog = 10; + U32 const bitCost = FSE_getMaxNbBits(ofstate.symbolTT, of); + assert(bitCost < scaleLog); + optPtr->offCodeFreq[of] = bitCost ? 
1 << (scaleLog-bitCost) : 1 /*minimum to calculate cost*/; + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + + } else { /* not a dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ + optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); + } + + { unsigned ll; + for (ll=0; ll<=MaxLL; ll++) + optPtr->litLengthFreq[ll] = 1; + } + optPtr->litLengthSum = MaxLL+1; + + { unsigned ml; + for (ml=0; ml<=MaxML; ml++) + optPtr->matchLengthFreq[ml] = 1; + } + optPtr->matchLengthSum = MaxML+1; + + { unsigned of; + for (of=0; of<=MaxOff; of++) + optPtr->offCodeFreq[of] = 1; + } + optPtr->offCodeSum = MaxOff+1; + + } + + } else { /* new block : re-use previous statistics, scaled down */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_downscaleStat(optPtr->litFreq, MaxLit, 1); + optPtr->litLengthSum = ZSTD_downscaleStat(optPtr->litLengthFreq, MaxLL, 0); + optPtr->matchLengthSum = ZSTD_downscaleStat(optPtr->matchLengthFreq, MaxML, 0); + optPtr->offCodeSum = ZSTD_downscaleStat(optPtr->offCodeFreq, MaxOff, 0); + } + + ZSTD_setBasePrices(optPtr, optLevel); +} + +/* ZSTD_rawLiteralsCost() : + * price of literals (only) in specified segment (which length can be 0). + * does not include price of literalLength symbol */ +static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) +{ + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) + return (litLength << 3) * BITCOST_MULTIPLIER; /* Uncompressed - 8 bits per literal. */ + + if (optPtr->priceType == zop_predef) + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bits per literal - no statistics used */ + + /* dynamic statistics */ + { U32 price = litLength * optPtr->litSumBasePrice; + U32 u; + for (u=0; u < litLength; u++) { + assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ + price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); + } + return price; + } +} + +/* ZSTD_litLengthPrice() : + * cost of literalLength symbol */ +static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optPtr, int optLevel) +{ + if (optPtr->priceType == zop_predef) return WEIGHT(litLength, optLevel); + + /* dynamic statistics */ + { U32 const llCode = ZSTD_LLcode(litLength); + return (LL_bits[llCode] * BITCOST_MULTIPLIER) + + optPtr->litLengthSumBasePrice + - WEIGHT(optPtr->litLengthFreq[llCode], optLevel); + } +} + +/* ZSTD_getMatchPrice() : + * Provides the cost of the match part (offset + matchLength) of a sequence + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence.
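+ * Example of the offCode arithmetic below (illustrative): offset = 1024 gives + * offCode = ZSTD_highbit32(1024+1) = 10, so the zop_predef branch prices the + * match at WEIGHT(mlBase) + (16 + 10) * BITCOST_MULTIPLIER fractional bits.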
+ * optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) */ +FORCE_INLINE_TEMPLATE U32 +ZSTD_getMatchPrice(U32 const offset, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) +{ + U32 price; + U32 const offCode = ZSTD_highbit32(offset+1); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + + if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ + return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); + if ((optLevel<2) /*static*/ && offCode >= 20) + price += (offCode-19)*2 * BITCOST_MULTIPLIER; /* handicap for long distance offsets, favor decompression speed */ + + /* match Length */ + { U32 const mlCode = ZSTD_MLcode(mlBase); + price += (ML_bits[mlCode] * BITCOST_MULTIPLIER) + (optPtr->matchLengthSumBasePrice - WEIGHT(optPtr->matchLengthFreq[mlCode], optLevel)); + } + + price += BITCOST_MULTIPLIER / 5; /* heuristic : make matches a bit more costly to favor fewer sequences -> faster decompression speed */ + + DEBUGLOG(8, "ZSTD_getMatchPrice(ml:%u) = %u", matchLength, price); + return price; +} + +/* ZSTD_updateStats() : + * assumption : literals + litLength <= iend */ +static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, + U32 offsetCode, U32 matchLength) +{ + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { + U32 u; + for (u=0; u < litLength; u++) + optPtr->litFreq[literals[u]] += ZSTD_LITFREQ_ADD; + optPtr->litSum += litLength*ZSTD_LITFREQ_ADD; + } + + /* literal Length */ + { U32 const llCode = ZSTD_LLcode(litLength); + optPtr->litLengthFreq[llCode]++; + optPtr->litLengthSum++; + } + + /* match offset code (0-2=>repCode; 3+=>offset+2) */ + { U32 const offCode = ZSTD_highbit32(offsetCode+1); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; + } + + /* match Length */ + { U32 const mlBase = matchLength - MINMATCH; + U32 const mlCode = ZSTD_MLcode(mlBase); + optPtr->matchLengthFreq[mlCode]++; + optPtr->matchLengthSum++; + } +} + + +/* ZSTD_readMINMATCH() : + * function safe only for comparisons + * assumption : memPtr must be at least 4 bytes before end of buffer */ +MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) +{ + switch (length) + { + default : + case 4 : return MEM_read32(memPtr); + case 3 : if (MEM_isLittleEndian()) + return MEM_read32(memPtr)<<8; + else + return MEM_read32(memPtr)>>8; + } +} + + +/* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +static U32 ZSTD_insertAndFindFirstIndexHash3 (ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip) +{ + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; + const BYTE* const base = ms->window.base; + U32 idx = *nextToUpdate3; + U32 const target = (U32)(ip - base); + size_t const hash3 = ZSTD_hash3Ptr(ip, hashLog3); + assert(hashLog3 > 0); + + while(idx < target) { + hashTable3[ZSTD_hash3Ptr(base+idx, hashLog3)] = idx; + idx++; + } + + *nextToUpdate3 = target; + return hashTable3[hash3]; +} + + +/*-************************************* +* Binary Tree search +***************************************/ +/** ZSTD_insertBt1() : add one or multiple positions to tree. + * ip : assumed <= iend-8 .
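+ * (Layout note, for reference: each indexed position owns two slots in the + * chainTable, used here as a binary tree: bt[2*(idx&btMask)] links to the best + * smaller candidate and bt[2*(idx&btMask)+1] to the best larger one, as the + * smallerPtr/largerPtr updates below maintain.)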
+ * @return : nb of positions added */ +static U32 ZSTD_insertBt1( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const mls, const int extDict) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hashLog = cParams->hashLog; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask = (1 << btLog) - 1; + U32 matchIndex = hashTable[h]; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const U32 dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + const BYTE* match; + const U32 current = (U32)(ip-base); + const U32 btLow = btMask >= current ? 0 : current - btMask; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = smallerPtr + 1; + U32 dummy32; /* to be nullified at the end */ + U32 const windowLow = ms->window.lowLimit; + U32 matchEndIdx = current+8+1; + size_t bestLength = 8; + U32 nbCompares = 1U << cParams->searchLog; +#ifdef ZSTD_C_PREDICT + U32 predictedSmall = *(bt + 2*((current-1)&btMask) + 0); + U32 predictedLarge = *(bt + 2*((current-1)&btMask) + 1); + predictedSmall += (predictedSmall>0); + predictedLarge += (predictedLarge>0); +#endif /* ZSTD_C_PREDICT */ + + DEBUGLOG(8, "ZSTD_insertBt1 (%u)", current); + + assert(ip <= iend-8); /* required for h calculation */ + hashTable[h] = current; /* Update Hash Table */ + + assert(windowLow > 0); + while (nbCompares-- && (matchIndex >= windowLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(matchIndex < current); + +#ifdef ZSTD_C_PREDICT /* note : can create issues when hlog small <= 11 */ + const U32* predictPtr = bt + 2*((matchIndex-1) & btMask); /* written this way, as bt is a roll buffer */ + if (matchIndex == predictedSmall) { + /* no need to check length, result known */ + *smallerPtr = matchIndex; + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new "smaller" => larger of match */ + matchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + predictedSmall = predictPtr[1] + (predictPtr[1]>0); + continue; + } + if (matchIndex == predictedLarge) { + *largerPtr = matchIndex; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + predictedLarge = predictPtr[0] + (predictPtr[0]>0); + continue; + } +#endif + + if (!extDict || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* might be wrong if actually extDict */ + match = base + matchIndex; + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iend); + } else { + match = dictBase + matchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iend, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* to prepare for next usage of match[matchLength] */ + } + + if (matchLength > bestLength) { + bestLength = matchLength; + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + } + + if (ip+matchLength == iend) { /* equal : no 
way to know if inf or sup */ + break; /* drop , to guarantee consistency ; miss a bit of compression, but other solutions can corrupt tree */ + } + + if (match[matchLength] < ip[matchLength]) { /* necessarily within buffer */ + /* match is smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + smallerPtr = nextPtr+1; /* new "candidate" => larger than match, which was smaller than target */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous and closer to current */ + } else { + /* match is larger than current */ + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop searching */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + { U32 positions = 0; + if (bestLength > 384) positions = MIN(192, (U32)(bestLength - 384)); /* speed optimization */ + assert(matchEndIdx > current + 8); + return MAX(positions, matchEndIdx - (current + 8)); + } +} + +FORCE_INLINE_TEMPLATE +void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) +{ + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; + DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { + U32 const forward = ZSTD_insertBt1(ms, base+idx, iend, mls, dictMode == ZSTD_extDict); + assert(idx < (U32)(idx + forward)); + idx += forward; + } + assert((size_t)(ip - base) <= (size_t)(U32)(-1)); + assert((size_t)(iend - base) <= (size_t)(U32)(-1)); + ms->nextToUpdate = target; +} + +void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); +} + +FORCE_INLINE_TEMPLATE +U32 ZSTD_insertBtAndGetAllMatches ( + ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ + const U32 lengthToBeat, + U32 const mls /* template */) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + const BYTE* const base = ms->window.base; + U32 const current = (U32)(ip-base); + U32 const hashLog = cParams->hashLog; + U32 const minMatch = (mls==3) ? 3 : 4; + U32* const hashTable = ms->hashTable; + size_t const h = ZSTD_hashPtr(ip, hashLog, mls); + U32 matchIndex = hashTable[h]; + U32* const bt = ms->chainTable; + U32 const btLog = cParams->chainLog - 1; + U32 const btMask= (1U << btLog) - 1; + size_t commonLengthSmaller=0, commonLengthLarger=0; + const BYTE* const dictBase = ms->window.dictBase; + U32 const dictLimit = ms->window.dictLimit; + const BYTE* const dictEnd = dictBase + dictLimit; + const BYTE* const prefixStart = base + dictLimit; + U32 const btLow = (btMask >= current) ? 
0 : current - btMask; + U32 const windowLow = ZSTD_getLowestMatchIndex(ms, current, cParams->windowLog); + U32 const matchLow = windowLow ? windowLow : 1; + U32* smallerPtr = bt + 2*(current&btMask); + U32* largerPtr = bt + 2*(current&btMask) + 1; + U32 matchEndIdx = current+8+1; /* farthest referenced position of any match => detects repetitive patterns */ + U32 dummy32; /* to be nullified at the end */ + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + + const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? dms->window.base : NULL; + const BYTE* const dmsEnd = dictMode == ZSTD_dictMatchState ? dms->window.nextSrc : NULL; + U32 const dmsHighLimit = dictMode == ZSTD_dictMatchState ? (U32)(dmsEnd - dmsBase) : 0; + U32 const dmsLowLimit = dictMode == ZSTD_dictMatchState ? dms->window.lowLimit : 0; + U32 const dmsIndexDelta = dictMode == ZSTD_dictMatchState ? windowLow - dmsHighLimit : 0; + U32 const dmsHashLog = dictMode == ZSTD_dictMatchState ? dmsCParams->hashLog : hashLog; + U32 const dmsBtLog = dictMode == ZSTD_dictMatchState ? dmsCParams->chainLog - 1 : btLog; + U32 const dmsBtMask = dictMode == ZSTD_dictMatchState ? (1U << dmsBtLog) - 1 : 0; + U32 const dmsBtLow = dictMode == ZSTD_dictMatchState && dmsBtMask < dmsHighLimit - dmsLowLimit ? dmsHighLimit - dmsBtMask : dmsLowLimit; + + size_t bestLength = lengthToBeat-1; + DEBUGLOG(8, "ZSTD_insertBtAndGetAllMatches: current=%u", current); + + /* check repCode */ + assert(ll0 <= 1); /* necessarily 1 or 0 */ + { U32 const lastR = ZSTD_REP_NUM + ll0; + U32 repCode; + for (repCode = ll0; repCode < lastR; repCode++) { + U32 const repOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + U32 const repIndex = current - repOffset; + U32 repLen = 0; + assert(current >= dictLimit); + if (repOffset-1 /* intentional overflow, discards 0 and -1 */ < current-dictLimit) { /* equivalent to `current > repIndex >= dictLimit` */ + /* We must validate the repcode offset because when we're using a dictionary the + * valid offset range shrinks when the dictionary goes out of bounds. + */ + if ((repIndex >= windowLow) & (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(ip - repOffset, minMatch))) { + repLen = (U32)ZSTD_count(ip+minMatch, ip+minMatch-repOffset, iLimit) + minMatch; + } + } else { /* repIndex < dictLimit || repIndex >= current */ + const BYTE* const repMatch = dictMode == ZSTD_dictMatchState ? 
+ dmsBase + repIndex - dmsIndexDelta : + dictBase + repIndex; + assert(current >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < current - windowLow) /* equivalent to `current > repIndex >= windowLow` */ + & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < current - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `current > repIndex >= dmsLowLimit` */ + & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } + /* save longer solution */ + if (repLen > bestLength) { + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; + matches[mnum].off = repCode - ll0; + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) + | (ip+repLen == iLimit) ) { /* best possible */ + return mnum; + } } } } + + /* HC3 match finder */ + if ((mls == 3) /*static*/ && (bestLength < mls)) { + U32 const matchIndex3 = ZSTD_insertAndFindFirstIndexHash3(ms, nextToUpdate3, ip); + if ((matchIndex3 >= matchLow) + & (current - matchIndex3 < (1<<18)) /*heuristic : longer distance likely too expensive*/ ) { + size_t mlen; + if ((dictMode == ZSTD_noDict) /*static*/ || (dictMode == ZSTD_dictMatchState) /*static*/ || (matchIndex3 >= dictLimit)) { + const BYTE* const match = base + matchIndex3; + mlen = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex3; + mlen = ZSTD_count_2segments(ip, match, iLimit, dictEnd, prefixStart); + } + + /* save best solution */ + if (mlen >= mls /* == 3 > bestLength */) { + DEBUGLOG(8, "found small match with hlog3, of length %u", + (U32)mlen); + bestLength = mlen; + assert(current > matchIndex3); + assert(mnum==0); /* no prior solution */ + matches[0].off = (current - matchIndex3) + ZSTD_REP_MOVE; + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | + (ip+mlen == iLimit) ) { /* best possible length */ + ms->nextToUpdate = current+1; /* skip insertion */ + return 1; + } } } + /* no dictMatchState lookup: dicts don't have a populated HC3 table */ + } + + hashTable[h] = current; /* Update Hash Table */ + + while (nbCompares-- && (matchIndex >= matchLow)) { + U32* const nextPtr = bt + 2*(matchIndex & btMask); + const BYTE* match; + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + assert(current > matchIndex); + + if ((dictMode == ZSTD_noDict) || (dictMode == ZSTD_dictMatchState) || (matchIndex+matchLength >= dictLimit)) { + assert(matchIndex+matchLength >= dictLimit); /* ensure the condition is correct when !extDict */ + match = base + matchIndex; + if (matchIndex >= dictLimit) assert(memcmp(match, ip, matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count(ip+matchLength, match+matchLength, iLimit); + } else { + match = dictBase + matchIndex; + assert(memcmp(match, ip, 
matchLength) == 0); /* ensure early section of match is equal as expected */ + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dictEnd, prefixStart); + if (matchIndex+matchLength >= dictLimit) + match = base + matchIndex; /* prepare for match[matchLength] read */ + } + + if (matchLength > bestLength) { + DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", + (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + if (dictMode == ZSTD_dictMatchState) nbCompares = 0; /* break should also skip searching dms */ + break; /* drop, to preserve bt consistency (miss a little bit of compression) */ + } + } + + if (match[matchLength] < ip[matchLength]) { + /* match smaller than current */ + *smallerPtr = matchIndex; /* update smaller idx */ + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common length */ + if (matchIndex <= btLow) { smallerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + smallerPtr = nextPtr+1; /* new candidate => larger than match, which was smaller than current */ + matchIndex = nextPtr[1]; /* new matchIndex, larger than previous, closer to current */ + } else { + *largerPtr = matchIndex; + commonLengthLarger = matchLength; + if (matchIndex <= btLow) { largerPtr=&dummy32; break; } /* beyond tree size, stop the search */ + largerPtr = nextPtr; + matchIndex = nextPtr[0]; + } } + + *smallerPtr = *largerPtr = 0; + + if (dictMode == ZSTD_dictMatchState && nbCompares) { + size_t const dmsH = ZSTD_hashPtr(ip, dmsHashLog, mls); + U32 dictMatchIndex = dms->hashTable[dmsH]; + const U32* const dmsBt = dms->chainTable; + commonLengthSmaller = commonLengthLarger = 0; + while (nbCompares-- && (dictMatchIndex > dmsLowLimit)) { + const U32* const nextPtr = dmsBt + 2*(dictMatchIndex & dmsBtMask); + size_t matchLength = MIN(commonLengthSmaller, commonLengthLarger); /* guaranteed minimum nb of common bytes */ + const BYTE* match = dmsBase + dictMatchIndex; + matchLength += ZSTD_count_2segments(ip+matchLength, match+matchLength, iLimit, dmsEnd, prefixStart); + if (dictMatchIndex+matchLength >= dmsHighLimit) + match = base + dictMatchIndex + dmsIndexDelta; /* to prepare for next usage of match[matchLength] */ + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; + DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", + (U32)matchLength, current - matchIndex, current - matchIndex + ZSTD_REP_MOVE); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; + matches[mnum].off = (current - matchIndex) + ZSTD_REP_MOVE; + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) + | (ip+matchLength == iLimit) /* equal : no way to know if inf or sup */) { + break; /* drop, to guarantee consistency (miss a little bit of compression) */ + } + } + + if (dictMatchIndex <= dmsBtLow) { break; } /* beyond tree size, stop the search */ + if (match[matchLength] < ip[matchLength]) { + commonLengthSmaller = matchLength; /* all smaller will now have at least this guaranteed common 
length */ + dictMatchIndex = nextPtr[1]; /* new matchIndex larger than previous (closer to current) */ + } else { + /* match is larger than current */ + commonLengthLarger = matchLength; + dictMatchIndex = nextPtr[0]; + } + } + } + + assert(matchEndIdx > current+8); + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + return mnum; +} + + +FORCE_INLINE_TEMPLATE U32 ZSTD_BtGetAllMatches ( + ZSTD_match_t* matches, /* store result (match found, increasing size) in this table */ + ZSTD_matchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, const BYTE* const iHighLimit, const ZSTD_dictMode_e dictMode, + const U32 rep[ZSTD_REP_NUM], + U32 const ll0, + U32 const lengthToBeat) +{ + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const matchLengthSearch = cParams->minMatch; + DEBUGLOG(8, "ZSTD_BtGetAllMatches"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateTree_internal(ms, ip, iHighLimit, matchLengthSearch, dictMode); + switch(matchLengthSearch) + { + case 3 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 3); + default : + case 4 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 4); + case 5 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 5); + case 7 : + case 6 : return ZSTD_insertBtAndGetAllMatches(matches, ms, nextToUpdate3, ip, iHighLimit, dictMode, rep, ll0, lengthToBeat, 6); + } +} + + +/*-******************************* +* Optimal parser +*********************************/ + + +static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +{ + return sol.litlen + sol.mlen; +} + +#if 0 /* debug */ + +static void +listStats(const U32* table, int lastEltID) +{ + int const nbElts = lastEltID + 1; + int enb; + for (enb=0; enb < nbElts; enb++) { + (void)table; + /* RAWLOG(2, "%3i:%3i, ", enb, table[enb]); */ + RAWLOG(2, "%4i,", table[enb]); + } + RAWLOG(2, " \n"); +} + +#endif + +FORCE_INLINE_TEMPLATE size_t +ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, + const ZSTD_dictMode_e dictMode) +{ + optState_t* const optStatePtr = &ms->opt; + const BYTE* const istart = (const BYTE*)src; + const BYTE* ip = istart; + const BYTE* anchor = istart; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + const BYTE* const base = ms->window.base; + const BYTE* const prefixStart = base + ms->window.dictLimit; + const ZSTD_compressionParameters* const cParams = &ms->cParams; + + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); + U32 const minMatch = (cParams->minMatch == 3) ? 
3 : 4; + U32 nextToUpdate3 = ms->nextToUpdate; + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; + ZSTD_optimal_t lastSequence; + + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_opt_generic: current=%u, prefix=%u, nextToUpdate=%u", + (U32)(ip - base), ms->window.dictLimit, ms->nextToUpdate); + assert(optLevel <= 2); + ZSTD_rescaleFreqs(optStatePtr, (const BYTE*)src, srcSize, optLevel); + ip += (ip==prefixStart); + + /* Match Loop */ + while (ip < ilimit) { + U32 cur, last_pos = 0; + + /* find first match */ + { U32 const litlen = (U32)(ip - anchor); + U32 const ll0 = !litlen; + U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, ip, iend, dictMode, rep, ll0, minMatch); + if (!nbMatches) { ip++; continue; } + + /* initialize opt[0] */ + { U32 i ; for (i=0; i<ZSTD_REP_NUM; i++) opt[0].rep[i] = rep[i]; } + opt[0].mlen = 0; /* means is_a_literal */ + opt[0].litlen = litlen; + /* We don't need to include the actual price of the literals because + * it is static for the duration of the forward pass, and is included + * in every price. We include the literal length to avoid negative + * prices when we subtract the previous literal length. + */ + opt[0].price = ZSTD_litLengthPrice(litlen, optStatePtr, optLevel); + + /* large match -> immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; + U32 const maxOffset = matches[nbMatches-1].off; + DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", + nbMatches, maxML, maxOffset, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; + lastSequence.off = maxOffset; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); + cur = 0; + last_pos = ZSTD_totalLen(lastSequence); + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + { U32 const literalsPrice = opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { + U32 const matchPrice = ZSTD_getMatchPrice(offset, pos, optStatePtr, optLevel); + U32 const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = sequencePrice; + } } + last_pos = pos-1; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; + assert(cur < ZSTD_OPT_NUM); + DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) + + /* Fix current position with one literal if cheaper */ + { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; + int const price = opt[cur-1].price + + ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) + + ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) + - ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); + opt[cur].mlen = 0; + opt[cur].off = 0; + opt[cur].litlen = litlen; + opt[cur].price = price; + } else { + DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), + opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); + } + } + + /* Set the repcodes of the current position.
We must do it here + * because we rely on the repcodes of the 2nd to last sequence being + * correct to set the next chunks repcodes during the backward + * traversal. + */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); + if (opt[cur].mlen != 0) { + U32 const prev = cur - opt[cur].mlen; + repcodes_t newReps = ZSTD_updateRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); + memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); + } else { + memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ + if (inr > ilimit) continue; + + if (cur == last_pos) break; + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { + DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + { U32 const ll0 = (opt[cur].mlen != 0); + U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; + U32 const previousPrice = opt[cur].price; + U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); + U32 const nbMatches = ZSTD_BtGetAllMatches(matches, ms, &nextToUpdate3, inr, iend, dictMode, opt[cur].rep, ll0, minMatch); + U32 matchNb; + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + + { U32 const maxML = matches[nbMatches-1].len; + DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", + inr-istart, cur, nbMatches, maxML); + + if ( (maxML > sufficient_len) + || (cur + maxML >= ZSTD_OPT_NUM) ) { + lastSequence.mlen = maxML; + lastSequence.off = matches[nbMatches-1].off; + lastSequence.litlen = litlen; + cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ + last_pos = cur + ZSTD_totalLen(lastSequence); + if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ + goto _shortestPath; + } } + + /* set prices using matches found at position == cur */ + for (matchNb = 0; matchNb < nbMatches; matchNb++) { + U32 const offset = matches[matchNb].off; + U32 const lastML = matches[matchNb].len; + U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + + DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; + int const price = basePrice + ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ + opt[pos].mlen = mlen; + opt[pos].off = offset; + opt[pos].litlen = litlen; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } + } /* for (cur = 1; cur <= last_pos; cur++) */ + + lastSequence = opt[last_pos]; + cur = last_pos > ZSTD_totalLen(lastSequence) ? 
last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ + assert(cur < ZSTD_OPT_NUM); /* control overflow*/ + +_shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); + + /* Set the next chunk's repcodes based on the repcodes of the beginning + * of the last match, and the last sequence. This avoids us having to + * update them while traversing the sequences. + */ + if (lastSequence.mlen != 0) { + repcodes_t reps = ZSTD_updateRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); + memcpy(rep, &reps, sizeof(reps)); + } else { + memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); + } + + { U32 const storeEnd = cur + 1; + U32 storeStart = storeEnd; + U32 seqPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; + assert(storeEnd < ZSTD_OPT_NUM); + DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); + opt[storeEnd] = lastSequence; + while (seqPos > 0) { + U32 const backDist = ZSTD_totalLen(opt[seqPos]); + storeStart--; + DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", + seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); + opt[storeStart] = opt[seqPos]; + seqPos = (seqPos > backDist) ? seqPos - backDist : 0; + } + + /* save sequences */ + DEBUGLOG(6, "sending selected sequences into seqStore") + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; + U32 const offCode = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ + ip = anchor + llen; /* last "sequence" is a bunch of literals => don't progress anchor */ + continue; /* will finish */ + } + + assert(anchor + llen <= iend); + ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); + ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen-MINMATCH); + anchor += advance; + ip = anchor; + } } + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ + + /* Return the last literals size */ + return (size_t)(iend - anchor); +} + + +size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_noDict); +} + + +/* used in 2-pass strategy */ +static U32 ZSTD_upscaleStat(unsigned* table, U32 lastEltIndex, int bonus) +{ + U32 s, sum=0; + assert(ZSTD_FREQ_DIV+bonus >= 0); + for (s=0; s<lastEltIndex+1; s++) { + table[s] <<= ZSTD_FREQ_DIV+bonus; + table[s] --; + sum += table[s]; + } + return sum; +} + +/* used in 2-pass strategy */ +MEM_STATIC void ZSTD_upscaleStats(optState_t* optPtr) +{ + optPtr->litSum = ZSTD_upscaleStat(optPtr->litFreq, MaxLit, 0); + optPtr->litLengthSum = ZSTD_upscaleStat(optPtr->litLengthFreq, MaxLL, 0); + optPtr->matchLengthSum = ZSTD_upscaleStat(optPtr->matchLengthFreq, MaxML, 0); + optPtr->offCodeSum = ZSTD_upscaleStat(optPtr->offCodeFreq, MaxOff, 0); +} + +/* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. + * this function cannot error, hence its contract must be respected.
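+ * (How the pass is then forgotten, per the code below: the window base is + * rewound by srcSize and dictLimit/lowLimit/nextToUpdate are advanced so the + * already-scanned bytes look like content that was never loaded, while only + * the collected ms->opt statistics survive, re-weighted by ZSTD_upscaleStats().)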
+ */ +static void +ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + memcpy(tmpRep, rep, sizeof(tmpRep)); + + DEBUGLOG(4, "ZSTD_initStats_ultra (srcSize=%zu)", srcSize); + assert(ms->opt.litLengthSum == 0); /* first block */ + assert(seqStore->sequences == seqStore->sequencesStart); /* no ldm */ + assert(ms->window.dictLimit == ms->window.lowLimit); /* no dictionary */ + assert(ms->window.dictLimit - ms->nextToUpdate <= 1); /* no prefix (note: intentional overflow, defined as 2-complement) */ + + ZSTD_compressBlock_opt_generic(ms, seqStore, tmpRep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); /* generate stats into ms->opt*/ + + /* invalidate first scan from history */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; + ms->window.lowLimit = ms->window.dictLimit; + ms->nextToUpdate = ms->window.dictLimit; + + /* re-inforce weight of collected statistics */ + ZSTD_upscaleStats(&ms->opt); +} + +size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btultra2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + U32 const current = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + + /* 2-pass strategy: + * this strategy makes a first pass over first block to collect statistics + * and seed next round's statistics with it. + * After 1st pass, function forgets everything, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), + * the cost is 2x cpu time on first block. 
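+ * (Numeric note: with ZSTD_PREDEF_THRESHOLD = 1024, blocks of 1 KiB or less + * never take the extra pass, since the srcSize gate below requires + * srcSize > ZSTD_PREDEF_THRESHOLD.)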
*/ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ + && (current == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ + && (srcSize > ZSTD_PREDEF_THRESHOLD) + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_noDict); +} + +size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_dictMatchState); +} + +size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /*optLevel*/, ZSTD_extDict); +} + +size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +{ + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /*optLevel*/, ZSTD_extDict); +} + +/* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ +/**** ended inlining compress/zstd_opt.c ****/ + +/**** start inlining decompress/huf_decompress.c ****/ +/* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library + * Copyright (c) 2013-2020, Yann Collet, Facebook, Inc. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. +****************************************************************** */ + +/* ************************************************************** +* Dependencies +****************************************************************/ +#include <string.h> /* memcpy, memset */ +/**** skipping file: ../common/compiler.h ****/ +/**** skipping file: ../common/bitstream.h ****/ +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/error_private.h ****/ + +/* ************************************************************** +* Macros +****************************************************************/ + +/* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. + */ +#if defined(HUF_FORCE_DECOMPRESS_X1) && \ + defined(HUF_FORCE_DECOMPRESS_X2) +#error "Cannot force the use of the X1 and X2 decoders at the same time!"
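+/* (Context note: when neither HUF_FORCE_DECOMPRESS_X1 nor + * HUF_FORCE_DECOMPRESS_X2 is defined, both decoder implementations are + * compiled in and one is selected at runtime by a size-based heuristic.) */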
+#endif + + +/* ************************************************************** +* Error Management +****************************************************************/ +#define HUF_isError ERR_isError + + +/* ************************************************************** +* Byte alignment for workSpace management +****************************************************************/ +#define HUF_ALIGN(x, a) HUF_ALIGN_MASK((x), (a) - 1) +#define HUF_ALIGN_MASK(x, mask) (((x) + (mask)) & ~(mask)) + + +/* ************************************************************** +* BMI2 Variant Wrappers +****************************************************************/ +#if DYNAMIC_BMI2 + +#define HUF_DGEN(fn) \ + \ + static size_t fn##_default( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static TARGET_ATTRIBUTE("bmi2") size_t fn##_bmi2( \ + void* dst, size_t dstSize, \ + const void* cSrc, size_t cSrcSize, \ + const HUF_DTable* DTable) \ + { \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + if (bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#else + +#define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ + size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ + { \ + (void)bmi2; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +#endif + + +/*-***************************/ +/* generic DTableDesc */ +/*-***************************/ +typedef struct { BYTE maxTableLog; BYTE tableType; BYTE tableLog; BYTE reserved; } DTableDesc; + +static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) +{ + DTableDesc dtd; + memcpy(&dtd, table, sizeof(dtd)); + return dtd; +} + + +#ifndef HUF_FORCE_DECOMPRESS_X2 + +/*-***************************/ +/* single-symbol decoding */ +/*-***************************/ +typedef struct { BYTE byte; BYTE nbBits; } HUF_DEltX1; /* single-symbol decoding */ + +size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +{ + U32 tableLog = 0; + U32 nbSymbols = 0; + size_t iSize; + void* const dtPtr = DTable + 1; + HUF_DEltX1* const dt = (HUF_DEltX1*)dtPtr; + + U32* rankVal; + BYTE* huffWeight; + size_t spaceUsed32 = 0; + + rankVal = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_ABSOLUTEMAX + 1; + huffWeight = (BYTE *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
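+ * (Workspace layout above, for reference: rankVal occupies + * HUF_TABLELOG_ABSOLUTEMAX+1 U32 slots and huffWeight occupies + * HUF_SYMBOLVALUE_MAX+1 bytes rounded up to U32 alignment, which is exactly + * what the (spaceUsed32 << 2) > wkspSize check guards.)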
+ + iSize = HUF_readStats(huffWeight, HUF_SYMBOLVALUE_MAX + 1, rankVal, &nbSymbols, &tableLog, src, srcSize); + if (HUF_isError(iSize)) return iSize; + + /* Table header */ + { DTableDesc dtd = HUF_getDTableDesc(DTable); + if (tableLog > (U32)(dtd.maxTableLog+1)) return ERROR(tableLog_tooLarge); /* DTable too small, Huffman tree cannot fit in */ + dtd.tableType = 0; + dtd.tableLog = (BYTE)tableLog; + memcpy(DTable, &dtd, sizeof(dtd)); + } + + /* Calculate starting value for each rank */ + { U32 n, nextRankStart = 0; + for (n=1; n<tableLog+1; n++) { + U32 const current = nextRankStart; + nextRankStart += (rankVal[n] << (n-1)); + rankVal[n] = current; + } } + + /* fill DTable */ + { U32 n; + size_t const nEnd = nbSymbols; + for (n=0; n<nEnd; n++) { + size_t const w = huffWeight[n]; + size_t const length = (1 << w) >> 1; + size_t const uStart = rankVal[w]; + size_t const uEnd = uStart + length; + size_t u; + HUF_DEltX1 D; + D.byte = (BYTE)n; + D.nbBits = (BYTE)(tableLog + 1 - w); + rankVal[w] = (U32)uEnd; + if (length < 4) { + /* Use length in the loop bound so the compiler knows it is short. */ + for (u = 0; u < length; ++u) + dt[uStart + u] = D; + } else { + /* Unroll the loop 4 times, we know it is a power of 2. */ + for (u = uStart; u < uEnd; u += 4) { + dt[u + 0] = D; + dt[u + 1] = D; + dt[u + 2] = D; + dt[u + 3] = D; + } } } } + return iSize; +} + +size_t HUF_readDTableX1(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX1_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} + +FORCE_INLINE_TEMPLATE BYTE +HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(Dstream, dtLog); /* note : dtLog >= 1 */ + BYTE const c = dt[val].byte; + BIT_skipBits(Dstream, dt[val].nbBits); + return c; +} + +#define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ + *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) + +#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) + +HINT_INLINE size_t +HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 4 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-3)) { + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_1(p, bitDPtr); + HUF_DECODE_SYMBOLX1_2(p, bitDPtr); + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + } + + /* [0-3] symbols remaining */ + if (MEM_32bits()) + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd)) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + + /* no more data to retrieve from bitstream, no need to reload */ + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + + return pEnd-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X1_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BYTE* op = (BYTE*)dst; + BYTE* const oend = op + dstSize; + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + HUF_decodeStreamX1(op, &bitD, oend, dt, dtLog); + + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + return dstSize; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + /* Check */ + if (cSrcSize <
10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - 3; + const void* const dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + const size_t segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + U32 endSignal = 1; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* up to 16 symbols per loop (4 symbols per stream) in 64-bit mode */ + for ( ; (endSignal) & (op4 < olimit) ; ) { + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_1(op1, &bitD1); + HUF_DECODE_SYMBOLX1_1(op2, &bitD2); + HUF_DECODE_SYMBOLX1_1(op3, &bitD3); + HUF_DECODE_SYMBOLX1_1(op4, &bitD4); + HUF_DECODE_SYMBOLX1_2(op1, &bitD1); + HUF_DECODE_SYMBOLX1_2(op2, &bitD2); + HUF_DECODE_SYMBOLX1_2(op3, &bitD3); + HUF_DECODE_SYMBOLX1_2(op4, &bitD4); + HUF_DECODE_SYMBOLX1_0(op1, &bitD1); + HUF_DECODE_SYMBOLX1_0(op2, &bitD2); + HUF_DECODE_SYMBOLX1_0(op3, &bitD3); + HUF_DECODE_SYMBOLX1_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; + } + + /* check corruption */ + /* note : should not be necessary : op# advance in lock step, and we control op4. 
+ * but curiously, binary generated by gcc 7.2 & 7.3 with -mbmi2 runs faster when >=1 test is present */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 supposed already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX1(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX1(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX1(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX1(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + + +typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, + const void *cSrc, + size_t cSrcSize, + const HUF_DTable *DTable); + +HUF_DGEN(HUF_decompress1X1_usingDTable_internal) +HUF_DGEN(HUF_decompress4X1_usingDTable_internal) + + + +size_t HUF_decompress1X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress1X1_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X1_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X1_DCtx (DTable, dst, dstSize, cSrc, cSrcSize); +} + +size_t HUF_decompress4X1_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 0) return ERROR(GENERIC); + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp (dctx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, 
dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); +} + + +size_t HUF_decompress4X1_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} +size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX1(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X1_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} + +#endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +#ifndef HUF_FORCE_DECOMPRESS_X1 + +/* *************************/ +/* double-symbols decoding */ +/* *************************/ + +typedef struct { U16 sequence; BYTE nbBits; BYTE length; } HUF_DEltX2; /* double-symbols decoding */ +typedef struct { BYTE symbol; BYTE weight; } sortedSymbol_t; +typedef U32 rankValCol_t[HUF_TABLELOG_MAX + 1]; +typedef rankValCol_t rankVal_t[HUF_TABLELOG_MAX]; + + +/* HUF_fillDTableX2Level2() : + * `rankValOrigin` must be a table of at least (HUF_TABLELOG_MAX + 1) U32 */ +static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 sizeLog, const U32 consumed, + const U32* rankValOrigin, const int minWeight, + const sortedSymbol_t* sortedSymbols, const U32 sortedListSize, + U32 nbBitsBaseline, U16 baseSeq) +{ + HUF_DEltX2 DElt; + U32 rankVal[HUF_TABLELOG_MAX + 1]; + + /* get pre-calculated rankVal */ + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill skipped values */ + if (minWeight>1) { + U32 i, skipSize = rankVal[minWeight]; + MEM_writeLE16(&(DElt.sequence), baseSeq); + DElt.nbBits = (BYTE)(consumed); + DElt.length = 1; + for (i = 0; i < skipSize; i++) + DTable[i] = DElt; + } + + /* fill DTable */ + { U32 s; + for (s=0; s<sortedListSize; s++) { /* note : sortedSymbols already skipped */ + const U32 symbol = sortedSymbols[s].symbol; + const U32 weight = sortedSymbols[s].weight; + const U32 nbBits = nbBitsBaseline - weight; + const U32 length = 1 << (sizeLog-nbBits); + const U32 start = rankVal[weight]; + U32 i = start; + const U32 end = start + length; + + MEM_writeLE16(&(DElt.sequence), (U16)(baseSeq + (symbol << 8))); + DElt.nbBits = (BYTE)(nbBits + consumed); + DElt.length = 2; + do { DTable[i++] = DElt; } while (i < end); /* since length >= 1 */ + + rankVal[weight] += length; + } } +} + + +static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, const U32 sortedListSize, + const U32* rankStart, rankVal_t rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) +{ + U32 rankVal[HUF_TABLELOG_MAX + 1]; + const int scaleLog = nbBitsBaseline - targetLog; /* note : targetLog >= srcLog, hence scaleLog <= 1 */ + const U32 minBits = nbBitsBaseline - maxWeight; + U32 s; + + memcpy(rankVal, rankValOrigin, sizeof(rankVal)); + + /* fill DTable */ + for (s=0; s<sortedListSize; s++) { + const U16 symbol = sortedList[s].symbol; + const U32 weight = sortedList[s].weight; + const U32 nbBits = nbBitsBaseline - weight; + const U32 start = rankVal[weight]; + const U32 length = 1 << (targetLog-nbBits); + + if (targetLog-nbBits >= minBits) { /* enough room for a second symbol */ + U32 sortedRank; + int minWeight = nbBits + scaleLog; + if (minWeight < 1) minWeight = 1; + sortedRank = rankStart[minWeight]; + HUF_fillDTableX2Level2(DTable+start, targetLog-nbBits, nbBits, + rankValOrigin[nbBits], minWeight, + sortedList+sortedRank, sortedListSize-sortedRank, + nbBitsBaseline, symbol); + } else { + HUF_DEltX2 DElt; + MEM_writeLE16(&(DElt.sequence), symbol); + DElt.nbBits = (BYTE)(nbBits); + DElt.length = 1; + { U32 const end = start + length; + U32 u; + for (u = start; u < end; u++) DTable[u] = DElt; + } } + rankVal[weight] += length; + } +}
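+/* Illustrative note (editor addition, not part of the upstream import):
+ * an X2 cell packs up to two decoded bytes in little-endian order. For
+ * example, if baseSeq carries a first symbol 0x41 ('A') and symbol == 0x42
+ * ('B'), MEM_writeLE16(&DElt.sequence, 0x41 + (0x42 << 8)) stores the bytes
+ * {0x41, 0x42}, so the 2-byte memcpy in HUF_decodeSymbolX2() further below
+ * emits "AB" in one step; DElt.length tells the decoder how far to advance. */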
+ +size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize) +{ + U32 tableLog, maxW, sizeOfSort, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); + U32 const maxTableLog = dtd.maxTableLog; + size_t iSize; + void* dtPtr = DTable+1; /* force compiler to avoid strict-aliasing */ + HUF_DEltX2* const dt = (HUF_DEltX2*)dtPtr; + U32 *rankStart; + + rankValCol_t* rankVal; + U32* rankStats; + U32* rankStart0; + sortedSymbol_t* sortedSymbol; + BYTE* weightList; + size_t spaceUsed32 = 0; + + rankVal = (rankValCol_t *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += (sizeof(rankValCol_t) * HUF_TABLELOG_MAX) >> 2; + rankStats = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 1; + rankStart0 = (U32 *)workSpace + spaceUsed32; + spaceUsed32 += HUF_TABLELOG_MAX + 2; + sortedSymbol = (sortedSymbol_t *)workSpace + (spaceUsed32 * sizeof(U32)) / sizeof(sortedSymbol_t); + spaceUsed32 += HUF_ALIGN(sizeof(sortedSymbol_t) * (HUF_SYMBOLVALUE_MAX + 1), sizeof(U32)) >> 2; + weightList = (BYTE *)((U32 *)workSpace + spaceUsed32); + spaceUsed32 += HUF_ALIGN(HUF_SYMBOLVALUE_MAX + 1, sizeof(U32)) >> 2; + + if ((spaceUsed32 << 2) > wkspSize) return ERROR(tableLog_tooLarge); + + rankStart = rankStart0 + 1; + memset(rankStats, 0, sizeof(U32) * (2 * HUF_TABLELOG_MAX + 2 + 1)); + + DEBUG_STATIC_ASSERT(sizeof(HUF_DEltX2) == sizeof(HUF_DTable)); /* if compiler fails here, assertion is wrong */ + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzers complain ... */ + + iSize = HUF_readStats(weightList, HUF_SYMBOLVALUE_MAX + 1, rankStats, &nbSymbols, &tableLog, src, srcSize); + if (HUF_isError(iSize)) return iSize; + + /* check result */ + if (tableLog > maxTableLog) return ERROR(tableLog_tooLarge); /* DTable can't fit code depth */ + + /* find maxWeight */ + for (maxW = tableLog; rankStats[maxW]==0; maxW--) {} /* necessarily finds a solution before 0 */ + + /* Get start index of each weight */ + { U32 w, nextRankStart = 0; + for (w=1; w<maxW+1; w++) { + U32 current = nextRankStart; + nextRankStart += rankStats[w]; + rankStart[w] = current; + } + rankStart[0] = nextRankStart; /* put all 0w symbols at the end of sorted list*/ + sizeOfSort = nextRankStart; + } + + /* sort symbols by weight */ + { U32 s; + for (s=0; s<nbSymbols; s++) { + U32 const w = weightList[s]; + U32 const r = rankStart[w]++; + sortedSymbol[r].symbol = (BYTE)s; + sortedSymbol[r].weight = (BYTE)w; + } + rankStart[0] = 0; /* forget 0w symbols; this is beginning of weight(1) */ + } + + /* Build rankVal */ + { U32* const rankVal0 = rankVal[0]; + { int const rescale = (maxTableLog-tableLog) - 1; /* tableLog <= maxTableLog */ + U32 nextRankVal = 0; + U32 w; + for (w=1; w<maxW+1; w++) { + U32 current = nextRankVal; + nextRankVal += rankStats[w] << (w+rescale); + rankVal0[w] = current; + } } + { U32 const minBits = tableLog+1 - maxW; + U32 consumed; + for (consumed = minBits; consumed < maxTableLog - minBits + 1; consumed++) { + U32* const rankValPtr = rankVal[consumed]; + U32 w; + for (w = 1; w < maxW+1; w++) { + rankValPtr[w] = rankVal0[w] >> consumed; + } } } } + + HUF_fillDTableX2(dt, maxTableLog, + sortedSymbol, sizeOfSort, + rankStart0, rankVal, maxW, + tableLog+1); + + dtd.tableLog = (BYTE)maxTableLog; + dtd.tableType = 1; + memcpy(DTable, &dtd, sizeof(dtd)); + return iSize; +} + +size_t HUF_readDTableX2(HUF_DTable* DTable, const void* src, size_t srcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_readDTableX2_wksp(DTable, src, srcSize, + workSpace, sizeof(workSpace)); +} + + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 2); + BIT_skipBits(DStream, dt[val].nbBits); + return dt[val].length; +} + +FORCE_INLINE_TEMPLATE U32 +HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, const U32 dtLog) +{ + size_t const val = BIT_lookBitsFast(DStream, dtLog); /* note : dtLog >= 1 */ + memcpy(op, dt+val, 1); + if (dt[val].length==1) BIT_skipBits(DStream, dt[val].nbBits); + else { + if (DStream->bitsConsumed < (sizeof(DStream->bitContainer)*8)) { + BIT_skipBits(DStream, dt[val].nbBits); + if (DStream->bitsConsumed > (sizeof(DStream->bitContainer)*8)) + /* ugly hack; works only because it's the last symbol.
Note : can't easily extract nbBits from just this symbol */ + DStream->bitsConsumed = (sizeof(DStream->bitContainer)*8); + } } + return 1; +} + +#define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ + if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ + if (MEM_64bits()) \ + ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) + +HINT_INLINE size_t +HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, + const HUF_DEltX2* const dt, const U32 dtLog) +{ + BYTE* const pStart = p; + + /* up to 8 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p < pEnd-(sizeof(bitDPtr->bitContainer)-1))) { + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_1(p, bitDPtr); + HUF_DECODE_SYMBOLX2_2(p, bitDPtr); + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + } + + /* closer to end : up to 2 symbols at a time */ + while ((BIT_reloadDStream(bitDPtr) == BIT_DStream_unfinished) & (p <= pEnd-2)) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); + + while (p <= pEnd-2) + HUF_DECODE_SYMBOLX2_0(p, bitDPtr); /* no need to reload : reached the end of DStream */ + + if (p < pEnd) + p += HUF_decodeLastSymbolX2(p, bitDPtr, dt, dtLog); + + return p-pStart; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress1X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + BIT_DStream_t bitD; + + /* Init */ + CHECK_F( BIT_initDStream(&bitD, cSrc, cSrcSize) ); + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); + HUF_decodeStreamX2(ostart, &bitD, oend, dt, dtd.tableLog); + } + + /* check */ + if (!BIT_endOfDStream(&bitD)) return ERROR(corruption_detected); + + /* decoded size */ + return dstSize; +} + +FORCE_INLINE_TEMPLATE size_t +HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; + BYTE* const oend = ostart + dstSize; + BYTE* const olimit = oend - (sizeof(size_t)-1); + const void* const dtPtr = DTable+1; + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + + /* Init */ + BIT_DStream_t bitD1; + BIT_DStream_t bitD2; + BIT_DStream_t bitD3; + BIT_DStream_t bitD4; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); + size_t const length4 = cSrcSize - (length1 + length2 + length3 + 6); + const BYTE* const istart1 = istart + 6; /* jumpTable */ + const BYTE* const istart2 = istart1 + length1; + const BYTE* const istart3 = istart2 + length2; + const BYTE* const istart4 = istart3 + length3; + size_t const segmentSize = (dstSize+3) / 4; + BYTE* const opStart2 = ostart + segmentSize; + BYTE* const opStart3 = opStart2 + segmentSize; + BYTE* const opStart4 = opStart3 + segmentSize; + BYTE* op1 = ostart; + BYTE* op2 = opStart2; + BYTE* op3 = opStart3; + BYTE* op4 = opStart4; + U32 endSignal = 1; + DTableDesc const dtd = 
HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); + CHECK_F( BIT_initDStream(&bitD4, istart4, length4) ); + + /* 16-32 symbols per loop (4-8 symbols per stream) */ + for ( ; (endSignal) & (op4 < olimit); ) { +#if defined(__clang__) && (defined(__x86_64__) || defined(__i386__)) + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + endSignal &= BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished; + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal &= BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished; + endSignal &= BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished; +#else + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_1(op1, &bitD1); + HUF_DECODE_SYMBOLX2_1(op2, &bitD2); + HUF_DECODE_SYMBOLX2_1(op3, &bitD3); + HUF_DECODE_SYMBOLX2_1(op4, &bitD4); + HUF_DECODE_SYMBOLX2_2(op1, &bitD1); + HUF_DECODE_SYMBOLX2_2(op2, &bitD2); + HUF_DECODE_SYMBOLX2_2(op3, &bitD3); + HUF_DECODE_SYMBOLX2_2(op4, &bitD4); + HUF_DECODE_SYMBOLX2_0(op1, &bitD1); + HUF_DECODE_SYMBOLX2_0(op2, &bitD2); + HUF_DECODE_SYMBOLX2_0(op3, &bitD3); + HUF_DECODE_SYMBOLX2_0(op4, &bitD4); + endSignal = (U32)LIKELY( + (BIT_reloadDStreamFast(&bitD1) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD2) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD3) == BIT_DStream_unfinished) + & (BIT_reloadDStreamFast(&bitD4) == BIT_DStream_unfinished)); +#endif + } + + /* check corruption */ + if (op1 > opStart2) return ERROR(corruption_detected); + if (op2 > opStart3) return ERROR(corruption_detected); + if (op3 > opStart4) return ERROR(corruption_detected); + /* note : op4 already verified within main loop */ + + /* finish bitStreams one by one */ + HUF_decodeStreamX2(op1, &bitD1, opStart2, dt, dtLog); + HUF_decodeStreamX2(op2, &bitD2, opStart3, dt, dtLog); + HUF_decodeStreamX2(op3, &bitD3, opStart4, dt, dtLog); + HUF_decodeStreamX2(op4, &bitD4, oend, dt, dtLog); + + /* check */ + { U32 const endCheck = BIT_endOfDStream(&bitD1) & BIT_endOfDStream(&bitD2) & BIT_endOfDStream(&bitD3) & BIT_endOfDStream(&bitD4); + if (!endCheck) return ERROR(corruption_detected); } + + /* decoded size */ + return dstSize; + } +} + +HUF_DGEN(HUF_decompress1X2_usingDTable_internal) +HUF_DGEN(HUF_decompress4X2_usingDTable_internal) + +size_t HUF_decompress1X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) return ERROR(GENERIC); + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +size_t 
HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +} + + +size_t HUF_decompress1X2_DCtx(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X2_DCtx_wksp(DCtx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress1X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} + +size_t HUF_decompress4X2_usingDTable( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc dtd = HUF_getDTableDesc(DTable); + if (dtd.tableType != 1) return ERROR(GENERIC); + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +} + +static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, + workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} + +size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +} + + +size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + +size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + HUF_CREATE_STATIC_DTABLEX2(DTable, HUF_TABLELOG_MAX); + return HUF_decompress4X2_DCtx(DTable, dst, dstSize, cSrc, cSrcSize); +} + +#endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +/* ***********************************/ +/* Universal decompression selectors */ +/* ***********************************/ + +size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#else + return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#endif +} + +size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, + const void* cSrc, size_t cSrcSize, + const HUF_DTable* DTable) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#else + return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +#endif +} + + +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) +typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +static const algo_time_t algoTime[16 /* Quantization */][3 /* single, double, quad */] = +{ + /* single, double, quad */ + {{0,0}, {1,1}, {2,2}}, /* Q==0 : impossible */ + {{0,0}, {1,1}, {2,2}}, /* Q==1 : impossible */ + {{ 38,130}, {1313, 74}, {2151, 38}}, /* Q == 2 : 12-18% */ + {{ 448,128}, {1353, 74}, {2238, 41}}, /* Q == 3 : 18-25% */ + {{ 556,128}, {1353, 74}, {2238, 47}}, /* Q == 4 : 25-32% */ + {{ 714,128}, {1418, 74}, {2436, 53}}, /* Q == 5 : 32-38% */ + {{ 883,128}, {1437, 74}, {2464, 61}}, /* Q == 6 : 38-44% */ + {{ 897,128}, {1515, 75}, {2622, 68}}, /* Q == 7 : 44-50% */ + {{ 926,128}, {1613, 75}, {2730, 75}}, /* Q == 8 : 50-56% */ + {{ 947,128}, {1729, 77}, {3359, 77}}, /* Q == 9 : 56-62% */ + {{1107,128}, {2083, 81}, {4006, 84}}, /* Q ==10 : 62-69% */ + {{1177,128}, {2379, 87}, {4785, 88}}, /* Q ==11 : 69-75% */ + {{1242,128}, {2415, 93}, {5155, 84}}, /* Q ==12 : 75-81% */ + {{1349,128}, {2644,106}, {5260,106}}, /* Q ==13 : 81-87% */ + {{1455,128}, {2422,124}, {4174,124}}, /* Q ==14 : 87-93% */ + {{ 722,128}, {1891,145}, {1936,146}}, /* Q ==15 : 93-99% */ +}; +#endif + +/** HUF_selectDecoder() : + * Tells which decoder is likely to decode faster, + * based on a set of pre-computed metrics. + * @return : 0==HUF_decompress4X1, 1==HUF_decompress4X2 . + * Assumption : 0 < dstSize <= 128 KB */ +U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) +{ + assert(dstSize > 0); + assert(dstSize <= 128*1024); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dstSize; + (void)cSrcSize; + return 0; +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dstSize; + (void)cSrcSize; + return 1; +#else + /* decoder timing evaluation */ + { U32 const Q = (cSrcSize >= dstSize) ? 
15 : (U32)(cSrcSize * 16 / dstSize); /* Q < 16 */ + U32 const D256 = (U32)(dstSize >> 8); + U32 const DTime0 = algoTime[Q][0].tableTime + (algoTime[Q][0].decode256Time * D256); + U32 DTime1 = algoTime[Q][1].tableTime + (algoTime[Q][1].decode256Time * D256); + DTime1 += DTime1 >> 3; /* advantage to algorithm using less memory, to reduce cache eviction */ + return DTime1 < DTime0; + } +#endif +} + + +typedef size_t (*decompressionAlgo)(void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); + +size_t HUF_decompress (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ +#if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + static const decompressionAlgo decompress[2] = { HUF_decompress4X1, HUF_decompress4X2 }; +#endif + + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1(dst, dstSize, cSrc, cSrcSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2(dst, dstSize, cSrc, cSrcSize); +#else + return decompress[algoNb](dst, dstSize, cSrc, cSrcSize); +#endif + } +} + +size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize); +#else + return algoNb ? HUF_decompress4X2_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) : + HUF_decompress4X1_DCtx(dctx, dst, dstSize, cSrc, cSrcSize) ; +#endif + } +} + +size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress4X_hufOnly_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + + +size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, + size_t dstSize, const void* cSrc, + size_t cSrcSize, void* workSpace, + size_t wkspSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#else + return algoNb ? 
HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +#endif + } +} + +size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, + void* workSpace, size_t wkspSize) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize > dstSize) return ERROR(corruption_detected); /* invalid */ + if (cSrcSize == dstSize) { memcpy(dst, cSrc, dstSize); return dstSize; } /* not compressed */ + if (cSrcSize == 1) { memset(dst, *(const BYTE*)cSrc, dstSize); return dstSize; } /* RLE */ + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#else + return algoNb ? HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, + cSrcSize, workSpace, wkspSize); +#endif + } +} + +size_t HUF_decompress1X_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize) +{ + U32 workSpace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; + return HUF_decompress1X_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, + workSpace, sizeof(workSpace)); +} + + +size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#else + return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#endif +} + +#ifndef HUF_FORCE_DECOMPRESS_X2 +size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + + return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +} +#endif + +size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) +{ + DTableDesc const dtd = HUF_getDTableDesc(DTable); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); + return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); + return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#else + return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : + HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); +#endif +} + +size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) +{ + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); + if (cSrcSize == 0) return ERROR(corruption_detected); + + { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +#if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#else + return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : + HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); +#endif + } +} +/**** ended inlining decompress/huf_decompress.c ****/ +/**** start inlining decompress/zstd_ddict.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* zstd_ddict.c : + * concentrates all logic that needs to know the internals of ZSTD_DDict object */ + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include <string.h> /* memcpy, memmove, memset */ +/**** skipping file: ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** start inlining zstd_decompress_internal.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses.
+ */ + + +/* zstd_decompress_internal: + * objects and definitions shared within lib/decompress modules */ + + #ifndef ZSTD_DECOMPRESS_INTERNAL_H + #define ZSTD_DECOMPRESS_INTERNAL_H + + +/*-******************************************************* + * Dependencies + *********************************************************/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ + + + +/*-******************************************************* + * Constants + *********************************************************/ +static const U32 LL_base[MaxLL+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 18, 20, 22, 24, 28, 32, 40, + 48, 64, 0x80, 0x100, 0x200, 0x400, 0x800, 0x1000, + 0x2000, 0x4000, 0x8000, 0x10000 }; + +static const U32 OF_base[MaxOff+1] = { + 0, 1, 1, 5, 0xD, 0x1D, 0x3D, 0x7D, + 0xFD, 0x1FD, 0x3FD, 0x7FD, 0xFFD, 0x1FFD, 0x3FFD, 0x7FFD, + 0xFFFD, 0x1FFFD, 0x3FFFD, 0x7FFFD, 0xFFFFD, 0x1FFFFD, 0x3FFFFD, 0x7FFFFD, + 0xFFFFFD, 0x1FFFFFD, 0x3FFFFFD, 0x7FFFFFD, 0xFFFFFFD, 0x1FFFFFFD, 0x3FFFFFFD, 0x7FFFFFFD }; + +static const U32 OF_bits[MaxOff+1] = { + 0, 1, 2, 3, 4, 5, 6, 7, + 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, + 24, 25, 26, 27, 28, 29, 30, 31 }; + +static const U32 ML_base[MaxML+1] = { + 3, 4, 5, 6, 7, 8, 9, 10, + 11, 12, 13, 14, 15, 16, 17, 18, + 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, + 35, 37, 39, 41, 43, 47, 51, 59, + 67, 83, 99, 0x83, 0x103, 0x203, 0x403, 0x803, + 0x1003, 0x2003, 0x4003, 0x8003, 0x10003 }; + + +/*-******************************************************* + * Decompression types + *********************************************************/ + typedef struct { + U32 fastMode; + U32 tableLog; + } ZSTD_seqSymbol_header; + + typedef struct { + U16 nextState; + BYTE nbAdditionalBits; + BYTE nbBits; + U32 baseValue; + } ZSTD_seqSymbol; + + #define SEQSYMBOL_TABLE_SIZE(log) (1 + (1 << (log))) + +typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ + HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; +} ZSTD_entropyDTables_t; + +typedef enum { ZSTDds_getFrameHeaderSize, ZSTDds_decodeFrameHeader, + ZSTDds_decodeBlockHeader, ZSTDds_decompressBlock, + ZSTDds_decompressLastBlock, ZSTDds_checkChecksum, + ZSTDds_decodeSkippableHeader, ZSTDds_skipFrame } ZSTD_dStage; + +typedef enum { zdss_init=0, zdss_loadHeader, + zdss_read, zdss_load, zdss_flush } ZSTD_dStreamStage; + +typedef enum { + ZSTD_use_indefinitely = -1, /* Use the dictionary indefinitely */ + ZSTD_dont_use = 0, /* Do not use the dictionary (if one exists free it) */ + ZSTD_use_once = 1 /* Use the dictionary once and set to ZSTD_dont_use */ +} ZSTD_dictUses_e; + +typedef enum { + ZSTD_obm_buffered = 0, /* Buffer the output */ + ZSTD_obm_stable = 1 /* ZSTD_outBuffer is stable */ +} ZSTD_outBufferMode_e; + +struct ZSTD_DCtx_s +{ + const ZSTD_seqSymbol* LLTptr; + const ZSTD_seqSymbol* MLTptr; + const ZSTD_seqSymbol* OFTptr; + const HUF_DTable* HUFptr; + ZSTD_entropyDTables_t entropy; + U32 workspace[HUF_DECOMPRESS_WORKSPACE_SIZE_U32]; /* space needed when building huffman tables */ + const void* previousDstEnd; 
/* detect continuity */ + const void* prefixStart; /* start of current segment */ + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; + ZSTD_frameHeader fParams; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ + ZSTD_dStage stage; + U32 litEntropy; + U32 fseEntropy; + XXH64_state_t xxhState; + size_t headerSize; + ZSTD_format_e format; + const BYTE* litPtr; + ZSTD_customMem customMem; + size_t litSize; + size_t rleSize; + size_t staticSize; + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + + /* dictionary */ + ZSTD_DDict* ddictLocal; + const ZSTD_DDict* ddict; /* set by ZSTD_initDStream_usingDDict(), or ZSTD_DCtx_refDDict() */ + U32 dictID; + int ddictIsCold; /* if == 1 : dictionary is "new" for working context, and presumed "cold" (not in cpu cache) */ + ZSTD_dictUses_e dictUses; + + /* streaming */ + ZSTD_dStreamStage streamStage; + char* inBuff; + size_t inBuffSize; + size_t inPos; + size_t maxWindowSize; + char* outBuff; + size_t outBuffSize; + size_t outStart; + size_t outEnd; + size_t lhSize; + void* legacyContext; + U32 previousLegacyVersion; + U32 legacyVersion; + U32 hostageByte; + int noForwardProgress; + ZSTD_outBufferMode_e outBufferMode; + ZSTD_outBuffer expectedOutBuffer; + + /* workspace */ + BYTE litBuffer[ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH]; + BYTE headerBuffer[ZSTD_FRAMEHEADERSIZE_MAX]; + + size_t oversizedDuration; + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + void const* dictContentBeginForFuzzing; + void const* dictContentEndForFuzzing; +#endif +}; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + +/*-******************************************************* + * Shared internal functions + *********************************************************/ + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. + * @return : size of dictionary header (size of magic number + dict ID + entropy tables) */ +size_t ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize); + +/*! ZSTD_checkContinuity() : + * check if next `dst` follows previous position, where decompression ended. + * If yes, do nothing (continue on current segment). + * If not, classify previous segment as "external dictionary", and start a new segment. + * This function cannot fail. */ +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst); + + +#endif /* ZSTD_DECOMPRESS_INTERNAL_H */ +/**** ended inlining zstd_decompress_internal.h ****/ +/**** start inlining zstd_ddict.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + +#ifndef ZSTD_DDICT_H +#define ZSTD_DDICT_H + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include <stddef.h> /* size_t */ +/**** skipping file: ../zstd.h ****/ + + +/*-******************************************************* + * Interface + *********************************************************/ + +/* note: several prototypes are already published in `zstd.h` : + * ZSTD_createDDict() + * ZSTD_createDDict_byReference() + * ZSTD_createDDict_advanced() + * ZSTD_freeDDict() + * ZSTD_initStaticDDict() + * ZSTD_sizeof_DDict() + * ZSTD_estimateDDictSize() + * ZSTD_getDictID_fromDict() + */ + +const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict); +size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict); + +void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict); + + + +#endif /* ZSTD_DDICT_H */ +/**** ended inlining zstd_ddict.h ****/
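+/* Illustrative sketch (editor addition, not part of the upstream import):
+ * a typical reuse pattern for the public DDict interface published in
+ * zstd.h and listed above. Error handling is abbreviated, and `dictBuf` /
+ * `dictSize` are assumed to hold a valid dictionary. Kept disabled so it
+ * cannot affect the build. */
+#if 0
+static size_t example_decompress_with_ddict(void* dst, size_t dstCap,
+                                            const void* src, size_t srcSize,
+                                            const void* dictBuf, size_t dictSize)
+{
+    ZSTD_DDict* const ddict = ZSTD_createDDict(dictBuf, dictSize); /* digest once */
+    ZSTD_DCtx*  const dctx  = ZSTD_createDCtx();
+    size_t const dSize = ZSTD_decompress_usingDDict(dctx, dst, dstCap,
+                                                    src, srcSize, ddict);
+    ZSTD_freeDCtx(dctx);
+    ZSTD_freeDDict(ddict);  /* in real use, keep the DDict across many frames */
+    return dSize;           /* caller should check with ZSTD_isError() */
+}
+#endif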
+ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +/**** start inlining ../legacy/zstd_legacy.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_LEGACY_H +#define ZSTD_LEGACY_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +/**** skipping file: ../common/mem.h ****/ +/**** skipping file: ../common/error_private.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ + +#if !defined (ZSTD_LEGACY_SUPPORT) || (ZSTD_LEGACY_SUPPORT == 0) +# undef ZSTD_LEGACY_SUPPORT +# define ZSTD_LEGACY_SUPPORT 8 +#endif + +#if (ZSTD_LEGACY_SUPPORT <= 1) +/**** start inlining zstd_v01.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V01_H_28739879432 +#define ZSTD_V01_H_28739879432 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include <stddef.h> /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv01_decompress() : decompress ZSTD frames compliant with v0.1.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. + return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv01_isError()) +*/ +size_t ZSTDv01_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv01_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.1.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv01_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ +void ZSTDv01_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/** +ZSTDv01_isError() : tells if the result of ZSTDv01_decompress() is an error +*/ +unsigned ZSTDv01_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv01_Dctx_s ZSTDv01_Dctx; +ZSTDv01_Dctx* ZSTDv01_createDCtx(void); +size_t ZSTDv01_freeDCtx(ZSTDv01_Dctx* dctx); + +size_t ZSTDv01_decompressDCtx(void* ctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + +/* ************************************* +* Streaming functions +***************************************/ +size_t ZSTDv01_resetDCtx(ZSTDv01_Dctx* dctx); + +size_t ZSTDv01_nextSrcSizeToDecompress(ZSTDv01_Dctx* dctx); +size_t ZSTDv01_decompressContinue(ZSTDv01_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternatively. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv01_magicNumber 0xFD2FB51E /* Big Endian version */ +#define ZSTDv01_magicNumberLE 0x1EB52FFD /* Little Endian version */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V01_H_28739879432 */ +/**** ended inlining zstd_v01.h ****/ +#endif
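+/* Illustrative sketch (editor addition, not part of the upstream import):
+ * the alternating streaming pattern documented above for the v0.1 decoder.
+ * `read_exact()` is a hypothetical helper that fills a buffer from the
+ * caller's input source; the buffer size is assumed to cover one block plus
+ * headers. Kept disabled so it cannot affect the build. */
+#if 0
+static size_t example_stream_v01(ZSTDv01_Dctx* dctx, BYTE* dst, size_t dstCap)
+{
+    BYTE inBuf[128*1024 + 16];   /* assumed large enough for one block + headers */
+    size_t produced = 0;
+    size_t toRead;
+    while ((toRead = ZSTDv01_nextSrcSizeToDecompress(dctx)) != 0) {
+        size_t r;
+        read_exact(inBuf, toRead);                     /* hypothetical I/O helper */
+        r = ZSTDv01_decompressContinue(dctx, dst + produced,
+                                       dstCap - produced, inBuf, toRead);
+        if (ZSTDv01_isError(r)) return r;
+        produced += r;                                 /* may be 0 for header steps */
+    }
+    return produced;
+}
+#endif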
+#if (ZSTD_LEGACY_SUPPORT <= 2) +/**** start inlining zstd_v02.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V02_H_4174539423 +#define ZSTD_V02_H_4174539423 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include <stddef.h> /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv02_decompress() : decompress ZSTD frames compliant with v0.2.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. + return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv01_isError()) +*/ +size_t ZSTDv02_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv02_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.2.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv01_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ +void ZSTDv02_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/** +ZSTDv02_isError() : tells if the result of ZSTDv02_decompress() is an error +*/ +unsigned ZSTDv02_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv02_Dctx_s ZSTDv02_Dctx; +ZSTDv02_Dctx* ZSTDv02_createDCtx(void); +size_t ZSTDv02_freeDCtx(ZSTDv02_Dctx* dctx); + +size_t ZSTDv02_decompressDCtx(void* ctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + +/* ************************************* +* Streaming functions +***************************************/ +size_t ZSTDv02_resetDCtx(ZSTDv02_Dctx* dctx); + +size_t ZSTDv02_nextSrcSizeToDecompress(ZSTDv02_Dctx* dctx); +size_t ZSTDv02_decompressContinue(ZSTDv02_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternatively. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv02_magicNumber 0xFD2FB522 /* v0.2 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V02_H_4174539423 */ +/**** ended inlining zstd_v02.h ****/ +#endif
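+/* Illustrative sketch (editor addition, not part of the upstream import):
+ * using the two output parameters of ZSTDv02_findFrameSizeInfoLegacy(),
+ * documented above, to size a destination buffer before decompressing.
+ * Allocation and error paths are elided. Kept disabled. */
+#if 0
+static void example_sizes_v02(const void* src, size_t srcSize)
+{
+    size_t cSize;                /* compressed bytes consumed by this frame */
+    unsigned long long dBound;   /* upper bound on the decompressed size */
+    ZSTDv02_findFrameSizeInfoLegacy(src, srcSize, &cSize, &dBound);
+    if (ZSTDv02_isError(cSize) || dBound == ZSTD_CONTENTSIZE_ERROR) return;
+    /* allocate dBound bytes, then: ZSTDv02_decompress(dst, dBound, src, cSize) */
+}
+#endif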
+#if (ZSTD_LEGACY_SUPPORT <= 3) +/**** start inlining zstd_v03.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V03_H_298734209782 +#define ZSTD_V03_H_298734209782 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include <stddef.h> /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv03_decompress() : decompress ZSTD frames compliant with v0.3.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. + return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv01_isError()) +*/ +size_t ZSTDv03_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv03_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.3.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv01_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ + void ZSTDv03_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + + /** +ZSTDv03_isError() : tells if the result of ZSTDv03_decompress() is an error +*/ +unsigned ZSTDv03_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv03_Dctx_s ZSTDv03_Dctx; +ZSTDv03_Dctx* ZSTDv03_createDCtx(void); +size_t ZSTDv03_freeDCtx(ZSTDv03_Dctx* dctx); + +size_t ZSTDv03_decompressDCtx(void* ctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + +/* ************************************* +* Streaming functions +***************************************/ +size_t ZSTDv03_resetDCtx(ZSTDv03_Dctx* dctx); + +size_t ZSTDv03_nextSrcSizeToDecompress(ZSTDv03_Dctx* dctx); +size_t ZSTDv03_decompressContinue(ZSTDv03_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternatively. + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. + Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv03_magicNumber 0xFD2FB523 /* v0.3 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V03_H_298734209782 */ +/**** ended inlining zstd_v03.h ****/ +#endif
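+/* Illustrative sketch (editor addition, not part of the upstream import):
+ * dispatching on the per-version magic numbers defined by these headers, in
+ * the spirit of ZSTD_isLegacy() in zstd_legacy.h; only v0.1-v0.3 are shown
+ * here. MEM_readLE32() comes from the mem.h inlined earlier in this file.
+ * Kept disabled. */
+#if 0
+static unsigned example_legacy_version(const void* src, size_t srcSize)
+{
+    U32 magic;
+    if (srcSize < 4) return 0;
+    magic = MEM_readLE32(src);              /* frames start with an LE magic */
+    if (magic == ZSTDv01_magicNumberLE) return 1;
+    if (magic == ZSTDv02_magicNumber)   return 2;
+    if (magic == ZSTDv03_magicNumber)   return 3;
+    return 0;                               /* not a known legacy frame */
+}
+#endif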
+*/ + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv03_magicNumber 0xFD2FB523 /* v0.3 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V03_H_298734209782 */ +/**** ended inlining zstd_v03.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 4) +/**** start inlining zstd_v04.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTD_V04_H_91868324769238 +#define ZSTD_V04_H_91868324769238 + +#if defined (__cplusplus) +extern "C" { +#endif + +/* ************************************* +* Includes +***************************************/ +#include /* size_t */ + + +/* ************************************* +* Simple one-step function +***************************************/ +/** +ZSTDv04_decompress() : decompress ZSTD frames compliant with v0.4.x format + compressedSize : is the exact source size + maxOriginalSize : is the size of the 'dst' buffer, which must be already allocated. + It must be equal or larger than originalSize, otherwise decompression will fail. + return : the number of bytes decompressed into destination buffer (originalSize) + or an errorCode if it fails (which can be tested using ZSTDv01_isError()) +*/ +size_t ZSTDv04_decompress( void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + /** + ZSTDv04_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.4.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv01_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ + void ZSTDv04_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/** +ZSTDv04_isError() : tells if the result of ZSTDv04_decompress() is an error +*/ +unsigned ZSTDv04_isError(size_t code); + + +/* ************************************* +* Advanced functions +***************************************/ +typedef struct ZSTDv04_Dctx_s ZSTDv04_Dctx; +ZSTDv04_Dctx* ZSTDv04_createDCtx(void); +size_t ZSTDv04_freeDCtx(ZSTDv04_Dctx* dctx); + +size_t ZSTDv04_decompressDCtx(ZSTDv04_Dctx* dctx, + void* dst, size_t maxOriginalSize, + const void* src, size_t compressedSize); + + +/* ************************************* +* Direct Streaming +***************************************/ +size_t ZSTDv04_resetDCtx(ZSTDv04_Dctx* dctx); + +size_t ZSTDv04_nextSrcSizeToDecompress(ZSTDv04_Dctx* dctx); +size_t ZSTDv04_decompressContinue(ZSTDv04_Dctx* dctx, void* dst, size_t maxDstSize, const void* src, size_t srcSize); +/** + Use above functions alternatively. + ZSTD_nextSrcSizeToDecompress() tells how much bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() will use previous data blocks to improve compression if they are located prior to current block. 
+ Result is the number of bytes regenerated within 'dst'. + It can be zero, which is not an error; it just means ZSTD_decompressContinue() has decoded some header. +*/ + + +/* ************************************* +* Buffered Streaming +***************************************/ +typedef struct ZBUFFv04_DCtx_s ZBUFFv04_DCtx; +ZBUFFv04_DCtx* ZBUFFv04_createDCtx(void); +size_t ZBUFFv04_freeDCtx(ZBUFFv04_DCtx* dctx); + +size_t ZBUFFv04_decompressInit(ZBUFFv04_DCtx* dctx); +size_t ZBUFFv04_decompressWithDictionary(ZBUFFv04_DCtx* dctx, const void* dict, size_t dictSize); + +size_t ZBUFFv04_decompressContinue(ZBUFFv04_DCtx* dctx, void* dst, size_t* maxDstSizePtr, const void* src, size_t* srcSizePtr); + +/** ************************************************ +* Streaming decompression +* +* A ZBUFF_DCtx object is required to track streaming operation. +* Use ZBUFF_createDCtx() and ZBUFF_freeDCtx() to create/release resources. +* Use ZBUFF_decompressInit() to start a new decompression operation. +* ZBUFF_DCtx objects can be reused multiple times. +* +* Optionally, a reference to a static dictionary can be set, using ZBUFF_decompressWithDictionary() +* It must be the same content as the one set during compression phase. +* Dictionary content must remain accessible during the decompression process. +* +* Use ZBUFF_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *maxDstSizePtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *maxDstSizePtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of dst will be overwritten (up to *maxDstSizePtr) at each function call, so save its content if it matters or change dst. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to improve latency) +* or 0 when a frame is completely decoded +* or an error code, which can be tested using ZBUFF_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFF_recommendedDInSize / ZBUFF_recommendedDOutSize +* output : ZBUFF_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when it's decoded. +* input : ZBUFF_recommendedDInSize==128Kb+3; just follow indications from ZBUFF_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* **************************************************/ +unsigned ZBUFFv04_isError(size_t errorCode); +const char* ZBUFFv04_getErrorName(size_t errorCode); + + +/** The below functions provide recommended buffer sizes for Compression or Decompression operations. +* These sizes are not compulsory, they just tend to offer better latency */ +size_t ZBUFFv04_recommendedDInSize(void); +size_t ZBUFFv04_recommendedDOutSize(void); + + +/* ************************************* +* Prefix - version detection +***************************************/ +#define ZSTDv04_magicNumber 0xFD2FB524 /* v0.4 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_V04_H_91868324769238 */ +/**** ended inlining zstd_v04.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) +/**** start inlining zstd_v05.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTDv05_H +#define ZSTDv05_H + +#if defined (__cplusplus) +extern "C" { +#endif + +/*-************************************* +* Dependencies +***************************************/ +#include /* size_t */ +/**** skipping file: ../common/mem.h ****/ + + +/* ************************************* +* Simple functions +***************************************/ +/*! ZSTDv05_decompress() : + `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail. + `dstCapacity` must be large enough, equal or larger than originalSize. + @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + or an errorCode if it fails (which can be tested using ZSTDv05_isError()) */ +size_t ZSTDv05_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + + /** + ZSTDv05_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.5.x format + srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src' + cSize (output parameter) : the number of bytes that would be read to decompress this frame + or an error code if it fails (which can be tested using ZSTDv01_isError()) + dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame + or ZSTD_CONTENTSIZE_ERROR if an error occurs + + note : assumes `cSize` and `dBound` are _not_ NULL. + */ +void ZSTDv05_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/* ************************************* +* Helper functions +***************************************/ +/* Error Management */ +unsigned ZSTDv05_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +const char* ZSTDv05_getErrorName(size_t code); /*!< provides readable string for an error code */ + + +/* ************************************* +* Explicit memory management +***************************************/ +/** Decompression context */ +typedef struct ZSTDv05_DCtx_s ZSTDv05_DCtx; +ZSTDv05_DCtx* ZSTDv05_createDCtx(void); +size_t ZSTDv05_freeDCtx(ZSTDv05_DCtx* dctx); /*!< @return : errorCode */ + +/** ZSTDv05_decompressDCtx() : +* Same as ZSTDv05_decompress(), but requires an already allocated ZSTDv05_DCtx (see ZSTDv05_createDCtx()) */ +size_t ZSTDv05_decompressDCtx(ZSTDv05_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*-*********************** +* Simple Dictionary API +*************************/ +/*! ZSTDv05_decompress_usingDict() : +* Decompression using a pre-defined Dictionary content (see dictBuilder). +* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted. 
+* Note : dict can be NULL, in which case, it's equivalent to ZSTDv05_decompressDCtx() */ +size_t ZSTDv05_decompress_usingDict(ZSTDv05_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + +/*-************************ +* Advanced Streaming API +***************************/ +typedef enum { ZSTDv05_fast, ZSTDv05_greedy, ZSTDv05_lazy, ZSTDv05_lazy2, ZSTDv05_btlazy2, ZSTDv05_opt, ZSTDv05_btopt } ZSTDv05_strategy; +typedef struct { + U64 srcSize; + U32 windowLog; /* the only useful information to retrieve */ + U32 contentLog; U32 hashLog; U32 searchLog; U32 searchLength; U32 targetLength; ZSTDv05_strategy strategy; +} ZSTDv05_parameters; +size_t ZSTDv05_getFrameParams(ZSTDv05_parameters* params, const void* src, size_t srcSize); + +size_t ZSTDv05_decompressBegin_usingDict(ZSTDv05_DCtx* dctx, const void* dict, size_t dictSize); +void ZSTDv05_copyDCtx(ZSTDv05_DCtx* dstDCtx, const ZSTDv05_DCtx* srcDCtx); +size_t ZSTDv05_nextSrcSizeToDecompress(ZSTDv05_DCtx* dctx); +size_t ZSTDv05_decompressContinue(ZSTDv05_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*-*********************** +* ZBUFF API +*************************/ +typedef struct ZBUFFv05_DCtx_s ZBUFFv05_DCtx; +ZBUFFv05_DCtx* ZBUFFv05_createDCtx(void); +size_t ZBUFFv05_freeDCtx(ZBUFFv05_DCtx* dctx); + +size_t ZBUFFv05_decompressInit(ZBUFFv05_DCtx* dctx); +size_t ZBUFFv05_decompressInitDictionary(ZBUFFv05_DCtx* dctx, const void* dict, size_t dictSize); + +size_t ZBUFFv05_decompressContinue(ZBUFFv05_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression +* +* A ZBUFFv05_DCtx object is required to track streaming operations. +* Use ZBUFFv05_createDCtx() and ZBUFFv05_freeDCtx() to create/release resources. +* Use ZBUFFv05_decompressInit() to start a new decompression operation, +* or ZBUFFv05_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFFv05_DCtx objects can be reused multiple times. +* +* Use ZBUFFv05_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of @dst will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters or change @dst. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency) +* or 0 when a frame is completely decoded +* or an error code, which can be tested using ZBUFFv05_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFFv05_recommendedDInSize() / ZBUFFv05_recommendedDOutSize() +* output : ZBUFFv05_recommendedDOutSize==128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFFv05_recommendedDInSize==128Kb+3; just follow indications from ZBUFFv05_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . 
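+*
+*  For illustration only (not part of the original notes), a minimal consumption
+*  loop under these rules might look as follows ; `in`/`inSize` and the
+*  writeOut() sink are assumed to be provided by the caller :
+*
+*      ZBUFFv05_DCtx* const zbd = ZBUFFv05_createDCtx();
+*      char out[128 * 1024];
+*      size_t inPos = 0;
+*      ZBUFFv05_decompressInit(zbd);
+*      while (inPos < inSize) {
+*          size_t srcLeft = inSize - inPos;
+*          size_t dstCap  = sizeof(out);
+*          size_t const hint = ZBUFFv05_decompressContinue(zbd, out, &dstCap, in + inPos, &srcLeft);
+*          if (ZBUFFv05_isError(hint)) break;     // decoding error
+*          writeOut(out, dstCap);                 // dstCap now holds bytes produced
+*          inPos += srcLeft;                      // srcLeft now holds bytes consumed
+*          if (hint == 0) break;                  // frame completely decoded
+*      }
+*      ZBUFFv05_freeDCtx(zbd);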
+* *******************************************************************************/
+
+
+/* *************************************
+*  Tool functions
+***************************************/
+unsigned ZBUFFv05_isError(size_t errorCode);
+const char* ZBUFFv05_getErrorName(size_t errorCode);
+
+/** Functions below provide recommended buffer sizes for Compression or Decompression operations.
+*   These sizes are just hints, and tend to offer better latency */
+size_t ZBUFFv05_recommendedDInSize(void);
+size_t ZBUFFv05_recommendedDOutSize(void);
+
+
+
+/*-*************************************
+*  Constants
+***************************************/
+#define ZSTDv05_MAGICNUMBER 0xFD2FB525   /* v0.5 */
+
+
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif  /* ZSTDv05_H */
+/**** ended inlining zstd_v05.h ****/
+#endif
+#if (ZSTD_LEGACY_SUPPORT <= 6)
+/**** start inlining zstd_v06.h ****/
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTDv06_H
+#define ZSTDv06_H
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*======  Dependency  ======*/
+#include <stddef.h>   /* size_t */
+
+
+/*======  Export for Windows  ======*/
+/*!
+*  ZSTDv06_DLL_EXPORT :
+*  Enable exporting of functions when building a Windows DLL
+*/
+#if defined(_WIN32) && defined(ZSTDv06_DLL_EXPORT) && (ZSTDv06_DLL_EXPORT==1)
+#  define ZSTDLIBv06_API __declspec(dllexport)
+#else
+#  define ZSTDLIBv06_API
+#endif
+
+
+/* *************************************
+*  Simple functions
+***************************************/
+/*! ZSTDv06_decompress() :
+    `compressedSize` : is the _exact_ size of the compressed blob, otherwise decompression will fail.
+    `dstCapacity` must be large enough, equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv06_isError()) */
+ZSTDLIBv06_API size_t ZSTDv06_decompress( void* dst, size_t dstCapacity,
+                                          const void* src, size_t compressedSize);
+
+/**
+ZSTDv06_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.6.x format
+    srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    cSize (output parameter)  : the number of bytes that would be read to decompress this frame
+                                or an error code if it fails (which can be tested using ZSTDv06_isError())
+    dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+                                or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+    note : assumes `cSize` and `dBound` are _not_ NULL.
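+
+    For illustration only, a possible call sequence (buffers `src`/`srcSize`
+    assumed in scope ; error handling abbreviated) :
+
+        size_t cSize;
+        unsigned long long dBound;
+        ZSTDv06_findFrameSizeInfoLegacy(src, srcSize, &cSize, &dBound);
+        if (ZSTDv06_isError(cSize) || dBound == ZSTD_CONTENTSIZE_ERROR)
+            return 0;    // frame cannot be sized
+        // otherwise : the frame occupies cSize input bytes
+        // and decompresses to at most dBound bytes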
+*/ +void ZSTDv06_findFrameSizeInfoLegacy(const void *src, size_t srcSize, + size_t* cSize, unsigned long long* dBound); + +/* ************************************* +* Helper functions +***************************************/ +ZSTDLIBv06_API size_t ZSTDv06_compressBound(size_t srcSize); /*!< maximum compressed size (worst case scenario) */ + +/* Error Management */ +ZSTDLIBv06_API unsigned ZSTDv06_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIBv06_API const char* ZSTDv06_getErrorName(size_t code); /*!< provides readable string for an error code */ + + +/* ************************************* +* Explicit memory management +***************************************/ +/** Decompression context */ +typedef struct ZSTDv06_DCtx_s ZSTDv06_DCtx; +ZSTDLIBv06_API ZSTDv06_DCtx* ZSTDv06_createDCtx(void); +ZSTDLIBv06_API size_t ZSTDv06_freeDCtx(ZSTDv06_DCtx* dctx); /*!< @return : errorCode */ + +/** ZSTDv06_decompressDCtx() : +* Same as ZSTDv06_decompress(), but requires an already allocated ZSTDv06_DCtx (see ZSTDv06_createDCtx()) */ +ZSTDLIBv06_API size_t ZSTDv06_decompressDCtx(ZSTDv06_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + +/*-*********************** +* Dictionary API +*************************/ +/*! ZSTDv06_decompress_usingDict() : +* Decompression using a pre-defined Dictionary content (see dictBuilder). +* Dictionary must be identical to the one used during compression, otherwise regenerated data will be corrupted. +* Note : dict can be NULL, in which case, it's equivalent to ZSTDv06_decompressDCtx() */ +ZSTDLIBv06_API size_t ZSTDv06_decompress_usingDict(ZSTDv06_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict,size_t dictSize); + + +/*-************************ +* Advanced Streaming API +***************************/ +struct ZSTDv06_frameParams_s { unsigned long long frameContentSize; unsigned windowLog; }; +typedef struct ZSTDv06_frameParams_s ZSTDv06_frameParams; + +ZSTDLIBv06_API size_t ZSTDv06_getFrameParams(ZSTDv06_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */ +ZSTDLIBv06_API size_t ZSTDv06_decompressBegin_usingDict(ZSTDv06_DCtx* dctx, const void* dict, size_t dictSize); +ZSTDLIBv06_API void ZSTDv06_copyDCtx(ZSTDv06_DCtx* dctx, const ZSTDv06_DCtx* preparedDCtx); + +ZSTDLIBv06_API size_t ZSTDv06_nextSrcSizeToDecompress(ZSTDv06_DCtx* dctx); +ZSTDLIBv06_API size_t ZSTDv06_decompressContinue(ZSTDv06_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + + +/* ************************************* +* ZBUFF API +***************************************/ + +typedef struct ZBUFFv06_DCtx_s ZBUFFv06_DCtx; +ZSTDLIBv06_API ZBUFFv06_DCtx* ZBUFFv06_createDCtx(void); +ZSTDLIBv06_API size_t ZBUFFv06_freeDCtx(ZBUFFv06_DCtx* dctx); + +ZSTDLIBv06_API size_t ZBUFFv06_decompressInit(ZBUFFv06_DCtx* dctx); +ZSTDLIBv06_API size_t ZBUFFv06_decompressInitDictionary(ZBUFFv06_DCtx* dctx, const void* dict, size_t dictSize); + +ZSTDLIBv06_API size_t ZBUFFv06_decompressContinue(ZBUFFv06_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression howto +* +* A ZBUFFv06_DCtx object is required to track streaming operations. +* Use ZBUFFv06_createDCtx() and ZBUFFv06_freeDCtx() to create/release resources. 
+* Use ZBUFFv06_decompressInit() to start a new decompression operation, +* or ZBUFFv06_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFFv06_DCtx objects can be re-init multiple times. +* +* Use ZBUFFv06_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency), +* or 0 when a frame is completely decoded, +* or an error code, which can be tested using ZBUFFv06_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFFv06_recommendedDInSize() and ZBUFFv06_recommendedDOutSize() +* output : ZBUFFv06_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFFv06_recommendedDInSize == 128KB + 3; +* just follow indications from ZBUFFv06_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* *******************************************************************************/ + + +/* ************************************* +* Tool functions +***************************************/ +ZSTDLIBv06_API unsigned ZBUFFv06_isError(size_t errorCode); +ZSTDLIBv06_API const char* ZBUFFv06_getErrorName(size_t errorCode); + +/** Functions below provide recommended buffer sizes for Compression or Decompression operations. +* These sizes are just hints, they tend to offer better latency */ +ZSTDLIBv06_API size_t ZBUFFv06_recommendedDInSize(void); +ZSTDLIBv06_API size_t ZBUFFv06_recommendedDOutSize(void); + + +/*-************************************* +* Constants +***************************************/ +#define ZSTDv06_MAGICNUMBER 0xFD2FB526 /* v0.6 */ + + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTDv06_BUFFERED_H */ +/**** ended inlining zstd_v06.h ****/ +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) +/**** start inlining zstd_v07.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +#ifndef ZSTDv07_H_235446 +#define ZSTDv07_H_235446 + +#if defined (__cplusplus) +extern "C" { +#endif + +/*====== Dependency ======*/ +#include /* size_t */ + + +/*====== Export for Windows ======*/ +/*! +* ZSTDv07_DLL_EXPORT : +* Enable exporting of functions when building a Windows DLL +*/ +#if defined(_WIN32) && defined(ZSTDv07_DLL_EXPORT) && (ZSTDv07_DLL_EXPORT==1) +# define ZSTDLIBv07_API __declspec(dllexport) +#else +# define ZSTDLIBv07_API +#endif + + +/* ************************************* +* Simple API +***************************************/ +/*! ZSTDv07_getDecompressedSize() : +* @return : decompressed size if known, 0 otherwise. + note 1 : if `0`, follow up with ZSTDv07_getFrameParams() to know precise failure cause. 
+    note 2 : decompressed size could be wrong or intentionally modified !
+             always ensure results fit within application's authorized limits */
+unsigned long long ZSTDv07_getDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTDv07_decompress() :
+    `compressedSize` : must be _exact_ size of compressed input, otherwise decompression will fail.
+    `dstCapacity` must be equal or larger than originalSize.
+    @return : the number of bytes decompressed into `dst` (<= `dstCapacity`),
+              or an errorCode if it fails (which can be tested using ZSTDv07_isError()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompress( void* dst, size_t dstCapacity,
+                                          const void* src, size_t compressedSize);
+
+/**
+ZSTDv07_findFrameSizeInfoLegacy() : get the source length and decompressed bound of a ZSTD frame compliant with v0.7.x format
+    srcSize : The size of the 'src' buffer, at least as large as the frame pointed to by 'src'
+    cSize (output parameter)  : the number of bytes that would be read to decompress this frame
+                                or an error code if it fails (which can be tested using ZSTDv07_isError())
+    dBound (output parameter) : an upper-bound for the decompressed size of the data in the frame
+                                or ZSTD_CONTENTSIZE_ERROR if an error occurs
+
+    note : assumes `cSize` and `dBound` are _not_ NULL.
+*/
+void ZSTDv07_findFrameSizeInfoLegacy(const void *src, size_t srcSize,
+                                     size_t* cSize, unsigned long long* dBound);
+
+/*======  Helper functions  ======*/
+ZSTDLIBv07_API unsigned    ZSTDv07_isError(size_t code);          /*!< tells if a `size_t` function result is an error code */
+ZSTDLIBv07_API const char* ZSTDv07_getErrorName(size_t code);     /*!< provides readable string from an error code */
+
+
+/*-*************************************
+*  Explicit memory management
+***************************************/
+/** Decompression context */
+typedef struct ZSTDv07_DCtx_s ZSTDv07_DCtx;
+ZSTDLIBv07_API ZSTDv07_DCtx* ZSTDv07_createDCtx(void);
+ZSTDLIBv07_API size_t ZSTDv07_freeDCtx(ZSTDv07_DCtx* dctx);      /*!< @return : errorCode */
+
+/** ZSTDv07_decompressDCtx() :
+*   Same as ZSTDv07_decompress(), requires an allocated ZSTDv07_DCtx (see ZSTDv07_createDCtx()) */
+ZSTDLIBv07_API size_t ZSTDv07_decompressDCtx(ZSTDv07_DCtx* ctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+
+/*-************************
+*  Simple dictionary API
+***************************/
+/*! ZSTDv07_decompress_usingDict() :
+*   Decompression using a pre-defined Dictionary content (see dictBuilder).
+*   Dictionary must be identical to the one used during compression.
+*   Note : This function loads the dictionary, resulting in a significant startup time */
+ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDict(ZSTDv07_DCtx* dctx,
+                                                   void* dst, size_t dstCapacity,
+                                                   const void* src, size_t srcSize,
+                                                   const void* dict,size_t dictSize);
+
+
+/*-**************************
+*  Advanced Dictionary API
+****************************/
+/*! ZSTDv07_createDDict() :
+*   Create a digested dictionary, ready to start decompression operation without startup delay.
+*   `dict` can be released after creation */
+typedef struct ZSTDv07_DDict_s ZSTDv07_DDict;
+ZSTDLIBv07_API ZSTDv07_DDict* ZSTDv07_createDDict(const void* dict, size_t dictSize);
+ZSTDLIBv07_API size_t ZSTDv07_freeDDict(ZSTDv07_DDict* ddict);
+
+/*! ZSTDv07_decompress_usingDDict() :
+*   Decompression using a pre-digested Dictionary
+*   Faster startup than ZSTDv07_decompress_usingDict(), recommended when same dictionary is used multiple times.
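*
*   For illustration only, a typical reuse pattern might be (per-frame buffers
*   and `dictBuf`/`dictLen` assumed to be provided by the caller ; error
*   checks abbreviated) :
*
*       ZSTDv07_DDict* const ddict = ZSTDv07_createDDict(dictBuf, dictLen);
*       ZSTDv07_DCtx*  const dctx  = ZSTDv07_createDCtx();
*       for (i = 0; i < nbFrames; i++)
*           ZSTDv07_decompress_usingDDict(dctx, dst[i], dstCap[i], src[i], srcLen[i], ddict);
*       ZSTDv07_freeDCtx(dctx);
*       ZSTDv07_freeDDict(ddict);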
*/ +ZSTDLIBv07_API size_t ZSTDv07_decompress_usingDDict(ZSTDv07_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTDv07_DDict* ddict); + +typedef struct { + unsigned long long frameContentSize; + unsigned windowSize; + unsigned dictID; + unsigned checksumFlag; +} ZSTDv07_frameParams; + +ZSTDLIBv07_API size_t ZSTDv07_getFrameParams(ZSTDv07_frameParams* fparamsPtr, const void* src, size_t srcSize); /**< doesn't consume input */ + + + + +/* ************************************* +* Streaming functions +***************************************/ +typedef struct ZBUFFv07_DCtx_s ZBUFFv07_DCtx; +ZSTDLIBv07_API ZBUFFv07_DCtx* ZBUFFv07_createDCtx(void); +ZSTDLIBv07_API size_t ZBUFFv07_freeDCtx(ZBUFFv07_DCtx* dctx); + +ZSTDLIBv07_API size_t ZBUFFv07_decompressInit(ZBUFFv07_DCtx* dctx); +ZSTDLIBv07_API size_t ZBUFFv07_decompressInitDictionary(ZBUFFv07_DCtx* dctx, const void* dict, size_t dictSize); + +ZSTDLIBv07_API size_t ZBUFFv07_decompressContinue(ZBUFFv07_DCtx* dctx, + void* dst, size_t* dstCapacityPtr, + const void* src, size_t* srcSizePtr); + +/*-*************************************************************************** +* Streaming decompression howto +* +* A ZBUFFv07_DCtx object is required to track streaming operations. +* Use ZBUFFv07_createDCtx() and ZBUFFv07_freeDCtx() to create/release resources. +* Use ZBUFFv07_decompressInit() to start a new decompression operation, +* or ZBUFFv07_decompressInitDictionary() if decompression requires a dictionary. +* Note that ZBUFFv07_DCtx objects can be re-init multiple times. +* +* Use ZBUFFv07_decompressContinue() repetitively to consume your input. +* *srcSizePtr and *dstCapacityPtr can be any size. +* The function will report how many bytes were read or written by modifying *srcSizePtr and *dstCapacityPtr. +* Note that it may not consume the entire input, in which case it's up to the caller to present remaining input again. +* The content of `dst` will be overwritten (up to *dstCapacityPtr) at each function call, so save its content if it matters, or change `dst`. +* @return : a hint to preferred nb of bytes to use as input for next function call (it's only a hint, to help latency), +* or 0 when a frame is completely decoded, +* or an error code, which can be tested using ZBUFFv07_isError(). +* +* Hint : recommended buffer sizes (not compulsory) : ZBUFFv07_recommendedDInSize() and ZBUFFv07_recommendedDOutSize() +* output : ZBUFFv07_recommendedDOutSize== 128 KB block size is the internal unit, it ensures it's always possible to write a full block when decoded. +* input : ZBUFFv07_recommendedDInSize == 128KB + 3; +* just follow indications from ZBUFFv07_decompressContinue() to minimize latency. It should always be <= 128 KB + 3 . +* *******************************************************************************/ + + +/* ************************************* +* Tool functions +***************************************/ +ZSTDLIBv07_API unsigned ZBUFFv07_isError(size_t errorCode); +ZSTDLIBv07_API const char* ZBUFFv07_getErrorName(size_t errorCode); + +/** Functions below provide recommended buffer sizes for Compression or Decompression operations. 
+* These sizes are just hints, they tend to offer better latency */ +ZSTDLIBv07_API size_t ZBUFFv07_recommendedDInSize(void); +ZSTDLIBv07_API size_t ZBUFFv07_recommendedDOutSize(void); + + +/*-************************************* +* Constants +***************************************/ +#define ZSTDv07_MAGICNUMBER 0xFD2FB527 /* v0.7 */ + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTDv07_H_235446 */ +/**** ended inlining zstd_v07.h ****/ +#endif + +/** ZSTD_isLegacy() : + @return : > 0 if supported by legacy decoder. 0 otherwise. + return value is the version. +*/ +MEM_STATIC unsigned ZSTD_isLegacy(const void* src, size_t srcSize) +{ + U32 magicNumberLE; + if (srcSize<4) return 0; + magicNumberLE = MEM_readLE32(src); + switch(magicNumberLE) + { +#if (ZSTD_LEGACY_SUPPORT <= 1) + case ZSTDv01_magicNumberLE:return 1; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 2) + case ZSTDv02_magicNumber : return 2; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 3) + case ZSTDv03_magicNumber : return 3; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 4) + case ZSTDv04_magicNumber : return 4; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case ZSTDv05_MAGICNUMBER : return 5; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case ZSTDv06_MAGICNUMBER : return 6; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case ZSTDv07_MAGICNUMBER : return 7; +#endif + default : return 0; + } +} + + +MEM_STATIC unsigned long long ZSTD_getDecompressedSize_legacy(const void* src, size_t srcSize) +{ + U32 const version = ZSTD_isLegacy(src, srcSize); + if (version < 5) return 0; /* no decompressed size in frame header, or not a legacy format */ +#if (ZSTD_LEGACY_SUPPORT <= 5) + if (version==5) { + ZSTDv05_parameters fParams; + size_t const frResult = ZSTDv05_getFrameParams(&fParams, src, srcSize); + if (frResult != 0) return 0; + return fParams.srcSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + if (version==6) { + ZSTDv06_frameParams fParams; + size_t const frResult = ZSTDv06_getFrameParams(&fParams, src, srcSize); + if (frResult != 0) return 0; + return fParams.frameContentSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + if (version==7) { + ZSTDv07_frameParams fParams; + size_t const frResult = ZSTDv07_getFrameParams(&fParams, src, srcSize); + if (frResult != 0) return 0; + return fParams.frameContentSize; + } +#endif + return 0; /* should not be possible */ +} + + +MEM_STATIC size_t ZSTD_decompressLegacy( + void* dst, size_t dstCapacity, + const void* src, size_t compressedSize, + const void* dict,size_t dictSize) +{ + U32 const version = ZSTD_isLegacy(src, compressedSize); + (void)dst; (void)dstCapacity; (void)dict; (void)dictSize; /* unused when ZSTD_LEGACY_SUPPORT >= 8 */ + switch(version) + { +#if (ZSTD_LEGACY_SUPPORT <= 1) + case 1 : + return ZSTDv01_decompress(dst, dstCapacity, src, compressedSize); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 2) + case 2 : + return ZSTDv02_decompress(dst, dstCapacity, src, compressedSize); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 3) + case 3 : + return ZSTDv03_decompress(dst, dstCapacity, src, compressedSize); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : + return ZSTDv04_decompress(dst, dstCapacity, src, compressedSize); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : + { size_t result; + ZSTDv05_DCtx* const zd = ZSTDv05_createDCtx(); + if (zd==NULL) return ERROR(memory_allocation); + result = ZSTDv05_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize); + ZSTDv05_freeDCtx(zd); + return result; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : + { size_t result; + ZSTDv06_DCtx* const zd 
= ZSTDv06_createDCtx(); + if (zd==NULL) return ERROR(memory_allocation); + result = ZSTDv06_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize); + ZSTDv06_freeDCtx(zd); + return result; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : + { size_t result; + ZSTDv07_DCtx* const zd = ZSTDv07_createDCtx(); + if (zd==NULL) return ERROR(memory_allocation); + result = ZSTDv07_decompress_usingDict(zd, dst, dstCapacity, src, compressedSize, dict, dictSize); + ZSTDv07_freeDCtx(zd); + return result; + } +#endif + default : + return ERROR(prefix_unknown); + } +} + +MEM_STATIC ZSTD_frameSizeInfo ZSTD_findFrameSizeInfoLegacy(const void *src, size_t srcSize) +{ + ZSTD_frameSizeInfo frameSizeInfo; + U32 const version = ZSTD_isLegacy(src, srcSize); + switch(version) + { +#if (ZSTD_LEGACY_SUPPORT <= 1) + case 1 : + ZSTDv01_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 2) + case 2 : + ZSTDv02_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 3) + case 3 : + ZSTDv03_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : + ZSTDv04_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : + ZSTDv05_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : + ZSTDv06_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : + ZSTDv07_findFrameSizeInfoLegacy(src, srcSize, + &frameSizeInfo.compressedSize, + &frameSizeInfo.decompressedBound); + break; +#endif + default : + frameSizeInfo.compressedSize = ERROR(prefix_unknown); + frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; + break; + } + if (!ZSTD_isError(frameSizeInfo.compressedSize) && frameSizeInfo.compressedSize > srcSize) { + frameSizeInfo.compressedSize = ERROR(srcSize_wrong); + frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; + } + return frameSizeInfo; +} + +MEM_STATIC size_t ZSTD_findFrameCompressedSizeLegacy(const void *src, size_t srcSize) +{ + ZSTD_frameSizeInfo frameSizeInfo = ZSTD_findFrameSizeInfoLegacy(src, srcSize); + return frameSizeInfo.compressedSize; +} + +MEM_STATIC size_t ZSTD_freeLegacyStreamContext(void* legacyContext, U32 version) +{ + switch(version) + { + default : + case 1 : + case 2 : + case 3 : + (void)legacyContext; + return ERROR(version_unsupported); +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : return ZBUFFv04_freeDCtx((ZBUFFv04_DCtx*)legacyContext); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : return ZBUFFv05_freeDCtx((ZBUFFv05_DCtx*)legacyContext); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : return ZBUFFv06_freeDCtx((ZBUFFv06_DCtx*)legacyContext); +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : return ZBUFFv07_freeDCtx((ZBUFFv07_DCtx*)legacyContext); +#endif + } +} + + +MEM_STATIC size_t ZSTD_initLegacyStream(void** legacyContext, U32 prevVersion, U32 newVersion, + const void* dict, size_t dictSize) +{ + DEBUGLOG(5, "ZSTD_initLegacyStream for v0.%u", newVersion); + if (prevVersion != newVersion) 
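        /* A version change invalidates any previously created ZBUFFv0x context :
         * it is released here before a context matching the new version is set up. */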
ZSTD_freeLegacyStreamContext(*legacyContext, prevVersion); + switch(newVersion) + { + default : + case 1 : + case 2 : + case 3 : + (void)dict; (void)dictSize; + return 0; +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : + { + ZBUFFv04_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv04_createDCtx() : (ZBUFFv04_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv04_decompressInit(dctx); + ZBUFFv04_decompressWithDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : + { + ZBUFFv05_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv05_createDCtx() : (ZBUFFv05_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv05_decompressInitDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : + { + ZBUFFv06_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv06_createDCtx() : (ZBUFFv06_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv06_decompressInitDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : + { + ZBUFFv07_DCtx* dctx = (prevVersion != newVersion) ? ZBUFFv07_createDCtx() : (ZBUFFv07_DCtx*)*legacyContext; + if (dctx==NULL) return ERROR(memory_allocation); + ZBUFFv07_decompressInitDictionary(dctx, dict, dictSize); + *legacyContext = dctx; + return 0; + } +#endif + } +} + + + +MEM_STATIC size_t ZSTD_decompressLegacyStream(void* legacyContext, U32 version, + ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + DEBUGLOG(5, "ZSTD_decompressLegacyStream for v0.%u", version); + switch(version) + { + default : + case 1 : + case 2 : + case 3 : + (void)legacyContext; (void)output; (void)input; + return ERROR(version_unsupported); +#if (ZSTD_LEGACY_SUPPORT <= 4) + case 4 : + { + ZBUFFv04_DCtx* dctx = (ZBUFFv04_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv04_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 5) + case 5 : + { + ZBUFFv05_DCtx* dctx = (ZBUFFv05_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv05_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 6) + case 6 : + { + ZBUFFv06_DCtx* dctx = (ZBUFFv06_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; + size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv06_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif +#if (ZSTD_LEGACY_SUPPORT <= 7) + case 7 : + { + ZBUFFv07_DCtx* dctx = (ZBUFFv07_DCtx*) legacyContext; + const void* src = (const char*)input->src + input->pos; + size_t readSize = input->size - input->pos; + void* dst = (char*)output->dst + output->pos; 
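+            /* ZBUFF in/out convention : `decodedSize` and `readSize` are passed in
+             * as available capacities, and come back as the number of bytes actually
+             * written / consumed ; the positions below advance by those amounts. */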
+ size_t decodedSize = output->size - output->pos; + size_t const hintSize = ZBUFFv07_decompressContinue(dctx, dst, &decodedSize, src, &readSize); + output->pos += decodedSize; + input->pos += readSize; + return hintSize; + } +#endif + } +} + + +#if defined (__cplusplus) +} +#endif + +#endif /* ZSTD_LEGACY_H */ +/**** ended inlining ../legacy/zstd_legacy.h ****/ +#endif + + + +/*-******************************************************* +* Types +*********************************************************/ +struct ZSTD_DDict_s { + void* dictBuffer; + const void* dictContent; + size_t dictSize; + ZSTD_entropyDTables_t entropy; + U32 dictID; + U32 entropyPresent; + ZSTD_customMem cMem; +}; /* typedef'd to ZSTD_DDict within "zstd.h" */ + +const void* ZSTD_DDict_dictContent(const ZSTD_DDict* ddict) +{ + assert(ddict != NULL); + return ddict->dictContent; +} + +size_t ZSTD_DDict_dictSize(const ZSTD_DDict* ddict) +{ + assert(ddict != NULL); + return ddict->dictSize; +} + +void ZSTD_copyDDictParameters(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_copyDDictParameters"); + assert(dctx != NULL); + assert(ddict != NULL); + dctx->dictID = ddict->dictID; + dctx->prefixStart = ddict->dictContent; + dctx->virtualStart = ddict->dictContent; + dctx->dictEnd = (const BYTE*)ddict->dictContent + ddict->dictSize; + dctx->previousDstEnd = dctx->dictEnd; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + if (ddict->entropyPresent) { + dctx->litEntropy = 1; + dctx->fseEntropy = 1; + dctx->LLTptr = ddict->entropy.LLTable; + dctx->MLTptr = ddict->entropy.MLTable; + dctx->OFTptr = ddict->entropy.OFTable; + dctx->HUFptr = ddict->entropy.hufTable; + dctx->entropy.rep[0] = ddict->entropy.rep[0]; + dctx->entropy.rep[1] = ddict->entropy.rep[1]; + dctx->entropy.rep[2] = ddict->entropy.rep[2]; + } else { + dctx->litEntropy = 0; + dctx->fseEntropy = 0; + } +} + + +static size_t +ZSTD_loadEntropy_intoDDict(ZSTD_DDict* ddict, + ZSTD_dictContentType_e dictContentType) +{ + ddict->dictID = 0; + ddict->entropyPresent = 0; + if (dictContentType == ZSTD_dct_rawContent) return 0; + + if (ddict->dictSize < 8) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + { U32 const magic = MEM_readLE32(ddict->dictContent); + if (magic != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_fullDict) + return ERROR(dictionary_corrupted); /* only accept specified dictionaries */ + return 0; /* pure content mode */ + } + } + ddict->dictID = MEM_readLE32((const char*)ddict->dictContent + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + RETURN_ERROR_IF(ZSTD_isError(ZSTD_loadDEntropy( + &ddict->entropy, ddict->dictContent, ddict->dictSize)), + dictionary_corrupted, ""); + ddict->entropyPresent = 1; + return 0; +} + + +static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + if ((dictLoadMethod == ZSTD_dlm_byRef) || (!dict) || (!dictSize)) { + ddict->dictBuffer = NULL; + ddict->dictContent = dict; + if (!dict) dictSize = 0; + } else { + void* const internalBuffer = ZSTD_malloc(dictSize, ddict->cMem); + ddict->dictBuffer = internalBuffer; + ddict->dictContent = internalBuffer; + if (!internalBuffer) return ERROR(memory_allocation); + memcpy(internalBuffer, dict, dictSize); 
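+        /* byCopy path : the dictionary bytes now live in ddict->dictBuffer,
+         * so the caller may safely release its own `dict` buffer afterwards. */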
+ } + ddict->dictSize = dictSize; + ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); + + return 0; +} + +ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType, + ZSTD_customMem customMem) +{ + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { ZSTD_DDict* const ddict = (ZSTD_DDict*) ZSTD_malloc(sizeof(ZSTD_DDict), customMem); + if (ddict == NULL) return NULL; + ddict->cMem = customMem; + { size_t const initResult = ZSTD_initDDict_internal(ddict, + dict, dictSize, + dictLoadMethod, dictContentType); + if (ZSTD_isError(initResult)) { + ZSTD_freeDDict(ddict); + return NULL; + } } + return ddict; + } +} + +/*! ZSTD_createDDict() : +* Create a digested dictionary, to start decompression without startup delay. +* `dict` content is copied inside DDict. +* Consequently, `dict` can be released after `ZSTD_DDict` creation */ +ZSTD_DDict* ZSTD_createDDict(const void* dict, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto, allocator); +} + +/*! ZSTD_createDDict_byReference() : + * Create a digested dictionary, to start decompression without startup delay. + * Dictionary content is simply referenced, it will be accessed during decompression. + * Warning : dictBuffer must outlive DDict (DDict must be freed before dictBuffer) */ +ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize) +{ + ZSTD_customMem const allocator = { NULL, NULL, NULL }; + return ZSTD_createDDict_advanced(dictBuffer, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto, allocator); +} + + +const ZSTD_DDict* ZSTD_initStaticDDict( + void* sBuffer, size_t sBufferSize, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + size_t const neededSpace = sizeof(ZSTD_DDict) + + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); + ZSTD_DDict* const ddict = (ZSTD_DDict*)sBuffer; + assert(sBuffer != NULL); + assert(dict != NULL); + if ((size_t)sBuffer & 7) return NULL; /* 8-aligned */ + if (sBufferSize < neededSpace) return NULL; + if (dictLoadMethod == ZSTD_dlm_byCopy) { + memcpy(ddict+1, dict, dictSize); /* local copy */ + dict = ddict+1; + } + if (ZSTD_isError( ZSTD_initDDict_internal(ddict, + dict, dictSize, + ZSTD_dlm_byRef, dictContentType) )) + return NULL; + return ddict; +} + + +size_t ZSTD_freeDDict(ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; /* support free on NULL */ + { ZSTD_customMem const cMem = ddict->cMem; + ZSTD_free(ddict->dictBuffer, cMem); + ZSTD_free(ddict, cMem); + return 0; + } +} + +/*! ZSTD_estimateDDictSize() : + * Estimate amount of memory that will be needed to create a dictionary for decompression. + * Note : dictionary created by reference using ZSTD_dlm_byRef are smaller */ +size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod) +{ + return sizeof(ZSTD_DDict) + (dictLoadMethod == ZSTD_dlm_byRef ? 0 : dictSize); +} + +size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; /* support sizeof on NULL */ + return sizeof(*ddict) + (ddict->dictBuffer ? ddict->dictSize : 0) ; +} + +/*! ZSTD_getDictID_fromDDict() : + * Provides the dictID of the dictionary loaded into `ddict`. 
+ * If @return == 0, the dictionary is not conformant to Zstandard specification, or empty. + * Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */ +unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) +{ + if (ddict==NULL) return 0; + return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); +} +/**** ended inlining decompress/zstd_ddict.c ****/ +/**** start inlining decompress/zstd_decompress.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + + +/* *************************************************************** +* Tuning parameters +*****************************************************************/ +/*! + * HEAPMODE : + * Select how default decompression function ZSTD_decompress() allocates its context, + * on stack (0), or into heap (1, default; requires malloc()). + * Note that functions with explicit context such as ZSTD_decompressDCtx() are unaffected. + */ +#ifndef ZSTD_HEAPMODE +# define ZSTD_HEAPMODE 1 +#endif + +/*! +* LEGACY_SUPPORT : +* if set to 1+, ZSTD_decompress() can decode older formats (v0.1+) +*/ +#ifndef ZSTD_LEGACY_SUPPORT +# define ZSTD_LEGACY_SUPPORT 0 +#endif + +/*! + * MAXWINDOWSIZE_DEFAULT : + * maximum window size accepted by DStream __by default__. + * Frames requiring more memory will be rejected. + * It's possible to set a different limit using ZSTD_DCtx_setMaxWindowSize(). + */ +#ifndef ZSTD_MAXWINDOWSIZE_DEFAULT +# define ZSTD_MAXWINDOWSIZE_DEFAULT (((U32)1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT) + 1) +#endif + +/*! + * NO_FORWARD_PROGRESS_MAX : + * maximum allowed nb of calls to ZSTD_decompressStream() + * without any forward progress + * (defined as: no byte read from input, and no byte flushed to output) + * before triggering an error. + */ +#ifndef ZSTD_NO_FORWARD_PROGRESS_MAX +# define ZSTD_NO_FORWARD_PROGRESS_MAX 16 +#endif + + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include /* memcpy, memmove, memset */ +/**** skipping file: ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: zstd_decompress_internal.h ****/ +/**** skipping file: zstd_ddict.h ****/ +/**** start inlining zstd_decompress_block.h ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. 
+ */ + + +#ifndef ZSTD_DEC_BLOCK_H +#define ZSTD_DEC_BLOCK_H + +/*-******************************************************* + * Dependencies + *********************************************************/ +#include /* size_t */ +/**** skipping file: ../zstd.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: zstd_decompress_internal.h ****/ + + +/* === Prototypes === */ + +/* note: prototypes already published within `zstd.h` : + * ZSTD_decompressBlock() + */ + +/* note: prototypes already published within `zstd_internal.h` : + * ZSTD_getcBlockSize() + * ZSTD_decodeSeqHeaders() + */ + + +/* ZSTD_decompressBlock_internal() : + * decompress block, starting at `src`, + * into destination buffer `dst`. + * @return : decompressed block size, + * or an error code (which can be tested using ZSTD_isError()) + */ +size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const int frame); + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) + * this function must be called with valid parameters only + * (dt is large enough, normalizedCounter distribution total is a power of 2, max is within range, etc.) + * in which case it cannot fail. + * Internal use only. + */ +void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog); + + +#endif /* ZSTD_DEC_BLOCK_H */ +/**** ended inlining zstd_decompress_block.h ****/ + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) +/**** skipping file: ../legacy/zstd_legacy.h ****/ +#endif + + +/*-************************************************************* +* Context management +***************************************************************/ +size_t ZSTD_sizeof_DCtx (const ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support sizeof NULL */ + return sizeof(*dctx) + + ZSTD_sizeof_DDict(dctx->ddictLocal) + + dctx->inBuffSize + dctx->outBuffSize; +} + +size_t ZSTD_estimateDCtxSize(void) { return sizeof(ZSTD_DCtx); } + + +static size_t ZSTD_startingInputLength(ZSTD_format_e format) +{ + size_t const startingInputLength = ZSTD_FRAMEHEADERSIZE_PREFIX(format); + /* only supports formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless */ + assert( (format == ZSTD_f_zstd1) || (format == ZSTD_f_zstd1_magicless) ); + return startingInputLength; +} + +static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +{ + dctx->format = ZSTD_f_zstd1; /* ZSTD_decompressBegin() invokes ZSTD_startingInputLength() with argument dctx->format */ + dctx->staticSize = 0; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + dctx->ddict = NULL; + dctx->ddictLocal = NULL; + dctx->dictEnd = NULL; + dctx->ddictIsCold = 0; + dctx->dictUses = ZSTD_dont_use; + dctx->inBuff = NULL; + dctx->inBuffSize = 0; + dctx->outBuffSize = 0; + dctx->streamStage = zdss_init; + dctx->legacyContext = NULL; + dctx->previousLegacyVersion = 0; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; + dctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + dctx->outBufferMode = ZSTD_obm_buffered; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentEndForFuzzing = NULL; +#endif +} + +ZSTD_DCtx* ZSTD_initStaticDCtx(void *workspace, size_t workspaceSize) +{ + ZSTD_DCtx* const dctx = (ZSTD_DCtx*) workspace; + + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + if (workspaceSize < sizeof(ZSTD_DCtx)) return NULL; /* minimum size */ + + 
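+    /* The DCtx structure occupies the start of the workspace ; the bytes left
+     * after it are handed to the context below as its input buffer. */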
ZSTD_initDCtx_internal(dctx); + dctx->staticSize = workspaceSize; + dctx->inBuff = (char*)(dctx+1); + return dctx; +} + +ZSTD_DCtx* ZSTD_createDCtx_advanced(ZSTD_customMem customMem) +{ + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + { ZSTD_DCtx* const dctx = (ZSTD_DCtx*)ZSTD_malloc(sizeof(*dctx), customMem); + if (!dctx) return NULL; + dctx->customMem = customMem; + ZSTD_initDCtx_internal(dctx); + return dctx; + } +} + +ZSTD_DCtx* ZSTD_createDCtx(void) +{ + DEBUGLOG(3, "ZSTD_createDCtx"); + return ZSTD_createDCtx_advanced(ZSTD_defaultCMem); +} + +static void ZSTD_clearDict(ZSTD_DCtx* dctx) +{ + ZSTD_freeDDict(dctx->ddictLocal); + dctx->ddictLocal = NULL; + dctx->ddict = NULL; + dctx->dictUses = ZSTD_dont_use; +} + +size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx) +{ + if (dctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, "not compatible with static DCtx"); + { ZSTD_customMem const cMem = dctx->customMem; + ZSTD_clearDict(dctx); + ZSTD_free(dctx->inBuff, cMem); + dctx->inBuff = NULL; +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (dctx->legacyContext) + ZSTD_freeLegacyStreamContext(dctx->legacyContext, dctx->previousLegacyVersion); +#endif + ZSTD_free(dctx, cMem); + return 0; + } +} + +/* no longer useful */ +void ZSTD_copyDCtx(ZSTD_DCtx* dstDCtx, const ZSTD_DCtx* srcDCtx) +{ + size_t const toCopy = (size_t)((char*)(&dstDCtx->inBuff) - (char*)dstDCtx); + memcpy(dstDCtx, srcDCtx, toCopy); /* no need to copy workspace */ +} + + +/*-************************************************************* + * Frame header decoding + ***************************************************************/ + +/*! ZSTD_isFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier. + * Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0. + * Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled. + * Note 3 : Skippable Frame Identifiers are considered valid. */ +unsigned ZSTD_isFrame(const void* buffer, size_t size) +{ + if (size < ZSTD_FRAMEIDSIZE) return 0; + { U32 const magic = MEM_readLE32(buffer); + if (magic == ZSTD_MAGICNUMBER) return 1; + if ((magic & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) return 1; + } +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(buffer, size)) return 1; +#endif + return 0; +} + +/** ZSTD_frameHeaderSize_internal() : + * srcSize must be large enough to reach header size fields. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless. + * @return : size of the Frame Header + * or an error code, which can be tested with ZSTD_isError() */ +static size_t ZSTD_frameHeaderSize_internal(const void* src, size_t srcSize, ZSTD_format_e format) +{ + size_t const minInputSize = ZSTD_startingInputLength(format); + RETURN_ERROR_IF(srcSize < minInputSize, srcSize_wrong, ""); + + { BYTE const fhd = ((const BYTE*)src)[minInputSize-1]; + U32 const dictID= fhd & 3; + U32 const singleSegment = (fhd >> 5) & 1; + U32 const fcsId = fhd >> 6; + return minInputSize + !singleSegment + + ZSTD_did_fieldSize[dictID] + ZSTD_fcs_fieldSize[fcsId] + + (singleSegment && !fcsId); + } +} + +/** ZSTD_frameHeaderSize() : + * srcSize must be >= ZSTD_frameHeaderSize_prefix. 
+ * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ +size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) +{ + return ZSTD_frameHeaderSize_internal(src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameHeader_advanced() : + * decode Frame Header, or require larger `srcSize`. + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) +{ + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + + memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers do not understand that zfhPtr is only going to be read if the return value is zero, since those are 2 different signals */ + if (srcSize < minInputSize) return minInputSize; + RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); + + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + /* skippable frame */ + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + memset(zfhPtr, 0, sizeof(*zfhPtr)); + zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; + return 0; + } + RETURN_ERROR(prefix_unknown, ""); + } + + /* ensure there is enough `srcSize` to fully read/decode frame header */ + { size_t const fhsize = ZSTD_frameHeaderSize_internal(src, srcSize, format); + if (srcSize < fhsize) return fhsize; + zfhPtr->headerSize = (U32)fhsize; + } + + { BYTE const fhdByte = ip[minInputSize-1]; + size_t pos = minInputSize; + U32 const dictIDSizeCode = fhdByte&3; + U32 const checksumFlag = (fhdByte>>2)&1; + U32 const singleSegment = (fhdByte>>5)&1; + U32 const fcsID = fhdByte>>6; + U64 windowSize = 0; + U32 dictID = 0; + U64 frameContentSize = ZSTD_CONTENTSIZE_UNKNOWN; + RETURN_ERROR_IF((fhdByte & 0x08) != 0, frameParameter_unsupported, + "reserved bits, must be zero"); + + if (!singleSegment) { + BYTE const wlByte = ip[pos++]; + U32 const windowLog = (wlByte >> 3) + ZSTD_WINDOWLOG_ABSOLUTEMIN; + RETURN_ERROR_IF(windowLog > ZSTD_WINDOWLOG_MAX, frameParameter_windowTooLarge, ""); + windowSize = (1ULL << windowLog); + windowSize += (windowSize >> 3) * (wlByte&7); + } + switch(dictIDSizeCode) + { + default: assert(0); /* impossible */ + case 0 : break; + case 1 : dictID = ip[pos]; pos++; break; + case 2 : dictID = MEM_readLE16(ip+pos); pos+=2; break; + case 3 : dictID = MEM_readLE32(ip+pos); pos+=4; break; + } + switch(fcsID) + { + default: assert(0); /* impossible */ + case 0 : if (singleSegment) frameContentSize = ip[pos]; break; + case 1 : frameContentSize = MEM_readLE16(ip+pos)+256; break; + case 2 : frameContentSize = MEM_readLE32(ip+pos); break; + case 3 : frameContentSize = MEM_readLE64(ip+pos); break; + } + if (singleSegment) windowSize = frameContentSize; + + zfhPtr->frameType = ZSTD_frame; + zfhPtr->frameContentSize = frameContentSize; + zfhPtr->windowSize = windowSize; + zfhPtr->blockSizeMax = (unsigned) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + zfhPtr->dictID = dictID; + zfhPtr->checksumFlag = checksumFlag; + } + return 0; +} + +/** ZSTD_getFrameHeader() : + * decode Frame 
Header, or require larger `srcSize`. + * note : this function does not consume input, it only reads it. + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) +{ + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); +} + + +/** ZSTD_getFrameContentSize() : + * compatible with legacy mode + * @return : decompressed size of the single frame pointed to by `src` if known, otherwise + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) */ +unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) +{ +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + unsigned long long const ret = ZSTD_getDecompressedSize_legacy(src, srcSize); + return ret == 0 ? ZSTD_CONTENTSIZE_UNKNOWN : ret; + } +#endif + { ZSTD_frameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { + return 0; + } else { + return zfh.frameContentSize; + } } +} + +static size_t readSkippableFrameSize(void const* src, size_t srcSize) +{ + size_t const skippableHeaderSize = ZSTD_SKIPPABLEHEADERSIZE; + U32 sizeU32; + + RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); + { + size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } +} + +/** ZSTD_findDecompressedSize() : + * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames + * @return : decompressed size of the frames contained */ +unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + if (ZSTD_isError(skippableSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } + + { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; + + /* check for overflow */ + if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; + totalDstSize += ret; + } + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); + if (ZSTD_isError(frameSrcSize)) { + return ZSTD_CONTENTSIZE_ERROR; + } + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; + } + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + if (srcSize) return ZSTD_CONTENTSIZE_ERROR; + + return totalDstSize; +} + +/** ZSTD_getDecompressedSize() : + * compatible with legacy mode + * @return : decompressed size if known, 0 otherwise + note : 0 can mean any of the following : + - frame content is empty + - decompressed size field is not present in frame 
header + - frame header unknown / not supported + - frame header not complete (`srcSize` too small) */ +unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize) +{ + unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); + ZSTD_STATIC_ASSERT(ZSTD_CONTENTSIZE_ERROR < ZSTD_CONTENTSIZE_UNKNOWN); + return (ret >= ZSTD_CONTENTSIZE_ERROR) ? 0 : ret; +} + + +/** ZSTD_decodeFrameHeader() : + * `headerSize` must be the size provided by ZSTD_frameHeaderSize(). + * @return : 0 if success, or an error code, which can be tested using ZSTD_isError() */ +static size_t ZSTD_decodeFrameHeader(ZSTD_DCtx* dctx, const void* src, size_t headerSize) +{ + size_t const result = ZSTD_getFrameHeader_advanced(&(dctx->fParams), src, headerSize, dctx->format); + if (ZSTD_isError(result)) return result; /* invalid header */ + RETURN_ERROR_IF(result>0, srcSize_wrong, "headerSize too small"); +#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + /* Skip the dictID check in fuzzing mode, because it makes the search + * harder. + */ + RETURN_ERROR_IF(dctx->fParams.dictID && (dctx->dictID != dctx->fParams.dictID), + dictionary_wrong, ""); +#endif + if (dctx->fParams.checksumFlag) XXH64_reset(&dctx->xxhState, 0); + return 0; +} + +static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) +{ + ZSTD_frameSizeInfo frameSizeInfo; + frameSizeInfo.compressedSize = ret; + frameSizeInfo.decompressedBound = ZSTD_CONTENTSIZE_ERROR; + return frameSizeInfo; +} + +static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) +{ + ZSTD_frameSizeInfo frameSizeInfo; + memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) + return ZSTD_findFrameSizeInfoLegacy(src, srcSize); +#endif + + if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || + frameSizeInfo.compressedSize <= srcSize); + return frameSizeInfo; + } else { + const BYTE* ip = (const BYTE*)src; + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ + { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + } + + ip += zfh.headerSize; + remainingSize -= zfh.headerSize; + + /* Iterate over each block */ + while (1) { + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) + return ZSTD_errorFrameSizeInfo(cBlockSize); + + if (ZSTD_blockHeaderSize + cBlockSize > remainingSize) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + + ip += ZSTD_blockHeaderSize + cBlockSize; + remainingSize -= ZSTD_blockHeaderSize + cBlockSize; + nbBlocks++; + + if (blockProperties.lastBlock) break; + } + + /* Final frame content checksum */ + if (zfh.checksumFlag) { + if (remainingSize < 4) + return ZSTD_errorFrameSizeInfo(ERROR(srcSize_wrong)); + ip += 4; + } + + frameSizeInfo.compressedSize = ip - ipstart; + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize + : nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } +} + +/** ZSTD_findFrameCompressedSize() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the compressed size of the frame starting at `src` */ +size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) +{ + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + return frameSizeInfo.compressedSize; +} + +/** ZSTD_decompressBound() : + * compatible with legacy mode + * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) +{ + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { + ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) + return ZSTD_CONTENTSIZE_ERROR; + assert(srcSize >= compressedSize); + src = (const BYTE*)src + compressedSize; + srcSize -= compressedSize; + bound += decompressedBound; + } + return bound; +} + + +/*-************************************************************* + * Frame decoding + ***************************************************************/ + +/** ZSTD_insertBlock() : + * insert `src` block into `dctx` history. Useful to track uncompressed blocks. */ +size_t ZSTD_insertBlock(ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize) +{ + DEBUGLOG(5, "ZSTD_insertBlock: %u bytes", (unsigned)blockSize); + ZSTD_checkContinuity(dctx, blockStart); + dctx->previousDstEnd = (const char*)blockStart + blockSize; + return blockSize; +} + + +static size_t ZSTD_copyRawBlock(void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_copyRawBlock"); + if (dst == NULL) { + if (srcSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } + RETURN_ERROR_IF(srcSize > dstCapacity, dstSize_tooSmall, ""); + memcpy(dst, src, srcSize); + return srcSize; +} + +static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + BYTE b, + size_t regenSize) +{ + if (dst == NULL) { + if (regenSize == 0) return 0; + RETURN_ERROR(dstBuffer_null, ""); + } + RETURN_ERROR_IF(regenSize > dstCapacity, dstSize_tooSmall, ""); + memset(dst, b, regenSize); + return regenSize; +} + + +/*! ZSTD_decompressFrame() : + * @dctx must be properly initialized + * will update *srcPtr and *srcSizePtr, + * to make *srcPtr progress by one frame. */ +static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void** srcPtr, size_t *srcSizePtr) +{ + const BYTE* ip = (const BYTE*)(*srcPtr); + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = dstCapacity != 0 ? 
ostart + dstCapacity : ostart; + BYTE* op = ostart; + size_t remainingSrcSize = *srcSizePtr; + + DEBUGLOG(4, "ZSTD_decompressFrame (srcSize:%i)", (int)*srcSizePtr); + + /* check */ + RETURN_ERROR_IF( + remainingSrcSize < ZSTD_FRAMEHEADERSIZE_MIN(dctx->format)+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + + /* Frame Header */ + { size_t const frameHeaderSize = ZSTD_frameHeaderSize_internal( + ip, ZSTD_FRAMEHEADERSIZE_PREFIX(dctx->format), dctx->format); + if (ZSTD_isError(frameHeaderSize)) return frameHeaderSize; + RETURN_ERROR_IF(remainingSrcSize < frameHeaderSize+ZSTD_blockHeaderSize, + srcSize_wrong, ""); + FORWARD_IF_ERROR( ZSTD_decodeFrameHeader(dctx, ip, frameHeaderSize) , ""); + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + + /* Loop on each block */ + while (1) { + size_t decodedSize; + blockProperties_t blockProperties; + size_t const cBlockSize = ZSTD_getcBlockSize(ip, remainingSrcSize, &blockProperties); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + + ip += ZSTD_blockHeaderSize; + remainingSrcSize -= ZSTD_blockHeaderSize; + RETURN_ERROR_IF(cBlockSize > remainingSrcSize, srcSize_wrong, ""); + + switch(blockProperties.blockType) + { + case bt_compressed: + decodedSize = ZSTD_decompressBlock_internal(dctx, op, oend-op, ip, cBlockSize, /* frame */ 1); + break; + case bt_raw : + decodedSize = ZSTD_copyRawBlock(op, oend-op, ip, cBlockSize); + break; + case bt_rle : + decodedSize = ZSTD_setRleBlock(op, oend-op, *ip, blockProperties.origSize); + break; + case bt_reserved : + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + + if (ZSTD_isError(decodedSize)) return decodedSize; + if (dctx->fParams.checksumFlag) + XXH64_update(&dctx->xxhState, op, decodedSize); + if (decodedSize != 0) + op += decodedSize; + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; + if (blockProperties.lastBlock) break; + } + + if (dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) { + RETURN_ERROR_IF((U64)(op-ostart) != dctx->fParams.frameContentSize, + corruption_detected, ""); + } + if (dctx->fParams.checksumFlag) { /* Frame content checksum verification */ + U32 const checkCalc = (U32)XXH64_digest(&dctx->xxhState); + U32 checkRead; + RETURN_ERROR_IF(remainingSrcSize<4, checksum_wrong, ""); + checkRead = MEM_readLE32(ip); + RETURN_ERROR_IF(checkRead != checkCalc, checksum_wrong, ""); + ip += 4; + remainingSrcSize -= 4; + } + + /* Allow caller to get size read */ + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return op-ostart; +} + +static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, + const ZSTD_DDict* ddict) +{ + void* const dststart = dst; + int moreThan1Frame = 0; + + DEBUGLOG(5, "ZSTD_decompressMultiFrame"); + assert(dict==NULL || ddict==NULL); /* either dict or ddict set, not both */ + + if (ddict) { + dict = ZSTD_DDict_dictContent(ddict); + dictSize = ZSTD_DDict_dictSize(ddict); + } + + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT >= 1) + if (ZSTD_isLegacy(src, srcSize)) { + size_t decodedSize; + size_t const frameSize = ZSTD_findFrameCompressedSizeLegacy(src, srcSize); + if (ZSTD_isError(frameSize)) return frameSize; + RETURN_ERROR_IF(dctx->staticSize, memory_allocation, + "legacy support is not compatible with static dctx"); + + decodedSize = ZSTD_decompressLegacy(dst, dstCapacity, src, frameSize, dict, dictSize); + if 
(ZSTD_isError(decodedSize)) return decodedSize; + + assert(decodedSize <= dstCapacity); + dst = (BYTE*)dst + decodedSize; + dstCapacity -= decodedSize; + + src = (const BYTE*)src + frameSize; + srcSize -= frameSize; + + continue; + } +#endif + + { U32 const magicNumber = MEM_readLE32(src); + DEBUGLOG(4, "reading magic number %08X (expecting %08X)", + (unsigned)magicNumber, ZSTD_MAGICNUMBER); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); + FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; + continue; + } } + + if (ddict) { + /* we were called from ZSTD_decompress_usingDDict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(dctx, ddict), ""); + } else { + /* this will initialize correctly with no dict if dict == NULL, so + * use this in all cases but ddict */ + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDict(dctx, dict, dictSize), ""); + } + ZSTD_checkContinuity(dctx, dst); + + { const size_t res = ZSTD_decompressFrame(dctx, dst, dstCapacity, + &src, &srcSize); + RETURN_ERROR_IF( + (ZSTD_getErrorCode(res) == ZSTD_error_prefix_unknown) + && (moreThan1Frame==1), + srcSize_wrong, + "at least one frame successfully completed, but following " + "bytes are garbage: it's more likely to be a srcSize error, " + "specifying more bytes than compressed size of frame(s). This " + "error message replaces ERROR(prefix_unknown), which would be " + "confusing, as the first header is actually correct. Note that " + "one could be unlucky, it might be a corruption error instead, " + "happening right at the place where we expect zstd magic " + "bytes. 
But this is _much_ less likely than a srcSize field " + "error."); + if (ZSTD_isError(res)) return res; + assert(res <= dstCapacity); + if (res != 0) + dst = (BYTE*)dst + res; + dstCapacity -= res; + } + moreThan1Frame = 1; + } /* while (srcSize >= ZSTD_frameHeaderSize_prefix) */ + + RETURN_ERROR_IF(srcSize, srcSize_wrong, "input not entirely consumed"); + + return (BYTE*)dst - (BYTE*)dststart; +} + +size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize) +{ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, dict, dictSize, NULL); +} + + +static ZSTD_DDict const* ZSTD_getDDict(ZSTD_DCtx* dctx) +{ + switch (dctx->dictUses) { + default: + assert(0 /* Impossible */); + /* fall-through */ + case ZSTD_dont_use: + ZSTD_clearDict(dctx); + return NULL; + case ZSTD_use_indefinitely: + return dctx->ddict; + case ZSTD_use_once: + dctx->dictUses = ZSTD_dont_use; + return dctx->ddict; + } +} + +size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + return ZSTD_decompress_usingDDict(dctx, dst, dstCapacity, src, srcSize, ZSTD_getDDict(dctx)); +} + + +size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ +#if defined(ZSTD_HEAPMODE) && (ZSTD_HEAPMODE>=1) + size_t regenSize; + ZSTD_DCtx* const dctx = ZSTD_createDCtx(); + RETURN_ERROR_IF(dctx==NULL, memory_allocation, "NULL pointer!"); + regenSize = ZSTD_decompressDCtx(dctx, dst, dstCapacity, src, srcSize); + ZSTD_freeDCtx(dctx); + return regenSize; +#else /* stack mode */ + ZSTD_DCtx dctx; + ZSTD_initDCtx_internal(&dctx); + return ZSTD_decompressDCtx(&dctx, dst, dstCapacity, src, srcSize); +#endif +} + + +/*-************************************** +* Advanced Streaming Decompression API +* Bufferless and synchronous +****************************************/ +size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + +/** + * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, + * we allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce + * output, and avoid copying the input. + * + * @param inputSize - The total amount of input that the caller currently has. 
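+ * @return - the next input size to provide : when a raw block is being streamed, + * anywhere between 1 byte and the remainder of the block, + * i.e. MIN(MAX(inputSize, 1), dctx->expected) ; otherwise dctx->expected.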
+ */ +static size_t ZSTD_nextSrcSizeToDecompressWithInputSize(ZSTD_DCtx* dctx, size_t inputSize) { + if (!(dctx->stage == ZSTDds_decompressBlock || dctx->stage == ZSTDds_decompressLastBlock)) + return dctx->expected; + if (dctx->bType != bt_raw) + return dctx->expected; + return MIN(MAX(inputSize, 1), dctx->expected); +} + +ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx) { + switch(dctx->stage) + { + default: /* should not happen */ + assert(0); + case ZSTDds_getFrameHeaderSize: + case ZSTDds_decodeFrameHeader: + return ZSTDnit_frameHeader; + case ZSTDds_decodeBlockHeader: + return ZSTDnit_blockHeader; + case ZSTDds_decompressBlock: + return ZSTDnit_block; + case ZSTDds_decompressLastBlock: + return ZSTDnit_lastBlock; + case ZSTDds_checkChecksum: + return ZSTDnit_checksum; + case ZSTDds_decodeSkippableHeader: + case ZSTDds_skipFrame: + return ZSTDnit_skippableFrame; + } +} + +static int ZSTD_isSkipFrame(ZSTD_DCtx* dctx) { return dctx->stage == ZSTDds_skipFrame; } + +/** ZSTD_decompressContinue() : + * srcSize : must be the exact nb of bytes expected (see ZSTD_nextSrcSizeToDecompress()) + * @return : nb of bytes generated into `dst` (necessarily <= `dstCapacity) + * or an error code, which can be tested using ZSTD_isError() */ +size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) +{ + DEBUGLOG(5, "ZSTD_decompressContinue (srcSize:%u)", (unsigned)srcSize); + /* Sanity check */ + RETURN_ERROR_IF(srcSize != ZSTD_nextSrcSizeToDecompressWithInputSize(dctx, srcSize), srcSize_wrong, "not allowed"); + if (dstCapacity) ZSTD_checkContinuity(dctx, dst); + + switch (dctx->stage) + { + case ZSTDds_getFrameHeaderSize : + assert(src != NULL); + if (dctx->format == ZSTD_f_zstd1) { /* allows header */ + assert(srcSize >= ZSTD_FRAMEIDSIZE); /* to read skippable magic number */ + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = ZSTD_SKIPPABLEHEADERSIZE - srcSize; /* remaining to load to get full skippable frame header */ + dctx->stage = ZSTDds_decodeSkippableHeader; + return 0; + } } + dctx->headerSize = ZSTD_frameHeaderSize_internal(src, srcSize, dctx->format); + if (ZSTD_isError(dctx->headerSize)) return dctx->headerSize; + memcpy(dctx->headerBuffer, src, srcSize); + dctx->expected = dctx->headerSize - srcSize; + dctx->stage = ZSTDds_decodeFrameHeader; + return 0; + + case ZSTDds_decodeFrameHeader: + assert(src != NULL); + memcpy(dctx->headerBuffer + (dctx->headerSize - srcSize), src, srcSize); + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(dctx, dctx->headerBuffer, dctx->headerSize), ""); + dctx->expected = ZSTD_blockHeaderSize; + dctx->stage = ZSTDds_decodeBlockHeader; + return 0; + + case ZSTDds_decodeBlockHeader: + { blockProperties_t bp; + size_t const cBlockSize = ZSTD_getcBlockSize(src, ZSTD_blockHeaderSize, &bp); + if (ZSTD_isError(cBlockSize)) return cBlockSize; + RETURN_ERROR_IF(cBlockSize > dctx->fParams.blockSizeMax, corruption_detected, "Block Size Exceeds Maximum"); + dctx->expected = cBlockSize; + dctx->bType = bp.blockType; + dctx->rleSize = bp.origSize; + if (cBlockSize) { + dctx->stage = bp.lastBlock ? 
ZSTDds_decompressLastBlock : ZSTDds_decompressBlock; + return 0; + } + /* empty block */ + if (bp.lastBlock) { + if (dctx->fParams.checksumFlag) { + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* end of frame */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->expected = ZSTD_blockHeaderSize; /* jump to next header */ + dctx->stage = ZSTDds_decodeBlockHeader; + } + return 0; + } + + case ZSTDds_decompressLastBlock: + case ZSTDds_decompressBlock: + DEBUGLOG(5, "ZSTD_decompressContinue: case ZSTDds_decompressBlock"); + { size_t rSize; + switch(dctx->bType) + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); + rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : + assert(srcSize <= dctx->expected); + rSize = ZSTD_copyRawBlock(dst, dstCapacity, src, srcSize); + FORWARD_IF_ERROR(rSize, "ZSTD_copyRawBlock failed"); + assert(rSize == srcSize); + dctx->expected -= rSize; + break; + case bt_rle : + rSize = ZSTD_setRleBlock(dst, dstCapacity, *(const BYTE*)src, dctx->rleSize); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_reserved : /* should never happen */ + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } + FORWARD_IF_ERROR(rSize, ""); + RETURN_ERROR_IF(rSize > dctx->fParams.blockSizeMax, corruption_detected, "Decompressed Block Size Exceeds Maximum"); + DEBUGLOG(5, "ZSTD_decompressContinue: decoded size from block : %u", (unsigned)rSize); + dctx->decodedSize += rSize; + if (dctx->fParams.checksumFlag) XXH64_update(&dctx->xxhState, dst, rSize); + dctx->previousDstEnd = (char*)dst + rSize; + + /* Stay on the same stage until we are finished streaming the block. 
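Only the bt_raw case can leave dctx->expected > 0 at this point : the bt_compressed and bt_rle cases above set dctx->expected = 0, as they do not support streaming.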
*/ + if (dctx->expected > 0) { + return rSize; + } + + if (dctx->stage == ZSTDds_decompressLastBlock) { /* end of frame */ + DEBUGLOG(4, "ZSTD_decompressContinue: decoded size from frame : %u", (unsigned)dctx->decodedSize); + RETURN_ERROR_IF( + dctx->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && dctx->decodedSize != dctx->fParams.frameContentSize, + corruption_detected, ""); + if (dctx->fParams.checksumFlag) { /* another round for frame checksum */ + dctx->expected = 4; + dctx->stage = ZSTDds_checkChecksum; + } else { + dctx->expected = 0; /* ends here */ + dctx->stage = ZSTDds_getFrameHeaderSize; + } + } else { + dctx->stage = ZSTDds_decodeBlockHeader; + dctx->expected = ZSTD_blockHeaderSize; + } + return rSize; + } + + case ZSTDds_checkChecksum: + assert(srcSize == 4); /* guaranteed by dctx->expected */ + { U32 const h32 = (U32)XXH64_digest(&dctx->xxhState); + U32 const check32 = MEM_readLE32(src); + DEBUGLOG(4, "ZSTD_decompressContinue: checksum : calculated %08X :: %08X read", (unsigned)h32, (unsigned)check32); + RETURN_ERROR_IF(check32 != h32, checksum_wrong, ""); + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + } + + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); + memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; + return 0; + + case ZSTDds_skipFrame: + dctx->expected = 0; + dctx->stage = ZSTDds_getFrameHeaderSize; + return 0; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ + } +} + + +static size_t ZSTD_refDictContent(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dict - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dict; + dctx->previousDstEnd = (const char*)dict + dictSize; +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION + dctx->dictContentBeginForFuzzing = dctx->prefixStart; + dctx->dictContentEndForFuzzing = dctx->previousDstEnd; +#endif + return 0; +} + +/*! ZSTD_loadDEntropy() : + * dict : must point at beginning of a valid zstd dictionary. 
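+ * Layout handled below : 4-byte magic + 4-byte dictID header, then the + * Huffman literals table, the OF/ML/LL FSE tables, 3 repcode values (12 bytes), + * and finally the dictionary content.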
+ * @return : size of entropy tables read */ +size_t +ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + const void* const dict, size_t const dictSize) +{ + const BYTE* dictPtr = (const BYTE*)dict; + const BYTE* const dictEnd = dictPtr + dictSize; + + RETURN_ERROR_IF(dictSize <= 8, dictionary_corrupted, "dict is too small"); + assert(MEM_readLE32(dict) == ZSTD_MAGIC_DICTIONARY); /* dict must be valid */ + dictPtr += 8; /* skip header = magic + dictID */ + + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, OFTable) == offsetof(ZSTD_entropyDTables_t, LLTable) + sizeof(entropy->LLTable)); + ZSTD_STATIC_ASSERT(offsetof(ZSTD_entropyDTables_t, MLTable) == offsetof(ZSTD_entropyDTables_t, OFTable) + sizeof(entropy->OFTable)); + ZSTD_STATIC_ASSERT(sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable) >= HUF_DECOMPRESS_WORKSPACE_SIZE); + { void* const workspace = &entropy->LLTable; /* use fse tables as temporary workspace; implies fse tables are grouped together */ + size_t const workspaceSize = sizeof(entropy->LLTable) + sizeof(entropy->OFTable) + sizeof(entropy->MLTable); +#ifdef HUF_FORCE_DECOMPRESS_X1 + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize); +#else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, + workspace, workspaceSize); +#endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; + } + + { short offcodeNCount[MaxOff+1]; + unsigned offcodeMaxValue = MaxOff, offcodeLog; + size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeMaxValue > MaxOff, dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->OFTable, + offcodeNCount, offcodeMaxValue, + OF_base, OF_bits, + offcodeLog); + dictPtr += offcodeHeaderSize; + } + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; + size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthMaxValue > MaxML, dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->MLTable, + matchlengthNCount, matchlengthMaxValue, + ML_base, ML_bits, + matchlengthLog); + dictPtr += matchlengthHeaderSize; + } + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; + size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthMaxValue > MaxLL, dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + ZSTD_buildFSETable( entropy->LLTable, + litlengthNCount, litlengthMaxValue, + LL_base, LL_bits, + litlengthLog); + dictPtr += litlengthHeaderSize; + } + + RETURN_ERROR_IF(dictPtr+12 > dictEnd, dictionary_corrupted, ""); + { int i; + size_t const dictContentSize = (size_t)(dictEnd - (dictPtr+12)); + for (i=0; i<3; i++) { + U32 const rep = MEM_readLE32(dictPtr); dictPtr 
+= 4; + RETURN_ERROR_IF(rep==0 || rep > dictContentSize, + dictionary_corrupted, ""); + entropy->rep[i] = rep; + } } + + return dictPtr - (const BYTE*)dict; +} + +static size_t ZSTD_decompress_insertDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + if (dictSize < 8) return ZSTD_refDictContent(dctx, dict, dictSize); + { U32 const magic = MEM_readLE32(dict); + if (magic != ZSTD_MAGIC_DICTIONARY) { + return ZSTD_refDictContent(dctx, dict, dictSize); /* pure content mode */ + } } + dctx->dictID = MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); + + /* load entropy tables */ + { size_t const eSize = ZSTD_loadDEntropy(&dctx->entropy, dict, dictSize); + RETURN_ERROR_IF(ZSTD_isError(eSize), dictionary_corrupted, ""); + dict = (const char*)dict + eSize; + dictSize -= eSize; + } + dctx->litEntropy = dctx->fseEntropy = 1; + + /* reference dictionary content */ + return ZSTD_refDictContent(dctx, dict, dictSize); +} + +size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) +{ + assert(dctx != NULL); + dctx->expected = ZSTD_startingInputLength(dctx->format); /* dctx->format must be properly set */ + dctx->stage = ZSTDds_getFrameHeaderSize; + dctx->decodedSize = 0; + dctx->previousDstEnd = NULL; + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; + dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; + dctx->MLTptr = dctx->entropy.MLTable; + dctx->OFTptr = dctx->entropy.OFTable; + dctx->HUFptr = dctx->entropy.hufTable; + return 0; +} + +size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (dict && dictSize) + RETURN_ERROR_IF( + ZSTD_isError(ZSTD_decompress_insertDictionary(dctx, dict, dictSize)), + dictionary_corrupted, ""); + return 0; +} + + +/* ====== ZSTD_DDict ====== */ + +size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + DEBUGLOG(4, "ZSTD_decompressBegin_usingDDict"); + assert(dctx != NULL); + if (ddict) { + const char* const dictStart = (const char*)ZSTD_DDict_dictContent(ddict); + size_t const dictSize = ZSTD_DDict_dictSize(ddict); + const void* const dictEnd = dictStart + dictSize; + dctx->ddictIsCold = (dctx->dictEnd != dictEnd); + DEBUGLOG(4, "DDict is %s", + dctx->ddictIsCold ? "~cold~" : "hot!"); + } + FORWARD_IF_ERROR( ZSTD_decompressBegin(dctx) , ""); + if (ddict) { /* NULL ddict is equivalent to no dictionary */ + ZSTD_copyDDictParameters(dctx, ddict); + } + return 0; +} + +/*! ZSTD_getDictID_fromDict() : + * Provides the dictID stored within dictionary. + * if @return == 0, the dictionary is not conformant with Zstandard specification. + * It can still be loaded, but as a content-only dictionary. */ +unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) +{ + if (dictSize < 8) return 0; + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) return 0; + return MEM_readLE32((const char*)dict + ZSTD_FRAMEIDSIZE); +} + +/*! ZSTD_getDictID_fromFrame() : + * Provides the dictID required to decompress frame stored within `src`. + * If @return == 0, the dictID could not be decoded. 
+ * This could be for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. + * The needed dictionary is hidden information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. + * - This is not a Zstandard frame. + * When identifying the exact failure cause, it's possible to use + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ +unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) +{ + ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +} + + +/*! ZSTD_decompress_usingDDict() : +* Decompression using a pre-digested Dictionary +* Use dictionary without significant overhead. */ +size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const ZSTD_DDict* ddict) +{ + /* pass content and size in case legacy frames are encountered */ + return ZSTD_decompressMultiFrame(dctx, dst, dstCapacity, src, srcSize, + NULL, 0, + ddict); +} + + +/*===================================== +* Streaming decompression +*====================================*/ + +ZSTD_DStream* ZSTD_createDStream(void) +{ + DEBUGLOG(3, "ZSTD_createDStream"); + return ZSTD_createDStream_advanced(ZSTD_defaultCMem); +} + +ZSTD_DStream* ZSTD_initStaticDStream(void *workspace, size_t workspaceSize) +{ + return ZSTD_initStaticDCtx(workspace, workspaceSize); +} + +ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem) +{ + return ZSTD_createDCtx_advanced(customMem); +} + +size_t ZSTD_freeDStream(ZSTD_DStream* zds) +{ + return ZSTD_freeDCtx(zds); +} + + +/* *** Initialization *** */ + +size_t ZSTD_DStreamInSize(void) { return ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize; } +size_t ZSTD_DStreamOutSize(void) { return ZSTD_BLOCKSIZE_MAX; } + +size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, + const void* dict, size_t dictSize, + ZSTD_dictLoadMethod_e dictLoadMethod, + ZSTD_dictContentType_e dictContentType) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (dict && dictSize != 0) { + dctx->ddictLocal = ZSTD_createDDict_advanced(dict, dictSize, dictLoadMethod, dictContentType, dctx->customMem); + RETURN_ERROR_IF(dctx->ddictLocal == NULL, memory_allocation, "NULL pointer!"); + dctx->ddict = dctx->ddictLocal; + dctx->dictUses = ZSTD_use_indefinitely; + } + return 0; +} + +size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byRef, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize) +{ + return ZSTD_DCtx_loadDictionary_advanced(dctx, dict, dictSize, ZSTD_dlm_byCopy, ZSTD_dct_auto); +} + +size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_loadDictionary_advanced(dctx, prefix, prefixSize, ZSTD_dlm_byRef, dictContentType), ""); + dctx->dictUses = ZSTD_use_once; + return 0; +} + +size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize) +{ + return ZSTD_DCtx_refPrefix_advanced(dctx, 
prefix, prefixSize, ZSTD_dct_rawContent); +} + + +/* ZSTD_initDStream_usingDict() : + * return : expected size, aka ZSTD_startingInputLength(). + * this function cannot fail */ +size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize) +{ + DEBUGLOG(4, "ZSTD_initDStream_usingDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(zds, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_loadDictionary(zds, dict, dictSize) , ""); + return ZSTD_startingInputLength(zds->format); +} + +/* note : this variant can't fail */ +size_t ZSTD_initDStream(ZSTD_DStream* zds) +{ + DEBUGLOG(4, "ZSTD_initDStream"); + return ZSTD_initDStream_usingDDict(zds, NULL); +} + +/* ZSTD_initDStream_usingDDict() : + * ddict will just be referenced, and must outlive decompression session + * this function cannot fail */ +size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) +{ + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +} + +/* ZSTD_resetDStream() : + * return : expected size, aka ZSTD_startingInputLength(). + * this function cannot fail */ +size_t ZSTD_resetDStream(ZSTD_DStream* dctx) +{ + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); +} + + +size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + if (ddict) { + dctx->ddict = ddict; + dctx->dictUses = ZSTD_use_indefinitely; + } + return 0; +} + +/* ZSTD_DCtx_setMaxWindowSize() : + * note : no direct equivalence in ZSTD_DCtx_setParameter, + * since this version sets windowSize, and the other sets windowLog */ +size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(ZSTD_d_windowLogMax); + size_t const min = (size_t)1 << bounds.lowerBound; + size_t const max = (size_t)1 << bounds.upperBound; + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + RETURN_ERROR_IF(maxWindowSize < min, parameter_outOfBound, ""); + RETURN_ERROR_IF(maxWindowSize > max, parameter_outOfBound, ""); + dctx->maxWindowSize = maxWindowSize; + return 0; +} + +size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format) +{ + return ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, format); +} + +ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) +{ + ZSTD_bounds bounds = { 0, 0, 0 }; + switch(dParam) { + case ZSTD_d_windowLogMax: + bounds.lowerBound = ZSTD_WINDOWLOG_ABSOLUTEMIN; + bounds.upperBound = ZSTD_WINDOWLOG_MAX; + return bounds; + case ZSTD_d_format: + bounds.lowerBound = (int)ZSTD_f_zstd1; + bounds.upperBound = (int)ZSTD_f_zstd1_magicless; + ZSTD_STATIC_ASSERT(ZSTD_f_zstd1 < ZSTD_f_zstd1_magicless); + return bounds; + case ZSTD_d_stableOutBuffer: + bounds.lowerBound = (int)ZSTD_obm_buffered; + bounds.upperBound = (int)ZSTD_obm_stable; + return bounds; + default:; + } + bounds.error = ERROR(parameter_unsupported); + return bounds; +} + +/* ZSTD_dParam_withinBounds: + * @return 1 if value is within dParam bounds, + * 0 otherwise */ +static int ZSTD_dParam_withinBounds(ZSTD_dParameter dParam, int value) +{ + ZSTD_bounds const bounds = ZSTD_dParam_getBounds(dParam); + if (ZSTD_isError(bounds.error)) return 0; + if (value < bounds.lowerBound) return 0; + if (value > bounds.upperBound) return 0; + return 1; +} + +#define CHECK_DBOUNDS(p,v) 
{ \ + RETURN_ERROR_IF(!ZSTD_dParam_withinBounds(p, v), parameter_outOfBound, ""); \ +} + +size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value) +{ + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + switch(dParam) { + case ZSTD_d_windowLogMax: + if (value == 0) value = ZSTD_WINDOWLOG_LIMIT_DEFAULT; + CHECK_DBOUNDS(ZSTD_d_windowLogMax, value); + dctx->maxWindowSize = ((size_t)1) << value; + return 0; + case ZSTD_d_format: + CHECK_DBOUNDS(ZSTD_d_format, value); + dctx->format = (ZSTD_format_e)value; + return 0; + case ZSTD_d_stableOutBuffer: + CHECK_DBOUNDS(ZSTD_d_stableOutBuffer, value); + dctx->outBufferMode = (ZSTD_outBufferMode_e)value; + return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +} + +size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) +{ + if ( (reset == ZSTD_reset_session_only) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(dctx->streamStage != zdss_init, stage_wrong, ""); + ZSTD_clearDict(dctx); + dctx->format = ZSTD_f_zstd1; + dctx->maxWindowSize = ZSTD_MAXWINDOWSIZE_DEFAULT; + } + return 0; +} + + +size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) +{ + return ZSTD_sizeof_DCtx(dctx); +} + +size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) +{ + size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + unsigned long long const neededRBSize = windowSize + blockSize + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, + frameParameter_windowTooLarge, ""); + return minRBSize; +} + +size_t ZSTD_estimateDStreamSize(size_t windowSize) +{ + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); + size_t const inBuffSize = blockSize; /* no block can be larger */ + size_t const outBuffSize = ZSTD_decodingBufferSize_min(windowSize, ZSTD_CONTENTSIZE_UNKNOWN); + return ZSTD_estimateDCtxSize() + inBuffSize + outBuffSize; +} + +size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) +{ + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ + ZSTD_frameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); + RETURN_ERROR_IF(zfh.windowSize > windowSizeMax, + frameParameter_windowTooLarge, ""); + return ZSTD_estimateDStreamSize((size_t)zfh.windowSize); +} + + +/* ***** Decompression ***** */ + +static int ZSTD_DCtx_isOverflow(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ + return (zds->inBuffSize + zds->outBuffSize) >= (neededInBuffSize + neededOutBuffSize) * ZSTD_WORKSPACETOOLARGE_FACTOR; +} + +static void ZSTD_DCtx_updateOversizedDuration(ZSTD_DStream* zds, size_t const neededInBuffSize, size_t const neededOutBuffSize) +{ + if (ZSTD_DCtx_isOverflow(zds, neededInBuffSize, neededOutBuffSize)) + zds->oversizedDuration++; + else + zds->oversizedDuration = 0; +} + +static int ZSTD_DCtx_isOversizedTooLong(ZSTD_DStream* zds) +{ + return zds->oversizedDuration >= ZSTD_WORKSPACETOOLARGE_MAXDURATION; +} + +/* Checks that the output buffer hasn't changed if 
ZSTD_obm_stable is used. */ +static size_t ZSTD_checkOutBuffer(ZSTD_DStream const* zds, ZSTD_outBuffer const* output) +{ + ZSTD_outBuffer const expect = zds->expectedOutBuffer; + /* No requirement when ZSTD_obm_stable is not enabled. */ + if (zds->outBufferMode != ZSTD_obm_stable) + return 0; + /* Any buffer is allowed in zdss_init, this must be the same for every other call until + * the context is reset. + */ + if (zds->streamStage == zdss_init) + return 0; + /* The buffer must match our expectation exactly. */ + if (expect.dst == output->dst && expect.pos == output->pos && expect.size == output->size) + return 0; + RETURN_ERROR(dstBuffer_wrong, "ZSTD_obm_stable enabled but output differs!"); +} + +/* Calls ZSTD_decompressContinue() with the right parameters for ZSTD_decompressStream() + * and updates the stage and the output buffer state. This call is extracted so it can be + * used both when reading directly from the ZSTD_inBuffer, and in buffered input mode. + * NOTE: You must break after calling this function since the streamStage is modified. + */ +static size_t ZSTD_decompressContinueStream( + ZSTD_DStream* zds, char** op, char* oend, + void const* src, size_t srcSize) { + int const isSkipFrame = ZSTD_isSkipFrame(zds); + if (zds->outBufferMode == ZSTD_obm_buffered) { + size_t const dstSize = isSkipFrame ? 0 : zds->outBuffSize - zds->outStart; + size_t const decodedSize = ZSTD_decompressContinue(zds, + zds->outBuff + zds->outStart, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + if (!decodedSize && !isSkipFrame) { + zds->streamStage = zdss_read; + } else { + zds->outEnd = zds->outStart + decodedSize; + zds->streamStage = zdss_flush; + } + } else { + /* Write directly into the output buffer */ + size_t const dstSize = isSkipFrame ? 0 : oend - *op; + size_t const decodedSize = ZSTD_decompressContinue(zds, *op, dstSize, src, srcSize); + FORWARD_IF_ERROR(decodedSize, ""); + *op += decodedSize; + /* Flushing is not needed. */ + zds->streamStage = zdss_read; + assert(*op <= oend); + assert(zds->outBufferMode == ZSTD_obm_stable); + } + return 0; +} + +size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input) +{ + const char* const src = (const char*)input->src; + const char* const istart = input->pos != 0 ? src + input->pos : src; + const char* const iend = input->size != 0 ? src + input->size : src; + const char* ip = istart; + char* const dst = (char*)output->dst; + char* const ostart = output->pos != 0 ? dst + output->pos : dst; + char* const oend = output->size != 0 ? dst + output->size : dst; + char* op = ostart; + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, + "forbidden. in: pos: %u vs size: %u", + (U32)input->pos, (U32)input->size); + RETURN_ERROR_IF( + output->pos > output->size, + dstSize_tooSmall, + "forbidden. 
out: pos: %u vs size: %u", + (U32)output->pos, (U32)output->size); + DEBUGLOG(5, "input size : %u", (U32)(input->size - input->pos)); + FORWARD_IF_ERROR(ZSTD_checkOutBuffer(zds, output), ""); + + while (someMoreWork) { + switch(zds->streamStage) + { + case zdss_init : + DEBUGLOG(5, "stage zdss_init => transparent reset "); + zds->streamStage = zdss_loadHeader; + zds->lhSize = zds->inPos = zds->outStart = zds->outEnd = 0; + zds->legacyVersion = 0; + zds->hostageByte = 0; + zds->expectedOutBuffer = *output; + /* fall-through */ + + case zdss_loadHeader : + DEBUGLOG(5, "stage zdss_loadHeader (srcSize : %u)", (U32)(iend - ip)); +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + if (zds->legacyVersion) { + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, zds->legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; + return hint; + } } +#endif + { size_t const hSize = ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format); + DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { +#if defined(ZSTD_LEGACY_SUPPORT) && (ZSTD_LEGACY_SUPPORT>=1) + U32 const legacyVersion = ZSTD_isLegacy(istart, iend-istart); + if (legacyVersion) { + ZSTD_DDict const* const ddict = ZSTD_getDDict(zds); + const void* const dict = ddict ? ZSTD_DDict_dictContent(ddict) : NULL; + size_t const dictSize = ddict ? ZSTD_DDict_dictSize(ddict) : 0; + DEBUGLOG(5, "ZSTD_decompressStream: detected legacy version v0.%u", legacyVersion); + RETURN_ERROR_IF(zds->staticSize, memory_allocation, + "legacy support is incompatible with static dctx"); + FORWARD_IF_ERROR(ZSTD_initLegacyStream(&zds->legacyContext, + zds->previousLegacyVersion, legacyVersion, + dict, dictSize), ""); + zds->legacyVersion = zds->previousLegacyVersion = legacyVersion; + { size_t const hint = ZSTD_decompressLegacyStream(zds->legacyContext, legacyVersion, output, input); + if (hint==0) zds->streamStage = zdss_init; /* or stay in stage zdss_loadHeader */ + return hint; + } } +#endif + return hSize; /* error */ + } + if (hSize != 0) { /* need more input */ + size_t const toLoad = hSize - zds->lhSize; /* if hSize!=0, hSize > zds->lhSize */ + size_t const remainingInput = (size_t)(iend-ip); + assert(iend >= ip); + if (toLoad > remainingInput) { /* not enough input to load full header */ + if (remainingInput > 0) { + memcpy(zds->headerBuffer + zds->lhSize, ip, remainingInput); + zds->lhSize += remainingInput; + } + input->pos = input->size; + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); + memcpy(zds->headerBuffer + zds->lhSize, ip, toLoad); zds->lhSize = hSize; ip += toLoad; + break; + } } + + /* check for single-pass mode opportunity */ + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { + size_t const cSize = ZSTD_findFrameCompressedSize(istart, iend-istart); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, oend-op, istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") + ip = istart + 
cSize; + op += decompressedSize; + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } } + + /* Check output buffer is large enough for ZSTD_obm_stable. */ + if (zds->outBufferMode == ZSTD_obm_stable + && zds->fParams.frameType != ZSTD_skippableFrame + && zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && (U64)(size_t)(oend-op) < zds->fParams.frameContentSize) { + RETURN_ERROR(dstSize_tooSmall, "ZSTD_obm_stable passed but ZSTD_outBuffer is too small"); + } + + /* Consume header (see ZSTDds_decodeFrameHeader) */ + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + + if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { + FORWARD_IF_ERROR(ZSTD_decodeFrameHeader(zds, zds->headerBuffer, zds->lhSize), ""); + zds->expected = ZSTD_blockHeaderSize; + zds->stage = ZSTDds_decodeBlockHeader; + } + + /* control buffer memory usage */ + DEBUGLOG(4, "Control max memory usage (%u KB <= max %u KB)", + (U32)(zds->fParams.windowSize >>10), + (U32)(zds->maxWindowSize >> 10) ); + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_obm_buffered + ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); + + { int const tooSmall = (zds->inBuffSize < neededInBuffSize) || (zds->outBuffSize < neededOutBuffSize); + int const tooLarge = ZSTD_DCtx_isOversizedTooLong(zds); + + if (tooSmall || tooLarge) { + size_t const bufferSize = neededInBuffSize + neededOutBuffSize; + DEBUGLOG(4, "inBuff : from %u to %u", + (U32)zds->inBuffSize, (U32)neededInBuffSize); + DEBUGLOG(4, "outBuff : from %u to %u", + (U32)zds->outBuffSize, (U32)neededOutBuffSize); + if (zds->staticSize) { /* static DCtx */ + DEBUGLOG(4, "staticSize : %u", (U32)zds->staticSize); + assert(zds->staticSize >= sizeof(ZSTD_DCtx)); /* controlled at init */ + RETURN_ERROR_IF( + bufferSize > zds->staticSize - sizeof(ZSTD_DCtx), + memory_allocation, ""); + } else { + ZSTD_free(zds->inBuff, zds->customMem); + zds->inBuffSize = 0; + zds->outBuffSize = 0; + zds->inBuff = (char*)ZSTD_malloc(bufferSize, zds->customMem); + RETURN_ERROR_IF(zds->inBuff == NULL, memory_allocation, ""); + } + zds->inBuffSize = neededInBuffSize; + zds->outBuff = zds->inBuff + zds->inBuffSize; + zds->outBuffSize = neededOutBuffSize; + } } } + zds->streamStage = zdss_read; + /* fall-through */ + + case zdss_read: + DEBUGLOG(5, "stage zdss_read"); + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip); + DEBUGLOG(5, "neededInSize = %u", (U32)neededInSize); + if (neededInSize==0) { /* end of frame */ + zds->streamStage = zdss_init; + someMoreWork = 0; + break; + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; + } } + if 
(ip==iend) { someMoreWork = 0; break; } /* no more input */ + zds->streamStage = zdss_load; + /* fall-through */ + + case zdss_load: + { size_t const neededInSize = ZSTD_nextSrcSizeToDecompress(zds); + size_t const toLoad = neededInSize - zds->inPos; + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ + assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { + RETURN_ERROR_IF(toLoad > zds->inBuffSize - zds->inPos, + corruption_detected, + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, iend-ip); + } + ip += loadedSize; + zds->inPos += loadedSize; + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ + zds->inPos = 0; /* input is consumed */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, zds->inBuff, neededInSize), ""); + /* Function modifies the stage so we must break */ + break; + } + case zdss_flush: + { size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, oend-op, zds->outBuff + zds->outStart, toFlushSize); + op += flushedSize; + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) + && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); + zds->outStart = zds->outEnd = 0; + } + break; + } } + /* cannot complete flush */ + someMoreWork = 0; + break; + + default: + assert(0); /* impossible */ + RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ + } } + + /* result */ + input->pos = (size_t)(ip - (const char*)(input->src)); + output->pos = (size_t)(op - (char*)(output->dst)); + + /* Update the expected output buffer for ZSTD_obm_stable. 
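On the next call, ZSTD_checkOutBuffer() verifies that the caller passed back this exact buffer.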
*/ + zds->expectedOutBuffer = *output; + + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { + RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); + RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); + assert(0); + } + } else { + zds->noForwardProgress = 0; + } + { size_t nextSrcSizeHint = ZSTD_nextSrcSizeToDecompress(zds); + if (!nextSrcSizeHint) { /* frame fully decoded */ + if (zds->outEnd == zds->outStart) { /* output fully flushed */ + if (zds->hostageByte) { + if (input->pos >= input->size) { + /* can't release hostage (not present) */ + zds->streamStage = zdss_read; + return 1; + } + input->pos++; /* release hostage */ + } /* zds->hostageByte */ + return 0; + } /* zds->outEnd == zds->outStart */ + if (!zds->hostageByte) { /* output not fully flushed; keep last byte as hostage; will be released when all output is flushed */ + input->pos--; /* note : pos > 0, otherwise, impossible to finish reading last block */ + zds->hostageByte=1; + } + return 1; + } /* nextSrcSizeHint==0 */ + nextSrcSizeHint += ZSTD_blockHeaderSize * (ZSTD_nextInputType(zds) == ZSTDnit_block); /* preload header of next block */ + assert(zds->inPos <= nextSrcSizeHint); + nextSrcSizeHint -= zds->inPos; /* part already loaded*/ + return nextSrcSizeHint; + } +} + +size_t ZSTD_decompressStream_simpleArgs ( + ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) +{ + ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; + ZSTD_inBuffer input = { src, srcSize, *srcPos }; + /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ + size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); + *dstPos = output.pos; + *srcPos = input.pos; + return cErr; +} +/**** ended inlining decompress/zstd_decompress.c ****/ +/**** start inlining decompress/zstd_decompress_block.c ****/ +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the + * LICENSE file in the root directory of this source tree) and the GPLv2 (found + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + */ + +/* zstd_decompress_block : + * this module takes care of decompressing _compressed_ block */ + +/*-******************************************************* +* Dependencies +*********************************************************/ +#include /* memcpy, memmove, memset */ +/**** skipping file: ../common/compiler.h ****/ +/**** skipping file: ../common/cpu.h ****/ +/**** skipping file: ../common/mem.h ****/ +#define FSE_STATIC_LINKING_ONLY +/**** skipping file: ../common/fse.h ****/ +#define HUF_STATIC_LINKING_ONLY +/**** skipping file: ../common/huf.h ****/ +/**** skipping file: ../common/zstd_internal.h ****/ +/**** skipping file: zstd_decompress_internal.h ****/ +/**** skipping file: zstd_ddict.h ****/ +/**** skipping file: zstd_decompress_block.h ****/ + +/*_******************************************************* +* Macros +**********************************************************/ + +/* These two optional macros force the use one way or another of the two + * ZSTD_decompressSequences implementations. You can't force in both directions + * at the same time. 
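+ * e.g. (illustrative build line) : compiling the library with
+ *     -DZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT
+ * always selects the non-prefetching ZSTD_decompressSequences() path;
+ * defining both macros trips the #error just below.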
+ */ +#if defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +#error "Cannot force the use of the short and the long ZSTD_decompressSequences variants!" +#endif + + +/*_******************************************************* +* Memory operations +**********************************************************/ +static void ZSTD_copy4(void* dst, const void* src) { memcpy(dst, src, 4); } + + +/*-************************************************************* + * Block decoding + ***************************************************************/ + +/*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr) +{ + RETURN_ERROR_IF(srcSize < ZSTD_blockHeaderSize, srcSize_wrong, ""); + + { U32 const cBlockHeader = MEM_readLE24(src); + U32 const cSize = cBlockHeader >> 3; + bpPtr->lastBlock = cBlockHeader & 1; + bpPtr->blockType = (blockType_e)((cBlockHeader >> 1) & 3); + bpPtr->origSize = cSize; /* only useful for RLE */ + if (bpPtr->blockType == bt_rle) return 1; + RETURN_ERROR_IF(bpPtr->blockType == bt_reserved, corruption_detected, ""); + return cSize; + } +} + + +/* Hidden declaration for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize); +/*! ZSTD_decodeLiteralsBlock() : + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize) /* note : srcSize < BLOCKSIZE */ +{ + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock"); + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); + + switch(litEncType) + { + case set_repeat: + DEBUGLOG(5, "set_repeat flag : re-using stats from previous compressed literals block"); + RETURN_ERROR_IF(dctx->litEntropy==0, dictionary_corrupted, ""); + /* fall-through */ + + case set_compressed: + RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ + /* 2 - 2 - 10 - 10 */ + singleStream = !lhlCode; + lhSize = 3; + litSize = (lhc >> 4) & 0x3FF; + litCSize = (lhc >> 14) & 0x3FF; + break; + case 2: + /* 2 - 2 - 14 - 14 */ + lhSize = 4; + litSize = (lhc >> 4) & 0x3FFF; + litCSize = lhc >> 18; + break; + case 3: + /* 2 - 2 - 18 - 18 */ + lhSize = 5; + litSize = (lhc >> 4) & 0x3FFFF; + litCSize = (lhc >> 22) + ((size_t)istart[4] << 10); + break; + } + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + + /* prefetch huffman table if cold */ + if (dctx->ddictIsCold && (litSize > 768 /* heuristic */)) { + PREFETCH_AREA(dctx->HUFptr, sizeof(dctx->entropy.hufTable)); + } + + if (litEncType==set_repeat) { + if (singleStream) { + hufSuccess = HUF_decompress1X_usingDTable_bmi2( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + dctx->HUFptr, dctx->bmi2); + } else { + hufSuccess = HUF_decompress4X_usingDTable_bmi2( + dctx->litBuffer, litSize, istart+lhSize, litCSize, + 
dctx->HUFptr, dctx->bmi2); + } + } else { + if (singleStream) { +#if defined(HUF_FORCE_DECOMPRESS_X2) + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace)); +#else + hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), dctx->bmi2); +#endif + } else { + hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, + sizeof(dctx->workspace), dctx->bmi2); + } + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); + + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + dctx->litEntropy = 1; + if (litEncType==set_compressed) dctx->HUFptr = dctx->entropy.hufTable; + memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + return litCSize + lhSize; + } + + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + litSize = MEM_readLE24(istart) >> 4; + break; + } + + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ + RETURN_ERROR_IF(litSize+lhSize > srcSize, corruption_detected, ""); + memcpy(dctx->litBuffer, istart+lhSize, litSize); + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + memset(dctx->litBuffer + dctx->litSize, 0, WILDCOPY_OVERLENGTH); + return lhSize+litSize; + } + /* direct reference into compressed stream */ + dctx->litPtr = istart+lhSize; + dctx->litSize = litSize; + return lhSize+litSize; + } + + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ + lhSize = 1; + litSize = istart[0] >> 3; + break; + case 1: + lhSize = 2; + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; + litSize = MEM_readLE24(istart) >> 4; + RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); + memset(dctx->litBuffer, istart[lhSize], litSize + WILDCOPY_OVERLENGTH); + dctx->litPtr = dctx->litBuffer; + dctx->litSize = litSize; + return lhSize+1; + } + default: + RETURN_ERROR(corruption_detected, "impossible"); + } + } +} + +/* Default FSE distribution tables. 
+ * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/master/doc/zstd_compression_format.md#default-distributions + * They were generated programmatically with following method : + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables + * - pretify output, report below, test with fuzzer to ensure it's correct */ + +/* Default FSE distribution table for Literal Lengths */ +static const ZSTD_seqSymbol LL_defaultDTable[(1<tableLog = 0; + DTableH->fastMode = 0; + + cell->nbBits = 0; + cell->nextState = 0; + assert(nbAddBits < 255); + cell->nbAdditionalBits = (BYTE)nbAddBits; + cell->baseValue = baseValue; +} + + +/* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) + * cannot fail if input is valid => + * all inputs are presumed validated at this stage */ +void +ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + const short* normalizedCounter, unsigned maxSymbolValue, + const U32* baseValue, const U32* nbAdditionalBits, + unsigned tableLog) +{ + ZSTD_seqSymbol* const tableDecode = dt+1; + U16 symbolNext[MaxSeq+1]; + + U32 const maxSV1 = maxSymbolValue + 1; + U32 const tableSize = 1 << tableLog; + U32 highThreshold = tableSize-1; + + /* Sanity Checks */ + assert(maxSymbolValue <= MaxSeq); + assert(tableLog <= MaxFSELog); + + /* Init, lay down lowprob symbols */ + { ZSTD_seqSymbol_header DTableH; + DTableH.tableLog = tableLog; + DTableH.fastMode = 1; + { S16 const largeLimit= (S16)(1 << (tableLog-1)); + U32 s; + for (s=0; s= largeLimit) DTableH.fastMode=0; + assert(normalizedCounter[s]>=0); + symbolNext[s] = (U16)normalizedCounter[s]; + } } } + memcpy(dt, &DTableH, sizeof(DTableH)); + } + + /* Spread symbols */ + { U32 const tableMask = tableSize-1; + U32 const step = FSE_TABLESTEP(tableSize); + U32 s, position = 0; + for (s=0; s highThreshold) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } + + /* Build Decoding table */ + { U32 u; + for (u=0; u max, corruption_detected, ""); + { U32 const symbol = *(const BYTE*)src; + U32 const baseline = baseValue[symbol]; + U32 const nbBits = nbAdditionalBits[symbol]; + ZSTD_buildSeqTable_rle(DTableSpace, baseline, nbBits); + } + *DTablePtr = DTableSpace; + return 1; + case set_basic : + *DTablePtr = defaultTable; + return 0; + case set_repeat: + RETURN_ERROR_IF(!flagRepeatTable, corruption_detected, ""); + /* prefetch FSE table if used */ + if (ddictIsCold && (nbSeq > 24 /* heuristic */)) { + const void* const pStart = *DTablePtr; + size_t const pSize = sizeof(ZSTD_seqSymbol) * (SEQSYMBOL_TABLE_SIZE(maxLog)); + PREFETCH_AREA(pStart, pSize); + } + return 0; + case set_compressed : + { unsigned tableLog; + S16 norm[MaxSeq+1]; + size_t const headerSize = FSE_readNCount(norm, &max, &tableLog, src, srcSize); + RETURN_ERROR_IF(FSE_isError(headerSize), corruption_detected, ""); + RETURN_ERROR_IF(tableLog > maxLog, corruption_detected, ""); + ZSTD_buildFSETable(DTableSpace, norm, max, baseValue, nbAdditionalBits, tableLog); + *DTablePtr = DTableSpace; + return headerSize; + } + default : + assert(0); + RETURN_ERROR(GENERIC, "impossible"); + } +} + +size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize) +{ + const BYTE* const istart = (const BYTE* const)src; + 
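+ /* Illustrative note (not upstream text) : the sequence count decoded just
+  * below is variable-length :
+  *   byte0 < 128        : nbSeq = byte0                        (1 byte)
+  *   128 <= byte0 < 255 : nbSeq = ((byte0-0x80)<<8) + byte1    (2 bytes)
+  *   byte0 == 255       : nbSeq = readLE16(bytes 1-2) + 0x7F00 (3 bytes)
+  * e.g. the two bytes {0x81, 0x2C} decode to (1<<8) + 0x2C = 300 sequences.
+  */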
const BYTE* const iend = istart + srcSize; + const BYTE* ip = istart; + int nbSeq; + DEBUGLOG(5, "ZSTD_decodeSeqHeaders"); + + /* check */ + RETURN_ERROR_IF(srcSize < MIN_SEQUENCES_SIZE, srcSize_wrong, ""); + + /* SeqHead */ + nbSeq = *ip++; + if (!nbSeq) { + *nbSeqPtr=0; + RETURN_ERROR_IF(srcSize != 1, srcSize_wrong, ""); + return 1; + } + if (nbSeq > 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); + nbSeq = MEM_readLE16(ip) + LONGNBSEQ, ip+=2; + } else { + RETURN_ERROR_IF(ip >= iend, srcSize_wrong, ""); + nbSeq = ((nbSeq-0x80)<<8) + *ip++; + } + } + *nbSeqPtr = nbSeq; + + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ + { size_t const llhSize = ZSTD_buildSeqTable(dctx->entropy.LLTable, &dctx->LLTptr, + LLtype, MaxLL, LLFSELog, + ip, iend-ip, + LL_base, LL_bits, + LL_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); + RETURN_ERROR_IF(ZSTD_isError(llhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += llhSize; + } + + { size_t const ofhSize = ZSTD_buildSeqTable(dctx->entropy.OFTable, &dctx->OFTptr, + OFtype, MaxOff, OffFSELog, + ip, iend-ip, + OF_base, OF_bits, + OF_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); + RETURN_ERROR_IF(ZSTD_isError(ofhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += ofhSize; + } + + { size_t const mlhSize = ZSTD_buildSeqTable(dctx->entropy.MLTable, &dctx->MLTptr, + MLtype, MaxML, MLFSELog, + ip, iend-ip, + ML_base, ML_bits, + ML_defaultDTable, dctx->fseEntropy, + dctx->ddictIsCold, nbSeq); + RETURN_ERROR_IF(ZSTD_isError(mlhSize), corruption_detected, "ZSTD_buildSeqTable failed"); + ip += mlhSize; + } + } + + return ip-istart; +} + + +typedef struct { + size_t litLength; + size_t matchLength; + size_t offset; + const BYTE* match; +} seq_t; + +typedef struct { + size_t state; + const ZSTD_seqSymbol* table; +} ZSTD_fseState; + +typedef struct { + BIT_DStream_t DStream; + ZSTD_fseState stateLL; + ZSTD_fseState stateOffb; + ZSTD_fseState stateML; + size_t prevOffset[ZSTD_REP_NUM]; + const BYTE* prefixStart; + const BYTE* dictEnd; + size_t pos; +} seqState_t; + +/*! ZSTD_overlapCopy8() : + * Copies 8 bytes from ip to op and updates op and ip where ip <= op. + * If the offset is < 8 then the offset is spread to at least 8 bytes. + * + * Precondition: *ip <= *op + * Postcondition: *op - *op >= 8 + */ +HINT_INLINE void ZSTD_overlapCopy8(BYTE** op, BYTE const** ip, size_t offset) { + assert(*ip <= *op); + if (offset < 8) { + /* close range match, overlap */ + static const U32 dec32table[] = { 0, 1, 2, 1, 4, 4, 4, 4 }; /* added */ + static const int dec64table[] = { 8, 8, 8, 7, 8, 9,10,11 }; /* subtracted */ + int const sub2 = dec64table[offset]; + (*op)[0] = (*ip)[0]; + (*op)[1] = (*ip)[1]; + (*op)[2] = (*ip)[2]; + (*op)[3] = (*ip)[3]; + *ip += dec32table[offset]; + ZSTD_copy4(*op+4, *ip); + *ip -= sub2; + } else { + ZSTD_copy8(*op, *ip); + } + *ip += 8; + *op += 8; + assert(*op - *ip >= 8); +} + +/*! ZSTD_safecopy() : + * Specialized version of memcpy() that is allowed to READ up to WILDCOPY_OVERLENGTH past the input buffer + * and write up to 16 bytes past oend_w (op >= oend_w is allowed). 
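+ * (It relies on ZSTD_overlapCopy8() above for overlapping copies. Worked
+ *  example of that helper, with offset==2 : the four byte-wise copies emit
+ *  two periods of the repeating 2-byte pattern, ip advances by
+ *  dec32table[2]==2 before the 4-byte copy that fills op[4..7], then
+ *  retreats by dec64table[2]==8; after the closing "ip += 8; op += 8" the
+ *  distance op-ip has widened from 2 to 8, so later iterations can copy
+ *  8 bytes at a time.)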
+ * This function is only called in the uncommon case where the sequence is near the end of the block. It + * should be fast for a single long sequence, but can be slow for several short sequences. + * + * @param ovtype controls the overlap detection + * - ZSTD_no_overlap: The source and destination are guaranteed to be at least WILDCOPY_VECLEN bytes apart. + * - ZSTD_overlap_src_before_dst: The src and dst may overlap and may be any distance apart. + * The src buffer must be before the dst buffer. + */ +static void ZSTD_safecopy(BYTE* op, BYTE* const oend_w, BYTE const* ip, ptrdiff_t length, ZSTD_overlap_e ovtype) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + + assert((ovtype == ZSTD_no_overlap && (diff <= -8 || diff >= 8 || op >= oend_w)) || + (ovtype == ZSTD_overlap_src_before_dst && diff >= 0)); + + if (length < 8) { + /* Handle short lengths. */ + while (op < oend) *op++ = *ip++; + return; + } + if (ovtype == ZSTD_overlap_src_before_dst) { + /* Copy 8 bytes and ensure the offset >= 8 when there can be overlap. */ + assert(length >= 8); + ZSTD_overlapCopy8(&op, &ip, diff); + assert(op - ip >= 8); + assert(op <= oend); + } + + if (oend <= oend_w) { + /* No risk of overwrite. */ + ZSTD_wildcopy(op, ip, length, ovtype); + return; + } + if (op <= oend_w) { + /* Wildcopy until we get close to the end. */ + assert(oend > oend_w); + ZSTD_wildcopy(op, ip, oend_w - op, ovtype); + ip += oend_w - op; + op = oend_w; + } + /* Handle the leftovers. */ + while (op < oend) *op++ = *ip++; +} + +/* ZSTD_execSequenceEnd(): + * This version handles cases that are near the end of the output buffer. It requires + * more careful checks to make sure there is no overflow. By separating out these hard + * and unlikely cases, we can speed up the common cases. + * + * NOTE: This function needs to be fast for a single long sequence, but doesn't need + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). 
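+ * (Entered from ZSTD_execSequence() below when literals would be read past
+ *  litLimit, when the match end lands within WILDCOPY_OVERLENGTH of oend, or,
+ *  in 32-bit mode, when oend - op < sequenceLength + WILDCOPY_OVERLENGTH.)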
+ */ +FORCE_NOINLINE +size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; + + /* bounds checks : careful of address space overflow in 32-bit mode */ + RETURN_ERROR_IF(sequenceLength > (size_t)(oend - op), dstSize_tooSmall, "last match must fit within dstBuffer"); + RETURN_ERROR_IF(sequence.litLength > (size_t)(litLimit - *litPtr), corruption_detected, "try to read beyond literal buffer"); + assert(op < op + sequenceLength); + assert(oLitEnd < op + sequenceLength); + + /* copy literals */ + ZSTD_safecopy(op, oend_w, *litPtr, sequence.litLength, ZSTD_no_overlap); + op = oLitEnd; + *litPtr = iLitEnd; + + /* copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix */ + RETURN_ERROR_IF(sequence.offset > (size_t)(oLitEnd - virtualStart), corruption_detected, ""); + match = dictEnd - (prefixStart-match); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } } + ZSTD_safecopy(op, oend_w, match, sequence.matchLength, ZSTD_overlap_src_before_dst); + return sequenceLength; +} + +HINT_INLINE +size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, + const BYTE* const prefixStart, const BYTE* const virtualStart, const BYTE* const dictEnd) +{ + BYTE* const oLitEnd = op + sequence.litLength; + size_t const sequenceLength = sequence.litLength + sequence.matchLength; + BYTE* const oMatchEnd = op + sequenceLength; /* risk : address space overflow (32-bits) */ + BYTE* const oend_w = oend - WILDCOPY_OVERLENGTH; /* risk : address space underflow on oend=NULL */ + const BYTE* const iLitEnd = *litPtr + sequence.litLength; + const BYTE* match = oLitEnd - sequence.offset; + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend + * - 32-bit mode and the match length overflows + */ + if (UNLIKELY( + iLitEnd > litLimit || + oMatchEnd > oend_w || + (MEM_32bits() && (size_t)(oend - op) < sequenceLength + WILDCOPY_OVERLENGTH))) + return ZSTD_execSequenceEnd(op, oend, sequence, litPtr, litLimit, prefixStart, virtualStart, dictEnd); + + /* Assumptions (everything else goes into ZSTD_execSequenceEnd()) */ + assert(op <= oLitEnd /* No overflow */); + assert(oLitEnd < oMatchEnd /* Non-zero match & no overflow */); + assert(oMatchEnd <= oend /* No underflow */); + assert(iLitEnd <= litLimit /* Literal length is in bounds */); + assert(oLitEnd <= oend_w /* Can wildcopy literals */); + assert(oMatchEnd <= oend_w /* Can wildcopy matches */); + + /* Copy Literals: + * Split out litLength <= 16 since it is nearly always true. +1.6% on gcc-9. + * We likely don't need the full 32-byte wildcopy. 
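+ * (ZSTD_copy16() below covers litLength <= 16 in one shot; only longer
+ *  literal runs fall through to ZSTD_wildcopy().)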
+ */ + assert(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(op, (*litPtr)); + if (UNLIKELY(sequence.litLength > 16)) { + ZSTD_wildcopy(op+16, (*litPtr)+16, sequence.litLength-16, ZSTD_no_overlap); + } + op = oLitEnd; + *litPtr = iLitEnd; /* update for next sequence */ + + /* Copy Match */ + if (sequence.offset > (size_t)(oLitEnd - prefixStart)) { + /* offset beyond prefix -> go into extDict */ + RETURN_ERROR_IF(UNLIKELY(sequence.offset > (size_t)(oLitEnd - virtualStart)), corruption_detected, ""); + match = dictEnd + (match - prefixStart); + if (match + sequence.matchLength <= dictEnd) { + memmove(oLitEnd, match, sequence.matchLength); + return sequenceLength; + } + /* span extDict & currentPrefixSegment */ + { size_t const length1 = dictEnd - match; + memmove(oLitEnd, match, length1); + op = oLitEnd + length1; + sequence.matchLength -= length1; + match = prefixStart; + } } + /* Match within prefix of 1 or more bytes */ + assert(op <= oMatchEnd); + assert(oMatchEnd <= oend_w); + assert(match >= prefixStart); + assert(sequence.matchLength >= 1); + + /* Nearly all offsets are >= WILDCOPY_VECLEN bytes, which means we can use wildcopy + * without overlap checking. + */ + if (LIKELY(sequence.offset >= WILDCOPY_VECLEN)) { + /* We bet on a full wildcopy for matches, since we expect matches to be + * longer than literals (in general). In silesia, ~10% of matches are longer + * than 16 bytes. + */ + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength, ZSTD_no_overlap); + return sequenceLength; + } + assert(sequence.offset < WILDCOPY_VECLEN); + + /* Copy 8 bytes and spread the offset to be >= 8. */ + ZSTD_overlapCopy8(&op, &match, sequence.offset); + + /* If the match length is > 8 bytes, then continue with the wildcopy. */ + if (sequence.matchLength > 8) { + assert(op < oMatchEnd); + ZSTD_wildcopy(op, match, (ptrdiff_t)sequence.matchLength-8, ZSTD_overlap_src_before_dst); + } + return sequenceLength; +} + +static void +ZSTD_initFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, const ZSTD_seqSymbol* dt) +{ + const void* ptr = dt; + const ZSTD_seqSymbol_header* const DTableH = (const ZSTD_seqSymbol_header*)ptr; + DStatePtr->state = BIT_readBits(bitD, DTableH->tableLog); + DEBUGLOG(6, "ZSTD_initFseState : val=%u using %u bits", + (U32)DStatePtr->state, DTableH->tableLog); + BIT_reloadDStream(bitD); + DStatePtr->table = dt + 1; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseState(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD) +{ + ZSTD_seqSymbol const DInfo = DStatePtr->table[DStatePtr->state]; + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.nextState + lowBits; +} + +FORCE_INLINE_TEMPLATE void +ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, ZSTD_seqSymbol const DInfo) +{ + U32 const nbBits = DInfo.nbBits; + size_t const lowBits = BIT_readBits(bitD, nbBits); + DStatePtr->state = DInfo.nextState + lowBits; +} + +/* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum + * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +#define LONG_OFFSETS_MAX_EXTRA_BITS_32 \ + (ZSTD_WINDOWLOG_MAX_32 > STREAM_ACCUMULATOR_MIN_32 \ + ? 
ZSTD_WINDOWLOG_MAX_32 - STREAM_ACCUMULATOR_MIN_32 \ + : 0) + +typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; +typedef enum { ZSTD_p_noPrefetch=0, ZSTD_p_prefetch=1 } ZSTD_prefetch_e; + +FORCE_INLINE_TEMPLATE seq_t +ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const ZSTD_prefetch_e prefetch) +{ + seq_t seq; + ZSTD_seqSymbol const llDInfo = seqState->stateLL.table[seqState->stateLL.state]; + ZSTD_seqSymbol const mlDInfo = seqState->stateML.table[seqState->stateML.state]; + ZSTD_seqSymbol const ofDInfo = seqState->stateOffb.table[seqState->stateOffb.state]; + U32 const llBase = llDInfo.baseValue; + U32 const mlBase = mlDInfo.baseValue; + U32 const ofBase = ofDInfo.baseValue; + BYTE const llBits = llDInfo.nbAdditionalBits; + BYTE const mlBits = mlDInfo.nbAdditionalBits; + BYTE const ofBits = ofDInfo.nbAdditionalBits; + BYTE const totalBits = llBits+mlBits+ofBits; + + /* sequence */ + { size_t offset; + if (ofBits > 1) { + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); + assert(ofBits <= MaxOff); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { + U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); + if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); + assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); + } + seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } else { + U32 const ll0 = (llBase == 0); + if (LIKELY((ofBits == 0))) { + if (LIKELY(!ll0)) + offset = seqState->prevOffset[0]; + else { + offset = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset; + } + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; + temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; + } } } + seq.offset = offset; + } + + seq.matchLength = mlBase; + if (mlBits > 0) + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) + BIT_reloadDStream(&seqState->DStream); + if (MEM_64bits() && UNLIKELY(totalBits >= STREAM_ACCUMULATOR_MIN_64-(LLFSELog+MLFSELog+OffFSELog))) + BIT_reloadDStream(&seqState->DStream); + /* Ensure there are enough bits to read the rest of data in 64-bit mode. 
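+ * (Worked out : LLFSELog+MLFSELog+OffFSELog = 9+9+8 = 26, so the static
+ *  assert below requires 16+26 = 42 < 57, the minimum bit count the
+ *  accumulator guarantees after a reload on 64-bit hosts.)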
*/ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + + seq.litLength = llBase; + if (llBits > 0) + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) + BIT_reloadDStream(&seqState->DStream); + + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + + if (prefetch == ZSTD_p_prefetch) { + size_t const pos = seqState->pos + seq.litLength; + const BYTE* const matchBase = (seq.offset > pos) ? seqState->dictEnd : seqState->prefixStart; + seq.match = matchBase + pos - seq.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. + * No consequence though : no memory access will occur, offset is only used for prefetching */ + seqState->pos = pos + seq.matchLength; + } + + /* ANS state update + * gcc-9.0.0 does 2.5% worse with ZSTD_updateFseStateWithDInfo(). + * clang-9.2.0 does 7% worse with ZSTD_updateFseState(). + * Naturally it seems like ZSTD_updateFseStateWithDInfo() should be the + * better option, so it is the default for other compilers. But, if you + * measure that it is worse, please put up a pull request. + */ + { +#if defined(__GNUC__) && !defined(__clang__) + const int kUseUpdateFseState = 1; +#else + const int kUseUpdateFseState = 0; +#endif + if (kUseUpdateFseState) { + ZSTD_updateFseState(&seqState->stateLL, &seqState->DStream); /* <= 9 bits */ + ZSTD_updateFseState(&seqState->stateML, &seqState->DStream); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseState(&seqState->stateOffb, &seqState->DStream); /* <= 8 bits */ + } else { + ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llDInfo); /* <= 9 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlDInfo); /* <= 9 bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ + ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofDInfo); /* <= 8 bits */ + } + } + + return seq; +} + +#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) +{ + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ + if (dctx->dictContentEndForFuzzing == NULL) return 0; + /* Dictionary is our prefix. */ + if (prefixStart == dctx->dictContentBeginForFuzzing) return 1; + /* Dictionary is not our ext-dict. */ + if (dctx->dictEnd != dctx->dictContentEndForFuzzing) return 0; + /* Dictionary is not within our window size. */ + if ((size_t)(oLitEnd - prefixStart) >= windowSize) return 0; + /* Dictionary is active. 
*/ + return 1; +} + +MEM_STATIC void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) +{ + size_t const windowSize = dctx->fParams.windowSize; + size_t const sequenceSize = seq.litLength + seq.matchLength; + BYTE const* const oLitEnd = op + seq.litLength; + DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + assert(op <= oend); + assert((size_t)(oend - op) >= sequenceSize); + assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); + if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { + size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); + /* Offset must be within the dictionary. */ + assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); + assert(seq.offset <= windowSize + dictSize); + } else { + /* Offset must be within our window. */ + assert(seq.offset <= windowSize); + } +} +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +FORCE_INLINE_TEMPLATE size_t +DONT_VECTORIZE +ZSTD_decompressSequences_body( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); + DEBUGLOG(5, "ZSTD_decompressSequences_body"); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { + seqState_t seqState; + size_t error = 0; + dctx->fseEntropy = 1; + { U32 i; for (i=0; ientropy.rep[i]; } + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + + ZSTD_STATIC_ASSERT( + BIT_DStream_unfinished < BIT_DStream_completed && + BIT_DStream_endOfBuffer < BIT_DStream_completed && + BIT_DStream_completed < BIT_DStream_overflow); + +#if defined(__GNUC__) && defined(__x86_64__) + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression + * speed swings based on the alignment of the decompression loop. This + * performance swing is caused by parts of the decompression loop falling + * out of the DSB. The entire decompression loop should fit in the DSB, + * when it can't we get much worse performance. You can measure if you've + * hit the good case or the bad case with this perf command for some + * compressed file test.zst: + * + * perf stat -e cycles -e instructions -e idq.all_dsb_cycles_any_uops \ + * -e idq.all_mite_cycles_any_uops -- ./zstd -tq test.zst + * + * If you see most cycles served out of the MITE you've hit the bad case. + * If you see most cycles served out of the DSB you've hit the good case. + * If it is pretty even then you may be in an okay case. 
+ * + * I've been able to reproduce this issue on the following CPUs: + * - Kabylake: Macbook Pro (15-inch, 2019) 2.4 GHz Intel Core i9 + * Use Instruments->Counters to get DSB/MITE cycles. + * I never got performance swings, but I was able to + * go from the good case of mostly DSB to half of the + * cycles served from MITE. + * - Coffeelake: Intel i9-9900k + * + * I haven't been able to reproduce the instability or DSB misses on any + * of the following CPUS: + * - Haswell + * - Broadwell: Intel(R) Xeon(R) CPU E5-2680 v4 @ 2.40GH + * - Skylake + * + * If you are seeing performance stability this script can help test. + * It tests on 4 commits in zstd where I saw performance change. + * + * https://gist.github.com/terrelln/9889fc06a423fd5ca6e99351564473f4 + */ + __asm__(".p2align 5"); + __asm__("nop"); + __asm__(".p2align 4"); +#endif + for ( ; ; ) { + seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, ZSTD_p_noPrefetch); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); + if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); +#endif + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + BIT_reloadDStream(&(seqState.DStream)); + /* gcc and clang both don't like early returns in this loop. + * gcc doesn't like early breaks either. + * Instead save an error and report it at the end. + * When there is an error, don't increment op, so we don't + * overwrite. + */ + if (UNLIKELY(ZSTD_isError(oneSeqSize))) error = oneSeqSize; + else op += oneSeqSize; + if (UNLIKELY(!--nbSeq)) break; + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); + if (ZSTD_isError(error)) return error; + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); + RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +static size_t +ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +FORCE_INLINE_TEMPLATE size_t +ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE* const)dst; + BYTE* const oend = ostart + maxDstSize; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const 
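+    /* Illustrative note (not upstream text) : this long-offsets variant
+     * runs a short software pipeline.  STORED_SEQS (4) decoded sequences
+     * sit in a ring; each iteration decodes sequence seqNb+ADVANCED_SEQS
+     * with ZSTD_p_prefetch, which computes seq.match and prefetches it,
+     * then executes sequence seqNb, giving the prefetch time to land. */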
dictEnd = (const BYTE*) (dctx->dictEnd); + (void)frame; + + /* Regen sequences */ + if (nbSeq) { +#define STORED_SEQS 4 +#define STORED_SEQS_MASK (STORED_SEQS-1) +#define ADVANCED_SEQS 4 + seq_t sequences[STORED_SEQS]; + int const seqAdvance = MIN(nbSeq, ADVANCED_SEQS); + seqState_t seqState; + int seqNb; + dctx->fseEntropy = 1; + { int i; for (i=0; ientropy.rep[i]; } + seqState.prefixStart = prefixStart; + seqState.pos = (size_t)(op-prefixStart); + seqState.dictEnd = dictEnd; + assert(dst != NULL); + assert(iend >= ip); + RETURN_ERROR_IF( + ERR_isError(BIT_initDStream(&seqState.DStream, ip, iend-ip)), + corruption_detected, ""); + ZSTD_initFseState(&seqState.stateLL, &seqState.DStream, dctx->LLTptr); + ZSTD_initFseState(&seqState.stateOffb, &seqState.DStream, dctx->OFTptr); + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ + for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNbentropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ + { size_t const lastLLSize = litEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + memcpy(op, litPtr, lastLLSize); + op += lastLLSize; + } + } + + return op-ostart; +} + +static size_t +ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if DYNAMIC_BMI2 + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static TARGET_ATTRIBUTE("bmi2") size_t +DONT_VECTORIZE +ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +static TARGET_ATTRIBUTE("bmi2") size_t +ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +#endif /* DYNAMIC_BMI2 */ + +typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame); + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG +static size_t +ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequences"); +#if DYNAMIC_BMI2 + if (dctx->bmi2) { + return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +/* 
ZSTD_decompressSequencesLong() : + * decompression function triggered when a minimum share of offsets is considered "long", + * aka out of cache. + * note : "long" definition seems overloaded here, sometimes meaning "wider than bitstream register", and sometimes meaning "farther than memory cache distance". + * This function will try to mitigate main memory latency through the use of prefetching */ +static size_t +ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, + const ZSTD_longOffset_e isLongOffset, + const int frame) +{ + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); +#if DYNAMIC_BMI2 + if (dctx->bmi2) { + return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); + } +#endif + return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); +} +#endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +/* ZSTD_getLongOffsetsShare() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) + * compared to maximum possible of (1< 22) total += 1; + } + + assert(tableLog <= OffFSELog); + total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + + return total; +} +#endif + +size_t +ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, const int frame) +{ /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; + /* isLongOffset must be true if there are long offsets. + * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. + * We don't expect that to be the case in 64-bit mode. + * In block mode, window size is not known, so we have to be conservative. + * (note: but it could be evaluated from current-lowLimit) + */ + ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + + RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize); + DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; + } + + /* Build Decoding Tables */ + { + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. + */ +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; +#endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); + if (ZSTD_isError(seqHSize)) return seqHSize; + ip += seqHSize; + srcSize -= seqHSize; + + RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if ( !usePrefetchDecoder + && (!frame || (dctx->fParams.windowSize > (1<<24))) + && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ + U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); + U32 const minShare = MEM_64bits() ? 
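+                          /* share is scaled to OffFSELog == 8 above, i.e.
+                           * out of 256 table cells : 7/256 ~= 2.73 %,
+                           * 20/256 ~= 7.81 % */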
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ + usePrefetchDecoder = (shareLongOffsets >= minShare); + } +#endif + + dctx->ddictIsCold = 0; + +#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + if (usePrefetchDecoder) +#endif +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +#endif + +#ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); +#endif + } +} + + +void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst) +{ + if (dst != dctx->previousDstEnd) { /* not contiguous */ + dctx->dictEnd = dctx->previousDstEnd; + dctx->virtualStart = (const char*)dst - ((const char*)(dctx->previousDstEnd) - (const char*)(dctx->prefixStart)); + dctx->prefixStart = dst; + dctx->previousDstEnd = dst; + } +} + + +size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize) +{ + size_t dSize; + ZSTD_checkContinuity(dctx, dst); + dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; +} +/**** ended inlining decompress/zstd_decompress_block.c ****/ diff --git a/module/zstd/lib/zstd.h b/module/zstd/lib/zstd.h new file mode 100644 index 000000000000..b6772f8818a7 --- /dev/null +++ b/module/zstd/lib/zstd.h @@ -0,0 +1,2115 @@ +/* + * BSD 3-Clause Clear License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. All rights reserved. 
+ */ + +#if defined (__cplusplus) +extern "C" { +#endif + +#ifndef ZSTD_H_235446 +#define ZSTD_H_235446 + +/* ====== Dependency ======*/ +#include /* INT_MAX */ +#include /* size_t */ + + +/* ===== ZSTDLIB_API : control library symbols visibility ===== */ +#ifndef ZSTDLIB_VISIBILITY +# if defined(__GNUC__) && (__GNUC__ >= 4) +# define ZSTDLIB_VISIBILITY __attribute__ ((visibility ("default"))) +# else +# define ZSTDLIB_VISIBILITY +# endif +#endif +#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1) +# define ZSTDLIB_API __declspec(dllexport) ZSTDLIB_VISIBILITY +#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1) +# define ZSTDLIB_API __declspec(dllimport) ZSTDLIB_VISIBILITY /* It isn't required but allows to generate better code, saving a function pointer load from the IAT and an indirect jump.*/ +#else +# define ZSTDLIB_API ZSTDLIB_VISIBILITY +#endif + + +/******************************************************************************* + Introduction + + zstd, short for Zstandard, is a fast lossless compression algorithm, targeting + real-time compression scenarios at zlib-level and better compression ratios. + The zstd compression library provides in-memory compression and decompression + functions. + + The library supports regular compression levels from 1 up to ZSTD_maxCLevel(), + which is currently 22. Levels >= 20, labeled `--ultra`, should be used with + caution, as they require more memory. The library also offers negative + compression levels, which extend the range of speed vs. ratio preferences. + The lower the level, the faster the speed (at the cost of compression). + + Compression can be done in: + - a single step (described as Simple API) + - a single step, reusing a context (described as Explicit context) + - unbounded multiple steps (described as Streaming compression) + + The compression ratio achievable on small data can be highly improved using + a dictionary. Dictionary compression can be performed in: + - a single step (described as Simple dictionary API) + - a single step, reusing a dictionary (described as Bulk-processing + dictionary API) + + Advanced experimental functions can be accessed using + `#define ZSTD_STATIC_LINKING_ONLY` before including zstd.h. + + Advanced experimental APIs should never be used with a dynamically-linked + library. They are not "stable"; their definitions or signatures may change in + the future. Only static linking is allowed. 
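+
+  A minimal single-shot round trip, as an illustrative sketch (error handling
+  shortened; src/srcSize are the caller's input, dst/dstCapacity its output
+  buffer; handleError() stands in for the application's error path) :
+
+      size_t const bound = ZSTD_compressBound(srcSize);
+      void* const cBuf = malloc(bound);
+      size_t const cSize = ZSTD_compress(cBuf, bound, src, srcSize,
+                                         ZSTD_CLEVEL_DEFAULT);
+      if (ZSTD_isError(cSize)) handleError(ZSTD_getErrorName(cSize));
+      size_t const dSize = ZSTD_decompress(dst, dstCapacity, cBuf, cSize);
+      if (ZSTD_isError(dSize)) handleError(ZSTD_getErrorName(dSize));
+      free(cBuf);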
+*******************************************************************************/ + +/*------ Version ------*/ +#define ZSTD_VERSION_MAJOR 1 +#define ZSTD_VERSION_MINOR 4 +#define ZSTD_VERSION_RELEASE 5 + +#define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) +ZSTDLIB_API unsigned ZSTD_versionNumber(void); /**< to check runtime library version */ + +#define ZSTD_LIB_VERSION ZSTD_VERSION_MAJOR.ZSTD_VERSION_MINOR.ZSTD_VERSION_RELEASE +#define ZSTD_QUOTE(str) #str +#define ZSTD_EXPAND_AND_QUOTE(str) ZSTD_QUOTE(str) +#define ZSTD_VERSION_STRING ZSTD_EXPAND_AND_QUOTE(ZSTD_LIB_VERSION) +ZSTDLIB_API const char* ZSTD_versionString(void); /* requires v1.3.0+ */ + +/* ************************************* + * Default constant + ***************************************/ +#ifndef ZSTD_CLEVEL_DEFAULT +# define ZSTD_CLEVEL_DEFAULT 3 +#endif + +/* ************************************* + * Constants + ***************************************/ + +/* All magic numbers are supposed read/written to/from files/memory using little-endian convention */ +#define ZSTD_MAGICNUMBER 0xFD2FB528 /* valid since v0.8.0 */ +#define ZSTD_MAGIC_DICTIONARY 0xEC30A437 /* valid since v0.7.0 */ +#define ZSTD_MAGIC_SKIPPABLE_START 0x184D2A50 /* all 16 values, from 0x184D2A50 to 0x184D2A5F, signal the beginning of a skippable frame */ +#define ZSTD_MAGIC_SKIPPABLE_MASK 0xFFFFFFF0 + +#define ZSTD_BLOCKSIZELOG_MAX 17 +#define ZSTD_BLOCKSIZE_MAX (1<= `ZSTD_compressBound(srcSize)`. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + int compressionLevel); + +/*! ZSTD_decompress() : + * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. + * `dstCapacity` is an upper bound of originalSize to regenerate. + * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. + * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), + * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ +ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + +/*! ZSTD_getFrameContentSize() : requires v1.3.0+ + * `src` should point to the start of a ZSTD encoded frame. + * `srcSize` must be at least as large as the frame header. + * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. + * @return : - decompressed size of `src` frame content, if known + * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) + * note 1 : a 0 return value means the frame is valid but "empty". + * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. + * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. + * In which case, it's necessary to use streaming mode to decompress data. + * Optionally, application can rely on some implicit limit, + * as ZSTD_decompress() only needs an upper bound of decompressed size. + * (For example, data could be necessarily cut into blocks <= 16 KB). 
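+ *            e.g., an illustrative guard (both handlers are hypothetical
+ *            caller fallbacks) :
+ *              unsigned long long const rSize = ZSTD_getFrameContentSize(src, srcSize);
+ *              if (rSize == ZSTD_CONTENTSIZE_ERROR)   handleBadFrame();
+ *              if (rSize == ZSTD_CONTENTSIZE_UNKNOWN) decode_streaming(src, srcSize);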
+ * note 3 : decompressed size is always present when compression is completed using single-pass functions, + * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). + * note 4 : decompressed size can be very large (64-bits value), + * potentially larger than what local system can handle as a single memory segment. + * In which case, it's necessary to use streaming mode to decompress data. + * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. + * Always ensure return value fits within application's authorized limits. + * Each application can set its own limits. + * note 6 : This function replaces ZSTD_getDecompressedSize() */ +#define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) +#define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) +ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +/*! ZSTD_getDecompressedSize() : + * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + +/*! ZSTD_findFrameCompressedSize() : + * `src` should point to the start of a ZSTD frame or skippable frame. + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, + * or an error code if input is invalid */ +ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +/*====== Helper functions ======*/ +#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed */ +ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ + + +/*************************************** +* Explicit context +***************************************/ +/*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, + * and re-use it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. + * Note 2 : In multi-threaded environments, + * use one different context per thread for parallel execution. + */ +typedef struct ZSTD_CCtx_s ZSTD_CCtx; +ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); + +/*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. 
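+ * Reuse sketch (illustrative; chunk arrays are caller data) :
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx();
+ *     for (i = 0; i < nbChunks; i++)
+ *         ZSTD_compressCCtx(cctx, dst[i], dstCap[i], chunk[i], chunkSize[i], 3);
+ *     ZSTD_freeCCtx(cctx);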
+
+/*! ZSTD_compressCCtx() :
+ *  Same as ZSTD_compress(), using an explicit ZSTD_CCtx.
+ *  Important : in order to behave similarly to `ZSTD_compress()`,
+ *  this function compresses at the requested compression level,
+ *  __ignoring any other parameter__ .
+ *  If any advanced parameter was set using the advanced API,
+ *  they will all be reset. Only `compressionLevel` remains.
+ */
+ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx,
+                                     void* dst, size_t dstCapacity,
+                               const void* src, size_t srcSize,
+                                     int compressionLevel);
+
+/*= Decompression context
+ *  When decompressing many times,
+ *  it is recommended to allocate a context only once,
+ *  and re-use it for each successive decompression operation.
+ *  This will make the workload friendlier to the system's memory.
+ *  Use one context per thread for parallel execution. */
+typedef struct ZSTD_DCtx_s ZSTD_DCtx;
+ZSTDLIB_API ZSTD_DCtx* ZSTD_createDCtx(void);
+ZSTDLIB_API size_t     ZSTD_freeDCtx(ZSTD_DCtx* dctx);
+
+/*! ZSTD_decompressDCtx() :
+ *  Same as ZSTD_decompress(),
+ *  requires an allocated ZSTD_DCtx.
+ *  Compatible with sticky parameters.
+ */
+ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx,
+                                       void* dst, size_t dstCapacity,
+                                 const void* src, size_t srcSize);
+
+
+/***************************************
+*  Advanced compression API
+***************************************/
+
+/* API design :
+ *   Parameters are pushed one by one into an existing context,
+ *   using ZSTD_CCtx_set*() functions.
+ *   Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame.
+ *   "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` !
+ *   __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ .
+ *
+ *   It's possible to reset all parameters to "default" using ZSTD_CCtx_reset().
+ *
+ *   This API supersedes all other "advanced" API entry points in the experimental section.
+ *   In the future, we expect to remove API entry points from the experimental section
+ *   which are redundant with this API.
+ */
+
+
+/* Compression strategies, listed from fastest to strongest */
+typedef enum { ZSTD_fast=1,
+               ZSTD_dfast=2,
+               ZSTD_greedy=3,
+               ZSTD_lazy=4,
+               ZSTD_lazy2=5,
+               ZSTD_btlazy2=6,
+               ZSTD_btopt=7,
+               ZSTD_btultra=8,
+               ZSTD_btultra2=9
+               /* note : new strategies _might_ be added in the future.
+                         Only the order (from fast to strong) is guaranteed */
+} ZSTD_strategy;
+
+
+typedef enum {
+
+    /* compression parameters
+     * Note: When compressing with a ZSTD_CDict these parameters are superseded
+     * by the parameters used to construct the ZSTD_CDict.
+     * See ZSTD_CCtx_refCDict() for more info (superseded-by-cdict). */
+    ZSTD_c_compressionLevel=100, /* Set compression parameters according to pre-defined cLevel table.
+                              * Note that exact compression parameters are dynamically determined,
+                              * depending on both compression level and srcSize (when known).
+                              * Default level is ZSTD_CLEVEL_DEFAULT==3.
+                              * Special: value 0 means default, which is controlled by ZSTD_CLEVEL_DEFAULT.
+                              * Note 1 : it's possible to pass a negative compression level.
+                              * Note 2 : setting a level does not automatically set all other compression parameters
+                              *   to default. Setting this will however eventually dynamically impact the compression
+                              *   parameters which have not been manually set. The manually set
+                              *   ones will 'stick'. */
+    /* Advanced compression parameters :
+     * It's possible to pin down compression parameters to some specific values.
+     * In which case, these values are no longer dynamically selected by the compressor */
+    ZSTD_c_windowLog=101,    /* Maximum allowed back-reference distance, expressed as power of 2.
+                              * This will set a memory budget for streaming decompression,
+                              * with larger values requiring more memory
+                              * and typically compressing more.
+                              * Must be clamped between ZSTD_WINDOWLOG_MIN and ZSTD_WINDOWLOG_MAX.
+                              * Special: value 0 means "use default windowLog".
+                              * Note: Using a windowLog greater than ZSTD_WINDOWLOG_LIMIT_DEFAULT
+                              *       requires explicitly allowing such size at streaming decompression stage. */
+    ZSTD_c_hashLog=102,      /* Size of the initial probe table, as a power of 2.
+                              * Resulting memory usage is (1 << (hashLog+2)).
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX.
+                              * Larger tables improve compression ratio of strategies <= dFast,
+                              * and improve speed of strategies > dFast.
+                              * Special: value 0 means "use default hashLog". */
+    ZSTD_c_chainLog=103,     /* Size of the multi-probe search table, as a power of 2.
+                              * Resulting memory usage is (1 << (chainLog+2)).
+                              * Must be clamped between ZSTD_CHAINLOG_MIN and ZSTD_CHAINLOG_MAX.
+                              * Larger tables result in better and slower compression.
+                              * This parameter is useless for "fast" strategy.
+                              * It's still useful when using "dfast" strategy,
+                              * in which case it defines a secondary probe table.
+                              * Special: value 0 means "use default chainLog". */
+    ZSTD_c_searchLog=104,    /* Number of search attempts, as a power of 2.
+                              * More attempts result in better and slower compression.
+                              * This parameter is useless for "fast" and "dFast" strategies.
+                              * Special: value 0 means "use default searchLog". */
+    ZSTD_c_minMatch=105,     /* Minimum size of searched matches.
+                              * Note that Zstandard can still find matches of smaller size,
+                              * it just tweaks its search algorithm to look for this size and larger.
+                              * Larger values increase compression and decompression speed, but decrease ratio.
+                              * Must be clamped between ZSTD_MINMATCH_MIN and ZSTD_MINMATCH_MAX.
+                              * Note that currently, for all strategies < btopt, the effective minimum is 4,
+                              * and for all strategies > fast, the effective maximum is 6.
+                              * Special: value 0 means "use default minMatchLength". */
+    ZSTD_c_targetLength=106, /* Impact of this field depends on strategy.
+                              * For strategies btopt, btultra & btultra2:
+                              *     Length of Match considered "good enough" to stop search.
+                              *     Larger values make compression stronger, and slower.
+                              * For strategy fast:
+                              *     Distance between match sampling.
+                              *     Larger values make compression faster, and weaker.
+                              * Special: value 0 means "use default targetLength". */
+    ZSTD_c_strategy=107,     /* See ZSTD_strategy enum definition.
+                              * The higher the value of selected strategy, the more complex it is,
+                              * resulting in stronger and slower compression.
+                              * Special: value 0 means "use default strategy". */
+
+    /* LDM mode parameters */
+    ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching.
+                              * This parameter is designed to improve compression ratio
+                              * for large inputs, by finding large matches at long distance.
+                              * It increases memory usage and window size.
+                              * Note: enabling this parameter increases default ZSTD_c_windowLog to 128 MB
+                              * except when expressly set to a different value. */
+    ZSTD_c_ldmHashLog=161,   /* Size of the table for long distance matching, as a power of 2.
+                              * Larger values increase memory usage and compression ratio,
+                              * but decrease compression speed.
+                              * Must be clamped between ZSTD_HASHLOG_MIN and ZSTD_HASHLOG_MAX
+                              * (default: windowLog - 7).
+                              * Special: value 0 means "automatically determine hashlog". */
+    ZSTD_c_ldmMinMatch=162,  /* Minimum match size for long distance matcher.
+                              * Larger or too-small values usually decrease compression ratio.
+                              * Must be clamped between ZSTD_LDM_MINMATCH_MIN and ZSTD_LDM_MINMATCH_MAX.
+                              * Special: value 0 means "use default value" (default: 64). */
+    ZSTD_c_ldmBucketSizeLog=163, /* Log size of each bucket in the LDM hash table for collision resolution.
+                              * Larger values improve collision resolution but decrease compression speed.
+                              * The maximum value is ZSTD_LDM_BUCKETSIZELOG_MAX.
+                              * Special: value 0 means "use default value" (default: 3). */
+    ZSTD_c_ldmHashRateLog=164, /* Frequency of inserting/looking up entries into the LDM hash table.
+                              * Must be clamped between 0 and (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN).
+                              * Default is MAX(0, (windowLog - ldmHashLog)), optimizing hash table usage.
+                              * Larger values improve compression speed.
+                              * Deviating far from the default value will likely result in a compression ratio decrease.
+                              * Special: value 0 means "automatically determine hashRateLog". */
+
+    /* frame parameters */
+    ZSTD_c_contentSizeFlag=200, /* Content size will be written into frame header _whenever known_ (default:1)
+                              * Content size must be known at the beginning of compression.
+                              * This is automatically the case when using ZSTD_compress2().
+                              * For streaming scenarios, content size must be provided with ZSTD_CCtx_setPledgedSrcSize() */
+    ZSTD_c_checksumFlag=201, /* A 32-bit checksum of content is written at end of frame (default:0) */
+    ZSTD_c_dictIDFlag=202,   /* When applicable, dictionary's ID is written into frame header (default:1) */
+
+    /* multi-threading parameters */
+    /* These parameters are only useful if multi-threading is enabled (compiled with build macro ZSTD_MULTITHREAD).
+     * They return an error otherwise. */
+    ZSTD_c_nbWorkers=400,    /* Select how many threads will be spawned to compress in parallel.
+                              * When nbWorkers >= 1, triggers asynchronous mode when used with ZSTD_compressStream*() :
+                              * ZSTD_compressStream*() consumes input and flushes output if possible, but immediately gives back control to caller,
+                              * while compression work is performed in parallel, within worker threads.
+                              * (note : a strong exception to this rule is when first invocation of ZSTD_compressStream2() sets ZSTD_e_end :
+                              *  in which case, ZSTD_compressStream2() delegates to ZSTD_compress2(), which is always a blocking call).
+                              * More workers improve speed, but also increase memory usage.
+                              * Default value is `0`, aka "single-threaded mode" : no worker is spawned,
+                              * compression is performed inside the caller's thread, and all invocations are blocking */
+    ZSTD_c_jobSize=401,      /* Size of a compression job. This value is enforced only when nbWorkers >= 1.
+                              * Each compression job is completed in parallel, so this value can indirectly impact the number of active threads.
+                              * 0 means default, which is dynamically determined based on compression parameters.
+                              * Job size must be a minimum of overlap size, or 1 MB, whichever is largest.
+                              * The minimum size is automatically and transparently enforced. */
+    ZSTD_c_overlapLog=402,   /* Control the overlap size, as a fraction of window size.
+                              * The overlap size is an amount of data reloaded from previous job at the beginning of a new job.
+                              * It helps preserve compression ratio, while each job is compressed in parallel.
+                              * This value is enforced only when nbWorkers >= 1.
+                              * Larger values increase compression ratio, but decrease speed.
+                              * Possible values range from 0 to 9 :
+                              * - 0 means "default" : value will be determined by the library, depending on strategy
+                              * - 1 means "no overlap"
+                              * - 9 means "full overlap", using a full window size.
+                              * Each intermediate rank increases/decreases load size by a factor 2 :
+                              * 9: full window;  8: w/2;  7: w/4;  6: w/8;  5: w/16;  4: w/32;  3: w/64;  2: w/128;  1: no overlap;  0: default
+                              * default value varies between 6 and 9, depending on strategy */
+
+    /* note : additional experimental parameters are also available
+     * within the experimental section of the API.
+     * At the time of this writing, they include :
+     * ZSTD_c_rsyncable
+     * ZSTD_c_format
+     * ZSTD_c_forceMaxWindow
+     * ZSTD_c_forceAttachDict
+     * ZSTD_c_literalCompressionMode
+     * ZSTD_c_targetCBlockSize
+     * ZSTD_c_srcSizeHint
+     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+     * note : never ever use experimentalParam? names directly;
+     *        also, the enum values themselves are unstable and can still change.
+     */
+     ZSTD_c_experimentalParam1=500,
+     ZSTD_c_experimentalParam2=10,
+     ZSTD_c_experimentalParam3=1000,
+     ZSTD_c_experimentalParam4=1001,
+     ZSTD_c_experimentalParam5=1002,
+     ZSTD_c_experimentalParam6=1003,
+     ZSTD_c_experimentalParam7=1004
+} ZSTD_cParameter;
+
+typedef struct {
+    size_t error;
+    int lowerBound;
+    int upperBound;
+} ZSTD_bounds;
+
+/*! ZSTD_cParam_getBounds() :
+ *  All parameters must belong to an interval with lower and upper bounds,
+ *  otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ *         - an error status field, which must be tested using ZSTD_isError()
+ *         - lower and upper bounds, both inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter cParam);
+
+/*! ZSTD_CCtx_setParameter() :
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  All parameters have valid bounds. Bounds can be queried using ZSTD_cParam_getBounds().
+ *  Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ *  Setting a parameter is generally only possible during frame initialization (before starting compression).
+ *  Exception : when using multi-threading mode (nbWorkers >= 1),
+ *              the following parameters can be updated _during_ compression (within same frame):
+ *              => compressionLevel, hashLog, chainLog, searchLog, minMatch, targetLength and strategy.
+ *              new parameters will be active for next job only (after a flush()).
+ * @return : an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value);
+
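+/* Illustrative sketch (typical sticky-parameter setup; `cctx` is a hypothetical
+ * pre-allocated compression context) :
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableLongDistanceMatching, 1);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ *     (each call returns an error code, to be tested with ZSTD_isError()) */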
+
+/*! ZSTD_CCtx_setPledgedSrcSize() :
+ *  Total input data size to be compressed as a single frame.
+ *  Value will be written in frame header, unless explicitly forbidden using ZSTD_c_contentSizeFlag.
+ *  This value will also be checked at end of frame, and trigger an error if not respected.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : pledgedSrcSize==0 actually means zero, aka an empty frame.
+ *           In order to mean "unknown content size", pass constant ZSTD_CONTENTSIZE_UNKNOWN.
+ *           ZSTD_CONTENTSIZE_UNKNOWN is default value for any new frame.
+ *  Note 2 : pledgedSrcSize is only valid once, for the next frame.
+ *           It's discarded at the end of the frame, and replaced by ZSTD_CONTENTSIZE_UNKNOWN.
+ *  Note 3 : Whenever all input data is provided and consumed in a single round,
+ *           for example with ZSTD_compress2(),
+ *           or by invoking immediately ZSTD_compressStream2(,,,ZSTD_e_end),
+ *           this value is automatically overridden by srcSize instead.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize);
+
+typedef enum {
+    ZSTD_reset_session_only = 1,
+    ZSTD_reset_parameters = 2,
+    ZSTD_reset_session_and_parameters = 3
+} ZSTD_ResetDirective;
+
+/*! ZSTD_CCtx_reset() :
+ *  There are 2 different things that can be reset, independently or jointly :
+ *  - The session : will stop compressing current frame, and make CCtx ready to start a new one.
+ *                  Useful after an error, or to interrupt any ongoing compression.
+ *                  Any internal data not yet flushed is cancelled.
+ *                  Compression parameters and dictionary remain unchanged.
+ *                  They will be used to compress next frame.
+ *                  Resetting session never fails.
+ *  - The parameters : changes all parameters back to "default".
+ *                  This removes any reference to any dictionary too.
+ *                  Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing),
+ *                  otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError())
+ *  - Both : similar to resetting the session, followed by resetting parameters.
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset);
+
+/*! ZSTD_compress2() :
+ *  Behaves the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API.
+ *  ZSTD_compress2() always starts a new frame.
+ *  Should cctx hold data from a previously unfinished frame, everything about it is forgotten.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - The function is always blocking, and returns when compression is completed.
+ *  Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`.
+ * @return : compressed size written into `dst` (<= `dstCapacity`),
+ *           or an error code if it fails (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_compress2( ZSTD_CCtx* cctx,
+                                   void* dst, size_t dstCapacity,
+                             const void* src, size_t srcSize);
+
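+/* Illustrative sketch (advanced one-shot flow with sticky parameters;
+ * all buffers are hypothetical caller-provided data) :
+ *     ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
+ *     size_t const cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
+ *     (test cSize with ZSTD_isError()) */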
+
+/***************************************
+*  Advanced decompression API
+***************************************/
+
+/* The advanced API pushes parameters one by one into an existing DCtx context.
+ * Parameters are sticky, and remain valid for all following frames
+ * using the same DCtx context.
+ * It's possible to reset parameters to default values using ZSTD_DCtx_reset().
+ * Note : This API is compatible with existing ZSTD_decompressDCtx() and ZSTD_decompressStream().
+ *        Therefore, no new decompression function is necessary.
+ */
+
+typedef enum {
+
+    ZSTD_d_windowLogMax=100, /* Select a size limit (in power of 2) beyond which
+                              * the streaming API will refuse to allocate memory buffer
+                              * in order to protect the host from unreasonable memory requirements.
+                              * This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+                              * By default, a decompression context accepts window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT).
+                              * Special: value 0 means "use default maximum windowLog". */
+
+    /* note : additional experimental parameters are also available
+     * within the experimental section of the API.
+     * At the time of this writing, they include :
+     * ZSTD_d_format
+     * ZSTD_d_stableOutBuffer
+     * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them.
+     * note : never ever use experimentalParam? names directly
+     */
+     ZSTD_d_experimentalParam1=1000,
+     ZSTD_d_experimentalParam2=1001
+
+} ZSTD_dParameter;
+
+/*! ZSTD_dParam_getBounds() :
+ *  All parameters must belong to an interval with lower and upper bounds,
+ *  otherwise they will either trigger an error or be automatically clamped.
+ * @return : a structure, ZSTD_bounds, which contains
+ *         - an error status field, which must be tested using ZSTD_isError()
+ *         - both lower and upper bounds, inclusive
+ */
+ZSTDLIB_API ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam);
+
+/*! ZSTD_DCtx_setParameter() :
+ *  Set one decompression parameter, selected by enum ZSTD_dParameter.
+ *  All parameters have valid bounds. Bounds can be queried using ZSTD_dParam_getBounds().
+ *  Providing a value beyond bound will either clamp it, or trigger an error (depending on parameter).
+ *  Setting a parameter is only possible during frame initialization (before starting decompression).
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int value);
+
+/*! ZSTD_DCtx_reset() :
+ *  Return a DCtx to clean state.
+ *  Session and parameters can be reset jointly or separately.
+ *  Parameters can only be reset when no active frame is being decompressed.
+ * @return : 0, or an error code, which can be tested with ZSTD_isError()
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset);
+
+
+/****************************
+*  Streaming
+****************************/
+
+typedef struct ZSTD_inBuffer_s {
+  const void* src;    /**< start of input buffer */
+  size_t size;        /**< size of input buffer */
+  size_t pos;         /**< position where reading stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_inBuffer;
+
+typedef struct ZSTD_outBuffer_s {
+  void*  dst;         /**< start of output buffer */
+  size_t size;        /**< size of output buffer */
+  size_t pos;         /**< position where writing stopped. Will be updated. Necessarily 0 <= pos <= size */
+} ZSTD_outBuffer;
+
+
+
+/*-***********************************************************************
+* Streaming compression - HowTo
+*
+* A ZSTD_CStream object is required to track streaming operation.
+* Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources.
+* ZSTD_CStream objects can be reused multiple times on consecutive compression operations.
+* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory.
+*
+* For parallel execution, use one separate ZSTD_CStream per thread.
+*
+* note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing.
+*
+* Parameters are sticky : when starting a new compression on the same context,
+* it will re-use the same sticky parameters as previous compression session.
+* When in doubt, it's recommended to fully initialize the context before usage.
+* Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(),
+* ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to
+* set more specific parameters, the pledged source size, or load a dictionary.
+*
+* Use ZSTD_compressStream2() with ZSTD_e_continue as many times as necessary to
+* consume input stream. The function will automatically update both `pos`
+* fields within `input` and `output`.
+* Note that the function may not consume the entire input, for example, because
+* the output buffer is already full, in which case `input.pos < input.size`.
+* The caller must check if input has been entirely consumed.
+* If not, the caller must make some room to receive more compressed data,
+* and then present again remaining input data.
+* note: ZSTD_e_continue is guaranteed to make some forward progress when called,
+* but doesn't guarantee maximal forward progress. This is especially relevant
+* when compressing with multiple threads. The call won't block if it can
+* consume some input, but if it can't it will wait for some, but not all,
+* output to be flushed.
+* @return : provides a minimum amount of data remaining to be flushed from internal buffers
+*           or an error code, which can be tested using ZSTD_isError().
+*
+* At any moment, it's possible to flush whatever data might remain stuck within internal buffer,
+* using ZSTD_compressStream2() with ZSTD_e_flush. `output->pos` will be updated.
+* Note that, if `output->size` is too small, a single invocation with ZSTD_e_flush might not be enough (return code > 0).
+* In which case, make some room to receive more compressed data, and call again ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_flush until it returns 0, at which point you can change the
+* operation.
+* note: ZSTD_e_flush will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if internal buffers are entirely flushed,
+*           >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*           or an error code, which can be tested using ZSTD_isError().
+*
+* Calling ZSTD_compressStream2() with ZSTD_e_end instructs to finish a frame.
+* It will perform a flush and write frame epilogue.
+* The epilogue is required for decoders to consider a frame completed.
+* The flush operation is the same, and follows same rules as calling ZSTD_compressStream2() with ZSTD_e_flush.
+* You must continue calling ZSTD_compressStream2() with ZSTD_e_end until it returns 0, at which point you are free to
+* start a new frame.
+* note: ZSTD_e_end will flush as much output as possible, meaning when compressing with multiple threads, it will
+* block until the flush is complete or the output buffer is full.
+* @return : 0 if frame fully completed and fully flushed,
+*           >0 if some data still present within internal buffer (the value is minimal estimation of remaining size),
+*           or an error code, which can be tested using ZSTD_isError().
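+*
+* Illustrative loop (a sketch only; `readFromSrc()`/`writeToDst()` and the buffers
+* are hypothetical caller-provided helpers; error checks on `remaining` omitted) :
+*     ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
+*     ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, ZSTD_CLEVEL_DEFAULT);
+*     for (;;) {
+*         size_t const readSize = readFromSrc(srcBuf, srcBufSize);
+*         int const lastChunk = (readSize < srcBufSize);
+*         ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
+*         ZSTD_inBuffer input = { srcBuf, readSize, 0 };
+*         size_t remaining;
+*         do {
+*             ZSTD_outBuffer output = { dstBuf, dstBufSize, 0 };
+*             remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
+*             writeToDst(dstBuf, output.pos);
+*         } while (lastChunk ? (remaining != 0) : (input.pos != input.size));
+*         if (lastChunk) break;
+*     }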
+*
+* *******************************************************************/
+
+typedef ZSTD_CCtx ZSTD_CStream;  /**< CCtx and CStream are now effectively same object (>= v1.3.0) */
+                                 /* Continue to distinguish them for compatibility with older versions <= v1.2.0 */
+/*===== ZSTD_CStream management functions =====*/
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream(void);
+ZSTDLIB_API size_t ZSTD_freeCStream(ZSTD_CStream* zcs);
+
+/*===== Streaming compression functions =====*/
+typedef enum {
+    ZSTD_e_continue=0, /* collect more data, encoder decides when to output compressed result, for optimal compression ratio */
+    ZSTD_e_flush=1,    /* flush any data provided so far,
+                        * it creates (at least) one new block, that can be decoded immediately on reception;
+                        * frame will continue: any future data can still reference previously compressed data, improving compression.
+                        * note : multithreaded compression will block to flush as much output as possible. */
+    ZSTD_e_end=2       /* flush any remaining data _and_ close current frame.
+                        * note that frame is only closed after compressed data is fully flushed (return value == 0).
+                        * After that point, any additional data starts a new frame.
+                        * note : each frame is independent (does not reference any content from previous frame).
+                        * note : multithreaded compression will block to flush as much output as possible. */
+} ZSTD_EndDirective;
+
+/*! ZSTD_compressStream2() :
+ *  Behaves about the same as ZSTD_compressStream, with additional control on end directive.
+ *  - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*()
+ *  - Compression parameters cannot be changed once compression is started (save a list of exceptions in multi-threading mode)
+ *  - output->pos must be <= dstCapacity, input->pos must be <= srcSize
+ *  - output->pos and input->pos will be updated. They are guaranteed to remain below their respective limit.
+ *  - When nbWorkers==0 (default), function is blocking : it completes its job before returning to caller.
+ *  - When nbWorkers>=1, function is non-blocking : it just acquires a copy of input, distributes jobs to internal worker threads,
+ *    flushes whatever is available, and then immediately returns, just indicating that there is some data remaining to be flushed.
+ *    The function nonetheless guarantees forward progress : it will return only after it reads or writes at least 1+ byte.
+ *  - Exception : if the first call requests a ZSTD_e_end directive and provides enough dstCapacity, the function delegates to ZSTD_compress2() which is always blocking.
+ *  - @return provides a minimum amount of data remaining to be flushed from internal buffers
+ *            or an error code, which can be tested using ZSTD_isError().
+ *            if @return != 0, flush is not fully completed, there is still some data left within internal buffers.
+ *            This is useful for ZSTD_e_flush, since in this case more flushes are necessary to empty all buffers.
+ *            For ZSTD_e_end, @return == 0 when internal buffers are fully flushed and frame is completed.
+ *  - after a ZSTD_e_end directive, if internal buffer is not fully flushed (@return != 0),
+ *            only ZSTD_e_end or ZSTD_e_flush operations are allowed.
+ *            Before starting a new compression job, or changing compression parameters,
+ *            it is required to fully flush internal buffers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx,
+                                         ZSTD_outBuffer* output,
+                                         ZSTD_inBuffer* input,
+                                         ZSTD_EndDirective endOp);
+
+
+/* These buffer sizes are softly recommended.
+ * They are not required : ZSTD_compressStream*() happily accepts any buffer size, for both input and output.
+ * Respecting the recommended size just makes it a bit easier for ZSTD_compressStream*(),
+ * reducing the amount of memory shuffling and buffering, resulting in minor performance savings.
+ *
+ * However, note that these recommendations are from the perspective of a C caller program.
+ * If the streaming interface is invoked from some other language,
+ * especially managed ones such as Java or Go, through a foreign function interface such as jni or cgo,
+ * a major performance rule is to reduce crossing such interface to an absolute minimum.
+ * It's not rare that performance ends up being spent more on crossing the interface than on compression itself.
+ * In such cases, prefer using large buffers, as large as practical,
+ * for both input and output, to reduce the number of roundtrips.
+ */
+ZSTDLIB_API size_t ZSTD_CStreamInSize(void);    /**< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_CStreamOutSize(void);   /**< recommended size for output buffer. Guaranteed to successfully flush at least one complete compressed block. */
+
+
+/* *****************************************************************************
+ * The following is a legacy streaming API.
+ * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2().
+ * It is redundant, but remains fully supported.
+ * Advanced parameters and dictionary compression can only be used through the
+ * new API.
+ ******************************************************************************/
+
+/*!
+ * Equivalent to:
+ *
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ */
+ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel);
+/*!
+ * Alternative for ZSTD_compressStream2(zcs, output, input, ZSTD_e_continue).
+ * NOTE: The return value is different. ZSTD_compressStream() returns a hint for
+ * the next read size (if non-zero and not an error). ZSTD_compressStream2()
+ * returns the minimum number of bytes left to flush (if non-zero and not an error).
+ */
+ZSTDLIB_API size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_flush). */
+ZSTDLIB_API size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+/*! Equivalent to ZSTD_compressStream2(zcs, output, &emptyInput, ZSTD_e_end). */
+ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output);
+
+
+/*-***************************************************************************
+* Streaming decompression - HowTo
+*
+* A ZSTD_DStream object is required to track streaming operations.
+* Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources.
+* ZSTD_DStream objects can be re-used multiple times.
+*
+* Use ZSTD_initDStream() to start a new decompression operation.
+* @return : recommended first input size
+* Alternatively, use advanced API to set specific properties.
+*
+* Use ZSTD_decompressStream() repetitively to consume your input.
+* The function will update both `pos` fields.
+* If `input.pos < input.size`, some input has not been consumed.
+* It's up to the caller to present again remaining data.
+* The function tries to flush all data decoded immediately, respecting output buffer size.
+* If `output.pos < output.size`, decoder has flushed everything it could.
+* But if `output.pos == output.size`, there might be some data left within internal buffers.
+* In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer.
+* Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX.
+* @return : 0 when a frame is completely decoded and fully flushed,
+*        or an error code, which can be tested using ZSTD_isError(),
+*        or any other value > 0, which means there is still some decoding or flushing to do to complete current frame :
+*                                the return value is a suggested next input size (just a hint for better latency)
+*                                that will never request more than the remaining frame size.
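+*
+* Illustrative loop (a sketch only; `readFromSrc()` and the buffers are hypothetical
+* caller-provided helpers) :
+*     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+*     size_t readSize;
+*     while ((readSize = readFromSrc(srcBuf, srcBufSize)) != 0) {
+*         ZSTD_inBuffer input = { srcBuf, readSize, 0 };
+*         while (input.pos < input.size) {
+*             ZSTD_outBuffer output = { dstBuf, dstBufSize, 0 };
+*             size_t const ret = ZSTD_decompressStream(dctx, &output, &input);
+*             (test ret with ZSTD_isError(); output.pos bytes of dstBuf are now valid)
+*         }
+*     }
+*     ZSTD_freeDCtx(dctx);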
+* *******************************************************************************/
+
+typedef ZSTD_DCtx ZSTD_DStream;  /**< DCtx and DStream are now effectively same object (>= v1.3.0) */
+                                 /* For compatibility with versions <= v1.2.0, prefer differentiating them. */
+/*===== ZSTD_DStream management functions =====*/
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream(void);
+ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds);
+
+/*===== Streaming decompression functions =====*/
+
+/* This function is redundant with the advanced API and equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, NULL);
+ */
+ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds);
+
+ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input);
+
+ZSTDLIB_API size_t ZSTD_DStreamInSize(void);    /*!< recommended size for input buffer */
+ZSTDLIB_API size_t ZSTD_DStreamOutSize(void);   /*!< recommended size for output buffer. Guaranteed to successfully flush at least one complete block in all circumstances. */
+
+
+/**************************
+*  Simple dictionary API
+***************************/
+/*! ZSTD_compress_usingDict() :
+ *  Compression at an explicit compression level using a Dictionary.
+ *  A dictionary can be any arbitrary data segment (also called a prefix),
+ *  or a buffer with specified information (see dictBuilder/zdict.h).
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note 2 : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_compress_usingDict(ZSTD_CCtx* ctx,
+                                           void* dst, size_t dstCapacity,
+                                     const void* src, size_t srcSize,
+                                     const void* dict, size_t dictSize,
+                                           int compressionLevel);
+
+/*! ZSTD_decompress_usingDict() :
+ *  Decompression using a known Dictionary.
+ *  Dictionary must be identical to the one used during compression.
+ *  Note : This function loads the dictionary, resulting in significant startup delay.
+ *         It's intended for a dictionary used only once.
+ *  Note : When `dict == NULL || dictSize < 8` no dictionary is used. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDict(ZSTD_DCtx* dctx,
+                                             void* dst, size_t dstCapacity,
+                                       const void* src, size_t srcSize,
+                                       const void* dict, size_t dictSize);
+
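+/* Illustrative sketch (round-trip with a raw dictionary; `dictBuf`/`dictSize` and the
+ * other buffers are hypothetical, and the same dictionary must be used on both sides) :
+ *     size_t const cSize = ZSTD_compress_usingDict(cctx, dst, dstCapacity,
+ *                                src, srcSize, dictBuf, dictSize, ZSTD_CLEVEL_DEFAULT);
+ *     size_t const dSize = ZSTD_decompress_usingDict(dctx, out, outCapacity,
+ *                                dst, cSize, dictBuf, dictSize);
+ *     (both results should be tested with ZSTD_isError()) */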
+
+
+/***********************************
+ *  Bulk processing dictionary API
+ **********************************/
+typedef struct ZSTD_CDict_s ZSTD_CDict;
+
+/*! ZSTD_createCDict() :
+ *  When compressing multiple messages or blocks using the same dictionary,
+ *  it's recommended to digest the dictionary only once, since it's a costly operation.
+ *  ZSTD_createCDict() will create a state from digesting a dictionary.
+ *  The resulting state can be used for future compression operations with very limited startup cost.
+ *  ZSTD_CDict can be created once and shared by multiple threads concurrently, since its usage is read-only.
+ * @dictBuffer can be released after ZSTD_CDict creation, because its content is copied within CDict.
+ *  Note 1 : Consider experimental function `ZSTD_createCDict_byReference()` if you prefer to not duplicate @dictBuffer content.
+ *  Note 2 : A ZSTD_CDict can be created from an empty @dictBuffer,
+ *      in which case the only thing that it transports is the @compressionLevel.
+ *      This can be useful in a pipeline featuring ZSTD_compress_usingCDict() exclusively,
+ *      expecting a ZSTD_CDict parameter with any data, including those without a known dictionary. */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict(const void* dictBuffer, size_t dictSize,
+                                         int compressionLevel);
+
+/*! ZSTD_freeCDict() :
+ *  Function frees memory allocated by ZSTD_createCDict(). */
+ZSTDLIB_API size_t ZSTD_freeCDict(ZSTD_CDict* CDict);
+
+/*! ZSTD_compress_usingCDict() :
+ *  Compression using a digested Dictionary.
+ *  Recommended when same dictionary is used multiple times.
+ *  Note : compression level is _decided at dictionary creation time_,
+ *         and frame parameters are hardcoded (dictID=yes, contentSize=yes, checksum=no) */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict(ZSTD_CCtx* cctx,
+                                            void* dst, size_t dstCapacity,
+                                      const void* src, size_t srcSize,
+                                      const ZSTD_CDict* cdict);
+
+
+typedef struct ZSTD_DDict_s ZSTD_DDict;
+
+/*! ZSTD_createDDict() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  dictBuffer can be released after DDict creation, as its content is copied inside DDict. */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_freeDDict() :
+ *  Function frees memory allocated with ZSTD_createDDict() */
+ZSTDLIB_API size_t ZSTD_freeDDict(ZSTD_DDict* ddict);
+
+/*! ZSTD_decompress_usingDDict() :
+ *  Decompression using a digested Dictionary.
+ *  Recommended when same dictionary is used multiple times. */
+ZSTDLIB_API size_t ZSTD_decompress_usingDDict(ZSTD_DCtx* dctx,
+                                              void* dst, size_t dstCapacity,
+                                        const void* src, size_t srcSize,
+                                        const ZSTD_DDict* ddict);
+
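+/* Illustrative sketch (one digested dictionary re-used across many frames;
+ * all names are hypothetical) :
+ *     ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, ZSTD_CLEVEL_DEFAULT);
+ *     for (i = 0; i < nbFrames; i++)
+ *         cSize[i] = ZSTD_compress_usingCDict(cctx, dst[i], dstCap[i], src[i], srcSize[i], cdict);
+ *     ZSTD_freeCDict(cdict);
+ *     (each cSize[i] should be tested with ZSTD_isError()) */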
+
+
+/********************************
+ *  Dictionary helper functions
+ *******************************/
+
+/*! ZSTD_getDictID_fromDict() :
+ *  Provides the dictID stored within dictionary.
+ *  if @return == 0, the dictionary is not conformant with the Zstandard specification.
+ *  It can still be loaded, but as a content-only dictionary. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize);
+
+/*! ZSTD_getDictID_fromDDict() :
+ *  Provides the dictID of the dictionary loaded into `ddict`.
+ *  If @return == 0, the dictionary is not conformant to the Zstandard specification, or empty.
+ *  Non-conformant dictionaries can still be loaded, but as content-only dictionaries. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict);
+
+/*! ZSTD_getDictID_fromFrame() :
+ *  Provides the dictID required to decompress the frame stored within `src`.
+ *  If @return == 0, the dictID could not be decoded.
+ *  This could be for one of the following reasons :
+ *  - The frame does not require a dictionary to be decoded (most common case).
+ *  - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is hidden information.
+ *    Note : this use case also happens when using a non-conformant dictionary.
+ *  - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`).
+ *  - This is not a Zstandard frame.
+ *  When identifying the exact failure cause, it's possible to use ZSTD_getFrameHeader(), which will provide a more precise error code. */
+ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize);
+
+
+/*******************************************************************************
+ * Advanced dictionary and prefix API
+ *
+ * This API allows dictionaries to be used with ZSTD_compress2(),
+ * ZSTD_compressStream2(), and ZSTD_decompress(). Dictionaries are sticky, and
+ * only cleared when the context is reset with ZSTD_reset_parameters or
+ * ZSTD_reset_session_and_parameters. Prefixes are single-use.
+ ******************************************************************************/
+
+
+/*! ZSTD_CCtx_loadDictionary() :
+ *  Create an internal CDict from `dict` buffer.
+ *  Decompression will have to use the same dictionary.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary,
+ *           meaning "return to no-dictionary mode".
+ *  Note 1 : Dictionary is sticky, it will be used for all future compressed frames.
+ *           To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters).
+ *  Note 2 : Loading a dictionary involves building tables.
+ *           It's a CPU-consuming operation, with non-negligible impact on latency.
+ *           Tables are dependent on compression parameters, and for this reason,
+ *           compression parameters can no longer be changed after loading a dictionary.
+ *  Note 3 : `dict` content will be copied internally.
+ *           Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead.
+ *           In such a case, dictionary buffer must outlive its users.
+ *  Note 4 : Use ZSTD_CCtx_loadDictionary_advanced()
+ *           to precisely select how dictionary content must be interpreted. */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_refCDict() :
+ *  Reference a prepared dictionary, to be used for all next compressed frames.
+ *  Note that compression parameters are enforced from within CDict,
+ *  and supersede any compression parameter previously set within CCtx.
+ *  The parameters ignored are labeled as "superseded-by-cdict" in the ZSTD_cParameter enum docs.
+ *  The ignored parameters will be used again if the CCtx is returned to no-dictionary mode.
+ *  The dictionary will remain valid for future compressed frames using same CCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Referencing a NULL CDict means "return to no-dictionary mode".
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Note 2 : CDict is just referenced, its lifetime must outlive its usage within CCtx. */
+ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict);
+
+/*! ZSTD_CCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) for next compressed frame.
+ *  A prefix is **only used once**. Tables are discarded at end of frame (ZSTD_e_end).
+ *  Decompression will need same prefix to properly regenerate data.
+ *  Compressing with a prefix is similar in outcome to performing a diff and compressing it,
+ *  but performs much faster, especially during decompression (compression speed is tunable with compression level).
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary.
+ *  Note 1 : Prefix buffer is referenced. It **must** outlive compression.
+ *           Its content must remain unmodified during compression.
+ *  Note 2 : If the intention is to diff some large src data blob with some prior version of itself,
+ *           ensure that the window size is large enough to contain the entire source.
+ *           See ZSTD_c_windowLog.
+ *  Note 3 : Referencing a prefix involves building tables, which are dependent on compression parameters.
+ *           It's a CPU-consuming operation, with non-negligible impact on latency.
+ *           If there is a need to use the same prefix multiple times, consider loadDictionary instead.
+ *  Note 4 : By default, the prefix is interpreted as raw content (ZSTD_dct_rawContent).
+ *           Use experimental ZSTD_CCtx_refPrefix_advanced() to alter dictionary interpretation. */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx,
+                                       const void* prefix, size_t prefixSize);
+
+/*! ZSTD_DCtx_loadDictionary() :
+ *  Create an internal DDict from dict buffer,
+ *  to be used to decompress next frames.
+ *  The dictionary remains valid for all future frames, until explicitly invalidated.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary,
+ *            meaning "return to no-dictionary mode".
+ *  Note 1 : Loading a dictionary involves building tables,
+ *           which has a non-negligible impact on CPU usage and latency.
+ *           It's recommended to "load once, use many times", to amortize the cost.
+ *  Note 2 : `dict` content will be copied internally, so `dict` can be released after loading.
+ *           Use ZSTD_DCtx_loadDictionary_byReference() to reference dictionary content instead.
+ *  Note 3 : Use ZSTD_DCtx_loadDictionary_advanced() to take control of
+ *           how dictionary content is loaded and interpreted.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_refDDict() :
+ *  Reference a prepared dictionary, to be used to decompress next frames.
+ *  The dictionary remains active for decompression of future frames using same DCtx.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Currently, only one dictionary can be managed.
+ *           Referencing a new dictionary effectively "discards" any previous one.
+ *  Special: referencing a NULL DDict means "return to no-dictionary mode".
+ *  Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+/*! ZSTD_DCtx_refPrefix() :
+ *  Reference a prefix (single-usage dictionary) to decompress next frame.
+ *  This is the reverse operation of ZSTD_CCtx_refPrefix(),
+ *  and must use the same prefix as the one used during compression.
+ *  Prefix is **only used once**. Reference is discarded at end of frame.
+ *  End of frame is reached when ZSTD_decompressStream() returns 0.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ *  Note 1 : Adding any prefix (including NULL) invalidates any previously set prefix or dictionary.
+ *  Note 2 : Prefix buffer is referenced. It **must** outlive decompression.
+ *           Prefix buffer must remain unmodified up to the end of frame,
+ *           reached when ZSTD_decompressStream() returns 0.
+ *  Note 3 : By default, the prefix is treated as raw content (ZSTD_dct_rawContent).
+ *           Use ZSTD_CCtx_refPrefix_advanced() to alter dictMode (Experimental section)
+ *  Note 4 : Referencing a raw content prefix has almost no CPU or memory cost.
+ *           A full dictionary is more costly, as it requires building tables.
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix(ZSTD_DCtx* dctx,
+                                       const void* prefix, size_t prefixSize);
+
+/* ===   Memory management   === */
+
+/*! ZSTD_sizeof_*() :
+ *  These functions give the _current_ memory usage of selected object.
+ *  Note that object memory usage can evolve (increase or decrease) over time. */
+ZSTDLIB_API size_t ZSTD_sizeof_CCtx(const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_sizeof_DCtx(const ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs);
+ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds);
+ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict);
+ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+
+#endif  /* ZSTD_H_235446 */
+
+
+/* **************************************************************************************
+ *   ADVANCED AND EXPERIMENTAL FUNCTIONS
+ ****************************************************************************************
+ * The definitions in the following section are considered experimental.
+ * They are provided for advanced scenarios.
+ * They should never be used with a dynamic library, as prototypes may change in the future.
+ * Use them only in association with static linking.
+ * ***************************************************************************************/
+
+#if defined(ZSTD_STATIC_LINKING_ONLY) && !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY)
+#define ZSTD_H_ZSTD_STATIC_LINKING_ONLY
+
+/****************************************************************************************
+ *   experimental API (static linking only)
+ ****************************************************************************************
+ * The following symbols and constants
+ * are not planned to join "stable API" status in the near future.
+ * They can still change in future versions.
+ * Some of them are planned to remain in the static_only section indefinitely.
+ * Some of them might be removed in the future (especially when redundant with existing stable functions)
+ * ***************************************************************************************/
+
+#define ZSTD_FRAMEHEADERSIZE_PREFIX(format)   ((format) == ZSTD_f_zstd1 ? 5 : 1)   /* minimum input size required to query frame header size */
+#define ZSTD_FRAMEHEADERSIZE_MIN(format)      ((format) == ZSTD_f_zstd1 ? 6 : 2)
+#define ZSTD_FRAMEHEADERSIZE_MAX   18   /* can be useful for static allocation */
+#define ZSTD_SKIPPABLEHEADERSIZE    8
+
+/* compression parameter bounds */
+#define ZSTD_WINDOWLOG_MAX_32    30
+#define ZSTD_WINDOWLOG_MAX_64    31
+#define ZSTD_WINDOWLOG_MAX     ((int)(sizeof(size_t) == 4 ? ZSTD_WINDOWLOG_MAX_32 : ZSTD_WINDOWLOG_MAX_64))
+#define ZSTD_WINDOWLOG_MIN       10
+#define ZSTD_HASHLOG_MAX       ((ZSTD_WINDOWLOG_MAX < 30) ? ZSTD_WINDOWLOG_MAX : 30)
+#define ZSTD_HASHLOG_MIN          6
+#define ZSTD_CHAINLOG_MAX_32     29
+#define ZSTD_CHAINLOG_MAX_64     30
+#define ZSTD_CHAINLOG_MAX      ((int)(sizeof(size_t) == 4 ? ZSTD_CHAINLOG_MAX_32 : ZSTD_CHAINLOG_MAX_64))
+#define ZSTD_CHAINLOG_MIN        ZSTD_HASHLOG_MIN
+#define ZSTD_SEARCHLOG_MAX      (ZSTD_WINDOWLOG_MAX-1)
+#define ZSTD_SEARCHLOG_MIN        1
+#define ZSTD_MINMATCH_MAX         7   /* only for ZSTD_fast, other strategies are limited to 6 */
+#define ZSTD_MINMATCH_MIN         3   /* only for ZSTD_btopt+, faster strategies are limited to 4 */
+#define ZSTD_TARGETLENGTH_MAX    ZSTD_BLOCKSIZE_MAX
+#define ZSTD_TARGETLENGTH_MIN     0   /* note : comparing this constant to an unsigned results in a tautological test */
+#define ZSTD_STRATEGY_MIN        ZSTD_fast
+#define ZSTD_STRATEGY_MAX        ZSTD_btultra2
+
+
+#define ZSTD_OVERLAPLOG_MIN       0
+#define ZSTD_OVERLAPLOG_MAX       9
+
+#define ZSTD_WINDOWLOG_LIMIT_DEFAULT 27   /* by default, the streaming decoder will refuse any frame
+                                           * requiring larger than (1<<ZSTD_WINDOWLOG_LIMIT_DEFAULT) window size,
+                                           * to preserve host's memory from unreasonable requirements.
+                                           * This limit can be overridden using ZSTD_DCtx_setParameter(,ZSTD_d_windowLogMax,).
+                                           * The limit does not apply for one-pass decoders (such as ZSTD_decompress()), since no additional memory is allocated */
+
+
+/* LDM parameter bounds */
+#define ZSTD_LDM_HASHLOG_MIN      ZSTD_HASHLOG_MIN
+#define ZSTD_LDM_HASHLOG_MAX      ZSTD_HASHLOG_MAX
+#define ZSTD_LDM_MINMATCH_MIN        4
+#define ZSTD_LDM_MINMATCH_MAX     4096
+#define ZSTD_LDM_BUCKETSIZELOG_MIN   1
+#define ZSTD_LDM_BUCKETSIZELOG_MAX   8
+
+/* Advanced parameter bounds */
+#define ZSTD_TARGETCBLOCKSIZE_MIN   64
+#define ZSTD_TARGETCBLOCKSIZE_MAX   ZSTD_BLOCKSIZE_MAX
+#define ZSTD_SRCSIZEHINT_MIN        0
+#define ZSTD_SRCSIZEHINT_MAX        INT_MAX
+
+/* internal */
+#define ZSTD_HASHLOG3_MAX           17
+
+
+/* ---  Advanced types  --- */
+
+typedef struct ZSTD_CCtx_params_s ZSTD_CCtx_params;
+
+typedef struct {
+    unsigned int matchPos; /* Match pos in dst */
+    /* If seqDef.offset > 3, then this is seqDef.offset - 3
+     * If seqDef.offset < 3, then this is the corresponding repeat offset
+     * But if seqDef.offset < 3 and litLength == 0, this is the
+     *  repeat offset before the corresponding repeat offset
+     * And if seqDef.offset == 3 and litLength == 0, this is the
+     *  most recent repeat offset - 1
+     */
+    unsigned int offset;
+    unsigned int litLength;   /* Literal length */
+    unsigned int matchLength; /* Match length */
+    /* 0 when seq not rep and seqDef.offset otherwise
+     * when litLength == 0 this will be <= 4, otherwise <= 3 like normal
+     */
+    unsigned int rep;
+} ZSTD_Sequence;
+
+typedef struct {
+    unsigned windowLog;       /**< largest match distance : larger == more compression, more memory needed during decompression */
+    unsigned chainLog;        /**< fully searched segment : larger == more compression, slower, more memory (useless for fast) */
+    unsigned hashLog;         /**< dispatch table : larger == faster, more memory */
+    unsigned searchLog;       /**< nb of searches : larger == more compression, slower */
+    unsigned minMatch;        /**< match length searched : larger == faster decompression, sometimes less compression */
+    unsigned targetLength;    /**< acceptable match size for optimal parser (only) : larger == more compression, slower */
+    ZSTD_strategy strategy;   /**< see ZSTD_strategy definition above */
+} ZSTD_compressionParameters;
+
+typedef struct {
+    int contentSizeFlag; /**< 1: content size will be in frame header (when known) */
+    int checksumFlag;    /**< 1: generate a 32-bit checksum using XXH64 algorithm at end of frame, for error detection */
+    int noDictIDFlag;    /**< 1: no dictID will be saved into frame header (dictID is only useful for dictionary compression) */
+} ZSTD_frameParameters;
+
+typedef struct {
+    ZSTD_compressionParameters cParams;
+    ZSTD_frameParameters fParams;
+} ZSTD_parameters;
+
+typedef enum {
+    ZSTD_dct_auto = 0,       /* dictionary is "full" when starting with ZSTD_MAGIC_DICTIONARY, otherwise it is "rawContent" */
+    ZSTD_dct_rawContent = 1, /* ensures dictionary is always loaded as rawContent, even if it starts with ZSTD_MAGIC_DICTIONARY */
+    ZSTD_dct_fullDict = 2    /* refuses to load a dictionary if it does not respect Zstandard's specification, starting with ZSTD_MAGIC_DICTIONARY */
+} ZSTD_dictContentType_e;
+
+typedef enum {
+    ZSTD_dlm_byCopy = 0,  /**< Copy dictionary content internally */
+    ZSTD_dlm_byRef = 1    /**< Reference dictionary content -- the dictionary buffer must outlive its users. */
+} ZSTD_dictLoadMethod_e;
+
+typedef enum {
+    ZSTD_f_zstd1 = 0,           /* zstd frame format, specified in zstd_compression_format.md (default) */
+    ZSTD_f_zstd1_magicless = 1  /* Variant of zstd frame format, without initial 4-bytes magic number.
+                                 * Useful to save 4 bytes per generated frame.
+                                 * Decoder cannot recognise this format automatically;
+                                 * it must be instructed to use it. */
+} ZSTD_format_e;
+
+typedef enum {
+    /* Note: this enum and the behavior it controls are effectively internal
+     * implementation details of the compressor. They are expected to continue
+     * to evolve and should be considered only in the context of extremely
+     * advanced performance tuning.
+     *
+     * Zstd currently supports the use of a CDict in three ways:
+     *
+     * - The contents of the CDict can be copied into the working context. This
+     *   means that the compression can search both the dictionary and input
+     *   while operating on a single set of internal tables. This makes
+     *   the compression faster per-byte of input. However, the initial copy of
+     *   the CDict's tables incurs a fixed cost at the beginning of the
+     *   compression. For small compressions (< 8 KB), that copy can dominate
+     *   the cost of the compression.
+     *
+     * - The CDict's tables can be used in-place. In this model, compression is
+     *   slower per input byte, because the compressor has to search two sets of
+     *   tables. However, this model incurs no start-up cost (as long as the
+     *   working context's tables can be reused). For small inputs, this can be
+     *   faster than copying the CDict's tables.
+     *
+     * - The CDict's tables are not used at all, and instead we use the working
+     *   context alone to reload the dictionary and use params based on the source
+     *   size. See ZSTD_compress_insertDictionary() and ZSTD_compress_usingDict().
+     *   This method is effective when the dictionary sizes are very small relative
+     *   to the input size, and the input size is fairly large to begin with.
+     *
+     * Zstd has a simple internal heuristic that selects which strategy to use
+     * at the beginning of a compression. However, if experimentation shows that
+     * Zstd is making poor choices, it is possible to override that choice with
+     * this enum.
+     */
+    ZSTD_dictDefaultAttach = 0, /* Use the default heuristic. */
+    ZSTD_dictForceAttach   = 1, /* Never copy the dictionary. */
+    ZSTD_dictForceCopy     = 2, /* Always copy the dictionary. */
+    ZSTD_dictForceLoad     = 3  /* Always reload the dictionary */
+} ZSTD_dictAttachPref_e;
+
+typedef enum {
+  ZSTD_lcm_auto = 0,          /**< Automatically determine the compression mode based on the compression level.
+                               *   Negative compression levels will be uncompressed, and positive compression
+                               *   levels will be compressed. */
+  ZSTD_lcm_huffman = 1,       /**< Always attempt Huffman compression. Uncompressed literals will still be
+                               *   emitted if Huffman compression is not profitable. */
+  ZSTD_lcm_uncompressed = 2   /**< Always emit uncompressed literals. */
+} ZSTD_literalCompressionMode_e;
+
+
+/***************************************
+* Frame size functions
+***************************************/
+
+/*! ZSTD_findDecompressedSize() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary at `src + srcSize`)
+ *  @return : - decompressed size of all data in all successive frames
+ *            - if the decompressed size cannot be determined: ZSTD_CONTENTSIZE_UNKNOWN
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *   note 1 : decompressed size is an optional field that may not be present, especially in streaming mode.
+ *            When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 2 : decompressed size is always present when compression is done with ZSTD_compress()
+ *   note 3 : decompressed size can be very large (a 64-bit value),
+ *            potentially larger than what local system can handle as a single memory segment.
+ *            In which case, it's necessary to use streaming mode to decompress data.
+ *   note 4 : If source is untrusted, decompressed size could be wrong or intentionally modified.
+ *            Always ensure result fits within application's authorized limits.
+ *            Each application can set its own limits.
+ *   note 5 : ZSTD_findDecompressedSize handles multiple frames, and so it must traverse the input to
+ *            read each contained frame header. This is fast as most of the data is skipped,
+ *            however it does mean that all frame data must be present and valid. */
+ZSTDLIB_API unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize);
+
+/*! ZSTD_decompressBound() :
+ *  `src` should point to the start of a series of ZSTD encoded and/or skippable frames
+ *  `srcSize` must be the _exact_ size of this series
+ *       (i.e. there should be a frame boundary at `src + srcSize`)
+ *  @return : - upper-bound for the decompressed size of all data in all successive frames
+ *            - if an error occurred: ZSTD_CONTENTSIZE_ERROR
+ *
+ *  note 1 : an error can occur if `src` contains an invalid or incorrectly formatted frame.
+ *  note 2 : the upper-bound is exact when the decompressed size field is available in every ZSTD encoded frame of `src`.
+ *           in this case, `ZSTD_findDecompressedSize` and `ZSTD_decompressBound` return the same value.
+ *  note 3 : when the decompressed size field isn't available, the upper-bound for that frame is calculated by:
+ *           upper-bound = # blocks * min(128 KB, Window_Size)
+ */
+ZSTDLIB_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize);
+
+/*! ZSTD_frameHeaderSize() :
+ *  srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX.
+ * @return : size of the Frame Header,
+ *           or an error code (if srcSize is too small) */
+ZSTDLIB_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
+/*! ZSTD_getSequences() :
+ *  Extract sequences from the sequence store.
+ *  zc can be used to insert custom compression params.
+ *  This function invokes ZSTD_compress2.
+ * @return : number of sequences extracted
+ */
+ZSTDLIB_API size_t ZSTD_getSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs,
+    size_t outSeqsSize, const void* src, size_t srcSize);
+
+
+/***************************************
+*  Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ *  These functions make it possible to estimate memory usage
+ *  of a future {D,C}Ctx, before its creation.
+ *
+ *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ *  for any compression level up to selected one.
+ *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ *         does not include space for a window buffer.
+ *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  The estimate will assume the input may be arbitrarily large,
+ *  which is the worst case.
+ *
+ *  When srcSize can be bound by a known and rather "small" value,
+ *  this fact can be used to provide a tighter estimation
+ *  because the CCtx compression context will need less memory.
+
+
+/***************************************
+* Memory management
+***************************************/
+
+/*! ZSTD_estimate*() :
+ *  These functions make it possible to estimate memory usage
+ *  of a future {D,C}Ctx, before its creation.
+ *
+ *  ZSTD_estimateCCtxSize() will provide a memory budget large enough
+ *  for any compression level up to selected one.
+ *  Note : Unlike ZSTD_estimateCStreamSize*(), this estimate
+ *         does not include space for a window buffer.
+ *         Therefore, the estimation is only guaranteed for single-shot compressions, not streaming.
+ *  The estimate will assume the input may be arbitrarily large,
+ *  which is the worst case.
+ *
+ *  When srcSize can be bound by a known and rather "small" value,
+ *  this fact can be used to provide a tighter estimation
+ *  because the CCtx compression context will need less memory.
+ *  This tighter estimation can be provided by the more advanced functions
+ *  ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(),
+ *  and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter().
+ *  Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits.
+ *
+ *  Note 2 : only single-threaded compression is supported.
+ *  ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDCtxSize(void);
+
+/*! ZSTD_estimateCStreamSize() :
+ *  ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one.
+ *  It will also consider src size to be arbitrarily "large", which is worst case.
+ *  If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation.
+ *  ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel.
+ *  ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter().
+ *  Only single-threaded compression is supported; this function will return an error code if ZSTD_c_nbWorkers is >= 1.
+ *  Note : CStream size estimation is only correct for single-threaded compression.
+ *  ZSTD_DStream memory budget depends on window Size.
+ *  This information can be passed manually, using ZSTD_estimateDStreamSize,
+ *  or deduced from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame();
+ *  Note : if streaming is initialized with ZSTD_init?Stream_usingDict(),
+ *         an internal ?Dict will be created, whose additional size is not estimated here.
+ *         In this case, get total size by adding ZSTD_estimate?DictSize */
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize(int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams);
+ZSTDLIB_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize(size_t windowSize);
+ZSTDLIB_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize);
+
+/*! ZSTD_estimate?DictSize() :
+ *  ZSTD_estimateCDictSize() will bet that src size is relatively "small", and content is copied, like ZSTD_createCDict().
+ *  ZSTD_estimateCDictSize_advanced() makes it possible to control compression parameters precisely, like ZSTD_createCDict_advanced().
+ *  Note : dictionaries created by reference (`ZSTD_dlm_byRef`) are logically smaller.
+ */
+ZSTDLIB_API size_t ZSTD_estimateCDictSize(size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_estimateCDictSize_advanced(size_t dictSize, ZSTD_compressionParameters cParams, ZSTD_dictLoadMethod_e dictLoadMethod);
+ZSTDLIB_API size_t ZSTD_estimateDDictSize(size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod);
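+
+/* Illustrative sketch: picking the highest compression level whose context
+ * fits a caller-defined memory budget. `MY_MEM_BUDGET` is hypothetical.
+ *
+ *     int level;
+ *     for (level = ZSTD_maxCLevel(); level > 1; level--) {
+ *         if (ZSTD_estimateCCtxSize(level) <= MY_MEM_BUDGET) break;
+ *     }
+ */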
+
+/*! ZSTD_initStatic*() :
+ *  Initialize an object using a pre-allocated fixed-size buffer.
+ *  workspace: The memory area to emplace the object into.
+ *             Provided pointer *must be 8-byte aligned*.
+ *             Buffer must outlive object.
+ *  workspaceSize: Use ZSTD_estimate*Size() to determine
+ *                 how large workspace must be to support target scenario.
+ * @return : pointer to object (same address as workspace, just different type),
+ *           or NULL if error (size too small, incorrect alignment, etc.)
+ *  Note : zstd will never resize nor malloc() when using a static buffer.
+ *         If the object requires more memory than available,
+ *         zstd will just error out (typically ZSTD_error_memory_allocation).
+ *  Note 2 : there is no corresponding "free" function.
+ *           Since workspace is allocated externally, it must be freed externally too.
+ *  Note 3 : cParams : use ZSTD_getCParams() to convert a compression level
+ *           into its associated cParams.
+ *  Limitation 1 : currently not compatible with internal dictionary creation, triggered by
+ *                 ZSTD_CCtx_loadDictionary(), ZSTD_initCStream_usingDict() or ZSTD_initDStream_usingDict().
+ *  Limitation 2 : static cctx currently not compatible with multi-threading.
+ *  Limitation 3 : static dctx is incompatible with legacy support.
+ */
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_CStream* ZSTD_initStaticCStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticCCtx() */
+
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_initStaticDCtx(void* workspace, size_t workspaceSize);
+ZSTDLIB_API ZSTD_DStream* ZSTD_initStaticDStream(void* workspace, size_t workspaceSize);    /**< same as ZSTD_initStaticDCtx() */
+
+ZSTDLIB_API const ZSTD_CDict* ZSTD_initStaticCDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType,
+                                        ZSTD_compressionParameters cParams);
+
+ZSTDLIB_API const ZSTD_DDict* ZSTD_initStaticDDict(
+                                        void* workspace, size_t workspaceSize,
+                                        const void* dict, size_t dictSize,
+                                        ZSTD_dictLoadMethod_e dictLoadMethod,
+                                        ZSTD_dictContentType_e dictContentType);
+
+
+/*! Custom memory allocation :
+ *  These prototypes make it possible to pass your own allocation/free functions.
+ *  ZSTD_customMem is provided at creation time, using ZSTD_create*_advanced() variants listed below.
+ *  All allocation/free operations will be completed using these custom variants instead of regular ones.
+ */
+typedef void* (*ZSTD_allocFunction) (void* opaque, size_t size);
+typedef void  (*ZSTD_freeFunction) (void* opaque, void* address);
+typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem;
+static ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL };  /**< this constant defers to stdlib's functions */
+
+ZSTDLIB_API ZSTD_CCtx*    ZSTD_createCCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DCtx*    ZSTD_createDCtx_advanced(ZSTD_customMem customMem);
+ZSTDLIB_API ZSTD_DStream* ZSTD_createDStream_advanced(ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_compressionParameters cParams,
+                                                  ZSTD_customMem customMem);
+
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_advanced(const void* dict, size_t dictSize,
+                                                  ZSTD_dictLoadMethod_e dictLoadMethod,
+                                                  ZSTD_dictContentType_e dictContentType,
+                                                  ZSTD_customMem customMem);
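+
+/* Illustrative sketch: routing all context allocations through user-supplied
+ * functions. `my_alloc`/`my_free` are hypothetical names; the zfs_zstd.c shim
+ * later in this import uses the same mechanism to route zstd allocations
+ * through kernel memory pools.
+ *
+ *     static void* my_alloc(void* opaque, size_t size) { return malloc(size); }
+ *     static void  my_free(void* opaque, void* addr)   { free(addr); }
+ *     static const ZSTD_customMem my_mem = { my_alloc, my_free, NULL };
+ *     ZSTD_CCtx* const cctx = ZSTD_createCCtx_advanced(my_mem);
+ */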
+
+
+/***************************************
+* Advanced compression functions
+***************************************/
+
+/*! ZSTD_createCDict_byReference() :
+ *  Create a digested dictionary for compression
+ *  Dictionary content is just referenced, not duplicated.
+ *  As a consequence, `dictBuffer` **must** outlive CDict,
+ *  and its content must remain unmodified throughout the lifetime of CDict.
+ *  note: equivalent to ZSTD_createCDict_advanced(), with dictLoadMethod==ZSTD_dlm_byRef */
+ZSTDLIB_API ZSTD_CDict* ZSTD_createCDict_byReference(const void* dictBuffer, size_t dictSize, int compressionLevel);
+
+/*! ZSTD_getCParams() :
+ * @return ZSTD_compressionParameters structure for a selected compression level and estimated srcSize.
+ * `estimatedSrcSize` value is optional, select 0 if not known */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_getParams() :
+ *  same as ZSTD_getCParams(), but @return a full `ZSTD_parameters` object instead of sub-component `ZSTD_compressionParameters`.
+ *  All fields of `ZSTD_frameParameters` are set to default : contentSize=1, checksum=0, noDictID=0 */
+ZSTDLIB_API ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long estimatedSrcSize, size_t dictSize);
+
+/*! ZSTD_checkCParams() :
+ *  Ensure param values remain within authorized range.
+ * @return 0 on success, or an error code (can be checked with ZSTD_isError()) */
+ZSTDLIB_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params);
+
+/*! ZSTD_adjustCParams() :
+ *  optimize params for a given `srcSize` and `dictSize`.
+ * `srcSize` can be unknown, in which case use ZSTD_CONTENTSIZE_UNKNOWN.
+ * `dictSize` must be `0` when there is no dictionary.
+ *  cPar can be invalid : all parameters will be clamped within valid range in the @return struct.
+ *  This function never fails (wide contract) */
+ZSTDLIB_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize);
+
+/*! ZSTD_compress_advanced() :
+ *  Note : this function is now DEPRECATED.
+ *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters.
+ *  This prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x */
+ZSTDLIB_API size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx,
+                                          void* dst, size_t dstCapacity,
+                                          const void* src, size_t srcSize,
+                                          const void* dict, size_t dictSize,
+                                          ZSTD_parameters params);
+
+/*! ZSTD_compress_usingCDict_advanced() :
+ *  Note : this function is now REDUNDANT.
+ *         It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters.
+ *  This prototype will be marked as deprecated and generate compilation warnings in some future version */
+ZSTDLIB_API size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx,
+                                                     void* dst, size_t dstCapacity,
+                                                     const void* src, size_t srcSize,
+                                                     const ZSTD_CDict* cdict,
+                                                     ZSTD_frameParameters fParams);
+
+
+/*! ZSTD_CCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_CCtx_loadDictionary(), but dictionary content is referenced, instead of being copied into CCtx.
+ *  It saves some memory, but also requires that `dict` outlives its usage within `cctx` */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_byReference(ZSTD_CCtx* cctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_CCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_CCtx_loadDictionary(), but gives finer control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?)
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_CCtx_refPrefix_advanced() :
+ *  Same as ZSTD_CCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/* === experimental parameters === */
+/* these parameters can be used with ZSTD_CCtx_setParameter()
+ * they are not guaranteed to remain supported in the future */
+
+/* Enables rsyncable mode,
+ * which makes compressed files more rsync friendly
+ * by adding periodic synchronization points to the compressed data.
+ * The target average block size is ZSTD_c_jobSize / 2.
+ * It's possible to modify the job size to increase or decrease
+ * the granularity of the synchronization point.
+ * Once the jobSize is smaller than the window size,
+ * it will result in compression ratio degradation.
+ * NOTE 1: rsyncable mode only works when multithreading is enabled.
+ * NOTE 2: rsyncable performs poorly in combination with long range mode,
+ * since it will decrease the effectiveness of synchronization points,
+ * though mileage may vary.
+ * NOTE 3: Rsyncable mode limits maximum compression speed to ~400 MB/s.
+ * If the selected compression level is already running significantly slower,
+ * the overall speed won't be significantly impacted.
+ */
+#define ZSTD_c_rsyncable ZSTD_c_experimentalParam1
+
+/* Select a compression format.
+ * The value must be of type ZSTD_format_e.
+ * See ZSTD_format_e enum definition for details */
+#define ZSTD_c_format ZSTD_c_experimentalParam2
+
+/* Force back-reference distances to remain < windowSize,
+ * even when referencing into Dictionary content (default:0) */
+#define ZSTD_c_forceMaxWindow ZSTD_c_experimentalParam3
+
+/* Controls whether the contents of a CDict
+ * are used in place, or copied into the working context.
+ * Accepts values from the ZSTD_dictAttachPref_e enum.
+ * See the comments on that enum for an explanation of the feature. */
+#define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4
+
+/* Controls how the literals are compressed (default is auto).
+ * The value must be of type ZSTD_literalCompressionMode_e.
+ * See ZSTD_literalCompressionMode_e enum definition for details.
+ */
+#define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5
+
+/* Tries to fit compressed block size to be around targetCBlockSize.
+ * No target when targetCBlockSize == 0.
+ * There is no guarantee on compressed block size (default:0) */
+#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6
+
+/* User's best guess of source size.
+ * Hint is not valid when srcSizeHint == 0.
+ * There is no guarantee that the hint is close to the actual source size,
+ * but compression ratio may regress significantly if the guess considerably
+ * underestimates it. */
+#define ZSTD_c_srcSizeHint ZSTD_c_experimentalParam7
+
+/*! ZSTD_CCtx_getParameter() :
+ *  Get the requested compression parameter value, selected by enum ZSTD_cParameter,
+ *  and store it into int* value.
+ * @return : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_getParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int* value);
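+
+/* Illustrative sketch: experimental parameters are set through the regular
+ * ZSTD_CCtx_setParameter() entry point using the aliases above, and can be
+ * read back with ZSTD_CCtx_getParameter(). Assumes an existing ZSTD_CCtx* cctx.
+ *
+ *     ZSTD_CCtx_setParameter(cctx, ZSTD_c_srcSizeHint, 64 * 1024);
+ *     int hint = 0;
+ *     ZSTD_CCtx_getParameter(cctx, ZSTD_c_srcSizeHint, &hint);  // hint == 65536
+ */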
+
+/*! ZSTD_CCtx_params :
+ *  Quick howto :
+ *  - ZSTD_createCCtxParams() : Create a ZSTD_CCtx_params structure
+ *  - ZSTD_CCtxParams_setParameter() : Push parameters one by one into
+ *                                     an existing ZSTD_CCtx_params structure.
+ *                                     This is similar to
+ *                                     ZSTD_CCtx_setParameter().
+ *  - ZSTD_CCtx_setParametersUsingCCtxParams() : Apply parameters to
+ *                                    an existing CCtx.
+ *                                    These parameters will be applied to
+ *                                    all subsequent frames.
+ *  - ZSTD_compressStream2() : Do compression using the CCtx.
+ *  - ZSTD_freeCCtxParams() : Free the memory.
+ *
+ *  This can be used with ZSTD_estimateCCtxSize_usingCCtxParams()
+ *  for static allocation of CCtx for single-threaded compression.
+ */
+ZSTDLIB_API ZSTD_CCtx_params* ZSTD_createCCtxParams(void);
+ZSTDLIB_API size_t ZSTD_freeCCtxParams(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_reset() :
+ *  Reset params to default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_reset(ZSTD_CCtx_params* params);
+
+/*! ZSTD_CCtxParams_init() :
+ *  Initializes the compression parameters of cctxParams according to
+ *  compression level. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel);
+
+/*! ZSTD_CCtxParams_init_advanced() :
+ *  Initializes the compression and frame parameters of cctxParams according to
+ *  params. All other parameters are reset to their default values.
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params);
+
+/*! ZSTD_CCtxParams_setParameter() :
+ *  Similar to ZSTD_CCtx_setParameter.
+ *  Set one compression parameter, selected by enum ZSTD_cParameter.
+ *  Parameters must be applied to a ZSTD_CCtx using ZSTD_CCtx_setParametersUsingCCtxParams().
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int value);
+
+/*! ZSTD_CCtxParams_getParameter() :
+ *  Similar to ZSTD_CCtx_getParameter.
+ *  Get the requested value of one compression parameter, selected by enum ZSTD_cParameter.
+ * @result : 0, or an error code (which can be tested with ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_CCtxParams_getParameter(ZSTD_CCtx_params* params, ZSTD_cParameter param, int* value);
+
+/*! ZSTD_CCtx_setParametersUsingCCtxParams() :
+ *  Apply a set of ZSTD_CCtx_params to the compression context.
+ *  This can be done even after compression is started.
+ *  If nbWorkers==0, this will have no impact until a new compression is started.
+ *  If nbWorkers>=1, new parameters will be picked up at next job,
+ *  with a few restrictions (windowLog, pledgedSrcSize, nbWorkers, jobSize, and overlapLog are not updated).
+ */
+ZSTDLIB_API size_t ZSTD_CCtx_setParametersUsingCCtxParams(
+        ZSTD_CCtx* cctx, const ZSTD_CCtx_params* params);
+
+/*! ZSTD_compressStream2_simpleArgs() :
+ *  Same as ZSTD_compressStream2(),
+ *  but using only integral types as arguments.
+ *  This variant might be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_compressStream2_simpleArgs (
+                            ZSTD_CCtx* cctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos,
+                            ZSTD_EndDirective endOp);
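+
+/* Illustrative sketch of the quick-howto above: build a parameter set once,
+ * then stamp it onto an existing ZSTD_CCtx* cctx before compressing.
+ *
+ *     ZSTD_CCtx_params* const p = ZSTD_createCCtxParams();
+ *     ZSTD_CCtxParams_setParameter(p, ZSTD_c_compressionLevel, 19);
+ *     ZSTD_CCtxParams_setParameter(p, ZSTD_c_checksumFlag, 1);
+ *     ZSTD_CCtx_setParametersUsingCCtxParams(cctx, p);
+ *     ZSTD_freeCCtxParams(p);
+ *     // ... then compress with ZSTD_compressStream2(cctx, ...) as usual.
+ */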
+
+
+/***************************************
+* Advanced decompression functions
+***************************************/
+
+/*! ZSTD_isFrame() :
+ *  Tells if the content of `buffer` starts with a valid Frame Identifier.
+ *  Note : Frame Identifier is 4 bytes. If `size < 4`, @return will always be 0.
+ *  Note 2 : Legacy Frame Identifiers are considered valid only if Legacy Support is enabled.
+ *  Note 3 : Skippable Frame Identifiers are considered valid. */
+ZSTDLIB_API unsigned ZSTD_isFrame(const void* buffer, size_t size);
+
+/*! ZSTD_createDDict_byReference() :
+ *  Create a digested dictionary, ready to start decompression operation without startup delay.
+ *  Dictionary content is referenced, and therefore stays in dictBuffer.
+ *  It is important that dictBuffer outlives DDict,
+ *  it must remain read accessible throughout the lifetime of DDict */
+ZSTDLIB_API ZSTD_DDict* ZSTD_createDDict_byReference(const void* dictBuffer, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_byReference() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but references `dict` content instead of copying it into `dctx`.
+ *  This saves memory if `dict` remains around.
+ *  However, it's imperative that `dict` remains accessible (and unmodified) while being used, so it must outlive decompression. */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_byReference(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+
+/*! ZSTD_DCtx_loadDictionary_advanced() :
+ *  Same as ZSTD_DCtx_loadDictionary(),
+ *  but gives direct control over
+ *  how to load the dictionary (by copy ? by reference ?)
+ *  and how to interpret it (automatic ? force raw mode ? full mode only ?). */
+ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary_advanced(ZSTD_DCtx* dctx, const void* dict, size_t dictSize, ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_refPrefix_advanced() :
+ *  Same as ZSTD_DCtx_refPrefix(), but gives finer control over
+ *  how to interpret prefix content (automatic ? force raw mode (default) ? full mode only ?) */
+ZSTDLIB_API size_t ZSTD_DCtx_refPrefix_advanced(ZSTD_DCtx* dctx, const void* prefix, size_t prefixSize, ZSTD_dictContentType_e dictContentType);
+
+/*! ZSTD_DCtx_setMaxWindowSize() :
+ *  Refuses to allocate internal buffers for frames requiring a window size larger than the provided limit.
+ *  This protects a decoder context from reserving too much memory for itself (potential attack scenario).
+ *  This parameter is only useful in streaming mode, since no internal buffer is allocated in single-pass mode.
+ *  By default, a decompression context accepts all window sizes <= (1 << ZSTD_WINDOWLOG_LIMIT_DEFAULT)
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()).
+ */
+ZSTDLIB_API size_t ZSTD_DCtx_setMaxWindowSize(ZSTD_DCtx* dctx, size_t maxWindowSize);
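+
+/* Illustrative sketch: cap the window a streaming decoder will accept at
+ * 8 MB (the interoperability minimum recommended elsewhere in this header),
+ * so a hostile frame cannot force a huge allocation.
+ *
+ *     ZSTD_DCtx* const dctx = ZSTD_createDCtx();
+ *     ZSTD_DCtx_setMaxWindowSize(dctx, 8U << 20);
+ *     // frames declaring a larger window will now fail to decompress
+ */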
+
+/* ZSTD_d_format
+ * experimental parameter,
+ * allowing selection between ZSTD_format_e input compression formats
+ */
+#define ZSTD_d_format ZSTD_d_experimentalParam1
+/* ZSTD_d_stableOutBuffer
+ * Experimental parameter.
+ * Default is 0 == disabled. Set to 1 to enable.
+ *
+ * Tells the decompressor that the ZSTD_outBuffer will ALWAYS be the same
+ * between calls, except for the modifications that zstd makes to pos (the
+ * caller must not modify pos). This is checked by the decompressor, and
+ * decompression will fail if it ever changes. Therefore the ZSTD_outBuffer
+ * MUST be large enough to fit the entire decompressed frame. This will be
+ * checked when the frame content size is known. The data in the ZSTD_outBuffer
+ * in the range [dst, dst + pos) MUST NOT be modified during decompression
+ * or you will get data corruption.
+ *
+ * When this flag is enabled zstd won't allocate an output buffer, because
+ * it can write directly to the ZSTD_outBuffer, but it will still allocate
+ * an input buffer large enough to fit any compressed block. This will also
+ * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer.
+ * If you need to avoid the input buffer allocation use the buffer-less
+ * streaming API.
+ *
+ * NOTE: So long as the ZSTD_outBuffer always points to valid memory, using
+ * this flag is ALWAYS memory safe, and will never access out-of-bounds
+ * memory. However, decompression WILL fail if you violate the preconditions.
+ *
+ * WARNING: The data in the ZSTD_outBuffer in the range [dst, dst + pos) MUST
+ * NOT be modified during decompression or you will get data corruption. This
+ * is because zstd needs to reference data in the ZSTD_outBuffer to regenerate
+ * matches. Normally zstd maintains its own buffer for this purpose, but passing
+ * this flag tells zstd to use the user provided buffer.
+ */
+#define ZSTD_d_stableOutBuffer ZSTD_d_experimentalParam2
+
+/*! ZSTD_DCtx_setFormat() :
+ *  Instruct the decoder context about what kind of data to decode next.
+ *  This instruction is mandatory to decode data without a fully-formed header,
+ *  such as ZSTD_f_zstd1_magicless for example.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError()). */
+ZSTDLIB_API size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format);
+
+/*! ZSTD_decompressStream_simpleArgs() :
+ *  Same as ZSTD_decompressStream(),
+ *  but using only integral types as arguments.
+ *  This can be helpful for binders from dynamic languages
+ *  which have trouble handling structures containing memory pointers.
+ */
+ZSTDLIB_API size_t ZSTD_decompressStream_simpleArgs (
+                            ZSTD_DCtx* dctx,
+                            void* dst, size_t dstCapacity, size_t* dstPos,
+                      const void* src, size_t srcSize, size_t* srcPos);
+
+
+/********************************************************************
+*  Advanced streaming functions
+*  Warning : most of these functions are now redundant with the Advanced API.
+*  Once Advanced API reaches "stable" status,
+*  redundant functions will be deprecated, and then at some point removed.
+********************************************************************/
+
+/*=====   Advanced Streaming compression functions  =====*/
+/**! ZSTD_initCStream_srcSize() :
+ * This function is deprecated, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any)
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ * pledgedSrcSize must be correct. If it is not known at init time, use
+ * ZSTD_CONTENTSIZE_UNKNOWN. Note that, for compatibility with older programs,
+ * "0" also disables frame content size field. It may be enabled in the future.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_srcSize(ZSTD_CStream* zcs,
+                         int compressionLevel,
+                         unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingDict() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * Creates an internal CDict (incompatible with static CCtx), except if
+ * dict == NULL or dictSize < 8, in which case no dict is used.
+ * Note: dict is loaded with ZSTD_dct_auto (treated as a full zstd dictionary if
+ * it begins with ZSTD_MAGIC_DICTIONARY, else as raw content) and ZSTD_dlm_byCopy.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingDict(ZSTD_CStream* zcs,
+                     const void* dict, size_t dictSize,
+                           int compressionLevel);
+
+/**! ZSTD_initCStream_advanced() :
+ * This function is deprecated, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd parameter and leave the rest as-is.
+ *     for ((param, value) : params) {
+ *         ZSTD_CCtx_setParameter(zcs, param, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
+ *
+ * dict is loaded with ZSTD_dct_auto and ZSTD_dlm_byCopy.
+ * pledgedSrcSize must be correct.
+ * If srcSize is not known at init time, use value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_advanced(ZSTD_CStream* zcs,
+                    const void* dict, size_t dictSize,
+                          ZSTD_parameters params,
+                          unsigned long long pledgedSrcSize);
+
+/**! ZSTD_initCStream_usingCDict() :
+ * This function is deprecated, and equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * note : cdict will just be referenced, and must outlive compression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict);
+
+/**! ZSTD_initCStream_usingCDict_advanced() :
+ * This function is DEPRECATED, and is approximately equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     // Pseudocode: Set each zstd frame parameter and leave the rest as-is.
+ *     for ((fParam, value) : fParams) {
+ *         ZSTD_CCtx_setParameter(zcs, fParam, value);
+ *     }
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *     ZSTD_CCtx_refCDict(zcs, cdict);
+ *
+ * same as ZSTD_initCStream_usingCDict(), with control over frame parameters.
+ * pledgedSrcSize must be correct. If srcSize is not known at init time, use
+ * value ZSTD_CONTENTSIZE_UNKNOWN.
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t
+ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs,
+                               const ZSTD_CDict* cdict,
+                                     ZSTD_frameParameters fParams,
+                                     unsigned long long pledgedSrcSize);
+
+/*! ZSTD_resetCStream() :
+ * This function is deprecated, and is equivalent to:
+ *     ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
+ *     ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
+ *
+ *  start a new frame, using same parameters from previous frame.
+ *  This is typically useful to skip the dictionary loading stage, since it will re-use it in-place.
+ *  Note that zcs must be initialized at least once before using ZSTD_resetCStream().
+ *  If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN.
+ *  If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end.
+ *  For the time being, pledgedSrcSize==0 is interpreted as "srcSize unknown" for compatibility with older programs,
+ *  but it will change to mean "empty" in a future version, so use macro ZSTD_CONTENTSIZE_UNKNOWN instead.
+ * @return : 0, or an error code (which can be tested using ZSTD_isError())
+ *  Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize);
+
+
+typedef struct {
+    unsigned long long ingested;   /* nb input bytes read and buffered */
+    unsigned long long consumed;   /* nb input bytes actually compressed */
+    unsigned long long produced;   /* nb of compressed bytes generated and buffered */
+    unsigned long long flushed;    /* nb of compressed bytes flushed : not provided; can be tracked from caller side */
+    unsigned currentJobID;         /* MT only : latest started job nb */
+    unsigned nbActiveWorkers;      /* MT only : nb of workers actively compressing at probe time */
+} ZSTD_frameProgression;
+
+/* ZSTD_getFrameProgression() :
+ * tells how much data has been ingested (read from input),
+ * consumed (input actually compressed) and produced (output) for current frame.
+ * Note : (ingested - consumed) is amount of input data buffered internally, not yet compressed.
+ * Aggregates progression inside active worker threads.
+ */
+ZSTDLIB_API ZSTD_frameProgression ZSTD_getFrameProgression(const ZSTD_CCtx* cctx);
+
+/*! ZSTD_toFlushNow() :
+ *  Tell how many bytes are ready to be flushed immediately.
+ *  Useful for multithreading scenarios (nbWorkers >= 1).
+ *  Probe the oldest active job, defined as oldest job not yet entirely flushed,
+ *  and check its output buffer.
+ * @return : amount of data stored in oldest job and ready to be flushed immediately.
+ *  if @return == 0, it means either :
+ *  + there is no active job (could be checked with ZSTD_frameProgression()), or
+ *  + oldest job is still actively compressing data,
+ *    but everything it has produced has also been flushed so far,
+ *    therefore flush speed is limited by production speed of oldest job
+ *    irrespective of the speed of concurrent (and newer) jobs.
+ */
+ZSTDLIB_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx);
+
+
+/*=====   Advanced Streaming decompression functions  =====*/
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_loadDictionary(zds, dict, dictSize);
+ *
+ * note: no dictionary will be used if dict == NULL or dictSize < 8
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *     ZSTD_DCtx_refDDict(zds, ddict);
+ *
+ * note : ddict is referenced, it must outlive decompression session
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict);
+
+/**
+ * This function is deprecated, and is equivalent to:
+ *
+ *     ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
+ *
+ * re-use decompression parameters from previous init; saves dictionary loading
+ * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x
+ */
+ZSTDLIB_API size_t ZSTD_resetDStream(ZSTD_DStream* zds);
+
+
+/*********************************************************************
+*  Buffer-less and synchronous inner streaming functions
+*
+*  This is an advanced API, giving full control over buffer management, for users who need direct control over memory.
+*  But it's also a complex one, with several restrictions, documented below.
+*  Prefer normal streaming API for an easier experience.
+********************************************************************* */
+
+/**
+  Buffer-less streaming compression (synchronous mode)
+
+  A ZSTD_CCtx object is required to track streaming operations.
+  Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource.
+  ZSTD_CCtx object can be re-used multiple times within successive compression operations.
+
+  Start by initializing a context.
+  Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression,
+  or ZSTD_compressBegin_advanced(), for finer parameter control.
+  It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx()
+
+  Then, consume your input using ZSTD_compressContinue().
+  There are some important considerations to keep in mind when using this advanced function :
+  - ZSTD_compressContinue() has no internal buffer. It uses externally provided buffers only.
+  - Interface is synchronous : input is consumed entirely and produces 1+ compressed blocks.
+  - Caller must ensure there is enough space in `dst` to store compressed data under worst case scenario.
+    Worst case evaluation is provided by ZSTD_compressBound().
+    ZSTD_compressContinue() doesn't guarantee recovery after a failed compression.
+  - ZSTD_compressContinue() presumes prior input ***is still accessible and unmodified*** (up to maximum distance size, see WindowLog).
+    It remembers all previous contiguous blocks, plus one separated memory segment (which can itself consist of multiple contiguous blocks)
+  - ZSTD_compressContinue() detects that prior input has been overwritten when `src` buffer overlaps.
+    In which case, it will "discard" the relevant memory section from its history.
+
+  Finish a frame with ZSTD_compressEnd(), which will write the last block(s) and optional checksum.
+  It's possible to use srcSize==0, in which case it will write a final empty block to end the frame.
+  Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders.
+
+  `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again.
+*/
+
+/*=====   Buffer-less streaming compression functions  =====*/
+ZSTDLIB_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel);
+ZSTDLIB_API size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /**< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /**< note: fails if cdict==NULL */
+ZSTDLIB_API size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize);   /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */
+ZSTDLIB_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /**< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
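+
+/* Illustrative sketch of the loop described above: compress a sequence of
+ * input chunks, then finish the frame. `get_next_chunk` and `write_output`
+ * are hypothetical caller-supplied helpers; `dst` must be sized with
+ * ZSTD_compressBound(). Per the rules above, earlier chunks must remain
+ * accessible and unmodified while compression continues.
+ *
+ *     ZSTD_compressBegin(cctx, 3);
+ *     size_t srcSize;
+ *     while ((srcSize = get_next_chunk(&src)) != 0) {
+ *         size_t const ret = ZSTD_compressContinue(cctx, dst, dstCapacity, src, srcSize);
+ *         if (ZSTD_isError(ret)) return ret;
+ *         write_output(dst, ret);
+ *     }
+ *     size_t const last = ZSTD_compressEnd(cctx, dst, dstCapacity, NULL, 0);
+ *     write_output(dst, last);
+ */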
+
+
+/*-
+  Buffer-less streaming decompression (synchronous mode)
+
+  A ZSTD_DCtx object is required to track streaming operations.
+  Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it.
+  A ZSTD_DCtx object can be re-used multiple times.
+
+  First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader().
+  Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough.
+  Data fragment must be large enough to ensure successful decoding.
+ `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough.
+  @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled.
+           >0 : `srcSize` is too small, please provide at least @result bytes on next attempt.
+           or an error code, which can be tested using ZSTD_isError().
+
+  It fills a ZSTD_frameHeader structure with important information to correctly decode the frame,
+  such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`).
+  Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information.
+  As a consequence, check that values remain within valid application range.
+  For example, do not allocate memory blindly, check that `windowSize` is within expectation.
+  Each application can set its own limits, depending on local restrictions.
+  For extended interoperability, it is recommended to support `windowSize` of at least 8 MB.
+
+  ZSTD_decompressContinue() needs previous data blocks during decompression, up to `windowSize` bytes.
+  ZSTD_decompressContinue() is very sensitive to contiguity;
+  if 2 blocks don't follow each other, make sure that either the compressor breaks contiguity at the same place,
+  or that previous contiguous segment is large enough to properly handle maximum back-reference distance.
+  There are multiple ways to guarantee this condition.
+
+  The most memory efficient way is to use a round buffer of sufficient size.
+  Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(),
+  which can @return an error code if required value is too large for current system (in 32-bit mode).
+  In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one,
+  up to the moment there is not enough room left in the buffer to guarantee decoding another full block,
+  whose maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`.
+  At which point, decoding can resume from the beginning of the buffer.
+  Note that already decoded data stored in the buffer should be flushed before being overwritten.
+
+  There are alternatives possible, for example using two or more buffers of size `windowSize` each, though they consume more memory.
+
+  Finally, if you control the compression process, you can also ignore all buffer size rules,
+  as long as the encoder and decoder progress in "lock-step",
+  aka use exactly the same buffer sizes, break contiguity at the same place, etc.
+
+  Once buffers are set up, start decompression, with ZSTD_decompressBegin().
+  If decompression requires a dictionary, use ZSTD_decompressBegin_usingDict() or ZSTD_decompressBegin_usingDDict().
+
+  Then use ZSTD_nextSrcSizeToDecompress() and ZSTD_decompressContinue() alternately.
+  ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue().
+  ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail.
+
+ @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity).
+  It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item.
+  It can also be an error code, which can be tested with ZSTD_isError().
+
+  A frame is fully decoded when ZSTD_nextSrcSizeToDecompress() returns zero.
+  Context can then be reset to start a new decompression.
+
+  Note : it's possible to know if next input to present is a header or a block, using ZSTD_nextInputType().
+  This information is not required to properly decode a frame.
+
+  == Special case : skippable frames ==
+
+  Skippable frames allow integration of user-defined data into a flow of concatenated frames.
+  Skippable frames will be ignored (skipped) by decompressor.
+  The format of skippable frames is as follows :
+  a) Skippable frame ID - 4 Bytes, Little endian format, any value from 0x184D2A50 to 0x184D2A5F
+  b) Frame Size - 4 Bytes, Little endian format, unsigned 32-bits
+  c) Frame Content - any content (User Data) of length equal to Frame Size
+  For skippable frames ZSTD_getFrameHeader() returns zfhPtr->frameType==ZSTD_skippableFrame.
+  For skippable frames ZSTD_decompressContinue() always returns 0 : it only skips the content.
+*/
+
+/*=====   Buffer-less streaming decompression functions  =====*/
+typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
+typedef struct {
+    unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available.
+                                          * 0 means "empty" */
+    unsigned long long windowSize;       /* can be very large, up to frameContentSize */
+    unsigned blockSizeMax;
+    ZSTD_frameType_e frameType;          /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
+    unsigned headerSize;
+    unsigned dictID;
+    unsigned checksumFlag;
+} ZSTD_frameHeader;
+
+/*! ZSTD_getFrameHeader() :
+ *  decode Frame Header, or requires larger `srcSize`.
+ * @return : 0, `zfhPtr` is correctly filled,
+ *          >0, `srcSize` is too small, value is wanted `srcSize` amount,
+ *           or an error code, which can be tested using ZSTD_isError() */
+ZSTDLIB_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize);   /**< doesn't consume input */
+/*! ZSTD_getFrameHeader_advanced() :
+ *  same as ZSTD_getFrameHeader(),
+ *  with added capability to select a format (like ZSTD_f_zstd1_magicless) */
+ZSTDLIB_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
+ZSTDLIB_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize);  /**< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */
+
+ZSTDLIB_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDict(ZSTD_DCtx* dctx, const void* dict, size_t dictSize);
+ZSTDLIB_API size_t ZSTD_decompressBegin_usingDDict(ZSTD_DCtx* dctx, const ZSTD_DDict* ddict);
+
+ZSTDLIB_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx);
+ZSTDLIB_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+
+/* misc */
+ZSTDLIB_API void   ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx);
+typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e;
+ZSTDLIB_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx);
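+
+/* Illustrative sketch of the decompression loop described above, omitting
+ * window/round-buffer management for brevity. `read_input` and
+ * `consume_output` are hypothetical caller-supplied helpers; `read_input`
+ * must deliver exactly the requested number of bytes.
+ *
+ *     ZSTD_decompressBegin(dctx);
+ *     size_t next;
+ *     while ((next = ZSTD_nextSrcSizeToDecompress(dctx)) != 0) {
+ *         read_input(src, next);                 // exactly `next` bytes
+ *         size_t const out = ZSTD_decompressContinue(dctx, dst, dstCapacity, src, next);
+ *         if (ZSTD_isError(out)) return out;
+ *         if (out) consume_output(dst, out);     // may be 0 for metadata items
+ *     }
+ */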
+
+
+
+
+/* ============================ */
+/**       Block level API       */
+/* ============================ */
+
+/*!
+    Block functions produce and decode raw zstd blocks, without frame metadata.
+    Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes).
+    But users will have to take charge of the needed metadata to regenerate data, such as compressed and content sizes.
+
+    A few rules to respect :
+    - Compressing and decompressing require a context structure
+      + Use ZSTD_createCCtx() and ZSTD_createDCtx()
+    - It is necessary to init context before starting
+      + compression : any ZSTD_compressBegin*() variant, including with dictionary
+      + decompression : any ZSTD_decompressBegin*() variant, including with dictionary
+      + copyCCtx() and copyDCtx() can be used too
+    - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB
+      + If input is larger than a block size, it's necessary to split input data into multiple blocks
+      + For inputs larger than a single block, consider using regular ZSTD_compress() instead.
+        Frame metadata is not that costly, and quickly becomes negligible as source size grows larger than a block.
+    - When a block is considered not compressible enough, ZSTD_compressBlock() result will be 0 (zero) !
+      ===> In which case, nothing is produced into `dst` !
+      + User __must__ test for such outcome and deal directly with uncompressed data
+      + A block cannot be declared incompressible if ZSTD_compressBlock() return value was != 0.
+        Doing so would mess up the statistics history, leading to potential data corruption.
+      + ZSTD_decompressBlock() _doesn't accept uncompressed data as input_ !!
+      + In case of multiple successive blocks, should some of them be uncompressed,
+        decoder must be informed of their existence in order to follow proper history.
+        Use ZSTD_insertBlock() for such a case.
+*/
+
+/*=====   Raw zstd block functions  =====*/
+ZSTDLIB_API size_t ZSTD_getBlockSize   (const ZSTD_CCtx* cctx);
+ZSTDLIB_API size_t ZSTD_compressBlock  (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize);
+ZSTDLIB_API size_t ZSTD_insertBlock    (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize);  /**< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */
+
+
+#endif   /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */
+
+#if defined (__cplusplus)
+}
+#endif
diff --git a/module/zstd/lib/zstd_errors.h b/module/zstd/lib/zstd_errors.h
new file mode 100644
index 000000000000..998398e7e57f
--- /dev/null
+++ b/module/zstd/lib/zstd_errors.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
+ * in the COPYING file in the root directory of this source tree).
+ * You may select, at your option, one of the above-listed licenses.
+ */
+
+#ifndef ZSTD_ERRORS_H_398273423
+#define ZSTD_ERRORS_H_398273423
+
+#if defined (__cplusplus)
+extern "C" {
+#endif
+
+/*===== dependency =====*/
+#include <stddef.h>   /* size_t */
+
+
+/* =====   ZSTDERRORLIB_API : control library symbols visibility   ===== */
+#ifndef ZSTDERRORLIB_VISIBILITY
+#  if defined(__GNUC__) && (__GNUC__ >= 4)
+#    define ZSTDERRORLIB_VISIBILITY __attribute__ ((visibility ("default")))
+#  else
+#    define ZSTDERRORLIB_VISIBILITY
+#  endif
+#endif
+#if defined(ZSTD_DLL_EXPORT) && (ZSTD_DLL_EXPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllexport) ZSTDERRORLIB_VISIBILITY
+#elif defined(ZSTD_DLL_IMPORT) && (ZSTD_DLL_IMPORT==1)
+#  define ZSTDERRORLIB_API __declspec(dllimport) ZSTDERRORLIB_VISIBILITY /* It isn't required but allows generating better code, saving a function pointer load from the IAT and an indirect jump. */
+#else
+#  define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY
+#endif
+
+/*-*********************************************
+ *  Error codes list
+ *-*********************************************
+ *  Error codes _values_ are pinned down since v1.3.1 only.
+ *  Therefore, don't rely on values if you may link to any version < v1.3.1.
+ *
+ *  Only values < 100 are considered stable.
+ *
+ *  note 1 : this API shall be used with static linking only.
+ *           dynamic linking is not yet officially supported.
+ *  note 2 : Prefer relying on the enum than on its value whenever possible
+ *           This is the only supported way to use the error list < v1.3.1
+ *  note 3 : ZSTD_isError() is always correct, whatever the library version.
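+ *  note 4 (illustrative) : a typical pattern is to map a failed size_t result
+ *           back to this enum and branch on it, e.g.
+ *               if (ZSTD_getErrorCode(r) == ZSTD_error_dstSize_tooSmall) { ... }
+ *           the zfs_zstd.c shim below uses exactly this test to tell
+ *           "output did not shrink" apart from real compression failures.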
+ **********************************************/
+typedef enum {
+  ZSTD_error_no_error = 0,
+  ZSTD_error_GENERIC  = 1,
+  ZSTD_error_prefix_unknown                = 10,
+  ZSTD_error_version_unsupported           = 12,
+  ZSTD_error_frameParameter_unsupported    = 14,
+  ZSTD_error_frameParameter_windowTooLarge = 16,
+  ZSTD_error_corruption_detected = 20,
+  ZSTD_error_checksum_wrong      = 22,
+  ZSTD_error_dictionary_corrupted      = 30,
+  ZSTD_error_dictionary_wrong          = 32,
+  ZSTD_error_dictionaryCreation_failed = 34,
+  ZSTD_error_parameter_unsupported   = 40,
+  ZSTD_error_parameter_outOfBound    = 42,
+  ZSTD_error_tableLog_tooLarge       = 44,
+  ZSTD_error_maxSymbolValue_tooLarge = 46,
+  ZSTD_error_maxSymbolValue_tooSmall = 48,
+  ZSTD_error_stage_wrong       = 60,
+  ZSTD_error_init_missing      = 62,
+  ZSTD_error_memory_allocation = 64,
+  ZSTD_error_workSpace_tooSmall = 66,
+  ZSTD_error_dstSize_tooSmall = 70,
+  ZSTD_error_srcSize_wrong    = 72,
+  ZSTD_error_dstBuffer_null   = 74,
+  /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */
+  ZSTD_error_frameIndex_tooLarge = 100,
+  ZSTD_error_seekableIO          = 102,
+  ZSTD_error_dstBuffer_wrong     = 104,
+  ZSTD_error_maxCode = 120  /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */
+} ZSTD_ErrorCode;
+
+/*! ZSTD_getErrorCode() :
+    convert a `size_t` function result into a `ZSTD_ErrorCode` enum type,
+    which can be compared with the enum list published above */
+ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult);
+ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code);   /**< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */
+
+
+#if defined (__cplusplus)
+}
+#endif
+
+#endif /* ZSTD_ERRORS_H_398273423 */
diff --git a/module/zstd/zfs_zstd.c b/module/zstd/zfs_zstd.c
new file mode 100644
index 000000000000..431801d47c37
--- /dev/null
+++ b/module/zstd/zfs_zstd.c
@@ -0,0 +1,738 @@
+/*
+ * BSD 3-Clause New License (https://spdx.org/licenses/BSD-3-Clause.html)
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ *
+ * 2. Redistributions in binary form must reproduce the above copyright notice,
+ * this list of conditions and the following disclaimer in the documentation
+ * and/or other materials provided with the distribution.
+ *
+ * 3. Neither the name of the copyright holder nor the names of its
+ * contributors may be used to endorse or promote products derived from this
+ * software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+ * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
+ * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+ * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+ * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+ * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+ * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * Copyright (c) 2016-2018, Klara Inc.
+ * Copyright (c) 2016-2018, Allan Jude
+ * Copyright (c) 2018-2020, Sebastian Gottschall
+ * Copyright (c) 2019-2020, Michael Niewöhner
+ * Copyright (c) 2020, The FreeBSD Foundation [1]
+ *
+ * [1] Portions of this software were developed by Allan Jude
+ * under sponsorship from the FreeBSD Foundation.
+ */
+
+#include <sys/param.h>
+#include <sys/sysmacros.h>
+#include <sys/zfs_context.h>
+#include <sys/zio_compress.h>
+#include <sys/spa.h>
+#include <sys/zstd/zstd.h>
+
+#define ZSTD_STATIC_LINKING_ONLY
+#include "lib/zstd.h"
+#include "lib/zstd_errors.h"
+
+kstat_t *zstd_ksp = NULL;
+
+typedef struct zstd_stats {
+    kstat_named_t zstd_stat_alloc_fail;
+    kstat_named_t zstd_stat_alloc_fallback;
+    kstat_named_t zstd_stat_com_alloc_fail;
+    kstat_named_t zstd_stat_dec_alloc_fail;
+    kstat_named_t zstd_stat_com_inval;
+    kstat_named_t zstd_stat_dec_inval;
+    kstat_named_t zstd_stat_dec_header_inval;
+    kstat_named_t zstd_stat_com_fail;
+    kstat_named_t zstd_stat_dec_fail;
+} zstd_stats_t;
+
+static zstd_stats_t zstd_stats = {
+    { "alloc_fail", KSTAT_DATA_UINT64 },
+    { "alloc_fallback", KSTAT_DATA_UINT64 },
+    { "compress_alloc_fail", KSTAT_DATA_UINT64 },
+    { "decompress_alloc_fail", KSTAT_DATA_UINT64 },
+    { "compress_level_invalid", KSTAT_DATA_UINT64 },
+    { "decompress_level_invalid", KSTAT_DATA_UINT64 },
+    { "decompress_header_invalid", KSTAT_DATA_UINT64 },
+    { "compress_failed", KSTAT_DATA_UINT64 },
+    { "decompress_failed", KSTAT_DATA_UINT64 },
+};
+
+/* Enums describing the allocator type specified by kmem_type in zstd_kmem */
+enum zstd_kmem_type {
+    ZSTD_KMEM_UNKNOWN = 0,
+    /* Allocation type using kmem_vmalloc */
+    ZSTD_KMEM_DEFAULT,
+    /* Pool based allocation using mempool_alloc */
+    ZSTD_KMEM_POOL,
+    /* Reserved fallback memory for decompression only */
+    ZSTD_KMEM_DCTX,
+    ZSTD_KMEM_COUNT,
+};
+
+/* Structure for pooled memory objects */
+struct zstd_pool {
+    void *mem;
+    size_t size;
+    kmutex_t barrier;
+    hrtime_t timeout;
+};
+
+/* Global structure for handling memory allocations */
+struct zstd_kmem {
+    enum zstd_kmem_type kmem_type;
+    size_t kmem_size;
+    struct zstd_pool *pool;
+};
+
+/* Fallback memory structure used for decompression only if memory runs out */
+struct zstd_fallback_mem {
+    size_t mem_size;
+    void *mem;
+    kmutex_t barrier;
+};
+
+struct zstd_levelmap {
+    int16_t zstd_level;
+    enum zio_zstd_levels level;
+};
+
+/*
+ * ZSTD memory handlers
+ *
+ * For decompression we use a different handler which also provides fallback
+ * memory allocation in case memory runs out.
+ *
+ * The ZSTD handlers were split up to keep the implementation as simple as
+ * possible.
+ */
+static void *zstd_alloc(void *opaque, size_t size);
+static void *zstd_dctx_alloc(void *opaque, size_t size);
+static void zstd_free(void *opaque, void *ptr);
+
+/* Compression memory handler */
+static const ZSTD_customMem zstd_malloc = {
+    zstd_alloc,
+    zstd_free,
+    NULL,
+};
+
+/* Decompression memory handler */
+static const ZSTD_customMem zstd_dctx_malloc = {
+    zstd_dctx_alloc,
+    zstd_free,
+    NULL,
+};
+
+/* Level map for converting ZFS internal levels to ZSTD levels and vice versa */
+static struct zstd_levelmap zstd_levels[] = {
+    {ZIO_ZSTD_LEVEL_1, ZIO_ZSTD_LEVEL_1},
+    {ZIO_ZSTD_LEVEL_2, ZIO_ZSTD_LEVEL_2},
+    {ZIO_ZSTD_LEVEL_3, ZIO_ZSTD_LEVEL_3},
+    {ZIO_ZSTD_LEVEL_4, ZIO_ZSTD_LEVEL_4},
+    {ZIO_ZSTD_LEVEL_5, ZIO_ZSTD_LEVEL_5},
+    {ZIO_ZSTD_LEVEL_6, ZIO_ZSTD_LEVEL_6},
+    {ZIO_ZSTD_LEVEL_7, ZIO_ZSTD_LEVEL_7},
+    {ZIO_ZSTD_LEVEL_8, ZIO_ZSTD_LEVEL_8},
+    {ZIO_ZSTD_LEVEL_9, ZIO_ZSTD_LEVEL_9},
+    {ZIO_ZSTD_LEVEL_10, ZIO_ZSTD_LEVEL_10},
+    {ZIO_ZSTD_LEVEL_11, ZIO_ZSTD_LEVEL_11},
+    {ZIO_ZSTD_LEVEL_12, ZIO_ZSTD_LEVEL_12},
+    {ZIO_ZSTD_LEVEL_13, ZIO_ZSTD_LEVEL_13},
+    {ZIO_ZSTD_LEVEL_14, ZIO_ZSTD_LEVEL_14},
+    {ZIO_ZSTD_LEVEL_15, ZIO_ZSTD_LEVEL_15},
+    {ZIO_ZSTD_LEVEL_16, ZIO_ZSTD_LEVEL_16},
+    {ZIO_ZSTD_LEVEL_17, ZIO_ZSTD_LEVEL_17},
+    {ZIO_ZSTD_LEVEL_18, ZIO_ZSTD_LEVEL_18},
+    {ZIO_ZSTD_LEVEL_19, ZIO_ZSTD_LEVEL_19},
+    {-1, ZIO_ZSTD_LEVEL_FAST_1},
+    {-2, ZIO_ZSTD_LEVEL_FAST_2},
+    {-3, ZIO_ZSTD_LEVEL_FAST_3},
+    {-4, ZIO_ZSTD_LEVEL_FAST_4},
+    {-5, ZIO_ZSTD_LEVEL_FAST_5},
+    {-6, ZIO_ZSTD_LEVEL_FAST_6},
+    {-7, ZIO_ZSTD_LEVEL_FAST_7},
+    {-8, ZIO_ZSTD_LEVEL_FAST_8},
+    {-9, ZIO_ZSTD_LEVEL_FAST_9},
+    {-10, ZIO_ZSTD_LEVEL_FAST_10},
+    {-20, ZIO_ZSTD_LEVEL_FAST_20},
+    {-30, ZIO_ZSTD_LEVEL_FAST_30},
+    {-40, ZIO_ZSTD_LEVEL_FAST_40},
+    {-50, ZIO_ZSTD_LEVEL_FAST_50},
+    {-60, ZIO_ZSTD_LEVEL_FAST_60},
+    {-70, ZIO_ZSTD_LEVEL_FAST_70},
+    {-80, ZIO_ZSTD_LEVEL_FAST_80},
+    {-90, ZIO_ZSTD_LEVEL_FAST_90},
+    {-100, ZIO_ZSTD_LEVEL_FAST_100},
+    {-500, ZIO_ZSTD_LEVEL_FAST_500},
+    {-1000, ZIO_ZSTD_LEVEL_FAST_1000},
+};
+
+/*
+ * This variable represents the maximum count of the pool based on the number
+ * of CPUs plus some buffer. We default to cpu count * 4, see init_zstd.
+ */
+static int pool_count = 16;
+
+#define ZSTD_POOL_MAX		pool_count
+#define ZSTD_POOL_TIMEOUT	60 * 2
+
+static struct zstd_fallback_mem zstd_dctx_fallback;
+static struct zstd_pool *zstd_mempool_cctx;
+static struct zstd_pool *zstd_mempool_dctx;
+
+/*
+ * Try to get a cached allocated buffer from memory pool or allocate a new one
+ * if necessary. If an object is older than 2 minutes and does not fit the
+ * requested size, it will be released and a new cached entry will be allocated.
+ * If other pooled objects are detected without being used for 2 minutes, they
+ * will be released, too.
+ *
+ * The concept is that high frequency memory allocations of bigger objects are
+ * expensive. So if a lot of work is going on, allocations will be kept for a
+ * while and can be reused in that time frame.
+ *
+ * The scheduled release will be updated every time an object is reused.
+ */
+static void *
+zstd_mempool_alloc(struct zstd_pool *zstd_mempool, size_t size)
+{
+    struct zstd_pool *pool;
+    struct zstd_kmem *mem = NULL;
+
+    if (!zstd_mempool) {
+        return (NULL);
+    }
+
+    /* Scan for a preallocated memory slot and free obsolete slots */
+    for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+        pool = &zstd_mempool[i];
+        /*
+         * This lock is simply a marker for a pool object being in use.
+         * If it's already held, it will be skipped.
+         *
+         * We need to create it before checking it to avoid race
+         * conditions caused by running in a threaded context.
+         *
+         * The lock is later released by zstd_mempool_free.
+         */
+        if (mutex_tryenter(&pool->barrier)) {
+            /*
+             * Check if the object fits the size; if so, take it
+             * and update the timestamp.
+             */
+            if (!mem && pool->mem && size <= pool->size) {
+                pool->timeout = gethrestime_sec() +
+                    ZSTD_POOL_TIMEOUT;
+                mem = pool->mem;
+                continue;
+            }
+
+            /* Free memory if unused object older than 2 minutes */
+            if (pool->mem && gethrestime_sec() > pool->timeout) {
+                vmem_free(pool->mem, pool->size);
+                pool->mem = NULL;
+                pool->size = 0;
+                pool->timeout = 0;
+            }
+
+            mutex_exit(&pool->barrier);
+        }
+    }
+
+    if (mem) {
+        return (mem);
+    }
+
+    /*
+     * If no preallocated slot was found, try to fill in a new one.
+     *
+     * We run a similar algorithm twice here to avoid pool fragmentation.
+     * The first one may generate holes in the list if objects get released.
+     * We always make sure that these holes get filled instead of adding new
+     * allocations constantly at the end.
+     */
+    for (int i = 0; i < ZSTD_POOL_MAX; i++) {
+        pool = &zstd_mempool[i];
+        if (mutex_tryenter(&pool->barrier)) {
+            /* Object is free, try to allocate a new one */
+            if (!pool->mem) {
+                mem = vmem_alloc(size, KM_SLEEP);
+                pool->mem = mem;
+
+                if (pool->mem) {
+                    /* Keep track for later release */
+                    mem->pool = pool;
+                    pool->size = size;
+                    mem->kmem_type = ZSTD_KMEM_POOL;
+                    mem->kmem_size = size;
+                }
+            }
+
+            if (size <= pool->size) {
+                /* Update timestamp */
+                pool->timeout = gethrestime_sec() +
+                    ZSTD_POOL_TIMEOUT;
+
+                return (pool->mem);
+            }
+
+            mutex_exit(&pool->barrier);
+        }
+    }
+
+    /*
+     * If the pool is full or the allocation failed, try lazy allocation
+     * instead.
+     */
+    if (!mem) {
+        mem = vmem_alloc(size, KM_NOSLEEP);
+        if (mem) {
+            mem->pool = NULL;
+            mem->kmem_type = ZSTD_KMEM_DEFAULT;
+            mem->kmem_size = size;
+        }
+    }
+
+    return (mem);
+}
+
+/* Mark object as released by releasing the barrier mutex */
+static void
+zstd_mempool_free(struct zstd_kmem *z)
+{
+    mutex_exit(&z->pool->barrier);
+}
+
+/* Convert ZFS internal enum to ZSTD level */
+static int
+zstd_enum_to_level(enum zio_zstd_levels level, int16_t *zstd_level)
+{
+    if (level > 0 && level <= ZIO_ZSTD_LEVEL_19) {
+        *zstd_level = zstd_levels[level - 1].zstd_level;
+        return (0);
+    }
+    if (level >= ZIO_ZSTD_LEVEL_FAST_1 &&
+        level <= ZIO_ZSTD_LEVEL_FAST_1000) {
+        *zstd_level = zstd_levels[level - ZIO_ZSTD_LEVEL_FAST_1
+            + ZIO_ZSTD_LEVEL_19].zstd_level;
+        return (0);
+    }
+
+    /* Invalid/unknown zfs compression enum - this should never happen.
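+     * (For illustration: ZIO_ZSTD_LEVEL_3 lands in zstd_levels[2] and yields
+     * zstd level 3, while the ZIO_ZSTD_LEVEL_FAST_* values index the second
+     * half of the table and yield the matching negative "fast" levels,
+     * e.g. ZIO_ZSTD_LEVEL_FAST_5 -> -5.)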
+	return (1);
+}
+
+/* Compress block using zstd */
+size_t
+zfs_zstd_compress(void *s_start, void *d_start, size_t s_len, size_t d_len,
+    int level)
+{
+	size_t c_len;
+	int16_t zstd_level;
+	zfs_zstdhdr_t *hdr;
+	ZSTD_CCtx *cctx;
+
+	hdr = (zfs_zstdhdr_t *)d_start;
+
+	/* Skip compression if the specified level is invalid */
+	if (zstd_enum_to_level(level, &zstd_level)) {
+		ZSTDSTAT_BUMP(zstd_stat_com_inval);
+		return (s_len);
+	}
+
+	ASSERT3U(d_len, >=, sizeof (*hdr));
+	ASSERT3U(d_len, <=, s_len);
+	ASSERT3U(zstd_level, !=, 0);
+
+	cctx = ZSTD_createCCtx_advanced(zstd_malloc);
+
+	/*
+	 * Out of kernel memory, gently fall through - this will disable
+	 * compression in zio_compress_data
+	 */
+	if (!cctx) {
+		ZSTDSTAT_BUMP(zstd_stat_com_alloc_fail);
+		return (s_len);
+	}
+
+	/* Set the compression level */
+	ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, zstd_level);
+
+	/* Use the "magicless" zstd header which saves us 4 header bytes */
+	ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
+
+	/*
+	 * Disable redundant checksum calculation and content size storage
+	 * since this is already done by ZFS itself.
+	 */
+	ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
+	ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
+
+	c_len = ZSTD_compress2(cctx,
+	    hdr->data,
+	    d_len - sizeof (*hdr),
+	    s_start, s_len);
+
+	ZSTD_freeCCtx(cctx);
+
+	/* Error in the compression routine, disable compression. */
+	if (ZSTD_isError(c_len)) {
+		/*
+		 * If we are aborting the compression because the space
+		 * savings are too small, that is not a failure. Everything
+		 * else is a failure, so increment the compression failure
+		 * counter.
+		 */
+		if (ZSTD_getErrorCode(c_len) != ZSTD_error_dstSize_tooSmall) {
+			ZSTDSTAT_BUMP(zstd_stat_com_fail);
+		}
+		return (s_len);
+	}
+
+	/*
+	 * Encode the compressed buffer size at the start. We'll need this in
+	 * decompression to counter the effects of padding which might be
+	 * added to the compressed buffer and which, if unhandled, would
+	 * confuse the hell out of our decompression function.
+	 */
+	hdr->c_len = BE_32(c_len);
+
+	/*
+	 * Check version for overflow.
+	 * The limit of 24 bits must not be exceeded. This allows a maximum
+	 * version of 1677.72.15, which we never expect to be reached.
+	 */
+	ASSERT3U(ZSTD_VERSION_NUMBER, <=, 0xFFFFFF);
+
+	/*
+	 * Encode the compression level as well. We may need to know the
+	 * original compression level if compressed_arc is disabled, to match
+	 * the compression settings to write this block to the L2ARC.
+	 *
+	 * Encode the actual level, so if the enum changes in the future, we
+	 * will be compatible.
+	 *
+	 * The upper 24 bits store the ZSTD version to be able to provide
+	 * future compatibility, since new versions might enhance the
+	 * compression algorithm in a way where the compressed data will
+	 * change.
+	 *
+	 * As soon as such an incompatibility occurs, handling code needs to
+	 * be added, differentiating between the versions.
+	 */
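+	/*
+	 * Read-back sketch (illustrative; it mirrors what
+	 * zfs_zstd_decompress_level() does below and assumes the version and
+	 * level bitfields share storage with raw_version_level in
+	 * zfs_zstdhdr_t):
+	 *
+	 *	zfs_zstdhdr_t copy;
+	 *	copy.raw_version_level = BE_32(hdr->raw_version_level);
+	 *	// copy.version and copy.level are now valid on this host
+	 */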
+	hdr->version = ZSTD_VERSION_NUMBER;
+	hdr->level = level;
+	hdr->raw_version_level = BE_32(hdr->raw_version_level);
+
+	return (c_len + sizeof (*hdr));
+}
+
+/* Decompress block using zstd and return its stored level */
+int
+zfs_zstd_decompress_level(void *s_start, void *d_start, size_t s_len,
+    size_t d_len, uint8_t *level)
+{
+	ZSTD_DCtx *dctx;
+	size_t result;
+	int16_t zstd_level;
+	uint32_t c_len;
+	const zfs_zstdhdr_t *hdr;
+	zfs_zstdhdr_t hdr_copy;
+
+	hdr = (const zfs_zstdhdr_t *)s_start;
+	c_len = BE_32(hdr->c_len);
+
+	/*
+	 * Make a copy instead of directly converting the header, since we
+	 * must not modify the original data that may be used again later.
+	 */
+	hdr_copy.raw_version_level = BE_32(hdr->raw_version_level);
+
+	/*
+	 * NOTE: We ignore the ZSTD version for now. As soon as any
+	 * incompatibility occurs, it has to be handled accordingly.
+	 * The version can be accessed via `hdr_copy.version`.
+	 */
+
+	/*
+	 * Convert and check the level.
+	 * An invalid level is a strong indicator of data corruption! In such
+	 * a case, return an error so the upper layers can try to fix it.
+	 */
+	if (zstd_enum_to_level(hdr_copy.level, &zstd_level)) {
+		ZSTDSTAT_BUMP(zstd_stat_dec_inval);
+		return (1);
+	}
+
+	ASSERT3U(d_len, >=, s_len);
+	ASSERT3U(hdr_copy.level, !=, ZIO_COMPLEVEL_INHERIT);
+
+	/* Invalid compressed buffer size encoded at start */
+	if (c_len + sizeof (*hdr) > s_len) {
+		ZSTDSTAT_BUMP(zstd_stat_dec_header_inval);
+		return (1);
+	}
+
+	dctx = ZSTD_createDCtx_advanced(zstd_dctx_malloc);
+	if (!dctx) {
+		ZSTDSTAT_BUMP(zstd_stat_dec_alloc_fail);
+		return (1);
+	}
+
+	/* Set header type to "magicless" */
+	ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless);
+
+	/* Decompress the data and release the context */
+	result = ZSTD_decompressDCtx(dctx, d_start, d_len, hdr->data, c_len);
+	ZSTD_freeDCtx(dctx);
+
+	/*
+	 * Return 0 on success (the decompression routine returned a
+	 * non-negative value) and non-zero on failure (it returned a
+	 * negative value).
+ */ + if (ZSTD_isError(result)) { + ZSTDSTAT_BUMP(zstd_stat_dec_fail); + return (1); + } + + if (level) { + *level = hdr_copy.level; + } + + return (0); +} + +/* Decompress datablock using zstd */ +int +zfs_zstd_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, + int level __maybe_unused) +{ + + return (zfs_zstd_decompress_level(s_start, d_start, s_len, d_len, + NULL)); +} + +/* Allocator for zstd compression context using mempool_allocator */ +static void * +zstd_alloc(void *opaque __maybe_unused, size_t size) +{ + size_t nbytes = sizeof (struct zstd_kmem) + size; + struct zstd_kmem *z = NULL; + + z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_cctx, nbytes); + + if (!z) { + ZSTDSTAT_BUMP(zstd_stat_alloc_fail); + return (NULL); + } + + return ((void*)z + (sizeof (struct zstd_kmem))); +} + +/* + * Allocator for zstd decompression context using mempool_allocator with + * fallback to reserved memory if allocation fails + */ +static void * +zstd_dctx_alloc(void *opaque __maybe_unused, size_t size) +{ + size_t nbytes = sizeof (struct zstd_kmem) + size; + struct zstd_kmem *z = NULL; + enum zstd_kmem_type type = ZSTD_KMEM_DEFAULT; + + z = (struct zstd_kmem *)zstd_mempool_alloc(zstd_mempool_dctx, nbytes); + if (!z) { + /* Try harder, decompression shall not fail */ + z = vmem_alloc(nbytes, KM_SLEEP); + if (z) { + z->pool = NULL; + } + ZSTDSTAT_BUMP(zstd_stat_alloc_fail); + } else { + return ((void*)z + (sizeof (struct zstd_kmem))); + } + + /* Fallback if everything fails */ + if (!z) { + /* + * Barrier since we only can handle it in a single thread. All + * other following threads need to wait here until decompression + * is completed. zstd_free will release this barrier later. + */ + mutex_enter(&zstd_dctx_fallback.barrier); + + z = zstd_dctx_fallback.mem; + type = ZSTD_KMEM_DCTX; + ZSTDSTAT_BUMP(zstd_stat_alloc_fallback); + } + + /* Allocation should always be successful */ + if (!z) { + return (NULL); + } + + z->kmem_type = type; + z->kmem_size = nbytes; + + return ((void*)z + (sizeof (struct zstd_kmem))); +} + +/* Free allocated memory by its specific type */ +static void +zstd_free(void *opaque __maybe_unused, void *ptr) +{ + struct zstd_kmem *z = (ptr - sizeof (struct zstd_kmem)); + enum zstd_kmem_type type; + + ASSERT3U(z->kmem_type, <, ZSTD_KMEM_COUNT); + ASSERT3U(z->kmem_type, >, ZSTD_KMEM_UNKNOWN); + + type = z->kmem_type; + switch (type) { + case ZSTD_KMEM_DEFAULT: + vmem_free(z, z->kmem_size); + break; + case ZSTD_KMEM_POOL: + zstd_mempool_free(z); + break; + case ZSTD_KMEM_DCTX: + mutex_exit(&zstd_dctx_fallback.barrier); + break; + default: + break; + } +} + +/* Allocate fallback memory to ensure safe decompression */ +static void __init +create_fallback_mem(struct zstd_fallback_mem *mem, size_t size) +{ + mem->mem_size = size; + mem->mem = vmem_zalloc(mem->mem_size, KM_SLEEP); + mutex_init(&mem->barrier, NULL, MUTEX_DEFAULT, NULL); +} + +/* Initialize memory pool barrier mutexes */ +static void __init +zstd_mempool_init(void) +{ + zstd_mempool_cctx = (struct zstd_pool *) + kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + zstd_mempool_dctx = (struct zstd_pool *) + kmem_zalloc(ZSTD_POOL_MAX * sizeof (struct zstd_pool), KM_SLEEP); + + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + mutex_init(&zstd_mempool_cctx[i].barrier, NULL, + MUTEX_DEFAULT, NULL); + mutex_init(&zstd_mempool_dctx[i].barrier, NULL, + MUTEX_DEFAULT, NULL); + } +} + +/* Initialize zstd-related memory handling */ +static int __init +zstd_meminit(void) +{ + zstd_mempool_init(); 
+ + /* + * Estimate the size of the fallback decompression context. + * The expected size on x64 with current ZSTD should be about 160 KB. + */ + create_fallback_mem(&zstd_dctx_fallback, + P2ROUNDUP(ZSTD_estimateDCtxSize() + sizeof (struct zstd_kmem), + PAGESIZE)); + + return (0); +} + +/* Release object from pool and free memory */ +static void __exit +release_pool(struct zstd_pool *pool) +{ + mutex_destroy(&pool->barrier); + vmem_free(pool->mem, pool->size); + pool->mem = NULL; + pool->size = 0; +} + +/* Release memory pool objects */ +static void __exit +zstd_mempool_deinit(void) +{ + for (int i = 0; i < ZSTD_POOL_MAX; i++) { + release_pool(&zstd_mempool_cctx[i]); + release_pool(&zstd_mempool_dctx[i]); + } + + kmem_free(zstd_mempool_dctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + kmem_free(zstd_mempool_cctx, ZSTD_POOL_MAX * sizeof (struct zstd_pool)); + zstd_mempool_dctx = NULL; + zstd_mempool_cctx = NULL; +} + +extern int __init +zstd_init(void) +{ + /* Set pool size by using maximum sane thread count * 4 */ + pool_count = (boot_ncpus * 4); + zstd_meminit(); + + /* Initialize kstat */ + zstd_ksp = kstat_create("zfs", 0, "zstd", "misc", + KSTAT_TYPE_NAMED, sizeof (zstd_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (zstd_ksp != NULL) { + zstd_ksp->ks_data = &zstd_stats; + kstat_install(zstd_ksp); + } + + return (0); +} + +extern void __exit +zstd_fini(void) +{ + /* Deinitialize kstat */ + if (zstd_ksp != NULL) { + kstat_delete(zstd_ksp); + zstd_ksp = NULL; + } + + /* Release fallback memory */ + vmem_free(zstd_dctx_fallback.mem, zstd_dctx_fallback.mem_size); + mutex_destroy(&zstd_dctx_fallback.barrier); + + /* Deinit memory pool */ + zstd_mempool_deinit(); +} + +#if defined(_KERNEL) +module_init(zstd_init); +module_exit(zstd_fini); + +ZFS_MODULE_DESCRIPTION("ZSTD Compression for ZFS"); +ZFS_MODULE_LICENSE("BSD"); +ZFS_MODULE_VERSION(ZSTD_VERSION_STRING); + +EXPORT_SYMBOL(zfs_zstd_compress); +EXPORT_SYMBOL(zfs_zstd_decompress_level); +EXPORT_SYMBOL(zfs_zstd_decompress); +#endif diff --git a/module/zstd/zstd-in.c b/module/zstd/zstd-in.c new file mode 100644 index 000000000000..121f375e5515 --- /dev/null +++ b/module/zstd/zstd-in.c @@ -0,0 +1,68 @@ +/* + * BSD 3-Clause Clear License + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from this + * software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE + * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGE. + */ + +/* + * Copyright (c) 2016-2020, Yann Collet, Facebook, Inc. + * Copyright (c) 2019-2020, Michael Niewöhner + */ + +#define MEM_MODULE +#define XXH_NAMESPACE ZSTD_ +#define XXH_PRIVATE_API +#define XXH_INLINE_ALL +#define ZSTD_LEGACY_SUPPORT 0 +#define ZSTD_LIB_DICTBUILDER 0 +#define ZSTD_LIB_DEPRECATED 0 +#define ZSTD_NOBENCH + +#include "common/debug.c" +#include "common/entropy_common.c" +#include "common/error_private.c" +#include "common/fse_decompress.c" +#include "common/pool.c" +#include "common/zstd_common.c" + +#include "compress/fse_compress.c" +#include "compress/hist.c" +#include "compress/huf_compress.c" +#include "compress/zstd_compress_literals.c" +#include "compress/zstd_compress_sequences.c" +#include "compress/zstd_compress_superblock.c" +#include "compress/zstd_compress.c" +#include "compress/zstd_double_fast.c" +#include "compress/zstd_fast.c" +#include "compress/zstd_lazy.c" +#include "compress/zstd_ldm.c" +#include "compress/zstd_opt.c" + +#include "decompress/huf_decompress.c" +#include "decompress/zstd_ddict.c" +#include "decompress/zstd_decompress.c" +#include "decompress/zstd_decompress_block.c" diff --git a/rpm/Makefile.am b/rpm/Makefile.am new file mode 100644 index 000000000000..f2cf72cef13c --- /dev/null +++ b/rpm/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = generic redhat diff --git a/rpm/generic/.gitignore b/rpm/generic/.gitignore new file mode 100644 index 000000000000..7f5daafdd6d4 --- /dev/null +++ b/rpm/generic/.gitignore @@ -0,0 +1,3 @@ +/zfs-dkms.spec +/zfs-kmod.spec +/zfs.spec diff --git a/rpm/generic/Makefile.am b/rpm/generic/Makefile.am new file mode 100644 index 000000000000..89b13640d622 --- /dev/null +++ b/rpm/generic/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = zfs.spec.in zfs-kmod.spec.in zfs-dkms.spec.in diff --git a/rpm/generic/zfs-dkms.spec.in b/rpm/generic/zfs-dkms.spec.in new file mode 100644 index 000000000000..5b05e1c332ab --- /dev/null +++ b/rpm/generic/zfs-dkms.spec.in @@ -0,0 +1,102 @@ +%{?!packager: %define packager Brian Behlendorf } + +%if ! 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version} +%define not_rpm 1 +%endif + +# Exclude input files from mangling +%global __brp_mangle_shebangs_exclude_from ^/usr/src/.*$ + +%define module @PACKAGE@ +%define mkconf scripts/dkms.mkconf + +Name: %{module}-dkms + +Version: @VERSION@ +Release: @RELEASE@%{?dist} +Summary: Kernel module(s) (dkms) + +Group: System Environment/Kernel +License: @ZFS_META_LICENSE@ +URL: https://zfsonlinux.org/ +Source0: %{module}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +BuildArch: noarch + +Requires: dkms >= 2.2.0.3 +Requires: gcc, make, perl, diffutils +%if 0%{?rhel}%{?fedora}%{?mageia}%{?suse_version} +Requires: kernel-devel +Obsoletes: spl-dkms +%endif +Provides: %{module}-kmod = %{version} +AutoReqProv: no + +%description +This package contains the dkms ZFS kernel modules. 
+ +%prep +%setup -q -n %{module}-%{version} + +%build +%{mkconf} -n %{module} -v %{version} -f dkms.conf + +%install +if [ "$RPM_BUILD_ROOT" != "/" ]; then + rm -rf $RPM_BUILD_ROOT +fi +mkdir -p $RPM_BUILD_ROOT/usr/src/ +cp -rf ${RPM_BUILD_DIR}/%{module}-%{version} $RPM_BUILD_ROOT/usr/src/ + +%clean +if [ "$RPM_BUILD_ROOT" != "/" ]; then + rm -rf $RPM_BUILD_ROOT +fi + +%files +%defattr(-,root,root) +/usr/src/%{module}-%{version} + +%post +for POSTINST in /usr/lib/dkms/common.postinst; do + if [ -f $POSTINST ]; then + $POSTINST %{module} %{version} + exit $? + fi + echo "WARNING: $POSTINST does not exist." +done +echo -e "ERROR: DKMS version is too old and %{module} was not" +echo -e "built with legacy DKMS support." +echo -e "You must either rebuild %{module} with legacy postinst" +echo -e "support or upgrade DKMS to a more current version." +exit 1 + +%preun +# Are we doing an upgrade? +if [ "$1" = "1" -o "$1" = "upgrade" ] ; then + # Yes we are. Are we upgrading to a new ZFS version? + NEWEST_VER=$(dkms status zfs | sed 's/,//g' | sort -r -V | awk '/installed/{print $2; exit}') + if [ "$NEWEST_VER" != "%{version}" ] ; then + # Yes, it's a new ZFS version. We'll uninstall the old module + # later on in this script. + true + else + # No, it's probably an upgrade of the same ZFS version + # to a new distro (zfs-dkms-0.7.12.fc28->zfs-dkms-0.7.12.fc29). + # Don't remove our modules, since the rebuild for the new + # distro will automatically delete the old modules. + exit 0 + fi +fi + +# If we're here then we're doing an uninstall (not upgrade). +CONFIG_H="/var/lib/dkms/%{module}/%{version}/*/*/%{module}_config.h" +SPEC_META_ALIAS="@PACKAGE@-@VERSION@-@RELEASE@" +DKMS_META_ALIAS=`cat $CONFIG_H 2>/dev/null | + awk -F'"' '/META_ALIAS\s+"/ { print $2; exit 0 }'` +if [ "$SPEC_META_ALIAS" = "$DKMS_META_ALIAS" ]; then + echo -e + echo -e "Uninstall of %{module} module ($SPEC_META_ALIAS) beginning:" + dkms remove -m %{module} -v %{version} --all %{!?not_rpm:--rpm_safe_upgrade} +fi +exit 0 diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in new file mode 100644 index 000000000000..bc033733afc4 --- /dev/null +++ b/rpm/generic/zfs-kmod.spec.in @@ -0,0 +1,164 @@ +%define module @PACKAGE@ + +%if !%{defined ksrc} +%if 0%{?rhel}%{?fedora} +%define ksrc ${kernel_version##*___} +%else +%define ksrc "$( \ + if [ -e "/usr/src/linux-${kernel_version%%___*}" ]; then \ + echo "/usr/src/linux-${kernel_version%%___*}"; \ + elif [ -e "/lib/modules/${kernel_version%%___*}/source" ]; then \ + echo "/lib/modules/${kernel_version%%___*}/source"; \ + else \ + echo "/lib/modules/${kernel_version%%___*}/build"; \ + fi)" +%endif +%endif + +%if !%{defined kobj} +%if 0%{?rhel}%{?fedora} +%define kobj ${kernel_version##*___} +%else +%define kobj "$( \ + if [ -e "/usr/src/linux-${kernel_version%%___*}" ]; then \ + echo "/usr/src/linux-${kernel_version%%___*}"; \ + else \ + echo "/lib/modules/${kernel_version%%___*}/build"; \ + fi)" +%endif +%endif + +#define repo rpmfusion +#define repo chaos + +# (un)define the next line to either build for the newest or all current kernels +%define buildforkernels newest +#define buildforkernels current +#define buildforkernels akmod + +%bcond_with debug +%bcond_with debuginfo + + +Name: %{module}-kmod + +Version: @VERSION@ +Release: @RELEASE@%{?dist} +Summary: Kernel module(s) + +Group: System Environment/Kernel +License: @ZFS_META_LICENSE@ +URL: https://zfsonlinux.org/ +Source0: %{module}-%{version}.tar.gz +Source10: kmodtool +BuildRoot: 
%{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id} -u -n)
+%if 0%{?rhel}%{?fedora}
+BuildRequires: gcc, make
+BuildRequires: elfutils-libelf-devel
+%endif
+
+# The development headers will conflict with the dkms packages.
+Conflicts: %{module}-dkms
+
+%if %{defined repo}
+
+# When building for a repository, use the proper build-sysbuild package
+# to determine which kernel-devel packages should be installed.
+BuildRequires: %{_bindir}/kmodtool
+%{!?kernels:BuildRequires: buildsys-build-%{repo}-kerneldevpkgs-%{?buildforkernels:%{buildforkernels}}%{!?buildforkernels:current}-%{_target_cpu}}
+
+%else
+
+# Local package builds attempt to use the installed kernel.
+%{?rhel:BuildRequires: kernel-devel}
+%{?fedora:BuildRequires: kernel-devel}
+%{?suse_version:BuildRequires: kernel-source}
+
+%if !%{defined kernels} && !%{defined build_src_rpm}
+    %if 0%{?rhel}%{?fedora}%{?suse_version}
+        %define kernels %(ls -1 /usr/src/kernels)
+    %else
+        %define kernels %(ls -1 /lib/modules)
+    %endif
+%endif
+%endif
+
+# LDFLAGS are not sanitized by arch/*/Makefile for these architectures.
+%ifarch ppc ppc64 ppc64le aarch64
+%global __global_ldflags %{nil}
+%endif
+
+%if 0%{?fedora} >= 17
+%define prefix /usr
+%endif
+
+# Kmodtool does its magic here. A patched version of kmodtool is shipped
+# with the source rpm until kmod development packages are supported upstream.
+# https://bugzilla.rpmfusion.org/show_bug.cgi?id=2714
+%{expand:%(bash %{SOURCE10} --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{name} %{?buildforkernels:--%{buildforkernels}} --devel %{?prefix:--prefix "%{?prefix}"} %{?kernels:--for-kernels "%{?kernels}"} %{?kernelbuildroot:--buildroot "%{?kernelbuildroot}"} --obsolete-name spl --obsolete-version 0.8 2>/dev/null) }
+
+
+%description
+This package contains the ZFS kernel modules.
+
+%prep
+# Error out if there was something wrong with kmodtool.
+%{?kmodtool_check}
+
+# Print kmodtool output for debugging purposes:
+bash %{SOURCE10} --target %{_target_cpu} %{?repo:--repo %{?repo}} --kmodname %{name} %{?buildforkernels:--%{buildforkernels}} --devel %{?prefix:--prefix "%{?prefix}"} %{?kernels:--for-kernels "%{?kernels}"} %{?kernelbuildroot:--buildroot "%{?kernelbuildroot}"} --obsolete-name spl --obsolete-version 0.8 2>/dev/null
+
+%if %{with debug}
+    %define debug --enable-debug
+%else
+    %define debug --disable-debug
+%endif
+
+%if %{with debuginfo}
+    %define debuginfo --enable-debuginfo
+%else
+    %define debuginfo --disable-debuginfo
+%endif
+
+# Leverage VPATH from configure to avoid making multiple copies.
+%define _configure ../%{module}-%{version}/configure
+
+%setup -q -c -T -a 0
+
+for kernel_version in %{?kernel_versions}; do
+	%{__mkdir} _kmod_build_${kernel_version%%___*}
+done
+
+%build
+for kernel_version in %{?kernel_versions}; do
+	cd _kmod_build_${kernel_version%%___*}
+	%configure \
+		--with-config=kernel \
+		--with-linux=%{ksrc} \
+		--with-linux-obj=%{kobj} \
+		%{debug} \
+		%{debuginfo}
+	make %{?_smp_mflags}
+	cd ..
+done
+
+
+%install
+rm -rf ${RPM_BUILD_ROOT}
+
+# Relies on the kernel 'modules_install' make target.
+for kernel_version in %{?kernel_versions}; do
+	cd _kmod_build_${kernel_version%%___*}
+	make install \
+		DESTDIR=${RPM_BUILD_ROOT} \
+		%{?prefix:INSTALL_MOD_PATH=%{?prefix}} \
+		INSTALL_MOD_DIR=%{kmodinstdir_postfix}
+	cd ..
+done +# find-debuginfo.sh only considers executables +chmod u+x ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/*/*/* +%{?akmod_install} + + +%clean +rm -rf $RPM_BUILD_ROOT diff --git a/rpm/generic/zfs.spec.in b/rpm/generic/zfs.spec.in new file mode 100644 index 000000000000..e715c8569a9e --- /dev/null +++ b/rpm/generic/zfs.spec.in @@ -0,0 +1,523 @@ +%global _sbindir /sbin +%global _libdir /%{_lib} + +# Set the default udev directory based on distribution. +%if %{undefined _udevdir} +%if 0%{?fedora} >= 17 || 0%{?rhel} >= 7 || 0%{?centos} >= 7 +%global _udevdir %{_prefix}/lib/udev +%else +%global _udevdir /lib/udev +%endif +%endif + +# Set the default udevrule directory based on distribution. +%if %{undefined _udevruledir} +%if 0%{?fedora} >= 17 || 0%{?rhel} >= 7 || 0%{?centos} >= 7 +%global _udevruledir %{_prefix}/lib/udev/rules.d +%else +%global _udevruledir /lib/udev/rules.d +%endif +%endif + +# Set the default dracut directory based on distribution. +%if %{undefined _dracutdir} +%if 0%{?fedora} >= 17 || 0%{?rhel} >= 7 || 0%{?centos} >= 7 +%global _dracutdir %{_prefix}/lib/dracut +%else +%global _dracutdir %{_prefix}/share/dracut +%endif +%endif + +%if %{undefined _initconfdir} +%global _initconfdir /etc/sysconfig +%endif + +%if %{undefined _unitdir} +%global _unitdir %{_prefix}/lib/systemd/system +%endif + +%if %{undefined _presetdir} +%global _presetdir %{_prefix}/lib/systemd/system-preset +%endif + +%if %{undefined _modulesloaddir} +%global _modulesloaddir %{_prefix}/lib/modules-load.d +%endif + +%if %{undefined _systemdgeneratordir} +%global _systemdgeneratordir %{_prefix}/lib/systemd/system-generators +%endif + +%if %{undefined _pkgconfigdir} +%global _pkgconfigdir %{_prefix}/%{_lib}/pkgconfig +%endif + +%bcond_with debug +%bcond_with debuginfo +%bcond_with asan +%bcond_with systemd +%bcond_with pam + +# Generic enable switch for systemd +%if %{with systemd} +%define _systemd 1 +%endif + +# RHEL >= 7 comes with systemd +%if 0%{?rhel} >= 7 +%define _systemd 1 +%endif + +# Fedora >= 15 comes with systemd, but only >= 18 has +# the proper macros +%if 0%{?fedora} >= 18 +%define _systemd 1 +%endif + +# opensuse >= 12.1 comes with systemd, but only >= 13.1 +# has the proper macros +%if 0%{?suse_version} >= 1310 +%define _systemd 1 +%endif + +# When not specified default to distribution provided version. This +# is normally Python 3, but for RHEL <= 7 only Python 2 is provided. +%if %{undefined __use_python} +%if 0%{?rhel} && 0%{?rhel} <= 7 +%define __python /usr/bin/python2 +%define __python_pkg_version 2 +%define __python_cffi_pkg python-cffi +%define __python_setuptools_pkg python-setuptools +%else +%define __python /usr/bin/python3 +%define __python_pkg_version 3 +%define __python_cffi_pkg python3-cffi +%define __python_setuptools_pkg python3-setuptools +%endif +%else +%define __python %{__use_python} +%define __python_pkg_version %{__use_python_pkg_version} +%define __python_cffi_pkg python%{__python_pkg_version}-cffi +%define __python_setuptools_pkg python%{__python_pkg_version}-setuptools +%endif +%define __python_sitelib %(%{__python} -Esc "from distutils.sysconfig import get_python_lib; print(get_python_lib())") + +# By default python-pyzfs is enabled, with the exception of +# RHEL 6 which by default uses Python 2.6 which is too old. 
+%if 0%{?rhel} == 6 +%bcond_with pyzfs +%else +%bcond_without pyzfs +%endif + +Name: @PACKAGE@ +Version: @VERSION@ +Release: @RELEASE@%{?dist} +Summary: Commands to control the kernel modules and libraries + +Group: System Environment/Kernel +License: @ZFS_META_LICENSE@ +URL: https://zfsonlinux.org/ +Source0: %{name}-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) +Requires: libzpool2 = %{version} +Requires: libnvpair1 = %{version} +Requires: libuutil1 = %{version} +Requires: libzfs2 = %{version} +Requires: %{name}-kmod = %{version} +Provides: %{name}-kmod-common = %{version} +Obsoletes: spl + +# zfs-fuse provides the same commands and man pages that ZoL does. Renaming +# those on either side would conflict with all available documentation. +Conflicts: zfs-fuse + +%if 0%{?rhel}%{?fedora}%{?suse_version} +BuildRequires: gcc, make +BuildRequires: zlib-devel +BuildRequires: libuuid-devel +BuildRequires: libblkid-devel +BuildRequires: libudev-devel +BuildRequires: libattr-devel +BuildRequires: openssl-devel +%if 0%{?fedora} >= 28 || 0%{?rhel} >= 8 || 0%{?centos} >= 8 +BuildRequires: libtirpc-devel +%endif +Requires: openssl +%if 0%{?_systemd} +BuildRequires: systemd +%endif +%endif + +%if 0%{?_systemd} +Requires(post): systemd +Requires(preun): systemd +Requires(postun): systemd +%endif + +# The zpool iostat/status -c scripts call some utilities like lsblk and iostat +Requires: util-linux +Requires: sysstat + +%description +This package contains the core ZFS command line utilities. + +%package -n libzpool2 +Summary: Native ZFS pool library for Linux +Group: System Environment/Kernel + +%description -n libzpool2 +This package contains the zpool library, which provides support +for managing zpools + +%post -n libzpool2 -p /sbin/ldconfig +%postun -n libzpool2 -p /sbin/ldconfig + +%package -n libnvpair1 +Summary: Solaris name-value library for Linux +Group: System Environment/Kernel + +%description -n libnvpair1 +This package contains routines for packing and unpacking name-value +pairs. This functionality is used to portably transport data across +process boundaries, between kernel and user space, and can be used +to write self describing data structures on disk. + +%post -n libnvpair1 -p /sbin/ldconfig +%postun -n libnvpair1 -p /sbin/ldconfig + +%package -n libuutil1 +Summary: Solaris userland utility library for Linux +Group: System Environment/Kernel + +%description -n libuutil1 +This library provides a variety of compatibility functions for ZFS on Linux: + * libspl: The Solaris Porting Layer userland library, which provides APIs + that make it possible to run Solaris user code in a Linux environment + with relatively minimal modification. + * libavl: The Adelson-Velskii Landis balanced binary tree manipulation + library. + * libefi: The Extensible Firmware Interface library for GUID disk + partitioning. + * libshare: NFS, SMB, and iSCSI service integration for ZFS. 
+
+%post -n libuutil1 -p /sbin/ldconfig
+%postun -n libuutil1 -p /sbin/ldconfig
+
+%package -n libzfs2
+Summary: Native ZFS filesystem library for Linux
+Group: System Environment/Kernel
+
+%description -n libzfs2
+This package provides support for managing ZFS filesystems.
+
+%post -n libzfs2 -p /sbin/ldconfig
+%postun -n libzfs2 -p /sbin/ldconfig
+
+%package -n libzfs2-devel
+Summary: Development headers
+Group: System Environment/Kernel
+Requires: libzfs2 = %{version}
+Requires: libzpool2 = %{version}
+Requires: libnvpair1 = %{version}
+Requires: libuutil1 = %{version}
+Provides: libzpool2-devel
+Provides: libnvpair1-devel
+Provides: libuutil1-devel
+Obsoletes: zfs-devel
+
+%description -n libzfs2-devel
+This package contains the header files needed for building additional
+applications against the ZFS libraries.
+
+%package test
+Summary: Test infrastructure
+Group: System Environment/Kernel
+Requires: %{name}%{?_isa} = %{version}-%{release}
+Requires: parted
+Requires: lsscsi
+Requires: mdadm
+Requires: bc
+Requires: ksh
+Requires: fio
+Requires: acl
+Requires: sudo
+Requires: sysstat
+Requires: libaio
+Requires: python%{__python_pkg_version}
+%if 0%{?rhel}%{?fedora}%{?suse_version}
+BuildRequires: libaio-devel
+%endif
+AutoReqProv: no
+
+%description test
+This package contains test infrastructure and support scripts for
+validating the file system.
+
+%package dracut
+Summary: Dracut module
+Group: System Environment/Kernel
+BuildArch: noarch
+Requires: %{name} >= %{version}
+Requires: dracut
+Requires: /usr/bin/awk
+Requires: grep
+
+%description dracut
+This package contains a dracut module used to construct an initramfs
+image which is ZFS aware.
+
+%if %{with pyzfs}
+%package -n python%{__python_pkg_version}-pyzfs
+Summary: Python %{python_version} wrapper for libzfs_core
+Group: Development/Languages/Python
+License: Apache-2.0
+BuildArch: noarch
+Requires: libzfs2 = %{version}
+Requires: libnvpair1 = %{version}
+Requires: libffi
+Requires: python%{__python_pkg_version}
+Requires: %{__python_cffi_pkg}
+%if 0%{?rhel}%{?fedora}%{?suse_version}
+BuildRequires: python%{__python_pkg_version}-devel
+BuildRequires: %{__python_cffi_pkg}
+BuildRequires: %{__python_setuptools_pkg}
+BuildRequires: libffi-devel
+%endif
+
+%description -n python%{__python_pkg_version}-pyzfs
+This package provides a Python wrapper for the libzfs_core C library.
+%endif
+
+%if 0%{?_initramfs}
+%package initramfs
+Summary: Initramfs module
+Group: System Environment/Kernel
+Requires: %{name}%{?_isa} = %{version}-%{release}
+Requires: %{name} = %{version}-%{release}
+Requires: initramfs-tools
+
+%description initramfs
+This package contains an initramfs module used to construct an initramfs
+image which is ZFS aware.
+%endif + +%prep +%if %{with debug} + %define debug --enable-debug +%else + %define debug --disable-debug +%endif + +%if %{with debuginfo} + %define debuginfo --enable-debuginfo +%else + %define debuginfo --disable-debuginfo +%endif + +%if %{with asan} + %define asan --enable-asan +%else + %define asan --disable-asan +%endif + +%if 0%{?_systemd} + %define systemd --enable-systemd --with-systemdunitdir=%{_unitdir} --with-systemdpresetdir=%{_presetdir} --with-systemdmodulesloaddir=%{_modulesloaddir} --with-systemdgeneratordir=%{_systemdgeneratordir} --disable-sysvinit + %define systemd_svcs zfs-import-cache.service zfs-import-scan.service zfs-mount.service zfs-share.service zfs-zed.service zfs.target zfs-import.target zfs-volume-wait.service zfs-volumes.target +%else + %define systemd --enable-sysvinit --disable-systemd +%endif + +%if %{with pyzfs} + %define pyzfs --enable-pyzfs +%else + %define pyzfs --disable-pyzfs +%endif + +%if %{with pam} + %define pam --enable-pam +%else + %define pam --disable-pam +%endif + +%setup -q + +%build +%configure \ + --with-config=user \ + --with-udevdir=%{_udevdir} \ + --with-udevruledir=%{_udevruledir} \ + --with-dracutdir=%{_dracutdir} \ + --with-pamconfigsdir=%{_datadir}/pam-configs \ + --with-pammoduledir=%{_libdir}/security \ + --with-python=%{__python} \ + --with-pkgconfigdir=%{_pkgconfigdir} \ + --disable-static \ + %{debug} \ + %{debuginfo} \ + %{asan} \ + %{systemd} \ + %{pam} \ + %{pyzfs} +make %{?_smp_mflags} + +%install +%{__rm} -rf $RPM_BUILD_ROOT +make install DESTDIR=%{?buildroot} +find %{?buildroot}%{_libdir} -name '*.la' -exec rm -f {} \; +%if 0%{!?__brp_mangle_shebangs:1} +find %{?buildroot}%{_bindir} \ + \( -name arc_summary -or -name arcstat -or -name dbufstat \) \ + -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; +find %{?buildroot}%{_datadir} \ + \( -name test-runner.py -or -name zts-report.py \) \ + -exec %{__sed} -i 's|^#!.*|#!%{__python}|' {} \; +%endif + +%post +%if 0%{?_systemd} +%if 0%{?systemd_post:1} +%systemd_post %{systemd_svcs} +%else +if [ "$1" = "1" -o "$1" = "install" ] ; then + # Initial installation + systemctl preset %{systemd_svcs} >/dev/null || true +fi +%endif +%else +if [ -x /sbin/chkconfig ]; then + /sbin/chkconfig --add zfs-import + /sbin/chkconfig --add zfs-mount + /sbin/chkconfig --add zfs-share + /sbin/chkconfig --add zfs-zed +fi +%endif +exit 0 + +# On RHEL/CentOS 7 the static nodes aren't refreshed by default after +# installing a package. This is the default behavior for Fedora. 
+%posttrans +%if 0%{?rhel} == 7 || 0%{?centos} == 7 +systemctl restart kmod-static-nodes +systemctl restart systemd-tmpfiles-setup-dev +udevadm trigger +%endif + +%preun +%if 0%{?_systemd} +%if 0%{?systemd_preun:1} +%systemd_preun %{systemd_svcs} +%else +if [ "$1" = "0" -o "$1" = "remove" ] ; then + # Package removal, not upgrade + systemctl --no-reload disable %{systemd_svcs} >/dev/null || true + systemctl stop %{systemd_svcs} >/dev/null || true +fi +%endif +%else +if [ "$1" = "0" -o "$1" = "remove" ] && [ -x /sbin/chkconfig ]; then + /sbin/chkconfig --del zfs-import + /sbin/chkconfig --del zfs-mount + /sbin/chkconfig --del zfs-share + /sbin/chkconfig --del zfs-zed +fi +%endif +exit 0 + +%postun +%if 0%{?_systemd} +%if 0%{?systemd_postun:1} +%systemd_postun %{systemd_svcs} +%else +systemctl --system daemon-reload >/dev/null || true +%endif +%endif + +%files +# Core utilities +%{_sbindir}/* +%{_bindir}/raidz_test +%{_bindir}/zgenhostid +%{_bindir}/zvol_wait +# Optional Python 2/3 scripts +%{_bindir}/arc_summary +%{_bindir}/arcstat +%{_bindir}/dbufstat +# Man pages +%{_mandir}/man1/* +%{_mandir}/man5/* +%{_mandir}/man8/* +# Configuration files and scripts +%{_libexecdir}/%{name} +%{_udevdir}/vdev_id +%{_udevdir}/zvol_id +%{_udevdir}/rules.d/* +%if ! 0%{?_systemd} || 0%{?_initramfs} +# Files needed for sysvinit and initramfs-tools +%{_sysconfdir}/%{name}/zfs-functions +%config(noreplace) %{_initconfdir}/zfs +%else +%exclude %{_sysconfdir}/%{name}/zfs-functions +%exclude %{_initconfdir}/zfs +%endif +%if 0%{?_systemd} +%{_unitdir}/* +%{_presetdir}/* +%{_modulesloaddir}/* +%{_systemdgeneratordir}/* +%else +%config(noreplace) %{_sysconfdir}/init.d/* +%endif +%config(noreplace) %{_sysconfdir}/%{name}/zed.d/* +%config(noreplace) %{_sysconfdir}/%{name}/zpool.d/* +%config(noreplace) %{_sysconfdir}/%{name}/vdev_id.conf.*.example +%attr(440, root, root) %config(noreplace) %{_sysconfdir}/sudoers.d/* +%if %{with pam} +%{_libdir}/security/* +%{_datadir}/pam-configs/* +%endif + +%files -n libzpool2 +%{_libdir}/libzpool.so.* + +%files -n libnvpair1 +%{_libdir}/libnvpair.so.* + +%files -n libuutil1 +%{_libdir}/libuutil.so.* + +%files -n libzfs2 +%{_libdir}/libzfs*.so.* + +%files -n libzfs2-devel +%{_pkgconfigdir}/libzfs.pc +%{_pkgconfigdir}/libzfs_core.pc +%{_libdir}/*.so +%{_includedir}/* +%doc AUTHORS COPYRIGHT LICENSE NOTICE README.md + +%files test +%{_datadir}/%{name} + +%files dracut +%doc contrib/dracut/README.dracut.markdown +%{_dracutdir}/modules.d/* + +%if %{with pyzfs} +%files -n python%{__python_pkg_version}-pyzfs +%doc contrib/pyzfs/README +%doc contrib/pyzfs/LICENSE +%defattr(-,root,root,-) +%{__python_sitelib}/libzfs_core/* +%{__python_sitelib}/pyzfs* +%endif + +%if 0%{?_initramfs} +%files initramfs +%doc contrib/initramfs/README.initramfs.markdown +/usr/share/initramfs-tools/* +%else +# Since we're not building the initramfs package, +# ignore those files. 
+%exclude /usr/share/initramfs-tools +%endif diff --git a/rpm/redhat/.gitignore b/rpm/redhat/.gitignore new file mode 100644 index 000000000000..7f5daafdd6d4 --- /dev/null +++ b/rpm/redhat/.gitignore @@ -0,0 +1,3 @@ +/zfs-dkms.spec +/zfs-kmod.spec +/zfs.spec diff --git a/rpm/redhat/Makefile.am b/rpm/redhat/Makefile.am new file mode 100644 index 000000000000..89b13640d622 --- /dev/null +++ b/rpm/redhat/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST = zfs.spec.in zfs-kmod.spec.in zfs-dkms.spec.in diff --git a/rpm/redhat/zfs-dkms.spec.in b/rpm/redhat/zfs-dkms.spec.in new file mode 120000 index 000000000000..ffa051baaf03 --- /dev/null +++ b/rpm/redhat/zfs-dkms.spec.in @@ -0,0 +1 @@ +../generic/zfs-dkms.spec.in \ No newline at end of file diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in new file mode 100644 index 000000000000..6d928ec74ca7 --- /dev/null +++ b/rpm/redhat/zfs-kmod.spec.in @@ -0,0 +1,88 @@ +%bcond_with debug +%bcond_with debuginfo + +Name: @PACKAGE@-kmod +Version: @VERSION@ +Release: @RELEASE@%{?dist} + +Summary: Kernel module(s) +Group: System Environment/Kernel +License: @ZFS_META_LICENSE@ +URL: https://zfsonlinux.org/ +BuildRequires: %kernel_module_package_buildreqs +Source0: @PACKAGE@-%{version}.tar.gz +BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n) + +# Additional dependency information for the kmod sub-package must be specified +# by generating a preamble text file which kmodtool can append to the spec file. +%(/bin/echo -e "\ +Requires: @PACKAGE@ = %{version}\n\ +Conflicts: @PACKAGE@-dkms\n\n" > %{_sourcedir}/kmod-preamble\n\ +Obsoletes: spl-kmod) + +# LDFLAGS are not sanitized by arch/*/Makefile for these architectures. +%ifarch ppc ppc64 ppc64le aarch64 +%global __global_ldflags %{nil} +%endif + +%description +This package contains the ZFS kernel modules. + +%define kmod_name @PACKAGE@ + +%kernel_module_package -n %{kmod_name} -p %{_sourcedir}/kmod-preamble + +%define ksrc %{_usrsrc}/kernels/%{kverrel} +%define kobj %{ksrc} + +%package -n kmod-%{kmod_name}-devel +Summary: ZFS kernel module(s) devel common +Group: System Environment/Kernel +Provides: kmod-spl-devel = %{version} + +%description -n kmod-%{kmod_name}-devel +This package provides the header files and objects to build kernel modules. + +%prep +if ! 
[ -d "%{ksrc}" ]; then + echo "Kernel build directory isn't set properly, cannot continue" + exit 1 +fi + +%if %{with debug} +%define debug --enable-debug +%else +%define debug --disable-debug +%endif + +%if %{with debuginfo} +%define debuginfo --enable-debuginfo +%else +%define debuginfo --disable-debuginfo +%endif + +%setup -n %{kmod_name}-%{version} +%build +%configure \ + --with-config=kernel \ + --with-linux=%{ksrc} \ + --with-linux-obj=%{kobj} \ + %{debug} \ + %{debuginfo} +make %{?_smp_mflags} + +%install +make install \ + DESTDIR=${RPM_BUILD_ROOT} \ + INSTALL_MOD_DIR=extra/%{kmod_name} +%{__rm} -f %{buildroot}/lib/modules/%{kverrel}/modules.* + +# find-debuginfo.sh only considers executables +%{__chmod} u+x %{buildroot}/lib/modules/%{kverrel}/extra/*/*/* + +%clean +rm -rf $RPM_BUILD_ROOT + +%files -n kmod-%{kmod_name}-devel +%{_usrsrc}/%{kmod_name}-%{version} +%{_usrsrc}/spl-%{version} diff --git a/rpm/redhat/zfs.spec.in b/rpm/redhat/zfs.spec.in new file mode 120000 index 000000000000..4c8079166ff8 --- /dev/null +++ b/rpm/redhat/zfs.spec.in @@ -0,0 +1 @@ +../generic/zfs.spec.in \ No newline at end of file diff --git a/scripts/.gitignore b/scripts/.gitignore new file mode 100644 index 000000000000..5621a6e147a0 --- /dev/null +++ b/scripts/.gitignore @@ -0,0 +1 @@ +common.sh diff --git a/scripts/Makefile.am b/scripts/Makefile.am new file mode 100644 index 000000000000..9d39947525a3 --- /dev/null +++ b/scripts/Makefile.am @@ -0,0 +1,77 @@ +pkgdatadir = $(datadir)/@PACKAGE@ + +dist_pkgdata_SCRIPTS = \ + zimport.sh \ + zfs.sh \ + zfs-tests.sh \ + zloop.sh \ + zfs-helpers.sh + +EXTRA_DIST = \ + commitcheck.sh \ + common.sh.in \ + cstyle.pl \ + dkms.mkconf \ + dkms.postbuild \ + enum-extract.pl \ + kmodtool \ + make_gitrev.sh \ + man-dates.sh \ + paxcheck.sh \ + zfs2zol-patch.sed \ + zol2zfs-patch.sed + +define EXTRA_ENVIRONMENT + +# Only required for in-tree use +export INTREE="yes" +export GDB="libtool --mode=execute gdb" +export LDMOD=/sbin/insmod + +export CMD_DIR=@abs_top_builddir@/cmd +export UDEV_RULE_DIR=@abs_top_builddir@/udev/rules.d +export ZEDLET_ETC_DIR=$$CMD_DIR/zed/zed.d +export ZEDLET_LIBEXEC_DIR=$$CMD_DIR/zed/zed.d +export ZPOOL_SCRIPT_DIR=$$CMD_DIR/zpool/zpool.d +export ZPOOL_SCRIPTS_PATH=$$CMD_DIR/zpool/zpool.d +export CONTRIB_DIR=@abs_top_builddir@/contrib +export LIB_DIR=@abs_top_builddir@/lib + +export INSTALL_UDEV_DIR=@udevdir@ +export INSTALL_UDEV_RULE_DIR=@udevruledir@ +export INSTALL_MOUNT_HELPER_DIR=@mounthelperdir@ +export INSTALL_SYSCONF_DIR=@sysconfdir@ +export INSTALL_PYTHON_DIR=@pythonsitedir@ + +export KMOD_SPL=@abs_top_builddir@/module/spl/spl.ko +export KMOD_ZAVL=@abs_top_builddir@/module/avl/zavl.ko +export KMOD_ZNVPAIR=@abs_top_builddir@/module/nvpair/znvpair.ko +export KMOD_ZUNICODE=@abs_top_builddir@/module/unicode/zunicode.ko +export KMOD_ZCOMMON=@abs_top_builddir@/module/zcommon/zcommon.ko +export KMOD_ZLUA=@abs_top_builddir@/module/lua/zlua.ko +export KMOD_ICP=@abs_top_builddir@/module/icp/icp.ko +export KMOD_ZFS=@abs_top_builddir@/module/zfs/zfs.ko +export KMOD_FREEBSD=@abs_top_builddir@/module/openzfs.ko +export KMOD_ZZSTD=@abs_top_builddir@/module/zstd/zzstd.ko +endef + +export EXTRA_ENVIRONMENT + +all-local: + -$(SED) -e '\|^export BIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ + -e '\|^export SBIN_DIR=|s|$$|@abs_top_builddir@/bin|' \ + -e '\|^export ZTS_DIR=|s|$$|@abs_top_srcdir@/tests|' \ + -e '\|^export SCRIPT_DIR=|s|$$|@abs_top_srcdir@/scripts|' \ + $(abs_top_srcdir)/scripts/common.sh.in >common.sh + -echo "$$EXTRA_ENVIRONMENT" >>common.sh + 
+clean-local:
+	-$(RM) common.sh
+
+install-data-hook:
+	-$(SED) -e '\|^export BIN_DIR=|s|$$|@bindir@|' \
+		-e '\|^export SBIN_DIR=|s|$$|@sbindir@|' \
+		-e '\|^export ZTS_DIR=|s|$$|@datadir@/@PACKAGE@|' \
+		-e '\|^export SCRIPT_DIR=|s|$$|@datadir@/@PACKAGE@|' \
+		$(abs_top_srcdir)/scripts/common.sh.in \
+		>$(DESTDIR)$(datadir)/@PACKAGE@/common.sh
diff --git a/scripts/commitcheck.sh b/scripts/commitcheck.sh
new file mode 100755
index 000000000000..c7515c23e1d0
--- /dev/null
+++ b/scripts/commitcheck.sh
@@ -0,0 +1,218 @@
+#!/usr/bin/env bash
+
+REF="HEAD"
+
+# test a url
+function test_url()
+{
+	url="$1"
+	if ! curl --output /dev/null --max-time 60 \
+	    --silent --head --fail "$url" ; then
+		echo "\"$url\" is unreachable"
+		return 1
+	fi
+
+	return 0
+}
+
+# test commit body for length
+# lines containing urls are exempt from the length limit.
+function test_commit_bodylength()
+{
+	length="72"
+	body=$(git log -n 1 --pretty=%b "$REF" | grep -Ev "http(s)*://" | grep -E -m 1 ".{$((length + 1))}")
+	if [ -n "$body" ]; then
+		echo "error: commit message body contains line over ${length} characters"
+		return 1
+	fi
+
+	return 0
+}
+
+# check for a tagged line
+function check_tagged_line()
+{
+	regex='^\s*'"$1"':\s[[:print:]]+\s<[[:graph:]]+>$'
+	foundline=$(git log -n 1 "$REF" | grep -E -m 1 "$regex")
+	if [ -z "$foundline" ]; then
+		echo "error: missing \"$1\""
+		return 1
+	fi
+
+	return 0
+}
+
+# check for a tagged line and check that the link is valid
+function check_tagged_line_with_url()
+{
+	regex='^\s*'"$1"':\s\K([[:graph:]]+)$'
+	foundline=$(git log -n 1 "$REF" | grep -Po "$regex")
+	if [ -z "$foundline" ]; then
+		echo "error: missing \"$1\""
+		return 1
+	fi
+
+	OLDIFS=$IFS
+	IFS=$'\n'
+	for url in $(echo -e "$foundline"); do
+		if ! test_url "$url"; then
+			return 1
+		fi
+	done
+	IFS=$OLDIFS
+
+	return 0
+}
+
+# check commit message for a normal commit
+function new_change_commit()
+{
+	error=0
+
+	# subject is not longer than 72 characters
+	long_subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '.{73}')
+	if [ -n "$long_subject" ]; then
+		echo "error: commit subject over 72 characters"
+		error=1
+	fi
+
+	# need a signed off by
+	if ! check_tagged_line "Signed-off-by" ; then
+		error=1
+	fi
+
+	# ensure that no lines in the body of the commit are over 72 characters
+	if ! test_commit_bodylength ; then
+		error=1
+	fi
+
+	return $error
+}
+
+function is_openzfs_port()
+{
+	# a subject starting with "OpenZFS" means it's an OpenZFS port
+	subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^OpenZFS')
+	if [ -n "$subject" ]; then
+		return 0
+	fi
+
+	return 1
+}
+
+function openzfs_port_commit()
+{
+	error=0
+
+	# subject starts with OpenZFS dddd
+	subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^OpenZFS [[:digit:]]+(, [[:digit:]]+)* - ')
+	if [ -z "$subject" ]; then
+		echo "error: OpenZFS patch ports must have a subject line that starts with \"OpenZFS dddd - \""
+		error=1
+	fi
+
+	# need an authored by line
+	if ! check_tagged_line "Authored by" ; then
+		error=1
+	fi
+
+	# need a reviewed by line
+	if ! check_tagged_line "Reviewed by" ; then
+		error=1
+	fi
+
+	# need a ported-by line
+	if ! check_tagged_line "Ported-by" ; then
+		error=1
+	fi
+
+	# need a url to the openzfs commit and it should be valid
+	if ! check_tagged_line_with_url "OpenZFS-commit" ; then
+		error=1
+	fi
+
+	# need a url to the illumos issue and it should be valid
+	if !
check_tagged_line_with_url "OpenZFS-issue" ; then + error=1 + fi + + return $error +} + +function is_coverity_fix() +{ + # subject starts with Fix coverity defects means it's a coverity fix + subject=$(git log -n 1 --pretty=%s "$REF" | grep -E -m 1 '^Fix coverity defects') + if [ -n "$subject" ]; then + return 0 + fi + + return 1 +} + +function coverity_fix_commit() +{ + error=0 + + # subject starts with Fix coverity defects: CID dddd, dddd... + subject=$(git log -n 1 --pretty=%s "$REF" | + grep -E -m 1 'Fix coverity defects: CID [[:digit:]]+(, [[:digit:]]+)*') + if [ -z "$subject" ]; then + echo "error: Coverity defect fixes must have a subject line that starts with \"Fix coverity defects: CID dddd\"" + error=1 + fi + + # need a signed off by + if ! check_tagged_line "Signed-off-by" ; then + error=1 + fi + + # test each summary line for the proper format + OLDIFS=$IFS + IFS=$'\n' + for line in $(git log -n 1 --pretty=%b "$REF" | grep -E '^CID'); do + echo "$line" | grep -E '^CID [[:digit:]]+: ([[:graph:]]+|[[:space:]])+ \(([[:upper:]]|\_)+\)' > /dev/null + # shellcheck disable=SC2181 + if [[ $? -ne 0 ]]; then + echo "error: commit message has an improperly formatted CID defect line" + error=1 + fi + done + IFS=$OLDIFS + + # ensure that no lines in the body of the commit are over 72 characters + if ! test_commit_bodylength; then + error=1 + fi + + return $error +} + +if [ -n "$1" ]; then + REF="$1" +fi + +# if openzfs port, test against that +if is_openzfs_port; then + if ! openzfs_port_commit ; then + exit 1 + else + exit 0 + fi +fi + +# if coverity fix, test against that +if is_coverity_fix; then + if ! coverity_fix_commit; then + exit 1 + else + exit 0 + fi +fi + +# have a normal commit +if ! new_change_commit ; then + exit 1 +fi + +exit 0 diff --git a/scripts/common.sh.in b/scripts/common.sh.in new file mode 100644 index 000000000000..2d9d9c786622 --- /dev/null +++ b/scripts/common.sh.in @@ -0,0 +1,21 @@ +#!/bin/sh + +# Directories +export BIN_DIR= +export SBIN_DIR= +export ZTS_DIR= +export SCRIPT_DIR= + +# General commands +export ZDB=${ZDB:-$SBIN_DIR/zdb} +export ZFS=${ZFS:-$SBIN_DIR/zfs} +export ZPOOL=${ZPOOL:-$SBIN_DIR/zpool} +export ZTEST=${ZTEST:-$SBIN_DIR/ztest} +export ZFS_SH=${ZFS_SH:-$SCRIPT_DIR/zfs.sh} + +# Test Suite +export RUNFILE_DIR=${RUNFILE_DIR:-$ZTS_DIR/runfiles} +export TEST_RUNNER=${TEST_RUNNER:-$ZTS_DIR/test-runner/bin/test-runner.py} +export ZTS_REPORT=${ZTS_REPORT:-$ZTS_DIR/test-runner/bin/zts-report.py} +export STF_TOOLS=${STF_TOOLS:-$ZTS_DIR/test-runner} +export STF_SUITE=${STF_SUITE:-$ZTS_DIR/zfs-tests} diff --git a/scripts/cstyle.pl b/scripts/cstyle.pl new file mode 100755 index 000000000000..d19718ecf443 --- /dev/null +++ b/scripts/cstyle.pl @@ -0,0 +1,1018 @@ +#!/usr/bin/env perl +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# +# Copyright 2016 Nexenta Systems, Inc. +# +# Copyright 2008 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# @(#)cstyle 1.58 98/09/09 (from shannon) +#ident "%Z%%M% %I% %E% SMI" +# +# cstyle - check for some common stylistic errors. +# +# cstyle is a sort of "lint" for C coding style. +# It attempts to check for the style used in the +# kernel, sometimes known as "Bill Joy Normal Form". +# +# There's a lot this can't check for, like proper indentation +# of code blocks. There's also a lot more this could check for. +# +# A note to the non perl literate: +# +# perl regular expressions are pretty much like egrep +# regular expressions, with the following special symbols +# +# \s any space character +# \S any non-space character +# \w any "word" character [a-zA-Z0-9_] +# \W any non-word character +# \d a digit [0-9] +# \D a non-digit +# \b word boundary (between \w and \W) +# \B non-word boundary +# + +require 5.0; +use warnings; +use IO::File; +use Getopt::Std; +use strict; + +my $usage = +"usage: cstyle [-cghpvCP] [-o constructs] file ... + -c check continuation indentation inside functions + -g print github actions' workflow commands + -h perform heuristic checks that are sometimes wrong + -p perform some of the more picky checks + -v verbose + -C don't check anything in header block comments + -P check for use of non-POSIX types + -o constructs + allow a comma-separated list of optional constructs: + doxygen allow doxygen-style block comments (/** /*!) + splint allow splint-style lint comments (/*@ ... @*/) +"; + +my %opts; + +if (!getopts("cgho:pvCP", \%opts)) { + print $usage; + exit 2; +} + +my $check_continuation = $opts{'c'}; +my $github_workflow = $opts{'g'} || $ENV{'CI'}; +my $heuristic = $opts{'h'}; +my $picky = $opts{'p'}; +my $verbose = $opts{'v'}; +my $ignore_hdr_comment = $opts{'C'}; +my $check_posix_types = $opts{'P'}; + +my $doxygen_comments = 0; +my $splint_comments = 0; + +if (defined($opts{'o'})) { + for my $x (split /,/, $opts{'o'}) { + if ($x eq "doxygen") { + $doxygen_comments = 1; + } elsif ($x eq "splint") { + $splint_comments = 1; + } else { + print "cstyle: unrecognized construct \"$x\"\n"; + print $usage; + exit 2; + } + } +} + +my ($filename, $line, $prev); # shared globals + +my $fmt; +my $hdr_comment_start; + +if ($verbose) { + $fmt = "%s: %d: %s\n%s\n"; +} else { + $fmt = "%s: %d: %s\n"; +} + +if ($doxygen_comments) { + # doxygen comments look like "/*!" or "/**"; allow them. + $hdr_comment_start = qr/^\s*\/\*[\!\*]?$/; +} else { + $hdr_comment_start = qr/^\s*\/\*$/; +} + +# Note, following must be in single quotes so that \s and \w work right. +my $typename = '(int|char|short|long|unsigned|float|double' . + '|\w+_t|struct\s+\w+|union\s+\w+|FILE)'; + +# mapping of old types to POSIX compatible types +my %old2posix = ( + 'unchar' => 'uchar_t', + 'ushort' => 'ushort_t', + 'uint' => 'uint_t', + 'ulong' => 'ulong_t', + 'u_int' => 'uint_t', + 'u_short' => 'ushort_t', + 'u_long' => 'ulong_t', + 'u_char' => 'uchar_t', + 'quad' => 'quad_t' +); + +my $lint_re = qr/\/\*(?: + ARGSUSED[0-9]*|NOTREACHED|LINTLIBRARY|VARARGS[0-9]*| + CONSTCOND|CONSTANTCOND|CONSTANTCONDITION|EMPTY| + FALLTHRU|FALLTHROUGH|LINTED.*?|PRINTFLIKE[0-9]*| + PROTOLIB[0-9]*|SCANFLIKE[0-9]*|CSTYLED.*? 
+ )\*\//x; + +my $splint_re = qr/\/\*@.*?@\*\//x; + +my $warlock_re = qr/\/\*\s*(?: + VARIABLES\ PROTECTED\ BY| + MEMBERS\ PROTECTED\ BY| + ALL\ MEMBERS\ PROTECTED\ BY| + READ-ONLY\ VARIABLES:| + READ-ONLY\ MEMBERS:| + VARIABLES\ READABLE\ WITHOUT\ LOCK:| + MEMBERS\ READABLE\ WITHOUT\ LOCK:| + LOCKS\ COVERED\ BY| + LOCK\ UNNEEDED\ BECAUSE| + LOCK\ NEEDED:| + LOCK\ HELD\ ON\ ENTRY:| + READ\ LOCK\ HELD\ ON\ ENTRY:| + WRITE\ LOCK\ HELD\ ON\ ENTRY:| + LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| + READ\ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| + WRITE\ LOCK\ ACQUIRED\ AS\ SIDE\ EFFECT:| + LOCK\ RELEASED\ AS\ SIDE\ EFFECT:| + LOCK\ UPGRADED\ AS\ SIDE\ EFFECT:| + LOCK\ DOWNGRADED\ AS\ SIDE\ EFFECT:| + FUNCTIONS\ CALLED\ THROUGH\ POINTER| + FUNCTIONS\ CALLED\ THROUGH\ MEMBER| + LOCK\ ORDER: + )/x; + +my $err_stat = 0; # exit status + +if ($#ARGV >= 0) { + foreach my $arg (@ARGV) { + my $fh = new IO::File $arg, "r"; + if (!defined($fh)) { + printf "%s: can not open\n", $arg; + } else { + &cstyle($arg, $fh); + close $fh; + } + } +} else { + &cstyle("", *STDIN); +} +exit $err_stat; + +my $no_errs = 0; # set for CSTYLED-protected lines + +sub err($) { + my ($error) = @_; + unless ($no_errs) { + if ($verbose) { + printf $fmt, $filename, $., $error, $line; + } else { + printf $fmt, $filename, $., $error; + } + if ($github_workflow) { + printf "::error file=%s,line=%s::%s\n", $filename, $., $error; + } + $err_stat = 1; + } +} + +sub err_prefix($$) { + my ($prevline, $error) = @_; + my $out = $prevline."\n".$line; + unless ($no_errs) { + if ($verbose) { + printf $fmt, $filename, $., $error, $out; + } else { + printf $fmt, $filename, $., $error; + } + $err_stat = 1; + } +} + +sub err_prev($) { + my ($error) = @_; + unless ($no_errs) { + if ($verbose) { + printf $fmt, $filename, $. - 1, $error, $prev; + } else { + printf $fmt, $filename, $. - 1, $error; + } + $err_stat = 1; + } +} + +sub cstyle($$) { + +my ($fn, $filehandle) = @_; +$filename = $fn; # share it globally + +my $in_cpp = 0; +my $next_in_cpp = 0; + +my $in_comment = 0; +my $in_header_comment = 0; +my $comment_done = 0; +my $in_warlock_comment = 0; +my $in_function = 0; +my $in_function_header = 0; +my $function_header_full_indent = 0; +my $in_declaration = 0; +my $note_level = 0; +my $nextok = 0; +my $nocheck = 0; + +my $in_string = 0; + +my ($okmsg, $comment_prefix); + +$line = ''; +$prev = ''; +reset_indent(); + +line: while (<$filehandle>) { + s/\r?\n$//; # strip return and newline + + # save the original line, then remove all text from within + # double or single quotes, we do not want to check such text. + + $line = $_; + + # + # C allows strings to be continued with a backslash at the end of + # the line. We translate that into a quoted string on the previous + # line followed by an initial quote on the next line. + # + # (we assume that no-one will use backslash-continuation with character + # constants) + # + $_ = '"' . $_ if ($in_string && !$nocheck && !$in_comment); + + # + # normal strings and characters + # + s/'([^\\']|\\[^xX0]|\\0[0-9]*|\\[xX][0-9a-fA-F]*)'/''/g; + s/"([^\\"]|\\.)*"/\"\"/g; + + # + # detect string continuation + # + if ($nocheck || $in_comment) { + $in_string = 0; + } else { + # + # Now that all full strings are replaced with "", we check + # for unfinished strings continuing onto the next line. 
+ # + $in_string = + (s/([^"](?:"")*)"([^\\"]|\\.)*\\$/$1""/ || + s/^("")*"([^\\"]|\\.)*\\$/""/); + } + + # + # figure out if we are in a cpp directive + # + $in_cpp = $next_in_cpp || /^\s*#/; # continued or started + $next_in_cpp = $in_cpp && /\\$/; # only if continued + + # strip off trailing backslashes, which appear in long macros + s/\s*\\$//; + + # an /* END CSTYLED */ comment ends a no-check block. + if ($nocheck) { + if (/\/\* *END *CSTYLED *\*\//) { + $nocheck = 0; + } else { + reset_indent(); + next line; + } + } + + # a /*CSTYLED*/ comment indicates that the next line is ok. + if ($nextok) { + if ($okmsg) { + err($okmsg); + } + $nextok = 0; + $okmsg = 0; + if (/\/\* *CSTYLED.*\*\//) { + /^.*\/\* *CSTYLED *(.*) *\*\/.*$/; + $okmsg = $1; + $nextok = 1; + } + $no_errs = 1; + } elsif ($no_errs) { + $no_errs = 0; + } + + # check length of line. + # first, a quick check to see if there is any chance of being too long. + if (($line =~ tr/\t/\t/) * 7 + length($line) > 80) { + # yes, there is a chance. + # replace tabs with spaces and check again. + my $eline = $line; + 1 while $eline =~ + s/\t+/' ' x (length($&) * 8 - length($`) % 8)/e; + if (length($eline) > 80) { + err("line > 80 characters"); + } + } + + # ignore NOTE(...) annotations (assumes NOTE is on lines by itself). + if ($note_level || /\b_?NOTE\s*\(/) { # if in NOTE or this is NOTE + s/[^()]//g; # eliminate all non-parens + $note_level += s/\(//g - length; # update paren nest level + next; + } + + # a /* BEGIN CSTYLED */ comment starts a no-check block. + if (/\/\* *BEGIN *CSTYLED *\*\//) { + $nocheck = 1; + } + + # a /*CSTYLED*/ comment indicates that the next line is ok. + if (/\/\* *CSTYLED.*\*\//) { + /^.*\/\* *CSTYLED *(.*) *\*\/.*$/; + $okmsg = $1; + $nextok = 1; + } + if (/\/\/ *CSTYLED/) { + /^.*\/\/ *CSTYLED *(.*)$/; + $okmsg = $1; + $nextok = 1; + } + + # universal checks; apply to everything + if (/\t +\t/) { + err("spaces between tabs"); + } + if (/ \t+ /) { + err("tabs between spaces"); + } + if (/\s$/) { + err("space or tab at end of line"); + } + if (/[^ \t(]\/\*/ && !/\w\(\/\*.*\*\/\);/) { + err("comment preceded by non-blank"); + } + + # is this the beginning or ending of a function? + # (not if "struct foo\n{\n") + if (/^\{$/ && $prev =~ /\)\s*(const\s*)?(\/\*.*\*\/\s*)?\\?$/) { + $in_function = 1; + $in_declaration = 1; + $in_function_header = 0; + $function_header_full_indent = 0; + $prev = $line; + next line; + } + if (/^\}\s*(\/\*.*\*\/\s*)*$/) { + if ($prev =~ /^\s*return\s*;/) { + err_prev("unneeded return at end of function"); + } + $in_function = 0; + reset_indent(); # we don't check between functions + $prev = $line; + next line; + } + if ($in_function_header && ! /^ (\w|\.)/ ) { + if (/^\{\}$/ # empty functions + || /;/ #run function with multiline arguments + || /#/ #preprocessor commands + || /^[^\s\\]*\(.*\)$/ #functions without ; at the end + || /^$/ #function declaration can't have empty line + ) { + $in_function_header = 0; + $function_header_full_indent = 0; + } elsif ($prev =~ /^__attribute__/) { #__attribute__((*)) + $in_function_header = 0; + $function_header_full_indent = 0; + $prev = $line; + next line; + } elsif ($picky && ! (/^\t/ && $function_header_full_indent != 0)) { + + err("continuation line should be indented by 4 spaces"); + } + } + + # + # If this matches something of form "foo(", it's probably a function + # definition, unless it ends with ") bar;", in which case it's a declaration + # that uses a macro to generate the type. 
+ # + if (/^\w+\(/ && !/\) \w+;/) { + $in_function_header = 1; + if (/\($/) { + $function_header_full_indent = 1; + } + } + if ($in_function_header && /^\{$/) { + $in_function_header = 0; + $function_header_full_indent = 0; + $in_function = 1; + } + if ($in_function_header && /\);$/) { + $in_function_header = 0; + $function_header_full_indent = 0; + } + if ($in_function_header && /\{$/ ) { + if ($picky) { + err("opening brace on same line as function header"); + } + $in_function_header = 0; + $function_header_full_indent = 0; + $in_function = 1; + next line; + } + + if ($in_warlock_comment && /\*\//) { + $in_warlock_comment = 0; + $prev = $line; + next line; + } + + # a blank line terminates the declarations within a function. + # XXX - but still a problem in sub-blocks. + if ($in_declaration && /^$/) { + $in_declaration = 0; + } + + if ($comment_done) { + $in_comment = 0; + $in_header_comment = 0; + $comment_done = 0; + } + # does this looks like the start of a block comment? + if (/$hdr_comment_start/) { + if (!/^\t*\/\*/) { + err("block comment not indented by tabs"); + } + $in_comment = 1; + /^(\s*)\//; + $comment_prefix = $1; + if ($comment_prefix eq "") { + $in_header_comment = 1; + } + $prev = $line; + next line; + } + # are we still in the block comment? + if ($in_comment) { + if (/^$comment_prefix \*\/$/) { + $comment_done = 1; + } elsif (/\*\//) { + $comment_done = 1; + err("improper block comment close") + unless ($ignore_hdr_comment && $in_header_comment); + } elsif (!/^$comment_prefix \*[ \t]/ && + !/^$comment_prefix \*$/) { + err("improper block comment") + unless ($ignore_hdr_comment && $in_header_comment); + } + } + + if ($in_header_comment && $ignore_hdr_comment) { + $prev = $line; + next line; + } + + # check for errors that might occur in comments and in code. + + # allow spaces to be used to draw pictures in all comments. + if (/[^ ] / && !/".* .*"/ && !$in_comment) { + err("spaces instead of tabs"); + } + if (/^ / && !/^ \*[ \t\/]/ && !/^ \*$/ && + (!/^ (\w|\.)/ || $in_function != 0)) { + err("indent by spaces instead of tabs"); + } + if (/^\t+ [^ \t\*]/ || /^\t+ \S/ || /^\t+ \S/) { + err("continuation line not indented by 4 spaces"); + } + if (/$warlock_re/ && !/\*\//) { + $in_warlock_comment = 1; + $prev = $line; + next line; + } + if (/^\s*\/\*./ && !/^\s*\/\*.*\*\// && !/$hdr_comment_start/) { + err("improper first line of block comment"); + } + + if ($in_comment) { # still in comment, don't do further checks + $prev = $line; + next line; + } + + if ((/[^(]\/\*\S/ || /^\/\*\S/) && + !(/$lint_re/ || ($splint_comments && /$splint_re/))) { + err("missing blank after open comment"); + } + if (/\S\*\/[^)]|\S\*\/$/ && + !(/$lint_re/ || ($splint_comments && /$splint_re/))) { + err("missing blank before close comment"); + } + if (/\/\/\S/) { # C++ comments + err("missing blank after start comment"); + } + # check for unterminated single line comments, but allow them when + # they are used to comment out the argument list of a function + # declaration. + if (/\S.*\/\*/ && !/\S.*\/\*.*\*\// && !/\(\/\*/) { + err("unterminated single line comment"); + } + + if (/^(#else|#endif|#include)(.*)$/) { + $prev = $line; + if ($picky) { + my $directive = $1; + my $clause = $2; + # Enforce ANSI rules for #else and #endif: no noncomment + # identifiers are allowed after #endif or #else. Allow + # C++ comments since they seem to be a fact of life. 
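+ # e.g. '#endif FOO' draws this error, while '#endif /* FOO */'
+ # and '#endif // FOO' are both accepted.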
+ if ((($1 eq "#endif") || ($1 eq "#else")) && + ($clause ne "") && + (!($clause =~ /^\s+\/\*.*\*\/$/)) && + (!($clause =~ /^\s+\/\/.*$/))) { + err("non-comment text following " . + "$directive (or malformed $directive " . + "directive)"); + } + } + next line; + } + + # + # delete any comments and check everything else. Note that + # ".*?" is a non-greedy match, so that we don't get confused by + # multiple comments on the same line. + # + s/\/\*.*?\*\///g; + s/\/\/.*$//; # C++ comments + + # delete any trailing whitespace; we have already checked for that. + s/\s*$//; + + # following checks do not apply to text in comments. + + if (/[^<>\s][!<>=]=/ || /[^<>][!<>=]=[^\s,]/ || + (/[^->]>[^,=>\s]/ && !/[^->]>$/) || + (/[^<]<[^,=<\s]/ && !/[^<]<$/) || + /[^<\s]<[^<]/ || /[^->\s]>[^>]/) { + err("missing space around relational operator"); + } + if (/\S>>=/ || /\S<<=/ || />>=\S/ || /<<=\S/ || /\S[-+*\/&|^%]=/ || + (/[^-+*\/&|^%!<>=\s]=[^=]/ && !/[^-+*\/&|^%!<>=\s]=$/) || + (/[^!<>=]=[^=\s]/ && !/[^!<>=]=$/)) { + # XXX - should only check this for C++ code + # XXX - there are probably other forms that should be allowed + if (!/\soperator=/) { + err("missing space around assignment operator"); + } + } + if (/[,;]\S/ && !/\bfor \(;;\)/) { + err("comma or semicolon followed by non-blank"); + } + # allow "for" statements to have empty "while" clauses + if (/\s[,;]/ && !/^[\t]+;$/ && !/^\s*for \([^;]*; ;[^;]*\)/) { + err("comma or semicolon preceded by blank"); + } + if (/^\s*(&&|\|\|)/) { + err("improper boolean continuation"); + } + if (/\S *(&&|\|\|)/ || /(&&|\|\|) *\S/) { + err("more than one space around boolean operator"); + } + if (/\b(for|if|while|switch|sizeof|return|case)\(/) { + err("missing space between keyword and paren"); + } + if (/(\b(for|if|while|switch|return)\b.*){2,}/ && !/^#define/) { + # multiple "case" and "sizeof" allowed + err("more than one keyword on line"); + } + if (/\b(for|if|while|switch|sizeof|return|case)\s\s+\(/ && + !/^#if\s+\(/) { + err("extra space between keyword and paren"); + } + # try to detect "func (x)" but not "if (x)" or + # "#define foo (x)" or "int (*func)();" + if (/\w\s\(/) { + my $s = $_; + # strip off all keywords on the line + s/\b(for|if|while|switch|return|case|sizeof)\s\(/XXX(/g; + s/#elif\s\(/XXX(/g; + s/^#define\s+\w+\s+\(/XXX(/; + # do not match things like "void (*f)();" + # or "typedef void (func_t)();" + s/\w\s\(+\*/XXX(*/g; + s/\b($typename|void)\s+\(+/XXX(/og; + if (/\w\s\(/) { + err("extra space between function name and left paren"); + } + $_ = $s; + } + # try to detect "int foo(x)", but not "extern int foo(x);" + # XXX - this still trips over too many legitimate things, + # like "int foo(x,\n\ty);" +# if (/^(\w+(\s|\*)+)+\w+\(/ && !/\)[;,](\s|)*$/ && +# !/^(extern|static)\b/) { +# err("return type of function not on separate line"); +# } + # this is a close approximation + if (/^(\w+(\s|\*)+)+\w+\(.*\)(\s|)*$/ && + !/^(extern|static)\b/) { + err("return type of function not on separate line"); + } + if (/^#define /) { + err("#define followed by space instead of tab"); + } + if (/^\s*return\W[^;]*;/ && !/^\s*return\s*\(.*\);/) { + err("unparenthesized return expression"); + } + if (/\bsizeof\b/ && !/\bsizeof\s*\(.*\)/) { + err("unparenthesized sizeof expression"); + } + if (/\(\s/) { + err("whitespace after left paren"); + } + # Allow "for" statements to have empty "continue" clauses. + # Allow right paren on its own line unless we're being picky (-p). 
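+ # e.g. "for (i = 0; i < n; )" and a right paren alone on a
+ # line are accepted (the latter only without -p), but "foo(a )"
+ # is always flagged.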
+ if (/\s\)/ && !/^\s*for \([^;]*;[^;]*; \)/ && ($picky || !/^\s*\)/)) { + err("whitespace before right paren"); + } + if (/^\s*\(void\)[^ ]/) { + err("missing space after (void) cast"); + } + if (/\S\{/ && !/\{\{/) { + err("missing space before left brace"); + } + if ($in_function && /^\s+\{/ && + ($prev =~ /\)\s*$/ || $prev =~ /\bstruct\s+\w+$/)) { + err("left brace starting a line"); + } + if (/\}(else|while)/) { + err("missing space after right brace"); + } + if (/\}\s\s+(else|while)/) { + err("extra space after right brace"); + } + if (/\b_VOID\b|\bVOID\b|\bSTATIC\b/) { + err("obsolete use of VOID or STATIC"); + } + if (/\b$typename\*/o) { + err("missing space between type name and *"); + } + if (/^\s+#/) { + err("preprocessor statement not in column 1"); + } + if (/^#\s/) { + err("blank after preprocessor #"); + } + if (/!\s*(strcmp|strncmp|bcmp)\s*\(/) { + err("don't use boolean ! with comparison functions"); + } + + # + # We completely ignore, for purposes of indentation: + # * lines outside of functions + # * preprocessor lines + # + if ($check_continuation && $in_function && !$in_cpp) { + process_indent($_); + } + if ($picky) { + # try to detect spaces after casts, but allow (e.g.) + # "sizeof (int) + 1", "void (*funcptr)(int) = foo;", and + # "int foo(int) __NORETURN;" + if ((/^\($typename( \*+)?\)\s/o || + /\W\($typename( \*+)?\)\s/o) && + !/sizeof\s*\($typename( \*)?\)\s/o && + !/\($typename( \*+)?\)\s+=[^=]/o) { + err("space after cast"); + } + if (/\b$typename\s*\*\s/o && + !/\b$typename\s*\*\s+const\b/o) { + err("unary * followed by space"); + } + } + if ($check_posix_types) { + # try to detect old non-POSIX types. + # POSIX requires all non-standard typedefs to end in _t, + # but historically these have been used. + if (/\b(unchar|ushort|uint|ulong|u_int|u_short|u_long|u_char|quad)\b/) { + err("non-POSIX typedef $1 used: use $old2posix{$1} instead"); + } + } + if ($heuristic) { + # cannot check this everywhere due to "struct {\n...\n} foo;" + if ($in_function && !$in_declaration && + /\}./ && !/\}\s+=/ && !/\{.*\}[;,]$/ && !/\}(\s|)*$/ && + !/\} (else|while)/ && !/\}\}/) { + err("possible bad text following right brace"); + } + # cannot check this because sub-blocks in + # the middle of code are ok + if ($in_function && /^\s+\{/) { + err("possible left brace starting a line"); + } + } + if (/^\s*else\W/) { + if ($prev =~ /^\s*\}$/) { + err_prefix($prev, + "else and right brace should be on same line"); + } + } + $prev = $line; +} + +if ($prev eq "") { + err("last line in file is blank"); +} + +} + +# +# Continuation-line checking +# +# The rest of this file contains the code for the continuation checking +# engine. It's a pretty simple state machine which tracks the expression +# depth (unmatched '('s and '['s). +# +# Keep in mind that the argument to process_indent() has already been heavily +# processed; all comments have been replaced by control-A, and the contents of +# strings and character constants have been elided. +# + +my $cont_in; # currently inside of a continuation +my $cont_off; # skipping an initializer or definition +my $cont_noerr; # suppress cascading errors +my $cont_start; # the line being continued +my $cont_base; # the base indentation +my $cont_first; # this is the first line of a statement +my $cont_multiseg; # this continuation has multiple segments + +my $cont_special; # this is a C statement (if, for, etc.) 
+my $cont_macro; # this is a macro +my $cont_case; # this is a multi-line case + +my @cont_paren; # the stack of unmatched ( and [s we've seen + +sub +reset_indent() +{ + $cont_in = 0; + $cont_off = 0; +} + +sub +delabel($) +{ + # + # replace labels with tabs. Note that there may be multiple + # labels on a line. + # + local $_ = $_[0]; + + while (/^(\t*)( *(?:(?:\w+\s*)|(?:case\b[^:]*)): *)(.*)$/) { + my ($pre_tabs, $label, $rest) = ($1, $2, $3); + $_ = $pre_tabs; + while ($label =~ s/^([^\t]*)(\t+)//) { + $_ .= "\t" x (length($2) + length($1) / 8); + } + $_ .= ("\t" x (length($label) / 8)).$rest; + } + + return ($_); +} + +sub +process_indent($) +{ + require strict; + local $_ = $_[0]; # preserve the global $_ + + s///g; # No comments + s/\s+$//; # Strip trailing whitespace + + return if (/^$/); # skip empty lines + + # regexps used below; keywords taking (), macros, and continued cases + my $special = '(?:(?:\}\s*)?else\s+)?(?:if|for|while|switch)\b'; + my $macro = '[A-Z_][A-Z_0-9]*\('; + my $case = 'case\b[^:]*$'; + + # skip over enumerations, array definitions, initializers, etc. + if ($cont_off <= 0 && !/^\s*$special/ && + (/(?:(?:\b(?:enum|struct|union)\s*[^\{]*)|(?:\s+=\s*))\{/ || + (/^\s*\{/ && $prev =~ /=\s*(?:\/\*.*\*\/\s*)*$/))) { + $cont_in = 0; + $cont_off = tr/{/{/ - tr/}/}/; + return; + } + if ($cont_off) { + $cont_off += tr/{/{/ - tr/}/}/; + return; + } + + if (!$cont_in) { + $cont_start = $line; + + if (/^\t* /) { + err("non-continuation indented 4 spaces"); + $cont_noerr = 1; # stop reporting + } + $_ = delabel($_); # replace labels with tabs + + # check if the statement is complete + return if (/^\s*\}?$/); + return if (/^\s*\}?\s*else\s*\{?$/); + return if (/^\s*do\s*\{?$/); + return if (/\{$/); + return if (/\}[,;]?$/); + + # Allow macros on their own lines + return if (/^\s*[A-Z_][A-Z_0-9]*$/); + + # cases we don't deal with, generally non-kosher + if (/\{/) { + err("stuff after {"); + return; + } + + # Get the base line, and set up the state machine + /^(\t*)/; + $cont_base = $1; + $cont_in = 1; + @cont_paren = (); + $cont_first = 1; + $cont_multiseg = 0; + + # certain things need special processing + $cont_special = /^\s*$special/? 1 : 0; + $cont_macro = /^\s*$macro/? 1 : 0; + $cont_case = /^\s*$case/? 1 : 0; + } else { + $cont_first = 0; + + # Strings may be pulled back to an earlier (half-)tabstop + unless ($cont_noerr || /^$cont_base / || + (/^\t*(?: )?(?:gettext\()?\"/ && !/^$cont_base\t/)) { + err_prefix($cont_start, + "continuation should be indented 4 spaces"); + } + } + + my $rest = $_; # keeps the remainder of the line + + # + # The split matches 0 characters, so that each 'special' character + # is processed separately. Parens and brackets are pushed and + # popped off the @cont_paren stack. For normal processing, we wait + # until a ; or { terminates the statement. "special" processing + # (if/for/while/switch) is allowed to stop when the stack empties, + # as is macro processing. Case statements are terminated with a : + # and an empty paren stack. 
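+ # For example, "while (a &&" pushes one '(' and keeps the
+ # statement open; when a later segment supplies the matching ')'
+ # the stack empties and, since this is a "special" statement,
+ # the continuation ends.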
+ # + foreach $_ (split /[^\(\)\[\]\{\}\;\:]*/) { + next if (length($_) == 0); + + # rest contains the remainder of the line + my $rxp = "[^\Q$_\E]*\Q$_\E"; + $rest =~ s/^$rxp//; + + if (/\(/ || /\[/) { + push @cont_paren, $_; + } elsif (/\)/ || /\]/) { + my $cur = $_; + tr/\)\]/\(\[/; + + my $old = (pop @cont_paren); + if (!defined($old)) { + err("unexpected '$cur'"); + $cont_in = 0; + last; + } elsif ($old ne $_) { + err("'$cur' mismatched with '$old'"); + $cont_in = 0; + last; + } + + # + # If the stack is now empty, do special processing + # for if/for/while/switch and macro statements. + # + next if (@cont_paren != 0); + if ($cont_special) { + if ($rest =~ /^\s*\{?$/) { + $cont_in = 0; + last; + } + if ($rest =~ /^\s*;$/) { + err("empty if/for/while body ". + "not on its own line"); + $cont_in = 0; + last; + } + if (!$cont_first && $cont_multiseg == 1) { + err_prefix($cont_start, + "multiple statements continued ". + "over multiple lines"); + $cont_multiseg = 2; + } elsif ($cont_multiseg == 0) { + $cont_multiseg = 1; + } + # We've finished this section, start + # processing the next. + goto section_ended; + } + if ($cont_macro) { + if ($rest =~ /^$/) { + $cont_in = 0; + last; + } + } + } elsif (/\;/) { + if ($cont_case) { + err("unexpected ;"); + } elsif (!$cont_special) { + err("unexpected ;") if (@cont_paren != 0); + if (!$cont_first && $cont_multiseg == 1) { + err_prefix($cont_start, + "multiple statements continued ". + "over multiple lines"); + $cont_multiseg = 2; + } elsif ($cont_multiseg == 0) { + $cont_multiseg = 1; + } + if ($rest =~ /^$/) { + $cont_in = 0; + last; + } + if ($rest =~ /^\s*special/) { + err("if/for/while/switch not started ". + "on its own line"); + } + goto section_ended; + } + } elsif (/\{/) { + err("{ while in parens/brackets") if (@cont_paren != 0); + err("stuff after {") if ($rest =~ /[^\s}]/); + $cont_in = 0; + last; + } elsif (/\}/) { + err("} while in parens/brackets") if (@cont_paren != 0); + if (!$cont_special && $rest !~ /^\s*(while|else)\b/) { + if ($rest =~ /^$/) { + err("unexpected }"); + } else { + err("stuff after }"); + } + $cont_in = 0; + last; + } + } elsif (/\:/ && $cont_case && @cont_paren == 0) { + err("stuff after multi-line case") if ($rest !~ /$^/); + $cont_in = 0; + last; + } + next; +section_ended: + # End of a statement or if/while/for loop. Reset + # cont_special and cont_macro based on the rest of the + # line. + $cont_special = ($rest =~ /^\s*$special/)? 1 : 0; + $cont_macro = ($rest =~ /^\s*$macro/)? 
1 : 0; + $cont_case = 0; + next; + } + $cont_noerr = 0 if (!$cont_in); +} diff --git a/scripts/dkms.mkconf b/scripts/dkms.mkconf new file mode 100755 index 000000000000..28d9609f721b --- /dev/null +++ b/scripts/dkms.mkconf @@ -0,0 +1,120 @@ +#!/bin/sh + +PROG=$0 + +pkgcfg=/etc/sysconfig/zfs + +while getopts "n:v:c:f:" opt; do + case $opt in + n) pkgname=$OPTARG ;; + v) pkgver=$OPTARG ;; + c) pkgcfg=$OPTARG ;; + f) filename=$OPTARG ;; + esac +done + +if [ -z "${pkgname}" ] || [ -z "${pkgver}" ] || [ -z "${filename}" ]; then + echo "Usage: $PROG -n -v -c -f " + exit 1 +fi + +cat >${filename} < -k -n " \ + "-t -v " + exit 1 +fi + +cp "${tree}/${pkgname}/${pkgver}/build/zfs_config.h" \ + "${tree}/${pkgname}/${pkgver}/build/module/Module.symvers" \ + "${tree}/${pkgname}/${pkgver}/${kver}/${arch}/" diff --git a/scripts/enum-extract.pl b/scripts/enum-extract.pl new file mode 100755 index 000000000000..5112cc807f67 --- /dev/null +++ b/scripts/enum-extract.pl @@ -0,0 +1,58 @@ +#!/usr/bin/perl -w + +my $usage = <) { + # comments + s/\/\*.*\*\///; + if (m/\/\*/) { + while ($_ .= <>) { + last if s/\/\*.*\*\///s; + } + } + + # preprocessor stuff + next if /^#/; + + # find our enum + $in_enum = 1 if s/^\s*enum\s+${enum}(?:\s|$)//; + next unless $in_enum; + + # remove explicit values + s/\s*=[^,]+,/,/g; + + # extract each identifier + while (m/\b([a-z_][a-z0-9_]*)\b/ig) { + print $1, "\n"; + } + + # + # don't exit: there may be multiple versions of the same enum, e.g. + # inside different #ifdef blocks. Let's explicitly return all of + # them and let external tooling deal with it. + # + $in_enum = 0 if m/}\s*;/; +} + +exit 0; diff --git a/scripts/kmodtool b/scripts/kmodtool new file mode 100755 index 000000000000..240cde3106a7 --- /dev/null +++ b/scripts/kmodtool @@ -0,0 +1,611 @@ +#!/usr/bin/env bash + +# kmodtool - Helper script for building kernel module RPMs +# Copyright (c) 2003-2012 Ville Skyttä , +# Thorsten Leemhuis +# Nicolas Chauvet +# +# Permission is hereby granted, free of charge, to any person obtaining +# a copy of this software and associated documentation files (the +# "Software"), to deal in the Software without restriction, including +# without limitation the rights to use, copy, modify, merge, publish, +# distribute, sublicense, and/or sell copies of the Software, and to +# permit persons to whom the Software is furnished to do so, subject to +# the following conditions: +# +# The above copyright notice and this permission notice shall be +# included in all copies or substantial portions of the Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +# LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +# OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +# WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
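+#
+# Illustrative invocation (the long options are parsed below; the
+# repo name "rpmfusion" is only an example):
+#
+#   kmodtool --target x86_64 --repo rpmfusion --kmodname zfs --devel
+#
+# The generated RPM spec template fragments are written to stdout.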
+ +shopt -s extglob + +myprog="kmodtool-${repo}" +myver="0.12.1" + +kmodname= +build_kernels="current" +kernels_known_variants= +kernel_versions= +kernel_versions_to_build_for= +prefix= +filterfile= +target= +buildroot= + +error_out() +{ + local errorlevel=${1} + shift + echo "Error: $@" >&2 + # the next line is not multi-line safe -- not needed *yet* + echo "%global kmodtool_check echo \"kmodtool error: $@\"; exit ${errorlevel};" + exit ${errorlevel} +} + +print_rpmtemplate_header() +{ + echo + echo '%global kmodinstdir_prefix '${prefix}/lib/modules/ + echo '%global kmodinstdir_postfix '/extra/${kmodname}/ + echo '%global kernel_versions '${kernel_versions} + echo +} + +print_akmodtemplate () +{ + echo + cat <= %{?epoch:%{epoch}:}%{version} +Provides: ${kmodname}-kmod = %{?epoch:%{epoch}:}%{version}-%{release} +EOF + + if [[ ${obsolete_name} ]]; then + echo "Provides: akmod-${obsolete_name} = ${obsolete_version}" + echo "Obsoletes: akmod-${obsolete_name} < ${obsolete_version}" + fi + + cat < /dev/null & + +%files -n akmod-${kmodname} +%defattr(-,root,root,-) +%{_usrsrc}/akmods/* + +EOF +} + +print_akmodmeta () +{ + cat <= %{?epoch:%{epoch}:}%{version} + +%if 0%{?rhel} == 6 || 0%{?centos} == 6 +Requires(post): module-init-tools +Requires(postun): module-init-tools +%else +Requires(post): kmod +Requires(postun): kmod +%endif +EOF + + if [[ ${obsolete_name} ]]; then + echo "Provides: kmod-${obsolete_name}-${kernel_uname_r} = ${obsolete_version}" + echo "Obsoletes: kmod-${obsolete_name}-${kernel_uname_r} < ${obsolete_version}" + fi + + # second part + if [[ ! "${customkernel}" ]]; then + cat < /dev/null || : +%postun -n kmod-${kmodname}-${kernel_uname_r} +${prefix}${depmod_path} -aF /boot/System.map-${kernel_uname_r} ${kernel_uname_r} &> /dev/null || : + +EOF + else + cat < /dev/null || : +%postun -n kmod-${kmodname}-${kernel_uname_r} +[[ "\$(uname -r)" == "${kernel_uname_r}" ]] && ${prefix}${depmod_path} -a > /dev/null || : + +EOF + fi + + # third part + cat <= %{?epoch:%{epoch}:}%{version}-%{release}" + fi + + if [[ ${obsolete_name} ]]; then + echo "Provides: kmod-${obsolete_name}-devel = ${obsolete_version}" + echo "Obsoletes: kmod-${obsolete_name}-devel < ${obsolete_version}" + fi + + cat < objects for the newest kernel. 
+ +%files -n kmod-${kmodname}-devel +%defattr(644,root,root,755) +%{_usrsrc}/${kmodname}-%{version} +EOF + if [[ ${obsolete_name} ]]; then + echo "%{_usrsrc}/${obsolete_name}-%{version}" + fi + + for kernel in ${1}; do + local kernel_uname_r=${kernel} + echo "%exclude %{_usrsrc}/${kmodname}-%{version}/${kernel_uname_r}" + if [[ ${obsolete_name} ]]; then + echo "%exclude %{_usrsrc}/${obsolete_name}-%{version}/${kernel_uname_r}" + fi + done + + echo + echo +} + +print_rpmtemplate_per_kmoddevelpkg () +{ + if [[ "${1}" == "--custom" ]]; then + shift + local customkernel=true + elif [[ "${1}" == "--redhat" ]]; then + # this is needed for akmods + shift + local redhatkernel=true + fi + + local kernel_uname_r=${1} + local kernel_variant="${2:+-${2}}" + + # first part + cat <= %{?epoch:%{epoch}:}%{version}-%{release} +%{?KmodsMetaRequires:Requires: %{?KmodsMetaRequires}} +EOF + + if [[ ${obsolete_name} ]]; then + echo "Provides: kmod-${obsolete_name}${kernel_variant} = ${obsolete_version}" + echo "Obsoletes: kmod-${obsolete_name}${kernel_variant} < ${obsolete_version}" + fi + + cat < -- filter the results with grep --file " + echo " --for-kernels -- created templates only for these kernels" + echo " --kmodname -- name of the kmod (required)" + echo " --devel -- make kmod-devel package" + echo " --noakmod -- no akmod package" + echo " --repo -- use buildsys-build--kerneldevpkgs" + echo " --target -- target-arch (required)" + echo " --buildroot
-- Build root (place to look for build files)" +} + +while [ "${1}" ] ; do + case "${1}" in + --filterfile) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide path to a filter-file together with --filterfile" >&2 + elif [[ ! -e "${1}" ]]; then + error_out 2 "Filterfile ${1} not found" >&2 + fi + filterfile="${1}" + shift + ;; + --kmodname) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide the name of the kmod together with --kmodname" >&2 + fi + # strip pending -kmod + kmodname="${1%%-kmod}" + shift + ;; + --devel) + shift + devel="true" + ;; + --prefix) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide a prefix with --prefix" >&2 + fi + prefix="${1}" + shift + ;; + --repo) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide the name of the repo together with --repo" >&2 + fi + repo=${1} + shift + ;; + --for-kernels) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide the name of the kmod together with --kmodname" >&2 + fi + for_kernels="${1}" + shift + ;; + --noakmod) + shift + noakmod="true" + ;; + --obsolete-name) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide the name of the kmod to obsolete together with --obsolete-name" >&2 + fi + obsolete_name="${1}" + shift + ;; + --obsolete-version) + shift + if [[ ! "${1}" ]] ; then + error_out 2 "Please provide the version of the kmod to obsolete together with --obsolete-version" >&2 + fi + obsolete_version="${1}" + shift + ;; + --target) + shift + target="${1}" + shift + ;; + --akmod) + shift + build_kernels="akmod" + ;; + --newest) + shift + build_kernels="newest" + ;; + --current) + shift + build_kernels="current" + ;; + --buildroot) + shift + buildroot="${1}" + shift + ;; + --help) + myprog_help + exit 0 + ;; + --version) + echo "${myprog} ${myver}" + exit 0 + ;; + *) + echo "Error: Unknown option '${1}'." >&2 + usage >&2 + exit 2 + ;; + esac +done + +if [[ -e ./kmodtool-kernel-variants ]]; then + kernels_known_variants="$(cat ./kmodtool-kernel-variants)" +elif [[ -e /usr/share/kmodtool/kernel-variants ]] ; then + kernels_known_variants="$(cat /usr/share/kmodtool/kernel-variants)" +else + kernels_known_variants="@(smp?(-debug)|PAE?(-debug)|debug|kdump|xen|kirkwood|highbank|imx|omap|tegra)" +fi + +# general sanity checks +if [[ ! "${target}" ]]; then + error_out 2 "please pass target arch with --target" +elif [[ ! "${kmodname}" ]]; then + error_out 2 "please pass kmodname with --kmodname" +elif [[ ! "${kernels_known_variants}" ]] ; then + error_out 2 "could not determine known variants" +elif ( [[ "${obsolete_name}" ]] && [[ ! "${obsolete_version}" ]] ) || ( [[ ! "${obsolete_name}" ]] && [[ "${obsolete_version}" ]] ) ; then + error_out 2 "you need to provide both --obsolete-name and --obsolete-version" +fi + +# go +if [[ "${for_kernels}" ]]; then + # this is easy: + print_customrpmtemplate "${for_kernels}" +elif [[ "${build_kernels}" == "akmod" ]]; then + # do only a akmod package + print_akmodtemplate + print_akmodmeta +else + # seems we are on out own to decide for which kernels to build + + # we need more sanity checks in this case + if [[ ! "${repo}" ]]; then + error_out 2 "please provide repo name with --repo" + elif ! $(which buildsys-build-${repo}-kerneldevpkgs &> /dev/null) ; then + error_out 2 "buildsys-build-${repo}-kerneldevpkgs not found" + fi + + # call buildsys-build-${repo}-kerneldevpkgs to get the list of kernels + cmdoptions="--target ${target}" + + # filterfile to filter list of kernels? 
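+ # (the file is handed straight through to
+ # buildsys-build-${repo}-kerneldevpkgs below)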
+ if [[ "${filterfile}" ]] ; then + cmdoptions="${cmdoptions} --filterfile ${filterfile}" + fi + + kernel_versions_to_build_for="$(buildsys-build-${repo}-kerneldevpkgs --${build_kernels} ${cmdoptions})" + returncode=$? + if (( ${returncode} != 0 )); then + error_out 2 "buildsys-build-${repo}-kerneldevpkgs failed: $(buildsys-build-${repo}-kerneldevpkgs --${build_kernels} ${cmdoptions})" + fi + + if [[ "${build_kernels}" == "current" ]] && [[ ! "${noakmod}" ]]; then + print_akmodtemplate + fi + + print_rpmtemplate +fi diff --git a/scripts/make_gitrev.sh b/scripts/make_gitrev.sh new file mode 100755 index 000000000000..f5c743b13799 --- /dev/null +++ b/scripts/make_gitrev.sh @@ -0,0 +1,78 @@ +#!/bin/sh + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# Copyright (c) 2018 by Delphix. All rights reserved. +# Copyright (c) 2018 by Matthew Thode. All rights reserved. + +# +# Generate zfs_gitrev.h. Note that we need to do this for every +# invocation of `make`, including for incremental builds. Therefore we +# can't use a zfs_gitrev.h.in file which would be processed only when +# `configure` is run. +# + +set -e -u + +dist=no +distdir=. +while getopts D: flag +do + case $flag in + \?) echo "Usage: $0 [-D distdir] [file]" >&2; exit 1;; + D) dist=yes; distdir=${OPTARG};; + esac +done +shift $((OPTIND - 1)) + +top_srcdir="$(dirname "$0")/.." +GITREV="${1:-include/zfs_gitrev.h}" + +# GITREV should be a relative path (relative to top_builddir or distdir) +case "${GITREV}" in + /*) echo "Error: ${GITREV} should be a relative path" >&2 + exit 1;; +esac + +ZFS_GITREV=$({ cd "${top_srcdir}" && + git describe --always --long --dirty 2>/dev/null; } || :) + +if [ "x${ZFS_GITREV}" = x ] +then + # If the source directory is not a git repository, check if the file + # already exists (in the source) + if [ -f "${top_srcdir}/${GITREV}" ] + then + ZFS_GITREV="$(sed -n \ + '1s/^#define[[:blank:]]ZFS_META_GITREV "\([^"]*\)"$/\1/p' \ + "${top_srcdir}/${GITREV}")" + fi +elif [ ${dist} = yes ] +then + # Append -dist when creating distributed sources from a git repository + ZFS_GITREV="${ZFS_GITREV}-dist" +fi +ZFS_GITREV=${ZFS_GITREV:-unknown} + +GITREVTMP="${GITREV}~" +printf '#define\tZFS_META_GITREV "%s"\n' "${ZFS_GITREV}" >"${GITREVTMP}" +GITREV="${distdir}/${GITREV}" +if cmp -s "${GITREV}" "${GITREVTMP}" +then + rm -f "${GITREVTMP}" +else + mv -f "${GITREVTMP}" "${GITREV}" +fi diff --git a/scripts/man-dates.sh b/scripts/man-dates.sh new file mode 100755 index 000000000000..186d94639a56 --- /dev/null +++ b/scripts/man-dates.sh @@ -0,0 +1,12 @@ +#!/bin/sh + +# This script updates the date lines in the man pages to the date of the last +# commit to that file. + +set -eu + +find man -type f | while read -r i ; do + git_date=$(git log -1 --date=short --format="%ad" -- "$i") + [ "x$git_date" = "x" ] && continue + sed -i "s|^\.Dd.*|.Dd $(date -d "$git_date" "+%B %-d, %Y")|" "$i" +done diff --git a/scripts/paxcheck.sh b/scripts/paxcheck.sh new file mode 100755 index 000000000000..87e817500b1b --- /dev/null +++ b/scripts/paxcheck.sh @@ -0,0 +1,44 @@ +#!/bin/sh + +# shellcheck disable=SC2039 +if ! 
type scanelf > /dev/null 2>&1; then + echo "scanelf (from pax-utils) is required for these checks." >&2 + exit 3 +fi + +RET=0 + +# check for exec stacks +OUT=$(scanelf -qyRAF '%e %p' "$1") + +if [ x"${OUT}" != x ]; then + RET=2 + echo "The following files contain writable and executable sections" + echo " Files with such sections will not work properly (or at all!) on some" + echo " architectures/operating systems." + echo " For more information, see:" + echo " https://wiki.gentoo.org/wiki/Hardened/GNU_stack_quickstart" + echo + echo "${OUT}" + echo +fi + + +# check for TEXTRELS +OUT=$(scanelf -qyRAF '%T %p' "$1") + +if [ x"${OUT}" != x ]; then + RET=2 + echo "The following files contain runtime text relocations" + echo " Text relocations force the dynamic linker to perform extra" + echo " work at startup, waste system resources, and may pose a security" + echo " risk. On some architectures, the code may not even function" + echo " properly, if at all." + echo " For more information, see:" + echo " https://wiki.gentoo.org/wiki/Hardened/HOWTO_locate_and_fix_textrels" + echo + echo "${OUT}" + echo +fi + +exit $RET diff --git a/scripts/zfs-helpers.sh b/scripts/zfs-helpers.sh new file mode 100755 index 000000000000..02b4922006f2 --- /dev/null +++ b/scripts/zfs-helpers.sh @@ -0,0 +1,194 @@ +#!/bin/sh +# +# This script is designed to facilitate in-tree development and testing +# by installing symlinks on your system which refer to in-tree helper +# utilities. These helper utilities must be installed to in order to +# exercise all ZFS functionality. By using symbolic links and keeping +# the scripts in-tree during development they can be easily modified +# and those changes tracked. +# +# Use the following configuration option to override the installation +# paths for these scripts. The correct path is automatically set for +# most distributions but you can optionally set it for your environment. +# +# --with-mounthelperdir=DIR install mount.zfs in dir [/sbin] +# --with-udevdir=DIR install udev helpers [default=check] +# --with-udevruledir=DIR install udev rules [default=UDEVDIR/rules.d] +# --sysconfdir=DIR install zfs configuration files [PREFIX/etc] +# + +BASE_DIR=$(dirname "$0") +SCRIPT_COMMON=common.sh +if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then + . "${BASE_DIR}/${SCRIPT_COMMON}" +else + echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 +fi + +PROG=zfs-helpers.sh +DRYRUN="no" +INSTALL="no" +REMOVE="no" +VERBOSE="no" + +fail() { + echo "${PROG}: $1" >&2 + exit 1 +} + +msg() { + if [ "$VERBOSE" = "yes" ]; then + echo "$@" + fi +} + +usage() { +cat << EOF +USAGE: +$0 [dhirv] + +DESCRIPTION: + Install/remove the ZFS helper utilities. + +OPTIONS: + -d Dry run + -h Show this message + -i Install the helper utilities + -r Remove the helper utilities + -v Verbose + +$0 -iv +$0 -r + +EOF +} + +while getopts 'hdirv' OPTION; do + case $OPTION in + h) + usage + exit 1 + ;; + d) + DRYRUN="yes" + ;; + i) + INSTALL="yes" + ;; + r) + REMOVE="yes" + ;; + v) + VERBOSE="yes" + ;; + ?) 
+ usage + exit + ;; + esac +done + +if [ "$INSTALL" = "yes" ] && [ "$REMOVE" = "yes" ]; then + fail "Specify -i or -r but not both" +fi + +if [ "$INSTALL" = "no" ] && [ "$REMOVE" = "no" ]; then + fail "Either -i or -r must be specified" +fi + +if [ "$(id -u)" != "0" ]; then + fail "Must run as root" +fi + +if [ "$INTREE" != "yes" ]; then + fail "Must be run in-tree" +fi + +if [ "$VERBOSE" = "yes" ]; then + echo "--- Configuration ---" + echo "udevdir: $INSTALL_UDEV_DIR" + echo "udevruledir: $INSTALL_UDEV_RULE_DIR" + echo "mounthelperdir: $INSTALL_MOUNT_HELPER_DIR" + echo "sysconfdir: $INSTALL_SYSCONF_DIR" + echo "pythonsitedir: $INSTALL_PYTHON_DIR" + echo "dryrun: $DRYRUN" + echo +fi + +install() { + src=$1 + dst=$2 + + if [ -h "$dst" ]; then + echo "Symlink exists: $dst" + elif [ -e "$dst" ]; then + echo "File exists: $dst" + elif [ ! -e "$src" ]; then + echo "Source missing: $src" + else + msg "ln -s $src $dst" + + if [ "$DRYRUN" = "no" ]; then + DIR=$(dirname "$dst") + mkdir -p "$DIR" >/dev/null 2>&1 + ln -s "$src" "$dst" + fi + fi +} + +remove() { + dst=$1 + + if [ -h "$dst" ]; then + msg "rm $dst" + rm "$dst" + DIR=$(dirname "$dst") + rmdir "$DIR" >/dev/null 2>&1 + elif [ -e "$dst" ]; then + echo "Expected symlink: $dst" + fi +} + +if [ "${INSTALL}" = "yes" ]; then + install "$CMD_DIR/mount_zfs/mount.zfs" \ + "$INSTALL_MOUNT_HELPER_DIR/mount.zfs" + install "$CMD_DIR/fsck_zfs/fsck.zfs" \ + "$INSTALL_MOUNT_HELPER_DIR/fsck.zfs" + install "$CMD_DIR/zvol_id/zvol_id" \ + "$INSTALL_UDEV_DIR/zvol_id" + install "$CMD_DIR/vdev_id/vdev_id" \ + "$INSTALL_UDEV_DIR/vdev_id" + install "$UDEV_RULE_DIR/60-zvol.rules" \ + "$INSTALL_UDEV_RULE_DIR/60-zvol.rules" + install "$UDEV_RULE_DIR/69-vdev.rules" \ + "$INSTALL_UDEV_RULE_DIR/69-vdev.rules" + install "$UDEV_RULE_DIR/90-zfs.rules" \ + "$INSTALL_UDEV_RULE_DIR/90-zfs.rules" + install "$CMD_DIR/zpool/zpool.d" \ + "$INSTALL_SYSCONF_DIR/zfs/zpool.d" + install "$CONTRIB_DIR/pyzfs/libzfs_core" \ + "$INSTALL_PYTHON_DIR/libzfs_core" + # Ideally we would install these in the configured ${libdir}, which is + # by default "/usr/local/lib and unfortunately not included in the + # dynamic linker search path. + install "$(find "$LIB_DIR/libzfs_core" -type f -name 'libzfs_core.so*')" \ + "/lib/libzfs_core.so" + install "$(find "$LIB_DIR/libnvpair" -type f -name 'libnvpair.so*')" \ + "/lib/libnvpair.so" + ldconfig +else + remove "$INSTALL_MOUNT_HELPER_DIR/mount.zfs" + remove "$INSTALL_MOUNT_HELPER_DIR/fsck.zfs" + remove "$INSTALL_UDEV_DIR/zvol_id" + remove "$INSTALL_UDEV_DIR/vdev_id" + remove "$INSTALL_UDEV_RULE_DIR/60-zvol.rules" + remove "$INSTALL_UDEV_RULE_DIR/69-vdev.rules" + remove "$INSTALL_UDEV_RULE_DIR/90-zfs.rules" + remove "$INSTALL_SYSCONF_DIR/zfs/zpool.d" + remove "$INSTALL_PYTHON_DIR/libzfs_core" + remove "/lib/libzfs_core.so" + remove "/lib/libnvpair.so" + ldconfig +fi + +exit 0 diff --git a/scripts/zfs-tests.sh b/scripts/zfs-tests.sh new file mode 100755 index 000000000000..ae927691139f --- /dev/null +++ b/scripts/zfs-tests.sh @@ -0,0 +1,706 @@ +#!/bin/sh +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License, Version 1.0 only +# (the "License"). You may not use this file except in compliance +# with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +BASE_DIR=$(dirname "$0") +SCRIPT_COMMON=common.sh +if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then +. "${BASE_DIR}/${SCRIPT_COMMON}" +else +echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 +fi + +PROG=zfs-tests.sh +VERBOSE="no" +QUIET="" +CLEANUP="yes" +CLEANUPALL="no" +LOOPBACK="yes" +STACK_TRACER="no" +FILESIZE="4G" +DEFAULT_RUNFILES="common.run,$(uname | tr '[:upper:]' '[:lower:]').run" +RUNFILES=${RUNFILES:-$DEFAULT_RUNFILES} +FILEDIR=${FILEDIR:-/var/tmp} +DISKS=${DISKS:-""} +SINGLETEST="" +SINGLETESTUSER="root" +TAGS="" +ITERATIONS=1 +ZFS_DBGMSG="$STF_SUITE/callbacks/zfs_dbgmsg.ksh" +ZFS_DMESG="$STF_SUITE/callbacks/zfs_dmesg.ksh" +UNAME=$(uname -s) + +# Override some defaults if on FreeBSD +if [ "$UNAME" = "FreeBSD" ] ; then + TESTFAIL_CALLBACKS=${TESTFAIL_CALLBACKS:-"$ZFS_DMESG"} + LOSETUP=/sbin/mdconfig + DMSETUP=/sbin/gpart +else + ZFS_MMP="$STF_SUITE/callbacks/zfs_mmp.ksh" + TESTFAIL_CALLBACKS=${TESTFAIL_CALLBACKS:-"$ZFS_DBGMSG:$ZFS_DMESG:$ZFS_MMP"} + LOSETUP=${LOSETUP:-/sbin/losetup} + DMSETUP=${DMSETUP:-/sbin/dmsetup} +fi + +# +# Log an informational message when additional verbosity is enabled. +# +msg() { + if [ "$VERBOSE" = "yes" ]; then + echo "$@" + fi +} + +# +# Log a failure message, cleanup, and return an error. +# +fail() { + echo "$PROG: $1" >&2 + cleanup + exit 1 +} + +cleanup_freebsd_loopback() { + for TEST_LOOPBACK in ${LOOPBACKS}; do + if [ -c "/dev/${TEST_LOOPBACK}" ]; then + sudo "${LOSETUP}" -d -u "${TEST_LOOPBACK}" || + echo "Failed to destroy: ${TEST_LOOPBACK}" + fi + done +} + +cleanup_linux_loopback() { + for TEST_LOOPBACK in ${LOOPBACKS}; do + LOOP_DEV=$(basename "$TEST_LOOPBACK") + DM_DEV=$(sudo "${DMSETUP}" ls 2>/dev/null | \ + grep "${LOOP_DEV}" | cut -f1) + + if [ -n "$DM_DEV" ]; then + sudo "${DMSETUP}" remove "${DM_DEV}" || + echo "Failed to remove: ${DM_DEV}" + fi + + if [ -n "${TEST_LOOPBACK}" ]; then + sudo "${LOSETUP}" -d "${TEST_LOOPBACK}" || + echo "Failed to remove: ${TEST_LOOPBACK}" + fi + done +} + +# +# Attempt to remove loopback devices and files which where created earlier +# by this script to run the test framework. The '-k' option may be passed +# to the script to suppress cleanup for debugging purposes. +# +cleanup() { + if [ "$CLEANUP" = "no" ]; then + return 0 + fi + + + if [ "$LOOPBACK" = "yes" ]; then + if [ "$UNAME" = "FreeBSD" ] ; then + cleanup_freebsd_loopback + else + cleanup_linux_loopback + fi + fi + + for TEST_FILE in ${FILES}; do + rm -f "${TEST_FILE}" >/dev/null 2>&1 + done + + if [ "$STF_PATH_REMOVE" = "yes" ] && [ -d "$STF_PATH" ]; then + rm -Rf "$STF_PATH" + fi +} +trap cleanup EXIT + +# +# Attempt to remove all testpools (testpool.XXX), unopened dm devices, +# loopback devices, and files. This is a useful way to cleanup a previous +# test run failure which has left the system in an unknown state. This can +# be dangerous and should only be used in a dedicated test environment. 
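+# Run automatically when this script is invoked with the -x option.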
+# +cleanup_all() { + TEST_POOLS=$(sudo "$ZPOOL" list -H -o name | grep testpool) + if [ "$UNAME" = "FreeBSD" ] ; then + TEST_LOOPBACKS=$(sudo "${LOSETUP}" -l) + else + TEST_LOOPBACKS=$(sudo "${LOSETUP}" -a|grep file-vdev|cut -f1 -d:) + fi + TEST_FILES=$(ls /var/tmp/file-vdev* 2>/dev/null) + + msg + msg "--- Cleanup ---" + msg "Removing pool(s): $(echo "${TEST_POOLS}" | tr '\n' ' ')" + for TEST_POOL in $TEST_POOLS; do + sudo "$ZPOOL" destroy "${TEST_POOL}" + done + + if [ "$UNAME" != "FreeBSD" ] ; then + msg "Removing dm(s): $(sudo "${DMSETUP}" ls | + grep loop | tr '\n' ' ')" + sudo "${DMSETUP}" remove_all + fi + + msg "Removing loopback(s): $(echo "${TEST_LOOPBACKS}" | tr '\n' ' ')" + for TEST_LOOPBACK in $TEST_LOOPBACKS; do + if [ "$UNAME" = "FreeBSD" ] ; then + sudo "${LOSETUP}" -d -u "${TEST_LOOPBACK}" + else + sudo "${LOSETUP}" -d "${TEST_LOOPBACK}" + fi + done + + msg "Removing files(s): $(echo "${TEST_FILES}" | tr '\n' ' ')" + for TEST_FILE in $TEST_FILES; do + sudo rm -f "${TEST_FILE}" + done +} + +# +# Takes a name as the only arguments and looks for the following variations +# on that name. If one is found it is returned. +# +# $RUNFILE_DIR/ +# $RUNFILE_DIR/.run +# +# .run +# +find_runfile() { + NAME=$1 + RESULT="" + + if [ -f "$RUNFILE_DIR/$NAME" ]; then + RESULT="$RUNFILE_DIR/$NAME" + elif [ -f "$RUNFILE_DIR/$NAME.run" ]; then + RESULT="$RUNFILE_DIR/$NAME.run" + elif [ -f "$NAME" ]; then + RESULT="$NAME" + elif [ -f "$NAME.run" ]; then + RESULT="$NAME.run" + fi + + echo "$RESULT" +} + +# +# Symlink file if it appears under any of the given paths. +# +create_links() { + dir_list="$1" + file_list="$2" + + [ -n "$STF_PATH" ] || fail "STF_PATH wasn't correctly set" + + for i in $file_list; do + for j in $dir_list; do + [ ! -e "$STF_PATH/$i" ] || continue + + if [ ! -d "$j/$i" ] && [ -e "$j/$i" ]; then + ln -sf "$j/$i" "$STF_PATH/$i" || \ + fail "Couldn't link $i" + break + fi + done + + [ ! -e "$STF_PATH/$i" ] && \ + STF_MISSING_BIN="$STF_MISSING_BIN $i" + done + STF_MISSING_BIN=${STF_MISSING_BIN# } +} + +# +# Constrain the path to limit the available binaries to a known set. +# When running in-tree a top level ./bin/ directory is created for +# convenience, otherwise a temporary directory is used. +# +constrain_path() { + . "$STF_SUITE/include/commands.cfg" + + # On FreeBSD, base system zfs utils are in /sbin and OpenZFS utils + # install to /usr/local/sbin. To avoid testing the wrong utils we + # need /usr/local to come before / in the path search order. + SYSTEM_DIRS="/usr/local/bin /usr/local/sbin" + SYSTEM_DIRS="$SYSTEM_DIRS /usr/bin /usr/sbin /bin /sbin" + + if [ "$INTREE" = "yes" ]; then + # Constrained path set to ./zfs/bin/ + STF_PATH="$BIN_DIR" + STF_PATH_REMOVE="no" + STF_MISSING_BIN="" + if [ ! -d "$STF_PATH" ]; then + mkdir "$STF_PATH" + chmod 755 "$STF_PATH" || fail "Couldn't chmod $STF_PATH" + fi + + # Special case links for standard zfs utilities + DIRS="$(find "$CMD_DIR" -type d \( ! -name .deps -a \ + ! -name .libs \) -print | tr '\n' ' ')" + create_links "$DIRS" "$ZFS_FILES" + + # Special case links for zfs test suite utilities + DIRS="$(find "$STF_SUITE" -type d \( ! -name .deps -a \ + ! 
-name .libs \) -print | tr '\n' ' ')" + create_links "$DIRS" "$ZFSTEST_FILES" + else + # Constrained path set to /var/tmp/constrained_path.* + SYSTEMDIR=${SYSTEMDIR:-/var/tmp/constrained_path.XXXX} + STF_PATH=$(mktemp -d "$SYSTEMDIR") + STF_PATH_REMOVE="yes" + STF_MISSING_BIN="" + + chmod 755 "$STF_PATH" || fail "Couldn't chmod $STF_PATH" + + # Special case links for standard zfs utilities + create_links "$SYSTEM_DIRS" "$ZFS_FILES" + + # Special case links for zfs test suite utilities + create_links "$STF_SUITE/bin" "$ZFSTEST_FILES" + fi + + # Standard system utilities + SYSTEM_FILES="$SYSTEM_FILES_COMMON" + if [ "$UNAME" = "FreeBSD" ] ; then + SYSTEM_FILES="$SYSTEM_FILES $SYSTEM_FILES_FREEBSD" + else + SYSTEM_FILES="$SYSTEM_FILES $SYSTEM_FILES_LINUX" + fi + create_links "$SYSTEM_DIRS" "$SYSTEM_FILES" + + # Exceptions + ln -fs "$STF_PATH/awk" "$STF_PATH/nawk" + if [ "$UNAME" = "Linux" ] ; then + ln -fs /sbin/fsck.ext4 "$STF_PATH/fsck" + ln -fs /sbin/mkfs.ext4 "$STF_PATH/newfs" + ln -fs "$STF_PATH/gzip" "$STF_PATH/compress" + ln -fs "$STF_PATH/gunzip" "$STF_PATH/uncompress" + ln -fs "$STF_PATH/exportfs" "$STF_PATH/share" + ln -fs "$STF_PATH/exportfs" "$STF_PATH/unshare" + elif [ "$UNAME" = "FreeBSD" ] ; then + ln -fs /usr/local/bin/ksh93 "$STF_PATH/ksh" + fi +} + +# +# Output a useful usage message. +# +usage() { +cat << EOF +USAGE: +$0 [hvqxkfS] [-s SIZE] [-r RUNFILES] [-t PATH] [-u USER] + +DESCRIPTION: + ZFS Test Suite launch script + +OPTIONS: + -h Show this message + -v Verbose zfs-tests.sh output + -q Quiet test-runner output + -x Remove all testpools, dm, lo, and files (unsafe) + -k Disable cleanup after test failure + -f Use files only, disables block device tests + -S Enable stack tracer (negative performance impact) + -c Only create and populate constrained path + -n NFSFILE Use the nfsfile to determine the NFS configuration + -I NUM Number of iterations + -d DIR Use DIR for files and loopback devices + -s SIZE Use vdevs of SIZE (default: 4G) + -r RUNFILES Run tests in RUNFILES (default: ${DEFAULT_RUNFILES}) + -t PATH Run single test at PATH relative to test suite + -T TAGS Comma separated list of tags (default: 'functional') + -u USER Run single test as USER (default: root) + +EXAMPLES: +# Run the default (linux) suite of tests and output the configuration used. +$0 -v + +# Run a smaller suite of tests designed to run more quickly. +$0 -r linux-fast + +# Run a single test +$0 -t tests/functional/cli_root/zfs_bookmark/zfs_bookmark_cliargs.ksh + +# Cleanup a previous run of the test suite prior to testing, run the +# default (linux) suite of tests and perform no cleanup on exit. +$0 -x + +EOF +} + +while getopts 'hvqxkfScn:d:s:r:?t:T:u:I:' OPTION; do + case $OPTION in + h) + usage + exit 1 + ;; + v) + # shellcheck disable=SC2034 + VERBOSE="yes" + ;; + q) + QUIET="yes" + ;; + x) + CLEANUPALL="yes" + ;; + k) + CLEANUP="no" + ;; + f) + LOOPBACK="no" + ;; + S) + STACK_TRACER="yes" + ;; + c) + constrain_path + exit + ;; + n) + nfsfile=$OPTARG + [ -f "$nfsfile" ] || fail "Cannot read file: $nfsfile" + export NFS=1 + . "$nfsfile" + ;; + d) + FILEDIR="$OPTARG" + ;; + I) + ITERATIONS="$OPTARG" + if [ "$ITERATIONS" -le 0 ]; then + fail "Iterations must be greater than 0." + fi + ;; + s) + FILESIZE="$OPTARG" + ;; + r) + RUNFILES="$OPTARG" + ;; + t) + if [ -n "$SINGLETEST" ]; then + fail "-t can only be provided once." + fi + SINGLETEST="$OPTARG" + ;; + T) + TAGS="$OPTARG" + ;; + u) + SINGLETESTUSER="$OPTARG" + ;; + ?) 
+ usage + exit + ;; + esac +done + +shift $((OPTIND-1)) + +FILES=${FILES:-"$FILEDIR/file-vdev0 $FILEDIR/file-vdev1 $FILEDIR/file-vdev2"} +LOOPBACKS=${LOOPBACKS:-""} + +if [ -n "$SINGLETEST" ]; then + if [ -n "$TAGS" ]; then + fail "-t and -T are mutually exclusive." + fi + RUNFILE_DIR="/var/tmp" + RUNFILES="zfs-tests.$$.run" + SINGLEQUIET="False" + + if [ -n "$QUIET" ]; then + SINGLEQUIET="True" + fi + + cat >$RUNFILE_DIR/$RUNFILES << EOF +[DEFAULT] +pre = +quiet = $SINGLEQUIET +pre_user = root +user = $SINGLETESTUSER +timeout = 600 +post_user = root +post = +outputdir = /var/tmp/test_results +EOF + SINGLETESTDIR=$(dirname "$SINGLETEST") + SINGLETESTFILE=$(basename "$SINGLETEST") + SETUPSCRIPT= + CLEANUPSCRIPT= + + if [ -f "$STF_SUITE/$SINGLETESTDIR/setup.ksh" ]; then + SETUPSCRIPT="setup" + fi + + if [ -f "$STF_SUITE/$SINGLETESTDIR/cleanup.ksh" ]; then + CLEANUPSCRIPT="cleanup" + fi + + cat >>$RUNFILE_DIR/$RUNFILES << EOF + +[$SINGLETESTDIR] +tests = ['$SINGLETESTFILE'] +pre = $SETUPSCRIPT +post = $CLEANUPSCRIPT +tags = ['functional'] +EOF +fi + +# +# Use default tag if none was specified +# +TAGS=${TAGS:='functional'} + +# +# Attempt to locate the runfiles describing the test workload. +# +R="" +IFS=, +for RUNFILE in $RUNFILES; do + if [ -n "$RUNFILE" ]; then + SAVED_RUNFILE="$RUNFILE" + RUNFILE=$(find_runfile "$RUNFILE") + [ -z "$RUNFILE" ] && fail "Cannot find runfile: $SAVED_RUNFILE" + R="$R,$RUNFILE" + fi + + if [ ! -r "$RUNFILE" ]; then + fail "Cannot read runfile: $RUNFILE" + fi +done +unset IFS +RUNFILES=${R#,} + +# +# This script should not be run as root. Instead the test user, which may +# be a normal user account, needs to be configured such that it can +# run commands via sudo passwordlessly. +# +if [ "$(id -u)" = "0" ]; then + fail "This script must not be run as root." +fi + +if [ "$(sudo whoami)" != "root" ]; then + fail "Passwordless sudo access required." +fi + +# +# Constrain the available binaries to a known set. +# +constrain_path + +# +# Check if ksh exists +# +if [ "$UNAME" = "FreeBSD" ]; then + sudo ln -fs /usr/local/bin/ksh93 /bin/ksh +fi +[ -e "$STF_PATH/ksh" ] || fail "This test suite requires ksh." +[ -e "$STF_SUITE/include/default.cfg" ] || fail \ + "Missing $STF_SUITE/include/default.cfg file." + +# +# Verify the ZFS module stack is loaded. +# +if [ "$STACK_TRACER" = "yes" ]; then + sudo "${ZFS_SH}" -S >/dev/null 2>&1 +else + sudo "${ZFS_SH}" >/dev/null 2>&1 +fi + +# +# Attempt to cleanup all previous state for a new test run. +# +if [ "$CLEANUPALL" = "yes" ]; then + cleanup_all +fi + +# +# By default preserve any existing pools +# NOTE: Since 'zpool list' outputs a newline-delimited list convert $KEEP from +# space-delimited to newline-delimited. +# +if [ -z "${KEEP}" ]; then + KEEP="$(sudo "$ZPOOL" list -H -o name)" + if [ -z "${KEEP}" ]; then + KEEP="rpool" + fi +else + KEEP="$(echo "$KEEP" | tr '[:blank:]' '\n')" +fi + +# +# NOTE: The following environment variables are undocumented +# and should be used for testing purposes only: +# +# __ZFS_POOL_EXCLUDE - don't iterate over the pools it lists +# __ZFS_POOL_RESTRICT - iterate only over the pools it lists +# +# See libzfs/libzfs_config.c for more information. +# +if [ "$UNAME" = "FreeBSD" ] ; then + __ZFS_POOL_EXCLUDE="$(echo "$KEEP" | tr -s '\n' ' ')" +else + __ZFS_POOL_EXCLUDE="$(echo "$KEEP" | sed ':a;N;s/\n/ /g;ba')" +fi + +. 
"$STF_SUITE/include/default.cfg" + +msg +msg "--- Configuration ---" +msg "Runfiles: $RUNFILES" +msg "STF_TOOLS: $STF_TOOLS" +msg "STF_SUITE: $STF_SUITE" +msg "STF_PATH: $STF_PATH" + +# +# No DISKS have been provided so a basic file or loopback based devices +# must be created for the test suite to use. +# +if [ -z "${DISKS}" ]; then + # + # Create sparse files for the test suite. These may be used + # directory or have loopback devices layered on them. + # + for TEST_FILE in ${FILES}; do + [ -f "$TEST_FILE" ] && fail "Failed file exists: ${TEST_FILE}" + truncate -s "${FILESIZE}" "${TEST_FILE}" || + fail "Failed creating: ${TEST_FILE} ($?)" + done + + # + # If requested setup loopback devices backed by the sparse files. + # + if [ "$LOOPBACK" = "yes" ]; then + test -x "$LOSETUP" || fail "$LOSETUP utility must be installed" + + for TEST_FILE in ${FILES}; do + if [ "$UNAME" = "FreeBSD" ] ; then + MDDEVICE=$(sudo "${LOSETUP}" -a -t vnode -f "${TEST_FILE}") + if [ -z "$MDDEVICE" ] ; then + fail "Failed: ${TEST_FILE} -> loopback" + fi + DISKS="$DISKS $MDDEVICE" + LOOPBACKS="$LOOPBACKS $MDDEVICE" + else + TEST_LOOPBACK=$(sudo "${LOSETUP}" -f) + sudo "${LOSETUP}" "${TEST_LOOPBACK}" "${TEST_FILE}" || + fail "Failed: ${TEST_FILE} -> ${TEST_LOOPBACK}" + BASELOOPBACK=$(basename "$TEST_LOOPBACK") + DISKS="$DISKS $BASELOOPBACK" + LOOPBACKS="$LOOPBACKS $TEST_LOOPBACK" + fi + done + DISKS=${DISKS# } + LOOPBACKS=${LOOPBACKS# } + else + DISKS="$FILES" + fi +fi + +NUM_DISKS=$(echo "${DISKS}" | awk '{print NF}') +[ "$NUM_DISKS" -lt 3 ] && fail "Not enough disks ($NUM_DISKS/3 minimum)" + +# +# Disable SELinux until the ZFS Test Suite has been updated accordingly. +# +if [ -x "$STF_PATH/setenforce" ]; then + sudo setenforce permissive >/dev/null 2>&1 +fi + +# +# Enable internal ZFS debug log and clear it. +# +if [ -e /sys/module/zfs/parameters/zfs_dbgmsg_enable ]; then + sudo /bin/sh -c "echo 1 >/sys/module/zfs/parameters/zfs_dbgmsg_enable" + sudo /bin/sh -c "echo 0 >/proc/spl/kstat/zfs/dbgmsg" +fi + +msg "FILEDIR: $FILEDIR" +msg "FILES: $FILES" +msg "LOOPBACKS: $LOOPBACKS" +msg "DISKS: $DISKS" +msg "NUM_DISKS: $NUM_DISKS" +msg "FILESIZE: $FILESIZE" +msg "ITERATIONS: $ITERATIONS" +msg "TAGS: $TAGS" +msg "STACK_TRACER: $STACK_TRACER" +msg "Keep pool(s): $KEEP" +msg "Missing util(s): $STF_MISSING_BIN" +msg "" + +export STF_TOOLS +export STF_SUITE +export STF_PATH +export DISKS +export FILEDIR +export KEEP +export __ZFS_POOL_EXCLUDE +export TESTFAIL_CALLBACKS +export PATH=$STF_PATH + +if [ "$UNAME" = "FreeBSD" ] ; then + mkdir -p "$FILEDIR" || true + RESULTS_FILE=$(mktemp -u "${FILEDIR}/zts-results.XXXX") + REPORT_FILE=$(mktemp -u "${FILEDIR}/zts-report.XXXX") +else + RESULTS_FILE=$(mktemp -u -t zts-results.XXXX -p "$FILEDIR") + REPORT_FILE=$(mktemp -u -t zts-report.XXXX -p "$FILEDIR") +fi + +# +# Run all the tests as specified. +# +msg "${TEST_RUNNER} ${QUIET:+-q}" \ + "-c \"${RUNFILES}\"" \ + "-T \"${TAGS}\"" \ + "-i \"${STF_SUITE}\"" \ + "-I \"${ITERATIONS}\"" +${TEST_RUNNER} ${QUIET:+-q} \ + -c "${RUNFILES}" \ + -T "${TAGS}" \ + -i "${STF_SUITE}" \ + -I "${ITERATIONS}" \ + 2>&1 | tee "$RESULTS_FILE" + +# +# Analyze the results. +# +${ZTS_REPORT} "$RESULTS_FILE" >"$REPORT_FILE" +RESULT=$? 
+cat "$REPORT_FILE" + +RESULTS_DIR=$(awk '/^Log directory/ { print $3 }' "$RESULTS_FILE") +if [ -d "$RESULTS_DIR" ]; then + cat "$RESULTS_FILE" "$REPORT_FILE" >"$RESULTS_DIR/results" +fi + +rm -f "$RESULTS_FILE" "$REPORT_FILE" + +if [ -n "$SINGLETEST" ]; then + rm -f "$RUNFILES" >/dev/null 2>&1 +fi + +exit ${RESULT} diff --git a/scripts/zfs.sh b/scripts/zfs.sh new file mode 100755 index 000000000000..e676fc295fa8 --- /dev/null +++ b/scripts/zfs.sh @@ -0,0 +1,279 @@ +#!/bin/sh +# +# A simple script to load/unload the ZFS module stack. +# + +BASE_DIR=$(dirname "$0") +SCRIPT_COMMON=common.sh +if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then + . "${BASE_DIR}/${SCRIPT_COMMON}" +else + echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 +fi + +PROG=zfs.sh +VERBOSE="no" +UNLOAD="no" +STACK_TRACER="no" + +ZED_PIDFILE=${ZED_PIDFILE:-/var/run/zed.pid} +LDMOD=${LDMOD:-/sbin/modprobe} + +KMOD_ZLIB_DEFLATE=${KMOD_ZLIB_DEFLATE:-zlib_deflate} +KMOD_ZLIB_INFLATE=${KMOD_ZLIB_INFLATE:-zlib_inflate} +KMOD_SPL=${KMOD_SPL:-spl} +KMOD_ZAVL=${KMOD_ZAVL:-zavl} +KMOD_ZNVPAIR=${KMOD_ZNVPAIR:-znvpair} +KMOD_ZUNICODE=${KMOD_ZUNICODE:-zunicode} +KMOD_ZCOMMON=${KMOD_ZCOMMON:-zcommon} +KMOD_ZLUA=${KMOD_ZLUA:-zlua} +KMOD_ICP=${KMOD_ICP:-icp} +KMOD_ZFS=${KMOD_ZFS:-zfs} +KMOD_FREEBSD=${KMOD_FREEBSD:-openzfs} +KMOD_ZZSTD=${KMOD_ZZSTD:-zzstd} + + +usage() { +cat << EOF +USAGE: +$0 [hvudS] [module-options] + +DESCRIPTION: + Load/unload the ZFS module stack. + +OPTIONS: + -h Show this message + -v Verbose + -u Unload modules + -S Enable kernel stack tracer +EOF +} + +while getopts 'hvuS' OPTION; do + case $OPTION in + h) + usage + exit 1 + ;; + v) + VERBOSE="yes" + ;; + u) + UNLOAD="yes" + ;; + S) + STACK_TRACER="yes" + ;; + ?) + usage + exit + ;; + esac +done + +kill_zed() { + if [ -f "$ZED_PIDFILE" ]; then + PID=$(cat "$ZED_PIDFILE") + kill "$PID" + fi +} + +check_modules_linux() { + LOADED_MODULES="" + MISSING_MODULES="" + + for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR $KMOD_ZUNICODE $KMOD_ZCOMMON \ + $KMOD_ZLUA $KMOD_ZZSTD $KMOD_ICP $KMOD_ZFS; do + NAME=$(basename "$KMOD" .ko) + + if lsmod | grep -E -q "^${NAME}"; then + LOADED_MODULES="$LOADED_MODULES\t$NAME\n" + fi + + if ! modinfo "$KMOD" >/dev/null 2>&1; then + MISSING_MODULES="$MISSING_MODULES\t${KMOD}\n" + fi + done + + if [ -n "$LOADED_MODULES" ]; then + printf "Unload the kernel modules by running '%s -u':\n" "$PROG" + printf "%b" "$LOADED_MODULES" + exit 1 + fi + + if [ -n "$MISSING_MODULES" ]; then + printf "The following kernel modules can not be found:\n" + printf "%b" "$MISSING_MODULES" + exit 1 + fi + + return 0 +} + +load_module_linux() { + KMOD=$1 + + FILE=$(modinfo "$KMOD" | awk '/^filename:/ {print $2}') + VERSION=$(modinfo "$KMOD" | awk '/^version:/ {print $2}') + + if [ "$VERBOSE" = "yes" ]; then + echo "Loading: $FILE ($VERSION)" + fi + + $LDMOD "$KMOD" >/dev/null 2>&1 + # shellcheck disable=SC2181 + if [ $? 
-ne 0 ]; then + echo "Failed to load $KMOD" + return 1 + fi + + return 0 +} + +load_modules_freebsd() { + kldload "$KMOD_FREEBSD" || return 1 + + if [ "$VERBOSE" = "yes" ]; then + echo "Successfully loaded ZFS module stack" + fi + + return 0 +} + +load_modules_linux() { + mkdir -p /etc/zfs + + if modinfo "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1; then + modprobe "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1 + fi + + if modinfo "$KMOD_ZLIB_INFLATE">/dev/null 2>&1; then + modprobe "$KMOD_ZLIB_INFLATE" >/dev/null 2>&1 + fi + + for KMOD in $KMOD_SPL $KMOD_ZAVL $KMOD_ZNVPAIR \ + $KMOD_ZUNICODE $KMOD_ZCOMMON $KMOD_ZLUA $KMOD_ZZSTD \ + $KMOD_ICP $KMOD_ZFS; do + load_module_linux "$KMOD" || return 1 + done + + if [ "$VERBOSE" = "yes" ]; then + echo "Successfully loaded ZFS module stack" + fi + + return 0 +} + +unload_module_linux() { + KMOD=$1 + + NAME=$(basename "$KMOD" .ko) + FILE=$(modinfo "$KMOD" | awk '/^filename:/ {print $2}') + VERSION=$(modinfo "$KMOD" | awk '/^version:/ {print $2}') + + if [ "$VERBOSE" = "yes" ]; then + echo "Unloading: $KMOD ($VERSION)" + fi + + rmmod "$NAME" || echo "Failed to unload $NAME" + + return 0 +} + +unload_modules_freebsd() { + kldunload "$KMOD_FREEBSD" || echo "Failed to unload $KMOD_FREEBSD" + + if [ "$VERBOSE" = "yes" ]; then + echo "Successfully unloaded ZFS module stack" + fi + + return 0 +} + +unload_modules_linux() { + for KMOD in $KMOD_ZFS $KMOD_ICP $KMOD_ZZSTD $KMOD_ZLUA $KMOD_ZCOMMON \ + $KMOD_ZUNICODE $KMOD_ZNVPAIR $KMOD_ZAVL $KMOD_SPL; do + NAME=$(basename "$KMOD" .ko) + USE_COUNT=$(lsmod | grep -E "^${NAME} " | awk '{print $3}') + + if [ "$USE_COUNT" = "0" ] ; then + unload_module_linux "$KMOD" || return 1 + fi + done + + if modinfo "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1; then + modprobe -r "$KMOD_ZLIB_DEFLATE" >/dev/null 2>&1 + fi + + if modinfo "$KMOD_ZLIB_INFLATE">/dev/null 2>&1; then + modprobe -r "$KMOD_ZLIB_INFLATE" >/dev/null 2>&1 + fi + + if [ "$VERBOSE" = "yes" ]; then + echo "Successfully unloaded ZFS module stack" + fi + + return 0 +} + +stack_clear_linux() { + STACK_MAX_SIZE=/sys/kernel/debug/tracing/stack_max_size + STACK_TRACER_ENABLED=/proc/sys/kernel/stack_tracer_enabled + + if [ "$STACK_TRACER" = "yes" ] && [ -e "$STACK_MAX_SIZE" ]; then + echo 1 >"$STACK_TRACER_ENABLED" + echo 0 >"$STACK_MAX_SIZE" + fi +} + +stack_check_linux() { + STACK_MAX_SIZE=/sys/kernel/debug/tracing/stack_max_size + STACK_TRACE=/sys/kernel/debug/tracing/stack_trace + STACK_LIMIT=15362 + + if [ -e "$STACK_MAX_SIZE" ]; then + STACK_SIZE=$(cat "$STACK_MAX_SIZE") + + if [ "$STACK_SIZE" -ge "$STACK_LIMIT" ]; then + echo + echo "Warning: max stack size $STACK_SIZE bytes" + cat "$STACK_TRACE" + fi + fi +} + +if [ "$(id -u)" != 0 ]; then + echo "Must run as root" + exit 1 +fi + +UNAME=$(uname -s) + +if [ "$UNLOAD" = "yes" ]; then + kill_zed + umount -t zfs -a + case $UNAME in + FreeBSD) + unload_modules_freebsd + ;; + Linux) + stack_check_linux + unload_modules_linux + ;; + esac +else + case $UNAME in + FreeBSD) + load_modules_freebsd + ;; + Linux) + stack_clear_linux + check_modules_linux + load_modules_linux "$@" + udevadm trigger + udevadm settle + ;; + esac +fi + +exit 0 diff --git a/scripts/zfs2zol-patch.sed b/scripts/zfs2zol-patch.sed new file mode 100755 index 000000000000..99824d6dd4af --- /dev/null +++ b/scripts/zfs2zol-patch.sed @@ -0,0 +1,32 @@ +#!/bin/sed -f + +s:usr/src/uts/common/fs/zfs/sys:include/sys:g +s:usr/src/uts/common/fs/zfs:module/zfs:g +s:usr/src/lib/libzpool:lib/libzpool:g +s:usr/src/cmd:cmd:g +s:usr/src/common/nvpair:module/nvpair:g 
+s:usr/src/lib/libzfs/common/libzfs.h:include/libzfs.h:g +s:usr/src/man/man1m/zfs.1m:man/man8/zfs.8:g +s:usr/src/uts/common/sys:include/sys:g +s:usr/src/lib/libzfs_core/common/libzfs_core.h:include/libzfs_core.h:g +s:usr/src/lib/libzfs/common:lib/libzfs:g +s:usr/src/lib/libzfs_core/common:lib/libzfs_core:g +s:lib/libzpool/common/sys:include/sys:g +s:lib/libzpool/common:lib/libzpool:g + +s:usr/src/test/zfs-tests/include:tests/zfs-tests/include:g +s:usr/src/test/zfs-tests/runfiles:tests/runfiles:g +s:usr/src/test/zfs-tests/tests/functional:tests/zfs-tests/tests/functional:g +s:usr/src/test/zfs-tests/tests/perf:tests/zfs-tests/tests/perf:g +s:usr/src/test/test-runner/cmd/run.py:tests/test-runner/cmd/test-runner.py:g +s/usr\/src\/common\/zfs\/\(.*\)\.c/module\/zcommon\/\1.c/g + +# crypto framework +s:usr/src/common/crypto:module/icp/algs:g +s:usr/src/uts/common/crypto/io:module/icp/io:g + +# Headers +s:usr/src/common/zfs/\(.*\)\.h:include/\1.h:g + +# Man pages +s:usr/src/man:man:g diff --git a/scripts/zimport.sh b/scripts/zimport.sh new file mode 100755 index 000000000000..304ab7623d42 --- /dev/null +++ b/scripts/zimport.sh @@ -0,0 +1,517 @@ +#!/usr/bin/env bash +# +# Verify that an assortment of known good reference pools can be imported +# using different versions of the ZoL code. +# +# By default, reference pools for the major ZFS implementations will be +# checked against the most recent ZoL tags and the master development branch. +# Alternate tags or branches may be verified with the '-s <src-tag>' option. +# Passing the keyword "installed" will instruct the script to test whatever +# version is installed. +# +# Preferentially a reference pool is used for all tests. However, if one +# does not exist and the pool-tag matches one of the src-tags then a new +# reference pool will be created using binaries from that source build. +# This is particularly useful when you need to test your changes before +# opening a pull request. The keyword 'all' can be used as shorthand to +# refer to all available reference pools. +# +# New reference pools may be added by placing a bzip2 compressed tarball +# of the pool in the scripts/zfs-images directory and then passing +# the -p <pool-tag> option. To increase the test coverage, reference pools +# should be collected for all the major ZFS implementations. Having these +# pools easily available is also helpful to the developers. +# +# Care should be taken to run these tests with a kernel supported by all +# the listed tags. Otherwise build failures will cause false positives. +# +# +# EXAMPLES: +# +# The following example will verify that the zfs-0.6.2 tag, the master +# branch, and the installed zfs version can correctly import the listed +# pools. Note there is no reference pool available for master and installed, +# but because binaries are available one is automatically constructed. The +# working directory is also preserved between runs (-k), preventing the +# need to rebuild from source for multiple runs.
+# +# zimport.sh -k -f /var/tmp/zimport \ +# -s "zfs-0.6.2 master installed" \ +# -p "zevo-1.1.1 zol-0.6.2 zol-0.6.2-173 master installed" +# +# --------------------- ZFS on Linux Source Versions -------------- +# zfs-0.6.2 master 0.6.2-175_g36eb554 +# ----------------------------------------------------------------- +# Clone ZFS Local Local Skip +# Build ZFS Pass Pass Skip +# ----------------------------------------------------------------- +# zevo-1.1.1 Pass Pass Pass +# zol-0.6.2 Pass Pass Pass +# zol-0.6.2-173 Fail Pass Pass +# master Pass Pass Pass +# installed Pass Pass Pass +# + +BASE_DIR=$(dirname "$0") +SCRIPT_COMMON=common.sh +if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then + . "${BASE_DIR}/${SCRIPT_COMMON}" +else + echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 +fi + +PROG=zimport.sh +SRC_TAGS="zfs-0.6.5.11 master" +POOL_TAGS="all master" +POOL_CREATE_OPTIONS= +TEST_DIR=$(mktemp -u -d -p /var/tmp zimport.XXXXXXXX) +KEEP="no" +VERBOSE="no" +COLOR="yes" +REPO="https://github.com/zfsonlinux" +IMAGES_DIR="$SCRIPTDIR/zfs-images/" +IMAGES_TAR="https://github.com/zfsonlinux/zfs-images/tarball/master" +ERROR=0 + +CONFIG_LOG="configure.log" +CONFIG_OPTIONS=${CONFIG_OPTIONS:-""} +MAKE_LOG="make.log" +MAKE_OPTIONS=${MAKE_OPTIONS:-"-s -j$(nproc)"} + +COLOR_GREEN="\033[0;32m" +COLOR_RED="\033[0;31m" +COLOR_BROWN="\033[0;33m" +COLOR_RESET="\033[0m" + +usage() { +cat << EOF +USAGE: +zimport.sh [hvl] [-r <repo>] [-s <src-tag>] [-i <pool-dir>] [-p <pool-tag>] + [-f <path>] [-o <options>] + +DESCRIPTION: + ZPOOL import verification tests + +OPTIONS: + -h Show this message + -v Verbose + -c No color + -k Keep temporary directory + -r <repo> Source repository ($REPO) + -s <src-tag>... Verify ZoL versions with the listed tags + -i <pool-dir> Pool image directory + -p <pool-tag>... Verify pools created with the listed tags + -f <path> Temporary directory to use + -o <options> Additional options to pass to 'zpool create' + +EOF +} + +while getopts 'hvckr:s:i:p:f:o:?' OPTION; do + case $OPTION in + h) + usage + exit 1 + ;; + v) + VERBOSE="yes" + ;; + c) + COLOR="no" + ;; + k) + KEEP="yes" + ;; + r) + REPO="$OPTARG" + ;; + s) + SRC_TAGS="$OPTARG" + ;; + i) + IMAGES_DIR="$OPTARG" + ;; + p) + POOL_TAGS="$OPTARG" + ;; + f) + TEST_DIR="$OPTARG" + ;; + o) + POOL_CREATE_OPTIONS="$OPTARG" + ;; + ?) + usage + exit 1 + ;; + esac +done + +# +# Verify the module stack is not loaded +# +if lsmod | grep zfs >/dev/null; then + echo "ZFS modules must be unloaded" + exit 1 +fi + +# +# Create a random directory tree of files and sub-directories to +# act as a copy source for the various regression tests.
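+# Every directory in the tree (including the root) receives a random +# number of files (up to MAX_DIR_SIZE), and each file is filled with up +# to MAX_FILE_SIZE KiB of data from /dev/urandom.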
+# +populate() { + local ROOT=$1 + local MAX_DIR_SIZE=$2 + local MAX_FILE_SIZE=$3 + + # shellcheck disable=SC2086 + mkdir -p $ROOT/{a,b,c,d,e,f,g}/{h,i} + DIRS=$(find "$ROOT") + + for DIR in $DIRS; do + COUNT=$((RANDOM % MAX_DIR_SIZE)) + + # shellcheck disable=SC2034 + for i in $(seq $COUNT); do + FILE=$(mktemp -p "$DIR") + SIZE=$((RANDOM % MAX_FILE_SIZE)) + dd if=/dev/urandom of="$FILE" bs=1k \ + count="$SIZE" &>/dev/null + done + done + + return 0 +} + +SRC_DIR=$(mktemp -d -p /var/tmp/ zfs.src.XXXXXXXX) +trap 'rm -Rf "$SRC_DIR"' INT TERM EXIT +populate "$SRC_DIR" 10 100 + +SRC_DIR="$TEST_DIR/src" +SRC_DIR_ZFS="$SRC_DIR/zfs" + +if [ "$COLOR" = "no" ]; then + COLOR_GREEN="" + COLOR_BROWN="" + COLOR_RED="" + COLOR_RESET="" +fi + +pass_nonewline() { + echo -n -e "${COLOR_GREEN}Pass${COLOR_RESET}\t\t" +} + +skip_nonewline() { + echo -n -e "${COLOR_BROWN}Skip${COLOR_RESET}\t\t" +} + +fail_nonewline() { + echo -n -e "${COLOR_RED}Fail${COLOR_RESET}\t\t" +} + +# +# Log a failure message, cleanup, and return an error. +# +fail() { + echo -e "$PROG: $1" >&2 + $ZFS_SH -u >/dev/null 2>&1 + exit 1 +} + +# +# Set several helper variables which are derived from a source tag. +# +# ZFS_TAG - The passed zfs-x.y.z tag +# ZFS_DIR - The zfs directory name +# ZFS_URL - The zfs github URL to fetch the tarball +# +src_set_vars() { + local TAG=$1 + + ZFS_TAG="$TAG" + ZFS_DIR="$SRC_DIR_ZFS/$ZFS_TAG" + ZFS_URL="$REPO/zfs/tarball/$ZFS_TAG" + + if [ "$TAG" = "installed" ]; then + ZPOOL_CMD=$(command -v zpool) + ZFS_CMD=$(command -v zfs) + ZFS_SH="/usr/share/zfs/zfs.sh" + else + ZPOOL_CMD="./cmd/zpool/zpool" + ZFS_CMD="./cmd/zfs/zfs" + ZFS_SH="./scripts/zfs.sh" + fi +} + +# +# Set several helper variables which are derived from a pool name such +# as zol-0.6.x, zevo-1.1.1, etc. These refer to example pools from various +# ZFS implementations which are used to verify compatibility. +# +# POOL_TAG - The example pool's name in scripts/zfs-images/. +# POOL_BZIP - The full path to the example bzip2 compressed pool. +# POOL_DIR - The top level test path for this pool. +# POOL_DIR_PRISTINE - The directory containing a pristine version of the pool. +# POOL_DIR_COPY - The directory containing a working copy of the pool. +# POOL_DIR_SRC - Location of a source build if it exists for this pool. +# +pool_set_vars() { + local TAG=$1 + + POOL_TAG=$TAG + POOL_BZIP=$IMAGES_DIR/$POOL_TAG.tar.bz2 + POOL_DIR=$TEST_DIR/pools/$POOL_TAG + POOL_DIR_PRISTINE=$POOL_DIR/pristine + POOL_DIR_COPY=$POOL_DIR/copy + POOL_DIR_SRC="$SRC_DIR_ZFS/${POOL_TAG//zol/zfs}" +} + +# +# Construct a non-trivial pool given a specific version of the source. More +# interesting pools provide better test coverage so this function should be +# extended as needed to create more realistic pools. +# +pool_create() { + pool_set_vars "$1" + src_set_vars "$1" + + if [ "$POOL_TAG" != "installed" ]; then + cd "$POOL_DIR_SRC" || fail "Failed 'cd $POOL_DIR_SRC'" + fi + + $ZFS_SH zfs="spa_config_path=$POOL_DIR_PRISTINE" || \ + fail "Failed to load kmods" + + # Create a file vdev RAIDZ pool.
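+ # Four 1G sparse backing files stand in for real disks; truncate(1) + # allocates no space until the files are actually written.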
+ truncate -s 1G \ + "$POOL_DIR_PRISTINE/vdev1" "$POOL_DIR_PRISTINE/vdev2" \ + "$POOL_DIR_PRISTINE/vdev3" "$POOL_DIR_PRISTINE/vdev4" || \ + fail "Failed 'truncate -s 1G ...'" + # shellcheck disable=SC2086 + $ZPOOL_CMD create $POOL_CREATE_OPTIONS "$POOL_TAG" raidz \ + "$POOL_DIR_PRISTINE/vdev1" "$POOL_DIR_PRISTINE/vdev2" \ + "$POOL_DIR_PRISTINE/vdev3" "$POOL_DIR_PRISTINE/vdev4" || \ + fail "Failed '$ZPOOL_CMD create $POOL_CREATE_OPTIONS $POOL_TAG ...'" + + # Create a pool/fs filesystem with some random contents. + $ZFS_CMD create "$POOL_TAG/fs" || \ + fail "Failed '$ZFS_CMD create $POOL_TAG/fs'" + populate "/$POOL_TAG/fs/" 10 100 + + # Snapshot that filesystem, clone it, remove the files/dirs, + # replace them with new files/dirs. + $ZFS_CMD snap "$POOL_TAG/fs@snap" || \ + fail "Failed '$ZFS_CMD snap $POOL_TAG/fs@snap'" + $ZFS_CMD clone "$POOL_TAG/fs@snap" "$POOL_TAG/clone" || \ + fail "Failed '$ZFS_CMD clone $POOL_TAG/fs@snap $POOL_TAG/clone'" + # shellcheck disable=SC2086 + rm -Rf /$POOL_TAG/clone/* + populate "/$POOL_TAG/clone/" 10 100 + + # Scrub the pool, delay slightly, then export it. It is now + # somewhat interesting for testing purposes. + $ZPOOL_CMD scrub "$POOL_TAG" || \ + fail "Failed '$ZPOOL_CMD scrub $POOL_TAG'" + sleep 10 + $ZPOOL_CMD export "$POOL_TAG" || \ + fail "Failed '$ZPOOL_CMD export $POOL_TAG'" + + $ZFS_SH -u || fail "Failed to unload kmods" +} + +# If the zfs-images directory doesn't exist, fetch a copy from GitHub, then +# cache it in the $TEST_DIR and update $IMAGES_DIR. +if [ ! -d "$IMAGES_DIR" ]; then + IMAGES_DIR="$TEST_DIR/zfs-images" + mkdir -p "$IMAGES_DIR" + curl -sL "$IMAGES_TAR" | \ + tar -xz -C "$IMAGES_DIR" --strip-components=1 || \ + fail "Failed to download pool images" +fi + +# Given the available images in the zfs-images directory, substitute the +# list of available images for the reserved keyword 'all'. +for TAG in $POOL_TAGS; do + + if [ "$TAG" = "all" ]; then + # shellcheck disable=SC2010 + ALL_TAGS=$(ls "$IMAGES_DIR" | grep "tar.bz2" | \ + sed 's/.tar.bz2//' | tr '\n' ' ') + NEW_TAGS="$NEW_TAGS $ALL_TAGS" + else + NEW_TAGS="$NEW_TAGS $TAG" + fi +done +POOL_TAGS="$NEW_TAGS" + +if [ "$VERBOSE" = "yes" ]; then + echo "---------------------------- Options ----------------------------" + echo "VERBOSE=$VERBOSE" + echo "KEEP=$KEEP" + echo "REPO=$REPO" + echo "SRC_TAGS=$SRC_TAGS" + echo "POOL_TAGS=$POOL_TAGS" + echo "PATH=$TEST_DIR" + echo "POOL_CREATE_OPTIONS=$POOL_CREATE_OPTIONS" + echo +fi + +if [ ! -d "$TEST_DIR" ]; then + mkdir -p "$TEST_DIR" +fi + +if [ ! -d "$SRC_DIR" ]; then + mkdir -p "$SRC_DIR" +fi + +# Print a header for all tags which are being tested. +echo "--------------------- ZFS on Linux Source Versions --------------" +printf "%-16s" " " +for TAG in $SRC_TAGS; do + src_set_vars "$TAG" + + if [ "$TAG" = "installed" ]; then + ZFS_VERSION=$(modinfo zfs | awk '/version:/ { print $2; exit }') + if [ -n "$ZFS_VERSION" ]; then + printf "%-16s" "$ZFS_VERSION" + else + fail "ZFS is not installed" + fi + else + printf "%-16s" "$TAG" + fi +done +echo -e "\n-----------------------------------------------------------------" + +# +# Attempt to generate the tarball from your local git repository; if that +# fails, then attempt to download the tarball from GitHub. +# +printf "%-16s" "Clone ZFS" +for TAG in $SRC_TAGS; do + src_set_vars "$TAG" + + if [ -d "$ZFS_DIR" ]; then + skip_nonewline + elif [ "$ZFS_TAG" = "installed" ]; then + skip_nonewline + else + cd "$SRC_DIR" || fail "Failed 'cd $SRC_DIR'" + + if [ !
-d "$SRC_DIR_ZFS" ]; then + mkdir -p "$SRC_DIR_ZFS" + fi + + git archive --format=tar --prefix="$ZFS_TAG/ $ZFS_TAG" \ + -o "$SRC_DIR_ZFS/$ZFS_TAG.tar" &>/dev/null || \ + rm "$SRC_DIR_ZFS/$ZFS_TAG.tar" + if [ -s "$SRC_DIR_ZFS/$ZFS_TAG.tar" ]; then + tar -xf "$SRC_DIR_ZFS/$ZFS_TAG.tar" -C "$SRC_DIR_ZFS" + rm "$SRC_DIR_ZFS/$ZFS_TAG.tar" + echo -n -e "${COLOR_GREEN}Local${COLOR_RESET}\t\t" + else + mkdir -p "$ZFS_DIR" || fail "Failed to create $ZFS_DIR" + curl -sL "$ZFS_URL" | tar -xz -C "$ZFS_DIR" \ + --strip-components=1 || \ + fail "Failed to download $ZFS_URL" + echo -n -e "${COLOR_GREEN}Remote${COLOR_RESET}\t\t" + fi + fi +done +printf "\n" + +# Build the listed tags +printf "%-16s" "Build ZFS" +for TAG in $SRC_TAGS; do + src_set_vars "$TAG" + + if [ -f "$ZFS_DIR/module/zfs/zfs.ko" ]; then + skip_nonewline + elif [ "$ZFS_TAG" = "installed" ]; then + skip_nonewline + else + cd "$ZFS_DIR" || fail "Failed 'cd $ZFS_DIR'" + make distclean &>/dev/null + ./autogen.sh >>"$CONFIG_LOG" 2>&1 || \ + fail "Failed ZFS 'autogen.sh'" + # shellcheck disable=SC2086 + ./configure $CONFIG_OPTIONS >>"$CONFIG_LOG" 2>&1 || \ + fail "Failed ZFS 'configure $CONFIG_OPTIONS'" + # shellcheck disable=SC2086 + make $MAKE_OPTIONS >>"$MAKE_LOG" 2>&1 || \ + fail "Failed ZFS 'make $MAKE_OPTIONS'" + pass_nonewline + fi +done +printf "\n" +echo "-----------------------------------------------------------------" + +# Either create a new pool using 'zpool create', or alternately restore an +# existing pool from another ZFS implementation for compatibility testing. +for TAG in $POOL_TAGS; do + pool_set_vars "$TAG" + SKIP=0 + + printf "%-16s" "$POOL_TAG" + rm -Rf "$POOL_DIR" + mkdir -p "$POOL_DIR_PRISTINE" + + # Use the existing compressed image if available. + if [ -f "$POOL_BZIP" ]; then + tar -xjf "$POOL_BZIP" -C "$POOL_DIR_PRISTINE" \ + --strip-components=1 || \ + fail "Failed 'tar -xjf $POOL_BZIP" + # Use the installed version to create the pool. + elif [ "$TAG" = "installed" ]; then + pool_create "$TAG" + # A source build is available to create the pool. + elif [ -d "$POOL_DIR_SRC" ]; then + pool_create "$TAG" + else + SKIP=1 + fi + + # Verify 'zpool import' works for all listed source versions. + for SRC_TAG in $SRC_TAGS; do + + if [ $SKIP -eq 1 ]; then + skip_nonewline + continue + fi + + src_set_vars "$SRC_TAG" + if [ "$SRC_TAG" != "installed" ]; then + cd "$ZFS_DIR" || fail "Failed 'cd $ZFS_DIR'" + fi + $ZFS_SH zfs="spa_config_path=$POOL_DIR_COPY" + + cp -a --sparse=always "$POOL_DIR_PRISTINE" \ + "$POOL_DIR_COPY" || \ + fail "Failed to copy $POOL_DIR_PRISTINE to $POOL_DIR_COPY" + POOL_NAME=$($ZPOOL_CMD import -d "$POOL_DIR_COPY" | \ + awk '/pool:/ { print $2; exit 0 }') + + $ZPOOL_CMD import -N -d "$POOL_DIR_COPY" \ + "$POOL_NAME" &>/dev/null + # shellcheck disable=SC2181 + if [ $? -ne 0 ]; then + fail_nonewline + ERROR=1 + else + $ZPOOL_CMD export "$POOL_NAME" || \ + fail "Failed to export pool" + pass_nonewline + fi + + rm -Rf "$POOL_DIR_COPY" + + $ZFS_SH -u || fail "Failed to unload kmods" + done + printf "\n" +done + +if [ "$KEEP" = "no" ]; then + rm -Rf "$TEST_DIR" +fi + +exit $ERROR diff --git a/scripts/zloop.sh b/scripts/zloop.sh new file mode 100755 index 000000000000..3d9baaf0e2b8 --- /dev/null +++ b/scripts/zloop.sh @@ -0,0 +1,304 @@ +#!/usr/bin/env bash + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2015 by Delphix. All rights reserved. +# Copyright (C) 2016 Lawrence Livermore National Security, LLC. +# + +BASE_DIR=$(dirname "$0") +SCRIPT_COMMON=common.sh +if [ -f "${BASE_DIR}/${SCRIPT_COMMON}" ]; then + . "${BASE_DIR}/${SCRIPT_COMMON}" +else + echo "Missing helper script ${SCRIPT_COMMON}" && exit 1 +fi + +# shellcheck disable=SC2034 +PROG=zloop.sh +GDB=${GDB:-gdb} + +DEFAULTWORKDIR=/var/tmp +DEFAULTCOREDIR=/var/tmp/zloop + +function usage +{ + echo -e "\n$0 [-t <timeout>] [-s <size>] [-c <dumpdir>]" \ + "[ -- [extra ztest parameters]]\n" \ + "\n" \ + " This script runs ztest repeatedly with randomized arguments.\n" \ + " If a crash is encountered, the ztest logs, any associated\n" \ + " vdev files, and core file (if one exists) are moved to the\n" \ + " output directory ($DEFAULTCOREDIR by default). Any options\n" \ + " after the -- end-of-options marker will be passed to ztest.\n" \ + "\n" \ + " Options:\n" \ + " -t Total time to loop for, in seconds. If not provided,\n" \ + " zloop runs forever.\n" \ + " -s Size of vdev devices.\n" \ + " -f Specify working directory for ztest vdev files.\n" \ + " -c Specify a core dump directory to use.\n" \ + " -m Max number of core dumps to allow before exiting.\n" \ + " -l Create 'ztest.core.N' symlink to core directory.\n" \ + " -h Print this help message.\n" \ + "" >&2 +} + +function or_die +{ + # shellcheck disable=SC2068 + $@ + # shellcheck disable=SC2181 + if [[ $? -ne 0 ]]; then + # shellcheck disable=SC2145 + echo "Command failed: $@" + exit 1 + fi +} + +case $(uname) in +FreeBSD) + coreglob="z*.core" + ;; +Linux) + # core file helpers + origcorepattern="$(cat /proc/sys/kernel/core_pattern)" + coreglob="$(grep -E -o '^([^|%[:space:]]*)' /proc/sys/kernel/core_pattern)*" + + if [[ $coreglob = "*" ]]; then + echo "Setting core file pattern..."
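+ # A core_pattern that pipes to a helper ('|...') or begins with a '%' + # specifier leaves no usable glob prefix, so fall back to plain 'core' + # files in the current working directory.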
+ echo "core" > /proc/sys/kernel/core_pattern + coreglob="$(grep -E -o '^([^|%[:space:]]*)' \ + /proc/sys/kernel/core_pattern)*" + fi + ;; +*) + exit 1 + ;; +esac + +function core_file +{ + # shellcheck disable=SC2012 disable=2086 + printf "%s" "$(ls -tr1 $coreglob 2> /dev/null | head -1)" +} + +function core_prog +{ + prog=$ZTEST + core_id=$($GDB --batch -c "$1" | grep "Core was generated by" | \ + tr \' ' ') + # shellcheck disable=SC2076 + if [[ "$core_id" =~ "zdb " ]]; then + prog=$ZDB + fi + printf "%s" "$prog" +} + +function store_core +{ + core="$(core_file)" + if [[ $ztrc -ne 0 ]] || [[ -f "$core" ]]; then + df -h "$workdir" >>ztest.out + coreid=$(date "+zloop-%y%m%d-%H%M%S") + foundcrashes=$((foundcrashes + 1)) + + # zdb debugging + zdbcmd="$ZDB -U "$workdir/zpool.cache" -dddMmDDG ztest" + zdbdebug=$($zdbcmd 2>&1) + echo -e "$zdbcmd\n" >>ztest.zdb + echo "$zdbdebug" >>ztest.zdb + + dest=$coredir/$coreid + or_die mkdir -p "$dest" + or_die mkdir -p "$dest/vdev" + + if [[ $symlink -ne 0 ]]; then + or_die ln -sf "$dest" ztest.core.$foundcrashes + fi + + echo "*** ztest crash found - moving logs to $dest" + + or_die mv ztest.history "$dest/" + or_die mv ztest.zdb "$dest/" + or_die mv ztest.out "$dest/" + or_die mv "$workdir/ztest*" "$dest/vdev/" + + if [[ -e "$workdir/zpool.cache" ]]; then + or_die mv "$workdir/zpool.cache" "$dest/vdev/" + fi + + # check for core + if [[ -f "$core" ]]; then + coreprog=$(core_prog "$core") + coredebug=$($GDB --batch --quiet \ + -ex "set print thread-events off" \ + -ex "printf \"*\n* Backtrace \n*\n\"" \ + -ex "bt" \ + -ex "printf \"*\n* Libraries \n*\n\"" \ + -ex "info sharedlib" \ + -ex "printf \"*\n* Threads (full) \n*\n\"" \ + -ex "info threads" \ + -ex "printf \"*\n* Backtraces \n*\n\"" \ + -ex "thread apply all bt" \ + -ex "printf \"*\n* Backtraces (full) \n*\n\"" \ + -ex "thread apply all bt full" \ + -ex "quit" "$coreprog" "$core" 2>&1 | \ + grep -v "New LWP") + + # Dump core + logs to stored directory + echo "$coredebug" >>"$dest/ztest.gdb" + or_die mv "$core" "$dest/" + + # Record info in cores logfile + echo "*** core @ $coredir/$coreid/$core:" | \ + tee -a ztest.cores + fi + + if [[ $coremax -gt 0 ]] && + [[ $foundcrashes -ge $coremax ]]; then + echo "exiting... max $coremax allowed cores" + exit 1 + else + echo "continuing..." + fi + fi +} + +# parse arguments +# expected format: zloop [-t timeout] [-c coredir] [-- extra ztest args] +coredir=$DEFAULTCOREDIR +basedir=$DEFAULTWORKDIR +rundir="zloop-run" +timeout=0 +size="512m" +coremax=0 +symlink=0 +while getopts ":ht:m:s:c:f:l" opt; do + case $opt in + t ) [[ $OPTARG -gt 0 ]] && timeout=$OPTARG ;; + m ) [[ $OPTARG -gt 0 ]] && coremax=$OPTARG ;; + s ) [[ $OPTARG ]] && size=$OPTARG ;; + c ) [[ $OPTARG ]] && coredir=$OPTARG ;; + f ) [[ $OPTARG ]] && basedir=$(readlink -f "$OPTARG") ;; + l ) symlink=1 ;; + h ) usage + exit 2 + ;; + * ) echo "Invalid argument: -$OPTARG"; + usage + exit 1 + esac +done +# pass remaining arguments on to ztest +shift $((OPTIND - 1)) + +# enable core dumps +ulimit -c unlimited +export ASAN_OPTIONS=abort_on_error=1:disable_coredump=0 + +if [[ -f "$(core_file)" ]]; then + echo -n "There's a core dump here you might want to look at first... " + core_file + echo + exit 1 +fi + +if [[ ! -d $coredir ]]; then + echo "core dump directory ($coredir) does not exist, creating it." + or_die mkdir -p "$coredir" +fi + +if [[ ! -w $coredir ]]; then + echo "core dump directory ($coredir) is not writable." 
+ exit 1 +fi + +or_die rm -f ztest.history +or_die rm -f ztest.zdb +or_die rm -f ztest.cores + +ztrc=0 # ztest return value +foundcrashes=0 # number of crashes found so far +starttime=$(date +%s) +curtime=$starttime + +# if no timeout was specified, loop forever. +while [[ $timeout -eq 0 ]] || [[ $curtime -le $((starttime + timeout)) ]]; do + zopt="-G -VVVVV" + + # start each run with an empty directory + workdir="$basedir/$rundir" + or_die rm -rf "$workdir" + or_die mkdir "$workdir" + + # switch between common arrangements & fully randomized + if [[ $((RANDOM % 2)) -eq 0 ]]; then + mirrors=2 + raidz=0 + parity=1 + vdevs=2 + else + mirrors=$(((RANDOM % 3) * 1)) + parity=$(((RANDOM % 3) + 1)) + raidz=$((((RANDOM % 9) + parity + 1) * (RANDOM % 2))) + vdevs=$(((RANDOM % 3) + 3)) + fi + align=$(((RANDOM % 2) * 3 + 9)) + runtime=$((RANDOM % 100)) + passtime=$((RANDOM % (runtime / 3 + 1) + 10)) + + zopt="$zopt -m $mirrors" + zopt="$zopt -r $raidz" + zopt="$zopt -R $parity" + zopt="$zopt -v $vdevs" + zopt="$zopt -a $align" + zopt="$zopt -T $runtime" + zopt="$zopt -P $passtime" + zopt="$zopt -s $size" + zopt="$zopt -f $workdir" + + # shellcheck disable=SC2124 + cmd="$ZTEST $zopt $@" + desc="$(date '+%m/%d %T') $cmd" + echo "$desc" | tee -a ztest.history + echo "$desc" >>ztest.out + $cmd >>ztest.out 2>&1 + ztrc=$? + grep -E '===|WARNING' ztest.out >>ztest.history + + store_core + + curtime=$(date +%s) +done + +echo "zloop finished, $foundcrashes crashes found" + +# restore core pattern. +case $(uname) in +Linux) + echo "$origcorepattern" > /proc/sys/kernel/core_pattern + ;; +*) + ;; +esac + +uptime >>ztest.out + +if [[ $foundcrashes -gt 0 ]]; then + exit 1 +fi diff --git a/scripts/zol2zfs-patch.sed b/scripts/zol2zfs-patch.sed new file mode 100755 index 000000000000..bb6d9faac450 --- /dev/null +++ b/scripts/zol2zfs-patch.sed @@ -0,0 +1,20 @@ +#!/bin/sed -f + +s:cmd:usr/src/cmd:g +s:include/libzfs.h:usr/src/lib/libzfs/common/libzfs.h:g +s:include/libzfs_core.h:usr/src/lib/libzfs_core/common/libzfs_core.h:g +s:include/sys:lib/libzpool/common/sys:g +s:include/sys:usr/src/uts/common/fs/zfs/sys:g +s:include/sys:usr/src/uts/common/sys:g +s:include/zfs_fletcher.h:usr/src/common/zfs/zfs_fletcher.h:g +s:include:usr/src/common/zfs:g +s:lib/libzfs:usr/src/lib/libzfs/common:g +s:lib/libzfs_core:usr/src/lib/libzfs_core/common:g +s:lib/libzpool:lib/libzpool/common:g +s:lib/libzpool:usr/src/lib/libzpool:g +s:man/man5/zpool-features.5:usr/src/man/man5/zpool-features.5:g +s:man/man8/zfs.8:usr/src/man/man1m/zfs.1m:g +s:module/nvpair:usr/src/common/nvpair:g +s:module/zcommon:usr/src/common/zfs/:g +s:module/zfs:usr/src/uts/common/fs/zfs:g +s:tests/zfs-tests:test/zfs-tests:g diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 000000000000..f13b19613b01 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,3 @@ +SUBDIRS = runfiles test-runner zfs-tests + +EXTRA_DIST = README.md diff --git a/tests/README.md b/tests/README.md new file mode 100644 index 000000000000..72b994fa9fa9 --- /dev/null +++ b/tests/README.md @@ -0,0 +1,152 @@ +# ZFS Test Suite README + +1) Building and installing the ZFS Test Suite + +The ZFS Test Suite runs under the test-runner framework. This framework +is built alongside the standard ZFS utilities and is included as part of +the zfs-test package. The zfs-test package can be built from source as follows: + + $ ./configure + $ make pkg-utils + +The resulting packages can be installed using the rpm or dpkg command as +appropriate for your distribution.
Alternately, if you have installed +ZFS from a distribution's repository (not from source), the zfs-test package +may be provided for your distribution. + + - Installed from source + $ rpm -ivh ./zfs-test*.rpm, or + $ dpkg -i ./zfs-test*.deb, + + - Installed from package repository + $ yum install zfs-test + $ apt-get install zfs-test + +2) Running the ZFS Test Suite + +The prerequisites for running the ZFS Test Suite are: + + * Three scratch disks + * Specify the disks you wish to use in the $DISKS variable, as a + space delimited list like this: DISKS='vdb vdc vdd'. By default + the zfs-tests.sh script will construct three loopback devices to + be used for testing: DISKS='loop0 loop1 loop2'. + * A non-root user with a full set of basic privileges and the ability + to sudo(8) to root without a password to run the test. + * Specify any pools you wish to preserve as a space delimited list in + the $KEEP variable. All pools detected at the start of testing are + added automatically. + * The ZFS Test Suite will add users and groups to the test machine to + verify functionality. Therefore it is strongly advised that a + dedicated test machine, which can be a VM, be used for testing. + +Once the prerequisites are satisfied simply run the zfs-tests.sh script: + + $ /usr/share/zfs/zfs-tests.sh + +Alternately, the zfs-tests.sh script can be run from the source tree to allow +developers to rapidly validate their work. In this mode the ZFS utilities and +modules from the source tree will be used (rather than those installed on the +system). In order to avoid certain types of failures you will need to ensure +the ZFS udev rules are installed. This can be done manually or by ensuring +some version of ZFS is installed on the system. + + $ ./scripts/zfs-tests.sh + +The following zfs-tests.sh options are supported: + + -v Verbose zfs-tests.sh output. When specified, additional + information describing the test environment will be logged + prior to invoking test-runner. This includes the runfile + being used, the DISKS targeted, pools to keep, etc. + + -q Quiet test-runner output. When specified, it is passed to + test-runner(1), which causes output to be written to the + console only for tests that do not pass and the results + summary. + + -x Remove all testpools, dm, lo, and files (unsafe). When + specified the script will attempt to remove any leftover + configuration from a previous test run. This includes + destroying any pools named testpool, unused DM devices, + and loopback devices backed by file-vdevs. This operation + can be DANGEROUS because it is possible that the script + will mistakenly remove a resource not related to the testing. + + -k Disable cleanup after test failure. When specified the + zfs-tests.sh script will not perform any additional cleanup + when test-runner exits. This is useful when the results of + a specific test need to be preserved for further analysis. + + -f Use sparse files directly instead of loopback devices for + the testing. When running in this mode certain tests which + depend on real block devices will be skipped. + + -c Only create and populate constrained path + + -I NUM Number of iterations + + -d DIR Create sparse files for vdevs in the DIR directory. By + default these files are created under /var/tmp/.
+ + -s SIZE Use vdevs of SIZE (default: 4G) + + -r RUNFILES Run tests in RUNFILES (default: common.run,linux.run) + + -t PATH Run single test at PATH relative to test suite + + -T TAGS Comma separated list of tags (default: 'functional') + + -u USER Run single test as USER (default: root) + + +The ZFS Test Suite allows the user to specify a subset of the tests via a +runfile or list of tags. + +The format of the runfile is explained in test-runner(1), and +the files that zfs-tests.sh uses are available for reference under +/usr/share/zfs/runfiles. To specify a custom runfile, use the -r option: + + $ /usr/share/zfs/zfs-tests.sh -r my_tests.run + +Otherwise, the user can select a subset of tests to run by passing the +needed tags with the -T option. + +3) Test results + +While the ZFS Test Suite is running, one informational line is printed at the +end of each test, and a results summary is printed at the end of the run. The +results summary includes the location of the complete logs, which is logged in +the form `/var/tmp/test_results/[ISO 8601 date]`. A normal test run launched +with the `zfs-tests.sh` wrapper script will look something like this: + + $ /usr/share/zfs/zfs-tests.sh -v -d /tmp/test + + --- Configuration --- + Runfile: /usr/share/zfs/runfiles/linux.run + STF_TOOLS: /usr/share/zfs/test-runner + STF_SUITE: /usr/share/zfs/zfs-tests + STF_PATH: /var/tmp/constrained_path.G0Sf + FILEDIR: /tmp/test + FILES: /tmp/test/file-vdev0 /tmp/test/file-vdev1 /tmp/test/file-vdev2 + LOOPBACKS: /dev/loop0 /dev/loop1 /dev/loop2 + DISKS: loop0 loop1 loop2 + NUM_DISKS: 3 + FILESIZE: 4G + ITERATIONS: 1 + TAGS: functional + Keep pool(s): rpool + + + /usr/share/zfs/test-runner/bin/test-runner.py -c /usr/share/zfs/runfiles/linux.run \ + -T functional -i /usr/share/zfs/zfs-tests -I 1 + Test: /usr/share/zfs/zfs-tests/tests/functional/arc/setup (run as root) [00:00] [PASS] + ...more than 1100 additional tests... + Test: /usr/share/zfs/zfs-tests/tests/functional/zvol/zvol_swap/cleanup (run as root) [00:00] [PASS] + + Results Summary + SKIP 52 + PASS 1129 + + Running Time: 02:35:33 + Percent passed: 95.6% + Log directory: /var/tmp/test_results/20180515T054509 diff --git a/tests/runfiles/Makefile.am b/tests/runfiles/Makefile.am new file mode 100644 index 000000000000..c7cf2a20c1d5 --- /dev/null +++ b/tests/runfiles/Makefile.am @@ -0,0 +1,8 @@ +pkgdatadir = $(datadir)/@PACKAGE@/runfiles +dist_pkgdata_DATA = \ + common.run \ + freebsd.run \ + linux.run \ + longevity.run \ + perf-regression.run \ + sunos.run diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run new file mode 100644 index 000000000000..615c4efc461b --- /dev/null +++ b/tests/runfiles/common.run @@ -0,0 +1,900 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL.
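+# + +# The runfile is INI-style: each section below names a test group directory +# and lists the scripts to run there, for example (hypothetical group shown +# for illustration only): +# +# [tests/functional/example] +# tests = ['example_001_pos'] +# tags = ['functional', 'example']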
+# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/alloc_class] +tests = ['alloc_class_001_pos', 'alloc_class_002_neg', 'alloc_class_003_pos', + 'alloc_class_004_pos', 'alloc_class_005_pos', 'alloc_class_006_pos', + 'alloc_class_007_pos', 'alloc_class_008_pos', 'alloc_class_009_pos', + 'alloc_class_010_pos', 'alloc_class_011_neg', 'alloc_class_012_pos', + 'alloc_class_013_pos'] +tags = ['functional', 'alloc_class'] + +[tests/functional/atime] +tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] +tags = ['functional', 'atime'] + +[tests/functional/bootfs] +tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', + 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', + 'bootfs_008_pos'] +tags = ['functional', 'bootfs'] + +[tests/functional/btree] +tests = ['btree_positive', 'btree_negative'] +tags = ['functional', 'btree'] +pre = +post = + +[tests/functional/cache] +tests = ['cache_001_pos', 'cache_002_pos', 'cache_003_pos', 'cache_004_neg', + 'cache_005_neg', 'cache_006_pos', 'cache_007_neg', 'cache_008_neg', + 'cache_009_pos', 'cache_010_pos', 'cache_011_pos', 'cache_012_pos'] +tags = ['functional', 'cache'] + +[tests/functional/cachefile] +tests = ['cachefile_001_pos', 'cachefile_002_pos', 'cachefile_003_pos', + 'cachefile_004_pos'] +tags = ['functional', 'cachefile'] + +[tests/functional/casenorm] +tests = ['case_all_values', 'norm_all_values', 'mixed_create_failure', + 'sensitive_none_lookup', 'sensitive_none_delete', + 'sensitive_formd_lookup', 'sensitive_formd_delete', + 'insensitive_none_lookup', 'insensitive_none_delete', + 'insensitive_formd_lookup', 'insensitive_formd_delete', + 'mixed_none_lookup', 'mixed_none_lookup_ci', 'mixed_none_delete', + 'mixed_formd_lookup', 'mixed_formd_lookup_ci', 'mixed_formd_delete'] +tags = ['functional', 'casenorm'] + +[tests/functional/channel_program/lua_core] +tests = ['tst.args_to_lua', 'tst.divide_by_zero', 'tst.exists', + 'tst.integer_illegal', 'tst.integer_overflow', 'tst.language_functions_neg', + 'tst.language_functions_pos', 'tst.large_prog', 'tst.libraries', + 'tst.memory_limit', 'tst.nested_neg', 'tst.nested_pos', 'tst.nvlist_to_lua', + 'tst.recursive_neg', 'tst.recursive_pos', 'tst.return_large', + 'tst.return_nvlist_neg', 'tst.return_nvlist_pos', + 'tst.return_recursive_table', 'tst.stack_gsub', 'tst.timeout'] +tags = ['functional', 'channel_program', 'lua_core'] + +[tests/functional/channel_program/synctask_core] +tests = ['tst.destroy_fs', 'tst.destroy_snap', 'tst.get_count_and_limit', + 'tst.get_index_props', 'tst.get_mountpoint', 'tst.get_neg', + 'tst.get_number_props', 'tst.get_string_props', 'tst.get_type', + 'tst.get_userquota', 'tst.get_written', 'tst.inherit', 'tst.list_bookmarks', + 'tst.list_children', 'tst.list_clones', 'tst.list_holds', + 'tst.list_snapshots', 'tst.list_system_props', + 'tst.list_user_props', 'tst.parse_args_neg','tst.promote_conflict', + 'tst.promote_multiple', 'tst.promote_simple', 'tst.rollback_mult', + 'tst.rollback_one', 'tst.set_props', 'tst.snapshot_destroy', 'tst.snapshot_neg', + 'tst.snapshot_recursive', 'tst.snapshot_simple', + 'tst.bookmark.create', 'tst.bookmark.copy', + 'tst.terminate_by_signal' + ] +tags = ['functional', 'channel_program', 'synctask_core'] + +[tests/functional/checksum] +tests = ['run_sha2_test', 'run_skein_test', 
'filetest_001_pos'] +tags = ['functional', 'checksum'] + +[tests/functional/clean_mirror] +tests = [ 'clean_mirror_001_pos', 'clean_mirror_002_pos', + 'clean_mirror_003_pos', 'clean_mirror_004_pos'] +tags = ['functional', 'clean_mirror'] + +[tests/functional/cli_root/zdb] +tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', + 'zdb_006_pos', 'zdb_args_neg', 'zdb_args_pos', + 'zdb_block_size_histogram', 'zdb_checksum', 'zdb_decompress', + 'zdb_display_block', 'zdb_object_range_neg', 'zdb_object_range_pos', + 'zdb_objset_id', 'zdb_decompress_zstd'] +pre = +post = +tags = ['functional', 'cli_root', 'zdb'] + +[tests/functional/cli_root/zfs] +tests = ['zfs_001_neg', 'zfs_002_pos'] +tags = ['functional', 'cli_root', 'zfs'] + +[tests/functional/cli_root/zfs_bookmark] +tests = ['zfs_bookmark_cliargs'] +tags = ['functional', 'cli_root', 'zfs_bookmark'] + +[tests/functional/cli_root/zfs_change-key] +tests = ['zfs_change-key', 'zfs_change-key_child', 'zfs_change-key_format', + 'zfs_change-key_inherit', 'zfs_change-key_load', 'zfs_change-key_location', + 'zfs_change-key_pbkdf2iters', 'zfs_change-key_clones'] +tags = ['functional', 'cli_root', 'zfs_change-key'] + +[tests/functional/cli_root/zfs_clone] +tests = ['zfs_clone_001_neg', 'zfs_clone_002_pos', 'zfs_clone_003_pos', + 'zfs_clone_004_pos', 'zfs_clone_005_pos', 'zfs_clone_006_pos', + 'zfs_clone_007_pos', 'zfs_clone_008_neg', 'zfs_clone_009_neg', + 'zfs_clone_010_pos', 'zfs_clone_encrypted', 'zfs_clone_deeply_nested'] +tags = ['functional', 'cli_root', 'zfs_clone'] + +[tests/functional/cli_root/zfs_copies] +tests = ['zfs_copies_001_pos', 'zfs_copies_002_pos', 'zfs_copies_003_pos', + 'zfs_copies_004_neg', 'zfs_copies_005_neg', 'zfs_copies_006_pos'] +tags = ['functional', 'cli_root', 'zfs_copies'] + +[tests/functional/cli_root/zfs_create] +tests = ['zfs_create_001_pos', 'zfs_create_002_pos', 'zfs_create_003_pos', + 'zfs_create_004_pos', 'zfs_create_005_pos', 'zfs_create_006_pos', + 'zfs_create_007_pos', 'zfs_create_008_neg', 'zfs_create_009_neg', + 'zfs_create_010_neg', 'zfs_create_011_pos', 'zfs_create_012_pos', + 'zfs_create_013_pos', 'zfs_create_014_pos', 'zfs_create_encrypted', + 'zfs_create_crypt_combos', 'zfs_create_dryrun', 'zfs_create_verbose'] +tags = ['functional', 'cli_root', 'zfs_create'] + +[tests/functional/cli_root/zfs_destroy] +tests = ['zfs_clone_livelist_condense_and_disable', + 'zfs_clone_livelist_condense_races', 'zfs_destroy_001_pos', + 'zfs_destroy_002_pos', 'zfs_destroy_003_pos', + 'zfs_destroy_004_pos', 'zfs_destroy_005_neg', 'zfs_destroy_006_neg', + 'zfs_destroy_007_neg', 'zfs_destroy_008_pos', 'zfs_destroy_009_pos', + 'zfs_destroy_010_pos', 'zfs_destroy_011_pos', 'zfs_destroy_012_pos', + 'zfs_destroy_013_neg', 'zfs_destroy_014_pos', 'zfs_destroy_015_pos', + 'zfs_destroy_016_pos', 'zfs_destroy_clone_livelist', + 'zfs_destroy_dev_removal', 'zfs_destroy_dev_removal_condense'] +tags = ['functional', 'cli_root', 'zfs_destroy'] + +[tests/functional/cli_root/zfs_diff] +tests = ['zfs_diff_changes', 'zfs_diff_cliargs', 'zfs_diff_timestamp', + 'zfs_diff_types', 'zfs_diff_encrypted'] +tags = ['functional', 'cli_root', 'zfs_diff'] + +[tests/functional/cli_root/zfs_get] +tests = ['zfs_get_001_pos', 'zfs_get_002_pos', 'zfs_get_003_pos', + 'zfs_get_004_pos', 'zfs_get_005_neg', 'zfs_get_006_neg', 'zfs_get_007_neg', + 'zfs_get_008_pos', 'zfs_get_009_pos', 'zfs_get_010_neg'] +tags = ['functional', 'cli_root', 'zfs_get'] + +[tests/functional/cli_root/zfs_ids_to_path] +tests = ['zfs_ids_to_path_001_pos'] +tags = ['functional', 
'cli_root', 'zfs_ids_to_path'] + +[tests/functional/cli_root/zfs_inherit] +tests = ['zfs_inherit_001_neg', 'zfs_inherit_002_neg', 'zfs_inherit_003_pos', + 'zfs_inherit_mountpoint'] +tags = ['functional', 'cli_root', 'zfs_inherit'] + +[tests/functional/cli_root/zfs_load-key] +tests = ['zfs_load-key', 'zfs_load-key_all', 'zfs_load-key_file', + 'zfs_load-key_location', 'zfs_load-key_noop', 'zfs_load-key_recursive'] +tags = ['functional', 'cli_root', 'zfs_load-key'] + +[tests/functional/cli_root/zfs_mount] +tests = ['zfs_mount_001_pos', 'zfs_mount_002_pos', 'zfs_mount_003_pos', + 'zfs_mount_004_pos', 'zfs_mount_005_pos', 'zfs_mount_007_pos', + 'zfs_mount_009_neg', 'zfs_mount_010_neg', 'zfs_mount_011_neg', + 'zfs_mount_012_pos', 'zfs_mount_all_001_pos', 'zfs_mount_encrypted', + 'zfs_mount_remount', 'zfs_mount_all_fail', 'zfs_mount_all_mountpoints', + 'zfs_mount_test_race'] +tags = ['functional', 'cli_root', 'zfs_mount'] + +[tests/functional/cli_root/zfs_program] +tests = ['zfs_program_json'] +tags = ['functional', 'cli_root', 'zfs_program'] + +[tests/functional/cli_root/zfs_promote] +tests = ['zfs_promote_001_pos', 'zfs_promote_002_pos', 'zfs_promote_003_pos', + 'zfs_promote_004_pos', 'zfs_promote_005_pos', 'zfs_promote_006_neg', + 'zfs_promote_007_neg', 'zfs_promote_008_pos', 'zfs_promote_encryptionroot'] +tags = ['functional', 'cli_root', 'zfs_promote'] + +[tests/functional/cli_root/zfs_property] +tests = ['zfs_written_property_001_pos'] +tags = ['functional', 'cli_root', 'zfs_property'] + +[tests/functional/cli_root/zfs_receive] +tests = ['zfs_receive_001_pos', 'zfs_receive_002_pos', 'zfs_receive_003_pos', + 'zfs_receive_004_neg', 'zfs_receive_005_neg', 'zfs_receive_006_pos', + 'zfs_receive_007_neg', 'zfs_receive_008_pos', 'zfs_receive_009_neg', + 'zfs_receive_010_pos', 'zfs_receive_011_pos', 'zfs_receive_012_pos', + 'zfs_receive_013_pos', 'zfs_receive_014_pos', 'zfs_receive_015_pos', + 'zfs_receive_016_pos', 'receive-o-x_props_override', + 'zfs_receive_from_encrypted', 'zfs_receive_to_encrypted', + 'zfs_receive_raw', 'zfs_receive_raw_incremental', 'zfs_receive_-e', + 'zfs_receive_raw_-d', 'zfs_receive_from_zstd'] +tags = ['functional', 'cli_root', 'zfs_receive'] + +[tests/functional/cli_root/zfs_rename] +tests = ['zfs_rename_001_pos', 'zfs_rename_002_pos', 'zfs_rename_003_pos', + 'zfs_rename_004_neg', 'zfs_rename_005_neg', 'zfs_rename_006_pos', + 'zfs_rename_007_pos', 'zfs_rename_008_pos', 'zfs_rename_009_neg', + 'zfs_rename_010_neg', 'zfs_rename_011_pos', 'zfs_rename_012_neg', + 'zfs_rename_013_pos', 'zfs_rename_014_neg', 'zfs_rename_encrypted_child', + 'zfs_rename_to_encrypted', 'zfs_rename_mountpoint'] +tags = ['functional', 'cli_root', 'zfs_rename'] + +[tests/functional/cli_root/zfs_reservation] +tests = ['zfs_reservation_001_pos', 'zfs_reservation_002_pos'] +tags = ['functional', 'cli_root', 'zfs_reservation'] + +[tests/functional/cli_root/zfs_rollback] +tests = ['zfs_rollback_001_pos', 'zfs_rollback_002_pos', + 'zfs_rollback_003_neg', 'zfs_rollback_004_neg'] +tags = ['functional', 'cli_root', 'zfs_rollback'] + +[tests/functional/cli_root/zfs_send] +tests = ['zfs_send_001_pos', 'zfs_send_002_pos', 'zfs_send_003_pos', + 'zfs_send_004_neg', 'zfs_send_005_pos', 'zfs_send_006_pos', + 'zfs_send_007_pos', 'zfs_send_encrypted', 'zfs_send_raw', + 'zfs_send_sparse', 'zfs_send-b'] +tags = ['functional', 'cli_root', 'zfs_send'] + +[tests/functional/cli_root/zfs_set] +tests = ['cache_001_pos', 'cache_002_neg', 'canmount_001_pos', + 'canmount_002_pos', 'canmount_003_pos', 'canmount_004_pos', + 
'checksum_001_pos', 'compression_001_pos', 'mountpoint_001_pos', + 'mountpoint_002_pos', 'reservation_001_neg', 'user_property_002_pos', + 'share_mount_001_neg', 'snapdir_001_pos', 'onoffs_001_pos', + 'user_property_001_pos', 'user_property_003_neg', 'readonly_001_pos', + 'user_property_004_pos', 'version_001_neg', 'zfs_set_001_neg', + 'zfs_set_002_neg', 'zfs_set_003_neg', 'property_alias_001_pos', + 'mountpoint_003_pos', 'ro_props_001_pos', 'zfs_set_keylocation', + 'zfs_set_feature_activation'] +tags = ['functional', 'cli_root', 'zfs_set'] + +[tests/functional/cli_root/zfs_share] +tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', + 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', + 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] +tags = ['functional', 'cli_root', 'zfs_share'] + +[tests/functional/cli_root/zfs_snapshot] +tests = ['zfs_snapshot_001_neg', 'zfs_snapshot_002_neg', + 'zfs_snapshot_003_neg', 'zfs_snapshot_004_neg', 'zfs_snapshot_005_neg', + 'zfs_snapshot_006_pos', 'zfs_snapshot_007_neg', 'zfs_snapshot_008_neg', + 'zfs_snapshot_009_pos'] +tags = ['functional', 'cli_root', 'zfs_snapshot'] + +[tests/functional/cli_root/zfs_unload-key] +tests = ['zfs_unload-key', 'zfs_unload-key_all', 'zfs_unload-key_recursive'] +tags = ['functional', 'cli_root', 'zfs_unload-key'] + +[tests/functional/cli_root/zfs_unmount] +tests = ['zfs_unmount_001_pos', 'zfs_unmount_002_pos', 'zfs_unmount_003_pos', + 'zfs_unmount_004_pos', 'zfs_unmount_005_pos', 'zfs_unmount_006_pos', + 'zfs_unmount_007_neg', 'zfs_unmount_008_neg', 'zfs_unmount_009_pos', + 'zfs_unmount_all_001_pos', 'zfs_unmount_nested', 'zfs_unmount_unload_keys'] +tags = ['functional', 'cli_root', 'zfs_unmount'] + +[tests/functional/cli_root/zfs_unshare] +tests = ['zfs_unshare_001_pos', 'zfs_unshare_002_pos', 'zfs_unshare_003_pos', + 'zfs_unshare_004_neg', 'zfs_unshare_005_neg', 'zfs_unshare_006_pos', + 'zfs_unshare_007_pos'] +tags = ['functional', 'cli_root', 'zfs_unshare'] + +[tests/functional/cli_root/zfs_upgrade] +tests = ['zfs_upgrade_001_pos', 'zfs_upgrade_002_pos', 'zfs_upgrade_003_pos', + 'zfs_upgrade_004_pos', 'zfs_upgrade_005_pos', 'zfs_upgrade_006_neg', + 'zfs_upgrade_007_neg'] +tags = ['functional', 'cli_root', 'zfs_upgrade'] + +[tests/functional/cli_root/zfs_wait] +tests = ['zfs_wait_deleteq'] +tags = ['functional', 'cli_root', 'zfs_wait'] + +[tests/functional/cli_root/zpool] +tests = ['zpool_001_neg', 'zpool_002_pos', 'zpool_003_pos', 'zpool_colors'] +tags = ['functional', 'cli_root', 'zpool'] + +[tests/functional/cli_root/zpool_add] +tests = ['zpool_add_001_pos', 'zpool_add_002_pos', 'zpool_add_003_pos', + 'zpool_add_004_pos', 'zpool_add_006_pos', 'zpool_add_007_neg', + 'zpool_add_008_neg', 'zpool_add_009_neg', 'zpool_add_010_pos', + 'add-o_ashift', 'add_prop_ashift'] +tags = ['functional', 'cli_root', 'zpool_add'] + +[tests/functional/cli_root/zpool_attach] +tests = ['zpool_attach_001_neg', 'attach-o_ashift'] +tags = ['functional', 'cli_root', 'zpool_attach'] + +[tests/functional/cli_root/zpool_clear] +tests = ['zpool_clear_001_pos', 'zpool_clear_002_neg', 'zpool_clear_003_neg', + 'zpool_clear_readonly'] +tags = ['functional', 'cli_root', 'zpool_clear'] + +[tests/functional/cli_root/zpool_create] +tests = ['zpool_create_001_pos', 'zpool_create_002_pos', + 'zpool_create_003_pos', 'zpool_create_004_pos', 'zpool_create_005_pos', + 'zpool_create_006_pos', 'zpool_create_007_neg', 'zpool_create_008_pos', + 'zpool_create_009_neg', 'zpool_create_010_neg', 'zpool_create_011_neg', 
+ 'zpool_create_012_neg', 'zpool_create_014_neg', 'zpool_create_015_neg', + 'zpool_create_017_neg', 'zpool_create_018_pos', 'zpool_create_019_pos', + 'zpool_create_020_pos', 'zpool_create_021_pos', 'zpool_create_022_pos', + 'zpool_create_023_neg', 'zpool_create_024_pos', + 'zpool_create_encrypted', 'zpool_create_crypt_combos', + 'zpool_create_features_001_pos', 'zpool_create_features_002_pos', + 'zpool_create_features_003_pos', 'zpool_create_features_004_neg', + 'zpool_create_features_005_pos', + 'create-o_ashift', 'zpool_create_tempname'] +tags = ['functional', 'cli_root', 'zpool_create'] + +[tests/functional/cli_root/zpool_destroy] +tests = ['zpool_destroy_001_pos', 'zpool_destroy_002_pos', + 'zpool_destroy_003_neg'] +pre = +post = +tags = ['functional', 'cli_root', 'zpool_destroy'] + +[tests/functional/cli_root/zpool_detach] +tests = ['zpool_detach_001_neg'] +tags = ['functional', 'cli_root', 'zpool_detach'] + +[tests/functional/cli_root/zpool_events] +tests = ['zpool_events_clear', 'zpool_events_cliargs', 'zpool_events_follow', + 'zpool_events_poolname', 'zpool_events_errors'] +tags = ['functional', 'cli_root', 'zpool_events'] + +[tests/functional/cli_root/zpool_export] +tests = ['zpool_export_001_pos', 'zpool_export_002_pos', + 'zpool_export_003_neg', 'zpool_export_004_pos'] +tags = ['functional', 'cli_root', 'zpool_export'] + +[tests/functional/cli_root/zpool_get] +tests = ['zpool_get_001_pos', 'zpool_get_002_pos', 'zpool_get_003_pos', + 'zpool_get_004_neg', 'zpool_get_005_pos'] +tags = ['functional', 'cli_root', 'zpool_get'] + +[tests/functional/cli_root/zpool_history] +tests = ['zpool_history_001_neg', 'zpool_history_002_pos'] +tags = ['functional', 'cli_root', 'zpool_history'] + +[tests/functional/cli_root/zpool_import] +tests = ['zpool_import_001_pos', 'zpool_import_002_pos', + 'zpool_import_003_pos', 'zpool_import_004_pos', 'zpool_import_005_pos', + 'zpool_import_006_pos', 'zpool_import_007_pos', 'zpool_import_008_pos', + 'zpool_import_009_neg', 'zpool_import_010_pos', 'zpool_import_011_neg', + 'zpool_import_012_pos', 'zpool_import_013_neg', 'zpool_import_014_pos', + 'zpool_import_015_pos', + 'zpool_import_features_001_pos', 'zpool_import_features_002_neg', + 'zpool_import_features_003_pos', 'zpool_import_missing_001_pos', + 'zpool_import_missing_002_pos', 'zpool_import_missing_003_pos', + 'zpool_import_rename_001_pos', 'zpool_import_all_001_pos', + 'zpool_import_encrypted', 'zpool_import_encrypted_load', + 'zpool_import_errata3', 'zpool_import_errata4', + 'import_cachefile_device_added', + 'import_cachefile_device_removed', + 'import_cachefile_device_replaced', + 'import_cachefile_mirror_attached', + 'import_cachefile_mirror_detached', + 'import_cachefile_shared_device', + 'import_devices_missing', + 'import_paths_changed', + 'import_rewind_config_changed', + 'import_rewind_device_replaced'] +tags = ['functional', 'cli_root', 'zpool_import'] + +[tests/functional/cli_root/zpool_labelclear] +tests = ['zpool_labelclear_active', 'zpool_labelclear_exported', + 'zpool_labelclear_removed', 'zpool_labelclear_valid'] +pre = +post = +tags = ['functional', 'cli_root', 'zpool_labelclear'] + +[tests/functional/cli_root/zpool_initialize] +tests = ['zpool_initialize_attach_detach_add_remove', + 'zpool_initialize_import_export', + 'zpool_initialize_offline_export_import_online', + 'zpool_initialize_online_offline', + 'zpool_initialize_split', + 'zpool_initialize_start_and_cancel_neg', + 'zpool_initialize_start_and_cancel_pos', + 'zpool_initialize_suspend_resume', + 
'zpool_initialize_unsupported_vdevs', + 'zpool_initialize_verify_checksums', + 'zpool_initialize_verify_initialized'] +pre = +tags = ['functional', 'cli_root', 'zpool_initialize'] + +[tests/functional/cli_root/zpool_offline] +tests = ['zpool_offline_001_pos', 'zpool_offline_002_neg', + 'zpool_offline_003_pos'] +tags = ['functional', 'cli_root', 'zpool_offline'] + +[tests/functional/cli_root/zpool_online] +tests = ['zpool_online_001_pos', 'zpool_online_002_neg'] +tags = ['functional', 'cli_root', 'zpool_online'] + +[tests/functional/cli_root/zpool_remove] +tests = ['zpool_remove_001_neg', 'zpool_remove_002_pos', + 'zpool_remove_003_pos'] +tags = ['functional', 'cli_root', 'zpool_remove'] + +[tests/functional/cli_root/zpool_replace] +tests = ['zpool_replace_001_neg', 'replace-o_ashift', 'replace_prop_ashift'] +tags = ['functional', 'cli_root', 'zpool_replace'] + +[tests/functional/cli_root/zpool_resilver] +tests = ['zpool_resilver_bad_args', 'zpool_resilver_restart'] +tags = ['functional', 'cli_root', 'zpool_resilver'] + +[tests/functional/cli_root/zpool_scrub] +tests = ['zpool_scrub_001_neg', 'zpool_scrub_002_pos', 'zpool_scrub_003_pos', + 'zpool_scrub_004_pos', 'zpool_scrub_005_pos', + 'zpool_scrub_encrypted_unloaded', 'zpool_scrub_print_repairing', + 'zpool_scrub_offline_device', 'zpool_scrub_multiple_copies'] +tags = ['functional', 'cli_root', 'zpool_scrub'] + +[tests/functional/cli_root/zpool_set] +tests = ['zpool_set_001_pos', 'zpool_set_002_neg', 'zpool_set_003_neg', + 'zpool_set_ashift', 'zpool_set_features'] +tags = ['functional', 'cli_root', 'zpool_set'] + +[tests/functional/cli_root/zpool_split] +tests = ['zpool_split_cliargs', 'zpool_split_devices', + 'zpool_split_encryption', 'zpool_split_props', 'zpool_split_vdevs', + 'zpool_split_resilver', 'zpool_split_indirect'] +tags = ['functional', 'cli_root', 'zpool_split'] + +[tests/functional/cli_root/zpool_status] +tests = ['zpool_status_001_pos', 'zpool_status_002_pos'] +tags = ['functional', 'cli_root', 'zpool_status'] + +[tests/functional/cli_root/zpool_sync] +tests = ['zpool_sync_001_pos', 'zpool_sync_002_neg'] +tags = ['functional', 'cli_root', 'zpool_sync'] + +[tests/functional/cli_root/zpool_trim] +tests = ['zpool_trim_attach_detach_add_remove', + 'zpool_trim_import_export', 'zpool_trim_multiple', 'zpool_trim_neg', + 'zpool_trim_offline_export_import_online', 'zpool_trim_online_offline', + 'zpool_trim_partial', 'zpool_trim_rate', 'zpool_trim_rate_neg', + 'zpool_trim_secure', 'zpool_trim_split', 'zpool_trim_start_and_cancel_neg', + 'zpool_trim_start_and_cancel_pos', 'zpool_trim_suspend_resume', + 'zpool_trim_unsupported_vdevs', 'zpool_trim_verify_checksums', + 'zpool_trim_verify_trimmed'] +tags = ['functional', 'zpool_trim'] + +[tests/functional/cli_root/zpool_upgrade] +tests = ['zpool_upgrade_001_pos', 'zpool_upgrade_002_pos', + 'zpool_upgrade_003_pos', 'zpool_upgrade_004_pos', + 'zpool_upgrade_005_neg', 'zpool_upgrade_006_neg', + 'zpool_upgrade_007_pos', 'zpool_upgrade_008_pos', + 'zpool_upgrade_009_neg'] +tags = ['functional', 'cli_root', 'zpool_upgrade'] + +[tests/functional/cli_root/zpool_wait] +tests = ['zpool_wait_discard', 'zpool_wait_freeing', + 'zpool_wait_initialize_basic', 'zpool_wait_initialize_cancel', + 'zpool_wait_initialize_flag', 'zpool_wait_multiple', + 'zpool_wait_no_activity', 'zpool_wait_remove', 'zpool_wait_remove_cancel', + 'zpool_wait_trim_basic', 'zpool_wait_trim_cancel', 'zpool_wait_trim_flag', + 'zpool_wait_usage'] +tags = ['functional', 'cli_root', 'zpool_wait'] + 
+[tests/functional/cli_root/zpool_wait/scan] +tests = ['zpool_wait_replace_cancel', 'zpool_wait_rebuild', + 'zpool_wait_resilver', 'zpool_wait_scrub_cancel', + 'zpool_wait_replace', 'zpool_wait_scrub_basic', 'zpool_wait_scrub_flag'] +tags = ['functional', 'cli_root', 'zpool_wait'] + +[tests/functional/cli_user/misc] +tests = ['zdb_001_neg', 'zfs_001_neg', 'zfs_allow_001_neg', + 'zfs_clone_001_neg', 'zfs_create_001_neg', 'zfs_destroy_001_neg', + 'zfs_get_001_neg', 'zfs_inherit_001_neg', 'zfs_mount_001_neg', + 'zfs_promote_001_neg', 'zfs_receive_001_neg', 'zfs_rename_001_neg', + 'zfs_rollback_001_neg', 'zfs_send_001_neg', 'zfs_set_001_neg', + 'zfs_share_001_neg', 'zfs_snapshot_001_neg', 'zfs_unallow_001_neg', + 'zfs_unmount_001_neg', 'zfs_unshare_001_neg', 'zfs_upgrade_001_neg', + 'zpool_001_neg', 'zpool_add_001_neg', 'zpool_attach_001_neg', + 'zpool_clear_001_neg', 'zpool_create_001_neg', 'zpool_destroy_001_neg', + 'zpool_detach_001_neg', 'zpool_export_001_neg', 'zpool_get_001_neg', + 'zpool_history_001_neg', 'zpool_import_001_neg', 'zpool_import_002_neg', + 'zpool_offline_001_neg', 'zpool_online_001_neg', 'zpool_remove_001_neg', + 'zpool_replace_001_neg', 'zpool_scrub_001_neg', 'zpool_set_001_neg', + 'zpool_status_001_neg', 'zpool_upgrade_001_neg', 'arcstat_001_pos', + 'arc_summary_001_pos', 'arc_summary_002_neg', 'zpool_wait_privilege'] +user = +tags = ['functional', 'cli_user', 'misc'] + +[tests/functional/cli_user/zfs_list] +tests = ['zfs_list_001_pos', 'zfs_list_002_pos', 'zfs_list_003_pos', + 'zfs_list_004_neg', 'zfs_list_007_pos', 'zfs_list_008_neg'] +user = +tags = ['functional', 'cli_user', 'zfs_list'] + +[tests/functional/cli_user/zpool_iostat] +tests = ['zpool_iostat_001_neg', 'zpool_iostat_002_pos', + 'zpool_iostat_003_neg', 'zpool_iostat_004_pos', + 'zpool_iostat_005_pos', 'zpool_iostat_-c_disable', + 'zpool_iostat_-c_homedir', 'zpool_iostat_-c_searchpath'] +user = +tags = ['functional', 'cli_user', 'zpool_iostat'] + +[tests/functional/cli_user/zpool_list] +tests = ['zpool_list_001_pos', 'zpool_list_002_neg'] +user = +tags = ['functional', 'cli_user', 'zpool_list'] + +[tests/functional/cli_user/zpool_status] +tests = ['zpool_status_003_pos', 'zpool_status_-c_disable', + 'zpool_status_-c_homedir', 'zpool_status_-c_searchpath'] +user = +tags = ['functional', 'cli_user', 'zpool_status'] + +[tests/functional/compression] +tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', + 'l2arc_compressed_arc', 'l2arc_compressed_arc_disabled', + 'l2arc_encrypted', 'l2arc_encrypted_no_compressed_arc'] +tags = ['functional', 'compression'] + +[tests/functional/cp_files] +tests = ['cp_files_001_pos'] +tags = ['functional', 'cp_files'] + +[tests/functional/ctime] +tests = ['ctime_001_pos' ] +tags = ['functional', 'ctime'] + +[tests/functional/delegate] +tests = ['zfs_allow_001_pos', 'zfs_allow_002_pos', 'zfs_allow_003_pos', + 'zfs_allow_004_pos', 'zfs_allow_005_pos', 'zfs_allow_006_pos', + 'zfs_allow_007_pos', 'zfs_allow_008_pos', 'zfs_allow_009_neg', + 'zfs_allow_010_pos', 'zfs_allow_011_neg', 'zfs_allow_012_neg', + 'zfs_unallow_001_pos', 'zfs_unallow_002_pos', 'zfs_unallow_003_pos', + 'zfs_unallow_004_pos', 'zfs_unallow_005_pos', 'zfs_unallow_006_pos', + 'zfs_unallow_007_neg', 'zfs_unallow_008_neg'] +tags = ['functional', 'delegate'] + +[tests/functional/exec] +tests = ['exec_001_pos', 'exec_002_neg'] +tags = ['functional', 'exec'] + +[tests/functional/features/async_destroy] +tests = ['async_destroy_001_pos'] +tags = ['functional', 'features', 'async_destroy'] + 
+[tests/functional/features/large_dnode] +tests = ['large_dnode_001_pos', 'large_dnode_003_pos', 'large_dnode_004_neg', + 'large_dnode_005_pos', 'large_dnode_007_neg', 'large_dnode_009_pos'] +tags = ['functional', 'features', 'large_dnode'] + +[tests/functional/grow] +pre = +post = +tests = ['grow_pool_001_pos', 'grow_replicas_001_pos'] +tags = ['functional', 'grow'] + +[tests/functional/history] +tests = ['history_001_pos', 'history_002_pos', 'history_003_pos', + 'history_004_pos', 'history_005_neg', 'history_006_neg', + 'history_007_pos', 'history_008_pos', 'history_009_pos', + 'history_010_pos'] +tags = ['functional', 'history'] + +[tests/functional/hkdf] +tests = ['run_hkdf_test'] +tags = ['functional', 'hkdf'] + +[tests/functional/inheritance] +tests = ['inherit_001_pos'] +pre = +tags = ['functional', 'inheritance'] + +[tests/functional/io] +tests = ['sync', 'psync', 'posixaio', 'mmap'] +tags = ['functional', 'io'] + +[tests/functional/inuse] +tests = ['inuse_004_pos', 'inuse_005_pos', 'inuse_008_pos', 'inuse_009_pos'] +post = +tags = ['functional', 'inuse'] + +[tests/functional/large_files] +tests = ['large_files_001_pos', 'large_files_002_pos'] +tags = ['functional', 'large_files'] + +[tests/functional/largest_pool] +tests = ['largest_pool_001_pos'] +pre = +post = +tags = ['functional', 'largest_pool'] + +[tests/functional/limits] +tests = ['filesystem_count', 'filesystem_limit', 'snapshot_count', + 'snapshot_limit'] +tags = ['functional', 'limits'] + +[tests/functional/link_count] +tests = ['link_count_001', 'link_count_root_inode'] +tags = ['functional', 'link_count'] + +[tests/functional/migration] +tests = ['migration_001_pos', 'migration_002_pos', 'migration_003_pos', + 'migration_004_pos', 'migration_005_pos', 'migration_006_pos', + 'migration_007_pos', 'migration_008_pos', 'migration_009_pos', + 'migration_010_pos', 'migration_011_pos', 'migration_012_pos'] +tags = ['functional', 'migration'] + +[tests/functional/mmap] +tests = ['mmap_write_001_pos', 'mmap_read_001_pos'] +tags = ['functional', 'mmap'] + +[tests/functional/mount] +tests = ['umount_001', 'umountall_001'] +tags = ['functional', 'mount'] + +[tests/functional/mv_files] +tests = ['mv_files_001_pos', 'mv_files_002_pos', 'random_creation'] +tags = ['functional', 'mv_files'] + +[tests/functional/nestedfs] +tests = ['nestedfs_001_pos'] +tags = ['functional', 'nestedfs'] + +[tests/functional/no_space] +tests = ['enospc_001_pos', 'enospc_002_pos', 'enospc_003_pos', + 'enospc_df'] +tags = ['functional', 'no_space'] + +[tests/functional/nopwrite] +tests = ['nopwrite_copies', 'nopwrite_mtime', 'nopwrite_negative', + 'nopwrite_promoted_clone', 'nopwrite_recsize', 'nopwrite_sync', + 'nopwrite_varying_compression', 'nopwrite_volume'] +tags = ['functional', 'nopwrite'] + +[tests/functional/online_offline] +tests = ['online_offline_001_pos', 'online_offline_002_neg', + 'online_offline_003_neg'] +tags = ['functional', 'online_offline'] + +[tests/functional/persist_l2arc] +tests = ['persist_l2arc_001_pos', 'persist_l2arc_002_pos', + 'persist_l2arc_003_neg', 'persist_l2arc_004_pos', 'persist_l2arc_005_pos', + 'persist_l2arc_006_pos', 'persist_l2arc_007_pos', 'persist_l2arc_008_pos'] +tags = ['functional', 'persist_l2arc'] + +[tests/functional/pool_checkpoint] +tests = ['checkpoint_after_rewind', 'checkpoint_big_rewind', + 'checkpoint_capacity', 'checkpoint_conf_change', 'checkpoint_discard', + 'checkpoint_discard_busy', 'checkpoint_discard_many', + 'checkpoint_indirect', 'checkpoint_invalid', 'checkpoint_lun_expsz', + 
'checkpoint_open', 'checkpoint_removal', 'checkpoint_rewind', + 'checkpoint_ro_rewind', 'checkpoint_sm_scale', 'checkpoint_twice', + 'checkpoint_vdev_add', 'checkpoint_zdb', 'checkpoint_zhack_feat'] +tags = ['functional', 'pool_checkpoint'] +timeout = 1800 + +[tests/functional/pool_names] +tests = ['pool_names_001_pos', 'pool_names_002_neg'] +pre = +post = +tags = ['functional', 'pool_names'] + +[tests/functional/poolversion] +tests = ['poolversion_001_pos', 'poolversion_002_pos'] +tags = ['functional', 'poolversion'] + +[tests/functional/pyzfs] +tests = ['pyzfs_unittest'] +pre = +post = +tags = ['functional', 'pyzfs'] + +[tests/functional/quota] +tests = ['quota_001_pos', 'quota_002_pos', 'quota_003_pos', + 'quota_004_pos', 'quota_005_pos', 'quota_006_neg'] +tags = ['functional', 'quota'] + +[tests/functional/redacted_send] +tests = ['redacted_compressed', 'redacted_contents', 'redacted_deleted', + 'redacted_disabled_feature', 'redacted_embedded', 'redacted_holes', + 'redacted_incrementals', 'redacted_largeblocks', 'redacted_many_clones', + 'redacted_mixed_recsize', 'redacted_mounts', 'redacted_negative', + 'redacted_origin', 'redacted_props', 'redacted_resume', 'redacted_size', + 'redacted_volume'] +tags = ['functional', 'redacted_send'] + +[tests/functional/raidz] +tests = ['raidz_001_neg', 'raidz_002_pos'] +tags = ['functional', 'raidz'] + +[tests/functional/redundancy] +tests = ['redundancy_001_pos', 'redundancy_002_pos', 'redundancy_003_pos', + 'redundancy_004_neg'] +tags = ['functional', 'redundancy'] + +[tests/functional/refquota] +tests = ['refquota_001_pos', 'refquota_002_pos', 'refquota_003_pos', + 'refquota_004_pos', 'refquota_005_pos', 'refquota_006_neg', + 'refquota_007_neg', 'refquota_008_neg'] +tags = ['functional', 'refquota'] + +[tests/functional/refreserv] +tests = ['refreserv_001_pos', 'refreserv_002_pos', 'refreserv_003_pos', + 'refreserv_004_pos', 'refreserv_005_pos', 'refreserv_multi_raidz', + 'refreserv_raidz'] +tags = ['functional', 'refreserv'] + +[tests/functional/removal] +pre = +tests = ['removal_all_vdev', 'removal_cancel', 'removal_check_space', + 'removal_condense_export', 'removal_multiple_indirection', + 'removal_nopwrite', 'removal_remap_deadlists', + 'removal_resume_export', 'removal_sanity', 'removal_with_add', + 'removal_with_create_fs', 'removal_with_dedup', + 'removal_with_errors', 'removal_with_export', + 'removal_with_ganging', 'removal_with_faulted', + 'removal_with_remove', 'removal_with_scrub', 'removal_with_send', + 'removal_with_send_recv', 'removal_with_snapshot', + 'removal_with_write', 'removal_with_zdb', 'remove_expanded', + 'remove_mirror', 'remove_mirror_sanity', 'remove_raidz', + 'remove_indirect'] +tags = ['functional', 'removal'] + +[tests/functional/rename_dirs] +tests = ['rename_dirs_001_pos'] +tags = ['functional', 'rename_dirs'] + +[tests/functional/replacement] +tests = ['attach_import', 'attach_multiple', 'attach_rebuild', + 'attach_resilver', 'detach', 'rebuild_disabled_feature', + 'rebuild_multiple', 'rebuild_raidz', 'replace_import', 'replace_rebuild', + 'replace_resilver', 'resilver_restart_001', 'resilver_restart_002', + 'scrub_cancel'] +tags = ['functional', 'replacement'] + +[tests/functional/reservation] +tests = ['reservation_001_pos', 'reservation_002_pos', 'reservation_003_pos', + 'reservation_004_pos', 'reservation_005_pos', 'reservation_006_pos', + 'reservation_007_pos', 'reservation_008_pos', 'reservation_009_pos', + 'reservation_010_pos', 'reservation_011_pos', 'reservation_012_pos', + 'reservation_013_pos', 
'reservation_014_pos', 'reservation_015_pos', + 'reservation_016_pos', 'reservation_017_pos', 'reservation_018_pos', + 'reservation_019_pos', 'reservation_020_pos', 'reservation_021_neg', + 'reservation_022_pos'] +tags = ['functional', 'reservation'] + +[tests/functional/rootpool] +tests = ['rootpool_002_neg', 'rootpool_003_neg', 'rootpool_007_pos'] +tags = ['functional', 'rootpool'] + +[tests/functional/rsend] +tests = ['recv_dedup', 'recv_dedup_encrypted_zvol', 'rsend_001_pos', + 'rsend_002_pos', 'rsend_003_pos', 'rsend_004_pos', 'rsend_005_pos', + 'rsend_006_pos', 'rsend_007_pos', 'rsend_008_pos', 'rsend_009_pos', + 'rsend_010_pos', 'rsend_011_pos', 'rsend_012_pos', 'rsend_013_pos', + 'rsend_014_pos', 'rsend_016_neg', 'rsend_019_pos', 'rsend_020_pos', + 'rsend_021_pos', 'rsend_022_pos', 'rsend_024_pos', + 'send-c_verify_ratio', 'send-c_verify_contents', 'send-c_props', + 'send-c_incremental', 'send-c_volume', 'send-c_zstreamdump', + 'send-c_lz4_disabled', 'send-c_recv_lz4_disabled', + 'send-c_mixed_compression', 'send-c_stream_size_estimate', + 'send-c_embedded_blocks', 'send-c_resume', 'send-cpL_varied_recsize', + 'send-c_recv_dedup', 'send-L_toggle', 'send_encrypted_hierarchy', + 'send_encrypted_props', 'send_encrypted_truncated_files', + 'send_freeobjects', 'send_realloc_files', + 'send_realloc_encrypted_files', 'send_spill_block', 'send_holds', + 'send_hole_birth', 'send_mixed_raw', 'send-wR_encrypted_zvol', + 'send_partial_dataset'] +tags = ['functional', 'rsend'] + +[tests/functional/scrub_mirror] +tests = ['scrub_mirror_001_pos', 'scrub_mirror_002_pos', + 'scrub_mirror_003_pos', 'scrub_mirror_004_pos'] +tags = ['functional', 'scrub_mirror'] + +[tests/functional/slog] +tests = ['slog_001_pos', 'slog_002_pos', 'slog_003_pos', 'slog_004_pos', + 'slog_005_pos', 'slog_006_pos', 'slog_007_pos', 'slog_008_neg', + 'slog_009_neg', 'slog_010_neg', 'slog_011_neg', 'slog_012_neg', + 'slog_013_pos', 'slog_014_pos', 'slog_015_neg', 'slog_replay_fs_001', + 'slog_replay_fs_002', 'slog_replay_volume'] +tags = ['functional', 'slog'] + +[tests/functional/snapshot] +tests = ['clone_001_pos', 'rollback_001_pos', 'rollback_002_pos', + 'rollback_003_pos', 'snapshot_001_pos', 'snapshot_002_pos', + 'snapshot_003_pos', 'snapshot_004_pos', 'snapshot_005_pos', + 'snapshot_006_pos', 'snapshot_007_pos', 'snapshot_008_pos', + 'snapshot_009_pos', 'snapshot_010_pos', 'snapshot_011_pos', + 'snapshot_012_pos', 'snapshot_013_pos', 'snapshot_014_pos', + 'snapshot_017_pos'] +tags = ['functional', 'snapshot'] + +[tests/functional/snapused] +tests = ['snapused_001_pos', 'snapused_002_pos', 'snapused_003_pos', + 'snapused_004_pos', 'snapused_005_pos'] +tags = ['functional', 'snapused'] + +[tests/functional/sparse] +tests = ['sparse_001_pos'] +tags = ['functional', 'sparse'] + +[tests/functional/suid] +tests = ['suid_write_to_suid', 'suid_write_to_sgid', 'suid_write_to_suid_sgid', + 'suid_write_to_none'] +tags = ['functional', 'suid'] + +[tests/functional/threadsappend] +tests = ['threadsappend_001_pos'] +tags = ['functional', 'threadsappend'] + +[tests/functional/trim] +tests = ['autotrim_integrity', 'autotrim_config', 'autotrim_trim_integrity', + 'trim_integrity', 'trim_config', 'trim_l2arc'] +tags = ['functional', 'trim'] + +[tests/functional/truncate] +tests = ['truncate_001_pos', 'truncate_002_pos', 'truncate_timestamps'] +tags = ['functional', 'truncate'] + +[tests/functional/upgrade] +tests = ['upgrade_userobj_001_pos', 'upgrade_readonly_pool'] +tags = ['functional', 'upgrade'] + +[tests/functional/userquota] 
+tests = [ + 'userquota_001_pos', 'userquota_002_pos', 'userquota_003_pos', + 'userquota_004_pos', 'userquota_005_neg', 'userquota_006_pos', + 'userquota_007_pos', 'userquota_008_pos', 'userquota_009_pos', + 'userquota_010_pos', 'userquota_011_pos', 'userquota_012_neg', + 'userspace_001_pos', 'userspace_002_pos'] +tags = ['functional', 'userquota'] + +[tests/functional/vdev_zaps] +tests = ['vdev_zaps_001_pos', 'vdev_zaps_002_pos', 'vdev_zaps_003_pos', + 'vdev_zaps_004_pos', 'vdev_zaps_005_pos', 'vdev_zaps_006_pos', + 'vdev_zaps_007_pos'] +tags = ['functional', 'vdev_zaps'] + +[tests/functional/write_dirs] +tests = ['write_dirs_001_pos', 'write_dirs_002_pos'] +tags = ['functional', 'write_dirs'] + +[tests/functional/xattr] +tests = ['xattr_001_pos', 'xattr_002_neg', 'xattr_003_neg', 'xattr_004_pos', + 'xattr_005_pos', 'xattr_006_pos', 'xattr_007_neg', + 'xattr_011_pos', 'xattr_012_pos', 'xattr_013_pos'] +tags = ['functional', 'xattr'] + +[tests/functional/zvol/zvol_ENOSPC] +tests = ['zvol_ENOSPC_001_pos'] +tags = ['functional', 'zvol', 'zvol_ENOSPC'] + +[tests/functional/zvol/zvol_cli] +tests = ['zvol_cli_001_pos', 'zvol_cli_002_pos', 'zvol_cli_003_neg'] +tags = ['functional', 'zvol', 'zvol_cli'] + +[tests/functional/zvol/zvol_misc] +tests = ['zvol_misc_002_pos', 'zvol_misc_hierarchy', 'zvol_misc_rename_inuse', + 'zvol_misc_snapdev', 'zvol_misc_volmode', 'zvol_misc_zil'] +tags = ['functional', 'zvol', 'zvol_misc'] + +[tests/functional/zvol/zvol_swap] +tests = ['zvol_swap_001_pos', 'zvol_swap_002_pos', 'zvol_swap_004_pos'] +tags = ['functional', 'zvol', 'zvol_swap'] + +[tests/functional/libzfs] +tests = ['many_fds', 'libzfs_input'] +tags = ['functional', 'libzfs'] + +[tests/functional/log_spacemap] +tests = ['log_spacemap_import_logs'] +pre = +post = +tags = ['functional', 'log_spacemap'] diff --git a/tests/runfiles/freebsd.run b/tests/runfiles/freebsd.run new file mode 100644 index 000000000000..c7ca1d769fc3 --- /dev/null +++ b/tests/runfiles/freebsd.run @@ -0,0 +1,27 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/cli_root/zfs_jail:FreeBSD] +tests = ['zfs_jail_001_pos'] +tags = ['functional', 'cli_root', 'zfs_jail'] diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run new file mode 100644 index 000000000000..b6508a5cb3cf --- /dev/null +++ b/tests/runfiles/linux.run @@ -0,0 +1,179 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. 
+# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/acl/posix:Linux] +tests = ['posix_001_pos', 'posix_002_pos', 'posix_003_pos'] +tags = ['functional', 'acl', 'posix'] + +[tests/functional/arc:Linux] +tests = ['dbufstats_001_pos', 'dbufstats_002_pos', 'dbufstats_003_pos', + 'arcstats_runtime_tuning'] +tags = ['functional', 'arc'] + +[tests/functional/atime:Linux] +tests = ['atime_003_pos', 'root_relatime_on'] +tags = ['functional', 'atime'] + +[tests/functional/chattr:Linux] +tests = ['chattr_001_pos', 'chattr_002_neg'] +tags = ['functional', 'chattr'] + +[tests/functional/checksum:Linux] +tests = ['run_edonr_test'] +tags = ['functional', 'checksum'] + +[tests/functional/cli_root/zfs:Linux] +tests = ['zfs_003_neg'] +tags = ['functional', 'cli_root', 'zfs'] + +[tests/functional/cli_root/zfs_mount:Linux] +tests = ['zfs_mount_006_pos', 'zfs_mount_008_pos', 'zfs_multi_mount'] +tags = ['functional', 'cli_root', 'zfs_mount'] + +[tests/functional/cli_root/zfs_share:Linux] +tests = ['zfs_share_005_pos', 'zfs_share_007_neg', 'zfs_share_009_neg', + 'zfs_share_012_pos'] +tags = ['functional', 'cli_root', 'zfs_share'] + +[tests/functional/cli_root/zfs_sysfs:Linux] +tests = ['zfeature_set_unsupported', 'zfs_get_unsupported', + 'zfs_set_unsupported', 'zfs_sysfs_live', 'zpool_get_unsupported', + 'zpool_set_unsupported'] +tags = ['functional', 'cli_root', 'zfs_sysfs'] + +[tests/functional/cli_root/zpool_add:Linux] +tests = ['add_nested_replacing_spare'] +tags = ['functional', 'cli_root', 'zpool_add'] + +[tests/functional/cli_root/zpool_expand:Linux] +tests = ['zpool_expand_001_pos', 'zpool_expand_002_pos', + 'zpool_expand_003_neg', 'zpool_expand_004_pos', 'zpool_expand_005_pos'] +tags = ['functional', 'cli_root', 'zpool_expand'] + +[tests/functional/cli_root/zpool_reopen:Linux] +tests = ['zpool_reopen_001_pos', 'zpool_reopen_002_pos', + 'zpool_reopen_003_pos', 'zpool_reopen_004_pos', 'zpool_reopen_005_pos', + 'zpool_reopen_006_neg', 'zpool_reopen_007_pos'] +tags = ['functional', 'cli_root', 'zpool_reopen'] + +[tests/functional/cli_root/zpool_split:Linux] +tests = ['zpool_split_wholedisk'] +tags = ['functional', 'cli_root', 'zpool_split'] + +[tests/functional/compression:Linux] +tests = ['compress_004_pos'] +tags = ['functional', 'compression'] + +[tests/functional/deadman:Linux] +tests = ['deadman_sync', 'deadman_zio'] +pre = +post = +tags = ['functional', 'deadman'] + +[tests/functional/devices:Linux] +tests = ['devices_001_pos', 'devices_002_neg', 'devices_003_pos'] +tags = ['functional', 'devices'] + +[tests/functional/events:Linux] +tests = ['events_001_pos', 'events_002_pos', 'zed_rc_filter'] +tags = ['functional', 'events'] + +[tests/functional/fallocate:Linux] +tests = ['fallocate_prealloc', 'fallocate_punch-hole'] +tags = ['functional', 'fallocate'] + +[tests/functional/fault:Linux] +tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_replace_001_pos', + 'auto_spare_001_pos', 'auto_spare_002_pos', 'auto_spare_multiple', + 'auto_spare_ashift', 'auto_spare_shared', 'decrypt_fault', + 'decompress_fault', 'scrub_after_resilver', 'zpool_status_-s'] +tags = ['functional', 'fault'] + +[tests/functional/features/large_dnode:Linux] +tests = ['large_dnode_002_pos', 'large_dnode_006_pos', 'large_dnode_008_pos'] +tags = ['functional', 'features', 'large_dnode'] + +[tests/functional/io:Linux] 
+tests = ['libaio'] +tags = ['functional', 'io'] + +[tests/functional/mmap:Linux] +tests = ['mmap_libaio_001_pos'] +tags = ['functional', 'mmap'] + +[tests/functional/mmp:Linux] +tests = ['mmp_on_thread', 'mmp_on_uberblocks', 'mmp_on_off', 'mmp_interval', + 'mmp_active_import', 'mmp_inactive_import', 'mmp_exported_import', + 'mmp_write_uberblocks', 'mmp_reset_interval', 'multihost_history', + 'mmp_on_zdb', 'mmp_write_distribution', 'mmp_hostid'] +tags = ['functional', 'mmp'] + +[tests/functional/mount:Linux] +tests = ['umount_unlinked_drain'] +tags = ['functional', 'mount'] + +[tests/functional/pam:Linux] +tests = ['pam_basic', 'pam_nounmount'] +tags = ['functional', 'pam'] + +[tests/functional/procfs:Linux] +tests = ['procfs_list_basic', 'procfs_list_concurrent_readers', + 'procfs_list_stale_read', 'pool_state'] +tags = ['functional', 'procfs'] + +[tests/functional/projectquota:Linux] +tests = ['projectid_001_pos', 'projectid_002_pos', 'projectid_003_pos', + 'projectquota_001_pos', 'projectquota_002_pos', 'projectquota_003_pos', + 'projectquota_004_neg', 'projectquota_005_pos', 'projectquota_006_pos', + 'projectquota_007_pos', 'projectquota_008_pos', 'projectquota_009_pos', + 'projectspace_001_pos', 'projectspace_002_pos', 'projectspace_003_pos', + 'projectspace_004_pos', + 'projecttree_001_pos', 'projecttree_002_pos', 'projecttree_003_neg'] +tags = ['functional', 'projectquota'] + +[tests/functional/rsend:Linux] +tests = ['send_realloc_dnode_size', 'send_encrypted_files'] +tags = ['functional', 'rsend'] + +[tests/functional/snapshot:Linux] +tests = ['snapshot_015_pos', 'snapshot_016_pos'] +tags = ['functional', 'snapshot'] + +[tests/functional/tmpfile:Linux] +tests = ['tmpfile_001_pos', 'tmpfile_002_pos', 'tmpfile_003_pos', + 'tmpfile_stat_mode'] +tags = ['functional', 'tmpfile'] + +[tests/functional/upgrade:Linux] +tests = ['upgrade_projectquota_001_pos'] +tags = ['functional', 'upgrade'] + +[tests/functional/user_namespace:Linux] +tests = ['user_namespace_001'] +tags = ['functional', 'user_namespace'] + +[tests/functional/userquota:Linux] +tests = ['groupspace_001_pos', 'groupspace_002_pos', 'groupspace_003_pos', + 'userquota_013_pos', 'userspace_003_pos'] +tags = ['functional', 'userquota'] diff --git a/tests/runfiles/longevity.run b/tests/runfiles/longevity.run new file mode 100644 index 000000000000..fde2ef6ab92e --- /dev/null +++ b/tests/runfiles/longevity.run @@ -0,0 +1,23 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# + +[DEFAULT] +quiet = False +user = root +timeout = 10800 +outputdir = /var/tmp/test_results + +[/opt/zfs-tests/tests/longevity] +tests = ['slop_space_test'] diff --git a/tests/runfiles/perf-regression.run b/tests/runfiles/perf-regression.run new file mode 100644 index 000000000000..ec081040d54d --- /dev/null +++ b/tests/runfiles/perf-regression.run @@ -0,0 +1,33 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. 
+# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2015, 2016 by Delphix. All rights reserved. +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 0 +post_user = root +post = cleanup +outputdir = /var/tmp/test_results +tags = ['perf'] + +[tests/perf/regression] +tests = ['sequential_writes', 'sequential_reads', 'sequential_reads_arc_cached', + 'sequential_reads_arc_cached_clone', 'sequential_reads_dbuf_cached', + 'random_reads', 'random_writes', 'random_readwrite', 'random_writes_zil', + 'random_readwrite_fixed'] +post = +tags = ['perf', 'regression'] diff --git a/tests/runfiles/sunos.run b/tests/runfiles/sunos.run new file mode 100644 index 000000000000..9ba00f452ea4 --- /dev/null +++ b/tests/runfiles/sunos.run @@ -0,0 +1,53 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 600 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['functional'] + +[tests/functional/inuse:illumos] +tests = ['inuse_001_pos', 'inuse_003_pos', 'inuse_006_pos', 'inuse_007_pos'] +post = +tags = ['functional', 'inuse'] + +[tests/functional/cli_root/zpool_add:illumos] +tests = ['zpool_add_005_pos'] +tags = ['functional', 'cli_root', 'zpool_add'] + +[tests/functional/cli_root/zpool_create:illumos] +tests = ['zpool_create_016_pos'] +tags = ['functional', 'cli_root', 'zpool_create'] + +[tests/functional/privilege] +tests = ['privilege_001_pos', 'privilege_002_pos'] +tags = ['functional', 'privilege'] + +[tests/functional/xattr:illumos] +tests = ['xattr_008_pos', 'xattr_009_neg', 'xattr_010_neg'] +tags = ['functional', 'xattr'] + +[tests/functional/zvol/zvol_misc:illumos] +tests = ['zvol_misc_001_neg', 'zvol_misc_003_neg', 'zvol_misc_004_pos', + 'zvol_misc_005_neg', 'zvol_misc_006_pos'] +tags = ['functional', 'zvol', 'zvol_misc'] + +[tests/functional/zvol/zvol_swap:illumos] +tests = ['zvol_swap_003_pos', 'zvol_swap_005_pos', 'zvol_swap_006_pos'] +tags = ['functional', 'zvol', 'zvol_swap'] diff --git a/tests/test-runner/Makefile.am b/tests/test-runner/Makefile.am new file mode 100644 index 000000000000..db3d966142d6 --- /dev/null +++ b/tests/test-runner/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = bin include man diff --git a/tests/test-runner/bin/.gitignore b/tests/test-runner/bin/.gitignore new file mode 100644 index 000000000000..ff7e2f8fcc76 --- /dev/null +++ b/tests/test-runner/bin/.gitignore @@ -0,0 +1,2 @@ +test-runner.py +zts-report.py diff --git a/tests/test-runner/bin/Makefile.am b/tests/test-runner/bin/Makefile.am new file mode 100644 index 000000000000..e11e55fffdeb --- /dev/null +++ b/tests/test-runner/bin/Makefile.am @@ -0,0 +1,8 @@ +include $(top_srcdir)/config/Substfiles.am + +pkgdatadir = $(datadir)/@PACKAGE@/test-runner/bin +pkgdata_SCRIPTS = \ + test-runner.py \ + zts-report.py + +SUBSTFILES += $(pkgdata_SCRIPTS) diff --git a/tests/test-runner/bin/test-runner.py.in b/tests/test-runner/bin/test-runner.py.in 
new file mode 100755 index 000000000000..bbabf247c1dc --- /dev/null +++ b/tests/test-runner/bin/test-runner.py.in @@ -0,0 +1,1056 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2012, 2018 by Delphix. All rights reserved. +# Copyright (c) 2019 Datto Inc. +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. +# + +# some python 2.7 systems don't have a configparser shim +try: + import configparser +except ImportError: + import ConfigParser as configparser + +import os +import sys +import ctypes + +from datetime import datetime +from optparse import OptionParser +from pwd import getpwnam +from pwd import getpwuid +from select import select +from subprocess import PIPE +from subprocess import Popen +from threading import Timer +from time import time + +BASEDIR = '/var/tmp/test_results' +TESTDIR = '/usr/share/zfs/' +KILL = 'kill' +TRUE = 'true' +SUDO = 'sudo' +LOG_FILE = 'LOG_FILE' +LOG_OUT = 'LOG_OUT' +LOG_ERR = 'LOG_ERR' +LOG_FILE_OBJ = None + +# some python 2.7 systems don't have a concept of monotonic time +CLOCK_MONOTONIC_RAW = 4 # see <linux/time.h> + + +class timespec(ctypes.Structure): + _fields_ = [ + ('tv_sec', ctypes.c_long), + ('tv_nsec', ctypes.c_long) + ] + + +librt = ctypes.CDLL('librt.so.1', use_errno=True) +clock_gettime = librt.clock_gettime +clock_gettime.argtypes = [ctypes.c_int, ctypes.POINTER(timespec)] + + +def monotonic_time(): + t = timespec() + if clock_gettime(CLOCK_MONOTONIC_RAW, ctypes.pointer(t)) != 0: + errno_ = ctypes.get_errno() + raise OSError(errno_, os.strerror(errno_)) + return t.tv_sec + t.tv_nsec * 1e-9 + + +class Result(object): + total = 0 + runresults = {'PASS': 0, 'FAIL': 0, 'SKIP': 0, 'KILLED': 0, 'RERAN': 0} + + def __init__(self): + self.starttime = None + self.returncode = None + self.runtime = '' + self.stdout = [] + self.stderr = [] + self.result = '' + + def done(self, proc, killed, reran): + """ + Finalize the results of this Cmd. + """ + Result.total += 1 + m, s = divmod(monotonic_time() - self.starttime, 60) + self.runtime = '%02d:%02d' % (m, s) + self.returncode = proc.returncode + if reran is True: + Result.runresults['RERAN'] += 1 + if killed: + self.result = 'KILLED' + Result.runresults['KILLED'] += 1 + elif self.returncode == 0: + self.result = 'PASS' + Result.runresults['PASS'] += 1 + elif self.returncode == 4: + self.result = 'SKIP' + Result.runresults['SKIP'] += 1 + elif self.returncode != 0: + self.result = 'FAIL' + Result.runresults['FAIL'] += 1 + + +class Output(object): + """ + This class is a slightly modified version of the 'Stream' class found + here: http://goo.gl/aSGfv + """ + def __init__(self, stream): + self.stream = stream + self._buf = b'' + self.lines = [] + + def fileno(self): + return self.stream.fileno() + + def read(self, drain=0): + """ + Read from the file descriptor. If 'drain' is set, read until EOF. + """ + while self._read() is not None: + if not drain: + break + + def _read(self): + """ + Read up to 4k of data from this output stream. Collect the output + up to the last newline, and append it to any leftover data from a + previous call.
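+ As a sketch with hypothetical values: if a previous call left + self._buf = b'fo' and os.read() now returns b'o\nbar', then b'foo' + becomes a complete stored line and b'bar' is held back until its + newline arrives.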
The lines are stored as a (timestamp, data) tuple + for easy sorting/merging later. + """ + fd = self.fileno() + buf = os.read(fd, 4096) + if not buf: + return None + if b'\n' not in buf: + self._buf += buf + return [] + + buf = self._buf + buf + tmp, rest = buf.rsplit(b'\n', 1) + self._buf = rest + now = datetime.now() + rows = tmp.split(b'\n') + self.lines += [(now, r) for r in rows] + + +class Cmd(object): + verified_users = [] + + def __init__(self, pathname, identifier=None, outputdir=None, + timeout=None, user=None, tags=None): + self.pathname = pathname + self.identifier = identifier + self.outputdir = outputdir or BASEDIR + # The timeout for tests is measured in wall-clock time. + self.timeout = timeout + self.user = user or '' + self.killed = False + self.reran = None + self.result = Result() + + if self.timeout is None: + self.timeout = 60 + + def __str__(self): + return '''\ +Pathname: %s +Identifier: %s +Outputdir: %s +Timeout: %d +User: %s +''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user) + + def kill_cmd(self, proc, keyboard_interrupt=False): + """ + Kill a running command due to timeout, or ^C from the keyboard. If + sudo is required, this user was verified previously. + """ + self.killed = True + do_sudo = len(self.user) != 0 + signal = '-TERM' + + cmd = [SUDO, KILL, signal, str(proc.pid)] + if not do_sudo: + del cmd[0] + + try: + kp = Popen(cmd) + kp.wait() + except Exception: + pass + + # If this is not a user-initiated kill and the test has not been + # rerun before, consider whether it needs to be rerun: if the test + # spent some time hibernating and didn't run for its whole length + # before being timed out, rerun it. + if keyboard_interrupt is False and self.reran is None: + runtime = monotonic_time() - self.result.starttime + if int(self.timeout) > runtime: + self.killed = False + self.reran = False + self.run(False) + self.reran = True + + def update_cmd_privs(self, cmd, user): + """ + If a user has been specified to run this Cmd and we're not already + running as that user, prepend the appropriate sudo command to run + as that user. + """ + me = getpwuid(os.getuid()) + + if not user or user == me.pw_name: + if os.path.isfile(cmd+'.ksh') and os.access(cmd+'.ksh', os.X_OK): + cmd += '.ksh' + if os.path.isfile(cmd+'.sh') and os.access(cmd+'.sh', os.X_OK): + cmd += '.sh' + return cmd + + if not os.path.isfile(cmd): + if os.path.isfile(cmd+'.ksh') and os.access(cmd+'.ksh', os.X_OK): + cmd += '.ksh' + if os.path.isfile(cmd+'.sh') and os.access(cmd+'.sh', os.X_OK): + cmd += '.sh' + + ret = '%s -E -u %s %s' % (SUDO, user, cmd) + return ret.split(' ') + + def collect_output(self, proc): + """ + Read from stdout/stderr as data becomes available, until the + process is no longer running. Return the lines from the stdout and + stderr Output objects. + """ + out = Output(proc.stdout) + err = Output(proc.stderr) + res = [] + while proc.returncode is None: + proc.poll() + res = select([out, err], [], [], .1) + for fd in res[0]: + fd.read() + for fd in res[0]: + fd.read(drain=1) + + return out.lines, err.lines + + def run(self, dryrun): + """ + This is the main function that runs each individual test. + Determine whether or not the command requires sudo, and modify it + if needed. Run the command, and update the result object.
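+ + For example (a sketch with a hypothetical user and path): when + user='tester', a pathname of /t/case is executed as + + sudo -E -u tester /t/case + + while an empty user runs the pathname directly.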
+ """ + if dryrun is True: + print(self) + return + + privcmd = self.update_cmd_privs(self.pathname, self.user) + try: + old = os.umask(0) + if not os.path.isdir(self.outputdir): + os.makedirs(self.outputdir, mode=0o777) + os.umask(old) + except OSError as e: + fail('%s' % e) + + self.result.starttime = monotonic_time() + proc = Popen(privcmd, stdout=PIPE, stderr=PIPE) + # Allow a special timeout value of 0 to mean infinity + if int(self.timeout) == 0: + self.timeout = sys.maxsize + t = Timer(int(self.timeout), self.kill_cmd, [proc]) + + try: + t.start() + self.result.stdout, self.result.stderr = self.collect_output(proc) + except KeyboardInterrupt: + self.kill_cmd(proc, True) + fail('\nRun terminated at user request.') + finally: + t.cancel() + + if self.reran is not False: + self.result.done(proc, self.killed, self.reran) + + def skip(self): + """ + Initialize enough of the test result that we can log a skipped + command. + """ + Result.total += 1 + Result.runresults['SKIP'] += 1 + self.result.stdout = self.result.stderr = [] + self.result.starttime = monotonic_time() + m, s = divmod(monotonic_time() - self.result.starttime, 60) + self.result.runtime = '%02d:%02d' % (m, s) + self.result.result = 'SKIP' + + def log(self, options, suppress_console=False): + """ + This function is responsible for writing all output. This includes + the console output, the logfile of all results (with timestamped + merged stdout and stderr), and for each test, the unmodified + stdout/stderr/merged in its own file. + """ + + logname = getpwuid(os.getuid()).pw_name + rer = '' + if self.reran is True: + rer = ' (RERAN)' + user = ' (run as %s)' % (self.user if len(self.user) else logname) + if self.identifier: + msga = 'Test (%s): %s%s ' % (self.identifier, self.pathname, user) + else: + msga = 'Test: %s%s ' % (self.pathname, user) + msgb = '[%s] [%s]%s\n' % (self.result.runtime, self.result.result, rer) + pad = ' ' * (80 - (len(msga) + len(msgb))) + result_line = msga + pad + msgb + + # The result line is always written to the log file. If -q was + # specified only failures are written to the console, otherwise + # the result line is written to the console. The console output + # may be suppressed by calling log() with suppress_console=True. 
+ write_log(bytearray(result_line, encoding='utf-8'), LOG_FILE) + if not suppress_console: + if not options.quiet: + write_log(result_line, LOG_OUT) + elif options.quiet and self.result.result != 'PASS': + write_log(result_line, LOG_OUT) + + lines = sorted(self.result.stdout + self.result.stderr, + key=lambda x: x[0]) + + # Write timestamped output (stdout and stderr) to the logfile + for dt, line in lines: + timestamp = bytearray(dt.strftime("%H:%M:%S.%f ")[:11], + encoding='utf-8') + write_log(b'%s %s\n' % (timestamp, line), LOG_FILE) + + # Write the separate stdout/stderr/merged files, if the data exists + if len(self.result.stdout): + with open(os.path.join(self.outputdir, 'stdout'), 'wb') as out: + for _, line in self.result.stdout: + os.write(out.fileno(), b'%s\n' % line) + if len(self.result.stderr): + with open(os.path.join(self.outputdir, 'stderr'), 'wb') as err: + for _, line in self.result.stderr: + os.write(err.fileno(), b'%s\n' % line) + if len(self.result.stdout) and len(self.result.stderr): + with open(os.path.join(self.outputdir, 'merged'), 'wb') as merged: + for _, line in lines: + os.write(merged.fileno(), b'%s\n' % line) + + +class Test(Cmd): + props = ['outputdir', 'timeout', 'user', 'pre', 'pre_user', 'post', + 'post_user', 'failsafe', 'failsafe_user', 'tags'] + + def __init__(self, pathname, + pre=None, pre_user=None, post=None, post_user=None, + failsafe=None, failsafe_user=None, tags=None, **kwargs): + super(Test, self).__init__(pathname, **kwargs) + self.pre = pre or '' + self.pre_user = pre_user or '' + self.post = post or '' + self.post_user = post_user or '' + self.failsafe = failsafe or '' + self.failsafe_user = failsafe_user or '' + self.tags = tags or [] + + def __str__(self): + post_user = pre_user = failsafe_user = '' + if len(self.pre_user): + pre_user = ' (as %s)' % (self.pre_user) + if len(self.post_user): + post_user = ' (as %s)' % (self.post_user) + if len(self.failsafe_user): + failsafe_user = ' (as %s)' % (self.failsafe_user) + return '''\ +Pathname: %s +Identifier: %s +Outputdir: %s +Timeout: %d +User: %s +Pre: %s%s +Post: %s%s +Failsafe: %s%s +Tags: %s +''' % (self.pathname, self.identifier, self.outputdir, self.timeout, self.user, + self.pre, pre_user, self.post, post_user, self.failsafe, + failsafe_user, self.tags) + + def verify(self): + """ + Check the pre/post/failsafe scripts, user and Test. Omit the Test from + this run if there are any problems. + """ + files = [self.pre, self.pathname, self.post, self.failsafe] + users = [self.pre_user, self.user, self.post_user, self.failsafe_user] + + for f in [f for f in files if len(f)]: + if not verify_file(f): + write_log("Warning: Test '%s' not added to this run because" + " it failed verification.\n" % f, LOG_ERR) + return False + + for user in [user for user in users if len(user)]: + if not verify_user(user): + write_log("Not adding Test '%s' to this run.\n" % + self.pathname, LOG_ERR) + return False + + return True + + def run(self, options): + """ + Create Cmd instances for the pre/post/failsafe scripts. If the pre + script doesn't pass, skip this Test. Run the post script regardless. + If the Test is killed, also run the failsafe script. 
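+ + As a sketch of the resulting order: pre -> test -> failsafe (only when + the test is KILLED) -> post; when pre fails, the test itself is logged + as a SKIP but the post script still runs.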
+ """ + odir = os.path.join(self.outputdir, os.path.basename(self.pre)) + pretest = Cmd(self.pre, identifier=self.identifier, outputdir=odir, + timeout=self.timeout, user=self.pre_user) + test = Cmd(self.pathname, identifier=self.identifier, + outputdir=self.outputdir, timeout=self.timeout, + user=self.user) + odir = os.path.join(self.outputdir, os.path.basename(self.failsafe)) + failsafe = Cmd(self.failsafe, identifier=self.identifier, + outputdir=odir, timeout=self.timeout, + user=self.failsafe_user) + odir = os.path.join(self.outputdir, os.path.basename(self.post)) + posttest = Cmd(self.post, identifier=self.identifier, outputdir=odir, + timeout=self.timeout, user=self.post_user) + + cont = True + if len(pretest.pathname): + pretest.run(options.dryrun) + cont = pretest.result.result == 'PASS' + pretest.log(options) + + if cont: + test.run(options.dryrun) + if test.result.result == 'KILLED' and len(failsafe.pathname): + failsafe.run(options.dryrun) + failsafe.log(options, suppress_console=True) + else: + test.skip() + + test.log(options) + + if len(posttest.pathname): + posttest.run(options.dryrun) + posttest.log(options) + + +class TestGroup(Test): + props = Test.props + ['tests'] + + def __init__(self, pathname, tests=None, **kwargs): + super(TestGroup, self).__init__(pathname, **kwargs) + self.tests = tests or [] + + def __str__(self): + post_user = pre_user = failsafe_user = '' + if len(self.pre_user): + pre_user = ' (as %s)' % (self.pre_user) + if len(self.post_user): + post_user = ' (as %s)' % (self.post_user) + if len(self.failsafe_user): + failsafe_user = ' (as %s)' % (self.failsafe_user) + return '''\ +Pathname: %s +Identifier: %s +Outputdir: %s +Tests: %s +Timeout: %s +User: %s +Pre: %s%s +Post: %s%s +Failsafe: %s%s +Tags: %s +''' % (self.pathname, self.identifier, self.outputdir, self.tests, + self.timeout, self.user, self.pre, pre_user, self.post, post_user, + self.failsafe, failsafe_user, self.tags) + + def verify(self): + """ + Check the pre/post/failsafe scripts, user and tests in this TestGroup. + Omit the TestGroup entirely, or simply delete the relevant tests in the + group, if that's all that's required. + """ + # If the pre/post/failsafe scripts are relative pathnames, convert to + # absolute, so they stand a chance of passing verification. + if len(self.pre) and not os.path.isabs(self.pre): + self.pre = os.path.join(self.pathname, self.pre) + if len(self.post) and not os.path.isabs(self.post): + self.post = os.path.join(self.pathname, self.post) + if len(self.failsafe) and not os.path.isabs(self.failsafe): + self.post = os.path.join(self.pathname, self.post) + + auxfiles = [self.pre, self.post, self.failsafe] + users = [self.pre_user, self.user, self.post_user, self.failsafe_user] + + for f in [f for f in auxfiles if len(f)]: + if f != self.failsafe and self.pathname != os.path.dirname(f): + write_log("Warning: TestGroup '%s' not added to this run. " + "Auxiliary script '%s' exists in a different " + "directory.\n" % (self.pathname, f), LOG_ERR) + return False + + if not verify_file(f): + write_log("Warning: TestGroup '%s' not added to this run. " + "Auxiliary script '%s' failed verification.\n" % + (self.pathname, f), LOG_ERR) + return False + + for user in [user for user in users if len(user)]: + if not verify_user(user): + write_log("Not adding TestGroup '%s' to this run.\n" % + self.pathname, LOG_ERR) + return False + + # If one of the tests is invalid, delete it, log it, and drive on. 
+ # Iterate over a copy, since invalid tests are deleted in place. + for test in list(self.tests): + if not verify_file(os.path.join(self.pathname, test)): + del self.tests[self.tests.index(test)] + write_log("Warning: Test '%s' removed from TestGroup '%s' " + "because it failed verification.\n" % + (test, self.pathname), LOG_ERR) + + return len(self.tests) != 0 + + def run(self, options): + """ + Create Cmd instances for the pre/post/failsafe scripts. If the pre + script doesn't pass, skip all the tests in this TestGroup. Run the + post script regardless. Run the failsafe script when a test is killed. + """ + # tags assigned to this test group also include the test names + if options.tags and not set(self.tags).intersection(set(options.tags)): + return + + odir = os.path.join(self.outputdir, os.path.basename(self.pre)) + pretest = Cmd(self.pre, outputdir=odir, timeout=self.timeout, + user=self.pre_user, identifier=self.identifier) + odir = os.path.join(self.outputdir, os.path.basename(self.post)) + posttest = Cmd(self.post, outputdir=odir, timeout=self.timeout, + user=self.post_user, identifier=self.identifier) + + cont = True + if len(pretest.pathname): + pretest.run(options.dryrun) + cont = pretest.result.result == 'PASS' + pretest.log(options) + + for fname in self.tests: + odir = os.path.join(self.outputdir, fname) + test = Cmd(os.path.join(self.pathname, fname), outputdir=odir, + timeout=self.timeout, user=self.user, + identifier=self.identifier) + odir = os.path.join(odir, os.path.basename(self.failsafe)) + failsafe = Cmd(self.failsafe, outputdir=odir, timeout=self.timeout, + user=self.failsafe_user, identifier=self.identifier) + if cont: + test.run(options.dryrun) + if test.result.result == 'KILLED' and len(failsafe.pathname): + failsafe.run(options.dryrun) + failsafe.log(options, suppress_console=True) + else: + test.skip() + + test.log(options) + + if len(posttest.pathname): + posttest.run(options.dryrun) + posttest.log(options) + + +class TestRun(object): + props = ['quiet', 'outputdir'] + + def __init__(self, options): + self.tests = {} + self.testgroups = {} + self.starttime = time() + self.timestamp = datetime.now().strftime('%Y%m%dT%H%M%S') + self.outputdir = os.path.join(options.outputdir, self.timestamp) + self.setup_logging(options) + self.defaults = [ + ('outputdir', BASEDIR), + ('quiet', False), + ('timeout', 60), + ('user', ''), + ('pre', ''), + ('pre_user', ''), + ('post', ''), + ('post_user', ''), + ('failsafe', ''), + ('failsafe_user', ''), + ('tags', []) + ] + + def __str__(self): + s = 'TestRun:\n outputdir: %s\n' % self.outputdir + s += 'TESTS:\n' + for key in sorted(self.tests.keys()): + s += '%s%s' % (self.tests[key].__str__(), '\n') + s += 'TESTGROUPS:\n' + for key in sorted(self.testgroups.keys()): + s += '%s%s' % (self.testgroups[key].__str__(), '\n') + return s + + def addtest(self, pathname, options): + """ + Create a new Test, and apply any properties that were passed in + from the command line. If it passes verification, add it to the + TestRun. + """ + test = Test(pathname) + for prop in Test.props: + setattr(test, prop, getattr(options, prop)) + + if test.verify(): + self.tests[pathname] = test + + def addtestgroup(self, dirname, filenames, options): + """ + Create a new TestGroup, and apply any properties that were passed + in from the command line. If it passes verification, add it to the + TestRun.
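+ + As a sketch with hypothetical names: for a directory /t/group holding + ['setup', 'cleanup', 'case_001_pos'], with pre='setup' and + post='cleanup' the group keeps tests = ['case_001_pos'], since files + named by pre/post/failsafe are filtered out below.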
+ """ + if dirname not in self.testgroups: + testgroup = TestGroup(dirname) + for prop in Test.props: + setattr(testgroup, prop, getattr(options, prop)) + + # Prevent pre/post/failsafe scripts from running as regular tests + for f in [testgroup.pre, testgroup.post, testgroup.failsafe]: + if f in filenames: + del filenames[filenames.index(f)] + + self.testgroups[dirname] = testgroup + self.testgroups[dirname].tests = sorted(filenames) + + testgroup.verify() + + def read(self, options): + """ + Read in the specified runfiles, and apply the TestRun properties + listed in the 'DEFAULT' section to our TestRun. Then read each + section, and apply the appropriate properties to the Test or + TestGroup. Properties from individual sections override those set + in the 'DEFAULT' section. If the Test or TestGroup passes + verification, add it to the TestRun. + """ + config = configparser.RawConfigParser() + parsed = config.read(options.runfiles) + failed = options.runfiles - set(parsed) + if len(failed): + files = ' '.join(sorted(failed)) + fail("Couldn't read config files: %s" % files) + + for opt in TestRun.props: + if config.has_option('DEFAULT', opt): + setattr(self, opt, config.get('DEFAULT', opt)) + self.outputdir = os.path.join(self.outputdir, self.timestamp) + + testdir = options.testdir + + for section in config.sections(): + if 'tests' in config.options(section): + parts = section.split(':', 1) + sectiondir = parts[0] + identifier = parts[1] if len(parts) == 2 else None + if os.path.isdir(sectiondir): + pathname = sectiondir + elif os.path.isdir(os.path.join(testdir, sectiondir)): + pathname = os.path.join(testdir, sectiondir) + else: + pathname = sectiondir + + testgroup = TestGroup(os.path.abspath(pathname), + identifier=identifier) + for prop in TestGroup.props: + for sect in ['DEFAULT', section]: + if config.has_option(sect, prop): + if prop == 'tags': + setattr(testgroup, prop, + eval(config.get(sect, prop))) + elif prop == 'failsafe': + failsafe = config.get(sect, prop) + setattr(testgroup, prop, + os.path.join(testdir, failsafe)) + else: + setattr(testgroup, prop, + config.get(sect, prop)) + + # Repopulate tests using eval to convert the string to a list + testgroup.tests = eval(config.get(section, 'tests')) + + if testgroup.verify(): + self.testgroups[section] = testgroup + else: + test = Test(section) + for prop in Test.props: + for sect in ['DEFAULT', section]: + if config.has_option(sect, prop): + if prop == 'failsafe': + failsafe = config.get(sect, prop) + setattr(test, prop, + os.path.join(testdir, failsafe)) + else: + setattr(test, prop, config.get(sect, prop)) + + if test.verify(): + self.tests[section] = test + + def write(self, options): + """ + Create a configuration file for editing and later use. The + 'DEFAULT' section of the config file is created from the + properties that were specified on the command line. Tests are + simply added as sections that inherit everything from the + 'DEFAULT' section. TestGroups are the same, except they get an + option including all the tests to run in that directory. 
+ """ + + defaults = dict([(prop, getattr(options, prop)) for prop, _ in + self.defaults]) + config = configparser.RawConfigParser(defaults) + + for test in sorted(self.tests.keys()): + config.add_section(test) + + for testgroup in sorted(self.testgroups.keys()): + config.add_section(testgroup) + config.set(testgroup, 'tests', self.testgroups[testgroup].tests) + + try: + with open(options.template, 'w') as f: + return config.write(f) + except IOError: + fail('Could not open \'%s\' for writing.' % options.template) + + def complete_outputdirs(self): + """ + Collect all the pathnames for Tests, and TestGroups. Work + backwards one pathname component at a time, to create a unique + directory name in which to deposit test output. Tests will be able + to write output files directly in the newly modified outputdir. + TestGroups will be able to create one subdirectory per test in the + outputdir, and are guaranteed uniqueness because a group can only + contain files in one directory. Pre and post tests will create a + directory rooted at the outputdir of the Test or TestGroup in + question for their output. Failsafe scripts will create a directory + rooted at the outputdir of each Test for their output. + """ + done = False + components = 0 + tmp_dict = dict(list(self.tests.items()) + + list(self.testgroups.items())) + total = len(tmp_dict) + base = self.outputdir + + while not done: + paths = [] + components -= 1 + for testfile in list(tmp_dict.keys()): + uniq = '/'.join(testfile.split('/')[components:]).lstrip('/') + if uniq not in paths: + paths.append(uniq) + tmp_dict[testfile].outputdir = os.path.join(base, uniq) + else: + break + done = total == len(paths) + + def setup_logging(self, options): + """ + This function creates the output directory and gets a file object + for the logfile. This function must be called before write_log() + can be used. + """ + if options.dryrun is True: + return + + global LOG_FILE_OBJ + if options.cmd != 'wrconfig': + try: + old = os.umask(0) + os.makedirs(self.outputdir, mode=0o777) + os.umask(old) + filename = os.path.join(self.outputdir, 'log') + LOG_FILE_OBJ = open(filename, buffering=0, mode='wb') + except OSError as e: + fail('%s' % e) + + def run(self, options): + """ + Walk through all the Tests and TestGroups, calling run(). 
+ """ + try: + os.chdir(self.outputdir) + except OSError: + fail('Could not change to directory %s' % self.outputdir) + # make a symlink to the output for the currently running test + logsymlink = os.path.join(self.outputdir, '../current') + if os.path.islink(logsymlink): + os.unlink(logsymlink) + if not os.path.exists(logsymlink): + os.symlink(self.outputdir, logsymlink) + else: + write_log('Could not make a symlink to directory %s\n' % + self.outputdir, LOG_ERR) + iteration = 0 + while iteration < options.iterations: + for test in sorted(self.tests.keys()): + self.tests[test].run(options) + for testgroup in sorted(self.testgroups.keys()): + self.testgroups[testgroup].run(options) + iteration += 1 + + def summary(self): + if Result.total == 0: + return 2 + + print('\nResults Summary') + for key in list(Result.runresults.keys()): + if Result.runresults[key] != 0: + print('%s\t% 4d' % (key, Result.runresults[key])) + + m, s = divmod(time() - self.starttime, 60) + h, m = divmod(m, 60) + print('\nRunning Time:\t%02d:%02d:%02d' % (h, m, s)) + print('Percent passed:\t%.1f%%' % ((float(Result.runresults['PASS']) / + float(Result.total)) * 100)) + print('Log directory:\t%s' % self.outputdir) + + if Result.runresults['FAIL'] > 0: + return 1 + + if Result.runresults['KILLED'] > 0: + return 1 + + if Result.runresults['RERAN'] > 0: + return 3 + + return 0 + + +def write_log(msg, target): + """ + Write the provided message to standard out, standard error or + the logfile. If specifying LOG_FILE, then `msg` must be a bytes + like object. This way we can still handle output from tests that + may be in unexpected encodings. + """ + if target == LOG_OUT: + os.write(sys.stdout.fileno(), bytearray(msg, encoding='utf-8')) + elif target == LOG_ERR: + os.write(sys.stderr.fileno(), bytearray(msg, encoding='utf-8')) + elif target == LOG_FILE: + os.write(LOG_FILE_OBJ.fileno(), msg) + else: + fail('log_msg called with unknown target "%s"' % target) + + +def verify_file(pathname): + """ + Verify that the supplied pathname is an executable regular file. + """ + if os.path.isdir(pathname) or os.path.islink(pathname): + return False + + for ext in '', '.ksh', '.sh': + script_path = pathname + ext + if os.path.isfile(script_path) and os.access(script_path, os.X_OK): + return True + + return False + + +def verify_user(user): + """ + Verify that the specified user exists on this system, and can execute + sudo without being prompted for a password. + """ + testcmd = [SUDO, '-n', '-u', user, TRUE] + + if user in Cmd.verified_users: + return True + + try: + getpwnam(user) + except KeyError: + write_log("Warning: user '%s' does not exist.\n" % user, + LOG_ERR) + return False + + p = Popen(testcmd) + p.wait() + if p.returncode != 0: + write_log("Warning: user '%s' cannot use passwordless sudo.\n" % user, + LOG_ERR) + return False + else: + Cmd.verified_users.append(user) + + return True + + +def find_tests(testrun, options): + """ + For the given list of pathnames, add files as Tests. For directories, + if do_groups is True, add the directory as a TestGroup. If False, + recursively search for executable files. 
+ """ + + for p in sorted(options.pathnames): + if os.path.isdir(p): + for dirname, _, filenames in os.walk(p): + if options.do_groups: + testrun.addtestgroup(dirname, filenames, options) + else: + for f in sorted(filenames): + testrun.addtest(os.path.join(dirname, f), options) + else: + testrun.addtest(p, options) + + +def fail(retstr, ret=1): + print('%s: %s' % (sys.argv[0], retstr)) + exit(ret) + + +def options_cb(option, opt_str, value, parser): + path_options = ['outputdir', 'template', 'testdir'] + + if option.dest == 'runfiles' and '-w' in parser.rargs or \ + option.dest == 'template' and '-c' in parser.rargs: + fail('-c and -w are mutually exclusive.') + + if opt_str in parser.rargs: + fail('%s may only be specified once.' % opt_str) + + if option.dest == 'runfiles': + parser.values.cmd = 'rdconfig' + value = set(os.path.abspath(p) for p in value.split(',')) + if option.dest == 'template': + parser.values.cmd = 'wrconfig' + if option.dest == 'tags': + value = [x.strip() for x in value.split(',')] + + if option.dest in path_options: + setattr(parser.values, option.dest, os.path.abspath(value)) + else: + setattr(parser.values, option.dest, value) + + +def parse_args(): + parser = OptionParser() + parser.add_option('-c', action='callback', callback=options_cb, + type='string', dest='runfiles', metavar='runfiles', + help='Specify tests to run via config files.') + parser.add_option('-d', action='store_true', default=False, dest='dryrun', + help='Dry run. Print tests, but take no other action.') + parser.add_option('-g', action='store_true', default=False, + dest='do_groups', help='Make directories TestGroups.') + parser.add_option('-o', action='callback', callback=options_cb, + default=BASEDIR, dest='outputdir', type='string', + metavar='outputdir', help='Specify an output directory.') + parser.add_option('-i', action='callback', callback=options_cb, + default=TESTDIR, dest='testdir', type='string', + metavar='testdir', help='Specify a test directory.') + parser.add_option('-p', action='callback', callback=options_cb, + default='', dest='pre', metavar='script', + type='string', help='Specify a pre script.') + parser.add_option('-P', action='callback', callback=options_cb, + default='', dest='post', metavar='script', + type='string', help='Specify a post script.') + parser.add_option('-q', action='store_true', default=False, dest='quiet', + help='Silence on the console during a test run.') + parser.add_option('-s', action='callback', callback=options_cb, + default='', dest='failsafe', metavar='script', + type='string', help='Specify a failsafe script.') + parser.add_option('-S', action='callback', callback=options_cb, + default='', dest='failsafe_user', + metavar='failsafe_user', type='string', + help='Specify a user to execute the failsafe script.') + parser.add_option('-t', action='callback', callback=options_cb, default=60, + dest='timeout', metavar='seconds', type='int', + help='Timeout (in seconds) for an individual test.') + parser.add_option('-u', action='callback', callback=options_cb, + default='', dest='user', metavar='user', type='string', + help='Specify a different user name to run as.') + parser.add_option('-w', action='callback', callback=options_cb, + default=None, dest='template', metavar='template', + type='string', help='Create a new config file.') + parser.add_option('-x', action='callback', callback=options_cb, default='', + dest='pre_user', metavar='pre_user', type='string', + help='Specify a user to execute the pre script.') + parser.add_option('-X', 
action='callback', callback=options_cb, default='', + dest='post_user', metavar='post_user', type='string', + help='Specify a user to execute the post script.') + parser.add_option('-T', action='callback', callback=options_cb, default='', + dest='tags', metavar='tags', type='string', + help='Specify tags to execute specific test groups.') + parser.add_option('-I', action='callback', callback=options_cb, default=1, + dest='iterations', metavar='iterations', type='int', + help='Number of times to run the test run.') + (options, pathnames) = parser.parse_args() + + if not options.runfiles and not options.template: + options.cmd = 'runtests' + + if options.runfiles and len(pathnames): + fail('Extraneous arguments.') + + options.pathnames = [os.path.abspath(path) for path in pathnames] + + return options + + +def main(): + options = parse_args() + testrun = TestRun(options) + + if options.cmd == 'runtests': + find_tests(testrun, options) + elif options.cmd == 'rdconfig': + testrun.read(options) + elif options.cmd == 'wrconfig': + find_tests(testrun, options) + testrun.write(options) + exit(0) + else: + fail('Unknown command specified') + + testrun.complete_outputdirs() + testrun.run(options) + exit(testrun.summary()) + + +if __name__ == '__main__': + main() diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in new file mode 100755 index 000000000000..6b5cd191c5e0 --- /dev/null +++ b/tests/test-runner/bin/zts-report.py.in @@ -0,0 +1,390 @@ +#!/usr/bin/env @PYTHON_SHEBANG@ + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Delphix. All rights reserved. +# Copyright (c) 2018 by Lawrence Livermore National Security, LLC. +# +# This script must remain compatible with Python 2.6+ and Python 3.4+. +# + +import os +import re +import sys + +# +# This script parses the stdout of zfstest, which has this format: +# +# Test: /path/to/testa (run as root) [00:00] [PASS] +# Test: /path/to/testb (run as jkennedy) [00:00] [PASS] +# Test: /path/to/testc (run as root) [00:00] [FAIL] +# [...many more results...] +# +# Results Summary +# FAIL 22 +# SKIP 32 +# PASS 1156 +# +# Running Time: 02:50:31 +# Percent passed: 95.5% +# Log directory: /var/tmp/test_results/20180615T205926 +# + +# +# Common generic reasons for a test or test group to be skipped. +# +# Some test cases are known to fail in ways which are not harmful or dangerous. +# In these cases simply mark the test as a known failure until it can be +# updated and the issue resolved. Note that it's preferable to open a unique +# issue on the GitHub issue tracker for each test case failure. +# +known_reason = 'Known issue' + +# +# Some tests require that a test user be able to execute the zfs utilities. +# This may not be possible when testing in-tree due to the default permissions +# on the user's home directory. When testing this can be resolved by granting +# group read access. +# +# chmod 0750 $HOME +# +exec_reason = 'Test user execute permissions required for utilities' + +# +# Some tests require a minimum python version of 3.5 and will be skipped when +# the default system version is too old. 
There may also be tests which require +# additional python modules to be installed; for example, python-cffi is +# required by the pyzfs tests. +# +python_reason = 'Python v3.5 or newer required' +python_deps_reason = 'Python modules missing: python-cffi' + +# +# Some tests require the O_TMPFILE flag which was first introduced in the +# 3.11 kernel. +# +tmpfile_reason = 'Kernel O_TMPFILE support required' + +# +# Some tests require that the NFS client and server utilities be installed. +# +share_reason = 'NFS client and server utilities required' + +# +# Some tests require that the lsattr utility support the project id feature. +# +project_id_reason = 'lsattr with set/show project ID required' + +# +# Some tests require that the kernel support user namespaces. +# +user_ns_reason = 'Kernel user namespace support required' + +# +# Some rewind tests can fail since nothing guarantees that old MOS blocks +# are not overwritten. Snapshots protect datasets and data files but not +# the MOS. Reasonable efforts are made in the test case to increase the +# odds that some txgs will have their MOS data left untouched, but it is +# never a sure thing. +# +rewind_reason = 'Arbitrary pool rewind is not guaranteed' + +# +# Some tests may be structured in a way that relies on exact knowledge +# of how much free space is available in a pool. These tests cannot be +# made completely reliable because the internal details of how free space +# is managed are not exposed to user space. +# +enospc_reason = 'Exact free space reporting is not guaranteed' + +# +# Some tests require a minimum version of the fio benchmark utility. +# Older distributions such as CentOS 6.x only provide fio-2.0.13. +# +fio_reason = 'Fio v2.3 or newer required' + +# +# Some tests require that the DISKS provided support the discard operation. +# Normally this is not an issue because loop back devices are used for DISKS +# and they support discard (TRIM/UNMAP). +# +trim_reason = 'DISKS must support discard (TRIM/UNMAP)' + +# +# Some tests are not applicable to a platform or need to be updated to operate +# in the manner required by the platform. Any tests which are skipped for this +# reason will be suppressed in the final analysis output. +# +na_reason = "Not applicable" + +summary = { + 'total': float(0), + 'passed': float(0), + 'logfile': "Could not determine logfile location." +} + +# +# These tests are known to fail, thus we use this list to prevent these +# failures from failing the job as a whole; only unexpected failures +# bubble up to cause this script to exit with a non-zero exit status. +# +# Format: { 'test-name': ['expected result', 'issue-number | reason'] } +# +# For each known failure it is recommended to link to a GitHub issue by +# setting the reason to the issue number. Alternatively, one of the generic +# reasons listed above can be used.
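+# +# For example, a hypothetical entry tying a known test failure to a GitHub +# issue would read: +# +# 'mygroup/mytest_001_pos': ['FAIL', '12345'],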
+# +known = { + 'casenorm/mixed_none_lookup_ci': ['FAIL', '7633'], + 'casenorm/mixed_formd_lookup_ci': ['FAIL', '7633'], + 'cli_root/zfs_unshare/zfs_unshare_002_pos': ['SKIP', na_reason], + 'cli_root/zfs_unshare/zfs_unshare_006_pos': ['SKIP', na_reason], + 'cli_user/misc/zfs_share_001_neg': ['SKIP', na_reason], + 'cli_user/misc/zfs_unshare_001_neg': ['SKIP', na_reason], + 'privilege/setup': ['SKIP', na_reason], + 'refreserv/refreserv_004_pos': ['FAIL', known_reason], + 'rootpool/setup': ['SKIP', na_reason], + 'rsend/rsend_008_pos': ['SKIP', '6066'], + 'vdev_zaps/vdev_zaps_007_pos': ['FAIL', known_reason], +} + +if sys.platform.startswith('freebsd'): + known.update({ + 'cli_root/zpool_wait/zpool_wait_trim_basic': ['SKIP', trim_reason], + 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], + 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], + 'link_count/link_count_001': ['SKIP', na_reason], + }) +elif sys.platform.startswith('linux'): + known.update({ + 'casenorm/mixed_formd_lookup': ['FAIL', '7633'], + 'casenorm/mixed_formd_delete': ['FAIL', '7633'], + 'casenorm/sensitive_formd_lookup': ['FAIL', '7633'], + 'casenorm/sensitive_formd_delete': ['FAIL', '7633'], + 'removal/removal_with_zdb': ['SKIP', known_reason], + }) + + +# +# These tests may occasionally fail or be skipped. We want these failures +# to be reported, but only unexpected failures should bubble up to cause +# this script to exit with a non-zero exit status. +# +# Format: { 'test-name': ['expected result', 'issue-number | reason'] } +# +# For each known failure it is recommended to link to a GitHub issue by +# setting the reason to the issue number. Alternately, one of the generic +# reasons listed above can be used. +# +maybe = { + 'alloc_class/alloc_class_012_pos': ['FAIL', '9142'], + 'alloc_class/alloc_class_013_pos': ['FAIL', '9142'], + 'chattr/setup': ['SKIP', exec_reason], + 'cli_root/zdb/zdb_006_pos': ['FAIL', known_reason], + 'cli_root/zfs_get/zfs_get_004_pos': ['FAIL', known_reason], + 'cli_root/zfs_get/zfs_get_009_pos': ['SKIP', '5479'], + 'cli_root/zfs_share/setup': ['SKIP', share_reason], + 'cli_root/zfs_snapshot/zfs_snapshot_002_neg': ['FAIL', known_reason], + 'cli_root/zfs_unshare/setup': ['SKIP', share_reason], + 'cli_root/zpool_add/zpool_add_004_pos': ['FAIL', known_reason], + 'cli_root/zpool_destroy/zpool_destroy_001_pos': ['SKIP', '6145'], + 'cli_root/zpool_import/import_rewind_device_replaced': + ['FAIL', rewind_reason], + 'cli_root/zpool_import/import_rewind_config_changed': + ['FAIL', rewind_reason], + 'cli_root/zpool_import/zpool_import_missing_003_pos': ['SKIP', '6839'], + 'cli_root/zpool_trim/setup': ['SKIP', trim_reason], + 'cli_root/zpool_upgrade/zpool_upgrade_004_pos': ['FAIL', '6141'], + 'delegate/setup': ['SKIP', exec_reason], + 'history/history_004_pos': ['FAIL', '7026'], + 'history/history_005_neg': ['FAIL', '6680'], + 'history/history_006_neg': ['FAIL', '5657'], + 'history/history_008_pos': ['FAIL', known_reason], + 'history/history_010_pos': ['SKIP', exec_reason], + 'io/mmap': ['SKIP', fio_reason], + 'largest_pool/largest_pool_001_pos': ['FAIL', known_reason], + 'mmp/mmp_on_uberblocks': ['FAIL', known_reason], + 'pyzfs/pyzfs_unittest': ['SKIP', python_deps_reason], + 'no_space/enospc_002_pos': ['FAIL', enospc_reason], + 'projectquota/setup': ['SKIP', exec_reason], + 'redundancy/redundancy_004_neg': ['FAIL', '7290'], + 'reservation/reservation_008_pos': ['FAIL', '7741'], + 'reservation/reservation_018_pos': ['FAIL', '5642'], + 'rsend/rsend_019_pos': ['FAIL',
'6086'], + 'rsend/rsend_020_pos': ['FAIL', '6446'], + 'rsend/rsend_021_pos': ['FAIL', '6446'], + 'rsend/rsend_024_pos': ['FAIL', '5665'], + 'rsend/send-c_volume': ['FAIL', '6087'], + 'rsend/send_partial_dataset': ['FAIL', known_reason], + 'snapshot/clone_001_pos': ['FAIL', known_reason], + 'snapshot/snapshot_009_pos': ['FAIL', '7961'], + 'snapshot/snapshot_010_pos': ['FAIL', '7961'], + 'snapused/snapused_004_pos': ['FAIL', '5513'], + 'tmpfile/setup': ['SKIP', tmpfile_reason], + 'threadsappend/threadsappend_001_pos': ['FAIL', '6136'], + 'trim/setup': ['SKIP', trim_reason], + 'upgrade/upgrade_projectquota_001_pos': ['SKIP', project_id_reason], + 'user_namespace/setup': ['SKIP', user_ns_reason], + 'userquota/setup': ['SKIP', exec_reason], + 'vdev_zaps/vdev_zaps_004_pos': ['FAIL', '6935'], + 'zvol/zvol_ENOSPC/zvol_ENOSPC_001_pos': ['FAIL', '5848'], + 'pam/setup': ['SKIP', "pamtester might not be available"], +} + +if sys.platform.startswith('freebsd'): + maybe.update({ + 'cli_root/zfs_copies/zfs_copies_002_pos': ['FAIL', known_reason], + 'cli_root/zfs_inherit/zfs_inherit_001_neg': ['FAIL', known_reason], + 'cli_root/zfs_share/zfs_share_011_pos': ['FAIL', known_reason], + 'cli_root/zfs_share/zfs_share_concurrent_shares': + ['FAIL', known_reason], + 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], + 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], + 'removal/removal_condense_export': ['FAIL', known_reason], + 'removal/removal_with_export': ['FAIL', known_reason], + 'resilver/resilver_restart_001': ['FAIL', known_reason], + 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], + }) +elif sys.platform.startswith('linux'): + maybe.update({ + 'alloc_class/alloc_class_009_pos': ['FAIL', known_reason], + 'alloc_class/alloc_class_010_pos': ['FAIL', known_reason], + 'alloc_class/alloc_class_011_neg': ['FAIL', known_reason], + 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], + 'cli_root/zpool_expand/zpool_expand_001_pos': ['FAIL', known_reason], + 'cli_root/zpool_expand/zpool_expand_005_pos': ['FAIL', known_reason], + 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], + 'limits/filesystem_limit': ['SKIP', known_reason], + 'limits/snapshot_limit': ['SKIP', known_reason], + 'mmp/mmp_exported_import': ['FAIL', known_reason], + 'mmp/mmp_inactive_import': ['FAIL', known_reason], + 'refreserv/refreserv_raidz': ['FAIL', known_reason], + 'rsend/rsend_007_pos': ['FAIL', known_reason], + 'rsend/rsend_010_pos': ['FAIL', known_reason], + 'rsend/rsend_011_pos': ['FAIL', known_reason], + 'snapshot/rollback_003_pos': ['FAIL', known_reason], + }) + + +def usage(s): + print(s) + sys.exit(1) + + +def process_results(pathname): + try: + f = open(pathname) + except IOError as e: + print('Error opening file: %s' % e) + sys.exit(1) + + prefix = '/zfs-tests/tests/functional/' + pattern = \ + r'^Test(?:\s+\(\S+\))?:' + \ + r'\s*\S*%s(\S+)\s*\(run as (\S+)\)\s*\[(\S+)\]\s*\[(\S+)\]' \ + % prefix + pattern_log = r'^\s*Log directory:\s*(\S*)' + + d = {} + for line in f.readlines(): + m = re.match(pattern, line) + if m and len(m.groups()) == 4: + summary['total'] += 1 + if m.group(4) == "PASS": + summary['passed'] += 1 + d[m.group(1)] = m.group(4) + continue + + m = re.match(pattern_log, line) + if m: + summary['logfile'] = m.group(1) + + return d + + +if __name__ == "__main__": + if len(sys.argv) != 2: + usage('usage: %s <pathname>' % sys.argv[0]) + results = process_results(sys.argv[1]) + + if summary['total'] == 0: + print("\n\nNo test results were found.") + print("Log
directory: %s" % summary['logfile']) + sys.exit(0) + + expected = [] + unexpected = [] + + for test in list(results.keys()): + if results[test] == "PASS": + continue + + setup = test.replace(os.path.basename(test), "setup") + if results[test] == "SKIP" and test != setup: + if setup in known and known[setup][0] == "SKIP": + continue + if setup in maybe and maybe[setup][0] == "SKIP": + continue + + if ((test not in known or results[test] not in known[test][0]) and + (test not in maybe or results[test] not in maybe[test][0])): + unexpected.append(test) + else: + expected.append(test) + + print("\nTests with results other than PASS that are expected:") + for test in sorted(expected): + issue_url = 'https://github.com/openzfs/zfs/issues/' + + # Include the reason why the result is expected, given the following: + # 1. Suppress test results which set the "Not applicable" reason. + # 2. Numerical reasons are assumed to be GitHub issue numbers. + # 3. When an entire test group is skipped only report the setup reason. + if test in known: + if known[test][1] == na_reason: + continue + elif known[test][1].isdigit(): + expect = issue_url + known[test][1] + else: + expect = known[test][1] + elif test in maybe: + if maybe[test][1].isdigit(): + expect = issue_url + maybe[test][1] + else: + expect = maybe[test][1] + elif setup in known and known[setup][0] == "SKIP" and setup != test: + continue + elif setup in maybe and maybe[setup][0] == "SKIP" and setup != test: + continue + else: + expect = "UNKNOWN REASON" + print(" %s %s (%s)" % (results[test], test, expect)) + + print("\nTests with result of PASS that are unexpected:") + for test in sorted(known.keys()): + # We probably should not be silently ignoring the case + # where "test" is not in "results". + if test not in results or results[test] != "PASS": + continue + print(" %s %s (expected %s)" % (results[test], test, + known[test][0])) + + print("\nTests with results other than PASS that are unexpected:") + for test in sorted(unexpected): + expect = "PASS" if test not in known else known[test][0] + print(" %s %s (expected %s)" % (results[test], test, expect)) + + if len(unexpected) == 0: + sys.exit(0) + else: + sys.exit(1) diff --git a/tests/test-runner/include/Makefile.am b/tests/test-runner/include/Makefile.am new file mode 100644 index 000000000000..d3eeb32de914 --- /dev/null +++ b/tests/test-runner/include/Makefile.am @@ -0,0 +1,5 @@ +pkgdatadir = $(datadir)/@PACKAGE@/test-runner/include + +dist_pkgdata_DATA = \ + logapi.shlib \ + stf.shlib diff --git a/tests/test-runner/include/logapi.shlib b/tests/test-runner/include/logapi.shlib new file mode 100644 index 000000000000..aa6e7c0f65fa --- /dev/null +++ b/tests/test-runner/include/logapi.shlib @@ -0,0 +1,530 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright (c) 2012, 2020 by Delphix. All rights reserved. +# + +. ${STF_TOOLS}/include/stf.shlib + +# Output an assertion +# +# $@ - assertion text + +function log_assert +{ + _printline ASSERTION: "$@" +} + +# Output a comment +# +# $@ - comment text + +function log_note +{ + _printline NOTE: "$@" +} + +# Execute and print command with status where success equals non-zero result +# +# $@ - command to execute +# +# return 0 if command fails, otherwise return 1 + +function log_neg +{ + log_neg_expect "" "$@" + return $? +} + +# Execute a positive test and exit $STF_FAIL if the test fails +# +# $@ - command to execute + +function log_must +{ + log_pos "$@" + (( $? != 0 )) && log_fail +} + +# Execute a positive test but retry the command on failure if the output +# matches an expected pattern. Otherwise behave like log_must and exit +# $STF_FAIL if the test fails. +# +# $1 - retry keyword +# $2 - retry attempts +# $3-$@ - command to execute +# +function log_must_retry +{ + typeset out="" + typeset logfile="/tmp/log.$$" + typeset status=1 + typeset expect=$1 + typeset retry=$2 + typeset delay=1 + shift 2 + + while [[ -e $logfile ]]; do + logfile="$logfile.$$" + done + + while (( $retry > 0 )); do + "$@" 2>$logfile + status=$? + out="cat $logfile" + + if (( $status == 0 )); then + $out | egrep -i "internal error|assertion failed" \ + > /dev/null 2>&1 + # internal error or assertion failed + if [[ $? -eq 0 ]]; then + print -u2 $($out) + _printerror "$@" "internal error or" \ + " assertion failure exited $status" + status=1 + else + [[ -n $LOGAPI_DEBUG ]] && print $($out) + _printsuccess "$@" + fi + break + else + $out | grep -i "$expect" > /dev/null 2>&1 + if (( $? == 0 )); then + print -u2 $($out) + _printerror "$@" "Retry in $delay seconds" + sleep $delay + + (( retry=retry - 1 )) + (( delay=delay * 2 )) + else + break; + fi + fi + done + + if (( $status != 0 )) ; then + print -u2 $($out) + _printerror "$@" "exited $status" + fi + + _recursive_output $logfile "false" + return $status +} + +# Execute a positive test and exit $STF_FAIL if the test fails after being +# retried up to 5 times when the command returns the keyword "busy". +# +# $@ - command to execute +function log_must_busy +{ + log_must_retry "busy" 5 "$@" + (( $? != 0 )) && log_fail +} + +# Execute a negative test and exit $STF_FAIL if the test passes +# +# $@ - command to execute + +function log_mustnot +{ + log_neg "$@" + (( $? != 0 )) && log_fail +} + +# Execute a negative test with keyword expected, and exit +# $STF_FAIL if the test passes +# +# $1 - keyword expected +# $2-$@ - command to execute + +function log_mustnot_expect +{ + log_neg_expect "$@" + (( $?
!= 0 )) && log_fail +} + +# Signal numbers are platform-dependent +case $(uname) in +Darwin|FreeBSD) + SIGBUS=10 + SIGSEGV=11 + ;; +illumos|Linux|*) + SIGBUS=7 + SIGSEGV=11 + ;; +esac +EXIT_SUCCESS=0 +EXIT_NOTFOUND=127 +EXIT_SIGNAL=256 +EXIT_SIGBUS=$((EXIT_SIGNAL + SIGBUS)) +EXIT_SIGSEGV=$((EXIT_SIGNAL + SIGSEGV)) + +# Execute and print command with status where success equals non-zero result +# or output includes expected keyword +# +# $1 - keyword expected +# $2-$@ - command to execute +# +# return 0 if command fails, or the output contains the keyword expected, +# return 1 otherwise + +function log_neg_expect +{ + typeset out="" + typeset logfile="/tmp/log.$$" + typeset ret=1 + typeset expect=$1 + shift + + while [[ -e $logfile ]]; do + logfile="$logfile.$$" + done + + "$@" 2>$logfile + typeset status=$? + out="cat $logfile" + + # unexpected status + if (( $status == EXIT_SUCCESS )); then + print -u2 $($out) + _printerror "$@" "unexpectedly exited $status" + # missing binary + elif (( $status == EXIT_NOTFOUND )); then + print -u2 $($out) + _printerror "$@" "unexpectedly exited $status (File not found)" + # bus error - core dump + elif (( $status == EXIT_SIGBUS )); then + print -u2 $($out) + _printerror "$@" "unexpectedly exited $status (Bus Error)" + # segmentation violation - core dump + elif (( $status == EXIT_SIGSEGV )); then + print -u2 $($out) + _printerror "$@" "unexpectedly exited $status (SEGV)" + else + $out | egrep -i "internal error|assertion failed" \ + > /dev/null 2>&1 + # internal error or assertion failed + if (( $? == 0 )); then + print -u2 $($out) + _printerror "$@" "internal error or assertion failure" \ + " exited $status" + elif [[ -n $expect ]] ; then + $out | grep -i "$expect" > /dev/null 2>&1 + if (( $? == 0 )); then + ret=0 + else + print -u2 $($out) + _printerror "$@" "unexpectedly exited $status" + fi + else + ret=0 + fi + + if (( $ret == 0 )); then + [[ -n $LOGAPI_DEBUG ]] && print $($out) + _printsuccess "$@" "exited $status" + fi + fi + _recursive_output $logfile "false" + return $ret +} + +# Execute and print command with status where success equals zero result +# +# $@ command to execute +# +# return command exit status + +function log_pos +{ + typeset out="" + typeset logfile="/tmp/log.$$" + + while [[ -e $logfile ]]; do + logfile="$logfile.$$" + done + + "$@" 2>$logfile + typeset status=$? + out="cat $logfile" + + if (( $status != 0 )) ; then + print -u2 $($out) + _printerror "$@" "exited $status" + else + $out | egrep -i "internal error|assertion failed" \ + > /dev/null 2>&1 + # internal error or assertion failed + if [[ $? 
-eq 0 ]]; then + print -u2 $($out) + _printerror "$@" "internal error or assertion failure" \ + " exited $status" + status=1 + else + [[ -n $LOGAPI_DEBUG ]] && print $($out) + _printsuccess "$@" + fi + fi + _recursive_output $logfile "false" + return $status +} + +# Set an exit handler +# +# $@ - function(s) to perform on exit + +function log_onexit +{ + _CLEANUP=("$*") +} + +# Push an exit handler on the cleanup stack +# +# $@ - function(s) to perform on exit + +function log_onexit_push +{ + _CLEANUP+=("$*") +} + +# Pop an exit handler off the cleanup stack + +function log_onexit_pop +{ + _CLEANUP=("${_CLEANUP[@]:0:${#_CLEANUP[@]}-1}") +} + +# +# Exit functions +# + +# Perform cleanup and exit $STF_PASS +# +# $@ - message text + +function log_pass +{ + _endlog $STF_PASS "$@" +} + +# Perform cleanup and exit $STF_FAIL +# +# $@ - message text + +function log_fail +{ + _endlog $STF_FAIL "$@" +} + +# Perform cleanup and exit $STF_UNRESOLVED +# +# $@ - message text + +function log_unresolved +{ + _endlog $STF_UNRESOLVED "$@" +} + +# Perform cleanup and exit $STF_NOTINUSE +# +# $@ - message text + +function log_notinuse +{ + _endlog $STF_NOTINUSE "$@" +} + +# Perform cleanup and exit $STF_UNSUPPORTED +# +# $@ - message text + +function log_unsupported +{ + _endlog $STF_UNSUPPORTED "$@" +} + +# Perform cleanup and exit $STF_UNTESTED +# +# $@ - message text + +function log_untested +{ + _endlog $STF_UNTESTED "$@" +} + +# Perform cleanup and exit $STF_UNINITIATED +# +# $@ - message text + +function log_uninitiated +{ + _endlog $STF_UNINITIATED "$@" +} + +# Perform cleanup and exit $STF_NORESULT +# +# $@ - message text + +function log_noresult +{ + _endlog $STF_NORESULT "$@" +} + +# Perform cleanup and exit $STF_WARNING +# +# $@ - message text + +function log_warning +{ + _endlog $STF_WARNING "$@" +} + +# Perform cleanup and exit $STF_TIMED_OUT +# +# $@ - message text + +function log_timed_out +{ + _endlog $STF_TIMED_OUT "$@" +} + +# Perform cleanup and exit $STF_OTHER +# +# $@ - message text + +function log_other +{ + _endlog $STF_OTHER "$@" +} + +function set_main_pid +{ + _MAINPID=$1 +} + +# +# Internal functions +# + +# Execute custom callback scripts on test failure +# +# callback script paths are stored in TESTFAIL_CALLBACKS, delimited by ':'. 
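+#
+# As an illustrative example (the install prefix will vary by platform and
+# configuration), the dbgmsg and dmesg callbacks shipped with the test
+# suite could be chained like this:
+#
+#   TESTFAIL_CALLBACKS="/usr/share/zfs/callbacks/zfs_dbgmsg.ksh:/usr/share/zfs/callbacks/zfs_dmesg.ksh"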
+ +function _execute_testfail_callbacks +{ + typeset callback + + print "$TESTFAIL_CALLBACKS:" | while read -d ":" callback; do + if [[ -n "$callback" ]] ; then + log_note "Performing test-fail callback ($callback)" + $callback + fi + done +} + +# Perform cleanup and exit +# +# $1 - stf exit code +# $2-$n - message text + +function _endlog +{ + typeset logfile="/tmp/log.$$" + _recursive_output $logfile + + typeset exitcode=$1 + shift + (( ${#@} > 0 )) && _printline "$@" + + # + # If we're running in a subshell then just exit and let + # the parent handle the failures + # + if [[ -n "$_MAINPID" && $$ != "$_MAINPID" ]]; then + log_note "subshell exited: "$_MAINPID + exit $exitcode + fi + + if [[ $exitcode == $STF_FAIL ]] ; then + _execute_testfail_callbacks + fi + + typeset stack=("${_CLEANUP[@]}") + log_onexit "" + typeset i=${#stack[@]} + while (( i-- )); do + typeset cleanup="${stack[i]}" + log_note "Performing local cleanup via log_onexit ($cleanup)" + $cleanup + done + + exit $exitcode +} + +# Output a formatted line +# +# $@ - message text + +function _printline +{ + print "$@" +} + +# Output an error message +# +# $@ - message text + +function _printerror +{ + _printline ERROR: "$@" +} + +# Output a success message +# +# $@ - message text + +function _printsuccess +{ + _printline SUCCESS: "$@" +} + +# Output logfiles recursively +# +# $1 - start file +# $2 - indicate whether output the start file itself, default as yes. + +function _recursive_output #logfile +{ + typeset logfile=$1 + + while [[ -e $logfile ]]; do + if [[ -z $2 || $logfile != $1 ]]; then + cat $logfile + fi + rm -f $logfile + logfile="$logfile.$$" + done +} diff --git a/tests/test-runner/include/stf.shlib b/tests/test-runner/include/stf.shlib new file mode 100644 index 000000000000..ea879a84c131 --- /dev/null +++ b/tests/test-runner/include/stf.shlib @@ -0,0 +1,57 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or http://www.opensolaris.org/os/licensing. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# +# Copyright (c) 2012 by Delphix. All rights reserved. 
+# + + +STF_PASS=0 +STF_FAIL=1 +STF_UNRESOLVED=2 +STF_NOTINUSE=3 +STF_UNSUPPORTED=4 +STF_UNTESTED=5 +STF_UNINITIATED=6 +STF_NORESULT=7 +STF_WARNING=8 +STF_TIMED_OUT=9 +STF_OTHER=10 + +# do this to use the names: eval echo \$STF_RESULT_NAME_${result} +STF_RESULT_NAME_0="PASS" +STF_RESULT_NAME_1="FAIL" +STF_RESULT_NAME_2="UNRESOLVED" +STF_RESULT_NAME_3="NOTINUSE" +STF_RESULT_NAME_4="UNSUPPORTED" +STF_RESULT_NAME_5="UNTESTED" +STF_RESULT_NAME_6="UNINITIATED" +STF_RESULT_NAME_7="NORESULT" +STF_RESULT_NAME_8="WARNING" +STF_RESULT_NAME_9="TIMED_OUT" +STF_RESULT_NAME_10="OTHER" + +# do this to use the array: ${STF_RESULT_NAMES[$result]} +STF_RESULT_NAMES=( "PASS" "FAIL" "UNRESOLVED" "NOTINUSE" "UNSUPPORTED" \ + "UNTESTED" "UNINITIATED" "NORESULT" "WARNING" "TIMED_OUT" "OTHER" ) diff --git a/tests/test-runner/man/Makefile.am b/tests/test-runner/man/Makefile.am new file mode 100644 index 000000000000..a7017f5f0535 --- /dev/null +++ b/tests/test-runner/man/Makefile.am @@ -0,0 +1,4 @@ +dist_man_MANS = test-runner.1 + +install-data-local: + $(INSTALL) -d -m 0755 "$(DESTDIR)$(mandir)/man1" diff --git a/tests/test-runner/man/test-runner.1 b/tests/test-runner/man/test-runner.1 new file mode 100644 index 000000000000..19c636c8cb7c --- /dev/null +++ b/tests/test-runner/man/test-runner.1 @@ -0,0 +1,386 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright (c) 2012 by Delphix. All rights reserved. +.\" +.TH run 1 "23 Sep 2012" +.SH NAME +run \- find, execute, and log the results of tests +.SH SYNOPSIS +.LP +.nf +\fBrun\fR [\fB-dgq] [\fB-o\fR \fIoutputdir\fR] [\fB-pP\fR \fIscript\fR] [\fB-t\fR \fIseconds\fR] [\fB-uxX\fR \fIusername\fR] + \fIpathname\fR ... +.fi + +.LP +.nf +\fBrun\fR \fB-w\fR \fIrunfile\fR [\fB-gq\fR] [\fB-o\fR \fIoutputdir\fR] [\fB-pP\fR \fIscript\fR] [\fB-t\fR \fIseconds\fR] + [\fB-uxX\fR \fIusername\fR] \fIpathname\fR ... +.fi + +.LP +.nf +\fBrun\fR \fB-c\fR \fIrunfile\fR [\fB-dq\fR] +.fi + +.LP +.nf +\fBrun\fR [\fB-h\fR] +.fi + +.SH DESCRIPTION +.sp +.LP +The \fBrun\fR command has three basic modes of operation. With neither the +\fB-c\fR nor the \fB-w\fR option, \fBrun\fR processes the arguments provided on +the command line, adding them to the list for this run. If a specified +\fIpathname\fR is an executable file, it is added as a test. If a specified +\fIpathname\fR is a directory, the behavior depends upon the \fB-g\fR option. +If \fB-g\fR is specified, the directory is treated as a test group. See the +section on "Test Groups" below. Without the \fB-g\fR option, \fBrun\fR simply +descends into the directory looking for executable files. The tests are then +executed, and the results are logged. + +With the \fB-w\fR option, \fBrun\fR finds tests in the manner described above. +Rather than executing the tests and logging the results, the test configuration +is stored in a \fIrunfile\fR which can be used in future invocations, or edited +to modify which tests are executed and which options are applied. Options +included on the command line with \fB-w\fR become defaults in the +\fIrunfile\fR. 
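+.sp
+.LP
+For example, a test configuration could be captured once with \fB-w\fR and
+replayed later with \fB-c\fR (the \fIrunfile\fR name here is illustrative):
+.sp
+.in +2
+.nf
+% \fBrun -w my-tests.run my-tests\fR
+% \fBrun -c my-tests.run\fR
+.fi
+.in -2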
+ +With the \fB-c\fR option, \fBrun\fR parses a \fIrunfile\fR, which can specify a +series of tests and test groups to be executed. The tests are then executed, +and the results are logged. +.sp +.SS "Test Groups" +.sp +.LP +A test group consists of a set of executable files, all of which exist in +one directory. The options specified on the command line or in a \fIrunfile\fR +apply to individual tests in the group. The exception is options pertaining to +pre and post scripts, which act on all tests as a group. Rather than running +before and after each test, these scripts are run only once each at the start +and end of the test group. +.SS "Test Execution" +.sp +.LP +The specified tests run serially, and are typically assigned results according +to exit values. Tests that exit zero and non-zero are marked "PASS" and "FAIL" +respectively. When a pre script fails for a test group, only the post script is +executed, and the remaining tests are marked "SKIPPED." Any test that exceeds +its \fItimeout\fR is terminated and marked "KILLED." + +By default, tests are executed with the credentials of the \fBrun\fR script. +Executing tests with other credentials is done via \fBsudo\fR(1m), which must +be configured to allow execution without prompting for a password. Environment +variables from the calling shell are available to individual tests. During test +execution, the working directory is changed to \fIoutputdir\fR. +.SS "Output Logging" +.sp +.LP +By default, \fBrun\fR will print one line on standard output at the conclusion +of each test indicating the test name, result and elapsed time. Additionally, +for each invocation of \fBrun\fR, a directory is created using the ISO 8601 +date format. Within this directory is a file named \fIlog\fR containing all the +test output with timestamps, and a directory for each test. Within the test +directories, there is one file each for standard output, standard error and +merged output. The default location for the \fIoutputdir\fR is +\fI/var/tmp/test_results\fR. +.SS "Runfiles" +.sp +.LP +The \fIrunfile\fR is an INI-style configuration file that describes a test run. +The file has one section named "DEFAULT," which contains configuration option +names and their values in "name = value" format. The values in this section +apply to all the subsequent sections, unless they are also specified there, in +which case the default is overridden. The remaining section names are the +absolute pathnames of files and directories, describing tests and test groups +respectively. The legal option names are: +.sp +.ne 2 +.na +\fBoutputdir\fR = \fIpathname\fR +.ad +.sp .6 +.RS 4n +The name of the directory that holds test logs. +.RE +.sp +.ne 2 +.na +\fBpre\fR = \fIscript\fR +.ad +.sp .6 +.RS 4n +Run \fIscript\fR prior to the test or test group. +.RE +.sp +.ne 2 +.na +\fBpre_user\fR = \fIusername\fR +.ad +.sp .6 +.RS 4n +Execute the pre script as \fIusername\fR. +.RE +.sp +.ne 2 +.na +\fBpost\fR = \fIscript\fR +.ad +.sp .6 +.RS 4n +Run \fIscript\fR after the test or test group. +.RE +.sp +.ne 2 +.na +\fBpost_user\fR = \fIusername\fR +.ad +.sp .6 +.RS 4n +Execute the post script as \fIusername\fR. +.RE +.sp +.ne 2 +.na +\fBquiet\fR = [\fITrue\fR|\fIFalse\fR] +.ad +.sp .6 +.RS 4n +If set to True, only the results summary is printed to standard output. +.RE +.sp +.ne 2 +.na +\fBtests\fR = [\fI'filename'\fR [,...]] +.ad +.sp .6 +.RS 4n +Specify a list of \fIfilenames\fR for this test group. Only the basename of the +absolute path is required.
This option is only valid for test groups, and each +\fIfilename\fR must be single quoted. +.RE +.sp +.ne 2 +.na +\fBtimeout\fR = \fIn\fR +.ad +.sp .6 +.RS 4n +A timeout value of \fIn\fR seconds. +.RE +.sp +.ne 2 +.na +\fBuser\fR = \fIusername\fR +.ad +.sp .6 +.RS 4n +Execute the test or test group as \fIusername\fR. +.RE + +.SH OPTIONS +.sp +.LP +The following options are available for the \fBrun\fR command. +.sp +.ne 2 +.na +\fB-c\fR \fIrunfile\fR +.ad +.RS 6n +Specify a \fIrunfile\fR to be consumed by the run command. +.RE + +.ne 2 +.na +\fB-d\fR +.ad +.RS 6n +Dry run mode. Execute no tests, but print a description of each test that would +have been run. +.RE + +.ne 2 +.na +\fB-g\fR +.ad +.RS 6n +Create test groups from any directories found while searching for tests. +.RE + +.ne 2 +.na +\fB-o\fR \fIoutputdir\fR +.ad +.RS 6n +Specify the directory in which to write test results. +.RE + +.ne 2 +.na +\fB-p\fR \fIscript\fR +.ad +.RS 6n +Run \fIscript\fR prior to any test or test group. +.RE + +.ne 2 +.na +\fB-P\fR \fIscript\fR +.ad +.RS 6n +Run \fIscript\fR after any test or test group. +.RE + +.ne 2 +.na +\fB-q\fR +.ad +.RS 6n +Print only the results summary to the standard output. +.RE + +.ne 2 +.na +\fB-s\fR \fIscript\fR +.ad +.RS 6n +Run \fIscript\fR as a failsafe after any test is killed. +.RE + +.ne 2 +.na +\fB-S\fR \fIusername\fR +.ad +.RS 6n +Execute the failsafe script as \fIusername\fR. +.RE + +.ne 2 +.na +\fB-t\fR \fIn\fR +.ad +.RS 6n +Specify a timeout value of \fIn\fR seconds per test. +.RE + +.ne 2 +.na +\fB-u\fR \fIusername\fR +.ad +.RS 6n +Execute tests or test groups as \fIusername\fR. +.RE + +.ne 2 +.na +\fB-w\fR \fIrunfile\fR +.ad +.RS 6n +Specify the name of the \fIrunfile\fR to create. +.RE + +.ne 2 +.na +\fB-x\fR \fIusername\fR +.ad +.RS 6n +Execute the pre script as \fIusername\fR. +.RE + +.ne 2 +.na +\fB-X\fR \fIusername\fR +.ad +.RS 6n +Execute the post script as \fIusername\fR. +.RE + +.SH EXAMPLES +.LP +\fBExample 1\fR Running ad-hoc tests. +.sp +.LP +This example demonstrates the simplest invocation of \fBrun\fR. + +.sp +.in +2 +.nf +% \fBrun my-tests\fR +Test: /home/jkennedy/my-tests/test-01 [00:02] [PASS] +Test: /home/jkennedy/my-tests/test-02 [00:04] [PASS] +Test: /home/jkennedy/my-tests/test-03 [00:01] [PASS] + +Results Summary +PASS 3 + +Running Time: 00:00:07 +Percent passed: 100.0% +Log directory: /var/tmp/test_results/20120923T180654 +.fi +.in -2 + +.LP +\fBExample 2\fR Creating a \fIrunfile\fR for future use. +.sp +.LP +This example demonstrates creating a \fIrunfile\fR with non default options. + +.sp +.in +2 +.nf +% \fBrun -p setup -x root -g -w new-tests.run new-tests\fR +% \fBcat new-tests.run\fR +[DEFAULT] +pre = setup +post_user = +quiet = False +user = +timeout = 60 +post = +pre_user = root +outputdir = /var/tmp/test_results + +[/home/jkennedy/new-tests] +tests = ['test-01', 'test-02', 'test-03'] +.fi +.in -2 + +.SH EXIT STATUS +.sp +.LP +The following exit values are returned: +.sp +.ne 2 +.na +\fB\fB0\fR\fR +.ad +.sp .6 +.RS 4n +Successful completion. +.RE +.sp +.ne 2 +.na +\fB\fB1\fR\fR +.ad +.sp .6 +.RS 4n +An error occurred. 
+.RE + +.SH SEE ALSO +.sp +.LP +\fBsudo\fR(1m) diff --git a/tests/zfs-tests/Makefile.am b/tests/zfs-tests/Makefile.am new file mode 100644 index 000000000000..ef4e6be9e980 --- /dev/null +++ b/tests/zfs-tests/Makefile.am @@ -0,0 +1 @@ +SUBDIRS = cmd include tests callbacks diff --git a/tests/zfs-tests/callbacks/Makefile.am b/tests/zfs-tests/callbacks/Makefile.am new file mode 100644 index 000000000000..512a737bb5c9 --- /dev/null +++ b/tests/zfs-tests/callbacks/Makefile.am @@ -0,0 +1,6 @@ +pkgdatadir = $(datadir)/@PACKAGE@/zfs-tests/callbacks +dist_pkgdata_SCRIPTS = \ + zfs_failsafe.ksh \ + zfs_dbgmsg.ksh \ + zfs_dmesg.ksh \ + zfs_mmp.ksh diff --git a/tests/zfs-tests/callbacks/zfs_dbgmsg.ksh b/tests/zfs-tests/callbacks/zfs_dbgmsg.ksh new file mode 100755 index 000000000000..be001ad9daf7 --- /dev/null +++ b/tests/zfs-tests/callbacks/zfs_dbgmsg.ksh @@ -0,0 +1,29 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# + +# $1: number of lines to output (default: 200) +typeset lines=${1:-200} + +echo "=================================================================" +echo " Tailing last $lines lines of zfs_dbgmsg log" +echo "=================================================================" + +sudo tail -n $lines /proc/spl/kstat/zfs/dbgmsg + +echo "=================================================================" +echo " End of zfs_dbgmsg log" +echo "=================================================================" diff --git a/tests/zfs-tests/callbacks/zfs_dmesg.ksh b/tests/zfs-tests/callbacks/zfs_dmesg.ksh new file mode 100755 index 000000000000..3d8191de57a0 --- /dev/null +++ b/tests/zfs-tests/callbacks/zfs_dmesg.ksh @@ -0,0 +1,30 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2016 by Delphix. All rights reserved. +# Copyright (c) 2017 Lawrence Livermore National Security, LLC. +# + +# $1: number of lines to output (default: 200) +typeset lines=${1:-200} + +echo "=================================================================" +echo " Tailing last $lines lines of dmesg log" +echo "=================================================================" + +sudo dmesg | tail -n $lines + +echo "=================================================================" +echo " End of dmesg log" +echo "=================================================================" diff --git a/tests/zfs-tests/callbacks/zfs_failsafe.ksh b/tests/zfs-tests/callbacks/zfs_failsafe.ksh new file mode 100755 index 000000000000..0d14df7012df --- /dev/null +++ b/tests/zfs-tests/callbacks/zfs_failsafe.ksh @@ -0,0 +1,8 @@ +#!/bin/ksh + +# Commands to perform failsafe-critical cleanup after a test is killed. 
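+# For example, the zinject invocation below cancels any outstanding fault
+# injection records that would otherwise outlive the killed test.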
+# +# This should only be used to ensure the system is restored to a functional +# state in the event of tests being killed (preventing normal cleanup). + +zinject -c all diff --git a/tests/zfs-tests/callbacks/zfs_mmp.ksh b/tests/zfs-tests/callbacks/zfs_mmp.ksh new file mode 100755 index 000000000000..402e0b03f618 --- /dev/null +++ b/tests/zfs-tests/callbacks/zfs_mmp.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2017 by Lawrence Livermore National Security. +# All rights reserved. +# + +# $1: number of lines to output (default: 40) +typeset lines=${1:-40} +typeset history=$( +#include +#include +#include +#include +#include + +#define BUFSIZE 256 + +int seed = 0; +int stress_timeout = 180; +int contents_frequency = 100; +int tree_limit = 64 * 1024; +boolean_t stress_only = B_FALSE; + +static void +usage(int exit_value) +{ + (void) fprintf(stderr, "Usage:\tbtree_test -n \n"); + (void) fprintf(stderr, "\tbtree_test -s [-r ] [-l ] " + "[-t timeout>] [-c check_contents]\n"); + (void) fprintf(stderr, "\tbtree_test [-r ] [-l ] " + "[-t timeout>] [-c check_contents]\n"); + (void) fprintf(stderr, "\n With the -n option, run the named " + "negative test. With the -s option,\n"); + (void) fprintf(stderr, " run the stress test according to the " + "other options passed. With\n"); + (void) fprintf(stderr, " neither, run all the positive tests, " + "including the stress test with\n"); + (void) fprintf(stderr, " the default options.\n"); + (void) fprintf(stderr, "\n Options that control the stress test\n"); + (void) fprintf(stderr, "\t-c stress iterations after which to compare " + "tree contents [default: 100]\n"); + (void) fprintf(stderr, "\t-l the largest value to allow in the tree " + "[default: 1M]\n"); + (void) fprintf(stderr, "\t-r random seed [default: from " + "gettimeofday()]\n"); + (void) fprintf(stderr, "\t-t seconds to let the stress test run " + "[default: 180]\n"); + exit(exit_value); +} + +typedef struct int_node { + avl_node_t node; + uint64_t data; +} int_node_t; + +/* + * Utility functions + */ + +static int +avl_compare(const void *v1, const void *v2) +{ + const int_node_t *n1 = v1; + const int_node_t *n2 = v2; + uint64_t a = n1->data; + uint64_t b = n2->data; + + return (TREE_CMP(a, b)); +} + +static int +zfs_btree_compare(const void *v1, const void *v2) +{ + const uint64_t *a = v1; + const uint64_t *b = v2; + + return (TREE_CMP(*a, *b)); +} + +static void +verify_contents(avl_tree_t *avl, zfs_btree_t *bt) +{ + static int count = 0; + zfs_btree_index_t bt_idx = {0}; + int_node_t *node; + uint64_t *data; + + boolean_t forward = count % 2 == 0 ? 
B_TRUE : B_FALSE; + count++; + + ASSERT3U(avl_numnodes(avl), ==, zfs_btree_numnodes(bt)); + if (forward == B_TRUE) { + node = avl_first(avl); + data = zfs_btree_first(bt, &bt_idx); + } else { + node = avl_last(avl); + data = zfs_btree_last(bt, &bt_idx); + } + + while (node != NULL) { + ASSERT3U(*data, ==, node->data); + if (forward == B_TRUE) { + data = zfs_btree_next(bt, &bt_idx, &bt_idx); + node = AVL_NEXT(avl, node); + } else { + data = zfs_btree_prev(bt, &bt_idx, &bt_idx); + node = AVL_PREV(avl, node); + } + } +} + +static void +verify_node(avl_tree_t *avl, zfs_btree_t *bt, int_node_t *node) +{ + zfs_btree_index_t bt_idx = {0}; + zfs_btree_index_t bt_idx2 = {0}; + int_node_t *inp; + uint64_t data = node->data; + uint64_t *rv = NULL; + + ASSERT3U(avl_numnodes(avl), ==, zfs_btree_numnodes(bt)); + ASSERT3P((rv = (uint64_t *)zfs_btree_find(bt, &data, &bt_idx)), !=, + NULL); + ASSERT3S(*rv, ==, data); + ASSERT3P(zfs_btree_get(bt, &bt_idx), !=, NULL); + ASSERT3S(data, ==, *(uint64_t *)zfs_btree_get(bt, &bt_idx)); + + if ((inp = AVL_NEXT(avl, node)) != NULL) { + ASSERT3P((rv = zfs_btree_next(bt, &bt_idx, &bt_idx2)), !=, + NULL); + ASSERT3P(rv, ==, zfs_btree_get(bt, &bt_idx2)); + ASSERT3S(inp->data, ==, *rv); + } else { + ASSERT3U(data, ==, *(uint64_t *)zfs_btree_last(bt, &bt_idx)); + } + + if ((inp = AVL_PREV(avl, node)) != NULL) { + ASSERT3P((rv = zfs_btree_prev(bt, &bt_idx, &bt_idx2)), !=, + NULL); + ASSERT3P(rv, ==, zfs_btree_get(bt, &bt_idx2)); + ASSERT3S(inp->data, ==, *rv); + } else { + ASSERT3U(data, ==, *(uint64_t *)zfs_btree_first(bt, &bt_idx)); + } +} + +/* + * Tests + */ + +/* Verify that zfs_btree_find works correctly with a NULL index. */ +static int +find_without_index(zfs_btree_t *bt, char *why) +{ + u_longlong_t *p, i = 12345; + + zfs_btree_add(bt, &i); + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, NULL)) == NULL || + *p != i) { + snprintf(why, BUFSIZE, "Unexpectedly found %llu\n", + p == NULL ? 0 : *p); + return (1); + } + + i++; + + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, NULL)) != NULL) { + snprintf(why, BUFSIZE, "Found bad value: %llu\n", *p); + return (1); + } + + return (0); +} + +/* Verify simple insertion and removal from the tree. */ +static int +insert_find_remove(zfs_btree_t *bt, char *why) +{ + u_longlong_t *p, i = 12345; + zfs_btree_index_t bt_idx = {0}; + + /* Insert 'i' into the tree, and attempt to find it again. */ + zfs_btree_add(bt, &i); + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, &bt_idx)) == NULL) { + snprintf(why, BUFSIZE, "Didn't find value in tree\n"); + return (1); + } else if (*p != i) { + snprintf(why, BUFSIZE, "Found (%llu) in tree\n", *p); + return (1); + } + ASSERT3S(zfs_btree_numnodes(bt), ==, 1); + zfs_btree_verify(bt); + + /* Remove 'i' from the tree, and verify it is not found. */ + zfs_btree_remove(bt, &i); + if ((p = (u_longlong_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + snprintf(why, BUFSIZE, "Found removed value (%llu)\n", *p); + return (1); + } + ASSERT3S(zfs_btree_numnodes(bt), ==, 0); + zfs_btree_verify(bt); + + return (0); +} + +/* + * Add a number of random entries into a btree and avl tree. Then walk them + * backwards and forwards while emptying the tree, verifying the trees look + * the same. 
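+ *
+ * The two trees can be compared entry for entry only because avl_compare()
+ * and zfs_btree_compare() above impose the same ordering; each reduces to
+ * TREE_CMP(), which returns -1, 0, or 1.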
+ */ +static int +drain_tree(zfs_btree_t *bt, char *why) +{ + uint64_t *p; + avl_tree_t avl; + int i = 0; + int_node_t *node; + avl_index_t avl_idx = {0}; + zfs_btree_index_t bt_idx = {0}; + + avl_create(&avl, avl_compare, sizeof (int_node_t), + offsetof(int_node_t, node)); + + /* Fill both trees with the same data */ + for (i = 0; i < 64 * 1024; i++) { + void *ret; + + u_longlong_t randval = random(); + node = malloc(sizeof (int_node_t)); + if ((p = (uint64_t *)zfs_btree_find(bt, &randval, &bt_idx)) != + NULL) { + continue; + } + zfs_btree_add_idx(bt, &randval, &bt_idx); + + node->data = randval; + if ((ret = avl_find(&avl, node, &avl_idx)) != NULL) { + snprintf(why, BUFSIZE, "Found in avl: %llu\n", randval); + return (1); + } + avl_insert(&avl, node, avl_idx); + } + + /* Remove data from either side of the trees, comparing the data */ + while (avl_numnodes(&avl) != 0) { + uint64_t *data; + + ASSERT3U(avl_numnodes(&avl), ==, zfs_btree_numnodes(bt)); + if (avl_numnodes(&avl) % 2 == 0) { + node = avl_first(&avl); + data = zfs_btree_first(bt, &bt_idx); + } else { + node = avl_last(&avl); + data = zfs_btree_last(bt, &bt_idx); + } + ASSERT3U(node->data, ==, *data); + zfs_btree_remove_idx(bt, &bt_idx); + avl_remove(&avl, node); + + if (avl_numnodes(&avl) == 0) { + break; + } + + node = avl_first(&avl); + ASSERT3U(node->data, ==, + *(uint64_t *)zfs_btree_first(bt, NULL)); + node = avl_last(&avl); + ASSERT3U(node->data, ==, *(uint64_t *)zfs_btree_last(bt, NULL)); + } + ASSERT3S(zfs_btree_numnodes(bt), ==, 0); + + void *avl_cookie = NULL; + while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) + free(node); + avl_destroy(&avl); + + return (0); +} + +/* + * This test uses an avl and btree, and continually processes new random + * values. Each value is either removed or inserted, depending on whether + * or not it is found in the tree. The test periodically checks that both + * trees have the same data and does consistency checks. This stress + * option can also be run on its own from the command line. + */ +static int +stress_tree(zfs_btree_t *bt, char *why) +{ + avl_tree_t avl; + int_node_t *node; + struct timeval tp; + time_t t0; + int insertions = 0, removals = 0, iterations = 0; + u_longlong_t max = 0, min = UINT64_MAX; + + (void) gettimeofday(&tp, NULL); + t0 = tp.tv_sec; + + avl_create(&avl, avl_compare, sizeof (int_node_t), + offsetof(int_node_t, node)); + + while (1) { + zfs_btree_index_t bt_idx = {0}; + avl_index_t avl_idx = {0}; + + uint64_t randval = random() % tree_limit; + node = malloc(sizeof (*node)); + node->data = randval; + + max = randval > max ? randval : max; + min = randval < min ? 
randval : min; + + void *ret = avl_find(&avl, node, &avl_idx); + if (ret == NULL) { + insertions++; + avl_insert(&avl, node, avl_idx); + ASSERT3P(zfs_btree_find(bt, &randval, &bt_idx), ==, + NULL); + zfs_btree_add_idx(bt, &randval, &bt_idx); + verify_node(&avl, bt, node); + } else { + removals++; + verify_node(&avl, bt, ret); + zfs_btree_remove(bt, &randval); + avl_remove(&avl, ret); + free(ret); + free(node); + } + + zfs_btree_verify(bt); + + iterations++; + if (iterations % contents_frequency == 0) { + verify_contents(&avl, bt); + } + + zfs_btree_verify(bt); + + (void) gettimeofday(&tp, NULL); + if (tp.tv_sec > t0 + stress_timeout) { + fprintf(stderr, "insertions/removals: %u/%u\nmax/min: " + "%llu/%llu\n", insertions, removals, max, min); + break; + } + } + + void *avl_cookie = NULL; + while ((node = avl_destroy_nodes(&avl, &avl_cookie)) != NULL) + free(node); + avl_destroy(&avl); + + if (stress_only) { + zfs_btree_index_t *idx = NULL; + uint64_t *rv; + + while ((rv = zfs_btree_destroy_nodes(bt, &idx)) != NULL) + ; + zfs_btree_verify(bt); + } + + return (0); +} + +/* + * Verify inserting a duplicate value will cause a crash. + * Note: negative test; return of 0 is a failure. + */ +static int +insert_duplicate(zfs_btree_t *bt) +{ + uint64_t *p, i = 23456; + zfs_btree_index_t bt_idx = {0}; + + if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + fprintf(stderr, "Found value in empty tree.\n"); + return (0); + } + zfs_btree_add_idx(bt, &i, &bt_idx); + if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) == NULL) { + fprintf(stderr, "Did not find expected value.\n"); + return (0); + } + + /* Crash on inserting a duplicate */ + zfs_btree_add_idx(bt, &i, NULL); + + return (0); +} + +/* + * Verify removing a non-existent value will cause a crash. + * Note: negative test; return of 0 is a failure. + */ +static int +remove_missing(zfs_btree_t *bt) +{ + uint64_t *p, i = 23456; + zfs_btree_index_t bt_idx = {0}; + + if ((p = (uint64_t *)zfs_btree_find(bt, &i, &bt_idx)) != NULL) { + fprintf(stderr, "Found value in empty tree.\n"); + return (0); + } + + /* Crash removing a nonexistent entry */ + zfs_btree_remove(bt, &i); + + return (0); +} + +static int +do_negative_test(zfs_btree_t *bt, char *test_name) +{ + int rval = 0; + struct rlimit rlim = {0}; + setrlimit(RLIMIT_CORE, &rlim); + + if (strcmp(test_name, "insert_duplicate") == 0) { + rval = insert_duplicate(bt); + } else if (strcmp(test_name, "remove_missing") == 0) { + rval = remove_missing(bt); + } + + /* + * Return 0, since callers will expect non-zero return values for + * these tests, and we should have crashed before getting here anyway. 
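+ *
+ * (RLIMIT_CORE was zeroed above via setrlimit(), so the intentional crash
+ * should not leave a core dump behind.)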
+ */ + (void) fprintf(stderr, "Test: %s returned %d.\n", test_name, rval); + return (0); +} + +typedef struct btree_test { + const char *name; + int (*func)(zfs_btree_t *, char *); +} btree_test_t; + +static btree_test_t test_table[] = { + { "insert_find_remove", insert_find_remove }, + { "find_without_index", find_without_index }, + { "drain_tree", drain_tree }, + { "stress_tree", stress_tree }, + { NULL, NULL } +}; + +int +main(int argc, char *argv[]) +{ + char *negative_test = NULL; + int failed_tests = 0; + struct timeval tp; + zfs_btree_t bt; + char c; + + while ((c = getopt(argc, argv, "c:l:n:r:st:")) != -1) { + switch (c) { + case 'c': + contents_frequency = atoi(optarg); + break; + case 'l': + tree_limit = atoi(optarg); + break; + case 'n': + negative_test = optarg; + break; + case 'r': + seed = atoi(optarg); + break; + case 's': + stress_only = B_TRUE; + break; + case 't': + stress_timeout = atoi(optarg); + break; + case 'h': + default: + usage(1); + break; + } + } + argc -= optind; + argv += optind; + optind = 1; + + + if (seed == 0) { + (void) gettimeofday(&tp, NULL); + seed = tp.tv_sec; + } + srandom(seed); + + zfs_btree_init(); + zfs_btree_create(&bt, zfs_btree_compare, sizeof (uint64_t)); + + /* + * This runs the named negative test. None of them should + * return, as they both cause crashes. + */ + if (negative_test) { + return (do_negative_test(&bt, negative_test)); + } + + fprintf(stderr, "Seed: %u\n", seed); + + /* + * This is a stress test that does operations on a btree over the + * requested timeout period, verifying them against identical + * operations in an avl tree. + */ + if (stress_only != 0) { + return (stress_tree(&bt, NULL)); + } + + /* Do the positive tests */ + btree_test_t *test = &test_table[0]; + while (test->name) { + int retval; + uint64_t *rv; + char why[BUFSIZE] = {0}; + zfs_btree_index_t *idx = NULL; + + (void) fprintf(stdout, "%-20s", test->name); + retval = test->func(&bt, why); + + if (retval == 0) { + (void) fprintf(stdout, "ok\n"); + } else { + (void) fprintf(stdout, "failed with %d\n", retval); + if (strlen(why) != 0) + (void) fprintf(stdout, "\t%s\n", why); + why[0] = '\0'; + failed_tests++; + } + + /* Remove all the elements and re-verify the tree */ + while ((rv = zfs_btree_destroy_nodes(&bt, &idx)) != NULL) + ; + zfs_btree_verify(&bt); + + test++; + } + + zfs_btree_verify(&bt); + zfs_btree_fini(); + + return (failed_tests); +} diff --git a/tests/zfs-tests/cmd/chg_usr_exec/.gitignore b/tests/zfs-tests/cmd/chg_usr_exec/.gitignore new file mode 100644 index 000000000000..a8b44df7c5bf --- /dev/null +++ b/tests/zfs-tests/cmd/chg_usr_exec/.gitignore @@ -0,0 +1 @@ +/chg_usr_exec diff --git a/tests/zfs-tests/cmd/chg_usr_exec/Makefile.am b/tests/zfs-tests/cmd/chg_usr_exec/Makefile.am new file mode 100644 index 000000000000..6f2968f1fac4 --- /dev/null +++ b/tests/zfs-tests/cmd/chg_usr_exec/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = chg_usr_exec +chg_usr_exec_SOURCES = chg_usr_exec.c diff --git a/tests/zfs-tests/cmd/chg_usr_exec/chg_usr_exec.c b/tests/zfs-tests/cmd/chg_usr_exec/chg_usr_exec.c new file mode 100644 index 000000000000..1fa9e88a6fde --- /dev/null +++ b/tests/zfs-tests/cmd/chg_usr_exec/chg_usr_exec.c @@ -0,0 +1,77 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <unistd.h> +#include <pwd.h> +#include <errno.h> + +#define EXECSHELL "/bin/sh" + +int +main(int argc, char *argv[]) +{ + char *plogin = NULL; + char cmds[BUFSIZ] = { 0 }; + char sep[] = " "; + struct passwd *ppw = NULL; + int i, len; + + if (argc < 3 || strlen(argv[1]) == 0) { + (void) printf("\tUsage: %s <login> <commands> ...\n", argv[0]); + return (1); + } + + plogin = argv[1]; + len = 0; + for (i = 2; i < argc; i++) { + (void) snprintf(cmds+len, sizeof (cmds)-len, + "%s%s", argv[i], sep); + len += strlen(argv[i]) + strlen(sep); + } + + if ((ppw = getpwnam(plogin)) == NULL) { + perror("getpwnam"); + return (errno); + } + if (setgid(ppw->pw_gid) != 0) { + perror("setgid"); + return (errno); + } + if (setuid(ppw->pw_uid) != 0) { + perror("setuid"); + return (errno); + } + + if (execl(EXECSHELL, "sh", "-c", cmds, (char *)NULL) != 0) { + perror("execl: " EXECSHELL); + return (errno); + } + + return (0); +} diff --git a/tests/zfs-tests/cmd/devname2devid/.gitignore b/tests/zfs-tests/cmd/devname2devid/.gitignore new file mode 100644 index 000000000000..fa9fb6c509a7 --- /dev/null +++ b/tests/zfs-tests/cmd/devname2devid/.gitignore @@ -0,0 +1 @@ +/devname2devid diff --git a/tests/zfs-tests/cmd/devname2devid/Makefile.am b/tests/zfs-tests/cmd/devname2devid/Makefile.am new file mode 100644 index 000000000000..b8b630dc2dd4 --- /dev/null +++ b/tests/zfs-tests/cmd/devname2devid/Makefile.am @@ -0,0 +1,10 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +if WANT_DEVNAME2DEVID +pkgexec_PROGRAMS = devname2devid +devname2devid_SOURCES = devname2devid.c +devname2devid_CFLAGS = $(AM_CFLAGS) $(LIBUDEV_CFLAGS) +devname2devid_LDADD = $(LIBUDEV_LIBS) +endif diff --git a/tests/zfs-tests/cmd/devname2devid/devname2devid.c b/tests/zfs-tests/cmd/devname2devid/devname2devid.c new file mode 100644 index 000000000000..91e59c589fd5 --- /dev/null +++ b/tests/zfs-tests/cmd/devname2devid/devname2devid.c @@ -0,0 +1,160 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + * + * Copyright (c) 2016, Intel Corporation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Linux persistent device strings for vdev labels + * + * based on udev_device_get_devid() at zfs/lib/libzfs/libzfs_import.c + */ + +#define DEV_BYID_PATH "/dev/disk/by-id/" + +static int +udev_device_get_devid(struct udev_device *dev, char *bufptr, size_t buflen) +{ + struct udev_list_entry *entry; + const char *bus; + char devbyid[MAXPATHLEN]; + + /* The bus based by-id path is preferred */ + bus = udev_device_get_property_value(dev, "ID_BUS"); + + if (bus == NULL) { + const char *dm_uuid; + + /* + * For multipath nodes use the persistent uuid based identifier + * + * Example: 'dm-uuid-mpath-35000c5006304de3f' + */ + dm_uuid = udev_device_get_property_value(dev, "DM_UUID"); + if (dm_uuid != NULL) { + (void) snprintf(bufptr, buflen, "dm-uuid-%s", dm_uuid); + return (0); + } + return (ENODATA); + } + + /* + * locate the bus specific by-id link + * + * Example: 'scsi-MG03SCA300_350000494a8cb3d67-part1' + */ + (void) snprintf(devbyid, sizeof (devbyid), "%s%s-", DEV_BYID_PATH, bus); + entry = udev_device_get_devlinks_list_entry(dev); + while (entry != NULL) { + const char *name; + + name = udev_list_entry_get_name(entry); + if (strncmp(name, devbyid, strlen(devbyid)) == 0) { + name += strlen(DEV_BYID_PATH); + (void) stpncpy(bufptr, name, buflen - 1); + bufptr[buflen - 1] = '\0'; + return (0); + } + entry = udev_list_entry_get_next(entry); + } + + return (ENODATA); +} + +/* + * Usage: devname2devid + * + * Examples: + * # ./devname2devid /dev/sda1 + * devid scsi-350000394a8caede4-part1 + * + * # ./devname2devid /dev/dm-1 + * devid: 'dm-uuid-mpath-35000c5006304de3f' + * + * This program accepts a disk or disk slice path and prints a + * device id. 
+ * + * Exit values: + * 0 - means success + * 1 - means failure + * + */ +int +main(int argc, char *argv[]) +{ + struct udev *udev; + struct udev_device *dev = NULL; + char devid[128], nodepath[MAXPATHLEN]; + char *device, *sysname; + int ret; + + if (argc == 1) { + (void) printf("%s [search path]\n", argv[0]); + exit(1); + } + device = argv[1]; + + if ((udev = udev_new()) == NULL) { + perror("udev_new"); + exit(1); + } + + /* resolve path to a runtime device node instance */ + if (realpath(device, nodepath) == NULL) { + perror("realpath"); + exit(1); + } + sysname = strrchr(nodepath, '/') + 1; + + if ((dev = udev_device_new_from_subsystem_sysname(udev, "block", + sysname)) == NULL) { + perror(sysname); + exit(1); + } + + if ((ret = udev_device_get_devid(dev, devid, sizeof (devid))) != 0) { + udev_device_unref(dev); + errno = ret; + perror(sysname); + exit(1); + } + + (void) printf("devid %s\n", devid); + + udev_device_unref(dev); + udev_unref(udev); + + return (0); +} diff --git a/tests/zfs-tests/cmd/dir_rd_update/.gitignore b/tests/zfs-tests/cmd/dir_rd_update/.gitignore new file mode 100644 index 000000000000..ec9a15f17a1d --- /dev/null +++ b/tests/zfs-tests/cmd/dir_rd_update/.gitignore @@ -0,0 +1 @@ +/dir_rd_update diff --git a/tests/zfs-tests/cmd/dir_rd_update/Makefile.am b/tests/zfs-tests/cmd/dir_rd_update/Makefile.am new file mode 100644 index 000000000000..27cc9e97e0e9 --- /dev/null +++ b/tests/zfs-tests/cmd/dir_rd_update/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = dir_rd_update +dir_rd_update_SOURCES = dir_rd_update.c diff --git a/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c b/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c new file mode 100644 index 000000000000..ea46f047a35e --- /dev/null +++ b/tests/zfs-tests/cmd/dir_rd_update/dir_rd_update.c @@ -0,0 +1,136 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Assertion: + * + * A read operation and directory update operation performed + * concurrently on the same directory can lead to deadlock + * on a UFS logging file system, but not on a ZFS file system. 
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <errno.h>
+#define	TMP_DIR /tmp
+
+static char dirpath[256];
+
+int
+main(int argc, char **argv)
+{
+	char *cp1 = "";
+	int i = 0;
+	int ret = 0;
+	int testdd = 0;
+	pid_t pid;
+	static const int op_num = 5;
+
+	if (argc == 1) {
+		(void) printf("Usage: %s <mount point>\n", argv[0]);
+		exit(-1);
+	}
+	for (i = 0; i < 256; i++) {
+		dirpath[i] = 0;
+	}
+
+	cp1 = argv[1];
+	if (strlen(cp1) >= (sizeof (dirpath) - strlen("TMP_DIR"))) {
+		(void) printf("The string length of mount point is "
+		    "too large\n");
+		exit(-1);
+	}
+	(void) strcpy(&dirpath[0], (const char *)cp1);
+	(void) strcat(&dirpath[strlen(dirpath)], "TMP_DIR");
+
+	ret = mkdir(dirpath, 0777);
+	if (ret != 0) {
+		if (errno != EEXIST) {
+			(void) printf("%s: mkdir(<%s>, 0777) failed: errno "
+			    "(decimal)=%d\n", argv[0], dirpath, errno);
+			exit(-1);
+		}
+	}
+	testdd = open(dirpath, O_RDONLY|O_RSYNC|O_SYNC|O_DSYNC);
+	if (testdd < 0) {
+		(void) printf("%s: open(<%s>, O_RDONLY|O_RSYNC|O_SYNC|O_DSYNC)"
+		    " failed: errno (decimal)=%d\n", argv[0], dirpath, errno);
+		exit(-1);
+	} else {
+		(void) close(testdd);
+	}
+	pid = fork();
+	if (pid > 0) {
+		int fd = open(dirpath, O_RDONLY|O_RSYNC|O_SYNC|O_DSYNC);
+		char buf[16];
+		int rdret;
+		int j = 0;
+
+		if (fd < 0) {
+			(void) printf("%s: open <%s> again failed:"
+			    " errno = %d\n", argv[0], dirpath, errno);
+			exit(-1);
+		}
+
+		while (j < op_num) {
+			(void) sleep(1);
+			rdret = read(fd, buf, 16);
+			if (rdret == -1) {
+				(void) printf("read failed");
+			}
+			j++;
+		}
+		(void) close(fd);
+	} else if (pid == 0) {
+		int fd = open(dirpath, O_RDONLY);
+		int chownret;
+		int k = 0;
+
+		if (fd < 0) {
+			(void) printf("%s: open(<%s>, O_RDONLY) again failed:"
+			    " errno (decimal)=%d\n", argv[0], dirpath, errno);
+			exit(-1);
+		}
+
+		while (k < op_num) {
+			(void) sleep(1);
+			chownret = fchown(fd, 0, 0);
+			if (chownret == -1) {
+				(void) printf("chown failed");
+			}
+
+			k++;
+		}
+		(void) close(fd);
+	}
+
+	return (0);
+}
diff --git a/tests/zfs-tests/cmd/file_check/.gitignore b/tests/zfs-tests/cmd/file_check/.gitignore
new file mode 100644
index 000000000000..24fe113221d2
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_check/.gitignore
@@ -0,0 +1 @@
+/file_check
diff --git a/tests/zfs-tests/cmd/file_check/Makefile.am b/tests/zfs-tests/cmd/file_check/Makefile.am
new file mode 100644
index 000000000000..13027ef5bd02
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_check/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = file_check
+file_check_SOURCES = file_check.c
diff --git a/tests/zfs-tests/cmd/file_check/file_check.c b/tests/zfs-tests/cmd/file_check/file_check.c
new file mode 100644
index 000000000000..5df0ea735bfd
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_check/file_check.c
@@ -0,0 +1,86 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include "../file_common.h" + +static unsigned char bigbuffer[BIGBUFFERSIZE]; + +/* + * Given a filename, check that the file consists entirely + * of a particular pattern. If the pattern is not specified a + * default will be used. For default values see file_common.h + */ +int +main(int argc, char **argv) +{ + int bigfd; + long i, n; + unsigned char fillchar = DATA; + int bigbuffersize = BIGBUFFERSIZE; + int64_t read_count = 0; + + /* + * Validate arguments + */ + if (argc < 2) { + (void) printf("Usage: %s filename [pattern]\n", + argv[0]); + exit(1); + } + + if (argv[2]) { + fillchar = atoi(argv[2]); + } + + /* + * Read the file contents and check every character + * against the supplied pattern. Abort if the + * pattern check fails. + */ + if ((bigfd = open(argv[1], O_RDONLY)) == -1) { + (void) printf("open %s failed %d\n", argv[1], errno); + exit(1); + } + + do { + if ((n = read(bigfd, &bigbuffer, bigbuffersize)) == -1) { + (void) printf("read failed (%ld), %d\n", n, errno); + exit(errno); + } + + for (i = 0; i < n; i++) { + if (bigbuffer[i] != fillchar) { + (void) printf("error %s: 0x%x != 0x%x)\n", + argv[1], bigbuffer[i], fillchar); + exit(1); + } + } + + read_count += n; + } while (n == bigbuffersize); + + return (0); +} diff --git a/tests/zfs-tests/cmd/file_common.h b/tests/zfs-tests/cmd/file_common.h new file mode 100644 index 000000000000..64b1777a9ae8 --- /dev/null +++ b/tests/zfs-tests/cmd/file_common.h @@ -0,0 +1,70 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef FILE_COMMON_H +#define FILE_COMMON_H + +/* + * header file for file_* utilities. These utilities + * are used by the test cases to perform various file + * operations (append writes, for example). 
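+ *
+ * The constants below supply those defaults: BLOCKSZ is the default
+ * write size, DATA the default fill byte, DATA_RANGE the modulus used
+ * when cycling fill patterns, BIGBUFFERSIZE the upper bound on a
+ * single I/O buffer, and BIGFILESIZE the default write count.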
+ */
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef _FILE_OFFSET_BITS
+#define	_FILE_OFFSET_BITS 64
+#endif
+
+#ifndef _LARGEFILE64_SOURCE
+#define	_LARGEFILE64_SOURCE
+#endif
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#define	BLOCKSZ		8192
+#define	DATA		0xa5
+#define	DATA_RANGE	120
+#define	BIGBUFFERSIZE	0x800000
+#define	BIGFILESIZE	20
+
+extern char *optarg;
+extern int optind, opterr, optopt;
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* FILE_COMMON_H */
diff --git a/tests/zfs-tests/cmd/file_trunc/.gitignore b/tests/zfs-tests/cmd/file_trunc/.gitignore
new file mode 100644
index 000000000000..90b149ff51c3
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_trunc/.gitignore
@@ -0,0 +1 @@
+/file_trunc
diff --git a/tests/zfs-tests/cmd/file_trunc/Makefile.am b/tests/zfs-tests/cmd/file_trunc/Makefile.am
new file mode 100644
index 000000000000..0455eb4a4633
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_trunc/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = file_trunc
+file_trunc_SOURCES = file_trunc.c
diff --git a/tests/zfs-tests/cmd/file_trunc/file_trunc.c b/tests/zfs-tests/cmd/file_trunc/file_trunc.c
new file mode 100644
index 000000000000..69096752efa2
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_trunc/file_trunc.c
@@ -0,0 +1,240 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012, 2014 by Delphix. All rights reserved.
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define FSIZE 256*1024*1024 +#define BSIZE 512 + +/* Initialize Globals */ +static long fsize = FSIZE; +static size_t bsize = BSIZE; +static int count = 0; +static int rflag = 0; +static int seed = 0; +static int vflag = 0; +static int errflag = 0; +static off_t offset = 0; +static char *filename = NULL; + +static void usage(char *execname); +static void parse_options(int argc, char *argv[]); +static void do_write(int fd); +static void do_trunc(int fd); + +static void +usage(char *execname) +{ + (void) fprintf(stderr, + "usage: %s [-b blocksize] [-c count] [-f filesize]" + " [-o offset] [-s seed] [-r] [-v] filename\n", execname); + (void) exit(1); +} + +int +main(int argc, char *argv[]) +{ + int i = 0; + int fd = -1; + + parse_options(argc, argv); + + fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, 0666); + if (fd < 0) { + perror("open"); + exit(3); + } + + for (i = 0; count == 0 || i < count; i++) { + (void) do_write(fd); + (void) do_trunc(fd); + } + + (void) close(fd); + return (0); +} + +static void +parse_options(int argc, char *argv[]) +{ + int c; + + extern char *optarg; + extern int optind, optopt; + + count = fsize / bsize; + seed = time(NULL); + while ((c = getopt(argc, argv, "b:c:f:o:rs:v")) != -1) { + switch (c) { + case 'b': + bsize = atoi(optarg); + break; + + case 'c': + count = atoi(optarg); + break; + + case 'f': + fsize = atoi(optarg); + break; + + case 'o': + offset = atoi(optarg); + break; + + case 'r': + rflag++; + break; + + case 's': + seed = atoi(optarg); + break; + + case 'v': + vflag++; + break; + + case ':': + (void) fprintf(stderr, + "Option -%c requires an operand\n", optopt); + errflag++; + break; + + case '?': + (void) fprintf(stderr, + "Unrecognized option: -%c\n", optopt); + errflag++; + break; + } + + if (errflag) { + (void) usage(argv[0]); + } + } + if (argc <= optind) { + (void) fprintf(stderr, + "No filename specified\n"); + usage(argv[0]); + } + filename = argv[optind]; + + if (vflag) { + (void) fprintf(stderr, "Seed = %d\n", seed); + } + srandom(seed); +} + +static void +do_write(int fd) +{ + off_t roffset = 0; + char *buf = NULL; + char *rbuf = NULL; + + buf = (char *)calloc(1, bsize); + rbuf = (char *)calloc(1, bsize); + if (buf == NULL || rbuf == NULL) { + perror("malloc"); + exit(4); + } + + roffset = random() % fsize; + if (lseek64(fd, (offset + roffset), SEEK_SET) < 0) { + perror("lseek"); + exit(5); + } + + (void) strcpy(buf, "ZFS Test Suite Truncation Test"); + if (write(fd, buf, bsize) < bsize) { + perror("write"); + exit(6); + } + + if (rflag) { + if (lseek64(fd, (offset + roffset), SEEK_SET) < 0) { + perror("lseek"); + exit(7); + } + + if (read(fd, rbuf, bsize) < bsize) { + perror("read"); + exit(8); + } + + if (memcmp(buf, rbuf, bsize) != 0) { + perror("memcmp"); + exit(9); + } + } + if (vflag) { + (void) fprintf(stderr, + "Wrote to offset %" PRId64 "\n", (offset + roffset)); + if (rflag) { + (void) fprintf(stderr, + "Read back from offset %" PRId64 "\n", + (offset + roffset)); + } + } + + (void) free(buf); + (void) free(rbuf); +} + +static void +do_trunc(int fd) +{ + off_t roffset = 0; + + roffset = random() % fsize; + if (ftruncate64(fd, (offset + roffset)) < 0) { + perror("truncate"); + exit(7); + } + + if (vflag) { + (void) fprintf(stderr, "Truncated at offset %" PRId64 "\n", + (offset + roffset)); + } +} diff --git a/tests/zfs-tests/cmd/file_write/.gitignore 
b/tests/zfs-tests/cmd/file_write/.gitignore
new file mode 100644
index 000000000000..9f691d580a57
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_write/.gitignore
@@ -0,0 +1 @@
+/file_write
diff --git a/tests/zfs-tests/cmd/file_write/Makefile.am b/tests/zfs-tests/cmd/file_write/Makefile.am
new file mode 100644
index 000000000000..60895711e7dc
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_write/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = file_write
+file_write_SOURCES = file_write.c
diff --git a/tests/zfs-tests/cmd/file_write/file_write.c b/tests/zfs-tests/cmd/file_write/file_write.c
new file mode 100644
index 000000000000..45d296db43c6
--- /dev/null
+++ b/tests/zfs-tests/cmd/file_write/file_write.c
@@ -0,0 +1,258 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+#include "../file_common.h"
+#include <fcntl.h>
+#include <errno.h>
+#include <inttypes.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+static unsigned char bigbuffer[BIGBUFFERSIZE];
+
+/*
+ * Writes (or appends) a given value to a file repeatedly.
+ * See header file for defaults.
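+ *
+ * For instance, the following invocations (illustrative only) write
+ * twenty 8192-byte blocks of 0xa5 (decimal 165) to a new file, then
+ * append one pseudo-random block with O_SYNC semantics:
+ *
+ *	file_write -o create -f /tmp/bigfile -b 8192 -c 20 -d 165
+ *	file_write -o append -f /tmp/bigfile -b 8192 -c 1 -d R -w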
+ */ + +static void usage(char *); + +/* + * psudo-randomize the buffer + */ +static void randomize_buffer(int block_size) { + int i; + char rnd = rand() & 0xff; + for (i = 0; i < block_size; i++) + bigbuffer[i] ^= rnd; +} + +int +main(int argc, char **argv) +{ + int bigfd; + int c; + int oflag = 0; + int err = 0; + int k; + long i; + int64_t good_writes = 0; + uchar_t nxtfillchar; + char *prog = argv[0]; + /* + * Default Parameters + */ + int write_count = BIGFILESIZE; + uchar_t fillchar = DATA; + int block_size = BLOCKSZ; + char *filename = NULL; + char *operation = NULL; + offset_t noffset, offset = 0; + int verbose = 0; + int rsync = 0; + int wsync = 0; + + /* + * Process Arguments + */ + while ((c = getopt(argc, argv, "b:c:d:s:f:o:vwr")) != -1) { + switch (c) { + case 'b': + block_size = atoi(optarg); + break; + case 'c': + write_count = atoi(optarg); + break; + case 'd': + if (optarg[0] == 'R') + fillchar = 'R'; /* R = random data */ + else + fillchar = atoi(optarg); + break; + case 's': + offset = atoll(optarg); + break; + case 'f': + filename = optarg; + break; + case 'o': + operation = optarg; + break; + case 'v': + verbose = 1; + break; + case 'w': + wsync = 1; + break; + case 'r': + rsync = 1; + break; + case '?': + (void) printf("unknown arg %c\n", optopt); + usage(prog); + break; + } + } + + /* + * Validate Parameters + */ + if (!filename) { + (void) printf("Filename not specified (-f )\n"); + err++; + } + + if (!operation) { + (void) printf("Operation not specified (-o ).\n"); + err++; + } + + if (block_size > BIGBUFFERSIZE) { + (void) printf("block_size is too large max==%d.\n", + BIGBUFFERSIZE); + err++; + } + + if (err) { + usage(prog); /* no return */ + return (1); + } + + /* + * Prepare the buffer and determine the requested operation + */ + nxtfillchar = fillchar; + k = 0; + + if (fillchar == 'R') + srand(time(NULL)); + + for (i = 0; i < block_size; i++) { + bigbuffer[i] = nxtfillchar; + + if (fillchar == 0) { + if ((k % DATA_RANGE) == 0) { + k = 0; + } + nxtfillchar = k++; + } else if (fillchar == 'R') { + nxtfillchar = rand() & 0xff; + } + } + + /* + * using the strncmp of operation will make the operation match the + * first shortest match - as the operations are unique from the first + * character this means that we match single character operations + */ + if ((strncmp(operation, "create", strlen(operation) + 1)) == 0 || + (strncmp(operation, "overwrite", strlen(operation) + 1)) == 0) { + oflag = (O_RDWR|O_CREAT); + } else if ((strncmp(operation, "append", strlen(operation) + 1)) == 0) { + oflag = (O_RDWR|O_APPEND); + } else { + (void) printf("valid operations are not '%s'\n", + operation); + usage(prog); + } + + if (rsync) { + oflag = oflag | O_RSYNC; + } + + if (wsync) { + oflag = oflag | O_SYNC; + } + + /* + * Given an operation (create/overwrite/append), open the file + * accordingly and perform a write of the appropriate type. + */ + if ((bigfd = open(filename, oflag, 0666)) == -1) { + (void) printf("open %s: failed [%s]%d. 
Aborting!\n", filename,
+		    strerror(errno), errno);
+		exit(errno);
+	}
+	noffset = lseek64(bigfd, offset, SEEK_SET);
+	if (noffset != offset) {
+		(void) printf("llseek %s (%lld/%lld) failed [%s]%d. Aborting!\n",
+		    filename, offset, noffset, strerror(errno), errno);
+		exit(errno);
+	}
+
+	if (verbose) {
+		(void) printf("%s: block_size = %d, write_count = %d, "
+		    "offset = %lld, ", filename, block_size,
+		    write_count, offset);
+		if (fillchar == 'R') {
+			(void) printf("data = [random]\n");
+		} else {
+			(void) printf("data = %s%d\n",
+			    (fillchar == 0) ? "0->" : "",
+			    (fillchar == 0) ? DATA_RANGE : fillchar);
+		}
+	}
+
+	for (i = 0; i < write_count; i++) {
+		ssize_t n;
+		if (fillchar == 'R')
+			randomize_buffer(block_size);
+
+		if ((n = write(bigfd, &bigbuffer, block_size)) == -1) {
+			(void) printf("write failed (%ld), good_writes = %"
+			    PRId64 ", " "error: %s[%d]\n",
+			    (long)n, good_writes,
+			    strerror(errno),
+			    errno);
+			exit(errno);
+		}
+		good_writes++;
+	}
+
+	if (verbose) {
+		(void) printf("Success: good_writes = %" PRId64 "(%"
+		    PRId64 ")\n", good_writes, (good_writes * block_size));
+	}
+
+	return (0);
+}
+
+static void
+usage(char *prog)
+{
+	(void) printf("Usage: %s [-v] -o {create,overwrite,append} -f file_name"
+	    " [-b block_size]\n"
+	    "\t[-s offset] [-c write_count] [-d data]\n\n"
+	    "Where [data] equal to zero causes chars "
+	    "0->%d to be repeated throughout, or [data]\n"
+	    "equal to 'R' for pseudorandom data.\n",
+	    prog, DATA_RANGE);
+
+	exit(1);
+}
diff --git a/tests/zfs-tests/cmd/get_diff/.gitignore b/tests/zfs-tests/cmd/get_diff/.gitignore
new file mode 100644
index 000000000000..f5fc360a6839
--- /dev/null
+++ b/tests/zfs-tests/cmd/get_diff/.gitignore
@@ -0,0 +1 @@
+/get_diff
diff --git a/tests/zfs-tests/cmd/get_diff/Makefile.am b/tests/zfs-tests/cmd/get_diff/Makefile.am
new file mode 100644
index 000000000000..06c39ddd81ce
--- /dev/null
+++ b/tests/zfs-tests/cmd/get_diff/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = get_diff
+get_diff_SOURCES = get_diff.c
diff --git a/tests/zfs-tests/cmd/get_diff/get_diff.c b/tests/zfs-tests/cmd/get_diff/get_diff.c
new file mode 100644
index 000000000000..2799f46b0747
--- /dev/null
+++ b/tests/zfs-tests/cmd/get_diff/get_diff.c
@@ -0,0 +1,109 @@
+/*
+ * This file and its contents are supplied under the terms of the
+ * Common Development and Distribution License ("CDDL"), version 1.0.
+ * You may only use this file in accordance with the terms of version
+ * 1.0 of the CDDL.
+ *
+ * A full copy of the text of the CDDL should have accompanied this
+ * source. A copy of the CDDL is also available via the Internet at
+ * http://www.illumos.org/license/CDDL.
+ */
+
+/*
+ * Copyright (c) 2018 by Delphix. All rights reserved.
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/stat.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+static void
+usage(char *msg, int exit_value)
+{
+	(void) fprintf(stderr, "get_diff file redacted_file\n");
+	(void) fprintf(stderr, "%s\n", msg);
+	exit(exit_value);
+}
+
+/*
+ * This utility compares two files, an original and its redacted counterpart
+ * (in that order). It compares the files 512 bytes at a time, printing out
+ * any ranges (as offset and length) where the redacted file does not match
+ * the original. This output is used to verify that the expected ranges of
+ * a redacted file do not contain the original data.
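+ *
+ * For example (illustrative), if the redacted file differs from the
+ * original over bytes 512 through 2047, the program prints the single
+ * line "512,1536": the offset of the first mismatched 512-byte block
+ * followed by the length of the mismatched range.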
+ */
+int
+main(int argc, char *argv[])
+{
+	off_t diff_off = 0, diff_len = 0, off = 0;
+	int fd1, fd2;
+	char *fname1, *fname2;
+	char buf1[DEV_BSIZE], buf2[DEV_BSIZE];
+	ssize_t bytes;
+
+	if (argc != 3)
+		usage("Incorrect number of arguments.", 1);
+
+	if ((fname1 = argv[1]) == NULL)
+		usage("Filename missing.", 1);
+	if ((fd1 = open(fname1, O_LARGEFILE | O_RDONLY)) < 0) {
+		perror("open1 failed");
+		exit(1);
+	}
+
+	if ((fname2 = argv[2]) == NULL)
+		usage("Redacted filename missing.", 1);
+	if ((fd2 = open(fname2, O_LARGEFILE | O_RDONLY)) < 0) {
+		perror("open2 failed");
+		exit(1);
+	}
+
+	while ((bytes = pread(fd1, buf1, DEV_BSIZE, off)) > 0) {
+		if (pread(fd2, buf2, DEV_BSIZE, off) < 0) {
+			if (errno == EIO) {
+				/*
+				 * A read in a redacted section of a file will
+				 * fail with EIO. If we get EIO, continue on
+				 * but ensure that a comparison of buf1 and
+				 * buf2 will fail, indicating a redacted block.
+				 */
+				buf2[0] = ~buf1[0];
+			} else {
+				perror("pread failed");
+				exit(1);
+			}
+		}
+		if (memcmp(buf1, buf2, bytes) == 0) {
+			if (diff_len != 0) {
+				(void) fprintf(stdout, "%lld,%lld\n",
+				    (long long)diff_off, (long long)diff_len);
+				assert(off == diff_off + diff_len);
+				diff_len = 0;
+			}
+			diff_off = 0;
+		} else {
+			if (diff_len == 0)
+				diff_off = off;
+			assert(off == diff_off + diff_len);
+			diff_len += bytes;
+		}
+		off += bytes;
+	}
+
+	if (diff_len != 0) {
+		(void) fprintf(stdout, "%lld,%lld\n", (long long)diff_off,
+		    (long long)diff_len);
+	}
+
+	(void) close(fd1);
+	(void) close(fd2);
+
+	return (0);
+}
diff --git a/tests/zfs-tests/cmd/largest_file/.gitignore b/tests/zfs-tests/cmd/largest_file/.gitignore
new file mode 100644
index 000000000000..f8f480d06542
--- /dev/null
+++ b/tests/zfs-tests/cmd/largest_file/.gitignore
@@ -0,0 +1 @@
+/largest_file
diff --git a/tests/zfs-tests/cmd/largest_file/Makefile.am b/tests/zfs-tests/cmd/largest_file/Makefile.am
new file mode 100644
index 000000000000..a3e4e9337cf0
--- /dev/null
+++ b/tests/zfs-tests/cmd/largest_file/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = largest_file
+largest_file_SOURCES = largest_file.c
diff --git a/tests/zfs-tests/cmd/largest_file/largest_file.c b/tests/zfs-tests/cmd/largest_file/largest_file.c
new file mode 100644
index 000000000000..00e1019cc8e4
--- /dev/null
+++ b/tests/zfs-tests/cmd/largest_file/largest_file.c
@@ -0,0 +1,145 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2012 by Delphix. All rights reserved.
+ */ + +#include "../file_common.h" +#include +#include +#include +#include +#include +#include + +/* + * -------------------------------------------------------------- + * + * Assertion: + * The last byte of the largest file size can be + * accessed without any errors. Also, the writing + * beyond the last byte of the largest file size + * will produce an errno of EFBIG. + * + * -------------------------------------------------------------- + * If the write() system call below returns a "1", + * then the last byte can be accessed. + * -------------------------------------------------------------- + */ +static void sigxfsz(int); +static void usage(char *); + +int +main(int argc, char **argv) +{ + int fd = 0; + offset_t offset = (MAXOFFSET_T - 1); + offset_t llseek_ret = 0; + int write_ret = 0; + int err = 0; + char mybuf[5] = "aaaa\0"; + char *testfile; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + struct sigaction sa; + + if (argc != 2) { + usage(argv[0]); + } + + if (sigemptyset(&sa.sa_mask) == -1) + return (errno); + sa.sa_flags = 0; + sa.sa_handler = sigxfsz; + if (sigaction(SIGXFSZ, &sa, NULL) == -1) + return (errno); + + testfile = strdup(argv[1]); + + fd = open(testfile, O_CREAT | O_RDWR, mode); + if (fd < 0) { + err = errno; + perror("Failed to create testfile"); + free(testfile); + return (err); + } + + llseek_ret = lseek64(fd, offset, SEEK_SET); + if (llseek_ret < 0) { + err = errno; + perror("Failed to seek to end of testfile"); + goto out; + } + + write_ret = write(fd, mybuf, 1); + if (write_ret < 0) { + err = errno; + perror("Failed to write to end of file"); + goto out; + } + + offset = 0; + llseek_ret = lseek64(fd, offset, SEEK_CUR); + if (llseek_ret < 0) { + err = errno; + perror("Failed to seek to end of file"); + goto out; + } + + write_ret = write(fd, mybuf, 1); + if (write_ret < 0) { + if (errno == EFBIG || errno == EINVAL) { + (void) printf("write errno=EFBIG|EINVAL: success\n"); + err = 0; + } else { + err = errno; + perror("Did not receive EFBIG"); + } + } else { + (void) printf("write completed successfully, test failed\n"); + err = 1; + } + +out: + (void) unlink(testfile); + free(testfile); + close(fd); + return (err); +} + +static void +usage(char *name) +{ + (void) printf("%s \n", name); + exit(1); +} + +/* ARGSUSED */ +static void +sigxfsz(int signo) +{ + (void) printf("\nlargest_file: sigxfsz() caught SIGXFSZ\n"); +} diff --git a/tests/zfs-tests/cmd/libzfs_input_check/.gitignore b/tests/zfs-tests/cmd/libzfs_input_check/.gitignore new file mode 100644 index 000000000000..c8796008483f --- /dev/null +++ b/tests/zfs-tests/cmd/libzfs_input_check/.gitignore @@ -0,0 +1 @@ +/libzfs_input_check diff --git a/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am b/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am new file mode 100644 index 000000000000..ba02f93fe2ab --- /dev/null +++ b/tests/zfs-tests/cmd/libzfs_input_check/Makefile.am @@ -0,0 +1,10 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = libzfs_input_check + +libzfs_input_check_SOURCES = libzfs_input_check.c +libzfs_input_check_LDADD = \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c new file mode 100644 index 000000000000..9fee37357fc3 --- /dev/null +++ b/tests/zfs-tests/cmd/libzfs_input_check/libzfs_input_check.c @@ -0,0 +1,1060 @@ +/* + * CDDL 
HEADER START + * + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +#include +#include + +/* + * Test the nvpair inputs for the non-legacy zfs ioctl commands. + */ + +boolean_t unexpected_failures; +int zfs_fd; +const char *active_test; + +/* + * Tracks which zfs_ioc_t commands were tested + */ +boolean_t ioc_tested[ZFS_IOC_LAST - ZFS_IOC_FIRST]; + +/* + * Legacy ioctls that are skipped (for now) + */ +static unsigned ioc_skip[] = { + ZFS_IOC_POOL_CREATE, + ZFS_IOC_POOL_DESTROY, + ZFS_IOC_POOL_IMPORT, + ZFS_IOC_POOL_EXPORT, + ZFS_IOC_POOL_CONFIGS, + ZFS_IOC_POOL_STATS, + ZFS_IOC_POOL_TRYIMPORT, + ZFS_IOC_POOL_SCAN, + ZFS_IOC_POOL_FREEZE, + ZFS_IOC_POOL_UPGRADE, + ZFS_IOC_POOL_GET_HISTORY, + + ZFS_IOC_VDEV_ADD, + ZFS_IOC_VDEV_REMOVE, + ZFS_IOC_VDEV_SET_STATE, + ZFS_IOC_VDEV_ATTACH, + ZFS_IOC_VDEV_DETACH, + ZFS_IOC_VDEV_SETPATH, + ZFS_IOC_VDEV_SETFRU, + + ZFS_IOC_OBJSET_STATS, + ZFS_IOC_OBJSET_ZPLPROPS, + ZFS_IOC_DATASET_LIST_NEXT, + ZFS_IOC_SNAPSHOT_LIST_NEXT, + ZFS_IOC_SET_PROP, + ZFS_IOC_DESTROY, + ZFS_IOC_RENAME, + ZFS_IOC_RECV, + ZFS_IOC_SEND, + ZFS_IOC_INJECT_FAULT, + ZFS_IOC_CLEAR_FAULT, + ZFS_IOC_INJECT_LIST_NEXT, + ZFS_IOC_ERROR_LOG, + ZFS_IOC_CLEAR, + ZFS_IOC_PROMOTE, + ZFS_IOC_DSOBJ_TO_DSNAME, + ZFS_IOC_OBJ_TO_PATH, + ZFS_IOC_POOL_SET_PROPS, + ZFS_IOC_POOL_GET_PROPS, + ZFS_IOC_SET_FSACL, + ZFS_IOC_GET_FSACL, + ZFS_IOC_SHARE, + ZFS_IOC_INHERIT_PROP, + ZFS_IOC_SMB_ACL, + ZFS_IOC_USERSPACE_ONE, + ZFS_IOC_USERSPACE_MANY, + ZFS_IOC_USERSPACE_UPGRADE, + ZFS_IOC_OBJSET_RECVD_PROPS, + ZFS_IOC_VDEV_SPLIT, + ZFS_IOC_NEXT_OBJ, + ZFS_IOC_DIFF, + ZFS_IOC_TMP_SNAPSHOT, + ZFS_IOC_OBJ_TO_STATS, + ZFS_IOC_SPACE_WRITTEN, + ZFS_IOC_POOL_REGUID, + ZFS_IOC_SEND_PROGRESS, + ZFS_IOC_EVENTS_NEXT, + ZFS_IOC_EVENTS_CLEAR, + ZFS_IOC_EVENTS_SEEK, + ZFS_IOC_NEXTBOOT, + ZFS_IOC_JAIL, + ZFS_IOC_UNJAIL, +}; + + +#define IOC_INPUT_TEST(ioc, name, req, opt, err) \ + IOC_INPUT_TEST_IMPL(ioc, name, req, opt, err, B_FALSE) + +#define IOC_INPUT_TEST_WILD(ioc, name, req, opt, err) \ + IOC_INPUT_TEST_IMPL(ioc, name, req, opt, err, B_TRUE) + +#define IOC_INPUT_TEST_IMPL(ioc, name, req, opt, err, wild) \ + do { \ + active_test = __func__ + 5; \ + ioc_tested[ioc - ZFS_IOC_FIRST] = B_TRUE; \ + lzc_ioctl_test(ioc, name, req, opt, err, wild); \ + } while (0) + +/* + * run a zfs ioctl command, verify expected results and log failures + */ +static void +lzc_ioctl_run(zfs_ioc_t ioc, const char *name, nvlist_t *innvl, int expected) +{ + zfs_cmd_t zc = {"\0"}; + char *packed = NULL; + const char *variant; + size_t size = 0; + int error = 0; + + switch (expected) { + case ZFS_ERR_IOC_ARG_UNAVAIL: + variant = "unsupported input"; + break; + case ZFS_ERR_IOC_ARG_REQUIRED: + variant = "missing input"; + break; + case ZFS_ERR_IOC_ARG_BADTYPE: + variant = "invalid input type"; + break; + default: + variant = "valid input"; + break; + } + + packed = fnvlist_pack(innvl, &size); + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; + zc.zc_nvlist_src = 
(uint64_t)(uintptr_t)packed; + zc.zc_nvlist_src_size = size; + zc.zc_nvlist_dst_size = MAX(size * 2, 128 * 1024); + zc.zc_nvlist_dst = (uint64_t)(uintptr_t)malloc(zc.zc_nvlist_dst_size); + + if (zfs_ioctl_fd(zfs_fd, ioc, &zc) != 0) + error = errno; + + if (error != expected) { + unexpected_failures = B_TRUE; + (void) fprintf(stderr, "%s: Unexpected result with %s, " + "error %d (expecting %d)\n", + active_test, variant, error, expected); + } + + fnvlist_pack_free(packed, size); + free((void *)(uintptr_t)zc.zc_nvlist_dst); +} + +/* + * Test each ioc for the following ioctl input errors: + * ZFS_ERR_IOC_ARG_UNAVAIL an input argument is not supported by kernel + * ZFS_ERR_IOC_ARG_REQUIRED a required input argument is missing + * ZFS_ERR_IOC_ARG_BADTYPE an input argument has an invalid type + */ +static int +lzc_ioctl_test(zfs_ioc_t ioc, const char *name, nvlist_t *required, + nvlist_t *optional, int expected_error, boolean_t wildcard) +{ + nvlist_t *input = fnvlist_alloc(); + nvlist_t *future = fnvlist_alloc(); + int error = 0; + + if (required != NULL) { + for (nvpair_t *pair = nvlist_next_nvpair(required, NULL); + pair != NULL; pair = nvlist_next_nvpair(required, pair)) { + fnvlist_add_nvpair(input, pair); + } + } + if (optional != NULL) { + for (nvpair_t *pair = nvlist_next_nvpair(optional, NULL); + pair != NULL; pair = nvlist_next_nvpair(optional, pair)) { + fnvlist_add_nvpair(input, pair); + } + } + + /* + * Generic input run with 'optional' nvlist pair + */ + if (!wildcard) + fnvlist_add_nvlist(input, "optional", future); + lzc_ioctl_run(ioc, name, input, expected_error); + if (!wildcard) + fnvlist_remove(input, "optional"); + + /* + * Bogus input value + */ + if (!wildcard) { + fnvlist_add_string(input, "bogus_input", "bogus"); + lzc_ioctl_run(ioc, name, input, ZFS_ERR_IOC_ARG_UNAVAIL); + fnvlist_remove(input, "bogus_input"); + } + + /* + * Missing required inputs + */ + if (required != NULL) { + nvlist_t *empty = fnvlist_alloc(); + lzc_ioctl_run(ioc, name, empty, ZFS_ERR_IOC_ARG_REQUIRED); + nvlist_free(empty); + } + + /* + * Wrong nvpair type + */ + if (required != NULL || optional != NULL) { + /* + * switch the type of one of the input pairs + */ + for (nvpair_t *pair = nvlist_next_nvpair(input, NULL); + pair != NULL; pair = nvlist_next_nvpair(input, pair)) { + char pname[MAXNAMELEN]; + data_type_t ptype; + + strlcpy(pname, nvpair_name(pair), sizeof (pname)); + pname[sizeof (pname) - 1] = '\0'; + ptype = nvpair_type(pair); + fnvlist_remove_nvpair(input, pair); + + switch (ptype) { + case DATA_TYPE_STRING: + fnvlist_add_uint64(input, pname, 42); + break; + default: + fnvlist_add_string(input, pname, "bogus"); + break; + } + } + lzc_ioctl_run(ioc, name, input, ZFS_ERR_IOC_ARG_BADTYPE); + } + + nvlist_free(future); + nvlist_free(input); + + return (error); +} + +static void +test_pool_sync(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_boolean_value(required, "force", B_TRUE); + + IOC_INPUT_TEST(ZFS_IOC_POOL_SYNC, pool, required, NULL, 0); + + nvlist_free(required); +} + +static void +test_pool_reopen(const char *pool) +{ + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_boolean_value(optional, "scrub_restart", B_FALSE); + + IOC_INPUT_TEST(ZFS_IOC_POOL_REOPEN, pool, NULL, optional, 0); + + nvlist_free(optional); +} + +static void +test_pool_checkpoint(const char *pool) +{ + IOC_INPUT_TEST(ZFS_IOC_POOL_CHECKPOINT, pool, NULL, NULL, 0); +} + +static void +test_pool_discard_checkpoint(const char *pool) +{ + int err = lzc_pool_checkpoint(pool); + if (err 
== 0 || err == ZFS_ERR_CHECKPOINT_EXISTS) + IOC_INPUT_TEST(ZFS_IOC_POOL_DISCARD_CHECKPOINT, pool, NULL, + NULL, 0); +} + +static void +test_log_history(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_string(required, "message", "input check"); + + IOC_INPUT_TEST(ZFS_IOC_LOG_HISTORY, pool, required, NULL, 0); + + nvlist_free(required); +} + +static void +test_create(const char *pool) +{ + char dataset[MAXNAMELEN + 32]; + + (void) snprintf(dataset, sizeof (dataset), "%s/create-fs", pool); + + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *props = fnvlist_alloc(); + + fnvlist_add_int32(required, "type", DMU_OST_ZFS); + fnvlist_add_uint64(props, "recordsize", 8192); + fnvlist_add_nvlist(optional, "props", props); + + IOC_INPUT_TEST(ZFS_IOC_CREATE, dataset, required, optional, 0); + + nvlist_free(required); + nvlist_free(optional); +} + +static void +test_snapshot(const char *pool, const char *snapshot) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *snaps = fnvlist_alloc(); + nvlist_t *props = fnvlist_alloc(); + + fnvlist_add_boolean(snaps, snapshot); + fnvlist_add_nvlist(required, "snaps", snaps); + + fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013"); + fnvlist_add_nvlist(optional, "props", props); + + IOC_INPUT_TEST(ZFS_IOC_SNAPSHOT, pool, required, optional, 0); + + nvlist_free(props); + nvlist_free(snaps); + nvlist_free(optional); + nvlist_free(required); +} + +static void +test_space_snaps(const char *snapshot) +{ + nvlist_t *required = fnvlist_alloc(); + fnvlist_add_string(required, "firstsnap", snapshot); + + IOC_INPUT_TEST(ZFS_IOC_SPACE_SNAPS, snapshot, required, NULL, 0); + + nvlist_free(required); +} + +static void +test_destroy_snaps(const char *pool, const char *snapshot) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *snaps = fnvlist_alloc(); + + fnvlist_add_boolean(snaps, snapshot); + fnvlist_add_nvlist(required, "snaps", snaps); + + IOC_INPUT_TEST(ZFS_IOC_DESTROY_SNAPS, pool, required, NULL, 0); + + nvlist_free(snaps); + nvlist_free(required); +} + + +static void +test_bookmark(const char *pool, const char *snapshot, const char *bookmark) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_string(required, bookmark, snapshot); + + IOC_INPUT_TEST_WILD(ZFS_IOC_BOOKMARK, pool, required, NULL, 0); + + nvlist_free(required); +} + +static void +test_get_bookmarks(const char *dataset) +{ + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_boolean(optional, "guid"); + fnvlist_add_boolean(optional, "createtxg"); + fnvlist_add_boolean(optional, "creation"); + + IOC_INPUT_TEST_WILD(ZFS_IOC_GET_BOOKMARKS, dataset, NULL, optional, 0); + + nvlist_free(optional); +} + +static void +test_destroy_bookmarks(const char *pool, const char *bookmark) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_boolean(required, bookmark); + + IOC_INPUT_TEST_WILD(ZFS_IOC_DESTROY_BOOKMARKS, pool, required, NULL, 0); + + nvlist_free(required); +} + +static void +test_clone(const char *snapshot, const char *clone) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *props = fnvlist_alloc(); + + fnvlist_add_string(required, "origin", snapshot); + + IOC_INPUT_TEST(ZFS_IOC_CLONE, clone, required, NULL, 0); + + nvlist_free(props); + nvlist_free(optional); + nvlist_free(required); +} + +static void +test_rollback(const char *dataset, const char *snapshot) +{ + nvlist_t *optional = fnvlist_alloc(); + + 
fnvlist_add_string(optional, "target", snapshot); + + IOC_INPUT_TEST(ZFS_IOC_ROLLBACK, dataset, NULL, optional, B_FALSE); + + nvlist_free(optional); +} + +static void +test_hold(const char *pool, const char *snapshot) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *holds = fnvlist_alloc(); + + fnvlist_add_string(holds, snapshot, "libzfs_check_hold"); + fnvlist_add_nvlist(required, "holds", holds); + fnvlist_add_int32(optional, "cleanup_fd", zfs_fd); + + IOC_INPUT_TEST(ZFS_IOC_HOLD, pool, required, optional, 0); + + nvlist_free(holds); + nvlist_free(optional); + nvlist_free(required); +} + +static void +test_get_holds(const char *snapshot) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_HOLDS, snapshot, NULL, NULL, 0); +} + +static void +test_release(const char *pool, const char *snapshot) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *release = fnvlist_alloc(); + + fnvlist_add_boolean(release, "libzfs_check_hold"); + fnvlist_add_nvlist(required, snapshot, release); + + IOC_INPUT_TEST_WILD(ZFS_IOC_RELEASE, pool, required, NULL, 0); + + nvlist_free(release); + nvlist_free(required); +} + + +static void +test_send_new(const char *snapshot, int fd) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_int32(required, "fd", fd); + + fnvlist_add_boolean(optional, "largeblockok"); + fnvlist_add_boolean(optional, "embedok"); + fnvlist_add_boolean(optional, "compressok"); + fnvlist_add_boolean(optional, "rawok"); + + /* + * TODO - Resumable send is harder to set up. So we currently + * ignore testing for that variant. + */ +#if 0 + fnvlist_add_string(optional, "fromsnap", from); + fnvlist_add_uint64(optional, "resume_object", resumeobj); + fnvlist_add_uint64(optional, "resume_offset", offset); + fnvlist_add_boolean(optional, "savedok"); +#endif + IOC_INPUT_TEST(ZFS_IOC_SEND_NEW, snapshot, required, optional, 0); + + nvlist_free(optional); + nvlist_free(required); +} + +static void +test_recv_new(const char *dataset, int fd) +{ + dmu_replay_record_t drr = { 0 }; + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *props = fnvlist_alloc(); + char snapshot[MAXNAMELEN + 32]; + ssize_t count; + + int cleanup_fd = open(ZFS_DEV, O_RDWR); + + (void) snprintf(snapshot, sizeof (snapshot), "%s@replicant", dataset); + + count = pread(fd, &drr, sizeof (drr), 0); + if (count != sizeof (drr)) { + (void) fprintf(stderr, "could not read stream: %s\n", + strerror(errno)); + } + + fnvlist_add_string(required, "snapname", snapshot); + fnvlist_add_byte_array(required, "begin_record", (uchar_t *)&drr, + sizeof (drr)); + fnvlist_add_int32(required, "input_fd", fd); + + fnvlist_add_string(props, "org.openzfs:launch", "September 17th, 2013"); + fnvlist_add_nvlist(optional, "localprops", props); + fnvlist_add_boolean(optional, "force"); + fnvlist_add_int32(optional, "cleanup_fd", cleanup_fd); + + /* + * TODO - Resumable receive is harder to set up. So we currently + * ignore testing for one. 
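+ * A proper test would first need a partially received stream so that
+ * a valid resume token exists; the disabled block below only sketches
+ * the extra optional inputs (recvdprops, origin, the resumable flag
+ * and an action handle) that such a test would exercise.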
+ */ +#if 0 + fnvlist_add_nvlist(optional, "props", recvdprops); + fnvlist_add_string(optional, "origin", origin); + fnvlist_add_boolean(optional, "resumable"); + fnvlist_add_uint64(optional, "action_handle", *action_handle); +#endif + IOC_INPUT_TEST(ZFS_IOC_RECV_NEW, dataset, required, optional, + ZFS_ERR_STREAM_TRUNCATED); + + nvlist_free(props); + nvlist_free(optional); + nvlist_free(required); + + (void) close(cleanup_fd); +} + +static void +test_send_space(const char *snapshot1, const char *snapshot2) +{ + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_string(optional, "from", snapshot1); + fnvlist_add_boolean(optional, "largeblockok"); + fnvlist_add_boolean(optional, "embedok"); + fnvlist_add_boolean(optional, "compressok"); + fnvlist_add_boolean(optional, "rawok"); + + IOC_INPUT_TEST(ZFS_IOC_SEND_SPACE, snapshot2, NULL, optional, 0); + + nvlist_free(optional); +} + +static void +test_remap(const char *dataset) +{ + IOC_INPUT_TEST(ZFS_IOC_REMAP, dataset, NULL, NULL, 0); +} + +static void +test_channel_program(const char *pool) +{ + const char *program = + "arg = ...\n" + "argv = arg[\"argv\"]\n" + "return argv[1]"; + char *const argv[1] = { "Hello World!" }; + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *args = fnvlist_alloc(); + + fnvlist_add_string(required, "program", program); + fnvlist_add_string_array(args, "argv", argv, 1); + fnvlist_add_nvlist(required, "arg", args); + + fnvlist_add_boolean_value(optional, "sync", B_TRUE); + fnvlist_add_uint64(optional, "instrlimit", 1000 * 1000); + fnvlist_add_uint64(optional, "memlimit", 8192 * 1024); + + IOC_INPUT_TEST(ZFS_IOC_CHANNEL_PROGRAM, pool, required, optional, 0); + + nvlist_free(args); + nvlist_free(optional); + nvlist_free(required); +} + +#define WRAPPING_KEY_LEN 32 + +static void +test_load_key(const char *dataset) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *hidden = fnvlist_alloc(); + uint8_t keydata[WRAPPING_KEY_LEN] = {0}; + + fnvlist_add_uint8_array(hidden, "wkeydata", keydata, sizeof (keydata)); + fnvlist_add_nvlist(required, "hidden_args", hidden); + fnvlist_add_boolean(optional, "noop"); + + IOC_INPUT_TEST(ZFS_IOC_LOAD_KEY, dataset, required, optional, EINVAL); + nvlist_free(hidden); + nvlist_free(optional); + nvlist_free(required); +} + +static void +test_change_key(const char *dataset) +{ + IOC_INPUT_TEST(ZFS_IOC_CHANGE_KEY, dataset, NULL, NULL, EINVAL); +} + +static void +test_unload_key(const char *dataset) +{ + IOC_INPUT_TEST(ZFS_IOC_UNLOAD_KEY, dataset, NULL, NULL, EACCES); +} + +static void +test_vdev_initialize(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *vdev_guids = fnvlist_alloc(); + + fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef); + fnvlist_add_uint64(required, ZPOOL_INITIALIZE_COMMAND, + POOL_INITIALIZE_START); + fnvlist_add_nvlist(required, ZPOOL_INITIALIZE_VDEVS, vdev_guids); + + IOC_INPUT_TEST(ZFS_IOC_POOL_INITIALIZE, pool, required, NULL, EINVAL); + nvlist_free(vdev_guids); + nvlist_free(required); +} + +static void +test_vdev_trim(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + nvlist_t *vdev_guids = fnvlist_alloc(); + + fnvlist_add_uint64(vdev_guids, "path", 0xdeadbeefdeadbeef); + fnvlist_add_uint64(required, ZPOOL_TRIM_COMMAND, POOL_TRIM_START); + fnvlist_add_nvlist(required, ZPOOL_TRIM_VDEVS, vdev_guids); + fnvlist_add_uint64(optional, ZPOOL_TRIM_RATE, 1ULL << 30); + fnvlist_add_boolean_value(optional, 
ZPOOL_TRIM_SECURE, B_TRUE); + + IOC_INPUT_TEST(ZFS_IOC_POOL_TRIM, pool, required, optional, EINVAL); + nvlist_free(vdev_guids); + nvlist_free(optional); + nvlist_free(required); +} + +static int +zfs_destroy(const char *dataset) +{ + zfs_cmd_t zc = {"\0"}; + int err; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; + err = zfs_ioctl_fd(zfs_fd, ZFS_IOC_DESTROY, &zc); + + return (err == 0 ? 0 : errno); +} + +static void +test_redact(const char *snapshot1, const char *snapshot2) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *snapnv = fnvlist_alloc(); + char bookmark[MAXNAMELEN + 32]; + + fnvlist_add_string(required, "bookname", "testbookmark"); + fnvlist_add_boolean(snapnv, snapshot2); + fnvlist_add_nvlist(required, "snapnv", snapnv); + + IOC_INPUT_TEST(ZFS_IOC_REDACT, snapshot1, required, NULL, 0); + + nvlist_free(snapnv); + nvlist_free(required); + + strlcpy(bookmark, snapshot1, sizeof (bookmark)); + *strchr(bookmark, '@') = '\0'; + strlcat(bookmark, "#testbookmark", sizeof (bookmark) - + strlen(bookmark)); + zfs_destroy(bookmark); +} + +static void +test_get_bookmark_props(const char *bookmark) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_BOOKMARK_PROPS, bookmark, NULL, NULL, 0); +} + +static void +test_wait(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + nvlist_t *optional = fnvlist_alloc(); + + fnvlist_add_int32(required, "wait_activity", 2); + fnvlist_add_uint64(optional, "wait_tag", 0xdeadbeefdeadbeef); + + IOC_INPUT_TEST(ZFS_IOC_WAIT, pool, required, optional, EINVAL); + + nvlist_free(required); + nvlist_free(optional); +} + +static void +test_wait_fs(const char *dataset) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_int32(required, "wait_activity", 2); + + IOC_INPUT_TEST(ZFS_IOC_WAIT_FS, dataset, required, NULL, EINVAL); + + nvlist_free(required); +} + +static void +test_get_bootenv(const char *pool) +{ + IOC_INPUT_TEST(ZFS_IOC_GET_BOOTENV, pool, NULL, NULL, 0); +} + +static void +test_set_bootenv(const char *pool) +{ + nvlist_t *required = fnvlist_alloc(); + + fnvlist_add_string(required, "envmap", "test"); + + IOC_INPUT_TEST(ZFS_IOC_SET_BOOTENV, pool, required, NULL, 0); + + nvlist_free(required); +} + +static void +zfs_ioc_input_tests(const char *pool) +{ + char filepath[] = "/tmp/ioc_test_file_XXXXXX"; + char dataset[ZFS_MAX_DATASET_NAME_LEN]; + char snapbase[ZFS_MAX_DATASET_NAME_LEN + 32]; + char snapshot[ZFS_MAX_DATASET_NAME_LEN + 32]; + char bookmark[ZFS_MAX_DATASET_NAME_LEN + 32]; + char backup[ZFS_MAX_DATASET_NAME_LEN]; + char clone[ZFS_MAX_DATASET_NAME_LEN]; + char clonesnap[ZFS_MAX_DATASET_NAME_LEN + 32]; + int tmpfd, err; + + /* + * Setup names and create a working dataset + */ + (void) snprintf(dataset, sizeof (dataset), "%s/test-fs", pool); + (void) snprintf(snapbase, sizeof (snapbase), "%s@snapbase", dataset); + (void) snprintf(snapshot, sizeof (snapshot), "%s@snapshot", dataset); + (void) snprintf(bookmark, sizeof (bookmark), "%s#bookmark", dataset); + (void) snprintf(clone, sizeof (clone), "%s/test-fs-clone", pool); + (void) snprintf(clonesnap, sizeof (clonesnap), "%s@snap", clone); + (void) snprintf(backup, sizeof (backup), "%s/backup", pool); + + err = lzc_create(dataset, DMU_OST_ZFS, NULL, NULL, 0); + if (err) { + (void) fprintf(stderr, "could not create '%s': %s\n", + dataset, strerror(errno)); + exit(2); + } + + tmpfd = mkstemp(filepath); + if (tmpfd < 0) { + (void) fprintf(stderr, "could not create '%s': %s\n", + filepath, strerror(errno)); + exit(2); + } + + /* + * run a test for 
each ioctl + * Note that some test build on previous test operations + */ + test_pool_sync(pool); + test_pool_reopen(pool); + test_pool_checkpoint(pool); + test_pool_discard_checkpoint(pool); + test_log_history(pool); + + test_create(dataset); + test_snapshot(pool, snapbase); + test_snapshot(pool, snapshot); + + test_space_snaps(snapshot); + test_send_space(snapbase, snapshot); + test_send_new(snapshot, tmpfd); + test_recv_new(backup, tmpfd); + + test_bookmark(pool, snapshot, bookmark); + test_get_bookmarks(dataset); + test_get_bookmark_props(bookmark); + test_destroy_bookmarks(pool, bookmark); + + test_hold(pool, snapshot); + test_get_holds(snapshot); + test_release(pool, snapshot); + + test_clone(snapshot, clone); + test_snapshot(pool, clonesnap); + test_redact(snapshot, clonesnap); + zfs_destroy(clonesnap); + zfs_destroy(clone); + + test_rollback(dataset, snapshot); + test_destroy_snaps(pool, snapshot); + test_destroy_snaps(pool, snapbase); + + test_remap(dataset); + test_channel_program(pool); + + test_load_key(dataset); + test_change_key(dataset); + test_unload_key(dataset); + + test_vdev_initialize(pool); + test_vdev_trim(pool); + + test_wait(pool); + test_wait_fs(dataset); + + test_set_bootenv(pool); + test_get_bootenv(pool); + + /* + * cleanup + */ + zfs_cmd_t zc = {"\0"}; + + nvlist_t *snaps = fnvlist_alloc(); + fnvlist_add_boolean(snaps, snapshot); + (void) lzc_destroy_snaps(snaps, B_FALSE, NULL); + nvlist_free(snaps); + + (void) zfs_destroy(dataset); + (void) zfs_destroy(backup); + + (void) close(tmpfd); + (void) unlink(filepath); + + /* + * All the unused slots should yield ZFS_ERR_IOC_CMD_UNAVAIL + */ + for (int i = 0; i < ARRAY_SIZE(ioc_skip); i++) { + if (ioc_tested[ioc_skip[i] - ZFS_IOC_FIRST]) + (void) fprintf(stderr, "cmd %d tested, not skipped!\n", + (int)(ioc_skip[i] - ZFS_IOC_FIRST)); + + ioc_tested[ioc_skip[i] - ZFS_IOC_FIRST] = B_TRUE; + } + + (void) strlcpy(zc.zc_name, pool, sizeof (zc.zc_name)); + zc.zc_name[sizeof (zc.zc_name) - 1] = '\0'; + + for (unsigned ioc = ZFS_IOC_FIRST; ioc < ZFS_IOC_LAST; ioc++) { + unsigned cmd = ioc - ZFS_IOC_FIRST; + + if (ioc_tested[cmd]) + continue; + + if (zfs_ioctl_fd(zfs_fd, ioc, &zc) != 0 && + errno != ZFS_ERR_IOC_CMD_UNAVAIL) { + (void) fprintf(stderr, "cmd %d is missing a test case " + "(%d)\n", cmd, errno); + } + } +} + +enum zfs_ioc_ref { +#ifdef __FreeBSD__ + ZFS_IOC_BASE = 0, +#else + ZFS_IOC_BASE = ('Z' << 8), +#endif + ZFS_IOC_PLATFORM_BASE = ZFS_IOC_BASE + 0x80, +}; + +/* + * Canonical reference check of /dev/zfs ioctl numbers. + * These cannot change and new ioctl numbers must be appended. 
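+ *
+ * Keeping the numbering append-only preserves binary compatibility:
+ * an older userland keeps issuing the same ioctl values that a newer
+ * kernel still understands. validate_ioc_values() below pins each
+ * command to its expected offset, and main() aborts the run with a
+ * warning if any value has drifted.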
+ */ +static boolean_t +validate_ioc_values(void) +{ + boolean_t result = B_TRUE; + +#define CHECK(expr) do { \ + if (!(expr)) { \ + result = B_FALSE; \ + fprintf(stderr, "(%s) === FALSE\n", #expr); \ + } \ +} while (0) + + CHECK(ZFS_IOC_BASE + 0 == ZFS_IOC_POOL_CREATE); + CHECK(ZFS_IOC_BASE + 1 == ZFS_IOC_POOL_DESTROY); + CHECK(ZFS_IOC_BASE + 2 == ZFS_IOC_POOL_IMPORT); + CHECK(ZFS_IOC_BASE + 3 == ZFS_IOC_POOL_EXPORT); + CHECK(ZFS_IOC_BASE + 4 == ZFS_IOC_POOL_CONFIGS); + CHECK(ZFS_IOC_BASE + 5 == ZFS_IOC_POOL_STATS); + CHECK(ZFS_IOC_BASE + 6 == ZFS_IOC_POOL_TRYIMPORT); + CHECK(ZFS_IOC_BASE + 7 == ZFS_IOC_POOL_SCAN); + CHECK(ZFS_IOC_BASE + 8 == ZFS_IOC_POOL_FREEZE); + CHECK(ZFS_IOC_BASE + 9 == ZFS_IOC_POOL_UPGRADE); + CHECK(ZFS_IOC_BASE + 10 == ZFS_IOC_POOL_GET_HISTORY); + CHECK(ZFS_IOC_BASE + 11 == ZFS_IOC_VDEV_ADD); + CHECK(ZFS_IOC_BASE + 12 == ZFS_IOC_VDEV_REMOVE); + CHECK(ZFS_IOC_BASE + 13 == ZFS_IOC_VDEV_SET_STATE); + CHECK(ZFS_IOC_BASE + 14 == ZFS_IOC_VDEV_ATTACH); + CHECK(ZFS_IOC_BASE + 15 == ZFS_IOC_VDEV_DETACH); + CHECK(ZFS_IOC_BASE + 16 == ZFS_IOC_VDEV_SETPATH); + CHECK(ZFS_IOC_BASE + 17 == ZFS_IOC_VDEV_SETFRU); + CHECK(ZFS_IOC_BASE + 18 == ZFS_IOC_OBJSET_STATS); + CHECK(ZFS_IOC_BASE + 19 == ZFS_IOC_OBJSET_ZPLPROPS); + CHECK(ZFS_IOC_BASE + 20 == ZFS_IOC_DATASET_LIST_NEXT); + CHECK(ZFS_IOC_BASE + 21 == ZFS_IOC_SNAPSHOT_LIST_NEXT); + CHECK(ZFS_IOC_BASE + 22 == ZFS_IOC_SET_PROP); + CHECK(ZFS_IOC_BASE + 23 == ZFS_IOC_CREATE); + CHECK(ZFS_IOC_BASE + 24 == ZFS_IOC_DESTROY); + CHECK(ZFS_IOC_BASE + 25 == ZFS_IOC_ROLLBACK); + CHECK(ZFS_IOC_BASE + 26 == ZFS_IOC_RENAME); + CHECK(ZFS_IOC_BASE + 27 == ZFS_IOC_RECV); + CHECK(ZFS_IOC_BASE + 28 == ZFS_IOC_SEND); + CHECK(ZFS_IOC_BASE + 29 == ZFS_IOC_INJECT_FAULT); + CHECK(ZFS_IOC_BASE + 30 == ZFS_IOC_CLEAR_FAULT); + CHECK(ZFS_IOC_BASE + 31 == ZFS_IOC_INJECT_LIST_NEXT); + CHECK(ZFS_IOC_BASE + 32 == ZFS_IOC_ERROR_LOG); + CHECK(ZFS_IOC_BASE + 33 == ZFS_IOC_CLEAR); + CHECK(ZFS_IOC_BASE + 34 == ZFS_IOC_PROMOTE); + CHECK(ZFS_IOC_BASE + 35 == ZFS_IOC_SNAPSHOT); + CHECK(ZFS_IOC_BASE + 36 == ZFS_IOC_DSOBJ_TO_DSNAME); + CHECK(ZFS_IOC_BASE + 37 == ZFS_IOC_OBJ_TO_PATH); + CHECK(ZFS_IOC_BASE + 38 == ZFS_IOC_POOL_SET_PROPS); + CHECK(ZFS_IOC_BASE + 39 == ZFS_IOC_POOL_GET_PROPS); + CHECK(ZFS_IOC_BASE + 40 == ZFS_IOC_SET_FSACL); + CHECK(ZFS_IOC_BASE + 41 == ZFS_IOC_GET_FSACL); + CHECK(ZFS_IOC_BASE + 42 == ZFS_IOC_SHARE); + CHECK(ZFS_IOC_BASE + 43 == ZFS_IOC_INHERIT_PROP); + CHECK(ZFS_IOC_BASE + 44 == ZFS_IOC_SMB_ACL); + CHECK(ZFS_IOC_BASE + 45 == ZFS_IOC_USERSPACE_ONE); + CHECK(ZFS_IOC_BASE + 46 == ZFS_IOC_USERSPACE_MANY); + CHECK(ZFS_IOC_BASE + 47 == ZFS_IOC_USERSPACE_UPGRADE); + CHECK(ZFS_IOC_BASE + 48 == ZFS_IOC_HOLD); + CHECK(ZFS_IOC_BASE + 49 == ZFS_IOC_RELEASE); + CHECK(ZFS_IOC_BASE + 50 == ZFS_IOC_GET_HOLDS); + CHECK(ZFS_IOC_BASE + 51 == ZFS_IOC_OBJSET_RECVD_PROPS); + CHECK(ZFS_IOC_BASE + 52 == ZFS_IOC_VDEV_SPLIT); + CHECK(ZFS_IOC_BASE + 53 == ZFS_IOC_NEXT_OBJ); + CHECK(ZFS_IOC_BASE + 54 == ZFS_IOC_DIFF); + CHECK(ZFS_IOC_BASE + 55 == ZFS_IOC_TMP_SNAPSHOT); + CHECK(ZFS_IOC_BASE + 56 == ZFS_IOC_OBJ_TO_STATS); + CHECK(ZFS_IOC_BASE + 57 == ZFS_IOC_SPACE_WRITTEN); + CHECK(ZFS_IOC_BASE + 58 == ZFS_IOC_SPACE_SNAPS); + CHECK(ZFS_IOC_BASE + 59 == ZFS_IOC_DESTROY_SNAPS); + CHECK(ZFS_IOC_BASE + 60 == ZFS_IOC_POOL_REGUID); + CHECK(ZFS_IOC_BASE + 61 == ZFS_IOC_POOL_REOPEN); + CHECK(ZFS_IOC_BASE + 62 == ZFS_IOC_SEND_PROGRESS); + CHECK(ZFS_IOC_BASE + 63 == ZFS_IOC_LOG_HISTORY); + CHECK(ZFS_IOC_BASE + 64 == ZFS_IOC_SEND_NEW); + CHECK(ZFS_IOC_BASE + 65 == 
ZFS_IOC_SEND_SPACE); + CHECK(ZFS_IOC_BASE + 66 == ZFS_IOC_CLONE); + CHECK(ZFS_IOC_BASE + 67 == ZFS_IOC_BOOKMARK); + CHECK(ZFS_IOC_BASE + 68 == ZFS_IOC_GET_BOOKMARKS); + CHECK(ZFS_IOC_BASE + 69 == ZFS_IOC_DESTROY_BOOKMARKS); + CHECK(ZFS_IOC_BASE + 70 == ZFS_IOC_RECV_NEW); + CHECK(ZFS_IOC_BASE + 71 == ZFS_IOC_POOL_SYNC); + CHECK(ZFS_IOC_BASE + 72 == ZFS_IOC_CHANNEL_PROGRAM); + CHECK(ZFS_IOC_BASE + 73 == ZFS_IOC_LOAD_KEY); + CHECK(ZFS_IOC_BASE + 74 == ZFS_IOC_UNLOAD_KEY); + CHECK(ZFS_IOC_BASE + 75 == ZFS_IOC_CHANGE_KEY); + CHECK(ZFS_IOC_BASE + 76 == ZFS_IOC_REMAP); + CHECK(ZFS_IOC_BASE + 77 == ZFS_IOC_POOL_CHECKPOINT); + CHECK(ZFS_IOC_BASE + 78 == ZFS_IOC_POOL_DISCARD_CHECKPOINT); + CHECK(ZFS_IOC_BASE + 79 == ZFS_IOC_POOL_INITIALIZE); + CHECK(ZFS_IOC_BASE + 80 == ZFS_IOC_POOL_TRIM); + CHECK(ZFS_IOC_BASE + 81 == ZFS_IOC_REDACT); + CHECK(ZFS_IOC_BASE + 82 == ZFS_IOC_GET_BOOKMARK_PROPS); + CHECK(ZFS_IOC_BASE + 83 == ZFS_IOC_WAIT); + CHECK(ZFS_IOC_BASE + 84 == ZFS_IOC_WAIT_FS); + CHECK(ZFS_IOC_PLATFORM_BASE + 1 == ZFS_IOC_EVENTS_NEXT); + CHECK(ZFS_IOC_PLATFORM_BASE + 2 == ZFS_IOC_EVENTS_CLEAR); + CHECK(ZFS_IOC_PLATFORM_BASE + 3 == ZFS_IOC_EVENTS_SEEK); + CHECK(ZFS_IOC_PLATFORM_BASE + 4 == ZFS_IOC_NEXTBOOT); + CHECK(ZFS_IOC_PLATFORM_BASE + 5 == ZFS_IOC_JAIL); + CHECK(ZFS_IOC_PLATFORM_BASE + 6 == ZFS_IOC_UNJAIL); + CHECK(ZFS_IOC_PLATFORM_BASE + 7 == ZFS_IOC_SET_BOOTENV); + CHECK(ZFS_IOC_PLATFORM_BASE + 8 == ZFS_IOC_GET_BOOTENV); + +#undef CHECK + + return (result); +} + +int +main(int argc, const char *argv[]) +{ + if (argc != 2) { + (void) fprintf(stderr, "usage: %s \n", argv[0]); + exit(2); + } + + if (!validate_ioc_values()) { + (void) fprintf(stderr, "WARNING: zfs_ioc_t has binary " + "incompatible command values\n"); + exit(3); + } + + (void) libzfs_core_init(); + zfs_fd = open(ZFS_DEV, O_RDWR); + if (zfs_fd < 0) { + (void) fprintf(stderr, "open: %s\n", strerror(errno)); + libzfs_core_fini(); + exit(2); + } + + zfs_ioc_input_tests(argv[1]); + + (void) close(zfs_fd); + libzfs_core_fini(); + + return (unexpected_failures); +} diff --git a/tests/zfs-tests/cmd/mkbusy/.gitignore b/tests/zfs-tests/cmd/mkbusy/.gitignore new file mode 100644 index 000000000000..18d099c08eec --- /dev/null +++ b/tests/zfs-tests/cmd/mkbusy/.gitignore @@ -0,0 +1 @@ +/mkbusy diff --git a/tests/zfs-tests/cmd/mkbusy/Makefile.am b/tests/zfs-tests/cmd/mkbusy/Makefile.am new file mode 100644 index 000000000000..abae69dea8c7 --- /dev/null +++ b/tests/zfs-tests/cmd/mkbusy/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mkbusy +mkbusy_SOURCES = mkbusy.c diff --git a/tests/zfs-tests/cmd/mkbusy/mkbusy.c b/tests/zfs-tests/cmd/mkbusy/mkbusy.c new file mode 100644 index 000000000000..a03076ffc003 --- /dev/null +++ b/tests/zfs-tests/cmd/mkbusy/mkbusy.c @@ -0,0 +1,177 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * Make a directory busy. If the argument is an existing file or directory, + * simply open it directly and pause. 
If not, verify that the parent directory + * exists, and create a new file in that directory. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static void +usage(char *progname) +{ + (void) fprintf(stderr, "Usage: %s \n", progname); + exit(1); +} + +static void +fail(char *err, int rval) +{ + perror(err); + exit(rval); +} + +static void +daemonize(void) +{ + pid_t pid; + + if ((pid = fork()) < 0) { + fail("fork", 1); + } else if (pid != 0) { + (void) fprintf(stdout, "%ld\n", (long)pid); + exit(0); + } + + (void) setsid(); + (void) close(0); + (void) close(1); + (void) close(2); +} + +int +main(int argc, char *argv[]) +{ + int ret, c; + boolean_t isdir = B_FALSE; + boolean_t fflag = B_FALSE; + boolean_t rflag = B_FALSE; + struct stat sbuf; + char *fpath = NULL; + char *prog = argv[0]; + + while ((c = getopt(argc, argv, "fr")) != -1) { + switch (c) { + /* Open the file or directory read only */ + case 'r': + rflag = B_TRUE; + break; + /* Run in the foreground */ + case 'f': + fflag = B_TRUE; + break; + default: + usage(prog); + } + } + + argc -= optind; + argv += optind; + + if (argc != 1) + usage(prog); + + if ((ret = stat(argv[0], &sbuf)) != 0) { + char *arg, *dname, *fname; + int arglen; + char *slash; + int rc; + + /* + * The argument supplied doesn't exist. Copy the path, and + * remove the trailing slash if present. + */ + if ((arg = strdup(argv[0])) == NULL) + fail("strdup", 1); + arglen = strlen(arg); + if (arg[arglen - 1] == '/') + arg[arglen - 1] = '\0'; + + /* + * Get the directory and file names, using the current directory + * if the provided path doesn't specify a directory at all. + */ + if ((slash = strrchr(arg, '/')) == NULL) { + dname = strdup("."); + fname = strdup(arg); + } else { + *slash = '\0'; + dname = strdup(arg); + fname = strdup(slash + 1); + } + free(arg); + if (dname == NULL || fname == NULL) + fail("strdup", 1); + + /* The directory portion of the path must exist */ + if ((ret = stat(dname, &sbuf)) != 0 || !(sbuf.st_mode & + S_IFDIR)) + usage(prog); + + rc = asprintf(&fpath, "%s/%s", dname, fname); + free(dname); + free(fname); + if (rc == -1 || fpath == NULL) + fail("asprintf", 1); + + } else if ((sbuf.st_mode & S_IFMT) == S_IFREG || + (sbuf.st_mode & S_IFMT) == S_IFLNK || + (sbuf.st_mode & S_IFMT) == S_IFCHR || + (sbuf.st_mode & S_IFMT) == S_IFBLK) { + fpath = strdup(argv[0]); + } else if ((sbuf.st_mode & S_IFMT) == S_IFDIR) { + fpath = strdup(argv[0]); + isdir = B_TRUE; + } else { + usage(prog); + } + + if (fpath == NULL) + fail("strdup", 1); + + if (isdir == B_FALSE) { + int fd, flags; + mode_t mode = S_IRUSR | S_IWUSR; + + flags = rflag == B_FALSE ? 
O_CREAT | O_RDWR : O_RDONLY; + + if ((fd = open(fpath, flags, mode)) < 0) + fail("open", 1); + } else { + DIR *dp; + + if ((dp = opendir(fpath)) == NULL) + fail("opendir", 1); + } + free(fpath); + + if (fflag == B_FALSE) + daemonize(); + (void) pause(); + + /* NOTREACHED */ + return (0); +} diff --git a/tests/zfs-tests/cmd/mkfile/.gitignore b/tests/zfs-tests/cmd/mkfile/.gitignore new file mode 100644 index 000000000000..93e9a8a6ded4 --- /dev/null +++ b/tests/zfs-tests/cmd/mkfile/.gitignore @@ -0,0 +1 @@ +/mkfile diff --git a/tests/zfs-tests/cmd/mkfile/Makefile.am b/tests/zfs-tests/cmd/mkfile/Makefile.am new file mode 100644 index 000000000000..5f0e2e03efd9 --- /dev/null +++ b/tests/zfs-tests/cmd/mkfile/Makefile.am @@ -0,0 +1,8 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mkfile +mkfile_SOURCES = mkfile.c + +mkfile_LDADD = $(LTLIBINTL) diff --git a/tests/zfs-tests/cmd/mkfile/mkfile.c b/tests/zfs-tests/cmd/mkfile/mkfile.c new file mode 100644 index 000000000000..4cf3755faa6a --- /dev/null +++ b/tests/zfs-tests/cmd/mkfile/mkfile.c @@ -0,0 +1,282 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License, Version 1.0 only + * (the "License"). You may not use this file except in compliance + * with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
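mkfile, whose source follows, first sets the target length with a seek-and-write trick: seeking to size - 1 and writing a single byte extends the file without allocating its data blocks, and the optional fill pass then writes real zeroes. A trimmed sketch of that first step, with the path a placeholder and size assumed to be at least one byte:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/types.h>

    static int
    set_length(const char *path, off_t size)
    {
            int fd = open(path, O_CREAT | O_TRUNC | O_RDWR, 0600);

            if (fd < 0)
                    return (-1);
            /* Extend to 'size' bytes; no data blocks are written yet. */
            if (lseek(fd, size - 1, SEEK_SET) < 0 ||
                write(fd, "", 1) != 1) {
                    (void) close(fd);
                    return (-1);
            }
            return (fd);
    }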
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define BLOCKSIZE 512 /* bytes */ +#define KILOBYTE 1024 +#define MEGABYTE (KILOBYTE * KILOBYTE) +#define GIGABYTE (KILOBYTE * MEGABYTE) + +#define FILE_MODE (S_ISVTX + S_IRUSR + S_IWUSR) + +static void usage(void); + +int +main(int argc, char **argv) +{ + char *opts; + off_t size; + size_t len; + size_t mult = 1; + char *buf = NULL; + size_t bufsz = 0; + int errors = 0; + int i; + int verbose = 0; /* option variable */ + int nobytes = 0; /* option variable */ + int saverr; + + if (argc == 1) + usage(); + + while (argv[1] && argv[1][0] == '-') { + opts = &argv[1][0]; + while (*(++opts)) { + switch (*opts) { + case 'v': + verbose++; + break; + case 'n': + nobytes++; + break; + default: + usage(); + } + } + argc--; + argv++; + } + if (argc < 3) + usage(); + + len = strlen(argv[1]); + if (len && isalpha(argv[1][len-1])) { + switch (argv[1][len-1]) { + case 'k': + case 'K': + mult = KILOBYTE; + break; + case 'b': + case 'B': + mult = BLOCKSIZE; + break; + case 'm': + case 'M': + mult = MEGABYTE; + break; + case 'g': + case 'G': + mult = GIGABYTE; + break; + default: + (void) fprintf(stderr, + gettext("unknown size %s\n"), argv[1]); + usage(); + } + + for (i = 0; i <= (len-2); i++) { + if (!isdigit(argv[1][i])) { + (void) fprintf(stderr, + gettext("unknown size %s\n"), argv[1]); + usage(); + } + } + argv[1][len-1] = '\0'; + } + size = ((off_t)atoll(argv[1]) * (off_t)mult); + + argv++; + argc--; + + while (argc > 1) { + int fd; + + if (verbose) + (void) fprintf(stdout, gettext("%s %lld bytes\n"), + argv[1], (offset_t)size); + fd = open(argv[1], O_CREAT|O_TRUNC|O_RDWR, FILE_MODE); + if (fd < 0) { + saverr = errno; + (void) fprintf(stderr, + gettext("Could not open %s: %s\n"), + argv[1], strerror(saverr)); + errors++; + argv++; + argc--; + continue; + } else if (fchown(fd, getuid(), getgid()) < 0) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Could not set owner/group of %s: %s\n"), + argv[1], strerror(saverr)); + (void) close(fd); + errors++; + argv++; + argc--; + continue; + } else if (lseek(fd, (off_t)size-1, SEEK_SET) < 0) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Could not seek to offset %ld in %s: %s\n"), + (unsigned long)size-1, argv[1], strerror(saverr)); + (void) close(fd); + errors++; + argv++; + argc--; + continue; + } else if (write(fd, "", 1) != 1) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Could not set length of %s: %s\n"), + argv[1], strerror(saverr)); + (void) close(fd); + errors++; + argv++; + argc--; + continue; + } + + if (!nobytes) { + off_t written = 0; + struct stat64 st; + + if (lseek(fd, (off_t)0, SEEK_SET) < 0) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Could not seek to beginning of %s: %s\n"), + argv[1], strerror(saverr)); + (void) close(fd); + errors++; + argv++; + argc--; + continue; + } + if (fstat64(fd, &st) < 0) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Could not fstat64 %s: %s\n"), + argv[1], strerror(saverr)); + (void) close(fd); + errors++; + argv++; + argc--; + continue; + } + if (bufsz != st.st_blksize) { + if (buf) + free(buf); + bufsz = (size_t)st.st_blksize; + buf = calloc(1, bufsz); + if (buf == NULL) { + (void) fprintf(stderr, gettext( + "Could not allocate buffer of" + " size %d\n"), (int)bufsz); + (void) close(fd); + bufsz = 0; + errors++; + argv++; + argc--; + continue; + } + } + while (written < size) { + ssize_t result; + size_t bytes = (size_t)MIN(bufsz, 
size-written); + + if ((result = write(fd, buf, bytes)) != + (ssize_t)bytes) { + saverr = errno; + if (result < 0) + result = 0; + written += result; + (void) fprintf(stderr, gettext( + "%s: initialized %lu of %lu bytes: %s\n"), + argv[1], (unsigned long)written, + (unsigned long)size, + strerror(saverr)); + errors++; + break; + } + written += bytes; + } + + /* + * A write(2) call in the above loop failed so + * close out this file and go on (error was + * already incremented when the write(2) failed). + */ + if (written < size) { + (void) close(fd); + argv++; + argc--; + continue; + } + } + if (close(fd) < 0) { + saverr = errno; + (void) fprintf(stderr, gettext( + "Error encountered when closing %s: %s\n"), + argv[1], strerror(saverr)); + errors++; + argv++; + argc--; + continue; + } + + /* + * Only set the modes (including the sticky bit) if we + * had no problems. It is not an error for the chmod(2) + * to fail, but do issue a warning. + */ + if (chmod(argv[1], FILE_MODE) < 0) + (void) fprintf(stderr, gettext( + "warning: couldn't set mode to %#o\n"), FILE_MODE); + + argv++; + argc--; + } + return (errors); +} + +static void usage() +{ + (void) fprintf(stderr, gettext( + "Usage: mkfile [-nv] [g|k|b|m] [] ...\n")); + exit(1); + /* NOTREACHED */ +} diff --git a/tests/zfs-tests/cmd/mkfiles/.gitignore b/tests/zfs-tests/cmd/mkfiles/.gitignore new file mode 100644 index 000000000000..cee4858b701b --- /dev/null +++ b/tests/zfs-tests/cmd/mkfiles/.gitignore @@ -0,0 +1 @@ +/mkfiles diff --git a/tests/zfs-tests/cmd/mkfiles/Makefile.am b/tests/zfs-tests/cmd/mkfiles/Makefile.am new file mode 100644 index 000000000000..54c21597f3eb --- /dev/null +++ b/tests/zfs-tests/cmd/mkfiles/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mkfiles +mkfiles_SOURCES = mkfiles.c diff --git a/tests/zfs-tests/cmd/mkfiles/mkfiles.c b/tests/zfs-tests/cmd/mkfiles/mkfiles.c new file mode 100644 index 000000000000..32abfd0c3d67 --- /dev/null +++ b/tests/zfs-tests/cmd/mkfiles/mkfiles.c @@ -0,0 +1,66 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. 
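mkfiles, whose source follows, creates a numbered run of files with O_EXCL so that leftovers from a previous run fail loudly. One detail worth flagging: the original open() call passes O_RDWR in the mode argument rather than as an access flag. A sketch of the presumably intended call, keeping the test's name format:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int
    make_one(const char *base, unsigned int i)
    {
            char path[1024];
            int fd;

            (void) snprintf(path, sizeof (path), "%s%u", base, i);
            /* Access flags and permission bits kept separate. */
            if ((fd = open(path, O_CREAT | O_EXCL | O_RDWR, 0666)) == -1)
                    return (-1);
            return (close(fd));
    }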
+ */ + +#include +#include +#include +#include +#include +#include +#include + +#define MAX_INT_LENGTH 10 + +static void +usage(char *msg, int exit_value) +{ + (void) fprintf(stderr, "mkfiles basename max_file [min_file]\n"); + (void) fprintf(stderr, "%s\n", msg); + exit(exit_value); +} + +int +main(int argc, char **argv) +{ + unsigned int numfiles = 0; + unsigned int first_file = 0; + unsigned int i; + char buf[MAXPATHLEN]; + + if (argc < 3 || argc > 4) + usage("Invalid number of arguments", -1); + + if (sscanf(argv[2], "%u", &numfiles) != 1) + usage("Invalid maximum file", -2); + + if (argc == 4 && sscanf(argv[3], "%u", &first_file) != 1) + usage("Invalid first file", -3); + + for (i = first_file; i < first_file + numfiles; i++) { + int fd; + (void) snprintf(buf, MAXPATHLEN, "%s%u", argv[1], i); + if ((fd = open(buf, O_CREAT | O_EXCL, O_RDWR)) == -1) { + (void) fprintf(stderr, "Failed to create %s %s\n", buf, + strerror(errno)); + return (-4); + } else if (fchown(fd, getuid(), getgid()) < 0) { + (void) fprintf(stderr, "Failed to chown %s %s\n", buf, + strerror(errno)); + return (-5); + } + (void) close(fd); + } + return (0); +} diff --git a/tests/zfs-tests/cmd/mktree/.gitignore b/tests/zfs-tests/cmd/mktree/.gitignore new file mode 100644 index 000000000000..588bc6d1cce6 --- /dev/null +++ b/tests/zfs-tests/cmd/mktree/.gitignore @@ -0,0 +1 @@ +/mktree diff --git a/tests/zfs-tests/cmd/mktree/Makefile.am b/tests/zfs-tests/cmd/mktree/Makefile.am new file mode 100644 index 000000000000..88c74ae0a346 --- /dev/null +++ b/tests/zfs-tests/cmd/mktree/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mktree +mktree_SOURCES = mktree.c diff --git a/tests/zfs-tests/cmd/mktree/mktree.c b/tests/zfs-tests/cmd/mktree/mktree.c new file mode 100644 index 000000000000..25b26c9e151b --- /dev/null +++ b/tests/zfs-tests/cmd/mktree/mktree.c @@ -0,0 +1,191 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
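mktree, below, fans a directory tree out recursively: each level creates ndir subdirectories, recurses into them, and drops nfile files along the way. A compact sketch of the recursion, with the name format and a crtfile()-style helper left as placeholders:

    #include <stdio.h>
    #include <sys/stat.h>
    #include <sys/types.h>

    static void
    build(const char *base, int level, int nlevel, int ndir)
    {
            char path[4096];

            if (level > nlevel)
                    return;
            for (int d = 0; d < ndir; d++) {
                    (void) snprintf(path, sizeof (path), "%s/D-l%dd%d",
                        base, level, d);
                    (void) mkdir(path, 0777);  /* a sketch ignores EEXIST */
                    build(path, level + 1, nlevel, ndir);
            }
            /* files would be created here with a crtfile()-style helper */
    }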
+ */ + +#include +#include +#include +#include +#include +#include +#ifdef __linux__ +#include +#endif +#include +#include +#include + +#define TYPE_D 'D' +#define TYPE_F 'F' + +static char fdname[MAXPATHLEN] = {0}; +static char *pbasedir = NULL; +static int nlevel = 2; +static int ndir = 2; +static int nfile = 2; + +static void usage(char *this); +static void crtfile(char *pname); +static char *getfdname(char *pdir, char type, int level, int dir, int file); +static int mktree(char *pbasedir, int level); + +int +main(int argc, char *argv[]) +{ + int c, ret; + + while ((c = getopt(argc, argv, "b:l:d:f:")) != -1) { + switch (c) { + case 'b': + pbasedir = optarg; + break; + case 'l': + nlevel = atoi(optarg); + break; + case 'd': + ndir = atoi(optarg); + break; + case 'f': + nfile = atoi(optarg); + break; + case '?': + usage(argv[0]); + } + } + if (nlevel < 0 || ndir < 0 || nfile < 0 || pbasedir == NULL) { + usage(argv[0]); + } + + ret = mktree(pbasedir, 1); + + return (ret); +} + +static void +usage(char *this) +{ + (void) fprintf(stderr, + "\tUsage: %s -b -l [nlevel] -d [ndir] -f [nfile]\n", + this); + exit(1); +} + +static int +mktree(char *pdir, int level) +{ + int d, f; + char dname[MAXPATHLEN] = {0}; + char fname[MAXPATHLEN] = {0}; + + if (level > nlevel) { + return (1); + } + + for (d = 0; d < ndir; d++) { + (void) memset(dname, '\0', sizeof (dname)); + (void) strcpy(dname, getfdname(pdir, TYPE_D, level, d, 0)); + + if (mkdir(dname, 0777) != 0) { + (void) fprintf(stderr, "mkdir(%s) failed." + "\n[%d]: %s.\n", + dname, errno, strerror(errno)); + exit(errno); + } + + /* + * No sub-directory need be created, only create files in it. + */ + if (mktree(dname, level+1) != 0) { + for (f = 0; f < nfile; f++) { + (void) memset(fname, '\0', sizeof (fname)); + (void) strcpy(fname, + getfdname(dname, TYPE_F, level+1, d, f)); + crtfile(fname); + } + } + } + + for (f = 0; f < nfile; f++) { + (void) memset(fname, '\0', sizeof (fname)); + (void) strcpy(fname, getfdname(pdir, TYPE_F, level, d, f)); + crtfile(fname); + } + + return (0); +} + +static char * +getfdname(char *pdir, char type, int level, int dir, int file) +{ + size_t size = sizeof (fdname); + if (snprintf(fdname, size, "%s/%c-l%dd%df%d", pdir, type, level, dir, + file) >= size) { + (void) fprintf(stderr, "fdname truncated\n"); + exit(EINVAL); + } + return (fdname); +} + +static void +crtfile(char *pname) +{ + int fd = -1; + int i, size; + char *context = "0123456789ABCDF"; + char *pbuf; + + if (pname == NULL) { + exit(1); + } + + size = sizeof (char) * 1024; + pbuf = (char *)valloc(size); + for (i = 0; i < size / strlen(context); i++) { + int offset = i * strlen(context); + (void) snprintf(pbuf+offset, size-offset, "%s", context); + } + + if ((fd = open(pname, O_CREAT|O_RDWR, 0777)) < 0) { + (void) fprintf(stderr, "open(%s, O_CREAT|O_RDWR, 0777) failed." + "\n[%d]: %s.\n", pname, errno, strerror(errno)); + exit(errno); + } + if (write(fd, pbuf, 1024) < 1024) { + (void) fprintf(stderr, "write(fd, pbuf, 1024) failed." 
+ "\n[%d]: %s.\n", errno, strerror(errno)); + exit(errno); + } + +#ifdef __linux__ + if (fsetxattr(fd, "user.xattr", pbuf, 1024, 0) < 0) { + (void) fprintf(stderr, "fsetxattr(fd, \"xattr\", pbuf, " + "1024, 0) failed.\n[%d]: %s.\n", errno, strerror(errno)); + exit(errno); + } +#endif + + (void) close(fd); + free(pbuf); +} diff --git a/tests/zfs-tests/cmd/mmap_exec/.gitignore b/tests/zfs-tests/cmd/mmap_exec/.gitignore new file mode 100644 index 000000000000..63a68bbc681e --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_exec/.gitignore @@ -0,0 +1 @@ +/mmap_exec diff --git a/tests/zfs-tests/cmd/mmap_exec/Makefile.am b/tests/zfs-tests/cmd/mmap_exec/Makefile.am new file mode 100644 index 000000000000..ab9f81be9463 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_exec/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mmap_exec +mmap_exec_SOURCES = mmap_exec.c diff --git a/tests/zfs-tests/cmd/mmap_exec/mmap_exec.c b/tests/zfs-tests/cmd/mmap_exec/mmap_exec.c new file mode 100644 index 000000000000..db90adbdca10 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_exec/mmap_exec.c @@ -0,0 +1,72 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2013 by Delphix. All rights reserved. 
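mmap_exec, below, probes whether a binary's backing filesystem allows executable mappings: mapping with PROT_EXEC fails (typically with EPERM) when the dataset is mounted noexec. A self-contained sketch of the probe, assuming len covers at least one byte of the file:

    #include <sys/mman.h>
    #include <fcntl.h>
    #include <unistd.h>

    static int
    can_map_exec(const char *path, size_t len)
    {
            int fd = open(path, O_RDONLY);
            void *p;

            if (fd < 0)
                    return (0);
            p = mmap(NULL, len, PROT_EXEC, MAP_SHARED, fd, 0);
            (void) close(fd);
            if (p == MAP_FAILED)
                    return (0);  /* e.g. noexec mount refused the mapping */
            (void) munmap(p, len);
            return (1);
    }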
+ */ + +#include +#include +#include +#include +#include +#include + +int +main(int argc, char *argv[]) +{ + int error, fd; + struct stat statbuf; + + if (argc != 2) { + (void) printf("Error: missing binary name.\n"); + (void) printf("Usage:\n\t%s \n", + argv[0]); + return (1); + } + + errno = 0; + + if ((fd = open(argv[1], O_RDONLY)) < 0) { + error = errno; + perror("open"); + return (error); + } + if (fstat(fd, &statbuf) < 0) { + error = errno; + perror("fstat"); + return (error); + } + + if (mmap(0, statbuf.st_size, + PROT_EXEC, MAP_SHARED, fd, 0) == MAP_FAILED) { + error = errno; + perror("mmap"); + return (error); + } + + return (0); +} diff --git a/tests/zfs-tests/cmd/mmap_libaio/.gitignore b/tests/zfs-tests/cmd/mmap_libaio/.gitignore new file mode 100644 index 000000000000..792c8d3400b0 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_libaio/.gitignore @@ -0,0 +1 @@ +/mmap_libaio diff --git a/tests/zfs-tests/cmd/mmap_libaio/Makefile.am b/tests/zfs-tests/cmd/mmap_libaio/Makefile.am new file mode 100644 index 000000000000..25f9dda2b623 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_libaio/Makefile.am @@ -0,0 +1,10 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +if WANT_MMAP_LIBAIO +pkgexec_PROGRAMS = mmap_libaio +mmap_libaio_SOURCES = mmap_libaio.c +mmap_libaio_CFLAGS = $(AM_CFLAGS) $(LIBAIO_CFLAGS) +mmap_libaio_LDADD = $(LIBAIO_LIBS) +endif diff --git a/tests/zfs-tests/cmd/mmap_libaio/mmap_libaio.c b/tests/zfs-tests/cmd/mmap_libaio/mmap_libaio.c new file mode 100644 index 000000000000..21119ebca9d6 --- /dev/null +++ b/tests/zfs-tests/cmd/mmap_libaio/mmap_libaio.c @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2018 Canonical. All rights reserved. 
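mmap_libaio, below, drives Linux libaio against a buffer that is itself an mmap of the target file. Its do_sync_io() turns one asynchronous request into a synchronous step: submit a single iocb, then reap exactly one completion. A sketch of that pattern, assuming <libaio.h> and linking with -laio; the min_nr of 1 here is a slight variation on the test's 0:

    #include <libaio.h>
    #include <time.h>
    #include <err.h>

    static void
    one_write(io_context_t ctx, int fd, void *buf, size_t len)
    {
            struct iocb cb, *cbs[] = { &cb };
            struct io_event ev;
            struct timespec ts = { 30, 0 };  /* generous reap timeout */

            io_prep_pwrite(&cb, fd, buf, len, 0);
            if (io_submit(ctx, 1, cbs) != 1)
                    err(1, "io_submit");
            if (io_getevents(ctx, 1, 1, &ev, &ts) != 1)
                    err(1, "io_getevents");
    }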
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +io_context_t io_ctx; + +static void +do_sync_io(struct iocb *iocb) +{ + struct io_event event; + struct iocb *iocbs[] = { iocb }; + struct timespec ts = { 30, 0 }; + + if (io_submit(io_ctx, 1, iocbs) != 1) + err(1, "io_submit failed"); + + if (io_getevents(io_ctx, 0, 1, &event, &ts) != 1) + err(1, "io_getevents failed"); +} + +int +main(int argc, char **argv) +{ + char *buf; + int page_size = getpagesize(); + int buf_size = strtol(argv[2], NULL, 0); + int rwfd; + struct iocb iocb; + + if (io_queue_init(1024, &io_ctx)) + err(1, "io_queue_init failed"); + + rwfd = open(argv[1], O_RDWR | O_CREAT, S_IRUSR | S_IWUSR); + if (rwfd < 0) + err(1, "open failed"); + + if (ftruncate(rwfd, buf_size) < 0) + err(1, "ftruncate failed"); + + buf = mmap(0, page_size, PROT_READ | PROT_WRITE, MAP_SHARED, rwfd, 0); + if (buf == MAP_FAILED) + err(1, "mmap failed"); + + (void) io_prep_pwrite(&iocb, rwfd, buf, buf_size, 0); + do_sync_io(&iocb); + + (void) io_prep_pread(&iocb, rwfd, buf, buf_size, 0); + do_sync_io(&iocb); + + if (close(rwfd)) + err(1, "close failed"); + + if (io_queue_release(io_ctx) != 0) + err(1, "io_queue_release failed"); + + return (0); +} diff --git a/tests/zfs-tests/cmd/mmapwrite/.gitignore b/tests/zfs-tests/cmd/mmapwrite/.gitignore new file mode 100644 index 000000000000..4e7043bbfd58 --- /dev/null +++ b/tests/zfs-tests/cmd/mmapwrite/.gitignore @@ -0,0 +1 @@ +/mmapwrite diff --git a/tests/zfs-tests/cmd/mmapwrite/Makefile.am b/tests/zfs-tests/cmd/mmapwrite/Makefile.am new file mode 100644 index 000000000000..b21b9e779bf2 --- /dev/null +++ b/tests/zfs-tests/cmd/mmapwrite/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = mmapwrite +mmapwrite_SOURCES = mmapwrite.c +mmapwrite_LDADD = -lpthread diff --git a/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c new file mode 100644 index 000000000000..458d6d8e402b --- /dev/null +++ b/tests/zfs-tests/cmd/mmapwrite/mmapwrite.c @@ -0,0 +1,162 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * -------------------------------------------------------------------- + * Bug Issue Id: #7512 + * The bug time sequence: + * 1. context #1, zfs_write assign a txg "n". + * 2. In the same process, context #2, mmap page fault (which means the mm_sem + * is hold) occurred, zfs_dirty_inode open a txg failed, and wait previous + * txg "n" completed. + * 3. 
context #1 call uiomove to write, however page fault is occurred in + * uiomove, which means it needs mm_sem, but mm_sem is hold by + * context #2, so it stuck and can't complete, then txg "n" will not + * complete. + * + * So context #1 and context #2 trap into the "dead lock". + * -------------------------------------------------------------------- + */ + +#define NORMAL_WRITE_TH_NUM 2 + +static void * +normal_writer(void *filename) +{ + char *file_path = filename; + int fd = -1; + ssize_t write_num = 0; + int page_size = getpagesize(); + + fd = open(file_path, O_RDWR | O_CREAT, 0777); + if (fd == -1) { + err(1, "failed to open %s", file_path); + } + + char *buf = malloc(1); + while (1) { + write_num = write(fd, buf, 1); + if (write_num == 0) { + err(1, "write failed!"); + break; + } + lseek(fd, page_size, SEEK_CUR); + } + + if (buf) { + free(buf); + } +} + +static void * +map_writer(void *filename) +{ + int fd = -1; + int ret = 0; + char *buf = NULL; + int page_size = getpagesize(); + int op_errno = 0; + char *file_path = filename; + + while (1) { + ret = access(file_path, F_OK); + if (ret) { + op_errno = errno; + if (op_errno == ENOENT) { + fd = open(file_path, O_RDWR | O_CREAT, 0777); + if (fd == -1) { + err(1, "open file failed"); + } + + ret = ftruncate(fd, page_size); + if (ret == -1) { + err(1, "truncate file failed"); + } + } else { + err(1, "access file failed!"); + } + } else { + fd = open(file_path, O_RDWR, 0777); + if (fd == -1) { + err(1, "open file failed"); + } + } + + if ((buf = mmap(NULL, page_size, PROT_READ | PROT_WRITE, + MAP_SHARED, fd, 0)) == MAP_FAILED) { + err(1, "map file failed"); + } + + if (fd != -1) + close(fd); + + char s[10] = {0, }; + memcpy(buf, s, 10); + ret = munmap(buf, page_size); + if (ret != 0) { + err(1, "unmap file failed"); + } + } +} + +int +main(int argc, char **argv) +{ + pthread_t map_write_tid; + pthread_t normal_write_tid[NORMAL_WRITE_TH_NUM]; + int i = 0; + + if (argc != 3) { + (void) printf("usage: %s " + "\n", argv[0]); + exit(1); + } + + for (i = 0; i < NORMAL_WRITE_TH_NUM; i++) { + if (pthread_create(&normal_write_tid[i], NULL, normal_writer, + argv[1])) { + err(1, "pthread_create normal_writer failed."); + } + } + + if (pthread_create(&map_write_tid, NULL, map_writer, argv[2])) { + err(1, "pthread_create map_writer failed."); + } + + /* NOTREACHED */ + pthread_join(map_write_tid, NULL); + return (0); +} diff --git a/tests/zfs-tests/cmd/nvlist_to_lua/.gitignore b/tests/zfs-tests/cmd/nvlist_to_lua/.gitignore new file mode 100644 index 000000000000..b31db6454dce --- /dev/null +++ b/tests/zfs-tests/cmd/nvlist_to_lua/.gitignore @@ -0,0 +1 @@ +/nvlist_to_lua diff --git a/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am b/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am new file mode 100644 index 000000000000..511b6c6913bb --- /dev/null +++ b/tests/zfs-tests/cmd/nvlist_to_lua/Makefile.am @@ -0,0 +1,10 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = nvlist_to_lua + +nvlist_to_lua_SOURCES = nvlist_to_lua.c +nvlist_to_lua_LDADD = \ + $(abs_top_builddir)/lib/libzfs_core/libzfs_core.la \ + $(abs_top_builddir)/lib/libnvpair/libnvpair.la diff --git a/tests/zfs-tests/cmd/nvlist_to_lua/nvlist_to_lua.c b/tests/zfs-tests/cmd/nvlist_to_lua/nvlist_to_lua.c new file mode 100644 index 000000000000..b130d667f231 --- /dev/null +++ b/tests/zfs-tests/cmd/nvlist_to_lua/nvlist_to_lua.c @@ -0,0 +1,305 @@ +/* + * CDDL HEADER START + * + * This file and its contents are supplied under the terms of the + * 
Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2016 by Delphix. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include + +nvlist_t *nvl; +const char *pool; +boolean_t unexpected_failures; + +static boolean_t +nvlist_equal(nvlist_t *nvla, nvlist_t *nvlb) +{ + if (fnvlist_num_pairs(nvla) != fnvlist_num_pairs(nvlb)) + return (B_FALSE); + /* + * The nvlists have the same number of pairs and keys are unique, so + * if every key in A is also in B and assigned to the same value, the + * lists are identical. + */ + for (nvpair_t *pair = nvlist_next_nvpair(nvla, NULL); + pair != NULL; pair = nvlist_next_nvpair(nvla, pair)) { + char *key = nvpair_name(pair); + + if (!nvlist_exists(nvlb, key)) + return (B_FALSE); + + if (nvpair_type(pair) != + nvpair_type(fnvlist_lookup_nvpair(nvlb, key))) + return (B_FALSE); + + switch (nvpair_type(pair)) { + case DATA_TYPE_BOOLEAN_VALUE: + if (fnvpair_value_boolean_value(pair) != + fnvlist_lookup_boolean_value(nvlb, key)) { + return (B_FALSE); + } + break; + case DATA_TYPE_STRING: + if (strcmp(fnvpair_value_string(pair), + fnvlist_lookup_string(nvlb, key))) { + return (B_FALSE); + } + break; + case DATA_TYPE_INT64: + if (fnvpair_value_int64(pair) != + fnvlist_lookup_int64(nvlb, key)) { + return (B_FALSE); + } + break; + case DATA_TYPE_NVLIST: + if (!nvlist_equal(fnvpair_value_nvlist(pair), + fnvlist_lookup_nvlist(nvlb, key))) { + return (B_FALSE); + } + break; + default: + (void) printf("Unexpected type for nvlist_equal\n"); + return (B_FALSE); + } + } + return (B_TRUE); +} + +static void +test(const char *testname, boolean_t expect_success, boolean_t expect_match) +{ + char *progstr = "input = ...; return {output=input}"; + + nvlist_t *outnvl; + + (void) printf("\nrunning test '%s'; input:\n", testname); + dump_nvlist(nvl, 4); + + int err = lzc_channel_program(pool, progstr, + 10 * 1000 * 1000, 10 * 1024 * 1024, nvl, &outnvl); + + (void) printf("lzc_channel_program returned %u\n", err); + dump_nvlist(outnvl, 5); + + if (err == 0 && expect_match) { + /* + * Verify that outnvl is the same as input nvl, if we expect + * them to be. The input and output will never match if the + * input contains an array (since arrays are converted to lua + * tables), so this is only asserted for some test cases. 
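The harness above feeds each nvlist through a one-line Lua channel program that simply echoes its input, then compares the nvlist that comes back. A hedged sketch of the call, with the pool name a placeholder and the instruction and memory limits mirroring the test:

    #include <libzfs_core.h>
    #include <libnvpair.h>

    static int
    roundtrip(const char *pool, nvlist_t *in, nvlist_t **out)
    {
            const char *prog = "input = ...; return {output=input}";

            return (lzc_channel_program(pool, prog,
                10 * 1000 * 1000,   /* Lua instruction limit */
                10 * 1024 * 1024,   /* Lua memory limit */
                in, out));
    }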
+ */ + nvlist_t *real_outnvl = fnvlist_lookup_nvlist(outnvl, "return"); + real_outnvl = fnvlist_lookup_nvlist(real_outnvl, "output"); + if (!nvlist_equal(nvl, real_outnvl)) { + unexpected_failures = B_TRUE; + (void) printf("unexpected input/output mismatch for " + "case: %s\n", testname); + } + } + if (err != 0 && expect_success) { + unexpected_failures = B_TRUE; + (void) printf("unexpected FAIL of case: %s\n", testname); + } + + fnvlist_free(nvl); + nvl = fnvlist_alloc(); +} + +static void +run_tests(void) +{ + const char *key = "key"; + + /* Note: maximum nvlist key length is 32KB */ + int len = 1024 * 31; + char *bigstring = malloc(len); + for (int i = 0; i < len; i++) + bigstring[i] = 'a' + i % 26; + bigstring[len - 1] = '\0'; + + nvl = fnvlist_alloc(); + + fnvlist_add_boolean(nvl, key); + test("boolean", B_TRUE, B_FALSE); + + fnvlist_add_boolean_value(nvl, key, B_TRUE); + test("boolean_value", B_FALSE, B_FALSE); + + fnvlist_add_byte(nvl, key, 1); + test("byte", B_FALSE, B_FALSE); + + fnvlist_add_int8(nvl, key, 1); + test("int8", B_FALSE, B_FALSE); + + fnvlist_add_uint8(nvl, key, 1); + test("uint8", B_FALSE, B_FALSE); + + fnvlist_add_int16(nvl, key, 1); + test("int16", B_FALSE, B_FALSE); + + fnvlist_add_uint16(nvl, key, 1); + test("uint16", B_FALSE, B_FALSE); + + fnvlist_add_int32(nvl, key, 1); + test("int32", B_FALSE, B_FALSE); + + fnvlist_add_uint32(nvl, key, 1); + test("uint32", B_FALSE, B_FALSE); + + fnvlist_add_int64(nvl, key, 1); + test("int64", B_TRUE, B_TRUE); + + fnvlist_add_uint64(nvl, key, 1); + test("uint64", B_FALSE, B_FALSE); + + fnvlist_add_string(nvl, key, "1"); + test("string", B_TRUE, B_TRUE); + + + { + nvlist_t *val = fnvlist_alloc(); + fnvlist_add_string(val, "subkey", "subvalue"); + fnvlist_add_nvlist(nvl, key, val); + fnvlist_free(val); + test("nvlist", B_TRUE, B_TRUE); + } + { + boolean_t val[2] = { B_FALSE, B_TRUE }; + fnvlist_add_boolean_array(nvl, key, val, 2); + test("boolean_array", B_FALSE, B_FALSE); + } + { + uchar_t val[2] = { 0, 1 }; + fnvlist_add_byte_array(nvl, key, val, 2); + test("byte_array", B_FALSE, B_FALSE); + } + { + int8_t val[2] = { 0, 1 }; + fnvlist_add_int8_array(nvl, key, val, 2); + test("int8_array", B_FALSE, B_FALSE); + } + { + uint8_t val[2] = { 0, 1 }; + fnvlist_add_uint8_array(nvl, key, val, 2); + test("uint8_array", B_FALSE, B_FALSE); + } + { + int16_t val[2] = { 0, 1 }; + fnvlist_add_int16_array(nvl, key, val, 2); + test("int16_array", B_FALSE, B_FALSE); + } + { + uint16_t val[2] = { 0, 1 }; + fnvlist_add_uint16_array(nvl, key, val, 2); + test("uint16_array", B_FALSE, B_FALSE); + } + { + int32_t val[2] = { 0, 1 }; + fnvlist_add_int32_array(nvl, key, val, 2); + test("int32_array", B_FALSE, B_FALSE); + } + { + uint32_t val[2] = { 0, 1 }; + fnvlist_add_uint32_array(nvl, key, val, 2); + test("uint32_array", B_FALSE, B_FALSE); + } + { + int64_t val[2] = { 0, 1 }; + fnvlist_add_int64_array(nvl, key, val, 2); + test("int64_array", B_TRUE, B_FALSE); + } + { + uint64_t val[2] = { 0, 1 }; + fnvlist_add_uint64_array(nvl, key, val, 2); + test("uint64_array", B_FALSE, B_FALSE); + } + { + char *const val[2] = { "0", "1" }; + fnvlist_add_string_array(nvl, key, val, 2); + test("string_array", B_TRUE, B_FALSE); + } + { + nvlist_t *val[2]; + val[0] = fnvlist_alloc(); + fnvlist_add_string(val[0], "subkey", "subvalue"); + val[1] = fnvlist_alloc(); + fnvlist_add_string(val[1], "subkey2", "subvalue2"); + fnvlist_add_nvlist_array(nvl, key, val, 2); + fnvlist_free(val[0]); + fnvlist_free(val[1]); + test("nvlist_array", B_FALSE, B_FALSE); + } + { + 
fnvlist_add_string(nvl, bigstring, "1"); + test("large_key", B_TRUE, B_TRUE); + } + { + fnvlist_add_string(nvl, key, bigstring); + test("large_value", B_TRUE, B_TRUE); + } + { + for (int i = 0; i < 1024; i++) { + char buf[32]; + (void) snprintf(buf, sizeof (buf), "key-%u", i); + fnvlist_add_int64(nvl, buf, i); + } + test("many_keys", B_TRUE, B_TRUE); + } +#ifndef __sparc__ + { + for (int i = 0; i < 10; i++) { + nvlist_t *newval = fnvlist_alloc(); + fnvlist_add_nvlist(newval, "key", nvl); + fnvlist_free(nvl); + nvl = newval; + } + test("deeply_nested_pos", B_TRUE, B_TRUE); + } + { + for (int i = 0; i < 90; i++) { + nvlist_t *newval = fnvlist_alloc(); + fnvlist_add_nvlist(newval, "key", nvl); + fnvlist_free(nvl); + nvl = newval; + } + test("deeply_nested_neg", B_FALSE, B_FALSE); + } +#endif + free(bigstring); + fnvlist_free(nvl); +} + +int +main(int argc, const char *argv[]) +{ + (void) libzfs_core_init(); + + if (argc != 2) { + (void) printf("usage: %s \n", + argv[0]); + exit(2); + } + pool = argv[1]; + + run_tests(); + + libzfs_core_fini(); + return (unexpected_failures); +} diff --git a/tests/zfs-tests/cmd/randfree_file/.gitignore b/tests/zfs-tests/cmd/randfree_file/.gitignore new file mode 100644 index 000000000000..0f5b394c5fbd --- /dev/null +++ b/tests/zfs-tests/cmd/randfree_file/.gitignore @@ -0,0 +1 @@ +/randfree_file diff --git a/tests/zfs-tests/cmd/randfree_file/Makefile.am b/tests/zfs-tests/cmd/randfree_file/Makefile.am new file mode 100644 index 000000000000..6306e0e75740 --- /dev/null +++ b/tests/zfs-tests/cmd/randfree_file/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = randfree_file +randfree_file_SOURCES = randfree_file.c diff --git a/tests/zfs-tests/cmd/randfree_file/randfree_file.c b/tests/zfs-tests/cmd/randfree_file/randfree_file.c new file mode 100644 index 000000000000..c708d647e8b9 --- /dev/null +++ b/tests/zfs-tests/cmd/randfree_file/randfree_file.c @@ -0,0 +1,125 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. 
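randfree_file, whose source follows, writes a fully allocated file and then deallocates a chosen range with fallocate(2). A sketch of the Linux-specific hole punch, guarded the same way the test guards it; FALLOC_FL_KEEP_SIZE keeps st_size unchanged while the blocks are freed:

    #define _GNU_SOURCE
    #include <fcntl.h>

    static int
    punch(int fd, off_t off, off_t len)
    {
    #if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)
            /* Deallocate the range but leave the file length alone. */
            return (fallocate(fd,
                FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, off, len));
    #else
            return (-1);  /* hole punching unsupported on this platform */
    #endif
    }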
+ */ + +#include "../file_common.h" +#include +#include +#include +#include +#include + +/* + * Create a file with assigned size and then free the specified + * section of the file + */ + +static void usage(char *progname); + +static void +usage(char *progname) +{ + (void) fprintf(stderr, + "usage: %s [-l filesize] [-s start-offset]" + "[-n section-len] filename\n", progname); + exit(1); +} + +int +main(int argc, char *argv[]) +{ + char *filename = NULL; + char *buf = NULL; + size_t filesize = 0; + off_t start_off = 0; + off_t off_len = 0; + int fd, ch; + mode_t mode = S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH; + + while ((ch = getopt(argc, argv, "l:s:n:")) != EOF) { + switch (ch) { + case 'l': + filesize = atoll(optarg); + break; + case 's': + start_off = atoll(optarg); + break; + case 'n': + off_len = atoll(optarg); + break; + default: + usage(argv[0]); + break; + } + } + + if (optind == argc - 1) + filename = argv[optind]; + else + usage(argv[0]); + + if ((fd = open(filename, O_RDWR | O_CREAT | O_TRUNC, mode)) < 0) { + perror("open"); + return (1); + } + + buf = (char *)calloc(1, filesize); + if (buf == NULL) { + perror("write"); + close(fd); + return (1); + } + memset(buf, 'c', filesize); + + if (write(fd, buf, filesize) < filesize) { + free(buf); + perror("write"); + close(fd); + return (1); + } + + free(buf); + +#if defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) + if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + start_off, off_len) < 0) { + perror("fallocate"); + close(fd); + return (1); + } +#else /* !(defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE)) */ + { + perror("FALLOC_FL_PUNCH_HOLE unsupported"); + close(fd); + return (1); + } +#endif /* defined(FALLOC_FL_PUNCH_HOLE) && defined(FALLOC_FL_KEEP_SIZE) */ + close(fd); + return (0); +} diff --git a/tests/zfs-tests/cmd/randwritecomp/.gitignore b/tests/zfs-tests/cmd/randwritecomp/.gitignore new file mode 100644 index 000000000000..fb231c678cb2 --- /dev/null +++ b/tests/zfs-tests/cmd/randwritecomp/.gitignore @@ -0,0 +1 @@ +/randwritecomp diff --git a/tests/zfs-tests/cmd/randwritecomp/Makefile.am b/tests/zfs-tests/cmd/randwritecomp/Makefile.am new file mode 100644 index 000000000000..0002291fa7bf --- /dev/null +++ b/tests/zfs-tests/cmd/randwritecomp/Makefile.am @@ -0,0 +1,9 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +DEFAULT_INCLUDES += \ + -I$(top_srcdir)/include + +pkgexec_PROGRAMS = randwritecomp +randwritecomp_SOURCES = randwritecomp.c diff --git a/tests/zfs-tests/cmd/randwritecomp/randwritecomp.c b/tests/zfs-tests/cmd/randwritecomp/randwritecomp.c new file mode 100644 index 000000000000..708d5ee90511 --- /dev/null +++ b/tests/zfs-tests/cmd/randwritecomp/randwritecomp.c @@ -0,0 +1,194 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2017 by Delphix. All rights reserved. + */ + +/* + * The following is defined so the source can use + * lrand48() and srand48(). + */ +#define __EXTENSIONS__ + +#include +#include +#include "../file_common.h" + +/* + * The following sample was derived from real-world data + * of a production Oracle database. 
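randwritecomp, below, biases each written block's compressibility by sampling from the histogram that follows: draw a uniform value in [0, total) and walk the buckets until the running sum passes it, so the bucket index decides how much of the block is zeroed. A sketch of that cumulative-sum walk:

    #include <stdint.h>
    #include <stdlib.h>

    static int
    pick_bucket(const uint64_t *dist, int nbuckets, uint64_t total)
    {
            uint64_t rv = (uint64_t)lrand48() % total;
            uint64_t sum = 0;
            int i;

            for (i = 0; i < nbuckets; i++) {
                    sum += dist[i];
                    if (rv < sum)
                            break;  /* rv fell into bucket i */
            }
            return (i);
    }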
+ */ +static uint64_t size_distribution[] = { + 0, + 1499018, + 352084, + 1503485, + 4206227, + 5626657, + 5387001, + 3733756, + 2233094, + 874652, + 238635, + 81434, + 33357, + 13106, + 2009, + 1, + 23660, +}; + + +static uint64_t distribution_n; + +static uint8_t randbuf[BLOCKSZ]; + +static void +rwc_pwrite(int fd, const void *buf, size_t nbytes, off_t offset) +{ + size_t nleft = nbytes; + ssize_t nwrite = 0; + + nwrite = pwrite(fd, buf, nbytes, offset); + if (nwrite < 0) { + perror("pwrite"); + exit(EXIT_FAILURE); + } + + nleft -= nwrite; + if (nleft != 0) { + (void) fprintf(stderr, "warning: pwrite: " + "wrote %zu out of %zu bytes\n", + (nbytes - nleft), nbytes); + } +} + +static void +fillbuf(char *buf) +{ + uint64_t rv = lrand48() % distribution_n; + uint64_t sum = 0; + + uint64_t i; + for (i = 0; + i < sizeof (size_distribution) / sizeof (size_distribution[0]); + i++) { + sum += size_distribution[i]; + if (rv < sum) + break; + } + + bcopy(randbuf, buf, BLOCKSZ); + if (i == 0) + bzero(buf, BLOCKSZ - 10); + else if (i < 16) + bzero(buf, BLOCKSZ - i * 512 + 256); + /*LINTED: E_BAD_PTR_CAST_ALIGN*/ + ((uint32_t *)buf)[0] = lrand48(); +} + +static void +exit_usage(void) +{ + (void) printf("usage: "); + (void) printf("randwritecomp [-s] [nwrites]\n"); + exit(EXIT_FAILURE); +} + +static void +sequential_writes(int fd, char *buf, uint64_t nblocks, int64_t n) +{ + for (int64_t i = 0; n == -1 || i < n; i++) { + fillbuf(buf); + + static uint64_t j = 0; + if (j == 0) + j = lrand48() % nblocks; + rwc_pwrite(fd, buf, BLOCKSZ, j * BLOCKSZ); + j++; + if (j >= nblocks) + j = 0; + } +} + +static void +random_writes(int fd, char *buf, uint64_t nblocks, int64_t n) +{ + for (int64_t i = 0; n == -1 || i < n; i++) { + fillbuf(buf); + rwc_pwrite(fd, buf, BLOCKSZ, (lrand48() % nblocks) * BLOCKSZ); + } +} + +int +main(int argc, char *argv[]) +{ + int fd, err; + char *filename = NULL; + char buf[BLOCKSZ]; + struct stat ss; + uint64_t nblocks; + int64_t n = -1; + int sequential = 0; + + if (argc < 2) + exit_usage(); + + argv++; + if (strcmp("-s", argv[0]) == 0) { + sequential = 1; + argv++; + } + + if (argv[0] == NULL) + exit_usage(); + else + filename = argv[0]; + + argv++; + if (argv[0] != NULL) + n = strtoull(argv[0], NULL, 0); + + fd = open(filename, O_RDWR|O_CREAT, 0666); + err = fstat(fd, &ss); + if (err != 0) { + (void) fprintf(stderr, + "error: fstat returned error code %d\n", err); + exit(EXIT_FAILURE); + } + + nblocks = ss.st_size / BLOCKSZ; + if (nblocks == 0) { + (void) fprintf(stderr, "error: " + "file is too small (min allowed size is %d bytes)\n", + BLOCKSZ); + exit(EXIT_FAILURE); + } + + srand48(getpid()); + for (int i = 0; i < BLOCKSZ; i++) + randbuf[i] = lrand48(); + + distribution_n = 0; + for (uint64_t i = 0; + i < sizeof (size_distribution) / sizeof (size_distribution[0]); + i++) { + distribution_n += size_distribution[i]; + } + + if (sequential) + sequential_writes(fd, buf, nblocks, n); + else + random_writes(fd, buf, nblocks, n); + + return (0); +} diff --git a/tests/zfs-tests/cmd/readmmap/.gitignore b/tests/zfs-tests/cmd/readmmap/.gitignore new file mode 100644 index 000000000000..3799193a92be --- /dev/null +++ b/tests/zfs-tests/cmd/readmmap/.gitignore @@ -0,0 +1 @@ +/readmmap diff --git a/tests/zfs-tests/cmd/readmmap/Makefile.am b/tests/zfs-tests/cmd/readmmap/Makefile.am new file mode 100644 index 000000000000..9b735c287e69 --- /dev/null +++ b/tests/zfs-tests/cmd/readmmap/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = 
$(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = readmmap +readmmap_SOURCES = readmmap.c diff --git a/tests/zfs-tests/cmd/readmmap/readmmap.c b/tests/zfs-tests/cmd/readmmap/readmmap.c new file mode 100644 index 000000000000..e21c2c867d9a --- /dev/null +++ b/tests/zfs-tests/cmd/readmmap/readmmap.c @@ -0,0 +1,138 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * -------------------------------------------------------------- + * BugId 5047993 : Getting bad read data. + * + * Usage: readmmap + * + * where: + * filename is an absolute path to the file name. + * + * Return values: + * 1 : error + * 0 : no errors + * -------------------------------------------------------------- + */ +#include +#include +#include +#include +#include +#include +#include + +int +main(int argc, char **argv) +{ + char *filename = "badfile"; + size_t size = 4395; + size_t idx = 0; + char *buf = NULL; + char *map = NULL; + int fd = -1, bytes, retval = 0; + unsigned seed; + + if (argc < 2 || optind == argc) { + (void) fprintf(stderr, + "usage: %s \n", argv[0]); + exit(1); + } + + if ((buf = calloc(1, size)) == NULL) { + perror("calloc"); + exit(1); + } + + filename = argv[optind]; + + (void) remove(filename); + + fd = open(filename, O_RDWR|O_CREAT|O_TRUNC, 0666); + if (fd == -1) { + perror("open to create"); + retval = 1; + goto end; + } + + bytes = write(fd, buf, size); + if (bytes != size) { + (void) printf("short write: %d != %zd\n", bytes, size); + retval = 1; + goto end; + } + + map = mmap(0, size, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); + if (map == MAP_FAILED) { + perror("mmap"); + retval = 1; + goto end; + } + seed = time(NULL); + srandom(seed); + + idx = random() % size; + map[idx] = 1; + + if (msync(map, size, MS_SYNC) != 0) { + perror("msync"); + retval = 1; + goto end; + } + + if (munmap(map, size) != 0) { + perror("munmap"); + retval = 1; + goto end; + } + + bytes = pread(fd, buf, size, 0); + if (bytes != size) { + (void) printf("short read: %d != %zd\n", bytes, size); + retval = 1; + goto end; + } + + if (buf[idx] != 1) { + (void) printf( + "bad data from read! 
got buf[%zd]=%d, expected 1\n", + idx, buf[idx]); + retval = 1; + goto end; + } + + (void) printf("good data from read: buf[%zd]=1\n", idx); +end: + if (fd != -1) { + (void) close(fd); + } + if (buf != NULL) { + free(buf); + } + + return (retval); +} diff --git a/tests/zfs-tests/cmd/rename_dir/.gitignore b/tests/zfs-tests/cmd/rename_dir/.gitignore new file mode 100644 index 000000000000..39a0cb222ad1 --- /dev/null +++ b/tests/zfs-tests/cmd/rename_dir/.gitignore @@ -0,0 +1 @@ +/rename_dir diff --git a/tests/zfs-tests/cmd/rename_dir/Makefile.am b/tests/zfs-tests/cmd/rename_dir/Makefile.am new file mode 100644 index 000000000000..21971cd888fb --- /dev/null +++ b/tests/zfs-tests/cmd/rename_dir/Makefile.am @@ -0,0 +1,6 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = rename_dir +rename_dir_SOURCES = rename_dir.c diff --git a/tests/zfs-tests/cmd/rename_dir/rename_dir.c b/tests/zfs-tests/cmd/rename_dir/rename_dir.c new file mode 100644 index 000000000000..5f80f7229462 --- /dev/null +++ b/tests/zfs-tests/cmd/rename_dir/rename_dir.c @@ -0,0 +1,88 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * Assertion: + * Create two directory trees in zfs filesystem, and rename + * directory across the directory structure. ZFS can handle + * the race situation. 
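rename_dir, below, forks so that the child shuttles a/b/c in and out of the other tree while the parent re-roots the tree rooted at 1; the kernel's rename locking has to keep both namespaces consistent under that pressure. A compressed sketch of the child's half, assuming the documented "mkdir -p 1/2/3/4/5 a/b/c/d/e" setup:

    #include <stdio.h>

    static void
    spin_child(void)
    {
            for (;;) {
                    /* Each rename may lose the race; that is the point. */
                    (void) rename("a/b/c", "1/2/3/c");
                    (void) rename("1/2/3/c", "a/b/c");
            }
    }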
+ */ + +/* + * Need to create the following directory structures before + * running this program: + * + * mkdir -p 1/2/3/4/5 a/b/c/d/e + */ + + +#include +#include +#include +#include + +int +main(int argc, char *argvp[]) +{ + int i = 1; + + switch (fork()) { + case -1: + perror("fork"); + exit(1); + break; + case 0: + while (i > 0) { + int c_count = 0; + if (rename("a/b/c", "1/2/3/c") == 0) + c_count++; + if (rename("1/2/3/c", "a/b/c") == 0) + c_count++; + if (c_count) { + (void) fprintf(stderr, "c_count: %d", c_count); + } + } + break; + default: + while (i > 0) { + int p_count = 0; + if (rename("1", "a/b/c/d/e/1") == 0) + p_count++; + if (rename("a/b/c/d/e/1", "1") == 0) + p_count++; + if (p_count) { + (void) fprintf(stderr, "p_count: %d", p_count); + } + } + break; + } + + return (0); +} diff --git a/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/.gitignore b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/.gitignore new file mode 100644 index 000000000000..fc6323fb3ff3 --- /dev/null +++ b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/.gitignore @@ -0,0 +1 @@ +/rm_lnkcnt_zero_file diff --git a/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile.am b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile.am new file mode 100644 index 000000000000..90fc8d0541b6 --- /dev/null +++ b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = rm_lnkcnt_zero_file +rm_lnkcnt_zero_file_SOURCES = rm_lnkcnt_zero_file.c +rm_lnkcnt_zero_file_LDADD = -lpthread diff --git a/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c new file mode 100644 index 000000000000..e262ecefea92 --- /dev/null +++ b/tests/zfs-tests/cmd/rm_lnkcnt_zero_file/rm_lnkcnt_zero_file.c @@ -0,0 +1,159 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Copyright (c) 2012 by Delphix. All rights reserved. + */ + +/* + * -------------------------------------------------------------------- + * The purpose of this test is to see if the bug reported (#4723351) for + * UFS exists when using a ZFS file system. 
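The test below races three threads (a renamer, a remover, and a rewriter) against one base file and watches for the reported link count going out of range. A sketch of the main thread's watchdog loop; for a regular file under this workload st_nlink should stay within [1, 2]:

    #include <stdio.h>
    #include <unistd.h>
    #include <sys/stat.h>

    static void
    watch(const char *path)
    {
            struct stat st;

            for (;;) {
                    if (stat(path, &st) == 0 &&
                        (st.st_nlink < 1 || st.st_nlink > 2)) {
                            (void) printf("st_nlink = %d\n",
                                (int)st.st_nlink);
                            return;  /* the bug reproduced */
                    }
                    (void) sleep(1);
            }
    }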
+ * -------------------------------------------------------------------- + * + */ +#define _REENTRANT 1 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static char *filebase; + +static int +pickidx(void) +{ + return (random() % 1000); +} + +/* ARGSUSED */ +static void * +mover(void *a) +{ + char buf[256]; + int idx, len, ret; + + len = strlen(filebase) + 5; + + for (;;) { + idx = pickidx(); + (void) snprintf(buf, len, "%s.%03d", filebase, idx); + ret = rename(filebase, buf); + if (ret < 0 && errno != ENOENT) + (void) perror("renaming file"); + } + + return (NULL); +} + +/* ARGSUSED */ +static void * +cleaner(void *a) +{ + char buf[256]; + int idx, len, ret; + + len = strlen(filebase) + 5; + + for (;;) { + idx = pickidx(); + (void) snprintf(buf, len, "%s.%03d", filebase, idx); + ret = remove(buf); + if (ret < 0 && errno != ENOENT) + (void) perror("removing file"); + } + + return (NULL); +} + +static void * +writer(void *a) +{ + int *fd = (int *)a; + int ret; + + for (;;) { + if (*fd != -1) + (void) close (*fd); + + *fd = open(filebase, O_APPEND | O_RDWR | O_CREAT, 0644); + if (*fd == -1) { + perror("fail to open test file, refreshing it"); + continue; + } + + ret = write(*fd, "test\n", 5); + if (ret != 5) + perror("writing file"); + } + + return (NULL); +} + +int +main(int argc, char **argv) +{ + int fd; + pthread_t tid; + + if (argc == 1) { + (void) printf("Usage: %s \n", argv[0]); + exit(-1); + } + + filebase = argv[1]; + fd = open(filebase, O_APPEND | O_RDWR | O_CREAT, 0644); + if (fd < 0) { + perror("creating test file"); + exit(-1); + } + + (void) pthread_setconcurrency(4); /* 3 threads + main */ + (void) pthread_create(&tid, NULL, mover, NULL); + (void) pthread_create(&tid, NULL, cleaner, NULL); + (void) pthread_create(&tid, NULL, writer, (void *) &fd); + + for (;;) { + int ret; + struct stat st; + + ret = stat(filebase, &st); + if (ret == 0 && (st.st_nlink > 2 || st.st_nlink < 1)) { + (void) printf("st.st_nlink = %d, exiting\n", \ + (int)st.st_nlink); + exit(0); + } + (void) sleep(1); + } + + return (0); +} diff --git a/tests/zfs-tests/cmd/stride_dd/.gitignore b/tests/zfs-tests/cmd/stride_dd/.gitignore new file mode 100644 index 000000000000..7c072ee0dec6 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/.gitignore @@ -0,0 +1 @@ +/stride_dd diff --git a/tests/zfs-tests/cmd/stride_dd/Makefile.am b/tests/zfs-tests/cmd/stride_dd/Makefile.am new file mode 100644 index 000000000000..d6f1adbac2b7 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/Makefile.am @@ -0,0 +1,7 @@ +include $(top_srcdir)/config/Rules.am + +pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin + +pkgexec_PROGRAMS = stride_dd +stride_dd_SOURCES = stride_dd.c +stride_dd_LDADD = -lrt diff --git a/tests/zfs-tests/cmd/stride_dd/stride_dd.c b/tests/zfs-tests/cmd/stride_dd/stride_dd.c new file mode 100644 index 000000000000..88bd532923c7 --- /dev/null +++ b/tests/zfs-tests/cmd/stride_dd/stride_dd.c @@ -0,0 +1,214 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright (c) 2018 by Delphix. All rights reserved. 
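stride_dd, whose source follows, reads and writes one block and then advances both file offsets by (stride - 1) blocks, so a stride of 1 degenerates to a plain dd. A sketch of the per-iteration skip:

    #include <unistd.h>
    #include <sys/types.h>

    static int
    skip_stride(int ifd, int ofd, int stride, int bsize)
    {
            off_t gap = (off_t)(stride - 1) * bsize;

            if (stride <= 1)
                    return (0);  /* consecutive blocks, nothing to skip */
            if (lseek(ifd, gap, SEEK_CUR) == -1 ||
                lseek(ofd, gap, SEEK_CUR) == -1)
                    return (-1);
            return (0);
    }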
+ */ + +#include +#include +#include +#include +#include +#include +#include + +static int bsize = 0; +static int count = 0; +static char *ifile = NULL; +static char *ofile = NULL; +static int stride = 0; +static int seek = 0; +static char *execname = "stride_dd"; + +static void usage(void); +static void parse_options(int argc, char *argv[]); + +static void +usage(void) +{ + (void) fprintf(stderr, + "usage: %s -i inputfile -o outputfile -b blocksize -c count \n" + " -s stride [ -k seekblocks]\n" + "\n" + "Simplified version of dd that supports the stride option.\n" + "A stride of n means that for each block written, n - 1 blocks\n" + "are skipped in both the input and output file. A stride of 1\n" + "means that blocks are read and written consecutively.\n" + "All numeric parameters must be integers.\n" + "\n" + " inputfile: File to read from\n" + " outputfile: File to write to\n" + " blocksize: Size of each block to read/write\n" + " count: Number of blocks to read/write\n" + " stride: Read/write a block then skip (stride - 1) blocks\n" + " seekblocks: Number of blocks to skip at start of output\n", + execname); + (void) exit(1); +} + +static void +parse_options(int argc, char *argv[]) +{ + int c; + int errflag = 0; + + execname = argv[0]; + + extern char *optarg; + extern int optind, optopt; + + while ((c = getopt(argc, argv, ":b:c:i:o:s:k:")) != -1) { + switch (c) { + case 'b': + bsize = atoi(optarg); + break; + + case 'c': + count = atoi(optarg); + break; + + case 'i': + ifile = optarg; + break; + + case 'o': + ofile = optarg; + break; + + case 's': + stride = atoi(optarg); + break; + + case 'k': + seek = atoi(optarg); + break; + + case ':': + (void) fprintf(stderr, + "Option -%c requires an operand\n", optopt); + errflag++; + break; + + case '?': + default: + (void) fprintf(stderr, + "Unrecognized option: -%c\n", optopt); + errflag++; + break; + } + + if (errflag) { + (void) usage(); + } + } + + if (bsize <= 0 || count <= 0 || stride <= 0 || ifile == NULL || + ofile == NULL || seek < 0) { + (void) fprintf(stderr, + "Required parameter(s) missing or invalid.\n"); + (void) usage(); + } +} + +int +main(int argc, char *argv[]) +{ + int i; + int ifd; + int ofd; + void *buf; + int c; + + parse_options(argc, argv); + + ifd = open(ifile, O_RDONLY); + if (ifd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ifile); + perror("open"); + exit(2); + } + + ofd = open(ofile, O_WRONLY | O_CREAT, 0666); + if (ofd == -1) { + (void) fprintf(stderr, "%s: %s: ", execname, ofile); + perror("open"); + exit(2); + } + + /* + * We use valloc because some character block devices expect a + * page-aligned buffer. 
+	 */
+	int err = posix_memalign(&buf, 4096, bsize);
+	if (err != 0) {
+		(void) fprintf(stderr,
+		    "%s: %s\n", execname, strerror(err));
+		exit(2);
+	}
+
+	if (seek > 0) {
+		if (lseek(ofd, (off_t)seek * bsize, SEEK_CUR) == -1) {
+			perror("output lseek");
+			exit(2);
+		}
+	}
+
+	for (i = 0; i < count; i++) {
+		c = read(ifd, buf, bsize);
+		if (c != bsize) {
+			if (c < 0) {
+				perror("read");
+			} else {
+				(void) fprintf(stderr,
+				    "%s: unexpected short read, read %d "
+				    "bytes, expected %d\n", execname,
+				    c, bsize);
+			}
+			exit(2);
+		}
+
+		c = write(ofd, buf, bsize);
+		if (c != bsize) {
+			if (c < 0) {
+				perror("write");
+			} else {
+				(void) fprintf(stderr,
+				    "%s: unexpected short write, wrote %d "
+				    "bytes, expected %d\n", execname,
+				    c, bsize);
+			}
+			exit(2);
+		}
+
+		if (stride > 1) {
+			/* skip (stride - 1) blocks in both files */
+			if (lseek(ifd, (off_t)(stride - 1) * bsize,
+			    SEEK_CUR) == -1) {
+				perror("input lseek");
+				exit(2);
+			}
+			if (lseek(ofd, (off_t)(stride - 1) * bsize,
+			    SEEK_CUR) == -1) {
+				perror("output lseek");
+				exit(2);
+			}
+		}
+	}
+	free(buf);
+
+	(void) close(ofd);
+	(void) close(ifd);
+
+	return (0);
+}
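To make the stride arithmetic concrete, a hypothetical invocation (the file names are placeholders):

 $ ./stride_dd -i ./infile -o ./outfile -b 8192 -c 4 -s 3 -k 1

Each iteration copies one 8192-byte block and then seeks forward (3 - 1) blocks in both files, so input blocks 0, 3, 6 and 9 land at output blocks 1, 4, 7 and 10 (-k 1 shifts the output by one block before the first write).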
diff --git a/tests/zfs-tests/cmd/threadsappend/.gitignore b/tests/zfs-tests/cmd/threadsappend/.gitignore
new file mode 100644
index 000000000000..4c8c8cdf34c1
--- /dev/null
+++ b/tests/zfs-tests/cmd/threadsappend/.gitignore
@@ -0,0 +1 @@
+/threadsappend
diff --git a/tests/zfs-tests/cmd/threadsappend/Makefile.am b/tests/zfs-tests/cmd/threadsappend/Makefile.am
new file mode 100644
index 000000000000..f030b42d50fe
--- /dev/null
+++ b/tests/zfs-tests/cmd/threadsappend/Makefile.am
@@ -0,0 +1,7 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = threadsappend
+threadsappend_SOURCES = threadsappend.c
+threadsappend_LDADD = -lpthread
diff --git a/tests/zfs-tests/cmd/threadsappend/threadsappend.c b/tests/zfs-tests/cmd/threadsappend/threadsappend.c
new file mode 100644
index 000000000000..25710a3c12ef
--- /dev/null
+++ b/tests/zfs-tests/cmd/threadsappend/threadsappend.c
@@ -0,0 +1,135 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Copyright (c) 2013 by Delphix. All rights reserved.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <pthread.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+
+/*
+ * The size of the output file, "go.out", should be 80*8192*2 = 1310720
+ *
+ * $ cd /tmp; go; ls -l go.out
+ * done.
+ * -rwxr-xr-x  1 jdm  staff  1310720 Apr 13 19:45 go.out
+ * $ cd /zfs; go; ls -l go.out
+ * done.
+ * -rwxr-xr-x  1 jdm  staff  663552 Apr 13 19:45 go.out
+ *
+ * The file on zfs is short because zfs does not appear to make the
+ * implicit seek to EOF and the actual write atomic. From the SUSv3
+ * interface spec, behavior is undefined if concurrent writes are performed
+ * from multiple processes to a single file. So I don't know if this is a
+ * standards violation, but I cannot find any such disclaimers in our
+ * man pages. This issue came up at a customer site in another context, and
+ * the suggestion was to open the file with O_APPEND, but that wouldn't
+ * help with zfs (see 4977529). Also see bug# 5031301.
+ */
+
+static int outfd = 0;
+
+static void *
+go(void *data)
+{
+	/* the thread number is passed by value to avoid racing on it */
+	int ret, i, n = (int)(uintptr_t)data;
+	char buf[8192] = {0};
+	(void) memset(buf, n, sizeof (buf));
+
+	for (i = 0; i < 80; i++) {
+		ret = write(outfd, buf, sizeof (buf));
+		if (ret != sizeof (buf))
+			perror("write");
+	}
+	return (NULL);
+}
+
+static void
+usage(void)
+{
+	(void) fprintf(stderr,
+	    "usage: zfs_threadsappend <file>\n");
+	exit(1);
+}
+
+int
+main(int argc, char **argv)
+{
+	pthread_t tid[2];
+	int ret = 0;
+	long ncpus = 0;
+	int i;
+
+	if (argc != 2) {
+		usage();
+	}
+
+	ncpus = sysconf(_SC_NPROCESSORS_ONLN);
+	if (ncpus < 0) {
+		(void) fprintf(stderr,
+		    "Invalid return from sysconf(_SC_NPROCESSORS_ONLN)"
+		    " : errno (decimal)=%d\n", errno);
+		exit(1);
+	}
+	if (ncpus < 2) {
+		(void) fprintf(stderr,
+		    "Must execute this binary on a multi-processor system\n");
+		exit(1);
+	}
+
+	outfd = open(argv[1], O_RDWR|O_CREAT|O_APPEND|O_TRUNC, 0777);
+	if (outfd == -1) {
+		(void) fprintf(stderr,
+		    "zfs_threadsappend: "
+		    "open(%s, O_RDWR|O_CREAT|O_APPEND|O_TRUNC, 0777)"
+		    " failed\n", argv[1]);
+		perror("open");
+		exit(1);
+	}
+
+	for (i = 0; i < 2; i++) {
+		ret = pthread_create(&tid[i], NULL, go,
+		    (void *)(uintptr_t)i);
+		if (ret != 0) {
+			(void) fprintf(stderr,
+			    "zfs_threadsappend: pthread_create(#%d) "
+			    "failed error=%d\n", i+1, ret);
+			exit(1);
+		}
+	}
+
+	/* wait for both writers before the harness checks the file size */
+	for (i = 0; i < 2; i++)
+		(void) pthread_join(tid[i], NULL);
+
+	return (0);
+}
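The expected-size arithmetic in the header comment (2 threads x 80 writes x 8192 bytes = 1310720) is what a harness would check after the test exits. A minimal, hedged verifier sketch; the /tmp/go.out path is hypothetical, taken from the transcript above:

#include <stdio.h>
#include <sys/stat.h>

int
main(void)
{
	struct stat st;
	const long long expected = 2LL * 80 * 8192;	/* 1310720 */

	if (stat("/tmp/go.out", &st) != 0) {
		perror("stat");
		return (1);
	}
	/* a short file means the implicit-seek-plus-write was not atomic */
	if ((long long)st.st_size != expected) {
		(void) fprintf(stderr, "short file: %lld of %lld bytes\n",
		    (long long)st.st_size, expected);
		return (1);
	}
	(void) printf("go.out is the expected %lld bytes\n", expected);
	return (0);
}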
diff --git a/tests/zfs-tests/cmd/user_ns_exec/.gitignore b/tests/zfs-tests/cmd/user_ns_exec/.gitignore
new file mode 100644
index 000000000000..655867a640a3
--- /dev/null
+++ b/tests/zfs-tests/cmd/user_ns_exec/.gitignore
@@ -0,0 +1 @@
+/user_ns_exec
diff --git a/tests/zfs-tests/cmd/user_ns_exec/Makefile.am b/tests/zfs-tests/cmd/user_ns_exec/Makefile.am
new file mode 100644
index 000000000000..5b4bc9aaa683
--- /dev/null
+++ b/tests/zfs-tests/cmd/user_ns_exec/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = user_ns_exec
+user_ns_exec_SOURCES = user_ns_exec.c
diff --git a/tests/zfs-tests/cmd/user_ns_exec/user_ns_exec.c b/tests/zfs-tests/cmd/user_ns_exec/user_ns_exec.c
new file mode 100644
index 000000000000..cd46738bd0bd
--- /dev/null
+++ b/tests/zfs-tests/cmd/user_ns_exec/user_ns_exec.c
@@ -0,0 +1,179 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <limits.h>
+#include <sched.h>
+#include <signal.h>
+#include <unistd.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+#include <sys/socket.h>
+
+#define EXECSHELL "/bin/sh"
+#define UIDMAP "0 100000 65536"
+
+static int
+child_main(int argc, char *argv[], int sync_pipe)
+{
+	char sync_buf;
+	char cmds[BUFSIZ] = { 0 };
+	char sep[] = " ";
+	int i, len;
+
+	if (unshare(CLONE_NEWUSER | CLONE_NEWNS) != 0) {
+		perror("unshare");
+		return (1);
+	}
+
+	/* tell parent we entered the new namespace */
+	if (write(sync_pipe, "1", 1) != 1) {
+		perror("write");
+		return (1);
+	}
+
+	/* wait for parent to set up the uid mapping */
+	if (read(sync_pipe, &sync_buf, 1) != 1) {
+		(void) fprintf(stderr, "user namespace setup failed\n");
+		return (1);
+	}
+
+	close(sync_pipe);
+
+	if (setuid(0) != 0) {
+		perror("setuid");
+		return (1);
+	}
+	if (setgid(0) != 0) {
+		perror("setgid");
+		return (1);
+	}
+
+	len = 0;
+	for (i = 1; i < argc; i++) {
+		/* refuse argument lists that would overflow cmds */
+		if (len + strlen(argv[i]) + strlen(sep) >= sizeof (cmds)) {
+			(void) fprintf(stderr, "command line too long\n");
+			return (1);
+		}
+		(void) snprintf(cmds+len, sizeof (cmds)-len,
+		    "%s%s", argv[i], sep);
+		len += strlen(argv[i]) + strlen(sep);
+	}
+
+	if (execl(EXECSHELL, "sh", "-c", cmds, (char *)NULL) != 0) {
+		perror("execl: " EXECSHELL);
+		return (1);
+	}
+
+	return (0);
+}
+
+static int
+set_idmap(pid_t pid, const char *file)
+{
+	int result = 0;
+	int mapfd;
+	char path[PATH_MAX];
+
+	(void) snprintf(path, sizeof (path), "/proc/%d/%s", (int)pid, file);
+
+	mapfd = open(path, O_WRONLY);
+	if (mapfd < 0) {
+		result = errno;
+		perror("open");
+		return (result);
+	}
+
+	if (write(mapfd, UIDMAP, sizeof (UIDMAP)-1) != sizeof (UIDMAP)-1) {
+		perror("write");
+		result = errno;
+	}
+
+	close(mapfd);
+
+	return (result);
+}
+
+int
+main(int argc, char *argv[])
+{
+	char sync_buf;
+	int result, wstatus;
+	int syncfd[2];
+	pid_t child;
+
+	if (argc < 2 || strlen(argv[1]) == 0) {
+		(void) printf("\tUsage: %s <command> ...\n", argv[0]);
+		return (1);
+	}
+
+	if (socketpair(AF_UNIX, SOCK_STREAM, 0, syncfd) != 0) {
+		perror("socketpair");
+		return (1);
+	}
+
+	child = fork();
+	if (child == (pid_t)-1) {
+		perror("fork");
+		return (1);
+	}
+
+	if (child == 0) {
+		close(syncfd[0]);
+		return (child_main(argc, argv, syncfd[1]));
+	}
+
+	close(syncfd[1]);
+
+	result = 0;
+	/* wait for the child to have unshared its namespaces */
+	if (read(syncfd[0], &sync_buf, 1) != 1) {
+		perror("read");
+		kill(child, SIGKILL);
+		result = 1;
+		goto reap;
+	}
+
+	/* write uid mapping */
+	if (set_idmap(child, "uid_map") != 0 ||
+	    set_idmap(child, "gid_map") != 0) {
+		result = 1;
+		kill(child, SIGKILL);
+		goto reap;
+	}
+
+	/* tell the child to proceed */
+	if (write(syncfd[0], "1", 1) != 1) {
+		perror("write");
+		kill(child, SIGKILL);
+		result = 1;
+		goto reap;
+	}
+	close(syncfd[0]);
+
+reap:
+	while (waitpid(child, &wstatus, 0) != child)
+		kill(child, SIGKILL);
+	if (result == 0)
+		result = WEXITSTATUS(wstatus);
+
+	return (result);
+}
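Each line written to /proc/<pid>/uid_map (and gid_map) has the form "<inside-id> <outside-id> <count>", so the UIDMAP string above maps in-namespace ids 0 through 65535 onto host ids 100000 through 165535. A small sketch of that arithmetic, with illustrative values only:

#include <stdio.h>

int
main(void)
{
	/* UIDMAP "0 100000 65536" parsed as <inside> <outside> <count> */
	const unsigned int inside = 0, outside = 100000, count = 65536;
	unsigned int u;

	for (u = 0; u < 3; u++)
		(void) printf("namespace uid %u -> host uid %u\n",
		    u, outside + (u - inside));
	(void) printf("uids >= %u are unmapped in the namespace\n",
	    inside + count);
	return (0);
}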
diff --git a/tests/zfs-tests/cmd/xattrtest/.gitignore b/tests/zfs-tests/cmd/xattrtest/.gitignore
new file mode 100644
index 000000000000..7d2128383639
--- /dev/null
+++ b/tests/zfs-tests/cmd/xattrtest/.gitignore
@@ -0,0 +1 @@
+/xattrtest
diff --git a/tests/zfs-tests/cmd/xattrtest/Makefile.am b/tests/zfs-tests/cmd/xattrtest/Makefile.am
new file mode 100644
index 000000000000..7398ae634629
--- /dev/null
+++ b/tests/zfs-tests/cmd/xattrtest/Makefile.am
@@ -0,0 +1,6 @@
+include $(top_srcdir)/config/Rules.am
+
+pkgexecdir = $(datadir)/@PACKAGE@/zfs-tests/bin
+
+pkgexec_PROGRAMS = xattrtest
+xattrtest_SOURCES = xattrtest.c
diff --git a/tests/zfs-tests/cmd/xattrtest/xattrtest.c b/tests/zfs-tests/cmd/xattrtest/xattrtest.c
new file mode 100644
index 000000000000..42c510ed082d
--- /dev/null
+++ b/tests/zfs-tests/cmd/xattrtest/xattrtest.c
@@ -0,0 +1,718 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+
+/*
+ * Copyright 2016 Lawrence Livermore National Security, LLC.
+ */
+
+/*
+ * An extended attribute (xattr) correctness test. This program creates
+ * N files and sets M attrs on them of size S. Optionally it will verify
+ * a pattern stored in the xattr.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stddef.h>
+#include <errno.h>
+#include <string.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <time.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <sys/time.h>
+#include <sys/wait.h>
+#include <sys/xattr.h>
+#include <linux/limits.h>
+
+extern char *program_invocation_short_name;
+
+#define ERROR(fmt, ...) \
+	fprintf(stderr, "%s: %s:%d: %s: " fmt "\n", \
+	    program_invocation_short_name, __FILE__, __LINE__, \
+	    __func__, ## __VA_ARGS__);
+
+static const char shortopts[] = "hvycdn:f:x:s:p:t:e:rRko:";
+static const struct option longopts[] = {
+	{ "help",		no_argument,		0,	'h' },
+	{ "verbose",		no_argument,		0,	'v' },
+	{ "verify",		no_argument,		0,	'y' },
+	{ "nth",		required_argument,	0,	'n' },
+	{ "files",		required_argument,	0,	'f' },
+	{ "xattrs",		required_argument,	0,	'x' },
+	{ "size",		required_argument,	0,	's' },
+	{ "path",		required_argument,	0,	'p' },
+	{ "synccaches",		no_argument,		0,	'c' },
+	{ "dropcaches",		no_argument,		0,	'd' },
+	{ "script",		required_argument,	0,	't' },
+	{ "seed",		required_argument,	0,	'e' },
+	{ "random",		no_argument,		0,	'r' },
+	{ "randomvalue",	no_argument,		0,	'R' },
+	{ "keep",		no_argument,		0,	'k' },
+	{ "only",		required_argument,	0,	'o' },
+	{ 0,			0,			0,	0 }
+};
+
+enum phases {
+	PHASE_ALL = 0,
+	PHASE_CREATE,
+	PHASE_SETXATTR,
+	PHASE_GETXATTR,
+	PHASE_UNLINK,
+	PHASE_INVAL
+};
+
+static int verbose = 0;
+static int verify = 0;
+static int synccaches = 0;
+static int dropcaches = 0;
+static int nth = 0;
+static int files = 1000;
+static int xattrs = 1;
+static int size = 6;
+static int size_is_random = 0;
+static int value_is_random = 0;
+static int keep_files = 0;
+static int phase = PHASE_ALL;
+static char path[PATH_MAX] = "/tmp/xattrtest";
+static char script[PATH_MAX] = "/bin/true";
+static char xattrbytes[XATTR_SIZE_MAX];
+
+static int
+usage(int argc, char **argv)
+{
+	fprintf(stderr,
+	    "usage: %s [-hvycdrRk] [-n <nth>] [-f <files>] [-x <xattrs>]\n"
+	    "       [-s <size>] [-p <path>] [-t